From 784206fde23f9f5153d328891125457c27739186 Mon Sep 17 00:00:00 2001
From: Ivan Mikhnenkov <39604625+ivanmikhnenkov@users.noreply.github.com>
Date: Thu, 27 Oct 2022 01:20:57 +0300
Subject: [PATCH 001/638] updated to 5th stable diffusion checkpoint (#57)

* updated to 5th stable diffusion checkpoint

* updated all stable diffusion example files to checkpoint v1.5
---
 examples/05_stable_diffusion/benchmark.py                       | 2 +-
 examples/05_stable_diffusion/benchmark_pt.py                    | 2 +-
 examples/05_stable_diffusion/compile.py                         | 2 +-
 examples/05_stable_diffusion/demo.py                            | 2 +-
 examples/05_stable_diffusion/demo_img2img.py                    | 2 +-
 examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py   | 2 +-
 .../pipeline_stable_diffusion_img2img_ait.py                    | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
index bda7da289..811743da9 100644
--- a/examples/05_stable_diffusion/benchmark.py
+++ b/examples/05_stable_diffusion/benchmark.py
@@ -288,7 +288,7 @@ def benchmark_diffusers(token, batch_size, verify, benchmark_pt):
         access_token = token
 
     pipe = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=access_token,
diff --git a/examples/05_stable_diffusion/benchmark_pt.py b/examples/05_stable_diffusion/benchmark_pt.py
index 3534eaf62..13b8738cc 100644
--- a/examples/05_stable_diffusion/benchmark_pt.py
+++ b/examples/05_stable_diffusion/benchmark_pt.py
@@ -27,7 +27,7 @@
 )
 def run(token, prompt, benchmark):
     pipe = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token,
diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py
index 4c6288a84..513df5b9b 100644
--- a/examples/05_stable_diffusion/compile.py
+++ b/examples/05_stable_diffusion/compile.py
@@ -333,7 +333,7 @@ def compile_diffusers(token, batch_size, img2img=False, use_fp16_acc=True, conve
         access_token = token
 
     pipe = StableDiffusionPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=access_token,
diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
index 5a7b8b79e..cef5c7aaa 100644
--- a/examples/05_stable_diffusion/demo.py
+++ b/examples/05_stable_diffusion/demo.py
@@ -27,7 +27,7 @@
 )
 def run(token, prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token,
diff --git a/examples/05_stable_diffusion/demo_img2img.py b/examples/05_stable_diffusion/demo_img2img.py
index 5a9f8d0d6..65bdaa874 100644
--- a/examples/05_stable_diffusion/demo_img2img.py
+++ b/examples/05_stable_diffusion/demo_img2img.py
@@ -35,7 +35,7 @@ def run(token, prompt, benchmark):
 
     # load the pipeline
     device = "cuda"
-    model_id_or_path = "CompVis/stable-diffusion-v1-4"
+    model_id_or_path = "runwayml/stable-diffusion-v1-5"
     pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
         model_id_or_path,
         revision="fp16",
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
index bf4450e22..5234117b1 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
@@ -60,7 +60,7 @@ class StableDiffusionAITPipeline(StableDiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
index 9d18a7d32..d6c75ab05 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
@@ -70,7 +70,7 @@ class StableDiffusionImg2ImgAITPipeline(StableDiffusionImg2ImgPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

From 0965ed07fff0cc36a6b3933f2d4aa4e187892ad8 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Mon, 7 Nov 2022 02:22:42 +0000
Subject: [PATCH 002/638] Support different sizes via recompilation (StableDiff
 demo) (#71)

Mostly, this commit is just re-establishing the relationship
between various previously-hardcoded constants and the target
image size (since the latent size is 1/8 of the image size,
hardcoding the latent sizes is inconvenient).

This adds `--width` and `--height` options to both compile.py
and demo.py, and provided these both match you can process
different sizes. For img2img mode, the size options passed at
compile time must match the size of the actual input image.

Consequently, the `--img2img` flag for `compile.py` no longer
exists: all this ever did was change the hardcoded size to
match the default input image used by `demo_img2img.py`. Yikes.

Sooo it's slightly more flexible than before, but still has no
support for a single binary to handle different image sizes. It
isn't super clear that compiling a generic binary is useful: the
upstream project can do that just fine: isn't the whole point
of AITemplates to achieve performance gains via aggressive
constant propagation and benchmarking to select the optimal
kernels?
---
 examples/05_stable_diffusion/compile.py      | 13 ++++++++-----
 examples/05_stable_diffusion/demo.py         |  6 ++++--
 examples/05_stable_diffusion/demo_img2img.py |  6 ++++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py
index 513df5b9b..e91af3bb9 100644
--- a/examples/05_stable_diffusion/compile.py
+++ b/examples/05_stable_diffusion/compile.py
@@ -316,11 +316,12 @@ def compile_vae(
 
 @click.command()
 @click.option("--token", default="", help="access token")
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
 @click.option("--batch-size", default=1, help="batch size")
-@click.option("--img2img", default=False, help="compile img2img models")
 @click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
 @click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
-def compile_diffusers(token, batch_size, img2img=False, use_fp16_acc=True, convert_conv_to_gemm=True):
+def compile_diffusers(token, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True):
     logging.getLogger().setLevel(logging.INFO)
     np.random.seed(0)
     torch.manual_seed(4896)
@@ -339,19 +340,21 @@ def compile_diffusers(token, batch_size, img2img=False, use_fp16_acc=True, conve
         use_auth_token=access_token,
     ).to("cuda")
 
-    width = 96 if img2img else 64
+    ww = width // 8
+    hh = height // 8
 
     # CLIP
     compile_clip(batch_size=batch_size, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
     # UNet
     compile_unet(
         batch_size=batch_size * 2,
-        ww=width,
+        ww=ww,
+        hh=hh,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
     )
     # VAE
-    compile_vae(batch_size=batch_size, width=width, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
+    compile_vae(batch_size=batch_size, width=ww, height=hh, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
 
 
 if __name__ == "__main__":
diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
index cef5c7aaa..51859e886 100644
--- a/examples/05_stable_diffusion/demo.py
+++ b/examples/05_stable_diffusion/demo.py
@@ -21,11 +21,13 @@
 
 @click.command()
 @click.option("--token", default="", help="access token")
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(token, prompt, benchmark):
+def run(token, width, height, prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         "runwayml/stable-diffusion-v1-5",
         revision="fp16",
@@ -34,7 +36,7 @@ def run(token, prompt, benchmark):
     ).to("cuda")
 
     with torch.autocast("cuda"):
-        image = pipe(prompt).images[0]
+        image = pipe(prompt, height, width).images[0]
         if benchmark:
             t = benchmark_torch_function(10, pipe, prompt)
             print(f"sd e2e: {t} ms")
diff --git a/examples/05_stable_diffusion/demo_img2img.py b/examples/05_stable_diffusion/demo_img2img.py
index 65bdaa874..844aac726 100644
--- a/examples/05_stable_diffusion/demo_img2img.py
+++ b/examples/05_stable_diffusion/demo_img2img.py
@@ -25,13 +25,15 @@
 
 @click.command()
 @click.option("--token", default="", help="access token")
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
 @click.option(
     "--prompt", default="A fantasy landscape, trending on artstation", help="prompt"
 )
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(token, prompt, benchmark):
+def run(token, width, height, prompt, benchmark):
 
     # load the pipeline
     device = "cuda"
@@ -49,7 +51,7 @@ def run(token, prompt, benchmark):
 
     response = requests.get(url)
     init_image = Image.open(BytesIO(response.content)).convert("RGB")
-    init_image = init_image.resize((768, 512))
+    init_image = init_image.resize((height, width))
 
     with torch.autocast("cuda"):
         images = pipe(

From f7878c907167b41423d63e5355fc2685cda58f8e Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Wed, 9 Nov 2022 12:55:16 -0800
Subject: [PATCH 003/638] v0.1.1 (#74)

* v0.11

* update cutlass

* fix

* add missing files

* patch cutlass

Co-authored-by: Bing Xu <bingxu@fb.com>
---
 .gitmodules                                   |    2 +-
 3rdparty/cutlass                              |    2 +-
 examples/03_bert/benchmark_mi250.sh           |    6 +-
 examples/05_stable_diffusion/benchmark_pt.py  |    1 +
 examples/05_stable_diffusion/compile.py       |   18 +-
 examples/05_stable_diffusion/demo_img2img.py  |    2 +-
 .../05_stable_diffusion/modeling/attention.py |    1 +
 examples/05_stable_diffusion/modeling/clip.py |   12 +-
 .../pipeline_stable_diffusion_img2img_ait.py  |    4 +-
 python/aitemplate/__init__.py                 |   18 +-
 python/aitemplate/backend/backend_spec.py     |   50 +-
 python/aitemplate/backend/builder.py          |  102 +-
 python/aitemplate/backend/codegen.py          |   92 +-
 .../backend/common/concatenate_common.py      |   58 +-
 .../backend/common/elementwise_common.py      |   70 +-
 .../aitemplate/backend/common/split_common.py |   51 +-
 .../backend/common/tensor/argmax_common.py    |   10 +-
 .../common/tensor/batch_gather_common.py      |   26 +-
 .../common/tensor/permute021_common.py        |   39 +-
 .../common/tensor/permute102_common.py        |   37 +-
 .../common/tensor/permute210_common.py        |   39 +-
 .../backend/common/tensor/slice_common.py     |   48 +-
 .../backend/common/tensor/topk_common.py      |   27 +-
 .../backend/common/tensor_accessor_codegen.py |   52 +-
 .../backend/common/upsampling2d_common.py     |   45 +-
 .../common/vision_ops/efficient_nms_common.py |   63 +-
 .../common/vision_ops/efficient_nms_kernel.py |    2 +-
 .../backend/common/vision_ops/nms_common.py   |   48 +-
 .../common/vision_ops/roi_align_common.py     |   45 +-
 python/aitemplate/backend/cuda/__init__.py    |    1 +
 .../backend/cuda/attention/__init__.py        |    4 +-
 .../backend/cuda/attention/flash_attention.py |   20 +-
 .../cuda/attention/mem_eff_attention.py       |  262 ++
 .../backend/cuda/attention/src/fmha.h         |   14 -
 .../backend/cuda/attention/src/fmha/gemm.h    |   14 -
 .../cuda/attention/src/fmha/gmem_tile.h       |   14 -
 .../cuda/attention/src/fmha/kernel_traits.h   |   14 -
 .../backend/cuda/attention/src/fmha/mask.h    |   14 -
 .../cuda/attention/src/fmha/smem_tile.h       |   14 -
 .../backend/cuda/attention/src/fmha/softmax.h |   14 -
 .../backend/cuda/attention/src/fmha/utils.h   |   14 -
 .../src/fmha_block_fprop_fp16_kernel.sm80.cu  |   14 -
 .../src/fmha_block_fprop_kernel_1xN.h         |   14 -
 .../cuda/attention/src/fmha_blockmask.h       |   14 -
 .../src/fmha_fprop_fp16_kernel.sm80.cu        |   14 -
 .../attention/src/fmha_fprop_kernel_1xN.h     |   14 -
 .../backend/cuda/attention/src/fmha_kernel.h  |   14 -
 .../backend/cuda/attention/src/fmha_utils.h   |   14 -
 .../backend/cuda/attention/src/philox.cuh     |   14 -
 .../aitemplate/backend/cuda/conv2d/common.py  |   18 +-
 .../conv2d/common_conv2d_bias_activation.py   |   59 +-
 .../common_conv2d_bias_add_activation.py      |   65 +-
 .../aitemplate/backend/cuda/conv2d/conv2d.py  |   47 +-
 .../backend/cuda/conv2d/conv2d_bias.py        |    2 +-
 .../backend/cuda/conv2d/conv2d_bias_add.py    |    2 +-
 .../cuda/conv2d/conv2d_bias_add_hardswish.py  |    2 +-
 .../cuda/conv2d/conv2d_bias_add_relu.py       |    2 +-
 .../cuda/conv2d/conv2d_bias_few_channels.py   |    2 +-
 .../cuda/conv2d/conv2d_bias_hardswish.py      |    2 +-
 .../conv2d_bias_hardswish_few_channels.py     |    2 +-
 .../backend/cuda/conv2d/conv2d_bias_relu.py   |    2 +-
 .../conv2d/conv2d_bias_relu_few_channels.py   |    2 +-
 .../cuda/conv2d/conv2d_bias_sigmoid.py        |    2 +-
 .../backend/cuda/conv2d/transposed_conv2d.py  |   16 +-
 .../cuda/conv2d/transposed_conv2d_bias.py     |   17 +-
 .../backend/cuda/conv3d/__init__.py           |   20 +
 .../aitemplate/backend/cuda/conv3d/common.py  |  364 +++
 .../aitemplate/backend/cuda/conv3d/conv3d.py  |  496 ++++
 .../backend/cuda/conv3d/depthwise_conv3d.py   |  331 +++
 .../backend/cuda/elementwise/__init__.py      |    4 +-
 .../backend/cuda/elementwise/custom_math.cuh  |  158 ++
 .../cuda/elementwise/fused_elementwise.py     |    1 +
 .../cuda/elementwise/int_elementwise.py       |   67 +
 .../backend/cuda/embedding/bert_embeddings.py |  125 +-
 .../cuda/gemm_epilogue_vistor/__init__.py     |   16 +-
 .../bmm_common_softmax.py                     |    4 +-
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |    2 +-
 .../gemm_epilogue_vistor/common_dual_gemm.py  |  458 ++++
 .../gemm_epilogue_vistor/common_softmax.py    |    8 +-
 .../dual_gemm_rcr_fast_gelu.py                |  348 +++
 .../dual_gemm_rcr_silu.py                     |  220 ++
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  |    2 +-
 .../include/gemm_with_softmax.h               |   14 -
 .../backend/cuda/gemm_special/bmm_rcr_n1.py   |   33 +-
 .../cuda/gemm_special/bmm_rrr_k1_tanh.py      |   83 +-
 .../cuda/gemm_special/gemm_rrr_small_nk.py    |   81 +-
 .../backend/cuda/gemm_universal/__init__.py   |    1 +
 .../backend/cuda/gemm_universal/bmm_ccr.py    |   35 +-
 .../cuda/gemm_universal/bmm_ccr_add.py        |   19 +-
 .../backend/cuda/gemm_universal/bmm_common.py |  182 +-
 .../backend/cuda/gemm_universal/bmm_crr.py    |   35 +-
 .../cuda/gemm_universal/bmm_crr_add.py        |   21 +-
 .../cuda/gemm_universal/bmm_permute_common.py |  153 +-
 .../backend/cuda/gemm_universal/bmm_rcr.py    |   36 +-
 .../cuda/gemm_universal/bmm_rcr_permute.py    |   36 +-
 .../backend/cuda/gemm_universal/bmm_rrr.py    |   35 +-
 .../cuda/gemm_universal/bmm_rrr_add.py        |   21 +-
 .../cuda/gemm_universal/bmm_rrr_permute.py    |   40 +-
 .../backend/cuda/gemm_universal/common.py     |  433 ++-
 .../cuda/gemm_universal/common_bias.py        |   22 +-
 .../gemm_universal/common_bias_activation.py  |   21 +-
 .../gemm_universal/common_bias_broadcast.py   |  248 +-
 .../cuda/gemm_universal/common_no_bias.py     |  105 +
 .../cuda/gemm_universal/common_permute.py     |  215 +-
 .../backend/cuda/gemm_universal/gemm_rcr.py   |   39 +-
 .../cuda/gemm_universal/gemm_rcr_bias.py      |   37 +-
 .../cuda/gemm_universal/gemm_rcr_bias_add.py  |    5 +-
 .../gemm_universal/gemm_rcr_bias_add_add.py   |    5 +-
 .../gemm_rcr_bias_add_add_relu.py             |    5 +-
 .../gemm_universal/gemm_rcr_bias_add_relu.py  |    5 +-
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |   11 +-
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py |   11 +-
 .../gemm_universal/gemm_rcr_bias_hardswish.py |   11 +-
 .../cuda/gemm_universal/gemm_rcr_bias_mul.py  |    5 +-
 .../gemm_universal/gemm_rcr_bias_mul_add.py   |    5 +-
 .../gemm_universal/gemm_rcr_bias_mul_tanh.py  |    5 +-
 .../gemm_universal/gemm_rcr_bias_permute.py   |   23 +-
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py |   11 +-
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |   11 +-
 .../gemm_rcr_bias_sigmoid_mul.py              |    5 +-
 .../gemm_rcr_bias_sigmoid_mul_tanh.py         |    5 +-
 .../gemm_universal/gemm_rcr_bias_swish.py     |   11 +-
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py |   11 +-
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py |  170 ++
 .../cuda/gemm_universal/gemm_rcr_permute.py   |   51 +-
 .../backend/cuda/gemm_universal/gemm_rrr.py   |   41 +-
 .../cuda/gemm_universal/gemm_rrr_permute.py   |   51 +-
 .../cuda/gemm_universal/group_common.py       |  272 +-
 .../cuda/gemm_universal/group_common_bias.py  |   18 +-
 .../cuda/gemm_universal/group_gemm_rcr.py     |   16 +-
 .../gemm_universal/group_gemm_rcr_bias.py     |    6 +-
 .../group_gemm_rcr_bias_relu.py               |    6 +-
 .../group_gemm_rcr_bias_sigmoid.py            |    6 +-
 .../cuda/gemm_universal/perm021fc_ccr.py      |   35 +-
 .../cuda/gemm_universal/perm021fc_ccr_bias.py |   23 +-
 .../perm021fc_ccr_bias_permute.py             |   29 +-
 .../cuda/gemm_universal/perm021fc_crc.py      |   29 +-
 .../cuda/gemm_universal/perm021fc_crc_bias.py |   15 +-
 .../cuda/gemm_universal/perm102_bmm_rcr.py    |   41 +-
 .../gemm_universal/perm102_bmm_rcr_bias.py    |   27 +-
 .../cuda/gemm_universal/perm102_bmm_rrr.py    |   43 +-
 .../gemm_universal/perm102_bmm_rrr_bias.py    |   26 +-
 .../cuda/groupnorm/groupnorm_common.py        |   51 +-
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  388 ++-
 .../backend/cuda/groupnorm/layer_norm.cuh     | 2404 +++++++++++++++++
 .../batch_layernorm_sigmoid_mul.py            |   25 +-
 .../group_layernorm_sigmoid_mul.py            |   97 +-
 .../layernorm_sigmoid_mul/layernorm_common.py |    9 +-
 .../layernorm_sigmoid_mul.py                  |   30 +-
 .../layernorm_sigmoid_mul_kernel.cuh          |  163 +-
 .../aitemplate/backend/cuda/lib_template.py   |   18 +-
 .../backend/cuda/padding/nhwc3to4.py          |   35 +-
 .../backend/cuda/padding/nhwc3to8.py          |   69 +-
 .../backend/cuda/padding/pad_last_dim.py      |   51 +-
 .../backend/cuda/pool2d/avg_pool2d.py         |   18 +-
 .../backend/cuda/pool2d/max_pool2d.py         |   21 +-
 .../aitemplate/backend/cuda/pool2d/pool2d.py  |    4 +-
 .../backend/cuda/reduce/reduce_3d.py          |   49 +-
 .../backend/cuda/reduce/reduce_common.py      |   71 +-
 .../backend/cuda/reduce/reduce_small_axis.py  |   47 +-
 python/aitemplate/backend/cuda/reduce/var.py  |    7 +-
 .../backend/cuda/softmax/softmax.py           |   32 +-
 python/aitemplate/backend/cuda/target_def.py  |  213 +-
 .../backend/cuda/tensor/__init__.py           |    2 +
 .../aitemplate/backend/cuda/tensor/gather.py  |   43 +-
 .../backend/cuda/tensor/permute.cuh           |  369 +++
 .../aitemplate/backend/cuda/tensor/permute.py |  183 ++
 .../backend/cuda/upsample/upsampling2d.py     |    6 +-
 .../backend/cuda/upsample/upsampling2d_add.py |    8 +-
 .../backend/cuda/view_ops/view_ops.py         |   26 +-
 .../cuda/vision_ops/nms/batched_nms.py        |   21 +-
 .../vision_ops/nms/batched_nms_kernel.cuh     |   14 -
 .../cuda/vision_ops/roi_ops/roi_align.py      |    7 +-
 .../cuda/vision_ops/roi_ops/roi_ops.py        |    6 +-
 python/aitemplate/backend/main_templates.py   |  197 +-
 python/aitemplate/backend/profiler_cache.py   |  309 ++-
 python/aitemplate/backend/profiler_runner.py  |  166 +-
 .../aitemplate/backend/rocm/conv2d/common.py  |   14 +-
 .../aitemplate/backend/rocm/conv2d/conv2d.py  |    2 +-
 .../backend/rocm/conv2d/conv2d_bias.py        |    2 +-
 .../rocm/conv2d/conv2d_bias_add_relu.py       |    2 +-
 .../backend/rocm/conv2d/conv2d_bias_relu.py   |    2 +-
 .../rocm/conv2d/conv2d_bias_sigmoid.py        |    2 +-
 .../backend/rocm/conv2d/transposed_conv2d.py  |    2 +-
 .../conv2d/transposed_conv2d_bias_relu.py     |    2 +-
 .../aitemplate/backend/rocm/gemm/bmm_ccr.py   |    2 +-
 .../backend/rocm/gemm/bmm_common.py           |    2 +-
 .../aitemplate/backend/rocm/gemm/bmm_crr.py   |    2 +-
 .../aitemplate/backend/rocm/gemm/bmm_rcr.py   |    2 +-
 .../backend/rocm/gemm/bmm_rcr_permute.py      |    2 +-
 .../aitemplate/backend/rocm/gemm/bmm_rrr.py   |    2 +-
 .../backend/rocm/gemm/bmm_rrr_permute.py      |    2 +-
 .../backend/rocm/gemm/bmm_softmax_bmm.py      |    2 +-
 .../rocm/gemm/bmm_softmax_bmm_permute.py      |    2 +-
 python/aitemplate/backend/rocm/gemm/common.py |   11 +-
 .../aitemplate/backend/rocm/gemm/gemm_rcr.py  |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias.py        |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias_add.py    |    2 +-
 .../rocm/gemm/gemm_rcr_bias_add_add.py        |    2 +-
 .../rocm/gemm/gemm_rcr_bias_add_add_relu.py   |    2 +-
 .../rocm/gemm/gemm_rcr_bias_add_relu.py       |    2 +-
 .../rocm/gemm/gemm_rcr_bias_fast_gelu.py      |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias_mul.py    |    2 +-
 .../rocm/gemm/gemm_rcr_bias_mul_add.py        |    2 +-
 .../rocm/gemm/gemm_rcr_bias_mul_tanh.py       |    2 +-
 .../rocm/gemm/gemm_rcr_bias_permute.py        |    2 +-
 .../rocm/gemm/gemm_rcr_bias_permute_m2n3.py   |    2 +-
 .../rocm/gemm/gemm_rcr_bias_permute_m3n2.py   |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias_relu.py   |    2 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid.py        |    2 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid_mul.py    |    2 +-
 .../gemm/gemm_rcr_bias_sigmoid_mul_tanh.py    |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias_swish.py  |    2 +-
 .../backend/rocm/gemm/gemm_rcr_bias_tanh.py   |    2 +-
 .../rocm/gemm/gemm_rcr_permute_m2n3.py        |    2 +-
 .../aitemplate/backend/rocm/gemm/gemm_rrr.py  |    2 +-
 .../rocm/gemm/gemm_rrr_bias_permute.py        |    2 +-
 .../aitemplate/backend/rocm/lib_template.py   |    8 +-
 .../backend/rocm/normalization/groupnorm.py   |    2 +-
 .../rocm/normalization/groupnorm_swish.py     |    2 +-
 .../backend/rocm/normalization/layernorm.py   |    2 +-
 .../backend/rocm/normalization/norm_common.py |   11 +-
 .../backend/rocm/normalization/softmax.py     |    2 +-
 python/aitemplate/backend/rocm/target_def.py  |  146 +-
 python/aitemplate/backend/target.py           |   42 +-
 python/aitemplate/compiler/__init__.py        |    3 +-
 python/aitemplate/compiler/base.py            |   92 +-
 python/aitemplate/compiler/compiler.py        |   56 +-
 python/aitemplate/compiler/dtype.py           |  136 +
 python/aitemplate/compiler/model.py           |   73 +-
 .../compiler/ops/attention/__init__.py        |    3 +-
 .../compiler/ops/attention/flash_attention.py |   10 +
 .../ops/attention/mem_eff_attention.py        |  179 ++
 .../compiler/ops/common/__init__.py           |    1 +
 .../compiler/ops/common/elementwise.py        |    6 +
 .../compiler/ops/common/epilogue.py           |    4 +
 .../compiler/ops/common/fused_elementwise.py  |    5 +-
 .../compiler/ops/common/int_elementwise.py    |  142 +
 python/aitemplate/compiler/ops/common/math.py |   16 +
 .../compiler/ops/common/view_ops.py           |  135 +-
 .../aitemplate/compiler/ops/conv/__init__.py  |    2 +
 .../compiler/ops/conv/cache_entry.py          |   65 +
 .../ops/conv/common_conv2d_bias_activation.py |    6 +
 .../conv/common_conv2d_bias_add_activation.py |    6 +
 python/aitemplate/compiler/ops/conv/conv2d.py |   18 +-
 .../compiler/ops/conv/conv2d_bias.py          |   10 +
 .../compiler/ops/conv/conv2d_bias_add.py      |    6 +
 .../ops/conv/conv2d_bias_add_hardswish.py     |    6 +
 .../compiler/ops/conv/conv2d_bias_add_relu.py |    6 +
 .../ops/conv/conv2d_bias_few_channels.py      |    8 +-
 .../ops/conv/conv2d_bias_hardswish.py         |    6 +
 .../conv2d_bias_hardswish_few_channels.py     |    6 +
 .../compiler/ops/conv/conv2d_bias_relu.py     |    6 +
 .../ops/conv/conv2d_bias_relu_few_channels.py |    8 +-
 .../compiler/ops/conv/conv2d_bias_sigmoid.py  |    6 +
 python/aitemplate/compiler/ops/conv/conv3d.py |  623 +++++
 .../compiler/ops/conv/depthwise_conv3d.py     |  290 ++
 .../conv/special_conv2d_bias_activation.py    |   13 +
 .../ops/gemm_epilogue_vistor/__init__.py      |   10 +-
 .../dual_gemm_rcr_fast_gelu.py                |   77 +
 .../dual_gemm_rcr_silu.py                     |   77 +
 .../compiler/ops/gemm_special/bmm_rcr_n1.py   |    1 -
 .../compiler/ops/gemm_universal/__init__.py   |    1 +
 .../ops/gemm_universal/bmm_rcr_permute.py     |   10 +-
 .../ops/gemm_universal/bmm_rrr_permute.py     |    9 +-
 .../ops/gemm_universal/bmm_softmax_bmm.py     |    3 +
 .../gemm_universal/bmm_softmax_bmm_permute.py |   11 +-
 .../ops/gemm_universal/gemm_common.py         |  291 +-
 .../compiler/ops/gemm_universal/gemm_rcr.py   |    1 -
 .../ops/gemm_universal/gemm_rcr_bias.py       |    1 -
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |    1 -
 .../ops/gemm_universal/gemm_rcr_bias_gelu.py  |    1 -
 .../gemm_universal/gemm_rcr_bias_hardswish.py |    1 -
 .../gemm_universal/gemm_rcr_bias_permute.py   |    9 +-
 .../ops/gemm_universal/gemm_rcr_bias_relu.py  |    1 -
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |    1 -
 .../ops/gemm_universal/gemm_rcr_bias_swish.py |    1 -
 .../ops/gemm_universal/gemm_rcr_bias_tanh.py  |    1 -
 .../ops/gemm_universal/gemm_rcr_fast_gelu.py  |   41 +
 .../ops/gemm_universal/gemm_rcr_permute.py    |   37 +-
 .../compiler/ops/gemm_universal/gemm_rrr.py   |    1 -
 .../gemm_universal/gemm_rrr_bias_permute.py   |    3 +-
 .../ops/gemm_universal/gemm_rrr_permute.py    |    8 +-
 .../ops/gemm_universal/group_gemm_rcr.py      |   13 +-
 .../ops/gemm_universal/group_gemm_rcr_bias.py |    5 +-
 .../group_gemm_rcr_bias_relu.py               |    1 -
 .../group_gemm_rcr_bias_sigmoid.py            |    1 -
 .../ops/gemm_universal/perm021fc_ccr.py       |    1 -
 .../ops/gemm_universal/perm021fc_ccr_bias.py  |    1 -
 .../perm021fc_ccr_bias_permute.py             |    4 +-
 .../ops/gemm_universal/perm102_bmm_rcr.py     |    1 -
 .../gemm_universal/perm102_bmm_rcr_bias.py    |    1 -
 .../ops/gemm_universal/perm102_bmm_rrr.py     |    1 -
 .../gemm_universal/perm102_bmm_rrr_bias.py    |    1 -
 .../compiler/ops/groupnorm/groupnorm.py       |   23 +-
 .../compiler/ops/layernorm/group_layernorm.py |    1 -
 .../layernorm/group_layernorm_sigmoid_mul.py  |    1 -
 .../compiler/ops/layernorm/layernorm.py       |   38 +-
 .../ops/layernorm/layernorm_sigmoid_mul.py    |   17 +-
 .../compiler/ops/padding/nhwc_pad_common.py   |    6 +
 .../compiler/ops/padding/pad_last_dim.py      |    3 +
 python/aitemplate/compiler/ops/pool/pool2d.py |   10 +
 .../compiler/ops/reduce/reduce_common.py      |   92 +-
 python/aitemplate/compiler/ops/reduce/var.py  |    8 +
 .../compiler/ops/reduce/vector_norm.py        |    8 +
 .../compiler/ops/softmax/softmax.py           |   30 +-
 .../aitemplate/compiler/ops/tensor/argmax.py  |   13 +-
 .../compiler/ops/tensor/concatenate.py        |    5 -
 .../aitemplate/compiler/ops/tensor/permute.py |   44 +-
 .../ops/tensor/slice_reshape_scatter.py       |   13 +-
 .../compiler/ops/tensor/slice_scatter.py      |   15 +-
 python/aitemplate/compiler/ops/tensor/topk.py |   25 +-
 .../ops/upsample/upsampling_common.py         |    6 +
 .../ops/vision_ops/nms/batched_nms.py         |    6 +
 .../ops/vision_ops/nms/efficient_nms.py       |   20 +-
 .../compiler/ops/vision_ops/nms/nms.py        |   20 +-
 .../roi_ops/multi_level_roi_align.py          |    6 +
 .../ops/vision_ops/roi_ops/roi_ops.py         |   17 +
 python/aitemplate/compiler/public/__init__.py |    4 +
 python/aitemplate/compiler/stable_set.py      |  100 +
 python/aitemplate/compiler/tensor_accessor.py |    2 +-
 .../aitemplate/compiler/transform/__init__.py |    3 +-
 .../compiler/transform/apply_padding.py       |    2 +-
 .../compiler/transform/constant_folding.py    |   16 +-
 .../compiler/transform/fuse_group_ops.py      |    2 +-
 .../transform/fuse_mm_elementwise_patterns.py |   16 +
 .../transform/fuse_mm_reshape_permute.py      |  189 ++
 .../aitemplate/compiler/transform/fuse_ops.py |   12 +
 .../transform/fuse_permute_bmm_and_gemm.py    |  246 ++
 .../compiler/transform/fuse_split.py          |    4 +-
 .../compiler/transform/name_graph.py          |   13 +-
 .../compiler/transform/optimize_graph.py      |    9 +-
 .../aitemplate/compiler/transform/profile.py  |   90 +-
 .../compiler/transform/profile_dynamic_dim.py |   35 +-
 .../transform/split_large_concat_ops.py       |  124 +
 .../transform/transform_memory_ops.py         |   16 +-
 .../transform_strided_op_and_view_op.py       |    3 +-
 .../transform/transform_strided_ops.py        |   14 +-
 .../compiler/transform/transform_utils.py     |    6 +-
 python/aitemplate/frontend/nn/__init__.py     |    5 +-
 python/aitemplate/frontend/nn/attention.py    |  189 +-
 .../aitemplate/frontend/nn/conv2d/conv2d.py   |    1 +
 .../frontend/nn/conv2d/conv2d_bias.py         |   41 +
 .../nn/conv2d/conv2d_bias_add_hardswish.py    |   21 +
 .../nn/conv2d/conv2d_bias_add_relu.py         |   21 +
 .../nn/conv2d/conv2d_bias_few_channels.py     |    7 +-
 .../nn/conv2d/conv2d_bias_hardswish.py        |    2 +
 .../conv2d_bias_hardswish_few_channels.py     |    7 +-
 .../frontend/nn/conv2d/conv2d_bias_relu.py    |    2 +
 .../conv2d/conv2d_bias_relu_few_channels.py   |    7 +-
 .../frontend/nn/conv2d/conv2d_bias_sigmoid.py |    2 +
 .../nn/conv2d/transposed_conv2d_bias.py       |   49 +-
 .../nn/conv2d/transposed_conv2d_bias_relu.py  |    2 +
 python/aitemplate/frontend/nn/dropout.py      |    5 +
 python/aitemplate/frontend/nn/dual_gemm.py    |   72 +
 python/aitemplate/frontend/nn/embedding.py    |   11 +
 python/aitemplate/frontend/nn/identity.py     |    2 +
 python/aitemplate/frontend/nn/linear.py       |   37 +
 python/aitemplate/frontend/nn/padding.py      |    2 +
 python/aitemplate/frontend/nn/pool2d.py       |   50 +
 python/aitemplate/frontend/nn/roi_ops.py      |   71 +
 python/aitemplate/frontend/nn/upsample.py     |   21 +
 python/aitemplate/frontend/nn/view_ops.py     |   25 +
 python/aitemplate/testing/detect_target.py    |   10 +-
 python/aitemplate/utils/__init__.py           |    1 +
 python/aitemplate/utils/alignment.py          |   36 +
 python/aitemplate/utils/graph_utils.py        |   20 +-
 python/aitemplate/utils/logger.py             |   20 +
 python/aitemplate/utils/mk_ck_lib/__init__.py |   18 +
 .../aitemplate/utils/mk_ck_lib/generator.py   |    2 -
 .../utils/mk_cutlass_lib/extra_enum.py        |   10 +-
 .../utils/mk_cutlass_lib/extra_gemm_emit.py   |  136 +
 .../utils/serialization/ait_program.py        |   90 +
 .../utils/serialization/serdes_code.py        |  393 +++
 python/aitemplate/utils/torch_utils.py        |    1 +
 python/aitemplate/utils/visualization/plot.py |   76 +-
 python/setup.py                               |   32 +-
 static/csrc/debug_utility.cpp                 |   80 +
 static/csrc/model_container.cpp               |   16 +-
 static/csrc/model_interface.cpp               |   83 +-
 static/include/debug_utility.h                |   30 +
 static/include/model_container.h              |   20 +-
 static/include/model_interface.h              |   46 +-
 static/include/raii_wrapper.h                 |   29 +-
 tests/unittest/backend/test_cuda_graph.py     |   79 +
 tests/unittest/backend/test_model_api.py      |   44 +-
 tests/unittest/backend/test_profiler.py       |   77 +
 .../unittest/benchmark/test_gemm_benchmark.py |  321 +++
 .../compiler/test_constant_folding.py         |    6 +-
 .../compiler/test_fuse_mm_elementwise.py      |   26 +
 .../compiler/test_fuse_mm_reshape_permute.py  |  125 +
 .../compiler/test_fuse_permute_gemm.py        |   86 +
 ...st_fused_elementwise_complex_dependency.py |   86 +
 tests/unittest/compiler/test_group_fusions.py |    5 +
 .../test_pad_gemm_with_elementwise.py         |   45 +
 .../compiler/test_split_large_concat.py       |  462 ++++
 .../compiler/test_strided_group_gemm.py       |    5 +-
 .../compiler/test_strided_op_cat_pattern.py   |  226 +-
 .../compiler/test_strided_reshape_cat.py      |    5 +-
 tests/unittest/compiler/test_tensor.py        |   54 +
 .../unittest/compiler/test_transform_utils.py |    3 +-
 tests/unittest/ops/test_activation.py         |   93 +-
 tests/unittest/ops/test_argmax.py             |   12 +-
 tests/unittest/ops/test_attention.py          |  336 ++-
 tests/unittest/ops/test_bmm_add.py            |    2 +-
 tests/unittest/ops/test_bmm_permute.py        |   16 +-
 tests/unittest/ops/test_bmm_rcr_n1.py         |    2 +-
 tests/unittest/ops/test_bmm_rrr_k1_tanh.py    |    2 +-
 tests/unittest/ops/test_bmm_softmax.py        |    3 +
 tests/unittest/ops/test_bmm_softmax_bmm.py    |   23 +-
 tests/unittest/ops/test_concatenate_tanh.py   |   10 +-
 tests/unittest/ops/test_conv.py               |   10 +-
 tests/unittest/ops/test_conv2d_bias_add.py    |    8 +-
 tests/unittest/ops/test_conv3d.py             |   89 +
 tests/unittest/ops/test_conv_bias.py          |    8 +-
 .../ops/test_conv_bias_act_few_channels.py    |   16 +-
 .../ops/test_conv_bias_add_hardswish.py       |    8 +-
 tests/unittest/ops/test_conv_bias_add_relu.py |    8 +-
 .../unittest/ops/test_conv_bias_hardswish.py  |    8 +-
 tests/unittest/ops/test_conv_bias_relu.py     |    8 +-
 tests/unittest/ops/test_conv_bias_sigmoid.py  |    8 +-
 tests/unittest/ops/test_cross_attention.py    |  133 +
 tests/unittest/ops/test_depthwise_conv3d.py   |  123 +
 tests/unittest/ops/test_dual_gemm.py          |  193 ++
 tests/unittest/ops/test_dynamic_conv.py       |   96 +-
 tests/unittest/ops/test_efficient_nms.py      |   24 +-
 tests/unittest/ops/test_fpn_roi_align.py      |   14 +
 tests/unittest/ops/test_fused_elementwise.py  |  337 ++-
 tests/unittest/ops/test_gemm.py               |   46 +
 tests/unittest/ops/test_gemm_bias.py          |    3 +-
 .../unittest/ops/test_gemm_bias_broadcast.py  |    2 +
 .../unittest/ops/test_gemm_bias_hardswish.py  |    2 +-
 tests/unittest/ops/test_gemm_bias_permute.py  |   26 +-
 tests/unittest/ops/test_gemm_bias_relu.py     |    2 +-
 tests/unittest/ops/test_gemm_bias_sigmoid.py  |    2 +-
 tests/unittest/ops/test_gemm_bias_softmax.py  |    5 +
 tests/unittest/ops/test_gemm_bias_swish.py    |    2 +-
 tests/unittest/ops/test_gemm_bias_tanh.py     |    2 +-
 tests/unittest/ops/test_gemm_permute.py       |  119 +-
 .../ops/test_gemm_rcr_bias_fast_gelu.py       |    2 +-
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py |   91 +
 tests/unittest/ops/test_gemm_rrr_small_nk.py  |    2 +-
 tests/unittest/ops/test_gemm_softmax.py       |    7 +-
 tests/unittest/ops/test_group_gemm_rcr.py     |    4 +-
 .../unittest/ops/test_group_gemm_rcr_bias.py  |    2 +-
 .../test_group_gemm_rcr_bias_activation.py    |    2 +-
 .../ops/test_group_gemm_rcr_bias_cat.py       |    2 +-
 tests/unittest/ops/test_group_gemm_rcr_cat.py |    2 +-
 tests/unittest/ops/test_groupnorm.py          |   23 +-
 .../test_int_elementwise_dynamic_reshape.py   |  114 +
 tests/unittest/ops/test_nms.py                |   29 +-
 tests/unittest/ops/test_norm.py               |   13 +
 tests/unittest/ops/test_pad_last_dim.py       |   10 +-
 tests/unittest/ops/test_permute.py            |   34 +-
 tests/unittest/ops/test_reduce.py             |   85 +-
 tests/unittest/ops/test_size_getitem_ops.py   |   11 +-
 tests/unittest/ops/test_topk.py               |   33 +-
 tests/unittest/ops/test_transpose_conv2d.py   |    8 +-
 .../ops/test_transpose_conv2d_bias.py         |    8 +-
 tests/unittest/ops/test_var.py                |   13 +
 tests/unittest/test_stable_set.py             |   68 +
 tests/unittest/util/test_debug_utils.py       |  138 +
 tests/unittest/util/test_serdes.py            |  290 ++
 463 files changed, 19731 insertions(+), 3247 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/attention/mem_eff_attention.py
 create mode 100644 python/aitemplate/backend/cuda/conv3d/__init__.py
 create mode 100644 python/aitemplate/backend/cuda/conv3d/common.py
 create mode 100644 python/aitemplate/backend/cuda/conv3d/conv3d.py
 create mode 100644 python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
 create mode 100644 python/aitemplate/backend/cuda/elementwise/int_elementwise.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
 create mode 100644 python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
 create mode 100644 python/aitemplate/backend/cuda/tensor/permute.cuh
 create mode 100644 python/aitemplate/backend/cuda/tensor/permute.py
 create mode 100644 python/aitemplate/compiler/dtype.py
 create mode 100644 python/aitemplate/compiler/ops/attention/mem_eff_attention.py
 create mode 100644 python/aitemplate/compiler/ops/common/int_elementwise.py
 create mode 100644 python/aitemplate/compiler/ops/conv/conv3d.py
 create mode 100644 python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
 create mode 100644 python/aitemplate/compiler/stable_set.py
 create mode 100644 python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
 create mode 100644 python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
 create mode 100644 python/aitemplate/compiler/transform/split_large_concat_ops.py
 create mode 100644 python/aitemplate/frontend/nn/dual_gemm.py
 create mode 100644 python/aitemplate/utils/alignment.py
 create mode 100644 python/aitemplate/utils/mk_ck_lib/__init__.py
 create mode 100644 python/aitemplate/utils/serialization/ait_program.py
 create mode 100644 python/aitemplate/utils/serialization/serdes_code.py
 create mode 100644 static/csrc/debug_utility.cpp
 create mode 100644 static/include/debug_utility.h
 create mode 100644 tests/unittest/backend/test_cuda_graph.py
 create mode 100644 tests/unittest/backend/test_profiler.py
 create mode 100644 tests/unittest/benchmark/test_gemm_benchmark.py
 create mode 100644 tests/unittest/compiler/test_fuse_mm_reshape_permute.py
 create mode 100644 tests/unittest/compiler/test_fuse_permute_gemm.py
 create mode 100644 tests/unittest/compiler/test_split_large_concat.py
 create mode 100644 tests/unittest/compiler/test_tensor.py
 create mode 100644 tests/unittest/ops/test_conv3d.py
 create mode 100644 tests/unittest/ops/test_cross_attention.py
 create mode 100644 tests/unittest/ops/test_depthwise_conv3d.py
 create mode 100644 tests/unittest/ops/test_dual_gemm.py
 create mode 100644 tests/unittest/ops/test_gemm_rcr_fast_gelu.py
 create mode 100644 tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
 create mode 100644 tests/unittest/test_stable_set.py
 create mode 100644 tests/unittest/util/test_debug_utils.py
 create mode 100644 tests/unittest/util/test_serdes.py

diff --git a/.gitmodules b/.gitmodules
index 2aeb63ba5..a82a39064 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
-	url = https://github.com/NVIDIA/cutlass.git
+	url = https://github.com/AITemplate/cutlass.git
 [submodule "3rdparty/cub"]
 	path = 3rdparty/cub
 	url = https://github.com/NVIDIA/cub.git
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index dadc881a9..f434be22a 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit dadc881a9606f95cba1b20acda03c9d07c286239
+Subproject commit f434be22a6270f9f000712286f92545ccca045b7
diff --git a/examples/03_bert/benchmark_mi250.sh b/examples/03_bert/benchmark_mi250.sh
index dab4ae50c..4bacb3407 100644
--- a/examples/03_bert/benchmark_mi250.sh
+++ b/examples/03_bert/benchmark_mi250.sh
@@ -4,8 +4,8 @@
 HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py
 
 #1GCD
-HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$1"
 
 #2GCD
-HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1 &
-HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $1 && fg
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$1" &
+HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size "$1" && fg
diff --git a/examples/05_stable_diffusion/benchmark_pt.py b/examples/05_stable_diffusion/benchmark_pt.py
index 13b8738cc..05c65e9bf 100644
--- a/examples/05_stable_diffusion/benchmark_pt.py
+++ b/examples/05_stable_diffusion/benchmark_pt.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
 import click
 import torch
 
diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py
index e91af3bb9..9c87f4155 100644
--- a/examples/05_stable_diffusion/compile.py
+++ b/examples/05_stable_diffusion/compile.py
@@ -321,7 +321,9 @@ def compile_vae(
 @click.option("--batch-size", default=1, help="batch size")
 @click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
 @click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
-def compile_diffusers(token, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True):
+def compile_diffusers(
+    token, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
+):
     logging.getLogger().setLevel(logging.INFO)
     np.random.seed(0)
     torch.manual_seed(4896)
@@ -344,7 +346,11 @@ def compile_diffusers(token, width, height, batch_size, use_fp16_acc=True, conve
     hh = height // 8
 
     # CLIP
-    compile_clip(batch_size=batch_size, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
+    compile_clip(
+        batch_size=batch_size,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+    )
     # UNet
     compile_unet(
         batch_size=batch_size * 2,
@@ -354,7 +360,13 @@ def compile_diffusers(token, width, height, batch_size, use_fp16_acc=True, conve
         convert_conv_to_gemm=convert_conv_to_gemm,
     )
     # VAE
-    compile_vae(batch_size=batch_size, width=ww, height=hh, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
+    compile_vae(
+        batch_size=batch_size,
+        width=ww,
+        height=hh,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+    )
 
 
 if __name__ == "__main__":
diff --git a/examples/05_stable_diffusion/demo_img2img.py b/examples/05_stable_diffusion/demo_img2img.py
index 844aac726..569a713ed 100644
--- a/examples/05_stable_diffusion/demo_img2img.py
+++ b/examples/05_stable_diffusion/demo_img2img.py
@@ -17,9 +17,9 @@
 import click
 import requests
 import torch
-from PIL import Image
 
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from PIL import Image
 from pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
 
 
diff --git a/examples/05_stable_diffusion/modeling/attention.py b/examples/05_stable_diffusion/modeling/attention.py
index efabc3c0c..14993e6d9 100644
--- a/examples/05_stable_diffusion/modeling/attention.py
+++ b/examples/05_stable_diffusion/modeling/attention.py
@@ -69,6 +69,7 @@ def __init__(
             self.num_heads,
             qkv_bias=True,
             has_residual=True,
+            use_mem_eff=True,
         )
         self.rescale_output_factor = rescale_output_factor
 
diff --git a/examples/05_stable_diffusion/modeling/clip.py b/examples/05_stable_diffusion/modeling/clip.py
index c66ecfb90..f9687d64a 100644
--- a/examples/05_stable_diffusion/modeling/clip.py
+++ b/examples/05_stable_diffusion/modeling/clip.py
@@ -85,14 +85,12 @@ def forward(self, x, context=None, mask=None, residual=None):
         )
 
         if USE_CUDA:
-            q = q * self.scale
-            attn = ops.bmm_rcr()(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
             )
-            attn = ops.softmax()(attn, -1)
-            v = ops.reshape()(v, [bs * nheads, -1, d])
-            out = ops.bmm_rrr_permute((nheads,))(attn, v)
         else:
             OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
             out = OP(
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
index d6c75ab05..251326b55 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
@@ -12,6 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
+# flakes8: noqa
 import inspect
 import os
 from typing import List, Optional, Union
@@ -346,7 +348,7 @@ def __call__(
             if isinstance(self.scheduler, LMSDiscreteScheduler):
                 sigma = self.scheduler.sigmas[t_index]
                 # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
-                latent_model_input = latent_model_input / ((sigma ** 2 + 1) ** 0.5)
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
                 latent_model_input = latent_model_input.to(self.unet.dtype)
                 t = t.to(self.unet.dtype)
 
diff --git a/python/aitemplate/__init__.py b/python/aitemplate/__init__.py
index ed1d8a72e..9adca1347 100644
--- a/python/aitemplate/__init__.py
+++ b/python/aitemplate/__init__.py
@@ -12,8 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
-import os
 import sys
 
 from . import backend, compiler, frontend, testing, utils
@@ -25,18 +23,4 @@
 
 __all__ = ["backend", "compiler", "frontend", "testing", "utils"]
 
-root_logger = logging.getLogger(__name__)
-info_handle = logging.StreamHandler()
-formatter = logging.Formatter("%(asctime)s %(levelname)s <%(name)s> %(message)s")
-info_handle.setFormatter(formatter)
-root_logger.addHandler(info_handle)
-root_logger.propagate = False
-
-DEFAULT_LOGLEVEL = logging.getLogger().level
-log_level_str = os.environ.get("LOGLEVEL", None)
-LOG_LEVEL = (
-    getattr(logging, log_level_str.upper())
-    if log_level_str is not None
-    else DEFAULT_LOGLEVEL
-)
-root_logger.setLevel(LOG_LEVEL)
+root_logger = utils.logger.setup_logger(__name__)
diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 44daa1f3c..62fd07ade 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -26,8 +26,24 @@
 from .target import Target
 
 
-@dataclass
 class BackendSpec:
+    pass
+
+
+@dataclass
+class CPUBackendSpec(BackendSpec):
+    func_enum_to_func_name: Dict[FuncEnum, str] = field(
+        default_factory=lambda: {
+            FuncEnum.ADD: "+",
+            FuncEnum.SUB: "-",
+            FuncEnum.MUL: "*",
+            FuncEnum.DIV: "/",
+        }
+    )
+
+
+@dataclass
+class GPUBackendSpec(BackendSpec):
     dtype_to_backend_fp16_dtype: Dict[str, str] = field(
         default_factory=lambda: {
             "float16": "half",
@@ -70,7 +86,6 @@ class BackendSpec:
             "float",
         ]
     )
-
     func_enum_to_func_name: Dict[FuncEnum, Dict[str, str]] = field(
         default_factory=lambda: {
             FuncEnum.ADD: {
@@ -174,6 +189,24 @@ class BackendSpec:
                 "half": "hsilu",
                 "float": "fsilu",
             },
+            FuncEnum.POW: {
+                "half2": "h2pow",
+                "half": "hpow",
+                "float": "fpow",
+            },
+            FuncEnum.GELU: {
+                "half": "hgelu",
+                "float": "fgelu",
+            },
+            FuncEnum.FASTGELU: {
+                "half": "h_fast_gelu",
+                "float": "f_fast_gelu",
+            },
+            FuncEnum.SOFTPLUS: {
+                "half2": "h2softplus",
+                "half": "hsoftplus",
+                "float": "fsoftplus",
+            },
         }
     )
 
@@ -183,10 +216,10 @@ def get_backend_type(
         dtype: str,
         num_elements_to_backend_type_list: List[Tuple[int, str]],
     ) -> str:
-        if dtype != "float16":
+        if dtype not in ("float16", "float"):
             raise NotImplementedError("Unsupported dtype {}!".format(dtype))
-        for num, backend_type in num_elements_to_backend_type_list:
-            if num_elements % num == 0:
+        for alignment, backend_type in num_elements_to_backend_type_list:
+            if num_elements % alignment == 0:
                 return backend_type
         raise RuntimeError(
             "Failed to infer data type! num_elements: {}, num_elements_to_backend_type_list: {}".format(
@@ -216,9 +249,12 @@ def get_fp16_dtype(self, dtype: str):
     def dtype_to_backend_type(self, dtype: str):
         return self.get_dtype_to_dtype(dtype, self.dtype_to_backend_dtype)
 
+    def dtype_to_lib_type(self, dtype: str):
+        raise NotImplementedError
+
 
 @dataclass
-class ROCMSpec(BackendSpec):
+class ROCMSpec(GPUBackendSpec):
     backend_name = "rocm"
     index_type = "int64_t"
     prefix = "hip"
@@ -250,7 +286,7 @@ def dtype_to_lib_type(self, dtype: str):
 
 
 @dataclass
-class CUDASpec(BackendSpec):
+class CUDASpec(GPUBackendSpec):
     backend_name = "cuda"
     index_type = "int64_t"
     prefix = "cuda"
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 80699a79b..bd0b8c4eb 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -23,6 +23,8 @@
 import os
 import pathlib
 import re
+import shlex
+import subprocess
 import typing
 from typing import Optional
 
@@ -35,6 +37,30 @@
 # pylint: disable=W0221,C0103
 
 
+def _run_make_cmds(cmds, timeout):
+    logger.debug(__name__, f"make {cmds=}")
+    proc = subprocess.Popen(
+        [" && ".join(cmds)],
+        shell=True,
+        env=os.environ.copy(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    try:
+        out, err = proc.communicate(timeout)
+    except subprocess.TimeoutExpired as e:
+        proc.kill()
+        out, err = proc.communicate()
+        raise e
+    finally:
+        if proc.returncode != 0:
+            # Let's always print out more info upon any failures.
+            logger_f = logger.info
+        else:
+            logger_f = logger.debug
+        logger_f(__name__, f"make stdout: {out.decode()}\nmake stderr: {err.decode()}")
+
+
 def process_task(task: Task) -> None:
     """This function extracts stdout and stderr from a finished task.
     If the task process return code is not 0, will mark the task as
@@ -156,6 +182,8 @@ def __init__(self, n_jobs: int = -1, timeout: int = 180) -> None:
         if num_builder is not None:
             n_jobs = int(num_builder)
         self._runner = Runner(n_jobs, timeout)
+        self._n_jobs = n_jobs
+        self._timeout = timeout
 
     def build_objs(
         self,
@@ -250,14 +278,17 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
 %.obj : %.bin
     {{bfile_cmd}}
 
-.PHONY: all
+.PHONY: all clean clean_constants
 all: {{target}}
 
 {{target}}: $(obj_files)
     $(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)
 
 clean:
-    rm -f *.obj test.so
+    rm -f *.obj {{target}} test.so
+
+clean_constants:
+    rm -f constants.bin
 """
         )
 
@@ -293,3 +324,70 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
         with open(dumpfile, "w+") as f:
             # fix the makefile indentation
             f.write(re.sub("^    ", "\t", makefile_str, flags=re.M))
+
+    def _gen_makefile_for_profilers(self, file_pairs, profiler_dir):
+        makefile_template = jinja2.Template(
+            """
+programs = {{programs}}
+all: $(programs)
+.PHONY: all clean
+
+$(programs): %: %.{{cpp}}
+    {{cc_cmd}}
+
+clean:
+    rm -f $(programs)
+"""
+        )
+        program_relative_paths = sorted(
+            {f[1].split(os.path.join(profiler_dir, ""))[-1] for f in file_pairs}
+        )
+        logger.info(__name__, f"compiling {len(program_relative_paths)} profiler srcs")
+        programs = " ".join(program_relative_paths)
+        cc_cmd = Target.current().compile_cmd(True).format(target="$@", src="$<")
+        makefile_str = makefile_template.render(
+            cpp="cu",
+            programs=programs,
+            cc_cmd=cc_cmd,
+        )
+
+        dumpfile = os.path.join(profiler_dir, "Makefile")
+        with open(dumpfile, "w+") as f:
+            # fix the makefile indentation
+            f.write(re.sub("^    ", "\t", makefile_str, flags=re.M))
+
+    def make_profilers(self, generated_profilers, workdir):
+        file_pairs = [f for gp in generated_profilers for f in gp]
+        if not file_pairs:
+            return
+        build_dir = shlex.quote(os.path.join(workdir, "profiler"))
+        self._gen_makefile_for_profilers(file_pairs, build_dir)
+        make_path = shlex.quote(Target.current().make())
+        make_flags = " ".join(
+            [
+                "--output-sync",
+                f"-C {build_dir}",
+            ]
+        )
+        make_clean_cmd = f" {make_path} {make_flags} clean "
+        make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
+        cmds = [make_clean_cmd, make_all_cmd]
+        _run_make_cmds(cmds, self._timeout)
+
+    def make(self, file_pairs, dll_name, workdir, test_name):
+        self.gen_makefile(file_pairs, dll_name, workdir, test_name)
+        make_path = shlex.quote(Target.current().make())
+        build_dir = shlex.quote(os.path.join(workdir, test_name))
+        make_flags = " ".join(
+            [
+                "--output-sync",
+                f"-C {build_dir}",
+            ]
+        )
+        make_clean_cmd = f" {make_path} {make_flags} clean "
+        make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
+        make_clean_constants_cmd = f" {make_path} {make_flags} clean_constants "
+        cmds = [make_clean_cmd, make_all_cmd]
+        if not logger.is_debug():
+            cmds.append(make_clean_constants_cmd)
+        _run_make_cmds(cmds, self._timeout)
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index fcd806882..6ad72b854 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -28,11 +28,13 @@
 
 from aitemplate.backend.main_templates import MODEL_CONTAINER_TEMPLATE, MODEL_TEMPLATE
 from aitemplate.compiler.base import Operator
+from aitemplate.compiler.dtype import dtype_to_enumerator, get_dtype_size
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 from aitemplate.compiler.transform.memory_planning import Workspace
+from aitemplate.utils import logger
 
-from ..compiler.base import get_dtype_size, IntImm, IntVar, Tensor
+from ..compiler.base import IntImm, IntVar, IntVarTensor, Tensor
 from . import registry
 from .target import Target
 
@@ -44,6 +46,7 @@
     "int": "int32_t*",
     "int32": "int32_t*",
     "int64": "int64_t*",
+    "bool": "bool*",
 }
 
 
@@ -61,10 +64,12 @@ def gen_profiler(sorted_graph: list[Tensor], workdir: str, dynamic_profiling_str
         Pass-through to gen_profiler kernels of nodes in the graph.
         See also: :func:`~aitemplate.compiler.transform.profile.profile`
     """
+    results = []
     for node in sorted_graph:
         for func in node.src_ops():
             if "has_profiler" in func._attrs and func._attrs["has_profiler"]:
-                func.gen_profiler(workdir, dynamic_profiling_strategy)
+                results.append(func.gen_profiler(workdir, dynamic_profiling_strategy))
+    return results
 
 
 def gen_function_src(
@@ -100,6 +105,7 @@ def gen_function_src(
                 with open(src_path, "w") as fo:
                     fo.write(func.gen_function())
                 exist_func.add(fname)
+    logger.info(__name__, f"generated {len(file_pairs)} function srcs")
     return file_pairs
 
 
@@ -171,22 +177,6 @@ def set_value_from_map(map_name: Any, var_name: Any, indent: str = "    ") -> st
     return f'{indent}{value} = static_cast<decltype({value})>({map_name}["{key}"]);'
 
 
-def dtype_to_enumerator(dtype):
-    def _impl(dtype):
-        if dtype == "float16":
-            return "kHalf"
-        elif dtype == "float32" or dtype == "float":
-            return "kFloat"
-        elif dtype == "int32" or dtype == "int":
-            return "kInt"
-        elif dtype == "int64":
-            return "kLong"
-        else:
-            raise AssertionError(f"unknown dtype {dtype}")
-
-    return f"AITemplateDtype::{_impl(dtype)}"
-
-
 def count_inputs_outputs(graph):
     n_inputs = n_outputs = 0
     for node in graph:
@@ -217,7 +207,7 @@ def check_not_null(
     if tensor_idx is None:
         check = name
     else:
-        check = f"params[{tensor_idx}].ptr"
+        check = f"params_[{tensor_idx}].ptr"
 
     shape = ["1"]
     lower_bound_is_zero = False
@@ -249,7 +239,7 @@ def check_not_null(
 
 def device_copy(dst_tensor: Tensor, src_tensor: Tensor, dst_idx: int) -> str:
     src_name = src_tensor._attrs["name"]
-    dst_ptr = f"params[{dst_idx}].ptr"
+    dst_ptr = f"params_[{dst_idx}].ptr"
     shape = ["1"]
     for dim in dst_tensor._attrs["shape"]:
         if isinstance(dim, IntImm):
@@ -271,10 +261,12 @@ def __init__(
         num_outputs: int,
         constants_data_file: io.BytesIO,
         output_name_to_idx: Dict[str, int],
+        check_all_nan_and_inf: bool = False,
+        check_all_outputs: bool = False,
     ):
         self.target = Target.current()
         self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl")
-        self.f_ptr_decl = registry.get(self.target.name() + ".lib.ptr_decl")
+        self.f_ptr_decl = registry.get(self.target.name() + ".lib.void_ptr_decl")
 
         self.constants_data_file = constants_data_file
 
@@ -321,6 +313,12 @@ def __init__(
             num_outputs,
         )
 
+        self.check_all_nan_and_inf = check_all_nan_and_inf
+        self.check_all_outputs = check_all_outputs
+
+        # This records whether or not we should debug header.
+        self.debug_header = False
+
     def _tensor_slice_func(
         self,
         node: Tensor,
@@ -351,7 +349,7 @@ def max_value(var_or_imm):
             for dim in tensor._attrs["shape"]
         )
         self.set_up_param_dynamic_shapes.append(
-            set_value(f"params[{idx}].shape_ptrs", f"{{{param_shape_init}}}")
+            set_value(f"params_[{idx}].shape_ptrs", f"{{{param_shape_init}}}")
         )
         name = tensor._attrs["name"]
         self.set_up_param_names.append(set_value(f"param_names_[{idx}]", f'"{name}"'))
@@ -384,7 +382,7 @@ def _codegen_param_setup(
             self.owned_constants_init.append(constant_info)
             self.constants_data_size += num_bytes
             self.num_constants += 1
-        else:
+        elif not isinstance(tensor, IntVarTensor):
             # Unbound constant. We will expect the user to set this via SetConstant.
             self.set_up_constant_names.append(
                 set_value(
@@ -393,7 +391,8 @@ def _codegen_param_setup(
                 )
             )
             self._record_param_tensor_info(
-                tensor, self.unbound_constant_idx + self.num_inputs + self.num_outputs
+                tensor,
+                self.unbound_constant_idx + self.num_inputs + self.num_outputs,
             )
             self.unbound_constant_idx += 1
             self.set_inputs.append(check_not_null(tensor))
@@ -413,7 +412,7 @@ def _codegen_input_tensor(self, tensor: Tensor) -> None:
         self.set_inputs.append(
             set_value(
                 name,
-                f"static_cast<decltype({name})>(params[{self.input_idx}].ptr)",
+                f"static_cast<decltype({name})>(params_[{self.input_idx}].ptr)",
             )
         )
         self.set_inputs.append(check_not_null(tensor))
@@ -444,7 +443,7 @@ def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(
                 set_value(
                     name,
-                    f"static_cast<decltype({name})>(params[{ptr_idx}].ptr)",
+                    f"static_cast<decltype({name})>(params_[{ptr_idx}].ptr)",
                 )
             )
 
@@ -488,7 +487,7 @@ def _codegen_output_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(
                 set_value(
                     name,
-                    f"static_cast<decltype({name})>(params[{self.input_idx}].ptr)",
+                    f"static_cast<decltype({name})>(params_[{self.input_idx}].ptr)",
                 )
             )
             self._record_param_tensor_info(tensor, self.input_idx)
@@ -525,6 +524,9 @@ def _process_dims_for_op(self, node: Operator) -> None:
 
     def _process_src_ops(self, node: Tensor) -> None:
         funcs = node.src_ops()
+        if len(funcs) == 0:
+            return
+
         for func in funcs:
             f_func_decl = registry.get(
                 ".".join((self.target.name(), func._attrs["op"], "func_decl"))
@@ -550,12 +552,36 @@ def _process_src_ops(self, node: Tensor) -> None:
                     self.state_record.add(func._attrs["name"])
             self._process_dims_for_op(func)
 
+        if self.check_all_nan_and_inf or node._attrs.get("check_nan_and_inf", False):
+            self._append_check_nan_and_inf(node)
+        if self.check_all_outputs or node._attrs.get("check_outputs", False):
+            self._append_check_outputs(node)
+
+    def _append_check_nan_and_inf(self, node: Tensor):
+        self.debug_header = True
+        tensor_name = node._attrs["name"]
+        elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
+        self.func_seq.append(
+            f'    InvokeInfAndNanChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
+        )
+
+    def _append_check_outputs(self, node: Tensor):
+        self.debug_header = True
+        tensor_name = node._attrs["name"]
+        elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
+        self.func_seq.append(
+            f'    InvokeOutputsChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
+        )
+
     def append_tensor(self, node: Tensor) -> None:
         if node._attrs["nop"]:
             return
         name = node._attrs["name"]
         dtype = node._attrs["dtype"]
-        self.tensor_decl.append(self.f_ptr_decl(name=name, dtype=dtype))
+        if isinstance(node, IntVarTensor):
+            self.tensor_decl.append(self.f_var_decl(name=name))
+        else:
+            self.tensor_decl.append(self.f_ptr_decl(name=name, dtype=dtype))
 
         is_param = node._attrs["is_param"]
         is_output = node._attrs["is_output"]
@@ -576,14 +602,14 @@ def append_tensor(self, node: Tensor) -> None:
         elif has_output_aliases:
             # Special case: internal tensor that aliases an output.
             self._codegen_output_aliases_tensor(node)
-        elif not is_view:
+        elif not is_view and not isinstance(node, IntVarTensor):
             # Normal, internal tensor that is not a view: point it to the
             # internal blob of memory
             assert (
                 node._attrs["offset"] >= 0
             ), f"Non-parameter node '{name}' must have non-negative offset"
             self.tensor_slice.append(self._tensor_slice_func(node, "blob_ptr"))
-        else:
+        elif not isinstance(node, IntVarTensor):
             # Normal view, point it to the same memory as whatever it
             # aliases
             self.set_inputs.append(set_value(name, view._attrs["name"]))
@@ -621,6 +647,7 @@ def generate_source(self) -> Dict[str, str]:
             function_state="\n".join(self.function_state),
             target_has_graph_mode=target_has_graph_mode,
             unique_workspace_size=self.workspace.unique_size,
+            debug_header=self.debug_header,
         )
 
         result["model-generated.h"] = model_def
@@ -678,6 +705,8 @@ def gen_library_src(  # noqa: C901
     workdir: str,
     output_tensors: List[Tensor],
     model_name: str = "",
+    check_all_nan_and_inf: bool = False,
+    check_all_outputs: bool = False,
 ) -> list[Tuple[str, str]]:
     """Generate model driver source code files for the given graph
 
@@ -722,6 +751,8 @@ def to_obj_name(name: str):
         num_outputs,
         constants_data_file,
         output_name_to_index,
+        check_all_nan_and_inf,
+        check_all_outputs,
     )
     for node in sorted_graph:
         model_container_generator.append_tensor(node)
@@ -741,4 +772,5 @@ def to_obj_name(name: str):
     for fname in sources:
         to_build.append((fname, to_obj_name(fname)))
 
+    logger.info(__name__, f"generated {len(to_build)} library srcs")
     return to_build
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index 99f24bb03..001afe0ac 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -22,9 +22,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-    {{elem_output_type}} * /*output*/,
+    void * /*output*/,
     {{index_type}} *[] /*output_shape*/,
-    const {{elem_input_type}} *[] /*inputs*/,
+    const void *[] /*inputs*/,
     const {{index_type}} *[], /* real_input_shapes, representing shapes of those inputs
                                  whose masks are False, i.e. inputs that will be
                                  copied to the output tensor by concat.*/
@@ -161,7 +161,7 @@
 
   constexpr unsigned read_t_sz = sizeof(READ_T);
   constexpr unsigned elem_t_sz = sizeof(ELEM_T);
-  assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
+  static_assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
   constexpr INDEX_T n_of_elem_t = read_t_sz / elem_t_sz;
   // number of READ_T elements per thread
   INDEX_T reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t;
@@ -225,9 +225,9 @@
 template <typename ELEM_T, typename INDEX_T, {{index_type}} Rank, {{index_type}} NumInputs,
           {{index_type}} ElemsPerThread, {{index_type}} ThreadsPerBlock>
 void concatenate_kernel_launcher(
-    ELEM_T *output,
+    void *output,
     const {{index_type}} *output_shape,
-    const ELEM_T *inputs[],
+    const void *inputs[],
     const {{index_type}} *real_input_shapes[],
     const TensorAccessor *input_accessors[],
     const int64_t concat_dim_offsets[],
@@ -248,7 +248,7 @@
   INDEX_T max_num_input_elems = 0;
   for (INDEX_T i = 0; i < NumInputs; i++) {
     INDEX_T num_elems = get_num_elems(real_input_shapes[i], Rank);
-    input_meta.inputs[i] = inputs[i];
+    input_meta.inputs[i] = static_cast<const ELEM_T*>(inputs[i]);
     input_meta.input_accessors[i] = *(input_accessors[i]);
     input_meta.concat_dim_offsets[i] = concat_dim_offsets[i];
     input_meta.concat_dim_values[i] = real_input_shapes[i][concat_dim];
@@ -272,7 +272,7 @@
       }                                                                     \\
       concatenate_kernel<vec_type, ELEM_T, INDEX_T, Rank, NumInputs, ElemsPerThread> \\
         <<<grid_config, ThreadsPerBlock, 0, stream>>>(                      \\
-            output,                                                         \\
+            static_cast<ELEM_T*>(output),                                   \\
             output_meta,                                                    \\
             input_meta,                                                     \\
             concat_dim,                                                     \\
@@ -309,9 +309,9 @@
 {{header_src}}
 
 void {{func_name}}(
-    {{elem_output_type}} *output,
+    void *output,
     {{index_type}} *output_shape[],
-    const {{elem_input_type}} *inputs[],
+    const void *inputs[],
     const {{index_type}} *real_input_shapes[],
     const {{index_type}} *all_input_shapes[],
     const bool input_masks[],
@@ -322,6 +322,7 @@
     {{index_type}} num_all_inputs,
     {{prefix}}Stream_t stream
     ) {
+  // DO NOTHING
 }
 """
 )
@@ -406,9 +407,9 @@
 {{kernel_src}}
 
 void {{func_name}}(
-    {{elem_output_type}} *output,
+    void *output,
     {{index_type}} *output_shape[],
-    const {{elem_input_type}} *inputs[],
+    const void *inputs[],
     const {{index_type}} *real_input_shapes[], /* real_input_shapes, representing
                                  shapes of those inputs whose masks are False,
                                  i.e. inputs that will be copied to the output
@@ -520,7 +521,7 @@
     """
 {{indent}}{
 
-{{indent}}  const {{input_elem_type}} *inputs[] = {
+{{indent}}  const void *inputs[] = {
 {{indent}}    {{inputs}}
 {{indent}}  };
 
@@ -579,16 +580,8 @@ def gen_function_decl(func_attrs, backend_spec):
     str
         Rendered function declaration.
     """
-    # get dtype from orig_x in case actual "inputs" is turned into empty
-    # by some transformation
-    orig_x = func_attrs["original_inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_output_type=output_type,
-        elem_input_type=input_type,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
     )
@@ -691,8 +684,6 @@ def _stride(shape, dim):
         return SRC_TEMPLATE.render(
             kernel_src=kernel_src,
             func_name=func_attrs["name"],
-            elem_input_type=input_type,
-            elem_output_type=output_type,
             exec_paths=exec_paths,
             index_type=backend_spec.index_type,
             prefix=backend_spec.prefix,
@@ -700,8 +691,6 @@ def _stride(shape, dim):
 
     return DUMMY_KERNEL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         header_src=header_src,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
@@ -719,14 +708,8 @@ def gen_function_call(
     ----------
     func_attrs : Dict[str, Any]
         Stores the operation attributes.
-    index_type: str
-        Index type.
-    cast_to_const_half_ptr_template: jinja template
-        Cast to const half ptr template.
-    cast_to_half_ptr_template: jinja template
-        Cast to half ptr template.
-    dtype_to_backend_type: Dict[str, str]
-        Stores python dtype to backend (rocm, cuda) type.
+    backend_spec : BackendSpec
+        CUDA / RocM type definitions
     indent : str, optional
         Indent for template, by default "  ".
 
@@ -746,12 +729,7 @@ def gen_function_call(
     y = func_attrs["outputs"][0]
     concat_dim = func_attrs["concat_dim"]
 
-    input_names = ",\n      ".join(
-        [
-            backend_spec.cast_to_const_half_ptr_template.render(name=i._attrs["name"])
-            for i in inputs
-        ]
-    )
+    input_names = ",\n      ".join([i._attrs["name"] for i in inputs])
     real_input_shape_defs = []
     real_input_shape_names = []
     for idx, (i, input_accessor) in enumerate(zip(inputs, input_accessors)):
@@ -769,7 +747,6 @@ def gen_function_call(
 
     y_shape = y._attrs["shape"]
     y_dim_refs = ", ".join(["&" + dim._attrs["name"] for dim in y_shape])
-    casted_y_ptr = backend_spec.cast_to_half_ptr_template.render(name=y._attrs["name"])
 
     input_masks = func_attrs["input_masks"]
     input_indices = [idx for idx, m in enumerate(input_masks) if m is True]
@@ -819,7 +796,6 @@ def gen_function_call(
 
     return FUNC_CALL_TEMPLATE.render(
         indent=indent,
-        input_elem_type=backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"]),
         inputs=input_names,
         real_input_shape_defs="".join(real_input_shape_defs),
         real_input_shapes=", ".join(real_input_shape_names),
@@ -830,7 +806,7 @@ def gen_function_call(
         output_dim_refs=y_dim_refs,
         func_name=func_attrs["name"],
         output=y._attrs["name"],
-        output_ptr=casted_y_ptr,
+        output_ptr=y._attrs["name"],
         concat_dim=concat_dim,
         rank=len(orig_x._attrs["shape"]),
         num_real_inputs=len(inputs),
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 14872058a..546763505 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -20,11 +20,11 @@
 from typing import Any, Dict, List, Tuple
 
 import jinja2
+from aitemplate.backend.backend_spec import BackendSpec
 
 from ...compiler.base import IntImm, IntVar, Operator, Tensor
 from ...compiler.tensor_accessor import TensorAccessor
 from ...utils import shape_utils
-from ..backend_spec import BackendSpec
 from . import tensor_accessor_codegen
 
 CONSTANT_TEMPLATE = jinja2.Template(
@@ -96,11 +96,11 @@
 KERNEL_TEMPLATE = jinja2.Template(
     """
 __global__ void
-{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} int n_elements) {
+{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{index_type}} n_elements) {
   const int bid = blockIdx.x;
   const int tid = threadIdx.x;
-  const int idx = bid * FUSED_ELE_THREAD_SIZE + tid;
-  const int idx_elem = idx * N_ELEMENTS_PER_THREAD;
+  const {{index_type}} idx = bid * FUSED_ELE_THREAD_SIZE + tid;
+  const {{index_type}} idx_elem = idx * N_ELEMENTS_PER_THREAD;
   if (idx_elem >= n_elements) {
     return;
   }
@@ -115,8 +115,8 @@
     """
 )
 
-FUNC_DECL_INPUT_PARAM_TEMPLATE = jinja2.Template("const {{data_t}}* input{{idx}}")
-FUNC_DECL_OUTPUT_PARAM_TEMPLATE = jinja2.Template("{{data_t}}* output{{idx}}")
+FUNC_DECL_INPUT_PARAM_TEMPLATE = jinja2.Template("const void* input{{idx}}")
+FUNC_DECL_OUTPUT_PARAM_TEMPLATE = jinja2.Template("void* output{{idx}}")
 KERNEL_CALL_INPUT_PARAM_TEMPLATE = jinja2.Template(
     "reinterpret_cast<const {{read_t}}*>(input{{idx}})"
 )
@@ -140,7 +140,7 @@
 
 }  // namespace
 
-void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims_decl}} int n_elements, {{prefix}}Stream_t stream) {
+void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
     if (n_elements == 0) {
       return;
     }
@@ -157,14 +157,14 @@
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
-void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} int n_elements, {{prefix}}Stream_t stream);
+void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
     """
 )
 
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{
-    {{indent}}int {{func_name}}_n_elements = {{calculate_n}};
+    {{indent}}{{index_type}} {{func_name}}_n_elements = {{calculate_n}};
     {{indent}}invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{func_name}}_n_elements, {{stream}});
 {{indent}}}
     """
@@ -357,14 +357,10 @@ def _get_types_and_sizes(
 
     # Handle input broadcast.
     output_shape = output_accessors[0].original_shapes
-    dtype = "float16"
+    dtype = inputs[0]._attrs["dtype"]
     input_broadcast_sizes = []
     min_num_elements = None
-    for input_tensor, input_accessor in zip(inputs, input_accessors):
-        if input_tensor._attrs["dtype"] != "float16":
-            raise NotImplementedError(
-                "Unsupported dtype {}!".format(input_tensor._attrs["dtype"])
-            )
+    for input_accessor in input_accessors:
         input_shape = input_accessor.original_shapes
         broadcastable, _ = shape_utils.get_broadcast_max_shape(
             output_shape, input_shape
@@ -433,7 +429,7 @@ def _parse_func_metadata(
     op_type = backend_spec.get_backend_type(
         alignment, dtype, backend_spec.op_num_elements_to_backend_type
     )
-    data_type = backend_spec.get_fp16_dtype(dtype)
+    data_type = backend_spec.dtype_to_backend_type(dtype)
     sub_func_metadata, op_type = _get_sub_func_metadata(
         ops, data_type, op_type, backend_spec
     )
@@ -645,6 +641,7 @@ def _gen_kernel_function(
 
     kernel_func = KERNEL_TEMPLATE.render(
         func_name=func_attrs["name"],
+        index_type=index_type,
         output_params=output_params_decl,
         input_params=input_params_decl,
         dynamic_dims=_gen_dynamic_dim_str(
@@ -699,17 +696,13 @@ def fused_elementwise_gen_function(
     )
     output_params_decl = ",".join(
         [
-            FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render(
-                data_t=fused_elementwise_metadata.data_t, idx=i
-            )
+            FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render(idx=i)
             for i, _ in enumerate(fused_elementwise_metadata.outputs)
         ]
     )
     input_params_decl = ",".join(
         [
-            FUNC_DECL_INPUT_PARAM_TEMPLATE.render(
-                data_t=fused_elementwise_metadata.data_t, idx=i
-            )
+            FUNC_DECL_INPUT_PARAM_TEMPLATE.render(idx=i)
             for i, _ in enumerate(fused_elementwise_metadata.inputs)
         ]
     )
@@ -737,6 +730,7 @@ def fused_elementwise_gen_function(
 
     function = FUNC_TEMPLATE.render(
         prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
         head=backend_spec.header_src_template.render(extra_header=head_template),
         constant=constant,
         custom_libs=custom_libs,
@@ -787,23 +781,20 @@ def fused_elementwise_gen_function_decl(
     )
     output_params_decl = ",".join(
         [
-            FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render(
-                data_t=fused_elementwise_metadata.data_t, idx=i
-            )
+            FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render(idx=i)
             for i, _ in enumerate(fused_elementwise_metadata.outputs)
         ]
     )
     input_params_decl = ",".join(
         [
-            FUNC_DECL_INPUT_PARAM_TEMPLATE.render(
-                data_t=fused_elementwise_metadata.data_t, idx=i
-            )
+            FUNC_DECL_INPUT_PARAM_TEMPLATE.render(idx=i)
             for i, _ in enumerate(fused_elementwise_metadata.inputs)
         ]
     )
 
     function_decl = FUNC_DECL_TEMPLATE.render(
         prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
         func_name=func_name,
         output_params=output_params_decl,
         input_params=input_params_decl,
@@ -840,27 +831,9 @@ def fused_elementwise_gen_function_call(
         backend_spec,
     )
 
-    output_params_vec = []
-    for output in outputs:
-        if output._attrs["dtype"] != "float16":
-            raise NotImplementedError(
-                "Unsupported dtype {}".format(output._attrs["dtype"])
-            )
-        output_params_vec.append(
-            backend_spec.cast_to_half_ptr_template.render(name=output._attrs["name"])
-        )
-    output_params = ",".join(output_params_vec)
+    output_params = ",".join([output._attrs["name"] for output in outputs])
 
-    input_params_vec = []
-    for inp in inputs:
-        if inp._attrs["dtype"] != "float16":
-            raise NotImplementedError(
-                "Unsupported dtype {}".format(inp._attrs["dtype"])
-            )
-        input_params_vec.append(
-            backend_spec.cast_to_half_ptr_template.render(name=inp._attrs["name"])
-        )
-    input_params = ",".join(input_params_vec)
+    input_params = ",".join([input._attrs["name"] for input in inputs])
 
     num_elements_calculator = _gen_int_var_product_str(
         output_accessors[0].original_shapes
@@ -869,6 +842,7 @@ def fused_elementwise_gen_function_call(
     return FUNC_CALL_TEMPLATE.render(
         stream=backend_spec.stream,
         func_name=func_attrs["name"],
+        index_type=backend_spec.index_type,
         calculate_n=num_elements_calculator,
         output_params=output_params,
         input_params=input_params,
diff --git a/python/aitemplate/backend/common/split_common.py b/python/aitemplate/backend/common/split_common.py
index 9205c90ee..a1dbaa930 100644
--- a/python/aitemplate/backend/common/split_common.py
+++ b/python/aitemplate/backend/common/split_common.py
@@ -20,9 +20,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-    {{elem_output_type}} *[] /*outputs*/,
+    void *[] /*outputs*/,
     {{index_type}} **[] /*output_shapes*/,
-    const {{elem_input_type}} * /*input*/,
+    const void * /*input*/,
     const {{index_type}} * /*input_shape*/,
     {{index_type}} /*num_splits*/,
     {{index_type}} [] /*split_sizes*/,
@@ -127,9 +127,9 @@
   int64_t split_dim_size = output_meta.split_dim_sizes[blockIdx.y];
   int64_t input_offset = output_offset * input_split_dim_stride;
 
-  unsigned read_t_sz = sizeof(READ_T);
-  unsigned elem_t_sz = sizeof(ELEM_T);
-  assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
+  unsigned constexpr read_t_sz = sizeof(READ_T);
+  unsigned constexpr elem_t_sz = sizeof(ELEM_T);
+  static_assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
   {{index_type}} n_of_elem_t = read_t_sz / elem_t_sz;
   // number of READ_T elements per thread
   {{index_type}} reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t;
@@ -196,9 +196,9 @@
 template <typename ELEM_T, {{index_type}} Rank, {{index_type}} NumSplits,
           {{index_type}} ElemsPerThread, {{index_type}} ThreadsPerBlock>
 void split_kernel_launcher(
-    ELEM_T *outputs[],
+    void *outputs[],
     {{index_type}} *output_shapes[],
-    const ELEM_T *input,
+    const void *input,
     const {{index_type}} *input_shape,
     const {{index_type}} split_dim,
     {{prefix}}Stream_t stream
@@ -217,7 +217,7 @@
   {{index_type}} offset = 0;
   LoadVecType min_vec_type = LoadVecType::VT_FLOAT4;
   for ({{index_type}} i = 0; i < NumSplits; i++) {
-    output_meta.outputs[i] = outputs[i];
+    output_meta.outputs[i] = static_cast<ELEM_T*>(outputs[i]);
     output_meta.split_dim_offsets[i] = offset;
     output_meta.split_dim_sizes[i] = output_shapes[i][split_dim];
     output_meta.num_elems[i] = get_num_elems(output_shapes[i], Rank);
@@ -246,7 +246,7 @@
       }                                                                \\
       split_kernel<vec_type, ELEM_T, Rank, NumSplits, ElemsPerThread>  \\
         <<<grid_config, ThreadsPerBlock, 0, stream>>>(                 \\
-            input,                                                     \\
+            static_cast<const ELEM_T*>(input),                         \\
             input_meta,                                                \\
             output_meta,                                               \\
             split_dim,                                                 \\
@@ -309,9 +309,9 @@
     """
 {{kernel_src}}
 void {{func_name}}(
-    {{elem_output_type}}* outputs[],
+    void* outputs[],
     {{index_type}} **output_shapes[],
-    const {{elem_input_type}}* input,
+    const void* input,
     const {{index_type}} *input_shape,
     {{index_type}} num_splits,
     {{index_type}} split_sizes[],
@@ -390,7 +390,7 @@
     """
 {{indent}}{
 
-{{indent}}  {{output_elem_type}} *outputs[] = {
+{{indent}}  void *outputs[] = {
 {{indent}}    {{outputs}}
 {{indent}}  };
 
@@ -431,21 +431,17 @@ def gen_function_decl(func_attrs, backend_spec):
     ----------
     func_attrs : Dict[str, Any]
         Stores the operation attributes.
+    backend_spec : BackendSpec
+        Cuda/Rocm type definitions
     Returns
     -------
     str
         Rendered function declaration.
     """
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
         func_name=func_attrs["name"],
-        elem_output_type=output_type,
-        elem_input_type=input_type,
     )
 
 
@@ -470,6 +466,9 @@ def gen_function(func_attrs, backend_spec):
     input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
     output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
 
+    if input_type != output_type:
+        raise NotImplementedError("input type must equal to output type")
+
     # TODO: consider to add profiling paths for tuning
     # elems_per_thread and threads_per_block
     exec_paths = EXEC_COND_TEMPLATE.render(
@@ -490,8 +489,6 @@ def gen_function(func_attrs, backend_spec):
     return SRC_TEMPLATE.render(
         kernel_src=kernel_src,
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         exec_paths=exec_paths,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
@@ -515,16 +512,10 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
     """
     x = func_attrs["inputs"][0]
     outputs = func_attrs["outputs"]
-    y = outputs[0]
     split_dim = func_attrs["split_dim"]
     num_splits = len(func_attrs["split_sizes"])
 
-    output_names = ",\n      ".join(
-        [
-            backend_spec.cast_to_half_ptr_template.render(name=i._attrs["name"])
-            for i in outputs
-        ]
-    )
+    output_names = ",\n      ".join([i._attrs["name"] for i in outputs])
 
     output_shape_defs = []
     output_shape_names = []
@@ -545,22 +536,18 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
 
     x_shape = x._attrs["shape"]
     x_dims = ", ".join([dim._attrs["name"] for dim in x_shape])
-    casted_x_ptr = backend_spec.cast_to_const_half_ptr_template.render(
-        name=x._attrs["name"]
-    )
 
     split_sizes = ", ".join([str(i) for i in func_attrs["split_sizes"]])
 
     return FUNC_CALL_TEMPLATE.render(
         indent=indent,
-        output_elem_type=backend_spec.dtype_to_backend_type(y._attrs["dtype"]),
         outputs=output_names,
         output_shape_defs="".join(output_shape_defs),
         output_shapes=", ".join(output_shape_names),
         input_dims=x_dims,
         func_name=func_attrs["name"],
         input_name=x._attrs["name"],
-        input_ptr=casted_x_ptr,
+        input_ptr=x._attrs["name"],
         split_dim=split_dim,
         rank=len(x._attrs["shape"]),
         num_splits=num_splits,
diff --git a/python/aitemplate/backend/common/tensor/argmax_common.py b/python/aitemplate/backend/common/tensor/argmax_common.py
index bb422646e..67c3d4b94 100644
--- a/python/aitemplate/backend/common/tensor/argmax_common.py
+++ b/python/aitemplate/backend/common/tensor/argmax_common.py
@@ -21,9 +21,6 @@
 
 import jinja2
 
-from ... import builder
-from ...target import Target
-
 # pylint: disable=C0301
 
 FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
@@ -285,7 +282,7 @@ class MultiplyFunctor final {
 {{indent}}    {{elem_cnt}},
 {{indent}}    {{instance_size}},
 {{indent}}    {{instance_num}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -450,7 +447,4 @@ def gen_profiler(
     )
     op_name = func_attrs["op"]
     add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True))
+    return file_pairs
diff --git a/python/aitemplate/backend/common/tensor/batch_gather_common.py b/python/aitemplate/backend/common/tensor/batch_gather_common.py
index 86bbea7a0..97e8aee77 100644
--- a/python/aitemplate/backend/common/tensor/batch_gather_common.py
+++ b/python/aitemplate/backend/common/tensor/batch_gather_common.py
@@ -22,11 +22,6 @@
 
 # pylint: disable=C0301
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    """reinterpret_cast<half*>(
-        {% if is_cuda %}&({% endif %}{{name}}{% if is_cuda %}->raw()){% endif %})"""
-)
-
 FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
 
 FUNC_TEMPLATE = jinja2.Template(
@@ -41,15 +36,15 @@
 
 {{func_signature}}
 {
-    batch_gather_launcher<half, int64_t>(stream, batch_num, indices_num, instance_size, gather_dim_size, input, indices, workspace, output);
+    batch_gather_launcher<{{dtype}}, int64_t>(stream, batch_num, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
 }
     """
 )
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output,
-                   const half* input,
+void {{func_name}}(void* output,
+                   const void* input,
                    const int64_t* indices,
                    const {{index_type}} batch_num,
                    const {{index_type}} indices_num,
@@ -74,7 +69,7 @@
 {{indent}}    {{indices_num}},
 {{indent}}    {{instance_size}},
 {{indent}}    {{gather_dim_size}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -156,12 +151,10 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 2
 
-    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"], is_cuda=is_cuda
-    )
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["inputs"][0]._attrs["name"], is_cuda=is_cuda
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+
+    input_name = func_attrs["inputs"][0]._attrs["name"]
+
     indices_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
         name=func_attrs["inputs"][1]._attrs["name"]
     )
@@ -208,6 +201,9 @@ def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) ->
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], index_type=index_type, prefix=prefix
         ),
+        dtype=backend_spec.dtype_to_backend_dtype[
+            func_attrs["inputs"][0]._attrs["dtype"]
+        ],
     )
 
 
diff --git a/python/aitemplate/backend/common/tensor/permute021_common.py b/python/aitemplate/backend/common/tensor/permute021_common.py
index db5ed63fd..30ab97b80 100644
--- a/python/aitemplate/backend/common/tensor/permute021_common.py
+++ b/python/aitemplate/backend/common/tensor/permute021_common.py
@@ -28,15 +28,15 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{lib_dtype}}*,
-  {{lib_dtype}}*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  {{prefix}}Stream_t
+  const void* /*input*/,
+  void* /* output */,
+  int64_t* /* x_dim0 */,
+  int64_t* /* x_dim1 */,
+  int64_t* /* x_dim2 */,
+  int64_t* /* y_dim0 */,
+  int64_t* /* y_dim1 */,
+  int64_t* /* y_dim2 */,
+  {{prefix}}Stream_t /* stream */
 );
 """
 )
@@ -44,8 +44,8 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    ({{lib_dtype}}*)({{in_ptr}}),
-{{indent}}    ({{lib_dtype}}*)({{out_ptr}}),
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
 {{indent}}    {{x_dim0}},
 {{indent}}    {{x_dim1}},
 {{indent}}    {{x_dim2}},
@@ -138,8 +138,8 @@
   }
 }
 
-void permute021_launcher({{lib_dtype}}* in_ptr,
-                         {{lib_dtype}}* out_ptr,
+void permute021_launcher(const void* in_ptr,
+                         void* out_ptr,
                          int x_dim0,
                          int x_dim1,
                          int x_dim2,
@@ -151,8 +151,8 @@
   dim3 grid((c + 31)/32, (h*w + 31)/32, n);
   dim3 block(32, 8);
   nhwc_to_nchw_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
-    ({{lib_dtype}}*)out_ptr,
-    (const {{lib_dtype}}*)in_ptr,
+    static_cast<{{lib_dtype}}*>(out_ptr),
+    static_cast<const {{lib_dtype}}*>(in_ptr),
     n,
     h,
     w,
@@ -162,8 +162,8 @@
 } // namespace
 
 void {{function_name}} (
-    {{lib_dtype}}* in_ptr,
-    {{lib_dtype}}* out_ptr,
+    const void* in_ptr,
+    void* out_ptr,
     int64_t* x_dim0,
     int64_t* x_dim1,
     int64_t* x_dim2,
@@ -258,11 +258,8 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     """
 
     func_name = func_attrs["name"]
-    x = func_attrs["inputs"][0]
-    xdtype = x._attrs["dtype"]
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
         prefix=backend_spec.prefix,
     )
 
@@ -286,7 +283,6 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
 
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
-    xdtype = x._attrs["dtype"]
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
     return FUNC_CALL_TEMPLATE.render(
@@ -300,5 +296,4 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
         y_dim1="&" + yshape[1]._attrs["name"],
         y_dim2="&" + yshape[2]._attrs["name"],
         indent=indent,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
     )
diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py
index 807e65bef..7c367ed8a 100644
--- a/python/aitemplate/backend/common/tensor/permute102_common.py
+++ b/python/aitemplate/backend/common/tensor/permute102_common.py
@@ -36,14 +36,14 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{lib_dtype}}*,
-  {{lib_dtype}}*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
+  const void* /* input */,
+  void* /* output */,
+  int64_t* /* x_dim0 */,
+  int64_t* /* x_dim1 */,
+  int64_t* /* x_dim2 */,
+  int64_t* /* y_dim0 */,
+  int64_t* /* y_dim1 */,
+  int64_t* /* y_dim2 */,
   {{prefix}}Stream_t
 );
 """
@@ -52,8 +52,8 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    ({{lib_dtype}}*){{in_ptr}},
-{{indent}}    ({{lib_dtype}}*){{out_ptr}},
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
 {{indent}}    {{x_dim0}},
 {{indent}}    {{x_dim1}},
 {{indent}}    {{x_dim2}},
@@ -149,8 +149,8 @@
   }
 }
 
-void permute102_launcher({{lib_dtype}}* in_ptr,
-                         {{lib_dtype}}* out_ptr,
+void permute102_launcher(const void* in_ptr,
+                         void* out_ptr,
                          int x_dim0,
                          int x_dim1,
                          int x_dim2,
@@ -162,8 +162,8 @@
   dim3 grid((c + TILE_SIZE - 1)/TILE_SIZE, (h*w + TILE_SIZE -1)/TILE_SIZE, n);
   dim3 block(TILE_SIZE, TILE_SIZE / CH_K);
   nhwc_to_nchw_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
-    out_ptr,
-    (const {{lib_dtype}}*)in_ptr,
+    static_cast<{{lib_dtype}}*>(out_ptr),
+    static_cast<const {{lib_dtype}}*>(in_ptr),
     n,
     h,
     w,
@@ -173,8 +173,8 @@
 } // namespace
 
 void {{function_name}} (
-    {{lib_dtype}}* in_ptr,
-    {{lib_dtype}}* out_ptr,
+    const void* in_ptr,
+    void* out_ptr,
     int64_t* x_dim0,
     int64_t* x_dim1,
     int64_t* x_dim2,
@@ -265,11 +265,8 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
         Function declaration
     """
     func_name = func_attrs["name"]
-    x = func_attrs["inputs"][0]
-    xdtype = x._attrs["dtype"]
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
         prefix=backend_spec.prefix,
     )
 
@@ -292,7 +289,6 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     """
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
-    xdtype = x._attrs["dtype"]
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
     return FUNC_CALL_TEMPLATE.render(
@@ -306,5 +302,4 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
         y_dim1="&" + yshape[1]._attrs["name"],
         y_dim2="&" + yshape[2]._attrs["name"],
         indent=indent,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
     )
diff --git a/python/aitemplate/backend/common/tensor/permute210_common.py b/python/aitemplate/backend/common/tensor/permute210_common.py
index fa1d5d25a..35894b315 100644
--- a/python/aitemplate/backend/common/tensor/permute210_common.py
+++ b/python/aitemplate/backend/common/tensor/permute210_common.py
@@ -35,15 +35,15 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{lib_dtype}}*,
-  {{lib_dtype}}*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  {{prefix}}Stream_t
+  const void* /* input */,
+  void* /* output */,
+  int64_t* /* x_dim0 */,
+  int64_t* /* x_dim1 */,
+  int64_t* /* x_dim2 */,
+  int64_t* /* y_dim0 */,
+  int64_t* /* y_dim1 */,
+  int64_t* /* y_dim2 */,
+  {{prefix}}Stream_t /* stream */
 );
 """
 )
@@ -51,8 +51,8 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    static_cast<{{lib_dtype}}*>({{in_ptr}}),
-{{indent}}    static_cast<{{lib_dtype}}*>({{out_ptr}}),
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
 {{indent}}    {{x_dim0}},
 {{indent}}    {{x_dim1}},
 {{indent}}    {{x_dim2}},
@@ -158,8 +158,8 @@
   }
 }
 
-void permute210_launcher({{lib_dtype}}* in_ptr,
-                         {{lib_dtype}}* out_ptr,
+void permute210_launcher(const void* in_ptr,
+                         void* out_ptr,
                          int x_dim0,
                          int x_dim1,
                          int x_dim2,
@@ -167,8 +167,8 @@
   dim3 grid((x_dim2 + (TILE_SIZE-1))/TILE_SIZE, x_dim1, (x_dim0 + (TILE_SIZE-1))/TILE_SIZE);
   dim3 block(TILE_SIZE, TILE_SIZE/4);
   permute210_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
-    out_ptr,
-    (const {{lib_dtype}}*)in_ptr,
+    static_cast<{{lib_dtype}}*>(out_ptr),
+    static_cast<const {{lib_dtype}}*>(in_ptr),
     x_dim0,
     x_dim1,
     x_dim2
@@ -177,8 +177,8 @@
 } // namespace
 
 void {{function_name}} (
-    {{lib_dtype}}* in_ptr,
-    {{lib_dtype}}* out_ptr,
+    const void* in_ptr,
+    void* out_ptr,
     int64_t* x_dim0,
     int64_t* x_dim1,
     int64_t* x_dim2,
@@ -244,12 +244,9 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
         Function declaration
     """
     func_name = func_attrs["name"]
-    x = func_attrs["inputs"][0]
-    xdtype = x._attrs["dtype"]
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
         prefix=backend_spec.prefix,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
     )
 
 
@@ -271,7 +268,6 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     """
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
-    xdtype = x._attrs["dtype"]
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
     return FUNC_CALL_TEMPLATE.render(
@@ -285,5 +281,4 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
         y_dim1="&" + yshape[1]._attrs["name"],
         y_dim2="&" + yshape[2]._attrs["name"],
         indent=indent,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
     )
diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py
index fb17116de..f42f213f2 100644
--- a/python/aitemplate/backend/common/tensor/slice_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_common.py
@@ -17,11 +17,6 @@
 """
 import jinja2
 
-CAST_TO_CONST_HALF_PTR_TEMPLATE = jinja2.Template("reinterpret_cast<half*>({{name}})")
-
-
-CAST_TO_HALF_PTR_TEMPLATE = jinja2.Template("reinterpret_cast<const half*>({{name}})")
-
 
 SHAPE_UPDATE_FUNC = jinja2.Template(
     """
@@ -59,9 +54,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-    {{elem_output_type}} * /*output*/,
+    void * /*output*/,
     int64_t *[] /*output_shape*/,
-    const {{elem_input_type}} *[] /*inputs*/,
+    const void *[] /*inputs*/,
     const int64_t *[] /*input_shapes*/,
     const int64_t *[] /*orig_slice_start_indices*/,
     const int64_t *[] /*orig_slice_end_indices*/,
@@ -203,9 +198,9 @@
   int64_t scatter_dim_size = slice_meta_data.dim_sizes[block_y];
   int64_t scatter_offset = slice_meta_data.offsets[block_y];
 
-  unsigned read_t_sz = sizeof(READ_T);
-  unsigned elem_t_sz = sizeof(ELEM_T);
-  assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
+  constexpr unsigned read_t_sz = sizeof(READ_T);
+  constexpr unsigned elem_t_sz = sizeof(ELEM_T);
+  static_assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
   {{index_type}}  n_of_elem_t = read_t_sz / elem_t_sz;
   // number of READ_T elements per thread
   {{index_type}}  reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t;
@@ -275,7 +270,6 @@
 template <typename ELEM_T, {{index_type}}  Rank>
 static LoadVecType get_input_vec_type(
     const int64_t *output_strides,
-    const ELEM_T *input,
     const int64_t *input_shape,
     const int64_t *input_strides,
     const int64_t *slice_start_indices,
@@ -404,7 +398,6 @@
   for ({{index_type}}  i = 0; i < NumInputs; i++) {
     LoadVecType vec_type = get_input_vec_type<ELEM_T, Rank>(
         scatter_meta_data.output_strides,
-        inputs[i],
         input_shapes[i],
         slice_meta_data.input_strides[i],
         slice_start_indices[i].data(),
@@ -516,7 +509,7 @@
 {{indent}}                                {{num_inputs}}/*NumInputs*/,
 {{indent}}                                {{elems_per_thread}}/*ElemsPerThread*/,
 {{indent}}                                {{threads_per_block}}/*ThreadsPerBlock*/>(
-{{indent}}      output, local_output_shape, inputs, input_shapes,
+{{indent}}      static_cast<{{elem_type}}*>(output), local_output_shape, reinterpret_cast<const {{elem_type}}**>(inputs), input_shapes,
 {{indent}}      slice_start_indices, slice_end_indices, scatter_dim, stream);
 {{indent}}  return;
 {{indent}}}
@@ -529,9 +522,9 @@
 {{kernel_src}}
 
 void {{func_name}}(
-    {{elem_output_type}} *output,
+    void *output,
     int64_t *output_shape[],
-    const {{elem_input_type}} *inputs[],
+    const void *inputs[],
     const int64_t *input_shapes[],
     const int64_t *orig_slice_start_indices[],
     const int64_t *orig_slice_end_indices[],
@@ -615,7 +608,7 @@
 {{indent}}{
 {{output_shape_def}}
 
-{{indent}}  const half *inputs[] = {
+{{indent}}  const void *inputs[] = {
 {{indent}}    {{inputs}}
 {{indent}}  };
 
@@ -687,14 +680,8 @@ def gen_function_decl(func_attrs, backend_spec):
     str
         Rendered function declaration.
     """
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_output_type=output_type,
-        elem_input_type=input_type,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
     )
@@ -742,6 +729,9 @@ def gen_function(
     input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
     output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
 
+    if input_type != output_type:
+        raise NotImplementedError("input type must equal to output type")
+
     # TODO: consider to add profiling paths for tuning
     # elems_per_thread and threads_per_block
     exec_paths = EXEC_COND_TEMPLATE.render(
@@ -774,8 +764,6 @@ def gen_function(
     return SRC_TEMPLATE.render(
         kernel_src=kernel_src,
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         shape_function=shape_func,
         exec_paths=exec_paths,
         index_type=backend_spec.index_type,
@@ -827,12 +815,7 @@ def gen_function_call(
     x = inputs[0]
     y = outputs[0]
 
-    input_names = ",\n        ".join(
-        [
-            backend_spec.cast_to_const_half_ptr_template.render(name=i._attrs["name"])
-            for i in inputs
-        ]
-    )
+    input_names = ",\n        ".join([i._attrs["name"] for i in inputs])
 
     input_shape_defs = []
     input_shape_names = []
@@ -880,14 +863,11 @@ def gen_function_call(
             indent=indent, output_name=y._attrs["name"], output_dim_refs=y_dim_refs
         )
 
-    casted_y_ptr = backend_spec.cast_to_half_ptr_template.render(name=y._attrs["name"])
-
     return FUNC_CALL_TEMPLATE.render(
         indent=indent,
         func_name=func_name,
-        output_elem_type=backend_spec.dtype_to_backend_type(y._attrs["dtype"]),
         output_name=y._attrs["name"],
-        output_ptr=casted_y_ptr,
+        output_ptr=y._attrs["name"],
         output_shape_def=output_shape_def,
         inputs=input_names,
         input_shape_defs="".join(input_shape_defs),
diff --git a/python/aitemplate/backend/common/tensor/topk_common.py b/python/aitemplate/backend/common/tensor/topk_common.py
index 6b82ef531..044833bc0 100644
--- a/python/aitemplate/backend/common/tensor/topk_common.py
+++ b/python/aitemplate/backend/common/tensor/topk_common.py
@@ -21,9 +21,6 @@
 
 import jinja2
 
-from ... import builder
-from ...target import Target
-
 # pylint: disable=C0301
 
 FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
@@ -40,7 +37,7 @@
 
 {{func_signature}}
 {
-    topk_launcher<half>(stream, elem_cnt, instance_size, instance_num, top_k, input, workspace, output);
+    topk_launcher<{{dtype}}>(stream, elem_cnt, instance_size, instance_num, top_k, input, workspace, output);
 }
     """
 )
@@ -64,10 +61,10 @@
   int instance_num = std::stoi(argv[3]);
 
   float runtime_ms = 0;
-  const int64_t sorted_in_aligned_bytes = GetAlignedSize(elem_cnt * sizeof(half));
+  const int64_t sorted_in_aligned_bytes = GetAlignedSize(elem_cnt * sizeof({{dtype}}));
   const int64_t indices_aligned_bytes = GetAlignedSize(elem_cnt * sizeof(int64_t));
   const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes;
-  int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending<half, int64_t>(instance_size, instance_num);
+  int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending<{{dtype}}, int64_t>(instance_size, instance_num);
   GLOBAL_WORKSPACE_SIZE  =  GetAlignedSize(sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes + temp_storage_bytes);
   std::cout << "TIME:" << runtime_ms << std::endl;
   std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
@@ -78,7 +75,7 @@
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(int64_t* output,
-                   const half* input,
+                   const void* input,
                    const {{index_type}} elem_cnt,
                    const {{index_type}} instance_size,
                    const {{index_type}} instance_num,
@@ -102,7 +99,7 @@
 {{indent}}    {{instance_size}},
 {{indent}}    {{instance_num}},
 {{indent}}    {{top_k}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -624,12 +621,14 @@ def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) ->
     """
     index_type = backend_spec.index_type
     prefix = backend_spec.prefix
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
     return FUNC_TEMPLATE.render(
         header_files=header_files,
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], index_type=index_type, prefix=prefix
         ),
         kernel=KERNEL_TEMPLATE.render(cub=backend_spec.cub, prefix=prefix),
+        dtype=dtype,
     )
 
 
@@ -681,9 +680,7 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
         name=func_attrs["outputs"][0]._attrs["name"]
     )
-    input_name = backend_spec.cast_to_half_ptr_template.render(
-        name=func_attrs["inputs"][0]._attrs["name"]
-    )
+    input_name = func_attrs["inputs"][0]._attrs["name"]
 
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
@@ -754,16 +751,16 @@ def gen_profiler(
     file_pairs = []
     index_type = backend_spec.index_type
     prefix = backend_spec.prefix
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
     code = PROFILER_TEMPLATE.render(
         header_files=header_files,
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], index_type=index_type, prefix=prefix
         ),
         kernel=KERNEL_TEMPLATE.render(cub=backend_spec.cub, prefix=prefix),
+        dtype=dtype,
     )
     op_name = func_attrs["op"]
     add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True))
+    return file_pairs
diff --git a/python/aitemplate/backend/common/tensor_accessor_codegen.py b/python/aitemplate/backend/common/tensor_accessor_codegen.py
index e2e873647..6d6174d27 100644
--- a/python/aitemplate/backend/common/tensor_accessor_codegen.py
+++ b/python/aitemplate/backend/common/tensor_accessor_codegen.py
@@ -22,6 +22,7 @@
 import jinja2
 
 from ...compiler.tensor_accessor import TensorAccessor
+from ...utils import alignment
 from ..target import Target
 
 # Template used to transform a Python TensorAccessor object
@@ -71,27 +72,6 @@ def get_libs() -> str:
     )
 
 
-# Currently read4, add2 is best for both backend, so two backend seems identical.
-# They may diverge when we got deeper understanding / further optimization.
-ALIGNMENTS = [
-    8,
-    4,
-    2,
-    1,
-]
-
-
-def _find_max_alignment(number: int) -> int:
-    """
-    Return the first alignment value that meets the alignment requirement
-    for accessing the `number` of elements.
-    """
-    for alignment in ALIGNMENTS:
-        if number % alignment == 0:
-            return alignment
-    return 1
-
-
 def find_max_alignment_for_accessor(accessor: TensorAccessor) -> int:
     """the max alignment value that meets the requirement specified by
        the accessor
@@ -105,17 +85,21 @@ def find_max_alignment_for_accessor(accessor: TensorAccessor) -> int:
     int
         the max alignment value
     """
-    alignment = _find_max_alignment(accessor.offset)
+    align = alignment.find_max_alignment(accessor.offset)
     if not accessor.is_contiguous:
-        alignment = min(
-            alignment,
-            _find_max_alignment(accessor.original_total_elements_from_stride_dim),
+        align = min(
+            align,
+            alignment.find_max_alignment(
+                accessor.original_total_elements_from_stride_dim
+            ),
         )
-        alignment = min(
-            alignment,
-            _find_max_alignment(accessor.actual_total_elements_from_stride_dim),
+        align = min(
+            align,
+            alignment.find_max_alignment(
+                accessor.actual_total_elements_from_stride_dim
+            ),
         )
-    return alignment
+    return align
 
 
 def find_max_alignment_for_accessors(accessors: List[TensorAccessor]) -> int:
@@ -132,11 +116,11 @@ def find_max_alignment_for_accessors(accessors: List[TensorAccessor]) -> int:
     int
         the max alignment value
     """
-    alignment = max(ALIGNMENTS)
+    align = max(alignment.ALIGNMENTS)
     # Handle accessors
     for accessor in accessors:
-        alignment = min(alignment, find_max_alignment_for_accessor(accessor))
-    return alignment
+        align = min(align, find_max_alignment_for_accessor(accessor))
+    return align
 
 
 def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> int:
@@ -158,6 +142,6 @@ def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> in
         the max alignment value
     """
     # get initial alignment based on the number of elements being accessed
-    alignment = _find_max_alignment(num_elements)
+    align = alignment.find_max_alignment(num_elements)
     accessor_alignment = find_max_alignment_for_accessors(accessors)
-    return min(alignment, accessor_alignment)
+    return min(align, accessor_alignment)
diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py
index 6d7aadd3c..736ee6482 100644
--- a/python/aitemplate/backend/common/upsampling2d_common.py
+++ b/python/aitemplate/backend/common/upsampling2d_common.py
@@ -23,12 +23,12 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}bilinear_upsampling_luncher(
-{{indent}}    in_ptr,
+{{indent}}bilinear_upsampling_launcher(
+{{indent}}    static_cast<const {{dtype}}*>(in_ptr),
 {% if bias_add %}
-  {{indent}}    res_ptr,
+  {{indent}}    static_cast<const {{dtype}}*>(res_ptr),
 {% endif %}
-{{indent}}    out_ptr,
+{{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -200,11 +200,12 @@
   return (n + m - 1) / m;
 }
 
-void bilinear_upsampling_luncher({{elem_input_type}}* input,
+template<typename ELEM_T>
+void bilinear_upsampling_launcher(const ELEM_T* input,
                     {% if bias_add %}
-                      {{elem_input_type}}* input_res,
+                      const ELEM_T* input_res,
                     {% endif %}
-                      {{elem_output_type}}* output,
+                      ELEM_T* output,
                       const {{index_type}} N,
                       const {{index_type}} H,
                       const {{index_type}} W,
@@ -257,11 +258,11 @@
 } // namespace
 
 void {{function_name}} (
-    {{elem_input_type}}* in_ptr,
+    const void* in_ptr,
     {% if bias_add %}
-      {{elem_input_type}}* res_ptr,
+    const void* res_ptr,
     {% endif %}
-    {{elem_output_type}}* out_ptr,
+    void* out_ptr,
     {{index_type}}* batch,
     {{index_type}}* in_h,
     {{index_type}}* in_w,
@@ -284,11 +285,11 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_input_type}}*,
+  const void*,
   {% if bias_add %}
-    {{elem_input_type}}*,
+  const void*,
   {% endif %}
-  {{elem_output_type}}*,
+  void*,
   {{index_type}}*,
   {{index_type}}*,
   {{index_type}}*,
@@ -304,11 +305,11 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr}}),
+{{indent}}    {{in_ptr}},
 {% if bias_add %}
-  {{indent}}    static_cast<{{elem_input_type}}*>({{res_ptr}}),
+{{indent}}    {{res_ptr}},
 {% endif %}
-{{indent}}    static_cast<{{elem_output_type}}*>({{out_ptr}}),
+{{indent}}    {{out_ptr}},
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_in_h}},
 {{indent}}    {{p_in_w}},
@@ -337,16 +338,10 @@ def gen_function_decl(func_attrs, backend_spec, bias_add=False):
     str
         Rendered function declaration stmt
     """
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         bias_add=bias_add,
     )
 
@@ -383,14 +378,10 @@ def gen_function_call(func_attrs, backend_spec, indent="  ", bias_add=False):
     xshape = x._attrs["shape"]
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     if bias_add:
         r = func_attrs["inputs"][1]
         return FUNC_CALL_TEMPLATE.render(
             func_name=func_attrs["name"],
-            elem_input_type=input_type,
-            elem_output_type=output_type,
             index_type=backend_spec.index_type,
             in_ptr=x._attrs["name"],
             res_ptr=r._attrs["name"],
@@ -408,8 +399,6 @@ def gen_function_call(func_attrs, backend_spec, indent="  ", bias_add=False):
     else:
         return FUNC_CALL_TEMPLATE.render(
             func_name=func_attrs["name"],
-            elem_input_type=input_type,
-            elem_output_type=output_type,
             index_type=backend_spec.index_type,
             in_ptr=x._attrs["name"],
             out_ptr=y._attrs["name"],
diff --git a/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
index 8431e5d87..fd0ca6c50 100644
--- a/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
+++ b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
@@ -21,14 +21,10 @@
 
 import jinja2
 
-from ... import builder
-from ...target import Target
 from .efficient_nms_kernel import kernel
 
 # pylint: disable=C0301
 
-FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
-
 FUNC_TEMPLATE = jinja2.Template(
     """
 {{header_files}}
@@ -96,7 +92,7 @@
   int batchSize = std::stoi(argv[1]);
   int numScoreElements = std::stoi(argv[2]);
   int numClasses = std::stoi(argv[3]);
-  GLOBAL_WORKSPACE_SIZE = EfficientNMSWorkspaceSize<half>(batchSize, numScoreElements, numClasses);
+  GLOBAL_WORKSPACE_SIZE = EfficientNMSWorkspaceSize<{{elem_input_type}}>(batchSize, numScoreElements, numClasses);
 
   std::cout << "TIME:" << runtime_ms << std::endl;
   std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
@@ -106,12 +102,12 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(int64_t* num_detections,
-                   half* detection_boxes,
-                   half* detection_scores,
-                   int64_t* detection_classe,
-                   const half* proposals,
-                   const half* fgScores,
+void {{func_name}}(void* num_detections,
+                   void* detection_boxes,
+                   void* detection_scores,
+                   void* detection_classe,
+                   const void* proposals,
+                   const void* fgScores,
                    int64_t* batch,
                    int64_t* num_rois,
                    int64_t* num_classes,
@@ -147,7 +143,7 @@
 {{indent}}    {{nmsMaxOut}},
 {{indent}}    {{iouThreshold}},
 {{indent}}    {{minBoxSize}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -155,9 +151,16 @@
 
 def gen_function(func_attrs: Dict[str, Any], header_files, backend_spec) -> str:
     """the function for generating nms kernel"""
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         header_files=header_files,
-        kernel=kernel.render(prefix=backend_spec.prefix, cub=backend_spec.cub),
+        kernel=kernel.render(
+            prefix=backend_spec.prefix,
+            cub=backend_spec.cub,
+            elem_input_type=elem_input_type,
+        ),
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], prefix=backend_spec.prefix
         ),
@@ -178,21 +181,12 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
     assert len(func_attrs["outputs"]) == 4
     assert len(func_attrs["inputs"]) == 2
 
-    num_detections = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
-    detection_boxes = backend_spec.cast_to_half_ptr_template.render(
-        name=func_attrs["outputs"][1]._attrs["name"]
-    )
-    detection_scores = backend_spec.cast_to_half_ptr_template.render(
-        name=func_attrs["outputs"][2]._attrs["name"]
-    )
-    detection_classes = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][3]._attrs["name"]
-    )
+    num_detections = func_attrs["outputs"][0]._attrs["name"]
+    detection_boxes = func_attrs["outputs"][1]._attrs["name"]
+    detection_scores = func_attrs["outputs"][2]._attrs["name"]
+    detection_classes = func_attrs["outputs"][3]._attrs["name"]
     (input_name, score_name) = (
-        backend_spec.cast_to_half_ptr_template.render(name=input_tensor._attrs["name"])
-        for input_tensor in func_attrs["inputs"]
+        input_tensor._attrs["name"] for input_tensor in func_attrs["inputs"]
     )
 
     x = func_attrs["inputs"][0]
@@ -235,16 +229,21 @@ def gen_profiler(func_attrs, workdir, header_files, backend_spec):
     """the function for generating profiler for nms op"""
     op_type = func_attrs["op"]
     file_pairs = []
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     code = PROFILER_TEMPLATE.render(
         header_files=header_files,
-        kernel=kernel.render(prefix=backend_spec.prefix, cub=backend_spec.cub),
+        elem_input_type=elem_input_type,
+        kernel=kernel.render(
+            prefix=backend_spec.prefix,
+            cub=backend_spec.cub,
+            elem_input_type=elem_input_type,
+        ),
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], prefix=backend_spec.prefix
         ),
     )
     op_name = func_attrs["op"]
     add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True))
+    return file_pairs
diff --git a/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py b/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py
index 5d5631f14..c40b01e7c 100644
--- a/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py
+++ b/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py
@@ -1143,7 +1143,7 @@
   if (param.scoreBits <= 0 || param.scoreBits > 10) {
     param.scoreBits = -1;
   }
-  EfficientNMSDispatch<__half>(
+  EfficientNMSDispatch<{{elem_input_type}}>(
       param,
       boxesInput,
       scoresInput,
diff --git a/python/aitemplate/backend/common/vision_ops/nms_common.py b/python/aitemplate/backend/common/vision_ops/nms_common.py
index 50cc5e356..53e2b6f31 100644
--- a/python/aitemplate/backend/common/vision_ops/nms_common.py
+++ b/python/aitemplate/backend/common/vision_ops/nms_common.py
@@ -21,8 +21,6 @@
 
 import jinja2
 
-from ... import builder
-from ...target import Target
 from .nms_kernel import KERNEL_TEMPLATE
 
 # pylint: disable=C0301
@@ -43,7 +41,9 @@
 
     const int N = *batch;
     const int R = *num_rois;
-    nmsGpu<half, half>(stream, N, R, preNmsTop, nmsMaxOut, iouThreshold, minBoxSize, fgScores, proposals, workspace, rois);
+    nmsGpu<{{elem_scores_type}}, {{elem_rois_type}}>(
+        stream, N, R, preNmsTop, nmsMaxOut, iouThreshold, minBoxSize,
+        fgScores, proposals, workspace, rois);
 }
     """
 )
@@ -70,9 +70,9 @@
 
   float runtime_ms = 0;
   const int64_t offsets_bytes = GetCudaAlignedSize((instance_num+1) * sizeof(int64_t));
-  const int64_t scores_bytes = GetCudaAlignedSize(elem_cnt * sizeof(half));
-  const int64_t boxes_bytes = GetCudaAlignedSize(elem_cnt * 4 * sizeof(half));
-  int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending<half, int64_t>(instance_num, instance_size);
+  const int64_t scores_bytes = GetCudaAlignedSize(elem_cnt * sizeof({{elem_scores_type}}));
+  const int64_t boxes_bytes = GetCudaAlignedSize(elem_cnt * 4 * sizeof({{elem_rois_type}}));
+  int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending<{{elem_scores_type}}, int64_t>(instance_num, instance_size);
 
   GLOBAL_WORKSPACE_SIZE = GetCudaAlignedSize(offsets_bytes + scores_bytes + boxes_bytes + temp_storage_bytes);
 
@@ -84,9 +84,9 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* rois,
-                   const half* proposals,
-                   const half* fgScores,
+void {{func_name}}(void* rois,
+                   const void* proposals,
+                   const void* fgScores,
                    int64_t* batch,
                    int64_t* num_rois,
                    const {{index_type}} preNmsTop,
@@ -114,7 +114,7 @@
 {{indent}}    {{nmsMaxOut}},
 {{indent}}    {{iouThreshold}},
 {{indent}}    {{minBoxSize}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -129,8 +129,16 @@ def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) ->
     else:
         cuda_hmaxmin = False
 
+    elem_rois_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_scores_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][1]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         T_SIZE=t_size,
+        elem_scores_type=elem_scores_type,
+        elem_rois_type=elem_rois_type,
         header_files=header_files,
         kernel=KERNEL_TEMPLATE.render(
             prefix=backend_spec.prefix, cub=backend_spec.cub, cuda_hmaxmin=cuda_hmaxmin
@@ -159,12 +167,9 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent: str) ->
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 2
 
-    output_name = backend_spec.cast_to_half_ptr_template.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, score_name) = (
-        backend_spec.cast_to_half_ptr_template.render(name=input_tensor._attrs["name"])
-        for input_tensor in func_attrs["inputs"]
+        input_tensor._attrs["name"] for input_tensor in func_attrs["inputs"]
     )
 
     x = func_attrs["inputs"][0]
@@ -215,8 +220,16 @@ def gen_profiler(
     else:
         cuda_hmaxmin = False
 
+    elem_rois_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_scores_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][1]._attrs["dtype"]
+    )
     code = PROFILER_TEMPLATE.render(
         T_SIZE=t_size,
+        elem_scores_type=elem_scores_type,
+        elem_rois_type=elem_rois_type,
         header_files=header_files,
         kernel=KERNEL_TEMPLATE.render(
             prefix=backend_spec.prefix, cub=backend_spec.cub, cuda_hmaxmin=cuda_hmaxmin
@@ -229,7 +242,4 @@ def gen_profiler(
     )
     op_name = func_attrs["op"]
     add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True))
+    return file_pairs
diff --git a/python/aitemplate/backend/common/vision_ops/roi_align_common.py b/python/aitemplate/backend/common/vision_ops/roi_align_common.py
index b658b711f..d7c64d60e 100644
--- a/python/aitemplate/backend/common/vision_ops/roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/roi_align_common.py
@@ -23,10 +23,10 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}roi_align_launcher<float, {{num_rois}}, {{pooled_size}}>(
-{{indent}}    in_ptr,
-{{indent}}    rois_ptr,
-{{indent}}    out_ptr,
+{{indent}}roi_align_launcher<{{library_dtype}}, float, {{num_rois}}, {{pooled_size}}>(
+{{indent}}    static_cast<const {{library_dtype}}*>(in_ptr),
+{{indent}}    static_cast<const {{library_dtype}}*>(rois_ptr),
+{{indent}}    static_cast<{{library_dtype}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -212,10 +212,10 @@
 }
 
 
-template <typename T, int64_t num_rois, int pool_size>
-void roi_align_launcher({{elem_input_type}}* input,
-                      {{elem_input_type}}* rois,
-                      {{elem_output_type}}* output,
+template <typename LibraryT, typename T, int64_t num_rois, int pool_size>
+void roi_align_launcher(const LibraryT* input,
+                        const LibraryT* rois,
+                        LibraryT* output,
                       const {{index_type}} N,
                       const {{index_type}} H,
                       const {{index_type}} W,
@@ -243,9 +243,9 @@
 } // namespace
 
 void {{function_name}} (
-    {{elem_input_type}}* in_ptr,
-    {{elem_input_type}}* rois_ptr,
-    {{elem_output_type}}* out_ptr,
+    const void* in_ptr,
+    const void* rois_ptr,
+    void* out_ptr,
     {{index_type}}* batch,
     {{index_type}}* in_h,
     {{index_type}}* in_w,
@@ -273,9 +273,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_output_type}}*,
+  const void*,
+  const void*,
+  void*,
   {{index_type}}*,
   {{index_type}}*,
   {{index_type}}*,
@@ -295,9 +295,9 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr}}),
-{{indent}}    static_cast<{{elem_input_type}}*>({{rois_ptr}}),
-{{indent}}    static_cast<{{elem_output_type}}*>({{out_ptr}}),
+{{indent}}    {{in_ptr}},
+{{indent}}    {{rois_ptr}},
+{{indent}}    {{out_ptr}},
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_in_h}},
 {{indent}}    {{p_in_w}},
@@ -330,16 +330,10 @@ def gen_function_decl(func_attrs, backend_spec):
     str
         Rendered function declaration stmt
     """
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
     )
 
 
@@ -364,9 +358,6 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
 
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
-
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         in_ptr=x._attrs["name"],
@@ -386,7 +377,5 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
         if func_attrs["continuous_coordinate"]
         else "false",
         backend_spec=backend_spec,
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py
index 38586aab5..f2ff7c11f 100644
--- a/python/aitemplate/backend/cuda/__init__.py
+++ b/python/aitemplate/backend/cuda/__init__.py
@@ -19,6 +19,7 @@
 from . import cuda_common, lib_template, target_def, utils
 from .common import *
 from .conv2d import *
+from .conv3d import *
 from .elementwise import *
 from .embedding import *
 from .gemm_special import *
diff --git a/python/aitemplate/backend/cuda/attention/__init__.py b/python/aitemplate/backend/cuda/attention/__init__.py
index 61a47c3ad..9636980b4 100644
--- a/python/aitemplate/backend/cuda/attention/__init__.py
+++ b/python/aitemplate/backend/cuda/attention/__init__.py
@@ -15,6 +15,6 @@
 """
 cuda flash_attention module init
 """
-from . import flash_attention
+from . import flash_attention, mem_eff_attention
 
-__all__ = ["flash_attention"]
+__all__ = ["flash_attention", "mem_eff_attention"]
diff --git a/python/aitemplate/backend/cuda/attention/flash_attention.py b/python/aitemplate/backend/cuda/attention/flash_attention.py
index b2fe5c0ca..55d781ceb 100644
--- a/python/aitemplate/backend/cuda/attention/flash_attention.py
+++ b/python/aitemplate/backend/cuda/attention/flash_attention.py
@@ -23,10 +23,6 @@
 
 # pylint: disable=C0301
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
-
 FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int*>({{name}})")
 
 FUNC_CALL_FP32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<float*>({{name}})")
@@ -202,8 +198,8 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output,
-                   const half* qkv,
+void {{func_name}}(void* output,
+                   const void* qkv,
                    const int* cu_seqlens,
                    float* softmax_lse,
                    float* o_tmp,
@@ -275,13 +271,9 @@ def flash_attention_gen_function_call(func_attrs, indent="  "):
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 2
 
-    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
 
-    qkv_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["inputs"][0]._attrs["name"]
-    )
+    qkv_name = func_attrs["inputs"][0]._attrs["name"]
 
     seqlens_name = FUNC_CALL_INT32_PARAM_TEMPLATE.render(
         name=func_attrs["inputs"][1]._attrs["name"]
@@ -303,8 +295,8 @@ def flash_attention_gen_function_call(func_attrs, indent="  "):
         output=output_name,
         qkv=qkv_name,
         cu_seqlens=seqlens_name,
-        softmax_lse="reinterpret_cast<float*>(global_workspace)",
-        o_tmp="reinterpret_cast<float*>(global_workspace + {} * sizeof(float))".format(
+        softmax_lse="reinterpret_cast<float*>(global_workspace_)",
+        o_tmp="reinterpret_cast<float*>(global_workspace_ + {} * sizeof(float))".format(
             batch_size * num_heads * seq_len
         ),
         batch_size=batch_size,
diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
new file mode 100644
index 000000000..3948182d2
--- /dev/null
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -0,0 +1,262 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+attention kernel codegen for CUDA.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <cuda_fp16.h>
+#include "cutlass/cutlass.h"
+#include "kernel_forward.h"
+
+{{func_signature}}
+{
+
+    /*
+    problem_sizes0 [b, m, n, k]
+    [head_number * batch_size, m, mkv, k0]
+    [head_number * batch_size, seq_length, seq_length_kv, head_size]
+
+    problem_sizes1
+    [head_number * batch_size, m, k1, mkv]
+    [head_number * batch_size, seq_length, head_size_v, seq_length_kv]
+
+    m = seq_len
+    n = seq_len
+    k = head_size
+
+    Q: B, M, K
+    K: B, N, K
+    P: B, M, N
+    V: B, N, K
+    O: B, M, K
+    output: bs, num_head, seq_len, head_size
+    */
+
+
+    using ArchTag = cutlass::arch::Sm80;
+    constexpr bool kIs64x64 = {{kIs64x64}};
+    constexpr bool kSingleValueIteration = {{kSingleValueIteration}};
+
+    // Set grid size
+    constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32;
+    constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128;
+    if (kIs64x64 && head_size_v > kKeysPerBlock) {
+        std::cerr << "WARNING: you will get better performance with `kIs64x64=false`";
+    }
+    if (kSingleValueIteration && head_size_v > kKeysPerBlock) {
+        std::cerr << "ERROR  : Use kSingleValueIteration to keep output in RF. " \
+        "This requires to have `head_size <= kKeysPerBlock` " \
+        "but head_size_v=" << head_size_v << " and kKeysPerBlock=" << kKeysPerBlock << "";
+        return;
+    }
+    if (!kSingleValueIteration && head_size_v <= kKeysPerBlock) {
+        std::cerr << "WARNING: you will get better performance with `kSingleValueIteration=true` (keeps the output in RF rather than GMEM)";
+    }
+
+    using Attention = AttentionKernel<
+        {{elem_input_type}}, // scalar_t
+        ArchTag,
+        true, // memory is aligned
+        kQueriesPerBlock,
+        kKeysPerBlock,
+        kSingleValueIteration
+    >;
+
+    int block_O_size = (*batch_size) * seq_len * num_heads * head_size_v;
+    typename Attention::Params p;
+    {
+        // set parameters
+        p.query_ptr = static_cast<{{elem_input_type}}*>(query);
+        p.key_ptr = static_cast<{{elem_input_type}}*>(key);
+        p.value_ptr = static_cast<{{elem_input_type}}*>(value);
+        p.logsumexp_ptr = nullptr; // Only needed for bw
+        p.output_accum_ptr = nullptr;
+        if (Attention::kNeedsOutputAccumulatorBuffer) {
+          p.output_accum_ptr = accum_ptr;
+        }
+        p.output_ptr = static_cast<{{elem_input_type}}*>(output);
+
+        p.num_heads = num_heads;
+        p.num_batches = *batch_size;
+        p.head_dim = head_size;
+        p.head_dim_value = head_size_v;
+        p.num_queries = seq_len;
+        p.num_keys = seq_len_kv;
+        p.causal = is_causal;
+
+
+        p.q_strideM = head_size;
+        p.k_strideM = head_size;
+        p.v_strideM = head_size_v;
+
+        p.q_strideH = p.q_strideM * seq_len;
+        p.k_strideH = p.k_strideM * seq_len_kv;
+        p.v_strideH = p.v_strideM * seq_len_kv;
+        p.o_strideH = head_size_v;
+        p.q_strideB = p.q_strideH * num_heads;
+        p.k_strideB = p.k_strideH * num_heads;
+        p.v_strideB = p.v_strideH * num_heads;
+        p.o_strideB = head_size_v * seq_len * num_heads;
+    }
+
+    // launch kernel
+    constexpr auto kernel_fn = attention_kernel_batched_impl<Attention>;
+    int smem_bytes = sizeof(typename Attention::SharedStorage);
+    if (smem_bytes > 0xc000) {
+      cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    }
+    if (!Attention::check_supported(p)) {
+      std::cerr << "Kernel does not support these inputs" << std::endl;
+      return;
+    }
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+
+    cudaError_t err = cudaDeviceSynchronize();
+
+    if (err != cudaSuccess)  {
+      std::cerr << "Kernel execution error: " << cudaGetErrorString(err);
+      return;
+    }
+
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   void* query,
+                   void* key,
+                   void* value,
+                   float* accum_ptr,
+                   int64_t* batch_size,
+                   int seq_len,
+                   int seq_len_kv,
+                   int num_heads,
+                   int head_size,
+                   int head_size_v,
+                   float p_dropout,
+                   float softmax_scale,
+                   bool is_causal,
+                   cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    {{query}}, {{key}}, {{value}},
+{{indent}}    {{accum_ptr}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{seq_len}},
+{{indent}}    {{seq_len_kv}},
+{{indent}}    {{num_heads}},
+{{indent}}    {{head_size}},
+{{indent}}    {{head_size_v}},
+{{indent}}    {{p_dropout}},
+{{indent}}    {{softmax_scale}},
+{{indent}}    {{is_causal}}, stream /* default stream */
+{{indent}});
+    """
+)
+
+
+@registry.reg("cuda.mem_eff_attention.gen_function")
+def mem_eff_attention_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    return FUNC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        kIs64x64="true" if func_attrs["head_size"] <= 64 else "false",
+        kSingleValueIteration="true" if func_attrs["head_size"] <= 128 else "false",
+    )
+
+
+@registry.reg("cuda.mem_eff_attention.func_decl")
+def mem_eff_attention_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("cuda.mem_eff_attention.func_call")
+def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    output_name = ""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) == 3
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    batch_size = "&" + xshape[0]._attrs["name"]
+    seq_len = x._attrs["shape"][2]._attrs["values"][0]
+
+    num_heads = x._attrs["shape"][1]._attrs["values"][0]
+    head_size = x._attrs["shape"][3]._attrs["values"][0]
+    p_dropout = func_attrs["dropout"]
+    is_causal = func_attrs["causal"]
+    softmax_scale = head_size ** (-0.5)
+
+    v = func_attrs["inputs"][2]
+    seq_len_kv = v._attrs["shape"][2]._attrs["values"][0]
+    head_size_v = v._attrs["shape"][3]._attrs["values"][0]
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        query=q_name,
+        key=k_name,
+        value=v_name,
+        accum_ptr="reinterpret_cast<float*>(global_workspace_)",
+        batch_size=batch_size,
+        seq_len=seq_len,
+        seq_len_kv=seq_len_kv,
+        num_heads=num_heads,
+        head_size=head_size,
+        head_size_v=head_size_v,
+        p_dropout=p_dropout,
+        softmax_scale=softmax_scale,
+        is_causal="true" if is_causal else "false",
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha.h b/python/aitemplate/backend/cuda/attention/src/fmha.h
index 9cc516722..066f236c7 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
index 433676370..254abe31b 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
index 119ac6a6f..fa00d5984 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
index 27aad1b80..3b7487e3b 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/mask.h b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
index ec07012af..358acb90a 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
index 0bb8285d2..c3f87a71d 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
index 02e82c427..ec5461966 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/utils.h b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
index 7bc0b3df9..4a95ccce6 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
index 46bddc48e..92756cc6f 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
index 89776414a..d90ab5065 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /***************************************************************************************************
  * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
index 9de497e7f..94dd66718 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
index 5031d81a0..aa4138983 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
index 1cd4c191c..86f39f3c7 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /***************************************************************************************************
  * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
index 43692802b..41f49ffda 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_utils.h b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
index af8456621..a27bd40d9 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/philox.cuh b/python/aitemplate/backend/cuda/attention/src/philox.cuh
index 36e788400..4ab1a63ff 100644
--- a/python/aitemplate/backend/cuda/attention/src/philox.cuh
+++ b/python/aitemplate/backend/cuda/attention/src/philox.cuh
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 // Pytorch also has an implementation of Philox RNG:
 // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
 #pragma once
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 9e0de0d91..61c6c05f8 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -16,12 +16,15 @@
 common template for conv2d
 """
 import re
+
 from collections import OrderedDict
 from hashlib import sha1
 from typing import List
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ...target import Target
 from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
 
@@ -153,15 +156,19 @@ def gen_function(
     inst_def_flag = set()
     instances = {}
     instance_decl = ""
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     for key, value in exec_path.items():
         fname = "f" + sha1(key.encode()).hexdigest()
+
+        emit_instance = f_emit_instance(op_instance[value])
         if value not in inst_def_flag:
-            config = f_emit_instance(op_instance[value])
             inst_def_flag.add(value)
+            config = emit_instance
         else:
             config = ""
         inst = instance_template.render(
-            config=config, name=fname, config_name=extract_config_name(config)
+            config=config, name=fname, config_name=extract_config_name(emit_instance)
         )
         instances[key] = inst
         instance_decl += inst
@@ -191,13 +198,16 @@ def gen_function(
     exec_paths = ""
     for key in instances:
         fname = "f" + sha1(key.encode()).hexdigest()
-        program = exec_template.render(indent="    ", instance=fname)
+        program = exec_template.render(
+            indent=" " * 4,
+            instance=fname,
+            dtype=dtype,
+        )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return src_template.render(
         instances=instance_decl,
         function_name=func_name,
-        dtype="cutlass::half_t",
         shape_function=shape_func,
         exec_paths=exec_paths,
         extra_header=extra_header,
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
index ddcef02b3..aa48d92f9 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from . import common
 
 # pylint: disable=C0103,C0301
@@ -34,10 +36,10 @@
 //  TODO: cast to right dtype
 {{indent}}typename {{instance}}::Arguments arguments{
 {{indent}}    problem_size,
-{{indent}}    {(cutlass::half_t*)(in_ptr), layout_A},
-{{indent}}    {(cutlass::half_t*)(weight_ptr), layout_B},
-{{indent}}    {(cutlass::half_t*)(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},
-{{indent}}    {(cutlass::half_t*)(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
 {{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
 {{indent}}};
 {{indent}}{{instance}} implicit_gemm_op;
@@ -89,10 +91,10 @@
 {{instances_def}}
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* weight_ptr,
-    cutlass::half_t* out_ptr,
-    cutlass::half_t* bias_ptr,
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    void* bias_ptr,
     uint8_t* workspace,
     int64_t* batch,
     int64_t* out_ch,
@@ -177,10 +179,10 @@
   cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
   //
   // warmup
-  conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
-       (cutlass::half_t*) b.device_data(),
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -200,12 +202,12 @@
   for (auto & event : events) {
     cudaEventCreate(&event);
   }
-  cudaEventRecord(events[0]);
+  cudaEventRecord(events[0], stream);
   for (int i = 0; i < 5; ++i) {
-      conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
-       (cutlass::half_t*) b.device_data(),
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -222,7 +224,7 @@
        pad,
        stream);
   }
-  cudaEventRecord(events[1]);
+  cudaEventRecord(events[1], stream);
   cudaEventSynchronize(events[1]);
   float runtime_ms = 0;
   cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
@@ -245,10 +247,10 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
+  void*,
   uint8_t*,
   int64_t*,
   int64_t*,
@@ -275,7 +277,7 @@
 {{indent}}    {{weight_ptr}},
 {{indent}}    {{out_ptr}},
 {{indent}}    {{bias_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_out_ch}},
 {{indent}}    {{p_in_ch}},
@@ -314,6 +316,9 @@ def gen_profiler(func_attrs, workdir, shape_template, extra_header=""):
         dilate="dilation",
         pad="pad",
     )
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     file_pairs = []
     for op_name, op in op_instance.items():
         config = common.emit_instance(op)
@@ -324,12 +329,14 @@ def gen_profiler(func_attrs, workdir, shape_template, extra_header=""):
             config_name=config_name, name=name, config=config
         )
         exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name
+            indent="  ",
+            is_profiler=True,
+            instance=name,
+            dtype=dtype,
         )
         op_func = SRC_TEMPLATE.render(
             instances=instance,
             function_name="conv",
-            dtype="cutlass::half_t",
             shape_func="",
             exec_paths=exec_program,
             extra_header=extra_header,
@@ -339,7 +346,7 @@ def gen_profiler(func_attrs, workdir, shape_template, extra_header=""):
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function_call(func_attrs, indent="  "):
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
index 0647769a1..5439f1fc0 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from . import common
 
 # pylint: disable=C0301,C0103
@@ -34,13 +36,13 @@
 //  TODO: cast to right dtype
 {{indent}}typename {{instance}}::Arguments arguments{
 {{indent}}    problem_size,
-{{indent}}    {(cutlass::half_t*)(in_ptr), layout_A},
-{{indent}}    {(cutlass::half_t*)(weight_ptr), layout_B},
-{{indent}}    {(cutlass::half_t*)(res_ptr), layout_C},
-{{indent}}    {(cutlass::half_t*)(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(res_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
 {{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
 {{indent}}    cutlass::conv::SplitKMode::kSerial,
-{{indent}}    (cutlass::half_t*)(bias_ptr),
+{{indent}}    static_cast<{{dtype}}*>(bias_ptr),
 {{indent}}    nullptr, 0, *out_ch
 {{indent}}};
 {{indent}}{{instance}} implicit_gemm_op;
@@ -90,11 +92,11 @@
 {{instances_def}}
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* weight_ptr,
-    cutlass::half_t* out_ptr,
-    cutlass::half_t* bias_ptr,
-    cutlass::half_t* res_ptr,
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    void* bias_ptr,
+    void* res_ptr,
     uint8_t* workspace,
     int64_t* batch,
     int64_t* out_ch,
@@ -180,11 +182,11 @@
   cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
   //
   // warmup
-  conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
-       (cutlass::half_t*) b.device_data(),
-       (cutlass::half_t*) r.device_data(),
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
+       r.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -204,13 +206,13 @@
   for (auto & event : events) {
     cudaEventCreate(&event);
   }
-  cudaEventRecord(events[0]);
+  cudaEventRecord(events[0], stream);
   for (int i = 0; i < 5; ++i) {
-      conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
-       (cutlass::half_t*) b.device_data(),
-       (cutlass::half_t*) r.device_data(),
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
+       r.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -227,7 +229,7 @@
        pad,
        stream);
   }
-  cudaEventRecord(events[1]);
+  cudaEventRecord(events[1], stream);
   cudaEventSynchronize(events[1]);
   float runtime_ms = 0;
   cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
@@ -251,11 +253,11 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
+  void*,
+  void*,
   uint8_t*,
   int64_t*,
   int64_t*,
@@ -283,7 +285,7 @@
 {{indent}}    {{out_ptr}},
 {{indent}}    {{bias_ptr}},
 {{indent}}    {{res_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_out_ch}},
 {{indent}}    {{p_in_ch}},
@@ -322,6 +324,8 @@ def gen_profiler(func_attrs, workdir, shape_template):
         dilate="dilation",
         pad="pad",
     )
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     file_pairs = []
     for op_name, op in op_instance.items():
         config = common.emit_instance(op)
@@ -331,12 +335,11 @@ def gen_profiler(func_attrs, workdir, shape_template):
             config_name=config_name, name=name, config=config
         )
         exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
         )
         op_func = SRC_TEMPLATE.render(
             instances=instance,
             function_name="conv",
-            dtype="cutlass::half_t",
             shape_func="",
             exec_paths=exec_program,
         )
@@ -345,4 +348,4 @@ def gen_profiler(func_attrs, workdir, shape_template):
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d.py b/python/aitemplate/backend/cuda/conv2d/conv2d.py
index 7e5da403f..3279e2ff7 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import common
 
@@ -35,10 +37,10 @@
 //  TODO: cast to right dtype
 {{indent}}typename {{instance}}::Arguments arguments{
 {{indent}}    problem_size,
-{{indent}}    {(cutlass::half_t*)(in_ptr), layout_A},
-{{indent}}    {(cutlass::half_t*)(weight_ptr), layout_B},
-{{indent}}    {(cutlass::half_t*)(out_ptr), layout_C},
-{{indent}}    {(cutlass::half_t*)(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
 {{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
 {{indent}}};
 {{indent}}{{instance}} implicit_gemm_op;
@@ -87,9 +89,9 @@
 {{instances_def}}
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* weight_ptr,
-    cutlass::half_t* out_ptr,
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
     uint8_t* workspace,
     int64_t* batch,
     int64_t* out_ch,
@@ -175,9 +177,9 @@
 
   //
   // warmup
-  conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -197,11 +199,11 @@
   for (auto & event : events) {
     cudaEventCreate(&event);
   }
-  cudaEventRecord(events[0]);
+  cudaEventRecord(events[0], stream);
   for (int i = 0; i < 5; ++i) {
-      conv((cutlass::half_t*) x.device_data(),
-       (cutlass::half_t*) w.device_data(),
-       (cutlass::half_t*) y.device_data(),
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
        global_workspace,
        &NI,
        &CO,
@@ -218,7 +220,7 @@
        pad,
        stream);
   }
-  cudaEventRecord(events[1]);
+  cudaEventRecord(events[1], stream);
   cudaEventSynchronize(events[1]);
   float runtime_ms = 0;
   cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
@@ -241,9 +243,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
   uint8_t*,
   int64_t*,
   int64_t*,
@@ -269,7 +271,7 @@
 {{indent}}    {{in_ptr}},
 {{indent}}    {{weight_ptr}},
 {{indent}}    {{out_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_out_ch}},
 {{indent}}    {{p_in_ch}},
@@ -317,6 +319,8 @@ def gen_profiler(func_attrs, workdir, shape_template):
         pad="pad",
     )
     file_pairs = []
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     for op_name, op in op_instance.items():
         config = common.emit_instance(op)
         config_name = common.extract_config_name(config)
@@ -325,12 +329,11 @@ def gen_profiler(func_attrs, workdir, shape_template):
             config_name=config_name, name=name, config=config
         )
         exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
         )
         op_func = SRC_TEMPLATE.render(
             instances=instance,
             function_name="conv",
-            dtype="cutlass::half_t",
             shape_func="",
             exec_paths=exec_program,
         )
@@ -339,7 +342,7 @@ def gen_profiler(func_attrs, workdir, shape_template):
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 @registry.reg("cuda.conv2d.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
index c1ce2ac94..c4fb32c42 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
@@ -30,7 +30,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 @registry.reg("cuda.conv2d_bias.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
     """Codegen for conv2d profiler."""
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
index 663495f22..07ecbbff6 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
@@ -67,7 +67,7 @@ def fproc_f16(op):
 
 @registry.reg("cuda.conv2d_bias_add_identity.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cbaa.gen_profiler(func_attrs, workdir, shape_template)
+    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
index 10aa46619..09d975ae4 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
@@ -67,7 +67,7 @@ def fproc_f16(op):
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cbaa.gen_profiler(func_attrs, workdir, shape_template)
+    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
index b6b96704f..5a5e7314b 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
@@ -67,7 +67,7 @@ def fproc_f16(op):
 
 @registry.reg("cuda.conv2d_bias_add_relu.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cbaa.gen_profiler(func_attrs, workdir, shape_template)
+    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
index b8ddfa205..584eddbfe 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
@@ -140,7 +140,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 @registry.reg("cuda.conv2d_bias_few_channels.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
     """generate code for profiling"""
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
index e31ad9095..ccdc3ae1e 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
@@ -28,7 +28,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 
 @registry.reg("cuda.conv2d_bias_hardswish.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
index f305f3344..f8de585fa 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -52,7 +52,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
     """generate code for profiling"""
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
index ea75bdd9d..920e13d5c 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
@@ -28,7 +28,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 
 @registry.reg("cuda.conv2d_bias_relu.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_relu.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
index e207bc10a..39019c5f1 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
@@ -44,7 +44,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 @registry.reg("cuda.conv2d_bias_relu_few_channels.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
     """generate code for profiling"""
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
index 5ad4ccd6a..cbb896e71 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
@@ -29,7 +29,7 @@ def conv2d_config(func_attrs, dtype="float16"):
 
 @registry.reg("cuda.conv2d_bias_sigmoid.gen_profiler")
 def gen_profiler(func_attrs, workdir, shape_template):
-    cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(func_attrs, workdir, shape_template)
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
index b1b6acbc1..574f0d361 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
@@ -19,6 +19,8 @@
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import common, conv2d
 
@@ -51,9 +53,9 @@
 {{instances_def}}
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* weight_ptr,
-    cutlass::half_t* out_ptr,
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
     uint8_t* workspace,
     int64_t* batch,
     int64_t* out_ch,
@@ -209,6 +211,9 @@ def gen_profiler(func_attrs, workdir, shape_template):
         pad="pad",
     )
     file_pairs = []
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     for op_name, op in op_instance.items():
         config = emit_instance(op)
 
@@ -218,12 +223,11 @@ def gen_profiler(func_attrs, workdir, shape_template):
             config_name=config_name, name=name, config=config
         )
         exec_program = conv2d.EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
         )
         op_func = SRC_TEMPLATE.render(
             instances=instance,
             function_name="conv",
-            dtype="cutlass::half_t",
             shape_func="",
             exec_paths=exec_program,
         )
@@ -232,7 +236,7 @@ def gen_profiler(func_attrs, workdir, shape_template):
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 @registry.reg("cuda.transposed_conv2d.filter")
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
index 2df9642fa..35b08d19f 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
@@ -19,6 +19,8 @@
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import common, common_conv2d_bias_activation as cba
 
@@ -51,10 +53,10 @@
 {{instances_def}}
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* weight_ptr,
-    cutlass::half_t* out_ptr,
-    cutlass::half_t* bias_ptr,
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    void* bias_ptr,
     uint8_t* workspace,
     int64_t* batch,
     int64_t* out_ch,
@@ -215,6 +217,8 @@ def gen_profiler(func_attrs, workdir, shape_template):
         dilate="dilation",
         pad="pad",
     )
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     file_pairs = []
     for op_name, op in op_instance.items():
         config = emit_instance(op)
@@ -225,12 +229,11 @@ def gen_profiler(func_attrs, workdir, shape_template):
             config_name=config_name, name=name, config=config
         )
         exec_program = cba.EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
         )
         op_func = SRC_TEMPLATE.render(
             instances=instance,
             function_name="conv",
-            dtype="cutlass::half_t",
             shape_func="",
             exec_paths=exec_program,
         )
@@ -239,7 +242,7 @@ def gen_profiler(func_attrs, workdir, shape_template):
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 @registry.reg("cuda.transposed_conv2d_bias.filter")
diff --git a/python/aitemplate/backend/cuda/conv3d/__init__.py b/python/aitemplate/backend/cuda/conv3d/__init__.py
new file mode 100644
index 000000000..ba1388ae4
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/__init__.py
@@ -0,0 +1,20 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA conv3d module init
+"""
+from . import conv3d, depthwise_conv3d
+
+__all__ = ["conv3d", "depthwise_conv3d"]
diff --git a/python/aitemplate/backend/cuda/conv3d/common.py b/python/aitemplate/backend/cuda/conv3d/common.py
new file mode 100644
index 000000000..461c4e6e9
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/common.py
@@ -0,0 +1,364 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA conv3d common functions
+"""
+import re
+from collections import OrderedDict
+from hashlib import sha1
+from typing import List
+
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ...target import Target
+from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+  int64_t*, // kernel size
+  int64_t*,
+  int64_t*,
+  int, // strides
+  int,
+  int,
+  int, // padding
+  int,
+  int,
+  int, // dilation
+  int,
+  int,
+  int64_t*, // in_batch
+  int64_t*, // in_ch
+  int64_t*, // in_t
+  int64_t*, // in_h
+  int64_t*, // in_w
+  int64_t*, // out_ch
+  int64_t*, // out_t
+  int64_t*, // out_h
+  int64_t*, // out_w
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{p_kernel_t}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{stride_t}},
+{{indent}}    {{stride_h}},
+{{indent}}    {{stride_w}},
+{{indent}}    {{padding_t}},
+{{indent}}    {{padding_h}},
+{{indent}}    {{padding_w}},
+{{indent}}    {{dilation_t}},
+{{indent}}    {{dilation_h}},
+{{indent}}    {{dilation_w}},
+{{indent}}    {{p_in_batch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_in_t}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_out_t}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+def gen_function_decl(func_name):
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_in_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[4]._attrs["name"],
+        p_kernel_t="&" + wshape[1]._attrs["name"],
+        p_kernel_h="&" + wshape[2]._attrs["name"],
+        p_kernel_w="&" + wshape[3]._attrs["name"],
+        p_in_t="&" + xshape[1]._attrs["name"],
+        p_in_h="&" + xshape[2]._attrs["name"],
+        p_in_w="&" + xshape[3]._attrs["name"],
+        p_out_t="&" + yshape[1]._attrs["name"],
+        p_out_h="&" + yshape[2]._attrs["name"],
+        p_out_w="&" + yshape[3]._attrs["name"],
+        stride_t=func_attrs["stride"][0],
+        stride_h=func_attrs["stride"][1],
+        stride_w=func_attrs["stride"][2],
+        padding_t=func_attrs["pad"][0],
+        padding_h=func_attrs["pad"][1],
+        padding_w=func_attrs["pad"][2],
+        dilation_t=func_attrs["dilate"][0],
+        dilation_h=func_attrs["dilate"][1],
+        dilation_w=func_attrs["dilate"][2],
+        indent=indent,
+    )
+
+
+KERNEL_KEY_TEMPLATE = jinja2.Template(
+    """
+cutlass{{opcode_class}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}}
+"""
+)
+
+
+def kernel_name(op):
+    """generate cuda kernel name"""
+    from cutlass_lib import library
+
+    threadblock = op.tile_description.procedural_name()
+    extended_name = op.extended_name()
+    opcode_class_name = library.OpcodeClassNames[
+        op.tile_description.math_instruction.opcode_class
+    ]
+    layout = "ndhwc"  # op.layout_name()
+    align_ab = op.A.alignment
+    align_c = op.C.alignment
+    name = KERNEL_KEY_TEMPLATE.render(
+        threadblock=threadblock,
+        extended_name=extended_name,
+        opcode_class_name=opcode_class_name,
+        layout=layout,
+        align_ab=align_ab,
+        align_c=align_c,
+    )
+    return name.replace("\n", "")
+
+
+def emit_instance(op):
+    """emit instance"""
+    import cutlass_lib
+
+    # if hasattr(op, "binary_op"):
+    #     emiter = cutlass_lib.conv3d_operation.EmitConv3dWithBroadcastInstance()
+    # else:
+    #     emiter = cutlass_lib.conv3d_operation.EmitConv3dInstance()
+    emiter = cutlass_lib.conv3d_operation.EmitConv3dInstance()
+    op_def = emiter.emit(op)
+    return op_def
+
+
+def extract_config(func_attrs, f_proc_op=None):
+    """Extracts cutlass config for conv kernels."""
+    import copy
+
+    import cutlass_lib
+
+    def f_proc_op_default(op):
+        # import cutlass_lib
+        ret = []
+        data_type = cutlass_lib.library.DataType.f16
+        acc_type = cutlass_lib.library.DataType.f32
+        # check target use fp16 acc
+        if "use_fp16_acc" in Target.current()._kwargs:
+            if Target.current()._kwargs["use_fp16_acc"]:
+                acc_type = cutlass_lib.library.DataType.f16
+
+        if (
+            op.A.element == data_type
+            and op.B.element == data_type
+            and op.C.element == data_type
+            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
+            and op.tile_description.math_instruction.element_accumulator == acc_type
+        ):
+
+            op = copy.deepcopy(op)
+            # set epilogue
+            epilogue_name = func_attrs["epilogue"]
+            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
+            op.element_epilogue = acc_type
+            # set C alignment
+            for i in [8, 4, 2, 1]:
+                op = copy.deepcopy(op)
+                op.C.alignment = i
+                ret.append(op)
+        return ret
+
+    op_kind = cutlass_lib.library.OperationKind.Conv3d
+    conv_kind = cutlass_lib.library.ConvKind.Fprop
+    ret = []
+    conv3d_ops = OrderedDict()
+    extract_ops = list(Target.current()._operators[op_kind].items())
+
+    for _, value in extract_ops:
+        op = value[0]
+        if op.conv_kind == conv_kind:
+            if f_proc_op is None:
+                ret = f_proc_op_default(op)
+            else:
+                ret = f_proc_op(op)
+            if len(ret) > 0:
+                for op_inst in ret:
+                    key = kernel_name(op_inst)
+                    conv3d_ops[key] = op_inst
+
+    return conv3d_ops
+
+
+def extract_config_name(config):
+    """Extracts config name from a given config."""
+    pattern = re.compile(r"\s*using\s(.*?)\s=")
+    decl = config.split("\n")[2]
+    match = pattern.match(decl)
+    if match is None:
+        raise RuntimeError("Invalid config: \n" + config)
+    return match.groups()[0]
+
+
+def gen_function(
+    func_attrs,
+    instance_template,
+    exec_template,
+    src_template,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+    f_emit_instance=emit_instance,
+    extra_header="",
+):
+    """Function definition codegen."""
+    func_name = func_attrs["name"]
+    exec_path = func_attrs["exec_path"]
+    op_instance = func_attrs["op_instance"]
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    inst_def_flag = set()
+    instances = {}
+    instance_decl = ""
+    for key, value in exec_path.items():
+        fname = "f" + sha1(key.encode()).hexdigest()
+        if value not in inst_def_flag:
+            config = f_emit_instance(op_instance[value])
+            inst_def_flag.add(value)
+        else:
+            config = ""
+        inst = instance_template.render(
+            config=config, name=fname, config_name=extract_config_name(config)
+        )
+        instances[key] = inst
+        instance_decl += inst
+    shape_eval_func = shape_eval_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        x_dim0="*batch",
+        x_dim1="*in_d",
+        x_dim2="*in_h",
+        x_dim3="*in_w",
+        x_dim4="*in_ch",
+        w_dim0="*out_ch",
+        w_dim1="*kernel_d",
+        w_dim2="*kernel_h",
+        w_dim3="*kernel_w",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilate_d="dilation_d",
+        dilate_h="dilation_h",
+        dilate_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+        div="/",
+    )
+    shape_save_func = shape_save_template.render(
+        indent="  ",
+        y_dim0="*out_batch",
+        y_dim1="*out_d",
+        y_dim2="*out_h",
+        y_dim3="*out_w",
+        y_dim4="*out_ch",
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = ""
+    for key in instances:
+        fname = "f" + sha1(key.encode()).hexdigest()
+        program = exec_template.render(indent="    ", instance=fname, dtype=dtype)
+        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_paths += exec_inst
+    return src_template.render(
+        instances=instance_decl,
+        function_name=func_name,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+        extra_header=extra_header,
+    )
+
+
+def cal_align_ab(x_shape: List[int]) -> int:
+    """Returns input alignment."""
+    k = x_shape[4]  # CI
+    if k % 8 == 0:
+        return 8
+    if k % 4 == 0:
+        return 4
+    if k % 2 == 0:
+        return 2
+    raise RuntimeError("a/b is not aligned")
+
+
+def function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    ab_alignment = cal_align_ab(x_shape)
+    tmp = cfg.split("_")
+    align_c = int(tmp[-1])
+    align_ab = int(tmp[-2])
+    if align_c != func_attrs["epilogue_alignment"]:
+        return False
+    if align_ab != ab_alignment:
+        return False
+    return True
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d.py b/python/aitemplate/backend/cuda/conv3d/conv3d.py
new file mode 100644
index 000000000..045092131
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d.py
@@ -0,0 +1,496 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen for conv3d.
+"""
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ... import registry
+from . import common
+
+# pylint: disable=C0103,C0415,W0613,C0301
+
+INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{config}}
+using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
+"""
+)
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
+//  TODO: cast to right dtype
+{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}    problem_size,
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+{{indent}}};
+{{indent}}{{instance}} implicit_gemm_op;
+{% if is_profiler %}
+{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% endif %}
+{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <string>
+#include <stdexcept>
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv3d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+{{extra_header}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
+          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
+      std::cerr << msg << std::endl;                                                  \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{{instances_def}}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    uint8_t* workspace,
+    int64_t* batch,
+    int64_t* out_ch,
+    int64_t* in_ch,
+    int64_t* kernel_d,
+    int64_t* kernel_h,
+    int64_t* kernel_w,
+    int64_t* in_d,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_d,
+    int64_t* out_h,
+    int64_t* out_w,
+    int stride_d,
+    int stride_h,
+    int stride_w,
+    int dilation_d,
+    int dilation_h,
+    int dilation_w,
+    int pad_d,
+    int pad_h,
+    int pad_w,
+    cudaStream_t stream
+  ) {
+
+  {{shape_function}}
+  int i32_batch = *batch;
+  int i32_in_d = *in_d;
+  int i32_in_h = *in_h;
+  int i32_in_w = *in_w;
+  int i32_in_ch = *in_ch;
+  int i32_out_ch = *out_ch;
+  int i32_kernel_d = *kernel_d;
+  int i32_kernel_h = *kernel_h;
+  int i32_kernel_w = *kernel_w;
+  int i32_out_batch = *out_batch;
+  int i32_out_d = *out_d;
+  int i32_out_h = *out_h;
+  int i32_out_w = *out_w;
+
+  using cutlass::layout::TensorNDHWC;
+  TensorNDHWC layout_A(TensorNDHWC::packed(cutlass::make_Coord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch)));
+  TensorNDHWC layout_B(TensorNDHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch)));
+  TensorNDHWC layout_C(TensorNDHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_d, i32_out_h, i32_out_w, i32_out_ch)));
+
+  cutlass::conv::Conv3dProblemSize problem_size(
+    cutlass::Tensor5DCoord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch),
+    cutlass::Tensor5DCoord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch),
+    cutlass::make_Coord(pad_d, pad_h, pad_w),
+    cutlass::make_Coord(stride_d, stride_h, stride_w),
+    cutlass::make_Coord(dilation_d, dilation_h, dilation_w),
+    cutlass::conv::Mode::kCrossCorrelation,
+    1,
+    1
+  );
+
+  {{exec_paths}}
+  throw std::runtime_error(
+      "Unsupported workload for this conv3d specialization."
+  );
+}
+"""
+)
+
+
+PROFILER_TEMPLATE = jinja2.Template(
+    """
+size_t GLOBAL_WORKSPACE_SIZE = 0;
+
+{{op_func}}
+
+int main(int argc, char** argv) {
+  int64_t batch = std::stoi(argv[1]);
+  int64_t in_d = std::stoi(argv[2]);
+  int64_t in_h = std::stoi(argv[3]);
+  int64_t in_w = std::stoi(argv[4]);
+  int64_t in_ch = std::stoi(argv[5]);
+  int64_t kernel_d = std::stoi(argv[6]);
+  int64_t kernel_h = std::stoi(argv[7]);
+  int64_t kernel_w = std::stoi(argv[8]);
+  int64_t out_ch = std::stoi(argv[9]);
+  int stride_d = std::stoi(argv[10]);
+  int stride_h = std::stoi(argv[11]);
+  int stride_w = std::stoi(argv[12]);
+  int pad_d = std::stoi(argv[13]);
+  int pad_h = std::stoi(argv[14]);
+  int pad_w = std::stoi(argv[15]);
+  int dilation_d = std::stoi(argv[16]);
+  int dilation_h = std::stoi(argv[17]);
+  int dilation_w = std::stoi(argv[18]);
+  {{shape_func}}
+  using ElementOutput = typename {{name}}::ElementC;
+  using ElementInputA = typename {{name}}::ElementA;
+  using ElementInputB = typename {{name}}::ElementB;
+
+  uint8_t* global_workspace = nullptr;
+  cudaStream_t stream = nullptr;
+
+  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, DI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KD, KH, KW, CI});
+  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, DO, HO, WO, CO});
+
+  //
+  // warmup
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KD,
+       &KH,
+       &KW,
+       &DI,
+       &HI,
+       &WI,
+       &NO,
+       &DO,
+       &HO,
+       &WO,
+       stride_d,
+       stride_h,
+       stride_w,
+       dilation_d,
+       dilation_h,
+       dilation_w,
+       pad_d,
+       pad_h,
+       pad_w,
+       stream);
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0]);
+  for (int i = 0; i < 5; ++i) {
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KD,
+       &KH,
+       &KW,
+       &DI,
+       &HI,
+       &WI,
+       &NO,
+       &DO,
+       &HO,
+       &WO,
+       stride_d,
+       stride_h,
+       stride_w,
+       dilation_d,
+       dilation_h,
+       dilation_w,
+       pad_d,
+       pad_h,
+       pad_w,
+       stream);
+  }
+  cudaEventRecord(events[1]);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "TIME:" << runtime_ms << std::endl;
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+}
+
+"""
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+  uint8_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    global_workspace_,
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_kernel_d}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{p_in_d}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_d}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    {{stride_d}},
+{{indent}}    {{stride_h}},
+{{indent}}    {{stride_w}},
+{{indent}}    {{dilation_d}},
+{{indent}}    {{dilation_h}},
+{{indent}}    {{dilation_w}},
+{{indent}}    {{pad_d}},
+{{indent}}    {{pad_h}},
+{{indent}}    {{pad_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+@registry.reg("cuda.conv3d.config")
+def conv3d_config(func_attrs, dtype="float16"):
+    """Populates conv3d cutlass configs into 'op_instance' field."""
+    func_attrs["op_instance"] = common.extract_config(func_attrs)
+
+
+@registry.reg("cuda.conv3d.gen_profiler")
+def gen_profiler(func_attrs, workdir, shape_template):
+    """Codegen for conv3d profiler."""
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+    # shape func
+    shape_func = shape_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        div="/",
+        x_dim0="batch",
+        x_dim1="in_d",
+        x_dim2="in_h",
+        x_dim3="in_w",
+        x_dim4="in_ch",
+        w_dim0="out_ch",
+        w_dim1="kernel_d",
+        w_dim2="kernel_h",
+        w_dim3="kernel_w",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilate_d="dilation_d",
+        dilate_h="dilation_h",
+        dilate_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+    )
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+    file_pairs = []
+    for op_name, op in op_instance.items():
+        config = common.emit_instance(op)
+        config_name = common.extract_config_name(config)
+        name = "DeviceConvFwdInstance"
+        instance = INSTANCE_TEMPLATE.render(
+            config_name=config_name, name=name, config=config
+        )
+        exec_program = EXEC_TEMPLATE.render(
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
+        )
+        op_func = SRC_TEMPLATE.render(
+            instances=instance,
+            function_name="conv",
+            shape_func="",
+            exec_paths=exec_program,
+        )
+        code = PROFILER_TEMPLATE.render(
+            op_func=op_func, shape_func=shape_func, name=name
+        )
+        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+    # build
+    return common.build_profiler(file_pairs)
+
+
+@registry.reg("cuda.conv3d.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    """Codegen for conv3d function."""
+    return common.gen_function(
+        func_attrs,
+        INSTANCE_TEMPLATE,
+        EXEC_TEMPLATE,
+        SRC_TEMPLATE,
+        exec_cond_remplate,
+        shape_eval_template,
+        shape_save_template,
+    )
+
+
+@registry.reg("cuda.conv3d.func_decl")
+def conv3d_gen_function_decl(func_attrs):
+    """Codegen for conv3d function declaration."""
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("cuda.conv3d.func_call")
+def conv3d_gen_function_call(func_attrs, indent="  "):
+    """Codegen for conv3d function call."""
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[4]._attrs["name"],
+        p_kernel_d="&" + wshape[1]._attrs["name"],
+        p_kernel_h="&" + wshape[2]._attrs["name"],
+        p_kernel_w="&" + wshape[3]._attrs["name"],
+        p_in_d="&" + xshape[1]._attrs["name"],
+        p_in_h="&" + xshape[2]._attrs["name"],
+        p_in_w="&" + xshape[3]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_d="&" + yshape[1]._attrs["name"],
+        p_out_h="&" + yshape[2]._attrs["name"],
+        p_out_w="&" + yshape[3]._attrs["name"],
+        stride_d=func_attrs["stride"][0],
+        stride_h=func_attrs["stride"][1],
+        stride_w=func_attrs["stride"][2],
+        dilation_d=func_attrs["dilate"][0],
+        dilation_h=func_attrs["dilate"][1],
+        dilation_w=func_attrs["dilate"][2],
+        pad_d=func_attrs["pad"][0],
+        pad_h=func_attrs["pad"][1],
+        pad_w=func_attrs["pad"][2],
+        indent=indent,
+    )
+
+
+@registry.reg("cuda.conv3d.filter")
+def conv3d_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, x_shape)
diff --git a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
new file mode 100644
index 000000000..92158b6ae
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
@@ -0,0 +1,331 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for depthwise_conv3d.
+"""
+import jinja2
+
+from ... import registry
+from . import common
+
+# pylint: disable=C0103,C0415,W0613,C0301,W0612
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/util/host_tensor.h"
+
+#include <algorithm>
+#include <limits>
+#include <assert.h>
+
+namespace {
+#define CUDA_KERNEL_LOOP(i, n)                                                                          \\
+    int64_t _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x;                                         \\
+    for (int64_t i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
+
+template <typename scalar_t, typename accscalar_t, typename Telement, int element_in_Tio, int kernel_k, int dil_d>
+__global__ void conv_depthwise3d_cuda_kernel(
+    const scalar_t * input,
+    const half* kernel,
+    scalar_t * output,
+    int _kT, int _kH, int _kW,
+    int strideT, int strideH, int strideW,
+    int paddingT, int paddingH, int paddingW,
+    int _dilationT, int _dilationH, int _dilationW,
+    int iC, int iT, int iH, int iW,
+    int oT, int oH, int oW,
+    int num_outputs)
+{
+  int kT = kernel_k > 0? kernel_k: _kT;
+  int kH = kernel_k > 0? kernel_k: _kH;
+  int kW = kernel_k > 0? kernel_k: _kW;
+
+  int dilationT = dil_d > 0? dil_d: _dilationT;
+  int dilationH = dil_d > 0? dil_d: _dilationH;
+  int dilationW = dil_d > 0? dil_d: _dilationW;
+
+  const int oC = iC;
+  const int channel_multiplier = 1;
+
+  CUDA_KERNEL_LOOP(index, num_outputs) {
+    const int out_channel = index  % oC;
+    const int out_col = (index / oC) % oW;
+    const int out_row = (index / oC / oW) % oH;
+    const int out_frame = (index / oC / oW / oH) % oT;
+    const int batch = index / oC / oW / oH / oT;
+
+    const int in_channel = out_channel / channel_multiplier;
+
+    const int in_col_start = out_col * strideW - paddingW;
+    const int in_row_start = out_row * strideH - paddingH;
+    const int in_frame_start = out_frame * strideT - paddingT;
+
+    const int in_offset = in_channel + iC * (in_col_start + iW * (in_row_start + iH * (in_frame_start + iT* batch)));
+    const int out_offset = out_channel + oC * (out_col + oW * (out_row + oH * (out_frame + oT* batch)));
+
+    accscalar_t sum[8];
+    for (int tk = 0; tk < element_in_Tio; tk++){
+        sum[tk] = 0;
+    }
+    const half *kernel_ptr = kernel + out_channel * element_in_Tio * kT * kH * kW;
+    const scalar_t *input_ptr = input + in_offset;
+    for (int k_frame = 0; k_frame < kT; ++k_frame) {
+      const int in_frame = in_frame_start + k_frame * dilationT;
+      for (int k_row = 0; k_row < kH; ++k_row) {
+        const int in_row = in_row_start + k_row * dilationH;
+        for (int k_col = 0; k_col < kW; ++k_col) {
+          const int in_col = in_col_start + k_col * dilationW;
+          if (in_frame >= 0 && in_row >= 0 && in_col >= 0 &&
+              in_frame < iT && in_row < iH && in_col < iW) {
+            scalar_t input_val = __ldg(input_ptr);
+            Telement* pack_input = reinterpret_cast<Telement*>(&input_val);
+
+            for (int tk = 0; tk < element_in_Tio; tk++){
+                accscalar_t op1 = __half2float(pack_input[tk]);
+                sum[tk] += op1 * __half2float(kernel_ptr[tk*kT*kH*kW]);
+            }
+          }
+          kernel_ptr += 1;
+          input_ptr += dilationW * iC;
+        }
+        input_ptr += iC * (iW * dilationH - kW * dilationW);
+      }
+      input_ptr += iC * iW * (iH * dilationT - kH * dilationH);
+    }
+
+    scalar_t output_val;
+    Telement* pack_output = reinterpret_cast<Telement*>(&output_val);
+    for (int tk = 0; tk < element_in_Tio; tk++){
+        pack_output[tk] = __float2half(sum[tk]);
+    }
+    output[out_offset] = output_val;
+  }
+}
+
+#define NODEF_OR_EQUAL(x, y) ((y) < 0 || (x) == (y))
+#define NODEF_OR_EQUAL_3(x, y1, y2, y3) \\
+  (NODEF_OR_EQUAL(x, y1) && \\
+   NODEF_OR_EQUAL(x, y2) && \\
+   NODEF_OR_EQUAL(x, y3))
+
+
+#define DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(kernel_k, dil_d)                       \\
+  if (NODEF_OR_EQUAL_3(kernel_k, (kernel_t), (kernel_h), (kernel_w)) &&                 \\
+      NODEF_OR_EQUAL_3(dil_d, (dilation_t), (dilation_h), (dilation_w))) {              \\
+    conv_depthwise3d_cuda_kernel                                        \\
+    <scalar_t, accscalar_t, Telement, element_in_Tio, kernel_k, dil_d>  \\
+    <<<grid, block, (smem), stream>>>(                                  \\
+      (const scalar_t *)input,                                          \\
+      weight,                                                           \\
+      (scalar_t *)output,                                               \\
+      kernel_t, kernel_h, kernel_w,                                     \\
+      stride_t, stride_h, stride_w,                                     \\
+      padding_t, padding_h, padding_w,                                  \\
+      dilation_t, dilation_h, dilation_w,                               \\
+      c, t, h, w,                                                       \\
+      to, ho, wo,                                                       \\
+      num_outputs);                                                     \\
+  } else                                                                \\
+
+#define DWCONV3D_FORWARD_DISPATCH_OTHERS                                \\
+  {                                                                     \\
+    conv_depthwise3d_cuda_kernel                                        \\
+    <scalar_t, accscalar_t, Telement, element_in_Tio, -1, -1>           \\
+    <<<grid, block, (smem), stream>>>(                                  \\
+      (const scalar_t *)input,                                          \\
+      weight,                                                           \\
+      (scalar_t *)output,                                               \\
+      kernel_t, kernel_h, kernel_w,                                     \\
+      stride_t, stride_h, stride_w,                                     \\
+      padding_t, padding_h, padding_w,                                  \\
+      dilation_t, dilation_h, dilation_w,                               \\
+      c, t, h, w,                                                       \\
+      to, ho, wo,                                                       \\
+      num_outputs);}                                                    \\
+
+
+void conv_depthwise3d_launcher(
+    const half * input,
+    const half * weight,
+    half * output,
+    int kernel_t,
+    int kernel_h,
+    int kernel_w,
+    int stride_t,
+    int stride_h,
+    int stride_w,
+    int padding_t,
+    int padding_h,
+    int padding_w,
+    int dilation_t,
+    int dilation_h,
+    int dilation_w,
+    int n,
+    int c,
+    int t,
+    int h,
+    int w,
+    int to,
+    int ho,
+    int wo,
+    cudaStream_t stream
+    ) {
+
+  assert(to > 0);
+  assert(ho > 0);
+  assert(wo > 0);
+
+  int64_t num_outputs = n * to * ho * wo * c;
+  int64_t block = 256;
+  int64_t grid = std::min((num_outputs - 1) / block + 1, (int64_t)65536);
+
+  int64_t num_inputs = n * t * h * w * c;
+  int64_t num_weights = c * kernel_t * kernel_h * kernel_w;
+  int64_t smem = 0;
+
+  // Range check to avoid overflow in CUDA kernels.
+  assert((num_inputs <= std::numeric_limits<int32_t>::max()) &&
+              "Input tensor is too large.");
+  assert((num_outputs <= std::numeric_limits<int32_t>::max()) &&
+              "Output tensor is too large.");
+  assert((num_weights <= 1024*8) &&
+              "Weight tensor is too large.");
+
+  assert((padding_t * 2 + t <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+  assert((padding_h * 2 + h <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+  assert((padding_w * 2 + w <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+
+
+  using accscalar_t = float;
+  using Telement = half;
+  {% if csize == 0 %}
+    using scalar_t = float4;
+    c = c/8;
+    num_outputs = num_outputs/8;
+    #define element_in_Tio 8
+  {% elif csize == 2 %}
+    using scalar_t = half2;
+    c =c/2;
+    num_outputs = num_outputs/2;
+    #define element_in_Tio 2
+  {% else %}
+    using scalar_t = half;
+    #define element_in_Tio 1
+  {% endif %}
+
+  DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(3, 1)
+  DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(-1, 1)
+  DWCONV3D_FORWARD_DISPATCH_OTHERS
+}
+
+#undef DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION
+#undef DWCONV3D_FORWARD_DISPATCH_OTHERS
+#undef CUDA_KERNEL_LOOP
+} // namespace
+
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    int64_t* p_kt,
+    int64_t* p_kh,
+    int64_t* p_kw,
+    int stride_t,
+    int stride_h,
+    int stride_w,
+    int padding_t,
+    int padding_h,
+    int padding_w,
+    int dilation_t,
+    int dilation_h,
+    int dilation_w,
+    int64_t* p_batch,
+    int64_t* p_in_ch,
+    int64_t* p_in_t,
+    int64_t* p_in_h,
+    int64_t* p_in_w,
+    int64_t* p_out_ch,
+    int64_t* p_out_t,
+    int64_t* p_out_h,
+    int64_t* p_out_w,
+    cudaStream_t stream
+) {
+  int kt = *p_kt;
+  int kh = *p_kh;
+  int kw = *p_kw;
+  int batch = *p_batch;
+  int in_ch = *p_in_ch;
+  int in_t = *p_in_t;
+  int in_h = *p_in_h;
+  int in_w = *p_in_w;
+  int out_ch = *p_out_ch;
+  int out_t = *p_out_t;
+  int out_h = *p_out_h;
+  int out_w = *p_out_w;
+
+  conv_depthwise3d_launcher(
+    (const half*)in_ptr,
+    (const half*)weight_ptr,
+    (half*)out_ptr,
+    kt,
+    kh,
+    kw,
+    stride_t,
+    stride_h,
+    stride_w,
+    padding_t,
+    padding_h,
+    padding_w,
+    dilation_t,
+    dilation_h,
+    dilation_w,
+    batch,
+    in_ch,
+    in_t,
+    in_h,
+    in_w,
+    out_t,
+    out_h,
+    out_w,
+    stream
+  );
+
+  return;
+}
+"""
+)
+
+
+@registry.reg("cuda.depthwise_conv3d.gen_function")
+def gen_function(func_attrs):
+    func_name = func_attrs["name"]
+    csize = func_attrs["group"] % 8
+    return SRC_TEMPLATE.render(function_name=func_name, csize=csize)
+
+
+@registry.reg("cuda.depthwise_conv3d.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return common.gen_function_decl(func_name)
+
+
+@registry.reg("cuda.depthwise_conv3d.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    return common.gen_function_call(func_attrs, indent)
diff --git a/python/aitemplate/backend/cuda/elementwise/__init__.py b/python/aitemplate/backend/cuda/elementwise/__init__.py
index 0bf6e473f..18bff2803 100644
--- a/python/aitemplate/backend/cuda/elementwise/__init__.py
+++ b/python/aitemplate/backend/cuda/elementwise/__init__.py
@@ -15,6 +15,6 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import fused_elementwise
+from . import fused_elementwise, int_elementwise
 
-__all__ = ["fused_elementwise"]
+__all__ = ["fused_elementwise", "int_elementwise"]
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 2adddd531..07d1650f5 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -296,4 +296,162 @@ __device__ half2 hmin2_nan(const half2 a, const half2 b) {
 #endif
 }
 
+// pow impl
+__device__ half hpow(const half a, const half b);
+
+__device__ half2 h2pow(const half2 a, const half2 b) {
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  if (b1 != b2) {
+    half a1 = __low2half(a);
+    half a2 = __high2half(a);
+    half c1 = hpow(a1, b1);
+    half c2 = hpow(a2, b2);
+    return __halves2half2(c1, c2);
+  }
+
+  // New special cases can be added if needed, such as
+  // an powi for cases where b is an integer
+  if (__hbeq2(b, half2(0.0, 0.0))) {
+    return half2(1.0, 1.0);
+  }
+  if (__hbeq2(b, half2(1.0, 1.0))) {
+    return a;
+  }
+  if (__hbeq2(b, half2(2.0, 2.0))) {
+    return __hmul2(a, a);
+  }
+  if (__hbeq2(b, half2(3.0, 3.0))) {
+    return __hmul2(__hmul2(a, a), a);
+  }
+  if (__hbeq2(b, half2(0.5, 0.5))) {
+    return h2sqrt(a);
+  }
+  if (__hbeq2(b, half2(-0.5, -0.5))) {
+    return h2rsqrt(a);
+  }
+  if (__hbeq2(b, half2(-1.0, -1.0))) {
+    return __h2div(half2(1.0, 1.0), a);
+  }
+  if (__hbeq2(b, half2(-2.0, -2.0))) {
+    return __h2div(half2(1.0, 1.0), __hmul2(a, a));
+  }
+
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+
+  // low 16 bits
+  half c1 =
+      static_cast<half>(pow(static_cast<double>(a1), static_cast<double>(b1)));
+  // high 16 bits
+  half c2 =
+      static_cast<half>(pow(static_cast<double>(a2), static_cast<double>(b2)));
+  return __halves2half2(c1, c2);
+}
+
+__device__ half hpow(const half a, const half b) {
+  if (b == half(0.0)) {
+    return half(1.0);
+  }
+  if (b == half(1.0)) {
+    return a;
+  }
+  if (b == half(2.0)) {
+    return a * a;
+  }
+  if (b == half(3.0)) {
+    return a * a * a;
+  }
+  if (b == half(0.5)) {
+    return hsqrt(a);
+  }
+  if (b == half(-0.5)) {
+    return hrsqrt(a);
+  }
+  if (b == half(-1.0)) {
+    return half(1.0) / a;
+  }
+  if (b == half(-2.0)) {
+    return half(1.0) / (a * a);
+  }
+  return static_cast<half>(pow(static_cast<double>(a), static_cast<double>(b)));
+}
+
+__device__ float fpow(const float a, const float b) {
+  if (b == float(0.0)) {
+    return float(1.0);
+  }
+  if (b == float(1.0)) {
+    return a;
+  }
+  if (b == float(2.0)) {
+    return a * a;
+  }
+  if (b == float(3.0)) {
+    return a * a * a;
+  }
+  if (b == float(0.5)) {
+    return sqrt(a);
+  }
+  if (b == float(-0.5)) {
+    return rsqrt(a);
+  }
+  if (b == float(-1.0)) {
+    return float(1.0) / a;
+  }
+  if (b == float(-2.0)) {
+    return float(1.0) / (a * a);
+  }
+  return static_cast<float>(
+      pow(static_cast<double>(a), static_cast<double>(b)));
+}
+
+//
+// GELU function definitions implemented as described by
+//   Hendrycks, D., and Gimpel, K. in
+//   "Gaussian Error Linear Units (GELUs)." (2020)
+//   https://arxiv.org/pdf/1606.08415.pdf
+//
+// Floating-point constants are Taylor coefficients described in the paper.
+//
+__device__ half hgelu(const half a) {
+  cutlass::epilogue::thread::GELU<cutlass::half_t> gelu_op;
+  return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
+}
+
+__device__ float fgelu(const float a) {
+  cutlass::epilogue::thread::GELU<float> gelu_op;
+  return gelu_op(a);
+}
+
+__device__ half h_fast_gelu(const half a) {
+  cutlass::epilogue::thread::GELU_taylor<cutlass::half_t> gelu_op;
+  return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
+}
+
+__device__ float f_fast_gelu(const float a) {
+  cutlass::epilogue::thread::GELU_taylor<float> gelu_op;
+  return gelu_op(a);
+}
+
+__device__ float fsoftplus(
+    const float a,
+    const float beta,
+    const float threshold) {
+  return (a * beta > threshold) ? a : log1pf(expf(a * beta)) / beta;
+}
+
+__device__ half hsoftplus(const half a, const half beta, const half threshold) {
+  half one_val = one();
+  return __hgt(__hmul(a, beta), threshold)
+      ? a
+      : __hdiv(hlog(__hadd(one_val, hexp(__hmul(a, beta)))), beta);
+}
+
+__device__ half2
+h2softplus(const half2 a, const half2 beta, const half2 threshold) {
+  return half2(
+      hsoftplus(a.x, beta.x, threshold.x), hsoftplus(a.y, beta.y, threshold.y));
+}
+
 #endif
diff --git a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
index f25013aec..667310726 100644
--- a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
@@ -29,6 +29,7 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "cutlass/constants.h"
+#include "cutlass/epilogue/thread/activation.h"
 """
 
 
diff --git a/python/aitemplate/backend/cuda/elementwise/int_elementwise.py b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
new file mode 100644
index 000000000..ad9be8b98
--- /dev/null
+++ b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
@@ -0,0 +1,67 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+IntElementwise codegen for CUDA.
+"""
+
+import jinja2
+
+from ....compiler.base import IntVarTensor
+from ... import registry
+
+from ...backend_spec import CPUBackendSpec
+
+
+INT_VAR_FUNC_TEMPLATE = jinja2.Template(
+    """
+      {{lhs}} = {{rhs}};
+"""
+)
+
+
+@registry.reg("cuda.int_elementwise.gen_function")
+def dummpy_int_elementwise_gen_function(func_attrs):
+    return ""
+
+
+@registry.reg("cuda.int_elementwise.func_decl")
+def dummpy_int_elementwise_gen_function_decl(func_attrs):
+    return ""
+
+
+@registry.reg("cuda.int_elementwise.func_call")
+def int_elementwise_gen_function_call(func_attrs, indent):
+    """Generates int_elementwise function call."""
+    func_enum = func_attrs["func"]
+    inputs = func_attrs["inputs"]
+    outputs = func_attrs["outputs"]
+    assert (
+        len(outputs) == 1
+    ), f"Elementwise op for IntVarTensor should only generate 1 output, got {len(outputs)}"
+    input_params_vec = []
+    for inp in inputs:
+        assert isinstance(
+            inp, IntVarTensor
+        ), f"only inputs of IntVarTensor are allowed for OP with output of IntVarTensor, got type{inp}"
+        input_params_vec.append(inp._attrs["int_var"]._attrs["name"])
+    backend_spec = CPUBackendSpec()
+    op = backend_spec.func_enum_to_func_name.get(func_enum)
+    rhs = op.join(input_params_vec)
+    lhs = outputs[0]._attrs["name"]
+    func_call = INT_VAR_FUNC_TEMPLATE.render(
+        lhs=lhs,
+        rhs=rhs,
+    )
+    return func_call
diff --git a/python/aitemplate/backend/cuda/embedding/bert_embeddings.py b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
index 19b7ec384..e62826889 100644
--- a/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
@@ -21,6 +21,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
@@ -52,6 +53,7 @@
   warpReduceSum<T>(val);
 
   if (lane == 0) {
+#pragma unroll
     shared[wid] = val[0];
   }
 
@@ -59,6 +61,7 @@
 
   // blockDim.x is round up to multiples of 32
   bool is_mask = threadIdx.x < (blockDim.x / 32);
+#pragma unroll
   val[0] = is_mask ? shared[lane] : (T)(0.0f);
 
   warpReduceSum<T>(val);
@@ -79,7 +82,7 @@
   return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f;
 }
 
-template <typename INDEX_T>
+template <typename ElemT, typename INDEX_T>
 __global__ void bert_embeddings_kernel(
     uint4* output,
     INDEX_T* input_ids,
@@ -95,9 +98,10 @@
     const int64_t type_vocab_size,
     const int64_t max_position_embeddings,
     const float eps) {
+  constexpr int num_elems_in_uint4 = sizeof(uint4) / sizeof(ElemT);
   const int tid = threadIdx.x;
   const int bid = blockIdx.x;
-  const int embedding_dim_div_8 = embedding_dim / 8;
+  const int embedding_dim_div_n = embedding_dim / num_elems_in_uint4;
 
   const int64_t input_id = input_ids[bid];
   const int64_t token_type_id = token_type_ids[bid];
@@ -110,37 +114,37 @@
     return;
   }
 
-  word_embeddings = word_embeddings + input_id * embedding_dim_div_8;
+  word_embeddings = word_embeddings + input_id * embedding_dim_div_n;
   token_type_embeddings =
-      token_type_embeddings + token_type_id * embedding_dim_div_8;
-  position_embeddings = position_embeddings + position_id * embedding_dim_div_8;
+      token_type_embeddings + token_type_id * embedding_dim_div_n;
+  position_embeddings = position_embeddings + position_id * embedding_dim_div_n;
 
   uint4 word_embedding{0, 0, 0, 0};
   uint4 token_type_embedding{0, 0, 0, 0};
   uint4 position_embedding{0, 0, 0, 0};
 
-  if (tid < embedding_dim_div_8) {
+  if (tid < embedding_dim_div_n) {
     word_embedding = word_embeddings[tid];
     token_type_embedding = token_type_embeddings[tid];
     position_embedding = position_embeddings[tid];
   }
   uint4 embedding{0, 0, 0, 0};
 
-  half* word_emb_vec = reinterpret_cast<half*>(&word_embedding);
-  half* token_emb_vec = reinterpret_cast<half*>(&token_type_embedding);
-  half* pos_emb_vec = reinterpret_cast<half*>(&position_embedding);
+  ElemT* word_emb_vec = reinterpret_cast<ElemT*>(&word_embedding);
+  ElemT* token_emb_vec = reinterpret_cast<ElemT*>(&token_type_embedding);
+  ElemT* pos_emb_vec = reinterpret_cast<ElemT*>(&position_embedding);
 
-  half* emb_vec = reinterpret_cast<half*>(&embedding);
+  ElemT* emb_vec = reinterpret_cast<ElemT*>(&embedding);
 
   // layernorm
   __shared__ float s_mean, s_variance;
   float local_sums[1] = {0.0f};
 
 #pragma unroll
-  for (int i = 0; i < 8; i++) {
+  for (int i = 0; i < num_elems_in_uint4; i++) {
     float sum = word_emb_vec[i] + token_emb_vec[i] + pos_emb_vec[i];
     local_sums[0] += sum;
-    emb_vec[i] = (half)sum;
+    emb_vec[i] = static_cast<ElemT>(sum);
   }
 
   if (blockDim.x <= 32) {
@@ -155,9 +159,9 @@
 
   local_sums[0] = 0.0f;
 
-  if (tid < embedding_dim_div_8) {
+  if (tid < embedding_dim_div_n) {
 #pragma unroll
-    for (int i = 0; i < 8; i++) {
+    for (int i = 0; i < num_elems_in_uint4; i++) {
       float val = emb_vec[i];
       local_sums[0] += (val - s_mean) * (val - s_mean);
     }
@@ -173,13 +177,13 @@
   }
   __syncthreads();
 
-  if (tid < embedding_dim_div_8) {
+  if (tid < embedding_dim_div_n) {
     uint4 local_gamma = gamma[tid];
-    half* gamma_vec = reinterpret_cast<half*>(&local_gamma);
+    ElemT* gamma_vec = reinterpret_cast<ElemT*>(&local_gamma);
     uint4 local_beta = beta[tid];
-    half* beta_vec = reinterpret_cast<half*>(&local_beta);
+    ElemT* beta_vec = reinterpret_cast<ElemT*>(&local_beta);
 #pragma unroll
-    for (int i = 0; i < 8; i++) {
+    for (int i = 0; i < num_elems_in_uint4; i++) {
       emb_vec[i] = normalize(
           (float)emb_vec[i],
           s_mean,
@@ -190,23 +194,23 @@
   }
 
   // write to output
-  if (tid < embedding_dim_div_8) {
-    output = output + bid * embedding_dim_div_8;
+  if (tid < embedding_dim_div_n) {
+    output = output + bid * embedding_dim_div_n;
     output[tid] = embedding;
   }
 }
 
-template <typename INDEX_T>
+template <typename ElemT, typename INDEX_T>
 void bert_embeddings_launcher(
-    half* output,
+    ElemT* output,
     INDEX_T* input_ids,
     INDEX_T* token_type_ids,
     INDEX_T* position_ids,
-    half* word_embeddings,
-    half* token_type_embeddings,
-    half* position_embeddings,
-    half* gamma,
-    half* beta,
+    ElemT* word_embeddings,
+    ElemT* token_type_embeddings,
+    ElemT* position_embeddings,
+    ElemT* gamma,
+    ElemT* beta,
     const int64_t indices_num,
     const int64_t embedding_dim,
     const int64_t vocab_size,
@@ -214,17 +218,21 @@
     const int64_t max_position_embeddings,
     const float eps,
     cudaStream_t stream) {
-  if (embedding_dim % 8 != 0) {
-    throw std::runtime_error("embedding dim must be multiple of 8");
+  constexpr int num_elems_in_uint4 = sizeof(uint4) / sizeof(ElemT);
+  if (embedding_dim % num_elems_in_uint4 != 0) {
+    throw std::runtime_error(
+        "embedding dim must be multiple of num_elems_in_uint4: " +
+        std::to_string(num_elems_in_uint4)
+    );
   }
   dim3 grid(indices_num);
 
   // round up to multiple of 32
-  int64_t num_threads = embedding_dim / 8;
+  int64_t num_threads = embedding_dim / num_elems_in_uint4;
   num_threads = (num_threads + 31) / 32 * 32;
   dim3 block(num_threads);
 
-  bert_embeddings_kernel<INDEX_T><<<grid, block, 0, stream>>>(
+  bert_embeddings_kernel<{{elem_input_type}}, INDEX_T><<<grid, block, 0, stream>>>(
       reinterpret_cast<uint4*>(output),
       input_ids,
       token_type_ids,
@@ -245,16 +253,16 @@
 
 {{func_signature}}
 {
-    bert_embeddings_launcher<{{index_type}}>(
-      output,
+    bert_embeddings_launcher<{{elem_input_type}}, {{index_type}}>(
+      static_cast<{{elem_input_type}}*>(output),
       input_ids,
       token_type_ids,
       position_ids,
-      word_embeddings,
-      token_type_embeddings,
-      position_embeddings,
-      gamma,
-      beta,
+      static_cast<{{elem_input_type}}*>(word_embeddings),
+      static_cast<{{elem_input_type}}*>(token_type_embeddings),
+      static_cast<{{elem_input_type}}*>(position_embeddings),
+      static_cast<{{elem_input_type}}*>(gamma),
+      static_cast<{{elem_input_type}}*>(beta),
       indices_num,
       embedding_dim,
       vocab_size,
@@ -270,15 +278,15 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output,
+void {{func_name}}(void* output,
                    {{index_type}}* input_ids,
                    {{index_type}}* token_type_ids,
                    {{index_type}}* position_ids,
-                   half* word_embeddings,
-                   half* token_type_embeddings,
-                   half* position_embeddings,
-                   half* gamma,
-                   half* beta,
+                   void* word_embeddings,
+                   void* token_type_embeddings,
+                   void* position_embeddings,
+                   void* gamma,
+                   void* beta,
                    const int64_t indices_num,
                    const int64_t embedding_dim,
                    const int64_t vocab_size,
@@ -342,9 +350,14 @@ def python_int_dtype_to_c_dtype(dtype):
 
 @registry.reg("cuda.bert_embeddings.gen_function")
 def bert_embeddings_gen_function(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][3]._attrs["dtype"]
+    )
     dtype = python_int_dtype_to_c_dtype(func_attrs["inputs"][0]._attrs["dtype"])
     return FUNC_TEMPLATE.render(
         index_type=dtype,
+        elem_input_type=elem_input_type,
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"],
             index_type=dtype,
@@ -363,10 +376,6 @@ def bert_embeddings_gen_function_decl(func_attrs: Dict[str, Any]) -> str:
     )
 
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
-
 FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
 FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int32_t*>({{name}})")
 
@@ -405,26 +414,18 @@ def bert_embeddings_gen_function_call(func_attrs: Dict[str, Any], indent="  ") -
     max_position_embeddings = position_embeddings._size(0).value()
 
     eps = func_attrs["eps"]
-    output_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_str = func_attrs["outputs"][0]._attrs["name"]
 
     input_ids_str = get_int_param_template(input_ids)
     token_type_ids_str = get_int_param_template(token_type_ids)
     position_ids_str = get_int_param_template(position_ids)
 
-    word_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=word_embeddings._attrs["name"]
-    )
-    token_type_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=token_type_embeddings._attrs["name"]
-    )
-    position_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=position_embeddings._attrs["name"]
-    )
+    word_embeddings_str = word_embeddings._attrs["name"]
+    token_type_embeddings_str = token_type_embeddings._attrs["name"]
+    position_embeddings_str = position_embeddings._attrs["name"]
 
-    gamma_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=gamma._attrs["name"])
-    beta_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=beta._attrs["name"])
+    gamma_str = gamma._attrs["name"]
+    beta_str = beta._attrs["name"]
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
index 604984059..3c3873c83 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
@@ -13,6 +13,18 @@
 #  limitations under the License.
 #
 
-from . import bmm_rcr_softmax, gemm_rcr_bias_softmax, gemm_rcr_softmax
+from . import (
+    bmm_rcr_softmax,
+    dual_gemm_rcr_fast_gelu,
+    dual_gemm_rcr_silu,
+    gemm_rcr_bias_softmax,
+    gemm_rcr_softmax,
+)
 
-__all__ = ["bmm_rcr_softmax", "gemm_rcr_bias_softmax", "gemm_rcr_softmax"]
+__all__ = [
+    "bmm_rcr_softmax",
+    "gemm_rcr_bias_softmax",
+    "gemm_rcr_softmax",
+    "dual_gemm_rcr_silu",
+    "dual_gemm_rcr_fast_gelu",
+]
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
index 4a63ff1fc..af5753b3a 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
@@ -67,7 +67,7 @@
 {{indent}}    {{d_ptr}},
 {{indent}}    {{n_ptr}},
 {{indent}}    {{soft_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{a_dim0_ptr}},
 {{indent}}    {{a_dim1_ptr}},
 {{indent}}    {{a_dim2_ptr}},
@@ -182,7 +182,7 @@ def gen_profiler(
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function_decl(func_attrs):
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 751a19a84..4a4b745a9 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -101,7 +101,7 @@ def bmm_rcr_softmax_config(func_attrs, dtype="float16"):
 @registry.reg("cuda.bmm_rcr_softmax.gen_profiler")
 def gen_profiler(func_attrs, workdir, dim_info_dict):
     """Generate code for profiling"""
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
         dim_info_dict,
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
new file mode 100644
index 000000000..fdcf0e741
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -0,0 +1,458 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Common codegen functions for dual gemm.
+D0 = epilogue0(X @ B0, C0)
+D1 = epilogue0(X @ B1, C1)
+D2 = element_wise(D0, D1)
+"""
+
+from functools import partial
+from hashlib import sha1
+from typing import Any, Dict
+
+import jinja2
+
+from ...backend_spec import CUDASpec
+from ...common import gemm_common
+from ...target import Target
+from ..gemm_universal import common
+
+# pylint: disable=C0301,C0415,R1705
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "device/dual_gemm.h"
+#include "thread/left_silu_and_mul.h"
+
+typename cutlass::TensorRef<cutlass::half_t, cutlass::layout::RowMajor> nullptr_ref{};
+decltype(nullptr_ref) ref_B0, ref_B1;
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using LayoutC = cutlass::layout::RowMajor;
+
+"""
+)
+
+# HACK: we don't record different permutation shape,
+# because it has little impact on execution time compared.
+# Therefore, no matter what permutation shape it is,
+# we will use the same kernel, i.e. the first generated perm_shape
+# At runtime, the kernel will be regenerated and thus the correctness will not be affected.
+KERNEL_KEY_TEMPLATE = jinja2.Template(
+    """
+cutlass_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}}
+"""
+)
+
+TENSOR_DECL_TEMPLATE = jinja2.Template(
+    """
+  int64_t a_ptr_sz = a_dim0 * a_dim1;
+  int64_t b_ptr_sz = b_dim0 * b_dim1;
+  int64_t c_ptr_sz = c_dim0 * c_dim1;
+
+  // The value 1 is used to force ptr_max_sz to be non-zero
+  int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
+  // TODO: special pool size for A100 L2 cache 40M
+  // need to tune it for other devices
+  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+
+  memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+
+{% if has_bias %}
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 3
+{% endif %}
+
+"""
+)
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+//  TODO: cast to right dtype
+//{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator;
+{{indent}}using ElementCompute = typename {{instance}}::DualGemmKernel::Epilogue0::OutputOp::ElementCompute;
+
+{{indent}}typename {{instance}}::Arguments arguments{
+
+{{problem_args}}
+
+{{indent}}};
+{% if is_profiler %}
+{{indent}}// https://youtu.be/-Rp7UPbhErE
+{{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} gemm_op;
+{% endif %}
+
+{{indent}} auto status = gemm_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = gemm_op.initialize(arguments, workspace, stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = gemm_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+
+{{indent}}return;
+"""
+)
+
+
+def kernel_name(op, func_attrs):
+    """Returns kernel_name given input cutlass op_instance and operator attrs."""
+
+    from cutlass_lib import library
+
+    threadblock = op.tile_description.procedural_name()
+    extended_name = op.extended_name()
+    opcode_class_name = library.OpcodeClassNames[
+        op.tile_description.math_instruction.opcode_class
+    ]
+    layout = op.layout_name()
+    align_ab = op.A.alignment
+    align_c = op.C.alignment
+
+    name = KERNEL_KEY_TEMPLATE.render(
+        threadblock=threadblock,
+        extended_name=extended_name,
+        opcode_class_name=opcode_class_name,
+        layout=layout,
+        align_ab=align_ab,
+        align_c=align_c,
+    )
+    return name.replace("\n", "")
+
+
+def extract_config(f_proc_op, func_attrs):
+    return common.extract_config(f_proc_op, partial(kernel_name, func_attrs=func_attrs))
+
+
+def dual_gemm_instance(
+    op_def: str, func_attrs: Dict[str, Any], for_profiler: bool
+) -> str:
+    tmp = op_def.replace(
+        "GemmIdentityThreadblockSwizzle<8>", "GemmIdentityThreadblockSwizzle<1>"
+    )
+    return tmp
+
+
+def emit_instance(
+    op,
+    for_profiler,
+    f_instance_convertor=dual_gemm_instance,
+    emit_kernel=False,
+    func_attrs=None,
+):
+    import cutlass_lib
+
+    emiter = cutlass_lib.gemm_operation.EmitDualGemmInstance()
+    op_def = emiter.emit(op)
+    op_def = f_instance_convertor(op_def, func_attrs, for_profiler)
+    return op_def
+
+
+def default_fproc_f16(
+    *,
+    op,
+    a_layout,
+    b_layout,
+    c_layout,
+    epiligue_name,
+    epiligue2_name,
+    permute_layout=None,
+):
+    import copy
+
+    import cutlass_lib
+
+    ret = []
+    data_type = cutlass_lib.library.DataType.f16
+    acc_type = cutlass_lib.library.DataType.f32
+    # check target use fp16 acc
+    if "use_fp16_acc" in Target.current()._kwargs:
+        if Target.current()._kwargs["use_fp16_acc"]:
+            acc_type = cutlass_lib.library.DataType.f16
+    if (
+        op.A.element == data_type
+        and op.B.element == data_type
+        and op.C.element == data_type
+        and op.accumulator_type() == acc_type
+        and op.A.layout == a_layout
+        and op.B.layout == b_layout
+    ):
+        op = copy.deepcopy(op)
+        # set output major
+        op.C.layout = c_layout
+        # set epilogue
+        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name]
+        op.epilogue_functor2 = cutlass_lib.library.EpilogueFunctorName[epiligue2_name]
+        op.element_epilogue = acc_type
+        if permute_layout is not None:
+            op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
+                permute_layout
+            ]
+        # set C alignment
+        for i in [8, 4, 2, 1]:
+            op = copy.deepcopy(op)
+            op.C.alignment = i
+            ret.append(op)
+    return ret
+
+
+def make_fproc_f16(func_attrs, layout):
+    """
+    This function sets a callback for processing the epilogue of the kernel
+    associated with func_attrs.
+    """
+
+    def fproc_f16(op):
+        a_layout, b_layout, c_layout = layout.cutlass_lib_layouts()
+        return default_fproc_f16(
+            op=op,
+            a_layout=a_layout,
+            b_layout=b_layout,
+            c_layout=c_layout,
+            epiligue_name=func_attrs["epilogue"],
+            epiligue2_name=func_attrs["epilogue2"],
+        )
+
+    func_attrs["op_instance"] = extract_config(fproc_f16, func_attrs)
+
+
+def gen_function(
+    func_attrs,
+    src_template,
+    exec_cond_template,
+    problem_args,
+    input_ndims,
+    weight_ndims,
+    output_ndims,
+    dim_info_dict,
+    f_instance_convertor=dual_gemm_instance,
+    emit_kernel=False,
+    support_split_k=False,
+    input_addr_calculator="",
+    output_addr_calculator="",
+    extra_code="",
+):
+    func_name = func_attrs["name"]
+    exec_path = func_attrs["exec_path"]
+    op_instance = func_attrs["op_instance"]
+    inst_def_flag = set()
+    instances = {}
+    instance_decl = ""
+    for exec_item in exec_path.values():
+        fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest()
+        algo = exec_item.algo
+        if algo not in inst_def_flag:
+            config = emit_instance(
+                op_instance[algo],
+                for_profiler=False,
+                f_instance_convertor=f_instance_convertor,
+                emit_kernel=emit_kernel,
+                func_attrs=func_attrs,
+            )
+            inst_def_flag.add(algo)
+        else:
+            config = ""
+        inst = common.INSTANCE_TEMPLATE.render(
+            config=config, name=fname, config_name=common.extract_config_name(config)
+        )
+        instances[exec_item.exec_cond] = inst
+        instance_decl += inst
+    shape_eval_func = gemm_common.gen_shape_eval_code(
+        indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
+    )
+    exec_paths = ""
+    for key, _ in instances.items():
+        fname = "f" + sha1(key.encode()).hexdigest()
+        program = EXEC_TEMPLATE.render(
+            indent="    ",
+            instance=fname,
+            problem_args=problem_args,
+            support_split_k=support_split_k,
+        )
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
+        exec_paths += exec_inst
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+    )
+    return src_template.render(
+        instances=instance_decl,
+        function_name=func_name,
+        dtype="cutlass::half_t",
+        shape_eval=shape_eval_func,
+        input_addr_calculator=input_addr_calculator,
+        output_addr_calculator=output_addr_calculator,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_paths,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        support_split_k=support_split_k,
+        has_d=common.has_d(func_attrs),
+        has_d1=common.has_d1(func_attrs),
+        extra_code=extra_code,
+    )
+
+
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+    src_template,
+    problem_args_template,
+    args_parser_template,
+    emit_kernel=False,
+    support_split_k=False,
+    output_addr_calculator="",
+    bias_ptr_arg=None,
+    extra_code="",
+):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+
+    ndims = 2
+    adims = ["&a_dim" + str(i) for i in range(ndims)]
+    bdims = ["&b_dim" + str(i) for i in range(ndims)]
+    cdims = ["&c_dim" + str(i) for i in range(ndims)]
+    shape_func = gemm_common.gen_shape_eval_code(
+        indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
+    )
+
+    has_bias = bias_ptr_arg is not None
+    instance_name_base = "GemmInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        support_split_k=support_split_k,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+    )
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+    )
+
+    function_name = "gemm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
+        config = emit_instance(
+            op, for_profiler=True, emit_kernel=emit_kernel, func_attrs=func_attrs
+        )
+        config_name = common.extract_config_name(config)
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
+        instance = common.INSTANCE_TEMPLATE.render(
+            config_name=config_name, name=instance_name, config=config
+        )
+        benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
+            indent="  ",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
+            has_bias=has_bias,
+            bias_ptr=bias_ptr_arg,
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            support_split_k=support_split_k,
+            split_k="split_k",
+            adims=adims,
+            bdims=bdims,
+            cdims=cdims,
+        )
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        output_addr_calculator=output_addr_calculator,
+        support_split_k=support_split_k,
+        extra_code=extra_code,
+    )
+    benchmark_adims = ["a_dim" + str(i) for i in range(ndims)]
+    benchmark_bdims = ["b_dim" + str(i) for i in range(ndims)]
+    benchmark_cdims = ["c_dim" + str(i) for i in range(ndims)]
+    func_call = common.FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=function_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr="bias_ptr",
+        c_ptr="c_ptr",
+        split_k="split_k",
+        adims=benchmark_adims,
+        bdims=benchmark_bdims,
+        cdims=benchmark_cdims,
+    )
+    # TODO: Render args_parse by caller.
+    args_parse = (
+        args_parser_template
+        if isinstance(args_parser_template, str)
+        else args_parser_template.render()
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        support_split_k=support_split_k,
+        args_parse=args_parse,
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        func_call=func_call,
+        tensor_decl=TENSOR_DECL_TEMPLATE.render(has_bias=has_bias),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
+    # build
+    return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index ff5e4b084..5f172d6ea 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -172,7 +172,7 @@
 {{indent}}    {{d_ptr}},
 {{indent}}    {{n_ptr}},
 {{indent}}    {{soft_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{split_k}},
 {% for dim in adims %}
 {{indent}}    {{dim}},
@@ -342,11 +342,11 @@
   for (auto & event : events) {
     cudaEventCreate(&event);
   }
-  cudaEventRecord(events[0]);
+  cudaEventRecord(events[0], stream);
   for (int i = 0; i < 5; ++i) {
     {{func_call}}
   }
-  cudaEventRecord(events[1]);
+  cudaEventRecord(events[1], stream);
   cudaEventSynchronize(events[1]);
   float runtime_ms = 0;
   cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
@@ -535,4 +535,4 @@ def gen_profiler(
         )
         common.add_profiler(file_pairs, workdir, op_type, op_name, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
new file mode 100644
index 000000000..5c626cbb8
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -0,0 +1,348 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for
+C = FAST_GELU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+where A[RowMajor][M, K], B[ColMajor][N, K], B1[RowMajor][N, K]
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+from ..gemm_universal import common, common_bias
+from ..gemm_universal.layout import RCR
+from . import common_dual_gemm
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+# used for real execution
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmCoord{M, N, K},
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
+    ref_B0,
+    nullptr_ref, // D0
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
+    ref_B1,
+    nullptr_ref, // D1
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
+    {ElementCompute(1), ElementCompute(0)},
+    {ElementCompute(1), ElementCompute(0)},
+    {},
+    1 // kSplitKSerial
+"""
+)
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+    int64_t M = std::atoi(argv[1]);
+    int64_t N = std::atoi(argv[2]);
+    int64_t K = std::atoi(argv[3]);
+    int64_t split_k = std::atoi(argv[4]);
+
+    int64_t a_dim0 = M;
+    int64_t a_dim1 = K;
+    int64_t b_dim0 = N;
+    int64_t b_dim1 = K;
+    int64_t c_dim0 = M;
+    int64_t c_dim1 = N;
+"""
+)
+
+# for profiler, no need to include TensorAccessor
+PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmCoord{M, N, K},
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
+    ref_B0,
+    nullptr_ref, // D0
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
+    ref_B1,
+    nullptr_ref, // D1
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
+    {ElementCompute(1), ElementCompute(0)},
+    {ElementCompute(1), ElementCompute(0)},
+    {},
+    1 // kSplitKSerial
+"""
+)
+
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/thread/linear_combination_params.h"
+#include "device/dual_gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+///
+/// D = alpha * accumulator + beta * source + uniform
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation.
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LeftFastGeluAndMul {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  struct Params{};
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LeftFastGeluAndMul(Params const &/*params*/) {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    assert(false);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &lhs,
+    FragmentAccumulator const &rhs) const {
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_to_compute;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> compute_to_output;
+
+    ComputeFragment converted_lhs = accumulator_to_compute(lhs);
+    ComputeFragment converted_rhs = accumulator_to_compute(rhs);
+
+    cutlass::epilogue::thread::GELU_taylor<ComputeFragment> gelu;
+    cutlass::multiplies<ComputeFragment> mul;
+    auto gelu_lhs = gelu(converted_lhs);
+    return compute_to_output(mul(gelu_lhs, converted_rhs));
+  }
+
+  CUTLASS_HOST_DEVICE
+  ElementOutput operator()(
+      ElementAccumulator const& lhs,
+      ElementAccumulator const& rhs
+  ) const {
+      ElementCompute convert_lhs(lhs);
+      ElementCompute convert_rhs(rhs);
+      cutlass::epilogue::thread::GELU_taylor<ElementCompute> gelu;
+      cutlass::multiplies<ElementCompute> mul;
+      auto gelu_lhs = gelu(convert_lhs);
+      return ElementOutput(mul(gelu_lhs, convert_rhs));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+
+typename cutlass::TensorRef<cutlass::half_t, cutlass::layout::RowMajor> nullptr_ref{};
+decltype(nullptr_ref) ref_B0, ref_B1;
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using LayoutC = cutlass::layout::RowMajor;
+
+"""
+)
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.config")
+def gemm_rcr_config(func_attrs, dtype="float16"):
+    common_dual_gemm.make_fproc_f16(func_attrs, RCR)
+
+
+def common_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+    src_template,
+    problem_args_template,
+    bias_ptr_arg=None,
+    extra_code="",
+):
+    output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
+        stride_dim="*b_dim0"
+    )
+    return common_dual_gemm.gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        src_template,
+        problem_args_template,
+        ARGS_PARSER_TEMPLATE,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator=output_addr_calculator,
+        bias_ptr_arg=bias_ptr_arg,
+        extra_code=extra_code,
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.gen_profiler")
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        common_bias.SRC_TEMPLATE,
+        PROFILER_PROBLEM_ARGS_TEMPLATE,
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+    problem_args_template=None,
+):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    if problem_args_template is None:
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
+    else:
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    return common_dual_gemm.gen_function(
+        func_attrs,
+        common_bias.SRC_TEMPLATE,
+        exec_cond_template,
+        problem_args,
+        input_ndims,
+        weight_ndims,
+        output_ndims,
+        dim_info_dict,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
+        ),
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    return common_bias.FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        support_split_k=True,
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    bias = func_attrs["inputs"][2]
+    return common.gen_function_call(
+        func_attrs, indent, bias_ptr_arg=bias._attrs["name"]
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_fast_gelu.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
new file mode 100644
index 000000000..211259e9e
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -0,0 +1,220 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for
+C = SILU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+where A[RowMajor][M, K], B[ColMajor][N, K], B1[RowMajor][N, K]
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+from ..gemm_universal import common, common_bias
+from ..gemm_universal.layout import RCR
+from . import common_dual_gemm
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+# used for real execution
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmCoord{M, N, K},
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
+    ref_B0,
+    nullptr_ref, // D0
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
+    ref_B1,
+    nullptr_ref, // D1
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
+    {ElementCompute(1), ElementCompute(0)},
+    {ElementCompute(1), ElementCompute(0)},
+    {},
+    1 // kSplitKSerial
+"""
+)
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+    int64_t M = std::atoi(argv[1]);
+    int64_t N = std::atoi(argv[2]);
+    int64_t K = std::atoi(argv[3]);
+    int64_t split_k = std::atoi(argv[4]);
+
+    int64_t a_dim0 = M;
+    int64_t a_dim1 = K;
+    int64_t b_dim0 = N;
+    int64_t b_dim1 = K;
+    int64_t c_dim0 = M;
+    int64_t c_dim1 = N;
+"""
+)
+
+# for profiler, no need to include TensorAccessor
+PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmCoord{M, N, K},
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
+    ref_B0,
+    nullptr_ref, // D0
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
+    ref_B1,
+    nullptr_ref, // D1
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
+    {ElementCompute(1), ElementCompute(0)},
+    {ElementCompute(1), ElementCompute(0)},
+    {},
+    1 // kSplitKSerial
+"""
+)
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.config")
+def gemm_rcr_config(func_attrs, dtype="float16"):
+    common_dual_gemm.make_fproc_f16(func_attrs, RCR)
+
+
+def common_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+    src_template,
+    problem_args_template,
+    bias_ptr_arg=None,
+    extra_code="",
+):
+    output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
+        stride_dim="*b_dim0"
+    )
+    return common_dual_gemm.gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        src_template,
+        problem_args_template,
+        ARGS_PARSER_TEMPLATE,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator=output_addr_calculator,
+        bias_ptr_arg=bias_ptr_arg,
+        extra_code=extra_code,
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.gen_profiler")
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        common_bias.SRC_TEMPLATE,
+        PROFILER_PROBLEM_ARGS_TEMPLATE,
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
+        extra_code=common_dual_gemm.EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+    problem_args_template=None,
+):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    if problem_args_template is None:
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
+    else:
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    return common_dual_gemm.gen_function(
+        func_attrs,
+        common_bias.SRC_TEMPLATE,
+        exec_cond_template,
+        problem_args,
+        input_ndims,
+        weight_ndims,
+        output_ndims,
+        dim_info_dict,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
+        ),
+        extra_code=common_dual_gemm.EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    return common_bias.FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        support_split_k=True,
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    bias = func_attrs["inputs"][2]
+    return common.gen_function_call(
+        func_attrs, indent, bias_ptr_arg=bias._attrs["name"]
+    )
+
+
+@registry.reg("cuda.dual_gemm_rcr_silu.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
index eb3fcde49..45d69ac00 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -87,7 +87,7 @@ def common_gen_profiler(
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
     )
-    common_softmax.gen_profiler(
+    return common_softmax.gen_profiler(
         func_attrs,
         workdir,
         dim_info_dict,
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
index 3b168b3d8..d5e7351a9 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 namespace cutlass {
 
 template <
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
index 5582ee24e..42e203069 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
@@ -43,9 +43,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
+  void*,
+  void*,
+  void*,
   {% for i in range(3) %}
   int64_t*,
   {% endfor %}
@@ -92,9 +92,9 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}bmm_rcr_n1_launcher<{{elem_input_type}}, {{read_vec_type}}, {{K}}>(
-{{indent}}    a_ptr,
-{{indent}}    b_ptr,
-{{indent}}    c_ptr,
+{{indent}}    ({{elem_input_type}}*)a_ptr,
+{{indent}}    ({{elem_input_type}}*)b_ptr,
+{{indent}}    ({{elem_input_type}}*)c_ptr,
 {{indent}}    B,
 {{indent}}    M,
 {{indent}}    alpha,
@@ -447,9 +447,9 @@
 } // namespace
 
 void {{function_name}} (
-    {{elem_input_type}}* a_ptr,
-    {{elem_input_type}}* b_ptr,
-    {{elem_input_type}}* c_ptr,
+    void* a_ptr,
+    void* b_ptr,
+    void* c_ptr,
     {% for i in range(3) %}
     int64_t *a_dim{{loop.index0}},
     {% endfor %}
@@ -496,8 +496,10 @@ def _get_original_dim_val(func_attrs, input_idx, dim):
     bk = _get_original_dim_val(func_attrs, 1, 2)
     assert ak == bk, f"ak is not equal to bk. ak: {ak}, bk: {bk}"
 
-    elem_input_type = "cutlass::half_t"
     backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     vec_lens = list(zip(*backend_spec.read_num_elements_to_backend_type))[0][:-1]
     alignment = tensor_accessor_codegen.find_max_alignment(
         ak, func_attrs["input_accessors"]
@@ -560,8 +562,17 @@ def _get_original_dim_val(func_attrs, input_idx, dim):
 @registry.reg("cuda.bmm_rcr_n1.func_decl")
 def gen_function_decl(func_attrs):
     func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     return FUNC_DECL_TEMPLATE.render(
-        func_name=func_name, elem_input_type="cutlass::half_t"
+        func_name=func_name,
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
index de29a6ab7..eb5cfe109 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
@@ -23,6 +23,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from ..gemm_universal import common
 
@@ -31,9 +32,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
   {% for i in range(3) %}
   int64_t*,
   {% endfor %}
@@ -71,10 +72,10 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}bmm_rrr_k1_tanh_launcher(
-{{indent}}    a_ptr,
-{{indent}}    b_ptr,
-{{indent}}    c_ptr,
+{{indent}}bmm_rrr_k1_tanh_launcher<{{elem_input_type}}>(
+{{indent}}    ({{elem_input_type}}*)a_ptr,
+{{indent}}    ({{elem_input_type}}*)b_ptr,
+{{indent}}    ({{elem_input_type}}*)c_ptr,
 {{indent}}    B,
 {{indent}}    M,
 {{indent}}    N,
@@ -86,6 +87,7 @@
 
 SRC_TEMPLATE = jinja2.Template(
     """
+#include <iostream>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
@@ -97,6 +99,10 @@
 
 namespace {
 
+template <typename T>
+__device__ T fast_tanh(T x);
+
+template <>
 __device__ half fast_tanh(half x) {
   #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
 
@@ -108,7 +114,7 @@
   #endif
 }
 
-template<int num_thread>
+template<typename ElemT, int num_thread>
 __global__ void bmm_rrr_k1_tanh_kernel(const float4* a_ptr,
                                   const float4* b_ptr,
                                   float4* c_ptr,
@@ -116,58 +122,75 @@
                                   const int M,
                                   const int N) {
   // TODO: check boundary
-  half tmp[64];
+  constexpr int num_elems_in_float4 = sizeof(float4) / sizeof(ElemT);
+  ElemT tmp[num_elems_in_float4 * num_elems_in_float4];
   int idx = blockIdx.x * num_thread + threadIdx.x;
   int m = idx % M;
   int b = idx / M;
   int a_idx_base = b * M + m;
   float4 a_vec = __ldg(a_ptr + a_idx_base);
-  half* a_vec_ptr = (half*)(&a_vec);
+  ElemT* a_vec_ptr = (ElemT*)(&a_vec);
   for (int n = 0; n < N; ++n) {
     int b_idx_base = b * N + n;
     float4 b_vec = __ldg(b_ptr + b_idx_base);
-    half* b_vec_ptr = (half*)(&b_vec);
-    for (int i = 0; i < 8; ++i) {
+    ElemT* b_vec_ptr = (ElemT*)(&b_vec);
+    for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < 8; ++j) {
-        tmp[i * 8 + j] = fast_tanh(__hmul(a_vec_ptr[i], b_vec_ptr[j]));
+      for (int j = 0; j < num_elems_in_float4; ++j) {
+        tmp[i * num_elems_in_float4 + j] = fast_tanh(__hmul(a_vec_ptr[i], b_vec_ptr[j]));
       }
     }
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 8; ++i) {
-        int c_idx = (b * M * 8  + m * 8 + i) * N  + n;
-        c_ptr[c_idx] = *((const float4*)(tmp + i * 8));
+    for (int i = 0; i < num_elems_in_float4; ++i) {
+        int c_idx = (b * M * num_elems_in_float4  + m * num_elems_in_float4 + i) * N  + n;
+        c_ptr[c_idx] = *((const float4*)(tmp + i * num_elems_in_float4));
     }
   }
 }
 
 
-void bmm_rrr_k1_tanh_launcher(cutlass::half_t* a_ptr,
-                         cutlass::half_t* b_ptr,
-                         cutlass::half_t* c_ptr,
+template <typename ElemT>
+void bmm_rrr_k1_tanh_launcher(ElemT* a_ptr,
+                         ElemT* b_ptr,
+                         ElemT* c_ptr,
                          int B,
                          int M,
                          int N,
                          cudaStream_t stream) {
+  constexpr int num_elems_in_float4 = sizeof(float4) / sizeof(ElemT);
+  if (M % num_elems_in_float4 != 0) {
+     auto msg = std::string("Got error: ") + std::to_string(M) + "%" +
+       std::to_string(num_elems_in_float4) + " != 0 " +
+       " at " + __FILE__ + ": " + std::to_string(__LINE__);
+     std::cerr << msg << std::endl;
+     throw std::runtime_error(msg);
+  }
+  if (N % num_elems_in_float4 != 0) {
+     auto msg = std::string("Got error: ") + std::to_string(N) + "%" +
+       std::to_string(num_elems_in_float4) + " != 0 " +
+       " at " + __FILE__ + ": " + std::to_string(__LINE__);
+     std::cerr << msg << std::endl;
+     throw std::runtime_error(msg);
+  }
   const int nthread = 256;
   dim3 thread_block(nthread);
-  dim3 grid(B * M / nthread / 8);
-  bmm_rrr_k1_tanh_kernel<nthread><<<grid, thread_block, 0, stream>>>(
+  dim3 grid(B * M / nthread / num_elems_in_float4);
+  bmm_rrr_k1_tanh_kernel<ElemT, nthread><<<grid, thread_block, 0, stream>>>(
     (const float4*)a_ptr,
     (const float4*)b_ptr,
     (float4*) c_ptr,
     B,
-    M / 8,
-    N / 8
+    M / num_elems_in_float4,
+    N / num_elems_in_float4
   );
 }
 
 } // namespace
 
 void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
-    cutlass::half_t* c_ptr,
+    void* a_ptr,
+    void* b_ptr,
+    void* c_ptr,
     {% for i in range(3) %}
     int64_t *a_dim{{loop.index0}},
     {% endfor %}
@@ -199,7 +222,11 @@ def gen_function(func_attrs, exec_cond_template, dim_info_dict):
         weight_ndims=3,
         output_ndims=3,
     )
-    exec_paths = EXEC_TEMPLATE.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
     return SRC_TEMPLATE.render(
         function_name=func_name,
         shape_function=shape_func,
diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index 81ed764e8..b53b74f37 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -28,6 +28,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from ...target import Target
 from ..gemm_universal import common
@@ -38,9 +39,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
   {% for i in range(a_ndim) %}
   int64_t*,
   {% endfor %}
@@ -81,10 +82,10 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}gemm_rrr_small_nk_launcher<{{N}}, {{K}}>(
-{{indent}}    a_ptr,
-{{indent}}    b_ptr,
-{{indent}}    c_ptr,
+{{indent}}gemm_rrr_small_nk_launcher<{{elem_input_type}}, {{N}}, {{K}}>(
+{{indent}}    ({{elem_input_type}}*)a_ptr,
+{{indent}}    ({{elem_input_type}}*)b_ptr,
+{{indent}}    ({{elem_input_type}}*)c_ptr,
 {{indent}}    M,
 {{indent}}    use_fp16_acc,
 {{indent}}    stream
@@ -96,6 +97,8 @@
 
 SRC_TEMPLATE = jinja2.Template(
     """
+#include <iostream>
+#include <type_traits>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
@@ -107,10 +110,8 @@
 // B matrix: K x N
 // C tile: 8 x N
 template<int num_thread, int N, int K, bool USE_FP16_ACC>
-__global__ void gemm_rrr_small_nk_kernel(float4* a_ptr,
-                                         float4* b_ptr,
-                                         float4* c_ptr,
-                                         int M) {
+__global__ void gemm_rrr_small_nk_kernel_half(
+    float4* a_ptr, float4* b_ptr, float4* c_ptr, int M) {
   int idx = blockIdx.x * num_thread + threadIdx.x;
 
   if (idx >= (M + 7) / 8) {
@@ -223,40 +224,48 @@
 }
 
 // N <= 8, K <= 8
-template<int N, int K>
-void gemm_rrr_small_nk_launcher(cutlass::half_t* a_ptr,
-                         cutlass::half_t* b_ptr,
-                         cutlass::half_t* c_ptr,
+template<typename ElemT, int N, int K>
+void gemm_rrr_small_nk_launcher(ElemT* a_ptr,
+                         ElemT* b_ptr,
+                         ElemT* c_ptr,
                          int M,
                          bool use_fp16_acc,
                          cudaStream_t stream) {
+  constexpr int num_elems_in_float4 = sizeof(float4) / sizeof(ElemT);
   const int nthread = 256;
   dim3 thread_block(nthread);
-  const int n_element_per_t = nthread * 8;
+  constexpr int n_element_per_t = nthread * num_elems_in_float4;
   dim3 grid((M + n_element_per_t - 1) / n_element_per_t);
-  if(use_fp16_acc) {
-    gemm_rrr_small_nk_kernel<nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
-      (float4*)a_ptr,
-      (float4*)b_ptr,
-      (float4*)c_ptr,
-      M
-    );
+  if constexpr (std::is_same<ElemT, half>::value) {
+    if(use_fp16_acc) {
+      gemm_rrr_small_nk_kernel_half<nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
+        (float4*)a_ptr,
+        (float4*)b_ptr,
+        (float4*)c_ptr,
+        M
+      );
+    } else {
+      gemm_rrr_small_nk_kernel_half<nthread, N, K, false><<<grid, thread_block, 0, stream>>>(
+        (float4*)a_ptr,
+        (float4*)b_ptr,
+        (float4*)c_ptr,
+        M
+      );
+    }
   } else {
-    gemm_rrr_small_nk_kernel<nthread, N, K, false><<<grid, thread_block, 0, stream>>>(
-      (float4*)a_ptr,
-      (float4*)b_ptr,
-      (float4*)c_ptr,
-      M
-    );
+    auto msg = std::string("Got error: unsupported elem type ") +
+      " at " + __FILE__ + ": " + std::to_string(__LINE__);
+    std::cerr << msg << std::endl;
+    throw std::runtime_error(msg);
   }
 }
 
 } // namespace
 
 void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
-    cutlass::half_t* c_ptr,
+    void* a_ptr,
+    void* b_ptr,
+    void* c_ptr,
     {% for i in range(a_ndim) %}
     int64_t *a_dim{{loop.index0}},
     {% endfor %}
@@ -299,11 +308,17 @@ def gen_function(func_attrs, exec_cond_template, dim_info_dict):
         weight_ndims=2,
         output_ndims=c_ndim,
     )
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     if n == 0 or k == 0:
         # avoid "zero-sized variable not allowed in device code" error
         exec_paths = ""
     else:
-        exec_paths = EXEC_TEMPLATE.render(indent="  ", N=n, K=k)
+        exec_paths = EXEC_TEMPLATE.render(
+            indent="  ", elem_input_type=elem_input_type, N=n, K=k
+        )
     return SRC_TEMPLATE.render(
         function_name=func_name,
         shape_function=shape_func,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
index c07983128..9d04403bc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
@@ -42,6 +42,7 @@
     gemm_rcr_bias_sigmoid_mul_tanh,
     gemm_rcr_bias_swish,
     gemm_rcr_bias_tanh,
+    gemm_rcr_fast_gelu,
     gemm_rcr_permute,
     gemm_rrr,
     gemm_rrr_permute,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
index 25ad9e9a8..b8e3fa6c1 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
@@ -44,22 +44,30 @@ def _get_problem_info(**kwargs):
 
 @registry.reg("cuda.bmm_ccr.config")
 def bmm_ccr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.bmm_ccr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -74,16 +82,21 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -97,12 +110,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
index ea9ff0510..fe8e605f0 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
@@ -29,7 +29,7 @@ def bmm_ccr_add_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.bmm_ccr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -45,7 +45,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
     )
 
     mm_info = bmm_ccr._get_problem_info(
-        bias_ptr="d_ptr",
+        bias_ptr="(d_ptr)",
         alpha_value=func_attrs.get("alpha", 1),
         beta_value=1,
     )
@@ -54,11 +54,14 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -73,14 +76,18 @@ def gen_function(
     dim_info_dict,
 ):
     mm_info = bmm_ccr._get_problem_info(
-        bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1
+        bias_ptr="(d_ptr)",
+        alpha_value=func_attrs.get("alpha", 1),
+        beta_value=1,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 7b22806e3..6a00b0fc5 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -19,6 +19,7 @@
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import common
 
@@ -55,12 +56,12 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
 {% if has_d %}
-  cutlass::half_t*,
+  void*,
 {% endif %}
-  cutlass::half_t*,
+  void*,
   uint8_t*,
 {% if support_split_k %}
   int,
@@ -85,6 +86,9 @@
 {{indent}}{
 {{indent}}{{local_dim_defs}}
 {{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    gemm_op,
+{% endif %}
 {{indent}}    {{a_ptr}},
 {{indent}}    {{b_ptr}},
 {% if has_d %}
@@ -94,7 +98,7 @@
 {{indent}}    {{bias_ptr}},
 {% endif %}
 {{indent}}    {{c_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {% for dim in a_dims_ptr %}
 {{indent}}    {{dim}},
 {% endfor %}
@@ -135,14 +139,14 @@
   // need to tune it for other devices
   int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
 
-  memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
 {% if has_bias %}
-  memory_pool->AllocateHalfTensor(c_dim2, mem_pool_sz);  // bias_ptr: index 3
+  memory_pool->AllocateTensor(c_dim2, mem_pool_sz);  // bias_ptr: index 3
 {% endif %}
 {% if has_d %}
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // d_ptr: index 3 (no bias) or 4
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // d_ptr: index 3 (no bias) or 4
 {% endif %}
 """
 )
@@ -189,10 +193,10 @@ def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None):
     {{mm_info.problem_size}},
     {{mm_info.batch_size}},
     {ElementComputeEpilogue({{mm_info.alpha_value}}), ElementComputeEpilogue({{mm_info.beta_value}})},
-    (void*) {{mm_info.a_ptr}},
-    (void*) {{mm_info.b_ptr}},
-    (void*) {{mm_info.bias_ptr}},
-    (void*) {{mm_info.c_ptr}},
+    {{mm_info.a_ptr}},
+    {{mm_info.b_ptr}},
+    {{mm_info.bias_ptr}},
+    {{mm_info.c_ptr}},
     {{mm_info.a_batch_stride}},
     {{mm_info.b_batch_stride}},
     {{mm_info.bias_batch_stride}},
@@ -232,6 +236,7 @@ def _fill(arr, idx, val):
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args,
@@ -240,6 +245,10 @@ def gen_profiler(
 ):
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    backend_spec = CUDASpec()
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     has_d = False
     if "has_d" in func_attrs:
         has_d = func_attrs["has_d"]
@@ -247,75 +256,114 @@ def gen_profiler(
     a_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     b_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     c_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)]
+    b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)]
+    c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)]
     shape_func = gemm_common.gen_shape_eval_code(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
     has_bias = bias_ptr_arg is not None
     assert not (has_d and has_bias)
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = common.EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        problem_args=problem_args,
+    )
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+    )
+
+    function_name = "bmm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common.emit_instance(op, for_profiler=True)
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = common.EXEC_TEMPLATE.render(
+        benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            problem_args=problem_args,
-        )
-        input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
-            input_ndims=a_ndims,
-            weight_ndims=b_ndims,
-            output_ndims=c_ndims,
-        )
-        op_func = src_template.render(
-            instances=instance,
-            function_name="bmm",
-            input_ndims=a_ndims,
-            weight_ndims=b_ndims,
-            output_ndims=c_ndims,
-            shape_eval=shape_func,
-            input_output_checks=input_output_checks,
-            exec_paths=exec_program,
-            has_d=has_d,
-        )
-        a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)]
-        b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)]
-        c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)]
-        func_call = FUNC_CALL_TEMPLATE.render(
-            func_name="bmm",
-            a_ptr="memory_pool->RequestHalfTensorByIdx(0)",
-            b_ptr="memory_pool->RequestHalfTensorByIdx(1)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
             has_bias=has_bias,
             bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestHalfTensorByIdx(2)",
-            d_ptr="memory_pool->RequestHalfTensorByIdx(%d)" % (4 if has_bias else 3),
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
             has_d=has_d,
-            a_dims_ptr=a_dims_ptr,
-            b_dims_ptr=b_dims_ptr,
-            c_dims_ptr=c_dims_ptr,
-        )
-        code = common.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parser,
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(
-                name=name,
-                a_ndims=a_ndims,
-                b_ndims=b_ndims,
-                c_ndims=c_ndims,
-                has_d=has_d,
-                has_bias=has_bias,
-            ),
+            adims=a_dims_ptr,
+            bdims=b_dims_ptr,
+            cdims=c_dims_ptr,
         )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        has_d=has_d,
+    )
+    benchmark_adims = [f"a_dim{idx}" for idx in range(a_ndims)]
+    benchmark_bdims = [f"b_dim{idx}" for idx in range(b_ndims)]
+    benchmark_cdims = [f"c_dim{idx}" for idx in range(c_ndims)]
+    func_call = FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=function_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr="bias_ptr",
+        c_ptr="c_ptr",
+        d_ptr="d_ptr",
+        has_d=has_d,
+        a_dims_ptr=benchmark_adims,
+        b_dims_ptr=benchmark_bdims,
+        c_dims_ptr=benchmark_cdims,
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        has_d=has_d,
+        args_parse=args_parser,
+        function_name=function_name,
+        func_call=func_call,
+        name=instance_name_base,
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+        tensor_decl=TENSOR_DECL_TEMPLATE.render(
+            a_ndims=a_ndims,
+            b_ndims=b_ndims,
+            c_ndims=c_ndims,
+            has_d=has_d,
+            has_bias=has_bias,
+        ),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function_decl(func_attrs):
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
index 62d6eee96..213234342 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
@@ -46,22 +46,30 @@ def _get_problem_info(**kwargs):
 
 @registry.reg("cuda.bmm_crr.config")
 def bmm_crr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.bmm_crr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -76,16 +84,21 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -99,12 +112,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
index 2767af9b0..ce62a6a1e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
@@ -31,7 +31,7 @@ def bmm_crr_add_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.bmm_crr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -47,18 +47,23 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
     )
 
     mm_info = bmm_crr._get_problem_info(
-        bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1
+        bias_ptr="d_ptr",
+        alpha_value=func_attrs.get("alpha", 1),
+        beta_value=1,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -73,14 +78,18 @@ def gen_function(
     dim_info_dict,
 ):
     mm_info = bmm_crr._get_problem_info(
-        bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1
+        bias_ptr="d_ptr",
+        alpha_value=func_attrs.get("alpha", 1),
+        beta_value=1,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
index 582bfd38e..222522396 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
@@ -15,6 +15,7 @@
 """
 Common functions and templates for bmm_permute-family ops
 """
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from ..gemm_universal import common, common_bias
 
@@ -26,6 +27,7 @@
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args,
@@ -37,6 +39,10 @@ def gen_profiler(
     """Generate code for profiling"""
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    backend_spec = CUDASpec()
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     has_d = False
     if "has_d" in func_attrs:
         has_d = func_attrs["has_d"]
@@ -44,14 +50,32 @@ def gen_profiler(
     a_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     b_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     c_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)]
+    b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)]
+    c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)]
     shape_func = gemm_common.gen_shape_eval_code(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
     has_bias = bias_ptr_arg is not None
     assert not (has_d and has_bias)
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = common.EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        problem_args=problem_args,
+    )
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+    )
+
+    function_name = "bmm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common_permute.emit_instance(
             op,
             for_profiler=True,
@@ -59,66 +83,87 @@ def gen_profiler(
             func_attrs=func_attrs,
         )
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = common.EXEC_TEMPLATE.render(
+        benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            problem_args=problem_args,
-        )
-        input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
-            input_ndims=a_ndims,
-            weight_ndims=b_ndims,
-            output_ndims=c_ndims,
-        )
-        op_func = src_template.render(
-            instances=instance,
-            function_name="bmm",
-            input_ndims=a_ndims,
-            weight_ndims=b_ndims,
-            output_ndims=c_ndims,
-            shape_eval=shape_func,
-            input_output_checks=input_output_checks,
-            exec_paths=exec_program,
-            has_d=has_d,
-            extra_code=extra_code,
-        )
-        a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)]
-        b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)]
-        c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)]
-        func_call = bmm_common.FUNC_CALL_TEMPLATE.render(
-            func_name="bmm",
-            a_ptr="memory_pool->RequestHalfTensorByIdx(0)",
-            b_ptr="memory_pool->RequestHalfTensorByIdx(1)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
             has_bias=has_bias,
             bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestHalfTensorByIdx(2)",
-            d_ptr="memory_pool->RequestHalfTensorByIdx(%d)" % (4 if has_bias else 3),
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
             has_d=has_d,
-            a_dims_ptr=a_dims_ptr,
-            b_dims_ptr=b_dims_ptr,
-            c_dims_ptr=c_dims_ptr,
-        )
-        code = common.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parser,
-            func_call=func_call,
-            name=name,
-            tensor_decl=bmm_common.TENSOR_DECL_TEMPLATE.render(
-                name=name,
-                a_ndims=a_ndims,
-                b_ndims=b_ndims,
-                c_ndims=c_ndims,
-                has_d=has_d,
-                has_bias=has_bias,
-            ),
+            adims=a_dims_ptr,
+            bdims=b_dims_ptr,
+            cdims=c_dims_ptr,
         )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        has_d=has_d,
+        extra_code=extra_code,
+    )
+    benchmark_adims = [f"a_dim{idx}" for idx in range(a_ndims)]
+    benchmark_bdims = [f"b_dim{idx}" for idx in range(b_ndims)]
+    benchmark_cdims = [f"c_dim{idx}" for idx in range(c_ndims)]
+    func_call = bmm_common.FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=function_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr=bias_ptr_arg,
+        c_ptr="c_ptr",
+        d_ptr="d_ptr",
+        has_d=has_d,
+        a_dims_ptr=benchmark_adims,
+        b_dims_ptr=benchmark_bdims,
+        c_dims_ptr=benchmark_cdims,
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        has_d=has_d,
+        args_parse=args_parser,
+        function_name=function_name,
+        func_call=func_call,
+        name=instance_name_base,
+        input_ndims=a_ndims,
+        weight_ndims=b_ndims,
+        output_ndims=c_ndims,
+        tensor_decl=bmm_common.TENSOR_DECL_TEMPLATE.render(
+            a_ndims=a_ndims,
+            b_ndims=b_ndims,
+            c_ndims=c_ndims,
+            has_d=has_d,
+            has_bias=has_bias,
+        ),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function_decl(func_attrs):
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
index d660f3c61..c8afa49aa 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
@@ -19,6 +19,7 @@
 """
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import bmm_common, common
 from .layout import RCR
@@ -47,11 +48,11 @@ def _get_default_problem_info(**kwargs):
 
 @registry.reg("cuda.bmm_rcr.config")
 def bmm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 @registry.reg("cuda.bmm_rcr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -66,16 +67,21 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_default_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -89,6 +95,14 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     input_a_batch_stride_dim = "M * K"
     input_a_stride_k_dim = "K"
     input_a_offset = 0
@@ -151,10 +165,10 @@ def gen_function(
 
     bmm_problem_info = bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(a_ptr + input_a_offset)",
-        b_ptr="(b_ptr + input_b_offset)",
-        bias_ptr="(c_ptr + output_offset)",
-        c_ptr="(c_ptr + output_offset)",
+        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
+        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
+        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="input_a_batch_stride",
         b_batch_stride="input_b_batch_stride",
         bias_batch_stride="output_batch_stride",
@@ -168,7 +182,9 @@ def gen_function(
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
index 2dc737be5..17574b62e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
@@ -19,6 +19,7 @@
 """
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import bmm_common, bmm_permute_common, common, common_permute
 
@@ -27,23 +28,31 @@
 
 @registry.reg("cuda.bmm_rcr_permute.config")
 def bmm_rcr_permute_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common_permute.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
-    func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = common_permute.extract_config(fproc, func_attrs)
 
 
 @registry.reg("cuda.bmm_rcr_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -78,9 +87,10 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         mm_info=bmm_problem_info,
     )
 
-    bmm_permute_common.gen_profiler(
+    return bmm_permute_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -96,6 +106,14 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     input_a_batch_stride_dim = "M * K"
     input_a_stride_k_dim = "K"
     input_a_offset = 0
@@ -148,10 +166,10 @@ def gen_function(
 
     bmm_problem_info = bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(a_ptr + input_a_offset)",
-        b_ptr="(b_ptr + input_b_offset)",
-        bias_ptr="(c_ptr + output_offset)",
-        c_ptr="(c_ptr + output_offset)",
+        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
+        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
+        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="input_a_batch_stride",
         b_batch_stride="input_b_batch_stride",
         bias_batch_stride="output_batch_stride",
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
index bc752b1bb..489059f31 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
@@ -46,22 +46,30 @@ def _get_problem_info(**kwargs):
 
 @registry.reg("cuda.bmm_rrr.config")
 def bmm_rrr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.bmm_rrr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -76,16 +84,21 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -99,12 +112,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
index bb8201291..44fbda070 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
@@ -31,7 +31,7 @@ def bmm_rrr_add_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.bmm_rrr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -47,18 +47,23 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
     )
 
     mm_info = bmm_rrr._get_problem_info(
-        bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1
+        bias_ptr="d_ptr",
+        alpha_value=func_attrs.get("alpha", 1),
+        beta_value=1,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -73,14 +78,18 @@ def gen_function(
     dim_info_dict,
 ):
     mm_info = bmm_rrr._get_problem_info(
-        bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1
+        bias_ptr="d_ptr",
+        alpha_value=func_attrs.get("alpha", 1),
+        beta_value=1,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
     bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
index d1d17ee8d..40a69bd28 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
@@ -19,6 +19,7 @@
 """
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import bmm_common, bmm_permute_common, common, common_permute
 
@@ -27,23 +28,31 @@
 
 @registry.reg("cuda.bmm_rrr_permute.config")
 def bmm_rrr_permute_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common_permute.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
-    func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = common_permute.extract_config(fproc, func_attrs)
 
 
 @registry.reg("cuda.bmm_rrr_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_dims = bmm_common.reverse_dim_info_mapping(
         dim_info_dict, gemm_common.Source.INPUT, 0
     )
@@ -78,9 +87,10 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         mm_info=bmm_problem_info,
     )
 
-    bmm_permute_common.gen_profiler(
+    return bmm_permute_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -96,6 +106,14 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     input_a_batch_stride_dim = "M * K"
     input_a_stride_k_dim = "K"
     input_a_offset = 0
@@ -158,10 +176,10 @@ def gen_function(
 
     bmm_problem_info = bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(a_ptr + input_a_offset)",
-        b_ptr="(b_ptr + input_b_offset)",
-        bias_ptr="(c_ptr + output_offset)",
-        c_ptr="(c_ptr + output_offset)",
+        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
+        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
+        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="input_a_batch_stride",
         b_batch_stride="input_b_batch_stride",
         bias_batch_stride="output_batch_stride",
@@ -175,7 +193,9 @@ def gen_function(
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
 
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     return bmm_permute_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 199311035..9c18ab765 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -27,7 +27,8 @@
 
 from ....compiler.base import IntImm
 
-from ... import builder
+from ...backend_spec import CUDASpec
+
 from ...common import gemm_common, tensor_accessor_codegen
 from ...target import Target
 
@@ -153,13 +154,19 @@
 
 {{instances}}
 
+{% if is_profiler %}
+template <typename GemmInstance>
+void {{function_name}} (
+    GemmInstance& gemm_op,
+{% else %}
 void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
+{% endif %}
+    void* a_ptr,
+    void* b_ptr,
 {% if has_d %}
-    cutlass::half_t* d_ptr,
+    void* d_ptr,
 {% endif %}
-    cutlass::half_t* c_ptr,
+    void* c_ptr,
     uint8_t* workspace,
 {% if support_split_k %}
     int split_k,
@@ -211,13 +218,14 @@
 {{problem_args}}
 
 {{indent}}};
-{{indent}}{{instance}} gemm_op;
 {% if is_profiler %}
 {{indent}}// https://www.youtube.com/watch?v=rRwxfYlgG-M
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();
 {{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} gemm_op;
 {% endif %}
 {{indent}}auto status = gemm_op.can_implement(arguments);
 {{indent}}CUTLASS_CHECK(status);
@@ -233,9 +241,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
   uint8_t*,
 {% if support_split_k %}
   int,
@@ -260,13 +268,16 @@
 {{indent}}{
 {{indent}}{{local_dim_defs}}
 {{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    gemm_op,
+{% endif %}
 {{indent}}    {{a_ptr}},
 {{indent}}    {{b_ptr}},
 {% if has_bias %}
 {{indent}}    {{bias_ptr}},
 {% endif %}
 {{indent}}    {{c_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {{indent}}    {{split_k}},
 {% for dim in adims %}
 {{indent}}    {{dim}},
@@ -284,6 +295,53 @@
 )
 
 
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}
+{{indent}}{{instance_name}} {{gemm_op}};
+{{indent}}const char *gemm_op_name = "{{gemm_op_name}}";
+{{indent}}int ret = 0;
+{{indent}}try {
+{{indent}}ret = {{func_name}}(
+{{indent}}    {{gemm_op}},
+{{indent}}    gemm_op_name,
+{{indent}}    {{a_ptr}},
+{{indent}}    {{b_ptr}},
+{% if has_bias %}
+{{indent}}    {{bias_ptr}},
+{% endif %}
+{% if has_d %}
+{{indent}}    {{d_ptr}},
+{% endif %}
+{% if has_d1 %}
+{{indent}}    {{d1_ptr}},
+{% endif %}
+{{indent}}    {{c_ptr}},
+{{indent}}    global_workspace_,
+{% if support_split_k %}
+{{indent}}    {{split_k}},
+{% endif %}
+{% for dim in adims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{% for dim in bdims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{% for dim in cdims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{{indent}}    stream
+{{indent}});
+{{indent}}} catch (...) {}
+{{indent}}if (ret != 0)
+{{indent}}  return ret;
+{{indent}}
+{{indent}}}
+"""
+)
+
+
 TENSOR_DECL_TEMPLATE = jinja2.Template(
     """
   int64_t a_ptr_sz = a_dim0 * a_dim1;
@@ -296,12 +354,12 @@
   // need to tune it for other devices
   int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
 
-  memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
 
 {% if has_bias %}
-  memory_pool->AllocateHalfTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
+  memory_pool->AllocateTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
 {% endif %}
 
 """
@@ -315,6 +373,95 @@
 
 {{op_func}}
 
+template <typename GemmInstance>
+int benchmark_{{function_name}} (
+{% if is_group_gemm %}
+    GemmInstance &gemm_op,
+    const char *gemm_op_name,
+    int sharedMemPerMultiprocessor,
+    int multiProcessorCount,
+    uint8_t* global_workspace_,
+    int problem_count,
+    cutlass::gemm::GemmCoord* problem_sizes_device,
+    void **ptr_A,
+    void **ptr_B,
+    void **ptr_C,
+{% if has_bias %}
+    void **ptr_bias,
+{% endif %}
+    int64_t* lda,
+    int64_t* ldb,
+    int64_t* ldc,
+{% if has_bias %}
+    int64_t* ldd,
+{% endif %}
+    int occupancy,
+    cudaStream_t stream
+
+{% else %}
+
+    GemmInstance &gemm_op,
+    const char *gemm_op_name,
+    void* a_ptr,
+    void* b_ptr,
+{% if has_bias %}
+    void* bias_ptr,
+{% endif %}
+{% if has_d %}
+    void* d_ptr,
+{% endif %}
+{% if has_d1 %}
+    void* d1_ptr,
+{% endif %}
+    void* c_ptr,
+    uint8_t* global_workspace_,
+{% if support_split_k %}
+    int split_k,
+{% endif %}
+{% for idx in range(input_ndims) %}
+    int64_t* a_dim{{idx}},
+{% endfor %}
+{% for idx in range(weight_ndims) %}
+    int64_t* b_dim{{idx}},
+{% endfor %}
+{% for idx in range(output_ndims) %}
+    int64_t* c_dim{{idx}},
+{% endfor %}
+    cudaStream_t stream
+{% endif %}
+  ) {
+  // warmup
+  for (int i = 0; i < 5; ++i) {
+    {{func_call}}
+  }
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 10; ++i) {
+    {{func_call}}
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "OP:" << gemm_op_name << ",";
+  std::cout << "TIME:" << runtime_ms << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  return 0;
+}
+
+template <typename DType>
 struct ProfilerMemoryPool {
   ProfilerMemoryPool() {
     std::random_device rd;
@@ -328,7 +475,6 @@
   }
   ~ProfilerMemoryPool() {}
 
-  template <typename DType>
   DType* AllocateGaussianTensor(int64_t size) {
     size_t length = size * sizeof(DType);
     blobs.emplace_back(length);
@@ -345,25 +491,20 @@
   }
 
 
-  cutlass::half_t* AllocateHalfGaussianTensor(int64_t size) {
-    return reinterpret_cast<cutlass::half_t*>(
-        AllocateGaussianTensor<__half>(size));
-  }
-
-  int AllocateHalfTensor(int64_t size, int64_t copy) {
+  int AllocateTensor(int64_t size, int64_t copy) {
     offsets.push_back(0);
     strides.push_back(size);
     copies.push_back(copy);
-    auto ptr = AllocateHalfGaussianTensor(size * copy);
+    auto ptr = AllocateGaussianTensor(size * copy);
     ptrs.push_back(reinterpret_cast<void*>(ptr));
     return ptrs.size() - 1;
   }
 
-  cutlass::half_t* RequestHalfTensorByIdx(int idx) {
+  DType* RequestTensorByIdx(int idx) {
     auto copy = copies.at(idx);
     auto offset = offsets.at(idx);
     auto stride = strides.at(idx);
-    cutlass::half_t* ptr = reinterpret_cast<cutlass::half_t*>(ptrs.at(idx));
+    DType* ptr = reinterpret_cast<DType*>(ptrs.at(idx));
     ptr += offset;
     offset += stride;
     if (offset == copy * stride) {
@@ -387,7 +528,7 @@
   int device_idx;
   cudaDeviceProp device_properties;
   cudaError_t result = cudaGetDevice(&device_idx);
-  auto memory_pool = std::make_unique<ProfilerMemoryPool>();
+  auto memory_pool = std::make_unique<ProfilerMemoryPool<{{elem_type}}>>();
   if (result != cudaSuccess) {
     throw std::runtime_error("cudaGetDevice() API call failed.");
   }
@@ -400,41 +541,12 @@
 
   {{args_parse}}
 
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
-  uint8_t* global_workspace = nullptr;
+  uint8_t* global_workspace_ = nullptr;
   cudaStream_t stream = nullptr;
 
   {{tensor_decl}}
 
-  // warmup
-  for (int i = 0; i < 5; ++i) {
-    {{func_call}}
-  }
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0]);
-  for (int i = 0; i < 10; ++i) {
-    {{func_call}}
-  }
-  cudaEventRecord(events[1]);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  {{benchmark_instances}}
   return 0;
 }
 """
@@ -512,6 +624,11 @@ def update_alignments_in_gemm_instance(
     epilogue_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
         output_accessor
     )
+
+    # if the last dim is dynamic, force align=1
+    if not isinstance(output_accessor.original_shapes[-1], IntImm):
+        epilogue_alignment = 1
+
     gemm_params = get_gemm_instance_template_params(op_def, kernel_config)
     epilogue_align_idx = 11
     a_align_idx = 17
@@ -592,7 +709,7 @@ def emit_instance(
     return op_def
 
 
-def extract_config(f_proc_op):
+def extract_config(f_proc_op, f_kernel_name=kernel_name):
     import cutlass_lib
 
     op_kind = cutlass_lib.library.OperationKind.Gemm
@@ -606,7 +723,7 @@ def extract_config(f_proc_op):
             ret = f_proc_op(op)
             if len(ret) > 0:
                 for op_inst in ret:
-                    key = kernel_name(op_inst)
+                    key = f_kernel_name(op_inst)
                     gemm_ops[key] = op_inst
     return gemm_ops
 
@@ -636,6 +753,13 @@ def gen_function(
     output_addr_calculator="",
     extra_code="",
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     op_instance = func_attrs["op_instance"]
@@ -697,6 +821,8 @@ def gen_function(
         has_d=has_d(func_attrs),
         has_d1=has_d1(func_attrs),
         extra_code=extra_code,
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
     )
 
 
@@ -705,11 +831,10 @@ def build_profiler(file_pairs):
     if target.disable_profiler_codegen():
         file_pairs = []
     elif target.use_dummy_profiling_results():
-        # if it is circle CI only random build 2 profiler
+        # if it is circle CI only random build 2 profilers
         random.shuffle(file_pairs)
         file_pairs = file_pairs[:2]
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True))
+    return file_pairs
 
 
 def add_profiler(file_pairs, workdir, op_type, output_name, code):
@@ -728,6 +853,7 @@ def add_profiler(file_pairs, workdir, op_type, output_name, code):
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
@@ -739,6 +865,16 @@ def gen_profiler(
 ):
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     ndims = 2
     adims = ["&a_dim" + str(i) for i in range(ndims)]
     bdims = ["&b_dim" + str(i) for i in range(ndims)]
@@ -747,68 +883,117 @@ def gen_profiler(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
     has_bias = bias_ptr_arg is not None
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        support_split_k=support_split_k,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+    )
+    input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+    )
+
+    function_name = "gemm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = emit_instance(op, for_profiler=True)
         config_name = extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = EXEC_TEMPLATE.render(
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            support_split_k=support_split_k,
-            problem_args=problem_args_template.render(),
-        )
-        input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render(
-            input_ndims=ndims,
-            weight_ndims=ndims,
-            output_ndims=ndims,
-        )
-        op_func = src_template.render(
-            instances=instance,
-            function_name="gemm",
-            input_ndims=ndims,
-            weight_ndims=ndims,
-            output_ndims=ndims,
-            shape_eval=shape_func,
-            input_output_checks=input_output_checks,
-            exec_paths=exec_program,
-            output_addr_calculator=output_addr_calculator,
-            support_split_k=support_split_k,
-            extra_code=extra_code,
-        )
-        func_call = FUNC_CALL_TEMPLATE.render(
-            func_name="gemm",
-            a_ptr="memory_pool->RequestHalfTensorByIdx(0)",
-            b_ptr="memory_pool->RequestHalfTensorByIdx(1)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
             has_bias=has_bias,
             bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestHalfTensorByIdx(2)",
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            support_split_k=support_split_k,
             split_k="split_k",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
         )
-        # TODO: Render args_parse by caller.
-        args_parse = (
-            args_parser_template
-            if isinstance(args_parser_template, str)
-            else args_parser_template.render()
-        )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parse,
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias),
-        )
-        add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    # TODO: Render args_parse by caller.
+    args_parse = (
+        args_parser_template
+        if isinstance(args_parser_template, str)
+        else args_parser_template.render()
+    )
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        output_addr_calculator=output_addr_calculator,
+        support_split_k=support_split_k,
+        extra_code=extra_code,
+    )
+    benchmark_adims = ["a_dim" + str(i) for i in range(ndims)]
+    benchmark_bdims = ["b_dim" + str(i) for i in range(ndims)]
+    benchmark_cdims = ["c_dim" + str(i) for i in range(ndims)]
+    func_call = FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=function_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr="bias_ptr",
+        c_ptr="c_ptr",
+        split_k="split_k",
+        adims=benchmark_adims,
+        bdims=benchmark_bdims,
+        cdims=benchmark_cdims,
+    )
+    tensor_decl = TENSOR_DECL_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        has_bias=has_bias,
+    )
+    code = PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        has_d=has_d(func_attrs),
+        support_split_k=support_split_k,
+        args_parse=args_parse,
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        func_call=func_call,
+        name=instance_name_base,
+        tensor_decl=tensor_decl,
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    build_profiler(file_pairs)
+    return build_profiler(file_pairs)
 
 
 def gen_local_dim_defs(func_attrs, indent="  "):
@@ -864,22 +1049,24 @@ def gen_function_call(func_attrs, indent="  ", bias_ptr_arg=None):
     )
 
 
-def default_fproc_f16(*, op, a_layout, b_layout, c_layout, epiligue_name):
+def default_fproc(
+    *, op, a_layout, b_layout, c_layout, elem_type, epiligue_name, permute_layout=None
+):
     import copy
 
     import cutlass_lib
 
     ret = []
-    data_type = cutlass_lib.library.DataType.f16
+    data_type = elem_type
     acc_type = cutlass_lib.library.DataType.f32
     # check target use fp16 acc
-    if "use_fp16_acc" in Target.current()._kwargs:
+    if "use_fp16_acc" in Target.current()._kwargs and data_type == "cutlass::half_t":
         if Target.current()._kwargs["use_fp16_acc"]:
             acc_type = cutlass_lib.library.DataType.f16
     if (
-        op.A.element == data_type
-        and op.B.element == data_type
-        and op.C.element == data_type
+        cutlass_lib.library.DataTypeTag[op.A.element] == data_type
+        and cutlass_lib.library.DataTypeTag[op.B.element] == data_type
+        and cutlass_lib.library.DataTypeTag[op.C.element] == data_type
         and op.accumulator_type() == acc_type
         and op.A.layout == a_layout
         and op.B.layout == b_layout
@@ -890,6 +1077,10 @@ def default_fproc_f16(*, op, a_layout, b_layout, c_layout, epiligue_name):
         # set epilogue
         op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name]
         op.element_epilogue = acc_type
+        if permute_layout is not None:
+            op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
+                permute_layout
+            ]
         # set C alignment
         for i in [8, 4, 2, 1]:
             op = copy.deepcopy(op)
@@ -898,23 +1089,27 @@ def default_fproc_f16(*, op, a_layout, b_layout, c_layout, epiligue_name):
     return ret
 
 
-def make_fproc_f16(func_attrs, layout):
+def make_fproc(func_attrs, layout):
     """
     This function sets a callback for processing the epilogue of the kernel
     associated with func_attrs.
     """
 
-    def fproc_f16(op):
+    backend_spec = CUDASpec()
+    elem_type = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    def fproc(op):
         a_layout, b_layout, c_layout = layout.cutlass_lib_layouts()
-        return default_fproc_f16(
+        return default_fproc(
             op=op,
             a_layout=a_layout,
             b_layout=b_layout,
             c_layout=c_layout,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = extract_config(fproc_f16)
+    func_attrs["op_instance"] = extract_config(fproc)
 
 
 def function_filter(cfg, func_attrs, ab_alignment):
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
index 98d8e979c..2d4e7f05a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
@@ -58,11 +58,17 @@
 
 {{instances}}
 
+{% if is_profiler %}
+template <typename GemmInstance>
 void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
-    cutlass::half_t* bias_ptr,
-    cutlass::half_t* c_ptr,
+    GemmInstance& gemm_op,
+{% else %}
+void {{function_name}} (
+{% endif %}
+    void* a_ptr,
+    void* b_ptr,
+    void* bias_ptr,
+    void* c_ptr,
     uint8_t* workspace,
 {% if support_split_k %}
     int split_k,
@@ -111,10 +117,10 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
+  void*,
   uint8_t*,
 {% if support_split_k %}
     int,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
index 843230243..bd7e437e4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
@@ -17,6 +17,7 @@
 Common codegen functions for gemm_bias_activation.
 """
 
+from ...backend_spec import CUDASpec
 from . import common, common_bias, gemm_rcr
 from .layout import RCR
 
@@ -24,23 +25,25 @@
 
 
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     problem_args_template,
     extra_code="",
 ):
-    gemm_rcr.common_gen_profiler(
+    return gemm_rcr.common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         problem_args_template,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
         extra_code=extra_code,
     )
 
@@ -55,7 +58,17 @@ def gen_function(
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
-    problem_args = problem_args_template.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = problem_args_template.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
         func_attrs,
         common_bias.SRC_TEMPLATE,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 5c46b3cc5..42564bc0c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -22,6 +22,7 @@
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from ...target import Target
 
@@ -70,16 +71,16 @@
     1,
 {% endif %}
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) (a_ptr + input_a_offset),
-    (void*) (b_ptr + input_b_offset),
-    (void*) d0_ptr,
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
+    ({{elem_output_type}}*)(d0_ptr),
 {% if has_d1 %}
-    (void*) d1_ptr,
+    ({{elem_output_type}}*)(d1_ptr),
 {% else %}
     nullptr,
 {% endif %}
-    (void*) (c_ptr + output_offset),
-    (void*) bias_ptr,
+    ({{elem_output_type}}*) (c_ptr) + output_offset,
+    ({{elem_input_type}}*) (bias_ptr),
     nullptr,
     /*batch_stride_A*/ input_a_batch_stride,
     /*batch_stride_B*/ input_b_batch_stride,
@@ -113,16 +114,16 @@
     1,
 {% endif %}
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) d0_ptr,
+    ({{elem_input_type}}*) a_ptr,
+    ({{elem_input_type}}*) b_ptr,
+    ({{elem_output_type}}*) d0_ptr,
 {% if has_d1 %}
-    (void*) d1_ptr,
+    ({{elem_output_type}}*) d1_ptr,
 {% else %}
     nullptr,
 {% endif %}
-    (void*) (c_ptr + output_offset),
-    (void*) bias_ptr,
+    ({{elem_output_type}}*) (c_ptr) + output_offset,
+    ({{elem_input_type}}*) bias_ptr,
     nullptr,
     /*batch_stride_A*/ 0,
     /*batch_stride_B*/ 0,
@@ -173,15 +174,21 @@
 
 {{instances}}
 
+{% if is_profiler %}
+template <typename GemmInstance>
 void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
-    cutlass::half_t* bias_ptr,
-    cutlass::half_t* d0_ptr,
+    GemmInstance& gemm_op,
+{% else %}
+void {{function_name}} (
+{% endif %}
+    void* a_ptr,
+    void* b_ptr,
+    void* bias_ptr,
+    void* d0_ptr,
 {% if has_d1 %}
-    cutlass::half_t* d1_ptr,
+    void* d1_ptr,
 {% endif %}
-    cutlass::half_t* c_ptr,
+    void* c_ptr,
     uint8_t* workspace,
 {% if support_split_k %}
     int split_k,
@@ -229,14 +236,14 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
+  void*,
+  void*,
 {% if has_d1 %}
-  cutlass::half_t*,
+  void*,
 {% endif %}
-  cutlass::half_t*,
+  void*,
   uint8_t*,
 {% if support_split_k %}
     int,
@@ -262,6 +269,9 @@
 {{indent}}{
 {{indent}}{{local_dim_defs}}
 {{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    gemm_op,
+{% endif %}
 {{indent}}    {{a_ptr}},
 {{indent}}    {{b_ptr}},
 {{indent}}    {{bias_ptr}},
@@ -270,7 +280,7 @@
 {{indent}}    {{d1_ptr}},
 {% endif %}
 {{indent}}    {{c_ptr}},
-{{indent}}    global_workspace,
+{{indent}}    global_workspace_,
 {% if support_split_k %}
 {{indent}} {{split_k}},
 {% endif %}
@@ -313,13 +323,13 @@
   // need to tune it for other devices
   int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
 
-  memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
-  memory_pool->AllocateHalfTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // d0 ptr: index 4
+  memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // d0 ptr: index 4
 {% if has_d1 %}
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // d1 ptr: index 5
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // d1 ptr: index 5
 {% endif %}
 """
 )
@@ -386,12 +396,13 @@ def gemm_bias_broadcast_instance(
 
 
 def gemm_bias_broadcast_config(func_attrs, layout, dtype="float16"):
-    common.make_fproc_f16(func_attrs, layout)
+    common.make_fproc(func_attrs, layout)
 
 
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     layout,
     unary_op1,
@@ -399,6 +410,16 @@ def gen_profiler(
     binary_op2,
     unary_op2,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     op_type = func_attrs["op"]
     support_split_k = _support_split_k(func_attrs)
     op_instance = func_attrs["op_instance"]
@@ -412,8 +433,29 @@ def gen_profiler(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = common.EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        problem_args=PROFILER_PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+            support_split_k=support_split_k,
+            layout=layout,
+            has_d1=has_d1,
+        ),
+    )
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+    )
+
+    function_name = "gemm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common.emit_instance(
             op,
             for_profiler=True,
@@ -427,64 +469,95 @@ def gen_profiler(
             ),
         )
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = common.EXEC_TEMPLATE.render(
+        benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            problem_args=PROFILER_PROBLEM_ARGS_TEMPLATE.render(
-                support_split_k=support_split_k, layout=layout, has_d1=has_d1
-            ),
-        )
-        input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
-            input_ndims=ndims,
-            weight_ndims=ndims,
-            output_ndims=ndims,
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="gemm",
-            input_ndims=ndims,
-            weight_ndims=ndims,
-            shape_eval=shape_func,
-            input_output_checks=input_output_checks,
-            exec_paths=exec_program,
-            output_addr_calculator=common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
-                stride_dim="N"
-            ),
-            support_split_k=support_split_k,
-            has_d1=has_d1,
-        )
-        func_call = FUNC_CALL_TEMPLATE.render(
-            func_name="gemm",
-            a_ptr="memory_pool->RequestHalfTensorByIdx(0)",
-            b_ptr="memory_pool->RequestHalfTensorByIdx(1)",
-            c_ptr="memory_pool->RequestHalfTensorByIdx(2)",
-            d0_ptr="memory_pool->RequestHalfTensorByIdx(4)",
-            d1_ptr="memory_pool->RequestHalfTensorByIdx(5)",
-            bias_ptr="memory_pool->RequestHalfTensorByIdx(3)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            d_ptr="memory_pool->RequestTensorByIdx(4)",
+            d1_ptr="memory_pool->RequestTensorByIdx(5)",
+            bias_ptr="memory_pool->RequestTensorByIdx(3)",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
             support_split_k=support_split_k,
             split_k="split_k",
+            has_bias=True,
+            has_d=True,
             has_d1=has_d1,
         )
-        code = common.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=ARGS_PARSER_TEMPLATE.render(
-                layout=layout, support_split_k=support_split_k
-            ),
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_d1=has_d1),
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = SRC_TEMPLATE.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        output_addr_calculator=common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
+            stride_dim="N"
+        ),
+        support_split_k=support_split_k,
+        has_d1=has_d1,
+    )
+    benchmark_adims = ["a_dim" + str(i) for i in range(ndims)]
+    benchmark_bdims = ["b_dim" + str(i) for i in range(ndims)]
+    benchmark_cdims = ["c_dim" + str(i) for i in range(ndims)]
+    func_call = FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name="gemm",
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        c_ptr="c_ptr",
+        d0_ptr="d_ptr",
+        d1_ptr="d1_ptr",
+        bias_ptr="bias_ptr",
+        adims=benchmark_adims,
+        bdims=benchmark_bdims,
+        cdims=benchmark_cdims,
+        support_split_k=support_split_k,
+        split_k="split_k",
+        has_d1=has_d1,
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=True,
+        has_d=True,
+        has_d1=has_d1,
+        support_split_k=support_split_k,
+        args_parse=ARGS_PARSER_TEMPLATE.render(
+            layout=layout, support_split_k=support_split_k
+        ),
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        func_call=func_call,
+        name=instance_name_base,
+        tensor_decl=TENSOR_DECL_TEMPLATE.render(has_d1=has_d1),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function(
@@ -497,6 +570,13 @@ def gen_function(
     binary_op2,
     unary_op2,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     input_addr_calculator = gemm_rcr.get_input_addr_calculator(func_attrs)
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
@@ -504,7 +584,11 @@ def gen_function(
     support_split_k = _support_split_k(func_attrs)
     has_d1 = common.has_d1(func_attrs)
     problem_args = PROBLEM_ARGS_TEMPLATE.render(
-        layout=layout, support_split_k=support_split_k, has_d1=has_d1
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        layout=layout,
+        support_split_k=support_split_k,
+        has_d1=has_d1,
     )
     return common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
new file mode 100644
index 000000000..8c1e80cc3
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
@@ -0,0 +1,105 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Common codegen functions for gemm_activation but use nullptr for bias.
+"""
+
+import jinja2
+
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+#include <iostream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/epilogue/thread/linear_combination_silu.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/device_memory.h"
+
+{{extra_code}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
+          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
+      std::cerr << msg << std::endl;                                                  \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{% if is_profiler %}
+template <typename GemmInstance>
+void {{function_name}} (
+    GemmInstance& gemm_op,
+{% else %}
+void {{function_name}} (
+{% endif %}
+    void* a_ptr,
+    void* b_ptr,
+    void* bias_ptr,
+    void* c_ptr,
+    uint8_t* workspace,
+{% if support_split_k %}
+    int split_k,
+{% endif %}
+{% for idx in range(input_ndims) %}
+    int64_t* a_dim{{idx}},
+{% endfor %}
+{% for idx in range(weight_ndims) %}
+    int64_t* b_dim{{idx}},
+{% endfor %}
+{% for idx in range(input_ndims) %}
+    int64_t* c_dim{{idx}},
+{% endfor %}
+  cudaStream_t stream
+  ) {
+  {{shape_eval}}
+  {{input_addr_calculator}}
+  {{output_addr_calculator}}
+  {{extra_shape}}
+  {{input_output_checks}}
+
+  if (bias_ptr) {
+    throw std::runtime_error("bias_ptr is not null!");
+  }
+
+  {{exec_paths}}
+  {% for idx in range(input_ndims) %}
+      std::cout << "input_ndims{{idx}}: " << *a_dim{{idx}} << std::endl;
+  {% endfor %}
+  {% for idx in range(weight_ndims) %}
+      std::cout << "weight_ndims{{idx}}: " << *b_dim{{idx}} << std::endl;
+  {% endfor %}
+  {% for idx in range(input_ndims) %}
+      std::cout << "output_ndims{{idx}}: " << *c_dim{{idx}} << std::endl;
+  {% endfor %}
+  throw std::runtime_error(
+      "Unsupported workload for this {{function_name}} specialization."
+  );
+}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
index 2f3f1e903..378911608 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
@@ -17,13 +17,13 @@
 """
 
 import re
-from collections import OrderedDict
+from functools import partial
 from hashlib import sha1
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...common import gemm_common
-from ...target import Target
 from ..gemm_universal import common
 
 # pylint: disable=C0301,C0415,R1705
@@ -63,6 +63,9 @@ def kernel_name(op, func_attrs):
     if len(shape) == 1:
         perm_type = "perm4d"
         perm_shape = f"{shape[0]}"
+    elif len(shape) == 2:
+        perm_type = "perm4d"
+        perm_shape = f"{shape[0]}_{shape[1]}"
     elif len(shape) == 3:
         perm_type = "perm5d"
         perm_shape = f"{shape[0]}_{shape[1]}_{shape[2]}"
@@ -83,63 +86,8 @@ def kernel_name(op, func_attrs):
     return name.replace("\n", "")
 
 
-def default_fproc_f16(
-    *, op, a_layout, b_layout, c_layout, epiligue_name, permute_layout
-):
-    """Generates new op_instances by adding alignment info, permute_layout, etc."""
-    import copy
-
-    import cutlass_lib
-
-    ret = []
-    data_type = cutlass_lib.library.DataType.f16
-    acc_type = cutlass_lib.library.DataType.f32
-    # check target use fp16 acc
-    if "use_fp16_acc" in Target.current()._kwargs:
-        if Target.current()._kwargs["use_fp16_acc"]:
-            acc_type = cutlass_lib.library.DataType.f16
-    if (
-        op.A.element == data_type
-        and op.B.element == data_type
-        and op.C.element == data_type
-        and op.accumulator_type() == acc_type
-        and op.A.layout == a_layout
-        and op.B.layout == b_layout
-    ):
-        op = copy.deepcopy(op)
-        # set output major
-        op.C.layout = c_layout
-        # set epilogue
-        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name]
-        op.element_epilogue = acc_type
-        op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
-            permute_layout
-        ]
-        # set C alignment
-        for i in [8, 4, 2, 1]:
-            op = copy.deepcopy(op)
-            op.C.alignment = i
-            ret.append(op)
-    return ret
-
-
 def extract_config(f_proc_op, func_attrs):
-    import cutlass_lib
-
-    op_kind = cutlass_lib.library.OperationKind.Gemm
-    gemm_kind = cutlass_lib.library.GemmKind.Universal
-    gemm_ops = OrderedDict()
-    extract_ops = list(Target.current()._operators[op_kind].items())
-
-    for _, value in extract_ops:
-        op = value[0]
-        if op.gemm_kind == gemm_kind:
-            ret = f_proc_op(op)
-            if len(ret) > 0:
-                for op_inst in ret:
-                    key = kernel_name(op_inst, func_attrs)
-                    gemm_ops[key] = op_inst
-    return gemm_ops
+    return common.extract_config(f_proc_op, partial(kernel_name, func_attrs=func_attrs))
 
 
 def gemm_permute_instance(op_def, func_attrs, for_profiler):
@@ -262,6 +210,7 @@ def gen_function(
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
@@ -272,6 +221,16 @@ def gen_profiler(
     bias_ptr_arg=None,
     extra_code="",
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
 
@@ -283,69 +242,109 @@ def gen_profiler(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
     has_bias = bias_ptr_arg is not None
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = common.EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        support_split_k=support_split_k,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+    )
+    input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+    )
+
+    function_name = "gemm"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = emit_instance(
             op, for_profiler=True, emit_kernel=emit_kernel, func_attrs=func_attrs
         )
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = common.EXEC_TEMPLATE.render(
+        benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            support_split_k=support_split_k,
-            problem_args=problem_args_template.render(),
-        )
-        input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
-            input_ndims=ndims,
-            weight_ndims=ndims,
-            output_ndims=ndims,
-        )
-        op_func = src_template.render(
-            instances=instance,
-            function_name="gemm",
-            input_ndims=2,
-            weight_ndims=2,
-            output_ndims=2,
-            shape_eval=shape_func,
-            input_output_checks=input_output_checks,
-            exec_paths=exec_program,
-            output_addr_calculator=output_addr_calculator,
-            support_split_k=support_split_k,
-            extra_code=extra_code,
-        )
-        func_call = common.FUNC_CALL_TEMPLATE.render(
-            func_name="gemm",
-            a_ptr="memory_pool->RequestHalfTensorByIdx(0)",
-            b_ptr="memory_pool->RequestHalfTensorByIdx(1)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
             has_bias=has_bias,
             bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestHalfTensorByIdx(2)",
+            c_ptr="memory_pool->RequestTensorByIdx(2)",
+            support_split_k=support_split_k,
             split_k="split_k",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
         )
-        # TODO: Render args_parse by caller.
-        args_parse = (
-            args_parser_template
-            if isinstance(args_parser_template, str)
-            else args_parser_template.render()
-        )
-        code = common.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parse,
-            func_call=func_call,
-            name=name,
-            tensor_decl=common.TENSOR_DECL_TEMPLATE.render(
-                name=name, has_bias=has_bias
-            ),
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        shape_eval=shape_func,
+        input_output_checks=input_output_checks,
+        exec_paths=exec_program,
+        output_addr_calculator=output_addr_calculator,
+        support_split_k=support_split_k,
+        extra_code=extra_code,
+    )
+    benchmark_adims = ["a_dim" + str(i) for i in range(ndims)]
+    benchmark_bdims = ["b_dim" + str(i) for i in range(ndims)]
+    benchmark_cdims = ["c_dim" + str(i) for i in range(ndims)]
+    func_call = common.FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=function_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr="bias_ptr",
+        c_ptr="c_ptr",
+        split_k="split_k",
+        adims=benchmark_adims,
+        bdims=benchmark_bdims,
+        cdims=benchmark_cdims,
+    )
+    # TODO: Render args_parse by caller.
+    args_parse = (
+        args_parser_template
+        if isinstance(args_parser_template, str)
+        else args_parser_template.render()
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        support_split_k=support_split_k,
+        args_parse=args_parse,
+        function_name=function_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        output_ndims=ndims,
+        func_call=func_call,
+        tensor_decl=common.TENSOR_DECL_TEMPLATE.render(has_bias=has_bias),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index 0fb211cb0..44c85125c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -20,6 +20,8 @@
 import jinja2
 
 from ... import registry
+
+from ...backend_spec import CUDASpec
 from . import common
 from .layout import RCR
 
@@ -49,10 +51,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) (a_ptr + input_a_offset),
-    (void*) (b_ptr + input_b_offset),
-    (void*) (c_ptr + output_offset),
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     input_a_batch_stride,
     input_b_batch_stride,
     /*output_batch_stride*/ M * N,
@@ -72,10 +74,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) c_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_output_type}}*)(c_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     M * N,
@@ -90,12 +92,13 @@
 
 @registry.reg("cuda.gemm_rcr.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 def common_gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
@@ -105,9 +108,10 @@ def common_gen_profiler(
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
     )
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         src_template,
         problem_args_template,
@@ -120,10 +124,11 @@ def common_gen_profiler(
 
 
 @registry.reg("cuda.gemm_rcr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         PROFILER_PROBLEM_ARGS_TEMPLATE,
@@ -172,7 +177,17 @@ def gen_function(
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
-    problem_args = PROBLEM_ARGS_TEMPLATE.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
         func_attrs,
         common.SRC_TEMPLATE,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index f54c0ed2c..7c06c7408 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -20,6 +20,8 @@
 import jinja2
 
 from ... import registry
+
+from ...backend_spec import CUDASpec
 from . import common, common_bias, gemm_rcr
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
@@ -32,10 +34,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) (a_ptr + input_a_offset),
-    (void*) (b_ptr + input_b_offset),
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     input_a_batch_stride,
     input_b_batch_stride,
     /*bias_batch_stride*/ N,
@@ -55,10 +57,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -77,14 +79,15 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    gemm_rcr.common_gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return gemm_rcr.common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         PROFILER_PROBLEM_ARGS_TEMPLATE,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -98,7 +101,17 @@ def gen_function(
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
-    problem_args = PROBLEM_ARGS_TEMPLATE.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
         func_attrs,
         common_bias.SRC_TEMPLATE,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
index c2fc67191..c556485f1 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
index 56511dbc1..bd2988abf 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_add_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
index f823baab2..5d262712e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_add_add_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
index bd4f7da4b..212b01a74 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_add_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index f55e21cd8..12af54f6a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -66,10 +66,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -88,10 +88,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_fast_gelu.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
         extra_code=EXTRA_CODE.render(),
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index d16d769a1..b4617b9d6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -30,10 +30,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -52,10 +52,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_gelu.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index 6c22e1e3a..a0952d345 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -30,10 +30,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -52,10 +52,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_hardswish.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
index f2049abef..1b2dea303 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_mul.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
index 55400a029..12bce07ae 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_mul_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
index 3d5abf306..c8be43f28 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_mul_tanh.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
index 2a4c75cbe..6abdcc977 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
@@ -17,6 +17,7 @@
 """
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ..gemm_universal import common
 from . import common_bias, common_permute, gemm_rcr_bias, gemm_rcr_permute
 
@@ -31,14 +32,15 @@ def gemm_rcr_bias_permute_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return gemm_rcr_permute.common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         PROBLEM_ARGS_TEMPLATE,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
         extra_code=common_permute.EXTRA_CODE.render(),
     )
 
@@ -50,10 +52,23 @@ def gen_function(
     dim_info_dict,
     problem_args_template=None,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     if problem_args_template is None:
-        problem_args = PROBLEM_ARGS_TEMPLATE.render()
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     else:
-        problem_args = problem_args_template.render()
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index 3a5940e7a..eae96241c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -31,10 +31,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -53,10 +53,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index 719efbfa2..e8ea6a976 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -31,10 +31,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -53,10 +53,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_sigmoid.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
index b3b306f38..2828d379d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
index 66cad13c4..b3d721d6c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
@@ -36,10 +36,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common_bias_broadcast.gen_profiler(
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_broadcast.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         RCR,
         UNARY_OP1,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index 688c9daf3..e4c082580 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -31,10 +31,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -53,10 +53,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_swish.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 8a11c966f..934c9a1c0 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -66,10 +66,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) bias_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_input_type}}*)(bias_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     N,
@@ -88,10 +88,11 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.gemm_rcr_bias_tanh.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         PROBLEM_ARGS_TEMPLATE,
         extra_code=EXTRA_CODE.render(),
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
new file mode 100644
index 000000000..791f3e300
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -0,0 +1,170 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for C = fast_gelu(GeMM(A, B) + bias)
+where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][K], C[RowMajor][M, N]
+"""
+import jinja2
+
+from ... import registry
+
+from ...backend_spec import CUDASpec
+from . import common, common_bias_activation, common_no_bias
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/constants.h"
+#include "cutlass/complex.h"
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+using LinearCombinationFastGELU = LinearCombinationGeneric<GELU_taylor, ElementOutput_, Count, ElementAccumulator_,
+                                                          ElementCompute_, Scale, Round, true>;
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+"""
+)
+
+
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,
+    {M, N, K},
+    split_k,
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+    ({{elem_input_type}}*) a_ptr,
+    ({{elem_input_type}}*) b_ptr,
+    nullptr,
+    ({{elem_output_type}}*) (c_ptr) + output_offset,
+    M * K,
+    N * K,
+    N,
+    M * N,
+    K,
+    K,
+    0,
+    output_stride
+"""
+)
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.config")
+def gemm_rcr_config(func_attrs, dtype="float16"):
+    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.gen_profiler")
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return common_bias_activation.gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        PROBLEM_ARGS_TEMPLATE,
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+):
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
+    return common.gen_function(
+        func_attrs,
+        common_no_bias.SRC_TEMPLATE,
+        exec_cond_template,
+        problem_args,
+        input_ndims,
+        weight_ndims,
+        output_ndims,
+        dim_info_dict,
+        support_split_k=True,
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            stride_dim="N",
+            output_accessor=func_attrs["output_accessors"][0],
+        ),
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.func_decl")
+def gen_function_decl(func_attrs):
+    return common_bias_activation.gen_function_decl(func_attrs)
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    return common.gen_function_call(func_attrs, indent, bias_ptr_arg="nullptr")
+
+
+@registry.reg("cuda.gemm_rcr_fast_gelu.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
index f2851db12..b5f1cc9da 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
@@ -20,8 +20,8 @@
 import jinja2
 
 from ... import registry
-from ..gemm_universal import common
-from . import common_permute
+from ...backend_spec import CUDASpec
+from . import common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
@@ -49,10 +49,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) c_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_output_type}}*)(c_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     M * N,
@@ -67,24 +67,33 @@
 
 @registry.reg("cuda.gemm_rcr_permute.config")
 def gemm_rcr_permute_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common_permute.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
-    func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = common_permute.extract_config(fproc, func_attrs)
 
 
 def common_gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
@@ -94,9 +103,10 @@ def common_gen_profiler(
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
     )
-    common_permute.gen_profiler(
+    return common_permute.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         src_template,
         problem_args_template,
@@ -110,10 +120,11 @@ def common_gen_profiler(
 
 
 @registry.reg("cuda.gemm_rcr_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         PROBLEM_ARGS_TEMPLATE,
@@ -128,10 +139,24 @@ def gen_function(
     dim_info_dict,
     problem_args_template=None,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     if problem_args_template is None:
-        problem_args = PROBLEM_ARGS_TEMPLATE.render()
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     else:
-        problem_args = problem_args_template.render()
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 0a3d109d6..90654c06f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -20,6 +20,8 @@
 import jinja2
 
 from ... import registry
+
+from ...backend_spec import CUDASpec
 from . import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
@@ -47,10 +49,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) c_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_output_type}}*)(c_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     M * N,
@@ -65,28 +67,37 @@
 
 @registry.reg("cuda.gemm_rrr.config")
 def gemm_rrr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.gemm_rrr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="N"
     )
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         PROBLEM_ARGS_TEMPLATE,
@@ -105,7 +116,17 @@ def gen_function(
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
-    problem_args = PROBLEM_ARGS_TEMPLATE.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
         func_attrs,
         common.SRC_TEMPLATE,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
index 8653efab1..4b7ced1ea 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
@@ -20,8 +20,9 @@
 import jinja2
 
 from ... import registry
-from ..gemm_universal import common
-from . import common_permute
+
+from ...backend_spec import CUDASpec
+from . import common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
@@ -49,10 +50,10 @@
     {M, N, K},
     split_k,
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    (void*) a_ptr,
-    (void*) b_ptr,
-    (void*) c_ptr,
-    (void*) (c_ptr + output_offset),
+    ({{elem_input_type}}*)(a_ptr),
+    ({{elem_input_type}}*)(b_ptr),
+    ({{elem_output_type}}*)(c_ptr),
+    ({{elem_output_type}}*)(c_ptr) + output_offset,
     M * K,
     N * K,
     M * N,
@@ -67,24 +68,33 @@
 
 @registry.reg("cuda.gemm_rrr_permute.config")
 def gemm_rrr_permute_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common_permute.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
-    func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = common_permute.extract_config(fproc, func_attrs)
 
 
 def common_gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
@@ -94,9 +104,10 @@ def common_gen_profiler(
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="N"
     )
-    common_permute.gen_profiler(
+    return common_permute.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         src_template,
         problem_args_template,
@@ -110,10 +121,11 @@ def common_gen_profiler(
 
 
 @registry.reg("cuda.gemm_rrr_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         PROBLEM_ARGS_TEMPLATE,
@@ -128,10 +140,23 @@ def gen_function(
     dim_info_dict,
     problem_args_template=None,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     if problem_args_template is None:
-        problem_args = PROBLEM_ARGS_TEMPLATE.render()
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     else:
-        problem_args = problem_args_template.render()
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
 
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 6568b3c4f..1185ab1ab 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -21,6 +21,7 @@
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...common import tensor_accessor_codegen
 from . import common
 
@@ -77,13 +78,13 @@
 {{indent}} int,
 {{indent}} int64_t*,
 {{indent}} int,
-{{indent}} cutlass::half_t*,
+{{indent}} void*,
 {% for i in range(groups) %}
-{{indent}} cutlass::half_t*,
-{{indent}} cutlass::half_t*,
-{{indent}} cutlass::half_t*,
+{{indent}} void*,
+{{indent}} void*,
+{{indent}} void*,
 {% if has_bias %}
-{{indent}} cutlass::half_t*,
+{{indent}} void*,
 {% endif %}
 {% endfor %}
 {{indent}} uint8_t*,
@@ -104,8 +105,11 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}} device_properties.sharedMemPerMultiprocessor,
-{{indent}} device_properties.multiProcessorCount,
+{% if is_profiler %}
+{{indent}}    gemm_op,
+{% endif %}
+{{indent}} device_properties_.sharedMemPerMultiprocessor,
+{{indent}} device_properties_.multiProcessorCount,
 {{indent}} &{{func_name}}_state,
 {{indent}} {{problem_count}},
 {{indent}} {{device_args}},
@@ -117,7 +121,7 @@
 {{indent}} {{operand[3]}},
 {% endif %}
 {% endfor %}
-{{indent}} global_workspace,
+{{indent}} global_workspace_,
 {% for operand_dim in group_operand_dims %}
 {{indent}} {{operand_dim[0]}},
 {{indent}} {{operand_dim[1]}},
@@ -160,22 +164,25 @@
     }                                                                                 \\
   }
 
-{{instance}}
+{{instances}}
 
 {% endif %}
 
-{{indent}}template<typename GEMMKind>
+{{indent}}template<typename GemmInstance>
 {{indent}}void {{func_name}}_adapter(
+{%if is_profiler %}
+    GemmInstance& gemm_op,
+{% endif %}
     int sharedMemPerMultiprocessor,
     int multiProcessorCount,
     uint8_t* workspace,
     int problem_count,
     cutlass::gemm::GemmCoord* problem_sizes_device,
-    cutlass::half_t **ptr_A,
-    cutlass::half_t **ptr_B,
-    cutlass::half_t **ptr_C,
+    void **ptr_A,
+    void **ptr_B,
+    void **ptr_C,
 {% if has_bias %}
-    cutlass::half_t **ptr_bias,
+    void **ptr_bias,
 {% endif %}
     int64_t* lda,
     int64_t* ldb,
@@ -199,6 +206,9 @@
 ADAPTER_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}_adapter<{{instance}}>(
+{% if is_profiler %}
+    gemm_op,
+{% endif %}
     {{sharedMemPerMultiprocessor}},
     {{multiProcessorCount}},
     {{workspace}},
@@ -225,11 +235,50 @@
 )
 
 
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}
+{{indent}}{{instance_name}} {{gemm_op}};
+{{indent}}const char *gemm_op_name = "{{gemm_op_name}}";
+{{indent}}int ret = {{func_name}}_adapter(
+{{indent}}    {{gemm_op}},
+{{indent}}    gemm_op_name,
+{{indent}}    {{sharedMemPerMultiprocessor}},
+{{indent}}    {{multiProcessorCount}},
+{{indent}}    {{workspace}},
+{{indent}}    {{problem_count}},
+{{indent}}    {{problem_sizes_device}},
+{{indent}}    (void**)({{ptr_A}}),
+{{indent}}    (void**)({{ptr_B}}),
+{{indent}}    (void**)({{ptr_C}}),
+{% if has_bias %}
+{{indent}}    (void**)({{ptr_bias}}),
+{% endif %}
+{{indent}}    {{lda}},
+{{indent}}    {{ldb}},
+{{indent}}    {{ldc}},
+{% if has_bias %}
+{{indent}}    {{ldd}},
+{% endif %}
+{{indent}}    {{instance_name}}::maximum_active_blocks(),
+{{indent}}    stream
+{{indent}}    );
+{{indent}}if (ret != 0)
+{{indent}}  return ret;
+{{indent}}
+{{indent}}}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+
 EXEC_TEMPLATE = jinja2.Template(
     """
 //  TODO: cast to right dtype
-{{indent}}using ElementComputeEpilogue = typename GEMMKind::ElementAccumulator;
-{{indent}}// int smem_size = int(sizeof(typename GEMMKind::GemmKernel::SharedStorage));
+{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator;
+{{indent}}// int smem_size = int(sizeof(typename {{instance}}::GemmKernel::SharedStorage));
 {{indent}}// int occupancy = std::min(2, int(sharedMemPerMultiprocessor / smem_size));
 {{indent}}int threadblock_count = multiProcessorCount * occupancy;
 {{indent}}// Early exit
@@ -240,18 +289,19 @@
 {{indent}}}
 
 
-{{indent}}typename GEMMKind::Arguments arguments{
+{{indent}}typename {{instance}}::Arguments arguments{
 
 {{problem_args}}
 
 {{indent}}};
-{{indent}}GEMMKind gemm_op;
 {% if is_profiler %}
 {{indent}}// Debug BGM: https://www.youtube.com/watch?v=rRwxfYlgG-M
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();
 {{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} gemm_op;
 {% endif %}
 {{indent}}// TODO: cutlass bug here
 {{indent}}// auto status = gemm_op.can_implement(arguments);
@@ -310,21 +360,27 @@
 
 {{func_adapter}}
 
+{% if is_profiler %}
+template <typename GemmInstance>
+void {{function_name}} (
+    GemmInstance& gemm_op,
+{% else %}
 void {{function_name}} (
+{% endif %}
     int sharedMemPerMultiprocessor,
     int multiProcessorCount,
     int64_t* func_state,
     int problem_count,
-    cutlass::half_t* device_args,
+    void* device_args,
     {% for operand in group_operands %}
-    cutlass::half_t* {{operand[0]}},
-    cutlass::half_t* {{operand[1]}},
-    cutlass::half_t* {{operand[2]}},
+    void* {{operand[0]}},
+    void* {{operand[1]}},
+    void* {{operand[2]}},
     {% if has_bias %}
-    cutlass::half_t* {{operand[3]}},
+    void* {{operand[3]}},
     {% endif %}
     {% endfor %}
-    uint8_t* global_workspace,
+    uint8_t* global_workspace_,
 {% for operand_dim in group_operand_dims %}
     int64_t* {{operand_dim[0]}},
     int64_t* {{operand_dim[1]}},
@@ -369,7 +425,7 @@
 
 {% endfor %}
 
-    uint8_t* arg_ptr = (uint8_t*) device_args;
+    void* arg_ptr = device_args;
     // problem_sizes_device: N * GemmCoord -> N * 3 * sizeof(int64_t) -> 32 * N
     // ptrA/B/C/D: N * 8 for each
     // lda/b/c/d: N * 8 for each
@@ -380,14 +436,14 @@
         (cutlass::gemm::GemmCoord*)(arg_ptr + offset);
     offset += 32 * problem_count;
 
-    auto ptr_A = (cutlass::half_t**)(arg_ptr + offset);
+    auto ptr_A = (void**)(arg_ptr + offset);
     offset += 8 * problem_count;
-    auto ptr_B = (cutlass::half_t**)(arg_ptr + offset);
+    auto ptr_B = (void**)(arg_ptr + offset);
     offset += 8 * problem_count;
-    auto ptr_C = (cutlass::half_t**)(arg_ptr + offset);
+    auto ptr_C = (void**)(arg_ptr + offset);
     offset += 8 * problem_count;
     {% if has_bias %}
-    auto ptr_bias = (cutlass::half_t**)(arg_ptr + offset);
+    auto ptr_bias = (void**)(arg_ptr + offset);
     offset += 8 * problem_count;
     {% endif %}
 
@@ -405,11 +461,11 @@
     if (*func_state != GROUP_0_AM) {
         // need update
         std::vector<cutlass::gemm::GemmCoord> problem_sizes;
-        std::vector<cutlass::half_t*> ptr_A_host;
-        std::vector<cutlass::half_t*> ptr_B_host;
-        std::vector<cutlass::half_t*> ptr_C_host;
+        std::vector<void*> ptr_A_host;
+        std::vector<void*> ptr_B_host;
+        std::vector<void*> ptr_C_host;
         {% if has_bias %}
-        std::vector<cutlass::half_t*> ptr_bias_host;
+        std::vector<void*> ptr_bias_host;
         {% endif %}
         std::vector<int64_t> lda_host;
         std::vector<int64_t> ldb_host;
@@ -419,11 +475,11 @@
         {% endif %}
 
         {% for operand in group_operands %}
-        ptr_A_host.push_back({{operand[0]}} + input_a_offset_{{loop.index0}});
-        ptr_B_host.push_back({{operand[1]}});
-        ptr_C_host.push_back({{operand[2]}} + output_offset_{{loop.index0}});
+        ptr_A_host.push_back(({{elem_input_type}}*)({{operand[0]}}) + input_a_offset_{{loop.index0}});
+        ptr_B_host.push_back(({{elem_input_type}}*)({{operand[1]}}));
+        ptr_C_host.push_back(({{elem_output_type}}*)({{operand[2]}}) + output_offset_{{loop.index0}});
         {% if has_bias %}
-        ptr_bias_host.push_back({{operand[3]}});
+        ptr_bias_host.push_back(({{elem_input_type}}*)({{operand[3]}}));
         {% endif %}
         {% endfor %}
 
@@ -514,6 +570,9 @@
 
 TENSOR_DECL_TEMPLATE = jinja2.Template(
     """
+  using ElementOutput = {{elem_output_type}};
+  using ElementInputA = {{elem_input_type}};
+  using ElementInputB = {{elem_input_type}};
   cutlass::DeviceAllocation<ElementInputA> blob_A;
   cutlass::DeviceAllocation<ElementInputB> blob_B;
   cutlass::DeviceAllocation<ElementOutput> blob_C;
@@ -733,6 +792,7 @@ def group_gemm_instance(op_def: str, func_attrs: Dict[str, Any], for_profiler: b
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     shape_template,
     problem_args_template,
     has_bias=False,
@@ -740,9 +800,31 @@ def gen_profiler(
 ):
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
 
-    file_pairs = []
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+    )
+
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common.emit_instance(
             op,
             for_profiler=True,
@@ -750,29 +832,20 @@ def gen_profiler(
             emit_kernel=True,
         )
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_op_{instance_idx}"
         instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-
-        # instance = instance
-        exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, problem_args=problem_args_template.render()
-        )
-        op_func = ADAPTOR_FUNCTION_TEMPLATE.render(
-            instance=instance,
-            is_profiler=True,
-            func_name=name,
-            indent=" ",
-            exec_program=exec_program,
-            has_bias=has_bias,
-        )
-        func_call = ADAPTER_CALL_TEMPLATE.render(
-            func_name=name,
-            instance=name,
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
+            indent="  ",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{instance_name_base}",
             sharedMemPerMultiprocessor="device_properties.sharedMemPerMultiprocessor",
             multiProcessorCount="device_properties.multiProcessorCount",
-            workspace="global_workspace",
+            workspace="global_workspace_",
             problem_count="problem_count",
             problem_sizes_device="problem_sizes_device.get()",
             ptr_A="ptr_A.get()",
@@ -785,16 +858,58 @@ def gen_profiler(
             ldc="ldc.get()",
             ldd="ldd.get()",
         )
-        code = common.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=ARGS_PARSER_TEMPLATE.render(),
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias),
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+    op_func = ADAPTOR_FUNCTION_TEMPLATE.render(
+        instances="\n".join(instances),
+        is_profiler=True,
+        func_name=instance_name_base,
+        indent=" ",
+        exec_program=exec_program,
+        has_bias=has_bias,
+    )
+    func_call = ADAPTER_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=instance_name_base,
+        instance=instance_name_base,
+        sharedMemPerMultiprocessor="sharedMemPerMultiprocessor",
+        multiProcessorCount="multiProcessorCount",
+        workspace="global_workspace_",
+        problem_count="problem_count",
+        problem_sizes_device="problem_sizes_device",
+        ptr_A="ptr_A",
+        ptr_B="ptr_B",
+        ptr_C="ptr_C",
+        has_bias=has_bias,
+        ptr_bias="ptr_bias",
+        lda="lda",
+        ldb="ldb",
+        ldc="ldc",
+        ldd="ldd",
+    )
+    tensor_decl = TENSOR_DECL_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        has_bias=has_bias,
+    )
+    code = common.PROFILER_TEMPLATE.render(
+        is_group_gemm=True,
+        op_func=op_func,
+        has_bias=has_bias,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
+        function_name=f"{instance_name_base}_adapter",
+        func_call=func_call,
+        name=instance_name_base,
+        tensor_decl=tensor_decl,
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_type=elem_type,
+    )
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
-    common.build_profiler(file_pairs)
+    return common.build_profiler(file_pairs)
 
 
 def gen_function(
@@ -804,7 +919,17 @@ def gen_function(
     problem_args_template,
     has_bias=False,
 ):
-    problem_args = problem_args_template.render()
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = problem_args_template.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     op_instance = func_attrs["op_instance"]
@@ -873,7 +998,7 @@ def gen_function(
             instance=fname,
             sharedMemPerMultiprocessor="sharedMemPerMultiprocessor",
             multiProcessorCount="multiProcessorCount",
-            workspace="global_workspace",
+            workspace="global_workspace_",
             problem_count=func_attrs["groups"],
             problem_sizes_device="problem_sizes_device",
             ptr_A="ptr_A",
@@ -890,7 +1015,10 @@ def gen_function(
         exec_paths += exec_inst
 
     exec_program = EXEC_TEMPLATE.render(
-        indent="  ", is_profiler=False, problem_args=problem_args
+        indent="  ",
+        instance="GemmInstance",
+        is_profiler=False,
+        problem_args=problem_args,
     )
     adapter_func = ADAPTOR_FUNCTION_TEMPLATE.render(
         func_name=func_name, exec_program=exec_program, has_bias=has_bias
@@ -914,6 +1042,8 @@ def gen_function(
         instances=instance_decl,
         func_adapter=adapter_func,
         function_name=func_name,
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
         shape_function=shape_func,
         group_operands=group_operands,
         group_operand_dims=group_operand_dims,
@@ -962,7 +1092,7 @@ def gen_function_call(func_attrs, ndims, has_bias=False, indent="  "):
         operand_dims.append("&" + cshape[1]._attrs["name"])
         group_operands.append(operands)
         group_operand_dims.append(operand_dims)
-    device_args = f'reinterpret_cast<cutlass::half_t*>(unique_workspace + {func_attrs["unique_workspace_offset"]})'
+    device_args = f'unique_workspace_ + {func_attrs["unique_workspace_offset"]}'
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         problem_count=func_attrs["groups"],
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
index 2b556fc83..c18ef3e5f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
@@ -28,10 +28,10 @@
         problem_count,
         threadblock_count,
         {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-        ptr_A,
-        ptr_B,
-        ptr_bias,
-        ptr_C,
+        ({{elem_input_type}}**)(ptr_A),
+        ({{elem_input_type}}**)(ptr_B),
+        ({{elem_input_type}}**)(ptr_bias),
+        ({{elem_output_type}}**)ptr_C,
         lda,
         ldb,
         ldc,
@@ -43,10 +43,16 @@
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     shape_template,
 ):
-    group_common.gen_profiler(
-        func_attrs, workdir, shape_template, PROBLEM_ARGS_TEMPLATE, has_bias=True
+    return group_common.gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        shape_template,
+        PROBLEM_ARGS_TEMPLATE,
+        has_bias=True,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
index 354039b40..83f0e2aa0 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
@@ -29,10 +29,10 @@
         problem_count,
         threadblock_count,
         {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-        ptr_A,
-        ptr_B,
-        ptr_C,
-        ptr_C,
+        ({{elem_input_type}}**)(ptr_A),
+        ({{elem_input_type}}**)(ptr_B),
+        ({{elem_output_type}}**)(ptr_C),
+        ({{elem_output_type}}**)(ptr_C),
         lda,
         ldb,
         ldc,
@@ -43,13 +43,13 @@
 
 @registry.reg("cuda.group_gemm_rcr.config")
 def group_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 @registry.reg("cuda.group_gemm_rcr.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    group_common.gen_profiler(
-        func_attrs, workdir, shape_template, PROBLEM_ARGS_TEMPLATE
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
+    return group_common.gen_profiler(
+        func_attrs, workdir, profiler_filename, shape_template, PROBLEM_ARGS_TEMPLATE
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
index c292c3e1d..88c348d2e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
@@ -27,8 +27,10 @@ def group_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.group_gemm_rcr_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    group_common_bias.gen_profiler(func_attrs, workdir, shape_template)
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
+    return group_common_bias.gen_profiler(
+        func_attrs, workdir, profiler_filename, shape_template
+    )
 
 
 @registry.reg("cuda.group_gemm_rcr_bias.gen_function")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
index 9345c26e4..fc43233da 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -27,8 +27,10 @@ def group_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    group_common_bias.gen_profiler(func_attrs, workdir, shape_template)
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
+    return group_common_bias.gen_profiler(
+        func_attrs, workdir, profiler_filename, shape_template
+    )
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_relu.gen_function")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index e247bbe2a..bce93b575 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -27,8 +27,10 @@ def group_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    group_common_bias.gen_profiler(func_attrs, workdir, shape_template)
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
+    return group_common_bias.gen_profiler(
+        func_attrs, workdir, profiler_filename, shape_template
+    )
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_function")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
index 580a3b005..7d1741c52 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
@@ -43,32 +43,45 @@ def _get_problem_info(**kwargs):
 
 @registry.reg("cuda.perm021fc_ccr.config")
 def gemm_ccr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.perm021fc_ccr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["B", "K", "M"], b_dims=["1", "N", "K"], c_dims=["B", "M", "N"]
     )
 
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -82,8 +95,12 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
index b4f320de9..69712f30f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
@@ -48,22 +48,27 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.perm021fc_ccr_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["B", "K", "M"], b_dims=["1", "N", "K"], c_dims=["B", "M", "N"]
     )
 
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         problem_args,
         args_parser,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -73,8 +78,12 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
index 5631bf3ca..76ac6533b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -18,11 +18,10 @@
 """
 from ... import registry
 
-from ..gemm_universal import common
-
 from . import (
     bmm_common,
     bmm_permute_common,
+    common,
     common_bias,
     common_permute,
     perm021fc_ccr_bias,
@@ -85,24 +84,34 @@ class Tensor3DPermute021BMM {
 
 @registry.reg("cuda.perm021fc_ccr_bias_permute.config")
 def config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common_permute.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
-    func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = common_permute.extract_config(fproc, func_attrs)
 
 
 @registry.reg("cuda.perm021fc_ccr_bias_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    return perm021fc_ccr_bias.gen_profiler(func_attrs, workdir, dim_info_dict)
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    return perm021fc_ccr_bias.gen_profiler(
+        func_attrs, workdir, profiler_filename, dim_info_dict
+    )
 
 
 @registry.reg("cuda.perm021fc_ccr_bias_permute.gen_function")
@@ -112,9 +121,11 @@ def gen_function(
     dim_info_dict,
 ):
     mm_info = perm021fc_ccr_bias._get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1)
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
     )
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
 
     return bmm_permute_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
index 35a9ef77d..3d08f0291 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
@@ -44,33 +44,45 @@ def _get_problem_info(**kwargs):
 
 @registry.reg("cuda.perm021fc_crc.config")
 def gemm_crc_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.ColumnMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.perm021fc_crc.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["1", "K", "N"], b_dims=["B", "K", "M"], c_dims=["B", "M", "N"]
     )
 
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1), beta_value=0)
+        mm_info=_get_problem_info(
+            alpha_value=func_attrs.get("alpha", 1),
+            beta_value=0,
+        ),
     )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -85,7 +97,10 @@ def gen_function(
     dim_info_dict,
 ):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1), beta_value=0)
+        mm_info=_get_problem_info(
+            alpha_value=func_attrs.get("alpha", 1),
+            beta_value=0,
+        ),
     )
 
     return bmm_common.gen_function(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
index 187a0c6c1..3e6497c76 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
@@ -49,23 +49,26 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.perm021fc_crc_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["1", "K", "N"], b_dims=["B", "K", "M"], c_dims=["B", "M", "N"]
     )
 
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+        mm_info=_get_problem_info(
+            alpha_value=func_attrs.get("alpha", 1),
+        ),
     )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         problem_args,
         args_parser,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -76,7 +79,9 @@ def gen_function(
     dim_info_dict,
 ):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1))
+        mm_info=_get_problem_info(
+            alpha_value=func_attrs.get("alpha", 1),
+        ),
     )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
index fe0ffe9cd..c414816d8 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
@@ -17,6 +17,7 @@
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col))
 """
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import bmm_common, common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
@@ -43,12 +44,17 @@ def _get_default_problem_info(**kwargs):
 
 # Currently only has output Tensor Accessor support.
 def _get_strided_problem_info(func_attrs):
+    backend_spec = CUDASpec()
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     return bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
         a_ptr="a_ptr",
         b_ptr="b_ptr",
-        bias_ptr="(c_ptr + output_offset)",
-        c_ptr="(c_ptr + output_offset)",
+        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="K",
         b_batch_stride="N * K",
         bias_batch_stride="output_batch_stride",
@@ -94,32 +100,45 @@ def get_output_addr_calculator(func_attrs):
 
 @registry.reg("cuda.perm102_bmm_rcr.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.perm102_bmm_rcr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["M", "B", "K"], b_dims=["B", "N", "K"], c_dims=["M", "B", "N"]
     )
 
-    mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_default_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -136,7 +155,9 @@ def gen_function(
     bmm_problem_info = _get_strided_problem_info(func_attrs)
 
     # broadcasting is not supported
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
index 8c34ecd48..92afe0ca5 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
@@ -17,6 +17,7 @@
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col)) + bias[n].
 """
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import bmm_common, common, common_bias, perm102_bmm_rcr
 from .perm102_bmm_rcr import get_output_addr_calculator
 
@@ -45,13 +46,18 @@ def _get_default_problem_info(**kwargs):
 
 # Currently only has output Tensor Accessor support.
 def _get_strided_problem_info(func_attrs):
+    backend_spec = CUDASpec()
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     return bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
         beta_value=1,
         a_ptr="(a_ptr)",
         b_ptr="(b_ptr)",
         bias_ptr="(bias_ptr)",
-        c_ptr="(c_ptr + output_offset)",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="K",
         b_batch_stride="N * K",
         bias_batch_stride="N",
@@ -69,22 +75,27 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.perm102_bmm_rcr_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["M", "B", "K"], b_dims=["B", "N", "K"], c_dims=["M", "B", "N"]
     )
 
-    mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_default_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         problem_args,
         args_parser,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -97,7 +108,9 @@ def gen_function(
     bmm_problem_info = _get_strided_problem_info(func_attrs)
 
     # broadcasting is not supported
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
index e4a3d7d1b..2f8d35522 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
@@ -17,6 +17,7 @@
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row))
 """
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import bmm_common, common
 from .perm102_bmm_rcr import get_output_addr_calculator
 
@@ -44,12 +45,17 @@ def _get_default_problem_info(**kwargs):
 
 # Currently only has output Tensor Accessor support.
 def _get_strided_problem_info(func_attrs):
+    backend_spec = CUDASpec()
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     return bmm_common.Bmm_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
         a_ptr="(a_ptr)",
         b_ptr="(b_ptr)",
-        bias_ptr="(c_ptr + output_offset)",
-        c_ptr="(c_ptr + output_offset)",
+        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
+        alpha_value=func_attrs.get("alpha", 1),
         a_batch_stride="K",
         b_batch_stride="N * K",
         bias_batch_stride="output_batch_stride",
@@ -63,32 +69,45 @@ def _get_strided_problem_info(func_attrs):
 
 @registry.reg("cuda.perm102_bmm_rrr.config")
 def gemm_rrr_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
+    def fproc(op):
         import cutlass_lib
 
-        return common.default_fproc_f16(
+        from ...backend_spec import CUDASpec
+
+        backend_spec = CUDASpec()
+        elem_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+
+        return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
+            elem_type=elem_type,
             epiligue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc_f16)
+    func_attrs["op_instance"] = common.extract_config(fproc)
 
 
 @registry.reg("cuda.perm102_bmm_rrr.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["M", "B", "K"], b_dims=["B", "K", "N"], c_dims=["M", "B", "N"]
     )
 
-    mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_default_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common.SRC_TEMPLATE,
         problem_args,
@@ -105,7 +124,9 @@ def gen_function(
     bmm_problem_info = _get_strided_problem_info(func_attrs)
 
     # broadcasting is not supported
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     return bmm_common.gen_function(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
index f7435c071..e065d70c1 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
@@ -17,6 +17,7 @@
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row)) + bias[n]
 """
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import bmm_common, common, common_bias, perm102_bmm_rrr
 from .perm102_bmm_rcr import get_output_addr_calculator
 
@@ -45,13 +46,17 @@ def _get_default_problem_info(**kwargs):
 
 # Currently only has output Tensor Accessor support.
 def _get_strided_problem_info(func_attrs):
+    backend_spec = CUDASpec()
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
     return bmm_common.Bmm_problem_info(
         alpha_value=func_attrs.get("alpha", 1),
         beta_value=1,
         a_ptr="(a_ptr)",
         b_ptr="(b_ptr)",
         bias_ptr="(bias_ptr)",
-        c_ptr="(c_ptr + output_offset)",
+        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
         a_batch_stride="K",
         b_batch_stride="N * K",
         bias_batch_stride="N",
@@ -69,22 +74,27 @@ def gemm_rrr_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("cuda.perm102_bmm_rrr_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
         a_dims=["M", "B", "K"], b_dims=["B", "K", "N"], c_dims=["M", "B", "N"]
     )
 
-    mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1))
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info)
+    mm_info = _get_default_problem_info(
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=mm_info,
+    )
 
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
+        profiler_filename,
         dim_info_dict,
         common_bias.SRC_TEMPLATE,
         problem_args,
         args_parser,
-        bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -97,7 +107,9 @@ def gen_function(
     bmm_problem_info = _get_strided_problem_info(func_attrs)
 
     # broadcasting is not supported
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info)
+    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
 
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 5b075783c..d1a48f28b 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -21,21 +21,19 @@
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...target import Target
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
-
 FUNC_SIGNATURE = jinja2.Template(
     """
-cudaError_t {{func_name}}(half* output,
-                          half* input,
-                          half* gamma,
-                          half* beta,
+cudaError_t {{func_name}}(void* output,
+                          void* input,
+                          void* gamma,
+                          void* beta,
                           int N,
                           const float eps,
                           const int max_smem_size,
+                          void* workspace,
                           cudaStream_t stream)
     """
 )
@@ -51,7 +49,8 @@
 {{indent}}{
 {{indent}}  {{func_name}}(
 {{indent}}     {{output}}, {{input}}, {{gamma}}, {{beta}}, {{N}},
-{{indent}}     {{eps}}, max_smem_size, stream /* default stream */
+{{indent}}     {{eps}}, max_smem_size_, global_workspace_,
+{{indent}}  stream /* default stream */
 {{indent}}  );
 {{indent}}}
     """
@@ -69,26 +68,32 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "logging.h"
+#include <math_constants.h>
+#include <assert.h>
 
 
 {{gamma_beta_const_defs}}
 
 namespace {
 
+{{helper_libs}}
+
 {{custom_libs}}
 
 }  // namespace
 
 {{func_signature}}
 {
-    return invokeGroupNorm<{{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
-            output,
-            input,
-            gamma,
-            beta,
+
+    return invokeGroupNorm_{{elem_input_type}}<{{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
+            static_cast<{{elem_input_type}}*>(output),
+            static_cast<{{elem_input_type}}*>(input),
+            static_cast<{{elem_input_type}}*>(gamma),
+            static_cast<{{elem_input_type}}*>(beta),
             N,
             eps,
             max_smem_size,
+            workspace,
             stream);
 }
     """
@@ -113,15 +118,15 @@ def get_input_names(func_attrs: Dict[str, Any]) -> List[str]:
         beta = inputs[idx]
         idx += 1
 
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=x._attrs["name"])
+    input_name = x._attrs["name"]
     if gamma is None:
         gamma_name = "nullptr"
     else:
-        gamma_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=gamma._attrs["name"])
+        gamma_name = gamma._attrs["name"]
     if beta is None:
         beta_name = "nullptr"
     else:
-        beta_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=beta._attrs["name"])
+        beta_name = beta._attrs["name"]
 
     return (input_name, gamma_name, beta_name)
 
@@ -135,11 +140,19 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     C = input_shape[3].value()
     G = func_attrs["num_groups"]
 
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
+        helper_libs=Target.current().get_custom_libs(
+            os.path.dirname(__file__), "layer_norm.cuh"
+        ),
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "groupnorm_kernel.cuh"
         ),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
         FuseSwish="true" if use_swish else "false",
         H=H,
         W=W,
@@ -161,9 +174,7 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
         func_attrs["inputs"]
     ), "expected at least 1 inputs but got {}".format(len(func_attrs["inputs"]))
 
-    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = get_input_names(func_attrs)
     input_shape = func_attrs["inputs"][0]._attrs["shape"]
     eps = func_attrs["eps"]
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index 6a235589c..2a22ed903 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -33,10 +33,30 @@
 #define GROUP_NORM_CUDA_CHECK_LAUNCH() GROUP_NORM_CUDA_CHECK(cudaGetLastError())
 #endif
 
+__device__ half fast_tanh(half x) {
+  return half(cutlass::fast_tanh(float(x)));
+}
+
 __inline__ __device__ float sigmoid(float val) {
   return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f;
 }
 
+__device__ half constant_half() {
+  uint16_t bits = 0x3800u;
+  return reinterpret_cast<half const&>(bits);
+}
+
+__device__ half one() {
+  uint16_t bits = 0x3c00u;
+  return reinterpret_cast<half const&>(bits);
+}
+
+__inline__ __device__ half hsigmoid(half a) {
+  half half_val = constant_half();
+  half one_val = one();
+  return __hmul((__hadd(fast_tanh(__hmul(a, half_val)), one_val)), half_val);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // The Groupnorm implementation below is based on OneFlow's Layernorm
 // implementation at:
@@ -447,7 +467,7 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
 }
 
 template <bool FuseSwish, int H, int W, int C, int num_groups>
-cudaError_t invokeWelfordGroupNorm(
+cudaError_t invokeWelfordGroupNorm_half(
     half* output,
     half* input,
     half* gamma,
@@ -512,8 +532,322 @@ cudaError_t invokeWelfordGroupNorm(
   return cudaSuccess;
 }
 
+template <typename SRC, typename DST, bool affine, bool FuseSwish>
+struct AffineStore {
+  AffineStore(
+      DST* y,
+      int64_t row_size,
+      int64_t channel_size,
+      int64_t spatial_size,
+      const DST* gamma,
+      const DST* beta)
+      : y(y),
+        row_size(row_size),
+        channel_size(channel_size),
+        spatial_size(spatial_size),
+        gamma(gamma),
+        beta(beta) {}
+
+  template <int PackSize>
+  __device__ void store(const SRC* src, int64_t row, int64_t col) {
+    layer_norm::Pack<DST, PackSize> y_pack;
+    const int64_t offset = row * row_size + col;
+    const int64_t packed_offset = offset / PackSize;
+    const int64_t gamma_beta_offset = (offset / spatial_size) % channel_size;
+    DST gamma_val = 1.0;
+    DST beta_val = 0.0;
+    if (affine) {
+      gamma_val = gamma[gamma_beta_offset];
+      beta_val = beta[gamma_beta_offset];
+    }
+
+#pragma unroll
+    for (int i = 0; i < PackSize; ++i) {
+      DST normalized_i = static_cast<DST>(src[i]);
+      if (affine) {
+        y_pack.elem[i] = normalized_i * gamma_val + beta_val;
+      } else {
+        // Direct Store.
+        y_pack.elem[i] = normalized_i;
+      }
+      if (FuseSwish) {
+        y_pack.elem[i] = y_pack.elem[i] * hsigmoid(y_pack.elem[i]);
+      }
+    }
+    *(reinterpret_cast<layer_norm::PackType<DST, PackSize>*>(y) +
+      packed_offset) = y_pack.storage;
+  }
+  bool CanPackAs(size_t pack_size) {
+    return (spatial_size % pack_size) == 0;
+  }
+  DST* y;
+  int64_t row_size;
+  int64_t channel_size;
+  int64_t spatial_size;
+  const DST* gamma;
+  const DST* beta;
+};
+
+template <typename SRC, typename DST, bool affine>
+struct ScaleLoad {
+  ScaleLoad(
+      const SRC* src,
+      const SRC* gamma,
+      int64_t row_size,
+      int64_t channel_size,
+      int64_t spatial_size)
+      : src(src),
+        gamma(gamma),
+        row_size(row_size),
+        channel_size(channel_size),
+        spatial_size(spatial_size) {}
+  template <int PackSize>
+  __device__ void load(DST* dst, int64_t row, int64_t col) const {
+    layer_norm::Pack<SRC, PackSize> src_pack;
+    layer_norm::Pack<SRC, PackSize> gamma_pack;
+
+    const int64_t offset = row * row_size + col;
+    const int64_t packed_offset = offset / PackSize;
+    const int64_t gamma_offset = (offset / spatial_size) % channel_size;
+
+    src_pack.storage =
+        *(reinterpret_cast<const layer_norm::PackType<SRC, PackSize>*>(src) +
+          packed_offset);
+    SRC gamma_val = static_cast<SRC>(1.0);
+    if (affine) {
+      gamma_val = gamma[gamma_offset];
+    }
+#pragma unroll
+    for (int i = 0; i < PackSize; ++i) {
+      dst[i] = static_cast<DST>(src_pack.elem[i] * gamma_val);
+    }
+  }
+  bool CanPackAs(size_t pack_size) {
+    return (spatial_size % pack_size) == 0;
+  }
+  const SRC* src;
+  const SRC* gamma;
+  int64_t row_size;
+  int64_t channel_size;
+  int64_t spatial_size;
+};
+
+template <typename SRC, typename DST, bool affine, bool FuseSwish>
+struct ChannelsLastStore {
+  ChannelsLastStore(
+      DST* y,
+      const DST* gamma,
+      const DST* beta,
+      int64_t spatial_size,
+      int64_t channel_size,
+      int64_t num_groups)
+      : y(y),
+        gamma(gamma),
+        beta(beta),
+        spatial_size(spatial_size),
+        c0(num_groups),
+        c1(channel_size / num_groups) {}
+
+  template <int PackSize>
+  __device__ void store(const SRC* src, int32_t row, int32_t col) {
+    layer_norm::Pack<DST, PackSize> y_pack;
+    layer_norm::Pack<DST, PackSize> gamma_pack;
+    layer_norm::Pack<DST, PackSize> beta_pack;
+    int32_t spatial_idx;
+    int32_t c1_idx;
+    c1(spatial_idx, c1_idx, col);
+    int32_t batch_idx;
+    int32_t c0_idx;
+    c0(batch_idx, c0_idx, row);
+    const int32_t y_offset =
+        (batch_idx * c0.divisor * c1.divisor * spatial_size +
+         spatial_idx * c0.divisor * c1.divisor + c0_idx * c1.divisor + c1_idx) /
+        PackSize;
+    const int32_t gamma_beta_offset = (c0_idx * c1.divisor + c1_idx) / PackSize;
+    if (affine) {
+      gamma_pack.storage = *(
+          reinterpret_cast<const layer_norm::PackType<DST, PackSize>*>(gamma) +
+          gamma_beta_offset);
+      beta_pack.storage =
+          *(reinterpret_cast<const layer_norm::PackType<DST, PackSize>*>(beta) +
+            gamma_beta_offset);
+    }
+
+#pragma unroll
+    for (int i = 0; i < PackSize; ++i) {
+      DST normalized_i = static_cast<DST>(src[i]);
+      if (affine) {
+        y_pack.elem[i] = normalized_i * gamma_pack.elem[i] + beta_pack.elem[i];
+      } else {
+        // Direct Store.
+        y_pack.elem[i] = normalized_i;
+      }
+      if (FuseSwish) {
+        y_pack.elem[i] = y_pack.elem[i] * hsigmoid(y_pack.elem[i]);
+      }
+    }
+    *(reinterpret_cast<layer_norm::PackType<DST, PackSize>*>(y) + y_offset) =
+        y_pack.storage;
+  }
+  bool CanPackAs(size_t pack_size) {
+    return (c1.divisor % pack_size) == 0;
+  }
+  DST* y;
+  const DST* gamma;
+  const DST* beta;
+  int32_t spatial_size;
+  cutlass::FastDivmod c0;
+  cutlass::FastDivmod c1;
+};
+
+template <typename SRC, typename DST>
+struct ChannelsLastLoad {
+  ChannelsLastLoad(
+      const SRC* src,
+      int64_t spatial_size,
+      int64_t channel_size,
+      int64_t num_groups)
+      : src(src),
+        spatial_size(spatial_size),
+        c0(num_groups),
+        c1(channel_size / num_groups) {}
+  template <int N>
+  __device__ void load(DST* dst, int32_t row, int32_t col) const {
+    int32_t spatial_idx;
+    int32_t c1_idx;
+    c1(spatial_idx, c1_idx, col);
+    int32_t batch_idx;
+    int32_t c0_idx;
+    c0(batch_idx, c0_idx, row);
+    layer_norm::Pack<SRC, N> pack;
+    const int32_t offset =
+        (batch_idx * c0.divisor * c1.divisor * spatial_size +
+         spatial_idx * c0.divisor * c1.divisor + c0_idx * c1.divisor + c1_idx) /
+        N;
+
+    pack.storage =
+        *(reinterpret_cast<const layer_norm::PackType<SRC, N>*>(src) + offset);
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      dst[i] = static_cast<DST>(pack.elem[i]);
+    }
+  }
+  bool CanPackAs(size_t pack_size) {
+    return (c1.divisor % pack_size) == 0;
+  }
+  const SRC* src;
+  int32_t spatial_size;
+  cutlass::FastDivmod c0;
+  cutlass::FastDivmod c1;
+};
+
+template <typename T, typename ComputeType, bool affine, bool FuseSwish>
+void GroupNormForwardGpu(
+    cudaStream_t stream,
+    const int64_t num_instances,
+    const int64_t norm_size,
+    const int64_t channel_size,
+    const int64_t spatial_size,
+    const double epsilon,
+    const T* x_ptr,
+    const T* gamma_ptr,
+    const T* beta_ptr,
+    T* y_ptr,
+    ComputeType* mean,
+    ComputeType* inv_variance,
+    bool channels_first) {
+  // using ComputeType = typename layer_norm::DefaultComputeType<T>::type;
+  if (channels_first) {
+    layer_norm::DirectLoad<T, ComputeType> load(x_ptr, norm_size);
+    AffineStore<ComputeType, T, affine, FuseSwish> store(
+        y_ptr, norm_size, channel_size, spatial_size, gamma_ptr, beta_ptr);
+
+    layer_norm::DispatchLayerNorm<decltype(load), decltype(store), ComputeType>(
+        stream,
+        load,
+        store,
+        num_instances,
+        norm_size,
+        epsilon,
+        mean,
+        inv_variance);
+  } else {
+    ChannelsLastLoad<T, ComputeType> load(
+        x_ptr,
+        spatial_size,
+        channel_size,
+        channel_size / (norm_size / spatial_size));
+    ChannelsLastStore<ComputeType, T, affine, FuseSwish> store(
+        y_ptr,
+        gamma_ptr,
+        beta_ptr,
+        spatial_size,
+        channel_size,
+        channel_size / (norm_size / spatial_size));
+
+    layer_norm::DispatchLayerNorm<decltype(load), decltype(store), ComputeType>(
+        stream,
+        load,
+        store,
+        num_instances,
+        norm_size,
+        epsilon,
+        mean,
+        inv_variance);
+  }
+}
+
+template <typename T, typename T2, bool FuseSwish>
+void DispatchGroupNormForwardGpu(
+    cudaStream_t stream,
+    const int64_t num_instances,
+    const int64_t norm_size,
+    const int64_t channel_size,
+    const int64_t spatial_size,
+    const double epsilon,
+    const T* x_ptr,
+    const T* gamma_ptr,
+    const T* beta_ptr,
+    T* y_ptr,
+    T2* mean,
+    T2* inv_variance,
+    bool channels_first) {
+  using ComputeType = typename layer_norm::DefaultComputeType<T>::type;
+  if (gamma_ptr != nullptr && beta_ptr != nullptr) {
+    GroupNormForwardGpu<T, ComputeType, true, FuseSwish>(
+        stream,
+        num_instances,
+        norm_size,
+        channel_size,
+        spatial_size,
+        epsilon,
+        x_ptr,
+        gamma_ptr,
+        beta_ptr,
+        y_ptr,
+        mean,
+        inv_variance,
+        channels_first);
+  } else {
+    GroupNormForwardGpu<T, ComputeType, false, FuseSwish>(
+        stream,
+        num_instances,
+        norm_size,
+        channel_size,
+        spatial_size,
+        epsilon,
+        x_ptr,
+        gamma_ptr,
+        beta_ptr,
+        y_ptr,
+        mean,
+        inv_variance,
+        channels_first);
+  }
+}
+
 template <bool FuseSwish, int H, int W, int C, int G>
-cudaError_t invokeGroupNorm(
+cudaError_t invokeGroupNorm_half(
     half* output,
     half* input,
     half* gamma,
@@ -521,11 +855,19 @@ cudaError_t invokeGroupNorm(
     int N,
     const float eps,
     const int max_smem_size,
+    void* workspace,
     cudaStream_t stream) {
   constexpr auto C_G = C / G;
   constexpr auto C_G_2 = C_G / 2;
   constexpr int ILP = 8;
 
+  const int64_t num_instances = N * G;
+  const int64_t norm_size = H * W * C / G;
+  const int64_t spatial_size = H * W;
+  const int64_t channel_size = C;
+  const double epsilon = eps;
+  bool channels_first = false;
+
   // Use a little big more shared_memory to reduce occupancy and boost perf.
   constexpr int MEM_BANK_CONFLICT = 1;
 
@@ -543,14 +885,42 @@ cudaError_t invokeGroupNorm(
         smem));
 
     constexpr int num_threads = std::min(1024, H / ILP * W * C_G_2);
-
-    dim3 block(num_threads);
-    group_norm_smem<FuseSwish, H, W, C, C_G, ILP, BANK_CONFLICT, num_threads>
-        <<<dim3(G, N), block, smem, stream>>>(
-            input, output, gamma, beta, N, eps);
+    if constexpr (num_threads > 0) {
+      dim3 block(num_threads);
+      group_norm_smem<FuseSwish, H, W, C, C_G, ILP, BANK_CONFLICT, num_threads>
+          <<<dim3(G, N), block, smem, stream>>>(
+              input, output, gamma, beta, N, eps);
+    } else {
+      DispatchGroupNormForwardGpu<half, float, FuseSwish>(
+          stream,
+          num_instances,
+          norm_size,
+          channel_size,
+          spatial_size,
+          epsilon,
+          static_cast<half*>(input),
+          static_cast<half*>(gamma),
+          static_cast<half*>(beta),
+          static_cast<half*>(output),
+          reinterpret_cast<float*>(workspace),
+          reinterpret_cast<float*>(workspace + sizeof(float) * num_instances),
+          channels_first);
+    }
   } else {
-    return invokeWelfordGroupNorm<FuseSwish, H, W, C, G>(
-        output, input, gamma, beta, N, eps, stream);
+    DispatchGroupNormForwardGpu<half, float, FuseSwish>(
+        stream,
+        num_instances,
+        norm_size,
+        channel_size,
+        spatial_size,
+        epsilon,
+        static_cast<half*>(input),
+        static_cast<half*>(gamma),
+        static_cast<half*>(beta),
+        static_cast<half*>(output),
+        reinterpret_cast<float*>(workspace),
+        reinterpret_cast<float*>(workspace + sizeof(float) * num_instances),
+        channels_first);
   }
 
   // GROUP_NORM_CUDA_CHECK_LAUNCH();
diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
new file mode 100644
index 000000000..baa1981b3
--- /dev/null
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -0,0 +1,2404 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+namespace layer_norm {
+
+constexpr int kWarpSize = 32;
+
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return max(a, b);
+  }
+};
+
+template <
+    template <typename>
+    class ReductionOp,
+    typename T,
+    int thread_group_width = kWarpSize>
+__inline__ __device__ T WarpAllReduce(T val) {
+  for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    val = ReductionOp<T>()(
+        val, __shfl_xor_sync(0xffffffff, val, mask, thread_group_width));
+  }
+  return val;
+}
+
+template <template <typename> class ReductionOp, typename T, int block_size>
+__inline__ __device__ T BlockAllReduce(T val) {
+  typedef cub::BlockReduce<T, block_size> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T result_broadcast;
+  T result = BlockReduce(temp_storage).Reduce(val, ReductionOp<T>());
+  if (threadIdx.x == 0) {
+    result_broadcast = result;
+  }
+  __syncthreads();
+  return result_broadcast;
+}
+
+template <typename T>
+__inline__ __device__ T Div(T a, T b);
+
+template <>
+__inline__ __device__ float Div<float>(float a, float b) {
+#ifdef OF_LAYER_NORM_USE_FAST_MATH
+  return __fdividef(a, b);
+#else
+  return a / b;
+#endif
+}
+
+template <>
+__inline__ __device__ double Div<double>(double a, double b) {
+  return a / b;
+}
+
+template <typename T>
+__inline__ __device__ T Rsqrt(T x);
+
+template <>
+__inline__ __device__ float Rsqrt<float>(float x) {
+#ifdef OF_LAYER_NORM_USE_FAST_MATH
+  return __frsqrt_rn(x);
+#else
+  return rsqrt(x);
+#endif
+}
+
+template <>
+__inline__ __device__ double Rsqrt<double>(double x) {
+  return rsqrt(x);
+}
+
+template <class Func>
+inline cudaError_t GetNumBlocks(
+    Func func,
+    int64_t block_size,
+    size_t dynamic_smem_size,
+    int64_t max_blocks,
+    int64_t waves,
+    int* num_blocks) {
+  int dev;
+  {
+    cudaError_t err = cudaGetDevice(&dev);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  int sm_count;
+  {
+    cudaError_t err =
+        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  int max_active_blocks;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks, func, block_size, dynamic_smem_size);
+  }
+  *num_blocks = std::max<int>(
+      1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
+  return cudaSuccess;
+}
+
+template <typename T>
+struct DefaultComputeType {
+  using type = T;
+};
+
+template <>
+struct DefaultComputeType<half> {
+  using type = float;
+};
+
+// #if CUDA_VERSION >= 11000
+// template<>
+// struct DefaultComputeType<nv_bfloat16> {
+//   using type = float;
+// };
+// #endif  // CUDA_VERSION >= 11000
+
+template <typename T>
+class HasCanPackAs {
+  typedef char one;
+  struct two {
+    char x[2];
+  };
+
+  template <typename C>
+  static one test(decltype(&C::CanPackAs));
+  template <typename C>
+  static two test(...);
+
+ public:
+  enum { value = sizeof(test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+typename std::enable_if<HasCanPackAs<T>::value == true, bool>::type CanPackAs(
+    T t,
+    size_t pack_size) {
+  return t.CanPackAs(pack_size);
+}
+
+template <typename T>
+typename std::enable_if<HasCanPackAs<T>::value == false, bool>::type CanPackAs(
+    T t,
+    size_t pack_size) {
+  return true;
+}
+
+template <typename T, int N>
+struct GetPackType {
+  using type =
+      typename std::aligned_storage<N * sizeof(T), N * sizeof(T)>::type;
+};
+
+template <typename T, int N>
+using PackType = typename GetPackType<T, N>::type;
+
+template <typename T, int N>
+union Pack {
+  static_assert(sizeof(PackType<T, N>) == sizeof(T) * N, "");
+  __device__ Pack() {
+    // do nothing
+  }
+  PackType<T, N> storage;
+  T elem[N];
+};
+
+template <typename SRC, typename DST>
+struct DirectLoad {
+  DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {}
+  template <int N>
+  __device__ void load(DST* dst, int64_t row, int64_t col) const {
+    Pack<SRC, N> pack;
+    const int64_t offset = (row * row_size + col) / N;
+    pack.storage = *(reinterpret_cast<const PackType<SRC, N>*>(src) + offset);
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      dst[i] = static_cast<DST>(pack.elem[i]);
+    }
+  }
+  const SRC* src;
+  int64_t row_size;
+};
+
+template <typename SRC, typename DST>
+struct DirectStore {
+  DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {}
+  template <int N>
+  __device__ void store(const SRC* src, int64_t row, int64_t col) {
+    Pack<DST, N> pack;
+    const int64_t offset = (row * row_size + col) / N;
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      pack.elem[i] = static_cast<DST>(src[i]);
+    }
+    *(reinterpret_cast<PackType<DST, N>*>(dst) + offset) = pack.storage;
+  }
+  DST* dst;
+  int64_t row_size;
+};
+
+template <typename T>
+inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) {
+  // Use Welford Online algorithem to compute mean and variance
+  // For more details you can refer to:
+  // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+  *count += 1;
+  T delta1 = val - *mean;
+  *mean += Div(delta1, *count);
+  T delta2 = val - *mean;
+  *m2 += delta1 * delta2;
+}
+
+template <typename T>
+inline __device__ void WelfordCombine(
+    T b_mean,
+    T b_m2,
+    T b_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  if (b_count == 0) {
+    return;
+  }
+  T new_count = *count + b_count;
+  T nb_over_n = Div(b_count, new_count);
+  T delta = b_mean - *mean;
+  *mean += delta * nb_over_n;
+  *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
+  *count = new_count;
+}
+
+template <typename T, int thread_group_width = kWarpSize>
+__inline__ __device__ void WelfordWarpReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  *mean = thread_mean;
+  *m2 = thread_m2;
+  *count = thread_count;
+  for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
+    T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
+    T b_count = __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+    WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
+  }
+}
+
+template <typename T, int thread_group_width = kWarpSize>
+__inline__ __device__ void WelfordWarpAllReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  WelfordWarpReduce<T, thread_group_width>(
+      thread_mean, thread_m2, thread_count, mean, m2, count);
+  *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width);
+  *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width);
+  *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width);
+}
+
+template <typename T>
+__inline__ __device__ void WelfordBlockAllReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* result_mean,
+    T* result_m2,
+    T* result_count) {
+  __shared__ T mean_shared[kWarpSize];
+  __shared__ T m2_shared[kWarpSize];
+  __shared__ T count_shared[kWarpSize];
+  __shared__ T mean_result_broadcast;
+  __shared__ T m2_result_broadcast;
+  __shared__ T count_result_broadcast;
+  const int lid = threadIdx.x % kWarpSize;
+  const int wid = threadIdx.x / kWarpSize;
+  T warp_mean = 0;
+  T warp_m2 = 0;
+  T warp_count = 0;
+  WelfordWarpReduce(
+      thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
+  __syncthreads();
+  if (lid == 0) {
+    mean_shared[wid] = warp_mean;
+    m2_shared[wid] = warp_m2;
+    count_shared[wid] = warp_count;
+  }
+  __syncthreads();
+  if (wid == 0) {
+    if (threadIdx.x < blockDim.x / kWarpSize) {
+      warp_mean = mean_shared[lid];
+      warp_m2 = m2_shared[lid];
+      warp_count = count_shared[lid];
+    } else {
+      warp_mean = static_cast<T>(0);
+      warp_m2 = static_cast<T>(0);
+      warp_count = static_cast<T>(0);
+    }
+    __syncwarp();
+    T block_mean = 0;
+    T block_m2 = 0;
+    T block_count = 0;
+    WelfordWarpReduce(
+        warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
+    if (lid == 0) {
+      mean_result_broadcast = block_mean;
+      m2_result_broadcast = block_m2;
+      count_result_broadcast = block_count;
+    }
+  }
+  __syncthreads();
+  *result_mean = mean_result_broadcast;
+  *result_m2 = m2_result_broadcast;
+  *result_count = count_result_broadcast;
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access,
+    bool padding>
+__global__ void LayerNormWarpImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  static_assert(max_cols_per_thread % pack_size == 0, "");
+  static_assert(min_cols_per_thread % pack_size == 0, "");
+  static_assert(thread_group_width <= kWarpSize, "");
+  static_assert(kWarpSize % thread_group_width == 0, "");
+  constexpr int max_num_packs = max_cols_per_thread / pack_size;
+  constexpr int min_num_packs = min_cols_per_thread / pack_size;
+  assert(cols <= max_cols_per_thread * thread_group_width);
+  ComputeType buf[rows_per_access][max_cols_per_thread];
+  const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t num_global_thread_group = gridDim.x * blockDim.y;
+  const int64_t lane_id = threadIdx.x;
+  const int64_t step = num_global_thread_group * rows_per_access;
+  for (int64_t row = global_thread_group_id * rows_per_access; row < rows;
+       row += step) {
+    ComputeType thread_mean[rows_per_access];
+    ComputeType thread_m2[rows_per_access];
+    ComputeType thread_count[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      thread_mean[row_id] = 0;
+      thread_m2[row_id] = 0;
+      thread_count[row_id] = 0;
+      ComputeType* row_buf = buf[row_id];
+#pragma unroll
+      for (int pack_id = 0; pack_id < min_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        load.template load<pack_size>(row_buf + pack_offset, row + row_id, col);
+#pragma unroll
+        for (int i = 0; i < pack_size; ++i) {
+          WelfordCombine(
+              row_buf[pack_offset + i],
+              thread_mean + row_id,
+              thread_m2 + row_id,
+              thread_count + row_id);
+        }
+      }
+      for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        if (!padding || col < cols) {
+          load.template load<pack_size>(
+              row_buf + pack_offset, row + row_id, col);
+#pragma unroll
+          for (int i = 0; i < pack_size; ++i) {
+            WelfordCombine(
+                row_buf[pack_offset + i],
+                thread_mean + row_id,
+                thread_m2 + row_id,
+                thread_count + row_id);
+          }
+        } else {
+#pragma unroll
+          for (int i = 0; i < pack_size; ++i) {
+            row_buf[pack_offset + i] = 0;
+          }
+        }
+      }
+    }
+    ComputeType warp_mean[rows_per_access];
+    ComputeType warp_m2[rows_per_access];
+    ComputeType warp_count[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      int global_row_id = row + row_id;
+      ComputeType* row_buf = buf[row_id];
+      WelfordWarpAllReduce<ComputeType, thread_group_width>(
+          thread_mean[row_id],
+          thread_m2[row_id],
+          thread_count[row_id],
+          warp_mean + row_id,
+          warp_m2 + row_id,
+          warp_count + row_id);
+      ComputeType row_mean = warp_mean[row_id];
+      ComputeType row_variance =
+          max(Div(warp_m2[row_id], warp_count[row_id]),
+              static_cast<ComputeType>(0.0));
+      ComputeType row_inv_var =
+          Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+      if (lane_id == 0) {
+        mean[global_row_id] = row_mean;
+        inv_variance[global_row_id] = row_inv_var;
+      }
+#pragma unroll
+      for (int i = 0; i < max_cols_per_thread; ++i) {
+        row_buf[i] = (row_buf[i] - row_mean) * row_inv_var;
+      }
+#pragma unroll
+      for (int i = 0; i < min_num_packs; ++i) {
+        const int col = (i * thread_group_width + lane_id) * pack_size;
+        store.template store<pack_size>(
+            row_buf + i * pack_size, global_row_id, col);
+      }
+#pragma unroll
+      for (int i = min_num_packs; i < max_num_packs; ++i) {
+        const int col = (i * thread_group_width + lane_id) * pack_size;
+        if (!padding || col < cols) {
+          store.template store<pack_size>(
+              row_buf + i * pack_size, global_row_id, col);
+        }
+      }
+    }
+  }
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access,
+    bool padding>
+inline cudaError_t LaunchLayerNormWarpImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int block_size = 128;
+  constexpr int waves = 32;
+  static_assert(block_size % thread_group_width == 0, "");
+  constexpr int thread_groups_per_block = block_size / thread_group_width;
+  dim3 block_dim(thread_group_width, thread_groups_per_block);
+  const int64_t num_blocks =
+      (rows / rows_per_access + thread_groups_per_block - 1) /
+      thread_groups_per_block;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormWarpImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            max_cols_per_thread,
+            min_cols_per_thread,
+            thread_group_width,
+            rows_per_access,
+            padding>,
+        block_size,
+        0,
+        num_blocks,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormWarpImpl<
+      LOAD,
+      STORE,
+      ComputeType,
+      pack_size,
+      max_cols_per_thread,
+      min_cols_per_thread,
+      thread_group_width,
+      rows_per_access,
+      padding><<<grid_dim_x, block_dim, 0, stream>>>(
+      load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access>
+inline cudaError_t DispatchLayerNormWarpImplPadding(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols == max_cols_per_thread * thread_group_width) {
+    // when not padding, min_cols_per_thread must equals to max_cols_per_thread,
+    // pass max_cols_per_thread as min_cols_per_thread and max_cols_per_thread
+    // param.
+    return LaunchLayerNormWarpImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        max_cols_per_thread,
+        thread_group_width,
+        rows_per_access,
+        false>(stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  } else {
+    return LaunchLayerNormWarpImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        min_cols_per_thread,
+        thread_group_width,
+        rows_per_access,
+        true>(stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+typename std::enable_if<pack_size == 1, cudaError_t>::type
+DispatchLayerNormWarpImplCols(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols <= 0) {
+    return cudaErrorInvalidValue;
+  }
+#define DEFINE_ONE_ELIF(thread_group_width)                                 \
+  else if (cols <= (thread_group_width)*pack_size) {                        \
+    if (rows % 2 == 0) {                                                    \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          2>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    } else {                                                                \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    }                                                                       \
+  }
+  DEFINE_ONE_ELIF(4)
+  DEFINE_ONE_ELIF(8)
+  DEFINE_ONE_ELIF(16)
+  DEFINE_ONE_ELIF(32)
+#undef DEFINE_ONE_ELIF
+#define DEFINE_ONE_ELIF(max_col, min_col)                                 \
+  else if (cols <= (max_col)*kWarpSize) {                                 \
+    return DispatchLayerNormWarpImplPadding<                              \
+        LOAD,                                                             \
+        STORE,                                                            \
+        ComputeType,                                                      \
+        pack_size,                                                        \
+        max_col,                                                          \
+        min_col,                                                          \
+        kWarpSize,                                                        \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+  }
+  DEFINE_ONE_ELIF(2, 1)
+  DEFINE_ONE_ELIF(4, 2)
+  DEFINE_ONE_ELIF(8, 4)
+  DEFINE_ONE_ELIF(12, 8)
+  DEFINE_ONE_ELIF(16, 12)
+  DEFINE_ONE_ELIF(20, 16)
+  DEFINE_ONE_ELIF(24, 20)
+  DEFINE_ONE_ELIF(28, 24)
+  DEFINE_ONE_ELIF(32, 28)
+#undef DEFINE_ONE_ELIF
+  else {
+    return cudaErrorInvalidValue;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+typename std::enable_if<pack_size == 2, cudaError_t>::type
+DispatchLayerNormWarpImplCols(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols <= 0) {
+    return cudaErrorInvalidValue;
+  }
+#define DEFINE_ONE_ELIF(thread_group_width)                                 \
+  else if (cols <= (thread_group_width)*pack_size) {                        \
+    if (rows % 2 == 0) {                                                    \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          2>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    } else {                                                                \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    }                                                                       \
+  }
+  DEFINE_ONE_ELIF(4)
+  DEFINE_ONE_ELIF(8)
+  DEFINE_ONE_ELIF(16)
+  DEFINE_ONE_ELIF(32)
+#undef DEFINE_ONE_ELIF
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) { \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
+  }
+  DEFINE_ONE_ELIF(4, 2)
+  DEFINE_ONE_ELIF(8, 4)
+  DEFINE_ONE_ELIF(12, 8)
+  DEFINE_ONE_ELIF(16, 12)
+  DEFINE_ONE_ELIF(20, 16)
+  DEFINE_ONE_ELIF(24, 20)
+  DEFINE_ONE_ELIF(28, 24)
+  DEFINE_ONE_ELIF(32, 28)
+#undef DEFINE_ONE_ELIF
+  else {
+    return cudaErrorInvalidValue;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct DispatchLayerNormWarpImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance) {
+    if (cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return DispatchLayerNormWarpImplCols<LOAD, STORE, ComputeType, 2>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else {
+      return DispatchLayerNormWarpImplCols<LOAD, STORE, ComputeType, 1>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t DispatchLayerNormWarpImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  return DispatchLayerNormWarpImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void LayerNormBlockSMemImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[];
+  auto* buf = reinterpret_cast<ComputeType*>(shared_buf);
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    ComputeType thread_mean = 0;
+    ComputeType thread_m2 = 0;
+    ComputeType thread_count = 0;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      load.template load<pack_size>(pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        buf[i * num_packs + pack_id] = pack[i];
+        WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count);
+      }
+    }
+    ComputeType row_mean = 0;
+    ComputeType row_m2 = 0;
+    ComputeType row_count = 0;
+    WelfordBlockAllReduce<ComputeType>(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+    ComputeType row_variance =
+        max(Div(row_m2, row_count), static_cast<ComputeType>(0.0));
+    ComputeType row_inv_var =
+        Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+    if (threadIdx.x == 0) {
+      mean[row] = row_mean;
+      inv_variance[row] = row_inv_var;
+    }
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var;
+      }
+      store.template store<pack_size>(pack, row, pack_id * pack_size);
+    }
+  }
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+inline cudaError_t LaunchLayerNormBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    int smem,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>,
+        block_size,
+        smem,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>
+      <<<grid_dim_x, block_size, smem, stream>>>(
+          load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+inline cudaError_t TryDispatchLayerNormBlockSMemImplBlockSize(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance,
+    bool* success) {
+  constexpr int block_size_conf_1 = 128;
+  constexpr int block_size_conf_2 = 256;
+  constexpr int block_size_conf_3 = 512;
+  constexpr int block_size_conf_4 = 1024;
+  const size_t smem = cols * sizeof(ComputeType);
+  int max_active_blocks_conf_1;
+
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_1,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_1>,
+        block_size_conf_1,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_1 <= 0) {
+    *success = false;
+    return cudaSuccess;
+  }
+  int max_active_blocks_conf_4;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_4,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_4>,
+        block_size_conf_4,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_4 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_4>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  int max_active_blocks_conf_3;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_3,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_3>,
+        block_size_conf_3,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_3 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_3>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  int max_active_blocks_conf_2;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_2,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_2>,
+        block_size_conf_2,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_2 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_2>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  *success = true;
+  return LaunchLayerNormBlockSMemImpl<
+      LOAD,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size_conf_1>(
+      stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct TryDispatchLayerNormBlockSMemImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance,
+      bool* success) {
+    if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
+        CanPackAs<STORE>(store, 4)) {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          4>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    } else if (
+        cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          2>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    } else {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          1>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t TryDispatchLayerNormBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance,
+    bool* success) {
+  return TryDispatchLayerNormBlockSMemImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance, success);
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void __launch_bounds__(1024) LayerNormBlockUncachedImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    ComputeType thread_mean = 0;
+    ComputeType thread_m2 = 0;
+    ComputeType thread_count = 0;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      load.template load<pack_size>(pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count);
+      }
+    }
+    ComputeType row_mean = 0;
+    ComputeType row_m2 = 0;
+    ComputeType row_count = 0;
+    WelfordBlockAllReduce<ComputeType>(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+    ComputeType row_variance =
+        max(Div(row_m2, row_count), static_cast<ComputeType>(0.0));
+    ComputeType row_inv_var =
+        Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+    if (threadIdx.x == 0) {
+      mean[row] = row_mean;
+      inv_variance[row] = row_inv_var;
+    }
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      const int pack_offset = pack_id * pack_size;
+      load.template load<pack_size>(pack, row, pack_offset);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        pack[i] = (pack[i] - row_mean) * row_inv_var;
+      }
+      store.template store<pack_size>(pack, row, pack_offset);
+    }
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+inline cudaError_t LaunchLayerNormBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int block_size = 1024;
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormBlockUncachedImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size>,
+        block_size,
+        0,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, pack_size, block_size>
+      <<<grid_dim_x, block_size, 0, stream>>>(
+          load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct DispatchLayerNormBlockUncachedImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance) {
+    if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
+        CanPackAs<STORE>(store, 4)) {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 4>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else if (
+        cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 2>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 1>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t DispatchLayerNormBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  return DispatchLayerNormBlockUncachedImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline typename std::
+    enable_if<!std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNorm(
+        cudaStream_t stream,
+        LOAD load,
+        STORE store,
+        const int64_t rows,
+        const int64_t cols,
+        const double epsilon,
+        ComputeType* mean,
+        ComputeType* inv_variance) {
+  if (cols <= 1024) {
+    return DispatchLayerNormWarpImpl<LOAD, STORE, ComputeType>(
+        stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  } else {
+    bool dispatch_smem_impl_success;
+    {
+      cudaError_t err =
+          TryDispatchLayerNormBlockSMemImpl<LOAD, STORE, ComputeType>(
+              stream,
+              load,
+              store,
+              rows,
+              cols,
+              epsilon,
+              mean,
+              inv_variance,
+              &dispatch_smem_impl_success);
+      if (err != cudaSuccess) {
+        return err;
+      }
+    }
+    if (!dispatch_smem_impl_success) {
+      return DispatchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+    return cudaSuccess;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline typename std::
+    enable_if<std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNorm(
+        cudaStream_t stream,
+        LOAD load,
+        STORE store,
+        const int64_t rows,
+        const int64_t cols,
+        const double epsilon,
+        ComputeType* mean,
+        ComputeType* inv_variance) {
+  return DispatchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType>(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+/*
+LayerNormGrad dx:
+normalized = (x - mean) * inv_var
+sum_stats1 = sum(scaled_dy)
+sum_stats2 = sum(scaled_dy * normalized)
+dx = cols * dy - sum_stats1 - normalized * sum_stats2
+dx *= inv_var / cols
+*/
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access>
+__global__ void LayerNormGradWarpImpl(
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  static_assert(max_cols_per_thread % pack_size == 0, "");
+  static_assert(min_cols_per_thread % pack_size == 0, "");
+  constexpr int max_num_packs = max_cols_per_thread / pack_size;
+  constexpr int min_num_packs = min_cols_per_thread / pack_size;
+  assert(cols <= max_cols_per_thread * thread_group_width);
+  static_assert(thread_group_width <= kWarpSize, "");
+  static_assert(kWarpSize % thread_group_width == 0, "");
+  ComputeType normalized_buf[rows_per_access][max_cols_per_thread];
+  ComputeType dy_buf[rows_per_access][max_cols_per_thread];
+  const ComputeType one_over_cols =
+      static_cast<ComputeType>(1.0) / static_cast<ComputeType>(cols);
+  const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t num_global_thread_group = gridDim.x * blockDim.y;
+  const int lane_id = threadIdx.x;
+  const int64_t step = num_global_thread_group * rows_per_access;
+  for (int64_t row = global_thread_group_id * rows_per_access; row < rows;
+       row += step) {
+    ComputeType sum_stats1[rows_per_access];
+    ComputeType sum_stats2[rows_per_access];
+    ComputeType inv_variance_buf[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      const int global_row_id = row + row_id;
+      ComputeType mean_val = mean[global_row_id];
+      inv_variance_buf[row_id] = inv_variance[global_row_id];
+      sum_stats1[row_id] = 0;
+      sum_stats2[row_id] = 0;
+      ComputeType* row_normalized_buf = normalized_buf[row_id];
+      ComputeType* row_dy_buf = dy_buf[row_id];
+#pragma unroll
+      for (int pack_id = 0; pack_id < min_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        load_x.template load<pack_size>(
+            row_normalized_buf + pack_offset, global_row_id, col);
+        load_scaled_dy.template load<pack_size>(
+            row_dy_buf + pack_offset, global_row_id, col);
+#pragma unroll
+        for (int i = 0; i < pack_size; ++i) {
+          const int col_id = pack_offset + i;
+          // row_normalized_buf store x
+          row_normalized_buf[col_id] = (row_normalized_buf[col_id] - mean_val) *
+              inv_variance_buf[row_id];
+          sum_stats1[row_id] += row_dy_buf[col_id];
+          sum_stats2[row_id] += row_dy_buf[col_id] * row_normalized_buf[col_id];
+        }
+      }
+#pragma unroll
+      for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        if (col < cols) {
+          load_x.template load<pack_size>(
+              row_normalized_buf + pack_offset, global_row_id, col);
+          load_scaled_dy.template load<pack_size>(
+              row_dy_buf + pack_offset, global_row_id, col);
+#pragma unroll
+          for (int i = 0; i < pack_size; ++i) {
+            const int col_id = pack_offset + i;
+            // row_normalized_buf store x
+            row_normalized_buf[col_id] =
+                (row_normalized_buf[col_id] - mean_val) *
+                inv_variance_buf[row_id];
+            sum_stats1[row_id] += row_dy_buf[col_id];
+            sum_stats2[row_id] +=
+                row_dy_buf[col_id] * row_normalized_buf[col_id];
+          }
+        }
+      }
+    }
+    ComputeType warp_sum_stats1[rows_per_access];
+    ComputeType warp_sum_stats2[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      warp_sum_stats1[row_id] =
+          WarpAllReduce<SumOp, ComputeType, thread_group_width>(
+              sum_stats1[row_id]);
+      warp_sum_stats2[row_id] =
+          WarpAllReduce<SumOp, ComputeType, thread_group_width>(
+              sum_stats2[row_id]);
+    }
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      const int global_row_id = row + row_id;
+      ComputeType* row_normalized_buf = normalized_buf[row_id];
+      ComputeType* row_dy_buf = dy_buf[row_id];
+      const ComputeType inv_variance_over_cols =
+          inv_variance_buf[row_id] * one_over_cols;
+#pragma unroll
+      for (int pack_id = 0; pack_id < min_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        for (int i = 0; i < pack_size; ++i) {
+          const int col_id = pack_id * pack_size + i;
+          row_dy_buf[col_id] =
+              (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] -
+               row_normalized_buf[col_id] * warp_sum_stats2[row_id]) *
+              inv_variance_over_cols;
+        }
+        store.template store<pack_size>(
+            row_dy_buf + pack_id * pack_size, global_row_id, col);
+      }
+#pragma unroll
+      for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        if (col < cols) {
+          for (int i = 0; i < pack_size; ++i) {
+            const int col_id = pack_id * pack_size + i;
+            row_dy_buf[col_id] =
+                (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] -
+                 row_normalized_buf[col_id] * warp_sum_stats2[row_id]) *
+                inv_variance_over_cols;
+          }
+          store.template store<pack_size>(
+              row_dy_buf + pack_id * pack_size, global_row_id, col);
+        }
+      }
+    }
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access>
+inline cudaError_t LaunchLayerNormGradWarpImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  constexpr int block_size = 128;
+  constexpr int waves = 32;
+  static_assert(block_size % thread_group_width == 0, "");
+  constexpr int thread_groups_per_block = block_size / thread_group_width;
+  dim3 block_dim(thread_group_width, thread_groups_per_block);
+  const int64_t num_blocks =
+      (rows / rows_per_access + thread_groups_per_block - 1) /
+      thread_groups_per_block;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormGradWarpImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            max_cols_per_thread,
+            min_cols_per_thread,
+            thread_group_width,
+            rows_per_access>,
+        block_size,
+        0,
+        num_blocks,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormGradWarpImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType,
+      pack_size,
+      max_cols_per_thread,
+      min_cols_per_thread,
+      thread_group_width,
+      rows_per_access><<<grid_dim_x, block_dim, 0, stream>>>(
+      load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  return cudaPeekAtLastError();
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access>
+inline cudaError_t DispatchLayerNormGradWarpImplPadding(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  if (cols == max_cols_per_thread * thread_group_width) {
+    // when not padding, min_cols_per_thread must equals to max_cols_per_thread,
+    // pass max_cols_per_thread as min_cols_per_thread and max_cols_per_thread
+    // param.
+    return LaunchLayerNormGradWarpImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        max_cols_per_thread,
+        thread_group_width,
+        rows_per_access>(
+        stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  } else {
+    return LaunchLayerNormGradWarpImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        min_cols_per_thread,
+        thread_group_width,
+        rows_per_access>(
+        stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size>
+typename std::enable_if<pack_size == 1, cudaError_t>::type
+DispatchLayerNormGradWarpImplCols(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  if (cols <= 0) {
+    return cudaErrorInvalidValue;
+  }
+#define DEFINE_ONE_ELIF(thread_group_width)          \
+  else if (cols <= (thread_group_width)*pack_size) { \
+    if (rows % 2 == 0) {                             \
+      return DispatchLayerNormGradWarpImplPadding<   \
+          LOAD_X,                                    \
+          LOAD_SCALED_DY,                            \
+          STORE,                                     \
+          ComputeType,                               \
+          pack_size,                                 \
+          pack_size,                                 \
+          0,                                         \
+          thread_group_width,                        \
+          2>(                                        \
+          stream,                                    \
+          load_x,                                    \
+          load_scaled_dy,                            \
+          store,                                     \
+          mean,                                      \
+          inv_variance,                              \
+          rows,                                      \
+          cols);                                     \
+    } else {                                         \
+      return DispatchLayerNormGradWarpImplPadding<   \
+          LOAD_X,                                    \
+          LOAD_SCALED_DY,                            \
+          STORE,                                     \
+          ComputeType,                               \
+          pack_size,                                 \
+          pack_size,                                 \
+          0,                                         \
+          thread_group_width,                        \
+          1>(                                        \
+          stream,                                    \
+          load_x,                                    \
+          load_scaled_dy,                            \
+          store,                                     \
+          mean,                                      \
+          inv_variance,                              \
+          rows,                                      \
+          cols);                                     \
+    }                                                \
+  }
+  DEFINE_ONE_ELIF(4)
+  DEFINE_ONE_ELIF(8)
+  DEFINE_ONE_ELIF(16)
+  DEFINE_ONE_ELIF(32)
+#undef DEFINE_ONE_ELIF
+#define DEFINE_ONE_ELIF(max_col, min_col)        \
+  else if (cols <= (max_col)*kWarpSize) {        \
+    return DispatchLayerNormGradWarpImplPadding< \
+        LOAD_X,                                  \
+        LOAD_SCALED_DY,                          \
+        STORE,                                   \
+        ComputeType,                             \
+        pack_size,                               \
+        max_col,                                 \
+        min_col,                                 \
+        kWarpSize,                               \
+        1>(                                      \
+        stream,                                  \
+        load_x,                                  \
+        load_scaled_dy,                          \
+        store,                                   \
+        mean,                                    \
+        inv_variance,                            \
+        rows,                                    \
+        cols);                                   \
+  }
+  DEFINE_ONE_ELIF(2, 1)
+  DEFINE_ONE_ELIF(4, 2)
+  DEFINE_ONE_ELIF(8, 4)
+  DEFINE_ONE_ELIF(12, 8)
+  DEFINE_ONE_ELIF(16, 12)
+  DEFINE_ONE_ELIF(20, 16)
+  DEFINE_ONE_ELIF(24, 20)
+  DEFINE_ONE_ELIF(28, 24)
+  DEFINE_ONE_ELIF(32, 28)
+#undef DEFINE_ONE_ELIF
+  else {
+    return cudaErrorInvalidValue;
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+struct DispatchLayerNormGradWarpImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD_X load_x,
+      LOAD_SCALED_DY load_scaled_dy,
+      STORE store,
+      const ComputeType* mean,
+      const ComputeType* inv_variance,
+      const int64_t rows,
+      const int64_t cols) {
+    return DispatchLayerNormGradWarpImplCols<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        1>(
+        stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  }
+};
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+inline cudaError_t DispatchLayerNormGradWarpImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  return DispatchLayerNormGradWarpImplPackSize<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType>()(
+      stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void LayerNormGradBlockSMemImpl(
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[];
+  auto* normalized_buf = reinterpret_cast<ComputeType*>(grad_shared_buf);
+  auto* dy_buf = normalized_buf + cols;
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  const ComputeType one_over_cols =
+      static_cast<ComputeType>(1.0) / static_cast<ComputeType>(cols);
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    ComputeType sum_stats1 = 0;
+    ComputeType sum_stats2 = 0;
+    const ComputeType mean_val = mean[row];
+    const ComputeType inv_variance_val = inv_variance[row];
+    const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType x_pack[pack_size];
+      ComputeType dy_pack[pack_size];
+      load_x.template load<pack_size>(x_pack, row, pack_id * pack_size);
+      load_scaled_dy.template load<pack_size>(
+          dy_pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        const int buf_offset = i * num_packs + pack_id;
+        ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val;
+        normalized_buf[buf_offset] = normalized;
+        dy_buf[buf_offset] = dy_pack[i];
+        sum_stats1 += dy_pack[i];
+        sum_stats2 += dy_pack[i] * normalized;
+      }
+    }
+    const ComputeType row_sum_stats1 =
+        BlockAllReduce<SumOp, ComputeType, block_size>(sum_stats1);
+    const ComputeType row_sum_stats2 =
+        BlockAllReduce<SumOp, ComputeType, block_size>(sum_stats2);
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        const int buf_offset = i * num_packs + pack_id;
+        pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 -
+                   normalized_buf[buf_offset] * row_sum_stats2) *
+            inv_variance_over_cols;
+      }
+      store.template store<pack_size>(pack, row, pack_id * pack_size);
+    }
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+inline cudaError_t LaunchLayerNormGradBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    int smem,
+    const int64_t rows,
+    const int64_t cols) {
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormGradBlockSMemImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size>,
+        block_size,
+        smem,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormGradBlockSMemImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size><<<grid_dim_x, block_size, smem, stream>>>(
+      load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  return cudaPeekAtLastError();
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size>
+inline cudaError_t TryDispatchLayerNormGradBlockSMemImplBlockSize(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols,
+    bool* success) {
+  constexpr int block_size_conf_1 = 128;
+  constexpr int block_size_conf_2 = 256;
+  constexpr int block_size_conf_3 = 512;
+  constexpr int block_size_conf_4 = 1024;
+  const size_t smem = cols * sizeof(ComputeType) * 2;
+  int max_active_blocks_conf_1;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_1,
+        LayerNormGradBlockSMemImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_1>,
+        block_size_conf_1,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_1 <= 0) {
+    *success = false;
+    return cudaSuccess;
+  }
+  int max_active_blocks_conf_4;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_4,
+        LayerNormGradBlockSMemImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_4>,
+        block_size_conf_4,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_4 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormGradBlockSMemImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_4>(
+        stream,
+        load_x,
+        load_scaled_dy,
+        store,
+        mean,
+        inv_variance,
+        smem,
+        rows,
+        cols);
+  }
+  int max_active_blocks_conf_3;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_3,
+        LayerNormGradBlockSMemImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_3>,
+        block_size_conf_3,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_3 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormGradBlockSMemImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_3>(
+        stream,
+        load_x,
+        load_scaled_dy,
+        store,
+        mean,
+        inv_variance,
+        smem,
+        rows,
+        cols);
+  }
+  int max_active_blocks_conf_2;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_2,
+        LayerNormGradBlockSMemImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_2>,
+        block_size_conf_2,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_2 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormGradBlockSMemImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_2>(
+        stream,
+        load_x,
+        load_scaled_dy,
+        store,
+        mean,
+        inv_variance,
+        smem,
+        rows,
+        cols);
+  }
+  *success = true;
+  return LaunchLayerNormGradBlockSMemImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size_conf_1>(
+      stream,
+      load_x,
+      load_scaled_dy,
+      store,
+      mean,
+      inv_variance,
+      smem,
+      rows,
+      cols);
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+struct TryDispatchLayerNormGradBlockSMemImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD_X load_x,
+      LOAD_SCALED_DY load_scaled_dy,
+      STORE store,
+      const ComputeType* mean,
+      const ComputeType* inv_variance,
+      const int64_t rows,
+      const int64_t cols,
+      bool* success) {
+    if (cols % 2 == 0 && CanPackAs<LOAD_X>(load_x, 2) &&
+        CanPackAs<LOAD_SCALED_DY>(load_scaled_dy, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return TryDispatchLayerNormGradBlockSMemImplBlockSize<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          2>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols,
+          success);
+    } else {
+      return TryDispatchLayerNormGradBlockSMemImplBlockSize<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          1>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols,
+          success);
+    }
+  }
+};
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+inline cudaError_t TryDispatchLayerNormGradBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols,
+    bool* success) {
+  return TryDispatchLayerNormGradBlockSMemImplPackSize<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType>()(
+      stream,
+      load_x,
+      load_scaled_dy,
+      store,
+      mean,
+      inv_variance,
+      rows,
+      cols,
+      success);
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void LayerNormGradBlockUncachedImpl(
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  const ComputeType one_over_cols =
+      static_cast<ComputeType>(1.0) / static_cast<ComputeType>(cols);
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    const ComputeType mean_val = mean[row];
+    const ComputeType inv_variance_val = inv_variance[row];
+    const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols;
+    ComputeType sum_stats1 = 0;
+    ComputeType sum_stats2 = 0;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType x_pack[pack_size];
+      ComputeType dy_pack[pack_size];
+      load_x.template load<pack_size>(x_pack, row, pack_id * pack_size);
+      load_scaled_dy.template load<pack_size>(
+          dy_pack, row, pack_id * pack_size);
+
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        sum_stats1 += dy_pack[i];
+        sum_stats2 += dy_pack[i] * (x_pack[i] - mean_val) * inv_variance_val;
+      }
+    }
+    const ComputeType row_sum_stats1 =
+        BlockAllReduce<SumOp, ComputeType, block_size>(sum_stats1);
+    const ComputeType row_sum_stats2 =
+        BlockAllReduce<SumOp, ComputeType, block_size>(sum_stats2);
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType x_pack[pack_size];
+      ComputeType dy_pack[pack_size];
+      load_x.template load<pack_size>(x_pack, row, pack_id * pack_size);
+      load_scaled_dy.template load<pack_size>(
+          dy_pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        dy_pack[i] =
+            (cols * dy_pack[i] - row_sum_stats1 -
+             (x_pack[i] - mean_val) * inv_variance_val * row_sum_stats2) *
+            inv_variance_over_cols;
+      }
+      store.template store<pack_size>(dy_pack, row, pack_id * pack_size);
+    }
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+inline cudaError_t LaunchLayerNormGradBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormGradBlockUncachedImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size>,
+        block_size,
+        0,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormGradBlockUncachedImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size><<<grid_dim_x, block_size, 0, stream>>>(
+      load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  return cudaPeekAtLastError();
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType,
+    int pack_size>
+inline cudaError_t TryDispatchLaunchLayerNormGradBlockUncachedImplBlockSize(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  int max_active_blocks = 0;
+  constexpr int block_size_conf_1 = 1024;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        LayerNormGradBlockUncachedImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_1>,
+        block_size_conf_1,
+        0);
+    if (max_active_blocks > 0) {
+      return LaunchLayerNormGradBlockUncachedImpl<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          pack_size,
+          block_size_conf_1>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    }
+  }
+  constexpr int block_size_conf_2 = 512;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        LayerNormGradBlockUncachedImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_2>,
+        block_size_conf_2,
+        0);
+    if (max_active_blocks > 0) {
+      return LaunchLayerNormGradBlockUncachedImpl<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          pack_size,
+          block_size_conf_2>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    }
+  }
+  constexpr int block_size_conf_3 = 256;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        LayerNormGradBlockUncachedImpl<
+            LOAD_X,
+            LOAD_SCALED_DY,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_3>,
+        block_size_conf_2,
+        0);
+    if (max_active_blocks > 0) {
+      return LaunchLayerNormGradBlockUncachedImpl<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          pack_size,
+          block_size_conf_3>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    }
+  }
+  constexpr int block_size_conf_4 = 128;
+  return LaunchLayerNormGradBlockUncachedImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size_conf_4>(
+      stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+struct DispatchLayerNormGradBlockUncachedImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD_X load_x,
+      LOAD_SCALED_DY load_scaled_dy,
+      STORE store,
+      const ComputeType* mean,
+      const ComputeType* inv_variance,
+      const int64_t rows,
+      const int64_t cols) {
+    if (cols % 2 == 0 && CanPackAs<LOAD_X>(load_x, 2) &&
+        CanPackAs<LOAD_SCALED_DY>(load_scaled_dy, 2) &&
+        CanPackAs<STORE>(store, 2) && cols > kWarpSize) {
+      return TryDispatchLaunchLayerNormGradBlockUncachedImplBlockSize<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          2>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    } else {
+      return TryDispatchLaunchLayerNormGradBlockUncachedImplBlockSize<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType,
+          1>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    }
+  }
+};
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+inline cudaError_t DispatchLayerNormGradBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD_X load_x,
+    LOAD_SCALED_DY load_scaled_dy,
+    STORE store,
+    const ComputeType* mean,
+    const ComputeType* inv_variance,
+    const int64_t rows,
+    const int64_t cols) {
+  return DispatchLayerNormGradBlockUncachedImplPackSize<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType>()(
+      stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+inline typename std::
+    enable_if<!std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNormGrad(
+        cudaStream_t stream,
+        LOAD_X load_x,
+        LOAD_SCALED_DY load_scaled_dy,
+        STORE store,
+        const ComputeType* mean,
+        const ComputeType* inv_variance,
+        const int64_t rows,
+        const int64_t cols) {
+  if (cols <= 1024) {
+    return DispatchLayerNormGradWarpImpl<
+        LOAD_X,
+        LOAD_SCALED_DY,
+        STORE,
+        ComputeType>(
+        stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+  } else {
+    bool dispatch_smem_impl_success;
+    {
+      cudaError_t err = TryDispatchLayerNormGradBlockSMemImpl<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols,
+          &dispatch_smem_impl_success);
+      if (err != cudaSuccess) {
+        return err;
+      }
+    }
+    if (!dispatch_smem_impl_success) {
+      return DispatchLayerNormGradBlockUncachedImpl<
+          LOAD_X,
+          LOAD_SCALED_DY,
+          STORE,
+          ComputeType>(
+          stream,
+          load_x,
+          load_scaled_dy,
+          store,
+          mean,
+          inv_variance,
+          rows,
+          cols);
+    }
+    return cudaSuccess;
+  }
+}
+
+template <
+    typename LOAD_X,
+    typename LOAD_SCALED_DY,
+    typename STORE,
+    typename ComputeType>
+inline typename std::
+    enable_if<std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNormGrad(
+        cudaStream_t stream,
+        LOAD_X load_x,
+        LOAD_SCALED_DY load_scaled_dy,
+        STORE store,
+        const ComputeType* mean,
+        const ComputeType* inv_variance,
+        const int64_t rows,
+        const int64_t cols) {
+  return DispatchLayerNormGradBlockUncachedImpl<
+      LOAD_X,
+      LOAD_SCALED_DY,
+      STORE,
+      ComputeType>(
+      stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols);
+}
+
+} // namespace layer_norm
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
index 2d7b6181e..6292898b1 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
@@ -22,6 +22,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import tensor_accessor_codegen
 from ...target import Target
 from . import layernorm_common
@@ -46,17 +47,22 @@
 
 {{func_signature}}
 {
-    invokeBatchLayernormSigmoidMul<half, float, {{fuse_sigmoid_mul}}>(output, input, gamma, beta, b, m, n, eps, stream);
+    invokeBatchLayernormSigmoidMul<{{elem_input_type}}, float, {{fuse_sigmoid_mul}}>(
+        static_cast<{{elem_input_type}}*>(output),
+        static_cast<{{elem_input_type}}*>(input),
+        static_cast<const {{elem_input_type}}*>(gamma),
+        static_cast<const {{elem_input_type}}*>(beta),
+        b, m, n, eps, stream);
 }
     """
 )
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output,
-                   half* input,
-                   const half* gamma,
-                   const half* beta,
+void {{func_name}}(void* output,
+                   void* input,
+                   const void* gamma,
+                   const void* beta,
                    int b,
                    int m,
                    int n,
@@ -84,12 +90,17 @@
 @registry.reg("cuda.batch_layernorm_sigmoid_mul.gen_function")
 def batch_layernorm_sigmoid_mul_gen_function(func_attrs: Dict[str, Any]) -> str:
     gamma_beta_const_defs = layernorm_common.gamma_beta_const_defs(func_attrs)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
         ),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
         fuse_sigmoid_mul="true",
         gamma_beta_const_defs=gamma_beta_const_defs,
     )
@@ -110,9 +121,7 @@ def batch_layernorm_sigmoid_mul_gen_function_call(func_attrs, indent="  "):
         1 <= len(func_attrs["inputs"]) <= 4
     ), "expected 1 ~ 4 inputs but got {}".format(len(func_attrs["inputs"]))
 
-    output_name = layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = layernorm_common.get_input_names(func_attrs)
 
     shapes = func_attrs["inputs"][0]._attrs["shape"]
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
index d9e53d0cf..7c6b34ec4 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
@@ -22,13 +22,26 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import tensor_accessor_codegen
 from ...target import Target
-from .. import cuda_common
 from . import layernorm_common
 
 # pylint: disable=C0301
 
+
+LOCAL_PARAM_DEF_TEMPLATE = jinja2.Template(
+    """
+{{indent}} {{elem_input_type}} *{{local_param_name}}[{{num_groups}}] = {
+{% for i in range(num_groups) %}
+{{indent}}    static_cast<{{elem_input_type}}*>({{param_name}}[{{i}}]){{", " if not loop.last else ""}}
+{% endfor %}
+{{indent}}
+{{indent}}};
+"""
+)
+
+
 FUNC_TEMPLATE = jinja2.Template(
     """
 #include <cuda_fp16.h>
@@ -49,17 +62,20 @@
 {
     {{output_accessor_template}}
     {{input_accessor_template}}
-    invokeGroupLayernormSigmoidMul<half, float, {{fuse_sigmoid_mul}}, {{num_inputs}}>(output, input, gamma, beta, b, m, n, eps, stream, input_accessors, output_accessors);
+    {{local_param_defs}}
+    invokeGroupLayernormSigmoidMul<{{elem_input_type}}, float, {{fuse_sigmoid_mul}}, {{num_inputs}}>(
+        {{local_param_names}},
+        b, m, n, eps, stream, input_accessors, output_accessors);
 }
     """
 )
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output[],
-                   half* input[],
-                   half* gamma[],
-                   half* beta[],
+void {{func_name}}(void* output[],
+                   void* input[],
+                   void* gamma[],
+                   void* beta[],
                    int b,
                    int m,
                    int64_t* n,
@@ -96,19 +112,19 @@
     """
 {{indent}}{
 
-{{indent}}  {{input_elem_type}} *outputs[] = {
+{{indent}}  void *outputs[] = {
 {{indent}}    {{outputs}}
 {{indent}}  };
 
-{{indent}}  {{input_elem_type}} *inputs[] = {
+{{indent}}  void *inputs[] = {
 {{indent}}    {{inputs}}
 {{indent}}  };
 
-{{indent}}  {{input_elem_type}} *gamma[] = {
+{{indent}}  void *gamma[] = {
 {{indent}}    {{gamma}}
 {{indent}}  };
 
-{{indent}}  {{input_elem_type}} *beta[] = {
+{{indent}}  void *beta[] = {
 {{indent}}    {{beta}}
 {{indent}}  };
 
@@ -172,17 +188,39 @@ def group_layernorm_sigmoid_mul_gen_function(func_attrs: Dict[str, Any]) -> str:
         raise RuntimeError(f"Unsupported op: {op}")
 
     gamma_beta_const_defs = layernorm_common.gamma_beta_const_defs(func_attrs)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    num_groups = len(func_attrs["outputs"])
+    params = ["output", "input", "gamma", "beta"]
+    local_param_defs = []
+    local_param_names = []
+    for param in params:
+        local_name = f"{param}_tmp"
+        local_param_def = LOCAL_PARAM_DEF_TEMPLATE.render(
+            indent="  ",
+            elem_input_type=elem_input_type,
+            num_groups=num_groups,
+            local_param_name=local_name,
+            param_name=param,
+        )
+        local_param_defs.append(local_param_def)
+        local_param_names.append(local_name)
     return FUNC_TEMPLATE.render(
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
         ),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
         fuse_sigmoid_mul=fuse_sigmoid_mul,
         num_inputs=len(func_attrs["outputs"]),
         output_accessor_template=output_accessor_template,
         input_accessor_template=input_accessor_template,
         gamma_beta_const_defs=gamma_beta_const_defs,
+        local_param_defs="\n".join(local_param_defs),
+        local_param_names=",".join(local_param_names),
     )
 
 
@@ -212,47 +250,17 @@ def group_layernorm_sigmoid_mul_gen_function_call(func_attrs, indent="  "):
         idx += b
     outputs = func_attrs["outputs"]
 
-    output_ptrs = ",\n        ".join(
-        [
-            layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-                name=out._attrs["name"]
-            )
-            for out in outputs
-        ]
-    )
+    output_ptrs = ",\n        ".join([out._attrs["name"] for out in outputs])
 
-    input_ptrs = ",\n        ".join(
-        [
-            layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=i._attrs["name"])
-            for i in inputs
-        ]
-    )
+    input_ptrs = ",\n        ".join([i._attrs["name"] for i in inputs])
 
     gamma_strs = (
-        ["nullptr"] * b
-        if gammas is None
-        else (
-            [
-                layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-                    name=i._attrs["name"]
-                )
-                for i in gammas
-            ]
-        )
+        ["nullptr"] * b if gammas is None else ([i._attrs["name"] for i in gammas])
     )
     gamma_ptrs = ",\n        ".join(gamma_strs)
 
     beta_strs = (
-        ["nullptr"] * b
-        if betas is None
-        else (
-            [
-                layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-                    name=i._attrs["name"]
-                )
-                for i in betas
-            ]
-        )
+        ["nullptr"] * b if betas is None else ([i._attrs["name"] for i in betas])
     )
     beta_ptrs = ",\n        ".join(beta_strs)
 
@@ -290,7 +298,6 @@ def group_layernorm_sigmoid_mul_gen_function_call(func_attrs, indent="  "):
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         m_n_shape_func="".join(all_shape_funcs),
-        input_elem_type=cuda_common.dtype_to_cuda_type(inputs[0]._attrs["dtype"]),
         indent=indent,
         outputs=output_ptrs,
         inputs=input_ptrs,
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_common.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_common.py
index 4daa082ec..dbdeecdea 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_common.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_common.py
@@ -20,9 +20,6 @@
 
 import jinja2
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
 
 GAMMA_BETA_CONST_DEFS_TEMPLATE = jinja2.Template(
     """
@@ -100,14 +97,14 @@ def get_input_names(func_attrs: Dict[str, Any]) -> List[str]:
         beta = inputs[idx]
         idx += 1
 
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=x._attrs["name"])
+    input_name = x._attrs["name"]
     if gamma is None:
         gamma_name = "nullptr"
     else:
-        gamma_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=gamma._attrs["name"])
+        gamma_name = gamma._attrs["name"]
     if beta is None:
         beta_name = "nullptr"
     else:
-        beta_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=beta._attrs["name"])
+        beta_name = beta._attrs["name"]
 
     return (input_name, gamma_name, beta_name)
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
index e0dc892a0..99140521f 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
@@ -22,6 +22,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...common import tensor_accessor_codegen
 from ...target import Target
 from . import layernorm_common
@@ -48,17 +49,22 @@
 {
     {{input_accessor}}
     {{output_accessor}}
-    return invokeLayernormSigmoidMul<half, float, {{fuse_sigmoid_mul}}>(output, input, gamma, beta, m, n, eps, stream, input_accessor, output_accessor);
+    return invokeLayernormSigmoidMul<{{elem_input_type}}, float, {{fuse_sigmoid_mul}}>(
+        static_cast<{{elem_input_type}}*>(output),
+        static_cast<const {{elem_input_type}}*>(input),
+        static_cast<const {{elem_input_type}}*>(gamma),
+        static_cast<const {{elem_input_type}}*>(beta),
+        m, n, eps, stream, input_accessor, output_accessor);
 }
     """
 )
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-cudaError_t {{func_name}}(half* output,
-                   half* input,
-                   const half* gamma,
-                   const half* beta,
+cudaError_t {{func_name}}(void* output,
+                   void* input,
+                   const void* gamma,
+                   const void* beta,
                    int m,
                    int n,
                    const float eps,
@@ -88,12 +94,17 @@
 @registry.reg("cuda.layernorm.gen_function")
 def layernorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     gamma_beta_const_defs = layernorm_common.gamma_beta_const_defs(func_attrs)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
         ),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
         fuse_sigmoid_mul="false",
         input_accessor=tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render(
             name="input_accessor", tensor_accessor=func_attrs["input_accessors"][0]
@@ -108,12 +119,17 @@ def layernorm_gen_function(func_attrs: Dict[str, Any]) -> str:
 @registry.reg("cuda.layernorm_sigmoid_mul.gen_function")
 def layernorm_sigmoid_mul_gen_function(func_attrs: Dict[str, Any]) -> str:
     gamma_beta_const_defs = layernorm_common.gamma_beta_const_defs(func_attrs)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
         ),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
         fuse_sigmoid_mul="true",
         input_accessor=tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render(
             name="input_accessor", tensor_accessor=func_attrs["input_accessors"][0]
@@ -142,9 +158,7 @@ def layernorm_sigmoid_mul_gen_function_call(func_attrs, indent="  "):
         func_attrs["inputs"]
     ), "expected at least 1 inputs but got {}".format(len(func_attrs["inputs"]))
 
-    output_name = layernorm_common.FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = layernorm_common.get_input_names(func_attrs)
 
     input_accessor = func_attrs["input_accessors"][0]
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
index 0b2249266..f91f6dc16 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
@@ -1281,20 +1281,27 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
   const int quarter_n = n >> 2;
   const int offset = m_idx * quarter_n;
 
+  const int block_size = blockDim.x;
+  const int num_iters =
+      ceil(static_cast<float>(quarter_n) / static_cast<float>(block_size));
+
   half4 local_val_half{0, 0, 0, 0};
   float4 local_val{0.0f, 0.0f, 0.0f, 0.0f};
 
-  if (tid < quarter_n) {
-    local_val_half =
-        *input_accessor.get<const half, const half4>(input, offset + tid);
-    local_val = {
-        static_cast<float>(local_val_half.x),
-        static_cast<float>(local_val_half.y),
-        static_cast<float>(local_val_half.z),
-        static_cast<float>(local_val_half.w)};
-    local_sums[0] = local_val.x + local_val.y + local_val.z + local_val.w;
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+
+    if (elem_no < quarter_n) {
+      local_val_half =
+          *input_accessor.get<const half, const half4>(input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
+      local_sums[0] += local_val.x + local_val.y + local_val.z + local_val.w;
+    }
   }
-
   if (blockDim.x <= 32) {
     warpReduceSum<float, 1>(local_sums);
   } else {
@@ -1303,17 +1310,25 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
   if (threadIdx.x == 0) {
     s_mean = local_sums[0] / n;
   }
-
   __syncthreads();
-
   local_sums[0] = 0.0f;
-  if (tid < quarter_n) {
-    local_sums[0] = (local_val.x - s_mean) * (local_val.x - s_mean) +
-        (local_val.y - s_mean) * (local_val.y - s_mean) +
-        (local_val.z - s_mean) * (local_val.z - s_mean) +
-        (local_val.w - s_mean) * (local_val.w - s_mean);
-  }
 
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+    if (elem_no < quarter_n) {
+      local_val_half =
+          *input_accessor.get<const half, const half4>(input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
+      local_sums[0] += (local_val.x - s_mean) * (local_val.x - s_mean) +
+          (local_val.y - s_mean) * (local_val.y - s_mean) +
+          (local_val.z - s_mean) * (local_val.z - s_mean) +
+          (local_val.w - s_mean) * (local_val.w - s_mean);
+    }
+  }
   if (blockDim.x <= 32) {
     warpReduceSum<float, 1>(local_sums);
   } else {
@@ -1324,63 +1339,74 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
   }
   __syncthreads();
 
-  if (tid < quarter_n) {
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+    if (elem_no < quarter_n) {
+      local_val_half =
+          *input_accessor.get<const half, const half4>(input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
 #ifdef AIT_LAYERNORM_CONST_GAMMA
-    const float4 gamma_val = {
-        AIT_LAYERNORM_CONST_GAMMA,
-        AIT_LAYERNORM_CONST_GAMMA,
-        AIT_LAYERNORM_CONST_GAMMA,
-        AIT_LAYERNORM_CONST_GAMMA};
+      const float4 gamma_val = {
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA};
 #else
-    const half4 gamma_val_half = gamma[tid];
-    const float4 gamma_val = {
-        static_cast<float>(gamma_val_half.x),
-        static_cast<float>(gamma_val_half.y),
-        static_cast<float>(gamma_val_half.z),
-        static_cast<float>(gamma_val_half.w)};
+      const half4 gamma_val_half = gamma[elem_no];
+      const float4 gamma_val = {
+          static_cast<float>(gamma_val_half.x),
+          static_cast<float>(gamma_val_half.y),
+          static_cast<float>(gamma_val_half.z),
+          static_cast<float>(gamma_val_half.w)};
 #endif // AIT_LAYERNORM_CONST_GAMMA
 
 #ifdef AIT_LAYERNORM_CONST_BETA
-    const float4 beta_val = {
-        AIT_LAYERNORM_CONST_BETA,
-        AIT_LAYERNORM_CONST_BETA,
-        AIT_LAYERNORM_CONST_BETA,
-        AIT_LAYERNORM_CONST_BETA};
+      const float4 beta_val = {
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA};
 #else
-    const half4 beta_val_half = beta[tid];
-    const float4 beta_val = {
-        static_cast<float>(beta_val_half.x),
-        static_cast<float>(beta_val_half.y),
-        static_cast<float>(beta_val_half.z),
-        static_cast<float>(beta_val_half.w)};
+      const half4 beta_val_half = beta[elem_no];
+      const float4 beta_val = {
+          static_cast<float>(beta_val_half.x),
+          static_cast<float>(beta_val_half.y),
+          static_cast<float>(beta_val_half.z),
+          static_cast<float>(beta_val_half.w)};
 #endif // AIT_LAYERNORM_CONST_BETA
 
-    if (FuseSigmoidMul) {
-      local_val.x *= sigmoid(
-          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x));
-      local_val.y *= sigmoid(
-          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y));
-      local_val.z *= sigmoid(
-          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z));
-      local_val.w *= sigmoid(
-          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w));
-    } else {
-      local_val.x =
-          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x);
-      local_val.y =
-          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y);
-      local_val.z =
-          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z);
-      local_val.w =
-          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w);
-    }
+      if constexpr (FuseSigmoidMul) {
+        local_val.x *= sigmoid(normalize(
+            local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x));
+        local_val.y *= sigmoid(normalize(
+            local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y));
+        local_val.z *= sigmoid(normalize(
+            local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z));
+        local_val.w *= sigmoid(normalize(
+            local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w));
+      } else {
+        local_val.x =
+            normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x);
+        local_val.y =
+            normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y);
+        local_val.z =
+            normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z);
+        local_val.w =
+            normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w);
+      }
 
-    local_val_half.x = __float2half_rn(local_val.x);
-    local_val_half.y = __float2half_rn(local_val.y);
-    local_val_half.z = __float2half_rn(local_val.z);
-    local_val_half.w = __float2half_rn(local_val.w);
+      local_val_half.x = __float2half_rn(local_val.x);
+      local_val_half.y = __float2half_rn(local_val.y);
+      local_val_half.z = __float2half_rn(local_val.z);
+      local_val_half.w = __float2half_rn(local_val.w);
 
-    *(output_accessor.get<half, half4>(output, offset + tid)) = local_val_half;
+      *(output_accessor.get<half, half4>(output, offset + elem_no)) =
+          local_val_half;
+    }
   }
 }
 
@@ -1478,7 +1504,7 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
     const float beta_val = static_cast<float>(beta[tid]);
 #endif // AIT_LAYERNORM_CONST_BETA
 
-    if (FuseSigmoidMul) {
+    if constexpr (FuseSigmoidMul) {
       local_val *= sigmoid(
           normalize(local_val, s_mean, s_variance, gamma_val, beta_val));
     } else {
@@ -1640,11 +1666,10 @@ cudaError_t invokeGroupLayernormSigmoidMul(
   }
 
   dim3 grid(b, m);
-  dim3 block(max_n);
-
   // TODO: implement float4 group kernel
   if (std::is_same<T, half>::value && n_is_multiple_of_4 && (min_n >= 128) &&
       (max_n <= 4096)) {
+    dim3 block(min_n);
     // round up to multiples of 32 to make warp shuffles safe
     block.x = (block.x / 4 + 31) / 32 * 32;
     Arguments<half4, float, NumInputs> args;
@@ -1674,6 +1699,8 @@ cudaError_t invokeGroupLayernormSigmoidMul(
       LAYER_NORM_CUDA_CHECK(cudaFree(argsPtr));
     }
   } else {
+    // TODO: Should we apply min_n block size to this branch as well?
+    dim3 block(max_n);
     Arguments<T, T_ACC, NumInputs> args;
     for (size_t i = 0; i < b; i++) {
       args.outputs[i] = output[i];
diff --git a/python/aitemplate/backend/cuda/lib_template.py b/python/aitemplate/backend/cuda/lib_template.py
index 16d5ee505..67d6d76b9 100644
--- a/python/aitemplate/backend/cuda/lib_template.py
+++ b/python/aitemplate/backend/cuda/lib_template.py
@@ -31,16 +31,8 @@ def var_decl(name, value=0, indent="  "):
     return VAR_TEMPLATE.render(name=name, value=value, indent=indent)
 
 
-@registry.reg("cuda.lib.ptr_decl")
-def ptr_decl(name, dtype="float16", indent="  "):
-    if dtype == "float16":
-        type_string = "cutlass::half_t*"
-    elif dtype in ["float", "float32"]:
-        type_string = "float*"
-    elif dtype == "int64":
-        type_string = "int64_t*"
-    elif dtype in ["int", "int32"]:
-        type_string = "int32_t*"
-    else:
-        raise NotImplementedError
-    return PTR_TEMPLATE.render(name=name, dtype=type_string, indent=indent)
+@registry.reg("cuda.lib.void_ptr_decl")
+def void_ptr_decl(name, dtype="float16", indent="  "):
+    # FIXME: we keep dtype in void_ptr_decl's param list because rocm needs it.
+    # We will remove it once we support general tensor type for rocm
+    return PTR_TEMPLATE.render(name=name, dtype="void*", indent=indent)
diff --git a/python/aitemplate/backend/cuda/padding/nhwc3to4.py b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
index 7f940ba63..fd67dd1ca 100644
--- a/python/aitemplate/backend/cuda/padding/nhwc3to4.py
+++ b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
@@ -18,14 +18,15 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
   int64_t*,
   int64_t*,
   int64_t*,
@@ -56,9 +57,9 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}nhwc3to4_launcher(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
+{{indent}}nhwc3to4_launcher<{{elem_input_type}}>(
+{{indent}}    static_cast<const {{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -106,8 +107,9 @@
   }
 }
 
-void nhwc3to4_launcher(cutlass::half_t* in_ptr,
-                       cutlass::half_t* out_ptr,
+template <typename ElemT>
+void nhwc3to4_launcher(const ElemT* in_ptr,
+                       ElemT* out_ptr,
                        int NI,
                        int HI,
                        int WI,
@@ -120,9 +122,9 @@
   const int max_input_element = nhwc / element_in_Tio;
   const int max_output_element = nhw * 4 / element_in_Tio;
   const int4 zero_io = {0, 0, 0, 0};
-  const cutlass::half_t zero_element = static_cast<cutlass::half_t>(0.0f);
+  const ElemT zero_element = static_cast<ElemT>(0.0f);
   dim3 grid((nhwc + 192 * element_in_Tio - 1)/(192 * element_in_Tio));
-  nhwc_padding_channel_3To4_kernel<int4, cutlass::half_t, element_in_Tio><<<grid, block, 0, stream>>>
+  nhwc_padding_channel_3To4_kernel<int4, ElemT, element_in_Tio><<<grid, block, 0, stream>>>
           (NI, HI, WI,
           (const int4 *)in_ptr,
           (int4 *)out_ptr,
@@ -133,8 +135,8 @@
 }
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* out_ptr,
+    void* in_ptr,
+    void* out_ptr,
     int64_t* batch,
     int64_t* in_h,
     int64_t* in_w,
@@ -172,6 +174,10 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -186,9 +192,12 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         y_dim2="*out_w",
     )
     shape_func = shape_eval_func + shape_save_func
-    exec_paths = EXEC_TEMPLATE.render()
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
     return SRC_TEMPLATE.render(
-        function_name=func_name, shape_function=shape_func, exec_paths=exec_paths
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/padding/nhwc3to8.py b/python/aitemplate/backend/cuda/padding/nhwc3to8.py
index 4bae4d217..5f66c9be7 100644
--- a/python/aitemplate/backend/cuda/padding/nhwc3to8.py
+++ b/python/aitemplate/backend/cuda/padding/nhwc3to8.py
@@ -18,14 +18,15 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
   int64_t*,
   int64_t*,
   int64_t*,
@@ -56,9 +57,9 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}nhwc3to8_launcher(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
+{{indent}}nhwc3to8_launcher<{{elem_input_type}}>(
+{{indent}}    static_cast<const {{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -74,11 +75,12 @@
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 
-// load 128 bit every time (8 half = 4 float)
+// load 128 bit every time (n ElemT = 4 float)
 // use as many as thread with factor of 3:
-// each time load num_thread * 8 half = num_thread / 3 * 8 * 3ch -> num_thread / 3 * 8 * 8ch
+// each time load num_thread * n ElemT = num_thread / 3 * n ElemT * 3ch ->
+// num_thread / 3 * n ElemT * n ElemT ch
 
-template<int num_thread>
+template<typename ElemT, int num_thread>
 __global__ void nhwc3to8_kernel(const float4* input,
                                 float4* output,
                                 const int NI,
@@ -86,10 +88,11 @@
                                 const int WI,
                                 const int max_in_elements,
                                 const int max_out_elements) {
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
   __shared__ float4 shared_mem[num_thread];
-  const int out_offset = num_thread * 8 / 3;
+  const int out_offset = num_thread * num_elem_t_in_float4 / 3;
   const float4 zero4 = {0.0f, 0.0f, 0.0f, 0.0f};
-  const half zero = static_cast<half>(0.f);
+  const ElemT zero = static_cast<ElemT>(0.f);
   const int in_idx = blockIdx.x * num_thread + threadIdx.x;
   const int tid = threadIdx.x;
 
@@ -99,32 +102,39 @@
   const int out_start_idx = blockIdx.x * out_offset;
   const int boundary = out_start_idx + out_offset > max_out_elements ? max_out_elements : out_start_idx + out_offset;
   for (int i = out_start_idx + tid, j = tid; i < boundary; i += num_thread, j += num_thread) {
-    const half* smem_element = (const half*)shared_mem + j * 3;
-    half tmp[8];
+    const ElemT* smem_element = (const ElemT*)shared_mem + j * 3;
+    ElemT tmp[num_elem_t_in_float4];
 
     #pragma unroll
-    for (int k = 0; k < 8; ++k) {
+    for (int k = 0; k < num_elem_t_in_float4; ++k) {
       tmp[k] = k < 3 ? smem_element[k] : zero;
     }
     output[i] = *((const float4*)tmp);
   }
 }
 
-void nhwc3to8_launcher(cutlass::half_t* in_ptr,
-                       cutlass::half_t* out_ptr,
+template <typename ElemT>
+void nhwc3to8_launcher(const ElemT* in_ptr,
+                       ElemT* out_ptr,
                        int NI,
                        int HI,
                        int WI,
                        cudaStream_t stream) {
-  const int nthread = 240;
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
+  constexpr int nthread = 240;
   const int NHW = NI * HI * WI;
-  // assert NHW % 8 == 0
-  // assert nthread % 3 == 0
-  const int max_in_elements = NHW * 3 / 8;
-  const int max_out_elements = NHW * 8 / 8;
+  if (NHW % num_elem_t_in_float4 != 0) {
+    throw std::runtime_error(
+        "NHW (" + std::to_string(NHW) + ") mod num_elem_t_in_float4 (" +
+        std::to_string(num_elem_t_in_float4) + ") is not 0"
+    );
+  }
+  static_assert(nthread % 3 == 0);
+  const int max_in_elements = NHW * 3 / num_elem_t_in_float4;
+  const int max_out_elements = NHW * num_elem_t_in_float4 / num_elem_t_in_float4;
   dim3 thread_block(nthread);
-  dim3 grid((NHW * 3 + nthread * 8 -1) / (nthread * 8));
-  nhwc3to8_kernel<nthread><<<grid, thread_block, 0, stream>>>(
+  dim3 grid((NHW * 3 + nthread * num_elem_t_in_float4 -1) / (nthread * num_elem_t_in_float4));
+  nhwc3to8_kernel<ElemT, nthread><<<grid, thread_block, 0, stream>>>(
     (const float4*)in_ptr,
     (float4*) out_ptr,
     NI,
@@ -136,8 +146,8 @@
 }
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* out_ptr,
+    void* in_ptr,
+    void* out_ptr,
     int64_t* batch,
     int64_t* in_h,
     int64_t* in_w,
@@ -175,6 +185,10 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -189,9 +203,12 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         y_dim2="*out_w",
     )
     shape_func = shape_eval_func + shape_save_func
-    exec_paths = EXEC_TEMPLATE.render()
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
     return SRC_TEMPLATE.render(
-        function_name=func_name, shape_function=shape_func, exec_paths=exec_paths
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/padding/pad_last_dim.py b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
index f26404af0..601da83ad 100644
--- a/python/aitemplate/backend/cuda/padding/pad_last_dim.py
+++ b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
@@ -18,14 +18,15 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
+  void*,
+  void*,
   {%for i in range(ndim)%}
   int64_t*,
   {% endfor %}
@@ -58,9 +59,9 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}padding4d_launcher(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
+{{indent}}padding4d_launcher<{{elem_input_type}}, {{elem_input_type2}}>(
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
 {%for i in range(4 - ndim)%}
 1,
 {% endfor %}
@@ -140,23 +141,24 @@
 }
 
 
-
-void padding4d_launcher(cutlass::half_t* in_ptr,
-                        cutlass::half_t* out_ptr,
+template <typename ElemT, typename ElemT2>
+void padding4d_launcher(ElemT* in_ptr,
+                        ElemT* out_ptr,
                         const int32_t x_dim0,
                         const int32_t x_dim1,
                         const int32_t x_dim2,
                         const int32_t x_dim3,
                         const int32_t out_dim,
                         cudaStream_t stream) {
+  static_assert(sizeof(ElemT2) % sizeof(ElemT) == 0);
   const int block_size = 256;
   if ((out_dim % 2) == 0 && (x_dim3 % 2) == 0) {
     int32_t total_elements = x_dim0 * x_dim1 * x_dim2 * x_dim3 / 2;
     dim3 grid((total_elements + 255) /  block_size);
     dim3 block(block_size);
-    const __half2 zero  = {0.0f, 0.0f};
-    padding4d_kernel<__half2><<<grid, block, 0, stream>>>(
-        (const __half2*)in_ptr, (__half2*)out_ptr,
+    const ElemT2 zero  = {0.0f, 0.0f};
+    padding4d_kernel<ElemT2><<<grid, block, 0, stream>>>(
+        reinterpret_cast<const ElemT2*>(in_ptr), reinterpret_cast<ElemT2*>(out_ptr),
         x_dim0, x_dim1, x_dim2, x_dim3 / 2,
         out_dim / 2,
         zero
@@ -165,9 +167,9 @@
     int32_t total_elements = x_dim0 * x_dim1 * x_dim2 * x_dim3;
     dim3 grid((total_elements + 255) /  block_size);
     dim3 block(block_size);
-    const __half zero = static_cast<__half>(0.f);
-    padding4d_kernel<__half><<<grid, block, 0, stream>>>(
-        (const __half*)in_ptr, (__half*)out_ptr,
+    const ElemT zero = static_cast<ElemT>(0.f);
+    padding4d_kernel<ElemT><<<grid, block, 0, stream>>>(
+        in_ptr, out_ptr,
         x_dim0, x_dim1, x_dim2, x_dim3,
         out_dim,
         zero
@@ -178,8 +180,8 @@
 } // namespace
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* out_ptr,
+    void* in_ptr,
+    void* out_ptr,
     {%for i in range(ndim)%}
     int64_t* x_dim{{i}},
     {% endfor %}
@@ -218,6 +220,15 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_input_type2 = None
+    if elem_input_type == "half":
+        elem_input_type2 = "half2"
+    else:
+        raise NotImplementedError(f"unsupported {elem_input_type=}")
     ndim = func_attrs["ndim"]
     xshape = ["*x_dim%d" % i for i in range(ndim)]
     shape_eval_func = shape_eval_template.render(
@@ -228,9 +239,15 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         indent="  ", shape=yshape, last_dim="*y_dim%d" % (ndim - 1)
     )
     shape_func = shape_eval_func + shape_save_func
-    exec_paths = EXEC_TEMPLATE.render(ndim=func_attrs["ndim"], indent="  ")
+    exec_paths = EXEC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_input_type2=elem_input_type2,
+        ndim=func_attrs["ndim"],
+        indent="  ",
+    )
     return SRC_TEMPLATE.render(
         function_name=func_name,
+        elem_input_type=elem_input_type,
         shape_function=shape_func,
         exec_paths=exec_paths,
         ndim=func_attrs["ndim"],
diff --git a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
index dbb4993bb..bc18b9a99 100644
--- a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
@@ -18,6 +18,8 @@
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import pool2d
 
@@ -27,8 +29,8 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}avg_pool_launcher<{{kernel_size}}, {{stride}}, {{padding}}>(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
+{{indent}}    static_cast<const {{dtype}}*>(in_ptr),
+{{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -96,7 +98,7 @@
 }
 
 template <int kernel_size, int stride, int padding>
-void avg_pool_launcher(cutlass::half_t* input,
+void avg_pool_launcher(const cutlass::half_t* input,
                       cutlass::half_t* output,
                       const int N,
                       const int H,
@@ -114,8 +116,8 @@
 } // namespace
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* out_ptr,
+    const void* in_ptr,
+    void* out_ptr,
     int64_t* batch,
     int64_t* in_h,
     int64_t* in_w,
@@ -144,7 +146,8 @@ def gen_function(
 ):
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
-
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -168,10 +171,11 @@ def gen_function(
     exec_paths = ""
     for key in exec_path:
         program = EXEC_TEMPLATE.render(
-            indent="    ",
+            indent=" " * 4,
             kernel_size=func_attrs["kernel_size"],
             padding=func_attrs["pad"],
             stride=func_attrs["stride"],
+            dtype=dtype,
         )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
diff --git a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
index cafef0937..2f1744a5e 100644
--- a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import pool2d
 
@@ -26,8 +28,8 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}max_pooling_launcher<{{kernel_size}}, {{stride}}, {{padding}}>(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
+{{indent}}    static_cast<const {{dtype}}*>(in_ptr),
+{{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -133,8 +135,8 @@
   }
 }
 
-template <int kernel_size, int stride, int pad>
-void max_pooling_launcher(cutlass::half_t* input,
+template<int kernel_size, int stride, int pad>
+void max_pooling_launcher(const cutlass::half_t* input,
                           cutlass::half_t* output,
                           int NI,
                           int HI,
@@ -142,7 +144,8 @@
                           int CI,
                           int HO,
                           int WO,
-                          cudaStream_t stream) {
+                          cudaStream_t stream)
+{
   const int block_ch = 4;
   const int block_w = 4;
   const int block_h = 4;
@@ -159,8 +162,8 @@
 } // namespace
 
 void {{function_name}} (
-    cutlass::half_t* in_ptr,
-    cutlass::half_t* out_ptr,
+    const void* in_ptr,
+    void* out_ptr,
     int64_t* batch,
     int64_t* in_h,
     int64_t* in_w,
@@ -189,7 +192,8 @@ def gen_function(
 ):
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
-
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -217,6 +221,7 @@ def gen_function(
             kernel_size=func_attrs["kernel_size"],
             padding=func_attrs["pad"],
             stride=func_attrs["stride"],
+            dtype=dtype,
         )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
diff --git a/python/aitemplate/backend/cuda/pool2d/pool2d.py b/python/aitemplate/backend/cuda/pool2d/pool2d.py
index 922536559..5c92c55f4 100644
--- a/python/aitemplate/backend/cuda/pool2d/pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/pool2d.py
@@ -20,8 +20,8 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
+  const void*,
+  void*,
   int64_t*,
   int64_t*,
   int64_t*,
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_3d.py b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
index 399d0e9f8..aa3fb6ccb 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_3d.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
@@ -24,10 +24,9 @@
 
 import jinja2
 
+from ...backend_spec import CUDASpec
 from ...common import tensor_accessor_codegen
 
-from .. import cuda_common
-
 from . import reduce_small_axis
 
 
@@ -62,8 +61,8 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_output_type}} * /*output*/,
-  {{elem_input_type}} * /*input*/,
+  void * /*output*/,
+  void * /*input*/,
   int /*reduction_axis*/,
   int64_t *[] /*output_shape*/,
   const int64_t * /*input_shape*/,
@@ -102,7 +101,9 @@
 {% for align in alignments %}
 {{indent}}  if (input_shape[reduction_axis] % {{align}} == 0) {
 {{indent}}    reduce_mean_launcher_RowMajor_{{align}}(
-{{indent}}      output, input, b, m, n, batch_stride_input, batch_stride_output, stream);
+{{indent}}      static_cast<{{elem_output_type}}*>(output),
+{{indent}}      static_cast<{{elem_input_type}}*>(input),
+{{indent}}      b, m, n, batch_stride_input, batch_stride_output, stream);
 {{indent}}    return;
 {{indent}}  }
 {% endfor %}
@@ -123,7 +124,9 @@
 {{indent}}    throw std::runtime_error("unreachable: invalid rank");
 {{indent}}  }
 {{indent}}  reduce_mean_launcher_ColumnMajor_1(
-{{indent}}    output, input, b, m, n, batch_stride_input, batch_stride_output, stream);
+{{indent}}    static_cast<{{elem_output_type}}*>(output),
+{{indent}}    static_cast<{{elem_input_type}}*>(input),
+{{indent}}    b, m, n, batch_stride_input, batch_stride_output, stream);
 {{indent}}  return;
 {{indent}}}
 #else
@@ -506,9 +509,10 @@
 {{reduce_kernel_instance}}
 
 {% for align in alignments %}
+template<typename ElementOutput, typename ElementInput>
 void reduce_mean_launcher_RowMajor_{{align}}(
-  {{elem_output_type}} *output,
-  {{elem_input_type}} *input,
+  ElementOutput *output,
+  ElementInput *input,
   int64_t batch_count,
   int64_t rows,
   int64_t columns,
@@ -558,9 +562,10 @@
 }
 {% endfor %}
 
+template<typename ElementOutput, typename ElementInput>
 void reduce_mean_launcher_ColumnMajor_1(
-  {{elem_output_type}} *output,
-  {{elem_input_type}} *input,
+  ElementOutput *output,
+  ElementInput *input,
   int64_t batch_count,
   int64_t rows,
   int64_t columns,
@@ -673,8 +678,8 @@
 }
 
 void {{func_name}}(
-  {{elem_output_type}} *output,
-  {{elem_input_type}} *input,
+  void *output,
+  void *input,
   int reduction_axis,
   int64_t *output_shape[],
   const int64_t *orig_input_shape,
@@ -783,13 +788,7 @@ def gen_function_decl(func_attrs) -> str:
     str
         returns the rendered function declaration with appropriate replacements
     """
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    return FUNC_DECL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        elem_input_type=cuda_common.dtype_to_cutlass_type(x._attrs["dtype"]),
-        elem_output_type=cuda_common.dtype_to_cutlass_type(y._attrs["dtype"]),
-    )
+    return FUNC_DECL_TEMPLATE.render(func_name=func_attrs["name"])
 
 
 def gen_function(
@@ -825,8 +824,9 @@ def gen_function(
     """
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
-    input_type = cuda_common.dtype_to_cutlass_type(x._attrs["dtype"])
-    output_type = cuda_common.dtype_to_cutlass_type(y._attrs["dtype"])
+    backend_spec = CUDASpec()
+    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     if accumulation_type is None:
         # follow pytorch's semantics
         acc_type = output_type
@@ -920,6 +920,9 @@ def gen_function(
     exec_paths = EXEC_COND_TEMPLATE.render(
         indent="  ",
         func_name=func_attrs["name"],
+        elem_output_type=output_type,
+        elem_input_type=input_type,
+        elem_compute_type=acc_type,
         alignments=alignments,
         special_exec_cond=special_exec_path,
     )
@@ -940,15 +943,11 @@ def gen_function(
         alignments=alignments,
         prologue_code=prologue_code,
         epilogue_scalar_code=epilogue_scalar_code,
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         special_kernel=special_kernel,
     )
 
     return SRC_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         kernel_source=kernel_src,
         exec_paths=exec_paths,
         output_accessor=output_accessors[0],
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_common.py b/python/aitemplate/backend/cuda/reduce/reduce_common.py
index fb96339be..ff8d65c12 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_common.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_common.py
@@ -18,17 +18,17 @@
 import jinja2
 
 from ....compiler.base import IntImm, IntVar
-from .. import cuda_common
+from ...backend_spec import CUDASpec
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_output_type}}* /*dst_ptr*/,
-  {{elem_input_type}}*  /*src_ptr*/,
-  int                   /*reduction_axis*/,
-  const int64_t*        /*shape*/,
-  const int             /*rank*/,
-  uint8_t*              /*workspace*/,
+  void*          /*dst_ptr*/,
+  void*          /*src_ptr*/,
+  int            /*reduction_axis*/,
+  const int64_t* /*shape*/,
+  const int      /*rank*/,
+  uint8_t*       /*workspace*/,
   cudaStream_t
 );
 """
@@ -38,8 +38,14 @@
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
 {{indent}}if (shape[rank - 1] % {{vector_length}} == 0) {
-{{indent}}  {{func_name}}_launcher<{{vector_length}}>(
-{{indent}}      dst_ptr, src_ptr, reduction_axis, shape, rank, workspace, stream);
+{{indent}}  {{func_name}}_launcher<{{elem_output_type}}, {{elem_input_type}}, {{vector_length}}>(
+{{indent}}      static_cast<{{elem_output_type}}*>(dst_ptr),
+{{indent}}      static_cast<{{elem_input_type}}*>(src_ptr),
+{{indent}}      reduction_axis,
+{{indent}}      shape,
+{{indent}}      rank,
+{{indent}}      workspace,
+{{indent}}      stream);
 {{indent}}  return;
 }
 """
@@ -67,10 +73,10 @@
     }                                                                                 \\
   }
 
-template <int VectorLength = 1>
+template <typename ElemOutputType, typename ElemInputType, int VectorLength = 1>
 void {{func_name}}_launcher(
-    {{elem_output_type}} *dst_ptr,
-    {{elem_input_type}} *src_ptr,
+    ElemOutputType *dst_ptr,
+    ElemInputType *src_ptr,
     int reduction_axis,
     const int64_t *shape,
     const int rank,
@@ -79,15 +85,17 @@
   // Instead of making our own 4D tensor definition,
   // we simply use TensoeNHWC as a 4D tensor
   using Layout = cutlass::layout::TensorNHWC;
-  using ElementCompute = {{elem_compute_type}};
+  // Match pytorch's behavior where the accumuation type is the same
+  // as the output type
+  using ElementCompute = ElemOutputType;
   using ReductionOp = {{reduction_op}}<ElementCompute>;
   constexpr int NUM_DIMS = 4;
   assert(rank <= NUM_DIMS);
   assert(reduction_axis < rank);
   assert(rank > 0);
   using TensorReduction = cutlass::reduction::device::TensorReduction<
-    {{elem_output_type}},
-    {{elem_input_type}},
+    ElemOutputType,
+    ElemInputType,
     Layout,
     ReductionOp,
     VectorLength,
@@ -134,8 +142,8 @@
 }
 #undef CUTLASS_CHECK_REDUCE
 void {{func_name}}(
-    {{elem_output_type}} *dst_ptr,
-    {{elem_input_type}}  *src_ptr,
+    void *dst_ptr,
+    void *src_ptr,
     int reduction_axis,
     const int64_t *shape,
     const int rank,
@@ -165,7 +173,7 @@
   {{indent}}    {{reduction_axis}},
   {{indent}}    shape,
   {{indent}}    {{rank}},
-  {{indent}}    global_workspace,
+  {{indent}}    global_workspace_,
   {{indent}}    stream
   {{indent}});
 {{indent}}}
@@ -174,39 +182,38 @@
 
 
 def gen_function_decl(func_attrs):
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_input_type=cuda_common.dtype_to_cutlass_type(x._attrs["dtype"]),
-        elem_output_type=cuda_common.dtype_to_cutlass_type(y._attrs["dtype"]),
     )
 
 
 def gen_function(func_attrs, reduction_op):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     vector_lens_config = [32, 16, 8, 4, 1]
     exec_paths = ""
     for vlen in vector_lens_config:
         exec_program = EXEC_COND_TEMPLATE.render(
-            func_name=func_attrs["name"], vector_length=vlen, indent="  "
+            func_name=func_attrs["name"],
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+            vector_length=vlen,
+            indent="  ",
         )
         exec_paths += exec_program
 
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    input_type = cuda_common.dtype_to_cutlass_type(x._attrs["dtype"])
-    output_type = cuda_common.dtype_to_cutlass_type(y._attrs["dtype"])
     if func_attrs.get("workspace", 0) > 0:
         workspace_ptr = "workspace"
     else:
         workspace_ptr = "nullptr"
     return SRC_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
-        # Match pytorch's behavior where the accumuation type is the same
-        # as the output type
-        elem_compute_type=output_type,
         reduction_op=reduction_op,
         exec_paths=exec_paths,
         workspace_ptr=workspace_ptr,
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
index 7c294a97c..8bf4c6713 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
@@ -43,8 +43,13 @@
 {{indent}}    } else {
 {{indent}}      throw std::runtime_error("reduce_small_axis: invalid rank rank");
 {{indent}}    }
-{{indent}}    reduce_mean_launcher_small_axis<cst_n>(
-{{indent}}          output, input, b, m, batch_stride_input,
+{{indent}}    reduce_mean_launcher_small_axis<{{elem_output_type}},
+{{indent}}                                    {{elem_input_type}},
+{{indent}}                                    {{elem_compute_type}},
+{{indent}}                                    cst_n>(
+{{indent}}          static_cast<{{elem_output_type}}*>(output),
+{{indent}}          static_cast<{{elem_input_type}}*>(input),
+{{indent}}          b, m, batch_stride_input,
 {{indent}}          batch_stride_output, stream);
 {{indent}}    return;
 {{indent}}  } else {
@@ -88,12 +93,12 @@
   size_t output_idx = block_batch * batch_stride_output + idx;
   ElemT *this_output = get_strided_address_at_idx<ElemT, ElemT>(output, output_idx);
 
-  assert(sizeof(ReadVecT) % sizeof(ElemT) == 0);
+  static_assert(sizeof(ReadVecT) % sizeof(ElemT) == 0);
   constexpr int n_read_elems_in_v = sizeof(ReadVecT) / sizeof(ElemT);
   // number of original elements
   constexpr int64_t num_elems_per_thread = num_rows_per_thread * num_cols;
   // number of vector elements
-  assert(num_elems_per_thread % n_read_elems_in_v == 0);
+  static_assert(num_elems_per_thread % n_read_elems_in_v == 0);
   constexpr int64_t num_elems_per_thread_v =
       num_elems_per_thread / n_read_elems_in_v;
 
@@ -122,10 +127,10 @@
   };
 
   ElemT reduced_elems[num_rows_per_thread];
-  assert(num_elems_per_thread % num_cols == 0);
+  static_assert(num_elems_per_thread % num_cols == 0);
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < num_elems_per_thread / num_cols; i++) {
-    assert(num_elems_per_thread % num_rows_per_thread == 0);
+    static_assert(num_elems_per_thread % num_rows_per_thread == 0);
     FragmentCompute frag_compute = FragmentCompute(0);
     CUTLASS_PRAGMA_UNROLL
     for (int64_t j = 0; j < num_cols; j++) {
@@ -159,10 +164,13 @@
 {% endif %}
 }
 
-template <int64_t num_cols>
+template <typename ElemOutputType,
+          typename ElemInputType,
+          typename ElemComputeType,
+          int64_t num_cols>
 void reduce_mean_launcher_small_axis(
-  {{elem_output_type}} *output,
-  {{elem_input_type}} *input,
+  ElemOutputType *output,
+  ElemInputType *input,
   int64_t num_batches,
   int64_t num_rows,
   int64_t batch_stride_input,
@@ -170,16 +178,16 @@
   cudaStream_t stream
 ) {
   constexpr int64_t num_read_v =
-      sizeof({{read_vec_type}}) / sizeof({{elem_input_type}});
+      sizeof({{read_vec_type}}) / sizeof(ElemInputType);
   constexpr int64_t row_gcd = std::gcd(num_cols, num_read_v);
   constexpr int64_t num_rows_per_thread = num_read_v / row_gcd;
 {% if output_accessor.is_contiguous %}
   constexpr int64_t num_write_bytes_v =
-      num_rows_per_thread * sizeof({{elem_output_type}});
+      num_rows_per_thread * sizeof(ElemOutputType);
 {% else %}
   constexpr int64_t num_write_bytes_v =
       std::min(num_rows_per_thread, static_cast<int64_t>({{output_access_alignment}})) *
-      sizeof({{elem_output_type}});
+      sizeof(ElemOutputType);
 {% endif %}
 
   assert(num_rows % num_rows_per_thread == 0);
@@ -191,8 +199,8 @@
 
 #define HANDLE_ONE_WRITE_VEC(write_bytes, write_vec_type) \\
     case write_bytes:                                     \\
-      reduce_small_in_v_out_v<{{elem_input_type}},        \\
-                              {{elem_compute_type}},      \\
+      reduce_small_in_v_out_v<ElemInputType,              \\
+                              ElemComputeType,            \\
                               {{read_vec_type}},          \\
                               write_vec_type,             \\
                               num_rows_per_thread,        \\
@@ -219,9 +227,10 @@
   LAUNCH_CHECK_REDUCE();
 }
 
+template <typename ElemOutputType, typename ElemInputType>
 void reduce_mean_launcher_small_axis_column_major(
-  {{elem_output_type}} *output,
-  {{elem_input_type}} *input,
+  ElemOutputType *output,
+  ElemInputType *input,
   int64_t num_batches,
   int64_t num_rows,
   int64_t num_columns,
@@ -396,6 +405,9 @@ def get_exec_cond_and_kernel(
     exec_cond = EXEC_COND_TEMPLATE.render(
         indent="  ",
         func_name=func_attrs["name"],
+        elem_output_type=output_type,
+        elem_input_type=input_type,
+        elem_compute_type=acc_type,
         reduction_dim_upperbound=reduction_dim_upperbound,
         reduction_dim_val=reduction_dim_val,
         static_small_reduction_dim=valid_static_small_reduction_dim,
@@ -415,9 +427,6 @@ def get_exec_cond_and_kernel(
         reduce_op=reduce_op,
         prologue_code=prologue_code,
         epilogue_scalar_code=epilogue_scalar_code,
-        elem_input_type=input_type,
-        elem_compute_type=acc_type,
-        elem_output_type=output_type,
         read_vec_type=read_vec_type,
         output_accessor=output_accessors[0],
         output_access_alignment=output_alignment,
diff --git a/python/aitemplate/backend/cuda/reduce/var.py b/python/aitemplate/backend/cuda/reduce/var.py
index 54d0468f2..7e61c8445 100644
--- a/python/aitemplate/backend/cuda/reduce/var.py
+++ b/python/aitemplate/backend/cuda/reduce/var.py
@@ -22,6 +22,7 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import reduce_3d
 
 
@@ -259,7 +260,11 @@ def var_gen_function(func_attrs) -> str:
         returns the rendered code for the complete implementation of this var op
     """
     bessel = "true" if func_attrs["unbiased"] else "false"
-    acc_type = "WelfordData<cutlass::half_t, {}>".format(bessel)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    acc_type = f"WelfordData<{elem_input_type}, {bessel}>"
     return reduce_3d.gen_function(
         func_attrs,
         "cutlass::welford_op",
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index 4e98321fc..68a0eec7e 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -24,17 +24,11 @@
 from ....compiler.base import IntImm
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from ...target import Target
 
 # pylint: disable=C0301, C0116
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<cutlass::half_t*>(&({{name}}->raw()))"
-)
-
-FUNC_CALL_FP32_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<float*>(&({{name}}->raw()))"
-)
 
 # input size: [M, K]
 # We put if else condition here to avoid long compilation time.
@@ -72,8 +66,11 @@
     const int m0_by_n_threads = m0 * n_threads;
     dim3 block(n_threads);
     dim3 grid((m + m0_by_n_threads - 1) / m0_by_n_threads);
+    Arguments<{{dtype}}> args = {
+      static_cast<{{dtype}}*>(input), static_cast<{{dtype}}*>(output)
+    };
     softmax_small_k<{{dtype}}, float4, n_threads, {{K}}, {{m}}>
-        <<<grid, block, 0, stream>>>({input, output}, m);
+        <<<grid, block, 0, stream>>>(args, m);
   {% elif K % 8 == 0 %}
     {% if K/8 <=32 %}
       int thread_group_width = -1;
@@ -233,8 +230,8 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}({{dtype}}* input,
-                   {{dtype}}* output,
+void {{func_name}}(void* input,
+                   void* output,
 {% for idx in range(input_ndim - 1) %}
                    int64_t* in_{{idx}},
 {% endfor %}
@@ -266,7 +263,6 @@ def get_func_signature(func_attrs: Dict[str, Any]) -> str:
     input_ndim = func_attrs["inputs"][0]._rank()
     return FUNC_SIGNATURE.render(
         func_name=func_attrs["name"],
-        dtype="cutlass::half_t",
         input_ndim=input_ndim,
     ).strip()
 
@@ -302,13 +298,17 @@ def softmax_gen_function(func_attrs: Dict[str, Any]) -> str:
 
     k = shapes[dim].value()
 
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
         custom_libs=Target.current().get_custom_libs(
             os.path.dirname(__file__), "softmax.cuh"
         ),
         func_signature=get_func_signature(func_attrs),
         shape_functions=SHAPE_FUNCTIONS.render(input_ndim=rank),
-        dtype="cutlass::half_t",
+        dtype=elem_input_type,
         K=k,
         m=find_tile_size(k),
     )
@@ -324,12 +324,8 @@ def softmax_gen_function_call(func_attrs, indent="  "):
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 1
 
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["inputs"][0]._attrs["name"]
-    )
-    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
-    )
+    input_name = func_attrs["inputs"][0]._attrs["name"]
+    output_name = func_attrs["outputs"][0]._attrs["name"]
 
     shapes = func_attrs["inputs"][0]._attrs["shape"]
     assert (
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 3b9e1ec84..5b8b462ab 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -15,14 +15,23 @@
 """
 CUDA target specialization
 """
-
+import json
 import os
+import pipes
 import re
 import shutil
 import sys
+import tempfile
 
+from pathlib import Path
 from typing import List
 
+from aitemplate.backend.profiler_cache import ProfileCacheDB
+
+from aitemplate.backend.target import TargetType
+
+from ...utils import logger
+
 from .. import registry
 from ..target import AIT_STATIC_FILES_PATH, CUTLASS_PATH, Target
 
@@ -78,6 +87,8 @@ def _build_compile_options(self):
             os.path.join(self._template_path, "include"),
             os.path.join(self._template_path, "tools/util/include"),
             os.path.join(self._template_path, "examples/35_gemm_softmax"),
+            os.path.join(self._template_path, "examples/42_fused_multi_head_attention"),
+            os.path.join(self._template_path, "examples/43_dual_gemm"),
             os.path.join(
                 flash_attention_path,
                 "./",
@@ -107,7 +118,11 @@ def _build_compile_options(self):
             "-I" + cutlass_path[2],
             "-I" + cutlass_path[3],
             "-I" + cutlass_path[4],
+            "-I" + cutlass_path[5],
+            "-I" + cutlass_path[6],
         ]
+        if self._ndebug == 1:
+            options.append("-DNDEBUG")
         return " ".join(options)
 
     def src_extension(self):
@@ -136,7 +151,11 @@ def __enter__(self):
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
-        if self.lib_folder and os.path.exists(self.lib_folder):
+        if (
+            self.lib_folder
+            and os.path.exists(self.lib_folder)
+            and not logger.is_debug()
+        ):
             shutil.rmtree(self.lib_folder)
 
     def cc(self):
@@ -163,7 +182,195 @@ def comp_func(name):
             args.append(int(align_args[-1]))
             return tuple(args)
 
-        return sorted(algo_names, key=comp_func)[0]
+        return min(algo_names, key=comp_func)
+
+
+class FBCUDA(CUDA):
+    """FBCUDA target. Used in Meta internal env only."""
+
+    nvcc_option_json = None
+    cutlass_path_ = None
+    compile_options_ = None
+
+    def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
+        from libfb.py import parutil
+
+        cutlass_src_path = parutil.get_dir_path(
+            "aitemplate/AITemplate/fb/3rdparty/cutlass"
+        )
+        cub_src_path = parutil.get_dir_path("aitemplate/AITemplate/fb/3rdparty/cub")
+        static_files_path = parutil.get_dir_path("aitemplate/AITemplate/static")
+        self._include_path = None
+        if not FBCUDA.cutlass_path_:
+            self._include_path = tempfile.mkdtemp()
+
+            FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
+            self.cub_path_ = self._include_path + "/cub"
+            shutil.copytree(cutlass_src_path, FBCUDA.cutlass_path_)
+            shutil.copytree(cub_src_path, self.cub_path_)
+
+            attention_src_path = parutil.get_dir_path(
+                "aitemplate/AITemplate/python/aitemplate/backend/cuda/attention/src"
+            )
+            attention_include_path = self._include_path + "/att_include"
+            shutil.copytree(attention_src_path, attention_include_path)
+        self.cutlass_path_ = FBCUDA.cutlass_path_
+
+        cutlass_lib_path = parutil.get_dir_path(
+            "aitemplate/AITemplate/python/aitemplate/utils/mk_cutlass_lib"
+        )
+        sys.path.append(cutlass_lib_path)
+
+        if not FBCUDA.nvcc_option_json:
+            convert_nvcc_json = parutil.get_file_path(
+                os.path.join("aitemplate/testing", "convert_nvcc_cmd")
+            )
+            logger.info(
+                __name__, f"Load the nvcc compile option from {convert_nvcc_json}"
+            )
+            with open(convert_nvcc_json, "r") as nvcc_option_json:
+                FBCUDA.nvcc_option_json = json.load(nvcc_option_json)
+        self.nvcc_options_json = FBCUDA.nvcc_option_json
+
+        self.remote_cache_bytes = remote_cache_bytes
+        super().__init__(self.cutlass_path_, static_files_path, arch, **kwargs)
+
+    def _build_compile_options(self):
+        if not FBCUDA.compile_options_:
+            cutlass_path = [
+                os.path.join(self._template_path, "include"),
+                os.path.join(self._template_path, "tools/util/include"),
+                os.path.join(self._template_path, "examples/35_gemm_softmax"),
+                os.path.join(
+                    self._template_path, "examples/42_fused_multi_head_attention"
+                ),
+                os.path.join(self._template_path, "examples/43_dual_gemm"),
+                os.path.join(self._template_path, "../att_include"),
+                os.path.join(self._template_path, "../att_include/fmha"),
+                os.path.join(self._template_path, "../cub"),
+            ]
+            fb_include_path = os.path.join(self._include_path, "fb_include")
+            pp_args = self.nvcc_options_json["pp_args"]
+            with open(fb_include_path, "w") as fb_include:
+                for arg in pp_args:
+                    fb_include.write(pipes.quote(arg) + "\n")
+            options = self.nvcc_options_json["args"] + [
+                "-I" + cutlass_path[0],
+                "-I" + cutlass_path[1],
+                "-I" + cutlass_path[2],
+                "-I" + cutlass_path[3],
+                "-I" + cutlass_path[4],
+                "-I" + cutlass_path[5],
+                "-I" + cutlass_path[6],
+                f"-Xcompiler '-Wp\,@{fb_include_path}'",  # noqa: W605
+                "-Xcompiler -Wno-strict-aliasing",
+                "-Xcompiler -Wno-narrowing",
+                "-Xcompiler -Wno-error=maybe-uninitialized",
+                "-Xcompiler -Wno-uninitialized",
+                "-Xcompiler -Wno-error=array-bounds",
+                "-Xcompiler -fPIC",
+                "-Xcompiler -fvisibility=hidden",
+                "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+                "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
+                "-w",
+                "--expt-relaxed-constexpr",
+                "--use_fast_math",
+                "-gencode=arch=compute_%s,code=[sm_%s,compute_%s]"
+                % (self._arch, self._arch, self._arch),
+                "-Xcompiler=-Wconversion",
+                "-O3",
+                "-std=c++17",
+            ]
+            if self._ndebug == 1:
+                options.append("-DNDEBUG")
+            FBCUDA.compile_options_ = " ".join(options)
+        compile_options = FBCUDA.compile_options_
+        logger.debug(__name__, f"The compile options are: {compile_options}")
+        return compile_options
+
+    def __exit__(self, ptype, value, trace):
+        super().__exit__(ptype, value, trace)
+        if not logger.is_debug() and self._include_path:
+            shutil.rmtree(self._include_path)
+
+    def binary_compile_cmd(self):
+        """
+        There is no ld by default in the prod env. Instead, we use ld from the gvfs path.
+        """
+        ld = self.nvcc_options_json["ld"]
+        return " ".join([ld, "-r -b binary -o {target} {src}"])
+
+    def cc(self):
+        return self.nvcc_options_json["nvcc_bin"]
+
+    def make(self):
+        return self.nvcc_options_json.get("make_bin", super().make())
+
+    def compile_options(self):
+        return self._compile_options
+
+    def get_custom_libs(self, absolute_dir, filename) -> str:
+        def list_rindex(input_list, x):
+            for i in reversed(range(len(input_list))):
+                if input_list[i] == x:
+                    return i
+            raise ValueError("{} is not in list".format(x))
+
+        from libfb.py import parutil
+
+        absolute_dir = os.path.normpath(absolute_dir)
+        dir_parts = Path(absolute_dir).parts
+        relative_path = Path(
+            "/".join(dir_parts[list_rindex(dir_parts, "aitemplate") :]) + "/" + filename
+        )
+        f_name = parutil.get_dir_path(relative_path)
+        with open(f_name) as f:
+            res = f.read()
+            return res
+
+    def in_ci_env(self):
+        return (
+            os.environ.get("INSIDE_RE_WORKER", None) == "1" and not self.trick_ci_env()
+        )
+
+    @classmethod
+    def remote_logger(cls, record):
+        """
+        Upload the record to Scuba table
+        """
+        # Only upload when force_profile or trick_ci_env is specified.
+        # i.e. FORCE_PROFILE=1 or -c aitemplate.force_profile=true or TRICK_CI_ENV=1
+        # Otherwise, dummy profiling records are not useful.
+        if cls.force_profile(cls) or cls.trick_ci_env(cls):
+            from aitemplate.AITemplate.fb.remote_logger import AITemplateRemoteLogger
+
+            try:
+                AITemplateRemoteLogger.log(record)
+            except Exception as e:
+                logger.info(__name__, f"remote_logger failed: {e}")
+
+    def _load_profile_cache(self):
+        """Load local profile cache for this target."""
+        cache_path = self._prepare_profile_cache_path()
+        if cache_path is None:
+            return
+
+        if self.remote_cache_bytes is not None:
+            logger.info(
+                __name__,
+                f"Loading profile cache from provided cache content with length {len(self.remote_cache_bytes)}",
+            )
+            with open(cache_path, "wb") as f:
+                f.write(self.remote_cache_bytes)
+        logger.info(__name__, f"Loading profile cache from: {cache_path}")
+        self._profile_cache = ProfileCacheDB(
+            TargetType(self._target_type).name, path=cache_path
+        )
+
+
+@registry.reg("fb.cuda.create_target")
+def create_target_fb(arch, **kwargs):
+    return FBCUDA(arch=arch, **kwargs)
 
 
 @registry.reg("cuda.create_target")
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index da8747d41..a0f93b8fe 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -23,6 +23,7 @@
     dynamic_slice,
     expand,
     gather,
+    permute,
     permute021,
     permute102,
     permute210,
@@ -40,6 +41,7 @@
     "dynamic_slice",
     "expand",
     "gather",
+    "permute",
     "permute021",
     "permute102",
     "permute210",
diff --git a/python/aitemplate/backend/cuda/tensor/gather.py b/python/aitemplate/backend/cuda/tensor/gather.py
index 3ef41b477..22fdaf8d0 100644
--- a/python/aitemplate/backend/cuda/tensor/gather.py
+++ b/python/aitemplate/backend/cuda/tensor/gather.py
@@ -20,25 +20,15 @@
 from ... import registry
 from .. import cuda_common
 
-CAST_TO_CONST_HALF_PTR_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<const half*>(&({{name}}->raw()))"
-)
-
-
 CAST_TO_CONST_INDEX_PTR_TEMPLATE = jinja2.Template(
     "reinterpret_cast<const {{index_type}}*>({{name}})"
 )
 
-
-CAST_TO_HALF_PTR_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
-
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-    {{elem_output_type}} * /*output*/,
-    const {{elem_input_type}} * /*input*/,
+    void * /*output*/,
+    const void * /*input*/,
     const {{index_type}} * /*indices*/,
     int64_t *[] /*output_shape*/,
     const int64_t * /*input_shape*/,
@@ -133,10 +123,10 @@
 
   constexpr unsigned read_t_sz = sizeof(READ_T);
   constexpr unsigned elem_t_sz = sizeof(ELEM_T);
-  assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
+  static_assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0));
   constexpr int n_of_elem_t = read_t_sz / elem_t_sz;
-  assert(sizeof(READ_INDEX_T) % sizeof(INDEX_TYPE) == 0);
-  assert(n_of_elem_t == (sizeof(READ_INDEX_T) / sizeof(INDEX_TYPE)));
+  static_assert(sizeof(READ_INDEX_T) % sizeof(INDEX_TYPE) == 0);
+  static_assert(n_of_elem_t == (sizeof(READ_INDEX_T) / sizeof(INDEX_TYPE)));
   // number of READ_T elements per thread
   constexpr int reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t;
   const int num_elems_in_read_t = num_output_elems / n_of_elem_t;
@@ -230,7 +220,7 @@
                                    {{rank}}/*Rank*/,
                                    {{elems_per_thread}}/*ElemsPerThread*/,
 {{indent}}                         {{threads_per_block}}/*THREADS_PER_BLOCK*/>(
-{{indent}}    output, input, indices, input_shape, index_shape, gather_dim, stream);
+{{indent}}    static_cast<{{elem_type}}*>(output), static_cast<const {{elem_type}}*>(input), indices, input_shape, index_shape, gather_dim, stream);
 {{indent}}  return;
 {{indent}}}
 """
@@ -242,8 +232,8 @@
 {{kernel_src}}
 
 void {{func_name}}(
-    {{elem_output_type}}* output,
-    const {{elem_input_type}}* input,
+    void* output,
+    const void* input,
     const INDEX_TYPE* indices,
     int64_t *output_shape[],
     const int64_t *input_shape,
@@ -327,16 +317,10 @@
 @registry.reg("cuda.gather.func_decl")
 def gen_function_decl(func_attrs):
     inputs = func_attrs["inputs"]
-    x = inputs[0]
     index = inputs[1]
-    y = func_attrs["outputs"][0]
-    input_type = cuda_common.dtype_to_cuda_type(x._attrs["dtype"])
     index_type = cuda_common.dtype_to_cuda_type(index._attrs["dtype"])
-    output_type = cuda_common.dtype_to_cuda_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        elem_output_type=output_type,
-        elem_input_type=input_type,
         index_type=index_type,
     )
 
@@ -353,6 +337,9 @@ def gen_function(func_attrs):
     index_type = cuda_common.dtype_to_cuda_type(index._attrs["dtype"])
     output_type = cuda_common.dtype_to_cuda_type(y._attrs["dtype"])
 
+    if input_type != output_type:
+        raise TypeError("input type must equal to output type")
+
     # TODO: consider to add profiling paths for tuning
     # elems_per_thread and threads_per_block
     exec_paths = EXEC_COND_TEMPLATE.render(
@@ -367,8 +354,6 @@ def gen_function(func_attrs):
     return SRC_TEMPLATE.render(
         kernel_src=kernel_src,
         func_name=func_attrs["name"],
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         exec_paths=exec_paths,
     )
 
@@ -389,11 +374,9 @@ def _dims(t, ref=""):
     y_dims = _dims(y, ref="&")
 
     index_type = cuda_common.dtype_to_cuda_type(index._attrs["dtype"])
-    casted_x_ptr = CAST_TO_CONST_HALF_PTR_TEMPLATE.render(name=x._attrs["name"])
     casted_index_ptr = CAST_TO_CONST_INDEX_PTR_TEMPLATE.render(
         index_type=index_type, name=index._attrs["name"]
     )
-    casted_y_ptr = CAST_TO_HALF_PTR_TEMPLATE.render(name=y._attrs["name"])
 
     return FUNC_CALL_TEMPLATE.render(
         indent=indent,
@@ -404,8 +387,8 @@ def _dims(t, ref=""):
         output_dims=y_dims,
         input_dims=x_dims,
         index_dims=index_dims,
-        output_ptr=casted_y_ptr,
-        input_ptr=casted_x_ptr,
+        output_ptr=y._attrs["name"],
+        input_ptr=x._attrs["name"],
         index_ptr=casted_index_ptr,
         gather_dim=gather_dim,
         rank=len(x._attrs["shape"]),
diff --git a/python/aitemplate/backend/cuda/tensor/permute.cuh b/python/aitemplate/backend/cuda/tensor/permute.cuh
new file mode 100644
index 000000000..595aef45c
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/permute.cuh
@@ -0,0 +1,369 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+/************** One flow generic permute implementation **************/
+// https://github.com/Oneflow-Inc/oneflow/blob/f0e9d38b2ba4ac535fd6de5dbeca4e3d2051de23/oneflow/core/ep/cuda/primitive/permute.cu
+// The following code fixed a bug in the original code related to vector
+// read/write.
+
+template <typename T, int N>
+class NdIndexOffsetHelper {
+ public:
+  CUTLASS_HOST_DEVICE NdIndexOffsetHelper() = default;
+
+  template <class... Ts>
+  CUTLASS_HOST_DEVICE explicit NdIndexOffsetHelper(T d0, Ts... dims) {
+    constexpr int n = 1 + sizeof...(dims);
+    static_assert(n <= N, "");
+    T dims_arr[n] = {d0, static_cast<T>(dims)...};
+    InitStrides(dims_arr, n);
+  }
+
+  CUTLASS_HOST_DEVICE explicit NdIndexOffsetHelper(const T* dims) {
+    InitStrides(dims, N);
+  }
+
+  template <typename U>
+  CUTLASS_HOST_DEVICE explicit NdIndexOffsetHelper(const U* dims) {
+    T dims_arr[N];
+    for (int i = 0; i < N; ++i) {
+      dims_arr[i] = dims[i];
+    }
+    InitStrides(dims_arr, N);
+  }
+
+  CUTLASS_HOST_DEVICE explicit NdIndexOffsetHelper(const T* dims, int n) {
+    InitStrides(dims, n);
+  }
+
+  template <typename U>
+  CUTLASS_HOST_DEVICE explicit NdIndexOffsetHelper(const U* dims, int n) {
+    T dims_arr[N];
+    for (int i = 0; i < N; ++i) {
+      if (i < n) {
+        dims_arr[i] = dims[i];
+      }
+    }
+    InitStrides(dims_arr, n);
+  }
+
+  ~NdIndexOffsetHelper() = default;
+
+  CUTLASS_HOST_DEVICE T NdIndexToOffset(const T* index) const {
+    T offset = 0;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < N; ++i) {
+      offset += index[i] * stride_[i];
+    }
+    return offset;
+  }
+
+  CUTLASS_HOST_DEVICE T NdIndexToOffset(const T* index, int n) const {
+    assert(n <= N);
+    T offset = 0;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < N; ++i) {
+      if (i < n) {
+        offset += index[i] * stride_[i];
+      }
+    }
+    return offset;
+  }
+
+  template <class... Ts>
+  CUTLASS_HOST_DEVICE T NdIndexToOffset(T d0, Ts... others) const {
+    constexpr int n = 1 + sizeof...(others);
+    static_assert(n <= N, "");
+    T index[n] = {d0, others...};
+    T offset = 0;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < n - 1; ++i) {
+      offset += index[i] * stride_[i];
+    }
+    if (n == N) {
+      offset += index[n - 1];
+    } else {
+      offset += index[n - 1] * stride_[n - 1];
+    }
+    return offset;
+  }
+
+  CUTLASS_HOST_DEVICE void OffsetToNdIndex(T offset, T* index) const {
+    T remaining = offset;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < N - 1; ++i) {
+      const T idx = remaining / stride_[i];
+      index[i] = idx;
+      remaining = remaining - idx * stride_[i];
+    }
+    index[N - 1] = remaining;
+  }
+
+  CUTLASS_HOST_DEVICE void OffsetToNdIndex(T offset, T* index, int n) const {
+    assert(n <= N);
+    T remaining = offset;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < N; ++i) {
+      if (i < n) {
+        const T idx = remaining / stride_[i];
+        index[i] = idx;
+        remaining = remaining - idx * stride_[i];
+      }
+    }
+  }
+
+  template <class... Ts>
+  CUTLASS_HOST_DEVICE void OffsetToNdIndex(T offset, T& d0, Ts&... others)
+      const {
+    constexpr int n = 1 + sizeof...(others);
+    static_assert(n <= N, "");
+    T* index[n] = {&d0, &others...};
+    T remaining = offset;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < n - 1; ++i) {
+      const T idx = remaining / stride_[i];
+      *index[i] = idx;
+      remaining = remaining - idx * stride_[i];
+    }
+    if (n == N) {
+      *index[n - 1] = remaining;
+    } else {
+      *index[n - 1] = remaining / stride_[n - 1];
+    }
+  }
+
+  CUTLASS_HOST_DEVICE constexpr int Size() const {
+    return N;
+  }
+
+ protected:
+  CUTLASS_HOST_DEVICE void InitStrides(const T* dims, const int n) {
+    for (int i = n - 1; i < N; ++i) {
+      stride_[i] = 1;
+    }
+    for (int i = n - 2; i >= 0; --i) {
+      stride_[i] = dims[i + 1] * stride_[i + 1];
+    }
+  }
+
+  T stride_[N];
+};
+
+template <size_t num_dims, typename IndexType>
+struct PermuteKernelParams {
+  NdIndexOffsetHelper<IndexType, num_dims> src_index_helper;
+  NdIndexOffsetHelper<IndexType, num_dims> dst_index_helper;
+  int permutation[num_dims]{};
+  IndexType count{};
+  const void* src{};
+  void* dst{};
+};
+
+template <size_t num_dims, typename IndexType>
+PermuteKernelParams<num_dims, IndexType> MakePermuteParams(
+    const int64_t* src_dims,
+    const void* src,
+    const int* permutation,
+    void* dst,
+    size_t count) {
+  PermuteKernelParams<num_dims, IndexType> params;
+  params.src_index_helper = NdIndexOffsetHelper<IndexType, num_dims>(src_dims);
+  int64_t dst_dims[num_dims];
+  for (size_t i = 0; i < num_dims; ++i) {
+    dst_dims[i] = src_dims[permutation[i]];
+  }
+  params.dst_index_helper = NdIndexOffsetHelper<IndexType, num_dims>(dst_dims);
+  for (size_t i = 0; i < num_dims; ++i) {
+    params.permutation[i] = permutation[i];
+  }
+  params.src = src;
+  params.dst = dst;
+  params.count = static_cast<IndexType>(count);
+  return params;
+}
+
+template <size_t num_dims, size_t movement_size, typename IndexType>
+__global__ void PermuteKernel(PermuteKernelParams<num_dims, IndexType> params) {
+  using T = typename std::aligned_storage<movement_size, movement_size>::type;
+  const T* src = reinterpret_cast<const T*>(params.src);
+  T* dst = reinterpret_cast<T*>(params.dst);
+
+  IndexType src_index[num_dims];
+  IndexType dst_index[num_dims];
+
+  IndexType start_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  IndexType step = blockDim.x * gridDim.x;
+  for (IndexType i = start_idx; i < params.count; i += step) {
+    params.dst_index_helper.OffsetToNdIndex(i, dst_index);
+#pragma unroll
+    for (size_t dim = 0; dim < num_dims; ++dim) {
+      src_index[params.permutation[dim]] = dst_index[dim];
+    }
+    IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
+    dst[i] = src[src_offset];
+  }
+}
+
+// SimplifyPermutation can be added to further improve perf
+
+template <size_t max_movement_size>
+size_t GetMovementSize(
+    size_t elem_size,
+    size_t num_dims,
+    const int64_t* src_dims,
+    const void* src,
+    const int* permutation,
+    void* dst) {
+  static_assert(
+      max_movement_size > 0 &&
+          (max_movement_size & (max_movement_size - 1)) == 0,
+      "");
+  CHECK_GT(elem_size, 0);
+  CHECK_EQ((elem_size & (elem_size - 1)), 0);
+  CHECK_EQ(max_movement_size % elem_size, 0);
+
+  if (permutation[num_dims - 1] == num_dims - 1) {
+    const int64_t last_dim_size = src_dims[num_dims - 1] * elem_size;
+    auto src_ptr = reinterpret_cast<std::uintptr_t>(src);
+    auto dst_ptr = reinterpret_cast<std::uintptr_t>(dst);
+    for (size_t size = max_movement_size; size > elem_size; size /= 2) {
+      if (last_dim_size % size == 0 && src_ptr % size == 0 &&
+          dst_ptr % size == 0) {
+        return size;
+      }
+    }
+  }
+  return elem_size;
+}
+
+const int32_t kCudaThreadsNumPerBlock = 512;
+const int32_t kCudaMaxBlocksNum = 8192;
+
+inline int64_t BlocksNum4ThreadsNum(
+    const int64_t n,
+    const int64_t num_threads_per_block = kCudaThreadsNumPerBlock) {
+  CHECK_GT(n, 0);
+  return std::min(
+      (n + num_threads_per_block - 1) / num_threads_per_block,
+      static_cast<int64_t>(kCudaMaxBlocksNum));
+}
+
+template <size_t num_dims, size_t movement_size, typename IndexType>
+void LaunchKernel(
+    const int64_t* src_dims,
+    const void* src,
+    const int* permutation,
+    void* dst,
+    size_t count,
+    cudaStream_t cuda_stream) {
+  PermuteKernelParams<num_dims, IndexType> params =
+      MakePermuteParams<num_dims, IndexType>(
+          src_dims, src, permutation, dst, count);
+
+  PermuteKernel<num_dims, movement_size, IndexType>
+      <<<BlocksNum4ThreadsNum(params.count),
+         std::min((int64_t)kCudaThreadsNumPerBlock, (int64_t)params.count),
+         0,
+         cuda_stream>>>(params);
+}
+
+template <size_t num_dims, size_t movement_size>
+void DispatchIndexType(
+    int64_t* src_dims,
+    const void* src,
+    const int* permutation,
+    void* dst,
+    cudaStream_t stream) {
+  // Vector read/write.
+  // This fixed a bug in the original oneflow code.
+  src_dims[num_dims - 1] = src_dims[num_dims - 1] * 2 / movement_size;
+
+  size_t count = 1;
+  for (size_t i = 0; i < num_dims; ++i) {
+    count *= src_dims[i];
+  }
+  if (count < std::numeric_limits<int32_t>::max()) {
+    LaunchKernel<num_dims, movement_size, int32_t>(
+        src_dims, src, permutation, dst, count, stream);
+  } else {
+    LaunchKernel<num_dims, movement_size, int64_t>(
+        src_dims, src, permutation, dst, count, stream);
+  }
+}
+
+template <size_t num_dims>
+void DispatchMovementSize(
+    size_t movement_size,
+    int64_t* src_dims,
+    const void* src,
+    const int* permutation,
+    void* dst,
+    cudaStream_t stream) {
+  void (*func)(
+      int64_t* /*src_dims*/,
+      const void* /*src*/,
+      const int* /*permutation*/,
+      void* /*dst*/,
+      cudaStream_t /*stream*/) = nullptr;
+  if (movement_size == 1) {
+    func = DispatchIndexType<num_dims, 1>;
+  } else if (movement_size == 2) {
+    func = DispatchIndexType<num_dims, 2>;
+  } else if (movement_size == 4) {
+    func = DispatchIndexType<num_dims, 4>;
+  } else if (movement_size == 8) {
+    func = DispatchIndexType<num_dims, 8>;
+  } else if (movement_size == 16) {
+    func = DispatchIndexType<num_dims, 16>;
+  } else {
+    throw std::runtime_error("unsupported movement_size for permute");
+  }
+  func(src_dims, src, permutation, dst, stream);
+}
+
+template <size_t num_dims, size_t elem_size>
+void invokePermute(
+    void* dst,
+    const void* src,
+    int64_t* src_dims,
+    const int* permutation,
+    cudaStream_t stream) {
+  if (!dst) {
+    throw std::runtime_error("dst is NULL!");
+  }
+  if (!src) {
+    throw std::runtime_error("src is NULL!");
+  }
+
+  // 2 bytes/half * 8 halves
+  constexpr size_t kMaxMovementSize = 16;
+  const size_t movement_size = GetMovementSize<kMaxMovementSize>(
+      elem_size, num_dims, src_dims, src, permutation, dst);
+  DispatchMovementSize<num_dims>(
+      movement_size, src_dims, src, permutation, dst, stream);
+}
diff --git a/python/aitemplate/backend/cuda/tensor/permute.py b/python/aitemplate/backend/cuda/tensor/permute.py
new file mode 100644
index 000000000..d22041264
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/permute.py
@@ -0,0 +1,183 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+permute for cuda
+"""
+import os
+from typing import Any, Dict
+
+import jinja2
+
+from ... import registry
+from ...target import Target
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  const void*,
+{% for _ in range(input_rank) %}
+  int64_t*,
+{% endfor %}
+  const int*,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{
+{{indent}}int dims[] = {{permutation}};
+{{indent}}{{func_name}}(
+{{indent}}    {{dst}},
+{{indent}}    {{src}},
+{% for dim in input_dims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{{indent}}    dims,
+{{indent}}    stream
+{{indent}});
+}
+"""
+)
+
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <cuda_fp16.h>
+#include "cutlass/arch/memory_sm80.h"
+#include "cutlass/cutlass.h"
+#include "logging.h"
+
+namespace {
+
+{{custom_libs}}
+
+} // namespace
+
+void {{func_name}}(
+  void* dst,
+  const void* src,
+{% for i in range(input_rank) %}
+  int64_t* dim_{{i}},
+{% endfor %}
+  const int* permutation,
+  cudaStream_t stream
+){
+    // invoke permute kernel
+    int64_t src_dims[] = {
+{% for i in range(input_rank - 1) %}
+  *dim_{{i}},
+{% endfor %}
+  *dim_{{input_rank - 1}}
+    };
+
+    invokePermute<{{input_rank}}, {{elem_size}}>(dst, src, src_dims, permutation, stream);
+}
+
+  """
+)
+
+
+@registry.reg("cuda.permute.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Attributes from Operator
+
+    Returns
+    -------
+    str
+        Source code for function generated.
+    """
+
+    func_name = func_attrs["name"]
+    x = func_attrs["inputs"][0]
+    rank = x._rank()
+
+    custom_libs = Target.current().get_custom_libs(
+        os.path.dirname(__file__), "permute.cuh"
+    )
+    dtype = x.dtype()
+    assert dtype == "float16", "permute kernel only supports fp16"
+    elem_size = 2
+    return SRC_TEMPLATE.render(
+        func_name=func_name,
+        custom_libs=custom_libs,
+        input_rank=rank,
+        elem_size=elem_size,
+    )
+
+
+@registry.reg("cuda.permute.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    backend_spec : class
+        specifies backend configs
+
+    Returns
+    -------
+    str
+        Function declaration
+    """
+
+    func_name = func_attrs["name"]
+    x = func_attrs["inputs"][0]
+    rank = x._rank()
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        input_rank=rank,
+    )
+
+
+@registry.reg("cuda.permute.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    indent : str, optional
+        Indentation for function call template, by default "  "
+
+    Returns
+    -------
+    str
+        Driver code for invoking call
+    """
+
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    input_dims = [f"&{dim._attrs['name']}" for dim in xshape]
+
+    y = func_attrs["outputs"][0]
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        dst=y._attrs["name"],
+        src=x._attrs["name"],
+        input_dims=input_dims,
+        permutation="{" + ",".join(str(dim) for dim in func_attrs["dims"]) + "}",
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d.py b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
index f1de127b5..b0acd9c61 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
@@ -41,10 +41,8 @@ def gen_function(
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
     backend_spec = CUDASpec()
     input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -66,7 +64,7 @@ def gen_function(
     shape_func = shape_eval_func + shape_save_func
     exec_paths = ""
     for key in exec_path:
-        program = upsampling2d_common.EXEC_TEMPLATE.render()
+        program = upsampling2d_common.EXEC_TEMPLATE.render(dtype=input_type)
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
@@ -76,8 +74,6 @@ def gen_function(
         exec_paths=exec_paths,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         half2_data_ref=half2_data_ref,
         mode=func_attrs["mode"],
         tsize=upsampling2d_common.gen_alignment(x),
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
index 9f7139189..f369a3ed2 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
@@ -41,10 +41,8 @@ def gen_function(
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
     backend_spec = CUDASpec()
     input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -66,7 +64,9 @@ def gen_function(
     shape_func = shape_eval_func + shape_save_func
     exec_paths = ""
     for key in exec_path:
-        program = upsampling2d_common.EXEC_TEMPLATE.render(bias_add=True)
+        program = upsampling2d_common.EXEC_TEMPLATE.render(
+            bias_add=True, dtype=input_type
+        )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
@@ -76,8 +76,6 @@ def gen_function(
         exec_paths=exec_paths,
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         half2_data_ref=half2_data_ref,
         mode=func_attrs["mode"],
         bias_add=True,
diff --git a/python/aitemplate/backend/cuda/view_ops/view_ops.py b/python/aitemplate/backend/cuda/view_ops/view_ops.py
index 17cbaaa6f..792f7b1de 100644
--- a/python/aitemplate/backend/cuda/view_ops/view_ops.py
+++ b/python/aitemplate/backend/cuda/view_ops/view_ops.py
@@ -85,15 +85,19 @@
 )
 
 
+def _is_intvar(func_attrs):
+    return func_attrs["is_intvar"] if "is_intvar" in func_attrs else False
+
+
 @registry.reg("cuda.reshape.gen_function")
 @registry.reg("cuda.flatten.gen_function")
 def reshape_gen_function(func_attrs, shape_eval_template):
     func_name = func_attrs["name"]
-
+    unknown_idx = func_attrs["unknown_idx"]
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
+    if _is_intvar(func_attrs):
+        input_ndim = len(func_attrs["inputs"]) - 1
     output_ndim = len(func_attrs["outputs"][0]._attrs["shape"])
-    unknown_idx = func_attrs["unknown_idx"]
-
     input_args = INPUT_ARGS_TEMPLATE.render(input_ndim=input_ndim)
     output_args = OUTPUT_ARGS_TEMPLATE.render(output_ndim=output_ndim)
 
@@ -118,6 +122,8 @@ def reshape_gen_function(func_attrs, shape_eval_template):
 def reshape_gen_function_decl(func_attrs):
     func_name = func_attrs["name"]
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
+    if _is_intvar(func_attrs):
+        input_ndim = len(func_attrs["inputs"]) - 1
     output_ndim = len(func_attrs["outputs"][0]._attrs["shape"])
 
     return FUNC_DECL_TEMPLATE.render(
@@ -129,9 +135,17 @@ def reshape_gen_function_decl(func_attrs):
 @registry.reg("cuda.flatten.func_call")
 def reshape_gen_function_call(func_attrs, indent="  "):
     func_name = func_attrs["name"]
-    input_names = [
-        shape._attrs["name"] for shape in func_attrs["inputs"][0]._attrs["shape"]
-    ]
+    input_names = []
+    if _is_intvar(func_attrs):
+        for i, inp in enumerate(func_attrs["inputs"]):
+            if i == 0:
+                continue
+            input_names.append(inp._attrs["int_var"]._attrs["name"])
+    else:
+        input_names = [
+            shape._attrs["name"] for shape in func_attrs["inputs"][0]._attrs["shape"]
+        ]
+
     output_names = [
         shape._attrs["name"] for shape in func_attrs["outputs"][0]._attrs["shape"]
     ]
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
index be18598d0..f5efb3df0 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
@@ -22,13 +22,10 @@
 import jinja2
 
 from .... import registry
+from ....backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    "reinterpret_cast<half*>(&({{name}}->raw()))"
-)
-
 FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
 
 FUNC_TEMPLATE = jinja2.Template(
@@ -48,7 +45,8 @@
 {{func_signature}}
 {
 
-    batched_nms_launcher<half>(0, instance_num, keep_n, iou_threshold, input, workspace, output, mask);
+    batched_nms_launcher<{{elem_input_type}}>(
+        0, instance_num, keep_n, iou_threshold, input, workspace, output, mask);
 }
     """
 )
@@ -56,7 +54,7 @@
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(int64_t* output,
-                   const half* input,
+                   const void* input,
                    const int instance_num,
                    const int keep_n,
                    const float iou_threshold,
@@ -80,7 +78,7 @@
 {{indent}}    {{keep_n}},
 {{indent}}    {{iou_threshold}},
 {{indent}}    {{mask}},
-{{indent}}    global_workspace, stream /* default stream */
+{{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
     """
 )
@@ -96,7 +94,12 @@ def get_custom_libs() -> str:
 
 @registry.reg("cuda.batched_nms.gen_function")
 def batched_nms_gen_function(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
     return FUNC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
         custom_libs=get_custom_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
     )
@@ -118,9 +121,7 @@ def batched_nms_gen_function_call(func_attrs, indent="  "):
     output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
         name=func_attrs["outputs"][0]._attrs["name"]
     )
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["inputs"][0]._attrs["name"]
-    )
+    input_name = func_attrs["inputs"][0]._attrs["name"]
     tmp_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
         name=func_attrs["inputs"][1]._attrs["name"]
     )
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
index 51d7032ae..2a44ce211 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
@@ -1,17 +1,3 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
 // CUDA batched_nms kernel
 
 int const threadsPerBlock = sizeof(unsigned long long int) * 8;
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
index cbbf6368f..1eb9dedd2 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
@@ -45,10 +45,8 @@ def gen_function(
     exec_path = func_attrs["exec_path"]
 
     x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
     backend_spec = CUDASpec()
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
+    library_dtype = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -80,6 +78,7 @@ def gen_function(
             spatial_scale=func_attrs["spatial_scale"],
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
+            library_dtype=library_dtype,
         )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
@@ -88,8 +87,6 @@ def gen_function(
         shape_function=shape_func,
         exec_paths=exec_paths,
         prefix=backend_spec.prefix,
-        elem_input_type=input_type,
-        elem_output_type=output_type,
         header_files=EXTRA_HEADER.render(),
         index_type=backend_spec.index_type,
         half2_data_ref=half2_data_ref,
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_ops.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_ops.py
index 550043c93..61c37c87d 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_ops.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_ops.py
@@ -20,9 +20,9 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
+  const void*,
+  const void*,
+  void*,
   int64_t*,
   int64_t*,
   int64_t*,
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index a8c00e82d..88af849a7 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -21,6 +21,9 @@
 MODEL_TEMPLATE = jinja2.Template(
     """
 #pragma once
+{% if debug_header %}
+#include "debug_utility.h"
+{% endif %}
 #include "logging.h"
 #include "device_functions-generated.h"
 #include "model_interface.h"
@@ -53,7 +56,9 @@
     throw std::runtime_error(msg);
   }
 }
-}
+
+thread_local bool target_has_graph_mode = {{ target_has_graph_mode }};
+} // namespace
 
 // Model is the class that actually performs inference. It owns memory for
 // intermediate tensors and dynamic dimensions. Constants are owned by
@@ -69,46 +74,65 @@ class Model {
       size_t num_inputs,
       size_t num_outputs,
       size_t num_unbound_constants,
-      uint8_t* constants)
-      : blob(RAII_DeviceMalloc(blob_size)),
-        workspace(RAII_DeviceMalloc(workspace_size)),
-        params(num_inputs + num_outputs + num_unbound_constants),
-        num_inputs(num_inputs),
-        constants(constants) {
+      uint8_t* constants,
+      AITemplateAllocator& allocator)
+      : blob_(RAII_DeviceMalloc(blob_size, allocator)),
+        workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
+        params_(num_inputs + num_outputs + num_unbound_constants),
+        num_inputs_(num_inputs),
+        num_outputs_(num_outputs),
+        constants_(constants) {
       dmlc::InitLogging("aitemplate"); // TODO(xxx): render network name
       LOG(INFO) << "Init AITemplate Runtime.";
-      global_workspace = static_cast<uint8_t*>(workspace.get()) + {{ unique_workspace_size }};
-      unique_workspace = static_cast<uint8_t*>(workspace.get());
-      DEVICE_CHECK(GetDevice(&device_idx))
-      DEVICE_CHECK(CreateEvent(&run_finished));
+      global_workspace_ = static_cast<uint8_t*>(workspace_.get()) + {{ unique_workspace_size }};
+      unique_workspace_ = static_cast<uint8_t*>(workspace_.get());
+      DEVICE_CHECK(GetDevice(&device_idx_))
+      DEVICE_CHECK(CreateEvent(&run_finished_));
 #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
       DEVICE_CHECK(cudaDeviceGetAttribute(
-        &max_smem_size, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx));
+        &max_smem_size_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx_));
 #endif
-      DEVICE_CHECK(GetDeviceProperties(&device_properties, device_idx));
-      DEVICE_CHECK(StreamCreate(&graph_capture_stream, /*non_blocking=*/true));
-
-  {{ set_up_constants }}
-      auto* blob_ptr = static_cast<uint8_t*>(blob.get());
+      DEVICE_CHECK(GetDeviceProperties(&device_properties_, device_idx_));
+      DEVICE_CHECK(StreamCreate(&graph_capture_stream_, /*non_blocking=*/true));
+  InitConstants(constants_);
+      auto* blob_ptr = static_cast<uint8_t*>(blob_.get());
   {{ tensor_slice }}
   {{ tensor_map_set }}
-  {{ set_up_param_dynamic_shapes }}
     }
 
     ~Model() {
-      DestroyEvent(run_finished);
-      StreamDestroy(graph_capture_stream);
-      if (graph_exec != nullptr) {
-        GraphExecDestroy(graph_exec);
+      if (run_finished_ != nullptr) {
+        DestroyEvent(run_finished_);
       }
-      if (graph != nullptr) {
-        GraphDestroy(graph);
+      if (graph_capture_stream_ != nullptr) {
+        StreamDestroy(graph_capture_stream_);
+      }
+      if (graph_exec_ != nullptr) {
+        GraphExecDestroy(graph_exec_);
       }
     }
 
-    Model(Model&&) = default;
-    Model& operator=(Model&&) = default;
+    Model(Model&& other) {
+      run_finished_ = other.run_finished_;
+      graph_exec_ = other.graph_exec_;
+      graph_capture_stream_ = other.graph_capture_stream_;
+      other.run_finished_ = nullptr;
+      other.graph_exec_ = nullptr;
+      other.graph_capture_stream_ = nullptr;
+
+      constants_ = other.constants_;
+      num_inputs_ = other.num_inputs_;
+      global_workspace_ = other.global_workspace_;
+      unique_workspace_ = other.unique_workspace_;
+      workspace_ = std::move(other.workspace_);
+
+      params_ = std::move(other.params_);
+      constant_name_to_ptr_ = std::move(other.constant_name_to_ptr_);
+      // Re-wire the pointers in the above 2 structures.
+      InitConstants(constants_);
+    }
 
+    Model& operator=(Model&&) = delete;
     Model(const Model&) = delete;
     Model& operator=(const Model&) = delete;
 
@@ -119,7 +143,6 @@ class Model {
     void DeviceToDeviceCopies(StreamType stream) {
   {{ device_to_device_copies }}
     }
-
     void Run(StreamType stream, bool graph_mode) {
       SetUpInputsOutputs();
       if (target_has_graph_mode && graph_mode) {
@@ -127,7 +150,7 @@ class Model {
       } else {
         RunImpl(stream);
       }
-      DEVICE_CHECK(EventRecord(run_finished, stream));
+      DEVICE_CHECK(EventRecord(run_finished_, stream));
     }
 
     void RunImpl(StreamType stream) {
@@ -139,7 +162,7 @@ class Model {
     }
 
     bool IsPending() {
-      auto query = QueryEvent(run_finished);
+      auto query = QueryEvent(run_finished_);
       if (query == GetDeviceNotReady()) {
         return true;
       }
@@ -151,19 +174,19 @@ class Model {
     }
 
     void WaitForCompletion() {
-      DEVICE_CHECK(EventSynchronize(run_finished));
+      DEVICE_CHECK(EventSynchronize(run_finished_));
     }
 
     size_t NumInputs() const {
-      return num_inputs;
+      return num_inputs_;
     }
 
     size_t NumOutputs() const {
-      return params.size() - num_inputs;
+      return num_outputs_;
     }
 
     void SetParam(const void* src, size_t param_idx) {
-      CHECK_VECTOR_ACCESS(params, param_idx)
+      CHECK_VECTOR_ACCESS(params_, param_idx)
       // const_cast is not ideal here, but it is unfortunately
       // necessary:
       // 1) We store outputs and inputs in the same vector,
@@ -172,7 +195,7 @@ class Model {
       //    require non-const pointers). So even if we put const
       //    pointers into params, a const_cast would be required
       //    somewhere else.
-      params[param_idx].ptr = const_cast<void*>(src);
+      params_[param_idx].ptr = const_cast<void*>(src);
     }
 
     void SetInput(const void* src, const AITemplateParamShape& shape, size_t idx) {
@@ -181,7 +204,7 @@ class Model {
     }
 
     void SetOutput(void* src, size_t idx) {
-      SetParam(src, idx + num_inputs);
+      SetParam(src, idx + num_inputs_);
     }
 
     // Write the (possibly dynamic) output shape to the given pointer.
@@ -189,9 +212,9 @@ class Model {
     // Run() is finished. output_shape_out should be able to store
     // at least GetOutputMaximumShape(idx).size values.
     void GetOutputShape(size_t idx, int64_t* output_shape_out) {
-      const auto param_idx = idx + num_inputs;
-      CHECK_VECTOR_ACCESS(params, param_idx);
-      const auto& shape_ptrs = params[param_idx].shape_ptrs;
+      const auto param_idx = idx + num_inputs_;
+      CHECK_VECTOR_ACCESS(params_, param_idx);
+      const auto& shape_ptrs = params_[param_idx].shape_ptrs;
       for (size_t i = 0; i < shape_ptrs.size(); ++i) {
         output_shape_out[i] = shape_ptrs[i].GetValue();
       }
@@ -207,8 +230,13 @@ class Model {
     }
 
   private:
+    void InitConstants(uint8_t* constants) {
+      {{ set_up_constants }}
+      {{ set_up_param_dynamic_shapes }}
+    }
+
     void SetInputShape(const AITemplateParamShape& shape, size_t idx) {
-      auto& param = params[idx];
+      auto& param = params_[idx];
       if (shape.size != param.shape_ptrs.size()) {
         throw std::runtime_error(
           "[SetInputShape] Got wrong param shape for input " + std::to_string(idx) +
@@ -220,47 +248,76 @@ class Model {
       }
     }
 
+    DeviceError EndCapture(GraphType* graph_ptr) {
+      auto err = StreamEndCapture(graph_capture_stream_, graph_ptr);
+      if (err != GetDeviceSuccess()) {
+        // If we can't take the stream out of capture mode, something is probably
+        // wrong with CUDA graph for this model (e.g. there might have been an
+        // illegal capture mode operation). Disable graph mode to avoid such issues
+        // in future iterations.
+        target_has_graph_mode = false;
+        LOG(WARNING) << "Graph capture failed to end. Disabling graph mode.";
+        return err;
+      }
+      return GetDeviceSuccess();
+    }
+
     void RunAsGraph(StreamType stream) {
-      DEVICE_CHECK(StreamBeginCapture(graph_capture_stream));
+      DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
       try {
-        RunImpl(graph_capture_stream);
+        RunImpl(graph_capture_stream_);
       } catch (...) {
-        DEVICE_CHECK(StreamEndCapture(graph_capture_stream, &graph));
+        GraphType graph;
+        // No need to DEVICE_CHECK here, we want to see the original exception.
+        EndCapture(&graph);
+        if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
+          LOG(WARNING) << "Graph destruction failed while handling exception! Memory will be leaked.";
+        }
         throw;
       }
-      DEVICE_CHECK(StreamEndCapture(graph_capture_stream, &graph));
 
-      if (graph_exec == nullptr) {
-        DEVICE_CHECK(GraphInstantiate(&graph_exec, graph));
-      } else if (GraphExecUpdate(graph_exec, graph) != GetDeviceSuccess()) {
-        DEVICE_CHECK(GraphExecDestroy(graph_exec));
-        DEVICE_CHECK(GraphInstantiate(&graph_exec, graph));
+      // The following function ends the capture and creates a graph
+      // inside a unique_ptr that cleans up it when it goes out of scope.
+      // Note that it throws an exception if EndCapture fails.
+      auto graph = RAII_EndCaptureAndCreateGraph(
+        [this](GraphType* graph_ptr){ return EndCapture(graph_ptr); }
+      );
+
+      if (graph_exec_ == nullptr) {
+        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+      } else if (GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
+        // Consume the last cuda error, which may affect the next GraphExecLaunch
+        // call.
+        GetLastError();
+        DEVICE_CHECK(GraphExecDestroy(graph_exec_));
+        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
       }
 
-      DEVICE_CHECK(GraphExecLaunch(graph_exec, stream));
+      DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
     }
 
-    int device_idx;
-    int max_smem_size{0};
-    DevicePropertyType device_properties;
+    int device_idx_;
+    int max_smem_size_{0};
+    DevicePropertyType device_properties_;
     // This event tracks when the inference is finished
     // so that this Model may be reclaimed by its owning
     // ModelContainer.
-    EventType run_finished;
+    EventType run_finished_;
     // A blob of memory used for storing intermediate tensors.
-    GPUPtr blob;
+    GPUPtr blob_;
     // Memory for constants that were folded into the *.so. Unowned by Model,
     // owned by ModelContainer.
     // TODO: make this const. It can't be const right now because we derive
     // tensor pointers from it, and no tensor pointers are const.
-    uint8_t* constants;
-    size_t num_inputs;
+    uint8_t* constants_;
+    size_t num_inputs_;
+    size_t num_outputs_;
 
     // The workspace blob is used as scratch memory. See
     // _generate_workspace in memory planning for more information.
-    GPUPtr workspace;
-    uint8_t* global_workspace{nullptr};
-    uint8_t* unique_workspace{nullptr};
+    GPUPtr workspace_;
+    uint8_t* global_workspace_{nullptr};
+    uint8_t* unique_workspace_{nullptr};
 
     class ParamDim {
       public:
@@ -300,16 +357,12 @@ class ParamDim {
     // Contains info for all tensors marked as inputs
     // or outputs. The first num_inputs elements are the inputs.
     // Constants are not included.
-    std::vector<ParamInfo> params;
+    std::vector<ParamInfo> params_;
 
-    GraphExecType graph_exec = nullptr;
-    GraphType graph = nullptr;
-    StreamType graph_capture_stream;
+    GraphExecType graph_exec_ = nullptr;
+    StreamType graph_capture_stream_;
 
     std::unordered_map<std::string, const void**> constant_name_to_ptr_;
-
-    constexpr static bool target_has_graph_mode = {{ target_has_graph_mode }};
-
 {{ tensor_decl }}
 {{ dim_decl }}
 {{ function_state }}
@@ -335,8 +388,9 @@ class ParamDim {
     size_t num_inputs,
     size_t num_outputs,
     size_t num_unbound_constants,
-    size_t params_size)
-    : constants_(RAII_DeviceMalloc(params_size)),
+    size_t params_size,
+    AITemplateAllocator& allocator)
+    : constants_(RAII_DeviceMalloc(params_size, allocator)),
       num_params_(num_inputs + num_outputs + num_unbound_constants),
       param_names_(num_params_),
       param_dtypes_(num_params_),
@@ -358,7 +412,6 @@ class ParamDim {
   }
 
   auto* constants_ptr = static_cast<uint8_t*>(constants_.get());
-  DEVICE_CHECK(DeviceMemset(constants_ptr, 0, params_size));
   const auto binary_constants_bin_size = static_cast<size_t>(_binary_constants_bin_end - _binary_constants_bin_start);
   for (auto& constant_info : owned_constants) {
     auto* dst = constants_ptr + constant_info.internal_offset;
@@ -369,9 +422,9 @@ class ParamDim {
   }
 }
 
-ModelContainer* CreateModelContainer(size_t num_runtimes) {
-  // num_runtimes, blob_size, workspace_size, num_inputs, num_outputs, num_unbound_constants, param_size
-  return new ModelContainer(num_runtimes, {{blob_size}}, {{workspace_size}}, {{num_inputs}}, {{num_outputs}}, {{num_unbound_constants}}, {{param_size}});
+ModelContainer* CreateModelContainer(size_t num_runtimes, AITemplateAllocator& allocator) {
+  // num_runtimes, blob_size, workspace_size, num_inputs, num_outputs, num_unbound_constants, param_size, allocator
+  return new ModelContainer(num_runtimes, {{blob_size}}, {{workspace_size}}, {{num_inputs}}, {{num_outputs}}, {{num_unbound_constants}}, {{param_size}}, allocator);
 }
 } // namespace ait
 """
diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 389928dc3..4d8cd5e36 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -43,7 +43,7 @@ class CacheMode(enum.Enum):
 
 GEMM_INIT_TEMPLATE = jinja2.Template(
     """
- CREATE TABLE IF NOT EXISTS {{dev}}_gemm (
+ CREATE TABLE IF NOT EXISTS {{dev}}_gemm_{{version}} (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
   exec_entry VARCHAR(8192) NOT NULL,
   exec_entry_sha1 VARCHAR(64) NOT NULL,
@@ -71,7 +71,7 @@ class CacheMode(enum.Enum):
 GEMM_QUERY_TEMPLATE = jinja2.Template(
     """
 SELECT algo, workspace, split_k
-FROM {{dev}}_gemm
+FROM {{dev}}_gemm_{{version}}
 WHERE
 dtype_a={{dtype_a}} AND
 dtype_b={{dtype_b}} AND
@@ -90,7 +90,7 @@ class CacheMode(enum.Enum):
 
 GEMM_INSERT_TEMPLATE = jinja2.Template(
     """
-INSERT INTO {{dev}}_gemm (
+INSERT INTO {{dev}}_gemm_{{version}} (
     exec_entry,
     exec_entry_sha1,
     dtype_a,
@@ -129,16 +129,6 @@ class CacheMode(enum.Enum):
 """
 )
 
-GEMM_ENTRY_QUERY = jinja2.Template(
-    """
-SELECT id
-FROM {{dev}}_gemm
-WHERE
-op_type='{{op_type}}' AND
-exec_entry_sha1='{{exec_entry_sha1}}';
-"""
-)
-
 CONV_INIT_TEMPLATE = jinja2.Template(
     """
  CREATE TABLE IF NOT EXISTS {{dev}}_conv (
@@ -248,6 +238,142 @@ class CacheMode(enum.Enum):
 """
 )
 
+CONV3D_INIT_TEMPLATE = jinja2.Template(
+    """
+ CREATE TABLE IF NOT EXISTS {{dev}}_conv3d (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  exec_entry VARCHAR(8192) NOT NULL,
+  exec_entry_sha1 VARCHAR(64) NOT NULL,
+  dtype_a INTEGER NOT NULL,
+  dtype_b INTEGER NOT NULL,
+  dtype_c INTEGER NOT NULL,
+  dtype_acc INTEGER NOT NULL,
+  major_a INTEGER NOT NULL,
+  major_b INTEGER NOT NULL,
+  major_c INTEGER NOT NULL,
+  kd INTEGER NOT NULL,
+  kh INTEGER NOT NULL,
+  kw INTEGER NOT NULL,
+  co INTEGER NOT NULL,
+  stride_d INTEGER NOT NULL,
+  stride_h INTEGER NOT NULL,
+  stride_w INTEGER NOT NULL,
+  pad_d INTEGER NOT NULL,
+  pad_h INTEGER NOT NULL,
+  pad_w INTEGER NOT NULL,
+  dilate_d INTEGER NOT NULL,
+  dilate_h INTEGER NOT NULL,
+  dilate_w INTEGER NOT NULL,
+  op_type VARCHAR(512) NOT NULL,
+  epilogue VARCHAR(512) NOT NULL,
+  device VARCHAR(16) NOT NULL,
+  algo VARCHAR(512) NOT NULL,
+  workspace INTEGER DEFAULT 0,
+  duration FLOAT DEFAULT -1,
+  split_k INTEGER DEFAULT 1,
+  template_ver INTEGER NOT NULL DEFAULT 290,
+  created_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL
+);
+"""
+)
+
+CONV3D_QUERY_TEMPLATE = jinja2.Template(
+    """
+SELECT algo, workspace
+FROM {{dev}}_conv3d
+WHERE
+dtype_a={{dtype_a}} AND
+dtype_b={{dtype_b}} AND
+dtype_c={{dtype_c}} AND
+dtype_acc={{dtype_acc}} AND
+major_a={{major_a}} AND
+major_b={{major_b}} AND
+major_c={{major_c}} AND
+kd={{kd}} AND
+kh={{kh}} AND
+kw={{kw}} AND
+co={{co}} AND
+stride_d={{stride_d}} AND
+stride_h={{stride_h}} AND
+stride_w={{stride_w}} AND
+pad_d={{pad_d}} AND
+pad_h={{pad_h}} AND
+pad_w={{pad_w}} AND
+dilate_d={{dilate_d}} AND
+dilate_h={{dilate_h}} AND
+dilate_w={{dilate_w}} AND
+op_type='{{op_type}}' AND
+device='{{device}}' AND
+epilogue={{epilogue}} AND
+split_k={{split_k}} AND
+exec_entry_sha1='{{exec_entry_sha1}}';
+"""
+)
+
+CONV3D_INSERT_TEMPLATE = jinja2.Template(
+    """
+INSERT INTO {{dev}}_conv3d (
+    exec_entry,
+    exec_entry_sha1,
+    dtype_a,
+    dtype_b,
+    dtype_c,
+    dtype_acc,
+    major_a,
+    major_b,
+    major_c,
+    kd,
+    kh,
+    kw,
+    co,
+    stride_d,
+    stride_h,
+    stride_w,
+    pad_d,
+    pad_h,
+    pad_w,
+    dilate_d,
+    dilate_h,
+    dilate_w,
+    op_type,
+    epilogue,
+    device,
+    algo,
+    workspace,
+    split_k
+)
+VALUES (
+    '{{exec_entry}}',
+    '{{exec_entry_sha1}}',
+    {{dtype_a}},
+    {{dtype_b}},
+    {{dtype_c}},
+    {{dtype_acc}},
+    {{major_a}},
+    {{major_b}},
+    {{major_c}},
+    {{kd}},
+    {{kh}},
+    {{kw}},
+    {{co}},
+    {{stride_d}},
+    {{stride_h}},
+    {{stride_w}},
+    {{pad_d}},
+    {{pad_h}},
+    {{pad_w}},
+    {{dilate_d}},
+    {{dilate_h}},
+    {{dilate_w}},
+    '{{op_type}}',
+    {{epilogue}},
+    '{{device}}',
+    '{{algo}}',
+    {{workspace}},
+    {{split_k}}
+);
+"""
+)
 
 NORM_INIT_TEMPLATE = jinja2.Template(
     """
@@ -315,6 +441,20 @@ class CacheMode(enum.Enum):
 )
 
 
+CHECK_TABLE_EXISTENCE_TEMPLATE = jinja2.Template(
+    """
+SELECT name FROM sqlite_master WHERE type='table' AND name='{{table_name}}';
+"""
+)
+
+
+QUERY_ALL_TABLES_TEMPLATE = jinja2.Template(
+    """
+SELECT name FROM sqlite_master WHERE type='table';
+"""
+)
+
+
 class ProfileCacheDB(object):
     r"""Local SQLite profile cache database."""
 
@@ -338,6 +478,19 @@ def __init__(
         self._target = target
         self._mode = CacheMode.LOCAL
         self._db_commit_flag = False
+        # Some design rationales:
+        #   * Each table maintains it own version number. This can avoid re-creating
+        #     tables that are not involved with the breaking changes.
+        #   * We only keep a single table (i.e. version) for each category (
+        #     gemm, conv and norm) to simplify how we handle breaking changes
+        #     and rollbacks caused by failures in the updated version.
+        #     For example, if we keep multiple versions (i.e. tables) for gemm,
+        #     we would have to consider how we were going to maintain those versions.
+        #     We could choose the old working version upon rollback, but we might
+        #     leave some content from the failing version in the db. How are we
+        #     going to update the db if we update the version again, and so on.
+        # TODO: add similar version control for conv and norm
+        self._gemm_cache_version = 1
         if uri is not None:
             self._mode = CacheMode.REMOTE
         if self._mode == CacheMode.LOCAL:
@@ -352,26 +505,87 @@ def _init_db(self):
         """Creates table in cache."""
         self._create_gemm_table()
         self._create_conv_table()
+        self._create_conv3d_table()
         self._create_norm_table()
 
+    def get_profile_gemm_cache_version(self) -> int:
+        return self._gemm_cache_version
+
     def _create_gemm_table(self):
         """Creates gemm table."""
-        sql = GEMM_INIT_TEMPLATE.render(dev=self._target)
+        if not self._gemm_table_version_matches():
+            logger.info(__name__, "temporarily keep old cache versions")
+            # FIXME: will delete unmatched version once we get into production
+            # self._delete_existing_table("gemm")
+
+        logger.info(
+            __name__,
+            f"Trying to make a new gemm table with {self._gemm_cache_version=}",
+        )
+        sql = GEMM_INIT_TEMPLATE.render(
+            dev=self._target, version=self._gemm_cache_version
+        )
         self._cur.execute(sql)
         self._con.commit()
 
+    def _delete_existing_table(self, table_kind):
+        """Delete an existing table in the db"""
+        sql = QUERY_ALL_TABLES_TEMPLATE.render()
+        self._cur.execute(sql)
+        all_tables = self._cur.fetchall()
+        if len(all_tables) == 0:
+            logger.info(__name__, "deleting table: skip empty table")
+            return
+
+        target_tables = [
+            table[0]
+            for table in all_tables
+            if table[0].startswith(f"{self._target}_{table_kind}")
+        ]
+        assert len(target_tables) != 0, f"no {table_kind} table exists"
+        # To simplify the logic, we only keep a single table for each kind
+        assert (
+            len(target_tables) == 1
+        ), f"expected only one {table_kind} table but got {target_tables=}"
+        logger.info(__name__, f"deleting table {target_tables[0]=}")
+        self._cur.execute(f"DROP TABLE {target_tables[0]}")
+
     def _create_conv_table(self):
         """Creates conv table."""
         sql = CONV_INIT_TEMPLATE.render(dev=self._target)
         self._cur.execute(sql)
         self._con.commit()
 
+    def _create_conv3d_table(self):
+        """Creates conv3d table."""
+        sql = CONV3D_INIT_TEMPLATE.render(dev=self._target)
+        self._cur.execute(sql)
+        self._con.commit()
+
     def _create_norm_table(self):
         """Creates conv table."""
         sql = NORM_INIT_TEMPLATE.render(dev=self._target)
         self._cur.execute(sql)
         self._con.commit()
 
+    def _if_table_exists(self, table_name):
+        """check if a table exists"""
+        sql = CHECK_TABLE_EXISTENCE_TEMPLATE.render(table_name=table_name)
+        self._cur.execute(sql)
+        tables = self._cur.fetchall()
+        return len(tables) > 0
+
+    def _gemm_table_version_matches(self):
+        table_name = f"{self._target}_gemm_{self._gemm_cache_version}"
+        if self._if_table_exists(table_name):
+            logger.info(__name__, f"{table_name=} exists in the db")
+            return True
+        else:
+            logger.info(
+                __name__, f"{table_name=} does not exist in the db, version mismatch!"
+            )
+            return False
+
     def _query(self, sql: str) -> Tuple[str, int]:
         """a function to query op from cache
 
@@ -410,7 +624,9 @@ def query_gemm(self, args: Dict[str, Any]) -> Tuple[str, int]:
         Tuple
             profiling results
         """
-        sql = GEMM_QUERY_TEMPLATE.render(dev=self._target, **args)
+        sql = GEMM_QUERY_TEMPLATE.render(
+            dev=self._target, version=self._gemm_cache_version, **args
+        )
         return self._query(sql)
 
     def query_conv(self, args: Dict[str, Any]) -> Tuple[str, int]:
@@ -430,6 +646,23 @@ def query_conv(self, args: Dict[str, Any]) -> Tuple[str, int]:
         sql = CONV_QUERY_TEMPLATE.render(dev=self._target, **args)
         return self._query(sql)
 
+    def query_conv3d(self, args: Dict[str, Any]) -> Tuple[str, int]:
+        """a function to query conv op epilogue from cache,
+        here we use the same sql table for conv and gemm
+
+        Parameters
+        ----------
+        args : Dict
+            Conv3d query entry
+
+        Returns
+        -------
+        Tuple
+            profiling results
+        """
+        sql = CONV3D_QUERY_TEMPLATE.render(dev=self._target, **args)
+        return self._query(sql)
+
     def query_normalization(self, args: Dict[str, Any]) -> Tuple[str, int]:
         """a function to query normalization op epilogue from cache
 
@@ -475,6 +708,7 @@ def insert_gemm(self, args: Dict[str, Any]) -> None:
         """
         query_sql = GEMM_QUERY_TEMPLATE.render(
             dev=self._target,
+            version=self._gemm_cache_version,
             dtype_a=args["dtype_a"],
             dtype_b=args["dtype_b"],
             dtype_c=args["dtype_c"],
@@ -489,7 +723,9 @@ def insert_gemm(self, args: Dict[str, Any]) -> None:
             pshape=args["pshape"],
             exec_entry_sha1=args["exec_entry_sha1"],
         )
-        insert_sql = GEMM_INSERT_TEMPLATE.render(dev=self._target, **args)
+        insert_sql = GEMM_INSERT_TEMPLATE.render(
+            dev=self._target, version=self._gemm_cache_version, **args
+        )
         self._insert(query_sql, insert_sql)
 
     def insert_conv(self, args: Dict[str, Any]) -> None:
@@ -526,6 +762,47 @@ def insert_conv(self, args: Dict[str, Any]) -> None:
         insert_sql = CONV_INSERT_TEMPLATE.render(dev=self._target, **args)
         self._insert(query_sql, insert_sql)
 
+    def insert_conv3d(self, args: Dict[str, Any]) -> None:
+        """a function to insert conv op epilogue into cache,
+        here we use the same sql table for conv and gemm
+
+        Parameters
+        ----------
+        args : Dict
+            Conv Record Entry
+
+        """
+        query_sql = CONV3D_QUERY_TEMPLATE.render(
+            dev=self._target,
+            dtype_a=args["dtype_a"],
+            dtype_b=args["dtype_b"],
+            dtype_c=args["dtype_c"],
+            dtype_acc=args["dtype_acc"],
+            major_a=args["major_a"],
+            major_b=args["major_b"],
+            major_c=args["major_c"],
+            kd=args["kd"],
+            kh=args["kh"],
+            kw=args["kw"],
+            co=args["co"],
+            stride_d=args["stride_d"],
+            stride_h=args["stride_h"],
+            stride_w=args["stride_w"],
+            pad_d=args["pad_d"],
+            pad_h=args["pad_h"],
+            pad_w=args["pad_w"],
+            dilate_d=args["dilate_d"],
+            dilate_h=args["dilate_h"],
+            dilate_w=args["dilate_w"],
+            op_type=args["op_type"],
+            device=args["device"],
+            epilogue=args["epilogue"],
+            split_k=args["split_k"],
+            exec_entry_sha1=args["exec_entry_sha1"],
+        )
+        insert_sql = CONV3D_INSERT_TEMPLATE.render(dev=self._target, **args)
+        self._insert(query_sql, insert_sql)
+
     def insert_normalization(self, args: Dict[str, Any]) -> None:
         """a function to insert conv op epilogue into cache,
         here we use the same sql table for conv and gemm
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index bcbce8bdd..add0ce75f 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -15,12 +15,17 @@
 """
 A subprocess based multiple GPUs runner for auto-tuning
 """
-
 from __future__ import annotations
 
+import concurrent
+import os
+
 import re
+import subprocess
 import typing
 from collections import namedtuple
+from queue import Queue
+from typing import Callable, List, Tuple
 
 from ..utils import logger
 from .target import Target
@@ -28,14 +33,49 @@
 
 # pylint: disable=W0221
 
+PROF_RUNTIME_PATTERN = re.compile(r"OP:([a-zA-Z0-9_]+),TIME:([\d\.]+),WS:([\d]+)")
+# FIXME: We will remove the following two patterns once we implement the
+# same profiling mechanism as gemm for conv and amd
 RUNTIME_PATTERN = re.compile(r"TIME:([\d\.]+)")
 WORKSPACE_PATTERN = re.compile(r"WS:([\d]+)")
 
-ProfileResult = namedtuple("ProfileResult", "duration workspace")
+ProfileResult = namedtuple("ProfileResult", "op_config duration workspace")
 """Object to store profiling result
 """
 
 
+def optimization_key(result):
+    return float(result[1])
+
+
+def extract_profile_result(stdout) -> Tuple[ProfileResult, bool]:
+    failed = False
+    try:
+        runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
+        if len(runtimes) > 0:
+            logger.debug(__name__, f"all runtimes (unsorted): {runtimes}")
+            # format - OP:xx,TIME:x.xx,WS:xx
+            best_runtime = min(runtimes, key=optimization_key)
+            op_config = best_runtime[0]
+            duration = float(best_runtime[1])
+            workspace = int(best_runtime[2])
+        else:
+            # FIXME: remove it once we unify our profiling mechanism for conv and amd
+            op_config = ""
+            duration = float(RUNTIME_PATTERN.findall(stdout)[0])
+            workspace = int(WORKSPACE_PATTERN.findall(stdout)[0])
+    except Exception:
+        duration = 0
+        workspace = 0
+        failed = True
+    return ProfileResult(op_config, duration, workspace), failed
+
+
+def update_inplace(d, new_d):
+    d.update(new_d)
+    return d
+
+
 def process_task(task: Task) -> None:
     """Extract kernel execution time and workspace from task process outputs
 
@@ -47,7 +87,6 @@ def process_task(task: Task) -> None:
     stdout = task._stdout
     stderr = task._stderr
     if len(stderr) > 0:
-        task._failed = True
         logger.debug(
             __name__,
             "Failed: [{name}][{algo}]\ncmd:\n{cmd}\nstderr:\n{stderr}".format(
@@ -57,15 +96,12 @@ def process_task(task: Task) -> None:
                 stderr=stderr,
             ),
         )
-    else:
-        duration = float(RUNTIME_PATTERN.findall(stdout)[0])
-        workspace = int(WORKSPACE_PATTERN.findall(stdout)[0])
-        task._ret = ProfileResult(duration, workspace)
-        logger.info(
+    task._ret, task._failed = extract_profile_result(stdout)
+    if not task._failed:
+        logger.debug(
             __name__,
-            "Successful: [{name}][{algo}]: TIME: {duration} WS:{ws}".format(
-                name=task._name, algo=task._idx, duration=duration, ws=workspace
-            ),
+            f"Successful: [{task._name}][{task._idx}]: OP: {task._ret.op_config} "
+            f"TIME: {task._ret.duration} WS:{task._ret.workspace}",
         )
 
 
@@ -121,3 +157,111 @@ def pull(self):
         """
         ret = super().pull(self._ftask_proc, self._fret_proc)
         return ret
+
+
+def run_task(cmds, queue, dev_select_flag):
+    # get device or block until one is available
+    device = queue.get()
+    logger.debug(__name__, f"running profiler {cmds=} on GPU #{device}")
+
+    completed_process = subprocess.run(
+        cmds,
+        env=update_inplace(os.environ.copy(), {dev_select_flag: device}),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        shell=False,
+    )
+    queue.put(device)
+    return completed_process.stdout, completed_process.stderr
+
+
+class ProfilerRunner:
+    """Another parallel runner to execute profilers on multiple GPUs in parallel
+    It uses a process pool for implementation, avoiding process creation overhead
+    The size of the process pool is equal to the number of provided GPUs,
+    so ~ideally~ each process should execute a profiler on its dedicated GPU.
+    This property hasn't been properly verified yet,
+    however, the results are empirically better compared to the previous runner.
+    """
+
+    def __init__(self, devices: List[str], timeout: int, postprocessing_delegate):
+        """
+        Parameters
+        ----------
+        devices : List[str]
+            device identifiers (contents of {CUDA,HIP}_VISIBLE_DEVICES)
+        timeout : int
+            timeout to wait for all profilers completion in seconds
+        postprocessing_delegate :
+            object responsible for postprocessing results after futures completion
+        """
+        if devices is None:
+            devices = [0]
+        # This queue is used to ensure only one task is executed on a device at a time
+        self._device_queue = Queue()
+        # This queue is used to ensure postprocessing in `join()` happens *after* done_callbacks complete
+        self._done_queue = Queue()
+        for d in devices:
+            self._device_queue.put(str(d))
+        logger.info(__name__, f"Initialized profiler runner with devices: {devices}")
+        self._timeout = timeout
+        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(devices))
+        self._futures = []
+        self._postprocessing_delegate = postprocessing_delegate
+        self._dev_select_flag = Target.current().dev_select_flag()
+
+    def push(self, cmds: List[str], process_result_callback: Callable):
+        """
+        Schedule the profiler for execution in a separate process,
+        Call the callback after subprocess completion
+
+        Parameters
+        ----------
+        cmds : List[str]
+            argv for the launched profiler
+        process_result_callback : Callable
+            Called after subprocess completion in the main process
+            (but possibly not main thread).
+            Currently used to aggregate profiler results,
+            so the callable takes `result` and `postprocessing_delegate` parameters
+            It is also used to propagate the profiler launch context to the aggregation point,
+            namely, split_k value for the gemm profilers
+        """
+        future = self._executor.submit(
+            run_task, cmds, self._device_queue, self._dev_select_flag
+        )
+
+        # done callbacks are used to collect profiler results for postprocessing
+        # they are launched asynchronously, in a separate thread,
+        # some time after a future holding profiler result completes
+        def callback_when_done(fut):
+            try:
+                stdout, stderr = fut.result()
+                profile_result, err = extract_profile_result(stdout)
+                if err:
+                    logger.error(
+                        f"Profiler failure!\nProfiler stdout: {stdout}\nProfiler stderr: {stderr}"
+                    )
+                    raise RuntimeError(f"Failed to extract profiler result for {cmds}")
+                process_result_callback(profile_result, self._postprocessing_delegate)
+            finally:
+                # unblock one future in `join()`
+                self._done_queue.put(stdout)
+
+        future.add_done_callback(callback_when_done)
+        self._futures.append(future)
+
+    def join(self):
+        """
+        Wait for subprocesses completion or timeout; postprocess the profiler results with delegate(s)
+        """
+        done, not_done = concurrent.futures.wait(self._futures, self._timeout)
+        for f in not_done:
+            f.cancel()
+        # block until each done_callback completes,
+        # or raise Empty exception after 3 minutes of waiting
+        block_timeout = 3 * 60
+        for _ in self._futures:
+            self._done_queue.get(timeout=block_timeout)
+        self._postprocessing_delegate.postprocess_results()
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 8e7d893eb..c2d91d5d2 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -22,7 +22,6 @@
 
 import jinja2
 
-from ... import builder
 from ...target import Target
 
 # pylint: disable=C0103,C0415,W0611,C0301
@@ -604,7 +603,7 @@ def gen_profiler(
         dilate="dilation",
         pad="pad",
     )
-    file_paris = []
+    file_pairs = []
     for op_name, op in op_instance.items():
         config = emit_instance(op)
         config_name = extract_config_name(config)
@@ -673,15 +672,8 @@ def gen_profiler(
             continue
         with open(src_path, "w") as fo:
             fo.write(code)
-        file_paris.append((src_path, obj_path))
-
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_paris, target.compile_cmd(executable=True))
-    # cleanup source
-    # for src_path, _ in file_paris:
-    #     os.remove(src_path)
+        file_pairs.append((src_path, obj_path))
+    return file_pairs
 
 
 def gen_function(
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d.py b/python/aitemplate/backend/rocm/conv2d/conv2d.py
index 1757f3608..c8191c19a 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d.py
@@ -58,7 +58,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
index ae5207fe5..ccbc265dd 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
@@ -58,7 +58,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
index bd94efd51..5ae33fd39 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
@@ -103,7 +103,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         The template is passed from compiler/ops/pool.
     """
     extra_code = EXTRA_CODE.render()
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
index 44aaf5963..ddbcaecd3 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
@@ -59,7 +59,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index d94ad48de..2ca81637f 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -107,7 +107,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
index 0a0bce2f7..f07a8f17a 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
@@ -85,7 +85,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
index 44771e9b7..0be5a94e6 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
@@ -68,7 +68,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         shape_template=shape_template,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
index edb3f10e9..b8c24f4af 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
@@ -81,7 +81,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_common.py b/python/aitemplate/backend/rocm/gemm/bmm_common.py
index a56f8b6f0..497eaf26b 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_common.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_common.py
@@ -144,7 +144,7 @@ def gen_profiler(
     extra_code : str
         Extra code for self-defined operators.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs,
         workdir,
         dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_crr.py b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
index cb8886214..2369ffd45 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_crr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
@@ -81,7 +81,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
index 08c939828..3fe8e9529 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
@@ -81,7 +81,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
index 9672310af..9909b3e65 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
@@ -83,7 +83,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
         dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
index e7f660ae0..8e8646385 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
@@ -81,7 +81,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
index 9bfed99b5..bedbc90ba 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
@@ -83,7 +83,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs,
         workdir,
         dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
index 53c07aaf9..3881b4879 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
@@ -186,7 +186,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
index 7ad9999e4..cf9fdd752 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
@@ -218,7 +218,7 @@ def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from bmm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    bmm_common.gen_profiler(
+    return bmm_common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         args_parse=ARGS_PARSER_TEMPLATE.render(),
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index bbc35048c..4b3c1f351 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -22,7 +22,6 @@
 
 import jinja2
 
-from ... import builder
 from ...common import gemm_common
 from ...target import Target
 
@@ -619,7 +618,7 @@ def gen_profiler(
     if func_attrs.get("shape") is not None:
         pdims = ["p_dim" + str(i) for i in range(len(func_attrs["shape"]))]
     extra_shape_func = extra_shape_template.render(indent="  ")
-    file_paris = []
+    file_pairs = []
     has_d0_flag = has_d0(func_attrs)
     has_d1_flag = has_d1(func_attrs)
     for op_name, op in op_instance.items():
@@ -696,12 +695,8 @@ def gen_profiler(
             continue
         with open(src_path, "w") as fo:
             fo.write(code)
-        file_paris.append((src_path, obj_path))
-
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_paris, target.compile_cmd(executable=True))
+        file_pairs.append((src_path, obj_path))
+    return file_pairs
 
 
 def gen_function(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
index ca790b2b0..eaf160305 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
@@ -62,7 +62,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
index aff8e059e..3eb456567 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
@@ -62,7 +62,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
index dcf346288..c6b1e43c4 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
@@ -88,7 +88,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
index 112cd21e2..83f2422e4 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
@@ -88,7 +88,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
index bf41915ba..4f3d7a3b9 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
@@ -89,7 +89,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
index 2c2e96a03..f40127ce0 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
@@ -89,7 +89,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
index cbeacdb51..376065ba8 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
@@ -62,7 +62,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
index db7bbe766..0741eb9d2 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
@@ -88,7 +88,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
index 0dba9d33f..0591e573d 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
@@ -76,7 +76,7 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 
 @registry.reg("rocm.gemm_rcr_bias_mul_add.gen_profiler")
 def gen_profiler(func_attrs, workdir, dim_info_dict):
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
index 5b993ed70..5c925b21c 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
@@ -91,7 +91,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
index 0043c65cb..ffacf0417 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
@@ -60,7 +60,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
index 7c74479d3..a0b96d106 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
@@ -85,7 +85,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
index 527b34f1b..596dee60c 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
@@ -85,7 +85,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
index d28bca9ca..e49bcd7ec 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
@@ -62,7 +62,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
index 8dadff7a0..83531c77e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
@@ -108,7 +108,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
index 1e0f1e42c..100805e2e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
@@ -90,7 +90,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
index d9818e3a9..dc73256ef 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
@@ -93,7 +93,7 @@ def gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
index ed18338e4..cb298e6b1 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
@@ -63,7 +63,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
index cc1d8f6db..5eaeb0686 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
@@ -110,7 +110,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
index 07833dc13..d9350fd20 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
@@ -85,7 +85,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
index 582d63ad8..414428906 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
@@ -62,7 +62,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
index b080779db..005f51bd3 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
@@ -60,7 +60,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         Generated from gemm._extract_dims().
         Used to store mapping between dim_names to input / output tensor dims.
     """
-    common.gen_profiler(
+    return common.gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
diff --git a/python/aitemplate/backend/rocm/lib_template.py b/python/aitemplate/backend/rocm/lib_template.py
index 36e8151d6..9dfbf11e2 100644
--- a/python/aitemplate/backend/rocm/lib_template.py
+++ b/python/aitemplate/backend/rocm/lib_template.py
@@ -31,12 +31,16 @@ def var_decl(name, value=0, indent="  "):
     return VAR_TEMPLATE.render(name=name, value=value, indent=indent)
 
 
-@registry.reg("rocm.lib.ptr_decl")
-def ptr_decl(name, dtype="float16", indent="  "):
+@registry.reg("rocm.lib.void_ptr_decl")
+def void_ptr_decl(name, dtype="float16", indent="  "):
+    # FIXME: we should just print void* after we support general tensor type, e.g.
+    # return PTR_TEMPLATE.render(name=name, dtype="void*", indent=indent)
     if dtype == "float16":
         type_string = "ck::half_t*"
     elif dtype == "int64":
         type_string = "int64_t*"
+    elif dtype == "bool":
+        type_string = "bool*"
     else:
         raise NotImplementedError
     return PTR_TEMPLATE.render(name=name, dtype=type_string, indent=indent)
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index a6d3443ee..b2dfffb64 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -268,7 +268,7 @@ def groupnorm_gen_profiler(
             shapes[dim_idx], IntImm
         ), f"groupnorm requires reduction dim {dim_idx=} to be static"
 
-    norm_common.gen_profiler(
+    return norm_common.gen_profiler(
         func_attrs,
         workdir,
         5,  # rank
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py b/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
index 2808f88f7..01872be32 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
@@ -32,7 +32,7 @@ def extract_config(func_attrs):
 
 @registry.reg("rocm.groupnorm_swish.gen_profiler")
 def gen_profiler(func_attrs: Dict[str, Any], workdir: str, indent: str = "  ") -> str:
-    groupnorm_gen_profiler(func_attrs, workdir, indent, use_swish=True)
+    return groupnorm_gen_profiler(func_attrs, workdir, indent, use_swish=True)
 
 
 @registry.reg("rocm.groupnorm_swish.gen_function")
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index f2392c024..0ca7e6052 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -195,7 +195,7 @@ def layernorm_gen_profiler(
         shapes[dim], IntImm
     ), "layernorm requires reduction dim to be static"
 
-    norm_common.gen_profiler(
+    return norm_common.gen_profiler(
         func_attrs,
         workdir,
         2,  # rank
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index fb4e5135c..c6a0cca17 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -23,7 +23,6 @@
 
 import jinja2
 
-from ... import builder
 from ...target import Target
 
 FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")
@@ -337,7 +336,7 @@ def gen_profiler(
     eps = func_attrs.get("eps", "1e-5")
 
     op_instance = func_attrs["op_instance"]
-    file_paris = []
+    file_pairs = []
     for op_name, op in op_instance.items():
 
         config = emit_instance(op)
@@ -392,12 +391,8 @@ def gen_profiler(
             continue
         with open(src_path, "w") as fo:
             fo.write(code)
-        file_paris.append((src_path, obj_path))
-
-    # build
-    target = Target.current()
-    compile_engine = builder.Builder()
-    compile_engine.build_objs(file_paris, target.compile_cmd(executable=True))
+        file_pairs.append((src_path, obj_path))
+    return file_pairs
 
 
 # no longer used by layernorm
diff --git a/python/aitemplate/backend/rocm/normalization/softmax.py b/python/aitemplate/backend/rocm/normalization/softmax.py
index 7b12a264e..819f24e0e 100644
--- a/python/aitemplate/backend/rocm/normalization/softmax.py
+++ b/python/aitemplate/backend/rocm/normalization/softmax.py
@@ -153,7 +153,7 @@ def softmax_gen_profiler(
         shapes[dim], IntImm
     ), "softmax requires reduction dim to be static"
 
-    norm_common.gen_profiler(
+    return norm_common.gen_profiler(
         func_attrs,
         workdir,
         rank,
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index 6d7b3330b..ff4483f88 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -17,6 +17,7 @@
 """
 # pylint: disable=W0702,W0707,W0611,C0415
 
+import json
 import os
 import re
 import shutil
@@ -25,6 +26,8 @@
 
 from aitemplate.backend.target import AIT_STATIC_FILES_PATH
 
+from ...utils import logger
+
 from .. import registry
 from ..target import COMPOSABLE_KERNEL_PATH, Target
 
@@ -76,20 +79,7 @@ def _pkg_path(self):
         rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm")
         return rocm_path
 
-    def _build_compile_options(self):
-        """Build compilation commands, including compilation flag library and includes.
-
-        Returns
-        -------
-        List
-            List of compilation options.
-
-        Raises
-        ------
-        RuntimeError
-            Unsupported GPU Arch.
-        """
-
+    def _get_ck_paths(self):
         ck_paths = [
             os.path.join(self._template_path),
             os.path.join(self._template_path, "include/"),
@@ -144,6 +134,23 @@ def _build_compile_options(self):
             os.path.join(self._template_path, "library/include/ck/library/utility/"),
             os.path.join(self._template_path, "profiler/include/"),
         ]
+        return ck_paths
+
+    def _build_compile_options(self):
+        """Build compilation commands, including compilation flag library and includes.
+
+        Returns
+        -------
+        List
+            List of compilation options.
+
+        Raises
+        ------
+        RuntimeError
+            Unsupported GPU Arch.
+        """
+
+        ck_paths = self._get_ck_paths()
         options = [
             "-O3",
             "-fPIC",
@@ -168,6 +175,8 @@ def _build_compile_options(self):
         rocrand_path = os.path.join(self._pkg_path(), "rocrand/lib/")
         options.append("-L" + rocrand_path)
         options.append("-lrocrand")
+        if self._ndebug == 1:
+            options.append("-DNDEBUG")
         return " ".join(options)
 
     def _gen_ck_lib_pkg(self):
@@ -228,7 +237,9 @@ def compile_cmd(self, executable=False):
         if executable:
             cmd = self.cc() + " " + self._compile_options + " -o {target} {src}"
         else:
-            cmd = self.cc() + " " + self._compile_options + " -c -o {target} {src}"
+            cmd = (
+                self.cc() + " " + self._compile_options + " -x hip -c -o {target} {src}"
+            )
         return cmd
 
     def src_extension(self):
@@ -257,7 +268,110 @@ def comp_func(name):
                 raise RuntimeError("Unknown CK ops.")
             return tuple(args)
 
-        return sorted(algo_names, key=comp_func)[0]
+        return min(algo_names, key=comp_func)
+
+
+class FBROCM(ROCM):
+    """ROCM target.
+
+    Parameters
+    ----------
+    Target : Target
+        All attributes needed for ROCM.
+    """
+
+    def __init__(
+        self,
+        template_path=COMPOSABLE_KERNEL_PATH,
+        arch="GFX90a",
+        ait_static_files_path=AIT_STATIC_FILES_PATH,
+        **kwargs,
+    ):
+        """Initialize ROCM target.
+
+        Parameters
+        ----------
+        template_path : str, optional
+            Path to composable kernel library, by default "${repo_root}/3rdparty/composable_kernel".
+        ait_static_files_path : str
+            Absolute path to the AIT static/ directory
+        arch : str, optional
+            Supported ROCM architecture, by default "GFX90a".
+        """
+        from libfb.py import parutil
+
+        self._template_path = template_path.replace("3rdparty", "fb/3rdparty")
+
+        convert_hippcc_json = parutil.get_file_path(
+            os.path.join("aitemplate/testing", "convert_hipcc_cmd")
+        )
+        logger.info(
+            __name__, f"Load the hipcc compile option from {convert_hippcc_json}"
+        )
+        with open(convert_hippcc_json, "r") as hipcc_options_json:
+            self.hipcc_options_json = json.load(hipcc_options_json)
+
+        super().__init__(template_path=self._template_path, arch=arch, **kwargs)
+
+    def _build_compile_options(self):
+        """Build compilation commands, including compilation flag library and includes.
+
+        Returns
+        -------
+        List
+            List of compilation options.
+
+        Raises
+        ------
+        RuntimeError
+            Unsupported GPU Arch.
+        """
+
+        ck_paths = self._get_ck_paths()
+        options = self.hipcc_options_json["args"] + [
+            "-O3",
+            "-fPIC",
+            "-fvisibility=hidden",
+            "-std=c++17",
+            "-w",
+            "-DCK_TIME_KERNEL=0",
+            "--hip-version=5.2.0",
+        ]
+
+        for path in ck_paths:
+            options.append("-I" + path)
+
+        if self._arch in {"GFX908", "gfx908"}:
+            options.append("-DCK_AMD_GPU_GFX908")
+            options.append("--cuda-gpu-arch=gfx908")
+        elif self._arch in {"GFX90a", "gfx90a"}:
+            options.append("-DCK_AMD_GPU_GFX90A")
+            options.append("--cuda-gpu-arch=gfx90a")
+        else:
+            raise RuntimeError("Unsupported GPU Arch")
+        for path in ck_paths:
+            options.append("-I" + path)
+
+        options.append("-lrocrand")
+        return " ".join(options)
+
+    def binary_compile_cmd(self):
+        """
+        There is no ld by default in the prod env. Instead, we use ld from the gvfs path.
+        """
+        ld = self.hipcc_options_json["ld"]
+        return " ".join([ld, "-r -b binary -o {target} {src}"])
+
+    def cc(self):
+        return self.hipcc_options_json["hipcc_bin"]
+
+    def compile_options(self):
+        return self._compile_options
+
+
+@registry.reg("fb.rocm.create_target")
+def create_target_fb(arch, **kwargs):
+    return FBROCM(arch=arch, **kwargs)
 
 
 @registry.reg("rocm.create_target")
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 9baec587c..057633c93 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -35,7 +35,6 @@
 CUTLASS_PATH = os.path.join(_3RDPARTY_PATH, "cutlass")
 COMPOSABLE_KERNEL_PATH = os.path.join(_3RDPARTY_PATH, "composable_kernel")
 CUB_PATH = os.path.join(_3RDPARTY_PATH, "cub")
-DEFAULT_INTERNAL_DB_PATH = "aitemplate/AITemplate/python/aitemplate"
 
 CURRENT_TARGET = None
 
@@ -62,6 +61,12 @@ def __init__(self, static_files_path: str):
         self._profile_cache = None
         self.static_files_path = static_files_path
 
+        ndebug_str = os.getenv("AIT_NDEBUG", "1")
+        try:
+            self._ndebug = int(ndebug_str)
+        except ValueError:
+            self._ndebug = 0
+
     def __enter__(self):
         """Enter the target context manager.
 
@@ -139,6 +144,10 @@ def cc(self):
         """
         raise NotImplementedError
 
+    def make(self):
+        make_path = shutil.which("make")
+        return make_path if make_path is not None else "make"
+
     def compile_cmd(self, executable: bool = False):
         """Compile command string template for this target.
 
@@ -293,10 +302,6 @@ def _prepare_profile_cache_path(self) -> Optional[str]:
         prefix = None
         if os.environ.get("CACHE_DIR", None):
             prefix = os.environ.get("CACHE_DIR", None)
-        if os.getenv("INSIDE_RE_WORKER") == "1":
-            from libfb.py import parutil
-
-            prefix = parutil.get_file_path(DEFAULT_INTERNAL_DB_PATH)
         cache_file = self._get_cache_file_name()
         if prefix is None:
             prefix = os.path.join(pathlib.Path.home(), ".aitemplate")
@@ -330,6 +335,29 @@ def get_profile_cache_path(self):
         """Get local profile cache path for this target."""
         return self._cache_path
 
+    def get_profile_cache_version(self, op_class: str) -> int:
+        """Get the current profile cache version for the op_class.
+
+        Parameters
+        ----------
+        op_class : str
+            Op class name: only gemm is supported at the moment.
+
+        Returns
+        -------
+        int
+            cache version.
+
+        Raises
+        ------
+        NotImplementedError
+            If op class is not supported, raise error.
+        """
+        # TODO: support conv and normalization
+        if op_class == "gemm":
+            return self._profile_cache.get_profile_gemm_cache_version()
+        raise NotImplementedError
+
     def query_profile_cache(self, op_class: str, args: str) -> Tuple[str]:
         """Query the profile cache for the given op class and args.
 
@@ -354,6 +382,8 @@ def query_profile_cache(self, op_class: str, args: str) -> Tuple[str]:
             return self._profile_cache.query_gemm(args)
         if op_class == "conv":
             return self._profile_cache.query_conv(args)
+        if op_class == "conv3d":
+            return self._profile_cache.query_conv3d(args)
         if op_class == "normalization":
             return self._profile_cache.query_normalization(args)
         raise NotImplementedError
@@ -364,6 +394,8 @@ def insert_profile_cache(self, op_class: str, args: str):
             self._profile_cache.insert_gemm(args)
         elif op_class == "conv":
             self._profile_cache.insert_conv(args)
+        elif op_class == "conv3d":
+            self._profile_cache.insert_conv3d(args)
         elif op_class == "normalization":
             self._profile_cache.insert_normalization(args)
         else:
diff --git a/python/aitemplate/compiler/__init__.py b/python/aitemplate/compiler/__init__.py
index 175a06eac..315577d39 100644
--- a/python/aitemplate/compiler/__init__.py
+++ b/python/aitemplate/compiler/__init__.py
@@ -12,12 +12,13 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from . import base, ops, tensor_accessor, transform
+from . import base, dtype, ops, tensor_accessor, transform
 from .compiler import compile_model
 from .model import AIT_DEFAULT_NUM_RUNTIMES, AITData, Model
 
 __all__ = [
     "base",
+    "dtype",
     "op_registry",
     "ops",
     "tensor_accessor",
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index fd1d07488..df03965ca 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -26,6 +26,9 @@
 
 import numpy as np
 
+from aitemplate.compiler.dtype import get_dtype_size, normalize_dtype
+
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.utils.torch_utils import torch_dtype_to_string
 
 from ..utils.tensor_utils import wrap_dim
@@ -42,7 +45,6 @@ def __init__(self) -> None:
         Initializes self._attrs field, which is a dict that stores
         all attributes for this Node.
         Basic attributes include:
-
             * name: str, name of the node.
             * depth: int, depth of the node in a graph. None if this is not applicable.
             * nop: bool, marks whether this node is a no-operation.
@@ -204,35 +206,6 @@ def pseudo_code(self, with_shape=False) -> str:
         return str(self.value())
 
 
-_DTYPE2BYTE = {
-    "float16": 2,
-    "float32": 4,
-    "float": 4,
-    "int": 4,
-    "int32": 4,
-    "int64": 8,
-}
-
-
-def get_dtype_size(dtype: str) -> int:
-    """Returns size (in bytes) of the given dtype str.
-
-    Parameters
-    ----------
-    dtype: str
-        A data type string.
-
-    Returns
-    ----------
-    int
-        Size (in bytes) of this dtype.
-    """
-
-    if dtype not in _DTYPE2BYTE:
-        raise KeyError(f"Unknown dtype: {dtype}. Expected one of {_DTYPE2BYTE.keys()}")
-    return _DTYPE2BYTE[dtype]
-
-
 def get_aligned_size(shape: List[IntVar], dtype: str, alignment: int = 64) -> int:
     """Returns aligned size (in bytes) of given shape and dtype.
 
@@ -277,14 +250,7 @@ class _ConstantTensorData(ABC):
 
     def __init__(self, dtype: str):
         super().__init__()
-        self.dtype = self._normalize_dtype(dtype)
-
-    def _normalize_dtype(self, dtype: str) -> str:
-        if dtype == "int":
-            return "int32"
-        if dtype == "float":
-            return "float32"
-        return dtype
+        self.dtype = normalize_dtype(dtype)
 
     @abstractmethod
     def to_bytes(self) -> bytes:
@@ -303,7 +269,7 @@ def size(self) -> int:
         return len(self.to_bytes())
 
     def is_dtype(self, dtype: str) -> bool:
-        return self._normalize_dtype(dtype) == self.dtype
+        return normalize_dtype(dtype) == self.dtype
 
     def __len__(self) -> int:
         return self.size()
@@ -365,13 +331,15 @@ def __init__(
         self,
         shape: List[IntVar],
         name: str = None,
-        src_ops: Set[Node] = None,
-        dst_ops: Set[Node] = None,
+        src_ops: StableSet[Node] = None,
+        dst_ops: StableSet[Node] = None,
         dtype: str = "float16",
         is_input: bool = False,
         is_output: bool = False,
         value: Any = None,
         is_view_of: Any = None,
+        check_nan_and_inf: bool = False,
+        check_outputs: bool = False,
     ) -> None:
         """Initializes a Tensor.
 
@@ -400,12 +368,20 @@ def __init__(
             empty list, this Tensor is used to represent a number.
         is_view_of : Any, optional
             Whether this Tensor is a view of another Tensor.
+        check_nan_and_inf : bool, optional
+            Whether or not to check this tensor is nan or inf during runtime.
+        check_outputs : bool, optional
+            Whether or not to print this tensor's value out during runtime.
         """
         super().__init__()
         self._attrs["shape"] = self._convert_shape(shape)
         self._attrs["name"] = name
-        self._attrs["src_ops"] = src_ops if src_ops is not None else set()
-        self._attrs["dst_ops"] = dst_ops if dst_ops is not None else set()
+        self._attrs["src_ops"] = (
+            StableSet(src_ops) if src_ops is not None else StableSet()
+        )
+        self._attrs["dst_ops"] = (
+            StableSet(dst_ops) if dst_ops is not None else StableSet()
+        )
         self._attrs["dtype"] = dtype
         self._attrs["is_output"] = is_output
         self._attrs["is_input"] = is_input
@@ -437,6 +413,9 @@ def __init__(
         # Data to be bound for constant folding. See _bind_data.
         self._attrs["data"] = None
 
+        self._attrs["check_nan_and_inf"] = check_nan_and_inf
+        self._attrs["check_outputs"] = check_outputs
+
     def __str__(self) -> str:
         output = {}
         for key in self._attrs.keys():
@@ -511,7 +490,7 @@ def pseudo_code(self, with_shape=True) -> str:
 
         if with_shape:
             shapes = ", ".join([dim.pseudo_code() for dim in self._attrs["shape"]])
-            args.append(f"shape={shapes}")
+            args.append(f"shape=[{shapes}]")
 
         data = self._attrs["data"]
         if data is not None:
@@ -627,6 +606,7 @@ def __init__(
             name,
             src_ops,
             dst_ops,
+            dtype=dtype,
             is_input=is_input,
             is_output=is_output,
         )
@@ -635,6 +615,30 @@ def __init__(
     def pseudo_code(self, with_shape=True) -> str:
         return f"IntVarTensor({self._attrs['int_var'].pseudo_code()})"
 
+    def __add__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_ADD")(self, other)
+
+    def __radd__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_ADD")(other, self)
+
+    def __sub__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_SUB")(self, other)
+
+    def __rsub__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_SUB")(other, self)
+
+    def __mul__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_MUL")(self, other)
+
+    def __rmul__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_MUL")(other, self)
+
+    def __truediv__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_DIV")(self, other)
+
+    def __rtruediv__(self, other: Any) -> Tensor:
+        return OP_REGISTRY.get("INT_DIV")(other, self)
+
 
 class DynamicProfileStrategy(Enum):
     """Dynamic profiling stategy enum.
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 49307c3bd..3d869ee89 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -16,10 +16,14 @@
 build a test module from a tensor
 """
 import os
+from datetime import datetime
 from typing import Dict, List, Optional, Union
 
 from aitemplate import backend, compiler
+from aitemplate.compiler.model import AITemplateAllocatorKind
+from aitemplate.compiler.transform.profile import elapsed_dt_sec
 from aitemplate.utils import graph_utils, logger
+from aitemplate.utils.serialization.serdes_code import dump_program
 
 from .base import DynamicProfileStrategy, Tensor
 
@@ -89,6 +93,10 @@ def compile_model(
     num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES,
     profile_dir: str = None,
     constants: Optional[Dict[str, TorchTensor]] = None,
+    allocator_kind: Optional[AITemplateAllocatorKind] = None,
+    check_all_nan_and_inf: bool = False,
+    check_all_outputs: bool = False,
+    dump_ait_to_py: Optional[str] = None,
 ) -> Model:
     """Compiles a model and generates a .so file.
 
@@ -111,9 +119,17 @@ def compile_model(
     dll_name: str
         The output .so name.
     num_runtimes: int
-            How many runtimes should be stored in the internal pool. This
-            determines how many inferences can happen concurrently. By
-            default, set to 2. Must be positive.
+        How many runtimes should be stored in the internal pool. This
+        determines how many inferences can happen concurrently. By
+        default, set to 2. Must be positive.
+    allocator_kind: AITemplateAllocatorKind, optional
+        The GPU allocator to use. If none is specified, use the default allocator.
+    check_all_nan_and_inf : bool, optional
+        Whether or not to check this tensor is nan or inf during runtime.
+    check_all_outputs : bool, optional
+        Whether or not to print this tensor's value out during runtime.
+    dump_ait_to_py: str, optional
+        The path where the AIT graph is dumped into a .py file.
 
     Returns
     -------
@@ -132,6 +148,10 @@ def compile_model(
     test_name = test_name.replace(",", "_")
     test_dir = os.path.join(workdir, test_name)
     profile_dir = workdir if profile_dir is None else profile_dir
+
+    if dump_ait_to_py:
+        dump_program(tensor, dump_ait_to_py)
+
     if int(recompile) == 1:
         os.makedirs(test_dir, exist_ok=True)
         with target:
@@ -160,8 +180,12 @@ def compile_model(
                 graph, test_dir, "mark_param_tensor"
             )
 
+            start_t = datetime.now()
             graph = compiler.transform.optimize_graph(graph, test_dir)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "optimize_graph")
+            logger.info(
+                __name__, f"optimized graph elapsed time: {elapsed_dt_sec(start_t)}"
+            )
 
             compiler.transform.mark_special_views(graph)
             compiler.transform.refine_graph(graph)
@@ -178,12 +202,16 @@ def compile_model(
             )
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "profile")
 
+            start_t = datetime.now()
             constant_folding_workdir = os.path.join(workdir, test_name)
             os.makedirs(constant_folding_workdir, exist_ok=True)
             graph = compiler.transform.constant_folding(graph, constant_folding_workdir)
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "constant_folding"
             )
+            logger.info(
+                __name__, f"folded constants elapsed time: {elapsed_dt_sec(start_t)}"
+            )
 
             _verify_outputs_still_in_graph(graph, output_tensors)
             (
@@ -192,6 +220,7 @@ def compile_model(
                 workspace,
             ) = compiler.transform.memory_planning(graph)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "memory_planning")
+
             file_pairs = backend.codegen.gen_function_src(graph, workdir, test_name)
 
             # It's possible that the original output tensor has been replaced with a new tensor.
@@ -215,22 +244,21 @@ def compile_model(
                 workdir,
                 output_tensors,
                 test_name,
+                check_all_nan_and_inf,
+                check_all_outputs,
             )
             file_pairs.extend(main_pairs)
 
+            start_t = datetime.now()
             compile_engine = backend.builder.Builder()
-            if logger.is_debug():
-                compile_engine.gen_makefile(file_pairs, dll_name, workdir, test_name)
-
-            compile_engine.build_objs(
-                file_pairs,
-                backend.target.Target.current().compile_cmd(False),
-                backend.target.Target.current().binary_compile_cmd(),
-            )
-            compile_engine.build_so(
-                os.path.join(workdir, test_name, dll_name), [p[1] for p in file_pairs]
+            compile_engine.make(file_pairs, dll_name, workdir, test_name)
+            logger.info(
+                __name__,
+                f"compiled the final .so file elapsed time: {elapsed_dt_sec(start_t)}",
             )
 
-    module = Model(os.path.join(workdir, test_name, dll_name), num_runtimes)
+    module = Model(
+        os.path.join(workdir, test_name, dll_name), num_runtimes, allocator_kind
+    )
     module.debug_sorted_graph = graph
     return module
diff --git a/python/aitemplate/compiler/dtype.py b/python/aitemplate/compiler/dtype.py
new file mode 100644
index 000000000..51b6c96d6
--- /dev/null
+++ b/python/aitemplate/compiler/dtype.py
@@ -0,0 +1,136 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+dtype definitions and utility functions of AITemplate
+"""
+
+
+_DTYPE2BYTE = {
+    "bool": 1,
+    "float16": 2,
+    "float32": 4,
+    "float": 4,
+    "int": 4,
+    "int32": 4,
+    "int64": 8,
+}
+
+
+# Maps dtype strings to AITemplateDtype enum in model_interface.h.
+# Must be kept in sync!
+# We can consider defining an AITemplateDtype enum to use on the Python
+# side at some point, but stick to strings for now to keep things consistent
+# with other Python APIs.
+_DTYPE_TO_ENUM = {
+    "float16": 1,
+    "float32": 2,
+    "float": 2,
+    "int": 3,
+    "int32": 3,
+    "int64": 4,
+    "bool": 5,
+}
+
+
+def get_dtype_size(dtype: str) -> int:
+    """Returns size (in bytes) of the given dtype str.
+
+    Parameters
+    ----------
+    dtype: str
+        A data type string.
+
+    Returns
+    ----------
+    int
+        Size (in bytes) of this dtype.
+    """
+
+    if dtype not in _DTYPE2BYTE:
+        raise KeyError(f"Unknown dtype: {dtype}. Expected one of {_DTYPE2BYTE.keys()}")
+    return _DTYPE2BYTE[dtype]
+
+
+def normalize_dtype(dtype: str) -> str:
+    """Returns a normalized dtype str.
+
+    Parameters
+    ----------
+    dtype: str
+        A data type string.
+
+    Returns
+    ----------
+    str
+        normalized dtype str.
+    """
+    if dtype == "int":
+        return "int32"
+    if dtype == "float":
+        return "float32"
+    return dtype
+
+
+def dtype_str_to_enum(dtype: str) -> int:
+    """Returns the AITemplateDtype enum value (defined in model_interface.h) of
+    the given dtype str.
+
+    Parameters
+    ----------
+    dtype: str
+        A data type string.
+
+    Returns
+    ----------
+    int
+        the AITemplateDtype enum value.
+    """
+    if dtype not in _DTYPE_TO_ENUM:
+        raise ValueError(
+            f"Got unsupported input dtype {dtype}! Supported dtypes are: {list(_DTYPE_TO_ENUM.keys())}"
+        )
+    return _DTYPE_TO_ENUM[dtype]
+
+
+def dtype_to_enumerator(dtype: str) -> str:
+    """Returns the string representation of the AITemplateDtype enum
+    (defined in model_interface.h) for the given dtype str.
+
+    Parameters
+    ----------
+    dtype: str
+        A data type string.
+
+    Returns
+    ----------
+    str
+        the AITemplateDtype enum string representation.
+    """
+
+    def _impl(dtype):
+        if dtype == "float16":
+            return "kHalf"
+        elif dtype == "float32" or dtype == "float":
+            return "kFloat"
+        elif dtype == "int32" or dtype == "int":
+            return "kInt"
+        elif dtype == "int64":
+            return "kLong"
+        elif dtype == "bool":
+            return "kBool"
+        else:
+            raise AssertionError(f"unknown dtype {dtype}")
+
+    return f"AITemplateDtype::{_impl(dtype)}"
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 18da2d589..a7281f1d3 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -23,6 +23,7 @@
 
 import numpy as np
 
+from aitemplate.compiler.dtype import dtype_str_to_enum
 from aitemplate.utils.torch_utils import torch_dtype_to_string
 
 # Controls how many runtimes will be used in ModelContainer by default.
@@ -33,18 +34,6 @@
 # effect since Python default arguments only get evaluated once.
 AIT_DEFAULT_NUM_RUNTIMES = 1
 
-# pylint: disable=C0103
-
-DTYPE_TO_BYTES: Dict[str, str] = {
-    "float16": 2,
-    "float32": 4,
-    "float": 4,
-    "int": 4,
-    "int32": 4,
-    "int64": 8,
-}
-
-
 # Stand-in for torch.Tensor. Use a TypeVar for some APIs since we can't introduce
 # a torch dependency.
 TorchTensor = TypeVar("TorchTensor")
@@ -134,6 +123,11 @@ class AITemplateMemcpyKind(enum.Enum):
     DeviceToDevice = 2
 
 
+class AITemplateAllocatorKind(enum.Enum):
+    DEFAULT = 0
+    TRACKING = 1
+
+
 class AITData(NamedTuple):
     """
     Input or output tensor for Model.run. We require the extra data for safety
@@ -161,22 +155,36 @@ class _CFormatAITData(ctypes.Structure):
 
 
 class Model(object):
-    """AITemplate Python runtime binding."""
-
     class _DLLWrapper:
-        def __init__(self, lib_path: str, num_runtimes: int):
+        def __init__(
+            self,
+            lib_path: str,
+            num_runtimes: int,
+            allocator_kind: Optional[AITemplateAllocatorKind],
+        ):
             self.lib_path = lib_path
             self.DLL = ctypes.cdll.LoadLibrary(lib_path)
 
             self.handle = ctypes.c_void_p()
+            self.allocator_handle = ctypes.c_void_p()
+            if allocator_kind is not None:
+                self.DLL.AITemplateAllocatorCreate(
+                    ctypes.byref(self.allocator_handle),
+                    ctypes.c_int(allocator_kind.value),
+                )
+
             self.DLL.AITemplateModelContainerCreate(
-                ctypes.pointer(self.handle), ctypes.c_size_t(num_runtimes)
+                ctypes.pointer(self.handle),
+                ctypes.c_size_t(num_runtimes),
+                self.allocator_handle,
             )
             self.is_open = True
 
         def close(self):
             if self.is_open:
                 self.DLL.AITemplateModelContainerDelete(self.handle)
+                if self.allocator_handle:
+                    self.DLL.AITemplateAllocatorDelete(self.allocator_handle)
                 _dlclose(self.DLL)
                 self.is_open = False
 
@@ -193,7 +201,12 @@ def _wrapped_func(*args):
 
             return _wrapped_func
 
-    def __init__(self, lib_path: str, num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES):
+    def __init__(
+        self,
+        lib_path: str,
+        num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES,
+        allocator_kind: Optional[AITemplateAllocatorKind] = None,
+    ):
         """
         Instantiates a wrapper around the C++ model_interface.
 
@@ -205,11 +218,13 @@ def __init__(self, lib_path: str, num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES):
             How many runtimes should be stored in the internal pool. This
             determines how many inferences can happen concurrently. By
             default, set to 2. Must be positive.
+        allocator_kind : AITemplateAllocatorKind, optional
+            What type of allocator to use when allocating GPU memory.
         """
         if num_runtimes <= 0:
             raise ValueError(f"num_runtimes must be positive, but got {num_runtimes}")
 
-        self.DLL = self._DLLWrapper(lib_path, num_runtimes)
+        self.DLL = self._DLLWrapper(lib_path, num_runtimes, allocator_kind)
         self.handle = self.DLL.handle
         self.lib_path = self.DLL.lib_path
 
@@ -220,19 +235,6 @@ def __init__(self, lib_path: str, num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES):
         # The corresponding sorted_graph. Optional. For debugging purpose.
         self.debug_sorted_graph = None
 
-        # Maps dtype strings to AITemplateDtype enum in model_interface.h.
-        # Must be kept in sync!
-        # We can consider defining an AITemplateDtype enum to use on the Python
-        # side at some point, but stick to strings for now to keep things consistent
-        # with other Python APIs.
-        self._DTYPE_TO_ENUM = {
-            "float16": 1,
-            "float32": 2,
-            "float": 2,
-            "int": 3,
-            "int32": 3,
-            "int64": 4,
-        }
         self._output_name_to_index = self._construct_output_name_to_index_map()
         self._input_name_to_index = self._construct_input_name_to_index_map()
         self._output_ndims = [
@@ -268,13 +270,6 @@ def __setstate__(self, d):
             raise RuntimeError(f"Didn't find 'lib_path' property in {d}")
         self.__init__(d["lib_path"])
 
-    def _dtype_str_to_enum(self, dtype: str) -> int:
-        if dtype not in self._DTYPE_TO_ENUM:
-            raise ValueError(
-                f"Got unsupported input dtype {dtype}! Supported dtypes are: {list(self._DTYPE_TO_ENUM.keys())}"
-            )
-        return self._DTYPE_TO_ENUM[dtype]
-
     def _convert_single_param_to_c_format(self, param: AITData) -> _CFormatAITData:
         pointer, shape, dtype = param
         c_pointer = ctypes.c_void_p(pointer)
@@ -282,7 +277,7 @@ def _convert_single_param_to_c_format(self, param: AITData) -> _CFormatAITData:
         for j, dim in enumerate(shape):
             c_shape_data[j] = ctypes.c_longlong(dim)
         c_shape = _AITemplateShape(c_shape_data, ctypes.c_size_t(len(shape)))
-        c_dtype = self._dtype_str_to_enum(dtype)
+        c_dtype = dtype_str_to_enum(dtype)
         return _CFormatAITData(c_pointer, c_shape, c_dtype)
 
     def _convert_params_to_c_format(self, params: List[AITData]):
diff --git a/python/aitemplate/compiler/ops/attention/__init__.py b/python/aitemplate/compiler/ops/attention/__init__.py
index 962476aaa..4f18558f7 100644
--- a/python/aitemplate/compiler/ops/attention/__init__.py
+++ b/python/aitemplate/compiler/ops/attention/__init__.py
@@ -16,6 +16,7 @@
 flash attention module init
 """
 from .flash_attention import flash_attention
+from .mem_eff_attention import mem_eff_attention
 
 
-__all__ = ["flash_attention"]
+__all__ = ["flash_attention", "mem_eff_attention"]
diff --git a/python/aitemplate/compiler/ops/attention/flash_attention.py b/python/aitemplate/compiler/ops/attention/flash_attention.py
index 527476fe7..f81a59543 100644
--- a/python/aitemplate/compiler/ops/attention/flash_attention.py
+++ b/python/aitemplate/compiler/ops/attention/flash_attention.py
@@ -159,6 +159,16 @@ def __call__(self, x: Tensor, cu_seqlens: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        target_attrs = ["batch_size", "dropout", "max_seq_len", "causal"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
     def _gen_exec_key(self, shape):
         """rendering shape info"""
         return self.exec_key_template.render(
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
new file mode 100644
index 000000000..7964cdc39
--- /dev/null
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -0,0 +1,179 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Flash attention.
+"""
+import itertools
+from collections import OrderedDict
+from typing import List
+
+import jinja2
+import numpy as np
+
+from .... import backend
+from ....backend import registry
+from ....utils import shape_utils
+from ...base import Operator, Tensor
+
+# pylint: disable=C0103,W0221,W0102,W0223
+
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}B = {{x_dim0}};
+{{indent}}{{dtype}}num_heads = {{x_dim1}};
+{{indent}}{{dtype}}M = {{x_dim2}};
+{{indent}}{{dtype}}Kv = {{x_dim3}};
+"""
+)
+
+EXEC_KEY_TEMPLATE = jinja2.Template(
+    """
+batch_size == {{x_dim0}} && num_heads == {{x_dim1}} && seq_len == {{x_dim2}} && head_sizes == {{x_dim3}}
+"""
+)
+
+
+class mem_eff_attention(Operator):
+    r"""mem_eff_attention provides an implementation for fused
+    multi-head attention module:
+
+    .. math::
+        \text{Attention}(Q, K, V) = \text{softmax}(\frac{QK}{\sqrt(d)}) * V
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+    """
+
+    def __init__(self, causal, dropout=0) -> None:
+        """initilize attention module"""
+        super().__init__()
+        assert dropout == 0
+        self._attrs["op"] = "mem_eff_attention"
+        self._attrs["has_profiler"] = False
+        self._attrs["dropout"] = dropout
+        self._attrs["causal"] = causal
+        self._attrs["head_size"] = -1
+        self._attrs["workspace"] = 0
+        self.exec_key_template = EXEC_KEY_TEMPLATE
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+
+    def _infer_shape(self, x: List[int], w: List[int]):
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=w[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["B"]),
+            int(output["M"]),
+            int(output["num_heads"]),
+            int(output["Kv"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor, w: Tensor):
+        """infer the output shape for attention module"""
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
+        # run infer shape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape, w_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        batch_info = x._attrs["shape"][0]
+        output_shape = [
+            batch_info,
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+        ]
+        return output_shape
+
+    def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        """call the op
+
+        Parameters
+        ----------
+        qkv : float16
+            QKV tensor
+            shape: (b, seqlen, num_heads, Kv)
+
+        Returns
+        ----------
+            Tensor
+        """
+
+        head_size_v = v._attrs["shape"][3]._attrs["values"][0]
+        self._attrs["head_size"] = head_size_v
+
+        self._attrs["inputs"] = [q, k, v]
+        self._set_depth()
+        self._extract_exec_path(q)
+        output_shape = self._infer_shapes(q, v)
+
+        o_shape = [var._attrs["values"][-1] for var in output_shape]
+        if o_shape[-1] > 128:
+            self._attrs["workspace"] = 4 * np.prod(o_shape)
+        output = Tensor(output_shape, src_ops={self})
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self):
+        target_attrs = ["causal"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def _gen_exec_key(self, shape):
+        """rendering shape info"""
+        return self.exec_key_template.render(
+            x_dim0=shape[0],
+            x_dim1=shape[1],
+            x_dim2=shape[2],
+            x_dim3=shape[3],
+        ).replace("\n", "")
+
+    def _extract_exec_path(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        self._attrs["exec_path"] = OrderedDict()
+        for x_shape in x_shapes:
+            key = self._gen_exec_key(x_shape)
+            self._attrs["exec_path"][key] = ""
+
+    def gen_function(self) -> str:
+        """call backend functions"""
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/python/aitemplate/compiler/ops/common/__init__.py b/python/aitemplate/compiler/ops/common/__init__.py
index cb0530124..1247cc790 100644
--- a/python/aitemplate/compiler/ops/common/__init__.py
+++ b/python/aitemplate/compiler/ops/common/__init__.py
@@ -17,6 +17,7 @@
 Common ops.
 """
 from .elementwise import *
+from .int_elementwise import *
 from .epilogue import *
 from .fused_elementwise import *
 from .math import *
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index dbd3a4296..dc18ba16e 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -90,11 +90,17 @@ def __call__(self, *args: Tensor) -> Tensor:
             arg for arg in converted_args if not arg.is_a_const_num()
         ]
         self._set_depth()
+        # for some reason aten converters fail if uncommented
+        # we will need this for fp32 support
+        # dtype = self._attrs["args"][0]._attrs["dtype"]
         output_shape = self._infer_shapes(*converted_args)
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {"func_enum": self._attrs["func"]}
+
     def replace_input_tensor(self, old_tensor, new_tensor) -> None:
         super().replace_input_tensor(old_tensor, new_tensor)
         self._attrs["args"] = [
diff --git a/python/aitemplate/compiler/ops/common/epilogue.py b/python/aitemplate/compiler/ops/common/epilogue.py
index fb8ca9926..cec01268b 100644
--- a/python/aitemplate/compiler/ops/common/epilogue.py
+++ b/python/aitemplate/compiler/ops/common/epilogue.py
@@ -57,3 +57,7 @@ class FuncEnum(Enum):
     NAN_TO_NUM = 19
     CLAMP_NAN_TO_NUM = 20
     SILU = 21
+    POW = 22
+    GELU = 23
+    FASTGELU = 24
+    SOFTPLUS = 25
diff --git a/python/aitemplate/compiler/ops/common/fused_elementwise.py b/python/aitemplate/compiler/ops/common/fused_elementwise.py
index 40fd3ecd8..4ad1988e5 100644
--- a/python/aitemplate/compiler/ops/common/fused_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/fused_elementwise.py
@@ -110,7 +110,7 @@ def _update_inputs_outputs(self) -> None:
         self._attrs["original_outputs"] = list(self._attrs["outputs"])
 
         for tensor in tmp_inputs | tmp_outputs:
-            tensor._attrs["src_ops"] = set(tensor._attrs["src_ops"]) - ops
+            tensor._attrs["src_ops"] = tensor._attrs["src_ops"] - ops
             tensor._attrs["dst_ops"] = tensor._attrs["dst_ops"] - ops
         for tensor in external_inputs:
             tensor._attrs["dst_ops"].add(self)
@@ -144,6 +144,9 @@ def __init__(self, elementwise_ops: List[elementwise]) -> None:
         self._set_depth()
         self._check_constant()
 
+    def _get_op_attributes(self):
+        return {"elementwise_ops": self._attrs["elementwise_ops"]}
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
new file mode 100644
index 000000000..a704d40ed
--- /dev/null
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -0,0 +1,142 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Int elementwise operator definition, for integer calcuation on tensor dimensions.
+"""
+import functools
+from functools import reduce
+
+from .... import backend
+from ....backend import registry
+
+from ....utils import shape_utils
+from ...base import IntVarTensor, Operator, Tensor
+from ...op_registry import OP_REGISTRY
+from .epilogue import FuncEnum
+
+# pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
+
+INT_ELEMENTWISE_FUNC_COVERAGE = [FuncEnum.MUL, FuncEnum.DIV, FuncEnum.SUB, FuncEnum.ADD]
+
+
+class int_elementwise(Operator):
+    """int elementwise operator definition."""
+
+    def __init__(self, func_enum: FuncEnum) -> None:
+        """
+        Parameters
+        ----------
+        func_enum : the underlying function enum.
+        """
+
+        super().__init__()
+        self._attrs["op"] = "int_elementwise"
+        if func_enum not in INT_ELEMENTWISE_FUNC_COVERAGE:
+            raise RuntimeError(f"Not such FuncEnum {func_enum} in int_elementwise!")
+        self._attrs["func"] = func_enum
+        self._attrs["has_profiler"] = False
+
+    def __call__(self, *args: IntVarTensor) -> Tensor:
+        int_vars = []
+        for arg in args:
+            if isinstance(arg, IntVarTensor):
+                int_vars.append(arg._attrs["int_var"])
+            else:
+                raise RuntimeError(
+                    f"Unsupported data type {arg} in elementwise {self}!"
+                )
+        max_vars = [max(v._attrs["values"]) for v in int_vars]
+        min_vars = [min(v._attrs["values"]) for v in int_vars]
+        assert len(max_vars) == len(min_vars) and len(max_vars) >= 2
+        values = []
+        if self._attrs["func"] == FuncEnum.MUL:
+            values += [reduce(lambda x, y: x * y, lis) for lis in [min_vars, max_vars]]
+        elif self._attrs["func"] == FuncEnum.ADD:
+            values += [reduce(lambda x, y: x + y, lis) for lis in [min_vars, max_vars]]
+        elif self._attrs["func"] == FuncEnum.SUB:
+            inp_range = [(a, b) for a, b in zip(min_vars, max_vars)]
+            # For an inputs of range [(4,9), (1,8)],
+            # i.e. (4,9) is the range for first dim and (1,8) for the second
+            # we want max(4,9)-min(1,8) as the new upper bound and
+            # min(4,9)-max(1,8) as the new lower bound.
+            # however, range should be larger than 1, thus we have max(1, min(4,9)-max(1,8))
+            for i, b in enumerate(inp_range):
+                if i == 0:
+                    a = b
+                else:
+                    lower_bound = max(min(a[0], a[1]) - max(b[0], b[1]), 1)
+                    upper_bound = max(a[0], a[1]) - min(b[0], b[1])
+                    if upper_bound <= 0:
+                        raise RuntimeError(
+                            f"Subtracting Tensor with shape {b} from Tensor with shape {a} "
+                            f"is invalid. Cannot deduce a valid bound."
+                        )
+                    a = (lower_bound, upper_bound)
+            values = list(a)
+        elif self._attrs["func"] == FuncEnum.DIV:  # floordiv
+            inp_range = [(a, b) for a, b in zip(min_vars, max_vars)]
+            # For an inputs of range [(4,9), (1,8)],
+            # i.e. (4,9) is the range for first dim and (1,8) for the second
+            # we want max(4,9)/min(1,8) as the new upper bound and
+            # min(4,9)/max(1,8) as the new lower bound.
+            # however, range should be larger than 1, thus we have max(1, min(4,9)/max(1,8))
+            for i, b in enumerate(inp_range):
+                if i == 0:
+                    a = b
+                else:
+                    lower_bound = max(int(min(a[0], a[1]) / max(b[0], b[1])), 1)
+                    upper_bound = int(max(a[0], a[1]) / min(b[0], b[1]))
+                    if upper_bound <= 0:
+                        raise RuntimeError(
+                            f"Dividing Tensor with shape {b} from Tensor with shape {a} "
+                            f"is invalid. Cannot deduce a valid bound."
+                        )
+                    a = (lower_bound, upper_bound)
+            values = list(a)
+        else:
+            raise RuntimeError(f"Unsupported calculation type {self._attrs['func']}!")
+        dim = shape_utils.gen_int_var_min_max(values)
+        for arg, iv in zip(args, int_vars):
+            arg._attrs["int_var"] = iv
+            assert not arg.is_a_const_num(), f"{arg} cannot be constant"
+        self._attrs["args"] = args
+        self._attrs["inputs"] = list(args)
+        self._set_depth()
+        output = IntVarTensor(dim, src_ops={self})
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self):
+        return {"func_enum": self._attrs["func"]}
+
+    def _args_for_pseudo_code(self):
+        return [self._attrs["func"]]
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
+
+
+def _int_elementwise_func(func_enum: FuncEnum, *args: IntVarTensor) -> Tensor:
+    return int_elementwise(func_enum)(*args)
+
+
+# Initialize OP_REGISTRY so that Tensor built-in functions can use.
+for name, func_enum in FuncEnum.__members__.items():
+    OP_REGISTRY["INT_" + name] = functools.partial(_int_elementwise_func, func_enum)
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index f8cddfe19..8c6e76cac 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -85,3 +85,19 @@ def silu(tensor: Any) -> Tensor:
 
 def nan_to_num(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("NAN_TO_NUM")(tensor)
+
+
+def pow(*args, **kwargs) -> Tensor:
+    return OP_REGISTRY.get("POW")(*args, **kwargs)
+
+
+def gelu(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("GELU")(tensor)
+
+
+def fast_gelu(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("FASTGELU")(tensor)
+
+
+def softplus(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("SOFTPLUS")(tensor)
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index bab845786..5fe2e564c 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -26,8 +26,8 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.utils.shape_utils import convert_shape_to_IntVar
 
-from ....utils.shape_utils import convert_shape_to_IntVar
 from ....utils.tensor_utils import wrap_dim
 
 
@@ -66,6 +66,16 @@
     lstrip_blocks=True,
 )
 
+DYNAMIC_RESHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{% for idx in range(input_ndim) %}
+{{indent}}*out_{{idx}} = *in_{{idx}};
+{% endfor %}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
 SQUEEZE_FUNC_TEMPLATE = jinja2.Template(
     """
 {% for idx in range(output_ndim) %}
@@ -109,6 +119,7 @@ def make_output_shape(
         self,
         y_shape_values: List[Union[List[int], int]],
         dynamic_dim: IntVar = None,
+        is_intvar_tensor: bool = False,
     ) -> List[IntVar]:
         """
         Make the output shape from the output shape values.
@@ -118,11 +129,12 @@ def make_output_shape(
             if len(values) == 1:
                 output_shape.append(IntImm(values[0]))
             else:
-                assert (
-                    self._attrs["unknown_idx"] == -1
-                ), f"{self._attrs['op']} doesn't support multiple dynamic dims, "
-                "got {idx} and {self._attrs['unknown_idx']}"
-                self._attrs["unknown_idx"] = idx
+                if not is_intvar_tensor:
+                    assert (
+                        self._attrs["unknown_idx"] == -1
+                    ), f"{self._attrs['op']} doesn't support multiple dynamic dims, "
+                    "got {idx} and {self._attrs['unknown_idx']}"
+                    self._attrs["unknown_idx"] = idx
                 output_shape.append(
                     dynamic_dim if dynamic_dim is not None else IntVar(values=values)
                 )
@@ -150,8 +162,7 @@ def _is_dynamic_dim_reused(x_shape_values, y_shape_values) -> bool:
 class reshape(_reshape_base):
     """
     Returns a tensor with the same data and number of elements as input, but with the
-    specified shape. Inputs must be contiguous. It always returns a tensor view which
-    shares the same underlying data as the input.
+    specified shape. Inputs must be contiguous.
 
     A single dimension may be -1, in which case it’s inferred from the remaining
     dimensions and the number of elements in input.
@@ -162,6 +173,7 @@ def __init__(self) -> None:
         super().__init__()
         self._attrs["op"] = "reshape"
         self.shape_eval_template = RESHAPE_FUNC_TEMPLATE
+        self.dynamic_eval_template = DYNAMIC_RESHAPE_FUNC_TEMPLATE
 
     def _infer_shape(self, x: Tuple[int], shape: Tuple[int]):
         new_shape = list(shape)
@@ -197,46 +209,59 @@ def _infer_shape(self, x: Tuple[int], shape: Tuple[int]):
         return new_shape
 
     def _infer_shapes(self, x: Tensor):
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_dynamic_dims = [
-            var for var in x._attrs["shape"] if 1 < len(var._attrs["values"])
-        ]
-        x_shapes = list(itertools.product(*x_shape_values))
-
-        new_shape_vals = [var._attrs["values"] for var in self._attrs["shape"]]
-        new_shapes = list(itertools.product(*new_shape_vals))
-
-        # len(x_shapes) > 1 means that at least 1 dim in the shapes of x is dynamic.
-        # len(new_shapes) > 1 means that the dynamic dim is retained; otherwise, it would
-        # have been replaced with -1 or a concrete number.
-        if len(x_shapes) > len(new_shapes):
-            # we only support two cases here, when len(x_shapes) > 1, len(x_shapes) must
-            # be either len(new_shapes) (the dynamic dim is retained) or 1 (use -1 to
-            # mark the dynamic or unknown index and no other dim is dynamic).
-            assert len(new_shapes) == 1
-            new_shapes = new_shapes * len(x_shapes)
-
-        # run infershape for each
-        y_shapes = [
-            self._infer_shape(x_shape, new_shape)
-            for x_shape, new_shape in zip(x_shapes, new_shapes)
-        ]
-
-        def unique(vector):
-            return sorted(set(vector))
-
-        y_shape_values = list(map(unique, zip(*y_shapes)))
-        reuse_dynamic_dim = _is_dynamic_dim_reused(x_shape_values, y_shape_values)
-
-        return self.make_output_shape(
-            y_shape_values,
-            dynamic_dim=x_dynamic_dims[0] if reuse_dynamic_dim else None,
-        )
+        # There are two cases:
+        # 1) there is only one unknown shape.
+        # 2) there is no unkown shape and all shape dimensions are represented as IntVarTensor
+        # For 1), the view op will deduce the shape of if one dim is labeled as -1,
+        #         but it can't do so with more than 1 dynamic dimension
+        # For 2), when all dynamic shapes are known, we should be able to pass the input shape to out.
+        #         i.e. we should skip the deduction when all shapes are known.
+        is_intvar = all([isinstance(var, IntVarTensor) for var in self._attrs["shape"]])
+        self._attrs["is_intvar"] = is_intvar
+        if not is_intvar:
+            x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+            x_dynamic_dims = [
+                var for var in x._attrs["shape"] if 1 < len(var._attrs["values"])
+            ]
+            x_shapes = list(itertools.product(*x_shape_values))
+
+            self._attrs["shape"] = convert_shape_to_IntVar(self._attrs["shape"])
+            new_shape_vals = [var._attrs["values"] for var in self._attrs["shape"]]
+            new_shapes = list(itertools.product(*new_shape_vals))
+
+            # len(x_shapes) > 1 means that at least 1 dim in the shapes of x is dynamic.
+            # len(new_shapes) > 1 means that the dynamic dim is retained; otherwise, it would
+            # have been replaced with -1 or a concrete number.
+            if len(x_shapes) > len(new_shapes):
+                # we only support two cases here, when len(x_shapes) > 1, len(x_shapes) must
+                # be either len(new_shapes) (the dynamic dim is retained) or 1 (use -1 to
+                # mark the dynamic or unknown index and no other dim is dynamic).
+                assert len(new_shapes) == 1
+                new_shapes = new_shapes * len(x_shapes)
+            # run infershape for each
+            y_shapes = [
+                self._infer_shape(x_shape, new_shape)
+                for x_shape, new_shape in zip(x_shapes, new_shapes)
+            ]
+
+            def unique(vector):
+                return sorted(set(vector))
+
+            y_shape_values = list(map(unique, zip(*y_shapes)))
+            reuse_dynamic_dim = _is_dynamic_dim_reused(x_shape_values, y_shape_values)
+            return self.make_output_shape(
+                y_shape_values,
+                dynamic_dim=x_dynamic_dims[0] if reuse_dynamic_dim else None,
+            )
+        else:
+            new_shape_vals = [
+                shape._attrs["int_var"]._attrs["values"]
+                for shape in self._attrs["shape"]
+            ]
+            return self.make_output_shape(new_shape_vals, is_intvar_tensor=True)
 
     def __call__(self, x: Tensor, shape: List[Any]) -> Tensor:
-        self._attrs["shape"] = convert_shape_to_IntVar(
-            [shape] if isinstance(shape, int) else shape
-        )
+        self._attrs["shape"] = shape
         self._attrs["inputs"] = [x]
         for s in shape:
             if isinstance(s, IntVarTensor):
@@ -249,12 +274,20 @@ def __call__(self, x: Tensor, shape: List[Any]) -> Tensor:
         return output
 
     def gen_function(self) -> str:
+        # There are two cases:
+        # 1) there is only one unknown shape.
+        # 2) there is no unkown shape and all shape dimensions are represented as IntVarTensor
+        # For 1), at implementation, the uknown dimension = X.flatten()/(*known_out_shape)
+        # For 2), when all dynamic shapes are intVarTensor, output_shape = input_shape.
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        return func(self._attrs, self.shape_eval_template)
+        if self._attrs["is_intvar"]:
+            return func(self._attrs, self.dynamic_eval_template)
+        else:
+            return func(self._attrs, self.shape_eval_template)
 
     def _inputs_for_pseudo_code(self):
         return [
@@ -268,8 +301,6 @@ class flatten(_reshape_base):
     Flattens input by reshaping it into a one-dimensional tensor. If start_dim or end_dim
     are passed, only dimensions starting with start_dim and ending with end_dim are
     flattened. The order of elements in input is unchanged.
-
-    It always returns a tensor view.
     """
 
     def __init__(self, start_dim=0, end_dim=-1) -> None:
@@ -343,6 +374,9 @@ def __call__(self, x: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {"start_dim": self._attrs["start"], "end_dim": self._attrs["end"]}
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
@@ -439,6 +473,9 @@ def __call__(self, x: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {"dim": self._attrs["dim"]}
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
diff --git a/python/aitemplate/compiler/ops/conv/__init__.py b/python/aitemplate/compiler/ops/conv/__init__.py
index 3671a2308..b99bb893c 100644
--- a/python/aitemplate/compiler/ops/conv/__init__.py
+++ b/python/aitemplate/compiler/ops/conv/__init__.py
@@ -27,6 +27,8 @@
 from .conv2d_bias_relu import conv2d_bias_relu
 from .conv2d_bias_relu_few_channels import conv2d_bias_relu_few_channels
 from .conv2d_bias_sigmoid import conv2d_bias_sigmoid
+from .conv3d import conv3d
+from .depthwise_conv3d import depthwise_conv3d
 from .transposed_conv2d import transposed_conv2d
 from .transposed_conv2d_bias import transposed_conv2d_bias
 from .transposed_conv2d_bias_relu import transposed_conv2d_bias_relu
diff --git a/python/aitemplate/compiler/ops/conv/cache_entry.py b/python/aitemplate/compiler/ops/conv/cache_entry.py
index 8e7b989e9..5f08fe215 100644
--- a/python/aitemplate/compiler/ops/conv/cache_entry.py
+++ b/python/aitemplate/compiler/ops/conv/cache_entry.py
@@ -69,3 +69,68 @@ class ConvRecordEntry:
     algo: str
     workspace: int
     split_k: int
+
+
+@dataclass
+class Conv3dQueryEntry:
+    """Query Entry"""
+
+    dtype_a: int
+    dtype_b: int
+    dtype_c: int
+    dtype_acc: int
+    major_a: int
+    major_b: int
+    major_c: int
+    kd: int
+    kh: int
+    kw: int
+    co: int
+    stride_d: int
+    stride_h: int
+    stride_w: int
+    pad_d: int
+    pad_h: int
+    pad_w: int
+    dilate_d: int
+    dilate_h: int
+    dilate_w: int
+    op_type: str
+    device: str
+    epilogue: int
+    split_k: int
+    exec_entry_sha1: str
+
+
+@dataclass
+class Conv3dRecordEntry:
+    """Record Entry"""
+
+    exec_entry: str
+    exec_entry_sha1: str
+    dtype_a: int
+    dtype_b: int
+    dtype_c: int
+    dtype_acc: int
+    major_a: int
+    major_b: int
+    major_c: int
+    kd: int
+    kh: int
+    kw: int
+    co: int
+    stride_d: int
+    stride_h: int
+    stride_w: int
+    pad_d: int
+    pad_h: int
+    pad_w: int
+    dilate_d: int
+    dilate_h: int
+    dilate_w: int
+    op_type: str
+    epilogue: int
+    device: str
+    algo: str
+    workspace: int
+    split_k: int
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index dfedba010..5c8c8537c 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -71,6 +71,12 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor):
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        attrs = super()._get_op_attributes()
+        attrs.update({"activation": self._attrs["op"].split("_")[-1]})
+
+        return attrs
+
     @staticmethod
     def is_valid_inputs(x: Tensor, w: Tensor, b: Tensor) -> Tuple[bool, str]:
         x_shape = x._attrs["shape"]
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
index 30c83471b..6a38e3e4c 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
@@ -72,3 +72,9 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor, r: Tensor):
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
         return output
+
+    def _get_op_attributes(self):
+        attrs = super()._get_op_attributes()
+        attrs.update({"activation": self._attrs["op"].split("_")[-1]})
+
+        return attrs
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index f77b863ee..59f96c467 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -20,6 +20,7 @@
 import re
 from collections import OrderedDict
 from hashlib import sha1
+from operator import itemgetter
 from typing import Any, Dict, List
 
 import jinja2
@@ -329,7 +330,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir, self.shape_eval_template)
+        return func(self._attrs, workdir, self.shape_eval_template)
 
     def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         exe_path = os.path.join(profiler_prefix, cfg)
@@ -411,14 +412,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
 
         runner.join()
         result = runner.pull()
-
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        best_algo = out[0][0]
-        workspace = out[0][1].workspace
+        out = min(result, key=itemgetter(1))
+        best_algo = out[0]
+        workspace = out[1].workspace
         ## cache
         cache_record = ConvRecordEntry(
             exec_entry=exec_key,
@@ -456,6 +456,10 @@ def profile(
             devices = [0]
         self._profile_static(workdir, devices)
 
+        target = backend.target.Target.current()
+        if target.use_dummy_profiling_results():
+            return
+
         has_dynamic = False
         for input_tensor in self._attrs["inputs"]:
             for dim in input_tensor._attrs["shape"]:
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias.py b/python/aitemplate/compiler/ops/conv/conv2d_bias.py
index 2102ffaa5..9628df362 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias.py
@@ -72,3 +72,13 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         super().__init__("identity", stride, pad, dilate=dilate, group=group)
         self._attrs["op"] = "conv2d_bias"
         self._attrs["epilogue"] = "LinearCombination"
+
+    def _get_op_attributes(self):
+        target_attrs = ["dilate", "group", "pad", "stride"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
index 4581e4a7a..39f7c2a95 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
@@ -75,3 +75,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
             Number of input channels to process to compute one output channel, by default 1
         """
         super().__init__("identity", stride, pad, dilate=dilate, group=group)
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
index b3632f617..b8f224a7a 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
@@ -74,3 +74,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
             Number of input channels to process to compute one output channel, by default 1
         """
         super().__init__("hardswish", stride, pad, dilate=dilate, group=group)
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
index 3483c2d16..c118716a6 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
@@ -75,3 +75,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
             Number of input channels to process to compute one output channel, by default 1
         """
         super().__init__("relu", stride, pad, dilate=dilate, group=group)
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
index 6b61f1b80..1328d5f53 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
@@ -31,11 +31,7 @@ def __init__(self, stride, pad, dilate=1, auto_padding=True) -> None:
         self._attrs["epilogue"] = "LinearCombination"
 
     def _get_op_attributes(self):
-        target_attrs = ["dilate", "pad", "stride"]
-        attr = {}
-
-        for target_attr in target_attrs:
-            if target_attr in self._attrs:
-                attr[target_attr] = self._attrs[target_attr]
+        attr = super()._get_op_attributes()
+        del attr["activation"]
 
         return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
index d98a510d0..b36039cb3 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
@@ -70,3 +70,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         """
         super().__init__("hardswish", stride, pad, dilate=dilate, group=group)
         self._attrs["epilogue"] = "LinearCombinationHardSwish"
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
index 9f9e4b31c..104bf7ef1 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
@@ -27,3 +27,9 @@ class conv2d_bias_hardswish_few_channels(special_conv2d_bias_activation):
     def __init__(self, stride, pad, dilate=1, auto_padding=True) -> None:
         """Initializes conv2d_bias_relu_few_channels"""
         super().__init__("hardswish", stride, pad, dilate, auto_padding)
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
index 13c2b6cfe..b8fdf7d75 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
@@ -69,3 +69,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
             Number of input channels to process to compute one output channel, by default 1
         """
         super().__init__("relu", stride, pad, dilate=dilate, group=group)
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
index 6f2c2c6d8..b4d5f9594 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
@@ -29,11 +29,7 @@ def __init__(self, stride, pad, dilate=1, auto_padding=True) -> None:
         super().__init__("relu", stride, pad, dilate, auto_padding)
 
     def _get_op_attributes(self):
-        target_attrs = ["dilate", "pad", "stride"]
-        attr = {}
-
-        for target_attr in target_attrs:
-            if target_attr in self._attrs:
-                attr[target_attr] = self._attrs[target_attr]
+        attr = super()._get_op_attributes()
+        del attr["activation"]
 
         return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
index 75adc9919..521fd642d 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
@@ -70,3 +70,9 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         """
         super().__init__("sigmoid", stride, pad, dilate=dilate, group=group)
         self._attrs["epilogue"] = "LinearCombinationSigmoid"
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        del attr["activation"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
new file mode 100644
index 000000000..37cd6f7c7
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -0,0 +1,623 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Base class for conv3d.
+"""
+import itertools
+import os
+import re
+from collections import OrderedDict
+from hashlib import sha1
+from operator import itemgetter
+from typing import Any, Dict, List
+
+import jinja2
+
+from .... import backend
+from ....backend import registry
+from ....backend.target import Target
+from ....utils import logger, shape_utils
+from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
+from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
+
+# pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
+
+
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}DI = {{x_dim1}};
+{{indent}}{{dtype}}HI = {{x_dim2}};
+{{indent}}{{dtype}}WI = {{x_dim3}};
+{{indent}}{{dtype}}CI = {{x_dim4}};
+{{indent}}{{dtype}}CO = {{w_dim0}};
+{{indent}}{{dtype}}KD = {{w_dim1}};
+{{indent}}{{dtype}}KH = {{w_dim2}};
+{{indent}}{{dtype}}KW = {{w_dim3}};
+{{indent}}{{dtype}}SD = {{stride_d}};
+{{indent}}{{dtype}}SH = {{stride_h}};
+{{indent}}{{dtype}}SW = {{stride_w}};
+{{indent}}{{dtype}}DD = {{dilate_d}};
+{{indent}}{{dtype}}DH = {{dilate_h}};
+{{indent}}{{dtype}}DW = {{dilate_w}};
+{{indent}}{{dtype}}PD = {{pad_d}};
+{{indent}}{{dtype}}PH = {{pad_h}};
+{{indent}}{{dtype}}PW = {{pad_w}};
+{{indent}}{{dtype}}KDEff = (KD - 1) * DD + 1;
+{{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
+{{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}DO = (DI + PD + PD - KDEff) {{div}} SD + 1;
+{{indent}}{{dtype}}HO = (HI + PH + PH - KHEff) {{div}} SH + 1;
+{{indent}}{{dtype}}WO = (WI + PW + PW - KWEff) {{div}} SW + 1;
+"""
+)
+
+SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{y_dim0}} = NO;
+{{indent}}{{y_dim1}} = DO;
+{{indent}}{{y_dim2}} = HO;
+{{indent}}{{y_dim3}} = WO;
+{{indent}}{{y_dim4}} = CO;
+"""
+)
+
+EXEC_KEY_TEMPLATE = jinja2.Template(
+    """
+NI == {{x_dim0}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
+    """
+NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_COND_TEMPLATE = jinja2.Template(
+    """
+{{indent}}if ({{cond}}) {
+{{indent}}  {{program}}
+{{indent}}}
+"""
+)
+
+
+class conv3d(Operator):
+    r"""conv3d"""
+
+    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+        """Conv3d constructor.
+
+        Parameters
+        ----------
+        stride : int or tuple
+            Stride of the convolution
+        pad : int or tuple
+            Size of padding to add to the input
+        dilate : int ot tuple, optional
+            Size of spacing between kernel elements, by default 1
+        group : int, optional
+           Number of blocked connections from input
+            channels to output channels, by default 1
+        """
+        super().__init__()
+        self._attrs["op"] = "conv3d"
+        self._attrs["stride"] = stride
+        if isinstance(stride, int):
+            self._attrs["stride"] = (stride, stride, stride)
+        self._attrs["pad"] = pad
+        if isinstance(pad, int):
+            self._attrs["pad"] = (pad, pad, pad)
+        self._attrs["dilate"] = dilate
+        if isinstance(dilate, int):
+            self._attrs["dilate"] = (dilate, dilate, dilate)
+        self._attrs["group"] = group
+        self._attrs["has_profiler"] = True
+        self._attrs["epilogue_alignment"] = 1
+        self._attrs["epilogue"] = "LinearCombination"
+        self._attrs["workspace"] = 0
+        self._attrs["split_k"] = None
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
+        self.exec_key_template = EXEC_KEY_TEMPLATE
+        self.exec_dyn_key_template = EXEC_DYN_KEY_TEMPLATE
+        self.exec_cond_template = EXEC_COND_TEMPLATE
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if x[4] != w[4] * self._attrs["group"]:
+            raise RuntimeError("X/W Shape mismatch for conv3d")
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+            x_dim4=x[4],
+            w_dim0=w[0],
+            w_dim1=w[1],
+            w_dim2=w[2],
+            w_dim3=w[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["DO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor, w: Tensor) -> List[int]:
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
+        self._attrs["CO"] = w_shape[0]
+        self._attrs["KD"] = w_shape[1]
+        self._attrs["KH"] = w_shape[2]
+        self._attrs["KW"] = w_shape[3]
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape, w_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[4] for d in y_shapes])),
+        ]
+        return output_shape
+
+    def _invert_exec_key(self, key):
+        tmp = re.findall(r"(\d+)", key)
+        return [int(x) for x in tmp]
+
+    def _gen_exec_key(self, shape: List[int]):
+        return self.exec_key_template.render(
+            x_dim0=shape[0],
+            x_dim1=shape[1],
+            x_dim2=shape[2],
+            x_dim3=shape[3],
+            x_dim4=shape[4],
+        ).replace("\n", "")
+
+    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3):
+        return self.exec_dyn_key_template.render(
+            x_dim0_lb=dim0_lb, x_dim0_ub=dim0_ub, x_dim1=dim1, x_dim2=dim2, x_dim3=dim3
+        ).replace("\n", "")
+
+    def _extract_exec_path(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        self._attrs["exec_path"] = OrderedDict()
+        for x_shape in x_shapes:
+            key = self._gen_exec_key(x_shape)
+            self._attrs["exec_path"][key] = ""
+
+    def _signature(self):
+        signature = "conv3d: K=[{kd}, {kh}, {kw}], S=[{sd}, {sh}, {sw}], P=[{pd}, {ph}, {pw}], CO=[{co}]".format(
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            sd=self._attrs["stride"][0],
+            sh=self._attrs["stride"][1],
+            sw=self._attrs["stride"][2],
+            pd=self._attrs["pad"][0],
+            ph=self._attrs["pad"][1],
+            pw=self._attrs["pad"][2],
+            co=self._attrs["CO"],
+        )
+        return signature
+
+    def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
+        epilogue_dim = output_shape[-1]
+        if not isinstance(epilogue_dim, IntImm):
+            raise RuntimeError("Conv output last dimension must be static!")
+        shape = epilogue_dim._attrs["values"][0]
+        if shape % 8 == 0:
+            self._attrs["epilogue_alignment"] = 8
+        elif shape % 4 == 0:
+            self._attrs["epilogue_alignment"] = 4
+        elif shape % 2 == 0:
+            self._attrs["epilogue_alignment"] = 2
+
+    def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
+        """Call conv3d with tensors x, w
+
+        Parameters
+        ----------
+        x : Tensor
+            in shape (N, D, H, W, C_in)
+        w : Tensor
+            in shape (C_out, K_d, K_h, K_w, C_in)
+
+        Returns
+        -------
+        List[Tensor]
+            includes the output tensor in shape (N, D_out, H_out, W_out, C_out)
+        """
+        self._attrs["inputs"] = [x, w]
+        self._set_depth()
+        output_shape = self._infer_shapes(x, w)
+        self._extract_exec_path(x)
+        self._extract_epilogue_alignment(output_shape)
+        output = Tensor(output_shape, src_ops={self})
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self) -> Dict[str, Any]:
+        target_attrs = ["dilate", "group", "pad", "stride"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def gen_profiler(
+        self,
+        workdir: str = None,
+        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+    ) -> None:
+        """Profiler generator.
+
+        Parameters
+        ----------
+        workdir : str, optional, by default None
+        dynamic_profiling_strategy: DynamicProfileStrategy, optional
+            A dynamic profiling strategy, used to filter generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+        """
+        target = backend.target.Target.current()
+
+        func_key = "{target}.{op}.config".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        func(self._attrs)
+        func_key = "{target}.{op}.gen_profiler".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs, workdir, self.shape_eval_template)
+
+    def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
+        exe_path = os.path.join(profiler_prefix, cfg)
+        if not os.access(exe_path, os.X_OK):
+            raise RuntimeError("Profiler %s is not executable" % exe_path)
+        cmd = [exe_path]
+        cmd.append(x_shape[0])
+        cmd.append(x_shape[1])
+        cmd.append(x_shape[2])
+        cmd.append(x_shape[3])
+        cmd.append(x_shape[4])
+        cmd.append(self._attrs["KD"])
+        cmd.append(self._attrs["KH"])
+        cmd.append(self._attrs["KW"])
+        cmd.append(self._attrs["CO"])
+        cmd.append(self._attrs["stride"][0])
+        cmd.append(self._attrs["stride"][1])
+        cmd.append(self._attrs["stride"][2])
+        cmd.append(self._attrs["pad"][0])
+        cmd.append(self._attrs["pad"][1])
+        cmd.append(self._attrs["pad"][2])
+        cmd.append(self._attrs["dilate"][0])
+        cmd.append(self._attrs["dilate"][1])
+        cmd.append(self._attrs["dilate"][2])
+        cmd.append(self._attrs["group"])
+        command = [str(x) for x in cmd]
+        return command
+
+    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+        target = backend.target.Target.current()
+        # if in CI just choose minimal configs
+        # workspace is a hack just provides 102400 Byte
+        if target.use_dummy_profiling_results():
+            algo = target.select_minimal_algo(list(self._attrs["op_instance"].keys()))
+            logger.info(__name__, f"Select minimal algo {algo} for CI")
+            return (algo, 102400)
+        # query cache
+        tmp_key = next(iter(self._attrs["op_instance"].keys()))
+        tmp_op = self._attrs["op_instance"][tmp_key]
+        exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
+        split_k = 1 if self._attrs["split_k"] is None else self._attrs["split_k"]
+        query = Conv3dQueryEntry(
+            dtype_a=tmp_op.A.element.value,
+            dtype_b=tmp_op.B.element.value,
+            dtype_c=tmp_op.C.element.value,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            major_a=tmp_op.A.layout.value,
+            major_b=tmp_op.B.layout.value,
+            major_c=tmp_op.C.layout.value,
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            co=self._attrs["CO"],
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            op_type=self._attrs["op"],
+            device=target._arch,
+            epilogue=tmp_op.epilogue_functor.value,
+            split_k=split_k,
+            exec_entry_sha1=exec_entry_sha1,
+        )
+        cache_value = target.query_profile_cache("conv3d", query.__dict__)
+        if cache_value is not None and not target.force_profile():
+            logger.info(__name__, "Load profiling result from cache.")
+            return cache_value
+        if target.use_dummy_profiling_results():
+            op_type = self._attrs["op"]
+            raise Exception(
+                "This is a CI run but we could not find the following cache ",
+                f"available on device {target._arch}\n",
+                f"{op_type} {exec_entry_sha1}.\n",
+                "To bypass, you need to make it available in the db table.",
+            )
+
+        func_key = "{target}.{op}.filter".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        content = list(self._attrs["op_instance"].keys())
+        runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
+        x_shape = self._invert_exec_key(exec_key)
+        for cfg in content:
+            if not func(cfg, self._attrs, x_shape):
+                continue
+            command = self._gen_profile_cmd(profiler_prefix, cfg, x_shape)
+            runner.push(cfg, command)
+
+        runner.join()
+        result = runner.pull()
+        if len(result) == 0:
+            raise RuntimeError(
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
+            )
+        out = min(result, key=itemgetter(1))
+        best_algo = out[0]
+        workspace = out[1].workspace
+        ## cache
+        cache_record = Conv3dRecordEntry(
+            exec_entry=exec_key,
+            exec_entry_sha1=exec_entry_sha1,
+            dtype_a=tmp_op.A.element.value,
+            dtype_b=tmp_op.B.element.value,
+            dtype_c=tmp_op.C.element.value,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            major_a=tmp_op.A.layout.value,
+            major_b=tmp_op.B.layout.value,
+            major_c=tmp_op.C.layout.value,
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            co=self._attrs["CO"],
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            op_type=self._attrs["op"],
+            epilogue=tmp_op.epilogue_functor.value,
+            device=target._arch,
+            algo=best_algo,
+            workspace=workspace,
+            split_k=split_k,  # todo add into profile
+        )
+        Target.current().insert_profile_cache("conv3d", cache_record.__dict__)
+        return (best_algo, workspace)
+
+    def profile(
+        self,
+        workdir="./",
+        devices=None,
+        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+    ):
+        if devices is None:
+            devices = [0]
+        self._profile_static(workdir, devices)
+
+        has_dynamic = False
+        for input_tensor in self._attrs["inputs"]:
+            for dim in input_tensor._attrs["shape"]:
+                if not isinstance(dim, IntImm):
+                    has_dynamic = True
+                    break
+        if has_dynamic:
+            if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
+                raise NotImplementedError(
+                    "conv3d only supports HINTS dynamic profiling strategy for now! Current strategy: {}".format(
+                        dynamic_profiling_strategy
+                    )
+                )
+            self._profile_dynamic_dim(workdir)
+
+    def _profile_static(self, workdir, devices):
+        """Profiles with static shapes."""
+
+        workloads = list(self._attrs["exec_path"].keys())
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        if "op_instance" not in self._attrs:
+            target = backend.target.Target.current()
+            # init candidate ops
+            func_key = "{target}.{op}.config".format(
+                target=target.name(), op=self._attrs["op"]
+            )
+            func = registry.get(func_key)
+            func(self._attrs)
+
+        for wkl in workloads:
+            logger.info(
+                __name__,
+                "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
+            )
+            best_algo, workspace = self._profile_single_workload(
+                profiler_prefix, wkl, devices
+            )
+            self._attrs["exec_path"][wkl] = best_algo
+            self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+
+    def _profile_dynamic_dim(self, workdir):
+        """Profiles with dynamic shapes."""
+
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
+        # extract dynamic dim from exec_path
+        if len(self._attrs["exec_path"]) <= 1:
+            return
+
+        def _extract_dynamic_dim(exec_keys):
+            logger.info(__name__, "ONLY SUPPORT DYNAMIC BATCH (dim0)!")
+            var_dims = [[], [], [], []]
+            for key in exec_keys:
+                dims = self._invert_exec_key(key)
+                for i, v in enumerate(dims):
+                    var_dims[i].append(v)
+            return var_dims
+
+        dims = _extract_dynamic_dim(self._attrs["exec_path"].keys())
+        dim1 = dims[1][0]
+        dim2 = dims[2][0]
+        dim3 = dims[3][0]
+        algos = list(self._attrs["exec_path"].values())
+        # generate region
+        regions = []  # lb, ub, lb_algos, ub_algos
+        for i in range(len(dims[0]) - 1):
+            regions.append([dims[0][i], dims[0][i + 1], algos[i], algos[i + 1]])
+        # for each region,
+        #   binary search to find cutting point
+        #   generate new exec
+        special_cases = OrderedDict()
+        new_exec_paths = OrderedDict()
+        for lb, ub, lb_algo, ub_algo in regions:
+            mid = (lb + ub) // 2
+            origin_lb = lb
+            origin_ub = ub
+            last_mid = mid
+            while mid > lb and mid < ub:
+                mid = (lb + ub) // 2
+                mid_shape = [mid, dim1, dim2, dim3]
+                logger.info(
+                    __name__,
+                    "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
+                        lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
+                    ),
+                )
+
+                mid_lb_algo_cmd = self._gen_profile_cmd(
+                    profiler_prefix, str(lb_algo), mid_shape
+                )
+                mid_ub_algo_cmd = self._gen_profile_cmd(
+                    profiler_prefix, str(ub_algo), mid_shape
+                )
+                runner.push(0, mid_lb_algo_cmd)
+                runner.push(1, mid_ub_algo_cmd)
+                runner.join()
+                result = runner.pull()
+                assert len(result) >= 1
+                # if there is only one result, assume ub algo failed.
+                if len(result) == 1:
+                    assert result[0][0] == 0
+                    # last_lb = lb
+                    lb = mid + 1
+                # if there are two result, compare to decide new lb/ub
+                else:
+                    lb_time = result[0][1]
+                    ub_time = result[1][1]
+                    if lb_time < ub_time:
+                        # lb algo can work with larger batch
+                        # last_lb = lb
+                        lb = mid + 1
+                    else:
+                        # ub algo can work with smaller batch
+                        # last_ub = ub
+                        ub = mid - 1
+                last_mid = mid
+                mid = (lb + ub) // 2
+            lo_region_key = self._gen_dyn_exec_key(
+                origin_lb, last_mid, dim1, dim2, dim3
+            )
+            up_region_key = self._gen_dyn_exec_key(
+                last_mid, origin_ub, dim1, dim2, dim3
+            )
+            new_exec_paths[lo_region_key] = lb_algo
+            new_exec_paths[up_region_key] = ub_algo
+            # find special cases
+            # This code is kept in case need fully tested dynamic code
+            # So far I find binary search works well.
+            # def _find_special_case(lb, ub, algo):
+            #     for i in range(lb + 1, ub + 1):
+            #         x_shape = [i, dim1, dim2, dim3]
+            #         cmd = self._gen_profile_cmd(profiler_prefix, str(algo), x_shape)
+            #         runner.push(0, cmd)
+            #         runner.join()
+            #         out = runner.pull()
+            #         if len(out) == 0:
+            #             logger.info(self._attrs["name"], "Find specail case: batch=%d" % i)
+            #             algo = self._profile_single_workload(profiler_prefix, x_shape, [0])
+            #             special_cases[self._gen_exec_key(x_shape)] = algo
+
+            # logger.info(self._attrs["name"],
+            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=origin_lb,
+            #         ub=last_mid))
+            # _find_special_case(origin_lb, last_mid, lb_algo)
+            # logger.info(self._attrs["name"],
+            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=last_mid + 1,
+            #         ub=origin_ub))
+            # _find_special_case(last_mid, origin_ub, ub_algo)
+        special_cases.update(new_exec_paths)
+        self._attrs["exec_path"] = special_cases
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        op_name = self._attrs["op"]
+        func_key = "{target}.{op}.gen_function".format(target=target.name(), op=op_name)
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
+            self.exec_cond_template,
+            self.shape_eval_template,
+            self.shape_save_template,
+        )
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
new file mode 100644
index 000000000..4d37a5334
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -0,0 +1,290 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Base class for depthwise_conv3d.
+"""
+import itertools
+import re
+from collections import OrderedDict
+from typing import Any, Dict, List
+
+import jinja2
+
+from .... import backend
+from ....backend import registry
+from ....utils import shape_utils
+from ...base import IntImm, IntVar, Operator, Tensor
+
+# pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
+
+
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}TI = {{x_dim1}};
+{{indent}}{{dtype}}HI = {{x_dim2}};
+{{indent}}{{dtype}}WI = {{x_dim3}};
+{{indent}}{{dtype}}CI = {{x_dim4}};
+{{indent}}{{dtype}}CO = {{w_dim0}};
+{{indent}}{{dtype}}KT = {{w_dim1}};
+{{indent}}{{dtype}}KH = {{w_dim2}};
+{{indent}}{{dtype}}KW = {{w_dim3}};
+{{indent}}{{dtype}}ST = {{stride_t}};
+{{indent}}{{dtype}}SH = {{stride_h}};
+{{indent}}{{dtype}}SW = {{stride_w}};
+{{indent}}{{dtype}}DT = {{dilate_t}};
+{{indent}}{{dtype}}DH = {{dilate_h}};
+{{indent}}{{dtype}}DW = {{dilate_w}};
+{{indent}}{{dtype}}PT = {{pad_t}};
+{{indent}}{{dtype}}PH = {{pad_h}};
+{{indent}}{{dtype}}PW = {{pad_w}};
+{{indent}}{{dtype}}KTEff = (KT - 1) * DT + 1;
+{{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
+{{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}TO = (TI + PT + PT - KTEff) {{div}} ST + 1;
+{{indent}}{{dtype}}HO = (HI + PH + PH - KHEff) {{div}} SH + 1;
+{{indent}}{{dtype}}WO = (WI + PW + PW - KWEff) {{div}} SW + 1;
+"""
+)
+
+SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{y_dim0}} = NO;
+{{indent}}{{y_dim1}} = TO;
+{{indent}}{{y_dim2}} = HO;
+{{indent}}{{y_dim3}} = WO;
+{{indent}}{{y_dim4}} = CO;
+"""
+)
+
+EXEC_KEY_TEMPLATE = jinja2.Template(
+    """
+NI == {{x_dim0}} && TI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
+    """
+NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && TI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_COND_TEMPLATE = jinja2.Template(
+    """
+{{indent}}if ({{cond}}) {
+{{indent}}  {{program}}
+{{indent}}}
+"""
+)
+
+
+class depthwise_conv3d(Operator):
+    r"""depthwise_conv3d"""
+
+    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+        """Conv3d constructor.
+
+        Parameters
+        ----------
+        stride : int or tuple
+            Stride of the convolution
+        pad : int or tuple
+            Size of padding to add to the input
+        dilate : int or tuple, optional
+            Size of spacing between kernel elements, by default 1
+        group : int, optional
+           Number of blocked connections from input
+            channels to output channels, by default 1
+        """
+        super().__init__()
+        self._attrs["op"] = "depthwise_conv3d"
+        self._attrs["stride"] = stride
+        if isinstance(stride, int):
+            self._attrs["stride"] = (stride, stride, stride)
+        self._attrs["pad"] = pad
+        if isinstance(pad, int):
+            self._attrs["pad"] = (pad, pad, pad)
+        self._attrs["dilate"] = dilate
+        if isinstance(dilate, int):
+            self._attrs["dilate"] = (dilate, dilate, dilate)
+        self._attrs["group"] = group
+        self._attrs["has_profiler"] = False
+        self._attrs["epilogue_alignment"] = 1
+        self._attrs["epilogue"] = "LinearCombination"
+        self._attrs["workspace"] = 0
+        self._attrs["split_k"] = None
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
+        self.exec_key_template = EXEC_KEY_TEMPLATE
+        self.exec_dyn_key_template = EXEC_DYN_KEY_TEMPLATE
+        self.exec_cond_template = EXEC_COND_TEMPLATE
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if x[4] != w[0] or x[4] != self._attrs["group"]:
+            raise RuntimeError("Wrong inputs for depthwise_conv3d")
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            stride_t=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_t=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_t=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+            x_dim4=x[4],
+            w_dim0=w[0],
+            w_dim1=w[1],
+            w_dim2=w[2],
+            w_dim3=w[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["TO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor, w: Tensor) -> List[int]:
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
+        self._attrs["CO"] = w_shape[0]
+        self._attrs["KT"] = w_shape[1]
+        self._attrs["KH"] = w_shape[2]
+        self._attrs["KW"] = w_shape[3]
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape, w_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[4] for d in y_shapes])),
+        ]
+        return output_shape
+
+    def _invert_exec_key(self, key):
+        tmp = re.findall(r"(\d+)", key)
+        return [int(x) for x in tmp]
+
+    def _gen_exec_key(self, shape: List[int]):
+        return self.exec_key_template.render(
+            x_dim0=shape[0],
+            x_dim1=shape[1],
+            x_dim2=shape[2],
+            x_dim3=shape[3],
+            x_dim4=shape[4],
+        ).replace("\n", "")
+
+    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3):
+        return self.exec_dyn_key_template.render(
+            x_dim0_lb=dim0_lb, x_dim0_ub=dim0_ub, x_dim1=dim1, x_dim2=dim2, x_dim3=dim3
+        ).replace("\n", "")
+
+    def _extract_exec_path(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        self._attrs["exec_path"] = OrderedDict()
+        for x_shape in x_shapes:
+            key = self._gen_exec_key(x_shape)
+            self._attrs["exec_path"][key] = ""
+
+    def _signature(self):
+        signature = "depthwise_conv3d: K=[{kt}, {kh}, {kw}], S=[{st}, {sh}, {sw}], P=[{pt}, {ph}, {pw}], CO=[{co}]".format(
+            kt=self._attrs["KT"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            st=self._attrs["stride"][0],
+            sh=self._attrs["stride"][1],
+            sw=self._attrs["stride"][2],
+            pt=self._attrs["pad"][0],
+            ph=self._attrs["pad"][1],
+            pw=self._attrs["pad"][2],
+            co=self._attrs["CO"],
+        )
+        return signature
+
+    def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
+        epilogue_dim = output_shape[-1]
+        if not isinstance(epilogue_dim, IntImm):
+            raise RuntimeError("Conv output last dimension must be static!")
+        shape = epilogue_dim._attrs["values"][0]
+        if shape % 8 == 0:
+            self._attrs["epilogue_alignment"] = 8
+        elif shape % 4 == 0:
+            self._attrs["epilogue_alignment"] = 4
+        elif shape % 2 == 0:
+            self._attrs["epilogue_alignment"] = 2
+
+    def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
+        """Call depthwise_conv3d with tensors x, w
+
+        Parameters
+        ----------
+        x : Tensor
+            in shape (N, T, H, W, C_in)
+        w : Tensor
+            in shape (C_out, K_t, K_h, K_w, C_in)
+
+        Returns
+        -------
+        List[Tensor]
+            includes the output tensor in shape (N, T_out, H_out, W_out, C_out)
+        """
+        self._attrs["inputs"] = [x, w]
+        self._set_depth()
+        output_shape = self._infer_shapes(x, w)
+        self._extract_exec_path(x)
+        self._extract_epilogue_alignment(output_shape)
+        output = Tensor(output_shape, src_ops={self})
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self) -> Dict[str, Any]:
+        target_attrs = ["dilate", "group", "pad", "stride"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
index 025daae9d..52175059f 100644
--- a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
@@ -85,3 +85,16 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor):
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
         return output
+
+    def _get_op_attributes(self):
+        target_attrs = ["dilate", "pad", "stride"]
+        attr = {
+            "activation": self._attrs["op"].split("_")[-1],
+            "auto_padding": self._auto_padding,
+        }
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
index 5958fe5ae..74ea33f2c 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
@@ -13,8 +13,16 @@
 #  limitations under the License.
 #
 from .bmm_rcr_softmax import bmm_rcr_softmax
+from .dual_gemm_rcr_fast_gelu import dual_gemm_rcr_fast_gelu
+from .dual_gemm_rcr_silu import dual_gemm_rcr_silu
 from .gemm_rcr_bias_softmax import gemm_rcr_bias_softmax
 from .gemm_rcr_softmax import gemm_rcr_softmax
 
 
-__all__ = ["bmm_rcr_softmax", "gemm_rcr_bias_softmax", "gemm_rcr_softmax"]
+__all__ = [
+    "bmm_rcr_softmax",
+    "gemm_rcr_bias_softmax",
+    "gemm_rcr_softmax",
+    "dual_gemm_rcr_silu",
+    "dual_gemm_rcr_fast_gelu",
+]
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
new file mode 100644
index 000000000..9681fc348
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization: FAST_GELU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+"""
+from ...base import Tensor
+from ...tensor_accessor import TensorAccessor
+from ..gemm_universal.gemm_rcr import gemm_rcr
+
+# pylint: disable=C0103,W0223,W0221,W0613
+
+
+class dual_gemm_rcr_fast_gelu(gemm_rcr):
+    """GEMM Specialization: FAST_GELU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+
+    This operator is equivalent to the following pytorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+        A = torch.randn(M, K)
+        W = torch.randn(N, K)
+        B = torch.randn(N, K)
+        Y1 = torch.nn.functional.linear(A, W)
+        Y2 = torch.nn.functional.linear(A, B)
+        Y = torch.nn.functional.silu(Y_1) * Y_2
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "dual_gemm_rcr_fast_gelu"
+        self._attrs["epilogue2"] = "LeftFastGeluAndMul"
+
+    def _infer_shapes(self, a: Tensor, b: Tensor, bias: Tensor):
+        """Infers output shapes for gemm_rcr_bas.
+
+        Parameters
+        ----------
+        a : Tensor
+            Input tensor A.
+        b : Tensor
+            Input tensor B.
+        bias : Tensor
+            Input tensor bias. Must be a 1D vector.
+
+        Returns
+        -------
+        List[IntVar]
+            Output tensor shape.
+        """
+        return super()._infer_shapes(a, b)
+
+    def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
+        a, b = self._align_ab(a, b)
+        self._attrs["inputs"] = [a, b, bias]
+        self._attrs["input_accessors"] = [
+            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
+        ]
+        self._set_depth()
+        self._sanity_check(a, b)
+        output_shape = self._infer_shapes(a, b, bias)
+        self._extract_epilogue_alignment(output_shape)
+        output = Tensor(output_shape, src_ops={self})
+        self._attrs["outputs"] = [output]
+        self._attrs["output_accessors"] = [TensorAccessor(output)]
+        return output
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
new file mode 100644
index 000000000..576269722
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization: SILU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+"""
+from ...base import Tensor
+from ...tensor_accessor import TensorAccessor
+from ..gemm_universal.gemm_rcr import gemm_rcr
+
+# pylint: disable=C0103,W0223,W0221,W0613
+
+
+class dual_gemm_rcr_silu(gemm_rcr):
+    """GEMM Specialization: SILU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
+
+    This operator is equivalent to the following pytorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+        A = torch.randn(M, K)
+        W = torch.randn(N, K)
+        B = torch.randn(N, K)
+        Y1 = torch.nn.functional.linear(A, W)
+        Y2 = torch.nn.functional.linear(A, B)
+        Y = torch.nn.functional.silu(Y_1) * Y_2
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "dual_gemm_rcr_silu"
+        self._attrs["epilogue2"] = "LeftSiLUAndMul"
+
+    def _infer_shapes(self, a: Tensor, b: Tensor, bias: Tensor):
+        """Infers output shapes for gemm_rcr_bas.
+
+        Parameters
+        ----------
+        a : Tensor
+            Input tensor A.
+        b : Tensor
+            Input tensor B.
+        bias : Tensor
+            Input tensor bias. Must be a 1D vector.
+
+        Returns
+        -------
+        List[IntVar]
+            Output tensor shape.
+        """
+        return super()._infer_shapes(a, b)
+
+    def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
+        a, b = self._align_ab(a, b)
+        self._attrs["inputs"] = [a, b, bias]
+        self._attrs["input_accessors"] = [
+            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
+        ]
+        self._set_depth()
+        self._sanity_check(a, b)
+        output_shape = self._infer_shapes(a, b, bias)
+        self._extract_epilogue_alignment(output_shape)
+        output = Tensor(output_shape, src_ops={self})
+        self._attrs["outputs"] = [output]
+        self._attrs["output_accessors"] = [TensorAccessor(output)]
+        return output
diff --git a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
index a288d0576..33a68ec0f 100644
--- a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
@@ -45,7 +45,6 @@ def __init__(self):
     def is_valid_shape(a: Tensor, b: Tensor):
         """
         Check input a/b is valid for bmm_rcr_n1.
-
         Requirements:
             1) matching dimension of a/b (where a is row major, b is column major)
             2) dim N of b needs to be 1
diff --git a/python/aitemplate/compiler/ops/gemm_universal/__init__.py b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
index 32e4b839a..b9e2f8e5c 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
@@ -43,6 +43,7 @@
 from .gemm_rcr_bias_sigmoid_mul_tanh import gemm_rcr_bias_sigmoid_mul_tanh
 from .gemm_rcr_bias_swish import gemm_rcr_bias_swish
 from .gemm_rcr_bias_tanh import gemm_rcr_bias_tanh
+from .gemm_rcr_fast_gelu import gemm_rcr_fast_gelu
 from .gemm_rcr_permute import gemm_rcr_permute
 from .gemm_rrr import gemm_rrr
 from .gemm_rrr_bias import gemm_rrr_bias
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
index b1ab7dfb3..5bc4b4baa 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
@@ -36,7 +36,6 @@ class bmm_rcr_permute(bmm_rcr):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(B, M, K).cuda().half()
         W_pt = torch.randn(B, N, K).cuda().half()
 
@@ -89,8 +88,6 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        self._extract_epilogue_alignment(output_shape)
-
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -99,8 +96,15 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
             b, m, n = output_shape
             d1 = self._attrs["shape"][0]
             output_shape = [b.value() // d1, m, d1, n]
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
+
+    def _get_op_attributes(self):
+        return {
+            "layout": self._attrs["layout"].split("_")[-1],
+            "shape": tuple(map(int, self._attrs["permute_shape"].split("_"))),
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
index 4378d893f..5bba36489 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
@@ -36,7 +36,6 @@ class bmm_rrr_permute(bmm_rrr):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(B, M, K).cuda().half()
         W_pt = torch.randn(B, K, N).cuda().half()
 
@@ -88,7 +87,6 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        self._extract_epilogue_alignment(output_shape)
 
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
@@ -98,8 +96,15 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
             b, m, n = output_shape
             d1 = self._attrs["shape"][0]
             output_shape = [b.value() // d1, m, d1, n]
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
+
+    def _get_op_attributes(self):
+        return {
+            "layout": self._attrs["layout"].split("_")[-1],
+            "shape": tuple(map(int, self._attrs["permute_shape"].split("_"))),
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
index 3cd6986fd..5ec16955f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
@@ -157,3 +157,6 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
+
+    def _get_op_attributes(self):
+        return {"scale": self._attrs["scale"]}
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
index dfc6c7601..5beedb6c2 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
@@ -167,7 +167,7 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, b1)
-        self._extract_epilogue_alignment(output_shape)
+
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -176,9 +176,18 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
             b, m, o = output_shape
             d1 = self._attrs["shape"][0]
             output_shape = [b.value() // d1, m, d1, o]
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
         return output
+
+    def _get_op_attributes(self):
+        return {
+            "causal": self._attrs["op"] == "bmm_softmax_bmm_permute_causal",
+            "layout": self._attrs["layout"].split("_")[-1],
+            "scale": self._attrs["scale"],
+            "shape": self._attrs["shape"],
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index e59c1d008..d354af3bc 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -15,6 +15,7 @@
 """
 Common functions/classes for GEMM ops
 """
+import itertools
 import math
 import os
 import re
@@ -23,20 +24,27 @@
 from enum import Enum
 from hashlib import sha1
 from operator import itemgetter
-from typing import Dict, List, Union
+from typing import Any, Dict, List, Union
 
 import jinja2
 
+from aitemplate.backend.profiler_runner import ProfileResult
+
 from .... import backend
 from ....backend import registry
-from ....backend.target import Target
 from ....utils import logger
+from ....utils.alignment import find_max_alignment
 from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
 from ...tensor_accessor import TensorAccessor
 from .cache_entry import GemmQueryEntry, GemmRecordEntry
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
+
+def split_k_result_getter(result):
+    return result[1].duration
+
+
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
 {{indent}}if ({{cond}}) {
@@ -182,19 +190,28 @@ def __init__(
         self._attrs["permute_shape"] = ""
         self.exec_cond_template = EXEC_COND_TEMPLATE
 
-    def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
+    def _extract_epilogue_alignment(
+        self, output_shape: List[Any], dynamic_profiling_strategy=None
+    ) -> None:
         epilogue_dim = output_shape[-1]
-        if not isinstance(epilogue_dim, IntImm):
-            raise RuntimeError(
-                f"Gemm output last dimension must be static! gemm: {self._attrs}"
-            )
-        shape = epilogue_dim._attrs["values"][0]
-        if shape % 8 == 0:
-            self._attrs["epilogue_alignment"] = 8
-        elif shape % 4 == 0:
-            self._attrs["epilogue_alignment"] = 4
-        elif shape % 2 == 0:
-            self._attrs["epilogue_alignment"] = 2
+        if isinstance(epilogue_dim, int):
+            shape = epilogue_dim
+        elif not isinstance(epilogue_dim, IntImm):
+            # The alignment inferred here will be set to 1 during codegen.
+            if dynamic_profiling_strategy is None:
+                return
+            elif dynamic_profiling_strategy == DynamicProfileStrategy.MAX:
+                shape = epilogue_dim.upper_bound()
+            elif dynamic_profiling_strategy == DynamicProfileStrategy.MIN:
+                shape = epilogue_dim.lower_bound()
+            else:
+                raise RuntimeError(
+                    f"Unsupported dynamic profiling strategy: {dynamic_profiling_strategy}"
+                )
+        else:
+            shape = epilogue_dim._attrs["values"][0]
+
+        self._attrs["epilogue_alignment"] = find_max_alignment(shape)
         return
 
     def _infer_shapes(self, a: Tensor, b: Tensor):
@@ -358,6 +375,21 @@ def _extract_exec_path(self, dynamic_profiling_strategy):
                 )
             )
 
+    def _get_profiler_filename(self):
+        """
+        generate a filename for a profiler that benchmarks multiple GEMM instances
+        """
+        target = backend.target.Target.current()
+        op_type = self._attrs["op"]
+        all_op_names = list(self._attrs["op_instance"].keys())
+        encoded_str = sha1((";".join(all_op_names)).encode("utf-8")).hexdigest()
+        # we don't use cache
+        if target.use_dummy_profiling_results():
+            return f"{op_type}_{encoded_str}"
+        else:
+            cache_ver = target.get_profile_cache_version("gemm")
+            return f"{op_type}_{encoded_str}_{cache_ver}"
+
     def _should_build_profiler(
         self, workloads: List[str], new_op_instance: OrderedDict
     ):
@@ -439,6 +471,15 @@ def gen_profiler(
         func_key = "{target}.{op}.filter".format(
             target=target.name(), op=self._attrs["op"]
         )
+
+        # Update epilogue alignment here because it may be different depending on the profiling strategy.
+        # Note that this alignment is only used in profiling and will be updated
+        # during the final codegen.
+        # gemm_permute ops have special output alignment rules, skip here.
+        if "layout" not in self._attrs:
+            output_shape = self._attrs["output_accessors"][0].original_shapes
+            self._extract_epilogue_alignment(output_shape, dynamic_profiling_strategy)
+
         filter_func = registry.get(func_key)
         # run compile-time filter
         new_op_instance = OrderedDict(
@@ -463,10 +504,19 @@ def gen_profiler(
                 target=target.name(), op=self._attrs["op"]
             )
             func = registry.get(func_key)
-            func(self._attrs, workdir, self._extract_dims(for_profiling=True))
+            profiler_filename = self._get_profiler_filename()
+            logger.info(__name__, f"generating {profiler_filename=}")
+            return func(
+                self._attrs,
+                workdir,
+                profiler_filename,
+                self._extract_dims(for_profiling=True),
+            )
 
-    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key, fbuild_cmd):
-        exe_path = os.path.join(profiler_prefix, cfg)
+    def _gen_profile_cmd(
+        self, profiler_prefix, profiler_filename, exec_key, fbuild_cmd
+    ):
+        exe_path = os.path.join(profiler_prefix, profiler_filename)
         if not os.access(exe_path, os.X_OK):
             raise RuntimeError("Profiler %s is not executable" % exe_path)
         cmd_args = fbuild_cmd(exec_key)
@@ -506,7 +556,7 @@ def _get_ab_alignment(self, exec_key):
                 self._attrs["f_ab_alignment"](int(m), int(n), int(k))
                 for m, n, k in zip(all_m, all_n, all_k)
             ]
-            ab_alignment = sorted(all_ab_alignments)[0]
+            ab_alignment = min(all_ab_alignments)
         else:
             # exec_key may contain batch dimension, which we don't care here
             m, n, k = gemm_inverse_key_func(exec_key)[-3:]
@@ -518,7 +568,12 @@ def _get_ab_alignment(self, exec_key):
                 )
         return ab_alignment
 
-    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+    def _profile_single_workload(self, profiler_prefix, exec_key, profiler_runner):
+        """
+        Schedule profilers for given profiler path and gemm shape (exec_key)
+        or get the result from cache
+        or use dummy result in CI
+        """
         target = backend.target.Target.current()
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
         tmp_op = self._attrs["op_instance"][tmp_key]
@@ -553,7 +608,10 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 f'Load profiling result for {self._attrs["name"]} '
                 f"from cache: {cache_value}",
             )
-            return cache_value
+            self._attrs["exec_path"][exec_key].algo = cache_value[0]
+            self._attrs["workspace"] = max(self._attrs["workspace"], cache_value[1])
+            self._attrs["split_k"] = cache_value[2]
+            return
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
             raise Exception(
@@ -562,87 +620,47 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 f"{op_type} {exec_entry_sha1}.\n",
                 "To bypass, you need to make it available in the db table.",
             )
-        # do real profile
-        content = list(self._attrs["op_instance"].keys())
-        runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
+        profiler_filename = self._get_profiler_filename()
+
+        def _gen_callback(split_k):
+            def process_result_callback(result, postprocessing_delegate):
+                postprocessing_delegate.add_instance(
+                    (result, self._attrs, profiler_filename, exec_key, split_k)
+                )
+
+            return process_result_callback
+
+        command = self._gen_profile_cmd(profiler_prefix, profiler_filename, exec_key)
 
-        results = []
         if self._attrs["op"].startswith("group_gemm") or self._attrs["op"].startswith(
             "bmm"
         ):
-            for cfg in content:
-                command = self._gen_profile_cmd(profiler_prefix, cfg, exec_key)
-                runner.push(cfg, command)
-            runner.join()
-            result = runner.pull()
-            results += [item + (1,) for item in result]
+            profiler_runner.push(command, _gen_callback(split_k=1))
         else:
             m, n, k = gemm_inverse_key_func(exec_key)[-3:]
-            for split_k in self._split_k_search_space(m, n, k):
-                for cfg in content:
-                    command = self._gen_profile_cmd(profiler_prefix, cfg, exec_key)
-                    command.append(str(split_k))
-                    logger.debug(__name__, "profiling cmd: {}".format(command))
-                    runner.push(cfg, command)
-                runner.join()
-                result = runner.pull()
-                results += [item + (split_k,) for item in result]
-
-        out = sorted(results, key=lambda x: x[1].duration)
-        if len(out) == 0:
-            raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
-            )
-        best_algo = out[0][0]
-        workspace = out[0][1].workspace
-        split_k = out[0][2]
-        # cache
-        cache_record = GemmRecordEntry(
-            exec_entry=exec_key,
-            exec_entry_sha1=exec_entry_sha1,
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.accumulator_type().value,
-            major_a=tmp_op.A.layout.value,
-            major_b=tmp_op.B.layout.value,
-            major_c=tmp_op.C.layout.value,
-            op_type=self._attrs["op"],
-            epilogue=tmp_op.epilogue_functor.value,
-            device=target._arch,
-            algo=best_algo,
-            workspace=workspace,
-            split_k=split_k,
-            pshape=self._attrs["permute_shape"],
-        )
-        Target.current().insert_profile_cache("gemm", cache_record.__dict__)
-        logger.info(__name__, f"Selected kernel: {best_algo}, {workspace}, {split_k}")
-        return (best_algo, workspace, split_k)
+            if "split_k_hints" in self._attrs:
+                split_k_search_space = self._attrs["split_k_hints"]
+            else:
+                split_k_search_space = self._split_k_search_space(m, n, k)
+            for split_k in split_k_search_space:
+                gemm_command = command + [str(split_k)]
+                profiler_runner.push(gemm_command, _gen_callback(split_k))
 
     def profile(
         self,
+        profiler_runner,
         workdir="./",
-        devices=None,
-        dynamic_profiling_strategy=None,
     ):
         """Selects the fastest kernel configurations.
 
         Parameters
         ----------
-        workdir : str, optional
-            Base dir to keep profiling source codes, by default "./"
-        devices: list, optional
-            Devices used for profiling, by default device 0 will be used.
-        dynamic_profiling_strategy: DynamicProfileStrategy, optional
-            Unused (profiles are generated at compile time), by default None.
-            Call site in :func:`~aitemplate.compiler.transform.profile.profile`
-            uses this parameter for each op, so it cannot be removed until all
-            profile generation is done at compile time.
+        profiler_runner: ProfilerRunner
+            Profiler runner to schedule async profiler jobs,
+        workdir : str
+            Base dir to keep profiling source codes, by default "./"running on separate GPU devices concurrently
         """
 
-        if devices is None:
-            devices = [0]
-
         workloads = list(self._attrs["exec_path"].keys())
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
         if "op_instance" not in self._attrs:
@@ -672,16 +690,7 @@ def profile(
                 # we have cached best algo
                 return
             else:
-                best_algo, workspace, split_k = self._profile_single_workload(
-                    profiler_prefix, wkl, devices
-                )
-                self._attrs["exec_path"][wkl].algo = best_algo
-                self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
-                self._attrs["split_k"] = split_k
-                logger.debug(
-                    __name__,
-                    "Profile best split-k: {}".format(split_k),
-                )
+                self._profile_single_workload(profiler_prefix, wkl, profiler_runner)
 
     def gen_function(self) -> str:
         """Generates the function code for the gemm op for the current target.
@@ -760,3 +769,97 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
+
+
+def _profiler_results_groupby_key(instance):
+    return (
+        instance[1]["name"],  # unique op name
+        instance[2],  # profiler executable
+        instance[3],  # profiler key (gemm shape)
+    )
+
+
+def _profiler_group_reduce_min_key(group):
+    return group[0][1]  # elapsed runtime
+
+
+class GemmProfilerPostprocessingDelegate:
+    """
+    Object which collects profiler results after profiler executables complete,
+    updates profiler results cache and the gemm nodes' attrs after all profilers complete.
+    """
+
+    def __init__(self):
+        """
+        Initialize storage for profiler results
+        Instance=(
+            ProfileResult=(best_algo, elapsed_runtime, workspace),
+            func_attrs,
+            profiler_filename,
+            exec_key,
+            split_k,
+        )
+        """
+        self._instances = []
+
+    def add_instance(self, instance: ProfileResult):
+        """
+        As a profiler executable completes, collect the result
+        """
+        self._instances.append(instance)
+
+    def postprocess_results(self):
+        """
+        When all profiler executables complete, find the best instance
+        (min runtime per op name, profiler executable and exec_key (i.e. gemm shape mnk)
+        across multiple split_k values)
+        The best instance is cached, and written into corresponding gemm nodes in the graph
+        """
+        target = backend.target.Target.current()
+        for _, group in itertools.groupby(
+            self._instances,
+            key=_profiler_results_groupby_key,
+        ):
+            min_runtime_results = min(group, key=_profiler_group_reduce_min_key)
+            (
+                (best_algo, runtime, workspace),
+                func_attrs,
+                profiler_filename,
+                exec_key,
+                split_k,
+            ) = min_runtime_results
+
+            func_attrs["exec_path"][exec_key].algo = best_algo
+            func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
+            func_attrs["split_k"] = split_k
+
+            logger.info(
+                __name__,
+                f"Profiler ({profiler_filename} {exec_key}) selected kernel: "
+                f"{best_algo=} {workspace=} {split_k=}",
+            )
+
+            tmp_op = next(iter(func_attrs["op_instance"].values()))
+            exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
+            cache_record = GemmRecordEntry(
+                exec_entry=exec_key,
+                exec_entry_sha1=exec_entry_sha1,
+                dtype_a=tmp_op.A.element.value,
+                dtype_b=tmp_op.B.element.value,
+                dtype_c=tmp_op.C.element.value,
+                dtype_acc=tmp_op.accumulator_type().value,
+                major_a=tmp_op.A.layout.value,
+                major_b=tmp_op.B.layout.value,
+                major_c=tmp_op.C.layout.value,
+                op_type=func_attrs["op"],
+                epilogue=tmp_op.epilogue_functor.value,
+                device=target._arch,
+                algo=best_algo,
+                workspace=workspace,
+                split_k=split_k,
+                pshape=func_attrs["permute_shape"],
+            )
+            try:
+                target.insert_profile_cache("gemm", cache_record.__dict__)
+            except Exception as e:
+                logger.warning(__name__, e)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
index 6532d5aba..42023d1dc 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
@@ -29,7 +29,6 @@ class gemm_rcr(common.gemm):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
index 73e6d3659..4372e19f4 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
@@ -29,7 +29,6 @@ class gemm_rcr_bias(gemm_rcr):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
index 05d8c23b7..7c72ac636 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_fast_gelu(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
index 730df8f0a..b8c7a33ce 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_gelu(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
index 1063e36f5..4ee004262 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_hardswish(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
index 711eb9d75..97b199b66 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
@@ -52,7 +52,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
-        self._extract_epilogue_alignment(output_shape)
+
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -66,4 +66,11 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
             output_shape = [t2, m.value() // t1 // t2, t3, t1, n.value() // t3]
         else:
             output_shape = [t2, m.value() // t1, t3, t1, n.value() // t3 // t2]
+        self._extract_epilogue_alignment(output_shape)
         return reshape()(output, output_shape)
+
+    def _get_op_attributes(self):
+        return {
+            "layout": self._attrs["layout"].split("_")[-1],
+            "shape": self._attrs["shape"],
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
index c22822a97..99318ff49 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_relu(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
index cae111a19..b65c6f0a6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_sigmoid(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
index 69c388ddd..ffb285ef8 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_swish(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
index aaec1c507..53b35e879 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
@@ -27,7 +27,6 @@ class gemm_rcr_bias_tanh(gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(N, K).cuda().half()
         Bias = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
new file mode 100644
index 000000000..1ffed29a4
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
@@ -0,0 +1,41 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization: FastGELU(GEMM_RCR(A, B))
+"""
+from . import gemm_rcr
+
+# pylint: disable=C0103,W0223,W0221
+
+
+class gemm_rcr_fast_gelu(gemm_rcr):
+    """GEMM Specialization: FastGELU(GEMM_RCR(A, B))
+
+    This operator is equivalent to the following pytorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+        A = torch.randn(M, K).cuda().half()
+        B = torch.randn(N, K).cuda().half()
+
+        linear = torch.nn.functional.linear(A, B)
+        y = torch.nn.GELU(linear)
+    """
+
+    def __init__(self):
+        """Constructor for gemm_rcr_fast_gelu"""
+        super().__init__()
+        self._attrs["op"] = "gemm_rcr_fast_gelu"
+        self._attrs["epilogue"] = "LinearCombinationFastGELU"
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
index 549cf11fb..75560e1ba 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
@@ -23,7 +23,7 @@
 
 from aitemplate.testing import detect_target
 
-from ...base import Tensor
+from ...base import IntImm, IntVar, Tensor
 from ...tensor_accessor import TensorAccessor
 from ..common import reshape
 
@@ -35,7 +35,7 @@
 class gemm_rcr_permute(gemm_rcr):
     def __init__(self, shape: Tuple[int], layout="20314"):
         super().__init__()
-        if layout == "20314":
+        if layout == "20314" or layout == "0213":
             self._attrs["op"] = "gemm_rcr_permute"
         elif layout == "m2n3":
             self._attrs["op"] = "gemm_rcr_permute_m2n3"
@@ -43,7 +43,10 @@ def __init__(self, shape: Tuple[int], layout="20314"):
             raise NotImplementedError("{} is not implemented!".format(layout))
 
         self._attrs["shape"] = shape
-        self._attrs["layout"] = "Permute5D_{}".format(layout)
+        if layout == "0213":
+            self._attrs["layout"] = "Permute4D_{}".format(layout)
+        else:
+            self._attrs["layout"] = "Permute5D_{}".format(layout)
         self._attrs["permute_shape"] = "_".join(map(str, shape))
 
     def __call__(self, a: Tensor, b: Tensor) -> Tensor:
@@ -53,7 +56,6 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        self._extract_epilogue_alignment(output_shape)
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -65,8 +67,35 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
             m, n = output_shape
             t1, t2, t3 = self._attrs["shape"]
             output_shape = [t2, m.value() // t1, t3, t1, n.value() // t2 // t3]
+            # output alignment needs to be calculated based on the reshaped last dim
+            self._extract_epilogue_alignment(output_shape)
+            return reshape()(output, output_shape)
+        elif (
+            self._attrs["layout"] == "Permute4D_0213"
+            and detect_target().name() == "cuda"
+        ):
+            m, n = output_shape
+            t1, t2 = self._attrs["shape"]
+            if not isinstance(m, IntImm):
+                vals = []
+                for val in m._attrs["values"]:
+                    assert val % t1 == 0
+                    vals.append(val // t1)
+                dim0 = IntVar(vals)
+            else:
+                assert m.value() % t1 == 0
+                dim0 = m.value() // t1
+            output_shape = [dim0, t2, t1, n.value() // t2]
+            # output alignment needs to be calculated based on the reshaped last dim
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
+
+    def _get_op_attributes(self):
+        return {
+            "layout": self._attrs["layout"].split("_")[-1],
+            "shape": self._attrs["shape"],
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
index d8a2f1a99..cee26e810 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
@@ -32,7 +32,6 @@ class gemm_rrr(common.gemm):
 
     .. highlight:: python
     .. code-block:: python
-
         A = torch.randn(M, K).cuda().half()
         B = torch.randn(K, N).cuda().half()
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
index fddc0bf00..b1b75ee6d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
@@ -46,7 +46,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
-        self._extract_epilogue_alignment(output_shape)
+
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -60,6 +60,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
                 output_shape = [t2, m.value() // t1 // t2, t3, t1, n.value() // t3]
             else:
                 output_shape = [t2, m.value() // t1, t3, t1, n.value() // t3 // t2]
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
index 48a64fbd8..24b9bc276 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
@@ -45,7 +45,6 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        self._extract_epilogue_alignment(output_shape)
 
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
@@ -55,8 +54,15 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
             m, n = output_shape
             t1, t2, t3 = self._attrs["shape"]
             output_shape = [t2, m.value() // t1, t3, t1, n.value() // t2 // t3]
+            self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
+
+    def _get_op_attributes(self):
+        return {
+            "layout": self._attrs["layout"].split("_")[-1],
+            "shape": self._attrs["shape"],
+        }
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
index 50231d2ec..6777ab228 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
@@ -21,6 +21,8 @@
 
 import jinja2
 
+from aitemplate.compiler.stable_set import StableSet
+
 from ....backend import registry
 from ....backend.target import Target
 from ....utils import logger
@@ -77,7 +79,6 @@ class group_gemm_rcr(common.gemm):
 
     .. highlight:: python
     .. code-block:: python
-
         # group 1
         A1 = torch.randn(M1, K1).cuda().half()
         B1 = torch.randn(N1, K1).cuda().half()
@@ -153,7 +154,7 @@ def _concat_strided_outputs(self, outputs, output_stride_dim):
         """a temporary function to concatenate strided outputs"""
         cat_op = concatenate()
         cat_output = cat_op(outputs, dim=output_stride_dim)
-        cat_output._attrs["src_ops"] = [self]
+        cat_output._attrs["src_ops"] = StableSet([self])
         offset = 0
         for idx, output_tensor in enumerate(outputs):
             self._attrs["output_accessors"][idx].update_base_tensor(
@@ -197,7 +198,7 @@ def __call__(self, operand_groups: List[List[Tensor]], output_stride_dim=None):
         for a, b in operand_groups:
             op = gemm_rcr()
             c = op(a, b)
-            c._attrs["src_ops"] = [self]
+            c._attrs["src_ops"] = StableSet([self])
             a._attrs["dst_ops"].remove(op)
             b._attrs["dst_ops"].remove(op)
             epilogue_alignment = min(
@@ -297,7 +298,11 @@ def gen_profiler(
                 target=target.name(), op=self._attrs["op"]
             )
             func = registry.get(func_key)
-            func(self._attrs, workdir, self.shape_eval_template)
+            profiler_filename = self._get_profiler_filename()
+            logger.info(__name__, f"generating {profiler_filename=}")
+            return func(
+                self._attrs, workdir, profiler_filename, self.shape_eval_template
+            )
 
     def gen_function(self) -> str:
         """Generate function for the op
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
index 906234345..ac348062f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
@@ -20,6 +20,8 @@
 
 import jinja2
 
+from aitemplate.compiler.stable_set import StableSet
+
 from ...base import ExecItem, Tensor
 from ...tensor_accessor import TensorAccessor
 from .gemm_rcr_bias import gemm_rcr_bias
@@ -46,7 +48,6 @@ class group_gemm_rcr_bias(group_gemm_rcr):
 
     .. highlight:: python
     .. code-block:: python
-
         # group 1
         A1 = torch.randn(M1, K1).cuda().half()
         B1 = torch.randn(N1, K1).cuda().half()
@@ -122,7 +123,7 @@ def __call__(self, operand_groups: List[List[Tensor]], output_stride_dim=None):
         for a, b, bias in operand_groups:
             op = gemm_rcr_bias()
             c = op(a, b, bias)
-            c._attrs["src_ops"] = [self]
+            c._attrs["src_ops"] = StableSet([self])
             a._attrs["dst_ops"].remove(op)
             b._attrs["dst_ops"].remove(op)
             bias._attrs["dst_ops"].remove(op)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
index 1e095bdcf..3094eb71f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -27,7 +27,6 @@ class group_gemm_rcr_bias_relu(group_gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         # group 1
         A1 = torch.randn(M1, K1).cuda().half()
         B1 = torch.randn(N1, K1).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index eccedaf52..8601144a3 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -27,7 +27,6 @@ class group_gemm_rcr_bias_sigmoid(group_gemm_rcr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         # group 1
         A1 = torch.randn(M1, K1).cuda().half()
         B1 = torch.randn(N1, K1).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
index c3ee84199..0f8b3d7d4 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
@@ -31,7 +31,6 @@ class perm021fc_ccr(bmm):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(B, K, M).cuda().half()
         W_pt = torch.randn(N, K).cuda().half()
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
index e5af55a95..378f8f33b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
@@ -31,7 +31,6 @@ class perm021fc_ccr_bias(perm021fc_ccr):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(B, K, M).cuda().half()
         W_pt = torch.randn(N, K).cuda().half()
         Bias_pt = torch.randn(N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
index aca47f57e..e48701330 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -34,7 +34,6 @@ class perm021fc_ccr_bias_permute(perm021fc_ccr_bias):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(B, K, M).cuda().half()
         W_pt = torch.randn(N, K).cuda().half()
         Bias_pt = torch.randn(N).cuda().half()
@@ -75,3 +74,6 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
+
+    def _get_op_attributes(self):
+        return {"layout": self._attrs["layout"].split("_")[-1]}
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
index e4293290c..abd6d06a6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
@@ -30,7 +30,6 @@ class perm102_bmm_rcr(bmm):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(M, B, K).cuda().half()
         W_pt = torch.randn(B, N, K).cuda().half()
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
index 8e134d9c1..326291002 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
@@ -31,7 +31,6 @@ class perm102_bmm_rcr_bias(perm102_bmm_rcr):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(M, B, K).cuda().half()
         W_pt = torch.randn(B, N, K).cuda().half()
         B_pt = torch.randn(B, N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
index a11830f40..2d57a75cb 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
@@ -30,7 +30,6 @@ class perm102_bmm_rrr(bmm):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(M, B, K).cuda().half()
         W_pt = torch.randn(B, K, N).cuda().half()
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
index 337894b3b..59d5fd4de 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
@@ -30,7 +30,6 @@ class perm102_bmm_rrr_bias(perm102_bmm_rrr):
 
     .. highlight:: python
     .. code-block:: python
-
         X_pt = torch.randn(M, B, K).cuda().half()
         W_pt = torch.randn(B, K, N).cuda().half()
         B_pt = torch.randn(B, N).cuda().half()
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index acac2e14e..d3114b545 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -19,6 +19,7 @@
 import re
 from collections import OrderedDict
 from hashlib import sha1
+from operator import itemgetter
 from typing import Any, List, Union
 
 import jinja2
@@ -57,6 +58,7 @@ def __init__(self, num_groups: int, num_channels: int) -> None:
         if detect_target().name() == "rocm":
             self._attrs["has_profiler"] = True
         self._attrs["num_channels"] = num_channels
+        self._attrs["workspace"] = 0
 
     @staticmethod
     def check_shapes(x_shapes, gamma_shapes, beta_shapes, num_groups):
@@ -128,8 +130,10 @@ def __call__(
         self._sanity_check(x, gamma, beta)
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        self._extract_exec_path()
         output = Tensor(output_shape, src_ops={self})
+
+        batch_size = output_shape[0]._attrs["values"][-1]
+        self._attrs["workspace"] = 8 * batch_size * self._attrs["num_groups"]
         self._attrs["outputs"] = [output]
         return output
 
@@ -250,13 +254,14 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        best_algo = out[0][0]
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        best_algo = out[0]
+        workspace = out[1].workspace
         ## cache
         cache_record = NormRecordEntry(
             exec_entry=exec_key,
@@ -402,3 +407,9 @@ def _extract_exec_path(self, dynamic_profiling_strategy=DynamicProfileStrategy.M
 
     def _inputs_for_pseudo_code(self):
         return self._attrs["inputs"] + [f"num_groups={self._attrs['num_groups']}"]
+
+    def _get_op_attributes(self):
+        return {
+            "num_groups": self._attrs["num_groups"],
+            "num_channels": self._attrs["num_channels"],
+        }
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
index 8458526f7..9092e8ea9 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
@@ -28,7 +28,6 @@
 class group_layernorm(layernorm):
     """group_layernorm.
     For each group, we expect each input to have shapes:
-
         Input shape: [M0, M1, ..., Mp, N1, N2, ..., ND]
         Normalized_shape: [N1, N2, ..., ND]
         Gamma/Beta, if not None, have the same shape as normalized_shape.
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
index 764880244..ed13b6760 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
@@ -26,7 +26,6 @@
 class group_layernorm_sigmoid_mul(group_layernorm):
     """group_layernorm_sigmoid_mul.
     For each group, we expect each input to have shapes:
-
         Input shape: [M0, M1, ..., Mp, N1, N2, ..., ND]
         Normalized_shape: [N1, N2, ..., ND]
         Gamma/Beta, if not None, have the same shape as normalized_shape.
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm.py b/python/aitemplate/compiler/ops/layernorm/layernorm.py
index e0c373f90..d9f8f6364 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm.py
@@ -19,6 +19,7 @@
 import re
 from collections import OrderedDict
 from hashlib import sha1
+from operator import itemgetter
 from typing import Any, List, Union
 
 import jinja2
@@ -48,7 +49,6 @@
 
 class layernorm(Operator):
     """Standalone layernorm op.
-
     Applies Layer Normalization over a mini-batch of inputs as described in the
     paper Layer Normalization. The mean and standard-deviation are calculated
     over the last D dimensions, where D is the dimension of normalized_shape.
@@ -164,7 +164,6 @@ def __call__(
         self._sanity_check(x, gamma, beta)
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        self._extract_exec_path()
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
@@ -217,7 +216,7 @@ def _gen_exec_key(self, name_value_mapping):
                 key_strs.append(f"{name} >= {values[0]} && {name} <= {values[-1]}")
             else:
                 raise RuntimeError(
-                    "Layernorm input has empty dim values: {}".format(values)
+                    "Softmax input has empty dim values: {}".format(values)
                 )
         return " && ".join(key_strs)
 
@@ -230,10 +229,9 @@ def _extract_exec_path(self, dynamic_profiling_strategy=DynamicProfileStrategy.M
             A dynamic profiling strategy. By default MAX is used, i.e. to profile
             a dynamic range, an upper bound will be used.
         """
-        if self._attrs["has_profiler"]:
-            assert (
-                len(self._attrs["normalized_shape"]) == 1
-            ), "For profiling, normalized_shape must be 1D"
+        assert (
+            len(self._attrs["normalized_shape"]) == 1
+        ), "For profiling, normalized_shape must be 1D"
 
         m_max = 1
         m_min = 1
@@ -331,13 +329,14 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        best_algo = out[0][0]
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        best_algo = out[0]
+        workspace = out[1].workspace
         ## cache
         cache_record = NormRecordEntry(
             exec_entry=exec_key,
@@ -405,6 +404,16 @@ def gen_profiler(
         workdir: str = None,
         dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
     ) -> None:
+        """Generator profiler. The profiler files are standalone executable for profiling.
+
+        Parameters
+        ----------
+        workdir : str, optional
+            Base dir to keep profiling source codes, by default "./"
+        dynamic_profiling_strategy: DynamicProfileStrategy, optional
+            A dynamic profiling strategy, used to filter generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+        """
         target = Target.current()
         # init candidate ops
         func_key = "{target}.{op}.config".format(
@@ -416,4 +425,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir)
+        return func(self._attrs, workdir)
+
+    def _get_op_attributes(self):
+        return {"normalized_shape": self._attrs["default_normalized_shape"]}
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
index 47ace2262..e34a2a019 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
@@ -15,6 +15,8 @@
 """
 Operator definition for layernorm_sigmoid_mul.
 """
+from aitemplate.compiler.stable_set import StableSet
+
 from .... import backend
 from ....backend import registry
 from ...base import Operator
@@ -72,7 +74,7 @@ def _update_inputs_outputs(self, layer_norm, sigmoid, mul):
 
         assert len(self._attrs["outputs"]) == 1
         output_tensor = self._attrs["outputs"][0]
-        output_tensor._attrs["src_ops"] = {self}
+        output_tensor._attrs["src_ops"] = StableSet([self])
 
         # update output tensor shape
         # hack for fixing dynamic shape with elementwise fusion issue
@@ -80,14 +82,19 @@ def _update_inputs_outputs(self, layer_norm, sigmoid, mul):
         for i, shape_var in enumerate(output_tensor._attrs["shape"]):
             shape_var._attrs["values"] = x._attrs["shape"][i]._attrs["values"]
 
-        sigmoid._attrs["inputs"][0]._attrs["src_ops"] = set()
-        sigmoid._attrs["inputs"][0]._attrs["dst_ops"] = set()
-        sigmoid._attrs["outputs"][0]._attrs["src_ops"] = set()
-        sigmoid._attrs["outputs"][0]._attrs["dst_ops"] = set()
+        sigmoid._attrs["inputs"][0]._attrs["src_ops"] = StableSet()
+        sigmoid._attrs["inputs"][0]._attrs["dst_ops"] = StableSet()
+        sigmoid._attrs["outputs"][0]._attrs["src_ops"] = StableSet()
+        sigmoid._attrs["outputs"][0]._attrs["dst_ops"] = StableSet()
 
     def __call__(self):
         return self._attrs["outputs"][0]
 
+    def _get_op_attributes(self):
+        raise NotImplementedError(
+            "layernorm_sigmoid_mul get op attribute not implemented"
+        )
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
diff --git a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
index b88967a6f..b673a774e 100644
--- a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
+++ b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
@@ -94,6 +94,12 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "padded_channels": self._attrs["op"].split("to")[-1],
+            "shape_func_template": self.shape_eval_template,
+        }
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         template_path = target.template_path()
diff --git a/python/aitemplate/compiler/ops/padding/pad_last_dim.py b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
index 29fb49c24..b25267f4e 100644
--- a/python/aitemplate/compiler/ops/padding/pad_last_dim.py
+++ b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
@@ -78,6 +78,9 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {"ndim": self._attrs["ndim"], "out_dim": self._attrs["out_dim"]}
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         template_path = target.template_path()
diff --git a/python/aitemplate/compiler/ops/pool/pool2d.py b/python/aitemplate/compiler/ops/pool/pool2d.py
index 883abfd40..5fd2bfd78 100644
--- a/python/aitemplate/compiler/ops/pool/pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/pool2d.py
@@ -166,6 +166,16 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        target_attrs = ["stride", "pad", "kernel_size", "reduce_func"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
diff --git a/python/aitemplate/compiler/ops/reduce/reduce_common.py b/python/aitemplate/compiler/ops/reduce/reduce_common.py
index d8036168b..71dbec41c 100644
--- a/python/aitemplate/compiler/ops/reduce/reduce_common.py
+++ b/python/aitemplate/compiler/ops/reduce/reduce_common.py
@@ -15,13 +15,16 @@
 """
 Base operator definition for reduce-family ops.
 """
+import itertools
+
 from typing import List
 
 from .... import backend
 from ....backend import registry
 from ....utils import logger, shape_utils
 from ....utils.tensor_utils import wrap_dim
-from ...base import get_dtype_size, IntImm, IntVar, Operator, Tensor
+from ...base import IntImm, IntVar, Operator, Tensor
+from ...dtype import get_dtype_size
 from ...tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0221
@@ -155,46 +158,60 @@ def _compute_workspace_size(
             if last_dim % vec_len == 0:
                 vector_length = vec_len
                 break
-        extent_affine = []
         num_dims = 4
-        # Use dim's upper-bound for computing workspace size
-        shape_dims = [d.upper_bound() for d in shape]
-        rank = len(shape_dims)
+        rank = len(shape)
         assert (
             rank <= num_dims
         ), f"expected rank <= num_dims, but got {rank=}, {num_dims=}"
         # adjust reduction axis
         reduction_axis = num_dims - rank + reduction_axis
-        prefix_dims = [1] * (num_dims - rank)
-        shape_dims = prefix_dims + shape_dims
-        # normalize extent_affine list
-        # reference: TensorReduction construction in cutlass
-        # reduction/device/tensor_reduce.h
-        if reduction_axis == 0:
-            extent_affine.append(shape_dims[1])
-            extent_affine.append(shape_dims[2])
-            extent_affine.append(shape_dims[0])
-            extent_affine.append(shape_dims[3])
-        elif reduction_axis == 1:
-            extent_affine.append(shape_dims[0])
-            extent_affine.append(shape_dims[2])
-            extent_affine.append(shape_dims[1])
-            extent_affine.append(shape_dims[3])
-        elif reduction_axis == 2:
-            extent_affine.append(shape_dims[0])
-            extent_affine.append(shape_dims[1])
-            extent_affine.append(shape_dims[2])
-            extent_affine.append(shape_dims[3])
-        else:
-            # note that we already ruled out non-col-reduction kernels so that
-            # reduction_axis would never be 3. Consequently, we would never
-            # invoke contiguous tensor_reduce kernels.
-            raise RuntimeError(
-                f"Expected reduction_axis to be within [0, 2], but got {reduction_axis=}"
+        all_shape_dims = [
+            list(range(d.lower_bound(), d.upper_bound() + 1)) for d in shape
+        ]
+        max_ws = 0
+        # Go through cartesian product of all possible dynamic dim values
+        # to find the maximum workspace size. It might be a bit heavy
+        # for some cases. However, it's OK for our current use cases where
+        # we have a single dynamic axis within a range of several thousand.
+        # Moreover, we would remove this entire estimation once we have
+        # our own row-reduction kernel.
+        for one_dims in itertools.product(*all_shape_dims):
+            extent_affine = []
+            prefix_dims = [1] * (num_dims - rank)
+            # Use dim's upper-bound for computing workspace size
+            shape_dims = prefix_dims + list(one_dims)
+            # normalize extent_affine list
+            # reference: TensorReduction construction in cutlass
+            # reduction/device/tensor_reduce.h
+            if reduction_axis == 0:
+                extent_affine.append(shape_dims[1])
+                extent_affine.append(shape_dims[2])
+                extent_affine.append(shape_dims[0])
+                extent_affine.append(shape_dims[3])
+            elif reduction_axis == 1:
+                extent_affine.append(shape_dims[0])
+                extent_affine.append(shape_dims[2])
+                extent_affine.append(shape_dims[1])
+                extent_affine.append(shape_dims[3])
+            elif reduction_axis == 2:
+                extent_affine.append(shape_dims[0])
+                extent_affine.append(shape_dims[1])
+                extent_affine.append(shape_dims[2])
+                extent_affine.append(shape_dims[3])
+            else:
+                # note that we already ruled out non-col-reduction kernels so that
+                # reduction_axis would never be 3. Consequently, we would never
+                # invoke contiguous tensor_reduce kernels.
+                raise RuntimeError(
+                    f"Expected reduction_axis to be within [0, 2], but got {reduction_axis=}"
+                )
+            max_ws = max(
+                max_ws,
+                self._compute_ws_size_strided(
+                    extent_affine, reduction_axis, vector_length, dtype
+                ),
             )
-        return self._compute_ws_size_strided(
-            extent_affine, reduction_axis, vector_length, dtype
-        )
+        return max_ws
 
     def __call__(self, x: Tensor) -> Tensor:
         self._attrs["inputs"] = [x]
@@ -240,6 +257,13 @@ def __call__(self, x: Tensor) -> Tensor:
             self._attrs["workspace"] = ws_size
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "dim": self._attrs["reduction_axes"],
+            "dtype": self._attrs["output_type"],
+            "keepdim": self._attrs["keepdim"],
+        }
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
diff --git a/python/aitemplate/compiler/ops/reduce/var.py b/python/aitemplate/compiler/ops/reduce/var.py
index 2f2fec56e..136783ee1 100644
--- a/python/aitemplate/compiler/ops/reduce/var.py
+++ b/python/aitemplate/compiler/ops/reduce/var.py
@@ -50,3 +50,11 @@ def __init__(self, dim, unbiased, keepdim=False, dtype=None) -> None:
         super().__init__(dim, keepdim, dtype)
         self._attrs["op"] = "var"
         self._attrs["unbiased"] = unbiased
+
+    def _get_op_attributes(self):
+        return {
+            "dim": self._attrs["reduction_axes"],
+            "dtype": self._attrs["output_type"],
+            "keepdim": self._attrs["keepdim"],
+            "unbiased": self._attrs["unbiased"],
+        }
diff --git a/python/aitemplate/compiler/ops/reduce/vector_norm.py b/python/aitemplate/compiler/ops/reduce/vector_norm.py
index fbacf8f46..c4d445195 100644
--- a/python/aitemplate/compiler/ops/reduce/vector_norm.py
+++ b/python/aitemplate/compiler/ops/reduce/vector_norm.py
@@ -57,3 +57,11 @@ def __init__(self, ord_kind=2, dim=None, keepdim=False, dtype=None) -> None:
         super().__init__(dim, keepdim, dtype)
         self._attrs["op"] = "vector_norm"
         self._attrs["ord_kind"] = str(ord_kind)
+
+    def _get_op_attributes(self):
+        return {
+            "dim": self._attrs["reduction_axes"],
+            "dtype": self._attrs["output_type"],
+            "keepdim": self._attrs["keepdim"],
+            "ord_kind": self._attrs["ord_kind"],
+        }
diff --git a/python/aitemplate/compiler/ops/softmax/softmax.py b/python/aitemplate/compiler/ops/softmax/softmax.py
index cb3f0d2b9..d077e1325 100644
--- a/python/aitemplate/compiler/ops/softmax/softmax.py
+++ b/python/aitemplate/compiler/ops/softmax/softmax.py
@@ -19,6 +19,7 @@
 import re
 from collections import OrderedDict
 from hashlib import sha1
+from operator import itemgetter
 from typing import Dict, List, Union
 
 import jinja2
@@ -202,7 +203,6 @@ def __call__(self, x: Tensor, dim: int = None) -> Tensor:
         self._attrs["dim"] = dim
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        self._extract_exec_path()
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         return output
@@ -271,13 +271,14 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        best_algo = out[0][0]
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        best_algo = out[0]
+        workspace = out[1].workspace
         ## cache
         cache_record = NormRecordEntry(
             exec_entry=exec_key,
@@ -345,6 +346,16 @@ def gen_profiler(
         workdir: str = None,
         dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
     ) -> None:
+        """Generator profiler. The profiler files are standalone executable for profiling.
+
+        Parameters
+        ----------
+        workdir : str, optional
+            Base dir to keep profiling source codes, by default "./"
+        dynamic_profiling_strategy: DynamicProfileStrategy, optional
+            A dynamic profiling strategy, used to filter generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+        """
         target = Target.current()
         # init candidate ops
         func_key = "{target}.{op}.config".format(
@@ -359,6 +370,13 @@ def gen_profiler(
         func(self._attrs, workdir)
 
     def gen_function(self) -> str:
+        """Generate function body.
+
+        Returns
+        -------
+        str
+            The rendered template of generated function body.
+        """
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
             target=target.name(), op=self._attrs["op"]
diff --git a/python/aitemplate/compiler/ops/tensor/argmax.py b/python/aitemplate/compiler/ops/tensor/argmax.py
index d7fad570f..b7f3ad348 100644
--- a/python/aitemplate/compiler/ops/tensor/argmax.py
+++ b/python/aitemplate/compiler/ops/tensor/argmax.py
@@ -19,6 +19,7 @@
 import os
 import re
 from collections import OrderedDict
+from operator import itemgetter
 from typing import List
 
 import jinja2
@@ -121,7 +122,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir)
+        return func(self._attrs, workdir)
 
     def _gen_exec_key(self, shape: List[int]):
         """rending the shape info"""
@@ -164,12 +165,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        workspace = out[1].workspace
         return workspace
 
     def profile(
@@ -179,7 +181,6 @@ def profile(
         dynamic_profiling_strategy=None,
     ):
         """Get the Argmax Op workspace
-
         Parameters
         ----------
         workdir : str, optional
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index 534149310..a7669d969 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -237,11 +237,6 @@ def remove_input_at(self, indices: Union[int, Sequence[int]]) -> None:
                     raise RuntimeError(
                         f'Expected input_masks at {idx} to be True for {self._attrs["name"]}'
                     )
-                if curr_input_accessors[curr_idx].stride_dim is not None:
-                    raise RuntimeError(
-                        f"Cannot remove an input (idx: {curr_idx}) with a valid "
-                        f'TensorAccessor for {self._attrs["name"]}'
-                    )
                 self._attrs["input_masks"][orig_idx] = False
                 idx += 1
             else:
diff --git a/python/aitemplate/compiler/ops/tensor/permute.py b/python/aitemplate/compiler/ops/tensor/permute.py
index e4f7c2011..a992b0dce 100644
--- a/python/aitemplate/compiler/ops/tensor/permute.py
+++ b/python/aitemplate/compiler/ops/tensor/permute.py
@@ -15,11 +15,12 @@
 """
 permute op
 """
-from typing import Sequence
+from typing import List, Sequence
 
+from .... import backend
+from ....backend import registry
 from ....utils.tensor_utils import wrap_dim
-
-from ...base import Operator, Tensor
+from ...base import IntVar, Operator, Tensor
 from .permute021 import permute021
 from .permute102 import permute102
 from .permute210 import permute210
@@ -34,14 +35,19 @@ def __init__(self):
         super().__init__()
         self._attrs["op"] = "permute"
 
+    def _infer_shapes(self, x: Tensor) -> List[IntVar]:
+        """Infers shapes for permute."""
+
+        output_shapes = []
+        input_shapes = x.shape()
+        for dim in self._attrs["dims"]:
+            output_shapes.append(input_shapes[dim])
+        return output_shapes
+
     def __call__(self, x: Tensor, dims: Sequence[int]) -> Tensor:
-        if len(dims) != 3:
-            raise NotImplementedError(
-                "Permute op doesn't support permute pattern {}".format(dims)
-            )
         dims = list(dims)
         for i, dim in enumerate(dims):
-            dims[i] = wrap_dim(dim, 3)
+            dims[i] = wrap_dim(dim, x._rank())
 
         if dims == [0, 2, 1]:
             return permute021()(x)
@@ -49,6 +55,24 @@ def __call__(self, x: Tensor, dims: Sequence[int]) -> Tensor:
             return permute102()(x)
         if dims == [2, 1, 0]:
             return permute210()(x)
-        raise NotImplementedError(
-            "Permute op doesn't support permute pattern {}".format(dims)
+
+        self._attrs["dims"] = dims
+        self._attrs["inputs"] = [x]
+        self._set_depth()
+
+        output_shapes = self._infer_shapes(x)
+        output = Tensor(output_shapes, src_ops={self})
+        self._attrs["outputs"] = [output]
+
+        # TODO: support output TensorAccessor
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
         )
diff --git a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
index cba756182..cb8b34819 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
@@ -19,6 +19,7 @@
 from .... import backend
 from ....backend import registry
 from ...base import IntImm, IntVar, Operator
+from ...stable_set import StableSet
 
 # pylint: disable=C0103,C0415,W0221
 
@@ -97,15 +98,15 @@ def _update_inputs_outputs(self, cat_op, reshape_op, cat_op_2):
             y._attrs["src_ops"].add(self)
 
         for op in self._attrs["slice_ops"]:
-            op._attrs["outputs"][0]._attrs["src_ops"] = set()
-            op._attrs["outputs"][0]._attrs["dst_ops"] = set()
+            op._attrs["outputs"][0]._attrs["src_ops"] = StableSet()
+            op._attrs["outputs"][0]._attrs["dst_ops"] = StableSet()
 
         for x in cat_op._attrs["inputs"]:
-            x._attrs["src_ops"] = set()
-            x._attrs["dst_ops"] = set()
+            x._attrs["src_ops"] = StableSet()
+            x._attrs["dst_ops"] = StableSet()
         for y in cat_op._attrs["outputs"]:
-            y._attrs["src_ops"] = set()
-            y._attrs["dst_ops"] = set()
+            y._attrs["src_ops"] = StableSet()
+            y._attrs["dst_ops"] = StableSet()
 
     def __init__(
         self, cat_op: Operator, reshape_op: Operator, cat_op_2: Operator
diff --git a/python/aitemplate/compiler/ops/tensor/slice_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
index 729201934..02e3d4666 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
@@ -16,6 +16,8 @@
 Slice_scatter.
 """
 
+from aitemplate.compiler.stable_set import StableSet
+
 from .... import backend
 from ....backend import registry
 from ...base import Operator
@@ -58,15 +60,15 @@ def _update_inputs_outputs(self, cat_op):
 
         self._attrs["outputs"] = cat_op._attrs["outputs"]
         for y in self._attrs["outputs"]:
-            y._attrs["src_ops"] = {self}
+            y._attrs["src_ops"] = StableSet({self})
 
         for op in self._attrs["slice_ops"]:
-            op._attrs["outputs"][0]._attrs["src_ops"] = set()
-            op._attrs["outputs"][0]._attrs["dst_ops"] = set()
+            op._attrs["outputs"][0]._attrs["src_ops"] = StableSet()
+            op._attrs["outputs"][0]._attrs["dst_ops"] = StableSet()
 
         for x in cat_op._attrs["inputs"]:
-            x._attrs["src_ops"] = set()
-            x._attrs["dst_ops"] = set()
+            x._attrs["src_ops"] = StableSet()
+            x._attrs["dst_ops"] = StableSet()
 
     def __init__(self, cat_op: Operator) -> None:
         super().__init__()
@@ -89,6 +91,9 @@ def __init__(self, cat_op: Operator) -> None:
     def __call__(self):
         raise RuntimeError("op {} cannot be called directly".format(self._attrs["op"]))
 
+    def _get_op_attributes(self):
+        raise NotImplementedError("slice_scatter get op attribute not implemented")
+
     def _get_func(self, fmt_str):
         target = backend.target.Target.current()
         func_key = fmt_str.format(target=target.name(), op=self._attrs["op"])
diff --git a/python/aitemplate/compiler/ops/tensor/topk.py b/python/aitemplate/compiler/ops/tensor/topk.py
index 89e7d0256..252e47507 100644
--- a/python/aitemplate/compiler/ops/tensor/topk.py
+++ b/python/aitemplate/compiler/ops/tensor/topk.py
@@ -19,6 +19,7 @@
 import os
 import re
 from collections import OrderedDict
+from operator import itemgetter
 from typing import List
 
 import jinja2
@@ -87,7 +88,11 @@ def __call__(self, x: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {"k": self._attrs["topK"]}
+
     def gen_function(self) -> str:
+        """call backend function"""
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_function".format(
             target=target.name(), op=self._attrs["op"]
@@ -98,12 +103,21 @@ def gen_function(self) -> str:
     def gen_profiler(
         self, workdir: str = None, dynamic_profiling_strategy=None
     ) -> None:
+        """Profile TopK to get workspace
+        Parameters
+        ----------
+        workdir : str, optional
+            [description], by default None
+        dynamic_profiling_strategy: DynamicProfileStrategy, optional
+            A dynamic profiling strategy, used to filter generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+        """
         target = backend.target.Target.current()
         func_key = "{target}.{op}.gen_profiler".format(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir)
+        return func(self._attrs, workdir)
 
     def _gen_exec_key(self, shape: List[int]):
         """rending the shape info"""
@@ -147,12 +161,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        workspace = out[1].workspace
         return workspace
 
     def profile(
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index 528c86f64..5f0a83344 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -156,6 +156,12 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "mode": self._attrs["mode"],
+            "scale_factor": self._attrs["scale_factor"],
+        }
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         template_path = target.template_path()
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
index 314124f5d..c44c8d30f 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
@@ -104,6 +104,12 @@ def __call__(self, x: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "iou_threshold": self._attrs["iou_threshold"],
+            "keep_n": self._attrs["keep_n"],
+        }
+
     def gen_function(self) -> str:
         """call backend function"""
         target = backend.target.Target.current()
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
index 44583e1c0..0b57d0502 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
@@ -19,6 +19,7 @@
 import os
 import re
 from collections import OrderedDict
+from operator import itemgetter
 from typing import List
 
 import jinja2
@@ -154,6 +155,14 @@ def __call__(self, boxes: Tensor, scores: Tensor) -> Tensor:
         ]
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "iouThreshold": self._attrs["iouThreshold"],
+            "minBoxSize": self._attrs["minBoxSize"],
+            "nmsMaxOut": self._attrs["nmsMaxOut"],
+            "preNmsTop": self._attrs["preNmsTop"],
+        }
+
     def _gen_exec_key(self, shape):
         """rendering shape info"""
         return self.exec_key_template.render(
@@ -187,7 +196,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir)
+        return func(self._attrs, workdir)
 
     def _invert_exec_key(self, key):
         tmp = re.findall(r"(\d+)", key)
@@ -214,12 +223,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        workspace = out[1].workspace
         return workspace
 
     def profile(
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
index cd6af1340..45dabb290 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
@@ -19,6 +19,7 @@
 import os
 import re
 from collections import OrderedDict
+from operator import itemgetter
 from typing import List
 
 import jinja2
@@ -139,6 +140,14 @@ def __call__(self, x: Tensor, scores: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        return {
+            "iouThreshold": self._attrs["iouThreshold"],
+            "minBoxSize": self._attrs["minBoxSize"],
+            "nmsMaxOut": self._attrs["nmsMaxOut"],
+            "preNmsTop": self._attrs["preNmsTop"],
+        }
+
     def _gen_exec_key(self, shape):
         """rending the shape info"""
         return self.exec_key_template.render(
@@ -171,7 +180,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs, workdir)
+        return func(self._attrs, workdir)
 
     def _invert_exec_key(self, key):
         tmp = re.findall(r"(\d+)", key)
@@ -197,12 +206,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         runner.join()
         result = runner.pull()
 
-        out = sorted(result, key=lambda x: x[1])
-        if len(out) == 0:
+        if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " + "" + "failed. " "Results: {}.".format(result)
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        workspace = out[0][1].workspace
+
+        out = min(result, key=itemgetter(1))
+        workspace = out[1].workspace
         return workspace
 
     def profile(
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
index 69fbb9eae..190c06207 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
@@ -117,3 +117,9 @@ def __call__(
         output = Tensor(output_shape, src_ops={self})
         self._attrs["outputs"] = [output]
         return output
+
+    def _get_op_attributes(self):
+        attr = super()._get_op_attributes()
+        attr["im_shape"] = self._attrs["im_shape"]
+
+        return attr
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
index a21f3dc1b..a962e20ad 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
@@ -198,6 +198,23 @@ def __call__(self, x: Tensor, rois: Tensor) -> List[Tensor]:
         self._attrs["outputs"] = [output]
         return output
 
+    def _get_op_attributes(self):
+        target_attrs = [
+            "continuous_coordinate",
+            "num_rois",
+            "pooled_size",
+            "position_sensitive",
+            "sampling_ratio",
+            "spatial_scale",
+        ]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
     def gen_function(self) -> str:
         target = backend.target.Target.current()
         template_path = target.template_path()
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index c9eaf9bf4..d0b664a3b 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -35,10 +35,13 @@
 from aitemplate.compiler.ops.common.elementwise import clamp, elementwise
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 
+from aitemplate.compiler.ops.common.int_elementwise import int_elementwise
+
 """GEMM"""
 from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
 from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias import gemm_rcr_bias
 from aitemplate.compiler.ops.gemm_universal.gemm_rrr import gemm_rrr
 
 """Reduce"""
@@ -61,6 +64,7 @@
 from aitemplate.compiler.ops.layernorm.layernorm import layernorm
 from aitemplate.compiler.ops.padding import nhwc3to8, pad_last_dim
 from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
+from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 from aitemplate.compiler.ops.softmax.softmax import softmax
 from aitemplate.compiler.ops.tensor.size import size
 from aitemplate.compiler.ops.tensor.topk import topk
diff --git a/python/aitemplate/compiler/stable_set.py b/python/aitemplate/compiler/stable_set.py
new file mode 100644
index 000000000..84a5704d7
--- /dev/null
+++ b/python/aitemplate/compiler/stable_set.py
@@ -0,0 +1,100 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+A stable set is like a Python set which produces deterministic results.
+It also tries to preserve the original element order as much as possible, which could
+potentially make debugging (e.g. comparison with the original graph, comparison between
+AIT GPU trace and other GPU traces) easier.
+"""
+
+from typing import Any, Sequence
+
+
+class StableSet:
+    def __init__(self, s: Sequence[Any] = None):
+        if s is None:
+            s = []
+        self._d = {item: None for item in s}
+
+    def add(self, value) -> None:
+        self._d[value] = None
+
+    def update(self, other) -> None:
+        for item in other:
+            self._d[item] = None
+
+    def discard(self, value) -> None:
+        self._d.pop(value, None)
+
+    def remove(self, value) -> None:
+        self._d.pop(value)
+
+    def copy(self):
+        return StableSet(list(self._d))
+
+    def clear(self):
+        self._d = {}
+
+    def __sub__(self, other):
+        res = self.copy()
+        for item in other:
+            res.discard(item)
+        return res
+
+    def __str__(self) -> str:
+        return str(list(self._d))
+
+    def __repr__(self) -> str:
+        return str(list(self._d))
+
+    def __len__(self) -> int:
+        return len(self._d)
+
+    def __contains__(self, value: Any) -> int:
+        return value in self._d
+
+    def __iter__(self):
+        return list(self._d).__iter__()
+
+    def _type_check(self, other):
+        if not isinstance(other, StableSet):
+            raise RuntimeError(
+                f"A StableSet can only be operated with another StableSet! "
+                f"Current type: {type(other)}."
+            )
+
+    def __eq__(self, other):
+        self._type_check(other)
+        return set(other._d) == set(self._d)
+
+    def __le__(self, other):
+        self._type_check(other)
+        return set(self._d) <= set(other._d)
+
+    def __lt__(self, other):
+        self._type_check(other)
+        return set(self._d) < set(other._d)
+
+    def __ge__(self, other):
+        self._type_check(other)
+        return set(self._d) >= set(other._d)
+
+    def __gt__(self, other):
+        self._type_check(other)
+        return set(self._d) > set(other._d)
+
+    def __getitem__(self, idx):
+        return list(self._d)[idx]
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index 232d5041a..b6276ed38 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -88,7 +88,7 @@ def __init__(self, original_tensor: Tensor) -> None:
         self._dim_mapping = [([i], [i]) for i in range(len(self.original_shapes))]
 
     def __deepcopy__(self, memo):
-        res = type(self)(Tensor(shape=self.original_shapes))
+        res = copy.copy(self)
         res.original_shapes = copy.deepcopy(self.original_shapes, memo)
         res.actual_shapes = copy.deepcopy(self.actual_shapes, memo)
         res._dim_mapping = copy.deepcopy(self._dim_mapping, memo)
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index bda828a9e..b336aac0c 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -23,7 +23,7 @@
 )
 from .fuse_mm_elementwise import fuse_mm_elementwise
 from .fuse_ops import fuse_ops
-from .fuse_permute_bmm import fuse_permute_bmm
+from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
 from .mark_param_tensor import mark_param_tensor, mark_special_views
 from .memory_planning import memory_planning
 from .name_graph import name_graph
@@ -32,6 +32,7 @@
 from .refine_graph import refine_graph
 from .remove_no_ops import remove_no_ops
 from .remove_unused_ops import remove_unused_ops
+from .split_large_concat_ops import split_large_concat_ops
 from .toposort import toposort
 from .transform_memory_ops import transform_memory_ops
 from .transform_odd_alignment import transform_odd_alignment
diff --git a/python/aitemplate/compiler/transform/apply_padding.py b/python/aitemplate/compiler/transform/apply_padding.py
index f04587ea0..5041d889e 100644
--- a/python/aitemplate/compiler/transform/apply_padding.py
+++ b/python/aitemplate/compiler/transform/apply_padding.py
@@ -214,7 +214,7 @@ def apply_padding(sorted_graph: List[Tensor], workdir: str = None) -> List[Tenso
 
             # Replaces the old op with the new op.
             for tensor_input in op._attrs["inputs"]:
-                tensor_input._attrs["dst_ops"].remove(op)
+                tensor_input._attrs["dst_ops"].discard(op)
             new_op = type(op)()
             new_op._attrs["split_k"] = op._attrs["split_k"]
             if "alpha" in op._attrs:
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index 6e9239618..0b6459750 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -19,7 +19,7 @@
 
 from aitemplate import backend, compiler
 
-from aitemplate.compiler.base import _NumpyConstantTensorData, Tensor
+from aitemplate.compiler.base import _NumpyConstantTensorData, IntVarTensor, Tensor
 from aitemplate.compiler.model import AITData, Model
 from aitemplate.compiler.transform.transform_utils import replace_tensor
 from aitemplate.utils import logger
@@ -75,7 +75,8 @@ def _extract_foldable_subgraph(
         elif tensor._attrs["is_param"]:
             # Params that do not have bound data cannot be folded.
             continue
-
+        elif isinstance(tensor, IntVarTensor):
+            continue
         foldable = all(
             inp._attrs["name"] in foldable_node_names
             for op in tensor._attrs["src_ops"]
@@ -123,15 +124,8 @@ def _constant_folding_impl(
     )
     file_pairs.extend(main_pairs)
     compile_engine = backend.builder.Builder()
-    compile_engine.build_objs(
-        file_pairs,
-        backend.target.Target.current().compile_cmd(False),
-        backend.target.Target.current().binary_compile_cmd(),
-    )
-
-    so_name = os.path.join(constant_folding_workdir, "test.so")
-    compile_engine.build_so(so_name, [p[1] for p in file_pairs])
-
+    so_name = os.path.join(constant_folding_workdir, "constant_folding.so")
+    compile_engine.make(file_pairs, "constant_folding.so", workdir, "constant_folding")
     module = Model(so_name, num_runtimes=1)
 
     outputs = {}
diff --git a/python/aitemplate/compiler/transform/fuse_group_ops.py b/python/aitemplate/compiler/transform/fuse_group_ops.py
index 6d515659b..c954167d1 100644
--- a/python/aitemplate/compiler/transform/fuse_group_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_group_ops.py
@@ -383,7 +383,7 @@ def _break_layernorm_groups(group: List[Operator]) -> List[List[Operator]]:
 
     for i in range(num_groups):
         begin = i * _MAX_LAYERNORM_GROUP
-        end = min((i + 1) * _MAX_LAYERNORM_GROUP, num_groups)
+        end = min((i + 1) * _MAX_LAYERNORM_GROUP, len(group))
         groups.append(group[begin:end])
     return groups
 
diff --git a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
index d822a707c..c97cfbfbe 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
@@ -26,6 +26,8 @@
     gemm_rcr_bias_add_add,
     gemm_rcr_bias_add_add_relu,
     gemm_rcr_bias_add_relu,
+    gemm_rcr_bias_fast_gelu,
+    gemm_rcr_bias_gelu,
     gemm_rcr_bias_mul,
     gemm_rcr_bias_mul_add,
     gemm_rcr_bias_mul_tanh,
@@ -97,6 +99,20 @@ def get_patterns():
             ),
             gemm_rcr_bias_tanh,
         ),
+        (
+            (
+                gemm_rcr_bias(),
+                elementwise(FuncEnum.GELU),
+            ),
+            gemm_rcr_bias_gelu,
+        ),
+        (
+            (
+                gemm_rcr_bias(),
+                elementwise(FuncEnum.FASTGELU),
+            ),
+            gemm_rcr_bias_fast_gelu,
+        ),
     ]
 
     gemm_rcr_bias_add_patterns = [
diff --git a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
new file mode 100644
index 000000000..7985ef354
--- /dev/null
+++ b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
@@ -0,0 +1,189 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Fuse GEMM + reshape + permute0213
+"""
+from typing import List, Sequence
+
+from ...utils import graph_utils
+from ..base import IntImm, Operator, Tensor
+from ..ops import gemm_rcr_permute
+from . import transform_utils
+from .toposort import toposort
+
+
+def _check_reshape(op: Operator) -> bool:
+    """check reshape [M, N] -> [M/D1, D1, D2, N/D2]
+    D1 and D2 must be static. Also checks alignment here.
+
+    Args:
+        op (Operator): reshape op
+
+    Returns:
+        bool: True if can fuse
+    """
+    input_shapes = op._attrs["inputs"][0].shape()
+    output_shapes = op._attrs["outputs"][0].shape()
+
+    if len(input_shapes) != 2 or len(output_shapes) != 4:
+        return False
+
+    m, n = input_shapes
+    m_d1, d1, d2, n_d2 = output_shapes
+
+    if not isinstance(n, IntImm) or not isinstance(n_d2, IntImm):
+        return False
+
+    if not isinstance(d1, IntImm) or not isinstance(d2, IntImm):
+        return False
+
+    d1 = d1.value()
+    d2 = d2.value()
+
+    if len(m._attrs["values"]) != len(m_d1._attrs["values"]):
+        return False
+
+    if n.value() != n_d2.value() * d2:
+        return False
+
+    # check alignment
+    if n_d2.value() % 2 == 1:
+        return False
+
+    return True
+
+
+def _check_permute(op: Operator, dims: Sequence[int]) -> bool:
+    """Check permute dims match input dims
+
+    Args:
+        op (Operator): permute op
+        dims (Sequence): permute dims
+
+    Returns:
+        bool: True if match
+    """
+    permute_dims = op._attrs["dims"]
+    if len(dims) != len(permute_dims):
+        return False
+    for d0, d1 in zip(dims, permute_dims):
+        if d0 != d1:
+            return False
+    return True
+
+
+def _fuse_gemm_reshape_permute0213(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """Fuse GEMM + reshape + permute0213
+    Fuse patterns like this together:
+
+    y0 = gemm_rcr(a, b) # [M, N]
+    y1 = reshape(y0, [M/D1, D1, D2, N/D2])
+    y2 = permute(y1, [0, 2, 1, 3])
+
+    into
+    y2 = gemm_rcr_permute(a, b, shape=[D1, D2], layout="0213")
+
+    fusion condition:
+    N/D2 must meet alignment condition: align > 1 for fp16
+    Otherwise, it causes perf regression to gemm.
+    Must run before any pass that modifies Tensor Accessor or fuses reshape
+
+    Args:
+        sorted_graph (List[Tensor]): input graph
+        workdir (str, optional): current workdir for dumping debug info. Defaults to None.
+
+    Returns:
+        List[Tensor]: optimized graph
+    """
+
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+
+    for op in sorted_ops:
+        if op._attrs["op"] != "gemm_rcr":
+            continue
+
+        outputs = op._attrs["outputs"]
+        assert len(outputs) == 1
+
+        gemm_output = outputs[0]
+        if len(gemm_output.dst_ops()) != 1:
+            continue
+
+        reshape_op = list(gemm_output.dst_ops())[0]
+
+        if reshape_op._attrs["op"] != "reshape":
+            continue
+
+        reshape_output = reshape_op._attrs["outputs"][0]
+        if len(reshape_output.dst_ops()) != 1:
+            continue
+
+        permute_op = list(reshape_output.dst_ops())[0]
+
+        if permute_op._attrs["op"] != "permute":
+            continue
+
+        permute_output = permute_op._attrs["outputs"][0]
+
+        # check reshape [M, N] -> [M/D1, D1, D2, N/D2]
+        if not _check_reshape(reshape_op):
+            continue
+
+        if not _check_permute(permute_op, [0, 2, 1, 3]):
+            continue
+
+        # fuse ops together
+        _, d1, d2, _ = reshape_output.shape()
+        d1_v = d1.value()
+        d2_v = d2.value()
+        gemm_permute_op = gemm_rcr_permute(shape=(d1_v, d2_v), layout="0213")
+        a, b = op._attrs["inputs"]
+        transform_utils.remove_dst_op_from_tensor(a, op)
+        transform_utils.remove_dst_op_from_tensor(b, op)
+
+        new_output = gemm_permute_op(a, b)
+
+        transform_utils.replace_tensor(permute_output, new_output)
+        sorted_graph.append(new_output)
+
+        transform_utils.remove_tensor_from_sorted_graph(gemm_output)
+        transform_utils.remove_tensor_from_sorted_graph(reshape_output)
+
+    sorted_graph = toposort(sorted_graph)
+    transform_utils.sanitize_sorted_graph(sorted_graph)
+    return sorted_graph
+
+
+def fuse_mm_reshape_permute(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """Fuse GEMM/BMM + reshape + permute into a single op
+
+    Args:
+        sorted_graph (List[Tensor]): input graph
+        workdir (str, optional): current workdir for dumping debug info. Defaults to None.
+
+    Returns:
+        List[Tensor]: optimized graph
+    """
+
+    funcs = [
+        _fuse_gemm_reshape_permute0213,
+    ]
+    for func in funcs:
+        sorted_graph = func(sorted_graph)
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index d03132c9a..61db0f8b9 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -87,6 +87,18 @@ def _find_fusable_elementwise_ops(op: Operator) -> Set[Operator]:
     for op in dependent_ops:
         if op._attrs["op"] != "elementwise":
             to_be_removed_set.add(op)
+        else:
+            # Assuming there are two elementwise ops, op1 and op2, where op1 is a
+            # parent op of op2. If op1's output is an output tensor, or if op1 is
+            # consumed by other non-elementwise ops, op1 cannot be fused with op2.
+            output = op._attrs["outputs"][0]
+            if output._attrs["is_output"]:
+                to_be_removed_set.add(op)
+                continue
+            for next_op in output.dst_ops():
+                if next_op._attrs["op"] != "elementwise":
+                    to_be_removed_set.add(op)
+
     dependent_ops = dependent_ops - to_be_removed_set
 
     # Then get all connected elementwise ops at the last layer.
diff --git a/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
new file mode 100644
index 000000000..4a09f5f5c
--- /dev/null
+++ b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
@@ -0,0 +1,246 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Perform fusions for permute+bmm operators.
+"""
+from typing import Callable, List, Optional, Set, Tuple, Type, Union
+
+from aitemplate.compiler.ops.tensor.permute import permute
+
+from .. import ops
+from ..base import IntImm, Operator, Tensor
+from ..ops.gemm_universal import (
+    bmm_ccr,
+    bmm_crr,
+    bmm_rcr,
+    bmm_rrr,
+    gemm_rcr,
+    gemm_rcr_bias,
+    gemm_rrr,
+    gemm_rrr_bias,
+)
+from ..ops.tensor import permute021
+from .fuse_utils import extract_only_one_op
+from .transform_utils import (
+    copy_src_op_attributes,
+    copy_tensor_attributes,
+    remove_dst_op_from_tensor,
+    remove_tensor_from_sorted_graph,
+    replace_tensor,
+    sanitize_sorted_graph,
+)
+
+# pylint: disable=C0103,W0612
+
+
+def _try_extract_one_mm_op(ops: Set[Union[None, Operator]]) -> Union[None, Operator]:
+    """
+    Helper function that returns the matmul op from src_ops() or dst_ops() call.
+    Return None if there's no bmm ops
+    """
+    if ops is None:
+        return None
+
+    for op in ops:
+        if op._attrs["op"].startswith("bmm") or op._attrs["op"].startswith("gemm"):
+            return op
+
+    return None
+
+
+def _fuse_permute_impl(
+    sorted_graph: List[Tensor],
+    source: List[Type[Operator]],
+    targets: List[Union[None, Type[Operator]]],
+    gemm_condition: Optional[Callable],
+    permute_condition: Optional[Callable],
+) -> Tuple[bool, List[Tensor]]:
+    """
+    Function that fuses [permute021 + bmm] into corresponding bmm op.
+
+    Parameters
+    ----------
+    sorted_graph : List[Tensor]
+        AIT graph to run fusion
+    source: List[Type[Operator]]
+        Combination of permute+bmm ops to be fused.
+        This should be of len-2
+    targets: List[Type[Operator]]
+        To be fused bmm that matches the source.
+        This should be of len 2, which corresponds to the operator that does
+        permute A and permute B respectively
+    gemm_condition: Optional[Callable]
+        If not None, we apply on the gemm op to check whether it requires fusion.
+    permute_condition: Optional[Callable]
+        If not None, we apply on the permute op to check whether it requires fusion.
+    """
+    assert len(source) == 2, "Source should have 2 elements, got {} instead".format(
+        len(source)
+    )
+
+    new_sorted_graph = []
+    fused = False
+    to_replace = {}
+    for tensor in sorted_graph:
+        if tensor in to_replace:
+            new_sorted_graph.append(to_replace[tensor])
+            replace_tensor(tensor, to_replace[tensor])
+            del to_replace[tensor]
+            continue
+        new_sorted_graph.append(tensor)
+
+        if fused:
+            continue
+        if tensor._attrs["is_output"]:
+            continue
+
+        permute_op = extract_only_one_op(tensor._attrs["src_ops"])
+        bmm_op = _try_extract_one_mm_op(tensor._attrs["dst_ops"])
+        if permute_op is None or bmm_op is None:
+            continue
+
+        if permute_op._attrs["op"] != source[0]()._attrs["op"]:
+            continue
+        if bmm_op._attrs["op"] != source[1]()._attrs["op"]:
+            continue
+        if gemm_condition is not None and not gemm_condition(bmm_op):
+            continue
+        if permute_condition is not None and not permute_condition(permute_op):
+            continue
+
+        assert len(permute_op._attrs["inputs"]) == 1
+        assert len(bmm_op._attrs["outputs"]) == 1
+
+        inputs = list(bmm_op._attrs["inputs"])
+        if targets[0] is None and inputs[0] == tensor:
+            continue
+        if targets[1] is None and inputs[1] == tensor:
+            continue
+
+        input_tensor = permute_op._attrs["inputs"][0]
+        output_tensor = bmm_op._attrs["outputs"][0]
+
+        # TODO: Check whether the input is weight to have better compile time
+        #       optimization on preprocessing of pad etc.
+        permute_shape = tensor.shape()
+        prepermute_shape = input_tensor.shape()
+
+        if (
+            isinstance(prepermute_shape[-1], IntImm)
+            and prepermute_shape[-1].value() % 2 == 1
+            and isinstance(permute_shape[-1], IntImm)
+            and permute_shape[-1].value() % 2 == 0
+        ):
+            # We don't run the permute+bmm fusion if the permute op could
+            # turn an odd alignment into even alignment.
+            continue
+
+        fused = True
+
+        remove_dst_op_from_tensor(bmm_op._attrs["inputs"], bmm_op)
+
+        target = None
+        if inputs[0] == tensor:
+            target = targets[0]
+            inputs[0] = input_tensor
+        elif inputs[1] == tensor:
+            target = targets[1]
+            inputs[1] = input_tensor
+        else:
+            raise RuntimeError(
+                "bmm inputs are {}, not matching permute's output tensor {}".format(
+                    inputs, tensor
+                )
+            )
+
+        if not tensor.dst_ops():
+            # Remove permute configs if this is the last bmm consuming the tensor
+            remove_dst_op_from_tensor(input_tensor, permute_op)
+            remove_tensor_from_sorted_graph(tensor)
+
+        new_tensor = target()(*inputs)
+        copy_tensor_attributes(new_tensor, output_tensor)
+        copy_src_op_attributes(new_tensor, output_tensor)
+        to_replace[output_tensor] = new_tensor
+
+    return (fused, sanitize_sorted_graph(new_sorted_graph))
+
+
+def fuse_permute_bmm_and_gemm(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """Fuse [permute021 + bmm] and [permute(0, 1) + gemm].
+
+    Note that for the latter fusion, we require that this pass takes
+    place before any gemm + elementwise fusions.
+
+    Parameters
+    ----------
+    sorted_graph : List[Tensor]
+        Input graph
+    workdir : str, optional
+        working dir, by default None
+
+    Returns
+    -------
+    List[Tensor]
+        Fused graph
+    """
+
+    def _need_broadcast_gemm(op: Operator):
+        if not op._attrs["op"].startswith("gemm"):
+            return False
+        inputs = op._attrs["inputs"]
+        return len(inputs[0].shape()) != 2 or len(inputs[1].shape()) != 2
+
+    def _is_transpose(op: Operator):
+        if op._attrs["op"] != "permute":
+            return False
+        dims = op._attrs["dims"]
+        return dims == [1, 0]
+
+    permute_mm_patterns = (
+        ([permute021, bmm_ccr], [bmm_rcr, bmm_crr], None, None),
+        ([permute021, bmm_crr], [bmm_rrr, bmm_ccr], None, None),
+        ([permute021, bmm_rcr], [bmm_ccr, bmm_rrr], None, None),
+        ([permute021, bmm_rrr], [bmm_crr, bmm_rcr], None, None),
+        ([permute021, gemm_rcr], [bmm_ccr, bmm_rrr], _need_broadcast_gemm, None),
+        ([permute021, gemm_rrr], [bmm_crr, bmm_rcr], _need_broadcast_gemm, None),
+        (
+            [permute021, gemm_rcr_bias],
+            [ops.gemm_universal.bmm_ccr_add, ops.gemm_universal.bmm_rrr_add],
+            _need_broadcast_gemm,
+            None,
+        ),
+        (
+            [permute021, gemm_rrr_bias],
+            [ops.gemm_universal.bmm_crr_add, None],
+            _need_broadcast_gemm,
+            None,
+        ),
+        ([permute, gemm_rcr], [None, gemm_rrr], None, _is_transpose),
+        ([permute, gemm_rrr], [None, gemm_rcr], None, _is_transpose),
+    )
+
+    graph_transformed = True
+    while graph_transformed:
+        graph_transformed = False
+        for source, targets, gemm_condition, permute_condition in permute_mm_patterns:
+            fused, sorted_graph = _fuse_permute_impl(
+                sorted_graph, source, targets, gemm_condition, permute_condition
+            )
+            graph_transformed |= fused
+
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/fuse_split.py b/python/aitemplate/compiler/transform/fuse_split.py
index 646a49a0e..6cb52e3c4 100644
--- a/python/aitemplate/compiler/transform/fuse_split.py
+++ b/python/aitemplate/compiler/transform/fuse_split.py
@@ -17,6 +17,8 @@
 """
 from typing import List
 
+from aitemplate.compiler.stable_set import StableSet
+
 from ...utils import graph_utils, logger
 from ..base import IntImm, IntVar, Operator, Tensor
 from . import transform_strided_ops_utils, transform_utils
@@ -126,7 +128,7 @@ def _valid_input(input_tensor):
             == split_input._attrs["shape"][split_dim]._attrs["values"][0]
         )
         # some final updates
-        split_input._attrs["dst_ops"] = [group_gemm_op]
+        split_input._attrs["dst_ops"] = StableSet([group_gemm_op])
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
 
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 87d3b1b53..7819190ef 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,7 @@
 import re
 from typing import List
 
-from ..base import Tensor
+from ..base import IntVarTensor, Tensor
 
 # pylint: disable=C0103
 
@@ -62,6 +62,12 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                 tensor_name = unique_name(f"tensor_{tensor_cnt}")
                 node._attrs["name"] = tensor_name
                 tensor_cnt += 1
+                if isinstance(node, IntVarTensor):
+                    # TODO: emit standalone dynamic shape initialization for IntVarTensor
+                    raise RuntimeError(
+                        "We don't support emitting standalone IntVarTensor at this moment.\n"
+                        f"Encountered {node._attrs['name']}: {node._attrs['int_var']}."
+                    )
 
         else:
             for func in funcs:
@@ -79,6 +85,11 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                     node_name = unique_name(f"{func_name}_{func_tensor_count}")
                     node._attrs["name"] = node_name
                     func_name_to_tensor_cnt[func_name] = func_tensor_count + 1
+                    if isinstance(node, IntVarTensor):
+                        shape_name = node._attrs["int_var"]._attrs["name"]
+                        if shape_name is None:
+                            node._attrs["int_var"]._attrs["name"] = node_name
+
         tensor_name = node._attrs["name"]
         for i, dim in enumerate(node._attrs["shape"]):
             if dim._attrs["name"] is None:
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 804de46aa..f959221ae 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -24,9 +24,11 @@
 from .fuse_conv_elementwise import fuse_conv_elementwise
 from .fuse_group_ops import fuse_group_ops
 from .fuse_mm_elementwise import fuse_mm_elementwise
+from .fuse_mm_reshape_permute import fuse_mm_reshape_permute
 from .fuse_ops import fuse_ops
 from .fuse_parallel_gemms import fuse_parallel_gemms
-from .fuse_permute_bmm import fuse_permute_bmm
+from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
+from .split_large_concat_ops import split_large_concat_ops
 from .transform_memory_ops import transform_memory_ops
 from .transform_odd_alignment import transform_odd_alignment
 from .transform_special_ops import transform_special_ops
@@ -37,6 +39,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
     """Applies graph optimizations, including
 
     - fuse permute and bmm
+    - fuse permute and gemm
     - transform odd alignment
     - fuse conv and elementwise
     - fuse gemm and elementwise
@@ -62,10 +65,11 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
     """
 
     funcs = [
-        fuse_permute_bmm,
+        fuse_permute_bmm_and_gemm,
         transform_odd_alignment,
         fuse_conv_elementwise,
         fuse_mm_elementwise,
+        fuse_mm_reshape_permute,
         transform_memory_ops,
         fuse_ops,
         # need to run before transform_strided_ops to fuse strided ops + concat
@@ -77,6 +81,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         transform_special_ops,
         apply_padding,
         transform_strided_ops,
+        split_large_concat_ops,
         transform_memory_ops,
     ]
 
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index cc65cb50c..549d86098 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -16,14 +16,37 @@
 Graph pass to invoke profiling.
 """
 import os
-from typing import List
+from copy import deepcopy
+from datetime import datetime
+from typing import List, OrderedDict
 
-from ...backend import codegen
+from aitemplate.backend.profiler_runner import ProfilerRunner
+
+from aitemplate.compiler.ops.gemm_universal.gemm_common import (
+    gemm,
+    GemmProfilerPostprocessingDelegate,
+)
+
+from aitemplate.utils import logger
+
+from ...backend import builder, codegen
 from ..base import DynamicProfileStrategy, Tensor
 
 # pylint: disable=C0103,W0613,W0102
 
 
+def elapsed_dt_sec(start_t_sec):
+    return datetime.now() - start_t_sec
+
+
+def _splitter(data, pred=bool):
+    group_a = []
+    group_b = []
+    for d in data:
+        (group_a if pred(d) else group_b).append(d)
+    return group_a, group_b
+
+
 def profile(
     sorted_graph: List[Tensor],
     workdir="./tmp",
@@ -51,22 +74,55 @@ def profile(
     if devices is None:
         devices = [0]
     profiler_dir = os.path.join(workdir)
-    codegen.gen_profiler(sorted_graph, profiler_dir, dynamic_profiling_strategy)
-    profiled = {}
+    start_t = datetime.now()
+    generated_profilers = list(
+        codegen.gen_profiler(sorted_graph, profiler_dir, dynamic_profiling_strategy)
+    )
+    generated_profilers = [p for p in generated_profilers if p is not None]
+    logger.info(
+        __name__,
+        f"generated {len(generated_profilers)} profilers elapsed time: {elapsed_dt_sec(start_t)}",
+    )
+    start_t = datetime.now()
+    compile_engine = builder.Builder()
+    compile_engine.make_profilers(generated_profilers, profiler_dir)
+    logger.info(__name__, f"compiled profilers elapsed time: {elapsed_dt_sec(start_t)}")
+    funcs_to_profile = OrderedDict(
+        {
+            func._attrs["name"]: func
+            for node in sorted_graph
+            for func in node.src_ops()
+            if func._attrs["has_profiler"]
+        }
+    )
+    start_t = datetime.now()
+    gemms, non_gemms = _splitter(
+        funcs_to_profile.values(), lambda f: isinstance(f, gemm)
+    )
+    for f in non_gemms:
+        f.profile(
+            workdir=profiler_dir,
+            devices=devices,
+            dynamic_profiling_strategy=dynamic_profiling_strategy,
+        )
+    profiler_runner = ProfilerRunner(
+        devices,
+        timeout=180,
+        postprocessing_delegate=GemmProfilerPostprocessingDelegate(),
+    )
+    for f in gemms:
+        f.profile(
+            workdir=profiler_dir,
+            profiler_runner=profiler_runner,
+        )
+    profiler_runner.join()
+    logger.info(
+        __name__,
+        f"ran {len(funcs_to_profile)} profilers elapsed time: {elapsed_dt_sec(start_t)}",
+    )
     for node in sorted_graph:
         for func in node.src_ops():
-            func_name = func._attrs["name"]
-            if func_name in profiled:
-                paths = func._attrs["exec_path"].keys()
-                for path in paths:
-                    func._attrs["exec_path"][path] = profiled[func_name]._attrs[
-                        "exec_path"
-                    ][path]
-                continue
             if func._attrs["has_profiler"]:
-                func.profile(
-                    workdir=profiler_dir,
-                    devices=devices,
-                    dynamic_profiling_strategy=dynamic_profiling_strategy,
+                func._attrs["exec_path"] = deepcopy(
+                    funcs_to_profile[func._attrs["name"]]._attrs["exec_path"]
                 )
-                profiled[func_name] = func
diff --git a/python/aitemplate/compiler/transform/profile_dynamic_dim.py b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
index 525e2072c..ee08c2716 100644
--- a/python/aitemplate/compiler/transform/profile_dynamic_dim.py
+++ b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
@@ -16,9 +16,9 @@
 Graph pass to invoke profiling with dynamic shapes.
 """
 from copy import deepcopy
-from typing import List
+from typing import List, OrderedDict
 
-from ...backend import codegen
+from ...backend import builder, codegen
 from ...utils import logger
 from ..base import Tensor
 
@@ -27,20 +27,25 @@
 
 def profile_dynamic_dim(sorted_graph: List[Tensor], workdir="./tmp"):
     logger.info(__name__, "Current dynamic profiler supports ONLY ONE dynamic dim.")
-    codegen.gen_profiler(sorted_graph, workdir)
-    profiled = {}
+    generated_profilers = list(codegen.gen_profiler(sorted_graph, workdir))
+    generated_profilers = [p for p in generated_profilers if p is not None]
+    compile_engine = builder.Builder()
+    compile_engine.make_profilers(generated_profilers, workdir)
+    funcs_to_profile = OrderedDict(
+        {
+            func._attrs["name"]: func
+            for node in sorted_graph
+            for func in node.src_ops()
+            if func._attrs["has_profiler"]
+        }
+    )
+    for f in funcs_to_profile.values():
+        f.profile_dynamic_dim(
+            workdir=workdir,
+        )
     for node in sorted_graph:
         for func in node.src_ops():
-            func_name = func._attrs["name"]
-            if func_name in profiled:
-                # paths = profiled[func_name]._attrs["exec_path"].keys()
+            if func._attrs["has_profiler"]:
                 func._attrs["exec_path"] = deepcopy(
-                    profiled[func_name]._attrs["exec_path"]
+                    funcs_to_profile[func._attrs["name"]]._attrs["exec_path"]
                 )
-                # for path in paths:
-                #     func._attrs["exec_path"][path] = \
-                #         profiled[func_name]._attrs["exec_path"][path]
-                continue
-            if func._attrs["has_profiler"]:
-                func.profile_dynamic_dim(workdir=workdir)
-                profiled[func_name] = func
diff --git a/python/aitemplate/compiler/transform/split_large_concat_ops.py b/python/aitemplate/compiler/transform/split_large_concat_ops.py
new file mode 100644
index 000000000..d2c6ee1ab
--- /dev/null
+++ b/python/aitemplate/compiler/transform/split_large_concat_ops.py
@@ -0,0 +1,124 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This transformation splits a concat with a large number of inputs into multiple
+concat ops, which share the same inputs with correct input_masks and the same
+output.
+"""
+import copy
+import logging
+
+from typing import List
+
+from aitemplate.compiler.stable_set import StableSet
+
+from ...utils import graph_utils
+from .. import ops
+from ..base import Operator, Tensor
+from . import transform_utils
+
+logger = logging.getLogger(__name__)
+
+CONCAT_INPUT_META_SIZE = 64
+CONCAT_OUTPUT_META_SIZE = 16
+MAX_CUDA_PARAM_BYTES = 4096
+
+
+def _concat_kernel_single_input_output_param_size(op: Operator):
+    """
+    Return the total size (in bytes) of the concat's params.
+    We need to adjust this if we change the concatenate op's params.
+    """
+    inputs = op._attrs["inputs"]
+    rank = inputs[0]._rank()
+    size_of_one_output_meta = CONCAT_OUTPUT_META_SIZE * rank
+    # There are 3 more params, where each takes 8 bytes, so we add 24 more bytes
+    total_params_size = CONCAT_INPUT_META_SIZE + size_of_one_output_meta + 24
+    logger.debug(f'concat op op._attrs["name"]: {total_params_size=}')
+    return total_params_size
+
+
+def split_large_concat_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
+    """
+    Our concatenate CUDA kernel takes an input meta argument whose size
+    is proportional to the number of inputs. In extreme cases, the total size
+    of the params of a concatenate kernel may exceed the limit imposed by
+    the CUDA compiler. In such cases, we split the concatenate op into separate
+    ones, each of which takes the original output and inputs with correct
+    input_masks values.
+    """
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in sorted_ops:
+        if not op._attrs["op"].startswith("concatenate"):
+            continue
+        concat_op = op
+        # We create InputMeta for inputs that need to copy data.
+        num_inputs = len([m for m in concat_op._attrs["input_masks"] if m is True])
+        concat_inputs = concat_op._attrs["inputs"]
+        assert num_inputs == len(
+            concat_inputs
+        ), f"expected {num_inputs=} and {len(concat_inputs)=} to be equal"
+        if num_inputs == 0:
+            continue
+        concat_params_size = _concat_kernel_single_input_output_param_size(concat_op)
+        if concat_params_size > MAX_CUDA_PARAM_BYTES:
+            raise RuntimeError(
+                f"cannot handle cases: {concat_params_size=} > {MAX_CUDA_PARAM_BYTES=}"
+            )
+        total_params_size = concat_params_size * num_inputs
+        if total_params_size <= MAX_CUDA_PARAM_BYTES:
+            continue
+        num_inputs_per_split = MAX_CUDA_PARAM_BYTES // concat_params_size
+        num_splits = (num_inputs + num_inputs_per_split - 1) // num_inputs_per_split
+        split_sizes = [num_inputs_per_split] * num_splits
+        if num_inputs % num_inputs_per_split:
+            split_sizes[num_splits - 1] = num_inputs % num_inputs_per_split
+
+        offset = 0
+        all_new_concat_ops = []
+        concat_outputs = concat_op._attrs["outputs"]
+        input_accessors = concat_op._attrs["input_accessors"]
+        for new_inputs_size in split_sizes:
+            new_concat_output = ops.concatenate()(
+                concat_inputs, concat_op._attrs["concat_dim"]
+            )
+            new_concat_op = list(new_concat_output.src_ops())[0]
+            new_concat_op._attrs["outputs"] = concat_outputs.copy()
+            new_concat_op._attrs["original_inputs"] = concat_op._attrs[
+                "original_inputs"
+            ].copy()
+            new_concat_op._attrs["input_masks"] = concat_op._attrs["input_masks"].copy()
+            new_concat_op._attrs["input_accessors"] = copy.deepcopy(input_accessors)
+            indices_to_remove = list(range(offset)) + list(
+                range(offset + new_inputs_size, num_inputs)
+            )
+            new_concat_op.remove_input_at(indices_to_remove)
+            new_concat_output._attrs["src_ops"] = StableSet()
+            new_concat_output._attrs["dst_ops"] = StableSet()
+            all_new_concat_ops.append(new_concat_op)
+            offset += new_inputs_size
+        # original inputs are distributed among new concats, so we need to adjust
+        # their dst_ops
+        for inp in concat_inputs:
+            new_dst_ops = StableSet()
+            for inp_dst_op in inp.dst_ops():
+                if inp in inp_dst_op._attrs["inputs"]:
+                    new_dst_ops.add(inp_dst_op)
+            inp._attrs["dst_ops"] = new_dst_ops
+        concat_output = concat_op._attrs["outputs"][0]
+        concat_output._attrs["src_ops"].update(all_new_concat_ops)
+        concat_output._attrs["src_ops"].remove(concat_op)
+    sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index f03ce6f2b..d6de21e7f 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -50,6 +50,18 @@ def _eliminate_cat(sorted_graph: List[Tensor]) -> List[Tensor]:
 
 
 def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
+    # Make sure input_accessors do not carry any strided information.
+    # It may happen. For example, an input of the cat can be of a strided
+    # tensor generated by slice, which takes another concat's output.
+    # Something like below:
+    #     y1 = concat(x0, x1)
+    #     y2 = slice(y1)
+    #     y = cat(y1, y2)
+    # In such a case, we cannot merge those two concat ops.
+    if not all(
+        accessor.stride_dim is None for accessor in cat._attrs["input_accessors"]
+    ):
+        return False
     first_op_inputs = first_op._attrs["inputs"]
     first_op_outputs = first_op._attrs["outputs"]
     cat_inputs = cat._attrs["inputs"]
@@ -79,10 +91,6 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     # change this part later when we have TensorAccessors, depending on
     # the order of the transformations.
     assert all(cat._attrs["input_masks"])
-    # make sure input_accessors do not carry any strided information
-    assert all(
-        accessor.stride_dim is None for accessor in cat._attrs["input_accessors"]
-    )
     cat._attrs["input_accessors"] = [TensorAccessor(t) for t in cat._attrs["inputs"]]
     cat._attrs["original_inputs"] = list(new_cat_inputs)
     cat._attrs["input_masks"] = [True] * len(new_cat_inputs)
diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index c0a56379c..d1dc9fecf 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -21,6 +21,7 @@
 
 from aitemplate.compiler.base import Operator, Tensor
 from aitemplate.compiler.public import IntImm
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.compiler.transform import transform_utils
 from aitemplate.utils import graph_utils
 
@@ -86,7 +87,7 @@ def _fuse_strided_op_and_view_op_single_pass(
                     accessor.update_base_tensor_shape(tensor)
                     tensor._attrs["is_view_of"] = None
                     src_op._attrs["outputs"][idx] = tensor
-                    tensor._attrs["src_ops"] = {src_op}
+                    tensor._attrs["src_ops"] = StableSet({src_op})
                     transform_utils.remove_tensor_from_sorted_graph(view_input_tensor)
                     break
             assert (
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 1ef6ade8e..c5dcd5652 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -93,8 +93,20 @@ def _is_strided_gemm(op_type: str) -> bool:
 
 
 def _gemm_cat_checker(gemm_op: Operator, cat_op: Operator) -> bool:
+    shapes = gemm_op._attrs["output_accessors"][0].original_shapes
+    rank = len(shapes)
+    cat_dim = cat_op._attrs["concat_dim"]
+    # For > 2D gemms, the only cat_dim possible is the last dim
+    # or cases like [m1, m2, 1, n] with cat_dim = -2 or -1
+    if rank > 2 and cat_dim != rank - 1:
+        for shape in shapes[cat_dim:-1]:
+            if shape.value() != 1:
+                return False
+
+    # Only correct for row major in C (C = A @ B)
     return transform_strided_ops_utils.gemm_stride_checker(
-        gemm_op._attrs["output_accessors"][0], cat_op._attrs["concat_dim"]
+        gemm_op._attrs["output_accessors"][0],
+        cat_dim,
     )
 
 
diff --git a/python/aitemplate/compiler/transform/transform_utils.py b/python/aitemplate/compiler/transform/transform_utils.py
index d27f6208a..ca66bea8b 100644
--- a/python/aitemplate/compiler/transform/transform_utils.py
+++ b/python/aitemplate/compiler/transform/transform_utils.py
@@ -19,6 +19,8 @@
 from collections import deque
 from typing import Dict, List, Union
 
+from aitemplate.compiler.stable_set import StableSet
+
 from ...utils import graph_utils, logger
 from ..base import Operator, Tensor
 from .mark_param_tensor import mark_param_tensor
@@ -242,8 +244,8 @@ def remove_tensor_from_sorted_graph(tensor: Tensor) -> None:
     Disconnects the tensor from others so that sanitize_sorted_graph()
     could remove it.
     """
-    tensor._attrs["src_ops"] = set()
-    tensor._attrs["dst_ops"] = set()
+    tensor._attrs["src_ops"] = StableSet()
+    tensor._attrs["dst_ops"] = StableSet()
     tensor._attrs["is_input"] = False
     tensor._attrs["is_output"] = False
 
diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index 4dc331ab5..c014b02b5 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -25,10 +25,9 @@
 from .roi_ops import *
 from .upsample import *
 from .view_ops import *
-from .attention import FlashAttention, MultiheadAttention
+from .attention import CrossAttention, FlashAttention, MultiheadAttention
 from .identity import Identity
 from .dropout import *
 from .layer_norm import *
 from .group_norm import *
-
-__all__ = ["Module", "ModuleDict", "ModuleList", "Sequential"]
+from .dual_gemm import T5DenseGatedGeluDense
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 978ba1949..2bc7a5917 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -32,6 +32,18 @@
 
 
 class FlashAttention(Module):
+    r"""FlashAttention provides an implementation for fused
+    multi-head attention module:
+
+    .. math::
+        \text{Attention}(Q, K, V) = \text{softmax}(\frac{QK}{\sqrt(d)}) * V
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    """
+
     def __init__(
         self,
         batch_size,
@@ -58,6 +70,31 @@ def forward(self, *args):
 
 
 class MultiheadAttention(Module):
+    r"""Multi-Head Attention.
+
+    Allows the model to jointly attend to information
+    from different representation subspaces as described in the paper:
+    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+
+    Multi-Head Attention is defined as:
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    Args:
+        dim: toal dimension of the model
+        batch_size: batch size
+        seq_len: sequence length
+        num_heads: Number of parallel attention heads. Default: 8
+        qkv_bias: whether to add bias to QKV. Default: False
+        attn_drop: Dropout probability on attention output weights. Default: ``0.0`` (no dropout).
+        proj_drop: Dropout probability on projection layers. Default: ``0.0`` (no dropout).
+        has_residual: has or has no residual. Default: `True`.
+        causal: default: `False`.
+        mask_seq: sequence mask, default: ``0``.
+    """
+
     def __init__(
         self,
         dim,
@@ -70,6 +107,7 @@ def __init__(
         has_residual=True,
         causal=False,
         mask_seq=0,
+        use_mem_eff=False,
     ):
         super().__init__()
         assert (
@@ -81,22 +119,29 @@ def __init__(
         self.causal = causal
         self.has_residual = has_residual
         self.mask_seq = mask_seq
+        self.use_mem_eff = use_mem_eff
 
         flash_head_dims = {8, 16, 32, 64, 128}
         # simple heuristic, may need refinement
         self.use_flash = (
-            not (seq_len >= 384 and batch_size <= 3)
+            not (seq_len >= 512 and batch_size <= 2)
         ) and head_dim in flash_head_dims
         # odd seq try use flash
         if seq_len % 2 == 1:
             self.use_flash = True
 
-        self.op = flash_attention(
-            batch_size=batch_size,
-            dropout=attn_drop,
-            max_seq_len=seq_len,
-            causal=causal,
-        )
+        if use_mem_eff:
+            self.op = ops.mem_eff_attention(
+                causal=causal,
+            )
+            self.use_flash = False
+        else:
+            self.op = flash_attention(
+                batch_size=batch_size,
+                dropout=attn_drop,
+                max_seq_len=seq_len,
+                causal=causal,
+            )
         # cu_length: the cumulative sequence lengths, used to index into hidden_states.
         self.cu_length = Parameter(shape=[batch_size + 1], dtype="int32")
         if self.mask_seq:
@@ -163,6 +208,14 @@ def attention(self, x):
             # input(x): (B*seqlen, 3, num_heads, head_dim)
             # output: (B, Seqlen, num_heads, head_dim)
             return self.op(x, self.cu_length.tensor())
+        elif USE_CUDA and self.use_mem_eff:
+            (q, k, v) = ops.split()(x, 1, dim=0)
+            _, b, num_heads, seqlen, d = self.get_shape(q)
+            return self.op(
+                ops.reshape()(q, [b, -1, seqlen, d]),
+                ops.reshape()(k, [b, -1, seqlen, d]),
+                ops.reshape()(v, [b, -1, seqlen, d]),
+            )
         else:
             # intput(q/k/v): (B*num_heads, seqlen, head_dim)
             # attn = (B, S, H) * (B, S, H) = (B, S, S) #RCR
@@ -192,9 +245,9 @@ def attention(self, x):
                     causal=self.causal,
                 )
                 out = OP(
-                    (ops.reshape()(q, [-1, seqlen, d])),
-                    (ops.reshape()(k, [-1, seqlen, d])),
-                    (ops.reshape()(v, [-1, seqlen, d])),
+                    ops.reshape()(q, [-1, seqlen, d]),
+                    ops.reshape()(k, [-1, seqlen, d]),
+                    ops.reshape()(v, [-1, seqlen, d]),
                 )
             return out
 
@@ -225,3 +278,119 @@ def forward(self, *args):
         x = self.proj_drop(x)
         x = ops.reshape()(x, [batch, seq, hidden])
         return x
+
+
+class CrossAttention(Module):
+    r"""Cross Multi-head Attention.
+
+    Allows the model to jointly attend to information
+    from different representation subspaces as described in the paper:
+    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+
+    Multi-Head Attention is defined as:
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    Args:
+        dim: toal dimension of the model
+        batch_size: batch size
+        seq_len: sequence length
+        num_heads: Number of parallel attention heads. Default: 8
+        qkv_bias: whether to add bias to QKV. Default: False
+        attn_drop: Dropout probability on attention output weights. Default: ``0.0`` (no dropout).
+        proj_drop: Dropout probability on projection layers. Default: ``0.0`` (no dropout).
+        has_residual: has or has no residual. Default: `True`.
+        causal: default: `False`.
+        mask_seq: sequence mask, default: ``0``.
+    """
+
+    def __init__(
+        self,
+        dim,
+        seq_len,
+        seq_len_kv,
+        num_heads,
+        qkv_bias=False,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        has_residual=True,
+        causal=False,
+    ):
+        super().__init__()
+        assert (
+            dim % num_heads == 0
+        ), f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.causal = causal
+        self.has_residual = has_residual
+        self.dim = dim
+        self.seqlen = seq_len
+        self.seqlen_kv = seq_len_kv
+
+        self.op = ops.mem_eff_attention(causal=causal)
+
+        self.proj_q = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+        self.proj_k = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+        self.proj_v = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+
+        self.attn_drop = Dropout(attn_drop)
+        self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
+        self.proj_drop = Dropout(proj_drop)
+
+    def qkv_proj(self, x):
+        batch, seq, hidden = self.get_shape(x)
+        x = ops.reshape()(x, [-1, hidden])
+        return self.qkv(x)
+
+    def attention(self, q, k, v):
+        seqlen = self.seqlen
+        seqlen_kv = self.seqlen_kv
+        head_dim = self.dim // self.num_heads
+
+        query = self.proj_q(q)
+        key = self.proj_k(k)
+        value = self.proj_v(v)
+
+        query = ops.permute()(
+            ops.reshape()(query, [-1, seqlen, self.num_heads, head_dim]), [0, 2, 1, 3]
+        )
+        key = ops.permute()(
+            ops.reshape()(key, [-1, seqlen_kv, self.num_heads, head_dim]), [0, 2, 1, 3]
+        )
+        value = ops.permute()(
+            ops.reshape()(value, [-1, seqlen_kv, self.num_heads, head_dim]),
+            [0, 2, 1, 3],
+        )
+        return self.op(query, key, value)
+
+    def forward(self, *args):
+        """forward pass for calling mha module"""
+        assert len(args) >= 3
+        x = args[0]
+        seq = self.seqlen
+        attn_output = self.attention(args[0], args[1], args[2])
+        attn_output = ops.reshape()(attn_output, [-1, seq, self.dim])
+
+        if self.has_residual:
+            assert len(args) == 4
+            x = self.proj(attn_output, args[3])
+        else:
+            x = self.proj(attn_output)
+        x = self.proj_drop(x)
+        x = ops.reshape()(x, [-1, seq, self.dim])
+        return x
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d.py b/python/aitemplate/frontend/nn/conv2d/conv2d.py
index 18b8c6ca7..fa1f1d0da 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d.py
@@ -109,6 +109,7 @@ def __init__(
         self.op = conv2d(stride=stride, pad=padding, dilate=dilation, group=groups)
 
     def forward(self, *args):
+        """Applies Conv2d on the input tensor."""
         assert len(args) == 1
         x = args[0]
         return self.op(x, self.weight.tensor())
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
index 92a03cf58..b3b99fae6 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
@@ -19,6 +19,47 @@
 
 
 class Conv2dBias(Conv2dBiasAct):
+    r"""Applies 2D convolution with bias.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int): Size of the convolving kernel
+        stride (int): Stride of the convolution
+        padding (int, optional): Padding added to all four sides of
+            the input. Default: 0
+        dilation (int, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        dtype (string, optional): Data type. Default: "float16"
+
+    Shape:
+        - Input: :math:`(N, H_{in}, W_{in}, C_{in})`
+        - Output: :math:`(N, H_{out}, W_{out}, C_{out})`, where
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{kernel_size}, \text{kernel_size}, `
+            :math:`\frac{\text{in_channels}}{\text{groups}})`.
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels).
+
+    Examples::
+
+        >>> m = nn.Conv2d(16, 33, 3, 2)
+        >>> input = Tensor(shape=[20, 50, 100, 16])
+        >>> output = m(input)
+
+    """
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
index a3938171c..343780b53 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
@@ -19,6 +19,27 @@
 
 
 class Conv2dBiasAddHardswish(Conv2dBiasAddAct):
+    r"""Applies 2D convolution with bias + add + hardswish.
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{kernel_size}, \text{kernel_size}, `
+            :math:`\frac{\text{in_channels}}{\text{groups}})`.
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels).
+
+    Args:
+        input (Tensor): the input tensor to apply 2D convolution on.
+        residual (Tensor): the residule tensor to add after Conv2dBias.
+
+    Examples::
+
+        >>> m = nn.Conv2dBiasAddRelu(128, 256, 3, 1)
+        >>> input = Tensor(shape=[4, 28, 28, 128])
+        >>> residual = Tensor(shape=[4, 28, 28, 256])
+        >>> output = m(input, residual)
+    """
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
index 7d15e22c4..f12c7a3ec 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
@@ -19,6 +19,27 @@
 
 
 class Conv2dBiasAddRelu(Conv2dBiasAddAct):
+    r"""Applies 2D convolution with bias + add + relu.
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{kernel_size}, \text{kernel_size}, `
+            :math:`\frac{\text{in_channels}}{\text{groups}})`.
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels).
+
+    Args:
+        input (Tensor): the input tensor to apply 2D convolution on.
+        residual (Tensor): the residule tensor to add after Conv2dBias.
+
+    Examples::
+
+        >>> m = nn.Conv2dBiasAddRelu(128, 256, 3, 1)
+        >>> input = Tensor(shape=[4, 28, 28, 128])
+        >>> residual = Tensor(shape=[4, 28, 28, 256])
+        >>> output = m(input, residual)
+    """
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
index 825d80bcc..f7494d54f 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
@@ -13,13 +13,16 @@
 #  limitations under the License.
 #
 """
-conv2d bias relu module
+conv2d bias for few channels
 """
 from .special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasFewChannels(SpecialConv2dBiasAct):
-    """functions for the op with conv2d+bias+relu pattern"""
+    r"""Applies 2D convolution with bias for few channels.
+
+    This layer equals to Conv2dBias but has improved performance for in_channels < 8.
+    """
 
     def __init__(
         self,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
index bd3251b64..89ccdd94f 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
@@ -19,6 +19,8 @@
 
 
 class Conv2dBiasHardswish(Conv2dBiasAct):
+    r"""Applies 2D convolution with bias + hardswish."""
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
index 73a6e42be..c6b6e4d0d 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -13,13 +13,16 @@
 #  limitations under the License.
 #
 """
-conv2d bias hardswish module
+conv2d bias hardswish module for few channels
 """
 from .special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasHardswishFewChannels(SpecialConv2dBiasAct):
-    """functions for the op with conv2d+bias+hardswish pattern"""
+    r"""Applies 2D convolution with bias + hardswish for few channels.
+
+    This layer equals to Conv2dBiasHardswish but has improved performance for in_channels < 8.
+    """
 
     def __init__(
         self,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
index 02a48d3f3..197ce60ce 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
@@ -19,6 +19,8 @@
 
 
 class Conv2dBiasRelu(Conv2dBiasAct):
+    r"""Applies 2D convolution with bias + relu."""
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
index 6f8b78f33..214ae2726 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
@@ -13,13 +13,16 @@
 #  limitations under the License.
 #
 """
-conv2d bias relu module
+conv2d bias relu for few channels
 """
 from .special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasReluFewChannels(SpecialConv2dBiasAct):
-    """functions for the op with conv2d+bias+relu pattern"""
+    r"""Applies 2D convolution with bias + relu for few channels.
+
+    This layer equals to Conv2dBiasRelu but has improved performance for in_channels < 8.
+    """
 
     def __init__(
         self,
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
index 5de24ea50..51c6eb839 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
@@ -19,6 +19,8 @@
 
 
 class Conv2dBiasSigmoid(Conv2dBiasAct):
+    r"""Applies 2D convolution with bias + sigmoid."""
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
index ecaa5498d..8fc7e6c45 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
@@ -12,13 +12,56 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-"""
-[summary] conv2d bias relu module
-"""
 from .transposed_conv2d_bias_act import ConvTranspose2dBiasAct
 
 
 class ConvTranspose2dBias(ConvTranspose2dBiasAct):
+    r"""Applies a 2D transposed convolution operator over an input image
+    composed of several input planes.
+
+    This module can be seen as the gradient of Conv2d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation as it does
+    not compute a true inverse of convolution). For more information, see the visualizations
+    `here`_ and the `Deconvolutional Networks`_ paper.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int): Size of the convolving kernel
+        stride (int): Stride of the convolution
+        padding (int, optional): Padding added to all four sides of
+            the input. Default: 0
+        dilation (int, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        dtype (string, optional): Data type. Default: "float16"
+
+    Shape:
+        - Input: :math:`(N, H_{in}, W_{in}, C_{in})`
+        - Output: :math:`(N, H_{out}, W_{out}, C_{out})`, where
+
+          .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride} - 2 \times \text{padding} + \text{dilation}
+                        \times (\text{kernel_size} - 1) + \text{output_padding} + 1
+          .. math::
+              W_{out} = (W_{in} - 1) \times \text{stride} - 2 \times \text{padding} + \text{dilation}
+                        \times (\text{kernel_size} - 1) + \text{output_padding} + 1
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{kernel_size}, \text{kernel_size}, `
+            :math:`\frac{\text{in_channels}}{\text{groups}})`.
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels).
+
+    .. _`here`:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    .. _`Deconvolutional Networks`:
+        https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf
+    """
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
index 368d8b4f7..a2d89c848 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
@@ -19,6 +19,8 @@
 
 
 class ConvTranspose2dBiasRelu(ConvTranspose2dBiasAct):
+    r"""Applies a 2D transposed convolution with bias + relu."""
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/dropout.py b/python/aitemplate/frontend/nn/dropout.py
index 0e3ddd41a..353ae44d7 100644
--- a/python/aitemplate/frontend/nn/dropout.py
+++ b/python/aitemplate/frontend/nn/dropout.py
@@ -19,6 +19,8 @@
 
 
 class Dropout(Module):
+    r"""Dropout placeholder"""
+
     def __init__(
         self,
         p=0,
@@ -27,12 +29,15 @@ def __init__(
         super().__init__()
 
     def forward(self, *args):
+        r"""Not implemented."""
         assert len(args) == 1
         data = args[0]
         return data
 
 
 class DropPath(Dropout):
+    r"""DropPath placeholder"""
+
     def __init__(
         self,
         dtype="float16",
diff --git a/python/aitemplate/frontend/nn/dual_gemm.py b/python/aitemplate/frontend/nn/dual_gemm.py
new file mode 100644
index 000000000..1db963eab
--- /dev/null
+++ b/python/aitemplate/frontend/nn/dual_gemm.py
@@ -0,0 +1,72 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Frontend for attention module
+"""
+from ...compiler import ops
+from .linear import Linear
+from .module import Module
+from .parameter import Parameter
+
+# pylint: disable=C0103
+
+
+class DualGemm(Module):
+    r"""DualGemm frontend"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        fast_gelu=True,
+        dtype="float16",
+    ):
+        """Initilize dual gemm module, create a tensor for weights"""
+        super().__init__()
+        self.w1 = Parameter(shape=[out_channels, in_channels], dtype=dtype)
+        self.w2 = Parameter(shape=[out_channels, in_channels], dtype=dtype)
+        if fast_gelu:
+            self.op = ops.dual_gemm_rcr_fast_gelu()
+        else:
+            self.op = ops.dual_gemm_rcr_silu()
+
+    def forward(self, *args):
+        """forward pass for calling attention op"""
+        assert len(args) == 1
+        x = args[0]
+        return self.op(x, self.w1.tensor(), self.w2.tensor())
+
+
+class T5DenseGatedGeluDense(Module):
+    r"""T5DenseGatedGeluDense."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        dtype="float16",
+    ):
+        super().__init__()
+        self.wi_0_weight = Parameter(shape=[out_channels, in_channels], dtype=dtype)
+        self.wi_1_weight = Parameter(shape=[out_channels, in_channels], dtype=dtype)
+        self.wo = Linear(out_channels, in_channels, bias=False)
+        self.op = ops.dual_gemm_rcr_fast_gelu()
+
+    def forward(self, *args):
+        """forward pass for calling T5 block"""
+        assert len(args) == 1
+        x = args[0]
+        hidden = self.op(x, self.wi_0_weight.tensor(), self.wi_1_weight.tensor())
+        return self.wo(hidden)
diff --git a/python/aitemplate/frontend/nn/embedding.py b/python/aitemplate/frontend/nn/embedding.py
index e72990e8a..018a597a1 100644
--- a/python/aitemplate/frontend/nn/embedding.py
+++ b/python/aitemplate/frontend/nn/embedding.py
@@ -23,6 +23,17 @@
 
 
 class Embedding(Module):
+    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+    This module is often used to store word embeddings and retrieve them using indices.
+    The input to the module is a list of indices, and the output is the corresponding
+    word embeddings.
+
+    Args:
+        shape (List[int]): denotes the shape of the embeddings which is typically `[num_embeddings, embedding_dim]` where `num_embeddings` is the size of the dictionary of embeddings, and `embedding_dim` is the size of each embedding vector.
+        dtype (string): denotes the data type
+    """
+
     def __init__(
         self,
         shape,
diff --git a/python/aitemplate/frontend/nn/identity.py b/python/aitemplate/frontend/nn/identity.py
index f3a421df5..ac51ae53d 100644
--- a/python/aitemplate/frontend/nn/identity.py
+++ b/python/aitemplate/frontend/nn/identity.py
@@ -21,6 +21,8 @@
 
 
 class Identity(Module):
+    """The identify of the input."""
+
     def __init__(
         self,
         dtype="float16",
diff --git a/python/aitemplate/frontend/nn/linear.py b/python/aitemplate/frontend/nn/linear.py
index d91099f2f..a50f27847 100644
--- a/python/aitemplate/frontend/nn/linear.py
+++ b/python/aitemplate/frontend/nn/linear.py
@@ -27,6 +27,43 @@
 
 
 class Linear(Module):
+    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+
+    Args:
+        in_channels: size of each input sample
+        out_channels: size of each output sample
+        bias: If set to ``False``, the layer will not learn an additive bias.
+            Default: ``True``
+        specialization: elementwise operation to add after the linear operation,
+            Default: ``None``
+        dtype: data type, default: ``float16``
+
+    Shape:
+
+        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
+          dimensions including none and :math:`H_{in} = \text{in_channels}`.
+        - Output: :math:`(*, H_{out})` where all but the last dimension
+          are the same shape as the input and :math:`H_{out} = \text{out_channels}`.
+
+    Attributes:
+
+        weight: the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{in_channels})`. The values are
+            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+            :math:`k = \frac{1}{\text{in\_channels}}`
+        bias:   the learnable bias of the module of shape :math:`(\text{out_channels})`.
+                If :attr:`bias` is ``True``, the values are initialized from
+                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                :math:`k = \frac{1}{\text{in_channels}}`
+
+    Examples::
+
+        >>> m = nn.Linear(20, 30)
+        >>> input = Tensor(shape=[128, 20])
+        >>> output = m(input)
+        Tensor(shape=[128, 30])
+    """
+
     def __init__(
         self,
         in_channels,
diff --git a/python/aitemplate/frontend/nn/padding.py b/python/aitemplate/frontend/nn/padding.py
index d2caa58a0..c1a6efb6d 100644
--- a/python/aitemplate/frontend/nn/padding.py
+++ b/python/aitemplate/frontend/nn/padding.py
@@ -20,6 +20,8 @@
 
 
 class Nhwc3to8(Module):
+    r"""Pads the input data with nhwc dimensions from 3 channels to 8 channels"""
+
     def __init__(self):
         super().__init__()
         self.op = nhwc3to8()
diff --git a/python/aitemplate/frontend/nn/pool2d.py b/python/aitemplate/frontend/nn/pool2d.py
index 46d38bca7..212847d77 100644
--- a/python/aitemplate/frontend/nn/pool2d.py
+++ b/python/aitemplate/frontend/nn/pool2d.py
@@ -20,22 +20,72 @@
 
 
 class MaxPool2d(Module):
+    r"""Applies a 2D max pooling over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, H, W, C)`,
+    output :math:`(N, H_{out}, W_{out}, C)` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \begin{aligned}
+            out(N_i, h, w, C_j) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
+                                    & \text{input}(N_i, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n, C_j)
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
+    for :attr:`padding` number of points.
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window
+        padding: implicit zero padding to be added on both sides
+    """
+
     def __init__(self, kernel_size, stride, padding=0):
         super().__init__()
         self.op = max_pool2d(kernel_size, stride, padding)
 
     def forward(self, *args):
+        r"""Applies MaxPool2d on the input."""
         assert len(args) == 1
         x = args[0]
         return self.op(x)
 
 
 class AvgPool2d(Module):
+    r"""Applies a 2D average pooling over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, H, W, C)`,
+    output :math:`(N, H_{out}, W_{out}, C)` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, h, w, C_j)  = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
+                               input(N_i, stride[0] \times h + m, stride[1] \times w + n, C_j)
+
+    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
+    for :attr:`padding` number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: the size of the window to take an avg over
+        stride: the stride of the window
+        padding: implicit zero padding to be added on both sides
+    """
+
     def __init__(self, kernel_size, stride, padding):
         super().__init__()
         self.op = avg_pool2d(kernel_size, stride, padding)
 
     def forward(self, *args):
+        r"""Applies AvgPool2d on the input."""
         assert len(args) == 1
         x = args[0]
         return self.op(x)
diff --git a/python/aitemplate/frontend/nn/roi_ops.py b/python/aitemplate/frontend/nn/roi_ops.py
index 401813f09..12e1f7621 100644
--- a/python/aitemplate/frontend/nn/roi_ops.py
+++ b/python/aitemplate/frontend/nn/roi_ops.py
@@ -20,6 +20,38 @@
 
 
 class RoiAlign(Module):
+    r"""
+    Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
+
+     * :attr:`num_rois` identifies the number of RoIs in the input.
+
+     * :attr:`pooled_size` identifies the size of the pooling section, i.e., the size of the output (in bins or pixels) after the pooling
+       is performed, as (height, width).
+
+     * :attr:`sampling_ratio` is the number of sampling points in the interpolation grid
+       used to compute the output value of each pooled output bin. If > 0,
+       then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
+       <= 0, then an adaptive number of grid points are used (computed as
+       ``ceil(roi_width / output_width)``, and likewise for height).
+
+     * :attr:`spatial_scale` is a scaling factor that maps the box coordinates to
+       the input coordinates. For example, if your boxes are defined on the scale
+       of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+       the original image), you'll want to set this to 0.5.
+
+     * :attr:`position_sensitive`, a bool value.
+
+     * :attr:`continuous_coordinate`. a bool value.
+
+    Args:
+        x (Tensor[N, H, W, C]): the feature map, i.e. a batch with ``N`` elements. Each element contains ``C`` feature maps of dimensions ``H x W``.
+        rois (Tensor[roi_batch, 5]): the list of RoIs and each ROI contains the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``, and the box coordinates in (x1, y1, x2, y2) format where the regions will be taken from. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Return:
+        Tensor[roi_batch, pooled_size, pooled_size, C]: the fixed-size feature maps, i.e., the pooled RoIs.
+
+    """
+
     def __init__(
         self,
         num_rois,
@@ -40,6 +72,7 @@ def __init__(
         )
 
     def forward(self, *args):
+        """Performs RoiAlign on the input."""
         assert len(args) == 2
         x = args[0]
         rois = args[1]
@@ -47,6 +80,43 @@ def forward(self, *args):
 
 
 class FPNRoiAlign(Module):
+    """
+    Performs Multiple level Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
+
+     * :attr:`num_rois` identifies the number of RoIs in the input.
+
+     * :attr:`pooled_size` identifies the size of the pooling section, i.e., the size of the output (in bins or pixels) after the pooling
+       is performed, as (height, width).
+
+     * :attr:`sampling_ratio` is the number of sampling points in the interpolation grid
+       used to compute the output value of each pooled output bin. If > 0,
+       then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
+       <= 0, then an adaptive number of grid points are used (computed as
+       ``ceil(roi_width / output_width)``, and likewise for height).
+
+     * :attr:`spatial_scale` is a scaling factor that maps the box coordinates to
+       the input coordinates. For example, if your boxes are defined on the scale
+       of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+       the original image), you'll want to set this to 0.5.
+
+     * :attr:`position_sensitive`, a bool value.
+
+     * :attr:`continuous_coordinate`, a bool value.
+
+     * :attr:`im_shape`, original image shape.
+
+    Args:
+        p1 (Tensor[N, H//4, W//4, C]): the feature map, i.e. a batch with ``N`` elements. Each element contains ``C`` feature maps of dimensions ``(H//4) x (W//4)``.
+        p2 (Tensor[N, H//8, W//8, C]): the feature map, i.e. a batch with ``N`` elements. Each element contains ``C`` feature maps of dimensions ``(H//8) x (W//8)``.
+        p3 (Tensor[N, H//16, W//16, C]): the feature map, i.e. a batch with ``N`` elements. Each element contains ``C`` feature maps of dimensions ``(H//16) x (W//16)``.
+        p4 (Tensor[N, H//32, W//32, C]): the feature map, i.e. a batch with ``N`` elements. Each element contains ``C`` feature maps of dimensions ``(H//32) x (W//32)``.
+        rois (Tensor[roi_batch, 5]): the list of RoIs and each ROI contains the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``, and the box coordinates in (x1, y1, x2, y2) format where the regions will be taken from. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Return:
+        Tensor[num_rois * N, pooled_size, pooled_size, C]: the fixed-size feature maps, i.e., the pooled RoIs.
+
+    """
+
     def __init__(
         self,
         num_rois,
@@ -69,6 +139,7 @@ def __init__(
         )
 
     def forward(self, *args):
+        """Performs Multi Level RoiAlign on the input."""
         assert len(args) >= 2
         x = args[0]
         rois = args[1]
diff --git a/python/aitemplate/frontend/nn/upsample.py b/python/aitemplate/frontend/nn/upsample.py
index ab85c6d61..aa6a90edd 100644
--- a/python/aitemplate/frontend/nn/upsample.py
+++ b/python/aitemplate/frontend/nn/upsample.py
@@ -20,6 +20,25 @@
 
 
 class Upsampling2d(Module):
+    r"""
+    Applies a 2D bilinear upsampling to an input signal composed of several input
+    channels.
+
+    To specify the scale, it takes the :attr:`scale_factor` as it's constructor argument.
+
+    * :attr:`scale_factor` (float): multiplier for spatial size.
+
+    * :attr:`mode` (str): the upsampling algorithm: one of ``'nearest'``,
+      ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
+      Currently we support ``'bilinear'`` and  ``'nearest'`` mode.
+
+    Args:
+        input (Tensor [N, H, W, C]): the input data.
+
+    Return:
+        Tensor [N, H_out, W_out, C].
+    """
+
     def __init__(self, scale_factor, mode):
         super().__init__()
         self.op = upsampling2d(scale_factor, mode)
@@ -31,6 +50,8 @@ def forward(self, *args):
 
 
 class Upsampling2dAdd(Module):
+    r"""Applies Upsampling2d + add."""
+
     def __init__(self, scale_factor, mode):
         super().__init__()
         self.op = upsampling2d_add(scale_factor, mode)
diff --git a/python/aitemplate/frontend/nn/view_ops.py b/python/aitemplate/frontend/nn/view_ops.py
index dc6b03715..f4afc902e 100644
--- a/python/aitemplate/frontend/nn/view_ops.py
+++ b/python/aitemplate/frontend/nn/view_ops.py
@@ -20,11 +20,20 @@
 
 
 class Reshape(Module):
+    """
+    Returns a tensor with the same data and number of elements as input, but with the
+    specified shape. Inputs must be contiguous.
+
+    A single dimension may be -1, in which case it’s inferred from the remaining
+    dimensions and the number of elements in input.
+    """
+
     def __init__(self):
         super().__init__()
         self.op = reshape()
 
     def forward(self, *args):
+        """Reshaped the input to given size."""
         assert len(args) == 2
         x = args[0]
         shape = args[1]
@@ -32,11 +41,20 @@ def forward(self, *args):
 
 
 class View(Module):
+    """
+    Placeholder for View layer. The current implementation is the same as Reshape.
+    Returns a tensor with the same data and number of elements as input, but with the specified shape. Inputs must be contiguous.
+
+    A single dimension may be -1, in which case it’s inferred from the remaining
+    dimensions and the number of elements in input.
+    """
+
     def __init__(self):
         super().__init__()
         self.op = reshape()
 
     def forward(self, *args):
+        """Creates a view (copy) of the input with given shape."""
         assert len(args) == 2
         x = args[0]
         shape = args[1]
@@ -44,11 +62,18 @@ def forward(self, *args):
 
 
 class Flatten(Module):
+    """
+    Flattens input by reshaping it into a one-dimensional tensor. If start_dim or end_dim
+    are passed, only dimensions starting with start_dim and ending with end_dim are
+    flattened. The order of elements in input is unchanged.
+    """
+
     def __init__(self, start_dim=0, end_dim=-1):
         super().__init__()
         self.op = flatten(start_dim, end_dim)
 
     def forward(self, *args):
+        """Flattens the input with specified start and end dims."""
         assert len(args) == 1
         x = args[0]
         return self.op(x)
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index fed4d4a9c..9e1867f44 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -36,13 +36,12 @@ def _detect_cuda():
         )
         stdout, stderr = proc.communicate()
         stdout = stdout.decode("utf-8")
-        if "A100" in stdout or "RTX 30" in stdout or "A30" in stdout or "A10" in stdout:
+        if "A100" in stdout or "RTX 30" in stdout or "A30" in stdout:
             return "80"
+        if "V100" in stdout:
+            return "70"
         if "T4" in stdout:
-            if os.environ.get("CI_FLAG", None) == "CIRCLECI":
-                return "75"
-            else:
-                return None
+            return "75"
         return None
     except Exception:
         return None
@@ -77,7 +76,6 @@ def detect_target(**kwargs):
             return CUDA(arch=FLAG, **kwargs)
         else:
             return ROCM(arch=FLAG, **kwargs)
-
     doc_flag = os.getenv("BUILD_DOCS", None)
     if doc_flag is not None:
         return CUDA(arch="80", **kwargs)
diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index 6e0902313..b41eabd98 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -16,6 +16,7 @@
 # flake8: noqa
 
 from . import (
+    alignment,
     graph_utils,
     logger,
     markdown_table,
diff --git a/python/aitemplate/utils/alignment.py b/python/aitemplate/utils/alignment.py
new file mode 100644
index 000000000..d171a8cb8
--- /dev/null
+++ b/python/aitemplate/utils/alignment.py
@@ -0,0 +1,36 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Util functions to handle alignment.
+"""
+# Currently read4, add2 is best for both backend, so two backend seems identical.
+# They may diverge when we got deeper understanding / further optimization.
+ALIGNMENTS = [
+    8,
+    4,
+    2,
+    1,
+]
+
+
+def find_max_alignment(number: int) -> int:
+    """
+    Return the first alignment value that meets the alignment requirement
+    for accessing the `number` of elements. This is dtype dependent.
+    """
+    for alignment in ALIGNMENTS:
+        if number % alignment == 0:
+            return alignment
+    return 1
diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index 9f5dedc51..600bf14f8 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -63,12 +63,14 @@ def sorted_op_pseudo_code(ops, with_shape=True) -> str:
 
 
 def dump_graph_debug_str_to_file(tensors, workdir, name):
-    prefix = os.path.join(workdir, name)
-    graph_path = prefix + "_graph.txt"
-    pseudo_code_path = prefix + "_pseudo_code.txt"
-    with open(graph_path, "w") as f:
-        f.write(sorted_graph_debug_str(tensors))
-        logger.info(__file__, f"Dumped {name} graph to {graph_path}")
-    with open(pseudo_code_path, "w") as f:
-        f.write(sorted_graph_pseudo_code(tensors))
-        logger.info(__file__, f"Dumped {name} pseudo code to {pseudo_code_path}")
+    if logger.is_debug():
+        # Dump graph and pseudo code for debug only
+        prefix = os.path.join(workdir, name)
+        graph_path = prefix + "_graph.txt"
+        pseudo_code_path = prefix + "_pseudo_code.txt"
+        with open(graph_path, "w") as f:
+            f.write(sorted_graph_debug_str(tensors))
+            logger.debug(__file__, f"Dumped {name} graph to {graph_path}")
+        with open(pseudo_code_path, "w") as f:
+            f.write(sorted_graph_pseudo_code(tensors))
+            logger.debug(__file__, f"Dumped {name} pseudo code to {pseudo_code_path}")
diff --git a/python/aitemplate/utils/logger.py b/python/aitemplate/utils/logger.py
index cd7eba07e..7dfdba771 100644
--- a/python/aitemplate/utils/logger.py
+++ b/python/aitemplate/utils/logger.py
@@ -16,6 +16,7 @@
 default logger
 """
 import logging
+import os
 
 
 def info(name, message):
@@ -36,3 +37,22 @@ def warning(name, message):
 def is_debug():
     logger = logging.getLogger("aitemplate")
     return logger.level == logging.DEBUG
+
+
+def setup_logger(name):
+    root_logger = logging.getLogger(name)
+    info_handle = logging.StreamHandler()
+    formatter = logging.Formatter("%(asctime)s %(levelname)s <%(name)s> %(message)s")
+    info_handle.setFormatter(formatter)
+    root_logger.addHandler(info_handle)
+    root_logger.propagate = False
+
+    DEFAULT_LOGLEVEL = logging.getLogger().level
+    log_level_str = os.environ.get("LOGLEVEL", None)
+    LOG_LEVEL = (
+        getattr(logging, log_level_str.upper())
+        if log_level_str is not None
+        else DEFAULT_LOGLEVEL
+    )
+    root_logger.setLevel(LOG_LEVEL)
+    return root_logger
diff --git a/python/aitemplate/utils/mk_ck_lib/__init__.py b/python/aitemplate/utils/mk_ck_lib/__init__.py
new file mode 100644
index 000000000..0988106cc
--- /dev/null
+++ b/python/aitemplate/utils/mk_ck_lib/__init__.py
@@ -0,0 +1,18 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# flake8: noqa
+
+from . import conv2d_operation, gemm_operation, generator, library, manifest
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index 5a59cc185..c65f6c4dc 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -2015,8 +2015,6 @@ def CreateGroupNormOperator(manifest, rank=5):
         groupnorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8),
         groupnorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8),
         groupnorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2),
     ]
 
     operations = []
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index bcd7f85ef..9aa3aade9 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -31,6 +31,8 @@ class EpilogueFunctor(enum.Enum):
   LinearCombinationGELU = enum_auto()
   LinearCombinationFastGELU = enum_auto()
   LinearCombinationSilu = enum_auto()
+  LeftSiLUAndMul = enum_auto()
+  LeftFastGeluAndMul = enum_auto()
 
 EpilogueFunctorTag = {
   EpilogueFunctor.LinearCombination:
@@ -55,6 +57,10 @@ class EpilogueFunctor(enum.Enum):
     'cutlass::epilogue::thread::LinearCombinationFastGELU',
   EpilogueFunctor.LinearCombinationSilu:
     'cutlass::epilogue::thread::LinearCombinationSilu',
+  EpilogueFunctor.LeftSiLUAndMul:
+    'cutlass::epilogue::thread::LeftSiLUAndMul',
+  EpilogueFunctor.LeftFastGeluAndMul:
+    'cutlass::epilogue::thread::LeftFastGeluAndMul',
 }
 
 EpilogueFunctorName = {
@@ -68,7 +74,9 @@ class EpilogueFunctor(enum.Enum):
   "LinearCombinationHardSwish": EpilogueFunctor.LinearCombinationHardSwish,
   "LinearCombinationGELU": EpilogueFunctor.LinearCombinationGELU,
   "LinearCombinationFastGELU": EpilogueFunctor.LinearCombinationFastGELU,
-  "LinearCombinationSilu": EpilogueFunctor.LinearCombinationSilu
+  "LinearCombinationSilu": EpilogueFunctor.LinearCombinationSilu,
+  "LeftSiLUAndMul": EpilogueFunctor.LeftSiLUAndMul,
+  "LeftFastGeluAndMul": EpilogueFunctor.LeftFastGeluAndMul,
 }
 
 class EpilogueMath(enum.Enum):
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py b/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
index cb15736a3..5a428bcbe 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
@@ -235,6 +235,141 @@ def emit(self, operation):
     return SubstituteTemplate(self.gemm_template, values)
 
 
+'''
+)
+
+DUAL_GEMM_TEMPLATE = jinja2.Template(
+    """
+    using ${operation_name}_base =
+    cutlass::gemm::device::DualGemm<
+        ${element_a}, ${layout_a},
+        ${element_b}, ${layout_b},
+        ${element_c}, ${layout_c},
+        ${element_accumulator},
+        ${opcode_class},
+        ${arch},
+        cutlass::gemm::GemmShape<${threadblock_shape_m},
+                                  ${threadblock_shape_n},
+                                  ${threadblock_shape_k}>,
+        cutlass::gemm::GemmShape<${warp_shape_m},
+                                  ${warp_shape_n},
+                                  ${warp_shape_k}>,
+        cutlass::gemm::GemmShape<${instruction_shape_m},
+                                  ${instruction_shape_n},
+                                  ${instruction_shape_k}>,
+        ${epilogue_functor}
+        ${epilogue_functor}
+        ${epilogue_functor2}
+        ${swizzling_functor},
+        ${stages},
+        false,
+        false,
+        false
+    >;
+"""
+)
+
+SRC_DUAL_GEMM_TEMPLATE = jinja2.Template(
+    '''
+class EmitDualGemmInstance:
+  def __init__(self, operation_suffix = ''):
+      self.operation_suffix = operation_suffix
+      self.includes = []
+      self.builtin_epilogue_functor_template = """
+          ${epilogue_functor}<
+            ${element_c},
+            ${epilogue_vector_length},
+            ${element_accumulator},
+            ${element_epilogue},
+            cutlass::epilogue::thread::ScaleType::Nothing
+          >,
+      """
+      self.builtin_epilogue_functor2_template = """
+          ${epilogue_functor2}<
+            ${element_c},
+            ${epilogue_vector_length},
+            ${element_c},
+            ${element_epilogue}
+          >,
+      """
+
+      self.gemm_template = """
+      {{gemm_template}}
+      """
+
+
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    transpose_layouts = {
+      LayoutType.ColumnMajor: LayoutType.RowMajor,
+      LayoutType.RowMajor: LayoutType.ColumnMajor
+    }
+
+    instance_layout_A, instance_layout_B, instance_layout_C = \
+      (operation.A.layout, operation.B.layout, operation.C.layout)
+    #
+
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+
+      epilogue_vector_length = \
+        min(operation.C.alignment * DataTypeSize[operation.C.element], 128) \
+          // DataTypeSize[operation.C.element]
+
+      values = {
+        'epilogue_vector_length': str(epilogue_vector_length),
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+        'epilogue_functor2': EpilogueFunctorTag[operation.epilogue_functor2],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+      epilogue_functor2 = SubstituteTemplate(self.builtin_epilogue_functor2_template, values)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+      epilogue_functor2 = self.epilogue_functor.emit_declaration()
+
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'operation_suffix': self.operation_suffix,
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[instance_layout_A],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[instance_layout_B],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[instance_layout_C],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_functor': epilogue_functor,
+      'epilogue_functor2': epilogue_functor2,
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'align_c': str(operation.C.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
+    }
+
+    return SubstituteTemplate(self.gemm_template, values)
+
+
 '''
 )
 
@@ -247,4 +382,5 @@ def emit_library():
     template += SRC_PERMUTE_TEMPLATE.render(
         gemm_template=GEMM_PERMUTE_TEMPLATE.render()
     )
+    template += SRC_DUAL_GEMM_TEMPLATE.render(gemm_template=DUAL_GEMM_TEMPLATE.render())
     return template
diff --git a/python/aitemplate/utils/serialization/ait_program.py b/python/aitemplate/utils/serialization/ait_program.py
new file mode 100644
index 000000000..4e4f62376
--- /dev/null
+++ b/python/aitemplate/utils/serialization/ait_program.py
@@ -0,0 +1,90 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+import torch
+
+from aitemplate.compiler.base import (
+    _HostConstantTensorData,
+    _NumpyConstantTensorData,
+    _TorchConstantTensorData,
+)
+from aitemplate.frontend import IntVar, Tensor
+
+
+def convert_to_ait_const(const):
+    if isinstance(const, bytes):
+        return _HostConstantTensorData(const)
+    elif isinstance(const, torch.Tensor):
+        return _TorchConstantTensorData(const)
+    elif isinstance(const, np.ndarray):
+        return _NumpyConstantTensorData(const)
+    else:
+        raise RuntimeError(f"Unknown type ({type(const)}) to convert to AIT Tensor")
+
+
+class AITBasicProgram:
+    def __init__(self):
+        """
+        Initialize all inputs and constants parameters.
+        """
+        pass
+
+    def get_constants(self) -> Dict[str, List[int]]:
+        """
+        Returns a dictionary of the constants.
+        The returned dictionary has key as constant name and value as input shape.
+        """
+        pass
+
+    def get_inputs(self) -> Dict[str, List[IntVar]]:
+        """
+        Returns a dictionary of the expected inputs.
+        The returned dictionary has key as input name and value as input shape.
+        """
+        pass
+
+    def set_constants(self, constants: Dict[str, Any]):
+        """
+        Provide a dictionary to set the corresponding constant values.
+        The constant value could be bytes/torch.Tensor/numpy.ndarray.
+        """
+        for k, v in constants.items():
+            getattr(self, k)._bind_data(convert_to_ait_const(v))
+
+    def set_default_constants(self):
+        """
+        This function is called to set up default constants
+        (ex. constant folded/constants set up by zero padding etc.).
+        """
+        self.set_all_random_constants()
+
+    def set_all_random_constants(self):
+        """
+        This function would set all constants into random value.
+        """
+        const_infos = self.get_constants()
+        for k, v in const_infos.items():
+            getattr(self, k)._bind_data(
+                _NumpyConstantTensorData(np.random.randn(*v).astype("float16"))
+            )
+
+    def model(self) -> Union[Tensor, Tuple[Tensor]]:
+        """
+        This function defines the AIT program.
+        Returns a output tensor, or a tuple of output tensors.
+        """
+        pass
diff --git a/python/aitemplate/utils/serialization/serdes_code.py b/python/aitemplate/utils/serialization/serdes_code.py
new file mode 100644
index 000000000..1f7c98b44
--- /dev/null
+++ b/python/aitemplate/utils/serialization/serdes_code.py
@@ -0,0 +1,393 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Dump/Read sorted_graph to/from python code.
+"""
+import os
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import jinja2
+
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
+
+from aitemplate.compiler.transform import mark_param_tensor, name_graph, toposort
+
+PROGRAM_TEMPLATE = jinja2.Template(
+    """import numpy as np
+
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import IntImm, IntVar, _HostConstantTensorData, _NumpyConstantTensorData
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+
+from aitemplate.utils.serialization.ait_program import AITBasicProgram
+
+class AITProgram(AITBasicProgram):
+{{indent}}def __init__(self):
+{{indent}}{{indent}}# Inputs of model
+{% for input in inputs -%}
+{{indent}}{{indent}}{{input}}
+{% endfor %}
+{{indent}}{{indent}}# End of inputs
+{{indent}}{{indent}}# Constants of model
+{% for const in consts -%}
+{{indent}}{{indent}}{{const}}
+{% endfor %}
+{{indent}}{{indent}}# End of Constants
+{{indent}}{{indent}}self.set_default_constants()
+{{indent}}{{indent}}return
+
+
+{{indent}}def get_constants(self):
+{{indent}}{{indent}}ret = {}
+{% for k, v in consts_info.items() -%}
+{{indent}}{{indent}}ret["{{k}}"] = {{v}}
+{% endfor %}
+{{indent}}{{indent}}return ret
+
+
+{{indent}}def get_inputs(self):
+{{indent}}{{indent}}ret = {}
+{% for k, v in inputs_info.items() -%}
+{{indent}}{{indent}}ret["{{k}}"] = {{v}}
+{% endfor %}
+{{indent}}{{indent}}return ret
+
+
+{{indent}}def set_default_constants(self):
+{{indent}}{{indent}}super().set_default_constants()
+{% for const_val in default_const_vals -%}
+{{indent}}{{indent}}{{const_val}}
+{% endfor %}
+{{indent}}{{indent}}# End of set_default_constants
+{{indent}}{{indent}}return
+
+
+{{indent}}def model(self):
+{% for op in ops -%}
+{{indent}}{{indent}}{{op}}
+{% endfor %}
+{{indent}}{{indent}}# Set outputs
+{% for output in outputs -%}
+{{indent}}{{indent}}{{output}}._attrs["name"] = "{{output}}"
+{{indent}}{{indent}}{{output}}._attrs["is_output"] = True
+{% endfor %}
+{{indent}}{{indent}}# End of setting outputs
+{{indent}}{{indent}}return {{", ".join(outputs)}}
+"""
+)
+
+OPS_TEMPLATE = jinja2.Template(
+    "{{op_name}} = ops.{{op_type}}({{op_attrs}})({{op_inputs}})"
+)
+PARAMS_TEMPLATE = jinja2.Template(
+    'self.{{input_name}} = Tensor(shape={{tensor_shape}}, name="{{input_name}}", is_input={{is_input}})'
+)
+DEFAULT_CONST_VAL_TEMPLATE = jinja2.Template(
+    "self.{{const_name}}._bind_data(_HostConstantTensorData({{bytes_data}}, '{{dtype}}'))"
+)
+
+
+def _shape_to_str(shapes: List[Union[IntVar, Tensor]], intimm_to_int=False):
+    shape_str = "["
+    for idx, shape in enumerate(shapes):
+        if idx != 0:
+            shape_str += ", "
+        if isinstance(shape, IntImm):
+            if intimm_to_int:
+                shape_str += f"{shape.value()}"
+            else:
+                shape_str += f"IntImm({shape.value()})"
+        elif isinstance(shape, IntVar):
+            shape_str += (
+                f"IntVar({shape._attrs['values']}, name='{shape._attrs['name']}')"
+            )
+        elif isinstance(shape, Tensor):
+            raise RuntimeError("IntVarTensor not supported yet")
+    shape_str += "]"
+
+    return shape_str
+
+
+def _retrieve_op_info(op: Operator, params_set) -> Tuple[List, Dict]:
+    op_inputs = list(op._attrs["inputs"])
+    op_attrs = op._get_op_attributes()
+
+    if op._attrs["op"] == "elementwise":
+        # Elementwise might have constants as inputs.
+        args = op._attrs["args"]
+        tmp_inputs = []
+        for arg in args:
+            if not arg.is_a_const_num():
+                tmp_inputs.append(arg)
+            else:
+                tmp_inputs.append(str(arg._attrs["value"]))
+        op_inputs = tmp_inputs
+    elif op._attrs["op"] == "layernorm":
+        # normalized_shape in _attrs are Optional[List[IntImm]], we serialize them here.
+        default_normalized_shape = op._attrs["default_normalized_shape"]
+        normalized_shape = op._attrs["normalized_shape"]
+        if default_normalized_shape == normalized_shape:
+            op_attrs["normalized_shape"] = default_normalized_shape
+        else:
+            op_inputs = op_inputs[:3]
+
+            norm_shapes_input = []
+            curr_idx = 3
+            for s in normalized_shape:
+                if isinstance(s, IntImm):
+                    norm_shapes_input.append(f"IntImm({s.value()})")
+                else:
+                    if isinstance(op_inputs[curr_idx], IntVarTensor):
+                        input_name = op_inputs[curr_idx]._attrs["name"]
+                        if input_name in params_set:
+                            input_name = "self." + input_name
+                        norm_shapes_input.append(input_name)
+                    elif isinstance(op_inputs[curr_idx], IntVar):
+                        norm_shapes_input.append(
+                            f'IntVar(values={s._attrs["values"]}, name="{s._attrs["name"]}")'
+                        )
+                    curr_idx += 1
+
+            op_inputs.append(f'[{", ".join(norm_shapes_input)}]')
+            op_inputs.append(str(op._attrs["eps"]))
+    elif op._attrs["op"] == "split":
+        # split has size and dim provided as inputs.
+        op_inputs.append(str(op._attrs["split_sizes"]))
+        op_inputs.append(str(op._attrs["split_dim"]))
+    elif op._attrs["op"].startswith("concatenate"):
+        # concatenate takes list as input
+        tmp_inputs = []
+        for input_ in op_inputs:
+            input_name = input_._attrs["name"]
+            if input_name in params_set:
+                input_name = "self." + input_name
+            tmp_inputs.append(input_name)
+        op_inputs = [
+            f'[{", ".join(tmp_inputs)}]',
+            str(op._attrs["concat_dim"]),
+        ]
+    elif op._attrs["op"] == "reshape":
+        # reshape take shape as inputs
+        op_inputs = op_inputs[:1]
+        shape_str = _shape_to_str(op._attrs["shape"], intimm_to_int=True)
+
+        op_inputs.append(shape_str)
+    elif op._attrs["op"].startswith("group_gemm_rcr"):
+        # group_gemm takes bundled X,W,(B) as inputs.
+        diff = 2
+        if op._attrs["op"].startswith("group_gemm_rcr_bias"):
+            diff = 3
+        inputs_str = "["
+        for i in range(0, len(op_inputs), diff):
+            if i != 0:
+                inputs_str += ", "
+            inputs_str += "["
+            input_group = op_inputs[i : i + diff]
+            input_group_names = []
+            for input_ in input_group:
+                input_name = input_._attrs["name"]
+                if input_name in params_set:
+                    input_name = "self." + input_name
+                input_group_names.append(input_name)
+            inputs_str += ", ".join(input_group_names)
+            inputs_str += "]"
+        inputs_str += "]"
+        op_inputs = [inputs_str]
+    elif op._attrs["op"] == "dynamic_slice":
+        # dynamic slice provides start/end indices as inputs
+        op_inputs.append(str(op._attrs["start_indices"]))
+        op_inputs.append(str(op._attrs["end_indices"]))
+
+    return op_inputs, op_attrs
+
+
+def convert_to_default_const_val_str(tensor: Tensor) -> str:
+    const_name = tensor._attrs["name"]
+    assert const_name is not None, "const name cannot be none."
+
+    return DEFAULT_CONST_VAL_TEMPLATE.render(
+        const_name=const_name,
+        bytes_data=tensor._attrs["data"].to_bytes(),
+        dtype=tensor._attrs["data"].dtype,
+    )
+
+
+def convert_to_param_str(tensor: Tensor) -> str:
+    input_name = tensor._attrs["name"]
+    assert input_name is not None, "input name cannot be none."
+
+    return PARAMS_TEMPLATE.render(
+        input_name=input_name,
+        tensor_shape=_shape_to_str(tensor.shape()),
+        is_input=tensor._attrs["is_input"],
+    )
+
+
+def convert_to_info_str(shapes: List[Union[IntImm, IntVar]], is_constant=False) -> str:
+    info_str_shapes = []
+    for shape in shapes:
+        if is_constant:
+            if not isinstance(shape, IntImm):
+                raise RuntimeError(
+                    f"Constant got type {type(shape)} can't have non-IntImm input!"
+                )
+            info_str_shapes.append(str(shape.value()))
+        elif isinstance(shape, IntImm):
+            info_str_shapes.append(
+                f'IntImm(value={shape.value()}, name="{shape._attrs["name"]}")'
+            )
+        else:
+            info_str_shapes.append(
+                f'IntVar(values={shape._attrs["values"]}, name="{shape._attrs["name"]}")'
+            )
+    return f"[{', '.join(info_str_shapes)}]"
+
+
+def convert_to_op_str(op: Operator, params_set) -> str:
+    op_inputs, op_attrs = _retrieve_op_info(op, params_set)
+
+    serialized_op_inputs = []
+    for input_ in op_inputs:
+        if isinstance(input_, Tensor):
+            input_name = input_._attrs["name"]
+            if input_name in params_set:
+                input_name = "self." + input_name
+            serialized_op_inputs.append(input_name)
+        else:
+            # If done being processed as string
+            serialized_op_inputs.append(input_)
+
+    return OPS_TEMPLATE.render(
+        op_name=", ".join([o._attrs["name"] for o in op._attrs["outputs"]]),
+        op_type=op._attrs["op"],
+        op_attrs=", ".join([f"{k}={v}" for k, v in op_attrs.items()]),
+        op_inputs=", ".join(serialized_op_inputs),
+    )
+
+
+def dump_program(
+    sorted_graph: Union[Tensor, List[Tensor]],
+    file_path: str,
+    indent: str = "    ",
+    random_constants: bool = False,
+):
+    """This function dumps out an AIT sorted graph to an executable python code.
+
+    Parameters
+    ----------
+    sorted_graph : Union[Tensor, List[Tensor]]
+        Final tensor(s) that are associated to the AIT graph.
+    file_path: str
+        Location for the python file to be dumped.
+    indent: str, optional
+        The indentation to be used in python code, default is 4 spaces.
+    random_constants: bool, optional
+        Assign random values for constants, default is False.
+    """
+    if isinstance(sorted_graph, Tensor):
+        sorted_graph = [sorted_graph]
+
+    # Make sure the graph is in correct order and has names and param set correctly.
+    sorted_graph = toposort(sorted_graph)
+    mark_param_tensor(sorted_graph)
+    name_graph(sorted_graph)
+
+    params_set = set()
+    inputs_str = []
+    consts_str = []
+    default_const_vals = []
+    op_str = []
+    inputs_info = {}
+    consts_info = {}
+    outputs = []
+    visited_ops = set()
+    for tensor in sorted_graph:
+        if tensor._attrs["is_input"]:
+            inputs_str.append(convert_to_param_str(tensor))
+            inputs_info[tensor._attrs["name"]] = convert_to_info_str(tensor.shape())
+            params_set.add(tensor._attrs["name"])
+            continue
+        if tensor._attrs["is_param"]:
+            # This is the case that the tensor is some constant.
+            consts_str.append(convert_to_param_str(tensor))
+            consts_info[tensor._attrs["name"]] = convert_to_info_str(
+                tensor.shape(), is_constant=True
+            )
+            if tensor._attrs["data"] is not None and not random_constants:
+                default_const_vals.append(convert_to_default_const_val_str(tensor))
+            params_set.add(tensor._attrs["name"])
+            continue
+
+        if tensor._attrs["is_output"]:
+            outputs.append(tensor._attrs["name"])
+
+        src_ops = tensor.src_ops()
+        for src_op in src_ops:
+            if src_op in visited_ops:
+                continue
+            visited_ops.add(src_op)
+            op_str.append(convert_to_op_str(src_op, params_set))
+
+    program = PROGRAM_TEMPLATE.render(
+        indent=indent,
+        inputs=inputs_str,
+        inputs_info=inputs_info,
+        consts_info=consts_info,
+        consts=consts_str,
+        default_const_vals=default_const_vals,
+        ops=op_str,
+        outputs=outputs,
+    )
+
+    if file_path != "":
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, "w") as f:
+            f.write(program)
+
+    return program
+
+
+def _get_class(file_path: str, class_name: str = "AITProgram"):
+    import importlib.util
+
+    spec = importlib.util.spec_from_file_location("AITProgram", file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    return getattr(module, class_name)()
+
+
+def get_inputs_from_graph(file_path: str):
+    program = _get_class(file_path)
+    return program.get_inputs()
+
+
+def get_program(file_path: str) -> Tuple[Tuple[Tensor], Union[Tensor, List[Tensor]]]:
+    program = _get_class(file_path)
+
+    outputs = program.model()
+    sorted_graph = toposort(outputs)
+
+    return outputs, sorted_graph
+
+
+def strip_hardcoded_constants(file_path: str, new_file: Optional[str] = None) -> None:
+    program = _get_class(file_path)
+    outputs = program.model()
+    if new_file:
+        file_path = new_file
+    dump_program(outputs, file_path, random_constants=True)
diff --git a/python/aitemplate/utils/torch_utils.py b/python/aitemplate/utils/torch_utils.py
index b3d41efb2..266b3a279 100644
--- a/python/aitemplate/utils/torch_utils.py
+++ b/python/aitemplate/utils/torch_utils.py
@@ -30,6 +30,7 @@ def torch_dtype_to_string(dtype):
         torch.float32: "float32",
         torch.int32: "int32",
         torch.int64: "int64",
+        torch.bool: "bool",
     }
     if dtype not in dtype_to_str:
         raise ValueError(
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 228eb435c..4ae0d5983 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -15,9 +15,10 @@
 """
 Graph visualization tool for AITemplate
 """
-
 import json
+import os
 
+from aitemplate import compiler
 from aitemplate.utils.visualization import op_attr_factory, pydot
 from aitemplate.utils.visualization.web_template import (
     INDEX_TEMPLATE,
@@ -89,22 +90,30 @@ def _gen_op_modal(op) -> str:
     return modal_src
 
 
-def plot_graph(sorted_graph, file_path: str, network_name: str = "") -> None:
-    """Plot a sorted graph into an interactive HTML page.
-
-    The sorted graph must be named.
-    The HTML can be opened in Chrome directly.
+def plot_graph(tensors, file_path: str) -> None:
+    """
+    Plot AIT graph.
 
     Parameters
     ----------
-    sorted_graph : List[Tensor]
-        output of sorted graph / other optimization pass.
+    tensors : Union[Tensor, List[Tensor]]
+        An output Tensor, or a list of output Tensors of AIT graph.
     file_path : str
-        output HTML path
-    network_name : str, optional
-        the name of network, will appear in navigation bar, by default ""
+        Output file path, currently we support the following extension:
+            - html
+            - format supported by graphviz
     """
     dot_graph = pydot.Dot(graph_type="digraph")
+    _, ext = os.path.splitext(file_path)
+    if ext == "":
+        raise ValueError("Please provide a file extension in path to plot.")
+
+    ext = ext[1:]
+    if ext != "html" and ext not in dot_graph.formats:
+        raise ValueError(f"Unsupported extension '{ext}' to plot!")
+
+    sorted_graph = compiler.transform.toposort(tensors)
+    compiler.transform.name_graph(sorted_graph)
 
     op_set = {}
     tensor_set = {}
@@ -114,10 +123,6 @@ def plot_graph(sorted_graph, file_path: str, network_name: str = "") -> None:
     for tensor in sorted_graph:
         tensor_node = None
         tensor_name = tensor._attrs["name"]
-        if tensor_name is None:
-            raise RuntimeError(
-                "Input sorted_graph must be named. Try to run name_graph pass on it."
-            )
         if tensor in tensor_set:
             tensor_node = tensor_set[tensor]
         else:
@@ -143,10 +148,6 @@ def plot_graph(sorted_graph, file_path: str, network_name: str = "") -> None:
         for src_op in tensor.src_ops():
             op_node = None
             op_name = src_op._attrs["name"]
-            if op_name is None:
-                raise RuntimeError(
-                    "Input sorted_graph must be named. Try to run name_graph pass on it."
-                )
             if src_op in op_set:
                 op_node = op_set[src_op]
             else:
@@ -166,10 +167,6 @@ def plot_graph(sorted_graph, file_path: str, network_name: str = "") -> None:
         for dst_op in tensor.dst_ops():
             op_node = None
             op_name = dst_op._attrs["name"]
-            if op_name is None:
-                raise RuntimeError(
-                    "Input sorted_graph must be named. Try to run name_graph pass on it."
-                )
             if dst_op in op_set:
                 op_node = op_set[dst_op]
             else:
@@ -186,17 +183,24 @@ def plot_graph(sorted_graph, file_path: str, network_name: str = "") -> None:
                 # add modal
                 modal_set.append(_gen_op_modal(dst_op))
             dot_graph.add_edge(pydot.Edge(tensor_node, op_node))
-    dot_src = dot_graph.to_string()
-    modal_src = "\n".join(modal_set)
-    items_src = [f'"{item}"' for item in items]
-    popover_src = json.dumps(popover_data)
-    index = INDEX_TEMPLATE.render(
-        dot_src=dot_src,
-        modals=modal_src,
-        network_name=network_name,
-        items=items_src,
-        popover_data=popover_src,
-    )
 
-    with open(file_path, "w") as fo:
-        fo.write(index)
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    if ext == "html":
+        basename = os.path.splitext(os.path.basename(file_path))[0]
+        dot_src = dot_graph.to_string()
+        modal_src = "\n".join(modal_set)
+        items_src = [f'"{item}"' for item in items]
+        popover_src = json.dumps(popover_data)
+        index = INDEX_TEMPLATE.render(
+            dot_src=dot_src,
+            modals=modal_src,
+            network_name=basename,
+            items=items_src,
+            popover_data=popover_src,
+        )
+
+        with open(file_path, "w") as fo:
+            fo.write(index)
+    else:
+        dot_graph.write(file_path, format=ext)
diff --git a/python/setup.py b/python/setup.py
index d44f3d6cf..df01212e3 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -12,6 +12,22 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 # pylint: disable=invalid-name, exec-used
 
 import os
@@ -39,18 +55,9 @@ def is_pure(self):
 
 
 # temp copy 3rdparty libs to build dir
-try:
-    shutil.copytree("../3rdparty", "./aitemplate/3rdparty")
-except FileExistsError:
-    pass
-try:
-    shutil.copytree("../static", "./aitemplate/static")
-except FileExistsError:
-    pass
-try:
-    shutil.copytree("../licenses", "./aitemplate/licenses")
-except FileExistsError:
-    pass
+shutil.copytree("../3rdparty", "./aitemplate/3rdparty")
+shutil.copytree("../static", "./aitemplate/static")
+shutil.copytree("../licenses", "./aitemplate/licenses")
 
 
 def gen_file_list(srcs, f_cond):
@@ -149,6 +156,7 @@ def gen_license_file_list():
             "backend/cuda/elementwise/custom_math.cuh",
             "backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh",
             "backend/cuda/groupnorm/groupnorm_kernel.cuh",
+            "backend/cuda/groupnorm/layer_norm.cuh",
             "backend/cuda/softmax/softmax.cuh",
             "backend/cuda/vision_ops/nms/batched_nms_kernel.cuh",
             "backend/cuda/vision_ops/nms/nms_kernel.cuh",
diff --git a/static/csrc/debug_utility.cpp b/static/csrc/debug_utility.cpp
new file mode 100644
index 000000000..3ebaf1879
--- /dev/null
+++ b/static/csrc/debug_utility.cpp
@@ -0,0 +1,80 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#include "debug_utility.h"
+
+namespace {
+
+__global__ void inf_and_nan_checker(const half* tensor, int64_t elem_cnt) {
+  int64_t nan_num = 0, pos_inf = 0, neg_inf = 0;
+  for (int64_t i = 0; i < elem_cnt; i++) {
+    float v = (float)(*(tensor + i));
+    if (isnan(v)) {
+      nan_num += 1;
+    }
+    auto is_inf = isinf(v);
+    if (is_inf) {
+      if (v > 0) {
+        pos_inf += 1;
+      } else {
+        neg_inf += 1;
+      }
+    }
+  }
+  if (nan_num > 0 || pos_inf > 0 || neg_inf > 0) {
+    printf(
+        "contains NaN: %ld, +INF: %ld, -INF: %ld, total elements: %ld\n",
+        nan_num,
+        pos_inf,
+        neg_inf,
+        elem_cnt);
+  } else {
+    printf("doesn't contain NaN/INF\n");
+  }
+}
+
+__global__ void outputs_checker(const half* tensor, int64_t elem_cnt) {
+  for (int64_t i = 0; i < elem_cnt; i++) {
+    float v = (float)(*(tensor + i));
+    if (i != 0) {
+      printf(", ");
+    }
+    printf("%f", v);
+  }
+  printf("\n");
+}
+} // namespace
+
+namespace ait {
+void InvokeInfAndNanChecker(
+    const half* tensor,
+    const char* tensor_name,
+    int64_t elem_cnt,
+    ait::StreamType stream) {
+  printf("Tensor (%s) ", tensor_name);
+  inf_and_nan_checker<<<1, 1, 0, stream>>>(tensor, elem_cnt);
+  ait::StreamSynchronize(stream);
+}
+
+void InvokeOutputsChecker(
+    const half* tensor,
+    const char* tensor_name,
+    int64_t elem_cnt,
+    ait::StreamType stream) {
+  printf("Tensor (%s) output:\n", tensor_name);
+  outputs_checker<<<1, 1, 0, stream>>>(tensor, elem_cnt);
+  ait::StreamSynchronize(stream);
+}
+} // namespace ait
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index cb3961214..3bfaa338e 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -26,12 +26,15 @@ ModelContainer::ModelContainer(
     size_t num_inputs,
     size_t num_outputs,
     size_t num_unbound_constants,
-    size_t params_size)
+    size_t params_size,
+    AITemplateAllocator& allocator)
     : ModelContainerBase(
           num_inputs,
           num_outputs,
           num_unbound_constants,
-          params_size),
+          params_size,
+          allocator),
+      allocator_(allocator),
       num_inputs_(num_inputs),
       num_outputs_(num_outputs) {
   if (num_models == 0) {
@@ -47,7 +50,8 @@ ModelContainer::ModelContainer(
         num_inputs,
         num_outputs,
         num_unbound_constants,
-        static_cast<uint8_t*>(constants_.get()));
+        static_cast<uint8_t*>(constants_.get()),
+        allocator);
     available_models_.push_back(&models_.back());
   }
 }
@@ -102,7 +106,8 @@ void ModelContainer::RunWithOutputsOnHost(
   owned_outputs.reserve(num_outputs);
   for (size_t i = 0; i < num_outputs; ++i) {
     size_t num_bytes = MaxOutputStorageBytes(i);
-    owned_outputs_ptrs.emplace_back(RAII_DeviceMalloc(num_bytes), num_bytes);
+    owned_outputs_ptrs.emplace_back(
+        RAII_DeviceMalloc(num_bytes, allocator_), num_bytes);
     owned_outputs.emplace_back(
         owned_outputs_ptrs.back().first.get(),
         outputs[i].shape,
@@ -178,7 +183,8 @@ float ModelContainer::Benchmark(
 
     for (size_t j = 0; j < num_outputs; ++j) {
       size_t num_bytes = MaxOutputStorageBytes(j);
-      cloned_outputs_ptrs.emplace_back(RAII_DeviceMalloc(num_bytes));
+      cloned_outputs_ptrs.emplace_back(
+          RAII_DeviceMalloc(num_bytes, allocator_));
       auto* new_pointer = cloned_outputs_ptrs.back().get();
       DEVICE_CHECK(
           DeviceToDeviceCopy(new_pointer, outputs[j].ptr, num_bytes, stream));
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index 91529f7d1..f980e1644 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -40,20 +40,57 @@
     return AITemplateError::AITemplateFailure;             \
   }
 
+namespace ait {
+namespace {
+class DefaultAllocator : public AITemplateAllocator {
+ public:
+  void* Allocate(size_t n_bytes) override {
+    void* result;
+    DEVICE_CHECK(DeviceMalloc(&result, n_bytes));
+    return result;
+  }
+
+  void Free(void* ptr) override {
+    DEVICE_CHECK(FreeDeviceMemory(ptr));
+  }
+};
+
+class TrackingAllocator : public DefaultAllocator {
+ public:
+  void* Allocate(size_t n_bytes) override {
+    auto* result = DefaultAllocator::Allocate(n_bytes);
+    num_bytes_ += n_bytes;
+    return result;
+  }
+
+  size_t NumBytesAllocated() const {
+    return num_bytes_;
+  }
+
+ private:
+  size_t num_bytes_ = 0;
+};
+
+DefaultAllocator default_allocator;
+} // namespace
+} // namespace ait
+
 extern "C" {
 
 AITemplateError AITemplateModelContainerCreate(
     AITemplateModelHandle* ret,
-    size_t num_runtimes) {
+    size_t num_runtimes,
+    AITemplateAllocator* allocator) {
   if (num_runtimes == 0) {
     LOG(ERROR) << "num_runtimes must be positive, but got 0";
     return AITemplateError::AITemplateFailure;
   }
   RETURN_ERROR_IF_NULL(ret)
+  AITemplateAllocator& allocator_ref =
+      allocator == nullptr ? ait::default_allocator : *allocator;
   CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto* m = ait::CreateModelContainer(num_runtimes);
+    auto* m = ait::CreateModelContainer(num_runtimes, allocator_ref);
     *ret = reinterpret_cast<AITemplateModelHandle>(m);
-    return AITemplateError::AITemplateSuccess;
   })
 }
 
@@ -62,7 +99,6 @@ AITemplateError AITemplateModelContainerDelete(AITemplateModelHandle handle) {
   CONVERT_EXCEPTION_TO_ERROR_CODE({
     auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
     delete m;
-    return AITemplateError::AITemplateSuccess;
   });
 }
 
@@ -226,4 +262,43 @@ AITemplateError AITemplateModelContainerGetNumRuntimes(
   auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
   CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_runtimes_out = m->GetNumRuntimes(); })
 }
+
+AITemplateError AITemplateAllocatorCreate(
+    AITemplateAllocator** allocator_out,
+    AITemplateAllocatorType allocator_type) {
+  RETURN_ERROR_IF_NULL(allocator_out);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    switch (allocator_type) {
+      case AITemplateAllocatorType::kDefault:
+        *allocator_out = new ait::DefaultAllocator();
+        break;
+      case AITemplateAllocatorType::kTracking:
+        *allocator_out = new ait::TrackingAllocator();
+        break;
+      default:
+        throw std::runtime_error("Unrecognized allocator type");
+    }
+  });
+}
+
+AITemplateError AITemplateAllocatorDelete(AITemplateAllocator* allocator) {
+  RETURN_ERROR_IF_NULL(allocator);
+  delete allocator;
+  return AITemplateError::AITemplateSuccess;
+}
+
+AITemplateError AITemplateTrackingAllocatorGetNumBytes(
+    AITemplateAllocator* allocator,
+    size_t* num_bytes_out) {
+  RETURN_ERROR_IF_NULL(allocator);
+  RETURN_ERROR_IF_NULL(num_bytes_out);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto* tracking_allocator = dynamic_cast<ait::TrackingAllocator*>(allocator);
+    if (tracking_allocator == nullptr) {
+      throw std::runtime_error("Allocator was not a tracking allocator!");
+    }
+    *num_bytes_out = tracking_allocator->NumBytesAllocated();
+  });
+}
+
 } // extern "C"
diff --git a/static/include/debug_utility.h b/static/include/debug_utility.h
new file mode 100644
index 000000000..102bce838
--- /dev/null
+++ b/static/include/debug_utility.h
@@ -0,0 +1,30 @@
+//  Copyright (c) Meta Platform, Inc. and its affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#pragma once
+#include "device_functions-generated.h"
+
+namespace ait {
+void InvokeInfAndNanChecker(
+    const half* tensor,
+    const char* tensor_name,
+    int64_t elem_cnt,
+    ait::StreamType stream);
+
+void InvokeOutputsChecker(
+    const half* tensor,
+    const char* tensor_name,
+    int64_t elem_cnt,
+    ait::StreamType stream);
+} // namespace ait
diff --git a/static/include/model_container.h b/static/include/model_container.h
index a18702489..72ea5d6f8 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "model-generated.h"
+#include "model_interface.h"
 #include "raii_wrapper.h"
 
 #include <condition_variable>
@@ -36,7 +37,8 @@ class ModelContainerBase {
       size_t num_inputs,
       size_t num_outputs,
       size_t num_unbound_constants,
-      size_t params_size);
+      size_t params_size,
+      AITemplateAllocator& allocator);
 
  protected:
   // The set of unbound constants/weights/parameters. These are constants which
@@ -66,7 +68,9 @@ class ModelContainerBase {
 // codegened (the parameters passed to the ctor are determined
 // at compilation time)
 class ModelContainer;
-ModelContainer* CreateModelContainer(size_t num_runtimes);
+ModelContainer* CreateModelContainer(
+    size_t num_runtimes,
+    AITemplateAllocator& allocator);
 
 // Each ModelContainer contains num_models Models. Inference runs
 // can be started by invoking Run() with lists of pre-allocated
@@ -93,6 +97,13 @@ ModelContainer* CreateModelContainer(size_t num_runtimes);
 //
 // Note that if there are no models available for inference, Run() will block
 // until one becomes available.
+//
+// ModelContainer optionally takes an allocator argument, which it will use to
+// allocate the space for the buffers used for intermediate tensors and
+// constants. If it is nullptr, the default allocator will be used (e.g. just
+// {cuda/hip}{Malloc/Free}).
+// Important: we assume that the allocator lives until the ModelContainer is
+// destroyed. The default allocator has a static lifetime.
 class ModelContainer : ModelContainerBase {
  public:
   ModelContainer(
@@ -102,7 +113,8 @@ class ModelContainer : ModelContainerBase {
       size_t num_inputs,
       size_t num_outputs,
       size_t num_unbound_constants,
-      size_t params_size);
+      size_t params_size,
+      AITemplateAllocator& allocator);
 
   void Run(
       const AITData* inputs,
@@ -173,6 +185,8 @@ class ModelContainer : ModelContainerBase {
       size_t count,
       int64_t** output_shapes_out);
 
+  AITemplateAllocator& allocator_;
+
   std::vector<Model> models_;
   std::vector<Model*> available_models_;
   std::deque<Model*> pending_models_;
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 6d6200afc..2ec362535 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -63,6 +63,7 @@ enum class AITemplateDtype {
   kFloat,
   kInt,
   kLong,
+  kBool,
 };
 
 struct AITData {
@@ -89,6 +90,8 @@ inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
       return 4;
     case AITemplateDtype::kLong:
       return 8;
+    case AITemplateDtype::kBool:
+      return 1;
     case AITemplateDtype::kUnset:
       throw std::runtime_error("Unset dtype has no size!");
   }
@@ -97,10 +100,37 @@ inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
 struct AITemplateStreamOpaque {};
 using AITemplateStreamHandle = AITemplateStreamOpaque*;
 
+// Allocator to use for GPU mallocs and frees. Allocations will only happen
+// when the ModelContainer is created.
+class AITemplateAllocator {
+ public:
+  virtual void* Allocate(size_t nbytes) = 0;
+  virtual void Free(void* ptr) = 0;
+
+  virtual ~AITemplateAllocator() = default;
+};
+
+// Some custom allocators are provided. They can be created by passing
+// an enum into the AITemplateAllocatorCreate() function.
+enum class AITemplateAllocatorType {
+  // The default allocator just uses the backend's default malloc/free.
+  kDefault = 0,
+  // The tracking allocator is like the default allocator, but it keeps
+  // track of how many bytes it has allocated. Mainly used for testing.
+  kTracking,
+};
+
 extern "C" {
 
-AIT_EXPORT AITemplateError
-AITemplateModelContainerCreate(AITemplateModelHandle* ret, size_t num_runtimes);
+// Create a ModelContainer. See model_container.h for all the details.
+// Some important high-level notes:
+// * If allocator is null, a default allocator is used (forwards to
+//   {cuda/hip}{Malloc/Free}).
+// * We assume that the allocator lives at least as long as the ModelContainer.
+AIT_EXPORT AITemplateError AITemplateModelContainerCreate(
+    AITemplateModelHandle* ret,
+    size_t num_runtimes,
+    AITemplateAllocator* allocator = nullptr);
 
 AIT_EXPORT AITemplateError
 AITemplateModelContainerDelete(AITemplateModelHandle handle);
@@ -181,4 +211,16 @@ AIT_EXPORT AITemplateError AITemplateModelContainerGetNumRuntimes(
     AITemplateModelHandle handle,
     size_t* num_runtimes_out);
 
+AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
+    AITemplateAllocator** allocator_out,
+    AITemplateAllocatorType allocator_type);
+
+AIT_EXPORT AITemplateError
+AITemplateAllocatorDelete(AITemplateAllocator* allocator_out);
+
+// Get the number of bytes allocated; mainly used for testing.
+AIT_EXPORT AITemplateError AITemplateTrackingAllocatorGetNumBytes(
+    AITemplateAllocator* allocator,
+    size_t* num_bytes_out);
+
 } // extern "C"
diff --git a/static/include/raii_wrapper.h b/static/include/raii_wrapper.h
index 440205afe..24270d8b9 100644
--- a/static/include/raii_wrapper.h
+++ b/static/include/raii_wrapper.h
@@ -14,6 +14,7 @@
 //
 #pragma once
 // Some helpful unique_ptr instantiations and factory functions for CUDA types
+#include <functional>
 #include <memory>
 #include <type_traits>
 
@@ -24,7 +25,7 @@ namespace ait {
 
 // RAII wrapper for owned GPU memory. Not that the underlying calls
 // to malloc/free are synchronous for simplicity.
-using GPUPtr = std::unique_ptr<void, decltype(&FreeDeviceMemory)>;
+using GPUPtr = std::unique_ptr<void, std::function<void(void*)>>;
 
 using StreamPtr = std::
     unique_ptr<std::remove_pointer<StreamType>::type, decltype(&StreamDestroy)>;
@@ -32,10 +33,16 @@ using StreamPtr = std::
 using EventPtr = std::
     unique_ptr<std::remove_pointer<EventType>::type, decltype(&DestroyEvent)>;
 
-inline GPUPtr RAII_DeviceMalloc(size_t num_bytes) {
-  void* output;
-  DEVICE_CHECK(DeviceMalloc(&output, num_bytes));
-  return GPUPtr(output, FreeDeviceMemory);
+using GraphPtr = std::unique_ptr<
+    std::remove_pointer<GraphType>::type,
+    std::function<void(GraphType)>>;
+
+inline GPUPtr RAII_DeviceMalloc(
+    size_t num_bytes,
+    AITemplateAllocator& allocator) {
+  auto* output = allocator.Allocate(num_bytes);
+  auto deleter = [&allocator](void* ptr) mutable { allocator.Free(ptr); };
+  return GPUPtr(output, deleter);
 }
 
 inline StreamPtr RAII_StreamCreate(bool non_blocking = false) {
@@ -50,4 +57,16 @@ inline EventPtr RAII_CreateEvent() {
   return EventPtr(event, DestroyEvent);
 }
 
+inline GraphPtr RAII_EndCaptureAndCreateGraph(
+    const std::function<DeviceError(GraphType*)>& end_capture_fn) {
+  GraphType graph;
+  // If this throws, we shouldn't leak memory. cudaGraphEndCapture is guaranteed
+  // to return the NULL graph if ending the stream capture doesn't work.
+  // We pass a custom function here instead of calling StreamEndCapture
+  // directly so classes can manipulate state if the stream capture fails
+  // (e.g. disabling graph mode might be useful in that case).
+  DEVICE_CHECK(end_capture_fn(&graph))
+  return GraphPtr(graph, GraphDestroy);
+}
+
 } // namespace ait
diff --git a/tests/unittest/backend/test_cuda_graph.py b/tests/unittest/backend/test_cuda_graph.py
new file mode 100644
index 000000000..70d57f3f3
--- /dev/null
+++ b/tests/unittest/backend/test_cuda_graph.py
@@ -0,0 +1,79 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import unittest
+
+import numpy as np
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm, IntVar
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class CUDAGraphTestCase(unittest.TestCase):
+    def test_cuda_graph_multiple_runs(self):
+        logger.info("testing cuda graph with multiple runs")
+        X0_batch_dim = IntVar([1, 65], name="batch_size")
+        X0_non_batch_shape = [1, 772]
+        X0_non_batch_dims = [IntImm(d) for d in X0_non_batch_shape]
+        X0_tensor_shape = [X0_batch_dim] + X0_non_batch_dims
+        X0 = Tensor(shape=X0_tensor_shape, name="X0", is_input=True)
+        X1_shape = [2, 772]
+        X1 = Tensor(shape=X1_shape, name="X1", is_input=True)
+
+        reduction_dim = 1
+        Y0 = ops.elementwise(func_enum=FuncEnum.ADD)(X0, X1)
+        Y = ops.reduce_sum(dim=reduction_dim)(Y0)
+
+        # Set outputs
+        Y._attrs["name"] = "Trueoutput_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        test_name = "cuda_graph_multiple_runes"
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        run = 2
+        repeat = 1
+        for b_size in [1, 65]:
+            logger.info(f"batch size = {b_size}")
+            X0_shape = [b_size] + X0_non_batch_shape
+            x0_pt = torch.randn(*X0_shape).cuda().half()
+            x1_pt = torch.randn(*X1_shape).cuda().half()
+            y0_pt = x0_pt + x1_pt
+            y_pt = torch.sum(y0_pt, dim=reduction_dim)
+
+            y = torch.empty(y_pt.size()).cuda().half()
+            inputs = {"X0": x0_pt, "X1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=run,
+                repeat=repeat,
+                graph_mode=True,
+            )
+            y_pt = y_pt.cpu().numpy()
+            np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index f907959de..d7e1fb3ef 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 import contextlib
+import ctypes
 import itertools
 import unittest
 from typing import Callable, Optional, Tuple
@@ -32,6 +33,7 @@
 )
 from aitemplate.compiler.model import (
     AITData,
+    AITemplateAllocatorKind,
     AITemplateMemcpyKind,
     Model,
     torch_to_ait_data,
@@ -1210,7 +1212,7 @@ def test_run_fails_with_unbound_constants(self):
         module.run_with_tensors([], [output_data])
 
         expected = const_1_pt * const_1_pt * const_2_pt
-        torch.testing.assert_allclose(output_data, expected)
+        self.assertTrue(torch.allclose(output_data, expected))
 
     def test_set_constant_fails_wrong_dtype(self):
         constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
@@ -1403,6 +1405,46 @@ def test_numpy_to_ait_data_manual_free(self):
             module.free_gpu_memory(x_ait.data_ptr)
             # Make sure we don't double-free when we exit.
 
+    def test_custom_allocator(self):
+        x = Tensor([1], dtype="float16", is_input=True)
+        y = x * x
+        z = y * y
+        z._attrs["is_output"] = True
+        for allocator_kind in (
+            AITemplateAllocatorKind.DEFAULT,
+            AITemplateAllocatorKind.TRACKING,
+        ):
+            with compile_model(
+                z,
+                detect_target(),
+                "./tmp",
+                f"test_custom_allocator_{allocator_kind.value}",
+                allocator_kind=AITemplateAllocatorKind.TRACKING,
+            ) as module:
+                allocator = module.DLL.allocator_handle
+                self.assertIsNotNone(allocator.value)
+
+                if allocator_kind == AITemplateAllocatorKind.TRACKING:
+                    num_bytes = ctypes.c_size_t()
+                    module.DLL.AITemplateTrackingAllocatorGetNumBytes(
+                        allocator, ctypes.byref(num_bytes)
+                    )
+                    self.assertGreater(num_bytes.value, 0)
+
+                x_pt = (
+                    torch.randn(
+                        1,
+                    )
+                    .half()
+                    .cuda()
+                )
+                y_pt = x_pt * x_pt
+                z_pt = y_pt * y_pt
+
+                z_ait = torch.empty_like(x_pt)
+                module.run_with_tensors([x_pt], [z_ait])
+                self.assertTrue(z_ait.equal(z_pt))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/backend/test_profiler.py b/tests/unittest/backend/test_profiler.py
new file mode 100644
index 000000000..7276493f9
--- /dev/null
+++ b/tests/unittest/backend/test_profiler.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+from random import randrange
+
+from aitemplate.backend import profiler_runner
+
+profiler_runner.extract_profile_result = lambda _: (
+    "",
+    False,
+)
+
+from time import sleep
+
+from aitemplate.backend.cuda.target_def import CUDA as CUDATarget
+
+from aitemplate.backend.profiler_runner import ProfilerRunner
+
+
+def dice():
+    return randrange(1, 10) / 4
+
+
+class Delegate:
+    def __init__(self, test_instance):
+        self._test_instance = test_instance
+        self.results = [None] * 42
+
+    def add_result(self, idx, val):
+        sleep(dice())
+        self.results[idx] = val
+
+    def postprocess_results(self):
+        for i, val in enumerate(self.results):
+            self._test_instance.assertTrue(val is not None, f"Result {i} not filled in")
+
+
+def delegate_cb_wrapper(idx, value):
+    def wrapped(result, delegate):
+        return delegate.add_result(idx, value)
+
+    return wrapped
+
+
+class ProfilerTestCase(unittest.TestCase):
+    def test_profiler_runner(self):
+        with CUDATarget() as _:
+            pr = ProfilerRunner(
+                devices=[str(i) for i in range(12)],
+                timeout=60,
+                postprocessing_delegate=Delegate(test_instance=self),
+            )
+
+            for i, _ in enumerate(pr._postprocessing_delegate.results):
+                sleep_for = 0
+                pr.push(
+                    cmds=["sleep", f"{sleep_for}"],
+                    process_result_callback=delegate_cb_wrapper(i, sleep_for),
+                )
+
+            pr.join()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/benchmark/test_gemm_benchmark.py b/tests/unittest/benchmark/test_gemm_benchmark.py
new file mode 100644
index 000000000..fee506494
--- /dev/null
+++ b/tests/unittest/benchmark/test_gemm_benchmark.py
@@ -0,0 +1,321 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import itertools
+import json
+import unittest
+import uuid
+
+import torch
+
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.compiler import compile_model
+
+from aitemplate.testing import detect_target
+from aitemplate.testing.benchmark_ait import make_input_output_pools, run_benchmark
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.testing.benchmark_trt import make_trt_module
+from aitemplate.utils import logger, shape_utils
+
+NK_SHAPES = ((8314, 3072), (6912, 8314))
+INPUT_POOL_SIZE = 20
+BATCH_SIZES = (
+    1,
+    2048,
+)
+
+
+class GemmRCRModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, a, b):
+        return torch.nn.functional.linear(a, b)
+
+
+class GemmRCRFunction:
+    def __init__(self, inputs_pool):
+        self._it_pool = 0
+        self._as = [t["a"] for t in inputs_pool]
+        self._bs = [t["b"] for t in inputs_pool]
+        self._inputs_pool_size = len(inputs_pool)
+        self._module = GemmRCRModule()
+
+    def next_input(self):
+        self._it_pool += 1
+        self._it_pool %= self._inputs_pool_size
+        return self._as[self._it_pool], self._bs[self._it_pool]
+
+    def __call__(self):
+        return self._module(*self.next_input())
+
+
+class GemmRCRTRTFunction(GemmRCRFunction):
+    def __init__(self, inputs_pool, max_batch_size):
+        super().__init__(inputs_pool)
+        a, b = self.next_input()
+        self._module = make_trt_module(
+            self._module, a, b, max_batch_size=max_batch_size
+        )
+        self._module(a, b)
+
+
+def build_ait_module_gemm_rcr(*, ms, n, k, split_k, test_name):
+    target = detect_target(use_fp16_acc=True)
+    input_params = {
+        "dtype": "float16",
+        "is_input": True,
+    }
+    a = Tensor(shape=[shape_utils.gen_int_var_min_max(ms), k], name="a", **input_params)
+    b = Tensor(shape=[n, k], name="b", **input_params)
+    bias = Tensor(shape=[n], name="bias", **input_params)
+    OP = ops.gemm_rcr_bias()
+    OP._attrs["split_k_hints"] = (split_k,)
+    output = OP(a, b, bias)
+    output._attrs["name"] = "output"
+    output._attrs["is_output"] = True
+    return compile_model(output, target, "./tmp", test_name=test_name)
+
+
+def eval_pt_gemm_rcr(*, m, n, k):
+    input_params = {
+        "dtype": torch.float16,
+        "device": "cuda",
+    }
+    a = torch.rand(m, k, **input_params)
+    b = torch.rand(n, k, **input_params)
+    bias = torch.rand(n, **input_params)
+    output = torch.nn.functional.linear(a, b, bias).to(torch.float16)
+    return {"a": a, "b": b, "bias": bias, "output": output}
+
+
+class BmmRRRModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, a, b):
+        return torch.bmm(a, b)
+
+
+class BmmRRRFunction:
+    def __init__(self, inputs_pool):
+        self._it_pool = 0
+        self._as = [t["batch_a"] for t in inputs_pool]
+        self._bs = [t["batch_b"] for t in inputs_pool]
+        self._inputs_pool_size = len(inputs_pool)
+        self._module = BmmRRRModule()
+
+    def next_input(self):
+        self._it_pool += 1
+        self._it_pool %= self._inputs_pool_size
+        return self._as[self._it_pool], self._bs[self._it_pool]
+
+    def __call__(self):
+        return self._module(*self.next_input())
+
+
+class BmmRRRTRTFunction(BmmRRRFunction):
+    def __init__(self, inputs_pool, max_batch_size):
+        super().__init__(inputs_pool)
+        batch_as, batch_bs = self.next_input()
+        self._module = make_trt_module(
+            self._module, batch_as, batch_bs, max_batch_size=max_batch_size
+        )
+        self._module(batch_as, batch_bs)
+
+
+def build_ait_module_bmm_rrr(*, bs, m, n, k, split_k, test_name):
+    target = detect_target(use_fp16_acc=True)
+    input_params = {
+        "dtype": "float16",
+        "is_input": True,
+    }
+    batch_dim = shape_utils.gen_int_var_min_max(bs, "batch_dim")
+    batch_a = Tensor(
+        shape=[batch_dim, m, k],
+        name="batch_a",
+        **input_params,
+    )
+    batch_b = Tensor(
+        shape=[batch_dim, k, n],
+        name="batch_b",
+        **input_params,
+    )
+    OP = ops.bmm_rrr()
+    OP._attrs["split_k_hints"] = (split_k,)
+    output = OP(batch_a, batch_b)
+    output._attrs["name"] = "output"
+    output._attrs["is_output"] = True
+    return compile_model(output, target, "./tmp", test_name=test_name)
+
+
+def eval_pt_bmm_rrr(*, b, m, n, k):
+    input_params = {
+        "dtype": torch.float16,
+        "device": "cuda",
+    }
+    batch_a = torch.rand(b, m, k, **input_params)
+    batch_b = torch.rand(b, k, n, **input_params)
+    output = torch.bmm(batch_a, batch_b).to(torch.float16)
+    return {
+        "batch_a": batch_a,
+        "batch_b": batch_b,
+        "output": output,
+    }
+
+
+class TestGemmRCRBenchmark(unittest.TestCase):
+    @unittest.skipIf(
+        detect_target(use_fp16_acc=True).in_ci_env(), "don't run benchmark in CI"
+    )
+    def test_benchmark(self):
+        split_ks = sorted(set(range(1, 6)).union([2**i for i in range(5)]))
+        for split_k, (n, k) in itertools.product(split_ks, NK_SHAPES):
+            NUM_ITERS = 100000
+            NUM_WARMUP_ITERS = 1000
+            ait_module = build_ait_module_gemm_rcr(
+                ms=BATCH_SIZES,
+                n=n,
+                k=k,
+                split_k=split_k,
+                test_name=f"gemm_rcr_{split_k=}_{uuid.uuid4().hex}",
+            )
+            for m in BATCH_SIZES:
+                mnk = {"m": m, "n": n, "k": k}
+                logger.warning(__name__, f"mnk={mnk}, split_k={split_k}")
+                inputs_pool, outputs_pool = make_input_output_pools(
+                    pool_size=INPUT_POOL_SIZE,
+                    eval_pt_func=lambda: eval_pt_gemm_rcr(**mnk),
+                    input_filter_func=lambda name, _: not name.startswith("output"),
+                    output_filter_func=lambda name, _: name.startswith("output"),
+                )
+                gemm_rcr_function = GemmRCRFunction(inputs_pool)
+                gemm_rcr_trt_function = GemmRCRTRTFunction(
+                    inputs_pool, max_batch_size=m
+                )
+
+                pt_outputs = eval_pt_gemm_rcr(**mnk)
+                ait_outputs = {"output": torch.empty_like(pt_outputs["output"])}
+                ait_module.run_with_tensors(
+                    {k: v for k, v in pt_outputs.items() if k != "output"},
+                    ait_outputs,
+                )
+                torch.testing.assert_close(
+                    ait_outputs["output"], pt_outputs["output"], rtol=1, atol=1
+                )
+                mean_runtime_ait = run_benchmark(
+                    ait_module=ait_module,
+                    inputs_pool=inputs_pool,
+                    outputs_pool=outputs_pool,
+                    num_iters=NUM_ITERS,
+                    num_warmup_iters=NUM_WARMUP_ITERS,
+                )
+
+                mean_runtime_pt = benchmark_torch_function(
+                    iters=NUM_ITERS, function=gemm_rcr_function
+                )
+
+                mean_runtime_trt = benchmark_torch_function(
+                    iters=NUM_ITERS, function=gemm_rcr_trt_function
+                )
+
+                benchmark_results = {
+                    "function": "gemm_rcr_bias",
+                    "mean_runtime_ait_ms": round(mean_runtime_ait, 5),
+                    "mean_runtime_pt_ms": round(mean_runtime_pt, 5),
+                    "mean_runtime_trt_ms": round(mean_runtime_trt, 5),
+                    "split_k": split_k,
+                    **mnk,
+                }
+                logger.warning(
+                    __name__,
+                    f"Benchmark results {json.dumps(benchmark_results, separators=(',', ':'))}",
+                )
+
+
+class TestBmmRRRBenchmark(unittest.TestCase):
+    @unittest.skipIf(
+        detect_target(use_fp16_acc=True).in_ci_env(), "don't run benchmark in CI"
+    )
+    def test_benchmark(self):
+        INPUT_POOL_SIZE = 3
+        MNK_SHAPES = ((1469, 16, 128),)
+        split_ks = sorted(set(range(1, 6)).union([2**i for i in range(5)]))
+        for split_k, (m, n, k) in itertools.product(split_ks, MNK_SHAPES):
+            NUM_ITERS = 100000
+            NUM_WARMUP_ITERS = 1000
+            ait_module = build_ait_module_bmm_rrr(
+                bs=BATCH_SIZES,
+                m=m,
+                n=n,
+                k=k,
+                split_k=split_k,
+                test_name=f"bmm_rrr_{split_k=}_{uuid.uuid4().hex}",
+            )
+            for b in BATCH_SIZES:
+                bmnk = {"b": b, "m": m, "n": n, "k": k}
+                logger.warning(__name__, f"bmnk={bmnk}, split_k={split_k}")
+                inputs_pool, outputs_pool = make_input_output_pools(
+                    pool_size=INPUT_POOL_SIZE,
+                    eval_pt_func=lambda: eval_pt_bmm_rrr(**bmnk),
+                    input_filter_func=lambda name, _: not name.startswith("output"),
+                    output_filter_func=lambda name, _: name.startswith("output"),
+                )
+
+                bmm_rrr_function = BmmRRRFunction(inputs_pool)
+                bmm_rrr_trt_function = BmmRRRTRTFunction(inputs_pool, max_batch_size=b)
+
+                pt_outputs = eval_pt_bmm_rrr(**bmnk)
+                ait_outputs = {"output": torch.empty_like(pt_outputs["output"])}
+                ait_module.run_with_tensors(
+                    {k: v for k, v in pt_outputs.items() if k != "output"},
+                    ait_outputs,
+                )
+                torch.testing.assert_close(
+                    ait_outputs["output"], pt_outputs["output"], rtol=1, atol=1
+                )
+
+                mean_runtime_ait = run_benchmark(
+                    ait_module=ait_module,
+                    inputs_pool=inputs_pool,
+                    outputs_pool=outputs_pool,
+                    num_iters=NUM_ITERS,
+                    num_warmup_iters=NUM_WARMUP_ITERS,
+                )
+
+                mean_runtime_pt = benchmark_torch_function(
+                    iters=NUM_ITERS, function=bmm_rrr_function
+                )
+
+                mean_runtime_trt = benchmark_torch_function(
+                    iters=NUM_ITERS, function=bmm_rrr_trt_function
+                )
+
+                benchmark_results = {
+                    "function": "bmm_rrr",
+                    "mean_runtime_ait_ms": round(mean_runtime_ait, 5),
+                    "mean_runtime_pt_ms": round(mean_runtime_pt, 5),
+                    "mean_runtime_trt_ms": round(mean_runtime_trt, 5),
+                    "split_k": split_k,
+                    **bmnk,
+                }
+                logger.warning(
+                    __name__,
+                    f"Benchmark results {json.dumps(benchmark_results, separators=(',', ':'))}",
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index 9823ef02d..6d1b22ae9 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -99,7 +99,7 @@ def test_pad_constant_weight(self):
         y = torch.empty((M, N)).cuda().half()
         mod.run_with_tensors({"input_0": input_0_pt}, {"y": y})
 
-        self.assertTrue(torch.equal(y, y_pt))
+        torch.testing.assert_close(y, y_pt, atol=1e-1, rtol=1e-1)
 
         # The apply_padding graph pass will add padding to both the input and the
         # weight in this case with concatenate(). The concatenate for the weight
@@ -143,7 +143,7 @@ def test_fold_long_chain(self):
         z = torch.empty((M, N)).cuda().half()
         mod.run_with_tensors({}, {"z": z})
 
-        self.assertTrue(torch.equal(z, z_pt))
+        torch.testing.assert_close(z, z_pt, atol=1e-1, rtol=1e-1)
 
         # The entire graph is turned into a constant.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
@@ -213,7 +213,7 @@ def test_late_binding(self):
         z = torch.empty((M, N)).cuda().half()
         mod.run_with_tensors({}, {"z": z})
 
-        self.assertTrue(torch.equal(z, z_pt))
+        torch.testing.assert_close(z, z_pt, atol=1e-1, rtol=1e-1)
 
         # The entire graph is turned into a constant.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index b118db813..983a35605 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -738,6 +738,12 @@ def _test_gemm_rcr_bias_activation(
         elif activation == "tanh":
             ait_func = FuncEnum.TANH
             pt_func = torch.tanh
+        elif activation == "gelu":
+            ait_func = FuncEnum.GELU
+            pt_func = torch.nn.functional.gelu
+        elif activation == "fast_gelu":
+            ait_func = FuncEnum.FASTGELU
+            pt_func = torch.nn.functional.gelu
         else:
             raise AssertionError("Activation not supported")
 
@@ -1006,6 +1012,26 @@ def test_gemm_rcr_bias_tanh(self):
             "gemm_rcr_bias_tanh_need_align",
         )
 
+    def test_gemm_rcr_bias_gelu(self):
+        self._test_gemm_rcr_bias_activation(
+            [8, 32],
+            16,
+            8,
+            "gelu",
+            "gemm_rcr_bias_gelu",
+            True,
+            "gemm_rcr_bias_gelu_basic_decomposed",
+        )
+        self._test_gemm_rcr_bias_activation(
+            [8, 32],
+            16,
+            8,
+            "fast_gelu",
+            "gemm_rcr_bias_fast_gelu",
+            True,
+            "gemm_rcr_bias_fast_gelu_basic_decomposed",
+        )
+
 
 class FuseGemmRcrBiasSwishCase(unittest.TestCase):
     def _test_gemm_rcr_bias_swish(self, Ms, N, K, testname, use_add=False):
diff --git a/tests/unittest/compiler/test_fuse_mm_reshape_permute.py b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
new file mode 100644
index 000000000..edea2d2d8
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
@@ -0,0 +1,125 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import has_op
+from aitemplate.utils import graph_utils, shape_utils
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class GEMMReshapePermuteTestCase(unittest.TestCase):
+    def _test_rcr_0213(
+        self,
+        ms,
+        k,
+        n,
+        shape,
+        test_name,
+        has_bias=False,
+        layout="0213",
+        should_fuse=True,
+    ):
+        target = detect_target()
+        X = Tensor(
+            shape=[shape_utils.gen_int_var_min_max(ms), k],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
+        # B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
+        t1, t2 = shape
+
+        if has_bias:
+            return
+        else:
+            m_d1 = [m // t1 for m in ms]
+            Y0 = ops.gemm_rcr()(X, W)
+            Y1 = ops.reshape()(
+                Y0, [shape_utils.gen_int_var_min_max(m_d1), t1, t2, n // t2]
+            )
+            Y = ops.permute()(Y1, [0, 2, 1, 3])
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
+
+        if should_fuse:
+            sorted_ops = graph_utils.get_sorted_ops(module.debug_sorted_graph)
+            assert has_op(sorted_ops, "gemm_rcr_permute")
+        else:
+            return
+
+        for m in ms:
+            X_pt = torch.randn(m, k).cuda().half()
+            W_pt = torch.randn(n, k).cuda().half()
+            B_pt = torch.randn(n).cuda().half()
+
+            def torch_f(x, w, b, has_bias, shape):
+                if has_bias:
+                    Y_l = torch.nn.functional.linear(x, w, b)
+                else:
+                    Y_l = torch.nn.functional.linear(x, w)
+                t1, t2 = shape
+                Y_r = Y_l.reshape(m // t1, t1, t2, n // t2)
+                Y_pt = torch.permute(Y_r, [0, 2, 1, 3])
+                Y_out = Y_pt.reshape([m // t1, t2, -1])
+                return Y_pt, Y_out
+
+            Y_pt, _ = torch_f(X_pt, W_pt, B_pt, has_bias, shape)
+
+            inputs = {"input_0": X_pt, "input_1": W_pt}
+            if has_bias:
+                inputs["input_2"] = B_pt
+            y = torch.empty(Y_pt.shape).cuda().half()
+            module.run_with_tensors(inputs, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+            # module.benchmark_with_tensors(inputs, [y], count=1000)
+            # from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            # t = benchmark_torch_function(
+            #     1000, torch_f, X_pt, W_pt, B_pt, has_bias, shape
+            # )
+            # print(f"pt: {t} ms/iter")
+
+    def test_rcr_0213(self):
+        self._test_rcr_0213(
+            [54],
+            256,
+            4000000,
+            [54, 1000000],
+            "permute_0213_1",
+            has_bias=False,
+            layout="0213",
+        )
+        self._test_rcr_0213(
+            [29, 29 * 8],
+            256,
+            300000,
+            [29, 100000],
+            "permute_0213_2",
+            has_bias=False,
+            layout="0213",
+            should_fuse=False,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_fuse_permute_gemm.py b/tests/unittest/compiler/test_fuse_permute_gemm.py
new file mode 100644
index 000000000..4d2541bc8
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_permute_gemm.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.testing import detect_target, test_utils
+
+
+class FusePermuteGemmTestCase(unittest.TestCase):
+    def test_no_fusion_odd_alignment(self):
+        x = Tensor([32, 51], is_input=True)
+        w = Tensor([32, 51], is_input=True)
+        y = ops.permute()(x, dims=[1, 0])
+        z = ops.gemm_rrr()(w, y)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        module = compile_model(
+            z, detect_target(), "./tmp", "test_no_fusion_odd_alignment"
+        )
+        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
+        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
+
+    def test_gemm_rrr_to_rcr(self):
+        x = Tensor([32, 52], is_input=True, name="x")
+        w = Tensor([32, 52], is_input=True, name="w")
+        y = ops.permute()(x, dims=[1, 0])
+        z = ops.gemm_rrr()(w, y)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        module = compile_model(z, detect_target(), "./tmp", "test_gemm_rrr_to_rcr")
+        self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
+        self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
+        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rcr"))
+
+        x_pt = torch.randn(32, 52).half().cuda()
+        w_pt = torch.randn(32, 52).half().cuda()
+        y_pt = x_pt.t()
+        z_pt = torch.matmul(w_pt, y_pt)
+        z_ait = torch.empty_like(z_pt)
+        module.run_with_tensors({"x": x_pt, "w": w_pt}, {"z": z_ait})
+
+        torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
+
+    def test_gemm_rcr_to_rrr(self):
+        x = Tensor([52, 32], is_input=True, name="x")
+        w = Tensor([32, 52], is_input=True, name="w")
+        y = ops.permute()(x, dims=[1, 0])
+        z = ops.gemm_rcr()(w, y)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        module = compile_model(
+            z,
+            detect_target(),
+            "./tmp",
+            "test_gemm_rcr_to_rrr",
+        )
+        self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
+        self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rcr"))
+        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
+
+        x_pt = torch.randn(52, 32).half().cuda()
+        w_pt = torch.randn(32, 52).half().cuda()
+        z_pt = torch.matmul(w_pt, x_pt)
+        z_ait = torch.empty_like(z_pt)
+        module.run_with_tensors({"x": x_pt, "w": w_pt}, {"z": z_ait})
+
+        torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 437f27105..73cd581ce 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -99,6 +99,92 @@ def test_fused_elementwise_direct_input_dependency(self):
         module.run_with_tensors(inputs, [r2])
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
 
+    def test_fused_elementwise_non_elementwise_ops(self):
+        r"""
+                X0   X1 (3)
+                 \   /
+                  Add_1 (R0)   X2
+                   |    \      /
+                   |      Add_2 (R1, is_output)
+                  / \      /
+        (R3) reshape   Sub_1 (R2)
+               |
+              Add_3 (R4)
+
+
+            Add_1, Add_2, and Sub_1 should be fused together.
+        """
+
+        M = 10
+        N = 4
+        X0 = Tensor(
+            shape=[M, N],
+            dtype="float16",
+            name="X0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[],
+            dtype="float16",
+            name="X1",
+            value=3.0,
+        )
+        X2 = Tensor(
+            shape=[M, N],
+            dtype="float16",
+            name="X2",
+            is_input=True,
+        )
+
+        R0 = ops.elementwise(FuncEnum.ADD)(X0, X1)  # Add_1
+        R1 = ops.elementwise(FuncEnum.ADD)(R0, X2)  # Add_2
+        R2 = ops.elementwise(FuncEnum.SUB)(R0, R1)
+        R3 = ops.reshape()(R0, [-1])
+        R4 = ops.elementwise(FuncEnum.ADD)(R3, R3)  # Add3
+        R1._attrs["name"] = "R1"
+        R1._attrs["is_output"] = True
+        R2._attrs["name"] = "R2"
+        R2._attrs["is_output"] = True
+        R4._attrs["name"] = "R4"
+        R4._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(
+            [R1, R2, R4],
+            target,
+            "./tmp",
+            "test_fused_elementwise_non_elementwise_ops",
+        )
+        debug_sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
+        self.assertEqual(len(sorted_ops), 4)
+
+        x0_pt = torch.rand(M, N).cuda().half()
+        x2_pt = torch.rand(M, N).cuda().half()
+
+        r0_pt = x0_pt + 3
+        r1_pt = r0_pt + x2_pt
+        r2_pt = r0_pt - r1_pt
+        r3_pt = r0_pt.reshape([-1])
+        r4_pt = r3_pt + r3_pt
+
+        r1 = torch.empty(r1_pt.shape).cuda().half()
+        r2 = torch.empty([M, N]).cuda().half()
+        r4 = torch.empty(r4_pt.shape).cuda().half()
+
+        input_name_to_idx_mapping = module.get_input_name_to_index_map()
+        inputs = [None] * len(input_name_to_idx_mapping)
+        input_name_to_pt_mapping = {
+            "X0": x0_pt,
+            "X2": x2_pt,
+        }
+        for input_name, pt in input_name_to_pt_mapping.items():
+            inputs[input_name_to_idx_mapping[input_name]] = pt
+        module.run_with_tensors(inputs, {"R1": r1, "R2": r2, "R4": r4})
+        self.assertTrue(torch.allclose(r1, r1_pt, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
+
     def test_fused_elementwise_indirect_input_dependency(self):
         r"""
             X0   X1
diff --git a/tests/unittest/compiler/test_group_fusions.py b/tests/unittest/compiler/test_group_fusions.py
index 732f74631..a4b78bd48 100644
--- a/tests/unittest/compiler/test_group_fusions.py
+++ b/tests/unittest/compiler/test_group_fusions.py
@@ -268,6 +268,11 @@ def test_group_layernorm_sigmoid_mul_fusion(self):
             fuse_sigmoid_mul=True,
             num_group_ops=2,
         )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 64]] * 50,
+            fuse_sigmoid_mul=True,
+            num_group_ops=2,
+        )
 
         # ctr_mbl_feed overarch cases
         self._test_group_layernorm_sigmoid_mul_cat_fusion(
diff --git a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
index 878a61355..d4991cde2 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
@@ -165,6 +165,51 @@ def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k):
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    @parameterized.expand(
+        [
+            param("static_M", [23], 7, 3),
+            param("dynamic_M", [1, 78, 99], 7, 3),
+        ]
+    )
+    def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(self, test_name, ms, n, k):
+        # S1 is fed to gemm twice
+        m_dim = shape_utils.gen_int_var_min_max(ms, "M")
+
+        X1 = Tensor(shape=[m_dim, k], dtype="float16", name="x1", is_input=True)
+        W1 = Tensor(shape=[n, k], dtype="float16", name="w1", is_input=True)
+        B1 = Tensor(shape=[n], dtype="float16", name="b1", is_input=True)
+        S1 = Tensor(shape=[m_dim, n], dtype="float16", name="s1", is_input=True)
+
+        X2 = ops.gemm_rcr_bias_mul_add()(X1, W1, B1, S1, S1)
+        Y = ops.elementwise(FuncEnum.ADD)(X2, X2)
+
+        Y._attrs["name"] = "y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(
+            [Y], target, "./tmp", f"pad_gemm_with_elementwise_2_{test_name}"
+        )
+
+        for m in ms:
+            X1_pt = torch.randn(m, k).cuda().half()
+            W1_pt = torch.randn(n, k).cuda().half()
+            B1_pt = torch.randn(n).cuda().half()
+            S1_pt = torch.randn(m, n).cuda().half()
+
+            X2_pt = torch.nn.functional.linear(X1_pt, W1_pt, B1_pt) * S1_pt + S1_pt
+            Y_pt = X2_pt + X2_pt
+
+            inputs = [0] * 4
+            name_to_idx = module.get_input_name_to_index_map()
+            inputs[name_to_idx["x1"]] = X1_pt
+            inputs[name_to_idx["w1"]] = W1_pt
+            inputs[name_to_idx["b1"]] = B1_pt
+            inputs[name_to_idx["s1"]] = S1_pt
+            y = torch.empty(Y_pt.size()).cuda().half()
+            module.run_with_tensors(inputs, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_split_large_concat.py b/tests/unittest/compiler/test_split_large_concat.py
new file mode 100644
index 000000000..10639a38b
--- /dev/null
+++ b/tests/unittest/compiler/test_split_large_concat.py
@@ -0,0 +1,462 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import random
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import graph_utils
+
+logger = logging.getLogger(__name__)
+
+
+class SplitLargeConcatTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SplitLargeConcatTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _make_tensors(self, num_inputs, input_shape, input_names=None):
+        if input_names is not None:
+            assert num_inputs == len(input_names)
+        input_tensors = []
+        input_type = "float16"
+        for i in range(num_inputs):
+            name = input_names[i] if input_names is not None else f"input_{i}"
+            t = Tensor(
+                shape=input_shape,
+                dtype=input_type,
+                name=name,
+                is_input=True,
+            )
+            input_tensors.append(t)
+        return input_tensors
+
+    def _test_split_large_concat_simple(
+        self, cat_dim, num_inputs, input_shape, split_count, test_name
+    ):
+        # a simple test: a concat takes num_inputs and the output of the concat
+        # is a model output
+        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        input_tensors = self._make_tensors(num_inputs, input_shape)
+        concat_op = ops.concatenate()
+        Y = concat_op(input_tensors, cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        # Verify the generated graph.
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), num_inputs + 1)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), split_count)
+
+        inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(num_inputs)]
+        y_pt = torch.cat(inputs_pt, cat_dim)
+
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_simple(self):
+        self._test_split_large_concat_simple(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            split_count=4,
+            test_name="split_large_concat_simple",
+        )
+        self._test_split_large_concat_simple(
+            cat_dim=1,
+            num_inputs=34,
+            input_shape=(2, 3),
+            split_count=1,
+            test_name="split_large_concat_simple",
+        )
+        self._test_split_large_concat_simple(
+            cat_dim=1,
+            num_inputs=35,
+            input_shape=(2, 3),
+            split_count=2,
+            test_name="split_large_concat_simple",
+        )
+
+    def _test_split_large_concat_with_add(
+        self, cat_dim, num_inputs, input_shape, test_name
+    ):
+        # make a model like below:
+        # y1 = concat(x1,x2...)
+        # y = add(y1, x_n) where x_n is not used by concat
+        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        input_tensors = self._make_tensors(num_inputs, input_shape)
+        concat_op = ops.concatenate()
+        Y1 = concat_op(input_tensors, cat_dim)
+        x_n_shape = [1]
+        X_ns = self._make_tensors(1, x_n_shape, ["input_x_n"])
+        X_n = X_ns[0]
+        Y = ops.elementwise(FuncEnum.ADD)(Y1, X_n)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(num_inputs)]
+        x_n_pt = torch.randn(x_n_shape).cuda().half()
+        y1_pt = torch.cat(inputs_pt, cat_dim)
+        inputs_pt.append(x_n_pt)
+        y_pt = y1_pt + x_n_pt
+
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors + [X_n]]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_with_add(self):
+        self._test_split_large_concat_with_add(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3, 4),
+            test_name="split_large_concat_with_add",
+        )
+
+    def _test_split_large_concat_with_strided_add(
+        self, cat_dim, num_inputs, input_shape, test_name
+    ):
+        # make a model like below:
+        # y1 = add(x1, x2)
+        # y2 = concat(y1, x3, ...)
+        # y = add(y1, x_n) where x_n is not used by concat
+        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        add_input_tensors = self._make_tensors(
+            2, input_shape, ["add_input_0", "add_input_1"]
+        )
+        Y1 = ops.elementwise(FuncEnum.ADD)(add_input_tensors[0], add_input_tensors[1])
+        concat_input_tensors = self._make_tensors(num_inputs, input_shape)
+        concat_op = ops.concatenate()
+        Y2 = concat_op([Y1] + concat_input_tensors, cat_dim)
+        x_n_shape = [1]
+        X_ns = self._make_tensors(1, x_n_shape, ["input_x_n"])
+        X_n = X_ns[0]
+        Y = ops.elementwise(FuncEnum.ADD)(Y2, X_n)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        add_inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(2)]
+        y1_pt = add_inputs_pt[0] + add_inputs_pt[1]
+        concat_inputs_pt = [
+            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+        ]
+        x_n_pt = torch.randn(x_n_shape).cuda().half()
+        y2_pt = torch.cat([y1_pt] + concat_inputs_pt, cat_dim)
+        y_pt = y2_pt + x_n_pt
+
+        input_tensors = add_input_tensors + concat_input_tensors + [X_n]
+        inputs_pt = add_inputs_pt + concat_inputs_pt + [x_n_pt]
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_with_strided_add(self):
+        self._test_split_large_concat_with_strided_add(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_strided_add",
+        )
+
+    def _test_split_large_concat_with_strided_add_complex(
+        self, cat_dim, num_inputs, input_shape, test_name
+    ):
+        # make a model like below:
+        # a1 = add(x1, x2)
+        # a2 = add(x3, x4)
+        # ...
+        # y = concat(a1, x1_1, a2, x1_2, ...)
+        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        add_input_tensor_names = [f"add_input_{i}" for i in range(num_inputs * 2)]
+        add_input_tensors = self._make_tensors(
+            num_inputs * 2, input_shape, add_input_tensor_names
+        )
+        add_output_tensors = []
+        for i in range(num_inputs):
+            a = ops.elementwise(FuncEnum.ADD)(
+                add_input_tensors[i * 2], add_input_tensors[i * 2 + 1]
+            )
+            add_output_tensors.append(a)
+        other_input_tensors = self._make_tensors(num_inputs, input_shape)
+        concat_op = ops.concatenate()
+        concat_input_tensors = []
+        for i in range(num_inputs):
+            concat_input_tensors.append(add_output_tensors[i])
+            concat_input_tensors.append(other_input_tensors[i])
+        Y = concat_op(concat_input_tensors, cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        add_inputs_pt = [
+            torch.randn(input_shape).cuda().half() for _ in range(num_inputs * 2)
+        ]
+        add_outputs_pt = []
+        for i in range(num_inputs):
+            add_outputs_pt.append(add_inputs_pt[i * 2] + add_inputs_pt[i * 2 + 1])
+        other_inputs_pt = [
+            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+        ]
+        concat_inputs_pt = []
+        for i in range(num_inputs):
+            concat_inputs_pt.append(add_outputs_pt[i])
+            concat_inputs_pt.append(other_inputs_pt[i])
+        y_pt = torch.cat(concat_inputs_pt, cat_dim)
+
+        input_tensors = add_input_tensors + other_input_tensors
+        inputs_pt = add_inputs_pt + other_inputs_pt
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_with_strided_add_complex(self):
+        self._test_split_large_concat_with_strided_add_complex(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_strided_add_complex",
+        )
+
+    def _test_split_large_concat_with_reuse(
+        self, cat_dim, num_inputs, input_shape, test_name
+    ):
+        # make a model like below:
+        # a1 = add(x1, x2)
+        # a2 = add(x3, x4)
+        # ...
+        # add_inputs = shuffle(x1,x2,x3...)
+        # other_inputs = [o1, o2...]
+        # concat_input = shuffle([a1, a2...] + add_inputs[0:10] + other_inputs)
+        # y = concat(concat_input)
+        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        add_input_tensor_names = [f"add_input_{i}" for i in range(num_inputs * 2)]
+        add_input_tensors = self._make_tensors(
+            num_inputs * 2, input_shape, add_input_tensor_names
+        )
+        add_output_tensors = []
+        for i in range(num_inputs):
+            a = ops.elementwise(FuncEnum.ADD)(
+                add_input_tensors[i * 2], add_input_tensors[i * 2 + 1]
+            )
+            add_output_tensors.append(a)
+        other_input_tensors = self._make_tensors(num_inputs, input_shape)
+        add_inputs_shuffle = list(range(len(add_input_tensors)))
+        random.shuffle(add_inputs_shuffle)
+        add_inputs_for_concat = [add_input_tensors[i] for i in add_inputs_shuffle[0:10]]
+        concat_input_tensors = (
+            add_output_tensors + other_input_tensors + add_inputs_for_concat
+        )
+        concat_inputs_shuffle = list(range(len(concat_input_tensors)))
+        random.shuffle(concat_inputs_shuffle)
+        real_concat_input_tensors = [
+            concat_input_tensors[i] for i in concat_inputs_shuffle
+        ]
+        concat_op = ops.concatenate()
+        Y = concat_op(real_concat_input_tensors, cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        add_inputs_pt = [
+            torch.randn(input_shape).cuda().half() for _ in range(num_inputs * 2)
+        ]
+        add_outputs_pt = []
+        for i in range(num_inputs):
+            add_outputs_pt.append(add_inputs_pt[i * 2] + add_inputs_pt[i * 2 + 1])
+        add_inputs_for_concat_pt = [add_inputs_pt[i] for i in add_inputs_shuffle[0:10]]
+        other_inputs_pt = [
+            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+        ]
+        concat_inputs_pt = add_outputs_pt + other_inputs_pt + add_inputs_for_concat_pt
+        real_concat_inputs_pt = [concat_inputs_pt[i] for i in concat_inputs_shuffle]
+        y_pt = torch.cat(real_concat_inputs_pt, cat_dim)
+
+        input_tensors = add_input_tensors + other_input_tensors
+        inputs_pt = add_inputs_pt + other_inputs_pt
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_with_reuse(self):
+        self._test_split_large_concat_with_reuse(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_reuse",
+        )
+
+    def _test_split_large_concat_with_slice(
+        self,
+        cat_dim,
+        num_slice_inputs,
+        slice_input_shape,
+        start_indices,
+        end_indices,
+        num_add_inputs,
+        add_input_shape,
+        test_name,
+    ):
+        # make a model like below:
+        # s1 = t1[:, 0:10]
+        # s2 = t1[:, 0:10]
+        # ...
+        # a1 = add(x1, x2)
+        # a2 = add(x3, x4)
+        # ...
+        # concat_input = [s1, s2, ...] + [a1, a2...]
+        # y = concat(concat_input)
+        slice_input_tensor_names = [f"slice_input_{i}" for i in range(num_slice_inputs)]
+        slice_input_tensors = self._make_tensors(
+            num_slice_inputs, slice_input_shape, slice_input_tensor_names
+        )
+        slice_output_tensors = []
+        for slice_input_tensor in slice_input_tensors:
+            t = ops.dynamic_slice()(
+                slice_input_tensor, start_indices=start_indices, end_indices=end_indices
+            )
+            slice_output_tensors.append(t)
+
+        add_input_tensor_names = [f"add_input_{i}" for i in range(num_add_inputs * 2)]
+        add_input_tensors = self._make_tensors(
+            num_add_inputs * 2, add_input_shape, add_input_tensor_names
+        )
+        add_output_tensors = []
+        for i in range(num_add_inputs):
+            a = ops.elementwise(FuncEnum.ADD)(
+                add_input_tensors[i * 2], add_input_tensors[i * 2 + 1]
+            )
+            add_output_tensors.append(a)
+
+        concat_input_tensors = slice_output_tensors + add_output_tensors
+        concat_op = ops.concatenate()
+        Y = concat_op(concat_input_tensors, cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        slice_inputs_pt = [
+            torch.randn(slice_input_shape).cuda().half()
+            for _ in range(num_slice_inputs)
+        ]
+        slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+        slice_outputs_pt = [inp_pt[slice_indices] for inp_pt in slice_inputs_pt]
+
+        add_inputs_pt = [
+            torch.randn(add_input_shape).cuda().half()
+            for _ in range(num_add_inputs * 2)
+        ]
+        add_outputs_pt = []
+        for i in range(num_add_inputs):
+            add_outputs_pt.append(add_inputs_pt[i * 2] + add_inputs_pt[i * 2 + 1])
+        concat_inputs_pt = slice_outputs_pt + add_outputs_pt
+        y_pt = torch.cat(concat_inputs_pt, cat_dim)
+
+        input_tensors = slice_input_tensors + add_input_tensors
+        inputs_pt = slice_inputs_pt + add_inputs_pt
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(inputs_pt))]
+        input_names = [x._attrs["name"] for x in input_tensors]
+        for x_name, x_pt in zip(input_names, inputs_pt):
+            inputs[input_name_to_index[x_name]] = x_pt
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_split_large_concat_with_slice(self):
+        self._test_split_large_concat_with_slice(
+            cat_dim=1,
+            num_slice_inputs=161,
+            slice_input_shape=(20, 20),
+            start_indices=[0, 0],
+            end_indices=[None, 10],
+            num_add_inputs=5,
+            add_input_shape=(20, 161 * 10),
+            test_name="split_large_concat_with_dynamic_slice",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/compiler/test_strided_group_gemm.py b/tests/unittest/compiler/test_strided_group_gemm.py
index 1d4cfbba3..6824984d3 100644
--- a/tests/unittest/compiler/test_strided_group_gemm.py
+++ b/tests/unittest/compiler/test_strided_group_gemm.py
@@ -20,6 +20,7 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.utils import logger
@@ -60,7 +61,7 @@ def _test_strided_group_gemm(self, M, N1, K1, N2, K2, N3, test_name):
         module = compile_model([Y], target, "./tmp", test_name)
         Y_src_ops = Y._attrs["src_ops"]
         np.testing.assert_equal(len(Y_src_ops), 2)
-        np.testing.assert_equal(Y_src_ops, {group_gemm_op, concat_op})
+        np.testing.assert_equal(Y_src_ops, StableSet({group_gemm_op, concat_op}))
         expected_inputs_group_gemm_op = [X1, W1, X2, W2]
         np.testing.assert_equal(
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
@@ -151,7 +152,7 @@ def _test_strided_group_gemm_bias(
         )
         Y_src_ops = Y._attrs["src_ops"]
         np.testing.assert_equal(len(Y_src_ops), 2)
-        np.testing.assert_equal(Y_src_ops, {group_gemm_op, concat_op})
+        np.testing.assert_equal(Y_src_ops, StableSet({group_gemm_op, concat_op}))
         expected_inputs_group_gemm_op = [X1, W1, B1, X2, W2, B2]
         np.testing.assert_equal(
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
diff --git a/tests/unittest/compiler/test_strided_op_cat_pattern.py b/tests/unittest/compiler/test_strided_op_cat_pattern.py
index d7c2fe063..38ae403d4 100644
--- a/tests/unittest/compiler/test_strided_op_cat_pattern.py
+++ b/tests/unittest/compiler/test_strided_op_cat_pattern.py
@@ -214,26 +214,76 @@ def test_elementwise_cat_1(self):
             # Do comparisons.
             self.assertTrue(torch.allclose(x6, x5_pt, atol=1e-2, rtol=1e-2))
 
-    def _fused_gemm_e2e_helper(self, m: int, k: int, n1: int, n2: int, n3: int):
+    def _fused_gemm_e2e_helper(
+        self,
+        m: int,
+        k: int,
+        n1: int,
+        n2: int,
+        n3: int,
+        m2: int = -1,
+        cat_dim: int = 1,
+        no_fuse: bool = False,
+    ):
         # Construct one graph with 3 gemms + 1 cat.
-        X1 = Tensor(
-            shape=[IntImm(m), IntImm(k)],
-            dtype="float16",
-            name="X1",
-            is_input=True,
-        )
+        nd_gemm = m2 > 0
+        if nd_gemm:
+            X1 = Tensor(
+                shape=[IntImm(m), IntImm(m2), IntImm(k)],
+                dtype="float16",
+                name="X1",
+                is_input=True,
+            )
+            X2 = Tensor(
+                shape=[IntImm(m), IntImm(m2), IntImm(k)],
+                dtype="float16",
+                name="X2",
+                is_input=True,
+            )
+            X3 = Tensor(
+                shape=[IntImm(m), IntImm(m2), IntImm(k)],
+                dtype="float16",
+                name="X3",
+                is_input=True,
+            )
+            X4 = Tensor(
+                shape=[IntImm(m), IntImm(m2), IntImm(n2)],
+                dtype="float16",
+                name="X4",
+                is_input=True,
+            )
+        else:
+            X1 = Tensor(
+                shape=[IntImm(m), IntImm(k)],
+                dtype="float16",
+                name="X1",
+                is_input=True,
+            )
+            X2 = Tensor(
+                shape=[IntImm(m), IntImm(k)],
+                dtype="float16",
+                name="X2",
+                is_input=True,
+            )
+            X3 = Tensor(
+                shape=[IntImm(m), IntImm(k)],
+                dtype="float16",
+                name="X3",
+                is_input=True,
+            )
+            X4 = Tensor(
+                shape=[IntImm(m), IntImm(n2)],
+                dtype="float16",
+                name="X4",
+                is_input=True,
+            )
+
         W1 = Tensor(
             shape=[IntImm(n1), IntImm(k)],
             dtype="float16",
             name="W1",
             is_input=True,
         )
-        X2 = Tensor(
-            shape=[IntImm(m), IntImm(k)],
-            dtype="float16",
-            name="X2",
-            is_input=True,
-        )
         W2 = Tensor(
             shape=[IntImm(n2), IntImm(k)],
             dtype="float16",
@@ -246,30 +296,18 @@ def _fused_gemm_e2e_helper(self, m: int, k: int, n1: int, n2: int, n3: int):
             name="B2",
             is_input=True,
         )
-        X3 = Tensor(
-            shape=[IntImm(m), IntImm(k)],
-            dtype="float16",
-            name="X3",
-            is_input=True,
-        )
         W3 = Tensor(
             shape=[IntImm(k), IntImm(n3)],
             dtype="float16",
             name="W3",
             is_input=True,
         )
-        X4 = Tensor(
-            shape=[IntImm(m), IntImm(n2)],
-            dtype="float16",
-            name="X4",
-            is_input=True,
-        )
 
         X5 = ops.gemm_rcr()(X1, W1)
         X6 = ops.gemm_rcr_bias()(X2, W2, B2)
         X7 = ops.gemm_rrr()(X3, W3)
         X8 = ops.gemm_rcr_bias_add_add_relu()(X2, W2, B2, X4, X4)
-        X9 = ops.concatenate()([X5, X6, X7, X8], dim=1)
+        X9 = ops.concatenate()([X5, X6, X7, X8], dim=cat_dim)
         X9._attrs["name"] = "output0"
         X9._attrs["is_output"] = True
 
@@ -282,21 +320,30 @@ def _fused_gemm_e2e_helper(self, m: int, k: int, n1: int, n2: int, n3: int):
             "fused_gemm_m_{}_k_{}_n1_{}_n2_{}_n3_{}".format(m, k, n1, n2, n3),
         ) as module:
 
-            # Verify the generated graph.
-            sorted_graph = module.debug_sorted_graph
-            self.assertEqual(len(sorted_graph), 9)
-            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-            self.assertEqual(len(sorted_ops), 4)
+            if not no_fuse:
+                # Verify the generated graph.
+                sorted_graph = module.debug_sorted_graph
+                self.assertEqual(len(sorted_graph), 9)
+                sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+                self.assertEqual(len(sorted_ops), 4)
+
+            if nd_gemm:
+                # Run PyTorch baseline.
+                x1_pt = torch.randn(m, m2, k).cuda().half()
+                x2_pt = torch.randn(m, m2, k).cuda().half()
+                x3_pt = torch.randn(m, m2, k).cuda().half()
+                x4_pt = torch.randn(m, m2, n2).cuda().half()
+            else:
+                # Run PyTorch baseline.
+                x1_pt = torch.randn(m, k).cuda().half()
+                x2_pt = torch.randn(m, k).cuda().half()
+                x3_pt = torch.randn(m, k).cuda().half()
+                x4_pt = torch.randn(m, n2).cuda().half()
 
-            # Run PyTorch baseline.
-            x1_pt = torch.randn(m, k).cuda().half()
             w1_pt = torch.randn(n1, k).cuda().half()
-            x2_pt = torch.randn(m, k).cuda().half()
             w2_pt = torch.randn(n2, k).cuda().half()
             b2_pt = torch.randn(n2).cuda().half()
-            x3_pt = torch.randn(m, k).cuda().half()
             w3_pt = torch.randn(k, n3).cuda().half()
-            x4_pt = torch.randn(m, n2).cuda().half()
 
             x5_pt = torch.nn.functional.linear(x1_pt, w1_pt)
             x6_pt = torch.nn.functional.linear(x2_pt, w2_pt, b2_pt)
@@ -305,7 +352,7 @@ def _fused_gemm_e2e_helper(self, m: int, k: int, n1: int, n2: int, n3: int):
                 torch.nn.functional.linear(x2_pt, w2_pt, b2_pt) + x4_pt + x4_pt
             )
 
-            x9_pt = torch.cat([x5_pt, x6_pt, x7_pt, x8_pt], dim=1)
+            x9_pt = torch.cat([x5_pt, x6_pt, x7_pt, x8_pt], dim=cat_dim)
 
             # Run AITemplate module.
             inputs = [0] * 8
@@ -321,7 +368,7 @@ def _fused_gemm_e2e_helper(self, m: int, k: int, n1: int, n2: int, n3: int):
 
             inputs[name_to_idx["B2"]] = b2_pt
 
-            x9 = torch.empty([m, n1 + n2 + n3 + n2]).cuda().half()
+            x9 = torch.empty(x9_pt.shape).cuda().half()
             module.run_with_tensors(inputs, [x9])
 
             # Do comparisons.
@@ -333,6 +380,11 @@ def test_gemm(self):
         self._fused_gemm_e2e_helper(m=1024, k=128, n1=16, n2=32, n3=8)
         self._fused_gemm_e2e_helper(m=1024, k=256, n1=8, n2=16, n3=32)
 
+        self._fused_gemm_e2e_helper(m=1024, k=256, n1=8, n2=16, n3=32, m2=8, cat_dim=2)
+        self._fused_gemm_e2e_helper(
+            m=1024, k=256, n1=32, n2=32, n3=32, m2=8, cat_dim=1, no_fuse=True
+        )
+
     def _fused_gemm_alignment_e2e_helper(
         self, gemm_op, input_n: int, m: int, k: int, n: int
     ):
@@ -1550,7 +1602,103 @@ def test_col_reduce_cat_fusion(self):
         module.run_with_tensors(inputs, [y])
         y_pt = y_pt.cpu().numpy()
 
-        np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
+        torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
+
+    def test_strided_op_multiple_cats(self):
+        # y1 = concat(x0, x1) # [4, 30]
+        # y2 = slice(y1) # [4, 6]
+        # y = concat(y1, y2) # [4, 36]
+        x0_shape = [4, 10]
+        x1_shape = [4, 20]
+        input_type = "float16"
+        cat_dim = 1
+        test_name = "test_strided_op_multiple_cats"
+
+        target = detect_target()
+        X0 = Tensor(shape=x0_shape, dtype=input_type, name="x0", is_input=True)
+        X1 = Tensor(shape=x1_shape, dtype=input_type, name="x1", is_input=True)
+
+        Y1 = ops.concatenate()([X0, X1], dim=cat_dim)
+        slice_start_indices = [0, 0]
+        slice_end_indices = [None, 6]
+        Y2 = ops.dynamic_slice()(
+            Y1, start_indices=slice_start_indices, end_indices=slice_end_indices
+        )
+        Y = ops.concatenate()([Y1, Y2], dim=cat_dim)
+
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        x0_pt = get_random_torch_tensor(x0_shape, input_type)
+        x1_pt = get_random_torch_tensor(x1_shape, input_type)
+        y1_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+        slice_indices = [
+            slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
+        ]
+        y2_pt = y1_pt[slice_indices]
+        y_pt = torch.cat([y1_pt, y2_pt], dim=cat_dim)
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        inputs = {"x0": x0_pt, "x1": x1_pt}
+        module.run_with_tensors(inputs, [y])
+        y_pt = y_pt.cpu().numpy()
+
+        torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
+
+    def test_strided_op_multiple_cats_2(self):
+        # y1 = x0 + x1
+        # y2 = slice(y1)
+        # y3 = concat(x2, y2)
+        # y = concat(y3, y3)
+        x0_shape = [4, 10]
+        x1_shape = [4, 10]
+        x2_shape = [4, 20]
+        input_type = "float16"
+        cat_dim = 1
+        test_name = "test_strided_op_multiple_cats_2"
+
+        target = detect_target()
+        X0 = Tensor(shape=x0_shape, dtype=input_type, name="x0", is_input=True)
+        X1 = Tensor(shape=x1_shape, dtype=input_type, name="x1", is_input=True)
+        X2 = Tensor(shape=x2_shape, dtype=input_type, name="x2", is_input=True)
+
+        Y1 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        slice_start_indices = [0, 0]
+        slice_end_indices = [None, 12]
+        Y2 = ops.dynamic_slice()(
+            Y1, start_indices=slice_start_indices, end_indices=slice_end_indices
+        )
+        Y3 = ops.concatenate()([X2, Y2], dim=cat_dim)
+        Y = ops.concatenate()([Y3, Y3], dim=cat_dim)
+
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 2)
+        self.assertEqual(sorted_ops[1]._attrs["op"], "concatenate")
+
+        x0_pt = get_random_torch_tensor(x0_shape, input_type)
+        x1_pt = get_random_torch_tensor(x1_shape, input_type)
+        x2_pt = get_random_torch_tensor(x2_shape, input_type)
+        y1_pt = x0_pt + x1_pt
+        slice_indices = [
+            slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
+        ]
+        y2_pt = y1_pt[slice_indices]
+        y3_pt = torch.cat([x2_pt, y2_pt], dim=cat_dim)
+        y_pt = torch.cat([y3_pt, y3_pt], dim=cat_dim)
+
+        y = torch.empty(y_pt.size()).cuda().half()
+        inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+        module.run_with_tensors(inputs, [y])
+        y_pt = y_pt.cpu().numpy()
+
+        torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/compiler/test_strided_reshape_cat.py b/tests/unittest/compiler/test_strided_reshape_cat.py
index 2566947c3..345cd3d2b 100644
--- a/tests/unittest/compiler/test_strided_reshape_cat.py
+++ b/tests/unittest/compiler/test_strided_reshape_cat.py
@@ -18,6 +18,7 @@
 import numpy as np
 import torch
 from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.utils import logger
@@ -101,7 +102,7 @@ def _test_strided_reshape_cat(self, num_cat_ops=1):
         Y_src_ops = Y._attrs["src_ops"]
         if num_cat_ops == 1:
             np.testing.assert_equal(len(Y_src_ops), 2)
-            np.testing.assert_equal(Y_src_ops, {group_gemm_op, concat_op})
+            np.testing.assert_equal(Y_src_ops, StableSet({group_gemm_op, concat_op}))
             np.testing.assert_equal(
                 concat_op._attrs["input_masks"], [False, False, True, False]
             )
@@ -203,7 +204,7 @@ def test_strided_reshape_cat_bias(self):
         )
         Y_src_ops = Y._attrs["src_ops"]
         np.testing.assert_equal(len(Y_src_ops), 2)
-        np.testing.assert_equal(Y_src_ops, {group_gemm_op, concat_op})
+        np.testing.assert_equal(Y_src_ops, StableSet({group_gemm_op, concat_op}))
         np.testing.assert_equal(concat_op._attrs["input_masks"], [False, False, True])
         expected_inputs_group_gemm_op = [X1, W1, B1, X2, W2, B2]
         np.testing.assert_equal(
diff --git a/tests/unittest/compiler/test_tensor.py b/tests/unittest/compiler/test_tensor.py
new file mode 100644
index 000000000..76047a78f
--- /dev/null
+++ b/tests/unittest/compiler/test_tensor.py
@@ -0,0 +1,54 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.testing import detect_target
+
+
+class TensorTestCase(unittest.TestCase):
+    def test_tensor_size(self):
+        to_torch_dtype = {
+            "bool": torch.bool,
+            "int": torch.int32,
+            "int32": torch.int32,
+            "int64": torch.int64,
+            "float16": torch.float16,
+            "float": torch.float,
+            "float32": torch.float,
+        }
+        for dtype, torch_dtype in to_torch_dtype.items():
+            x = Tensor([3], dtype=dtype, is_input=True, is_output=True)
+            x_pt = torch.randn(3).to(torch_dtype).cuda()
+
+            expected_bytes = x_pt.numel() * x_pt.element_size()
+            self.assertEqual(x.size_bytes(), expected_bytes)
+
+            mod = compile_model(
+                x, detect_target(), "./tmp", f"test_tensor_size_{dtype}"
+            )
+
+            out = torch.empty_like(x_pt)
+            mod.run_with_tensors([x_pt], [out])
+            self.assertTrue(torch.equal(out, x_pt))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_transform_utils.py b/tests/unittest/compiler/test_transform_utils.py
index 73de60dda..6566d82cf 100644
--- a/tests/unittest/compiler/test_transform_utils.py
+++ b/tests/unittest/compiler/test_transform_utils.py
@@ -18,6 +18,7 @@
 
 from aitemplate.compiler import compile_model, ops, transform
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
@@ -81,7 +82,7 @@ def test_check_validity_no_dst_op(self):
         tensor = self._get_simple_graph()
         graph = transform.toposort(tensor)
 
-        graph[0]._attrs["dst_ops"] = set()
+        graph[0]._attrs["dst_ops"] = StableSet()
         with self.assertRaisesRegex(
             RuntimeError, "Op None not designated as dst_op for tensor inputs_0"
         ):
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index 0c4c09d37..085304905 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -28,7 +28,9 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FusedElementwiseTestCase(unittest.TestCase):
-    def _test_leaky_relu(self, input_size, negative_slope=0.01, test_name="leaky_relu"):
+    def _test_leaky_relu(
+        self, input_size, negative_slope=0.01, test_name="leaky_relu", copy_op=False
+    ):
         assert len(input_size) == 2
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
@@ -42,7 +44,10 @@ def _test_leaky_relu(self, input_size, negative_slope=0.01, test_name="leaky_rel
             name="slope",
             value=negative_slope,
         )
-        X2 = ops.elementwise(FuncEnum.LRELU)(X1, slope)
+        X2_op = ops.elementwise(FuncEnum.LRELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, slope)
         X2._attrs["is_output"] = True
         X2._attrs["name"] = "output0"
 
@@ -57,7 +62,7 @@ def _test_leaky_relu(self, input_size, negative_slope=0.01, test_name="leaky_rel
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
-    def _test_relu(self, input_size, test_name="relu"):
+    def _test_relu(self, input_size, test_name="relu", copy_op=False):
         assert len(input_size) == 2
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
@@ -65,7 +70,10 @@ def _test_relu(self, input_size, test_name="relu"):
             name="input0",
             is_input=True,
         )
-        X2 = ops.elementwise(FuncEnum.RELU)(X1)
+        X2_op = ops.elementwise(FuncEnum.RELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1)
         X2._attrs["is_output"] = True
         X2._attrs["name"] = "output0"
 
@@ -79,7 +87,9 @@ def _test_relu(self, input_size, test_name="relu"):
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
-    def _test_hardtanh(self, input_size, min_val=-1, max_val=1, test_name="hard_tanh"):
+    def _test_hardtanh(
+        self, input_size, min_val=-1, max_val=1, test_name="hard_tanh", copy_op=False
+    ):
         assert len(input_size) == 2
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
@@ -101,7 +111,10 @@ def _test_hardtanh(self, input_size, min_val=-1, max_val=1, test_name="hard_tanh
             value=max_val,
             is_input=True,
         )
-        X2 = ops.elementwise(FuncEnum.HARDTANH)(X1, X_min, X_max)
+        X2_op = ops.elementwise(FuncEnum.HARDTANH)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_min, X_max)
         X2._attrs["is_output"] = True
         X2._attrs["name"] = "output0"
 
@@ -116,20 +129,88 @@ def _test_hardtanh(self, input_size, min_val=-1, max_val=1, test_name="hard_tanh
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
+    def _test_softplus(
+        self, input_size, beta=1.0, threshold=20.0, test_name="softplus", copy_op=False
+    ):
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype="float16",
+            name="input0",
+            is_input=True,
+        )
+        X_beta = Tensor(
+            shape=[],
+            dtype="float16",
+            name="beta",
+            value=beta,
+            is_input=True,
+        )
+        X_threshold = Tensor(
+            shape=[],
+            dtype="float16",
+            name="threshold",
+            value=threshold,
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.SOFTPLUS)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_beta, X_threshold)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", test_name)
+
+        x1_pt = torch.randn(input_size).cuda().half()
+        OP_pt = torch.nn.Softplus(beta=beta, threshold=threshold)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
     def test_lrelu(self):
         self._test_leaky_relu([512, 512], test_name="leaky_relu_1")
         self._test_leaky_relu(
             [1024, 1024], negative_slope=0.5, test_name="leaky_relu_2"
         )
+        self._test_leaky_relu(
+            [1024, 1024],
+            negative_slope=0.5,
+            test_name="leaky_relu_2_copy_op",
+            copy_op=True,
+        )
 
     def test_htanh(self):
         self._test_hardtanh([512, 512], test_name="hard_tanh_1")
         self._test_hardtanh(
             [1024, 1024], min_val=-2, max_val=2, test_name="hard_tanh_2"
         )
+        self._test_hardtanh(
+            [1024, 1024],
+            min_val=-2,
+            max_val=2,
+            test_name="hard_tanh_2_copy_op",
+            copy_op=True,
+        )
 
     def test_relu(self):
         self._test_relu([512, 512], test_name="relu_1")
+        self._test_relu([512, 512], test_name="relu_1_copy_op", copy_op=True)
+
+    def test_softplus(self):
+        self._test_softplus([64, 64], test_name="softplus_1")
+        self._test_softplus([128, 128], beta=1.0, threshold=1.5, test_name="softplus_2")
+        self._test_softplus([128, 256], beta=2.0, threshold=0.5, test_name="softplus_3")
+        self._test_softplus(
+            [256, 128],
+            beta=1.0,
+            threshold=1.0,
+            test_name="softplus_3_copy_op",
+            copy_op=True,
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_argmax.py b/tests/unittest/ops/test_argmax.py
index eccdf0f75..6dc91dc01 100644
--- a/tests/unittest/ops/test_argmax.py
+++ b/tests/unittest/ops/test_argmax.py
@@ -24,7 +24,9 @@
 
 
 class argmaxTestCase(unittest.TestCase):
-    def _test_argmax(self, batch_size=1, shape=(2, 6), dim=0, test_name="argmax"):
+    def _test_argmax(
+        self, batch_size=1, shape=(2, 6), dim=0, test_name="argmax", copy_op=False
+    ):
 
         o_shape = list(shape)[:-1]
 
@@ -34,7 +36,10 @@ def _test_argmax(self, batch_size=1, shape=(2, 6), dim=0, test_name="argmax"):
             name="X",
             is_input=True,
         )
-        X4 = ops.argmax(dim=dim)(X1)
+        X4_op = ops.argmax(dim=dim)
+        if copy_op:
+            X4_op = ops.argmax(**X4_op._get_op_attributes())
+        X4 = X4_op(X1)
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
@@ -51,6 +56,9 @@ def _test_argmax(self, batch_size=1, shape=(2, 6), dim=0, test_name="argmax"):
 
     def test_argmax(self):
         self._test_argmax(shape=(300, 80), dim=1, test_name="argmax")
+        self._test_argmax(
+            shape=(300, 80), dim=1, test_name="argmax_copy_op", copy_op=True
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 3cb7cd934..8088db64c 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -109,6 +109,35 @@ def attention_pt(X_pt, W_pt, B_pt, nheads, d, seqlen):
     return Y_pt
 
 
+def ref_cross_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    q = q * (1 / q.shape[-1] ** 0.5)
+    attn = q @ k.transpose(-2, -1)
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias):
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    out = ref_cross_attention(T(q), T(k), T(v), attn_bias)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class attentionTestCase(unittest.TestCase):
     def _test_flash_attention(
@@ -124,6 +153,7 @@ def _test_flash_attention(
         test_name="attention",
         rebuild=True,
         benchmark_pt=False,
+        copy_op=False,
     ):
 
         d = n // nheads
@@ -173,12 +203,18 @@ def _test_flash_attention(
             name="cu_seqlens",
             is_input=True,
         )
-        Y = ops.flash_attention(
+
+        flash_attention_op = ops.flash_attention(
             batch_size=batch_size,
             dropout=dropout_p,
             max_seq_len=max_seqlen_in_batch,
             causal=causal,
-        )(X1, X2)
+        )
+        if copy_op:
+            flash_attention_op = ops.flash_attention(
+                **flash_attention_op._get_op_attributes()
+            )
+        Y = flash_attention_op(X1, X2)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
 
@@ -193,9 +229,20 @@ def _test_flash_attention(
         inputs = {"qkv": x1, "cu_seqlens": x2}
         y = torch.empty([total, num_heads, head_size]).cuda().half()
         module.run_with_tensors(inputs, [y])
-        y = y.reshape((batch_size, -1, nheads, d))
 
-        self.assertTrue(torch.allclose(y_pt, y, atol=1e-3, rtol=1e-3))
+        # Warm up.
+        for _ in range(5):
+            module.run_with_tensors(inputs, [y])
+        # Benchmark.
+        time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+            inputs,
+            [y],
+            count=100,
+        )
+        logger.info(__file__, "benchmark flash-attn time: {0}".format(time_per_iter_ms))
+
+        y = y.reshape((batch_size, -1, nheads, d))
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-1, rtol=1e-1))
 
         if benchmark_pt:
             from aitemplate.testing.benchmark_pt import benchmark_torch_function
@@ -216,6 +263,9 @@ def _test_flash_attention(
     def test_flash_attention(self):
         if detect_target().name() == "cuda":
             self._test_flash_attention(test_name="flash_attention")
+            self._test_flash_attention(
+                test_name="flash_attention_copy_op", copy_op=True
+            )
 
     def _test_attention(self, test_name, rebuild=True, benchmark=False):
         target = detect_target()
@@ -288,6 +338,284 @@ def test_attention(self):
         if detect_target().name() == "rocm":
             self._test_attention(test_name="attention")
 
+    def _test_mem_eff_attention(
+        self,
+        batch_size=16,
+        nheads=16,
+        seqlen=1024,
+        n=1024,
+        dropout_p=0.0,
+        causal=False,
+        dtype=torch.float16,
+        device="cuda",
+        test_name="attention",
+        rebuild=True,
+        benchmark_ait=False,
+        benchmark_pt=False,
+        copy_op=False,
+        use_perm=True,
+    ):
+        d = n // nheads
+
+        x = torch.randn(
+            batch_size, seqlen, n, device="cuda", dtype=dtype, requires_grad=True
+        )
+        Wqkv = torch.nn.Linear(nheads * d, 3 * nheads * d, device=device, dtype=dtype)
+
+        lengths = torch.tensor(
+            [seqlen] * batch_size, dtype=torch.int, device="cuda"
+        ).reshape(-1, 1)
+        attention_mask_bool = (
+            repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
+            < lengths
+        )
+        attention_mask = torch.zeros(batch_size, seqlen, device="cuda", dtype=dtype)
+        attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
+
+        x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
+            x, attention_mask_bool
+        )
+        qkv_unpad = (
+            rearrange(Wqkv(x_unpad), "nnz (t h d) -> nnz t h d", t=3, h=nheads)
+            .detach()
+            .requires_grad_()
+        )
+        qkv = (
+            rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
+            .detach()
+            .requires_grad_()
+        )
+        q, k, v = torch.split(qkv, 1, dim=2)
+        output = attention_ref(qkv, attention_mask_bool, dropout_p, causal=causal)
+        y_pt = output.detach()
+
+        total, _, num_heads, head_size = qkv_unpad.shape
+
+        Q = Tensor(
+            shape=[batch_size, num_heads, seqlen, head_size],
+            dtype="float16",
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size, num_heads, seqlen, head_size],
+            dtype="float16",
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size, num_heads, seqlen, head_size],
+            dtype="float16",
+            name="v",
+            is_input=True,
+        )
+
+        flash_attention_op = ops.mem_eff_attention(
+            causal=causal,
+        )
+        if copy_op:
+            flash_attention_op = ops.mem_eff_attention(
+                **flash_attention_op._get_op_attributes()
+            )
+
+        Y = flash_attention_op(Q, K, V)
+
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        if rebuild:
+            target = detect_target()
+            module = compile_model(Y, target, "./tmp", test_name)
+        else:
+            module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+        q = torch.permute(q, (0, 3, 2, 1, 4)).reshape(
+            batch_size, num_heads, seqlen, head_size
+        )
+        k = torch.permute(k, (0, 3, 2, 1, 4)).reshape(
+            batch_size, num_heads, seqlen, head_size
+        )
+        v = torch.permute(v, (0, 3, 2, 1, 4)).reshape(
+            batch_size, num_heads, seqlen, head_size
+        )
+
+        inputs = {
+            "q": q.detach().half().cuda().contiguous(),
+            "k": k.detach().half().cuda().contiguous(),
+            "v": v.detach().half().cuda().contiguous(),
+        }
+
+        y = torch.empty([batch_size, seqlen, num_heads, head_size]).cuda().half()
+        module.run_with_tensors(inputs, [y])
+
+        if benchmark_ait:
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark AIT
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
+            )
+            logger.info(
+                __file__, "benchmark eff-mem-attn time: {0}".format(time_per_iter_ms)
+            )
+
+        self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
+
+        if benchmark_pt:
+            from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            func = attention_ref
+            args = (
+                qkv.cuda().half(),
+                attention_mask_bool.cuda(),
+                dropout_p,
+                False,
+                False,
+            )
+            duration = benchmark_torch_function(100, func, *args)
+            print(
+                f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
+            )
+
+    def test_mem_eff_attention(self):
+        if detect_target().name() == "cuda":
+            for use_perm in [False, True]:
+                self._test_mem_eff_attention(
+                    use_perm=use_perm, test_name="mem_eff_attention"
+                )
+                self._test_mem_eff_attention(
+                    causal=True, test_name="mem_eff_attention_causal"
+                )
+                # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
+                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
+                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
+                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
+                # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
+
+    def _test_cross_attention(
+        self,
+        batch_size=16,
+        num_heads=16,
+        seqlen=1024,
+        seqlen_kv=1024,
+        head_size=64,
+        head_size_v=64,
+        dropout_p=0.0,
+        causal=False,
+        dtype=torch.float16,
+        device="cuda",
+        test_name="attention",
+        rebuild=True,
+        benchmark_ait=False,
+        benchmark_pt=False,
+        copy_op=False,
+    ):
+        q = torch.randn(
+            batch_size,
+            seqlen,
+            num_heads,
+            head_size,
+            device="cuda",
+            dtype=dtype,
+        )
+        k = torch.randn(
+            batch_size,
+            seqlen_kv,
+            num_heads,
+            head_size,
+            device="cuda",
+            dtype=dtype,
+        )
+        v = torch.randn(
+            batch_size,
+            seqlen_kv,
+            num_heads,
+            head_size_v,
+            device="cuda",
+            dtype=dtype,
+        )
+
+        output = ref_cross_attention(q, k, v)
+        y_pt = output.detach()
+
+        Q = Tensor(
+            shape=[batch_size, num_heads, seqlen, head_size],
+            dtype="float16",
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size, num_heads, seqlen_kv, head_size],
+            dtype="float16",
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size, num_heads, seqlen_kv, head_size_v],
+            dtype="float16",
+            name="v",
+            is_input=True,
+        )
+
+        flash_attention_op = ops.mem_eff_attention(
+            causal=causal,
+        )
+        if copy_op:
+            flash_attention_op = ops.flash_attention(
+                **flash_attention_op._get_op_attributes()
+            )
+        Y = flash_attention_op(Q, K, V)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        if rebuild:
+            target = detect_target()
+            module = compile_model(Y, target, "./tmp", test_name)
+        else:
+            module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+        q = torch.permute(q, (0, 2, 1, 3))
+        k = torch.permute(k, (0, 2, 1, 3))
+        v = torch.permute(v, (0, 2, 1, 3))
+
+        inputs = {
+            "q": q.detach().half().cuda().contiguous(),
+            "k": k.detach().half().cuda().contiguous(),
+            "v": v.detach().half().cuda().contiguous(),
+        }
+        y = torch.empty([batch_size, seqlen, num_heads, head_size_v]).cuda().half()
+        module.run_with_tensors(inputs, [y])
+
+        if benchmark_ait:
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark AIT
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
+            )
+            logger.info(
+                __file__, "benchmark cross-attn time: {0}".format(time_per_iter_ms)
+            )
+
+        self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
+
+    def test_cross_attention(self):
+        if detect_target().name() == "cuda":
+            self._test_cross_attention(test_name="cross_attention")
+            self._test_cross_attention(
+                seqlen=1024,
+                seqlen_kv=768,
+                head_size=64,
+                head_size_v=64,
+                test_name="cross_attention2",
+            )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 5b0499965..4f188a421 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -22,7 +22,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class BMMTestCase(unittest.TestCase):
+class BMMAddTestCase(unittest.TestCase):
     def test_rrr(self):
         B = 32
         M = 256
diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index d77573c53..df7420811 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -24,8 +24,8 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class BMMTestCase(unittest.TestCase):
-    def _test_rrr(self, bs, ms, N, K, d1, test_name):
+class BMMPermuteTestCase(unittest.TestCase):
+    def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
@@ -36,6 +36,8 @@ def _test_rrr(self, bs, ms, N, K, d1, test_name):
             shape=[batch_dim, K, N], dtype="float16", name="input_1", is_input=True
         )
         OP = ops.bmm_rrr_permute(shape=(d1,))
+        if copy_op:
+            OP = ops.bmm_rrr_permute(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -60,11 +62,14 @@ def test_rrr(self):
         self._test_rrr([24], [80], N=88, K=64, d1=12, test_name="permute1")
         self._test_rrr([10240], [88], N=88, K=64, d1=10, test_name="permute2")
         self._test_rrr([100], [88], N=88, K=64, d1=10, test_name="permute3")
+        self._test_rrr(
+            [100], [88], N=88, K=64, d1=10, test_name="permute3_copy_op", copy_op=True
+        )
         if detect_target().name() != "rocm":
             self._test_rrr([24], [80], N=0, K=96, d1=12, test_name="permute1_zero_n")
             self._test_rrr([24], [0], N=32, K=96, d1=12, test_name="permute1_zero_m")
 
-    def _test_rcr(self, bs, ms, N, K, d1, test_name):
+    def _test_rcr(self, bs, ms, N, K, d1, test_name, copy_op=False):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
@@ -75,6 +80,8 @@ def _test_rcr(self, bs, ms, N, K, d1, test_name):
             shape=[batch_dim, N, K], dtype="float16", name="input_1", is_input=True
         )
         OP = ops.bmm_rcr_permute(shape=(d1,))
+        if copy_op:
+            OP = ops.bmm_rcr_permute(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -100,6 +107,9 @@ def test_rcr(self):
         self._test_rcr([10240], [88], N=64, K=88, d1=10, test_name="permute1")
         self._test_rcr([24], [80], N=64, K=88, d1=12, test_name="permute2")
         self._test_rcr([100], [88], N=64, K=88, d1=10, test_name="permute3")
+        self._test_rcr(
+            [100], [88], N=64, K=88, d1=10, test_name="permute3_copy_op", copy_op=True
+        )
         if detect_target().name() != "rocm":
             self._test_rcr(
                 [0], [80], N=96, K=32, d1=12, test_name="permute1_zero_batch"
diff --git a/tests/unittest/ops/test_bmm_rcr_n1.py b/tests/unittest/ops/test_bmm_rcr_n1.py
index 9d2d6e7b2..f6a32c6a0 100644
--- a/tests/unittest/ops/test_bmm_rcr_n1.py
+++ b/tests/unittest/ops/test_bmm_rcr_n1.py
@@ -26,7 +26,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class BMMTestCase(unittest.TestCase):
+class BMMRcrN1TestCase(unittest.TestCase):
     def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         BDim = shape_utils.gen_int_var_min_max(Bs, name="batch")
diff --git a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
index 43e7b0aff..5bf158dc2 100644
--- a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
+++ b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
@@ -22,7 +22,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class BMMTestCase(unittest.TestCase):
+class BMMRrrK1TanhTestCase(unittest.TestCase):
     def _test_rrr(self, B, M, K, N, test_name):
         target = detect_target()
         X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
diff --git a/tests/unittest/ops/test_bmm_softmax.py b/tests/unittest/ops/test_bmm_softmax.py
index 0e5440336..3b8350528 100644
--- a/tests/unittest/ops/test_bmm_softmax.py
+++ b/tests/unittest/ops/test_bmm_softmax.py
@@ -40,6 +40,9 @@ def _test_bmm_rcr_softmax(
         if int(target._arch) < 80:
             logger.warning(__file__, "Skip this test on SM75")
             return
+        if type(target).__name__ == "FBCUDA":
+            logger.warning(__file__, "Skip this test for special profiling requirement")
+            return
         module = compile_model(Y, target, "./tmp", test_name)
         X_pt = torch.randn(B, M, K).cuda().half()
         W_pt = torch.randn(B, N, K).cuda().half()
diff --git a/tests/unittest/ops/test_bmm_softmax_bmm.py b/tests/unittest/ops/test_bmm_softmax_bmm.py
index 0933c75f3..7afad1c85 100644
--- a/tests/unittest/ops/test_bmm_softmax_bmm.py
+++ b/tests/unittest/ops/test_bmm_softmax_bmm.py
@@ -54,6 +54,7 @@ def _test_bmm_permute(
         num_heads=12,
         causal=False,
         test_name="ck_attn",
+        copy_op=False,
     ):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
@@ -71,6 +72,8 @@ def _test_bmm_permute(
         scale = head_dim**-0.5
 
         OP = ops.bmm_softmax_bmm_permute(shape=(num_heads,), scale=scale, causal=causal)
+        if copy_op:
+            OP = ops.bmm_softmax_bmm_permute(**OP._get_op_attributes())
         Y = OP(X, B0, B1)
 
         Y._attrs["name"] = "output_0"
@@ -123,7 +126,9 @@ def _test_bmm_permute(
             #     [X_pt, W_pt, B1_pt], [y], count=200, repeat=2
             # )
 
-    def _test_b2b(self, bs, ms, N, K, D, head_dim=64, test_name="ck_attn"):
+    def _test_b2b(
+        self, bs, ms, N, K, D, head_dim=64, test_name="ck_attn", copy_op=False
+    ):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
@@ -140,6 +145,8 @@ def _test_b2b(self, bs, ms, N, K, D, head_dim=64, test_name="ck_attn"):
         scale = head_dim**-0.5
 
         OP = ops.bmm_softmax_bmm(scale=scale)
+        if copy_op:
+            OP = ops.bmm_softmax_bmm(OP._get_op_attributes())
         Y = OP(X, B0, B1)
 
         Y._attrs["name"] = "output_0"
@@ -174,6 +181,9 @@ def test_rcr(self):
         self._test_bmm_permute([24], [128], N=49, K=64, D=128, test_name="static")
         self._test_bmm_permute([24], [49], N=49, K=64, D=64, test_name="static")
         self._test_bmm_permute([24], [1020], N=1020, K=64, D=128, test_name="static")
+        self._test_bmm_permute(
+            [24], [1020], N=1020, K=64, D=128, test_name="static_copy_op", copy_op=True
+        )
         self._test_bmm_permute(
             [32], [49], N=49, K=64, D=64, num_heads=4, test_name="static"
         )
@@ -183,6 +193,17 @@ def test_rcr(self):
         self._test_bmm_permute(
             [12], [64], N=64, K=64, D=64, num_heads=12, causal=True, test_name="static"
         )
+        self._test_bmm_permute(
+            [12],
+            [64],
+            N=64,
+            K=64,
+            D=64,
+            num_heads=12,
+            causal=True,
+            test_name="static_copy_op",
+            copy_op=True,
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_concatenate_tanh.py b/tests/unittest/ops/test_concatenate_tanh.py
index e4fa82ed1..2c24436a6 100644
--- a/tests/unittest/ops/test_concatenate_tanh.py
+++ b/tests/unittest/ops/test_concatenate_tanh.py
@@ -320,15 +320,15 @@ def test_cat(self):
             input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]),
         )
 
-        # self._run_concatenate(concatenate_op=tensor.cat(),
+        # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([12, 3, 4, 5], [3, 3, 4, 5], [7, 3, 4, 5]), dim=0)
-        # self._run_concatenate(concatenate_op=tensor.cat(),
+        # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([2, 3, 4, 5], [2, 3, 4, 5], [2, 3, 4, 5]), dim=1)
-        # self._run_concatenate(concatenate_op=tensor.cat(),
+        # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([2, 3, 9, 5], [2, 3, 4, 5], [2, 3, 1, 5]), dim=2)
-        # self._run_concatenate(concatenate_op=tensor.cat(),
+        # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([2, 3, 4, 5], [2, 3, 4, 3], [2, 3, 4, 5]), dim=3)
-        # self._run_concatenate(concatenate_op=tensor.cat(),
+        # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]))
 
     def test_masked_cat(self):
diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index b4565844a..7a0a3881c 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -22,7 +22,7 @@
 
 
 class ConvTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
@@ -34,10 +34,12 @@ def test_fp16(self, batch=4):
             shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
         )
         OP = ops.conv2d(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv2d")
+        module = compile_model(Y, target, "./tmp", f"conv2d_{copy_op}")
 
         X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
         W_pt = torch.randn(256, 128, 3, 3).cuda().half()
@@ -52,6 +54,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv2d_bias_add.py b/tests/unittest/ops/test_conv2d_bias_add.py
index 1eec1c540..4501d1ca8 100644
--- a/tests/unittest/ops/test_conv2d_bias_add.py
+++ b/tests/unittest/ops/test_conv2d_bias_add.py
@@ -22,7 +22,7 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasAddTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
@@ -41,6 +41,8 @@ def test_fp16(self, batch=4):
             is_input=True,
         )
         OP = ops.conv2d_bias_add(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_add(**OP._get_op_attributes())
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -65,6 +67,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
new file mode 100644
index 000000000..d79f9fedd
--- /dev/null
+++ b/tests/unittest/ops/test_conv3d.py
@@ -0,0 +1,89 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import numpy as np
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+@unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
+class ConvTestCase(unittest.TestCase):
+    def _test_fp16(
+        self,
+        tt,
+        hh,
+        ww,
+        ci,
+        co,
+        kt,
+        kh,
+        kw,
+        stride=(1, 1, 1),
+        pad=(1, 1, 1),
+        batch=4,
+        test_case="",
+    ):
+        target = detect_target()
+
+        X = Tensor(
+            shape=[IntImm(batch), tt, hh, ww, ci],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[co, kt, kh, kw, ci], dtype="float16", name="input_1", is_input=True
+        )
+        OP = ops.conv3d(stride=stride, pad=pad, dilate=1)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"conv3d_{test_case}")
+
+        X_pt = torch.randn(batch, ci, tt, hh, ww).cuda().half()
+        W_pt = torch.randn(co, ci, kt, kh, kw).cuda().half()
+        Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, stride=stride, padding=pad)
+        x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
+
+        Y_pt_transpose = Y_pt.permute(0, 2, 3, 4, 1)
+        y_shape = list(Y_pt_transpose.shape)
+        y = torch.empty(y_shape).cuda().half()
+        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+
+        np.testing.assert_allclose(
+            Y_pt_transpose.cpu().numpy(), y.cpu().numpy(), atol=1e-2, rtol=1e-2
+        )
+
+    def test_fp16(self):
+        self._test_fp16(
+            4, 224, 224, 8, 96, 3, 5, 5, stride=(2, 4, 4), pad=(1, 2, 2), test_case=1
+        )
+        self._test_fp16(56, 56, 56, 64, 256, 1, 1, 1, test_case=2)
+        self._test_fp16(56, 56, 56, 64, 64, 1, 1, 1, test_case=3)
+        self._test_fp16(56, 56, 56, 64, 64, 3, 3, 3, test_case=4)
+        self._test_fp16(56, 56, 56, 256, 64, 1, 1, 1, test_case=5)
+        self._test_fp16(56, 56, 56, 256, 512, 1, 1, 1, stride=(2, 2, 2), test_case=6)
+        self._test_fp16(56, 56, 56, 128, 128, 3, 3, 3, stride=(2, 2, 2), test_case=7)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index 4ac8b285b..c1b0d16a0 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -22,7 +22,7 @@
 
 
 class ConvBiasTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
@@ -35,6 +35,8 @@ def test_fp16(self, batch=4):
         )
         B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -56,6 +58,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_act_few_channels.py b/tests/unittest/ops/test_conv_bias_act_few_channels.py
index 1113b367b..f284d1111 100644
--- a/tests/unittest/ops/test_conv_bias_act_few_channels.py
+++ b/tests/unittest/ops/test_conv_bias_act_few_channels.py
@@ -28,7 +28,7 @@ def hard_swish(x):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasReluTestCase(unittest.TestCase):
-    def test_relu(self, HH=224, WW=224, CI=4, CO=64, batch=1):
+    def _test_relu(self, HH=224, WW=224, CI=4, CO=64, batch=1, copy_op=False):
         KK = 7
         stride = 2
         pad = 3
@@ -44,6 +44,8 @@ def test_relu(self, HH=224, WW=224, CI=4, CO=64, batch=1):
         )
         B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_relu_few_channels(stride=stride, pad=pad, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_relu_few_channels(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -63,7 +65,11 @@ def test_relu(self, HH=224, WW=224, CI=4, CO=64, batch=1):
         y_transpose = y.permute((0, 3, 1, 2))
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_hardswish(self, HH=224, WW=224, CI=4, CO=64, batch=1):
+    def test_relu(self):
+        self._test_relu()
+        self._test_relu(copy_op=True)
+
+    def _test_hardswish(self, HH=224, WW=224, CI=4, CO=64, batch=1, copy_op=False):
         KK = 7
         stride = 2
         pad = 3
@@ -79,6 +85,8 @@ def test_hardswish(self, HH=224, WW=224, CI=4, CO=64, batch=1):
         )
         B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_hardswish_few_channels(stride=stride, pad=pad, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_hardswish_few_channels(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -98,6 +106,10 @@ def test_hardswish(self, HH=224, WW=224, CI=4, CO=64, batch=1):
         y_transpose = y.permute((0, 3, 1, 2))
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_hardswish(self):
+        self._test_hardswish()
+        self._test_hardswish(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_add_hardswish.py b/tests/unittest/ops/test_conv_bias_add_hardswish.py
index feeeae827..d390d4303 100644
--- a/tests/unittest/ops/test_conv_bias_add_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_add_hardswish.py
@@ -27,7 +27,7 @@ def hard_swish(x):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasHardswishAddTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
@@ -46,6 +46,8 @@ def test_fp16(self, batch=4):
             is_input=True,
         )
         OP = ops.conv2d_bias_add_hardswish(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_add_hardswish(**OP._get_op_attributes())
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -68,6 +70,10 @@ def test_fp16(self, batch=4):
         y_transpose = y.permute(0, 3, 1, 2)
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index dca345105..bace7be14 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -21,7 +21,7 @@
 
 
 class ConvBiasReluAddTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
@@ -40,6 +40,8 @@ def test_fp16(self, batch=4):
             is_input=True,
         )
         OP = ops.conv2d_bias_add_relu(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_add_relu(**OP._get_op_attributes())
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -65,6 +67,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_hardswish.py b/tests/unittest/ops/test_conv_bias_hardswish.py
index 08512aab6..6a424b1af 100644
--- a/tests/unittest/ops/test_conv_bias_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_hardswish.py
@@ -28,7 +28,7 @@ def hard_swish(x):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasHardswishTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
@@ -41,6 +41,8 @@ def test_fp16(self, batch=4):
         )
         B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_hardswish(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_hardswish(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -65,6 +67,10 @@ def test_fp16(self, batch=4):
         y_transpose = y.permute((0, 3, 1, 2))
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_relu.py b/tests/unittest/ops/test_conv_bias_relu.py
index a1b711b27..1ab18b4ff 100644
--- a/tests/unittest/ops/test_conv_bias_relu.py
+++ b/tests/unittest/ops/test_conv_bias_relu.py
@@ -22,7 +22,7 @@
 
 
 class ConvBiasReluTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
@@ -35,6 +35,8 @@ def test_fp16(self, batch=4):
         )
         B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_relu(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_relu(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -57,6 +59,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_conv_bias_sigmoid.py b/tests/unittest/ops/test_conv_bias_sigmoid.py
index e8b609b5f..c9e3ad3f6 100644
--- a/tests/unittest/ops/test_conv_bias_sigmoid.py
+++ b/tests/unittest/ops/test_conv_bias_sigmoid.py
@@ -22,7 +22,7 @@
 
 
 class ConvBiasSigmoidTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
@@ -35,6 +35,8 @@ def test_fp16(self, batch=4):
         )
         B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_sigmoid(stride=1, pad=1, dilate=1)
+        if copy_op:
+            OP = ops.conv2d_bias_sigmoid(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -57,6 +59,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_cross_attention.py b/tests/unittest/ops/test_cross_attention.py
new file mode 100644
index 000000000..088a56892
--- /dev/null
+++ b/tests/unittest/ops/test_cross_attention.py
@@ -0,0 +1,133 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import numpy as np
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+class crossattentionTestCase(unittest.TestCase):
+    def _test_mha(
+        self,
+        batch_sizes,
+        seqlen=1,
+        seqlen_kv=62,
+        dim=4,
+        num_heads=2,
+        use_fp16_acc=False,
+    ):
+        pt_mod = (
+            torch.nn.MultiheadAttention(
+                embed_dim=dim,
+                num_heads=num_heads,
+                batch_first=True,
+            )
+            .cuda()
+            .half()
+        )
+        pt_mod = pt_mod.eval()
+
+        pt_params = dict(pt_mod.named_parameters())
+        params_ait = {}
+        for key, arr in pt_params.items():
+            if "in_proj" in key:
+                if len(arr.shape) == 2:
+                    w_q, w_k, w_v = arr.chunk(3)
+                    params_ait["proj_q_weight"] = w_q
+                    params_ait["proj_k_weight"] = w_k
+                    params_ait["proj_v_weight"] = w_v
+                else:
+                    b_q, b_k, b_v = arr.chunk(3)
+                    params_ait["proj_q_bias"] = b_q
+                    params_ait["proj_k_bias"] = b_k
+                    params_ait["proj_v_bias"] = b_v
+
+            else:
+                params_ait[key.replace(".", "_").replace("out_proj", "proj")] = arr
+
+        ait_mod = nn.CrossAttention(
+            dim=dim,
+            seq_len=seqlen,
+            seq_len_kv=seqlen_kv,
+            num_heads=num_heads,
+            qkv_bias=True,
+            has_residual=False,
+        )
+        ait_mod.name_parameter_tensor()
+
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_size")
+
+        inputs_ait = Tensor([batch_dim, seqlen, dim], name="input0", is_input=True)
+        inputs_ait_k = Tensor([batch_dim, seqlen_kv, dim], name="input1", is_input=True)
+        inputs_ait_v = Tensor([batch_dim, seqlen_kv, dim], name="input2", is_input=True)
+        Y = ait_mod(inputs_ait, inputs_ait_k, inputs_ait_v)
+        Y = Y + inputs_ait
+        mark_output(Y)
+        target = detect_target(use_fp16_acc=False)
+        exe_module = compile_model(Y, target, "./tmp", "cross_attn_dynamic")
+        for name, weight in params_ait.items():
+            exe_module.set_constant_with_tensor(name, weight)
+
+        for batch_size in batch_sizes:
+            input_pt = torch.randn([batch_size, seqlen, dim]).cuda().half()
+            if seqlen == seqlen_kv:
+                input_pt_k = input_pt
+                input_pt_v = input_pt
+            else:
+                input_pt_k = torch.randn([batch_size, seqlen_kv, dim]).cuda().half()
+                input_pt_v = torch.randn([batch_size, seqlen_kv, dim]).cuda().half()
+
+            pt_ys, _ = pt_mod(input_pt, input_pt_k, input_pt_v)
+            pt_ys = pt_ys + input_pt
+            print("pt output:", pt_ys.shape)
+
+            inputs = [input_pt, input_pt_k, input_pt_v]
+            ys = [torch.empty(pt_ys.shape).cuda().half()]
+            exe_module.run_with_tensors(inputs, ys)
+            eps = 1e-2
+            np.testing.assert_allclose(
+                pt_ys.detach().cpu().numpy(),
+                ys[0].cpu().numpy(),
+                atol=eps,
+                rtol=eps,
+            )
+            print("Batch {} MHA verification pass".format(batch_size))
+
+    def test_cross_attn(self):
+        self._test_mha(
+            batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
+        )
+        self._test_mha(
+            batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_depthwise_conv3d.py b/tests/unittest/ops/test_depthwise_conv3d.py
new file mode 100644
index 000000000..c9d46d943
--- /dev/null
+++ b/tests/unittest/ops/test_depthwise_conv3d.py
@@ -0,0 +1,123 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class DepthwiseConv3dTestCase(unittest.TestCase):
+    def _test_fp16(self, batch=4, copy_op=False):
+        target = detect_target()
+        tt, hh, ww, ci, co, groups = 28, 28, 28, 128, 128, 128
+        X = Tensor(
+            shape=[IntImm(batch), tt, hh, ww, ci],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[co, 3, 3, 3, 1], dtype="float16", name="input_1", is_input=True
+        )
+        OP = ops.depthwise_conv3d(stride=1, pad=1, dilate=1, group=groups)
+        if copy_op:
+            OP = ops.depthwise_conv3d(**OP._get_op_attributes())
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"depthwise_conv3d_{copy_op}")
+
+        X_pt = torch.randn(batch, ci, tt, hh, ww).cuda().half()
+        W_pt = torch.randn(co, 1, 3, 3, 3).cuda().half()
+        Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, padding=1, groups=groups)
+        x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        y = torch.empty([batch, tt, hh, ww, co]).cuda().half()
+        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+
+        Y_pt_transpose = Y_pt.permute(0, 2, 3, 4, 1)
+        self.assertTrue(torch.allclose(Y_pt_transpose, y, atol=1e-2, rtol=1e-2))
+
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
+    def _test_mvit_shape(
+        self,
+        batch,
+        tt,
+        hh,
+        ww,
+        ci,
+        co,
+        groups,
+        kernel_size,
+        strides,
+        test_case,
+    ):
+        assert ci == co and ci == groups
+
+        target = detect_target()
+        X = Tensor(
+            shape=[IntImm(batch), tt, hh, ww, ci],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W_shape = [co] + list(kernel_size) + [1]
+        W = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
+
+        OP = ops.depthwise_conv3d(stride=strides, pad=1, dilate=1, group=groups)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"depthwise_conv3d_mvit_{test_case}")
+
+        X_pt = torch.randn(batch, ci, tt, hh, ww).cuda().half()
+        W_pt = (
+            torch.randn(co, 1, kernel_size[0], kernel_size[1], kernel_size[2])
+            .cuda()
+            .half()
+        )
+        Y_pt = torch.nn.functional.conv3d(
+            X_pt, W_pt, stride=strides, padding=1, groups=groups
+        )
+        Y_pt_transpose = Y_pt.permute(0, 2, 3, 4, 1)
+
+        x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        y = torch.empty(Y_pt_transpose.shape).cuda().half()
+        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+
+        self.assertTrue(torch.allclose(Y_pt_transpose, y, atol=1e-2, rtol=1e-2))
+
+    def test_mvit(self):
+        self._test_mvit_shape(1, 2, 56, 56, 96, 96, 96, (3, 3, 3), (1, 1, 1), "0")
+        self._test_mvit_shape(2, 2, 28, 28, 96, 96, 96, (3, 3, 3), (1, 1, 1), "1")
+        self._test_mvit_shape(4, 2, 14, 14, 96, 96, 96, (3, 3, 3), (1, 1, 1), "2")
+        self._test_mvit_shape(8, 2, 7, 7, 96, 96, 96, (3, 3, 3), (1, 1, 1), "3")
+        self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (3, 3, 3), (1, 2, 2), "4")
+        self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (3, 3, 3), (1, 4, 4), "5")
+        self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (3, 3, 3), (2, 8, 8), "6")
+        self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (1, 3, 3), (2, 8, 8), "7")
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_dual_gemm.py b/tests/unittest/ops/test_dual_gemm.py
new file mode 100644
index 000000000..d737ed270
--- /dev/null
+++ b/tests/unittest/ops/test_dual_gemm.py
@@ -0,0 +1,193 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import math
+import unittest
+
+import numpy as np
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import logger, shape_utils
+
+
+class NewGELUActivation(torch.nn.Module):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__()
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return (
+            0.5
+            * input
+            * (
+                1.0
+                + torch.tanh(
+                    math.sqrt(2.0 / math.pi)
+                    * (input + 0.044715 * torch.pow(input, 3.0))
+                )
+            )
+        )
+
+
+class T5DenseGatedGeluDense(torch.nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        d_ff: int,
+    ) -> None:
+        super().__init__()
+        self.wi_0 = torch.nn.Linear(d_model, d_ff, bias=False)
+        self.wi_1 = torch.nn.Linear(d_model, d_ff, bias=False)
+        self.wo = torch.nn.Linear(d_ff, d_model, bias=False)
+        self.gelu_act = NewGELUActivation()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_gelu * hidden_linear
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+@unittest.skipIf(detect_target()._arch == "75", "DualGemm not supported on sm75.")
+class DUALGEMMTestCase(unittest.TestCase):
+    def _test_dual_gemm(self, M=4096, N=4096, K=8192, fast_gelu=False, benchmark=False):
+        target = detect_target(use_fp16_acc=False)
+        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
+        B = Tensor(shape=[N, K], dtype="float16", name="input_2", is_input=True)
+        if fast_gelu:
+            OP = ops.dual_gemm_rcr_fast_gelu()
+        else:
+            OP = ops.dual_gemm_rcr_silu()
+        Y = OP(X, W, B)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "dual_gemm")
+        X_pt = torch.randn(M, K).cuda().half() * 0.01
+        W_pt = torch.randn(N, K).cuda().half()
+        B_pt = torch.randn(N, K).cuda().half()
+
+        def pt_func(X_pt, W_pt, B_pt):
+            Y_pt1 = torch.nn.functional.linear(X_pt, W_pt)
+            Y_pt2 = torch.nn.functional.linear(X_pt, B_pt)
+            if fast_gelu:
+                gelu_act = NewGELUActivation()
+            else:
+                gelu_act = torch.nn.functional.silu
+            Y_pt = gelu_act(Y_pt1) * Y_pt2
+            return Y_pt
+
+        Y_pt = pt_func(X_pt, W_pt, B_pt)
+
+        inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
+        y = torch.empty([M, N]).cuda().half()
+        module.run_with_tensors(inputs, [y])
+
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+        if benchmark:
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark AIT
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
+            )
+            logger.info(__file__, f"AIT GEMMxGEMM time: {time_per_iter_ms:.5f}ms")
+            # Benchmark PT
+            from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            func = pt_func
+            args = (X_pt, W_pt, B_pt)
+            duration = benchmark_torch_function(100, func, *args)
+            logger.info(__file__, f"PT GEMMxGEMM Time: {duration:.5f}ms")
+
+    def test_dual_gemm(self):
+        for fast_gelu in [True, False]:
+            self._test_dual_gemm(M=128, N=128, K=256, fast_gelu=fast_gelu)
+            self._test_dual_gemm(M=1024, N=1024, K=2048, fast_gelu=fast_gelu)
+            self._test_dual_gemm(M=4096, N=4096, K=8192, fast_gelu=fast_gelu)
+
+    def _test_t5block(
+        self,
+        Ms,
+        d_model=1024,
+        d_ff=2048,
+        use_fp16_acc=False,
+    ):
+
+        pt_mod = T5DenseGatedGeluDense(d_model=d_model, d_ff=d_ff).cuda().half()
+        pt_mod = pt_mod.eval()
+
+        pt_params = dict(pt_mod.named_parameters())
+        params_ait = {}
+        for key, arr in pt_params.items():
+            print(key, arr.shape)
+            params_ait[key.replace(".", "_").replace("out_proj", "proj")] = arr
+
+        ait_mod = nn.T5DenseGatedGeluDense(
+            in_channels=d_model,
+            out_channels=d_ff,
+        )
+        ait_mod.name_parameter_tensor()
+
+        M_dim = shape_utils.gen_int_var_min_max(Ms, name="Mdim")
+        inputs_ait = Tensor([M_dim, d_model], name="input0", is_input=True)
+        Y = ait_mod(inputs_ait)
+        mark_output(Y)
+        target = detect_target(use_fp16_acc=False)
+        exe_module = compile_model(Y, target, "./tmp", "t5block")
+        for name, weight in params_ait.items():
+            exe_module.set_constant_with_tensor(name, weight)
+
+        for m in Ms:
+            input_pt = torch.randn([m, d_model]).cuda().half()
+            pt_ys = pt_mod(input_pt)
+            print("pt output:", pt_ys.shape)
+
+            inputs = [input_pt]
+            ys = [torch.empty(pt_ys.shape).cuda().half()]
+            exe_module.run_with_tensors(inputs, ys)
+            eps = 1e-2
+            np.testing.assert_allclose(
+                pt_ys.detach().cpu().numpy(),
+                ys[0].cpu().numpy(),
+                atol=eps,
+                rtol=eps,
+            )
+            print("M = {} t5 verification pass".format(m))
+
+    def test_t5block(self):
+        self._test_t5block(Ms=[1024, 2048, 4096])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/ops/test_dynamic_conv.py b/tests/unittest/ops/test_dynamic_conv.py
index 9b92c6d0a..14b77a440 100644
--- a/tests/unittest/ops/test_dynamic_conv.py
+++ b/tests/unittest/ops/test_dynamic_conv.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,59 +12,53 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-# import torch
-# import numpy as np
 
+import unittest
 
-# from aitemplate.frontend import IntVar, Tensor
-# from aitemplate.compiler import ops
-# from aitemplate.frontend import nn
-# from aitemplate.testing import compile_model, detect_target
+import torch
 
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import DynamicProfileStrategy
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
 
-# def test_fp16(batch_size=[4, 32, 48]):
-#     target = detect_target()
-#     X = Tensor(
-#         shape=[
-#             IntVar(values=batch_size, name="input_batch"),
-#             28,
-#             28,
-#             128
-#         ],
-#         dtype="float16",
-#         name="input_0"
-#     )
-#     W = Tensor(
-#         shape=[
-#             256,
-#             3,
-#             3,
-#             128
-#         ],
-#         dtype="float16",
-#         name="input_1"
-#     )
-#     OP = ops.conv2d(stride=1, pad=1, dilate=1)
-#     Y = OP(X, W)
-#     Y._attrs["name"] = "output_0"
-#     Y._attrs["is_output"] = True
-#     module = compile_model(Y, target, "./tmp", "dynamic_conv", dynamic_batch=True)
-#     for batch in range(batch_size[0], batch_size[-1] + 1):
-#         print("Test batch: %d" % batch)
-#         X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-#         W_pt = torch.randn(256, 128, 3, 3).cuda().half()
-#         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt,  padding=1)
-#         Y_np = Y_pt.cpu().numpy()
-#         module.SetDim("input_batch", batch)
-#         x = np.transpose(X_pt.cpu().numpy(), (0, 2, 3, 1)).copy()
-#         w = np.transpose(W_pt.cpu().numpy(), (0, 2, 3, 1)).copy()
-#         module.SetInput("input_0", x)
-#         module.SetInput("input_1", w)
-#         module.benchmark()
-#         y = module.GetOutput("output_0", [batch, 28, 28, 256])
-#         np.testing.assert_allclose(Y_np,
-#                                    np.transpose(y, (0, 3, 1, 2)),
-#                                    atol=1e-2, rtol=1e-2)
 
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class ConvTestCase(unittest.TestCase):
+    def test_fp16(self):
+        target = detect_target()
+        batch_size = [2, 32]
+        X = Tensor(
+            shape=[IntVar(values=batch_size, name="input_batch"), 24, 24, 4],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[36, 3, 3, 4], dtype="float16", name="input_1", is_input=True)
+        OP = ops.conv2d(stride=2, pad=1, dilate=1)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(
+            Y,
+            target,
+            "./tmp",
+            "dynamic_conv",
+            dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+        )
+        for batch in batch_size:
+            print("Test batch: %d" % batch)
+            X_pt = torch.randn(batch, 4, 24, 24).cuda().half()
+            W_pt = torch.randn(36, 4, 3, 3).cuda().half()
+            Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, stride=2, padding=1)
+            x = X_pt.permute((0, 2, 3, 1)).contiguous()
+            w = W_pt.permute((0, 2, 3, 1)).contiguous()
+            y = torch.empty([batch, 12, 12, 36]).cuda().half()
+            module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+            y_transpose = y.permute((0, 3, 1, 2))
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-# test_fp16()
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index a46f3df22..54aaa513c 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#  Copyright (c) Meta Platform, Inc. and its affiliates"""
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -132,6 +132,7 @@ def _test_nms(
         rebuild=True,
         test_name="efficient_nms",
         benchmark_shapes=False,
+        copy_op=False,
     ):
         X1 = Tensor(
             shape=[batch_size, N, num_classes, 4],
@@ -147,12 +148,15 @@ def _test_nms(
             is_input=True,
         )
 
-        Y = ops.efficient_nms(
+        OP = ops.efficient_nms(
             preNmsTop=preNmsTop,
             nmsMaxOut=nmsMaxOut,
             iouThreshold=iouThreshold,
             minBoxSize=minBoxSize,
-        )(X1, X2)
+        )
+        if copy_op:
+            OP = ops.efficient_nms(**OP._get_op_attributes())
+        Y = OP(X1, X2)
         mark_output(Y)
 
         boxes, scores = self._create_tensors(N, rand=rand_box)
@@ -247,6 +251,7 @@ def test_nms(self):
         #     test_name="nms1",
         # )
 
+        """
         self._test_nms(
             N=30,
             preNmsTop=30,
@@ -258,6 +263,7 @@ def test_nms(self):
             rand_box=False,
             test_name="nms1",
         )
+        """
         self._test_nms(
             N=30,
             preNmsTop=30,
@@ -269,6 +275,18 @@ def test_nms(self):
             rand_box=False,
             test_name="nms2",
         )
+        self._test_nms(
+            N=30,
+            preNmsTop=30,
+            nmsMaxOut=10,
+            iouThreshold=0.5,
+            minBoxSize=0,
+            batch_size=2,
+            num_classes=4,
+            rand_box=False,
+            test_name="nms2_copy_op",
+            copy_op=True,
+        )
 
     @unittest.skip("manually enable it for benchmarking")
     def test_nms_benchmark_shapes(self):
diff --git a/tests/unittest/ops/test_fpn_roi_align.py b/tests/unittest/ops/test_fpn_roi_align.py
index c4e2f2ea8..22e6ab870 100644
--- a/tests/unittest/ops/test_fpn_roi_align.py
+++ b/tests/unittest/ops/test_fpn_roi_align.py
@@ -53,6 +53,7 @@ def _test_fpn_roi_align(
         im_shape=(512, 512),
         rebuild=True,
         bench=False,
+        copy_op=False,
     ):
         HH, WW = im_shape
         target = detect_target()
@@ -81,6 +82,8 @@ def _test_fpn_roi_align(
             continuous_coordinate=True,
             im_shape=im_shape,
         )
+        if copy_op:
+            OP = ops.multi_level_roi_align(**OP._get_op_attributes())
         Y = OP(P2, P3, P4, P5, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -194,6 +197,17 @@ def test_fpn_roi_align(self):
             rebuild=1,
             test_name="fpn_roi_align",
         )
+        self._test_fpn_roi_align(
+            boxes,
+            features,
+            CC=C,
+            num_rois=boxes.shape[0],
+            im_shape=(H, W),
+            pooled_size=7,
+            rebuild=1,
+            test_name="fpn_roi_align_copy_op",
+            copy_op=True,
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 2798dc774..f611ecf1b 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -25,13 +25,16 @@
 from aitemplate.compiler import compile_model, ops, transform
 from aitemplate.compiler.base import IntImm
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.utils import shape_utils
 
+ait_dtype_to_pytorch = {"float16": torch.float16}
+
 
 class FusedElementwiseTestCase(unittest.TestCase):
-    def test_fused_elementwise_constructor(self):
+    def _test_fused_elementwise_constructor(self, ait_dtype):
         BATCH_SIZE = 1024
         M = 256
         K = 128
@@ -42,13 +45,13 @@ def test_fused_elementwise_constructor(self):
         op2._attrs["name"] = "e2"
         X1 = Tensor(
             shape=[BATCH_SIZE, M, K],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=ait_dtype,
             name="X2",
             value=3.0,
         )
@@ -69,27 +72,32 @@ def test_fused_elementwise_constructor(self):
         self.assertEqual(fused_op._attrs["inputs"], [X1])
         self.assertEqual(fused_op._attrs["outputs"], [X4])
 
-        self.assertEqual(X4._attrs["src_ops"], {fused_op})
-        self.assertEqual(X1._attrs["dst_ops"], {fused_op})
+        self.assertEqual(X4._attrs["src_ops"], StableSet({fused_op}))
+        self.assertEqual(X1._attrs["dst_ops"], StableSet({fused_op}))
 
         self.assertEqual(fused_op._attrs["depth"], 0)
         self.assertEqual(X1._attrs["depth"], 0)
         self.assertEqual(X4._attrs["depth"], 2)
 
-    def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name):
+    def test_fused_elementwise_constructor(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_fused_elementwise_constructor(ait_dtype)
+
+    def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype):
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=[
                 shape_utils.gen_int_var_min_max(batch_sizes),
                 shape_utils.gen_int_var_min_max(ms),
                 shape_utils.gen_int_var_min_max(ks),
             ],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=ait_dtype,
             name="X2",
             value=3.0,
         )
@@ -110,55 +118,70 @@ def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name):
         for batch_size in batch_sizes:
             for m in ms:
                 for k in ks:
-                    x1_pt = torch.randn(batch_size, m, k).cuda().half()
+                    x1_pt = torch.randn(batch_size, m, k).cuda().to(dtype=torch_dtype)
                     x4_pt = torch.tanh(x1_pt + 3.0)
 
-                    x4 = torch.empty([batch_size, m, k]).cuda().half()
+                    x4 = torch.empty([batch_size, m, k]).cuda().to(dtype=torch_dtype)
                     module.run_with_tensors([x1_pt], [x4])
                     self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
     def test_fused_elementwise_e2e(self):
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[1024], ms=[256], ks=[128], test_name="static_shapes"
-        )
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[1, 99, 998, 1024],
-            ms=[256],
-            ks=[128],
-            test_name="dynamic_batch_size",
-        )
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[1024], ms=[1, 128, 256], ks=[128], test_name="dynamic_m"
-        )
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[1024], ms=[256], ks=[1, 3, 8, 128], test_name="dynamic_k"
-        )
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[700, 80, 1024],
-            ms=[23, 78, 256],
-            ks=[10, 30, 128],
-            test_name="dynamic_all",
-        )
-
-    def test_fused_elementwise_kernel1(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[1024],
+                ms=[256],
+                ks=[128],
+                test_name=f"static_shapes_{ait_dtype}",
+                ait_dtype=ait_dtype,
+            )
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[1, 99, 998, 1024],
+                ms=[256],
+                ks=[128],
+                test_name=f"dynamic_batch_size_{ait_dtype}",
+                ait_dtype=ait_dtype,
+            )
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[1024],
+                ms=[1, 128, 256],
+                ks=[128],
+                test_name=f"dynamic_m_{ait_dtype}",
+                ait_dtype=ait_dtype,
+            )
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[1024],
+                ms=[256],
+                ks=[1, 3, 8, 128],
+                test_name=f"dynamic_k_{ait_dtype}",
+                ait_dtype=ait_dtype,
+            )
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[700, 80, 1024],
+                ms=[23, 78, 256],
+                ks=[10, 30, 128],
+                test_name=f"dynamic_all_{ait_dtype}",
+                ait_dtype=ait_dtype,
+            )
+
+    def _test_fused_elementwise_kernel1(self, ait_dtype):
         BATCH_SIZE = 1024
         M = 1496
-
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(BATCH_SIZE), IntImm(2), IntImm(M)],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=ait_dtype,
             name="constant_number",
             value=1.0,
         )
         X3 = Tensor(
             shape=[IntImm(2), IntImm(M)],
-            dtype="float16",
+            dtype=ait_dtype,
             name="constant_matrix",
             is_input=True,
         )
@@ -172,21 +195,28 @@ def test_fused_elementwise_kernel1(self):
         X9._attrs["name"] = "output0"
 
         target = detect_target()
-        module = compile_model(X9, target, "./tmp", "fused_elementwise_kernel1")
+        module = compile_model(
+            X9, target, "./tmp", f"fused_elementwise_kernel1_{ait_dtype}"
+        )
 
-        x1_pt = torch.randn(BATCH_SIZE, 2, M).cuda().half()
-        x3_pt = torch.randn(2, M).cuda().half()
+        x1_pt = torch.randn(BATCH_SIZE, 2, M).cuda().to(dtype=torch_dtype)
+        x3_pt = torch.randn(2, M).cuda().to(dtype=torch_dtype)
         x9_pt = torch.sign(x1_pt) * torch.log1p(torch.abs(x1_pt)) * x3_pt
 
         inputs = {"input0": x1_pt, "constant_matrix": x3_pt}
-        x9 = torch.empty([BATCH_SIZE, 2, M]).cuda().half()
+        x9 = torch.empty([BATCH_SIZE, 2, M]).cuda().to(dtype=torch_dtype)
         module.run_with_tensors(inputs, [x9])
         self.assertTrue(torch.allclose(x9, x9_pt, atol=1e-2, rtol=1e-2))
 
-    def _test_sigmoid(self, input_size, test_name="sigmoid"):
+    def test_fused_elementwise_kernel1(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_fused_elementwise_kernel1(ait_dtype)
+
+    def _test_sigmoid(self, input_size, test_name, ait_dtype):
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
@@ -197,23 +227,25 @@ def _test_sigmoid(self, input_size, test_name="sigmoid"):
         target = detect_target()
         module = compile_model(X2, target, "./tmp", test_name)
 
-        x1_pt = torch.randn(input_size).cuda().half()
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
         x2_pt = torch.sigmoid(x1_pt)
 
-        x2 = torch.empty(input_size).cuda().half()
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     def test_sigmoid(self):
-        self._test_sigmoid([1024, 2 * 1496], "sigmoid_1")
-        self._test_sigmoid([1024, 23744], "sigmoid_2")
-        self._test_sigmoid([1024, 70144], "sigmoid_3")
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_sigmoid([1024, 2 * 1496], f"sigmoid_1_{ait_dtype}", ait_dtype)
+            self._test_sigmoid([1024, 23744], f"sigmoid_2_{ait_dtype}", ait_dtype)
+            self._test_sigmoid([1024, 70144], f"sigmoid_3_{ait_dtype}", ait_dtype)
 
-    def _test_tanh(self, input_size, test_name="tanh"):
+    def _test_tanh(self, input_size, test_name, ait_dtype):
         assert len(input_size) == 2
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
@@ -224,31 +256,123 @@ def _test_tanh(self, input_size, test_name="tanh"):
         target = detect_target()
         module = compile_model(X2, target, "./tmp", test_name)
 
-        x1_pt = torch.randn(input_size).cuda().half()
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
         x2_pt = torch.tanh(x1_pt)
 
-        x2 = torch.empty(input_size).cuda().half()
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     def test_tanh(self):
-        self._test_tanh([1024, 22400], "tanh_1")
-        self._test_tanh([1024, 70144], "tanh_2")
-        self._test_tanh([1024, 23744], "tanh_3")
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_tanh([1024, 22400], f"tanh_1_{ait_dtype}", ait_dtype)
+            self._test_tanh([1024, 70144], f"tanh_2_{ait_dtype}", ait_dtype)
+            self._test_tanh([1024, 23744], f"tanh_3_{ait_dtype}", ait_dtype)
+
+    def _test_gelu(self, input_size, test_name, ait_dtype, fast_gelu=False):
+        assert len(input_size) == 2
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=ait_dtype,
+            name="input0",
+            is_input=True,
+        )
+        if fast_gelu:
+            X2 = ops.elementwise(FuncEnum.FASTGELU)(X1)
+        else:
+            X2 = ops.elementwise(FuncEnum.GELU)(X1)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", test_name)
+
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x2_pt = torch.nn.functional.gelu(x1_pt)
+
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
+        module.run_with_tensors([x1_pt], [x2])
+        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
+    def test_gelu(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_gelu([1024, 22400], f"gelu_1_{ait_dtype}", ait_dtype)
+            self._test_gelu([1024, 70144], f"fast_gelu_1_{ait_dtype}", ait_dtype, True)
+
+    def _test_power(self, input_size, exp, test_name, ait_dtype):
+        print(f"Running test {test_name} with exp = {exp}")
+        assert len(input_size) == 2
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=ait_dtype,
+            name="input0",
+            is_input=True,
+        )
+        X2 = ops.elementwise(FuncEnum.POW)(X1, exp)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", test_name)
+
+        if abs(exp) < 1.0:
+            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype) + 0.5
+        else:
+            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x2_pt = torch.pow(x1_pt, exp)
+
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
+        module.run_with_tensors([x1_pt], [x2])
+        # t, _, _ = module.benchmark_with_tensors([x1_pt], [x2], count=1000)
+        # bw = input_size[0] * input_size[1] * 2 * 2 / (t * 1e9 * 1e-3)
+        # print(f"BW: {bw} GB/s")
+        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True))
+
+    def test_power(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            for i, exp in enumerate(
+                [0.0, 1.0, 2.0, 3.0, -2.0, 0.5, -0.5, -1.0, 2.5, -2.5]
+            ):
+                input_sizes = [1024, 22400]
+                self._test_power(
+                    input_sizes,
+                    exp,
+                    f"pow_{input_sizes[0]}_{input_sizes[1]}_{i}_{ait_dtype}",
+                    ait_dtype,
+                )
+
+            for i, exp in enumerate(
+                [0.0, 1.0, 2.0, 3.0, -2.0, 0.5, -0.5, -1.0, 2.5, -2.5]
+            ):
+                input_sizes = [1025, 22401]
+                self._test_power(
+                    input_sizes,
+                    exp,
+                    f"pow_{input_sizes[0]}_{input_sizes[1]}_{i}_{ait_dtype}",
+                    ait_dtype,
+                )
 
     def _test_min_max(
-        self, input_size: List[List[int]], test_name: str, is_min: bool, add_nans: bool
+        self,
+        input_size: List[List[int]],
+        test_name: str,
+        is_min: bool,
+        add_nans: bool,
+        ait_dtype,
     ) -> None:
         assert len(input_size) == 2
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X0 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input1",
             is_input=True,
         )
@@ -263,8 +387,8 @@ def _test_min_max(
         target = detect_target()
         module = compile_model(result, target, "./tmp", test_name)
 
-        x0_pt = torch.randn(input_size).cuda().half()
-        x1_pt = torch.randn(input_size).cuda().half()
+        x0_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
         if add_nans:
             x1_pt[0].fill_(float("nan"))
 
@@ -275,7 +399,7 @@ def _test_min_max(
         x2_np = x2_pt.cpu().numpy()
 
         inputs = {"input0": x0_pt, "input1": x1_pt}
-        x2 = torch.empty(input_size).cuda().half()
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors(inputs, [x2])
         x2 = x2.cpu().numpy()
 
@@ -287,20 +411,52 @@ def _test_min_max(
         np.testing.assert_allclose(x2, x2_np, atol=1e-2, rtol=1e-2)
 
     def test_min(self):
-        self._test_min_max([512, 512], test_name="min_1", is_min=True, add_nans=False)
-        self._test_min_max([512, 512], test_name="min_2", is_min=True, add_nans=True)
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_min_max(
+                [512, 512],
+                test_name=f"min_nonan_{ait_dtype}",
+                is_min=True,
+                add_nans=False,
+                ait_dtype=ait_dtype,
+            )
+            self._test_min_max(
+                [512, 512],
+                test_name=f"min_nan_{ait_dtype}",
+                is_min=True,
+                add_nans=True,
+                ait_dtype=ait_dtype,
+            )
 
     def test_max(self):
-        self._test_min_max([512, 512], test_name="max_1", is_min=False, add_nans=False)
-        self._test_min_max([512, 512], test_name="max_2", is_min=False, add_nans=True)
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_min_max(
+                [512, 512],
+                test_name=f"max_nonan_{ait_dtype}",
+                is_min=False,
+                add_nans=False,
+                ait_dtype=ait_dtype,
+            )
+            self._test_min_max(
+                [512, 512],
+                test_name=f"max_nan_{ait_dtype}",
+                is_min=False,
+                add_nans=True,
+                ait_dtype=ait_dtype,
+            )
 
     def _test_clamp(
-        self, input_size: List[List[int]], min_val: int, max_val: int, test_name: str
+        self,
+        input_size: List[List[int]],
+        min_val: int,
+        max_val: int,
+        test_name: str,
+        ait_dtype,
     ) -> None:
         assert len(input_size) == 2
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X0 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
@@ -311,32 +467,34 @@ def _test_clamp(
         target = detect_target()
         module = compile_model(result, target, "./tmp", test_name)
 
-        x0_pt = torch.randn(input_size).cuda().half()
+        x0_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
 
         x1_pt = torch.clamp(x0_pt, min_val, max_val)
 
-        x1 = torch.empty(input_size).cuda().half()
+        x1 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x0_pt], [x1])
 
         self.assertTrue(torch.allclose(x1, x1_pt, atol=1e-2, rtol=1e-2))
 
     def test_clamp(self):
-        self._test_clamp([512, 106], -1, 1, "clamp_0")
-        self._test_clamp([128, 46], None, 1, "clamp_1")
-        self._test_clamp([56, 265], -1, None, "clamp_2")
-        self._test_clamp([17, 123], 1, -1, "clamp_3")
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_clamp([512, 106], -1, 1, f"clamp_0_{ait_dtype}", ait_dtype)
+            self._test_clamp([128, 46], None, 1, f"clamp_1_{ait_dtype}", ait_dtype)
+            self._test_clamp([56, 265], -1, None, f"clamp_2_{ait_dtype}", ait_dtype)
+            self._test_clamp([17, 123], 1, -1, f"clamp_3_{ait_dtype}", ait_dtype)
 
-    def test_operator_overload(self):
+    def _test_operator_overload(self, ait_dtype):
         input_size = [4, 2]
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=input_size,
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=input_size,
-            dtype="float16",
+            dtype=ait_dtype,
             name="input1",
             is_input=True,
         )
@@ -345,21 +503,26 @@ def test_operator_overload(self):
         OUTPUT._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(OUTPUT, target, "./tmp", "test_op_overload")
+        module = compile_model(OUTPUT, target, "./tmp", f"test_op_overload_{ait_dtype}")
 
-        x1_pt = torch.randn(input_size).cuda().half()
-        x2_pt = torch.randn(input_size).cuda().half()
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x2_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
         output_pt = -torch.tanh(x1_pt + x2_pt) + torch.tanh(x2_pt) + torch.tanh(x1_pt)
 
-        output = torch.empty(input_size).cuda().half()
+        output = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt, x2_pt], [output])
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
-    def test_operator_overload_with_constant_number(self):
+    def test_operator_overload(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_operator_overload(ait_dtype)
+
+    def _test_operator_overload_with_constant_number(self, ait_dtype):
         input_size = [4, 2]
+        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=input_size,
-            dtype="float16",
+            dtype=ait_dtype,
             name="input0",
             is_input=True,
         )
@@ -368,14 +531,18 @@ def test_operator_overload_with_constant_number(self):
         OUTPUT._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(OUTPUT, target, "./tmp", "test_op_overload")
+        module = compile_model(OUTPUT, target, "./tmp", f"test_op_overload_{ait_dtype}")
 
-        x1_pt = torch.randn(input_size).cuda().half()
+        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
         output_pt = 10 / torch.tanh(x1_pt + 5) - math.cos(10)
-        output = torch.empty(input_size).cuda().half()
+        output = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt], [output])
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
+    def test_operator_overload_with_constant_number(self):
+        for ait_dtype in ait_dtype_to_pytorch.keys():
+            self._test_operator_overload_with_constant_number(ait_dtype)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index b3a6a6b47..15ad326cd 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -60,6 +60,52 @@ def test_rcr(self):
             self._test_rcr([8], 0, 4, "zero_k")
             self._test_rcr([0], 8, 4, "zero_m")
 
+    def _test_rcr_dynamic_n(self, ms, k, ns, test_name):
+        target = detect_target()
+        X = Tensor(
+            shape=[shape_utils.gen_int_var_min_max(ms), k],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[shape_utils.gen_int_var_min_max(ns), k],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        OP = ops.gemm_rcr()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "gemm_rcr_{}".format(test_name))
+
+        for m in ms:
+            for n in ns:
+                X_pt = torch.randn(m, k).cuda().half()
+                W_pt = torch.randn(n, k).cuda().half()
+                Y_pt = torch.nn.functional.linear(X_pt, W_pt)
+
+                inputs = {"input_0": X_pt, "input_1": W_pt}
+                y = torch.empty([m, n]).cuda().half()
+                module.run_with_tensors(inputs, [y])
+
+                # from aitemplate.testing.benchmark_pt import benchmark_torch_function
+                # module.benchmark_with_tensors(inputs, [y], count=1000)
+                # t = benchmark_torch_function(1000, torch.nn.functional.linear, X_pt, W_pt)
+                # print(f"pt: {t} ms")
+
+                if X_pt.nelement() == 0 or W_pt.nelement() == 0:
+                    pass
+                else:
+                    self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rcr_dynamic_n(self):
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "umia_einsum_1")
+        self._test_rcr_dynamic_n(
+            [16, 1 * 29, 64], 256, [100000, 300000], "umia_einsum_dynamic_n"
+        )
+
     def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name):
         target = detect_target()
         X = Tensor(
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index f155d9c93..cbe0b9ce6 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -70,7 +70,8 @@ def test_rcr(self):
             # This test triggered a c10 assertion failure internally
             # caffe2/c10/util/SmallVector.h:338:
             # Assertion `idx < size()' failed
-
+            if type(target).__name__ != "FBCUDA":
+                self._test_rcr([2], N=64, K=0, test_name="zero_k")
             self._test_rcr([2], N=0, K=4, test_name="zero_n")
             self._test_rcr([0], N=4, K=4, test_name="zero_m")
 
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index 2226b77d1..ef2ccc365 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -227,6 +227,8 @@ def test_bias_rcr_add_add_relu(self):
             # This test triggered a c10 assertion failure internally
             # caffe2/c10/util/SmallVector.h:338:
             # Assertion `idx < size()' failed
+            if type(target).__name__ != "FBCUDA":
+                self._test_bias_rcr_add_add_relu(21, None, None, 0, 512)
 
     def _test_bias_rcr_mul(self, m, m0, m1, k, n):
         target = detect_target()
diff --git a/tests/unittest/ops/test_gemm_bias_hardswish.py b/tests/unittest/ops/test_gemm_bias_hardswish.py
index f82c97c39..c0e55201e 100644
--- a/tests/unittest/ops/test_gemm_bias_hardswish.py
+++ b/tests/unittest/ops/test_gemm_bias_hardswish.py
@@ -25,7 +25,7 @@ def hard_swish(x):
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasHardSwishTestCase(unittest.TestCase):
     def test_rcr(self):
         M = 128
         K = 1024
diff --git a/tests/unittest/ops/test_gemm_bias_permute.py b/tests/unittest/ops/test_gemm_bias_permute.py
index bde06b9d4..a98718d38 100644
--- a/tests/unittest/ops/test_gemm_bias_permute.py
+++ b/tests/unittest/ops/test_gemm_bias_permute.py
@@ -22,8 +22,8 @@
 
 
 @unittest.skipIf(detect_target().name() == "cuda", "Not supported by CUDA.")
-class GEMMTestCase(unittest.TestCase):
-    def test_gemm_rcr_bias_permute_m2n3(self):
+class GEMMBiasPermuteTestCase(unittest.TestCase):
+    def _test_gemm_rcr_bias_permute_m2n3(self, copy_op=False):
         M0 = 4
         M1 = 256
         N0 = 4
@@ -38,6 +38,8 @@ def test_gemm_rcr_bias_permute_m2n3(self):
         W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_permute(shape, layout="m2n3")
+        if copy_op:
+            OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -56,7 +58,11 @@ def test_gemm_rcr_bias_permute_m2n3(self):
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_permute_m3n2(self):
+    def test_gemm_rcr_bias_permute_m2n3(self):
+        self._test_gemm_rcr_bias_permute_m2n3()
+        self._test_gemm_rcr_bias_permute_m2n3(copy_op=True)
+
+    def _test_gemm_rcr_bias_permute_m3n2(self, copy_op=False):
         M0 = 4
         M1 = 16
         M2 = 32
@@ -71,6 +77,8 @@ def test_gemm_rcr_bias_permute_m3n2(self):
         W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_permute(shape, layout="m3n2")
+        if copy_op:
+            OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -88,7 +96,11 @@ def test_gemm_rcr_bias_permute_m3n2(self):
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_permute_m2n3(self):
+    def test_gemm_rcr_bias_permute_m3n2(self):
+        self._test_gemm_rcr_bias_permute_m3n2()
+        self._test_gemm_rcr_bias_permute_m3n2(copy_op=True)
+
+    def _test_gemm_rcr_permute_m2n3(self, copy_op=False):
         M0 = 4
         M1 = 256
         N0 = 4
@@ -102,6 +114,8 @@ def test_gemm_rcr_permute_m2n3(self):
         X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
         OP = ops.gemm_rcr_permute(shape, layout="m2n3")
+        if copy_op:
+            OP = ops.gemm_rcr_permute(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -119,6 +133,10 @@ def test_gemm_rcr_permute_m2n3(self):
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_gemm_rcr_permute_m2n3(self):
+        self._test_gemm_rcr_permute_m2n3()
+        self._test_gemm_rcr_permute_m2n3(copy_op=True)
+
     # ========== enable them after fix profiler =========
     # def test_gemm_rcr_bias_relu(self):
     #     M0 = 4
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index ea60e6ec5..8def037ea 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -20,7 +20,7 @@
 from aitemplate.testing import detect_target
 
 
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasReluTestCase(unittest.TestCase):
     def test_gemm_rcr_bias_relu(self):
         M = 128
         K = 1024
diff --git a/tests/unittest/ops/test_gemm_bias_sigmoid.py b/tests/unittest/ops/test_gemm_bias_sigmoid.py
index 98d9197af..48f57b030 100644
--- a/tests/unittest/ops/test_gemm_bias_sigmoid.py
+++ b/tests/unittest/ops/test_gemm_bias_sigmoid.py
@@ -20,7 +20,7 @@
 from aitemplate.testing import detect_target
 
 
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasSigmoidTestCase(unittest.TestCase):
     def test_rcr(self):
         M = 128
         K = 1024
diff --git a/tests/unittest/ops/test_gemm_bias_softmax.py b/tests/unittest/ops/test_gemm_bias_softmax.py
index 16dfb4487..62fd90727 100644
--- a/tests/unittest/ops/test_gemm_bias_softmax.py
+++ b/tests/unittest/ops/test_gemm_bias_softmax.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils import logger
 
 
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -29,6 +30,10 @@ def _test_gemm_rcr_bias_softmax(
         self, M=16, K=64, N=24, rebuild=True, test_name="gemm_bias_softmax"
     ):
         target = detect_target()
+        if type(target).__name__ == "FBCUDA":
+            logger.warning(__file__, "Skip this test for special profiling requirement")
+            return
+
         X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
diff --git a/tests/unittest/ops/test_gemm_bias_swish.py b/tests/unittest/ops/test_gemm_bias_swish.py
index 293dca0b9..bbffb1e3a 100644
--- a/tests/unittest/ops/test_gemm_bias_swish.py
+++ b/tests/unittest/ops/test_gemm_bias_swish.py
@@ -25,7 +25,7 @@ def swish(x):
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasSwishTestCase(unittest.TestCase):
     def test_rcr(self):
         M = 128
         K = 1024
diff --git a/tests/unittest/ops/test_gemm_bias_tanh.py b/tests/unittest/ops/test_gemm_bias_tanh.py
index de8358e20..27ef27c0b 100644
--- a/tests/unittest/ops/test_gemm_bias_tanh.py
+++ b/tests/unittest/ops/test_gemm_bias_tanh.py
@@ -24,7 +24,7 @@
 from aitemplate.utils import shape_utils
 
 
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasTanhTestCase(unittest.TestCase):
     def _test_rcr(self, Ms, test_name):
         K = 1024
         N = 64
diff --git a/tests/unittest/ops/test_gemm_permute.py b/tests/unittest/ops/test_gemm_permute.py
index b17340e35..e961c359f 100644
--- a/tests/unittest/ops/test_gemm_permute.py
+++ b/tests/unittest/ops/test_gemm_permute.py
@@ -24,7 +24,7 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMTestCase(unittest.TestCase):
-    def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False):
+    def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
@@ -35,9 +35,15 @@ def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False):
         W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
         B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
         if has_bias:
-            Y = ops.gemm_rcr_bias_permute(shape)(X, W, B)
+            OP = ops.gemm_rcr_bias_permute(shape)
+            if copy_op:
+                OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
+            Y = OP(X, W, B)
         else:
-            Y = ops.gemm_rcr_permute(shape)(X, W)
+            OP = ops.gemm_rcr_permute(shape)
+            if copy_op:
+                OP = ops.gemm_rcr_permute(**OP._get_op_attributes())
+            Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
@@ -62,10 +68,108 @@ def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False):
 
     def test_rcr(self):
         for has_bias in (True, False):
-            self._test_rcr([80], 32, 96, (5, 3, 2), "permute1", has_bias=has_bias)
-            self._test_rcr([128], 64, 256, (8, 4, 4), "permute2", has_bias=has_bias)
+            for copy_op in (True, False):
+                self._test_rcr(
+                    [80],
+                    32,
+                    96,
+                    (5, 3, 2),
+                    "permute1",
+                    has_bias=has_bias,
+                    copy_op=copy_op,
+                )
+                self._test_rcr(
+                    [128],
+                    64,
+                    256,
+                    (8, 4, 4),
+                    "permute2",
+                    has_bias=has_bias,
+                    copy_op=copy_op,
+                )
 
-    def _test_rrr(self, ms, k, n, shape, test_name):
+    def _test_rcr_0213(
+        self, ms, k, n, shape, test_name, has_bias=False, copy_op=False, layout="0213"
+    ):
+        target = detect_target()
+        X = Tensor(
+            shape=[shape_utils.gen_int_var_min_max(ms), k],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
+        B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
+        if has_bias:
+            OP = ops.gemm_rcr_bias_permute(shape, layout)
+            if copy_op:
+                OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
+            Y = OP(X, W, B)
+        else:
+            OP = ops.gemm_rcr_permute(shape, layout)
+            if copy_op:
+                OP = ops.gemm_rcr_permute(**OP._get_op_attributes())
+            Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
+
+        for m in ms:
+            X_pt = torch.randn(m, k).cuda().half()
+            W_pt = torch.randn(n, k).cuda().half()
+            B_pt = torch.randn(n).cuda().half()
+
+            def torch_f(x, w, b, has_bias, shape):
+                if has_bias:
+                    Y_l = torch.nn.functional.linear(x, w, b)
+                else:
+                    Y_l = torch.nn.functional.linear(x, w)
+                t1, t2 = shape
+                Y_r = Y_l.reshape(m // t1, t1, t2, n // t2)
+                Y_pt = torch.permute(Y_r, [0, 2, 1, 3])
+                Y_out = Y_pt.reshape([m // t1, t2, -1])
+                return Y_pt, Y_out
+
+            Y_pt, _ = torch_f(X_pt, W_pt, B_pt, has_bias, shape)
+
+            inputs = {"input_0": X_pt, "input_1": W_pt}
+            if has_bias:
+                inputs["input_2"] = B_pt
+            y = torch.empty(Y_pt.shape).cuda().half()
+            module.run_with_tensors(inputs, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+            # module.benchmark_with_tensors(inputs, [y], count=1000)
+            # from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            # t = benchmark_torch_function(
+            #     1000, torch_f, X_pt, W_pt, B_pt, has_bias, shape
+            # )
+            # print(f"pt: {t} ms/iter")
+
+    def test_rcr_0213(self):
+        self._test_rcr_0213(
+            [54],
+            256,
+            4000000,
+            [54, 1000000],
+            "permute_0213_1",
+            has_bias=False,
+            copy_op=False,
+            layout="0213",
+        )
+        self._test_rcr_0213(
+            [29, 29 * 8],
+            256,
+            300000,
+            [29, 100000],
+            "permute_0213_2",
+            has_bias=False,
+            copy_op=False,
+            layout="0213",
+        )
+
+    def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
@@ -75,6 +179,8 @@ def _test_rrr(self, ms, k, n, shape, test_name):
         )
         W = Tensor(shape=[k, n], dtype="float16", name="input_1", is_input=True)
         OP = ops.gemm_rrr_permute(shape)
+        if copy_op:
+            OP = ops.gemm_rrr_permute(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -94,6 +200,7 @@ def _test_rrr(self, ms, k, n, shape, test_name):
     def test_rrr(self):
         self._test_rrr([80], 32, 96, (5, 3, 2), "permute1")
         self._test_rrr([128], 64, 256, (8, 4, 4), "permute2")
+        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_copy_op", copy_op=True)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index 5a4dba5cc..ce9eb31ba 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -24,7 +24,7 @@
 from aitemplate.utils import shape_utils
 
 
-class GEMMTestCase(unittest.TestCase):
+class GEMMRcrBiasFastGeluTestCase(unittest.TestCase):
     def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
         K = 1024
         N = 64
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
new file mode 100644
index 000000000..a95397f60
--- /dev/null
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -0,0 +1,91 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import math
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
+
+
+class NewGELUActivation(torch.nn.Module):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__()
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return (
+            0.5
+            * input
+            * (
+                1.0
+                + torch.tanh(
+                    math.sqrt(2.0 / math.pi)
+                    * (input + 0.044715 * torch.pow(input, 3.0))
+                )
+            )
+        )
+
+
+class GEMMRcrFastGeluTestCase(unittest.TestCase):
+    def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
+        K = 1024
+        N = 64
+        target = detect_target()
+        MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
+        X = Tensor(
+            shape=[MDim, IntImm(K)], dtype="float16", name="input_0", is_input=True
+        )
+        W = Tensor(
+            shape=[IntImm(N), IntImm(K)], dtype="float16", name="input_1", is_input=True
+        )
+
+        OP = ops.gemm_rcr_fast_gelu()
+
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", f"gemm_rcr_fast_gelu_{test_name}")
+
+        for M in Ms:
+            logging.info(f"Testing {M=}")
+
+            X_pt = torch.randn(M, K).cuda().half()
+            W_pt = torch.randn(N, K).cuda().half()
+            Y_pt = NewGELUActivation()(torch.nn.functional.linear(X_pt, W_pt))
+            y = torch.empty([M, N]).cuda().half()
+            module.run_with_tensors(
+                {"input_0": X_pt, "input_1": W_pt},
+                [y],
+            )
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rcr(self):
+        self._test_rcr([128], "static", use_fast_gelu=True)
+        if detect_target().name() == "cuda":
+            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=True)
+            self._test_rcr([128], "static", use_fast_gelu=False)
+            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index 252076212..b8279891f 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -23,7 +23,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GEMMRrrSmallNKTestCase(unittest.TestCase):
     def _test_rrr(self, M, N, K, use_fp16_acc=True):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         X = Tensor(shape=[*M, K], dtype="float16", name="input_0", is_input=True)
diff --git a/tests/unittest/ops/test_gemm_softmax.py b/tests/unittest/ops/test_gemm_softmax.py
index 14beb65cd..5bf34a3f5 100644
--- a/tests/unittest/ops/test_gemm_softmax.py
+++ b/tests/unittest/ops/test_gemm_softmax.py
@@ -20,15 +20,20 @@
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils import logger
 
 
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 @unittest.skip("GEMM + Softmax is disabled for now")
-class GEMMTestCase(unittest.TestCase):
+class GEMMSoftmaxTestCase(unittest.TestCase):
     def _test_gemm_rcr_softmax(
         self, M=16, K=64, N=24, rebuild=True, test_name="gemm_softmax"
     ):
         target = detect_target()
+        if type(target).__name__ == "FBCUDA":
+            logger.warning(__file__, "Skip this test for special profiling requirement")
+            return
+
         X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
         OP = ops.gemm_rcr_softmax()
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index b29c2a399..7b07fcbc3 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -24,14 +24,14 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GroupGEMMRcrTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             param(False, "group_gemm_rcr_run_once"),
             param(True, "group_gemm_rcr_run_twice"),
         ]
     )
-    def test_rcr_foo(self, run_twice: bool, test_name: str):
+    def test_rcr(self, run_twice: bool, test_name: str):
         M = 256
         K1 = 128
         N1 = 60
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index 3db0fb882..159f3b0ff 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -25,7 +25,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GroupGEMMRcrBiasTestCase(unittest.TestCase):
     def test_rcr(self):
         M = 256
         K1 = 128
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index f9a998200..a533410a6 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -23,7 +23,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
     def test_rcr_relu(self):
         M = 256
         K1 = 128
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index 54d9c124c..bf8af1bb4 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -25,7 +25,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GroupGEMMRcrCatTestCase(unittest.TestCase):
     def test_rcr_bias_cat(self):
         M = 256
         K1 = 128
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index 9b22c0ae1..cb4ff4986 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -25,7 +25,7 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GroupGEMMRcrCatTestCase(unittest.TestCase):
     def test_rcr_cat(self):
         M = 256
         K1 = 128
diff --git a/tests/unittest/ops/test_groupnorm.py b/tests/unittest/ops/test_groupnorm.py
index 126c95016..484db1d25 100644
--- a/tests/unittest/ops/test_groupnorm.py
+++ b/tests/unittest/ops/test_groupnorm.py
@@ -25,6 +25,7 @@
 from aitemplate.utils import logger
 
 
+@unittest.skipIf(detect_target()._arch == "75", "Skip GN on sm75.")
 class GroupnormTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(GroupnormTestCase, self).__init__(*args, **kwargs)
@@ -39,6 +40,7 @@ def _test_groupnorm(
         use_size_op=False,
         eps=1e-5,
         use_swish=False,
+        copy_op=False,
     ):
         test_name = "group_norm_swish" if use_swish else "group_norm"
         logger.info(
@@ -65,7 +67,10 @@ def _test_groupnorm(
         )
 
         op_name = "group_norm_swish" if use_swish else "group_norm"
-        X4 = getattr(ops, op_name)(num_groups, num_channels)(X1, X2, X3, eps)
+        OP = getattr(ops, op_name)(num_groups, num_channels)
+        if copy_op:
+            OP = getattr(ops, op_name)(**OP._get_op_attributes())
+        X4 = OP(X1, X2, X3, eps)
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
@@ -110,17 +115,26 @@ def _test_groupnorm(
         )
         self.test_count += 1
 
-    def test_layernorm(self):
+    def test_groupnorm(self):
         self._test_groupnorm()
+        self._test_groupnorm(x_shape=[3, 3, 1, 4], num_groups=2, eps=1e-5)
+        self._test_groupnorm(x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 16, 16, 8192], num_groups=32, eps=1e-3)
         self._test_groupnorm(x_shape=[3, 64, 64, 128], num_groups=16, eps=1e-5)
         self._test_groupnorm(x_shape=[3, 33, 64, 120], num_groups=10, eps=1e-5)
         self._test_groupnorm(x_shape=[8, 34, 10, 72], num_groups=6, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 8, 1, 64], num_groups=32, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 8, 1, 4], num_groups=2, eps=1e-5)
+        self._test_groupnorm(x_shape=[1, 8, 1, 4], num_groups=2, eps=1e-5, copy_op=True)
 
-    def test_layernorm_swish(self):
+    def test_groupnorm_swish(self):
         self._test_groupnorm(use_swish=True)
+        self._test_groupnorm(
+            x_shape=[3, 3, 1, 4], num_groups=2, eps=1e-5, use_swish=True
+        )
+        self._test_groupnorm(
+            x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5, use_swish=True
+        )
 
         shapes = [
             (2, 16, 16, 1280),
@@ -143,6 +157,9 @@ def test_layernorm_swish(self):
 
         for shape in shapes:
             self._test_groupnorm(x_shape=shape, num_groups=32, eps=1e-5, use_swish=True)
+            self._test_groupnorm(
+                x_shape=shape, num_groups=32, eps=1e-5, use_swish=True, copy_op=True
+            )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
new file mode 100644
index 000000000..075e8f79c
--- /dev/null
+++ b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
@@ -0,0 +1,114 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class IntElementwiseReshapeOpTestCase(unittest.TestCase):
+    def test_int_elementwise_reshape_op(
+        self,
+        batch_size=(1, 3),
+        x1_size=(2, 3),
+        X_shape=(32, 64),
+        test_name="elementwise_reshape_op",
+    ):
+        target = detect_target()
+        b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
+        x1_dim = shape_utils.gen_int_var_min_max(x1_size, name="input_size")
+        X = Tensor(
+            shape=[b_dim, x1_dim, *X_shape],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+
+        Y1 = ops.size()(X)
+        Y2 = ops.getitem()(Y1, 0)
+        Y3 = ops.getitem()(Y1, 1)
+        Y4 = ops.getitem()(Y1, 2)
+        Y5 = ops.getitem()(Y1, 3)
+        Y6 = Y2 * Y3  # infer_shape intvar[2,9]
+        Y = ops.reshape()(X, [Y6, Y4, Y5])
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        for b, x1 in zip(batch_size, x1_size):
+            X_shape_pt = (b, x1, *X_shape)
+            X_pt = torch.randn(X_shape_pt).cuda().half()
+            Y_pt = X_pt.reshape(
+                X_shape_pt[1] * X_shape_pt[0], X_shape_pt[2], X_shape_pt[3]
+            )
+
+            y = torch.empty(Y_pt.size()).cuda().half()
+            module.run_with_tensors([X_pt], [y])
+
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_int_elementwise_reshape_op2(
+        self,
+        batch_size=(1, 3),
+        x1_size=(2, 3),
+        x2_size=(10, 32),
+        x3_size=(48, 64),
+        test_name="elementwise_reshape_op2",
+    ):
+        target = detect_target()
+        b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
+        x1_dim = shape_utils.gen_int_var_min_max(x1_size, name="x1_size")
+        x2_dim = shape_utils.gen_int_var_min_max(x2_size, name="x2_size")
+        x3_dim = shape_utils.gen_int_var_min_max(x3_size, name="x3_size")
+        X = Tensor(
+            shape=[b_dim, x1_dim, x2_dim, x3_dim],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+
+        Y1 = ops.size()(X)
+        Y2 = ops.getitem()(Y1, 0)
+        Y3 = ops.getitem()(Y1, 1)
+        Y4 = ops.getitem()(Y1, 2)
+        Y5 = ops.getitem()(Y1, 3)
+        f1 = ops.int_elementwise(FuncEnum.MUL)(Y4, Y5)
+
+        Y = ops.reshape()(X, [Y2 * Y3 * f1 / Y5, Y5])
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        for b, x1, x2, x3 in zip(batch_size, x1_size, x2_size, x3_size):
+            X_shape_pt = (b, x1, x2, x3)
+            X_pt = torch.randn(X_shape_pt).cuda().half()
+            Y_pt = X_pt.reshape(-1, X_shape_pt[3])
+
+            y = torch.empty(Y_pt.size()).cuda().half()
+            module.run_with_tensors([X_pt], [y])
+
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index e33399639..2f81e9b02 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -88,7 +88,7 @@ def _create_tensors(self, N):
 
         return torch.tensor(boxes).cuda().half(), torch.tensor(scores).cuda().half()
 
-    def test_nms(
+    def _test_nms(
         self,
         N=30,
         preNmsTop=30,
@@ -97,6 +97,7 @@ def test_nms(
         minBoxSize=0,
         num_classes=1,
         test_name="proposal_nms",
+        copy_op=False,
     ):
         target = detect_target()
 
@@ -114,16 +115,19 @@ def test_nms(
             is_input=True,
         )
 
-        X4 = ops.nms(
+        OP = ops.nms(
             preNmsTop=preNmsTop,
             nmsMaxOut=nmsMaxOut,
             iouThreshold=iouThreshold,
             minBoxSize=minBoxSize,
-        )(X1, X2)
+        )
+        if copy_op:
+            OP = ops.nms(**OP._get_op_attributes())
+        X4 = OP(X1, X2)
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
-        module = compile_model(X4, target, "./tmp", test_name)
+        module = compile_model(X4, target, "./tmp", test_name + str(copy_op))
 
         boxes, scores = self._create_tensors(N)
         idxs = torch.randint(0, num_classes, (N,)).cuda().half()
@@ -149,7 +153,13 @@ def test_nms(
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(ref_box.cuda(), y, atol=1e-2, rtol=1e-2))
 
-    def test_topk_nms(self, batch_size=1, N=30, topK=30, iou=0.5, test_name="topk_nms"):
+    def test_nms(self):
+        self._test_nms()
+        self._test_nms(copy_op=True)
+
+    def _test_topk_nms(
+        self, batch_size=1, N=30, topK=30, iou=0.5, test_name="topk_nms", copy_op=False
+    ):
 
         target = detect_target()
         if target.name() == "rocm":
@@ -171,7 +181,10 @@ def model():
             )
             score_inds = ops.topk(k=topK)(X_scores)
             bboxes = ops.batch_gather()(X_boxes, score_inds)
-            keep = ops.batched_nms(iou_threshold=iou, keep_n=N)(bboxes)
+            OP = ops.batched_nms(iou_threshold=iou, keep_n=N)
+            if copy_op:
+                OP = ops.batched_nms(**OP._get_op_attributes())
+            keep = OP(bboxes)
             return keep, score_inds
 
         Y = model()
@@ -202,6 +215,10 @@ def model():
         y = score_inds[index]
         np.testing.assert_allclose(y_np, y, atol=1e-2, rtol=1e-2)
 
+    def test_topk_nms(self):
+        self._test_topk_nms()
+        self._test_topk_nms(copy_op=True)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_norm.py b/tests/unittest/ops/test_norm.py
index 705b727b4..1d6abeba6 100644
--- a/tests/unittest/ops/test_norm.py
+++ b/tests/unittest/ops/test_norm.py
@@ -39,6 +39,7 @@ def _run_vector_norm(
         keepdim=False,
         input_type="float16",
         output_type=None,
+        copy_op=False,
     ):
         torch.manual_seed(0)
         logging.info(
@@ -52,6 +53,8 @@ def _run_vector_norm(
         op = ops.vector_norm(
             ord_kind=ord_kind, dim=dim, keepdim=keepdim, dtype=output_type
         )
+        if copy_op:
+            op = ops.vector_norm(**op._get_op_attributes())
         Y = op(X)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -88,6 +91,16 @@ def _run_l2_norm(
             input_type=input_type,
             output_type=output_type,
         )
+        self._run_vector_norm(
+            test_name="l2_norm_copy_op",
+            ord_kind=2,
+            dim=dim,
+            input_shape=input_shape,
+            keepdim=keepdim,
+            input_type=input_type,
+            output_type=output_type,
+            copy_op=True,
+        )
 
     def test_l2_norm(self):
         self._run_l2_norm(dim=0, input_shape=[1], keepdim=True)
diff --git a/tests/unittest/ops/test_pad_last_dim.py b/tests/unittest/ops/test_pad_last_dim.py
index 2fc7d9831..61f778d59 100644
--- a/tests/unittest/ops/test_pad_last_dim.py
+++ b/tests/unittest/ops/test_pad_last_dim.py
@@ -23,7 +23,7 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class PadLastDim(unittest.TestCase):
-    def test_static_shape_4d(self):
+    def _test_static_shape_4d(self, copy_op=False):
         NN = 2
         HH = 7
         WW = 7
@@ -31,11 +31,13 @@ def test_static_shape_4d(self):
         CO = 264
         X = Tensor(shape=[NN, HH, WW, CI], name="X", is_input=True)
         op = ops.pad_last_dim(4, CO)
+        if copy_op:
+            op = ops.pad_last_dim(**op._get_op_attributes())
         Y = op(X)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", "pad_last_dim4d")
+        module = compile_model(Y, target, "./tmp", f"pad_last_dim4d_{copy_op}")
 
         X_pt = torch.randn(NN, HH, WW, CI).cuda().half()
         Pad_pt = torch.zeros(NN, HH, WW, CO - CI).cuda().half()
@@ -45,6 +47,10 @@ def test_static_shape_4d(self):
         module.run_with_tensors([X_pt], [y])
         self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
 
+    def test_static_shape_4d(self):
+        self._test_static_shape_4d()
+        self._test_static_shape_4d(copy_op=True)
+
     def test_static_shape_2d(self):
         NN = 32
         CI = 259
diff --git a/tests/unittest/ops/test_permute.py b/tests/unittest/ops/test_permute.py
index f9f61aef9..f1e5b7456 100644
--- a/tests/unittest/ops/test_permute.py
+++ b/tests/unittest/ops/test_permute.py
@@ -26,16 +26,21 @@
 class PermuteTest(unittest.TestCase):
     @parameterized.expand(
         [
-            param((0, 2, 1), "permute_1"),
-            param((1, 0, 2), "permute_2"),
-            param((2, 1, 0), "permute_3"),
+            param((80, 300, 2), (0, 2, 1), "permute_1"),
+            param((80, 300, 2), (1, 0, 2), "permute_2"),
+            param((80, 300, 2), (2, 1, 0), "permute_3"),
+            param((5, 113, 15, 31), (0, 2, 1, 3), "permute_4"),
+            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4), "permute_5"),
+            param((8, 29, 100000, 3), (0, 2, 1, 3), "permute_6"),
+            param((32, 12, 4096, 64), (0, 2, 1, 3), "permute_7"),
+            param((1, 12, 128, 64), (0, 2, 1, 3), "permute_8"),
+            param((2, 3, 4, 5), (3, 2, 1, 0), "permute_9"),
+            param((3, 5, 128, 514), (2, 3, 0, 1), "permute_10"),
+            param((128, 512), (1, 0), "permute_11"),
         ]
     )
-    def test_static_shape_3d(self, dims, testname):
-        NN = 80
-        WW = 300
-        CI = 2
-        X = Tensor(shape=[NN, WW, CI], name="X", is_input=True)
+    def test_static_shape_3d(self, input_shapes, dims, testname):
+        X = Tensor(shape=input_shapes, name="X", is_input=True)
         op = ops.permute()
         Y = op(X, dims)
         Y._attrs["is_output"] = True
@@ -43,11 +48,22 @@ def test_static_shape_3d(self, dims, testname):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", testname)
 
-        X_pt = torch.randn(NN, WW, CI).cuda().half()
+        count = 1
+        for dim in input_shapes:
+            count *= dim
+        X_pt = torch.randn(input_shapes).cuda().half()
         Y_pt = torch.permute(X_pt, dims)
 
         y = torch.empty(Y_pt.size()).cuda().half()
         module.run_with_tensors([X_pt], [y])
+
+        # mean, _, _ = module.benchmark_with_tensors([X_pt], [y], count=1000)
+        # mem = 1
+        # for dim in input_shapes:
+        #     mem *= dim
+        # bw = 2 * 2 * mem / (mean * 1e-3 * 1e9)  # GB/s
+        # print(f"bw: {bw} GB/s")
+
         self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
 
 
diff --git a/tests/unittest/ops/test_reduce.py b/tests/unittest/ops/test_reduce.py
index d0dcbe964..b091c81c9 100644
--- a/tests/unittest/ops/test_reduce.py
+++ b/tests/unittest/ops/test_reduce.py
@@ -19,9 +19,10 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import dtype_to_torch_dtype, get_random_torch_tensor
+from aitemplate.utils import shape_utils
 
 logger = logging.getLogger(__name__)
 
@@ -362,6 +363,88 @@ def test_reduce_mean(self):
             output_type="float16",
         )
 
+    def _run_batched_reduce(
+        self,
+        *,
+        test_name,
+        reduce_op,
+        torch_reduce_op,
+        dim,
+        batch_sizes,
+        non_batch_shape,
+        keepdim,
+        input_type="float16",
+        output_type=None,
+    ):
+        torch.manual_seed(0)
+        logger.info(f"Test {batch_sizes=}, {non_batch_shape=}, {dim=}")
+        target = detect_target()
+
+        batch0_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        non_batch_dims = [IntImm(d) for d in non_batch_shape]
+        input_tensor_shape = [batch0_dim] + non_batch_dims
+        X = Tensor(
+            shape=input_tensor_shape, dtype=input_type, name="input_0", is_input=True
+        )
+
+        if keepdim is None:
+            op = reduce_op(dim, dtype=output_type)
+        else:
+            op = reduce_op(dim, keepdim=keepdim, dtype=output_type)
+        Y = op(X)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        for batch_size in batch_sizes:
+            input_shape = [batch_size] + non_batch_shape
+            X_pt = get_random_torch_tensor(input_shape, input_type)
+            dtype_pt = dtype_to_torch_dtype(output_type)
+            if keepdim is None:
+                Y_pt = torch_reduce_op(X_pt, dim, dtype=dtype_pt)
+            else:
+                Y_pt = torch_reduce_op(X_pt, dim, keepdim=keepdim, dtype=dtype_pt)
+
+            y = torch.empty(Y_pt.size()).cuda().half()
+            module.run_with_tensors([X_pt], [y])
+            y_pt = Y_pt.cpu().numpy()
+
+            np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=1e-2, rtol=1e-2)
+            self.test_count += 1
+
+    def _run_batched_reduce_sum(
+        self,
+        *,
+        dim,
+        batch_sizes,
+        non_batch_shape,
+        keepdim,
+        input_type="float16",
+        output_type=None,
+    ):
+        self._run_batched_reduce(
+            test_name="reduce_sum_batched",
+            reduce_op=ops.reduce_sum,
+            torch_reduce_op=torch.sum,
+            dim=dim,
+            batch_sizes=batch_sizes,
+            non_batch_shape=non_batch_shape,
+            keepdim=keepdim,
+            input_type=input_type,
+            output_type=output_type,
+        )
+
+    def test_batched_reduce_sum(self):
+        self._run_batched_reduce_sum(
+            dim=1,
+            batch_sizes=[10, 2048],
+            non_batch_shape=[2, 1944],
+            keepdim=True,
+            input_type="float16",
+            output_type=None,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_size_getitem_ops.py b/tests/unittest/ops/test_size_getitem_ops.py
index 2d1781506..c224d09f5 100644
--- a/tests/unittest/ops/test_size_getitem_ops.py
+++ b/tests/unittest/ops/test_size_getitem_ops.py
@@ -66,6 +66,7 @@ def _test_size_op_2(
         X_shape=(16, 32, 64),
         Y_shape=(-1, 16, 16, 128),
         test_name="tensor_size_op",
+        copy_op=False,
     ):
         target = detect_target()
         X1 = Tensor(
@@ -75,8 +76,13 @@ def _test_size_op_2(
             is_input=True,
         )
 
-        Y1 = ops.flatten(1, -1)(ops.elementwise(FuncEnum.ADD)(X1, X1))
-        Y2 = ops.flatten(1, -1)(ops.elementwise(FuncEnum.MUL)(X1, X1))
+        Y1_op = ops.flatten(1, -1)
+        Y2_op = ops.flatten(1, -1)
+        if copy_op:
+            Y1_op = ops.flatten(**Y1_op._get_op_attributes())
+            Y2_op = ops.flatten(**Y2_op._get_op_attributes())
+        Y1 = Y1_op(ops.elementwise(FuncEnum.ADD)(X1, X1))
+        Y2 = Y2_op(ops.elementwise(FuncEnum.MUL)(X1, X1))
         Y3 = ops.concatenate()([Y1, Y2], 0)
         dim = ops.size()(Y3, -4)  # test negative dim
         Y = ops.reshape()(Y2, [dim, -1])
@@ -106,6 +112,7 @@ def test_size_op(self):
         self._test_size_op([3, 1], (5, 4, 16), (-1, 8), "size_op_2")
 
         self._test_size_op_2(test_name="size_op_3")
+        self._test_size_op_2(test_name="size_op_3_copy_op", copy_op=True)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_topk.py b/tests/unittest/ops/test_topk.py
index 6a2c795bb..c963d1bd2 100644
--- a/tests/unittest/ops/test_topk.py
+++ b/tests/unittest/ops/test_topk.py
@@ -32,7 +32,13 @@ def _create_tensors(self, shape):
         return scores.reshape(shape).cuda().half()
 
     def _test_topk(
-        self, batch_size=1, shape=(2, 500), dim=0, topK=100, test_name="topk"
+        self,
+        batch_size=1,
+        shape=(2, 500),
+        dim=0,
+        topK=100,
+        test_name="topk",
+        copy_op=False,
     ):
 
         o_shape = list(shape)
@@ -44,7 +50,10 @@ def _test_topk(
             name="X",
             is_input=True,
         )
-        X4 = ops.topk(k=topK)(X1)
+        OP = ops.topk(k=topK)
+        if copy_op:
+            OP = ops.topk(**OP._get_op_attributes())
+        X4 = OP(X1)
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
@@ -61,11 +70,31 @@ def _test_topk(
 
     def test_topk_heap(self):
         self._test_topk(shape=(2000,), topK=100, test_name="topk_heap")
+        self._test_topk(
+            shape=(2000,), topK=100, test_name="topk_heap_copy_op", copy_op=True
+        )
         self._test_topk(shape=(4, 500), topK=100, dim=1, test_name="topk_heap2")
+        self._test_topk(
+            shape=(4, 500),
+            topK=100,
+            dim=1,
+            test_name="topk_heap2_copy_op",
+            copy_op=True,
+        )
 
     def test_topk_sort(self):
         self._test_topk(shape=(2000,), topK=300, test_name="topk_sort")
+        self._test_topk(
+            shape=(2000,), topK=300, test_name="topk_sort_copy_op", copy_op=True
+        )
         self._test_topk(shape=(4, 500), topK=200, dim=1, test_name="topk_sort2")
+        self._test_topk(
+            shape=(4, 500),
+            topK=200,
+            dim=1,
+            test_name="topk_sort2_copy_op",
+            copy_op=True,
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_transpose_conv2d.py b/tests/unittest/ops/test_transpose_conv2d.py
index 2f4dc7726..e36878b21 100644
--- a/tests/unittest/ops/test_transpose_conv2d.py
+++ b/tests/unittest/ops/test_transpose_conv2d.py
@@ -22,7 +22,7 @@
 
 
 class conv2dTransposeTestCase(unittest.TestCase):
-    def test_fp16(self, batch=32):
+    def _test_fp16(self, batch=32, copy_op=False):
         target = detect_target()
         if target.name() == "cuda" and int(target._arch) < 80:
             return
@@ -36,6 +36,8 @@ def test_fp16(self, batch=32):
             shape=[256, 2, 2, 256], dtype="float16", name="input_1", is_input=True
         )
         OP = ops.transposed_conv2d(stride=2, pad=0, dilate=1)
+        if copy_op:
+            OP = ops.transposed_conv2d(**OP._get_op_attributes())
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -52,6 +54,10 @@ def test_fp16(self, batch=32):
         y_transpose = y.permute((0, 3, 1, 2))
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_transpose_conv2d_bias.py b/tests/unittest/ops/test_transpose_conv2d_bias.py
index 90692c8cd..5ab0b6f70 100644
--- a/tests/unittest/ops/test_transpose_conv2d_bias.py
+++ b/tests/unittest/ops/test_transpose_conv2d_bias.py
@@ -23,7 +23,7 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class conv2dTransposeTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+    def _test_fp16(self, batch=4, copy_op=False):
         target = detect_target()
         if int(target._arch) < 80:
             return
@@ -38,6 +38,8 @@ def test_fp16(self, batch=4):
         )
         B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.transposed_conv2d_bias(stride=2, pad=0, dilate=1)
+        if copy_op:
+            OP = ops.transposed_conv2d_bias(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -61,6 +63,10 @@ def test_fp16(self, batch=4):
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_fp16()
+        self._test_fp16(copy_op=True)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_var.py b/tests/unittest/ops/test_var.py
index a3549f28d..edf92b362 100644
--- a/tests/unittest/ops/test_var.py
+++ b/tests/unittest/ops/test_var.py
@@ -38,6 +38,7 @@ def _run_var(
         keepdim=False,
         input_type="float16",
         output_type=None,
+        copy_op=False,
     ):
         torch.manual_seed(0)
         logging.info(
@@ -49,6 +50,8 @@ def _run_var(
         X = Tensor(shape=input_shape, dtype=input_type, name="input_0", is_input=True)
 
         op = ops.var(dim=dim, unbiased=unbiased, keepdim=keepdim, dtype=output_type)
+        if copy_op:
+            op = ops.var(**op._get_op_attributes())
         Y = op(X)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -80,6 +83,16 @@ def test_var(self):
         self._run_var(dim=1, unbiased=True, input_shape=[3, 2050, 2], keepdim=True)
         self._run_var(dim=0, unbiased=True, input_shape=[3001, 4, 2], keepdim=True)
         self._run_var(dim=-1, unbiased=True, input_shape=[1, 1000000, 6], keepdim=False)
+        self._run_var(
+            dim=0, unbiased=True, input_shape=[3001, 4, 2], keepdim=True, copy_op=True
+        )
+        self._run_var(
+            dim=-1,
+            unbiased=True,
+            input_shape=[1, 1000000, 6],
+            keepdim=False,
+            copy_op=True,
+        )
 
     def _run_batched_var(
         self, *, dim, unbiased, keepdim=False, input_type="float16", output_type=None
diff --git a/tests/unittest/test_stable_set.py b/tests/unittest/test_stable_set.py
new file mode 100644
index 000000000..3b5b92342
--- /dev/null
+++ b/tests/unittest/test_stable_set.py
@@ -0,0 +1,68 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for StableSet.
+"""
+import unittest
+
+from aitemplate.compiler.stable_set import StableSet
+
+
+class StableSetTestCase(unittest.TestCase):
+    def test_stable_set(self):
+        s = StableSet([5, 2, 1])
+
+        s.add(4)
+        s.add(5)
+        s.add(1)
+        self.assertEqual(s, StableSet([5, 2, 1, 4]))
+
+        s.discard(4)
+        s.discard(4)
+        s.discard(10)
+        self.assertEqual(s, StableSet([5, 2, 1]))
+
+        s.remove(1)
+        self.assertEqual(s, StableSet([5, 2]))
+        with self.assertRaises(KeyError):
+            s.remove(1)
+
+        s.update([1, 5, 9])
+        self.assertEqual(s, StableSet([5, 2, 1, 9]))
+
+        s1 = s.copy()
+        self.assertEqual(s, s1)
+        self.assertNotEqual(id(s._d), id(s1._d))
+
+        s1 = s - [1]
+        self.assertEqual(s1, StableSet([5, 2, 9]))
+        self.assertEqual(s, StableSet([5, 2, 1, 9]))
+
+        self.assertEqual(len(s), 4)
+
+        self.assertTrue(1 in s)
+        self.assertTrue(1 not in s1)
+
+        s1 = list(s)
+        self.assertEqual(s, StableSet(s1))
+
+        self.assertTrue(s >= StableSet([5, 2, 1, 9]))
+        self.assertTrue(s > StableSet([5, 1, 2]))
+        self.assertTrue(s <= StableSet([5, 2, 1, 9]))
+        self.assertTrue(s < StableSet([5, 2, 1, 9, 10]))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/util/test_debug_utils.py b/tests/unittest/util/test_debug_utils.py
new file mode 100644
index 000000000..b92181b0e
--- /dev/null
+++ b/tests/unittest/util/test_debug_utils.py
@@ -0,0 +1,138 @@
+#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for debug utils.
+"""
+import numpy as np
+import pytest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+
+def _test_inf_and_nan(
+    check_tensor, check_all, test_name, capfd: pytest.CaptureFixture[str]
+):
+    X1 = Tensor(
+        shape=[IntImm(1), IntImm(3)],
+        dtype="float16",
+        name="input0",
+        is_input=True,
+    )
+    X2_op = ops.elementwise(FuncEnum.DIV)
+    X2 = X2_op(X1, 0.0)
+    X2._attrs["is_output"] = True
+    X2._attrs["name"] = "output0"
+    X2._attrs["check_nan_and_inf"] = check_tensor
+
+    target = detect_target()
+    module = compile_model(
+        X2, target, "./tmp", test_name, check_all_nan_and_inf=check_all
+    )
+
+    x1_pt = torch.Tensor([[1.0, -2.0, 0.0]]).cuda().half()
+    x2 = torch.empty_like(x1_pt)
+    module.run_with_tensors([x1_pt], [x2])
+
+    out, _ = capfd.readouterr()
+    check_str = "Tensor (output0) contains NaN: 1, +INF: 1, -INF: 1, total elements: 3"
+    assert check_str in out
+
+
+def test_inf_and_nan(capfd):
+    _test_inf_and_nan(True, False, "test_inf_and_nan_tensor", capfd)
+    _test_inf_and_nan(False, True, "test_inf_and_nan_all", capfd)
+    _test_inf_and_nan(True, True, "test_inf_and_nan_both", capfd)
+
+
+def _test_outputs(
+    check_tensor, check_all, test_name, capfd: pytest.CaptureFixture[str]
+):
+    X1 = Tensor(
+        shape=[IntImm(1), IntImm(3)],
+        dtype="float16",
+        name="input0",
+        is_input=True,
+    )
+    X2_op = ops.elementwise(FuncEnum.MUL)
+    X2 = X2_op(X1, 1.3)
+    X2._attrs["is_output"] = True
+    X2._attrs["name"] = "output0"
+    X2._attrs["check_outputs"] = check_tensor
+
+    target = detect_target()
+    module = compile_model(X2, target, "./tmp", test_name, check_all_outputs=check_all)
+
+    x1_pt = torch.Tensor([[1.0, 1.5, 2.0]]).cuda().half()
+    x2 = torch.empty_like(x1_pt)
+    module.run_with_tensors([x1_pt], [x2])
+
+    out, _ = capfd.readouterr()
+    output_str = "Tensor (output0) output:"
+    assert out.find(output_str) != -1
+
+    out = out[len(output_str) :].strip()
+    values = out.split(", ")
+    assert len(values) == 3, f"Got {len(values)} outputs, expected 3"
+
+    values = [float(value) for value in values]
+    target_values = np.array([1.0, 1.5, 2.0]) * 1.3
+    assert np.allclose(
+        values, target_values, rtol=1e-2, atol=1e-2
+    ), f"Expected {target_values}, got {values} instead"
+
+
+def test_outputs(capfd):
+    _test_outputs(True, False, "test_outputs_tensor", capfd)
+    _test_outputs(False, True, "test_outputs_all", capfd)
+    _test_outputs(True, True, "test_outputs_both", capfd)
+
+
+def _test_special_outputs(
+    check_tensor, check_all, test_name, capfd: pytest.CaptureFixture[str]
+):
+    X1 = Tensor(
+        shape=[IntImm(1), IntImm(3)],
+        dtype="float16",
+        name="input0",
+        is_input=True,
+    )
+    X2_op = ops.elementwise(FuncEnum.DIV)
+    X2 = X2_op(X1, 0.0)
+    X2._attrs["is_output"] = True
+    X2._attrs["name"] = "output0"
+    X2._attrs["check_outputs"] = check_tensor
+
+    target = detect_target()
+    module = compile_model(X2, target, "./tmp", test_name, check_all_outputs=check_all)
+
+    x1_pt = torch.Tensor([[1.0, -2.0, 0.0]]).cuda().half()
+    x2 = torch.empty_like(x1_pt)
+    module.run_with_tensors([x1_pt], [x2])
+
+    out, _ = capfd.readouterr()
+    check_str = "inf, -inf, nan"
+    assert check_str in out
+
+
+def test_special_outputs(capfd):
+    _test_special_outputs(True, False, "test_special_outputs_tensor", capfd)
+    _test_special_outputs(False, True, "test_special_outputs_all", capfd)
+    _test_special_outputs(True, True, "test_special_outputs_both", capfd)
diff --git a/tests/unittest/util/test_serdes.py b/tests/unittest/util/test_serdes.py
new file mode 100644
index 000000000..c800e4711
--- /dev/null
+++ b/tests/unittest/util/test_serdes.py
@@ -0,0 +1,290 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for special activation Operator.
+"""
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm, IntVar
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import logger
+from aitemplate.utils.serialization.serdes_code import (
+    dump_program,
+    get_inputs_from_graph,
+    get_program,
+)
+
+
+class SerDesTestCase(unittest.TestCase):
+    def test_get_inputs(self):
+        X1 = Tensor(
+            shape=[IntImm(3), IntImm(4)],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        X3 = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        X3._attrs["is_output"] = True
+        X3._attrs["name"] = "output0"
+
+        test_get_inputs_path = "./tmp/test_serdes_get_inputs.py"
+        dump_program(X3, test_get_inputs_path)
+        inputs = get_inputs_from_graph(test_get_inputs_path)
+
+        self.assertEqual(len(inputs), 2)
+        self.assertIsNotNone(inputs.get("input_0", None))
+        self.assertIsNotNone(inputs.get("input_1", None))
+        for input_ in [X1, X2]:
+            shape = input_.shape()
+            graph_shape = inputs[input_._attrs["name"]]
+            self.assertEqual(len(shape), len(graph_shape))
+            for x, y in zip(shape, graph_shape):
+                self.assertEqual(x, y)
+
+    def test_simple_serdes(self):
+        X1 = Tensor(
+            shape=[IntImm(3), IntImm(4)],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[IntImm(1)],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        X3 = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        X3._attrs["is_output"] = True
+        X3._attrs["name"] = "output0"
+
+        test_path = "./tmp/test_simple_serdes.py"
+        dump_program(X3, test_path)
+
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "simple_serdes")
+
+        x1_pt = torch.randn(3, 4).cuda().half()
+        x2_pt = torch.randn(1).cuda().half()
+        x3_pt = x1_pt + x2_pt
+
+        x3 = torch.empty_like(x3_pt)
+        module.run_with_tensors({"input_0": x1_pt, "input_1": x2_pt}, {"output0": x3})
+        self.assertTrue(torch.allclose(x3, x3_pt, atol=1e-2, rtol=1e-2))
+
+    def test_multi_outputs(self):
+        X_pt = torch.randn(8, 10).cuda().half()
+        Ys_pt = torch.split(X_pt, 4)
+
+        X = Tensor(shape=[8, 10], dtype="float16", name="input_0", is_input=True)
+        Ys = ops.split()(X, 4)
+
+        self.assertEqual(len(Ys_pt), len(Ys))
+
+        for idx, Y in enumerate(Ys):
+            Y._attrs["name"] = f"output_{idx}"
+            Y._attrs["is_output"] = True
+
+        test_path = "./tmp/test_serdes_multi_outputs.py"
+        dump_program(Ys, test_path)
+
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "serdes_multi_outputs")
+
+        y_shapes = [(4, 10), (4, 10)]
+        outputs = {
+            f"output_{idx}": torch.empty(y_shape).cuda().half()
+            for idx, y_shape in enumerate(y_shapes)
+        }
+        module.run_with_tensors({"input_0": X_pt}, outputs)
+
+        for idx, y_pt in enumerate(Ys_pt):
+            y = outputs[f"output_{idx}"]
+            self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+
+class SerDesSpecialOpTestCase(unittest.TestCase):
+    def test_elementwise(self):
+        X1 = Tensor(
+            shape=[IntImm(3), IntImm(4)],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        X2 = ops.elementwise(FuncEnum.MIN)(X1, 0.5)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        test_path = "./tmp/test_serdes_elementwise.py"
+        dump_program(X2, test_path)
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "serdes_elementwise")
+
+        x1_pt = torch.randn(3, 4).cuda().half()
+        x2_pt = torch.clamp(x1_pt, max=0.5)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors({"input_0": x1_pt}, {"output0": x2})
+        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
+    def test_concat(self):
+        X_pts = [torch.randn(8, 10).cuda().half() for _ in range(5)]
+        Y_pt = torch.cat(X_pts)
+
+        Xs = [
+            Tensor(shape=[8, 10], dtype="float16", name=f"input_{i}", is_input=True)
+            for i in range(5)
+        ]
+        Y = ops.concatenate()(Xs)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        test_path = "./tmp/test_serdes_concat.py"
+        dump_program(Y, test_path)
+
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "serdes_concat")
+
+        input_tensors_ait = {f"input_{idx}": X_pts[idx] for idx in range(5)}
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors(input_tensors_ait, [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_reshape(self):
+        X = Tensor(
+            shape=[5, 4, 6, 8],  # 960 total
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        Y = ops.reshape()(X, [-1, 6, 32])
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        test_path = "./tmp/test_serdes_reshape.py"
+        dump_program(Y, test_path)
+
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "serdes_reshape")
+
+        X_pt = torch.randn(5, 4, 6, 8).cuda().half()
+        Y_pt = torch.reshape(X_pt, (-1, 6, 32))
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors([X_pt], [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_group_gemm_rcr(self):
+        target = detect_target()
+        if int(target._arch) < 80:
+            logger.warning(__file__, "Group Gemm need SM80 HW")
+            return
+
+        M = 256
+        K1 = 128
+        N1 = 60
+        K2 = 192
+        N2 = 64
+
+        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        OP = ops.group_gemm_rcr()
+        Y1, Y2 = OP(operand_groups=[[X1, W1], [X2, W2]])
+        Y1._attrs["name"] = "y1"
+        Y1._attrs["is_output"] = True
+        Y2._attrs["name"] = "y2"
+        Y2._attrs["is_output"] = True
+
+        test_path = "./tmp/test_serdes_group_gemm.py"
+        dump_program([Y1, Y2], test_path)
+        outputs, _ = get_program(test_path)
+        module = compile_model(outputs, target, "./tmp", "serdes_group_gemm")
+
+        X1_pt = torch.randn(M, K1).cuda().half()
+        X2_pt = torch.randn(M, K2).cuda().half()
+        W1_pt = torch.randn(N1, K1).cuda().half()
+        W2_pt = torch.randn(N2, K2).cuda().half()
+        Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
+        Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
+
+        inputs = {
+            "x1": X1_pt,
+            "w1": W1_pt,
+            "x2": X2_pt,
+            "w2": W2_pt,
+        }
+        y1 = torch.empty([M, N1]).cuda().half()
+        y2 = torch.empty([M, N2]).cuda().half()
+        outputs = {"y1": y1, "y2": y2}
+
+        module.run_with_tensors(inputs, outputs)
+        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
+        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
+
+    def test_dynamic_slice(self):
+        batch_sizes = [5, 6, 7]
+        input_shape = [2, 3, 4]
+        X = Tensor(
+            shape=[IntVar(values=batch_sizes, name="input_batch_0"), *input_shape],
+            name="input_0",
+            is_input=True,
+        )
+        start_indices = [2, 1, 0, 1]
+        end_indices = [5, 2, 2, 4]
+        Y = ops.dynamic_slice()(X, start_indices=start_indices, end_indices=end_indices)
+
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        test_path = "./tmp/test_serdes_dynamic_slice.py"
+        dump_program(Y, test_path)
+        outputs, _ = get_program(test_path)
+
+        target = detect_target()
+        module = compile_model(outputs, target, "./tmp", "serdes_dynamic_slice")
+
+        for batch in batch_sizes:
+            # generate torch reference result
+            X_pt = torch.randn(batch, *input_shape).cuda().half()
+
+            slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+            Y_pt = X_pt[slice_indices]
+            y_pt = Y_pt.cpu().numpy()
+
+            y = torch.empty(y_pt.shape).cuda().half()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))

From 4d71d48601b87e886f8999be1c02e1f94b98449a Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Fri, 11 Nov 2022 17:32:46 -0800
Subject: [PATCH 004/638] fix sm86 conv (#81)

Co-authored-by: Bing Xu <bingxu@fb.com>
---
 python/aitemplate/_libinfo.py                | 2 +-
 python/aitemplate/backend/profiler_runner.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/_libinfo.py b/python/aitemplate/_libinfo.py
index 6aacc3444..ca4d89ecb 100644
--- a/python/aitemplate/_libinfo.py
+++ b/python/aitemplate/_libinfo.py
@@ -14,4 +14,4 @@
 #
 # current version
 # We use the version of the incoming release for code
-__version__ = "0.1.dev0"
+__version__ = "0.1.dev1"
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index add0ce75f..cdc45a6a9 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -86,7 +86,16 @@ def process_task(task: Task) -> None:
     """
     stdout = task._stdout
     stderr = task._stderr
+    single_file_profiler = False
+
     if len(stderr) > 0:
+        # TODO: ugly fix, should remove when finish all profiler refactor
+        runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
+        if len(runtimes) > 0:
+            single_file_profiler = True
+        if not single_file_profiler:
+            task._failed = True
+            return
         logger.debug(
             __name__,
             "Failed: [{name}][{algo}]\ncmd:\n{cmd}\nstderr:\n{stderr}".format(

From 6a6cf82e31c867f3dd963d91e3f19e477be9c98e Mon Sep 17 00:00:00 2001
From: Zhang Jun <ewalker@live.cn>
Date: Sun, 13 Nov 2022 02:03:48 +0800
Subject: [PATCH 005/638] fix README.md of bert example (#82)

---
 examples/03_bert/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/03_bert/README.md b/examples/03_bert/README.md
index 2c6e4a489..019a2368c 100644
--- a/examples/03_bert/README.md
+++ b/examples/03_bert/README.md
@@ -29,7 +29,7 @@ python3 examples/03_bert/benchmark_ait.py --activation fast_gelu
 
 The batch size and sequence length can also be configured via the command line:
 ```
-python3 examples/03_bert/benchmark_ait.py --batch_size 1 --seq_length 128
+python3 examples/03_bert/benchmark_ait.py --batch-size 1 --seq-length 128
 ```
 
 PyTorch eager mode benchmarks can also be run:

From 3625c3056041963926ba455347d9091fb891b872 Mon Sep 17 00:00:00 2001
From: Bozhao <yubz86@gmail.com>
Date: Sat, 12 Nov 2022 10:04:07 -0800
Subject: [PATCH 006/638] Add negative prompts feature for txt2img pipeline
 (#75)

Add optional negative prompt option for txt2img pipeline
---
 .../pipeline_stable_diffusion_ait.py          | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
index 5234117b1..06f364ce1 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
@@ -160,6 +160,7 @@ def __call__(
         width: Optional[int] = 512,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.FloatTensor] = None,
@@ -186,6 +187,9 @@ def __call__(
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
@@ -253,9 +257,27 @@ def __call__(
         do_classifier_free_guidance = guidance_scale > 1.0
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance:
+            uncond_tokens: List[str]
             max_length = text_input.input_ids.shape[-1]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
             uncond_input = self.tokenizer(
-                [""] * batch_size,
+                uncond_tokens,
                 padding="max_length",
                 max_length=max_length,
                 return_tensors="pt",

From c2b03f076af7e5750bca690614881f25b483dd8e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 14 Nov 2022 17:52:14 -0800
Subject: [PATCH 007/638] add missing copyright headers (#86)

---
 .../backend/cuda/attention/src/fmha.h           | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/gemm.h      | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/gmem_tile.h | 17 +++++++++++++++++
 .../cuda/attention/src/fmha/kernel_traits.h     | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/mask.h      | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/smem_tile.h | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/softmax.h   | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha/utils.h     | 17 +++++++++++++++++
 .../src/fmha_block_fprop_fp16_kernel.sm80.cu    | 17 +++++++++++++++++
 .../attention/src/fmha_block_fprop_kernel_1xN.h | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha_blockmask.h | 17 +++++++++++++++++
 .../src/fmha_fprop_fp16_kernel.sm80.cu          | 17 +++++++++++++++++
 .../cuda/attention/src/fmha_fprop_kernel_1xN.h  | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha_kernel.h    | 17 +++++++++++++++++
 .../backend/cuda/attention/src/fmha_utils.h     | 17 +++++++++++++++++
 .../backend/cuda/attention/src/philox.cuh       | 15 +++++++++++++++
 python/aitemplate/backend/cuda/conv3d/conv3d.py |  3 ++-
 .../backend/cuda/elementwise/int_elementwise.py |  2 +-
 .../gemm_epilogue_vistor/common_dual_gemm.py    |  2 +-
 .../dual_gemm_rcr_fast_gelu.py                  |  2 +-
 .../gemm_epilogue_vistor/dual_gemm_rcr_silu.py  |  2 +-
 .../include/gemm_with_softmax.h                 | 15 +++++++++++++++
 .../backend/cuda/groupnorm/layer_norm.cuh       | 17 +++++++++++++++++
 .../aitemplate/backend/cuda/tensor/permute.cuh  | 17 +++++++++++++++++
 .../cuda/vision_ops/nms/batched_nms_kernel.cuh  | 14 +++++++++++++-
 .../compiler/ops/common/int_elementwise.py      |  2 +-
 python/aitemplate/compiler/ops/conv/conv3d.py   |  3 ++-
 .../dual_gemm_rcr_fast_gelu.py                  |  2 +-
 .../gemm_epilogue_vistor/dual_gemm_rcr_silu.py  |  2 +-
 static/include/debug_utility.h                  |  4 ++--
 tests/unittest/benchmark/test_gemm_benchmark.py |  2 +-
 tests/unittest/ops/test_conv3d.py               |  2 +-
 tests/unittest/ops/test_cross_attention.py      |  2 +-
 tests/unittest/ops/test_dual_gemm.py            |  2 +-
 tests/unittest/ops/test_dynamic_conv.py         |  2 +-
 tests/unittest/ops/test_efficient_nms.py        |  2 +-
 .../ops/test_int_elementwise_dynamic_reshape.py |  2 +-
 tests/unittest/util/test_debug_utils.py         |  2 +-
 38 files changed, 353 insertions(+), 20 deletions(-)

diff --git a/python/aitemplate/backend/cuda/attention/src/fmha.h b/python/aitemplate/backend/cuda/attention/src/fmha.h
index 066f236c7..ddd1b15c9 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
index 254abe31b..919885aa8 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
index fa00d5984..6dd7d407e 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
index 3b7487e3b..f85039038 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/mask.h b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
index 358acb90a..9e2664b7b 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
index c3f87a71d..cca1100ab 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
index ec5461966..41f653f4d 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/utils.h b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
index 4a95ccce6..21dec28d5 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
index 92756cc6f..625c47395 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
index d90ab5065..8c5e20e3c 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /***************************************************************************************************
  * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
index 94dd66718..6690a5e28 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
index aa4138983..73fd3ab31 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
index 86f39f3c7..4c1611af7 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /***************************************************************************************************
  * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
index 41f49ffda..f1bbfd471 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_utils.h b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
index a27bd40d9..9e72465cc 100644
--- a/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
+++ b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original NVIDIA copyright notice:
+
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
diff --git a/python/aitemplate/backend/cuda/attention/src/philox.cuh b/python/aitemplate/backend/cuda/attention/src/philox.cuh
index 4ab1a63ff..8b3e22e88 100644
--- a/python/aitemplate/backend/cuda/attention/src/philox.cuh
+++ b/python/aitemplate/backend/cuda/attention/src/philox.cuh
@@ -1,3 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
 // Pytorch also has an implementation of Philox RNG:
 // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
 #pragma once
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d.py b/python/aitemplate/backend/cuda/conv3d/conv3d.py
index 045092131..09f9589dd 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
 """
 Codegen for conv3d.
 """
diff --git a/python/aitemplate/backend/cuda/elementwise/int_elementwise.py b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
index ad9be8b98..8bd6fc5a8 100644
--- a/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index fdcf0e741..820cbde0d 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index 5c626cbb8..b615589c2 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index 211259e9e..e0418ece8 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
index d5e7351a9..4ad8ee10b 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
@@ -1,3 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
 namespace cutlass {
 
 template <
diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index baa1981b3..386fd69ae 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original OneFlow copyright notice:
+
 /*
 Copyright 2020 The OneFlow Authors. All rights reserved.
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute.cuh b/python/aitemplate/backend/cuda/tensor/permute.cuh
index 595aef45c..b9ced62b7 100644
--- a/python/aitemplate/backend/cuda/tensor/permute.cuh
+++ b/python/aitemplate/backend/cuda/tensor/permute.cuh
@@ -1,3 +1,20 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original OneFlow copyright notice:
+
 /*
 Copyright 2020 The OneFlow Authors. All rights reserved.
 
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
index 2a44ce211..a1d791633 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh
@@ -1,4 +1,16 @@
-// CUDA batched_nms kernel
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
 
 int const threadsPerBlock = sizeof(unsigned long long int) * 8;
 #define THREADS_PER_BLOCK 256
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
index a704d40ed..1e8a4b8e0 100644
--- a/python/aitemplate/compiler/ops/common/int_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 37cd6f7c7..0015d4091 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
 """
 Base class for conv3d.
 """
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index 9681fc348..e6e8c1d0e 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index 576269722..d80a541e2 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/static/include/debug_utility.h b/static/include/debug_utility.h
index 102bce838..332f07890 100644
--- a/static/include/debug_utility.h
+++ b/static/include/debug_utility.h
@@ -1,4 +1,4 @@
-//  Copyright (c) Meta Platform, Inc. and its affiliates.
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
 //
 //  Licensed under the Apache License, Version 2.0 (the "License");
 //  you may not use this file except in compliance with the License.
@@ -11,7 +11,7 @@
 //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //  See the License for the specific language governing permissions and
 //  limitations under the License.
-//
+
 #pragma once
 #include "device_functions-generated.h"
 
diff --git a/tests/unittest/benchmark/test_gemm_benchmark.py b/tests/unittest/benchmark/test_gemm_benchmark.py
index fee506494..07f43cc3e 100644
--- a/tests/unittest/benchmark/test_gemm_benchmark.py
+++ b/tests/unittest/benchmark/test_gemm_benchmark.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
index d79f9fedd..ea192241d 100644
--- a/tests/unittest/ops/test_conv3d.py
+++ b/tests/unittest/ops/test_conv3d.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_cross_attention.py b/tests/unittest/ops/test_cross_attention.py
index 088a56892..52f98c398 100644
--- a/tests/unittest/ops/test_cross_attention.py
+++ b/tests/unittest/ops/test_cross_attention.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_dual_gemm.py b/tests/unittest/ops/test_dual_gemm.py
index d737ed270..28b25bfda 100644
--- a/tests/unittest/ops/test_dual_gemm.py
+++ b/tests/unittest/ops/test_dual_gemm.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_dynamic_conv.py b/tests/unittest/ops/test_dynamic_conv.py
index 14b77a440..b42378f4f 100644
--- a/tests/unittest/ops/test_dynamic_conv.py
+++ b/tests/unittest/ops/test_dynamic_conv.py
@@ -1,4 +1,4 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index 54aaa513c..1960b8483 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates"""
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
index 075e8f79c..f3d2aaff2 100644
--- a/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
+++ b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/tests/unittest/util/test_debug_utils.py b/tests/unittest/util/test_debug_utils.py
index b92181b0e..c0c4e63c1 100644
--- a/tests/unittest/util/test_debug_utils.py
+++ b/tests/unittest/util/test_debug_utils.py
@@ -1,4 +1,4 @@
-#  Copyright (c) Meta Platform, Inc. and its affiliates.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.

From d3e6010d9b596786b2111a3fc3f34a2b348bc9af Mon Sep 17 00:00:00 2001
From: Ehsan Azar <dashesy@gmail.com>
Date: Wed, 16 Nov 2022 13:53:06 -0800
Subject: [PATCH 008/638] Conv2d group (#73)

* group conv

* add conv_groups op compiler

* Conv2d groups

* Conv2d depthwise

* wip

* wip

* wip

* wip

* only one ops to get feedback

* only one ops to get feedback

* Fix layout, now test passes

* Fix docstring

* Add conv2d_depthwise_bias and test

* Add conv2d_depthwise_bias and test and frontends

* doc

* frontend import depthwise

* Fix lint

* Fix lint

* Fix after rebase UTs pass

* fix lint

* fix more lint
---
 .../backend/cuda/conv2d/__init__.py           |   2 +
 .../aitemplate/backend/cuda/conv2d/common.py  |   2 +-
 .../backend/cuda/conv2d/conv2d_depthwise.py   | 518 ++++++++++++++++++
 .../cuda/conv2d/conv2d_depthwise_bias.py      | 389 +++++++++++++
 .../aitemplate/compiler/ops/conv/__init__.py  |   2 +
 python/aitemplate/compiler/ops/conv/conv2d.py |   1 +
 .../compiler/ops/conv/conv2d_depthwise.py     |  87 +++
 .../ops/conv/conv2d_depthwise_bias.py         |  99 ++++
 .../aitemplate/frontend/nn/conv2d/__init__.py |   2 +
 .../frontend/nn/conv2d/conv2d_depthwise.py    |  46 ++
 .../nn/conv2d/conv2d_depthwise_bias.py        |  43 ++
 tests/unittest/ops/test_conv_depthwise.py     |  60 ++
 .../unittest/ops/test_conv_depthwise_bias.py  |  62 +++
 13 files changed, 1312 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
 create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
 create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
 create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
 create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
 create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
 create mode 100644 tests/unittest/ops/test_conv_depthwise.py
 create mode 100644 tests/unittest/ops/test_conv_depthwise_bias.py

diff --git a/python/aitemplate/backend/cuda/conv2d/__init__.py b/python/aitemplate/backend/cuda/conv2d/__init__.py
index 7d83ce1fd..09703e7b8 100644
--- a/python/aitemplate/backend/cuda/conv2d/__init__.py
+++ b/python/aitemplate/backend/cuda/conv2d/__init__.py
@@ -28,6 +28,8 @@
     conv2d_bias_relu,
     conv2d_bias_relu_few_channels,
     conv2d_bias_sigmoid,
+    conv2d_depthwise,
+    conv2d_depthwise_bias,
     transposed_conv2d,
     transposed_conv2d_bias,
 )
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 61c6c05f8..c9fb0ef4a 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -223,7 +223,7 @@ def cal_align_ab(x_shape: List[int]) -> int:
         return 4
     if k % 2 == 0:
         return 2
-    raise RuntimeError("a/b is not aligned")
+    raise RuntimeError(f"a/b is not aligned {x_shape=}")
 
 
 def function_filter(cfg, func_attrs, x_shape):
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
new file mode 100644
index 000000000..bb166baa1
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
@@ -0,0 +1,518 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen for conv2d_depthwise.
+"""
+from collections import OrderedDict
+
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ... import registry
+from ...target import Target
+from . import common
+
+# pylint: disable=C0103,C0415,W0613,C0301
+
+INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{config}}
+using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
+"""
+)
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
+//  TODO: cast to right dtype
+{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}    problem_size,
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+{{indent}}};
+{{indent}}{{instance}} implicit_gemm_op;
+{% if is_profiler %}
+{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% endif %}
+{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <string>
+#include <stdexcept>
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_depthwise_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+{{extra_header}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
+          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
+      std::cerr << msg << std::endl;                                                  \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{{instances_def}}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    uint8_t* workspace,
+    int64_t* batch,
+    int64_t* out_ch,
+    int64_t* in_ch,
+    int64_t* kernel_h,
+    int64_t* kernel_w,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_h,
+    int64_t* out_w,
+    int stride,
+    int dilation,
+    int pad,
+    cudaStream_t stream
+  ) {
+
+  {{shape_function}}
+  int i32_batch = *batch;
+  int i32_in_h = *in_h;
+  int i32_in_w = *in_w;
+  int i32_in_ch = *in_ch;
+  int i32_out_ch = *out_ch;
+  int i32_kernel_h = *kernel_h;
+  int i32_kernel_w = *kernel_w;
+  int i32_out_batch = *out_batch;
+  int i32_out_h = *out_h;
+  int i32_out_w = *out_w;
+
+  using cutlass::layout::TensorNHWC;
+  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
+  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, 1)));
+  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
+        {i32_out_ch, i32_kernel_h, i32_kernel_w, 1},
+        {pad, pad, pad, pad},
+        {stride, stride},
+        {dilation, dilation},
+        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
+        cutlass::conv::Mode::kCrossCorrelation,
+        1,
+        i32_in_ch
+  );
+
+  {{exec_paths}}
+  throw std::runtime_error(
+      "Unsupported workload for this conv2d specialization."
+  );
+}
+"""
+)
+
+
+PROFILER_TEMPLATE = jinja2.Template(
+    """
+size_t GLOBAL_WORKSPACE_SIZE = 0;
+
+{{op_func}}
+
+int main(int argc, char** argv) {
+  int64_t batch = std::stoi(argv[1]);
+  int64_t in_h = std::stoi(argv[2]);
+  int64_t in_w = std::stoi(argv[3]);
+  int64_t in_ch = std::stoi(argv[4]);
+  int64_t kernel_h = std::stoi(argv[5]);
+  int64_t kernel_w = std::stoi(argv[6]);
+  int64_t out_ch = std::stoi(argv[7]);
+  int stride = std::stoi(argv[8]);
+  int pad = std::stoi(argv[9]);
+  int dilation = std::stoi(argv[10]);
+  {{shape_func}}
+  using ElementOutput = typename {{name}}::ElementC;
+  using ElementInputA = typename {{name}}::ElementA;
+  using ElementInputB = typename {{name}}::ElementB;
+
+  uint8_t* global_workspace = nullptr;
+  cudaStream_t stream = nullptr;
+
+  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, 1u});
+  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
+
+  //
+  // warmup
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KH,
+       &KW,
+       &HI,
+       &WI,
+       &NO,
+       &HO,
+       &WO,
+       stride,
+       dilation,
+       pad,
+       stream);
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 5; ++i) {
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KH,
+       &KW,
+       &HI,
+       &WI,
+       &NO,
+       &HO,
+       &WO,
+       stride,
+       dilation,
+       pad,
+       stream);
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "TIME:" << runtime_ms << std::endl;
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+}
+
+"""
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+  uint8_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int,
+  int,
+  int,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    global_workspace_,
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    {{stride}},
+{{indent}}    {{dilation}},
+{{indent}}    {{pad}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+def conv_dw_instance(op_def):
+    op_def = op_def.replace("DefaultConv2dFprop", "DefaultDepthwiseFprop")
+    op_def = op_def.replace("OpClassTensorOp", "OpClassSimt")
+    idx = op_def.find("kAnalytic")
+    op_def = op_def[: idx + 9] + "\n>::Kernel;\n"
+    return op_def
+
+
+def emit_instance(op, f_instance_convertor=conv_dw_instance):
+    """Emits cutlass instance."""
+    import cutlass_lib
+
+    emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
+    op_def = emiter.emit(op)
+    op_def = f_instance_convertor(op_def)
+    return op_def
+
+
+def apply_special_config(func_attrs, op):
+    import cutlass_lib
+
+    op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.Analytic
+    op.A.alignment = 1
+    op.B.alignment = 1
+    op.tile_description.stages = 2
+    op.tile_description.math_instruction.instruction_shape = [1, 1, 1]
+    op.tile_description.threadblock_shape[-1] = 8
+    return op
+
+
+def extract_config(func_attrs):
+    import copy
+
+    import cutlass_lib
+
+    def f_proc_op_special(op):
+        ret = []
+        data_type = cutlass_lib.library.DataType.f16
+        acc_type = cutlass_lib.library.DataType.f32
+        # check target use fp16 acc
+        if "use_fp16_acc" in Target.current()._kwargs:
+            if Target.current()._kwargs["use_fp16_acc"]:
+                acc_type = cutlass_lib.library.DataType.f16
+
+        if (
+            op.A.element == data_type
+            and op.B.element == data_type
+            and op.C.element == data_type
+            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
+            and op.accumulator_type() == acc_type
+        ):
+
+            op = copy.deepcopy(op)
+            # set epilogue
+            epilogue_name = func_attrs["epilogue"]
+            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
+            op.element_epilogue = acc_type
+            op = apply_special_config(func_attrs, op)
+
+            # set C alignment
+            for i in [1]:
+                op = copy.deepcopy(op)
+                op.C.alignment = i
+                ret.append(op)
+        return ret
+
+    op_kind = cutlass_lib.library.OperationKind.Conv2d
+    conv_kind = cutlass_lib.library.ConvKind.Fprop
+    ret = []
+    conv2d_ops = OrderedDict()
+    extract_ops = list(Target.current()._operators[op_kind].items())
+
+    for _, value in extract_ops:
+        op = value[0]
+        if op.conv_kind == conv_kind:
+            ret = f_proc_op_special(op)
+            if len(ret) > 0:
+                for op_inst in ret:
+                    key = common.kernel_name(op_inst)
+                    conv2d_ops[key] = op_inst
+    return conv2d_ops
+
+
+@registry.reg("cuda.conv2d_depthwise.config")
+def conv2d_depthwise_config(func_attrs, dtype="float16"):
+    """Populates conv2d_depthwise cutlass configs into 'op_instance' field."""
+    func_attrs["op_instance"] = extract_config(func_attrs)
+
+
+@registry.reg("cuda.conv2d_depthwise.gen_profiler")
+def gen_profiler(
+    func_attrs,
+    workdir,
+    shape_template,
+    exec_template=EXEC_TEMPLATE,
+    src_template=SRC_TEMPLATE,
+    profiler_template=PROFILER_TEMPLATE,
+):
+    """Codegen for conv2d_depthwise profiler."""
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+    # shape func
+    shape_func = shape_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        div="/",
+        x_dim0="batch",
+        x_dim1="in_h",
+        x_dim2="in_w",
+        x_dim3="in_ch",
+        w_dim0="out_ch",
+        w_dim1="kernel_h",
+        w_dim2="kernel_w",
+        stride="stride",
+        dilate="dilation",
+        pad="pad",
+    )
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+    file_pairs = []
+    for op_name, op in op_instance.items():
+        config = emit_instance(op)
+        config_name = common.extract_config_name(config)
+        name = "DeviceConvFwdInstance"
+        instance = INSTANCE_TEMPLATE.render(
+            config_name=config_name, name=name, config=config
+        )
+        exec_program = exec_template.render(
+            indent="  ", is_profiler=True, instance=name, dtype=dtype
+        )
+        op_func = src_template.render(
+            instances=instance,
+            function_name="conv",
+            shape_func="",
+            exec_paths=exec_program,
+        )
+        code = profiler_template.render(
+            op_func=op_func, shape_func=shape_func, name=name
+        )
+        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+    # build
+    return common.build_profiler(file_pairs)
+
+
+@registry.reg("cuda.conv2d_depthwise.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    """Codegen for conv2d_depthwise function."""
+    return common.gen_function(
+        func_attrs,
+        INSTANCE_TEMPLATE,
+        EXEC_TEMPLATE,
+        SRC_TEMPLATE,
+        exec_cond_remplate,
+        shape_eval_template,
+        shape_save_template,
+        f_emit_instance=emit_instance,
+    )
+
+
+@registry.reg("cuda.conv2d_depthwise.func_decl")
+def conv2d_depthwise_gen_function_decl(func_attrs):
+    """Codegen for conv2d_depthwise function declaration."""
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("cuda.conv2d_depthwise.func_call")
+def conv2d_depthwise_gen_function_call(func_attrs, indent="  "):
+    """Codegen for conv2d_depthwise function call."""
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[3]._attrs["name"],
+        p_kernel_h="&" + wshape[1]._attrs["name"],
+        p_kernel_w="&" + wshape[2]._attrs["name"],
+        p_in_h="&" + xshape[1]._attrs["name"],
+        p_in_w="&" + xshape[2]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_h="&" + yshape[1]._attrs["name"],
+        p_out_w="&" + yshape[2]._attrs["name"],
+        stride=func_attrs["stride"],
+        dilation=func_attrs["dilate"],
+        pad=func_attrs["pad"],
+        indent=indent,
+    )
+
+
+@registry.reg("cuda.conv2d_depthwise.filter")
+def conv2d_depthwise_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
new file mode 100644
index 000000000..af33fecce
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
@@ -0,0 +1,389 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen for conv2d_depthwise.
+"""
+import jinja2
+
+from ... import registry
+from . import common, conv2d_depthwise as cdw
+
+# pylint: disable=C0103,C0415,W0613,C0301
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
+//  TODO: cast to right dtype
+{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}    problem_size,
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
+{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
+{{indent}}};
+{{indent}}{{instance}} implicit_gemm_op;
+{% if is_profiler %}
+{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% endif %}
+{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = implicit_gemm_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <string>
+#include <stdexcept>
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_depthwise_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+{{extra_header}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
+          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
+      std::cerr << msg << std::endl;                                                  \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{{instances_def}}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+    void* bias_ptr,
+    uint8_t* workspace,
+    int64_t* batch,
+    int64_t* out_ch,
+    int64_t* in_ch,
+    int64_t* kernel_h,
+    int64_t* kernel_w,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_h,
+    int64_t* out_w,
+    int stride,
+    int dilation,
+    int pad,
+    cudaStream_t stream
+  ) {
+
+  {{shape_function}}
+  int i32_batch = *batch;
+  int i32_in_h = *in_h;
+  int i32_in_w = *in_w;
+  int i32_in_ch = *in_ch;
+  int i32_out_ch = *out_ch;
+  int i32_kernel_h = *kernel_h;
+  int i32_kernel_w = *kernel_w;
+  int i32_out_batch = *out_batch;
+  int i32_out_h = *out_h;
+  int i32_out_w = *out_w;
+
+  using cutlass::layout::TensorNHWC;
+  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
+  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, 1)));
+  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
+        {i32_out_ch, i32_kernel_h, i32_kernel_w, 1},
+        {pad, pad, pad, pad},
+        {stride, stride},
+        {dilation, dilation},
+        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
+        cutlass::conv::Mode::kCrossCorrelation,
+        1,
+        i32_in_ch
+  );
+
+  {{exec_paths}}
+  throw std::runtime_error(
+      "Unsupported workload for this conv2d specialization."
+  );
+}
+"""
+)
+
+
+PROFILER_TEMPLATE = jinja2.Template(
+    """
+size_t GLOBAL_WORKSPACE_SIZE = 0;
+
+{{op_func}}
+
+int main(int argc, char** argv) {
+  int64_t batch = std::stoi(argv[1]);
+  int64_t in_h = std::stoi(argv[2]);
+  int64_t in_w = std::stoi(argv[3]);
+  int64_t in_ch = std::stoi(argv[4]);
+  int64_t kernel_h = std::stoi(argv[5]);
+  int64_t kernel_w = std::stoi(argv[6]);
+  int64_t out_ch = std::stoi(argv[7]);
+  int stride = std::stoi(argv[8]);
+  int pad = std::stoi(argv[9]);
+  int dilation = std::stoi(argv[10]);
+  {{shape_func}}
+  using ElementOutput = typename {{name}}::ElementC;
+  using ElementInputA = typename {{name}}::ElementA;
+  using ElementInputB = typename {{name}}::ElementB;
+
+  uint8_t* global_workspace = nullptr;
+  cudaStream_t stream = nullptr;
+
+  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, 1u});
+  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> b({(int)CO, 1, 1, 1});
+  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
+
+  //
+  // warmup
+  conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KH,
+       &KW,
+       &HI,
+       &WI,
+       &NO,
+       &HO,
+       &WO,
+       stride,
+       dilation,
+       pad,
+       stream);
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 5; ++i) {
+      conv(x.device_data(),
+       w.device_data(),
+       y.device_data(),
+       b.device_data(),
+       global_workspace,
+       &NI,
+       &CO,
+       &CI,
+       &KH,
+       &KW,
+       &HI,
+       &WI,
+       &NO,
+       &HO,
+       &WO,
+       stride,
+       dilation,
+       pad,
+       stream);
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "TIME:" << runtime_ms << std::endl;
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+}
+
+"""
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+  void*,
+  uint8_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int,
+  int,
+  int,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{bias_ptr}},
+{{indent}}    global_workspace_,
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    {{stride}},
+{{indent}}    {{dilation}},
+{{indent}}    {{pad}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.config")
+def conv2d_depthwise_config(func_attrs, dtype="float16"):
+    """Populates conv2d_depthwise cutlass configs into 'op_instance' field."""
+    func_attrs["op_instance"] = cdw.extract_config(func_attrs)
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.gen_profiler")
+def gen_profiler(func_attrs, workdir, shape_template):
+    """Codegen for conv2d_depthwise_bias profiler."""
+    return cdw.gen_profiler(
+        func_attrs,
+        workdir,
+        shape_template,
+        exec_template=EXEC_TEMPLATE,
+        src_template=SRC_TEMPLATE,
+        profiler_template=PROFILER_TEMPLATE,
+    )
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    """Codegen for conv2d_depthwise_bias function."""
+    return common.gen_function(
+        func_attrs,
+        cdw.INSTANCE_TEMPLATE,
+        EXEC_TEMPLATE,
+        SRC_TEMPLATE,
+        exec_cond_remplate,
+        shape_eval_template,
+        shape_save_template,
+        f_emit_instance=cdw.emit_instance,
+    )
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.func_decl")
+def conv2d_depthwise_gen_function_decl(func_attrs):
+    """Codegen for conv2d_depthwise_bias function declaration."""
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.func_call")
+def conv2d_depthwise_gen_function_call(func_attrs, indent="  "):
+    """Codegen for conv2d_depthwise_bias function call."""
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    b = func_attrs["inputs"][2]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        out_ptr=y._attrs["name"],
+        bias_ptr=b._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[3]._attrs["name"],
+        p_kernel_h="&" + wshape[1]._attrs["name"],
+        p_kernel_w="&" + wshape[2]._attrs["name"],
+        p_in_h="&" + xshape[1]._attrs["name"],
+        p_in_w="&" + xshape[2]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_h="&" + yshape[1]._attrs["name"],
+        p_out_w="&" + yshape[2]._attrs["name"],
+        stride=func_attrs["stride"],
+        dilation=func_attrs["dilate"],
+        pad=func_attrs["pad"],
+        indent=indent,
+    )
+
+
+@registry.reg("cuda.conv2d_depthwise_bias.filter")
+def conv2d_depthwise_bias_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/compiler/ops/conv/__init__.py b/python/aitemplate/compiler/ops/conv/__init__.py
index b99bb893c..1233111c6 100644
--- a/python/aitemplate/compiler/ops/conv/__init__.py
+++ b/python/aitemplate/compiler/ops/conv/__init__.py
@@ -27,6 +27,8 @@
 from .conv2d_bias_relu import conv2d_bias_relu
 from .conv2d_bias_relu_few_channels import conv2d_bias_relu_few_channels
 from .conv2d_bias_sigmoid import conv2d_bias_sigmoid
+from .conv2d_depthwise import conv2d_depthwise
+from .conv2d_depthwise_bias import conv2d_depthwise_bias
 from .conv3d import conv3d
 from .depthwise_conv3d import depthwise_conv3d
 from .transposed_conv2d import transposed_conv2d
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 59f96c467..11c67cc9c 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -408,6 +408,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
             if not func(cfg, self._attrs, x_shape):
                 continue
             command = self._gen_profile_cmd(profiler_prefix, cfg, x_shape)
+            logger.info(__name__, "Running " + " ".join(command))
             runner.push(cfg, command)
 
         runner.join()
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
new file mode 100644
index 000000000..35bc350e8
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
@@ -0,0 +1,87 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Fused conv2d_depthwise op.
+"""
+from typing import List, Tuple
+
+from ...base import Tensor
+from .conv2d import conv2d
+
+# pylint: disable=C0103
+class conv2d_depthwise(conv2d):
+    """Base class of conv2d with groups."""
+
+    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+        """conv2d_depthwise constructor.
+
+        Parameters
+        ----------
+        stride : int
+            Stride of the convolution
+        pad : int
+            Size of padding to add to the input
+        dilate : int, optional
+            Size of spacing between kernel elements, by default 1
+        group : int, optional
+           Number of blocked connections from input
+            channels to output channels, by default 1
+        """
+        super().__init__(stride, pad, dilate=dilate, group=group)
+        self._attrs["op"] = "conv2d_depthwise"
+
+    def __call__(self, x: Tensor, w: Tensor):
+        """Call conv2d_depthwise with tensors x, w
+
+        Parameters
+        ----------
+        x : Tensor
+            in shape (N, H, W, C_in)
+        w : Tensor
+            in shape (C_out, K_h, K_w, 1)
+
+        Returns
+        -------
+        List[Tensor]
+            includes the output tensor in shape (N, H_out, W_out, C_out)
+        """
+        self._attrs["inputs"] = [x, w]
+        self._set_depth()
+        output_shape = self._infer_shapes(x, w)
+        output = Tensor(output_shape, src_ops={self})
+        self._extract_exec_path(x)
+        self._extract_epilogue_alignment(output_shape)
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if w[0] != self._attrs["group"]:
+            raise RuntimeError("W Shape mismatch for conv2d_depthwise")
+        return super()._infer_shape(x, w)
+
+    @staticmethod
+    def is_valid_inputs(x: Tensor, w: Tensor) -> Tuple[bool, str]:
+        x_shape = x._attrs["shape"]
+        if len(x_shape) != 4:
+            return False, f"x should be 4D: {x_shape=}"
+
+        w_shape = w._attrs["shape"]
+        if len(w_shape) != 4:
+            return False, f"w should be 4D: {w_shape=}"
+
+        # No need to check compatibility of x/w. This function is only used
+        # for fusing conv/elementwise into conv_bias. If x and w were not compatible,
+        # it would fail in the original conv.__call__.
+        return True, ""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
new file mode 100644
index 000000000..73ddaa04c
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
@@ -0,0 +1,99 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Fused conv2d_depthwise op.
+"""
+from typing import List, Tuple
+
+from ...base import Tensor
+from .conv2d import conv2d
+
+# pylint: disable=C0103
+class conv2d_depthwise_bias(conv2d):
+    """Base class of conv2d with groups."""
+
+    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+        """conv2d_depthwise constructor.
+
+        Parameters
+        ----------
+        stride : int
+            Stride of the convolution
+        pad : int
+            Size of padding to add to the input
+        dilate : int, optional
+            Size of spacing between kernel elements, by default 1
+        group : int, optional
+           Number of blocked connections from input
+            channels to output channels, by default 1
+        """
+        super().__init__(stride, pad, dilate=dilate, group=group)
+        self._attrs["op"] = "conv2d_depthwise_bias"
+
+    def __call__(self, x: Tensor, w: Tensor, b: Tensor):
+        """Call conv2d_depthwise with tensors x, w, b
+
+        Parameters
+        ----------
+        x : Tensor
+            in shape (N, H, W, C_in)
+        w : Tensor
+            in shape (C_out, K_h, K_w, 1)
+        b : Tensor
+            in shape (C_out)
+
+        Returns
+        -------
+        List[Tensor]
+            includes the output tensor in shape (N, H_out, W_out, C_out)
+        """
+        self._attrs["inputs"] = [x, w, b]
+        self._set_depth()
+        output_shape = self._infer_shapes(x, w)
+        output = Tensor(output_shape, src_ops={self})
+        self._extract_exec_path(x)
+        self._extract_epilogue_alignment(output_shape)
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if w[0] != self._attrs["group"]:
+            raise RuntimeError("W Shape mismatch for conv2d_depthwise")
+        return super()._infer_shape(x, w)
+
+    @staticmethod
+    def is_valid_inputs(x: Tensor, w: Tensor, b: Tensor) -> Tuple[bool, str]:
+        x_shape = x._attrs["shape"]
+        if len(x_shape) != 4:
+            return False, f"x should be 4D: {x_shape=}"
+
+        w_shape = w._attrs["shape"]
+        if len(w_shape) != 4:
+            return False, f"w should be 4D: {w_shape=}"
+
+        b_shape = b._attrs["shape"]
+        if len(b_shape) != 1:
+            return False, f"b should be 1D: {b_shape=}"
+
+        if b_shape[0] != w_shape[0]:
+            return (
+                False,
+                f"out channels in bias does not match: {b_shape[0]=} != {w_shape[0]=}",
+            )
+
+        # No need to check compatibility of x/w. This function is only used
+        # for fusing conv/elementwise into conv_bias. If x and w were not compatible,
+        # it would fail in the original conv.__call__.
+        return True, ""
diff --git a/python/aitemplate/frontend/nn/conv2d/__init__.py b/python/aitemplate/frontend/nn/conv2d/__init__.py
index d951bfcce..79375c8f1 100644
--- a/python/aitemplate/frontend/nn/conv2d/__init__.py
+++ b/python/aitemplate/frontend/nn/conv2d/__init__.py
@@ -26,5 +26,7 @@
 from .conv2d_bias_relu import Conv2dBiasRelu
 from .conv2d_bias_relu_few_channels import Conv2dBiasReluFewChannels
 from .conv2d_bias_sigmoid import Conv2dBiasSigmoid
+from .conv2d_depthwise import Conv2dDepthwise
+from .conv2d_depthwise_bias import Conv2dDepthwiseBias
 from .transposed_conv2d_bias import ConvTranspose2dBias
 from .transposed_conv2d_bias_relu import ConvTranspose2dBiasRelu
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
new file mode 100644
index 000000000..93b95927c
--- /dev/null
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
@@ -0,0 +1,46 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+conv2d depthwise module
+"""
+from ....compiler.ops import conv2d_depthwise
+from .conv2d import Conv2d
+
+
+class Conv2dDepthwise(Conv2d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding=0,
+        dilation=1,
+        groups=1,
+        dtype="float16",
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            dtype,
+        )
+        self.op = conv2d_depthwise(
+            stride=stride, pad=padding, dilate=dilation, group=groups
+        )
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
new file mode 100644
index 000000000..6632db113
--- /dev/null
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
@@ -0,0 +1,43 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+conv2d depthwise bias module
+"""
+from .common_conv2d_bias_act import Conv2dBiasAct
+
+
+class Conv2dDepthwiseBias(Conv2dBiasAct):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding=0,
+        dilation=1,
+        groups=1,
+        dtype="float16",
+    ):
+        super().__init__(
+            "conv2d_depthwise_bias",
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            dtype,
+        )
diff --git a/tests/unittest/ops/test_conv_depthwise.py b/tests/unittest/ops/test_conv_depthwise.py
new file mode 100644
index 000000000..8f8708a78
--- /dev/null
+++ b/tests/unittest/ops/test_conv_depthwise.py
@@ -0,0 +1,60 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+class ConvDepthwiseTestCase(unittest.TestCase):
+    def test_fp16(self, batch=4):
+        groups = 32
+        size = (12, 12)
+        target = detect_target()
+        X = Tensor(
+            shape=[IntImm(batch), *size, 32],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[32, 3, 3, 1], dtype="float16", name="input_1", is_input=True)
+        OP = ops.conv2d_depthwise(stride=1, pad=1, dilate=1, group=groups)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "conv2d_dw")
+
+        X_pt = torch.randn(batch, 32, *size).cuda().half()
+        W_pt = torch.randn(32, 1, 3, 3).cuda().half()
+        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1, groups=groups)
+        x = X_pt.permute((0, 2, 3, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 1)).contiguous()
+        y = torch.empty([batch, *size, 32]).cuda().half()
+        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+        y_transpose = y.permute((0, 3, 1, 2))
+        self.assertFalse(y_transpose.isnan().any())
+        self.assertFalse(y_transpose.isinf().any())
+        if target.name() == "cuda":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_conv_depthwise_bias.py b/tests/unittest/ops/test_conv_depthwise_bias.py
new file mode 100644
index 000000000..25cc58e8a
--- /dev/null
+++ b/tests/unittest/ops/test_conv_depthwise_bias.py
@@ -0,0 +1,62 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+class ConvDepthwiseBiasTestCase(unittest.TestCase):
+    def test_fp16(self, batch=4):
+        groups = 32
+        size = (12, 12)
+        target = detect_target()
+        X = Tensor(
+            shape=[IntImm(batch), *size, 32],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[32, 3, 3, 1], dtype="float16", name="input_1", is_input=True)
+        B = Tensor(shape=[32], dtype="float16", name="input_2", is_input=True)
+        OP = ops.conv2d_depthwise_bias(stride=1, pad=1, dilate=1, group=groups)
+        Y = OP(X, W, B)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "conv2d_dw_bias")
+
+        X_pt = torch.randn(batch, 32, *size).cuda().half()
+        W_pt = torch.randn(32, 1, 3, 3).cuda().half()
+        B_pt = torch.randn(32).cuda().half()
+        Y_pt = torch.nn.functional.conv2d(
+            X_pt, W_pt, bias=B_pt, padding=1, groups=groups
+        )
+        x = X_pt.permute((0, 2, 3, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 1)).contiguous()
+        y = torch.empty([batch, *size, 32]).cuda().half()
+        module.run_with_tensors({"input_0": x, "input_1": w, "input_2": B_pt}, [y])
+        y_transpose = y.permute((0, 3, 1, 2))
+        if target.name() == "cuda":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 20616ba1ac4462e61012e5f3ac796fa62fb41a34 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@fb.com>
Date: Sun, 27 Nov 2022 03:56:45 -0800
Subject: [PATCH 009/638] Fix python lint (#106)

* fix python lint

* .

* .

* fix
---
 .flake8 | 162 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 108 insertions(+), 54 deletions(-)

diff --git a/.flake8 b/.flake8
index 3f29e8612..71a5883ed 100644
--- a/.flake8
+++ b/.flake8
@@ -5,59 +5,113 @@ max-line-length = 80
 ignore =
   # Black conflicts and overlaps.
   # Found in https://github.com/psf/black/issues/429
-  B950, # Line too long.
-  E111, # Indentation is not a multiple of four.
-  E115, # Expected an indented block (comment).
-  E117, # Over-indented.
-  E121, # Continuation line under-indented for hanging indent.
-  E122, # Continuation line missing indentation or outdented.
-  E123, # Closing bracket does not match indentation of opening bracket's line.
-  E124, # Closing bracket does not match visual indentation.
-  E125, # Continuation line with same indent as next logical line.
-  E126, # Continuation line over-indented for hanging indent.
-  E127, # Continuation line over-indented for visual indent.
-  E128, # Continuation line under-indented for visual indent.
-  E129, # Visually indented line with same indent as next logical line.
-  E131, # Continuation line unaligned for hanging indent.
-  E201, # Whitespace after '('.
-  E202, # Whitespace before ')'.
-  E203, # Whitespace before ':'.
-  E221, # Multiple spaces before operator.
-  E222, # Multiple spaces after operator.
-  E225, # Missing whitespace around operator.
-  E226, # Missing whitespace around arithmetic operator.
-  E227, # Missing whitespace around bitwise or shift operator.
-  E231, # Missing whitespace after ',', ';', or ':'.
-  E241, # Multiple spaces after ','.
-  E251, # Unexpected spaces around keyword / parameter equals.
-  E252, # Missing whitespace around parameter equals.
-  E261, # At least two spaces before inline comment.
-  E262, # Inline comment should start with '# '.
-  E265, # Block comment should start with '# '.
-  E271, # Multiple spaces after keyword.
-  E272, # Multiple spaces before keyword.
-  E301, # Expected 1 blank line, found 0.
-  E302, # Expected 2 blank lines, found 0.
-  E303, # Too many blank lines (3).
-  E305, # Expected 2 blank lines after end of function or class.
-  E306, # Expected 1 blank line before a nested definition.
-  E501, # Line too long (82 > 79 characters).
-  E502, # The backslash is redundant between brackets.
-  E701, # Multiple statements on one line (colon).
-  E702, # Multiple statements on one line (semicolon).
-  E703, # Statement ends with a semicolon.
-  E704, # Multiple statements on one line (def).
-  W291, # Trailing whitespace.
-  W292, # No newline at end of file.
-  W293, # Blank line contains whitespace.
-  W391, # Blank line at end of file.
-  W504, # Line break occurred after a binary operator.
+  # Line too long.
+  B950,
+  # Indentation is not a multiple of four. 
+  E111, 
+  # Expected an indented block (comment).
+  E115, 
+  # Over-indented.
+  E117,
+  # Continuation line under-indented for hanging indent. 
+  E121,
+  # Continuation line missing indentation or outdented. 
+  E122,
+  # Closing bracket does not match indentation of opening bracket's line. 
+  E123,
+  # Closing bracket does not match visual indentation. 
+  E124,
+  # Continuation line with same indent as next logical line. 
+  E125,
+  # Continuation line over-indented for hanging indent. 
+  E126,
+  # Continuation line over-indented for visual indent. 
+  E127,
+  # Continuation line under-indented for visual indent. 
+  E128,
+  # Visually indented line with same indent as next logical line. 
+  E129,
+  # Continuation line unaligned for hanging indent. 
+  E131,
+  # Whitespace after '('. 
+  E201,
+  # Whitespace before ')'. 
+  E202,
+  # Whitespace before ':'. 
+  E203,
+  # Multiple spaces before operator. 
+  E221,
+  # Multiple spaces after operator. 
+  E222,
+  # Missing whitespace around operator. 
+  E225,
+  # Missing whitespace around arithmetic operator. 
+  E226,
+  # Missing whitespace around bitwise or shift operator. 
+  E227,
+  # Missing whitespace after ',', ';', or ':'. 
+  E231,
+  # Multiple spaces after ','. 
+  E241,
+  # Unexpected spaces around keyword / parameter equals. 
+  E251,
+  # Missing whitespace around parameter equals. 
+  E252,
+  # At least two spaces before inline comment. 
+  E261, 
+  # Inline comment should start with '# '.
+  E262, 
+  # Block comment should start with '# '.
+  E265,
+  # Multiple spaces after keyword. 
+  E271,
+  # Multiple spaces before keyword. 
+  E272,
+  # Expected 1 blank line, found 0. 
+  E301,
+  # Expected 2 blank lines, found 0. 
+  E302,
+  # Too many blank lines (3). 
+  E303,
+  # Expected 2 blank lines after end of function or class. 
+  E305,
+  # Expected 1 blank line before a nested definition. 
+  E306,
+  # Line too long (82 > 79 characters). 
+  E501,
+  # The backslash is redundant between brackets. 
+  E502,
+  # Multiple statements on one line (colon). 
+  E701,
+  # Multiple statements on one line (semicolon). 
+  E702,
+  # Statement ends with a semicolon. 
+  E703,
+  # Multiple statements on one line (def). 
+  E704,
+  # Trailing whitespace. 
+  W291,
+  # No newline at end of file. 
+  W292,
+  # Blank line contains whitespace. 
+  W293,
+  # Blank line at end of file. 
+  W391,
+  # Line break occurred after a binary operator. 
+  W504, 
 
   # Too opinionated.
-  E265, # Block comment should start with '# '.
-  E266, # Too many leading '#' for block comment.
-  E402, # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports)
-  E722, # Do not use bare except, specify exception instead. (Duplicate of B001)
-  P207, # (Duplicate of B003)
-  P208, # (Duplicate of C403)
-  W503  # Line break occurred before a binary operator.
+  # Block comment should start with '# '.
+  E265,
+  # Too many leading '#' for block comment. 
+  E266,
+  # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) 
+  E402, 
+  # Do not use bare except, specify exception instead. (Duplicate of B001)
+  E722, 
+  # (Duplicate of B003)
+  P207, 
+  # (Duplicate of C403)
+  P208,
+  # Line break occurred before a binary operator.
+  W503  

From 8c158f54278975a074f9cadf12982bcba259436d Mon Sep 17 00:00:00 2001
From: Terry Chen <hahakuku@hotmail.com>
Date: Sun, 27 Nov 2022 03:58:09 -0800
Subject: [PATCH 010/638] sd 2.0 (#105)

Co-authored-by: Terry Chne <terrychen2012@live.com>
---
 examples/05_stable_diffusion/benchmark.py     | 10 ++----
 examples/05_stable_diffusion/compile.py       | 20 +++++++----
 examples/05_stable_diffusion/demo.py          |  8 ++++-
 examples/05_stable_diffusion/modeling/clip.py |  6 ++--
 .../modeling/unet_2d_condition.py             | 15 +++++---
 .../pipeline_stable_diffusion_ait.py          | 34 +++++++++++++------
 6 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
index 811743da9..6f0e3f695 100644
--- a/examples/05_stable_diffusion/benchmark.py
+++ b/examples/05_stable_diffusion/benchmark.py
@@ -53,6 +53,7 @@ def benchmark_unet(
     hh=64,
     ww=64,
     dim=320,
+    hidden_size=1024,
     benchmark_pt=False,
     verify=False,
 ):
@@ -67,7 +68,7 @@ def benchmark_unet(
     pt_mod = pt_mod.eval()
 
     latent_model_input_pt = torch.randn(batch_size, 4, hh, ww).cuda().half()
-    text_embeddings_pt = torch.randn(batch_size, 64, 768).cuda().half()
+    text_embeddings_pt = torch.randn(batch_size, 64, hidden_size).cuda().half()
     timesteps_pt = torch.Tensor([1, 1]).cuda().half()
 
     with autocast("cuda"):
@@ -126,11 +127,6 @@ def benchmark_unet(
 def benchmark_clip(
     batch_size=1,
     seqlen=64,
-    dim=768,
-    num_heads=12,
-    hidden_size=768,
-    vocab_size=49408,
-    max_position_embeddings=77,
     benchmark_pt=False,
     verify=False,
 ):
@@ -288,7 +284,7 @@ def benchmark_diffusers(token, batch_size, verify, benchmark_pt):
         access_token = token
 
     pipe = StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        "stabilityai/stable-diffusion-2",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=access_token,
diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py
index 9c87f4155..f9f5224df 100644
--- a/examples/05_stable_diffusion/compile.py
+++ b/examples/05_stable_diffusion/compile.py
@@ -180,11 +180,16 @@ def compile_unet(
     hh=64,
     ww=64,
     dim=320,
+    hidden_dim=1024,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
 ):
 
-    ait_mod = ait_UNet2DConditionModel(sample_size=64, cross_attention_dim=768)
+    ait_mod = ait_UNet2DConditionModel(
+        sample_size=64,
+        cross_attention_dim=hidden_dim,
+        attention_head_dim=[5, 10, 20, 20],
+    )
     ait_mod.name_parameter_tensor()
 
     # set AIT parameters
@@ -196,7 +201,9 @@ def compile_unet(
         [batch_size, hh, ww, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
-    text_embeddings_pt_ait = Tensor([batch_size, 64, 768], name="input2", is_input=True)
+    text_embeddings_pt_ait = Tensor(
+        [batch_size, 64, hidden_dim], name="input2", is_input=True
+    )
 
     Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
     mark_output(Y)
@@ -212,15 +219,12 @@ def compile_clip(
     seqlen=64,
     dim=768,
     num_heads=12,
-    hidden_size=768,
-    vocab_size=49408,
-    max_position_embeddings=77,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
 ):
     mask_seq = 0
     causal = True
-    depth = 12
+    depth = 23
 
     ait_mod = ait_CLIPTextTransformer(
         num_hidden_layers=depth,
@@ -336,7 +340,7 @@ def compile_diffusers(
         access_token = token
 
     pipe = StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        "stabilityai/stable-diffusion-2",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=access_token,
@@ -348,6 +352,8 @@ def compile_diffusers(
     # CLIP
     compile_clip(
         batch_size=batch_size,
+        dim=1024,
+        num_heads=16,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
     )
diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
index 51859e886..5f9ab77c8 100644
--- a/examples/05_stable_diffusion/demo.py
+++ b/examples/05_stable_diffusion/demo.py
@@ -16,6 +16,7 @@
 import torch
 
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from diffusers import EulerDiscreteScheduler
 from pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
 
 
@@ -28,8 +29,13 @@
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
 def run(token, width, height, prompt, benchmark):
+
+    model_id = "stabilityai/stable-diffusion-2"
+    scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
+
     pipe = StableDiffusionAITPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        model_id,
+        scheduler=scheduler,
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token,
diff --git a/examples/05_stable_diffusion/modeling/clip.py b/examples/05_stable_diffusion/modeling/clip.py
index f9687d64a..8d6079988 100644
--- a/examples/05_stable_diffusion/modeling/clip.py
+++ b/examples/05_stable_diffusion/modeling/clip.py
@@ -307,14 +307,13 @@ def __init__(
         self.fc1 = nn.Linear(
             in_features,
             hidden_features,
+            specialization="gelu",
         )
-        self.activation_fn = QuickGELUActivation()
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
         shape = get_shape(x)
         x = self.fc1(x)
-        x = self.activation_fn(x)
         x = self.fc2(x, res)
         return ops.reshape()(x, shape)
 
@@ -344,6 +343,7 @@ def __init__(
             has_residual=True,
             causal=causal,
             mask_seq=mask_seq,
+            use_mem_eff=True,
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
         self.mlp = CLIPMLP(hidden_size, int(hidden_size * mlp_ratio))
@@ -534,7 +534,7 @@ def __init__(
     ):
         super().__init__()
         embed_dim = hidden_size
-        self.embeddings = CLIPTextEmbeddings()
+        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
         self.encoder = CLIPEncoder(
             num_hidden_layers=num_hidden_layers,
             hidden_size=hidden_size,
diff --git a/examples/05_stable_diffusion/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/modeling/unet_2d_condition.py
index 9c1d9f07c..a21879dea 100644
--- a/examples/05_stable_diffusion/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/modeling/unet_2d_condition.py
@@ -12,7 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 from aitemplate.frontend import nn
 
@@ -80,7 +80,7 @@ def __init__(
         norm_num_groups: int = 32,
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1280,
-        attention_head_dim: int = 8,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
     ):
         super().__init__()
         self.center_input_sample = center_input_sample
@@ -98,6 +98,9 @@ def __init__(
         self.down_blocks = nn.ModuleList([])
         self.up_blocks = nn.ModuleList([])
 
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
         # down
         output_channel = block_out_channels[0]
         for i, down_block_type in enumerate(down_block_types):
@@ -115,7 +118,7 @@ def __init__(
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=attention_head_dim[i],
                 downsample_padding=downsample_padding,
             )
             self.down_blocks.append(down_block)
@@ -129,12 +132,14 @@ def __init__(
             output_scale_factor=mid_block_scale_factor,
             resnet_time_scale_shift="default",
             cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim,
+            attn_num_head_channels=attention_head_dim[-1],
             resnet_groups=norm_num_groups,
         )
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             prev_output_channel = output_channel
@@ -156,7 +161,7 @@ def __init__(
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
index 06f364ce1..7514161a5 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
@@ -24,11 +24,15 @@
 from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionPipeline,
     UNet2DConditionModel,
 )
+
 from diffusers.pipelines.stable_diffusion import (
     StableDiffusionPipelineOutput,
     StableDiffusionSafetyChecker,
@@ -71,7 +75,14 @@ def __init__(
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
     ):
@@ -321,9 +332,7 @@ def __call__(
 
         self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
 
-        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            latents = latents * self.scheduler.sigmas[0]
+        latents = latents * self.scheduler.init_noise_sigma
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -341,6 +350,8 @@ def __call__(
             latent_model_input = (
                 torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
             if isinstance(self.scheduler, LMSDiscreteScheduler):
                 sigma = self.scheduler.sigmas[i]
                 # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
@@ -376,12 +387,15 @@ def __call__(
         image = image.cpu().permute(0, 2, 3, 1).numpy()
 
         # run safety checker
-        safety_cheker_input = self.feature_extractor(
-            self.numpy_to_pil(image), return_tensors="pt"
-        ).to(self.device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_cheker_input.pixel_values
-        )
+        if self.safety_checker is not None:
+            safety_cheker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="pt"
+            ).to(self.device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_cheker_input.pixel_values
+            )
+        else:
+            has_nsfw_concept = None
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)

From bbc311bc01014ba7084cd3f9309b35402b721b3c Mon Sep 17 00:00:00 2001
From: BasicCoder <abasiccoder@gmail.com>
Date: Fri, 2 Dec 2022 15:34:01 +0800
Subject: [PATCH 011/638] support pipeline at 768x768 resolution (#119)

The pipeline default image size = 512x512, but when using the Stable Diffusion 2.0-v  (768x768) model for benchmark (--benchmark=True) evaluation, after compiled model in 768x768 image size,  the demo script will error in image size.
---
 examples/05_stable_diffusion/demo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
index 5f9ab77c8..1a2fca835 100644
--- a/examples/05_stable_diffusion/demo.py
+++ b/examples/05_stable_diffusion/demo.py
@@ -44,7 +44,7 @@ def run(token, width, height, prompt, benchmark):
     with torch.autocast("cuda"):
         image = pipe(prompt, height, width).images[0]
         if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt)
+            t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
             print(f"sd e2e: {t} ms")
 
     image.save("example_ait.png")

From 183f55e7f128fb13fe4346bd737f2cf012cddc92 Mon Sep 17 00:00:00 2001
From: Ivan Mikhnenkov <39604625+ivanmikhnenkov@users.noreply.github.com>
Date: Sat, 17 Dec 2022 04:45:13 +0300
Subject: [PATCH 012/638] added requires_safety_checker to sd pipelines,
 following diffusers pipelines (#126)

---
 examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py   | 2 ++
 .../pipeline_stable_diffusion_img2img_ait.py                    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
index 7514161a5..3a14debcc 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
@@ -85,6 +85,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__(
             vae=vae,
@@ -94,6 +95,7 @@ def __init__(
             scheduler=scheduler,
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
         )
 
         workdir = "tmp/"
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
index 251326b55..7380aeebd 100644
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
@@ -86,6 +86,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         # super().__init__()
         super().__init__(
@@ -96,6 +97,7 @@ def __init__(
             scheduler=scheduler,
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
         )
         scheduler = scheduler.set_format("pt")
         self.register_modules(

From e6715f6495593ea65e1497c177b9f44f4a71338b Mon Sep 17 00:00:00 2001
From: inisis <46103969+inisis@users.noreply.github.com>
Date: Thu, 29 Dec 2022 00:38:37 +0800
Subject: [PATCH 013/638] use same benchmark pipeline (#130)

---
 examples/05_stable_diffusion/benchmark_pt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/benchmark_pt.py b/examples/05_stable_diffusion/benchmark_pt.py
index 05c65e9bf..aa9af8596 100644
--- a/examples/05_stable_diffusion/benchmark_pt.py
+++ b/examples/05_stable_diffusion/benchmark_pt.py
@@ -28,7 +28,7 @@
 )
 def run(token, prompt, benchmark):
     pipe = StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        "stabilityai/stable-diffusion-2",
         revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token,

From 69c9a61cb2ad442f16f7d5fb31a32196e1bcfd05 Mon Sep 17 00:00:00 2001
From: Ehsan Azar <ehazar@microsoft.com>
Date: Wed, 28 Dec 2022 08:40:22 -0800
Subject: [PATCH 014/638] Vanilla Attention (#100)

* Vanilla attention

* Vanilla attention

* fix lint

* Fix causal unimplemented

* support causal

* causal vanilla attention

* retrigger the CI

* Separate class for vanilla attention

* type

* docstring

* address CR
---
 python/aitemplate/frontend/nn/__init__.py     |   5 +
 .../frontend/nn/vanilla_attention.py          | 302 ++++++++++++++++
 tests/unittest/ops/test_cross_attention.py    |   9 +-
 tests/unittest/ops/test_vanilla_attention.py  | 336 ++++++++++++++++++
 4 files changed, 650 insertions(+), 2 deletions(-)
 create mode 100644 python/aitemplate/frontend/nn/vanilla_attention.py
 create mode 100644 tests/unittest/ops/test_vanilla_attention.py

diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index c014b02b5..bb067846e 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -27,6 +27,11 @@
 from .view_ops import *
 from .attention import CrossAttention, FlashAttention, MultiheadAttention
 from .identity import Identity
+from .vanilla_attention import (
+    vanilla_attention,
+    VanillaCrossAttention,
+    VanillaMultiheadAttention,
+)
 from .dropout import *
 from .layer_norm import *
 from .group_norm import *
diff --git a/python/aitemplate/frontend/nn/vanilla_attention.py b/python/aitemplate/frontend/nn/vanilla_attention.py
new file mode 100644
index 000000000..cf7e62592
--- /dev/null
+++ b/python/aitemplate/frontend/nn/vanilla_attention.py
@@ -0,0 +1,302 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Frontend for vanilla attention module
+"""
+from functools import partial
+
+from ...compiler import ops
+from .. import Tensor
+from .dropout import Dropout
+from .linear import Linear
+from .module import Module
+from .parameter import Parameter
+
+# pylint: disable=C0103
+
+
+def _get_dim(it):
+    try:
+        return it.value()
+    except AttributeError:
+        return -1
+
+
+def _get_shape(x):
+    shape = [_get_dim(it) for it in x._attrs["shape"]]
+    return shape
+
+
+def vanilla_attention(
+    q: Tensor, k: Tensor, v: Tensor, scale: float = None, attn_mask: Tensor = None
+) -> Tensor:
+    """Vanilla attention in the most basic form.
+    q,k,v: batch, seqlen, num_heads, head_dim
+        Either batch or sequence dimension could be variable (but not both)
+    attn_mask: attention mask is *added* to the attention,
+        use 0 and -inf to mask a sequence index
+    """
+    batch_name, seq_name = [it._attrs["name"] for it in q._attrs["shape"]][0:2]
+    B, N, G, D = _get_shape(q)
+    B, M, _, _ = _get_shape(k)
+    BG = B * G
+    if BG < 0:
+        BG = -1
+    C = G * D
+    if scale is None:
+        scale = D ** (-0.5)
+    q = q * scale
+
+    q = ops.permute()(q, [0, 2, 1, 3])  # BxGxNxD
+    q = ops.reshape()(q, (BG, N, D))  # BGxNxD
+
+    k = ops.reshape()(k, (B, M, C))  # BxMxGD
+    k = ops.permute021()(k)  # BxGDxM
+    k = ops.reshape()(k, (BG, D, M))  # BGxDxM
+
+    attention = ops.bmm_rrr()(q, k)  # BGxNxM
+    if attn_mask is not None:
+        attention = attention + attn_mask
+    attention = ops.softmax()(attention, -1)  # BGxNxM
+
+    v = ops.reshape()(v, (B, M, C))  # BxMxGD
+    v = ops.permute021()(v)  # BxGDxM
+    v = ops.reshape()(v, (BG, D, M))  # BGxDxM
+
+    out = ops.bmm_rcr()(v, attention)  # BGxDxN
+    out = ops.reshape()(out, (B, C, N))  # BxGDxN == BxCxN
+    out = ops.permute021()(out)  # BxNxC
+    out._attrs["shape"][0]._attrs["name"] = batch_name
+    out._attrs["shape"][1]._attrs["name"] = seq_name
+    return out
+
+
+class VanillaMultiheadAttention(Module):
+    r"""Vanilla Multi-Head Attention.
+
+    Allows the model to jointly attend to information
+    from different representation subspaces as described in the paper:
+    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+
+    Multi-Head Attention is defined as:
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    Args:
+        dim: toal dimension of the model
+        batch_size: batch size
+        seq_len: sequence length
+        num_heads: Number of parallel attention heads. Default: 8
+        qkv_bias: whether to add bias to QKV. Default: False
+        attn_drop: Dropout probability on attention output weights. Default: ``0.0`` (no dropout).
+        proj_drop: Dropout probability on projection layers. Default: ``0.0`` (no dropout).
+        has_residual: has or has no residual. Default: `True`.
+        causal: default: `False`.
+        attn_mask: Attention mask. If causal this should be a tensor of shape [1, seq_len, seq_len] filled with -inf and 0
+        mask_seq: sequence mask, default: ``0``.
+    """
+
+    def __init__(
+        self,
+        dim,
+        batch_size=-1,
+        seq_len=-1,
+        num_heads=8,
+        qkv_bias=False,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        has_residual=True,
+        causal=False,
+        attn_mask: Tensor = None,
+        mask_seq=0,
+    ):
+        super().__init__()
+        assert (
+            dim % num_heads == 0
+        ), f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.causal = causal
+        self.has_residual = has_residual
+        self.mask_seq = mask_seq
+
+        if causal:
+            assert (
+                attn_mask is not None
+            ), f"Missing attn_mask=Tensor(shape=1x{seq_len}x{seq_len})"
+            self.op = partial(vanilla_attention, attn_mask=attn_mask)
+        else:
+            self.op = vanilla_attention
+
+        if self.mask_seq:
+            self.output_mask = Parameter(
+                shape=[mask_seq, num_heads, head_dim], dtype="float16"
+            )
+        self.qkv = Linear(dim, dim * 3, bias=qkv_bias)
+
+        self.attn_drop = Dropout(attn_drop)
+        self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
+        self.proj_drop = Dropout(proj_drop)
+
+    def get_shape(self, x):
+        return _get_shape(x)
+
+    def attention(self, x):
+        b, seqlen, d = self.get_shape(x)
+        hidden = d // 3
+        x = ops.reshape()(x, [-1, 3, hidden])
+        (q, k, v) = ops.split()(x, 1, dim=1)
+        return self.op(
+            ops.reshape()(q, [b, seqlen, self.num_heads, hidden // self.num_heads]),
+            ops.reshape()(k, [b, seqlen, self.num_heads, hidden // self.num_heads]),
+            ops.reshape()(v, [b, seqlen, self.num_heads, hidden // self.num_heads]),
+            self.scale,
+        )
+
+    def forward(self, *args):
+        """forward pass for calling mha module"""
+        assert len(args) >= 1
+        x = args[0]
+        batch, seq, hidden = self.get_shape(x)
+        qkv = self.qkv(x)
+        if self.mask_seq:
+            total = self.get_shape(qkv)[0]
+            qkv = ops.dynamic_slice()(
+                qkv,
+                start_indices=[0, 0, 0, 0],
+                end_indices=[total - self.mask_seq, None, None, None],
+            )
+        attn_output = self.attention(qkv)
+        if self.mask_seq:
+            attn_output = ops.concatenate()(
+                [attn_output, self.output_mask.tensor()], dim=0
+            )
+        attn_output = ops.reshape()(attn_output, [batch * seq, -1])
+        if self.has_residual:
+            assert len(args) == 2
+            x = self.proj(attn_output, args[1])
+        else:
+            x = self.proj(attn_output)
+        x = self.proj_drop(x)
+        x = ops.reshape()(x, [batch, seq, hidden])
+        return x
+
+
+class VanillaCrossAttention(Module):
+    r"""Vanilla Cross Multi-head Attention.
+
+    Allows the model to jointly attend to information
+    from different representation subspaces as described in the paper:
+    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+
+    Multi-Head Attention is defined as:
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    Args:
+        dim: toal dimension of the model
+        batch_size: batch size
+        seq_len: sequence length
+        num_heads: Number of parallel attention heads. Default: 8
+        qkv_bias: whether to add bias to QKV. Default: False
+        attn_drop: Dropout probability on attention output weights. Default: ``0.0`` (no dropout).
+        proj_drop: Dropout probability on projection layers. Default: ``0.0`` (no dropout).
+        has_residual: has or has no residual. Default: `True`.
+        causal: default: `False`.
+        mask_seq: sequence mask, default: ``0``.
+    """
+
+    def __init__(
+        self,
+        dim,
+        seq_len,
+        seq_len_kv,
+        num_heads,
+        qkv_bias=False,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        has_residual=True,
+        causal=False,
+    ):
+        super().__init__()
+        assert (
+            dim % num_heads == 0
+        ), f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.causal = causal
+        self.has_residual = has_residual
+        self.dim = dim
+        self.seqlen = seq_len
+        self.seqlen_kv = seq_len_kv
+
+        assert not causal, "Causal not implemented"
+        self.op = vanilla_attention
+
+        self.proj_q = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+        self.proj_k = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+        self.proj_v = Linear(
+            dim,
+            dim,
+            bias=qkv_bias,
+        )
+
+        self.attn_drop = Dropout(attn_drop)
+        self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
+        self.proj_drop = Dropout(proj_drop)
+
+    def attention(self, q, k, v):
+        seqlen = self.seqlen
+        seqlen_kv = self.seqlen_kv
+        head_dim = self.dim // self.num_heads
+
+        query = self.proj_q(q)
+        key = self.proj_k(k)
+        value = self.proj_v(v)
+
+        query = ops.reshape()(query, [-1, seqlen, self.num_heads, head_dim])
+        key = ops.reshape()(key, [-1, seqlen_kv, self.num_heads, head_dim])
+        value = ops.reshape()(value, [-1, seqlen_kv, self.num_heads, head_dim])
+        return self.op(query, key, value)
+
+    def forward(self, *args):
+        """forward pass for calling mha module"""
+        assert len(args) >= 3
+        x = args[0]
+        seq = self.seqlen
+        attn_output = self.attention(args[0], args[1], args[2])
+        attn_output = ops.reshape()(attn_output, [-1, seq, self.dim])
+
+        if self.has_residual:
+            assert len(args) == 4
+            x = self.proj(attn_output, args[3])
+        else:
+            x = self.proj(attn_output)
+        x = self.proj_drop(x)
+        x = ops.reshape()(x, [-1, seq, self.dim])
+        return x
diff --git a/tests/unittest/ops/test_cross_attention.py b/tests/unittest/ops/test_cross_attention.py
index 52f98c398..13f2f0eff 100644
--- a/tests/unittest/ops/test_cross_attention.py
+++ b/tests/unittest/ops/test_cross_attention.py
@@ -81,7 +81,11 @@ def _test_mha(
         )
         ait_mod.name_parameter_tensor()
 
-        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_size")
+        if len(batch_sizes) == 1:
+            # static
+            batch_dim = batch_sizes[0]
+        else:
+            batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_size")
 
         inputs_ait = Tensor([batch_dim, seqlen, dim], name="input0", is_input=True)
         inputs_ait_k = Tensor([batch_dim, seqlen_kv, dim], name="input1", is_input=True)
@@ -107,7 +111,7 @@ def _test_mha(
             pt_ys = pt_ys + input_pt
             print("pt output:", pt_ys.shape)
 
-            inputs = [input_pt, input_pt_k, input_pt_v]
+            inputs = {"input0": input_pt, "input1": input_pt_k, "input2": input_pt_v}
             ys = [torch.empty(pt_ys.shape).cuda().half()]
             exe_module.run_with_tensors(inputs, ys)
             eps = 1e-2
@@ -120,6 +124,7 @@ def _test_mha(
             print("Batch {} MHA verification pass".format(batch_size))
 
     def test_cross_attn(self):
+        self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
         self._test_mha(
             batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
         )
diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
new file mode 100644
index 000000000..1d7d7d72f
--- /dev/null
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -0,0 +1,336 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for vanilla_attenion.
+"""
+import math
+import os
+import unittest
+
+import torch
+import torch.nn.functional as F
+
+from aitemplate.compiler import compile_model, Model
+from aitemplate.frontend import nn, Tensor
+from aitemplate.frontend.nn.vanilla_attention import vanilla_attention
+from aitemplate.testing import detect_target
+from aitemplate.utils import logger, shape_utils
+from einops import rearrange
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+def attention_ref(qkv, attn_mask, dropout_p, upcast=False, causal=False):
+    """
+    Arguments:
+        qkv: (batch_size, seqlen, 3, nheads, head_dim)
+        attn_mask: (batch_size, seqlen)
+        dropout_p: float
+    Output:
+        output: (batch_size, seqlen, nheads, head_dim)
+        attention: softmax after dropout
+    """
+    q, k, v = (qkv.float() if upcast else qkv).unbind(dim=2)
+    seqlen = qkv.shape[1]
+    d = qkv.shape[-1]
+    scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
+    if attn_mask is not None:
+        scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf"))
+    if causal:
+        causal_mask = torch.triu(
+            torch.ones(seqlen, seqlen, dtype=torch.bool, device=qkv.device), 1
+        )
+        scores.masked_fill_(causal_mask, float("-inf"))
+    attention = torch.softmax(scores, dim=-1)
+    attention_drop = F.dropout(attention, dropout_p)
+    output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
+    return output.to(dtype=qkv.dtype)
+
+
+def attention_pt(X_pt, W_pt, B_pt, nheads, d, seqlen):
+    qkv_pt = torch.nn.functional.linear(
+        X_pt, W_pt, bias=B_pt
+    )  # [4096*3, 256] *[768, 256]
+    qkv_pt = torch.reshape(
+        qkv_pt, [1, seqlen, 3, nheads, d]
+    )  # [4096*3, 768] -> [1, 4096, 3, 12, 64]
+    qkv_pt = torch.permute(qkv_pt, [2, 0, 3, 1, 4])  # [3, 1, 12, 4096, 64]
+
+    q_pt, k_pt, v_pt = torch.split(qkv_pt, 1, dim=0)  # [1, 1, 12, 4096, 64]
+    scale_pt = torch.tensor(64**-0.5)
+    q_pt = q_pt * (scale_pt)
+    # #[12, 4096, 64] * [12, 64, 4096] => [12, 4096, 4096]
+    attn_pt = torch.bmm(
+        (torch.reshape(q_pt, [nheads, -1, d])),
+        (torch.transpose(torch.reshape(k_pt, [nheads, -1, d]), 2, 1)),
+    )  # [12,4096,4096]
+    attn_pt = torch.softmax(attn_pt, dim=-1)  # [12,4096,4096]
+    v_pt = torch.reshape(v_pt, [nheads, -1, d])  # [12, 4096, 64]
+    y_pt = torch.bmm(attn_pt, v_pt)  # [12, 4096, 64]
+    y_pt = torch.reshape(y_pt, [1, nheads, seqlen, d])
+    Y_pt = torch.permute(y_pt, [0, 2, 1, 3]).cuda().half()  # [1,4096,12,64]
+    return Y_pt
+
+
+class vanillaAttentionTestCase(unittest.TestCase):
+    def _test_vanilla_attention(
+        self,
+        batch_size=16,
+        nheads=16,
+        seqlen=1024,
+        n=1024,
+        causal=False,
+        dtype=torch.float16,
+        device="cuda",
+        test_name="attention",
+        rebuild=True,
+        benchmark_ait=False,
+        benchmark_pt=False,
+    ):
+        head_size = n // nheads
+
+        x = torch.randn(
+            batch_size, seqlen, n, device="cuda", dtype=dtype, requires_grad=True
+        )
+        Wqkv = torch.nn.Linear(
+            nheads * head_size, 3 * nheads * head_size, device=device, dtype=dtype
+        )
+        qkv = (
+            rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
+            .detach()
+            .requires_grad_()
+        )
+        q, k, v = torch.split(qkv, 1, dim=2)
+        q, k, v = (
+            q.squeeze(2),
+            k.squeeze(2),
+            v.squeeze(2),
+        )  # batch_size, seqlen, nheads, head_size
+        output = attention_ref(qkv, None, 0, causal=causal)
+        y_pt = output.detach()
+        y_pt = y_pt.reshape(batch_size, seqlen, nheads * head_size)
+        print(f"{y_pt.shape=}")
+
+        Q = Tensor(
+            shape=[batch_size, seqlen, nheads, head_size],
+            dtype="float16",
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size, seqlen, nheads, head_size],
+            dtype="float16",
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size, seqlen, nheads, head_size],
+            dtype="float16",
+            name="v",
+            is_input=True,
+        )
+
+        from aitemplate.compiler.base import _TorchConstantTensorData
+
+        causal_mask = None
+        if causal:
+            mask = torch.triu(
+                torch.ones(seqlen, seqlen, dtype=torch.bool, device=qkv.device), 1
+            )
+            causal_mask_pt = torch.zeros(
+                seqlen, seqlen, dtype=qkv.dtype, device=qkv.device
+            )
+            causal_mask_pt.masked_fill_(mask, float("-inf"))
+            causal_mask_pt = causal_mask_pt.unsqueeze(0)
+
+            causal_mask = Tensor(
+                shape=[1, seqlen, seqlen],
+                dtype="float16",
+                name="causal_mask",
+            )
+            causal_mask._bind_data(_TorchConstantTensorData(causal_mask_pt))
+        Y = vanilla_attention(Q, K, V, attn_mask=causal_mask)
+
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        if rebuild:
+            target = detect_target()
+            module = compile_model(Y, target, "./tmp", test_name)
+        else:
+            module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+        inputs = {
+            "q": q.detach().half().cuda().contiguous(),
+            "k": k.detach().half().cuda().contiguous(),
+            "v": v.detach().half().cuda().contiguous(),
+        }
+
+        y = torch.empty([batch_size, seqlen, nheads * head_size]).cuda().half()
+        module.run_with_tensors(inputs, [y])
+
+        if benchmark_ait:
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark AIT
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
+            )
+            logger.info(
+                __file__, "benchmark vanilla-attn time: {0}".format(time_per_iter_ms)
+            )
+
+        self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
+
+        if benchmark_pt:
+            from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            func = attention_ref
+            args = (
+                qkv.cuda().half(),
+                None,
+                0,
+                False,
+                False,
+            )
+            duration = benchmark_torch_function(100, func, *args)
+            print(
+                f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
+            )
+
+    def test_vanilla_attention(self):
+        self._test_vanilla_attention(test_name="vanilla_attention")
+        self._test_vanilla_attention(test_name="vanilla_attention_causal", causal=True)
+
+    def _test_mha(
+        self,
+        batch_sizes,
+        seqlen=1,
+        seqlen_kv=62,
+        dim=4,
+        num_heads=2,
+        use_fp16_acc=False,
+        benchmark_ait=False,
+    ):
+        pt_mod = (
+            torch.nn.MultiheadAttention(
+                embed_dim=dim,
+                num_heads=num_heads,
+                batch_first=True,
+            )
+            .cuda()
+            .half()
+        )
+        pt_mod = pt_mod.eval()
+
+        pt_params = dict(pt_mod.named_parameters())
+        params_ait = {}
+        for key, arr in pt_params.items():
+            if "in_proj" in key:
+                if len(arr.shape) == 2:
+                    w_q, w_k, w_v = arr.chunk(3)
+                    params_ait["proj_q_weight"] = w_q
+                    params_ait["proj_k_weight"] = w_k
+                    params_ait["proj_v_weight"] = w_v
+                else:
+                    b_q, b_k, b_v = arr.chunk(3)
+                    params_ait["proj_q_bias"] = b_q
+                    params_ait["proj_k_bias"] = b_k
+                    params_ait["proj_v_bias"] = b_v
+
+            else:
+                params_ait[key.replace(".", "_").replace("out_proj", "proj")] = arr
+
+        ait_mod = nn.VanillaCrossAttention(
+            dim=dim,
+            seq_len=seqlen,
+            seq_len_kv=seqlen_kv,
+            num_heads=num_heads,
+            qkv_bias=True,
+            has_residual=False,
+        )
+        ait_mod.name_parameter_tensor()
+
+        if len(batch_sizes) == 1:
+            # static
+            batch_dim = batch_sizes[0]
+        else:
+            batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_size")
+
+        inputs_ait = Tensor([batch_dim, seqlen, dim], name="input0", is_input=True)
+        inputs_ait_k = Tensor([batch_dim, seqlen_kv, dim], name="input1", is_input=True)
+        inputs_ait_v = Tensor([batch_dim, seqlen_kv, dim], name="input2", is_input=True)
+        Y = ait_mod(inputs_ait, inputs_ait_k, inputs_ait_v)
+        Y = Y + inputs_ait
+        mark_output(Y)
+        target = detect_target(use_fp16_acc=False)
+        exe_module = compile_model(Y, target, "./tmp", "cross_attn_dynamic")
+        for name, weight in params_ait.items():
+            exe_module.set_constant_with_tensor(name, weight)
+
+        for batch_size in batch_sizes:
+            input_pt = torch.randn([batch_size, seqlen, dim]).cuda().half()
+            if seqlen == seqlen_kv:
+                input_pt_k = input_pt
+                input_pt_v = input_pt
+            else:
+                input_pt_k = torch.randn([batch_size, seqlen_kv, dim]).cuda().half()
+                input_pt_v = torch.randn([batch_size, seqlen_kv, dim]).cuda().half()
+
+            pt_ys, _ = pt_mod(input_pt, input_pt_k, input_pt_v)
+            pt_ys = pt_ys + input_pt
+            print("pt output:", pt_ys.shape)
+
+            inputs = {"input0": input_pt, "input1": input_pt_k, "input2": input_pt_v}
+            ys = [torch.empty(pt_ys.shape).cuda().half()]
+            exe_module.run_with_tensors(inputs, ys)
+            self.assertTrue(torch.allclose(pt_ys, ys[0], atol=1e-2, rtol=1e-2))
+            print("Batch {} MHA verification pass".format(batch_size))
+
+            if benchmark_ait:
+                # Benchmark AIT
+                time_per_iter_ms, time_std, _ = exe_module.benchmark_with_tensors(
+                    inputs,
+                    ys,
+                    count=100,
+                )
+                logger.info(
+                    __file__, "benchmark cross-attn time: {0}".format(time_per_iter_ms)
+                )
+
+    def test_cross_attn(self):
+        self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
+        self._test_mha(
+            batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
+        )
+        self._test_mha(
+            batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From ce8842c8757f3e47903ab8029752a4b27fffc68a Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Wed, 18 Jan 2023 09:34:47 -0800
Subject: [PATCH 015/638] remove bing as maintianer (#137)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 38330592d..d08a04f0e 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute
 
 AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang.
 
-AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine), [Mu-Chu Lee](https://github.com/muchulee8) and [Bing Xu](https://github.com/antinucleon).
+AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine) and [Mu-Chu Lee](https://github.com/muchulee8).
 
 
 ## Acknowledgement

From 5674cac88e7e9555c85910ab2ecc1c6515c70395 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 19 Jan 2023 14:23:45 +0800
Subject: [PATCH 016/638] Update ait_ci.yml, diffusers=0.11.1

---
 .github/workflows/ait_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index f4d0463ef..0f790af9d 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -114,7 +114,7 @@ jobs:
       run: |
         echo "Running Stable Diffusion tests"
         cd $GITHUB_WORKSPACE/AITemplate/examples/05_stable_diffusion
-        python3 -m pip install transformers click torch diffusers accelerate
+        python3 -m pip install transformers click torch diffusers=0.11.1 accelerate
         # populate log headers
         export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
         echo -n "hostname: ">sdiff.log; hostname >> sdiff.log

From 593f147a3ec89b496f8bfd80c6b43a655d52b233 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 19 Jan 2023 16:15:42 +0800
Subject: [PATCH 017/638] Update ait_ci.yml with diffusers==0.11.1

---
 .github/workflows/ait_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index 0f790af9d..0f598865f 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -114,7 +114,7 @@ jobs:
       run: |
         echo "Running Stable Diffusion tests"
         cd $GITHUB_WORKSPACE/AITemplate/examples/05_stable_diffusion
-        python3 -m pip install transformers click torch diffusers=0.11.1 accelerate
+        python3 -m pip install transformers click torch diffusers==0.11.1 accelerate
         # populate log headers
         export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
         echo -n "hostname: ">sdiff.log; hostname >> sdiff.log

From 7bbde4c7ec6efc26db750113b0b652a7859d75be Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 26 Jan 2023 16:50:05 +0800
Subject: [PATCH 018/638] format code

---
 examples/process_results.py                   | 336 ++++++++++++------
 .../rocm/conv2d/conv2d_bias_sigmoid.py        |   2 +-
 tests/unittest/ops/test_conv_bias_add_relu.py |   4 -
 3 files changed, 222 insertions(+), 120 deletions(-)

diff --git a/examples/process_results.py b/examples/process_results.py
index 19f013637..20963e51f 100644
--- a/examples/process_results.py
+++ b/examples/process_results.py
@@ -1,12 +1,16 @@
 #!/usr/bin/env python3
-import glob,os, io, argparse, datetime
-#import numpy as np
-import sqlalchemy
-from sqlalchemy.types import NVARCHAR, Float, Integer
-import pymysql
+import datetime
+import glob
+import io
+import os
+
 import pandas as pd
+
+# import numpy as np
+import sqlalchemy
 from sshtunnel import SSHTunnelForwarder
 
+
 def print_to_string(*args, **kwargs):
     output = io.StringIO()
     print(*args, file=output, **kwargs)
@@ -14,134 +18,214 @@ def print_to_string(*args, **kwargs):
     output.close()
     return contents
 
+
 def get_logfiles():
-    path = r'./**/*.log'
+    path = r"./**/*.log"
     files = glob.glob(path, recursive=True)
     files.sort()
     return files
 
+
 def get_log_params(logfile):
-    branch_name=' '
-    commit= ' '
-    node_id=' '
-    gpu_arch=' '
-    compute_units=0
-    ngpus=0
-    rocm_vers=' '
-    compiler_vers='release'
+    branch_name = " "
+    commit = " "
+    node_id = " "
+    gpu_arch = " "
+    compute_units = 0
+    ngpus = 0
+    rocm_vers = " "
+    compiler_vers = "release"
     for line in open(logfile):
-         if 'git_branch' in line:
-             lst=line.split()
-             branch_name=lst[1]
-         if 'commit' in line:
-             lst=line.split()
-             commit=lst[1]
-         if 'hostname' in line:
-             lst=line.split()
-             node_id=lst[1]
-         if 'GPU_arch' in line:
-             lst=line.split()
-             gpu_arch=lst[2]
-         if 'Name:                    gfx' in line:
-             ngpus=ngpus+1
-         if 'Compute Unit' in line:
-             lst=line.split()
-             compute_units=lst[2]
-         if 'InstalledDir' in line:
-             lst=line.split()
-             rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
-    return branch_name, commit, node_id, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers
+        if "git_branch" in line:
+            lst = line.split()
+            branch_name = lst[1]
+        if "commit" in line:
+            lst = line.split()
+            commit = lst[1]
+        if "hostname" in line:
+            lst = line.split()
+            node_id = lst[1]
+        if "GPU_arch" in line:
+            lst = line.split()
+            gpu_arch = lst[2]
+        if "Name:                    gfx" in line:
+            ngpus = ngpus + 1
+        if "Compute Unit" in line:
+            lst = line.split()
+            compute_units = lst[2]
+        if "InstalledDir" in line:
+            lst = line.split()
+            rocm_vers = lst[1][
+                lst[1].find("/opt/rocm-")
+                + len("/opt/rocm-") : lst[1].rfind("/llvm/bin")
+            ]
+    return (
+        branch_name,
+        commit,
+        node_id,
+        gpu_arch,
+        compute_units,
+        ngpus,
+        rocm_vers,
+        compiler_vers,
+    )
+
 
 def parse_logfile(files):
-    glue=''
-    res=[]
-    tests=[]
+    # glue = ""
+    res = []
+    # tests = []
     for logfile in files:
-       if 'resnet50' in logfile or 'vit.log' in logfile:
-          init_bs=0
-          for line in open(logfile):
-              if 'batch_size:' in line:
-                 lst=line.split()
-                 lst[1]=int(lst[1].replace(',',''))
-                 if lst[1]>init_bs: #only grab first 9 results for different batch sizes from these tests
-                    init_bs=lst[1]
-                    res.append(lst[3])
-       if 'bert.log' in logfile:
-           for line in open(logfile):
-              if 'batch_size:' in line: #grab all 45 results from these tests
-                 lst=line.split()
-                 res.append(lst[5])
-       if 'sdiff.log' in logfile:
-           for line in open(logfile):
-              if 'sd e2e:' in line: #results for stable diffusion
-                lst=line.split()
-                res.append(lst[2])
+        if "resnet50" in logfile or "vit.log" in logfile:
+            init_bs = 0
+            for line in open(logfile):
+                if "batch_size:" in line:
+                    lst = line.split()
+                    lst[1] = int(lst[1].replace(",", ""))
+                    if (
+                        lst[1] > init_bs
+                    ):  # only grab first 9 results for different batch sizes from these tests
+                        init_bs = lst[1]
+                        res.append(lst[3])
+        if "bert.log" in logfile:
+            for line in open(logfile):
+                if "batch_size:" in line:  # grab all 45 results from these tests
+                    lst = line.split()
+                    res.append(lst[5])
+        if "sdiff.log" in logfile:
+            for line in open(logfile):
+                if "sd e2e:" in line:  # results for stable diffusion
+                    lst = line.split()
+                    res.append(lst[2])
     return res
 
+
 def get_baseline(table, connection):
-    query = '''SELECT * from '''+table+''' WHERE Datetime = (SELECT MAX(Datetime) FROM '''+table+''' where git_branch='amd-develop' );'''
+    query = (
+        """SELECT * from """
+        + table
+        + """ WHERE Datetime = (SELECT MAX(Datetime) FROM """
+        + table
+        + """ where git_branch='amd-develop' );"""
+    )
     return pd.read_sql_query(query, connection)
 
-def store_new_test_result(table_name, test_results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, connection):
-    params=[str(node_id),str(branch_name),str(commit),str(gpu_arch),compute_units,ngpus,str(rocm_vers),str(compiler_vers),str(datetime.datetime.now())]
-    df=pd.DataFrame(data=[params],columns=['hostname','git_branch','git_commit','GPU_arch','Compute_units','number_of_gpus','ROCM_version','compiler_version','Datetime'])
-    df_add=pd.DataFrame(data=[test_results],columns=testlist)
-    df=pd.concat([df,df_add],axis=1)
-    print("new test results dataframe:",df)
-    df.to_sql(table_name,connection,if_exists='append',index=False)
+
+def store_new_test_result(
+    table_name,
+    test_results,
+    testlist,
+    node_id,
+    branch_name,
+    commit,
+    gpu_arch,
+    compute_units,
+    ngpus,
+    rocm_vers,
+    compiler_vers,
+    connection,
+):
+    params = [
+        str(node_id),
+        str(branch_name),
+        str(commit),
+        str(gpu_arch),
+        compute_units,
+        ngpus,
+        str(rocm_vers),
+        str(compiler_vers),
+        str(datetime.datetime.now()),
+    ]
+    df = pd.DataFrame(
+        data=[params],
+        columns=[
+            "hostname",
+            "git_branch",
+            "git_commit",
+            "GPU_arch",
+            "Compute_units",
+            "number_of_gpus",
+            "ROCM_version",
+            "compiler_version",
+            "Datetime",
+        ],
+    )
+    df_add = pd.DataFrame(data=[test_results], columns=testlist)
+    df = pd.concat([df, df_add], axis=1)
+    print("new test results dataframe:", df)
+    df.to_sql(table_name, connection, if_exists="append", index=False)
     return 0
 
-def compare_test_to_baseline(baseline,test,testlist):
-    regression=0
+
+def compare_test_to_baseline(baseline, test, testlist):
+    regression = 0
     if not baseline.empty:
-        base=baseline[testlist].to_numpy(dtype='float')
-        base_list=base[0]
-        ave_perf=0
+        base = baseline[testlist].to_numpy(dtype="float")
+        base_list = base[0]
+        ave_perf = 0
         for i in range(len(base_list)):
             # success criterion:
-            if base_list[i]>1.01*float(test[i]):
-                print("test # ",i,"shows regression by {:.3f}%".format(
-                    (float(test[i])-base_list[i])/base_list[i]*100))
-                regression=1
-            if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i]
-        if regression==0:
+            if base_list[i] > 1.01 * float(test[i]):
+                print(
+                    "test # ",
+                    i,
+                    "shows regression by {:.3f}%".format(
+                        (float(test[i]) - base_list[i]) / base_list[i] * 100
+                    ),
+                )
+                regression = 1
+            if base_list[i] > 0:
+                ave_perf = ave_perf + float(test[i]) / base_list[i]
+        if regression == 0:
             print("no regressions found")
-        ave_perf=ave_perf/len(base_list)
-        print("average performance relative to baseline:",ave_perf)
+        ave_perf = ave_perf / len(base_list)
+        print("average performance relative to baseline:", ave_perf)
     else:
         print("could not find a baseline")
     return regression
 
 
 def main():
-    files=get_logfiles()
-    results=[]
-    baseline=[]
-    testlist=[]
-    #parse the test parameters from the logfile
+    files = get_logfiles()
+    results = []
+    baseline = []
+    testlist = []
+    # parse the test parameters from the logfile
     for filename in files:
-        branch_name, commit, node_id, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers = get_log_params(filename)
-
-    print("Branch name:",branch_name)
-    print("Git_commit:",commit)
-    print("Node name:",node_id)
-    print("GPU_arch:",gpu_arch)
-    print("Compute units:",compute_units)
-    print("ROCM_version:",rocm_vers)
-    print("Compiler_version:",compiler_vers)
-    #parse results, get the Tflops value for "Best Perf" kernels
-    results=parse_logfile(files)
-
-    print("Number of tests:",len(results))
-    sql_hostname = '127.0.0.1'
+        (
+            branch_name,
+            commit,
+            node_id,
+            gpu_arch,
+            compute_units,
+            ngpus,
+            rocm_vers,
+            compiler_vers,
+        ) = get_log_params(filename)
+
+    print("Branch name:", branch_name)
+    print("Git_commit:", commit)
+    print("Node name:", node_id)
+    print("GPU_arch:", gpu_arch)
+    print("Compute units:", compute_units)
+    print("ROCM_version:", rocm_vers)
+    print("Compiler_version:", compiler_vers)
+    # parse results, get the Tflops value for "Best Perf" kernels
+    results = parse_logfile(files)
+
+    print("Number of tests:", len(results))
+    sql_hostname = "127.0.0.1"
     sql_username = os.environ["dbuser"]
     sql_password = os.environ["dbpassword"]
-    sql_main_database = 'sys'
+    sql_main_database = "sys"
     sql_port = 3306
     hostname = os.uname()[1]
-    if hostname == 'jwr-amd-132':
-        sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.
-            format(sql_username, sql_password, sql_hostname, sql_main_database))
+    if hostname == "jwr-amd-132":
+        sqlEngine = sqlalchemy.create_engine(
+            "mysql+pymysql://{0}:{1}@{2}/{3}".format(
+                sql_username, sql_password, sql_hostname, sql_main_database
+            )
+        )
         conn = sqlEngine.connect()
     else:
         ssh_host = os.environ["dbsship"]
@@ -152,23 +236,45 @@ def main():
             (ssh_host, ssh_port),
             ssh_username=ssh_user,
             ssh_password=ssh_pass,
-            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
-                sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
-                    format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
+            remote_bind_address=(sql_hostname, sql_port),
+        ) as tunnel:
+            sqlEngine = sqlalchemy.create_engine(
+                "mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(
+                    sql_username,
+                    sql_password,
+                    sql_hostname,
+                    tunnel.local_bind_port,
+                    sql_main_database,
+                )
+            )
         conn = sqlEngine.connect()
-    #save gemm performance tests:
-    for i in range(1,len(results)+1):
-        testlist.append("Test%i"%i)
-    table_name="ait_performance"
-        
-    baseline = get_baseline(table_name,conn)
-    store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn)
+    # save gemm performance tests:
+    for i in range(1, len(results) + 1):
+        testlist.append("Test%i" % i)
+    table_name = "ait_performance"
+
+    baseline = get_baseline(table_name, conn)
+    store_new_test_result(
+        table_name,
+        results,
+        testlist,
+        node_id,
+        branch_name,
+        commit,
+        gpu_arch,
+        compute_units,
+        ngpus,
+        rocm_vers,
+        compiler_vers,
+        conn,
+    )
     conn.close()
 
-    #compare the results to the baseline if baseline exists
-    regression=0
-    regression=compare_test_to_baseline(baseline,results,testlist)
+    # compare the results to the baseline if baseline exists
+    regression = 0
+    regression = compare_test_to_baseline(baseline, results, testlist)
     return regression
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index e78512c80..639bb21fb 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -35,7 +35,7 @@
 struct AddSigmoid
 {
     template <typename T>
-    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;\    
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;   
     template <>
     __host__ __device__ constexpr void
     operator()<float>(float& y, const float& x0, const float& x1) const
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index f13474835..7511b4f6b 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -68,10 +68,6 @@ def test_fp16(self):
         self._test_fp16()
         self._test_fp16(copy_op=True)
 
-    def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
-
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 8d7f5b9eafb3d90aa8ce65b49a366468f790b4fe Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 26 Jan 2023 21:37:35 +0800
Subject: [PATCH 019/638] fix profile bug

---
 python/aitemplate/backend/profiler_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 3fc5fa3b4..ec3c4e8e8 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -263,7 +263,8 @@ def callback_when_done(fut):
                     logger.debug(
                         __name__, f"Failed to extract profiler result for {cmds}"
                     )
-                process_result_callback(profile_result, self._postprocessing_delegate)
+                else:
+                    process_result_callback(profile_result, self._postprocessing_delegate)
             finally:
                 # unblock one future in `join()`
                 self._done_queue.put(stdout)

From d6b31fd992f0a9c09b5f4646a8d0a339c5d16c46 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 27 Jan 2023 01:06:41 +0800
Subject: [PATCH 020/638] fix embeddings bug

---
 .../backend/rocm/embedding/bert_embeddings.py | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
index 110aea2d9..5c10eedc1 100644
--- a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
@@ -31,25 +31,30 @@
 #include "logging.h"
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #define EMBEDDING_DIM {{embedding_dim}}
 
+using EmbElementwiseOperation = ck::tensor_operation::element_wise::AddAdd;
+using EmbType = {{elem_input_type}};
+using IndexType = {{index_type}};
+
 {{func_signature}}
 {
-  auto device_instance = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<{{elem_input_type}}, {{index_type}}, {{elem_input_type}}, {{elem_input_type}}, float, {{elem_input_type}}, 256, 1, 256, 1, EMBEDDING_DIM, 1, {{row_v_size}}, 3>{};
+  auto device_instance = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, EmbType, EmbType, float, EmbType, EmbElementwiseOperation, 256, 1, 256, 1, EMBEDDING_DIM, 1, {{row_v_size}}, 3>{};
   auto argument_ptr = device_instance.MakeArgumentPointer(output,
-                                                          word_embeddings,
-                                                          token_type_embeddings,
-                                                          position_embeddings,
-                                                          input_ids,
-                                                          token_type_ids,
-                                                          position_ids,
+                                                          {ck::type_convert<EmbType*>(word_embeddings),
+                                                          ck::type_convert<EmbType*>(token_type_embeddings),
+                                                          ck::type_convert<EmbType*>(position_embeddings)},
+                                                          {ck::type_convert<IndexType*>(input_ids),
+                                                          ck::type_convert<IndexType*>(token_type_ids),
+                                                          ck::type_convert<IndexType*>(position_ids)},
                                                           gamma,
                                                           beta,
-                                                          8,
                                                           EMBEDDING_DIM,
                                                           indices_num,
-                                                          eps);
+                                                          eps,
+                                                          EmbElementwiseOperation{});
   if(!device_instance.IsSupportedArgument(argument_ptr.get())){
     LOG(FATAL) << "wrong! " << device_instance.GetTypeString() << " with the specified compilation parameters does not support this Embedding problem.";
   }

From 116bae518d0b25e6fcb219cd55e05c4f2a78e6ef Mon Sep 17 00:00:00 2001
From: Ying Zhang <ipiszy@users.noreply.github.com>
Date: Sun, 29 Jan 2023 19:35:50 -0700
Subject: [PATCH 021/638] release v0.2 (#151)

---
 .gitmodules                                   |    3 +
 3rdparty/cutlass                              |    2 +-
 3rdparty/picojson                             |    1 +
 docs/source/reference/env.rst                 |    2 +
 examples/04_vit/test_correctness.py           |  184 +
 examples/05_stable_diffusion/README.md        |   49 +-
 .../05_stable_diffusion/scripts/compile.py    |   89 +
 examples/05_stable_diffusion/scripts/demo.py  |   55 +
 .../scripts/demo_img2img.py                   |   72 +
 .../scripts/download_pipeline.py              |   38 +
 examples/05_stable_diffusion/src/__init__.py  |   14 +
 examples/05_stable_diffusion/src/benchmark.py |  309 ++
 .../05_stable_diffusion/src/benchmark_pt.py   |   50 +
 .../src/compile_lib/__init__.py               |    0
 .../src/compile_lib/compile_clip.py           |  118 +
 .../src/compile_lib/compile_unet.py           |   87 +
 .../src/compile_lib/compile_vae.py            |  140 +
 .../src/compile_lib/util.py                   |   22 +
 .../src/modeling/attention.py                 |  105 +
 .../05_stable_diffusion/src/modeling/clip.py  |  587 +++
 .../src/modeling/embeddings.py                |  101 +
 .../src/modeling/resnet.py                    |  238 ++
 .../src/modeling/unet_2d_condition.py         |  255 ++
 .../src/modeling/unet_blocks.py               |  762 ++++
 .../05_stable_diffusion/src/modeling/vae.py   |  153 +
 .../src/pipeline_stable_diffusion_ait.py      |  410 +++
 .../pipeline_stable_diffusion_img2img_ait.py  |  403 +++
 .../src/test_correctness.py                   |  137 +
 .../06_how_to_add_an_op/how_to_add_an_op.py   |   48 +-
 fx2ait/CMakeLists.txt                         |   19 +
 fx2ait/README.md                              |   47 +
 fx2ait/fx2ait/TARGETS                         |   41 +
 fx2ait/fx2ait/__init__.py                     |   29 +
 fx2ait/fx2ait/acc_tracer/__init__.py          |   43 +
 fx2ait/fx2ait/acc_tracer/acc_normalizer.py    |  465 +++
 fx2ait/fx2ait/acc_tracer/acc_op_properties.py |   50 +
 fx2ait/fx2ait/acc_tracer/acc_ops.py           | 3215 +++++++++++++++++
 fx2ait/fx2ait/acc_tracer/acc_shape_prop.py    |  124 +
 fx2ait/fx2ait/acc_tracer/acc_tracer.py        |  626 ++++
 fx2ait/fx2ait/acc_tracer/acc_utils.py         |  201 ++
 .../fx2ait/acc_tracer/ait_acc_normalizer.py   |   55 +
 fx2ait/fx2ait/acc_tracer/ait_acc_ops.py       |   26 +
 .../fx2ait/acc_tracer/ait_acc_ops_registry.py |   92 +
 fx2ait/fx2ait/ait_module.py                   |   41 +
 fx2ait/fx2ait/ait_splitter.py                 |  134 +
 fx2ait/fx2ait/cache.py                        |   13 +
 fx2ait/fx2ait/converters/__init__.py          |    3 +
 fx2ait/fx2ait/converters/ait_converters.py    | 1091 ++++++
 .../converters/ait_module_converters.py       |   81 +
 .../fx2ait/converters/aten2ait_converters.py  | 1081 ++++++
 .../fx2ait/converters/converter_registry.py   |   19 +
 fx2ait/fx2ait/converters/utils.py             |  140 +
 fx2ait/fx2ait/csrc/AITModel.cpp               |  104 +
 fx2ait/fx2ait/csrc/AITModel.h                 |   64 +
 fx2ait/fx2ait/csrc/AITModelImpl.cpp           |  495 +++
 fx2ait/fx2ait/csrc/AITModelImpl.h             |  158 +
 fx2ait/fx2ait/csrc/TARGETS                    |   29 +
 .../example/01_transformer_model/README.md    |   76 +
 .../test_transformer_encoder.py               |   86 +
 .../fx2ait/example/02_vision_model/README.md  |   51 +
 .../02_vision_model/test_vision_model.py      |   59 +
 .../example/03_lowering_split/README.md       |   90 +
 .../example/03_lowering_split/test_lower.py   |  107 +
 fx2ait/fx2ait/example/benchmark_utils.py      |  197 +
 fx2ait/fx2ait/fx2ait.py                       |  321 ++
 fx2ait/fx2ait/lower/lower.py                  |  238 ++
 fx2ait/fx2ait/lower/lower_settings.py         |   68 +
 fx2ait/fx2ait/passes/lower_basic_pass_aten.py |  618 ++++
 fx2ait/fx2ait/tensor_spec.py                  |  218 ++
 fx2ait/fx2ait/test/TARGETS                    |   78 +
 fx2ait/fx2ait/test/__init__.py                |   21 +
 .../test_ait_transformer_model.py             |   60 +
 .../converters_model/test_ait_vision_model.py |   26 +
 .../test_ait_multihead_attention.py           |   65 +
 .../test_ait_adaptive_avg_pool2d.py           |   38 +
 .../test/converters/test_ait_avg_pool2d.py    |   33 +
 .../test/converters/test_ait_batch_norm.py    |   26 +
 .../test/converters/test_ait_binary_op.py     |  144 +
 .../fx2ait/test/converters/test_ait_chunk.py  |   29 +
 .../fx2ait/test/converters/test_ait_clamp.py  |   30 +
 .../fx2ait/test/converters/test_ait_common.py |  368 ++
 .../test/converters/test_ait_contiguous.py    |   18 +
 .../fx2ait/test/converters/test_ait_conv2d.py |   50 +
 .../fx2ait/test/converters/test_ait_expand.py |   30 +
 .../test/converters/test_ait_flatten.py       |   21 +
 .../fx2ait/test/converters/test_ait_gelu.py   |   46 +
 .../test/converters/test_ait_layer_norm.py    |   45 +
 .../test/converters/test_ait_leaky_relu.py    |   18 +
 .../test/converters/test_ait_linalg_norm.py   |   48 +
 .../fx2ait/test/converters/test_ait_linear.py |   34 +
 .../fx2ait/test/converters/test_ait_matmul.py |   78 +
 .../test/converters/test_ait_max_pool2d.py    |   33 +
 .../test/converters/test_ait_nan2num.py       |   28 +
 fx2ait/fx2ait/test/converters/test_ait_pow.py |   17 +
 .../fx2ait/test/converters/test_ait_reduce.py |   80 +
 .../test/converters/test_ait_sigmoid.py       |   15 +
 .../test/converters/test_ait_slice_tensor.py  |   76 +
 .../test/converters/test_ait_softmax.py       |   24 +
 .../test/converters/test_ait_squeeze.py       |   28 +
 .../fx2ait/test/converters/test_ait_tile.py   |   35 +
 .../test/converters/test_ait_unary_ops.py     |   48 +
 fx2ait/fx2ait/test/converters/test_ait_var.py |   46 +
 .../test_ait_adaptive_avg_pool2d_aten.py      |   84 +
 .../test_ait_avg_pool2d_aten.py               |   69 +
 .../test_ait_batch_norm_aten.py               |   55 +
 .../test_ait_binary_op_aten.py                |  115 +
 .../test/converters_aten/test_ait_cat_aten.py |   68 +
 .../converters_aten/test_ait_chunk_aten.py    |   46 +
 .../converters_aten/test_ait_clamp_aten.py    |   66 +
 .../converters_aten/test_ait_conv2d_aten.py   |  113 +
 .../converters_aten/test_ait_flatten_aten.py  |   64 +
 .../test_ait_layer_norm_aten.py               |  102 +
 .../converters_aten/test_ait_linear_aten.py   |   76 +
 .../converters_aten/test_ait_matmul_aten.py   |   74 +
 .../test_ait_max_pool2d_aten.py               |   69 +
 .../converters_aten/test_ait_model_aten.py    |   93 +
 .../converters_aten/test_ait_nan2num_aten.py  |   75 +
 .../converters_aten/test_ait_permute_aten.py  |   64 +
 .../test/converters_aten/test_ait_pow_aten.py |   59 +
 .../converters_aten/test_ait_reduce_aten.py   |  117 +
 .../converters_aten/test_ait_relu_aten.py     |   49 +
 .../converters_aten/test_ait_reshape_aten.py  |  110 +
 .../converters_aten/test_ait_size_aten.py     |   55 +
 .../test_ait_slice_tensor_aten.py             |  208 ++
 .../converters_aten/test_ait_split_aten.py    |   68 +
 .../converters_aten/test_ait_squeeze_aten.py  |  151 +
 .../test_ait_unary_ops_aten.py                |   53 +
 fx2ait/fx2ait/test/test_ait_lower.py          |   77 +
 fx2ait/fx2ait/test/test_fx2ait.py             |   85 +
 fx2ait/fx2ait/test/test_tensor_spec.py        |  105 +
 fx2ait/fx2ait/tools/ait_minimizer.py          |   86 +
 fx2ait/fx2ait/tools/common_aten2ait.py        |  399 ++
 fx2ait/fx2ait/tools/common_fx2ait.py          |  396 ++
 fx2ait/fx2ait/utils.py                        |   17 +
 fx2ait/setup.py                               |   95 +
 python/aitemplate/__init__.py                 |    2 +-
 python/aitemplate/backend/backend_spec.py     |  159 +-
 python/aitemplate/backend/builder.py          |  436 ++-
 python/aitemplate/backend/codegen.py          |   63 +-
 .../backend/common/concatenate_common.py      |   29 +-
 .../backend/common/elementwise_common.py      |   30 +-
 .../aitemplate/backend/common/split_common.py |   93 +-
 .../backend/common/tensor/argmax_common.py    |   36 +-
 .../common/tensor/permute0213_common.py       |  445 +++
 .../common/tensor/permute021_common.py        |  199 +-
 .../common/tensor/permute102_common.py        |  372 +-
 .../common/tensor/permute210_common.py        |   85 +-
 .../backend/common/tensor/slice_common.py     |   18 +-
 .../backend/common/tensor_accessor_codegen.py |   26 +-
 .../backend/common/upsampling2d_common.py     |   87 +-
 .../multi_level_roi_align_common.py           |   32 +-
 .../backend/common/vision_ops/nms_kernel.py   |   37 +-
 .../common/vision_ops/roi_align_common.py     |  106 +-
 .../cuda/attention/mem_eff_attention.py       |    4 +
 .../aitemplate/backend/cuda/conv2d/common.py  |  783 +++-
 .../conv2d/common_conv2d_bias_activation.py   |  388 +-
 .../common_conv2d_bias_add_activation.py      |  379 +-
 .../cuda/conv2d/common_conv2d_few_channels.py |  122 +-
 .../cuda/conv2d/common_transposed_conv2d.py   |   63 +
 .../aitemplate/backend/cuda/conv2d/conv2d.py  |  399 +-
 .../backend/cuda/conv2d/conv2d_bias.py        |   70 +-
 .../backend/cuda/conv2d/conv2d_bias_add.py    |  133 +-
 .../cuda/conv2d/conv2d_bias_add_hardswish.py  |  137 +-
 .../cuda/conv2d/conv2d_bias_add_relu.py       |  137 +-
 .../cuda/conv2d/conv2d_bias_few_channels.py   |  179 +-
 .../cuda/conv2d/conv2d_bias_hardswish.py      |   70 +-
 .../conv2d_bias_hardswish_few_channels.py     |   91 +-
 .../backend/cuda/conv2d/conv2d_bias_relu.py   |   70 +-
 .../conv2d/conv2d_bias_relu_few_channels.py   |   77 +-
 .../cuda/conv2d/conv2d_bias_sigmoid.py        |   70 +-
 .../backend/cuda/conv2d/conv2d_depthwise.py   |  383 +-
 .../cuda/conv2d/conv2d_depthwise_bias.py      |  335 +-
 .../backend/cuda/conv2d/transposed_conv2d.py  |  253 +-
 .../cuda/conv2d/transposed_conv2d_bias.py     |  262 +-
 .../aitemplate/backend/cuda/conv3d/common.py  |  102 +-
 .../aitemplate/backend/cuda/conv3d/conv3d.py  |  370 +-
 .../backend/cuda/conv3d/depthwise_conv3d.py   |   50 +-
 python/aitemplate/backend/cuda/cuda_common.py |    1 +
 .../backend/cuda/elementwise/custom_math.cuh  |  540 ++-
 .../cuda/elementwise/fused_elementwise.py     |    4 +-
 .../cuda/gemm_epilogue_vistor/__init__.py     |    6 +-
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |   40 +-
 .../gemm_epilogue_vistor/common_dual_gemm.py  |  127 +-
 .../gemm_epilogue_vistor/common_softmax.py    |   13 +-
 .../gemm_epilogue_vistor/dual_bmm_rrr_div.py  |  346 ++
 .../dual_gemm_rcr_fast_gelu.py                |   94 +-
 .../dual_gemm_rcr_silu.py                     |  109 +-
 .../gemm_rcr_bias_softmax.py                  |   25 +-
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  |   25 +-
 .../backend/cuda/gemm_special/bmm_rcr_n1.py   |  189 +-
 .../cuda/gemm_special/gemm_rrr_small_nk.py    |  102 +-
 .../backend/cuda/gemm_universal/__init__.py   |   11 +-
 .../backend/cuda/gemm_universal/bmm_ccr.py    |    9 +-
 .../backend/cuda/gemm_universal/bmm_common.py |  203 +-
 .../backend/cuda/gemm_universal/bmm_crr.py    |   87 +-
 .../cuda/gemm_universal/bmm_crr_add.py        |   25 +-
 .../backend/cuda/gemm_universal/bmm_rcr.py    |  156 +-
 .../cuda/gemm_universal/bmm_rcr_permute.py    |  124 +-
 .../backend/cuda/gemm_universal/bmm_rrr.py    |    9 +-
 .../cuda/gemm_universal/bmm_rrr_permute.py    |  134 +-
 .../backend/cuda/gemm_universal/common.py     |   68 +-
 .../cuda/gemm_universal/common_bias.py        |    3 +
 .../gemm_universal/common_bias_broadcast.py   |  135 +-
 .../backend/cuda/gemm_universal/gemm_rcr.py   |   64 +-
 .../cuda/gemm_universal/gemm_rcr_bias.py      |   64 +-
 .../gemm_rcr_bias_elementwise.py              |  135 +
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |   32 +-
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py |   32 +-
 .../gemm_universal/gemm_rcr_bias_hardswish.py |   32 +-
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py |   32 +-
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |   32 +-
 .../gemm_universal/gemm_rcr_bias_swish.py     |   32 +-
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py |   32 +-
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py |   32 +-
 .../cuda/gemm_universal/gemm_rcr_permute.py   |   50 +-
 .../gemm_universal/gemm_rcr_permute_elup1.py  |  209 ++
 .../backend/cuda/gemm_universal/gemm_rrr.py   |   41 +-
 .../cuda/gemm_universal/gemm_rrr_permute.py   |   41 +-
 .../cuda/gemm_universal/group_common_bias.py  |   24 +-
 .../cuda/gemm_universal/group_gemm_rcr.py     |   26 +-
 .../gemm_universal/group_gemm_rcr_bias.py     |    4 +-
 .../group_gemm_rcr_bias_relu.py               |    4 +-
 .../group_gemm_rcr_bias_sigmoid.py            |    4 +-
 .../backend/cuda/gemm_universal/layout.py     |   50 +
 .../cuda/gemm_universal/perm021fc_ccr.py      |    9 +-
 .../perm021fc_ccr_bias_permute.py             |    9 +-
 .../cuda/gemm_universal/perm021fc_crc.py      |    9 +-
 .../cuda/gemm_universal/perm102_bmm_rcr.py    |    9 +-
 .../cuda/gemm_universal/perm102_bmm_rrr.py    |    9 +-
 .../cuda/groupnorm/groupnorm_common.py        |    2 +-
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  251 +-
 .../layernorm_sigmoid_mul_kernel.cuh          |   19 +-
 .../backend/cuda/padding/nhwc3to4.py          |    2 +-
 .../backend/cuda/padding/pad_last_dim.py      |    2 +
 .../backend/cuda/pool2d/avg_pool2d.py         |   73 +-
 .../backend/cuda/pool2d/max_pool2d.py         |   55 +-
 .../backend/cuda/reduce/reduce_3d.py          |   10 +-
 .../backend/cuda/reduce/reduce_small_axis.py  |   46 +-
 python/aitemplate/backend/cuda/reduce/var.py  |   41 +-
 .../backend/cuda/reduce/vector_norm.py        |    2 +-
 .../backend/cuda/softmax/softmax.cuh          |   22 +-
 python/aitemplate/backend/cuda/target_def.py  |   35 +-
 .../backend/cuda/tensor/__init__.py           |    2 +
 .../backend/cuda/tensor/concatenate.py        |   36 +-
 .../backend/cuda/tensor/concatenate_fast.cuh  |  851 +++++
 .../backend/cuda/tensor/concatenate_fast.py   |  195 +
 .../backend/cuda/tensor/concatenate_tanh.py   |   34 +-
 .../aitemplate/backend/cuda/tensor/gather.py  |   18 +-
 .../backend/cuda/tensor/permute.cuh           |   13 +-
 .../aitemplate/backend/cuda/tensor/permute.py |   15 +-
 .../backend/cuda/tensor/permute021.py         |   14 +-
 .../backend/cuda/tensor/permute0213.py        |   87 +
 .../backend/cuda/tensor/permute102.py         |    6 +-
 .../backend/cuda/upsample/upsampling2d.py     |    3 +-
 .../backend/cuda/upsample/upsampling2d_add.py |    3 +-
 python/aitemplate/backend/cuda/utils.py       |    9 +-
 .../roi_ops/multi_level_roi_align.py          |    4 +-
 .../cuda/vision_ops/roi_ops/roi_align.py      |    5 +-
 python/aitemplate/backend/main_templates.py   |  356 +-
 python/aitemplate/backend/profiler_cache.py   |  196 +-
 python/aitemplate/backend/profiler_runner.py  |  109 +-
 python/aitemplate/backend/registry.py         |    4 +-
 python/aitemplate/backend/rocm/target_def.py  |   10 +-
 .../backend/rocm/tensor/__init__.py           |    1 +
 .../backend/rocm/tensor/permute021.py         |    6 +-
 .../backend/rocm/tensor/permute0213.py        |   86 +
 .../backend/rocm/tensor/permute102.py         |    6 +-
 python/aitemplate/backend/target.py           |   21 +-
 python/aitemplate/compiler/compiler.py        |   42 +-
 python/aitemplate/compiler/dtype.py           |   22 +
 python/aitemplate/compiler/model.py           |  113 +-
 .../ops/attention/mem_eff_attention.py        |    6 +-
 .../compiler/ops/common/elementwise.py        |   38 +-
 .../compiler/ops/common/epilogue.py           |    1 +
 .../compiler/ops/common/fused_elementwise.py  |   75 +-
 python/aitemplate/compiler/ops/common/math.py |    4 +
 .../compiler/ops/common/view_ops.py           |   44 +-
 .../ops/conv/common_conv2d_bias_activation.py |    2 +-
 .../conv/common_conv2d_bias_add_activation.py |    2 +-
 python/aitemplate/compiler/ops/conv/conv2d.py |  214 +-
 python/aitemplate/compiler/ops/conv/conv3d.py |  237 +-
 .../compiler/ops/conv/conv_common.py          |   87 +
 .../compiler/ops/conv/depthwise_conv3d.py     |    2 +-
 .../conv/special_conv2d_bias_activation.py    |    2 +-
 .../compiler/ops/conv/transposed_conv2d.py    |   57 +
 .../ops/conv/transposed_conv2d_bias.py        |    2 +-
 .../compiler/ops/embedding/bert_embeddings.py |   13 +-
 .../ops/gemm_epilogue_vistor/__init__.py      |    6 +-
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |    2 +-
 .../gemm_epilogue_vistor/dual_bmm_rrr_div.py  |   58 +
 .../dual_gemm_rcr_fast_gelu.py                |   10 +-
 .../dual_gemm_rcr_silu.py                     |   10 +-
 .../compiler/ops/gemm_special/bmm_rcr_n1.py   |    2 +-
 .../ops/gemm_special/bmm_rrr_k1_tanh.py       |    2 +-
 .../ops/gemm_special/gemm_rrr_small_nk.py     |    2 +-
 .../compiler/ops/gemm_universal/__init__.py   |    1 +
 .../compiler/ops/gemm_universal/bmm.py        |   50 +
 .../compiler/ops/gemm_universal/bmm_ccr.py    |    2 +-
 .../ops/gemm_universal/bmm_ccr_add.py         |    8 +
 .../compiler/ops/gemm_universal/bmm_crr.py    |    2 +-
 .../ops/gemm_universal/bmm_crr_add.py         |    8 +
 .../compiler/ops/gemm_universal/bmm_rcr.py    |    2 +-
 .../ops/gemm_universal/bmm_rcr_permute.py     |    2 +-
 .../compiler/ops/gemm_universal/bmm_rrr.py    |    2 +-
 .../ops/gemm_universal/bmm_rrr_add.py         |    8 +
 .../ops/gemm_universal/bmm_rrr_permute.py     |    2 +-
 .../ops/gemm_universal/bmm_softmax_bmm.py     |    4 +-
 .../gemm_universal/bmm_softmax_bmm_permute.py |    4 +-
 .../ops/gemm_universal/gemm_common.py         |   72 +-
 .../compiler/ops/gemm_universal/gemm_rcr.py   |    2 +-
 .../ops/gemm_universal/gemm_rcr_bias.py       |    2 +-
 .../gemm_universal/gemm_rcr_bias_broadcast.py |    4 +-
 .../gemm_universal/gemm_rcr_bias_permute.py   |    2 +-
 .../ops/gemm_universal/gemm_rcr_permute.py    |    2 +-
 .../gemm_universal/gemm_rcr_permute_elup1.py  |   28 +
 .../compiler/ops/gemm_universal/gemm_rrr.py   |    2 +-
 .../ops/gemm_universal/gemm_rrr_bias.py       |    2 +-
 .../gemm_universal/gemm_rrr_bias_permute.py   |    2 +-
 .../ops/gemm_universal/gemm_rrr_permute.py    |    2 +-
 .../ops/gemm_universal/group_gemm_rcr.py      |   15 +-
 .../ops/gemm_universal/perm021fc_ccr.py       |    5 +-
 .../ops/gemm_universal/perm021fc_ccr_bias.py  |    2 +-
 .../perm021fc_ccr_bias_permute.py             |    2 +-
 .../ops/gemm_universal/perm021fc_crc.py       |    2 +-
 .../ops/gemm_universal/perm021fc_crc_bias.py  |    2 +-
 .../ops/gemm_universal/perm102_bmm_rcr.py     |    2 +-
 .../gemm_universal/perm102_bmm_rcr_bias.py    |    2 +-
 .../ops/gemm_universal/perm102_bmm_rrr.py     |    2 +-
 .../gemm_universal/perm102_bmm_rrr_bias.py    |    2 +-
 .../compiler/ops/groupnorm/groupnorm.py       |   11 +-
 .../compiler/ops/layernorm/group_layernorm.py |    2 +-
 .../compiler/ops/layernorm/layernorm.py       |   18 +-
 .../compiler/ops/padding/nhwc_pad_common.py   |    2 +-
 .../compiler/ops/padding/pad_last_dim.py      |    2 +-
 python/aitemplate/compiler/ops/pool/pool2d.py |    2 +-
 .../compiler/ops/reduce/reduce_common.py      |   14 +-
 .../compiler/ops/softmax/softmax.py           |   12 +-
 .../compiler/ops/tensor/__init__.py           |    2 +
 .../aitemplate/compiler/ops/tensor/argmax.py  |   11 +-
 .../compiler/ops/tensor/batch_gather.py       |    2 +-
 .../compiler/ops/tensor/concatenate.py        |   29 +-
 .../aitemplate/compiler/ops/tensor/gather.py  |    6 +-
 .../aitemplate/compiler/ops/tensor/permute.py |   37 +-
 .../compiler/ops/tensor/permute021.py         |   37 +-
 .../compiler/ops/tensor/permute0213.py        |   96 +
 .../compiler/ops/tensor/permute102.py         |   48 +-
 .../compiler/ops/tensor/permute210.py         |    1 +
 .../aitemplate/compiler/ops/tensor/split.py   |   63 +-
 python/aitemplate/compiler/ops/tensor/topk.py |   10 +-
 .../compiler/ops/tensor/transpose.py          |   34 +
 .../compiler/ops/upsample/upsampling2d_add.py |    2 +-
 .../ops/upsample/upsampling_common.py         |    2 +-
 .../ops/vision_ops/nms/efficient_nms.py       |   25 +-
 .../compiler/ops/vision_ops/nms/nms.py        |   13 +-
 .../roi_ops/multi_level_roi_align.py          |    2 +-
 .../ops/vision_ops/roi_ops/roi_ops.py         |    2 +-
 python/aitemplate/compiler/tensor_accessor.py |   17 +-
 .../aitemplate/compiler/transform/__init__.py |    1 +
 .../compiler/transform/apply_padding.py       |   26 +-
 .../compiler/transform/constant_folding.py    |   13 +-
 .../compiler/transform/fuse_group_ops.py      |   98 +-
 .../compiler/transform/fuse_mm_elementwise.py |   14 +-
 .../transform/fuse_mm_elementwise_patterns.py |   11 +
 .../transform/fuse_mm_reshape_permute.py      |    8 +-
 .../aitemplate/compiler/transform/fuse_ops.py |  206 +-
 .../compiler/transform/fuse_parallel_gemms.py |    7 +-
 .../transform/fuse_permute_bmm_and_gemm.py    |   13 +-
 .../compiler/transform/fuse_split.py          |   37 +-
 .../compiler/transform/fuse_utils.py          |   24 +-
 .../compiler/transform/name_graph.py          |   15 +-
 .../compiler/transform/optimize_graph.py      |    2 +
 .../aitemplate/compiler/transform/profile.py  |   17 +-
 .../compiler/transform/profile_dynamic_dim.py |    7 +-
 .../compiler/transform/refine_graph.py        |   11 +-
 .../transform/split_large_concat_ops.py       |   16 +-
 .../transform/split_large_split_ops.py        |  113 +
 .../aitemplate/compiler/transform/toposort.py |   39 +-
 .../transform/transform_memory_ops.py         |   24 +-
 .../transform/transform_odd_alignment.py      |   14 +-
 .../transform_strided_op_and_view_op.py       |   11 +-
 .../transform/transform_strided_ops.py        |   20 +-
 .../transform/transform_strided_ops_utils.py  |    5 +-
 .../compiler/transform/transform_utils.py     |   10 +-
 python/aitemplate/frontend/nn/__init__.py     |    1 +
 python/aitemplate/frontend/nn/attention.py    |    5 -
 python/aitemplate/frontend/nn/conv3d.py       |  121 +
 python/aitemplate/frontend/nn/dual_gemm.py    |   17 +-
 python/aitemplate/frontend/nn/proposal.py     |   70 +-
 python/aitemplate/testing/benchmark_trt.py    |   59 +
 python/aitemplate/testing/detect_target.py    |   11 +-
 python/aitemplate/testing/test_utils.py       |   48 +-
 python/aitemplate/utils/__init__.py           |    2 +-
 python/aitemplate/utils/alignment.py          |   47 +-
 python/aitemplate/utils/debug_settings.py     |   42 +
 python/aitemplate/utils/graph_utils.py        |   16 +-
 python/aitemplate/utils/misc.py               |   43 +
 .../aitemplate/utils/mk_ck_lib/generator.py   |    2 +
 .../utils/mk_cutlass_lib/extra_enum.py        |   19 +-
 .../utils/mk_cutlass_lib/extra_gemm_emit.py   |   20 +-
 .../utils/mk_cutlass_lib/mk_cutlass_lib.py    |    2 +
 .../utils/serialization/ait_program.py        |    8 +-
 .../utils/serialization/serdes_code.py        |    6 +
 python/aitemplate/utils/torch_utils.py        |   47 +-
 .../utils/visualization/op_attr_factory.py    |    4 +
 python/aitemplate/utils/visualization/plot.py |    9 +-
 static/csrc/model_container.cpp               |   54 +-
 static/csrc/model_interface.cpp               |   19 +
 static/include/cuda_device_functions.h        |   20 +
 static/include/macros.h                       |    7 +
 static/include/model.h                        |  304 ++
 static/include/model_container.h              |   13 +-
 static/include/model_interface.h              |   13 +
 static/include/raii_wrapper.h                 |   13 +
 static/include/rocm_device_functions.h        |   19 +
 tests/ci_profile_cache/update_cache.py        |   32 +-
 tests/unittest/backend/test_cuda_graph.py     |    7 +-
 tests/unittest/backend/test_model_api.py      |   26 +-
 tests/unittest/backend/test_profiler.py       |   39 +-
 .../unittest/benchmark/test_gemm_benchmark.py |   16 +-
 .../benchmark/test_group_gemm_benchmark.py    |   11 +-
 .../test_strided_layernorm_benchmark.py       |    6 +
 .../compiler/test_compilation_failure.py      |  101 +
 .../compiler/test_constant_folding.py         |  189 +-
 .../compiler/test_fuse_mm_elementwise.py      |  752 ++--
 .../compiler/test_fuse_mm_reshape_permute.py  |   44 +-
 .../compiler/test_fuse_permute_bmm.py         |  378 +-
 .../compiler/test_fuse_permute_gemm.py        |   74 +-
 ...st_fused_elementwise_complex_dependency.py |  363 +-
 .../test_fused_elementwise_out_of_order.py    |   35 +-
 tests/unittest/compiler/test_group_fusions.py |  398 +-
 .../unittest/compiler/test_memory_planning.py |   17 +-
 .../test_pad_bmm_rrr_bias_with_cat.py         |   66 +-
 .../compiler/test_pad_gemm_rrr_with_cat.py    |   55 +-
 .../compiler/test_pad_gemm_with_cat.py        |   50 +-
 .../test_pad_gemm_with_elementwise.py         |  134 +-
 .../compiler/test_parallel_gemm_fusions.py    |  210 +-
 .../compiler/test_permute_bmm_special_op.py   |   32 +-
 tests/unittest/compiler/test_refine_graph.py  |  121 +-
 .../compiler/test_remove_unused_ops.py        |   18 +-
 .../compiler/test_slice_elemwise_fusion.py    |  105 +-
 .../compiler/test_slice_gemm_fusion.py        |  170 +-
 .../compiler/test_slice_reshape_scatter.py    |   32 +-
 .../compiler/test_slice_scatter_pattern.py    |  163 +-
 .../compiler/test_slice_view_strided.py       |  345 +-
 .../compiler/test_split_bmm_fusion.py         |  114 +-
 .../compiler/test_split_bmm_softmax_bmm.py    |   12 +-
 .../compiler/test_split_large_concat.py       |  212 +-
 .../compiler/test_split_large_split.py        |  156 +
 .../compiler/test_split_view_strided.py       |   69 +-
 .../compiler/test_strided_group_gemm.py       |  113 +-
 .../compiler/test_strided_group_layernorm.py  |   52 +-
 .../compiler/test_strided_layernorm.py        |   54 +-
 .../test_strided_layernorm_reshape.py         |   40 +-
 .../compiler/test_strided_op_cat_pattern.py   |  960 ++++-
 .../compiler/test_strided_reshape_cat.py      |   98 +-
 .../unittest/compiler/test_strided_scatter.py |  239 +-
 .../compiler/test_strided_split_group_gemm.py |  147 +-
 .../compiler/test_strided_view_cat.py         |  212 +-
 .../unittest/compiler/test_strided_view_op.py |  203 +-
 .../compiler/test_transform_memory_ops.py     |   72 +-
 .../compiler/test_transform_special_op.py     |  106 +-
 .../compiler/test_transform_toposort.py       |   51 +
 .../unittest/compiler/test_view_strided_op.py |  250 +-
 tests/unittest/ops/test_activation.py         |  432 ++-
 tests/unittest/ops/test_argmax.py             |   45 +-
 tests/unittest/ops/test_attention.py          |  293 +-
 tests/unittest/ops/test_avg_pool2d.py         |   21 +-
 tests/unittest/ops/test_batch_gather.py       |   73 +-
 tests/unittest/ops/test_bert_embeddings.py    |  115 +-
 tests/unittest/ops/test_bmm.py                |  227 +-
 tests/unittest/ops/test_bmm_add.py            |  233 +-
 tests/unittest/ops/test_bmm_alpha.py          |   68 +-
 tests/unittest/ops/test_bmm_permute.py        |   54 +-
 tests/unittest/ops/test_bmm_rcr_n1.py         |   37 +-
 tests/unittest/ops/test_bmm_rrr_k1_tanh.py    |   22 +-
 tests/unittest/ops/test_bmm_softmax.py        |    9 +-
 tests/unittest/ops/test_chunk.py              |   62 +-
 tests/unittest/ops/test_clamp_nan_to_num.py   |  215 +-
 tests/unittest/ops/test_concatenate.py        |   88 +-
 tests/unittest/ops/test_concatenate_tanh.py   |  273 +-
 tests/unittest/ops/test_conv.py               |   56 +-
 tests/unittest/ops/test_conv2d_bias_add.py    |   72 +-
 tests/unittest/ops/test_conv3d.py             |  165 +-
 .../ops/test_conv3d_profiler_cache.py         |  201 ++
 tests/unittest/ops/test_conv_bias.py          |   65 +-
 .../ops/test_conv_bias_act_few_channels.py    |  143 +-
 .../ops/test_conv_bias_add_hardswish.py       |   74 +-
 tests/unittest/ops/test_conv_bias_add_relu.py |   75 +-
 .../unittest/ops/test_conv_bias_hardswish.py  |   65 +-
 tests/unittest/ops/test_conv_bias_relu.py     |   65 +-
 tests/unittest/ops/test_conv_bias_sigmoid.py  |   65 +-
 tests/unittest/ops/test_conv_depthwise.py     |    4 +-
 .../unittest/ops/test_conv_profiler_cache.py  |  190 +
 tests/unittest/ops/test_cross_attention.py    |   15 +-
 tests/unittest/ops/test_depthwise_conv3d.py   |   53 +-
 tests/unittest/ops/test_dual_bmm.py           |  231 ++
 tests/unittest/ops/test_dual_gemm.py          |  237 +-
 tests/unittest/ops/test_dynamic_conv.py       |   43 +-
 tests/unittest/ops/test_efficient_nms.py      |   80 +-
 tests/unittest/ops/test_expand.py             |  111 +-
 tests/unittest/ops/test_flatten.py            |   98 +-
 tests/unittest/ops/test_fpn_roi_align.py      |   79 +-
 tests/unittest/ops/test_fused_elementwise.py  |   74 +-
 .../ops/test_fused_elementwise_broadcast.py   |  355 +-
 ..._fused_elementwise_with_strided_outputs.py |  225 +-
 tests/unittest/ops/test_gather.py             |  161 +-
 tests/unittest/ops/test_gemm.py               |  246 +-
 tests/unittest/ops/test_gemm_bias.py          |   89 +-
 .../unittest/ops/test_gemm_bias_broadcast.py  |  121 +-
 .../unittest/ops/test_gemm_bias_hardswish.py  |   58 +-
 tests/unittest/ops/test_gemm_bias_permute.py  |  141 +-
 tests/unittest/ops/test_gemm_bias_relu.py     |   99 +-
 tests/unittest/ops/test_gemm_bias_sigmoid.py  |   60 +-
 tests/unittest/ops/test_gemm_bias_softmax.py  |   35 +-
 tests/unittest/ops/test_gemm_bias_swish.py    |   59 +-
 tests/unittest/ops/test_gemm_bias_tanh.py     |   68 +-
 tests/unittest/ops/test_gemm_permute.py       |   98 +-
 .../unittest/ops/test_gemm_profiler_cache.py  |  167 +
 .../ops/test_gemm_rcr_bias_fast_gelu.py       |   35 +-
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py |   29 +-
 tests/unittest/ops/test_gemm_rrr_small_nk.py  |   50 +-
 tests/unittest/ops/test_gemm_softmax.py       |   29 +-
 tests/unittest/ops/test_group_gemm_rcr.py     |   37 +-
 .../unittest/ops/test_group_gemm_rcr_bias.py  |   50 +-
 .../test_group_gemm_rcr_bias_activation.py    |   64 +-
 .../ops/test_group_gemm_rcr_bias_cat.py       |   50 +-
 tests/unittest/ops/test_group_gemm_rcr_cat.py |   40 +-
 tests/unittest/ops/test_groupnorm.py          |   50 +-
 .../test_int_elementwise_dynamic_reshape.py   |  175 +-
 tests/unittest/ops/test_layernorm.py          |   32 +-
 .../ops/test_layernorm_sigmoid_mul.py         |  459 ++-
 tests/unittest/ops/test_max_pool2d.py         |   17 +-
 tests/unittest/ops/test_nhwc3to4.py           |   16 +-
 tests/unittest/ops/test_nms.py                |  103 +-
 tests/unittest/ops/test_norm.py               |  127 +-
 tests/unittest/ops/test_pad_last_dim.py       |   99 +-
 tests/unittest/ops/test_perm021fc_ccr.py      |   50 +-
 tests/unittest/ops/test_perm021fc_ccr_bias.py |   57 +-
 .../ops/test_perm021fc_ccr_bias_perm021.py    |   59 +-
 tests/unittest/ops/test_perm021fc_crc.py      |   49 +-
 tests/unittest/ops/test_perm021fc_crc_bias.py |   58 +-
 tests/unittest/ops/test_perm102_bmm_rcr.py    |   54 +-
 tests/unittest/ops/test_perm102_bmm_rrr.py    |   48 +-
 tests/unittest/ops/test_permute.py            |   98 +-
 tests/unittest/ops/test_permute021.py         |   84 +-
 tests/unittest/ops/test_permute0213.py        |  104 +
 tests/unittest/ops/test_permute102.py         |   85 +-
 tests/unittest/ops/test_permute210.py         |  100 +-
 tests/unittest/ops/test_proposal.py           |   48 +-
 tests/unittest/ops/test_reduce.py             |  112 +-
 tests/unittest/ops/test_reshape.py            |   49 +-
 tests/unittest/ops/test_roi_align.py          |   73 +-
 tests/unittest/ops/test_size_getitem_ops.py   |   89 +-
 tests/unittest/ops/test_slice.py              |  184 +-
 tests/unittest/ops/test_softmax.py            |   48 +-
 tests/unittest/ops/test_split.py              |   96 +-
 tests/unittest/ops/test_split_getitem.py      |  146 +-
 tests/unittest/ops/test_squeeze.py            |   38 +-
 tests/unittest/ops/test_topk.py               |   52 +-
 tests/unittest/ops/test_transpose.py          |   98 +
 tests/unittest/ops/test_transpose_conv2d.py   |   64 +-
 .../ops/test_transpose_conv2d_bias.py         |   85 +-
 .../ops/test_transpose_conv2d_bias_relu.py    |   75 +-
 .../unittest/ops/test_tuple_list_construct.py |   35 +-
 tests/unittest/ops/test_upsamping2d.py        |   56 +-
 tests/unittest/ops/test_upsamping2d_add.py    |   97 +-
 tests/unittest/ops/test_vanilla_attention.py  |   14 +-
 tests/unittest/ops/test_var.py                |   89 +-
 tests/unittest/util/test_debug_utils.py       |   19 +-
 tests/unittest/util/test_serdes.py            |   12 +-
 570 files changed, 44943 insertions(+), 9816 deletions(-)
 create mode 160000 3rdparty/picojson
 create mode 100644 examples/04_vit/test_correctness.py
 create mode 100644 examples/05_stable_diffusion/scripts/compile.py
 create mode 100644 examples/05_stable_diffusion/scripts/demo.py
 create mode 100644 examples/05_stable_diffusion/scripts/demo_img2img.py
 create mode 100644 examples/05_stable_diffusion/scripts/download_pipeline.py
 create mode 100644 examples/05_stable_diffusion/src/__init__.py
 create mode 100644 examples/05_stable_diffusion/src/benchmark.py
 create mode 100644 examples/05_stable_diffusion/src/benchmark_pt.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/__init__.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_clip.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_unet.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_vae.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/util.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/attention.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/clip.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/embeddings.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/resnet.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/unet_blocks.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/vae.py
 create mode 100644 examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
 create mode 100644 examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
 create mode 100644 examples/05_stable_diffusion/src/test_correctness.py
 create mode 100644 fx2ait/CMakeLists.txt
 create mode 100644 fx2ait/README.md
 create mode 100644 fx2ait/fx2ait/TARGETS
 create mode 100644 fx2ait/fx2ait/__init__.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/__init__.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_normalizer.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_op_properties.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_ops.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_tracer.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/acc_utils.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
 create mode 100644 fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
 create mode 100644 fx2ait/fx2ait/ait_module.py
 create mode 100644 fx2ait/fx2ait/ait_splitter.py
 create mode 100644 fx2ait/fx2ait/cache.py
 create mode 100644 fx2ait/fx2ait/converters/__init__.py
 create mode 100644 fx2ait/fx2ait/converters/ait_converters.py
 create mode 100644 fx2ait/fx2ait/converters/ait_module_converters.py
 create mode 100644 fx2ait/fx2ait/converters/aten2ait_converters.py
 create mode 100644 fx2ait/fx2ait/converters/converter_registry.py
 create mode 100644 fx2ait/fx2ait/converters/utils.py
 create mode 100644 fx2ait/fx2ait/csrc/AITModel.cpp
 create mode 100644 fx2ait/fx2ait/csrc/AITModel.h
 create mode 100644 fx2ait/fx2ait/csrc/AITModelImpl.cpp
 create mode 100644 fx2ait/fx2ait/csrc/AITModelImpl.h
 create mode 100644 fx2ait/fx2ait/csrc/TARGETS
 create mode 100644 fx2ait/fx2ait/example/01_transformer_model/README.md
 create mode 100644 fx2ait/fx2ait/example/01_transformer_model/test_transformer_encoder.py
 create mode 100644 fx2ait/fx2ait/example/02_vision_model/README.md
 create mode 100644 fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
 create mode 100644 fx2ait/fx2ait/example/03_lowering_split/README.md
 create mode 100644 fx2ait/fx2ait/example/03_lowering_split/test_lower.py
 create mode 100644 fx2ait/fx2ait/example/benchmark_utils.py
 create mode 100644 fx2ait/fx2ait/fx2ait.py
 create mode 100644 fx2ait/fx2ait/lower/lower.py
 create mode 100644 fx2ait/fx2ait/lower/lower_settings.py
 create mode 100644 fx2ait/fx2ait/passes/lower_basic_pass_aten.py
 create mode 100644 fx2ait/fx2ait/tensor_spec.py
 create mode 100644 fx2ait/fx2ait/test/TARGETS
 create mode 100644 fx2ait/fx2ait/test/__init__.py
 create mode 100644 fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
 create mode 100644 fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
 create mode 100644 fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_binary_op.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_chunk.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_clamp.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_common.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_contiguous.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_conv2d.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_expand.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_flatten.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_gelu.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_linear.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_matmul.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_nan2num.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_pow.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_reduce.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_softmax.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_squeeze.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_tile.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_var.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
 create mode 100644 fx2ait/fx2ait/test/test_ait_lower.py
 create mode 100644 fx2ait/fx2ait/test/test_fx2ait.py
 create mode 100644 fx2ait/fx2ait/test/test_tensor_spec.py
 create mode 100644 fx2ait/fx2ait/tools/ait_minimizer.py
 create mode 100644 fx2ait/fx2ait/tools/common_aten2ait.py
 create mode 100644 fx2ait/fx2ait/tools/common_fx2ait.py
 create mode 100644 fx2ait/fx2ait/utils.py
 create mode 100644 fx2ait/setup.py
 create mode 100644 python/aitemplate/backend/common/tensor/permute0213_common.py
 create mode 100644 python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
 create mode 100644 python/aitemplate/backend/cuda/tensor/concatenate_fast.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/permute0213.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/permute0213.py
 create mode 100644 python/aitemplate/compiler/ops/conv/conv_common.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/permute0213.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/transpose.py
 create mode 100644 python/aitemplate/compiler/transform/split_large_split_ops.py
 create mode 100644 python/aitemplate/frontend/nn/conv3d.py
 create mode 100644 python/aitemplate/testing/benchmark_trt.py
 create mode 100644 python/aitemplate/utils/debug_settings.py
 create mode 100644 python/aitemplate/utils/misc.py
 create mode 100644 static/include/model.h
 create mode 100644 tests/unittest/compiler/test_compilation_failure.py
 create mode 100644 tests/unittest/compiler/test_split_large_split.py
 create mode 100644 tests/unittest/compiler/test_transform_toposort.py
 create mode 100644 tests/unittest/ops/test_conv3d_profiler_cache.py
 create mode 100644 tests/unittest/ops/test_conv_profiler_cache.py
 create mode 100644 tests/unittest/ops/test_dual_bmm.py
 create mode 100644 tests/unittest/ops/test_gemm_profiler_cache.py
 create mode 100644 tests/unittest/ops/test_permute0213.py
 create mode 100644 tests/unittest/ops/test_transpose.py

diff --git a/.gitmodules b/.gitmodules
index a82a39064..e439953e9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,3 +8,6 @@
 	path = 3rdparty/composable_kernel
 	url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git
 	branch = develop
+[submodule "3rdparty/picojson"]
+	path = 3rdparty/picojson
+	url = https://github.com/kazuho/picojson.git
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index f434be22a..5d7be1ac1 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit f434be22a6270f9f000712286f92545ccca045b7
+Subproject commit 5d7be1ac1b0dae1e9b8ccbe98d494ccaa437ddc0
diff --git a/3rdparty/picojson b/3rdparty/picojson
new file mode 160000
index 000000000..111c9be51
--- /dev/null
+++ b/3rdparty/picojson
@@ -0,0 +1 @@
+Subproject commit 111c9be5188f7350c2eac9ddaedd8cca3d7bf394
diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 1342becf6..4c7cf2eb9 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -9,6 +9,8 @@ Codegen
 
 **RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile.
 
+**AIT_NDEBUG**: If set to "1", compile with `NDEBUG`, disabling debug assertions. Recommended for production builds. "1" by default.
+
 Profiling
 ---------
 
diff --git a/examples/04_vit/test_correctness.py b/examples/04_vit/test_correctness.py
new file mode 100644
index 000000000..745ab2d8e
--- /dev/null
+++ b/examples/04_vit/test_correctness.py
@@ -0,0 +1,184 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import io
+import unittest
+
+import numpy as np
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.compiler.base import Tensor
+
+from aitemplate.testing import detect_target
+
+try:
+    from libfb.py.asyncio.await_utils import await_sync
+    from manifold.clients.python import ManifoldClient
+except ImportError:
+    ManifoldClient = None
+
+from parameterized import parameterized
+
+from timm.models.vision_transformer import vit_base_patch16_224, vit_large_patch16_384
+
+from .modeling.vision_transformer import VisionTransformer
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+def compile_vit(
+    batch_size=128,
+    img_size=224,
+    patch_size=16,
+    embed_dim=768,
+    num_heads=12,
+    depth=12,
+    class_token=True,
+    global_pool="token",
+    use_fp16_acc=True,
+):
+    seqlen = (img_size // patch_size) ** 2 + (1 if class_token else 0)
+    ait_model = VisionTransformer(
+        batch_size=batch_size,
+        img_size=img_size,
+        class_token=class_token,
+        global_pool=global_pool,
+        num_heads=num_heads,
+        embed_dim=embed_dim,
+        patch_size=patch_size,
+        depth=depth,
+        act_layer="GELU",
+    )
+    ait_model.name_parameter_tensor()
+    inputs_ait = Tensor(
+        [batch_size, img_size, img_size, 3], name="input0", is_input=True
+    )
+    Y = ait_model(inputs_ait)
+    mark_output(Y)
+
+    target = detect_target(use_fp16_acc=use_fp16_acc)
+    exe_module = compile_model(
+        Y, target, "./tmp", "vision_transformer_bs%d_seq%d" % (batch_size, seqlen)
+    )
+    return exe_module
+
+
+class VITVerification(unittest.TestCase):
+    @parameterized.expand(["vit_base_patch16_224", "vit_large_patch16_384"])
+    def test_vit(self, model_name):
+        if model_name == "vit_base_patch16_224":
+            img_size = 224
+            depth = 12
+            embed_dim = 768
+            num_heads = 12
+            global_pool = "token"
+            vit_pt_def = vit_base_patch16_224
+            path = "tree/vit-pt/vit_base_patch16_224.pt"
+
+        elif model_name == "vit_large_patch16_384":
+            img_size = 384
+            depth = 24
+            embed_dim = 1024
+            num_heads = 16
+            vit_pt_def = vit_large_patch16_384
+            path = "tree/vit-pt/vit_large_patch16_384.pt"
+        if ManifoldClient is None:
+            vit_pt = vit_pt_def(pretrained=True)
+        else:
+            stream = io.BytesIO()
+            with ManifoldClient.get_client(bucket="aitemplate") as client:
+                await_sync(
+                    client.get(
+                        path,
+                        stream,
+                    )
+                )
+            stream.seek(0)
+            vit_pt = vit_pt_def(pretrained=False)
+            vit_pt.load_state_dict(torch.load(stream))
+        global_pool = "token"
+        patch_size = 16
+        vit_pt = vit_pt.cuda().half()
+        batch_size = 1
+        vit_ait = compile_vit(
+            batch_size=batch_size,
+            img_size=img_size,
+            patch_size=patch_size,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            depth=depth,
+            class_token=True,
+            global_pool=global_pool,
+            use_fp16_acc=False,
+        )
+        nc = 3
+        seqlen = (img_size // patch_size) ** 2 + 1
+
+        # prepare params
+        params_pt = vit_pt.named_parameters()
+        params_ait = {}
+        for key, arr in params_pt:
+            ait_key = key.replace(".", "_")
+            if len(arr.shape) == 4:
+                arr = arr.permute((0, 2, 3, 1)).contiguous()
+                if detect_target().name() == "cuda":
+                    conv0_w_pad = (
+                        torch.zeros((embed_dim, patch_size, patch_size, 4))
+                        .cuda()
+                        .half()
+                    )
+                    conv0_w_pad[:, :, :, :3] = arr
+                    arr = conv0_w_pad
+            params_ait[f"{ait_key}"] = arr
+        params_ait["cls_token_mask"] = (
+            torch.zeros((batch_size, 1, embed_dim)).cuda().half()
+        )
+        if detect_target().name() == "cuda":
+            ait_key = "attn_cu_length"
+            for i in range(depth):
+                prefix = "blocks_%d" % (i)
+                cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
+                params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda()
+
+        # set weights
+        for name, weight in params_ait.items():
+            vit_ait.set_constant_with_tensor(name, weight)
+
+        with torch.no_grad():
+            x_pt = (
+                torch.rand(
+                    (batch_size, nc, img_size, img_size),
+                    dtype=torch.float16,
+                    device="cuda",
+                )
+                * 255
+            )
+            x_ait = x_pt.permute(0, 2, 3, 1).contiguous()
+            y_pt = vit_pt(x_pt).reshape(batch_size, 1, -1)
+            y_ait = torch.empty_like(y_pt)
+            vit_ait.run_with_tensors([x_ait], [y_ait])
+            torch.testing.assert_close(y_ait, y_pt, atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index 1f62403de..18700540a 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -6,34 +6,37 @@ In this example, we show how to build fast AIT modules for CLIP, UNet, VAE model
 
 First, clone, build, and install AITemplate [per the README instructions](https://github.com/facebookincubator/AITemplate#clone-the-code).
 
-This AIT stable diffusion example depends on `diffusers`, `transformers`, `torch` and `click`.
+This AIT stable diffusion example depends on `diffusers`, `transformers`, `torch` and `click`. You could install them using `pip`.
 
-Verify the library versions. We have tested transformers 4.21/4.22/4.23, diffusers 0.3/0.4 and torch 1.11/1.12.
+Verify the library versions. We have tested transformers==4.25, diffusers==0.11[torch] and torch==1.12.
 
 ```
 >>> import transformers
 >>> transformers.__version__
-'4.21.2'
+'4.25.0'
 >>> import diffusers
 >>> diffusers.__version__
-'0.3.0'
+'0.11.0'
+>>> import torch
 >>> torch.__version__
-'1.12.1+cu116'
+'1.12.0+cu113'
 ```
 
-### Build AIT modules for CLIP, UNet, VAE
-
-Build the AIT modules by running `compile.py`. You must first register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens) for more info. Your access tokens are listed in your [Hugging Face account settings](https://huggingface.co/settings/tokens).
+### Download the diffusers pipeline files
+You must first register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens) for more info. Your access tokens are listed in your [Hugging Face account settings](https://huggingface.co/settings/tokens).
 
 ```
-python3 examples/05_stable_diffusion/compile.py --token ACCESS_TOKEN
+python3 scripts/download_pipeline.py --token ACCESS_TOKEN
 ```
-It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
 
-Compile the img2img models:
+### Build AIT modules for CLIP, UNet, VAE
+
+Build the AIT modules by running `compile.py`.
+
 ```
-python3 examples/05_stable_diffusion/compile.py --img2img True --token ACCESS_TOKEN
+python3 scripts/compile.py
 ```
+It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
 
 #### Multi-GPU profiling
 AIT needs to do profiling to select the best algorithms for CUTLASS and CK.
@@ -41,10 +44,18 @@ To enable multiple GPUs for profiling, use the environment variable `CUDA_VISIBL
 
 ### Benchmark
 
-This step is optional. You can run `benchmark.py` with the access token to initialize the weights and benchmark.
+This step is optional. You can run `benchmark.py` to measure throughput for each of the subnets.
+
+```
+python3 src/benchmark.py
+```
+
+### Verify
+
+This step is optional. You can verify numerical correctness for each of the subnets.
 
 ```
-python3 examples/05_stable_diffusion/benchmark.py --token ACCESS_TOKEN
+HUGGINGFACE_AUTH_TOKEN=ACCESS_TOKEN python3 -m unittest src/test_correctness.py
 ```
 
 ### Run Models
@@ -52,13 +63,13 @@ python3 examples/05_stable_diffusion/benchmark.py --token ACCESS_TOKEN
 Run AIT models with an example image:
 
 ```
-python3 examples/05_stable_diffusion/demo.py --token ACCESS_TOKEN
+python3 scripts/demo.py
 ```
 
 Img2img demo:
 
 ```
-python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN
+python3 scripts/demo_img2img.py
 ```
 
 Check the resulted image: `example_ait.png`
@@ -66,15 +77,15 @@ Check the resulted image: `example_ait.png`
 
 ### Sample outputs
 
-Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "Mountain Rainier in van Gogh's world"`
+Command: `python3 scripts/demo.py --prompt "Mountain Rainier in van Gogh's world"`
 
 ![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_rainier.png)
 
-Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "Sitting in a tea house in Japan with Mount Fuji in the background, sunset professional portrait, Nikon 85mm f/1.4G"`
+Command: `python3 scripts/demo.py --prompt "Sitting in a tea house in Japan with Mount Fuji in the background, sunset professional portrait, Nikon 85mm f/1.4G"`
 
 ![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_fuji.png)
 
-Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "A lot of wild flowers with North Cascade Mountain in background, sunset professional photo, Unreal Engine"`
+Command: `scripts/demo.py --prompt "A lot of wild flowers with North Cascade Mountain in background, sunset professional photo, Unreal Engine"`
 
 ![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_cascade2.png)
 
diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
new file mode 100644
index 000000000..8c7a5be98
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -0,0 +1,89 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+
+import click
+
+import torch
+
+from aitemplate.testing import detect_target
+
+from diffusers import StableDiffusionPipeline
+
+from src.compile_lib.compile_clip import compile_clip
+from src.compile_lib.compile_unet import compile_unet
+from src.compile_lib.compile_vae import compile_vae
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
+@click.option("--batch-size", default=1, help="batch size")
+@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
+@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
+def compile_diffusers(
+    local_dir, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
+):
+    logging.getLogger().setLevel(logging.INFO)
+    torch.manual_seed(4896)
+
+    if detect_target().name() == "rocm":
+        convert_conv_to_gemm = False
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        local_dir,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    ).to("cuda")
+
+    ww = width // 8
+    hh = height // 8
+
+    # CLIP
+    compile_clip(
+        pipe.text_encoder,
+        batch_size=batch_size,
+        dim=1024,
+        num_heads=16,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+    )
+    # UNet
+    compile_unet(
+        pipe.unet,
+        batch_size=batch_size * 2,
+        width=ww,
+        height=hh,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+    )
+    # VAE
+    compile_vae(
+        pipe.vae,
+        batch_size=batch_size,
+        width=ww,
+        height=hh,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+    )
+
+
+if __name__ == "__main__":
+    compile_diffusers()
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
new file mode 100644
index 000000000..77d58cde2
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -0,0 +1,55 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import click
+import torch
+
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from diffusers import EulerDiscreteScheduler
+from src.pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
+@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option(
+    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
+)
+def run(local_dir, width, height, prompt, benchmark):
+    pipe = StableDiffusionAITPipeline.from_pretrained(
+        local_dir,
+        scheduler=EulerDiscreteScheduler.from_pretrained(
+            local_dir, subfolder="scheduler"
+        ),
+        revision="fp16",
+        torch_dtype=torch.float16,
+    ).to("cuda")
+
+    with torch.autocast("cuda"):
+        image = pipe(prompt, height, width).images[0]
+        if benchmark:
+            t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
+            print(f"sd e2e: {t} ms")
+
+    image.save("example_ait.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
new file mode 100644
index 000000000..46c53cfd9
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -0,0 +1,72 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from io import BytesIO
+
+import click
+import requests
+import torch
+
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from PIL import Image
+from src.pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
+@click.option(
+    "--prompt", default="A fantasy landscape, trending on artstation", help="prompt"
+)
+@click.option(
+    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
+)
+def run(local_dir, width, height, prompt, benchmark):
+
+    # load the pipeline
+    device = "cuda"
+    pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
+        local_dir,
+        revision="fp16",
+        torch_dtype=torch.float16,
+        safety_checker=None,
+        feature_extractor=None,
+    )
+    pipe = pipe.to(device)
+    # let's download an initial image
+    url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+    response = requests.get(url)
+    init_image = Image.open(BytesIO(response.content)).convert("RGB")
+    init_image = init_image.resize((height, width))
+
+    with torch.autocast("cuda"):
+        images = pipe(
+            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5
+        ).images
+        if benchmark:
+            args = (prompt, init_image)
+            t = benchmark_torch_function(10, pipe, *args)
+            print(f"sd e2e: {t} ms")
+
+    images[0].save("fantasy_landscape_ait.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
new file mode 100644
index 000000000..e5ffe56f0
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -0,0 +1,38 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import click
+import torch
+from diffusers import StableDiffusionPipeline
+
+
+@click.command()
+@click.option("--token", default="", help="access token")
+@click.option(
+    "--save_directory",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="pipeline files local directory",
+)
+def download_pipeline_files(token, save_directory) -> None:
+    StableDiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-2",
+        revision="fp16",
+        torch_dtype=torch.float16,
+        # use provided token or the one generated with `huggingface-cli login``
+        use_auth_token=token if token != "" else True,
+    ).save_pretrained(save_directory)
+
+
+if __name__ == "__main__":
+    download_pipeline_files()
diff --git a/examples/05_stable_diffusion/src/__init__.py b/examples/05_stable_diffusion/src/__init__.py
new file mode 100644
index 000000000..5cf1a826f
--- /dev/null
+++ b/examples/05_stable_diffusion/src/__init__.py
@@ -0,0 +1,14 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
new file mode 100644
index 000000000..5a99b1f48
--- /dev/null
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -0,0 +1,309 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+
+import click
+
+import numpy as np
+import torch
+from aitemplate.compiler import Model
+from aitemplate.testing import detect_target
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from diffusers import StableDiffusionPipeline
+
+from torch import autocast
+from transformers import CLIPTokenizer
+
+USE_CUDA = detect_target().name() == "cuda"
+
+
+def get_int_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("AIT output_{} shape: {}".format(i, y_shape))
+
+
+def benchmark_unet(
+    pt_mod,
+    batch_size=2,
+    height=64,
+    width=64,
+    dim=320,
+    hidden_dim=1024,
+    benchmark_pt=False,
+    verify=False,
+):
+
+    exe_module = Model("./tmp/UNet2DConditionModel/test.so")
+    if exe_module is None:
+        print("Error!! Cannot find compiled module for UNet2DConditionModel.")
+        exit(-1)
+
+    # run PT unet model
+    pt_mod = pt_mod.eval()
+
+    latent_model_input_pt = torch.randn(batch_size, 4, height, width).cuda().half()
+    text_embeddings_pt = torch.randn(batch_size, 64, hidden_dim).cuda().half()
+    timesteps_pt = torch.Tensor([1, 1]).cuda().half()
+
+    with autocast("cuda"):
+        pt_ys = pt_mod(
+            latent_model_input_pt,
+            timesteps_pt,
+            encoder_hidden_states=text_embeddings_pt,
+        ).sample
+
+        # PT benchmark
+        if benchmark_pt:
+            args = (latent_model_input_pt, 1, text_embeddings_pt)
+            pt_time = benchmark_torch_function(100, pt_mod, *args)
+            print(f"PT batch_size: {batch_size}, {pt_time} ms")
+            with open("sd_pt_benchmark.txt", "a") as f:
+                f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n")
+
+    print("pt output:", pt_ys.shape)
+
+    # run AIT unet model
+    inputs = {
+        "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(),
+        "input1": timesteps_pt,
+        "input2": text_embeddings_pt,
+    }
+
+    ys = []
+    num_ouputs = len(exe_module.get_output_name_to_index_map())
+    for i in range(num_ouputs):
+        shape = exe_module.get_output_maximum_shape(i)
+        ys.append(torch.empty(shape).cuda().half())
+    exe_module.run_with_tensors(inputs, ys)
+
+    # verification
+    y_transpose = ys[0].permute((0, 3, 1, 2))
+
+    if verify:
+        eps = 1e-1
+        np.testing.assert_allclose(
+            pt_ys.detach().cpu().numpy(),
+            y_transpose.cpu().numpy(),
+            atol=eps,
+            rtol=eps,
+        )
+        print("UNet2DCondition verification pass")
+
+    # AIT benchmark
+    # warmup
+    exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
+    # benchmark
+    t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
+    with open("sd_ait_benchmark.txt", "a") as f:
+        f.write(f"unet batch_size: {batch_size}, latency: {t} ms\n")
+
+
+def benchmark_clip(
+    pt_mod,
+    batch_size=1,
+    seqlen=64,
+    tokenizer=None,
+    benchmark_pt=False,
+    verify=False,
+):
+    mask_seq = 0
+
+    exe_module = Model("./tmp/CLIPTextModel/test.so")
+    if exe_module is None:
+        print("Error!! Cannot find compiled module for CLIPTextModel.")
+        exit(-1)
+
+    # run PT clip
+    pt_mod = pt_mod.eval()
+
+    if tokenizer is None:
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_input = tokenizer(
+        ["a photo of an astronaut riding a horse on mars"],
+        padding="max_length",
+        max_length=seqlen,
+        truncation=True,
+        return_tensors="pt",
+    )
+    input_ids = text_input["input_ids"].cuda()
+
+    attention_mask = torch.ones((batch_size, seqlen))
+    attention_mask[-1, -mask_seq:] = 0
+    attention_mask = None
+
+    position_ids = torch.arange(seqlen).expand((batch_size, -1)).cuda()
+    pt_ys = pt_mod(input_ids, attention_mask, position_ids)
+    print("pt output:", pt_ys[0].shape)
+
+    # PT benchmark
+    if benchmark_pt:
+        args = (input_ids, attention_mask, position_ids)
+        pt_time = benchmark_torch_function(100, pt_mod, *args)
+        print(f"PT batch_size: {batch_size}, {pt_time} ms")
+        with open("sd_pt_benchmark.txt", "a") as f:
+            f.write(f"clip batch_size: {batch_size}, latency: {pt_time} ms\n")
+
+    # run AIT clip
+    inputs = {
+        "input0": input_ids,
+        "input1": position_ids,
+    }
+    ys = []
+    num_ouputs = len(exe_module.get_output_name_to_index_map())
+    for i in range(num_ouputs):
+        shape = exe_module.get_output_maximum_shape(i)
+        ys.append(torch.empty(shape).cuda().half())
+    exe_module.run_with_tensors(inputs, ys)
+
+    # verification
+    if verify:
+        eps = 1e-1
+        pt_np = pt_ys[0].detach().cpu().numpy()
+        np.testing.assert_allclose(
+            pt_np,
+            ys[0].cpu().numpy(),
+            atol=eps,
+            rtol=eps,
+        )
+        print("CLIPTextTransformer verification pass")
+
+    # AIT benchmark
+    # warmup
+    exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
+    # benchmark
+    t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
+    with open("sd_ait_benchmark.txt", "a") as f:
+        f.write(f"clip batch_size: {batch_size}, latency: {t} ms\n")
+
+
+def benchmark_vae(
+    pt_vae, batch_size=1, height=64, width=64, benchmark_pt=False, verify=False
+):
+
+    latent_channels = 4
+
+    exe_module = Model("./tmp/AutoencoderKL/test.so")
+    if exe_module is None:
+        print("Error!! Cannot find compiled module for AutoencoderKL.")
+        exit(-1)
+
+    # run PT vae
+    pt_vae = pt_vae.cuda().half()
+    pt_vae.eval()
+
+    pt_input = torch.rand([batch_size, latent_channels, height, width]).cuda().half()
+    print("pt_input shape", pt_input.shape)
+    with autocast("cuda"):
+        pt_output = pt_vae.decode(pt_input).sample
+        pt_output = pt_output.half()
+
+        # PT benchmark
+        if benchmark_pt:
+            args = (pt_input,)
+            pt_time = benchmark_torch_function(100, pt_vae.decode, *args)
+            print(f"PT batch_size: {batch_size}, {pt_time} ms")
+            with open("sd_pt_benchmark.txt", "a") as f:
+                f.write(f"vae batch_size: {batch_size}, latency: {pt_time} ms\n")
+
+    # run AIT vae
+    y = (
+        torch.empty(
+            pt_output.size(0),
+            pt_output.size(2),
+            pt_output.size(3),
+            pt_output.size(1),
+        )
+        .cuda()
+        .half()
+    )
+    ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous()
+    print("input pt tensor size: ", ait_input_pt_tensor.shape)
+    print("output pt tensor size: ", y.shape)
+    exe_module.run_with_tensors([ait_input_pt_tensor], [y])
+
+    # verification
+    if verify:
+        y_pt = torch.permute(y, (0, 3, 1, 2))
+        eps = 1e-1
+        np.testing.assert_allclose(
+            pt_output.detach().cpu().numpy(),
+            y_pt.cpu().numpy(),
+            atol=eps,
+            rtol=eps,
+        )
+        logging.info("VAE Verification done!")
+
+    # AIT benchmark:
+    # warmup
+    exe_module.benchmark_with_tensors([ait_input_pt_tensor], [y], count=100, repeat=4)
+    # benchmark
+    t, _, _ = exe_module.benchmark_with_tensors(
+        [ait_input_pt_tensor], [y], count=100, repeat=4
+    )
+    with open("sd_ait_benchmark.txt", "a") as f:
+        f.write(f"vae batch_size: {batch_size}, latency: {t} ms\n")
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--batch-size", default=1, help="batch size")
+@click.option("--verify", type=bool, default=False, help="verify correctness")
+@click.option("--benchmark-pt", type=bool, default=False, help="run pt benchmark")
+def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
+    assert batch_size == 1, "batch size must be 1 for submodule verification"
+    logging.getLogger().setLevel(logging.INFO)
+    np.random.seed(0)
+    torch.manual_seed(4896)
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        local_dir,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    ).to("cuda")
+
+    # CLIP
+    benchmark_clip(
+        pipe.text_encoder,
+        batch_size=batch_size,
+        benchmark_pt=benchmark_pt,
+        verify=verify,
+    )
+    # UNet
+    benchmark_unet(
+        pipe.unet, batch_size=batch_size * 2, benchmark_pt=benchmark_pt, verify=verify
+    )
+    # VAE
+    benchmark_vae(
+        pipe.vae, batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify
+    )
+
+
+if __name__ == "__main__":
+    benchmark_diffusers()
diff --git a/examples/05_stable_diffusion/src/benchmark_pt.py b/examples/05_stable_diffusion/src/benchmark_pt.py
new file mode 100644
index 000000000..95bfb725f
--- /dev/null
+++ b/examples/05_stable_diffusion/src/benchmark_pt.py
@@ -0,0 +1,50 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import click
+import torch
+
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from diffusers import StableDiffusionPipeline
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option(
+    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
+)
+def run(local_dir, prompt, benchmark):
+    pipe = StableDiffusionPipeline.from_pretrained(
+        local_dir,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    ).to("cuda")
+
+    with torch.autocast("cuda"):
+        image = pipe(prompt).images[0]
+        if benchmark:
+            t = benchmark_torch_function(10, pipe, prompt)
+            print(f"sd pt e2e: {t} ms")
+
+    image.save("example_pt.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/src/compile_lib/__init__.py b/examples/05_stable_diffusion/src/compile_lib/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
new file mode 100644
index 000000000..5cc57077f
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -0,0 +1,118 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import numpy as np
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
+from .util import mark_output
+
+
+def map_clip_params(pt_mod, batch_size, seqlen, depth):
+
+    params_pt = list(pt_mod.named_parameters())
+
+    params_ait = {}
+    pt_params = {}
+    for key, arr in params_pt:
+        pt_params[key.replace("text_model.", "")] = arr
+
+    pt_params = dict(pt_mod.named_parameters())
+    for key, arr in pt_params.items():
+        name = key.replace("text_model.", "")
+        ait_name = name.replace(".", "_")
+        if name.endswith("out_proj.weight"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif name.endswith("out_proj.bias"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif name.endswith("q_proj.weight"):
+            ait_name = ait_name.replace("q_proj", "qkv")
+            prefix = key[: -len("q_proj.weight")]
+            q = pt_params[prefix + "q_proj.weight"]
+            k = pt_params[prefix + "k_proj.weight"]
+            v = pt_params[prefix + "v_proj.weight"]
+            qkv_weight = torch.cat([q, k, v], dim=0)
+            params_ait[ait_name] = qkv_weight
+            continue
+        elif name.endswith("q_proj.bias"):
+            ait_name = ait_name.replace("q_proj", "qkv")
+            prefix = key[: -len("q_proj.bias")]
+            q = pt_params[prefix + "q_proj.bias"]
+            k = pt_params[prefix + "k_proj.bias"]
+            v = pt_params[prefix + "v_proj.bias"]
+            qkv_bias = torch.cat([q, k, v], dim=0)
+            params_ait[ait_name] = qkv_bias
+            continue
+        elif name.endswith("k_proj.weight"):
+            continue
+        elif name.endswith("k_proj.bias"):
+            continue
+        elif name.endswith("v_proj.weight"):
+            continue
+        elif name.endswith("v_proj.bias"):
+            continue
+        params_ait[ait_name] = arr
+
+        if detect_target().name() == "cuda":
+            for i in range(depth):
+                prefix = "encoder_layers_%d_self_attn_cu_length" % (i)
+                cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
+                params_ait[prefix] = torch.from_numpy(cu_len).cuda()
+
+    return params_ait
+
+
+def compile_clip(
+    pt_mod,
+    batch_size=1,
+    seqlen=64,
+    dim=768,
+    num_heads=12,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+):
+    mask_seq = 0
+    causal = True
+    depth = 23
+
+    ait_mod = ait_CLIPTextTransformer(
+        num_hidden_layers=depth,
+        hidden_size=dim,
+        num_attention_heads=num_heads,
+        batch_size=batch_size,
+        seq_len=seqlen,
+        causal=causal,
+        mask_seq=mask_seq,
+    )
+    ait_mod.name_parameter_tensor()
+
+    pt_mod = pt_mod.eval()
+    params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
+
+    input_ids_ait = Tensor(
+        [batch_size, seqlen], name="input0", dtype="int64", is_input=True
+    )
+    position_ids_ait = Tensor(
+        [batch_size, seqlen], name="input1", dtype="int64", is_input=True
+    )
+    Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait)
+    mark_output(Y)
+
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(Y, target, "./tmp", "CLIPTextModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
new file mode 100644
index 000000000..f1f4acab8
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -0,0 +1,87 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.unet_2d_condition import (
+    UNet2DConditionModel as ait_UNet2DConditionModel,
+)
+from .util import mark_output
+
+
+def map_unet_params(pt_mod, dim):
+    pt_params = dict(pt_mod.named_parameters())
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+
+    params_ait["arange"] = (
+        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+def compile_unet(
+    pt_mod,
+    batch_size=2,
+    height=64,
+    width=64,
+    dim=320,
+    hidden_dim=1024,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+):
+
+    ait_mod = ait_UNet2DConditionModel(
+        sample_size=64,
+        cross_attention_dim=hidden_dim,
+        attention_head_dim=[5, 10, 20, 20],
+    )
+    ait_mod.name_parameter_tensor()
+
+    # set AIT parameters
+    pt_mod = pt_mod.eval()
+    params_ait = map_unet_params(pt_mod, dim)
+
+    latent_model_input_ait = Tensor(
+        [batch_size, height, width, 4], name="input0", is_input=True
+    )
+    timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
+    text_embeddings_pt_ait = Tensor(
+        [batch_size, 64, hidden_dim], name="input2", is_input=True
+    )
+
+    Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
+    mark_output(Y)
+
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
new file mode 100644
index 000000000..d01f320dc
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -0,0 +1,140 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from collections import OrderedDict
+
+import numpy as np
+
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
+from .util import mark_output
+
+
+def map_vae_params(ait_module, pt_module, batch_size, seq_len):
+    pt_params = dict(pt_module.named_parameters())
+    mapped_pt_params = OrderedDict()
+    for name, _ in ait_module.named_parameters():
+        ait_name = name.replace(".", "_")
+        if name in pt_params:
+            if (
+                "conv" in name
+                and "norm" not in name
+                and name.endswith(".weight")
+                and len(pt_params[name].shape) == 4
+            ):
+                mapped_pt_params[ait_name] = torch.permute(
+                    pt_params[name], [0, 2, 3, 1]
+                ).contiguous()
+            else:
+                mapped_pt_params[ait_name] = pt_params[name]
+        elif name.endswith("attention.qkv.weight"):
+            prefix = name[: -len("attention.qkv.weight")]
+            q_weight = pt_params[prefix + "query.weight"]
+            k_weight = pt_params[prefix + "key.weight"]
+            v_weight = pt_params[prefix + "value.weight"]
+            qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
+            mapped_pt_params[ait_name] = qkv_weight
+        elif name.endswith("attention.qkv.bias"):
+            prefix = name[: -len("attention.qkv.bias")]
+            q_bias = pt_params[prefix + "query.bias"]
+            k_bias = pt_params[prefix + "key.bias"]
+            v_bias = pt_params[prefix + "value.bias"]
+            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
+            mapped_pt_params[ait_name] = qkv_bias
+        elif name.endswith("attention.proj.weight"):
+            prefix = name[: -len("attention.proj.weight")]
+            pt_name = prefix + "proj_attn.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj.bias"):
+            prefix = name[: -len("attention.proj.bias")]
+            pt_name = prefix + "proj_attn.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.cu_length"):
+            cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32")
+            mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda()
+        else:
+            pt_param = pt_module.get_parameter(name)
+            mapped_pt_params[ait_name] = pt_param
+
+    return mapped_pt_params
+
+
+def compile_vae(
+    pt_mod,
+    batch_size=1,
+    height=64,
+    width=64,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+):
+    in_channels = 3
+    out_channels = 3
+    down_block_types = [
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+    ]
+    up_block_types = [
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+    ]
+    block_out_channels = [128, 256, 512, 512]
+    layers_per_block = 2
+    act_fn = "silu"
+    latent_channels = 4
+    sample_size = 512
+
+    ait_vae = ait_AutoencoderKL(
+        batch_size,
+        height,
+        width,
+        in_channels=in_channels,
+        out_channels=out_channels,
+        down_block_types=down_block_types,
+        up_block_types=up_block_types,
+        block_out_channels=block_out_channels,
+        layers_per_block=layers_per_block,
+        act_fn=act_fn,
+        latent_channels=latent_channels,
+        sample_size=sample_size,
+    )
+    ait_input = Tensor(
+        shape=[batch_size, height, width, latent_channels],
+        name="vae_input",
+        is_input=True,
+    )
+    ait_vae.name_parameter_tensor()
+
+    pt_mod = pt_mod.eval()
+    params_ait = map_vae_params(ait_vae, pt_mod, batch_size, height * width)
+
+    Y = ait_vae.decode(ait_input)
+    mark_output(Y)
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(
+        Y,
+        target,
+        "./tmp",
+        "AutoencoderKL",
+        constants=params_ait,
+    )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
new file mode 100644
index 000000000..000e862e9
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -0,0 +1,22 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("AIT output_{} shape: {}".format(i, y_shape))
diff --git a/examples/05_stable_diffusion/src/modeling/attention.py b/examples/05_stable_diffusion/src/modeling/attention.py
new file mode 100644
index 000000000..14993e6d9
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/attention.py
@@ -0,0 +1,105 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
+"""
+
+from typing import Optional
+
+from aitemplate.compiler.ops import reshape
+
+from aitemplate.frontend import nn, Tensor
+
+
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
+    to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    Uses three q, k, v linear layers to compute attention.
+    Parameters:
+        batch_size (:obj:`int`): The number of examples per batch.
+        height (:obj:`int`): Height of each image example.
+        width (:obj:`int`): Width of each image example.
+        channels (:obj:`int`): The number of channels in the input and output.
+        num_head_channels (:obj:`int`, *optional*):
+            The number of channels in each head. If None, then `num_heads` = 1.
+        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
+        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        channels: int,
+        num_head_channels: Optional[int] = None,
+        num_groups: int = 32,
+        rescale_output_factor: float = 1.0,
+        eps: float = 1e-5,
+    ):
+        super().__init__()
+        self.batch_size = batch_size
+        self.height = height
+        self.width = width
+        self.channels = channels
+        self.num_heads = (
+            channels // num_head_channels if num_head_channels is not None else 1
+        )
+        self.num_head_size = num_head_channels
+        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
+        self.attention = nn.MultiheadAttention(
+            channels,
+            batch_size,
+            height * width,
+            self.num_heads,
+            qkv_bias=True,
+            has_residual=True,
+            use_mem_eff=True,
+        )
+        self.rescale_output_factor = rescale_output_factor
+
+    def forward(self, hidden_states) -> Tensor:
+        """
+        input hidden_states shape: [batch, height, width, channel]
+        output shape: [batch, height, width, channel]
+        """
+        residual = hidden_states
+
+        # norm
+        hidden_states = self.group_norm(hidden_states)
+
+        hidden_states = reshape()(
+            hidden_states, [self.batch_size, self.height * self.width, self.channels]
+        )
+
+        batch, hw, channel = hidden_states.shape()
+        if (
+            batch.value() != self.batch_size
+            or hw.value() != self.width * self.height
+            or channel.value() != self.channels
+        ):
+            raise RuntimeError(
+                "nchw params do not match! "
+                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
+                f"actual: {batch}, {channel}, {hw}."
+            )
+
+        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
+        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
+
+        return res
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
new file mode 100644
index 000000000..874050eb2
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -0,0 +1,587 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from inspect import isfunction
+from typing import Optional
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+# pylint: disable=W0102
+
+USE_CUDA = detect_target().name() == "cuda"
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class CrossAttention(nn.Module):
+    def __init__(
+        self,
+        query_dim,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        dtype="float16",
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
+        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+
+    def forward(self, x, context=None, mask=None, residual=None):
+        nheads = self.heads
+        d = self.dim_head
+
+        layout = "20314" if USE_CUDA else "m2n3"
+
+        bs, seqlen, _ = get_shape(x)
+        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
+        )
+        context = default(context, x)
+
+        seqlen = get_shape(context)[1]
+        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
+        )
+        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
+        )
+
+        if USE_CUDA:
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
+            )
+        else:
+            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
+            out = OP(
+                (ops.reshape()(q, [bs * nheads, -1, d])),
+                (ops.reshape()(k, [bs * nheads, -1, d])),
+                (ops.reshape()(v, [bs * nheads, -1, d])),
+            )
+        out = ops.reshape()(out, [bs, -1, nheads * d])
+        proj = self.to_out(out)
+        proj = ops.reshape()(proj, [bs, -1, nheads * d])
+        if residual is not None:
+            return proj + residual
+        else:
+            return proj
+
+
+class GEGLU(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
+        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
+
+    def forward(self, x):
+        return self.proj(x, self.gate(x))
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        project_in = (
+            nn.Sequential(
+                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
+            )
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )
+
+        self.net = nn.Sequential(
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x, residual=None):
+        shape = ops.size()(x)
+        x = self.net(x)
+        x = ops.reshape()(x, shape)
+        if residual is not None:
+            return x + residual
+        else:
+            return x
+
+
+class BasicTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
+        super().__init__()
+        self.attn1 = CrossAttention(
+            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )  # is a self-attention
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+
+        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
+
+    def forward(self, x, context=None):
+        x = self.attn1(self.norm1(x), residual=x)
+        x = self.attn2(self.norm2(x), context=context, residual=x)
+        x = self.ff(self.norm3(x), residual=x)
+        return x
+
+
+def Normalize(in_channels):
+    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+
+    def __init__(
+        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)  # Group Norm
+
+        self.proj_in = nn.Conv2dBias(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
+                )
+                for d in range(depth)
+            ]
+        )
+
+        self.proj_out = nn.Conv2dBias(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x, context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, h, w, c = get_shape(x)
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = ops.reshape()(x, [b, -1, c])
+        for block in self.transformer_blocks:
+            x = block(x, context=context)
+        x = ops.reshape()(x, [b, h, w, c])
+        x = self.proj_out(x)
+        return x + x_in
+
+
+class CLIPAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        batch_size=1,
+        seq_len=16,
+        layer_norm_eps=1e-5,
+        hidden_dropout_prob=0.0,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=hidden_dropout_prob,
+            has_residual=False,
+            causal=causal,
+            mask_seq=mask_seq,
+        )
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        residual: Optional[Tensor] = None,
+    ):
+        if residual is not None:
+            self_output = self.attn(hidden_states, residual)
+        else:
+            self_output = self.attn(hidden_states)
+        return self_output
+
+
+class QuickGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
+    """
+
+    def forward(self, x):
+        x1 = x * 1.702
+        x1 = ops.sigmoid(x1)
+        x = x * x1
+        return x
+
+
+class CLIPMLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer="GELU",
+        drop=0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+            specialization="gelu",
+        )
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
+class CLIPEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        mlp_ratio=4.0,
+        batch_size=1,
+        seq_len=16,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.self_attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=0,
+            has_residual=True,
+            causal=causal,
+            mask_seq=mask_seq,
+            use_mem_eff=True,
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.mlp = CLIPMLP(hidden_size, int(hidden_size * mlp_ratio))
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(hidden_states, residual)
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states, residual)
+
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`CLIPEncoderLayer`].
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(
+        self,
+        num_hidden_layers=12,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        hidden_size=768,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    hidden_size=hidden_size,
+                    num_attention_heads=num_attention_heads,
+                    batch_size=batch_size,
+                    seq_len=seq_len,
+                    causal=causal,
+                    mask_seq=mask_seq,
+                )
+                for _ in range(num_hidden_layers)
+            ]
+        )
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        # all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for _, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(hidden_states)
+            hidden_states = layer_outputs
+
+        return hidden_states
+
+
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        vocab_size=49408,
+        max_position_embeddings=77,
+        dtype="float16",
+    ):
+        super().__init__()
+        embed_dim = hidden_size
+
+        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
+        self.position_embedding = nn.Embedding(
+            shape=[max_position_embeddings, embed_dim], dtype=dtype
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        position_ids: Tensor,
+        inputs_embeds: Optional[Tensor] = None,
+    ) -> Tensor:
+
+        input_shape = ops.size()(input_ids)
+
+        # [B * S]
+        input_ids = ops.reshape()(input_ids, [-1])
+
+        position_ids = ops.reshape()(position_ids, [-1])
+
+        if inputs_embeds is None:
+            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
+
+        position_embeddings = ops.batch_gather()(
+            self.position_embedding.tensor(), position_ids
+        )
+
+        embeddings = inputs_embeds + position_embeddings
+
+        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
+
+        return embeddings
+
+
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
+        self.encoder = CLIPEncoder(
+            num_hidden_layers=num_hidden_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            causal=causal,
+            mask_seq=mask_seq,
+        )
+        self.final_layer_norm = nn.LayerNorm(hidden_size)
+
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+        )
+
+        last_hidden_state = encoder_outputs
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+        return last_hidden_state
diff --git a/examples/05_stable_diffusion/src/modeling/embeddings.py b/examples/05_stable_diffusion/src/modeling/embeddings.py
new file mode 100644
index 000000000..36b96a4fb
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/embeddings.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import math
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def get_timestep_embedding(
+    timesteps: Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
+    embeddings. :return: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+
+    exponent = (-math.log(max_period)) * Tensor(
+        shape=[half_dim], dtype="float16", name="arange"
+    )
+
+    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
+
+    emb = ops.exp(exponent)
+    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = ops.concatenate()(
+            [ops.cos(emb), ops.sin(emb)],
+            dim=-1,
+        )
+    else:
+        emb = ops.concatenate()(
+            [ops.sin(emb), ops.cos(emb)],
+            dim=-1,
+        )
+    return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+    def forward(self, sample):
+        sample = self.linear_1(sample)
+        sample = self.linear_2(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(
+        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+
+    def forward(self, timesteps):
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+        )
+        return t_emb
diff --git a/examples/05_stable_diffusion/src/modeling/resnet.py b/examples/05_stable_diffusion/src/modeling/resnet.py
new file mode 100644
index 000000000..03e4f8023
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/resnet.py
@@ -0,0 +1,238 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+class Upsample2D(nn.Module):
+    """
+    An upsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 upsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self,
+        channels,
+        use_conv=False,
+        use_conv_transpose=False,
+        out_channels=None,
+        name="conv",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        conv = None
+        if use_conv_transpose:
+            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(x)
+
+        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if self.use_conv:
+            if self.name == "conv":
+                x = self.conv(x)
+            else:
+                x = self.Conv2d_0(x)
+
+        return x
+
+
+class Downsample2D(nn.Module):
+    """
+    A downsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 downsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if use_conv:
+            conv = nn.Conv2dBias(
+                self.channels, self.out_channels, 3, stride=stride, padding=padding
+            )
+        else:
+            assert self.channels == self.out_channels
+            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.Conv2d_0 = conv
+            self.conv = conv
+        elif name == "Conv2d_0":
+            self.conv = conv
+        else:
+            self.conv = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        x = self.conv(x)
+
+        return x
+
+
+class ResnetBlock2D(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        non_linearity="swish",
+        time_embedding_norm="default",
+        kernel=None,
+        output_scale_factor=1.0,
+        use_nin_shortcut=None,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = nn.GroupNorm(
+            num_groups=groups,
+            num_channels=in_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+
+        self.conv1 = nn.Conv2dBias(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = nn.GroupNorm(
+            num_groups=groups_out,
+            num_channels=out_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2dBias(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        self.upsample = self.downsample = None
+
+        self.use_nin_shortcut = (
+            self.in_channels != self.out_channels
+            if use_nin_shortcut is None
+            else use_nin_shortcut
+        )
+
+        if self.use_nin_shortcut:
+            self.conv_shortcut = nn.Conv2dBias(
+                in_channels, out_channels, 1, 1, 0
+            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
+        else:
+            self.conv_shortcut = None
+
+    def forward(self, x, temb=None):
+        hidden_states = x
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm1(
+            hidden_states
+        )  # .float()).type(hidden_states.dtype) # fused swish
+        # hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            x = self.upsample(x)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            x = self.downsample(x)
+            hidden_states = self.downsample(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            temb = self.time_emb_proj(ops.silu(temb))
+            bs, dim = get_shape(temb)
+            temb = ops.reshape()(temb, [bs, 1, 1, dim])
+            hidden_states = hidden_states + temb
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            x = self.conv_shortcut(x)
+
+        out = hidden_states + x
+
+        return out
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
new file mode 100644
index 000000000..770156ff9
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -0,0 +1,255 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Optional, Tuple, Union
+
+from aitemplate.frontend import nn
+
+from .embeddings import TimestepEmbedding, Timesteps
+from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
+
+
+class UNet2DConditionModel(nn.Module):
+    r"""
+    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int`, *optional*): The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+    """
+
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+    ):
+        super().__init__()
+        self.center_input_sample = center_input_sample
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=norm_num_groups,
+            eps=norm_eps,
+            use_swish=True,
+        )
+
+        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
+
+    def forward(
+        self,
+        sample,
+        timesteps,
+        encoder_hidden_states,
+        return_dict: bool = True,
+    ):
+        """r
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+
+        # 1. time
+        t_emb = self.time_proj(timesteps)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "attentions")
+                and downsample_block.attentions is not None
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states
+        )
+
+        # 5. up
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+
+            if (
+                hasattr(upsample_block, "attentions")
+                and upsample_block.attentions is not None
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
+                )
+
+        # 6. post-process
+        # make sure hidden states is in float32
+        # when running in half-precision
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+        return sample
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
new file mode 100644
index 000000000..7b6e3e6e6
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -0,0 +1,762 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# flake8: noqa
+from aitemplate.compiler import ops
+
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+from .attention import AttentionBlock
+
+from .clip import SpatialTransformer
+from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
+
+# pylint: disable=W0102
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+    downsample_padding=None,
+):
+    down_block_type = (
+        down_block_type[7:]
+        if down_block_type.startswith("UNetRes")
+        else down_block_type
+    )
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
+            )
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+):
+    up_block_type = (
+        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    )
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
+            )
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                SpatialTransformer(
+                    in_channels,
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_upsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+    ):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb=temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UNetMidBlock2D(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if attention_type != "default":
+            raise NotImplementedError(
+                f"attention_type must be default! current value: {attention_type}"
+            )
+
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                AttentionBlock(
+                    batch_size,
+                    height,
+                    width,
+                    in_channels,
+                    num_head_channels=attn_num_head_channels,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    num_groups=resnet_groups,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
new file mode 100644
index 000000000..1cd25aa19
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -0,0 +1,153 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
+"""
+
+from typing import Tuple
+
+from aitemplate.frontend import nn, Tensor
+
+from .unet_blocks import get_up_block, UNetMidBlock2D
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels=3,
+        out_channels=3,
+        up_block_types=("UpDecoderBlock2D",),
+        block_out_channels=(64,),
+        layers_per_block=2,
+        act_fn="silu",
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2dBias(
+            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
+        )
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            batch_size,
+            height,
+            width,
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attn_num_head_channels=None,
+            resnet_groups=32,
+            temb_channels=None,
+        )
+
+        # up
+        self.up_blocks = nn.ModuleList([])
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=None,
+                temb_channels=None,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        num_groups_out = 32
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=num_groups_out,
+            eps=1e-6,
+            use_swish=True,
+        )
+        self.conv_out = nn.Conv2dBias(
+            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
+        )
+
+    def forward(self, z) -> Tensor:
+        sample = z
+        sample = self.conv_in(sample)
+
+        # middle
+        sample = self.mid_block(sample)
+
+        # up
+        for up_block in self.up_blocks:
+            sample = up_block(sample)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class AutoencoderKL(nn.Module):
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        sample_size: int = 32,
+    ):
+        super().__init__()
+        self.decoder = Decoder(
+            batch_size,
+            height,
+            width,
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+        )
+        self.post_quant_conv = nn.Conv2dBias(
+            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def decode(self, z: Tensor, return_dict: bool = True):
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+
+    def forward(self):
+        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
new file mode 100644
index 000000000..ce744bff8
--- /dev/null
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -0,0 +1,410 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import inspect
+
+import os
+import warnings
+from typing import List, Optional, Union
+
+import torch
+from aitemplate.compiler import Model
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipelineOutput,
+    StableDiffusionSafetyChecker,
+)
+
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+class StableDiffusionAITPipeline(StableDiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
+        )
+
+        workdir = "tmp/"
+        self.clip_ait_exe = self.init_ait_module(
+            model_name="CLIPTextModel", workdir=workdir
+        )
+        self.unet_ait_exe = self.init_ait_module(
+            model_name="UNet2DConditionModel", workdir=workdir
+        )
+        self.vae_ait_exe = self.init_ait_module(
+            model_name="AutoencoderKL", workdir=workdir
+        )
+
+    def init_ait_module(
+        self,
+        model_name,
+        workdir,
+    ):
+        mod = Model(os.path.join(workdir, model_name, "test.so"))
+        return mod
+
+    def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
+        exe_module = self.unet_ait_exe
+        timesteps_pt = timesteps.expand(latent_model_input.shape[0])
+        inputs = {
+            "input0": latent_model_input.permute((0, 2, 3, 1))
+            .contiguous()
+            .cuda()
+            .half(),
+            "input1": timesteps_pt.cuda().half(),
+            "input2": encoder_hidden_states.cuda().half(),
+        }
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
+        return noise_pred
+
+    def clip_inference(self, input_ids, seqlen=64):
+        exe_module = self.clip_ait_exe
+        bs = input_ids.shape[0]
+        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
+        inputs = {
+            "input0": input_ids,
+            "input1": position_ids,
+        }
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        return ys[0].float()
+
+    def vae_inference(self, vae_input):
+        exe_module = self.vae_ait_exe
+        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        vae_out = ys[0].permute((0, 3, 1, 2)).float()
+        return vae_out
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if "torch_device" in kwargs:
+            device = kwargs.pop("torch_device")
+            warnings.warn(
+                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
+                " Consider using `pipe.to(torch_device)` instead."
+            )
+
+            # Set device as before (to be removed in 0.3.0)
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.to(device)
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=64,  # self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            max_length = text_input.input_ids.shape[-1]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.clip_inference(
+                uncond_input.input_ids.to(self.device)
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_device = "cpu" if self.device.type == "mps" else self.device
+        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+        if latents is None:
+            latents = torch.randn(
+                latents_shape,
+                generator=generator,
+                device=latents_device,
+            )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(
+                    f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
+                )
+        latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(
+            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
+        )
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                sigma = self.scheduler.sigmas[i]
+                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+            # predict the noise residual
+            noise_pred = self.unet_inference(
+                latent_model_input, t, encoder_hidden_states=text_embeddings
+            )
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+
+            # compute the previous noisy sample x_t -> x_t-1
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = self.scheduler.step(
+                    noise_pred, i, latents, **extra_step_kwargs
+                ).prev_sample
+            else:
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs
+                ).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae_inference(latents)
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        # run safety checker
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="pt"
+            ).to(self.device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
new file mode 100644
index 000000000..592260981
--- /dev/null
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
@@ -0,0 +1,403 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# flakes8: noqa
+import inspect
+import os
+from typing import List, Optional, Union
+
+import numpy as np
+
+import PIL
+import torch
+from aitemplate.compiler import Model
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipelineOutput,
+    StableDiffusionSafetyChecker,
+)
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class StableDiffusionImg2ImgAITPipeline(StableDiffusionImg2ImgPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
+        )
+        # scheduler = scheduler.set_format("pt")
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+        workdir = "tmp/"
+        self.clip_ait_exe = self.init_ait_module(
+            model_name="CLIPTextModel", workdir=workdir
+        )
+        self.unet_ait_exe = self.init_ait_module(
+            model_name="UNet2DConditionModel", workdir=workdir
+        )
+        self.vae_ait_exe = self.init_ait_module(
+            model_name="AutoencoderKL", workdir=workdir
+        )
+
+    def init_ait_module(
+        self,
+        model_name,
+        workdir,
+    ):
+        mod = Model(os.path.join(workdir, model_name, "test.so"))
+        return mod
+
+    def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
+        exe_module = self.unet_ait_exe
+        timesteps_pt = timesteps.expand(latent_model_input.shape[0])
+        inputs = {
+            "input0": latent_model_input.permute((0, 2, 3, 1))
+            .contiguous()
+            .cuda()
+            .half(),
+            "input1": timesteps_pt.cuda().half(),
+            "input2": encoder_hidden_states.cuda().half(),
+        }
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
+        return noise_pred
+
+    def clip_inference(self, input_ids, seqlen=64):
+        exe_module = self.clip_ait_exe
+        bs = input_ids.shape[0]
+        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
+        inputs = {
+            "input0": input_ids,
+            "input1": position_ids,
+        }
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        return ys[0].float()
+
+    def vae_inference(self, vae_input):
+        exe_module = self.vae_ait_exe
+        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
+        ys = []
+        num_ouputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_ouputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        vae_out = ys[0].permute((0, 3, 1, 2)).float()
+        return vae_out
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if strength < 0 or strength > 1:
+            raise ValueError(
+                f"The value of strength should in [0.0, 1.0] but is {strength}"
+            )
+
+        # set timesteps
+        accepts_offset = "offset" in set(
+            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
+        )
+        extra_set_kwargs = {}
+        offset = 0
+        if accepts_offset:
+            offset = 1
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        # encode the init image into latents and scale the latents
+        init_latent_dist = self.vae.encode(init_image.to(self.device)).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        # expand init_latents for batch_size
+        init_latents = torch.cat([init_latents] * batch_size)
+
+        # get the original timestep using init_timestep
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            timesteps = torch.tensor(
+                [num_inference_steps - init_timestep] * batch_size,
+                device=self.device,
+            ).to(dtype=torch.long)
+        else:
+            timesteps = self.scheduler.timesteps[-init_timestep]
+            timesteps = torch.tensor([timesteps] * batch_size, device=self.device).to(
+                dtype=torch.long
+            )
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=self.device)
+        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps).to(
+            self.device
+        )
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=64,  # self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                [""] * batch_size,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.clip_inference(
+                uncond_input.input_ids.to(self.device)
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps[t_start:])):
+            t_index = t_start + i
+
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            )
+
+            # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                sigma = self.scheduler.sigmas[t_index]
+                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+                latent_model_input = latent_model_input.to(self.unet.dtype)
+                t = t.to(self.unet.dtype)
+
+            # predict the noise residual
+            noise_pred = self.unet_inference(
+                latent_model_input, t, encoder_hidden_states=text_embeddings
+            )
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+
+            # compute the previous noisy sample x_t -> x_t-1
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = self.scheduler.step(
+                    noise_pred, t_index, latents, **extra_step_kwargs
+                ).prev_sample
+            else:
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs
+                ).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae_inference(latents)
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        # run safety checker
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="pt"
+            ).to(self.device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )
diff --git a/examples/05_stable_diffusion/src/test_correctness.py b/examples/05_stable_diffusion/src/test_correctness.py
new file mode 100644
index 000000000..d16f5fcfa
--- /dev/null
+++ b/examples/05_stable_diffusion/src/test_correctness.py
@@ -0,0 +1,137 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+import unittest
+
+import torch
+from diffusers import StableDiffusionPipeline
+
+try:
+    from libfb.py.asyncio.await_utils import await_sync
+    from manifold.clients.python import ManifoldClient
+except ImportError:
+    ManifoldClient = None
+
+from .benchmark import benchmark_clip, benchmark_unet, benchmark_vae
+from .compile_lib.compile_clip import compile_clip
+from .compile_lib.compile_unet import compile_unet
+from .compile_lib.compile_vae import compile_vae
+
+
+class StableDiffusionVerification(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(StableDiffusionVerification, self).__init__(*args, **kwargs)
+
+        self.local_path = "/tmp/aitemplate_stablediffusion_v2"
+        os.makedirs(self.local_path, exist_ok=True)
+
+        try:
+            pipe = StableDiffusionPipeline.from_pretrained(
+                self.local_path, revision="fp16", torch_dtype=torch.float16
+            ).to("cuda")
+        except OSError:
+            if ManifoldClient is not None:
+                with ManifoldClient.get_client(bucket="aitemplate") as client:
+                    await_sync(
+                        client.getRecursive(
+                            manifold_path="tree/stable_diffusion/v2",
+                            local_path=self.local_path,
+                        )
+                    )
+
+                pipe = StableDiffusionPipeline.from_pretrained(
+                    self.local_path, revision="fp16", torch_dtype=torch.float16
+                ).to("cuda")
+            else:
+                pipe = StableDiffusionPipeline.from_pretrained(
+                    "stabilityai/stable-diffusion-2",
+                    revision="fp16",
+                    torch_dtype=torch.float16,
+                    use_auth_token=os.environ.get("HUGGINGFACE_AUTH_TOKEN", True),
+                ).to("cuda")
+                pipe.save_pretrained(self.local_path)
+
+        self.pt_unet = pipe.unet
+        self.pt_vae = pipe.vae
+        self.pt_clip = pipe.text_encoder
+        self.tokenizer = pipe.tokenizer
+
+        self.vae_config = {
+            "batch_size": 1,
+            "width": 64,
+            "height": 64,
+        }
+
+        self.unet_config = {
+            "batch_size": 2,
+            "dim": 320,
+            "hidden_dim": 1024,
+            "width": 64,
+            "height": 64,
+        }
+
+        self.clip_config = {
+            "batch_size": 1,
+            "seqlen": 64,
+        }
+
+        self.clip_compile_extra_config = {
+            "dim": 1024,
+            "num_heads": 16,
+        }
+
+    def test_vae(self):
+        compile_vae(
+            self.pt_vae,
+            use_fp16_acc=False,
+            convert_conv_to_gemm=True,
+            **self.vae_config,
+        )
+        benchmark_vae(
+            self.pt_vae,
+            benchmark_pt=False,
+            verify=True,
+            **self.vae_config,
+        )
+
+    def test_unet(self):
+        compile_unet(
+            self.pt_unet,
+            use_fp16_acc=False,
+            convert_conv_to_gemm=True,
+            **self.unet_config,
+        )
+        benchmark_unet(
+            self.pt_unet,
+            benchmark_pt=False,
+            verify=True,
+            **self.unet_config,
+        )
+
+    def test_clip(self):
+        compile_clip(
+            self.pt_clip,
+            use_fp16_acc=False,
+            convert_conv_to_gemm=True,
+            **self.clip_config,
+            **self.clip_compile_extra_config,
+        )
+        benchmark_clip(
+            self.pt_clip,
+            benchmark_pt=False,
+            verify=True,
+            tokenizer=self.tokenizer,
+            **self.clip_config,
+        )
diff --git a/examples/06_how_to_add_an_op/how_to_add_an_op.py b/examples/06_how_to_add_an_op/how_to_add_an_op.py
index cd1646aeb..4e0087cd9 100644
--- a/examples/06_how_to_add_an_op/how_to_add_an_op.py
+++ b/examples/06_how_to_add_an_op/how_to_add_an_op.py
@@ -71,15 +71,19 @@ def gen_function(self) -> str:
 
 {{func_signature}}
 {
-    invoke_add_one(output, input, num_elements, stream);
+    invoke_add_one(
+        static_cast<{{elem_type}}*>(output),
+        static_cast<const {{elem_type}}*>(input),
+        num_elements,
+        stream);
 }
     """
 )
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(half* output,
-                   const half* input,
+void {{func_name}}(void* output,
+                   const void* input,
                    const int64_t num_elements,
                    {{prefix}}Stream_t stream)
     """
@@ -108,14 +112,14 @@ def gen_function(self) -> str:
 
 KERNEL_TEMPLATE = jinja2.Template(
     """
-__global__ void add_one(half* output, const half* input, const int64_t num_elements) {
+__global__ void add_one({{elem_type}}* output, const {{elem_type}}* input, const int64_t num_elements) {
   const int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_elements) {
-    output[idx] = input[idx] + half(1.0);
+    output[idx] = input[idx] + {{elem_type}}(1.0);
   }
 }
 
-void invoke_add_one(half* output, const half* input, int64_t num_elements, {{prefix}}Stream_t stream) {
+void invoke_add_one({{elem_type}}* output, const {{elem_type}}* input, int64_t num_elements, {{prefix}}Stream_t stream) {
   if (num_elements < 1024) {
     dim3 grid(1);
     dim3 block(num_elements);
@@ -130,22 +134,12 @@ def gen_function(self) -> str:
 )
 
 
-FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
-    """reinterpret_cast<half*>(
-        {% if is_cuda %}&({% endif %}{{name}}{% if is_cuda %}->raw()){% endif %})"""
-)
-
-
-def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) -> str:
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 1
 
-    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"], is_cuda=is_cuda
-    )
-    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
-        name=func_attrs["inputs"][0]._attrs["name"], is_cuda=is_cuda
-    )
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+    input_name = func_attrs["inputs"][0]._attrs["name"]
 
     dim_names = [dim._attrs["name"] for dim in func_attrs["inputs"][0].shape()]
     return FUNC_CALL_TEMPLATE.render(
@@ -158,10 +152,20 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
 
 
 def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str:
+    input_x = func_attrs["inputs"][0]
+    output_y = func_attrs["outputs"][0]
+    input_type = backend_spec.dtype_to_backend_type(input_x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(output_y._attrs["dtype"])
+
+    if input_type != output_type:
+        raise NotImplementedError("input type must equal to output type")
+
     prefix = backend_spec.prefix
+
     return FUNC_TEMPLATE.render(
         header_files=header_files,
-        kernel=KERNEL_TEMPLATE.render(prefix=prefix),
+        elem_type=input_type,
+        kernel=KERNEL_TEMPLATE.render(prefix=prefix, elem_type=input_type),
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"], prefix=prefix
         ),
@@ -194,7 +198,7 @@ def cuda_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str:
 
 @registry.reg("cuda.add_one.func_call")
 def cuda_add_one_gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
-    return gen_function_call(func_attrs, indent, is_cuda=True)
+    return gen_function_call(func_attrs, indent)
 
 
 HIP_HEADER_FILES = """
@@ -215,7 +219,7 @@ def rocm_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str:
 
 @registry.reg("rocm.add_one.func_call")
 def rocm_add_one_gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
-    return gen_function_call(func_attrs, indent, is_cuda=False)
+    return gen_function_call(func_attrs, indent)
 
 
 def create_ait_model(shapes):
diff --git a/fx2ait/CMakeLists.txt b/fx2ait/CMakeLists.txt
new file mode 100644
index 000000000..b0de91aa7
--- /dev/null
+++ b/fx2ait/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+
+project(ait_model)
+find_package(Torch REQUIRED)
+
+include_directories(
+   ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/picojson
+)
+
+# Define our library target
+set(CMAKE_CXX_STANDARD 17)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../static/include)
+add_library(ait_model SHARED
+  ${CMAKE_CURRENT_SOURCE_DIR}/fx2ait/csrc/AITModel.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/fx2ait/csrc/AITModelImpl.cpp
+)
+
+# Link against LibTorch
+target_link_libraries(ait_model "${TORCH_LIBRARIES}")
diff --git a/fx2ait/README.md b/fx2ait/README.md
new file mode 100644
index 000000000..9443aa197
--- /dev/null
+++ b/fx2ait/README.md
@@ -0,0 +1,47 @@
+# FX2AIT for AITemplate
+
+
+FX2AIT is an python based tool that transforms PyTorch model into AITempate(AIT) engine for lightning-fast inference serving.
+AITLowerer built on top of FX2AIT is able to perform AIT conversion on PyTorch model with AIT unsupported operators. Model can enjoy partial AIT acceleration using AITLowerer.
+
+FX2AIT highlights include:
+
+- Automatic Conversion: FX2AIT only need PyTorch model and input as input for conversion. The output can be used for inference serving directly.
+- Expanded Support: AITemplate doesn't cover all operators PyTorch provides. FX2AIT provided AITLowerer as solution to support patial AIT conversion for models with AIT unsupportted operators. For more information, please check example/03_lowering_split.
+
+## Installalation
+
+**Hardware requirement:**
+  - **NVIDIA**: FX2AIT is base on AIT, thus the hardware requirement is same as AIT. AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
+### From Source
+The following command will create a Python wheel for AITemplate. Please ensure you have correct CUDA compiler installed.
+- CUDA: CUDA 11.6
+- cuDNN: v8.7.0 for CUDA 11.x
+  download source: https://developer.nvidia.com/rdp/cudnn-download
+  installation guidance: https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html
+
+*Incorrect compiler will lead performance regression.*
+
+```
+cd fx2ait
+python setup.py install
+```
+
+### Docker Image
+We highly recommend using AITemplate with Docker to avoid accidentally using a wrong version of NVCC or HIPCC.
+- CUDA: `./docker/build.sh cuda`
+
+This will build a docker image with tag `ait:latest`.
+
+## Examples
+AITemplate provides the following getting started tutorials:
+- 01: [How to inference a PyTorch Transformer model with FX2AIT](fx2ait/example/01_transformer_model/)
+- 02: [How to inference a PyTorch vision model with FX2AIT](fx2ait/example/02_vision_model/)
+- 03: [How to inference a general PyTorch model with AIT unsupported operator using AIT Lowerer](fx2ait/example/03_lowering_split/)
+### Run Example and Test
+Example command:
+```
+cd fx2ait
+python example/03_lowering_split/test_lower.py
+python test/test_ait_lower.py
+```
diff --git a/fx2ait/fx2ait/TARGETS b/fx2ait/fx2ait/TARGETS
new file mode 100644
index 000000000..c247fbd11
--- /dev/null
+++ b/fx2ait/fx2ait/TARGETS
@@ -0,0 +1,41 @@
+# @noautodeps
+load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
+load("@fbsource//tools/build_defs:glob_defs.bzl", "glob")
+
+oncall("aitemplate")
+
+# Note that we exclude common acc_tracer python files here and will reuse
+# those in torch_tensorrt/fx/tracer/acc_tracer/
+python_library(
+    name = "fx2ait",
+    srcs = glob(
+        [
+            "converters/*.py",
+            "*.py",
+            "passes/*.py",
+            "tools/*.py",
+        ] + [
+            "acc_tracer/ait_acc_normalizer.py",
+            "acc_tracer/ait_acc_ops_registry.py",
+            "acc_tracer/ait_acc_ops.py",
+        ],
+        exclude = [
+            "cache.py",
+        ],
+    ),
+    base_module = "fx2ait",
+    deps = [
+        "fbsource//third-party/pypi/graphviz:graphviz",
+        "fbsource//third-party/pypi/numpy:numpy",
+        "fbsource//third-party/pypi/pydot:pydot",
+        "//aitemplate/AITemplate/fx2ait/fx2ait/fb:acc_import_helper",
+        "//aitemplate/AITemplate/fx2ait/fx2ait/fb/lower:ait_lowering_setting",
+        "//aitemplate/AITemplate/python/aitemplate:aitemplate",
+        "//caffe2:torch",
+        "//deeplearning/ait:AITModel",
+        "//executorch/exir:graph_module",
+        "//executorch/exir:lib",
+        "//executorch/exir:tracer",
+        "//pytorch/vision:torchvision",
+    ],
+)
diff --git a/fx2ait/fx2ait/__init__.py b/fx2ait/fx2ait/__init__.py
new file mode 100644
index 000000000..d2ac413d0
--- /dev/null
+++ b/fx2ait/fx2ait/__init__.py
@@ -0,0 +1,29 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import sys
+
+from . import acc_tracer, converters  # noqa
+
+if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
+    PY3STATEMENT = "The minimal Python requirement is Python 3.7"
+    raise Exception(PY3STATEMENT)
+
+__all__ = [
+    "acc_tracer",
+    "converters",
+    "core",
+    "lower",
+    "test",
+]
diff --git a/fx2ait/fx2ait/acc_tracer/__init__.py b/fx2ait/fx2ait/acc_tracer/__init__.py
new file mode 100644
index 000000000..024b7058e
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/__init__.py
@@ -0,0 +1,43 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import sys
+
+from . import (  # noqa
+    acc_normalizer,
+    acc_op_properties,
+    acc_ops,
+    acc_shape_prop,
+    acc_tracer,
+    acc_utils,
+    ait_acc_normalizer,
+    ait_acc_ops,
+    ait_acc_ops_registry,
+)
+
+if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
+    PY3STATEMENT = "The minimal Python requirement is Python 3.7"
+    raise Exception(PY3STATEMENT)
+
+__all__ = [
+    "acc_normalizer",
+    "acc_op_properties",
+    "acc_ops",
+    "acc_shape_prop",
+    "acc_tracer",
+    "acc_utils",
+    "ait_acc_normalizer",
+    "ait_acc_ops_registry",
+    "ait_acc_ops",
+]
diff --git a/fx2ait/fx2ait/acc_tracer/acc_normalizer.py b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
new file mode 100644
index 000000000..55cb39d4a
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
@@ -0,0 +1,465 @@
+import inspect
+import logging
+import re
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union
+
+import torch
+import torch.fx
+from torch.fx.node import _get_qualified_name
+
+from . import acc_utils
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+# Need to keep up-to-date with https://fburl.com/codesearch/7r2hhh53
+ALIAS_MAP = {
+    "input": ("input", "x", "a", "x1"),
+    "dim": ("dim", "axis"),
+    "keepdim": ("keepdim", "keepdims"),
+    "other": ("other", "x2"),
+}
+
+# Type used for arg replacement tuples. The list represents the argument signature of
+# some callable. Each item in the list is a tuple, where for each member of a tuple:
+# - The first member is union of either:
+#   - A tuple of all potential alias kwarg str names of the source signature, or
+#   - A tuple of a single str representing the single kwarg name allowed.
+# - The second member is the str name of the kwarg to map it to. This is either from the
+#   signature of the acc_op, or for custom mapped nodes from the original unnormalized op.
+# - The third member is a bool representing whether this arg is optional, i.e. whether it
+#   is allowed to not be present in the original input args.
+ArgReplacementTuplesType = List[Tuple[Tuple[str, ...], str, bool]]
+
+
+class NormalizationInfo(NamedTuple):
+    """
+    Holds normalization info for some FX node, where the FX node will be mapped either
+    via new_fn_target and arg_replacement_tuples, or via custom_mapping_fn.
+
+    If via new_fn_target and arg_replacement_tuples:
+      - new_fn_target is the target function to replace the original node with
+        (generally some function from acc_ops).
+
+      - arg_replacement_tuples describes how to map the original FX node's args/kwargs to
+        the new FX node. If set to None, then the kwargs are copied directly from the
+        original FX node. Else, this is list of three-member tuples, where each tuple
+        represents a mapping from either an arg or kwarg in the original FX node to the
+        kwarg it should be mapped to. If for ops registered with `register_acc_op` then
+        this is a mapping to the the new FX node for the acc_op. Otherwise it is for some
+        op registered with `register_custom_acc_mapper_fn`, in which case this is a
+        mapping for the original input node so its args are normalized to kwargs before
+        being custom normalized to acc_ops. The third member of the tuple is a bool
+        representing whether this argument is optional; if False and the arg is not
+        present then an assertion will be thrown. The index of the tuple indicates where
+        the original arg is in node.args and the string name indicates which original
+        kwarg it is.
+
+    If via custom_mapping_fn, then custom_mapping_fn is some function that takes the
+    original FX node as input and returns the FX node that should replace it. This means
+    it was registered via `register_custom_acc_mapper_fn`.
+    """
+
+    new_fn_target: Callable
+    arg_replacement_tuples: Optional[ArgReplacementTuplesType]
+    custom_mapping_fn: Optional[Callable]
+    # either (tensor_meta_field_name, original_field_name, move_to_qparams) or
+    # (tensor_meta_field_name, orginal_field_name)
+    # when move_to_qparams is True, we'll move the field to qparams
+    # dictionary, otherwise it will stay in TensorMeta itself
+    kwargs_to_move_to_acc_out_ty: Optional[
+        List[Union[Tuple[str, str, bool], Tuple[str, str]]]
+    ]
+    needs_shapes_for_normalization: bool
+
+
+# Dict from (op, target) to NormalizationInfo for that op.
+_normalization_dict: Dict[Tuple[str, Union[str, Callable]], NormalizationInfo] = {}
+
+# Set of all the acc ops.
+_acc_ops: Set[Callable] = set()
+
+
+def _insert_fun(
+    op_and_target: Tuple[str, Union[str, Callable]],
+    arg_replacement_tuples: List[Tuple],
+    new_fn_target: Optional[Callable] = None,
+    custom_mapping_fn: Optional[Callable] = None,
+    kwargs_to_move_to_acc_out_ty: Optional[
+        List[Union[Tuple[str, str, bool], Tuple[str, str]]]
+    ] = None,
+    needs_shapes_for_normalization=False,
+    allow_normalize_from_torch_package=False,
+):
+    if op_and_target[0] == "call_function":
+        assert callable(op_and_target[1])
+    elif op_and_target[0] == "call_method":
+        assert isinstance(op_and_target[1], str)
+    elif op_and_target[0] == "call_module":
+        assert isinstance(op_and_target[1], type)
+
+    # Finalize arg replacement tuples.
+    # 1. Check to see if they have the `is_optional` bool, and if not defaulting it to
+    #   False.
+    # 2. Some kwargs might have aliases. e.g. "a", "x" and "x1" are aliases of "input".
+    #   Here we replace `orig_kwarg` with a tuple of all aliases if it has aliases.
+    final_arg_replacement_tuples = []
+    for arg_replacement_tuple in arg_replacement_tuples:
+        if len(arg_replacement_tuple) == 2:
+            orig_kwarg, new_kwarg, is_optional = *arg_replacement_tuple, False
+        else:
+            assert len(arg_replacement_tuple) == 3
+            orig_kwarg, new_kwarg, is_optional = arg_replacement_tuple
+
+        if not isinstance(orig_kwarg, tuple):
+            orig_kwarg = (orig_kwarg,)
+
+        # Use set to avoid duplicates.
+        orig_kwarg_set = set(orig_kwarg)
+
+        for k in orig_kwarg:
+            if k in ALIAS_MAP:
+                orig_kwarg_set.update(ALIAS_MAP[k])
+        final_arg_replacement_tuples.append(
+            (tuple(orig_kwarg_set), new_kwarg, is_optional)
+        )
+
+    assert op_and_target not in _normalization_dict.keys()
+    norm_info = NormalizationInfo(
+        new_fn_target=new_fn_target,  # type: ignore[arg-type]
+        arg_replacement_tuples=final_arg_replacement_tuples,
+        custom_mapping_fn=custom_mapping_fn,
+        kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty,
+        needs_shapes_for_normalization=needs_shapes_for_normalization,
+    )
+    _normalization_dict[op_and_target] = norm_info
+
+    # If allow_normalize_from_torch_package then add another entry to
+    # _normalization_dict where we look for the qualified name of the target with the
+    # torch_package module prefix. Note that we leave off any integer at the end of
+    # "<torch_package_>" in order to allow for whatever mangling index is used.
+    if allow_normalize_from_torch_package:
+        torch_package_op_and_target = (
+            op_and_target[0],  # type: ignore[]
+            f"<torch_package_>.{_get_qualified_name(op_and_target[1])}",  # type: ignore[arg-type]
+        )
+        _normalization_dict[torch_package_op_and_target] = norm_info
+
+
+def _get_dup_signature_tuples(fn: Callable) -> List[Tuple[str, str]]:
+    """
+    Helper that inspects the arg signature of `fn` and returns a list of tuples, where
+    each tuple is a pair of duplicated names which is used for arg_replacement_tuples.
+    """
+    sig_tuples: List[Tuple[str, str]] = []
+    for param in inspect.signature(inspect.unwrap(fn)).parameters:
+        sig_tuples.append((param, param))
+    return sig_tuples
+
+
+def register_acc_op(acc_op: Callable):
+    """
+    For a new acc op, add this as decorator to register it.
+    """
+    _acc_ops.add(acc_op)
+    return acc_op
+
+
+def register_acc_op_mapping(
+    op_and_target: Tuple[str, Union[str, Callable]],
+    arg_replacement_tuples: Optional[
+        List[
+            Union[
+                Tuple[Union[str, Tuple[str, ...]], str],
+                Tuple[Union[str, Tuple[str, ...]], str, bool],
+            ]
+        ]
+    ] = None,
+    kwargs_to_move_to_acc_out_ty: Optional[
+        List[Union[Tuple[str, str, bool], Tuple[str, str]]]
+    ] = None,
+    allow_normalize_from_torch_package=False,
+):
+    """
+    Use this decorator to map a non-acc operator to an acc operator.
+
+    Args:
+        op_and_target: A tuple that contains op and target of the node that represents the non-acc operator.
+        arg_replacement_tuples: Please refer to the comment on above for `ArgReplacementTuplesType`.
+        kwargs_to_move_to_acc_out_ty: The kwargs we want to move out from the non-acc op kwargs to acc_out_ty.
+    """
+
+    def insert(new_fn_target: Callable):
+        # If arg_replacement_tuples is None then assume we use the same signature for
+        # the acc_op and the original op.
+        if arg_replacement_tuples is None:
+            final_arg_replacement_tuples = _get_dup_signature_tuples(new_fn_target)
+        else:
+            final_arg_replacement_tuples = arg_replacement_tuples  # type: ignore[assignment]
+
+        _insert_fun(
+            op_and_target=op_and_target,
+            new_fn_target=new_fn_target,
+            arg_replacement_tuples=final_arg_replacement_tuples,  # type: ignore[arg-type]
+            kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty,
+            allow_normalize_from_torch_package=allow_normalize_from_torch_package,
+        )
+        return new_fn_target
+
+    return insert
+
+
+def register_custom_acc_mapper_fn(
+    op_and_target: Tuple[str, Union[str, Callable]],
+    arg_replacement_tuples: List[
+        Union[
+            Tuple[Union[str, Tuple[str, ...]], str],
+            Tuple[Union[str, Tuple[str, ...]], str, bool],
+        ]
+    ],
+    needs_shapes_for_normalization=False,
+    allow_normalize_from_torch_package=False,
+):
+    def insert(custom_mapping_fn: Callable):
+        _insert_fun(
+            op_and_target=op_and_target,
+            custom_mapping_fn=custom_mapping_fn,
+            arg_replacement_tuples=arg_replacement_tuples,  # type: ignore[arg-type]
+            needs_shapes_for_normalization=needs_shapes_for_normalization,
+            allow_normalize_from_torch_package=allow_normalize_from_torch_package,
+        )
+        return custom_mapping_fn
+
+    return insert
+
+
+def move_kwargs_to_acc_out_ty(
+    node_or_normalization_info: Union[NormalizationInfo, torch.fx.Node],
+    new_kwargs: Dict[str, Any],
+):
+    """
+    Given `node_or_normalization_info` which is either NormalizationInfo for a node, or
+    a node to fetch NormalizationInfo for, check if kwargs_to_move_to_acc_out_ty exists
+    in the NormalizationInfo, and if so perform the move of kwargs to acc_out_ty.
+    """
+
+    if isinstance(node_or_normalization_info, torch.fx.Node):
+        node = node_or_normalization_info
+        normalization_info = _normalization_dict.get((node.op, node.target))
+    else:
+        assert isinstance(node_or_normalization_info, NormalizationInfo)
+        normalization_info = node_or_normalization_info
+
+    assert normalization_info is not None
+    if normalization_info.kwargs_to_move_to_acc_out_ty is None:
+        return
+
+    assert acc_utils.is_acc_op_with_kwarg(
+        normalization_info.new_fn_target, "acc_out_ty"
+    )
+
+    # Build a dict representing the new TensorMetadata to use for acc_out_ty,
+    # and then remove the kwarg from the new_kwargs since it's passed in via
+    # acc_out_ty instead.
+    tmd_dict: Dict[str, Any] = {}
+    qparams: Dict[str, Any] = {}
+
+    for kwarg_replacement_tuple in normalization_info.kwargs_to_move_to_acc_out_ty:
+        if len(kwarg_replacement_tuple) == 2:
+            orig_kwarg_name, tmd_field_name, move_to_qparams = *kwarg_replacement_tuple, False  # type: ignore[misc]
+        else:
+            assert len(kwarg_replacement_tuple) == 3
+            orig_kwarg_name, tmd_field_name, move_to_qparams = kwarg_replacement_tuple  # type: ignore[misc]
+        if move_to_qparams:
+            qparams[tmd_field_name] = new_kwargs[orig_kwarg_name]
+        else:
+            tmd_dict[tmd_field_name] = new_kwargs[orig_kwarg_name]
+        del new_kwargs[orig_kwarg_name]
+
+    tmd_dict["qparams"] = qparams
+    # Note: allow_partial_spec here because we are only using the tensor metadata tuple
+    # here to pass specific values into the function. For example, for quantization we
+    # only need to provide qparams dictionary, but is_quantized is
+    # not passed in.
+    new_kwargs["acc_out_ty"] = acc_utils.build_raw_tensor_meta(**tmd_dict)
+
+
+def get_normalized_kwargs(
+    node: torch.fx.Node, arg_replacement_tuples: ArgReplacementTuplesType
+):
+    new_kwargs = {}
+    final_arg_is_varg = False
+    for i, replacement_tuple in enumerate(arg_replacement_tuples):
+        orig_kwargs_names, new_kwarg_name, is_optional = replacement_tuple
+
+        # Check if this is a varg and if so break/process the rest outside the loop.
+        if "*" in orig_kwargs_names:
+            assert len(orig_kwargs_names) == 1
+            assert i == len(arg_replacement_tuples) - 1
+            final_arg_is_varg = True
+            break
+
+        # If nothing is found in node.kwargs it means the kwarg is in node.arg
+        # or it's optional. In this case, we set orig_kwargs_name to None.
+        assert isinstance(orig_kwargs_names, tuple)
+        orig_kwargs_name = next(
+            (key for key in orig_kwargs_names if key in node.kwargs),
+            None,
+        )
+
+        # If can't find in node.kwargs then it should be in the i index
+        # of node.args.
+        if orig_kwargs_name is None:
+            if i < len(node.args):
+                new_kwargs[new_kwarg_name] = node.args[i]
+            else:
+                # Verify the arg we're trying to normalize was optional.
+                assert (
+                    is_optional
+                ), f"Cannot normalize {orig_kwargs_names} to {new_kwarg_name} for {node.name}"
+        else:
+            new_kwargs[new_kwarg_name] = node.kwargs[orig_kwargs_name]
+
+    # If using var args then process the rest of the args now.
+    if final_arg_is_varg:
+        var_arg_idx = len(arg_replacement_tuples) - 1
+        new_kwarg_name = arg_replacement_tuples[var_arg_idx][1]
+        rest_of_args = []
+        for i in range(var_arg_idx, len(node.args)):
+            rest_of_args.append(node.args[i])
+        new_kwargs[new_kwarg_name] = rest_of_args
+
+    return new_kwargs
+
+
+def normalize(
+    mod: torch.fx.GraphModule,
+    expect_nodes_have_shapes: bool = False,
+    acc_normalization_block_list: Optional[
+        Set[Tuple[str, Union[str, Callable]]]
+    ] = None,
+):
+    assert len(_normalization_dict) > 0
+    graph = mod.graph
+    if acc_normalization_block_list is None:
+        acc_normalization_block_list = set()
+
+    # For "call_module" node we return _base_class_origin if it's a
+    # RewrittenModule, otherwise, return its type. For other nodes,
+    # we return node.target.
+    def get_target(mod: torch.fx.GraphModule, node: torch.fx.Node):
+        if node.op != "call_module":
+            return node.target
+
+        # Find the module that node.target points to
+        m = dict(mod.named_modules())[node.target]
+        return getattr(m, "_base_class_origin", type(m))
+
+    def normalize_to_acc_op(
+        node: torch.fx.Node,
+        normalization_info: NormalizationInfo,
+        normalized_args: Tuple[Any, ...],
+        normalized_kwargs: Dict[str, Any],
+    ):
+        # If there's a custom mapping function then use it.
+        if normalization_info.custom_mapping_fn is not None:
+            # For custom mapping, the normalized_kwargs are used for the original op,
+            # i.e. *before* custom acc_ops normalization. Do that now.
+            node.args = normalized_args
+            node.kwargs = normalized_kwargs
+            new_node = normalization_info.custom_mapping_fn(node, mod)
+            # If a new node is returned then use it to replace the old node. Otherwise
+            # the custom mapping function did its own replacement, so return early.
+            if new_node is None:
+                return
+        else:
+            # If there's kwargs_to_move_to_acc_out_ty then use it to setup acc_out_ty in
+            # normalized_kwargs, and remove the kwarg from normalized_kwargs.
+            move_kwargs_to_acc_out_ty(normalization_info, normalized_kwargs)
+
+            # All acc ops are functions. Create a call to the correct acc_ops target using
+            # the normalized kwargs provided.
+            with graph.inserting_before(node):
+                new_node = graph.create_node(
+                    "call_function",
+                    normalization_info.new_fn_target,
+                    args=normalized_args,
+                    kwargs=normalized_kwargs,
+                    name=node.name,
+                )
+                new_node.meta = node.meta.copy()
+
+        # Finally replace the original node with the normalized node.
+        node.replace_all_uses_with(new_node)
+        graph.erase_node(node)
+
+        # Don't wrap the acc_op node just because the original node was wrapped.
+        if "is_wrapped" in new_node.meta:
+            del new_node.meta["is_wrapped"]
+
+    for node in graph.nodes:
+        if node.op in {"placeholder", "get_attr", "output"}:
+            continue
+
+        op_and_target = (node.op, get_target(mod, node))
+
+        if op_and_target in acc_normalization_block_list:
+            continue
+
+        normalization_info = _normalization_dict.get(op_and_target)
+
+        # Also check if the torch_packaged version of the op was specified to be normalized.
+        if normalization_info is None and node.op == "call_function":
+            # Strip off the mangle_index suffix here before checking the map.
+            target = re.sub(
+                r"\A<torch_package_\d+>",
+                "<torch_package_>",
+                _get_qualified_name(node.target),
+            )
+            torch_package_op_and_target = (node.op, target)
+            normalization_info = _normalization_dict.get(torch_package_op_and_target)
+
+        if normalization_info is None:
+            continue
+
+        # Get the normalized kwargs to be used by normalize_to_acc_op below. If
+        # normalization_info.arg_replacement_tuples is empty then assume the function
+        # signature must be left as is.
+        assert normalization_info.arg_replacement_tuples is not None
+        if len(normalization_info.arg_replacement_tuples) == 0:
+            normalized_args = node.args
+            normalized_kwargs = node.kwargs
+        else:
+            normalized_args = ()
+            try:
+                normalized_kwargs = get_normalized_kwargs(
+                    node, normalization_info.arg_replacement_tuples
+                )
+            except Exception:
+                _LOGGER.error(
+                    f"Error during kwarg normalization for: {node.format_node()}; "
+                    f"arg_replacement_tuples={normalization_info.arg_replacement_tuples}"
+                )
+                raise
+
+        if (
+            normalization_info.needs_shapes_for_normalization
+            and not expect_nodes_have_shapes
+        ):
+            # All nodes needing shapes for normalization should be custom mapped.
+            assert normalization_info.custom_mapping_fn is not None
+            # For custom mapping, the normalized_kwargs are used for the original op,
+            # i.e. *before* custom acc_ops normalization. Do that now so that whoever
+            # consumes the graph next (e.g. shape inference) can use kwargs safely.
+            node.args = normalized_args
+            node.kwargs = normalized_kwargs
+            continue
+
+        try:
+            normalize_to_acc_op(
+                node, normalization_info, normalized_args, normalized_kwargs
+            )
+        except Exception:
+            _LOGGER.error(f"Error during normalization for node: {node.format_node()}")
+            raise
+
+    # If there are any dead nodes left after normalization, eliminate them now.
+    mod.graph.eliminate_dead_code()
diff --git a/fx2ait/fx2ait/acc_tracer/acc_op_properties.py b/fx2ait/fx2ait/acc_tracer/acc_op_properties.py
new file mode 100644
index 000000000..8160cfe9f
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_op_properties.py
@@ -0,0 +1,50 @@
+from collections import defaultdict
+from enum import auto, Flag
+from typing import Callable, DefaultDict, Set
+
+import torch
+import torch.fx
+
+
+class AccOpProperty(Flag):
+    """
+    A collection of static properties for acc_ops.
+
+    * pointwise - op commutes with data restructuring ops such as reshape,
+        transpose, permute. e.g. op(reshape(x)) == reshape(op(x)).
+        Alternatively, for tensor x = (x1, x2, ...), there exists a scalar
+        function f such that op(x) = (f(x1), f(x2), ...).
+    * quantized - op expects quantized inputs and return quantized outputs
+    * unary - op has exactly one graph dependent input. e.g. relu,
+        dequantize, sum
+    """
+
+    pointwise = auto()
+    quantized = auto()
+    unary = auto()
+
+
+acc_op_properties: DefaultDict[Callable, Set[AccOpProperty]] = defaultdict(set)
+acc_ops_with_property: DefaultDict[AccOpProperty, Set[Callable]] = defaultdict(set)
+
+
+def register_acc_op_properties(*properties: AccOpProperty):
+    """
+    Attach properties to acc_op to inform optimization
+    """
+
+    def decorator(acc_op: Callable):
+        acc_op_properties[acc_op] |= set(properties)
+        for prop in properties:
+            acc_ops_with_property[prop].add(acc_op)
+        return acc_op
+
+    return decorator
+
+
+def add_optimization_properties_to_meta(mod: torch.fx.GraphModule) -> None:
+    """
+    Add acc_op properties to Node.meta to inform optimization
+    """
+    for node in mod.graph.nodes:
+        node.meta["acc_op_properties"] = acc_op_properties[node.target]
diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
new file mode 100644
index 000000000..d625643e3
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -0,0 +1,3215 @@
+# encoding: utf-8
+import operator
+
+import torch  # isort:skip
+from typing import cast, Iterable, List, Sequence
+
+import torch.nn as nn
+from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata
+
+from . import acc_utils
+from .acc_normalizer import (
+    register_acc_op,
+    register_acc_op_mapping,
+    register_custom_acc_mapper_fn,
+)
+from .acc_op_properties import AccOpProperty, register_acc_op_properties
+
+this_arg_is_optional = True
+move_to_qparams = True
+dont_move_to_qparams = False
+
+# A proxy embedding size. We use this for tracing proxy operators using XL
+# weights which we can't load into memory (because they're too large), we
+# instead substitute a smaller weight with embedding size =
+# PROXY_EMBEDDING_SIZE.
+PROXY_EMBEDDING_SIZE = 8
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.linear))
+@register_acc_op
+def linear(*, input, weight, bias):
+    return nn.functional.linear(input=input, weight=weight, bias=bias)
+
+
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op
+def quantized_linear(*, input, weight, bias, acc_out_ty=None):
+    assert acc_out_ty is not None
+    qparams = acc_out_ty.qparams
+    return nn.quantized.functional.linear(
+        input,
+        weight,
+        bias,
+        qparams["scale"],
+        qparams["zero_point"],
+    )
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "flatten"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("start_dim", "start_dim", this_arg_is_optional),
+        ("end_dim", "end_dim", this_arg_is_optional),
+    ],
+)
+@register_acc_op_mapping(op_and_target=("call_function", torch.flatten))
+@register_acc_op
+def flatten(*, input, start_dim=0, end_dim=-1):
+    return torch.flatten(input=input, start_dim=start_dim, end_dim=end_dim)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "squeeze"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.squeeze),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def squeeze(*, input, dim=None):
+    if dim is None:
+        return input.squeeze()
+    return input.squeeze(dim=dim)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.embedding))
+@register_acc_op
+def embedding(
+    *,
+    input,
+    weight,
+    padding_idx,
+    max_norm,
+    norm_type,
+    scale_grad_by_freq,
+    sparse,
+):
+    return torch.nn.functional.embedding(**locals())
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.max_pool1d))
+@register_acc_op
+def max_pool1d(
+    *,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    dilation,
+    ceil_mode,
+    return_indices,
+):
+    return nn.functional.max_pool1d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        ceil_mode=ceil_mode,
+        return_indices=return_indices,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.max_pool2d))
+@register_acc_op
+def max_pool2d(
+    *,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    dilation,
+    ceil_mode,
+    return_indices,
+):
+    return nn.functional.max_pool2d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        ceil_mode=ceil_mode,
+        return_indices=return_indices,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.max_pool3d))
+@register_acc_op
+def max_pool3d(
+    *, input, kernel_size, stride, padding, dilation, ceil_mode, return_indices
+):
+    return nn.functional.max_pool3d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        ceil_mode=ceil_mode,
+        return_indices=return_indices,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", nn.functional.adaptive_avg_pool2d)
+)
+@register_acc_op
+def adaptive_avg_pool2d(*, input, output_size):
+    return nn.functional.adaptive_avg_pool2d(input=input, output_size=output_size)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", nn.functional.adaptive_avg_pool3d)
+)
+@register_acc_op
+def adaptive_avg_pool3d(*, input, output_size):
+    return nn.functional.adaptive_avg_pool3d(input=input, output_size=output_size)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.avg_pool1d))
+@register_acc_op
+def avg_pool1d(
+    *,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    ceil_mode,
+    count_include_pad,
+):
+    return nn.functional.avg_pool1d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        ceil_mode=ceil_mode,
+        count_include_pad=count_include_pad,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.avg_pool2d))
+@register_acc_op
+def avg_pool2d(
+    *,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    ceil_mode,
+    count_include_pad,
+    divisor_override,
+):
+    return nn.functional.avg_pool2d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        ceil_mode=ceil_mode,
+        count_include_pad=count_include_pad,
+        divisor_override=divisor_override,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.avg_pool3d))
+@register_acc_op
+def avg_pool3d(
+    *,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    ceil_mode,
+    count_include_pad,
+    divisor_override,
+):
+    return nn.functional.avg_pool3d(
+        input=input,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        ceil_mode=ceil_mode,
+        count_include_pad=count_include_pad,
+        divisor_override=divisor_override,
+    )
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sign))
+@register_acc_op
+def sign(*, input):
+    return torch.sign(input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "type"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+def custom_type_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    input_obj = node.kwargs["input"]
+    dtype_obj = node.kwargs.get("dtype")
+    with node.graph.inserting_before(node):
+        if dtype_obj is None:
+            dtype_node = node.graph.call_function(dtype, kwargs={"input": input_obj})
+            dtype_node.meta["type"] = torch.dtype
+            return dtype_node
+        else:
+            new_kwargs = {
+                "input": input_obj,
+                "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dtype_obj),
+            }
+            new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+            new_node.meta = node.meta
+            return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "type_as"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("tensor", "tensor"),
+    ],
+)
+def custom_type_as_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    input_obj = node.kwargs["input"]
+    other_obj = node.kwargs["tensor"]
+    with node.graph.inserting_before(node):
+        dtype_node = node.graph.call_function(dtype, kwargs={"input": other_obj})
+        dtype_node.meta["type"] = torch.dtype
+        device_node = node.graph.call_function(device, kwargs={"input": other_obj})
+        device_node.meta["type"] = torch.device
+
+        new_kwargs = {
+            "input": input_obj,
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dtype_node),
+            "device": device_node,
+        }
+        new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def dtype(*, input):
+    return input.dtype
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def size(*, input):
+    return input.size()
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def device(*, input):
+    return input.device
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.numel))
+@register_acc_op
+def numel(*, input):
+    return torch.numel(input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", getattr),
+    arg_replacement_tuples=[],
+)
+def custom_getattr_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Custom function for mapping a call_function getattr to other ops.
+
+    Supports:
+    * getattr on a torch.Tensor with "shape", "device", or "dtype" attributes
+    * getattr for accessing named tuples
+    """
+    # Have to use args here since getattr forces positional args.
+    input_obj = node.args[0]
+    attr_name = node.args[1]
+    assert isinstance(input_obj, torch.fx.Node)
+    input_obj_type = input_obj.meta["type"]
+
+    # Handle named tuple access. NamedTupleMeta and the namedtuple factory function
+    # create a subclass of tuple with an extra _fields attribute.
+    if issubclass(input_obj_type, tuple) and hasattr(input_obj_type, "_fields"):
+        idx = None
+        for i, name in enumerate(input_obj_type._fields):
+            if name == attr_name:
+                idx = i
+                break
+        assert (
+            idx is not None
+        ), f"Named tuple type {input_obj_type} does not have field {name}"
+
+        with node.graph.inserting_before(node):
+            getitem_node = node.graph.call_function(
+                getitem, kwargs={"input": input_obj, "idx": idx}
+            )
+            getitem_node.meta = node.meta.copy()
+            return getitem_node
+
+    assert (
+        input_obj_type == torch.Tensor
+    ), f"Expected torch.Tensor type for {input_obj_type}"
+    assert (
+        attr_name == "shape" or attr_name == "device" or attr_name == "dtype"
+    ), f"Only supporting shape, device and dtype getattr for now, not {attr_name}"
+    if attr_name == "shape":
+        func = size
+    elif attr_name == "device":
+        func = device
+    elif attr_name == "dtype":
+        func = dtype
+    with node.graph.inserting_before(node):
+        size_node = node.graph.call_function(func, kwargs={"input": input_obj})
+        size_node.meta = node.meta.copy()
+        return size_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "size"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+    ],
+)
+def tensor_size_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Mapping from Tensor.size() to acc_ops.size. We map size() to acc_ops.size directly
+    and map size(dim) to acc_ops.size + acc_ops.getitem.
+    """
+
+    with node.graph.inserting_before(node):
+        size_node = node.graph.call_function(
+            size, kwargs={"input": node.kwargs["input"]}
+        )
+
+        if "dim" not in node.kwargs:
+            size_node.meta = node.meta.copy()
+            return size_node
+
+        size_node.meta["type"] = torch.Size
+        getitem_node = node.graph.call_function(
+            getitem, kwargs={"input": size_node, "idx": node.kwargs["dim"]}
+        )
+        getitem_node.meta = node.meta.copy()
+        return getitem_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.add))
+@register_acc_op_mapping(op_and_target=("call_method", "add"))
+@register_acc_op
+def add(*, input, other):
+    return input + other
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_method", "unsqueeze"))
+@register_acc_op_mapping(op_and_target=("call_function", torch.unsqueeze))
+@register_acc_op
+def unsqueeze(*, input, dim: int):
+    return torch.unsqueeze(input=input, dim=dim)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_method", "tile"))
+@register_acc_op_mapping(op_and_target=("call_function", torch.tile))
+@register_acc_op
+def tile(*, input, dims):
+    return torch.tile(input=input, dims=dims)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "repeat"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("*", "sizes"),
+    ],
+)
+def repeat_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Map repeat to tile.
+    """
+    with node.graph.inserting_before(node):
+        inputs = node.kwargs["input"]
+        dims = node.kwargs["sizes"]
+        new_node = node.graph.create_node(
+            "call_function",
+            tile,
+            kwargs={"input": inputs, "dims": dims},
+            name=f"{node.name}_repeat_map",
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "repeat_interleave"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("repeats", "repeats"),
+        ("dim", "dim", this_arg_is_optional),
+        ("output_size", "output_size", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.repeat_interleave),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("repeats", "repeats"),
+        ("dim", "dim", this_arg_is_optional),
+        ("output_size", "output_size", this_arg_is_optional),
+    ],
+)
+def repeat_interleave_mapper(node: torch.fx.Node, _: nn.Module):
+    input_node = node.kwargs["input"]
+    repeats = cast(int, node.kwargs["repeats"])
+    dim = node.kwargs["dim"]
+    assert (
+        type(repeats) is int
+    ), "We currently only support `repeat_interleave` with int repeats"
+    rank = node.meta["tensor_rank"]
+    if dim is None:
+        repeat_dim = rank - 1
+    else:
+        assert type(dim) is int, "dim should be an int"
+        repeat_dim = dim
+    tile_dims = [1] * (rank + 1)
+    tile_dims[repeat_dim + 1] = repeats
+
+    with node.graph.inserting_before(node):
+        unsqueeze_node = node.graph.create_node(
+            "call_function",
+            unsqueeze,
+            kwargs={"input": input_node, "dim": repeat_dim + 1},
+            name=f"{node.name}_unsqueeze",
+        )
+        tile_node = node.graph.create_node(
+            "call_function",
+            tile,
+            kwargs={"input": unsqueeze_node, "dims": tuple(tile_dims)},
+            name=f"{node.name}_repeat_interleave_map_tile",
+        )
+        new_shape = []
+        if dim is not None:
+            if dim < 0:
+                repeat_dim = dim + rank
+            else:
+                repeat_dim = dim
+            size_node = node.graph.create_node(
+                "call_function",
+                size,
+                kwargs={"input": input_node},
+                name=f"{node.name}_size",
+            )
+            size_node.meta["type"] = torch.Size
+            for i in range(rank):
+                shape_i = node.graph.create_node(
+                    "call_function",
+                    getitem,
+                    kwargs={"input": size_node, "idx": i},
+                    name=f"{node.name}_size_{i}",
+                )
+                if i == repeat_dim:
+                    new_shape.append(-1)
+                else:
+                    new_shape.append(shape_i)
+        else:
+            new_shape.append(-1)
+
+        reshaped_node = node.graph.create_node(
+            "call_function",
+            reshape,
+            kwargs={
+                "input": tile_node,
+                "acc_out_ty": acc_utils.build_raw_tensor_meta(shape=new_shape),
+            },
+            name=f"{node.name}_reshape",
+        )
+        reshaped_node.meta = node.meta.copy()
+        return reshaped_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.stack),
+    arg_replacement_tuples=[
+        ("tensors", "tensors"),
+        ("dim", "dim"),
+    ],
+)
+def stack_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Map torch.stack to unsqueeze + cat.
+    """
+    with node.graph.inserting_before(node):
+        inputs = node.kwargs["tensors"]
+        unsqueeze_nodes = []
+        assert isinstance(inputs, Sequence)
+        for i, t in enumerate(inputs):
+            new_node = node.graph.create_node(
+                "call_function",
+                unsqueeze,
+                kwargs={"input": t, "dim": node.kwargs["dim"]},
+                name=f"{node.name}_unsqueeze_{i}",
+            )
+            new_node.meta["type"] = torch.Tensor
+            unsqueeze_nodes.append(new_node)
+        cat_node = node.graph.create_node(
+            "call_function",
+            cat,
+            kwargs={"tensors": unsqueeze_nodes, "dim": node.kwargs["dim"]},
+        )
+        cat_node.meta = node.meta.copy()
+        return cat_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.clamp))
+@register_acc_op_mapping(op_and_target=("call_function", torch.clip))
+@register_acc_op_mapping(op_and_target=("call_method", "clamp"))
+@register_acc_op_mapping(op_and_target=("call_method", "clip"))
+@register_acc_op
+def clamp(*, input, min=None, max=None):
+    return torch.clamp(input=input, min=min, max=max)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.concat))
+@register_acc_op_mapping(op_and_target=("call_function", torch.cat))
+@register_acc_op
+def cat(*, tensors, dim):
+    return torch.cat(tensors=tensors, dim=dim)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.transpose),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim0", "dim0"),
+        ("dim1", "dim1"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "transpose"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim0", "dim0"),
+        ("dim1", "dim1"),
+    ],
+)
+def transpose_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    # Get the dim-permutation/shuffle
+    ranks = node.meta["tensor_rank"]
+    shuffle = list(range(ranks))
+    dim0 = cast(int, node.kwargs["dim0"])
+    dim1 = cast(int, node.kwargs["dim1"])
+    shuffle[dim0] = dim1
+    shuffle[dim1] = dim0
+
+    # Create the new acc_ops.permute node. Update all uses of the transpose
+    # node and then delete the transpose node.
+    with node.graph.inserting_after(node):
+        permute_node = node.graph.call_function(
+            the_function=permute,
+            kwargs={
+                "input": node.kwargs.get("input"),
+                "permutation": shuffle,
+            },
+        )
+        permute_node.meta = node.meta.copy()
+        node.replace_all_uses_with(permute_node)
+
+    permute_node.graph.erase_node(node)
+    return permute_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_method", "contiguous"))
+@register_acc_op
+def contiguous(*, input):
+    return input.contiguous()
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "softmax"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.softmax),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def softmax(*, input, dim, dtype=None):
+    """
+    _stacklevel are ignored here.
+    """
+    return torch.nn.functional.softmax(input=input, dim=dim, dtype=dtype)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.addmm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mat1", "mat1"),
+        ("mat2", "mat2"),
+        ("beta", "beta"),
+        ("alpha", "alpha"),
+    ],
+)
+def addmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Mapping from torch.addmm to acc_ops.mm -> acc_ops.add, if alpha or beta is not 1
+    then we also insert acc_ops.mul to the right place.
+    """
+    with node.graph.inserting_before(node):
+        mm_kwargs = {"input": node.kwargs["mat1"], "other": node.kwargs["mat2"]}
+        mm_node = node.graph.create_node(
+            "call_function", matmul, kwargs=mm_kwargs, name=f"{node.name}_mm"
+        )
+        mm_node.meta = node.meta.copy()
+
+        if node.kwargs["alpha"] != 1:
+            mul_kwargs = {"input": mm_node, "other": node.kwargs["alpha"]}
+            mm_node = node.graph.create_node(
+                "call_function", mul, kwargs=mul_kwargs, name=f"{mm_node.name}_mul"
+            )
+        mm_node.meta = node.meta.copy()
+
+        input_node = node.kwargs["input"]
+        if node.kwargs["beta"] != 1:
+            mul_kwargs = {"input": input_node, "other": node.kwargs["beta"]}
+            new_input_node = node.graph.create_node(
+                "call_function", mul, kwargs=mul_kwargs, name=f"{node.name}_input_mul"
+            )
+            assert isinstance(input_node, torch.fx.Node)
+            new_input_node.meta = input_node.meta.copy()
+            input_node = new_input_node
+
+        add_kwargs = {"input": mm_node, "other": input_node}
+        add_node = node.graph.create_node(
+            "call_function", add, kwargs=add_kwargs, name=f"{node.name}_add"
+        )
+        add_node.meta = node.meta.copy()
+        return add_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.t),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "t"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def t_mapper(node: torch.fx.Node, _: nn.Module):
+    ranks = node.meta["tensor_rank"]
+    shuffle = [1, 0] if (ranks > 1) else [0]
+
+    with node.graph.inserting_before(node):
+        new_node = node.graph.create_node(
+            "call_function",
+            permute,
+            kwargs={"input": node.kwargs["input"], "permutation": shuffle},
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "permute"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("*", "permutation"),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.permute),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dims", "permutation"),
+    ],
+)
+@register_acc_op
+def permute(*, input, permutation):
+    return input.permute(*permutation)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.square),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def square_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    input_node = node.kwargs["input"]
+    with node.graph.inserting_before(node):
+        new_node = node.graph.call_function(
+            mul, kwargs={"input": input_node, "other": input_node}
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_method", "mm"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mat2", "other"),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", operator.matmul),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mat2", "other"),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.bmm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mat2", "other"),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.mm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mat2", "other"),
+    ],
+)
+@register_acc_op_mapping(op_and_target=("call_function", torch.matmul))
+@register_acc_op
+def matmul(*, input, other):
+    return torch.matmul(input=input, other=other)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.dropout),
+    arg_replacement_tuples=[("input", "input")],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "detach"), arg_replacement_tuples=[("input", "input")]
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.detach),
+    arg_replacement_tuples=[("input", "input")],
+)
+def dropout_mapper(node: torch.fx.Node, mod: nn.Module):
+    """
+    Remove dropout node and directly map its input to output.
+    """
+    return node.kwargs["input"]
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_function", nn.functional.hardtanh),
+)
+@register_acc_op
+def hardtanh(*, input, min_val=-1.0, max_val=1.0):
+    return nn.functional.hardtanh(input=input, min_val=min_val, max_val=max_val)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.hardsigmoid))
+@register_acc_op
+def hardsigmoid(*, input):
+    return nn.functional.hardsigmoid(input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.silu),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def silu(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    input_node = node.kwargs["input"]
+    with node.graph.inserting_before(node):
+        sigmoid_node = node.graph.call_function(sigmoid, kwargs={"input": input_node})
+        sigmoid_node.meta = node.meta.copy()
+        new_node = node.graph.call_function(
+            mul, kwargs={"input": sigmoid_node, "other": input_node}
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.hardswish),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def hardswish_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    input_node = node.kwargs["input"]
+    with node.graph.inserting_before(node):
+        new_sigmoid_node = node.graph.call_function(
+            hardsigmoid, kwargs={"input": input_node}
+        )
+        new_sigmoid_node.meta = node.meta.copy()
+        new_node = node.graph.call_function(
+            mul, kwargs={"input": new_sigmoid_node, "other": input_node}
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.ops.quantized.add),
+    arg_replacement_tuples=[
+        ("qa", "input"),
+        ("qb", "other"),
+        ("scale", "scale"),
+        ("zero_point", "zero_point"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[
+        ("scale", "scale", move_to_qparams),
+        ("zero_point", "zero_point", move_to_qparams),
+    ],
+)
+@register_acc_op
+def quantized_add(*, input, other, acc_out_ty=None):
+    assert acc_out_ty is not None
+    qparams = acc_out_ty.qparams
+    return torch.ops.quantized.add(
+        input,
+        other,
+        qparams["scale"],
+        qparams["zero_point"],
+    )
+
+
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.ops.quantized.mul),
+    arg_replacement_tuples=[
+        ("qa", "input"),
+        ("qb", "other"),
+        ("scale", "scale"),
+        ("zero_point", "zero_point"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[
+        ("scale", "scale", move_to_qparams),
+        ("zero_point", "zero_point", move_to_qparams),
+    ],
+)
+@register_acc_op
+def quantized_mul(*, input, other, acc_out_ty=None):
+    assert acc_out_ty is not None
+    qparams = acc_out_ty.qparams
+    return torch.ops.quantized.mul(
+        input,
+        other,
+        qparams["scale"],
+        qparams["zero_point"],
+    )
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.quantize_per_tensor),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("scale", "scale"),
+        ("zero_point", "zero_point"),
+        ("dtype", "dtype"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[
+        ("scale", "scale", move_to_qparams),
+        ("zero_point", "zero_point", move_to_qparams),
+        ("dtype", "dtype", dont_move_to_qparams),
+    ],
+)
+@register_acc_op
+def quantize_per_tensor(*, input, acc_out_ty=None):
+    assert acc_out_ty is not None
+    qparams = acc_out_ty.qparams
+    dtype = acc_out_ty.dtype
+    return torch.quantize_per_tensor(
+        input, qparams["scale"], qparams["zero_point"], dtype
+    )
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.quantize_per_channel),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("scales", "scales"),
+        ("zero_points", "zero_points"),
+        ("axis", "axis"),
+        ("dtype", "dtype"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[
+        ("scales", "scale", move_to_qparams),
+        ("zero_points", "zero_point", move_to_qparams),
+        ("axis", "axis", move_to_qparams),
+        ("dtype", "dtype", dont_move_to_qparams),
+    ],
+)
+@register_acc_op
+def quantize_per_channel(*, input, acc_out_ty=None):
+    assert acc_out_ty is not None
+    qparams = acc_out_ty.qparams
+    dtype = acc_out_ty.dtype
+    return torch.quantize_per_channel(
+        input,
+        torch.tensor(qparams["scale"]),
+        torch.tensor(qparams["zero_point"]),
+        qparams["axis"],
+        dtype,
+    )  # type: ignore[call-overload]
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_method", "dequantize"))
+@register_acc_op_mapping(op_and_target=("call_function", torch.dequantize))
+@register_acc_op
+def dequantize(*, input):
+    return torch.dequantize(input)
+
+
+@register_acc_op_properties(
+    AccOpProperty.pointwise, AccOpProperty.unary, AccOpProperty.quantized
+)
+@register_acc_op
+def rescale_quantize_per_tensor(*, input, acc_out_ty=None):
+    assert acc_out_ty is not None
+    d = dequantize(input=input)
+    return quantize_per_tensor(input=d, acc_out_ty=acc_out_ty)
+
+
+@register_acc_op_properties(AccOpProperty.unary, AccOpProperty.quantized)
+@register_acc_op
+def rescale_quantize_per_channel(*, input, acc_out_ty=None):
+    assert acc_out_ty is not None
+    d = dequantize(input=input)
+    return quantize_per_channel(input=d, acc_out_ty=acc_out_ty)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sub))
+@register_acc_op_mapping(op_and_target=("call_function", operator.sub))
+@register_acc_op_mapping(op_and_target=("call_method", "sub"))
+@register_acc_op
+def sub(*, input, other):
+    return input - other
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.mul))
+@register_acc_op_mapping(op_and_target=("call_function", operator.mul))
+@register_acc_op_mapping(op_and_target=("call_method", "mul"))
+@register_acc_op
+def mul(*, input, other):
+    return input * other
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "div"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("other", "other"),
+        ("rounding_mode", "rounding_mode", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.div),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("other", "other"),
+        ("rounding_mode", "rounding_mode", this_arg_is_optional),
+    ],
+)
+def div_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        div_kwargs = dict(node.kwargs)
+        if "rounding_mode" not in div_kwargs or div_kwargs["rounding_mode"] is None:
+            div_node = node.graph.call_function(
+                div, kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]}
+            )
+        elif div_kwargs["rounding_mode"] == "trunc":
+            div_node = node.graph.call_function(
+                trunc_div,
+                kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]},
+            )
+        elif div_kwargs["rounding_mode"] == "floor":
+            div_node = node.graph.call_function(
+                floor_div,
+                kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]},
+            )
+        else:
+            raise RuntimeError(
+                f"Unhandled div rounding mode {div_kwargs['rounding_mode']}"
+            )
+        div_node.meta = node.meta.copy()
+        return div_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.truediv))
+@register_acc_op
+def div(*, input, other):
+    return input / other
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.floordiv))
+@register_acc_op
+def floor_div(*, input, other):
+    # This is temp fix because currently operator.floor_div for tensors would
+    # traslate into torch.floor_divide which would throw an error. After it's
+    # fixed we can stick to `input // other`.
+    if isinstance(input, torch.Tensor) or isinstance(other, torch.Tensor):
+        return torch.div(input, other, rounding_mode="floor")
+    return input // other
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.floor_divide))
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op
+def trunc_div(*, input, other):
+    return torch.div(input, other, rounding_mode="trunc")
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.pow))
+@register_acc_op_mapping(op_and_target=("call_method", "pow"))
+@register_acc_op
+def pow(*, input, exponent):
+    return torch.pow(input, exponent)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.relu))
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.relu),
+    arg_replacement_tuples=[("input", "input")],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "relu"),
+    arg_replacement_tuples=[("input", "input")],
+)
+@register_acc_op
+def relu(*, input, inplace=False):
+    return nn.functional.relu(input=input, inplace=inplace)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.leaky_relu)
+)
+@register_acc_op
+def leaky_relu(*, input, negative_slope=0.01, inplace=False):
+    return nn.functional.leaky_relu(
+        input=input, negative_slope=negative_slope, inplace=inplace
+    )
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.elu))
+@register_acc_op
+def elu(*, input, alpha=1.0, inplace=False):
+    return nn.functional.elu(input=input, alpha=alpha, inplace=inplace)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.selu))
+@register_acc_op
+def selu(*, input, inplace=False):
+    return nn.functional.selu(input=input, inplace=inplace)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.softsign))
+@register_acc_op
+def softsign(*, input):
+    return nn.functional.softsign(input=input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.log1p),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        add_kwargs = {"input": node.kwargs["input"], "other": 1.0}
+        add_node = node.graph.call_function(add, kwargs=add_kwargs)
+        add_node.meta = node.meta.copy()
+        log_kwargs = {"input": add_node}
+        log_node = node.graph.call_function(log, kwargs=log_kwargs)
+        log_node.meta = node.meta.copy()
+        return log_node
+
+
+def reduce_op_mapper(
+    node: torch.fx.Node, mod: torch.fx.GraphModule, func
+) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        kwargs = dict(node.kwargs)
+        if "dim" in kwargs and isinstance(kwargs["dim"], int):
+            kwargs["dim"] = (kwargs["dim"],)
+        new_node = node.graph.call_function(func, kwargs=kwargs)
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def sum(*, input, dim=None, keepdim=False, dtype=None):
+    if dim is not None:
+        return torch.sum(input, dim=dim, keepdim=keepdim, dtype=dtype)
+    else:
+        return input.sum(dtype=dtype)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "sum"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.sum),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+def sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node:
+    return reduce_op_mapper(node, mod, sum)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def prod(*, input, dim=None, keepdim=False, dtype=None):
+    if dim is not None:
+        return torch.prod(input, dim=dim, keepdim=keepdim, dtype=dtype)
+    else:
+        return input.prod(dtype=dtype)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "prod"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.prod),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+def prod_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node:
+    func = prod
+    with node.graph.inserting_before(node):
+        kwargs = dict(node.kwargs)
+        new_node = node.graph.call_function(func, kwargs=kwargs)
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def mean(*, input, dim=None, keepdim=False, dtype=None):
+    if dim is not None:
+        return torch.mean(input, dim=dim, keepdim=keepdim, dtype=dtype)
+    else:
+        return input.mean(dtype=dtype)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "mean"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.mean),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+def mean_mapper(node, mod):
+    return reduce_op_mapper(node, mod, mean)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "std"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("unbiased", "unbiased", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.std),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("unbiased", "unbiased", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+        ("dtype", "dtype", this_arg_is_optional),
+    ],
+)
+def std_mapper(node, mod):
+    """
+    Formula of std: sqrt(sum(pow(X-mean(X))))/N)
+    This op is mapped to a few existing ops
+    """
+    input_node = node.kwargs["input"]
+    # unbiased = node.kwargs.get("unbiased")
+    dim = node.kwargs.get("dim")
+    keepdim = node.kwargs.get("keepdim")
+    # assert unbiased is False or unbiased is None, "We currently do not support `std` with unbiased=True where n-1 is used"
+    assert (
+        dim is not None and keepdim is not None
+    ), "We currently do not support `std` with dim=None and keepdim=None"
+
+    with node.graph.inserting_before(node):
+        # mean(X)
+        mean_kwargs = {
+            "input": input_node,
+            "dim": dim,
+            "keepdim": keepdim,
+        }
+        mean_node = node.graph.call_function(mean, kwargs=mean_kwargs)
+        mean_node.meta["type"] = torch.Tensor
+        # X-mean(X)
+        sub_kwargs = {
+            "input": input_node,
+            "other": mean_node,
+        }
+        sub_node = node.graph.call_function(sub, kwargs=sub_kwargs)
+        sub_node.meta["type"] = torch.Tensor
+        # pow(X-mean(X))
+        pow_kwargs = {
+            "input": sub_node,
+            "exponent": 2.0,
+        }
+        pow_node = node.graph.call_function(pow, kwargs=pow_kwargs)
+        pow_node.meta["type"] = torch.Tensor
+        # sum(pow(X-mean(X))))/N
+        post_mean_kwargs = {
+            "input": pow_node,
+            "dim": dim,
+            "keepdim": keepdim,
+        }
+        post_mean_node = node.graph.call_function(mean, kwargs=post_mean_kwargs)
+        post_mean_node.meta["type"] = torch.Tensor
+        # sqrt(sum(pow(X-mean(X))))/N)
+        sqrt_kwargs = {
+            "input": post_mean_node,
+        }
+        sqrt_node = node.graph.call_function(sqrt, kwargs=sqrt_kwargs)
+        sqrt_node.meta["type"] = torch.Tensor
+
+        output_node = sqrt_node
+        output_node.meta = node.meta.copy()
+        return output_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "max"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("dim", "other"), "dim_or_other", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.max),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("dim", "other"), "dim_or_other", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "min"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("dim", "other"), "dim_or_other", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.min),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("dim", "other"), "dim_or_other", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+def add_maximum_minimum_mapper(
+    node: torch.fx.Node, mod: torch.fx.GraphModule
+) -> torch.fx.Node:
+    # there are effectively three versions of torch.max / torch.min
+    # full reduce: torch.max(input) -> Tensor
+    # dimensional reduce: torch.max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+    # elementwise: torch.max(input, other, *, out=None) -> Tensor
+
+    # the mapper function is remapping for both min and max situations
+    # this helper function makes the choices available clearer and provides an easier way
+    # to lookup the right function
+    def target_map(op, target):
+        if (op, target) in (("call_method", "max"), ("call_function", torch.max)):
+            return {
+                "full_reduce": max_full_reduce,
+                "dim_reduce": max_dim_reduce,
+                "elementwise": maximum,
+            }
+        elif (op, target) in (("call_method", "min"), ("call_function", torch.min)):
+            return {
+                "full_reduce": min_full_reduce,
+                "dim_reduce": min_dim_reduce,
+                "elementwise": minimum,
+            }
+
+    with node.graph.inserting_before(node):
+        new_targets = target_map(node.op, node.target)
+        max_kwargs = {}
+        max_kwargs["input"] = node.kwargs["input"]
+        if ("dim_or_other" not in node.kwargs) or (node.kwargs["dim_or_other"] is None):
+            nt = new_targets["full_reduce"]
+            max_node = node.graph.call_function(nt, kwargs=max_kwargs)
+        elif isinstance(node.kwargs["dim_or_other"], int):
+            nt = new_targets["dim_reduce"]
+            dim = node.kwargs["dim_or_other"]
+            max_kwargs["dim"] = dim
+            max_kwargs["keepdim"] = node.kwargs.get("keepdim", False)
+            max_node = node.graph.call_function(nt, kwargs=max_kwargs)
+        else:
+            other = node.kwargs["dim_or_other"]
+            assert isinstance(other, torch.fx.Node)
+            # Lowering path for when provided "other", where we do elem-wise max
+            nt = new_targets["elementwise"]
+            max_kwargs["other"] = other
+            max_node = node.graph.call_function(nt, kwargs=max_kwargs)
+        max_node.meta = node.meta.copy()
+        return max_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def max_full_reduce(*, input):
+    return torch.max(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def max_dim_reduce(*, input, dim=None, keepdim=False):
+    return torch.max(input=input, dim=dim, keepdim=keepdim)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.maximum))
+@register_acc_op_mapping(op_and_target=("call_method", "maximum"))
+@register_acc_op
+def maximum(*, input, other):
+    return torch.maximum(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def min_full_reduce(*, input):
+    return torch.min(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def min_dim_reduce(*, input, dim=None, keepdim=False):
+    return torch.min(input, dim=dim, keepdim=keepdim)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.minimum))
+@register_acc_op_mapping(op_and_target=("call_method", "minimum"))
+@register_acc_op
+def minimum(*, input, other):
+    return torch.minimum(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.ne))
+@register_acc_op_mapping(op_and_target=("call_function", torch.ne))
+@register_acc_op_mapping(op_and_target=("call_method", "ne"))
+@register_acc_op
+def ne(*, input, other):
+    return operator.ne(input, other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.eq))
+@register_acc_op_mapping(op_and_target=("call_function", torch.eq))
+@register_acc_op_mapping(op_and_target=("call_method", "eq"))
+@register_acc_op
+def eq(*, input, other):
+    return operator.eq(input, other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.gt))
+@register_acc_op_mapping(op_and_target=("call_function", torch.gt))
+@register_acc_op_mapping(op_and_target=("call_method", "gt"))
+@register_acc_op
+def gt(*, input, other):
+    return operator.gt(input, other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.lt))
+@register_acc_op_mapping(op_and_target=("call_function", torch.lt))
+@register_acc_op_mapping(op_and_target=("call_method", "lt"))
+@register_acc_op
+def lt(*, input, other):
+    return operator.lt(input, other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.and_))
+@register_acc_op_mapping(op_and_target=("call_method", "bitwise_and"))
+@register_acc_op_mapping(op_and_target=("call_function", torch.bitwise_and))
+def bitwise_and(*, input, other):
+    return operator.and_(input, other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.logical_and))
+@register_acc_op_mapping(op_and_target=("call_method", "logical_and"))
+@register_acc_op
+def logical_and(*, input, other):
+    return torch.logical_and(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.or_))
+@register_acc_op_mapping(op_and_target=("call_function", torch.logical_or))
+@register_acc_op_mapping(op_and_target=("call_method", "logical_or"))
+@register_acc_op
+def logical_or(*, input, other):
+    return torch.logical_or(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.logical_not))
+@register_acc_op_mapping(op_and_target=("call_method", "logical_not"))
+@register_acc_op
+def logical_not(*, input):
+    return torch.logical_not(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", operator.xor))
+@register_acc_op_mapping(op_and_target=("call_function", torch.logical_xor))
+@register_acc_op_mapping(op_and_target=("call_method", "logical_xor"))
+@register_acc_op
+def logical_xor(*, input, other):
+    return torch.logical_xor(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.isinf))
+@register_acc_op
+def isinf(*, input):
+    return torch.isinf(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def any(*, input, dim=None, keepdim=False):
+    if dim is not None:
+        return torch.any(input, dim, keepdim=keepdim)
+    else:
+        return torch.any(input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.any),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "any"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim", this_arg_is_optional),
+        ("keepdim", "keepdim", this_arg_is_optional),
+    ],
+)
+def any_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        new_node = node.graph.call_function(any, kwargs=node.kwargs)
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise)
+@register_acc_op_mapping(op_and_target=("call_function", torch.fmod))
+@register_acc_op_mapping(op_and_target=("call_method", "fmod"))
+@register_acc_op
+def fmod(*, input, other):
+    return torch.fmod(input=input, other=other)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sigmoid))
+@register_acc_op_mapping(op_and_target=("call_method", "sigmoid"))
+@register_acc_op
+def sigmoid(*, input):
+    return torch.sigmoid(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sinh))
+@register_acc_op
+def sinh(*, input):
+    return torch.sinh(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.cosh))
+@register_acc_op
+def cosh(*, input):
+    return torch.cosh(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.tanh))
+@register_acc_op_mapping(op_and_target=("call_method", "tanh"))
+@register_acc_op
+def tanh(*, input):
+    return torch.tanh(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.asin))
+@register_acc_op
+def asin(*, input):
+    return torch.asin(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.acos))
+@register_acc_op
+def acos(*, input):
+    return torch.acos(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.atan))
+@register_acc_op
+def atan(*, input):
+    return torch.atan(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.exp))
+@register_acc_op
+def exp(*, input):
+    return torch.exp(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.log))
+@register_acc_op
+def log(*, input):
+    return torch.log(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sqrt))
+@register_acc_op_mapping(op_and_target=("call_method", "sqrt"))
+@register_acc_op
+def sqrt(*, input):
+    return torch.sqrt(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.reciprocal))
+@register_acc_op
+def reciprocal(*, input):
+    return torch.reciprocal(input=input)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "rsqrt"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.rsqrt),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def rsqrt_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node:
+    input_node = node.kwargs["input"]
+    with node.graph.inserting_before(node):
+        new_kwargs = {
+            "input": input_node,
+        }
+        sqrt_node = node.graph.call_function(sqrt, kwargs=new_kwargs)
+        sqrt_node.meta["type"] = torch.Tensor
+        new_kwargs = {
+            "input": sqrt_node,
+        }
+        rec_node = node.graph.call_function(reciprocal, kwargs=new_kwargs)
+        rec_node.meta["type"] = torch.Tensor
+        output_node = rec_node
+        output_node.meta = node.meta.copy()
+        return output_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.abs))
+@register_acc_op
+def abs(*, input):
+    return torch.abs(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", operator.neg))
+@register_acc_op_mapping(op_and_target=("call_function", torch.neg))
+@register_acc_op
+def neg(*, input):
+    return torch.neg(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.floor))
+@register_acc_op
+def floor(*, input):
+    return torch.floor(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.ceil))
+@register_acc_op
+def ceil(*, input):
+    return torch.ceil(input=input)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.pad))
+@register_acc_op
+def pad(*, input, pad: List[int], mode: str, value: float):
+    return torch.nn.functional.pad(input=input, pad=pad, mode=mode, value=value)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.conv1d))
+@register_acc_op
+def conv1d(*, input, weight, bias, stride, padding, dilation, groups):
+    return nn.functional.conv1d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.conv2d))
+@register_acc_op
+def conv2d(*, input, weight, bias, stride, padding, dilation, groups):
+    return nn.functional.conv2d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+    )
+
+
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op
+def quantized_conv2d(
+    *,
+    input,
+    weight,
+    bias,
+    stride,
+    padding,
+    dilation,
+    groups,
+    padding_mode,
+    acc_out_ty=None,
+):
+    qparams = acc_out_ty.qparams
+    return torch.nn.quantized.functional.conv2d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        padding_mode=padding_mode,
+        scale=qparams["scale"],
+        zero_point=qparams["zero_point"],
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.conv3d))
+@register_acc_op
+def conv3d(*, input, weight, bias, stride, padding, dilation, groups):
+    return nn.functional.conv3d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.conv_transpose2d)
+)
+@register_acc_op
+def conv_transpose2d(
+    *,
+    input,
+    weight,
+    bias,
+    stride,
+    padding,
+    output_padding,
+    groups,
+    dilation,
+):
+    return nn.functional.conv_transpose2d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        output_padding=output_padding,
+        groups=groups,
+        dilation=dilation,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.conv_transpose3d)
+)
+@register_acc_op
+def conv_transpose3d(
+    *,
+    input,
+    weight,
+    bias,
+    stride,
+    padding,
+    output_padding,
+    groups,
+    dilation,
+):
+    return nn.functional.conv_transpose3d(
+        input=input,
+        weight=weight,
+        bias=bias,
+        stride=stride,
+        padding=padding,
+        output_padding=output_padding,
+        groups=groups,
+        dilation=dilation,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.batch_norm))
+@register_acc_op
+def batch_norm(
+    *,
+    input,
+    running_mean,
+    running_var,
+    weight,
+    bias,
+    training,
+    momentum,
+    eps,
+):
+    return nn.functional.batch_norm(
+        input=input,
+        running_mean=running_mean,
+        running_var=running_var,
+        weight=weight,
+        bias=bias,
+        training=training,
+        momentum=momentum,
+        eps=eps,
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.layer_norm))
+@register_acc_op
+def layer_norm(*, input, normalized_shape, weight, bias, eps):
+    return nn.functional.layer_norm(
+        input=input,
+        normalized_shape=normalized_shape,
+        weight=weight,
+        bias=bias,
+        eps=eps,
+    )
+
+
+def argmin_max_mapper_impl(node: torch.fx.Node, largest: bool) -> torch.fx.Node:
+    """
+    Map torch.argmin or torch.argmax to acc_ops.flatten (depend on dim) + acc_ops.topk
+    + acc_ops.getitem + acc_ops.squeeze (depends on keepdim).
+    """
+    input_node = node.kwargs["input"]
+    dim = node.kwargs["dim"]
+    keepdim = node.kwargs["keepdim"]
+
+    if dim is None and keepdim:
+        raise RuntimeError(
+            "We currently don't support argmin/argmax with dim=None and keepdim=True"
+        )
+
+    with node.graph.inserting_before(node):
+        if dim is None:
+            flatten_kwargs = {
+                "input": node.kwargs["input"],
+                "start_dim": 0,
+                "end_dim": -1,
+            }
+            flatten_node = node.graph.call_function(flatten, kwargs=flatten_kwargs)
+            flatten_node.meta["type"] = torch.Tensor
+            input_node = flatten_node
+            dim = -1
+
+        topk_kwargs = {
+            "input": input_node,
+            "k": 1,
+            "dim": dim,
+            "largest": largest,
+            "sorted": False,
+        }
+        topk_node = node.graph.call_function(topk, kwargs=topk_kwargs)
+        # It's actually more like NamedTuple but tuple here should be fine.
+        topk_node.meta["type"] = tuple
+
+        getitem_kwargs = {"input": topk_node, "idx": 1}
+        getitem_node = node.graph.call_function(getitem, kwargs=getitem_kwargs)
+        getitem_node.meta["type"] = torch.Tensor
+        output_node = getitem_node
+
+        if not keepdim:
+            squeeze_kwargs = {"input": getitem_node, "dim": dim}
+            output_node = node.graph.call_function(squeeze, kwargs=squeeze_kwargs)
+
+        output_node.meta = node.meta.copy()
+        return output_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.argmin),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("keepdim", "keepdim"),
+    ],
+)
+def torch_argmin_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    """
+    Map torch.argmin to acc_ops.flatten (depend on dim) + acc_ops.topk + acc_ops.getitem
+    + acc_ops.squeeze (depends on keepdim).
+    """
+    return argmin_max_mapper_impl(node, largest=False)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.linalg.norm))
+@register_acc_op
+def linalg_norm(*, input, ord, dim, keepdim):
+    return torch.linalg.norm(input=input, ord=ord, dim=dim, keepdim=keepdim)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.functional.norm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("p", "p"),
+        ("dim", "dim"),
+        ("keepdim", "keepdim"),
+    ],
+)
+def norm_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+
+    input_node = node.kwargs["input"]
+    p = node.kwargs["p"]
+    dim = node.kwargs["dim"]
+    keepdim = node.kwargs["keepdim"]
+    output_node = None
+    with node.graph.inserting_before(node):
+        if dim is None and p == 1:
+            # linalg_norm takes the max along the sum along a dim
+            # rather than the entire sum for p = 1
+            abs_node = node.graph.call_function(abs, kwargs={"input": input_node})
+            output_node = node.graph.call_function(
+                sum,
+                kwargs={"input": abs_node},
+            )
+        elif dim is None:
+            raise RuntimeError("dim=None has not been implemented for p != 1")
+        else:
+            output_node = node.graph.call_function(
+                linalg_norm,
+                kwargs={"input": input_node, "ord": p, "dim": dim, "keepdim": keepdim},
+            )
+
+    return output_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "split"),
+    arg_replacement_tuples=[
+        ("tensor", "input"),
+        ("split_size_or_sections", "split_size_or_sections"),
+        ("dim", "dim"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "split_with_sizes"),
+    arg_replacement_tuples=[
+        ("tensor", "input"),
+        ("split_sizes", "split_size_or_sections"),
+        ("dim", "dim"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.split),
+    arg_replacement_tuples=[
+        ("tensor", "input"),
+        ("split_size_or_sections", "split_size_or_sections"),
+        ("dim", "dim"),
+    ],
+)
+def torch_split_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node:
+    """
+    If split_size_or_sections is sections, map the node to slice_tensors
+    + tuple_construct. Otherwise, if split_size_or_sections is split_size,
+    map the node to acc_ops.split.
+    """
+    split_size_or_sections = node.kwargs["split_size_or_sections"]
+    with node.graph.inserting_before(node):
+        if isinstance(split_size_or_sections, int):
+            new_kwargs = {
+                "input": node.kwargs["input"],
+                "split_size": split_size_or_sections,
+                "dim": node.kwargs["dim"],
+            }
+            new_node = node.graph.call_function(split, kwargs=new_kwargs)
+            new_node.meta = node.meta.copy()
+            return new_node
+
+        assert isinstance(split_size_or_sections, Sequence)
+        start = 0
+        slice_nodes = []
+        for i in split_size_or_sections:
+            assert isinstance(i, int)
+            new_kwargs = {
+                "input": node.kwargs["input"],
+                "dim": node.kwargs["dim"],
+                "start": start,
+                "stop": start + i,
+                "step": 1,
+            }
+            new_node = node.graph.call_function(slice_tensor, kwargs=new_kwargs)
+            new_node.meta["type"] = torch.Tensor
+            slice_nodes.append(new_node)
+            start += i
+
+        new_node = node.graph.call_function(
+            tuple_construct, kwargs={"tensors": tuple(slice_nodes)}
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def split(*, input, split_size, dim):
+    return torch.split(input, split_size, dim)
+
+
+@register_acc_op
+def tuple_construct(*, tensors):
+    return tuple(tensors)
+
+
+@register_acc_op_properties(AccOpProperty.quantized)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.ops.quantized.batch_norm2d),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("weight", "weight"),
+        ("bias", "bias"),
+        ("running_mean", "running_mean"),
+        ("running_var", "running_var"),
+        ("eps", "eps"),
+        ("scale", "scale"),
+        ("zero_point", "zero_point"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[
+        ("scale", "scale", move_to_qparams),
+        ("zero_point", "zero_point", move_to_qparams),
+    ],
+)
+@register_acc_op
+def quantized_batch_norm2d(
+    *,
+    input,
+    running_mean,
+    running_var,
+    weight,
+    bias,
+    eps,
+    acc_out_ty=None,
+):
+    qparams = acc_out_ty.qparams
+    return torch.ops.quantized.batch_norm2d(
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        eps,
+        qparams["scale"],
+        qparams["zero_point"],
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", nn.functional.embedding_bag))
+@register_acc_op
+def embedding_bag(
+    *,
+    input,
+    weight,
+    offsets,
+    max_norm,
+    norm_type,
+    scale_grad_by_freq,
+    mode,
+    sparse,
+    per_sample_weights,
+    include_last_offset,
+    padding_idx,
+):
+    return nn.functional.embedding_bag(
+        input=input,
+        weight=weight,
+        offsets=offsets,
+        max_norm=max_norm,
+        norm_type=norm_type,
+        scale_grad_by_freq=scale_grad_by_freq,
+        mode=mode,
+        sparse=sparse,
+        per_sample_weights=per_sample_weights,
+        include_last_offset=include_last_offset,
+        padding_idx=padding_idx,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=(
+        "call_function",
+        torch.ops.quantized.embedding_bag_byte_rowwise_offsets,
+    )
+)
+@register_acc_op
+def embedding_bag_byte_rowwise_offsets(
+    *,
+    weight,
+    indices,
+    offsets,
+    scale_grad_by_freq,
+    mode,
+    pruned_weights,
+    per_sample_weights,
+    compressed_indices_mapping,
+    include_last_offset,
+):
+    return torch.ops.quantized.embedding_bag_byte_rowwise_offsets(
+        weight=weight,
+        indices=indices,
+        offsets=offsets,
+        scale_grad_by_freq=scale_grad_by_freq,
+        mode=mode,
+        pruned_weights=pruned_weights,
+        per_sample_weights=per_sample_weights,
+        compressed_indices_mapping=compressed_indices_mapping,
+        include_last_offset=include_last_offset,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=(
+        "call_function",
+        torch.ops.quantized.embedding_bag_4bit_rowwise_offsets,
+    )
+)
+@register_acc_op
+def embedding_bag_4bit_rowwise_offsets(
+    *,
+    weight,
+    indices,
+    offsets,
+    scale_grad_by_freq,
+    mode,
+    pruned_weights,
+    per_sample_weights,
+    compressed_indices_mapping,
+    include_last_offset,
+):
+    return torch.ops.quantized.embedding_bag_4bit_rowwise_offsets(
+        weight=weight,
+        indices=indices,
+        offsets=offsets,
+        scale_grad_by_freq=scale_grad_by_freq,
+        mode=mode,
+        pruned_weights=pruned_weights,
+        per_sample_weights=per_sample_weights,
+        compressed_indices_mapping=compressed_indices_mapping,
+        include_last_offset=include_last_offset,
+    )
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.sin))
+@register_acc_op
+def sin(*, input):
+    return torch.sin(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.cos))
+@register_acc_op
+def cos(*, input):
+    return torch.cos(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.tan))
+@register_acc_op
+def tan(*, input):
+    return torch.tan(input=input)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.topk))
+@register_acc_op
+def topk(*, input, k, dim, largest, sorted):
+    return torch.topk(input=input, k=k, dim=dim, largest=largest, sorted=sorted)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", operator.getitem))
+@register_acc_op
+def getitem(*, input, idx):
+    return input[idx]
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.nan_to_num))
+@register_acc_op_mapping(op_and_target=("call_method", "nan_to_num"))
+@register_acc_op
+def nan_to_num(*, input, nan=0.0, posinf=None, neginf=None):
+    return torch.nan_to_num(input, nan=nan, posinf=posinf, neginf=neginf)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "expand"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("*", "sizes"),
+    ],
+)
+@register_acc_op
+def expand(*, input, sizes):
+    return input.expand(*sizes)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.masked_fill),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mask", "mask"),
+        ("value", "value"),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "masked_fill"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("mask", "mask"),
+        ("value", "value"),
+    ],
+)
+@register_acc_op
+def masked_fill(*, input, mask, value):
+    return input.masked_fill(mask, value)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.where))
+@register_acc_op
+def where(*, condition, x, y):
+    return torch.where(condition, x, y)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def slice_tensor(*, input, dim, start, stop, step):
+    slc = slice(start, stop, step)
+    if dim >= 0:
+        slices: List[slice] = [slice(None, None, None) for _ in range(dim)]
+        slices.append(slc)
+    else:
+        slices = [Ellipsis, slc]  # type: ignore[list-item]
+        slices.extend([slice(None, None, None) for _ in range(-dim - 1)])
+
+    return input[tuple(slices)]
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.narrow),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("start", "start"),
+        ("length", "length"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "narrow"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("start", "start"),
+        ("length", "length"),
+    ],
+)
+def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node:
+    assert isinstance(node.kwargs["start"], int) and isinstance(
+        node.kwargs["length"], int
+    )
+    kwargs = {
+        "input": node.kwargs["input"],
+        "dim": node.kwargs["dim"],
+        "start": node.kwargs["start"],
+        "stop": node.kwargs["start"] + node.kwargs["length"],
+        "step": 1,
+    }
+    with node.graph.inserting_before(node):
+        new_node = node.graph.call_function(slice_tensor, kwargs=kwargs)
+    new_node.meta = node.meta.copy()
+    return new_node
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.reshape),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("shape", "shape"),
+    ],
+    kwargs_to_move_to_acc_out_ty=[("shape", "shape")],
+)
+@register_acc_op
+def reshape(*, input, acc_out_ty=None):
+    assert acc_out_ty is not None
+    return input.reshape(acc_out_ty.shape)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "reshape"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("*", "shape"),
+    ],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "view"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("*", "shape"),
+    ],
+)
+def custom_tensor_reshape_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    For Tensor.reshape and Tensor.view nodes, args could be (input, 1, 2, 3) or (input,
+    (1, 2, 3)).  Here we do some special handling with the `shape` arg in order to map
+    it to acc_ops.reshape. It also handles the case when `shape` is a list instead of
+    tuple.
+    """
+    input_node = node.kwargs["input"]
+    shape = node.kwargs["shape"]
+
+    assert isinstance(shape, Sequence)
+    if isinstance(shape[0], (tuple, list)):  # type: ignore[index]
+        shape = shape[0]  # type: ignore[index]
+
+    with node.graph.inserting_before(node):
+        new_node = node.graph.call_function(
+            reshape,
+            kwargs={
+                "input": input_node,
+                "acc_out_ty": acc_utils.build_raw_tensor_meta(shape=shape),
+            },
+        )
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "half"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def custom_half_mapper(node: torch.fx.Node, _: nn.Module):
+    with node.graph.inserting_before(node):
+        new_kwargs = {
+            "input": node.kwargs["input"],
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=torch.float16),
+        }
+        new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "int"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def custom_int_mapper(node: torch.fx.Node, _: nn.Module):
+    with node.graph.inserting_before(node):
+        new_kwargs = {
+            "input": node.kwargs["input"],
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=torch.int),
+        }
+        new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "float"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def custom_float_mapper(node: torch.fx.Node, _: nn.Module):
+    with node.graph.inserting_before(node):
+        new_kwargs = {
+            "input": node.kwargs["input"],
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=torch.float),
+        }
+        new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op
+def to_dtype(input, acc_out_ty=None, device=None):
+    assert acc_out_ty is not None
+    return input.to(dtype=acc_out_ty.dtype, device=device)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "to"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dtype", "dtype"),
+        ("device", "device", this_arg_is_optional),
+    ],
+)
+def custom_tensor_to_mapper(node: torch.fx.Node, _: nn.Module):
+    dest = node.kwargs["dtype"]
+    mem_format = node.kwargs.get("memory_format")
+    dest_other = node.kwargs.get("device")
+    assert dest is not None
+    assert mem_format is None or mem_format == torch.preserve_format
+
+    dest_dtype = dest_device = None
+    if isinstance(dest, torch.fx.node.Node):
+        meta_type = dest.meta["type"]
+        # consider the device is gpu only, meta info is limited to give clear device type
+        if dest.meta["type"] == torch.device:
+            dest_device = dest
+        elif dest.meta["type"] == torch.dtype:
+            dest_dtype = dest
+        elif dest.meta["type"] == torch.Tensor:
+            input_obj = node.kwargs["input"]
+            other_obj = dest
+            with node.graph.inserting_before(node):
+                dtype_node = node.graph.call_function(
+                    dtype, kwargs={"input": other_obj}
+                )
+                dtype_node.meta["type"] = torch.dtype
+                device_node = node.graph.call_function(
+                    device, kwargs={"input": other_obj}
+                )
+                device_node.meta["type"] = torch.device
+                new_kwargs = {
+                    "input": input_obj,
+                    "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dtype_node),
+                    "device": device_node,
+                }
+                new_node = node.graph.call_function(to_dtype, kwargs=new_kwargs)
+                new_node.meta = node.meta
+                return new_node
+        else:
+            raise RuntimeError(f"We currently do not support to({meta_type})")
+    elif isinstance(dest, torch.device):
+        # only device is set, dtype=None
+        if dest_other is None:
+            dest_device = dest
+        # device and dtype are both set
+        else:
+            dest_dtype = dest_other
+            dest_device = dest
+    # only dtype is set
+    else:
+        dest_dtype = dest
+
+    new_kwargs = {
+        "input": node.kwargs["input"],
+        "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dest_dtype),
+        "device": dest_device,
+    }
+
+    with node.graph.inserting_before(node):
+        new_node = node.graph.create_node(
+            "call_function", to_dtype, kwargs=new_kwargs, name=node.name
+        )
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.add),
+    # Note that we may have aliases for inputs here due to issues with deterministically
+    # knowing the correct target that will be resolved by pytorch.
+    arg_replacement_tuples=[
+        (("input", "a"), "input"),
+        (("other", "b"), "other"),
+        ("alpha", "alpha", this_arg_is_optional),
+    ],
+)
+def custom_torch_add_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node:
+    """
+    Add custom mapping for torch.add because it has an `alpha` parameter which scales
+    the `other` input, and we want to make that mul a separate node.
+    """
+    with node.graph.inserting_before(node):
+        # If alpha is in kwargs check if we need to add a mul, and use correct kwargs.
+        if "alpha" in node.kwargs:
+            # Add mul node only if it has a numerical impact, i.e. alpha != 1.0.
+            if node.kwargs["alpha"] != 1.0:
+                other_node = node.graph.create_node(
+                    "call_function",
+                    mul,
+                    kwargs={
+                        "input": node.kwargs["other"],
+                        "other": node.kwargs["alpha"],
+                    },
+                    name=node.name + "_mul_alpha",
+                )
+                other_node.meta = node.meta
+            else:
+                other_node = node.kwargs["other"]
+            add_kwargs = {"input": node.kwargs["input"], "other": other_node}
+        else:
+            add_kwargs = node.kwargs
+
+        new_node = node.graph.create_node(
+            "call_function", add, kwargs=add_kwargs, name=node.name
+        )
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_module", nn.quantized.Linear),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def packed_quantized_linear_mapper(
+    node: torch.fx.Node, mod: nn.Module
+) -> torch.fx.Node:
+    """
+    Mapping from quantized_linear module to acc_op.linear. We unpack weight and bias
+    in this mapper and pass them directly to linear node.
+    """
+    assert isinstance(node.target, str)
+    linear_module = dict(mod.named_modules())[node.target]
+    prefix = node.target.replace(".", "_")
+    weight_name = f"{prefix}_weight"
+    bias_name = f"{prefix}_bias"
+
+    # Store weight and bias in the main module
+    mod.register_buffer(weight_name, linear_module.weight())
+    if linear_module.bias() is not None:
+        mod.register_buffer(bias_name, linear_module.bias())
+
+    with node.graph.inserting_before(node):
+        # Insert get_attr nodes for weight and bias
+        get_weight = node.graph.get_attr(weight_name)
+        get_weight.meta["tensor_meta"] = _extract_tensor_metadata(
+            linear_module.weight()
+        )
+
+        get_bias = None
+        if linear_module.bias() is not None:
+            get_bias = node.graph.get_attr(bias_name)
+            get_bias.meta["tensor_meta"] = _extract_tensor_metadata(
+                linear_module.bias()
+            )
+
+        qparams = {"scale": linear_module.scale, "zero_point": linear_module.zero_point}
+        # Create kwargs for acc_op.quantized_linear
+        kwargs = {
+            "input": node.kwargs["input"],
+            "weight": get_weight,
+            "bias": get_bias,
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams),
+        }
+
+        new_node = node.graph.call_function(quantized_linear, kwargs=kwargs)
+        new_node.meta = node.meta.copy()
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_module", nn.quantized.Conv2d),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def packed_quantized_conv2d_mapper(
+    node: torch.fx.Node, mod: nn.Module
+) -> torch.fx.Node:
+    """
+    Mapping from quantzed Conv2d module to acc_op.conv. We unpack all the parameters
+    in this mapper and pass them directly to conv2d node.
+    """
+    assert isinstance(node.target, str)
+    conv_module = dict(mod.named_modules())[node.target]
+    prefix = node.target.replace(".", "_")
+    weight_name = f"{prefix}_weight"
+    bias_name = f"{prefix}_bias"
+
+    # Store weight and bias in the main module
+    mod.register_buffer(weight_name, conv_module.weight())
+    if conv_module.bias() is not None:
+        mod.register_buffer(bias_name, conv_module.bias())
+
+    with node.graph.inserting_before(node):
+        # Insert get_attr nodes for weight and bias
+        get_weight = node.graph.get_attr(weight_name)
+        get_weight.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.weight())
+
+        get_bias = None
+        if conv_module.bias() is not None:
+            get_bias = node.graph.get_attr(bias_name)
+            get_bias.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.bias())
+
+        qparams = {"scale": conv_module.scale, "zero_point": conv_module.zero_point}
+
+        # Create kwargs for acc_op.conv
+        kwargs = {
+            "input": node.kwargs["input"],
+            "weight": get_weight,
+            "bias": get_bias,
+            "stride": conv_module.stride,
+            "padding": conv_module.padding,
+            "dilation": conv_module.dilation,
+            "groups": conv_module.groups,
+            "padding_mode": conv_module.padding_mode,
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams),
+        }
+
+        new_node = node.graph.call_function(quantized_conv2d, kwargs=kwargs)
+        new_node.meta = node.meta
+        return new_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.ops.quantized.add_relu),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("other", "other"),
+        ("scale", "scale"),
+        ("zero_point", "zero_point"),
+    ],
+)
+def add_relu_unfuse_mapper(
+    node: torch.fx.Node, mod: torch.fx.GraphModule
+) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        qparams = {
+            "scale": node.kwargs["scale"],
+            "zero_point": node.kwargs["zero_point"],
+        }
+        add_kwargs = {
+            "input": node.kwargs["input"],
+            "other": node.kwargs["other"],
+            "acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams),
+        }
+        add_node = node.graph.call_function(quantized_add, kwargs=add_kwargs)
+        add_node.meta = node.meta.copy()
+
+        relu_node = node.graph.call_function(
+            relu, kwargs={"input": add_node, "inplace": False}
+        )
+        relu_node.meta = node.meta
+        return relu_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_module", nn.intrinsic.quantized.ConvReLU2d),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def packed_quantized_convrelu2d_mapper(
+    node: torch.fx.Node, mod: nn.Module
+) -> torch.fx.Node:
+    """
+    Mapping from quantized ConvReLU2d module to acc_op.relu. We use packed_quantized_conv2d_mapper to unpack all the parameters
+    in this mapper and pass the returned conv2d node directly to relu node.
+    """
+
+    with node.graph.inserting_before(node):
+        # conv2d op
+        conv2d_node = packed_quantized_conv2d_mapper(node, mod)
+
+        # relu op
+        relu_node = node.graph.call_function(
+            relu, kwargs={"input": conv2d_node, "inplace": False}
+        )
+        relu_node.meta = node.meta
+        return relu_node
+
+
+@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.gelu))
+@register_acc_op_mapping(op_and_target=("call_method", "gelu"))
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_module", torch.nn.GELU),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("approximate", "approximate"),
+    ],
+)
+@register_acc_op
+def gelu(*, input, approximate="none"):
+    return torch.nn.functional.gelu(input=input, approximate=approximate)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.cumsum))
+@register_acc_op_mapping(op_and_target=("call_method", "cumsum"))
+@register_acc_op
+def cumsum(*, input, dim, dtype=None):
+    return torch.cumsum(input=input, dim=dim, dtype=dtype)
+
+
+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op_mapping(op_and_target=("call_function", torch.chunk))
+@register_acc_op_mapping(op_and_target=("call_method", "chunk"))
+@register_acc_op
+def chunk(*, input, chunks, dim=0):
+    return torch.chunk(input=input, chunks=chunks, dim=dim)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.gather),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("index", "index"),
+        ("sparse_grad", "sparse_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def gather(*, input, dim, index, sparse_grad=False):
+    return torch.gather(input=input, dim=dim, index=index, sparse_grad=sparse_grad)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.index_select),
+)
+@register_acc_op
+def index_select(*, input, dim, index):
+    return torch.index_select(input, dim, index)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "expand_as"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("other", "other"),
+    ],
+)
+def expand_as_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Maps expand_as(other) to expand(other.size())
+    """
+    with node.graph.inserting_before(node):
+        size_node = node.graph.call_function(
+            size, kwargs={"input": node.kwargs["other"]}
+        )
+        size_node.meta["type"] = torch.Size
+
+        expand_node = node.graph.call_function(
+            expand, kwargs={"input": node.kwargs["input"], "sizes": size_node}
+        )
+        expand_node.meta = node.meta.copy()
+        return expand_node
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.grid_sample),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("grid", "grid"),
+        ("mode", "mode", this_arg_is_optional),
+        ("padding_mode", "padding_mode", this_arg_is_optional),
+        ("align_corners", "align_corners", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def grid_sample(
+    *,
+    input,
+    grid,
+    mode="bilinear",
+    padding_mode="zeros",
+    align_corners=None,
+):
+    return torch.nn.functional.grid_sample(
+        input=input,
+        grid=grid,
+        mode=mode,
+        padding_mode=padding_mode,
+        align_corners=align_corners,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.interpolate),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size", this_arg_is_optional),
+        ("scale_factor", "scale_factor", this_arg_is_optional),
+        ("mode", "mode", this_arg_is_optional),
+        ("align_corners", "align_corners", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def interpolate(
+    *,
+    input,
+    size=None,
+    scale_factor=None,
+    mode="nearest",
+    align_corners=None,
+):
+    return torch.nn.functional.interpolate(
+        input=input,
+        size=size,
+        scale_factor=scale_factor,
+        mode=mode,
+        align_corners=align_corners,
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.tensor_split),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("tensor_indices_or_sections", "sections", "indices"), "indices_or_sections"),
+        ("dim", "dim", this_arg_is_optional),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "tensor_split"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        (("tensor_indices_or_sections", "sections", "indices"), "indices_or_sections"),
+        ("dim", "dim", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def tensor_split(*, input, indices_or_sections, dim=0):
+    # Need to de-coalesce the indices_or_sections because tensor_split accepts
+    # one of three kwarg signatures:
+    #  * (Tensor input, Tensor tensor_indices_or_sections, int dim)
+    #  * (Tensor input, int sections, int dim)
+    #  * (Tensor input, tuple of ints indices, int dim)
+    if isinstance(indices_or_sections, torch.Tensor):
+        indices_or_sections = indices_or_sections.tolist()
+    if isinstance(indices_or_sections, int):
+        return torch.tensor_split(input, sections=indices_or_sections, dim=dim)
+    elif isinstance(indices_or_sections, Iterable):
+        return torch.tensor_split(input, indices=tuple(indices_or_sections), dim=dim)
+    else:
+        raise RuntimeError(
+            f"Expected int, Iterable or Tensor for "
+            f"indices_or_sections arg, got: {type(indices_or_sections)}"
+        )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_method", "new_ones"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("dtype", "dtype", this_arg_is_optional),
+        ("device", "device", this_arg_is_optional),
+        ("requires_grad", "requires_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def new_ones(*, input, size, dtype=None, device=None, requires_grad=False):
+    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
+    return input.new_ones(size, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_method", "new_empty"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("dtype", "dtype", this_arg_is_optional),
+        ("device", "device", this_arg_is_optional),
+        ("requires_grad", "requires_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def new_empty(*, input, size, dtype=None, device=None, requires_grad=False):
+    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
+    return input.new_empty(size, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.einsum),
+    arg_replacement_tuples=[
+        ("equation", "equation"),
+        ("*", "operands"),
+    ],
+)
+@register_acc_op
+def einsum(*, equation, operands):
+    return torch.einsum(equation, *operands)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.as_strided),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("stride", "stride"),
+        ("storage_offset", "storage_offset", this_arg_is_optional),
+    ],
+)
+@register_acc_op_mapping(
+    op_and_target=("call_method", "as_strided"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("stride", "stride"),
+        ("storage_offset", "storage_offset", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def as_strided(*, input, size, stride, storage_offset=0):
+    return torch.as_strided(
+        input=input, size=size, stride=stride, storage_offset=storage_offset
+    )
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.var))
+@register_acc_op_mapping(
+    op_and_target=("call_method", "var"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("unbiased", "unbiased"),
+        ("keepdim", "keepdim"),
+    ],
+)
+@register_acc_op
+def var(*, input, dim, unbiased, keepdim=False):
+    return torch.var(input=input, dim=dim, unbiased=unbiased, keepdim=keepdim)
+
+
+@register_acc_op
+def xl_weight(weight_id: str, metadata: TensorMetadata, proxy_shape, dtype):
+    """
+    This op stores metadata and weight_id and otherwise returns a zeros tensor
+    with shape `proxy_shape` and dtype `dtype`.
+
+    Note: when Nodes with this op are run through ShapeProp, its metadata will
+    be the same as computed and set as of that of `proxy`, however when running
+    acc_shape_inference, it will return `metadata`.
+
+    Args:
+        weight_id: string identifier for the XL weight
+        metadata: metadata of the XL weight
+        proxy_shape: shape of substitute tensor
+        dtype: dtype of substitute tensor
+    """
+    return torch.zeros(proxy_shape, dtype=dtype)
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.nn.functional.log_softmax),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("dtype", "dtype"),
+    ],
+)
+def log_softmax_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    with node.graph.inserting_after(node):
+
+        softmax_kwargs = {
+            "input": node.kwargs["input"],
+            "dim": node.kwargs["dim"],
+            "dtype": node.kwargs["dtype"],
+        }
+        softmax_node = node.graph.call_function(softmax, kwargs=softmax_kwargs)
+        softmax_node.meta = node.meta.copy()
+
+    with softmax_node.graph.inserting_after(softmax_node):
+        log_kwargs = {"input": softmax_node}
+        log_node = node.graph.call_function(log, kwargs=log_kwargs)
+        log_node.meta = node.meta.copy()
+
+        return log_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.nn.functional.softplus),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("beta", "beta", this_arg_is_optional),
+        ("threshold", "threshold", this_arg_is_optional),
+    ],
+)
+def softplus_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    """
+    Maps torch.nn.functional.softplus to acc_ops.where, acc_ops.relu, acc_ops.exp, acc_ops.mul, acc_ops.add and acc_ops.div
+
+    softplus(input, beta, threshold) = where(beta * input > threshold, relu(input), div(log(1 + exp(beta * input))), beta))
+
+    torch.where(
+        softplus_module.beta * sample_inputs[0] > softplus_module.threshold,
+        sample_inputs[0].relu(),
+        torch.div((1 + (softplus_module.beta * sample_inputs[0]).exp()).log(), softplus_module.beta),
+    )
+
+    """
+
+    input_node = node.kwargs["input"]
+    beta_node = node.kwargs["beta"]
+    threshold_node = node.kwargs["threshold"]
+
+    with node.graph.inserting_after(node):
+        cond_mul_node = node.graph.call_function(
+            mul,
+            kwargs={
+                "input": input_node,
+                "other": beta_node,
+            },
+        )
+        cond_mul_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(cond_mul_node):
+        gt_node = node.graph.call_function(
+            gt,
+            kwargs={
+                "input": cond_mul_node,
+                "other": threshold_node,
+            },
+        )
+        gt_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(gt_node):
+        relu_node = node.graph.call_function(relu, kwargs={"input": input_node})
+        relu_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(relu_node):
+        mul_node = node.graph.call_function(
+            mul,
+            kwargs={
+                "input": input_node,
+                "other": beta_node,
+            },
+        )
+        mul_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(mul_node):
+        exp_node = node.graph.call_function(exp, kwargs={"input": mul_node})
+        exp_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(exp_node):
+        add_node = node.graph.call_function(
+            add,
+            kwargs={
+                "input": exp_node,
+                "other": 1,
+            },
+        )
+        add_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(add_node):
+        log_node = node.graph.call_function(log, kwargs={"input": add_node})
+        log_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(log_node):
+        div_node = node.graph.call_function(
+            div,
+            kwargs={
+                "input": log_node,
+                "other": beta_node,
+            },
+        )
+        div_node.meta = input_node.meta.copy()
+
+    with node.graph.inserting_after(div_node):
+        where_node = node.graph.call_function(
+            where,
+            kwargs={
+                "condition": gt_node,
+                "x": relu_node,
+                "y": div_node,
+            },
+        )
+        where_node.meta = div_node.meta.copy()
+
+        return where_node
+
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.baddbmm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("batch1", "batch1"),
+        ("batch2", "batch2"),
+        ("beta", "beta", this_arg_is_optional),
+        ("alpha", "alpha", this_arg_is_optional),
+    ],
+)
+def baddbmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Mapping from torch.baddbmm to acc_ops.mm -> acc_ops.add, if alpha or beta is not 1
+    then we also insert acc_ops.mul to the right place.
+    """
+    with node.graph.inserting_before(node):
+        mm_kwargs = {"input": node.kwargs["batch1"], "other": node.kwargs["batch2"]}
+        mm_node = node.graph.create_node(
+            "call_function", matmul, kwargs=mm_kwargs, name=f"{node.name}_matmul"
+        )
+        mm_node.meta = node.meta.copy()
+
+        if node.kwargs["alpha"] != 1:
+            mul_kwargs = {"input": mm_node, "other": node.kwargs["alpha"]}
+            mm_node = node.graph.create_node(
+                "call_function", mul, kwargs=mul_kwargs, name=f"{mm_node.name}_mul"
+            )
+        mm_node.meta = node.meta.copy()
+
+        input_node = node.kwargs["input"]
+        if node.kwargs["beta"] != 1:
+            mul_kwargs = {"input": input_node, "other": node.kwargs["beta"]}
+            new_input_node = node.graph.create_node(
+                "call_function", mul, kwargs=mul_kwargs, name=f"{node.name}_input_mul"
+            )
+            assert isinstance(input_node, torch.fx.Node)
+            new_input_node.meta = input_node.meta.copy()
+            input_node = new_input_node
+
+        add_kwargs = {"input": input_node, "other": mm_node}
+        add_node = node.graph.create_node(
+            "call_function", add, kwargs=add_kwargs, name=f"{node.name}_add"
+        )
+        add_node.meta = node.meta.copy()
+        return add_node
+
+
+###############################################################################
+
+# Set ops as side-effectul, this prevents them from being optimized away or
+# being folded into constants.
+torch.fx.node._side_effectful_functions.add(xl_weight)
diff --git a/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py b/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
new file mode 100644
index 000000000..ddea8c847
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
@@ -0,0 +1,124 @@
+import os
+import sys
+from typing import Any
+
+import torch.fx
+from torch.fx.passes import shape_prop
+
+from . import acc_ops
+
+
+class SuppressStderrPrints:
+    def __enter__(self):
+        self._original_stderr = sys.stderr
+        sys.stderr = open(os.devnull, "w")
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stderr.close()
+        sys.stderr = self._original_stderr
+
+
+class AccShapeProp(shape_prop.ShapeProp):
+    """
+    Similar to standard shape prop, but if any node that is run with standard shape prop
+    fails then it tries to upconvert any fp16 inputs to fp32, rerun shape prop, and then
+    downconvert fp32 results back to fp16.
+
+    Note that we currently mostly only look for/support up/down conversion for nodes
+    with tensor outputs, but this is likely fine for most cases. Additionally the base
+    shape_prop works for many ops with fp16, such as tensor.cat, tensor slice, tensor.to
+    dtype conversion, etc.
+
+    """
+
+    def _run_node(self, n: torch.fx.Node) -> Any:
+        # Run ops with XL weights by clamping their inputs, see
+        # docstring for self.run_node_with_xl_weights for more details
+        if any(
+            isinstance(kwarg, torch.fx.Node) and kwarg.target == acc_ops.xl_weight
+            for kwarg in n.kwargs.values()
+        ):
+            return self.run_node_with_xl_weights(n)
+        else:
+            return super().run_node(n)
+
+    def run_node(self, n: torch.fx.Node) -> Any:
+        # First try running shape_prop with the original inputs.
+        with SuppressStderrPrints():
+            try:
+                return self._run_node(n)
+            except Exception:
+                pass
+
+        # Base shape_prop failed, so temporarily upconvert the node's fp16 inputs in env
+        # and retry. For now just support upconverting Tensor outputs.
+        orig_dtype_env = []
+        for in_node in n.all_input_nodes:
+            in_ten = self.env[in_node]
+            if isinstance(in_ten, torch.Tensor) and in_ten.dtype == torch.float16:
+                orig_dtype_env.append((in_node, in_ten))
+                self.env[in_node] = in_ten.clone().to(dtype=torch.float)
+
+        # Now try running again with upconverted fp32 input tensor in env.
+        result = self._run_node(n)
+
+        # Now that we succeeded, assume it's thanks to upconverting. Therefore we
+        # downconvert fp32 tensor results to fp16.
+        if isinstance(result, torch.Tensor) and result.dtype == torch.float:
+            result = result.to(dtype=torch.float16)
+            self.env[n] = result
+            n.meta["tensor_meta"] = n.meta["tensor_meta"]._replace(dtype=torch.float16)
+
+        # Finally, restore the original env back to fp16 for any upconverted tensors.
+        for in_node, in_ten in orig_dtype_env:
+            self.env[in_node] = in_ten
+
+        return result
+
+    def run_node_with_xl_weights(self, n: torch.fx.Node) -> Any:
+        """
+        EmbeddingBag with XL Weights of shape (num_embeddings, embedding_dim)
+        are replaced with smaller proxies of shape
+        (acc_ops.PROXY_EMBEDDING_SIZE, embedding_dim) during tracing. This can
+        cause index out of bounds issues when sample inputs lead to the
+        embedding bag op indexing into the first dimension of the weight tensor
+        which it expects to be bigger than it is during tracing.
+
+        For these ops, return a zeros tensor of the correct shape and dtype.
+        """
+
+        op = n.target.__module__ + "." + n.target.__name__
+
+        if op.endswith("acc_ops.int_nbit_split_embedding_codegen_lookup_function"):
+            output_dtype_int = n.kwargs["output_dtype"]
+            assert output_dtype_int < 2, "only support float16 and float32"
+            output_dtype = torch.float if output_dtype_int == 0 else torch.float16
+            total_D = n.kwargs["total_D"]
+
+            D_offsets_shape = self.env[n.kwargs["D_offsets"]].shape
+            offsets_shape = self.env[n.kwargs["offsets"]].shape
+            batches = (offsets_shape[0] - 1) // (D_offsets_shape[0] - 1)
+            result = torch.zeros((batches, total_D), dtype=output_dtype)
+
+        elif op.find("acc_ops.embedding_bag"):
+            weight = self.env[n.kwargs["weight"]]
+            offsets_shape = self.env[n.kwargs["offsets"]].shape
+            batches = offsets_shape[0] - int(n.kwargs["include_last_offset"])
+            output_dtype = weight.dtype
+
+            embedding_size = weight.shape[1]
+            if op.endswith("acc_ops.embedding_bag_byte_rowwise_offsets"):
+                embedding_size -= 8
+                output_dtype = torch.float32
+            elif op.endswith("acc_ops.embedding_bag_4bit_rowwise_offsets"):
+                embedding_size = (embedding_size - 4) * 2
+                output_dtype = torch.float32
+
+            result = torch.zeros((batches, embedding_size), dtype=output_dtype)
+
+        else:
+            raise NotImplementedError(
+                f"The op {op} cannot be run with xl_weight(s) inputs"
+            )
+
+        return result
diff --git a/fx2ait/fx2ait/acc_tracer/acc_tracer.py b/fx2ait/fx2ait/acc_tracer/acc_tracer.py
new file mode 100644
index 000000000..bcbd7c1c2
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_tracer.py
@@ -0,0 +1,626 @@
+import ast
+import builtins
+import copy
+import inspect
+import logging
+import textwrap
+import warnings
+from types import FunctionType
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import torch.jit as jit
+import torch.nn as nn
+from torch._sources import normalize_source_lines
+from torch.fx import Graph, Tracer
+from torch.fx.experimental.normalize import NormalizeArgs
+from torch.fx.node import Argument, Node, Target
+
+from . import acc_normalizer, acc_ops, acc_shape_prop, acc_utils  # noqa: F401
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def _get_exception_wrapper_attr_name(exc_type: Type[Exception]) -> str:
+    return f"_conditional_exception_wrapper_{exc_type.__name__}"
+
+
+class Acc_Rewriter(ast.NodeTransformer):
+    """
+    Take a FunctionType object representing a `forward` method, then
+    perform an AST rewrite to swap out nodes that are not symbolically
+    traceable with a callsite to the FX alternative.
+
+    To support swapping out an AST node, define a new `visit` method on
+    that node. For more details, see:
+    https://docs.python.org/3/library/ast.html#ast.NodeTransformer
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.exceptions_rewritten: Set[Type[Exception]] = set()
+        self.exceptions_bool_rewritten: Set[Type[Exception]] = set()
+
+    def rewrite(
+        self, fn: FunctionType
+    ) -> Tuple[FunctionType, Set[Type[Exception]], Set[Type[Exception]]]:
+
+        # Normalize the source lines
+        sourcelines, _ = inspect.getsourcelines(fn)
+        sourcelines = normalize_source_lines(sourcelines)
+        source = "".join(sourcelines)
+        normalized_str = textwrap.dedent(source)
+
+        # Rewrite the original AST
+        source_ast = ast.parse(normalized_str)
+        dest_ast = ast.fix_missing_locations(self.visit(source_ast))
+
+        # Pull out the compiled function from the newly-created Module
+        code = compile(dest_ast, "", "exec")
+        globals_dict = copy.copy(fn.__globals__)
+        keys_before = set(globals_dict.keys())
+        exec(code, globals_dict)  # noqa P204
+        new_keys = list(set(globals_dict.keys()) - keys_before)
+        assert len(new_keys) <= 1
+        fn_compiled = globals_dict[fn.__name__]
+
+        # Return the correct FunctionType object and the Exceptions that were
+        # rewritten during visit_If.
+        return fn_compiled, self.exceptions_rewritten, self.exceptions_bool_rewritten
+
+    def visit_Assert(self, node: ast.Assert):
+        """
+        Swap out the Assert node (Python's `assert`) with a callsite to the
+        symbolically-traceable torch._assert function
+        """
+        # Create the Call node
+        n = ast.parse("torch._assert()", mode="eval")
+        assert isinstance(n, ast.Expression)
+        call_node = n.body
+        assert isinstance(call_node, ast.Call)
+        msg = node.msg if node.msg else ast.Constant(value="", kind=None)
+        call_node.args = [node.test, msg]
+
+        # Ensure that the new node conforms to the Python AST grammar
+        expr_wrapper = ast.Expr(value=call_node)
+
+        # Return the new Call node to signify that we want to use it as
+        # a replacement for the original _assert node
+        return ast.copy_location(expr_wrapper, node)
+
+    def visit_If(self, if_node: ast.If):
+        """
+        Swap out the pattern `If(x): Raise(y)` with a ConditionalExceptionWrapper
+        specialized for the specific exception y. The specialized
+        ConditionalExceptionWrapper module will be added in the RewrittenModule.
+        Only works with builtin Exceptions, as we assume the signature of the
+        init for the Exception is a string.
+        """
+        raise_node = if_node.body[0]
+        if not isinstance(raise_node, ast.Raise):
+            return if_node
+
+        # Don't handle orelse for now.
+        # TODO: Move orelse to the body after calling ConditionalExceptionWrapper.
+        if len(if_node.orelse) != 0:
+            return if_node
+
+        def _reuse_loc(node):
+            return ast.copy_location(node, if_node)
+
+        # If the exception has a message then we expect the raise's exc to be a
+        # Call w/ a msg. Else if it's a exc Name then there's no msg to use.
+        node_for_exc = raise_node.exc
+        if isinstance(node_for_exc, ast.Name):
+            # E.g. `raise AssertionError`, i.e. without an exc_msg.
+            name_node_of_exc = node_for_exc
+            exc_msg = _reuse_loc(ast.Constant(None))
+        elif isinstance(node_for_exc, ast.Call):
+            # E.g. `raise AssertionError("error message")`
+            name_node_of_exc = node_for_exc.func  # type: ignore[assignment]
+            if not isinstance(name_node_of_exc, ast.Name):
+                return if_node
+            # Most assertions just take a single string arg, but some may not; skip
+            # handling such assertions for now.
+            if len(node_for_exc.args) != 1:
+                return if_node
+            exc_msg = node_for_exc.args[0]
+        else:
+            return if_node
+
+        # Convert what we expect is the name of the exception into its
+        # associated python class.
+        name_of_exc = name_node_of_exc.id
+        try:
+            exc_type = eval(name_of_exc)  # noqa P204
+        except Exception:
+            return if_node
+
+        # Check that we actually have a builtin exception.
+        if (
+            not issubclass(exc_type, Exception)
+            or getattr(getattr(exc_type, "__class__", None), "__module__", None)
+            != "builtins"
+        ):
+            return if_node
+
+        # We need a ConditionalExceptionWrapper specialized for every kind of
+        # exception, so add it to exceptions_rewritten to remember for later to
+        # add a specialized attr with it.
+        self.exceptions_rewritten.add(exc_type)
+
+        # From here we definitely should be able to do the replacement. Create a
+        # Call node to the ConditionalExceptionWrapper module we're replacing
+        # the If with, with args set as the If's condition and the string of the
+        # exception. The call to the self._conditional_exception_wrapper_*Error
+        # module is safe because the RewrittenModule will add it as an attr
+        # based on the returned exceptions_rewritten, and we assume we are
+        # currently modifying the AST of a method from a RewrittenModule.
+        exc_wrapper_node = ast.parse(
+            f"self.{_get_exception_wrapper_attr_name(exc_type)}()", mode="eval"
+        )
+        assert isinstance(exc_wrapper_node, ast.Expression)
+        exc_wrapper_call_node = exc_wrapper_node.body
+        assert isinstance(exc_wrapper_call_node, ast.Call)
+        if isinstance(if_node.test, ast.BoolOp) and isinstance(
+            if_node.test.op, ast.And
+        ):
+            self.exceptions_bool_rewritten.add(exc_type)
+            bool_wrapper_node = ast.parse(
+                f"self.{_get_exception_wrapper_attr_name(exc_type)}_bool()", mode="eval"
+            )
+            assert isinstance(exc_wrapper_node, ast.Expression)
+            bool_wrapper_call_node = bool_wrapper_node.body
+            assert isinstance(exc_wrapper_call_node, ast.Call)
+            bool_wrapper_call_node.args = if_node.test.values
+            exc_wrapper_call_node.args = [
+                _reuse_loc(bool_wrapper_call_node),
+                exc_msg,
+            ]
+        else:
+            exc_wrapper_call_node.args = [if_node.test, exc_msg]
+
+        # Ensure that the new node conforms to the Python AST grammar
+        expr_wrapper = _reuse_loc(ast.Expr(_reuse_loc(exc_wrapper_call_node)))
+
+        # Return the new node to signify that we want to use it as a replacement
+        # for the original `If x: Raise y` pattern.
+        return expr_wrapper
+
+
+class ConditionalExceptionWrapper(nn.Module):
+    """
+    This wrapper class is used to wrap conditional raising of exceptions during
+    rewriting. For example:
+
+    .. code-block:: python
+
+        if self.name != "x":
+            raise AssertionError(f"Name was not x: {self.name}")
+
+    Is rewritten into
+
+    .. code-block:: python
+
+        self._conditional_exception_wrapper_AssertionError(
+            self.name != "x", f"Name was not x: {self.name}"
+        )
+
+    Note that __init__ takes the Exception class that it is wrapping, while
+    forward takes the condition to check and the message for the exception.
+
+    """
+
+    # Mark as impure so that calls to it will not be removed during DCE.
+    _is_impure = True
+
+    def __init__(self, exc: Type[Exception]):
+        super().__init__()
+        self.exc = exc
+
+    def forward(self, cond: bool, msg: str):
+        if cond:
+            raise self.exc if msg is None else self.exc(msg)
+
+
+class ConditionalExceptionBoolCondWrapper(nn.Module):
+    """
+    This is a wrapper class to for boolean ops used inside conditionals
+    raising exceptions.
+    This currently only handles binary input cases for the `and` operator
+    at one level of depth
+    For example:
+
+    .. code-block:: python
+
+    if self.name != "x" and self.name != "y":
+        raise AssertionError(f"Name was not x: {self.name}")
+
+    rewrites the `self.name != "x" and self.name != "y"` with
+    a `_conditional_exception_wrapper_AssertionError_bool` as follows:
+
+    .. code-block:: python
+
+        self._conditional_exception_wrapper_AssertionError(
+            self._conditional_exception_wrapper_AssertionError_bool(self.name != "x" and self.name != "y"), f"Name was not x: {self.name}"
+        )
+    """
+
+    # Mark as impure so that calls to it will not be removed during DCE.
+    _is_impure = True
+
+    def __init__(self, op):
+        super().__init__()
+
+    def forward(self, *conds: Iterable):
+        return all(conds)
+
+
+# Custom tracer that traces to the functional level and rewrites asserts and
+# exceptions.
+class AccRewritingTracer(Tracer):
+    # Add an explicit check for mutable operations, which break symbolic tracing.
+    check_mutable_operations = True
+    # Disble proxying buffers, which currently breaks some quantization code
+    proxy_buffer_attributes = False
+
+    # Note: Treat ConditionalExceptionWrapper as a leaf so that we don't
+    # trace into it, because it contains control flow and raises an exception.
+    DEFAULT_LEAF_MODULE_LIST = {
+        ConditionalExceptionBoolCondWrapper,
+        ConditionalExceptionWrapper,
+        torch.nn.quantized.Linear,
+        torch.nn.quantized.Conv2d,
+        torch.nn.intrinsic.quantized.ConvReLU2d,
+        jit.ScriptModule,
+        jit.RecursiveScriptModule,
+        torch.nn.modules.activation.MultiheadAttention,
+    }
+
+    def is_leaf_module(self, m: nn.Module, mod_qual_name: str) -> bool:
+        return getattr(m, "_base_class_origin", type(m)) in self.leaf_module_list
+
+    def trace(
+        self,
+        root: nn.Module,
+        concrete_args: Optional[Dict[str, Any]] = None,
+        ast_rewriter_allow_list: Optional[Set] = None,
+        leaf_module_list: Optional[Set] = None,
+    ) -> Tuple[Graph, nn.Module]:
+        self.leaf_module_list = self.DEFAULT_LEAF_MODULE_LIST
+        if leaf_module_list:
+            self.leaf_module_list.update(leaf_module_list)
+        rewritten = _rewrite(root, ast_rewriter_allow_list, self.leaf_module_list)
+        return super().trace(rewritten, concrete_args), rewritten
+
+    # override TraceBase's method
+    def create_node(
+        self,
+        kind: str,
+        target: Target,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        name: Optional[str] = None,
+        type_expr: Optional[Any] = None,
+    ) -> Node:
+        """
+        Inserts a graph node given target, args, kwargs, and name.
+
+        This method can be overridden to do extra checking, validation, or
+        modification of values used in node creation. For example, one might
+        want to disallow in-place operations from being recorded.
+        """
+
+        ## Hacky way to decide inplace ops
+        if type(target) != str:
+            name_target = target.__name__
+        else:
+            name_target = target
+
+        allow_list = ["and_", "or_"]  # python  operator.and_,  operator.or_
+        if (
+            name_target[-1] == "_"
+            and name_target[0] != "_"
+            and not (name_target in allow_list)
+            and kind != "placeholder"
+        ):
+            raise RuntimeError(
+                f"Tried to trace mutable operation {name_target}. FX only supports functional code"
+            )
+
+        return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+
+
+# List of modules that need rewriting to be supported for tracing.
+DEFAULT_REWRITE_ALLOW_LIST = {
+    nn.BatchNorm1d,
+    nn.BatchNorm2d,
+    nn.BatchNorm3d,
+}
+
+
+def _rewrite(
+    mod_to_rewrite: nn.Module,
+    allow_list: Optional[Set] = None,
+    leaf_module_list: Optional[Set] = None,
+) -> nn.Module:
+    if allow_list is None:
+        allow_list = DEFAULT_REWRITE_ALLOW_LIST
+    else:
+        allow_list = allow_list.union(DEFAULT_REWRITE_ALLOW_LIST)
+
+    if not leaf_module_list:
+        leaf_module_list = set()
+
+    # Rewrite this module's functions as well as all recursive modules'
+    # functions that are attrs of this moodule. Return the new, rewritten module
+    # hierarchy.
+    def rewrite_module(m: nn.Module):
+        if isinstance(m, jit.ScriptModule):
+            # ScriptModule cannot be rewritten, so bypass it. The issue is it
+            # requires explicitly calling its `__init__()`, calling
+            # `nn.Module.__init__()` in the derived `RewrittenModule` is not
+            # enough. And even if we init it we can't do much with it.
+            return m
+
+        # If m is an already-rewritten RewrittenModule, then use the original base class.
+        base_class: Type[nn.Module] = getattr(m, "_base_class_origin", type(m))
+
+        # Keep track of all the ConditionalExceptionWrappers that the
+        # Acc_Rewriter calls into in this module so we can add them in init
+        # below.
+        all_added_wrappers: Set[Type[Exception]] = set()
+        all_added_bool_wrappers: Set[Type[Exception]] = set()
+
+        # Note: Make this a subclass of our base class.
+        class RewrittenModule(base_class):  # type: ignore[valid-type, misc]
+            # Keep track of the base_class so that symbolic tracing can
+            # determine what kind of module this originally was later on.
+            _base_class_origin = base_class
+            # Add suffix to qualname so it's easier to debug the origin of this module.
+            __qualname__ = f"{base_class.__qualname__}__AccRewrittenModule"
+
+            # Write all of the non-dunder or special methods from base_class
+            # into RewrittenModule.
+            for method_name in dir(base_class):
+                method = getattr(base_class, method_name, None)
+                if method is None and method_name not in {"__doc__"}:
+                    _LOGGER.warning(
+                        f"{__qualname__} does not have attribute {method_name}"
+                    )
+
+                if builtins.type(method) is not FunctionType:
+                    continue
+
+                # Always skip rewriting dunder methods, as they haven't (yet) been
+                # problematic, and modifying them has caused issues previously.
+                if method_name.startswith("__") and method_name.endswith("__"):
+                    continue
+
+                # Only rewrite those Modules explicitly in the allow_list.
+                assert allow_list is not None
+                if base_class not in allow_list:
+                    vars()[method_name] = method
+                else:
+                    (
+                        vars()[method_name],
+                        added_wrappers,
+                        added_bool_wrappers,
+                    ) = Acc_Rewriter().rewrite(method)
+                    all_added_wrappers.update(added_wrappers)
+                    all_added_bool_wrappers.update(added_bool_wrappers)
+
+            def __init__(self, orig):
+                nn.Module.__init__(self)
+
+                # Iterate over all added exception wrappers and add
+                # ConditionalExceptionWrapper attrs for each.
+                for exc_type in all_added_wrappers:
+                    wrapper_name = _get_exception_wrapper_attr_name(exc_type)
+                    assert not hasattr(self, wrapper_name)
+                    setattr(
+                        self,
+                        wrapper_name,
+                        ConditionalExceptionWrapper(exc_type),
+                    )
+
+                for exc_type in all_added_bool_wrappers:
+                    wrapper_name = f"{_get_exception_wrapper_attr_name(exc_type)}_bool"
+                    assert not hasattr(self, wrapper_name)
+                    setattr(
+                        self,
+                        wrapper_name,
+                        ConditionalExceptionBoolCondWrapper(exc_type),
+                    )
+
+                # Recursively rewrite and copy all module attrs of this module.
+                for k, v in orig.__dict__.items():
+                    if k == "_modules":
+                        for mod_k, mod_v in v.items():
+                            if getattr(mod_v, "_base_class_origin", type(mod_v)) in leaf_module_list:  # type: ignore[operator]
+                                _LOGGER.info(
+                                    f"Skip rewriting leaf module {type(mod_v)}"
+                                )
+                                self._modules[mod_k] = mod_v
+                            else:
+                                self._modules[mod_k] = rewrite_module(mod_v)
+                    else:
+                        self.__dict__[k] = v
+
+        # Add suffix to name so it's easier to debug the origin of this module.
+        RewrittenModule.__name__ = f"{base_class.__name__}__AccRewrittenModule"
+        return RewrittenModule(m)
+
+    return rewrite_module(mod_to_rewrite)
+
+
+def _remove_assertions(gm: torch.fx.GraphModule) -> bool:
+    """
+    Unconditionally removes all assertions found in GraphModule gm.
+    Returns whether the graph is modified.
+    """
+    changed = False
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch._assert:
+            gm.graph.erase_node(node)
+            changed = True
+    return changed
+
+
+def _remove_exceptions(gm: torch.fx.GraphModule) -> bool:
+    """
+    Unconditionally removes all call_modules to ConditionalExceptionWrappers
+    found in GraphModule gm. Returns whether the graph is modified.
+    """
+    changed = False
+    for node in reversed(gm.graph.nodes):
+        if node.op == "call_module" and (
+            isinstance(gm.get_submodule(node.target), ConditionalExceptionWrapper)
+            or isinstance(
+                gm.get_submodule(node.target), ConditionalExceptionBoolCondWrapper
+            )
+        ):
+            gm.graph.erase_node(node)
+            changed = True
+    return changed
+
+
+def _replace_tensor_meta_with_rank(gm: torch.fx.GraphModule):
+    for node in gm.graph.nodes:
+        if node.op != "output" and "tensor_meta" in node.meta:
+            node.meta["tensor_rank"] = acc_utils.map_tensor_metadata(
+                node.meta["tensor_meta"], lambda x: len(x.shape)
+            )
+            del node.meta["tensor_meta"]
+
+
+def rewriter_base_trace(mod, ast_rewriter_allow_list, leaf_module_list):
+    rewritten_graph, rewritten_mod = AccRewritingTracer().trace(
+        mod,
+        ast_rewriter_allow_list=ast_rewriter_allow_list,
+        leaf_module_list=leaf_module_list,
+    )
+
+    assert isinstance(rewritten_mod, nn.Module)
+    # Note: use the rewritten_mod here as the root. This is necessary because
+    # RewrittenModule includes a new module for the ConditionalExceptionWrapper.
+    return torch.fx.GraphModule(rewritten_mod, rewritten_graph)
+
+
+def trace(
+    mod: nn.Module,
+    sample_inputs: Sequence[Any],
+    remove_assertions: bool = True,
+    remove_exceptions: bool = True,
+    use_acc_normalization: bool = True,
+    ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None,
+    leaf_module_list: Optional[Set[Type[nn.Module]]] = None,
+    acc_normalization_block_list: Optional[
+        Set[Tuple[str, Union[str, Callable]]]
+    ] = None,
+) -> torch.fx.GraphModule:
+    """
+    Performs tracing and arg normalization specialized for accelerator lowering.
+
+    It first rewrites the AST of the module's methods (and all attr methods
+    recursively) to transform un-tracable parts of the module to make them
+    traceable.
+
+    It then traces to the functional level so that optimizations and backend
+    accelerator importers have the ability to see and/or change inputs to each
+    op.
+
+    It then removes assertions and exception wrappers found during symbolic
+    tracing if requested based on remove_assertions and remove_exceptions
+
+    Dead code is then eliminated, which will e.g. remove any nodes that were
+    only used by assertions or exceptions if they were removed.
+
+    It then performs normalization on args/kwargs, aligning any arg that can be
+    moved to kwarg to be so, and then making default values explicit.
+
+    Args:
+
+        mod (Module): The module to transform and trace.
+
+        sample_inputs (Tuple[Union[torch.Tensor, List[torch.Tensor]]]):
+                Sample inputs with which to run shape prop.
+
+        remove_assertions (bool): Whether to remove assertion nodes from
+                                    the graph after symbolic tracing.
+
+        remove_exceptions (bool): Whether to remove exception wrapper nodes
+                                    from the graph after symbolic tracing.
+
+        use_acc_normalization (bool): Whether to use acc-specific
+                                        normalization to all acc_ops.
+
+        ast_rewriter_allow_list (Optional[Set[nn.Module]]): Optional allow list of
+                                            modules that need AST rewriting.
+
+        leaf_module_list (Optional[Set[nn.Module]]): Optional leaf module list where
+                                            modules will not be traced into.
+
+        acc_normalization_block_list (Optional[Set[Tuple[str, Union[str, Callable]]]]):
+                                    Optional set of (op, target) pairs to not apply acc
+                                    normalization to. Just like the register_acc_op decarators,
+                                    the target can either be a string (e.g. for op == "call_method")
+                                    or a callable (e.g. for op == "call_function").
+    """
+    if mod.training:
+        warnings.warn(
+            "acc_tracer does not support currently support models for training."
+            " Calling eval on model before tracing."
+        )
+        mod.eval()
+
+    assert isinstance(sample_inputs, (list, tuple))
+
+    # Rewrite the module to make it symbolic traceable, and then trace it.
+    traced = rewriter_base_trace(mod, ast_rewriter_allow_list, leaf_module_list)
+
+    # Now remove all assertions and exceptions if requested.
+    if remove_assertions:
+        _remove_assertions(traced)
+    if remove_exceptions:
+        _remove_exceptions(traced)
+
+    # Cleanup any dead code from the original module as well as resulting dead
+    # nodes after removing assertions and exceptions.
+    traced.graph.eliminate_dead_code()
+    traced.recompile()
+
+    # Run shape prop to add node.meta["type"] to nodes, needed for NormalizeArgs.
+    acc_shape_prop.AccShapeProp(traced).propagate(*sample_inputs)
+    # Swap out tensor_meta for tensor_rank, because we don't actually want to rely on
+    # tensor_meta yet for normalization/lowering, though rank shouldn't change.
+    _replace_tensor_meta_with_rank(traced)
+    # Now normalize args/kwargs to make default values visible. Leave args/kwargs as
+    # they were, since all-kwarg normalization is broken, and we don't need it anyway.
+    traced = NormalizeArgs(traced, normalize_to_only_use_kwargs=False).transform()
+
+    # Normalize to acc-specialized wrappers for consistency across op naming and
+    # ensuring all kwarg usage.
+    if use_acc_normalization:
+        acc_normalizer.normalize(
+            traced, acc_normalization_block_list=acc_normalization_block_list
+        )
+
+    traced.recompile()
+
+    # Run shape prop to again to populate tensor_meta after normalize.
+    acc_shape_prop.AccShapeProp(traced).propagate(*sample_inputs)
+
+    return traced
diff --git a/fx2ait/fx2ait/acc_tracer/acc_utils.py b/fx2ait/fx2ait/acc_tracer/acc_utils.py
new file mode 100644
index 000000000..586c20b6c
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/acc_utils.py
@@ -0,0 +1,201 @@
+import inspect
+import logging
+import os
+import re
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.fx
+from torch.fx.immutable_collections import immutable_dict, immutable_list
+from torch.fx.node import _get_qualified_name
+from torch.fx.passes import graph_drawer
+from torch.fx.passes.shape_prop import TensorMetadata
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+def get_target_from_module(mod: torch.nn.Module, target: str):
+    """
+    Gets `target` from `mod` and returns it. If `target` is empty then returns `mod.`
+    """
+    if target == "":
+        return mod
+
+    target_atoms = target.split(".")
+    curr_obj = mod
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(curr_obj, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistent target '{'.'.join(target_atoms[:i])}'; "
+                f" original whole target: '{target}'"
+            )
+        curr_obj = getattr(curr_obj, atom)
+    return curr_obj
+
+
+def get_attr(node: torch.fx.Node) -> Any:
+    """
+    Returns the underlying attr for a given node which
+    must be of type get_attr.
+    """
+    assert node.op == "get_attr", "Expected a get_attr node"
+    return get_target_from_module(node.graph.owning_module, str(node.target))
+
+
+def is_acc_op(node_or_target: Union[Callable, torch.fx.Node]) -> bool:
+    """
+    Returns whether `node_or_target` is an acc_op. If it's a node, then checks whether
+    it's a call_function target is from the acc_ops module. Otherwise it's already
+    the target, which is similarly checked to see if it's from the acc_ops module.
+    """
+    if isinstance(node_or_target, torch.fx.Node):
+        # All acc_ops are call_functions.
+        if node_or_target.op != "call_function":
+            return False
+        target = node_or_target.target
+    else:
+        target = node_or_target
+    return "acc_ops" in target.__module__
+
+
+def is_acc_op_with_kwarg(
+    node_or_target: Union[Callable, torch.fx.Node], kwarg: str
+) -> bool:
+    """
+    Helper that inspects `node_or_target` and returns whether it is an acc_op node
+    (or a target for an acc_op) that has an arg signature that includes `kwarg`.
+    """
+    if not is_acc_op(node_or_target):
+        return False
+
+    target = (
+        node_or_target.target
+        if isinstance(node_or_target, torch.fx.Node)
+        else node_or_target
+    )
+    assert not isinstance(target, str)
+    return kwarg in inspect.signature(inspect.unwrap(target)).parameters
+
+
+def build_raw_tensor_meta(
+    shape=None,
+    dtype=None,
+    requires_grad=None,
+    stride=None,
+    memory_format=None,
+    is_quantized=None,
+    qparams=None,
+):
+    return TensorMetadata(**locals())
+
+
+def draw_graph(traced: torch.fx.GraphModule, fname: str, figname: str = "fx_graph"):
+    base, ext = os.path.splitext(fname)
+    if not ext:
+        ext = ".svg"
+    _LOGGER.info(f"Writing FX graph to file: {base}{ext}")
+    g = graph_drawer.FxGraphDrawer(traced, figname)
+    x = g.get_main_dot_graph()
+    try:
+        getattr(x, "write_" + ext.lstrip("."))(fname)
+    except OSError as e:
+        _LOGGER.error(f"Failed to write the FX graph due to: {e}")
+
+
+def get_model_info_str(gm: torch.fx.GraphModule, header: Optional[str] = None):
+    """
+    Print out info of the provided `gm`.
+    If `header` is provided then it's included in the printed string.
+    """
+    ops_and_counts: Dict[Callable, int] = {}
+    placeholder_count = get_attr_count = call_method_count = call_module_count = 0
+    for node in gm.graph.nodes:
+        if node.op == "call_function":
+            ops_and_counts[node.target] = ops_and_counts.get(node.target, 0) + 1
+        elif node.op == "placeholder":
+            placeholder_count += 1
+        elif node.op == "get_attr":
+            get_attr_count += 1
+        elif node.op == "call_method":
+            call_method_count += 1
+        elif node.op == "call_module":
+            call_module_count += 1
+        elif node.op == "output":
+            output_count = len(node.args[0]) if isinstance(node.args[0], tuple) else 1
+        else:
+            raise RuntimeError(f"Unknown node found: {node.format_node()}")
+
+    header = "" if header is None else f" [{header}]"
+    model_info_str = f"Model Info{header}:\n"
+    model_info_str += f"> placeholder: {placeholder_count}\n"
+    model_info_str += f"> get_attr: {get_attr_count}\n"
+    model_info_str += f"> output: {output_count}\n"
+    if call_module_count != 0:
+        model_info_str += f"> WARNING: call_module: {call_module_count}"
+    if call_method_count != 0:
+        model_info_str += f"> WARNING: call_method: {call_method_count}"
+
+    # Sort and print all the other ops. Sort so it's deterministic between runs and
+    # easier to parse.
+    pretty_ops_and_counts: List[Tuple[str, int]] = []
+    for op, count in ops_and_counts.items():
+        pretty_ops_and_counts.append((_get_qualified_name(op), count))
+    pretty_ops_and_counts.sort()
+    for op_str, count in pretty_ops_and_counts:
+        model_info_str += f"> {op_str}: {count}\n"
+
+    return model_info_str
+
+
+def get_unique_attr_name_in_module(mod_traced: torch.fx.GraphModule, name: str) -> str:
+    """
+    Make sure the name is unique (in a module) and can represents an attr.
+    """
+    # Delete all characters that are illegal in a Python identifier.
+    name = re.sub("[^0-9a-zA-Z_]+", "_", name)
+    if name[0].isdigit():
+        name = f"_{name}"
+    # Now make sure it is in fact unique to the module by incrementing suffix value.
+    while hasattr(mod_traced, name):
+        match = re.match(r"(.*)_(\d+)$", name)
+        if match is None:
+            name = name + "_1"
+        else:
+            base, num = match.group(1, 2)
+            name = f"{base}_{int(num) + 1}"
+
+    return name
+
+
+def map_tensor_metadata(a: Any, fn: Callable):
+    """
+    Map some `fn` to `a`, where `a` is either a TensorMetadata, or else a tuple/list/dict
+    recursively containing TensorMetadata.
+    """
+    if isinstance(a, int):
+        return 1
+    elif a is None:
+        return 1
+    elif isinstance(a, TensorMetadata):
+        return fn(a)
+    elif isinstance(a, tuple):
+        return tuple(map_tensor_metadata(elem, fn) for elem in a)
+    elif isinstance(a, dict):
+        return immutable_dict(
+            {name: map_tensor_metadata(elem, fn) for name, elem in a.items()}
+        )
+    assert isinstance(
+        a, list
+    ), f"Only supporting tuple/list/TensorMetadata, but found {type(a)}"
+    return immutable_list(map_tensor_metadata(elem, fn) for elem in a)
+
+
+def get_tensor_meta(node: torch.fx.Node) -> TensorMetadata:
+    tensor_meta = node.meta.get("tensor_meta")
+
+    if not tensor_meta:
+        raise RuntimeError(
+            f"Node has no tensor metadata associated with it! "
+            f"Check that shape propagation has run. {node.format_node()}"
+        )
+    return tensor_meta
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py b/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
new file mode 100644
index 000000000..915505491
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
@@ -0,0 +1,55 @@
+from .acc_ops import *  # isort:skip # noqa: F403 F401
+from .ait_acc_ops import *  # noqa: F403 F401
+import logging
+
+from .acc_normalizer import (
+    _normalization_dict,
+    register_acc_op_mapping,
+    register_custom_acc_mapper_fn,
+)
+
+from .ait_acc_ops_registry import get_ait_acc_op_mappers, get_custom_ait_acc_op_mappers
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def update_acc_op_mappers_for_ait() -> None:
+    """
+    This function allows to replace some of the default acc_ops mappers with
+    custom mappers. Custom mappers are defined in the 'ait_acc_ops.py' file.
+    """
+    ait_acc_op_mappers = get_ait_acc_op_mappers()
+    custom_ait_acc_op_mappers = get_custom_ait_acc_op_mappers()
+
+    logger.info(
+        "Found %s ait mappers, %s custom ait op mappers",
+        len(ait_acc_op_mappers),
+        len(custom_ait_acc_op_mappers),
+    )
+
+    for op_and_target, mapper in ait_acc_op_mappers.items():
+        if op_and_target in _normalization_dict:
+            logger.info("Removing %s from acc normalization dict", op_and_target)
+            del _normalization_dict[op_and_target]
+
+        logger.info("Adding AIT acc mapper for %s", op_and_target)
+        register_acc_op_mapping(
+            op_and_target,
+            mapper.arg_replacement_tuples,
+            mapper.kwargs_to_move_to_acc_out_ty,
+        )(mapper.new_fn_target)
+
+    for op_and_target, mapper in custom_ait_acc_op_mappers.items():
+        if op_and_target in _normalization_dict:
+            logger.info("Removing %s from acc normalization dict", op_and_target)
+            del _normalization_dict[op_and_target]
+
+        logger.info("Adding custom AIT acc mapper for %s", op_and_target)
+        register_custom_acc_mapper_fn(
+            op_and_target,
+            mapper.arg_replacement_tuples,
+            mapper.needs_shapes_for_normalization,
+            mapper.allow_normalize_from_torch_package,
+        )(mapper.custom_mapping_fn)
+
+    logger.info("Completed updating acc mappers")
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
new file mode 100644
index 000000000..b291ebdc6
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
@@ -0,0 +1,26 @@
+import torch
+
+from fx2ait.acc_tracer.acc_normalizer import register_acc_op
+
+from fx2ait.acc_tracer.ait_acc_ops_registry import ait_register_acc_op_mapping
+
+
+@ait_register_acc_op_mapping(
+    op_and_target=("call_method", "split"),
+    arg_replacement_tuples=[
+        ("tensor", "input"),
+        ("split_size_or_sections", "split_size_or_sections"),
+        ("dim", "dim"),
+    ],
+)
+@ait_register_acc_op_mapping(
+    op_and_target=("call_function", torch.split),
+    arg_replacement_tuples=[
+        ("tensor", "input"),
+        ("split_size_or_sections", "split_size_or_sections"),
+        ("dim", "dim"),
+    ],
+)
+@register_acc_op
+def split(*, input, split_size_or_sections, dim=0):
+    return torch.split(input, split_size_or_sections, dim)
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py b/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
new file mode 100644
index 000000000..c7f96b7ed
--- /dev/null
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
@@ -0,0 +1,92 @@
+from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
+
+
+class AitAccOpMapper(NamedTuple):
+    new_fn_target: Callable
+    arg_replacement_tuples: Optional[
+        List[
+            Union[
+                Tuple[Union[str, Tuple[str, ...]], str],
+                Tuple[Union[str, Tuple[str, ...]], str, bool],
+            ]
+        ]
+    ]
+    kwargs_to_move_to_acc_out_ty: Optional[
+        List[Union[Tuple[str, str, bool], Tuple[str, str]]]
+    ]
+
+
+class CustomAitAccOpMapper(NamedTuple):
+    custom_mapping_fn: Callable
+    arg_replacement_tuples: List[
+        Union[
+            Tuple[Union[str, Tuple[str, ...]], str],
+            Tuple[Union[str, Tuple[str, ...]], str, bool],
+        ]
+    ]
+    needs_shapes_for_normalization: bool
+    allow_normalize_from_torch_package: bool
+
+
+_AIT_ACC_OP_MAPPERS: Dict[Tuple[str, Union[str, Callable]], AitAccOpMapper] = {}
+_CUSTOM_AIT_ACC_OP_MAPPERS: Dict[
+    Tuple[str, Union[str, Callable]], CustomAitAccOpMapper
+] = {}
+
+
+def ait_register_acc_op_mapping(
+    op_and_target: Tuple[str, Union[str, Callable]],
+    arg_replacement_tuples: Optional[
+        List[
+            Union[
+                Tuple[Union[str, Tuple[str, ...]], str],
+                Tuple[Union[str, Tuple[str, ...]], str, bool],
+            ]
+        ]
+    ] = None,
+    kwargs_to_move_to_acc_out_ty: Optional[
+        List[Union[Tuple[str, str, bool], Tuple[str, str]]]
+    ] = None,
+):
+    def insert(new_fn_target: Callable):
+        _AIT_ACC_OP_MAPPERS[op_and_target] = AitAccOpMapper(
+            new_fn_target=new_fn_target,
+            arg_replacement_tuples=arg_replacement_tuples,
+            kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty,
+        )
+        return new_fn_target
+
+    return insert
+
+
+def ait_register_custom_acc_mapper_fn(
+    op_and_target: Tuple[str, Union[str, Callable]],
+    arg_replacement_tuples: List[
+        Union[
+            Tuple[Union[str, Tuple[str, ...]], str],
+            Tuple[Union[str, Tuple[str, ...]], str, bool],
+        ]
+    ],
+    needs_shapes_for_normalization=False,
+    allow_normalize_from_torch_package=False,
+):
+    def insert(custom_mapping_fn: Callable):
+        _CUSTOM_AIT_ACC_OP_MAPPERS[op_and_target] = CustomAitAccOpMapper(
+            custom_mapping_fn=custom_mapping_fn,
+            arg_replacement_tuples=arg_replacement_tuples,
+            needs_shapes_for_normalization=needs_shapes_for_normalization,
+            allow_normalize_from_torch_package=allow_normalize_from_torch_package,
+        )
+        return custom_mapping_fn
+
+    return insert
+
+
+def get_ait_acc_op_mappers() -> Dict[Tuple[str, Union[str, Callable]], AitAccOpMapper]:
+    return _AIT_ACC_OP_MAPPERS
+
+
+def get_custom_ait_acc_op_mappers() -> Dict[
+    Tuple[str, Union[str, Callable]], CustomAitAccOpMapper
+]:
+    return _CUSTOM_AIT_ACC_OP_MAPPERS
diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
new file mode 100644
index 000000000..ba1376605
--- /dev/null
+++ b/fx2ait/fx2ait/ait_module.py
@@ -0,0 +1,41 @@
+from typing import List
+
+import torch
+
+
+class AITModule(torch.nn.Module):
+    def __init__(
+        self,
+        engine=None,
+    ):
+        super(AITModule, self).__init__()
+        self.engine = engine
+
+    def forward(self, *inputs):
+        outputs = self.engine.forward(inputs)
+        if len(outputs) == 1:
+            return outputs[0]
+        return tuple(outputs)
+
+    def profile(
+        self, inputs: List[torch.Tensor], filename: str, num_iters: int
+    ) -> None:
+        """
+        Profile the AIT module and save the report to a file. The AITModule
+        must be created with allow_scripting=False.
+        inputs: sample inputs
+        filename: report filename
+        num_iters: number of iterations per op run
+        """
+        self.engine.profile(inputs, filename, num_iters)
+
+    @staticmethod
+    def create_ait_module_wrapper(engine, trace_ait_module, *inputs):
+        """
+        Some use cases need to torch.jit.script a model with AITModules in
+        it, but TorchScript does not support variadic inputs. We can get
+        around this by scripting the AITModule with some sample inputs.
+        This is turned in by passing allow_scripting=True.
+        """
+        mod = AITModule(engine)
+        return torch.jit.trace(mod, inputs) if trace_ait_module else mod
diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
new file mode 100644
index 000000000..e05315118
--- /dev/null
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -0,0 +1,134 @@
+from typing import Any, Dict, Iterable, Mapping, Sequence
+
+import torch
+import torch.fx.passes.operator_support as ops
+import torch.fx.passes.splitter_base as splitter_base
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.ait_module import AITModule
+
+from fx2ait.converters.converter_registry import AIT_CONVERTERS
+from fx2ait.fx2ait import AITInterpreter
+from torch.fx.passes.operator_support import create_op_support, OperatorSupportBase
+from torch.fx.passes.tools_common import get_acc_ops_name
+
+try:
+    torch.ops.load_library("//deeplearning/ait:AITModel")
+except BaseException:
+    torch.ops.load_library("build/libait_model.so")
+
+
+_VIEW_OPS = frozenset(
+    (
+        acc_ops.unsqueeze,
+        acc_ops.squeeze,
+        acc_ops.reshape,
+        acc_ops.flatten,
+    )
+)
+
+DEFAULT_MIN_ACC_MODULE_SIZE = 10
+
+
+def _decline_if_would_trigger_extra_copies(
+    has_converter: OperatorSupportBase,
+) -> OperatorSupportBase:
+    def _impl(
+        submodules: Mapping[str, torch.nn.Module],
+        node: torch.fx.Node,
+    ):
+        def _any_supported(nodes: Sequence[torch.fx.Node]) -> bool:
+            return any(
+                has_converter.is_node_supported(submodules, node) for node in nodes
+            )
+
+        if node.target not in _VIEW_OPS:
+            return True
+
+        if _any_supported(node.users) or _any_supported(node.all_input_nodes):
+            return True
+
+        return False
+
+    return create_op_support(_impl)
+
+
+def create_ait_operator_support(
+    use_implicit_batch_dim=True, op_lowering_disallow_list=None
+) -> ops.OperatorSupportBase:
+    """Creates an `OperatorSupportBase` instance used for AIT splitting purpose."""
+    # Create an `OperatorSupport` that declares a node supported if it
+    # finds a registered AIT converter.
+    support_dict: Dict[str, None] = {}
+    for k in AIT_CONVERTERS.keys():
+        # may need to switch the op name here
+        support_dict[get_acc_ops_name(k)] = None
+    supported_if_converter_registered = ops.OperatorSupport(support_dict=support_dict)
+
+    op_lowering_disallow_set = (
+        set() if op_lowering_disallow_list is None else set(op_lowering_disallow_list)
+    )
+    return ops.chain(
+        ops.OpSupports.decline_if_node_in_names(op_lowering_disallow_set),
+        # 1. We only support subgraphs with torch.Tensor inputs for now
+        ops.OpSupports.decline_if_input_dtype(torch.int64),
+        ops.OpSupports.decline_if_input_dtype(torch.int32),
+        ops.OpSupports.decline_if_input_dtype(dict),
+        # 2. Node is supported if it has AIT converter:
+        supported_if_converter_registered,
+        # 3. Decline nodes that would trigger extra copies. This can happen if
+        # we have an output that is just a view of an input, for example.
+        # Note that this is not required for correctness, it is merely an
+        # optimization.
+        _decline_if_would_trigger_extra_copies(supported_if_converter_registered),
+    )
+
+
+class AITSplitterSettings(splitter_base._SplitterSettingBase):
+    # TODO: Fix this once pytorch nightly is updated
+    def __init__(self, min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE):
+        super().__init__()
+        self.min_acc_module_size = min_acc_module_size
+
+
+class AITSplitter(splitter_base._SplitterBase):
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        sample_input: Sequence[Any],
+        operator_support: ops.OperatorSupportBase = None,
+        settings: AITSplitterSettings = None,
+    ):
+        if not settings:
+            settings = AITSplitterSettings()
+        if not operator_support:
+            operator_support = create_ait_operator_support()
+        super().__init__(
+            module,
+            sample_input,
+            operator_support,
+            settings,
+            non_acc_submodule_name="_run_on_gpu_",
+        )
+
+    def _lower_model_to_backend(
+        self, mod: torch.fx.GraphModule, inputs: Iterable[torch.Tensor]
+    ):
+        """
+        Lower a GraphModule `mod` to AITemplate with `inputs`.
+        """
+        # Current code for lowering is place-holder, subject to future change
+        # based on feeds model's actual status
+        interp = AITInterpreter(mod, [inputs])
+        interpreter_result = interp.run(*inputs)
+        return AITModule(
+            torch.classes.fb.AITModel(
+                interpreter_result.engine.lib_path,
+                interpreter_result.input_names,
+                interpreter_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  # num_runtimes
+            )
+        )
+
+    # TODO add _find_culprit once minimizer completed
diff --git a/fx2ait/fx2ait/cache.py b/fx2ait/fx2ait/cache.py
new file mode 100644
index 000000000..88fd2b3db
--- /dev/null
+++ b/fx2ait/fx2ait/cache.py
@@ -0,0 +1,13 @@
+import os.path as path
+
+
+def save_profile_cache(remote_cache_file_path, cache_path):
+    with open(cache_path, "rb") as f:
+        with open(remote_cache_file_path, "wb") as target:
+            target.write(f.read())
+
+
+def load_profile_cache(remote_cache_file_path, cache_bytes):
+    if path.isfile(remote_cache_file_path):
+        with open(remote_cache_file_path, "rb") as cache_content:
+            cache_bytes.write(cache_content.read())
diff --git a/fx2ait/fx2ait/converters/__init__.py b/fx2ait/fx2ait/converters/__init__.py
new file mode 100644
index 000000000..d463837ae
--- /dev/null
+++ b/fx2ait/fx2ait/converters/__init__.py
@@ -0,0 +1,3 @@
+from .ait_converters import *  # noqa: F401 F403
+from .aten2ait_converters import *  # noqa: F401 F403
+from .ait_module_converters import *  # noqa: F401 F403
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
new file mode 100644
index 000000000..cd62f4378
--- /dev/null
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -0,0 +1,1091 @@
+import logging
+import math
+import operator
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+
+from aitemplate.compiler.public import (
+    avg_pool2d,
+    bmm_rrr,
+    chunk,
+    clamp,
+    concatenate,
+    conv2d,
+    conv2d_bias,
+    dynamic_slice,
+    elementwise,
+    expand,
+    flatten,
+    FuncEnum,
+    gemm_rcr,
+    gemm_rrr,
+    getitem,
+    IntImm,
+    IntVar,
+    IntVarTensor,
+    layernorm,
+    max_pool2d,
+    nhwc3to8,
+    pad_last_dim,
+    permute,
+    reduce_mean,
+    reduce_sum,
+    reshape,
+    size,
+    softmax,
+    split,
+    squeeze,
+    Tensor as AITTensor,
+    topk,
+    tuple_construct,
+    unsqueeze,
+    var,
+    vector_norm,
+)
+
+from fx2ait.acc_tracer import acc_ops, ait_acc_ops
+from torch.fx.node import Argument, Target
+
+from .converter_registry import ait_converter
+
+from .utils import (
+    create_binary_op,
+    create_reduce_op,
+    create_unary_op,
+    get_positive_dim,
+    identical_elem_tuple_to_int,
+    nchw2nhwc,
+    unify_dynamic_shape_name,
+)
+
+logger: logging.Logger = logging.getLogger(__name__)
+ConverterOutput = Union[AITTensor, Tuple[AITTensor, ...], List[IntVar], IntVar]
+
+
+@ait_converter(acc_ops.sigmoid)
+def acc_ops_sigmoid(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+    return elementwise(FuncEnum.SIGMOID)(input_val)
+
+
+@ait_converter(acc_ops.mul)
+def acc_ops_mul(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_binary_op(FuncEnum.MUL, args, kwargs, name)
+
+
+@ait_converter(acc_ops.div)
+def acc_ops_div(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_binary_op(FuncEnum.DIV, args, kwargs, name)
+
+
+@ait_converter(acc_ops.add)
+def acc_ops_add(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_binary_op(FuncEnum.ADD, args, kwargs, name)
+
+
+@ait_converter(acc_ops.sub)
+def acc_ops_sub(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_binary_op(FuncEnum.SUB, args, kwargs, name)
+
+
+@ait_converter(acc_ops.tanh)
+def acc_ops_tanh(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    return elementwise(FuncEnum.TANH)(input_val)
+
+
+@ait_converter(acc_ops.sum)
+def acc_ops_sum(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_reduce_op(reduce_sum, args, kwargs, name)
+
+
+@ait_converter(acc_ops.mean)
+def acc_ops_mean(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_reduce_op(reduce_mean, args, kwargs, name)
+
+
+@ait_converter(acc_ops.linear)
+def acc_ops_linear(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+
+    weight = kwargs["weight"]
+    assert isinstance(weight, AITTensor)
+
+    result = gemm_rcr()(input_val, weight)
+
+    bias = kwargs["bias"]
+    if bias is not None:
+        assert isinstance(bias, AITTensor)
+        result = elementwise(FuncEnum.ADD)(result, bias)
+
+    return result
+
+
+@ait_converter(acc_ops.unsqueeze)
+def acc_ops_unsqueeze(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    dim = kwargs["dim"]
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
+
+    return unsqueeze(dim)(input_val)
+
+
+@ait_converter(acc_ops.clamp)
+def acc_ops_clamp(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    result = input_val
+    min_val = kwargs.get("min")
+    max_val = kwargs.get("max")
+    return clamp()(result, min_val, max_val)
+
+
+@ait_converter(acc_ops.linalg_norm)
+def acc_ops_linalg_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    if "ord" not in kwargs or kwargs["ord"] != 2:
+        raise RuntimeError("AIT linalg_norm only supports ord=2 use case!")
+
+    # Hard code ord_kind=2 for l2 norm
+    l2_norm = vector_norm(
+        ord_kind=2, dim=kwargs["dim"], keepdim=kwargs["keepdim"], dtype=None
+    )
+
+    return l2_norm(input_val)
+
+
+@ait_converter(acc_ops.permute)
+def acc_ops_permute(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    permutation = kwargs["permutation"]
+
+    return permute()(input_val, permutation)
+
+
+@ait_converter(acc_ops.cat)
+def acc_ops_cat(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    tensors = kwargs["tensors"]
+    for t in tensors:
+        if not isinstance(t, AITTensor):
+            raise ValueError(f"Non-tensor inputs for {name}: {tensors}")
+
+    dim = kwargs["dim"]
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
+
+    # TODO:  unify_dynamic_shape_name is a hack to workaround AIT's dynamic shape requirement.
+    # We will remove it after AIT provides vanilla support.
+    for i in range(len(tensors) - 1):
+        unify_dynamic_shape_name(tensors[i], tensors[i + 1])
+    return concatenate()(tensors, dim=dim)
+
+
+@ait_converter(acc_ops.sign)
+def acc_ops_sign(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.SIGN)(input_val)
+
+
+@ait_converter(acc_ops.abs)
+def acc_ops_abs(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.ABS)(input_val)
+
+
+@ait_converter(acc_ops.log)
+def acc_ops_log(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.LOGE)(input_val)
+
+
+@ait_converter(acc_ops.var)
+def acc_ops_var(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    op = var(
+        dim=kwargs["dim"],
+        unbiased=kwargs["unbiased"],
+        keepdim=kwargs["keepdim"],
+        dtype=None,
+    )
+    return op(input_val)
+
+
+@ait_converter(acc_ops.softmax)
+def acc_ops_softmax(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    dim = kwargs["dim"]
+
+    return softmax()(input_val, dim)
+
+
+@ait_converter(acc_ops.relu)
+def acc_ops_relu(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.RELU)(input_val)
+
+
+@ait_converter(acc_ops.leaky_relu)
+def acc_ops_leaky_relu(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    negative_slope = kwargs["negative_slope"]
+    return elementwise(FuncEnum.LRELU)(input_val, negative_slope)
+
+
+@ait_converter(acc_ops.squeeze)
+def acc_ops_squeeze(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    dim = kwargs["dim"] if "dim" in kwargs else None
+    op = squeeze(dim)
+    return op(input_val)
+
+
+@ait_converter(acc_ops.size)
+def acc_ops_size(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    if "dim" in kwargs:
+        raise NotImplementedError(
+            f"In {name} found 'dim' in size() which is not supported"
+        )
+
+    return size()(input_val)
+
+
+@ait_converter(acc_ops.getitem)
+def acc_ops_getitem(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    idx = kwargs["idx"]
+    if isinstance(idx, slice) or (
+        isinstance(idx, Sequence) and any(isinstance(x, slice) for x in idx)
+    ):
+        return acc_ops_slice(target, args, kwargs, name)
+    if isinstance(input_val, AITTensor):
+        return acc_ops_slice(target, args, kwargs, name)
+
+    if isinstance(kwargs["idx"], int):
+        idx = get_positive_dim(idx, len(input_val))
+
+    if all(isinstance(i, IntImm) for i in input_val):
+        return operator.getitem(input_val, kwargs["idx"])
+    else:
+        return getitem()(input_val, idx)
+
+
+@ait_converter(acc_ops.slice_tensor)
+def acc_ops_slice(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    idx = kwargs["idx"]
+    if isinstance(input_val, (tuple, list)):
+        return operator.getitem(input_val, idx)
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    rank = input_val._rank()
+    if not isinstance(idx, Sequence):
+        idx = [idx]
+    op = dynamic_slice()
+
+    def num_slice_types(slices):
+        return sum(1 for s in slices if isinstance(s, slice) or isinstance(s, int))
+
+    # Replace ellipsis with expand slices.
+    num_ellipsis = rank - num_slice_types(idx)
+    expand_idx = []
+    for i in idx:
+        if i == Ellipsis:
+            # pass explicit start to guard against negative num_ellipsis
+            for _ in range(0, num_ellipsis):
+                expand_idx.append(slice(None, None, None))
+        else:
+            expand_idx.append(i)
+    idx = expand_idx
+
+    # Record indices that need to be either:
+    #   (1) sequeezed if Slice-index is of int; or
+    #   (2) unsqueezed if Slice-index is of None
+    # Each element of the list is a tuple of (int, func), where the second item
+    # is either squeeze or unsqueeze function and the first
+    # item gives the index to be squeezed or unsqueezed.
+    squeezable_indices = []
+    # the number of the indices of type None
+    num_none_indices = 0
+    start, end = [], []
+    for index, i in enumerate(idx):
+        if i is None:
+            squeezable_indices.append((index, unsqueeze))
+            num_none_indices += 1
+            continue
+        if isinstance(i, int):
+            i = get_positive_dim(i, input_val.shape()[index].value())
+            # If we pass an int, we need to squeeze this dim.
+            # Note that because we skip None-indices before, so we adjust
+            # the index by subtracting the number of None-indices.
+            squeezable_indices.append((index - num_none_indices, squeeze))
+        # if idx is slice, AIT only support slice.step == 1
+        # TODO remove check once slice support step != 1
+        if isinstance(i, slice) and i.step not in (1, None):
+            raise ValueError(
+                f"Slice tensor only support step=1 case, get step={i.step}."
+            )
+        start.append(i.start if isinstance(i, slice) else i)
+        end.append(i.stop if isinstance(i, slice) else (i + 1 if i is not None else i))
+
+    # append hiden dim at end
+    while len(start) < rank:
+        start.append(0)
+        end.append(None)
+
+    output = op(input_val, start, end)
+    for dim, squeeze_func in reversed(squeezable_indices):
+        output = squeeze_func(dim)(output)
+    return output
+
+
+@ait_converter(acc_ops.reshape)
+def acc_ops_reshape(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    shape = kwargs["acc_out_ty"].shape
+
+    return reshape()(input_val, shape)
+
+
+# TODO (T124248862)
+# We are waiting for full support of topk including:
+# actual return values
+# dim,
+# largest flag,
+# sorted flag
+@ait_converter(acc_ops.topk)
+def acc_ops_topk(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    k = kwargs["k"]
+    if not isinstance(k, int):
+        raise ValueError(f"Unexpected value for k in {name}: {k}")
+
+    dim = kwargs["dim"] if "dim" in kwargs else None
+    if dim is not None and dim != -1:
+        raise NotImplementedError(
+            f"Found 'dim' in {name} which is not supported: {dim}"
+        )
+
+    largest = kwargs["largest"] if "largest" in kwargs else None
+    if largest is not None and largest is not True:
+        raise NotImplementedError(
+            f"Found 'largest' in {name} which is not supported: {largest}"
+        )
+
+    # current AIT implementation only returns indices, so 'sorted' does not apply. Ignore if specified.
+    sorted = kwargs["sorted"] if "sorted" in kwargs else None
+    if sorted is not None:
+        logger.warning("Ignoring the value of 'sorted': %s", sorted)
+
+    result_indices = topk(k=k)(input_val)
+    # current AIT implementation only returns indices. to match the torch topk return types, create dummy values
+    #
+    # TODO remove the hard coded dtype below, once we know whether AIT will support fp32 (thus providing an option of
+    # fp16 or fp32 for values)
+    return (
+        AITTensor(
+            shape=result_indices.shape(), dtype="float16", name=f"{name}_result_values"
+        ),
+        result_indices,
+    )
+
+
+@ait_converter(acc_ops.tuple_construct)
+def acc_ops_tuple_construct(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    tensors = kwargs["tensors"]
+    return tuple_construct()(*tensors)
+
+
+@ait_converter(acc_ops.nan_to_num)
+def acc_ops_nan_to_num(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    nan = 0 if kwargs["nan"] is None else kwargs["nan"]
+
+    def _get_dtype(dtype: str):
+        if dtype in ("float", "float32"):
+            return np.float32
+        elif dtype == "float16":
+            return np.float16
+        else:
+            raise NotImplementedError(f"Unsupported dtype {dtype} for nan_to_num")
+
+    input_dtype = input_val.dtype()
+    np_dtype = _get_dtype(input_dtype)
+    posinf = np.finfo(np_dtype).max if kwargs["posinf"] is None else kwargs["posinf"]
+    neginf = np.finfo(np_dtype).min if kwargs["neginf"] is None else kwargs["neginf"]
+    return elementwise(FuncEnum.NAN_TO_NUM)(
+        input_val,
+        AITTensor(value=nan, shape=[], name="nan", dtype=input_dtype),
+        AITTensor(value=posinf, shape=[], name="posinf", dtype=input_dtype),
+        AITTensor(value=neginf, shape=[], name="neginf", dtype=input_dtype),
+    )
+
+
+@ait_converter(acc_ops.layer_norm)
+def acc_ops_layer_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    shape = kwargs["normalized_shape"]
+    if shape is None or len(shape) == 0:
+        raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
+    weight = kwargs["weight"]
+    bias = kwargs["bias"]
+    normalized_shape = []
+    if all(isinstance(i, int) for i in shape):
+        for i in shape:
+            normalized_shape.append(IntImm(i))
+    elif all(isinstance(i, IntImm) or isinstance(i, IntVarTensor) for i in shape):
+        normalized_shape = shape
+    else:
+        raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
+    return layernorm()(input_val, weight, bias, normalized_shape)
+
+
+@ait_converter(acc_ops.flatten)
+def acc_ops_flatten(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    start_dim = kwargs["start_dim"] if "start_dim" in kwargs else 0
+    end_dim = kwargs["end_dim"] if "end_dim" in kwargs else -1
+
+    return flatten(start_dim=start_dim, end_dim=end_dim)(input_val)
+
+
+@ait_converter(acc_ops.matmul)
+def acc_ops_matmul(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    lhs = kwargs["input"]
+    if not isinstance(lhs, AITTensor):
+        raise ValueError(f"Unexpected left operand in {name}: {lhs}")
+    # TODO: ideally we shouldn't be using _rank()/_shape()
+    lhs_shape = lhs.shape()
+    if len(lhs_shape) < 2:
+        raise ValueError(f"Not enough dims for matmul in {name}: {lhs_shape}")
+
+    rhs = kwargs["other"]
+    if not isinstance(rhs, AITTensor):
+        raise ValueError(f"Unexpected right operand in {name}: {rhs}")
+    # TODO: ideally we shouldn't be using _rank()/_shape()
+    rhs_shape = rhs.shape()
+    if len(rhs_shape) < 2:
+        raise ValueError(f"Not enough dims for matmul in {name}: {rhs_shape}")
+
+    if len(rhs_shape) == 2:
+        return gemm_rrr()(lhs, rhs)
+    elif len(lhs_shape) <= 3 and len(rhs_shape) <= 3:
+        return bmm_rrr()(lhs, rhs)
+    elif len(lhs_shape) == 4 and len(rhs_shape) == 4 and lhs_shape[1] == rhs_shape[1]:
+        assert all(isinstance(i, IntImm) for i in lhs_shape)
+        assert all(isinstance(i, IntImm) for i in rhs_shape)
+        # Current AIT bmm only supports 3-dim. Use reshape to workaround.
+        reshape_op_0 = reshape()
+        batch_size = lhs_shape[0].value()
+        M = lhs_shape[2].value()
+        K = lhs_shape[3].value()
+        channel = lhs_shape[1].value()
+        shape_0 = (batch_size * channel, M, K)
+        reshape_op_1 = reshape()
+        N = rhs_shape[3].value()
+        if K != rhs_shape[2].value():
+            raise ValueError(
+                f"K dim mismatch on matmaul. Expected: [N, K] X [K, M]. Found: : [{M}, {K}] X [{rhs_shape[2].value()}, {N}]"
+            )
+
+        shape_1 = (batch_size * channel, K, N)
+        reshape_op_2 = reshape()
+        shape_2 = (batch_size, channel, M, N)
+        return reshape_op_2(
+            bmm_rrr()(reshape_op_0(lhs, shape_0), reshape_op_1(rhs, shape_1)), shape_2
+        )
+    else:
+        raise NotImplementedError(
+            f"This case is unsupported in {name}: {len(lhs_shape)} and {len(rhs_shape)}"
+        )
+
+
+@ait_converter(acc_ops.chunk)
+def acc_ops_chunk(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    shape = input_val.shape()
+    target_dim = get_positive_dim(kwargs["dim"], len(shape))
+    chunks = min(kwargs["chunks"], shape[target_dim].value())
+    assert isinstance(
+        shape[target_dim], IntImm
+    ), f"Cannot perform chunk on dynamic dim! Get target dim {target_dim}."
+
+    return chunk()(input_val, chunks, dim=target_dim)
+
+
+@ait_converter(ait_acc_ops.split)
+def ait_acc_ops_split(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Non-tensor inputs for {name}: {input_val}")
+
+    split_size_or_sections = kwargs["split_size_or_sections"]
+    if not isinstance(split_size_or_sections, (int, list)):
+        raise ValueError(
+            f"Unexpected value for split_size_or_sections in {name}: {split_size_or_sections}"
+        )
+
+    dim = kwargs["dim"]
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected value for dim in {name}: {dim}")
+
+    return split()(input_val, split_size_or_sections, dim)
+
+
+@ait_converter(acc_ops.expand)
+def ait_acc_ops_expand(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Non-tensor inputs for {name}: {input_val}")
+
+    sizes = kwargs["sizes"]
+    if not sizes:
+        raise ValueError("Expand sizes cannot be empty")
+
+    def _is_int_list(iterable):
+        return all(isinstance(dim, (int, IntVar, IntVarTensor)) for dim in iterable)
+
+    # sizes can either be a single int list or a list of ints.
+    if _is_int_list(sizes):
+        shape = sizes
+    elif len(sizes) == 1 and _is_int_list(sizes[0]):
+        shape = sizes[0]
+    else:
+        raise ValueError(
+            f"sizes argument can either be many ints or single int iterable, but got: {', '.join(str(type(dim)) for dim in sizes)}"
+        )
+
+    return expand()(input_val, shape)
+
+
+@ait_converter(acc_ops.batch_norm)
+def acc_ops_batch_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO @qxy11: Update channels-last assumption once AIT backend is updated
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    scale = elementwise(FuncEnum.DIV)(
+        kwargs["weight"],
+        elementwise(FuncEnum.ADD)(
+            elementwise(FuncEnum.SQRT)(kwargs["running_var"]),
+            AITTensor(shape=[], value=kwargs["eps"]),
+        ),
+    )
+    bias = elementwise(FuncEnum.SUB)(kwargs["bias"], kwargs["running_mean"])
+    matmul_result = elementwise(FuncEnum.MUL)(input_val, scale)
+    result = elementwise(FuncEnum.ADD)(matmul_result, bias)
+    return result
+
+
+def _choose_conv2d_op(
+    stride: int,
+    pad: int,
+    dilate: int,
+    x: AITTensor,
+    weight: AITTensor,
+    bias: Optional[AITTensor],
+) -> ConverterOutput:
+    """
+    Helper to choose conv2d vs. conv2d_bias op based on existence of bias
+    and pad channel input dim to 4/8
+    """
+    last_dim = x._attrs["shape"][-1]._attrs["values"][0]
+    # CUDA conv channel dim weights need to align w/ a multiple of 2/4/8
+    # if CI < 4, pad to 4; if 5 < CI < 8, pad to 8;
+    if last_dim < 4:
+        weight = pad_last_dim(len(weight._attrs["shape"]), 4)(weight)
+        x = pad_last_dim(len(x._attrs["shape"]), 4)(x)
+    elif last_dim in range(5, 8):
+        to_8 = nhwc3to8()
+        weight = to_8(weight)
+        x = to_8(x)
+    elif last_dim % 2 != 0:
+        return RuntimeError(
+            f"Conv2d is not implemented for input channel dim {last_dim}: it needs to be aligned to a multiple of 2/4/8"
+        )
+    if bias:
+        return conv2d_bias(stride=stride, pad=pad, dilate=dilate)(x, weight, bias)
+    else:
+        return conv2d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+
+
+@ait_converter(acc_ops.conv2d)
+def acc_ops_conv2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: qxy11: Update once channels-first format is supported
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    weight = kwargs["weight"]
+    assert isinstance(weight, AITTensor)
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    weight._attrs["shape"] = nchw2nhwc(weight._attrs["shape"])
+
+    bias = kwargs["bias"]
+    assert bias is None or isinstance(bias, AITTensor)
+
+    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    padding = identical_elem_tuple_to_int(kwargs["padding"])
+    dilation = identical_elem_tuple_to_int(kwargs["dilation"])
+
+    assert all(
+        isinstance(x, int) for x in [stride, padding, dilation]
+    ), "Expected int stride, padding, and dilation"
+
+    if kwargs["groups"] is None or kwargs["groups"] == 1:
+        result = _choose_conv2d_op(stride, padding, dilation, input_val, weight, bias)
+    else:
+        # Grouped conv doesn't currently work on AIT CUDA, manually map
+        groups = kwargs["groups"]
+        group_size = input_val.shape()[3]._attrs["values"][0] // groups
+        w_group_size = weight.shape()[0]._attrs["values"][0] // groups
+
+        def get_channel_dim_slice_idx(start, end, step):
+            all_none_slice = slice(None, None, None)
+            return (
+                all_none_slice,
+                all_none_slice,
+                all_none_slice,
+                slice(start, end, step),
+            )
+
+        def get_batch_dim_slice_idx(start, end, step):
+            return (slice(start, end, step),)
+
+        def make_slice(x, slice_idx, name):
+            return acc_ops_slice(
+                target,
+                args,
+                {
+                    "input": x,
+                    "idx": slice_idx,
+                },
+                name,
+            )
+
+        conv_groups = [
+            _choose_conv2d_op(
+                stride,
+                padding,
+                dilation,
+                make_slice(  # input_val[:,:,:,gs*i:gs*i + gs]
+                    input_val,
+                    get_channel_dim_slice_idx(
+                        i * group_size, i * group_size + group_size, 1
+                    ),
+                    f"{name}.slice_{i}",
+                ),
+                make_slice(  # weights[wgs*i:wgs*i + wgs,]
+                    weight,
+                    get_batch_dim_slice_idx(
+                        i * w_group_size, i * w_group_size + w_group_size, 1
+                    ),
+                    f"{name}.weight.slice_{i}",
+                ),
+                make_slice(  # bias[wgs*i:wgs*i + wgs,]
+                    bias,
+                    get_batch_dim_slice_idx(
+                        i * w_group_size, i * w_group_size + w_group_size, 1
+                    ),
+                    f"{name}.bias.slice_{i}",
+                ),
+            )
+            for i in range(groups)
+        ]
+        result = concatenate()(conv_groups, dim=3)
+
+    return result
+
+
+@ait_converter(acc_ops.max_pool2d)
+def acc_ops_max_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    kernel_size = identical_elem_tuple_to_int(kwargs["kernel_size"])
+    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    padding = identical_elem_tuple_to_int(kwargs["padding"])
+    ceil_mode = kwargs["ceil_mode"]
+    return_indices = kwargs["return_indices"]
+    if ceil_mode or return_indices:
+        raise RuntimeError(
+            "Non-default ceil_mode/count_include_pad/divisor_override not supported yet"
+        )
+    return max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+
+
+@ait_converter(acc_ops.avg_pool2d)
+def acc_ops_avg_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    kernel_size = identical_elem_tuple_to_int(kwargs["kernel_size"])
+    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    padding = identical_elem_tuple_to_int(kwargs["padding"])
+    ceil_mode = kwargs["ceil_mode"]
+    count_include_pad = kwargs["count_include_pad"]
+    divisor_override = kwargs["divisor_override"]
+    if ceil_mode or not count_include_pad or divisor_override:
+        raise RuntimeError(
+            "Non-default ceil_mode/count_include_pad/divisor_override not supported yet"
+        )
+    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+
+
+@ait_converter(acc_ops.adaptive_avg_pool2d)
+def acc_ops_adaptive_avg_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    output_size = identical_elem_tuple_to_int(kwargs["output_size"])
+    # FIXME: try to find a way to not explose internal date like this.
+    shape = [var._attrs["values"][0] for var in input_val._attrs["shape"]]
+    HI, WI, CI = shape[1], shape[2], shape[3]
+    if CI % 2 != 0:
+        raise RuntimeError(
+            f"AIT avg_pool2d expects input channel dim to align w/ a multiple of 2 but got {CI}"
+        )
+    if HI != WI:
+        raise RuntimeError(
+            f"adaptive_avg_pool2d currently only supports square input H/W but got H: {shape[1]} and W: {shape[2]}"
+        )
+    stride = HI // output_size
+    kernel_size = HI - (output_size - 1) * stride
+
+    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=0)(input_val)
+
+
+@ait_converter(acc_ops.contiguous)
+def acc_ops_contiguous(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return kwargs["input"]
+
+
+@ait_converter(acc_ops.gelu)
+def acc_ops_gelu(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    # For extra speedup, you can always lower to fast_gelu
+    if kwargs.get("approximate", None) == "tanh":
+        result = elementwise(FuncEnum.FASTGELU)(input_val)
+    else:
+        result = elementwise(FuncEnum.GELU)(input_val)
+    return result
+
+
+@ait_converter(acc_ops.pow)
+def acc_ops_pow(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    exponent = kwargs["exponent"]
+    return elementwise(FuncEnum.POW)(input_val, exponent)
+
+
+@ait_converter(acc_ops.tile)
+def acc_ops_tile(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    shape_dims = list(kwargs["dims"])
+    input_dim_len = len(input_val.shape())
+    result = input_val
+    if len(shape_dims) < input_dim_len:
+        for i in range(input_dim_len - len(shape_dims)):
+            shape_dims.insert(0, 1)
+    if input_dim_len < len(shape_dims):
+        shape = input_val.shape()
+        for i in range(len(shape_dims) - input_dim_len):
+            shape.insert(0, IntImm(1))
+        result = expand()(input_val, shape)
+
+    for i, shape in enumerate(shape_dims):
+        # Avoid operate on batch_size dim
+        if input_val.shape()[i]._attrs["name"] is not None:
+            continue
+        cat_groups = [result] * shape
+        result = concatenate()(cat_groups, dim=i)
+    return result
+
+
+@ait_converter(math.sqrt)
+def math_sqrt(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    return create_unary_op(FuncEnum.SQRT, args, kwargs, name)
diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
new file mode 100644
index 000000000..e4bba9013
--- /dev/null
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -0,0 +1,81 @@
+from typing import Any, Dict, OrderedDict, Tuple
+
+import numpy as np
+
+import torch
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import _TorchConstantTensorData
+from aitemplate.frontend import nn
+from torch.fx.node import Argument
+
+from .ait_converters import ConverterOutput
+from .converter_registry import ait_converter
+
+
+@ait_converter(torch.nn.modules.activation.MultiheadAttention)
+def multi_head_attention_module(
+    target: Target,
+    submod: Any,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO fix arg/kwargs matching
+    query = kwargs["query"] if "query" in kwargs else args[0]
+    key = kwargs["key"] if "key" in kwargs else args[1]
+    value = kwargs["value"] if "value" in kwargs else args[2]
+    bsz, seq_len_q, dim = query.shape()
+    _, seq_len, _ = key.shape()
+    attn = nn.CrossAttention(
+        dim=submod.embed_dim,
+        seq_len=seq_len_q.value(),
+        seq_len_kv=seq_len.value(),
+        num_heads=submod.num_heads,
+        qkv_bias=True,
+        has_residual=False,
+    )
+
+    # Bind constant tensor for MHA module
+    mapped_params = _map_ait_pt_params(attn, submod)
+    ait_params = dict(attn.named_parameters())
+    for name, data in mapped_params.items():
+        ait_tensor = ait_params[name].tensor()
+        ait_data = _TorchConstantTensorData(data.contiguous().cuda().half())
+        ait_tensor._bind_data(ait_data)
+
+    if "cu_length" in ait_params:
+        ait_tensor = ait_params["cu_length"].tensor()
+        cu_len = np.cumsum([0] + [seq_len.value()] * bsz.value()).astype("int32")
+        cu_len = torch.from_numpy(cu_len)
+        ait_data = _TorchConstantTensorData(cu_len.contiguous().cuda())
+        ait_tensor._bind_data(ait_data)
+
+    res = attn(query, key, value)
+    # make output of MHA a list to match the output type of pytorch MHA
+    return [res]
+
+
+def _map_ait_pt_params(ait_module, pt_module):
+    ait_params = dict(ait_module.named_parameters())
+    mapped_pt_params = OrderedDict()
+    for pt_name, pt_param in pt_module.named_parameters():
+        ait_friendly_name = (
+            pt_name.replace("in_proj", "qkv")
+            .replace("out_proj", "proj")
+            .replace("_", ".")
+        )
+        if ait_friendly_name in ait_params:
+            mapped_pt_params[ait_friendly_name] = pt_param.data
+        elif "in_proj" in pt_name:
+            # set constant for cross attention
+            if len(pt_param.shape) == 2:
+                w_q, w_k, w_v = pt_param.chunk(3)
+                mapped_pt_params["proj_q.weight"] = w_q
+                mapped_pt_params["proj_k.weight"] = w_k
+                mapped_pt_params["proj_v.weight"] = w_v
+            else:
+                b_q, b_k, b_v = pt_param.chunk(3)
+                mapped_pt_params["proj_q.bias"] = b_q
+                mapped_pt_params["proj_k.bias"] = b_k
+                mapped_pt_params["proj_v.bias"] = b_v
+    return mapped_pt_params
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
new file mode 100644
index 000000000..986ae5196
--- /dev/null
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -0,0 +1,1081 @@
+import logging
+import torch  # isort:skip
+import operator
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
+from aitemplate.compiler.public import (
+    avg_pool2d,
+    bmm_rrr,
+    chunk,
+    concatenate,
+    conv2d,
+    conv2d_bias,
+    dynamic_slice,
+    elementwise,
+    expand,
+    FuncEnum,
+    gemm_rcr,
+    gemm_rcr_bias,
+    gemm_rrr,
+    getitem,
+    int_elementwise,
+    IntImm,
+    IntVar,
+    IntVarTensor,
+    layernorm,
+    max_pool2d,
+    nhwc3to8,
+    pad_last_dim,
+    permute,
+    reduce_mean,
+    reduce_sum,
+    reshape,
+    size,
+    split,
+    squeeze,
+    Tensor as AITTensor,
+    unsqueeze,
+)
+from fx2ait.converters.utils import (
+    create_binary_op,
+    get_positive_dim,
+    identical_elem_tuple_to_int,
+    nchw2nhwc,
+    unify_dynamic_shape_name,
+)
+from fx2ait.passes.lower_basic_pass_aten import (
+    aten_compose_bmm_2d,
+    aten_compose_bmm_3d,
+    aten_compose_chunk,
+    aten_compose_getitem_slice,
+    aten_operator_getitem,
+)
+from torch.fx.node import Argument, Target
+
+from .converter_registry import ait_converter
+
+
+# Logging
+logger: logging.Logger = logging.getLogger(__name__)
+ConverterOutput = Union[AITTensor, Tuple[AITTensor, ...], List[IntVar], IntVar]
+
+## make sure the functions are place in alphabetic order
+
+
+@ait_converter(torch.ops.aten._adaptive_avg_pool2d.default)
+def aten_ops_adaptive_avg_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    output_size = identical_elem_tuple_to_int(args[1])
+    # FIXME: try to find a way to not explose internal date like this.
+    shape = [var._attrs["values"][0] for var in input_val._attrs["shape"]]
+    HI, WI, CI = shape[1], shape[2], shape[3]
+    if CI % 2 != 0:
+        raise RuntimeError(
+            f"AIT avg_pool2d expects input channel dim to align w/ a multiple of 2 but got {CI}"
+        )
+    if HI != WI:
+        raise RuntimeError(
+            f"adaptive_avg_pool2d currently only supports square input H/W but got H: {shape[1]} and W: {shape[2]}"
+        )
+    stride = HI // output_size
+    kernel_size = HI - (output_size - 1) * stride
+
+    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=0)(input_val)
+
+
+@ait_converter(torch.ops.aten.avg_pool2d.default)
+def aten_ops_avg_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    kernel_size = args[1]
+    stride = args[2]
+    padding = args[3] if len(args) > 3 else 0
+    kernel_size = identical_elem_tuple_to_int(kernel_size)
+    stride = identical_elem_tuple_to_int(stride)
+    padding = identical_elem_tuple_to_int(padding)
+    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+
+
+@ait_converter(torch.ops.aten.batch_norm)
+def aten_ops_batch_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO @qxy11: Update channels-last assumption once AIT backend is updated
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    weight = args[1]
+    bias = args[2]
+    running_mean = args[3]
+    running_var = args[4]
+    eps = args[7]
+    scale = elementwise(FuncEnum.DIV)(
+        weight,
+        elementwise(FuncEnum.ADD)(
+            elementwise(FuncEnum.SQRT)(running_var),
+            AITTensor(shape=[], value=eps),
+        ),
+    )
+    running_mean = elementwise(FuncEnum.MUL)(scale, running_mean)
+    bias = elementwise(FuncEnum.SUB)(bias, running_mean)
+    matmul_result = elementwise(FuncEnum.MUL)(input_val, scale)
+    result = elementwise(FuncEnum.ADD)(matmul_result, bias)
+    return result
+
+
+@ait_converter(torch.ops.aten.add.Tensor)
+def aten_binary_ops_add(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    unify_dynamic_shape_name(args[0], args[1])
+    kwargs = {
+        "input": args[0],
+        "other": args[1],
+    }
+    return create_binary_op(FuncEnum.ADD, args, kwargs, name)
+
+
+@ait_converter(torch.ops.aten.div.Tensor)
+def aten_binary_ops_div(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    kwargs = {
+        "input": args[0],
+        "other": args[1],
+    }
+    return create_binary_op(FuncEnum.DIV, args, kwargs, name)
+
+
+@ait_converter(torch.ops.aten.mul.Tensor)
+def aten_binary_ops_mul(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    kwargs = {
+        "input": args[0],
+        "other": args[1],
+    }
+    return create_binary_op(FuncEnum.MUL, args, kwargs, name)
+
+
+@ait_converter(torch.ops.aten.sub.Tensor)
+def aten_binary_ops_sub(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    kwargs = {
+        "input": args[0],
+        "other": args[1],
+    }
+    return create_binary_op(FuncEnum.SUB, args, kwargs, name)
+
+
+@ait_converter(torch.ops.aten.cat.default)
+def aten_ops_cat(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    tensors = args[0]
+    for t in tensors:
+        if not isinstance(t, AITTensor):
+            raise ValueError(f"Non-tensor inputs for {name}: {tensors}")
+
+    dim = args[1] if len(args) > 1 else 0
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
+
+    return concatenate()(tensors, dim=dim)
+
+
+def _choose_conv2d_op(
+    stride: int,
+    pad: int,
+    dilate: int,
+    x: AITTensor,
+    weight: AITTensor,
+    bias: Optional[AITTensor],
+) -> ConverterOutput:
+    """
+    Helper to choose conv2d vs. conv2d_bias op based on existence of bias
+    and pad channel input dim to 4/8
+    """
+    last_dim = x._attrs["shape"][-1]._attrs["values"][0]
+    # CUDA conv channel dim weights need to align w/ a multiple of 2/4/8
+    # if CI < 4, pad to 4; if 5 < CI < 8, pad to 8;
+    if last_dim < 4:
+        weight = pad_last_dim(len(weight._attrs["shape"]), 4)(weight)
+        x = pad_last_dim(len(x._attrs["shape"]), 4)(x)
+    elif last_dim in range(5, 8):
+        to_8 = nhwc3to8()
+        weight = to_8(weight)
+        x = to_8(x)
+    elif last_dim % 2 != 0:
+        return RuntimeError(
+            f"Conv2d is not implemented for input channel dim {last_dim}: it needs to be aligned to a multiple of 2/4/8"
+        )
+    if bias:
+        return conv2d_bias(stride=stride, pad=pad, dilate=dilate)(x, weight, bias)
+    else:
+        return conv2d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+
+
+@ait_converter(torch.ops.aten.convolution.default)
+def aten_ops_conv2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: qxy11: Update once channels-first format is supported
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    weight = args[1]
+    assert isinstance(weight, AITTensor)
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    weight._attrs["shape"] = nchw2nhwc(weight._attrs["shape"])
+
+    bias = args[2]
+    if not (isinstance(bias, AITTensor) or bias is None):
+        raise RuntimeError(f"Non-tensor weight for {name}: {bias}")
+
+    stride = args[3]
+    stride = identical_elem_tuple_to_int(stride)
+    padding = args[4]
+    padding = identical_elem_tuple_to_int(padding)
+    dilation = args[5]
+    dilation = identical_elem_tuple_to_int(dilation)
+    # TODO transposed=args[6], output_padding=args[7]
+    groups = args[8]
+
+    assert all(
+        isinstance(x, int) for x in [stride, padding, dilation]
+    ), "Expected int stride, padding, and dilation"
+
+    if groups is None or groups == 1:
+        result = _choose_conv2d_op(stride, padding, dilation, input_val, weight, bias)
+    else:
+        # Grouped conv doesn't currently work on AIT CUDA, manually map
+        # groups = kwargs["groups"]
+        group_size = input_val.shape()[3]._attrs["values"][0] // groups
+        w_group_size = weight.shape()[0]._attrs["values"][0] // groups
+
+        def make_slice(x, dim, start, end, step, name):
+            args = []
+            args.append(x)
+            args.append(dim)
+            args.append(start)
+            args.append(end)
+            args.append(step)
+            return aten_ops_slice(
+                target,
+                args,
+                None,
+                name,
+            )
+
+        conv_groups = [
+            _choose_conv2d_op(
+                stride,
+                padding,
+                dilation,
+                make_slice(  # input_val[:,:,:,gs*i:gs*i + gs]
+                    input_val,
+                    3,
+                    i * group_size,
+                    i * group_size + group_size,
+                    1,
+                    f"{name}.slice_{i}",
+                ),
+                make_slice(  # weights[wgs*i:wgs*i + wgs,]
+                    weight,
+                    0,
+                    i * w_group_size,
+                    i * w_group_size + w_group_size,
+                    1,
+                    f"{name}.weight.slice_{i}",
+                ),
+                None
+                if bias is None
+                else make_slice(  # bias[wgs*i:wgs*i + wgs,]
+                    bias,
+                    0,
+                    i * w_group_size,
+                    i * w_group_size + w_group_size,
+                    1,
+                    f"{name}.bias.slice_{i}",
+                ),
+            )
+            for i in range(groups)
+        ]
+        result = concatenate()(conv_groups, dim=3)
+
+    return result
+
+
+@ait_converter(aten_compose_chunk)
+@ait_converter(torch.ops.aten.chunk.default)
+def aten_ops_chunk(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    shape = input_val.shape()
+    target_dim = get_positive_dim(args[2], len(shape))
+    chunks = min(args[1], shape[target_dim].value())
+    assert isinstance(
+        shape[target_dim], IntImm
+    ), f"Cannot perform chunk on dynamic dim! Get target dim {target_dim}."
+
+    return chunk()(input_val, chunks, dim=target_dim)
+
+
+@ait_converter(torch.ops.aten.expand.default)
+def aten_ops_expand(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO expand is not functional yet but only for cases with dim=-1
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Non-tensor inputs for {name}: {input_val}")
+
+    sizes = args[1]
+    if not sizes:
+        raise ValueError("Expand sizes cannot be empty")
+
+    def _is_int_list(iterable):
+        return all(isinstance(dim, (int, IntVar, IntVarTensor)) for dim in iterable)
+
+    # sizes can either be a single int list or a list of ints.
+    if _is_int_list(sizes):
+        shape = sizes
+    elif len(sizes) == 1 and _is_int_list(sizes[0]):
+        shape = sizes[0]
+    else:
+        raise ValueError(
+            f"sizes argument can either be many ints or single int iterable, but got: {', '.join(str(type(dim)) for dim in sizes)}"
+        )
+
+    return expand()(input_val, shape)
+
+
+@ait_converter(aten_operator_getitem)
+def aten_ops_getitem(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    idx = args[1]
+
+    # This case is decomposed into a few slice.Tensor ops
+    # if (
+    #     isinstance(idx, slice)
+    #     or (isinstance(idx, Sequence) and any(isinstance(x, slice) for x in idx))
+    #     or isinstance(input_val, AITTensor)
+    # ):
+    #     return aten_ops_slice(target, args, kwargs, name)
+
+    if isinstance(idx, int):
+        idx = get_positive_dim(idx, len(input_val))
+
+    if all(isinstance(i, IntImm) for i in input_val):
+        return operator.getitem(input_val, idx)
+    else:
+        return getitem()(input_val, idx)
+
+
+@ait_converter(torch.ops.aten.layer_norm.default)
+def aten_ops_layer_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    shape = args[1]
+    if shape is None or len(shape) == 0:
+        raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
+    weight = args[2]
+    bias = args[3]
+    normalized_shape = []
+    if all(isinstance(i, int) for i in shape):
+        for i in shape:
+            normalized_shape.append(IntImm(i))
+    elif all(isinstance(i, IntImm) or isinstance(i, IntVarTensor) for i in shape):
+        normalized_shape = shape
+    else:
+        raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
+    return layernorm()(input_val, weight, bias, normalized_shape)
+
+
+@ait_converter(torch.ops.aten.linear)
+def aten_ops_linear(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    weight = args[1]
+    bias = args[2]
+
+    assert isinstance(weight, AITTensor)
+    if bias is None:
+        return gemm_rcr()(input_val, weight)
+    else:
+        return gemm_rcr_bias()(input_val, weight, bias)
+
+
+@ait_converter(torch.ops.aten.max_pool2d)
+def aten_ops_max_pool2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: @qxy11 Update once NCHW supported
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    kernel_size = args[1]
+    stride = args[2]
+    padding = args[3] if len(args) > 3 else 0
+    kernel_size = identical_elem_tuple_to_int(kernel_size)
+    stride = identical_elem_tuple_to_int(stride)
+    padding = identical_elem_tuple_to_int(padding)
+    return max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+
+
+@ait_converter(aten_compose_bmm_3d)
+@ait_converter(aten_compose_bmm_2d)
+@ait_converter(torch.ops.aten.addmm.default)
+@ait_converter(torch.ops.aten.mm.default)
+@ait_converter(torch.ops.aten.bmm.default)
+def aten_ops_matmul(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    if len(args) > 2:
+        bias = args[0]
+        input_val = args[1]
+        weight = args[2]
+    else:
+        bias = None
+        input_val = args[0]
+        weight = args[1]
+
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+    if not isinstance(weight, AITTensor):
+        raise ValueError(f"Unexpected weight for {name}: {weight}")
+
+    input_shape = input_val.shape()
+    weight_shape = weight.shape()
+    if len(weight_shape) == 2:
+        result = gemm_rrr()(input_val, weight)
+    elif len(input_shape) == 3 and len(weight_shape) == 3:
+        unify_dynamic_shape_name(input_val, weight)
+        result = bmm_rrr()(input_val, weight)
+    else:
+        raise NotImplementedError(
+            f"This case is unsupported in {name}: {len(input_shape)} and {len(weight_shape)}"
+        )
+
+    if bias is not None:
+        if not isinstance(bias, AITTensor):
+            raise ValueError(f"Unexpected weight for {name}: {bias}")
+        result = elementwise(FuncEnum.ADD)(result, bias)
+
+    return result
+
+
+@ait_converter(torch.ops.aten.mean.dim)
+def aten_ops_mean(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    dims = args[1]
+    keepdim = args[2] if len(args) > 2 else False
+    if len(dims) == 1:
+        return reduce_mean(dim=dims, keepdim=keepdim)(input_val)
+    else:
+        new_dims = list(dims)
+        res = input_val
+        for i, d in enumerate(new_dims):
+            if d < 0:
+                d += len(input_val.shape())
+            new_dims[i] = d
+            res = reduce_mean(dim=d, keepdim=True)(res)
+        if not keepdim:
+            new_dims = sorted(new_dims, reverse=True)
+            for d in new_dims:
+                res = squeeze(d)(res)
+        return res
+
+
+@ait_converter(torch.ops.aten.nan_to_num.default)
+def aten_ops_nan_to_num(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    nan = args[1] if len(args) > 1 else None
+    nan = 0 if nan is None else nan
+    posinf = args[2] if len(args) > 2 else None
+    posinf = numpy.finfo(numpy.float16).max if posinf is None else posinf
+    neginf = args[3] if len(args) > 3 else None
+    neginf = numpy.finfo(numpy.float16).min if neginf is None else neginf
+    return elementwise(FuncEnum.NAN_TO_NUM)(
+        input_val,
+        AITTensor(value=nan, shape=[], name="nan"),
+        AITTensor(value=posinf, shape=[], name="posinf"),
+        AITTensor(value=neginf, shape=[], name="neginf"),
+    )
+
+
+@ait_converter(torch.ops.aten.split_with_sizes.default)
+@ait_converter(torch.ops.aten.split.Tensor)
+def aten_ops_split(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Non-tensor inputs for {name}: {input_val}")
+
+    split_size_or_sections = args[1]
+    # TODO split_size_or_sections can be IntVar and AIT does not support yet
+    # if not isinstance(split_size_or_sections, (int, list)):
+    #     raise ValueError(
+    #         f"Unexpected value for split_size_or_sections in {name}: {split_size_or_sections}"
+    #     )
+    dim = args[2] if len(args) > 2 else 0
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected value for dim in {name}: {dim}")
+
+    return split()(input_val, split_size_or_sections, dim)
+
+
+@ait_converter(torch.ops.aten.sym_numel)
+def aten_ops_numel(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    shape = size()(input_val)
+    res = shape[0]
+    for ind, dim in enumerate(shape):
+        if ind != 0:
+            res = int_elementwise(FuncEnum.MUL)(res, dim)
+    return res
+
+
+@ait_converter(torch.ops.aten.permute.default)
+def aten_ops_permute(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    permutation = args[1]
+    if len(permutation) > 5:
+        raise RuntimeError(f"Unsupported permutation {permutation} for {input_val}")
+
+    return permute()(input_val, permutation)
+
+
+@ait_converter(torch.ops.aten.pow.Tensor_Scalar)
+def aten_ops_pow(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    exp = args[1]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.POW)(input_val, exp)
+
+
+@ait_converter(torch.ops.aten.relu.default)
+def aten_ops_relu(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.RELU)(input_val)
+
+
+@ait_converter(torch.ops.aten.reshape)
+@ait_converter(torch.ops.aten.view.default)
+def aten_ops_reshape(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    shape = args[1]
+
+    return reshape()(input_val, shape)
+
+
+@ait_converter(torch.ops.aten.sym_size)
+def aten_ops_size(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+    dim = args[1]
+    return size()(input_val, dim)
+
+
+@ait_converter(aten_compose_getitem_slice)
+@ait_converter(torch.ops.aten.slice.Tensor)
+@ait_converter(torch.ops.aten.select.int)
+def aten_ops_slice(  # noqa: C901
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    idx = []
+    if target == aten_compose_getitem_slice:
+        for sli in args[1]:
+            start = sli[1]
+            end = sli[2] if len(sli) >= 3 else start + 1
+            if end == 9223372036854775807:  # represents None in pt2 tracer
+                end = None
+            step = sli[3] if len(sli) >= 4 else None
+            idx.append(slice(start, end, step))
+    else:
+        dim = args[1]
+        start = args[2]
+        end = args[3] if len(args) > 3 else start + 1
+        if end == 9223372036854775807:  # represents None in pt2 tracer
+            end = None
+        step = args[4] if len(args) > 4 else None
+        for _ in range(0, dim):
+            idx.append(slice(None, None, None))
+        if len(args) > 3:
+            idx.append(slice(start, end, step))
+        else:
+            idx.append(start)
+
+    if isinstance(input_val, (tuple, list)):
+        return operator.getitem(input_val, idx)
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    rank = input_val._rank()
+    op = dynamic_slice()
+
+    def num_slice_types(slices):
+        return sum(1 for s in slices if isinstance(s, slice) or isinstance(s, int))
+
+    # Replace ellipsis with expand slices.
+    num_ellipsis = rank - num_slice_types(idx)
+    expand_idx = []
+    for i in idx:
+        if i == Ellipsis:
+            # pass explicit start to guard against negative num_ellipsis
+            for _ in range(0, num_ellipsis):
+                expand_idx.append(slice(None, None, None))
+        else:
+            expand_idx.append(i)
+    idx = expand_idx
+
+    # Record indices that need to be either:
+    #   (1) sequeezed if Slice-index is of int; or
+    #   (2) unsqueezed if Slice-index is of None
+    # Each element of the list is a tuple of (int, func), where the second item
+    # is either squeeze or unsqueeze function and the first
+    # item gives the index to be squeezed or unsqueezed.
+    squeezable_indices = []
+    # the number of the indices of type None
+    num_none_indices = 0
+    start, end = [], []
+    for index, i in enumerate(idx):
+        if i is None:
+            squeezable_indices.append((index, unsqueeze))
+            num_none_indices += 1
+            continue
+        if isinstance(i, int):
+            i = get_positive_dim(i, input_val.shape()[index].value())
+            # If we pass an int, we need to squeeze this dim.
+            # Note that because we skip None-indices before, so we adjust
+            # the index by subtracting the number of None-indices.
+            squeezable_indices.append((index - num_none_indices, squeeze))
+        # if idx is slice, AIT only support slice.step == 1
+        # TODO remove check once slice support step != 1
+        if isinstance(i, slice) and i.step not in (1, None):
+            raise ValueError(
+                f"Slice tensor only support step=1 case, get step={i.step}."
+            )
+        start.append(i.start if isinstance(i, slice) else i)
+        end.append(i.stop if isinstance(i, slice) else (i + 1 if i is not None else i))
+
+    # append hiden dim at end
+    while len(start) < rank:
+        start.append(0)
+        end.append(None)
+    output = op(input_val, start, end)
+    for dim, squeeze_func in reversed(squeezable_indices):
+        output = squeeze_func(dim)(output)
+    return output
+
+
+@ait_converter(torch.ops.aten.squeeze)
+@ait_converter(torch.ops.aten.squeeze.default)
+@ait_converter(torch.ops.aten.squeeze.dim)
+def aten_ops_squeeze(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    dim = args[1] if len(args) > 1 else None
+    if not isinstance(dim, int) and dim is not None:
+        raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
+
+    return squeeze(dim)(input_val)
+
+
+@ait_converter(torch.ops.aten.sum.dim_IntList)
+def aten_ops_sum(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    dims = args[1]
+    keepdim = args[2] if len(args) > 2 else False
+
+    if len(dims) == 1:
+        return reduce_sum(dim=dims, keepdim=keepdim)(input_val)
+    else:
+        new_dims = list(dims)
+        res = input_val
+        for i, d in enumerate(new_dims):
+            if d < 0:
+                d += len(input_val.shape())
+            new_dims[i] = d
+            res = reduce_sum(dim=d, keepdim=True)(res)
+        if not keepdim:
+            new_dims = sorted(new_dims, reverse=True)
+            for d in new_dims:
+                res = squeeze(d)(res)
+        return res
+
+
+@ait_converter(torch.ops.aten.hardtanh.default)
+def aten_ops_hardtanh(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    result = elementwise(FuncEnum.TANH)(input_val)
+    minimal = args[1] if len(args) > 1 else -1
+    maximum = args[2] if len(args) > 2 else 1
+    if minimal is not None:
+        result = elementwise(FuncEnum.MAX)(result, AITTensor(value=minimal, shape=[]))
+    if maximum is not None:
+        result = elementwise(FuncEnum.MIN)(result, AITTensor(value=maximum, shape=[]))
+    return result
+
+
+@ait_converter(torch.ops.aten.t.default)
+def aten_ops_transpose(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    # TODO: we will also support https://pytorch.org/docs/stable/generated/torch.transpose.html in the future
+    # Be careful. Transpose is expensive, so we want to avoid it.
+    input_val = args[0]
+    permutation = [0, 2, 1]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+    input_3d = unsqueeze(0)(input_val)
+    input_3d = permute()(input_3d, permutation)
+    input_2d = squeeze(0)(input_3d)
+    return input_2d
+
+
+@ait_converter(torch.ops.aten.unsqueeze.default)
+def aten_ops_unsqueeze(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Unexpected input for {name}: {input_val}")
+
+    dim = args[1]
+    if not isinstance(dim, int):
+        raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
+
+    return unsqueeze(dim)(input_val)
+
+
+## operator for symbolic computation
+@ait_converter(operator.mul)
+def operator_ops_mul(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    other_val = args[1]
+    if not isinstance(input_val, IntVarTensor):
+        if isinstance(input_val, int):
+            input_val = IntVarTensor(IntImm(input_val))
+        else:
+            raise ValueError(f"Unexpected input type for {name}: {input_val}")
+    if not isinstance(other_val, IntVarTensor):
+        if isinstance(other_val, int):
+            other_val = IntVarTensor(IntImm(other_val))
+        else:
+            raise ValueError(f"Unexpected other input type for {name}: {other_val}")
+    res = int_elementwise(FuncEnum.MUL)(input_val, other_val)
+    return res
+
+
+@ait_converter(operator.add)
+def operator_ops_add(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    other_val = args[1]
+    if not isinstance(input_val, IntVarTensor):
+        if isinstance(input_val, int):
+            input_val = IntVarTensor(IntImm(input_val))
+        else:
+            raise ValueError(f"Unexpected input type for {name}: {input_val}")
+    if not isinstance(other_val, IntVarTensor):
+        if isinstance(other_val, int):
+            other_val = IntVarTensor(IntImm(other_val))
+        else:
+            raise ValueError(f"Unexpected other input type for {name}: {other_val}")
+    res = int_elementwise(FuncEnum.ADD)(input_val, other_val)
+    return res
+
+
+@ait_converter(operator.sub)
+def operator_ops_sub(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    other_val = args[1]
+    if not isinstance(input_val, IntVarTensor):
+        if isinstance(input_val, int):
+            input_val = IntVarTensor(IntImm(input_val))
+        else:
+            raise ValueError(f"Unexpected input type for {name}: {input_val}")
+    if not isinstance(other_val, IntVarTensor):
+        if isinstance(other_val, int):
+            other_val = IntVarTensor(IntImm(other_val))
+        else:
+            raise ValueError(f"Unexpected other input type for {name}: {other_val}")
+    res = int_elementwise(FuncEnum.SUB)(input_val, other_val)
+    return res
+
+
+@ait_converter(operator.floordiv)
+def operator_ops_floordiv(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    other_val = args[1]
+    if not isinstance(input_val, IntVarTensor):
+        if isinstance(input_val, int):
+            input_val = IntVarTensor(IntImm(input_val))
+        else:
+            raise ValueError(f"Unexpected input type for {name}: {input_val}")
+    if not isinstance(other_val, IntVarTensor):
+        if isinstance(other_val, int):
+            other_val = IntVarTensor(IntImm(other_val))
+        else:
+            raise ValueError(f"Unexpected other input type for {name}: {other_val}")
+    res = int_elementwise(FuncEnum.DIV)(input_val, other_val)
+    return res
+
+
+@ait_converter(torch.ops.aten.abs.default)
+def aten_unary_ops_abs(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.ABS)(input_val)
+
+
+@ait_converter(torch.ops.aten.clamp.default)
+def aten_unary_ops_clamp(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    result = input_val
+    minimal = args[1]
+    maximum = args[2] if len(args) > 2 else None
+    if minimal is not None:
+        result = elementwise(FuncEnum.MAX)(result, AITTensor(value=minimal, shape=[]))
+    if maximum is not None:
+        result = elementwise(FuncEnum.MIN)(result, AITTensor(value=maximum, shape=[]))
+    return result
+
+
+@ait_converter(torch.ops.aten.log.default)
+def aten_unary_ops_log(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.LOGE)(input_val)
+
+
+@ait_converter(torch.ops.aten.sigmoid.default)
+def aten_unary_ops_sigmoid(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    return elementwise(FuncEnum.SIGMOID)(input_val)
+
+
+@ait_converter(torch.ops.aten.sign.default)
+def aten_unary_ops_sign(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.SIGN)(input_val)
+
+
+@ait_converter(torch.ops.aten.tanh.default)
+def aten_unary_ops_tanh(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    return elementwise(FuncEnum.TANH)(input_val)
diff --git a/fx2ait/fx2ait/converters/converter_registry.py b/fx2ait/fx2ait/converters/converter_registry.py
new file mode 100644
index 000000000..0b4902cde
--- /dev/null
+++ b/fx2ait/fx2ait/converters/converter_registry.py
@@ -0,0 +1,19 @@
+from typing import Any, Callable, Dict
+
+from torch.fx.node import Target
+
+AIT_CONVERTERS: Dict[Target, Any] = {}
+
+
+def ait_converter(key: Target, enabled: bool = True) -> Callable[[Any], Any]:
+    def register_converter(converter):
+        AIT_CONVERTERS[key] = converter
+        return converter
+
+    def disable_converter(converter):
+        return converter
+
+    if enabled:
+        return register_converter
+    else:
+        return disable_converter
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
new file mode 100644
index 000000000..f4de0e387
--- /dev/null
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -0,0 +1,140 @@
+import math
+import operator
+from typing import Any, Callable, Dict, List, Tuple, Union
+
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor
+
+from aitemplate.compiler.public import elementwise, FuncEnum, Tensor as AITTensor
+from torch.fx.node import Argument
+
+
+def get_positive_dim(dim: int, dim_size: int) -> int:
+    if dim < 0:
+        return dim % dim_size
+    return dim
+
+
+def create_reduce_op(
+    op_type: Any, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> AITTensor:
+    input_val = kwargs["input"]
+    # TODO: remove once multiple reduction axes are supported
+    dims = kwargs.get("dim", None)
+    if dims is None:
+        dims = list(range(len(input_val.shape())))
+    if len(dims) < 1:
+        raise ValueError("No dims to reduce on")
+    dim = dims[0]
+    keepdim = False if "keepdim" not in kwargs else kwargs["keepdim"]
+    sum_val = op_type(dim=dim, keepdim=keepdim)(input_val)
+
+    if len(dims) > 1:
+        new_kwargs = {"input": sum_val, "dims": dims[1:]}
+        return create_reduce_op(op_type, args, new_kwargs, name)
+
+    return sum_val
+
+
+def create_binary_op(
+    op_type: FuncEnum,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> AITTensor:
+    lhs = kwargs["input"]
+    if not isinstance(lhs, (AITTensor, float, int)):
+        raise RuntimeError(f"Unexpected left operand {type(lhs)} on {name}: {lhs}")
+
+    rhs = kwargs["other"]
+    if not isinstance(rhs, (AITTensor, float, int)):
+        raise RuntimeError(f"Unexpected right operand {type(rhs)} on {name}: {rhs}")
+
+    lhs_is_constant, lhs_constant = try_get_constant_num(lhs)
+    rhs_is_constant, rhs_constant = try_get_constant_num(rhs)
+    if lhs_is_constant and rhs_is_constant:
+        res = get_python_op_from_ait_constant_elementwise_op(op_type)(
+            lhs_constant, rhs_constant
+        )
+        return res
+
+    return elementwise(op_type)(lhs, rhs)
+
+
+def create_unary_op(
+    op_type: FuncEnum,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> AITTensor:
+    input = kwargs["input"] if "input" in kwargs else args[0]
+    if not isinstance(input, (AITTensor, float, int)):
+        raise RuntimeError(f"Unexpected left operand {type(input)} on {name}: {input}")
+
+    input_is_constant, input_constant = try_get_constant_num(input)
+    if input_is_constant:
+        res = get_python_op_from_ait_constant_elementwise_op(op_type)(input_constant)
+        return res
+
+    return elementwise(op_type)(input)
+
+
+def try_get_constant_num(arg: Any) -> (bool, Any):
+    if isinstance(arg, (float, int)):
+        return (True, arg)
+    elif isinstance(arg, IntImm):
+        return (True, arg.value())
+    elif isinstance(arg, IntVarTensor):
+        var = arg._attrs["int_var"]
+        return try_get_constant_num(var)
+    else:
+        return (False, None)
+
+
+def get_python_op_from_ait_constant_elementwise_op(
+    op_type: FuncEnum,
+) -> Callable[[Any, Any], Any]:
+    if op_type == FuncEnum.ADD:
+        return operator.add
+    elif op_type == FuncEnum.MUL:
+        return operator.mul
+    elif op_type == FuncEnum.SUB:
+        return operator.sub
+    elif op_type == FuncEnum.DIV:
+        return operator.truediv
+    elif op_type == FuncEnum.SQRT:
+        return math.sqrt
+    else:
+        raise RuntimeError(f"{op_type} is not supported yet!")
+
+
+def identical_elem_tuple_to_int(param):
+    """
+    Convert tuples with all the same int elem to
+    a single int (ex. (3, 3, 3) --> 3)
+    """
+    if isinstance(param, int):
+        return param
+
+    if not isinstance(param, (list, tuple)) or not all(x == param[0] for x in param):
+        raise RuntimeError(f"AIT supports square param values only, but got {param}")
+    return param[0]
+
+
+def nchw2nhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
+    return [shape[0], shape[2], shape[3], shape[1]]
+
+
+# TODO:  This is a hack to workaround AIT's dynamic shape requirement.
+# Detailed explanation can be found in D41743385 (aten2ait) D41974191(fx2ait).
+# We will throw this one after AIT provides vanilla support.
+def unify_dynamic_shape_name(input_val, weight):
+    input_shape = input_val.shape()
+    weight_shape = weight.shape()
+    if len(input_shape) == len(weight_shape):
+        for a, b in zip(input_shape, weight_shape):
+            if a._attrs["values"] == b._attrs["values"]:
+                if a._attrs["name"] is None:
+                    a._attrs["name"] = b._attrs["name"]
+                elif b._attrs["name"] is None:
+                    b._attrs["name"] = a._attrs["name"]
+    return input_shape, weight_shape
diff --git a/fx2ait/fx2ait/csrc/AITModel.cpp b/fx2ait/fx2ait/csrc/AITModel.cpp
new file mode 100644
index 000000000..6ba3eaba6
--- /dev/null
+++ b/fx2ait/fx2ait/csrc/AITModel.cpp
@@ -0,0 +1,104 @@
+#include "AITModel.h"
+
+#include "picojson.h"
+
+namespace torch::aitemplate {
+// const string for serialization
+const static std::string LIB_BASENAME_STR = "library_basename";
+const static std::string INPUT_NAMES_STR = "input_names";
+const static std::string OUTPUT_NAMES_STR = "output_names";
+const static std::string FLOATING_POINT_INPUT_DTYPE_STR =
+    "floating_point_input_dtype";
+const static std::string FLOATING_POINT_OUTPUT_DTYPE_STR =
+    "floating_point_output_dtype";
+std::string AITModel::serialize() const {
+  std::string result;
+  picojson::object var;
+  picojson::array pick_input_names;
+  var[LIB_BASENAME_STR] = picojson::value(aitModelImpl_.libraryBasename());
+  for (const auto& entry : aitModelImpl_.inputNames()) {
+    pick_input_names.push_back(picojson::value(entry));
+  }
+  var[INPUT_NAMES_STR] = picojson::value(pick_input_names);
+  picojson::array pick_output_names;
+  for (const auto& entry : aitModelImpl_.outputNames()) {
+    pick_output_names.push_back(picojson::value(entry));
+  }
+  var[OUTPUT_NAMES_STR] = picojson::value(pick_output_names);
+  var[FLOATING_POINT_INPUT_DTYPE_STR] = picojson::value(std::to_string(
+      static_cast<int16_t>(aitModelImpl_.floatingPointInputDtype().value())));
+
+  var[FLOATING_POINT_OUTPUT_DTYPE_STR] = picojson::value(std::to_string(
+      static_cast<int16_t>(aitModelImpl_.floatingPointOutputDtype().value())));
+
+  result = picojson::value(var).serialize();
+  return result;
+}
+
+void AITModel::loadAsTorchClass() {
+  // Calling this function will make sure that the static content of this file
+  // will be executed. I.e. the most important part here is registering the
+  // AITModel class for Python environment (i.e. torch::deploy).
+  LOG(INFO) << "Making sure AITModel is registered via torch::class_";
+}
+
+static auto registerAITModel =
+    torch::class_<AITModel>("ait", "AITModel")
+        .def(torch::init<
+             std::string,
+             std::vector<std::string>,
+             std::vector<std::string>,
+             c10::optional<at::ScalarType>,
+             c10::optional<at::ScalarType>,
+             int64_t>())
+        .def("forward", &AITModel::forward)
+        .def("profile", &AITModel::profile)
+        .def("get_library_path", &AITModel::libraryPath)
+        .def_property(
+            "use_cuda_graph",
+            &AITModel::getUseCudaGraph,
+            &AITModel::setUseCudaGraph)
+        .def_static(
+            "register_library_name_to_path_map",
+            [](c10::Dict<std::string, std::string> dict) {
+              std::unordered_map<std::string, std::string> map;
+              for (const auto& entry : dict) {
+                map[entry.key()] = entry.value();
+              }
+              AITModelImpl::registerLibraryNameToPathMap(std::move(map));
+            })
+        .def_pickle(
+            [](const c10::intrusive_ptr<AITModel>& self) -> std::string {
+              return self->serialize();
+            },
+            [](const std::string& data) {
+              picojson::value var;
+              const char* json = data.c_str();
+              picojson::parse(var, json, json + strlen(json));
+              std::vector<std::string> input_names;
+              for (const auto name :
+                   var.get(INPUT_NAMES_STR).get<picojson::array>()) {
+                input_names.push_back(name.get<std::string>());
+              }
+              std::vector<std::string> output_names;
+              for (const auto name :
+                   var.get(OUTPUT_NAMES_STR).get<picojson::array>()) {
+                output_names.push_back(name.get<std::string>());
+              }
+              auto floating_point_input_dtype =
+                  std::stoi(var.get(FLOATING_POINT_INPUT_DTYPE_STR)
+                                .get<std::string>()
+                                .c_str());
+              auto floating_point_output_dtype =
+                  std::stoi(var.get(FLOATING_POINT_OUTPUT_DTYPE_STR)
+                                .get<std::string>()
+                                .c_str());
+              return c10::make_intrusive<AITModel>(
+                  AITModelImpl::getFullPathForLibraryName(
+                      var.get(LIB_BASENAME_STR).get<std::string>().c_str()),
+                  input_names,
+                  output_names,
+                  static_cast<at::ScalarType>(floating_point_input_dtype),
+                  static_cast<at::ScalarType>(floating_point_output_dtype));
+            });
+} // namespace torch::aitemplate
diff --git a/fx2ait/fx2ait/csrc/AITModel.h b/fx2ait/fx2ait/csrc/AITModel.h
new file mode 100644
index 000000000..8949758db
--- /dev/null
+++ b/fx2ait/fx2ait/csrc/AITModel.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <torch/torch.h> // @manual=//caffe2:torch-cpp
+#include "AITModelImpl.h"
+
+namespace torch::aitemplate {
+
+class AITModel : public torch::CustomClassHolder {
+ public:
+  explicit AITModel(
+      const std::string& model_path,
+      std::vector<std::string> input_names,
+      std::vector<std::string> output_names,
+      c10::optional<at::ScalarType> input_dtype,
+      c10::optional<at::ScalarType> output_dtype,
+      int64_t num_runtimes = 2,
+      bool use_cuda_graph = false)
+      : aitModelImpl_(
+            model_path,
+            input_names,
+            output_names,
+            input_dtype,
+            output_dtype) {}
+
+  ~AITModel() {}
+
+  // If we need to move or copy this object, then we should just
+  // define a unique_ptr with deleter for the handle.
+  AITModel(const AITModel&) = delete;
+  AITModel& operator=(const AITModel&) = delete;
+
+  std::vector<torch::Tensor> forward(std::vector<torch::Tensor> inputs) {
+    return aitModelImpl_.forward(inputs);
+  }
+
+  void profile(
+      std::vector<torch::Tensor> inputs,
+      const std::string& filename,
+      int64_t num_iters) {
+    TORCH_CHECK_GE(num_iters, 0);
+    aitModelImpl_.profile(inputs, filename, static_cast<size_t>(num_iters));
+  }
+
+  const std::string& libraryPath() const {
+    return aitModelImpl_.libraryPath();
+  }
+
+  void setUseCudaGraph(bool use_cuda_graph) {
+    aitModelImpl_.setUseCudaGraph(use_cuda_graph);
+  }
+
+  bool getUseCudaGraph() const {
+    return aitModelImpl_.getUseCudaGraph();
+  }
+
+  std::string serialize() const;
+
+  static void loadAsTorchClass();
+
+ private:
+  AITModelImpl aitModelImpl_;
+};
+
+} // namespace torch::aitemplate
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
new file mode 100644
index 000000000..a859f37d0
--- /dev/null
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -0,0 +1,495 @@
+#include "AITModelImpl.h" // @manual
+
+#include <type_traits>
+
+#include <dlfcn.h>
+#include <sstream>
+
+#include "ATen/Context.h" // @manual
+#include "ATen/cuda/CUDAContext.h"
+#include "c10/core/CPUAllocator.h"
+#include "c10/cuda/CUDAStream.h"
+
+#ifdef FBCODE_AIT
+#include "folly/MapUtil.h"
+#endif
+
+namespace torch::aitemplate {
+
+AITemplatePyTorchCachingAllocator::AITemplatePyTorchCachingAllocator() {
+  at::globalContext().lazyInitCUDA();
+  cuda_allocator_ = at::cuda::getCUDADeviceAllocator();
+  TORCH_CHECK(cuda_allocator_ != nullptr);
+}
+
+void* AITemplatePyTorchCachingAllocator::Allocate(size_t num_bytes) {
+  if (num_bytes == 0) {
+    return nullptr;
+  }
+  return cuda_allocator_->raw_allocate(num_bytes);
+}
+
+void AITemplatePyTorchCachingAllocator::Free(void* ptr) {
+  if (!ptr) {
+    return;
+  }
+  cuda_allocator_->raw_deallocate(ptr);
+}
+
+namespace {
+template <typename T>
+struct GetLastArgType;
+
+template <typename T>
+struct tag {
+  using type = T;
+};
+
+template <typename Function, typename... Args>
+struct GetLastArgType<Function(Args...)> {
+  using last_arg_type = typename decltype((tag<Args>{}, ...))::type;
+};
+
+template <typename T>
+struct AITCallImpl;
+
+#define AIT_CHECK(status)                           \
+  TORCH_CHECK(                                      \
+      status == AITemplateError::AITemplateSuccess, \
+      "an AITemplate function failed")
+
+template <>
+struct AITCallImpl<AITemplateError(AITemplateModelHandle)> {
+  // Special version for a function with no result.
+  void operator()(
+      AITemplateError (*f)(AITemplateModelHandle),
+      AITemplateModelHandle handle) {
+    AIT_CHECK(f(handle));
+  }
+};
+
+template <typename... Args>
+struct AITCallImpl<AITemplateError(AITemplateModelHandle*, Args...)> {
+  // Special version for ModelContainer creation
+  void operator()(
+      AITemplateError (*f)(AITemplateModelHandle*, Args...),
+      AITemplateModelHandle* handle,
+      Args... args) {
+    AIT_CHECK(f(handle, args...));
+  }
+};
+
+template <typename... Args>
+struct AITCallImpl<AITemplateError(AITemplateModelHandle, Args...)> {
+  using Function = AITemplateError(AITemplateModelHandle, Args...);
+  template <typename... ArgsWithoutLastArgument>
+  auto operator()(
+      Function* f,
+      AITemplateModelHandle handle,
+      ArgsWithoutLastArgument... args) {
+    std::remove_pointer_t<typename GetLastArgType<Function>::last_arg_type>
+        result;
+    AIT_CHECK(f(handle, args..., &result));
+    return result;
+  }
+};
+
+template <typename Function, typename... Args>
+auto AITCall(Function* f, AITemplateModelHandle handle, Args... args) {
+  return AITCallImpl<Function>()(f, handle, args...);
+}
+
+template <typename Function>
+auto AITCallCreate(
+    Function* f,
+    AITemplateModelHandle* handle,
+    size_t num_runtimes,
+    AITemplateAllocator* allocator = nullptr) {
+  return AITCallImpl<Function>()(f, handle, num_runtimes, allocator);
+}
+
+std::string getFileBasename(const std::string& filename) {
+  const auto slash = filename.rfind('/');
+  return slash != std::string::npos ? filename.substr(slash + 1) : filename;
+}
+
+} // namespace
+
+AITModelImpl::AITModelImpl(
+    const std::string& model_path,
+    std::vector<std::string> input_names,
+    std::vector<std::string> output_names,
+    c10::optional<at::ScalarType> input_dtype,
+    c10::optional<at::ScalarType> output_dtype,
+    int64_t num_runtimes,
+    bool use_cuda_graph)
+    : handle_(dlopen(model_path.c_str(), RTLD_NOW | RTLD_LOCAL)),
+      library_basename_(getFileBasename(model_path)),
+      library_path_(model_path),
+      input_names_(std::move(input_names)),
+      output_names_(std::move(output_names)),
+      floating_point_input_dtype_(input_dtype),
+      floating_point_output_dtype_(output_dtype),
+      use_cuda_graph_(use_cuda_graph) {
+  LOG(INFO) << "Loading .so lib " << model_path;
+  TORCH_CHECK(handle_, "could not dlopen ", model_path, ": ", dlerror());
+  TORCH_CHECK(num_runtimes > 0, "num_runtimes must be positive");
+
+#define LOAD_SYMBOL(var, name_str)                                       \
+  var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
+  TORCH_CHECK(var, "could not dlsym " name_str);
+
+#define LOAD_SYMBOL_WARN(var, name_str)                                  \
+  var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
+  if (!var) {                                                            \
+    LOG(WARNING) << "Could not dlsym " << name_str;                      \
+  }
+
+  LOAD_SYMBOL(deleteFunc_, "AITemplateModelContainerDelete");
+  LOAD_SYMBOL(runFunc_, "AITemplateModelContainerRun");
+  LOAD_SYMBOL(getOutputNameFunc_, "AITemplateModelContainerGetOutputName");
+  LOAD_SYMBOL(
+      getMaximumOutputShapeFunc_,
+      "AITemplateModelContainerGetMaximumOutputShape");
+  LOAD_SYMBOL(getOutputDtypeFunc_, "AITemplateModelContainerGetOutputDtype");
+
+  // It's possible that we have new field added in AITemplateModelContainer,
+  // But we can be using a new AITModel to load an old AITemplateModelContainer.
+  // The newly added method are usually non-critical, so we issue warning
+  // instead of hard exception.
+  LOAD_SYMBOL_WARN(profileFunc_, "AITemplateModelContainerProfile");
+
+  // We never call these functions again after the constructor returns, so
+  // there's no point in caching them in member variables.
+  decltype(&AITemplateModelContainerCreate) createFunc;
+  decltype(&AITemplateModelContainerGetInputName) getInputNameFunc;
+  decltype(&AITemplateModelContainerGetNumInputs) getNumInputsFunc;
+  decltype(&AITemplateModelContainerGetNumOutputs) getNumOutputsFunc;
+  LOAD_SYMBOL(createFunc, "AITemplateModelContainerCreate");
+  LOAD_SYMBOL(getInputNameFunc, "AITemplateModelContainerGetInputName");
+  LOAD_SYMBOL(getNumInputsFunc, "AITemplateModelContainerGetNumInputs");
+  LOAD_SYMBOL(getNumOutputsFunc, "AITemplateModelContainerGetNumOutputs");
+#undef LOAD_SYMBOL
+
+  AITCallCreate(createFunc, &model_handle_, num_runtimes, &allocator_);
+  const auto num_inputs = AITCall(getNumInputsFunc, model_handle_);
+  const auto num_outputs = AITCall(getNumOutputsFunc, model_handle_);
+
+  for (const auto idx : c10::irange(num_inputs)) {
+    input_name_to_index_.emplace(
+        AITCall(getInputNameFunc, model_handle_, idx), idx);
+  }
+  for (const auto idx : c10::irange(num_outputs)) {
+    output_name_to_index_.emplace(
+        AITCall(getOutputNameFunc_, model_handle_, idx), idx);
+  }
+}
+
+namespace {
+at::ScalarType AITemplateDtypeToTorchDtype(AITemplateDtype ait_dtype) {
+  switch (ait_dtype) {
+    case AITemplateDtype::kHalf:
+      return torch::kHalf;
+    case AITemplateDtype::kFloat:
+      return torch::kFloat;
+    case AITemplateDtype::kInt:
+      return torch::kInt;
+    case AITemplateDtype::kLong:
+      return torch::kLong;
+    case AITemplateDtype::kBool:
+      return torch::kBool;
+    case AITemplateDtype::kBFloat16:
+      return torch::kBFloat16;
+    case AITemplateDtype::kUnset:
+      TORCH_CHECK(false, "Unset AITemplate dtype");
+  }
+}
+
+AITemplateDtype TorchDtypeToAITemplateDtype(at::ScalarType torch_dtype) {
+  switch (torch_dtype) {
+    case torch::kHalf:
+      return AITemplateDtype::kHalf;
+    case torch::kFloat:
+      return AITemplateDtype::kFloat;
+    case torch::kInt:
+      return AITemplateDtype::kInt;
+    case torch::kLong:
+      return AITemplateDtype::kLong;
+    case torch::kBool:
+      return AITemplateDtype::kBool;
+    case torch::kBFloat16:
+      return AITemplateDtype::kBFloat16;
+    default:
+      TORCH_CHECK(false, "Unknown or unsupported torch dtype");
+  }
+}
+} // namespace
+
+void AITModelImpl::allocateOutputs(
+    std::vector<c10::intrusive_ptr<c10::StorageImpl>>&
+        output_index_to_output_storage_impl,
+    std::vector<AITData>& ait_outputs,
+    std::vector<std::vector<int64_t>>& output_shapes,
+    std::vector<int64_t*>& output_shape_ptrs,
+    const c10::Device& device) {
+  RECORD_USER_SCOPE("AITModel::AllocateOutputs");
+  const auto num_outputs = output_name_to_index_.size();
+  output_index_to_output_storage_impl.resize(num_outputs);
+  const c10::DeviceGuard device_guard(device);
+  ait_outputs.reserve(num_outputs);
+  for (const auto output_index : c10::irange(num_outputs)) {
+    const auto shape =
+        AITCall(getMaximumOutputShapeFunc_, model_handle_, output_index);
+    auto output_ndim = shape.size;
+    output_shapes.emplace_back(output_ndim, 0);
+    output_shape_ptrs.emplace_back(output_shapes.back().data());
+
+    size_t size_bytes = 0;
+    AITemplateDtype ait_dtype = AITemplateDtype::kUnset;
+    ait_dtype = AITCall(getOutputDtypeFunc_, model_handle_, output_index);
+    TORCH_CHECK(
+        ait_dtype != AITemplateDtype::kUnset,
+        "Unset dtype for AITemplate output ",
+        AITCall(getOutputNameFunc_, model_handle_, output_index));
+    const auto dtype = AITemplateDtypeToTorchDtype(ait_dtype);
+    const auto size_array_ref = c10::IntArrayRef(shape.shape_data, shape.size);
+    size_bytes = at::detail::computeStorageNbytesContiguous(
+        size_array_ref, scalarTypeToTypeMeta(dtype).itemsize());
+    c10::Allocator* const allocator = at::cuda::getCUDADeviceAllocator();
+    auto storage_impl = c10::make_intrusive<c10::StorageImpl>(
+        c10::StorageImpl::use_byte_size_t(),
+        size_bytes,
+        allocator->allocate(size_bytes),
+        allocator,
+        /*resizable=*/true);
+    ait_outputs.emplace_back(
+        storage_impl->unsafe_data<void>(), shape, ait_dtype);
+    output_index_to_output_storage_impl[output_index] = std::move(storage_impl);
+  }
+}
+
+std::vector<torch::Tensor> AITModelImpl::processOutputs(
+    std::vector<c10::intrusive_ptr<c10::StorageImpl>>&
+        output_index_to_output_storage_impl,
+    const std::vector<std::vector<int64_t>>& output_shapes) {
+  std::vector<torch::Tensor> outputs;
+  outputs.reserve(output_names_.size());
+  for (const auto& output_name : output_names_) {
+    const auto output_idx = output_name_to_index_.at(output_name);
+
+    // Now take the storage and jam it into a Tensor that has its shape set
+    // to the actual shape.
+    const auto ait_dtype =
+        AITCall(getOutputDtypeFunc_, model_handle_, output_idx);
+    // This should never fail as we checked it the first time around...
+    TORCH_CHECK(
+        ait_dtype != AITemplateDtype::kUnset,
+        "Unset dtype for AITemplate output ",
+        AITCall(getOutputNameFunc_, model_handle_, output_idx));
+    const auto dtype = AITemplateDtypeToTorchDtype(ait_dtype);
+
+    auto output = at::detail::make_tensor_base<c10::TensorImpl>(
+        std::move(output_index_to_output_storage_impl.at(output_idx)),
+        c10::DispatchKeySet(c10::DispatchKey::CUDA),
+        scalarTypeToTypeMeta(dtype));
+    const auto& size = output_shapes.at(output_idx);
+    if (size.size() != 1 || size[0] != 0) {
+      output.unsafeGetTensorImpl()->set_sizes_contiguous(size);
+    }
+
+    if (floating_point_output_dtype_ != c10::nullopt &&
+        output.is_floating_point()) {
+      outputs.emplace_back(output.to(*floating_point_output_dtype_));
+    } else {
+      outputs.emplace_back(std::move(output));
+    }
+  }
+  return outputs;
+}
+
+std::vector<AITData> AITModelImpl::processInputs(
+    std::vector<torch::Tensor>& inputs,
+    std::vector<torch::Tensor>& inputs_contig) {
+  RECORD_USER_SCOPE("AITModel::ProcessInputs");
+  const auto num_inputs = input_name_to_index_.size();
+  std::vector<AITData> ait_inputs;
+  TORCH_CHECK(
+      inputs.size() == num_inputs,
+      "User passed ",
+      inputs.size(),
+      " inputs, but the model expects ",
+      num_inputs);
+  ait_inputs.resize(inputs.size());
+  for (int python_input_idx = 0; python_input_idx < input_names_.size();
+       python_input_idx++) {
+    auto input_name = input_names_[python_input_idx];
+    const auto ait_input_idx = input_name_to_index_.at(input_name);
+    auto& input = inputs[python_input_idx];
+    if (floating_point_input_dtype_ != c10::nullopt &&
+        input.is_floating_point()) {
+      // Need to keep input alive; cannot just stash result of to()
+      // call in a local!
+      input = input.to(*floating_point_input_dtype_);
+    }
+    inputs_contig.push_back(input.contiguous());
+    auto& input_contig = inputs_contig.back();
+    auto input_shape_array_ref = input_contig.sizes();
+    ait_inputs[ait_input_idx] = AITData{
+        input_contig.data_ptr(),
+        AITemplateParamShape{
+            input_shape_array_ref.data(), input_shape_array_ref.size()},
+        TorchDtypeToAITemplateDtype(input.scalar_type())};
+  }
+  return ait_inputs;
+}
+
+std::vector<torch::Tensor> AITModelImpl::forward(
+    std::vector<torch::Tensor>& inputs) {
+  RECORD_USER_SCOPE("AITModel::Forward");
+  TORCH_CHECK(!inputs.empty());
+  const auto device = inputs[0].device();
+
+  // Process inputs
+  std::vector<torch::Tensor> inputs_contig;
+  std::vector<AITData> ait_inputs = processInputs(inputs, inputs_contig);
+
+  // Allocate outputs
+  std::vector<c10::intrusive_ptr<c10::StorageImpl>>
+      output_index_to_output_storage_impl;
+  std::vector<AITData> ait_outputs;
+  std::vector<std::vector<int64_t>> output_shapes;
+  std::vector<int64_t*> output_shape_ptrs;
+  allocateOutputs(
+      output_index_to_output_storage_impl,
+      ait_outputs,
+      output_shapes,
+      output_shape_ptrs,
+      device);
+
+  std::vector<torch::Tensor> outputs;
+  {
+    const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
+    const auto stream_id = cuda_stream.stream();
+    // TODO: remove casting after fixing API
+    AITemplateStreamHandle stream_handle =
+        reinterpret_cast<AITemplateStreamHandle>(stream_id);
+    RECORD_USER_SCOPE("AITModel::AITRuntime");
+    if (runFunc_(
+            model_handle_,
+            ait_inputs.data(),
+            ait_inputs.size(),
+            ait_outputs.data(),
+            ait_outputs.size(),
+            /* stream = */ stream_handle,
+            /* sync = */ false,
+            use_cuda_graph_,
+            output_shape_ptrs.data()) != AITemplateError::AITemplateSuccess) {
+      std::stringstream ss;
+      ss << "AITModel run failed with input spec: ";
+      for (const auto& i : inputs) {
+        ss << i.sizes() << ":" << i.dtype() << ", ";
+      }
+      TORCH_CHECK(false, ss.str());
+    }
+
+    // Process outputs
+    outputs =
+        processOutputs(output_index_to_output_storage_impl, output_shapes);
+  }
+  return outputs;
+}
+
+void AITModelImpl::profile(
+    std::vector<torch::Tensor>& inputs,
+    const std::string& filename,
+    size_t num_iters) {
+  TORCH_CHECK(!inputs.empty());
+  TORCH_CHECK(
+      profileFunc_,
+      "Check whether the loaded AITModelContainer.so contains Profile().");
+  const auto device = inputs[0].device();
+
+  // Process inputs
+  std::vector<torch::Tensor> inputs_contig;
+  std::vector<AITData> ait_inputs = processInputs(inputs, inputs_contig);
+
+  // Allocate outputs
+  std::vector<c10::intrusive_ptr<c10::StorageImpl>>
+      output_index_to_output_storage_impl;
+  std::vector<AITData> ait_outputs;
+  std::vector<std::vector<int64_t>> output_shapes;
+  std::vector<int64_t*> output_shape_ptrs;
+  allocateOutputs(
+      output_index_to_output_storage_impl,
+      ait_outputs,
+      output_shapes,
+      output_shape_ptrs,
+      device);
+
+  {
+    const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
+    const auto stream_id = cuda_stream.stream();
+    // TODO: remove casting after fixing API
+    AITemplateStreamHandle stream_handle =
+        reinterpret_cast<AITemplateStreamHandle>(stream_id);
+    if (profileFunc_(
+            model_handle_,
+            ait_inputs.data(),
+            ait_inputs.size(),
+            ait_outputs.data(),
+            ait_outputs.size(),
+            /* stream = */ stream_handle,
+            num_iters,
+            filename.c_str()) != AITemplateError::AITemplateSuccess) {
+      std::stringstream ss;
+      ss << "AITModel profile failed with input spec: ";
+      for (const auto& i : inputs) {
+        ss << i.sizes() << ":" << i.dtype() << ", ";
+      }
+      TORCH_CHECK(false, ss.str());
+    }
+  }
+}
+
+thread_local std::unordered_map<std::string, std::string>
+    AITModelImpl::name_to_path_map_;
+
+void AITModelImpl::registerLibraryNameToPathMap(
+    std::unordered_map<std::string, std::string> map) {
+  std::ostringstream ss;
+  ss << "{\n";
+  for (const auto& [k, v] : map) {
+    ss << "  " << k << " => " << v << ",\n";
+  }
+  ss << "}";
+
+  LOG(INFO) << "Registering .so lib paths: " << ss.str();
+  name_to_path_map_ = std::move(map);
+}
+
+const std::string& AITModelImpl::getFullPathForLibraryName(
+    const std::string& name) {
+  const std::string* path = nullptr;
+#ifdef FBCODE_AIT
+  path = folly::get_ptr(name_to_path_map_, name);
+#else
+  auto it = name_to_path_map_.find(name);
+  if (it != name_to_path_map_.end()) {
+    path = &(it->second);
+  }
+#endif
+  std::ostringstream ss;
+  ss << "{\n";
+  for (const auto& [k, v] : name_to_path_map_) {
+    ss << "  " << k << " => " << v << ",\n";
+  }
+  ss << "}";
+  TORCH_CHECK(
+      path != nullptr,
+      "could not find full path for AITemplate model .so named ",
+      name,
+      ". available paths: ",
+      ss.str());
+  return *path;
+}
+} // namespace torch::aitemplate
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.h b/fx2ait/fx2ait/csrc/AITModelImpl.h
new file mode 100644
index 000000000..14f992422
--- /dev/null
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include "model_interface.h" // @manual=//aitemplate/AITemplate/static/include:aitemplate
+
+#include <dlfcn.h>
+#include <torch/torch.h> // @manual=//caffe2:torch-cpp
+#include <memory>
+#include <optional>
+
+#ifdef FBCODE_AIT
+#include "folly/container/F14Map.h"
+#endif
+
+namespace torch::aitemplate {
+
+class AITemplatePyTorchCachingAllocator : public AITemplateAllocator {
+ public:
+  AITemplatePyTorchCachingAllocator();
+  void* Allocate(size_t num_bytes) override;
+  void Free(void* ptr) override;
+
+ private:
+  c10::Allocator* cuda_allocator_;
+};
+
+class AITModelImpl {
+ public:
+  explicit AITModelImpl(
+      const std::string& model_path,
+      std::vector<std::string> input_names,
+      std::vector<std::string> output_names,
+      c10::optional<at::ScalarType> input_dtype,
+      c10::optional<at::ScalarType> output_dtype,
+      int64_t num_runtimes = 2,
+      bool use_cuda_graph = false);
+
+  ~AITModelImpl() {
+    if (model_handle_) {
+      deleteFunc_(model_handle_);
+    }
+  }
+
+  std::vector<torch::Tensor> forward(std::vector<torch::Tensor>& inputs);
+
+  void profile(
+      std::vector<torch::Tensor>& inputs,
+      const std::string& filename,
+      size_t num_iters);
+
+  // If we need to move or copy this object, then we should just
+  // define a unique_ptr with deleter for the handle.
+  AITModelImpl(const AITModelImpl&) = delete;
+  AITModelImpl& operator=(const AITModelImpl&) = delete;
+
+  static void registerLibraryNameToPathMap(
+      std::unordered_map<std::string, std::string> map);
+
+  static const std::string& getFullPathForLibraryName(const std::string& name);
+
+  /*
+   * Returns a path to .so file (either relative or absolute).
+   */
+  const std::string& libraryPath() const {
+    return library_path_;
+  }
+
+  void setUseCudaGraph(bool use_cuda_graph) {
+    use_cuda_graph_ = use_cuda_graph;
+  }
+
+  bool getUseCudaGraph() const {
+    return use_cuda_graph_;
+  }
+
+  const std::string& libraryBasename() const {
+    return library_basename_;
+  }
+
+  const std::vector<std::string>& inputNames() const {
+    return input_names_;
+  }
+
+  const std::vector<std::string>& outputNames() const {
+    return output_names_;
+  }
+
+  const c10::optional<at::ScalarType> floatingPointInputDtype() const {
+    return floating_point_input_dtype_;
+  }
+
+  const c10::optional<at::ScalarType> floatingPointOutputDtype() const {
+    return floating_point_output_dtype_;
+  }
+
+ private:
+  // @lint-ignore CLANGTIDY facebook-hte-NonPodStaticDeclaration
+  static thread_local std::unordered_map<std::string, std::string>
+      name_to_path_map_;
+
+  struct DlcloseDeleter {
+    void operator()(void* p) const {
+      if (p) {
+        dlclose(p);
+      }
+    }
+  };
+
+  std::vector<AITData> processInputs(
+      std::vector<torch::Tensor>& inputs,
+      std::vector<torch::Tensor>& inputs_contig);
+
+  std::vector<torch::Tensor> processOutputs(
+      std::vector<c10::intrusive_ptr<c10::StorageImpl>>&
+          output_index_to_output_storage_impl,
+      const std::vector<std::vector<int64_t>>& output_shapes);
+
+  void allocateOutputs(
+      std::vector<c10::intrusive_ptr<c10::StorageImpl>>&
+          output_index_to_output_storage_impl,
+      std::vector<AITData>& ait_outputs,
+      std::vector<std::vector<int64_t>>& output_shapes,
+      std::vector<int64_t*>& output_shape_ptrs,
+      const c10::Device& device);
+
+  const std::unique_ptr<void, DlcloseDeleter> handle_ = nullptr;
+  AITemplateModelHandle model_handle_;
+
+  decltype(&AITemplateModelContainerDelete) deleteFunc_ = nullptr;
+  decltype(&AITemplateModelContainerRun) runFunc_ = nullptr;
+  decltype(&AITemplateModelContainerProfile) profileFunc_ = nullptr;
+  decltype(&AITemplateModelContainerGetOutputName) getOutputNameFunc_ = nullptr;
+  decltype(&AITemplateModelContainerGetMaximumOutputShape)
+      getMaximumOutputShapeFunc_ = nullptr;
+  decltype(&AITemplateModelContainerGetOutputDtype) getOutputDtypeFunc_ =
+      nullptr;
+
+  const std::string library_basename_;
+  const std::string library_path_;
+  const std::vector<std::string> input_names_;
+  const std::vector<std::string> output_names_;
+  const c10::optional<at::ScalarType> floating_point_input_dtype_;
+  const c10::optional<at::ScalarType> floating_point_output_dtype_;
+#ifdef FBCODE_AIT
+  folly::F14FastMap<const char*, size_t> input_name_to_index_;
+  folly::F14FastMap<const char*, size_t> output_name_to_index_;
+#else
+  std::unordered_map<std::string, size_t> input_name_to_index_;
+  std::unordered_map<std::string, size_t> output_name_to_index_;
+#endif
+
+  // Whether to use CUDA graph when launching the model. Defaults to
+  // FLAGS_ait_model_enable_cuda_graph, but can be overridden by
+  // setUseCudaGraph().
+  bool use_cuda_graph_;
+
+  AITemplatePyTorchCachingAllocator allocator_;
+};
+} // namespace torch::aitemplate
diff --git a/fx2ait/fx2ait/csrc/TARGETS b/fx2ait/fx2ait/csrc/TARGETS
new file mode 100644
index 000000000..88893b1f7
--- /dev/null
+++ b/fx2ait/fx2ait/csrc/TARGETS
@@ -0,0 +1,29 @@
+load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
+
+oncall("aitemplate")
+
+cpp_library(
+    name = "AITModelImpl",
+    srcs = ["AITModelImpl.cpp"],
+    headers = ["AITModelImpl.h"],
+    propagated_pp_flags = [
+        "-DFBCODE_AIT",
+        "-Iaitemplate/AITemplate/static/include",
+    ],
+    supports_python_dlopen = True,
+    deps = [
+        "//caffe2:ATen-cu",
+        "//caffe2/c10:c10",
+        "//caffe2/c10:c10_cuda",
+        "//folly:map_util",
+    ],
+    exported_deps = [
+        "//aitemplate/AITemplate/static/include:aitemplate",  # @manual
+        "//caffe2:ATen-cu",
+        "//caffe2:torch-cpp",
+        "//folly/container:f14_hash",
+    ],
+    exported_external_deps = [
+        ("glibc", None, "dl"),
+    ],
+)
diff --git a/fx2ait/fx2ait/example/01_transformer_model/README.md b/fx2ait/fx2ait/example/01_transformer_model/README.md
new file mode 100644
index 000000000..4baa06f5c
--- /dev/null
+++ b/fx2ait/fx2ait/example/01_transformer_model/README.md
@@ -0,0 +1,76 @@
+# Transfomer encoder
+
+In this example, we will demo how to use FX2AIT for inference on the transformer encoder block from pytorch.
+
+## Code structure
+```
+test_transformer_encoder.py     # Transformer encoder block definition using torch API
+../benchmark_utils.py           # Accuracy verification and Benchmark code for FX2AIT
+```
+
+## How to Use
+FX2AIT allows users to directly define a torch model, while fx2ait converter does the conversion for the usage.
+Therefore the encoder can be defined normally as
+```
+        class EncoderBlock(torch.nn.Module):
+            def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
+                super().__init__()
+                # Attention layer
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=input_dim,
+                    num_heads=num_heads,
+                    batch_first=True,
+                )
+                # # Two-layer MLP
+                self.linear_net = torch.nn.Sequential(
+                    torch.nn.Linear(input_dim, dim_feedforward),
+                    torch.nn.Dropout(dropout),
+                    torch.nn.ReLU(inplace=True),
+                    torch.nn.Linear(dim_feedforward, input_dim),
+                )
+                # Layers to apply in between the main layers
+                self.norm1 = torch.nn.LayerNorm(input_dim)
+                self.norm2 = torch.nn.LayerNorm(input_dim)
+                self.dropout = torch.nn.Dropout(dropout)
+
+            def forward(self, x):
+                # Attention part
+                attn_out, _ = self.attn(query=x, key=x, value=x)
+                # return attn_out
+                x = x + self.dropout(attn_out)
+                x = self.norm1(x)
+
+                # MLP part
+                linear_out = self.linear_net(x)
+                x = x + self.dropout(linear_out)
+                x = self.norm2(x)
+
+                return x
+```
+To run the test and benchmark,
+```
+python fx2ait/fx2ait/example/01_transformer_model/test_transformer_encoder.py
+```
+
+## Reference Speed vs PyTorch Eager
+
+### A100-40GB / CUDA 11.6.2
+_PT = PyTorch 1.12 Eager_
+
+| Batch size | PT Latency (s) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | Speedup    |
+|------------|----------------|---------------|------------------|----------------|------------|
+|          1 |     0.00043845 |    2280.75893 |        0.0001806 |     5537.09872 | 2.42774396 |
+|          8 |     0.00047376 |    8443.01343 |        0.0002221 |     18009.9959 | 2.13312416 |
+|         16 |     0.00085377 |    18740.4255 |       0.00050193 |     31876.7364 | 1.90096119 |
+|         32 |     0.00150154 |    21311.3919 |       0.00069578 |     45991.3908 | 2.15806602 |
+|         64 |     0.00296888 |    21556.9773 |       0.00138113 |     46338.7065 | 2.14959202 |
+|        128 |     0.00530519 |    24127.3232 |       0.00261813 |     48889.8245 | 2.02632609 |
+|        256 |     0.01015745 |    25203.1791 |       0.00516545 |     49560.0242 | 1.96641955 |
+|        512 |     0.02023099 |    25307.7086 |       0.01034528 |     49491.1828 | 1.95557739 |
+
+
+
+### Note for Performance Results
+
+- For NVIDIA A100, our test cluster doesn't allow us to lock frequency. We make warm up longer to collect more stable results, but it is expected to have small variance to the results with locked frequency.
+- Performance results are what we can reproduce and for reference only. It should not be used for other purposes.
diff --git a/fx2ait/fx2ait/example/01_transformer_model/test_transformer_encoder.py b/fx2ait/fx2ait/example/01_transformer_model/test_transformer_encoder.py
new file mode 100644
index 000000000..096700dc3
--- /dev/null
+++ b/fx2ait/fx2ait/example/01_transformer_model/test_transformer_encoder.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from fx2ait.example.benchmark_utils import benchmark_function, verify_accuracy
+
+
+class TestTransformerModule(unittest.TestCase):
+    def test_transformer_encoder(self):
+        class EncoderBlock(torch.nn.Module):
+            def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
+                """
+                Inputs:
+                    input_dim - Dimensionality of the input
+                    num_heads - Number of heads to use in the attention block
+                    dim_feedforward - Dimensionality of the hidden layer in the MLP
+                    dropout - Dropout probability to use in the dropout layers
+                """
+                super().__init__()
+                # Attention layer
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=input_dim,
+                    num_heads=num_heads,
+                    batch_first=True,
+                )
+                # # Two-layer MLP
+                self.linear_net = torch.nn.Sequential(
+                    torch.nn.Linear(input_dim, dim_feedforward),
+                    torch.nn.Dropout(dropout),
+                    torch.nn.ReLU(inplace=True),
+                    torch.nn.Linear(dim_feedforward, input_dim),
+                )
+                # Layers to apply in between the main layers
+                self.norm1 = torch.nn.LayerNorm(input_dim)
+                self.norm2 = torch.nn.LayerNorm(input_dim)
+                self.dropout = torch.nn.Dropout(dropout)
+
+            def forward(self, x):
+                # Attention part
+                attn_out, _ = self.attn(query=x, key=x, value=x)
+                # return attn_out
+                x = x + self.dropout(attn_out)
+                x = self.norm1(x)
+
+                # MLP part
+                linear_out = self.linear_net(x)
+                x = x + self.dropout(linear_out)
+                x = self.norm2(x)
+
+                return x
+
+        model = (
+            EncoderBlock(input_dim=768, num_heads=12, dim_feedforward=3072)
+            .cuda()
+            .half()
+        )
+
+        inputs = [torch.randn(10, 196, 768).half().cuda()]
+        verify_accuracy(model, inputs)
+
+        results = []
+        for batch_size in [1, 4, 16, 32, 64, 128, 256, 512]:
+            inputs = [torch.randn(batch_size, 196, 768).half().cuda()]
+            results.append(
+                benchmark_function(self.__class__.__name__, 100, model, inputs)
+            )
+        for res in results:
+            print(res)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/fx2ait/fx2ait/example/02_vision_model/README.md b/fx2ait/fx2ait/example/02_vision_model/README.md
new file mode 100644
index 000000000..8245b7cfd
--- /dev/null
+++ b/fx2ait/fx2ait/example/02_vision_model/README.md
@@ -0,0 +1,51 @@
+# ResNet-18
+
+In this example, we will demo how to use FX2AIT for inference on the ResNet-18 model from torchvision.
+
+## Code structure
+```
+test_vision_model.py            # ResNet definition using torch API
+../benchmark_utils.py           # Accuracy verification and Benchmark code for FX2AIT
+```
+
+## How to Use
+FX2AIT allows users to directly define a torch model, while fx2ait converter does the conversion for the usage.
+Therefore the definition of model is as simple as
+```
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = torchvision.models.resnet18()
+
+            def forward(self, x):
+                return self.mod(x)
+```
+Notice that because AIT supports channel last, while pytorch supports channel first operation, we need to permute the input
+```
+inputs = [inp.permute([0, 2, 3, 1]).contiguous() for inp in inputs]
+``
+To run the test and benchmark,
+```
+python fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
+```
+
+## Reference Speed vs PyTorch Eager
+
+### A100-40GB / CUDA 11.6.2
+_PT = PyTorch 1.12 Eager_
+
+| Batch size | PT Latency (s) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | Speedup    |
+|------------|----------------|---------------|------------------|----------------|------------|
+|          1 |     0.00349264 |    286.316562 |       0.00052888 |     1890.78465 | 6.60382564 |
+|          8 |     0.00382057 |    2093.93053 |        0.0007766 |     10301.2714 | 4.91958606 |
+|         16 |     0.00351062 |    4557.59936 |       0.00098235 |     16287.4093 | 3.57368167 |
+|         32 |     0.00321071 |    9966.64244 |       0.00166504 |     19218.8053 | 1.92831291 |
+|        256 |     0.01670636 |    15323.5057 |       0.01181243 |     21672.0808 | 1.41430305 |
+|        512 |     0.03276252 |    15627.6137 |       0.02347752 |     21808.0915 | 1.39548442 |
+
+
+
+### Note for Performance Results
+
+- For NVIDIA A100, our test cluster doesn't allow us to lock frequency. We make warm up longer to collect more stable results, but it is expected to have small variance to the results with locked frequency.
+- Performance results are what we can reproduce and for reference only. It should not be used for other purposes.
diff --git a/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py b/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
new file mode 100644
index 000000000..1ca633b41
--- /dev/null
+++ b/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
@@ -0,0 +1,59 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+import torchvision
+from fx2ait.example.benchmark_utils import benchmark_function, verify_accuracy
+
+
+class TestResNet(unittest.TestCase):
+    def test_resnet18(self):
+        torch.manual_seed(0)
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = torchvision.models.resnet18()
+
+            def forward(self, x):
+                return self.mod(x)
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(32, 3, 224, 224).half().cuda()]
+        verify_accuracy(
+            model,
+            inputs,
+            permute_inputs=[0, 2, 3, 1],
+        )
+        results = []
+        for batch_size in [1, 8, 16, 32, 256, 512]:
+            inputs = [torch.randn(batch_size, 3, 224, 224).half().cuda()]
+            results.append(
+                benchmark_function(
+                    self.__class__.__name__,
+                    100,
+                    model,
+                    inputs,
+                    permute_inputs=[0, 2, 3, 1],
+                )
+            )
+        for res in results:
+            print(res)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/fx2ait/fx2ait/example/03_lowering_split/README.md b/fx2ait/fx2ait/example/03_lowering_split/README.md
new file mode 100644
index 000000000..1f892e054
--- /dev/null
+++ b/fx2ait/fx2ait/example/03_lowering_split/README.md
@@ -0,0 +1,90 @@
+# AIT Lowerer
+Now let's go back the example of encoder. Imagine the cases that
+1) You want to test the effect of a particular op, say MultiheadAttention on the entire module.
+2) There is some special op that AIT doesn't support.
+AIT actually provide an **automatic** Lowerer to split the graph into subgraphs and run interpreter,
+so that AIT only run the part it can handle and leave other to AITemplate.
+
+In this example, we will demo how to use AitLowerer for inference on any models.
+
+## Code structure
+```
+test_lowerr.py                  # Splited transformer encoder block to illustrate the usage of AitLowerer.
+../benchmark_utils.py           # Accuracy verification and Benchmark code for FX2AIT
+../lower/
+        lower.py                # Lower interface, which integrates lowering passes of Split subgraph and AIT Interpreter
+        ait_splitter.py         # Splitter to split graph into submodules
+        ait_setting.py          # Lowering settings
+
+```
+
+## How to Use
+To skip an operation can be extremely easy. One just need to register in the method function `@torch.fx.wrap`
+```
+@torch.fx.wrap
+def unsupported_attention_op(f, x):
+    attn_out, _ = f(x, x, x)
+    return attn_out
+```
+Then at forward stage, call the function.
+```
+        class LowerModule(torch.nn.Module):
+            def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
+                super().__init__()
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=input_dim,
+                    num_heads=num_heads,
+                    batch_first=True,
+                )
+                ...
+
+            def forward(self, x):
+                # Unsupported op will not be lowered to AIT backend.
+                attn_out = unsupported_attention_op(self.attn, x)
+                ...
+```
+Then AIT won't deal with that part.
+```
+        lowerer = AitLowerer.create(
+            LowerSettings(
+                workdir="/tmp",
+                name="test_ait_lower",
+                min_acc_module_size=0,
+            )
+        )
+        lowered = lowerer(model, inputs)
+        lower_output = lowered(*inputs)
+```
+The mechanism is that Acc tracer allows user to register wrap function so that Acc won't deal with it.
+Then our splitter will split the them into subgraph: _run_on_gpu_0 for pytorch Eager mode and _run_on_acc_1 for AIT,
+where _run_on_gpu_0 contains torch.nn.MultiheadAttention and _run_on_acc_1 contains the rest of the model.
+Finally, interpreter will be called for the AIT subgraph. (_run_on_acc_1)
+
+*Notice that our splitter only split subgraphs with more than 10 ops, since otherwise the subgraph is too small.*
+
+To run the test and benchmark,
+```
+python fx2ait/fx2ait/example/03_lowering_split/test_lower.py
+```
+
+## Reference Speed vs PyTorch Eager
+
+### A100-40GB / CUDA 11.6.2
+_PT = PyTorch 1.12 Eager_
+
+| Batch size | PT Latency (s) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | Speedup    |
+|------------|----------------|---------------|------------------|----------------|------------|
+| 1          | 0.00065761     | 1520.66428    | 0.00076476       | 1307.59981     | 0.85988724 |
+| 4          | 0.00090687     | 4410.77681    | 0.00079056       | 5059.68597     | 1.14711902 |
+| 16         | 0.00249116     | 6422.69897    | 0.00200686       | 7972.66574     | 1.24132639 |
+| 32         | 0.00473638     | 6756.209      | 0.00396992       | 8060.62008     | 1.19306849 |
+| 64         | 0.00914742     | 6996.51201    | 0.00754977       | 8477.07749     | 1.2116148  |
+| 128        | 0.0178672      | 7163.96537    | 0.01501702       | 8523.66305     | 1.1897968  |
+| 256        | 0.03554306     | 7202.53192    | 0.02998132       | 8538.65123     | 1.18550689 |
+| 512        | 0.07118476     | 7192.55069    | 0.06006168       | 8524.56943     | 1.18519421 |
+
+From the example, we learn without AIT's multihead attention module, the speedup will be degraded to 1.2x compared to Pytorch Eager.
+
+### Note for Performance Results
+- For NVIDIA A100, our test cluster doesn't allow us to lock frequency. We make warm up longer to collect more stable results, but it is expected to have small variance to the results with locked frequency.
+- Performance results are what we can reproduce and for reference only. It should not be used for other purposes.
diff --git a/fx2ait/fx2ait/example/03_lowering_split/test_lower.py b/fx2ait/fx2ait/example/03_lowering_split/test_lower.py
new file mode 100644
index 000000000..d4da616d6
--- /dev/null
+++ b/fx2ait/fx2ait/example/03_lowering_split/test_lower.py
@@ -0,0 +1,107 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from fx2ait.example.benchmark_utils import benchmark_function
+from fx2ait.lower.lower import AitLowerer
+from fx2ait.lower.lower_settings import LowerSettings
+
+
+@torch.fx.wrap
+def unsupported_attention_op(f, x):
+    attn_out, _ = f(x, x, x)
+    return attn_out
+
+
+class TestFx2aitLowerTests(unittest.TestCase):
+    def test_ait_lower(self):
+        class LowerModule(torch.nn.Module):
+            def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
+                super().__init__()
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=input_dim,
+                    num_heads=num_heads,
+                    batch_first=True,
+                )
+                self.linear_net = torch.nn.Sequential(
+                    torch.nn.Linear(input_dim, dim_feedforward),
+                    torch.nn.Dropout(dropout),
+                    torch.nn.ReLU(inplace=True),
+                    torch.nn.Linear(dim_feedforward, dim_feedforward),
+                    torch.nn.Linear(dim_feedforward, dim_feedforward),
+                    torch.nn.Linear(dim_feedforward, dim_feedforward),
+                    torch.nn.Linear(dim_feedforward, dim_feedforward),
+                    torch.nn.Linear(dim_feedforward, dim_feedforward),
+                    torch.nn.Linear(dim_feedforward, input_dim),
+                )
+                self.norm1 = torch.nn.LayerNorm(input_dim)
+                self.norm2 = torch.nn.LayerNorm(input_dim)
+                self.dropout = torch.nn.Dropout(dropout)
+
+            def forward(self, x):
+                # Unsupported op will not be lowered to AIT backend.
+                attn_out = unsupported_attention_op(self.attn, x)
+                # attn_out, _ = self.attn(x,x,x)
+                x = x + self.dropout(attn_out)
+                x = self.norm1(x)
+
+                linear_out = self.linear_net(x)
+                x = x + self.dropout(linear_out)
+                x = self.norm2(x)
+
+                return x
+
+        model = (
+            LowerModule(input_dim=768, num_heads=12, dim_feedforward=3072).cuda().half()
+        )
+
+        inputs = [torch.randn(10, 196, 768).half().cuda()]
+
+        ref_output = model(*inputs)
+        lowerer = AitLowerer.create(
+            LowerSettings(
+                workdir="/tmp",
+                name="test_ait_lower",
+                min_acc_module_size=0,
+            )
+        )
+        lowered = lowerer(model, inputs)
+        lower_output = lowered(*inputs)
+
+        # Check accuracy
+        torch.testing.assert_close(
+            ref_output, lower_output, check_dtype=False, atol=1e-2, rtol=1e-2
+        )
+        # Expect 2 submodules in target model, one is run_on_acc and another run_on_gpu
+        children = list(lowered.named_children())
+        self.assertEqual(len(children), 2)
+
+        results = []
+        for batch_size in [1, 4, 16, 32, 64, 128, 256, 512]:
+            inputs = [torch.randn(batch_size, 196, 768).half().cuda()]
+            lowered = lowerer(model, inputs)
+            results.append(
+                benchmark_function(
+                    self.__class__.__name__, 100, model, inputs, ait_mod=lowered
+                )
+            )
+        for res in results:
+            print(res)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/fx2ait/fx2ait/example/benchmark_utils.py b/fx2ait/fx2ait/example/benchmark_utils.py
new file mode 100644
index 000000000..bd2b8379f
--- /dev/null
+++ b/fx2ait/fx2ait/example/benchmark_utils.py
@@ -0,0 +1,197 @@
+import time
+
+import uuid
+from typing import List, Optional
+
+import torch
+from fx2ait.acc_tracer import acc_tracer
+
+from fx2ait.ait_module import AITModule
+
+from fx2ait.fx2ait import AITInterpreter
+
+torch.ops.load_library("build/libait_model.so")
+
+
+def verify_accuracy(
+    mod: torch.nn.Module,
+    inputs: List[torch.Tensor],
+    rtol: float = 1e-01,
+    atol: float = 1e-01,
+    permute_inputs: Optional[List[int]] = None,
+    permute_outputs: Optional[List[int]] = None,
+):
+    # TODO: add precision to interpreter once AIT supports multiple precision level
+    # TODO: @qxy11 remove permute options once AIT supports channels-first format
+    mod.eval()
+    mod = acc_tracer.trace(
+        mod,
+        inputs,
+    )
+    print(mod)
+
+    original_inputs = inputs
+    if permute_inputs:
+        inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+    interp = AITInterpreter(
+        mod,
+        inputs,
+        "/tmp",
+        f"test-fx2ait-{uuid.uuid1()}",
+    )
+    with torch.no_grad():
+        cuda_inputs = []
+        for i in inputs:
+            cuda_inputs.append(i.cuda())
+
+        mod.eval()
+
+        start = time.perf_counter()
+        interp_result = interp.run()
+        sec = time.perf_counter() - start
+        print("Interpreter run time(s):", sec)
+        ait_mod = AITModule(
+            torch.classes.ait.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  #  num_runtimes
+            )
+        )
+
+        ref_outputs = mod(*original_inputs)
+
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        outputs = ait_mod(*cuda_inputs)
+        end_event.record()
+        torch.cuda.synchronize()
+        print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+
+        if isinstance(outputs, torch.Tensor):
+            ref_outputs = [ref_outputs]
+            outputs = [outputs]
+
+        for out, ref in zip(outputs, ref_outputs):
+            if not isinstance(ref, torch.Tensor):
+                ref = torch.tensor([ref])
+            ref = ref.cpu()  # to_dtype test has cases with gpu output
+            if permute_outputs:
+                out = out.permute(*permute_outputs)
+            print(out)
+            print(ref)
+            torch.testing.assert_close(
+                out.cpu(),
+                ref,
+                rtol=rtol,
+                atol=atol,
+                check_dtype=False,
+                equal_nan=True,
+            )
+
+
+def benchmark_function(
+    name: str,
+    iters: int,
+    mod: torch.nn.Module,
+    inputs: List[torch.Tensor],
+    permute_inputs: Optional[List[int]] = None,
+    ait_mod: torch.nn.Module = None,
+) -> float:
+
+    mod.eval()
+    original_inputs = inputs
+    if permute_inputs:
+        inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+
+    if ait_mod is None:
+        mod = acc_tracer.trace(
+            mod,
+            original_inputs,
+        )
+        interp = AITInterpreter(
+            mod,
+            inputs,
+            "/tmp",
+            f"benchmark-fx2ait-{uuid.uuid1()}",
+        )
+
+        interp_result = interp.run()
+        ait_mod = AITModule(
+            torch.classes.ait.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  #  num_runtimes
+            )
+        )
+
+    def benchmark(f, args):
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        print("== Start benchmark iterations")
+        with torch.inference_mode():
+            start_event.record()
+            for _ in range(iters):
+                f(*args)
+            end_event.record()
+        torch.cuda.synchronize()
+        print("== End benchmark iterations")
+        time_per_iter_ms = (start_event.elapsed_time(end_event) * 1.0e-3) / iters
+        return time_per_iter_ms
+
+    with torch.inference_mode():
+        # Benchmark Pytorch Eager
+        # warmup
+        for _ in range(10):
+            mod(*original_inputs)
+        batch_size = inputs[0].shape[0]
+        pt_time_per_iter_ms = benchmark(mod, original_inputs)
+        pt_qps = batch_size / pt_time_per_iter_ms
+
+        # Benchmark FX2AIT
+        cuda_inputs = []
+        for i in inputs:
+            cuda_inputs.append(i.cuda())
+        # warmup
+        for _ in range(10):
+            ait_mod(*cuda_inputs)
+
+        ait_time_per_iter_ms = benchmark(ait_mod, cuda_inputs)
+        ait_qps = batch_size / ait_time_per_iter_ms
+
+        result = (
+            f"== Benchmark Result for: {name}\n"
+            f"BS: {batch_size}, "
+            f"PT Eager time per iter: {pt_time_per_iter_ms}ms, "
+            f"PT Eager QPS: {pt_qps:.2f}, "
+            f"FX2AIT time per iter: {ait_time_per_iter_ms}ms, "
+            f"FX2AIT Eager QPS: {ait_qps:.2f}, "
+            f"Speedup: {ait_qps/pt_qps:.2f}, "
+        )
+        with open("/tmp/bench_" + name + ".csv", "a") as f:
+            f.write(
+                ",".join(
+                    map(
+                        str,
+                        [
+                            name,
+                            batch_size,
+                            pt_time_per_iter_ms,
+                            pt_qps,
+                            ait_time_per_iter_ms,
+                            ait_qps,
+                            ait_qps / pt_qps,
+                        ],
+                    )
+                )
+                + "\n"
+            )
+        return result
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
new file mode 100644
index 000000000..50b4fcf9a
--- /dev/null
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -0,0 +1,321 @@
+import io
+import logging
+import os
+import tempfile
+import warnings
+from datetime import datetime
+from typing import Any, Dict, List, NamedTuple, Optional, Sequence
+
+import fx2ait.cache as cache
+
+import torch
+
+# @manual=//aitemplate/AITemplate/python/aitemplate:aitemplate
+from aitemplate.testing import detect_target
+from .converters.ait_converters import *  # isort:skip # noqa: F401 F403
+from .converters.aten2ait_converters import *  # isort:skip # noqa: F401 F403
+from aitemplate.compiler import compile_model
+from aitemplate.compiler.base import _TorchConstantTensorData
+from aitemplate.compiler.public import DynamicProfileStrategy, Tensor as AITTensor
+from aitemplate.utils.serialization.serdes_code import dump_program, get_program
+from torch.fx.node import _get_qualified_name
+from torch.fx.passes.split_utils import getattr_recursive
+
+from .converters.converter_registry import AIT_CONVERTERS
+from .tensor_spec import TensorSpec
+
+from .utils import dtype_to_str, make_str_ait_friendly
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+class AITInterpreterResult(NamedTuple):
+    engine: Any
+    input_names: Sequence[str]
+    output_names: Sequence[str]
+
+
+class AITInterpreter(torch.fx.Interpreter):
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        input_specs: List[TensorSpec],
+        workdir: str,
+        name: str,
+        dll_name: str = "test.so",
+        dynamic_profile_strategy=DynamicProfileStrategy.MAX,
+        profile_devs=None,
+        use_fp16_acc=True,
+        dump_ait_dir: Optional[str] = None,
+        keep_constants: Optional[bool] = None,
+        load_ait_dir: Optional[str] = None,
+        remote_cache_file_path: Optional[str] = None,
+        save_remote_cache: Optional[bool] = False,
+    ):
+        """
+        Args:
+            module: target module for AITemplate compilation
+            input_specs: sample input for the target module
+            workdir: directory path for store AITemplate generated files
+            name: directory name for store AITemplate generated files
+            dll_name: AITemplate library name
+            dynamic_profile_strategy: A dynamic profiling strategy, used to filter
+            generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+            use_fp16_acc: whether to uses fp16 accumulation for gemm ops.
+            dump_ait_dir: AIT generated file dump location.
+            keep_constants: whether to keep original constants or use random generated constants
+            load_ait_dir: location for existing ait files
+            remote_cache_file_path: AITemplate profiling cache location
+            save_remote_cache: whether to save the updated cache
+        """
+        super().__init__(module)
+
+        missing_ops = self.validate_conversion()
+        if missing_ops:
+            warnings.warn(
+                "Interpretation will fail due to missing operations \n"
+                + "\n".join(f"{i}" for i in missing_ops)
+            )
+
+        self.remote_cache_file_path = remote_cache_file_path
+        self.save_remote_cache: bool = (
+            True if save_remote_cache and self.remote_cache_file_path else False
+        )
+        self.remote_cache_bytes = self._load_profile_cache()
+        if self.save_remote_cache:
+            self.cache_dir = os.path.join(
+                tempfile.mkdtemp(prefix="aitemplate_"), ".aitemplate"
+            )
+            os.environ["CACHE_DIR"] = self.cache_dir
+            _LOGGER.info(f"Set CACHE_DIR to {self.cache_dir}")
+        self.use_fp16_acc = use_fp16_acc
+        self.hardware_target = self._create_target()
+        self.input_specs = input_specs
+        self.input_specs_iter = 0
+        self.workdir = workdir
+        self.name = name
+        self.dll_name = dll_name
+        self.dynamic_profile_strategy = dynamic_profile_strategy
+        self.profile_devs = profile_devs
+
+        self._input_names: List[str] = []
+        self._output_names: List[str] = []
+        self._loaded_params: Dict[str, AITTensor] = {}
+
+        self.dump_ait_dir = dump_ait_dir
+        self.keep_constants = keep_constants
+        self.load_ait_dir = load_ait_dir
+
+    def _create_target(self):
+        """Detect GPU target"""
+        return detect_target(
+            use_fp16_acc=self.use_fp16_acc, remote_cache_bytes=self.remote_cache_bytes
+        )
+
+    def _load_profile_cache(self) -> bytes:
+        """
+        Load AITemplate profile cache if cache file path is provided
+        """
+        if not self.remote_cache_file_path:
+            return
+
+        cache_bytes = io.BytesIO()
+        cache.load_profile_cache(self.remote_cache_file_path, cache_bytes)
+        remote_cache_bytes = cache_bytes.getvalue()
+        _LOGGER.info(
+            f"Loaded profile cache from remote: {self.remote_cache_file_path} with length {len(remote_cache_bytes)}",
+        )
+        return remote_cache_bytes
+
+    def _upload_profile_cache(self, hardware_target) -> None:
+        """
+        Update AITemplate profile cache if cache file path is provided
+        """
+        cache_path = os.path.join(
+            self.cache_dir, hardware_target._get_cache_file_name()
+        )
+        if not self.save_remote_cache or not cache_path:
+            return
+
+        _LOGGER.info(
+            f"Uploading profile cache to remote: {self.remote_cache_file_path}",
+        )
+        cache.save_profile_cache(self.remote_cache_file_path, cache_path)
+        _LOGGER.info(
+            f"Upload AIT cache file to path {self.remote_cache_file_path} completed."
+        )
+
+    def validate_conversion(self):
+        """
+        Validate all node in target module has correspondent AIT converter support.
+        """
+        missing_converter = set()
+
+        for node in self.module.graph.nodes:
+            if node.op == "call_function" and not AIT_CONVERTERS.get(node.target):
+                missing_converter.add(f"{node.op} {_get_qualified_name(node.target)}")
+            elif node.op == "call_method" and not AIT_CONVERTERS.get(node.target):
+                missing_converter.add(f"{node.op} torch.Tensor.{node.target}")
+            elif node.op == "call_module":
+                submod = self.fetch_attr(node.target)
+                submod_type = getattr(submod, "_base_class_origin", type(submod))
+                if not AIT_CONVERTERS.get(submod_type):
+                    missing_converter.add(f"{node.op} {torch.typename(submod_type)}")
+
+        return missing_converter
+
+    def run(self) -> AITInterpreterResult:
+        """
+        Build AITemplate engine
+        Returns:
+        Compiled AITemplate engine packaged as AITInterpreterResult
+        """
+        run_module_start_time = datetime.now()
+        output_tensors = super().run()
+        _LOGGER.info(
+            f"Run Module elapsed time: {datetime.now() - run_module_start_time}"
+        )
+        # FX2AIT name if composed as MODULE_NAME/submodule_name, we put all profile file on
+        # parent dir of submodule_name to share across submodules.
+        profile_dir = (
+            os.path.join(self.workdir, self.name[0 : self.name.rindex("/")])
+            if self.name.find("/") != -1
+            else self.workdir
+        )
+        args = {
+            "tensor": output_tensors,
+            "target": self.hardware_target,
+            "workdir": self.workdir,
+            "test_name": self.name,
+            "profile_devs": self.profile_devs,
+            "dynamic_profiling_strategy": self.dynamic_profile_strategy,
+            "dll_name": self.dll_name,
+            "profile_dir": profile_dir,
+        }
+        if self.dump_ait_dir:
+            dump_ait_path = os.path.join(self.dump_ait_dir, self.name + ".py")
+            random_constants = not self.keep_constants
+            dump_program(
+                output_tensors, dump_ait_path, random_constants=random_constants
+            )
+            _LOGGER.info(f"Dumped AIT model to {dump_ait_path}")
+
+        if self.load_ait_dir:
+            load_ait_path = os.path.join(self.load_ait_dir, self.name + ".py")
+            _LOGGER.info(f"Loaded AIT model from {load_ait_path}")
+            output_tensors, _ = get_program(load_ait_path)
+            if isinstance(output_tensors, AITTensor):
+                output_tensors = (output_tensors,)
+            args["tensor"] = output_tensors
+
+        self.engine = compile_model(**args)
+
+        for i, input_name in enumerate(self._input_names):
+            _LOGGER.info("Set input{}: {}".format(i, input_name))
+
+        if self.engine is None:
+            raise RuntimeError("Engine is missing!")
+
+        if self.save_remote_cache:
+            self._upload_profile_cache(self.hardware_target)
+
+        return AITInterpreterResult(
+            self.engine,
+            self._input_names,
+            self._output_names,
+        )
+
+    def run_node(self, n):
+        self._cur_node_name = str(n)
+        return super().run_node(n)
+
+    def placeholder(self, target, args, kwargs):
+        self._input_names.append(target)
+        input_spec = self.input_specs[self.input_specs_iter]
+        self.input_specs_iter += 1
+
+        return AITTensor(
+            shape=input_spec.shape,
+            dtype=dtype_to_str(input_spec.dtype),
+            name=target,
+            is_input=True,
+        )
+
+    def get_attr(self, target, args, kwargs):
+        attr_val = getattr_recursive(self.module, target)
+
+        if not isinstance(attr_val, (torch.Tensor, torch.nn.Parameter)):
+            raise RuntimeError(f"Unexpected get_attr value for {target}: {attr_val}")
+
+        ait_friendly_name = make_str_ait_friendly(target)
+        ait_dtype = dtype_to_str(attr_val.dtype)
+        ait_val = attr_val.contiguous()
+        if ait_friendly_name in self._loaded_params:
+            existing_tensor = self._loaded_params[ait_friendly_name]
+            assert existing_tensor._attrs["dtype"] == ait_dtype
+            assert existing_tensor._attrs["data"].tensor == ait_val
+            return existing_tensor
+
+        data = _TorchConstantTensorData(ait_val)
+        tensor = AITTensor(
+            shape=attr_val.shape, dtype=ait_dtype, name=ait_friendly_name
+        )
+        tensor._bind_data(data)
+        self._loaded_params[ait_friendly_name] = tensor
+        return tensor
+
+    def call_function(self, target, args, kwargs):
+        converter = AIT_CONVERTERS.get(target)
+
+        if not converter:
+            raise RuntimeError(
+                f"Conversion of function {torch.typename(target)} not currently supported!"
+            )
+
+        assert self._cur_node_name is not None
+        return converter(target, args, kwargs, self._cur_node_name)
+
+    def call_method(self, target, args, kwargs):
+        assert isinstance(target, str)
+        converter = AIT_CONVERTERS.get(target)
+
+        if not converter:
+            raise RuntimeError(
+                f"Conversion of method {target} not currently supported!"
+            )
+
+        assert self._cur_node_name is not None
+        return converter(target, args, kwargs, self._cur_node_name)
+
+    def call_module(self, target, args, kwargs):
+        assert isinstance(target, str)
+        submod = self.fetch_attr(target)
+        submod_type = getattr(submod, "_base_class_origin", type(submod))
+        converter = AIT_CONVERTERS.get(submod_type)
+
+        if not converter:
+            raise RuntimeError(
+                f"Conversion of module of type {submod_type} not currently supported!"
+            )
+
+        assert self._cur_node_name is not None
+        return converter(target, submod, args, kwargs, self._cur_node_name)
+
+    def output(self, target, args, kwargs):
+        assert len(args) == 1
+        if isinstance(args[0], tuple):
+            outputs = args[0]
+        elif isinstance(args[0], list):
+            outputs = tuple(args[0])
+        else:
+            outputs = (args[0],)
+
+        for i, output in enumerate(outputs):
+
+            name = f"output_{i}"
+            output._attrs["name"] = name
+            output._attrs["is_output"] = True
+            self._output_names.append(name)
+
+        return outputs
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
new file mode 100644
index 000000000..047535a48
--- /dev/null
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -0,0 +1,238 @@
+import dataclasses as dc
+import datetime
+import logging
+import operator
+from typing import Any, Callable, List, Optional, Sequence
+
+import fx2ait.acc_tracer.acc_tracer as acc_tracer
+
+import torch
+
+from fx2ait.acc_tracer.ait_acc_normalizer import update_acc_op_mappers_for_ait
+from fx2ait.ait_module import AITModule
+
+from fx2ait.ait_splitter import AITSplitter, AITSplitterSettings
+from fx2ait.fx2ait import AITInterpreter, AITInterpreterResult
+from fx2ait.tensor_spec import TensorSpec
+from torch import fx, nn
+from torch.fx.passes.splitter_base import generate_inputs_for_submodules, SplitResult
+
+from .lower_settings import LowerPrecision, LowerSettings
+
+logger: logging.Logger = logging.getLogger(__name__)
+Input = Sequence[Any]
+
+torch.ops.load_library("build/libait_model.so")
+
+
+# A list of (function, target) pairs to not apply acc normalization
+# to when scripting. For one reason or another, these targets do
+# not play well with TorchScript after normalization.
+SCRIPTING_ACC_NORMALIZATION_BLOCKLIST = {
+    ("call_function", operator.getitem),
+    ("call_method", "to"),
+}
+
+
+@dc.dataclass
+class AitLowerInterpreter:
+    lower_settings: LowerSettings
+
+    @classmethod
+    def create(cls, lower_settings):
+        return AitLowerInterpreter(lower_settings)
+
+    def __call__(
+        self,
+        module_name: str,
+        mod: fx.GraphModule,
+        inputs: List[torch.Tensor],
+    ) -> AITInterpreterResult:
+
+        (additional_inputs,) = self.lower_settings.additional_inputs
+        if additional_inputs is None:
+            input_specs = TensorSpec.from_input_list_with_batch_size(
+                inputs, self.lower_settings.max_batch_size
+            )
+        else:
+            input_specs = TensorSpec.from_two_input_lists(inputs, additional_inputs)
+        logger.info("Input specs: %s", input_specs)
+
+        interpreter = AITInterpreter(
+            module=mod,
+            input_specs=input_specs,
+            workdir=self.lower_settings.workdir,
+            name=f"{self.lower_settings.name}/{module_name}",
+            dll_name=module_name + "-" + self.lower_settings.dll_name,
+            dynamic_profile_strategy=self.lower_settings.dynamic_profile_strategy,
+            profile_devs=self.lower_settings.profile_devs,
+            use_fp16_acc=self.lower_settings.use_fp16_acc,
+            remote_cache_file_path=self.lower_settings.remote_cache_file_path,
+            save_remote_cache=self.lower_settings.save_remote_cache,
+            dump_ait_dir=self.lower_settings.dump_ait_dir,
+            keep_constants=self.lower_settings.keep_constants,
+            load_ait_dir=self.lower_settings.load_ait_dir,
+        )
+
+        interp_result: AITInterpreterResult = interpreter.run()
+
+        return interp_result
+
+
+def create_ait_lower_interpreter(lower_settings: LowerSettings) -> AitLowerInterpreter:
+    return AitLowerInterpreter.create(lower_settings)
+
+
+def default_split_function(
+    model: fx.GraphModule, inputs: Input, lower_settings: LowerSettings
+) -> SplitResult:
+    settings = AITSplitterSettings(
+        min_acc_module_size=lower_settings.min_acc_module_size
+    )
+    splitter = AITSplitter(model, inputs, settings=settings)
+    splitter.node_support_preview()
+    return splitter.generate_split_results()
+
+
+def default_lower_pass(
+    create_ait_interpreter: Callable[[LowerSettings], AitLowerInterpreter],
+) -> Callable:
+    def lower_pass(
+        mod: nn.Module, input: Input, lower_settings: LowerSettings, module_name: str
+    ) -> nn.Module:
+        """
+        Create a module transformation pass which lowers an `nn.Module` into an
+        `AITModule`
+        """
+        interpreter = create_ait_interpreter(lower_settings)
+        interp_res: AITInterpreterResult = interpreter(module_name, mod, input)
+
+        # Return a scriptable module since some use cases need to script the top
+        # level module
+        return AITModule.create_ait_module_wrapper(
+            torch.classes.ait.AITModel(
+                interp_res.engine.lib_path,
+                interp_res.input_names,
+                interp_res.output_names,
+                _precision_to_torch_type(lower_settings.precision),
+                _precision_to_torch_type(lower_settings.output_precision),
+                1,  # num_runtimes
+            ),
+            lower_settings.trace_ait_module,
+            *input,
+        )
+
+    return lower_pass
+
+
+@dc.dataclass(frozen=True)
+class AitLowerer:
+    """Lowers a module using fx2ait.
+
+    This is a composable class to facilitate fx2ait. A normal fx2ait process
+    composes of the following passes to transform an `fx.GraphModule`:
+
+        1. trace - use torch.fx to trace the module so we can get the graph
+            representation of the model.
+        2. split - the graph module is split into several submodules,
+            running either via AITemplate, or via regular CUDA.
+
+    For each split that need to run via AIT, the following passes are
+    invoked:
+
+        3. `AITInterpreter` - build the AIT engine for the submodule that
+            can be supported through `AITInterpreter`.
+        4. Wraps the executable AIT engine into `AITModule`, which is an `nn.Module`.
+        5. The converted submodule is then set back onto the top-level module
+
+    """
+
+    lower_settings: LowerSettings
+    lower_pass: Callable
+    static_deps_initialized: bool = False
+
+    @staticmethod
+    def initialize_static_deps() -> None:
+        if AitLowerer.static_deps_initialized:
+            logger.info("Static deps were initialized already")
+        else:
+            logger.info("Initializing static deps")
+            update_acc_op_mappers_for_ait()
+            AitLowerer.static_deps_initialized = True
+            logger.info("Initialized static deps")
+
+    @classmethod
+    def create(
+        cls,
+        lower_settings: LowerSettings,
+        interpreter_builder: Callable = create_ait_lower_interpreter,
+    ) -> "AitLowerer":
+        """Instantiate an `AitLowerer` instance."""
+        cls.initialize_static_deps()
+
+        return cls(
+            lower_settings=lower_settings,
+            lower_pass=default_lower_pass(create_ait_lower_interpreter),
+        )
+
+    def lower_func(
+        self, split_result: SplitResult, additional_inputs: Optional[Input] = None
+    ) -> nn.Module:
+        if additional_inputs:
+            additional_submodule_inputs = generate_inputs_for_submodules(
+                split_result.split_module,
+                additional_inputs,
+                list(split_result.submodule_inputs.keys()),
+            )
+        else:
+            additional_submodule_inputs = None
+
+        for submod_name, submod_inputs in split_result.submodule_inputs.items():
+            submod = getattr(split_result.split_module, submod_name)
+            # Only acc submodules will be lowered.
+            if not submod_name.startswith(split_result.non_acc_submodule_prefix):
+                logger.info(f"Now lowering submodule {submod_name}")
+                lowering_start_time = datetime.datetime.now()
+
+                self.lower_settings.additional_inputs = (
+                    additional_submodule_inputs[submod_name]
+                    if additional_submodule_inputs
+                    else None,
+                )
+
+                lowered_module = self.lower_pass(
+                    submod, submod_inputs, self.lower_settings, submod_name
+                )
+                setattr(split_result.split_module, submod_name, lowered_module)
+                logger.info(
+                    f"Lowering submodule {submod_name} elapsed time {datetime.datetime.now() - lowering_start_time}"
+                )
+
+        return split_result.split_module
+
+    def __call__(
+        self,
+        module: nn.Module,
+        inputs: Input,
+        additional_inputs: Optional[Input] = None,
+    ) -> nn.Module:
+        module.eval()
+        module = acc_tracer.trace(
+            module, inputs, leaf_module_list=self.lower_settings.leaf_module_list
+        )
+        split_result = default_split_function(module, inputs, self.lower_settings)
+        lower_result = self.lower_func(split_result, additional_inputs)
+
+        return lower_result
+
+
+def _precision_to_torch_type(
+    precision: Optional[LowerPrecision],
+) -> Optional[torch.dtype]:
+    if precision == LowerPrecision.FP16:
+        return torch.float16
+    elif precision == LowerPrecision.FP32:
+        return torch.float
+    elif precision == LowerPrecision.INT8:
+        return torch.int8
+    return None
diff --git a/fx2ait/fx2ait/lower/lower_settings.py b/fx2ait/fx2ait/lower/lower_settings.py
new file mode 100644
index 000000000..75b152dc6
--- /dev/null
+++ b/fx2ait/fx2ait/lower/lower_settings.py
@@ -0,0 +1,68 @@
+import dataclasses as dc
+from enum import Enum
+from typing import Any, List, Optional, Set, Type
+
+import torch
+
+from aitemplate.compiler.public import DynamicProfileStrategy
+from torch import nn
+
+
+class LowerPrecision(Enum):
+    FP32 = "fp32"
+    FP16 = "fp16"
+    INT8 = "int8"
+
+
+@dc.dataclass
+class LowerSettings:
+    """
+    Basic configuration for lowering stack.
+    Args:
+    max_batch_size: The maximium batch_size for the input
+    min_acc_module_size: minimal number of nodes for individual accelerated submodule.
+    workdir: the working directory path.
+    name: the working directory name.
+    dll_name: AITemplate generated .so file name
+    dynamic_profile_strategy:
+        A dynamic profiling strategy, used to filter
+        generated profiles at compile time.
+    precision: The runtime precision setting
+    use_fp16_acc:
+        For LowerPrecision.FP16, use_fp16_acc can be either True or False.
+        use_fp16_acc=True uses fp16 accumulation for gemm ops.
+        use_fp16_acc=False uses fp32 accumulation for gemm ops.
+        Set use_fp16_acc=True for better perf; set use_fp16_acc=False for better accuracy.
+        For LowerPrecision.FP32, use_fp16_acc is invalid.
+    leaf_module_list: The list of modules that acc_tracer will not trace into.
+    output_precision: The AITemplate output precision level.
+    additional_inputs: The additional input to help determine input batch_size dimension range.
+    remote_cache_file_path: Location for AITemplate cache file.
+    save_remote_cache: Whether to save the current cache update to the cache file.
+    dump_ait_dir: Dump AIT module into python code
+    keep_constants: Whether or not to keep the constants in the dumped AIT module
+    load_ait_dir: Reload AIT module from dumped AIT python code instead.
+    """
+
+    max_batch_size: int = 2048
+    min_acc_module_size: int = 10
+    workdir: str = ""
+    name: str = ""
+    dll_name: str = "ait_engine.so"
+    dynamic_profile_strategy: DynamicProfileStrategy = DynamicProfileStrategy.MAX
+    profile_devs: Any = None
+    # If None, infer the dtypes from the sample inputs.
+    precision: Optional[LowerPrecision] = LowerPrecision.FP16
+    use_fp16_acc: bool = True  # only valid for precision == FP16
+    ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None
+    leaf_module_list: Optional[Set[Type[nn.Module]]] = None
+    # If None, infer the dtypes from the sample inputs.
+    output_precision: Optional[LowerPrecision] = LowerPrecision.FP16
+    additional_inputs: Optional[List[torch.Tensor]] = None
+    remote_cache_file_path: Optional[str] = None
+    save_remote_cache: Optional[bool] = None
+    dump_ait_dir: Optional[str] = None
+    keep_constants: Optional[bool] = None
+    load_ait_dir: Optional[str] = None
+    # jit.trace AITModule
+    trace_ait_module: bool = True
diff --git a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
new file mode 100644
index 000000000..f03d9f9f0
--- /dev/null
+++ b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
@@ -0,0 +1,618 @@
+import logging
+import operator
+from typing import Any, NamedTuple
+
+import torch
+import torch.fx
+from fx2ait.acc_tracer import acc_ops
+from torch.fx.experimental.const_fold import split_const_subgraphs
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.shape_prop import TensorMetadata
+
+_LOGGER = logging.getLogger(__name__)
+
+# Create an alias for module input type to avoid littering pyre-ignore for Any
+# throughout the file.
+Input = Any
+
+
+def run_const_fold(traced_mod: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    # Now we do constant folding on traced module.
+    def skip_folding(node: torch.fx.Node):
+        if node.target == torch.ops.aten.sym_size:
+            return True
+
+    const_split_mod = split_const_subgraphs(
+        traced_mod, skip_folding_node_fn=skip_folding
+    )
+    const_split_mod.run_folding()
+    return const_split_mod
+
+
+def nchw2nhwc_pass(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    This pass is a kind of hacky way to support some vision models. The reason is due the fact that the frontend is traced based on channel first while AIT needs channel last.
+    We need to modify
+    1) mean.dim for dim=[-1,-2] changed to [-2,-3]
+    2) dim=1 of mean.dim changed to dim=3
+    3) concat(inputs, dim=1) need to be dim=3
+    """
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target == torch.ops.aten.cat.default:
+            if n.args[1] == 1:
+                new_args = list(n.args)
+                new_args[1] = 3
+        elif n.op == "call_function" and n.target == torch.ops.aten.mean.dim:
+            if n.args[1] == [-1, -2] or [-2, -1]:
+                new_args = list(n.args)
+                new_args[1] = [-2, -3]
+        else:
+            continue
+        n.args = tuple(new_args)
+        modified = True
+
+        modified_list1 = []
+        modified_list2 = []
+        modified_list3 = []
+        for u in n.users:
+            if u.target == torch.ops.aten.sym_size and u.args[1] == 1:
+                modified_list1.append(u)
+            if u.target == torch.ops.aten.sym_size and u.args[1] == 2:
+                modified_list2.append(u)
+            if u.target == torch.ops.aten.sym_size and u.args[1] == 3:
+                modified_list3.append(u)
+
+        for v in modified_list1:
+            new_args = list(v.args)
+            new_args[1] = 3
+            v.args = tuple(new_args)
+
+        for v in modified_list2:
+            new_args = list(v.args)
+            new_args[1] = 1
+            v.args = tuple(new_args)
+
+        for v in modified_list3:
+            new_args = list(v.args)
+            new_args[1] = 2
+            v.args = tuple(new_args)
+
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def replace_inplace_ops(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    Remove this func after functionalization is workable
+    """
+    modified = False
+    map_func = {
+        torch.ops.aten.relu_.default: torch.ops.aten.relu.default,
+        torch.ops.aten.hardtanh_.default: torch.ops.aten.hardtanh.default,
+        torch.ops.aten.add_.Tensor: torch.ops.aten.add.Tensor,
+    }
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in map_func.keys():
+            modified = True
+            node = n
+            with module.graph.inserting_after(node):
+                new_args = node.args
+                new_node = module.graph.create_node(
+                    "call_function",
+                    map_func[node.target],
+                    args=new_args,
+                    kwargs=None,
+                )
+                node.replace_all_uses_with(new_node)
+                module.graph.erase_node(node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def replace_native_layernorm_with_layernorm(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    modified = False
+    for n in module.graph.nodes:
+        if (
+            n.op == "call_function"
+            and n.target == torch.ops.aten.native_layer_norm.default
+        ):
+            for v in n.users:
+                if v.op == "call_function" and v.target == operator.getitem:
+                    if v.args[1] != 0:
+                        raise RuntimeError(
+                            f"Got args[{v.args[1]}]!!\n"
+                            "layernorm can only generate output (args[0]), "
+                            "not mean (args[1]) or std (args[2])!"
+                        )
+                    new_op = torch.ops.aten.layer_norm.default
+                    new_args = (*n.args, True)  # cudnn_enable=True
+                    modified = True
+                else:
+                    continue
+
+                with module.graph.inserting_after(v):
+                    new_node = module.graph.create_node(
+                        "call_function",
+                        new_op,
+                        args=new_args,
+                        kwargs=v.kwargs,
+                    )
+                    v.replace_all_uses_with(new_node)
+
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def replace_transpose_mm_op_with_linear(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target == torch.ops.aten.t.default:
+            to_erase = []
+            for v in n.users:
+                if v.op == "call_function" and v.target == torch.ops.aten.addmm.default:
+                    new_op = torch.ops.aten.linear
+                    bias, inp, _ = list(v.args)
+                    weight = list(n.args)[0]
+                    new_args = (inp, weight, bias)
+                    modified = True
+                elif v.op == "call_function" and v.target == torch.ops.aten.mm.default:
+                    new_op = torch.ops.aten.linear
+                    inp, _ = list(v.args)
+                    weight = list(n.args)[0]
+                    new_args = (inp, weight, None)
+                    modified = True
+                # this pass should be after `compose_bmm`
+                elif v.op == "call_function" and v.target == aten_compose_bmm_2d:
+                    new_op = torch.ops.aten.linear
+                    inp, _ = list(v.args)
+                    weight = list(n.args)[0]
+                    new_args = (inp, weight, None)
+                    modified = True
+                else:
+                    continue
+
+                with module.graph.inserting_after(v):
+                    new_node = module.graph.create_node(
+                        "call_function",
+                        new_op,
+                        args=new_args,
+                        kwargs=v.kwargs,
+                    )
+                    v.replace_all_uses_with(new_node)
+                    to_erase.append(v)
+            for v in to_erase:
+                module.graph.erase_node(v)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def replace_aten_op_with_indices(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (
+            torch.ops.aten.max_pool2d_with_indices.default,
+            torch.ops.aten.max_pool3d_with_indices.default,
+            torch.ops.aten.native_batch_norm.default,
+            torch.ops.aten._native_batch_norm_legit.default,
+        ):
+            modified = True
+            if len(n.users) != 1:
+                raise RuntimeError(
+                    f"{n.target} has users={len(n.users)}. We can only handle it with 1 user"
+                )
+            if n.target == torch.ops.aten.max_pool2d_with_indices.default:
+                new_op = torch.ops.aten.max_pool2d
+                new_args = n.args
+            elif n.target == torch.ops.aten.max_pool3d_with_indices.default:
+                new_op = torch.ops.aten.max_pool3d
+                new_args = n.args
+            elif (
+                n.target == torch.ops.aten.native_batch_norm.default
+                or n.target == torch.ops.aten._native_batch_norm_legit.default
+            ):
+                new_op = torch.ops.aten.batch_norm
+                new_args = list(n.args)
+                new_args.append(False)
+                new_args = tuple(new_args)
+
+            getitem_node = next(iter(n.users))
+            with module.graph.inserting_after(getitem_node):
+                new_node = module.graph.create_node(
+                    "call_function",
+                    new_op,
+                    args=new_args,
+                    kwargs=n.kwargs,
+                )
+                getitem_node.replace_all_uses_with(new_node)
+                module.graph.erase_node(getitem_node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def replace_aten_reshape_alias_with_replace(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    # The stride parameter is not used. Replace with reshape without stride
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (
+            torch.ops.aten._reshape_alias.default,
+        ):
+            modified = True
+            node = n
+            with module.graph.inserting_after(node):
+                new_args = (node.args[0], node.args[1])
+                new_node = module.graph.create_node(
+                    "call_function",
+                    torch.ops.aten.reshape,
+                    args=new_args,
+                    kwargs=None,
+                )
+                node.replace_all_uses_with(new_node)
+                module.graph.erase_node(node)
+            break
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+## Acc tracer pass, but for aten usage
+def acc_replace_reshape_ops(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    Change TensorMetadata to shapeMetadata which only contains shape field.
+    """
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target == acc_ops.reshape:
+            if isinstance(n.kwargs["acc_out_ty"], TensorMetadata):
+
+                class shapeMetadata(NamedTuple):
+                    shape: torch.Size
+
+                node = n
+                with module.graph.inserting_after(node):
+                    new_kargs = {}
+                    new_kargs["input"] = node.kwargs["input"]
+                    new_kargs["acc_out_ty"] = shapeMetadata(
+                        node.kwargs["acc_out_ty"].shape
+                    )
+                    new_node = module.graph.create_node(
+                        "call_function",
+                        acc_ops.reshape,
+                        args=node.args,
+                        kwargs=new_kargs,
+                    )
+                    node.replace_all_uses_with(new_node)
+                    module.graph.erase_node(node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+def remove_ops(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    1. Remove clone, _unsafe_view node. #TODO Remove this func after functionalization is workable
+    2. Remove inefficient op getitem(index=slice) P561572458
+    """
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (torch.ops.aten.clone.default,):
+            modified = True
+            node = n
+            input_n = node.all_input_nodes[0]
+            node.replace_all_uses_with(input_n)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (
+            torch.ops.aten._unsafe_view.default,
+        ):
+            modified = True
+            node = n
+            with module.graph.inserting_after(node):
+                new_node = module.graph.create_node(
+                    "call_function",
+                    torch.ops.aten.reshape,
+                    args=node.args,
+                    kwargs=node.kwargs,
+                )
+                node.replace_all_uses_with(new_node)
+                module.graph.erase_node(node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def aten_operator_getitem(*args):
+    return operator.getitem(*args)
+
+
+def replace_builtin_ops(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    To differential the same op in fx2ait as they are registered in the same dictionary
+    """
+
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (operator.getitem,):
+            modified = True
+            n.target = aten_operator_getitem
+    module.graph.eliminate_dead_code()
+    module.recompile()
+
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+###############
+"""
+Trace compose. For some ops, we do not want to decompose further but want coarse granularity
+For ex:
+1. bmm
+2. chunk
+3. getitem(input, idx=(slice(),slice()...))
+"""
+
+
+def aten_compose_getitem_slice(input, list_args):
+    for _, args in enumerate(list_args):
+        input = torch.ops.aten.slice.Tensor(input, *args)
+    return input
+
+
+def compose_getitem_slice(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    combine decomposed getitem(input, idx=(slice(),slice()...))
+    """
+
+    def match_pattern(module, node):
+        if node.op == "call_function" and node.target == torch.ops.aten.slice.Tensor:
+            holder = []
+            holder.append(node)
+            while (
+                len(node.users.keys()) == 1
+                and next(iter(node.users)).target == torch.ops.aten.slice.Tensor
+                and node.args[1] + 1 == next(iter(node.users)).args[1]
+            ):
+                node = next(iter(node.users))
+                holder.append(node)
+            if len(holder) == 1:
+                return (False,)
+            else:
+                return (True, holder)
+        return (False,)
+
+    modified = False
+    for node in module.graph.nodes:
+        res = match_pattern(module, node)
+        if res[0]:
+            modified = True
+            holder = res[1]
+            input_n = holder[0].args[0]
+            last_n = holder[-1]
+            list_args = []
+            for h_n in holder:
+                list_args.append(h_n.args[1:])
+
+            with module.graph.inserting_after(last_n):
+                new_args = (input_n, list_args)
+                new_node = module.graph.create_node(
+                    "call_function",
+                    aten_compose_getitem_slice,
+                    args=new_args,
+                    kwargs=None,
+                )
+            last_n.replace_all_uses_with(new_node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def aten_compose_bmm_2d(flat_args_1, flat_args_2):
+    sym_size = torch.ops.aten.sym_size(flat_args_1, 0)
+    sym_size_1 = torch.ops.aten.sym_size(flat_args_1, 1)
+    sym_size_2 = torch.ops.aten.sym_size(flat_args_1, 2)
+    expand = torch.ops.aten.expand.default(
+        flat_args_1, [sym_size, sym_size_1, sym_size_2]
+    )
+    view = torch.ops.aten.view.default(expand, [sym_size, sym_size_1, sym_size_2])
+    sym_size_3 = torch.ops.aten.sym_size(flat_args_2, 0)
+    sym_size_4 = torch.ops.aten.sym_size(flat_args_2, 1)
+    expand_1 = torch.ops.aten.expand.default(
+        flat_args_2, [sym_size, sym_size_3, sym_size_4]
+    )
+    view_1 = torch.ops.aten.view.default(expand_1, [sym_size, sym_size_3, sym_size_4])
+    bmm = torch.ops.aten.bmm.default(view, view_1)
+    view_2 = torch.ops.aten.view.default(bmm, [sym_size, sym_size_1, sym_size_4])
+    return view_2
+
+
+def aten_compose_bmm_3d(flat_args_1, flat_args_2):
+    sym_size = torch.ops.aten.sym_size(flat_args_1, 0)
+    sym_size_1 = torch.ops.aten.sym_size(flat_args_1, 1)
+    sym_size_2 = torch.ops.aten.sym_size(flat_args_1, 2)
+    expand = torch.ops.aten.expand.default(
+        flat_args_1, [sym_size, sym_size_1, sym_size_2]
+    )
+    view = torch.ops.aten.view.default(expand, [sym_size, sym_size_1, sym_size_2])
+    sym_size_3 = torch.ops.aten.sym_size(flat_args_2, 1)
+    sym_size_4 = torch.ops.aten.sym_size(flat_args_2, 2)
+    expand_1 = torch.ops.aten.expand.default(
+        flat_args_2, [sym_size, sym_size_3, sym_size_4]
+    )
+    view_1 = torch.ops.aten.view.default(expand_1, [sym_size, sym_size_3, sym_size_4])
+    bmm = torch.ops.aten.bmm.default(view, view_1)
+    view_2 = torch.ops.aten.view.default(bmm, [sym_size, sym_size_1, sym_size_4])
+    return view_2
+
+
+def compose_bmm(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    combine decomposed bmm (matmul)
+    """
+    modified = False
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target in (torch.ops.aten.bmm.default,):
+            modified = True
+            node = n
+            input_n = node.all_input_nodes[0]
+            other_n = node.all_input_nodes[1]
+            output = next(iter(node.users))
+            input_input_n = input_n.all_input_nodes[0]
+            if (
+                input_input_n.target != torch.ops.aten.expand.default
+                and input_n.target != torch.ops.aten.view.default
+            ):
+                raise RuntimeError(
+                    "Bmm is addressed in fixed pattern. A new pattern is met!"
+                )
+            real_input = input_input_n.all_input_nodes[0]
+            input_other_n = other_n.all_input_nodes[0]
+            if (
+                input_other_n.target != torch.ops.aten.expand.default
+                and other_n.target != torch.ops.aten.view.default
+            ):
+                raise RuntimeError(
+                    "Bmm is addressed in fixed pattern. A new pattern is met!"
+                )
+            real_other = input_other_n.all_input_nodes[0]
+            if len(real_other.meta["val"].size()) == 2:
+                new_func = aten_compose_bmm_2d
+            if len(real_other.meta["val"].size()) == 3:
+                new_func = aten_compose_bmm_3d
+
+            with module.graph.inserting_after(node):
+                new_args = (real_input, real_other)
+                new_node = module.graph.create_node(
+                    "call_function",
+                    new_func,
+                    args=new_args,
+                    kwargs=None,
+                )
+            output.replace_all_uses_with(new_node)
+
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def aten_compose_chunk(flat_args_1, chunk, dim):
+    sym_size = torch.ops.aten.sym_size(flat_args_1, dim)
+    add = operator.add(sym_size, chunk)
+    sub = operator.sub(add, 1)
+    floordiv = operator.floordiv(sub, chunk)
+    split = torch.ops.aten.split.Tensor(flat_args_1, floordiv, dim)
+    return split
+
+
+def compose_chunk(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    combine decomposed chunk
+    """
+
+    def match_pattern(module, node):
+        if node.op == "call_function" and node.target in (torch.ops.aten.split.Tensor,):
+            div = node.args[1]
+            input = node.args[0]
+            if isinstance(div, int):
+                return (False,)
+            if div.target != operator.floordiv:
+                return (False,)
+            else:
+                div_const = div.args[1]
+                sub = div.args[0]
+                if sub.target != operator.sub:
+                    return (False,)
+                else:
+                    add = sub.args[0]
+                    if add.target != operator.add:
+                        return (False,)
+                    else:
+                        add_const = add.args[1]
+                        if add_const != div_const:
+                            return (False,)
+                        symsize = add.args[0]
+                        if symsize.target != torch.ops.aten.sym_size:
+                            return (False,)
+                        else:
+                            symsize_input = symsize.args[0]
+                            dim = symsize.args[1]
+                            if symsize_input != input:
+                                return (False,)
+
+            return (True, div_const, dim)
+        else:
+            return (False,)
+
+    modified = False
+    for node in module.graph.nodes:
+        res = match_pattern(module, node)
+        if res[0]:
+            modified = True
+            with module.graph.inserting_after(node):
+                new_args = (node.args[0], res[1], res[2])
+                new_node = module.graph.create_node(
+                    "call_function",
+                    aten_compose_chunk,
+                    args=new_args,
+                    kwargs=None,
+                )
+            node.replace_all_uses_with(new_node)
+
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return PassResult(module, modified)
+
+
+def acc_replace_mul_ops(
+    module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    Put constant at the end of multiplicaiton, i.e change 15*x.size(1) to x.size(1)*15.
+    TODO: we will remove this pass once dynamo fixed the bug
+    """
+    for n in module.graph.nodes:
+        if n.op == "call_function" and n.target == acc_ops.mul:
+            if isinstance(n.kwargs["input"], int):
+                node = n
+                with module.graph.inserting_after(node):
+                    new_kargs = {}
+                    new_kargs["input"] = node.kwargs["other"]
+                    new_kargs["other"] = node.kwargs["input"]
+                    new_node = module.graph.create_node(
+                        "call_function",
+                        acc_ops.mul,
+                        args=node.args,
+                        kwargs=new_kargs,
+                    )
+                    node.replace_all_uses_with(new_node)
+                    module.graph.erase_node(node)
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
new file mode 100644
index 000000000..b9805842a
--- /dev/null
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -0,0 +1,218 @@
+from functools import reduce
+from typing import Any, List
+
+import torch
+from aitemplate.compiler.public import IntImm, IntVar
+
+
+class TensorSpec:
+    def __init__(self, shape: List[IntVar], dtype: torch.dtype) -> None:
+        self.shape = shape
+        self.dtype = dtype
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, TensorSpec):
+            return False
+        if self.dtype != other.dtype:
+            return False
+        if len(self.shape) != len(other.shape):
+            return False
+        for d1, d2 in zip(self.shape, other.shape):
+            if d1 != d2:
+                return False
+        return True
+
+    def __str__(self) -> str:
+        return "TensorSpec[shape=[{}],dtype={}]".format(
+            ",".join([str(d) for d in self.shape]), self.dtype
+        )
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    @classmethod
+    def from_two_input_lists(
+        cls, inputs1: List[torch.Tensor], inputs2: List[torch.Tensor]
+    ) -> List["TensorSpec"]:
+        """
+        This function is useful when we expect multiple dynamic dims.
+
+        The parent graph can receive two sets of inputs:
+        1. with min dynamic dim values,
+        2. with max dynamic dim values.
+
+        After FX splitter logic is applied and lowerable subgraph sample inputs
+        are inferred, we make two assumptions:
+        1. two lists of inferred inputs will differ at dynamic dimensions,
+        2. the difference numbers will be the dynamic ranges, i.e. min and max.
+
+        TODO: The assumptions above are not ideal, and, in theory, we should do
+        symbolic shape propagation using something like SymPy.
+        """
+        if len(inputs1) != len(inputs2):
+            raise ValueError(
+                f"Different number of inputs: {len(inputs1)} vs {len(inputs2)}"
+            )
+
+        result: List[TensorSpec] = []
+
+        for t1, t2 in zip(inputs1, inputs2):
+            if t1.dtype != t2.dtype:
+                raise ValueError(f"Different types: {t1.dtype} vs {t2.dtype}")
+            if len(t1.shape) != len(t2.shape):
+                raise ValueError(
+                    f"Different tensor sizes: {len(t1.shape)} vs {len(t2.shape)}"
+                )
+            shape: List[IntVar] = []
+            for i, (d1, d2) in enumerate(zip(t1.shape, t2.shape)):
+                if d1 == d2:
+                    shape.append(IntImm(d1))
+                else:
+                    shape.append(IntVar([min(d1, d2), max(d1, d2)], f"dynamic_dim_{i}"))
+            result.append(TensorSpec(shape, t1.dtype))
+
+        return result
+
+    @classmethod
+    def gen_int_var_min_max(cls, vmin: int, vmax: int, name: str = None):  # noqa [B902]
+        values = [vmin, vmax]
+        if vmin == vmax:
+            return IntImm(vmin, name=name)
+        elif vmin < vmax:
+            return IntVar([vmin, vmax], name=name)
+        else:
+            raise RuntimeError("Unsupported int var definition: {}".format(values))
+
+    @classmethod
+    def create_spec_from_int_vars(cls, int_vars: List[IntVar], dtype_list: torch.dtype):
+        if len(int_vars) != len(dtype_list):
+            raise ValueError(
+                f"Different number of int_var and dtype_list: {len(int_vars)} vs {len(dtype_list)}"
+            )
+        res = []
+        for int_var, dtype in zip(int_vars, dtype_list):
+            res.append(TensorSpec(int_var, dtype))
+        return res
+
+    @classmethod
+    def create_spec_from_shapes(
+        cls, inputs_min: List[int], inputs_max: List[int], dtype_list: torch.dtype
+    ) -> List["TensorSpec"]:
+        if len(inputs_min) != len(inputs_max):
+            raise ValueError(
+                f"Different number of inputs: {len(inputs_min)} vs {len(inputs_max)}"
+            )
+        res = []
+        for shape1, shape2, dtype in zip(inputs_min, inputs_max, dtype_list):
+            if len(shape1) != len(shape2):
+                raise ValueError(
+                    f"Different number of input dims: {len(shape1)} vs {len(shape2)}"
+                )
+
+            shape: List[IntVar] = []
+            for i, (d1, d2) in enumerate(zip(shape1, shape2)):
+                if d1 == d2:
+                    shape.append(IntImm(d1))
+                else:
+                    shape.append(IntVar([min(d1, d2), max(d1, d2)], f"dynamic_dim_{i}"))
+            res.append(TensorSpec(shape, dtype))
+        return res
+
+    def to_random_tensor(self, use_lower_bound=True):
+        shape = []
+        for s in self.shape:
+            if use_lower_bound:
+                shape.append(s.lower_bound())
+            else:
+                shape.append(s.upper_bound())
+        return torch.randn(shape).to(dtype=self.dtype)
+
+    def to_specific_tensor(self, use_lower_bound, specify_num):
+        shape = []
+        for s in self.shape:
+            if use_lower_bound:
+                shape.append(s.lower_bound())
+            else:
+                shape.append(s.upper_bound())
+        return torch.full(shape, specify_num).to(dtype=self.dtype)
+
+    @classmethod
+    def create_inputs_from_specs(
+        cls, input_specs: List["TensorSpec"], use_lower_bound: bool, specify_num=None
+    ) -> torch.Tensor:
+        result = []
+        for inp in input_specs:
+            if specify_num is None:
+                result.append(inp.to_random_tensor(use_lower_bound).cuda())
+            else:
+                result.append(
+                    inp.to_specific_tensor(use_lower_bound, specify_num).cuda()
+                )
+
+        return result
+
+    @classmethod
+    def from_input_list_with_batch_size(
+        cls, inputs: List[torch.Tensor], max_batch_size: int, batch_dim: int = 0
+    ) -> List["TensorSpec"]:
+        """
+        Most of the recommendation models will work fine using this function.
+
+        We make an assumption that inferred lowerable subgraph inputs will have
+        a single batch dimension with the same max batch size.
+        """
+        result: List[TensorSpec] = []
+
+        bs_dim = cls.find_batch_size_dim(inputs)
+        for index, t in enumerate(inputs):
+            shape: List[IntVar] = []
+            for i, d in enumerate(t.shape):
+                if i == bs_dim[index]:
+                    shape.append(IntVar([1, max_batch_size], "batch_size"))
+                else:
+                    shape.append(IntImm(d))
+            result.append(TensorSpec(shape, t.dtype))
+
+        return result
+
+    @classmethod
+    # pyre-ignore [2]: Parameter `sample_input` must have a type other than `Any`
+    def find_batch_size_dim(cls, inputs: Any) -> []:
+        if isinstance(inputs, torch.Tensor) or len(inputs) <= 1:
+            return [0]
+        shapes = [i.shape for i in inputs]
+        batch_size = list(reduce(lambda i, j: i & j, (set(x) for x in shapes)))
+        if len(batch_size) != 1:
+            # Unable to find unified batch_size value among input tensors, default batch_size dim=0
+            return [0] * len(inputs)
+
+        bs_dim = []
+        for i in inputs:
+            # Default batch size dim = 0
+            dim = 0
+            for index, val in enumerate(i.shape):
+                if val == batch_size[0]:
+                    dim = index
+                    break
+            bs_dim.append(dim)
+        return bs_dim
+
+    @classmethod
+    def from_input_list_with_batch_size_static_batch(
+        cls, inputs: List[torch.Tensor], max_batch_size: int, batch_dim: int = 0
+    ) -> List["TensorSpec"]:
+        """
+        Most of the recommendation models will work fine using this function.
+
+        We make an assumption that inferred lowerable subgraph inputs will have
+        a single batch dimension with the same max batch size.
+        """
+        result: List[TensorSpec] = []
+
+        for t in inputs:
+            shape: List[IntVar] = []
+            for _, d in enumerate(t.shape):
+                shape.append(IntImm(d))
+            result.append(TensorSpec(shape, t.dtype))
+
+        return result
diff --git a/fx2ait/fx2ait/test/TARGETS b/fx2ait/fx2ait/test/TARGETS
new file mode 100644
index 000000000..465522f7d
--- /dev/null
+++ b/fx2ait/fx2ait/test/TARGETS
@@ -0,0 +1,78 @@
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("@fbsource//tools/build_defs:glob_defs.bzl", "glob")
+
+oncall("aitemplate")
+
+[
+    python_unittest(
+        name = test_file.split("/")[-1][:-3],
+        srcs = [
+            test_file,
+        ],
+        env = {
+            "NUM_BUILDERS": "12",
+        },
+        par_style = "xar",
+        tags = [
+            "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"A100\"}",
+            "serialize_test_cases",
+            "supports_remote_execution",
+        ],
+        deps = [
+            "fbsource//third-party/pypi/numpy:numpy",
+            "fbsource//third-party/pypi/parameterized:parameterized",
+            "//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait",
+            "//aitemplate/AITemplate/fx2ait/fx2ait/fb/converters:internal_converters",
+            "//caffe2:test-lib",
+            "//caffe2:torch",
+            "//deeplearning/trt/torch_tensorrt/py/torch_tensorrt:acc_tracer",
+            "//deeplearning/trt/torch_tensorrt/py/torch_tensorrt/fb:internal_passes",
+            "//glow/fb/fx/acc_tracer:acc_tracer",
+        ],
+    )
+    for test_file in glob(
+        [
+            "fb/converters/test*.py",
+            "converters/test*.py",
+            "converters/*/test*.py",
+            "test*.py",
+        ],
+        exclude = [
+            "test_fx2ait.py",
+            "test_ait_lower.py",
+        ],
+    )
+]
+
+[
+    python_unittest(
+        name = test_file.split("/")[-1][:-3],
+        srcs = [
+            test_file,
+        ],
+        env = {
+            "NUM_BUILDERS": "12",
+        },
+        par_style = "xar",
+        tags = [
+            "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"A100\"}",
+            "serialize_test_cases",
+            "supports_remote_execution",
+        ],
+        deps = [
+            "fbsource//third-party/pypi/numpy:numpy",
+            "fbsource//third-party/pypi/parameterized:parameterized",
+            "fbsource//third-party/pypi/transformers:transformers",
+            "//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait",
+            "//aitemplate/AITemplate/fx2ait/fx2ait/fb/converters:internal_converters_aten",
+            "//caffe2:test-lib",
+            "//caffe2:torch",
+            "//caffe2/functorch:functorch",
+        ],
+    )
+    for test_file in glob(
+        [
+            "converters_aten/test*.py",
+        ],
+    )
+]
diff --git a/fx2ait/fx2ait/test/__init__.py b/fx2ait/fx2ait/test/__init__.py
new file mode 100644
index 000000000..c91744130
--- /dev/null
+++ b/fx2ait/fx2ait/test/__init__.py
@@ -0,0 +1,21 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import sys
+
+from . import test_ait_lower, test_fx2ait  # noqa
+
+if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
+    PY3STATEMENT = "The minimal Python requirement is Python 3.7"
+    raise Exception(PY3STATEMENT)
diff --git a/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py b/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
new file mode 100644
index 000000000..e209e5c45
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
@@ -0,0 +1,60 @@
+import torch
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestTransformerModelConverter(AITTestCase):
+    def test_transformer_encoder(self):
+        torch.manual_seed(0)
+
+        class EncoderBlock(torch.nn.Module):
+            def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
+                """
+                Inputs:
+                    input_dim - Dimensionality of the input
+                    num_heads - Number of heads to use in the attention block
+                    dim_feedforward - Dimensionality of the hidden layer in the MLP
+                    dropout - Dropout probability to use in the dropout layers
+                """
+                super().__init__()
+                # Attention layer
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=input_dim,
+                    num_heads=num_heads,
+                    batch_first=True,
+                )
+                # # Two-layer MLP
+                self.linear_net = torch.nn.Sequential(
+                    torch.nn.Linear(input_dim, dim_feedforward),
+                    torch.nn.Dropout(dropout),
+                    torch.nn.ReLU(inplace=True),
+                    torch.nn.Linear(dim_feedforward, input_dim),
+                )
+                # Layers to apply in between the main layers
+                self.norm1 = torch.nn.LayerNorm(input_dim)
+                self.norm2 = torch.nn.LayerNorm(input_dim)
+                self.dropout = torch.nn.Dropout(dropout)
+
+            def forward(self, x):
+                # Attention part
+                attn_out, _ = self.attn(query=x, key=x, value=x)
+                # return attn_out
+                x = x + self.dropout(attn_out)
+                x = self.norm1(x)
+
+                # MLP part
+                linear_out = self.linear_net(x)
+                x = x + self.dropout(linear_out)
+                x = self.norm2(x)
+
+                return x
+
+        model = (
+            EncoderBlock(input_dim=512, num_heads=16, dim_feedforward=12).cuda().half()
+        )
+
+        inputs = [torch.randn(10, 32, 512).half().cuda()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={},
+        )
diff --git a/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
new file mode 100644
index 000000000..8453a97f4
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
@@ -0,0 +1,26 @@
+import torch
+import torchvision
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestVisionModelConverter(AITTestCase):
+    def test_resnet50(self):
+        torch.manual_seed(0)
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = torchvision.models.resnet18()
+
+            def forward(self, x):
+                return self.mod(x)
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(32, 3, 224, 224).half().cuda()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=None,
+        )
diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
new file mode 100644
index 000000000..1cf935379
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -0,0 +1,65 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestMultiHeadAttentionConverter(AITTestCase):
+    def test_multihead_attention_cross_attenytion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, dim, nheads):
+                super().__init__()
+                self.attn = torch.nn.modules.activation.MultiheadAttention(
+                    embed_dim=dim,
+                    num_heads=nheads,
+                    batch_first=True,
+                )
+
+            def forward(self, x):
+                layer_norm = torch.nn.functional.layer_norm(x, (dim,), eps=1e-5)
+                getitem = layer_norm[slice(None, None, None), 0]
+                unsqueeze = torch.unsqueeze(getitem, dim=1)
+
+                return self.attn(query=unsqueeze, key=layer_norm, value=layer_norm)
+
+        seq_len_q, dim, nheads = 4, 16, 2
+        model = TestModule(dim, nheads).half().cuda()
+        input_q = torch.randn(128, seq_len_q, dim).cuda().half()
+        self.run_test(
+            model,
+            [input_q],
+            expected_ops={
+                torch.nn.modules.activation.MultiheadAttention,
+                acc_ops.layer_norm,
+                acc_ops.unsqueeze,
+                acc_ops.getitem,
+            },
+            transformer_mode=True,
+        )
+
+    def test_multihead_attention(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, dim, nheads):
+                super().__init__()
+                self.attn = torch.nn.MultiheadAttention(
+                    embed_dim=dim,
+                    num_heads=nheads,
+                    batch_first=True,
+                )
+
+            def forward(self, x):
+                return self.attn(query=x, key=x, value=x)
+
+        batch_size = 2
+        seqlen = 4
+        dim = 512
+        num_heads = 8
+
+        x = torch.ones(batch_size, seqlen, dim).cuda().half()
+        model = TestModule(dim, num_heads).eval().half().cuda()
+
+        self.run_test(
+            model,
+            [x],
+            expected_ops={torch.nn.MultiheadAttention},
+            transformer_mode=True,
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
new file mode 100644
index 000000000..8c29ddbfb
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env fbpython
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestAdaptiveAvgPool2dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            ((64, 64),),
+            ((128, 128),),
+            (64,),
+        ]
+    )
+    def test_adaptive_avgpool2d(
+        self,
+        output_size,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.pool = torch.nn.AdaptiveAvgPool2d(output_size)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 32, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.adaptive_avg_pool2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
new file mode 100644
index 000000000..298c5f9de
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
@@ -0,0 +1,33 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestAvgPool2dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_avgpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.AvgPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs = [torch.randn(1, 4, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.avg_pool2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
new file mode 100644
index 000000000..b6751d0a8
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
@@ -0,0 +1,26 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestAdaptiveAvgPool2dConverter(AITTestCase):
+    def test_batch_norm(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(3)
+
+            def forward(self, x):
+                return self.bn(x)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 3, 244, 244).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.batch_norm},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
new file mode 100644
index 000000000..54c54b840
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
@@ -0,0 +1,144 @@
+import operator
+from typing import Callable, List, Tuple, Union
+
+import torch
+
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+TWO_TENSOR_INPUTS = [
+    (torch.randn(2, 3, 4), torch.randn(2, 3, 4)),
+    (torch.randn(3, 4), torch.randn(2, 3, 4)),
+    (torch.randn(2, 3, 4), torch.randn(3, 4)),
+    (torch.randn(1, 1, 1), torch.randn(2, 3, 4)),
+    (torch.randn(1), torch.randn(2, 3, 4)),
+    (torch.randn(2, 3, 4), torch.randn(1)),
+    (torch.randn(2, 3, 4), torch.randn(1, 1, 1)),
+    (torch.randn(1, 3, 4), torch.randn(5, 1, 4)),
+]
+
+
+class TestBinaryOpConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [
+                "add",
+                operator.add,
+                acc_ops.add,
+                TWO_TENSOR_INPUTS,
+            ],
+            [
+                "sub",
+                operator.sub,
+                acc_ops.sub,
+                TWO_TENSOR_INPUTS,
+            ],
+            [
+                "mul",
+                operator.mul,
+                acc_ops.mul,
+                TWO_TENSOR_INPUTS,
+            ],
+            # Add .clamp() to avoid division by zero
+            [
+                "div",
+                operator.truediv,
+                acc_ops.div,
+                [(lhs, rhs.clamp(min=0.01)) for lhs, rhs in TWO_TENSOR_INPUTS],
+            ],
+        ]
+    )
+    def test_two_tensors(
+        self,
+        name: str,
+        op: Callable,
+        acc_op: Callable,
+        inputs: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return op(x, y)
+
+        for lhs, rhs in inputs:
+            model = TestModule().cuda()
+            lhs = lhs.half().cuda()
+            rhs = rhs.half().cuda()
+            self.run_test(model, [lhs, rhs], expected_ops={acc_op})
+
+    @parameterized.expand(
+        [
+            param("add_int", 1, operator.add, acc_ops.add),
+            param("add_float", 0.5, operator.add, acc_ops.add),
+            param("mul_int", 1, operator.mul, acc_ops.mul),
+            param("mul_float", 0.5, operator.mul, acc_ops.mul),
+            param("div_int", 1, operator.truediv, acc_ops.div),
+            param("div_float", 0.5, operator.truediv, acc_ops.div),
+        ]
+    )
+    def test_scalar_operand(
+        self, name: str, scalar: Union[int, float], op: Callable, acc_op: Callable
+    ) -> None:
+        class TestModuleScalarLhs(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return op(scalar, x)
+
+        class TestModuleScalarRhs(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return op(x, scalar)
+
+        model_scalar_lhs = TestModuleScalarLhs().cuda()
+        self.run_test(
+            model_scalar_lhs,
+            [torch.randn(2, 3, 4).half().cuda()],
+            expected_ops={acc_op},
+        )
+
+        model_scalar_rhs = TestModuleScalarRhs().cuda()
+        self.run_test(
+            model_scalar_rhs,
+            [torch.randn(2, 3, 4).half().cuda()],
+            expected_ops={acc_op},
+        )
+
+    @parameterized.expand(
+        [
+            param("add", 1, 3, operator.add, acc_ops.add),
+            param("mul", 0.5, 1, operator.mul, acc_ops.mul),
+            param("sub", 1, 0.5, operator.sub, acc_ops.sub),
+            param("div", 0.5, 0.5, operator.truediv, acc_ops.div),
+        ]
+    )
+    def test_constant_operand(
+        self,
+        name: str,
+        x: Union[int, float],
+        y: Union[int, float],
+        op: Callable,
+        acc_op: Callable,
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, input: torch.Tensor) -> torch.Tensor:
+                x = op(input.size()[-1], input.size()[-1])
+                return op(x, input)
+
+        model = TestModule().cuda()
+        self.run_test(
+            model,
+            [torch.randn(2, 4).half().cuda()],
+            expected_ops={acc_op},
+        )
+
+    # This is a common binary op combo usage for ads models.
+    def test_binary_op_combo(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, input: torch.Tensor) -> torch.Tensor:
+                x = input.size()[0] * input.size()[0]
+                return torch.reshape(input, [-1, x])
+
+        model = TestModule().cuda()
+        self.run_test(
+            model,
+            [torch.randn(2, 4).half().cuda()],
+            expected_ops={acc_ops.reshape, acc_ops.mul},
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_chunk.py b/fx2ait/fx2ait/test/converters/test_ait_chunk.py
new file mode 100644
index 000000000..905d0ebf4
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_chunk.py
@@ -0,0 +1,29 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestChunkConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", 2, [3, 10, 2], 1),
+            param("no_dim", 2, [3, 10, 2]),
+            param("neg_dim", 1, [3, 10, 2], -2),
+            param("chunk_bigger_than_dim", 4, [2, 10, 2], 2),
+        ]
+    )
+    def test_chunk(self, name, chunks, shape, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                x = (
+                    torch.chunk(x, chunks=chunks, dim=dim)
+                    if dim is not None
+                    else torch.chunk(x, chunks=chunks)
+                )
+                # For AIT, all chunk results must be used
+                return x[0]
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(shape).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.chunk})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_clamp.py b/fx2ait/fx2ait/test/converters/test_ait_clamp.py
new file mode 100644
index 000000000..b7858a9cd
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_clamp.py
@@ -0,0 +1,30 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestClampConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", min=-1, max=0, use_clamp=True),
+            param("min", min=0.5, use_clamp=False),
+            param("max", max=0.5, use_clamp=True),
+            param("minBiggerThanMax", min=1, max=0, use_clamp=False),
+        ]
+    )
+    def test_clamp(self, name, min=None, max=None, use_clamp=True):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.clamp if use_clamp else torch.clip
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.op(x, min=min, max=max)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.clamp})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_common.py b/fx2ait/fx2ait/test/converters/test_ait_common.py
new file mode 100644
index 000000000..11f65c160
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_common.py
@@ -0,0 +1,368 @@
+from typing import Callable, List, Union
+
+import torch
+from fx2ait.acc_tracer import acc_ops, ait_acc_ops
+
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestUnsqueezeConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            ["default", 1],
+            ["negative_dim", -1],
+        ]
+    )
+    def test_simple(self, name: str, dim: int):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.unsqueeze(x, dim)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.unsqueeze})
+
+    def test_simple_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.unsqueeze(x, 1)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={acc_ops.unsqueeze}
+        )
+
+
+class TestPermuteConverter(AITTestCase):
+    def test_permute021(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.permute(x, [0, 2, 1])
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.permute})
+
+    def test_permute021_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.permute(x, [0, 2, 1])
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={acc_ops.permute}
+        )
+
+
+class TestCatConverter(AITTestCase):
+    combo = [
+        ["default", 0, torch.cat],
+        ["positive_dim", 1, torch.cat],
+        ["negative_dim", -1, torch.cat],
+        ["default", 0, torch.concat],
+        ["positive_dim", 1, torch.concat],
+        ["negative_dim", -1, torch.concat],
+    ]
+
+    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
+    def test_cat(self, name: str, dim: int, op: Callable):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return op([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.cat})
+
+    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
+    def test_cat_dynamic_shape(self, name: str, dim: int, op: Callable):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return op([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+                [2, 3, 4],
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+                [20, 3, 4],
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={acc_ops.cat})
+
+
+class TestReshapeConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[2, 3, 4], [6, 4]],
+            [[2, 3, 4], [2, 12]],
+            [[2, 3, 4], [24]],
+            [[2, 3, 4], [-1, 4]],
+            [[2, 3, 4], [2, -1]],
+            [[2, 3, 4], [-1]],
+        ]
+    )
+    def test_simple(self, original_shape: List[int], final_shape: List[int]) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.reshape(x, final_shape)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*original_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.reshape})
+
+    def test_with_getitem_size(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                d0 = y.size(dim=0)
+                d1 = y.size(dim=1)
+                return x.reshape(d0, d1)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(6, 4).half().cuda(),
+        ]
+        self.run_test(
+            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+        )
+
+    def test_with_getitem_reshape_dim0(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * d2
+                return x.reshape(d0, d)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(
+            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+        )
+
+    def test_with_getitem_reshape_dim0_dynamic(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * d2
+                return x.reshape(d0, d)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
+        )
+
+    ###TODO dim=0,1 dynamic has problem due to output size is not IntVar for dim1(P537903486).
+    # def test_with_getitem_reshape_dim01_dynamic(self) -> None:
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             d0 = x.size(dim=0)
+    #             d1 = x.size(dim=1)
+    #             d2 = x.size(dim=2)
+    #             d = d1 * d2
+    #             return x.reshape(d0, d)
+
+    #     model = TestModule().cuda()
+    #     inputs = [
+    #         [
+    #             torch.randn(2, 3, 4).half().cuda(),
+    #         ],
+    #         [
+    #             torch.randn(20, 30, 4).half().cuda(),
+    #         ],
+    #     ]
+    #     self.run_test_with_dynamic_shape(
+    #         model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+    #     )
+
+
+class TestTopkConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[4], 1],
+            [[6], 3],
+            [[6], 6],
+        ]
+    )
+    def test_simple(self, input: List[int], k: int) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                values, indices = torch.topk(x, k)
+                return indices
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.topk})
+
+    @parameterized.expand(
+        [
+            [[2, 4], 1],
+            [[2, 4], 2],
+            [[3, 3], 3],
+        ]
+    )
+    def test_multi_dimensional(self, input: List[int], k: int) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                values, indices = torch.topk(x, k)
+                return indices
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.topk})
+
+    ##TODO results mismatch.(P537992074)
+    # def test_multi_dimensional_dynamic_shape(self) -> None:
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             values, indices = torch.topk(x, 1)
+    #             return indices
+
+    #     model = TestModule().cuda()
+    #     inputs = [
+    #         [
+    #             torch.randn((2, 4)).half().cuda(),
+    #         ],
+    #         [
+    #             torch.randn((20, 4)).half().cuda(),
+    #         ],
+    #     ]
+    #     self.run_test_with_dynamic_shape(model, inputs, expected_ops={acc_ops.topk})
+
+
+class TestSplitConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[2, 10], [2, 3, 5]],
+            [[2, 10], 2],
+            [[2, 10], 3],
+        ]
+    )
+    def test_with_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, split_size_or_sections, dim=1)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
+    @parameterized.expand(
+        [
+            [[10], [2, 3, 5]],
+            [[10], 2],
+            [[10], 3],
+        ]
+    )
+    def test_without_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, split_size_or_sections)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
+    def test_with_dim_dynamic_shape(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, 2, dim=1)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 10],
+            ],
+            inputs_max=[
+                [20, 10],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={ait_acc_ops.split}
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_contiguous.py b/fx2ait/fx2ait/test/converters/test_ait_contiguous.py
new file mode 100644
index 000000000..aff2acfad
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_contiguous.py
@@ -0,0 +1,18 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestContiguousConverter(AITTestCase):
+    def test_contigupus(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x) -> torch.Tensor:
+                x = x.contiguous()
+                return x + x
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.contiguous})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
new file mode 100644
index 000000000..a1b5603e3
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
@@ -0,0 +1,50 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestConv2dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", 1),
+            param("no_bias", 1, bias=False),
+            param("tuple_parameters", 1, (1, 1), (1, 1)),
+            param("non_zero_padding", 1, padding=1),
+            param("non_unary_params", 3, 2, padding=1, bias=False),
+            param("dilation", 1, dilation=2),
+            param("multi_group", 1, 1, 1, 1, 3, bias=True),
+        ]
+    )
+    def test_conv2d(
+        self,
+        name,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(
+                    3, 36, kernel_size, stride, padding, dilation, groups, bias
+                )
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(1, 3, 224, 224).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.conv2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_expand.py b/fx2ait/fx2ait/test/converters/test_ait_expand.py
new file mode 100644
index 000000000..b1e7509a9
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_expand.py
@@ -0,0 +1,30 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestExpandConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("same_shapes", [1, 2, 3], [1, 2, 3]),
+            param("infer_shapes", [1, 2, 3], [-1, -1, -1]),
+        ]
+    )
+    def test_expand(self, name, orig_shape, target_shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                y = x.expand(target_shape)
+                return y * y
+
+        class TestModuleManyArgs(torch.nn.Module):
+            def forward(self, x):
+                y = x.expand(*target_shape)
+                return y * y
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(orig_shape).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.expand})
+
+        model_many_args = TestModuleManyArgs().cuda().half()
+        self.run_test(model_many_args, inputs, expected_ops={acc_ops.expand})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_flatten.py b/fx2ait/fx2ait/test/converters/test_ait_flatten.py
new file mode 100644
index 000000000..698e42edd
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_flatten.py
@@ -0,0 +1,21 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestFlattenConverter(AITTestCase):
+    @parameterized.expand(
+        [param("default"), param("start", start_dim=1), param("end", end_dim=2)]
+    )
+    def test_clamp(self, name, start_dim=0, end_dim=-1):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.flatten(x, start_dim=start_dim, end_dim=end_dim)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.flatten})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_gelu.py b/fx2ait/fx2ait/test/converters/test_ait_gelu.py
new file mode 100644
index 000000000..8239d873f
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_gelu.py
@@ -0,0 +1,46 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestGeluConverter(AITTestCase):
+    def test_gelu(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.gelu(x)
+
+        inputs = [torch.randn(3, 10, 20).cuda().half()]
+        model = TestModule().cuda().half()
+
+        self.run_test(model, inputs, expected_ops={acc_ops.gelu})
+
+    def test_fast_gelu(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.gelu(x, approximate="tanh")
+
+        inputs = [torch.randn(3, 10, 20).cuda().half()]
+        model = TestModule().cuda().half()
+
+        self.run_test(model, inputs, expected_ops={acc_ops.gelu})
+
+    @parameterized.expand(
+        [
+            ("none"),
+            ("tanh"),
+        ]
+    )
+    def test_gelu_module(self, name):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.gelu = torch.nn.GELU(approximate=name)
+
+            def forward(self, x):
+                return self.gelu(x)
+
+        inputs = [torch.randn(3, 10, 20).cuda().half()]
+        model = TestModule().cuda().half()
+
+        self.run_test(model, inputs, expected_ops={acc_ops.gelu})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
new file mode 100644
index 000000000..99a7028f6
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
@@ -0,0 +1,45 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+from torch import nn
+
+
+class TestLayernormConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("1d_normalized_shape", [10], [2, 10]),
+            # Enable test case once layernorm support expand
+            # param("2d_normalized_shape", [5, 10], [5, 10]),
+        ]
+    )
+    def test_layer_norm(self, name, normalized_shape, input_shape):
+        class TestModule(torch.nn.Module):
+            def __init__(self, normalized_shape):
+                super().__init__()
+                # TODO remove hard code eps once layernorm api expose eps setting
+                self.mod = nn.LayerNorm(normalized_shape, eps=1e-5)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.mod(x)
+
+        model = TestModule(normalized_shape).cuda().half()
+        inputs = [
+            torch.randn(input_shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.layer_norm})
+
+    def test_layer_norm_IntImm_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                shape = x.shape
+                normalized_shape = shape[1:]
+                return torch.nn.functional.layer_norm(x, normalized_shape, eps=1e-5)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn([10, 10]).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.layer_norm})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py b/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
new file mode 100644
index 000000000..f0bda2c40
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
@@ -0,0 +1,18 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestLeakyReluConverter(AITTestCase):
+    def test_leaky_relu(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.leaky_relu(x, negative_slope=0.05)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.leaky_relu})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
new file mode 100644
index 000000000..a8b70d263
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
@@ -0,0 +1,48 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestLinalgConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param(
+                "l2_norm_dim_3",
+                input_shape=[1, 100, 40, 40],
+                ord=2,
+                dim=3,
+                keepdims=False,
+            ),
+            param(
+                "l2_norm_dim_2",
+                input_shape=[1, 100, 40, 40],
+                ord=2,
+                dim=2,
+                keepdims=False,
+            ),
+            param(
+                "l2_norm_dim_1",
+                input_shape=[1, 100, 40, 40],
+                ord=2,
+                dim=1,
+                keepdims=True,
+            ),
+        ]
+    )
+    def test_linalg_norm(
+        self, test_name, input_shape, ord=None, dim=None, keepdims=False
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self, ord, dim, keepdims):
+                super().__init__()
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.linalg.norm(x, ord, dim, keepdims)
+
+        model = TestModule(ord, dim, keepdims).cuda().half()
+        inputs = [
+            torch.randn(input_shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.linalg_norm})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_linear.py b/fx2ait/fx2ait/test/converters/test_ait_linear.py
new file mode 100644
index 000000000..5112283d2
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_linear.py
@@ -0,0 +1,34 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestLinearConverter(AITTestCase):
+    def test_linear(self):
+        M = 2
+        N = 4
+        K = 8
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                w = torch.randn(N, K).half().cuda()
+                b = torch.randn(N).half().cuda()
+                return torch.nn.functional.linear(x, w, b)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(M, K).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.linear})
+
+    def test_linear_no_bias(self):
+        M = 2
+        N = 4
+        K = 8
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                w = torch.randn(N, K).half().cuda()
+                return torch.nn.functional.linear(x, w, bias=None)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(M, K).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.linear})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_matmul.py b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
new file mode 100644
index 000000000..a96bc0eef
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
@@ -0,0 +1,78 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestMatMulConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[2, 3], [3, 4]],
+            [[2, 3, 4], [2, 4, 6]],
+            [[2, 3, 4], [4, 6]],
+            [[3, 4], [5, 4, 6]],
+            [[2, 2, 2, 3, 4], [4, 6]],
+        ]
+    )
+    def test_simple(self, lhs_shape, rhs_shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.matmul(x, y)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*lhs_shape).half().cuda(),
+            torch.randn(*rhs_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.matmul})
+
+    def test_mm(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.mm(x, y)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3).half().cuda(),
+            torch.randn(3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.matmul})
+
+    @parameterized.expand(
+        [
+            [[1, 2, 3], [1, 3, 4]],
+            [[3, 2, 3], [3, 3, 4]],
+        ]
+    )
+    def test_bmm(self, lhs_shape, rhs_shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.bmm(x, y)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*lhs_shape).half().cuda(),
+            torch.randn(*rhs_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.matmul})
+
+    @parameterized.expand(
+        [
+            [[1, 1, 3, 4], [1, 1, 4, 6]],
+            [[1, 2, 3, 4], [1, 2, 4, 6]],
+            [[4, 1, 3, 4], [4, 1, 4, 6]],
+            [[4, 2, 3, 4], [4, 2, 4, 6]],
+        ]
+    )
+    def test_matmul_with_4d_tensors(self, lhs_shape, rhs_shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.matmul(x, y)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*lhs_shape).half().cuda(),
+            torch.randn(*rhs_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.matmul})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
new file mode 100644
index 000000000..4dfc745b3
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
@@ -0,0 +1,33 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestMaxPool2dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_avgpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.MaxPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs = [torch.randn(1, 4, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.max_pool2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_nan2num.py b/fx2ait/fx2ait/test/converters/test_ait_nan2num.py
new file mode 100644
index 000000000..9a2ae5b01
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_nan2num.py
@@ -0,0 +1,28 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestNan2NumConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default"),
+            param("nan", nan=1.0),
+            param("posinf", posinf=1.0),
+            param("neginf", neginf=-1.0),
+        ]
+    )
+    def test_nan_to_num(self, name, nan=None, posinf=None, neginf=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.tensor([float("nan"), float("inf"), -float("inf"), 3.14])
+            .half()
+            .cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.nan_to_num})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_pow.py b/fx2ait/fx2ait/test/converters/test_ait_pow.py
new file mode 100644
index 000000000..148c45bdd
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_pow.py
@@ -0,0 +1,17 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestPowConverter(AITTestCase):
+    @parameterized.expand([("int", 3), ("float", 0.25)])
+    def test_pow(self, _, exp):
+        class Pow(torch.nn.Module):
+            def forward(self, x: torch.Tensor):
+                return torch.pow(x, exp)
+
+        model = Pow().half().cuda()
+        input = [torch.randn(3, 3).half().cuda()]
+        self.run_test(model, input, expected_ops={acc_ops.pow})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_reduce.py b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
new file mode 100644
index 000000000..e9160e9cc
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
@@ -0,0 +1,80 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestSumConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+        ]
+    )
+    def test_sum(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.sum(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.sum})
+
+    def test_sum_no_dim(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return x + torch.sum(x)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.sum})
+
+    @parameterized.expand(
+        [
+            ["default", None, False],
+            ["specified_dims", (0, 1, 2), False],
+        ]
+    )
+    def test_sum_multi_dims(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return y + torch.sum(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(2, 3, 5).half().cuda()] * 2
+        self.run_test(model, inputs, expected_ops={acc_ops.add, acc_ops.sum})
+
+
+class TestMeanConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+        ]
+    )
+    def test_mean(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.mean(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.mean})
+
+    @parameterized.expand(
+        [
+            ["none", None, False],
+            ["specified_dims", (0, 1, 2), False],
+        ]
+    )
+    def test_mean_multi_dims(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return y + torch.mean(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(2, 3, 5).half().cuda() + 1] * 2
+        self.run_test(model, inputs, expected_ops={acc_ops.mean})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py b/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
new file mode 100644
index 000000000..fcbc80600
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
@@ -0,0 +1,15 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from torch import nn
+
+
+class TestSigmoidConverter(AITTestCase):
+    def test_sigmoid(self):
+        class Sigmoid(nn.Module):
+            def forward(self, x):
+                return torch.sigmoid(x)
+
+        model = Sigmoid().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.sigmoid})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
new file mode 100644
index 000000000..32d304e2e
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -0,0 +1,76 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+from torch import nn
+
+
+class TestSliceTensor(AITTestCase):
+    @parameterized.expand(
+        [
+            ("integer_slice", 1),
+            ("slice_batch_dim", slice(None, None, None)),
+            ("slice_basic", (slice(None, None, None), slice(0, 3, 1))),
+            ("slice_full", (slice(None, None, None), slice(0, 10, 1))),
+            ("ellipsis", (slice(None, None, None), ..., slice(0, 3, 1))),
+            (
+                "slice_all_none",
+                (slice(None, None, None), slice(None, None, None)),
+            ),
+            (
+                "slice_start_none",
+                (slice(None, None, None), slice(None, 2, 1)),
+            ),
+            ("slice_end_none", (slice(None, None, None), slice(1, None, 1))),
+            (
+                "slice_step_none",
+                (slice(None, None, None), slice(0, 3, None)),
+            ),
+            ("slice_neg_idx", (slice(None, None, None), -1)),
+            ("slice_neg_slice", (slice(None, None, None), slice(-8, -2, 1))),
+            ("multi_dim", (slice(None, None, None), 0, 1)),
+            (
+                "slice_multi_dim",
+                (slice(None, None, None), slice(0, 3, 1), slice(1, -1, 1)),
+            ),
+            ("none", (slice(None, None, None), None, slice(1, -1, 1), 1)),
+            ("with_squeeze", (slice(None, None, None), 1, slice(1, -1, 1), None)),
+            (
+                "slice_zero_slice",
+                (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
+            ),
+        ]
+    )
+    def test_slice_tensor(self, name, idx):
+        class SliceTensor(nn.Module):
+            def __init__(self, idx):
+                super().__init__()
+                self.idx = idx
+
+            def forward(self, x):
+                y = x + x
+                return y[self.idx]
+
+        mod = SliceTensor(idx).half().cuda()
+        inputs = [torch.randn(2, 10, 10, 10).half().cuda()]
+        self.run_test(mod, inputs, expected_ops={acc_ops.getitem})
+
+    @parameterized.expand([("default", 1), ("neg", -2)])
+    def test_get_item(self, _, idx):
+        class GetItem(nn.Module):
+            def __init__(self, idx):
+                super().__init__()
+                self.idx = idx
+
+            def forward(self, x):
+                shape = x.shape[1:]
+                y = torch.nn.functional.layer_norm(x, shape, eps=1e-5)
+                return y
+
+        mod = GetItem(idx).half().cuda()
+        inputs = [torch.randn(2, 10).half().cuda()]
+        self.run_test(
+            mod,
+            inputs,
+            expected_ops={acc_ops.getitem, acc_ops.size, acc_ops.layer_norm},
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_softmax.py b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
new file mode 100644
index 000000000..8c03e5eb1
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
@@ -0,0 +1,24 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestSoftmaxConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", dim=1),
+            param("neg", dim=-1),
+        ]
+    )
+    def test_softmax(self, name, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.softmax(x, dim=dim)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.softmax})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_squeeze.py b/fx2ait/fx2ait/test/converters/test_ait_squeeze.py
new file mode 100644
index 000000000..b8b3c0183
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_squeeze.py
@@ -0,0 +1,28 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestSqueezeConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", dim=None, shape=[2, 1, 1, 3]),
+            param("1", dim=1, shape=[2, 1, 1, 3]),
+            param("-1", dim=-1, shape=[2, 1, 3, 1]),
+        ]
+    )
+    def test_squeeze(self, name, dim, shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                squeeze = (
+                    torch.squeeze(y, dim=dim) if dim is not None else torch.squeeze(y)
+                )
+                return squeeze
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.squeeze})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_tile.py b/fx2ait/fx2ait/test/converters/test_ait_tile.py
new file mode 100644
index 000000000..8e1f53db4
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_tile.py
@@ -0,0 +1,35 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+from torch import nn
+
+
+class TestTile(AITTestCase):
+    @parameterized.expand(
+        [
+            ("same_num_dims", (2, 2, 3), (1, 2, 2)),
+            (
+                "less_dims",
+                (2, 2, 3),
+                (
+                    1,
+                    2,
+                ),
+            ),
+            ("more_dims", (2, 3), (1, 2, 2, 1)),
+        ]
+    )
+    def test_tile(self, _, input_shape, dims):
+        class Tile(nn.Module):
+            def __init__(self, dims):
+                super().__init__()
+                self.dims = dims
+
+            def forward(self, x):
+                x = x + x  # avoid input shape infer error from AIT
+                return torch.tile(x, self.dims)
+
+        model = Tile(dims).half().cuda()
+        inputs = [torch.randn(*input_shape).half().cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.add, acc_ops.tile})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
new file mode 100644
index 000000000..fb12d24d5
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -0,0 +1,48 @@
+import math
+from typing import Callable
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+unary_ops = [
+    (torch.abs, acc_ops.abs),
+    (torch.sign, acc_ops.sign),
+    (torch.log, acc_ops.log),
+    (torch.relu, acc_ops.relu),
+]
+
+
+class TestUnaryOpsConverter(AITTestCase):
+    @parameterized.expand([(op[0].__name__, op[0], op[1]) for op in unary_ops])
+    def test_unary_ops(self, name, orig_op: Callable, expected_op):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return orig_op(x)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(
+            model, inputs, expected_ops={expected_op} if expected_op is not None else {}
+        )
+
+    def test_sqrt(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, x):
+                super().__init__()
+                self.x = x
+
+            def forward(self, y):
+                return torch.div(y, math.sqrt(self.x))
+
+        model = TestModule(x=64).cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_var.py b/fx2ait/fx2ait/test/converters/test_ait_var.py
new file mode 100644
index 000000000..af4073073
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_var.py
@@ -0,0 +1,46 @@
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestVarConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", dim=0, unbiased=False),
+            param("unbiased", dim=0, unbiased=True),
+            param("neg_dim", dim=-1, unbiased=True),
+            param("keepdim", dim=0, unbiased=True, keepdim=True),
+        ]
+    )
+    def test_var(self, name, dim, unbiased, keepdim=False):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.var(x, dim=dim, unbiased=unbiased, keepdim=keepdim)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.var})
+
+    @parameterized.expand(
+        [
+            param("default", dim=0, unbiased=False),
+            param("unbiased", dim=0, unbiased=True),
+            param("neg_dim", dim=-1, unbiased=True),
+            param("keepdim", dim=0, unbiased=True, keepdim=True),
+        ]
+    )
+    def test_var_call_method(self, name, dim, unbiased, keepdim=False):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return x.var(dim=dim, unbiased=unbiased, keepdim=keepdim)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.var})
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
new file mode 100644
index 000000000..f334fd215
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env fbpython
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestAdaptiveAvgPool2dConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ((64, 64), torch.ops.aten._adaptive_avg_pool2d.default),
+            ((128, 128), torch.ops.aten._adaptive_avg_pool2d.default),
+            (64, torch.ops.aten._adaptive_avg_pool2d.default),
+            (
+                (1, 1),
+                torch.ops.aten.mean.dim,
+            ),
+        ]
+    )
+    def test_adaptive_avgpool2d(self, output_size, op_check):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.pool = torch.nn.AdaptiveAvgPool2d(output_size)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 32, 256, 256).cuda().half()]
+        if op_check == torch.ops.aten.mean.dim:
+            permute_inputs = None
+            permute_outputs = None
+        else:
+            permute_inputs = [0, 2, 3, 1]
+            permute_outputs = [0, 3, 1, 2]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={op_check},
+            permute_inputs=permute_inputs,
+            permute_outputs=permute_outputs,
+        )
+
+    @parameterized.expand(
+        [
+            ((64, 64),),
+            ((128, 128),),
+            (64,),
+        ]
+    )
+    def test_dynamic_adaptive_avgpool2d(
+        self,
+        output_size,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.pool = torch.nn.AdaptiveAvgPool2d(output_size)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule().half().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 32, 256, 256],
+            ],
+            inputs_max=[
+                [10, 32, 256, 256],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten._adaptive_avg_pool2d.default},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
new file mode 100644
index 000000000..4a380de64
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
@@ -0,0 +1,69 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestAvgPool2dConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_avgpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.AvgPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs = [torch.randn(1, 4, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={torch.ops.aten.avg_pool2d.default},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_dynamic_avgpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.AvgPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [1, 4, 256, 256],
+            ],
+            inputs_max=[
+                [10, 4, 256, 256],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.avg_pool2d.default},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
new file mode 100644
index 000000000..10caaded3
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
@@ -0,0 +1,55 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+
+
+class TestAdaptiveAvgPool2dConverter(DispatchTestCase):
+    def test_batch_norm(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(3)
+
+            def forward(self, x):
+                return self.bn(x)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 3, 244, 244).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={torch.ops.aten.batch_norm},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    def test_dynamic_batch_norm(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(3)
+
+            def forward(self, x):
+                return self.bn(x)
+
+        model = TestModule().half().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [1, 3, 244, 244],
+            ],
+            inputs_max=[
+                [10, 3, 256, 256],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.batch_norm},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
new file mode 100644
index 000000000..6580574f9
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
@@ -0,0 +1,115 @@
+import operator
+from typing import Callable, List, Tuple
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+TWO_TENSOR_INPUTS = [
+    (torch.randn(2, 3, 4), torch.randn(2, 3, 4)),
+    (torch.randn(3, 4), torch.randn(2, 3, 4)),
+    (torch.randn(2, 3, 4), torch.randn(3, 4)),
+    (torch.randn(1, 1, 1), torch.randn(2, 3, 4)),
+    (torch.randn(1), torch.randn(2, 3, 4)),
+    (torch.randn(2, 3, 4), torch.randn(1)),
+    (torch.randn(2, 3, 4), torch.randn(1, 1, 1)),
+    (torch.randn(1, 3, 4), torch.randn(5, 1, 4)),
+]
+
+
+class TestATenBinaryOpConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            [
+                "add",
+                operator.add,
+                torch.ops.aten.add.Tensor,
+                TWO_TENSOR_INPUTS,
+            ],
+            [
+                "sub",
+                operator.sub,
+                torch.ops.aten.sub.Tensor,
+                TWO_TENSOR_INPUTS,
+            ],
+            [
+                "mul",
+                operator.mul,
+                torch.ops.aten.mul.Tensor,
+                TWO_TENSOR_INPUTS,
+            ],
+            [
+                "div",
+                operator.truediv,
+                torch.ops.aten.div.Tensor,
+                [(lhs, rhs.clamp(min=0.01)) for lhs, rhs in TWO_TENSOR_INPUTS],
+            ],
+        ]
+    )
+    def test_two_tensors(
+        self,
+        name: str,
+        op: Callable,
+        aten_op: Callable,
+        inputs: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return op(x, y)
+
+        for lhs, rhs in inputs:
+            model = TestModule().cuda()
+            lhs = lhs.half().cuda()
+            rhs = rhs.half().cuda()
+            self.run_test(model, [lhs, rhs], expected_ops={aten_op})
+
+    @parameterized.expand(
+        [
+            [
+                "dynamic_add",
+                operator.add,
+                torch.ops.aten.add.Tensor,
+            ],
+            [
+                "dynamic_sub",
+                operator.sub,
+                torch.ops.aten.sub.Tensor,
+            ],
+            [
+                "dynamic_sub",
+                operator.mul,
+                torch.ops.aten.mul.Tensor,
+            ],
+            [
+                "dynamic_div",
+                operator.truediv,
+                torch.ops.aten.div.Tensor,
+            ],
+        ]
+    )
+    def test_dynamic_two_tensors(
+        self,
+        name: str,
+        op: Callable,
+        aten_op: Callable,
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return op(x, y)
+
+        m = TensorSpec.gen_int_var_min_max(1, 32, "dynamic_m")
+        n = TensorSpec.gen_int_var_min_max(3, 1024, "dynamic_n")
+        k = TensorSpec.gen_int_var_min_max(4, 2048, "dynamic_k")
+        model = TestModule().cuda().half()
+        # AIT can automatically calculate broadcast
+        input_spec = TensorSpec.create_spec_from_int_vars(
+            [[m, n, k], [n, k]], dtype_list=[torch.float16] * 2
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            input_spec,
+            expected_ops={aten_op},
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
new file mode 100644
index 000000000..5b2ba8dad
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
@@ -0,0 +1,68 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestCatConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ["default", 0],
+            ["positive_dim", 1],
+            ["negative_dim", -1],
+        ]
+    )
+    def test_cat(self, name: str, dim: int):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return torch.cat([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.cat.default})
+
+    @parameterized.expand(
+        [
+            ["default", 0],
+            ["positive_dim", 1],
+            ["negative_dim", -1],
+        ]
+    )
+    def test_cat_dynamic_shape(self, name: str, dim: int):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return torch.cat([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+                [2, 3, 4],
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+                [20, 3, 4],
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.cat.default}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
new file mode 100644
index 000000000..f629da19d
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
@@ -0,0 +1,46 @@
+import torch
+from fx2ait.fx2ait import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestChunkConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default", 2, [3, 10, 2], 1),
+            param("no_dim", 2, [3, 10, 2]),
+            param("neg_dim", 1, [3, 10, 2], -2),
+            param("chunk_bigger_than_dim", 4, [2, 10, 2], 2),
+        ]
+    )
+    def test_chunk(self, name, chunks, shape, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                x = (
+                    torch.chunk(x, chunks=chunks, dim=dim)
+                    if dim is not None
+                    else torch.chunk(x, chunks=chunks)
+                )
+                # For AIT, all chunk results must be used
+                return x[0]
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(shape).half().cuda()]
+        self.run_test(model, inputs, expected_ops={})
+
+    def test_chunk_dynamic(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                x = torch.chunk(x, chunks=2, dim=1)
+                return x[0]
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[[20, 10, 8]],
+            inputs_max=[[50, 10, 8]],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={})
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
new file mode 100644
index 000000000..77ac1b973
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
@@ -0,0 +1,66 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestClampConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default", min=-1, max=0, use_clamp=True),
+            param("min", min=0.5, use_clamp=False),
+            param("max", max=0.5, use_clamp=True),
+            param("minBiggerThanMax", min=1, max=0, use_clamp=False),
+        ]
+    )
+    def test_clamp(self, name, min=None, max=None, use_clamp=True):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.clamp if use_clamp else torch.clip
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.op(x, min=min, max=max)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.clamp.default})
+
+    @parameterized.expand(
+        [
+            param("default", min=-1, max=0, use_clamp=True),
+            param("min", min=0.5, use_clamp=False),
+            param("max", max=0.5, use_clamp=True),
+            param("minBiggerThanMax", min=1, max=0, use_clamp=False),
+        ]
+    )
+    def test_dynamic_clamp(self, name, min=None, max=None, use_clamp=True):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.clamp if use_clamp else torch.clip
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.op(x, min=min, max=max)
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 8, 10],
+            ],
+            inputs_max=[
+                [20, 12, 32],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.clamp.default}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
new file mode 100644
index 000000000..2bcd2b241
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
@@ -0,0 +1,113 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from aitemplate.compiler.public import DynamicProfileStrategy
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestConv2dConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default", 1),
+            param("no_bias", 1, bias=False),
+            param("tuple_parameters", 1, (1, 1), (1, 1)),
+            param("non_zero_padding", 1, padding=1),
+            param("non_unary_params", 3, 2, padding=1, bias=False),
+            param("dilation", 1, dilation=2),
+            param("multi_group", 1, 1, 1, 1, 3, bias=True),
+        ]
+    )
+    def test_conv2d(
+        self,
+        name,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(
+                    3, 36, kernel_size, stride, padding, dilation, groups, bias
+                )
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(1, 3, 24, 24).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={
+                torch.ops.aten.convolution.default,
+                torch.ops.aten.relu.default,
+            },
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    @parameterized.expand(
+        [
+            param("default", 1),
+            param("no_bias", 1, bias=False),
+            param("tuple_parameters", 1, (1, 1), (1, 1)),
+            param("non_zero_padding", 1, padding=1),
+            param("non_unary_params", 3, 2, padding=1, bias=False),
+            param("dilation", 1, dilation=2),
+            param("multi_group", 1, 1, 1, 1, 3, bias=True),
+        ]
+    )
+    def test_dynamic_conv2d(
+        self,
+        name,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(
+                    3, 36, kernel_size, stride, padding, dilation, groups, bias
+                )
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+
+        model = TestModule().cuda().half()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 24, 24],
+            ],
+            inputs_max=[
+                [32, 3, 24, 24],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={
+                torch.ops.aten.convolution.default,
+                torch.ops.aten.relu.default,
+            },
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+            # AIT conv2d only support HINTS as dyanmic profiliing strategy.
+            dynamic_profile_strategy=DynamicProfileStrategy.HINTS,
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
new file mode 100644
index 000000000..ce6496fcb
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn as nn
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestFlattenConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ("flatten_middle_dims", 1, 2),
+            ("flatten_last_3_dims", 1, 3),
+            ("flatten_all", 0, 3),
+        ]
+    )
+    def test_flatten(self, _, start_dim, end_dim):
+        class TestModule(nn.Module):
+            def __init__(self, start, end):
+                super().__init__()
+                self.start = start
+                self.end = end
+
+            def forward(self, x):
+                return torch.flatten(x, self.start, self.end)
+
+        model = TestModule(start_dim, end_dim).cuda().half()
+        inputs = (torch.randn(1, 2, 3, 1).half().cuda(),)
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.view.default})
+
+    @parameterized.expand(
+        [
+            ("flatten_middle_dims", 1, 2),
+            ("flatten_last_3_dims", 1, 3),
+        ]
+    )
+    def test_flatten_with_dynamic_shape(self, _, start_dim, end_dim):
+        class TestModule(nn.Module):
+            def __init__(self, start, end):
+                super().__init__()
+                self.start = start
+                self.end = end
+
+            def forward(self, x):
+                return torch.flatten(x, self.start, self.end)
+
+        model = TestModule(start_dim, end_dim).cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [1, 2, 3, 4],
+            ],
+            inputs_max=[
+                [10, 20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.view.default},
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
new file mode 100644
index 000000000..1cbe433b3
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
@@ -0,0 +1,102 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+from torch import nn
+
+
+class TestLayernormConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("1d_normalized_shape", [10], [2, 10]),
+            param("1d_normalized_shape_3d_input", [10], [2, 6, 10]),
+            param("2d_normalized_shape", [6, 10], [2, 6, 10]),
+            # FIXME: Enable test case once layernorm support expand
+            # param("2d_normalized_shape", [5, 10], [5, 10]),
+        ]
+    )
+    def test_layer_norm(self, name, normalized_shape, input_shape):
+        class TestModule(torch.nn.Module):
+            def __init__(self, normalized_shape):
+                super().__init__()
+                # TODO remove hard code eps once layernorm api expose eps setting
+                self.mod = nn.LayerNorm(normalized_shape, eps=1e-5)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.mod(x)
+
+        model = TestModule(normalized_shape).cuda().half()
+        inputs = [
+            torch.randn(input_shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.layer_norm.default})
+
+    def test_layer_norm_IntImm_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                shape = x.shape
+                normalized_shape = shape[1:]
+                return torch.nn.functional.layer_norm(x, normalized_shape, eps=1e-5)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn([10, 10]).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.layer_norm.default})
+
+    @parameterized.expand(
+        [
+            param("1d_normalized_shape", [10], [[2, 10], [12, 10]]),
+            param("1d_normalized_shape_3d_input", [10], [[2, 6, 10], [12, 20, 10]]),
+            param("2d_normalized_shape", [6, 10], [[2, 6, 10], [12, 6, 10]]),
+        ]
+    )
+    def test_dynamic_layer_norm(self, name, normalized_shape, input_shape):
+        class TestModule(torch.nn.Module):
+            def __init__(self, normalized_shape):
+                super().__init__()
+                # TODO remove hard code eps once layernorm api expose eps setting
+                self.mod = nn.LayerNorm(normalized_shape, eps=1e-5)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.mod(x)
+
+        model = TestModule(normalized_shape).cuda().half()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                input_shape[0],
+            ],
+            inputs_max=[
+                input_shape[1],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.layer_norm.default}
+        )
+
+    def test_dynamic_layer_norm_IntImm_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                shape = x.shape
+                normalized_shape = shape[1:]
+                return torch.nn.functional.layer_norm(x, normalized_shape, eps=1e-5)
+
+        model = TestModule().cuda().half()
+        inputs = TensorSpec.create_spec_from_shapes(
+            inputs_min=[[10, 30]],
+            inputs_max=[[20, 30]],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs, expected_ops={torch.ops.aten.layer_norm.default}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
new file mode 100644
index 000000000..0754a1d61
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
@@ -0,0 +1,76 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestLinearConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ("default", [1, 512], True),
+            ("matrix", [5, 512], True),
+            ("no_bias", [1, 512], False),
+            (
+                "multi_dim_matrix",
+                [4, 5, 512],
+                True,
+            ),
+            (
+                "multi_dim_matrix",
+                [4, 5, 512],
+                False,
+            ),
+        ]
+    )
+    def test_linear(self, test_name, shape, bias):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(512, 256, bias)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(shape).half().cuda()]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.linear})
+
+    @parameterized.expand(
+        [
+            ("default", [[1, 5], [512, 512]], True),
+            ("no_bias", [[1, 4], [512, 512]], False),
+            (
+                "multi_dim_matrix",
+                [[2, 4], [512, 512]],
+                True,
+            ),
+            (
+                "multi_dim_matrix_no_bias",
+                [[2, 4], [512, 512]],
+                False,
+            ),
+        ]
+    )
+    def test_dynamic_linear(self, test_name, shape, bias):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(512, 256, bias)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        model = TestModule().cuda().half()
+
+        input_shape = []
+        for i, s in enumerate(shape):
+            input_shape.append(
+                TensorSpec.gen_int_var_min_max(s[0], s[1], "dynamic" + str(i))
+            )
+        input_spec = TensorSpec.create_spec_from_int_vars(
+            [input_shape], dtype_list=[torch.float16]
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, input_spec, expected_ops={torch.ops.aten.linear}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
new file mode 100644
index 000000000..74cfe4566
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
@@ -0,0 +1,74 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+
+from parameterized import parameterized
+
+
+class TestMatMulConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            [[2, 3], [3, 4], torch.ops.aten.mm.default],
+            [[2, 3, 4], [4, 6], torch.ops.aten.mm.default],
+            [[2, 3, 4], [2, 4, 6], torch.ops.aten.bmm.default],
+            [[2, 2, 2, 3, 4], [4, 6], torch.ops.aten.mm.default],
+        ]
+    )
+    def test_simple(self, lhs_shape, rhs_shape, op):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.matmul(x, y)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*lhs_shape).half().cuda(),
+            torch.randn(*rhs_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={op})
+
+    def test_mm(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.mm(x, y)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(2, 3).half().cuda(),
+            torch.randn(3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.mm.default})
+
+    @parameterized.expand(
+        [
+            # Only M can be dynamic: https://github.com/fairinternal/AITemplate/blob/main/tests/unittest/ops/test_gemm.py
+            [[[2, 3], [3, 3], [6, 6]], torch.ops.aten.mm.default],
+            [[[2, 3], [2, 3], [3, 3], [6, 6]], torch.ops.aten.mm.default],
+            [[[1, 3], [2, 3], [6, 8], [3, 3], [6, 6]], torch.ops.aten.mm.default],
+            # FIXME: batch_size cannot be dynamic because the permutation of shape change the names: P544607056
+            # b, m, k, n
+            [[[2, 2], [6, 8], [3, 3], [6, 6]], torch.ops.aten.bmm.default, True],
+        ]
+    )
+    def test_dynamic(self, shape, op, bmm=False):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.matmul(x, y)
+
+        model = TestModule().cuda()
+
+        input0_shape = []
+        for i, s in enumerate(shape):
+            if i == len(shape) - 1:
+                break
+            input0_shape.append(
+                TensorSpec.gen_int_var_min_max(s[0], s[1], "dynamic" + str(i))
+            )
+        input1_shape = [input0_shape[-1]] + [
+            TensorSpec.gen_int_var_min_max(shape[-1][0], shape[-1][1], "dynamic_last")
+        ]
+        if bmm:
+            input1_shape = [input0_shape[0]] + input1_shape
+        input_spec = TensorSpec.create_spec_from_int_vars(
+            [input0_shape, input1_shape], dtype_list=[torch.float16] * 2
+        )
+        self.run_test_with_dynamic_shape(model, input_spec, expected_ops={op})
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
new file mode 100644
index 000000000..ba9861798
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
@@ -0,0 +1,69 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestMaxPool2dConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_maxpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.MaxPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs = [torch.randn(1, 4, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={torch.ops.aten.max_pool2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((2, 2), 2, 1),
+            ((4, 4), (4, 4), 0),
+        ]
+    )
+    def test_dynamic_maxpool2d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.MaxPool2d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [1, 4, 224, 224],
+            ],
+            inputs_max=[
+                [10, 4, 256, 256],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.max_pool2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
new file mode 100644
index 000000000..baa9cc384
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
@@ -0,0 +1,93 @@
+import torch
+import torchvision
+from fx2ait.passes.lower_basic_pass_aten import nchw2nhwc_pass, replace_inplace_ops
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+
+
+class TestModelConverter(DispatchTestCase):
+    def test_resnet50(self):
+        torch.manual_seed(0)
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = torchvision.models.resnet18()
+
+            def forward(self, x):
+                return self.mod(x)
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(32, 3, 224, 224).half().cuda()]
+        customized_passes = [
+            replace_inplace_ops,
+            nchw2nhwc_pass,
+        ]
+
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=None,
+            customized_passes=customized_passes,
+        )
+
+    def test_densenet(self):
+        torch.manual_seed(0)
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = torchvision.models.densenet121(pretrained=True)
+
+            def forward(self, x):
+                return self.mod(x)
+
+        inputs = [torch.randn(1, 3, 224, 224).cuda().half()]
+        model = TestModule().cuda().half()
+        self.run_test(
+            model,
+            inputs,
+            atol=0.18,
+            expected_ops={},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=None,
+            customized_passes=[
+                replace_inplace_ops,
+                nchw2nhwc_pass,
+            ],
+        )
+
+    # def test_hf_albert_base(self):
+    #     # config = AutoConfig.from_pretrained("albert-base-v2")
+    #     # config = AutoConfig.from_pretrained("gpt2")
+    #     # config = BertConfig()
+    #     config = AutoConfig.from_pretrained("allenai/longformer-base-4096")
+    #     max_length = 128
+    #     batch_size = 32
+    #     device = "cuda"
+
+    #     class TestModule(torch.nn.Module):
+    #         def __init__(self):
+    #             super().__init__()
+    #             self.mod = AutoModelForMaskedLM.from_config(config).to(device)
+
+    #         def forward(self, x):
+    #             return self.mod(x).logits
+
+    #     model = TestModule().cuda().half()
+    #     input_ids = torch.randint(0, config.vocab_size, (batch_size, max_length)).to(
+    #         device
+    #     )
+    #     inputs = [input_ids]
+    #     self.run_test(
+    #         model,
+    #         inputs,
+    #         expected_ops={},
+    #         # permute_inputs=[0, 2, 3, 1],
+    #         # permute_outputs=None,
+    #         customized_passes=[
+    #             replace_inplace_ops,
+    #             nchw2nhwc_pass,
+    #         ],
+    #     )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
new file mode 100644
index 000000000..74633505c
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
@@ -0,0 +1,75 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestCatConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default"),
+            param("nan", nan=1.0),
+            param("posinf", posinf=1.0),
+            param("neginf", neginf=-1.0),
+        ]
+    )
+    def test_nan_to_num(self, name, nan=None, posinf=None, neginf=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.tensor([float("nan"), float("inf"), -float("inf"), 3.14])
+            .half()
+            .cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.nan_to_num.default})
+
+    @parameterized.expand(
+        [
+            param("default"),
+            param("nan", nan=1.0),
+            param("posinf", posinf=1.0),
+            param("neginf", neginf=-1.0),
+        ]
+    )
+    def test_dynamic_nan_to_num(self, name, nan=None, posinf=None, neginf=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf)
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [3, 8, 10],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.nan_to_num.default},
+            specify_num=float("nan"),
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.nan_to_num.default},
+            specify_num=float("inf"),
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.nan_to_num.default},
+            specify_num=-float("inf"),
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
new file mode 100644
index 000000000..2c20c4105
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
@@ -0,0 +1,64 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestPermuteConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param((128, 512), (1, 0)),
+            param((80, 300, 2), (0, 2, 1)),
+            param((80, 300, 2), (1, 0, 2)),
+            param((80, 300, 2), (2, 1, 0)),
+            param((5, 113, 15, 31), (0, 2, 1, 3)),
+            param((2, 3, 4, 5), (3, 2, 1, 0)),
+            param((3, 5, 128, 514), (2, 3, 0, 1)),
+            param((32, 12, 4096, 64), (0, 2, 1, 3)),
+            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4)),
+        ]
+    )
+    def test_permute(self, shape, dims):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.permute(x, dims)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.permute.default})
+
+    @parameterized.expand(
+        [
+            param((128, 500), (256, 512), (1, 0)),
+            param((80, 300, 2), (98, 512, 20), (0, 2, 1)),
+            param((80, 300, 2), (98, 512, 20), (1, 0, 2)),
+            param((80, 300, 2), (98, 512, 20), (2, 1, 0)),
+            param((3, 5, 128, 512), (6, 10, 256, 520), (0, 2, 1, 3)),
+            param((3, 5, 128, 512), (6, 10, 256, 520), (3, 2, 1, 0)),
+            param((3, 5, 128, 512), (6, 10, 256, 520), (2, 3, 0, 1)),
+            param((3, 1, 113, 15, 64), (6, 10, 128, 16, 128), (2, 0, 3, 1, 4)),
+        ]
+    )
+    def test_permute_dynamic_shape(self, input_min, input_max, dims):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.permute(x, dims)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                input_min,
+            ],
+            inputs_max=[
+                input_max,
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.permute.default}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
new file mode 100644
index 000000000..c716cf1f8
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
@@ -0,0 +1,59 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestPowConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("exp", size=[10], exp=5),
+            param("3d_exp", size=[2, 5, 32], exp=5),
+            param("4d_float_exp", size=[2, 5, 32, 128], exp=2.2),
+        ]
+    )
+    def test_pow(self, name, size, exp):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.pow(x, exponent=exp)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn([2, 5, 32, 128]).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.pow.Tensor_Scalar})
+
+    @parameterized.expand(
+        [
+            param("exp", inputs_min=[10], inputs_max=[15], exp=5),
+            param("3d_exp", inputs_min=[2, 5, 32], inputs_max=[3, 7, 64], exp=5),
+            param(
+                "4d_float_exp",
+                inputs_min=[2, 5, 32, 128],
+                inputs_max=[20, 7, 35, 140],
+                exp=2.2,
+            ),
+        ]
+    )
+    def test_dynamic_pow(self, name, inputs_min, inputs_max, exp):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.pow(x, exponent=exp)
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [3, 8, 10],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.pow.Tensor_Scalar}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
new file mode 100644
index 000000000..e1fbf9cfb
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
@@ -0,0 +1,117 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+class TestSumConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+            ["keepdim_2d", (0, 1), True],
+            ["nokeepdim_2d", (0, 1), False],
+            ["negative_2d", (-1, -2), False],
+            ["keepdim_3d", (0, 1, 2), True],
+        ]
+    )
+    def test_sum(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.sum(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.sum.dim_IntList})
+
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+            ["keepdim_2d", (0, 1), True],
+            ["nokeepdim_2d", (0, 1), False],
+            ["negative_2d", (-1, -2), False],
+            ["keepdim_3d", (0, 1, 2), True],
+        ]
+    )
+    def test_dynamic_sum(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.sum(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        # The last dim has to be static to pre-compute vector_length:
+        # https://fburl.com/code/1x07doen
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 6, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.sum.dim_IntList}
+        )
+
+
+class TestMeanConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+            ["keepdim_2d", (0, 1), True],
+            ["nokeepdim_2d", (0, 1), False],
+            ["negative_2d", (-1, -2), False],
+            ["keepdim_3d", (0, 1, 2), True],
+        ]
+    )
+    def test_mean(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.mean(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        inputs = [torch.randn(1, 2, 3).half().cuda()]
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.mean.dim})
+
+    @parameterized.expand(
+        [
+            ["default", (1), False],
+            ["keepdim", (1), True],
+            ["negative_dim", (-1), False],
+            ["keepdim_2d", (0, 1), True],
+            ["nokeepdim_2d", (0, 1), False],
+            ["negative_2d", (-1, -2), False],
+            ["keepdim_3d", (0, 1, 2), True],
+        ]
+    )
+    def test_dynamic_mean(self, test_name, dim, keepdim):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.mean(x, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 6, 8],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.mean.dim}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
new file mode 100644
index 000000000..9ee3a0522
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
@@ -0,0 +1,49 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.op = torch.relu
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.op(x)
+
+
+class TestATenReluConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("small", size=(2, 3)),
+            param("large", size=(1024, 4096, 8)),
+        ]
+    )
+    def test_relu(self, name, size):
+
+        model = TestModule().cuda().half()
+        inputs = (torch.randn(size).half().cuda(),)
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.relu.default})
+
+    def test_relu_with_dynamic_shape(self):
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [1, 3, 4],
+            ],
+            inputs_max=[
+                [32, 1024, 2048],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.relu.default},
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
new file mode 100644
index 000000000..0ae0d8289
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
@@ -0,0 +1,110 @@
+import unittest
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+
+
+class TestATenReshapeConverter(DispatchTestCase):
+    def test_reshape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.reshape(x, (2, 12))
+
+        size = (2, 3, 4)
+        model = TestModule().cuda().half()
+        inputs = (torch.randn(size).half().cuda(),)
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.view.default})
+
+    def test_reshape_size(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                dim1_y = y.shape[0]
+                return torch.reshape(x, (dim1_y, -1, 128))
+
+        model = TestModule().cuda().half()
+        inputs = (
+            torch.randn(2, 10, 128).half().cuda(),
+            torch.randn(2, 10, 128).half().cuda(),
+        )
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.view.default})
+
+    def test_reshape_with_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.reshape(x, (x.size(0), x.size(1) * x.size(2)))
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [10, 30, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.view.default},
+        )
+
+    def test_reshape_neg_with_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.reshape(x, (x.size(0), -1))
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [10, 30, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.view.default},
+        )
+
+    # TODO: trigger assertion in AIT: AssertionError: When there is no unknown index, we expect dim products to be equal, got current shape numel=2560 != new shape prod=256
+    @unittest.skip
+    def test_reshape_size_with_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                dim1_y = y.shape[0]
+                return torch.reshape(x, (dim1_y, -1, 128))
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 10, 128],
+                [2, 10, 128],
+            ],
+            inputs_max=[
+                [20, 10, 128],
+                [20, 10, 128],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={torch.ops.aten.view.default},
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
new file mode 100644
index 000000000..561caf381
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
@@ -0,0 +1,55 @@
+import torch
+
+# from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+
+
+class TestATenSizeConverter(DispatchTestCase):
+    def test_size(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                t = x.size()
+                return y.reshape(t)
+
+        xsize = (2, 3, 4)
+        ysize = (2, 12)
+        model = TestModule().cuda().half()
+        inputs = (torch.randn(xsize).half().cuda(), torch.randn(ysize).half().cuda())
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.sym_size})
+
+    ## AIT not support now
+    # def test_size_dim(self):
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             return x.size(1)
+
+    #     size = (2, 3, 4)
+    #     model = TestModule().cuda().half()
+    #     inputs = (torch.randn(size).half().cuda(),)
+
+    #     self.run_test(model, inputs, expected_ops={torch.ops.aten.sym_size})
+
+    # def test_size_dim_with_dynamic_shape(self):
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             return x.size(1)
+
+    #     model = TestModule().cuda().half()
+    #     inputs_spec = TensorSpec.create_spec_from_shapes(
+    #         inputs_min=[
+    #             [2, 3, 4],
+    #         ],
+    #         inputs_max=[
+    #             [10, 30, 4],
+    #         ],
+    #         dtype_list=[
+    #             torch.float16,
+    #         ],
+    #     )
+
+    #     self.run_test_with_dynamic_shape(
+    #         model,
+    #         inputs_spec,
+    #         expected_ops={torch.ops.aten.sym_size},
+    #     )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
new file mode 100644
index 000000000..f7d28e7eb
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
@@ -0,0 +1,208 @@
+import torch
+from fx2ait.passes.lower_basic_pass_aten import (
+    aten_compose_getitem_slice,
+    compose_getitem_slice,
+)
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+from torch import nn
+
+
+class TestSliceTensor(DispatchTestCase):
+    @parameterized.expand(
+        [
+            (
+                "integer_slice",
+                1,
+                {
+                    torch.ops.aten.select.int,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_batch_dim",
+                slice(None, None, None),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_basic",
+                (slice(None, None, None), slice(0, 3, 1)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_full",
+                (slice(None, None, None), slice(0, 10, 1)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            ## Trace problem in support of ellipsis
+            # (
+            #     "ellipsis",  # It seems there is some problem in tracing ellipsis: P539875442
+            #     (slice(None, None, None), ..., slice(0, 3, 1)),
+            #     {
+            #         torch.ops.aten.add.Tensor,
+            #     },
+            # ),
+            (
+                "slice_all_none",
+                (
+                    slice(None, None, None),
+                    slice(None, None, None),
+                ),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_start_none",
+                (
+                    slice(None, None, None),
+                    slice(None, 2, 1),
+                ),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_end_none",
+                (slice(None, None, None), slice(1, None, 1)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_step_none",
+                (
+                    slice(None, None, None),
+                    slice(0, 3, None),
+                ),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_neg_idx",
+                (slice(None, None, None), -1),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_neg_slice",
+                (slice(None, None, None), slice(-8, -2, 1)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "multi_dim",
+                (slice(None, None, None), 0, 1),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_multi_dim",
+                (slice(None, None, None), slice(0, 3, 1), slice(1, -1, 1)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "none",
+                (slice(None, None, None), None, slice(1, -1, 1), 1),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "with_squeeze",
+                (slice(None, None, None), 1, slice(1, -1, 1), None),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_zero_slice",
+                (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
+                {
+                    torch.ops.aten.slice.Tensor,
+                    torch.ops.aten.add.Tensor,
+                },
+                None,
+            ),
+            (
+                "slice_basic_compose",
+                (slice(None, None, None), slice(None, None, None), slice(0, 3, 1)),
+                {
+                    torch.ops.aten.add.Tensor,
+                    aten_compose_getitem_slice,
+                },
+                [
+                    compose_getitem_slice,
+                ],
+            ),
+            (
+                "slice_zero_slice_compose",
+                (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
+                {
+                    torch.ops.aten.add.Tensor,
+                    aten_compose_getitem_slice,
+                },
+                [
+                    compose_getitem_slice,
+                ],
+            ),
+        ]
+    )
+    def test_slice_tensor(self, name, idx, expected_ops, customized_passes):
+        class SliceTensor(nn.Module):
+            def __init__(self, idx):
+                super().__init__()
+                self.idx = idx
+
+            def forward(self, x):
+                y = x + x
+                return y[self.idx]
+
+        mod = SliceTensor(idx).half().cuda()
+
+        inputs = [torch.randn(2, 10, 10, 10).half().cuda()]
+        self.run_test(
+            mod,
+            inputs,
+            expected_ops=expected_ops,
+            customized_passes=customized_passes,
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
new file mode 100644
index 000000000..9268597db
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
@@ -0,0 +1,68 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestSplitConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param(
+                "dim1",
+                dim=1,
+                split_size=3,
+                expected_ops={torch.ops.aten.split.Tensor},
+            ),
+            param(
+                "dim0",
+                dim=0,
+                split_size=3,
+                expected_ops={torch.ops.aten.split.Tensor},
+            ),
+        ]
+    )
+    def test_split(self, name, dim, split_size, expected_ops):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                res = torch.split(y, split_size, dim)
+                return res[0]
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(20, 10, 8).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops=expected_ops)
+
+    def test_split_dynamic(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                res = torch.split(y, 4, 1)
+                return res[0]
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[[20, 10, 8]],
+            inputs_max=[[50, 10, 8]],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.split.Tensor}
+        )
+
+    # TODO low priority. May need to support it in future.
+    # def test_split_imm(self):
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, y: torch.Tensor) -> torch.Tensor:
+    #             dim1 = y.size(1)
+    #             split_size = dim1 // 2
+    #             return torch.split(y, split_size, 1)
+
+    #     model = TestModule().cuda().half()
+    #     inputs = [
+    #         torch.randn(2, 10, 20).half().cuda(),
+    #     ]
+    #     self.run_test(model, inputs, expected_ops={torch.ops.aten.split.Tensor})
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
new file mode 100644
index 000000000..7822fb74b
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
@@ -0,0 +1,151 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestSqueezeConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param(
+                "default",
+                dim=None,
+                shape=[2, 1, 1, 3],
+                expected_ops={torch.ops.aten.squeeze.default},
+            ),
+            param(
+                "1",
+                dim=1,
+                shape=[2, 1, 1, 3],
+                expected_ops={torch.ops.aten.squeeze.dim},
+            ),
+            param(
+                "-1",
+                dim=-1,
+                shape=[2, 1, 3, 1],
+                expected_ops={torch.ops.aten.squeeze.dim},
+            ),
+        ]
+    )
+    def test_squeeze(self, name, dim, shape, expected_ops):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                squeeze = (
+                    torch.squeeze(y, dim=dim) if dim is not None else torch.squeeze(y)
+                )
+                return squeeze
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops=expected_ops)
+
+    @parameterized.expand(
+        [
+            param(
+                "default",
+                dim=None,
+                shape1=[[2, 1, 3, 1]],
+                shape2=[[4, 1, 10, 1]],
+                expected_ops={torch.ops.aten.squeeze.default},
+            ),
+            param(
+                "1",
+                dim=1,
+                shape1=[[2, 1, 3, 1]],
+                shape2=[[4, 1, 10, 1]],
+                expected_ops={torch.ops.aten.squeeze.dim},
+            ),
+            param(
+                "-1",
+                dim=-1,
+                shape1=[[2, 1, 3, 1]],
+                shape2=[[4, 1, 10, 1]],
+                expected_ops={torch.ops.aten.squeeze.dim},
+            ),
+        ]
+    )
+    def test_dynamic_squeeze(self, name, dim, shape1, shape2, expected_ops):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                squeeze = (
+                    torch.squeeze(y, dim=dim) if dim is not None else torch.squeeze(y)
+                )
+                return squeeze
+
+        model = TestModule().cuda().half()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=shape1,
+            inputs_max=shape2,
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops=expected_ops)
+
+
+class TestUnSqueezeConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("1", dim=1, shape=[2, 1, 1, 3]),
+            param("-1", dim=-1, shape=[2, 1, 3, 1]),
+        ]
+    )
+    def test_unsqueeze(self, name, dim, shape):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                unsqueeze = (
+                    torch.unsqueeze(y, dim=dim)
+                    if dim is not None
+                    else torch.unsqueeze(y)
+                )
+                return unsqueeze
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(shape).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.unsqueeze.default})
+
+    @parameterized.expand(
+        [
+            param(
+                "1",
+                dim=1,
+                shape1=[[2, 1, 3, 1]],
+                shape2=[[4, 1, 10, 1]],
+            ),
+            param(
+                "-1",
+                dim=-1,
+                shape1=[[2, 1, 3, 1]],
+                shape2=[[4, 1, 10, 1]],
+            ),
+        ]
+    )
+    def test_dynamic_squeeze(self, name, dim, shape1, shape2):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                unsqueeze = (
+                    torch.unsqueeze(y, dim=dim)
+                    if dim is not None
+                    else torch.unsqueeze(y)
+                )
+                return unsqueeze
+
+        model = TestModule().cuda().half()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=shape1,
+            inputs_max=shape2,
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.unsqueeze.default}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
new file mode 100644
index 000000000..25f90c94f
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
@@ -0,0 +1,53 @@
+from typing import Callable
+
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import parameterized
+
+
+unary_ops = [
+    (torch.abs, torch.ops.aten.abs.default),
+    (torch.log, torch.ops.aten.log.default),
+    (torch.sigmoid, torch.ops.aten.sigmoid.default),
+    (torch.sign, torch.ops.aten.sign.default),
+    (torch.tanh, torch.ops.aten.tanh.default),
+]
+
+
+class TestUnaryOpsConverter(DispatchTestCase):
+    @parameterized.expand([(op[1].__name__, op[0], op[1]) for op in unary_ops])
+    def test_unary_ops(self, name, orig_op: Callable, expected_op):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return orig_op(x)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={expected_op})
+
+    @parameterized.expand([(op[1].__name__, op[0], op[1]) for op in unary_ops])
+    def test_dynamic_unary_ops(self, name, orig_op: Callable, expected_op):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return orig_op(x)
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 8, 10],
+            ],
+            inputs_max=[
+                [20, 12, 32],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={expected_op})
diff --git a/fx2ait/fx2ait/test/test_ait_lower.py b/fx2ait/fx2ait/test/test_ait_lower.py
new file mode 100644
index 000000000..a78bad897
--- /dev/null
+++ b/fx2ait/fx2ait/test/test_ait_lower.py
@@ -0,0 +1,77 @@
+import unittest
+
+import torch
+from fx2ait.lower.lower import AitLowerer
+from fx2ait.lower.lower_settings import LowerSettings
+
+
+@torch.fx.wrap
+def get_length(input: torch.Tensor) -> int:
+    return len(input)
+
+
+@torch.fx.wrap
+def unsupported_op(x):
+    return x + x
+
+
+class TestFx2aitLowerTests(unittest.TestCase):
+    def test_fx2ait_lower(self):
+        class TestMod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                values = torch.sigmoid(x)
+                return get_length(values)
+
+        mod = TestMod().half().cuda()
+        input = [torch.randn(2, 3).half().cuda()]
+        ref_output = mod(*input)
+        lower = AitLowerer.create(
+            LowerSettings(workdir="/tmp", name="test_ait_lower", min_acc_module_size=0)
+        )
+        lowered = lower(mod, input)
+        lower_output = lowered(*input)
+        self.assertTrue(len(lowered._modules.keys()), 2)
+        torch.testing.assert_close(ref_output, lower_output, check_dtype=False)
+
+        # Verify that the resulting module is scriptable and
+        # the scripted module is working properly with dynamic batch input
+        # TODO: Enable script test after python release include fix:
+        # https://github.com/pytorch/pytorch/pull/87804
+        # scripted = torch.jit.script(lowered)
+        # input2 = [torch.randn(16, 3).half().cuda()]
+        # ref_output2 = mod(*input2)
+        # torch.testing.assert_close(ref_output2, scripted(*input2), check_dtype=False)
+
+    def test_fx2ait_lower_avoids_copies(self):
+        class TestMod(torch.nn.Module):
+            def forward(self, x):
+                a = unsupported_op(x)
+                b = a.unsqueeze(0)
+                return unsupported_op(b)
+
+        mod = TestMod().half().cuda()
+        x = torch.randn((1,)).half().cuda()
+        ref_output = mod(x)
+        lowerer = AitLowerer.create(
+            LowerSettings(
+                workdir="/tmp",
+                name="test_ait_lower_avoids_copies",
+                min_acc_module_size=0,
+            )
+        )
+        lowered = lowerer(mod, [x])
+        lower_output = lowered(x)
+        torch.testing.assert_close(ref_output, lower_output, check_dtype=False)
+
+        children = list(lowered.named_children())
+        self.assertEqual(len(children), 1)
+        name, _ = children[0]
+        self.assertNotIn("_run_on_acc", name)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
new file mode 100644
index 000000000..d0d2a80b3
--- /dev/null
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -0,0 +1,85 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+import os
+import unittest
+
+import torch
+from fx2ait.acc_tracer import acc_tracer
+from fx2ait.ait_module import AITModule
+from fx2ait.fx2ait import AITInterpreter
+
+torch.ops.load_library("build/libait_model.so")
+
+
+class TestAITModule(unittest.TestCase):
+    def _test_fx2ait_impl(self, test_serialization=False, test_cuda_graph=False):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                sigmoid = torch.sigmoid(x)
+                add = sigmoid * sigmoid
+                return add
+
+        inputs = [torch.ones(2, 2).cuda().half()]
+        mod = TestModule().cuda().half()
+        ref_output = mod(*inputs)
+
+        traced = acc_tracer.trace(mod, inputs)
+
+        interp = AITInterpreter(traced, inputs, "./tmp", "test")
+        interp_result = interp.run()
+        ait_mod = AITModule(
+            torch.classes.ait.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float16,
+                1,  # num_runtimes
+            ),
+        )
+        ait_mod.engine.use_cuda_graph = test_cuda_graph
+        if test_serialization:
+            buf = io.BytesIO()
+            # Have to JIT-ify the module before we can save/load it.
+            ait_mod = torch.jit.trace(ait_mod, inputs)
+            script_output = ait_mod(*inputs)
+            torch.testing.assert_close(script_output, ref_output, atol=1e-2, rtol=1e-2)
+            torch.jit.save(ait_mod, buf)
+            buf.seek(0)
+            torch.classes.ait.AITModel.register_library_name_to_path_map(
+                {
+                    os.path.basename(
+                        interp_result.engine.lib_path
+                    ): interp_result.engine.lib_path
+                }
+            )
+            ait_mod = torch.jit.load(buf)
+        ait_output = ait_mod(*inputs)
+        torch.testing.assert_close(ait_output, ref_output, atol=1e-2, rtol=1e-2)
+
+    def test_fx2ait(self):
+        self._test_fx2ait_impl(test_serialization=False)
+
+    def test_fx2ait_module_serialization(self):
+        self._test_fx2ait_impl(test_serialization=True)
+
+    def test_fx2ait_cuda_graph(self):
+        self._test_fx2ait_impl(test_cuda_graph=True)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
new file mode 100644
index 000000000..13d5f42cf
--- /dev/null
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -0,0 +1,105 @@
+import unittest
+
+import torch
+from aitemplate.compiler.public import IntImm, IntVar
+from fx2ait.tensor_spec import TensorSpec
+from parameterized import parameterized
+
+
+class TestTensorSpec(unittest.TestCase):
+    def test_two_input_lists(self):
+        inputs1 = [
+            torch.empty([1, 3, 4], dtype=torch.float16),
+            torch.empty([5, 6], dtype=torch.int32),
+            torch.empty([7, 128, 9], dtype=torch.float16),
+        ]
+        inputs2 = [
+            torch.empty([32, 3, 4], dtype=torch.float16),
+            torch.empty([5, 6], dtype=torch.int32),
+            torch.empty([7, 1, 9], dtype=torch.float16),
+        ]
+
+        specs = TensorSpec.from_two_input_lists(inputs1, inputs2)
+
+        self.assertEqual(3, len(specs))
+        self.assertEqual(
+            TensorSpec(
+                [IntVar([1, 32], "dynamic_dim_0"), IntImm(3), IntImm(4)], torch.float16
+            ),
+            specs[0],
+        )
+        self.assertEqual(TensorSpec([IntImm(5), IntImm(6)], torch.int32), specs[1])
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(7), IntVar([1, 128], "dynamic_dim_1"), IntImm(9)], torch.float16
+            ),
+            specs[2],
+        )
+
+    @parameterized.expand(
+        [
+            ("single", [([10, 3, 4], torch.float16)]),
+            (
+                "multi",
+                [
+                    ([10, 3, 4], torch.float16),
+                    ([10, 6], torch.int32),
+                    ([10, 8, 9], torch.float16),
+                ],
+            ),
+            (
+                "different_bs_dim",
+                [
+                    ([10, 3, 4], torch.float16),
+                    ([10, 6], torch.int32),
+                    ([4, 10, 9], torch.float16),
+                ],
+            ),
+        ]
+    )
+    def test_input_list_with_batch_size(self, _, settings):
+        inputs = [torch.empty(setting[0], dtype=setting[1]) for setting in settings]
+        # Test case default batch_size = 10, avoid set other shape param with this value
+        batch_size = 10
+
+        specs = TensorSpec.from_input_list_with_batch_size(inputs, 32)
+        self.assertEqual(len(settings), len(specs))
+        for index, setting in enumerate(settings):
+            expected_shape = setting[0]
+            expected_spec = []
+            for shape in expected_shape:
+                if shape == batch_size:
+                    expected_spec.append(IntVar([1, 32], "batch_size"))
+                else:
+                    expected_spec.append(IntImm(shape))
+
+            self.assertEqual(
+                TensorSpec(expected_spec, setting[1]),
+                specs[index],
+            )
+
+    def test_input_list_with_batch_size_non_default_dim(self):
+        inputs = [
+            torch.empty([2, 10, 4], dtype=torch.float16),
+            torch.empty([5, 10], dtype=torch.int32),
+            torch.empty([7, 10, 9], dtype=torch.float16),
+        ]
+
+        specs = TensorSpec.from_input_list_with_batch_size(inputs, 32, 1)
+        self.assertEqual(3, len(specs))
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(2), IntVar([1, 32], "batch_size"), IntImm(4)], torch.float16
+            ),
+            specs[0],
+        )
+        self.assertEqual(
+            TensorSpec([IntImm(5), IntVar([1, 32], "batch_size")], torch.int32),
+            specs[1],
+        )
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(7), IntVar([1, 32], "batch_size"), IntImm(9)], torch.float16
+            ),
+            specs[2],
+        )
diff --git a/fx2ait/fx2ait/tools/ait_minimizer.py b/fx2ait/fx2ait/tools/ait_minimizer.py
new file mode 100644
index 000000000..08b8e60de
--- /dev/null
+++ b/fx2ait/fx2ait/tools/ait_minimizer.py
@@ -0,0 +1,86 @@
+import logging
+from typing import Any, Callable, List, Tuple
+
+import torch
+import torch.fx.passes.net_min_base as net_min_base
+
+from fx2ait.ait_module import AITModule
+from fx2ait.fx2ait import AITInterpreter, TensorSpec
+from torch.fx.passes.tools_common import Tensors
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+def lower_mod_default(
+    mod: torch.fx.GraphModule,
+    inputs: List[TensorSpec],
+    workdir: str,
+    name: str,
+    dll_name: str,
+) -> AITModule:
+    interp = AITInterpreter(mod, inputs, workdir, name, dll_name)
+    interpreter_result = interp.run()
+    res_mod = AITModule(
+        torch.classes.fb.AITModel(
+            interpreter_result.engine.lib_path,
+            interpreter_result.input_names,
+            interpreter_result.output_names,
+            torch.float16,
+            torch.float16,
+            1,  # num_runtimes
+        ),
+    )
+    return res_mod
+
+
+class AITMinizerSetting(net_min_base._MinimizerSettingBase):
+    def __init__(self):
+        super().__init__()
+
+
+class AITMinimizer(net_min_base._MinimizerBase):
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        sample_input: Tensors,
+        compare_fn: Callable[[Any, Any, Any], Tuple[float, bool]] = lambda a, b, c: (
+            torch.dist(a, b),
+            torch.allclose(a, b),
+        ),
+        settings: AITMinizerSetting = AITMinizerSetting(),
+        lower_fn: Callable[
+            [torch.fx.GraphModule, Tensors, str, str, str], AITModule
+        ] = lower_mod_default,
+        workdir: str = "./tmp/AITMinimizer",
+        name: str = "minimize_module",
+    ):
+        self.lower_fn = lower_fn
+        self.workdir = workdir
+        self.name = name
+        self.curr_iter = 0  # We use this counter to prevent duplicate .so naming
+        super().__init__(module, sample_input, compare_fn, settings)
+
+    def run_a(self, mod, inputs):
+        mod.eval()
+        with torch.no_grad():
+            return mod(*inputs)
+
+    def run_b(self, mod, inputs):
+        mod.eval()
+        dll_name = f"{self.name}_{self.curr_iter}.so"
+        self.curr_iter += 1
+        try:
+            mod = self.lower_fn(mod, inputs, self.workdir, self.name, dll_name)
+            output = mod(*inputs)
+        except RuntimeError as e:
+            raise net_min_base.FxNetMinimizerRunFuncError(
+                f"Encounter an error when processing \n{mod.graph}\n {e}"
+            )
+        else:
+            return output
+
+    def get_nodes(self, start=None, end=None, enable_print=False):
+        nodes = self._collect_nodes(start, end)
+        if enable_print:
+            _LOGGER.info(f"Nodes fetched from start {start} to end {end} as: {nodes}")
+        return nodes
diff --git a/fx2ait/fx2ait/tools/common_aten2ait.py b/fx2ait/fx2ait/tools/common_aten2ait.py
new file mode 100644
index 000000000..4c0409288
--- /dev/null
+++ b/fx2ait/fx2ait/tools/common_aten2ait.py
@@ -0,0 +1,399 @@
+import unittest
+
+import uuid
+from enum import Enum
+from typing import Callable, List, Optional, Set
+from unittest import TestCase
+
+# executorch
+import executorch.exir as exir
+import torch
+from aitemplate.compiler.public import DynamicProfileStrategy
+from executorch.exir import CaptureConfig, ServerCompileConfig
+
+from fx2ait.ait_module import AITModule
+from fx2ait.fx2ait import AITInterpreter
+
+from fx2ait.passes.lower_basic_pass_aten import (
+    compose_chunk,
+    replace_aten_op_with_indices,
+    replace_aten_reshape_alias_with_replace,
+    replace_builtin_ops,
+    replace_native_layernorm_with_layernorm,
+    replace_transpose_mm_op_with_linear,
+    run_const_fold,
+)
+from fx2ait.tensor_spec import TensorSpec
+
+torch.ops.load_library("//deeplearning/ait:AITModel")
+
+
+class LowerPrecision(Enum):
+    FP32 = "fp32"
+    FP16 = "fp16"
+    INT8 = "int8"
+
+
+def fetch_attr(mod, target):
+    """
+    Fetch an attribute from the ``Module`` hierarchy of ``mod.module``.
+
+    Args:
+        target (str): The fully-qualfiied name of the attribute to fetch
+
+    Return:
+        Any: The value of the attribute.
+    """
+    target_atoms = target.split(".")
+    attr_itr = mod
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
+            )
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Skip because CUDA is not available")
+class DispatchTestCase(TestCase):
+    def generate_graph(
+        self,
+        mod: torch.nn.Module,
+        original_inputs: List[torch.Tensor],
+        expected_ops: Set[Callable],
+        unexpected_ops: Optional[Set[Callable]] = None,
+        customized_passes: List[Callable] = None,
+    ):
+        # Torchdynamo+aot proxytensor tracer
+        # Below are common passes
+        passes_list = [
+            replace_aten_reshape_alias_with_replace,
+            replace_aten_op_with_indices,
+            replace_transpose_mm_op_with_linear,
+            replace_native_layernorm_with_layernorm,
+            compose_chunk,
+            replace_builtin_ops,
+        ]
+        # Combine with customized passes specific to any model
+        if customized_passes:
+            passes_list.extend(customized_passes)
+
+        fx_module = exir.capture(
+            mod,
+            tuple(original_inputs),
+            CaptureConfig(
+                pt2_mode=True,
+                enable_functionalization=False,
+                enable_dynamic_shape=True,
+            ),
+        )._to_server(ServerCompileConfig(passes=passes_list))
+
+        fx_module = run_const_fold(fx_module)
+        print(fx_module.graph)
+
+        if len(expected_ops):
+            self.assert_has_op(fx_module, expected_ops)
+        if unexpected_ops:
+            self.assert_unexpected_op(fx_module, unexpected_ops)
+
+        return fx_module
+
+    def run_test(
+        self,
+        mod: torch.nn.Module,
+        inputs: List[torch.Tensor],
+        expected_ops: Set[Callable],
+        unexpected_ops: Optional[Set[Callable]] = None,
+        rtol: float = 1e-02,
+        atol: float = 1e-02,
+        precision: LowerPrecision = LowerPrecision.FP16,
+        permute_inputs: Optional[List[int]] = None,
+        permute_outputs: Optional[List[int]] = None,
+        customized_passes: List[Callable] = None,
+    ):
+
+        mod.eval()
+        original_inputs = inputs
+        if permute_inputs:
+            inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+
+        fx_module = self.generate_graph(
+            mod, original_inputs, expected_ops, unexpected_ops, customized_passes
+        )
+
+        interp = AITInterpreter(
+            fx_module,
+            inputs,
+            "/tmp",
+            f"test-aten2ait-{uuid.uuid1()}",
+        )
+        interp_result = interp.run()
+        ait_mod_run = AITModule(
+            torch.classes.fb.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  #  num_runtimes
+            )
+        )
+
+        # Inference run and results comparison
+        with torch.no_grad():
+            # reference run
+            ref_outputs = mod(*original_inputs)
+            # ait run
+            cuda_inputs = []
+            for i in inputs:
+                cuda_inputs.append(i.cuda())
+            torch.cuda.synchronize()
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+            outputs = ait_mod_run(*cuda_inputs)
+            end_event.record()
+            torch.cuda.synchronize()
+            print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+
+            if isinstance(outputs, torch.Tensor):
+                ref_outputs = [ref_outputs]
+                outputs = [outputs]
+            for out, ref in zip(outputs, ref_outputs):
+                if not isinstance(ref, torch.Tensor):
+                    ref = torch.tensor([ref])
+                ref = ref.cpu()  # to_dtype test has cases with gpu output
+                if permute_outputs:
+                    out = out.permute(*permute_outputs)
+                torch.testing.assert_close(
+                    out.cpu(),
+                    ref,
+                    rtol=rtol,
+                    atol=atol,
+                    check_dtype=False,
+                    equal_nan=True,
+                )
+
+    def run_test_with_dynamic_shape(
+        self,
+        mod: torch.nn.Module,
+        inputs_spec: List[TensorSpec],
+        expected_ops: Set[Callable],
+        unexpected_ops: Optional[Set[Callable]] = None,
+        rtol: float = 1e-02,
+        atol: float = 1e-02,
+        precision: LowerPrecision = LowerPrecision.FP16,
+        permute_inputs: Optional[List[int]] = None,
+        permute_outputs: Optional[List[int]] = None,
+        customized_passes: List[Callable] = None,
+        dynamic_profile_strategy=DynamicProfileStrategy.MAX,
+        specify_num: Optional[float] = None,
+    ):
+        mod.eval()
+        inputs_list = []
+        for use_lower_bound in [True, False]:
+            inputs_list.append(
+                TensorSpec.create_inputs_from_specs(
+                    inputs_spec,
+                    use_lower_bound=use_lower_bound,
+                    specify_num=specify_num,
+                )
+            )
+        inputs = inputs_list[0]
+
+        fx_module = self.generate_graph(
+            mod, inputs, expected_ops, unexpected_ops, customized_passes
+        )
+
+        if permute_inputs:
+            for inp in inputs_spec:
+                shape = []
+                for i in permute_inputs:
+                    shape.append(inp.shape[i])
+                inp.shape = shape
+
+        interp = AITInterpreter(
+            fx_module,
+            inputs_spec,
+            "/tmp",
+            f"test-aten2ait-{uuid.uuid1()}",
+            dynamic_profile_strategy=dynamic_profile_strategy,
+        )
+        interp_result = interp.run()
+        ait_mod_run = AITModule(
+            torch.classes.fb.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  #  num_runtimes
+            )
+        )
+
+        for inputs in inputs_list:
+            with torch.no_grad():
+                ref_outputs = mod(*inputs)
+                # ait run
+                cuda_inputs = []
+                # reference run
+                if permute_inputs:
+                    inputs = [
+                        inp.permute(*permute_inputs).contiguous() for inp in inputs
+                    ]
+                for i in inputs:
+                    cuda_inputs.append(i.cuda())
+                torch.cuda.synchronize()
+                start_event = torch.cuda.Event(enable_timing=True)
+                end_event = torch.cuda.Event(enable_timing=True)
+                start_event.record()
+                outputs = ait_mod_run(*cuda_inputs)
+                end_event.record()
+                torch.cuda.synchronize()
+                print(
+                    "AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3)
+                )
+
+                if isinstance(outputs, torch.Tensor):
+                    ref_outputs = [ref_outputs]
+                    outputs = [outputs]
+                for out, ref in zip(outputs, ref_outputs):
+                    if not isinstance(ref, torch.Tensor):
+                        ref = torch.tensor([ref])
+                    ref = ref.cpu()  # to_dtype test has cases with gpu output
+                    if permute_outputs:
+                        out = out.permute(*permute_outputs)
+
+                    torch.testing.assert_close(
+                        out.cpu(),
+                        ref,
+                        rtol=rtol,
+                        atol=atol,
+                        check_dtype=False,
+                        equal_nan=True,
+                    )
+
+    def assert_has_op(self, mod, ops):
+        ops_in_mod = set()
+
+        for node in mod.graph.nodes:
+            if node.op == "call_module":
+                ops_in_mod.add(type(fetch_attr(mod, node.target)))
+            elif node.op in {"call_function", "call_method"}:
+                ops_in_mod.add(node.target)
+
+        self.assertTrue(
+            ops_in_mod >= ops, f"expected ops {ops}, actuall ops {ops_in_mod}"
+        )
+
+    def assert_unexpected_op(self, mod, ops):
+        for node in mod.graph.nodes:
+            if node.op == "call_module":
+                if type(fetch_attr(mod, node.target)) in ops:
+                    return False
+            elif node.op in {"call_function", "call_method"}:
+                if node.target in ops:
+                    return False
+        return True
+
+    def benchmark_function(
+        self,
+        name: str,
+        iters: int,
+        mod: torch.nn.Module,
+        inputs: List[torch.Tensor],
+        permute_inputs: Optional[List[int]] = None,
+        customized_passes: Optional[List[int]] = None,
+    ) -> float:
+
+        mod.eval()
+        original_inputs = inputs
+        if permute_inputs:
+            inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+
+        fx_module = self.generate_graph(
+            mod, original_inputs, {}, customized_passes=customized_passes
+        )
+
+        interp = AITInterpreter(
+            fx_module,
+            inputs,
+            "/tmp",
+            f"benchmark-fx2ait-{uuid.uuid1()}",
+        )
+
+        def benchmark(f, args):
+            torch.cuda.synchronize()
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            print("== Start benchmark iterations")
+            with torch.inference_mode():
+                start_event.record()
+                for _ in range(iters):
+                    f(*args)
+                end_event.record()
+            torch.cuda.synchronize()
+            print("== End benchmark iterations")
+            time_per_iter_ms = (start_event.elapsed_time(end_event) * 1.0e-3) / iters
+            return time_per_iter_ms
+
+        with torch.inference_mode():
+            interp_result = interp.run()
+            ait_mod = AITModule(
+                torch.classes.fb.AITModel(
+                    interp_result.engine.lib_path,
+                    interp_result.input_names,
+                    interp_result.output_names,
+                    torch.float16,
+                    torch.float,
+                    1,  #  num_runtimes
+                )
+            )
+            # Benchmark Pytorch Eager
+            # warmup
+            for _ in range(10):
+                mod(*original_inputs)
+            batch_size = inputs[0].shape[0]
+            pt_time_per_iter_ms = benchmark(mod, original_inputs)
+            pt_qps = batch_size / pt_time_per_iter_ms
+
+            # Benchmark FX2AIT
+            cuda_inputs = []
+            for i in inputs:
+                cuda_inputs.append(i.cuda())
+            # warmup
+            for _ in range(10):
+                ait_mod(*cuda_inputs)
+
+            ait_time_per_iter_ms = benchmark(ait_mod, cuda_inputs)
+            ait_qps = batch_size / ait_time_per_iter_ms
+
+            result = (
+                f"== Benchmark Result for: {name}\n"
+                f"BS: {batch_size}, "
+                f"PT Eager time per iter: {pt_time_per_iter_ms}ms, "
+                f"PT Eager QPS: {pt_qps:.2f}, "
+                f"FX2AIT time per iter: {ait_time_per_iter_ms}ms, "
+                f"FX2AIT Eager QPS: {ait_qps:.2f}, "
+                f"Speedup: {ait_qps/pt_qps:.2f}, "
+            )
+            with open("/tmp/bench_" + name + ".csv", "a") as f:
+                f.write(
+                    ",".join(
+                        map(
+                            str,
+                            [
+                                name,
+                                batch_size,
+                                pt_time_per_iter_ms,
+                                pt_qps,
+                                ait_time_per_iter_ms,
+                                ait_qps,
+                                ait_qps / pt_qps,
+                            ],
+                        )
+                    )
+                    + "\n"
+                )
+            return result
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
new file mode 100644
index 000000000..382942a66
--- /dev/null
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -0,0 +1,396 @@
+import time
+import unittest
+
+import uuid
+from enum import Enum
+from typing import Callable, List, Optional, Set
+from unittest import TestCase
+
+import torch
+from fx2ait.acc_tracer import acc_tracer
+from fx2ait.acc_tracer.ait_acc_normalizer import update_acc_op_mappers_for_ait
+from fx2ait.ait_module import AITModule
+from fx2ait.fx2ait import AITInterpreter
+from fx2ait.tensor_spec import TensorSpec
+
+OSS_AITModel = False
+try:
+    torch.ops.load_library("//deeplearning/ait:AITModel")
+    print("===Load non-OSS AITModel===")
+except Exception:
+    torch.ops.load_library("build/libait_model.so")
+    print("===Load OSS AITModel===")
+    OSS_AITModel = True
+
+
+class LowerPrecision(Enum):
+    FP32 = "fp32"
+    FP16 = "fp16"
+    INT8 = "int8"
+
+
+def fetch_attr(mod, target):
+    """
+    Fetch an attribute from the ``Module`` hierarchy of ``mod.module``.
+
+    Args:
+        target (str): The fully-qualfiied name of the attribute to fetch
+
+    Return:
+        Any: The value of the attribute.
+    """
+    target_atoms = target.split(".")
+    attr_itr = mod
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
+            )
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Skip because CUDA is not available")
+class AITTestCase(TestCase):
+    def setUp(self):
+        super().setUp()
+        torch.manual_seed(3)
+
+    def run_test(
+        self,
+        mod: torch.nn.Module,
+        inputs: List[torch.Tensor],
+        expected_ops: Set[Callable],
+        unexpected_ops: Optional[Set[Callable]] = None,
+        rtol: float = 1e-02,
+        atol: float = 1e-02,
+        precision: LowerPrecision = LowerPrecision.FP16,
+        permute_inputs: Optional[List[int]] = None,
+        permute_outputs: Optional[List[int]] = None,
+        transformer_mode: Optional[bool] = False,
+    ):
+        # TODO: add precision to interpreter once AIT supports multiple precision level
+        # TODO: @qxy11 remove permute options once AIT supports channels-first format
+        mod.eval()
+        mod = acc_tracer.trace(
+            mod,
+            inputs,
+            leaf_module_list=[
+                torch.nn.MultiheadAttention if transformer_mode else None
+            ],
+        )
+        print(mod)
+
+        original_inputs = inputs
+        if permute_inputs:
+            inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+        interp = AITInterpreter(
+            mod,
+            inputs,
+            "/tmp",
+            f"test-fx2ait-{uuid.uuid1()}",
+        )
+        with torch.no_grad():
+            cuda_inputs = []
+            for i in inputs:
+                cuda_inputs.append(i.cuda())
+
+            mod.eval()
+            if len(expected_ops):
+                self.assert_has_op(mod, expected_ops)
+            if unexpected_ops:
+                self.assert_unexpected_op(mod, unexpected_ops)
+            start = time.perf_counter()
+            interp_result = interp.run()
+            sec = time.perf_counter() - start
+            print("Interpreter run time(s):", sec)
+            if OSS_AITModel:
+                ait_mod = AITModule(
+                    torch.classes.ait.AITModel(
+                        interp_result.engine.lib_path,
+                        interp_result.input_names,
+                        interp_result.output_names,
+                        torch.float16,
+                        torch.float,
+                        1,  #  num_runtimes
+                    )
+                )
+            else:
+                ait_mod = AITModule(
+                    torch.classes.fb.AITModel(
+                        interp_result.engine.lib_path,
+                        interp_result.input_names,
+                        interp_result.output_names,
+                        torch.float16,
+                        torch.float,
+                        1,  #  num_runtimes
+                    )
+                )
+
+            ref_outputs = mod(*original_inputs)
+
+            torch.cuda.synchronize()
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+            outputs = ait_mod(*cuda_inputs)
+            end_event.record()
+            torch.cuda.synchronize()
+            print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+
+            # PyTorch Transformer model would yield 2 output tensors, of which the second one is
+            # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
+            if transformer_mode:
+                ref_outputs = ref_outputs[0]
+            if isinstance(outputs, torch.Tensor):
+                ref_outputs = [ref_outputs]
+                outputs = [outputs]
+
+            for out, ref in zip(outputs, ref_outputs):
+                if not isinstance(ref, torch.Tensor):
+                    ref = torch.tensor([ref])
+                ref = ref.cpu()  # to_dtype test has cases with gpu output
+                if permute_outputs:
+                    out = out.permute(*permute_outputs)
+                torch.testing.assert_close(
+                    out.cpu(),
+                    ref,
+                    rtol=rtol,
+                    atol=atol,
+                    check_dtype=False,
+                    equal_nan=True,
+                )
+
+    def run_test_with_dynamic_shape(
+        self,
+        mod: torch.nn.Module,
+        inputs_spec: List[TensorSpec],
+        expected_ops: Set[Callable],
+        unexpected_ops: Optional[Set[Callable]] = None,
+        rtol: float = 1e-02,
+        atol: float = 1e-02,
+        precision: LowerPrecision = LowerPrecision.FP16,
+    ):
+        mod.eval()
+        inputs_list = []
+        for use_lower_bound in [True, False]:
+            inputs_list.append(
+                TensorSpec.create_inputs_from_specs(
+                    inputs_spec, use_lower_bound=use_lower_bound
+                )
+            )
+
+        inputs_min = inputs_list[0]
+        inputs_max = inputs_list[1]
+        mod.eval()
+        mod = acc_tracer.trace(mod, inputs_min)
+        original_inputs = inputs_min
+        # Trace and test with inputs_min
+        interp = AITInterpreter(
+            mod,
+            inputs_spec,
+            "/tmp",
+            f"test-fx2ait-{uuid.uuid1()}",
+        )
+        with torch.no_grad():
+            cuda_inputs = []
+            for i in inputs_min:
+                cuda_inputs.append(i.cuda())
+
+            mod.eval()
+            if len(expected_ops):
+                self.assert_has_op(mod, expected_ops)
+            if unexpected_ops:
+                self.assert_unexpected_op(mod, unexpected_ops)
+            start = time.perf_counter()
+            interp_result = interp.run()
+            sec = time.perf_counter() - start
+            print("Interpreter run time(s):", sec)
+            if OSS_AITModel:
+                ait_mod = AITModule(
+                    torch.classes.ait.AITModel(
+                        interp_result.engine.lib_path,
+                        interp_result.input_names,
+                        interp_result.output_names,
+                        torch.float16,
+                        torch.float,
+                        1,  #  num_runtimes
+                    )
+                )
+            else:
+                ait_mod = AITModule(
+                    torch.classes.fb.AITModel(
+                        interp_result.engine.lib_path,
+                        interp_result.input_names,
+                        interp_result.output_names,
+                        torch.float16,
+                        torch.float,
+                        1,  #  num_runtimes
+                    )
+                )
+
+            ref_outputs = mod(*original_inputs)
+
+            torch.cuda.synchronize()
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+            outputs = ait_mod(*cuda_inputs)
+            end_event.record()
+            torch.cuda.synchronize()
+            print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+
+            if isinstance(outputs, torch.Tensor):
+                ref_outputs = [ref_outputs]
+                outputs = [outputs]
+            for out, ref in zip(outputs, ref_outputs):
+                if not isinstance(ref, torch.Tensor):
+                    ref = torch.tensor([ref])
+                ref = ref.cpu()  # to_dtype test has cases with gpu output
+
+                torch.testing.assert_close(
+                    out.cpu(), ref, rtol=rtol, atol=atol, check_dtype=False
+                )
+
+            # To test dynamic shape, we test it again with inputs_max
+            ref_outputs_max = mod(*inputs_max)
+            for i in inputs_max:
+                cuda_inputs_max = [i.cuda() for i in inputs_max]
+            outputs_max = ait_mod(*cuda_inputs_max)
+            if isinstance(outputs_max, torch.Tensor):
+                ref_outputs_max = [ref_outputs_max]
+                outputs_max = [outputs_max]
+            for out, ref in zip(outputs_max, ref_outputs_max):
+                if not isinstance(ref, torch.Tensor):
+                    ref = torch.tensor([ref])
+                ref = ref.cpu()  # to_dtype test has cases with gpu output
+
+                torch.testing.assert_close(
+                    out.cpu(), ref, rtol=rtol, atol=atol, check_dtype=False
+                )
+
+    def assert_has_op(self, mod, ops):
+        ops_in_mod = set()
+
+        for node in mod.graph.nodes:
+            if node.op == "call_module":
+                ops_in_mod.add(type(fetch_attr(mod, node.target)))
+            elif node.op in {"call_function", "call_method"}:
+                ops_in_mod.add(node.target)
+
+        self.assertTrue(
+            ops_in_mod >= ops, f"expected ops {ops}, actuall ops {ops_in_mod}"
+        )
+
+    def assert_unexpected_op(self, mod, ops):
+        for node in mod.graph.nodes:
+            if node.op == "call_module":
+                if type(fetch_attr(mod, node.target)) in ops:
+                    return False
+            elif node.op in {"call_function", "call_method"}:
+                if node.target in ops:
+                    return False
+        return True
+
+
+def benchmark_function(
+    name: str,
+    iters: int,
+    mod: torch.nn.Module,
+    inputs: List[torch.Tensor],
+    permute_inputs: Optional[List[int]] = None,
+) -> float:
+
+    mod.eval()
+    mod = acc_tracer.trace(
+        mod,
+        inputs,
+    )
+    original_inputs = inputs
+    if permute_inputs:
+        inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+    interp = AITInterpreter(
+        mod,
+        inputs,
+        "/tmp",
+        f"benchmark-fx2ait-{uuid.uuid1()}",
+    )
+
+    def benchmark(f, args):
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        print("== Start benchmark iterations")
+        with torch.inference_mode():
+            start_event.record()
+            for _ in range(iters):
+                f(*args)
+            end_event.record()
+        torch.cuda.synchronize()
+        print("== End benchmark iterations")
+        time_per_iter_ms = (start_event.elapsed_time(end_event) * 1.0e-3) / iters
+        return time_per_iter_ms
+
+    with torch.inference_mode():
+        interp_result = interp.run()
+        ait_mod = AITModule(
+            torch.classes.fb.AITModel(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float,
+                1,  #  num_runtimes
+            )
+        )
+        # Benchmark Pytorch Eager
+        # warmup
+        for _ in range(10):
+            mod(*original_inputs)
+        batch_size = inputs[0].shape[0]
+        pt_time_per_iter_ms = benchmark(mod, original_inputs)
+        pt_qps = batch_size / pt_time_per_iter_ms
+
+        # Benchmark FX2AIT
+        cuda_inputs = []
+        for i in inputs:
+            cuda_inputs.append(i.cuda())
+        # warmup
+        for _ in range(10):
+            ait_mod(*cuda_inputs)
+
+        ait_time_per_iter_ms = benchmark(ait_mod, cuda_inputs)
+        ait_qps = batch_size / ait_time_per_iter_ms
+
+        result = (
+            f"== Benchmark Result for: {name}\n"
+            f"BS: {batch_size}, "
+            f"PT Eager time per iter: {pt_time_per_iter_ms}ms, "
+            f"PT Eager QPS: {pt_qps:.2f}, "
+            f"FX2AIT time per iter: {ait_time_per_iter_ms}ms, "
+            f"FX2AIT Eager QPS: {ait_qps:.2f}, "
+            f"Speedup: {ait_qps/pt_qps:.2f}, "
+        )
+
+        with open("/tmp/bench_" + name + ".csv", "a") as f:
+            f.write(
+                ",".join(
+                    map(
+                        str,
+                        [
+                            name,
+                            batch_size,
+                            pt_time_per_iter_ms,
+                            pt_qps,
+                            ait_time_per_iter_ms,
+                            ait_qps,
+                            ait_qps / pt_qps,
+                        ],
+                    )
+                )
+                + "\n"
+            )
+        return result
+
+
+update_acc_op_mappers_for_ait()
diff --git a/fx2ait/fx2ait/utils.py b/fx2ait/fx2ait/utils.py
new file mode 100644
index 000000000..ef7fb3e7a
--- /dev/null
+++ b/fx2ait/fx2ait/utils.py
@@ -0,0 +1,17 @@
+from aitemplate.utils.torch_utils import torch_dtype_to_string
+
+
+def dtype_to_str(dtype):
+    if dtype is None:
+        return "float16"
+    return torch_dtype_to_string(dtype)
+
+
+def make_str_ait_friendly(s: str) -> str:
+    if s.isalnum():
+        ret = s
+    else:
+        ret = "".join(c if c.isalnum() else "_" for c in s)
+    if ret[0].isdigit():
+        ret = "_" + ret
+    return ret
diff --git a/fx2ait/setup.py b/fx2ait/setup.py
new file mode 100644
index 000000000..7d6ba9de0
--- /dev/null
+++ b/fx2ait/setup.py
@@ -0,0 +1,95 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name):
+        Extension.__init__(self, name, sources=[])
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            subprocess.check_output(["cmake", "--version"])
+        except OSError as exc:
+            raise RuntimeError(
+                "CMake must be installed to build the following extensions: "
+                + ", ".join(e.name for e in self.extensions)
+            ) from exc
+
+        try:
+            import torch.utils
+
+            cmake_prefix_path = torch.utils.cmake_prefix_path
+        except ModuleNotFoundError as exc:
+            raise RuntimeError(
+                "Cannot import torch.utils. Check torch installation."
+            ) from exc
+
+        build_directory = os.path.abspath(self.build_temp)
+        cmake_args = [
+            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + build_directory,
+            "-DPYTHON_EXECUTABLE=" + sys.executable,
+            "-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
+        ]
+
+        cfg = "Debug" if self.debug else "Release"
+        build_args = ["--config", cfg]
+
+        # cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+
+        # Assuming Makefiles
+        build_args += ["--", "-j2"]
+
+        self.build_args = build_args
+
+        env = os.environ.copy()
+        env["CXXFLAGS"] = '{} -DVERSION_INFO=\\"{}\\"'.format(
+            env.get("CXXFLAGS", ""), self.distribution.get_version()
+        )
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        # CMakeLists.txt is in the same directory as this setup.py file
+        cmake_list_dir = os.path.abspath(os.path.dirname(__file__))
+        print("-" * 10, "Running CMake prepare", "-" * 40)
+        subprocess.check_call(
+            ["cmake", cmake_list_dir] + cmake_args, cwd=self.build_temp, env=env
+        )
+
+        print("-" * 10, "Building extensions", "-" * 40)
+        cmake_cmd = ["cmake", "--build", "."] + self.build_args
+        subprocess.check_call(cmake_cmd, cwd=self.build_temp)
+        # Move from build temp to final position
+        for ext in self.extensions:
+            self.move_output(ext)
+
+    def move_output(self, ext):
+        build_temp = Path(self.build_temp).resolve()
+        lib_name = "lib" + ext.name + ".so"
+        dest_path = build_temp.parents[0] / lib_name
+        source_path = build_temp / lib_name
+        dest_directory = dest_path.parents[0]
+        dest_directory.mkdir(parents=True, exist_ok=True)
+        self.copy_file(source_path, dest_path)
+
+
+ext_modules = [
+    CMakeExtension("ait_model"),
+]
+
+setup(
+    name="fx2ait",
+    version="0.2.dev1",
+    description="FX2AIT: Convert PyTorch Models to AITemplate",
+    zip_safe=True,
+    install_requires=["torch"],  # We will need torch>=1.13
+    packages=find_packages(),
+    ext_modules=ext_modules,
+    cmdclass=dict(build_ext=CMakeBuild),
+)
diff --git a/python/aitemplate/__init__.py b/python/aitemplate/__init__.py
index 9adca1347..99f5e468c 100644
--- a/python/aitemplate/__init__.py
+++ b/python/aitemplate/__init__.py
@@ -23,4 +23,4 @@
 
 __all__ = ["backend", "compiler", "frontend", "testing", "utils"]
 
-root_logger = utils.logger.setup_logger(__name__)
+root_logger = utils.misc.setup_logger(__name__)
diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 62fd07ade..7ca35a719 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -18,7 +18,7 @@
 
 from dataclasses import dataclass, field
 
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
 import jinja2
 
@@ -53,6 +53,8 @@ class GPUBackendSpec(BackendSpec):
     dtype_to_backend_dtype: Dict[str, str] = field(
         default_factory=lambda: {
             "float16": "half",
+            "bfloat16": "bfloat16",
+            "float32": "float",
             "float": "float",
             "int64": "int64_t",
         }
@@ -65,24 +67,12 @@ class GPUBackendSpec(BackendSpec):
         }
     )
 
-    read_num_elements_to_backend_type: List[Tuple[int, str]] = field(
-        default_factory=lambda: [
-            (8, "uint4"),
-            (4, "uint2"),
-            (2, "uint"),
-            (1, "half"),
-        ]
-    )
-    op_num_elements_to_backend_type: List[Tuple[int, str]] = field(
-        default_factory=lambda: [
-            (2, "half2"),
-            (1, "half"),
-        ]
-    )
     op_type_priority_list: List[str] = field(
         default_factory=lambda: [
             "half2",
             "half",
+            "bfloat16_2",
+            "bfloat16",
             "float",
         ]
     )
@@ -90,141 +80,246 @@ class GPUBackendSpec(BackendSpec):
         default_factory=lambda: {
             FuncEnum.ADD: {
                 "half2": "__hadd2",
+                "bfloat16_2": "__hadd2",
                 "half": "__hadd",
+                "bfloat16": "__hadd",
                 "float": "__fadd_rn",
             },
             FuncEnum.SUB: {
                 "half2": "__hsub2",
+                "bfloat16_2": "__hsub2",
                 "half": "__hsub",
+                "bfloat16": "__hsub",
                 "float": "__fsub_rn",
             },
             FuncEnum.MUL: {
                 "half2": "__hmul2",
+                "bfloat16_2": "__hmul2",
                 "half": "__hmul",
+                "bfloat16": "__hmul",
                 "float": "__fmul_rn",
             },
             FuncEnum.DIV: {
                 "half2": "__h2div",
+                "bfloat16_2": "__h2div",
                 "half": "__hdiv",
+                "bfloat16": "__hdiv",
                 "float": "__fdiv_rn",
             },
             FuncEnum.COS: {
                 "half2": "h2cos",
+                "bfloat16_2": "h2cos",
                 "half": "hcos",
+                "bfloat16": "hcos",
                 "float": "cosf",
             },
             FuncEnum.SIN: {
                 "half2": "h2sin",
+                "bfloat16_2": "h2sin",
                 "half": "hsin" if Target.current().name() == "cuda" else "hsin_custom",
+                "bfloat16": "hsin"
+                if Target.current().name() == "cuda"
+                else "hsin_custom",
                 "float": "sinf",
             },
             FuncEnum.TANH: {
                 "half2": "fast_tanh",
+                "bfloat16_2": "fast_tanh",
                 "half": "fast_tanh",
+                "bfloat16": "fast_tanh",
                 "float": "tanh",
             },
             FuncEnum.ABS: {
                 "half2": "__habs2",
+                "bfloat16_2": "__habs2",
                 "half": "__habs",
+                "bfloat16": "__habs",
                 "float": "fabsf",
             },
             FuncEnum.LOGE: {
                 "half2": "h2log",
+                "bfloat16_2": "h2log",
                 "half": "hlog",
+                "bfloat16": "hlog",
                 "float": "logf",
             },
             FuncEnum.EXP: {
                 "half2": "h2exp",
+                "bfloat16_2": "h2exp",
                 "half": "hexp",
+                "bfloat16": "hexp",
                 "float": "expf",
             },
             FuncEnum.SQRT: {
                 "half2": "h2sqrt",
+                "bfloat16_2": "h2sqrt",
                 "half": "hsqrt",
+                "bfloat16": "hsqrt",
                 "float": "sqrtf",
             },
             FuncEnum.MAX: {
                 "half2": "hmax2_nan",
+                "bfloat16_2": "hmax2_nan",
                 "half": "hmax_nan",
+                "bfloat16": "hmax_nan",
                 "float": "fmaxf_nan",
             },
             FuncEnum.MIN: {
                 "half2": "hmin2_nan",
+                "bfloat16_2": "hmin2_nan",
                 "half": "hmin_nan",
+                "bfloat16": "hmin_nan",
                 "float": "fminf_nan",
             },
             FuncEnum.SIGN: {
                 "half2": "h2sign_custom",
+                "bfloat16_2": "h2sign_custom",
                 "half": "sign_custom<half>",
+                "bfloat16": "sign_custom<bfloat16>",
                 "float": "sign_custom<float>",
             },
             FuncEnum.SIGMOID: {
                 "half2": "h2sigmoid_custom",
+                "bfloat16_2": "h2sigmoid_custom",
                 "half": "hsigmoid_custom",
+                "bfloat16": "hsigmoid_custom",
                 "float": "fsigmoid_custom",
             },
             FuncEnum.LRELU: {
                 "half2": "leaky_relu",
+                "bfloat16_2": "leaky_relu",
                 "half": "leaky_relu",
+                "bfloat16": "leaky_relu",
                 "float": "leaky_relu",
             },
             FuncEnum.HARDTANH: {
                 "half2": "h2hard_tanh",
-                "half": "hard_tanh<half>",
-                "float": "hard_tanh<float>",
+                "bfloat16_2": "h2hard_tanh",
+                "half": "hard_tanh",
+                "float": "hard_tanh",
+                "bfloat16": "hard_tanh",
+            },
+            FuncEnum.RELU: {
+                "half2": "relu",
+                "bfloat16_2": "relu",
+                "half": "relu",
+                "bfloat16": "relu",
+                "float": "relu",
             },
-            FuncEnum.RELU: {"half2": "relu", "half": "relu", "float": "relu"},
             FuncEnum.NAN_TO_NUM: {
                 "half2": "nan_to_num",
+                "bfloat16_2": "nan_to_num",
                 "half": "nan_to_num",
+                "bfloat16": "nan_to_num",
                 "float": "nan_to_num",
             },
             FuncEnum.CLAMP_NAN_TO_NUM: {
                 "half2": "clamp_nan_to_num",
+                "bfloat16_2": "clamp_nan_to_num",
                 "half": "clamp_nan_to_num",
+                "bfloat16": "clamp_nan_to_num",
                 "float": "clamp_nan_to_num",
             },
             FuncEnum.SILU: {
                 "half2": "h2silu",
+                "bfloat16_2": "h2silu",
                 "half": "hsilu",
+                "bfloat16": "hsilu",
                 "float": "fsilu",
             },
             FuncEnum.POW: {
                 "half2": "h2pow",
+                "bfloat16_2": "h2pow",
                 "half": "hpow",
+                "bfloat16": "hpow",
                 "float": "fpow",
             },
             FuncEnum.GELU: {
                 "half": "hgelu",
+                "bfloat16": "hgelu",
                 "float": "fgelu",
             },
             FuncEnum.FASTGELU: {
                 "half": "h_fast_gelu",
+                "bfloat16": "h_fast_gelu",
                 "float": "f_fast_gelu",
             },
             FuncEnum.SOFTPLUS: {
                 "half2": "h2softplus",
+                "bfloat16_2": "h2softplus",
                 "half": "hsoftplus",
+                "bfloat16": "hsoftplus",
                 "float": "fsoftplus",
             },
+            FuncEnum.ELU: {
+                "half2": "h2elu",
+                "bfloat16_2": "h2elu",
+                "half": "helu",
+                "bfloat16": "helu",
+                "float": "felu",
+            },
         }
     )
 
-    def get_backend_type(
+    def get_elementwise_op_backend_type(
         self,
         num_elements: int,
         dtype: str,
-        num_elements_to_backend_type_list: List[Tuple[int, str]],
     ) -> str:
-        if dtype not in ("float16", "float"):
+        """
+        Get a backend type execution in elementwise ops.
+        For example, if we're dealing with fp16, we might be able to use half2 if num_elements is divisible by 2.
+        """
+        if dtype in ("float", "float32"):
+            return "float"
+        elif dtype == "float16":
+            if num_elements % 2 == 0:
+                return "half2"
+            else:
+                return "half"
+        elif dtype == "bfloat16":
+            if num_elements % 2 == 0:
+                return "bfloat16_2"
+            else:
+                return "bfloat16"
+        raise NotImplementedError("Unsupported dtype {}!".format(dtype))
+
+    def get_elementwise_read_backend_type(
+        self,
+        num_elements: int,
+        dtype: str,
+    ) -> str:
+        """
+        Get a backend type for reading in elementwise ops.
+        For example, if we're dealing with fp16 and num_elements is divisible by 8,
+        we can use uint4.
+        """
+        if dtype in ("float", "float32"):
+            num_elems_to_backend_type = ((4, "uint4"), (2, "uint2"), (1, "float"))
+
+        elif dtype == "float16":
+            num_elems_to_backend_type = (
+                (8, "uint4"),
+                (4, "uint2"),
+                (2, "uint"),
+                (1, "half"),
+            )
+        elif dtype == "bfloat16":
+            num_elems_to_backend_type = (
+                (8, "uint4"),
+                (4, "uint2"),
+                (2, "uint"),
+                (1, "bfloat16"),
+            )
+        else:
             raise NotImplementedError("Unsupported dtype {}!".format(dtype))
-        for alignment, backend_type in num_elements_to_backend_type_list:
-            if num_elements % alignment == 0:
-                return backend_type
+
+        for mod, dtype in num_elems_to_backend_type:
+            if num_elements % mod == 0:
+                return dtype
+
         raise RuntimeError(
-            "Failed to infer data type! num_elements: {}, num_elements_to_backend_type_list: {}".format(
-                num_elements, num_elements_to_backend_type_list
-            )
+            f"Failed to infer data type due to invalid num elems to backend type mapping: {num_elems_to_backend_type}"
         )
 
     def get_candidate_op_types(self, op_t: str) -> List[str]:
@@ -261,6 +356,7 @@ class ROCMSpec(GPUBackendSpec):
     stream = "stream"
     cub = "hipcub"
 
+    cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
     cast_to_const_half_ptr_template = jinja2.Template(
         "reinterpret_cast<const half*>({{name}})"
@@ -277,6 +373,7 @@ class ROCMSpec(GPUBackendSpec):
     dtype_to_ck_type: Dict[str, str] = field(
         default_factory=lambda: {
             "float16": "ck::half_t",
+            "float32": "float",
             "float": "float",
         }
     )
@@ -293,6 +390,7 @@ class CUDASpec(GPUBackendSpec):
     stream = "stream"
     cub = "cub"
 
+    cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
     cast_to_const_half_ptr_template = jinja2.Template(
         "reinterpret_cast<const half*>({{name}})"
@@ -300,6 +398,11 @@ class CUDASpec(GPUBackendSpec):
     header_src_template = jinja2.Template(
         """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+using bfloat16 = nv_bfloat16;
+using bfloat16_2 = nv_bfloat162;
+
 {{extra_header}}
         """
     )
@@ -308,6 +411,8 @@ class CUDASpec(GPUBackendSpec):
     dtype_to_cutlass_type: Dict[str, str] = field(
         default_factory=lambda: {
             "float16": "cutlass::half_t",
+            "bfloat16": "cutlass::bfloat16_t",
+            "float32": "float",
             "float": "float",
         }
     )
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index bd0b8c4eb..a4a2fbd3b 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -18,27 +18,118 @@
 
 from __future__ import annotations
 
+import logging
 import multiprocessing
 
 import os
-import pathlib
 import re
 import shlex
 import subprocess
 import typing
+from hashlib import sha1
+from pathlib import Path
 from typing import Optional
 
 import jinja2
 
-from ..utils import logger
+from ..utils.misc import is_debug
 from .target import Target
 from .task_runner import BaseRunner, Task
 
 # pylint: disable=W0221,C0103
 
 
-def _run_make_cmds(cmds, timeout):
-    logger.debug(__name__, f"make {cmds=}")
+_LOGGER = logging.getLogger(__name__)
+
+
+def _augment_for_trace(cmd):
+    return (
+        'date +"{{\\"name\\": \\"$@\\", \\"ph\\": \\"B\\", \\"pid\\": \\"$$$$\\", \\"ts\\": \\"%s%6N\\"}},";'
+        " {}; "
+        'date +"{{\\"name\\": \\"$@\\", \\"ph\\": \\"E\\", \\"pid\\": \\"$$$$\\", \\"ts\\": \\"%s%6N\\"}},";'
+    ).format(cmd)
+
+
+def _log_error_context(
+    stderr,
+    build_dir,
+    context_radius=10,
+    max_errors_per_file=5,
+    padding=5,
+):
+    path_to_error_lines = {}
+    for line in [L for L in stderr.split("\n") if ": error:" in L]:
+        match = re.search(r"(.+)\((\d+)\): error:.*", line)
+        if match:
+            path = match[1]
+            error_line = match[2]
+            if path not in path_to_error_lines:
+                path_to_error_lines[path] = set()
+            # nvcc line numbers are 1-based
+            error_line = int(error_line) - 1
+            path_to_error_lines[path].add(error_line)
+
+    # keep only the first N error lines per file
+    path_to_error_lines = {
+        path: sorted(error_lines)[:max_errors_per_file]
+        for path, error_lines in path_to_error_lines.items()
+    }
+
+    path_to_visible_lines = {}
+    for path, error_lines in path_to_error_lines.items():
+        path_to_visible_lines[path] = set()
+        for error_line in error_lines:
+            # collect the context lines around each error line
+            context = range(
+                error_line - context_radius,
+                error_line + context_radius + 1,
+            )
+            path_to_visible_lines[path].update(list(context))
+
+    for path, visible_lines in path_to_visible_lines.items():
+        full_path = os.path.join(build_dir, path)
+        if os.path.exists(full_path):
+            # read the lines from the file
+            with open(full_path, "r") as f:
+                # each line ends with '\n'
+                file_lines = f.readlines()
+            # except maybe the last line
+            if file_lines and not file_lines[-1].endswith("\n"):
+                file_lines[-1] = f"{file_lines[-1]}\n"
+            num_file_lines = len(file_lines)
+
+            error_lines = path_to_error_lines[path]
+            visible_lines = sorted(visible_lines)
+
+            lines_to_show = []
+            last_printed_i = -1
+            for i in visible_lines:
+                if i < 0 or i >= num_file_lines:
+                    # skip the line number as extraneous
+                    continue
+                if i - last_printed_i > 1:
+                    # preceding ellipsis
+                    lines_to_show.append("...\n")
+                line = file_lines[i]
+                lines_to_show.append(f"{i+1:<{padding}} {line}")
+                if i in error_lines:
+                    # mark the line as an error line: underscore
+                    spaces = line[: len(line) - len(line.lstrip())]
+                    underscore = spaces + "^" * (len(line) - len(spaces) - 1)
+                    lines_to_show.append(f"{' ' * padding} {underscore}\n")
+                last_printed_i = i
+            if visible_lines[-1] < num_file_lines - 1:
+                # closing ellipsis
+                lines_to_show.append("...\n")
+
+            if lines_to_show:
+                # all lines_to_show end with '\n'
+                summary = "".join(lines_to_show)
+                _LOGGER.info(f"{path}:\n\n{summary}")
+
+
+def _run_make_cmds(cmds, timeout, build_dir):
+    _LOGGER.debug(f"make {cmds=}")
     proc = subprocess.Popen(
         [" && ".join(cmds)],
         shell=True,
@@ -53,12 +144,18 @@ def _run_make_cmds(cmds, timeout):
         out, err = proc.communicate()
         raise e
     finally:
+        stdout = out.decode()
+        stderr = err.decode()
         if proc.returncode != 0:
-            # Let's always print out more info upon any failures.
-            logger_f = logger.info
+            _LOGGER.info(f"make stdout:\n\n{stdout}")
+            _LOGGER.info(f"make stderr:\n\n{stderr}")
+
+            _log_error_context(stderr, build_dir)
+
+            raise RuntimeError("Build has failed.")
         else:
-            logger_f = logger.debug
-        logger_f(__name__, f"make stdout: {out.decode()}\nmake stderr: {err.decode()}")
+            _LOGGER.debug(f"make stdout:\n\n{stdout}")
+            _LOGGER.debug(f"make stderr:\n\n{stderr}")
 
 
 def process_task(task: Task) -> None:
@@ -75,16 +172,14 @@ def process_task(task: Task) -> None:
     stderr = task._stderr
     if task._proc.returncode != 0:
         task._failed = True
-        logger.info(
-            __name__,
+        _LOGGER.info(
             "Failed: [{name}]\ncmd:\n{cmd}\nstderr:\n{stderr}\nstdout:{stdout}".format(
                 name=task._name, cmd=task._cmd, stderr=stderr, stdout=stdout
             ),
         )
         task._ret = -1
     else:
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             "Successful: [{name}]\ncmd:\n{cmd}\nstderr:\n{stderr}\nstdout:{stdout}".format(
                 name=task._name, cmd=task._cmd, stderr=stderr, stdout=stdout
             ),
@@ -126,8 +221,7 @@ def __init__(self, devs: list[int], timeout: int = 10):
             Compiling timeout, by default 10 (seconds)
         """
         super().__init__(devs, "builder", timeout)
-        logger.info(
-            __name__,
+        _LOGGER.info(
             "Using {n} CPU for building".format(n=devs),
         )
         self._ftask_proc = process_task
@@ -184,6 +278,7 @@ def __init__(self, n_jobs: int = -1, timeout: int = 180) -> None:
         self._runner = Runner(n_jobs, timeout)
         self._n_jobs = n_jobs
         self._timeout = timeout
+        self._do_trace = os.environ.get("AIT_TRACE_MAKE", False)
 
     def build_objs(
         self,
@@ -206,15 +301,15 @@ def build_objs(
         """
         for idx, fpair in enumerate(files):
             src, target = fpair
-            logger.info(__name__, "Building " + target)
+            _LOGGER.info("Building " + target)
             if src.endswith(".bin"):
                 if binary_cc_cmd is None:
                     raise ValueError(
                         "Cannot compile .bin file without specifying binary_cc_cmd!"
                     )
 
-                src_path = pathlib.Path(src)
-                target_path = pathlib.Path(target)
+                src_path = Path(src)
+                target_path = Path(target)
                 compile_cmd = binary_cc_cmd.format(
                     target=target_path.name, src=src_path.name
                 )
@@ -222,7 +317,7 @@ def build_objs(
                 # Have to cd into the containing dir so ld doesn't include
                 # the path in the symbol names; unfortunately, there's no other
                 # way to control this.
-                if logger.is_debug():
+                if is_debug():
                     cmd = f"cd {containing_dir} && {compile_cmd} && cd -"
                 else:
                     # If not in debug mode, remove the original .bin file which can potentially be quite large.
@@ -230,7 +325,7 @@ def build_objs(
             else:
                 cmd = cc_cmd.format(target=target, src=src)
 
-            logger.debug(__name__, f"The cmd for building {target} is : {cmd}")
+            _LOGGER.debug(f"The cmd for building {target} is : {cmd}")
             self._runner.push(idx, cmd, target)
         self._runner.join()
         self._runner.pull()
@@ -245,7 +340,7 @@ def build_so(self, target: Target, objs: list[str]):
         objs : list[str]
             List of all object file paths for building the dynamic library.
         """
-        logger.info(__name__, "Building " + target)
+        _LOGGER.info("Building " + target)
         cc = Target.current().cc()
         compile_options = Target.current().compile_options()
         fpic = "-fPIC"
@@ -258,7 +353,7 @@ def build_so(self, target: Target, objs: list[str]):
             + compile_options
             + " -o {target} {objs}".format(target=target, objs=" ".join(objs))
         )
-        logger.debug(__name__, f"The cmd for building {target} is {cmd}")
+        _LOGGER.debug(f"The cmd for building {target} is {cmd}")
         self._runner.push(0, cmd, target)
         self._runner.join()
         self._runner.pull()
@@ -282,7 +377,7 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
 all: {{target}}
 
 {{target}}: $(obj_files)
-    $(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)
+    {{build_so_cmd}}
 
 clean:
     rm -f *.obj {{target}} test.so
@@ -291,7 +386,7 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
     rm -f constants.bin
 """
         )
-
+        build_so_cmd = "$(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)"
         obj_files = [pair[1].split("/")[-1] for pair in file_pairs]
         obj_files = " ".join(obj_files)
 
@@ -304,11 +399,17 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
 
         cfile_cmd = Target.current().compile_cmd(False).format(target="$@", src="$<")
         bfile_cmd = Target.current().binary_compile_cmd()
+
         if not bfile_cmd:
             bfile_cmd = ""
         else:
             bfile_cmd = bfile_cmd.format(target="$@", src="$<")
 
+        if self._do_trace:
+            cfile_cmd = _augment_for_trace(cfile_cmd)
+            bfile_cmd = _augment_for_trace(bfile_cmd)
+            build_so_cmd = _augment_for_trace(build_so_cmd)
+
         makefile_str = makefile_template.render(
             cc=cc,
             cpp=cpp,
@@ -318,6 +419,7 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
             target=dll_name,
             cfile_cmd=cfile_cmd,
             bfile_cmd=bfile_cmd,
+            build_so_cmd=build_so_cmd,
         )
 
         dumpfile = os.path.join(workdir, test_name, "Makefile")
@@ -325,36 +427,286 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
             # fix the makefile indentation
             f.write(re.sub("^    ", "\t", makefile_str, flags=re.M))
 
+    @staticmethod
+    def _combine_profiler_multi_sources():
+        """Whether to combine multiple profiler sources per target."""
+        return bool(int(os.environ.get("COMBINE_PROFILER_MULTI_SOURCES", 1)))
+
+    @staticmethod
+    def _force_one_profiler_source_per_target():
+        """Whether to combine multiple profiler sources per target into one."""
+        return bool(int(os.environ.get("FORCE_ONE_PROFILER_SOURCE_PER_TARGET", 0)))
+
+    def _combine_sources(self, sources):
+        """
+        Combine multiple source files (given by path) into one
+        source file and return the path of the combined file.
+
+        Parameters
+        ----------
+        sources : Iterable[str]
+            The list of paths to the source files to combine.
+
+        Returns
+        -------
+        path : str
+            The path to the combined source file.
+        """
+        assert len(sources) > 0, "Must have at least one source"
+        if len(sources) == 1:
+            # no need to combine a single source
+            return next(iter(sources))
+
+        file_lines = []
+        for source in sources:
+            with open(source, "r") as f:
+                lines = f.readlines()
+            for line in lines:
+                if line.strip():
+                    # collect the original non-empty lines
+                    file_lines.append(line)
+            # the last line might not end with "\n"
+            file_lines.append("\n")
+
+        # generate a new file name conditioned on the list of the source file names
+        file_name = sha1((";".join(sorted(sources))).encode("utf-8")).hexdigest()
+        file_dir = Path(next(iter(sources))).parents[0]  # fetch the directory
+        file_path = file_dir / Path(f"temp_{file_name}.cu")
+        with open(file_path, "w") as f:
+            # file_lines end with "\n" already
+            f.write("".join(file_lines))
+
+        # return the path starting with "./"
+        return os.path.join(".", str(file_path))
+
+    def _combine_profiler_sources(self, target_to_sources, num_builders):
+        """
+        Combine multiple profiler sources generated for different targets
+        to optimize the overall compilation time, given the available number
+        of builders (CPUs). The total number of sources (across all targets)
+        is set equal to the `num_builders`. Single-source targets are kept
+        as is; multi-source targetss' sources are possibly combined.
+
+        Simplifying assumptions:
+
+            - Individual split (multiple) sources per target take
+              approximately equal time to compile across different
+              targets (this is, in particular, not true for the main
+              profiler source file vs kernel-specific source files:
+              the former is typically larger than the latter);
+            - Compilation time grows linearly in the number of
+              separate sources combined into a single file.
+
+        Parameters
+        ----------
+        target_to_soruces : dict[str, Iterable[str]]
+            The mapping from each target name to the list of sources
+            required to compile this target. There can be one or more
+            sources for each target.
+        num_builders : int
+            The number of available builders (CPUs).
+
+        Returns
+        ----------
+        target_to_combined_sources : dict[str, Iterable[str]]
+            Like `target_to_sources`, but with some of the source paths
+            in the values replaced by the paths to the respective combined
+            source files. Whether and which of the sources are combined
+            depends on the arguments.
+        """
+        num_total_sources = num_builders
+
+        if (
+            len(target_to_sources) >= num_total_sources
+            or self._force_one_profiler_source_per_target()
+        ):
+            # there are at least as many targets as the total
+            # number of sources required (or single source per
+            # target is forced): combine everything
+            return {
+                target: [self._combine_sources(sources)]
+                for target, sources in target_to_sources.items()
+            }
+
+        combine_candidates = {}  # multi-source targets
+        num_multi_sources, num_single_sources = 0, 0
+        for target, sources in target_to_sources.items():
+            if len(sources) > 1:
+                combine_candidates[target] = sources
+                num_multi_sources += len(sources)
+            else:
+                num_single_sources += 1
+
+        if num_multi_sources == 0:
+            # all targets are single-source: nothing to combine
+            return target_to_sources
+        if num_multi_sources + num_single_sources <= num_total_sources:
+            # there are fewer source files than the total
+            # number of sources required: no need to combine
+            return target_to_sources
+
+        # number of sources we need for the multi-file targets
+        num_combined_sources = num_total_sources - num_single_sources
+        num_sources_per_target = {
+            # the number of combined sources per multi-source target as a
+            # fraction of num_combined_sources is proportional to the number of
+            # multiple sources of the target (rounded down); ultimately, there
+            # should be at least one source target (hence max(..., 1))
+            target: max(int(len(sources) / num_multi_sources * num_combined_sources), 1)
+            for target, sources in combine_candidates.items()
+        }
+
+        # do any sources remain after the above per-target distribution?
+        remaining_sources = num_combined_sources - sum(num_sources_per_target.values())
+        if remaining_sources > 0:
+            # reverse-sort the targets by the remainder after rounding down:
+            # prefer adding sources to the targets with a higher remainder
+            # (i.e. the ones closest to getting another source)
+            targets = sorted(
+                num_sources_per_target.keys(),
+                key=lambda target: (
+                    (
+                        len(target_to_sources[target])
+                        / num_multi_sources
+                        * num_combined_sources
+                    )
+                    - int(
+                        len(target_to_sources[target])
+                        / num_multi_sources
+                        * num_combined_sources
+                    )
+                ),
+                reverse=True,
+            )
+            target_id = 0
+            while remaining_sources > 0:
+                # increment the number of sources for the target
+                num_sources_per_target[targets[target_id]] += 1
+                target_id = (target_id + 1) % len(targets)
+                remaining_sources -= 1
+
+        result = {}
+        for target in target_to_sources:
+            if target in combine_candidates:
+                # collect the sources of the target
+                # in N batches by round robin
+                num_sources = num_sources_per_target[target]
+                # TODO: form the source batches by the total number
+                # of lines instead of the number of sources for more
+                # even distribution of the compilation time per batch
+                batch_id = 0
+                batches = [[] for _ in range(num_sources)]
+                for source in target_to_sources[target]:
+                    batches[batch_id].append(source)
+                    batch_id = (batch_id + 1) % num_sources
+                # conbine the sources in each batch
+                result[target] = [self._combine_sources(b) for b in batches]
+            else:
+                # use the single-source profiler target as is
+                result[target] = target_to_sources[target]
+        return result
+
     def _gen_makefile_for_profilers(self, file_pairs, profiler_dir):
         makefile_template = jinja2.Template(
             """
-programs = {{programs}}
-all: $(programs)
+all: {{targets}}
 .PHONY: all clean
 
-$(programs): %: %.{{cpp}}
-    {{cc_cmd}}
+{{commands}}
 
 clean:
-    rm -f $(programs)
+\trm -f {{targets}}
 """
         )
-        program_relative_paths = sorted(
-            {f[1].split(os.path.join(profiler_dir, ""))[-1] for f in file_pairs}
-        )
-        logger.info(__name__, f"compiling {len(program_relative_paths)} profiler srcs")
-        programs = " ".join(program_relative_paths)
-        cc_cmd = Target.current().compile_cmd(True).format(target="$@", src="$<")
+
+        # normalize the profiler dir: add / at the end
+        profiler_dir = os.path.join(profiler_dir, "")
+
+        # deduplicate targets from different ops
+        target_to_sources = {}
+        for source, target in file_pairs:
+            if target not in target_to_sources:
+                target_to_sources[target] = set()
+            if isinstance(source, str):
+                target_to_sources[target].add(source)
+            else:
+                target_to_sources[target].update(source)
+
+        # stabilize the order of sources per target
+        target_to_sources = {
+            target: sorted(sources) for target, sources in target_to_sources.items()
+        }
+
+        if self._combine_profiler_multi_sources():
+            num_sources_before = sum(len(s) for s in target_to_sources.values())
+            target_to_sources = self._combine_profiler_sources(
+                target_to_sources=target_to_sources,
+                num_builders=self._n_jobs,
+            )
+            num_sources_after = sum(len(s) for s in target_to_sources.values())
+
+            _LOGGER.info(
+                f"combined {num_sources_before} profiler sources into {num_sources_after}",
+            )
+
+        targets = []
+        dependencies = {}
+        for target, sources in target_to_sources.items():
+            target = target.split(profiler_dir)[-1]
+            if len(sources) == 1:
+                # single-source profiler executable
+                source = next(iter(sources))
+                source = source.split(profiler_dir)[-1]
+                dependencies[target] = [source]
+            else:
+                # multi-source profiler executable
+                objects = []
+                for source in sources:
+                    # first compile the objects
+                    source = source.split(profiler_dir)[-1]
+                    obj = source.replace(".cu", ".obj")
+                    if not os.path.exists(os.path.join(profiler_dir, obj)):
+                        # compile the object only if it is absent
+                        dependencies[obj] = [source]
+                    objects.append(obj)
+                # then link the objects into an executable
+                dependencies[target] = objects
+            targets.append(target)
+
+        commands = []
+        num_compiled_sources = 0
+        num_linked_executables = 0
+        for target, srcs in dependencies.items():
+            # for each "target: srcs" pair,
+            # generate two lines for the Makefile
+            src_list = " ".join(srcs)
+            dep_line = f"{target}: {src_list}"
+            cmd_line = (
+                Target.current()
+                .compile_cmd(executable=(not target.endswith(".obj")))
+                .format(target=target, src=src_list)
+            )
+            if self._do_trace:
+                cmd_line = _augment_for_trace(cmd_line)
+
+            command = f"{dep_line}\n\t{cmd_line}\n"
+            commands.append(command)
+
+            # increment compilation statistics
+            num_compiled_sources += sum(1 for s in srcs if s.endswith(".cu"))
+            num_linked_executables += 0 if target.endswith(".obj") else 1
+
+        _LOGGER.info(f"compiling {num_compiled_sources} profiler sources")
+        _LOGGER.info(f"linking {num_linked_executables} profiler executables")
+
         makefile_str = makefile_template.render(
-            cpp="cu",
-            programs=programs,
-            cc_cmd=cc_cmd,
+            targets=" ".join(set(targets)),
+            commands="\n".join(commands),
         )
 
         dumpfile = os.path.join(profiler_dir, "Makefile")
         with open(dumpfile, "w+") as f:
-            # fix the makefile indentation
-            f.write(re.sub("^    ", "\t", makefile_str, flags=re.M))
+            f.write(makefile_str)
 
     def make_profilers(self, generated_profilers, workdir):
         file_pairs = [f for gp in generated_profilers for f in gp]
@@ -372,7 +724,7 @@ def make_profilers(self, generated_profilers, workdir):
         make_clean_cmd = f" {make_path} {make_flags} clean "
         make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
         cmds = [make_clean_cmd, make_all_cmd]
-        _run_make_cmds(cmds, self._timeout)
+        _run_make_cmds(cmds, self._timeout, build_dir)
 
     def make(self, file_pairs, dll_name, workdir, test_name):
         self.gen_makefile(file_pairs, dll_name, workdir, test_name)
@@ -388,6 +740,6 @@ def make(self, file_pairs, dll_name, workdir, test_name):
         make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
         make_clean_constants_cmd = f" {make_path} {make_flags} clean_constants "
         cmds = [make_clean_cmd, make_all_cmd]
-        if not logger.is_debug():
+        if not is_debug():
             cmds.append(make_clean_constants_cmd)
-        _run_make_cmds(cmds, self._timeout)
+        _run_make_cmds(cmds, self._timeout, build_dir)
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 6ad72b854..1436cd887 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -23,6 +23,7 @@
 from __future__ import annotations
 
 import io
+import logging
 import os
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -32,7 +33,7 @@
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 from aitemplate.compiler.transform.memory_planning import Workspace
-from aitemplate.utils import logger
+from aitemplate.utils.debug_settings import AITDebugSettings
 
 from ..compiler.base import IntImm, IntVar, IntVarTensor, Tensor
 from . import registry
@@ -40,6 +41,9 @@
 
 # pylint: disable=C0103,W0613,C0301
 
+
+_LOGGER = logging.getLogger(__name__)
+
 DTYPE_TO_POINTERTYPE: Dict[str, str] = {
     "float32": "float*",
     "float": "float*",
@@ -105,7 +109,7 @@ def gen_function_src(
                 with open(src_path, "w") as fo:
                     fo.write(func.gen_function())
                 exist_func.add(fname)
-    logger.info(__name__, f"generated {len(file_pairs)} function srcs")
+    _LOGGER.info(f"generated {len(file_pairs)} function srcs")
     return file_pairs
 
 
@@ -261,8 +265,7 @@ def __init__(
         num_outputs: int,
         constants_data_file: io.BytesIO,
         output_name_to_idx: Dict[str, int],
-        check_all_nan_and_inf: bool = False,
-        check_all_outputs: bool = False,
+        debug_settings: AITDebugSettings,
     ):
         self.target = Target.current()
         self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl")
@@ -275,6 +278,7 @@ def __init__(
         self.tensor_slice = []
         self.tensor_map_set = []
         self.set_inputs = []
+        self.func_name_seq = []
         self.func_seq = []
         self.tensor_decl = []
         self.dim_decl = []
@@ -313,8 +317,7 @@ def __init__(
             num_outputs,
         )
 
-        self.check_all_nan_and_inf = check_all_nan_and_inf
-        self.check_all_outputs = check_all_outputs
+        self.debug_settings = debug_settings
 
         # This records whether or not we should debug header.
         self.debug_header = False
@@ -543,7 +546,11 @@ def _process_src_ops(self, node: Tensor) -> None:
             # We use original_name here because it's unique.
             if func._attrs["original_name"] not in self.visited_func:
                 self.visited_func.add(func._attrs["original_name"])
-                self.func_seq.append(f_func_call(func._attrs, indent="    "))
+                seq = f_func_call(func._attrs, indent="    ")
+                if self.debug_settings.gen_profiler_annotation:
+                    seq = f'  {{\n  RAII_ProfilerRange _raiiOpProfilerRange("{func._attrs["outputs"][0]._attrs["name"]}");\n{seq}\n  }}'
+                self.func_name_seq.append(func._attrs["original_name"])
+                self.func_seq.append(seq)
             if "int_state_flag" in func._attrs:
                 if func._attrs["name"] not in self.state_record:
                     self.function_state.append(
@@ -552,15 +559,20 @@ def _process_src_ops(self, node: Tensor) -> None:
                     self.state_record.add(func._attrs["name"])
             self._process_dims_for_op(func)
 
-        if self.check_all_nan_and_inf or node._attrs.get("check_nan_and_inf", False):
+        if self.debug_settings.check_all_nan_and_inf or node._attrs.get(
+            "check_nan_and_inf", False
+        ):
             self._append_check_nan_and_inf(node)
-        if self.check_all_outputs or node._attrs.get("check_outputs", False):
+        if self.debug_settings.check_all_outputs or node._attrs.get(
+            "check_outputs", False
+        ):
             self._append_check_outputs(node)
 
     def _append_check_nan_and_inf(self, node: Tensor):
         self.debug_header = True
         tensor_name = node._attrs["name"]
         elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
+        self.func_name_seq.append("nan_and_inf_check")
         self.func_seq.append(
             f'    InvokeInfAndNanChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
         )
@@ -569,6 +581,7 @@ def _append_check_outputs(self, node: Tensor):
         self.debug_header = True
         tensor_name = node._attrs["name"]
         elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
+        self.func_name_seq.append("output_check")
         self.func_seq.append(
             f'    InvokeOutputsChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
         )
@@ -579,7 +592,13 @@ def append_tensor(self, node: Tensor) -> None:
         name = node._attrs["name"]
         dtype = node._attrs["dtype"]
         if isinstance(node, IntVarTensor):
-            self.tensor_decl.append(self.f_var_decl(name=name))
+            int_var = node._attrs["int_var"]
+            if isinstance(int_var, IntImm):
+                self.tensor_decl.append(
+                    self.f_var_decl(name=name, value=int_var._attrs["values"][0])
+                )
+            else:
+                self.tensor_decl.append(self.f_var_decl(name=name))
         else:
             self.tensor_decl.append(self.f_ptr_decl(name=name, dtype=dtype))
 
@@ -632,6 +651,7 @@ def generate_source(self) -> Dict[str, str]:
         # are not supported
         target_has_graph_mode = "true" if self.target.name() == "cuda" else "false"
 
+        func_pair_seq = zip(self.func_name_seq, self.func_seq)
         model_def = MODEL_TEMPLATE.render(
             function_decl="\n".join(self.func_decl),
             device_functions_header=device_functions_header_name,
@@ -642,20 +662,26 @@ def generate_source(self) -> Dict[str, str]:
             device_to_device_copies="\n".join(self.device_to_device_copies),
             set_up_param_dynamic_shapes="\n".join(self.set_up_param_dynamic_shapes),
             function_seq=self.func_seq,
+            function_pair_seq=func_pair_seq,
             tensor_decl="\n".join(self.tensor_decl),
             dim_decl="\n".join(self.dim_decl),
             function_state="\n".join(self.function_state),
             target_has_graph_mode=target_has_graph_mode,
             unique_workspace_size=self.workspace.unique_size,
             debug_header=self.debug_header,
+            blob_size=self.max_blob_size,
+            workspace_size=self.workspace.total_size(),
+            num_inputs=self.num_inputs,
+            num_outputs=self.num_outputs,
+            param_size=self.max_constant_blob_size,
+            num_unbound_constants=self.unbound_constant_idx,
+            profiler_annotation=self.debug_settings.gen_profiler_annotation,
         )
 
         result["model-generated.h"] = model_def
 
         model_container_src_fname = f"model_container_base{self.target.src_extension()}"
         model_container_base_src = MODEL_CONTAINER_TEMPLATE.render(
-            blob_size=self.max_blob_size,
-            workspace_size=self.workspace.total_size(),
             num_inputs=self.num_inputs,
             num_outputs=self.num_outputs,
             param_size=self.max_constant_blob_size,
@@ -697,6 +723,9 @@ def _construct_output_name_to_index_map(
     return result
 
 
+_DEBUG_SETTINGS = AITDebugSettings()
+
+
 def gen_library_src(  # noqa: C901
     sorted_graph: list[Tensor],
     max_blob_size: int,
@@ -705,8 +734,7 @@ def gen_library_src(  # noqa: C901
     workdir: str,
     output_tensors: List[Tensor],
     model_name: str = "",
-    check_all_nan_and_inf: bool = False,
-    check_all_outputs: bool = False,
+    debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
 ) -> list[Tuple[str, str]]:
     """Generate model driver source code files for the given graph
 
@@ -723,6 +751,8 @@ def gen_library_src(  # noqa: C901
         Target directory for generated C++ source code files
     model_name : str, optional
         Sub working directory in the workdir for the given model, by default ""
+    debug_settings : AITDebugSettings
+        specify debug settings such as where to dump AITemplate model Python file, etc.
 
     Returns
     -------
@@ -751,8 +781,7 @@ def to_obj_name(name: str):
         num_outputs,
         constants_data_file,
         output_name_to_index,
-        check_all_nan_and_inf,
-        check_all_outputs,
+        debug_settings,
     )
     for node in sorted_graph:
         model_container_generator.append_tensor(node)
@@ -772,5 +801,5 @@ def to_obj_name(name: str):
     for fname in sources:
         to_build.append((fname, to_obj_name(fname)))
 
-    logger.info(__name__, f"generated {len(to_build)} library srcs")
+    _LOGGER.info(f"generated {len(to_build)} library srcs")
     return to_build
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index 001afe0ac..acaa1b899 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from ...compiler.ops.tensor import concatenate
+
 from . import tensor_accessor_codegen
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
@@ -194,6 +196,7 @@
 
 enum class LoadVecType {
   VT_HALF = 0,
+  VT_BFLOAT16,
   VT_FLOAT,
   VT_FLOAT2,
   VT_FLOAT4
@@ -214,8 +217,12 @@
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-  HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
-
+  if constexpr (std::is_same_v<ELEM_T, half>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  }
+  if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
+  }
 #undef HANDLE_ONE_VEC_TYPE
   throw std::runtime_error(
       "Cannot resolve LoadVecType."
@@ -265,7 +272,7 @@
   dim3 grid_config = dim3(static_cast<unsigned>(num_blocks_x), NumInputs);
 
 #define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type)                        \\
-    case load_vec_type: {                                                   \\
+    if (min_vec_type == load_vec_type) {                                    \\
       if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) {             \\
          throw std::runtime_error(                                          \\
            std::string("No valid kernel available for ") + #vec_type);      \\
@@ -278,19 +285,21 @@
             concat_dim,                                                     \\
             output_meta.output_strides[concat_dim]);                        \\
       LAUNCH_CHECK_CAT();                                                   \\
-      break;                                                                \\
+      return;                                                               \\
     }
 
-  switch (min_vec_type) {
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
-    default:
-      throw std::runtime_error("Invalid LoadVecType\\n");
-  }
+    if constexpr (std::is_same_v<ELEM_T, half>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+    }
+    if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
+    }
 
 #undef HANDLE_ONE_VEC_TYPE
+  throw std::runtime_error("Invalid LoadVecType\\n");
 }
 
 #undef CHECK_ERROR_CAT
@@ -614,6 +623,7 @@ def gen_function(
     """
     inputs = func_attrs["inputs"]
     original_inputs = func_attrs["original_inputs"]
+    concatenate.check_rank(original_inputs, func_attrs["concat_dim"])
     orig_x = original_inputs[0]
     y = func_attrs["outputs"][0]
     x_shape = orig_x._attrs["shape"]
@@ -725,6 +735,7 @@ def gen_function_call(
         f'{len(inputs)}, {len(input_accessors)}, op: {func_attrs["name"]}'
     )
     original_inputs = func_attrs["original_inputs"]
+    concatenate.check_rank(original_inputs, func_attrs["concat_dim"])
     orig_x = original_inputs[0]
     y = func_attrs["outputs"][0]
     concat_dim = func_attrs["concat_dim"]
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 546763505..632f2ef4c 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -16,6 +16,7 @@
 Backend-agnostic functions for elementwise codegen.
 """
 
+import math
 from dataclasses import dataclass
 from typing import Any, Dict, List, Tuple
 
@@ -246,16 +247,21 @@ def gen_function_single_thread(
                     else param
                 )
             elif arg.is_a_const_num():
+                arg_str = ""
+                if math.isinf(arg._attrs["value"]):
+                    arg_str = "CUDART_INF_F"
+                else:
+                    arg_str = str(arg._attrs["value"])
                 if func_op_t[-1] == "2":
                     params.append(
                         "{}({},{})".format(
                             func_op_t,
-                            str(arg._attrs["value"]),
-                            str(arg._attrs["value"]),
+                            arg_str,
+                            arg_str,
                         )
                     )
                 else:
-                    params.append("{}({})".format(func_op_t, str(arg._attrs["value"])))
+                    params.append("{}({})".format(func_op_t, arg_str))
             else:
                 raise RuntimeError(
                     "Cannot generate expression for node {}, ops: {}".format(
@@ -312,7 +318,7 @@ def _get_sub_func_metadata(
                 break
     if len(candidate_op_types) == 0:
         raise RuntimeError(
-            "Cannot find a common rocm data type! candidate_op_types: {}, op_t: {}.".format(
+            "Cannot find a common backend data type! candidate_op_types: {}, op_t: {}.".format(
                 candidate_op_types, op_t
             )
         )
@@ -391,13 +397,15 @@ def _get_types_and_sizes(
         else:
             min_num_elements = min(min_num_elements, num_elements_for_alignments)
     alignment = tensor_accessor_codegen.find_max_alignment(
-        min_num_elements, output_accessors
+        min_num_elements, dtype, output_accessors
     )
     # Note that we use the same alignment for accessing inputs and outputs, although
     # they may have different alignment requirements. We may lose perf a little bit,
     # but reduce the complexity of our jinja template. We can do some perf
     # experiments later to determine if we want to chase more perf gains.
-    alignment = tensor_accessor_codegen.find_max_alignment(alignment, input_accessors)
+    alignment = tensor_accessor_codegen.find_max_alignment(
+        alignment, dtype, input_accessors
+    )
     return alignment, input_broadcast_sizes, dtype
 
 
@@ -423,12 +431,8 @@ def _parse_func_metadata(
     alignment, input_broadcast_sizes, dtype = _get_types_and_sizes(
         inputs, input_accessors, output_accessors, backend_spec
     )
-    read_type = backend_spec.get_backend_type(
-        alignment, dtype, backend_spec.read_num_elements_to_backend_type
-    )
-    op_type = backend_spec.get_backend_type(
-        alignment, dtype, backend_spec.op_num_elements_to_backend_type
-    )
+    read_type = backend_spec.get_elementwise_read_backend_type(alignment, dtype)
+    op_type = backend_spec.get_elementwise_op_backend_type(alignment, dtype)
     data_type = backend_spec.dtype_to_backend_type(dtype)
     sub_func_metadata, op_type = _get_sub_func_metadata(
         ops, data_type, op_type, backend_spec
@@ -464,7 +468,7 @@ def _gen_int_var_product_str(
             raise RuntimeError(
                 "A dim must be an IntVar! Current type: {}".format(type(int_var))
             )
-    return " * ".join(res)
+    return " * ".join(res) if res else "1"
 
 
 def _gen_input_broadcast_calculator_str(
diff --git a/python/aitemplate/backend/common/split_common.py b/python/aitemplate/backend/common/split_common.py
index a1dbaa930..44c0d040c 100644
--- a/python/aitemplate/backend/common/split_common.py
+++ b/python/aitemplate/backend/common/split_common.py
@@ -22,9 +22,11 @@
 void {{func_name}}(
     void *[] /*outputs*/,
     {{index_type}} **[] /*output_shapes*/,
+    const bool [] /*output_masks*/,
     const void * /*input*/,
     const {{index_type}} * /*input_shape*/,
-    {{index_type}} /*num_splits*/,
+    {{index_type}} /*real_num_splits*/,
+    {{index_type}} /*all_num_splits*/,
     {{index_type}} [] /*split_sizes*/,
     {{index_type}} /*split_dim*/,
     {{index_type}} /*rank*/,
@@ -185,7 +187,9 @@
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-  HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  if constexpr (std::is_same_v<ELEM_T, half>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  }
 
 #undef HANDLE_ONE_VEC_TYPE
   throw std::runtime_error(
@@ -198,9 +202,11 @@
 void split_kernel_launcher(
     void *outputs[],
     {{index_type}} *output_shapes[],
+    const bool output_masks[],
     const void *input,
     const {{index_type}} *input_shape,
     const {{index_type}} split_dim,
+    const {{index_type}} split_sizes[],
     {{prefix}}Stream_t stream
 ) {
 
@@ -215,13 +221,19 @@
 
   OutputMetaData<ELEM_T, NumSplits> output_meta;
   {{index_type}} offset = 0;
+  {{index_type}} split_sizes_idx = 0;
   LoadVecType min_vec_type = LoadVecType::VT_FLOAT4;
   for ({{index_type}} i = 0; i < NumSplits; i++) {
+    while (!output_masks[split_sizes_idx]) {
+      offset += split_sizes[split_sizes_idx];
+      split_sizes_idx++;
+    }
     output_meta.outputs[i] = static_cast<ELEM_T*>(outputs[i]);
     output_meta.split_dim_offsets[i] = offset;
     output_meta.split_dim_sizes[i] = output_shapes[i][split_dim];
     output_meta.num_elems[i] = get_num_elems(output_shapes[i], Rank);
     offset += output_meta.split_dim_sizes[i];
+    split_sizes_idx++;
     LoadVecType vec_type =
         get_vec_type<ELEM_T>(output_shapes[i], Rank, split_dim);
     min_vec_type = vec_type < min_vec_type ? vec_type : min_vec_type;
@@ -239,7 +251,7 @@
   dim3 grid_config = dim3(num_blocks_x, NumSplits);
 
 #define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type)                   \\
-    case load_vec_type: {                                              \\
+    if (min_vec_type == load_vec_type) {                               \\
       if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) {        \\
          throw std::runtime_error(                                     \\
            std::string("No valid kernel available for ") + #vec_type); \\
@@ -252,18 +264,17 @@
             split_dim,                                                 \\
             input_meta.input_strides[split_dim]);                      \\
       LAUNCH_CHECK_SPLIT();                                            \\
-      break;                                                           \\
+      return;                                                          \\
     }
 
-  switch (min_vec_type) {
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
-    default:
-      throw std::runtime_error("Invalid LoadVecType\\n");
-  }
+    if constexpr (std::is_same_v<ELEM_T, half>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+    }
 
+  throw std::runtime_error("Invalid LoadVecType\\n");
 #undef HANDLE_ONE_VEC_TYPE
 }
 
@@ -276,29 +287,30 @@
 
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
-{{indent}}if (rank == {{rank}} && num_splits == {{num_splits}}) {
-{% for split_idx in range(num_splits) %}
-{{indent}}  {{index_type}} local_shape{{split_idx}}[{{rank}}];
+{{indent}}if (rank == {{rank}} && real_num_splits == {{real_num_splits}}) {
+{% for split_idx in split_indices %}
+{% set outer_loop = loop %}
+{{indent}}  {{index_type}} local_shape{{outer_loop.index0}}[{{rank}}];
 {% for rank_idx in range(rank) %}
-{{indent}}  local_shape{{split_idx}}[{{rank_idx}}] = input_shape[{{rank_idx}}];
+{{indent}}  local_shape{{outer_loop.index0}}[{{rank_idx}}] = input_shape[{{rank_idx}}];
 {% endfor %}
-{{indent}}  local_shape{{split_idx}}[split_dim] = split_sizes[{{split_idx}}];
+{{indent}}  local_shape{{outer_loop.index0}}[split_dim] = split_sizes[{{split_idx}}];
 
 {% endfor %}
 
-{{indent}}  {{index_type}}* local_output_shapes[{{num_splits}}] = {
-{% for idx in range(num_splits - 1) %}
+{{indent}}  {{index_type}}* local_output_shapes[{{real_num_splits}}] = {
+{% for idx in range(real_num_splits - 1) %}
 {{indent}}    local_shape{{idx}},
 {% endfor %}
-{{indent}}    local_shape{{num_splits - 1}}
+{{indent}}    local_shape{{real_num_splits - 1}}
 {{indent}}  };
 {{indent}}  /* TODO: more profiling on ElemsPerThread and ThreadsPerBlock */
 {{indent}}  split_kernel_launcher<{{elem_type}},
 {{indent}}                        {{rank}}/*Rank*/,
-{{indent}}                        {{num_splits}}/*NumSplits*/,
+{{indent}}                        {{real_num_splits}}/*NumSplits*/,
 {{indent}}                        {{elems_per_thread}}/*ElemsPerThread*/,
 {{indent}}                        {{threads_per_block}}/*THREADS_PER_BLOCK*/>(
-{{indent}}      outputs, local_output_shapes, input, input_shape, split_dim, stream);
+{{indent}}      outputs, local_output_shapes, output_masks, input, input_shape, split_dim, split_sizes, stream);
 {{indent}}  return;
 {{indent}}}
 """
@@ -311,9 +323,11 @@
 void {{func_name}}(
     void* outputs[],
     {{index_type}} **output_shapes[],
+    const bool output_masks[],
     const void* input,
     const {{index_type}} *input_shape,
-    {{index_type}} num_splits,
+    {{index_type}} real_num_splits,
+    {{index_type}} all_num_splits,
     {{index_type}} split_sizes[],
     {{index_type}} split_dim,
     {{index_type}} rank,
@@ -326,23 +340,28 @@
   if (split_dim >= rank) {
     throw std::runtime_error("cat_dim must be smaller than rank!");
   }
-  if (num_splits < 1) {
+  if (real_num_splits < 1) {
     throw std::runtime_error("the number of splits must be larger than 0!");
   }
 
   // now we update the shape for each output
-  for ({{index_type}} i = 0; i < num_splits; i++) {
-    {{index_type}} **shape_ptr = output_shapes[i];
+  {{index_type}} real_idx = 0;
+  for ({{index_type}} i = 0; i < all_num_splits; i++) {
+    if (!output_masks[i]) {
+      continue;
+    }
+    {{index_type}} **shape_ptr = output_shapes[real_idx];
     for ({{index_type}} dim_idx = 0; dim_idx < rank; dim_idx++) {
       *(shape_ptr[dim_idx]) = input_shape[dim_idx];
     }
     // update dim size for the split axis
     *(shape_ptr[split_dim]) = split_sizes[i];
+    real_idx++;
   }
 
   {{index_type}} split_dim_size = input_shape[split_dim];
   {{index_type}} sum_of_split_sizes = 0;
-  for ({{index_type}} i = 0; i < num_splits; i++) {
+  for ({{index_type}} i = 0; i < all_num_splits; i++) {
     sum_of_split_sizes += split_sizes[i];
   }
   if (split_dim_size != sum_of_split_sizes) {
@@ -361,7 +380,7 @@
   if (!input) {
     throw std::runtime_error("input is NULL!");
   }
-  for (int i = 0; i < num_splits; i++) {
+  for (int i = 0; i < real_num_splits; i++) {
     if (!outputs[i]) {
       throw std::runtime_error("NULL output found at: " + std::to_string(i));
     }
@@ -370,7 +389,7 @@
 {{exec_paths}}
 
   throw std::runtime_error(
-      "Unsupported cat kernel specialization!"
+      "Unsupported split kernel specialization!"
   );
 }
 """
@@ -408,12 +427,18 @@
 {{indent}}    {{split_sizes}}
 {{indent}}  };
 
+{{indent}}  bool output_masks[] = {
+{{indent}}    {{output_masks}}
+{{indent}}  };
+
 {{indent}}  {{func_name}}(
 {{indent}}      outputs,
 {{indent}}      output_shapes,
+{{indent}}      output_masks,
 {{indent}}      {{input_ptr}},
 {{indent}}      {{input_name}}_shape,
-{{indent}}      {{num_splits}}/*num_splits*/,
+{{indent}}      {{real_num_splits}}/*real_num_splits*/,
+{{indent}}      {{all_num_splits}}/*all_num_splits*/,
 {{indent}}      split_sizes,
 {{indent}}      {{split_dim}}/*split_dim*/,
 {{indent}}      {{rank}}/*rank*/,
@@ -469,12 +494,15 @@ def gen_function(func_attrs, backend_spec):
     if input_type != output_type:
         raise NotImplementedError("input type must equal to output type")
 
+    split_indices = [idx for idx, mask in enumerate(func_attrs["output_masks"]) if mask]
+
     # TODO: consider to add profiling paths for tuning
     # elems_per_thread and threads_per_block
     exec_paths = EXEC_COND_TEMPLATE.render(
         indent="  ",
         rank=len(x_shape),
-        num_splits=len(func_attrs["split_sizes"]),
+        real_num_splits=len(func_attrs["outputs"]),
+        split_indices=split_indices,
         elem_type=input_type,
         elems_per_thread=128,
         threads_per_block=128,
@@ -513,7 +541,7 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
     x = func_attrs["inputs"][0]
     outputs = func_attrs["outputs"]
     split_dim = func_attrs["split_dim"]
-    num_splits = len(func_attrs["split_sizes"])
+    num_splits = len(func_attrs["outputs"])
 
     output_names = ",\n      ".join([i._attrs["name"] for i in outputs])
 
@@ -539,16 +567,23 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
 
     split_sizes = ", ".join([str(i) for i in func_attrs["split_sizes"]])
 
+    output_masks_str = ", ".join(
+        ["true" if mask is True else "false" for mask in func_attrs["output_masks"]]
+    )
+
     return FUNC_CALL_TEMPLATE.render(
         indent=indent,
         outputs=output_names,
         output_shape_defs="".join(output_shape_defs),
         output_shapes=", ".join(output_shape_names),
+        output_masks=output_masks_str,
         input_dims=x_dims,
         func_name=func_attrs["name"],
         input_name=x._attrs["name"],
         input_ptr=x._attrs["name"],
         split_dim=split_dim,
+        real_num_splits=len(func_attrs["outputs"]),
+        all_num_splits=len(func_attrs["output_masks"]),
         rank=len(x._attrs["shape"]),
         num_splits=num_splits,
         split_sizes=split_sizes,
diff --git a/python/aitemplate/backend/common/tensor/argmax_common.py b/python/aitemplate/backend/common/tensor/argmax_common.py
index 67c3d4b94..acdffd824 100644
--- a/python/aitemplate/backend/common/tensor/argmax_common.py
+++ b/python/aitemplate/backend/common/tensor/argmax_common.py
@@ -42,7 +42,7 @@
 {{func_signature}}
 {
 
-    argmax_launcher<half>(stream, elem_cnt, instance_size, instance_num, input, workspace, output);
+    argmax_launcher<{{dtype}}>(stream, elem_cnt, instance_size, instance_num, input, workspace, output);
 }
     """
 )
@@ -247,8 +247,8 @@ class MultiplyFunctor final {
   int instance_num = std::stoi(argv[2]);
 
   float runtime_ms = 0;
-  int32_t key_value_out_bytes = GetAlignedSize(instance_num * sizeof({{cub}}::KeyValuePair<int32_t, half>));
-  size_t temp_storage_bytes = InferTempStorageForArgMax<half>(instance_num, instance_size);
+  int32_t key_value_out_bytes = GetAlignedSize(instance_num * sizeof({{cub}}::KeyValuePair<int32_t, {{dtype}}>));
+  size_t temp_storage_bytes = InferTempStorageForArgMax<{{dtype}}>(instance_num, instance_size);
   GLOBAL_WORKSPACE_SIZE  =  GetAlignedSize(key_value_out_bytes + temp_storage_bytes);
 
   std::cout << "TIME:" << runtime_ms << std::endl;
@@ -260,7 +260,7 @@ class MultiplyFunctor final {
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(int64_t* output,
-                   const half* input,
+                   const {{dtype}}* input,
                    const {{index_type}} elem_cnt,
                    const {{index_type}} instance_size,
                    const {{index_type}} instance_num,
@@ -308,14 +308,21 @@ def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) ->
     index_type = backend_spec.index_type
     prefix = backend_spec.prefix
     cub = backend_spec.cub
+
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
     return FUNC_TEMPLATE.render(
         header_files=header_files,
         func_signature=FUNC_SIGNATURE.render(
-            func_name=func_attrs["name"], index_type=index_type, prefix=prefix
+            func_name=func_attrs["name"],
+            index_type=index_type,
+            prefix=prefix,
+            dtype=dtype,
         ),
         kernel=KERNEL_TEMPLATE.render(
             cub=cub, index_type=index_type, prefix=prefix, is_hipcub=(cub == "hipcub")
         ),
+        dtype=dtype,
     )
 
 
@@ -334,11 +341,14 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     str
         Rendered function decl.
     """
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
     return FUNC_DECL.render(
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"],
             index_type=backend_spec.index_type,
             prefix=backend_spec.prefix,
+            dtype=dtype,
         ),
     ).strip()
 
@@ -364,11 +374,14 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     assert len(func_attrs["outputs"]) == 1
     assert len(func_attrs["inputs"]) == 1
 
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
     output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
         name=func_attrs["outputs"][0]._attrs["name"]
     )
-    input_name = backend_spec.cast_to_half_ptr_template.render(
-        name=func_attrs["inputs"][0]._attrs["name"]
+    input_name = backend_spec.cast_to_ptr_template.render(
+        name=func_attrs["inputs"][0]._attrs["name"],
+        dtype=dtype,
     )
 
     x = func_attrs["inputs"][0]
@@ -435,15 +448,22 @@ def gen_profiler(
     index_type = backend_spec.index_type
     prefix = backend_spec.prefix
     cub = backend_spec.cub
+
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
     code = PROFILER_TEMPLATE.render(
         header_files=header_files,
         func_signature=FUNC_SIGNATURE.render(
-            func_name=func_attrs["name"], index_type=index_type, prefix=prefix
+            func_name=func_attrs["name"],
+            index_type=index_type,
+            prefix=prefix,
+            dtype=dtype,
         ),
         kernel=KERNEL_TEMPLATE.render(
             cub=cub, index_type=index_type, prefix=prefix, is_hipcub=(cub == "hipcub")
         ),
         cub=cub,
+        dtype=dtype,
     )
     op_name = func_attrs["op"]
     add_profiler(file_pairs, workdir, op_type, op_name, code)
diff --git a/python/aitemplate/backend/common/tensor/permute0213_common.py b/python/aitemplate/backend/common/tensor/permute0213_common.py
new file mode 100644
index 000000000..7dceb9cd9
--- /dev/null
+++ b/python/aitemplate/backend/common/tensor/permute0213_common.py
@@ -0,0 +1,445 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Common implementations for all backends for permute0213.
+
+This implementation is based on the permute102 implementation in
+permute102_common.py. The difference is that, in this implementation,
+the permute102 logic is applied to each slice along the batch
+dimension of the 4d input tensor. To this end, the batch dimension
+is added as a blockIdx.z for the tiled kernel launch and encoded
+in the blockIdx.z for the direct kernel launch. The input and output
+pointers are shifted accordingly in the kernel code.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+# pylint: disable=C0301,W0613,W0612
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  const void* /* input */,
+  void* /* output */,
+  int64_t /* x_dim0 */,
+  int64_t /* x_dim1 */,
+  int64_t /* x_dim2 */,
+  int64_t /* x_dim3 */,
+  {{prefix}}Stream_t /* stream */
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{x_dim0}},
+{{indent}}    {{x_dim1}},
+{{indent}}    {{x_dim2}},
+{{indent}}    {{x_dim3}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{% if dtype == "half" %}
+{{indent}}if (x_dim3 % 8 == 0) {
+{{indent}}  permute0213_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 8,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim3 % 4 == 0) {
+{{indent}}  permute0213_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim3 % 2 == 0) {
+{{indent}}  permute0213_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute0213_launcher<half>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+{% elif dtype == "float" %}
+{{indent}}if (x_dim3 % 4 == 0) {
+{{indent}}  permute0213_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim3 % 2 == 0) {
+{{indent}}  permute0213_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute0213_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+{% endif %}
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+#define TILE_SIZE 32
+#define ITEMS_PER_THREAD 4
+#define DIRECT_BLOCK_Y 4
+#define DIRECT_BLOCK_Z 2
+
+namespace {
+
+template<typename T>
+__global__ void permute0213_tiled_kernel(T* output,
+                                         const T *input,
+                                         const int M,
+                                         const int N,
+                                         const int D,
+                                         const int n) {
+  __shared__ T shbuf[TILE_SIZE * TILE_SIZE];
+
+  const int nD = n * D;
+  const int ND = N * D;
+  const int MD = M * D;
+  const int bxn = blockIdx.x * n;
+  const int DT = D * TILE_SIZE;
+  int x, y, i, tid, threadIdxY;
+
+  int offset = blockIdx.z * M * N * D;
+  input += offset;
+  output += offset;
+
+  if (threadIdx.x < nD) {
+    x = blockIdx.x * nD + threadIdx.x;
+    if (x < ND) {
+      threadIdxY = threadIdx.y;
+      if ((blockIdx.y + 1) * TILE_SIZE <= M) {
+        #pragma unroll
+        for (i = 0; i < ITEMS_PER_THREAD; ++i) {
+          y = blockIdx.y * TILE_SIZE + threadIdxY;
+          shbuf[threadIdxY * TILE_SIZE + (D * threadIdxY + threadIdx.x) % TILE_SIZE] =
+            input[y * ND + x];
+          threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        }
+      } else {
+        #pragma unroll
+        for (i = 0; i < ITEMS_PER_THREAD; ++i) {
+          y = blockIdx.y * TILE_SIZE + threadIdxY;
+          if (y >= M) break;
+          shbuf[threadIdxY * TILE_SIZE + (D * threadIdxY + threadIdx.x) % TILE_SIZE] =
+            input[y * ND + x];
+          threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  threadIdxY = threadIdx.y;
+  if ((blockIdx.x + 1) * n <= N) {
+    if ((blockIdx.y + 1) * TILE_SIZE * D <= MD) {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        output[(bxn + y) * MD + blockIdx.y * DT + x] =
+          shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    } else {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (blockIdx.y * DT + x < MD) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    }
+  } else {
+    if ((blockIdx.y + 1) * TILE_SIZE * D <= MD) {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (bxn + y < N) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    } else {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (bxn + y < N && blockIdx.y * DT + x < MD) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void permute0213_direct_kernel(T* output,
+                                          const T *input,
+                                          const int M,
+                                          const int N,
+                                          const int D,
+                                          const int m) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (x < D && y < N) {
+    int zi = blockIdx.z % m;
+
+    int offset = (blockIdx.z / m) * M * N * D;
+    input += offset;
+    output += offset;
+
+    int bound = min(M, (zi + 1) * TILE_SIZE);
+    for (int z = zi * TILE_SIZE + threadIdx.z; z < bound; z += DIRECT_BLOCK_Z) {
+      output[y * M * D + z * D + x] = input[z * N * D + y * D + x];
+    }
+  }
+}
+
+template <typename T>
+void permute0213_launcher(const void* in_ptr,
+                          void* out_ptr,
+                          int x_dim0,
+                          int x_dim1,
+                          int x_dim2,
+                          int x_dim3,
+                          {{prefix}}Stream_t stream) {
+  const int B = x_dim0;
+  const int M = x_dim1;
+  const int N = x_dim2;
+  const int D = x_dim3;
+
+  if (D <= 16) {
+    // each warp reads n x d coalesced items of input
+    const int d = min(TILE_SIZE, D);
+    const int n = TILE_SIZE / d;
+
+    dim3 grid((N + n - 1) / n, (M + TILE_SIZE - 1) / TILE_SIZE, B);
+    dim3 block(TILE_SIZE, TILE_SIZE / ITEMS_PER_THREAD);
+
+    permute0213_tiled_kernel<T><<<grid, block, 0, stream>>>(
+      static_cast<T*>(out_ptr),
+      static_cast<const T*>(in_ptr),
+      M,
+      N,
+      D,
+      n
+    );
+  } else {
+    const int m = ((M + TILE_SIZE - 1) / TILE_SIZE);
+
+    dim3 grid((D + 31) / 32, (N + DIRECT_BLOCK_Y - 1) / DIRECT_BLOCK_Y, B * m);
+    dim3 block(32, DIRECT_BLOCK_Y, DIRECT_BLOCK_Z);  // x = 32, the warp size
+
+    permute0213_direct_kernel<T><<<grid, block, 0, stream>>>(
+      static_cast<T*>(out_ptr),
+      static_cast<const T*>(in_ptr),
+      M,
+      N,
+      D,
+      m
+    );
+  }
+}
+} // namespace
+
+void {{function_name}} (
+    const void* in_ptr,
+    void* out_ptr,
+    int64_t x_dim0,
+    int64_t x_dim1,
+    int64_t x_dim2,
+    int64_t x_dim3,
+    {{prefix}}Stream_t stream
+) {
+  if (!in_ptr) {
+    throw std::runtime_error("in_ptr is NULL!");
+  }
+  if (!out_ptr) {
+    throw std::runtime_error("out_ptr is NULL!");
+  }
+  {{exec_paths}}
+}
+
+"""
+)
+
+
+def gen_function(
+    func_attrs: Dict[str, Any],
+    template_path: str,
+    header_files: str,
+    backend_spec,
+) -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Attributes from Operator
+    template_path : str
+        path to library used
+    header_files : str
+        header files included in the function
+    backend_spec : class
+        specifies backend configs
+
+    Returns
+    -------
+    str
+        Source code for function generated.
+    """
+    func_name = func_attrs["name"]
+    x = func_attrs["inputs"][0]
+    xdtype = x._attrs["dtype"]
+    exec_paths = EXEC_TEMPLATE.render(
+        indent="  ",
+        dtype=backend_spec.dtype_to_backend_type(xdtype),
+    )
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        exec_paths=exec_paths,
+        header_files=header_files,
+        prefix=backend_spec.prefix,
+    )
+
+
+def gen_function_decl(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+) -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    backend_spec : class
+        specifies backend configs
+
+    Returns
+    -------
+    str
+        Function declaration
+    """
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        prefix=backend_spec.prefix,
+    )
+
+
+def gen_function_call(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+    indent="  ",
+) -> str:
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    backend_spec : class
+        specifies backend configs
+    indent : str, optional
+        Indentation for function call template, by default "  "
+
+    Returns
+    -------
+    str
+        Driver code for invoking call
+    """
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        out_ptr=y._attrs["name"],
+        x_dim0=xshape[0]._attrs["name"],
+        x_dim1=xshape[1]._attrs["name"],
+        x_dim2=xshape[2]._attrs["name"],
+        x_dim3=xshape[3]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/common/tensor/permute021_common.py b/python/aitemplate/backend/common/tensor/permute021_common.py
index 30ab97b80..7850e443c 100644
--- a/python/aitemplate/backend/common/tensor/permute021_common.py
+++ b/python/aitemplate/backend/common/tensor/permute021_common.py
@@ -17,6 +17,8 @@
 
 For three dimension input, shift the second and the third dimension.
 i.e. Output[d0, d2, d1] = Input[d0, d1, d2]
+For higher-rank input, treat the first n-2 dims as a single flat dim.
+i.e. Output[d0, ..., dn-3, dn-1, dn-2] = Input[d0, ..., dn-3, dn-2, dn-1]
 
 """
 from typing import Any, Dict
@@ -28,14 +30,10 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  const void* /*input*/,
+  const void* /* input */,
   void* /* output */,
-  int64_t* /* x_dim0 */,
-  int64_t* /* x_dim1 */,
-  int64_t* /* x_dim2 */,
-  int64_t* /* y_dim0 */,
-  int64_t* /* y_dim1 */,
-  int64_t* /* y_dim2 */,
+  int64_t /* rank */,
+  const int64_t* /* x_dims */,
   {{prefix}}Stream_t /* stream */
 );
 """
@@ -43,17 +41,16 @@
 
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    {{x_dim0}},
-{{indent}}    {{x_dim1}},
-{{indent}}    {{x_dim2}},
-{{indent}}    {{y_dim0}},
-{{indent}}    {{y_dim1}},
-{{indent}}    {{y_dim2}},
-{{indent}}    stream
-{{indent}});
+{{indent}}{
+{{indent}}  const int64_t x_dims[] = {{x_dims}};
+{{indent}}  {{func_name}}(
+{{indent}}      {{in_ptr}},
+{{indent}}      {{out_ptr}},
+{{indent}}      {{rank}},
+{{indent}}      x_dims,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
 """
 )
 
@@ -63,9 +60,8 @@
 {{indent}}permute021_launcher(
 {{indent}}    in_ptr,
 {{indent}}    out_ptr,
-{{indent}}    *x_dim0,
-{{indent}}    *x_dim1,
-{{indent}}    *x_dim2,
+{{indent}}    rank,
+{{indent}}    x_dims,
 {{indent}}    stream
 {{indent}});
 {{indent}}return;
@@ -76,43 +72,50 @@
     """
 {{header_files}}
 
+#include <limits>
+
+#define TILE_SIZE 32
+#define CH_K 4
+
 namespace {
 template <typename T>
-__global__ void nhwc_to_nchw_kernel(T *output,
-                                    const T *input,
-                                    const int n,
-                                    const int h,
-                                    const int w,
-                                    const int c) {
-
-  const int hw = h*w;
-  const int hwc = hw*c;
-  __shared__ T shbuf[32 * (32 + 1)];
-  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
-  const int32_t wid  = tid / 32;
-  const int32_t lid  = tid % 32;
+__global__ void permute021_kernel(T *output,
+                                  const T *input,
+                                  const int64_t n,
+                                  const int32_t h,
+                                  const int32_t w,
+                                  const int32_t c) {
+
+  const int32_t hw = h * w;
+  const int32_t hwc = hw * c;
+
+  __shared__ T shbuf[TILE_SIZE * (TILE_SIZE + 1)];
+
+  const int32_t tid  = threadIdx.y * blockDim.x + threadIdx.x;
+  const int32_t wid  = tid / TILE_SIZE;
+  const int32_t lid  = tid % TILE_SIZE;
   const int32_t ni   = blockIdx.z;
-  const int32_t hwi0  = blockIdx.y * 32;
-  const int32_t ci0 = blockIdx.x * 32;
+  const int32_t hwi0 = blockIdx.y * TILE_SIZE;
+  const int32_t ci0  = blockIdx.x * TILE_SIZE;
 
   const size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0;
   const T *A = input + input_idx;
   if (ci0 + lid < c) {
-    const int lid_x_33 = lid * 33;
-    if ((hwi0 + 32) <= hw) {
+    const int lid_x_33 = lid * (TILE_SIZE + 1);
+    if ((hwi0 + TILE_SIZE) <= hw) {
       int hwi = wid;  // between 0 and 7
       #pragma unroll
-      for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) {
+      for (int cLoopIdx = 0; cLoopIdx < CH_K; cLoopIdx++) {
         shbuf[lid_x_33 + hwi] = A[lid];
-        A                     = &A[8 * c];
-        hwi += 8;
+        A                     = &A[TILE_SIZE / CH_K * c];
+        hwi += TILE_SIZE / CH_K;
       }
     } else {
-      for (int hwi = wid; hwi < 32; hwi += 8) {
-        if ((hwi + hwi0) < hw) {
+      for (int hwi = wid; hwi < TILE_SIZE; hwi += TILE_SIZE / CH_K) {
+        if (hwi + hwi0 < hw) {
           shbuf[lid_x_33 + hwi] = A[lid];
         }
-        A = &A[8 * c];
+        A = &A[TILE_SIZE / CH_K * c];
       }
     }
   }
@@ -121,17 +124,17 @@
   const int32_t hwiOut = hwi0 + lid;
   output = &output[ni * hwc + hwiOut];
   if (hwiOut < hw) {
-    if (ci0 + 32 < c) {
+    if (ci0 + TILE_SIZE < c) {
       int cI = wid;
       #pragma unroll
-      for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) {
-        output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
-        cI += 8;
+      for (int hwLoopIdx = 0; hwLoopIdx < CH_K; ++hwLoopIdx) {
+        output[(ci0 + cI) * hw] = shbuf[cI * (TILE_SIZE + 1) + lid];
+        cI += TILE_SIZE / CH_K;
       }
     } else {
-      for (int cI = wid; cI < 32; cI += 8) {
+      for (int cI = wid; cI < TILE_SIZE; cI += TILE_SIZE / CH_K) {
         if (ci0 + cI < c) {
-          output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
+          output[(ci0 + cI) * hw] = shbuf[cI * (TILE_SIZE + 1) + lid];
         }
       }
     }
@@ -140,17 +143,32 @@
 
 void permute021_launcher(const void* in_ptr,
                          void* out_ptr,
-                         int x_dim0,
-                         int x_dim1,
-                         int x_dim2,
+                         int64_t rank,
+                         const int64_t* x_dims,
                          {{prefix}}Stream_t stream) {
-  const int n = x_dim0;
-  const int h = 1;
-  const int w = x_dim1;
-  const int c = x_dim2;
-  dim3 grid((c + 31)/32, (h*w + 31)/32, n);
-  dim3 block(32, 8);
-  nhwc_to_nchw_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
+  int64_t x_dim0 = 1;
+  for (int i = 0; i < rank - 2; i++) {
+    x_dim0 *= x_dims[i];
+  }
+
+  if (x_dims[rank-2] > std::numeric_limits<int32_t>::max()) {
+    throw std::runtime_error("The second last dim does not fit into int32_t.");
+  }
+  if (x_dims[rank-1] > std::numeric_limits<int32_t>::max()) {
+    throw std::runtime_error("The last dim does not fit into int32_t.");
+  }
+
+  // given the above checks, we know it's safe
+  const int32_t x_dim1 = x_dims[rank-2];
+  const int32_t x_dim2 = x_dims[rank-1];
+
+  const int64_t n = x_dim0;
+  const int32_t h = 1;
+  const int32_t w = x_dim1;
+  const int32_t c = x_dim2;
+  dim3 grid((c + TILE_SIZE - 1) / TILE_SIZE, (h * w + TILE_SIZE - 1) / TILE_SIZE, n);
+  dim3 block(TILE_SIZE, TILE_SIZE / CH_K);
+  permute021_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
     static_cast<{{lib_dtype}}*>(out_ptr),
     static_cast<const {{lib_dtype}}*>(in_ptr),
     n,
@@ -164,24 +182,18 @@
 void {{function_name}} (
     const void* in_ptr,
     void* out_ptr,
-    int64_t* x_dim0,
-    int64_t* x_dim1,
-    int64_t* x_dim2,
-    int64_t* y_dim0,
-    int64_t* y_dim1,
-    int64_t* y_dim2,
+    int64_t rank,
+    const int64_t* x_dims,
     {{prefix}}Stream_t stream
 ) {
   if (!in_ptr) {
     throw std::runtime_error("in_ptr is NULL!");
   }
   if (!out_ptr) {
-    throw std::runtime_error("in_ptr is NULL!");
+    throw std::runtime_error("out_ptr is NULL!");
   }
-  {{shape_function}}
   {{exec_paths}}
 }
-
 """
 )
 
@@ -189,8 +201,6 @@
 def gen_function(
     func_attrs: Dict[str, Any],
     template_path: str,
-    shape_eval_template,
-    shape_save_template,
     header_files: str,
     backend_spec,
 ) -> str:
@@ -201,8 +211,6 @@ def gen_function(
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
     header_files : str
         header files included in the function
     backend_spec : class
@@ -213,36 +221,23 @@ def gen_function(
     str
         Source code for function generated.
     """
-
     func_name = func_attrs["name"]
     x = func_attrs["inputs"][0]
     xdtype = x._attrs["dtype"]
-    shape_eval_func = shape_eval_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        x_dim0="*x_dim0",
-        x_dim1="*x_dim1",
-        x_dim2="*x_dim2",
-    )
-    shape_save_func = shape_save_template.render(
-        indent="  ",
-        y_dim0="*y_dim0",
-        y_dim1="*y_dim1",
-        y_dim2="*y_dim2",
-    )
-    shape_func = shape_eval_func + shape_save_func
     exec_paths = EXEC_TEMPLATE.render()
     return SRC_TEMPLATE.render(
         function_name=func_name,
-        header_files=header_files,
-        shape_function=shape_func,
         exec_paths=exec_paths,
+        header_files=header_files,
         lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
         prefix=backend_spec.prefix,
     )
 
 
-def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
+def gen_function_decl(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+) -> str:
     """
     Parameters
     ----------
@@ -256,7 +251,6 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     str
         Function declaration
     """
-
     func_name = func_attrs["name"]
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
@@ -264,7 +258,11 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     )
 
 
-def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") -> str:
+def gen_function_call(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+    indent="  ",
+) -> str:
     """
     Parameters
     ----------
@@ -280,20 +278,17 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     str
         Driver code for invoking call
     """
-
     x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
     y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
+
+    xshape = x._attrs["shape"]
+    x_dims = [dim._attrs["name"] for dim in xshape]
+
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         in_ptr=x._attrs["name"],
         out_ptr=y._attrs["name"],
-        x_dim0="&" + xshape[0]._attrs["name"],
-        x_dim1="&" + xshape[1]._attrs["name"],
-        x_dim2="&" + xshape[2]._attrs["name"],
-        y_dim0="&" + yshape[0]._attrs["name"],
-        y_dim1="&" + yshape[1]._attrs["name"],
-        y_dim2="&" + yshape[2]._attrs["name"],
+        x_dims=("{" + ", ".join(x_dims) + "}"),
+        rank=len(xshape),
         indent=indent,
     )
diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py
index 7c367ed8a..ba8200abd 100644
--- a/python/aitemplate/backend/common/tensor/permute102_common.py
+++ b/python/aitemplate/backend/common/tensor/permute102_common.py
@@ -18,14 +18,23 @@
 For three dimension input, shift the first and the second dimension.
 i.e. Output[d1, d0, d2] = Input[d0, d1, d2]
 
-This is a naive modification over cutlass nhwc to nchw op:
-https://github.com/NVIDIA/cutlass/blob/master/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
-At implementation, it creates d1/32 x d2/32 x d0 blocks, each with 32 x 8 threads,
-and each thread processes 4 elements.
-
-We change the write stage of this cutlass permute op for d1 & d0.
-It might not be the most effecient version as applying different dimension on threads
-may relate to cache's performance.
+After determining the largest movable vectorized type fitting into d2,
+the implementation is based on two different kernels invoked depending
+on the number of items of that type d2 consists of:
+
+1. If the number is <= 16, the extension of the SMEM-tile approach (as
+used in permute021) is used to maintain coalesced reads from and writes
+to the global memory, with the SMEM layout for avoiding bank conflicts
+on store and load. This approach assumes that the last dimension can be
+fully covered with a single warp, hence can only work with the number
+being <= 32.
+
+2. If the number is > 16, the direct approach is used for copying
+d2-sized blocks along the last dimension directly from the input to the
+output global memory. This trivially corresponds to coalesced read and
+write of the whole d2-sized block. The cutoff of > 16 is chosen, as
+starting from 17 items, the approach #1 corresponds to the same data
+movement, just through the SMEM and with more index computation.
 """
 from typing import Any, Dict
 
@@ -38,13 +47,10 @@
 void {{func_name}}(
   const void* /* input */,
   void* /* output */,
-  int64_t* /* x_dim0 */,
-  int64_t* /* x_dim1 */,
-  int64_t* /* x_dim2 */,
-  int64_t* /* y_dim0 */,
-  int64_t* /* y_dim1 */,
-  int64_t* /* y_dim2 */,
-  {{prefix}}Stream_t
+  int64_t /* x_dim0 */,
+  int64_t /* x_dim1 */,
+  int64_t /* x_dim2 */,
+  {{prefix}}Stream_t /* stream */
 );
 """
 )
@@ -57,9 +63,6 @@
 {{indent}}    {{x_dim0}},
 {{indent}}    {{x_dim1}},
 {{indent}}    {{x_dim2}},
-{{indent}}    {{y_dim0}},
-{{indent}}    {{y_dim1}},
-{{indent}}    {{y_dim2}},
 {{indent}}    stream
 {{indent}});
 """
@@ -68,14 +71,74 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}permute102_launcher(
-{{indent}}    in_ptr,
-{{indent}}    out_ptr,
-{{indent}}    *x_dim0,
-{{indent}}    *x_dim1,
-{{indent}}    *x_dim2,
-{{indent}}    stream
-{{indent}});
+{% if dtype == "half" %}
+{{indent}}if (x_dim2 % 8 == 0) {
+{{indent}}  permute102_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 8,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim2 % 4 == 0) {
+{{indent}}  permute102_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim2 % 2 == 0) {
+{{indent}}  permute102_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute102_launcher<half>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+{% elif dtype == "float" %}
+{{indent}}if (x_dim2 % 4 == 0) {
+{{indent}}  permute102_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim2 % 2 == 0) {
+{{indent}}  permute102_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute102_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+{% endif %}
 {{indent}}return;
 """
 )
@@ -85,111 +148,186 @@
 {{header_files}}
 
 #define TILE_SIZE 32
-#define CH_K 4
+#define ITEMS_PER_THREAD 4
+#define DIRECT_BLOCK_Y 4
+#define DIRECT_BLOCK_Z 2
 
 namespace {
-template <typename T>
-__global__ void nhwc_to_nchw_kernel(T *output,
-                                    const T *input,
-                                    const int n,
-                                    const int h,
-                                    const int w,
-                                    const int c) {
-
-  const int hw = h*w;
-  const int hwc = hw*c;
-  __shared__ T shbuf[TILE_SIZE * (TILE_SIZE + 1)];
-  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
-  const int32_t wid  = tid / TILE_SIZE;//th.y:0-7
-  const int32_t lid  = tid % TILE_SIZE;//th.x:0-31
-  const int32_t ni0   = blockIdx.z;
-  const int32_t hwi0  = blockIdx.y * TILE_SIZE;//parallel 8*seq 4
-  const int32_t ci0 = blockIdx.x * TILE_SIZE;//parallel 32
-  const size_t input_idx = ni0 * hwc + (hwi0 + wid) * c + ci0;
-  const T *A = input + input_idx;
-  if (ci0 + lid < c) {
-    const int lid_x_33 = lid * (TILE_SIZE + 1);
-    if ((hwi0 + TILE_SIZE - TILE_SIZE / CH_K) <= hw) {
-      int hwi = wid;  // between 0 and 7
-      #pragma unroll
-      for (int cLoopIdx = 0; cLoopIdx < CH_K; cLoopIdx++) {
-        shbuf[lid_x_33 + hwi] = A[lid];
-        A                     = &A[TILE_SIZE / CH_K * c];//because c is distributed on threads y
-        hwi += TILE_SIZE / CH_K;
-      }
-    } else {
-      for (int hwi = wid; hwi < TILE_SIZE; hwi += TILE_SIZE / CH_K) {
-        if ((hwi + hwi0) < hw) {
-          shbuf[lid_x_33 + hwi] = A[lid];
+
+template<typename T>
+__global__ void permute102_tiled_kernel(T* output,
+                                        const T *input,
+                                        const int M,
+                                        const int N,
+                                        const int D,
+                                        const int n) {
+  __shared__ T shbuf[TILE_SIZE * TILE_SIZE];
+
+  const int nD = n * D;
+  const int ND = N * D;
+  const int MD = M * D;
+  const int bxn = blockIdx.x * n;
+  const int DT = D * TILE_SIZE;
+  int x, y, i, tid, threadIdxY;
+
+  if (threadIdx.x < nD) {
+    x = blockIdx.x * nD + threadIdx.x;
+    if (x < ND) {
+      threadIdxY = threadIdx.y;
+      if ((blockIdx.y + 1) * TILE_SIZE <= M) {
+        #pragma unroll
+        for (i = 0; i < ITEMS_PER_THREAD; ++i) {
+          y = blockIdx.y * TILE_SIZE + threadIdxY;
+          shbuf[threadIdxY * TILE_SIZE + (D * threadIdxY + threadIdx.x) % TILE_SIZE] =
+            input[y * ND + x];
+          threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        }
+      } else {
+        #pragma unroll
+        for (i = 0; i < ITEMS_PER_THREAD; ++i) {
+          y = blockIdx.y * TILE_SIZE + threadIdxY;
+          if (y >= M) break;
+          shbuf[threadIdxY * TILE_SIZE + (D * threadIdxY + threadIdx.x) % TILE_SIZE] =
+            input[y * ND + x];
+          threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
         }
-        A = &A[TILE_SIZE / CH_K * c];
       }
     }
   }
+
   __syncthreads();
 
-  const int32_t hwiOut = hwi0 + lid;
-  const int nc = n*c;
-  output = &output[hwiOut*nc];
-  if(hwiOut < hw){
-    if(ci0 + TILE_SIZE < c){
-      int cI = wid;
+  threadIdxY = threadIdx.y;
+  if ((blockIdx.x + 1) * n <= N) {
+    if ((blockIdx.y + 1) * TILE_SIZE * D <= MD) {
       #pragma unroll
-      for(int hwLoopIdx = 0; hwLoopIdx < CH_K; ++hwLoopIdx){
-          output[ni0*c + ci0 + cI] = shbuf[(cI)* (TILE_SIZE + 1) + lid];
-          cI += TILE_SIZE / CH_K;
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        output[(bxn + y) * MD + blockIdx.y * DT + x] =
+          shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
       }
     } else {
-      for(int cI = wid; cI < TILE_SIZE; cI += TILE_SIZE / CH_K){
-        if(ci0+cI<c){
-          output[ni0*c+ci0+cI] = shbuf[(cI)* (TILE_SIZE + 1) + lid];
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (blockIdx.y * DT + x < MD) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    }
+  } else {
+    if ((blockIdx.y + 1) * TILE_SIZE * D <= MD) {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (bxn + y < N) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
+        }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
+      }
+    } else {
+      #pragma unroll
+      for (i = 0; i < ITEMS_PER_THREAD; i++) {
+        tid = threadIdxY * TILE_SIZE + threadIdx.x;
+        x = tid % DT;
+        y = tid / DT;
+        if (bxn + y < N && blockIdx.y * DT + x < MD) {
+          output[(bxn + y) * MD + blockIdx.y * DT + x] =
+            shbuf[(x / D) * TILE_SIZE + (D * y + x) % TILE_SIZE];
         }
+        threadIdxY += TILE_SIZE / ITEMS_PER_THREAD;
+        if (threadIdxY >= nD) break;
       }
     }
   }
 }
 
+template <typename T>
+__global__ void permute102_direct_kernel(T* output,
+                                         const T *input,
+                                         const int M,
+                                         const int N,
+                                         const int D) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (x < D && y < N) {
+    int bound = min(M, (blockIdx.z + 1) * TILE_SIZE);
+    for (int z = blockIdx.z * TILE_SIZE + threadIdx.z; z < bound; z += DIRECT_BLOCK_Z) {
+      output[y * M * D + z * D + x] = input[z * N * D + y * D + x];
+    }
+  }
+}
+
+template <typename T>
 void permute102_launcher(const void* in_ptr,
                          void* out_ptr,
                          int x_dim0,
                          int x_dim1,
                          int x_dim2,
                          {{prefix}}Stream_t stream) {
-  const int n = x_dim0;
-  const int h = 1;
-  const int w = x_dim1;
-  const int c = x_dim2;
-  dim3 grid((c + TILE_SIZE - 1)/TILE_SIZE, (h*w + TILE_SIZE -1)/TILE_SIZE, n);
-  dim3 block(TILE_SIZE, TILE_SIZE / CH_K);
-  nhwc_to_nchw_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
-    static_cast<{{lib_dtype}}*>(out_ptr),
-    static_cast<const {{lib_dtype}}*>(in_ptr),
-    n,
-    h,
-    w,
-    c
-  );
+  const int M = x_dim0;
+  const int N = x_dim1;
+  const int D = x_dim2;
+
+  if (D <= 16) {
+    // each warp reads n x d coalesced items of input
+    const int d = min(TILE_SIZE, D);
+    const int n = TILE_SIZE / d;
+
+    dim3 grid((N + n - 1) / n, (M + TILE_SIZE - 1) / TILE_SIZE);
+    dim3 block(TILE_SIZE, TILE_SIZE / ITEMS_PER_THREAD);
+
+    permute102_tiled_kernel<T><<<grid, block, 0, stream>>>(
+      static_cast<T*>(out_ptr),
+      static_cast<const T*>(in_ptr),
+      M,
+      N,
+      D,
+      n
+    );
+  } else {
+    dim3 grid((D + 31) / 32, (N + DIRECT_BLOCK_Y - 1) / DIRECT_BLOCK_Y, (M + TILE_SIZE - 1) / TILE_SIZE);
+    dim3 block(32, DIRECT_BLOCK_Y, DIRECT_BLOCK_Z);  // x = 32, the warp size
+
+    permute102_direct_kernel<T><<<grid, block, 0, stream>>>(
+      static_cast<T*>(out_ptr),
+      static_cast<const T*>(in_ptr),
+      M,
+      N,
+      D
+    );
+  }
 }
 } // namespace
 
 void {{function_name}} (
     const void* in_ptr,
     void* out_ptr,
-    int64_t* x_dim0,
-    int64_t* x_dim1,
-    int64_t* x_dim2,
-    int64_t* y_dim0,
-    int64_t* y_dim1,
-    int64_t* y_dim2,
+    int64_t x_dim0,
+    int64_t x_dim1,
+    int64_t x_dim2,
     {{prefix}}Stream_t stream
 ) {
   if (!in_ptr) {
     throw std::runtime_error("in_ptr is NULL!");
   }
   if (!out_ptr) {
-    throw std::runtime_error("in_ptr is NULL!");
+    throw std::runtime_error("out_ptr is NULL!");
   }
-  {{shape_function}}
   {{exec_paths}}
 }
 
@@ -200,8 +338,6 @@
 def gen_function(
     func_attrs: Dict[str, Any],
     template_path: str,
-    shape_eval_template,
-    shape_save_template,
     header_files: str,
     backend_spec,
 ) -> str:
@@ -212,8 +348,8 @@ def gen_function(
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
+    header_files : str
+        header files included in the function
     backend_spec : class
         specifies backend configs
 
@@ -225,32 +361,22 @@ def gen_function(
     func_name = func_attrs["name"]
     x = func_attrs["inputs"][0]
     xdtype = x._attrs["dtype"]
-    shape_eval_func = shape_eval_template.render(
+    exec_paths = EXEC_TEMPLATE.render(
         indent="  ",
-        dtype="int64_t ",
-        x_dim0="*x_dim0",
-        x_dim1="*x_dim1",
-        x_dim2="*x_dim2",
+        dtype=backend_spec.dtype_to_backend_type(xdtype),
     )
-    shape_save_func = shape_save_template.render(
-        indent="  ",
-        y_dim0="*y_dim0",
-        y_dim1="*y_dim1",
-        y_dim2="*y_dim2",
-    )
-    shape_func = shape_eval_func + shape_save_func
-    exec_paths = EXEC_TEMPLATE.render()
     return SRC_TEMPLATE.render(
         function_name=func_name,
-        shape_function=shape_func,
         exec_paths=exec_paths,
         header_files=header_files,
-        lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
         prefix=backend_spec.prefix,
     )
 
 
-def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
+def gen_function_decl(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+) -> str:
     """
     Parameters
     ----------
@@ -271,7 +397,11 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     )
 
 
-def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") -> str:
+def gen_function_call(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+    indent="  ",
+) -> str:
     """
     Parameters
     ----------
@@ -290,16 +420,12 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
     y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         in_ptr=x._attrs["name"],
         out_ptr=y._attrs["name"],
-        x_dim0="&" + xshape[0]._attrs["name"],
-        x_dim1="&" + xshape[1]._attrs["name"],
-        x_dim2="&" + xshape[2]._attrs["name"],
-        y_dim0="&" + yshape[0]._attrs["name"],
-        y_dim1="&" + yshape[1]._attrs["name"],
-        y_dim2="&" + yshape[2]._attrs["name"],
+        x_dim0=xshape[0]._attrs["name"],
+        x_dim1=xshape[1]._attrs["name"],
+        x_dim2=xshape[2]._attrs["name"],
         indent=indent,
     )
diff --git a/python/aitemplate/backend/common/tensor/permute210_common.py b/python/aitemplate/backend/common/tensor/permute210_common.py
index 35894b315..2183101ec 100644
--- a/python/aitemplate/backend/common/tensor/permute210_common.py
+++ b/python/aitemplate/backend/common/tensor/permute210_common.py
@@ -37,12 +37,9 @@
 void {{func_name}}(
   const void* /* input */,
   void* /* output */,
-  int64_t* /* x_dim0 */,
-  int64_t* /* x_dim1 */,
-  int64_t* /* x_dim2 */,
-  int64_t* /* y_dim0 */,
-  int64_t* /* y_dim1 */,
-  int64_t* /* y_dim2 */,
+  int64_t /* x_dim0 */,
+  int64_t /* x_dim1 */,
+  int64_t /* x_dim2 */,
   {{prefix}}Stream_t /* stream */
 );
 """
@@ -56,9 +53,6 @@
 {{indent}}    {{x_dim0}},
 {{indent}}    {{x_dim1}},
 {{indent}}    {{x_dim2}},
-{{indent}}    {{y_dim0}},
-{{indent}}    {{y_dim1}},
-{{indent}}    {{y_dim2}},
 {{indent}}    stream
 {{indent}});
 """
@@ -70,9 +64,9 @@
 {{indent}}permute210_launcher(
 {{indent}}    in_ptr,
 {{indent}}    out_ptr,
-{{indent}}    *x_dim0,
-{{indent}}    *x_dim1,
-{{indent}}    *x_dim2,
+{{indent}}    x_dim0,
+{{indent}}    x_dim1,
+{{indent}}    x_dim2,
 {{indent}}    stream
 {{indent}});
 {{indent}}return;
@@ -84,6 +78,7 @@
 {{header_files}}
 
 #define TILE_SIZE 32
+#define CH_K 4
 
 namespace {
 template <typename T>
@@ -95,7 +90,7 @@
   __shared__ T shbuf[TILE_SIZE][TILE_SIZE + 1];
 
   int32_t strides[2] = { c * w, w };
-  int32_t offset = blockIdx.y * strides[1]; // We are slicing through static c.
+  int32_t offset = blockIdx.y * strides[1];  // We are slicing through static c.
 
   int32_t xBlock = blockIdx.x * TILE_SIZE;
   int32_t yBlock = blockIdx.z * TILE_SIZE;
@@ -106,21 +101,21 @@
   const T *A = input + inputIdx;
 
   if (x < w) {
-    if (y + 24 < n) { // This guards (y, y+8, y+16, y+24) are within boundary.
+    if (y + 24 < n) {  // This guards (y, y+8, y+16, y+24) are within boundary.
       int tid = threadIdx.y;
       #pragma unroll
-      for (int loopIdx = 0; loopIdx < 4; loopIdx++) {
+      for (int loopIdx = 0; loopIdx < CH_K; loopIdx++) {
         shbuf[threadIdx.x][tid] = A[threadIdx.x];
-        A                       = &A[8 * strides[0]];
-        tid += 8;
+        A                       = &A[TILE_SIZE / CH_K * strides[0]];
+        tid += TILE_SIZE / CH_K;
       }
     } else {
       #pragma unroll
-      for (int tid = threadIdx.y; tid < 32; tid += 8) {
+      for (int tid = threadIdx.y; tid < TILE_SIZE; tid += TILE_SIZE / CH_K) {
         if (yBlock + tid < n) {
           shbuf[threadIdx.x][tid] = A[threadIdx.x];
         }
-        A = &A[8 * strides[0]];
+        A = &A[TILE_SIZE / CH_K * strides[0]];
       }
     }
   }
@@ -141,18 +136,18 @@
     if (y + 24 < w) {
       int tid = threadIdx.y;
       #pragma unroll
-      for (int loopIdx = 0; loopIdx < 4; loopIdx++) {
+      for (int loopIdx = 0; loopIdx < CH_K; loopIdx++) {
         output[threadIdx.x] = shbuf[tid][threadIdx.x];
-        output              = &output[8 * strides[0]];
-        tid += 8;
+        output              = &output[TILE_SIZE / CH_K * strides[0]];
+        tid += TILE_SIZE / CH_K;
       }
     } else {
       #pragma unroll
-      for (int tid = threadIdx.y; tid < 32; tid += 8) {
+      for (int tid = threadIdx.y; tid < TILE_SIZE; tid += TILE_SIZE / CH_K) {
         if (yBlock + tid < w) {
           output[threadIdx.x] = shbuf[tid][threadIdx.x];
         }
-        output = &output[8 * strides[0]];
+        output = &output[TILE_SIZE / CH_K * strides[0]];
       }
     }
   }
@@ -164,8 +159,8 @@
                          int x_dim1,
                          int x_dim2,
                          {{prefix}}Stream_t stream) {
-  dim3 grid((x_dim2 + (TILE_SIZE-1))/TILE_SIZE, x_dim1, (x_dim0 + (TILE_SIZE-1))/TILE_SIZE);
-  dim3 block(TILE_SIZE, TILE_SIZE/4);
+  dim3 grid((x_dim2 + TILE_SIZE - 1) / TILE_SIZE, x_dim1, (x_dim0 + TILE_SIZE - 1) / TILE_SIZE);
+  dim3 block(TILE_SIZE, TILE_SIZE / CH_K);
   permute210_kernel<{{lib_dtype}}><<<grid, block, 0, stream>>>(
     static_cast<{{lib_dtype}}*>(out_ptr),
     static_cast<const {{lib_dtype}}*>(in_ptr),
@@ -179,19 +174,16 @@
 void {{function_name}} (
     const void* in_ptr,
     void* out_ptr,
-    int64_t* x_dim0,
-    int64_t* x_dim1,
-    int64_t* x_dim2,
-    int64_t* y_dim0,
-    int64_t* y_dim1,
-    int64_t* y_dim2,
+    int64_t x_dim0,
+    int64_t x_dim1,
+    int64_t x_dim2,
     {{prefix}}Stream_t stream
 ) {
   if (!in_ptr) {
     throw std::runtime_error("in_ptr is NULL!");
   }
   if (!out_ptr) {
-    throw std::runtime_error("in_ptr is NULL!");
+    throw std::runtime_error("out_ptr is NULL!");
   }
   {{exec_paths}}
 }
@@ -200,7 +192,11 @@
 )
 
 
-def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str:
+def gen_function(
+    func_attrs: Dict[str, Any],
+    header_files: str,
+    backend_spec,
+) -> str:
     """
     Parameters
     ----------
@@ -229,7 +225,10 @@ def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) ->
     )
 
 
-def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
+def gen_function_decl(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+) -> str:
     """
     Parameters
     ----------
@@ -250,7 +249,11 @@ def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
     )
 
 
-def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") -> str:
+def gen_function_call(
+    func_attrs: Dict[str, Any],
+    backend_spec,
+    indent="  ",
+) -> str:
     """
     Parameters
     ----------
@@ -269,16 +272,12 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
     y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         in_ptr=x._attrs["name"],
         out_ptr=y._attrs["name"],
-        x_dim0="&" + xshape[0]._attrs["name"],
-        x_dim1="&" + xshape[1]._attrs["name"],
-        x_dim2="&" + xshape[2]._attrs["name"],
-        y_dim0="&" + yshape[0]._attrs["name"],
-        y_dim1="&" + yshape[1]._attrs["name"],
-        y_dim2="&" + yshape[2]._attrs["name"],
+        x_dim0=xshape[0]._attrs["name"],
+        x_dim1=xshape[1]._attrs["name"],
+        x_dim2=xshape[2]._attrs["name"],
         indent=indent,
     )
diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py
index f42f213f2..e3c89a09f 100644
--- a/python/aitemplate/backend/common/tensor/slice_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_common.py
@@ -244,6 +244,7 @@
   VT_FLOAT4
 };
 
+
 template <typename ELEM_T>
 static inline LoadVecType get_vec_type(int64_t dim_size) {
   {{index_type}}  size_elem_t = sizeof(ELEM_T);
@@ -259,7 +260,9 @@
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-  HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  if constexpr (std::is_same_v<ELEM_T, half>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  }
 
 #undef HANDLE_ONE_VEC_TYPE
   throw std::runtime_error(
@@ -422,7 +425,7 @@
   dim3 grid_config = dim3(num_blocks_x, NumInputs);
 
 #define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type)                          \\
-    case load_vec_type: {                                                     \\
+    if (min_vec_type == load_vec_type) {                                      \\
       if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) {               \\
          throw std::runtime_error(                                            \\
            std::string("No valid kernel available for ") + #vec_type);        \\
@@ -433,18 +436,17 @@
             slice_meta_data,                                                  \\
             scatter_meta_data);                                               \\
       LAUNCH_CHECK_SLICE();                                                   \\
-      break;                                                                  \\
+      return;                                                                 \\
     }
 
-  switch (min_vec_type) {
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2)
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
-    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
-    default:
-      throw std::runtime_error("Invalid LoadVecType\\n");
-  }
+    if constexpr (std::is_same_v<ELEM_T, half>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+    }
 
+  throw std::runtime_error("Invalid LoadVecType\\n");
 #undef HANDLE_ONE_VEC_TYPE
 }
 
diff --git a/python/aitemplate/backend/common/tensor_accessor_codegen.py b/python/aitemplate/backend/common/tensor_accessor_codegen.py
index 6d6174d27..7e3336f76 100644
--- a/python/aitemplate/backend/common/tensor_accessor_codegen.py
+++ b/python/aitemplate/backend/common/tensor_accessor_codegen.py
@@ -85,29 +85,33 @@ def find_max_alignment_for_accessor(accessor: TensorAccessor) -> int:
     int
         the max alignment value
     """
-    align = alignment.find_max_alignment(accessor.offset)
+    align = alignment.find_max_alignment(accessor.offset, accessor.tensor_dtype)
     if not accessor.is_contiguous:
         align = min(
             align,
             alignment.find_max_alignment(
-                accessor.original_total_elements_from_stride_dim
+                accessor.original_total_elements_from_stride_dim, accessor.tensor_dtype
             ),
         )
         align = min(
             align,
             alignment.find_max_alignment(
-                accessor.actual_total_elements_from_stride_dim
+                accessor.actual_total_elements_from_stride_dim, accessor.tensor_dtype
             ),
         )
     return align
 
 
-def find_max_alignment_for_accessors(accessors: List[TensorAccessor]) -> int:
+def find_max_alignment_for_accessors(
+    dtype: str, accessors: List[TensorAccessor]
+) -> int:
     """the max alignment value that meets the requirement specified by
-       the accessors
+       the accessors and dtype
 
     Parameters
     ----------
+    dtype: str
+        dtype of the tensor for which the accessors are attached
     accessors: List[TensorAccessor]
         TensorAccessor(s) attached to the relevant tensor being accessed
 
@@ -116,14 +120,16 @@ def find_max_alignment_for_accessors(accessors: List[TensorAccessor]) -> int:
     int
         the max alignment value
     """
-    align = max(alignment.ALIGNMENTS)
+    align = max(alignment.get_alignments(dtype))
     # Handle accessors
     for accessor in accessors:
         align = min(align, find_max_alignment_for_accessor(accessor))
     return align
 
 
-def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> int:
+def find_max_alignment(
+    num_elements: int, dtype: str, accessors: List[TensorAccessor]
+) -> int:
     """find the max alignment value that meets the requirement of accessing
        num_elements of data with access patterns (strides and offsets)
        specified by accessors
@@ -132,6 +138,8 @@ def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> in
     ----------
     num_elements: int
         specify the number of elements being accessed
+    dtype: str
+        dtype of the tensor for which the accessors are attached
 
     accessors: List[TensorAccessor]
         TensorAccessor(s) attached to the relevant tensor being accessed
@@ -142,6 +150,6 @@ def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> in
         the max alignment value
     """
     # get initial alignment based on the number of elements being accessed
-    align = alignment.find_max_alignment(num_elements)
-    accessor_alignment = find_max_alignment_for_accessors(accessors)
+    align = alignment.find_max_alignment(num_elements, dtype)
+    accessor_alignment = find_max_alignment_for_accessors(dtype, accessors)
     return min(align, accessor_alignment)
diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py
index 736ee6482..8e8310229 100644
--- a/python/aitemplate/backend/common/upsampling2d_common.py
+++ b/python/aitemplate/backend/common/upsampling2d_common.py
@@ -50,21 +50,27 @@
   for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 {% if mode == "bilinear"%}
-__global__ void bilinear_upsampling_f16_nhwc_kernel(const half2* input,
+__global__ void bilinear_upsampling_nhwc_kernel(const {{dtype}}* input_raw,
                                                     {% if bias_add %}
-                                                      const half2* input_res,
+                                                      const {{dtype}}* input_res_raw,
                                                     {% endif %}
-                                                    half2* output,
+                                                    {{dtype}}* output_raw,
                                                     const {{index_type}} batch,
                                                     const {{index_type}} in_height,
                                                     const {{index_type}} in_width,
                                                     const {{index_type}} channels,
                                                     const {{index_type}} out_height,
                                                     const {{index_type}} out_width) {
+{% set vec_dtype = {"half": "half2", "float": "float2"}[dtype] %}
+  const {{vec_dtype}}* input = (const {{vec_dtype}}*)input_raw;
+{% if bias_add %}
+  const {{vec_dtype}}* input_res = (const {{vec_dtype}}*)input_res_raw;
+{% endif %}
+  {{vec_dtype}}* output = ({{vec_dtype}}*)output_raw;
 
-    const float height_scale = in_height / static_cast<float>(out_height);
-    const float width_scale = in_width / static_cast<float>(out_width);
-    const int64_t num_threads = out_height * out_width * channels * batch;
+  const float height_scale = in_height / static_cast<float>(out_height);
+  const float width_scale = in_width / static_cast<float>(out_width);
+  const int64_t num_threads = out_height * out_width * channels * batch;
 
 GPU_1D_KERNEL_LOOP(out_idx, num_threads) {
     int64_t idx = out_idx;
@@ -87,46 +93,61 @@
         (in_x < in_width - 1) ? ceilf(in_x) : in_width - 1;
     const float x_lerp = in_x - floorf(in_x);
 
-    const half2 top_left = __ldg(
+    const {{vec_dtype}} top_left = __ldg(
         input + ((b * in_height + top_y_index) * in_width + left_x_index) *
                    channels +
                c);
 
-    const half2 top_right = __ldg(
+    const {{vec_dtype}} top_right = __ldg(
         input + ((b * in_height + top_y_index) * in_width + right_x_index) *
                    channels +
                c);
-    const half2 bottom_left = __ldg(
+    const {{vec_dtype}} bottom_left = __ldg(
         input + ((b * in_height + bottom_y_index) * in_width + left_x_index) *
                    channels +
                c);
-    const half2 bottom_right = __ldg(
+    const {{vec_dtype}} bottom_right = __ldg(
         input + ((b * in_height + bottom_y_index) * in_width + right_x_index) *
                    channels +
                c);
 
+{% if dtype == "half" %}
     float top_x = __half2float(top_left{{half2_data_ref}}.x) + (__half2float(top_right{{half2_data_ref}}.x) - __half2float(top_left{{half2_data_ref}}.x)) * x_lerp;
     float top_y = __half2float(top_left{{half2_data_ref}}.y) + (__half2float(top_right{{half2_data_ref}}.y) - __half2float(top_left{{half2_data_ref}}.y)) * x_lerp;
-
     float bottom_x = __half2float(bottom_left{{half2_data_ref}}.x) + (__half2float(bottom_right{{half2_data_ref}}.x) - __half2float(bottom_left{{half2_data_ref}}.x)) * x_lerp;;
     float bottom_y = __half2float(bottom_left{{half2_data_ref}}.y) + (__half2float(bottom_right{{half2_data_ref}}.y) - __half2float(bottom_left{{half2_data_ref}}.y)) * x_lerp;;
+{% elif dtype == "float" %}
+    float top_x = top_left{{half2_data_ref}}.x + (top_right{{half2_data_ref}}.x - top_left{{half2_data_ref}}.x) * x_lerp;
+    float top_y = top_left{{half2_data_ref}}.y + (top_right{{half2_data_ref}}.y - top_left{{half2_data_ref}}.y) * x_lerp;
+    float bottom_x = bottom_left{{half2_data_ref}}.x + (bottom_right{{half2_data_ref}}.x - bottom_left{{half2_data_ref}}.x) * x_lerp;;
+    float bottom_y = bottom_left{{half2_data_ref}}.y + (bottom_right{{half2_data_ref}}.y - bottom_left{{half2_data_ref}}.y) * x_lerp;;
+{% endif %}
 
     float2 out = {0.f, 0.f};
     out.x = top_x + (bottom_x - top_x) * y_lerp;
     out.y = top_y + (bottom_y - top_y) * y_lerp;
 
+{% if dtype == "half" %}
     {% if bias_add %}
       output[out_idx] = __hadd2(__float22half2_rn(out), __ldg(input_res + out_idx));
     {% else %}
       output[out_idx] = __float22half2_rn(out);
     {% endif %}
+{% elif dtype == "float" %}
+    {% if bias_add %}
+      const auto tmp = __ldg(input_res + out_idx);
+      out.x += tmp.x;
+      out.y += tmp.y;
+    {% endif %}
+    output[out_idx] = out;
+{% endif %}
   }
 
 }
 
 {% else %}
 template <typename T, typename Telement, int element_in_Tio>
-__global__ void nearest_upsampling_f16_nhwc_kernel(const T* input,
+__global__ void nearest_upsampling_nhwc_kernel(const T* input,
                                                     {% if bias_add %}
                                                       const T* input_res,
                                                     {% endif %}
@@ -138,9 +159,9 @@
                                                     const {{index_type}} out_height,
                                                     const {{index_type}} out_width) {
 
-    const float height_scale = in_height / static_cast<float>(out_height);
-    const float width_scale = in_width / static_cast<float>(out_width);
-    const int64_t nthreads = out_height * out_width * channels * batch;
+  const float height_scale = in_height / static_cast<float>(out_height);
+  const float width_scale = in_width / static_cast<float>(out_width);
+  const int64_t nthreads = out_height * out_width * channels * batch;
 
 GPU_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
@@ -171,7 +192,7 @@
     {% if tsize == 1 %}
     output[index] = input_val + input_res_val;
 
-    {% elif tsize == 8 %}
+    {% elif tsize == 8 and dtype == "half" %}
     T output_val;
     Telement* pack_y = reinterpret_cast<Telement*>(&output_val);
     Telement* pack_x = reinterpret_cast<Telement*>(&input_val);
@@ -220,16 +241,35 @@
     dim3 block(512);
 
 {% if mode == "bilinear" %}
-    bilinear_upsampling_f16_nhwc_kernel<<<grid, block, 0, stream>>>(
-      (const half2 *)input,
+    bilinear_upsampling_nhwc_kernel<<<grid, block, 0, stream>>>(
+      input,
       {% if bias_add %}
-        (const half2 *)input_res,
+        input_res,
       {% endif %}
-      (half2 *)output,
+      output,
       N, H, W, C/2, HO, WO);
 {% else %}
+  {% if dtype == "float" %}
     {% if tsize == 1 %}
-    nearest_upsampling_f16_nhwc_kernel<half, half, 1><<<grid, block, 0, stream>>>(
+    nearest_upsampling_nhwc_kernel<float, float, 1><<<grid, block, 0, stream>>>(
+      (const float*)input,
+      {% if bias_add %}
+        (const float*)input_res,
+      {% endif %}
+      (float*)output,
+      N, H, W, C, HO, WO);
+    {% else %}
+    nearest_upsampling_nhwc_kernel<float2, float, 2><<<grid, block, 0, stream>>>(
+      (const float2*)input,
+      {% if bias_add %}
+        (const float2*)input_res,
+      {% endif %}
+      (float2*)output,
+      N, H, W, C / 2, HO, WO);
+    {% endif %}
+  {% else %}
+    {% if tsize == 1 %}
+    nearest_upsampling_nhwc_kernel<half, half, 1><<<grid, block, 0, stream>>>(
       (const half *)input,
       {% if bias_add %}
         (const half *)input_res,
@@ -237,7 +277,7 @@
       (half *)output,
       N, H, W, C, HO, WO);
     {% elif tsize == 8 %}
-    nearest_upsampling_f16_nhwc_kernel<float4, half, 8><<<grid, block, 0, stream>>>(
+    nearest_upsampling_nhwc_kernel<float4, half, 8><<<grid, block, 0, stream>>>(
       (const float4 *)input,
       {% if bias_add %}
         (const float4 *)input_res,
@@ -245,7 +285,7 @@
       (float4 *)output,
       N, H, W, C/8, HO, WO);
     {% else %}
-    nearest_upsampling_f16_nhwc_kernel<half2, half, 2><<<grid, block, 0, stream>>>(
+    nearest_upsampling_nhwc_kernel<half2, half, 2><<<grid, block, 0, stream>>>(
       (const half2 *)input,
       {% if bias_add %}
         (const half2 *)input_res,
@@ -253,6 +293,7 @@
       (half2 *)output,
       N, H, W, C/2, HO, WO);
     {% endif %}
+  {% endif %}
 {% endif %}
 }
 } // namespace
diff --git a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
index 19f8bd6cd..5756aa59c 100644
--- a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
@@ -142,10 +142,18 @@
 
   const Trois* roi = rois + 5 * (batch * roiCount + roiIdx);
   float hw;
+
+{% if elem_input_type == "half" %}
   float x1 = __half2float(roi[1]);
   float y1 = __half2float(roi[2]);
   float x2 = __half2float(roi[3]);
   float y2 = __half2float(roi[4]);
+{% elif elem_input_type == "float" %}
+  float x1 = roi[1];
+  float y1 = roi[2];
+  float x2 = roi[3];
+  float y2 = roi[4];
+{% endif %}
 
   y1 = max(0.f, min((float)imageSize.y, y1)) / imageSize.y;
   x1 = max(0.f, min((float)imageSize.x, x1)) / imageSize.x;
@@ -225,7 +233,11 @@
             interpolateBilinear(src, srcDims, ySample, xSample, featureCount);
       }
     }
+{% if elem_output_type == "half" %}
     *out = result / __float2half_rn(samplingCount);
+{% elif elem_output_type == "float" %}
+    *out = result / samplingCount;
+{% endif %}
   }
 }
 
@@ -262,16 +274,16 @@
       roiCount,
       firstThreshold,
       samplingRatio,
-      (const half*)rois,
-      (const half*)P2,
+      reinterpret_cast<const {{elem_input_type}}*>(rois),
+      reinterpret_cast<const {{elem_input_type}}*>(P2),
       P2dims,
-      (const half*)P3,
+      reinterpret_cast<const {{elem_input_type}}*>(P3),
       P3dims,
-      (const half*)P4,
+      reinterpret_cast<const {{elem_input_type}}*>(P4),
       P4dims,
-      (const half*)P5,
+      reinterpret_cast<const {{elem_input_type}}*>(P5),
       P5dims,
-      (half*)output,
+      output,
       {pool_size, pool_size});
 }
 
@@ -388,8 +400,8 @@ def gen_function_decl(func_attrs, backend_spec):
     """
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
     return FUNC_DECL_TEMPLATE.render(
         index_type=backend_spec.index_type,
         prefix=backend_spec.prefix,
@@ -423,8 +435,8 @@ def gen_function_call(func_attrs, backend_spec, indent="  "):
     y = func_attrs["outputs"][0]
     yshape = y._attrs["shape"]
 
-    input_type = backend_spec.dtype_to_lib_type(p2._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(p2._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
diff --git a/python/aitemplate/backend/common/vision_ops/nms_kernel.py b/python/aitemplate/backend/common/vision_ops/nms_kernel.py
index 1eb8a51bd..fba468e28 100644
--- a/python/aitemplate/backend/common/vision_ops/nms_kernel.py
+++ b/python/aitemplate/backend/common/vision_ops/nms_kernel.py
@@ -223,6 +223,18 @@
   return interS / (Sa + Sb - interS);
 }
 
+__device__ __host__ inline float IoU(const Bbox<float>& a, const Bbox<float>& b) {
+  float left = fmaxf(a.xmin, b.xmin), right = fminf(a.xmax, b.xmax);
+  float top = fmaxf(a.ymin, b.ymin), bottom = fminf(a.ymax, b.ymax);
+  float width = fmaxf(right - left + 1.0f, 0.0f);
+  float height = fmaxf(bottom - top + 1.0f, 0.0f);
+  float interS = width * height;
+  float Sa = (a.xmax - a.xmin + 1.0f) * (a.ymax - a.ymin + 1.0f);
+  float Sb = (b.xmax - b.xmin + 1.0f) * (b.ymax - b.ymin + 1.0f);
+
+  return interS / (Sa + Sb - interS);
+}
+
 // NMS KERNEL FOR SMALL BATCH SIZE
 template <typename T_PROPOSALS, typename T_ROIS, int DIM, int TSIZE>
 __global__ __launch_bounds__(DIM) void nmsKernel1(
@@ -419,7 +431,7 @@
   }
 }
 
-// BBFilter KERNEL
+// BBFilter KERNEL half
 __global__ void bboxFilter_kernel(
     int N,
     const float minSize,
@@ -444,6 +456,27 @@
   }
 }
 
+// BBFilter KERNEL float
+__global__ void bboxFilter_kernel(
+    int N,
+    const float minSize,
+    const float* proposals,
+    float* scores) {
+  if (minSize == 0)
+    return;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (tid < N) {
+    int ininf = 0xff800000;
+    float ninf = *(float*)&ininf;
+
+    if (proposals[tid * 4 + 2] - proposals[tid * 4 + 0] < minSize ||
+        proposals[tid * 4 + 3] - proposals[tid * 4 + 1] < minSize) {
+      scores[tid] = ninf;
+    }
+  }
+}
+
 inline size_t GetCudaAlignedSize(size_t size) {
   const size_t kCudaAlignSize = 1 << 20;
   return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize;
@@ -525,7 +558,7 @@ class MultiplyFunctor final {
   vworkspace = alignPtr(vworkspace, ALIGNMENT);
 
   std::size_t tempStorageBytes =
-      InferTempStorageForSortPairsDescending<half, int64_t>(N, R);
+      InferTempStorageForSortPairsDescending<T_ROIS, Bbox<T_ROIS>>(N, R);
 
   CSC({{prefix}}GetLastError(), STATUS_FAILURE);
 
diff --git a/python/aitemplate/backend/common/vision_ops/roi_align_common.py b/python/aitemplate/backend/common/vision_ops/roi_align_common.py
index d7c64d60e..fb65b400a 100644
--- a/python/aitemplate/backend/common/vision_ops/roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/roi_align_common.py
@@ -23,10 +23,10 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}roi_align_launcher<{{library_dtype}}, float, {{num_rois}}, {{pooled_size}}>(
-{{indent}}    static_cast<const {{library_dtype}}*>(in_ptr),
-{{indent}}    static_cast<const {{library_dtype}}*>(rois_ptr),
-{{indent}}    static_cast<{{library_dtype}}*>(out_ptr),
+{{indent}}roi_align_launcher<{{dtype}}, float, {{num_rois}}, {{pooled_size}}>(
+{{indent}}    static_cast<const {{dtype}}*>(in_ptr),
+{{indent}}    static_cast<const {{dtype}}*>(rois_ptr),
+{{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
 {{indent}}    HI,
 {{indent}}    WI,
@@ -46,19 +46,20 @@
 SRC_TEMPLATE = jinja2.Template(
     """
 {{header_files}}
+{% set vec_dtype = {"half": "half2", "float": "float2"}[dtype] %}
 
 namespace {
 #define CUDA_KERNEL_LOOP(i, n) \
   for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 template <typename T>
-__device__ float2 bilinear_interpolate(const half2* bottom_data,
-                                  const int height,
-                                  const int width,
-                                  T y,
-                                  T x,
-                                  const int channels,
-                                  const int index /* index for debug only*/) {
+__device__ float2 bilinear_interpolate(const {{vec_dtype}}* bottom_data,
+                                       const int height,
+                                       const int width,
+                                       T y,
+                                       T x,
+                                       const int channels,
+                                       const int index /* index for debug only*/) {
   // deal with cases that inverse elements are out of feature map boundary
   float2 val = {0.f, 0.f};
   if (y < -1.0 || y > height || x < -1.0 || x > width) {
@@ -87,11 +88,12 @@
   T lx = x - x_low;
   T hy = 1. - ly, hx = 1. - lx;
   // do bilinear interpolation
-  const half2  v1 = __ldg(bottom_data + (y_low * width + x_low) * channels);
-  const half2  v2 = __ldg(bottom_data + (y_low * width + x_high) * channels);
-  const half2  v3 = __ldg(bottom_data + (y_high * width + x_low) * channels);
-  const half2  v4 = __ldg(bottom_data + (y_high * width + x_high) * channels);
+  const {{vec_dtype}}  v1 = __ldg(bottom_data + (y_low * width + x_low) * channels);
+  const {{vec_dtype}}  v2 = __ldg(bottom_data + (y_low * width + x_high) * channels);
+  const {{vec_dtype}}  v3 = __ldg(bottom_data + (y_high * width + x_low) * channels);
+  const {{vec_dtype}}  v4 = __ldg(bottom_data + (y_high * width + x_high) * channels);
 
+{% if dtype == "half" %}
   T v1_x = __half2float(v1{{half2_data_ref}}.x);
   T v2_x = __half2float(v2{{half2_data_ref}}.x);
   T v3_x = __half2float(v3{{half2_data_ref}}.x);
@@ -101,6 +103,17 @@
   T v2_y = __half2float(v2{{half2_data_ref}}.y);
   T v3_y = __half2float(v3{{half2_data_ref}}.y);
   T v4_y = __half2float(v4{{half2_data_ref}}.y);
+{% elif dtype == "float" %}
+  T v1_x = v1{{half2_data_ref}}.x;
+  T v2_x = v2{{half2_data_ref}}.x;
+  T v3_x = v3{{half2_data_ref}}.x;
+  T v4_x = v4{{half2_data_ref}}.x;
+
+  T v1_y = v1{{half2_data_ref}}.y;
+  T v2_y = v2{{half2_data_ref}}.y;
+  T v3_y = v3{{half2_data_ref}}.y;
+  T v4_y = v4{{half2_data_ref}}.y;
+{% endif %}
 
   T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
 
@@ -111,19 +124,21 @@
 }
 
 template <typename T, int64_t num_rois, int pool_size>
-__global__ void roi_align_f16_nhwc_kernel(const half2* bottom_data,
-                                         const half* bottom_rois,
-                                         half2* top_data,
-                                         const int64_t N,
-                                         const int64_t height,
-                                         const int64_t width,
-                                         const int64_t channels,
-                                         const int64_t pooled_height,
-                                         const int64_t pooled_width,
-                                         const int sampling_ratio,
-                                         const float spatial_scale,
-                                         const bool position_sensitive,
-                                         const bool continuous_coordinate) {
+__global__ void roi_align_nhwc_kernel(const {{dtype}}* bottom_data_raw,
+                                      const {{dtype}}* bottom_rois,
+                                      {{dtype}}* top_data_raw,
+                                      const int64_t N,
+                                      const int64_t height,
+                                      const int64_t width,
+                                      const int64_t channels,
+                                      const int64_t pooled_height,
+                                      const int64_t pooled_width,
+                                      const int sampling_ratio,
+                                      const float spatial_scale,
+                                      const bool position_sensitive,
+                                      const bool continuous_coordinate) {
+  const {{vec_dtype}}* bottom_data = reinterpret_cast<const {{vec_dtype}}*>(bottom_data_raw);
+  {{vec_dtype}}* top_data = reinterpret_cast<{{vec_dtype}}*>(top_data_raw);
 
   const int64_t nthreads = num_rois * channels * pooled_width * pooled_height;
 
@@ -139,21 +154,36 @@
     const int n = idx / pooled_height;
 
 
-    const half* offset_bottom_rois = bottom_rois + n * 5;
+    const {{dtype}}* offset_bottom_rois = bottom_rois + n * 5;
+  {% if dtype == "half" %}
     int roi_batch_ind = static_cast<int>(__half2float(offset_bottom_rois[0]));
+  {% elif dtype == "float" %}
+    int roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
+  {% endif %}
 
     float2 output_val = {0.f, 0.f};
     if (roi_batch_ind < 0) {
+  {% if dtype == "half" %}
       top_data[index] = __float22half2_rn(output_val);
+  {% elif dtype == "float" %}
+      top_data[index] = output_val;
+  {% endif %}
       continue;
     }
 
     // Do not using rounding; this implementation detail is critical
     T roi_offset  = continuous_coordinate ? static_cast<T>(0.5) : static_cast<T>(0);
+  {% if dtype == "half" %}
     T roi_start_w = __half2float(offset_bottom_rois[1]) * spatial_scale - roi_offset;
     T roi_start_h = __half2float(offset_bottom_rois[2]) * spatial_scale - roi_offset;
     T roi_end_w   = __half2float(offset_bottom_rois[3]) * spatial_scale - roi_offset;
     T roi_end_h   = __half2float(offset_bottom_rois[4]) * spatial_scale - roi_offset;
+  {% elif dtype == "float" %}
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    T roi_end_w   = offset_bottom_rois[3] * spatial_scale - roi_offset;
+    T roi_end_h   = offset_bottom_rois[4] * spatial_scale - roi_offset;
+  {% endif %}
 
     T roi_width  = roi_end_w - roi_start_w;
     T roi_height = roi_end_h - roi_start_h;
@@ -172,7 +202,7 @@
       channels_unpooled = channels * pooled_height * pooled_width;
     }
 
-    const half2* offset_bottom_data =
+    const {{vec_dtype}}* offset_bottom_data =
            bottom_data + (roi_batch_ind * height * width * channels_unpooled + c_unpooled);
 
     // We use roi_bin_grid to sample the grid and mimic integral
@@ -200,7 +230,11 @@
     output_val.x /= count;
     output_val.y /= count;
 
+  {% if dtype == "half" %}
     top_data[index] = __float22half2_rn(output_val);
+  {% elif dtype == "float" %}
+    top_data[index] = output_val;
+  {% endif %}
   }
 
 }
@@ -212,10 +246,10 @@
 }
 
 
-template <typename LibraryT, typename T, int64_t num_rois, int pool_size>
-void roi_align_launcher(const LibraryT* input,
-                        const LibraryT* rois,
-                        LibraryT* output,
+template <typename ElemT, typename T, int64_t num_rois, int pool_size>
+void roi_align_launcher(const ElemT* input,
+                        const ElemT* rois,
+                        ElemT* output,
                       const {{index_type}} N,
                       const {{index_type}} H,
                       const {{index_type}} W,
@@ -235,8 +269,8 @@
       static_cast<int64_t>(4096)));
   dim3 block(512);
 
-  roi_align_f16_nhwc_kernel<T, num_rois, pool_size><<<grid, block, 0, stream>>>(
-    (const half2*)input, (const half*)rois, (half2*)output, N, H, W, C / 2, HO, WO,
+  roi_align_nhwc_kernel<T, num_rois, pool_size><<<grid, block, 0, stream>>>(
+    input, rois, output, N, H, W, C / 2, HO, WO,
     sampling_ratio, spatial_scale, position_sensitive, continuous_coordinate);
 
 }
diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 3948182d2..6a473d7d9 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -29,6 +29,10 @@
 #include <iostream>
 #include <cuda_fp16.h>
 #include "cutlass/cutlass.h"
+// TODO: this include should be removed. There's a bug in CUTLASS, the
+// header containing cutlass::gemm::warp::WarpSize is not being included.
+// Until the fix is upstreamed, just inject it here instead.
+#include "cutlass/gemm/warp/mma.h"
 #include "kernel_forward.h"
 
 {{func_signature}}
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index c9fb0ef4a..2ea880f08 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -25,8 +25,9 @@
 
 from aitemplate.backend.backend_spec import CUDASpec
 
+from ....utils import alignment
 from ...target import Target
-from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
+from ..gemm_universal.common import add_profiler, build_profiler
 
 
 KERNEL_KEY_TEMPLATE = jinja2.Template(
@@ -35,8 +36,419 @@
 """
 )
 
+INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{config}}
+using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
+"""
+)
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}using ElementComputeEpilogue = typename {{instance_name}}::ElementCompute;
+{{indent}}//  TODO: cast to right dtype
+{{indent}}typename {{instance_name}}::Arguments arguments{
+{{indent}}    problem_size,                                                                 // ConvProblemSize const & problem_size
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},                                  // TensorRefA const & ref_A
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},                              // TensorRefA const & ref_B
+{% if is_bias %}
+{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},  // TensorRefC const & ref_C
+{% elif is_bias_add %}
+{{indent}}    {static_cast<{{dtype}}*>(res_ptr), layout_C},                                 // TensorRefC const & ref_C
+{% else %}
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},                                 // TensorRefC const & ref_C
+{% endif %}
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},                                 // TensorRefC const & ref_D
+{% if is_bias %}
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},                       // typename EpilogueOutputOp::Params const & output_op
+{% elif is_bias_add %}
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},                       // typename EpilogueOutputOp::Params const & output_op
+{{indent}}    cutlass::conv::SplitKMode::kSerial,                                           // SplitKMode const & split_k_mode
+{{indent}}    static_cast<{{dtype}}*>(bias_ptr),                                            // void * ptr_Vector
+{{indent}}    nullptr,                                                                      // void * ptr_Tensor
+{{indent}}    0,                                                                            // typename LayoutC::Stride::Index ldr
+{{indent}}    *out_ch,                                                                      // typename LayoutC::Stride::Index ldt
+{% else %}
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},                       // typename EpilogueOutputOp::Params const & output_op
+{% endif %}
+{{indent}}};
+{{indent}}{{instance_name}} conv_op;
+{% if is_profiler %}
+{{indent}}size_t workspace_size = conv_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE_{{instance_name}} = workspace_size;
+{% endif %}
+{{indent}}auto status = conv_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = conv_op.initialize(arguments, workspace);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = conv_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cstdio>
+#include <stdexcept>
+
+#include "cutlass/cutlass.h"
+{% if is_transpose %}
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+{% elif is_depthwise %}
+#include "cutlass/conv/kernel/default_depthwise_fprop.h"
+{% else %}
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/kernel/default_conv2d_group_fprop.h"
+{% endif %}
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+{{extra_header}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      static char msg[2048];                                                          \\
+      snprintf(msg, sizeof(msg), "[%s] Got cutlass error: %s at: %s",                 \\
+        __FILE__, cutlassGetStatusString(error), __LINE__);                           \\
+      fprintf(stderr, msg);                                                           \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{{functions}}
+"""
+)
+
+FUNCTION_TEMPLATE = jinja2.Template(
+    """
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+    void* out_ptr,
+{% if is_bias %}
+    void* bias_ptr,
+{% elif is_bias_add %}
+    void* bias_ptr,
+    void* res_ptr,
+{% endif %}
+    uint8_t* workspace,
+    int64_t* batch,
+    int64_t* out_ch,
+    int64_t* in_ch,
+    int64_t* kernel_h,
+    int64_t* kernel_w,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_h,
+    int64_t* out_w,
+    int stride,
+    int dilation,
+    int pad,
+    cudaStream_t stream
+  ) {
+
+  {{shape_function}}
+
+  int i32_batch = *batch;
+  int i32_in_h = *in_h;
+  int i32_in_w = *in_w;
+  int i32_in_ch = *in_ch;
+  int i32_out_ch = *out_ch;
+  int i32_kernel_h = *kernel_h;
+  int i32_kernel_w = *kernel_w;
+  int i32_out_batch = *out_batch;
+  int i32_out_h = *out_h;
+  int i32_out_w = *out_w;
+
+  using cutlass::layout::TensorNHWC;
+  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
+{% if is_depthwise%}
+  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, 1)));
+{% elif is_transpose %}
+  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_in_ch, i32_kernel_h, i32_kernel_w, i32_out_ch)));
+{% else %}
+  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
+{% endif %}
+  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+{% if is_transpose %}
+    {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},    // cutlass::Tensor4DCoord input_size
+{% else %}
+    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},           // cutlass::Tensor4DCoord input_size
+{% endif %}
+{% if is_depthwise%}
+    {i32_out_ch, i32_kernel_h, i32_kernel_w, 1},  // cutlass::Tensor4DCoord filter_size
+{% elif is_transpose%}
+    {i32_in_ch, i32_kernel_h, i32_kernel_w, i32_out_ch},  // cutlass::Tensor4DCoord filter_size
+{% else %}
+    {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},  // cutlass::Tensor4DCoord filter_size
+{% endif %}
+    {pad, pad, pad, pad},                                 // cutlass::Tensor4DCoord padding
+    {stride, stride},                                     // cutlass::MatrixCoord stride
+    {dilation, dilation},                                 // cutlass::MatrixCoord dilation
+{% if is_transpose %}
+    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},           // cutlass::Tensor4DCoord output_size
+{% else %}
+    {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},    // cutlass::Tensor4DCoord output_size
+{% endif %}
+    cutlass::conv::Mode::kCrossCorrelation,               // cutlass::conv::Mode mode
+    1                                                     // int split_k_slices
+  );
+
+  {{exec_paths}}
+
+  throw std::runtime_error(
+    "Unsupported workload for this conv2d specialization."
+  );
+}
+"""
+)
+
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  int ret = 0;
+{{indent}}  try {
+{{indent}}    ret = {{func_name}}(
+{{indent}}      &runtime,
+{{indent}}      &workspace_size,
+{{indent}}      {{ni}},
+{{indent}}      {{hi}},
+{{indent}}      {{wi}},
+{{indent}}      {{ci}},
+{{indent}}      {{co}},
+{{indent}}      {{kh}},
+{{indent}}      {{kw}},
+{{indent}}      {{no}},
+{{indent}}      {{ho}},
+{{indent}}      {{wo}},
+{{indent}}      {{stride}},
+{{indent}}      {{dilation}},
+{{indent}}      {{pad}},
+{{indent}}      global_workspace_,
+{{indent}}      stream
+{{indent}}    );
+{{indent}}  } catch (...) {
+{{indent}}    runtime = 0;
+{{indent}}    workspace_size = 0;
+{{indent}}  }
+{{indent}}  if (ret != 0)
+{{indent}}    return ret;
+{{indent}}  std::cout << "OP:{{conv_op_name}},"
+{{indent}}            << "TIME:" << runtime << ","
+{{indent}}            << "WS:" << workspace_size << std::endl;
+{{indent}}}
+"""
+)
+
+BENCHMARK_DECL_TEMPLATE = jinja2.Template(
+    """
+int benchmark_{{function_name}} (
+  float*,
+  size_t*,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int64_t,
+  int,
+  int,
+  int,
+  uint8_t*,
+  cudaStream_t
+);
+"""
+)
+
+BENCHMARK_TEMPLATE = jinja2.Template(
+    """
+int benchmark_{{function_name}} (
+  float* runtime,
+  size_t* workspace_size,
+  int64_t NI,
+  int64_t HI,
+  int64_t WI,
+  int64_t CI,
+  int64_t CO,
+  int64_t KH,
+  int64_t KW,
+  int64_t NO,
+  int64_t HO,
+  int64_t WO,
+  int stride,
+  int dilation,
+  int pad,
+  uint8_t* global_workspace_,
+  cudaStream_t stream
+) {
+  using ElementInputA = typename {{instance_name}}::ElementA;
+  using ElementInputB = typename {{instance_name}}::ElementB;
+  using ElementOutput = typename {{instance_name}}::ElementC;
+
+  cutlass::HostTensor<ElementInputA, typename {{instance_name}}::LayoutA> x({NI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{instance_name}}::LayoutB> w({CO, KH, KW, CI});
+{% if is_bias %}
+  cutlass::HostTensor<ElementInputB, typename {{instance_name}}::LayoutB> b({(int)CO, 1, 1, 1});
+{% elif is_bias_add %}
+  cutlass::HostTensor<ElementInputB, typename {{instance_name}}::LayoutB> b({(int)CO, 1, 1, 1});
+  cutlass::HostTensor<ElementOutput, typename {{instance_name}}::LayoutC> r({NO, HO, WO, CO});
+{% endif %}
+  cutlass::HostTensor<ElementOutput, typename {{instance_name}}::LayoutC> y({NO, HO, WO, CO});
+
+  // warmup
+{{func_call}}
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 5; ++i) {
+{{func_call}}
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  *runtime = runtime_ms;
+  *workspace_size = GLOBAL_WORKSPACE_SIZE_{{instance_name}};
+  return 0;
+}
+"""
+)
+
+PROFILER_BENCHMARK_TEMPLATE = jinja2.Template(
+    """
+static size_t GLOBAL_WORKSPACE_SIZE_{{instance_name}} = 0;
+
+{{op_source}}
+
+{{benchmark}}
+"""
+)
+
+PROFILER_MAIN_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <string>
+
+#include "cutlass/cutlass.h"
+
+{{benchmark_decls}}
+
+int main(int argc, char** argv) {
+  int64_t batch = std::stoi(argv[1]);
+  int64_t in_h = std::stoi(argv[2]);
+  int64_t in_w = std::stoi(argv[3]);
+  int64_t in_ch = std::stoi(argv[4]);
+  int64_t kernel_h = std::stoi(argv[5]);
+  int64_t kernel_w = std::stoi(argv[6]);
+  int64_t out_ch = std::stoi(argv[7]);
+  int stride = std::stoi(argv[8]);
+  int pad = std::stoi(argv[9]);
+  int dilation = std::stoi(argv[10]);
+
+{{shape_func}}
+
+  float runtime = 0;
+  size_t workspace_size = 0;
+  uint8_t* global_workspace_ = nullptr;
+  cudaStream_t stream = nullptr;
+
+{{benchmark_instances}}
+
+  return 0;
+}
+"""
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+{% if is_bias %}
+  void*,
+{% elif is_bias_add %}
+  void*,
+  void*,
+{% endif %}
+  uint8_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int,
+  int,
+  int,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{out_ptr}},
+{% if is_bias %}
+{{indent}}    {{bias_ptr}},
+{% elif is_bias_add %}
+{{indent}}    {{bias_ptr}},
+{{indent}}    {{res_ptr}},
+{% endif %}
+{{indent}}    global_workspace_,
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    {{stride}},
+{{indent}}    {{dilation}},
+{{indent}}    {{pad}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
 
-def kernel_name(op):
+def kernel_name(op, layout=None):
     """generate cuda kernel name"""
     from cutlass_lib import library
 
@@ -45,7 +457,8 @@ def kernel_name(op):
     opcode_class_name = library.OpcodeClassNames[
         op.tile_description.math_instruction.opcode_class
     ]
-    layout = op.layout_name()
+    if layout is None:
+        layout = op.layout_name()
     align_ab = op.A.alignment
     align_c = op.C.alignment
     name = KERNEL_KEY_TEMPLATE.render(
@@ -71,15 +484,26 @@ def emit_instance(op):
     return op_def
 
 
-def extract_config(func_attrs, f_proc_op=None):
+def extract_config(
+    func_attrs,
+    dtype="float16",
+    skip_simt_kernels=False,
+    f_apply_special_config=None,
+    op_kind=None,
+    op_layout=None,
+):
     """Extracts cutlass config for conv kernels."""
     import copy
 
     import cutlass_lib
 
-    def f_proc_op_default(op):
-        # import cutlass_lib
-        ret = []
+    spec = CUDASpec()
+    lib_dtype = spec.dtype_to_lib_type(dtype)
+
+    if lib_dtype == "float":
+        data_type = cutlass_lib.library.DataType.f32
+        acc_type = cutlass_lib.library.DataType.f32
+    else:
         data_type = cutlass_lib.library.DataType.f16
         acc_type = cutlass_lib.library.DataType.f32
         # check target use fp16 acc
@@ -87,44 +511,224 @@ def f_proc_op_default(op):
             if Target.current()._kwargs["use_fp16_acc"]:
                 acc_type = cutlass_lib.library.DataType.f16
 
+    def f_proc_op(op):
+        ret = []
+        if (
+            skip_simt_kernels
+            and op.tile_description.math_instruction.opcode_class
+            == cutlass_lib.library.OpcodeClass.Simt
+        ):
+            return ret
+
         if (
             op.A.element == data_type
             and op.B.element == data_type
             and op.C.element == data_type
             and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
+            and op.tile_description.math_instruction.element_accumulator == acc_type
         ):
-
             op = copy.deepcopy(op)
+
             # set epilogue
             epilogue_name = func_attrs["epilogue"]
             op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
             op.element_epilogue = acc_type
-            # set C alignment
-            for i in [8, 4, 2, 1]:
+
+            # apply special config if required
+            if f_apply_special_config is not None:
+                op = f_apply_special_config(func_attrs, op)
+
+            # set C alignment depending on the dtype
+            for i in alignment.get_alignments(dtype):
                 op = copy.deepcopy(op)
                 op.C.alignment = i
                 ret.append(op)
+
         return ret
 
-    op_kind = cutlass_lib.library.OperationKind.Conv2d
-    conv_kind = cutlass_lib.library.ConvKind.Fprop
-    ret = []
-    conv2d_ops = OrderedDict()
+    if op_kind is None:
+        op_kind = cutlass_lib.library.OperationKind.Conv2d
     extract_ops = list(Target.current()._operators[op_kind].items())
+    conv_kind = cutlass_lib.library.ConvKind.Fprop
 
+    conv_ops = OrderedDict()
     for _, value in extract_ops:
         op = value[0]
         if op.conv_kind == conv_kind:
-            if f_proc_op is None:
-                ret = f_proc_op_default(op)
-            else:
-                ret = f_proc_op(op)
+            ret = f_proc_op(op)
             if len(ret) > 0:
                 for op_inst in ret:
-                    key = kernel_name(op_inst)
-                    conv2d_ops[key] = op_inst
-    return conv2d_ops
+                    key = kernel_name(op_inst, layout=op_layout)
+                    conv_ops[key] = op_inst
+    return conv_ops
+
+
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+    f_emit_instance=emit_instance,
+    is_bias=False,
+    is_bias_add=False,
+    is_transpose=False,
+    is_depthwise=False,
+    extra_header="",
+    instance_name_base="DeviceConvFwdInstance",
+):
+    """Generate profiler sources."""
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    func_call_extra_args = {}
+    if is_bias:
+        func_call_extra_args = {
+            "bias_ptr": "b.device_data()",
+        }
+    elif is_bias_add:
+        func_call_extra_args = {
+            "bias_ptr": "b.device_data()",
+            "res_ptr": "r.device_data()",
+        }
+
+    benchmark_decls = []
+    benchmark_instances = []
+    profiler_benchmarks = {}
+
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
+        config = f_emit_instance(op)
+        config_name = extract_config_name(config)
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        function_name = f"{op_type}_{op_name}"
+
+        exec_program = EXEC_TEMPLATE.render(
+            indent="  ",
+            is_profiler=True,
+            is_bias=is_bias,
+            is_bias_add=is_bias_add,
+            instance_name=instance_name,
+            dtype=dtype,
+        )
+        instance = INSTANCE_TEMPLATE.render(
+            config_name=config_name,
+            name=instance_name,
+            config=config,
+        )
+        function = FUNCTION_TEMPLATE.render(
+            is_bias=is_bias,
+            is_bias_add=is_bias_add,
+            is_transpose=is_transpose,
+            is_depthwise=is_depthwise,
+            function_name=function_name,
+            shape_function="",
+            exec_paths=exec_program,
+        )
+        op_source = SRC_TEMPLATE.render(
+            is_transpose=is_transpose,
+            is_depthwise=is_depthwise,
+            extra_header=extra_header,
+            instances=instance,
+            functions=function,
+        )
+
+        func_call = FUNC_CALL_TEMPLATE.render(
+            indent="  ",
+            is_bias=is_bias,
+            is_bias_add=is_bias_add,
+            func_name=function_name,
+            in_ptr="x.device_data()",
+            weight_ptr="w.device_data()",
+            out_ptr="y.device_data()",
+            **func_call_extra_args,
+            p_batch="&NI",
+            p_out_ch="&CO",
+            p_in_ch="&CI",
+            p_kernel_h="&KH",
+            p_kernel_w="&KW",
+            p_in_h="&HI",
+            p_in_w="&WI",
+            p_out_batch="&NO",
+            p_out_h="&HO",
+            p_out_w="&WO",
+            stride="stride",
+            dilation="dilation",
+            pad="pad",
+        )
+        benchmark = BENCHMARK_TEMPLATE.render(
+            is_bias=is_bias,
+            is_bias_add=is_bias_add,
+            instance_name_base=instance_name_base,
+            function_name=function_name,
+            func_call=func_call,
+            instance_name=instance_name,
+        )
+
+        profiler_benchmarks[function_name] = PROFILER_BENCHMARK_TEMPLATE.render(
+            op_source=op_source,
+            benchmark=benchmark,
+            instance_name=instance_name,
+        )
+
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
+            indent="  ",
+            conv_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            ni="NI",
+            hi="HI",
+            wi="WI",
+            ci="CI",
+            co="CO",
+            kh="KH",
+            kw="KW",
+            no="NO",
+            ho="HO",
+            wo="WO",
+            stride="stride",
+            dilation="dilation",
+            pad="pad",
+        )
+        benchmark_instances.append(benchmark_instance)
+
+        benchmark_decl = BENCHMARK_DECL_TEMPLATE.render(
+            function_name=function_name,
+        )
+        benchmark_decls.append(benchmark_decl)
+
+    shape_func = shape_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        div="/",
+        x_dim0="batch",
+        x_dim1="in_h",
+        x_dim2="in_w",
+        x_dim3="in_ch",
+        w_dim0="out_ch",
+        w_dim1="kernel_h",
+        w_dim2="kernel_w",
+        stride="stride",
+        dilate="dilation",
+        pad="pad",
+    )
+    profiler_main_code = PROFILER_MAIN_TEMPLATE.render(
+        shape_func=shape_func,
+        benchmark_decls="\n".join(benchmark_decls),
+        benchmark_instances="\n".join(benchmark_instances),
+    )
+
+    code = {profiler_filename: profiler_main_code}
+    for benchmark_filename, benchmark_code in profiler_benchmarks.items():
+        code[benchmark_filename] = benchmark_code
+
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
+
+    # build
+    return build_profiler(file_pairs)
 
 
 def extract_config_name(config):
@@ -139,13 +743,14 @@ def extract_config_name(config):
 
 def gen_function(
     func_attrs,
-    instance_template,
-    exec_template,
-    src_template,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
     f_emit_instance=emit_instance,
+    is_bias=False,
+    is_bias_add=False,
+    is_transpose=False,
+    is_depthwise=False,
     extra_header="",
 ):
     """Function definition codegen."""
@@ -156,22 +761,24 @@ def gen_function(
     inst_def_flag = set()
     instances = {}
     instance_decl = ""
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     for key, value in exec_path.items():
         fname = "f" + sha1(key.encode()).hexdigest()
-
-        emit_instance = f_emit_instance(op_instance[value])
+        emitted_instance = f_emit_instance(op_instance[value])
         if value not in inst_def_flag:
             inst_def_flag.add(value)
-            config = emit_instance
+            config = emitted_instance
         else:
             config = ""
-        inst = instance_template.render(
-            config=config, name=fname, config_name=extract_config_name(emit_instance)
+        inst = INSTANCE_TEMPLATE.render(
+            config=config,
+            name=fname,
+            config_name=extract_config_name(emitted_instance),
         )
         instances[key] = inst
         instance_decl += inst
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -195,38 +802,118 @@ def gen_function(
         y_dim3="*out_ch",
     )
     shape_func = shape_eval_func + shape_save_func
+
     exec_paths = ""
     for key in instances:
         fname = "f" + sha1(key.encode()).hexdigest()
-        program = exec_template.render(
+        program = EXEC_TEMPLATE.render(
+            is_bias=is_bias,
+            is_bias_add=is_bias_add,
             indent=" " * 4,
-            instance=fname,
+            instance_name=fname,
             dtype=dtype,
         )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
-    return src_template.render(
-        instances=instance_decl,
+
+    function = FUNCTION_TEMPLATE.render(
+        is_bias=is_bias,
+        is_bias_add=is_bias_add,
+        is_transpose=is_transpose,
+        is_depthwise=is_depthwise,
         function_name=func_name,
         shape_function=shape_func,
         exec_paths=exec_paths,
+    )
+
+    return SRC_TEMPLATE.render(
+        is_transpose=is_transpose,
+        is_depthwise=is_depthwise,
         extra_header=extra_header,
+        instances=instance_decl,
+        functions=function,
     )
 
 
-def cal_align_ab(x_shape: List[int]) -> int:
+def gen_function_decl(
+    func_attrs,
+    is_bias=False,
+    is_bias_add=False,
+):
+    func_name = func_attrs["name"]
+
+    return FUNC_DECL_TEMPLATE.render(
+        is_bias=is_bias,
+        is_bias_add=is_bias_add,
+        func_name=func_name,
+    )
+
+
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+    is_bias=False,
+    is_bias_add=False,
+    is_transpose=False,
+):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+
+    func_call_extra_args = {}
+    if is_bias:
+        b = func_attrs["inputs"][2]
+        func_call_extra_args = {
+            "bias_ptr": b._attrs["name"],
+        }
+    elif is_bias_add:
+        b = func_attrs["inputs"][2]
+        r = func_attrs["inputs"][3]
+        func_call_extra_args = {
+            "bias_ptr": b._attrs["name"],
+            "res_ptr": r._attrs["name"],
+        }
+
+    out_ch = wshape[-1]._attrs["name"] if is_transpose else wshape[0]._attrs["name"]
+    return FUNC_CALL_TEMPLATE.render(
+        is_bias=is_bias,
+        is_bias_add=is_bias_add,
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        out_ptr=y._attrs["name"],
+        **func_call_extra_args,
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + out_ch,
+        p_in_ch="&" + xshape[3]._attrs["name"],
+        p_kernel_h="&" + wshape[1]._attrs["name"],
+        p_kernel_w="&" + wshape[2]._attrs["name"],
+        p_in_h="&" + xshape[1]._attrs["name"],
+        p_in_w="&" + xshape[2]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_h="&" + yshape[1]._attrs["name"],
+        p_out_w="&" + yshape[2]._attrs["name"],
+        stride=func_attrs["stride"],
+        dilation=func_attrs["dilate"],
+        pad=func_attrs["pad"],
+        indent=indent,
+    )
+
+
+def _cal_align_ab(x_shape: List[int], dtype="float16") -> int:
     """Returns input alignment."""
     k = x_shape[3]  # CI
-    if k % 8 == 0:
-        return 8
-    if k % 4 == 0:
-        return 4
-    if k % 2 == 0:
-        return 2
-    raise RuntimeError(f"a/b is not aligned {x_shape=}")
+    return alignment.find_max_alignment(k, dtype)
 
 
-def function_filter(cfg, func_attrs, x_shape):
+def function_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -243,12 +930,16 @@ def function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    ab_alignment = cal_align_ab(x_shape)
+    dtype = func_attrs["inputs"][0]._attrs["dtype"]
+    ab_alignment = _cal_align_ab(x_shape, dtype=dtype)
+
     tmp = cfg.split("_")
     align_c = int(tmp[-1])
     align_ab = int(tmp[-2])
+
     if align_c != func_attrs["epilogue_alignment"]:
         return False
     if align_ab != ab_alignment:
         return False
+
     return True
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
index aa48d92f9..e1dbf6f1d 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
@@ -13,368 +13,66 @@
 #  limitations under the License.
 #
 """
-common templates for conv_bias_activation subgraph
+common functions for conv_bias_activation subgraph
 """
-import jinja2
-
-from aitemplate.backend.backend_spec import CUDASpec
 
 from . import common
 
 # pylint: disable=C0103,C0301
 
-INSTANCE_TEMPLATE = jinja2.Template(
-    """
-{{config}}
-using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
-"""
-)
-
-EXEC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
-{{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-{{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
-{% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
-{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
-{{indent}}workspace = local_workspace.get();
-{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
-{% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}return;
-"""
-)
-
-
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
+EXTRA_HEADER = """
 #include <cutlass/epilogue/thread/linear_combination_bias_relu.h>
 #include <cutlass/epilogue/thread/linear_combination_hardswish.h>
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    void* bias_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
 """
-)
-
-
-PROFILER_TEMPLATE = jinja2.Template(
-    """
-size_t GLOBAL_WORKSPACE_SIZE = 0;
-{{op_func}}
 
-int main(int argc, char** argv) {
-  int64_t batch = std::stoi(argv[1]);
-  int64_t in_h = std::stoi(argv[2]);
-  int64_t in_w = std::stoi(argv[3]);
-  int64_t in_ch = std::stoi(argv[4]);
-  int64_t kernel_h = std::stoi(argv[5]);
-  int64_t kernel_w = std::stoi(argv[6]);
-  int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
 
-  uint8_t* global_workspace = nullptr;
-  cudaStream_t stream = nullptr;
-
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> b({(int)CO, 1, 1, 1});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
-}
-
-"""
-)
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  void*,
-  void*,
-  void*,
-  void*,
-  uint8_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int,
-  int,
-  int,
-  cudaStream_t
-);
-"""
-)
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{weight_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    {{bias_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{p_batch}},
-{{indent}}    {{p_out_ch}},
-{{indent}}    {{p_in_ch}},
-{{indent}}    {{p_kernel_h}},
-{{indent}}    {{p_kernel_w}},
-{{indent}}    {{p_in_h}},
-{{indent}}    {{p_in_w}},
-{{indent}}    {{p_out_batch}},
-{{indent}}    {{p_out_h}},
-{{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
-{{indent}}    stream
-{{indent}});
-"""
-)
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        is_bias=True,
+        extra_header=EXTRA_HEADER,
+    )
 
 
-def gen_profiler(func_attrs, workdir, shape_template, extra_header=""):
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    return common.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
+        is_bias=True,
+        extra_header=EXTRA_HEADER,
     )
 
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    file_pairs = []
-    for op_name, op in op_instance.items():
-        config = common.emit_instance(op)
 
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvFwdInstance"
-        instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = EXEC_TEMPLATE.render(
-            indent="  ",
-            is_profiler=True,
-            instance=name,
-            dtype=dtype,
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-            extra_header=extra_header,
-        )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
+def gen_function_decl(
+    func_attrs,
+):
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+        is_bias=True,
+    )
 
 
-def gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+):
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
+        is_bias=True,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
index 5439f1fc0..b3e78c300 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
@@ -13,339 +13,90 @@
 #  limitations under the License.
 #
 """
-common template for conv2d bias act residual add
+common functions for conv2d bias act residual add
 """
-import jinja2
-
-from aitemplate.backend.backend_spec import CUDASpec
 
 from . import common
 
 # pylint: disable=C0301,C0103
 
-INSTANCE_TEMPLATE = jinja2.Template(
-    """
-{{config}}
-using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
-"""
-)
-
-EXEC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
-{{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(res_ptr), layout_C},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-{{indent}}    cutlass::conv::SplitKMode::kSerial,
-{{indent}}    static_cast<{{dtype}}*>(bias_ptr),
-{{indent}}    nullptr, 0, *out_ch
-{{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
-{% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
-{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
-{{indent}}workspace = local_workspace.get();
-{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
-{% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
-{{indent}}CUTLASS_CHECK(status);
-return;
-"""
-)
-
-
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
+EXTRA_HEADER = """
 #include <cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h>
 #include <cutlass/epilogue/thread/linear_combination_residual_block.h>
+"""
 
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    void* bias_ptr,
-    void* res_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-      {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-      {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},
-      {pad, pad, pad, pad},
-      {stride, stride},
-      {dilation, dilation},
-      {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-      cutlass::conv::Mode::kCrossCorrelation,
-      1
-  );
 
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
+def extract_config(
+    func_attrs,
+    dtype="float16",
+    activation_op_name="Identity",
+    binary_op_name="Plus",
+    unary_op_name="Identity",
+):
+    def set_ops(func_attrs, op):
+        import cutlass_lib
 
+        op.activation_op = cutlass_lib.library.EpilogueMathName[activation_op_name]
+        op.binary_op = cutlass_lib.library.EpilogueMathName[binary_op_name]
+        op.unary_op = cutlass_lib.library.EpilogueMathName[unary_op_name]
 
-PROFILER_TEMPLATE = jinja2.Template(
-    """
-size_t GLOBAL_WORKSPACE_SIZE = 0;
-{{op_func}}
+        return op
 
-int main(int argc, char** argv) {
-  int64_t batch = std::stoi(argv[1]);
-  int64_t in_h = std::stoi(argv[2]);
-  int64_t in_w = std::stoi(argv[3]);
-  int64_t in_ch = std::stoi(argv[4]);
-  int64_t kernel_h = std::stoi(argv[5]);
-  int64_t kernel_w = std::stoi(argv[6]);
-  int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
+    return common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        skip_simt_kernels=True,
+        f_apply_special_config=set_ops,
+    )
 
-  uint8_t* global_workspace = nullptr;
-  cudaStream_t stream = nullptr;
 
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> b({(int)CO, 1, 1, 1});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> r({NO, HO, WO, CO});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       r.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       r.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
-}
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        is_bias_add=True,
+        extra_header=EXTRA_HEADER,
+    )
 
-"""
-)
 
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    return common.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
+        is_bias_add=True,
+        extra_header=EXTRA_HEADER,
+    )
 
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  void*,
-  void*,
-  void*,
-  void*,
-  void*,
-  uint8_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int,
-  int,
-  int,
-  cudaStream_t
-);
-"""
-)
 
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{weight_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    {{bias_ptr}},
-{{indent}}    {{res_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{p_batch}},
-{{indent}}    {{p_out_ch}},
-{{indent}}    {{p_in_ch}},
-{{indent}}    {{p_kernel_h}},
-{{indent}}    {{p_kernel_w}},
-{{indent}}    {{p_in_h}},
-{{indent}}    {{p_in_w}},
-{{indent}}    {{p_out_batch}},
-{{indent}}    {{p_out_h}},
-{{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
-{{indent}}    stream
-{{indent}});
-"""
-)
+def gen_function_decl(
+    func_attrs,
+):
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+        is_bias_add=True,
+    )
 
 
-def gen_profiler(func_attrs, workdir, shape_template):
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+):
+    return common.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+        is_bias_add=True,
     )
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    file_pairs = []
-    for op_name, op in op_instance.items():
-        config = common.emit_instance(op)
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvFwdInstance"
-        instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-        )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
index c24f0a4db..d110a21f0 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
@@ -16,96 +16,38 @@
 common functions for conv2d op with few channels(< 8)
 """
 
-from collections import OrderedDict
-
-from ...target import Target
+from ....utils import alignment
 from . import common
 
 
-def apply_special_config(func_attrs, op):
-    import cutlass_lib
-
-    x = func_attrs["inputs"][0]
-    in_ch = x._attrs["shape"][-1]._attrs["values"][0]
-
-    if in_ch == 3:
-        # By default we don't use it since the perf is worse than pad4+fixchannel
-        op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FewChannels
-        op.A.alignment = 1
-        op.B.alignment = 1
-        op.tile_description.stages = 2
-    elif in_ch in [2, 4, 8]:
-        op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FixedChannels
-        op.A.alignment = in_ch
-        op.B.alignment = in_ch
-        op.tile_description.stages = 3
-    return op
-
-
-def extract_config(func_attrs):
-    """extract epilogue for conv op
-
-    Parameters
-    ----------
-    func_attrs : Dict
-        [description] op attributes
-
-    Returns
-    -------
-    [type]: Dict
-        [description]
-
-    Raises
-    ------
-    NotImplementedError
-        [description]
-    """
-    import copy
-
-    import cutlass_lib
-
-    def f_proc_op_special(op):
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-            op = apply_special_config(func_attrs, op)
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    op_kind = cutlass_lib.library.OperationKind.Conv2d
-    conv_kind = cutlass_lib.library.ConvKind.Fprop
-    ret = []
-    conv2d_ops = OrderedDict()
-    extract_ops = list(Target.current()._operators[op_kind].items())
-
-    for _, value in extract_ops:
-        op = value[0]
-        if op.conv_kind == conv_kind:
-            ret = f_proc_op_special(op)
-            if len(ret) > 0:
-                for op_inst in ret:
-                    key = common.kernel_name(op_inst)
-                    conv2d_ops[key] = op_inst
-    return conv2d_ops
+def extract_config(func_attrs, dtype="float16"):
+    def apply_special_config(func_attrs, op):
+        import cutlass_lib
+
+        x = func_attrs["inputs"][0]
+        in_ch = x._attrs["shape"][-1]._attrs["values"][0]
+
+        # Make sure to use NoneGroup here. Otherwise, we'll generate Conv2dGroupFprop,
+        # which doesn't have template specializations for either of the iterator
+        # algorithms below, resulting in "incomplete type is not allowed" errors.
+        op.group_mode = cutlass_lib.library.GroupMode.NoneGroup
+        if in_ch == 3:
+            # By default we don't use it since the perf is worse than pad4+fixchannel
+            op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FewChannels
+            op.A.alignment = 1
+            op.B.alignment = 1
+            op.tile_description.stages = 2
+        elif in_ch in alignment.get_alignments(dtype):
+            op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FixedChannels
+            op.A.alignment = in_ch
+            op.B.alignment = in_ch
+            op.tile_description.stages = 3
+
+        return op
+
+    return common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        skip_simt_kernels=True,
+        f_apply_special_config=apply_special_config,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
new file mode 100644
index 000000000..eb5be30ed
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
@@ -0,0 +1,63 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+common functions for transposed conv2d
+"""
+
+import re
+
+from . import common
+
+
+def _conv_transpose_instance(op_def):
+    tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad")
+    tmp = re.sub(
+        r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>",
+        "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>",
+        tmp,
+    )
+    return tmp
+
+
+def emit_instance(op, f_instance_convertor=_conv_transpose_instance):
+    import cutlass_lib
+
+    emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
+    op_def = emiter.emit(op)
+    op_def = f_instance_convertor(op_def)
+    return op_def
+
+
+def extract_config(
+    func_attrs,
+    dtype="float16",
+    skip_simt_kernels=False,
+    op_kind=None,
+    op_layout=None,
+):
+    def apply_special_config(func_attrs, op):
+        import cutlass_lib
+
+        op.group_mode = cutlass_lib.library.GroupMode.NoneGroup
+        return op
+
+    return common.extract_config(
+        func_attrs,
+        dtype,
+        skip_simt_kernels,
+        f_apply_special_config=apply_special_config,
+        op_kind=op_kind,
+        op_layout=op_layout,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d.py b/python/aitemplate/backend/cuda/conv2d/conv2d.py
index 3279e2ff7..68de39fd5 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d.py
@@ -15,338 +15,42 @@
 """
 Codegen for conv2d.
 """
-import jinja2
-
-from aitemplate.backend.backend_spec import CUDASpec
-
 from ... import registry
 from . import common
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
-INSTANCE_TEMPLATE = jinja2.Template(
-    """
-{{config}}
-using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
-"""
-)
-
-EXEC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
-{{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-{{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
-{% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
-{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
-{{indent}}workspace = local_workspace.get();
-{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
-{% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}return;
-"""
-)
-
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
-
-
-PROFILER_TEMPLATE = jinja2.Template(
-    """
-size_t GLOBAL_WORKSPACE_SIZE = 0;
-
-{{op_func}}
-
-int main(int argc, char** argv) {
-  int64_t batch = std::stoi(argv[1]);
-  int64_t in_h = std::stoi(argv[2]);
-  int64_t in_w = std::stoi(argv[3]);
-  int64_t in_ch = std::stoi(argv[4]);
-  int64_t kernel_h = std::stoi(argv[5]);
-  int64_t kernel_w = std::stoi(argv[6]);
-  int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
-
-  uint8_t* global_workspace = nullptr;
-  cudaStream_t stream = nullptr;
-
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, CI});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
-
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
-}
-
-"""
-)
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  void*,
-  void*,
-  void*,
-  uint8_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int,
-  int,
-  int,
-  cudaStream_t
-);
-"""
-)
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{weight_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{p_batch}},
-{{indent}}    {{p_out_ch}},
-{{indent}}    {{p_in_ch}},
-{{indent}}    {{p_kernel_h}},
-{{indent}}    {{p_kernel_w}},
-{{indent}}    {{p_in_h}},
-{{indent}}    {{p_in_w}},
-{{indent}}    {{p_out_batch}},
-{{indent}}    {{p_out_h}},
-{{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
-{{indent}}    stream
-{{indent}});
-"""
-)
-
 
 @registry.reg("cuda.conv2d.config")
-def conv2d_config(func_attrs, dtype="float16"):
+def conv2d_config(
+    func_attrs,
+    dtype="float16",
+):
     """Populates conv2d cutlass configs into 'op_instance' field."""
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+    func_attrs["op_instance"] = common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def conv2d_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
     """Codegen for conv2d profiler."""
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
     )
-    file_pairs = []
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    for op_name, op in op_instance.items():
-        config = common.emit_instance(op)
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvFwdInstance"
-        instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-        )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
 
 
 @registry.reg("cuda.conv2d.gen_function")
-def gen_function(
+def conv2d_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
@@ -354,56 +58,41 @@ def gen_function(
 ):
     """Codegen for conv2d function."""
     return common.gen_function(
-        func_attrs,
-        INSTANCE_TEMPLATE,
-        EXEC_TEMPLATE,
-        SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d.func_decl")
-def conv2d_gen_function_decl(func_attrs):
+def conv2d_func_decl(
+    func_attrs,
+):
     """Codegen for conv2d function declaration."""
-    func_name = func_attrs["name"]
-    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
+def conv2d_func_call(
+    func_attrs,
+    indent="  ",
+):
     """Codegen for conv2d function call."""
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
     )
 
 
 @registry.reg("cuda.conv2d.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -420,4 +109,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
index c4fb32c42..66c57f966 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
@@ -22,51 +22,77 @@
 
 
 @registry.reg("cuda.conv2d_bias.config")
-def conv2d_config(func_attrs, dtype="float16"):
+def conv2d_bias_config(
+    func_attrs,
+    dtype="float16",
+):
     """Populates all available conv2d configs into the op_instance field."""
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+    func_attrs["op_instance"] = common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def conv2d_bias_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
     """Codegen for conv2d profiler."""
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias.gen_function")
-def gen_function(
+def conv2d_bias_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
     """Codegen for conv2d function."""
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias.func_decl")
-def conv2d_gen_function_decl(func_attrs):
+def conv2d_bias_func_decl(
+    func_attrs,
+):
     """Codegen for conv2d function declaration."""
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
+def conv2d_bias_func_call(
+    func_attrs,
+    indent="  ",
+):
     """Codegen for conv2d function call."""
-    return cba.gen_function_call(func_attrs, indent)
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -83,4 +109,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
index 07ecbbff6..bd952ab68 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
@@ -16,120 +16,77 @@
 conv2d bias add codegen
 """
 from ... import registry
-from ...target import Target
 from . import common, common_conv2d_bias_add_activation as cbaa
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
-        import copy
-
-        import cutlass_lib
-
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-
-            op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"]
-            op.binary_op = cutlass_lib.library.EpilogueMathName["Plus"]
-            op.unary_op = cutlass_lib.library.EpilogueMathName["Identity"]
-
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16)
+def conv2d_bias_add_identity_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = cbaa.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        activation_op_name="Identity",
+        binary_op_name="Plus",
+        unary_op_name="Identity",
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_add_identity_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cbaa.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.gen_function")
-def gen_function(
+def conv2d_bias_add_identity_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cbaa.INSTANCE_TEMPLATE,
-        cbaa.EXEC_TEMPLATE,
-        cbaa.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cbaa.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_add_identity_func_decl(
+    func_attrs,
+):
+    return cbaa.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    r = func_attrs["inputs"][3]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return cbaa.FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        res_ptr=r._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def conv2d_bias_add_identity_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cbaa.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_identity.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_add_identity_filter(cfg, func_attrs, x_shape):
     """Generates function filter.
 
     Parameters
@@ -146,4 +103,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
index 09d975ae4..4e7526699 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
@@ -16,120 +16,81 @@
 conv2d bias add hardswish codegen
 """
 from ... import registry
-from ...target import Target
 from . import common, common_conv2d_bias_add_activation as cbaa
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
-        import copy
-
-        import cutlass_lib
-
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-
-            op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"]
-            op.binary_op = cutlass_lib.library.EpilogueMathName["Add"]
-            op.unary_op = cutlass_lib.library.EpilogueMathName["HardSwish"]
-
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16)
+def conv2d_bias_add_hardswish_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = cbaa.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        activation_op_name="Identity",
+        binary_op_name="Add",
+        unary_op_name="HardSwish",
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_add_hardswish_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cbaa.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.gen_function")
-def gen_function(
+def conv2d_bias_add_hardswish_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cbaa.INSTANCE_TEMPLATE,
-        cbaa.EXEC_TEMPLATE,
-        cbaa.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cbaa.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_add_hardswish_func_decl(
+    func_attrs,
+):
+    return cbaa.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    r = func_attrs["inputs"][3]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return cbaa.FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        res_ptr=r._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def conv2d_bias_add_hardswish_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cbaa.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_hardswish.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_add_hardswish_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -146,4 +107,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
index 5a5e7314b..e7f009871 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
@@ -16,120 +16,81 @@
 conv2d bias add relu codegen
 """
 from ... import registry
-from ...target import Target
 from . import common, common_conv2d_bias_add_activation as cbaa
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    def fproc_f16(op):
-        import copy
-
-        import cutlass_lib
-
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-
-            op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"]
-            op.binary_op = cutlass_lib.library.EpilogueMathName["Plus"]
-            op.unary_op = cutlass_lib.library.EpilogueMathName["ReLu"]
-
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16)
+def conv2d_bias_add_relu_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = cbaa.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        activation_op_name="Identity",
+        binary_op_name="Plus",
+        unary_op_name="ReLu",
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cbaa.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_add_relu_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cbaa.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.gen_function")
-def gen_function(
+def conv2d_bias_add_relu_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cbaa.INSTANCE_TEMPLATE,
-        cbaa.EXEC_TEMPLATE,
-        cbaa.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cbaa.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_add_relu_func_decl(
+    func_attrs,
+):
+    return cbaa.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    r = func_attrs["inputs"][3]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return cbaa.FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        res_ptr=r._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def conv2d_bias_add_relu_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cbaa.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
     )
 
 
 @registry.reg("cuda.conv2d_bias_add_relu.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_add_relu_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -146,4 +107,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
index 584eddbfe..40330318b 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
@@ -15,136 +15,59 @@
 """
 specialize conv2d op with few channels(< 8)
 """
-from collections import OrderedDict
 
 from ... import registry
-from ...target import Target
-from . import common, common_conv2d_bias_activation as cba
+from . import (
+    common,
+    common_conv2d_bias_activation as cba,
+    common_conv2d_few_channels as cfc,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
-def apply_special_config(func_attrs, op):
-    import cutlass_lib
-
-    x = func_attrs["inputs"][0]
-    in_ch = x._attrs["shape"][-1]._attrs["values"][0]
-
-    if in_ch == 3:
-        # By default we don't use it since the perf is worse than pad4+fixchannel
-        op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FewChannels
-        op.A.alignment = 1
-        op.B.alignment = 1
-        op.tile_description.stages = 2
-    elif in_ch in [2, 4, 8]:
-        op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FixedChannels
-        op.A.alignment = in_ch
-        op.B.alignment = in_ch
-        op.tile_description.stages = 3
-    return op
-
-
-def extract_config(func_attrs):
-    """extract epilogue for conv op
-
-    Parameters
-    ----------
-    func_attrs : Dict
-        [description] op attributes
-
-    Returns
-    -------
-    [type]: Dict
-        [description]
-
-    Raises
-    ------
-    NotImplementedError
-        [description]
-    """
-    import copy
-
-    import cutlass_lib
-
-    def f_proc_op_special(op):
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.accumulator_type() == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-            op = apply_special_config(func_attrs, op)
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    op_kind = cutlass_lib.library.OperationKind.Conv2d
-    conv_kind = cutlass_lib.library.ConvKind.Fprop
-    ret = []
-    conv2d_ops = OrderedDict()
-    extract_ops = list(Target.current()._operators[op_kind].items())
-
-    for _, value in extract_ops:
-        op = value[0]
-        if op.conv_kind == conv_kind:
-            ret = f_proc_op_special(op)
-            if len(ret) > 0:
-                for op_inst in ret:
-                    key = common.kernel_name(op_inst)
-                    conv2d_ops[key] = op_inst
-    return conv2d_ops
-
-
 @registry.reg("cuda.conv2d_bias_few_channels.config")
-def conv2d_config(func_attrs, dtype="float16"):
+def conv2d_bias_few_channels_config(
+    func_attrs,
+    dtype="float16",
+):
     """extract configurations for profiling
 
     Parameters
     ----------
     func_attrs : Dict
-        [description] op attributes
+        op attributes
     dtype : str, optional
-        [description] by default "float16"
+        by default "float16"
 
     Returns
     -------
-    [type]
-        [description]
-
-    Raises
-    ------
-    NotImplementedError
-        [description]
+    None
     """
-    func_attrs["op_instance"] = extract_config(func_attrs)
+    func_attrs["op_instance"] = cfc.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def conv2d_bias_few_channels_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
     """generate code for profiling"""
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.gen_function")
-def gen_function(
+def conv2d_bias_few_channels_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
@@ -168,30 +91,40 @@ def gen_function(
     [type]
         [description]
     """
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_few_channels_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_few_channels_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_few_channels.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_few_channels_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -208,4 +141,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
index ccdc3ae1e..13743d294 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
@@ -22,46 +22,72 @@
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+def conv2d_bias_hardswish_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_hardswish_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.gen_function")
-def gen_function(
+def conv2d_bias_hardswish_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_hardswish_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_hardswish_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_hardswish_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -78,4 +104,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
index f8de585fa..7594887c9 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -17,46 +17,57 @@
 """
 
 from ... import registry
-
-from . import common, common_conv2d_bias_activation as cba
-from .common_conv2d_few_channels import extract_config
-
+from . import (
+    common,
+    common_conv2d_bias_activation as cba,
+    common_conv2d_few_channels as cfc,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.config")
-def conv2d_config(func_attrs, dtype="float16"):
+def conv2d_bias_hardswish_few_channels_config(
+    func_attrs,
+    dtype="float16",
+):
     """extract configurations for profiling
 
     Parameters
     ----------
     func_attrs : Dict
-        [description] op attributes
+        op attributes
     dtype : str, optional
-        [description] by default "float16"
+        by default "float16"
 
     Returns
     -------
-    [type]
-        [description]
-
-    Raises
-    ------
-    NotImplementedError
-        [description]
+    None
     """
-    func_attrs["op_instance"] = extract_config(func_attrs)
+    func_attrs["op_instance"] = cfc.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def conv2d_bias_hardswish_few_channels_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
     """generate code for profiling"""
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_function")
-def gen_function(
+def conv2d_bias_hardswish_few_channels_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
@@ -80,30 +91,40 @@ def gen_function(
     [type]
         [description]
     """
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_hardswish_few_channels_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_hardswish_few_channels_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_hardswish_few_channels_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -120,4 +141,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
index 920e13d5c..cd44eef51 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
@@ -22,46 +22,72 @@
 
 
 @registry.reg("cuda.conv2d_bias_relu.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+def conv2d_bias_relu_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_relu_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu.gen_function")
-def gen_function(
+def conv2d_bias_relu_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_relu.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_relu_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_relu_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_relu_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -78,4 +104,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
index 39019c5f1..927e9da83 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
@@ -17,14 +17,20 @@
 """
 
 from ... import registry
-from . import common, common_conv2d_bias_activation as cba
-from .common_conv2d_few_channels import extract_config
+from . import (
+    common,
+    common_conv2d_bias_activation as cba,
+    common_conv2d_few_channels as cfc,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.config")
-def conv2d_config(func_attrs, dtype="float16"):
+def conv2d_bias_relu_few_channels_config(
+    func_attrs,
+    dtype="float16",
+):
     """extract configurations for profiling
 
     Parameters
@@ -38,17 +44,30 @@ def conv2d_config(func_attrs, dtype="float16"):
     -------
     None
     """
-    func_attrs["op_instance"] = extract_config(func_attrs)
+    func_attrs["op_instance"] = cfc.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def conv2d_bias_relu_few_channels_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
     """generate code for profiling"""
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.gen_function")
-def gen_function(
+def conv2d_bias_relu_few_channels_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
@@ -72,30 +91,40 @@ def gen_function(
     [type]
         [description]
     """
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_relu_few_channels_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_relu_few_channels_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_relu_few_channels.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_relu_few_channels_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -112,4 +141,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
index cbb896e71..a9fe1801f 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
@@ -23,46 +23,72 @@
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+def conv2d_bias_sigmoid_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = common.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    return cba.gen_profiler(func_attrs, workdir, shape_template)
+def conv2d_bias_sigmoid_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return cba.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.gen_function")
-def gen_function(
+def conv2d_bias_sigmoid_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
-    return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        cba.SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+    return cba.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
     )
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def conv2d_bias_sigmoid_func_decl(
+    func_attrs,
+):
+    return cba.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    return cba.gen_function_call(func_attrs, indent)
+def conv2d_bias_sigmoid_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return cba.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.conv2d_bias_sigmoid.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def conv2d_bias_sigmoid_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -79,4 +105,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
index bb166baa1..fee377f95 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
@@ -17,8 +17,6 @@
 """
 from collections import OrderedDict
 
-import jinja2
-
 from aitemplate.backend.backend_spec import CUDASpec
 
 from ... import registry
@@ -27,273 +25,6 @@
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
-INSTANCE_TEMPLATE = jinja2.Template(
-    """
-{{config}}
-using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
-"""
-)
-
-EXEC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
-{{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-{{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
-{% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
-{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
-{{indent}}workspace = local_workspace.get();
-{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
-{% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}return;
-"""
-)
-
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_depthwise_fprop.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, 1)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, 1},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1,
-        i32_in_ch
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
-
-
-PROFILER_TEMPLATE = jinja2.Template(
-    """
-size_t GLOBAL_WORKSPACE_SIZE = 0;
-
-{{op_func}}
-
-int main(int argc, char** argv) {
-  int64_t batch = std::stoi(argv[1]);
-  int64_t in_h = std::stoi(argv[2]);
-  int64_t in_w = std::stoi(argv[3]);
-  int64_t in_ch = std::stoi(argv[4]);
-  int64_t kernel_h = std::stoi(argv[5]);
-  int64_t kernel_w = std::stoi(argv[6]);
-  int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
-
-  uint8_t* global_workspace = nullptr;
-  cudaStream_t stream = nullptr;
-
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, 1u});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
-
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
-}
-
-"""
-)
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  void*,
-  void*,
-  void*,
-  uint8_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int,
-  int,
-  int,
-  cudaStream_t
-);
-"""
-)
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{weight_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{p_batch}},
-{{indent}}    {{p_out_ch}},
-{{indent}}    {{p_in_ch}},
-{{indent}}    {{p_kernel_h}},
-{{indent}}    {{p_kernel_w}},
-{{indent}}    {{p_in_h}},
-{{indent}}    {{p_in_w}},
-{{indent}}    {{p_out_batch}},
-{{indent}}    {{p_out_h}},
-{{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
-{{indent}}    stream
-{{indent}});
-"""
-)
-
 
 def conv_dw_instance(op_def):
     op_def = op_def.replace("DefaultConv2dFprop", "DefaultDepthwiseFprop")
@@ -325,13 +56,18 @@ def apply_special_config(func_attrs, op):
     return op
 
 
-def extract_config(func_attrs):
+def extract_config(func_attrs, dtype="float16"):
     import copy
 
     import cutlass_lib
 
-    def f_proc_op_special(op):
-        ret = []
+    spec = CUDASpec()
+    lib_dtype = spec.dtype_to_lib_type(dtype)
+
+    if lib_dtype == "float":
+        data_type = cutlass_lib.library.DataType.f32
+        acc_type = cutlass_lib.library.DataType.f32
+    else:
         data_type = cutlass_lib.library.DataType.f16
         acc_type = cutlass_lib.library.DataType.f32
         # check target use fp16 acc
@@ -339,12 +75,15 @@ def f_proc_op_special(op):
             if Target.current()._kwargs["use_fp16_acc"]:
                 acc_type = cutlass_lib.library.DataType.f16
 
+    def f_proc_op_special(op):
+        ret = []
         if (
             op.A.element == data_type
             and op.B.element == data_type
             and op.C.element == data_type
             and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
             and op.accumulator_type() == acc_type
+            and op.group_mode == cutlass_lib.library.GroupMode.NoneGroup
         ):
 
             op = copy.deepcopy(op)
@@ -381,62 +120,25 @@ def f_proc_op_special(op):
 @registry.reg("cuda.conv2d_depthwise.config")
 def conv2d_depthwise_config(func_attrs, dtype="float16"):
     """Populates conv2d_depthwise cutlass configs into 'op_instance' field."""
-    func_attrs["op_instance"] = extract_config(func_attrs)
+    func_attrs["op_instance"] = extract_config(func_attrs, dtype)
 
 
 @registry.reg("cuda.conv2d_depthwise.gen_profiler")
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     shape_template,
-    exec_template=EXEC_TEMPLATE,
-    src_template=SRC_TEMPLATE,
-    profiler_template=PROFILER_TEMPLATE,
 ):
-    """Codegen for conv2d_depthwise profiler."""
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        f_emit_instance=emit_instance,
+        is_depthwise=True,
+        instance_name_base="DeviceConvFwdInstance",
     )
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    file_pairs = []
-    for op_name, op in op_instance.items():
-        config = emit_instance(op)
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvFwdInstance"
-        instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = exec_template.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
-        )
-        op_func = src_template.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-        )
-        code = profiler_template.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
 
 
 @registry.reg("cuda.conv2d_depthwise.gen_function")
@@ -448,13 +150,11 @@ def gen_function(
 ):
     """Codegen for conv2d_depthwise function."""
     return common.gen_function(
-        func_attrs,
-        INSTANCE_TEMPLATE,
-        EXEC_TEMPLATE,
-        SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
+        is_depthwise=True,
         f_emit_instance=emit_instance,
     )
 
@@ -462,37 +162,16 @@ def gen_function(
 @registry.reg("cuda.conv2d_depthwise.func_decl")
 def conv2d_depthwise_gen_function_decl(func_attrs):
     """Codegen for conv2d_depthwise function declaration."""
-    func_name = func_attrs["name"]
-    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.conv2d_depthwise.func_call")
 def conv2d_depthwise_gen_function_call(func_attrs, indent="  "):
     """Codegen for conv2d_depthwise function call."""
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
     )
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
index af33fecce..a42edfb33 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
@@ -15,296 +15,31 @@
 """
 Codegen for conv2d_depthwise.
 """
-import jinja2
 
 from ... import registry
 from . import common, conv2d_depthwise as cdw
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
-EXEC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
-{{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-{{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
-{% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
-{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
-{{indent}}workspace = local_workspace.get();
-{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
-{% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
-{{indent}}CUTLASS_CHECK(status);
-{{indent}}return;
-"""
-)
-
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_depthwise_fprop.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    void* bias_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, 1)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-    {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, 1},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1,
-        i32_in_ch
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
-
-
-PROFILER_TEMPLATE = jinja2.Template(
-    """
-size_t GLOBAL_WORKSPACE_SIZE = 0;
-
-{{op_func}}
-
-int main(int argc, char** argv) {
-  int64_t batch = std::stoi(argv[1]);
-  int64_t in_h = std::stoi(argv[2]);
-  int64_t in_w = std::stoi(argv[3]);
-  int64_t in_ch = std::stoi(argv[4]);
-  int64_t kernel_h = std::stoi(argv[5]);
-  int64_t kernel_w = std::stoi(argv[6]);
-  int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
-
-  uint8_t* global_workspace = nullptr;
-  cudaStream_t stream = nullptr;
-
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KH, KW, 1u});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> b({(int)CO, 1, 1, 1});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, HO, WO, CO});
-
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       b.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KH,
-       &KW,
-       &HI,
-       &WI,
-       &NO,
-       &HO,
-       &WO,
-       stride,
-       dilation,
-       pad,
-       stream);
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
-}
-
-"""
-)
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  void*,
-  void*,
-  void*,
-  void*,
-  uint8_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int64_t*,
-  int,
-  int,
-  int,
-  cudaStream_t
-);
-"""
-)
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{in_ptr}},
-{{indent}}    {{weight_ptr}},
-{{indent}}    {{out_ptr}},
-{{indent}}    {{bias_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{p_batch}},
-{{indent}}    {{p_out_ch}},
-{{indent}}    {{p_in_ch}},
-{{indent}}    {{p_kernel_h}},
-{{indent}}    {{p_kernel_w}},
-{{indent}}    {{p_in_h}},
-{{indent}}    {{p_in_w}},
-{{indent}}    {{p_out_batch}},
-{{indent}}    {{p_out_h}},
-{{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
-{{indent}}    stream
-{{indent}});
-"""
-)
-
 
 @registry.reg("cuda.conv2d_depthwise_bias.config")
 def conv2d_depthwise_config(func_attrs, dtype="float16"):
     """Populates conv2d_depthwise cutlass configs into 'op_instance' field."""
-    func_attrs["op_instance"] = cdw.extract_config(func_attrs)
+    func_attrs["op_instance"] = cdw.extract_config(func_attrs, dtype)
 
 
 @registry.reg("cuda.conv2d_depthwise_bias.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
     """Codegen for conv2d_depthwise_bias profiler."""
-    return cdw.gen_profiler(
-        func_attrs,
-        workdir,
-        shape_template,
-        exec_template=EXEC_TEMPLATE,
-        src_template=SRC_TEMPLATE,
-        profiler_template=PROFILER_TEMPLATE,
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        f_emit_instance=cdw.emit_instance,
+        is_bias=True,
+        is_depthwise=True,
+        instance_name_base="DeviceConvFwdInstance",
     )
 
 
@@ -317,54 +52,32 @@ def gen_function(
 ):
     """Codegen for conv2d_depthwise_bias function."""
     return common.gen_function(
-        func_attrs,
-        cdw.INSTANCE_TEMPLATE,
-        EXEC_TEMPLATE,
-        SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
         f_emit_instance=cdw.emit_instance,
+        is_bias=True,
+        is_depthwise=True,
     )
 
 
 @registry.reg("cuda.conv2d_depthwise_bias.func_decl")
 def conv2d_depthwise_gen_function_decl(func_attrs):
     """Codegen for conv2d_depthwise_bias function declaration."""
-    func_name = func_attrs["name"]
-    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+        is_bias=True,
+    )
 
 
 @registry.reg("cuda.conv2d_depthwise_bias.func_call")
 def conv2d_depthwise_gen_function_call(func_attrs, indent="  "):
     """Codegen for conv2d_depthwise_bias function call."""
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
+        is_bias=True,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
index 574f0d361..00f639983 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
@@ -15,232 +15,85 @@
 """
 transposed conv2d op codegen
 """
-import re
-
-import jinja2
-
-from aitemplate.backend.backend_spec import CUDASpec
-
 from ... import registry
-from . import common, conv2d
+from . import common, common_transposed_conv2d as ctc
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
-          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
 
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
-
-  cutlass::conv::Conv2dProblemSize problem_size(
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
-
-
-def conv_transpose_instance(op_def):
-    tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad")
-    tmp = re.sub(
-        r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>",
-        "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>",
-        tmp,
+@registry.reg("cuda.transposed_conv2d.config")
+def transposed_conv2d_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = ctc.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
     )
-    return tmp
-
-
-def emit_instance(op, f_instance_convertor=conv_transpose_instance):
-    """Emits cutlass instance."""
-    import cutlass_lib
 
-    emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
-    op_def = emiter.emit(op)
-    op_def = f_instance_convertor(op_def)
-    return op_def
 
-
-@registry.reg("cuda.transposed_conv2d.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+@registry.reg("cuda.transposed_conv2d.gen_profiler")
+def transposed_conv2d_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        f_emit_instance=ctc.emit_instance,
+        is_transpose=True,
+        instance_name_base="DeviceConvBwdInstance",
+    )
 
 
 @registry.reg("cuda.transposed_conv2d.gen_function")
-def gen_function(
+def transposed_conv2d_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
-        func_attrs,
-        conv2d.INSTANCE_TEMPLATE,
-        conv2d.EXEC_TEMPLATE,
-        SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
-        f_emit_instance=emit_instance,
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
+        f_emit_instance=ctc.emit_instance,
+        is_transpose=True,
     )
 
 
 @registry.reg("cuda.transposed_conv2d.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return conv2d.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def transposed_conv2d_func_decl(
+    func_attrs,
+):
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.transposed_conv2d.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return conv2d.FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def transposed_conv2d_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
+        is_transpose=True,
     )
 
 
-@registry.reg("cuda.transposed_conv2d.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
-    )
-    file_pairs = []
-
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    for op_name, op in op_instance.items():
-        config = emit_instance(op)
-
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvBwdInstance"
-        instance = conv2d.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = conv2d.EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-        )
-        code = conv2d.PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
-
-
 @registry.reg("cuda.transposed_conv2d.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def transposed_conv2d_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -257,4 +110,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
index 35b08d19f..fb10b92ae 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
@@ -15,239 +15,95 @@
 """
 transposed conv2d + bias + (relu) codegen
 """
-import re
-
-import jinja2
-
-from aitemplate.backend.backend_spec import CUDASpec
-
 from ... import registry
-from . import common, common_conv2d_bias_activation as cba
+from . import common, common_transposed_conv2d as ctc
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
-SRC_TEMPLATE = jinja2.Template(
-    """
-#include <iostream>
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-
-{{extra_header}}
-
-#define CUTLASS_CHECK(status)                                                         \\
-  {                                                                                   \\
-    cutlass::Status error = status;                                                   \\
-    if (error != cutlass::Status::kSuccess) {                                         \\
-      auto msg = std::string("Got cutlass error: ") + cutlassGetStatusString(error) + \\
-          " at: " + std::to_string(__LINE__);                                         \\
-      std::cerr << msg << std::endl;                                                  \\
-      throw std::runtime_error(msg);                                                  \\
-    }                                                                                 \\
-  }
-
-{{instances}}
-
-{{instances_def}}
-
-void {{function_name}} (
-    void* in_ptr,
-    void* weight_ptr,
-    void* out_ptr,
-    void* bias_ptr,
-    uint8_t* workspace,
-    int64_t* batch,
-    int64_t* out_ch,
-    int64_t* in_ch,
-    int64_t* kernel_h,
-    int64_t* kernel_w,
-    int64_t* in_h,
-    int64_t* in_w,
-    int64_t* out_batch,
-    int64_t* out_h,
-    int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
-    cudaStream_t stream
-  ) {
-
-  {{shape_function}}
-  int i32_batch = *batch;
-  int i32_in_h = *in_h;
-  int i32_in_w = *in_w;
-  int i32_in_ch = *in_ch;
-  int i32_out_ch = *out_ch;
-  int i32_kernel_h = *kernel_h;
-  int i32_kernel_w = *kernel_w;
-  int i32_out_batch = *out_batch;
-  int i32_out_h = *out_h;
-  int i32_out_w = *out_w;
-
-  using cutlass::layout::TensorNHWC;
-  TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch)));
-  TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch)));
-  TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch)));
 
-  cutlass::conv::Conv2dProblemSize problem_size(
-        {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch},
-        {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},
-        {pad, pad, pad, pad},
-        {stride, stride},
-        {dilation, dilation},
-        {i32_batch, i32_in_h, i32_in_w, i32_in_ch},
-        cutlass::conv::Mode::kCrossCorrelation,
-        1
-  );
-
-  {{exec_paths}}
-  throw std::runtime_error(
-      "Unsupported workload for this conv2d specialization."
-  );
-}
-"""
-)
-
-
-def _conv_transpose_instance(op_def):
-    tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad")
-    tmp = re.sub(
-        r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>",
-        "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>",
-        tmp,
+@registry.reg("cuda.transposed_conv2d_bias.config")
+@registry.reg("cuda.transposed_conv2d_bias_relu.config")
+def transposed_conv2d_bias_config(
+    func_attrs,
+    dtype="float16",
+):
+    func_attrs["op_instance"] = ctc.extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
     )
-    return tmp
-
-
-def emit_instance(op, f_instance_convertor=_conv_transpose_instance):
-    import cutlass_lib
-
-    emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
-    op_def = emiter.emit(op)
-    op_def = f_instance_convertor(op_def)
-    return op_def
 
 
-@registry.reg("cuda.transposed_conv2d_bias.config")
-@registry.reg("cuda.transposed_conv2d_bias_relu.config")
-def conv2d_config(func_attrs, dtype="float16"):
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+@registry.reg("cuda.transposed_conv2d_bias.gen_profiler")
+@registry.reg("cuda.transposed_conv2d_bias_relu.gen_profiler")
+def transposed_conv2d_bias_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    shape_template,
+):
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        shape_template=shape_template,
+        f_emit_instance=ctc.emit_instance,
+        is_bias=True,
+        is_transpose=True,
+        instance_name_base="DeviceConvBwdInstance",
+    )
 
 
 @registry.reg("cuda.transposed_conv2d_bias.gen_function")
 @registry.reg("cuda.transposed_conv2d_bias_relu.gen_function")
-def gen_function(
+def transposed_conv2d_bias_gen_function(
     func_attrs,
     exec_cond_remplate,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
-        func_attrs,
-        cba.INSTANCE_TEMPLATE,
-        cba.EXEC_TEMPLATE,
-        SRC_TEMPLATE,
-        exec_cond_remplate,
-        shape_eval_template,
-        shape_save_template,
-        f_emit_instance=emit_instance,
+        func_attrs=func_attrs,
+        exec_cond_remplate=exec_cond_remplate,
+        shape_eval_template=shape_eval_template,
+        shape_save_template=shape_save_template,
+        f_emit_instance=ctc.emit_instance,
+        is_bias=True,
+        is_transpose=True,
     )
 
 
 @registry.reg("cuda.transposed_conv2d_bias.func_decl")
 @registry.reg("cuda.transposed_conv2d_bias_relu.func_decl")
-def conv2d_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
-    return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name)
+def transposed_conv2d_bias_func_decl(
+    func_attrs,
+):
+    return common.gen_function_decl(
+        func_attrs=func_attrs,
+        is_bias=True,
+    )
 
 
 @registry.reg("cuda.transposed_conv2d_bias.func_call")
 @registry.reg("cuda.transposed_conv2d_bias_relu.func_call")
-def conv2d_gen_function_call(func_attrs, indent="  "):
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-    w = func_attrs["inputs"][1]
-    b = func_attrs["inputs"][2]
-    wshape = w._attrs["shape"]
-    y = func_attrs["outputs"][0]
-    yshape = y._attrs["shape"]
-    return cba.FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        in_ptr=x._attrs["name"],
-        weight_ptr=w._attrs["name"],
-        out_ptr=y._attrs["name"],
-        bias_ptr=b._attrs["name"],
-        p_batch="&" + xshape[0]._attrs["name"],
-        p_out_ch="&" + wshape[0]._attrs["name"],
-        p_in_ch="&" + xshape[3]._attrs["name"],
-        p_kernel_h="&" + wshape[1]._attrs["name"],
-        p_kernel_w="&" + wshape[2]._attrs["name"],
-        p_in_h="&" + xshape[1]._attrs["name"],
-        p_in_w="&" + xshape[2]._attrs["name"],
-        p_out_batch="&" + yshape[0]._attrs["name"],
-        p_out_h="&" + yshape[1]._attrs["name"],
-        p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+def transposed_conv2d_bias_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return common.gen_function_call(
+        func_attrs=func_attrs,
         indent=indent,
+        is_bias=True,
+        is_transpose=True,
     )
 
 
-@registry.reg("cuda.transposed_conv2d_bias.gen_profiler")
-@registry.reg("cuda.transposed_conv2d_bias_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    # shape func
-    shape_func = shape_template.render(
-        indent="  ",
-        dtype="int64_t ",
-        div="/",
-        x_dim0="batch",
-        x_dim1="in_h",
-        x_dim2="in_w",
-        x_dim3="in_ch",
-        w_dim0="out_ch",
-        w_dim1="kernel_h",
-        w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
-    )
-    backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    file_pairs = []
-    for op_name, op in op_instance.items():
-        config = emit_instance(op)
-
-        config_name = common.extract_config_name(config)
-        name = "DeviceConvBwdInstance"
-        instance = cba.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = cba.EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
-        )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
-        )
-        code = cba.PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
-
-
 @registry.reg("cuda.transposed_conv2d_bias.filter")
 @registry.reg("cuda.transposed_conv2d_bias_relu.filter")
-def conv2d_function_filter(cfg, func_attrs, x_shape):
+def transposed_conv2d_bias_filter(
+    cfg,
+    func_attrs,
+    x_shape,
+):
     """Generates function filter.
 
     Parameters
@@ -264,4 +120,8 @@ def conv2d_function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    return common.function_filter(cfg, func_attrs, x_shape)
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        x_shape=x_shape,
+    )
diff --git a/python/aitemplate/backend/cuda/conv3d/common.py b/python/aitemplate/backend/cuda/conv3d/common.py
index 461c4e6e9..b059d1770 100644
--- a/python/aitemplate/backend/cuda/conv3d/common.py
+++ b/python/aitemplate/backend/cuda/conv3d/common.py
@@ -16,7 +16,6 @@
 CUDA conv3d common functions
 """
 import re
-from collections import OrderedDict
 from hashlib import sha1
 from typing import List
 
@@ -24,9 +23,11 @@
 
 from aitemplate.backend.backend_spec import CUDASpec
 
-from ...target import Target
+from ....utils import alignment
+from ..conv2d.common import extract_config as conv2d_extract_config
 from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
 
+
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
@@ -140,29 +141,6 @@ def gen_function_call(func_attrs, indent="  "):
 )
 
 
-def kernel_name(op):
-    """generate cuda kernel name"""
-    from cutlass_lib import library
-
-    threadblock = op.tile_description.procedural_name()
-    extended_name = op.extended_name()
-    opcode_class_name = library.OpcodeClassNames[
-        op.tile_description.math_instruction.opcode_class
-    ]
-    layout = "ndhwc"  # op.layout_name()
-    align_ab = op.A.alignment
-    align_c = op.C.alignment
-    name = KERNEL_KEY_TEMPLATE.render(
-        threadblock=threadblock,
-        extended_name=extended_name,
-        opcode_class_name=opcode_class_name,
-        layout=layout,
-        align_ab=align_ab,
-        align_c=align_c,
-    )
-    return name.replace("\n", "")
-
-
 def emit_instance(op):
     """emit instance"""
     import cutlass_lib
@@ -176,61 +154,16 @@ def emit_instance(op):
     return op_def
 
 
-def extract_config(func_attrs, f_proc_op=None):
+def extract_config(func_attrs, dtype="float16"):
     """Extracts cutlass config for conv kernels."""
-    import copy
-
     import cutlass_lib
 
-    def f_proc_op_default(op):
-        # import cutlass_lib
-        ret = []
-        data_type = cutlass_lib.library.DataType.f16
-        acc_type = cutlass_lib.library.DataType.f32
-        # check target use fp16 acc
-        if "use_fp16_acc" in Target.current()._kwargs:
-            if Target.current()._kwargs["use_fp16_acc"]:
-                acc_type = cutlass_lib.library.DataType.f16
-
-        if (
-            op.A.element == data_type
-            and op.B.element == data_type
-            and op.C.element == data_type
-            and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized
-            and op.tile_description.math_instruction.element_accumulator == acc_type
-        ):
-
-            op = copy.deepcopy(op)
-            # set epilogue
-            epilogue_name = func_attrs["epilogue"]
-            op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
-            op.element_epilogue = acc_type
-            # set C alignment
-            for i in [8, 4, 2, 1]:
-                op = copy.deepcopy(op)
-                op.C.alignment = i
-                ret.append(op)
-        return ret
-
-    op_kind = cutlass_lib.library.OperationKind.Conv3d
-    conv_kind = cutlass_lib.library.ConvKind.Fprop
-    ret = []
-    conv3d_ops = OrderedDict()
-    extract_ops = list(Target.current()._operators[op_kind].items())
-
-    for _, value in extract_ops:
-        op = value[0]
-        if op.conv_kind == conv_kind:
-            if f_proc_op is None:
-                ret = f_proc_op_default(op)
-            else:
-                ret = f_proc_op(op)
-            if len(ret) > 0:
-                for op_inst in ret:
-                    key = kernel_name(op_inst)
-                    conv3d_ops[key] = op_inst
-
-    return conv3d_ops
+    return conv2d_extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        op_kind=cutlass_lib.library.OperationKind.Conv3d,
+        op_layout="ndhwc",
+    )
 
 
 def extract_config_name(config):
@@ -271,7 +204,7 @@ def gen_function(
             config = f_emit_instance(op_instance[value])
             inst_def_flag.add(value)
         else:
-            config = ""
+            continue
         inst = instance_template.render(
             config=config, name=fname, config_name=extract_config_name(config)
         )
@@ -324,16 +257,10 @@ def gen_function(
     )
 
 
-def cal_align_ab(x_shape: List[int]) -> int:
+def cal_align_ab(x_shape: List[int], dtype="float16") -> int:
     """Returns input alignment."""
     k = x_shape[4]  # CI
-    if k % 8 == 0:
-        return 8
-    if k % 4 == 0:
-        return 4
-    if k % 2 == 0:
-        return 2
-    raise RuntimeError("a/b is not aligned")
+    return alignment.find_max_alignment(k, dtype)
 
 
 def function_filter(cfg, func_attrs, x_shape):
@@ -353,7 +280,8 @@ def function_filter(cfg, func_attrs, x_shape):
     bool
         If input cfg should be filtered.
     """
-    ab_alignment = cal_align_ab(x_shape)
+    dtype = func_attrs["inputs"][0]._attrs["dtype"]
+    ab_alignment = cal_align_ab(x_shape, dtype=dtype)
     tmp = cfg.split("_")
     align_c = int(tmp[-1])
     align_ab = int(tmp[-2])
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d.py b/python/aitemplate/backend/cuda/conv3d/conv3d.py
index 09f9589dd..2b2ce4620 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d.py
@@ -35,27 +35,28 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
-//  TODO: cast to right dtype
+{{indent}}//  TODO: cast to right dtype
 {{indent}}typename {{instance}}::Arguments arguments{
-{{indent}}    problem_size,
-{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},
-{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},
-{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+{{indent}}    problem_size,                                            // ConvProblemSize const & problem_size
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},             // TensorRefA const & ref_A
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},         // TensorRefB const & ref_B
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},            // TensorRefC const & ref_C
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},            // TensorRefC const & ref_D
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params const & output_op
 {{indent}}};
-{{indent}}{{instance}} implicit_gemm_op;
 {% if is_profiler %}
-{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+{{indent}}size_t workspace_size = conv_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();
 {{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} conv_op;
 {% endif %}
-{{indent}}auto status = implicit_gemm_op.can_implement(arguments);
+{{indent}}auto status = conv_op.can_implement(arguments);
 {{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op.initialize(arguments, workspace);
+{{indent}}status = conv_op.initialize(arguments, workspace);
 {{indent}}CUTLASS_CHECK(status);
-{{indent}}status = implicit_gemm_op(stream);
+{{indent}}status = conv_op(stream);
 {{indent}}CUTLASS_CHECK(status);
 {{indent}}return;
 """
@@ -89,7 +90,13 @@
 
 {{instances_def}}
 
+{% if is_profiler %}
+template <typename {{instance_name_base}}>
+void {{function_name}} (
+    {{instance_name_base}}& conv_op,
+{% else %}
 void {{function_name}} (
+{% endif %}
     void* in_ptr,
     void* weight_ptr,
     void* out_ptr,
@@ -140,14 +147,14 @@
   TensorNDHWC layout_C(TensorNDHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_d, i32_out_h, i32_out_w, i32_out_ch)));
 
   cutlass::conv::Conv3dProblemSize problem_size(
-    cutlass::Tensor5DCoord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch),
-    cutlass::Tensor5DCoord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch),
-    cutlass::make_Coord(pad_d, pad_h, pad_w),
-    cutlass::make_Coord(stride_d, stride_h, stride_w),
-    cutlass::make_Coord(dilation_d, dilation_h, dilation_w),
-    cutlass::conv::Mode::kCrossCorrelation,
-    1,
-    1
+    cutlass::Tensor5DCoord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch),               // cutlass::Tensor5DCoord input_size
+    cutlass::Tensor5DCoord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch),  // cutlass::Tensor5DCoord filter_size
+    cutlass::make_Coord(pad_d, pad_h, pad_w),                                                 // Coord3D padding
+    cutlass::make_Coord(stride_d, stride_h, stride_w),                                        // Coord3D stride
+    cutlass::make_Coord(dilation_d, dilation_h, dilation_w),                                  // Coord3D dilation
+    cutlass::conv::Mode::kCrossCorrelation,                                                   // cutlass::conv::Mode mode
+    1,                                                                                        // int split_k_slices
+    1                                                                                         // int groups
   );
 
   {{exec_paths}}
@@ -158,6 +165,47 @@
 """
 )
 
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  {{instance_name}} {{conv_op}};
+{{indent}}  const char *conv_op_name = "{{conv_op_name}}";
+{{indent}}  int ret = 0;
+{{indent}}  try {
+{{indent}}    ret = {{func_name}}(
+{{indent}}      {{conv_op}},
+{{indent}}      conv_op_name,
+{{indent}}      {{ni}},
+{{indent}}      {{di}},
+{{indent}}      {{hi}},
+{{indent}}      {{wi}},
+{{indent}}      {{ci}},
+{{indent}}      {{co}},
+{{indent}}      {{kd}},
+{{indent}}      {{kh}},
+{{indent}}      {{kw}},
+{{indent}}      {{no}},
+{{indent}}      {{do}},
+{{indent}}      {{ho}},
+{{indent}}      {{wo}},
+{{indent}}      {{stride_d}},
+{{indent}}      {{stride_h}},
+{{indent}}      {{stride_w}},
+{{indent}}      {{dilation_d}},
+{{indent}}      {{dilation_h}},
+{{indent}}      {{dilation_w}},
+{{indent}}      {{pad_d}},
+{{indent}}      {{pad_h}},
+{{indent}}      {{pad_w}},
+{{indent}}      global_workspace_,
+{{indent}}      stream
+{{indent}}    );
+{{indent}}  } catch (...) {}
+{{indent}}  if (ret != 0)
+{{indent}}    return ret;
+{{indent}}}
+"""
+)
 
 PROFILER_TEMPLATE = jinja2.Template(
     """
@@ -165,6 +213,72 @@
 
 {{op_func}}
 
+template <typename {{instance_name_base}}>
+int benchmark_{{function_name}} (
+  {{instance_name_base}} &conv_op,
+  const char *conv_op_name,
+  int64_t NI,
+  int64_t DI,
+  int64_t HI,
+  int64_t WI,
+  int64_t CI,
+  int64_t CO,
+  int64_t KD,
+  int64_t KH,
+  int64_t KW,
+  int64_t NO,
+  int64_t DO,
+  int64_t HO,
+  int64_t WO,
+  int stride_d,
+  int stride_h,
+  int stride_w,
+  int dilation_d,
+  int dilation_h,
+  int dilation_w,
+  int pad_d,
+  int pad_h,
+  int pad_w,
+  uint8_t* global_workspace_,
+  cudaStream_t stream
+) {
+  using ElementOutput = typename {{instance_name_base}}::ElementC;
+  using ElementInputA = typename {{instance_name_base}}::ElementA;
+  using ElementInputB = typename {{instance_name_base}}::ElementB;
+
+  cutlass::HostTensor<ElementInputA, typename {{instance_name_base}}::LayoutA> x({NI, DI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{instance_name_base}}::LayoutB> w({CO, KD, KH, KW, CI});
+  cutlass::HostTensor<ElementOutput, typename {{instance_name_base}}::LayoutC> y({NO, DO, HO, WO, CO});
+
+  // warmup
+{{func_call}}
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 5; ++i) {
+{{func_call}}
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "OP:" << conv_op_name << ",";
+  std::cout << "TIME:" << runtime_ms << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  return 0;
+}
+
 int main(int argc, char** argv) {
   int64_t batch = std::stoi(argv[1]);
   int64_t in_d = std::stoi(argv[2]);
@@ -184,98 +298,16 @@
   int dilation_d = std::stoi(argv[16]);
   int dilation_h = std::stoi(argv[17]);
   int dilation_w = std::stoi(argv[18]);
-  {{shape_func}}
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
 
-  uint8_t* global_workspace = nullptr;
+{{shape_func}}
+
+  uint8_t* global_workspace_ = nullptr;
   cudaStream_t stream = nullptr;
 
-  cutlass::HostTensor<ElementInputA, typename {{name}}::LayoutA> x({NI, DI, HI, WI, CI});
-  cutlass::HostTensor<ElementInputB, typename {{name}}::LayoutB> w({CO, KD, KH, KW, CI});
-  cutlass::HostTensor<ElementOutput, typename {{name}}::LayoutC> y({NO, DO, HO, WO, CO});
+{{benchmark_instances}}
 
-  //
-  // warmup
-  conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KD,
-       &KH,
-       &KW,
-       &DI,
-       &HI,
-       &WI,
-       &NO,
-       &DO,
-       &HO,
-       &WO,
-       stride_d,
-       stride_h,
-       stride_w,
-       dilation_d,
-       dilation_h,
-       dilation_w,
-       pad_d,
-       pad_h,
-       pad_w,
-       stream);
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0]);
-  for (int i = 0; i < 5; ++i) {
-      conv(x.device_data(),
-       w.device_data(),
-       y.device_data(),
-       global_workspace,
-       &NI,
-       &CO,
-       &CI,
-       &KD,
-       &KH,
-       &KW,
-       &DI,
-       &HI,
-       &WI,
-       &NO,
-       &DO,
-       &HO,
-       &WO,
-       stride_d,
-       stride_h,
-       stride_w,
-       dilation_d,
-       dilation_h,
-       dilation_w,
-       pad_d,
-       pad_h,
-       pad_w,
-       stream);
-  }
-  cudaEventRecord(events[1]);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  return 0;
 }
-
 """
 )
 
@@ -316,6 +348,9 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    conv_op,
+{% endif %}
 {{indent}}    {{in_ptr}},
 {{indent}}    {{weight_ptr}},
 {{indent}}    {{out_ptr}},
@@ -351,14 +386,15 @@
 @registry.reg("cuda.conv3d.config")
 def conv3d_config(func_attrs, dtype="float16"):
     """Populates conv3d cutlass configs into 'op_instance' field."""
-    func_attrs["op_instance"] = common.extract_config(func_attrs)
+    func_attrs["op_instance"] = common.extract_config(func_attrs, dtype=dtype)
 
 
 @registry.reg("cuda.conv3d.gen_profiler")
-def gen_profiler(func_attrs, workdir, shape_template):
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
     """Codegen for conv3d profiler."""
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+
     # shape func
     shape_func = shape_template.render(
         indent="  ",
@@ -383,29 +419,113 @@ def gen_profiler(func_attrs, workdir, shape_template):
         pad_h="pad_h",
         pad_w="pad_w",
     )
+
     backend_spec = CUDASpec()
     dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-    file_pairs = []
-    for op_name, op in op_instance.items():
+    instance_name_base = "DeviceConvFwdInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        is_profiler=True,
+        instance=instance_name_base,
+        dtype=dtype,
+    )
+
+    function_name = "conv"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common.emit_instance(op)
         config_name = common.extract_config_name(config)
-        name = "DeviceConvFwdInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        conv_op = f"conv_op_{instance_idx}"
         instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = EXEC_TEMPLATE.render(
-            indent="  ", is_profiler=True, instance=name, dtype=dtype
+            config_name=config_name,
+            name=instance_name,
+            config=config,
         )
-        op_func = SRC_TEMPLATE.render(
-            instances=instance,
-            function_name="conv",
-            shape_func="",
-            exec_paths=exec_program,
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
+            indent="  ",
+            instance_name=instance_name,
+            conv_op=conv_op,
+            conv_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            ni="NI",
+            di="DI",
+            hi="HI",
+            wi="WI",
+            ci="CI",
+            co="CO",
+            kd="KD",
+            kh="KH",
+            kw="KW",
+            no="NO",
+            do="DO",
+            ho="HO",
+            wo="WO",
+            stride_d="stride_d",
+            stride_h="stride_h",
+            stride_w="stride_w",
+            dilation_d="dilation_d",
+            dilation_h="dilation_h",
+            dilation_w="dilation_w",
+            pad_d="pad_d",
+            pad_h="pad_h",
+            pad_w="pad_w",
         )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func, shape_func=shape_func, name=name
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+
+    op_func = SRC_TEMPLATE.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        instance_name_base=instance_name_base,
+        function_name=function_name,
+        shape_function="",
+        exec_paths=exec_program,
+    )
+    func_call = FUNC_CALL_TEMPLATE.render(
+        indent="  ",
+        is_profiler=True,
+        func_name=function_name,
+        in_ptr="x.device_data()",
+        weight_ptr="w.device_data()",
+        out_ptr="y.device_data()",
+        p_batch="&NI",
+        p_out_ch="&CO",
+        p_in_ch="&CI",
+        p_kernel_d="&KD",
+        p_kernel_h="&KH",
+        p_kernel_w="&KW",
+        p_in_d="&DI",
+        p_in_h="&HI",
+        p_in_w="&WI",
+        p_out_batch="&NO",
+        p_out_d="&DO",
+        p_out_h="&HO",
+        p_out_w="&WO",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilation_d="dilation_d",
+        dilation_h="dilation_h",
+        dilation_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+    )
+    code = PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        shape_func=shape_func,
+        instance_name_base=instance_name_base,
+        function_name=function_name,
+        func_call=func_call,
+        benchmark_instances="\n".join(benchmark_instances),
+    )
+
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     # build
     return common.build_profiler(file_pairs)
 
diff --git a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
index 92158b6ae..92a63f325 100644
--- a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
@@ -17,6 +17,8 @@
 """
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from . import common
 
@@ -40,7 +42,7 @@
 template <typename scalar_t, typename accscalar_t, typename Telement, int element_in_Tio, int kernel_k, int dil_d>
 __global__ void conv_depthwise3d_cuda_kernel(
     const scalar_t * input,
-    const half* kernel,
+    const {{dtype}}* kernel,
     scalar_t * output,
     int _kT, int _kH, int _kW,
     int strideT, int strideH, int strideW,
@@ -81,7 +83,7 @@
     for (int tk = 0; tk < element_in_Tio; tk++){
         sum[tk] = 0;
     }
-    const half *kernel_ptr = kernel + out_channel * element_in_Tio * kT * kH * kW;
+    const {{dtype}} *kernel_ptr = kernel + out_channel * element_in_Tio * kT * kH * kW;
     const scalar_t *input_ptr = input + in_offset;
     for (int k_frame = 0; k_frame < kT; ++k_frame) {
       const int in_frame = in_frame_start + k_frame * dilationT;
@@ -95,8 +97,13 @@
             Telement* pack_input = reinterpret_cast<Telement*>(&input_val);
 
             for (int tk = 0; tk < element_in_Tio; tk++){
+              {% if dtype == "half" %}
                 accscalar_t op1 = __half2float(pack_input[tk]);
                 sum[tk] += op1 * __half2float(kernel_ptr[tk*kT*kH*kW]);
+              {% elif dtype == "float" %}
+                accscalar_t op1 = pack_input[tk];
+                sum[tk] += op1 * kernel_ptr[tk*kT*kH*kW];
+              {% endif %}
             }
           }
           kernel_ptr += 1;
@@ -110,7 +117,11 @@
     scalar_t output_val;
     Telement* pack_output = reinterpret_cast<Telement*>(&output_val);
     for (int tk = 0; tk < element_in_Tio; tk++){
+      {% if dtype == "half" %}
         pack_output[tk] = __float2half(sum[tk]);
+      {% elif dtype == "float" %}
+        pack_output[tk] = sum[tk];
+      {% endif %}
     }
     output[out_offset] = output_val;
   }
@@ -159,9 +170,9 @@
 
 
 void conv_depthwise3d_launcher(
-    const half * input,
-    const half * weight,
-    half * output,
+    const {{dtype}} * input,
+    const {{dtype}} * weight,
+    {{dtype}} * output,
     int kernel_t,
     int kernel_h,
     int kernel_w,
@@ -214,6 +225,7 @@
 
 
   using accscalar_t = float;
+{% if dtype == "half" %}
   using Telement = half;
   {% if csize == 0 %}
     using scalar_t = float4;
@@ -229,6 +241,18 @@
     using scalar_t = half;
     #define element_in_Tio 1
   {% endif %}
+{% elif dtype == "float" %}
+  using Telement = float;
+  {% if csize == 2 %}
+    using scalar_t = float2;
+    c =c/2;
+    num_outputs = num_outputs/2;
+    #define element_in_Tio 2
+  {% else %}
+    using scalar_t = float;
+    #define element_in_Tio 1
+  {% endif %}
+{% endif %}
 
   DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(3, 1)
   DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(-1, 1)
@@ -281,9 +305,9 @@
   int out_w = *p_out_w;
 
   conv_depthwise3d_launcher(
-    (const half*)in_ptr,
-    (const half*)weight_ptr,
-    (half*)out_ptr,
+    (const {{dtype}}*)in_ptr,
+    (const {{dtype}}*)weight_ptr,
+    ({{dtype}}*)out_ptr,
     kt,
     kh,
     kw,
@@ -317,7 +341,15 @@
 def gen_function(func_attrs):
     func_name = func_attrs["name"]
     csize = func_attrs["group"] % 8
-    return SRC_TEMPLATE.render(function_name=func_name, csize=csize)
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        csize=csize,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.depthwise_conv3d.func_decl")
diff --git a/python/aitemplate/backend/cuda/cuda_common.py b/python/aitemplate/backend/cuda/cuda_common.py
index 20093b05c..adb09af10 100644
--- a/python/aitemplate/backend/cuda/cuda_common.py
+++ b/python/aitemplate/backend/cuda/cuda_common.py
@@ -19,6 +19,7 @@
 
 DTYPE_TO_CUDATYPE: Dict[str, str] = {
     "float16": "half",
+    "float32": "float",
     "float": "float",
     "int64": "int64_t",
 }
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 07d1650f5..376506e73 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -23,13 +23,68 @@
 #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
 #endif
 
+#define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
+
+#define CUDA_FP16_ZERO \
+  __half {             \
+    0x0u               \
+  }
+#define CUDA_BF16_ZERO \
+  __nv_bfloat16 {      \
+    0x0u               \
+  }
+#define CUDA_FP162_ZERO \
+  __half2 {             \
+    0x0u, 0x0u          \
+  }
+#define CUDA_BF162_ZERO \
+  __nv_bfloat162 {      \
+    0x0u, 0x0u          \
+  }
+#define CUDA_FP16_ONE \
+  __half_raw {        \
+    0x3c00u           \
+  }
+#define CUDA_BF16_ONE \
+  __nv_bfloat16_raw { \
+    0x3f80u           \
+  }
+#define CUDA_FP16_ONE_HALF \
+  __half_raw {             \
+    0x3800u                \
+  }
+#define CUDA_BF16_ONE_HALF \
+  __nv_bfloat16_raw {      \
+    0x3f00u                \
+  }
+
+// sqrt(2 / pi)
+#define CUDA_BF16_K1  \
+  __nv_bfloat16_raw { \
+    0x3f4c            \
+  }
+
+// 2/(3*pi) - 1/6
+#define CUDA_BF16_K3  \
+  __nv_bfloat16_raw { \
+    0x3d3a            \
+  }
+
 template <typename T>
 __device__ T sign_custom(const T a) {
   return T(a > T(0)) - T(a < T(0));
 }
 
 __device__ half2 h2sign_custom(const half2 a) {
-  return half2(sign_custom(a.x), sign_custom(a.y));
+  return __hsub2(__hgt2(a, CUDA_FP162_ZERO), __hlt2(a, CUDA_FP162_ZERO));
+}
+
+__device__ bfloat16_2 h2sign_custom(const bfloat16_2 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hsub2(__hgt2(a, CUDA_BF162_ZERO), __hlt2(a, CUDA_BF162_ZERO));
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
 __device__ half2 fast_tanh(half2 x) {
@@ -42,7 +97,7 @@ __device__ half2 fast_tanh(half2 x) {
   return x;
 
 #else
-  CUTLASS_NOT_IMPLEMENTED();
+  NOT_IMPLEMENTED();
 #endif
 }
 
@@ -60,16 +115,36 @@ __device__ half fast_tanh(half x) {
 #endif
 }
 
-// Return 1
-__device__ half one() {
-  uint16_t bits = 0x3c00u;
-  return reinterpret_cast<half const&>(bits);
+__device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 900)
+
+  asm volatile("tanh.approx.bf16x2 %0, %1;"
+               : "=r"(__HALF_TO_UI(x))
+               : "r"(__HALF_TO_UI(x)));
+  return x;
+
+#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(
+      {cutlass::fast_tanh(float(x.x)), cutlass::fast_tanh(float(x.y))});
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
-/// Returns (1/2)  (specialization for half_t)
-__device__ half constant_half() {
-  uint16_t bits = 0x3800u;
-  return reinterpret_cast<half const&>(bits);
+__device__ bfloat16 fast_tanh(bfloat16 x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 900)
+  asm volatile("tanh.approx.bf16 %0, %1;"
+               : "=h"(__HALF_TO_US(x))
+               : "h"(__HALF_TO_US(x)));
+  return x;
+
+#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return cutlass::fast_tanh(float(x));
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
 __device__ float fsigmoid_custom(const float a) {
@@ -77,17 +152,37 @@ __device__ float fsigmoid_custom(const float a) {
 }
 
 __device__ half hsigmoid_custom(const half a) {
-  half half_val = constant_half();
-  half one_val = one();
-  return __hmul((__hadd(fast_tanh(__hmul(a, half_val)), one_val)), half_val);
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
+      CUDA_FP16_ONE_HALF);
 }
 
 __device__ half2 h2sigmoid_custom(const half2 a) {
-  half2 halfX2 = half2(constant_half(), constant_half());
-  half2 oneX2 = half2(one(), one());
+  const auto halfX2 = half2(CUDA_FP16_ONE_HALF, CUDA_FP16_ONE_HALF);
+  const auto oneX2 = half2(CUDA_FP16_ONE, CUDA_FP16_ONE);
   return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
 }
 
+__device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
+      CUDA_BF16_ONE_HALF);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ bfloat16_2 h2sigmoid_custom(const bfloat16_2 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  const auto halfX2 = bfloat16_2(CUDA_BF16_ONE_HALF, CUDA_BF16_ONE_HALF);
+  const auto oneX2 = bfloat16_2(CUDA_BF16_ONE, CUDA_BF16_ONE);
+  return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float fsilu(const float a) {
   return a * fsigmoid_custom(a);
 }
@@ -96,10 +191,26 @@ __device__ half hsilu(const half a) {
   return __hmul(a, hsigmoid_custom(a));
 }
 
+__device__ bfloat16 hsilu(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmul(a, hsigmoid_custom(a));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half2 h2silu(const half2 a) {
   return __hmul2(a, h2sigmoid_custom(a));
 }
 
+__device__ bfloat16_2 h2silu(const bfloat16_2 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmul2(a, h2sigmoid_custom(a));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float leaky_relu(const float a, const float negativeSlope) {
   return a > 0.f ? a : a * negativeSlope;
 }
@@ -108,44 +219,111 @@ __device__ half leaky_relu(const half a, const half negativeSlope) {
   return a > half(0.f) ? a : __hmul(a, negativeSlope);
 }
 
+__device__ bfloat16 leaky_relu(const bfloat16 a, const bfloat16 negativeSlope) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return a > bfloat16(0.f) ? a : __hmul(a, negativeSlope);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half2 leaky_relu(const half2 a, const half2 negativeSlope) {
   return half2(
       leaky_relu(a.x, negativeSlope.x), leaky_relu(a.y, negativeSlope.y));
 }
 
+__device__ bfloat16_2
+leaky_relu(const bfloat16_2 a, const bfloat16_2 negativeSlope) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(
+      leaky_relu(a.x, negativeSlope.x), leaky_relu(a.y, negativeSlope.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float relu(const float a) {
-  return a > 0.f ? a : 0.f;
+  return fmaxf(a, 0.f);
 }
 
 __device__ half relu(const half a) {
-  return a > half(0.f) ? a : half(0.f);
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax(a, CUDA_FP16_ZERO);
+#else
+  return a > CUDA_FP16_ZERO ? a : CUDA_FP16_ZERO;
+#endif
 }
 
 __device__ half2 relu(const half2 a) {
-  half2 zeroX2 = half2(half(0.f), half(0.f));
-#if __CUDA_ARCH__ >= 800
+  const half2 zeroX2 = half2(CUDA_FP16_ZERO, CUDA_FP16_ZERO);
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
   return __hmax2(a, zeroX2);
 #else
   return half2(relu(a.x), relu(a.y));
 #endif
 }
 
-template <typename T>
-__device__ T hard_tanh(const T a, T min_val, T max_val) {
-  if (a <= min_val) {
-    return min_val;
-  } else if (a >= max_val) {
-    return max_val;
-  } else {
-    return a;
-  }
+__device__ bfloat16 relu(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax(a, CUDA_BF16_ZERO);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ bfloat16_2 relu(const bfloat16_2 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax2(a, CUDA_BF162_ZERO);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ bfloat16
+hard_tanh(const bfloat16 a, const bfloat16 min_val, const bfloat16 max_val) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax(min_val, __hmin(max_val, a));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ half
+hard_tanh(const half a, const half min_val, const half max_val) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax(min_val, __hmin(max_val, a));
+#else
+  return a > max_val ? max_val : a < min_val ? min_val : a;
+#endif
+}
+
+__device__ float hard_tanh(
+    const float a,
+    const float min_val,
+    const float max_val) {
+  return fmaxf(min_val, fminf(max_val, a));
 }
 
 __device__ half2
 h2hard_tanh(const half2 a, const half2 min_val, const half2 max_val) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax2(min_val, __hmin2(max_val, a));
+#else
   return half2(
       hard_tanh(a.x, min_val.x, max_val.x),
       hard_tanh(a.y, min_val.y, max_val.y));
+#endif
+}
+
+__device__ bfloat16_2 h2hard_tanh(
+    const bfloat16_2 a,
+    const bfloat16_2 min_val,
+    const bfloat16_2 max_val) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax2(min_val, __hmin2(max_val, a));
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
 __device__ half replace_if_inf(
@@ -162,11 +340,12 @@ __device__ half replace_if_inf(
   return a;
 }
 
-__device__ float replace_if_inf(
-    const float a,
-    const float inf_replace,
-    const float neginf_replace) {
-  auto is_inf = isinf(a);
+__device__ bfloat16 replace_if_inf(
+    const bfloat16 a,
+    const bfloat16 inf_replace,
+    const bfloat16 neginf_replace) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  auto is_inf = __hisinf(a);
   if (is_inf == -1) {
     return neginf_replace;
   }
@@ -174,6 +353,19 @@ __device__ float replace_if_inf(
     return inf_replace;
   }
   return a;
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ float replace_if_inf(
+    const float a,
+    const float inf_replace,
+    const float neginf_replace) {
+  if (isinf(a)) {
+    return (a > 0) ? inf_replace : neginf_replace;
+  }
+  return a;
 }
 
 __device__ half2 nan_to_num(
@@ -189,6 +381,23 @@ __device__ half2 nan_to_num(
               : replace_if_inf(a.y, inf_replace.y, neginf_replace.y));
 }
 
+__device__ bfloat16_2 nan_to_num(
+    const bfloat16_2 a,
+    const bfloat16_2 nan_replace,
+    const bfloat16_2 inf_replace,
+    const bfloat16_2 neginf_replace) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  bfloat16_2 isnan = __hisnan2(a);
+  return bfloat16_2(
+      isnan.x ? nan_replace.x
+              : replace_if_inf(a.x, inf_replace.x, neginf_replace.x),
+      isnan.y ? nan_replace.y
+              : replace_if_inf(a.y, inf_replace.y, neginf_replace.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half nan_to_num(
     const half a,
     const half nan_replace,
@@ -200,6 +409,21 @@ __device__ half nan_to_num(
   return replace_if_inf(a, inf_replace, neginf_replace);
 }
 
+__device__ bfloat16 nan_to_num(
+    const bfloat16 a,
+    const bfloat16 nan_replace,
+    const bfloat16 inf_replace,
+    const bfloat16 neginf_replace) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  if (__hisnan(a)) {
+    return nan_replace;
+  }
+  return replace_if_inf(a, inf_replace, neginf_replace);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float nan_to_num(
     const float a,
     const float nan_replace,
@@ -222,6 +446,21 @@ __device__ half2 clamp_nan_to_num(
       isnan.y ? nan_replace.y : hard_tanh(a.y, clamp_min.y, clamp_max.y));
 }
 
+__device__ bfloat16_2 clamp_nan_to_num(
+    const bfloat16_2 a,
+    const bfloat16_2 clamp_min,
+    const bfloat16_2 clamp_max,
+    const bfloat16_2 nan_replace) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  auto isnan = __hisnan2(a);
+  return bfloat16_2(
+      isnan.x ? nan_replace.x : hard_tanh(a.x, clamp_min.x, clamp_max.x),
+      isnan.y ? nan_replace.y : hard_tanh(a.y, clamp_min.y, clamp_max.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half clamp_nan_to_num(
     const half a,
     const half clamp_min,
@@ -230,6 +469,18 @@ __device__ half clamp_nan_to_num(
   return __hisnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max);
 }
 
+__device__ bfloat16 clamp_nan_to_num(
+    const bfloat16 a,
+    const bfloat16 clamp_min,
+    const bfloat16 clamp_max,
+    const bfloat16 nan_replace) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hisnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float clamp_nan_to_num(
     const float a,
     const float clamp_min,
@@ -268,6 +519,14 @@ __device__ half hmax_nan(const half a, const half b) {
 #endif
 }
 
+__device__ bfloat16 hmax_nan(const bfloat16 a, const bfloat16 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax_nan(a, b);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half2 hmax2_nan(const half2 a, const half2 b) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
   return __hmax2_nan(a, b);
@@ -276,6 +535,14 @@ __device__ half2 hmax2_nan(const half2 a, const half2 b) {
 #endif
 }
 
+__device__ bfloat16_2 hmax2_nan(const bfloat16_2 a, const bfloat16_2 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmax2_nan(a, b);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float fminf_nan(const float a, const float b) {
   return (isnan(a) || isnan(b)) ? nanf("") : fminf(a, b);
 }
@@ -288,6 +555,14 @@ __device__ half hmin_nan(const half a, const half b) {
 #endif
 }
 
+__device__ bfloat16 hmin_nan(const bfloat16 a, const bfloat16 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmin_nan(a, b);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half2 hmin2_nan(const half2 a, const half2 b) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
   return __hmin2_nan(a, b);
@@ -296,8 +571,17 @@ __device__ half2 hmin2_nan(const half2 a, const half2 b) {
 #endif
 }
 
+__device__ bfloat16_2 hmin2_nan(const bfloat16_2 a, const bfloat16_2 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmin2_nan(a, b);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 // pow impl
 __device__ half hpow(const half a, const half b);
+__device__ bfloat16 hpow(const bfloat16 a, const bfloat16 b);
 
 __device__ half2 h2pow(const half2 a, const half2 b) {
   half b1 = __low2half(b);
@@ -349,6 +633,62 @@ __device__ half2 h2pow(const half2 a, const half2 b) {
   return __halves2half2(c1, c2);
 }
 
+__device__ bfloat16_2 h2pow(const bfloat16_2 a, const bfloat16_2 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  auto b1 = __low2bfloat16(b);
+  auto b2 = __high2bfloat16(b);
+  if (b1 != b2) {
+    auto a1 = __low2bfloat16(a);
+    auto a2 = __high2bfloat16(a);
+    auto c1 = hpow(a1, b1);
+    auto c2 = hpow(a2, b2);
+    return __halves2bfloat162(c1, c2);
+  }
+
+  // New special cases can be added if needed, such as
+  // an powi for cases where b is an integer
+  if (__hbeq2(b, bfloat16_2(0.0, 0.0))) {
+    return bfloat16_2(1.0, 1.0);
+  }
+  if (__hbeq2(b, bfloat16_2(1.0, 1.0))) {
+    return a;
+  }
+  if (__hbeq2(b, bfloat16_2(2.0, 2.0))) {
+    return __hmul2(a, a);
+  }
+  if (__hbeq2(b, bfloat16_2(3.0, 3.0))) {
+    return __hmul2(__hmul2(a, a), a);
+  }
+  if (__hbeq2(b, bfloat16_2(0.5, 0.5))) {
+    return h2sqrt(a);
+  }
+  if (__hbeq2(b, bfloat16_2(-0.5, -0.5))) {
+    return h2rsqrt(a);
+  }
+  if (__hbeq2(b, bfloat16_2(-1.0, -1.0))) {
+    return __h2div(bfloat16_2(1.0, 1.0), a);
+  }
+  if (__hbeq2(b, bfloat16_2(-2.0, -2.0))) {
+    return __h2div(bfloat16_2(1.0, 1.0), __hmul2(a, a));
+  }
+
+  auto a1 = __low2bfloat16(a);
+  auto a2 = __high2bfloat16(a);
+
+  // low 16 bits
+  auto c1 = static_cast<bfloat16>(
+      pow(static_cast<double>(__bfloat162float(a1)),
+          static_cast<double>(__bfloat162float(b1))));
+  // high 16 bits
+  auto c2 = static_cast<bfloat16>(
+      pow(static_cast<double>(__bfloat162float(a2)),
+          static_cast<double>(__bfloat162float(b2))));
+  return __halves2bfloat162(c1, c2);
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ half hpow(const half a, const half b) {
   if (b == half(0.0)) {
     return half(1.0);
@@ -377,6 +717,40 @@ __device__ half hpow(const half a, const half b) {
   return static_cast<half>(pow(static_cast<double>(a), static_cast<double>(b)));
 }
 
+__device__ bfloat16 hpow(const bfloat16 a, const bfloat16 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  if (b == bfloat16(0.0)) {
+    return bfloat16(1.0);
+  }
+  if (b == bfloat16(1.0)) {
+    return a;
+  }
+  if (b == bfloat16(2.0)) {
+    return a * a;
+  }
+  if (b == bfloat16(3.0)) {
+    return a * a * a;
+  }
+  if (b == bfloat16(0.5)) {
+    return hsqrt(a);
+  }
+  if (b == bfloat16(-0.5)) {
+    return hrsqrt(a);
+  }
+  if (b == bfloat16(-1.0)) {
+    return bfloat16(1.0) / a;
+  }
+  if (b == bfloat16(-2.0)) {
+    return bfloat16(1.0) / (a * a);
+  }
+  return static_cast<bfloat16>(
+      pow(static_cast<double>(__bfloat162float(a)),
+          static_cast<double>(__bfloat162float(b))));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float fpow(const float a, const float b) {
   if (b == float(0.0)) {
     return float(1.0);
@@ -419,9 +793,22 @@ __device__ half hgelu(const half a) {
   return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
 }
 
+__device__ bfloat16 hgelu(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmul(
+      a,
+      __hmul(
+          CUDA_BF16_ONE_HALF,
+          __hadd(
+              CUDA_BF16_ONE,
+              bfloat16(erff(__bfloat162float(a) * rsqrtf(2.f))))));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float fgelu(const float a) {
-  cutlass::epilogue::thread::GELU<float> gelu_op;
-  return gelu_op(a);
+  return a * .5f * (1.f + erff(a * rsqrtf(2.f)));
 }
 
 __device__ half h_fast_gelu(const half a) {
@@ -429,6 +816,31 @@ __device__ half h_fast_gelu(const half a) {
   return static_cast<half>(gelu_op(static_cast<cutlass::half_t>(a)));
 }
 
+// The CUDA_BF16_K3 constant in the linked paper
+// (https://arxiv.org/pdf/1606.08415.pdf) (=0.044715) slightly differs
+// from the one computed analytically (2/(3*pi) - 1/6) ~ 0.045539):
+//   atanh(x) = x + x^3/3 + O(x^5),
+//   erf(x/sqrt(2)) = sqrt(2/pi)*(x - x^3/6 + O(x^5)),
+//   atanh(erf(x/sqrt(2))) = sqrt(2/pi)*x +
+//   + (sqrt(2/pi)*x)^3/3 - (sqrt(2/pi)/6)*x^3 + O(x^5) =
+//   = sqrt(2/pi)*x*(1 + (2/(3*pi) - 1/6)*x^2 + O(x^4)).
+// The Cutlass folks have hardcoded the constant from the paper.
+__device__ bfloat16 h_fast_gelu(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hmul(
+      a,
+      __hmul(
+          CUDA_BF16_ONE_HALF,
+          __hadd(
+              CUDA_BF16_ONE,
+              fast_tanh(
+                  __hmul(CUDA_BF16_K1, a) *
+                  __hadd(CUDA_BF16_ONE, __hmul(CUDA_BF16_K3, a * a))))));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 __device__ float f_fast_gelu(const float a) {
   cutlass::epilogue::thread::GELU_taylor<float> gelu_op;
   return gelu_op(a);
@@ -442,10 +854,20 @@ __device__ float fsoftplus(
 }
 
 __device__ half hsoftplus(const half a, const half beta, const half threshold) {
-  half one_val = one();
   return __hgt(__hmul(a, beta), threshold)
       ? a
-      : __hdiv(hlog(__hadd(one_val, hexp(__hmul(a, beta)))), beta);
+      : __hdiv(hlog(__hadd(CUDA_FP16_ONE, hexp(__hmul(a, beta)))), beta);
+}
+
+__device__ bfloat16
+hsoftplus(const bfloat16 a, const bfloat16 beta, const bfloat16 threshold) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hgt(__hmul(a, beta), threshold)
+      ? a
+      : __hdiv(hlog(__hadd(CUDA_BF16_ONE, hexp(__hmul(a, beta)))), beta);
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
 __device__ half2
@@ -454,4 +876,48 @@ h2softplus(const half2 a, const half2 beta, const half2 threshold) {
       hsoftplus(a.x, beta.x, threshold.x), hsoftplus(a.y, beta.y, threshold.y));
 }
 
+__device__ bfloat16_2 h2softplus(
+    const bfloat16_2 a,
+    const bfloat16_2 beta,
+    const bfloat16_2 threshold) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(
+      hsoftplus(a.x, beta.x, threshold.x), hsoftplus(a.y, beta.y, threshold.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ float felu(const float op_input, const float alpha) {
+  return op_input > 0.f ? op_input : alpha * (expf(op_input) - 1.0f);
+}
+
+__device__ half helu(const half op_input, const half alpha) {
+  return __hgt(op_input, CUDA_FP16_ZERO)
+      ? op_input
+      : __hmul(alpha, __hsub(hexp(op_input), CUDA_FP16_ONE));
+}
+
+__device__ bfloat16 helu(const bfloat16 op_input, const bfloat16 alpha) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hgt(op_input, CUDA_BF16_ZERO)
+      ? op_input
+      : __hmul(alpha, __hsub(hexp(op_input), CUDA_BF16_ONE));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ half2 h2elu(const half2 op_input, const half2 alpha) {
+  return half2(helu(op_input.x, alpha.x), helu(op_input.y, alpha.y));
+}
+
+__device__ bfloat16_2 h2elu(const bfloat16_2 op_input, const bfloat16_2 alpha) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(helu(op_input.x, alpha.x), helu(op_input.y, alpha.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 #endif
diff --git a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
index 667310726..fabf7d4f9 100644
--- a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
@@ -25,11 +25,13 @@
 from ...target import Target
 
 HEAD_TEMPLATE = """
-#include <cuda_fp16.hpp>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "cutlass/constants.h"
 #include "cutlass/epilogue/thread/activation.h"
+#include "math_constants.h"
 """
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
index 3c3873c83..28ff30353 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
@@ -15,6 +15,7 @@
 
 from . import (
     bmm_rcr_softmax,
+    dual_bmm_rrr_div,
     dual_gemm_rcr_fast_gelu,
     dual_gemm_rcr_silu,
     gemm_rcr_bias_softmax,
@@ -23,8 +24,9 @@
 
 __all__ = [
     "bmm_rcr_softmax",
+    "dual_bmm_rrr_div",
+    "dual_gemm_rcr_fast_gelu",
+    "dual_gemm_rcr_silu",
     "gemm_rcr_bias_softmax",
     "gemm_rcr_softmax",
-    "dual_gemm_rcr_silu",
-    "dual_gemm_rcr_fast_gelu",
 ]
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 4a4b745a9..60f0587f4 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -56,26 +56,24 @@
         N: B*M*1 (RowMajor)
     */
 
-        {M, N, K},
-        B,
-        {a_ptr, LayoutA(K)},
-        {b_ptr, LayoutB(K)},
-        {c_ptr, LayoutC(N)},
-        {d_ptr, LayoutC(N)},
-        {
-            float(1.0),
-            float(0.0)
-        },
-        {n_ptr, LayoutC(1)},
-        {soft_ptr, LayoutC(N)},
-        M*K,
-        N*K,
-        M*N,
-        M*N,
-        M*N,
-        M*N
-
-
+    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
+    B,                       // int32_t batch_count_
+    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
+    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
+    {c_ptr, LayoutC(N)},     // TensorRefC ref_C_
+    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {
+        float(1.0),
+        float(0.0)
+    },                       // typename EpilogueFunctorOp::Params linear_scaling
+    {n_ptr, LayoutC(1)},     // ???
+    {soft_ptr, LayoutC(N)},  // ???
+    M*K,                     // int64_t batch_stride_A_
+    N*K,                     // int64_t batch_stride_B_
+    M*N,                     // int64_t batch_stride_C_
+    M*N,                     // int64_t batch_stride_D_
+    M*N,                     // ???
+    M*N,                     // ???
 """
 )
 
@@ -95,7 +93,7 @@ def bmm_rcr_softmax_config(func_attrs, dtype="float16"):
     -------
     None
     """
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 @registry.reg("cuda.bmm_rcr_softmax.gen_profiler")
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 820cbde0d..0ab286363 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -25,27 +25,15 @@
 
 import jinja2
 
+from ....utils import alignment
+
 from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from ...target import Target
 from ..gemm_universal import common
 
-# pylint: disable=C0301,C0415,R1705
-
-EXTRA_CODE = jinja2.Template(
-    """
-#include "device/dual_gemm.h"
-#include "thread/left_silu_and_mul.h"
 
-typename cutlass::TensorRef<cutlass::half_t, cutlass::layout::RowMajor> nullptr_ref{};
-decltype(nullptr_ref) ref_B0, ref_B1;
-
-using LayoutA = cutlass::layout::RowMajor;
-using LayoutB = cutlass::layout::ColumnMajor;
-using LayoutC = cutlass::layout::RowMajor;
-
-"""
-)
+# pylint: disable=C0301,C0415,R1705
 
 # HACK: we don't record different permutation shape,
 # because it has little impact on execution time compared.
@@ -60,22 +48,39 @@
 
 TENSOR_DECL_TEMPLATE = jinja2.Template(
     """
-  int64_t a_ptr_sz = a_dim0 * a_dim1;
-  int64_t b_ptr_sz = b_dim0 * b_dim1;
-  int64_t c_ptr_sz = c_dim0 * c_dim1;
+  int64_t a_ptr_sz = 1;
+{% for dim in adims %}
+  a_ptr_sz *= {{dim}};
+{% endfor %}
+
+  int64_t b0_ptr_sz = 1;
+{% for dim in bdims %}
+  b0_ptr_sz *= {{dim}};
+{% endfor %}
+
+  int64_t b1_ptr_sz = b0_ptr_sz;
+{% if broadcast_b1 %}
+  // scale b1_ptr_sz down by the broadcasted dim
+  b1_ptr_sz /= {{ bdims[broadcasted_bdim_id] }};
+{% endif %}
+
+  int64_t c_ptr_sz = 1;
+{% for dim in cdims %}
+  c_ptr_sz *= {{dim}};
+{% endfor %}
 
   // The value 1 is used to force ptr_max_sz to be non-zero
-  int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
+  int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b0_ptr_sz, c_ptr_sz});
   // TODO: special pool size for A100 L2 cache 40M
   // need to tune it for other devices
   int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
+  memory_pool->AllocateTensor(b0_ptr_sz, mem_pool_sz);  // b_ptr: index 1
   memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
 
 {% if has_bias %}
-  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 3
+  memory_pool->AllocateTensor(b1_ptr_sz, mem_pool_sz);  // b_ptr: index 3
 {% endif %}
 
 """
@@ -159,16 +164,17 @@ def emit_instance(
     f_instance_convertor=dual_gemm_instance,
     emit_kernel=False,
     func_attrs=None,
+    broadcast_b1=False,
 ):
     import cutlass_lib
 
     emiter = cutlass_lib.gemm_operation.EmitDualGemmInstance()
-    op_def = emiter.emit(op)
+    op_def = emiter.emit(op, broadcast_b1=broadcast_b1)
     op_def = f_instance_convertor(op_def, func_attrs, for_profiler)
     return op_def
 
 
-def default_fproc_f16(
+def default_fproc(
     *,
     op,
     a_layout,
@@ -177,22 +183,41 @@ def default_fproc_f16(
     epiligue_name,
     epiligue2_name,
     permute_layout=None,
+    dtype="float16",
 ):
     import copy
 
     import cutlass_lib
 
+    backend_spec = CUDASpec()
+    data_type = backend_spec.dtype_to_lib_type(dtype)
+
     ret = []
-    data_type = cutlass_lib.library.DataType.f16
+    # skip simt kernels
+    if (
+        op.tile_description.math_instruction.opcode_class
+        == cutlass_lib.library.OpcodeClass.Simt
+    ):
+        return ret
+
+    if data_type == "float":
+        if (
+            op.tile_description.math_instruction.element_a
+            != cutlass_lib.library.DataType.f32
+            and op.tile_description.math_instruction.element_a
+            != cutlass_lib.library.DataType.tf32
+        ):
+            return ret
     acc_type = cutlass_lib.library.DataType.f32
     # check target use fp16 acc
-    if "use_fp16_acc" in Target.current()._kwargs:
+    if "use_fp16_acc" in Target.current()._kwargs and data_type == "cutlass::half_t":
         if Target.current()._kwargs["use_fp16_acc"]:
             acc_type = cutlass_lib.library.DataType.f16
+
     if (
-        op.A.element == data_type
-        and op.B.element == data_type
-        and op.C.element == data_type
+        cutlass_lib.library.DataTypeTag[op.A.element] == data_type
+        and cutlass_lib.library.DataTypeTag[op.B.element] == data_type
+        and cutlass_lib.library.DataTypeTag[op.C.element] == data_type
         and op.accumulator_type() == acc_type
         and op.A.layout == a_layout
         and op.B.layout == b_layout
@@ -209,31 +234,37 @@ def default_fproc_f16(
                 permute_layout
             ]
         # set C alignment
-        for i in [8, 4, 2, 1]:
+        alignments = alignment.get_alignments(dtype)
+        for i in alignments:
             op = copy.deepcopy(op)
             op.C.alignment = i
             ret.append(op)
     return ret
 
 
-def make_fproc_f16(func_attrs, layout):
+def make_fproc(
+    func_attrs,
+    layout,
+    dtype="float16",
+):
     """
     This function sets a callback for processing the epilogue of the kernel
     associated with func_attrs.
     """
 
-    def fproc_f16(op):
+    def fproc(op):
         a_layout, b_layout, c_layout = layout.cutlass_lib_layouts()
-        return default_fproc_f16(
+        return default_fproc(
             op=op,
             a_layout=a_layout,
             b_layout=b_layout,
             c_layout=c_layout,
             epiligue_name=func_attrs["epilogue"],
             epiligue2_name=func_attrs["epilogue2"],
+            dtype=dtype,
         )
 
-    func_attrs["op_instance"] = extract_config(fproc_f16, func_attrs)
+    func_attrs["op_instance"] = extract_config(fproc, func_attrs)
 
 
 def gen_function(
@@ -251,6 +282,7 @@ def gen_function(
     input_addr_calculator="",
     output_addr_calculator="",
     extra_code="",
+    broadcast_b1=False,
 ):
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
@@ -268,6 +300,7 @@ def gen_function(
                 f_instance_convertor=f_instance_convertor,
                 emit_kernel=emit_kernel,
                 func_attrs=func_attrs,
+                broadcast_b1=broadcast_b1,
             )
             inst_def_flag.add(algo)
         else:
@@ -296,10 +329,16 @@ def gen_function(
         weight_ndims=weight_ndims,
         output_ndims=output_ndims,
     )
+
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+
     return src_template.render(
         instances=instance_decl,
         function_name=func_name,
-        dtype="cutlass::half_t",
+        dtype=elem_input_type,
         shape_eval=shape_eval_func,
         input_addr_calculator=input_addr_calculator,
         output_addr_calculator=output_addr_calculator,
@@ -328,6 +367,9 @@ def gen_profiler(
     output_addr_calculator="",
     bias_ptr_arg=None,
     extra_code="",
+    broadcast_b1=False,
+    broadcasted_bdim_id=0,
+    ndims=2,
 ):
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
@@ -342,7 +384,6 @@ def gen_profiler(
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
 
-    ndims = 2
     adims = ["&a_dim" + str(i) for i in range(ndims)]
     bdims = ["&b_dim" + str(i) for i in range(ndims)]
     cdims = ["&c_dim" + str(i) for i in range(ndims)]
@@ -360,6 +401,7 @@ def gen_profiler(
         problem_args=problem_args_template.render(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
         ),
     )
     input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
@@ -373,7 +415,11 @@ def gen_profiler(
     benchmark_instances = []
     for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = emit_instance(
-            op, for_profiler=True, emit_kernel=emit_kernel, func_attrs=func_attrs
+            op,
+            for_profiler=True,
+            emit_kernel=emit_kernel,
+            func_attrs=func_attrs,
+            broadcast_b1=broadcast_b1,
         )
         config_name = common.extract_config_name(config)
         instance_name = f"{instance_name_base}_{instance_idx}"
@@ -446,7 +492,14 @@ def gen_profiler(
         weight_ndims=ndims,
         output_ndims=ndims,
         func_call=func_call,
-        tensor_decl=TENSOR_DECL_TEMPLATE.render(has_bias=has_bias),
+        tensor_decl=TENSOR_DECL_TEMPLATE.render(
+            has_bias=has_bias,
+            adims=benchmark_adims,
+            bdims=benchmark_bdims,
+            cdims=benchmark_cdims,
+            broadcast_b1=broadcast_b1,
+            broadcasted_bdim_id=broadcasted_bdim_id,
+        ),
         benchmark_instances="\n".join(benchmark_instances),
         elem_type=elem_type,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index 5f172d6ea..bb6140318 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -224,6 +224,7 @@
     """
 size_t GLOBAL_WORKSPACE_SIZE = 0;
 
+#include <sstream>
 {{op_func}}
 
 struct ProfilerMemoryPool {
@@ -314,13 +315,21 @@
   cudaError_t result = cudaGetDevice(&device_idx);
   auto memory_pool = std::make_unique<ProfilerMemoryPool>();
   if (result != cudaSuccess) {
-    throw std::runtime_error("cudaGetDevice() API call failed.");
+    std::ostringstream errorStream;
+    errorStream << "cudaGetDevice() call failed! "
+                << "Error code: " << cudaGetErrorName(result)
+                << " Error message: " << cudaGetErrorString(result);
+    throw std::runtime_error(errorStream.str());
   }
 
   result = cudaGetDeviceProperties(&device_properties, device_idx);
 
   if (result != cudaSuccess) {
-    throw std::runtime_error("cudaGetDeviceProperties() failed");
+    std::ostringstream errorStream;
+    errorStream << "cudaGetDeviceProperties() call failed! "
+                << "Error code: " << cudaGetErrorName(result)
+                << " Error message: " << cudaGetErrorString(result);
+    throw std::runtime_error(errorStream.str());
   }
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
new file mode 100644
index 000000000..905d8e72b
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -0,0 +1,346 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Batch GEMM Specialization for
+C = BMM_RRR(A, B0) / BMM_RRR(A, B1)
+where A[RowMajor][M, K], B[RowMajor][K, N], B1[RowMajor][K, N]
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+from ..gemm_universal import common, common_bias
+from ..gemm_universal.layout import RRR
+from . import common_dual_gemm
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::DualGemmMode::kBatched,         // DualGemmMode mode
+    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
+    {({{elem_input_type}}*)b_ptr, LayoutB(N)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
+    nullptr_ref,                                   // TensorRef<ElementC const, LayoutC> ref_C0_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D0_
+{% if broadcast_b1 %}
+    {({{elem_input_type}}*)bias_ptr, 0},           // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% else %}
+    {({{elem_input_type}}*)bias_ptr, LayoutB(N)},  // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% endif %}
+    nullptr_ref,                                   // TensorRef<ElementC const, LayoutC> ref_C1_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D1_
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)},    // TensorRef<ElementC, LayoutC> ref_D2_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp0::Params epilogue0_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp1::Params epilogue1_
+    {},                                            // typename EpilogueOutputOp2::Params epilogue2_
+    1,                                             // int split_k_slices_
+    B,                                             // int batch_count
+    M * K,                                         // int64_t batch_stride_A
+    K * N,                                         // int64_t batch_stride_B0
+{% if broadcast_b1 %}
+    K,                                             // int64_t batch_stride_B1
+{% else %}
+    K * N,                                         // int64_t batch_stride_B1
+{% endif %}
+    M * N,                                         // int64_t batch_stride_C
+    M * N,                                         // int64_t batch_stride_D
+"""
+)
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+    int64_t B = std::atoi(argv[1]);
+    int64_t M = std::atoi(argv[2]);
+    int64_t N = std::atoi(argv[3]);
+    int64_t K = std::atoi(argv[4]);
+
+    int64_t split_k = 1;  // present in the generated code, but not used
+
+    int64_t a_dim0 = B;
+    int64_t a_dim1 = M;
+    int64_t a_dim2 = K;
+
+    int64_t b_dim0 = B;
+    int64_t b_dim1 = K;
+    int64_t b_dim2 = N;
+
+    int64_t c_dim0 = B;
+    int64_t c_dim1 = M;
+    int64_t c_dim2 = N;
+"""
+)
+
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/thread/linear_combination_params.h"
+
+#include "device/dual_gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation.
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class Div {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+
+  struct Params{};
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  Div(Params const &/*params*/) {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    assert(false);
+  }
+
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &lhs,
+    FragmentAccumulator const &rhs) const {
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_to_compute;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> compute_to_output;
+
+    ComputeFragment converted_lhs = accumulator_to_compute(lhs);
+    ComputeFragment converted_rhs = accumulator_to_compute(rhs);
+
+    cutlass::divides<ComputeFragment> div;
+    return compute_to_output(div(converted_lhs, converted_rhs));
+  }
+
+  CUTLASS_HOST_DEVICE
+  ElementOutput operator()(
+      ElementAccumulator const& lhs,
+      ElementAccumulator const& rhs
+  ) const {
+      ElementCompute converted_lhs(lhs);
+      ElementCompute converted_rhs(rhs);
+      cutlass::divides<ElementCompute> div;
+      return ElementOutput(div(converted_lhs, converted_rhs));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+typename cutlass::TensorRef<{{dtype}}, cutlass::layout::RowMajor> nullptr_ref{};
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using LayoutC = cutlass::layout::RowMajor;
+
+"""
+)
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.config")
+def config(
+    func_attrs,
+    dtype="float16",
+):
+    common_dual_gemm.make_fproc(
+        func_attrs=func_attrs,
+        layout=RRR,
+        dtype=dtype,
+    )
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.gen_profiler")
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+
+    return common_dual_gemm.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator="",
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=func_attrs.get("broadcast_b1", False),
+        broadcasted_bdim_id=2,
+        ndims=3,
+    )
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+    problem_args_template=None,
+):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    broadcast_b1 = func_attrs.get("broadcast_b1", False)
+    if problem_args_template is None:
+        problem_args = PROBLEM_ARGS_TEMPLATE.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
+        )
+    else:
+        problem_args = problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
+        )
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+
+    return common_dual_gemm.gen_function(
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
+        emit_kernel=True,
+        support_split_k=True,
+        output_addr_calculator="",
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=broadcast_b1,
+    )
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+
+    return common_bias.FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        support_split_k=True,
+    )
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.func_call")
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+):
+    bias = func_attrs["inputs"][2]
+
+    return common.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+        bias_ptr_arg=bias._attrs["name"],
+    )
+
+
+@registry.reg("cuda.dual_bmm_rrr_div.filter")
+def function_filter(
+    cfg,
+    func_attrs,
+    ab_alignment,
+):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        ab_alignment=ab_alignment,
+    )
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index b615589c2..753ed9347 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization for
 C = FAST_GELU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
-where A[RowMajor][M, K], B[ColMajor][N, K], B1[RowMajor][N, K]
+where A[RowMajor][M, K], B[ColMajor][N, K], B1[ColMajor][N, K]
 """
 import jinja2
 
@@ -31,19 +31,24 @@
 # used for real execution
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmCoord{M, N, K},
-    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
-    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
-    ref_B0,
-    nullptr_ref, // D0
-    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
-    ref_B1,
-    nullptr_ref, // D1
-    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
-    {ElementCompute(1), ElementCompute(0)},
-    {ElementCompute(1), ElementCompute(0)},
-    {},
-    1 // kSplitKSerial
+    cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
+    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
+    ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D0_
+{% if broadcast_b1 %}
+    {({{elem_input_type}}*)bias_ptr, 0},           // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% else %}
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)},  // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% endif %}
+    ref_B1,                                        // TensorRef<ElementC const, LayoutC> ref_C1_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D1_
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)},    // TensorRef<ElementC, LayoutC> ref_D2_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp0::Params epilogue0_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp1::Params epilogue1_
+    {},                                            // typename EpilogueOutputOp2::Params epilogue2_
+    1,                                             // int split_k_slices_
 """
 )
 
@@ -67,19 +72,24 @@
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmCoord{M, N, K},
-    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
-    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
-    ref_B0,
-    nullptr_ref, // D0
-    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
-    ref_B1,
-    nullptr_ref, // D1
-    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
-    {ElementCompute(1), ElementCompute(0)},
-    {ElementCompute(1), ElementCompute(0)},
-    {},
-    1 // kSplitKSerial
+    cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
+    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
+    ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D0_
+{% if broadcast_b1 %}
+    {({{elem_input_type}}*)bias_ptr, 0},           // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% else %}
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)},  // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% endif %}
+    ref_B1,                                        // TensorRef<ElementC const, LayoutC> ref_C1_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D1_
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)},    // TensorRef<ElementC, LayoutC> ref_D2_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp0::Params epilogue0_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp1::Params epilogue1_
+    {},                                            // typename EpilogueOutputOp2::Params epilogue2_
+    1,                                             // int split_k_slices_
 """
 )
 
@@ -201,7 +211,7 @@ class LeftFastGeluAndMul {
 } // namespace cutlass
 
 
-typename cutlass::TensorRef<cutlass::half_t, cutlass::layout::RowMajor> nullptr_ref{};
+typename cutlass::TensorRef<{{dtype}}, cutlass::layout::RowMajor> nullptr_ref{};
 decltype(nullptr_ref) ref_B0, ref_B1;
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -214,7 +224,11 @@ class LeftFastGeluAndMul {
 
 @registry.reg("cuda.dual_gemm_rcr_fast_gelu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    common_dual_gemm.make_fproc_f16(func_attrs, RCR)
+    common_dual_gemm.make_fproc(
+        func_attrs,
+        RCR,
+        dtype=dtype,
+    )
 
 
 def common_gen_profiler(
@@ -226,6 +240,7 @@ def common_gen_profiler(
     problem_args_template,
     bias_ptr_arg=None,
     extra_code="",
+    broadcast_b1=False,
 ):
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
@@ -243,11 +258,19 @@ def common_gen_profiler(
         output_addr_calculator=output_addr_calculator,
         bias_ptr_arg=bias_ptr_arg,
         extra_code=extra_code,
+        broadcast_b1=broadcast_b1,
+        broadcasted_bdim_id=0,
+        ndims=2,
     )
 
 
 @registry.reg("cuda.dual_gemm_rcr_fast_gelu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+
     return common_gen_profiler(
         func_attrs,
         workdir,
@@ -256,7 +279,10 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         common_bias.SRC_TEMPLATE,
         PROFILER_PROBLEM_ARGS_TEMPLATE,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
-        extra_code=EXTRA_CODE.render(),
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=func_attrs.get("broadcast_b1", False),
     )
 
 
@@ -275,15 +301,18 @@ def gen_function(
         func_attrs["outputs"][0]._attrs["dtype"]
     )
 
+    broadcast_b1 = func_attrs.get("broadcast_b1", False)
     if problem_args_template is None:
         problem_args = PROBLEM_ARGS_TEMPLATE.render(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
         )
     else:
         problem_args = problem_args_template.render(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
         )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
@@ -302,7 +331,10 @@ def gen_function(
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
         ),
-        extra_code=EXTRA_CODE.render(),
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=broadcast_b1,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index e0418ece8..1c4528560 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization for
 C = SILU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
-where A[RowMajor][M, K], B[ColMajor][N, K], B1[RowMajor][N, K]
+where A[RowMajor][M, K], B[ColMajor][N, K], B1[ColMajor][N, K]
 """
 import jinja2
 
@@ -31,19 +31,24 @@
 # used for real execution
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmCoord{M, N, K},
-    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
-    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
-    ref_B0,
-    nullptr_ref, // D0
-    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
-    ref_B1,
-    nullptr_ref, // D1
-    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
-    {ElementCompute(1), ElementCompute(0)},
-    {ElementCompute(1), ElementCompute(0)},
-    {},
-    1 // kSplitKSerial
+    cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
+    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
+    ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D0_
+{% if broadcast_b1 %}
+    {({{elem_input_type}}*)bias_ptr, 0},           // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% else %}
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)},  // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% endif %}
+    ref_B1,                                        // TensorRef<ElementC const, LayoutC> ref_C1_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D1_
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)},    // TensorRef<ElementC, LayoutC> ref_D2_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp0::Params epilogue0_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp1::Params epilogue1_
+    {},                                            // typename EpilogueOutputOp2::Params epilogue2_
+    1,                                             // int split_k_slices_
 """
 )
 
@@ -67,26 +72,52 @@
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmCoord{M, N, K},
-    {({{elem_input_type}}*)a_ptr, LayoutA(K)},
-    {({{elem_input_type}}*)b_ptr, LayoutB(K)},
-    ref_B0,
-    nullptr_ref, // D0
-    {({{elem_input_type}}*)bias_ptr, LayoutB(K)}, // B1
-    ref_B1,
-    nullptr_ref, // D1
-    {({{elem_output_type}}*)c_ptr, LayoutC(N)}, // D2
-    {ElementCompute(1), ElementCompute(0)},
-    {ElementCompute(1), ElementCompute(0)},
-    {},
-    1 // kSplitKSerial
+    cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
+    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
+    {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
+    ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D0_
+{% if broadcast_b1 %}
+    {({{elem_input_type}}*)bias_ptr, 0},           // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% else %}
+    {({{elem_input_type}}*)bias_ptr, LayoutB(K)},  // TensorRef<ElementB const, LayoutB1> ref_B1_
+{% endif %}
+    ref_B1,                                        // TensorRef<ElementC const, LayoutC> ref_C1_
+    nullptr_ref,                                   // TensorRef<ElementC, LayoutC> ref_D1_
+    {({{elem_output_type}}*)c_ptr, LayoutC(N)},    // TensorRef<ElementC, LayoutC> ref_D2_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp0::Params epilogue0_
+    {ElementCompute(1), ElementCompute(0)},        // typename EpilogueOutputOp1::Params epilogue1_
+    {},                                            // typename EpilogueOutputOp2::Params epilogue2_
+    1,                                             // int split_k_slices_
+"""
+)
+
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "device/dual_gemm.h"
+#include "thread/left_silu_and_mul.h"
+#include "dual_gemm_common.h"
+
+typename cutlass::TensorRef<{{dtype}}, cutlass::layout::RowMajor> nullptr_ref{};
+decltype(nullptr_ref) ref_B0, ref_B1;
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using LayoutC = cutlass::layout::RowMajor;
+
 """
 )
 
 
 @registry.reg("cuda.dual_gemm_rcr_silu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    common_dual_gemm.make_fproc_f16(func_attrs, RCR)
+    common_dual_gemm.make_fproc(
+        func_attrs,
+        RCR,
+        dtype=dtype,
+    )
 
 
 def common_gen_profiler(
@@ -98,6 +129,7 @@ def common_gen_profiler(
     problem_args_template,
     bias_ptr_arg=None,
     extra_code="",
+    broadcast_b1=False,
 ):
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
@@ -115,11 +147,19 @@ def common_gen_profiler(
         output_addr_calculator=output_addr_calculator,
         bias_ptr_arg=bias_ptr_arg,
         extra_code=extra_code,
+        broadcast_b1=broadcast_b1,
+        broadcasted_bdim_id=0,
+        ndims=2,
     )
 
 
 @registry.reg("cuda.dual_gemm_rcr_silu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+
     return common_gen_profiler(
         func_attrs,
         workdir,
@@ -128,7 +168,10 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         common_bias.SRC_TEMPLATE,
         PROFILER_PROBLEM_ARGS_TEMPLATE,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
-        extra_code=common_dual_gemm.EXTRA_CODE.render(),
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=func_attrs.get("broadcast_b1", False),
     )
 
 
@@ -147,15 +190,18 @@ def gen_function(
         func_attrs["outputs"][0]._attrs["dtype"]
     )
 
+    broadcast_b1 = func_attrs.get("broadcast_b1", False)
     if problem_args_template is None:
         problem_args = PROBLEM_ARGS_TEMPLATE.render(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
         )
     else:
         problem_args = problem_args_template.render(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
+            broadcast_b1=broadcast_b1,
         )
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
@@ -174,7 +220,10 @@ def gen_function(
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
         ),
-        extra_code=common_dual_gemm.EXTRA_CODE.render(),
+        extra_code=EXTRA_CODE.render(
+            dtype=elem_input_type,
+        ),
+        broadcast_b1=broadcast_b1,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index 90e9d25a6..0bc378b04 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -37,19 +37,18 @@
         N: M*1 (RowMajor)
     */
 
-        {M, N, K},
-        1,
-        {a_ptr, LayoutA(K)},
-        {b_ptr, LayoutB(K)},
-        {c_ptr, 0},
-        {d_ptr, LayoutC(N)},
-        {
-            float(1.0),
-            float(1.0)
-        },
-        {n_ptr, LayoutC(1)},
-        {soft_ptr, LayoutC(N)}
-
+    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
+    1,                       // int32_t batch_count_
+    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
+    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
+    {c_ptr, 0},              // TensorRefC ref_C_
+    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {
+        float(1.0),
+        float(1.0)
+    },                       // typename EpilogueFunctorOp::Params linear_scaling
+    {n_ptr, LayoutC(1)},     // ???
+    {soft_ptr, LayoutC(N)},  // ???
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
index 45d69ac00..0964cd303 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -53,19 +53,18 @@
         N: M*1 (RowMajor)
     */
 
-        {M, N, K},
-        1,
-        {a_ptr, LayoutA(K)},
-        {b_ptr, LayoutB(K)},
-        {c_ptr, LayoutC(N)},
-        {d_ptr, LayoutC(N)},
-        {
-            float(1.0),
-            float(0.0)
-        },
-        {n_ptr, LayoutC(1)},
-        {soft_ptr, LayoutC(N)}
-
+    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
+    1,                       // int32_t batch_count_
+    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
+    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
+    {c_ptr, LayoutC(N)},     // TensorRefC ref_C_
+    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {
+        float(1.0),
+        float(0.0)
+    },                       // typename EpilogueFunctorOp::Params linear_scaling
+    {n_ptr, LayoutC(1)},     // ???
+    {soft_ptr, LayoutC(N)},  // ???
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
index 42e203069..56f43cbf5 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
@@ -172,6 +172,123 @@
   return true;
 }
 
+namespace detail {
+  template<typename TInput>
+  struct InputHelper;
+
+  template<>
+  struct InputHelper<float>{
+    typedef float scalar_type;
+    typedef float2 vec2_type;
+
+    static
+    __inline__ __device__ vec2_type fma2(vec2_type a, vec2_type b, vec2_type c) {
+      return make_float2(__fmaf_rn(a.x, b.x, c.x), __fmaf_rn(a.y, b.y, c.y));
+    }
+
+    static
+    __inline__ __device__ scalar_type fma(scalar_type a, scalar_type b, scalar_type c) {
+      return __fmaf_rn(a, b, c);
+    }
+
+    static
+    __inline__ __device__ vec2_type mul2(vec2_type a, vec2_type b) {
+      return make_float2(__fmul_rn(a.x, b.x), __fmul_rn(a.y, b.y));
+    }
+
+    static
+    __inline__ __device__ scalar_type mul(scalar_type a, scalar_type b) {
+      return __fmul_rn(a, b);
+    }
+
+    static
+    __inline__ __device__ vec2_type add2(vec2_type a, vec2_type b) {
+      return make_float2(__fadd_rn(a.x, b.x), __fadd_rn(a.y, b.y));
+    }
+
+    static
+    __inline__ __device__ scalar_type add(scalar_type a, scalar_type b) {
+      return __fadd_rn(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type low(vec2_type a) {
+      return a.x;
+    }
+
+    static
+    __inline__ __device__ scalar_type high(vec2_type a) {
+      return a.y;
+    }
+
+    static
+    __inline__ __device__ float lowf(vec2_type a) {
+      return a.x;
+    }
+
+    static
+    __inline__ __device__ float highf(vec2_type a) {
+      return a.y;
+    }
+  };
+
+  template<>
+  struct InputHelper<half>{
+    typedef half scalar_type;
+    typedef half2 vec2_type;
+
+    static
+    __inline__ __device__ vec2_type fma2(vec2_type a, vec2_type b, vec2_type c) {
+      return __hfma2(a, b, c);
+    }
+
+    static
+    __inline__ __device__ scalar_type fma(scalar_type a, scalar_type b, scalar_type c) {
+      return __hfma(a, b, c);
+    }
+
+    static
+    __inline__ __device__ vec2_type mul2(vec2_type a, vec2_type b) {
+      return __hmul2(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type mul(scalar_type a, scalar_type b) {
+      return __hmul(a, b);
+    }
+
+    static
+    __inline__ __device__ vec2_type add2(vec2_type a, vec2_type b) {
+      return __hadd2(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type add(scalar_type a, scalar_type b) {
+      return __hadd(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type low(vec2_type a) {
+      return __low2half(a);
+    }
+
+    static
+    __inline__ __device__ scalar_type high(vec2_type a) {
+      return __high2half(a);
+    }
+
+    static
+    __inline__ __device__ float lowf(vec2_type a) {
+      return __low2float(a);
+    }
+
+    static
+    __inline__ __device__ float highf(vec2_type a) {
+      return __high2float(a);
+    }
+  };
+}
+
 // Each thread reads one row from "a" and one column from "b",
 // computes dot_product(a_row, b_col), and writes the result to "c".
 // This kernel assumes loading "a" and "b" can be fully vectorized,
@@ -204,14 +321,16 @@
 
   float result = 0.0;
 
+  using dispatch = typename detail::InputHelper<ElemT>;
+  using vec2_type = typename dispatch::vec2_type;
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < N_NUM_ELEMS_IN_V; i++) {
-    const half2* a_vec_h2 = reinterpret_cast<const half2*>(&a_vec[i]);
-    const half2* b_vec_h2 = reinterpret_cast<const half2*>(&b_vec[i]);
+    auto* a_vec_h2 = reinterpret_cast<const vec2_type*>(&a_vec[i]);
+    auto* b_vec_h2 = reinterpret_cast<const vec2_type*>(&b_vec[i]);
     CUTLASS_PRAGMA_UNROLL
     for (int64_t j = 0; j < N_READ_ELEMS_IN_V / 2; ++j) {
-      half2 c_h2 = __hmul2(a_vec_h2[j], b_vec_h2[j]);
-      result += float(__low2half(c_h2)) + float(__high2half(c_h2));
+      auto c_h2 = dispatch::mul2(a_vec_h2[j], b_vec_h2[j]);
+      result += dispatch::lowf(c_h2) + dispatch::highf(c_h2);
     }
   }
 
@@ -295,16 +414,18 @@
 
   float result = 0.0;
 
-  const half2* a_data_h2 = reinterpret_cast<const half2*>(&a_data[0]);
-  const half2* b_data_h2 = reinterpret_cast<const half2*>(&b_data[0]);
+  using dispatch = typename detail::InputHelper<ElemT>;
+  using vec2_type = typename dispatch::vec2_type;
+
+  auto* a_data_h2 = reinterpret_cast<const vec2_type*>(&a_data[0]);
+  auto* b_data_h2 = reinterpret_cast<const vec2_type*>(&b_data[0]);
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < K / 2; ++i) {
-    half2 c_h2 = __hmul2(a_data_h2[i], b_data_h2[i]);
-    result += float(__low2half(c_h2)) + float(__high2half(c_h2));
+    auto c_h2 = dispatch::mul2(a_data_h2[i], b_data_h2[i]);
+    result += dispatch::lowf(c_h2) + dispatch::highf(c_h2);
   }
   if (K % 2) {
-    result += float(__hmul(reinterpret_cast<half&>(a_data[K-1]),
-                           reinterpret_cast<half&>(b_data[K-1])));
+    result += float(dispatch::mul(a_data[K-1], b_data[K-1]));
   }
 
   int64_t batch_idx = blockIdx.y;
@@ -339,19 +460,21 @@
     return;
   }
 
-  half2 result_h2 = {0.0, 0.0};
+  using dispatch = typename detail::InputHelper<ElemT>;
+  using vec2_type = typename dispatch::vec2_type;
+  vec2_type result_h2 = {0.0, 0.0};
 
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < N_NUM_ELEMS_IN_V; i++) {
-    const half2* a_vec_h2 = reinterpret_cast<const half2*>(&a_vec[i]);
-    const half2* b_vec_h2 = reinterpret_cast<const half2*>(&b_vec[i]);
+    auto* a_vec_h2 = reinterpret_cast<const vec2_type*>(&a_vec[i]);
+    auto* b_vec_h2 = reinterpret_cast<const vec2_type*>(&b_vec[i]);
     CUTLASS_PRAGMA_UNROLL
     for (int64_t j = 0; j < N_READ_ELEMS_IN_V / 2; ++j) {
-      result_h2 = __hfma2(a_vec_h2[j], b_vec_h2[j], result_h2);
+      result_h2 = dispatch::fma2(a_vec_h2[j], b_vec_h2[j], result_h2);
     }
   }
 
-  float result = __hadd(__low2half(result_h2), __high2half(result_h2));
+  float result = float(dispatch::add(dispatch::low(result_h2), dispatch::high(result_h2)));
 
   int64_t batch_idx = blockIdx.y;
   int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -378,20 +501,21 @@
     return;
   }
 
-  half2 result_h2 = {0.0, 0.0};
+  using dispatch = typename detail::InputHelper<ElemT>;
+  using vec2_type = typename dispatch::vec2_type;
+
+  vec2_type result_h2 = {0.0, 0.0};
 
-  const half2* a_data_h2 = reinterpret_cast<const half2*>(&a_data[0]);
-  const half2* b_data_h2 = reinterpret_cast<const half2*>(&b_data[0]);
+  const auto* a_data_h2 = reinterpret_cast<const vec2_type*>(&a_data[0]);
+  const auto* b_data_h2 = reinterpret_cast<const vec2_type*>(&b_data[0]);
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < K / 2; ++i) {
-    result_h2 = __hfma2(a_data_h2[i], b_data_h2[i], result_h2);
+    result_h2 = dispatch::fma2(a_data_h2[i], b_data_h2[i], result_h2);
   }
 
-  half result = __hadd(__low2half(result_h2), __high2half(result_h2));
+  auto result = dispatch::add(dispatch::low(result_h2), dispatch::high(result_h2));
   if (K % 2) {
-    result = __hfma(reinterpret_cast<const half&>(a_data[K-1]),
-                    reinterpret_cast<const half&>(b_data[K-1]),
-                    result);
+    result = dispatch::fma(a_data[K-1], b_data[K-1], result);
   }
 
   int64_t batch_idx = blockIdx.y;
@@ -497,12 +621,17 @@ def _get_original_dim_val(func_attrs, input_idx, dim):
     assert ak == bk, f"ak is not equal to bk. ak: {ak}, bk: {bk}"
 
     backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_backend_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    vec_lens = list(zip(*backend_spec.read_num_elements_to_backend_type))[0][:-1]
+    dtype = func_attrs["inputs"][0].dtype()
+    elem_input_type = backend_spec.dtype_to_backend_type(dtype)
+    vec_lens = [8, 4, 2]
+    # Each corresponds to a vec_len in the list above
+    backend_types = [
+        "uint4",
+        "uint2",
+        "uint",
+    ]
     alignment = tensor_accessor_codegen.find_max_alignment(
-        ak, func_attrs["input_accessors"]
+        ak, dtype, func_attrs["input_accessors"]
     )
     if alignment % 2:
         bmm_rcr_n1_kernel_fp32 = "bmm_rcr_n1_kernel_fp32_acc"
@@ -513,9 +642,7 @@ def _get_original_dim_val(func_attrs, input_idx, dim):
             if ak % vec_len == 0:
                 bmm_rcr_n1_kernel_fp32 = "bmm_rcr_n1_kernel_fp32_acc_vec"
                 bmm_rcr_n1_kernel_fp16 = "bmm_rcr_n1_kernel_fp16_acc_vec"
-                read_vec_type = backend_spec.read_num_elements_to_backend_type[vec_idx][
-                    1
-                ]
+                read_vec_type = backend_types[vec_idx]
                 break
 
     input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index b53b74f37..cd7167149 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -109,12 +109,13 @@
 // A tile: 8 x K
 // B matrix: K x N
 // C tile: 8 x N
-template<int num_thread, int N, int K, bool USE_FP16_ACC>
-__global__ void gemm_rrr_small_nk_kernel_half(
-    float4* a_ptr, float4* b_ptr, float4* c_ptr, int M) {
+template<typename TElem, int num_thread, int N, int K, bool USE_FP16_ACC>
+__global__ void gemm_rrr_small_nk_kernel(
+    const float4* a_ptr, const float4* b_ptr, float4* c_ptr, int M) {
   int idx = blockIdx.x * num_thread + threadIdx.x;
+  constexpr int num_elems_in_float4 = sizeof(float4) / sizeof(TElem);
 
-  if (idx >= (M + 7) / 8) {
+  if (idx >= (M + num_elems_in_float4 - 1) / num_elems_in_float4) {
     return;
   }
 
@@ -122,20 +123,20 @@
   a_ptr += a_idx_base;
 
   // load b matrix
-  half b[K][N];
-  half* b_half = reinterpret_cast<half*>(b_ptr);
+  TElem b[K][N];
+  auto* b_e = reinterpret_cast<const TElem*>(b_ptr);
   for (int i = 0; i < K; ++i) {
     for (int j = 0; j < N; ++j) {
-      b[i][j] = b_half[i * N + j];
+      b[i][j] = b_e[i * N + j];
     }
   }
 
   int c_idx_base = idx * N;
   c_ptr += c_idx_base;
 
-  half c_tile[8][N];
+  TElem c_tile[num_elems_in_float4][N];
 
-  if (idx <= M / 8 - 1) {
+  if (idx <= M / num_elems_in_float4 - 1) {
     // fast kernel
     // load a
     float4 a_tile_vec[K];
@@ -143,11 +144,11 @@
     for (int i = 0; i < K; i++) {
       a_tile_vec[i] = __ldg(a_ptr++);
     }
-    half* a_tile = reinterpret_cast<half*>(&a_tile_vec);
+    auto* a_tile = reinterpret_cast<const TElem*>(&a_tile_vec);
 
     // compute
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 8; ++i) {
+    for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
         if (USE_FP16_ACC) {
@@ -159,11 +160,19 @@
           c_tile[i][j] = sum;
         } else {
           float sum = 0;
-          CUTLASS_PRAGMA_UNROLL
-          for (int k = 0; k < K; ++k) {
-            sum += __half2float(__hmul(a_tile[i * K + k], b[k][j]));
+          if constexpr (std::is_same_v<TElem, half>) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int k = 0; k < K; ++k) {
+              sum += __half2float(__hmul(a_tile[i * K + k], b[k][j]));
+            }
+            c_tile[i][j] = __float2half_rn(sum);
+          } else {
+            CUTLASS_PRAGMA_UNROLL
+            for (int k = 0; k < K; ++k) {
+              sum += __fmul_rn(a_tile[i * K + k], b[k][j]);
+            }
+            c_tile[i][j] = sum;
           }
-          c_tile[i][j] = __float2half_rn(sum);
         }
       }
     }
@@ -177,14 +186,14 @@
   } else {
     // process tail
     // load a
-    half* a_h = reinterpret_cast<half*>(a_ptr);
-    int m = M - M / 8 * 8;
-    half a_tile[8][K];
+    auto* a_e = reinterpret_cast<const TElem*>(a_ptr);
+    int m = M - M / num_elems_in_float4 * num_elems_in_float4;
+    TElem a_tile[num_elems_in_float4][K];
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < m; i++) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < K; j++) {
-        a_tile[i][j] = a_h[i * K + j];
+        a_tile[i][j] = a_e[i * K + j];
       }
     }
 
@@ -202,17 +211,26 @@
           c_tile[i][j] = sum;
         } else {
           float sum = 0;
-          CUTLASS_PRAGMA_UNROLL
-          for (int k = 0; k < K; ++k) {
-            sum += __half2float(__hmul(a_tile[i][k], b[k][j]));
+          if constexpr (std::is_same_v<TElem, half>) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int k = 0; k < K; ++k) {
+              sum += __half2float(__hmul(a_tile[i][k], b[k][j]));
+            }
+            c_tile[i][j] = __float2half_rn(sum);
+          }
+          else {
+            CUTLASS_PRAGMA_UNROLL
+            for (int k = 0; k < K; ++k) {
+              sum += __fmul_rn(a_tile[i][k], b[k][j]);
+            }
+            c_tile[i][j] = sum;
           }
-          c_tile[i][j] = __float2half_rn(sum);
         }
       }
     }
 
     // write c
-    half* c_h = reinterpret_cast<half*>(c_ptr);
+    auto* c_h = reinterpret_cast<TElem*>(c_ptr);
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < m; i++) {
       CUTLASS_PRAGMA_UNROLL
@@ -224,7 +242,8 @@
 }
 
 // N <= 8, K <= 8
-template<typename ElemT, int N, int K>
+template<typename ElemT, int N, int K,
+         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half>, void>>
 void gemm_rrr_small_nk_launcher(ElemT* a_ptr,
                          ElemT* b_ptr,
                          ElemT* c_ptr,
@@ -236,27 +255,20 @@
   dim3 thread_block(nthread);
   constexpr int n_element_per_t = nthread * num_elems_in_float4;
   dim3 grid((M + n_element_per_t - 1) / n_element_per_t);
-  if constexpr (std::is_same<ElemT, half>::value) {
-    if(use_fp16_acc) {
-      gemm_rrr_small_nk_kernel_half<nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
-        (float4*)a_ptr,
-        (float4*)b_ptr,
-        (float4*)c_ptr,
-        M
-      );
-    } else {
-      gemm_rrr_small_nk_kernel_half<nthread, N, K, false><<<grid, thread_block, 0, stream>>>(
-        (float4*)a_ptr,
-        (float4*)b_ptr,
-        (float4*)c_ptr,
-        M
-      );
-    }
+  if (use_fp16_acc && std::is_same_v<ElemT, half>) {
+    gemm_rrr_small_nk_kernel<ElemT, nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
+      reinterpret_cast<const float4*>(a_ptr),
+      reinterpret_cast<const float4*>(b_ptr),
+      reinterpret_cast<float4*>(c_ptr),
+      M
+    );
   } else {
-    auto msg = std::string("Got error: unsupported elem type ") +
-      " at " + __FILE__ + ": " + std::to_string(__LINE__);
-    std::cerr << msg << std::endl;
-    throw std::runtime_error(msg);
+    gemm_rrr_small_nk_kernel<ElemT, nthread, N, K, false><<<grid, thread_block, 0, stream>>>(
+      reinterpret_cast<const float4*>(a_ptr),
+      reinterpret_cast<const float4*>(b_ptr),
+      reinterpret_cast<float4*>(c_ptr),
+      M
+    );
   }
 }
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
index 9d04403bc..13e978dcd 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
@@ -25,25 +25,18 @@
     bmm_rrr_permute,
     gemm_rcr,
     gemm_rcr_bias,
-    gemm_rcr_bias_add,
-    gemm_rcr_bias_add_add,
-    gemm_rcr_bias_add_add_relu,
-    gemm_rcr_bias_add_relu,
+    gemm_rcr_bias_elementwise,
     gemm_rcr_bias_fast_gelu,
     gemm_rcr_bias_gelu,
     gemm_rcr_bias_hardswish,
-    gemm_rcr_bias_mul,
-    gemm_rcr_bias_mul_add,
-    gemm_rcr_bias_mul_tanh,
     gemm_rcr_bias_permute,
     gemm_rcr_bias_relu,
     gemm_rcr_bias_sigmoid,
-    gemm_rcr_bias_sigmoid_mul,
-    gemm_rcr_bias_sigmoid_mul_tanh,
     gemm_rcr_bias_swish,
     gemm_rcr_bias_tanh,
     gemm_rcr_fast_gelu,
     gemm_rcr_permute,
+    gemm_rcr_permute_elup1,
     gemm_rrr,
     gemm_rrr_permute,
     group_gemm_rcr,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
index b8e3fa6c1..c75243fee 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
@@ -47,19 +47,12 @@ def bmm_ccr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 6a00b0fc5..4f4bba5d4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -189,22 +189,22 @@ def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None):
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kBatched,
-    {{mm_info.problem_size}},
-    {{mm_info.batch_size}},
-    {ElementComputeEpilogue({{mm_info.alpha_value}}), ElementComputeEpilogue({{mm_info.beta_value}})},
-    {{mm_info.a_ptr}},
-    {{mm_info.b_ptr}},
-    {{mm_info.bias_ptr}},
-    {{mm_info.c_ptr}},
-    {{mm_info.a_batch_stride}},
-    {{mm_info.b_batch_stride}},
-    {{mm_info.bias_batch_stride}},
-    {{mm_info.c_batch_stride}},
-    {{mm_info.lda}},
-    {{mm_info.ldb}},
-    {{mm_info.ldbias}},
-    {{mm_info.ldc}}
+    cutlass::gemm::GemmUniversalMode::kBatched,                                                         // GemmUniversalMode mode
+    {{mm_info.problem_size}},                                                                           // GemmCoord problem_size
+    {{mm_info.batch_size}},                                                                             // int batch_count
+    {ElementComputeEpilogue({{mm_info.alpha_value}}), ElementComputeEpilogue({{mm_info.beta_value}})},  // typename EpilogueOutputOp::Params epilogue
+    {{mm_info.a_ptr}},                                                                                  // void const * ptr_A
+    {{mm_info.b_ptr}},                                                                                  // void const * ptr_B
+    {{mm_info.bias_ptr}},                                                                               // void const * ptr_C
+    {{mm_info.c_ptr}},                                                                                  // void * ptr_D
+    {{mm_info.a_batch_stride}},                                                                         // int64_t batch_stride_A
+    {{mm_info.b_batch_stride}},                                                                         // int64_t batch_stride_B
+    {{mm_info.bias_batch_stride}},                                                                      // int64_t batch_stride_C
+    {{mm_info.c_batch_stride}},                                                                         // int64_t batch_stride_D
+    {{mm_info.lda}},                                                                                    // typename LayoutA::Stride::LongIndex lda
+    {{mm_info.ldb}},                                                                                    // typename LayoutB::Stride::LongIndex ldb
+    {{mm_info.ldbias}},                                                                                 // typename LayoutC::Stride::LongIndex ldc
+    {{mm_info.ldc}},                                                                                    // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -233,6 +233,138 @@ def _fill(arr, idx, val):
     return ret
 
 
+def get_default_problem_info(default_problem_args, **kwargs):
+    """Return the default problem args"""
+    problem_args = default_problem_args.copy()
+    for k, v in kwargs.items():
+        problem_args[k] = v
+
+    bmm_problem_info = Bmm_problem_info(**problem_args)
+    return bmm_problem_info
+
+
+def make_function_strided_args(
+    func_attrs,
+    dim_info_dict,
+    default_mm_info,
+    is_permute=False,
+):
+    """
+    Return a tuple of (problem_args, input_addr_calculator, output_addr_calculator)
+    """
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    input_a_batch_stride_dim = default_mm_info.a_batch_stride
+    input_a_stride_lda_dim = default_mm_info.lda
+    input_a_offset = 0
+    input_b_batch_stride_dim = default_mm_info.b_batch_stride
+    input_b_stride_ldb_dim = default_mm_info.ldb
+    input_b_offset = 0
+
+    has_bias = len(func_attrs["inputs"]) == 3
+
+    if "input_accessors" in func_attrs:
+        input_a_accessor = func_attrs["input_accessors"][0]
+        input_b_accessor = func_attrs["input_accessors"][1]
+
+        if input_a_accessor.is_from_strided_tensor:
+            input_a_offset = input_a_accessor.offset
+            if not input_a_accessor.is_contiguous:
+                a_dims = reverse_dim_info_mapping(
+                    dim_info_dict, gemm_common.Source.INPUT, 0
+                )
+
+                input_a_batch_stride_dim = input_a_accessor.gen_stride_str(0, a_dims)
+                input_a_stride_lda_dim = input_a_accessor.stride(1)
+
+        if input_b_accessor.is_from_strided_tensor:
+            input_b_offset = input_b_accessor.offset
+            if not input_b_accessor.is_contiguous:
+                b_dims = reverse_dim_info_mapping(
+                    dim_info_dict, gemm_common.Source.INPUT, 1
+                )
+                input_b_batch_stride_dim = input_b_accessor.gen_stride_str(0, b_dims)
+                input_b_stride_ldb_dim = input_b_accessor.stride(1)
+
+        if has_bias:
+            # FIXME: we don't suppor strided bias yet. Will enable it once
+            # we support it.
+            assert (
+                not input_b_accessor.is_from_strided_tensor
+            ), f'strided bias is not supported for op {func_attrs["name"]}'
+
+    input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
+        input_a_batch_stride_dim=input_a_batch_stride_dim,
+        input_a_stride_dim=input_a_stride_lda_dim,
+        input_a_offset_val=input_a_offset,
+        input_b_batch_stride_dim=input_b_batch_stride_dim,
+        input_b_stride_dim=input_b_stride_ldb_dim,
+        input_b_offset_val=input_b_offset,
+    )
+
+    # bmm_permute requires a slightly different c_batch_stride and
+    # output_batch_stride_dim values
+    if is_permute:
+        output_batch_stride_dim = default_mm_info.bias_batch_stride
+        c_batch_stride = default_mm_info.c_batch_stride
+    else:
+        output_batch_stride_dim = default_mm_info.c_batch_stride
+        c_batch_stride = "output_batch_stride"
+    output_stride_ldc_dim = default_mm_info.ldc
+    output_offset = 0
+
+    if "output_accessors" in func_attrs:
+        output_accessor = func_attrs["output_accessors"][0]
+        if output_accessor.is_from_strided_tensor:
+            output_offset = output_accessor.offset
+            if not output_accessor.is_contiguous:
+                c_dims = reverse_dim_info_mapping(
+                    dim_info_dict, gemm_common.Source.OUTPUT, 0
+                )
+                output_batch_stride_dim = output_accessor.gen_stride_str(0, c_dims)
+                output_stride_ldc_dim = output_accessor.stride(1)
+
+    output_addr_calculator = OUTPUT_ADDR_CALCULATOR.render(
+        output_batch_stride_dim=output_batch_stride_dim,
+        output_stride_dim=output_stride_ldc_dim,
+        output_offset_val=output_offset,
+    )
+
+    bmm_problem_info = Bmm_problem_info(
+        alpha_value=default_mm_info.alpha_value,
+        beta_value=default_mm_info.beta_value,
+        a_ptr=f"({elem_input_type}*)({default_mm_info.a_ptr}) + input_a_offset",
+        b_ptr=f"({elem_input_type}*)({default_mm_info.b_ptr}) + input_b_offset",
+        bias_ptr=f"({elem_output_type}*)({default_mm_info.bias_ptr})",
+        c_ptr=f"({elem_output_type}*)({default_mm_info.c_ptr}) + output_offset",
+        a_batch_stride="input_a_batch_stride",
+        b_batch_stride="input_b_batch_stride",
+        bias_batch_stride=f"{default_mm_info.bias_batch_stride}",
+        c_batch_stride=c_batch_stride,
+        lda="input_a_stride",
+        ldb="input_b_stride",
+        ldbias=f"{default_mm_info.ldbias}",
+        ldc="output_stride",
+    )
+    a_shapes = func_attrs["input_accessors"][0].original_shapes
+    b_shapes = func_attrs["input_accessors"][1].original_shapes
+    d_shapes = None
+    if has_bias:
+        d_shapes = func_attrs["input_accessors"][2].original_shapes
+    _update_stride_info(bmm_problem_info, a_shapes, b_shapes, d_shapes)
+
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=bmm_problem_info,
+    )
+    return (problem_args, input_addr_calculator, output_addr_calculator)
+
+
 def gen_profiler(
     func_attrs,
     workdir,
@@ -366,6 +498,45 @@ def gen_profiler(
     return common.build_profiler(file_pairs)
 
 
+def default_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+    default_problem_args,
+):
+    """default function for generating bmm profilers"""
+    a_dims = reverse_dim_info_mapping(dim_info_dict, gemm_common.Source.INPUT, 0)
+    b_dims = reverse_dim_info_mapping(dim_info_dict, gemm_common.Source.INPUT, 1)
+    c_dims = reverse_dim_info_mapping(dim_info_dict, gemm_common.Source.OUTPUT, 0)
+
+    args_parser = ARGS_PARSER_TEMPLATE.render(
+        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
+    )
+
+    default_mm_info = get_default_problem_info(
+        default_problem_args,
+        alpha_value=func_attrs.get("alpha", 1),
+    )
+    a_shapes = func_attrs["input_accessors"][0].original_shapes
+    b_shapes = func_attrs["input_accessors"][1].original_shapes
+    _update_stride_info(default_mm_info, a_shapes, b_shapes)
+
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        mm_info=default_mm_info,
+    )
+
+    return gen_profiler(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        dim_info_dict,
+        common.SRC_TEMPLATE,
+        problem_args,
+        args_parser,
+    )
+
+
 def gen_function_decl(func_attrs):
     func_name = func_attrs["name"]
     has_d = False
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
index 213234342..5f85b482a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
@@ -19,29 +19,22 @@
 """
 
 from ... import registry
-from ...common import gemm_common
 from . import bmm_common, common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
-def _get_problem_info(**kwargs):
-    problem_args = {
-        "bias_ptr": "c_ptr",
-        "a_batch_stride": "M * K",
-        "b_batch_stride": "N * K",
-        "bias_batch_stride": "M * N",
-        "c_batch_stride": "M * N",
-        "lda": "M",
-        "ldb": "N",
-        "ldbias": "N",
-        "ldc": "N",
-    }
-    for k, v in kwargs.items():
-        problem_args[k] = v
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args)
-    return bmm_problem_info
+PROBLEM_ARGS = {
+    "bias_ptr": "c_ptr",
+    "a_batch_stride": "M * K",
+    "b_batch_stride": "N * K",
+    "bias_batch_stride": "M * N",
+    "c_batch_stride": "M * N",
+    "lda": "M",
+    "ldb": "N",
+    "ldbias": "N",
+    "ldc": "N",
+}
 
 
 @registry.reg("cuda.bmm_crr.config")
@@ -49,19 +42,12 @@ def bmm_crr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
@@ -70,39 +56,12 @@ def fproc(op):
 
 @registry.reg("cuda.bmm_crr.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = _get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
+    return bmm_common.default_gen_profiler(
         func_attrs,
         workdir,
         profiler_filename,
         dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        PROBLEM_ARGS,
     )
 
 
@@ -112,21 +71,25 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = _get_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
     )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
+    (
+        problem_args,
+        input_addr_calculator,
+        output_addr_calculator,
+    ) = bmm_common.make_function_strided_args(
+        func_attrs, dim_info_dict, default_mm_info, is_permute=False
     )
+
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
         problem_args,
         dim_info_dict,
+        input_addr_calculator=input_addr_calculator,
+        output_addr_calculator=output_addr_calculator,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
index ce62a6a1e..23f818a50 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
@@ -46,7 +46,8 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    mm_info = bmm_crr._get_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        bmm_crr.PROBLEM_ARGS,
         bias_ptr="d_ptr",
         alpha_value=func_attrs.get("alpha", 1),
         beta_value=1,
@@ -54,10 +55,10 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
     d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
+    bmm_common._update_stride_info(default_mm_info, a_shapes, b_shapes, d_shapes)
 
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
+        mm_info=default_mm_info,
     )
 
     return bmm_common.gen_profiler(
@@ -77,24 +78,26 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    mm_info = bmm_crr._get_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        bmm_crr.PROBLEM_ARGS,
         bias_ptr="d_ptr",
         alpha_value=func_attrs.get("alpha", 1),
         beta_value=1,
     )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
+    (
+        problem_args,
+        input_addr_calculator,
+        output_addr_calculator,
+    ) = bmm_common.make_function_strided_args(
+        func_attrs, dim_info_dict, default_mm_info, is_permute=False
     )
     return bmm_common.gen_function(
         func_attrs,
         exec_cond_template,
         problem_args,
         dim_info_dict,
+        input_addr_calculator=input_addr_calculator,
+        output_addr_calculator=output_addr_calculator,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
index c8afa49aa..866ca7921 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
@@ -19,31 +19,23 @@
 """
 
 from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
 from . import bmm_common, common
 from .layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
-def _get_default_problem_info(**kwargs):
-    problem_args = {
-        "bias_ptr": "c_ptr",
-        "a_batch_stride": "M * K",
-        "b_batch_stride": "N * K",
-        "bias_batch_stride": "M * N",
-        "c_batch_stride": "M * N",
-        "lda": "K",
-        "ldb": "K",
-        "ldbias": "N",
-        "ldc": "N",
-    }
-    for k, v in kwargs.items():
-        problem_args[k] = v
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args)
-    return bmm_problem_info
+PROBLEM_ARGS = {
+    "bias_ptr": "c_ptr",
+    "a_batch_stride": "M * K",
+    "b_batch_stride": "N * K",
+    "bias_batch_stride": "M * N",
+    "c_batch_stride": "M * N",
+    "lda": "K",
+    "ldb": "K",
+    "ldbias": "N",
+    "ldc": "N",
+}
 
 
 @registry.reg("cuda.bmm_rcr.config")
@@ -53,39 +45,12 @@ def bmm_rcr_config(func_attrs, dtype="float16"):
 
 @registry.reg("cuda.bmm_rcr.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = _get_default_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
+    return bmm_common.default_gen_profiler(
         func_attrs,
         workdir,
         profiler_filename,
         dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        PROBLEM_ARGS,
     )
 
 
@@ -95,95 +60,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
-
-    input_a_batch_stride_dim = "M * K"
-    input_a_stride_k_dim = "K"
-    input_a_offset = 0
-    input_b_batch_stride_dim = "N * K"
-    input_b_stride_k_dim = "K"
-    input_b_offset = 0
-
-    if "input_accessors" in func_attrs:
-        input_a_accessor = func_attrs["input_accessors"][0]
-        input_b_accessor = func_attrs["input_accessors"][1]
-
-        if input_a_accessor.is_from_strided_tensor:
-            input_a_offset = input_a_accessor.offset
-            if not input_a_accessor.is_contiguous:
-                a_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.INPUT, 0
-                )
-
-                input_a_batch_stride_dim = input_a_accessor.gen_stride_str(0, a_dims)
-                input_a_stride_k_dim = input_a_accessor.stride(1)
-
-        if input_b_accessor.is_from_strided_tensor:
-            input_b_offset = input_b_accessor.offset
-            if not input_b_accessor.is_contiguous:
-                b_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.INPUT, 1
-                )
-                input_b_batch_stride_dim = input_b_accessor.gen_stride_str(0, b_dims)
-                input_b_stride_k_dim = input_b_accessor.stride(1)
-
-    input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
-        input_a_batch_stride_dim=input_a_batch_stride_dim,
-        input_a_stride_dim=input_a_stride_k_dim,
-        input_a_offset_val=input_a_offset,
-        input_b_batch_stride_dim=input_b_batch_stride_dim,
-        input_b_stride_dim=input_b_stride_k_dim,
-        input_b_offset_val=input_b_offset,
-    )
-
-    output_batch_stride_dim = "M * N"
-    output_stride_n_dim = "N"
-    output_offset = 0
-
-    if "output_accessors" in func_attrs:
-        output_accessor = func_attrs["output_accessors"][0]
-        if output_accessor.is_from_strided_tensor:
-            output_offset = output_accessor.offset
-            if not output_accessor.is_contiguous:
-                c_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.OUTPUT, 0
-                )
-                output_batch_stride_dim = output_accessor.gen_stride_str(0, c_dims)
-                output_stride_n_dim = output_accessor.stride(1)
-
-    output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render(
-        output_batch_stride_dim=output_batch_stride_dim,
-        output_stride_dim=output_stride_n_dim,
-        output_offset_val=output_offset,
-    )
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
-        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
-        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        a_batch_stride="input_a_batch_stride",
-        b_batch_stride="input_b_batch_stride",
-        bias_batch_stride="output_batch_stride",
-        c_batch_stride="output_batch_stride",
-        lda="input_a_stride",
-        ldb="input_b_stride",
-        ldbias="output_stride",
-        ldc="output_stride",
     )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=bmm_problem_info,
+    (
+        problem_args,
+        input_addr_calculator,
+        output_addr_calculator,
+    ) = bmm_common.make_function_strided_args(
+        func_attrs, dim_info_dict, default_mm_info, is_permute=False
     )
 
     return bmm_common.gen_function(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
index 17574b62e..8b66f8542 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
@@ -19,31 +19,36 @@
 """
 
 from ... import registry
-from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import bmm_common, bmm_permute_common, common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
+PROBLEM_ARGS = {
+    "bias_ptr": "c_ptr",
+    "a_batch_stride": "M * K",
+    "b_batch_stride": "N * K",
+    "bias_batch_stride": "M * N",
+    "c_batch_stride": "0",
+    "lda": "K",
+    "ldb": "K",
+    "ldbias": "N",
+    "ldc": "N",
+}
+
+
 @registry.reg("cuda.bmm_rcr_permute.config")
 def bmm_rcr_permute_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
@@ -67,24 +72,16 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    bmm_problem_info = bmm_common.Bmm_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
-        bias_ptr="c_ptr",
-        a_batch_stride="M * K",
-        b_batch_stride="N * K",
-        bias_batch_stride="M * N",
-        c_batch_stride="0",
-        lda="K",
-        ldb="K",
-        ldbias="N",
-        ldc="N",
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
+    bmm_common._update_stride_info(default_mm_info, a_shapes, b_shapes)
 
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=bmm_problem_info,
+        mm_info=default_mm_info,
     )
 
     return bmm_permute_common.gen_profiler(
@@ -106,85 +103,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
-
-    input_a_batch_stride_dim = "M * K"
-    input_a_stride_k_dim = "K"
-    input_a_offset = 0
-    input_b_batch_stride_dim = "N * K"
-    input_b_stride_k_dim = "K"
-    input_b_offset = 0
-
-    if "input_accessors" in func_attrs:
-        input_a_accessor = func_attrs["input_accessors"][0]
-        input_b_accessor = func_attrs["input_accessors"][1]
-
-        if input_a_accessor.is_from_strided_tensor:
-            input_a_offset = input_a_accessor.offset
-            if not input_a_accessor.is_contiguous:
-                input_a_batch_stride_dim = input_a_accessor.stride(0)
-                input_a_stride_k_dim = input_a_accessor.stride(1)
-
-        if input_b_accessor.is_from_strided_tensor:
-            input_b_offset = input_b_accessor.offset
-            if not input_b_accessor.is_contiguous:
-                input_b_batch_stride_dim = input_b_accessor.stride(0)
-                input_b_stride_k_dim = input_b_accessor.stride(1)
-
-    input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
-        input_a_batch_stride_dim=input_a_batch_stride_dim,
-        input_a_stride_dim=input_a_stride_k_dim,
-        input_a_offset_val=input_a_offset,
-        input_b_batch_stride_dim=input_b_batch_stride_dim,
-        input_b_stride_dim=input_b_stride_k_dim,
-        input_b_offset_val=input_b_offset,
-    )
-
-    output_batch_stride_dim = "M * N"
-    output_stride_n_dim = "N"
-    output_offset = 0
-
-    if "output_accessors" in func_attrs:
-        output_accessor = func_attrs["output_accessors"][0]
-        if output_accessor.is_from_strided_tensor:
-            output_offset = output_accessor.offset
-            if not output_accessor.is_contiguous:
-                output_batch_stride_dim = output_accessor.stride(0)
-                output_stride_n_dim = output_accessor.stride(1)
-
-    output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render(
-        output_batch_stride_dim=output_batch_stride_dim,
-        output_stride_dim=output_stride_n_dim,
-        output_offset_val=output_offset,
-    )
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
-        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
-        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        a_batch_stride="input_a_batch_stride",
-        b_batch_stride="input_b_batch_stride",
-        bias_batch_stride="output_batch_stride",
-        c_batch_stride="0",
-        lda="input_a_stride",
-        ldb="input_b_stride",
-        ldbias="output_stride",
-        ldc="output_stride",
     )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=bmm_problem_info,
+    (
+        problem_args,
+        input_addr_calculator,
+        output_addr_calculator,
+    ) = bmm_common.make_function_strided_args(
+        func_attrs, dim_info_dict, default_mm_info, is_permute=True
     )
 
     return bmm_permute_common.gen_function(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
index 489059f31..ded795721 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
@@ -49,19 +49,12 @@ def bmm_rrr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
index 40a69bd28..670936784 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
@@ -19,31 +19,36 @@
 """
 
 from ... import registry
-from ...backend_spec import CUDASpec
 from ...common import gemm_common
 from . import bmm_common, bmm_permute_common, common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
+PROBLEM_ARGS = {
+    "bias_ptr": "c_ptr",
+    "a_batch_stride": "M * K",
+    "b_batch_stride": "K * N",
+    "bias_batch_stride": "M * N",
+    "c_batch_stride": "0",
+    "lda": "K",
+    "ldb": "N",
+    "ldbias": "N",
+    "ldc": "N",
+}
+
+
 @registry.reg("cuda.bmm_rrr_permute.config")
 def bmm_rrr_permute_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
@@ -67,24 +72,16 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
     )
 
-    bmm_problem_info = bmm_common.Bmm_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
-        bias_ptr="c_ptr",
-        a_batch_stride="M * K",
-        b_batch_stride="K * N",
-        bias_batch_stride="M * N",
-        c_batch_stride="0",
-        lda="K",
-        ldb="N",
-        ldbias="N",
-        ldc="N",
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
+    bmm_common._update_stride_info(default_mm_info, a_shapes, b_shapes)
 
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=bmm_problem_info,
+        mm_info=default_mm_info,
     )
 
     return bmm_permute_common.gen_profiler(
@@ -106,95 +103,16 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
-
-    input_a_batch_stride_dim = "M * K"
-    input_a_stride_k_dim = "K"
-    input_a_offset = 0
-    input_b_batch_stride_dim = "K * N"
-    input_b_stride_k_dim = "N"
-    input_b_offset = 0
-
-    if "input_accessors" in func_attrs:
-        input_a_accessor = func_attrs["input_accessors"][0]
-        input_b_accessor = func_attrs["input_accessors"][1]
-
-        if input_a_accessor.is_from_strided_tensor:
-            input_a_offset = input_a_accessor.offset
-            if not input_a_accessor.is_contiguous:
-                a_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.INPUT, 0
-                )
-
-                input_a_batch_stride_dim = input_a_accessor.gen_stride_str(0, a_dims)
-                input_a_stride_k_dim = input_a_accessor.stride(1)
-
-        if input_b_accessor.is_from_strided_tensor:
-            input_b_offset = input_b_accessor.offset
-            if not input_b_accessor.is_contiguous:
-                b_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.INPUT, 1
-                )
-                input_b_batch_stride_dim = input_b_accessor.gen_stride_str(0, b_dims)
-                input_b_stride_k_dim = input_b_accessor.stride(1)
-
-    input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
-        input_a_batch_stride_dim=input_a_batch_stride_dim,
-        input_a_stride_dim=input_a_stride_k_dim,
-        input_a_offset_val=input_a_offset,
-        input_b_batch_stride_dim=input_b_batch_stride_dim,
-        input_b_stride_dim=input_b_stride_k_dim,
-        input_b_offset_val=input_b_offset,
-    )
-
-    output_batch_stride_dim = "M * N"
-    output_stride_n_dim = "N"
-    output_offset = 0
-
-    if "output_accessors" in func_attrs:
-        output_accessor = func_attrs["output_accessors"][0]
-        if output_accessor.is_from_strided_tensor:
-            output_offset = output_accessor.offset
-            if not output_accessor.is_contiguous:
-                c_dims = bmm_common.reverse_dim_info_mapping(
-                    dim_info_dict, gemm_common.Source.OUTPUT, 0
-                )
-                output_batch_stride_dim = output_accessor.gen_stride_str(0, c_dims)
-                output_stride_n_dim = output_accessor.stride(1)
-
-    output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render(
-        output_batch_stride_dim=output_batch_stride_dim,
-        output_stride_dim=output_stride_n_dim,
-        output_offset_val=output_offset,
-    )
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(
+    default_mm_info = bmm_common.get_default_problem_info(
+        PROBLEM_ARGS,
         alpha_value=func_attrs.get("alpha", 1),
-        a_ptr="(" + elem_input_type + "*)(a_ptr) + input_a_offset",
-        b_ptr="(" + elem_input_type + "*)(b_ptr) + input_b_offset",
-        bias_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        c_ptr="(" + elem_output_type + "*)(c_ptr) + output_offset",
-        a_batch_stride="input_a_batch_stride",
-        b_batch_stride="input_b_batch_stride",
-        bias_batch_stride="output_batch_stride",
-        c_batch_stride="0",
-        lda="input_a_stride",
-        ldb="input_b_stride",
-        ldbias="output_stride",
-        ldc="output_stride",
     )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=bmm_problem_info,
+    (
+        problem_args,
+        input_addr_calculator,
+        output_addr_calculator,
+    ) = bmm_common.make_function_strided_args(
+        func_attrs, dim_info_dict, default_mm_info, is_permute=True
     )
 
     return bmm_permute_common.gen_function(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 9c18ab765..1de5a3de2 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -26,6 +26,7 @@
 import jinja2
 
 from ....compiler.base import IntImm
+from ....utils import alignment
 
 from ...backend_spec import CUDASpec
 
@@ -139,6 +140,8 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+using bfloat16 = nv_bfloat16;
+
 {{extra_code}}
 
 #define CUTLASS_CHECK(status)                                                         \\
@@ -371,6 +374,7 @@
     """
 size_t GLOBAL_WORKSPACE_SIZE = 0;
 
+#include <sstream>
 {{op_func}}
 
 template <typename GemmInstance>
@@ -530,13 +534,21 @@
   cudaError_t result = cudaGetDevice(&device_idx);
   auto memory_pool = std::make_unique<ProfilerMemoryPool<{{elem_type}}>>();
   if (result != cudaSuccess) {
-    throw std::runtime_error("cudaGetDevice() API call failed.");
+    std::ostringstream errorStream;
+    errorStream << "cudaGetDevice() call failed! "
+                << "Error code: " << cudaGetErrorName(result)
+                << " Error message: " << cudaGetErrorString(result);
+    throw std::runtime_error(errorStream.str());
   }
 
   result = cudaGetDeviceProperties(&device_properties, device_idx);
 
   if (result != cudaSuccess) {
-    throw std::runtime_error("cudaGetDeviceProperties() failed");
+    std::ostringstream errorStream;
+    errorStream << "cudaGetDeviceProperties() call failed! "
+                << "Error code: " << cudaGetErrorName(result)
+                << " Error message: " << cudaGetErrorString(result);
+    throw std::runtime_error(errorStream.str());
   }
 
   {{args_parse}}
@@ -841,13 +853,29 @@ def add_profiler(file_pairs, workdir, op_type, output_name, code):
     prefix = os.path.join(workdir, "profiler", op_type)
     if not os.path.exists(prefix):
         os.makedirs(prefix)
-    src_path = os.path.join(prefix, output_name + ".cu")
+
     obj_path = os.path.join(prefix, output_name)
     if os.path.exists(obj_path):
         return
-    with open(src_path, "w") as f:
-        f.write(code)
-    file_pairs.append((src_path, obj_path))
+
+    if isinstance(code, dict):
+        # multi-source profiler
+        src_paths = []
+        for src_name, src_code in code.items():
+            # create each source file separately
+            src_path = os.path.join(prefix, src_name + ".cu")
+            with open(src_path, "w") as f:
+                f.write(src_code)
+            src_paths.append(src_path)
+        # add multiple src paths to file_pairs
+        file_pairs.append((src_paths, obj_path))
+    else:
+        # single-source profiler
+        src_path = os.path.join(prefix, output_name + ".cu")
+        with open(src_path, "w") as f:
+            f.write(code)
+        # add single src path to file_pairs
+        file_pairs.append((src_path, obj_path))
 
 
 def gen_profiler(
@@ -1050,14 +1078,30 @@ def gen_function_call(func_attrs, indent="  ", bias_ptr_arg=None):
 
 
 def default_fproc(
-    *, op, a_layout, b_layout, c_layout, elem_type, epiligue_name, permute_layout=None
+    *, op, a_layout, b_layout, c_layout, dtype, epiligue_name, permute_layout=None
 ):
     import copy
 
     import cutlass_lib
 
+    backend_spec = CUDASpec()
+
     ret = []
-    data_type = elem_type
+    # skip simt kernels
+    if (
+        op.tile_description.math_instruction.opcode_class
+        == cutlass_lib.library.OpcodeClass.Simt
+    ):
+        return ret
+    data_type = backend_spec.dtype_to_lib_type(dtype)
+    if data_type == "float":
+        if (
+            op.tile_description.math_instruction.element_a
+            != cutlass_lib.library.DataType.f32
+            and op.tile_description.math_instruction.element_a
+            != cutlass_lib.library.DataType.tf32
+        ):
+            return ret
     acc_type = cutlass_lib.library.DataType.f32
     # check target use fp16 acc
     if "use_fp16_acc" in Target.current()._kwargs and data_type == "cutlass::half_t":
@@ -1082,7 +1126,8 @@ def default_fproc(
                 permute_layout
             ]
         # set C alignment
-        for i in [8, 4, 2, 1]:
+        alignments = alignment.get_alignments(dtype)
+        for i in alignments:
             op = copy.deepcopy(op)
             op.C.alignment = i
             ret.append(op)
@@ -1095,9 +1140,6 @@ def make_fproc(func_attrs, layout):
     associated with func_attrs.
     """
 
-    backend_spec = CUDASpec()
-    elem_type = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
-
     def fproc(op):
         a_layout, b_layout, c_layout = layout.cutlass_lib_layouts()
         return default_fproc(
@@ -1105,7 +1147,7 @@ def fproc(op):
             a_layout=a_layout,
             b_layout=b_layout,
             c_layout=c_layout,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
index 2d4e7f05a..bae48543f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
@@ -35,6 +35,7 @@
 #include <random>
 #include <vector>
 #include <iostream>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/util/host_tensor.h"
@@ -43,6 +44,8 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+using bfloat16 = nv_bfloat16;
+
 {{extra_code}}
 
 #define CUTLASS_CHECK(status)                                                         \\
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 42564bc0c..20c2c402f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -35,9 +35,9 @@
 GEMM_UNIVERSAL_WITH_BROADCAST_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::device::GemmUniversalWithBroadcast<
-        cutlass::half_t, {{layout.cutlass_layout_a}},
-        cutlass::half_t, {{layout.cutlass_layout_b}},
-        cutlass::half_t, {{layout.cutlass_layout_c}},
+        {{elem_type}}, {{layout.cutlass_layout_a}},
+        {{elem_type}}, {{layout.cutlass_layout_b}},
+        {{elem_type}}, {{layout.cutlass_layout_c}},
         {{acc_type}},
         cutlass::arch::OpClassTensorOp,
         {{arch}},
@@ -45,8 +45,8 @@
         {{warp_shape}},
         {{instruction_shape}},
         {{epilogue_functor}}<
-            cutlass::half_t, {{acc_type}}, {{acc_type}},
-            cutlass::half_t, {{epilogue_vector_length}},
+            {{elem_type}}, {{acc_type}}, {{acc_type}},
+            {{elem_type}}, {{epilogue_vector_length}},
             {{unary_op1}}, {{binary_op1}}, {{unary_op2}}
 {% if has_d1 %}
             , {{binary_op2}}
@@ -63,86 +63,82 @@
 # For func codegen.
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    { {{layout.m}}, {{layout.n}}, {{layout.k}} },
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    { {{layout.m}}, {{layout.n}}, {{layout.k}} },            // GemmCoord problem_size
 {% if support_split_k %}
-    split_k,
+    split_k,                                                 // int batch_count
 {% else %}
-    1,
+    1,                                                       // int batch_count
 {% endif %}
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
-    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
-    ({{elem_output_type}}*)(d0_ptr),
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,          // void const * ptr_B
+    ({{elem_output_type}}*)(d0_ptr),                         // void const * ptr_C1
 {% if has_d1 %}
-    ({{elem_output_type}}*)(d1_ptr),
-{% else %}
-    nullptr,
+    ({{elem_output_type}}*)(d1_ptr),                         // void const * ptr_C2
 {% endif %}
-    ({{elem_output_type}}*) (c_ptr) + output_offset,
-    ({{elem_input_type}}*) (bias_ptr),
-    nullptr,
-    /*batch_stride_A*/ input_a_batch_stride,
-    /*batch_stride_B*/ input_b_batch_stride,
-    /*batch_stride_C1*/ 0,
-    /*batch_stride_C2*/ 0,
-    /*batch_stride_D*/ 0,
-    /*batch_stride_Vector*/ 0,
-    /*batch_stride_Tensor*/ 0,
-    input_a_stride,
-    input_b_stride,
-    {{layout.stride_c}},
+    ({{elem_output_type}}*) (c_ptr) + output_offset,         // void * ptr_D
+    ({{elem_input_type}}*) (bias_ptr),                       // void * ptr_Vector
+    nullptr,                                                 // void * ptr_Tensor
+    input_a_batch_stride,                                    // int64_t batch_stride_A
+    input_b_batch_stride,                                    // int64_t batch_stride_B
+    0,                                                       // int64_t batch_stride_C1
 {% if has_d1 %}
-    {{layout.stride_c}},
-{% else %}
-    0,
+    0,                                                       // int64_t batch_stride_C2
 {% endif %}
-    output_stride,
-    /*ldr*/ 0,
-    /*/ldt*/ 0
+    0,                                                       // int64_t batch_stride_D
+    0,                                                       // int64_t batch_stride_Vector
+    0,                                                       // int64_t batch_stride_Tensor
+    input_a_stride,                                          // typename LayoutA::Stride::Index lda
+    input_b_stride,                                          // typename LayoutB::Stride::Index ldb
+    {{layout.stride_c}},                                     // typename LayoutC::Stride::Index ldc1
+{% if has_d1 %}
+    {{layout.stride_c}},                                     // typename LayoutC::Stride::Index ldc2
+{% endif %}
+    output_stride,                                           // typename LayoutC::Stride::Index ldd
+    0,                                                       // typename LayoutC::Stride::Index ldr
+    0,                                                       // typename LayoutC::Stride::Index ldt
 """
 )
 
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    { {{layout.m}}, {{layout.n}}, {{layout.k}} },
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    { {{layout.m}}, {{layout.n}}, {{layout.k}} },            // GemmCoord problem_size
 {% if support_split_k %}
-    split_k,
+    split_k,                                                 // int batch_count
 {% else %}
-    1,
+    1,                                                       // int batch_count
 {% endif %}
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*) a_ptr,
-    ({{elem_input_type}}*) b_ptr,
-    ({{elem_output_type}}*) d0_ptr,
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*) a_ptr,                            // void const * ptr_A
+    ({{elem_input_type}}*) b_ptr,                            // void const * ptr_B
+    ({{elem_output_type}}*) d0_ptr,                          // void const * ptr_C1
 {% if has_d1 %}
-    ({{elem_output_type}}*) d1_ptr,
-{% else %}
-    nullptr,
+    ({{elem_output_type}}*) d1_ptr,                          // void const * ptr_C2
 {% endif %}
-    ({{elem_output_type}}*) (c_ptr) + output_offset,
-    ({{elem_input_type}}*) bias_ptr,
-    nullptr,
-    /*batch_stride_A*/ 0,
-    /*batch_stride_B*/ 0,
-    /*batch_stride_C1*/ 0,
-    /*batch_stride_C2*/ 0,
-    /*batch_stride_D*/ 0,
-    /*batch_stride_Vector*/ 0,
-    /*batch_stride_Tensor*/ 0,
-    {{layout.stride_a}},
-    {{layout.stride_b}},
-    {{layout.stride_c}},
+    ({{elem_output_type}}*) (c_ptr) + output_offset,         // void * ptr_D
+    ({{elem_input_type}}*) bias_ptr,                         // void * ptr_Vector
+    nullptr,                                                 // void * ptr_Tensor
+    0,                                                       // int64_t batch_stride_A
+    0,                                                       // int64_t batch_stride_B
+    0,                                                       // int64_t batch_stride_C1
 {% if has_d1 %}
-    {{layout.stride_c}},
-{% else %}
-    0,
+    0,                                                       // int64_t batch_stride_C2
+{% endif %}
+    0,                                                       // int64_t batch_stride_D
+    0,                                                       // int64_t batch_stride_Vector
+    0,                                                       // int64_t batch_stride_Tensor
+    {{layout.stride_a}},                                     // typename LayoutA::Stride::Index lda
+    {{layout.stride_b}},                                     // typename LayoutA::Stride::Index ldb
+    {{layout.stride_c}},                                     // typename LayoutA::Stride::Index ldc1
+{% if has_d1 %}
+    {{layout.stride_c}},                                     // typename LayoutA::Stride::Index ldc2
 {% endif %}
-    output_stride,
-    /*ldr*/ 0,
-    /*/ldt*/ 0
+    output_stride,                                           // typename LayoutA::Stride::Index ldd
+    0,                                                       // typename LayoutA::Stride::Index ldr
+    0,                                                       // typename LayoutA::Stride::Index ldt
 """
 )
 
@@ -153,14 +149,17 @@
 #include <random>
 #include <vector>
 
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/linear_combination_residual_block_v2.h"
+#include "cutlass/epilogue/thread/linear_combination_residual_block.h"
 #include "cutlass/gemm/device/gemm_universal_with_broadcast.h"
 #include "cutlass/util/host_tensor.h"
 #include "cutlass/util/reference/host/tensor_fill.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+using bfloat16 = nv_bfloat16;
+
 #define CUTLASS_CHECK(status)                                                         \\
   {                                                                                   \\
     cutlass::Status error = status;                                                   \\
@@ -348,6 +347,7 @@ def gemm_bias_broadcast_instance(
     binary_op1,
     binary_op2,
     unary_op2,
+    elem_type,
 ):
     """
     adjust gemm instance with respect to input_accessors, layout and epilogue ops
@@ -384,6 +384,7 @@ def gemm_bias_broadcast_instance(
             alignment_b=gemm_universal_params[18],
             layout=layout,
             acc_type=acc_type,
+            elem_type=elem_type,
             has_d1=(binary_op2 is not None),
         )
     )
@@ -466,6 +467,7 @@ def gen_profiler(
                 binary_op1=binary_op1,
                 binary_op2=binary_op2,
                 unary_op2=unary_op2,
+                elem_type=elem_input_type,
             ),
         )
         config_name = common.extract_config_name(config)
@@ -606,6 +608,7 @@ def gen_function(
             binary_op1=binary_op1,
             binary_op2=binary_op2,
             unary_op2=unary_op2,
+            elem_type=elem_input_type,
         ),
         support_split_k=support_split_k,
         input_addr_calculator=input_addr_calculator,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index 44c85125c..7ab432ebd 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -47,22 +47,22 @@
 # used for real execution
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
-    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    input_a_batch_stride,
-    input_b_batch_stride,
-    /*output_batch_stride*/ M * N,
-    /*output_batch_stride*/ M * N,
-    input_a_stride,
-    input_b_stride,
-    output_stride,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,          // void const * ptr_B
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    input_a_batch_stride,                                    // int64_t batch_stride_A
+    input_b_batch_stride,                                    // int64_t batch_stride_B
+    /*output_batch_stride*/ M * N,                           // int64_t batch_stride_C
+    /*output_batch_stride*/ M * N,                           // int64_t batch_stride_D
+    input_a_stride,                                          // typename LayoutA::Stride::LongIndex lda
+    input_b_stride,                                          // typename LayoutB::Stride::LongIndex ldb
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -70,22 +70,22 @@
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_output_type}}*)(c_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    M * N,
-    M * N,
-    K,
-    K,
-    N,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_output_type}}*)(c_ptr),                          // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    M * N,                                                   // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    N,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index 7c06c7408..11642f13d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -30,22 +30,22 @@
 # used for real execution
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr) + input_a_offset,
-    ({{elem_input_type}}*)(b_ptr) + input_b_offset,
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    input_a_batch_stride,
-    input_b_batch_stride,
-    /*bias_batch_stride*/ N,
-    /*output_batch_stride*/ M * N,
-    input_a_stride,
-    input_b_stride,
-    /*bias_stride*/ 0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,          // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    input_a_batch_stride,                                    // int64_t batch_stride_A
+    input_b_batch_stride,                                    // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    input_a_stride,                                          // typename LayoutA::Stride::LongIndex lda
+    input_b_stride,                                          // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -53,22 +53,22 @@
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
new file mode 100644
index 000000000..0dd38d9d7
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
@@ -0,0 +1,135 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for
+C = UnaryOp2(BinaryOp2(BinaryOp1(UnaryOp1(GeMM(A, B) + bias), D1), D2)),
+"""
+from ... import registry
+from . import common, common_bias_broadcast
+from .layout import RCR
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+UNARY_IDENTITY = "cutlass::epilogue::thread::Identity"
+UNARY_RELU = "cutlass::epilogue::thread::ReLu"
+UNARY_SIGMOID = "cutlass::epilogue::thread::Sigmoid"
+UNARY_TANH = "cutlass::epilogue::thread::Tanh"
+BINARY_PLUS = "cutlass::plus"
+BINARY_MULTIPLY = "cutlass::multiplies"
+
+
+_CONFIGS = [
+    # gemm_rcr_bias_add
+    ["add", UNARY_IDENTITY, BINARY_PLUS, None, UNARY_IDENTITY],
+    # gemm_rcr_bias_add_add
+    ["add_add", UNARY_IDENTITY, BINARY_PLUS, BINARY_PLUS, UNARY_IDENTITY],
+    # gemm_rcr_bias_add_relu
+    ["add_relu", UNARY_IDENTITY, BINARY_PLUS, None, UNARY_RELU],
+    # gemm_rcr_bias_add_add_relu
+    ["add_add_relu", UNARY_IDENTITY, BINARY_PLUS, BINARY_PLUS, UNARY_RELU],
+    # gemm_rcr_bias_mul
+    ["mul", UNARY_IDENTITY, BINARY_MULTIPLY, None, UNARY_IDENTITY],
+    # gemm_rcr_bias_mul_add
+    ["mul_add", UNARY_IDENTITY, BINARY_MULTIPLY, BINARY_PLUS, UNARY_IDENTITY],
+    # gemm_rcr_bias_mul_tanh
+    ["mul_tanh", UNARY_IDENTITY, BINARY_MULTIPLY, None, UNARY_TANH],
+    # gemm_rcr_bias_sigmoid_mul_tanh
+    ["sigmoid_mul_tanh", UNARY_SIGMOID, BINARY_MULTIPLY, None, UNARY_TANH],
+    # gemm_rcr_bias_sigmoid_mul
+    ["sigmoid_mul", UNARY_SIGMOID, BINARY_MULTIPLY, None, UNARY_IDENTITY],
+]
+
+
+def gemm_rcr_config(func_attrs, dtype="float16"):
+    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
+
+
+def gen_profiler_template(unary_op1, binary_op1, binary_op2, unary_op2):
+    def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+        return common_bias_broadcast.gen_profiler(
+            func_attrs,
+            workdir,
+            profiler_filename,
+            dim_info_dict,
+            RCR,
+            unary_op1,
+            binary_op1,
+            binary_op2,
+            unary_op2,
+        )
+
+    return gen_profiler
+
+
+def gen_function_template(unary_op1, binary_op1, binary_op2, unary_op2):
+    def gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+    ):
+        return common_bias_broadcast.gen_function(
+            func_attrs,
+            exec_cond_template,
+            dim_info_dict,
+            RCR,
+            unary_op1,
+            binary_op1,
+            binary_op2,
+            unary_op2,
+        )
+
+    return gen_function
+
+
+def gen_function_decl(func_attrs):
+    return common_bias_broadcast.gen_function_decl(func_attrs)
+
+
+def gen_function_call(func_attrs, indent="  "):
+    return common_bias_broadcast.gen_function_call(func_attrs, indent)
+
+
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
+
+
+for conf in _CONFIGS:
+    name, unary_op1, binary_op1, binary_op2, unary_op2 = conf
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.config")(gemm_rcr_config)
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.gen_profiler")(
+        gen_profiler_template(unary_op1, binary_op1, binary_op2, unary_op2)
+    )
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.gen_function")(
+        gen_function_template(unary_op1, binary_op1, binary_op2, unary_op2)
+    )
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.func_decl")(gen_function_decl)
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.func_call")(gen_function_call)
+    registry.reg(f"cuda.gemm_rcr_bias_{name}.filter")(function_filter)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index 12af54f6a..d6bcb1b16 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -62,22 +62,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index b4617b9d6..9c0deed3e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -26,22 +26,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index a0952d345..084baa41e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -26,22 +26,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index eae96241c..fb7c0e17d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -27,22 +27,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                                            // GemmUniversalMode mode
+    {M, N, K},                                                                          // GemmCoord problem_size
+    split_k,                                                                            // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                                                      // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                                                      // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                                                   // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,                                     // void * ptr_D
+    M * K,                                                                              // int64_t batch_stride_A
+    N * K,                                                                              // int64_t batch_stride_B
+    N,                                                                                  // int64_t batch_stride_C
+    M * N,                                                                              // int64_t batch_stride_D
+    K,                                                                                  // typename LayoutA::Stride::LongIndex lda
+    K,                                                                                  // typename LayoutB::Stride::LongIndex ldb
+    0,                                                                                  // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                                                      // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index e8ea6a976..fd49dad6b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -27,22 +27,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index e4c082580..3899a79bb 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -27,22 +27,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 934c9a1c0..18c889a13 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -62,22 +62,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_input_type}}*)(bias_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
index 791f3e300..d63c87c49 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -64,22 +64,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*) a_ptr,
-    ({{elem_input_type}}*) b_ptr,
-    nullptr,
-    ({{elem_output_type}}*) (c_ptr) + output_offset,
-    M * K,
-    N * K,
-    N,
-    M * N,
-    K,
-    K,
-    0,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*) a_ptr,                            // void const * ptr_A
+    ({{elem_input_type}}*) b_ptr,                            // void const * ptr_B
+    nullptr,                                                 // void const * ptr_C
+    ({{elem_output_type}}*) (c_ptr) + output_offset,         // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
index b5f1cc9da..8931b62f7 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
@@ -45,22 +45,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_output_type}}*)(c_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    M * N,
-    M * N,
-    K,
-    K,
-    N,
-    output_stride
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_output_type}}*)(c_ptr),                          // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    M * N,                                                   // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    K,                                                       // typename LayoutB::Stride::LongIndex ldb
+    N,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -70,19 +70,12 @@ def gemm_rcr_permute_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
@@ -120,7 +113,8 @@ def common_gen_profiler(
 
 
 @registry.reg("cuda.gemm_rcr_permute.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict, extra_code=""):
+    extra_code = f"{common_permute.EXTRA_CODE.render()}\n{extra_code}"
     return common_gen_profiler(
         func_attrs,
         workdir,
@@ -128,7 +122,7 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         dim_info_dict,
         common.SRC_TEMPLATE,
         PROBLEM_ARGS_TEMPLATE,
-        extra_code=common_permute.EXTRA_CODE.render(),
+        extra_code=extra_code,
     )
 
 
@@ -138,6 +132,7 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
     problem_args_template=None,
+    extra_code="",
 ):
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
@@ -160,6 +155,7 @@ def gen_function(
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    extra_code = f"{common_permute.EXTRA_CODE.render()}\n{extra_code}"
     return common_permute.gen_function(
         func_attrs,
         common.SRC_TEMPLATE,
@@ -174,7 +170,7 @@ def gen_function(
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
         ),
-        extra_code=common_permute.EXTRA_CODE.render(),
+        extra_code=extra_code,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
new file mode 100644
index 000000000..e9741f320
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
@@ -0,0 +1,209 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for
+C = permute(elu(GeMM(A, B) + bias) + 1.0)
+where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N]
+"""
+import jinja2
+
+from ... import registry
+from . import gemm_rcr_permute
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+EXTRA_CODE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/constants.h"
+#include "cutlass/complex.h"
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+#define CUDA_FP16_ZERO \
+  __half {             \
+    0x0u               \
+  }
+
+#define CUDA_FP16_ONE \
+  __half_raw {        \
+    0x3c00u           \
+  }
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+// ELU(x; alpha = 1) + 1
+template <typename T>
+struct ELUp1 {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& scalar) const {
+    return scalar >= T(0) ? scalar + T(1) : fast_exp(scalar);
+  }
+
+  using Params = LinearCombinationGenericParams<T>;
+
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& scalar, Params const& params_) const {
+    return this->operator()(scalar);
+  }
+};
+
+template <>
+struct ELUp1<cutlass::half_t> {
+  CUTLASS_DEVICE
+  cutlass::half_t operator()(cutlass::half_t const& scalar) const {
+    half s = (half)scalar;
+    return (cutlass::half_t)(
+        __hadd(
+            __hmul(__hgt(s, CUDA_FP16_ZERO), __hadd(s, CUDA_FP16_ONE)),
+            __hmul(__hle(s, CUDA_FP16_ZERO), hexp(s))
+        )
+    );
+  }
+
+  using Params = LinearCombinationGenericParams<cutlass::half_t>;
+
+  CUTLASS_DEVICE
+  cutlass::half_t operator()(cutlass::half_t const& scalar, Params const& params_) const {
+    return this->operator()(scalar);
+  }
+};
+
+template <typename T, int N>
+struct ELUp1<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& value) const {
+    Array<T, N> y;
+    ELUp1<T> elup1_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      y[i] = elup1_op(value[i]);
+    }
+
+    return y;
+  }
+
+  using Params = LinearCombinationGenericParams<T>;
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& value, Params const& params_)
+      const {
+    return this->operator()(value);
+  }
+};
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+using LinearCombinationELUp1 = LinearCombinationGeneric<ELUp1, ElementOutput_, Count, ElementAccumulator_,
+                                                          ElementCompute_, Scale, Round, false>;
+
+// The last template argument above (IsHeavy) being "false" is important for the functor
+// (here: ELUp1) to be inlined. Otherwise, the performance of the epilogue may worsen.
+// https://github.com/NVIDIA/cutlass/blob/7bdba07310b497e75c8377031e524fadc929b849/include/cutlass/epilogue/threadblock/epilogue_base.h#L74-L81
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+"""
+)
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.config")
+def gemm_rcr_permute_elup1_config(
+    func_attrs,
+    dtype="float16",
+):
+    gemm_rcr_permute.gemm_rcr_permute_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.gen_profiler")
+def gemm_rcr_permute_elup1_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+):
+    return gemm_rcr_permute.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.gen_function")
+def gemm_rcr_permute_elup1_gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+    problem_args_template=None,
+):
+    return gemm_rcr_permute.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=problem_args_template,
+        extra_code=EXTRA_CODE.render(),
+    )
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.func_decl")
+def gemm_rcr_permute_elup1_func_decl(func_attrs):
+    return gemm_rcr_permute.gen_function_decl(func_attrs)
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.func_call")
+def gemm_rcr_permute_elup1_func_call(
+    func_attrs,
+    indent="  ",
+):
+    return gemm_rcr_permute.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
+
+
+@registry.reg("cuda.gemm_rcr_permute_elup1.filter")
+def gemm_rcr_permute_elup1_filter(
+    cfg,
+    func_attrs,
+    ab_alignment,
+):
+    return gemm_rcr_permute.function_filter(
+        cfg=cfg,
+        func_attrs=func_attrs,
+        ab_alignment=ab_alignment,
+    )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 90654c06f..306280e20 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -45,22 +45,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_output_type}}*)(c_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    M * N,
-    M * N,
-    K,
-    N,
-    N,
-    output_stride,
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_output_type}}*)(c_ptr),                          // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    M * N,                                                   // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    N,                                                       // typename LayoutB::Stride::LongIndex ldb
+    N,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -70,19 +70,12 @@ def gemm_rrr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
index 4b7ced1ea..f34e8315d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
@@ -46,22 +46,22 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-    cutlass::gemm::GemmUniversalMode::kGemm,
-    {M, N, K},
-    split_k,
-    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-    ({{elem_input_type}}*)(a_ptr),
-    ({{elem_input_type}}*)(b_ptr),
-    ({{elem_output_type}}*)(c_ptr),
-    ({{elem_output_type}}*)(c_ptr) + output_offset,
-    M * K,
-    N * K,
-    M * N,
-    M * N,
-    K,
-    N,
-    N,
-    output_stride,
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    {M, N, K},                                               // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_output_type}}*)(c_ptr),                          // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    N * K,                                                   // int64_t batch_stride_B
+    M * N,                                                   // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    N,                                                       // typename LayoutB::Stride::LongIndex ldb
+    N,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
 """
 )
 
@@ -71,19 +71,12 @@ def gemm_rrr_permute_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
index c18ef3e5f..b270f99d4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
@@ -24,18 +24,18 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-        problem_sizes_device,
-        problem_count,
-        threadblock_count,
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},
-        ({{elem_input_type}}**)(ptr_A),
-        ({{elem_input_type}}**)(ptr_B),
-        ({{elem_input_type}}**)(ptr_bias),
-        ({{elem_output_type}}**)ptr_C,
-        lda,
-        ldb,
-        ldc,
-        ldd
+    problem_sizes_device,                                    // GemmCoord *problem_sizes
+    problem_count,                                           // int problem_count
+    threadblock_count,                                       // int threadblock_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params output_op
+    ({{elem_input_type}}**)(ptr_A),                          // ElementA ** ptr_A
+    ({{elem_input_type}}**)(ptr_B),                          // ElementB ** ptr_B
+    ({{elem_input_type}}**)(ptr_bias),                       // ElementC ** ptr_C
+    ({{elem_output_type}}**)ptr_C,                           // ElementC ** ptr_D
+    lda,                                                     // typename LayoutA::Stride::LongIndex *lda
+    ldb,                                                     // typename LayoutB::Stride::LongIndex *ldb
+    ldc,                                                     // typename LayoutC::Stride::LongIndex *ldc
+    ldd,                                                     // typename LayoutC::Stride::LongIndex *ldd
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
index 83f0e2aa0..6011a6bad 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
@@ -25,24 +25,24 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-        problem_sizes_device,
-        problem_count,
-        threadblock_count,
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-        ({{elem_input_type}}**)(ptr_A),
-        ({{elem_input_type}}**)(ptr_B),
-        ({{elem_output_type}}**)(ptr_C),
-        ({{elem_output_type}}**)(ptr_C),
-        lda,
-        ldb,
-        ldc,
-        ldc
+    problem_sizes_device,                                    // GemmCoord *problem_sizes
+    problem_count,                                           // int problem_count
+    threadblock_count,                                       // int threadblock_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params output_op
+    ({{elem_input_type}}**)(ptr_A),                          // ElementA ** ptr_A
+    ({{elem_input_type}}**)(ptr_B),                          // ElementB ** ptr_B
+    ({{elem_output_type}}**)(ptr_C),                         // ElementC ** ptr_C
+    ({{elem_output_type}}**)(ptr_C),                         // ElementC ** ptr_D
+    lda,                                                     // typename LayoutA::Stride::LongIndex *lda
+    ldb,                                                     // typename LayoutB::Stride::LongIndex *ldb
+    ldc,                                                     // typename LayoutC::Stride::LongIndex *ldc
+    ldc,                                                     // typename LayoutC::Stride::LongIndex *ldd
 """
 )
 
 
 @registry.reg("cuda.group_gemm_rcr.config")
-def group_rcr_config(func_attrs, dtype="float16"):
+def group_rcr_config(func_attrs):
     common.make_fproc(func_attrs, RCR)
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
index 88c348d2e..0f395982d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
@@ -22,8 +22,8 @@
 
 
 @registry.reg("cuda.group_gemm_rcr_bias.config")
-def group_rcr_config(func_attrs, dtype="float16"):
-    group_gemm_rcr.group_rcr_config(func_attrs, dtype)
+def group_rcr_config(func_attrs):
+    group_gemm_rcr.group_rcr_config(func_attrs)
 
 
 @registry.reg("cuda.group_gemm_rcr_bias.gen_profiler")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
index fc43233da..b295fc1e4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -22,8 +22,8 @@
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_relu.config")
-def group_rcr_config(func_attrs, dtype="float16"):
-    group_gemm_rcr.group_rcr_config(func_attrs, dtype)
+def group_rcr_config(func_attrs):
+    group_gemm_rcr.group_rcr_config(func_attrs)
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_relu.gen_profiler")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index bce93b575..4f05d1108 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -22,8 +22,8 @@
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_sigmoid.config")
-def group_rcr_config(func_attrs, dtype="float16"):
-    group_gemm_rcr.group_rcr_config(func_attrs, dtype)
+def group_rcr_config(func_attrs):
+    group_gemm_rcr.group_rcr_config(func_attrs)
 
 
 @registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_profiler")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/layout.py b/python/aitemplate/backend/cuda/gemm_universal/layout.py
index 8bab2b98e..568fa48c6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/layout.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/layout.py
@@ -77,3 +77,53 @@ def cutlass_lib_layouts():
             cutlass_lib.library.LayoutType.ColumnMajor,
             cutlass_lib.library.LayoutType.RowMajor,
         ]
+
+
+@dataclass
+class RRR(Layout):
+    """
+    Layout: A[RowMajor], B[RowMajor], C[RowMajor]
+    """
+
+    cutlass_layout_a = "cutlass::layout::RowMajor"
+    cutlass_layout_b = "cutlass::layout::RowMajor"
+    cutlass_layout_c = "cutlass::layout::RowMajor"
+    stride_a = "K"
+    stride_b = "N"
+    stride_c = "N"
+
+    args_parser = """
+  int64_t a_dim0 = M;
+  int64_t a_dim1 = K;
+  int64_t b_dim0 = K;
+  int64_t b_dim1 = N;
+  int64_t c_dim0 = M;
+  int64_t c_dim1 = N;
+"""
+
+    @staticmethod
+    def fproc_op(op):
+        import cutlass_lib
+
+        row_major = cutlass_lib.library.LayoutType.RowMajor
+        op.C.layout = row_major
+
+    @staticmethod
+    def fcond_op(op):
+        import cutlass_lib
+
+        row_major = cutlass_lib.library.LayoutType.RowMajor
+        return op.A.layout == row_major and op.B.layout == row_major
+
+    @staticmethod
+    def cutlass_lib_layouts():
+        """
+        return [layout_a, layout_b, layout_c] in the form of cutlass_lib definitions
+        """
+        import cutlass_lib
+
+        return [
+            cutlass_lib.library.LayoutType.RowMajor,
+            cutlass_lib.library.LayoutType.RowMajor,
+            cutlass_lib.library.LayoutType.RowMajor,
+        ]
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
index 7d1741c52..20e688383 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
@@ -46,19 +46,12 @@ def gemm_ccr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
index 76ac6533b..77a59f21a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -87,19 +87,12 @@ def config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
index 3d08f0291..98daa99b9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
@@ -47,19 +47,12 @@ def gemm_crc_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.ColumnMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
index c414816d8..36b9ceda1 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
@@ -103,19 +103,12 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
index 2f8d35522..2b3d78fd6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
@@ -72,19 +72,12 @@ def gemm_rrr_config(func_attrs, dtype="float16"):
     def fproc(op):
         import cutlass_lib
 
-        from ...backend_spec import CUDASpec
-
-        backend_spec = CUDASpec()
-        elem_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-
         return common.default_fproc(
             op=op,
             a_layout=cutlass_lib.library.LayoutType.RowMajor,
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            elem_type=elem_type,
+            dtype=func_attrs["inputs"][0].dtype(),
             epiligue_name=func_attrs["epilogue"],
         )
 
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index d1a48f28b..1213743b5 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -85,7 +85,7 @@
 {{func_signature}}
 {
 
-    return invokeGroupNorm_{{elem_input_type}}<{{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
+    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
             static_cast<{{elem_input_type}}*>(output),
             static_cast<{{elem_input_type}}*>(input),
             static_cast<{{elem_input_type}}*>(gamma),
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index 2a22ed903..78566b9e4 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -15,7 +15,7 @@
 #ifndef GROUPNORM_KERNEL_CUH
 #define GROUPNORM_KERNEL_CUH
 
-#define FINAL_MASK 0xffffffff
+constexpr uint32_t kFinalMask = 0xffffffff;
 
 #ifndef GROUP_NORM_CUDA_CHECK
 #define GROUP_NORM_CUDA_CHECK(expr)                                       \
@@ -41,22 +41,41 @@ __inline__ __device__ float sigmoid(float val) {
   return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f;
 }
 
-__device__ half constant_half() {
-  uint16_t bits = 0x3800u;
+__inline__ __device__ half constant_half() {
+  const uint16_t bits = 0x3800u;
   return reinterpret_cast<half const&>(bits);
 }
 
-__device__ half one() {
-  uint16_t bits = 0x3c00u;
+__inline__ __device__ half one() {
+  const uint16_t bits = 0x3c00u;
   return reinterpret_cast<half const&>(bits);
 }
 
 __inline__ __device__ half hsigmoid(half a) {
-  half half_val = constant_half();
-  half one_val = one();
+  const half half_val = constant_half();
+  const half one_val = one();
   return __hmul((__hadd(fast_tanh(__hmul(a, half_val)), one_val)), half_val);
 }
 
+template <typename T>
+struct FSigmoid {
+  __inline__ __device__ T operator()(T input);
+};
+
+template <>
+struct FSigmoid<half> {
+  __inline__ __device__ half operator()(half a) {
+    return hsigmoid(a);
+  }
+};
+
+template <>
+struct FSigmoid<float> {
+  __inline__ __device__ float operator()(float a) {
+    return sigmoid(a);
+  }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // The Groupnorm implementation below is based on OneFlow's Layernorm
 // implementation at:
@@ -157,10 +176,10 @@ __inline__ __device__ void WelfordWarpReduce(
   *m2 = thread_m2;
   *count = thread_count;
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
-    T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
-    T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
+    T b_mean = __shfl_down_sync(kFinalMask, *mean, mask, thread_group_width);
+    T b_m2 = __shfl_down_sync(kFinalMask, *m2, mask, thread_group_width);
     int b_count =
-        __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+        __shfl_down_sync(kFinalMask, *count, mask, thread_group_width);
     WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
   }
 }
@@ -347,7 +366,37 @@ __inline__ __device__ T BlockAllReduce(T val) {
   return result_broadcast;
 }
 
+namespace detail {
+
+template <typename TInput>
+struct TInputHelper;
+
+template <>
+struct TInputHelper<half> {
+  typedef __half2 vec2_type;
+  static __inline__ __device__ float2 to_float2(vec2_type a) {
+    return __half22float2(a);
+  }
+  static __inline__ __device__ vec2_type to_vec2(float2 a) {
+    return __float22half2_rn(a);
+  }
+};
+
+template <>
+struct TInputHelper<float> {
+  typedef float2 vec2_type;
+  static __inline__ __device__ float2 to_float2(vec2_type a) {
+    return a;
+  }
+  static __inline__ __device__ vec2_type to_vec2(float2 a) {
+    return a;
+  }
+};
+
+} // namespace detail
+
 template <
+    typename TInput,
     bool FuseSwish,
     int H,
     int W,
@@ -357,37 +406,43 @@ template <
     int BANK_CONFLICT = 0,
     int NUM_THREADS = 1024>
 __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
-    const half* X,
-    half* Y,
-    half* gamma,
-    half* beta,
+    const TInput* X,
+    TInput* Y,
+    TInput* gamma,
+    TInput* beta,
     int N,
     float epsilon) {
   constexpr int C_G_2 = C_G / 2;
   constexpr int C_G_stride = C_G_2 + BANK_CONFLICT;
   extern __shared__ int svals_[];
-  auto* svals = reinterpret_cast<__half2*>(&svals_[0]);
+  using vec2_type = typename detail::TInputHelper<TInput>::vec2_type;
+  auto to_float2 = detail::TInputHelper<TInput>::to_float2;
+  auto to_vec2 = detail::TInputHelper<TInput>::to_vec2;
+  auto* svals = reinterpret_cast<vec2_type*>(&svals_[0]);
 
-  int32_t g = blockIdx.x;
-  int32_t start_c = g * C_G;
-  int32_t n = blockIdx.y;
+  const int32_t g = blockIdx.x;
+  const int32_t start_c = g * C_G;
+  const int32_t n = blockIdx.y;
 
   // X: [N, H, W, C]
-  int32_t strides[4] = {H * W * C, W * C, C, 1};
+  // last stride is 1
+  const int32_t src_strides[3] = {H * W * C, W * C, C};
+  const int32_t smem_strides[2] = {W * C_G_stride, C_G_stride};
   for (int32_t load_idx = threadIdx.x; load_idx < H / ILP * W * C_G_2;
        load_idx += blockDim.x) {
-    auto c_g_2 = load_idx % C_G_2;
-    auto w = (load_idx / C_G_2) % W;
-    auto h_ilp = ((load_idx / C_G_2) / W);
+    const auto c_g_2 = load_idx % C_G_2;
+    const auto w = (load_idx / C_G_2) % W;
+    const auto h_ilp = ((load_idx / C_G_2) / W);
 
 #pragma unroll ILP
     for (auto ii = 0; ii < ILP; ++ii) {
-      const __half2* src = reinterpret_cast<const __half2*>(
-          &(X[n * strides[0] + (h_ilp * ILP + ii) * strides[1] +
-              w * strides[2] + (start_c + c_g_2 * 2)]));
-      __half2* dst =
-          &svals[(h_ilp * ILP + ii) * W * C_G_stride + w * C_G_stride + c_g_2];
-      cutlass::arch::cp_async_zfill<sizeof(__half2)>(dst, src, true);
+      const vec2_type* const src = reinterpret_cast<const vec2_type*>(
+          &(X[n * src_strides[0] + (h_ilp * ILP + ii) * src_strides[1] +
+              w * src_strides[2] + (start_c + c_g_2 * 2)]));
+      vec2_type* const dst = &svals
+                                 [(h_ilp * ILP + ii) * smem_strides[0] +
+                                  w * smem_strides[1] + c_g_2];
+      cutlass::arch::cp_async_zfill<sizeof(vec2_type)>(dst, src, true);
     }
   }
   cutlass::arch::cp_async_wait<0>();
@@ -395,14 +450,14 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
   float thread_sum = 0;
   for (int32_t load_idx = threadIdx.x; load_idx < H / ILP * W * C_G_2;
        load_idx += blockDim.x) {
-    auto c_g_2 = load_idx % C_G_2;
-    auto w = (load_idx / C_G_2) % W;
-    auto h_ilp = ((load_idx / C_G_2) / W);
+    const auto c_g_2 = load_idx % C_G_2;
+    const auto w = (load_idx / C_G_2) % W;
+    const auto h_ilp = ((load_idx / C_G_2) / W);
 #pragma unroll ILP
     for (auto ii = 0; ii < ILP; ++ii) {
-      half2 valh =
-          svals[(h_ilp * ILP + ii) * W * C_G_stride + w * C_G_stride + c_g_2];
-      float2 val = __half22float2(valh);
+      const vec2_type valh = svals
+          [(h_ilp * ILP + ii) * smem_strides[0] + w * smem_strides[1] + c_g_2];
+      const float2 val = to_float2(valh);
       thread_sum += val.x + val.y;
     }
   }
@@ -413,15 +468,15 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
   float thread_sq_sum = 0;
   for (int32_t load_idx = threadIdx.x; load_idx < H / ILP * W * C_G_2;
        load_idx += blockDim.x) {
-    auto c_g_2 = load_idx % C_G_2;
-    auto w = (load_idx / C_G_2) % W;
-    auto h_ilp = ((load_idx / C_G_2) / W);
+    const auto c_g_2 = load_idx % C_G_2;
+    const auto w = (load_idx / C_G_2) % W;
+    const auto h_ilp = ((load_idx / C_G_2) / W);
 
 #pragma unroll ILP
     for (auto ii = 0; ii < ILP; ++ii) {
-      half2 valh =
-          svals[(h_ilp * ILP + ii) * W * C_G_stride + w * C_G_stride + c_g_2];
-      float2 val = __half22float2(valh);
+      const vec2_type valh = svals
+          [(h_ilp * ILP + ii) * smem_strides[0] + w * smem_strides[1] + c_g_2];
+      const float2 val = to_float2(valh);
       thread_sq_sum += (val.x - block_mean) * (val.x - block_mean) +
           (val.y - block_mean) * (val.y - block_mean);
     }
@@ -434,34 +489,37 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
 
   for (int32_t load_idx = threadIdx.x; load_idx < H / ILP * W * C_G_2;
        load_idx += blockDim.x) {
-    auto c_g_2 = load_idx % C_G_2;
-    auto w = (load_idx / C_G_2) % W;
-    auto h_ilp = ((load_idx / C_G_2) / W);
-
-    auto g = __half22float2(
-        *reinterpret_cast<const __half2*>(&gamma[start_c + c_g_2 * 2]));
-    g.x *= block_inv_std;
-    g.y *= block_inv_std;
-    auto b = __half22float2(
-        *reinterpret_cast<const __half2*>(&beta[start_c + c_g_2 * 2]));
+    const auto c_g_2 = load_idx % C_G_2;
+    const auto w = (load_idx / C_G_2) % W;
+    const auto h_ilp = ((load_idx / C_G_2) / W);
+
+    const auto dst_stride3_offset = start_c + c_g_2 * 2;
+    const auto g_v2 =
+        *reinterpret_cast<const vec2_type*>(gamma + dst_stride3_offset);
+    auto g_f2 = to_float2(g_v2);
+    g_f2.x *= block_inv_std;
+    g_f2.y *= block_inv_std;
+    const auto b_v2 =
+        *reinterpret_cast<const vec2_type*>(beta + dst_stride3_offset);
+    const auto b_f2 = to_float2(b_v2);
 
 #pragma unroll ILP
     for (auto ii = 0; ii < ILP; ++ii) {
-      __half2* src =
-          &svals[(h_ilp * ILP + ii) * W * C_G_stride + w * C_G_stride + c_g_2];
-      __half2* dst = reinterpret_cast<__half2*>(
-          &(Y[n * strides[0] + (h_ilp * ILP + ii) * strides[1] +
-              w * strides[2] + (start_c + c_g_2 * 2)]));
+      const vec2_type src = svals
+          [(h_ilp * ILP + ii) * smem_strides[0] + w * smem_strides[1] + c_g_2];
+      vec2_type* const dst = reinterpret_cast<vec2_type*>(
+          &(Y[n * src_strides[0] + (h_ilp * ILP + ii) * src_strides[1] +
+              w * src_strides[2] + dst_stride3_offset]));
 
-      auto fsrc = __half22float2(*src);
+      const auto fsrc = to_float2(src);
       float2 result;
-      result.x = (fsrc.x - block_mean) * g.x + b.x;
-      result.y = (fsrc.y - block_mean) * g.y + b.y;
+      result.x = (fsrc.x - block_mean) * g_f2.x + b_f2.x;
+      result.y = (fsrc.y - block_mean) * g_f2.y + b_f2.y;
       if (FuseSwish) {
         result.x = result.x * sigmoid(result.x);
         result.y = result.y * sigmoid(result.y);
       }
-      *dst = __float22half2_rn(result);
+      *dst = to_vec2(result);
     }
   }
 }
@@ -560,7 +618,7 @@ struct AffineStore {
       gamma_val = gamma[gamma_beta_offset];
       beta_val = beta[gamma_beta_offset];
     }
-
+    FSigmoid<DST> fsigmoid;
 #pragma unroll
     for (int i = 0; i < PackSize; ++i) {
       DST normalized_i = static_cast<DST>(src[i]);
@@ -571,7 +629,7 @@ struct AffineStore {
         y_pack.elem[i] = normalized_i;
       }
       if (FuseSwish) {
-        y_pack.elem[i] = y_pack.elem[i] * hsigmoid(y_pack.elem[i]);
+        y_pack.elem[i] = y_pack.elem[i] * fsigmoid(y_pack.elem[i]);
       }
     }
     *(reinterpret_cast<layer_norm::PackType<DST, PackSize>*>(y) +
@@ -673,6 +731,7 @@ struct ChannelsLastStore {
             gamma_beta_offset);
     }
 
+    FSigmoid<DST> fsigmoid;
 #pragma unroll
     for (int i = 0; i < PackSize; ++i) {
       DST normalized_i = static_cast<DST>(src[i]);
@@ -683,7 +742,7 @@ struct ChannelsLastStore {
         y_pack.elem[i] = normalized_i;
       }
       if (FuseSwish) {
-        y_pack.elem[i] = y_pack.elem[i] * hsigmoid(y_pack.elem[i]);
+        y_pack.elem[i] = y_pack.elem[i] * fsigmoid(y_pack.elem[i]);
       }
     }
     *(reinterpret_cast<layer_norm::PackType<DST, PackSize>*>(y) + y_offset) =
@@ -846,12 +905,12 @@ void DispatchGroupNormForwardGpu(
   }
 }
 
-template <bool FuseSwish, int H, int W, int C, int G>
-cudaError_t invokeGroupNorm_half(
-    half* output,
-    half* input,
-    half* gamma,
-    half* beta,
+template <typename TInput, bool FuseSwish, int H, int W, int C, int G>
+cudaError_t invokeGroupNorm(
+    TInput* output,
+    TInput* input,
+    TInput* gamma,
+    TInput* beta,
     int N,
     const float eps,
     const int max_smem_size,
@@ -868,58 +927,66 @@ cudaError_t invokeGroupNorm_half(
   const double epsilon = eps;
   bool channels_first = false;
 
-  // Use a little big more shared_memory to reduce occupancy and boost perf.
+  // Use a little bit more shared_memory to reduce occupancy and boost perf.
   constexpr int MEM_BANK_CONFLICT = 1;
 
   // Bank conflict doesn't seem to matter to perf
   constexpr int BANK_CONFLICT = 0;
 
-  const auto smem = H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(uint16_t);
+  constexpr auto smem =
+      H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
 
   // C_G must be even, or we can have misaligned address for cp.async
   // reserve some shared_mem for block reduction
   if (H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
-    GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
-        group_norm_smem<FuseSwish, H, W, C, C_G, ILP, BANK_CONFLICT>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        smem));
-
     constexpr int num_threads = std::min(1024, H / ILP * W * C_G_2);
+
     if constexpr (num_threads > 0) {
+      auto kernel_func = group_norm_smem<
+          TInput,
+          FuseSwish,
+          H,
+          W,
+          C,
+          C_G,
+          ILP,
+          BANK_CONFLICT,
+          num_threads>;
+      GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
+          kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
       dim3 block(num_threads);
-      group_norm_smem<FuseSwish, H, W, C, C_G, ILP, BANK_CONFLICT, num_threads>
-          <<<dim3(G, N), block, smem, stream>>>(
-              input, output, gamma, beta, N, eps);
+      kernel_func<<<dim3(G, N), block, smem, stream>>>(
+          input, output, gamma, beta, N, eps);
     } else {
-      DispatchGroupNormForwardGpu<half, float, FuseSwish>(
+      DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
           stream,
           num_instances,
           norm_size,
           channel_size,
           spatial_size,
           epsilon,
-          static_cast<half*>(input),
-          static_cast<half*>(gamma),
-          static_cast<half*>(beta),
-          static_cast<half*>(output),
-          reinterpret_cast<float*>(workspace),
-          reinterpret_cast<float*>(workspace + sizeof(float) * num_instances),
+          input,
+          gamma,
+          beta,
+          output,
+          static_cast<float*>(workspace),
+          static_cast<float*>(workspace) + num_instances,
           channels_first);
     }
   } else {
-    DispatchGroupNormForwardGpu<half, float, FuseSwish>(
+    DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
         stream,
         num_instances,
         norm_size,
         channel_size,
         spatial_size,
         epsilon,
-        static_cast<half*>(input),
-        static_cast<half*>(gamma),
-        static_cast<half*>(beta),
-        static_cast<half*>(output),
-        reinterpret_cast<float*>(workspace),
-        reinterpret_cast<float*>(workspace + sizeof(float) * num_instances),
+        input,
+        gamma,
+        beta,
+        output,
+        static_cast<float*>(workspace),
+        static_cast<float*>(workspace) + num_instances,
         channels_first);
   }
 
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
index f91f6dc16..1ce0bd6f3 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
@@ -201,7 +201,7 @@ __global__ void layernorm_sigmoid_mul_stored_locally(
   float local_sums[1] = {0.0f};
   if (tid < quarter_n) {
     local_val =
-        *input_accessor.get<const float4, const float4>(input, offset + tid);
+        *input_accessor.get<const float, const float4>(input, offset + tid);
 
     local_sums[0] = local_val.x + local_val.y + local_val.z + local_val.w;
   }
@@ -466,8 +466,16 @@ __global__ void layernorm_sigmoid_mul(
   __syncthreads();
 
   for (int i = tid; i < n; i += blockDim.x) {
+#ifdef AIT_LAYERNORM_CONST_GAMMA
+    const float gamma_val = AIT_LAYERNORM_CONST_GAMMA;
+#else
     const float gamma_val = static_cast<float>(gamma[i]);
+#endif // AIT_LAYERNORM_CONST_GAMMA
+#ifdef AIT_LAYERNORM_CONST_BETA
+    const float beta_val = AIT_LAYERNORM_CONST_BETA;
+#else
     const float beta_val = static_cast<float>(beta[i]);
+#endif // AIT_LAYERNORM_CONST_BETA
     float local_val = static_cast<float>(
         *input_accessor.get<const T, const T>(input, offset + i));
 
@@ -628,14 +636,7 @@ cudaError_t invokeLayernormSigmoidMul(
     block.x = 512;
     if constexpr (std::is_same<T, half>::value) {
       layernorm_sigmoid_mul<FuseSigmoidMul><<<grid, block, 0, stream>>>(
-          (half*)(output),
-          (const half*)(input),
-          (const half*)(gamma),
-          (const half*)(beta),
-          n,
-          eps,
-          input_accessor,
-          output_accessor);
+          output, input, gamma, beta, n, eps, input_accessor, output_accessor);
       LAYER_NORM_CUDA_CHECK_LAUNCH();
     } else {
       layernorm_sigmoid_mul<T, T_ACC, FuseSigmoidMul>
diff --git a/python/aitemplate/backend/cuda/padding/nhwc3to4.py b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
index fd67dd1ca..2b56539ad 100644
--- a/python/aitemplate/backend/cuda/padding/nhwc3to4.py
+++ b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
@@ -118,7 +118,7 @@
   const int nhw = NI * HI * WI;
   const int nhwc = nhw * 3;
   CHECK_EQ(nhw % 8, 0);
-  const int element_in_Tio = 8;
+  const int element_in_Tio = sizeof(int4) / sizeof(ElemT);
   const int max_input_element = nhwc / element_in_Tio;
   const int max_output_element = nhw * 4 / element_in_Tio;
   const int4 zero_io = {0, 0, 0, 0};
diff --git a/python/aitemplate/backend/cuda/padding/pad_last_dim.py b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
index 601da83ad..a8ac6ec68 100644
--- a/python/aitemplate/backend/cuda/padding/pad_last_dim.py
+++ b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
@@ -227,6 +227,8 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
     elem_input_type2 = None
     if elem_input_type == "half":
         elem_input_type2 = "half2"
+    elif elem_input_type == "float":
+        elem_input_type2 = "float2"
     else:
         raise NotImplementedError(f"unsupported {elem_input_type=}")
     ndim = func_attrs["ndim"]
diff --git a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
index bc18b9a99..d82df77bf 100644
--- a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
@@ -28,7 +28,7 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}avg_pool_launcher<{{kernel_size}}, {{stride}}, {{padding}}>(
+{{indent}}avg_pool_launcher<{{dtype}}, {{kernel_size}}, {{stride}}, {{padding}}>(
 {{indent}}    static_cast<const {{dtype}}*>(in_ptr),
 {{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
@@ -52,14 +52,18 @@
 namespace {
 
 template <int kernel_size, int stride, int padding>
-__global__ void avg_pool_f16_nhwc_kernel(const half2* input,
-                                         half2* output,
-                                         const int N,
-                                         const int H,
-                                         const int W,
-                                         const int C,
-                                         const int HO,
-                                         const int WO) {
+__global__ void avg_pool_nhwc_kernel(const {{dtype}}* input_raw,
+                                     {{dtype}}* output_raw,
+                                     const int N,
+                                     const int H,
+                                     const int W,
+                                     const int C,
+                                     const int HO,
+                                     const int WO) {
+{% set vec_dtype = {"half": "half2", "float": "float2"}[dtype] %}
+  const {{vec_dtype}}* input = (const {{vec_dtype}}*)input_raw;
+  {{vec_dtype}}* output = ({{vec_dtype}}*)output_raw;
+
   const int tid = threadIdx.x;
   const int n_idx = blockIdx.x;
   const int out_h_idx = blockIdx.y;
@@ -85,33 +89,49 @@
       #pragma unroll
       for (int w = w_start_idx; w < w_end_idx; w++) {
         const int idx = (h * W + w) * C;
-        const half2 tmp = __ldg(input + (idx + c_idx));
+        const {{vec_dtype}} tmp = __ldg(input + (idx + c_idx));
+{% if dtype == "half" %}
         avg.x += __half2float(tmp.x);
         avg.y += __half2float(tmp.y);
+{% else %}
+        avg.x += tmp.x;
+        avg.y += tmp.y;
+{% endif %}
       }
     }
 
     avg.x *= norm_factor;
     avg.y *= norm_factor;
+{% if dtype == "half" %}
     output[c_idx] = __float22half2_rn(avg);
+{% else %}
+    output[c_idx] = avg;
+{% endif %}
   }
 }
 
-template <int kernel_size, int stride, int padding>
-void avg_pool_launcher(const cutlass::half_t* input,
-                      cutlass::half_t* output,
-                      const int N,
-                      const int H,
-                      const int W,
-                      const int C,
-                      const int HO,
-                      const int WO,
-                      cudaStream_t stream) {
-  int num_thread = (C / 2) < 256 ? C / 2 : 256;
+template <typename ElemT, int kernel_size, int stride, int padding>
+void avg_pool_launcher(const ElemT* input,
+                       ElemT* output,
+                       const int N,
+                       const int H,
+                       const int W,
+                       const int C,
+                       const int HO,
+                       const int WO,
+                       cudaStream_t stream)
+{
+  int num_thread = C / 2;
+  if (num_thread > 256) {
+      num_thread = 256;
+  } else if (num_thread == 0) {
+      num_thread = 1;
+  }
   dim3 grid(N, HO, WO);
   dim3 block(num_thread);
-  avg_pool_f16_nhwc_kernel<kernel_size, stride, padding><<<grid, block, 0, stream>>>(
-      (const half2*)input, (half2*)output, N, H, W, C / 2, HO, WO);
+  avg_pool_nhwc_kernel<kernel_size, stride, padding>
+      <<<grid, block, 0, stream>>>(input, output, N, H,
+                                   W, C / 2, HO, WO);
 }
 } // namespace
 
@@ -147,7 +167,7 @@ def gen_function(
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -180,7 +200,10 @@ def gen_function(
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return SRC_TEMPLATE.render(
-        function_name=func_name, shape_function=shape_func, exec_paths=exec_paths
+        function_name=func_name,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+        dtype=dtype,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
index 2f1744a5e..1ec46cd10 100644
--- a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
@@ -15,6 +15,7 @@
 """
 Codegen functions for max_pool2d.
 """
+
 import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
@@ -27,7 +28,7 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}max_pooling_launcher<{{kernel_size}}, {{stride}}, {{padding}}>(
+{{indent}}max_pooling_launcher<{{dtype}}, {{kernel_size}}, {{stride}}, {{padding}}>(
 {{indent}}    static_cast<const {{dtype}}*>(in_ptr),
 {{indent}}    static_cast<{{dtype}}*>(out_ptr),
 {{indent}}    NI,
@@ -57,15 +58,19 @@
           int block_ch,
           int block_h,
           int block_w>
-__global__ void max_pool_f16_nhwc_kernel(const half2* input,
-                                         half2* output,
-                                         const int N,
-                                         const int H,
-                                         const int W,
-                                         const int C,
-                                         const int HO,
-                                         const int WO) {
-  half2* shm = (half2*)shared_mem;
+__global__ void max_pool_nhwc_kernel(const {{dtype}}* input_raw,
+                                     {{dtype}}* output_raw,
+                                     const int N,
+                                     const int H,
+                                     const int W,
+                                     const int C,
+                                     const int HO,
+                                     const int WO) {
+{% set vec_dtype = {"half": "half2", "float": "float2"}[dtype] %}
+  const {{vec_dtype}}* input = (const {{vec_dtype}}*)input_raw;
+  {{vec_dtype}}* output = ({{vec_dtype}}*)output_raw;
+  {{vec_dtype}}* shm = ({{vec_dtype}}*)shared_mem;
+
   const int ldg_h = (block_h - 1) * stride + kernel_size;
   const int ldg_w = (block_w - 1) * stride + kernel_size;
   const int ldg_hw_num = ldg_h * ldg_w;
@@ -83,8 +88,13 @@
   const int hw_start_idx_of_thread = threadIdx.y;
   const int ch_thread_idx = threadIdx.x;
 
+{% if dtype == "half" %}
   const half2 min = {static_cast<half>(-65503.0f),
                      static_cast<half>(-65503.0f)};
+{% elif dtype == "float" %}
+  const float2 min = {-(std::numeric_limits<float>::max() - 1),
+                      -(std::numeric_limits<float>::max() - 1)};
+{% endif %}
 
   for (int i = hw_start_idx_of_thread; i < ldg_hw_num; i += block_ch) {
     const int shm_h_idx = i / ldg_w;
@@ -110,7 +120,7 @@
     const int out_w_idx = out_w_start_idx + out_w_offset;
     if (out_h_idx >= 0 && out_h_idx < HO && out_w_idx >= 0 &&
         out_w_idx < WO) {
-      half2 max = min;
+      auto max = min;
 
       const int shm_h_start_idx = out_h_offset * stride;
       const int shm_h_end_idx = shm_h_start_idx + kernel_size;
@@ -124,7 +134,7 @@
              shm_w_idx++) {
           const int shm_idx =
               (shm_h_idx * ldg_w + shm_w_idx) * C + ch_thread_idx;
-          const half2 tmp = shm[shm_idx];
+          const auto tmp = shm[shm_idx];
           max.x = (tmp.x > max.x) ? tmp.x : max.x;
           max.y = (tmp.y > max.y) ? tmp.y : max.y;
         }
@@ -135,9 +145,9 @@
   }
 }
 
-template<int kernel_size, int stride, int pad>
-void max_pooling_launcher(const cutlass::half_t* input,
-                          cutlass::half_t* output,
+template <typename ElemT, int kernel_size, int stride, int pad>
+void max_pooling_launcher(const ElemT* input,
+                          ElemT* output,
                           int NI,
                           int HI,
                           int WI,
@@ -151,13 +161,13 @@
   const int block_h = 4;
   const size_t shm_size = ((block_h - 1) * stride + kernel_size) *
                           ((block_w - 1) * stride + kernel_size) * CI *
-                          sizeof(half);
+                          sizeof(ElemT);
   dim3 grid(NI, (HO + block_h - 1) / block_h,
             (WO + block_w - 1) / block_w);
   dim3 block(CI / 2, block_ch);
-  max_pool_f16_nhwc_kernel<kernel_size, stride, pad, 4, 4, 4>
-      <<<grid, block, shm_size, stream>>>((const half2*)input, (half2*)output, NI, HI,
-                                  WI, CI / 2, HO, WO);
+  max_pool_nhwc_kernel<kernel_size, stride, pad, 4, 4, 4>
+      <<<grid, block, shm_size, stream>>>(input, output, NI, HI,
+                                          WI, CI / 2, HO, WO);
 }
 } // namespace
 
@@ -193,7 +203,7 @@ def gen_function(
     func_name = func_attrs["name"]
     exec_path = func_attrs["exec_path"]
     backend_spec = CUDASpec()
-    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
     shape_eval_func = shape_eval_template.render(
         indent="  ",
         dtype="int64_t ",
@@ -226,7 +236,10 @@ def gen_function(
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return SRC_TEMPLATE.render(
-        function_name=func_name, shape_function=shape_func, exec_paths=exec_paths
+        function_name=func_name,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+        dtype=dtype,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_3d.py b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
index aa3fb6ccb..58563d6d5 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_3d.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
@@ -192,6 +192,7 @@
 #include "cutlass/matrix_shape.h"
 #include "cutlass/numeric_conversion.h"
 #include "cutlass/tensor_ref.h"
+#include "cutlass/fast_math.h"
 
 #ifndef CHECK_ERROR_REDUCE
 #define CHECK_ERROR_REDUCE(expr)                             \\
@@ -272,7 +273,7 @@
   };
 
   struct SharedStorage {
-    cutlass::AlignedArray<ElementCompute, Shape::kCount> exchange;
+    cutlass::AlignedArray<ElementCompute, Shape::kCount, Shape::kCount * alignof(ElementCompute)> exchange;
   };
 
   CUTLASS_DEVICE
@@ -858,7 +859,9 @@ def gen_function(
 
     # FIXME: these alignments values are only for half_t type.
     # make it adjustable to other types such as float.
-    alignments = [16, 8, 4, 2, 1]
+    alignments = [8, 4, 2, 1]
+    if x._attrs["dtype"] in ("float16",):
+        alignments.append(16)
     # This is ugly. Ideally, we should have templated code like below:
     # template <typename Alignment>
     # reduce_launcher(...) {
@@ -902,8 +905,9 @@ def gen_function(
     assert (
         len(output_accessors) == 1
     ), f"expected the length of output_accessors to be one but got {len(output_accessors)}"
+    dtype = func_attrs["inputs"][0].dtype()
     output_alignment = tensor_accessor_codegen.find_max_alignment_for_accessors(
-        output_accessors
+        dtype, output_accessors
     )
     special_exec_path, special_kernel = reduce_small_axis.get_exec_cond_and_kernel(
         func_attrs,
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
index 8bf4c6713..a8c711706 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
@@ -68,15 +68,16 @@
     """
 constexpr const int ThreadsPerBlock = 128;
 
-template <typename ElemT,
+template <typename ElementInput,
+          typename ElementOutput,
           typename ElementCompute,
           typename ReadVecT,
           typename WriteVecT,
           int64_t num_rows_per_thread,
           int64_t num_cols>
 __global__ void reduce_small_in_v_out_v(
-    ElemT *output,
-    const ElemT *input,
+    ElementOutput *output,
+    const ElementInput *input,
     int64_t num_rows,
     int64_t batch_stride_input,
     int64_t batch_stride_output) {
@@ -88,13 +89,13 @@
     return;
   // input within the batch
   int64_t input_offset = idx * num_cols;
-  const ElemT *this_input =
+  const ElementInput *this_input =
       input + block_batch * batch_stride_input + input_offset;
   size_t output_idx = block_batch * batch_stride_output + idx;
-  ElemT *this_output = get_strided_address_at_idx<ElemT, ElemT>(output, output_idx);
+  ElementOutput *this_output = get_strided_address_at_idx<ElementOutput, ElementOutput>(output, output_idx);
 
-  static_assert(sizeof(ReadVecT) % sizeof(ElemT) == 0);
-  constexpr int n_read_elems_in_v = sizeof(ReadVecT) / sizeof(ElemT);
+  static_assert(sizeof(ReadVecT) % sizeof(ElementInput) == 0);
+  constexpr int n_read_elems_in_v = sizeof(ReadVecT) / sizeof(ElementInput);
   // number of original elements
   constexpr int64_t num_elems_per_thread = num_rows_per_thread * num_cols;
   // number of vector elements
@@ -114,7 +115,7 @@
 
   // compute
   using FragmentCompute = ElementCompute;
-  ElemT *read_elems = reinterpret_cast<ElemT *>(read_elems_v);
+  ElementInput *read_elems = reinterpret_cast<ElementInput *>(read_elems_v);
   using ReduceScalarOp = {{reduce_op}}<ElementCompute>;
   ReduceScalarOp reduce_s_op;
   constexpr int num_reduced_elems = num_cols;
@@ -126,8 +127,9 @@
     {{epilogue_scalar_code}}
   };
 
-  ElemT reduced_elems[num_rows_per_thread];
+  ElementOutput reduced_elems[num_rows_per_thread];
   static_assert(num_elems_per_thread % num_cols == 0);
+  cutlass::NumericConverter<ElementCompute, ElementInput> convert_input;
   CUTLASS_PRAGMA_UNROLL
   for (int64_t i = 0; i < num_elems_per_thread / num_cols; i++) {
     static_assert(num_elems_per_thread % num_rows_per_thread == 0);
@@ -135,17 +137,17 @@
     CUTLASS_PRAGMA_UNROLL
     for (int64_t j = 0; j < num_cols; j++) {
       int64_t read_idx = i * num_cols + j;
-      FragmentCompute tmp = prologue_fn(read_elems[read_idx]);
+      FragmentCompute tmp = prologue_fn(convert_input(read_elems[read_idx]));
       frag_compute = reduce_s_op(frag_compute, tmp);
     }
-    cutlass::NumericConverter<ElemT, ElementCompute> convert_output;
+    cutlass::NumericConverter<ElementOutput, ElementCompute> convert_output;
     ElementCompute tmp = epilogue_scalar_fn(frag_compute);
     reduced_elems[i] = convert_output(tmp);
   }
 
   WriteVecT *this_output_v = reinterpret_cast<WriteVecT*>(this_output);
   WriteVecT *reduced_elems_v = reinterpret_cast<WriteVecT*>(&reduced_elems[0]);
-  constexpr int n_write_elems_in_v = sizeof(WriteVecT) / sizeof(ElemT);
+  constexpr int n_write_elems_in_v = sizeof(WriteVecT) / sizeof(ElementOutput);
   CUTLASS_PRAGMA_UNROLL
 {% if output_accessor.is_contiguous %}
   for (int64_t i = 0; i < num_rows_per_thread / n_write_elems_in_v; i++) {
@@ -198,8 +200,9 @@
   if (num_rows % num_rows_per_thread == 0) {
 
 #define HANDLE_ONE_WRITE_VEC(write_bytes, write_vec_type) \\
-    case write_bytes:                                     \\
+    if (write_bytes == num_write_bytes_v) {               \\
       reduce_small_in_v_out_v<ElemInputType,              \\
+                              ElemOutputType,             \\
                               ElemComputeType,            \\
                               {{read_vec_type}},          \\
                               write_vec_type,             \\
@@ -211,20 +214,19 @@
           num_rows,                                       \\
           batch_stride_input,                             \\
           batch_stride_output);                           \\
-      break;                                              \\
-
-    switch(num_write_bytes_v) {
-      HANDLE_ONE_WRITE_VEC(16, uint4)
-      HANDLE_ONE_WRITE_VEC(8, uint2)
-      HANDLE_ONE_WRITE_VEC(4, unsigned)
+      LAUNCH_CHECK_REDUCE();                              \\
+      return;                                             \\
+    }
+    HANDLE_ONE_WRITE_VEC(16, uint4)
+    HANDLE_ONE_WRITE_VEC(8, uint2)
+    HANDLE_ONE_WRITE_VEC(4, unsigned)
+    if constexpr (std::is_same_v<ElemOutputType, cutlass::half_t>) {
       HANDLE_ONE_WRITE_VEC(2, cutlass::half_t)
-      default:
-        throw std::runtime_error("unsupported vector size for write");
     }
+    throw std::runtime_error("unsupported vector size for write");
   } else {
     throw std::runtime_error("unsupported num_row_per_threads");
   }
-  LAUNCH_CHECK_REDUCE();
 }
 
 template <typename ElemOutputType, typename ElemInputType>
diff --git a/python/aitemplate/backend/cuda/reduce/var.py b/python/aitemplate/backend/cuda/reduce/var.py
index 7e61c8445..0cd6fc3e7 100644
--- a/python/aitemplate/backend/cuda/reduce/var.py
+++ b/python/aitemplate/backend/cuda/reduce/var.py
@@ -112,6 +112,39 @@
     : "r"(ptr));
 }
 
+template <>
+CUTLASS_DEVICE
+void shared_load<48>(void *dst, uint32_t ptr) {
+  uint4 *dst_u128 = reinterpret_cast<uint4 *>(dst);
+  asm volatile("ld.shared.v4.u32 {{ '{%0, %1, %2, %3}, [%4]' }};\\n"
+    :
+      "=r"(dst_u128->x),
+      "=r"(dst_u128->y),
+      "=r"(dst_u128->z),
+      "=r"(dst_u128->w)
+    : "r"(ptr));
+
+  dst_u128++;
+  ptr = ptr + sizeof(uint4);
+  asm volatile("ld.shared.v4.u32 {{ '{%0, %1, %2, %3}, [%4]' }};\\n"
+    :
+      "=r"(dst_u128->x),
+      "=r"(dst_u128->y),
+      "=r"(dst_u128->z),
+      "=r"(dst_u128->w)
+    : "r"(ptr));
+
+  dst_u128++;
+  ptr = ptr + sizeof(uint4);
+  asm volatile("ld.shared.v4.u32 {{ '{%0, %1, %2, %3}, [%4]' }};\\n"
+    :
+      "=r"(dst_u128->x),
+      "=r"(dst_u128->y),
+      "=r"(dst_u128->z),
+      "=r"(dst_u128->w)
+    : "r"(ptr));
+}
+
 } // namespace arch
 
 template <typename ElementT, bool BesselCorrection>
@@ -125,7 +158,7 @@
 
   CUTLASS_HOST_DEVICE
   static result_type convert(source_type const & s) {
-    return WelfordData<ElementT, BesselCorrection>(-1, s, ElementT(0));
+    return WelfordData<ElementT, BesselCorrection>(-1, static_cast<ElementT>(s), ElementT(0));
   }
 
   CUTLASS_HOST_DEVICE
@@ -261,10 +294,10 @@ def var_gen_function(func_attrs) -> str:
     """
     bessel = "true" if func_attrs["unbiased"] else "false"
     backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
     )
-    acc_type = f"WelfordData<{elem_input_type}, {bessel}>"
+    acc_type = f"WelfordData<{elem_output_type}, {bessel}>"
     return reduce_3d.gen_function(
         func_attrs,
         "cutlass::welford_op",
diff --git a/python/aitemplate/backend/cuda/reduce/vector_norm.py b/python/aitemplate/backend/cuda/reduce/vector_norm.py
index 21bf195e5..c212a66a1 100644
--- a/python/aitemplate/backend/cuda/reduce/vector_norm.py
+++ b/python/aitemplate/backend/cuda/reduce/vector_norm.py
@@ -34,7 +34,7 @@
 L2_NORM_EPILOGUE_SCALAR_TEMPLATE = jinja2.Template(
     """
 {{indent}}cutlass::NumericConverter<ElementCompute, float> local_converter;
-{{indent}}return local_converter(fast_sqrt(reduced_result));
+{{indent}}return local_converter(cutlass::fast_sqrt(reduced_result));
 """
 )
 
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.cuh b/python/aitemplate/backend/cuda/softmax/softmax.cuh
index 8a6e2317e..f93b186c4 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.cuh
+++ b/python/aitemplate/backend/cuda/softmax/softmax.cuh
@@ -120,6 +120,24 @@ __inline__ __device__ T blockReduceMax(T* val) {
   return (T)0.0f;
 }
 
+namespace detail {
+template <typename T>
+struct numeric_limits_helper {
+  __device__ __host__ static constexpr T lowest() {
+    return platform::numeric_limits<T>::lowest();
+  }
+};
+
+// Cutlass doesn't have `lowest` in their specialization for float,
+// so we define our own helper struct here.
+template <>
+struct numeric_limits_helper<float> {
+  __device__ __host__ static constexpr float lowest() {
+    return std::numeric_limits<float>::lowest();
+  }
+};
+} // namespace detail
+
 // input size: [M, K]
 // Currently the softmax kernel only supports 2D input with dim=1.
 // For input with more dimensions, reshape first.
@@ -167,7 +185,7 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
 
     CUTLASS_PRAGMA_UNROLL
     for (size_t i = 0; i < m; i++) {
-      T max = platform::numeric_limits<T>::lowest();
+      T max = detail::numeric_limits_helper<T>::lowest();
       // find max
       CUTLASS_PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
@@ -214,7 +232,7 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
         input_tile[j] = input[i * K + j];
       }
 
-      T max = platform::numeric_limits<T>::lowest();
+      T max = detail::numeric_limits_helper<T>::lowest();
       // find max
       CUTLASS_PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 5b8b462ab..ee1998202 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -16,6 +16,7 @@
 CUDA target specialization
 """
 import json
+import logging
 import os
 import pipes
 import re
@@ -30,7 +31,7 @@
 
 from aitemplate.backend.target import TargetType
 
-from ...utils import logger
+from ...utils.misc import is_debug
 
 from .. import registry
 from ..target import AIT_STATIC_FILES_PATH, CUTLASS_PATH, Target
@@ -38,6 +39,9 @@
 # pylint: disable=C0415,W0707,W0611,W0702,W1401
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class CUDA(Target):
     """CUDA target."""
 
@@ -87,8 +91,8 @@ def _build_compile_options(self):
             os.path.join(self._template_path, "include"),
             os.path.join(self._template_path, "tools/util/include"),
             os.path.join(self._template_path, "examples/35_gemm_softmax"),
-            os.path.join(self._template_path, "examples/42_fused_multi_head_attention"),
-            os.path.join(self._template_path, "examples/43_dual_gemm"),
+            os.path.join(self._template_path, "examples/41_fused_multi_head_attention"),
+            os.path.join(self._template_path, "examples/45_dual_gemm"),
             os.path.join(
                 flash_attention_path,
                 "./",
@@ -151,11 +155,7 @@ def __enter__(self):
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
-        if (
-            self.lib_folder
-            and os.path.exists(self.lib_folder)
-            and not logger.is_debug()
-        ):
+        if self.lib_folder and os.path.exists(self.lib_folder) and not is_debug():
             shutil.rmtree(self.lib_folder)
 
     def cc(self):
@@ -225,9 +225,7 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             convert_nvcc_json = parutil.get_file_path(
                 os.path.join("aitemplate/testing", "convert_nvcc_cmd")
             )
-            logger.info(
-                __name__, f"Load the nvcc compile option from {convert_nvcc_json}"
-            )
+            _LOGGER.info(f"Load the nvcc compile option from {convert_nvcc_json}")
             with open(convert_nvcc_json, "r") as nvcc_option_json:
                 FBCUDA.nvcc_option_json = json.load(nvcc_option_json)
         self.nvcc_options_json = FBCUDA.nvcc_option_json
@@ -242,9 +240,9 @@ def _build_compile_options(self):
                 os.path.join(self._template_path, "tools/util/include"),
                 os.path.join(self._template_path, "examples/35_gemm_softmax"),
                 os.path.join(
-                    self._template_path, "examples/42_fused_multi_head_attention"
+                    self._template_path, "examples/41_fused_multi_head_attention"
                 ),
-                os.path.join(self._template_path, "examples/43_dual_gemm"),
+                os.path.join(self._template_path, "examples/45_dual_gemm"),
                 os.path.join(self._template_path, "../att_include"),
                 os.path.join(self._template_path, "../att_include/fmha"),
                 os.path.join(self._template_path, "../cub"),
@@ -285,12 +283,12 @@ def _build_compile_options(self):
                 options.append("-DNDEBUG")
             FBCUDA.compile_options_ = " ".join(options)
         compile_options = FBCUDA.compile_options_
-        logger.debug(__name__, f"The compile options are: {compile_options}")
+        _LOGGER.debug(f"The compile options are: {compile_options}")
         return compile_options
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
-        if not logger.is_debug() and self._include_path:
+        if not is_debug() and self._include_path:
             shutil.rmtree(self._include_path)
 
     def binary_compile_cmd(self):
@@ -347,7 +345,7 @@ def remote_logger(cls, record):
             try:
                 AITemplateRemoteLogger.log(record)
             except Exception as e:
-                logger.info(__name__, f"remote_logger failed: {e}")
+                _LOGGER.info(f"remote_logger failed: {e}")
 
     def _load_profile_cache(self):
         """Load local profile cache for this target."""
@@ -356,13 +354,12 @@ def _load_profile_cache(self):
             return
 
         if self.remote_cache_bytes is not None:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 f"Loading profile cache from provided cache content with length {len(self.remote_cache_bytes)}",
             )
             with open(cache_path, "wb") as f:
                 f.write(self.remote_cache_bytes)
-        logger.info(__name__, f"Loading profile cache from: {cache_path}")
+        _LOGGER.info(f"Loading profile cache from: {cache_path}")
         self._profile_cache = ProfileCacheDB(
             TargetType(self._target_type).name, path=cache_path
         )
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index a0f93b8fe..381f2c010 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -25,6 +25,7 @@
     gather,
     permute,
     permute021,
+    permute0213,
     permute102,
     permute210,
     slice_reshape_scatter,
@@ -43,6 +44,7 @@
     "gather",
     "permute",
     "permute021",
+    "permute0213",
     "permute102",
     "permute210",
     "slice_reshape_scatter",
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate.py b/python/aitemplate/backend/cuda/tensor/concatenate.py
index 8f56b12ba..a0ef2a035 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate.py
+++ b/python/aitemplate/backend/cuda/tensor/concatenate.py
@@ -19,6 +19,22 @@
 from ... import registry
 from ...backend_spec import CUDASpec
 from ...common import concatenate_common
+from . import concatenate_fast
+
+
+def _is_valid_fast_cat(func_attrs):
+    """
+    Checks whether the call is acceptable for the concatenate
+    kernel in concatenate_fast.py
+    """
+
+    if "fast_cat" not in func_attrs:
+        return False
+    if not func_attrs["fast_cat"]:
+        return False
+    if len(func_attrs["inputs"]) == 0:
+        return False
+    return True
 
 
 @registry.reg("cuda.concatenate.func_decl")
@@ -56,12 +72,20 @@ def gen_function(func_attrs, element_func=None, element_func_def=None):
     str
         Rendered function body.
     """
-    return concatenate_common.gen_function(
-        func_attrs=func_attrs,
-        backend_spec=CUDASpec(),
-        element_func=element_func,
-        element_func_def=element_func_def,
-    )
+    if _is_valid_fast_cat(func_attrs):
+        return concatenate_fast.gen_function(
+            func_attrs,
+            concatenate_common.SRC_TEMPLATE,
+            element_func=element_func,
+            element_func_def=element_func_def,
+        )
+    else:
+        return concatenate_common.gen_function(
+            func_attrs=func_attrs,
+            backend_spec=CUDASpec(),
+            element_func=element_func,
+            element_func_def=element_func_def,
+        )
 
 
 @registry.reg("cuda.concatenate.func_call")
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
new file mode 100644
index 000000000..387d508d2
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
@@ -0,0 +1,851 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+#ifndef CONCATENATE_FAST_KERNEL
+#define CONCATENATE_FAST_KERNEL
+
+/////////////////////////////////////////////////////////////
+// some standard includes
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <stdexcept>
+
+// fast tanh for the most resent hardware
+#include <cutlass/fast_math.h>
+
+////////////////////////////////////////////////////////////
+// I'm trying to do as much C++ as possible in order to simplify
+//   the debugging without any Python + Jinja2.
+
+/*
+////////////////////////////////////////////////////////////////////////////////////////
+// Baseline C++ implementation looks the following.
+// Please note that it does not include masking or TensorAccessor-objects.
+// It's just a plain tensor concatenation code.
+////////////////////////////////////////////////////////////////////////////////////////
+
+// a crude representation of a tensor
+struct Tensor {
+    std::vector<int64_t> sizes;
+    std::vector<float> data;
+};
+
+// contains a list of tensors that need to be concatenated
+struct TestCase {
+    std::vector<Tensor> inputs;
+};
+
+// concatDim >= 0
+Tensor ConcatKernelDimN(const TestCase & tc, int64_t concatDim) {
+    // this is the output tensor
+    Tensor output;
+
+    // copy sizes from the first input tensor
+    output.sizes = tc.inputs[0].sizes;
+
+    // compute the resulting number of elements for dim=concatDim
+    int64_t nTotalElementsAtConcatDim = 0;
+    for (const auto & tensor : tc.inputs) {
+        nTotalElementsAtConcatDim += tensor.sizes[concatDim];
+    }
+
+    // save this new dimension
+    output.sizes[concatDim] = nTotalElementsAtConcatDim;
+
+    // concat all the data.
+    // the overall logic is the following: we need to perform
+    //   n operations, and on every iteration one copies the number
+    //   of elements proportional to ncopy.
+
+    int64_t n = 1;
+    for (int64_t i = 0; i < concatDim; i++) {
+        n *= output.sizes[i];
+    }
+
+    int64_t ncopy = 1;
+    for (int64_t i = concatDim + 1; i < output.sizes.size(); i++) {
+        ncopy *= output.sizes[i];
+    }
+
+    for (int64_t i = 0; i < n; i++) {
+        for (const auto & tensor : tc.inputs) {
+            // add a new chunk to the end of the output tensor data container
+            output.data.insert(
+                output.data.end(),
+                tensor.data.cbegin() +
+                    i * tensor.sizes[concatDim] * ncopy,
+                tensor.data.cbegin() +
+                    (i + 1) * tensor.sizes[concatDim] * ncopy);
+        }
+    }
+
+    // done
+    return output;
+}
+*/
+
+////////////////////////////////////////////////////////////
+// Here go the facilities that are resposible for post-processing,
+//   such as applying tanh on top of values on a concatenated tensor.
+
+// does no processing
+template <typename DataT>
+struct NoopTransform {
+  using data_type = DataT;
+  __device__ inline data_type operator()(const data_type value) {
+    return value;
+  }
+};
+
+// does tanh()
+template <typename DataT>
+struct TanhTransform {
+  using data_type = DataT;
+};
+
+template <>
+struct TanhTransform<float> {
+  using data_type = float;
+  __device__ inline float operator()(const float value) {
+    // return tanhf(value);
+    return cutlass::fast_tanh(value);
+  }
+};
+
+template <>
+struct TanhTransform<half> {
+  using data_type = half;
+  __device__ inline half operator()(const half value) {
+    // return __float2half(tanhf(__half2float(value)));
+    return cutlass::fast_tanh(value);
+  }
+};
+
+template <>
+struct TanhTransform<__nv_bfloat16> {
+  using data_type = __nv_bfloat16;
+  __device__ inline __nv_bfloat16 operator()(const __nv_bfloat16 value) {
+    return __float2bfloat16(tanhf(__bfloat162float(value)));
+  }
+};
+
+// CUDA-based hardware benefits not only from coalescing, but from
+//   reading/writing in memory-aligned chunks. This template defined
+//   the type which is used for reading/writing. For example, float2
+//   and float4 are built-in CUDA types, so compiler will assume that
+//   these types are address-aligned and issue 64-bit or 128-bit
+//   read operations instead of 32-bit one.
+
+template <int32_t AlignmentInBytes>
+struct RWChunkTrait {};
+
+template <>
+struct RWChunkTrait<2> {
+  using chunk_type = half;
+};
+template <>
+struct RWChunkTrait<4> {
+  using chunk_type = float;
+};
+template <>
+struct RWChunkTrait<8> {
+  using chunk_type = float2;
+};
+template <>
+struct RWChunkTrait<16> {
+  using chunk_type = float4;
+};
+// This one is introduced, despite current CUDA hardware
+// is capable of doing only 128 bit transfers. Benchmarks
+// showed that doing 2x float4 is faster than 1x or 4x.
+// Maybe, 3x needs to be benchmarked as well.
+template <>
+struct RWChunkTrait<32> {
+  using chunk_type = float4;
+};
+
+// This is a piece of tensor data that is going to be read or written.
+//   The purpose is to organize read/write operations is the way to
+//   maximize the number of aligned 128 bit/64 bit/32 bit reads.
+// For example, AlignedChunk<half, 32> means that:
+//   * our tensor works with element of datatype half
+//   * it is guaranteed that it is possible to read 16 contiguous elements
+//   * the address of the first element is aligned to 32 bytes
+// Thus, float4 will be deduced as an underlying data type for interacting
+//   with the global memory and the read/write operations will be
+//   performed via 128 bit reads.
+template <typename DataT, int32_t AlignmentInBytes>
+struct alignas(AlignmentInBytes) AlignedChunk {
+  // This is the type of the data of a tensor elements. Most likely, it is
+  //   float, half or bf16.
+  using data_type = DataT;
+  static constexpr int32_t NElements = AlignmentInBytes / sizeof(data_type);
+
+  // This is the type used for interacting with the global memory.
+  using chunk_type = typename RWChunkTrait<AlignmentInBytes>::chunk_type;
+  static constexpr int32_t NChunkElements =
+      AlignmentInBytes / sizeof(chunk_type);
+
+  using self_type = AlignedChunk<DataT, AlignmentInBytes>;
+
+  // the data itself
+  union {
+    // this is for accessing and applying transformations like tanh()
+    data_type data[NElements];
+    // this is for reading/writing
+    chunk_type chunks[NChunkElements];
+  } holder;
+
+  // read from the global memory
+  __device__ inline void load(const void* const src) {
+    auto srcMod = reinterpret_cast<const chunk_type*>(src);
+#pragma unroll NChunkElements
+    for (int32_t i = 0; i < NChunkElements; i++) {
+      holder.chunks[i] = srcMod[i];
+    }
+  }
+
+  // transform the elements
+  template <typename TransformT>
+  __device__ inline void transform() {
+    TransformT transform;
+
+#pragma unroll NElements
+    for (int32_t i = 0; i < NElements; i++) {
+      holder.data[i] = transform(holder.data[i]);
+    }
+  }
+
+  // write to the global memory
+  __device__ inline void store(void* const dst) const {
+    auto dstMod = reinterpret_cast<chunk_type*>(dst);
+#pragma unroll NChunkElements
+    for (int32_t i = 0; i < NChunkElements; i++) {
+      dstMod[i] = holder.chunks[i];
+    }
+  }
+
+  // This operation is needed to merge AlignedChunk items.
+  // Say, we read an input tensor as AlignedChunk<half, 8>
+  //   and we write into an output tensor as AlignedChunk<half, 32>.
+  //   So, it is possible to merge AlignedChunk<half, 8>[4] into
+  //   a single AlignedChunk<half, 32>.
+  // The compiler does nothing but just the register reassignment.
+  template <typename OtherChunkT, int32_t M>
+  __device__ inline void copyFrom(const OtherChunkT other[M]) {
+    // TODO: this function needs to perform a type conversion
+    //   if the types are different. Via if constexpr, I suppose.
+    // Say, the input tensor uses half data type, and the output tensor
+    //   uses float one.
+    static_assert(std::is_same_v<typename OtherChunkT::data_type, data_type>);
+
+    const data_type* otherAddr = (const data_type*)(other);
+#pragma unroll NElements
+    for (int32_t i = 0; i < NElements; i++) {
+      holder.data[i] = otherAddr[i];
+    }
+  }
+};
+
+// TODO: This can be improved to have less read/write operations.
+// As of now, AlignedChunk is read using the same primitive
+// type all the time. Technically, it can be reorganized to have
+// multiple underlying chunk types.
+// Say, something like AlignedChunkPlusPlus<half, 32, 8, 16, 8>
+//  that reads 8b + 16b + 8b may be used in future instead of
+//  4x AlignedChunk<half, 8> that reads 8b + 8b + 8b + 8b,
+//  if the alignment allows it.
+
+// A simple 1D fixed-size array.
+// One needs to be cautions about pointers, because nvcc compiler
+//   does not apply __restrict correctly for the array of pointers
+//   or structs.
+template <typename DataT, int32_t N>
+struct FSArray {
+  DataT data[N];
+};
+
+// clang-format off
+
+// The most general kernel that supports all the features, but the slowest one.
+// The kernel is organized in the form so that every thread writes a single
+//   ChunkOutputT value to the output tensor.
+// A single write op into the output tensor is supported with the
+//   one or multiple read ops from one of the input tensors.
+//
+// The template parameters are the following:
+// * ChunkOutputT is an aligned data type which is used for
+//   writing into the output tensor. We want this one to be as large as possible
+//   in order to minimize the number of writing ops.
+//   It is guaranteed that all the writing ops are aligned for the addresses.
+//   AlignedChunk<T, M> is used for this.
+// * ChunkInputT is an aligned data type which is used for
+//   reading from input tensors. We want this one to be as large as possible
+//   in order to minimize the number of reading ops.
+//   It is guaranteed that all the reading ops are aligned for the addresses
+//   of all input tensors.
+//   Also, sizeof(ChunkInputT) <= sizeof(ChunkOutputT)
+//   AlignedChunk<T, M> is used for this.
+// * IndexT is a pointer size type. It is either int32_t or int64_t.
+//   It is beneficial to use int32_t unless super-large tensors are used.
+// * NInputTensors is a number of input tensors.
+template <
+    typename ChunkInputT,
+    typename ChunkOutputT,
+    typename IndexT,
+    int32_t NInputTensors,
+    typename TransformT>
+__global__ void ConcatKernelGeneralized(
+    // pointers to the data of input tensors
+    const FSArray<const typename ChunkInputT::data_type*, NInputTensors>
+        inputDatas,
+    // TensorAccessor.original_total_elements_from_stride_dim values
+    //   for input tensors, Please reference tensor_accessor.cuh file.
+    const FSArray<IndexT, NInputTensors> originalTE,
+    // TensorAccessor.actual_total_elements_from_stride_dim values
+    //   for input tensors. Please reference tensor_accessor.cuh file.
+    const FSArray<IndexT, NInputTensors> actualTE,
+    // The sum of input tensor sizes for dim=concatDim.
+    //   This equals to the output tensor size for dim=concatDim.
+    const IndexT outputSizeAtConcatDimMultipliedByNCopy,
+    // The stride for output tensor for dim=concatDim.
+    //   This equals to outputSizeAtConcatDim if there were no masked inputs.
+    const IndexT strideMultipliedByNCopy,
+    // Postfix sum of tensor sizes for dim=concatDim.
+    //   All the values are were multiplied by nCopy.
+    const FSArray<IndexT, NInputTensors> concatDimPostfixSumMultipliedByNCopy,
+    // Every input tensor is expected to get written on a certain
+    //   offset of the output tensor. These are needed if masks are used,
+    //   otherwise ones may be skipped.
+    const FSArray<IndexT, NInputTensors> outputConcatDimOffsetsMultipliedByNCopy,
+    // Where to write the output to.
+    typename ChunkOutputT::data_type* const __restrict outputData,
+    // the total amount of elements to populate in the output tensor.
+    const IndexT numOutputElements) {
+  // some typedefs
+  using input_data_type = typename ChunkInputT::data_type;
+  using output_data_type = typename ChunkOutputT::data_type;
+
+  // put the input values into shared memory.
+  __shared__ IndexT shared_concatDimPostfixSumMultipliedByNCopy[NInputTensors];
+  __shared__ IndexT shared_originalTE[NInputTensors];
+  __shared__ IndexT shared_actualTE[NInputTensors];
+  __shared__ IndexT shared_outputConcatDimOffsetsMultipliedByNCopy[NInputTensors];
+  __shared__ const input_data_type* shared_inputDatas[NInputTensors];
+
+  if (threadIdx.x == 0) {
+#pragma unroll NInputTensors
+    for (int32_t i = 0; i < NInputTensors; i++) {
+      shared_concatDimPostfixSumMultipliedByNCopy[i] = concatDimPostfixSumMultipliedByNCopy.data[i];
+    }
+  } else if (threadIdx.x == 1) {
+#pragma unroll NInputTensors
+    for (int32_t i = 0; i < NInputTensors; i++) {
+      shared_originalTE[i] = originalTE.data[i];
+    }
+  } else if (threadIdx.x == 2) {
+#pragma unroll NInputTensors
+    for (int32_t i = 0; i < NInputTensors; i++) {
+      shared_actualTE[i] = actualTE.data[i];
+    }
+  } else if (threadIdx.x == 3) {
+#pragma unroll NInputTensors
+    for (int32_t i = 0; i < NInputTensors; i++) {
+      shared_outputConcatDimOffsetsMultipliedByNCopy[i] = outputConcatDimOffsetsMultipliedByNCopy.data[i];
+    }
+  } else if (threadIdx.x == 4) {
+#pragma unroll NInputTensors
+    for (int32_t i = 0; i < NInputTensors; i++) {
+      shared_inputDatas[i] = inputDatas.data[i];
+    }
+  }
+
+  __syncthreads();
+
+  // Every thread handles a single ChunkOutputT element, or
+  // ChunkOutputT::NElements of an output tensor;
+  const IndexT tid = ((IndexT)blockIdx.x * (IndexT)blockDim.x + threadIdx.x) *
+      ChunkOutputT::NElements;
+  if (tid >= numOutputElements) {
+    return;
+  }
+
+  // calculate the location of the output tensor a current thread is writing to:
+  //   outputRowIdx is the row
+  //   outputColumnIdx is the column
+
+  const IndexT outputRowIdx = tid / (outputSizeAtConcatDimMultipliedByNCopy);
+
+  // Find the input tensor to use
+  const IndexT offset = tid % (outputSizeAtConcatDimMultipliedByNCopy);
+  int32_t inputTensorIdx = 0;
+#pragma unroll NInputTensors
+  for (int32_t i = 1; i < NInputTensors; i++) {
+    inputTensorIdx = (offset < shared_concatDimPostfixSumMultipliedByNCopy[i - 1]) ? inputTensorIdx : i;
+  }
+
+  const IndexT subtract = (inputTensorIdx == 0) ? 0 : shared_concatDimPostfixSumMultipliedByNCopy[inputTensorIdx - 1];
+  const IndexT outputColumnIdx = offset - subtract;
+
+  // Load the TensorAccessor.original_total_elements_from_stride_dim and
+  //   TensorAccessor.actual_total_elements_from_stride_dim values
+  //   for the current tensor.
+
+  IndexT originalTEValue = shared_originalTE[inputTensorIdx];
+  IndexT actualTEValue = shared_actualTE[inputTensorIdx];
+
+  // Calculate the contiguous access index of the current input tensor
+  IndexT readPositionContiguous =
+        (inputTensorIdx == 0) ?
+        shared_concatDimPostfixSumMultipliedByNCopy[0] :
+        (shared_concatDimPostfixSumMultipliedByNCopy[inputTensorIdx] - shared_concatDimPostfixSumMultipliedByNCopy[inputTensorIdx - 1]);
+  readPositionContiguous = outputRowIdx * readPositionContiguous + outputColumnIdx;
+
+  // Get the pointer to data of the input tensor
+  const input_data_type* __restrict inputData = shared_inputDatas[inputTensorIdx];
+
+  // Ok, what's the number of read operations from an input tensor
+  //   needed for a single write operation for the output tensor?
+  constexpr int32_t N_READ_OPS = ChunkOutputT::NElements / ChunkInputT::NElements;
+
+  // Allocate a temporary buffer and perform all these read ops
+  ChunkInputT inputValues[N_READ_OPS];
+
+  if (actualTEValue != originalTEValue) {
+#pragma unroll N_READ_OPS
+    for (int32_t i = 0; i < N_READ_OPS; i++) {
+      // do remapping according to a TensorAccessor logic
+      // the remapping is expensive.
+      const IndexT iInputRow = (readPositionContiguous + i * (IndexT)ChunkInputT::NElements) / originalTEValue;
+      const IndexT iInputPos = (readPositionContiguous + i * (IndexT)ChunkInputT::NElements) % originalTEValue;
+      const IndexT readPosition = iInputRow * actualTEValue + iInputPos;
+
+      // each read op reads ChunkInputT::NElements elements from an input tensor
+      inputValues[i].load(inputData + readPosition);
+    }
+  }
+  else {
+#pragma unroll N_READ_OPS
+    for (int32_t i = 0; i < N_READ_OPS; i++) {
+      // each read op reads ChunkInputT::NElements elements from an input tensor
+      inputValues[i].load(inputData + readPositionContiguous + i * (IndexT)ChunkInputT::NElements);
+    }
+  }
+
+  // combine all the input data
+  ChunkOutputT outputChunk;
+  outputChunk.template copyFrom<ChunkInputT, N_READ_OPS>(inputValues);
+
+  // transform
+  outputChunk.template transform<TransformT>();
+
+  // Find a destination offset for the output tensor
+  IndexT outputOffsetMultipliedByNCopy = shared_outputConcatDimOffsetsMultipliedByNCopy[inputTensorIdx];
+  ChunkOutputT* const __restrict outputAddr = reinterpret_cast<ChunkOutputT*>(outputData);
+
+  // perform a write operation
+  const IndexT outputWritePosition = outputRowIdx * strideMultipliedByNCopy + outputColumnIdx;
+
+  const IndexT op = outputOffsetMultipliedByNCopy + outputWritePosition;
+  outputChunk.store(outputAddr + op / ChunkOutputT::NElements);
+}
+
+// utility functions
+size_t getAlignment(const void* const inputData) {
+    uintptr_t ptr = (uintptr_t)(inputData);
+    if ((ptr % 32) == 0) { return 32; }
+    if ((ptr % 16) == 0) { return 16; }
+    if ((ptr % 8) == 0) { return 8; }
+    if ((ptr % 4) == 0) { return 4; }
+    if ((ptr % 2) == 0) { return 2; }
+
+    return 1;
+}
+
+size_t getAlignment(const size_t n) {
+    if ((n % 32) == 0) { return 32; }
+    if ((n % 16) == 0) { return 16; }
+    if ((n % 8) == 0) { return 8; }
+    if ((n % 4) == 0) { return 4; }
+    if ((n % 2) == 0) { return 2; }
+
+    return 1;
+}
+
+// clang-format on
+
+//
+template <
+    typename ChunkInputT,
+    typename ChunkOutputT,
+    typename IndexT,
+    int32_t NInputTensors,
+    size_t NRank,
+    typename TransformT>
+void concatenateFastLauncher(
+    const int64_t* inputDim[],
+    const void* const inputData[NInputTensors],
+    const int64_t inputConcatDimOffsets[],
+    const int64_t originalTE[],
+    const int64_t actualTE[],
+    const int64_t outputDim[NRank],
+    const int64_t outputConcatDimOffsets[],
+    void* const outputData,
+    const size_t concatDim,
+    char* func_name,
+    cudaStream_t stream) {
+  // some typedefs
+  using input_data_type = typename ChunkInputT::data_type;
+  using output_data_type = typename ChunkOutputT::data_type;
+
+  // assign input tensors
+  FSArray<const input_data_type*, NInputTensors> inputDataFS;
+  for (size_t iTensor = 0; iTensor < NInputTensors; iTensor++) {
+    inputDataFS.data[iTensor] =
+        reinterpret_cast<const input_data_type*>(inputData[iTensor]);
+  }
+
+  // compute ncopy
+  int64_t ncopy = 1;
+  for (size_t i = concatDim + 1; i < NRank; i++) {
+    ncopy *= outputDim[i];
+  }
+
+  // copy
+  FSArray<IndexT, NInputTensors> inputConcatDimOffsetsFS;
+  for (size_t j = 0; j < NInputTensors; j++) {
+    inputConcatDimOffsetsFS.data[j] = inputConcatDimOffsets[j];
+  }
+
+  FSArray<IndexT, NInputTensors> originalTEFS;
+  for (size_t j = 0; j < NInputTensors; j++) {
+    originalTEFS.data[j] = originalTE[j];
+  }
+
+  FSArray<IndexT, NInputTensors> actualTEFS;
+  for (size_t j = 0; j < NInputTensors; j++) {
+    actualTEFS.data[j] = actualTE[j];
+  }
+
+  FSArray<IndexT, NInputTensors> outputConcatDimOffsetsMultipliedByNCopyFS;
+  for (size_t j = 0; j < NInputTensors; j++) {
+    outputConcatDimOffsetsMultipliedByNCopyFS.data[j] =
+        outputConcatDimOffsets[j] * ncopy;
+  }
+
+  // compute postfix sum.
+  FSArray<IndexT, NInputTensors> concatDimPostfixSumMultipliedByNCopy;
+  {
+    int64_t current = 0;
+    for (size_t j = 0; j < NInputTensors; j++) {
+      auto dim = inputDim[j][concatDim];
+      current += dim;
+      concatDimPostfixSumMultipliedByNCopy.data[j] = (IndexT)(current * ncopy);
+    }
+  }
+
+  // this is the number of elements that needs to be filled on
+  //   dim=concatDim. Basically, it is the sum of available elements
+  //   on dim=concatDim for all of the inputs.
+  // also, this is the number of ncopy-sized chunks that needs to be processed
+  //   per single row of an output tensor. So, every row processes
+  //   nElementsAtConcatDim * ncopy elements.
+  int64_t nElementsAtConcatDim = 0;
+  for (size_t j = 0; j < NInputTensors; j++) {
+    auto dim = inputDim[j][concatDim];
+    nElementsAtConcatDim += dim;
+  }
+
+  // the total number of output elements that needs to be processed
+  int64_t numOutputElements = 1;
+  // the number of rows...
+  for (int32_t iRank = 0; iRank < concatDim; iRank++) {
+    numOutputElements *= outputDim[iRank];
+  }
+
+  // ... multiplied by the number of elements per row
+  numOutputElements *= nElementsAtConcatDim;
+  numOutputElements *= ncopy;
+
+  if (numOutputElements == 0) {
+    // nothing to do
+    return;
+  }
+
+  // this is the stride for dim=concatDim. Basically, the amount of
+  //   memory allocated for a single output tensor row.
+  // stride != nElementsAtConcatDim if some inputs were originally masked out.
+  int64_t stride = outputDim[concatDim];
+
+  // run the CUDA kernel
+  const int32_t nThreadsPerBlock = 128;
+  const int64_t effNumOutputElements =
+      numOutputElements / ChunkOutputT::NElements;
+  const int32_t nBlocks =
+      (effNumOutputElements + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+  // // tell some debug information
+  // printf(
+  //     "I am %s v2 with %ld elements, %d inputs, %zd ChunkInputT, "
+  //     "%zd ChunkOutputT, "
+  //     "%zd InputDataT, %zd OutputDataT\n",
+  //     func_name,
+  //     effNumOutputElements,
+  //     (int32_t)NInputTensors,
+  //     sizeof(ChunkInputT),
+  //     sizeof(ChunkOutputT),
+  //     sizeof(input_data_type),
+  //     sizeof(output_data_type));
+
+  ConcatKernelGeneralized<
+      ChunkInputT,
+      ChunkOutputT,
+      IndexT,
+      NInputTensors,
+      TransformT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+      inputDataFS,
+      originalTEFS,
+      actualTEFS,
+      nElementsAtConcatDim * ncopy,
+      stride * ncopy,
+      concatDimPostfixSumMultipliedByNCopy,
+      outputConcatDimOffsetsMultipliedByNCopyFS,
+      reinterpret_cast<output_data_type*>(outputData),
+      numOutputElements);
+}
+
+template <
+    typename InputDataT,
+    typename OutputDataT,
+    size_t NInputTensors,
+    size_t NRank,
+    typename TransformT>
+void invoke_concatenate_fast(
+    const int64_t* inputDim[],
+    const void* const inputData[NInputTensors],
+    const TensorAccessor* inputTensorAccessors[NInputTensors],
+    const int64_t outputDim[NRank],
+    const int64_t outputConcatDimOffsets[],
+    void* const outputData,
+    const size_t concatDim,
+    char* func_name,
+    cudaStream_t stream) {
+  // check the input parameters
+  if (NInputTensors == 0 || NRank == 0) {
+    return;
+  }
+  if (outputData == nullptr) {
+    throw std::runtime_error("output is nullptr!");
+  }
+
+  // every thread in a kernel may copy up to ncopy elements
+  //   in a single copy operation
+  int64_t ncopy = 1;
+  for (size_t i = concatDim + 1; i < NRank; i++) {
+    ncopy *= outputDim[i];
+  }
+
+  // Compute the alignment of our output dataset
+  // the alignment of the base address
+  size_t alignmentOutput = getAlignment(outputData);
+
+  // The alignment for the amount of copied data
+  alignmentOutput = std::min(
+      alignmentOutput,
+      getAlignment(outputDim[concatDim] * ncopy * sizeof(OutputDataT)));
+
+  // Input tensor i will be copied to a location that starts from
+  //   a column outputConcatDimOffsets[i]. Compute its alignment
+  for (size_t i = 0; i < NInputTensors; i++) {
+    alignmentOutput = std::min(
+        alignmentOutput,
+        getAlignment(outputConcatDimOffsets[i] * ncopy * sizeof(OutputDataT)));
+  }
+
+  //
+  const void* inputDataWithOffsets[NInputTensors];
+
+  int64_t originalTE[NInputTensors];
+  int64_t actualTE[NInputTensors];
+
+  int64_t inputConcatDimOffsets[NInputTensors];
+
+  //
+  size_t alignmentInputs = 65536;
+  for (size_t i = 0; i < NInputTensors; i++) {
+    alignmentInputs = std::min(
+        alignmentInputs,
+        getAlignment(inputDim[i][concatDim] * ncopy * sizeof(InputDataT)));
+    alignmentOutput = std::min(
+        alignmentOutput,
+        getAlignment(inputDim[i][concatDim] * ncopy * sizeof(OutputDataT)));
+  }
+
+  for (size_t j = 0; j < NInputTensors; j++) {
+    const auto* tensorAccessor = inputTensorAccessors[j];
+
+    // recompute the inputData with respect to offset
+    inputDataWithOffsets[j] =
+        ((InputDataT*)inputData[j]) + tensorAccessor->offset;
+
+    // alter its alignment
+    alignmentInputs =
+        std::min(alignmentInputs, getAlignment(inputDataWithOffsets[j]));
+
+    // is input tensor implies a contiguous access?
+    if (tensorAccessor->is_contiguous) {
+      // yes
+      inputConcatDimOffsets[j] = inputDim[j][concatDim];
+
+      originalTE[j] = inputDim[j][concatDim];
+      actualTE[j] = inputDim[j][concatDim];
+    } else {
+      // no
+      if (tensorAccessor->stride_dim == -1) {
+        throw std::runtime_error(
+            "Unsupported negative tensorAccessor stride_dim value!");
+      } else {
+        inputConcatDimOffsets[j] =
+            tensorAccessor->actual_total_elements_from_stride_dim;
+        originalTE[j] = tensorAccessor->original_total_elements_from_stride_dim;
+        actualTE[j] = tensorAccessor->actual_total_elements_from_stride_dim;
+
+        // ncopy?
+        alignmentInputs = std::min(
+            alignmentInputs, getAlignment(originalTE[j] * sizeof(InputDataT)));
+        alignmentInputs = std::min(
+            alignmentInputs, getAlignment(actualTE[j] * sizeof(InputDataT)));
+      }
+    }
+  }
+
+  if (alignmentOutput < alignmentInputs) {
+    // // TODO: this is a possible optimization, bcz the current kernel
+    // supports N reads ops per 1 write op, but not 1 read op per N write ops.
+    // printf(
+    //     "SHRINK, AlignmentInputs = %zd, AlignmentOutput = %zd\n",
+    //     (size_t)alignmentInputs,
+    //     (size_t)alignmentOutput);
+    alignmentInputs = alignmentOutput;
+  }
+
+  if (alignmentInputs == 1) {
+    // unsupported yet. todo
+    throw std::runtime_error("Unsupported input tensors alignment!");
+  }
+  if (alignmentOutput == 1) {
+    // unsupported yet. todo
+    throw std::runtime_error("Unsupported output tensor alignment!");
+  }
+
+#define LAUNCHER(ALIGNMENT_INPUT, ALIGNMENT_OUTPUT, INDEX_T)            \
+  if (alignmentOutput == ALIGNMENT_OUTPUT &&                            \
+      alignmentInputs == ALIGNMENT_INPUT) {                             \
+    if constexpr (                                                      \
+        sizeof(InputDataT) <= ALIGNMENT_INPUT &&                        \
+        sizeof(OutputDataT) <= ALIGNMENT_OUTPUT) {                      \
+      using InputChunkT = AlignedChunk<InputDataT, ALIGNMENT_INPUT>;    \
+      using OutputChunkT = AlignedChunk<OutputDataT, ALIGNMENT_OUTPUT>; \
+      concatenateFastLauncher<                                          \
+          InputChunkT,                                                  \
+          OutputChunkT,                                                 \
+          INDEX_T,                                                      \
+          NInputTensors,                                                \
+          NRank,                                                        \
+          TransformT>(                                                  \
+          inputDim,                                                     \
+          inputDataWithOffsets,                                         \
+          inputConcatDimOffsets,                                        \
+          originalTE,                                                   \
+          actualTE,                                                     \
+          outputDim,                                                    \
+          outputConcatDimOffsets,                                       \
+          outputData,                                                   \
+          concatDim,                                                    \
+          func_name,                                                    \
+          stream);                                                      \
+      return;                                                           \
+    }                                                                   \
+  }
+
+  // compute the limit of the number of elements in output tensor
+  int64_t numOutputElements = 1;
+  for (size_t iRank = 0; iRank < NRank; iRank++) {
+    numOutputElements *= outputDim[iRank];
+  }
+
+  if (numOutputElements == 0) {
+    // no elements to process
+    return;
+  }
+
+  // TODO: rework the following if condition.
+  // 1. This value is a constexpr value, because all the
+  // input & output tensor sizes are known to a template generator.
+  // This improvement should reduce the compilation speed 2x.
+  // 2. Strided tensors might need a special handling.
+  if (!can_use_32bit_index_math(numOutputElements)) {
+    using index_type = int64_t;
+
+    LAUNCHER(32, 32, index_type);
+    LAUNCHER(16, 32, index_type);
+    LAUNCHER(8, 32, index_type);
+    LAUNCHER(4, 32, index_type);
+    LAUNCHER(2, 32, index_type);
+
+    LAUNCHER(16, 16, index_type);
+    LAUNCHER(8, 16, index_type);
+    LAUNCHER(4, 16, index_type);
+    LAUNCHER(2, 16, index_type);
+
+    LAUNCHER(8, 8, index_type);
+    LAUNCHER(4, 8, index_type);
+    LAUNCHER(2, 8, index_type);
+
+    LAUNCHER(4, 4, index_type);
+    LAUNCHER(2, 4, index_type);
+
+    LAUNCHER(2, 2, index_type);
+  } else {
+    using index_type = int32_t;
+
+    LAUNCHER(32, 32, index_type);
+    LAUNCHER(16, 32, index_type);
+    LAUNCHER(8, 32, index_type);
+    LAUNCHER(4, 32, index_type);
+    LAUNCHER(2, 32, index_type);
+
+    LAUNCHER(16, 16, index_type);
+    LAUNCHER(8, 16, index_type);
+    LAUNCHER(4, 16, index_type);
+    LAUNCHER(2, 16, index_type);
+
+    LAUNCHER(8, 8, index_type);
+    LAUNCHER(4, 8, index_type);
+    LAUNCHER(2, 8, index_type);
+
+    LAUNCHER(4, 4, index_type);
+    LAUNCHER(2, 4, index_type);
+
+    LAUNCHER(2, 2, index_type);
+  }
+
+  // no launcher was found
+  throw std::runtime_error("Unsupported concat kernel specialization!");
+}
+
+#endif
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_fast.py b/python/aitemplate/backend/cuda/tensor/concatenate_fast.py
new file mode 100644
index 000000000..ee74d4509
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_fast.py
@@ -0,0 +1,195 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+
+import jinja2
+
+from ....compiler.ops.tensor import concatenate
+
+from ...backend_spec import CUDASpec
+
+from ...common import tensor_accessor_codegen
+from ...target import Target
+
+
+KERNEL_SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "logging.h"
+
+{{header_src}}
+
+{% if element_func_def %}
+{{element_func_def}}
+{% endif %}
+
+namespace {
+
+{{tensor_accessor_libs}}
+
+// TODO: support strided tensor with TensorAccessor
+// For strided tensor, the index can be much larger than original if the stride is large
+bool can_use_32bit_index_math(const int64_t elements, int64_t max_elem=std::numeric_limits<int32_t>::max()) {
+  if (elements >= max_elem) {
+    return false;
+  }
+  if (elements == 0) {
+    return max_elem > 0;
+  }
+
+  return true;
+}
+
+__host__ __device__ __forceinline__
+int64_t get_num_elems(const {{index_type}} *shape, {{index_type}} rank) {
+  int64_t num = 1;
+  for ({{index_type}} i = 0; i < rank; i++) {
+    num *= shape[i];
+  }
+  return num;
+}
+
+{{custom_libs}}
+
+}  // namespace
+
+"""
+)
+
+
+EXEC_COND_TEMPLATE = jinja2.Template(
+    """
+
+{{input_accessor_defs}}
+
+{{indent}}{{index_type}} local_output_shape[] = {
+{% for idx in range(rank - 1) %}
+{{indent}}  *(output_shape[{{idx}}]),
+{% endfor %}
+{{indent}}  *(output_shape[{{rank - 1}}])
+{{indent}}};
+{{indent}}
+{{indent}}{% if element_func == "fast_tanh" %}
+{{indent}}using transform_type = TanhTransform<{{elem_type}}>;
+{{indent}}{% else %}
+{{indent}}using transform_type = NoopTransform<{{elem_type}}>;
+{{indent}}{% endif %}
+{{indent}}
+{{indent}}invoke_concatenate_fast<{{elem_type}}, {{elem_type}}, {{num_all_inputs}}, {{rank}}, transform_type>(
+{{indent}}    real_input_shapes,
+{{indent}}    inputs,
+{{indent}}    input_accessors,
+{{indent}}    local_output_shape,
+{{indent}}    concat_dim_offsets.data(),
+{{indent}}    output,
+{{indent}}    concat_dim,
+{{indent}}    "{{func_name}}",
+{{indent}}    stream);
+{{indent}}return;
+"""
+)
+
+INPUT_ACCESSOR_DEFS_TEMPLATE = jinja2.Template(
+    """
+{{input_accessors}}
+
+{{indent}}const TensorAccessor *input_accessors[{{num_real_inputs}}] = {
+
+{{indent}}  {{input_accessor_refs}}
+
+{{indent}}};
+"""
+)
+
+
+def gen_function(
+    func_attrs,
+    src_template,
+    element_func=None,
+    element_func_def=None,
+):
+    backend_spec = CUDASpec()
+
+    inputs = func_attrs["inputs"]
+    original_inputs = func_attrs["original_inputs"]
+    concatenate.check_rank(original_inputs, func_attrs["concat_dim"])
+    orig_x = original_inputs[0]
+    y = func_attrs["outputs"][0]
+    x_shape = orig_x._attrs["shape"]
+
+    input_type = backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
+
+    # TODO: support type cast
+    if input_type != output_type:
+        raise NotImplementedError("input type must equal to output type")
+
+    concat_dim = func_attrs["concat_dim"]
+    assert concat_dim < len(x_shape)
+
+    input_accessors = []
+    input_accessor_refs = []
+    for i in range(len(inputs)):
+        accessor_name = f"input_accessor{i}"
+        input_accessor_refs.append(f"&{accessor_name}")
+        input_accessors.append(
+            tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render(
+                name=accessor_name, tensor_accessor=func_attrs["input_accessors"][i]
+            )
+        )
+    input_accessor_defs = INPUT_ACCESSOR_DEFS_TEMPLATE.render(
+        indent="    ",
+        input_accessors="".join(input_accessors),
+        num_real_inputs=len(inputs),
+        input_accessor_refs=", ".join(input_accessor_refs),
+    )
+
+    # load the file from the drive
+    custom_libs = Target.current().get_custom_libs(
+        os.path.dirname(__file__), "concatenate_fast.cuh"
+    )
+
+    header_src = backend_spec.header_src_template.render()
+    tensor_accessor_libs = tensor_accessor_codegen.get_libs()
+    kernel_src = KERNEL_SRC_TEMPLATE.render(
+        custom_libs=custom_libs,
+        element_func=element_func,
+        element_func_def=element_func_def,
+        header_src=header_src,
+        index_type=backend_spec.index_type,
+        tensor_accessor_libs=tensor_accessor_libs,
+    )
+    exec_paths = EXEC_COND_TEMPLATE.render(
+        indent="  ",
+        rank=len(x_shape),
+        num_all_inputs=len(inputs),
+        elem_type=input_type,
+        element_func=element_func,
+        element_func_def=element_func_def,
+        index_type=backend_spec.index_type,
+        input_accessor_defs=input_accessor_defs,
+        func_name=func_attrs["name"],
+    )
+
+    return src_template.render(
+        kernel_src=kernel_src,
+        func_name=func_attrs["name"],
+        exec_paths=exec_paths,
+        index_type=backend_spec.index_type,
+        prefix=backend_spec.prefix,
+    )
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py b/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
index efb2ea440..c03d6d250 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
@@ -18,12 +18,15 @@
 import jinja2
 
 from ... import registry
+from ...backend_spec import CUDASpec
 from . import concatenate
 
+
 TANH_DEF = jinja2.Template(
     """
 #include <cutlass/fast_math.h>
 
+{% if dtype == "half" %}
 #ifndef __HALF2_TO_UI
 #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
 #endif
@@ -81,7 +84,27 @@
     y_vec[3] = fast_tanh(x_vec[3]);
     return y;
 }
+{% elif dtype == "float" %}
+__device__  float fast_tanh(float x) {
+    return cutlass::fast_tanh(x);
+}
+
+__device__  float2 fast_tanh(float2 x) {
+    float2 y;
+    y.x = cutlass::fast_tanh(x.x);
+    y.y = cutlass::fast_tanh(x.y);
+    return y;
+}
 
+__device__  float4 fast_tanh(float4 x) {
+    float4 y;
+    y.x = cutlass::fast_tanh(x.x);
+    y.y = cutlass::fast_tanh(x.y);
+    y.z = cutlass::fast_tanh(x.z);
+    y.w = cutlass::fast_tanh(x.w);
+    return y;
+}
+{% endif %}
 """
 )
 
@@ -93,8 +116,17 @@ def gen_function_decl(func_attrs):
 
 @registry.reg("cuda.concatenate_tanh.gen_function")
 def gen_function(func_attrs):
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"],
+    )
+
     return concatenate.gen_function(
-        func_attrs, element_func="fast_tanh", element_func_def=TANH_DEF.render()
+        func_attrs,
+        element_func="fast_tanh",
+        element_func_def=TANH_DEF.render(
+            dtype=dtype,
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/cuda/tensor/gather.py b/python/aitemplate/backend/cuda/tensor/gather.py
index 22fdaf8d0..f8ecf17a9 100644
--- a/python/aitemplate/backend/cuda/tensor/gather.py
+++ b/python/aitemplate/backend/cuda/tensor/gather.py
@@ -181,6 +181,7 @@
       (num_output_elems / (ThreadsPerBlock * ElemsPerThread)) + m;
   int grid_config = num_blocks_x;
 
+{% if elem_type == "half" %}
   if (num_output_elems % 2 == 0) {
     gather_kernel<float, int4, ELEM_T, Rank, ElemsPerThread>
     <<<grid_config, ThreadsPerBlock, 0, stream>>>(
@@ -202,6 +203,17 @@
         num_output_elems);
     CUDA_LAUNCH_CHECK_GATHER();
   }
+{% elif elem_type == "float" %}
+  gather_kernel<float, INDEX_TYPE, ELEM_T, Rank, ElemsPerThread>
+  <<<grid_config, ThreadsPerBlock, 0, stream>>>(
+      output,
+      input,
+      indices,
+      input_meta,
+      gather_dim,
+      num_output_elems);
+  CUDA_LAUNCH_CHECK_GATHER();
+{% endif %}
 }
 
 #undef CUDA_CHECK_ERROR_GATHER
@@ -349,8 +361,10 @@ def gen_function(func_attrs):
         elems_per_thread=2,
         threads_per_block=128,
     )
-
-    kernel_src = KERNEL_SRC_TEMPLATE.render(index_type=index_type)
+    kernel_src = KERNEL_SRC_TEMPLATE.render(
+        index_type=index_type,
+        elem_type=input_type,
+    )
     return SRC_TEMPLATE.render(
         kernel_src=kernel_src,
         func_name=func_attrs["name"],
diff --git a/python/aitemplate/backend/cuda/tensor/permute.cuh b/python/aitemplate/backend/cuda/tensor/permute.cuh
index b9ced62b7..759f601fa 100644
--- a/python/aitemplate/backend/cuda/tensor/permute.cuh
+++ b/python/aitemplate/backend/cuda/tensor/permute.cuh
@@ -315,10 +315,11 @@ void DispatchIndexType(
     const void* src,
     const int* permutation,
     void* dst,
+    size_t elem_size,
     cudaStream_t stream) {
   // Vector read/write.
   // This fixed a bug in the original oneflow code.
-  src_dims[num_dims - 1] = src_dims[num_dims - 1] * 2 / movement_size;
+  src_dims[num_dims - 1] = src_dims[num_dims - 1] * elem_size / movement_size;
 
   size_t count = 1;
   for (size_t i = 0; i < num_dims; ++i) {
@@ -340,12 +341,14 @@ void DispatchMovementSize(
     const void* src,
     const int* permutation,
     void* dst,
+    size_t elem_size,
     cudaStream_t stream) {
   void (*func)(
       int64_t* /*src_dims*/,
       const void* /*src*/,
       const int* /*permutation*/,
       void* /*dst*/,
+      size_t /*elem_size*/,
       cudaStream_t /*stream*/) = nullptr;
   if (movement_size == 1) {
     func = DispatchIndexType<num_dims, 1>;
@@ -360,10 +363,10 @@ void DispatchMovementSize(
   } else {
     throw std::runtime_error("unsupported movement_size for permute");
   }
-  func(src_dims, src, permutation, dst, stream);
+  func(src_dims, src, permutation, dst, elem_size, stream);
 }
 
-template <size_t num_dims, size_t elem_size>
+template <size_t num_dims, typename ElemType>
 void invokePermute(
     void* dst,
     const void* src,
@@ -377,10 +380,10 @@ void invokePermute(
     throw std::runtime_error("src is NULL!");
   }
 
-  // 2 bytes/half * 8 halves
+  constexpr size_t elem_size = sizeof(ElemType);
   constexpr size_t kMaxMovementSize = 16;
   const size_t movement_size = GetMovementSize<kMaxMovementSize>(
       elem_size, num_dims, src_dims, src, permutation, dst);
   DispatchMovementSize<num_dims>(
-      movement_size, src_dims, src, permutation, dst, stream);
+      movement_size, src_dims, src, permutation, dst, elem_size, stream);
 }
diff --git a/python/aitemplate/backend/cuda/tensor/permute.py b/python/aitemplate/backend/cuda/tensor/permute.py
index d22041264..6c7746f5b 100644
--- a/python/aitemplate/backend/cuda/tensor/permute.py
+++ b/python/aitemplate/backend/cuda/tensor/permute.py
@@ -20,6 +20,8 @@
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
+
 from ... import registry
 from ...target import Target
 
@@ -86,8 +88,7 @@
 {% endfor %}
   *dim_{{input_rank - 1}}
     };
-
-    invokePermute<{{input_rank}}, {{elem_size}}>(dst, src, src_dims, permutation, stream);
+    invokePermute<{{input_rank}}, {{elem_type}}>(dst, src, src_dims, permutation, stream);
 }
 
   """
@@ -116,13 +117,17 @@ def gen_function(func_attrs: Dict[str, Any]) -> str:
         os.path.dirname(__file__), "permute.cuh"
     )
     dtype = x.dtype()
-    assert dtype == "float16", "permute kernel only supports fp16"
-    elem_size = 2
+    assert dtype in (
+        "float16",
+        "float32",
+        "float",
+    ), "permute is only tested for floating point type"
+    backend_type = CUDASpec().dtype_to_backend_dtype[dtype]
     return SRC_TEMPLATE.render(
         func_name=func_name,
         custom_libs=custom_libs,
         input_rank=rank,
-        elem_size=elem_size,
+        elem_type=backend_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute021.py b/python/aitemplate/backend/cuda/tensor/permute021.py
index c51a7ace1..95015cf14 100644
--- a/python/aitemplate/backend/cuda/tensor/permute021.py
+++ b/python/aitemplate/backend/cuda/tensor/permute021.py
@@ -31,7 +31,10 @@
 
 
 @registry.reg("cuda.permute021.gen_function")
-def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+def gen_function(
+    func_attrs,
+    template_path,
+):
     """
     Parameters
     ----------
@@ -39,8 +42,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
 
     Returns
     -------
@@ -50,8 +51,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
     return permute021_common.gen_function(
         func_attrs,
         template_path,
-        shape_eval_template,
-        shape_save_template,
         Header_files,
         CUDASpec(),
     )
@@ -74,7 +73,10 @@ def gen_function_decl(func_attrs):
 
 
 @registry.reg("cuda.permute021.func_call")
-def gen_function_call(func_attrs, indent="  "):
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+):
     """
     Parameters
     ----------
diff --git a/python/aitemplate/backend/cuda/tensor/permute0213.py b/python/aitemplate/backend/cuda/tensor/permute0213.py
new file mode 100644
index 000000000..b277eff87
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/permute0213.py
@@ -0,0 +1,87 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+permute0213 for cuda
+"""
+
+from ... import registry
+from ...backend_spec import CUDASpec
+from ...common.tensor import permute0213_common
+
+# pylint: disable=C0301,W0613,W0612
+
+Header_files = """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/util/host_tensor.h"
+"""
+
+
+@registry.reg("cuda.permute0213.gen_function")
+def gen_function(func_attrs, template_path):
+    """
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Attributes from Operator
+    template_path : str
+        path to library used
+
+    Returns
+    -------
+    str
+        Source code for function generated.
+    """
+    return permute0213_common.gen_function(
+        func_attrs,
+        template_path,
+        Header_files,
+        CUDASpec(),
+    )
+
+
+@registry.reg("cuda.permute0213.func_decl")
+def gen_function_decl(func_attrs):
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+
+    Returns
+    -------
+    str
+        Function declaration
+    """
+    return permute0213_common.gen_function_decl(func_attrs, CUDASpec())
+
+
+@registry.reg("cuda.permute0213.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    indent : str, optional
+        Indentation for function call template, by default "  "
+
+    Returns
+    -------
+    str
+        Driver code for invoking call
+    """
+    return permute0213_common.gen_function_call(func_attrs, CUDASpec(), indent)
diff --git a/python/aitemplate/backend/cuda/tensor/permute102.py b/python/aitemplate/backend/cuda/tensor/permute102.py
index 715623e54..ddee78a74 100644
--- a/python/aitemplate/backend/cuda/tensor/permute102.py
+++ b/python/aitemplate/backend/cuda/tensor/permute102.py
@@ -31,7 +31,7 @@
 
 
 @registry.reg("cuda.permute102.gen_function")
-def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+def gen_function(func_attrs, template_path):
     """
     Parameters
     ----------
@@ -39,8 +39,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
 
     Returns
     -------
@@ -50,8 +48,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
     return permute102_common.gen_function(
         func_attrs,
         template_path,
-        shape_eval_template,
-        shape_save_template,
         Header_files,
         CUDASpec(),
     )
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d.py b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
index b0acd9c61..795f857f2 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
@@ -42,7 +42,7 @@ def gen_function(
     exec_path = func_attrs["exec_path"]
     x = func_attrs["inputs"][0]
     backend_spec = CUDASpec()
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -77,6 +77,7 @@ def gen_function(
         half2_data_ref=half2_data_ref,
         mode=func_attrs["mode"],
         tsize=upsampling2d_common.gen_alignment(x),
+        dtype=input_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
index f369a3ed2..8015ed78d 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
@@ -42,7 +42,7 @@ def gen_function(
     exec_path = func_attrs["exec_path"]
     x = func_attrs["inputs"][0]
     backend_spec = CUDASpec()
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -80,6 +80,7 @@ def gen_function(
         mode=func_attrs["mode"],
         bias_add=True,
         tsize=upsampling2d_common.gen_alignment(x),
+        dtype=input_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/utils.py b/python/aitemplate/backend/cuda/utils.py
index ada46c404..14f8fa3c1 100644
--- a/python/aitemplate/backend/cuda/utils.py
+++ b/python/aitemplate/backend/cuda/utils.py
@@ -15,14 +15,18 @@
 """
 Util functions for CUDA codegen.
 """
+import logging
+
 from aitemplate.utils.mk_cutlass_lib.mk_cutlass_lib import mk_cutlass_lib
 
-from ...utils import logger
 from .. import registry
 
 # pylint: disable=C0103,C0415,W0707
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class Args(object):
     def __init__(self, arch):
         self.operations = "all"
@@ -37,6 +41,7 @@ def __init__(self, arch):
         self.selected_kernel_list = None
         self.interface_dir = None
         self.filter_by_cc = True
+        self.disable_full_archs_compilation = False
 
 
 registry.reg("cuda.make_cutlass_lib")(mk_cutlass_lib)
@@ -59,5 +64,5 @@ def gen_ops(arch):
         func = getattr(cutlass_lib.extra_operation, "GenerateSM" + arch)
         func(manifest, args)
     except AttributeError:
-        logger.warning(__file__, "Arch " + arch + " is not supported by extra ops.")
+        _LOGGER.warning("Arch " + arch + " is not supported by extra ops.")
     return manifest.operations
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
index d94d484ed..89e608d5d 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
@@ -45,8 +45,8 @@ def gen_function(
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
     backend_spec = CUDASpec()
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
 
     exec_paths = ""
     for key, _ in exec_path.items():
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
index 1eb9dedd2..1597e848c 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
@@ -46,7 +46,7 @@ def gen_function(
 
     x = func_attrs["inputs"][0]
     backend_spec = CUDASpec()
-    library_dtype = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
+    dtype = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
     half2_data_ref = backend_spec.half2_data_ref
 
     shape_eval_func = shape_eval_template.render(
@@ -78,7 +78,7 @@ def gen_function(
             spatial_scale=func_attrs["spatial_scale"],
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
-            library_dtype=library_dtype,
+            dtype=dtype,
         )
         exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
@@ -90,6 +90,7 @@ def gen_function(
         header_files=EXTRA_HEADER.render(),
         index_type=backend_spec.index_type,
         half2_data_ref=half2_data_ref,
+        dtype=dtype,
     )
 
 
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 88af849a7..3aa881631 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -28,22 +28,18 @@
 #include "device_functions-generated.h"
 #include "model_interface.h"
 #include "raii_wrapper.h"
+#include "model.h"
 #include "macros.h"
 #include <algorithm>
 #include <deque>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <unordered_map>
 #include <math.h>
 
 {{ function_decl }}
 
-#define CHECK_VECTOR_ACCESS(vector, idx)                                  \\
-  if (idx >= vector.size()) {                                             \\
-    throw std::out_of_range(                                              \\
-        "[__func__]: index out of range, " #vector ".size()=" +           \\
-        std::to_string(vector.size()) + ", got " + std::to_string(idx));  \\
-  }
-
 namespace ait {
 namespace {
 void DeviceCheckLastError(const char* file, int line) {
@@ -57,7 +53,6 @@
   }
 }
 
-thread_local bool target_has_graph_mode = {{ target_has_graph_mode }};
 } // namespace
 
 // Model is the class that actually performs inference. It owns memory for
@@ -66,75 +61,32 @@
 // by the user.
 // Once an inference run has started, it is not safe to re-use the Model
 // until the run has finished!
-class Model {
+class Model : public ModelBase<Model> {
   public:
-  Model(
-      size_t blob_size,
-      size_t workspace_size,
-      size_t num_inputs,
-      size_t num_outputs,
-      size_t num_unbound_constants,
-      uint8_t* constants,
-      AITemplateAllocator& allocator)
-      : blob_(RAII_DeviceMalloc(blob_size, allocator)),
-        workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
-        params_(num_inputs + num_outputs + num_unbound_constants),
-        num_inputs_(num_inputs),
-        num_outputs_(num_outputs),
-        constants_(constants) {
-      dmlc::InitLogging("aitemplate"); // TODO(xxx): render network name
-      LOG(INFO) << "Init AITemplate Runtime.";
-      global_workspace_ = static_cast<uint8_t*>(workspace_.get()) + {{ unique_workspace_size }};
-      unique_workspace_ = static_cast<uint8_t*>(workspace_.get());
-      DEVICE_CHECK(GetDevice(&device_idx_))
-      DEVICE_CHECK(CreateEvent(&run_finished_));
-#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-      DEVICE_CHECK(cudaDeviceGetAttribute(
-        &max_smem_size_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx_));
-#endif
-      DEVICE_CHECK(GetDeviceProperties(&device_properties_, device_idx_));
-      DEVICE_CHECK(StreamCreate(&graph_capture_stream_, /*non_blocking=*/true));
-  InitConstants(constants_);
-      auto* blob_ptr = static_cast<uint8_t*>(blob_.get());
-  {{ tensor_slice }}
-  {{ tensor_map_set }}
-    }
-
-    ~Model() {
-      if (run_finished_ != nullptr) {
-        DestroyEvent(run_finished_);
+    Model(
+        size_t blob_size,
+        size_t workspace_size,
+        size_t unique_workspace_size,
+        size_t num_inputs,
+        size_t num_outputs,
+        size_t num_unbound_constants,
+        uint8_t* constants,
+        AITemplateAllocator& allocator)
+        : ModelBase(
+            blob_size,
+            workspace_size,
+            unique_workspace_size,
+            num_inputs,
+            num_outputs,
+            num_unbound_constants,
+            constants,
+            allocator) {
+    {{ set_up_constants }}
+    auto* blob_ptr = static_cast<uint8_t*>(blob_.get());
+    {{ tensor_slice }}
+    {{ tensor_map_set }}
+    {{ set_up_param_dynamic_shapes }}
       }
-      if (graph_capture_stream_ != nullptr) {
-        StreamDestroy(graph_capture_stream_);
-      }
-      if (graph_exec_ != nullptr) {
-        GraphExecDestroy(graph_exec_);
-      }
-    }
-
-    Model(Model&& other) {
-      run_finished_ = other.run_finished_;
-      graph_exec_ = other.graph_exec_;
-      graph_capture_stream_ = other.graph_capture_stream_;
-      other.run_finished_ = nullptr;
-      other.graph_exec_ = nullptr;
-      other.graph_capture_stream_ = nullptr;
-
-      constants_ = other.constants_;
-      num_inputs_ = other.num_inputs_;
-      global_workspace_ = other.global_workspace_;
-      unique_workspace_ = other.unique_workspace_;
-      workspace_ = std::move(other.workspace_);
-
-      params_ = std::move(other.params_);
-      constant_name_to_ptr_ = std::move(other.constant_name_to_ptr_);
-      // Re-wire the pointers in the above 2 structures.
-      InitConstants(constants_);
-    }
-
-    Model& operator=(Model&&) = delete;
-    Model(const Model&) = delete;
-    Model& operator=(const Model&) = delete;
 
     void SetUpInputsOutputs() {
         {{ set_inputs }}
@@ -143,17 +95,11 @@ class Model {
     void DeviceToDeviceCopies(StreamType stream) {
   {{ device_to_device_copies }}
     }
-    void Run(StreamType stream, bool graph_mode) {
-      SetUpInputsOutputs();
-      if (target_has_graph_mode && graph_mode) {
-        RunAsGraph(stream);
-      } else {
-        RunImpl(stream);
-      }
-      DEVICE_CHECK(EventRecord(run_finished_, stream));
-    }
 
     void RunImpl(StreamType stream) {
+        {% if profiler_annotation %}
+        RAII_ProfilerRange _raiiAITProfilerRange("main_start");
+        {% endif %}
   {% for func in function_seq %}
   {{ func }}
       DeviceCheckLastError(__FILE__, __LINE__);
@@ -161,208 +107,58 @@ class Model {
       DeviceToDeviceCopies(stream);
     }
 
-    bool IsPending() {
-      auto query = QueryEvent(run_finished_);
-      if (query == GetDeviceNotReady()) {
-        return true;
+    void ProfileImpl(StreamType stream, size_t iters, const std::string& filename) {
+      std::ofstream ss(filename);
+      if (!ss) {
+        throw std::runtime_error(std::string("Could not open file ") + filename);
       }
-      if (query != GetDeviceSuccess()) {
-        LOG(WARNING) << "Pending model run did not finish successfully. Error: "
-                    << GetErrorString(query);
-      }
-      return false;
-    }
-
-    void WaitForCompletion() {
-      DEVICE_CHECK(EventSynchronize(run_finished_));
-    }
-
-    size_t NumInputs() const {
-      return num_inputs_;
-    }
-
-    size_t NumOutputs() const {
-      return num_outputs_;
-    }
-
-    void SetParam(const void* src, size_t param_idx) {
-      CHECK_VECTOR_ACCESS(params_, param_idx)
-      // const_cast is not ideal here, but it is unfortunately
-      // necessary:
-      // 1) We store outputs and inputs in the same vector,
-      //    and outputs cannot be const.
-      // 2) Most of the codegen is not const-correct (most ops
-      //    require non-const pointers). So even if we put const
-      //    pointers into params, a const_cast would be required
-      //    somewhere else.
-      params_[param_idx].ptr = const_cast<void*>(src);
-    }
-
-    void SetInput(const void* src, const AITemplateParamShape& shape, size_t idx) {
-      SetInputShape(shape, idx);
-      SetParam(src, idx);
-    }
-
-    void SetOutput(void* src, size_t idx) {
-      SetParam(src, idx + num_inputs_);
-    }
-
-    // Write the (possibly dynamic) output shape to the given pointer.
-    // Note that this should be called _after_ the shape inference in
-    // Run() is finished. output_shape_out should be able to store
-    // at least GetOutputMaximumShape(idx).size values.
-    void GetOutputShape(size_t idx, int64_t* output_shape_out) {
-      const auto param_idx = idx + num_inputs_;
-      CHECK_VECTOR_ACCESS(params_, param_idx);
-      const auto& shape_ptrs = params_[param_idx].shape_ptrs;
-      for (size_t i = 0; i < shape_ptrs.size(); ++i) {
-        output_shape_out[i] = shape_ptrs[i].GetValue();
-      }
-    }
-
-    void SetConstant(const char* name, const void* src) {
-      auto it = constant_name_to_ptr_.find(name);
-      if (it == constant_name_to_ptr_.end()) {
-        throw std::out_of_range(std::string("Could not find constant ") + name);
-      }
-      const void** ptr = it->second;
-      *ptr = src;
-    }
-
-  private:
-    void InitConstants(uint8_t* constants) {
-      {{ set_up_constants }}
-      {{ set_up_param_dynamic_shapes }}
-    }
-
-    void SetInputShape(const AITemplateParamShape& shape, size_t idx) {
-      auto& param = params_[idx];
-      if (shape.size != param.shape_ptrs.size()) {
-        throw std::runtime_error(
-          "[SetInputShape] Got wrong param shape for input " + std::to_string(idx) +
-          "; expected " + std::to_string(param.shape_ptrs.size()) + ", got " +
-          std::to_string(shape.size));
-      }
-      for (size_t i = 0; i < param.shape_ptrs.size(); ++i) {
-        param.shape_ptrs[i].SetValue(shape.shape_data[i]);
-      }
-    }
-
-    DeviceError EndCapture(GraphType* graph_ptr) {
-      auto err = StreamEndCapture(graph_capture_stream_, graph_ptr);
-      if (err != GetDeviceSuccess()) {
-        // If we can't take the stream out of capture mode, something is probably
-        // wrong with CUDA graph for this model (e.g. there might have been an
-        // illegal capture mode operation). Disable graph mode to avoid such issues
-        // in future iterations.
-        target_has_graph_mode = false;
-        LOG(WARNING) << "Graph capture failed to end. Disabling graph mode.";
-        return err;
-      }
-      return GetDeviceSuccess();
-    }
-
-    void RunAsGraph(StreamType stream) {
-      DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
-      try {
-        RunImpl(graph_capture_stream_);
-      } catch (...) {
-        GraphType graph;
-        // No need to DEVICE_CHECK here, we want to see the original exception.
-        EndCapture(&graph);
-        if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
-          LOG(WARNING) << "Graph destruction failed while handling exception! Memory will be leaked.";
+      ss << "{\\n";
+      {% for func_name, func in function_pair_seq %}
+      {
+        std::cout << "Profiling: " << "{{ func_name }}" << " (" << iters << " iterations)" << std::endl;
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+        cudaEventRecord(start);
+        for (size_t i = 0; i < iters; ++i) {
+            {{ func }}
+          DeviceCheckLastError(__FILE__, __LINE__);
         }
-        throw;
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+        float milliseconds = 0.0;
+        cudaEventElapsedTime(&milliseconds, start, stop);
+        ss << "\\"" << "{{ func_name }}" << "\\": " <<  (milliseconds/iters);
+        {% if loop.last %}
+          ss << "\\n";
+        {% else %}
+          ss << ",\\n";
+        {% endif %}
       }
+      {% endfor %}
+      ss << "}\\n";
 
-      // The following function ends the capture and creates a graph
-      // inside a unique_ptr that cleans up it when it goes out of scope.
-      // Note that it throws an exception if EndCapture fails.
-      auto graph = RAII_EndCaptureAndCreateGraph(
-        [this](GraphType* graph_ptr){ return EndCapture(graph_ptr); }
+      DeviceToDeviceCopies(stream);
+      std::cout << "AIT per op profiling finished." << std::endl;
+    }
+
+    static std::unique_ptr<Model> Create(
+      AITemplateAllocator& allocator,
+      uint8_t* constants
+    ) {
+      return std::make_unique<Model>(
+          {{ blob_size }},
+          {{ workspace_size }},
+          {{ unique_workspace_size }},
+          {{ num_inputs }},
+          {{ num_outputs }},
+          {{ num_unbound_constants }},
+          constants,
+          allocator
       );
-
-      if (graph_exec_ == nullptr) {
-        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
-      } else if (GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
-        // Consume the last cuda error, which may affect the next GraphExecLaunch
-        // call.
-        GetLastError();
-        DEVICE_CHECK(GraphExecDestroy(graph_exec_));
-        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
-      }
-
-      DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
     }
 
-    int device_idx_;
-    int max_smem_size_{0};
-    DevicePropertyType device_properties_;
-    // This event tracks when the inference is finished
-    // so that this Model may be reclaimed by its owning
-    // ModelContainer.
-    EventType run_finished_;
-    // A blob of memory used for storing intermediate tensors.
-    GPUPtr blob_;
-    // Memory for constants that were folded into the *.so. Unowned by Model,
-    // owned by ModelContainer.
-    // TODO: make this const. It can't be const right now because we derive
-    // tensor pointers from it, and no tensor pointers are const.
-    uint8_t* constants_;
-    size_t num_inputs_;
-    size_t num_outputs_;
-
-    // The workspace blob is used as scratch memory. See
-    // _generate_workspace in memory planning for more information.
-    GPUPtr workspace_;
-    uint8_t* global_workspace_{nullptr};
-    uint8_t* unique_workspace_{nullptr};
-
-    class ParamDim {
-      public:
-        ParamDim(int64_t lower_bound, int64_t upper_bound, int64_t* value) :
-          lower_bound_(lower_bound),
-          upper_bound_(upper_bound),
-          value_(value) {}
-
-        void SetValue(int64_t new_value) {
-          if (new_value < lower_bound_ || new_value > upper_bound_) {
-            throw std::out_of_range(
-              "[SetValue] Dimension got value out of bounds; expected value to be in [" +
-              std::to_string(lower_bound_) + ", " + std::to_string(upper_bound_) + "], but got " +
-              std::to_string(new_value)
-            );
-          }
-          *value_ = new_value;
-        }
-
-        int64_t GetValue() const {
-          return *value_;
-        }
-
-      private:
-        int64_t lower_bound_;
-        int64_t upper_bound_;
-        int64_t* value_;
-    };
-
-    struct ParamInfo {
-      void* ptr = nullptr;
-      // TODO add offset
-      const char* name;
-      std::vector<ParamDim> shape_ptrs;
-    };
-
-    // Contains info for all tensors marked as inputs
-    // or outputs. The first num_inputs elements are the inputs.
-    // Constants are not included.
-    std::vector<ParamInfo> params_;
-
-    GraphExecType graph_exec_ = nullptr;
-    StreamType graph_capture_stream_;
-
-    std::unordered_map<std::string, const void**> constant_name_to_ptr_;
+  private:
 {{ tensor_decl }}
 {{ dim_decl }}
 {{ function_state }}
@@ -424,7 +220,7 @@ class ParamDim {
 
 ModelContainer* CreateModelContainer(size_t num_runtimes, AITemplateAllocator& allocator) {
   // num_runtimes, blob_size, workspace_size, num_inputs, num_outputs, num_unbound_constants, param_size, allocator
-  return new ModelContainer(num_runtimes, {{blob_size}}, {{workspace_size}}, {{num_inputs}}, {{num_outputs}}, {{num_unbound_constants}}, {{param_size}}, allocator);
+  return new ModelContainer(num_runtimes, {{num_inputs}}, {{num_outputs}}, {{num_unbound_constants}}, {{param_size}}, allocator);
 }
 } // namespace ait
 """
diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 4d8cd5e36..2e027a59c 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -16,17 +16,19 @@
 SQLite backend for conv/gemm profiling cache
 """
 import enum
+import logging
 import sqlite3
 
 from typing import Any, Dict, Tuple
 
 import jinja2
 
-from ..utils import logger
-
 # pylint: disable=W0613
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class CacheMode(enum.Enum):
     r"""Enum for cache mode
 
@@ -131,7 +133,7 @@ class CacheMode(enum.Enum):
 
 CONV_INIT_TEMPLATE = jinja2.Template(
     """
- CREATE TABLE IF NOT EXISTS {{dev}}_conv (
+ CREATE TABLE IF NOT EXISTS {{dev}}_conv_{{version}} (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
   exec_entry VARCHAR(8192) NOT NULL,
   exec_entry_sha1 VARCHAR(64) NOT NULL,
@@ -164,7 +166,7 @@ class CacheMode(enum.Enum):
 CONV_QUERY_TEMPLATE = jinja2.Template(
     """
 SELECT algo, workspace
-FROM {{dev}}_conv
+FROM {{dev}}_conv_{{version}}
 WHERE
 dtype_a={{dtype_a}} AND
 dtype_b={{dtype_b}} AND
@@ -189,7 +191,7 @@ class CacheMode(enum.Enum):
 
 CONV_INSERT_TEMPLATE = jinja2.Template(
     """
-INSERT INTO {{dev}}_conv (
+INSERT INTO {{dev}}_conv_{{version}} (
     exec_entry,
     exec_entry_sha1,
     dtype_a,
@@ -240,7 +242,7 @@ class CacheMode(enum.Enum):
 
 CONV3D_INIT_TEMPLATE = jinja2.Template(
     """
- CREATE TABLE IF NOT EXISTS {{dev}}_conv3d (
+ CREATE TABLE IF NOT EXISTS {{dev}}_conv3d_{{version}} (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
   exec_entry VARCHAR(8192) NOT NULL,
   exec_entry_sha1 VARCHAR(64) NOT NULL,
@@ -280,7 +282,7 @@ class CacheMode(enum.Enum):
 CONV3D_QUERY_TEMPLATE = jinja2.Template(
     """
 SELECT algo, workspace
-FROM {{dev}}_conv3d
+FROM {{dev}}_conv3d_{{version}}
 WHERE
 dtype_a={{dtype_a}} AND
 dtype_b={{dtype_b}} AND
@@ -312,7 +314,7 @@ class CacheMode(enum.Enum):
 
 CONV3D_INSERT_TEMPLATE = jinja2.Template(
     """
-INSERT INTO {{dev}}_conv3d (
+INSERT INTO {{dev}}_conv3d_{{version}} (
     exec_entry,
     exec_entry_sha1,
     dtype_a,
@@ -489,8 +491,10 @@ def __init__(
         #     We could choose the old working version upon rollback, but we might
         #     leave some content from the failing version in the db. How are we
         #     going to update the db if we update the version again, and so on.
-        # TODO: add similar version control for conv and norm
+        # TODO: add similar version control for norm
         self._gemm_cache_version = 1
+        self._conv_cache_version = 1
+        self._conv3d_cache_version = 1
         if uri is not None:
             self._mode = CacheMode.REMOTE
         if self._mode == CacheMode.LOCAL:
@@ -508,33 +512,109 @@ def _init_db(self):
         self._create_conv3d_table()
         self._create_norm_table()
 
-    def get_profile_gemm_cache_version(self) -> int:
+    @property
+    def gemm_cache_version(self) -> int:
         return self._gemm_cache_version
 
+    @property
+    def conv_cache_version(self) -> int:
+        return self._conv_cache_version
+
+    @property
+    def conv3d_cache_version(self) -> int:
+        return self._conv3d_cache_version
+
     def _create_gemm_table(self):
         """Creates gemm table."""
-        if not self._gemm_table_version_matches():
-            logger.info(__name__, "temporarily keep old cache versions")
+        version = self.gemm_cache_version
+        if not self._table_exists("gemm", version):
+            _LOGGER.info(
+                "Temporarily keeping the old gemm cache versions if exist",
+            )
             # FIXME: will delete unmatched version once we get into production
             # self._delete_existing_table("gemm")
 
-        logger.info(
-            __name__,
-            f"Trying to make a new gemm table with {self._gemm_cache_version=}",
-        )
-        sql = GEMM_INIT_TEMPLATE.render(
-            dev=self._target, version=self._gemm_cache_version
-        )
+            _LOGGER.info(
+                f"Creating a new gemm table with {version=}",
+            )
+            sql = GEMM_INIT_TEMPLATE.render(
+                dev=self._target,
+                version=version,
+            )
+            self._cur.execute(sql)
+            self._con.commit()
+
+    def _create_conv_table(self):
+        """Creates conv table."""
+        version = self.conv_cache_version
+        if not self._table_exists("conv", version):
+            _LOGGER.info(
+                "Temporarily keeping the old conv cache versions if exist",
+            )
+            # FIXME: will delete unmatched version once we get into production
+            # self._delete_existing_table("conv")
+
+            _LOGGER.info(
+                f"Creating a new conv table with {version=}",
+            )
+            sql = CONV_INIT_TEMPLATE.render(
+                dev=self._target,
+                version=version,
+            )
+            self._cur.execute(sql)
+            self._con.commit()
+
+    def _create_conv3d_table(self):
+        """Creates conv3d table."""
+        version = self.conv3d_cache_version
+        if not self._table_exists("conv3d", version):
+            _LOGGER.info(
+                "Temporarily keeping the old conv3d cache versions if exist",
+            )
+            # FIXME: will delete unmatched version once we get into production
+            # self._delete_existing_table("conv3d")
+
+            _LOGGER.info(
+                f"Creating a new conv3d table with {version=}",
+            )
+            sql = CONV3D_INIT_TEMPLATE.render(
+                dev=self._target,
+                version=version,
+            )
+            self._cur.execute(sql)
+            self._con.commit()
+
+    def _create_norm_table(self):
+        """Creates conv table."""
+        sql = NORM_INIT_TEMPLATE.render(dev=self._target)
         self._cur.execute(sql)
         self._con.commit()
 
+    def _table_exists(self, table_kind, cache_version):
+        """Check if the table of given kind and cache version exists."""
+        table_name = f"{self._target}_{table_kind}_{cache_version}"
+        sql = CHECK_TABLE_EXISTENCE_TEMPLATE.render(table_name=table_name)
+        self._cur.execute(sql)
+        tables = self._cur.fetchall()
+
+        if tables:
+            _LOGGER.info(
+                f"{table_name=} exists in the db",
+            )
+            return True
+        else:
+            _LOGGER.info(
+                f"{table_name=} does not exist in the db, possible version mismatch!",
+            )
+            return False
+
     def _delete_existing_table(self, table_kind):
         """Delete an existing table in the db"""
         sql = QUERY_ALL_TABLES_TEMPLATE.render()
         self._cur.execute(sql)
         all_tables = self._cur.fetchall()
         if len(all_tables) == 0:
-            logger.info(__name__, "deleting table: skip empty table")
+            _LOGGER.info("deleting table: skip empty table")
             return
 
         target_tables = [
@@ -547,45 +627,9 @@ def _delete_existing_table(self, table_kind):
         assert (
             len(target_tables) == 1
         ), f"expected only one {table_kind} table but got {target_tables=}"
-        logger.info(__name__, f"deleting table {target_tables[0]=}")
+        _LOGGER.info(f"deleting table {target_tables[0]=}")
         self._cur.execute(f"DROP TABLE {target_tables[0]}")
 
-    def _create_conv_table(self):
-        """Creates conv table."""
-        sql = CONV_INIT_TEMPLATE.render(dev=self._target)
-        self._cur.execute(sql)
-        self._con.commit()
-
-    def _create_conv3d_table(self):
-        """Creates conv3d table."""
-        sql = CONV3D_INIT_TEMPLATE.render(dev=self._target)
-        self._cur.execute(sql)
-        self._con.commit()
-
-    def _create_norm_table(self):
-        """Creates conv table."""
-        sql = NORM_INIT_TEMPLATE.render(dev=self._target)
-        self._cur.execute(sql)
-        self._con.commit()
-
-    def _if_table_exists(self, table_name):
-        """check if a table exists"""
-        sql = CHECK_TABLE_EXISTENCE_TEMPLATE.render(table_name=table_name)
-        self._cur.execute(sql)
-        tables = self._cur.fetchall()
-        return len(tables) > 0
-
-    def _gemm_table_version_matches(self):
-        table_name = f"{self._target}_gemm_{self._gemm_cache_version}"
-        if self._if_table_exists(table_name):
-            logger.info(__name__, f"{table_name=} exists in the db")
-            return True
-        else:
-            logger.info(
-                __name__, f"{table_name=} does not exist in the db, version mismatch!"
-            )
-            return False
-
     def _query(self, sql: str) -> Tuple[str, int]:
         """a function to query op from cache
 
@@ -625,7 +669,9 @@ def query_gemm(self, args: Dict[str, Any]) -> Tuple[str, int]:
             profiling results
         """
         sql = GEMM_QUERY_TEMPLATE.render(
-            dev=self._target, version=self._gemm_cache_version, **args
+            dev=self._target,
+            version=self.gemm_cache_version,
+            **args,
         )
         return self._query(sql)
 
@@ -643,7 +689,11 @@ def query_conv(self, args: Dict[str, Any]) -> Tuple[str, int]:
         Tuple
             profiling results
         """
-        sql = CONV_QUERY_TEMPLATE.render(dev=self._target, **args)
+        sql = CONV_QUERY_TEMPLATE.render(
+            dev=self._target,
+            version=self.conv_cache_version,
+            **args,
+        )
         return self._query(sql)
 
     def query_conv3d(self, args: Dict[str, Any]) -> Tuple[str, int]:
@@ -660,7 +710,11 @@ def query_conv3d(self, args: Dict[str, Any]) -> Tuple[str, int]:
         Tuple
             profiling results
         """
-        sql = CONV3D_QUERY_TEMPLATE.render(dev=self._target, **args)
+        sql = CONV3D_QUERY_TEMPLATE.render(
+            dev=self._target,
+            version=self.conv3d_cache_version,
+            **args,
+        )
         return self._query(sql)
 
     def query_normalization(self, args: Dict[str, Any]) -> Tuple[str, int]:
@@ -696,7 +750,7 @@ def _insert(self, query_sql: str, insert_sql: str) -> None:
                 self._cur.execute(insert_sql)
                 self._db_commit_flag = True
             else:
-                logger.info(__name__, "Ignore repeat profile_record: " + query_sql)
+                _LOGGER.info("Ignore repeat profile_record: " + query_sql)
 
     def insert_gemm(self, args: Dict[str, Any]) -> None:
         """a function to insert gemm op epilogue into cache
@@ -708,7 +762,7 @@ def insert_gemm(self, args: Dict[str, Any]) -> None:
         """
         query_sql = GEMM_QUERY_TEMPLATE.render(
             dev=self._target,
-            version=self._gemm_cache_version,
+            version=self.gemm_cache_version,
             dtype_a=args["dtype_a"],
             dtype_b=args["dtype_b"],
             dtype_c=args["dtype_c"],
@@ -724,7 +778,9 @@ def insert_gemm(self, args: Dict[str, Any]) -> None:
             exec_entry_sha1=args["exec_entry_sha1"],
         )
         insert_sql = GEMM_INSERT_TEMPLATE.render(
-            dev=self._target, version=self._gemm_cache_version, **args
+            dev=self._target,
+            version=self.gemm_cache_version,
+            **args,
         )
         self._insert(query_sql, insert_sql)
 
@@ -740,6 +796,7 @@ def insert_conv(self, args: Dict[str, Any]) -> None:
         """
         query_sql = CONV_QUERY_TEMPLATE.render(
             dev=self._target,
+            version=self.conv_cache_version,
             dtype_a=args["dtype_a"],
             dtype_b=args["dtype_b"],
             dtype_c=args["dtype_c"],
@@ -759,7 +816,11 @@ def insert_conv(self, args: Dict[str, Any]) -> None:
             split_k=args["split_k"],
             exec_entry_sha1=args["exec_entry_sha1"],
         )
-        insert_sql = CONV_INSERT_TEMPLATE.render(dev=self._target, **args)
+        insert_sql = CONV_INSERT_TEMPLATE.render(
+            dev=self._target,
+            version=self.conv_cache_version,
+            **args,
+        )
         self._insert(query_sql, insert_sql)
 
     def insert_conv3d(self, args: Dict[str, Any]) -> None:
@@ -774,6 +835,7 @@ def insert_conv3d(self, args: Dict[str, Any]) -> None:
         """
         query_sql = CONV3D_QUERY_TEMPLATE.render(
             dev=self._target,
+            version=self.conv3d_cache_version,
             dtype_a=args["dtype_a"],
             dtype_b=args["dtype_b"],
             dtype_c=args["dtype_c"],
@@ -800,7 +862,11 @@ def insert_conv3d(self, args: Dict[str, Any]) -> None:
             split_k=args["split_k"],
             exec_entry_sha1=args["exec_entry_sha1"],
         )
-        insert_sql = CONV3D_INSERT_TEMPLATE.render(dev=self._target, **args)
+        insert_sql = CONV3D_INSERT_TEMPLATE.render(
+            dev=self._target,
+            version=self.conv3d_cache_version,
+            **args,
+        )
         self._insert(query_sql, insert_sql)
 
     def insert_normalization(self, args: Dict[str, Any]) -> None:
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index cdc45a6a9..1be2adae7 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -17,7 +17,8 @@
 """
 from __future__ import annotations
 
-import concurrent
+import concurrent.futures
+import logging
 import os
 
 import re
@@ -27,12 +28,15 @@
 from queue import Queue
 from typing import Callable, List, Tuple
 
-from ..utils import logger
 from .target import Target
 from .task_runner import BaseRunner, Task
 
 # pylint: disable=W0221
 
+
+_LOGGER = logging.getLogger(__name__)
+
+
 PROF_RUNTIME_PATTERN = re.compile(r"OP:([a-zA-Z0-9_]+),TIME:([\d\.]+),WS:([\d]+)")
 # FIXME: We will remove the following two patterns once we implement the
 # same profiling mechanism as gemm for conv and amd
@@ -48,27 +52,50 @@ def optimization_key(result):
     return float(result[1])
 
 
-def extract_profile_result(stdout) -> Tuple[ProfileResult, bool]:
+def extract_profile_result(
+    stdout,
+    return_ops=None,
+) -> Tuple[ProfileResult | List[ProfileResult], bool]:
     failed = False
     try:
         runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
         if len(runtimes) > 0:
-            logger.debug(__name__, f"all runtimes (unsorted): {runtimes}")
+            _LOGGER.debug(f"all runtimes (unsorted): {runtimes}")
             # format - OP:xx,TIME:x.xx,WS:xx
-            best_runtime = min(runtimes, key=optimization_key)
-            op_config = best_runtime[0]
-            duration = float(best_runtime[1])
-            workspace = int(best_runtime[2])
+            if return_ops is not None:
+                _LOGGER.debug(f"return ops: {return_ops}")
+                return_ops = set(return_ops)
+                result = [
+                    ProfileResult(
+                        op_config=runtime[0],
+                        duration=float(runtime[1]),
+                        workspace=int(runtime[2]),
+                    )
+                    for runtime in runtimes
+                    if runtime[0] in return_ops
+                ]
+            else:
+                best_runtime = min(runtimes, key=optimization_key)
+                result = ProfileResult(
+                    op_config=best_runtime[0],
+                    duration=float(best_runtime[1]),
+                    workspace=int(best_runtime[2]),
+                )
         else:
             # FIXME: remove it once we unify our profiling mechanism for conv and amd
-            op_config = ""
-            duration = float(RUNTIME_PATTERN.findall(stdout)[0])
-            workspace = int(WORKSPACE_PATTERN.findall(stdout)[0])
+            result = ProfileResult(
+                op_config="",
+                duration=float(RUNTIME_PATTERN.findall(stdout)[0]),
+                workspace=int(WORKSPACE_PATTERN.findall(stdout)[0]),
+            )
     except Exception:
-        duration = 0
-        workspace = 0
+        result = ProfileResult(
+            op_config="",
+            duration=0,
+            workspace=0,
+        )
         failed = True
-    return ProfileResult(op_config, duration, workspace), failed
+    return result, failed
 
 
 def update_inplace(d, new_d):
@@ -96,8 +123,7 @@ def process_task(task: Task) -> None:
         if not single_file_profiler:
             task._failed = True
             return
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             "Failed: [{name}][{algo}]\ncmd:\n{cmd}\nstderr:\n{stderr}".format(
                 name=task._name,
                 algo=task._idx,
@@ -105,13 +131,19 @@ def process_task(task: Task) -> None:
                 stderr=stderr,
             ),
         )
-    task._ret, task._failed = extract_profile_result(stdout)
+    task._ret, task._failed = extract_profile_result(
+        stdout=stdout,
+        return_ops=task._kwargs.get("return_ops", None),
+    )
     if not task._failed:
-        logger.debug(
-            __name__,
-            f"Successful: [{task._name}][{task._idx}]: OP: {task._ret.op_config} "
-            f"TIME: {task._ret.duration} WS:{task._ret.workspace}",
-        )
+        results = task._ret
+        if not isinstance(results, list):
+            results = [results]
+        for result in results:
+            _LOGGER.debug(
+                f"Successful: [{task._name}][{task._idx}]: OP: {result.op_config} "
+                f"TIME: {result.duration} WS:{result.workspace}",
+            )
 
 
 def process_return(task: Task) -> typing.Tuple[typing.Union[int, str], ProfileResult]:
@@ -136,15 +168,13 @@ class Runner(BaseRunner):
     """
 
     def __init__(self, devs: list[int], op_name: str, timeout: int = 30):
-        logger.info(
-            __name__, "Using {n} GPU for profiling {op}".format(n=len(devs), op=op_name)
-        )
+        _LOGGER.info("Using {n} GPU for profiling {op}".format(n=len(devs), op=op_name))
         super().__init__(devs, op_name, timeout)
         self._dev_flag = Target.current().dev_select_flag()
         self._ftask_proc = process_task
         self._fret_proc = process_return
 
-    def push(self, idx: typing.Union[int, str], cmd: str):
+    def push(self, idx: typing.Union[int, str], cmd: str, return_ops: List[str] = None):
         """Push a new profiling task into runner's queue
 
         Parameters
@@ -153,8 +183,20 @@ def push(self, idx: typing.Union[int, str], cmd: str):
             Profiling task id (usually is algorithm id or name)
         cmd : str
             Bash command to execute the profiling task
+        return_ops : list[str]
+            Names of the ops to return the profiling results for. If specified,
+            instead of a single (best) ProfileResult instance, a list with the
+            ProfileResults for each op in the return_ops is returned from `pull`.
         """
-        self._queue.append(Task(idx, cmd, self._tag, dev_flag=self._dev_flag))
+        self._queue.append(
+            Task(
+                idx,
+                cmd,
+                self._tag,
+                dev_flag=self._dev_flag,
+                return_ops=return_ops,
+            )
+        )
 
     def pull(self):
         """Pull results from all profiling tasks assigned to runner.
@@ -171,7 +213,7 @@ def pull(self):
 def run_task(cmds, queue, dev_select_flag):
     # get device or block until one is available
     device = queue.get()
-    logger.debug(__name__, f"running profiler {cmds=} on GPU #{device}")
+    _LOGGER.debug(f"running profiler {cmds=} on GPU #{device}")
 
     completed_process = subprocess.run(
         cmds,
@@ -213,7 +255,7 @@ def __init__(self, devices: List[str], timeout: int, postprocessing_delegate):
         self._done_queue = Queue()
         for d in devices:
             self._device_queue.put(str(d))
-        logger.info(__name__, f"Initialized profiler runner with devices: {devices}")
+        _LOGGER.info(f"Initialized profiler runner with devices: {devices}")
         self._timeout = timeout
         self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(devices))
         self._futures = []
@@ -245,18 +287,21 @@ def push(self, cmds: List[str], process_result_callback: Callable):
         # they are launched asynchronously, in a separate thread,
         # some time after a future holding profiler result completes
         def callback_when_done(fut):
+            stdout = None
+            stderr = None
             try:
                 stdout, stderr = fut.result()
                 profile_result, err = extract_profile_result(stdout)
                 if err:
-                    logger.error(
-                        f"Profiler failure!\nProfiler stdout: {stdout}\nProfiler stderr: {stderr}"
+                    _LOGGER.error(
+                        f"Profiler failure!\nProfiler stdout: {stdout}\nProfiler stderr: {stderr}",
                     )
                     raise RuntimeError(f"Failed to extract profiler result for {cmds}")
                 process_result_callback(profile_result, self._postprocessing_delegate)
             finally:
                 # unblock one future in `join()`
-                self._done_queue.put(stdout)
+                if stdout is not None:
+                    self._done_queue.put(stdout)
 
         future.add_done_callback(callback_when_done)
         self._futures.append(future)
diff --git a/python/aitemplate/backend/registry.py b/python/aitemplate/backend/registry.py
index 62f4a10ee..42e0675bc 100644
--- a/python/aitemplate/backend/registry.py
+++ b/python/aitemplate/backend/registry.py
@@ -55,7 +55,9 @@ def func(args):
         If same key is founded in registry, will raise a RuntimeError
     """
     if func_name in BACKEND_FUNCTIONS:
-        raise RuntimeError("{name} funcion has been registered.".format(name=func_name))
+        raise RuntimeError(
+            "{name} funcion has already been registered.".format(name=func_name)
+        )
 
     def _do_reg(func):
         BACKEND_FUNCTIONS[func_name] = func
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index ff4483f88..9a7b2dd83 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -18,6 +18,7 @@
 # pylint: disable=W0702,W0707,W0611,C0415
 
 import json
+import logging
 import os
 import re
 import shutil
@@ -26,14 +27,15 @@
 
 from aitemplate.backend.target import AIT_STATIC_FILES_PATH
 
-from ...utils import logger
-
 from .. import registry
 from ..target import COMPOSABLE_KERNEL_PATH, Target
 
 # pylint: disable=W0613
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class ROCM(Target):
     """ROCM target.
 
@@ -305,9 +307,7 @@ def __init__(
         convert_hippcc_json = parutil.get_file_path(
             os.path.join("aitemplate/testing", "convert_hipcc_cmd")
         )
-        logger.info(
-            __name__, f"Load the hipcc compile option from {convert_hippcc_json}"
-        )
+        _LOGGER.info(f"Load the hipcc compile option from {convert_hippcc_json}")
         with open(convert_hippcc_json, "r") as hipcc_options_json:
             self.hipcc_options_json = json.load(hipcc_options_json)
 
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index c5fe33e23..70203d4c8 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -22,6 +22,7 @@
     concatenate_tanh,
     dynamic_slice,
     permute021,
+    permute0213,
     permute102,
     permute210,
     slice_reshape_scatter,
diff --git a/python/aitemplate/backend/rocm/tensor/permute021.py b/python/aitemplate/backend/rocm/tensor/permute021.py
index 8dc0d1e40..df066ca78 100644
--- a/python/aitemplate/backend/rocm/tensor/permute021.py
+++ b/python/aitemplate/backend/rocm/tensor/permute021.py
@@ -30,7 +30,7 @@
 
 
 @registry.reg("rocm.permute021.gen_function")
-def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+def gen_function(func_attrs, template_path):
     """
     Parameters
     ----------
@@ -38,8 +38,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
 
     Returns
     -------
@@ -49,8 +47,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
     return permute021_common.gen_function(
         func_attrs,
         template_path,
-        shape_eval_template,
-        shape_save_template,
         Header_files,
         ROCMSpec(),
     )
diff --git a/python/aitemplate/backend/rocm/tensor/permute0213.py b/python/aitemplate/backend/rocm/tensor/permute0213.py
new file mode 100644
index 000000000..2fdde245d
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/permute0213.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+permute0213 for rocm
+"""
+
+from ... import registry
+from ...backend_spec import ROCMSpec
+from ...common.tensor import permute0213_common
+
+# pylint: disable=C0301,W0613,W0612
+
+Header_files = """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include "library/include/ck/library/utility/host_tensor.hpp"
+"""
+
+
+@registry.reg("rocm.permute0213.gen_function")
+def gen_function(func_attrs, template_path):
+    """
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Attributes from Operator
+    template_path : str
+        path to library used
+
+    Returns
+    -------
+    str
+        Source code for function generated.
+    """
+    return permute0213_common.gen_function(
+        func_attrs,
+        template_path,
+        Header_files,
+        ROCMSpec(),
+    )
+
+
+@registry.reg("rocm.permute0213.func_decl")
+def gen_function_decl(func_attrs):
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+
+    Returns
+    -------
+    str
+        Function declaration
+    """
+    return permute0213_common.gen_function_decl(func_attrs, ROCMSpec())
+
+
+@registry.reg("rocm.permute0213.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    """
+    Parameters
+    ----------
+    func_attrs : dict
+        Attributes from Operator
+    indent : str, optional
+        Indentation for function call template, by default "  "
+
+    Returns
+    -------
+    str
+        Driver code for invoking call
+    """
+    return permute0213_common.gen_function_call(func_attrs, ROCMSpec(), indent)
diff --git a/python/aitemplate/backend/rocm/tensor/permute102.py b/python/aitemplate/backend/rocm/tensor/permute102.py
index df6fd3e82..f5304897f 100644
--- a/python/aitemplate/backend/rocm/tensor/permute102.py
+++ b/python/aitemplate/backend/rocm/tensor/permute102.py
@@ -30,7 +30,7 @@
 
 
 @registry.reg("rocm.permute102.gen_function")
-def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+def gen_function(func_attrs, template_path):
     """
     Parameters
     ----------
@@ -38,8 +38,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         Attributes from Operator
     template_path : str
         path to library used
-    shape_eval_template : jinja template
-    shape_save_template : jinja template
 
     Returns
     -------
@@ -49,8 +47,6 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
     return permute102_common.gen_function(
         func_attrs,
         template_path,
-        shape_eval_template,
-        shape_save_template,
         Header_files,
         ROCMSpec(),
     )
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 057633c93..27559c7d5 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -15,6 +15,7 @@
 """
 Target object for AITemplate.
 """
+import logging
 import os
 import pathlib
 import shutil
@@ -22,10 +23,12 @@
 from enum import IntEnum
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from ..utils import logger
 from . import registry
 from .profiler_cache import ProfileCacheDB
 
+
+_LOGGER = logging.getLogger(__name__)
+
 _MYPATH = os.path.dirname(os.path.realpath(__file__))
 _3RDPARTY_PATH = os.path.normpath(os.path.join(_MYPATH, "..", "..", "..", "3rdparty"))
 _WHEEL_3RDPARTY_PATH = os.path.normpath(os.path.join(_MYPATH, "..", "3rdparty"))
@@ -294,9 +297,7 @@ def _get_cache_file_name(self) -> str:
     def _prepare_profile_cache_path(self) -> Optional[str]:
         """Prepare local profile cache for this target."""
         if self.use_dummy_profiling_results():
-            logger.info(
-                __name__, "Escape loading profile cache when using dummy profiling"
-            )
+            _LOGGER.info("Escape loading profile cache when using dummy profiling")
             return None
 
         prefix = None
@@ -309,10 +310,10 @@ def _prepare_profile_cache_path(self) -> Optional[str]:
         try:
             os.makedirs(prefix, exist_ok=True)
         except OSError as error:
-            logger.info(__name__, f"Cannot mkdir at {prefix} due to issue {error}")
+            _LOGGER.info(f"Cannot mkdir at {prefix} due to issue {error}")
             prefix = os.path.join(tempfile.mkdtemp(prefix="aitemplate_"), ".aitemplate")
             os.makedirs(prefix, exist_ok=True)
-            logger.info(__name__, f"mkdir at {prefix} instead")
+            _LOGGER.info(f"mkdir at {prefix} instead")
 
         cache_path = os.path.join(prefix, cache_file)
         flush_flag = os.environ.get("FLUSH_PROFILE_CACHE", "0")
@@ -326,7 +327,7 @@ def _load_profile_cache(self):
         if self._cache_path is None:
             return
 
-        logger.info(__name__, f"Loading profile cache from: {self._cache_path}")
+        _LOGGER.info(f"Loading profile cache from: {self._cache_path}")
         self._profile_cache = ProfileCacheDB(
             TargetType(self._target_type).name, path=self._cache_path
         )
@@ -355,7 +356,11 @@ def get_profile_cache_version(self, op_class: str) -> int:
         """
         # TODO: support conv and normalization
         if op_class == "gemm":
-            return self._profile_cache.get_profile_gemm_cache_version()
+            return self._profile_cache.gemm_cache_version
+        elif op_class == "conv":
+            return self._profile_cache.conv_cache_version
+        elif op_class == "conv3d":
+            return self._profile_cache.conv3d_cache_version
         raise NotImplementedError
 
     def query_profile_cache(self, op_class: str, args: str) -> Tuple[str]:
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 3d869ee89..f99436de3 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -15,6 +15,7 @@
 """
 build a test module from a tensor
 """
+import logging
 import os
 from datetime import datetime
 from typing import Dict, List, Optional, Union
@@ -22,7 +23,8 @@
 from aitemplate import backend, compiler
 from aitemplate.compiler.model import AITemplateAllocatorKind
 from aitemplate.compiler.transform.profile import elapsed_dt_sec
-from aitemplate.utils import graph_utils, logger
+from aitemplate.utils import graph_utils
+from aitemplate.utils.debug_settings import AITDebugSettings
 from aitemplate.utils.serialization.serdes_code import dump_program
 
 from .base import DynamicProfileStrategy, Tensor
@@ -32,6 +34,9 @@
 # pylint: disable=W0102
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def _validate_tensor_args(sorted_graph: List[Tensor], output_tensors: List[Tensor]):
     """
     Validate the user's desired output name -> index ordering.
@@ -78,10 +83,13 @@ def _verify_outputs_still_in_graph(sorted_graph: List[Tensor], outputs: List[Ten
     for tensor, was_seen in seen.items():
         if not was_seen:
             raise ValueError(
-                f"Output {tensor._attrs['name']} was not found in the graph after opitmizations."
+                f"Output {tensor} was not found in the graph after opitmizations."
             )
 
 
+_DEBUG_SETTINGS = AITDebugSettings()
+
+
 def compile_model(
     tensor: Union[Tensor, List[Tensor]],
     target: backend.target.Target,
@@ -94,9 +102,7 @@ def compile_model(
     profile_dir: str = None,
     constants: Optional[Dict[str, TorchTensor]] = None,
     allocator_kind: Optional[AITemplateAllocatorKind] = None,
-    check_all_nan_and_inf: bool = False,
-    check_all_outputs: bool = False,
-    dump_ait_to_py: Optional[str] = None,
+    debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
 ) -> Model:
     """Compiles a model and generates a .so file.
 
@@ -124,12 +130,8 @@ def compile_model(
         default, set to 2. Must be positive.
     allocator_kind: AITemplateAllocatorKind, optional
         The GPU allocator to use. If none is specified, use the default allocator.
-    check_all_nan_and_inf : bool, optional
-        Whether or not to check this tensor is nan or inf during runtime.
-    check_all_outputs : bool, optional
-        Whether or not to print this tensor's value out during runtime.
-    dump_ait_to_py: str, optional
-        The path where the AIT graph is dumped into a .py file.
+    debug_settings: AITDebugSettings
+        specify debug settings such as where to dump AITemplate model Python file, etc.
 
     Returns
     -------
@@ -149,8 +151,8 @@ def compile_model(
     test_dir = os.path.join(workdir, test_name)
     profile_dir = workdir if profile_dir is None else profile_dir
 
-    if dump_ait_to_py:
-        dump_program(tensor, dump_ait_to_py)
+    if debug_settings.dump_ait_to_py:
+        dump_program(tensor, debug_settings.dump_ait_to_py)
 
     if int(recompile) == 1:
         os.makedirs(test_dir, exist_ok=True)
@@ -183,9 +185,7 @@ def compile_model(
             start_t = datetime.now()
             graph = compiler.transform.optimize_graph(graph, test_dir)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "optimize_graph")
-            logger.info(
-                __name__, f"optimized graph elapsed time: {elapsed_dt_sec(start_t)}"
-            )
+            _LOGGER.info(f"optimized graph elapsed time: {elapsed_dt_sec(start_t)}")
 
             compiler.transform.mark_special_views(graph)
             compiler.transform.refine_graph(graph)
@@ -209,9 +209,7 @@ def compile_model(
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "constant_folding"
             )
-            logger.info(
-                __name__, f"folded constants elapsed time: {elapsed_dt_sec(start_t)}"
-            )
+            _LOGGER.info(f"folded constants elapsed time: {elapsed_dt_sec(start_t)}")
 
             _verify_outputs_still_in_graph(graph, output_tensors)
             (
@@ -244,16 +242,14 @@ def compile_model(
                 workdir,
                 output_tensors,
                 test_name,
-                check_all_nan_and_inf,
-                check_all_outputs,
+                debug_settings,
             )
             file_pairs.extend(main_pairs)
 
             start_t = datetime.now()
             compile_engine = backend.builder.Builder()
             compile_engine.make(file_pairs, dll_name, workdir, test_name)
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 f"compiled the final .so file elapsed time: {elapsed_dt_sec(start_t)}",
             )
 
diff --git a/python/aitemplate/compiler/dtype.py b/python/aitemplate/compiler/dtype.py
index 51b6c96d6..029ea4197 100644
--- a/python/aitemplate/compiler/dtype.py
+++ b/python/aitemplate/compiler/dtype.py
@@ -25,6 +25,7 @@
     "int": 4,
     "int32": 4,
     "int64": 8,
+    "bfloat16": 2,
 }
 
 
@@ -41,6 +42,7 @@
     "int32": 3,
     "int64": 4,
     "bool": 5,
+    "bfloat16": 6,
 }
 
 
@@ -130,7 +132,27 @@ def _impl(dtype):
             return "kLong"
         elif dtype == "bool":
             return "kBool"
+        elif dtype == "bfloat16":
+            return "kBFloat16"
         else:
             raise AssertionError(f"unknown dtype {dtype}")
 
     return f"AITemplateDtype::{_impl(dtype)}"
+
+
+def is_same_dtype(dtype1: str, dtype2: str) -> bool:
+    """Returns True if dtype1 and dtype2 are the same dtype and False otherwise.
+
+    Parameters
+    ----------
+    dtype1: str
+        A data type string.
+    dtype2: str
+        A data type string.
+
+    Returns
+    ----------
+    bool
+        whether dtype1 and dtype2 are the same dtype
+    """
+    return normalize_dtype(dtype1) == normalize_dtype(dtype2)
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index a7281f1d3..d0ffe94d3 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -159,32 +159,13 @@ class _DLLWrapper:
         def __init__(
             self,
             lib_path: str,
-            num_runtimes: int,
-            allocator_kind: Optional[AITemplateAllocatorKind],
         ):
             self.lib_path = lib_path
             self.DLL = ctypes.cdll.LoadLibrary(lib_path)
-
-            self.handle = ctypes.c_void_p()
-            self.allocator_handle = ctypes.c_void_p()
-            if allocator_kind is not None:
-                self.DLL.AITemplateAllocatorCreate(
-                    ctypes.byref(self.allocator_handle),
-                    ctypes.c_int(allocator_kind.value),
-                )
-
-            self.DLL.AITemplateModelContainerCreate(
-                ctypes.pointer(self.handle),
-                ctypes.c_size_t(num_runtimes),
-                self.allocator_handle,
-            )
             self.is_open = True
 
         def close(self):
             if self.is_open:
-                self.DLL.AITemplateModelContainerDelete(self.handle)
-                if self.allocator_handle:
-                    self.DLL.AITemplateAllocatorDelete(self.allocator_handle)
                 _dlclose(self.DLL)
                 self.is_open = False
 
@@ -221,12 +202,29 @@ def __init__(
         allocator_kind : AITemplateAllocatorKind, optional
             What type of allocator to use when allocating GPU memory.
         """
+        # Set of pointers allocated with numpy_to_ait_data.
+        # If the user forgets to free their data, we use this to
+        # avoid leaking memory.
+        self._allocated_ait_data = set()
+
         if num_runtimes <= 0:
             raise ValueError(f"num_runtimes must be positive, but got {num_runtimes}")
 
-        self.DLL = self._DLLWrapper(lib_path, num_runtimes, allocator_kind)
-        self.handle = self.DLL.handle
-        self.lib_path = self.DLL.lib_path
+        self.DLL = self._DLLWrapper(lib_path)
+        self.lib_path = lib_path
+        self.handle = ctypes.c_void_p()
+        self.allocator_handle = ctypes.c_void_p()
+        if allocator_kind is not None:
+            self.DLL.AITemplateAllocatorCreate(
+                ctypes.byref(self.allocator_handle),
+                ctypes.c_int(allocator_kind.value),
+            )
+
+        self.DLL.AITemplateModelContainerCreate(
+            ctypes.pointer(self.handle),
+            ctypes.c_size_t(num_runtimes),
+            self.allocator_handle,
+        )
 
         # We use this list to add reference counts of Torch tensors
         # to avoid lifetime issues caused by user misuse.
@@ -242,11 +240,6 @@ def __init__(
             for i in range(len(self._output_name_to_index))
         ]
 
-        # Set of pointers allocated with numpy_to_ait_data.
-        # If the user forgets to free their data, we use this to
-        # avoid leaking memory.
-        self._allocated_ait_data = set()
-
     def __enter__(self):
         return self
 
@@ -260,7 +253,19 @@ def close(self):
         # Copy to avoid set size changed during iteration
         for ptr in list(self._allocated_ait_data):
             self.free_gpu_memory(ptr, sync=True)
-        self.DLL.close()
+
+        # Check that it exists since we may have thrown
+        # an exception before initializing it.
+        if hasattr(self, "DLL"):
+            if self.handle:
+                self.DLL.AITemplateModelContainerDelete(self.handle)
+                self.handle = ctypes.c_void_p()
+
+            if self.allocator_handle:
+                self.DLL.AITemplateAllocatorDelete(self.allocator_handle)
+                self.allocator_handle = ctypes.c_void_p()
+
+            self.DLL.close()
 
     def __getstate__(self):
         return {"lib_path": self.DLL.lib_path}
@@ -434,6 +439,58 @@ def run(
             inputs, outputs, stream_ptr, sync, graph_mode, outputs_on_host=False
         )
 
+    def profile(
+        self,
+        inputs: Union[Dict[str, AITData], List[AITData]],
+        outputs: Union[Dict[str, AITData], List[AITData]],
+        num_iters: int,
+        filename: str,
+        stream_ptr: Optional[int] = None,
+    ) -> None:
+        if isinstance(inputs, dict):
+            inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
+        if isinstance(outputs, dict):
+            outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
+        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
+            inputs,
+            outputs,
+            stream_ptr,
+        )
+        self.DLL.AITemplateModelContainerProfile(
+            self.handle,
+            c_inputs,
+            ctypes.c_size_t(len(inputs)),
+            c_outputs,
+            ctypes.c_size_t(len(outputs)),
+            c_stream,
+            ctypes.c_size_t(num_iters),
+            ctypes.c_char_p(filename.encode("utf-8")),
+        )
+
+    def profile_with_tensors(
+        self,
+        inputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
+        outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
+        num_iters: int,
+        filename: str,
+        stream_ptr: Optional[int] = None,
+    ) -> None:
+        _check_tensors_contiguous_and_on_gpu(
+            inputs,
+            name="inputs",
+        )
+        _check_tensors_contiguous_and_on_gpu(
+            outputs,
+            name="outputs",
+        )
+        self.profile(
+            _convert_tensor_args(inputs),
+            _convert_tensor_args(outputs),
+            num_iters,
+            filename,
+            stream_ptr,
+        )
+
     def _interpret_tensors_as_shapes(
         self,
         outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index 7964cdc39..bc910e6b2 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -138,7 +138,11 @@ def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
         o_shape = [var._attrs["values"][-1] for var in output_shape]
         if o_shape[-1] > 128:
             self._attrs["workspace"] = 4 * np.prod(o_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index dc18ba16e..c236675b2 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -19,7 +19,8 @@
 from typing import Any, List
 
 from ....utils import shape_utils
-from ...base import IntVar, Operator, Tensor
+from ...base import IntVar, IntVarTensor, Operator, Tensor
+from ...dtype import normalize_dtype
 from ...op_registry import OP_REGISTRY
 from .epilogue import FuncEnum
 
@@ -62,7 +63,7 @@ def _infer_shapes(self, *args: Tensor) -> List[IntVar]:
             shape = tensor._attrs["shape"]
             if max_shape is None:
                 max_shape = list(shape)
-            broadcastable, max_shape = shape_utils.get_broadcast_max_shape(
+            broadcastable, new_max_shape = shape_utils.get_broadcast_max_shape(
                 max_shape, shape
             )
             if not broadcastable:
@@ -71,30 +72,51 @@ def _infer_shapes(self, *args: Tensor) -> List[IntVar]:
                         max_shape, shape
                     )
                 )
+            max_shape = new_max_shape
         return max_shape
 
     def __call__(self, *args: Tensor) -> Tensor:
         converted_args = []
+        common_dtype = None
+        assert len(args) > 0, "Elementwise ops must take at least one argument."
         for arg in args:
             if isinstance(arg, int) or isinstance(arg, float):
                 converted_args.append(Tensor(shape=[], value=arg))
+            elif isinstance(arg, IntVarTensor) and self._attrs["func"] == FuncEnum.SQRT:
+                assert len(arg._attrs["int_var"]._attrs["values"]) == 1
+                converted_args.append(
+                    Tensor(shape=[], value=arg._attrs["int_var"]._attrs["values"][0])
+                )
             elif isinstance(arg, Tensor):
                 converted_args.append(arg)
+                if common_dtype is None:
+                    common_dtype = normalize_dtype(arg.dtype())
+                elif normalize_dtype(arg.dtype()) != common_dtype:
+                    raise NotImplementedError(
+                        f"Type promotions are not supported; got dtype {arg.dtype()}, but expected {common_dtype}"
+                    )
+
             else:
                 raise RuntimeError(
                     f"Unsupported data type {arg} in elementwise {self}!"
                 )
 
+        if common_dtype is None:
+            # All inputs were constants. Just use fp16
+            common_dtype = "float16"
+        else:
+            # Infer dtype for constant nums
+            for arg in converted_args:
+                if arg.is_a_const_num():
+                    arg._attrs["dtype"] = common_dtype
+
         self._attrs["args"] = list(converted_args)
         self._attrs["inputs"] = [
             arg for arg in converted_args if not arg.is_a_const_num()
         ]
         self._set_depth()
-        # for some reason aten converters fail if uncommented
-        # we will need this for fp32 support
-        # dtype = self._attrs["args"][0]._attrs["dtype"]
         output_shape = self._infer_shapes(*converted_args)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=common_dtype)
         self._attrs["outputs"] = [output]
         return output
 
@@ -129,10 +151,6 @@ def __init__(self) -> None:
     def __call__(
         self, x: Tensor, min_value: Any = None, max_value: Any = None
     ) -> Tensor:
-        if isinstance(min_value, (int, float)):
-            min_value = Tensor(value=min_value, shape=[])
-        if isinstance(max_value, (int, float)):
-            max_value = Tensor(value=max_value, shape=[])
         if min_value is None and max_value is not None:
             return elementwise(FuncEnum.MIN)(
                 x,
diff --git a/python/aitemplate/compiler/ops/common/epilogue.py b/python/aitemplate/compiler/ops/common/epilogue.py
index cec01268b..7cac4fdee 100644
--- a/python/aitemplate/compiler/ops/common/epilogue.py
+++ b/python/aitemplate/compiler/ops/common/epilogue.py
@@ -61,3 +61,4 @@ class FuncEnum(Enum):
     GELU = 23
     FASTGELU = 24
     SOFTPLUS = 25
+    ELU = 26
diff --git a/python/aitemplate/compiler/ops/common/fused_elementwise.py b/python/aitemplate/compiler/ops/common/fused_elementwise.py
index 4ad1988e5..67bdf5abc 100644
--- a/python/aitemplate/compiler/ops/common/fused_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/fused_elementwise.py
@@ -15,7 +15,7 @@
 """
 Fused elementwise operator definition.
 """
-from typing import List
+from typing import List, Set
 
 from .... import backend
 from ....backend import registry
@@ -51,54 +51,15 @@ def _check_output_shape(self) -> None:
                     )
                 )
 
-    def _update_inputs_outputs(self) -> None:
-        ops = set(self._attrs["elementwise_ops"])
-        external_inputs = set()
-        external_outputs = set()
-        tmp_inputs = set()
-        tmp_outputs = set()
-
-        for op in ops:
-            for input_tensor in op._attrs["inputs"]:
-                tmp_inputs.add(input_tensor)
-                if (
-                    len(input_tensor._attrs["src_ops"]) == 0
-                    or len(set(input_tensor._attrs["src_ops"]) - ops) > 0
-                ) and (not input_tensor.is_a_const_num()):
-                    external_inputs.add(input_tensor)
-                assert op in input_tensor._attrs["dst_ops"]
-            for output_tensor in op._attrs["outputs"]:
-                tmp_outputs.add(output_tensor)
-                if (
-                    output_tensor._attrs["is_output"]
-                    or len(output_tensor._attrs["dst_ops"] - ops) > 0
-                ):
-                    external_outputs.add(output_tensor)
-                assert len(output_tensor._attrs["src_ops"]) == 1
-                assert list(output_tensor._attrs["src_ops"])[0] == op
-
-        assert (
-            external_inputs == tmp_inputs - tmp_outputs
-        ), "external_inputs: {} is not equal to tmp_inputs: {} - tmp_outputs: {}.".format(
-            external_inputs, tmp_inputs, tmp_outputs
-        )
-        assert (
-            len(tmp_outputs - tmp_inputs - external_outputs) == 0
-        ), "tmp_outputs: {} - tmp_inputs: {} - external_outputs: {} is not empty.".format(
-            tmp_outputs, tmp_inputs, external_outputs
-        )
-        assert (
-            len(external_outputs - tmp_outputs) == 0
-        ), "external_outputs: {} - tmp_outputs: {} is not empty.".format(
-            external_outputs, tmp_outputs
-        )
-
-        self._attrs["inputs"] = list(external_inputs)
+    def _update_inputs_outputs(
+        self, inputs: Set[Operator], outputs: Set[Operator]
+    ) -> None:
+        self._attrs["inputs"] = list(inputs)
         self._attrs["input_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["inputs"]
         ]
 
-        self._attrs["outputs"] = list(external_outputs)
+        self._attrs["outputs"] = list(outputs)
         self._attrs["output_accessors"] = [
             TensorAccessor(output_tensor) for output_tensor in self._attrs["outputs"]
         ]
@@ -109,12 +70,9 @@ def _update_inputs_outputs(self) -> None:
         self._attrs["original_inputs"] = list(self._attrs["inputs"])
         self._attrs["original_outputs"] = list(self._attrs["outputs"])
 
-        for tensor in tmp_inputs | tmp_outputs:
-            tensor._attrs["src_ops"] = tensor._attrs["src_ops"] - ops
-            tensor._attrs["dst_ops"] = tensor._attrs["dst_ops"] - ops
-        for tensor in external_inputs:
+        for tensor in inputs:
             tensor._attrs["dst_ops"].add(self)
-        for tensor in external_outputs:
+        for tensor in outputs:
             tensor._attrs["src_ops"].add(self)
 
     def _check_constant(self) -> None:
@@ -128,24 +86,33 @@ def _check_constant(self) -> None:
             f"Please use Python to calculate directly. Operator: {self}"
         )
 
-    def __init__(self, elementwise_ops: List[elementwise]) -> None:
+    def __init__(
+        self,
+        elementwise_ops: List[elementwise],
+        inputs: Set[Operator],
+        outputs: Set[Operator],
+    ) -> None:
         super().__init__()
 
         if len(elementwise_ops) == 0:
             raise RuntimeError(
                 "fused_elementwise argument elementwise_ops cannot be empty!"
             )
-
+        # It is required that elementwise_ops need to be topologically sorted.
         self._attrs["op"] = "fused_elementwise"
         self._attrs["elementwise_ops"] = elementwise_ops
         self._attrs["has_profiler"] = False
 
-        self._update_inputs_outputs()
+        self._update_inputs_outputs(inputs, outputs)
         self._set_depth()
         self._check_constant()
 
     def _get_op_attributes(self):
-        return {"elementwise_ops": self._attrs["elementwise_ops"]}
+        return {
+            "elementwise_ops": self._attrs["elementwise_ops"],
+            "inputs": self._attrs["inputs"],
+            "outputs": self._attrs["outputs"],
+        }
 
     def gen_function(self) -> str:
         target = backend.target.Target.current()
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index 8c6e76cac..4d40952f3 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -101,3 +101,7 @@ def fast_gelu(tensor: Any) -> Tensor:
 
 def softplus(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("SOFTPLUS")(tensor)
+
+
+def elu(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("ELU")(tensor)
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 5fe2e564c..f61394e43 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -115,6 +115,25 @@ def __init__(self):
         super().__init__()
         self._attrs["unknown_idx"] = -1
 
+    def make_output_shape_from_int_vars(
+        self,
+        shape: List[Any],
+    ) -> List[IntVar]:
+        output_shape = []
+        for dim in shape:
+            int_var = dim._attrs["int_var"]
+            assert (
+                int_var is not None
+            ), f"expected an int_var dimension, but got {int_var=} for {shape=}"
+            dim_values = list(int_var._attrs["values"])
+            if len(dim_values) == 1:
+                output_shape.append(IntImm(dim_values[0]))
+            else:
+                # dynamic dimension
+                dim_name = int_var._attrs["name"]
+                output_shape.append(IntVar(name=dim_name, values=dim_values))
+        return output_shape
+
     def make_output_shape(
         self,
         y_shape_values: List[Union[List[int], int]],
@@ -129,12 +148,11 @@ def make_output_shape(
             if len(values) == 1:
                 output_shape.append(IntImm(values[0]))
             else:
-                if not is_intvar_tensor:
-                    assert (
-                        self._attrs["unknown_idx"] == -1
-                    ), f"{self._attrs['op']} doesn't support multiple dynamic dims, "
-                    "got {idx} and {self._attrs['unknown_idx']}"
-                    self._attrs["unknown_idx"] = idx
+                assert (
+                    self._attrs["unknown_idx"] == -1
+                ), f"{self._attrs['op']} doesn't support multiple dynamic dims, "
+                "got {idx} and {self._attrs['unknown_idx']}"
+                self._attrs["unknown_idx"] = idx
                 output_shape.append(
                     dynamic_dim if dynamic_dim is not None else IntVar(values=values)
                 )
@@ -254,11 +272,7 @@ def unique(vector):
                 dynamic_dim=x_dynamic_dims[0] if reuse_dynamic_dim else None,
             )
         else:
-            new_shape_vals = [
-                shape._attrs["int_var"]._attrs["values"]
-                for shape in self._attrs["shape"]
-            ]
-            return self.make_output_shape(new_shape_vals, is_intvar_tensor=True)
+            return self.make_output_shape_from_int_vars(self._attrs["shape"])
 
     def __call__(self, x: Tensor, shape: List[Any]) -> Tensor:
         self._attrs["shape"] = shape
@@ -269,7 +283,9 @@ def __call__(self, x: Tensor, shape: List[Any]) -> Tensor:
                 self._attrs["inputs"].append(s)
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self}, is_view_of=x)
+        output = Tensor(
+            output_shape, src_ops={self}, is_view_of=x, dtype=x._attrs["dtype"]
+        )
         self._attrs["outputs"] = [output]
         return output
 
@@ -469,7 +485,9 @@ def __call__(self, x: Tensor) -> Tensor:
         self._attrs["inputs"] = [x]
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self}, is_view_of=x)
+        output = Tensor(
+            output_shape, src_ops={self}, is_view_of=x, dtype=x._attrs["dtype"]
+        )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index 5c8c8537c..f018ef0d8 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -65,7 +65,7 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor):
         self._attrs["inputs"] = [x, w, b]
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
index 6a38e3e4c..240ad61c6 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
@@ -67,7 +67,7 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor, r: Tensor):
         self._attrs["inputs"] = [x, w, b, r]
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 11c67cc9c..6265e57c7 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -16,6 +16,7 @@
 Base class for conv2d.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -28,13 +29,20 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import logger, shape_utils
+from ....utils import alignment, shape_utils
 from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
 from .cache_entry import ConvQueryEntry, ConvRecordEntry
+from .conv_common import (
+    filter_op_instances,
+    generate_profiler_sources,
+    get_profiler_filename,
+)
 
 # pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
 
 
+_LOGGER = logging.getLogger(__name__)
+
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{dtype}}NI = {{x_dim0}};
@@ -263,13 +271,10 @@ def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
         epilogue_dim = output_shape[-1]
         if not isinstance(epilogue_dim, IntImm):
             raise RuntimeError("Conv output last dimension must be static!")
-        shape = epilogue_dim._attrs["values"][0]
-        if shape % 8 == 0:
-            self._attrs["epilogue_alignment"] = 8
-        elif shape % 4 == 0:
-            self._attrs["epilogue_alignment"] = 4
-        elif shape % 2 == 0:
-            self._attrs["epilogue_alignment"] = 2
+        self._attrs["epilogue_alignment"] = alignment.find_max_alignment(
+            number=epilogue_dim._attrs["values"][0],
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
 
     def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         """Call conv2d with tensors x, w
@@ -291,7 +296,7 @@ def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
@@ -305,6 +310,66 @@ def _get_op_attributes(self) -> Dict[str, Any]:
 
         return attr
 
+    def _should_build_profiler(self) -> bool:
+        """
+        Check if we should build profilers. If we have a cached
+        entry for this conv instance, we update this conv op's
+        relevant attributes with the cached result and return False.
+        """
+        if self._has_dynamic_input_dims():
+            # If there are dynamic dims, we'll have to generate and build the
+            # profilers, as the binaries will be needed for dynamic profiling.
+            return True
+
+        target = backend.target.Target.current()
+        workloads = list(self._attrs["exec_path"].keys())
+
+        build_profiler = True
+        # Now, let's query if all of our workloads have cache entries. If that
+        # is the case, it is safely to skip generating and building profilers.
+        if not target.use_dummy_profiling_results():
+            tmp_key = next(iter(self._attrs["op_instance"].keys()))
+            tmp_op = self._attrs["op_instance"][tmp_key]
+            build_profiler = False
+            for wkl in workloads:
+                exec_entry_sha1 = sha1(wkl.encode("utf-8")).hexdigest()
+                split_k = (
+                    1 if self._attrs["split_k"] is None else self._attrs["split_k"]
+                )
+                query = ConvQueryEntry(
+                    dtype_a=tmp_op.A.element.value,
+                    dtype_b=tmp_op.B.element.value,
+                    dtype_c=tmp_op.C.element.value,
+                    dtype_acc=tmp_op.accumulator_type().value,
+                    major_a=tmp_op.A.layout.value,
+                    major_b=tmp_op.B.layout.value,
+                    major_c=tmp_op.C.layout.value,
+                    kh=self._attrs["KH"],
+                    kw=self._attrs["KW"],
+                    co=self._attrs["CO"],
+                    stride=self._attrs["stride"],
+                    pad=self._attrs["pad"],
+                    dilate=self._attrs["dilate"],
+                    op_type=self._attrs["op"],
+                    device=target._arch,
+                    epilogue=tmp_op.epilogue_functor.value,
+                    split_k=split_k,
+                    exec_entry_sha1=exec_entry_sha1,
+                )
+                cache_value = target.query_profile_cache("conv", query.__dict__)
+                if cache_value is not None and not target.force_profile():
+                    _LOGGER.info(
+                        f'Load profiling result for {self._attrs["name"]} '
+                        f"from cache: {cache_value}",
+                    )
+                    best_algo, workspace = cache_value
+                    self._attrs["exec_path"][wkl] = best_algo
+                    self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+                else:
+                    # cache miss - we will have to generate and build profilers
+                    build_profiler = True
+        return build_profiler
+
     def gen_profiler(
         self,
         workdir: str = None,
@@ -325,12 +390,22 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs)
-        func_key = "{target}.{op}.gen_profiler".format(
-            target=target.name(), op=self._attrs["op"]
-        )
-        func = registry.get(func_key)
-        return func(self._attrs, workdir, self.shape_eval_template)
+        func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
+
+        if self._should_build_profiler():
+            x_shapes = [
+                self._invert_exec_key(exec_key) for exec_key in self._attrs["exec_path"]
+            ]
+            self._attrs["op_instance"] = filter_op_instances(
+                func_attrs=self._attrs,
+                x_shapes=x_shapes,
+            )
+            return generate_profiler_sources(
+                func_attrs=self._attrs,
+                op_class="conv",
+                workdir=workdir,
+                shape_template=self.shape_eval_template,
+            )
 
     def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         exe_path = os.path.join(profiler_prefix, cfg)
@@ -353,12 +428,6 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         target = backend.target.Target.current()
-        # if in CI just choose minimal configs
-        # workspace is a hack just provides 102400 Byte
-        if target.use_dummy_profiling_results():
-            algo = target.select_minimal_algo(list(self._attrs["op_instance"].keys()))
-            logger.info(__name__, f"Select minimal algo {algo} for CI")
-            return (algo, 102400)
         # query cache
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
         tmp_op = self._attrs["op_instance"][tmp_key]
@@ -386,7 +455,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         )
         cache_value = target.query_profile_cache("conv", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.info(__name__, "Load profiling result from cache.")
+            _LOGGER.info("Load profiling result from cache.")
             return cache_value
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
@@ -397,20 +466,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 "To bypass, you need to make it available in the db table.",
             )
 
-        func_key = "{target}.{op}.filter".format(
-            target=target.name(), op=self._attrs["op"]
+        profiler_filename = get_profiler_filename(self._attrs, "conv")
+        runner = backend.profiler_runner.Runner(
+            devices, self._attrs["name"], timeout=180
         )
-        func = registry.get(func_key)
-        content = list(self._attrs["op_instance"].keys())
-        runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
         x_shape = self._invert_exec_key(exec_key)
-        for cfg in content:
-            if not func(cfg, self._attrs, x_shape):
-                continue
-            command = self._gen_profile_cmd(profiler_prefix, cfg, x_shape)
-            logger.info(__name__, "Running " + " ".join(command))
-            runner.push(cfg, command)
-
+        command = self._gen_profile_cmd(profiler_prefix, profiler_filename, x_shape)
+        runner.push(profiler_filename, command)
         runner.join()
         result = runner.pull()
         if len(result) == 0:
@@ -418,7 +480,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
         out = min(result, key=itemgetter(1))
-        best_algo = out[0]
+        best_algo = out[1].op_config
         workspace = out[1].workspace
         ## cache
         cache_record = ConvRecordEntry(
@@ -447,6 +509,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         Target.current().insert_profile_cache("conv", cache_record.__dict__)
         return (best_algo, workspace)
 
+    def _has_dynamic_input_dims(self):
+        for input_tensor in self._attrs["inputs"]:
+            for dim in input_tensor._attrs["shape"]:
+                if not isinstance(dim, IntImm):
+                    return True
+        return False
+
     def profile(
         self,
         workdir="./",
@@ -461,13 +530,7 @@ def profile(
         if target.use_dummy_profiling_results():
             return
 
-        has_dynamic = False
-        for input_tensor in self._attrs["inputs"]:
-            for dim in input_tensor._attrs["shape"]:
-                if not isinstance(dim, IntImm):
-                    has_dynamic = True
-                    break
-        if has_dynamic:
+        if self._has_dynamic_input_dims():
             if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
                 raise NotImplementedError(
                     "conv2d only supports HINTS dynamic profiling strategy for now! Current strategy: {}".format(
@@ -488,18 +551,28 @@ def _profile_static(self, workdir, devices):
                 target=target.name(), op=self._attrs["op"]
             )
             func = registry.get(func_key)
-            func(self._attrs)
+            func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            best_algo, workspace = self._profile_single_workload(
-                profiler_prefix, wkl, devices
-            )
-            self._attrs["exec_path"][wkl] = best_algo
-            self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+            target = backend.target.Target.current()
+            # if in CI just choose minimal configs
+            # workspace is a hack just provides 102400 Byte
+            if target.use_dummy_profiling_results():
+                algo = target.select_minimal_algo(
+                    list(self._attrs["op_instance"].keys())
+                )
+                _LOGGER.info(f"Select minimal algo {algo} for CI")
+                self._attrs["exec_path"][wkl] = algo
+                self._attrs["workspace"] = 102400
+            elif self._attrs["exec_path"][wkl] == "":
+                best_algo, workspace = self._profile_single_workload(
+                    profiler_prefix, wkl, devices
+                )
+                self._attrs["exec_path"][wkl] = best_algo
+                self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
 
     def _profile_dynamic_dim(self, workdir):
         """Profiles with dynamic shapes."""
@@ -509,9 +582,12 @@ def _profile_dynamic_dim(self, workdir):
         # extract dynamic dim from exec_path
         if len(self._attrs["exec_path"]) <= 1:
             return
+        if len(set(self._attrs["exec_path"].values())) <= 1:
+            # all exec paths point to the same algo
+            return
 
         def _extract_dynamic_dim(exec_keys):
-            logger.info(__name__, "ONLY SUPPORT DYNAMIC BATCH (dim0)!")
+            _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
             var_dims = [[], [], [], []]
             for key in exec_keys:
                 dims = self._invert_exec_key(key)
@@ -541,33 +617,37 @@ def _extract_dynamic_dim(exec_keys):
             while mid > lb and mid < ub:
                 mid = (lb + ub) // 2
                 mid_shape = [mid, dim1, dim2, dim3]
-                logger.info(
-                    __name__,
+                _LOGGER.info(
                     "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
                         lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
                     ),
                 )
 
-                mid_lb_algo_cmd = self._gen_profile_cmd(
-                    profiler_prefix, str(lb_algo), mid_shape
+                # run the profiler binary with all ops on the mid_shape
+                # and fetch the results only for the lb_algo and ub_algo
+                profiler_filename = get_profiler_filename(self._attrs, "conv")
+                profiler_cmd = self._gen_profile_cmd(
+                    profiler_prefix, profiler_filename, mid_shape
                 )
-                mid_ub_algo_cmd = self._gen_profile_cmd(
-                    profiler_prefix, str(ub_algo), mid_shape
+                runner.push(
+                    idx=profiler_filename,
+                    cmd=profiler_cmd,
+                    return_ops=[str(lb_algo), str(ub_algo)],
                 )
-                runner.push(0, mid_lb_algo_cmd)
-                runner.push(1, mid_ub_algo_cmd)
                 runner.join()
                 result = runner.pull()
-                assert len(result) >= 1
+                result_dict = {res.op_config: res for res in result[0][1]}
+
+                assert len(result_dict) >= 1
                 # if there is only one result, assume ub algo failed.
-                if len(result) == 1:
-                    assert result[0][0] == 0
+                if len(result_dict) == 1:
+                    assert str(ub_algo) not in result_dict
                     # last_lb = lb
                     lb = mid + 1
                 # if there are two result, compare to decide new lb/ub
                 else:
-                    lb_time = result[0][1]
-                    ub_time = result[1][1]
+                    lb_time = result_dict[str(lb_algo)].duration
+                    ub_time = result_dict[str(ub_algo)].duration
                     if lb_time < ub_time:
                         # lb algo can work with larger batch
                         # last_lb = lb
@@ -597,15 +677,15 @@ def _extract_dynamic_dim(exec_keys):
             #         runner.join()
             #         out = runner.pull()
             #         if len(out) == 0:
-            #             logger.info(self._attrs["name"], "Find specail case: batch=%d" % i)
+            #             _LOGGER.info("Find specail case: batch=%d" % i)
             #             algo = self._profile_single_workload(profiler_prefix, x_shape, [0])
             #             special_cases[self._gen_exec_key(x_shape)] = algo
 
-            # logger.info(self._attrs["name"],
+            # _LOGGER.info(
             #     "Searching for specail cases between [{lb}, {ub}]".format(lb=origin_lb,
             #         ub=last_mid))
             # _find_special_case(origin_lb, last_mid, lb_algo)
-            # logger.info(self._attrs["name"],
+            # _LOGGER.info(
             #     "Searching for specail cases between [{lb}, {ub}]".format(lb=last_mid + 1,
             #         ub=origin_ub))
             # _find_special_case(last_mid, origin_ub, ub_algo)
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 0015d4091..a3a6d00cf 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -17,6 +17,7 @@
 Base class for conv3d.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -29,13 +30,20 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import logger, shape_utils
+from ....utils import alignment, shape_utils
 from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
 from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
+from .conv_common import (
+    filter_op_instances,
+    generate_profiler_sources,
+    get_profiler_filename,
+)
 
 # pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
 
 
+_LOGGER = logging.getLogger(__name__)
+
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{dtype}}NI = {{x_dim0}};
@@ -213,9 +221,14 @@ def _gen_exec_key(self, shape: List[int]):
             x_dim4=shape[4],
         ).replace("\n", "")
 
-    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3):
+    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3, dim4):
         return self.exec_dyn_key_template.render(
-            x_dim0_lb=dim0_lb, x_dim0_ub=dim0_ub, x_dim1=dim1, x_dim2=dim2, x_dim3=dim3
+            x_dim0_lb=dim0_lb,
+            x_dim0_ub=dim0_ub,
+            x_dim1=dim1,
+            x_dim2=dim2,
+            x_dim3=dim3,
+            x_dim4=dim4,
         ).replace("\n", "")
 
     def _extract_exec_path(self, x: Tensor):
@@ -245,13 +258,10 @@ def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
         epilogue_dim = output_shape[-1]
         if not isinstance(epilogue_dim, IntImm):
             raise RuntimeError("Conv output last dimension must be static!")
-        shape = epilogue_dim._attrs["values"][0]
-        if shape % 8 == 0:
-            self._attrs["epilogue_alignment"] = 8
-        elif shape % 4 == 0:
-            self._attrs["epilogue_alignment"] = 4
-        elif shape % 2 == 0:
-            self._attrs["epilogue_alignment"] = 2
+        self._attrs["epilogue_alignment"] = alignment.find_max_alignment(
+            number=epilogue_dim._attrs["values"][0],
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
 
     def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         """Call conv3d with tensors x, w
@@ -273,7 +283,7 @@ def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
@@ -287,6 +297,73 @@ def _get_op_attributes(self) -> Dict[str, Any]:
 
         return attr
 
+    def _should_build_profiler(self) -> bool:
+        """
+        Check if we should build profilers. If we have a cached
+        entry for this gemm instance, we update this gemm op's
+        relevant attributes with the cached result and return False.
+        """
+        if self._has_dynamic_input_dims():
+            # If there are dynamic dims, we'll have to generate and build the
+            # profilers, as the binaries will be needed for dynamic profiling.
+            return True
+
+        target = backend.target.Target.current()
+        workloads = list(self._attrs["exec_path"].keys())
+
+        build_profiler = True
+        # Now, let's query if all of our workloads have cache entries. If that
+        # is the case, it is safely to skip generating and building profilers.
+        if not target.use_dummy_profiling_results():
+            tmp_key = next(iter(self._attrs["op_instance"].keys()))
+            tmp_op = self._attrs["op_instance"][tmp_key]
+            build_profiler = False
+            for wkl in workloads:
+                exec_entry_sha1 = sha1(wkl.encode("utf-8")).hexdigest()
+                split_k = (
+                    1 if self._attrs["split_k"] is None else self._attrs["split_k"]
+                )
+                query = Conv3dQueryEntry(
+                    dtype_a=tmp_op.A.element.value,
+                    dtype_b=tmp_op.B.element.value,
+                    dtype_c=tmp_op.C.element.value,
+                    dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+                    major_a=tmp_op.A.layout.value,
+                    major_b=tmp_op.B.layout.value,
+                    major_c=tmp_op.C.layout.value,
+                    kd=self._attrs["KD"],
+                    kh=self._attrs["KH"],
+                    kw=self._attrs["KW"],
+                    co=self._attrs["CO"],
+                    stride_d=self._attrs["stride"][0],
+                    stride_h=self._attrs["stride"][1],
+                    stride_w=self._attrs["stride"][2],
+                    pad_d=self._attrs["pad"][0],
+                    pad_h=self._attrs["pad"][1],
+                    pad_w=self._attrs["pad"][2],
+                    dilate_d=self._attrs["dilate"][0],
+                    dilate_h=self._attrs["dilate"][1],
+                    dilate_w=self._attrs["dilate"][2],
+                    op_type=self._attrs["op"],
+                    device=target._arch,
+                    epilogue=tmp_op.epilogue_functor.value,
+                    split_k=split_k,
+                    exec_entry_sha1=exec_entry_sha1,
+                )
+                cache_value = target.query_profile_cache("conv3d", query.__dict__)
+                if cache_value is not None and not target.force_profile():
+                    _LOGGER.info(
+                        f'Load profiling result for {self._attrs["name"]} '
+                        f"from cache: {cache_value}",
+                    )
+                    best_algo, workspace = cache_value
+                    self._attrs["exec_path"][wkl] = best_algo
+                    self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+                else:
+                    # cache miss - we will have to generate and build profilers
+                    build_profiler = True
+        return build_profiler
+
     def gen_profiler(
         self,
         workdir: str = None,
@@ -307,12 +384,22 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs)
-        func_key = "{target}.{op}.gen_profiler".format(
-            target=target.name(), op=self._attrs["op"]
-        )
-        func = registry.get(func_key)
-        return func(self._attrs, workdir, self.shape_eval_template)
+        func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
+
+        if self._should_build_profiler():
+            x_shapes = [
+                self._invert_exec_key(exec_key) for exec_key in self._attrs["exec_path"]
+            ]
+            self._attrs["op_instance"] = filter_op_instances(
+                func_attrs=self._attrs,
+                x_shapes=x_shapes,
+            )
+            return generate_profiler_sources(
+                func_attrs=self._attrs,
+                op_class="conv3d",
+                workdir=workdir,
+                shape_template=self.shape_eval_template,
+            )
 
     def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         exe_path = os.path.join(profiler_prefix, cfg)
@@ -343,12 +430,6 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         target = backend.target.Target.current()
-        # if in CI just choose minimal configs
-        # workspace is a hack just provides 102400 Byte
-        if target.use_dummy_profiling_results():
-            algo = target.select_minimal_algo(list(self._attrs["op_instance"].keys()))
-            logger.info(__name__, f"Select minimal algo {algo} for CI")
-            return (algo, 102400)
         # query cache
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
         tmp_op = self._attrs["op_instance"][tmp_key]
@@ -383,7 +464,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         )
         cache_value = target.query_profile_cache("conv3d", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.info(__name__, "Load profiling result from cache.")
+            _LOGGER.info("Load profiling result from cache.")
             return cache_value
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
@@ -394,18 +475,11 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 "To bypass, you need to make it available in the db table.",
             )
 
-        func_key = "{target}.{op}.filter".format(
-            target=target.name(), op=self._attrs["op"]
-        )
-        func = registry.get(func_key)
-        content = list(self._attrs["op_instance"].keys())
+        profiler_filename = get_profiler_filename(self._attrs, "conv3d")
         runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
         x_shape = self._invert_exec_key(exec_key)
-        for cfg in content:
-            if not func(cfg, self._attrs, x_shape):
-                continue
-            command = self._gen_profile_cmd(profiler_prefix, cfg, x_shape)
-            runner.push(cfg, command)
+        command = self._gen_profile_cmd(profiler_prefix, profiler_filename, x_shape)
+        runner.push(profiler_filename, command)
 
         runner.join()
         result = runner.pull()
@@ -414,7 +488,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
         out = min(result, key=itemgetter(1))
-        best_algo = out[0]
+        best_algo = out[1].op_config
         workspace = out[1].workspace
         ## cache
         cache_record = Conv3dRecordEntry(
@@ -450,6 +524,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         Target.current().insert_profile_cache("conv3d", cache_record.__dict__)
         return (best_algo, workspace)
 
+    def _has_dynamic_input_dims(self):
+        for input_tensor in self._attrs["inputs"]:
+            for dim in input_tensor._attrs["shape"]:
+                if not isinstance(dim, IntImm):
+                    return True
+        return False
+
     def profile(
         self,
         workdir="./",
@@ -460,13 +541,7 @@ def profile(
             devices = [0]
         self._profile_static(workdir, devices)
 
-        has_dynamic = False
-        for input_tensor in self._attrs["inputs"]:
-            for dim in input_tensor._attrs["shape"]:
-                if not isinstance(dim, IntImm):
-                    has_dynamic = True
-                    break
-        if has_dynamic:
+        if self._has_dynamic_input_dims():
             if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
                 raise NotImplementedError(
                     "conv3d only supports HINTS dynamic profiling strategy for now! Current strategy: {}".format(
@@ -487,18 +562,28 @@ def _profile_static(self, workdir, devices):
                 target=target.name(), op=self._attrs["op"]
             )
             func = registry.get(func_key)
-            func(self._attrs)
+            func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            best_algo, workspace = self._profile_single_workload(
-                profiler_prefix, wkl, devices
-            )
-            self._attrs["exec_path"][wkl] = best_algo
-            self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+            target = backend.target.Target.current()
+            # if in CI just choose minimal configs
+            # workspace is a hack just provides 102400 Byte
+            if target.use_dummy_profiling_results():
+                algo = target.select_minimal_algo(
+                    list(self._attrs["op_instance"].keys())
+                )
+                _LOGGER.info(f"Select minimal algo {algo} for CI")
+                self._attrs["exec_path"][wkl] = algo
+                self._attrs["workspace"] = 102400
+            elif self._attrs["exec_path"][wkl] == "":
+                best_algo, workspace = self._profile_single_workload(
+                    profiler_prefix, wkl, devices
+                )
+                self._attrs["exec_path"][wkl] = best_algo
+                self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
 
     def _profile_dynamic_dim(self, workdir):
         """Profiles with dynamic shapes."""
@@ -508,10 +593,13 @@ def _profile_dynamic_dim(self, workdir):
         # extract dynamic dim from exec_path
         if len(self._attrs["exec_path"]) <= 1:
             return
+        if len(set(self._attrs["exec_path"].values())) <= 1:
+            # all exec paths point to the same algo
+            return
 
         def _extract_dynamic_dim(exec_keys):
-            logger.info(__name__, "ONLY SUPPORT DYNAMIC BATCH (dim0)!")
-            var_dims = [[], [], [], []]
+            _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
+            var_dims = [[], [], [], [], []]
             for key in exec_keys:
                 dims = self._invert_exec_key(key)
                 for i, v in enumerate(dims):
@@ -522,6 +610,7 @@ def _extract_dynamic_dim(exec_keys):
         dim1 = dims[1][0]
         dim2 = dims[2][0]
         dim3 = dims[3][0]
+        dim4 = dims[4][0]
         algos = list(self._attrs["exec_path"].values())
         # generate region
         regions = []  # lb, ub, lb_algos, ub_algos
@@ -539,34 +628,38 @@ def _extract_dynamic_dim(exec_keys):
             last_mid = mid
             while mid > lb and mid < ub:
                 mid = (lb + ub) // 2
-                mid_shape = [mid, dim1, dim2, dim3]
-                logger.info(
-                    __name__,
+                mid_shape = [mid, dim1, dim2, dim3, dim4]
+                _LOGGER.info(
                     "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
                         lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
                     ),
                 )
 
-                mid_lb_algo_cmd = self._gen_profile_cmd(
-                    profiler_prefix, str(lb_algo), mid_shape
+                # run the profiler binary with all ops on the mid_shape
+                # and fetch the results only for the lb_algo and ub_algo
+                profiler_filename = get_profiler_filename(self._attrs, "conv3d")
+                profiler_cmd = self._gen_profile_cmd(
+                    profiler_prefix, profiler_filename, mid_shape
                 )
-                mid_ub_algo_cmd = self._gen_profile_cmd(
-                    profiler_prefix, str(ub_algo), mid_shape
+                runner.push(
+                    idx=profiler_filename,
+                    cmd=profiler_cmd,
+                    return_ops=[str(lb_algo), str(ub_algo)],
                 )
-                runner.push(0, mid_lb_algo_cmd)
-                runner.push(1, mid_ub_algo_cmd)
                 runner.join()
                 result = runner.pull()
-                assert len(result) >= 1
+                result_dict = {res.op_config: res for res in result[0][1]}
+
+                assert len(result_dict) >= 1
                 # if there is only one result, assume ub algo failed.
-                if len(result) == 1:
-                    assert result[0][0] == 0
+                if len(result_dict) == 1:
+                    assert str(ub_algo) not in result_dict
                     # last_lb = lb
                     lb = mid + 1
                 # if there are two result, compare to decide new lb/ub
                 else:
-                    lb_time = result[0][1]
-                    ub_time = result[1][1]
+                    lb_time = result_dict[str(lb_algo)].duration
+                    ub_time = result_dict[str(ub_algo)].duration
                     if lb_time < ub_time:
                         # lb algo can work with larger batch
                         # last_lb = lb
@@ -578,10 +671,10 @@ def _extract_dynamic_dim(exec_keys):
                 last_mid = mid
                 mid = (lb + ub) // 2
             lo_region_key = self._gen_dyn_exec_key(
-                origin_lb, last_mid, dim1, dim2, dim3
+                origin_lb, last_mid, dim1, dim2, dim3, dim4
             )
             up_region_key = self._gen_dyn_exec_key(
-                last_mid, origin_ub, dim1, dim2, dim3
+                last_mid, origin_ub, dim1, dim2, dim3, dim4
             )
             new_exec_paths[lo_region_key] = lb_algo
             new_exec_paths[up_region_key] = ub_algo
@@ -590,21 +683,21 @@ def _extract_dynamic_dim(exec_keys):
             # So far I find binary search works well.
             # def _find_special_case(lb, ub, algo):
             #     for i in range(lb + 1, ub + 1):
-            #         x_shape = [i, dim1, dim2, dim3]
+            #         x_shape = [i, dim1, dim2, dim3, dim4]
             #         cmd = self._gen_profile_cmd(profiler_prefix, str(algo), x_shape)
             #         runner.push(0, cmd)
             #         runner.join()
             #         out = runner.pull()
             #         if len(out) == 0:
-            #             logger.info(self._attrs["name"], "Find specail case: batch=%d" % i)
+            #             _LOGGER.info(Find specail case: batch=%d" % i)
             #             algo = self._profile_single_workload(profiler_prefix, x_shape, [0])
             #             special_cases[self._gen_exec_key(x_shape)] = algo
 
-            # logger.info(self._attrs["name"],
+            # _LOGGER.info(
             #     "Searching for specail cases between [{lb}, {ub}]".format(lb=origin_lb,
             #         ub=last_mid))
             # _find_special_case(origin_lb, last_mid, lb_algo)
-            # logger.info(self._attrs["name"],
+            # _LOGGER.info(
             #     "Searching for specail cases between [{lb}, {ub}]".format(lb=last_mid + 1,
             #         ub=origin_ub))
             # _find_special_case(last_mid, origin_ub, ub_algo)
diff --git a/python/aitemplate/compiler/ops/conv/conv_common.py b/python/aitemplate/compiler/ops/conv/conv_common.py
new file mode 100644
index 000000000..647d45408
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/conv_common.py
@@ -0,0 +1,87 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+from hashlib import sha1
+
+from .... import backend
+from ....backend import registry
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def get_profiler_filename(func_attrs, op_class):
+    """
+    Generate a filename for a profiler that benchmarks multiple instances.
+    """
+    target = backend.target.Target.current()
+
+    op_type = func_attrs["op"]
+    all_op_names = list(func_attrs["op_instance"].keys())
+    encoded_str = sha1((";".join(all_op_names)).encode("utf-8")).hexdigest()
+
+    if target.use_dummy_profiling_results():
+        # we don't use cache
+        return f"{op_type}_{encoded_str}"
+    else:
+        cache_ver = target.get_profile_cache_version(op_class)
+        return f"{op_type}_{encoded_str}_{cache_ver}"
+
+
+def filter_op_instances(func_attrs, x_shapes):
+    """
+    Filter out some of the func's op instances using the filter function.
+    """
+    target = backend.target.Target.current()
+    func_key = "{target}.{op}.filter".format(
+        target=target.name(),
+        op=func_attrs["op"],
+    )
+    filter_func = registry.get(func_key)
+
+    op_names_to_keep = set()
+    for x_shape in x_shapes:
+        for op_name in func_attrs["op_instance"]:
+            if filter_func(op_name, func_attrs, x_shape):
+                op_names_to_keep.add(op_name)
+
+    return {
+        op_name: op
+        for op_name, op in func_attrs["op_instance"].items()
+        if op_name in op_names_to_keep
+    }
+
+
+def generate_profiler_sources(func_attrs, op_class, workdir, shape_template):
+    """
+    Generate profiler sources for the func.
+    """
+    target = backend.target.Target.current()
+    func_key = "{target}.{op}.gen_profiler".format(
+        target=target.name(),
+        op=func_attrs["op"],
+    )
+    gen_profiler_func = registry.get(func_key)
+
+    profiler_filename = get_profiler_filename(func_attrs, op_class)
+    _LOGGER.info(f"generating {profiler_filename=}")
+
+    return gen_profiler_func(
+        func_attrs,
+        workdir,
+        profiler_filename,
+        shape_template,
+    )
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 4d37a5334..6d2737c01 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -267,7 +267,7 @@ def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
index 52175059f..dc1f557d5 100644
--- a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
@@ -80,7 +80,7 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor):
         self._attrs["inputs"] = [x, w, b]
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
index 23c8ab1fc..40a293238 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
@@ -15,8 +15,14 @@
 """
 Transposed conv2d op.
 """
+
+import itertools
+from typing import List
+
 import jinja2
 
+from ....utils import shape_utils
+from ...base import Tensor
 from .conv2d import conv2d
 
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
@@ -109,3 +115,54 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         self._attrs["op"] = "transposed_conv2d"
         self._attrs["epilogue"] = "LinearCombination"
         self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if x[3] != w[0] * self._attrs["group"]:
+            raise RuntimeError("X/W Shape mismatch for conv2d")
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            stride=self._attrs["stride"],
+            pad=self._attrs["pad"],
+            dilate=self._attrs["dilate"],
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+            w_dim0=w[3],  # for conv_transpose w = [c_in, kh, kw, c_out]
+            w_dim1=w[1],
+            w_dim2=w[2],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor, w: Tensor) -> List[int]:
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
+        self._attrs["CO"] = w_shape[3]
+        self._attrs["KH"] = w_shape[1]
+        self._attrs["KW"] = w_shape[2]
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape, w_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+        ]
+        return output_shape
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
index 7a1d1c801..13d44f128 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
@@ -99,7 +99,7 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor):
         self._attrs["inputs"] = [x, w, b]
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._extract_exec_path(x)
         self._extract_epilogue_alignment(output_shape)
         self._attrs["outputs"] = [output]
diff --git a/python/aitemplate/compiler/ops/embedding/bert_embeddings.py b/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
index 54da519bd..8fc501529 100644
--- a/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
+++ b/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
@@ -79,9 +79,10 @@ def __call__(
             "int64",
         ], f"Expected dtype int/int32/int64 for index, got dtype {dtype_input_ids}"
 
-        assert (
-            dtype_word_embeddings == "float16"
-        ), f"Expected float16 embeddings, but got {dtype_word_embeddings}"
+        assert dtype_word_embeddings in [
+            "float16",
+            "float32",
+        ], f"Expected dtype float16/float32 for embeddings, got dtype {dtype_word_embeddings}"
 
         # expecting all three ids to have the same shapes
         assert shape_utils.is_same_shape(input_ids.shape(), token_type_ids.shape()), (
@@ -123,7 +124,11 @@ def __call__(
         self._set_depth()
 
         output_shape = self._infer_shapes(input_ids, word_embeddings)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=word_embeddings._attrs["dtype"],
+        )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
index 74ea33f2c..265d4aebf 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 from .bmm_rcr_softmax import bmm_rcr_softmax
+from .dual_bmm_rrr_div import dual_bmm_rrr_div
 from .dual_gemm_rcr_fast_gelu import dual_gemm_rcr_fast_gelu
 from .dual_gemm_rcr_silu import dual_gemm_rcr_silu
 from .gemm_rcr_bias_softmax import gemm_rcr_bias_softmax
@@ -21,8 +22,9 @@
 
 __all__ = [
     "bmm_rcr_softmax",
+    "dual_bmm_rrr_div",
+    "dual_gemm_rcr_fast_gelu",
+    "dual_gemm_rcr_silu",
     "gemm_rcr_bias_softmax",
     "gemm_rcr_softmax",
-    "dual_gemm_rcr_silu",
-    "dual_gemm_rcr_fast_gelu",
 ]
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
index c3166b925..7eba552dd 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -39,7 +39,7 @@ def __init__(self):
         raise Exception("BMM + Softmax is disabled for now")
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
new file mode 100644
index 000000000..c22b15c83
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -0,0 +1,58 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Batch GEMM specialization: BMM_RRR(A, B0) / BMM_RRR(A, B1)
+"""
+from ...base import Tensor
+from ...tensor_accessor import TensorAccessor
+from ..gemm_universal.bmm_rrr import bmm_rrr
+
+# pylint: disable=C0103,W0223,W0221,W0613
+
+
+class dual_bmm_rrr_div(bmm_rrr):
+    """Batch GEMM specialization: BMM_RRR(A, B0) / BMM_RRR(A, B1)
+
+    This operator is equivalent to the following pytorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+        A = torch.randn(B, M, K)
+        B0 = torch.randn(B, K, N)
+        B1 = torch.randn(B, K, N)
+        D0 = torch.bmm(A, B0)
+        D1 = torch.bmm(A, B1)
+        D2 = D0 / D1
+
+    If the last dim of B1 is 1 (while the last dim of B0 isn't),
+    B1 is broadcasted to the same shape as B0 before computing
+    the right gemm A @ B1.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "dual_bmm_rrr_div"
+        self._attrs["epilogue2"] = "Div"
+
+    def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
+        output = super().__call__(a, b)
+        self._attrs["inputs"].append(bias)
+        self._attrs["input_accessors"] = [
+            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
+        ]
+        self._set_depth()
+        if b._attrs["shape"][-1] != 1 and bias._attrs["shape"][-1] == 1:
+            self._attrs["broadcast_b1"] = True
+        return output
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index e6e8c1d0e..11deca6ee 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -34,7 +34,7 @@ class dual_gemm_rcr_fast_gelu(gemm_rcr):
         B = torch.randn(N, K)
         Y1 = torch.nn.functional.linear(A, W)
         Y2 = torch.nn.functional.linear(A, B)
-        Y = torch.nn.functional.silu(Y_1) * Y_2
+        Y = torch.nn.functional.silu(Y1) * Y2
     """
 
     def __init__(self):
@@ -71,7 +71,13 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
+        if b._attrs["shape"][-2] != 1 and bias._attrs["shape"][-2] == 1:
+            self._attrs["broadcast_b1"] = True
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index d80a541e2..04bc02b38 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -34,7 +34,7 @@ class dual_gemm_rcr_silu(gemm_rcr):
         B = torch.randn(N, K)
         Y1 = torch.nn.functional.linear(A, W)
         Y2 = torch.nn.functional.linear(A, B)
-        Y = torch.nn.functional.silu(Y_1) * Y_2
+        Y = torch.nn.functional.silu(Y1) * Y2
     """
 
     def __init__(self):
@@ -71,7 +71,13 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
+        if b._attrs["shape"][-2] != 1 and bias._attrs["shape"][-2] == 1:
+            self._attrs["broadcast_b1"] = True
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
index 33a68ec0f..752cef8a0 100644
--- a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
@@ -83,7 +83,7 @@ def __call__(self, a: Tensor, b: Tensor, alpha: float = 1.0) -> Tensor:
         self._attrs["alpha"] = alpha
         self._set_depth()
         output_shape = self._infer_shapes(a, b)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["input_accessors"] = [TensorAccessor(a), TensorAccessor(b)]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
diff --git a/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
index 6fd8c2800..e71c2933c 100644
--- a/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
@@ -73,7 +73,7 @@ def __call__(self, a: Tensor, b: Tensor) -> List[Tensor]:
         self._attrs["inputs"] = [a, b]
         self._set_depth()
         output_shape = self._infer_shapes(a, b)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
index 3e28087f3..46ff36709 100644
--- a/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
@@ -103,7 +103,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._attrs["inputs"] = [a, b]
         self._set_depth()
         output_shape = self._infer_shapes(a, b)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         # self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/__init__.py b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
index b9e2f8e5c..7ce092d7d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
@@ -45,6 +45,7 @@
 from .gemm_rcr_bias_tanh import gemm_rcr_bias_tanh
 from .gemm_rcr_fast_gelu import gemm_rcr_fast_gelu
 from .gemm_rcr_permute import gemm_rcr_permute
+from .gemm_rcr_permute_elup1 import gemm_rcr_permute_elup1
 from .gemm_rrr import gemm_rrr
 from .gemm_rrr_bias import gemm_rrr_bias
 from .gemm_rrr_bias_permute import gemm_rrr_bias_permute
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
index 6cfbf89c3..0903664da 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
@@ -20,9 +20,53 @@
 
 from aitemplate.compiler.base import Tensor
 
+from ...base import IntImm
+from ...dtype import is_same_dtype
 from .gemm_common import gemm
 
 
+def is_valid_inputs(output_shapes, c_shapes):
+    """
+    Used by bmm_xxx_add ops to check whether elementwise ops
+    can be fused to the bmm op via epilogue fusion. So far,
+    only add ops are supported.
+    """
+    msg = ""
+    if output_shapes == c_shapes:
+        return True, msg
+
+    def _squeeze_leading_1s(shapes):
+        out = []
+        if len(shapes) == 0:
+            return out
+        i = 0
+        for shape in shapes:
+            if not isinstance(shape, IntImm):
+                break
+            if shape.value() != 1:
+                break
+            i = i + 1
+
+        out = shapes[i:]
+        if len(out) == 0:
+            out.append(shapes[-1])
+        return out
+
+    msg = (
+        f"C can't be broadcast to the bmm output."
+        f"Output shapes: {output_shapes}, C shapes: {c_shapes}"
+    )
+    bias_shapes = _squeeze_leading_1s(c_shapes)
+    if len(bias_shapes) >= len(output_shapes):
+        return False, msg
+
+    for o_shape, c_shape in zip(reversed(output_shapes), reversed(bias_shapes)):
+        if o_shape != c_shape:
+            return False, msg
+
+    return True, ""
+
+
 class bmm(gemm):
     """Base class for bmm."""
 
@@ -65,3 +109,9 @@ def _sanity_check(self, a: Tensor, b: Tensor):
             raise RuntimeError(
                 "bmm operand A and B both have 2 dimensions! Use gemm instead."
             )
+        if not is_same_dtype(a.dtype(), b.dtype()):
+            raise RuntimeError(
+                "gemm operand A and B should have the same data type! Current A: {atype}, B: {btype}.".format(
+                    atype=a.dtype(), btype=b.dtype()
+                )
+            )
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
index 45d6ee06d..57f206312 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._attrs["op"] = "bmm_ccr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, k)
+            return common.default_align_ab(m, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
index 5dce354fd..cf37ac68b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
@@ -20,12 +20,14 @@
 
 from ...base import Tensor
 from . import bmm_ccr
+from .bmm import is_valid_inputs
 
 # pylint: disable=C0103, W0223
 
 
 class bmm_ccr_add(bmm_ccr):
     """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
 
     This operator is equivalent to following PyTorch code:
 
@@ -48,6 +50,12 @@ def __init__(self):
         self._attrs["op"] = "bmm_ccr_add"
         self._attrs["has_d"] = True
 
+    @staticmethod
+    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
+        output_shapes = bmm_ccr()._infer_shapes(A, B)
+        c_shapes = C.shape()
+        return is_valid_inputs(output_shapes, c_shapes)
+
     def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
         """Call bmm_ccr_add with tensors a, b, c
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
index 35915f4a5..dc3a9ee12 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._attrs["op"] = "bmm_crr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, n)
+            return common.default_align_ab(m, n, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
index b03bb183b..c5697c2b0 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
@@ -20,12 +20,14 @@
 
 from ...base import Tensor
 from . import bmm_crr
+from .bmm import is_valid_inputs
 
 # pylint: disable=C0103, W0223
 
 
 class bmm_crr_add(bmm_crr):
     """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
 
     This operator is equivalent to the following PyTorch code:
 
@@ -48,6 +50,12 @@ def __init__(self):
         self._attrs["op"] = "bmm_crr_add"
         self._attrs["has_d"] = True
 
+    @staticmethod
+    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
+        output_shapes = bmm_crr()._infer_shapes(A, B)
+        c_shapes = C.shape()
+        return is_valid_inputs(output_shapes, c_shapes)
+
     def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
         """Call bmm_crr_add with tensors a, b, c
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
index 35c9e717c..5565eda9c 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._attrs["op"] = "bmm_rcr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
index 5bc4b4baa..51e3a480b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
@@ -88,7 +88,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
index 82c865d13..ae788c72f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
@@ -42,7 +42,7 @@ def __init__(self):
         self._attrs["op"] = "bmm_rrr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, n)
+            return common.default_align_ab(k, n, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
index 6e8b7ab28..a6b5dde8f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
@@ -20,12 +20,14 @@
 
 from ...base import Tensor
 from . import bmm_rrr
+from .bmm import is_valid_inputs
 
 # pylint: disable=C0103, W0223
 
 
 class bmm_rrr_add(bmm_rrr):
     """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
 
     This operator is equivalent to the following pytorch code:
 
@@ -44,6 +46,12 @@ def __init__(self):
         self._attrs["op"] = "bmm_rrr_add"
         self._attrs["has_d"] = True
 
+    @staticmethod
+    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
+        output_shapes = bmm_rrr()._infer_shapes(A, B)
+        c_shapes = C.shape()
+        return is_valid_inputs(output_shapes, c_shapes)
+
     def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
         """Call bmm_rrr_add with tensors a, b, c
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
index 5bba36489..c920dfb4f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
@@ -88,7 +88,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
index 5ec16955f..6872691ee 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
@@ -60,7 +60,7 @@ def __init__(self, scale=1.0):
         self._attrs["scale"] = scale
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
@@ -153,7 +153,7 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, b1)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
index 5beedb6c2..c9f00d27d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
@@ -70,7 +70,7 @@ def __init__(self, shape: Tuple[int], scale=1.0, causal=False, layout="0213"):
         self._attrs["layout"] = "Permute4DBMM_{}".format(layout)
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
@@ -168,7 +168,7 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, b1)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index d354af3bc..b8a6ebe19 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -16,6 +16,7 @@
 Common functions/classes for GEMM ops
 """
 import itertools
+import logging
 import math
 import os
 import re
@@ -32,15 +33,18 @@
 
 from .... import backend
 from ....backend import registry
-from ....utils import logger
-from ....utils.alignment import find_max_alignment
+from ....utils import alignment
 from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
+from ...dtype import is_same_dtype
 from ...tensor_accessor import TensorAccessor
 from .cache_entry import GemmQueryEntry, GemmRecordEntry
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def split_k_result_getter(result):
     return result[1].duration
 
@@ -153,15 +157,9 @@ def gemm_inverse_key_func(key):
     return [int(x) for x in tmp]
 
 
-def default_align_ab(a, b):
+def default_align_ab(a, b, dtype):
     ab = math.gcd(a, b)
-    if ab % 8 == 0:
-        return 8
-    if ab % 4 == 0:
-        return 4
-    if ab % 2 == 0:
-        return 2
-    return 1
+    return alignment.find_max_alignment(ab, dtype)
 
 
 def _to_list(elem):
@@ -211,7 +209,8 @@ def _extract_epilogue_alignment(
         else:
             shape = epilogue_dim._attrs["values"][0]
 
-        self._attrs["epilogue_alignment"] = find_max_alignment(shape)
+        dtype = self._attrs["inputs"][0].dtype()
+        self._attrs["epilogue_alignment"] = alignment.find_max_alignment(shape, dtype)
         return
 
     def _infer_shapes(self, a: Tensor, b: Tensor):
@@ -380,11 +379,13 @@ def _get_profiler_filename(self):
         generate a filename for a profiler that benchmarks multiple GEMM instances
         """
         target = backend.target.Target.current()
+
         op_type = self._attrs["op"]
         all_op_names = list(self._attrs["op_instance"].keys())
         encoded_str = sha1((";".join(all_op_names)).encode("utf-8")).hexdigest()
-        # we don't use cache
+
         if target.use_dummy_profiling_results():
+            # we don't use cache
             return f"{op_type}_{encoded_str}"
         else:
             cache_ver = target.get_profile_cache_version("gemm")
@@ -425,8 +426,7 @@ def _should_build_profiler(
                 )
                 cache_value = target.query_profile_cache("gemm", query.__dict__)
                 if cache_value is not None and not target.force_profile():
-                    logger.info(
-                        __name__,
+                    _LOGGER.info(
                         f'Load profiling result for {self._attrs["name"]} '
                         f"from cache: {cache_value}",
                     )
@@ -458,7 +458,7 @@ def gen_profiler(
             target=target.name(), op=self._attrs["op"]
         )
         func = registry.get(func_key)
-        func(self._attrs)
+        func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
         # init exec path
         self._extract_exec_path(dynamic_profiling_strategy)
@@ -489,8 +489,7 @@ def gen_profiler(
                 if filter_func(k, self._attrs, ab_alignments[0])
             }
         )
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             f"Filtered profiler kernels for {self._attrs['op']}: reduced the "
             f"number of generated kernels from {len(self._attrs['op_instance'])} "
             f"to {len(new_op_instance)}",
@@ -505,7 +504,7 @@ def gen_profiler(
             )
             func = registry.get(func_key)
             profiler_filename = self._get_profiler_filename()
-            logger.info(__name__, f"generating {profiler_filename=}")
+            _LOGGER.info(f"generating {profiler_filename=}")
             return func(
                 self._attrs,
                 workdir,
@@ -543,8 +542,7 @@ def _split_k_search_space(self, M, N, K):
         if low_range == 1:
             low_range += 1
         space += list(range(low_range, high_range, 2))
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             f"profiling split-k for gemm instance M={M}, N={N}, K={K} in {set(space)}",
         )
         return set(space)
@@ -561,10 +559,12 @@ def _get_ab_alignment(self, exec_key):
             # exec_key may contain batch dimension, which we don't care here
             m, n, k = gemm_inverse_key_func(exec_key)[-3:]
             ab_alignment = self._attrs["f_ab_alignment"](m, n, k)
-            # FIXME: for dtype != float16
-            if ab_alignment == 1:
+            if not alignment.valid_alignment(
+                ab_alignment, self._attrs["inputs"][0].dtype()
+            ):
                 raise RuntimeError(
-                    "A / B alignment == 1 is not supported! " f"m: {m}, n: {n}, k: {k}."
+                    f"A / B {ab_alignment=} is not valid! The last dimension of each input tensor needs to be divisible by 2."
+                    f"m: {m}, n: {n}, k: {k}."
                 )
         return ab_alignment
 
@@ -603,8 +603,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, profiler_runner):
         )
         cache_value = target.query_profile_cache("gemm", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.debug(
-                __name__,
+            _LOGGER.debug(
                 f'Load profiling result for {self._attrs["name"]} '
                 f"from cache: {cache_value}",
             )
@@ -667,13 +666,13 @@ def profile(
             target = backend.target.Target.current()
             # init candidate ops
             func_key = "{target}.{op}.config".format(
-                target=target.name(), op=self._attrs["op"]
+                target=target.name(),
+                op=self._attrs["op"],
             )
             func = registry.get(func_key)
-            func(self._attrs)
+            func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             target = backend.target.Target.current()
@@ -683,7 +682,7 @@ def profile(
                 algo = target.select_minimal_algo(
                     list(self._attrs["op_instance"].keys())
                 )
-                logger.info(__name__, f"Select minimal algo {algo} for CI")
+                _LOGGER.info(f"Select minimal algo {algo} for CI")
                 self._attrs["exec_path"][wkl].algo = algo
                 self._attrs["workspace"] = 102400
             elif self._attrs["exec_path"][wkl].algo != "":
@@ -741,6 +740,12 @@ def _sanity_check(self, a: Tensor, b: Tensor):
                     b_shapes
                 )
             )
+        if not is_same_dtype(a.dtype(), b.dtype()):
+            raise RuntimeError(
+                "gemm operand A and B should have the same data type! Current A: {atype}, B: {btype}.".format(
+                    atype=a.dtype(), btype=b.dtype()
+                )
+            )
 
     def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         """Call the gemm op.
@@ -765,7 +770,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
@@ -833,8 +838,7 @@ def postprocess_results(self):
             func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
             func_attrs["split_k"] = split_k
 
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 f"Profiler ({profiler_filename} {exec_key}) selected kernel: "
                 f"{best_algo=} {workspace=} {split_k=}",
             )
@@ -862,4 +866,4 @@ def postprocess_results(self):
             try:
                 target.insert_profile_cache("gemm", cache_record.__dict__)
             except Exception as e:
-                logger.warning(__name__, e)
+                _LOGGER.warning(e)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
index 42023d1dc..c84915fe5 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
@@ -40,7 +40,7 @@ def __init__(self):
         self._attrs["op"] = "gemm_rcr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
index 4372e19f4..85a777278 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
@@ -93,7 +93,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
index d55457f98..2faaf8234 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
@@ -27,7 +27,7 @@
 class gemm_rcr_bias_broadcast(gemm_rcr_bias):
     def __init__(self):
         super().__init__()
-        self._attrs["epilogue"] = "LinearCombinationResidualBlockV2"
+        self._attrs["epilogue"] = "LinearCombinationResidualBlock"
 
     @staticmethod
     def is_valid_inputs(*inputs):
@@ -68,7 +68,7 @@ def __call__(
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
index 97b199b66..dcb865d6b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
@@ -53,7 +53,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
index 75560e1ba..6a21c6e8a 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
@@ -56,7 +56,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
new file mode 100644
index 000000000..f99b54bcd
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
@@ -0,0 +1,28 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+A specialization of gemm_rcr_permute applying ELU + 1 as epilogue.
+"""
+
+from . import gemm_rcr_permute
+
+# pylint: disable=C0103,W0223,W0221,W0613
+
+
+class gemm_rcr_permute_elup1(gemm_rcr_permute):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._attrs["op"] = "gemm_rcr_permute_elup1"
+        self._attrs["epilogue"] = "LinearCombinationELUp1"
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
index cee26e810..7a9bd7062 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._attrs["op"] = "gemm_rrr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, n)
+            return common.default_align_ab(k, n, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
index aa5060afe..03f6242a6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
@@ -80,7 +80,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
index b1b75ee6d..8774c7f97 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
@@ -47,7 +47,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
index 24b9bc276..498b90ad9 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
@@ -46,7 +46,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
index 6777ab228..4d5e94d19 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
@@ -15,6 +15,7 @@
 """
 Grouped GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor]
 """
+import logging
 import re
 from collections import OrderedDict
 from typing import List
@@ -25,7 +26,6 @@
 
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import logger
 from ...base import ExecItem, Tensor
 from ...tensor_accessor import TensorAccessor
 from ..tensor import concatenate
@@ -34,6 +34,9 @@
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
+
+_LOGGER = logging.getLogger(__name__)
+
 SHAPE_EVAL_TEMPLATE = jinja2.Template(
     """
 {% for operand_dim in group_operand_dims %}
@@ -101,7 +104,7 @@ def __init__(self):
         self._attrs["int_state_flag"] = 0
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
@@ -281,14 +284,12 @@ def gen_profiler(
                 if filter_func(k, self._attrs, ab_alignments[0])
             }
         )
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             f"Filtered profiler kernels for {self._attrs['op']}: reduced the "
             f"number of generated kernels from {len(self._attrs['op_instance'])} "
             f"to {len(new_op_instance)}",
         )
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             f"Group_gemm profiler valid configs: {sorted(new_op_instance.keys())}",
         )
         self._attrs["op_instance"] = new_op_instance
@@ -299,7 +300,7 @@ def gen_profiler(
             )
             func = registry.get(func_key)
             profiler_filename = self._get_profiler_filename()
-            logger.info(__name__, f"generating {profiler_filename=}")
+            _LOGGER.info(f"generating {profiler_filename=}")
             return func(
                 self._attrs, workdir, profiler_filename, self.shape_eval_template
             )
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
index 0f8b3d7d4..5189f8d17 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
@@ -16,6 +16,7 @@
 GEMM Specialization: A.permute(0, 2, 1)[col] @ B[col]
 """
 
+from ....utils import alignment
 from ...base import _create_host_zero_tensor, IntImm, Tensor
 from ..tensor import concatenate
 from . import gemm_common as common
@@ -46,7 +47,7 @@ def __init__(self):
         self._attrs["op"] = "perm021fc_ccr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, k)
+            return common.default_align_ab(m, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
@@ -120,7 +121,7 @@ def _align_ab(self, a: Tensor, b: Tensor):
             )
         k = ak._attrs["values"][0]
 
-        if k % 2 != 0:
+        if not alignment.valid_alignment(k % 2, a.dtype()):
             pad_k = int((k // 8 + 1) * 8)
 
             pad_a = _create_host_zero_tensor(
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
index 378f8f33b..3d19f77fd 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
@@ -68,7 +68,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["outputs"]
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
index e48701330..5016174cb 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -61,7 +61,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
index ba4d52fd7..806e3d0eb 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
@@ -50,7 +50,7 @@ def __init__(self):
         self._attrs["op"] = "perm021fc_crc"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, n)
+            return common.default_align_ab(m, n, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
index 43b17fdac..749be1900 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
@@ -71,7 +71,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["outputs"]
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
index abd6d06a6..3a7d8dc9b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
@@ -43,7 +43,7 @@ def __init__(self):
         self._attrs["op"] = "perm102_bmm_rcr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k)
+            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
index 326291002..fb1969552 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
@@ -82,7 +82,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["outputs"]
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
index 2d57a75cb..d22913d65 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._attrs["op"] = "perm102_bmm_rrr"
 
         def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, n)
+            return common.default_align_ab(k, n, self._attrs["inputs"][0].dtype())
 
         self._attrs["f_ab_alignment"] = cal_align_ab
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
index 59d5fd4de..c8e64ff45 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
@@ -65,7 +65,7 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index d3114b545..6aa31a68f 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -15,6 +15,7 @@
 """
 Operator definition for groupnorm.
 """
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -29,13 +30,14 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import logger
 from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
 from ..softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
 
+_LOGGER = logging.getLogger(__name__)
+
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
 {{indent}}if ({{cond}}) {
@@ -130,7 +132,7 @@ def __call__(
         self._sanity_check(x, gamma, beta)
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
 
         batch_size = output_shape[0]._attrs["values"][-1]
         self._attrs["workspace"] = 8 * batch_size * self._attrs["num_groups"]
@@ -241,7 +243,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         )
         cache_value = target.query_profile_cache("normalization", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.info(__name__, "Load profiling result from cache.")
+            _LOGGER.info("Load profiling result from cache.")
             return cache_value
 
         content = list(self._attrs["op_instance"].keys())
@@ -314,8 +316,7 @@ def profile(
             func(self._attrs)
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             best_algo, workspace = self._profile_single_workload(
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
index 9092e8ea9..aee3458e5 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
@@ -152,7 +152,7 @@ def __call__(
         self._attrs["output_accessors"] = []
         for x in inputs:
             output_shape = self._infer_shapes(x)
-            output = Tensor(output_shape, src_ops={self})
+            output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
             self._attrs["outputs"].append(output)
             self._attrs["output_accessors"].append(TensorAccessor(output))
             self._attrs["input_accessors"].append(TensorAccessor(x))
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm.py b/python/aitemplate/compiler/ops/layernorm/layernorm.py
index d9f8f6364..5ab572d74 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm.py
@@ -15,6 +15,7 @@
 """
 Operator definition for layernorm.
 """
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -30,7 +31,6 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import logger
 from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
 from ...tensor_accessor import TensorAccessor
 from ..softmax.cache_entry import NormQueryEntry, NormRecordEntry
@@ -38,6 +38,8 @@
 # pylint: disable=C0103,W0221,W0102,W0223
 
 
+_LOGGER = logging.getLogger(__name__)
+
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
 {{indent}}if ({{cond}}) {
@@ -124,6 +126,13 @@ def _sanity_check(self, x, gamma, beta):
             )
         (x_shape, gamma_shape, beta_shape) = layernorm.get_input_shapes(x, gamma, beta)
 
+        expected_dtype = x.dtype()
+        for (param, name) in ((gamma, "gamma"), (beta, "beta")):
+            if param is not None and param.dtype() != expected_dtype:
+                raise NotImplementedError(
+                    f"Layernorm doesn't support type promotions; expected {expected_dtype} but got {name} with dtype {param.dtype()}"
+                )
+
         layernorm.check_shapes(x_shape, gamma_shape, beta_shape, normalized_shape)
 
     def _infer_shapes(self, x: Tensor):
@@ -164,7 +173,7 @@ def __call__(
         self._sanity_check(x, gamma, beta)
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
@@ -316,7 +325,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         )
         cache_value = target.query_profile_cache("normalization", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.info(__name__, "Load profiling result from cache.")
+            _LOGGER.info("Load profiling result from cache.")
             return cache_value
 
         content = list(self._attrs["op_instance"].keys())
@@ -389,8 +398,7 @@ def profile(
             func(self._attrs)
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             best_algo, workspace = self._profile_single_workload(
diff --git a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
index b673a774e..c1bbe897f 100644
--- a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
+++ b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
@@ -90,7 +90,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["inputs"] = [x]
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/padding/pad_last_dim.py b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
index b25267f4e..7a826b2f1 100644
--- a/python/aitemplate/compiler/ops/padding/pad_last_dim.py
+++ b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
@@ -74,7 +74,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._attrs["inputs"] = [x]
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/pool/pool2d.py b/python/aitemplate/compiler/ops/pool/pool2d.py
index 5fd2bfd78..f3164e38b 100644
--- a/python/aitemplate/compiler/ops/pool/pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/pool2d.py
@@ -162,7 +162,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/reduce/reduce_common.py b/python/aitemplate/compiler/ops/reduce/reduce_common.py
index 71dbec41c..33fa194e3 100644
--- a/python/aitemplate/compiler/ops/reduce/reduce_common.py
+++ b/python/aitemplate/compiler/ops/reduce/reduce_common.py
@@ -16,12 +16,13 @@
 Base operator definition for reduce-family ops.
 """
 import itertools
+import logging
 
 from typing import List
 
 from .... import backend
 from ....backend import registry
-from ....utils import logger, shape_utils
+from ....utils import shape_utils
 from ....utils.tensor_utils import wrap_dim
 from ...base import IntImm, IntVar, Operator, Tensor
 from ...dtype import get_dtype_size
@@ -30,6 +31,9 @@
 # pylint: disable=C0103,W0221
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class reduce_base(Operator):
     """The base class for reduce ops."""
 
@@ -249,11 +253,13 @@ def __call__(self, x: Tensor) -> Tensor:
         # Note that this is a temprary solution only for col-reduction reduce_sum
         # kernels that invoke cutlass's TensorReduction kernel. Once we have our
         # own implementation, we will remove the workaround.
-        if self._attrs["op"] == "reduce_sum" and (reduction_axes[0] != input_rank - 1):
+        if self._attrs["op"] == "reduce_sum" and (
+            self._attrs["reduction_axes"][0] != input_rank - 1
+        ):
             ws_size = self._compute_workspace_size(
-                x._attrs["shape"], reduction_axes[0], x.dtype()
+                x._attrs["shape"], self._attrs["reduction_axes"][0], x.dtype()
             )
-            logger.info(__name__, f'allocating {ws_size} for tensor {x._attrs["name"]}')
+            _LOGGER.info(f'allocating {ws_size} for tensor {x._attrs["name"]}')
             self._attrs["workspace"] = ws_size
         return output
 
diff --git a/python/aitemplate/compiler/ops/softmax/softmax.py b/python/aitemplate/compiler/ops/softmax/softmax.py
index d077e1325..42de07d67 100644
--- a/python/aitemplate/compiler/ops/softmax/softmax.py
+++ b/python/aitemplate/compiler/ops/softmax/softmax.py
@@ -15,6 +15,7 @@
 """
 Softmax op implementation
 """
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -30,11 +31,13 @@
 from ....backend import registry
 from ....backend.target import Target
 
-from ....utils import logger
 from ....utils.tensor_utils import wrap_dim
 from ...base import DynamicProfileStrategy, ExecItem, IntVar, Operator, Tensor
 from .cache_entry import NormQueryEntry, NormRecordEntry
 
+
+_LOGGER = logging.getLogger(__name__)
+
 EXEC_COND_TEMPLATE = jinja2.Template(
     """
 {{indent}}if ({{cond}}) {
@@ -203,7 +206,7 @@ def __call__(self, x: Tensor, dim: int = None) -> Tensor:
         self._attrs["dim"] = dim
         self._set_depth()
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
         self._attrs["outputs"] = [output]
         return output
 
@@ -258,7 +261,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         )
         cache_value = target.query_profile_cache("normalization", query.__dict__)
         if cache_value is not None and not target.force_profile():
-            logger.info(__name__, "Load profiling result from cache.")
+            _LOGGER.info("Load profiling result from cache.")
             return cache_value
 
         content = list(self._attrs["op_instance"].keys())
@@ -331,8 +334,7 @@ def profile(
             func(self._attrs)
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             best_algo, workspace = self._profile_single_workload(
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 61ae1c585..b3e32e846 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -26,6 +26,7 @@
 from .gather import gather
 from .permute import permute
 from .permute021 import permute021
+from .permute0213 import permute0213
 from .permute102 import permute102
 from .permute210 import permute210
 from .size import size
@@ -33,3 +34,4 @@
 from .slice_scatter import slice_scatter
 from .split import split
 from .topk import topk
+from .transpose import transpose
diff --git a/python/aitemplate/compiler/ops/tensor/argmax.py b/python/aitemplate/compiler/ops/tensor/argmax.py
index b7f3ad348..2cf26c11a 100644
--- a/python/aitemplate/compiler/ops/tensor/argmax.py
+++ b/python/aitemplate/compiler/ops/tensor/argmax.py
@@ -16,6 +16,7 @@
 Argmax.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -27,11 +28,14 @@
 
 from .... import backend
 from ....backend import registry
-from ....utils import logger, shape_utils
+from ....utils import shape_utils
 from ...base import Operator, Tensor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
+
+_LOGGER = logging.getLogger(__name__)
+
 EXEC_KEY_TEMPLATE = jinja2.Template(
     """
 instance_size == {{x_dim0}} &&  instance_num == {{x_dim1}}
@@ -153,7 +157,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         cmd.append(x_shape[0])
         cmd.append(x_shape[1])
         command = [str(x) for x in cmd]
-        logger.info(__name__, "profiling cmd: {}".format(command))
+        _LOGGER.info("profiling cmd: {}".format(command))
         return command
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
@@ -199,8 +203,7 @@ def profile(
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             workspace = self._profile_single_workload(profiler_prefix, wkl, devices)
diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index a895b2516..03664f494 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -94,7 +94,7 @@ def __call__(self, x: Tensor, indices: Tensor) -> Tensor:
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x, indices)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index a7669d969..db0922cbe 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -41,17 +41,24 @@ class concatenate(Operator):
 
     """
 
-    def __init__(self) -> None:
+    def __init__(self, fast_cat=True) -> None:
+        # TMP: note that fast_cat is a temporary flag to force backend to select
+        # the fast concat implementation. After we finish benchmark fast concat,
+        # we should remove this flag. Instead, we will rely on backend to dispatch
+        # to the appropriate implementation based on input shapes if the fast
+        # concat couldn't handle all cases. If the fast concat is complete, we
+        # can remove the old concat kernel.
         super().__init__()
         self._attrs["op"] = "concatenate"
         self._attrs["has_profiler"] = False
+        self._attrs["fast_cat"] = fast_cat
 
     def _unique(self, vector):
         return sorted(set(vector))
 
-    def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
-        """Infers shapes for concatenate."""
-
+    @staticmethod
+    def check_rank(inputs: List[Tensor], dim) -> bool:
+        """check if the rank is valid"""
         if len(inputs) < 1:
             raise RuntimeError("expected a list of Tensors")
         x = inputs[0]
@@ -60,18 +67,20 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             raise RuntimeError("expected a non-scalar tensor")
         if dim >= rank:
             raise RuntimeError(
-                "concat_dim ({dim}) expected to be less than rank ({rank})".format(
-                    dim=dim, rank=rank
-                )
+                f"concat_dim ({dim}) expected to be less than rank ({rank})"
             )
         for t in inputs:
             r = len(t._attrs["shape"])
             if r != rank:
                 raise RuntimeError(
-                    "tensors expected to have the same rank, got {} and {}".format(
-                        r, rank
-                    )
+                    f"tensors expected to have the same rank but got {rank=} "
+                    f'and {r=} for tensor {t._attrs["name"]}'
                 )
+
+    def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
+        """Infers shapes for concatenate."""
+        concatenate.check_rank(inputs, dim)
+
         input_shapes = [i._attrs["shape"] for i in inputs]
         output_shape = []
         input_shape_values = [
diff --git a/python/aitemplate/compiler/ops/tensor/gather.py b/python/aitemplate/compiler/ops/tensor/gather.py
index 867962a28..6a551892c 100644
--- a/python/aitemplate/compiler/ops/tensor/gather.py
+++ b/python/aitemplate/compiler/ops/tensor/gather.py
@@ -55,7 +55,11 @@ def __call__(self, x: Tensor, dim: int, index: Tensor) -> Tensor:
         self._set_depth()
 
         output_shape = index._attrs["shape"]
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=x._attrs["dtype"],
+        )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/tensor/permute.py b/python/aitemplate/compiler/ops/tensor/permute.py
index a992b0dce..3e7eadfce 100644
--- a/python/aitemplate/compiler/ops/tensor/permute.py
+++ b/python/aitemplate/compiler/ops/tensor/permute.py
@@ -20,8 +20,9 @@
 from .... import backend
 from ....backend import registry
 from ....utils.tensor_utils import wrap_dim
-from ...base import IntVar, Operator, Tensor
+from ...base import IntImm, IntVar, Operator, Tensor
 from .permute021 import permute021
+from .permute0213 import permute0213
 from .permute102 import permute102
 from .permute210 import permute210
 
@@ -49,6 +50,16 @@ def __call__(self, x: Tensor, dims: Sequence[int]) -> Tensor:
         for i, dim in enumerate(dims):
             dims[i] = wrap_dim(dim, x._rank())
 
+        sorted_dims = list(range(x._rank()))
+        assert (
+            sorted(dims) == sorted_dims
+        ), f"expected a permutation of {sorted_dims}, but got {dims}"
+
+        # "dims" is set here before possible dispatching to the
+        # static-shape permute kernels below to keep the call to
+        # ops.permute(..., dims) recoverable from the self._attrs
+        self._attrs["dims"] = dims
+
         if dims == [0, 2, 1]:
             return permute021()(x)
         if dims == [1, 0, 2]:
@@ -56,13 +67,35 @@ def __call__(self, x: Tensor, dims: Sequence[int]) -> Tensor:
         if dims == [2, 1, 0]:
             return permute210()(x)
 
-        self._attrs["dims"] = dims
+        if dims == [0, 2, 1, 3]:
+            second_dim = x.shape()[1]
+            if (isinstance(second_dim, IntImm) and second_dim.value() >= 24) or (
+                isinstance(second_dim, IntVar) and second_dim.lower_bound() >= 24
+            ):
+                # for (0, 2, 1, 3) dims, we dispatch to the permute0213 op
+                # when the second dim >= 24 due to a better performance
+                return permute0213()(x)
+
+        last_dim = x.shape()[-1]
+        if (
+            len(dims) > 3
+            and dims[:-2] + [dims[-1], dims[-2]] == sorted_dims
+            and (
+                (isinstance(last_dim, IntImm) and last_dim.value() >= 8)
+                or (isinstance(last_dim, IntVar) and last_dim.lower_bound() >= 8)
+            )
+        ):
+            # when swapping the last two dims and the last_dim >= 8, we
+            # dispatch to the permute021 op due to a better performance
+            return permute021()(x)
+
         self._attrs["inputs"] = [x]
         self._set_depth()
 
         output_shapes = self._infer_shapes(x)
         output = Tensor(output_shapes, src_ops={self})
         self._attrs["outputs"] = [output]
+        output._attrs["dtype"] = x.dtype()
 
         # TODO: support output TensorAccessor
         return output
diff --git a/python/aitemplate/compiler/ops/tensor/permute021.py b/python/aitemplate/compiler/ops/tensor/permute021.py
index 25b3eedec..e1d20f48f 100644
--- a/python/aitemplate/compiler/ops/tensor/permute021.py
+++ b/python/aitemplate/compiler/ops/tensor/permute021.py
@@ -17,43 +17,22 @@
 """
 from typing import List
 
-import jinja2
-
 from .... import backend
 from ....backend import registry
 from ...base import IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
-SHAPE_FUNC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{dtype}}X_DIM0 = {{x_dim0}};
-{{indent}}{{dtype}}X_DIM1 = {{x_dim1}};
-{{indent}}{{dtype}}X_DIM2 = {{x_dim2}};
-{{indent}}{{dtype}}Y_DIM0 = X_DIM0;
-{{indent}}{{dtype}}Y_DIM1 = X_DIM2;
-{{indent}}{{dtype}}Y_DIM2 = X_DIM1;
-"""
-)
-
-SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{y_dim0}} = Y_DIM0;
-{{indent}}{{y_dim1}} = Y_DIM1;
-{{indent}}{{y_dim2}} = Y_DIM2;
-"""
-)
-
 
 class permute021(Operator):
     """
-    Permutes the input tensor from (B, N, M) to (B, M, N).
+    Permutes the input tensor from (B1, B2, ..., Bn, N, M) to (B1, B2, ..., Bn, M, N).
 
     Args:
-        input (Tensor[B, N, M]): the source tensor with 3 dimensions
+        input (Tensor[B1, B2, ..., Bn, N, M]): the source tensor with 3 dimensions
 
     Returns:
-        output (Tensor[B, M, N]): the destination tensor
+        output (Tensor[B1, B2, ..., Bn, M, N]): the destination tensor
 
     Example:
 
@@ -72,20 +51,20 @@ class permute021(Operator):
     def __init__(self):
         super().__init__()
         self._attrs["op"] = "permute021"
-        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
-        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
 
     def _infer_shapes(self, x: Tensor) -> List[IntVar]:
         """Infers shapes for permute021."""
-
         x_shape = x._attrs["shape"]
-        return [x_shape[0], x_shape[2], x_shape[1]]
+        return x_shape[:-2] + [x_shape[-1], x_shape[-2]]
 
     def __call__(self, x: Tensor) -> Tensor:
+        assert len(x.shape()) > 2, "The input tensor must have at least 3 dimensions"
+
         self._attrs["inputs"] = [x]
         self._set_depth()
         output_shape = self._infer_shapes(x)
         output = Tensor(output_shape, src_ops={self})
+        output._attrs["dtype"] = x.dtype()
         self._attrs["outputs"] = [output]
         return output
 
@@ -99,6 +78,4 @@ def gen_function(self) -> str:
         return func(
             self._attrs,
             template_path,
-            self.shape_eval_template,
-            self.shape_save_template,
         )
diff --git a/python/aitemplate/compiler/ops/tensor/permute0213.py b/python/aitemplate/compiler/ops/tensor/permute0213.py
new file mode 100644
index 000000000..42aab5709
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/permute0213.py
@@ -0,0 +1,96 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Permute(0, 2, 1, 3) op.
+Change the dimensions dim1 and dim2 of input 4d tensor.
+"""
+from typing import List
+
+from aitemplate.backend import registry
+
+from .... import backend
+from ...base import IntVar, Operator, Tensor
+
+# pylint: disable=C0103,W0221
+
+
+class permute0213(Operator):
+    """
+    Permutes the input 4d tensor from (B, N, M, K) to (B, M, N, K).
+
+    Args:
+        input (Tensor[B, N, M, K]): the source tensor with 3 dimensions
+
+    Returns:
+        output (Tensor[B, M, N, K]): the destination tensor
+
+    Example:
+
+        .. highlight:: python
+        .. code-block:: python
+
+            X = Tensor(shape=[2, 384, 262, 10], name="X", is_input=True)
+            Y = ops.permute0213()(X)
+            y_shape = [d._attrs["values"][0] for d in Y.shape()]
+            print(y_shape)
+
+            Outs:
+            [2, 262, 384, 10]
+
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "permute0213"
+
+    def _infer_shapes(self, x: Tensor) -> List[IntVar]:
+        """Infers shapes for permute0213."""
+
+        x_shape = x._attrs["shape"]
+        return [x_shape[0], x_shape[2], x_shape[1], x_shape[3]]
+
+    def __call__(self, x: Tensor) -> List[Tensor]:
+        """
+        Parameters
+        ----------
+        x : Tensor
+
+        Returns
+        -------
+        Tensor
+            Generate output tensors of function calls.
+            In permute0213, its a 4d tensor with d0,d2,d1,d3 of
+            input Tensor.
+        """
+        self._attrs["inputs"] = [x]
+        self._set_depth()
+        output_shape = self._infer_shapes(x)
+        output = Tensor(output_shape, src_ops={self})
+        output._attrs["dtype"] = x.dtype()
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        """Generate function body."""
+        target = backend.target.Target.current()
+        template_path = target.template_path()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
+            template_path,
+        )
diff --git a/python/aitemplate/compiler/ops/tensor/permute102.py b/python/aitemplate/compiler/ops/tensor/permute102.py
index 37e6c3880..c678210b8 100644
--- a/python/aitemplate/compiler/ops/tensor/permute102.py
+++ b/python/aitemplate/compiler/ops/tensor/permute102.py
@@ -18,8 +18,6 @@
 """
 from typing import List
 
-import jinja2
-
 from aitemplate.backend import registry
 
 from .... import backend
@@ -27,25 +25,6 @@
 
 # pylint: disable=C0103,W0221
 
-SHAPE_FUNC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{dtype}}X_DIM0 = {{x_dim0}};
-{{indent}}{{dtype}}X_DIM1 = {{x_dim1}};
-{{indent}}{{dtype}}X_DIM2 = {{x_dim2}};
-{{indent}}{{dtype}}Y_DIM0 = X_DIM1;
-{{indent}}{{dtype}}Y_DIM1 = X_DIM0;
-{{indent}}{{dtype}}Y_DIM2 = X_DIM2;
-"""
-)
-
-SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{y_dim0}} = Y_DIM0;
-{{indent}}{{y_dim1}} = Y_DIM1;
-{{indent}}{{y_dim2}} = Y_DIM2;
-"""
-)
-
 
 class permute102(Operator):
     """
@@ -75,30 +54,6 @@ class permute102(Operator):
     def __init__(self):
         super().__init__()
         self._attrs["op"] = "permute102"
-        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
-        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
-
-    def _infer_shape(self, x: List[int]):
-        """
-        Parameters
-        ----------
-        x : List[int]
-
-        Returns
-        -------
-        List[int]
-            Deduce output dimension based on SHAPE_ASSIGNMENT_TEMPLATE.
-        """
-        eval_func = self.shape_eval_template.render(
-            indent="",
-            dtype="",
-            x_dim0=x[0],
-            x_dim1=x[1],
-            x_dim2=x[2],
-        )
-        output = {}
-        exec(eval_func, output)  # noqa: P204
-        return [int(output["Y_DIM0"]), int(output["Y_DIM1"]), int(output["Y_DIM2"])]
 
     def _infer_shapes(self, x: Tensor) -> List[IntVar]:
         """Infers shapes for permute021."""
@@ -123,6 +78,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._set_depth()
         output_shape = self._infer_shapes(x)
         output = Tensor(output_shape, src_ops={self})
+        output._attrs["dtype"] = x.dtype()
         self._attrs["outputs"] = [output]
         return output
 
@@ -137,6 +93,4 @@ def gen_function(self) -> str:
         return func(
             self._attrs,
             template_path,
-            self.shape_eval_template,
-            self.shape_save_template,
         )
diff --git a/python/aitemplate/compiler/ops/tensor/permute210.py b/python/aitemplate/compiler/ops/tensor/permute210.py
index a815adce6..3cba6d811 100644
--- a/python/aitemplate/compiler/ops/tensor/permute210.py
+++ b/python/aitemplate/compiler/ops/tensor/permute210.py
@@ -91,6 +91,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._set_depth()
         output_shape = self._infer_shapes(x)
         output = Tensor(output_shape, src_ops={self})
+        output._attrs["dtype"] = x.dtype()
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/tensor/split.py b/python/aitemplate/compiler/ops/tensor/split.py
index 56fbfeeeb..7b60d36f4 100644
--- a/python/aitemplate/compiler/ops/tensor/split.py
+++ b/python/aitemplate/compiler/ops/tensor/split.py
@@ -16,7 +16,7 @@
 Split.
 """
 import itertools
-from typing import List
+from typing import List, Sequence, Union
 
 from .... import backend
 from ....backend import registry
@@ -146,6 +146,9 @@ def __call__(self, x: Tensor, split_size_or_sections, dim=0) -> List[Tensor]:
             for output_shape in output_shapes
         ]
         self._attrs["outputs"] = outputs
+        self._attrs["original_outputs"] = list(outputs)
+        # True means the corresponding output tensor will be materialized by backend.
+        self._attrs["output_masks"] = [True] * len(outputs)
         # torch returns a tuple, so do we
         return tuple(outputs)
 
@@ -158,6 +161,64 @@ def gen_function(self) -> str:
         func = self._get_func("{target}.{op}.gen_function")
         return func(self._attrs)
 
+    def remove_output_at(self, indices: Union[int, Sequence[int]]) -> None:
+        """
+        This function removes the outputs in indices from the "outputs" attribute
+        and sets output_masks[indices] to be False. Note that the indices are based
+        on the current "outputs".
+
+        Parameters
+        ----------
+        indices : Union[int, Sequence[int]]
+            the index of an output or indices of multiple outputs based on the current "outputs"
+
+        Returns
+        -------
+        None
+        """
+        if isinstance(indices, int):
+            indices = [indices]
+        else:
+            indices = list(indices)
+
+        curr_outputs = self._attrs["outputs"]
+        num_curr_outputs = len(curr_outputs)
+
+        assert (
+            len(indices) <= num_curr_outputs
+        ), f"Expected len(indices) <= num_curr_outputs, but got {len(indices)} and {num_curr_outputs}"
+
+        num_original_outputs = len(self._attrs["original_outputs"])
+        num_output_masks = len(self._attrs["output_masks"])
+        assert num_original_outputs == num_output_masks, (
+            f"original_outputs and output_masks must have the same length, "
+            f"but got {num_original_outputs} and {num_output_masks}"
+        )
+
+        curr_idx = 0  # index into curr_outputs
+        idx = 0  # index into indices
+        new_outputs = []
+        # we need to skip those indices where output_masks have been modified.
+        for orig_idx in range(num_original_outputs):
+            if not self._attrs["output_masks"][orig_idx]:
+                continue
+            if idx < len(indices) and curr_idx == indices[idx]:
+                if not self._attrs["output_masks"][orig_idx]:
+                    raise RuntimeError(
+                        f'Expected input_masks at {idx} to be True for {self._attrs["name"]}'
+                    )
+                self._attrs["output_masks"][orig_idx] = False
+                idx += 1
+            else:
+                new_outputs.append(curr_outputs[curr_idx])
+            curr_idx += 1
+        num_new_outputs = len(new_outputs)
+        assert num_new_outputs + len(indices) == num_curr_outputs, (
+            f"Expected num_new_outputs + len(indices) == num_curr_outputs, "
+            f"but got {num_new_outputs + len(indices)} and {num_curr_outputs}"
+        )
+        self._attrs["outputs"] = new_outputs
+
     def _inputs_for_pseudo_code(self):
         return self._attrs["inputs"] + [
             f"split_sizes={str(self._attrs['split_sizes'])}]",
diff --git a/python/aitemplate/compiler/ops/tensor/topk.py b/python/aitemplate/compiler/ops/tensor/topk.py
index 252e47507..871f03d5d 100644
--- a/python/aitemplate/compiler/ops/tensor/topk.py
+++ b/python/aitemplate/compiler/ops/tensor/topk.py
@@ -16,6 +16,7 @@
 Topk.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -27,11 +28,13 @@
 
 from .... import backend
 from ....backend import registry
-from ....utils import logger
 from ...base import IntImm, IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
+
+_LOGGER = logging.getLogger(__name__)
+
 EXEC_KEY_TEMPLATE = jinja2.Template(
     """
 elem_cnt == {{x_dim0}} &&  instance_size == {{x_dim1}} &&  instance_num == {{x_dim2}}
@@ -149,7 +152,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         cmd.append(x_shape[1])
         cmd.append(x_shape[2])
         command = [str(x) for x in cmd]
-        logger.info(__name__, "profiling cmd: {}".format(command))
+        _LOGGER.info("profiling cmd: {}".format(command))
         return command
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
@@ -196,8 +199,7 @@ def profile(
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             workspace = self._profile_single_workload(profiler_prefix, wkl, devices)
diff --git a/python/aitemplate/compiler/ops/tensor/transpose.py b/python/aitemplate/compiler/ops/tensor/transpose.py
new file mode 100644
index 000000000..2154105a4
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/transpose.py
@@ -0,0 +1,34 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+transpose op
+"""
+
+from ...base import Tensor
+from .permute import permute
+
+
+class transpose(permute):
+    """
+    Returns a tensor with its two dimensions transposed.
+    This returned tensor is not a view. Dims can be negative.
+    """
+
+    def __call__(self, x: Tensor, dim0: int, dim1: int) -> Tensor:
+        dims = list(range(x._rank()))
+        dims[dim0] = dim1
+        dims[dim1] = dim0
+
+        return super().__call__(x, dims)
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py b/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
index 16632d4fe..b203c0050 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
@@ -51,6 +51,6 @@ def __call__(self, x: Tensor, r: Tensor) -> List[Tensor]:
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index 5f0a83344..aff1b36a8 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -152,7 +152,7 @@ def __call__(self, x: Tensor) -> List[Tensor]:
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
index 0b57d0502..5b124d081 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
@@ -16,6 +16,7 @@
 Efficient nms.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -26,12 +27,14 @@
 
 from ..... import backend
 from .....backend import registry
-from .....utils import logger, shape_utils
+from .....utils import shape_utils
 from ....base import IntImm, Operator, Tensor
 
-
 # pylint: disable=C0103,W0221,W0102,W0223
 
+
+_LOGGER = logging.getLogger(__name__)
+
 # TODO: change to column last
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
@@ -140,11 +143,20 @@ def __call__(self, boxes: Tensor, scores: Tensor) -> Tensor:
         self._extract_exec_path(boxes)
         output_shape = self._infer_shapes(boxes, scores)
 
+        x = boxes
         num_detections = Tensor(
             [output_shape[0], IntImm(1)], dtype="int64", src_ops={self}
         )
-        detection_boxes = Tensor(output_shape, src_ops={self})
-        detection_scores = Tensor(output_shape[:-1], src_ops={self})
+        detection_boxes = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=x._attrs["dtype"],
+        )
+        detection_scores = Tensor(
+            output_shape[:-1],
+            src_ops={self},
+            dtype=x._attrs["dtype"],
+        )
         detection_classes = Tensor(output_shape[:-1], dtype="int64", src_ops={self})
         output = (num_detections, detection_boxes, detection_scores, detection_classes)
         self._attrs["outputs"] = [
@@ -211,7 +223,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         cmd.append(x_shape[1] * x_shape[2])
         cmd.append(x_shape[2])
         command = [str(x) for x in cmd]
-        logger.info(__name__, "profiling cmd: {}".format(command))
+        _LOGGER.info("profiling cmd: {}".format(command))
         return command
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
@@ -246,8 +258,7 @@ def profile(
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             workspace = self._profile_single_workload(profiler_prefix, wkl, devices)
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
index 45dabb290..bc1769e4c 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
@@ -16,6 +16,7 @@
 Nms.
 """
 import itertools
+import logging
 import os
 import re
 from collections import OrderedDict
@@ -26,11 +27,14 @@
 
 from ..... import backend
 from .....backend import registry
-from .....utils import logger, shape_utils
+from .....utils import shape_utils
 from ....base import Operator, Tensor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
+
+_LOGGER = logging.getLogger(__name__)
+
 # TODO: change to column last
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
@@ -136,7 +140,7 @@ def __call__(self, x: Tensor, scores: Tensor) -> Tensor:
         self._set_depth()
         output_shape = self._infer_shapes(x, scores)
         self._extract_exec_path(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
@@ -194,7 +198,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         cmd.append(x_shape[0])
         cmd.append(x_shape[1])
         command = [str(x) for x in cmd]
-        logger.info(__name__, "profiling cmd: {}".format(command))
+        _LOGGER.info("profiling cmd: {}".format(command))
         return command
 
     def _profile_single_workload(self, profiler_prefix, exec_key, devices):
@@ -230,8 +234,7 @@ def profile(
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
 
         for wkl in workloads:
-            logger.info(
-                __name__,
+            _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
             workspace = self._profile_single_workload(profiler_prefix, wkl, devices)
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
index 190c06207..bd3fc7093 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
@@ -114,7 +114,7 @@ def __call__(
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
index a962e20ad..076ee9235 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
@@ -194,7 +194,7 @@ def __call__(self, x: Tensor, rois: Tensor) -> List[Tensor]:
         self._set_depth()
         self._extract_exec_path(x)
         output_shape = self._infer_shapes(x)
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index b6276ed38..518cd8df7 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -28,7 +28,8 @@
 
 from .base import IntImm, Tensor
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class TensorAccessor(object):
@@ -45,6 +46,8 @@ def __init__(self, original_tensor: Tensor) -> None:
         # Tensor offset in terms of number of elements compared to the base tensor.
         self.offset = 0
         self.original_shapes = original_tensor._attrs["shape"]
+        # We need dtype for computing alignment requirement
+        self.tensor_dtype = original_tensor.dtype()
         # This strictly means that the tensor's memory itself is contiguous
         self.is_contiguous = True
 
@@ -204,7 +207,7 @@ def _try_gen_dim_mapping(self):
             or original_idx != len(original_shapes)
             or actual_idx != len(actual_shapes)
         ):
-            logger.debug(f"tail processing failed, dim_mapping: {dim_mapping}")
+            _LOGGER.debug(f"tail processing failed, dim_mapping: {dim_mapping}")
             return
 
         # Remove the last dummy group.
@@ -212,7 +215,7 @@ def _try_gen_dim_mapping(self):
 
         # Assign new dim_mapping to self._dim_mapping.
         self._dim_mapping = dim_mapping
-        logger.debug(f"generate dim_mapping: {dim_mapping}")
+        _LOGGER.debug(f"generate dim_mapping: {dim_mapping}")
 
     def try_get_stride_strs(
         self, dim: int, dim_names: List[str] = None
@@ -255,7 +258,7 @@ def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
         if self._dim_mapping is None:
             # self._dim_mapping cannot be generated successfully.
             # Return None to represent an error.
-            logger.debug("Failed to get dim mapping.")
+            _LOGGER.debug("Failed to get dim mapping.")
             return None
 
         # Loop through self._dim_mapping to generate stride_strs.
@@ -273,7 +276,7 @@ def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
                         # need to make sure that dim is the last dim
                         # inside the original group.
                         # Otherwise, we cannot compute strides.
-                        logger.debug(
+                        _LOGGER.debug(
                             "Multiple dims in stride_dim group. "
                             f"dim_mapping: {self._dim_mapping}, "
                             f"dim: {dim}, stride_dim: {self.stride_dim}, self: {self}"
@@ -287,7 +290,7 @@ def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
             else:
                 if self.stride_dim in actual_group:
                     if actual_group.index(self.stride_dim) != 0:
-                        logger.debug(
+                        _LOGGER.debug(
                             f"Stride dim {self.stride_dim} is not the first dim "
                             f"of the underlying group {actual_group}."
                         )
@@ -298,7 +301,7 @@ def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
                         _get_value_or_names(self.original_shapes, original_group)
                     )
 
-        logger.debug(
+        _LOGGER.debug(
             f"dim: {dim}, stride_dim: {self.stride_dim}, "
             f"mapping: {self._dim_mapping}, stride_strs: {res}, "
             f"original: {self.original_shapes}, actual: {self.actual_shapes}"
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index b336aac0c..ca9bf77e4 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -33,6 +33,7 @@
 from .remove_no_ops import remove_no_ops
 from .remove_unused_ops import remove_unused_ops
 from .split_large_concat_ops import split_large_concat_ops
+from .split_large_split_ops import split_large_split_ops
 from .toposort import toposort
 from .transform_memory_ops import transform_memory_ops
 from .transform_odd_alignment import transform_odd_alignment
diff --git a/python/aitemplate/compiler/transform/apply_padding.py b/python/aitemplate/compiler/transform/apply_padding.py
index 5041d889e..423e0980c 100644
--- a/python/aitemplate/compiler/transform/apply_padding.py
+++ b/python/aitemplate/compiler/transform/apply_padding.py
@@ -15,17 +15,21 @@
 """
 Applies paddings to gemms based on alignment requirements.
 """
+import logging
 from typing import Callable, Dict, List
 
 from aitemplate.compiler.base import _create_host_zero_tensor
 
-from ...utils import logger
+from ...utils import alignment
 from .. import ops
 from ..base import IntImm, Operator, Tensor
 from ..ops.gemm_universal.gemm_common import DimInfo, gemm, Source
 from . import transform_utils
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def _extract_mnk_name(
     dim_info_dict: Dict[str, DimInfo], source: Source, tensor_idx: int, dim_idx: int
 ) -> str:
@@ -36,8 +40,8 @@ def _extract_mnk_name(
     return None
 
 
-def _get_padding_length(original_length: int) -> int:
-    if original_length % 2 == 0:
+def get_padding_length(original_length: int, dtype: str) -> int:
+    if alignment.valid_alignment(original_length, dtype):
         return 0
 
     # TODO(yingz): Tune padding strategy.
@@ -83,8 +87,7 @@ def _pad_input_tensor(
         tensor_list.append(padding_tensor)
         tensor_list.append(padded_tensor)
 
-        logger.debug(
-            __name__,
+        _LOGGER.debug(
             "**** Apply padding ****, replace input tensor \n {} \n with \n {} \n".format(
                 original_tensor_debug_str, padded_tensor
             ),
@@ -142,6 +145,7 @@ def apply_padding(sorted_graph: List[Tensor], workdir: str = None) -> List[Tenso
                 or isinstance(op, ops.gemm_rrr_small_nk)
                 or isinstance(op, ops.bmm_rcr_n1)
                 or isinstance(op, ops.bmm_rrr_k1_tanh)
+                or "permute" in op._attrs["op"]
             ):
                 continue
 
@@ -168,15 +172,16 @@ def apply_padding(sorted_graph: List[Tensor], workdir: str = None) -> List[Tenso
                         "Gemm does not support dynamic alignment dimensions "
                         "(i.e. alignment==1)! Gemm: {}".format(op)
                     )
-                padding_length = _get_padding_length(alignment_dim.value())
+                padding_length = get_padding_length(
+                    alignment_dim.value(), tensor.dtype()
+                )
                 if padding_length > 0:
                     alignment_var_to_padding_length[alignment_var] = padding_length
             if len(alignment_var_to_padding_length) == 0:
                 # No padding is necessary.
                 continue
 
-            logger.debug(
-                __name__,
+            _LOGGER.debug(
                 "**** Apply padding ****, alignment_var_to_padding_length: \n {} \n".format(
                     alignment_var_to_padding_length
                 ),
@@ -215,7 +220,7 @@ def apply_padding(sorted_graph: List[Tensor], workdir: str = None) -> List[Tenso
             # Replaces the old op with the new op.
             for tensor_input in op._attrs["inputs"]:
                 tensor_input._attrs["dst_ops"].discard(op)
-            new_op = type(op)()
+            new_op = type(op)(**op._get_op_attributes())
             new_op._attrs["split_k"] = op._attrs["split_k"]
             if "alpha" in op._attrs:
                 new_op._attrs["alpha"] = op._attrs["alpha"]
@@ -231,8 +236,7 @@ def apply_padding(sorted_graph: List[Tensor], workdir: str = None) -> List[Tenso
             transform_utils.replace_tensor(original_output, new_output)
             transform_utils.remove_tensor_from_sorted_graph(original_output)
 
-            logger.debug(
-                __name__,
+            _LOGGER.debug(
                 "**** Apply padding ****, replace op \n {} \n with \n {} \n".format(
                     original_op_debug_str, new_op
                 ),
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index 0b6459750..b8f9e2194 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
 from typing import Dict, List
 
@@ -20,9 +21,12 @@
 from aitemplate import backend, compiler
 
 from aitemplate.compiler.base import _NumpyConstantTensorData, IntVarTensor, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
 from aitemplate.compiler.model import AITData, Model
 from aitemplate.compiler.transform.transform_utils import replace_tensor
-from aitemplate.utils import logger
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 def _output_from_tensor(tensor: Tensor) -> Tensor:
@@ -105,7 +109,7 @@ def _constant_folding_impl(
     subgraph = _extract_foldable_subgraph(sorted_graph)
     output_tensors = [tensor for tensor in subgraph if tensor._attrs["is_output"]]
     if not output_tensors:
-        logger.info(__file__, "No constants to fold, skipping constant folding.")
+        _LOGGER.info("No constants to fold, skipping constant folding.")
         return {}
 
     blob, constant_blob, workspace = compiler.transform.memory_planning(subgraph)
@@ -134,7 +138,7 @@ def _constant_folding_impl(
         if tensor._attrs["data"] is None:
             name = tensor._attrs["name"]
             shape = module.get_output_maximum_shape(tensor._attrs["name"])
-            arr = np.empty(shape, dtype=tensor._attrs["dtype"])
+            arr = np.empty(shape, dtype=normalize_dtype(tensor._attrs["dtype"]))
             new_tensor = Tensor(
                 shape=tensor._attrs["shape"],
                 name=name,
@@ -167,8 +171,7 @@ def constant_folding(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
     try:
         new_constants = _constant_folding_impl(sorted_graph, workdir)
     except Exception as e:
-        logger.warning(
-            __file__,
+        _LOGGER.warning(
             f"Constant folding encountered an error: {e}. The graph will not be modified.",
         )
         return sorted_graph
diff --git a/python/aitemplate/compiler/transform/fuse_group_ops.py b/python/aitemplate/compiler/transform/fuse_group_ops.py
index c954167d1..6f8b2ad0f 100644
--- a/python/aitemplate/compiler/transform/fuse_group_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_group_ops.py
@@ -16,10 +16,11 @@
 Horizontal fusion pass to group ops together.
 """
 import collections
+import logging
 import os
 from typing import Callable, List, OrderedDict, Set
 
-from ...utils import graph_utils, logger
+from ...utils import graph_utils
 from ...utils.shape_utils import all_static_dimensions
 from .. import ops
 from ..base import Operator, Tensor
@@ -29,6 +30,9 @@
 from .toposort import toposort
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 # used by debugging only
 def _dump_dependency_graph(graph, op_type, postfix, workdir):
     fname = f"fuse_group_{op_type}_dependency_graph_{postfix}.txt"
@@ -41,7 +45,7 @@ def _dump_dependency_graph(graph, op_type, postfix, workdir):
 
     with open(file_path, "w") as f:
         f.write("\n\n".join(graph_str))
-        logger.info(__file__, f"Dumped dependency graph to {file_path}")
+        _LOGGER.info(f"Dumped dependency graph to {file_path}")
 
 
 def _dump_groups(groups, op_type, workdir):
@@ -53,7 +57,7 @@ def _dump_groups(groups, op_type, workdir):
             f.write(f"[{single_group_str}]\n\n")
             f.write(graph_utils.sorted_op_pseudo_code(group))
             f.write("\n")
-        logger.info(__file__, f"Dumped groups to {file_path}")
+        _LOGGER.info(f"Dumped groups to {file_path}")
 
 
 def _dump_single_group(group):
@@ -75,7 +79,7 @@ def _check_op_num_outputs(op: Operator, num_outputs: int) -> bool:
 def _get_ab_alignment(op: Operator) -> int:
     if op._attrs["op"].startswith("gemm_rcr"):
         k = op._attrs["inputs"][0]._size(1).value()
-        return default_align_ab(k, k)
+        return default_align_ab(k, k, op._attrs["inputs"][0].dtype())
     raise NotImplementedError(
         f"Need to add alignment check support for op {op._attrs['op']}"
     )
@@ -222,8 +226,24 @@ def _get_op_filter(op_type: str) -> Callable:
 }
 
 
+def _has_cycle(grouped_op: Operator, group: List[Operator]):
+    """
+    Assuming that grouped_op is in the group, determine if grouped_op
+    can reach any other op in the group. Return True if it can.
+    """
+    assert (
+        grouped_op in group
+    ), f'grouped_op {grouped_op._attrs["name"]} is not from the group'
+    for op in group:
+        if op is grouped_op:
+            continue
+        if transform_utils.is_ancestor(op, grouped_op):
+            return True
+    return False
+
+
 def _group_split_outputs_together(
-    sorted_ops: List[Operator], op_type: str
+    sorted_graph: List[Tensor], sorted_ops: List[Operator], op_type: str
 ) -> List[List[Operator]]:
     """As long as alignment allows, we group all output gemm ops from split op
     together to eliminate the cost of split. Here we don't exclude large gemms
@@ -255,7 +275,10 @@ def _group_split_outputs_together(
                     gemm_group.append(gemm_op)
                 else:
                     break
-        if len(gemm_group) == len(op._attrs["outputs"]):
+        if len(gemm_group) == len(op._attrs["outputs"]) and all(
+            not _has_cycle(grouped_op, gemm_group) for grouped_op in gemm_group
+        ):
+            _fuse_gemm_ops(gemm_group, sorted_graph)
             groups.append(gemm_group)
     return groups
 
@@ -390,16 +413,16 @@ def _break_layernorm_groups(group: List[Operator]) -> List[List[Operator]]:
 
 def _group_ops_by_type(
     sorted_graph: List[Tensor], op_type: str, workdir: str = None
-) -> List[List[Operator]]:
-    """Find all groups of ops that can be fused together. Each group is replaced
-    with 1 group op.
+) -> bool:
+    """Find and fuse all groups of ops that can be fused together.
+    Each group is replaced with 1 group op.
 
     Args:
         sorted_graph (List[Tensor]): Topologically sorted input graph
         op_type (str): The type of op to be grouped
 
     Returns:
-        List[List[Operator]]: All groups of ops that can be grouped together.
+        True if we fused any group.
 
     The algorithm can be described as:
     0) Let groups = []
@@ -440,13 +463,15 @@ def _group_ops_by_type(
 
     # There is no op with op_type in the graph
     if len(dependency_graph) == 0:
-        return []
+        return False
 
     if workdir:
         _dump_dependency_graph(dependency_graph, op_type, "filtered", workdir)
 
     f_filter_op = _get_op_filter(op_type)
     f_check_ops_are_compatible = _get_op_checker(op_type)
+    is_layernorm = op_type.startswith("layernorm")
+    f_fuse_ops = _fuse_layernorm_ops if is_layernorm else _fuse_gemm_ops
 
     sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
 
@@ -458,7 +483,7 @@ def _group_ops_by_type(
     groups = []
 
     # applies to group gemms only
-    split_groups = _group_split_outputs_together(sorted_ops, op_type)
+    split_groups = _group_split_outputs_together(sorted_graph, sorted_ops, op_type)
     for group in split_groups:
         groups.append(group)
         for op in group:
@@ -503,15 +528,45 @@ def get_op_number(op: Operator) -> int:
 
                 # must merge descendants together
                 descendants.update(dependency_graph[candidate])
-
+        # remove any op that may introduce a cycle because of grouping ops
+        group_op_idx = 0
+        while group_op_idx < len(group):
+            grouped_op = group[group_op_idx]
+            if _has_cycle(grouped_op, group):
+                del group[group_op_idx]
+            else:
+                group_op_idx += 1
+
+        # We fuse each group right after we form it. Otherwise, _has_cycle may
+        # miss cycles within groups. For example, see the graph below:
+        #
+        #        A --> C ---
+        #                  |
+        #    --> B --> D   |
+        #    |             |
+        #    --- X --> M   |
+        #                  |
+        #        Y --> N <--
+        #
+        # If we fuse (A, B) and (X, Y) at the same time, we would end up with a
+        # cycle between the fused op (A, B) and (X, Y). On the other hand, if we
+        # fuse (A, B) first, and then check _has_cycle before fusing (X, Y), we
+        # will be able to detect the cycle.
         if len(group) > _MAX_LAYERNORM_GROUP and op_type.startswith("layernorm"):
-            groups.extend(_break_layernorm_groups(group))
+            new_groups = _break_layernorm_groups(group)
+            for new_group in new_groups:
+                f_fuse_ops(new_group, sorted_graph)
+            groups.extend(new_groups)
         elif len(group) >= 2:
+            f_fuse_ops(group, sorted_graph)
             groups.append(group)
 
         grouped[op] = True
 
-    return groups
+    if workdir:
+        _dump_groups(groups, op_type, workdir)
+
+    return len(groups) > 0
 
 
 def _fuse_layernorm_ops(
@@ -650,19 +705,10 @@ def _fuse_group_ops_by_type(
     2) fuse them together
     Details of step 1 can be found in _group_ops_by_type
     """
-    groups = _group_ops_by_type(sorted_graph, op_type, workdir)
-
-    if len(groups) == 0:
+    # if we didn't fuse any grouped ops, we simply return original sorted_graph
+    if not _group_ops_by_type(sorted_graph, op_type, workdir):
         return sorted_graph
 
-    if workdir:
-        _dump_groups(groups, op_type, workdir)
-
-    is_layernorm = op_type.startswith("layernorm")
-    f_fuse_ops = _fuse_layernorm_ops if is_layernorm else _fuse_gemm_ops
-    for op_group in groups:
-        f_fuse_ops(op_group, sorted_graph)
-
     sorted_graph = toposort(sorted_graph)
     sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
     return sorted_graph
diff --git a/python/aitemplate/compiler/transform/fuse_mm_elementwise.py b/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
index fcf8e31ba..89610d3f9 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
@@ -18,11 +18,10 @@
 from typing import List
 
 from ..base import Tensor
-from ..ops.common import elementwise
 from ..ops.common.epilogue import FuncEnum
-from ..ops.gemm_universal import gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish
+from ..ops.gemm_universal import gemm_rcr_bias_swish
 
-from .fuse_mm_elementwise_patterns import get_patterns
+from .fuse_mm_elementwise_patterns import get_gemm_rcr_bias_patterns, get_patterns
 from .fuse_utils import (
     extract_only_one_op,
     is_elementwise_type,
@@ -174,14 +173,7 @@ def _fuse_gemm_rcr_bias_swish(sorted_graph: List[Tensor]) -> List[Tensor]:
 
 
 def _transform_gemm_bias(sorted_graph: List[Tensor]) -> List[Tensor]:
-    gemm_rcr_bias_patterns = [
-        (
-            (gemm_rcr(), elementwise(FuncEnum.ADD)),
-            gemm_rcr_bias,
-        ),
-    ]
-
-    return transform_simple_fusion_patterns(sorted_graph, gemm_rcr_bias_patterns)
+    return transform_simple_fusion_patterns(sorted_graph, get_gemm_rcr_bias_patterns())
 
 
 def _transform_mm_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
diff --git a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
index c97cfbfbe..9e53f4711 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
@@ -21,6 +21,7 @@
     bmm_crr_add,
     bmm_rrr,
     bmm_rrr_add,
+    gemm_rcr,
     gemm_rcr_bias,
     gemm_rcr_bias_add,
     gemm_rcr_bias_add_add,
@@ -39,6 +40,16 @@
 )
 
 
+def get_gemm_rcr_bias_patterns():
+    gemm_rcr_bias_patterns = [
+        (
+            (gemm_rcr(), elementwise(FuncEnum.ADD)),
+            gemm_rcr_bias,
+        ),
+    ]
+    return gemm_rcr_bias_patterns
+
+
 def get_patterns():
     """
     We create the pattern of fusion here.
diff --git a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
index 7985ef354..9bb606c26 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
@@ -134,7 +134,7 @@ def _fuse_gemm_reshape_permute0213(
 
         permute_op = list(reshape_output.dst_ops())[0]
 
-        if permute_op._attrs["op"] != "permute":
+        if permute_op._attrs["op"] not in ("permute", "permute0213"):
             continue
 
         permute_output = permute_op._attrs["outputs"][0]
@@ -143,7 +143,11 @@ def _fuse_gemm_reshape_permute0213(
         if not _check_reshape(reshape_op):
             continue
 
-        if not _check_permute(permute_op, [0, 2, 1, 3]):
+        # check permute dims match [0, 2, 1, 3]: either
+        # permute0213 or generic permute with those dims
+        if permute_op._attrs["op"] != "permute0213" and not _check_permute(
+            permute_op, [0, 2, 1, 3]
+        ):
             continue
 
         # fuse ops together
diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index 61db0f8b9..05e7b8b0c 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -15,13 +15,14 @@
 """
 Perform operator fusions.
 """
+import collections
+import logging
+from dataclasses import dataclass
 from typing import Any, Dict, List, Set
 
 from aitemplate.compiler.base import Operator
 from aitemplate.compiler.transform.toposort import toposort
 
-from aitemplate.utils import logger
-
 from ..base import Tensor
 from ..ops.common import fused_elementwise
 from ..ops.common.epilogue import FuncEnum
@@ -31,6 +32,9 @@
 # pylint: disable=C0103,W0612
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class SimpleDisjointSet(object):
     def __init__(self):
         self.node_to_list_mapping: Dict[Any, List[Any]] = {}
@@ -43,20 +47,19 @@ def add(self, node: Any, dependent_nodes: Set[Any]) -> None:
             self.node_to_list_mapping[node] = [node]
             return
 
-        current_list = None
-        for dependent in dependent_nodes:
+        current_list = [
+            node  # node should also be considered to decide if a new_list can be added.
+        ]
+        for dependent in list(dependent_nodes):
             if dependent is None or dependent not in self.node_to_list_mapping:
                 continue
             new_list = self.node_to_list_mapping.get(dependent)
-            if current_list is None:
-                current_list = new_list
-            elif current_list is not new_list:
-                current_list.extend(new_list)
-                for new_node in new_list:
-                    self.node_to_list_mapping[new_node] = current_list
-        if current_list is None:
-            current_list = []
-        current_list.append(node)
+
+            if _detect_cycle(current_list + new_list):
+                continue
+            current_list.extend(new_list)
+            for new_node in new_list:
+                self.node_to_list_mapping[new_node] = current_list
         self.node_to_list_mapping[node] = current_list
 
     def get_node_groups(self) -> List[List[Any]]:
@@ -127,14 +130,179 @@ def _find_fusable_elementwise_ops(op: Operator) -> Set[Operator]:
         if prev_len == new_len:
             break
 
-    logger.debug(
-        __file__,
+    _LOGGER.debug(
         f"original op set: {original_ops}, to_be_removed_set: {to_be_removed_set}, final_set: {dependent_ops}",
     )
     return dependent_ops
 
 
+@dataclass
+class FusedElementwiseInfo:
+    partitioned_ops: List[Operator]
+    inputs: Set[Tensor]
+    outputs: Set[Tensor]
+    external_inputs: Set[Tensor]
+    external_outputs: Set[Tensor]
+
+
+def _partition_subgraphs(ops: List[Operator]) -> Dict[str, Set[Operator]]:
+    """
+    Given ops of candidate graph of fused_elementwise op graph and partition
+    into subgraph based on output shape, returns dict of
+    {output shape: ops to form subgraph based on the shape}
+    """
+    # Partition graph of elementwise into subgraph based on output shape.
+    output_op_map = collections.defaultdict(set)
+    for op in ops:
+        shapes = []
+        # Find output nodes
+        for output_tensor in op._attrs["outputs"]:
+            if (
+                output_tensor._attrs["is_output"]
+                or len(output_tensor._attrs["dst_ops"] - ops) > 0
+            ):
+                shapes.append("_".join(map(str, output_tensor._attrs["shape"])))
+        # Find anscestor of output node.
+        # Outputs with the same shape should form the same graph
+        if shapes:
+            key = "|".join(shapes)
+            op_set = output_op_map[key]
+            for anc_op in ops:
+                if transform_utils.is_ancestor(anc_op, op):
+                    op_set.add(anc_op)
+            op_set.add(op)
+    return output_op_map
+
+
+def _get_inputs_outputs(
+    partitioned_ops: Set[Operator], all_ops: Set[Operator]
+) -> List[Set[Tensor]]:
+    """
+    Given ops of a partitioned subgraph based on output shape, and ops of full graph
+    to form a complete graph with fused_elementwise op, returns all inputs/outputs of
+    the ops and the external input/output of the subgraph, which will serve as input/output
+    of fused_elementwise op.
+    """
+    external_inputs = set()
+    external_outputs = set()
+    tmp_inputs = set()
+    tmp_outputs = set()
+
+    for op in partitioned_ops:
+        for input_tensor in op._attrs["inputs"]:
+            tmp_inputs.add(input_tensor)
+            src_ops = set(input_tensor._attrs["src_ops"])
+            if (len(src_ops) == 0 or len(src_ops - all_ops) > 0) and (
+                not input_tensor.is_a_const_num()
+            ):
+                external_inputs.add(input_tensor)
+            assert op in input_tensor._attrs["dst_ops"]
+        for output_tensor in op._attrs["outputs"]:
+            tmp_outputs.add(output_tensor)
+            dst_ops = set(output_tensor._attrs["dst_ops"])
+            if output_tensor._attrs["is_output"] or len(dst_ops - all_ops) > 0:
+                external_outputs.add(output_tensor)
+            assert len(output_tensor._attrs["src_ops"]) == 1
+            assert list(output_tensor._attrs["src_ops"])[0] == op
+
+    assert (
+        external_inputs == tmp_inputs - tmp_outputs
+    ), "external_inputs: {} is not equal to tmp_inputs: {} - tmp_outputs: {}.".format(
+        external_inputs, tmp_inputs, tmp_outputs
+    )
+    assert (
+        len(tmp_outputs - tmp_inputs - external_outputs) == 0
+    ), "tmp_outputs: {} - tmp_inputs: {} - external_outputs: {} is not empty.".format(
+        tmp_outputs, tmp_inputs, external_outputs
+    )
+    assert (
+        len(external_outputs - tmp_outputs) == 0
+    ), "external_outputs: {} - tmp_outputs: {} is not empty.".format(
+        external_outputs, tmp_outputs
+    )
+
+    return [tmp_inputs, tmp_outputs, external_inputs, external_outputs]
+
+
+def _collect_info(
+    output_op_map: Dict[str, Set[Operator]],
+    all_ops: Set[Operator],
+    sorted_graph: List[Tensor],
+) -> List[FusedElementwiseInfo]:
+    """
+    Collects information for each fused_elementwise op:
+        1. Provide op_list in topological order so fuse_elementwise backend can emit operations in order.
+        2. Provide inputs outputs info of each subgraph. This need to happen before fuse ops are created,
+        i.e. graph get changed.
+    Returns list of fused_op_info, which contains:
+        partitioned op list in topological order, all inputs/outputs of elementwise ops and
+        their external input/output, serving as input/output of fused_elementwise op.
+    """
+    info_list = []
+    for op_set in output_op_map.values():
+        # Toposort the op_set into op_list
+        # because fuse_elementwise stores elementwise ops in topological order
+        topo_set = set()
+        op_list = []
+        for tensor in sorted_graph:
+            topo_set.add(tensor)
+            to_remove = set()
+            for op in op_set:
+                if all([arg in topo_set for arg in op._attrs["inputs"]]):
+                    op_list.append(op)
+                    to_remove.add(op)
+            op_set = op_set - to_remove
+        assert (
+            not op_set
+        ), "Unable to find topological order of op list for fused_elementwise!"
+        # Get all inputs/outputs of elementwise ops and their external input/output,
+        # which will serve as input/output of fused_elementwise op.
+        inputs_outputs = _get_inputs_outputs(op_list, all_ops)
+        fused_op_info = FusedElementwiseInfo(op_list, *inputs_outputs)
+        info_list.append(fused_op_info)
+    return info_list
+
+
+def _create_fuse_ops(info_list: List[FusedElementwiseInfo]) -> None:
+    """
+    Creates fused ops based on info we collected.
+    First is to update elementwise ops' inputs/outputs within the subgraph;
+    Second is to create fused_elementwise ops where their inputs/outputs
+    are external inputs/outputs of the subgraph.
+    """
+    for info in info_list:
+        op_set = set(info.partitioned_ops)
+        for tensor in info.inputs | info.outputs:
+            tensor._attrs["src_ops"] = tensor._attrs["src_ops"] - op_set
+            tensor._attrs["dst_ops"] = tensor._attrs["dst_ops"] - op_set
+        fused_elementwise(
+            info.partitioned_ops,
+            info.external_inputs,
+            info.external_outputs,
+        )
+
+
+def _detect_cycle(group: List[Operator]) -> bool:
+    """
+    Given a group of ops, to detect if they would form cycles, i.e.
+      --> group_ops
+     /      /
+    A <-----
+    we need to find all parents of all ops in that group
+    and see if any parent's ancester (execluding the ones already in the group) exists in the group.
+    """
+    parents = [o for op1 in group for i in op1._attrs["inputs"] for o in i.src_ops()]
+    for op1 in group:
+        for op2 in set(parents) - set(group):
+            if transform_utils.is_ancestor(op1, op2):
+                return True
+    return False
+
+
 def _fuse_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
+    """
+    Given a sorted graph, returns a sorted graph with fused_elementwise ops on fusable elementwise ops.
+    """
     disjoint_set = SimpleDisjointSet()
     for tensor in sorted_graph:
         src_ops = tensor._attrs["src_ops"]
@@ -145,8 +313,14 @@ def _fuse_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
             disjoint_set.add(src_op, _find_fusable_elementwise_ops(src_op))
 
     to_be_fused_op_groups = disjoint_set.get_node_groups()
+
     for ops in to_be_fused_op_groups:
-        fused_elementwise(ops)
+        # Partition subgraph based on output shape.
+        output_op_map = _partition_subgraphs(ops)
+        # Collect information to create fuse ops.
+        info_list = _collect_info(output_op_map, set(ops), sorted_graph)
+        # Create fuse ops.
+        _create_fuse_ops(info_list)
 
     sorted_graph = toposort(sorted_graph)
     return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
index bdd0d6473..668372d2c 100644
--- a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
+++ b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
@@ -170,8 +170,9 @@ def _merge_parallel_gemm_concat(
     n, k = weights[0].shape()[0].value(), weights[0].shape()[1].value()
     b = len(weights)
 
-    rcr_align = default_align_ab(k, k)
-    rrr_align = default_align_ab(k, n)
+    dtype = inputs[0].dtype()
+    rcr_align = default_align_ab(k, k, dtype)
+    rrr_align = default_align_ab(k, n, dtype)
 
     use_rcr = rcr_align > rrr_align
 
@@ -216,7 +217,7 @@ def _merge_parallel_gemm_concat(
 
         cat_op._attrs["inputs"] = new_inputs
         cat_op._attrs["input_accessors"] = [TensorAccessor(t) for t in new_inputs]
-        cat_op._attrs["original_inputs"] = new_inputs
+        cat_op._attrs["original_inputs"] = list(new_inputs)
         cat_op._attrs["input_masks"] = [True] * len(new_inputs)
 
         bmm_reshape._attrs["dst_ops"].add(cat_op)
diff --git a/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
index 4a09f5f5c..47cc5f6b6 100644
--- a/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
+++ b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
@@ -19,6 +19,7 @@
 
 from aitemplate.compiler.ops.tensor.permute import permute
 
+from ...utils import alignment
 from .. import ops
 from ..base import IntImm, Operator, Tensor
 from ..ops.gemm_universal import (
@@ -135,16 +136,22 @@ def _fuse_permute_impl(
         # TODO: Check whether the input is weight to have better compile time
         #       optimization on preprocessing of pad etc.
         permute_shape = tensor.shape()
+        permute_dtype = tensor.dtype()
         prepermute_shape = input_tensor.shape()
+        prepermute_dtype = input_tensor.dtype()
 
         if (
             isinstance(prepermute_shape[-1], IntImm)
-            and prepermute_shape[-1].value() % 2 == 1
+            and (
+                not alignment.valid_alignment(
+                    prepermute_shape[-1].value(), prepermute_dtype
+                )
+            )
             and isinstance(permute_shape[-1], IntImm)
-            and permute_shape[-1].value() % 2 == 0
+            and alignment.valid_alignment(permute_shape[-1].value(), permute_dtype)
         ):
             # We don't run the permute+bmm fusion if the permute op could
-            # turn an odd alignment into even alignment.
+            # turn an invalid alignment into a valid alignment.
             continue
 
         fused = True
diff --git a/python/aitemplate/compiler/transform/fuse_split.py b/python/aitemplate/compiler/transform/fuse_split.py
index 6cb52e3c4..8074b389b 100644
--- a/python/aitemplate/compiler/transform/fuse_split.py
+++ b/python/aitemplate/compiler/transform/fuse_split.py
@@ -15,17 +15,21 @@
 """
 Perform transformations on ops which support strided inputs / outputs.
 """
+import logging
 from typing import List
 
 from aitemplate.compiler.stable_set import StableSet
 
-from ...utils import graph_utils, logger
+from ...utils import alignment, graph_utils
 from ..base import IntImm, IntVar, Operator, Tensor
 from . import transform_strided_ops_utils, transform_utils
 
 # pylint: disable=W0612
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def _can_fuse_split_op(split_op: Operator):
     split_dim = split_op._attrs["split_dim"]
     # FIXME: only support dim == 1 at the moment
@@ -148,17 +152,14 @@ def get_stride(t: Tensor, dim: int):
     return stride
 
 
-def _check_dim_alignment(shape: List[IntVar], dim_idx: int) -> bool:
+def _check_dim_alignment(shape: List[IntVar], dim_idx: int, dtype: str) -> bool:
     k_dim = shape[dim_idx]
     # skip dynamic dim
     if not isinstance(k_dim, IntImm):
         return False
     k_dim_val = k_dim._attrs["values"][0]
     # We cannot have mis-aligned K
-    if k_dim_val % 2 == 0:
-        return True
-    else:
-        return False
+    return alignment.valid_alignment(k_dim_val, dtype)
 
 
 def _check_alignment(op: Operator, offset: int):
@@ -166,23 +167,21 @@ def _check_alignment(op: Operator, offset: int):
     if op._attrs["op"] == "bmm_rcr_n1":
         return True
 
-    # ops that don't support align=1
-    # TODO: adjust alignment requirement based on dtype. 2-elem-alignment is
-    # only required by fp16, because async.copy needs at least 32 bits.
-    # For fp32 dtype values, 1-elem-alignment is valid.
-    if offset % 2 != 0:  # fp16
+    dtype = op._attrs["inputs"][0].dtype()
+    # ops that don't have valid alignments
+    if not alignment.valid_alignment(offset, dtype):
         return False
     if op._attrs["op"] == "bmm_rrr_permute":
         a_shape = op._attrs["input_accessors"][0].original_shapes
         b_shape = op._attrs["input_accessors"][1].original_shapes
         # check K and N
-        return _check_dim_alignment(a_shape, dim_idx=2) and _check_dim_alignment(
-            b_shape, dim_idx=2
-        )
+        return _check_dim_alignment(
+            a_shape, dim_idx=2, dtype=dtype
+        ) and _check_dim_alignment(b_shape, dim_idx=2, dtype=dtype)
     if op._attrs["op"] == "bmm_rcr":
         a_shape = op._attrs["input_accessors"][0].original_shapes
         # check K
-        return _check_dim_alignment(a_shape, dim_idx=2)
+        return _check_dim_alignment(a_shape, dim_idx=2, dtype=dtype)
     if op._attrs["op"] == "bmm_softmax_bmm_permute":
         # a = (B, M, K), b = (B, N, K), c = (B, N, O)
         # t = bmm_rcr(a, b)
@@ -192,13 +191,13 @@ def _check_alignment(op: Operator, offset: int):
         c_shape = op._attrs["input_accessors"][2].original_shapes
         return (
             # check K for bmm_rcr((B, M, K), (B, N, K))
-            _check_dim_alignment(a_shape, dim_idx=2)
+            _check_dim_alignment(a_shape, dim_idx=2, dtype=dtype)
             and
             # check N for bmm_rrr((B, M, N), (B, N, O))
-            _check_dim_alignment(c_shape, dim_idx=1)
+            _check_dim_alignment(c_shape, dim_idx=1, dtype=dtype)
             and
             # check O for bmm_rrr((B, M, N), (B, N, O))
-            _check_dim_alignment(c_shape, dim_idx=2)
+            _check_dim_alignment(c_shape, dim_idx=2, dtype=dtype)
         )
 
     raise RuntimeError(f'Unexpected op type: {op._attrs["op"]}')
@@ -263,7 +262,7 @@ def _fuse_split_and_strided_op(sorted_graph: List[Tensor]) -> List[Tensor]:
 
         if not can_fuse_split:
             continue
-        logger.debug(__file__, "Remove split from graph")
+        _LOGGER.debug("Remove split from graph")
         split_input.dst_ops().remove(split_op)
 
         for output, offset in zip(outputs, output_offsets):
diff --git a/python/aitemplate/compiler/transform/fuse_utils.py b/python/aitemplate/compiler/transform/fuse_utils.py
index 4aa7ee25b..13553480d 100644
--- a/python/aitemplate/compiler/transform/fuse_utils.py
+++ b/python/aitemplate/compiler/transform/fuse_utils.py
@@ -15,7 +15,6 @@
 from typing import Any, List, Optional, Set
 
 from ..base import Operator, Tensor
-from ..ops.conv.common_conv2d_bias_add_activation import conv2d_bias_add_activation
 from .toposort import toposort
 from .transform_utils import (
     copy_tensor_attributes,
@@ -120,6 +119,7 @@ def transform_simple_fusion_patterns(
         src_op = extract_only_one_op(tensor._attrs["src_ops"])
         inputs = list(src_op._attrs["inputs"])
         to_remove_dst_op[src_op] = list(inputs)
+        src_op_num_inputs = len(inputs)
 
         last_tensor = tensor
         to_remove_candidate.add(last_tensor)
@@ -158,22 +158,22 @@ def transform_simple_fusion_patterns(
         # A final check to make sure our replacement is valid.
         new_op = fusion_patterns[fusion_idx][1]
 
+        # For bias_add fusion, use is_valid_inputs
         check_inputs_func = getattr(new_op, "is_valid_inputs", None)
         if check_inputs_func is not None:
             valid, _ = check_inputs_func(*inputs)
             if not valid:
                 continue
-
-        # TODO: remove after broadcasting is supported
-        # special shape check for conv2d_bias_add_activation ops
-        if issubclass(new_op, conv2d_bias_add_activation):
-            assert len(inputs) >= 4, (
-                f"The number of inputs must be larger than 4 for conv2d_bias_add_activation "
-                f"family fusions. Current number of inputs: {len(inputs)}"
-            )
-            residual = inputs[3]
-            y = src_op._attrs["outputs"][0]
-            if y.shape() != residual.shape():
+        else:
+            # gemm/conv epilogue fusion with elementwise ops doesn't
+            # support broadcasting except for bias_add.
+            # Here we do assume that all other inputs are elementwise inputs.
+            cannot_fuse = False
+            for elementwise_input in inputs[src_op_num_inputs:]:
+                if tensor.shape() != elementwise_input.shape():
+                    cannot_fuse = True
+                    break
+            if cannot_fuse:
                 continue
 
         # inputs here might not be ready in graph. But we will toposort again
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 7819190ef..b67b61d64 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,7 @@
 import re
 from typing import List
 
-from ..base import IntVarTensor, Tensor
+from ..base import IntImm, IntVarTensor, Tensor
 
 # pylint: disable=C0103
 
@@ -63,11 +63,14 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                 node._attrs["name"] = tensor_name
                 tensor_cnt += 1
                 if isinstance(node, IntVarTensor):
-                    # TODO: emit standalone dynamic shape initialization for IntVarTensor
-                    raise RuntimeError(
-                        "We don't support emitting standalone IntVarTensor at this moment.\n"
-                        f"Encountered {node._attrs['name']}: {node._attrs['int_var']}."
-                    )
+                    if not isinstance(node._attrs["int_var"], IntImm):
+                        # TODO: emit standalone dynamic shape initialization for IntVarTensor
+                        raise RuntimeError(
+                            "We don't support emitting standalone IntVarTensor at this moment.\n"
+                            f"Encountered {node._attrs['name']}: {node._attrs['int_var']}."
+                        )
+                    else:
+                        node._attrs["int_var"]._attrs["name"] = tensor_name
 
         else:
             for func in funcs:
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index f959221ae..c4d2f817b 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -29,6 +29,7 @@
 from .fuse_parallel_gemms import fuse_parallel_gemms
 from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
 from .split_large_concat_ops import split_large_concat_ops
+from .split_large_split_ops import split_large_split_ops
 from .transform_memory_ops import transform_memory_ops
 from .transform_odd_alignment import transform_odd_alignment
 from .transform_special_ops import transform_special_ops
@@ -82,6 +83,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         apply_padding,
         transform_strided_ops,
         split_large_concat_ops,
+        split_large_split_ops,
         transform_memory_ops,
     ]
 
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 549d86098..2bc00b1d1 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -15,6 +15,7 @@
 """
 Graph pass to invoke profiling.
 """
+import logging
 import os
 from copy import deepcopy
 from datetime import datetime
@@ -27,14 +28,15 @@
     GemmProfilerPostprocessingDelegate,
 )
 
-from aitemplate.utils import logger
-
 from ...backend import builder, codegen
 from ..base import DynamicProfileStrategy, Tensor
 
 # pylint: disable=C0103,W0613,W0102
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def elapsed_dt_sec(start_t_sec):
     return datetime.now() - start_t_sec
 
@@ -79,14 +81,13 @@ def profile(
         codegen.gen_profiler(sorted_graph, profiler_dir, dynamic_profiling_strategy)
     )
     generated_profilers = [p for p in generated_profilers if p is not None]
-    logger.info(
-        __name__,
+    _LOGGER.info(
         f"generated {len(generated_profilers)} profilers elapsed time: {elapsed_dt_sec(start_t)}",
     )
     start_t = datetime.now()
     compile_engine = builder.Builder()
     compile_engine.make_profilers(generated_profilers, profiler_dir)
-    logger.info(__name__, f"compiled profilers elapsed time: {elapsed_dt_sec(start_t)}")
+    _LOGGER.info(f"compiled profilers elapsed time: {elapsed_dt_sec(start_t)}")
     funcs_to_profile = OrderedDict(
         {
             func._attrs["name"]: func
@@ -103,11 +104,10 @@ def profile(
         f.profile(
             workdir=profiler_dir,
             devices=devices,
-            dynamic_profiling_strategy=dynamic_profiling_strategy,
         )
     profiler_runner = ProfilerRunner(
         devices,
-        timeout=180,
+        timeout=240,
         postprocessing_delegate=GemmProfilerPostprocessingDelegate(),
     )
     for f in gemms:
@@ -116,8 +116,7 @@ def profile(
             profiler_runner=profiler_runner,
         )
     profiler_runner.join()
-    logger.info(
-        __name__,
+    _LOGGER.info(
         f"ran {len(funcs_to_profile)} profilers elapsed time: {elapsed_dt_sec(start_t)}",
     )
     for node in sorted_graph:
diff --git a/python/aitemplate/compiler/transform/profile_dynamic_dim.py b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
index ee08c2716..e49367d47 100644
--- a/python/aitemplate/compiler/transform/profile_dynamic_dim.py
+++ b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
@@ -15,18 +15,21 @@
 """
 Graph pass to invoke profiling with dynamic shapes.
 """
+import logging
 from copy import deepcopy
 from typing import List, OrderedDict
 
 from ...backend import builder, codegen
-from ...utils import logger
 from ..base import Tensor
 
 # pylint: disable=C0103,W0613,W0102
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def profile_dynamic_dim(sorted_graph: List[Tensor], workdir="./tmp"):
-    logger.info(__name__, "Current dynamic profiler supports ONLY ONE dynamic dim.")
+    _LOGGER.info("Current dynamic profiler supports ONLY ONE dynamic dim.")
     generated_profilers = list(codegen.gen_profiler(sorted_graph, workdir))
     generated_profilers = [p for p in generated_profilers if p is not None]
     compile_engine = builder.Builder()
diff --git a/python/aitemplate/compiler/transform/refine_graph.py b/python/aitemplate/compiler/transform/refine_graph.py
index 6cc44cb2d..2d1aa552b 100644
--- a/python/aitemplate/compiler/transform/refine_graph.py
+++ b/python/aitemplate/compiler/transform/refine_graph.py
@@ -15,15 +15,18 @@
 """
 Graph pass to dedup operators with same signatures.
 """
+import logging
 from typing import List
 
-from ...utils import logger
 from ...utils.graph_utils import get_sorted_ops
 
 from ..base import Operator, Tensor
 
 # pylint: disable=C0103
 
+
+_LOGGER = logging.getLogger(__name__)
+
 SPECIAL_CHECK_FUNC_KEYS = {
     "inputs",
     "name",
@@ -153,7 +156,5 @@ def refine_graph(sorted_graph: List[Tensor]):
         if found:
             refined_ops_set.add(func._attrs["op"])
 
-    logger.debug(__file__, f"refined ops: {refined_ops_set}")
-    logger.info(
-        __file__, f"reduced unique ops from {total_ops} to {total_ops - refined_ops}"
-    )
+    _LOGGER.debug(f"refined ops: {refined_ops_set}")
+    _LOGGER.info(f"reduced unique ops from {total_ops} to {total_ops - refined_ops}")
diff --git a/python/aitemplate/compiler/transform/split_large_concat_ops.py b/python/aitemplate/compiler/transform/split_large_concat_ops.py
index d2c6ee1ab..06b4522af 100644
--- a/python/aitemplate/compiler/transform/split_large_concat_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_concat_ops.py
@@ -29,7 +29,8 @@
 from ..base import Operator, Tensor
 from . import transform_utils
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 CONCAT_INPUT_META_SIZE = 64
 CONCAT_OUTPUT_META_SIZE = 16
@@ -46,7 +47,7 @@ def _concat_kernel_single_input_output_param_size(op: Operator):
     size_of_one_output_meta = CONCAT_OUTPUT_META_SIZE * rank
     # There are 3 more params, where each takes 8 bytes, so we add 24 more bytes
     total_params_size = CONCAT_INPUT_META_SIZE + size_of_one_output_meta + 24
-    logger.debug(f'concat op op._attrs["name"]: {total_params_size=}')
+    _LOGGER.debug(f'concat op {op._attrs["name"]}: {total_params_size=}')
     return total_params_size
 
 
@@ -91,22 +92,21 @@ def split_large_concat_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
         concat_outputs = concat_op._attrs["outputs"]
         input_accessors = concat_op._attrs["input_accessors"]
         for new_inputs_size in split_sizes:
-            new_concat_output = ops.concatenate()(
-                concat_inputs, concat_op._attrs["concat_dim"]
-            )
-            new_concat_op = list(new_concat_output.src_ops())[0]
+            new_concat_op = ops.concatenate()
+            new_concat_op._attrs["inputs"] = list(concat_inputs)
+            new_concat_op._attrs["concat_dim"] = concat_op._attrs["concat_dim"]
             new_concat_op._attrs["outputs"] = concat_outputs.copy()
             new_concat_op._attrs["original_inputs"] = concat_op._attrs[
                 "original_inputs"
             ].copy()
             new_concat_op._attrs["input_masks"] = concat_op._attrs["input_masks"].copy()
             new_concat_op._attrs["input_accessors"] = copy.deepcopy(input_accessors)
+            new_concat_op._set_depth()
+
             indices_to_remove = list(range(offset)) + list(
                 range(offset + new_inputs_size, num_inputs)
             )
             new_concat_op.remove_input_at(indices_to_remove)
-            new_concat_output._attrs["src_ops"] = StableSet()
-            new_concat_output._attrs["dst_ops"] = StableSet()
             all_new_concat_ops.append(new_concat_op)
             offset += new_inputs_size
         # original inputs are distributed among new concats, so we need to adjust
diff --git a/python/aitemplate/compiler/transform/split_large_split_ops.py b/python/aitemplate/compiler/transform/split_large_split_ops.py
new file mode 100644
index 000000000..321afea63
--- /dev/null
+++ b/python/aitemplate/compiler/transform/split_large_split_ops.py
@@ -0,0 +1,113 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This transformation splits a split with a large number of outputs into multiple
+splitt ops, which share the same input with correct output_masks.
+"""
+import logging
+
+from typing import List
+
+from ...utils import graph_utils
+from .. import ops
+from ..base import Operator, Tensor
+
+from . import toposort, transform_utils
+
+
+_LOGGER = logging.getLogger(__name__)
+
+SPLIT_INPUT_META_SIZE = 16
+SPLIT_OUTPUT_META_SIZE = 32
+MAX_CUDA_PARAM_BYTES = 4096
+
+
+def _split_kernel_single_input_output_param_size(op: Operator):
+    """
+    Return the total size (in bytes) of the split's params.
+    We need to adjust this if we change the split op's params.
+    Note this is conservative by multiplying input_meta and constant 24 bytes.
+    """
+    outputs = op._attrs["outputs"]
+    rank = outputs[0]._rank()
+    size_of_input_meta = SPLIT_INPUT_META_SIZE * rank
+    # There are 3 more params, where each takes 8 bytes, so we add 24 more bytes
+    total_params_size = SPLIT_OUTPUT_META_SIZE + size_of_input_meta + 24
+    _LOGGER.debug(f'split op op._attrs["name"]: {total_params_size=}')
+    return total_params_size
+
+
+def split_large_split_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
+    """
+    Our split CUDA kernel takes an output meta argument whose size
+    is proportional to the number of outputs. In extreme cases, the total size
+    of the params of a split kernel may exceed the limit imposed by the CUDA
+    compiler. In such cases, we split the split op into separate ones.
+    """
+    modified = False
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in sorted_ops:
+        if not op._attrs["op"].startswith("split"):
+            continue
+        split_op = op
+
+        split_params_size = _split_kernel_single_input_output_param_size(split_op)
+        if split_params_size > MAX_CUDA_PARAM_BYTES:
+            raise RuntimeError(
+                f"cannot handle cases: {split_params_size=} > {MAX_CUDA_PARAM_BYTES=}"
+            )
+        if split_params_size * len(split_op._attrs["outputs"]) <= MAX_CUDA_PARAM_BYTES:
+            continue
+
+        modified = True
+        split_dim = split_op._attrs["split_dim"]
+        split_sizes = split_op._attrs["split_sizes"]
+        outputs = split_op._attrs["outputs"]
+        num_outputs_per_split = MAX_CUDA_PARAM_BYTES // split_params_size
+        # compute how many split ops we need to fix within MAX_CUDA_PARAM_BYTES
+        num_split_ops = (
+            len(outputs) + num_outputs_per_split - 1
+        ) // num_outputs_per_split
+
+        output_mapping = []
+        for split_i in range(num_split_ops):
+            start = split_i * num_outputs_per_split
+            end = min(
+                (split_i + 1) * num_outputs_per_split, len(split_op._attrs["outputs"])
+            )
+
+            remove_indices = list(range(start)) + list(
+                range(end, len(split_op._attrs["outputs"]))
+            )
+            new_split = ops.split()
+            new_outputs = new_split(
+                split_op._attrs["inputs"][0], split_sizes, split_dim
+            )
+            new_split.remove_output_at(remove_indices)
+            new_outputs = new_split._attrs["outputs"]
+            sorted_graph += list(new_outputs)
+            output_mapping += list(zip(outputs[start:end], new_outputs))
+
+        for (old_output, new_output) in output_mapping:
+            transform_utils.replace_tensor(old_output, new_output)
+
+    if not modified:
+        return sorted_graph
+
+    new_output_tensors = [
+        tensor for tensor in sorted_graph if tensor._attrs["is_output"]
+    ]
+    sorted_graph = toposort.toposort(new_output_tensors)
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/toposort.py b/python/aitemplate/compiler/transform/toposort.py
index 9de7eb615..c5e3c2fbe 100644
--- a/python/aitemplate/compiler/transform/toposort.py
+++ b/python/aitemplate/compiler/transform/toposort.py
@@ -37,29 +37,36 @@ def toposort(nodes: Union[Tensor, List[Tensor]]) -> List[Tensor]:
     """
     visited = set()
     sorted_graph = []
+    stack = []
 
-    def DFS(nd: Tensor):
-        if nd in visited:
-            return
-        for src_op in nd.src_ops():
+    if isinstance(nodes, Tensor):
+        stack.append((nodes, False))
+    else:
+        for node in list(nodes)[::-1]:
+            stack.append((node, False))
+
+    while len(stack) > 0:
+        curr_node, curr_visited = stack.pop()
+        if curr_visited:
+            sorted_graph.append(curr_node)
+            for src_op in curr_node.src_ops():
+                for next_node in src_op._attrs["outputs"]:
+                    stack.append((next_node, False))
+            continue
+        if curr_node in visited:
+            continue
+
+        visited.add(curr_node)
+        stack.append((curr_node, True))
+        for src_op in curr_node.src_ops():
             args = src_op._attrs["inputs"]
             indexed_args = list(enumerate(args))
             depth_first_args = sorted(
                 indexed_args, key=lambda x: x[1]._attrs["depth"], reverse=True
             )
-            visit_seq = [x[0] for x in depth_first_args]
+            visit_seq = [x[0] for x in depth_first_args[::-1]]
             for idx in visit_seq:
                 arg = args[idx]
-                DFS(arg)
-        visited.add(nd)
-        sorted_graph.append(nd)
-        for src_op in nd.src_ops():
-            for next_nd in src_op._attrs["outputs"]:
-                DFS(next_nd)
+                stack.append((arg, False))
 
-    if isinstance(nodes, Tensor):
-        DFS(nodes)
-    else:
-        for node in list(nodes):
-            DFS(node)
     return sorted_graph
diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index d6de21e7f..a585393e7 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -15,6 +15,7 @@
 """
 Perform memory operator related transformations.
 """
+import copy
 from typing import List
 
 from aitemplate.compiler.tensor_accessor import TensorAccessor
@@ -65,7 +66,10 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     first_op_inputs = first_op._attrs["inputs"]
     first_op_outputs = first_op._attrs["outputs"]
     cat_inputs = cat._attrs["inputs"]
+    cat_original_inputs = cat._attrs["original_inputs"]
     new_cat_inputs = []
+    new_cat_original_inputs = []
+    new_cat_input_accessors = []
     i = 0
     while i < len(cat_inputs):
         matched = True
@@ -77,9 +81,25 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
                 break
         if matched:
             new_cat_inputs.extend(first_op._attrs["inputs"])
+            # we may not have original_inputs/input_accessors, e.g. if first_op is split
+            if "original_inputs" in first_op._attrs:
+                original_inputs = first_op._attrs["original_inputs"]
+            else:
+                original_inputs = first_op._attrs["inputs"]
+            new_cat_original_inputs.extend(original_inputs)
+            if "input_accessors" in first_op._attrs:
+                new_cat_input_accessors.extend(
+                    copy.deepcopy(first_op._attrs["input_accessors"])
+                )
+            else:
+                new_cat_input_accessors.extend(
+                    [TensorAccessor(t) for t in original_inputs]
+                )
             i += len(first_op_outputs)
         else:
             new_cat_inputs.append(cat_inputs[i])
+            new_cat_original_inputs.append(cat_original_inputs[i])
+            new_cat_input_accessors.append(cat._attrs["input_accessors"][i])
             i += 1
 
     for tensor in new_cat_inputs:
@@ -91,8 +111,8 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     # change this part later when we have TensorAccessors, depending on
     # the order of the transformations.
     assert all(cat._attrs["input_masks"])
-    cat._attrs["input_accessors"] = [TensorAccessor(t) for t in cat._attrs["inputs"]]
-    cat._attrs["original_inputs"] = list(new_cat_inputs)
+    cat._attrs["input_accessors"] = new_cat_input_accessors
+    cat._attrs["original_inputs"] = list(new_cat_original_inputs)
     cat._attrs["input_masks"] = [True] * len(new_cat_inputs)
     for tensor in first_op_inputs:
         tensor._attrs["dst_ops"].remove(first_op)
diff --git a/python/aitemplate/compiler/transform/transform_odd_alignment.py b/python/aitemplate/compiler/transform/transform_odd_alignment.py
index 0e9b9414d..c572b5e76 100644
--- a/python/aitemplate/compiler/transform/transform_odd_alignment.py
+++ b/python/aitemplate/compiler/transform/transform_odd_alignment.py
@@ -23,7 +23,7 @@
 from ..ops.gemm_universal import bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr
 from ..ops.tensor import permute021
 
-from .apply_padding import _get_padding_length
+from .apply_padding import get_padding_length
 from .fuse_utils import extract_only_one_op
 from .toposort import toposort
 from .transform_strided_ops import _is_supported_op as _is_supported_strided_op
@@ -63,7 +63,7 @@ def _compute_padding_flops(
     elif _is_strided_tensor(tensor):
         return (
             _matrix_shape_prod(shapes)
-            * _get_padding_length(shapes[padding_idx].value())
+            * get_padding_length(shapes[padding_idx].value(), tensor.dtype())
             / shapes[padding_idx].value()
         )
     else:
@@ -81,7 +81,9 @@ def _compute_slicing_flops(mm_op: Operator, slicing_dim: int, other_dim: int) ->
             can_be_fused = False
 
     if can_be_fused:
-        return other_dim * _get_padding_length(slicing_dim)
+        return other_dim * get_padding_length(
+            slicing_dim, mm_op._attrs["inputs"][0].dtype()
+        )
     else:
         return other_dim * slicing_dim
 
@@ -222,6 +224,12 @@ def _transform_odd_alignment(
         op_type = src_op._attrs["op"]
         if op_type not in permutable_pairs:
             continue
+        # FIXME: This pass only works for half type. We may need to change it to
+        # work with other types such as int8 later. Note that for float type, it
+        # is safe to skip, because gemm/bmm with float inputs always meet alignment
+        # requirements.
+        if src_op._attrs["inputs"][0].dtype() != "float16":
+            continue
 
         perm_type = ([False, False], [False, True], [True, False], [True, True])
         permute_input = [False, False]
diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index d1dc9fecf..3d2604582 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -16,7 +16,6 @@
 Perform transformations to fuse view ops with strided op by using TensorAccessor.
 """
 
-import logging
 from typing import List
 
 from aitemplate.compiler.base import Operator, Tensor
@@ -25,7 +24,6 @@
 from aitemplate.compiler.transform import transform_utils
 from aitemplate.utils import graph_utils
 
-logger = logging.getLogger(__name__)
 
 _VIEW_OPS = {"reshape", "flatten", "squeeze", "unsqueeze"}
 
@@ -37,7 +35,7 @@ def _is_supported_strided_op(op: Operator) -> bool:
     if Target.current().name() == "rocm":
         return op_kind == "bmm_softmax_bmm_permute"
     else:
-        return not op_kind.startswith(("group_gemm", "concatenate"))
+        return not op_kind.startswith("group_gemm")
 
 
 def _is_supported_view_op(op: Operator, tensor: Tensor) -> bool:
@@ -96,6 +94,13 @@ def _fuse_strided_op_and_view_op_single_pass(
         else:
             if tensor._attrs["is_output"]:
                 continue
+            # We have special handling for group_gemm + reshape + concat
+            # in transform_strided_ops, so we skip group_gemm at the moment.
+            # Otherwise, we would end up with shape mismatch due to fusing
+            # the view op. We may relax this constraint if we remove the special
+            # pass above.
+            if src_op is not None and src_op._attrs["op"].startswith("group_gemm"):
+                continue
             to_be_removed_dst_ops = set()
             for dst_op in tensor._attrs["dst_ops"]:
                 if (
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index c5dcd5652..2e05fee5c 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -16,7 +16,6 @@
 Perform transformations on ops which support strided inputs / outputs.
 """
 import functools
-import logging
 
 from typing import List
 
@@ -33,8 +32,6 @@
 
 # pylint: disable=W0612
 
-logger = logging.getLogger(__name__)
-
 
 def _fuse_slices_concat(sorted_graph: List[Tensor]) -> List[Tensor]:
     for tensor in sorted_graph:
@@ -132,7 +129,7 @@ def _group_gemm_cat_checker(
 
 def _is_bmm(op_type: str) -> bool:
     # TODO: support cutlass bmm ops
-    return op_type.startswith("bmm_rcr")
+    return op_type.startswith(("bmm_rcr", "bmm_crr"))
 
 
 def _bmm_checker(bmm_op: Operator, cat_op: Operator) -> bool:
@@ -172,10 +169,6 @@ def _is_layernorm(op_type: str) -> bool:
     return op_type.startswith("layernorm") or op_type.startswith("group_layernorm")
 
 
-def _layernorm_cat_checker(cat_op: Operator) -> bool:
-    return cat_op._attrs["concat_dim"] in [0, 1]
-
-
 def _is_reduce_op(op_type: str) -> bool:
     return op_type in {"reduce_sum", "reduce_mean", "var", "vector_norm"}
 
@@ -215,8 +208,6 @@ def _is_valid_for_fusion(strided_op: Operator, cat_op: Operator, out_idx: int):
         return _gemm_cat_checker(strided_op, cat_op)
     if _is_strided_group_gemm(strided_op):
         return _group_gemm_cat_checker(strided_op, cat_op, out_idx)
-    if _is_layernorm(op_type):
-        return _layernorm_cat_checker(cat_op)
     if _is_bmm(op_type):
         return _bmm_checker(strided_op, cat_op)
     if _is_perm102_bmm(op_type):
@@ -272,6 +263,8 @@ def _fuse_strided_op_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noq
             src_ops = list(cat_input.src_ops())
             if len(src_ops) != 1 or len(cat_input.dst_ops()) != 1:
                 continue
+            if cat_input._attrs["is_output"]:
+                continue
             strided_op = src_ops[0]
             if not _is_supported_op(strided_op):
                 continue
@@ -292,12 +285,15 @@ def _fuse_strided_op_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noq
 
             offset = 0
 
+            # cat's inputs may have been updated for cases like view_op + cat.
+            # So, we need to retrieve original shapes from its input accessors.
+            cat_input_accessors = cat_op._attrs["input_accessors"]
             # This pass must run before any other pass that remove cat inputs, like
             # _fuse_strided_op_reshape_cat
             for orig_i in range(idx):
-                input_tensor = cat_inputs[orig_i]
+                input_accessor = cat_input_accessors[orig_i]
                 # TODO: Add dynamic shape support.
-                offset += input_tensor._attrs["shape"][cat_dim].value()
+                offset += input_accessor.original_shapes[cat_dim].value()
 
             cat_inputs_to_remove.append(idx)
 
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops_utils.py b/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
index e04107649..cd040155c 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
@@ -20,7 +20,8 @@
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 def _dynamic_shape_checker(shape: List[IntVar], dim: int) -> bool:
@@ -98,7 +99,7 @@ def gemm_stride_checker(
     # TODO: Make this configurable for different gemms, bmms, etc.
     stride_strs = tmp_ta.try_get_stride_strs(get_stride_at_dim)
     if stride_strs is None:
-        logger.debug(
+        _LOGGER.debug(
             f"Failed in gemm_stride_checker: "
             f"dim: {dim}, "
             f"original_shapes length: {len(original_ta.original_shapes)}"
diff --git a/python/aitemplate/compiler/transform/transform_utils.py b/python/aitemplate/compiler/transform/transform_utils.py
index ca66bea8b..9a2b66fd4 100644
--- a/python/aitemplate/compiler/transform/transform_utils.py
+++ b/python/aitemplate/compiler/transform/transform_utils.py
@@ -16,18 +16,22 @@
 Util functions for graph transformations.
 """
 
+import logging
 from collections import deque
 from typing import Dict, List, Union
 
 from aitemplate.compiler.stable_set import StableSet
 
-from ...utils import graph_utils, logger
+from ...utils import graph_utils
 from ..base import Operator, Tensor
 from .mark_param_tensor import mark_param_tensor
 from .name_graph import name_graph
 from .remove_unused_ops import remove_unused_ops
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def check_graph_validity(sorted_graph: List[Tensor], raiseError: bool = False) -> bool:
     """
     Check whether all tensor/op in the AIT graph matches.
@@ -37,8 +41,8 @@ def check_graph_validity(sorted_graph: List[Tensor], raiseError: bool = False) -
 
     def handleError(msg: str):
         if raiseError:
-            logger.info(__file__, "check_graph_validity() error! Graph:")
-            logger.info(__file__, graph_utils.sorted_graph_debug_str(sorted_graph))
+            _LOGGER.info("check_graph_validity() error! Graph:")
+            _LOGGER.info(graph_utils.sorted_graph_debug_str(sorted_graph))
             raise RuntimeError(msg)
         else:
             return False
diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index bb067846e..0d9c00905 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -17,6 +17,7 @@
 from .embedding import BertEmbeddings, Embedding
 from .module import Module
 from .conv2d import *
+from .conv3d import *
 from .linear import *
 from .padding import *
 from .pool2d import *
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 2bc7a5917..c297538f6 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -352,11 +352,6 @@ def __init__(
         self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
         self.proj_drop = Dropout(proj_drop)
 
-    def qkv_proj(self, x):
-        batch, seq, hidden = self.get_shape(x)
-        x = ops.reshape()(x, [-1, hidden])
-        return self.qkv(x)
-
     def attention(self, q, k, v):
         seqlen = self.seqlen
         seqlen_kv = self.seqlen_kv
diff --git a/python/aitemplate/frontend/nn/conv3d.py b/python/aitemplate/frontend/nn/conv3d.py
new file mode 100644
index 000000000..f105c717e
--- /dev/null
+++ b/python/aitemplate/frontend/nn/conv3d.py
@@ -0,0 +1,121 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+conv3d Module.
+"""
+from ...compiler.ops import conv3d, depthwise_conv3d
+from .module import Module
+from .parameter import Parameter
+
+# pylint: disable=C0103
+
+
+class Conv3d(Module):
+    r"""Applies a 3D convolution over an input signal composed of several input
+    planes.
+
+    * :attr:`stride` controls the stride for the cross-correlation.
+
+    * :attr:`padding` controls the amount of padding applied to the input.
+
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the à trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or Tuple(int)): Size of the convolving kernel
+        stride (int or Tuple(int)): Stride of the convolution
+        padding (int or Tuple(int), optional): Padding added to all four sides of
+            the input. Default: 0
+        dilation (int or Tuple(int), optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        dtype (string, optional): Data type. Default: "float16"
+        bias (bool, optional): Has bias or not. Default: False (Note that we only support bias for depthwise_conv3d for now)
+
+    Shape:
+        - Input: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})`
+        - Output: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})`, where
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in}  + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out_channels}, \text{kernel_size}[0], \text{kernel_size}[1], \text{kernel_size}[2], `
+            :math:`\frac{\text{in_channels}}{\text{groups}})`.
+
+    Examples::
+
+        >>> m = nn.Conv3d(16, 33, 3, 2)
+        >>> input = Tensor(shape=[20, 50, 100, 100, 16])
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding=0,
+        dilation=1,
+        groups=1,
+        dtype="float16",
+        bias=False,
+    ):
+        super().__init__()
+
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size, kernel_size)
+        self.weight = Parameter(
+            shape=[out_channels, *kernel_size, in_channels // groups], dtype=dtype
+        )
+        if groups != 1 and bias:
+            self.bias = Parameter(shape=[out_channels], dtype=dtype)
+
+        if groups == 1:
+            if bias:
+                raise AttributeError(
+                    "conv3d with groups==1 does not support bias for now."
+                )
+            self.op = conv3d(stride=stride, pad=padding, dilate=dilation, group=groups)
+        else:
+            self.op = depthwise_conv3d(
+                stride=stride, pad=padding, dilate=dilation, group=groups, bias=bias
+            )
+
+    def forward(self, *args):
+        """Applies Conv3d on the input tensor."""
+        assert len(args) == 1
+        x = args[0]
+        return self.op(x, self.weight.tensor())
diff --git a/python/aitemplate/frontend/nn/dual_gemm.py b/python/aitemplate/frontend/nn/dual_gemm.py
index 1db963eab..2ddf59d30 100644
--- a/python/aitemplate/frontend/nn/dual_gemm.py
+++ b/python/aitemplate/frontend/nn/dual_gemm.py
@@ -59,9 +59,20 @@ def __init__(
         dtype="float16",
     ):
         super().__init__()
-        self.wi_0_weight = Parameter(shape=[out_channels, in_channels], dtype=dtype)
-        self.wi_1_weight = Parameter(shape=[out_channels, in_channels], dtype=dtype)
-        self.wo = Linear(out_channels, in_channels, bias=False)
+        self.wi_0_weight = Parameter(
+            shape=[out_channels, in_channels],
+            dtype=dtype,
+        )
+        self.wi_1_weight = Parameter(
+            shape=[out_channels, in_channels],
+            dtype=dtype,
+        )
+        self.wo = Linear(
+            out_channels,
+            in_channels,
+            bias=False,
+            dtype=dtype,
+        )
         self.op = ops.dual_gemm_rcr_fast_gelu()
 
     def forward(self, *args):
diff --git a/python/aitemplate/frontend/nn/proposal.py b/python/aitemplate/frontend/nn/proposal.py
index 999682915..59a590b6e 100644
--- a/python/aitemplate/frontend/nn/proposal.py
+++ b/python/aitemplate/frontend/nn/proposal.py
@@ -104,7 +104,13 @@ def generate_anchors(ratios=(0.5, 1, 2), scales=(8, 16, 32)):
 
 
 def generate_shifted_anchors(
-    im_h, im_w, feat_stride, scales, ratios, batch_size, dtype
+    im_h,
+    im_w,
+    feat_stride,
+    scales,
+    ratios,
+    batch_size,
+    dtype,
 ):
     """
     Enumerate all shifted anchors
@@ -134,13 +140,17 @@ def generate_shifted_anchors(
     return exp_anchors.astype(dtype)
 
 
-def gen_batch_inds(batch_size, rpn_post_nms_top_n):
+def gen_batch_inds(
+    batch_size,
+    rpn_post_nms_top_n,
+    dtype="float16",
+):
     if batch_size > 1:
         inds = np.arange(batch_size)
         batch_inds = np.repeat(inds.reshape(-1, 1), repeats=rpn_post_nms_top_n, axis=1)
-        return batch_inds.reshape(batch_size, rpn_post_nms_top_n, 1).astype("float16")
+        return batch_inds.reshape(batch_size, rpn_post_nms_top_n, 1).astype(dtype)
     else:
-        return np.zeros((batch_size, rpn_post_nms_top_n, 1)).astype("float16")
+        return np.zeros((batch_size, rpn_post_nms_top_n, 1)).astype(dtype)
 
 
 class Proposal(Module):
@@ -186,7 +196,11 @@ def __init__(
             self.batch_size,
             self.dtype,
         )
-        self._batch_inds = gen_batch_inds(batch_size, rpn_post_nms_top_n)
+        self._batch_inds = gen_batch_inds(
+            batch_size,
+            rpn_post_nms_top_n,
+            dtype=dtype,
+        )
 
     def forward(self, *args):
         assert len(args) >= 1
@@ -236,36 +250,58 @@ def box_transform(self, bbox_deltas, anchors):
         ctr_y = ops.elementwise(FuncEnum.ADD)(anchor_y1, height_mid)
 
         pred_ctr_x = ops.elementwise(FuncEnum.ADD)(
-            ops.elementwise(FuncEnum.MUL)(delta_x, widths), ctr_x
+            ops.elementwise(FuncEnum.MUL)(delta_x, widths),
+            ctr_x,
         )
         pred_ctr_y = ops.elementwise(FuncEnum.ADD)(
-            ops.elementwise(FuncEnum.MUL)(delta_y, heights), ctr_y
+            ops.elementwise(FuncEnum.MUL)(delta_y, heights),
+            ctr_y,
         )
         pred_w = ops.elementwise(FuncEnum.MUL)(
-            ops.elementwise(FuncEnum.EXP)(delta_w), widths
+            ops.elementwise(FuncEnum.EXP)(delta_w),
+            widths,
         )
         pred_h = ops.elementwise(FuncEnum.MUL)(
-            ops.elementwise(FuncEnum.EXP)(delta_h), heights
+            ops.elementwise(FuncEnum.EXP)(delta_h),
+            heights,
         )
 
         p_x1 = ops.elementwise(FuncEnum.SUB)(
-            pred_ctr_x, ops.elementwise(FuncEnum.MUL)(const_0_5, pred_w)
+            pred_ctr_x,
+            ops.elementwise(FuncEnum.MUL)(const_0_5, pred_w),
         )
         p_y1 = ops.elementwise(FuncEnum.SUB)(
-            pred_ctr_y, ops.elementwise(FuncEnum.MUL)(const_0_5, pred_h)
+            pred_ctr_y,
+            ops.elementwise(FuncEnum.MUL)(const_0_5, pred_h),
         )
         p_x2 = ops.elementwise(FuncEnum.ADD)(
-            pred_ctr_x, ops.elementwise(FuncEnum.MUL)(const_0_5, pred_w)
+            pred_ctr_x,
+            ops.elementwise(FuncEnum.MUL)(const_0_5, pred_w),
         )
         p_y2 = ops.elementwise(FuncEnum.ADD)(
-            pred_ctr_y, ops.elementwise(FuncEnum.MUL)(const_0_5, pred_h)
+            pred_ctr_y,
+            ops.elementwise(FuncEnum.MUL)(const_0_5, pred_h),
         )
 
         if self.clip_box:
-
-            x_min = Tensor(shape=[], dtype="float16", name="X_min", value=0)
-            x_max_h = Tensor(shape=[], dtype="float16", name="X_min_h", value=self.im_h)
-            x_max_w = Tensor(shape=[], dtype="float16", name="X_min_w", value=self.im_w)
+            x_min = Tensor(
+                shape=[],
+                dtype=self.dtype,
+                name="X_min",
+                value=0,
+            )
+            x_max_h = Tensor(
+                shape=[],
+                dtype=self.dtype,
+                name="X_min_h",
+                value=self.im_h,
+            )
+            x_max_w = Tensor(
+                shape=[],
+                dtype=self.dtype,
+                name="X_min_w",
+                value=self.im_w,
+            )
 
             f_x1 = ops.elementwise(FuncEnum.HARDTANH)(p_x1, x_min, x_max_w)
             f_y1 = ops.elementwise(FuncEnum.HARDTANH)(p_y1, x_min, x_max_h)
diff --git a/python/aitemplate/testing/benchmark_trt.py b/python/aitemplate/testing/benchmark_trt.py
new file mode 100644
index 000000000..2b7222ba0
--- /dev/null
+++ b/python/aitemplate/testing/benchmark_trt.py
@@ -0,0 +1,59 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+helper functions to benchmark fx-trt
+"""
+from aitemplate.testing.benchmark_pt import benchmark_torch_function  # usort:skip
+from torch_tensorrt.fx import lower
+from torch_tensorrt.fx.utils import LowerPrecision
+
+
+def make_trt_module(
+    function,
+    *inputs,
+    max_batch_size=256,
+    max_workspace_size=2 << 31,
+    dtype="float16",
+):
+    if dtype == "float16":
+        lower_precision = LowerPrecision.FP16
+    elif dtype == "float32":
+        lower_precision = LowerPrecision.FP32
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+    return lower.compile(
+        function,
+        inputs,
+        min_acc_module_size=1,
+        max_batch_size=max_batch_size,
+        max_workspace_size=max_workspace_size,
+        lower_precision=lower_precision,
+        verbose_log=True,
+        timing_cache_prefix=True,
+        save_timing_cache=True,
+        explicit_batch_dimension=True,
+        dynamic_batch=False,
+    )
+
+
+def benchmark_trt_function(iters: int, function, *args) -> float:
+    submod = make_trt_module(function, args)
+    submod(*args)
+    return benchmark_torch_function(
+        iters,
+        submod,
+        *args,
+    )
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 9e1867f44..e85a46217 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -15,14 +15,17 @@
 """
 Automatic detect target for testing
 """
+import logging
 import os
 from subprocess import PIPE, Popen
 
 from ..backend.target import CUDA, ROCM
-from ..utils import logger
 
 # pylint: disable=W0702, W0612,R1732
 
+
+_LOGGER = logging.getLogger(__name__)
+
 IS_CUDA = None
 FLAG = ""
 
@@ -36,6 +39,8 @@ def _detect_cuda():
         )
         stdout, stderr = proc.communicate()
         stdout = stdout.decode("utf-8")
+        if "H100" in stdout:
+            return "90"
         if "A100" in stdout or "RTX 30" in stdout or "A30" in stdout:
             return "80"
         if "V100" in stdout:
@@ -84,13 +89,13 @@ def detect_target(**kwargs):
         IS_CUDA = True
         FLAG = flag
 
-        logger.info(__name__, "Set target to CUDA")
+        _LOGGER.info("Set target to CUDA")
         return CUDA(arch=flag, **kwargs)
     flag = _detect_rocm()
     if flag is not None:
         IS_CUDA = False
         FLAG = flag
 
-        logger.info(__name__, "Set target to ROCM")
+        _LOGGER.info("Set target to ROCM")
         return ROCM(arch=flag, **kwargs)
     raise RuntimeError("Unsupported platform")
diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index b229541f3..643c948f7 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -15,38 +15,38 @@
 """
 Utils for unit tests.
 """
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import torch
 
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
 from aitemplate.utils.graph_utils import get_sorted_ops
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
-DTYPE_TO_TORCH_DTYPE: Dict[str, torch.dtype] = {
-    "float16": torch.half,
-    "float": torch.float,
-    "int": torch.int,
-}
+def _get_torch_tensor(torch_fn, shape, dtype):
+    dtype = normalize_dtype(dtype)
+    return torch_fn(shape, device="cuda", dtype=string_to_torch_dtype(dtype))
 
 
-def dtype_to_torch_dtype(dtype):
-    if dtype is None:
-        return None
-    torch_dtype = DTYPE_TO_TORCH_DTYPE.get(dtype)
-    if torch_dtype is None:
-        raise RuntimeError("Unsupported dtype: {}".format(dtype))
-    return torch_dtype
+def get_random_torch_tensor(shape, dtype="float16"):
+    return _get_torch_tensor(torch.randn, shape, dtype)
 
 
-def get_random_torch_tensor(shape, dtype):
-    if dtype == "float16":
-        return torch.randn(shape).cuda().half()
-    if dtype == "float":
-        return torch.randn(shape).cuda().float()
-    if dtype == "int":
-        return torch.randn(shape).cuda().int()
-    raise RuntimeError("unsupported dtype: {}".format(dtype))
+def get_torch_empty_tensor(shape, dtype="float16"):
+    return _get_torch_tensor(torch.empty, shape, dtype)
+
+
+def get_torch_zeros_tensor(shape, dtype="float16"):
+    return _get_torch_tensor(torch.zeros, shape, dtype)
+
+
+def get_torch_full_tensor(shape, fill_value, dtype="float16"):
+    dtype = normalize_dtype(dtype)
+    return torch.full(
+        shape, fill_value, device="cuda", dtype=string_to_torch_dtype(dtype)
+    )
 
 
 def has_op(sorted_ops: List[Operator], op_name: str) -> bool:
@@ -70,10 +70,12 @@ def count_ops(sorted_ops: List[Operator], op_name: str):
     return count
 
 
-def gen_input_tensor(shape: List[Any], name: str = None) -> Tensor:
+def gen_input_tensor(
+    shape: List[Any], dtype: str = "float16", name: Optional[str] = None
+) -> Tensor:
     tensor = Tensor(
         shape=shape,
-        dtype="float16",
+        dtype=dtype,
         name=name,
         is_input=True,
     )
diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index b41eabd98..44c1a6b98 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -18,8 +18,8 @@
 from . import (
     alignment,
     graph_utils,
-    logger,
     markdown_table,
+    misc,
     shape_utils,
     tensor_utils,
     torch_utils,
diff --git a/python/aitemplate/utils/alignment.py b/python/aitemplate/utils/alignment.py
index d171a8cb8..c5aad47a7 100644
--- a/python/aitemplate/utils/alignment.py
+++ b/python/aitemplate/utils/alignment.py
@@ -15,22 +15,49 @@
 """
 Util functions to handle alignment.
 """
-# Currently read4, add2 is best for both backend, so two backend seems identical.
-# They may diverge when we got deeper understanding / further optimization.
-ALIGNMENTS = [
-    8,
-    4,
-    2,
-    1,
-]
 
+from typing import List
 
-def find_max_alignment(number: int) -> int:
+from aitemplate.compiler.dtype import normalize_dtype
+
+
+# FIXME: These alignment constraints are for cutlass/ck. We should consider
+# to refine this part for other backends.
+def get_alignments(dtype: str) -> List[int]:
+    """
+    Return all of the valid alignment values for the dtype.
+    """
+    dtype = normalize_dtype(dtype)
+    if dtype in ("float16", "bfloat16"):
+        return [8, 4, 2, 1]
+    elif dtype in ("float", "float32"):
+        return [4, 2, 1]
+    else:
+        raise NotImplementedError(f"unsupported {dtype=} for alignments")
+
+
+def find_max_alignment(number: int, dtype: str) -> int:
     """
     Return the first alignment value that meets the alignment requirement
     for accessing the `number` of elements. This is dtype dependent.
     """
-    for alignment in ALIGNMENTS:
+    alignments = get_alignments(dtype)
+    for alignment in alignments:
         if number % alignment == 0:
             return alignment
     return 1
+
+
+def valid_alignment(align: int, dtype: str) -> bool:
+    """
+    Return True if the given align value is legitimate for the dtype.
+    """
+    dtype = normalize_dtype(dtype)
+    # 2-elem-alignment is required by fp16, because async.copy needs at least 32
+    # bits. For fp32 dtype values, 1-elem-alignment is valid.
+    if dtype in ("float16", "bfloat16"):
+        return align % 2 == 0
+    elif dtype in ("float", "float32"):
+        return True
+    else:
+        raise NotImplementedError(f"unsupported {dtype=} for valid_alignment")
diff --git a/python/aitemplate/utils/debug_settings.py b/python/aitemplate/utils/debug_settings.py
new file mode 100644
index 000000000..d614a0be6
--- /dev/null
+++ b/python/aitemplate/utils/debug_settings.py
@@ -0,0 +1,42 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Debug settings
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class AITDebugSettings:
+    """
+    This class contains the options for configuring debug settings
+    Arguments:
+    check_all_nan_and_inf : bool (default: False)
+        Whether or not to check this tensor is nan or inf during runtime.
+    check_all_outputs : bool (default: False)
+        Whether or not to print this tensor's value out during runtime.
+    gen_profiler_annotation : bool (default: False)
+        Whether or not to add profile annotation primitives when doing codegen.
+        (e.g. NVTX for CUDA and rocTX for AMD) Currently only supports NVIDIA.
+    dump_ait_to_py: str, optional
+        The path where the AIT graph is dumped into a .py file.
+    """
+
+    check_all_nan_and_inf: bool = False
+    check_all_outputs: bool = False
+    gen_profiler_annotation: bool = False
+    dump_ait_to_py: Optional[str] = None
diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index 600bf14f8..d3dcf6f52 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -12,10 +12,15 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
 from typing import Any, List
 
-from aitemplate.utils import logger
+from aitemplate.utils.misc import is_debug
+from aitemplate.utils.visualization import plot_graph
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 def get_sorted_ops(tensors) -> List[Any]:
@@ -63,14 +68,17 @@ def sorted_op_pseudo_code(ops, with_shape=True) -> str:
 
 
 def dump_graph_debug_str_to_file(tensors, workdir, name):
-    if logger.is_debug():
+    if is_debug():
         # Dump graph and pseudo code for debug only
         prefix = os.path.join(workdir, name)
         graph_path = prefix + "_graph.txt"
         pseudo_code_path = prefix + "_pseudo_code.txt"
+        graph_visual_path = prefix + "_graph_vis.html"
         with open(graph_path, "w") as f:
             f.write(sorted_graph_debug_str(tensors))
-            logger.debug(__file__, f"Dumped {name} graph to {graph_path}")
+            _LOGGER.debug(f"Dumped {name} graph to {graph_path}")
         with open(pseudo_code_path, "w") as f:
             f.write(sorted_graph_pseudo_code(tensors))
-            logger.debug(__file__, f"Dumped {name} pseudo code to {pseudo_code_path}")
+            _LOGGER.debug(f"Dumped {name} pseudo code to {pseudo_code_path}")
+        plot_graph(tensors, graph_visual_path)
+        _LOGGER.debug(f"Dumped {name} visualization to {graph_visual_path}")
diff --git a/python/aitemplate/utils/misc.py b/python/aitemplate/utils/misc.py
new file mode 100644
index 000000000..a1b52babf
--- /dev/null
+++ b/python/aitemplate/utils/misc.py
@@ -0,0 +1,43 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+miscellaneous utilities
+"""
+import logging
+import os
+
+
+def is_debug():
+    logger = logging.getLogger("aitemplate")
+    return logger.level == logging.DEBUG
+
+
+def setup_logger(name):
+    root_logger = logging.getLogger(name)
+    info_handle = logging.StreamHandler()
+    formatter = logging.Formatter("%(asctime)s %(levelname)s <%(name)s> %(message)s")
+    info_handle.setFormatter(formatter)
+    root_logger.addHandler(info_handle)
+    root_logger.propagate = False
+
+    DEFAULT_LOGLEVEL = logging.getLogger().level
+    log_level_str = os.environ.get("LOGLEVEL", None)
+    LOG_LEVEL = (
+        getattr(logging, log_level_str.upper())
+        if log_level_str is not None
+        else DEFAULT_LOGLEVEL
+    )
+    root_logger.setLevel(LOG_LEVEL)
+    return root_logger
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index c65f6c4dc..5a59cc185 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -2015,6 +2015,8 @@ def CreateGroupNormOperator(manifest, rank=5):
         groupnorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8),
         groupnorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8),
         groupnorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8),
+        groupnorm.TileDesc(1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8),
+        groupnorm.TileDesc(1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2),
     ]
 
     operations = []
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index 9aa3aade9..54c1a4d9d 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -26,13 +26,14 @@ class EpilogueFunctor(enum.Enum):
   LinearCombinationSigmoid = enum_auto()
   LinearCombinationTanh = enum_auto()
   LinearCombinationResidualBlock = enum_auto()
-  LinearCombinationResidualBlockV2 = enum_auto()
   LinearCombinationHardSwish = enum_auto()
   LinearCombinationGELU = enum_auto()
   LinearCombinationFastGELU = enum_auto()
   LinearCombinationSilu = enum_auto()
+  LinearCombinationELUp1 = enum_auto()
   LeftSiLUAndMul = enum_auto()
   LeftFastGeluAndMul = enum_auto()
+  Div = enum_auto()
 
 EpilogueFunctorTag = {
   EpilogueFunctor.LinearCombination:
@@ -47,8 +48,6 @@ class EpilogueFunctor(enum.Enum):
     'cutlass::epilogue::thread::LinearCombinationTanh',
   EpilogueFunctor.LinearCombinationResidualBlock:
     'cutlass::epilogue::thread::LinearCombinationResidualBlock',
-  EpilogueFunctor.LinearCombinationResidualBlockV2:
-    'cutlass::epilogue::thread::LinearCombinationResidualBlockV2',
   EpilogueFunctor.LinearCombinationHardSwish:
     'cutlass::epilogue::thread::LinearCombinationHardSwish',
   EpilogueFunctor.LinearCombinationGELU:
@@ -57,10 +56,14 @@ class EpilogueFunctor(enum.Enum):
     'cutlass::epilogue::thread::LinearCombinationFastGELU',
   EpilogueFunctor.LinearCombinationSilu:
     'cutlass::epilogue::thread::LinearCombinationSilu',
+  EpilogueFunctor.LinearCombinationELUp1:
+    'cutlass::epilogue::thread::LinearCombinationELUp1',
   EpilogueFunctor.LeftSiLUAndMul:
     'cutlass::epilogue::thread::LeftSiLUAndMul',
   EpilogueFunctor.LeftFastGeluAndMul:
     'cutlass::epilogue::thread::LeftFastGeluAndMul',
+  EpilogueFunctor.Div:
+    'cutlass::epilogue::thread::Div',
 }
 
 EpilogueFunctorName = {
@@ -70,13 +73,14 @@ class EpilogueFunctor(enum.Enum):
   "LinearCombinationSigmoid": EpilogueFunctor.LinearCombinationSigmoid,
   "LinearCombinationTanh": EpilogueFunctor.LinearCombinationTanh,
   "LinearCombinationResidualBlock": EpilogueFunctor.LinearCombinationResidualBlock,
-  "LinearCombinationResidualBlockV2": EpilogueFunctor.LinearCombinationResidualBlockV2,
   "LinearCombinationHardSwish": EpilogueFunctor.LinearCombinationHardSwish,
   "LinearCombinationGELU": EpilogueFunctor.LinearCombinationGELU,
   "LinearCombinationFastGELU": EpilogueFunctor.LinearCombinationFastGELU,
   "LinearCombinationSilu": EpilogueFunctor.LinearCombinationSilu,
+  "LinearCombinationELUp1": EpilogueFunctor.LinearCombinationELUp1,
   "LeftSiLUAndMul": EpilogueFunctor.LeftSiLUAndMul,
   "LeftFastGeluAndMul": EpilogueFunctor.LeftFastGeluAndMul,
+  "Div": EpilogueFunctor.Div,
 }
 
 class EpilogueMath(enum.Enum):
@@ -89,6 +93,7 @@ class EpilogueMath(enum.Enum):
   Gelu = enum_auto()
   FastGelu = enum_auto()
   Silu = enum_auto()
+  ELUp1 = enum_auto()
 
 
 EpilogueMathTag = {
@@ -100,7 +105,8 @@ class EpilogueMath(enum.Enum):
   EpilogueMath.Plus: 'cutlass::plus',
   EpilogueMath.Gelu: 'GELU',
   EpilogueMath.FastGelu: 'GELU_taylor',
-  EpilogueMath.FastGelu: 'cutlass::epilogue::thread::Silu'
+  EpilogueMath.Silu: 'cutlass::epilogue::thread::Silu',
+  EpilogueMath.ELUp1: 'cutlass::epilogue::thread::ELUp1',
 }
 
 EpilogueMathName = {
@@ -113,7 +119,8 @@ class EpilogueMath(enum.Enum):
   "Add": EpilogueMath.Plus,
   "Gelu": EpilogueMath.Gelu,
   "FastGelu": EpilogueMath.FastGelu,
-  "Silu": EpilogueMath.Silu
+  "Silu": EpilogueMath.Silu,
+  "ELUp1": EpilogueMath.ELUp1
 }
 
 class EpiloguePermuteLayout(enum.Enum):
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py b/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
index 5a428bcbe..93ce36765 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py
@@ -242,9 +242,13 @@ def emit(self, operation):
     """
     using ${operation_name}_base =
     cutlass::gemm::device::DualGemm<
-        ${element_a}, ${layout_a},
-        ${element_b}, ${layout_b},
-        ${element_c}, ${layout_c},
+        ${element_a},
+        ${layout_a},
+        ${element_b},
+        ${layout_b0},
+        ${layout_b1},
+        ${element_c},
+        ${layout_c},
         ${element_accumulator},
         ${opcode_class},
         ${arch},
@@ -298,7 +302,7 @@ def __init__(self, operation_suffix = ''):
       """
 
 
-  def emit(self, operation):
+  def emit(self, operation, broadcast_b1=False):
 
     threadblock_shape = operation.tile_description.threadblock_shape
     warp_count = operation.tile_description.warp_count
@@ -310,9 +314,10 @@ def emit(self, operation):
       LayoutType.RowMajor: LayoutType.ColumnMajor
     }
 
-    instance_layout_A, instance_layout_B, instance_layout_C = \
+    instance_layout_A, instance_layout_B0, instance_layout_C = \
       (operation.A.layout, operation.B.layout, operation.C.layout)
-    #
+    # B1 is broadcasted in column-major with zero stride (the latter set in the Arguments)
+    instance_layout_B1 = LayoutType.ColumnMajor if broadcast_b1 else instance_layout_B0
 
     # Support built-in epilogue functors or user-defined functions
     if isinstance(operation.epilogue_functor, enum.Enum):
@@ -340,7 +345,8 @@ def emit(self, operation):
       'element_a': DataTypeTag[operation.A.element],
       'layout_a': LayoutTag[instance_layout_A],
       'element_b': DataTypeTag[operation.B.element],
-      'layout_b': LayoutTag[instance_layout_B],
+      'layout_b0': LayoutTag[instance_layout_B0],
+      'layout_b1': LayoutTag[instance_layout_B1],
       'element_c': DataTypeTag[operation.C.element],
       'layout_c': LayoutTag[instance_layout_C],
       'element_accumulator': DataTypeTag[operation.accumulator_type()],
diff --git a/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py b/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
index 2791fe05c..373610a7a 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
@@ -65,6 +65,8 @@ def process_code(src_path, dst_path, code_set):
 
     src_prefix = os.path.join(template_path, "tools/library/scripts")
     srcs = os.listdir(src_prefix)
+    if "__init__.py" in srcs:
+        srcs.remove("__init__.py")
     for file in srcs:
         src_path = os.path.join(src_prefix, file)
         if not os.path.isfile(src_path):
diff --git a/python/aitemplate/utils/serialization/ait_program.py b/python/aitemplate/utils/serialization/ait_program.py
index 4e4f62376..2ccb0addc 100644
--- a/python/aitemplate/utils/serialization/ait_program.py
+++ b/python/aitemplate/utils/serialization/ait_program.py
@@ -65,21 +65,21 @@ def set_constants(self, constants: Dict[str, Any]):
         for k, v in constants.items():
             getattr(self, k)._bind_data(convert_to_ait_const(v))
 
-    def set_default_constants(self):
+    def set_default_constants(self, dtype="float16"):
         """
         This function is called to set up default constants
         (ex. constant folded/constants set up by zero padding etc.).
         """
-        self.set_all_random_constants()
+        self.set_all_random_constants(dtype)
 
-    def set_all_random_constants(self):
+    def set_all_random_constants(self, dtype="float16"):
         """
         This function would set all constants into random value.
         """
         const_infos = self.get_constants()
         for k, v in const_infos.items():
             getattr(self, k)._bind_data(
-                _NumpyConstantTensorData(np.random.randn(*v).astype("float16"))
+                _NumpyConstantTensorData(np.random.randn(*v).astype(dtype))
             )
 
     def model(self) -> Union[Tensor, Tuple[Tensor]]:
diff --git a/python/aitemplate/utils/serialization/serdes_code.py b/python/aitemplate/utils/serialization/serdes_code.py
index 1f7c98b44..c263bca9f 100644
--- a/python/aitemplate/utils/serialization/serdes_code.py
+++ b/python/aitemplate/utils/serialization/serdes_code.py
@@ -211,6 +211,12 @@ def _retrieve_op_info(op: Operator, params_set) -> Tuple[List, Dict]:
         # dynamic slice provides start/end indices as inputs
         op_inputs.append(str(op._attrs["start_indices"]))
         op_inputs.append(str(op._attrs["end_indices"]))
+    elif op._attrs["op"] == "permute":
+        # permute takes permuted dimensions as input,
+        # but can forward to static shape permute ops
+        # that don't (e.g., permute021 or permute102)
+        if "dims" in op._attrs:
+            op_inputs.append(str(op._attrs["dims"]))
 
     return op_inputs, op_attrs
 
diff --git a/python/aitemplate/utils/torch_utils.py b/python/aitemplate/utils/torch_utils.py
index 266b3a279..078d40557 100644
--- a/python/aitemplate/utils/torch_utils.py
+++ b/python/aitemplate/utils/torch_utils.py
@@ -22,18 +22,37 @@
 """
 
 
+def types_mapping():
+    from torch import bfloat16, bool, float16, float32, int32, int64
+
+    yield (float16, "float16")
+    yield (bfloat16, "bfloat16")
+    yield (float32, "float32")
+    yield (int32, "int32")
+    yield (int64, "int64")
+    yield (bool, "bool")
+
+
 def torch_dtype_to_string(dtype):
-    import torch
-
-    dtype_to_str = {
-        torch.float16: "float16",
-        torch.float32: "float32",
-        torch.int32: "int32",
-        torch.int64: "int64",
-        torch.bool: "bool",
-    }
-    if dtype not in dtype_to_str:
-        raise ValueError(
-            f"Got unsupported input dtype {dtype}! Supported dtypes are: {list(dtype_to_str.keys())}"
-        )
-    return dtype_to_str[dtype]
+    for (torch_dtype, ait_dtype) in types_mapping():
+        if dtype == torch_dtype:
+            return ait_dtype
+    raise ValueError(
+        f"Got unsupported input dtype {dtype}! "
+        f"Supported dtypes are: {list(types_mapping())}"
+    )
+
+
+def string_to_torch_dtype(string_dtype):
+    if string_dtype is None:
+        # Many torch functions take optional dtypes, so
+        # handling None is useful here.
+        return None
+
+    for (torch_dtype, ait_dtype) in types_mapping():
+        if string_dtype == ait_dtype:
+            return torch_dtype
+    raise ValueError(
+        f"Got unsupported ait dtype {string_dtype}! "
+        f"Supported dtypes are: {list(types_mapping())}"
+    )
diff --git a/python/aitemplate/utils/visualization/op_attr_factory.py b/python/aitemplate/utils/visualization/op_attr_factory.py
index 6049a1151..53505259c 100644
--- a/python/aitemplate/utils/visualization/op_attr_factory.py
+++ b/python/aitemplate/utils/visualization/op_attr_factory.py
@@ -18,4 +18,8 @@ def op_to_content(op):
     # TODO (XXX): Add op specialized attrs here, like gemm/conv
     content = {}
     content["op_type"] = op._attrs["op"]
+    if op._attrs["op"] == "fused_elementwise":
+        content["func"] = ", ".join(
+            [str(x._attrs["func"]) for x in op._attrs["elementwise_ops"]]
+        )
     return content
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 4ae0d5983..cd55eb081 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -20,6 +20,7 @@
 
 from aitemplate import compiler
 from aitemplate.utils.visualization import op_attr_factory, pydot
+from aitemplate.utils.visualization.op_attr_factory import op_to_content
 from aitemplate.utils.visualization.web_template import (
     INDEX_TEMPLATE,
     MODAL_TEMPLATE,
@@ -161,7 +162,9 @@ def plot_graph(tensors, file_path: str) -> None:
                 dot_graph.add_node(op_node)
                 modal_set.append(_gen_op_modal(src_op))
                 items.append(op_name)
-                popover_data[op_name] = "op: " + src_op._attrs["op"]
+                popover_data[op_name] = ", ".join(
+                    [f"{x}: {y}" for x, y in op_to_content(src_op).items()]
+                )
             dot_graph.add_edge(pydot.Edge(op_node, tensor_node))
 
         for dst_op in tensor.dst_ops():
@@ -179,7 +182,9 @@ def plot_graph(tensors, file_path: str) -> None:
                 op_set[dst_op] = op_node
                 dot_graph.add_node(op_node)
                 items.append(op_name)
-                popover_data[op_name] = "op: " + dst_op._attrs["op"]
+                popover_data[op_name] = ", ".join(
+                    [f"{x}: {y}" for x, y in op_to_content(dst_op).items()]
+                )
                 # add modal
                 modal_set.append(_gen_op_modal(dst_op))
             dot_graph.add_edge(pydot.Edge(tensor_node, op_node))
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 3bfaa338e..3fc3bc7dc 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -21,8 +21,6 @@ namespace ait {
 
 ModelContainer::ModelContainer(
     size_t num_models,
-    size_t blob_size,
-    size_t workspace_size,
     size_t num_inputs,
     size_t num_outputs,
     size_t num_unbound_constants,
@@ -40,19 +38,21 @@ ModelContainer::ModelContainer(
   if (num_models == 0) {
     throw std::runtime_error("Number of models must be positive");
   }
+  dmlc::InitLogging("aitemplate"); // TODO(xxx): render network name
+  int runtime_version;
+  int driver_version;
+  DEVICE_CHECK(GetDriverVersion(&driver_version));
+  DEVICE_CHECK(GetRuntimeVersion(&runtime_version));
+  LOG(INFO) << "Device Runtime Version: " << runtime_version
+            << "; Driver Version: " << driver_version;
+  LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency";
   models_.reserve(num_models);
   available_models_.reserve(num_models);
 
   for (size_t i = 0; i < num_models; ++i) {
-    models_.emplace_back(
-        blob_size,
-        workspace_size,
-        num_inputs,
-        num_outputs,
-        num_unbound_constants,
-        static_cast<uint8_t*>(constants_.get()),
-        allocator);
-    available_models_.push_back(&models_.back());
+    models_.push_back(
+        Model::Create(allocator, static_cast<uint8_t*>(constants_.get())));
+    available_models_.push_back(models_.back().get());
   }
 }
 
@@ -92,6 +92,34 @@ void ModelContainer::Run(
   }
 }
 
+void ModelContainer::Profile(
+    const AITData* inputs,
+    size_t num_inputs,
+    AITData* outputs,
+    size_t num_outputs,
+    StreamType stream,
+    size_t num_iters,
+    const char* filename) {
+  auto* model = GetAvailableModel();
+  if (filename == nullptr) {
+    throw;
+  }
+  try {
+    PrepareForRun(model, inputs, num_inputs, outputs, num_outputs);
+    model->Profile(stream, num_iters, filename);
+  } catch (...) {
+    std::lock_guard lk(models_mutex_);
+    available_models_.push_back(model);
+    throw;
+  }
+
+  {
+    std::lock_guard lk(models_mutex_);
+    pending_models_.push_back(model);
+  }
+  pending_models_available_.notify_one();
+}
+
 void ModelContainer::RunWithOutputsOnHost(
     const AITData* inputs,
     size_t num_inputs,
@@ -279,7 +307,7 @@ void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
   if (expected_num_bytes != actual_num_bytes) {
     throw std::runtime_error(
         std::string(
-            "SetConstant did not recieve correct number of bytes for constant ") +
+            "SetConstant did not receive correct number of bytes for constant ") +
         name + ": expected " + std::to_string(expected_num_bytes) +
         " but got " + std::to_string(actual_num_bytes) +
         ". Check that the provided tensor's shape is correct.");
@@ -287,7 +315,7 @@ void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
 
   auto* src = tensor.ptr;
   for (auto& model : models_) {
-    model.SetConstant(name, src);
+    model->SetConstant(name, src);
   }
 }
 
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index f980e1644..6501eaa8e 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -162,6 +162,25 @@ AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
   })
 }
 
+AITemplateError AITemplateModelContainerProfile(
+    AITemplateModelHandle handle,
+    const AITData* inputs,
+    size_t num_inputs,
+    AITData* outputs,
+    size_t num_outputs,
+    AITemplateStreamHandle stream_handle,
+    size_t num_iters,
+    const char* filename) {
+  RETURN_ERROR_IF_NULL(handle);
+  RETURN_ERROR_IF_NULL(filename);
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    m->Profile(
+        inputs, num_inputs, outputs, num_outputs, stream, num_iters, filename);
+  })
+}
+
 AITemplateError AITemplateModelContainerBenchmark(
     AITemplateModelHandle handle,
     const AITData* inputs,
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index bb377a8bd..57f309bbc 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -23,8 +23,12 @@
 #include "cutlass/util/host_tensor.h"
 #include "cutlass/util/reference/host/tensor_fill.h"
 
+#include <nvtx3/nvToolsExt.h>
+
 namespace ait {
 
+inline thread_local bool target_has_graph_mode = true;
+
 using DeviceError = cudaError_t;
 using DevicePropertyType = cudaDeviceProp;
 using StreamType = cudaStream_t;
@@ -182,4 +186,20 @@ inline DeviceError GetDeviceNotReady() {
   return cudaErrorNotReady;
 }
 
+inline DeviceError GetDriverVersion(int* driverVersion) {
+  return cudaDriverGetVersion(driverVersion);
+}
+
+inline DeviceError GetRuntimeVersion(int* runtimeVersion) {
+  return cudaRuntimeGetVersion(runtimeVersion);
+}
+
+inline void ProfilerRangePush(const char* msg) {
+  nvtxRangePushA(msg);
+}
+
+inline void ProfilerRangePop() {
+  nvtxRangePop();
+}
+
 } // namespace ait
diff --git a/static/include/macros.h b/static/include/macros.h
index 84bf47155..59fcde94b 100644
--- a/static/include/macros.h
+++ b/static/include/macros.h
@@ -27,3 +27,10 @@
   }
 
 #define LAUNCH_CHECK() DEVICE_CHECK(GetLastError())
+
+#define CHECK_VECTOR_ACCESS(vector, idx)                                 \
+  if (idx >= vector.size()) {                                            \
+    throw std::out_of_range(                                             \
+        "[__func__]: index out of range, " #vector ".size()=" +          \
+        std::to_string(vector.size()) + ", got " + std::to_string(idx)); \
+  }
diff --git a/static/include/model.h b/static/include/model.h
new file mode 100644
index 000000000..b2fcde66e
--- /dev/null
+++ b/static/include/model.h
@@ -0,0 +1,304 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#pragma once
+
+namespace ait {
+
+// This serves as a base class for AIT runtime objects, e.g. the compiled
+// model and the constant folder. It uses CRTP as a mechanism to call into
+// a few base class methods (dynamic dispatch is not needed in ModelContainer,
+// so there's no need to add a vtable). Inheriting classes should implement
+// the following methods:
+// - RunImpl(StreamType):    The bulk of the compiled model's kernel invocations
+//                           go here.
+// - SetUpInputsOutputs():   Check the provided input/output pointers dtypes &
+//                           sizes
+// - DeviceToDeviceCopies(): Called at the end of infernece, copy views of
+//                           inputs/constants to the provided output pointer.
+//
+// In practice, inheriting classes are generated via MODEL_TEMPLATE in
+// python/aitemplate/backend/main_templates.py.
+template <typename ModelType>
+class ModelBase {
+ protected:
+  // Should not be constructed directly, use the base class' factory function
+  // instead.
+  ModelBase(
+      size_t blob_size,
+      size_t workspace_size,
+      size_t unique_workspace_size,
+      size_t num_inputs,
+      size_t num_outputs,
+      size_t num_unbound_constants,
+      uint8_t* constants,
+      AITemplateAllocator& allocator)
+      : blob_(RAII_DeviceMalloc(blob_size, allocator)),
+        workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
+        params_(num_inputs + num_outputs + num_unbound_constants),
+        num_inputs_(num_inputs),
+        num_outputs_(num_outputs),
+        constants_(constants) {
+    global_workspace_ =
+        static_cast<uint8_t*>(workspace_.get()) + unique_workspace_size;
+    unique_workspace_ = static_cast<uint8_t*>(workspace_.get());
+    DEVICE_CHECK(GetDevice(&device_idx_))
+    DEVICE_CHECK(CreateEvent(&run_finished_));
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+    DEVICE_CHECK(cudaDeviceGetAttribute(
+        &max_smem_size_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx_));
+#endif
+    DEVICE_CHECK(GetDeviceProperties(&device_properties_, device_idx_));
+    DEVICE_CHECK(StreamCreate(&graph_capture_stream_, /*non_blocking=*/true));
+  }
+
+ public:
+  ~ModelBase() {
+    if (run_finished_ != nullptr) {
+      DestroyEvent(run_finished_);
+    }
+    if (graph_capture_stream_ != nullptr) {
+      StreamDestroy(graph_capture_stream_);
+    }
+    if (graph_exec_ != nullptr) {
+      GraphExecDestroy(graph_exec_);
+    }
+  }
+
+  ModelBase(ModelBase&&) = delete;
+  ModelBase& operator=(ModelBase&&) = delete;
+  ModelBase(const ModelBase&) = delete;
+  ModelBase& operator=(const ModelBase&) = delete;
+
+  void Run(StreamType stream, bool graph_mode) {
+    auto* model = static_cast<ModelType*>(this);
+    model->SetUpInputsOutputs();
+    if (target_has_graph_mode && graph_mode) {
+      RunAsGraph(stream);
+    } else {
+      model->RunImpl(stream);
+    }
+    model->DeviceToDeviceCopies(stream);
+    DEVICE_CHECK(EventRecord(run_finished_, stream));
+  }
+
+  void Profile(StreamType stream, size_t iters, const std::string& filename) {
+    auto* model = static_cast<ModelType*>(this);
+    model->SetUpInputsOutputs();
+    model->ProfileImpl(stream, iters, filename);
+  }
+
+  bool IsPending() {
+    auto query = QueryEvent(run_finished_);
+    if (query == GetDeviceNotReady()) {
+      return true;
+    }
+    if (query != GetDeviceSuccess()) {
+      LOG(WARNING) << "Pending model run did not finish successfully. Error: "
+                   << GetErrorString(query);
+    }
+    return false;
+  }
+
+  void WaitForCompletion() {
+    DEVICE_CHECK(EventSynchronize(run_finished_));
+  }
+
+  size_t NumInputs() const {
+    return num_inputs_;
+  }
+
+  size_t NumOutputs() const {
+    return num_outputs_;
+  }
+
+  void SetParam(const void* src, size_t param_idx) {
+    CHECK_VECTOR_ACCESS(params_, param_idx)
+    // const_cast is not ideal here, but it is unfortunately
+    // necessary:
+    // 1) We store outputs and inputs in the same vector,
+    //    and outputs cannot be const.
+    // 2) Most of the codegen is not const-correct (most ops
+    //    require non-const pointers). So even if we put const
+    //    pointers into params, a const_cast would be required
+    //    somewhere else.
+    params_[param_idx].ptr = const_cast<void*>(src);
+  }
+
+  void SetInput(
+      const void* src,
+      const AITemplateParamShape& shape,
+      size_t idx) {
+    SetInputShape(shape, idx);
+    SetParam(src, idx);
+  }
+
+  void SetOutput(void* src, size_t idx) {
+    SetParam(src, idx + num_inputs_);
+  }
+
+  // Write the (possibly dynamic) output shape to the given pointer.
+  // Note that this should be called _after_ the shape inference in
+  // Run() is finished. output_shape_out should be able to store
+  // at least GetOutputMaximumShape(idx).size values.
+  void GetOutputShape(size_t idx, int64_t* output_shape_out) {
+    const auto param_idx = idx + num_inputs_;
+    CHECK_VECTOR_ACCESS(params_, param_idx);
+    const auto& shape_ptrs = params_[param_idx].shape_ptrs;
+    for (size_t i = 0; i < shape_ptrs.size(); ++i) {
+      output_shape_out[i] = shape_ptrs[i].GetValue();
+    }
+  }
+
+  void SetConstant(const char* name, const void* src) {
+    auto it = constant_name_to_ptr_.find(name);
+    if (it == constant_name_to_ptr_.end()) {
+      throw std::out_of_range(std::string("Could not find constant ") + name);
+    }
+    const void** ptr = it->second;
+    *ptr = src;
+  }
+
+ private:
+  void SetInputShape(const AITemplateParamShape& shape, size_t idx) {
+    auto& param = params_[idx];
+    if (shape.size != param.shape_ptrs.size()) {
+      throw std::runtime_error(
+          "[SetInputShape] Got wrong param shape for input " +
+          std::to_string(idx) + "; expected " +
+          std::to_string(param.shape_ptrs.size()) + ", got " +
+          std::to_string(shape.size));
+    }
+    for (size_t i = 0; i < param.shape_ptrs.size(); ++i) {
+      param.shape_ptrs[i].SetValue(shape.shape_data[i]);
+    }
+  }
+
+  DeviceError EndCapture(GraphType* graph_ptr) {
+    auto err = StreamEndCapture(graph_capture_stream_, graph_ptr);
+    if (err != GetDeviceSuccess()) {
+      // If we can't take the stream out of capture mode, something is probably
+      // wrong with CUDA graph for this model (e.g. there might have been an
+      // illegal capture mode operation). Disable graph mode to avoid such
+      // issues in future iterations.
+      target_has_graph_mode = false;
+      LOG(WARNING) << "Graph capture failed to end. Disabling graph mode.";
+      return err;
+    }
+    return GetDeviceSuccess();
+  }
+
+  void RunAsGraph(StreamType stream) {
+    DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
+    try {
+      static_cast<ModelType*>(this)->RunImpl(graph_capture_stream_);
+    } catch (...) {
+      GraphType graph;
+      // No need to DEVICE_CHECK here, we want to see the original exception.
+      EndCapture(&graph);
+      if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
+        LOG(WARNING)
+            << "Graph destruction failed while handling exception! Memory will be leaked.";
+      }
+      throw;
+    }
+
+    // The following function ends the capture and creates a graph
+    // inside a unique_ptr that cleans up it when it goes out of scope.
+    // Note that it throws an exception if EndCapture fails.
+    auto graph = RAII_EndCaptureAndCreateGraph(
+        [this](GraphType* graph_ptr) { return EndCapture(graph_ptr); });
+
+    if (graph_exec_ == nullptr) {
+      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+    } else if (
+        GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
+      // Consume the last cuda error, which may affect the next GraphExecLaunch
+      // call.
+      GetLastError();
+      DEVICE_CHECK(GraphExecDestroy(graph_exec_));
+      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+    }
+
+    DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
+  }
+
+ protected:
+  int device_idx_;
+  int max_smem_size_{0};
+  DevicePropertyType device_properties_;
+  // This event tracks when the inference is finished
+  // so that this Model may be reclaimed by its owning
+  // ModelContainer.
+  EventType run_finished_;
+  // A blob of memory used for storing intermediate tensors.
+  GPUPtr blob_;
+  // Memory for constants that were folded into the *.so. Unowned by Model,
+  // owned by ModelContainer.
+  // TODO: make this const. It can't be const right now because we derive
+  // tensor pointers from it, and no tensor pointers are const.
+  uint8_t* constants_;
+  size_t num_inputs_;
+  size_t num_outputs_;
+
+  // The workspace blob is used as scratch memory. See
+  // _generate_workspace in memory planning for more information.
+  GPUPtr workspace_;
+  uint8_t* global_workspace_{nullptr};
+  uint8_t* unique_workspace_{nullptr};
+
+  class ParamDim {
+   public:
+    ParamDim(int64_t lower_bound, int64_t upper_bound, int64_t* value)
+        : lower_bound_(lower_bound), upper_bound_(upper_bound), value_(value) {}
+
+    void SetValue(int64_t new_value) {
+      if (new_value < lower_bound_ || new_value > upper_bound_) {
+        throw std::out_of_range(
+            "[SetValue] Dimension got value out of bounds; expected value to be in [" +
+            std::to_string(lower_bound_) + ", " + std::to_string(upper_bound_) +
+            "], but got " + std::to_string(new_value));
+      }
+      *value_ = new_value;
+    }
+
+    int64_t GetValue() const {
+      return *value_;
+    }
+
+   private:
+    int64_t lower_bound_;
+    int64_t upper_bound_;
+    int64_t* value_;
+  };
+
+  struct ParamInfo {
+    void* ptr = nullptr;
+    // TODO add offset
+    const char* name;
+    std::vector<ParamDim> shape_ptrs;
+  };
+
+  // Contains info for all tensors marked as inputs
+  // or outputs. The first num_inputs elements are the inputs.
+  // Constants are not included.
+  std::vector<ParamInfo> params_;
+
+  GraphExecType graph_exec_ = nullptr;
+  StreamType graph_capture_stream_;
+
+  std::unordered_map<std::string, const void**> constant_name_to_ptr_;
+};
+
+} // namespace ait
diff --git a/static/include/model_container.h b/static/include/model_container.h
index 72ea5d6f8..4a1ba9c55 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -108,8 +108,6 @@ class ModelContainer : ModelContainerBase {
  public:
   ModelContainer(
       size_t num_models,
-      size_t blob_size,
-      size_t workspace_size,
       size_t num_inputs,
       size_t num_outputs,
       size_t num_unbound_constants,
@@ -135,6 +133,15 @@ class ModelContainer : ModelContainerBase {
       bool graph_mode,
       int64_t** output_shapes_out);
 
+  void Profile(
+      const AITData* inputs,
+      size_t num_inputs,
+      AITData* outputs,
+      size_t num_outputs,
+      StreamType stream,
+      size_t num_iters,
+      const char* filename);
+
   float Benchmark(
       const AITData* inputs,
       size_t num_inputs,
@@ -187,7 +194,7 @@ class ModelContainer : ModelContainerBase {
 
   AITemplateAllocator& allocator_;
 
-  std::vector<Model> models_;
+  std::vector<std::unique_ptr<Model>> models_;
   std::vector<Model*> available_models_;
   std::deque<Model*> pending_models_;
 
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 2ec362535..198f90534 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -64,6 +64,7 @@ enum class AITemplateDtype {
   kInt,
   kLong,
   kBool,
+  kBFloat16,
 };
 
 struct AITData {
@@ -83,6 +84,7 @@ struct AITData {
 inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
   switch (dtype) {
     case AITemplateDtype::kHalf:
+    case AITemplateDtype::kBFloat16:
       return 2;
     case AITemplateDtype::kFloat:
       return 4;
@@ -165,6 +167,17 @@ AIT_EXPORT AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
     bool graph_mode,
     int64_t** output_shapes_out);
 
+/// Do per op profile and write the profiling report to file.
+AIT_EXPORT AITemplateError AITemplateModelContainerProfile(
+    AITemplateModelHandle handle,
+    const AITData* inputs,
+    size_t num_inputs,
+    AITData* outputs,
+    size_t num_outputs,
+    AITemplateStreamHandle stream_handle,
+    size_t num_iters,
+    const char* filename);
+
 AIT_EXPORT AITemplateError AITemplateModelContainerBenchmark(
     AITemplateModelHandle handle,
     const AITData* inputs,
diff --git a/static/include/raii_wrapper.h b/static/include/raii_wrapper.h
index 24270d8b9..ce1667959 100644
--- a/static/include/raii_wrapper.h
+++ b/static/include/raii_wrapper.h
@@ -69,4 +69,17 @@ inline GraphPtr RAII_EndCaptureAndCreateGraph(
   return GraphPtr(graph, GraphDestroy);
 }
 
+class RAII_ProfilerRange {
+ public:
+  RAII_ProfilerRange(char* name) {
+    ProfilerRangePush(name);
+  }
+  ~RAII_ProfilerRange() {
+    ProfilerRangePop();
+  }
+
+  RAII_ProfilerRange(const RAII_ProfilerRange&) = delete;
+  RAII_ProfilerRange(RAII_ProfilerRange&&) = delete;
+};
+
 } // namespace ait
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index dced66911..f6809d9d7 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -28,6 +28,8 @@
 
 namespace ait {
 
+inline thread_local bool target_has_graph_mode = false;
+
 using DeviceError = hipError_t;
 using DevicePropertyType = hipDeviceProp_t;
 using StreamType = hipStream_t;
@@ -189,4 +191,21 @@ inline DeviceError GetDeviceNotReady() {
   return hipErrorNotReady;
 }
 
+inline DeviceError GetDriverVersion(int* driverVersion) {
+  return hipDriverGetVersion(driverVersion);
+}
+
+inline DeviceError GetRuntimeVersion(int* runtimeVersion) {
+  return hipRuntimeGetVersion(runtimeVersion);
+}
+
+inline void ProfilerRangePush(const char* msg) {
+  // TODO: Activate roctx header and linkage
+  // roctxRangePush(msg);
+}
+
+inline void ProfilerRangePop() {
+  // TODO: Activate roctx header and linkage
+  // roctxRangePop();
+}
 } // namespace ait
diff --git a/tests/ci_profile_cache/update_cache.py b/tests/ci_profile_cache/update_cache.py
index 8011b7546..3ac4e280e 100644
--- a/tests/ci_profile_cache/update_cache.py
+++ b/tests/ci_profile_cache/update_cache.py
@@ -28,8 +28,10 @@
 import jinja2
 from aitemplate.backend.profiler_cache import GEMM_INSERT_TEMPLATE, GEMM_QUERY_TEMPLATE
 
+
 logging.basicConfig(format="%(name)s: %(message)s", level=logging.INFO)
-logger = logging.getLogger("update-cache")
+
+_LOGGER = logging.getLogger("update-cache")
 
 DEFAULT_QUERY_TEMPLATE = jinja2.Template(
     """
@@ -127,7 +129,7 @@ def del_entry(
     Returns
     ----------
     """
-    logger.info("query entry for deletion - id: %s, op_type: %s", entry_id, op_type)
+    _LOGGER.info("query entry for deletion - id: %s, op_type: %s", entry_id, op_type)
     db_conn_cur = db_conn.cursor()
     entries = query_cache(
         db_conn_cur=db_conn_cur,
@@ -140,7 +142,7 @@ def del_entry(
         entry_id=entry_id,
     )
     if len(entries) == 0:
-        logger.info("Could not find valid entries, skip")
+        _LOGGER.info("Could not find valid entries, skip")
         return
 
     assert len(entries) == 1
@@ -148,14 +150,14 @@ def del_entry(
         raise RuntimeError(
             f"cannot delete the entry, unmatched op_type: {op_type}, {entries[0][1]}"
         )
-    logger.info("deleting entry - id: %s, op_type: %s", entry_id, op_type)
+    _LOGGER.info("deleting entry - id: %s, op_type: %s", entry_id, op_type)
     del_query = DEL_ID_TEMPLATE.render(
         table=table,
         id=entry_id,
     )
     db_conn_cur.execute(del_query)
     db_conn.commit()
-    logger.info("entry deleted successfully")
+    _LOGGER.info("entry deleted successfully")
 
 
 def insert_sm75_entry(
@@ -200,7 +202,7 @@ def insert_sm75_entry(
     args["device"] = "75"
 
     new_args_str = "\n".join(["{}: {}".format(n, v) for n, v in args.items()])
-    logger.info("new_args:\n%s", new_args_str)
+    _LOGGER.info("new_args:\n%s", new_args_str)
     query_sql = GEMM_QUERY_TEMPLATE.render(
         dev="cuda",
         dtype_a=args["dtype_a"],
@@ -238,7 +240,7 @@ def insert_sm75_entry(
     insertion_sql = GEMM_INSERT_TEMPLATE.render(dev="cuda", **args)
     db_conn_cur.execute(insertion_sql)
     db_conn.commit()
-    logger.info(
+    _LOGGER.info(
         "successfully insert an sm75 entry for: '%s', '%s'",
         args["op_type"],
         args["exec_entry"],
@@ -389,10 +391,10 @@ def query_cache(
 
     if exec_key is not None:
         if not suppress_print:
-            logger.info("exec_key: '%s'", exec_key)
+            _LOGGER.info("exec_key: '%s'", exec_key)
         exec_entry_sha1 = hashlib.sha1(exec_key.encode("utf-8")).hexdigest()
         if not suppress_print:
-            logger.info("exec_sha1: '%s'", exec_entry_sha1)
+            _LOGGER.info("exec_sha1: '%s'", exec_entry_sha1)
 
     query_args = {
         "table": table,
@@ -430,7 +432,7 @@ def query_cache(
     db_conn_cur.execute(query)
     entries = db_conn_cur.fetchall()
     if not suppress_print:
-        logger.info("entries: id, op_type, algo, device, exec_entry")
+        _LOGGER.info("entries: id, op_type, algo, device, exec_entry")
     for entry in entries:
         if not suppress_print:
             print("entry: {}".format(entry))
@@ -459,7 +461,7 @@ def process_missing_75_entries_from_80(
     Returns
     ----------
     """
-    logger.info("query all missing sm75 entries - op_type: %s", op_type)
+    _LOGGER.info("query all missing sm75 entries - op_type: %s", op_type)
     db_conn_cur = db_conn.cursor()
     if op_type == "all":
         op_type = None
@@ -500,7 +502,7 @@ def process_missing_75_entries_from_80(
         if len(sm75_entries) == 0:
             print("missing sm75 entry for this sm80 entry: '{}'".format(sm80_entry))
             if gen_sm75_entry:
-                logger.info("gen sm75 entry for: '%s'", sm80_entry)
+                _LOGGER.info("gen sm75 entry for: '%s'", sm80_entry)
                 column_names = get_column_names(db_conn_cur, table)
                 insert_sm75_entry(db_conn, table, sm80_entry, column_names, None)
 
@@ -586,7 +588,7 @@ def make_75_algo_from_80(old_algo: str):
     else:
         raise RuntimeError("Invalid old_algo format: '{}'".format(old_algo))
 
-    logger.info("new_algo: '%s'", new_algo)
+    _LOGGER.info("new_algo: '%s'", new_algo)
     return new_algo
 
 
@@ -594,7 +596,7 @@ def get_column_names(db_conn_cur: sqlite3.Cursor, table: str):
     column_names_query = QUERY_COLUMN_NAMES_TEMPLATE.render(table=table)
     columns = db_conn_cur.execute(column_names_query)
     column_names = [col[0] for col in columns.description]
-    logger.info("colum_names:%s", column_names)
+    _LOGGER.info("colum_names:%s", column_names)
     return column_names
 
 
@@ -638,7 +640,7 @@ def gen_one_75_entry_from_80(
         device="80",
     )
     if len(entries) == 0:
-        logger.info("Could not find valid entries, skip")
+        _LOGGER.info("Could not find valid entries, skip")
         return
 
     column_names = get_column_names(db_conn_cur, table)
diff --git a/tests/unittest/backend/test_cuda_graph.py b/tests/unittest/backend/test_cuda_graph.py
index 70d57f3f3..2f24b3b1b 100644
--- a/tests/unittest/backend/test_cuda_graph.py
+++ b/tests/unittest/backend/test_cuda_graph.py
@@ -24,13 +24,14 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class CUDAGraphTestCase(unittest.TestCase):
     def test_cuda_graph_multiple_runs(self):
-        logger.info("testing cuda graph with multiple runs")
+        _LOGGER.info("testing cuda graph with multiple runs")
         X0_batch_dim = IntVar([1, 65], name="batch_size")
         X0_non_batch_shape = [1, 772]
         X0_non_batch_dims = [IntImm(d) for d in X0_non_batch_shape]
@@ -54,7 +55,7 @@ def test_cuda_graph_multiple_runs(self):
         run = 2
         repeat = 1
         for b_size in [1, 65]:
-            logger.info(f"batch size = {b_size}")
+            _LOGGER.info(f"batch size = {b_size}")
             X0_shape = [b_size] + X0_non_batch_shape
             x0_pt = torch.randn(*X0_shape).cuda().half()
             x1_pt = torch.randn(*X1_shape).cuda().half()
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index d7e1fb3ef..270118683 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -15,6 +15,9 @@
 import contextlib
 import ctypes
 import itertools
+import json
+import os
+import tempfile
 import unittest
 from typing import Callable, Optional, Tuple
 
@@ -507,6 +510,27 @@ def test_benchmark(self):
         self.assertEqual(len(tensors), 1)
         self.assertTrue(torch.equal(tensors["output"], in0 * in1))
 
+    def test_profile(self):
+        module, (in0, in1), (out_pt, out_ait) = self._get_simple_graph_and_output(
+            "test_profile", False, True
+        )
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            profile_name = os.path.join(tmpdirname, "profile.json")
+            module.profile(
+                [
+                    torch_to_ait_data(in0),
+                    torch_to_ait_data(in1),
+                ],
+                [torch_to_ait_data(out_ait)],
+                20,
+                profile_name,
+            )
+            with open(profile_name) as f:
+                report = json.loads(f.read())
+                self.assertTrue(len(report), 1)
+                for _, elapsed in report.items():
+                    self.assertGreater(elapsed, 0)
+
     def test_get_output_dtype(self):
         module, inputs, output_np = self._get_simple_graph_and_output(
             "test_get_param_dtype"
@@ -1421,7 +1445,7 @@ def test_custom_allocator(self):
                 f"test_custom_allocator_{allocator_kind.value}",
                 allocator_kind=AITemplateAllocatorKind.TRACKING,
             ) as module:
-                allocator = module.DLL.allocator_handle
+                allocator = module.allocator_handle
                 self.assertIsNotNone(allocator.value)
 
                 if allocator_kind == AITemplateAllocatorKind.TRACKING:
diff --git a/tests/unittest/backend/test_profiler.py b/tests/unittest/backend/test_profiler.py
index 7276493f9..f2fb16597 100644
--- a/tests/unittest/backend/test_profiler.py
+++ b/tests/unittest/backend/test_profiler.py
@@ -14,15 +14,8 @@
 
 import unittest
 from random import randrange
-
-from aitemplate.backend import profiler_runner
-
-profiler_runner.extract_profile_result = lambda _: (
-    "",
-    False,
-)
-
 from time import sleep
+from unittest.mock import patch
 
 from aitemplate.backend.cuda.target_def import CUDA as CUDATarget
 
@@ -56,21 +49,25 @@ def wrapped(result, delegate):
 
 class ProfilerTestCase(unittest.TestCase):
     def test_profiler_runner(self):
-        with CUDATarget() as _:
-            pr = ProfilerRunner(
-                devices=[str(i) for i in range(12)],
-                timeout=60,
-                postprocessing_delegate=Delegate(test_instance=self),
-            )
-
-            for i, _ in enumerate(pr._postprocessing_delegate.results):
-                sleep_for = 0
-                pr.push(
-                    cmds=["sleep", f"{sleep_for}"],
-                    process_result_callback=delegate_cb_wrapper(i, sleep_for),
+        with patch(
+            "aitemplate.backend.profiler_runner.extract_profile_result"
+        ) as mock_extract_profile_result:
+            mock_extract_profile_result.return_value = ("", False)
+            with CUDATarget() as _:
+                pr = ProfilerRunner(
+                    devices=[str(i) for i in range(12)],
+                    timeout=60,
+                    postprocessing_delegate=Delegate(test_instance=self),
                 )
 
-            pr.join()
+                for i, _ in enumerate(pr._postprocessing_delegate.results):
+                    sleep_for = 0
+                    pr.push(
+                        cmds=["sleep", f"{sleep_for}"],
+                        process_result_callback=delegate_cb_wrapper(i, sleep_for),
+                    )
+
+                pr.join()
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/benchmark/test_gemm_benchmark.py b/tests/unittest/benchmark/test_gemm_benchmark.py
index 07f43cc3e..b1ac17049 100644
--- a/tests/unittest/benchmark/test_gemm_benchmark.py
+++ b/tests/unittest/benchmark/test_gemm_benchmark.py
@@ -14,6 +14,7 @@
 #
 import itertools
 import json
+import logging
 import unittest
 import uuid
 
@@ -27,7 +28,7 @@
 from aitemplate.testing.benchmark_ait import make_input_output_pools, run_benchmark
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
 from aitemplate.testing.benchmark_trt import make_trt_module
-from aitemplate.utils import logger, shape_utils
+from aitemplate.utils import shape_utils
 
 NK_SHAPES = ((8314, 3072), (6912, 8314))
 INPUT_POOL_SIZE = 20
@@ -37,6 +38,9 @@
 )
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class GemmRCRModule(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -194,7 +198,7 @@ def test_benchmark(self):
             )
             for m in BATCH_SIZES:
                 mnk = {"m": m, "n": n, "k": k}
-                logger.warning(__name__, f"mnk={mnk}, split_k={split_k}")
+                _LOGGER.warning(f"mnk={mnk}, split_k={split_k}")
                 inputs_pool, outputs_pool = make_input_output_pools(
                     pool_size=INPUT_POOL_SIZE,
                     eval_pt_func=lambda: eval_pt_gemm_rcr(**mnk),
@@ -239,8 +243,7 @@ def test_benchmark(self):
                     "split_k": split_k,
                     **mnk,
                 }
-                logger.warning(
-                    __name__,
+                _LOGGER.warning(
                     f"Benchmark results {json.dumps(benchmark_results, separators=(',', ':'))}",
                 )
 
@@ -266,7 +269,7 @@ def test_benchmark(self):
             )
             for b in BATCH_SIZES:
                 bmnk = {"b": b, "m": m, "n": n, "k": k}
-                logger.warning(__name__, f"bmnk={bmnk}, split_k={split_k}")
+                _LOGGER.warning(f"bmnk={bmnk}, split_k={split_k}")
                 inputs_pool, outputs_pool = make_input_output_pools(
                     pool_size=INPUT_POOL_SIZE,
                     eval_pt_func=lambda: eval_pt_bmm_rrr(**bmnk),
@@ -311,8 +314,7 @@ def test_benchmark(self):
                     "split_k": split_k,
                     **bmnk,
                 }
-                logger.warning(
-                    __name__,
+                _LOGGER.warning(
                     f"Benchmark results {json.dumps(benchmark_results, separators=(',', ':'))}",
                 )
 
diff --git a/tests/unittest/benchmark/test_group_gemm_benchmark.py b/tests/unittest/benchmark/test_group_gemm_benchmark.py
index d7a05ad1a..03b0034fc 100644
--- a/tests/unittest/benchmark/test_group_gemm_benchmark.py
+++ b/tests/unittest/benchmark/test_group_gemm_benchmark.py
@@ -21,7 +21,8 @@
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 def _prepare_input_tensors(m, nk_groups, start=0, has_bias=True):
@@ -209,7 +210,7 @@ def _benchmark(count, inputs_repeats, warmup, inputs, outputs, module, test_name
         module.run_with_tensors(inputs[i % inputs_repeats], outputs, sync=False)
     end_event.record()
     torch.cuda.synchronize()
-    logger.warning(
+    _LOGGER.warning(
         f"{test_name} benchmark, duration: {start_event.elapsed_time(end_event) / count}ms",
     )
 
@@ -224,7 +225,7 @@ def test_rcr(self):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning("Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
         X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
         X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
@@ -271,7 +272,7 @@ def _benchmark_rcr(
         test_name="",
         benchmark_non_group=False,
     ):
-        logger.warning(
+        _LOGGER.warning(
             f"{test_name} benchmark, m: {m}, nk groups: {nk_groups_1}, {nk_groups_2}",
         )
         WARMUP = 10000
@@ -315,7 +316,7 @@ def _benchmark_rcr(
             )
 
     def _benchmark_batch_rcr(self, b, m, n, k, test_name=""):
-        logger.warning(
+        _LOGGER.warning(
             f"{test_name} benchmark, b: {b}, m: {m}, n: {n}, k: {k}",
         )
         WARMUP = 10000
diff --git a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
index 4ecda179d..18d751a07 100644
--- a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
+++ b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
@@ -28,6 +28,10 @@
 
 
 class TestStridedLayerNormBenchmark(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_id = 0
+
     @unittest.skipIf(detect_target().in_ci_env(), "don't run benchmark in CI")
     def test_benchmark(self):
         for (input_nonbatch_shape, (start_indices, end_indices),) in itertools.product(
@@ -51,8 +55,10 @@ def test_benchmark(self):
             ait_module = build_ait_module(
                 batch_sizes=(BATCH_SIZE,),
                 workdir=uuid.uuid4().hex,
+                test_id=self.test_id,
                 **_layernorm_common_params,
             )
+            self.test_id += 1
             inputs_pool, outputs_pool = make_input_output_pools(
                 pool_size=INPUT_POOL_SIZE,
                 eval_pt_func=lambda: eval_pt(
diff --git a/tests/unittest/compiler/test_compilation_failure.py b/tests/unittest/compiler/test_compilation_failure.py
new file mode 100644
index 000000000..23a968134
--- /dev/null
+++ b/tests/unittest/compiler/test_compilation_failure.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+import unittest
+from unittest.mock import patch
+
+import jinja2
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import DynamicProfileStrategy
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+class _EnableForceProfile:
+    """
+    Prevent cached profiling entries from causing profiling-related
+    compilation tests from failing.
+    """
+
+    def __init__(self):
+        self.old_force_profile = os.environ.get("FORCE_PROFILE", None)
+
+    def __enter__(self):
+        os.environ["FORCE_PROFILE"] = "1"
+        return self
+
+    def __exit__(self, *args):
+        if self.old_force_profile is None:
+            del os.environ["FORCE_PROFILE"]
+        else:
+            os.environ["FORCE_PROFILE"] = self.old_force_profile
+
+
+class CompilationFailureTestCase(unittest.TestCase):
+    def _test_compilation_failure(
+        self,
+        test_name="compilation_failure",
+    ):
+        target = detect_target()
+
+        X = Tensor(
+            shape=[IntImm(4), 28, 28, 128],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[256, 3, 3, 128],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        OP = ops.conv2d(stride=1, pad=1, dilate=1)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        compile_model(
+            Y,
+            target,
+            f"./tmp/{test_name}",
+            test_name,
+            dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+        )
+
+    def test_compilation_failure_profiler(self):
+        target = detect_target().name()
+        with _EnableForceProfile():
+            profiler_main_template = (
+                f"aitemplate.backend.{target}.conv2d.common.PROFILER_MAIN_TEMPLATE"
+            )
+            with patch(profiler_main_template, jinja2.Template("BAD CODE!")):
+                with self.assertRaisesRegex(RuntimeError, "Build has failed."):
+                    self._test_compilation_failure(
+                        test_name="compilation_failure_profiler"
+                    )
+
+    def test_compilation_failure_function(self):
+        target = detect_target().name()
+        gen_function = f"aitemplate.backend.{target}.conv2d.common.gen_function"
+        with patch(gen_function) as mock_gen_function:
+            mock_gen_function.return_value = "BAD CODE!"
+            with self.assertRaisesRegex(RuntimeError, "Build has failed."):
+                self._test_compilation_failure(test_name="compilation_failure_function")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index 6d1b22ae9..ae5729bf4 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -27,6 +27,12 @@
 from aitemplate.compiler.transform.transform_utils import check_graph_validity
 
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+from parameterized import parameterized
 
 
 class ConstantFoldingTestCase(unittest.TestCase):
@@ -43,20 +49,23 @@ def _verify_graph(
         # Make sure the extra constants are deleted.
         self.assertEqual(num_constants, expected_num_constants)
 
-    def test_simple_constant_fold(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_simple_constant_fold(self, dtype):
         target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
 
-        inp0_pt = torch.randn((3, 3)).half().cuda()
-        inp1_pt = torch.randn((3, 3)).half().cuda()
-        inp2_pt = torch.randn((3, 3)).half().cuda()
+        inp0_pt = get_random_torch_tensor((3, 3), dtype)
+        inp1_pt = get_random_torch_tensor((3, 3), dtype)
+        inp2_pt = get_random_torch_tensor((3, 3), dtype)
         x_pt = inp0_pt * inp1_pt
         y_pt = (inp2_pt + x_pt).flatten()
 
-        inp0_ait = Tensor(shape=(3, 3), name="inp0")
+        inp0_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp0")
         inp0_ait._bind_data(_TorchConstantTensorData(inp0_pt))
-        inp1_ait = Tensor(shape=(3, 3), name="inp1")
+        inp1_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp1")
         inp1_ait._bind_data(_TorchConstantTensorData(inp1_pt))
-        inp2_ait = Tensor(shape=[3, 3], name="inp2", is_input=True)
+        inp2_ait = Tensor(shape=[3, 3], dtype=dtype, name="inp2", is_input=True)
 
         x_ait = ops.elementwise(FuncEnum.MUL)(inp0_ait, inp1_ait)
         # prevent mul/add fusion. If the ops get fused, then inp2_ait will be
@@ -67,10 +76,11 @@ def test_simple_constant_fold(self):
         y_ait._attrs["name"] = "y"
         y_ait._attrs["is_output"] = True
 
-        target = detect_target()
-        mod = compile_model(y_ait, target, "./tmp", "test_constant_folding_simple")
+        mod = compile_model(
+            y_ait, target, "./tmp", f"test_constant_folding_simple_{dtype}"
+        )
 
-        y = torch.empty((9,)).cuda().half()
+        y = get_torch_empty_tensor((9,), dtype)
         mod.run_with_tensors({"inp2": inp2_pt}, {"y": y})
         self.assertTrue(torch.equal(y, y_pt))
 
@@ -79,68 +89,92 @@ def test_simple_constant_fold(self):
         # and add one constant, so the total size should be 3.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=3)
 
-    def test_pad_constant_weight(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_pad_constant_weight(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M, N, K = 16, 32, 3
-        w_pt = torch.randn((K, N)).half().cuda()
+        w_pt = get_random_torch_tensor((K, N), dtype)
         weight_data = _TorchConstantTensorData(w_pt)
-        input_0 = Tensor(shape=[M, K], name="input_0", is_input=True)
-        W = Tensor(shape=[K, N], name="weight")
+        input_0 = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[K, N], dtype=dtype, name="weight")
         W._bind_data(weight_data)
         Y = ops.gemm_rrr()(input_0, W)
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
-        mod = compile_model(Y, target, "./tmp", "test_pad_constant_weight")
+        mod = compile_model(Y, target, "./tmp", f"test_pad_constant_weight_{dtype}")
 
-        input_0_pt = torch.randn((M, K)).half().cuda()
+        input_0_pt = get_random_torch_tensor((M, K), dtype)
         y_pt = torch.matmul(input_0_pt, w_pt)
 
-        y = torch.empty((M, N)).cuda().half()
+        y = get_torch_empty_tensor((M, N), dtype)
         mod.run_with_tensors({"input_0": input_0_pt}, {"y": y})
 
         torch.testing.assert_close(y, y_pt, atol=1e-1, rtol=1e-1)
 
-        # The apply_padding graph pass will add padding to both the input and the
-        # weight in this case with concatenate(). The concatenate for the weight
-        # will be folded, so we will be left with 2 constants.
-        self._verify_graph(mod, expected_num_constants=2, expected_num_nodes=5)
+        # For float16 inputs, the apply_padding graph pass will add padding to
+        # both the input and the weight in this case with concatenate().
+        # The concatenate for the weight will be folded, so we will be left with
+        # 2 constants.
+        if dtype == "float16":
+            expected_num_constants = 2
+            expected_num_nodes = 5
+        elif dtype == "float":
+            # Gemm ops with float inputs do not have any alignment requirements,
+            # so the apply_padding pass will not add any padding constants.
+            # The final graph only contains the original "weight" constant tensor.
+            expected_num_constants = 1
+            expected_num_nodes = 3
+        else:
+            raise RuntimeError(f"invalid {dtype=}")
+        self._verify_graph(
+            mod,
+            expected_num_constants=expected_num_constants,
+            expected_num_nodes=expected_num_nodes,
+        )
 
-    def test_fold_long_chain(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fold_long_chain(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (target.name == "rocm" or int(target._arch) < 80):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
         M, N, K = 16, 32, 3
-        w1_pt = torch.randn((K, N)).half().cuda()
+        w1_pt = get_random_torch_tensor((K, N), dtype)
         w1_data = _TorchConstantTensorData(w1_pt)
 
-        w2_pt = torch.randn((K, N)).half().cuda()
+        w2_pt = get_random_torch_tensor((K, N), dtype)
         w2_data = _TorchConstantTensorData(w2_pt)
 
         w3_pt = w1_pt * w2_pt
-        x_pt = torch.randn((M, K)).half().cuda()
+        x_pt = get_random_torch_tensor((M, K), dtype)
         x_pt_data = _TorchConstantTensorData(x_pt)
 
         y_pt = torch.matmul(x_pt, w3_pt)
-        w4_pt = torch.randn((M, N)).half().cuda()
+        w4_pt = get_random_torch_tensor((M, N), dtype)
         w4_data = _TorchConstantTensorData(w4_pt)
         z_pt = y_pt * w4_pt
 
-        w1_ait = Tensor(shape=[K, N], name="w1")
+        w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1")
         w1_ait._bind_data(w1_data)
-        w2_ait = Tensor(shape=[K, N], name="w2")
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
         w2_ait._bind_data(w2_data)
         w3_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
-        x_ait = Tensor(shape=[M, K], name="x")
+        x_ait = Tensor(shape=[M, K], dtype=dtype, name="x")
         x_ait._bind_data(x_pt_data)
         y_ait = ops.gemm_rrr()(x_ait, w3_ait)
-        w4_ait = Tensor(shape=[M, N], name="w4")
+        w4_ait = Tensor(shape=[M, N], dtype=dtype, name="w4")
         w4_ait._bind_data(w4_data)
         z_ait = ops.elementwise(FuncEnum.MUL)(y_ait, w4_ait)
         z_ait._attrs["name"] = "z"
         z_ait._attrs["is_output"] = True
 
         target = detect_target()
-        mod = compile_model(z_ait, target, "./tmp", "test_pad_constant_weight")
+        mod = compile_model(z_ait, target, "./tmp", f"test_fold_long_chain_{dtype}")
 
-        z = torch.empty((M, N)).cuda().half()
+        z = get_torch_empty_tensor((M, N), dtype)
         mod.run_with_tensors({}, {"z": z})
 
         torch.testing.assert_close(z, z_pt, atol=1e-1, rtol=1e-1)
@@ -148,16 +182,19 @@ def test_fold_long_chain(self):
         # The entire graph is turned into a constant.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
-    def test_constant_folding_through_views(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_constant_folding_through_views(self, dtype):
         target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
 
-        inp0_pt = torch.randn((3, 3)).half().cuda()
-        inp1_pt = torch.randn((3, 3)).half().cuda()
+        inp0_pt = get_random_torch_tensor((3, 3), dtype)
+        inp1_pt = get_random_torch_tensor((3, 3), dtype)
         y_pt = (inp0_pt * inp1_pt).flatten()
 
-        inp0_ait = Tensor(shape=(3, 3), name="inp0")
+        inp0_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp0")
         inp0_ait._bind_data(_TorchConstantTensorData(inp0_pt))
-        inp1_ait = Tensor(shape=(3, 3), name="inp1")
+        inp1_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp1")
         inp1_ait._bind_data(_TorchConstantTensorData(inp1_pt))
         inp0_view = ops.flatten()(inp0_ait)
         inp1_view = ops.flatten()(inp1_ait)
@@ -165,52 +202,55 @@ def test_constant_folding_through_views(self):
         y_ait._attrs["name"] = "y"
         y_ait._attrs["is_output"] = True
 
-        target = detect_target()
         mod = compile_model(
-            y_ait, target, "./tmp", "test_constant_folding_through_views"
+            y_ait, target, "./tmp", f"test_constant_folding_through_views_{dtype}"
         )
 
-        y = torch.empty((9,)).cuda().half()
+        y = get_torch_empty_tensor((9,), dtype)
         mod.run_with_tensors({}, {"y": y})
         self.assertTrue(torch.equal(y, y_pt))
 
         # The entire graph is eliminated.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
-    def test_late_binding(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_late_binding(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (target.name == "rocm" or int(target._arch) < 80):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         # Test binding constants through compile_model
         M, N, K = 16, 32, 3
-        w1_pt = torch.randn((K, N)).half().cuda()
+        w1_pt = get_random_torch_tensor((K, N), dtype)
 
-        w2_pt = torch.randn((K, N)).half().cuda()
+        w2_pt = get_random_torch_tensor((K, N), dtype)
 
         w3_pt = w1_pt * w2_pt
-        x_pt = torch.randn((M, K)).half().cuda()
+        x_pt = get_random_torch_tensor((M, K), dtype)
 
         y_pt = torch.matmul(x_pt, w3_pt)
-        w4_pt = torch.randn((M, N)).half().cuda()
+        w4_pt = get_random_torch_tensor((M, N), dtype)
         z_pt = y_pt * w4_pt
 
-        w1_ait = Tensor(shape=[K, N], name="w1")
-        w2_ait = Tensor(shape=[K, N], name="w2")
+        w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1")
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
         w3_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
-        x_ait = Tensor(shape=[M, K], name="x")
+        x_ait = Tensor(shape=[M, K], dtype=dtype, name="x")
         y_ait = ops.gemm_rrr()(x_ait, w3_ait)
-        w4_ait = Tensor(shape=[M, N], name="w4")
+        w4_ait = Tensor(shape=[M, N], dtype=dtype, name="w4")
         z_ait = ops.elementwise(FuncEnum.MUL)(y_ait, w4_ait)
         z_ait._attrs["name"] = "z"
         z_ait._attrs["is_output"] = True
 
-        target = detect_target()
         mod = compile_model(
             z_ait,
             target,
             "./tmp",
-            "test_late_binding",
+            f"test_late_binding_{dtype}",
             constants={"w1": w1_pt, "w2": w2_pt, "x": x_pt, "w4": w4_pt},
         )
 
-        z = torch.empty((M, N)).cuda().half()
+        z = get_torch_empty_tensor((M, N), dtype)
         mod.run_with_tensors({}, {"z": z})
 
         torch.testing.assert_close(z, z_pt, atol=1e-1, rtol=1e-1)
@@ -219,15 +259,17 @@ def test_late_binding(self):
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
     def test_late_binding_error_constant_already_bound(self):
+        dtype = "float16"
+
         N, K = IntImm(16), IntImm(32)
-        w1_ait = _create_host_zero_tensor(shape=[K, N], name="w1")
-        w2_ait = Tensor(shape=[K, N], name="w2")
+        w1_ait = _create_host_zero_tensor(shape=[K, N], name="w1", dtype=dtype)
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
         y_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
         y_ait._attrs["name"] = "y"
         y_ait._attrs["is_output"] = True
 
-        target = detect_target()
         torch_shape = (K.value(), N.value())
+        target = detect_target()
         with self.assertRaisesRegex(ValueError, "Tensor w1 is already bound!"):
             compile_model(
                 y_ait,
@@ -235,21 +277,23 @@ def test_late_binding_error_constant_already_bound(self):
                 "./tmp",
                 "test_late_binding",
                 constants={
-                    "w1": torch.randn(torch_shape).cuda().half(),
-                    "w2": torch.randn(torch_shape).cuda().half(),
+                    "w1": get_random_torch_tensor(torch_shape, dtype),
+                    "w2": get_random_torch_tensor(torch_shape, dtype),
                 },
             )
 
     def test_late_binding_error_cannot_bind_input(self):
+        dtype = "float16"
+
         N, K = IntImm(16), IntImm(32)
-        w1_ait = Tensor(shape=[K, N], name="w1", is_input=True)
-        w2_ait = Tensor(shape=[K, N], name="w2")
+        w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1", is_input=True)
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
         y_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
         y_ait._attrs["name"] = "y"
         y_ait._attrs["is_output"] = True
 
-        target = detect_target()
         torch_shape = (K.value(), N.value())
+        target = detect_target()
         with self.assertRaisesRegex(ValueError, "Cannot bind input tensor w1"):
             compile_model(
                 y_ait,
@@ -257,21 +301,23 @@ def test_late_binding_error_cannot_bind_input(self):
                 "./tmp",
                 "test_late_binding",
                 constants={
-                    "w1": torch.randn(torch_shape).cuda().half(),
-                    "w2": torch.randn(torch_shape).cuda().half(),
+                    "w1": get_random_torch_tensor(torch_shape, dtype),
+                    "w2": get_random_torch_tensor(torch_shape, dtype),
                 },
             )
 
     def test_late_binding_error_cannot_bind_non_constant(self):
+        dtype = "float16"
+
         N, K = IntImm(16), IntImm(32)
-        w1_ait = Tensor(shape=[K, N], name="w1")
-        w2_ait = Tensor(shape=[K, N], name="w2")
+        w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1")
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
         y_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
         y_ait._attrs["name"] = "y"
         y_ait._attrs["is_output"] = True
 
-        target = detect_target()
         torch_shape = (K.value(), N.value())
+        target = detect_target()
         with self.assertRaisesRegex(ValueError, "Cannot bind non-constant tensor y"):
             compile_model(
                 y_ait,
@@ -279,14 +325,16 @@ def test_late_binding_error_cannot_bind_non_constant(self):
                 "./tmp",
                 "test_late_binding",
                 constants={
-                    "w1": torch.randn(torch_shape).cuda().half(),
-                    "w2": torch.randn(torch_shape).cuda().half(),
-                    "y": torch.randn(torch_shape).cuda().half(),
+                    "w1": get_random_torch_tensor(torch_shape, dtype),
+                    "w2": get_random_torch_tensor(torch_shape, dtype),
+                    "y": get_random_torch_tensor(torch_shape, dtype),
                 },
             )
 
     def test_late_binding_fails_wrong_dtype(self):
-        w1_ait = Tensor(shape=[1], name="w1", dtype="float16")
+        dtype = "float16"
+
+        w1_ait = Tensor(shape=[1], name="w1", dtype=dtype)
         y = ops.elementwise(FuncEnum.MUL)(w1_ait, w1_ait)
         y._attrs["name"] = "y"
         y._attrs["is_output"] = True
@@ -296,6 +344,7 @@ def test_late_binding_fails_wrong_dtype(self):
             torch.zeros((1,)).int(),
             torch.zeros((1,)).long(),
         )
+
         for w1_pt in wrong_inputs:
             with self.assertRaisesRegex(
                 ValueError,
diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index 983a35605..4db38e955 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -20,18 +20,24 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
+from parameterized import parameterized
+
 
 class FuseGemmRcrBiasCase(unittest.TestCase):
-    def _build_gemm_rcr_bias(self, M, N, K, decomposed):
+    def _build_gemm_rcr_bias(self, M, N, K, decomposed, dtype):
         X_shape = [M, K]
         W_shape = [N, K]
         B_shape = [N]
 
-        input_0 = Tensor(shape=X_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
+        input_0 = Tensor(shape=X_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=W_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
 
         if decomposed:
             gemm_tensor = ops.gemm_universal.gemm_rcr()(input_0, input_1)
@@ -41,12 +47,14 @@ def _build_gemm_rcr_bias(self, M, N, K, decomposed):
 
         return bias_tensor
 
-    def _build_gemm_rcr_bias_add_add_relu_chain(self, M, N, K, depth, decomposed):
+    def _build_gemm_rcr_bias_add_add_relu_chain(
+        self, M, N, K, depth, decomposed, dtype
+    ):
         D_shape = [M, N]
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
-        input_4 = Tensor(shape=D_shape, dtype="float16", name="input_4", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
+        input_4 = Tensor(shape=D_shape, dtype=dtype, name="input_4", is_input=True)
 
-        bias_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed)
+        bias_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed, dtype)
         if depth == 1:
             return bias_tensor
 
@@ -64,19 +72,19 @@ def _build_gemm_rcr_bias_add_add_relu_chain(self, M, N, K, depth, decomposed):
 
         raise AssertionError("No suitable output tensors available")
 
-    def _build_gemm_rcr_bias_mul(self, M, N, K, decomposed):
+    def _build_gemm_rcr_bias_mul(self, M, N, K, decomposed, dtype):
         D_shape = [M, N]
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
 
-        bias_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed)
+        bias_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed, dtype)
         mul_tensor = ops.elementwise(FuncEnum.MUL)(bias_tensor, input_3)
 
         return mul_tensor
 
-    def _test_gemm_rcr_bias(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias(self, Ms, N, K, decomposed, testname, dtype="float16"):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         bias_tensor = self._build_gemm_rcr_bias_add_add_relu_chain(
-            m_dim, N, K, 1, decomposed
+            m_dim, N, K, 1, decomposed, dtype
         )
         bias_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(bias_tensor)
@@ -98,9 +106,9 @@ def _test_gemm_rcr_bias(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.cos(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt))
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -108,15 +116,15 @@ def _test_gemm_rcr_bias(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_0"]] = X_pt
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = B_pt
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
 
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_add(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_add(self, Ms, N, K, decomposed, testname, dtype="float16"):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         add_tensor = self._build_gemm_rcr_bias_add_add_relu_chain(
-            m_dim, N, K, 2, decomposed
+            m_dim, N, K, 2, decomposed, dtype
         )
         add_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(add_tensor)
@@ -138,10 +146,10 @@ def _test_gemm_rcr_bias_add(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_add")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + D0_pt)
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -150,15 +158,17 @@ def _test_gemm_rcr_bias_add(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D0_pt
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
 
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_add_add(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_add_add(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         add2_tensor = self._build_gemm_rcr_bias_add_add_relu_chain(
-            m_dim, N, K, 3, decomposed
+            m_dim, N, K, 3, decomposed, dtype
         )
         add2_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(add2_tensor)
@@ -180,11 +190,11 @@ def _test_gemm_rcr_bias_add_add(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_add_add")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
-            D1_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
+            D1_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + D0_pt + D1_pt
             )
@@ -197,14 +207,16 @@ def _test_gemm_rcr_bias_add_add(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_3"]] = D0_pt
             inputs[input_name_to_index["input_4"]] = D1_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_add_add_relu(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_add_add_relu(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         relu_tensor = self._build_gemm_rcr_bias_add_add_relu_chain(
-            m_dim, N, K, 4, decomposed
+            m_dim, N, K, 4, decomposed, dtype
         )
         relu_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(relu_tensor)
@@ -226,11 +238,11 @@ def _test_gemm_rcr_bias_add_add_relu(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_add_add_relu")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
-            D1_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
+            D1_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.nn.functional.relu(
                     torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + D0_pt + D1_pt
@@ -245,17 +257,22 @@ def _test_gemm_rcr_bias_add_add_relu(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_3"]] = D0_pt
             inputs[input_name_to_index["input_4"]] = D1_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_add_fail(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_gemm_rcr_bias_add_fail(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M, N, K = 16, 32, 8
         B_shape = [N]
 
-        input_3 = Tensor(shape=B_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=B_shape, dtype=dtype, name="input_3", is_input=True)
 
-        gemm_bias_tensor = self._build_gemm_rcr_bias(M, N, K, False)
+        gemm_bias_tensor = self._build_gemm_rcr_bias(M, N, K, False, dtype)
         gemm_bias_tensor._attrs["name"] = "gemm_tensor"
         add_tensor = ops.elementwise(FuncEnum.ADD)(gemm_bias_tensor, input_3)
         add_tensor._attrs["name"] = "gemm_bias_add_tensor"
@@ -265,8 +282,9 @@ def test_gemm_rcr_bias_add_fail(self):
         output._attrs["is_output"] = True
 
         # Check value correctness
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "gemm_bias_fusion_add_fail")
+        module = compile_model(
+            output, target, "./tmp", f"gemm_bias_fusion_add_fail_{dtype}"
+        )
 
         # This shouldn't be merged into gemm_rcr_bias_add since input_3 needs broadcasting
         check_tensor = None
@@ -278,32 +296,37 @@ def test_gemm_rcr_bias_add_fail(self):
         src_op = list(check_tensor.src_ops())[0]
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias")
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
-        B1_pt = torch.randn(N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
+        B1_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.cos(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + B1_pt)
 
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors([X_pt, W_pt, B_pt, B1_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_chained(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_gemm_rcr_bias_chained(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M, N, K = 16, 32, 8
         X_shape = [M, K]
         W_shape = [N, K]
         B_shape = [N]
 
-        input_0 = Tensor(shape=X_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
+        input_0 = Tensor(shape=X_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=W_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
 
         gemm_tensor = ops.gemm_universal.gemm_rcr()(input_0, input_1)
         add_tensor = ops.elementwise(FuncEnum.ADD)(gemm_tensor, input_2)
         add_tensor._attrs["name"] = "first_gemm"
 
         D_shape = [N, N]
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
         gemm1_tensor = ops.gemm_universal.gemm_rcr()(add_tensor, input_3)
         add1_tensor = ops.elementwise(FuncEnum.ADD)(gemm1_tensor, input_2)
         add1_tensor._attrs["name"] = "second_gemm"
@@ -313,8 +336,9 @@ def test_gemm_rcr_bias_chained(self):
         output._attrs["is_output"] = True
 
         # Check value correctness
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "gemm_bias_fusion_chained")
+        module = compile_model(
+            output, target, "./tmp", f"gemm_bias_fusion_chained_{dtype}"
+        )
 
         gemm_check = [False, False]
         for tensor in module.debug_sorted_graph:
@@ -328,29 +352,34 @@ def test_gemm_rcr_bias_chained(self):
                 gemm_check[1] = True
         self.assertTupleEqual(tuple(gemm_check), (True, True))
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
-        D_pt = torch.randn(N, N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
+        D_pt = get_random_torch_tensor([N, N], dtype)
         Y_pt = torch.cos(
             torch.nn.functional.linear(
                 torch.nn.functional.linear(X_pt, W_pt, bias=B_pt), D_pt, bias=B_pt
             )
         )
 
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors([X_pt, W_pt, B_pt, D_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_fail(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_gemm_rcr_bias_fail(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M, N, K = 16, 32, 8
         X_shape = [M, K]
         W_shape = [N, K]
         B_shape = [M, N]
 
-        input_0 = Tensor(shape=X_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
+        input_0 = Tensor(shape=X_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=W_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
 
         gemm_tensor = ops.gemm_universal.gemm_rcr()(input_0, input_1)
         add_tensor = ops.elementwise(FuncEnum.ADD)(gemm_tensor, input_2)
@@ -361,8 +390,9 @@ def test_gemm_rcr_bias_fail(self):
         output._attrs["is_output"] = True
 
         # Check value correctness
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "gemm_bias_fusion_fail")
+        module = compile_model(
+            output, target, "./tmp", f"gemm_bias_fusion_fail_{dtype}"
+        )
 
         check_tensor = None
         for tensor in module.debug_sorted_graph:
@@ -374,22 +404,24 @@ def test_gemm_rcr_bias_fail(self):
                 break
         self.assertIsNotNone(check_tensor)
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(M, N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([M, N], dtype)
         Y_pt = torch.cos(torch.nn.functional.linear(X_pt, W_pt) + B_pt)
 
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors([X_pt, W_pt, B_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_add_relu(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_add_relu(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         D_shape = [m_dim, N]
 
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
 
-        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed)
+        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed, dtype)
         add_tensor = ops.elementwise(FuncEnum.ADD)(bias_tensor, input_3)
         relu_tensor = ops.elementwise(FuncEnum.RELU)(add_tensor)
         relu_tensor._attrs["name"] = "final_tensor"
@@ -412,10 +444,10 @@ def _test_gemm_rcr_bias_add_relu(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_add_relu")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.nn.functional.relu(
                     torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + D0_pt
@@ -429,14 +461,14 @@ def _test_gemm_rcr_bias_add_relu(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D0_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_tanh(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_tanh(self, Ms, N, K, decomposed, testname, dtype="float16"):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
 
-        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed)
+        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed, dtype)
         tanh_tensor = ops.elementwise(FuncEnum.TANH)(bias_tensor)
         tanh_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(tanh_tensor)
@@ -458,9 +490,9 @@ def _test_gemm_rcr_bias_tanh(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_tanh")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.cos(
                 torch.tanh(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt))
             )
@@ -471,14 +503,14 @@ def _test_gemm_rcr_bias_tanh(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = B_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_mul(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_mul(self, Ms, N, K, decomposed, testname, dtype="float16"):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
 
-        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed)
+        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed, dtype)
         mul_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(mul_tensor)
         output._attrs["name"] = "output_0"
@@ -499,10 +531,10 @@ def _test_gemm_rcr_bias_mul(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_mul")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) * D0_pt)
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -512,16 +544,18 @@ def _test_gemm_rcr_bias_mul(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D0_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_mul_add(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_mul_add(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         D_shape = [m_dim, N]
 
-        input_4 = Tensor(shape=D_shape, dtype="float16", name="input_4", is_input=True)
-        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed)
+        input_4 = Tensor(shape=D_shape, dtype=dtype, name="input_4", is_input=True)
+        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed, dtype)
         add_tensor = ops.elementwise(FuncEnum.ADD)(mul_tensor, input_4)
         add_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(add_tensor)
@@ -543,11 +577,11 @@ def _test_gemm_rcr_bias_mul_add(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_mul_add")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
-            D1_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
+            D1_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) * D0_pt + D1_pt
             )
@@ -560,14 +594,16 @@ def _test_gemm_rcr_bias_mul_add(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_3"]] = D0_pt
             inputs[input_name_to_index["input_4"]] = D1_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_mul_tanh(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_mul_tanh(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
 
-        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed)
+        mul_tensor = self._build_gemm_rcr_bias_mul(m_dim, N, K, decomposed, dtype)
         tanh_tensor = ops.elementwise(FuncEnum.TANH)(mul_tensor)
         tanh_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(tanh_tensor)
@@ -589,10 +625,10 @@ def _test_gemm_rcr_bias_mul_tanh(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_mul_tanh")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D0_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D0_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.tanh(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) * D0_pt)
             )
@@ -604,7 +640,7 @@ def _test_gemm_rcr_bias_mul_tanh(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D0_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -700,16 +736,60 @@ def test_gemm_rcr_bias_mul_tanh(self):
             [8], 16, 3, False, "gemm_rcr_bias_mul_tanh_need_align"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_rcr_bias_add_float(self):
+        self._test_gemm_rcr_bias(
+            [8], 16, 8, True, "gemm_rcr_bias_basic_decomposed_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_add(
+            [8], 16, 8, False, "gemm_rcr_bias_add_basic_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_add_add(
+            [8, 32], 16, 8, False, "gemm_rcr_bias_add_add_dynamic_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_add_add_relu(
+            [8],
+            16,
+            3,
+            False,
+            "gemm_rcr_bias_add_add_relu_need_align_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_add_relu(
+            [8],
+            16,
+            8,
+            True,
+            "gemm_rcr_bias_add_relu_basic_decomposed_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_tanh(
+            [8], 16, 8, False, "gemm_rcr_bias_tanh_basic_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_mul(
+            [8, 32], 16, 8, False, "gemm_rcr_bias_mul_dynamic_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_mul_add(
+            [8], 16, 3, False, "gemm_rcr_bias_mul_add_need_align_float", dtype="float"
+        )
+        self._test_gemm_rcr_bias_mul_tanh(
+            [8], 16, 3, False, "gemm_rcr_bias_mul_tanh_need_align_float", dtype="float"
+        )
+
 
 class FuseGemmRcrBiasActivationCase(unittest.TestCase):
-    def _build_gemm_rcr_bias(self, M, N, K, decomposed):
+    def _build_gemm_rcr_bias(self, M, N, K, decomposed, dtype):
         X_shape = [M, K]
         W_shape = [N, K]
         B_shape = [N]
 
-        input_0 = Tensor(shape=X_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
+        input_0 = Tensor(shape=X_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=W_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
 
         if decomposed:
             gemm_tensor = ops.gemm_universal.gemm_rcr()(input_0, input_1)
@@ -719,14 +799,14 @@ def _build_gemm_rcr_bias(self, M, N, K, decomposed):
 
         return bias_tensor
 
-    def _build_gemm_rcr_bias_sigmoid(self, M, N, K, decomposed):
-        gemm_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed)
+    def _build_gemm_rcr_bias_sigmoid(self, M, N, K, decomposed, dtype):
+        gemm_tensor = self._build_gemm_rcr_bias(M, N, K, decomposed, dtype)
         sigmoid_tensor = ops.elementwise(FuncEnum.SIGMOID)(gemm_tensor)
 
         return sigmoid_tensor
 
     def _test_gemm_rcr_bias_activation(
-        self, Ms, N, K, activation, target_ait, decomposed, testname
+        self, Ms, N, K, activation, target_ait, decomposed, testname, dtype="float16"
     ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         if activation == "relu":
@@ -747,7 +827,7 @@ def _test_gemm_rcr_bias_activation(
         else:
             raise AssertionError("Activation not supported")
 
-        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed)
+        bias_tensor = self._build_gemm_rcr_bias(m_dim, N, K, decomposed, dtype)
         act_tensor = ops.elementwise(ait_func)(bias_tensor)
         act_tensor._attrs["name"] = "final_tensor"
         output = ops.elementwise(FuncEnum.COS)(act_tensor)
@@ -769,9 +849,9 @@ def _test_gemm_rcr_bias_activation(
         self.assertEqual(src_op._attrs["op"], target_ait)
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.cos(pt_func(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)))
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -779,17 +859,21 @@ def _test_gemm_rcr_bias_activation(
             inputs[input_name_to_index["input_0"]] = X_pt
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = B_pt
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
 
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_sigmoid_mul(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_sigmoid_mul(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         D_shape = [m_dim, N]
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
 
-        sigmoid_tensor = self._build_gemm_rcr_bias_sigmoid(m_dim, N, K, decomposed)
+        sigmoid_tensor = self._build_gemm_rcr_bias_sigmoid(
+            m_dim, N, K, decomposed, dtype
+        )
         mul_tensor = ops.elementwise(FuncEnum.MUL)(sigmoid_tensor, input_3)
         mul_tensor._attrs["name"] = "final_tensor"
 
@@ -812,10 +896,10 @@ def _test_gemm_rcr_bias_sigmoid_mul(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_sigmoid_mul")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.sigmoid(torch.nn.functional.linear(X_pt, W_pt, B_pt)) * D_pt
             )
@@ -827,16 +911,20 @@ def _test_gemm_rcr_bias_sigmoid_mul(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_gemm_rcr_bias_sigmoid_mul_tanh(self, Ms, N, K, decomposed, testname):
+    def _test_gemm_rcr_bias_sigmoid_mul_tanh(
+        self, Ms, N, K, decomposed, testname, dtype="float16"
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         D_shape = [m_dim, N]
-        input_3 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_3 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
 
-        sigmoid_tensor = self._build_gemm_rcr_bias_sigmoid(m_dim, N, K, decomposed)
+        sigmoid_tensor = self._build_gemm_rcr_bias_sigmoid(
+            m_dim, N, K, decomposed, dtype
+        )
         mul_tensor = ops.elementwise(FuncEnum.MUL)(sigmoid_tensor, input_3)
         tanh_tensor = ops.elementwise(FuncEnum.TANH)(mul_tensor)
         tanh_tensor._attrs["name"] = "final_tensor"
@@ -860,10 +948,10 @@ def _test_gemm_rcr_bias_sigmoid_mul_tanh(self, Ms, N, K, decomposed, testname):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_sigmoid_mul_tanh")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D_pt = get_random_torch_tensor([M, N], dtype)
             Y_pt = torch.cos(
                 torch.tanh(
                     torch.sigmoid(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt))
@@ -878,7 +966,7 @@ def _test_gemm_rcr_bias_sigmoid_mul_tanh(self, Ms, N, K, decomposed, testname):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -1032,18 +1120,83 @@ def test_gemm_rcr_bias_gelu(self):
             "gemm_rcr_bias_fast_gelu_basic_decomposed",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_rcr_bias_float(self):
+        self._test_gemm_rcr_bias_activation(
+            [8],
+            16,
+            8,
+            "relu",
+            "gemm_rcr_bias_relu",
+            True,
+            "gemm_rcr_bias_relu_basic_decomposed_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_activation(
+            [8],
+            16,
+            8,
+            "sigmoid",
+            "gemm_rcr_bias_sigmoid",
+            False,
+            "gemm_rcr_bias_sigmoid_basic_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_sigmoid_mul(
+            [8],
+            16,
+            8,
+            False,
+            "gemm_rcr_bias_sigmoid_mul_basic_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_sigmoid_mul_tanh(
+            [8],
+            16,
+            3,
+            False,
+            "gemm_rcr_bias_sigmoid_mul_tanh_need_align_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_activation(
+            [8],
+            16,
+            8,
+            "tanh",
+            "gemm_rcr_bias_tanh",
+            False,
+            "gemm_rcr_bias_tanh_basic_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_activation(
+            [8, 32],
+            16,
+            8,
+            "fast_gelu",
+            "gemm_rcr_bias_fast_gelu",
+            True,
+            "gemm_rcr_bias_fast_gelu_basic_decomposed_float",
+            dtype="float",
+        )
+
 
 class FuseGemmRcrBiasSwishCase(unittest.TestCase):
-    def _test_gemm_rcr_bias_swish(self, Ms, N, K, testname, use_add=False):
+    def _test_gemm_rcr_bias_swish(
+        self, Ms, N, K, testname, dtype="float16", use_add=False
+    ):
         m_dim = shape_utils.gen_int_var_min_max(Ms, name="M_size")
         X_shape = [m_dim, K]
         W_shape = [N, K]
         B_shape = [N]
         D_shape = [m_dim, N]
-        input_1 = Tensor(shape=X_shape, dtype="float16", name="input_0", is_input=True)
-        input_2 = Tensor(shape=W_shape, dtype="float16", name="input_1", is_input=True)
-        input_3 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
-        input_4 = Tensor(shape=D_shape, dtype="float16", name="input_3", is_input=True)
+        input_1 = Tensor(shape=X_shape, dtype=dtype, name="input_0", is_input=True)
+        input_2 = Tensor(shape=W_shape, dtype=dtype, name="input_1", is_input=True)
+        input_3 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
+        input_4 = Tensor(shape=D_shape, dtype=dtype, name="input_3", is_input=True)
 
         if use_add:
             tensor = ops.gemm_rcr()(input_1, input_2)
@@ -1073,10 +1226,10 @@ def _test_gemm_rcr_bias_swish(self, Ms, N, K, testname, use_add=False):
         self.assertEqual(src_op._attrs["op"], "gemm_rcr_bias_swish")
 
         for M in Ms:
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
-            D_pt = torch.randn(M, N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            D_pt = get_random_torch_tensor([M, N], dtype)
             gemm_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
             Y_pt = gemm_pt * torch.sigmoid(gemm_pt) + D_pt
 
@@ -1087,7 +1240,7 @@ def _test_gemm_rcr_bias_swish(self, Ms, N, K, testname, use_add=False):
             inputs[input_name_to_index["input_2"]] = B_pt
             inputs[input_name_to_index["input_3"]] = D_pt
 
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
@@ -1097,24 +1250,62 @@ def test_gemm_rcr_bias_swish(self):
         self._test_gemm_rcr_bias_swish([8], 16, 3, "gemm_rcr_bias_swish_need_align")
 
     def test_gemm_rcr_add_swish(self):
-        self._test_gemm_rcr_bias_swish([8], 16, 8, "gemm_rcr_add_swish_basic", True)
         self._test_gemm_rcr_bias_swish(
-            [8, 32], 16, 8, "gemm_rcr_add_swish_dynamic", True
+            [8], 16, 8, "gemm_rcr_add_swish_basic", use_add=True
+        )
+        self._test_gemm_rcr_bias_swish(
+            [8, 32], 16, 8, "gemm_rcr_add_swish_dynamic", use_add=True
+        )
+        self._test_gemm_rcr_bias_swish(
+            [8], 16, 3, "gemm_rcr_add_swish_need_align", use_add=True
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_rcr_swish_float(self):
+        self._test_gemm_rcr_bias_swish(
+            [8],
+            16,
+            8,
+            "gemm_rcr_bias_swish_basic_float",
+            dtype="float",
+        )
+        self._test_gemm_rcr_bias_swish(
+            [8, 32],
+            16,
+            8,
+            "gemm_rcr_add_swish_dynamic_float",
+            dtype="float",
+            use_add=True,
         )
         self._test_gemm_rcr_bias_swish(
-            [8], 16, 3, "gemm_rcr_add_swish_need_align", True
+            [8],
+            16,
+            3,
+            "gemm_rcr_add_swish_need_align_float",
+            dtype="float",
+            use_add=True,
         )
 
 
 class FuseBmmCcrAddCase(unittest.TestCase):
-    def _test_bmm_ccr_add(self, Bs, M, N, K, testname):
+    def _test_bmm_ccr_add(
+        self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
+    ):
         batch_dim = shape_utils.gen_int_var_min_max(Bs, name="batch_size")
         A_shape = [batch_dim, K, M]
         B_shape = [batch_dim, N, K]
-        D0_shape = [batch_dim, M, N]
-        input_0 = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=D0_shape, dtype="float16", name="input_2", is_input=True)
+        if do_not_fuse:
+            assert M != 1
+            D0_shape = [batch_dim, 1, N]
+        else:
+            D0_shape = [batch_dim, M, N]
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
         bmm_tensor = ops.gemm_universal.bmm_ccr()(input_0, input_1)
         add_tensor = ops.elementwise(FuncEnum.ADD)(bmm_tensor, input_2)
         add_tensor._attrs["name"] = "add_tensor"
@@ -1133,14 +1324,20 @@ def _test_bmm_ccr_add(self, Bs, M, N, K, testname):
                 continue
             if src_ops[0]._attrs["op"].startswith("bmm"):
                 check_tensor = tensor
-                self.assertEqual(src_ops[0]._attrs["op"], "bmm_ccr_add")
+                if do_not_fuse:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_ccr")
+                else:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_ccr_add")
                 break
         self.assertIsNotNone(check_tensor)
 
+        if do_not_fuse:
+            return
+
         for B in Bs:
-            X_pt = torch.randn(B, K, M).cuda().half()
-            W_pt = torch.randn(B, N, K).cuda().half()
-            D0_pt = torch.randn(B, M, N).cuda().half()
+            X_pt = get_random_torch_tensor([B, K, M], dtype)
+            W_pt = get_random_torch_tensor([B, N, K], dtype)
+            D0_pt = get_random_torch_tensor([B, M, N], dtype)
             Y_pt = torch.bmm(X_pt.transpose(2, 1), W_pt.transpose(2, 1)) + D0_pt + D0_pt
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -1149,18 +1346,18 @@ def _test_bmm_ccr_add(self, Bs, M, N, K, testname):
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = D0_pt
 
-            y = torch.empty([B, M, N]).cuda().half()
+            y = get_torch_empty_tensor([B, M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_bmm_ccr_add_negative(self, testname, negative_type):
+    def _test_bmm_ccr_add_negative(self, testname, negative_type, dtype="float16"):
         B, K, M, N = 8, 32, 16, 8
         A_shape = [B, K, M]
         B_shape = [B, N, K]
         D0_shape = [B, M, N]
-        input_0 = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=D0_shape, dtype="float16", name="input_2", is_input=True)
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
         bmm_tensor = ops.gemm_universal.bmm_ccr()(input_0, input_1)
         bmm_tensor._attrs["name"] = "bmm_tensor"
         if negative_type == "is_output":
@@ -1190,9 +1387,9 @@ def _test_bmm_ccr_add_negative(self, testname, negative_type):
         src_op = list(check_tensor.src_ops())[0]
         self.assertEqual(src_op._attrs["op"], "bmm_ccr")
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-        D0_pt = torch.randn(B, M, N).cuda().half()
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        D0_pt = get_random_torch_tensor([B, M, N], dtype)
 
         bmm_pt = torch.bmm(X_pt.transpose(2, 1), W_pt.transpose(2, 1))
         Y_pt = bmm_pt + D0_pt + D0_pt
@@ -1207,8 +1404,8 @@ def _test_bmm_ccr_add_negative(self, testname, negative_type):
         inputs[input_name_to_index["input_1"]] = W_pt
         inputs[input_name_to_index["input_2"]] = D0_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
-        y1 = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        y1 = get_torch_empty_tensor([B, M, N], dtype)
         output_name_to_index = module.get_output_name_to_index_map()
         if output_name_to_index["output_0"] == 0:
             ys = [y, y1]
@@ -1223,26 +1420,57 @@ def test_bmm_ccr_add(self):
         self._test_bmm_ccr_add([8], 32, 16, 8, "bmm_ccr_add_basic")
         self._test_bmm_ccr_add([8, 32], 32, 16, 8, "bmm_ccr_add_dynamic")
         self._test_bmm_ccr_add([8], 7, 13, 3, "bmm_ccr_add_need_align")
+        self._test_bmm_ccr_add(
+            [8], 32, 16, 8, "bmm_ccr_add_do_not_fuse", do_not_fuse=True
+        )
 
     def test_bmm_ccr_add_negative(self):
         self._test_bmm_ccr_add_negative("bmm_ccr_add_negative_output", "is_output")
         self._test_bmm_ccr_add_negative("bmm_ccr_add_negative_input", "other_input")
 
-    def test_bmm_ccr_add_double_shared_input(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_ccr_add_float(self):
+        self._test_bmm_ccr_add(
+            [8, 32], 32, 16, 8, "bmm_ccr_add_dynamic_float", dtype="float"
+        )
+        self._test_bmm_ccr_add(
+            [8], 7, 13, 3, "bmm_ccr_add_need_align_float", dtype="float"
+        )
+        self._test_bmm_ccr_add(
+            [8],
+            32,
+            16,
+            8,
+            "bmm_ccr_add_do_not_fuse_float",
+            dtype="float",
+            do_not_fuse=True,
+        )
+        self._test_bmm_ccr_add_negative(
+            "bmm_ccr_add_negative_output", "is_output", dtype="float"
+        )
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_bmm_ccr_add_double_shared_input(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         B, M, N, K = 8, 32, 16, 8
 
         A_shape = [B, K, M]
         B_shape = [B, N, K]
         D0_shape = [B, M, N]
-        input_0 = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        input_11 = Tensor(
-            shape=B_shape, dtype="float16", name="input_11", is_input=True
-        )
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_11 = Tensor(shape=B_shape, dtype=dtype, name="input_11", is_input=True)
         bmm_tensor = ops.gemm_universal.bmm_ccr()(input_0, input_1)
         bmm_tensor_1 = ops.gemm_universal.bmm_ccr()(input_0, input_11)
 
-        input_2 = Tensor(shape=D0_shape, dtype="float16", name="input_2", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
         add_tensor = ops.elementwise(FuncEnum.ADD)(bmm_tensor, input_2)
         add_tensor._attrs["name"] = "add_tensor"
         add_tensor_1 = ops.elementwise(FuncEnum.ADD)(bmm_tensor_1, input_2)
@@ -1256,9 +1484,8 @@ def test_bmm_ccr_add_double_shared_input(self):
         output_1._attrs["is_output"] = True
 
         # Check value correctness
-        target = detect_target()
         module = compile_model(
-            [output, output_1], target, "./tmp", "bmm_ccr_double_shared_inputs"
+            [output, output_1], target, "./tmp", f"bmm_ccr_double_shared_inputs_{dtype}"
         )
 
         check_tensor = None
@@ -1275,10 +1502,10 @@ def test_bmm_ccr_add_double_shared_input(self):
             self.assertEqual(src_op._attrs["op"], "bmm_ccr_add")
             check_tensor = None
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-        W1_pt = torch.randn(B, N, K).cuda().half()
-        D0_pt = torch.randn(B, M, N).cuda().half()
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        W1_pt = get_random_torch_tensor([B, N, K], dtype)
+        D0_pt = get_random_torch_tensor([B, M, N], dtype)
         Y_pt = torch.bmm(X_pt.transpose(2, 1), W_pt.transpose(2, 1)) + D0_pt + D0_pt
         Y1_pt = torch.bmm(X_pt.transpose(2, 1), W1_pt.transpose(2, 1)) + D0_pt + D0_pt
 
@@ -1289,8 +1516,8 @@ def test_bmm_ccr_add_double_shared_input(self):
         inputs[input_name_to_index["input_11"]] = W1_pt
         inputs[input_name_to_index["input_2"]] = D0_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
-        y1 = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        y1 = get_torch_empty_tensor([B, M, N], dtype)
         ys = [None] * 2
         output_name_to_index = module.get_output_name_to_index_map()
         ys[output_name_to_index["output_0"]] = y
@@ -1303,14 +1530,20 @@ def test_bmm_ccr_add_double_shared_input(self):
 
 
 class FuseBmmCrrAddCase(unittest.TestCase):
-    def _test_bmm_crr_add(self, Bs, M, N, K, testname):
+    def _test_bmm_crr_add(
+        self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
+    ):
         batch_dim = shape_utils.gen_int_var_min_max(Bs, name="batch_size")
         A_shape = [batch_dim, K, M]
         B_shape = [batch_dim, K, N]
-        D0_shape = [batch_dim, M, N]
-        input_0 = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=D0_shape, dtype="float16", name="input_2", is_input=True)
+        if do_not_fuse:
+            assert M != 1
+            D0_shape = [batch_dim, 1, N]
+        else:
+            D0_shape = [batch_dim, M, N]
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
         bmm_tensor = ops.gemm_universal.bmm_crr()(input_0, input_1)
         add_tensor = ops.elementwise(FuncEnum.ADD)(bmm_tensor, input_2)
         add_tensor._attrs["name"] = "add_tensor"
@@ -1329,14 +1562,20 @@ def _test_bmm_crr_add(self, Bs, M, N, K, testname):
                 continue
             if src_ops[0]._attrs["op"].startswith("bmm"):
                 check_tensor = tensor
-                self.assertEqual(src_ops[0]._attrs["op"], "bmm_crr_add")
+                if do_not_fuse:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_crr")
+                else:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_crr_add")
                 break
         self.assertIsNotNone(check_tensor)
 
+        if do_not_fuse:
+            return
+
         for B in Bs:
-            X_pt = torch.randn(B, K, M).cuda().half()
-            W_pt = torch.randn(B, K, N).cuda().half()
-            D0_pt = torch.randn(B, M, N).cuda().half()
+            X_pt = get_random_torch_tensor([B, K, M], dtype)
+            W_pt = get_random_torch_tensor([B, K, N], dtype)
+            D0_pt = get_random_torch_tensor([B, M, N], dtype)
             Y_pt = torch.bmm(X_pt.transpose(2, 1), W_pt) + D0_pt + D0_pt
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -1345,7 +1584,7 @@ def _test_bmm_crr_add(self, Bs, M, N, K, testname):
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = D0_pt
 
-            y = torch.empty([B, M, N]).cuda().half()
+            y = get_torch_empty_tensor([B, M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -1353,17 +1592,42 @@ def test_bmm_crr_add(self):
         self._test_bmm_crr_add([8], 32, 16, 8, "bmm_crr_add_basic")
         self._test_bmm_crr_add([8, 32], 32, 16, 8, "bmm_crr_add_dynamic")
         self._test_bmm_crr_add([8], 7, 13, 3, "bmm_crr_add_need_align")
+        self._test_bmm_crr_add(
+            [8], 32, 16, 8, "bmm_crr_add_do_not_fuse", do_not_fuse=True
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_crr_add_float(self):
+        self._test_bmm_crr_add(
+            [8, 32], 32, 16, 8, "bmm_crr_add_dynamic_float", dtype="float"
+        )
+        self._test_bmm_crr_add(
+            [8], 7, 13, 3, "bmm_crr_add_need_align_float", dtype="float"
+        )
+        self._test_bmm_crr_add(
+            [8], 32, 16, 8, "bmm_crr_add_do_not_fuse", dtype="float", do_not_fuse=True
+        )
 
 
 class FuseBmmRrrAddCase(unittest.TestCase):
-    def _test_bmm_rrr_add(self, Bs, M, N, K, testname):
+    def _test_bmm_rrr_add(
+        self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
+    ):
         batch_dim = shape_utils.gen_int_var_min_max(Bs, name="batch_size")
         A_shape = [batch_dim, M, K]
         B_shape = [batch_dim, K, N]
-        D0_shape = [batch_dim, M, N]
-        input_0 = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        input_1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        input_2 = Tensor(shape=D0_shape, dtype="float16", name="input_2", is_input=True)
+        if do_not_fuse:
+            assert M != 1
+            D0_shape = [batch_dim, 1, N]
+        else:
+            D0_shape = [batch_dim, M, N]
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
         bmm_tensor = ops.gemm_universal.bmm_rrr()(input_0, input_1)
         add_tensor = ops.elementwise(FuncEnum.ADD)(bmm_tensor, input_2)
         add_tensor._attrs["name"] = "add_tensor"
@@ -1382,14 +1646,20 @@ def _test_bmm_rrr_add(self, Bs, M, N, K, testname):
                 continue
             if src_ops[0]._attrs["op"].startswith("bmm"):
                 check_tensor = tensor
-                self.assertEqual(src_ops[0]._attrs["op"], "bmm_rrr_add")
+                if do_not_fuse:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_rrr")
+                else:
+                    self.assertEqual(src_ops[0]._attrs["op"], "bmm_rrr_add")
                 break
         self.assertIsNotNone(check_tensor)
 
+        if do_not_fuse:
+            return
+
         for B in Bs:
-            X_pt = torch.randn(B, M, K).cuda().half()
-            W_pt = torch.randn(B, K, N).cuda().half()
-            D0_pt = torch.randn(B, M, N).cuda().half()
+            X_pt = get_random_torch_tensor([B, M, K], dtype)
+            W_pt = get_random_torch_tensor([B, K, N], dtype)
+            D0_pt = get_random_torch_tensor([B, M, N], dtype)
             Y_pt = torch.bmm(X_pt, W_pt) + D0_pt + D0_pt
 
             input_name_to_index = module.get_input_name_to_index_map()
@@ -1398,7 +1668,7 @@ def _test_bmm_rrr_add(self, Bs, M, N, K, testname):
             inputs[input_name_to_index["input_1"]] = W_pt
             inputs[input_name_to_index["input_2"]] = D0_pt
 
-            y = torch.empty([B, M, N]).cuda().half()
+            y = get_torch_empty_tensor([B, M, N], dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -1406,6 +1676,80 @@ def test_bmm_rrr_add(self):
         self._test_bmm_rrr_add([8], 32, 16, 8, "bmm_rrr_add_basic")
         self._test_bmm_rrr_add([8, 32], 32, 16, 8, "bmm_rrr_add_dynamic")
         self._test_bmm_rrr_add([8], 7, 13, 3, "bmm_rrr_add_need_align")
+        self._test_bmm_rrr_add([8], 32, 16, 8, "bmm_rrr_add_no_fuse", do_not_fuse=True)
+
+    def _test_bmm_rrr_bias_add(
+        self, Bs, M, N, K, bias_shapes, testname, dtype="float16"
+    ):
+        batch_dim = shape_utils.gen_int_var_min_max(Bs, name="batch_size")
+        A_shape = [batch_dim, M, K]
+        B_shape = [batch_dim, K, N]
+        D0_shape = bias_shapes
+
+        input_0 = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        input_1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        input_2 = Tensor(shape=D0_shape, dtype=dtype, name="input_2", is_input=True)
+        bmm_tensor = ops.gemm_universal.bmm_rrr()(input_0, input_1)
+        add_tensor = ops.elementwise(FuncEnum.ADD)(bmm_tensor, input_2)
+        add_tensor._attrs["name"] = "add_tensor"
+        output = ops.elementwise(FuncEnum.ADD)(add_tensor, input_2)
+        output._attrs["name"] = "output_0"
+        output._attrs["is_output"] = True
+
+        # Check value correctness
+        target = detect_target()
+        module = compile_model(output, target, "./tmp", testname)
+
+        check_tensor = None
+        for tensor in module.debug_sorted_graph:
+            src_ops = list(tensor.src_ops())
+            if len(src_ops) != 1:
+                continue
+            if src_ops[0]._attrs["op"].startswith("bmm"):
+                check_tensor = tensor
+                self.assertEqual(src_ops[0]._attrs["op"], "bmm_rrr_add")
+        self.assertIsNotNone(check_tensor)
+
+        for B in Bs:
+            X_pt = get_random_torch_tensor([B, M, K], dtype)
+            W_pt = get_random_torch_tensor([B, K, N], dtype)
+            D0_pt = get_random_torch_tensor(D0_shape, dtype)
+            Y_pt = torch.bmm(X_pt, W_pt) + D0_pt + D0_pt
+
+            input_name_to_index = module.get_input_name_to_index_map()
+            inputs = [None] * 3
+            inputs[input_name_to_index["input_0"]] = X_pt
+            inputs[input_name_to_index["input_1"]] = W_pt
+            inputs[input_name_to_index["input_2"]] = D0_pt
+
+            y = get_torch_empty_tensor([B, M, N], dtype)
+            module.run_with_tensors(inputs, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_bmm_rrr_bias_add(self):
+        self._test_bmm_rrr_bias_add([8], 32, 16, 8, [16], "bmm_rrr_bias_add_01")
+        self._test_bmm_rrr_bias_add([8], 32, 16, 8, [32, 16], "bmm_rrr_bias_add_02")
+        self._test_bmm_rrr_bias_add([8], 32, 16, 8, [1, 32, 16], "bmm_rrr_bias_add_03")
+        self._test_bmm_rrr_bias_add([8], 32, 16, 8, [1, 16], "bmm_rrr_bias_add_03")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_rrr_add_float(self):
+        self._test_bmm_rrr_add(
+            [8, 32], 32, 16, 8, "bmm_rrr_add_dynamic_float", dtype="float"
+        )
+        self._test_bmm_rrr_add(
+            [8], 7, 13, 3, "bmm_rrr_add_need_align_float", dtype="float"
+        )
+        self._test_bmm_rrr_add(
+            [8], 32, 16, 8, "bmm_rrr_add_no_fuse_float", dtype="float", do_not_fuse=True
+        )
+        self._test_bmm_rrr_bias_add(
+            [8], 32, 16, 8, [1, 32, 16], "bmm_rrr_bias_add_float_03", dtype="float"
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/compiler/test_fuse_mm_reshape_permute.py b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
index edea2d2d8..05b974f8b 100644
--- a/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
+++ b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
@@ -19,7 +19,11 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import has_op
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    has_op,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -32,6 +36,7 @@ def _test_rcr_0213(
         n,
         shape,
         test_name,
+        dtype="float16",
         has_bias=False,
         layout="0213",
         should_fuse=True,
@@ -39,11 +44,11 @@ def _test_rcr_0213(
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
         # B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
         t1, t2 = shape
 
@@ -67,9 +72,9 @@ def _test_rcr_0213(
             return
 
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(n, k).cuda().half()
-            B_pt = torch.randn(n).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([n, k], dtype)
+            B_pt = get_random_torch_tensor([n], dtype)
 
             def torch_f(x, w, b, has_bias, shape):
                 if has_bias:
@@ -87,7 +92,7 @@ def torch_f(x, w, b, has_bias, shape):
             inputs = {"input_0": X_pt, "input_1": W_pt}
             if has_bias:
                 inputs["input_2"] = B_pt
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -103,8 +108,8 @@ def test_rcr_0213(self):
         self._test_rcr_0213(
             [54],
             256,
-            4000000,
-            [54, 1000000],
+            40000,
+            [54, 10000],
             "permute_0213_1",
             has_bias=False,
             layout="0213",
@@ -112,14 +117,31 @@ def test_rcr_0213(self):
         self._test_rcr_0213(
             [29, 29 * 8],
             256,
-            300000,
-            [29, 100000],
+            3000,
+            [29, 1000],
             "permute_0213_2",
             has_bias=False,
             layout="0213",
             should_fuse=False,
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rcr_0213_float(self):
+        self._test_rcr_0213(
+            [29, 29 * 8],
+            256,
+            3000,
+            [29, 1000],
+            "permute_0213_float_2",
+            dtype="float",
+            has_bias=False,
+            layout="0213",
+            should_fuse=False,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_fuse_permute_bmm.py b/tests/unittest/compiler/test_fuse_permute_bmm.py
index b510acf7b..c12a0dd28 100644
--- a/tests/unittest/compiler/test_fuse_permute_bmm.py
+++ b/tests/unittest/compiler/test_fuse_permute_bmm.py
@@ -20,18 +20,24 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
+from parameterized import parameterized
+
 
 class FusePermuteBmmCase(unittest.TestCase):
     def _create_permute_bmm_graph(
-        self, A_shape, B_shape, bmm_type, permA, permB, bias_shape=None
+        self, A_shape, B_shape, bmm_type, permA, permB, dtype, bias_shape=None
     ):
         OP = getattr(ops, bmm_type, None)
         assert OP is not None
 
-        A = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        B = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        A = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        B = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
         X = A
         W = B
         if permA:
@@ -41,7 +47,7 @@ def _create_permute_bmm_graph(
         inputs = [A, B]
         if bias_shape is not None:
             inputs.append(
-                Tensor(shape=bias_shape, dtype="float16", name="input_2", is_input=True)
+                Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
             )
 
         Y = OP()(*inputs)
@@ -49,10 +55,10 @@ def _create_permute_bmm_graph(
         return X, W, Y
 
     def _test_missing_alignment_bmm(
-        self, A_shape, B_shape, bmm_type, permA, permB, testname
+        self, A_shape, B_shape, bmm_type, permA, permB, testname, dtype="float16"
     ):
         X, W, bmm_tensor = self._create_permute_bmm_graph(
-            A_shape, B_shape, bmm_type, permA, permB
+            A_shape, B_shape, bmm_type, permA, permB, dtype
         )
         output = ops.elementwise(FuncEnum.COS)(bmm_tensor)
         output._attrs["name"] = "output_0"
@@ -61,6 +67,21 @@ def _test_missing_alignment_bmm(
         target = detect_target()
         module = compile_model(output, target, "./tmp", testname)
 
+        if dtype == "float":
+            expected_bmm_type = list(bmm_type)
+            if permA:
+                if expected_bmm_type[-3] == "c":
+                    expected_bmm_type[-3] = "r"
+                else:
+                    expected_bmm_type[-3] = "c"
+            if permB:
+                if expected_bmm_type[-2] == "c":
+                    expected_bmm_type[-2] = "r"
+                else:
+                    expected_bmm_type[-2] = "c"
+            expected_bmm_type = "".join(expected_bmm_type)
+        else:
+            expected_bmm_type = bmm_type
         found_tensor = False
         for tensor in module.debug_sorted_graph:
             src_ops = tensor.src_ops()
@@ -72,7 +93,7 @@ def _test_missing_alignment_bmm(
             src_op = list(tensor.src_ops())[0]
             if src_op._attrs["op"].startswith("bmm"):
                 found_tensor = True
-                self.assertEqual(src_op._attrs["op"], bmm_type)
+                self.assertEqual(src_op._attrs["op"], expected_bmm_type)
         self.assertTrue(found_tensor)
 
     def test_misalign_a_bmm(self):
@@ -97,6 +118,67 @@ def test_misalign_b_bmm(self):
             [2, 4, 8], [2, 8, 7], "bmm_rcr", False, True, "bmm_rcr_misalign_b"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_misalign_bmm_float(self):
+        self._test_missing_alignment_bmm(
+            [2, 4, 7],
+            [2, 7, 8],
+            "bmm_crr",
+            True,
+            False,
+            "bmm_crr_misalign_a",
+            dtype="float",
+        )
+        self._test_missing_alignment_bmm(
+            [2, 4, 7],
+            [2, 8, 4],
+            "bmm_rcr",
+            True,
+            False,
+            "bmm_rcr_misalign_a",
+            dtype="float",
+        )
+        self._test_missing_alignment_bmm(
+            [2, 4, 7],
+            [2, 4, 8],
+            "bmm_rrr",
+            True,
+            False,
+            "bmm_rrr_misalign_a",
+            dtype="float",
+        )
+        self._test_missing_alignment_bmm(
+            [2, 8, 4],
+            [2, 8, 7],
+            "bmm_ccr",
+            False,
+            True,
+            "bmm_ccr_misalign_b",
+            dtype="float",
+        )
+        self._test_missing_alignment_bmm(
+            [2, 7, 8],
+            [2, 8, 7],
+            "bmm_crr",
+            False,
+            True,
+            "bmm_crr_misalign_b",
+            dtype="float",
+        )
+        self._test_missing_alignment_bmm(
+            [2, 4, 8],
+            [2, 8, 7],
+            "bmm_rcr",
+            False,
+            True,
+            "bmm_rcr_misalign_b",
+            dtype="float",
+        )
+
     def _test_permute_bmm(
         self,
         B,
@@ -105,6 +187,7 @@ def _test_permute_bmm(
         original_bmm,
         new_bmm,
         testname,
+        dtype="float16",
         bias_shape=None,
     ):
         new_layout = new_bmm[-3:]
@@ -133,6 +216,7 @@ def _test_permute_bmm(
             original_bmm,
             permA,
             permB,
+            dtype,
             bias_shape=bias_shape,
         )
 
@@ -163,19 +247,19 @@ def _test_permute_bmm(
 
         for b in B:
             if len(A_shape) > 2:
-                X_pt = torch.randn(b, M, K).cuda().half()
+                X_pt = get_random_torch_tensor([b, M, K], dtype)
             else:
-                X_pt = torch.randn(M, K).cuda().half()
+                X_pt = get_random_torch_tensor([M, K], dtype)
 
             if len(B_shape) > 2:
-                W_pt = torch.randn(b, K, N).cuda().half()
+                W_pt = get_random_torch_tensor([b, K, N], dtype)
             else:
-                W_pt = torch.randn(K, N).cuda().half()
+                W_pt = get_random_torch_tensor([K, N], dtype)
 
             Y_pt = torch.matmul(X_pt, W_pt)
 
             if bias_shape is not None:
-                bias_pt = torch.randn(bias_shape[0]).cuda().half()
+                bias_pt = get_random_torch_tensor(bias_shape[0], dtype)
                 Y_pt += bias_pt
 
             Y_pt = torch.cos(Y_pt)
@@ -188,7 +272,7 @@ def _test_permute_bmm(
                 W_pt = W_pt.permute(perm).contiguous()
 
             # We currently only have row-major outputs.
-            y = torch.empty([b, M, N]).cuda().half()
+            y = get_torch_empty_tensor([b, M, N], dtype)
 
             input_name_to_index = module.get_input_name_to_index_map()
             inputs = [0, 0] if bias_shape is None else [0, 0, 0]
@@ -226,6 +310,24 @@ def test_ccr_to_rrr(self):
             "ccr_to_rrr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_ccr_to_rrr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 3, 5],
+            [batch_dim, 5, 7],
+            "bmm_ccr",
+            "bmm_rrr",
+            "ccr_to_rrr_need_align_float",
+            dtype="float",
+        )
+
     def test_ccr_to_crr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -252,6 +354,24 @@ def test_ccr_to_crr(self):
             "ccr_to_crr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_ccr_to_crr_float(self):
+        B = [1, 3]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 4, 2],
+            [batch_dim, 4, 8],
+            "bmm_ccr",
+            "bmm_crr",
+            "ccr_to_crr_dynamic_float",
+            dtype="float",
+        )
+
     def test_ccr_to_rcr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -278,6 +398,24 @@ def test_ccr_to_rcr(self):
             "ccr_to_rcr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_ccr_to_rcr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 2, 4],
+            [batch_dim, 8, 4],
+            "bmm_ccr",
+            "bmm_rcr",
+            "ccr_to_rcr_float",
+            dtype="float",
+        )
+
     def test_crr_to_ccr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -304,6 +442,24 @@ def test_crr_to_ccr(self):
             "crr_to_ccr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_crr_to_ccr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 4, 2],
+            [batch_dim, 8, 4],
+            "bmm_crr",
+            "bmm_ccr",
+            "crr_to_ccr_float",
+            dtype="float",
+        )
+
     def test_crr_to_rrr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -330,6 +486,24 @@ def test_crr_to_rrr(self):
             "crr_to_rrr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_crr_to_rrr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 3, 5],
+            [batch_dim, 5, 7],
+            "bmm_crr",
+            "bmm_rrr",
+            "crr_to_rrr_need_align_float",
+            dtype="float",
+        )
+
     def test_rcr_to_ccr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -356,6 +530,24 @@ def test_rcr_to_ccr(self):
             "rcr_to_ccr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rcr_to_ccr_float(self):
+        B = [1, 3]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 4, 2],
+            [batch_dim, 8, 4],
+            "bmm_rcr",
+            "bmm_ccr",
+            "rcr_to_ccr_dynamic_float",
+            dtype="float",
+        )
+
     def test_rcr_to_rrr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -382,6 +574,24 @@ def test_rcr_to_rrr(self):
             "rcr_to_rrr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rcr_to_rrr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 2, 4],
+            [batch_dim, 4, 8],
+            "bmm_rcr",
+            "bmm_rrr",
+            "rcr_to_rrr_float",
+            dtype="float",
+        )
+
     def test_rrr_to_crr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -408,6 +618,24 @@ def test_rrr_to_crr(self):
             "rrr_to_crr_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rrr_to_crr_float(self):
+        B = [1]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 5, 3],
+            [batch_dim, 5, 7],
+            "bmm_rrr",
+            "bmm_crr",
+            "rrr_to_crr_need_align_float",
+            dtype="float",
+        )
+
     def test_rrr_to_rcr(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
@@ -429,7 +657,25 @@ def test_rrr_to_rcr(self):
             B, [batch_dim, 2, 4], [batch_dim, 8, 4], "bmm_rrr", "bmm_rcr", "rrr_to_rcr"
         )
 
-    def _test_gemm_broadcast_rcr_to_ccr(self, test_bias):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rrr_to_rcr_float(self):
+        B = [1, 3]
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+        self._test_permute_bmm(
+            B,
+            [batch_dim, 2, 4],
+            [batch_dim, 8, 4],
+            "bmm_rrr",
+            "bmm_rcr",
+            "rrr_to_rcr_float",
+            dtype="float",
+        )
+
+    def _test_gemm_broadcast_rcr_to_ccr(self, test_bias, dtype="float16"):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -438,7 +684,8 @@ def _test_gemm_broadcast_rcr_to_ccr(self, test_bias):
             [8, 4],
             "gemm_rcr",
             "bmm_ccr",
-            "rcr_to_ccr_gemm_broadcast_b",
+            f"rcr_to_ccr_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
         self._test_permute_bmm(
@@ -447,7 +694,8 @@ def _test_gemm_broadcast_rcr_to_ccr(self, test_bias):
             [7, 5],
             "gemm_rcr",
             "bmm_ccr",
-            "rcr_to_ccr_need_align_gemm_broadcast_b",
+            f"rcr_to_ccr_need_align_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[7] if test_bias else None,
         )
 
@@ -459,11 +707,12 @@ def _test_gemm_broadcast_rcr_to_ccr(self, test_bias):
             [8, 4],
             "gemm_rcr",
             "bmm_ccr",
-            "rcr_to_ccr_dynamic_gemm_broadcast_b",
+            f"rcr_to_ccr_dynamic_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
 
-    def _test_gemm_broadcast_rcr_to_rrr(self, test_bias):
+    def _test_gemm_broadcast_rcr_to_rrr(self, test_bias, dtype="float16"):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -472,7 +721,8 @@ def _test_gemm_broadcast_rcr_to_rrr(self, test_bias):
             [4, 8],
             "gemm_rcr",
             "bmm_rrr",
-            "rcr_to_rrr_gemm_broadcast_b",
+            f"rcr_to_rrr_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
         self._test_permute_bmm(
@@ -481,7 +731,8 @@ def _test_gemm_broadcast_rcr_to_rrr(self, test_bias):
             [5, 7],
             "gemm_rcr",
             "bmm_rrr",
-            "rcr_to_rrr_need_align_gemm_broadcast_b",
+            f"rcr_to_rrr_need_align_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[7] if test_bias else None,
         )
 
@@ -493,11 +744,12 @@ def _test_gemm_broadcast_rcr_to_rrr(self, test_bias):
             [4, 8],
             "gemm_rcr",
             "bmm_rrr",
-            "rcr_to_rrr_dynamic_gemm_broadcast_b",
+            f"rcr_to_rrr_dynamic_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
 
-    def _test_gemm_broadcast_rrr_to_crr(self, test_bias):
+    def _test_gemm_broadcast_rrr_to_crr(self, test_bias, dtype="float16"):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -506,7 +758,8 @@ def _test_gemm_broadcast_rrr_to_crr(self, test_bias):
             [4, 8],
             "gemm_rrr",
             "bmm_crr",
-            "rrr_to_crr_gemm_broadcast_b",
+            f"rrr_to_crr_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
         self._test_permute_bmm(
@@ -515,7 +768,8 @@ def _test_gemm_broadcast_rrr_to_crr(self, test_bias):
             [5, 7],
             "gemm_rrr",
             "bmm_crr",
-            "rrr_to_crr_need_align_gemm_broadcast_b",
+            f"rrr_to_crr_need_align_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[7] if test_bias else None,
         )
 
@@ -527,7 +781,8 @@ def _test_gemm_broadcast_rrr_to_crr(self, test_bias):
             [4, 8],
             "gemm_rrr",
             "bmm_crr",
-            "rrr_to_crr_dynamic_gemm_broadcast_b",
+            f"rrr_to_crr_dynamic_gemm_broadcast_b_{dtype}",
+            dtype,
             bias_shape=[8] if test_bias else None,
         )
 
@@ -539,12 +794,26 @@ def test_gemm_broadcast_rrr_to_crr(self):
         self._test_gemm_broadcast_rrr_to_crr(True)
         self._test_gemm_broadcast_rrr_to_crr(False)
 
-    def test_permute_multiple_consumer(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_broadcast_float(self):
+        self._test_gemm_broadcast_rcr_to_ccr(True, dtype="float")
+        self._test_gemm_broadcast_rrr_to_crr(False, dtype="float")
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_permute_multiple_consumer(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         A_shape = [2, 8, 4]
         B_shape = [2, 8, 8]
 
-        A = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        B1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        A = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        B1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
 
         permA = ops.permute021()(A)
 
@@ -555,14 +824,15 @@ def test_permute_multiple_consumer(self):
         output._attrs["name"] = "output_0"
         output._attrs["is_output"] = True
 
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "permute_multiple_consumer")
+        module = compile_model(
+            output, target, "./tmp", f"permute_multiple_consumer_{dtype}"
+        )
 
         graph = module.debug_sorted_graph
         bmm_tensors = 0
         for tensor in graph:
             src_ops = tensor.src_ops()
-            if len(src_ops) != 1:
+            if len(src_ops) != 2:
                 continue
             src_op = list(tensor.src_ops())[0]
             if src_op._attrs["op"].startswith("bmm"):
@@ -570,16 +840,16 @@ def test_permute_multiple_consumer(self):
                 self.assertEqual(src_op._attrs["op"], "bmm_crr")
         self.assertEqual(bmm_tensors, 1)
 
-        A_pt = torch.randn(*A_shape).cuda().half()
+        A_pt = get_random_torch_tensor(A_shape, dtype)
         AT_pt = A_pt.permute((0, 2, 1))
-        B1_pt = torch.randn(*B_shape).cuda().half()
+        B1_pt = get_random_torch_tensor(B_shape, dtype)
 
         C1_pt = torch.bmm(AT_pt, B1_pt)
         C2_pt = torch.cos(AT_pt)
 
         Y_pt = torch.concat((C1_pt, C2_pt), dim=0)
 
-        y = torch.empty([4, 4, 8]).cuda().half()
+        y = get_torch_empty_tensor([4, 4, 8], dtype)
         input_name_to_index = module.get_input_name_to_index_map()
         inputs = [0, 0]
         inputs[input_name_to_index["input_0"]] = A_pt
@@ -588,13 +858,18 @@ def test_permute_multiple_consumer(self):
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_permute_multiple_only_bmm_consumer(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_permute_multiple_only_bmm_consumer(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         A_shape = [2, 8, 4]
         B_shape = [2, 8, 8]
 
-        A = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        B1 = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        B2 = Tensor(shape=B_shape, dtype="float16", name="input_2", is_input=True)
+        A = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        B1 = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        B2 = Tensor(shape=B_shape, dtype=dtype, name="input_2", is_input=True)
 
         permA = ops.permute021()(A)
 
@@ -605,34 +880,35 @@ def test_permute_multiple_only_bmm_consumer(self):
         output._attrs["name"] = "output_0"
         output._attrs["is_output"] = True
 
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "permute_multiple_bmm_consumer")
+        module = compile_model(
+            output, target, "./tmp", f"permute_multiple_bmm_consumer_{dtype}"
+        )
 
         graph = module.debug_sorted_graph
         bmm_tensors = 0
         for tensor in graph:
             src_ops = tensor.src_ops()
-            if len(src_ops) != 1:
+            if len(src_ops) != 2:
                 continue
-            src_op = list(tensor.src_ops())[0]
-            # All permutes should've be gone.
-            self.assertFalse(src_op._attrs["op"].startswith("permute"))
-            if src_op._attrs["op"].startswith("bmm"):
-                bmm_tensors += 1
-                self.assertEqual(src_op._attrs["op"], "bmm_crr")
+            for src_op in list(tensor.src_ops()):
+                # All permutes should've be gone.
+                self.assertFalse(src_op._attrs["op"].startswith("permute"))
+                if src_op._attrs["op"].startswith("bmm"):
+                    bmm_tensors += 1
+                    self.assertEqual(src_op._attrs["op"], "bmm_crr")
         self.assertEqual(bmm_tensors, 2)
 
-        A_pt = torch.randn(*A_shape).cuda().half()
+        A_pt = get_random_torch_tensor(A_shape, dtype)
         AT_pt = A_pt.permute((0, 2, 1))
-        B1_pt = torch.randn(*B_shape).cuda().half()
-        B2_pt = torch.randn(*B_shape).cuda().half()
+        B1_pt = get_random_torch_tensor(B_shape, dtype)
+        B2_pt = get_random_torch_tensor(B_shape, dtype)
 
         C1_pt = torch.bmm(AT_pt, B1_pt)
         C2_pt = torch.bmm(AT_pt, B2_pt)
 
         Y_pt = torch.concat((C1_pt, C2_pt), dim=0)
 
-        y = torch.empty([4, 4, 8]).cuda().half()
+        y = get_torch_empty_tensor([4, 4, 8], dtype)
         input_name_to_index = module.get_input_name_to_index_map()
         inputs = [0, 0, 0]
         inputs[input_name_to_index["input_0"]] = A_pt
diff --git a/tests/unittest/compiler/test_fuse_permute_gemm.py b/tests/unittest/compiler/test_fuse_permute_gemm.py
index 4d2541bc8..051ea6a01 100644
--- a/tests/unittest/compiler/test_fuse_permute_gemm.py
+++ b/tests/unittest/compiler/test_fuse_permute_gemm.py
@@ -20,38 +20,65 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+from parameterized import parameterized
 
 
 class FusePermuteGemmTestCase(unittest.TestCase):
-    def test_no_fusion_odd_alignment(self):
-        x = Tensor([32, 51], is_input=True)
-        w = Tensor([32, 51], is_input=True)
+    @parameterized.expand([("float16"), ("float")])
+    def test_no_fusion_odd_alignment(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        x = Tensor([32, 51], dtype=dtype, is_input=True)
+        w = Tensor([32, 51], dtype=dtype, is_input=True)
         y = ops.permute()(x, dims=[1, 0])
         z = ops.gemm_rrr()(w, y)
         z._attrs["is_output"] = True
         z._attrs["name"] = "z"
 
         module = compile_model(
-            z, detect_target(), "./tmp", "test_no_fusion_odd_alignment"
+            z, target, "./tmp", f"test_no_fusion_odd_alignment_{dtype}"
         )
-        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
-        self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
+        if dtype == "float":
+            self.assertFalse(
+                test_utils.graph_has_op(module.debug_sorted_graph, "permute")
+            )
+            self.assertTrue(
+                test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rcr")
+            )
+        elif dtype == "float16":
+            self.assertTrue(
+                test_utils.graph_has_op(module.debug_sorted_graph, "permute")
+            )
+            self.assertTrue(
+                test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr")
+            )
+        else:
+            raise RuntimeError("invalid {dtype=}")
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_gemm_rrr_to_rcr(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
-    def test_gemm_rrr_to_rcr(self):
-        x = Tensor([32, 52], is_input=True, name="x")
-        w = Tensor([32, 52], is_input=True, name="w")
+        x = Tensor([32, 52], dtype=dtype, is_input=True, name="x")
+        w = Tensor([32, 52], dtype=dtype, is_input=True, name="w")
         y = ops.permute()(x, dims=[1, 0])
         z = ops.gemm_rrr()(w, y)
         z._attrs["is_output"] = True
         z._attrs["name"] = "z"
 
-        module = compile_model(z, detect_target(), "./tmp", "test_gemm_rrr_to_rcr")
+        module = compile_model(z, target, "./tmp", f"test_gemm_rrr_to_rcr_{dtype}")
         self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
         self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
         self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rcr"))
 
-        x_pt = torch.randn(32, 52).half().cuda()
-        w_pt = torch.randn(32, 52).half().cuda()
+        x_pt = get_random_torch_tensor([32, 52], dtype)
+        w_pt = get_random_torch_tensor([32, 52], dtype)
         y_pt = x_pt.t()
         z_pt = torch.matmul(w_pt, y_pt)
         z_ait = torch.empty_like(z_pt)
@@ -59,9 +86,14 @@ def test_gemm_rrr_to_rcr(self):
 
         torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
 
-    def test_gemm_rcr_to_rrr(self):
-        x = Tensor([52, 32], is_input=True, name="x")
-        w = Tensor([32, 52], is_input=True, name="w")
+    @parameterized.expand([("float16"), ("float")])
+    def test_gemm_rcr_to_rrr(self, dtype):
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        x = Tensor([52, 32], dtype=dtype, is_input=True, name="x")
+        w = Tensor([32, 52], dtype=dtype, is_input=True, name="w")
         y = ops.permute()(x, dims=[1, 0])
         z = ops.gemm_rcr()(w, y)
         z._attrs["is_output"] = True
@@ -69,18 +101,22 @@ def test_gemm_rcr_to_rrr(self):
 
         module = compile_model(
             z,
-            detect_target(),
+            target,
             "./tmp",
-            "test_gemm_rcr_to_rrr",
+            f"test_gemm_rcr_to_rrr_{dtype}",
         )
         self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "permute"))
         self.assertFalse(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rcr"))
         self.assertTrue(test_utils.graph_has_op(module.debug_sorted_graph, "gemm_rrr"))
 
-        x_pt = torch.randn(52, 32).half().cuda()
-        w_pt = torch.randn(32, 52).half().cuda()
+        x_pt = get_random_torch_tensor([52, 32], dtype)
+        w_pt = get_random_torch_tensor([32, 52], dtype)
         z_pt = torch.matmul(w_pt, x_pt)
         z_ait = torch.empty_like(z_pt)
         module.run_with_tensors({"x": x_pt, "w": w_pt}, {"z": z_ait})
 
         torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 73cd581ce..45b4ab545 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -23,12 +23,19 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
+
+from parameterized import parameterized
 from torch import nn
 
 
 class FusedElementwiseComplexDependencyTestCase(unittest.TestCase):
-    def test_fused_elementwise_direct_input_dependency(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_direct_input_dependency(self, dtype):
         r"""
             X0   X1
              \   /
@@ -40,24 +47,27 @@ def test_fused_elementwise_direct_input_dependency(self):
 
         Add_1, Add_2, and Sub_1 should be fused together.
         """
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
 
         M = 10
         N = 4
         X0 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             value=3.0,
         )
         X2 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -68,25 +78,24 @@ def test_fused_elementwise_direct_input_dependency(self):
         R2._attrs["name"] = "R2"
         R2._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             R2,
             target,
             "./tmp",
-            "fused_elementwise_direct_input_dependency",
+            f"fused_elementwise_direct_input_dependency_{dtype}",
         )
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
 
-        x0_pt = torch.rand(M, N).cuda().half()
-        x2_pt = torch.rand(M, N).cuda().half()
+        x0_pt = get_random_torch_tensor([M, N], dtype)
+        x2_pt = get_random_torch_tensor([M, N], dtype)
 
         r0_pt = x0_pt + 3 + x2_pt
         r1_pt = r0_pt + x2_pt
         r2_pt = r0_pt - r1_pt
 
-        r2 = torch.empty([M, N]).cuda().half()
+        r2 = get_torch_empty_tensor([M, N], dtype)
 
         input_name_to_idx_mapping = module.get_input_name_to_index_map()
         inputs = [None] * len(input_name_to_idx_mapping)
@@ -99,7 +108,97 @@ def test_fused_elementwise_direct_input_dependency(self):
         module.run_with_tensors(inputs, [r2])
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
 
-    def test_fused_elementwise_non_elementwise_ops(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_direct_input_dependency_split_subgraph(self, dtype):
+        r"""
+        X3[K,N]   X0[N]   X1[]
+           |         \   /
+           |     Add_1[N]  X2[M,N]
+            \      /  |  \    /
+             Add[K,N] |  Add_2[M, N]
+                       \     /
+                       Sub_1 [M,N]
+
+           Add_1, Add_2, and Sub_1 should be fused together.
+        """
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
+
+        M = 10
+        N = 4
+        K = 15
+        X0 = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="X0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="X1",
+            value=3.0,
+        )
+        X2 = Tensor(
+            shape=[M, N],
+            dtype=dtype,
+            name="X2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[K, N],
+            dtype=dtype,
+            name="X3",
+            is_input=True,
+        )
+
+        R0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        R1 = ops.elementwise(FuncEnum.ADD)(R0, X2)
+        R2 = ops.elementwise(FuncEnum.SUB)(R0, R1)
+        R3 = ops.elementwise(FuncEnum.ADD)(R0, X3)
+        R2._attrs["name"] = "R2"
+        R2._attrs["is_output"] = True
+        R3._attrs["name"] = "R3"
+        R3._attrs["is_output"] = True
+
+        module = compile_model(
+            [R3, R2],
+            target,
+            "./tmp",
+            f"fused_elementwise_direct_input_dependency_split_subgraph{dtype}",
+        )
+        debug_sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
+        self.assertEqual(len(sorted_ops), 2)
+
+        x0_pt = get_random_torch_tensor([N], dtype)  # N
+        x2_pt = get_random_torch_tensor([M, N], dtype)
+        x3_pt = get_random_torch_tensor([K, N], dtype)
+
+        r0_pt = x0_pt + 3
+        r3_pt = r0_pt + x3_pt
+        r1_pt = r0_pt + x2_pt
+        r2_pt = r0_pt - r1_pt
+
+        r2 = get_torch_empty_tensor([M, N], dtype)
+        r3 = get_torch_empty_tensor([K, N], dtype)  # N
+
+        input_name_to_idx_mapping = module.get_input_name_to_index_map()
+        inputs = [None] * len(input_name_to_idx_mapping)
+        input_name_to_pt_mapping = {
+            "X0": x0_pt,
+            "X2": x2_pt,
+            "X3": x3_pt,
+        }
+        for input_name, pt in input_name_to_pt_mapping.items():
+            inputs[input_name_to_idx_mapping[input_name]] = pt
+        module.run_with_tensors(inputs, [r3, r2])
+        self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_non_elementwise_ops(self, dtype):
         r"""
                 X0   X1 (3)
                  \   /
@@ -114,24 +213,27 @@ def test_fused_elementwise_non_elementwise_ops(self):
 
             Add_1, Add_2, and Sub_1 should be fused together.
         """
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
 
         M = 10
         N = 4
         X0 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             value=3.0,
         )
         X2 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -148,19 +250,18 @@ def test_fused_elementwise_non_elementwise_ops(self):
         R4._attrs["name"] = "R4"
         R4._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [R1, R2, R4],
             target,
             "./tmp",
-            "test_fused_elementwise_non_elementwise_ops",
+            f"test_fused_elementwise_non_elementwise_ops_{dtype}",
         )
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 4)
 
-        x0_pt = torch.rand(M, N).cuda().half()
-        x2_pt = torch.rand(M, N).cuda().half()
+        x0_pt = get_random_torch_tensor([M, N], dtype)
+        x2_pt = get_random_torch_tensor([M, N], dtype)
 
         r0_pt = x0_pt + 3
         r1_pt = r0_pt + x2_pt
@@ -168,9 +269,9 @@ def test_fused_elementwise_non_elementwise_ops(self):
         r3_pt = r0_pt.reshape([-1])
         r4_pt = r3_pt + r3_pt
 
-        r1 = torch.empty(r1_pt.shape).cuda().half()
-        r2 = torch.empty([M, N]).cuda().half()
-        r4 = torch.empty(r4_pt.shape).cuda().half()
+        r1 = get_torch_empty_tensor(r1_pt.shape, dtype)
+        r2 = get_torch_empty_tensor([M, N], dtype)
+        r4 = get_torch_empty_tensor(r4_pt.shape, dtype)
 
         input_name_to_idx_mapping = module.get_input_name_to_index_map()
         inputs = [None] * len(input_name_to_idx_mapping)
@@ -185,7 +286,8 @@ def test_fused_elementwise_non_elementwise_ops(self):
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
-    def test_fused_elementwise_indirect_input_dependency(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_indirect_input_dependency(self, dtype):
         r"""
             X0   X1
              \   /
@@ -199,25 +301,28 @@ def test_fused_elementwise_indirect_input_dependency(self):
 
         Tanh_1 and Sub_1 should be fused together.
         """
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         M = 10
         K = 4
         N = 4
         X0 = Tensor(
             shape=[M, K],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             value=3.0,
         )
         X2 = Tensor(
             shape=[K, N],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -229,26 +334,25 @@ def test_fused_elementwise_indirect_input_dependency(self):
         R3._attrs["name"] = "R3"
         R3._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             R3,
             target,
             "./tmp",
-            "fused_elementwise_indirect_input_dependency",
+            f"fused_elementwise_indirect_input_dependency_{dtype}",
         )
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 3)
 
-        x0_pt = torch.rand(M, K).cuda().half()
-        x2_pt = torch.rand(K, N).cuda().half()
+        x0_pt = get_random_torch_tensor([M, K], dtype)
+        x2_pt = get_random_torch_tensor([K, N], dtype)
 
         r0_pt = x0_pt + 3
         r1_pt = nn.functional.linear(r0_pt, x2_pt)
         r2_pt = torch.tanh(r1_pt)
         r3_pt = r0_pt - r2_pt
 
-        r3 = torch.empty([M, N]).cuda().half()
+        r3 = get_torch_empty_tensor([M, N], dtype)
 
         input_name_to_idx_mapping = module.get_input_name_to_index_map()
         inputs = [None] * len(input_name_to_idx_mapping)
@@ -261,7 +365,103 @@ def test_fused_elementwise_indirect_input_dependency(self):
         module.run_with_tensors(inputs, [r3])
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
 
-    def test_fused_elementwise_multi_dependency(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype):
+        r"""
+                X0[M,K] X1[]
+                 \      /
+                  Add_1      X2[K,N]
+                   |    \      /
+                   |     Gemm_1
+                   |        |
+        X3[P,M,N]  |      Tanh_1 (output)
+              \    |           |
+                Sub_1          |
+                   |          /
+                Sub_2 (output)
+            Tanh_1 and Sub_1 should be fused together.
+        """
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        M = 10
+        K = 4
+        N = 4
+        P = 15
+        X0 = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="X0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="X1",
+            value=3.0,
+        )
+        X2 = Tensor(
+            shape=[K, N],
+            dtype=dtype,
+            name="X2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[P, M, N],
+            dtype=dtype,
+            name="X3",
+            is_input=True,
+        )
+
+        R0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        R1 = ops.gemm_rcr()(R0, X2)
+        R2 = ops.elementwise(FuncEnum.TANH)(R1)
+        R3 = ops.elementwise(FuncEnum.SUB)(X3, R0)
+        R4 = ops.elementwise(FuncEnum.SUB)(R3, R2)
+        R3._attrs["name"] = "R3"
+        R3._attrs["is_output"] = True
+        R4._attrs["name"] = "R4"
+        R4._attrs["is_output"] = True
+
+        module = compile_model(
+            [R3, R4],
+            target,
+            "./tmp",
+            f"fused_elementwise_indirect_input_dependency_split_subgraph{dtype}",
+        )
+        debug_sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
+        self.assertEqual(len(sorted_ops), 4)
+
+        x0_pt = get_random_torch_tensor([M, K], dtype)
+        x2_pt = get_random_torch_tensor([K, N], dtype)
+        x3_pt = get_random_torch_tensor([P, M, N], dtype)
+
+        r0_pt = x0_pt + 3
+        r1_pt = nn.functional.linear(r0_pt, x2_pt)
+        r2_pt = torch.tanh(r1_pt)
+        r3_pt = x3_pt - r0_pt
+        r4_pt = r3_pt - r2_pt
+
+        r3 = get_torch_empty_tensor([P, M, N], dtype)
+        r4 = get_torch_empty_tensor([P, M, N], dtype)
+
+        input_name_to_idx_mapping = module.get_input_name_to_index_map()
+        inputs = [None] * len(input_name_to_idx_mapping)
+        input_name_to_pt_mapping = {
+            "X0": x0_pt,
+            "X2": x2_pt,
+            "X3": x3_pt,
+        }
+        for input_name, pt in input_name_to_pt_mapping.items():
+            inputs[input_name_to_idx_mapping[input_name]] = pt
+        module.run_with_tensors(inputs, [r3, r4])
+        self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_multi_dependency(self, dtype):
         r"""
             X0   X1                X3
              \   /                 |
@@ -278,37 +478,40 @@ def test_fused_elementwise_multi_dependency(self):
 
         Tanh_1, Sub_1, Sub_2 and Add_2 should be fused together.
         """
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         M = 10
         K = 4
         N = 4
         X0 = Tensor(
             shape=[M, K],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             value=3.0,
         )
         X2 = Tensor(
             shape=[K, N],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
         X3 = Tensor(
             shape=[M, K],
-            dtype="float16",
+            dtype=dtype,
             name="X3",
             is_input=True,
         )
         X4 = Tensor(
             shape=[K, N],
-            dtype="float16",
+            dtype=dtype,
             name="X4",
             is_input=True,
         )
@@ -335,10 +538,10 @@ def test_fused_elementwise_multi_dependency(self):
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 5)
 
-        x0_pt = torch.rand(M, K).cuda().half()
-        x2_pt = torch.rand(K, N).cuda().half()
-        x3_pt = torch.rand(M, K).cuda().half()
-        x4_pt = torch.rand(K, N).cuda().half()
+        x0_pt = get_random_torch_tensor([M, K], dtype)
+        x2_pt = get_random_torch_tensor([K, N], dtype)
+        x3_pt = get_random_torch_tensor([M, K], dtype)
+        x4_pt = get_random_torch_tensor([K, N], dtype)
 
         r0_pt = x0_pt + 3
         r1_pt = nn.functional.linear(r0_pt, x2_pt)
@@ -349,7 +552,7 @@ def test_fused_elementwise_multi_dependency(self):
         r6_pt = r4_pt - r5_pt
         r7_pt = r6_pt + r3_pt
 
-        r7 = torch.empty([M, N]).cuda().half()
+        r7 = get_torch_empty_tensor([M, N], dtype)
 
         input_name_to_idx_mapping = module.get_input_name_to_index_map()
         inputs = [None] * len(input_name_to_idx_mapping)
@@ -364,6 +567,86 @@ def test_fused_elementwise_multi_dependency(self):
         module.run_with_tensors(inputs, [r7])
         self.assertTrue(torch.allclose(r7, r7_pt, atol=1e-2, rtol=1e-2))
 
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_find_fusable_graph(self, dtype):
+        r"""
+                     X0
+                     |
+                    Abs
+                   /   \
+            X1   Tanh  |
+             \    /    |
+              Gemm   Relu
+                \      |
+                 Exp   |
+                   \  /
+                   Sub
+
+        Tanh, Abs, Relu should be fused together;  Sub, Exp should be fused together.
+        """
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        M = 10
+        K = 4
+        N = 4
+        X0 = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="X0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[K, N],
+            dtype=dtype,
+            name="X1",
+            is_input=True,
+        )
+
+        R0 = ops.elementwise(FuncEnum.ABS)(X0)
+        R1 = ops.elementwise(FuncEnum.TANH)(R0)
+        R2 = ops.gemm_rcr()(R1, X1)
+        R3 = ops.elementwise(FuncEnum.EXP)(R2)
+        R4 = ops.elementwise(FuncEnum.RELU)(R0)
+        R5 = ops.elementwise(FuncEnum.SUB)(R4, R3)
+        R5._attrs["name"] = "R5"
+        R5._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(
+            R5,
+            target,
+            "./tmp",
+            "fused_elementwise_find_fusable_graph",
+        )
+        debug_sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+
+        x0_pt = get_random_torch_tensor([M, K], dtype)
+        x1_pt = get_random_torch_tensor([K, N], dtype)
+        relu = torch.nn.ReLU()
+        r0_pt = torch.abs(x0_pt)
+        r1_pt = torch.tanh(r0_pt)
+        r2_pt = nn.functional.linear(r1_pt, x1_pt)
+        r3_pt = torch.exp(r2_pt)
+        r4_pt = relu(r0_pt)
+        r5_pt = r4_pt - r3_pt
+
+        r5 = get_torch_empty_tensor([M, N], dtype)
+
+        input_name_to_idx_mapping = module.get_input_name_to_index_map()
+        inputs = [None] * len(input_name_to_idx_mapping)
+        input_name_to_pt_mapping = {
+            "X0": x0_pt,
+            "X1": x1_pt,
+        }
+        for input_name, pt in input_name_to_pt_mapping.items():
+            inputs[input_name_to_idx_mapping[input_name]] = pt
+        module.run_with_tensors(inputs, [r5])
+        self.assertTrue(torch.allclose(r5, r5_pt, atol=1e-2, rtol=1e-2))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
index f7513b42b..d8abf7a10 100644
--- a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
+++ b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
@@ -23,11 +23,18 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+from parameterized import parameterized
 from torch import nn
 
 
 class FusedElementwiseOutOfOrderTestCase(unittest.TestCase):
-    def test_fused_elementwise_out_of_order(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_fused_elementwise_out_of_order(self, dtype):
         r"""
            X0   X1
             \   /
@@ -48,37 +55,40 @@ def test_fused_elementwise_out_of_order(self):
         New order:
         [X2, X4, R2, X0, X1, R0, X3, R1, R3, R4, R5]
         """
+        target = detect_target()
+        if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         M = 10
         K = 4
         N = 4
         X0 = Tensor(
             shape=[M, K],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             value=3.0,
         )
         X2 = Tensor(
             shape=[M, K],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
         X3 = Tensor(
             shape=[K, N],
-            dtype="float16",
+            dtype=dtype,
             name="X3",
             is_input=True,
         )
         X4 = Tensor(
             shape=[K, N],
-            dtype="float16",
+            dtype=dtype,
             name="X4",
             is_input=True,
         )
@@ -92,18 +102,17 @@ def test_fused_elementwise_out_of_order(self):
         R5._attrs["name"] = "R5"
         R5._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             R5,
             target,
             "./tmp",
-            "fused_elementwise_out_of_order",
+            f"fused_elementwise_out_of_order_{dtype}",
         )
 
-        x0_pt = torch.rand(M, K).cuda().half()
-        x2_pt = torch.rand(M, K).cuda().half()
-        x3_pt = torch.rand(K, N).cuda().half()
-        x4_pt = torch.rand(K, N).cuda().half()
+        x0_pt = get_random_torch_tensor([M, K], dtype)
+        x2_pt = get_random_torch_tensor([M, K], dtype)
+        x3_pt = get_random_torch_tensor([K, N], dtype)
+        x4_pt = get_random_torch_tensor([K, N], dtype)
 
         r0_pt = x0_pt + 3
         r1_pt = nn.functional.linear(r0_pt, x3_pt)
@@ -112,7 +121,7 @@ def test_fused_elementwise_out_of_order(self):
         r4_pt = nn.functional.linear(r3_pt, x4_pt)
         r5_pt = r1_pt - r4_pt
 
-        r5 = torch.empty([M, N]).cuda().half()
+        r5 = get_torch_empty_tensor([M, N], dtype)
 
         input_name_to_idx_mapping = module.get_input_name_to_index_map()
         inputs = [None] * len(input_name_to_idx_mapping)
diff --git a/tests/unittest/compiler/test_group_fusions.py b/tests/unittest/compiler/test_group_fusions.py
index a4b78bd48..b7a218187 100644
--- a/tests/unittest/compiler/test_group_fusions.py
+++ b/tests/unittest/compiler/test_group_fusions.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 import torch
@@ -19,30 +20,40 @@
 from aitemplate import compiler
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-from aitemplate.frontend import IntImm, Tensor
+from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import count_ops, has_op
-from aitemplate.utils import graph_utils, logger
+from aitemplate.testing.test_utils import (
+    count_ops,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    has_op,
+)
+from aitemplate.utils import graph_utils
 
 
-def _prepare_input_tensors(m, nk_groups, start=0, has_bias=True, only_params=False):
+_LOGGER = logging.getLogger(__name__)
+
+
+def _prepare_input_tensors(
+    m, nk_groups, dtype, start=0, has_bias=True, only_params=False
+):
     inputs = []
     for i, (n, k) in enumerate(nk_groups):
         X = Tensor(
             shape=[m, k],
-            dtype="float16",
+            dtype=dtype,
             name="x_{}".format(i + start),
             is_input=True,
         )
         W = Tensor(
             shape=[n, k],
-            dtype="float16",
+            dtype=dtype,
             name="w_{}".format(i + start),
             is_input=True,
         )
         B = Tensor(
             shape=[n],
-            dtype="float16",
+            dtype=dtype,
             name="b_{}".format(i + start),
             is_input=True,
         )
@@ -74,6 +85,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
         fuse_sigmoid_mul=True,
         num_group_ops=1,
         should_fail=False,
+        dtype="float16",
     ):
         if gamma_is_none or beta_is_none or len(input_shapes) <= 1:
             should_fail = True
@@ -82,8 +94,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
             if fuse_sigmoid_mul
             else "group_layernorm_fusion"
         )
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"{testname}: input_shapes={input_shapes}, "
             f"gamma_is_none={gamma_is_none}, beta_is_none={beta_is_none}",
         )
@@ -98,7 +109,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                         IntImm(shape[0]),
                         IntImm(shape[1]),
                     ],
-                    dtype="float16",
+                    dtype=dtype,
                     name="X_" + str(i),
                     is_input=True,
                 )
@@ -108,7 +119,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 if gamma_is_none
                 else Tensor(
                     shape=[IntImm(shape[1])],
-                    dtype="float16",
+                    dtype=dtype,
                     name="gamma_" + str(i),
                     is_input=True,
                 )
@@ -119,7 +130,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 if beta_is_none
                 else Tensor(
                     shape=[IntImm(shape[1])],
-                    dtype="float16",
+                    dtype=dtype,
                     name="beta_" + str(i),
                     is_input=True,
                 )
@@ -172,8 +183,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
 
         B = len(input_shapes)
 
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"Run test group_layernorm_sigmoid_mul. Input shapes: {input_shapes}",
         )
 
@@ -181,10 +191,14 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
         gammas_pt = []
         betas_pt = []
         for shape in input_shapes:
-            xs_pt.append(torch.randn(shape).cuda().half())
-            gamma_pt = None if gamma_is_none else torch.randn(shape[1]).cuda().half()
+            xs_pt.append(get_random_torch_tensor(shape, dtype))
+            gamma_pt = (
+                None if gamma_is_none else get_random_torch_tensor([shape[1]], dtype)
+            )
             gammas_pt.append(gamma_pt)
-            beta_pt = None if beta_is_none else torch.randn(shape[1]).cuda().half()
+            beta_pt = (
+                None if beta_is_none else get_random_torch_tensor([shape[1]], dtype)
+            )
             betas_pt.append(beta_pt)
 
         ys_pt = []
@@ -209,7 +223,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 inputs[input_name_to_index[f"beta_{i}"]] = betas_pt[i]
         ys = []
         for y_pt in ys_pt:
-            ys.append(torch.empty(y_pt.size()).cuda().half())
+            ys.append(get_torch_empty_tensor(y_pt.size(), dtype))
         module.run_with_tensors(inputs, ys)
         # module.benchmark_with_tensors(inputs, ys)
         for y_pt, y in zip(ys_pt, ys):
@@ -218,7 +232,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 f"max diff: {torch.max(y_pt - y)}, min diff: {torch.min(y_pt - y)}",
             )
 
-    def test_group_layernorm_sigmoid_mul_fusion(self):
+    def test_group_layernorm_sigmoid_mul_fusion_float16(self):
         self._test_group_layernorm_sigmoid_mul_cat_fusion(
             [[128, 256]], fuse_sigmoid_mul=True
         )
@@ -295,6 +309,284 @@ def test_group_layernorm_sigmoid_mul_fusion(self):
             num_group_ops=1,
         )
 
+    def test_group_layernorm_sigmoid_mul_fusion_float32(self):
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 256]] * 4, fuse_sigmoid_mul=True, dtype="float32"
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[10, 64], [10, 64], [10, 64]],
+            beta_is_none=True,
+            fuse_sigmoid_mul=True,
+            dtype="float32",
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 256]] * 4,
+            fuse_sigmoid_mul=False,
+            dtype="float32",
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[64, 64], [128, 256], [1, 125]],
+            fuse_sigmoid_mul=True,
+            should_fail=True,
+            dtype="float32",
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 128], [128, 256], [128, 125]],
+            fuse_sigmoid_mul=True,
+            add_size_op=True,
+            dtype="float32",
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 128], [128, 256], [128, 125], [128, 125]],
+            fuse_sigmoid_mul=True,
+            num_group_ops=2,
+            dtype="float32",
+        )
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [[128, 64]] * 39 + [[128, 256]] * 10,
+            fuse_sigmoid_mul=True,
+            num_group_ops=2,
+            dtype="float32",
+        )
+
+        # ctr_mbl_feed overarch cases
+        self._test_group_layernorm_sigmoid_mul_cat_fusion(
+            [
+                [2048, 256],
+                [2048, 256],
+                [2048, 128],
+                [2048, 128],
+                [2048, 128],
+                [2048, 128],
+                [2048, 128],
+                [2048, 1024],
+            ],
+            fuse_sigmoid_mul=True,
+            num_group_ops=1,
+            dtype="float32",
+        )
+
+    def test_layernorm_with_cycles(self):
+        """
+        The test basically forms the following subgraph:
+
+        layernorm_sigmoid_mul_1 = layernorm_sigmoid_mul(...)
+        gemm_rcr_2 = gemm_rcr(layernorm_sigmoid_mul_1)
+        layernorm_3 = layernorm(gemm_rcr_2)
+        layernorm_4 = layernorm(...)
+        gemm_rcr_5 = gemm_rcr(layernorm_4)
+        layernorm_sigmoid_mul_6 = layernorm_sigmoid_mul(gemm_rcr_5)
+
+        For example, grouping (layernorm_sigmoid_mul_1, layernorm_sigmoid_mul_6)
+        and (gemm_rcr_2, gemm_rcr_5) at the same time would introduce a cycle
+        between the fused group ops, because we have the following dependency:
+            layernorm_sigmoid_mul_1 -> gemm_rcr_2
+            gemm_rcr_5 -> layernrom_sigmoid_mul_6
+        """
+        torch.manual_seed(0)
+        testname = "layernorm_with_cycles_0"
+        dtype = "float16"
+        batch_sizes = [1, 2048]
+        eps = 1e-5
+
+        Input0 = Tensor(
+            shape=[IntVar(values=batch_sizes, name="batch"), IntImm(value=1024)],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        reshape_to_shape_0 = [-1, 32, 32]
+        reshape_0 = ops.reshape()(Input0, reshape_to_shape_0)
+
+        W0 = Tensor(shape=[IntImm(16), IntImm(32)], name="w0", is_input=True)
+        gemm_rcr_0 = ops.gemm_rcr()(reshape_0, W0)
+
+        reshape_to_shape_1 = [-1, 512]
+        reshape_1 = ops.reshape()(gemm_rcr_0, reshape_to_shape_1)
+
+        Input1 = Tensor(
+            shape=[IntVar(values=batch_sizes, name="batch"), IntImm(value=512)],
+            dtype=dtype,
+            name="input1",
+            is_input=True,
+        )
+        elementwise_0 = ops.elementwise(func_enum=FuncEnum.MUL)(reshape_1, Input1)
+
+        W1 = Tensor(shape=[IntImm(3821), IntImm(512)], name="w1", is_input=True)
+        gemm_rcr_1 = ops.gemm_rcr()(elementwise_0, W1)
+
+        concat_dim = 1
+        concatenate_0 = ops.concatenate()([Input1, gemm_rcr_1], concat_dim)
+
+        Gamma0 = Tensor(shape=[IntImm(4333)], name="gamma0", is_input=True)
+        Beta0 = Tensor(shape=[IntImm(4333)], name="beta0", is_input=True)
+        layernorm_0 = ops.layernorm(normalized_shape=None)(
+            concatenate_0, Gamma0, Beta0, [IntImm(4333)], eps
+        )
+
+        Input2 = Tensor(
+            shape=[IntVar(values=batch_sizes, name="batch"), IntImm(value=256)],
+            dtype=dtype,
+            name="input2",
+            is_input=True,
+        )
+        W2 = Tensor(shape=[IntImm(256), IntImm(256)], name="w2", is_input=True)
+        gemm_rcr_2 = ops.gemm_rcr()(Input2, W2)
+
+        Gamma1 = Tensor(shape=[IntImm(256)], name="gamma1", is_input=True)
+        Beta1 = Tensor(shape=[IntImm(256)], name="beta1", is_input=True)
+        layernorm_1 = ops.layernorm(normalized_shape=None)(
+            gemm_rcr_2, Gamma1, Beta1, [IntImm(256)], eps
+        )
+        elementwise_1 = ops.elementwise(func_enum=FuncEnum.SIGMOID)(layernorm_1)
+        elementwise_2 = ops.elementwise(func_enum=FuncEnum.MUL)(
+            gemm_rcr_2, elementwise_1
+        )
+
+        W3 = Tensor(shape=[IntImm(2048), IntImm(256)], name="w3", is_input=True)
+        gemm_rcr_3 = ops.gemm_rcr()(elementwise_2, W3)
+
+        Gamma2 = Tensor(shape=[IntImm(2048)], name="gamma2", is_input=True)
+        Beta2 = Tensor(shape=[IntImm(2048)], name="beta2", is_input=True)
+        layernorm_2 = ops.layernorm(normalized_shape=None)(
+            gemm_rcr_3, Gamma2, Beta2, [IntImm(2048)], eps
+        )
+
+        Input3 = Tensor(
+            shape=[IntVar(values=batch_sizes, name="batch"), IntImm(value=1320)],
+            dtype=dtype,
+            name="input3",
+            is_input=True,
+        )
+        Gamma3 = Tensor(shape=[IntImm(1320)], name="gamma3", is_input=True)
+        Beta3 = Tensor(shape=[IntImm(1320)], name="beta3", is_input=True)
+        layernorm_3 = ops.layernorm(normalized_shape=None)(
+            Input3, Gamma3, Beta3, [IntImm(1320)], eps
+        )
+
+        W4 = Tensor(shape=[IntImm(128), IntImm(1320)], name="w4", is_input=True)
+        gemm_rcr_4 = ops.gemm_rcr()(layernorm_3, W4)
+
+        Gamma4 = Tensor(shape=[IntImm(128)], name="gamma4", is_input=True)
+        Beta4 = Tensor(shape=[IntImm(128)], name="beta4", is_input=True)
+        layernorm_4 = ops.layernorm(normalized_shape=None)(
+            gemm_rcr_4, Gamma4, Beta4, [IntImm(128)], eps
+        )
+        elementwise_3 = ops.elementwise(func_enum=FuncEnum.SIGMOID)(layernorm_4)
+        elementwise_4 = ops.elementwise(func_enum=FuncEnum.MUL)(
+            gemm_rcr_4, elementwise_3
+        )
+
+        output_0 = ops.concatenate()(
+            [elementwise_4, layernorm_3, layernorm_0, layernorm_2], concat_dim
+        )
+        output_0._attrs["name"] = "output_0"
+        output_0._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(
+            [output_0],
+            target,
+            "./tmp",
+            testname,
+        )
+
+        for batch in batch_sizes:
+            input0_pt = get_random_torch_tensor([batch, 1024], dtype)
+            reshape_0_pt = torch.reshape(input0_pt, reshape_to_shape_0)
+
+            w0_pt = get_random_torch_tensor([16, 32], dtype)
+            gemm_rcr_0_pt = torch.nn.functional.linear(reshape_0_pt, w0_pt)
+
+            reshape_1_pt = torch.reshape(gemm_rcr_0_pt, reshape_to_shape_1)
+
+            input1_pt = get_random_torch_tensor([batch, 512], dtype)
+            elementwise_0_pt = reshape_1_pt * input1_pt
+
+            w1_pt = get_random_torch_tensor([3821, 512], dtype)
+            gemm_rcr_1_pt = torch.nn.functional.linear(elementwise_0_pt, w1_pt)
+
+            concatenate_0_pt = torch.cat([input1_pt, gemm_rcr_1_pt], concat_dim)
+
+            gamma0_pt = get_random_torch_tensor([4333], dtype)
+            beta0_pt = get_random_torch_tensor([4333], dtype)
+            layernorm_0_pt = torch.nn.functional.layer_norm(
+                concatenate_0_pt,
+                concatenate_0_pt.size()[1:],
+                gamma0_pt,
+                beta0_pt,
+                eps=eps,
+            )
+
+            input2_pt = get_random_torch_tensor([batch, 256], dtype)
+            w2_pt = get_random_torch_tensor([256, 256], dtype)
+            gemm_rcr_2_pt = torch.nn.functional.linear(input2_pt, w2_pt)
+
+            gamma1_pt = get_random_torch_tensor([256], dtype)
+            beta1_pt = get_random_torch_tensor([256], dtype)
+            layernorm_1_pt = torch.nn.functional.layer_norm(
+                gemm_rcr_2_pt, gemm_rcr_2_pt.size()[1:], gamma1_pt, beta1_pt, eps=eps
+            )
+            elementwise_1_pt = torch.sigmoid(layernorm_1_pt)
+            elementwise_2_pt = torch.mul(gemm_rcr_2_pt, elementwise_1_pt)
+
+            w3_pt = get_random_torch_tensor([2048, 256], dtype)
+            gemm_rcr_3_pt = torch.nn.functional.linear(elementwise_2_pt, w3_pt)
+
+            gamma2_pt = get_random_torch_tensor([2048], dtype)
+            beta2_pt = get_random_torch_tensor([2048], dtype)
+            layernorm_2_pt = torch.nn.functional.layer_norm(
+                gemm_rcr_3_pt, gemm_rcr_3_pt.size()[1:], gamma2_pt, beta2_pt, eps=eps
+            )
+
+            input3_pt = get_random_torch_tensor([batch, 1320], dtype)
+            gamma3_pt = get_random_torch_tensor([1320], dtype)
+            beta3_pt = get_random_torch_tensor([1320], dtype)
+            layernorm_3_pt = torch.nn.functional.layer_norm(
+                input3_pt, input3_pt.size()[1:], gamma3_pt, beta3_pt, eps=eps
+            )
+
+            w4_pt = get_random_torch_tensor([128, 1320], dtype)
+            gemm_rcr_4_pt = torch.nn.functional.linear(layernorm_3_pt, w4_pt)
+
+            gamma4_pt = get_random_torch_tensor([128], dtype)
+            beta4_pt = get_random_torch_tensor([128], dtype)
+            layernorm_4_pt = torch.nn.functional.layer_norm(
+                gemm_rcr_4_pt, gemm_rcr_4_pt.size()[1:], gamma4_pt, beta4_pt, eps=eps
+            )
+            elementwise_3_pt = torch.sigmoid(layernorm_4_pt)
+            elementwise_4_pt = torch.mul(gemm_rcr_4_pt, elementwise_3_pt)
+
+            output_0_pt = torch.cat(
+                [elementwise_4_pt, layernorm_3_pt, layernorm_0_pt, layernorm_2_pt],
+                concat_dim,
+            )
+
+            inputs = {
+                "input0": input0_pt,
+                "input1": input1_pt,
+                "input2": input2_pt,
+                "input3": input3_pt,
+                "w0": w0_pt,
+                "w1": w1_pt,
+                "w2": w2_pt,
+                "w3": w3_pt,
+                "w4": w4_pt,
+                "gamma0": gamma0_pt,
+                "beta0": beta0_pt,
+                "gamma1": gamma1_pt,
+                "beta1": beta1_pt,
+                "gamma2": gamma2_pt,
+                "beta2": beta2_pt,
+                "gamma3": gamma3_pt,
+                "beta3": beta3_pt,
+                "gamma4": gamma4_pt,
+                "beta4": beta4_pt,
+            }
+            y = torch.empty_like(output_0_pt)
+            module.run_with_tensors(inputs, [y])
+            self.assertTrue(torch.allclose(output_0_pt, y, atol=0.03, rtol=0.03))
+
     def _test_group_gemm_fusion(
         self,
         m,
@@ -303,9 +595,9 @@ def _test_group_gemm_fusion(
         has_relu=False,
         has_sigmoid=False,
         should_fail=False,
+        dtype="float16",
     ):
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"Running _test_group_gemm_fusion, m = {m}, nk_groups = {nk_groups}, "
             f"has_bias = {has_bias}, has_relu = {has_relu}, has_sigmoid = {has_sigmoid}, "
             f"should_fail = {should_fail}",
@@ -327,7 +619,9 @@ def _test_group_gemm_fusion(
             op = ops.gemm_rcr
             op_type = "group_gemm_rcr"
 
-        group_input_tensors = _prepare_input_tensors(m, nk_groups, has_bias=has_bias)
+        group_input_tensors = _prepare_input_tensors(
+            m, nk_groups, dtype, has_bias=has_bias
+        )
         graph = []
         for i, group in enumerate(group_input_tensors):
             Y = op()(*group)
@@ -349,7 +643,7 @@ def _test_group_gemm_fusion(
             else:
                 assert not has_op(sorted_ops, op_type)
 
-    def test_group_gemm_fusion(self):
+    def test_group_gemm_fusion_float16(self):
         self._test_group_gemm_fusion(1024, [[16, 64], [32, 32]])
         self._test_group_gemm_fusion(1024, [[16, 64], [32, 40]], has_bias=False)
         self._test_group_gemm_fusion(
@@ -363,6 +657,23 @@ def test_group_gemm_fusion(self):
         self._test_group_gemm_fusion(1024, [[16, 44], [32, 32]], should_fail=True)
         self._test_group_gemm_fusion(1024, [[16, 13], [32, 1]], should_fail=True)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_group_gemm_fusion_float32(self):
+        self._test_group_gemm_fusion(32, [[16, 64], [32, 32]], dtype="float32")
+        self._test_group_gemm_fusion(
+            32, [[16, 64], [32, 40]], has_bias=False, dtype="float32"
+        )
+        self._test_group_gemm_fusion(
+            32, [[16, 64], [32, 40], [75, 128]], has_relu=True, dtype="float32"
+        )
+        # test misalignment
+        self._test_group_gemm_fusion(
+            32, [[16, 13], [32, 1]], should_fail=True, dtype="float32"
+        )
+
     def _test_split_group_gemm_fusion(
         self,
         m,
@@ -371,9 +682,9 @@ def _test_split_group_gemm_fusion(
         split_dim=1,
         should_fail=False,
         num_group_ops=2,
+        dtype="float16",
     ):
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"Running _test_split_group_gemm_fusion, m = {m}, nk_groups_1 = {nk_groups_1}, "
             f"nk_groups_2 = {nk_groups_2}, split_dim = {split_dim}, should_fail: {should_fail}, "
             f"num_group_ops = {num_group_ops}",
@@ -381,10 +692,15 @@ def _test_split_group_gemm_fusion(
         op_type = "group_gemm_rcr_bias"
 
         inputs1 = _prepare_input_tensors(
-            m, nk_groups_1, has_bias=True, only_params=True
+            m, nk_groups_1, dtype, has_bias=True, only_params=True
         )
         inputs2 = _prepare_input_tensors(
-            m, nk_groups_2, start=len(nk_groups_1), has_bias=True, only_params=False
+            m,
+            nk_groups_2,
+            dtype,
+            start=len(nk_groups_1),
+            has_bias=True,
+            only_params=False,
         )
 
         if split_dim == 1:
@@ -392,7 +708,7 @@ def _test_split_group_gemm_fusion(
             K = sum(split_sizes)
             X = Tensor(
                 shape=[m, K],
-                dtype="float16",
+                dtype=dtype,
                 name="input",
                 is_input=True,
             )
@@ -400,7 +716,7 @@ def _test_split_group_gemm_fusion(
             split_sizes = m
             X = Tensor(
                 shape=[m * len(nk_groups_1), nk_groups_1[0][1]],
-                dtype="float16",
+                dtype=dtype,
                 name="input",
                 is_input=True,
             )
@@ -438,7 +754,7 @@ def _test_split_group_gemm_fusion(
                 assert not has_op(sorted_ops, "split")
                 assert count_ops(sorted_ops, op_type) == num_group_ops
 
-    def test_split_group_gemm_fusion(self):
+    def test_split_group_gemm_fusion_float16(self):
         self._test_split_group_gemm_fusion(
             1024, [[16, 64], [16, 40], [16, 128]], [[1, 16], [3, 48]], num_group_ops=2
         )
@@ -458,6 +774,28 @@ def test_split_group_gemm_fusion(self):
             num_group_ops=1,
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_split_group_gemm_fusion_float32(self):
+        self._test_split_group_gemm_fusion(
+            32,
+            [[16, 64], [16, 40], [16, 128]],
+            [[1, 16], [3, 48]],
+            num_group_ops=2,
+            dtype="float32",
+        )
+        self._test_split_group_gemm_fusion(
+            48,
+            [[16, 64], [16, 64], [16, 64]],
+            [[1, 16], [3, 48]],
+            split_dim=0,
+            should_fail=True,
+            num_group_ops=1,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_memory_planning.py b/tests/unittest/compiler/test_memory_planning.py
index e2ce76b5a..a819c3d5d 100644
--- a/tests/unittest/compiler/test_memory_planning.py
+++ b/tests/unittest/compiler/test_memory_planning.py
@@ -22,12 +22,21 @@
 from aitemplate.compiler.base import Operator
 from aitemplate.frontend import IntImm, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+from parameterized import parameterized
 
 
 class MemoryPlanningTestCase(unittest.TestCase):
-    def test_memory_planning_with_tensor_views(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_memory_planning_with_tensor_views(self, dtype):
         target = detect_target()
-        dtype = "float16"
+        if dtype == "float32" and target.name == "rocm":
+            self.skipTest("float tensors not supported by rocm")
+
         # batch_size = [4, 16] # reduce_sum doesn't work with dynamic shape
         batch_size = [4]
         in_shape = [32, 16, 8]
@@ -72,7 +81,7 @@ def test_memory_planning_with_tensor_views(self):
 
         for b in batch_size:
             X_shape = [b] + in_shape
-            x_pt = torch.randn(X_shape).cuda().half()
+            x_pt = get_random_torch_tensor(X_shape, dtype)
             t0_pt = torch.sum(x_pt, dim=3, keepdim=True)
             t1_pt = torch.sum(t0_pt, dim=2, keepdim=True)
             t2_pt = torch.reshape(t1_pt, [-1, 32])
@@ -80,7 +89,7 @@ def test_memory_planning_with_tensor_views(self):
             t4_pt = torch.sum(t3_pt, dim=2, keepdim=False)
             out_pt = torch.add(t2_pt, t4_pt).flatten()
 
-            out = torch.empty(out_pt.size()).cuda().half()
+            out = get_torch_empty_tensor(out_pt.size(), dtype)
             module.run_with_tensors([x_pt], [out])
             self.assertTrue(torch.allclose(out_pt, out, atol=1e-1, rtol=1e-2))
 
diff --git a/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py b/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
index 515959224..25909a2a0 100644
--- a/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
+++ b/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
@@ -21,22 +21,31 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger, shape_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import shape_utils
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class PadBmmBiasWithCatTestCase(unittest.TestCase):
-    def _test_pad_bmm_rrr_bias_with_cat(self, test_name, bs, ms, n, k1, k2):
+    def _test_pad_bmm_rrr_bias_with_cat(
+        self, test_name, bs, ms, n, k1, k2, dtype="float16"
+    ):
         k = k1 + k2
         b_dim = shape_utils.gen_int_var_min_max(bs, name="b")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
-        X1 = Tensor(shape=[b_dim, m_dim, k1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[b_dim, m_dim, k2], dtype="float16", name="x2", is_input=True)
+        X1 = Tensor(shape=[b_dim, m_dim, k1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[b_dim, m_dim, k2], dtype=dtype, name="x2", is_input=True)
         X4 = ops.concatenate()([X1, X2], dim=2)
 
-        W1 = Tensor(shape=[b_dim, k, n], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[b_dim, m_dim, n], dtype="float16", name="b1", is_input=True)
-        W2 = Tensor(shape=[b_dim, k, n], dtype="float16", name="w2", is_input=True)
-        B2 = Tensor(shape=[b_dim, m_dim, n], dtype="float16", name="b2", is_input=True)
+        W1 = Tensor(shape=[b_dim, k, n], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[b_dim, m_dim, n], dtype=dtype, name="b1", is_input=True)
+        W2 = Tensor(shape=[b_dim, k, n], dtype=dtype, name="w2", is_input=True)
+        B2 = Tensor(shape=[b_dim, m_dim, n], dtype=dtype, name="b2", is_input=True)
         Y1 = ops.bmm_rrr_add()(X4, W1, B1)
         Y2 = ops.bmm_rrr_add()(X4, W2, B2)
 
@@ -46,24 +55,24 @@ def _test_pad_bmm_rrr_bias_with_cat(self, test_name, bs, ms, n, k1, k2):
 
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Skip this test on SM75")
+            _LOGGER.warning("Skip this test on SM75")
             return
         module = compile_model(
-            [Y], target, "./tmp", f"test_bmm_rrr_padding_{test_name}"
+            [Y], target, "./tmp", f"test_bmm_rrr_padding_{test_name}_{dtype}"
         )
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
 
         for b, m in itertools.product(bs, ms):
-            X1_pt = torch.randn(b, m, k1).cuda().half()
-            X2_pt = torch.randn(b, m, k2).cuda().half()
+            X1_pt = get_random_torch_tensor([b, m, k1], dtype)
+            X2_pt = get_random_torch_tensor([b, m, k2], dtype)
             X4_pt = torch.cat([X1_pt, X2_pt], dim=2)
 
-            W1_pt = torch.randn(b, k, n).cuda().half()
-            B1_pt = torch.randn(b, m, n).cuda().half()
-            W2_pt = torch.randn(b, k, n).cuda().half()
-            B2_pt = torch.randn(b, m, n).cuda().half()
+            W1_pt = get_random_torch_tensor([b, k, n], dtype)
+            B1_pt = get_random_torch_tensor([b, m, n], dtype)
+            W2_pt = get_random_torch_tensor([b, k, n], dtype)
+            B2_pt = get_random_torch_tensor([b, m, n], dtype)
 
             Y1_pt = torch.baddbmm(B1_pt, X4_pt, W1_pt)
             Y2_pt = torch.baddbmm(B2_pt, X4_pt, W2_pt)
@@ -78,11 +87,11 @@ def _test_pad_bmm_rrr_bias_with_cat(self, test_name, bs, ms, n, k1, k2):
             inputs[name_to_idx["b1"]] = B1_pt
             inputs[name_to_idx["b2"]] = B2_pt
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_pad_bmm_rrr_bias_with_cat(self):
+    def test_pad_bmm_rrr_bias_with_cat_float16(self):
         self._test_pad_bmm_rrr_bias_with_cat(
             "static_odd_k", bs=[2], ms=[64], n=32, k1=3, k2=10
         )
@@ -93,6 +102,25 @@ def test_pad_bmm_rrr_bias_with_cat(self):
             "dynamic_odd_kn", bs=[1, 2, 3], ms=[2, 5, 7], n=15, k1=1, k2=2
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_pad_bmm_rrr_bias_with_cat_float32(self):
+        self._test_pad_bmm_rrr_bias_with_cat(
+            "static_odd_k", bs=[2], ms=[64], n=32, k1=3, k2=10, dtype="float32"
+        )
+        self._test_pad_bmm_rrr_bias_with_cat(
+            "dynamic_odd_kn",
+            bs=[1, 2, 3],
+            ms=[2, 5, 7],
+            n=15,
+            k1=1,
+            k2=2,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py b/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
index ac91d3fbe..d03e5bcae 100644
--- a/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
+++ b/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
@@ -20,17 +20,24 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger, shape_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import shape_utils
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class PadGemmWithCatTestCase(unittest.TestCase):
-    def _test_pad_gemm_rrr_with_cat(self, test_name, ms, n, k1, k2):
+    def _test_pad_gemm_rrr_with_cat(self, test_name, ms, n, k1, k2, dtype="float16"):
         k = k1 + k2
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
-        X1 = Tensor(shape=[m_dim, k1], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[k, n], dtype="float16", name="w1", is_input=True)
-        X2 = Tensor(shape=[m_dim, k2], dtype="float16", name="x2", is_input=True)
-        W2 = Tensor(shape=[k, n], dtype="float16", name="w2", is_input=True)
+        X1 = Tensor(shape=[m_dim, k1], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[k, n], dtype=dtype, name="w1", is_input=True)
+        X2 = Tensor(shape=[m_dim, k2], dtype=dtype, name="x2", is_input=True)
+        W2 = Tensor(shape=[k, n], dtype=dtype, name="w2", is_input=True)
         X4 = ops.concatenate()([X1, X2], dim=1)
         Y1 = ops.gemm_rrr()(X4, W1)
         Y2 = ops.gemm_rrr()(X4, W2)
@@ -40,21 +47,21 @@ def _test_pad_gemm_rrr_with_cat(self, test_name, ms, n, k1, k2):
 
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Skip this test on SM75")
+            _LOGGER.warning("Skip this test on SM75")
             return
         dll_name = f"test_rrr_padding_{test_name}.so"
         module = compile_model(
-            [Y], target, "./tmp", "pad_gemm_with_cat_rrr", dll_name=dll_name
+            [Y], target, "./tmp", f"pad_gemm_with_cat_rrr_{dtype}", dll_name=dll_name
         )
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
 
         for m in ms:
-            X1_pt = torch.randn(m, k1).cuda().half()
-            W1_pt = torch.randn(k, n).cuda().half()
-            X2_pt = torch.randn(m, k2).cuda().half()
-            W2_pt = torch.randn(k, n).cuda().half()
+            X1_pt = get_random_torch_tensor([m, k1], dtype)
+            W1_pt = get_random_torch_tensor([k, n], dtype)
+            X2_pt = get_random_torch_tensor([m, k2], dtype)
+            W2_pt = get_random_torch_tensor([k, n], dtype)
             X4_pt = torch.cat([X1_pt, X2_pt], dim=1)
             Y1_pt = torch.matmul(X4_pt, W1_pt)
             Y2_pt = torch.matmul(X4_pt, W2_pt)
@@ -66,17 +73,35 @@ def _test_pad_gemm_rrr_with_cat(self, test_name, ms, n, k1, k2):
             inputs[name_to_idx["x2"]] = X2_pt
             inputs[name_to_idx["w1"]] = W1_pt
             inputs[name_to_idx["w2"]] = W2_pt
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_pad_gemm_rrr_with_cat(self):
+    def test_pad_gemm_rrr_with_cat_float16(self):
         self._test_pad_gemm_rrr_with_cat("static_odd_k", ms=[128], n=32, k1=3, k2=10)
         self._test_pad_gemm_rrr_with_cat("static_odd_kn", ms=[128], n=31, k1=1, k2=8)
         self._test_pad_gemm_rrr_with_cat(
             "dynamic_odd_kn", ms=[2, 5, 7], n=15, k1=1, k2=2
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_pad_gemm_rrr_with_cat_float32(self):
+        self._test_pad_gemm_rrr_with_cat(
+            "static_odd_k", ms=[128], n=32, k1=3, k2=10, dtype="float32"
+        )
+        self._test_pad_gemm_rrr_with_cat(
+            "dynamic_odd_kn",
+            ms=[2, 5, 7],
+            n=15,
+            k1=1,
+            k2=2,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_pad_gemm_with_cat.py b/tests/unittest/compiler/test_pad_gemm_with_cat.py
index b5cf0ffa6..d330f694d 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_cat.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_cat.py
@@ -22,24 +22,37 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+from parameterized import parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class PadGemmWithCatTestCase(unittest.TestCase):
-    def test_pad_gemm_rcr_with_cat(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_pad_gemm_rcr_with_cat(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M = 128
         N = 32
         K1 = 3
         K2 = 10
         K = K1 + K2
 
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[N, K], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[N], dtype="float16", name="b1", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[N, K], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[N], dtype=dtype, name="b1", is_input=True)
 
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W2 = Tensor(shape=[N, K], dtype="float16", name="w2", is_input=True)
-        B2 = Tensor(shape=[N], dtype="float16", name="b2", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W2 = Tensor(shape=[N, K], dtype=dtype, name="w2", is_input=True)
+        B2 = Tensor(shape=[N], dtype=dtype, name="b2", is_input=True)
 
         X3 = ops.elementwise(FuncEnum.ADD)(X1, X1)
         X4 = ops.concatenate()([X2, X3], dim=1)
@@ -49,21 +62,20 @@ def test_pad_gemm_rcr_with_cat(self):
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Skip this test on SM75")
+            _LOGGER.warning("Skip this test on SM75")
             return
         dll_name = "test_rcr.so"
         module = compile_model(
-            [Y], target, "./tmp", "pad_gemm_with_cat", dll_name=dll_name
+            [Y], target, "./tmp", f"pad_gemm_with_cat_{dtype}", dll_name=dll_name
         )
 
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N, K).cuda().half()
-        W2_pt = torch.randn(N, K).cuda().half()
-        B1_pt = torch.randn(N).cuda().half()
-        B2_pt = torch.randn(N).cuda().half()
+        X1_pt = get_random_torch_tensor([M, K1], dtype)
+        X2_pt = get_random_torch_tensor([M, K2], dtype)
+        W1_pt = get_random_torch_tensor([N, K], dtype)
+        W2_pt = get_random_torch_tensor([N, K], dtype)
+        B1_pt = get_random_torch_tensor([N], dtype)
+        B2_pt = get_random_torch_tensor([N], dtype)
         X3_pt = torch.add(X1_pt, X1_pt)
         X4_pt = torch.cat([X2_pt, X3_pt], dim=1)
         X5_pt = torch.nn.functional.linear(X4_pt, W1_pt, bias=B1_pt)
@@ -71,7 +83,7 @@ def test_pad_gemm_rcr_with_cat(self):
         Y_pt = torch.cat([X5_pt, X6_pt], dim=1)
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
         inputs = [0] * 6
@@ -85,7 +97,7 @@ def test_pad_gemm_rcr_with_cat(self):
         inputs[name_to_idx["b1"]] = B1_pt
         inputs[name_to_idx["b2"]] = B2_pt
 
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
diff --git a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
index d4991cde2..15ef14349 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
@@ -21,6 +21,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 from parameterized import param, parameterized
 
@@ -28,18 +32,25 @@
 class PadGemmWithElementwise(unittest.TestCase):
     @parameterized.expand(
         [
-            param("static_M", [23], 7, 3),
-            param("dynamic_M", [1, 78, 99], 7, 3),
+            param("static_M_float16", [23], 7, 3, "float16"),
+            param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
+            param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
         ]
     )
-    def test_pad_gemm_rcr_bias_broadcast_with_elementwise(self, test_name, ms, n, k):
+    def test_pad_gemm_rcr_bias_broadcast_with_elementwise(
+        self, test_name, ms, n, k, dtype
+    ):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         m_dim = shape_utils.gen_int_var_min_max(ms, "M")
 
-        X1 = Tensor(shape=[m_dim, k], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[n, k], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[n], dtype="float16", name="b1", is_input=True)
-        S1 = Tensor(shape=[m_dim, n], dtype="float16", name="s1", is_input=True)
-        S2 = Tensor(shape=[m_dim, n], dtype="float16", name="s2", is_input=True)
+        X1 = Tensor(shape=[m_dim, k], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[n, k], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[n], dtype=dtype, name="b1", is_input=True)
+        S1 = Tensor(shape=[m_dim, n], dtype=dtype, name="s1", is_input=True)
+        S2 = Tensor(shape=[m_dim, n], dtype=dtype, name="s2", is_input=True)
 
         X2 = ops.gemm_rcr_bias_mul_add()(X1, W1, B1, S1, S2)
         Y = ops.elementwise(FuncEnum.ADD)(X2, X2)
@@ -47,17 +58,16 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise(self, test_name, ms, n, k)
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [Y], target, "./tmp", f"pad_gemm_with_elementwise_{test_name}"
         )
 
         for m in ms:
-            X1_pt = torch.randn(m, k).cuda().half()
-            W1_pt = torch.randn(n, k).cuda().half()
-            B1_pt = torch.randn(n).cuda().half()
-            S1_pt = torch.randn(m, n).cuda().half()
-            S2_pt = torch.randn(m, n).cuda().half()
+            X1_pt = get_random_torch_tensor([m, k], dtype)
+            W1_pt = get_random_torch_tensor([n, k], dtype)
+            B1_pt = get_random_torch_tensor([n], dtype)
+            S1_pt = get_random_torch_tensor([m, n], dtype)
+            S2_pt = get_random_torch_tensor([m, n], dtype)
 
             X2_pt = torch.nn.functional.linear(X1_pt, W1_pt, B1_pt) * S1_pt + S2_pt
             Y_pt = X2_pt + X2_pt
@@ -69,25 +79,31 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise(self, test_name, ms, n, k)
             inputs[name_to_idx["b1"]] = B1_pt
             inputs[name_to_idx["s1"]] = S1_pt
             inputs[name_to_idx["s2"]] = S2_pt
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
         [
-            ("static_shape", [3], [1], 5, 3),
-            ("dynamic_M", [3], [1, 78, 99], 7, 3),
-            ("dynamic_B", [3, 5, 8], [3], 11, 15),
-            ("dynamic_BM", [3, 5, 8], [3, 9, 10], 17, 21),
+            ("static_shape_float16", [3], [1], 5, 3, "float16"),
+            ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
+            ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
+            ("dynamic_BM_float16", [3, 5, 8], [3, 9, 10], 17, 21, "float16"),
+            ("static_shape_float32", [3], [1], 5, 3, "float32"),
+            ("dynamic_BM_float32", [3, 5, 8], [3, 9, 10], 17, 21, "float32"),
         ]
     )
-    def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k):
+    def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         b_dim = shape_utils.gen_int_var_min_max(bs, "B")
         m_dim = shape_utils.gen_int_var_min_max(ms, "M")
 
-        X1 = Tensor(shape=[b_dim, m_dim, k], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[b_dim, k, n], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[b_dim, m_dim, n], dtype="float16", name="b1", is_input=True)
+        X1 = Tensor(shape=[b_dim, m_dim, k], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[b_dim, k, n], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[b_dim, m_dim, n], dtype=dtype, name="b1", is_input=True)
 
         X2 = ops.bmm_rrr_add()(X1, W1, B1)
         Y = ops.elementwise(FuncEnum.ADD)(X2, X2)
@@ -95,15 +111,14 @@ def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k):
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [Y], target, "./tmp", f"pad_bmm_with_elementwise_{test_name}"
         )
 
         for b, m in itertools.product(bs, ms):
-            X1_pt = torch.randn(b, m, k).cuda().half()
-            W1_pt = torch.randn(b, k, n).cuda().half()
-            B1_pt = torch.randn(b, m, n).cuda().half()
+            X1_pt = get_random_torch_tensor([b, m, k], dtype)
+            W1_pt = get_random_torch_tensor([b, k, n], dtype)
+            B1_pt = get_random_torch_tensor([b, m, n], dtype)
 
             X2_pt = torch.matmul(X1_pt, W1_pt) + B1_pt
             Y_pt = X2_pt + X2_pt
@@ -113,26 +128,32 @@ def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k):
             inputs[name_to_idx["x1"]] = X1_pt
             inputs[name_to_idx["w1"]] = W1_pt
             inputs[name_to_idx["b1"]] = B1_pt
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
         [
-            ("static_shape", [3], [1], 5, 3),
-            ("dynamic_M", [3], [1, 78, 99], 7, 3),
-            ("dynamic_B", [3, 5, 8], [3], 11, 15),
-            ("dynamic_BM", [3, 5, 8], [3, 9, 10], 17, 21),
+            ("static_shape_float16", [3], [1], 5, 3, "float16"),
+            ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
+            ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
+            ("dynamic_BM_float16", [3, 5, 8], [3, 9, 10], 17, 21, "float16"),
+            ("static_shape_float32", [3], [1], 5, 3, "float32"),
+            ("dynamic_BM_float32", [3, 5, 8], [3, 9, 10], 17, 21, "float32"),
         ]
     )
-    def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k):
+    def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         b_dim = shape_utils.gen_int_var_min_max(bs, "B")
         m_dim = shape_utils.gen_int_var_min_max(ms, "M")
 
         # (M, B, K) * (B, K, N) = (M, B, N)
-        X1 = Tensor(shape=[m_dim, b_dim, k], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[b_dim, k, n], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[b_dim, n], dtype="float16", name="b1", is_input=True)
+        X1 = Tensor(shape=[m_dim, b_dim, k], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[b_dim, k, n], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[b_dim, n], dtype=dtype, name="b1", is_input=True)
 
         X2 = ops.perm102_bmm_rrr_bias()(X1, W1, B1)
         Y = ops.elementwise(FuncEnum.ADD)(X2, X2)
@@ -140,15 +161,14 @@ def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k):
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [Y], target, "./tmp", f"pad_perm102_with_elementwise_{test_name}"
         )
 
         for b, m in itertools.product(bs, ms):
-            X1_pt = torch.randn(m, b, k).cuda().half()
-            W1_pt = torch.randn(b, k, n).cuda().half()
-            B1_pt = torch.randn(b, n).cuda().half()
+            X1_pt = get_random_torch_tensor([m, b, k], dtype)
+            W1_pt = get_random_torch_tensor([b, k, n], dtype)
+            B1_pt = get_random_torch_tensor([b, n], dtype)
             Bias_pt = B1_pt.unsqueeze(1)
 
             X2_pt = torch.permute(
@@ -161,24 +181,31 @@ def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k):
             inputs[name_to_idx["x1"]] = X1_pt
             inputs[name_to_idx["w1"]] = W1_pt
             inputs[name_to_idx["b1"]] = B1_pt
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
         [
-            param("static_M", [23], 7, 3),
-            param("dynamic_M", [1, 78, 99], 7, 3),
+            param("static_M_float16", [23], 7, 3, "float16"),
+            param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
+            param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
         ]
     )
-    def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(self, test_name, ms, n, k):
+    def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(
+        self, test_name, ms, n, k, dtype
+    ):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         # S1 is fed to gemm twice
         m_dim = shape_utils.gen_int_var_min_max(ms, "M")
 
-        X1 = Tensor(shape=[m_dim, k], dtype="float16", name="x1", is_input=True)
-        W1 = Tensor(shape=[n, k], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[n], dtype="float16", name="b1", is_input=True)
-        S1 = Tensor(shape=[m_dim, n], dtype="float16", name="s1", is_input=True)
+        X1 = Tensor(shape=[m_dim, k], dtype=dtype, name="x1", is_input=True)
+        W1 = Tensor(shape=[n, k], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[n], dtype=dtype, name="b1", is_input=True)
+        S1 = Tensor(shape=[m_dim, n], dtype=dtype, name="s1", is_input=True)
 
         X2 = ops.gemm_rcr_bias_mul_add()(X1, W1, B1, S1, S1)
         Y = ops.elementwise(FuncEnum.ADD)(X2, X2)
@@ -186,16 +213,15 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(self, test_name, ms, n,
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [Y], target, "./tmp", f"pad_gemm_with_elementwise_2_{test_name}"
         )
 
         for m in ms:
-            X1_pt = torch.randn(m, k).cuda().half()
-            W1_pt = torch.randn(n, k).cuda().half()
-            B1_pt = torch.randn(n).cuda().half()
-            S1_pt = torch.randn(m, n).cuda().half()
+            X1_pt = get_random_torch_tensor([m, k], dtype)
+            W1_pt = get_random_torch_tensor([n, k], dtype)
+            B1_pt = get_random_torch_tensor([n], dtype)
+            S1_pt = get_random_torch_tensor([m, n], dtype)
 
             X2_pt = torch.nn.functional.linear(X1_pt, W1_pt, B1_pt) * S1_pt + S1_pt
             Y_pt = X2_pt + X2_pt
@@ -206,7 +232,7 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(self, test_name, ms, n,
             inputs[name_to_idx["w1"]] = W1_pt
             inputs[name_to_idx["b1"]] = B1_pt
             inputs[name_to_idx["s1"]] = S1_pt
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
diff --git a/tests/unittest/compiler/test_parallel_gemm_fusions.py b/tests/unittest/compiler/test_parallel_gemm_fusions.py
index 3108256ff..f84d7e7b5 100644
--- a/tests/unittest/compiler/test_parallel_gemm_fusions.py
+++ b/tests/unittest/compiler/test_parallel_gemm_fusions.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 from typing import Sequence
@@ -24,28 +25,36 @@
 from aitemplate.compiler.transform.toposort import toposort
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import count_ops, has_op
-from aitemplate.utils import graph_utils, logger
+from aitemplate.testing.test_utils import (
+    count_ops,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    has_op,
+)
+from aitemplate.utils import graph_utils
 
 
-def _prepare_input_tensors(m, nk_groups, start=0, has_bias=True):
+_LOGGER = logging.getLogger(__name__)
+
+
+def _prepare_input_tensors(m, nk_groups, dtype, start=0, has_bias=True):
     inputs = []
     batch_dim = IntImm(m)
     for i, (n, k) in enumerate(nk_groups):
         X = Tensor(
             shape=[batch_dim, IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="x_{}".format(i + start),
             is_input=True,
         )
         W = Tensor(
             shape=[IntImm(n), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="w_{}".format(i + start),
         )
         B = Tensor(
             shape=[IntImm(n)],
-            dtype="float16",
+            dtype=dtype,
             name="b_{}".format(i + start),
         )
         if has_bias:
@@ -55,14 +64,14 @@ def _prepare_input_tensors(m, nk_groups, start=0, has_bias=True):
     return inputs
 
 
-def _prepare_inputs_and_constants(m, nk_groups, start=0, has_bias=True):
+def _prepare_inputs_and_constants(m, nk_groups, dtype, start=0, has_bias=True):
     inputs = []
     constants = {}
 
     for i, (n, k) in enumerate(nk_groups):
-        x_pt = torch.randn(m, k).half().cuda()
-        w_pt = torch.randn(n, k).half().cuda()
-        b_pt = torch.randn(n).half().cuda()
+        x_pt = get_random_torch_tensor([m, k], dtype)
+        w_pt = get_random_torch_tensor([n, k], dtype)
+        b_pt = get_random_torch_tensor([n], dtype)
 
         inputs.append(x_pt)
         constants[f"w_{i}"] = w_pt
@@ -72,7 +81,7 @@ def _prepare_inputs_and_constants(m, nk_groups, start=0, has_bias=True):
     return inputs, constants
 
 
-def _prepare_outputs(output_tensors):
+def _prepare_outputs(output_tensors, dtype):
     def _to_int_list(shape):
         result = []
         for d in shape:
@@ -81,12 +90,12 @@ def _to_int_list(shape):
         return result
 
     output_shapes = [_to_int_list(output._attrs["shape"]) for output in output_tensors]
-    outputs = [torch.empty(shape).half().cuda() for shape in output_shapes]
+    outputs = [get_torch_empty_tensor(shape, dtype) for shape in output_shapes]
     return outputs
 
 
-def _prepare_ait_module(m, nk_groups, constants, test_idx=0, has_bias=True):
-    group_input_tensors = _prepare_input_tensors(m, nk_groups, has_bias=has_bias)
+def _prepare_ait_module(m, nk_groups, constants, dtype, test_idx=0, has_bias=True):
+    group_input_tensors = _prepare_input_tensors(m, nk_groups, dtype, has_bias=has_bias)
     output_tensors = []
     for group in group_input_tensors:
         group[0] = ops.elementwise(FuncEnum.TANH)(group[0])
@@ -102,10 +111,11 @@ def _prepare_ait_module(m, nk_groups, constants, test_idx=0, has_bias=True):
         Y,
         target,
         "./tmp",
-        f"test_multi_parallel_gemm_cat_groups_{test_idx}",
+        f"test_multi_parallel_gemm_cat_groups_{dtype}",
+        dll_name=f"test_{test_idx}.so",
         constants=constants,
     )
-    outputs = _prepare_outputs([Y])
+    outputs = _prepare_outputs([Y], dtype)
     return outputs, module
 
 
@@ -116,21 +126,20 @@ def __init__(self, *args, **kwargs):
         self._test_id = 0
 
     def _fuse_2_split_parallel_gemm_cat(
-        self, b: int, ms: Sequence[int], n: int, k: int
+        self, b: int, ms: Sequence[int], n: int, k: int, dtype: str = "float16"
     ):
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"_fuse_2_split_parallel_gemm_cat, b: {b}, ms: {ms}, n: {n}, k: {k}",
         )
         X1 = Tensor(
             shape=[IntVar(ms, "input_batch"), IntImm(b * k)],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
         X2 = Tensor(
             shape=[IntVar(ms, "input_batch"), IntImm(b * k)],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -139,14 +148,14 @@ def _fuse_2_split_parallel_gemm_cat(
         for i in range(2 * b):
             W = Tensor(
                 shape=[IntImm(n), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"W{i}",
                 is_input=True,
             )
             Ws.append(W)
             B = Tensor(
                 shape=[IntImm(n)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"B{i}",
                 is_input=True,
             )
@@ -184,13 +193,12 @@ def _fuse_parallel_gemm_cat(
         perm102_bmm_op: str,
         has_tanh: bool = True,
         reshape_weight: bool = False,
+        dtype: str = "float16",
     ):
-        logger.info(
-            __file__, f"_fuse_parallel_gemm_cat, b: {b}, ms: {ms}, n: {n}, k: {k}"
-        )
+        _LOGGER.info(f"_fuse_parallel_gemm_cat, b: {b}, ms: {ms}, n: {n}, k: {k}")
         X = Tensor(
             shape=[IntVar(ms, "input_batch"), IntImm(b * k)],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -199,7 +207,7 @@ def _fuse_parallel_gemm_cat(
         for i in range(b):
             W = Tensor(
                 shape=[IntImm(n), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"W{i}",
             )
             if reshape_weight:
@@ -207,7 +215,7 @@ def _fuse_parallel_gemm_cat(
             Ws.append(W)
             B = Tensor(
                 shape=[IntImm(n)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"B{i}",
             )
             Bs.append(B)
@@ -225,8 +233,8 @@ def _fuse_parallel_gemm_cat(
 
         constants = {}
         for i in range(b):
-            constants[f"W{i}"] = torch.randn(n, k).cuda().half()
-            constants[f"B{i}"] = torch.randn(n).cuda().half()
+            constants[f"W{i}"] = get_random_torch_tensor([n, k], dtype)
+            constants[f"B{i}"] = get_random_torch_tensor([n], dtype)
 
         # Gen module.
         target = detect_target()
@@ -234,7 +242,8 @@ def _fuse_parallel_gemm_cat(
             [cat_output],
             target,
             "./tmp",
-            f"_fuse_parallel_gemm_cat_{self._test_id}",
+            f"fuse_parallel_gemm_cat_{dtype}",
+            dll_name=f"test_{self._test_id}.so",
             constants=constants,
         ) as module:
             self._test_id += 1
@@ -243,14 +252,14 @@ def _fuse_parallel_gemm_cat(
             sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
             assert has_op(
                 sorted_ops, perm102_bmm_op
-            ), "the final graph does not have op perm102_bmm_rrr_bias"
+            ), f"the final graph does not have op {perm102_bmm_op}"
             if not has_tanh:
                 assert not has_op(
                     sorted_ops, "split"
                 ), "the final graph has split op, but it should not"
 
             for m in ms:
-                x_pt = torch.randn(m, b * k).cuda().half()
+                x_pt = get_random_torch_tensor([m, b * k], dtype)
                 x1_pt = torch.split(x_pt, k, dim=-1)
 
                 cat_inputs_pt = []
@@ -264,7 +273,7 @@ def _fuse_parallel_gemm_cat(
 
                 # Run AITemplate module.
 
-                out = torch.empty([m, b * n]).cuda().half()
+                out = get_torch_empty_tensor([m, b * n], dtype)
                 module.run_with_tensors([x_pt], [out])
                 # module.benchmark_with_tensors([x_pt], [out])
 
@@ -273,7 +282,7 @@ def _fuse_parallel_gemm_cat(
                     torch.allclose(out, cat_output_pt, atol=1e-2, rtol=1e-2)
                 )
 
-    def test_fuse_parallel_gemm_cat(self):
+    def test_fuse_parallel_gemm_cat_fp16(self):
         # test n x gemms + cat
         self._fuse_parallel_gemm_cat(
             b=4, ms=[256, 512], n=128, k=64, perm102_bmm_op="perm102_bmm_rrr_bias"
@@ -333,6 +342,64 @@ def test_fuse_parallel_gemm_cat(self):
         # test multiple split + n x gemms + cat
         self._fuse_2_split_parallel_gemm_cat(b=4, ms=[256, 512], n=128, k=64)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fuse_parallel_gemm_cat_fp32(self):
+        # test n x gemms + cat
+        self._fuse_parallel_gemm_cat(
+            b=4,
+            ms=[256, 512],
+            n=128,
+            k=64,
+            perm102_bmm_op="perm102_bmm_rrr_bias",
+            dtype="float32",
+        )
+        self._fuse_parallel_gemm_cat(
+            b=4,
+            ms=[128, 256],
+            n=10,
+            k=32,
+            perm102_bmm_op="perm102_bmm_rcr_bias",
+            dtype="float32",
+        )
+        self._fuse_parallel_gemm_cat(
+            b=4,
+            ms=[128, 256],
+            n=10,
+            k=32,
+            perm102_bmm_op="perm102_bmm_rcr_bias",
+            reshape_weight=True,
+            dtype="float32",
+        )
+
+        # test split + n x gemms + cat
+        self._fuse_parallel_gemm_cat(
+            b=4,
+            ms=[256, 512],
+            n=32,
+            k=64,
+            perm102_bmm_op="perm102_bmm_rrr_bias",
+            has_tanh=False,
+            dtype="float32",
+        )
+        self._fuse_parallel_gemm_cat(
+            b=4,
+            ms=[128, 256],
+            n=10,
+            k=32,
+            perm102_bmm_op="perm102_bmm_rcr_bias",
+            has_tanh=False,
+            dtype="float32",
+        )
+
+        # test multiple split + n x gemms + cat
+        self._fuse_2_split_parallel_gemm_cat(
+            b=4, ms=[256, 512], n=128, k=64, dtype="float32"
+        )
+
     def _test_fuse_parallel_gemm_cat_partial(
         self,
         b1: int,
@@ -341,22 +408,22 @@ def _test_fuse_parallel_gemm_cat_partial(
         n: int,
         k: int,
         has_tanh: bool = True,
+        dtype: str = "float16",
     ):
-        logger.info(
-            __file__,
+        _LOGGER.info(
             f"_fuse_parallel_gemm_cat_partial, b1: {b1}, b2: {b2}, ms: {ms}, n: {n}, k: {k}",
         )
         batch_dim = IntVar(ms, "input_batch")
         b = b1 + b2
         X1 = Tensor(
             shape=[batch_dim, IntImm(b1 * k)],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
         X2 = Tensor(
             shape=[batch_dim, IntImm(b2 * k)],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -365,13 +432,13 @@ def _test_fuse_parallel_gemm_cat_partial(
         for i in range(b):
             W = Tensor(
                 shape=[IntImm(n), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"W{i}",
             )
             Ws.append(W)
             B = Tensor(
                 shape=[IntImm(n)],
-                dtype="float16",
+                dtype=dtype,
                 name=f"B{i}",
             )
             Bs.append(B)
@@ -387,12 +454,12 @@ def _test_fuse_parallel_gemm_cat_partial(
         X7 = ops.reshape()(X1, [-1, b1, k])
         W = Tensor(
             shape=[IntImm(b1), IntImm(n), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="W",
         )
         B = Tensor(
             shape=[IntImm(b1), IntImm(n)],
-            dtype="float16",
+            dtype=dtype,
             name="B",
         )
         WT = ops.permute021()(W)
@@ -426,11 +493,11 @@ def _test_fuse_parallel_gemm_cat_partial(
 
         constants = {}
         for i in range(b):
-            constants[f"W{i}"] = torch.randn(n, k).cuda().half()
-            constants[f"B{i}"] = torch.randn(n).cuda().half()
+            constants[f"W{i}"] = get_random_torch_tensor([n, k], dtype)
+            constants[f"B{i}"] = get_random_torch_tensor([n], dtype)
 
-        constants["W"] = torch.randn(b1, n, k).cuda().half()
-        constants["B"] = torch.randn(b1, n).cuda().half()
+        constants["W"] = get_random_torch_tensor([b1, n, k], dtype)
+        constants["B"] = get_random_torch_tensor([b1, n], dtype)
 
         # Gen module.
         target = detect_target()
@@ -438,7 +505,8 @@ def _test_fuse_parallel_gemm_cat_partial(
             [cat_output],
             target,
             "./tmp",
-            f"_fuse_parallel_gemm_cat_{self._test_id}",
+            f"fuse_parallel_gemm_cat_{dtype}",
+            dll_name=f"test_{self._test_id}.so",
             constants=constants,
         ) as module:
             self._test_id += 1
@@ -454,7 +522,7 @@ def _test_fuse_parallel_gemm_cat_partial(
                 ), "the final graph has split op, but it should not"
 
             for m in ms:
-                x_pt = torch.randn(m, b1 * k).cuda().half()
+                x_pt = get_random_torch_tensor([m, b1 * k], dtype)
                 x1_pt = torch.split(x_pt, k, dim=-1)
 
                 cat_inputs_pt = []
@@ -477,7 +545,7 @@ def _test_fuse_parallel_gemm_cat_partial(
                 cat_inputs_pt.append(x8_pt)
                 cat_inputs_pt.append(x9_pt)
 
-                xx_pt = torch.randn(m, b2 * k).cuda().half()
+                xx_pt = get_random_torch_tensor([m, b2 * k], dtype)
                 x2_pt = torch.split(xx_pt, k, dim=-1)
                 for i in range(b2):
                     x3_pt = x2_pt[i].tanh() if has_tanh else x2_pt[i]
@@ -490,7 +558,7 @@ def _test_fuse_parallel_gemm_cat_partial(
 
                 # Run AITemplate module.
 
-                out = torch.empty(cat_output_pt.size()).cuda().half()
+                out = get_torch_empty_tensor(cat_output_pt.size(), dtype)
                 module.run_with_tensors({"X1": x_pt, "X2": xx_pt}, {"output0": out})
 
                 # Do comparisons.
@@ -498,16 +566,31 @@ def _test_fuse_parallel_gemm_cat_partial(
                     torch.allclose(out, cat_output_pt, atol=1e-2, rtol=1e-2)
                 )
 
-    def test_fuse_parallel_gemm_cat_partial(self):
+    def test_fuse_parallel_gemm_cat_partial_fp16(self):
         self._test_fuse_parallel_gemm_cat_partial(4, 4, [128, 256], 32, 64, True)
         self._test_fuse_parallel_gemm_cat_partial(4, 4, [128, 256], 32, 64, False)
         self._test_fuse_parallel_gemm_cat_partial(3, 3, [128, 256], 30, 66, True)
         self._test_fuse_parallel_gemm_cat_partial(2, 2, [128, 256], 33, 55, True)
 
-    def _test_multi_parallel_gemm_cat_groups(self, m, nk_groups, num_unfused_ops=0):
-        inputs, constants = _prepare_inputs_and_constants(m, nk_groups)
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fuse_parallel_gemm_cat_partial_fp32(self):
+        self._test_fuse_parallel_gemm_cat_partial(
+            4, 4, [128, 256], 32, 64, True, dtype="float32"
+        )
+        self._test_fuse_parallel_gemm_cat_partial(
+            4, 4, [128, 256], 32, 64, False, dtype="float32"
+        )
+
+    def _test_multi_parallel_gemm_cat_groups(
+        self, m, nk_groups, num_unfused_ops=0, dtype="float16"
+    ):
+        inputs, constants = _prepare_inputs_and_constants(m, nk_groups, dtype)
         outputs, module = _prepare_ait_module(
-            m, nk_groups, constants, test_idx=self._test_id
+            m, nk_groups, constants, dtype, test_idx=self._test_id
         )
         self._test_id += 1
         with module:
@@ -528,7 +611,7 @@ def _test_multi_parallel_gemm_cat_groups(self, m, nk_groups, num_unfused_ops=0):
             module.run_with_tensors(inputs, outputs)
             self.assertTrue(torch.allclose(pt_y, outputs[0], atol=1e-2, rtol=1e-2))
 
-    def test_multi_parallel_gemm_cat_groups(self):
+    def test_multi_parallel_gemm_cat_groups_fp16(self):
         self._test_multi_parallel_gemm_cat_groups(
             256,
             [[128, 64]] * 2 + [[128, 120]] * 4 + [[128, 72]] * 2 + [[128, 64]] * 2,
@@ -537,6 +620,19 @@ def test_multi_parallel_gemm_cat_groups(self):
             256, [[128, 64]] * 2 + [[128, 120]] + [[128, 72]] * 2 + [[128, 64]], 2
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_multi_parallel_gemm_cat_groups_fp32(self):
+        self._test_multi_parallel_gemm_cat_groups(
+            256,
+            [[128, 64]] * 2 + [[128, 120]] * 4 + [[128, 72]] * 2 + [[128, 64]] * 2,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_permute_bmm_special_op.py b/tests/unittest/compiler/test_permute_bmm_special_op.py
index ef47daaef..719500003 100644
--- a/tests/unittest/compiler/test_permute_bmm_special_op.py
+++ b/tests/unittest/compiler/test_permute_bmm_special_op.py
@@ -20,20 +20,20 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
 class FusePermuteBmmRRRN1Case(unittest.TestCase):
-    def _test_permute_bmm_rrr_n1(self, B, M, K, testname):
+    def _test_permute_bmm_rrr_n1(self, B, M, K, testname, dtype="float16"):
         N = 1
 
         batch_dim = shape_utils.gen_int_var_min_max(B)
-        X = Tensor(
-            shape=[batch_dim, M, K], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, N, K], dtype="float16", name="input_1", is_input=True
-        )
+        X = Tensor(shape=[batch_dim, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
 
         WT = ops.permute021()(W)
 
@@ -59,20 +59,26 @@ def _test_permute_bmm_rrr_n1(self, B, M, K, testname):
         assert src_op._attrs["op"] == "bmm_rcr_n1"
 
         for b in B:
-            X_pt = torch.randn(b, M, K).cuda().half()
-            W_pt = torch.randn(b, K, N).cuda().half()
+            X_pt = get_random_torch_tensor([b, M, K], dtype)
+            W_pt = get_random_torch_tensor([b, K, N], dtype)
 
             Y_pt = torch.cos(torch.bmm(X_pt, W_pt))
             w = W_pt.permute([0, 2, 1]).contiguous()
 
             # We currently only have row-major outputs.
-            y = torch.empty([b, M, N]).cuda().half()
+            y = get_torch_empty_tensor([b, M, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": w}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_permute_bmm_rrr_n1(self):
-        self._test_permute_bmm_rrr_n1([1], 4, 8, "permute_bmm_rrr_n1")
-        self._test_permute_bmm_rrr_n1([1, 3], 4, 8, "permute_bmm_rrr_n1_dynamic")
+    def test_permute_bmm_rrr_n1_fp16(self):
+        self._test_permute_bmm_rrr_n1([1], 4, 8, "permute_bmm_rrr_n1_fp16")
+        self._test_permute_bmm_rrr_n1([1, 3], 4, 8, "permute_bmm_rrr_n1_dynamic_fp16")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_permute_bmm_rrr_n1_fp32(self):
+        self._test_permute_bmm_rrr_n1(
+            [1, 3], 4, 8, "permute_bmm_rrr_n1_dynamic_fp32", dtype="float32"
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/compiler/test_refine_graph.py b/tests/unittest/compiler/test_refine_graph.py
index 83367776d..685b8438c 100644
--- a/tests/unittest/compiler/test_refine_graph.py
+++ b/tests/unittest/compiler/test_refine_graph.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 
+import logging
 import unittest
 
 import torch
@@ -21,28 +22,42 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import graph_utils, logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils
+
+from parameterized import parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class RefineGraphTestCase(unittest.TestCase):
-    def test_elementwise_ops(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_elementwise_ops(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and target.name == "rocm":
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M = 10
         N = 4
         X0 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
         X2 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -53,12 +68,11 @@ def test_elementwise_ops(self):
         Y0._attrs["is_output"] = True
         Y1._attrs["name"] = "Y1"
         Y1._attrs["is_output"] = True
-        target = detect_target()
         module = compile_model(
             [Y0, Y1],
             target,
             "./tmp",
-            "test_refine_graph_elementwise",
+            f"test_refine_graph_elementwise_{dtype}",
         )
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
@@ -67,17 +81,18 @@ def test_elementwise_ops(self):
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
     def test_elementwise_ops_single_input_no_refine(self):
+        dtype = "float16"
         M = 10
         N = 4
         X0 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
@@ -102,17 +117,18 @@ def test_elementwise_ops_single_input_no_refine(self):
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
     def test_elementwise_ops_single_input(self):
+        dtype = "float16"
         M = 10
         N = 4
         X0 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X0",
             is_input=True,
         )
         X1 = Tensor(
             shape=[M, N],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
@@ -138,10 +154,10 @@ def test_elementwise_ops_single_input(self):
 
         inputs = {}
         outputs = {}
-        inputs["X0"] = torch.randn([M, N]).cuda().half()
-        inputs["X1"] = torch.randn([M, N]).cuda().half()
-        outputs["Y0"] = torch.empty([M, N]).cuda().half()
-        outputs["Y1"] = torch.empty([M, N]).cuda().half()
+        inputs["X0"] = get_random_torch_tensor([M, N], dtype)
+        inputs["X1"] = get_random_torch_tensor([M, N], dtype)
+        outputs["Y0"] = get_torch_empty_tensor([M, N], dtype)
+        outputs["Y1"] = get_torch_empty_tensor([M, N], dtype)
 
         module.run_with_tensors(inputs, outputs)
         y0 = torch.nn.functional.silu(inputs["X0"])
@@ -150,19 +166,19 @@ def test_elementwise_ops_single_input(self):
         self.assertTrue(torch.allclose(y0, outputs["Y0"], 1e-2, 1e-2))
         self.assertTrue(torch.allclose(y1, outputs["Y1"], 1e-2, 1e-2))
 
-    def _build_gemm_rcr_bias(self, M, N, K, start_idx=0):
+    def _build_gemm_rcr_bias(self, M, N, K, dtype, start_idx=0):
         X_shape = [M, K]
         W_shape = [N, K]
         B_shape = [N]
 
         input_0 = Tensor(
-            shape=X_shape, dtype="float16", name=f"input_{start_idx}", is_input=True
+            shape=X_shape, dtype=dtype, name=f"input_{start_idx}", is_input=True
         )
         input_1 = Tensor(
-            shape=W_shape, dtype="float16", name=f"input_{start_idx + 1}", is_input=True
+            shape=W_shape, dtype=dtype, name=f"input_{start_idx + 1}", is_input=True
         )
         input_2 = Tensor(
-            shape=B_shape, dtype="float16", name=f"input_{start_idx + 2}", is_input=True
+            shape=B_shape, dtype=dtype, name=f"input_{start_idx + 2}", is_input=True
         )
 
         gemm_tensor = ops.gemm_universal.gemm_rcr()(input_0, input_1)
@@ -170,35 +186,39 @@ def _build_gemm_rcr_bias(self, M, N, K, start_idx=0):
 
         return bias_tensor
 
-    def _build_gemm_rcr_bias_mul(self, M, N, K, start_idx=0):
+    def _build_gemm_rcr_bias_mul(self, M, N, K, dtype, start_idx=0):
         D_shape = [M, N]
         input_3 = Tensor(
-            shape=D_shape, dtype="float16", name=f"input_{start_idx + 3}", is_input=True
+            shape=D_shape, dtype=dtype, name=f"input_{start_idx + 3}", is_input=True
         )
 
-        bias_tensor = self._build_gemm_rcr_bias(M, N, K, start_idx)
+        bias_tensor = self._build_gemm_rcr_bias(M, N, K, dtype, start_idx)
         mul_tensor = ops.elementwise(FuncEnum.MUL)(bias_tensor, input_3)
 
         return mul_tensor
 
-    def test_gemm_ops(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_gemm_ops(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M = 128
         N = 64
         K = 256
 
-        Y1 = self._build_gemm_rcr_bias_mul(M, N, K, 0)
-        Y2 = self._build_gemm_rcr_bias_mul(M, N, K, 4)
+        Y1 = self._build_gemm_rcr_bias_mul(M, N, K, dtype, 0)
+        Y2 = self._build_gemm_rcr_bias_mul(M, N, K, dtype, 4)
         Y1._attrs["name"] = "Y0"
         Y1._attrs["is_output"] = True
         Y2._attrs["name"] = "Y1"
         Y2._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             [Y1, Y2],
             target,
             "./tmp",
-            "test_refine_graph_gemm",
+            f"test_refine_graph_gemm_{dtype}",
         )
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
@@ -206,8 +226,12 @@ def test_gemm_ops(self):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] == sorted_ops[1]._attrs["name"]
 
-    def test_bmm_ops_accessor(self):
-        dtype = "float16"
+    @parameterized.expand([("float16"), ("float32")])
+    def test_bmm_ops_accessor(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         B = 16
         M = 128
         K = 64
@@ -239,12 +263,11 @@ def test_bmm_ops_accessor(self):
         Y._attrs["name"] = "output"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
         module = compile_model(
             Y,
             target,
             "./tmp",
-            "test_refine_graph_bmm",
+            f"test_refine_graph_bmm_{dtype}",
         )
 
         debug_sorted_graph = module.debug_sorted_graph
@@ -253,20 +276,24 @@ def test_bmm_ops_accessor(self):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
-    def test_refine_graph_group_gemms(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_refine_graph_group_gemms(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         M = 256
         K1 = 128
         N1 = 60
         K2 = 192
         N2 = 64
-        target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
         Y1, Y2 = ops.group_gemm_rcr()(operand_groups=[[X1, W1], [X2, W2]])
         Y3, Y4 = ops.group_gemm_rcr()(operand_groups=[[X1, W1], [X2, W2]])
         Y1._attrs["name"] = "y1"
@@ -281,7 +308,7 @@ def test_refine_graph_group_gemms(self):
         graph_outputs = [Y1, Y2, Y3, Y4]
 
         module = compile_model(
-            graph_outputs, target, "./tmp", "test_refine_graph_group_gemms"
+            graph_outputs, target, "./tmp", f"test_refine_graph_group_gemms_{dtype}"
         )
 
         debug_sorted_graph = module.debug_sorted_graph
@@ -289,10 +316,10 @@ def test_refine_graph_group_gemms(self):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
+        X1_pt = get_random_torch_tensor([M, K1], dtype)
+        X2_pt = get_random_torch_tensor([M, K2], dtype)
+        W1_pt = get_random_torch_tensor([N1, K1], dtype)
+        W2_pt = get_random_torch_tensor([N2, K2], dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
 
@@ -302,10 +329,10 @@ def test_refine_graph_group_gemms(self):
             "x2": X2_pt,
             "w2": W2_pt,
         }
-        y1 = torch.empty([M, N1]).cuda().half()
-        y2 = torch.empty([M, N2]).cuda().half()
-        y3 = torch.empty([M, N1]).cuda().half()
-        y4 = torch.empty([M, N2]).cuda().half()
+        y1 = get_torch_empty_tensor([M, N1], dtype)
+        y2 = get_torch_empty_tensor([M, N2], dtype)
+        y3 = get_torch_empty_tensor([M, N1], dtype)
+        y4 = get_torch_empty_tensor([M, N2], dtype)
         outputs = {"y1": y1, "y2": y2, "y3": y3, "y4": y4}
 
         module.run_with_tensors(inputs, outputs)
diff --git a/tests/unittest/compiler/test_remove_unused_ops.py b/tests/unittest/compiler/test_remove_unused_ops.py
index c1014fab3..b0e7d94a0 100644
--- a/tests/unittest/compiler/test_remove_unused_ops.py
+++ b/tests/unittest/compiler/test_remove_unused_ops.py
@@ -20,6 +20,10 @@
 
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -30,12 +34,13 @@ def _test_remove_unused_ops(
         batch_size=(1, 3),
         X_shape=(5, 10),
         test_name="test_remove_unused_ops",
+        dtype="float16",
     ):
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         X = Tensor(
             shape=[b_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -43,7 +48,7 @@ def _test_remove_unused_ops(
         Y2 = ops.getitem()(Y1, 1)
         CONST_X = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="input_1",
             is_input=True,
             value=Y2._attrs["int_var"].value(),
@@ -57,10 +62,10 @@ def _test_remove_unused_ops(
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype)
             Y_pt = X_pt + X_pt.size(1)
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.size(), dtype)
             module.run_with_tensors([X_pt], [y])
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
@@ -69,9 +74,12 @@ def _test_remove_unused_ops(
                 len(graph_utils.get_sorted_ops(module.debug_sorted_graph)), 1
             )
 
-    def test_remove_unused_ops(self):
+    def test_remove_unused_ops_float16(self):
         self._test_remove_unused_ops()
 
+    def test_remove_unused_ops_float32(self):
+        self._test_remove_unused_ops(dtype="float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_slice_elemwise_fusion.py b/tests/unittest/compiler/test_slice_elemwise_fusion.py
index 6ed7a7cc6..fa961e4a7 100644
--- a/tests/unittest/compiler/test_slice_elemwise_fusion.py
+++ b/tests/unittest/compiler/test_slice_elemwise_fusion.py
@@ -21,6 +21,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -39,8 +43,8 @@ def _test_slice_elemwise_fusion(
         expected_op_t,
         expected_data_t,
         input_x2_shape=None,
+        dtype="float16",
     ):
-        dtype = "float16"
         X1 = Tensor(
             shape=slice_input_shape,
             dtype=dtype,
@@ -82,8 +86,8 @@ def _test_slice_elemwise_fusion(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         # Run PyTorch
-        x1_pt = torch.randn(*slice_input_shape).cuda().half()
-        x2_pt = torch.randn(*input_x2_shape).cuda().half()
+        x1_pt = get_random_torch_tensor(slice_input_shape, dtype)
+        x2_pt = get_random_torch_tensor(input_x2_shape, dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -96,7 +100,7 @@ def _test_slice_elemwise_fusion(
             "input_x1": x1_pt,
             "input_x2": x2_pt,
         }
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -198,8 +202,8 @@ def _test_slice_elemwise_fusion_dynamic(
         expected_op_t,
         expected_data_t,
         input_x2_shape=None,
+        dtype="float16",
     ):
-        dtype = "float16"
         x_shape = [
             shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
             for d in slice_input_shape
@@ -285,8 +289,8 @@ def _test_slice_elemwise_fusion_dynamic(
                 input_x2_shape_pt = [
                     d[idx] if isinstance(d, list) else d for d in input_x2_shape
                 ]
-            x1_pt = torch.randn(*x_shape_pt).cuda().half()
-            x2_pt = torch.randn(*input_x2_shape_pt).cuda().half()
+            x1_pt = get_random_torch_tensor(x_shape_pt, dtype)
+            x2_pt = get_random_torch_tensor(input_x2_shape_pt, dtype)
 
             slice_indices = [
                 slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -301,7 +305,7 @@ def _test_slice_elemwise_fusion_dynamic(
                 "input_x1": x1_pt,
                 "input_x2": x2_pt,
             }
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -411,8 +415,8 @@ def _test_two_slice_elemwise_fusion_dynamic(
         expected_op_t,
         expected_data_t,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
         x_shape = [
             shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
             for d in slice_input_shape
@@ -463,7 +467,7 @@ def _test_two_slice_elemwise_fusion_dynamic(
             x_shape_pt = [
                 d[idx] if isinstance(d, list) else d for d in slice_input_shape
             ]
-            x1_pt = torch.randn(*x_shape_pt).cuda().half()
+            x1_pt = get_random_torch_tensor(x_shape_pt, dtype)
 
             slice_indices1 = [
                 slice(i, j) for i, j in zip(slice_start_indices1, slice_end_indices1)
@@ -479,7 +483,7 @@ def _test_two_slice_elemwise_fusion_dynamic(
             inputs = {
                 "input_x1": x1_pt,
             }
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -508,6 +512,85 @@ def test_two_slice_elemwise_fusion_dynamic(self):
             test_name="two_slice_elemwise_fusion_dynamic",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_slice_elemwise_fusion_float(self):
+        self._test_slice_elemwise_fusion(
+            slice_input_shape=(10, 20, 30),
+            slice_start_indices=(0, 3, 0),
+            slice_end_indices=(None, 5, None),
+            test_name="slice_elemwise_fusion_float",
+            expected_read_t="uint2",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+        self._test_slice_elemwise_fusion(
+            slice_input_shape=(10, 16),
+            slice_start_indices=(2, 0),
+            slice_end_indices=(3, None),
+            test_name="slice_elemwise_fusion_broadcast_float",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            input_x2_shape=(4, 16),
+            dtype="float",
+        )
+        self._test_slice_elemwise_fusion(
+            slice_input_shape=(1, 1, 10),
+            slice_start_indices=(0, 0, 2),
+            slice_end_indices=(None, None, 7),
+            test_name="slice_elemwise_fusion_broadcast_float_2",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            input_x2_shape=(10, 3, 1),
+            dtype="float",
+        )
+        self._test_slice_elemwise_fusion_dynamic(
+            slice_input_shape=([5, 16], [4, 10], 16),
+            slice_start_indices=(0, 0, 4),
+            slice_end_indices=(None, None, 16),
+            test_name="slice_elemwise_fusion_dynamic_float",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+        self._test_slice_elemwise_fusion_dynamic(
+            slice_input_shape=([5, 16], 10, 10),
+            slice_start_indices=(0, 0, 4),
+            slice_end_indices=(None, None, 5),
+            test_name="slice_elemwise_fusion_dynamic_broadcast_float",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            input_x2_shape=(1, 10, 15),
+            dtype="float",
+        )
+        self._test_slice_elemwise_fusion_dynamic(
+            slice_input_shape=([5, 16], 10, 10),
+            slice_start_indices=(0, 0, 0),
+            slice_end_indices=(None, None, 8),
+            test_name="slice_elemwise_fusion_dynamic_broadcast_float",
+            expected_read_t="uint2",
+            expected_op_t="float",
+            expected_data_t="float",
+            input_x2_shape=(10, 8),
+            dtype="float",
+        )
+        self._test_two_slice_elemwise_fusion_dynamic(
+            slice_input_shape=([3, 50], 100),
+            slice_start_indices1=(0, 4),
+            slice_end_indices1=(None, 8),
+            slice_start_indices2=(0, 16),
+            slice_end_indices2=(None, 20),
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            test_name="two_slice_elemwise_fusion_dynamic_float",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_slice_gemm_fusion.py b/tests/unittest/compiler/test_slice_gemm_fusion.py
index 60d5dd2de..88f3bfb6d 100644
--- a/tests/unittest/compiler/test_slice_gemm_fusion.py
+++ b/tests/unittest/compiler/test_slice_gemm_fusion.py
@@ -22,8 +22,14 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
+from parameterized import parameterized
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class SliceGemmFusionTestCase(unittest.TestCase):
@@ -39,9 +45,8 @@ def _test_slice_gemm_rcr_fusion_a(
         slice_start_indices,
         slice_end_indices,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         tensor_B = Tensor(
             shape=[N, K],
             dtype=dtype,
@@ -82,9 +87,9 @@ def _test_slice_gemm_rcr_fusion_a(
         self.assertEqual(len(sorted_ops), 1)
 
         # Run PyTorch
-        b_pt = torch.randn(N, K).cuda().half()
-        input_pt = torch.randn(*slice_input_shape).cuda().half()
-        bias_pt = torch.randn(N).cuda().half()
+        b_pt = get_random_torch_tensor([N, K], dtype)
+        input_pt = get_random_torch_tensor(slice_input_shape, dtype)
+        bias_pt = get_random_torch_tensor([N], dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -93,7 +98,7 @@ def _test_slice_gemm_rcr_fusion_a(
         y_pt = torch.nn.functional.linear(a_pt, b_pt, bias=bias_pt)
 
         # Run AITemplate module.
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors([input_pt, b_pt, bias_pt], [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -146,7 +151,10 @@ def test_slice_gemm_rcr_fusion_a(self):
 
     # This is a test for testing cases where we correctly update a/b_alignment
     # based on input_accessors
-    def test_slice_gemm_rcr_fusion_align(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_gemm_rcr_fusion_align(self, dtype):
+        if dtype == "float" and int(detect_target()._arch) < 80:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
         # [slice_end_indices[0] - slice_start_indices[0]] = M
         # [slice_end_indices[1] - slice_start_indices[1]] = K
         # a = [M, K]
@@ -166,6 +174,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 2),
             slice_end_indices=(None, 18),
             test_name="slice_gemm_rcr_fusion_a",
+            dtype=dtype,
         )
         # Next, make another one with a larger alignment.
         # If we don't update a/b_alignment accordingly, we would end up with
@@ -177,6 +186,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 8),
             slice_end_indices=(None, 24),
             test_name="slice_gemm_rcr_fusion_a",
+            dtype=dtype,
         )
 
         # another set of tests for a/b alignments
@@ -187,6 +197,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 6),
             slice_end_indices=(None, 10),
             test_name="slice_gemm_rcr_fusion_b",
+            dtype=dtype,
         )
         self._test_slice_gemm_rcr_fusion_b(
             M=21,
@@ -195,6 +206,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 8),
             slice_end_indices=(None, 12),
             test_name="slice_gemm_rcr_fusion_b",
+            dtype=dtype,
         )
         self._test_slice_gemm_rcr_fusion_b(
             M=21,
@@ -203,6 +215,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 10),
             slice_end_indices=(None, 14),
             test_name="slice_gemm_rcr_fusion_b",
+            dtype=dtype,
         )
 
         # another set of tests for a/b alignments
@@ -214,6 +227,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 10),
             slice_end_indices=(None, 12),
             test_name="slice_gemm_rcr_bias_add",
+            dtype=dtype,
         )
         self._test_slice_gemm_rcr_bias_add(
             M=5,
@@ -223,6 +237,7 @@ def test_slice_gemm_rcr_fusion_align(self):
             slice_start_indices=(0, 16),
             slice_end_indices=(None, 18),
             test_name="slice_gemm_rcr_bias_add",
+            dtype=dtype,
         )
 
         # restore old env
@@ -240,9 +255,8 @@ def _test_slice_gemm_rcr_fusion_b(
         slice_start_indices,
         slice_end_indices,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         tensor_A = Tensor(
             shape=[M, K],
             dtype=dtype,
@@ -277,8 +291,8 @@ def _test_slice_gemm_rcr_fusion_b(
         self.assertEqual(len(sorted_ops), 1)
 
         # Run PyTorch
-        a_pt = torch.randn(M, K).cuda().half()
-        input_pt = torch.randn(*slice_input_shape).cuda().half()
+        a_pt = get_random_torch_tensor([M, K], dtype)
+        input_pt = get_random_torch_tensor(slice_input_shape, dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -287,7 +301,7 @@ def _test_slice_gemm_rcr_fusion_b(
         y_pt = torch.nn.functional.linear(a_pt, b_pt)
 
         # Run AITemplate module.
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors([input_pt, a_pt], [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -322,8 +336,8 @@ def _test_slice_gemm_rcr_fusion_a_2(
         slice_end_indices,
         test_name,
         no_fusion=False,
+        dtype="float16",
     ):
-        dtype = "float16"
 
         X = Tensor(
             shape=slice_input_shape,
@@ -373,8 +387,8 @@ def _test_slice_gemm_rcr_fusion_a_2(
             self.assertEqual(len(sorted_ops), 1)
 
         # Run PyTorch
-        input_pt = torch.randn(*slice_input_shape).cuda().half()
-        bias_pt = torch.randn(M).cuda().half()
+        input_pt = get_random_torch_tensor(slice_input_shape, dtype)
+        bias_pt = get_random_torch_tensor([M], dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -383,7 +397,7 @@ def _test_slice_gemm_rcr_fusion_a_2(
         y_pt = torch.nn.functional.linear(a_pt, a_pt, bias=bias_pt)
 
         # Run AITemplate module.
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors([input_pt, bias_pt], [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
         dll_name = "test_{}.so".format(self.test_count)
@@ -419,9 +433,8 @@ def _test_slice_gemm_rcr_bias_add(
         slice_start_indices,
         slice_end_indices,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         tensor_B = Tensor(
             shape=[N, K],
             dtype=dtype,
@@ -470,10 +483,10 @@ def _test_slice_gemm_rcr_bias_add(
         self.assertEqual(len(sorted_ops), 1)
 
         # Run PyTorch
-        b_pt = torch.randn(N, K).cuda().half()
-        input_pt = torch.randn(*slice_input_shape).cuda().half()
-        bias_pt = torch.randn(N).cuda().half()
-        d_pt = torch.randn(M, N).cuda().half()
+        b_pt = get_random_torch_tensor([N, K], dtype)
+        input_pt = get_random_torch_tensor(slice_input_shape, dtype)
+        bias_pt = get_random_torch_tensor([N], dtype)
+        d_pt = get_random_torch_tensor([M, N], dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -483,7 +496,7 @@ def _test_slice_gemm_rcr_bias_add(
         y_pt = y2_pt + d_pt
 
         # Run AITemplate module.
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors([input_pt, b_pt, bias_pt, d_pt], [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -552,9 +565,8 @@ def _test_slice_gemm_rcr_fusion_dynamic(
         slice_start_indices,
         slice_end_indices,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         tensor_B = Tensor(
             shape=[N, K],
             dtype=dtype,
@@ -606,12 +618,12 @@ def _test_slice_gemm_rcr_fusion_dynamic(
         assert Ms is not None, "expected to have at least one dynamic dim"
         for idx in range(len(Ms)):
             # Run PyTorch
-            b_pt = torch.randn(N, K).cuda().half()
+            b_pt = get_random_torch_tensor([N, K], dtype)
             input_shape_pt = [
                 d[idx] if isinstance(d, list) else d for d in slice_input_shape
             ]
-            input_pt = torch.randn(*input_shape_pt).cuda().half()
-            bias_pt = torch.randn(N).cuda().half()
+            input_pt = get_random_torch_tensor(input_shape_pt, dtype)
+            bias_pt = get_random_torch_tensor([N], dtype)
 
             slice_indices = [
                 slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -620,7 +632,7 @@ def _test_slice_gemm_rcr_fusion_dynamic(
             y_pt = torch.nn.functional.linear(a_pt, b_pt, bias=bias_pt)
 
             # Run AITemplate module.
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors([input_pt, b_pt, bias_pt], [y])
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -663,9 +675,8 @@ def _test_slice_multiple_gemm_rcr_fusion_a(
         slice_start_indices,
         slice_end_indices,
         test_name,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         tensor_B1 = Tensor(
             shape=[N, K],
             dtype=dtype,
@@ -714,10 +725,10 @@ def _test_slice_multiple_gemm_rcr_fusion_a(
         self.assertEqual(len(sorted_ops), 2)
 
         # Run PyTorch
-        b1_pt = torch.randn(N, K).cuda().half()
-        b2_pt = torch.randn(N, K).cuda().half()
-        input_pt = torch.randn(*slice_input_shape).cuda().half()
-        bias_pt = torch.randn(N).cuda().half()
+        b1_pt = get_random_torch_tensor([N, K], dtype)
+        b2_pt = get_random_torch_tensor([N, K], dtype)
+        input_pt = get_random_torch_tensor(slice_input_shape, dtype)
+        bias_pt = get_random_torch_tensor([N], dtype)
 
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -728,7 +739,7 @@ def _test_slice_multiple_gemm_rcr_fusion_a(
         y_pt = y1_pt + y2_pt
 
         # Run AITemplate module.
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(
             {
                 "input_x": input_pt,
@@ -763,6 +774,93 @@ def test_slice_multiple_gemm_rcr_fusion_a(self):
             test_name="slice_multiple_gemm_rcr_fusion_a",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_slice_gemm_fusion_float(self):
+        self._test_slice_gemm_rcr_fusion_a(
+            N=4,
+            K=8,
+            slice_input_shape=(2, 8),
+            slice_start_indices=(0, 0),
+            slice_end_indices=(None, None),
+            test_name="slice_gemm_rcr_fusion_a_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_a(
+            N=32,
+            K=16,
+            slice_input_shape=(24, 16),
+            slice_start_indices=(3, 0),
+            slice_end_indices=(15, None),
+            test_name="slice_gemm_rcr_fusion_a_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_b(
+            M=24,
+            K=16,
+            slice_input_shape=(32, 32),
+            slice_start_indices=(0, 16),
+            slice_end_indices=(None, 32),
+            test_name="slice_gemm_rcr_fusion_b_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_a_2(
+            M=8,
+            slice_input_shape=(8, 24),
+            slice_start_indices=(0, 8),
+            slice_end_indices=(None, 16),
+            test_name="slice_gemm_rcr_fusion_a_2_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_a_2(
+            M=8,
+            slice_input_shape=(8, 23),
+            slice_start_indices=(0, 8),
+            slice_end_indices=(None, 16),
+            test_name="slice_gemm_rcr_fusion_a_2_float",
+            no_fusion=True,
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_bias_add(
+            M=4,
+            N=2,
+            K=8,
+            slice_input_shape=(4, 32),
+            slice_start_indices=(0, 8),
+            slice_end_indices=(None, 16),
+            test_name="slice_gemm_rcr_bias_add_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_a(
+            N=5,
+            K=4,
+            slice_input_shape=(13, 2, 32),
+            slice_start_indices=(0, 0, 10),
+            slice_end_indices=(None, None, 14),
+            test_name="slice_nd_gemm_rcr_fusion_a_float",
+            dtype="float",
+        )
+        self._test_slice_gemm_rcr_fusion_dynamic(
+            N=4,
+            K=8,
+            slice_input_shape=([4, 9], 32),
+            slice_start_indices=(0, 8),
+            slice_end_indices=(None, 16),
+            test_name="slice_gemm_rcr_fusion_dynamic_float",
+            dtype="float",
+        )
+        self._test_slice_multiple_gemm_rcr_fusion_a(
+            N=4,
+            K=16,
+            slice_input_shape=(30, 32),
+            slice_start_indices=(0, 8),
+            slice_end_indices=(None, 24),
+            test_name="slice_multiple_gemm_rcr_fusion_a_float",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_slice_reshape_scatter.py b/tests/unittest/compiler/test_slice_reshape_scatter.py
index aaa38deec..d60779452 100644
--- a/tests/unittest/compiler/test_slice_reshape_scatter.py
+++ b/tests/unittest/compiler/test_slice_reshape_scatter.py
@@ -21,8 +21,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class SliceScatterReshapeCatTestCase(unittest.TestCase):
@@ -40,17 +45,18 @@ def _run_one_test(
         input_x_shape,
         dim,
         add_tanh=False,
+        dtype="float16",
     ):
         target = detect_target()
 
-        input_X_pt = torch.randn(input_x_shape).cuda().half()
+        input_X_pt = get_random_torch_tensor(input_x_shape, dtype)
 
         Ys_pt = []
         Xs_pt = []
         for input_shape, start_indices, end_indices in zip(
             input_shapes, input_start_indices, input_end_indices
         ):
-            X_pt = torch.randn(input_shape).cuda().half()
+            X_pt = get_random_torch_tensor(input_shape, dtype)
             Xs_pt.append(X_pt)
             slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
             Y_pt = X_pt[slice_indices]
@@ -62,7 +68,7 @@ def _run_one_test(
             Y_pt = torch.tanh(Y_pt)
 
         input_X = Tensor(
-            shape=input_x_shape, dtype="float16", name="input_x", is_input=True
+            shape=input_x_shape, dtype=dtype, name="input_x", is_input=True
         )
         Ys = []
         for idx, (input_shape, start_indices, end_indices) in enumerate(
@@ -70,7 +76,7 @@ def _run_one_test(
         ):
             slice_op = ops.dynamic_slice()
             X_name = "input_{}".format(idx)
-            X = Tensor(shape=input_shape, dtype="float16", name=X_name, is_input=True)
+            X = Tensor(shape=input_shape, dtype=dtype, name=X_name, is_input=True)
             Y = slice_op(X, start_indices=start_indices, end_indices=end_indices)
             Ys.append(Y)
         concat_op = ops.concatenate()
@@ -84,7 +90,7 @@ def _run_one_test(
         Y._attrs["is_output"] = True
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logger.info(
+        _LOGGER.info(
             "AITemplate output_0 shape: {}, pt shape: {}".format(y_shape, Y_pt.size())
         )
         np.testing.assert_equal(y_shape, Y_pt.size())
@@ -110,7 +116,7 @@ def _run_one_test(
         for i, X_pt in enumerate(Xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = X_pt
         inputs[input_name_to_index["input_x"]] = input_X_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -134,6 +140,18 @@ def test_slice_scatter_reshape(self):
             add_tanh=True,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_slice_scatter_reshape_float(self):
+        self._run_one_test(
+            input_shapes=[[8, 16], [20, 30]],
+            input_start_indices=[[0, 4], [12, 2]],
+            input_end_indices=[[4, 14], [16, 8]],
+            reshape_to=[4, 2, 8],
+            input_x_shape=[4, 5, 8],
+            dim=1,
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_slice_scatter_pattern.py b/tests/unittest/compiler/test_slice_scatter_pattern.py
index a537d11fe..01700d96b 100644
--- a/tests/unittest/compiler/test_slice_scatter_pattern.py
+++ b/tests/unittest/compiler/test_slice_scatter_pattern.py
@@ -22,6 +22,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
 
@@ -36,8 +40,8 @@ def _make_slice_ops(
         input_start_indices,
         input_end_indices,
         dim,
+        dtype,
         batch_sizes=None,
-        input_type="float16",
     ):
         Ys = []
         for idx, (input_shape, start_indices, end_indices) in enumerate(
@@ -46,16 +50,14 @@ def _make_slice_ops(
             slice_op = ops.dynamic_slice()
             X_name = "input_{}".format(idx)
             if batch_sizes is None:
-                X = Tensor(
-                    shape=input_shape, dtype=input_type, name=X_name, is_input=True
-                )
+                X = Tensor(shape=input_shape, dtype=dtype, name=X_name, is_input=True)
             else:
                 X = Tensor(
                     shape=[
                         IntVar(values=batch_sizes, name="input_batch_{}".format(idx)),
                         *input_shape,
                     ],
-                    dtype=input_type,
+                    dtype=dtype,
                     name=X_name,
                     is_input=True,
                 )
@@ -69,10 +71,16 @@ def _make_test_graph(
         input_start_indices,
         input_end_indices,
         dim,
+        dtype,
         batch_sizes=None,
     ):
         Ys = self._make_slice_ops(
-            input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+            input_shapes,
+            input_start_indices,
+            input_end_indices,
+            dim,
+            dtype,
+            batch_sizes,
         )
         concat_op = ops.concatenate()
         Y = concat_op(Ys, dim)
@@ -86,10 +94,16 @@ def _graph_transformation_test(
         input_start_indices,
         input_end_indices,
         dim,
+        dtype,
         batch_sizes=None,
     ):
         graph = self._make_test_graph(
-            input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+            input_shapes,
+            input_start_indices,
+            input_end_indices,
+            dim,
+            dtype,
+            batch_sizes,
         )
         graph = transform.toposort(graph)
         transform.name_graph(graph)
@@ -107,7 +121,9 @@ def _graph_transformation_test(
         for idx, x in enumerate(fused_op._attrs["inputs"]):
             self.assertEqual(x._attrs["name"], "input_{}".format(idx))
 
-    def _e2e_test(self, input_shapes, input_start_indices, input_end_indices, dim):
+    def _e2e_test(
+        self, input_shapes, input_start_indices, input_end_indices, dim, dtype
+    ):
         logging.info(
             "e2e test with input_shapes {}, start_indices {}, end_indices {}".format(
                 input_shapes, input_start_indices, input_end_indices
@@ -121,7 +137,7 @@ def _e2e_test(self, input_shapes, input_start_indices, input_end_indices, dim):
         for input_shape, start_indices, end_indices in zip(
             input_shapes, input_start_indices, input_end_indices
         ):
-            X_pt = torch.randn(input_shape).cuda().half()
+            X_pt = get_random_torch_tensor(input_shape, dtype)
             Xs_pt.append(X_pt)
             slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
             Y_pt = X_pt[slice_indices]
@@ -129,7 +145,7 @@ def _e2e_test(self, input_shapes, input_start_indices, input_end_indices, dim):
         Y_pt = torch.cat(Ys_pt, dim)
 
         Y = self._make_test_graph(
-            input_shapes, input_start_indices, input_end_indices, dim
+            input_shapes, input_start_indices, input_end_indices, dim, dtype
         )
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         logging.info(
@@ -146,13 +162,19 @@ def _e2e_test(self, input_shapes, input_start_indices, input_end_indices, dim):
         inputs = [0 for i in range(len(Xs_pt))]
         for i, X_pt in enumerate(Xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = X_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
     def _e2e_batch_test(
-        self, input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+        self,
+        input_shapes,
+        input_start_indices,
+        input_end_indices,
+        dim,
+        dtype,
+        batch_sizes,
     ):
         logging.info(
             "e2e batch test with batch_sizes {}, input_shapes{}, "
@@ -164,7 +186,12 @@ def _e2e_batch_test(
         target = detect_target()
 
         Y = self._make_test_graph(
-            input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+            input_shapes,
+            input_start_indices,
+            input_end_indices,
+            dim,
+            dtype,
+            batch_sizes,
         )
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
 
@@ -181,7 +208,7 @@ def _e2e_batch_test(
             for input_shape, start_indices, end_indices in zip(
                 input_shapes, input_start_indices, input_end_indices
             ):
-                X_pt = torch.randn([batch, *input_shape]).cuda().half()
+                X_pt = get_random_torch_tensor([batch, *input_shape], dtype)
                 Xs_pt.append(X_pt)
                 slice_indices = [
                     slice(i, j) for i, j in zip(start_indices, end_indices)
@@ -193,27 +220,50 @@ def _e2e_batch_test(
             inputs = [0 for i in range(len(Xs_pt))]
             for i, X_pt in enumerate(Xs_pt):
                 inputs[input_name_to_index[f"input_{i}"]] = X_pt
-            y = torch.empty(y_shape).cuda().half()
+            y = get_torch_empty_tensor(y_shape, dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
             self.test_count += 1
 
     def _run_one_test(
-        self, *, input_shapes, input_start_indices, input_end_indices, dim
+        self,
+        *,
+        input_shapes,
+        input_start_indices,
+        input_end_indices,
+        dim,
+        dtype="float16",
     ):
         self._graph_transformation_test(
-            input_shapes, input_start_indices, input_end_indices, dim
+            input_shapes, input_start_indices, input_end_indices, dim, dtype
         )
-        self._e2e_test(input_shapes, input_start_indices, input_end_indices, dim)
+        self._e2e_test(input_shapes, input_start_indices, input_end_indices, dim, dtype)
 
     def _run_one_batch_test(
-        self, *, batch_sizes, input_shapes, input_start_indices, input_end_indices, dim
+        self,
+        *,
+        batch_sizes,
+        input_shapes,
+        input_start_indices,
+        input_end_indices,
+        dim,
+        dtype="float16",
     ):
         self._graph_transformation_test(
-            input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+            input_shapes,
+            input_start_indices,
+            input_end_indices,
+            dim,
+            dtype,
+            batch_sizes,
         )
         self._e2e_batch_test(
-            input_shapes, input_start_indices, input_end_indices, dim, batch_sizes
+            input_shapes,
+            input_start_indices,
+            input_end_indices,
+            dim,
+            dtype,
+            batch_sizes,
         )
 
     def test_slice_scatter(self):
@@ -258,21 +308,22 @@ def _make_test_graph_multi_dsts(
         input_start_indices,
         input_end_indices,
         dim,
+        dtype,
     ):
         Ys = self._make_slice_ops(
             input_shapes,
             input_start_indices,
             input_end_indices,
             dim,
+            dtype,
         )
-        input_type = "float16"
         # make the first input tensor have multiple uses
         slice_op_0 = list(Ys[0].src_ops())[0]
         X0 = slice_op_0._attrs["inputs"][0]
         X0_shape = [d._attrs["values"][0] for d in X0._attrs["shape"]]
         num_slice_inputs = len(input_shapes)
         X1_name = f"input_{num_slice_inputs}"
-        X1 = Tensor(shape=X0_shape, dtype=input_type, name=X1_name, is_input=True)
+        X1 = Tensor(shape=X0_shape, dtype=dtype, name=X1_name, is_input=True)
         concat_op = ops.concatenate()
         Y0 = concat_op(Ys, dim)
         Y0._attrs["name"] = "output_0"
@@ -286,7 +337,13 @@ def _make_test_graph_multi_dsts(
         return (Y0, Y1)
 
     def _test_slice_scatter_multi_dsts(
-        self, *, input_shapes, input_start_indices, input_end_indices, dim
+        self,
+        *,
+        input_shapes,
+        input_start_indices,
+        input_end_indices,
+        dim,
+        dtype="float16",
     ):
         """test cases where a tensor being sliced has multiple dsts"""
 
@@ -301,7 +358,7 @@ def _test_slice_scatter_multi_dsts(
         for input_shape, start_indices, end_indices in zip(
             input_shapes, input_start_indices, input_end_indices
         ):
-            X_pt = torch.randn(input_shape).cuda().half()
+            X_pt = get_random_torch_tensor(input_shape, dtype)
             Xs_pt.append(X_pt)
             slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
             Y_pt = X_pt[slice_indices]
@@ -309,12 +366,12 @@ def _test_slice_scatter_multi_dsts(
         Y0_pt = torch.cat(Ys_pt, dim)
 
         input0_shape = Xs_pt[0].size()
-        other_X_pt = torch.randn(input0_shape).cuda().half()
+        other_X_pt = get_random_torch_tensor(input0_shape, dtype)
         Xs_pt.append(other_X_pt)
         Y1_pt = Xs_pt[0] + other_X_pt
 
         Y0, Y1 = self._make_test_graph_multi_dsts(
-            input_shapes, input_start_indices, input_end_indices, dim
+            input_shapes, input_start_indices, input_end_indices, dim, dtype
         )
 
         y0_shape = [var._attrs["values"][0] for var in Y0._attrs["shape"]]
@@ -340,8 +397,8 @@ def _test_slice_scatter_multi_dsts(
         inputs = [0 for i in range(len(Xs_pt))]
         for i, X_pt in enumerate(Xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = X_pt
-        y0 = torch.empty(y0_shape).cuda().half()
-        y1 = torch.empty(y1_shape).cuda().half()
+        y0 = get_torch_empty_tensor(y0_shape, dtype)
+        y1 = get_torch_empty_tensor(y1_shape, dtype)
         module.run_with_tensors(inputs, [y0, y1])
         self.assertTrue(torch.allclose(Y0_pt, y0, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-2, rtol=1e-2))
@@ -361,6 +418,7 @@ def _make_test_graph_multi_dsts_2(
         input_start_indices,
         input_end_indices,
         dim,
+        dtype,
     ):
         """Make a graph where (1) a tensor is sliced twice and both slices are
         fed into the same concat op, and (2) another sliced output (i.e not
@@ -372,6 +430,7 @@ def _make_test_graph_multi_dsts_2(
             input_start_indices,
             input_end_indices,
             dim,
+            dtype,
         )
         slice_op_0 = list(Ys[0].src_ops())[0]
         X0 = slice_op_0._attrs["inputs"][0]
@@ -394,7 +453,13 @@ def _make_test_graph_multi_dsts_2(
         return Y
 
     def _test_slice_scatter_multi_dsts_2(
-        self, *, input_shapes, input_start_indices, input_end_indices, dim
+        self,
+        *,
+        input_shapes,
+        input_start_indices,
+        input_end_indices,
+        dim,
+        dtype="float16",
     ):
         logging.info(
             f"multi_dsts_2 e2e test with input_shapes: {input_shapes}, "
@@ -407,7 +472,7 @@ def _test_slice_scatter_multi_dsts_2(
         for input_shape, start_indices, end_indices in zip(
             input_shapes, input_start_indices, input_end_indices
         ):
-            X_pt = torch.randn(input_shape).cuda().half()
+            X_pt = get_random_torch_tensor(input_shape, dtype)
             Xs_pt.append(X_pt)
             slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
             Y_pt = X_pt[slice_indices]
@@ -425,7 +490,7 @@ def _test_slice_scatter_multi_dsts_2(
         Y_pt = torch.cat(Ys_pt, dim)
 
         Y = self._make_test_graph_multi_dsts_2(
-            input_shapes, input_start_indices, input_end_indices, dim
+            input_shapes, input_start_indices, input_end_indices, dim, dtype
         )
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
@@ -449,7 +514,7 @@ def _test_slice_scatter_multi_dsts_2(
         inputs = [0 for i in range(len(Xs_pt))]
         for i, X_pt in enumerate(Xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = X_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -468,6 +533,38 @@ def test_slice_scatter_multi_dsts_2(self):
             dim=2,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_slice_scatter_float(self):
+        self._run_one_test(
+            input_shapes=[[2, 3, 5], [3, 7, 10]],
+            input_start_indices=[[0, 1, 0], [0, 1, 3]],
+            input_end_indices=[[2, 2, 4], [2, 7, 7]],
+            dim=1,
+            dtype="float",
+        )
+        self._run_one_batch_test(
+            batch_sizes=[1024, 4, 128],
+            input_shapes=[[3], [3]],
+            input_start_indices=[[1, 1], [0, 0]],
+            input_end_indices=[[2, 3], [1, 2]],
+            dim=0,
+            dtype="float",
+        )
+        self._test_slice_scatter_multi_dsts(
+            input_shapes=[[4, 3, 4], [3, 7, 10]],
+            input_start_indices=[[1, 0, -3], [0, 2, 1]],
+            input_end_indices=[[3, 3, 4], [2, 5, -1]],
+            dim=2,
+            dtype="float",
+        )
+        self._test_slice_scatter_multi_dsts_2(
+            input_shapes=[[2, 3, 5], [3, 7, 10]],
+            input_start_indices=[[0, 1, 0], [0, 1, 3]],
+            input_end_indices=[[2, 2, 4], [2, 7, 7]],
+            dim=1,
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index 65ebc9393..6c3267ad4 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -18,26 +18,43 @@
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import IntVar
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
+from parameterized import parameterized
+
+
+SKIP_FLOAT = detect_target().name() == "rocm" or int(detect_target()._arch) < 80
+
 
 class SliceViewStridedOpTestCase(unittest.TestCase):
-    def test_slice_view_gemm_fusible(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_view_gemm_fusible(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
 
-        input0 = test_utils.gen_input_tensor([batch_dim, 2 * N, N], name="input0")
+        input0 = test_utils.gen_input_tensor(
+            [batch_dim, 2 * N, N], dtype=dtype, name="input0"
+        )
         X0 = ops.dynamic_slice()(input0, [None, None, None], [None, N, None])
         X1 = ops.reshape()(X0, [-1, N * N])
-        input1 = test_utils.gen_input_tensor([N, N * N], name="input1")
+        input1 = test_utils.gen_input_tensor([N, N * N], dtype=dtype, name="input1")
         Y = ops.gemm_rcr()(X1, input1)
         Y._attrs["name"] = "output0"
         Y._attrs["is_output"] = True
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", "slice_reshape_gemm_fusible")
+        module = compile_model(
+            [Y], target, "./tmp", f"slice_reshape_gemm_fusible_{dtype}"
+        )
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -48,12 +65,12 @@ def test_slice_view_gemm_fusible(self):
         # Prepare PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
             # Run PyTorch baseline.
-            input0_pt = torch.randn([batch_size, 2 * N, N]).cuda().half()
+            input0_pt = get_random_torch_tensor([batch_size, 2 * N, N], dtype)
             x0_pt = input0_pt[:, :N, :]
             x1_pt = torch.reshape(x0_pt, [-1, N * N])
-            input1_pt = torch.rand([N, N * N]).cuda().half()
+            input1_pt = get_random_torch_tensor([N, N * N], dtype)
             y_pt = torch.nn.functional.linear(x1_pt, input1_pt)
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors(
@@ -70,21 +87,29 @@ def test_slice_view_gemm_fusible(self):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
-    def test_slice_view_gemm_non_fusible(self):
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_view_gemm_non_fusible(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
 
-        input0 = test_utils.gen_input_tensor([batch_dim, N, 2 * N], name="input0")
+        input0 = test_utils.gen_input_tensor(
+            [batch_dim, N, 2 * N], dtype=dtype, name="input0"
+        )
         X0 = ops.dynamic_slice()(input0, [None, None, None], [None, None, N])
         X1 = ops.reshape()(X0, [-1, N * N])
-        input1 = test_utils.gen_input_tensor([N, N * N], name="input1")
+        input1 = test_utils.gen_input_tensor([N, N * N], dtype=dtype, name="input1")
         Y = ops.gemm_rcr()(X1, input1)
         Y._attrs["name"] = "output0"
         Y._attrs["is_output"] = True
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", "slice_reshape_gemm_non_fusible")
+        module = compile_model(
+            [Y], target, "./tmp", f"slice_reshape_gemm_non_fusible_{dtype}"
+        )
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -95,12 +120,12 @@ def test_slice_view_gemm_non_fusible(self):
         # Prepare PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
             # Run PyTorch baseline.
-            input0_pt = torch.randn([batch_size, N, 2 * N]).cuda().half()
+            input0_pt = get_random_torch_tensor([batch_size, N, 2 * N], dtype)
             x0_pt = input0_pt[:, :, :N]
             x1_pt = torch.reshape(x0_pt, [-1, N * N])
-            input1_pt = torch.rand([N, N * N]).cuda().half()
+            input1_pt = get_random_torch_tensor([N, N * N], dtype)
             y_pt = torch.nn.functional.linear(x1_pt, input1_pt)
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors(
@@ -117,6 +142,298 @@ def test_slice_view_gemm_non_fusible(self):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_flatten_concat_fusible_1(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        test_name = f"slice_flatten_concat_fusible_{dtype}"
+        batch_dim = IntVar([3, 10], "batch_size")
+        X0 = test_utils.gen_input_tensor([batch_dim, 12, 1], dtype=dtype, name="x0")
+        X1 = test_utils.gen_input_tensor([batch_dim, 12, 1], dtype=dtype, name="x1")
+        X2 = test_utils.gen_input_tensor([batch_dim, 10], dtype=dtype, name="x2")
+        A = test_utils.gen_input_tensor([batch_dim, 8, 48], dtype=dtype, name="a")
+        B = test_utils.gen_input_tensor([batch_dim, 48, 40], dtype=dtype, name="b")
+
+        start_indices = [0, 0, 0]
+        end_indices = [None, None, 39]
+        squeeze_dim = 2
+        cat_dim = 1
+        flatten_start_dim = 1
+        flatten_end_dim = -1
+
+        Y0 = ops.bmm_rrr()(A, B)
+        Y1 = ops.dynamic_slice()(Y0, start_indices, end_indices)
+        Y2 = ops.flatten(start_dim=flatten_start_dim, end_dim=flatten_end_dim)(Y1)
+        Y3 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        Y4 = ops.squeeze(squeeze_dim)(Y3)
+        Y = ops.concatenate()([X2, Y2, Y4], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model([Y], target, "./tmp", test_name)
+
+        # Verify the generated graph.
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), 7)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+
+        # Prepare PyTorch tensors.
+        for batch_size in batch_dim._attrs["values"]:
+            # Run PyTorch baseline.
+            x0_pt = get_random_torch_tensor([batch_size, 12, 1], dtype)
+            x1_pt = get_random_torch_tensor([batch_size, 12, 1], dtype)
+            x2_pt = get_random_torch_tensor([batch_size, 10], dtype)
+            a_pt = get_random_torch_tensor([batch_size, 8, 48], dtype)
+            b_pt = get_random_torch_tensor([batch_size, 48, 40], dtype)
+            slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+            y0_pt = torch.bmm(a_pt, b_pt)
+            y1_pt = y0_pt[slice_indices]
+            y2_pt = torch.flatten(
+                y1_pt, start_dim=flatten_start_dim, end_dim=flatten_end_dim
+            )
+            y3_pt = x0_pt + x1_pt
+            y4_pt = torch.squeeze(y3_pt, dim=squeeze_dim)
+            y_pt = torch.cat([x2_pt, y2_pt, y4_pt], dim=cat_dim)
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
+
+            # Run AITemplate module.
+            module.run_with_tensors(
+                {
+                    "x0": x0_pt,
+                    "x1": x1_pt,
+                    "x2": x2_pt,
+                    "a": a_pt,
+                    "b": b_pt,
+                },
+                [y],
+            )
+
+            # Do comparisons.
+            self.assertTrue(
+                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
+                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
+            )
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_flatten_concat_fusible_2(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        test_name = f"slice_flatten_concat_fusible_{dtype}_2"
+        batch_dim = IntVar([1, 2], "batch_size")
+        X0 = test_utils.gen_input_tensor([batch_dim, 2, 1], dtype=dtype, name="x0")
+        X1 = test_utils.gen_input_tensor([batch_dim, 2, 1], dtype=dtype, name="x1")
+        X2 = test_utils.gen_input_tensor([batch_dim, 1], dtype=dtype, name="x2")
+        A = test_utils.gen_input_tensor([batch_dim, 2, 1], dtype=dtype, name="a")
+        B = test_utils.gen_input_tensor([batch_dim, 1, 2], dtype=dtype, name="b")
+
+        start_indices = [0, 0, 0]
+        end_indices = [None, None, 3]
+        reshape_to = [-1, 2]
+        cat_dim = 1
+        flatten_start_dim = 1
+        flatten_end_dim = -1
+
+        Y0 = ops.bmm_rrr()(A, B)
+        Y1 = ops.dynamic_slice()(Y0, start_indices, end_indices)
+        Y2 = ops.flatten(start_dim=flatten_start_dim, end_dim=flatten_end_dim)(Y1)
+        Y3 = X0 + X1
+        Y4 = ops.reshape()(Y3, reshape_to)
+        Y = ops.concatenate()([Y4, Y2, X2, Y4], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model([Y], target, "./tmp", test_name)
+
+        # Prepare PyTorch tensors.
+        for batch_size in batch_dim._attrs["values"]:
+            # Run PyTorch baseline.
+            x0_pt = get_random_torch_tensor([batch_size, 2, 1], dtype)
+            x1_pt = get_random_torch_tensor([batch_size, 2, 1], dtype)
+            x2_pt = get_random_torch_tensor([batch_size, 1], dtype)
+            a_pt = get_random_torch_tensor([batch_size, 2, 1], dtype)
+            b_pt = get_random_torch_tensor([batch_size, 1, 2], dtype)
+            slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+            y0_pt = torch.bmm(a_pt, b_pt)
+            y1_pt = y0_pt[slice_indices]
+            y2_pt = torch.flatten(
+                y1_pt, start_dim=flatten_start_dim, end_dim=flatten_end_dim
+            )
+            y3_pt = x0_pt + x1_pt
+            y4_pt = y3_pt.reshape(*reshape_to)
+            y_pt = torch.cat([y4_pt, y2_pt, x2_pt, y4_pt], dim=cat_dim)
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
+
+            # Run AITemplate module.
+            module.run_with_tensors(
+                {
+                    "x0": x0_pt,
+                    "x1": x1_pt,
+                    "x2": x2_pt,
+                    "a": a_pt,
+                    "b": b_pt,
+                },
+                [y],
+            )
+
+            # Do comparisons.
+            self.assertTrue(
+                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
+                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
+            )
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_reshape_concat_fusible_1(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        test_name = f"slice_reshape_concat_fusible_{dtype}_1"
+        batch_dim = IntVar([1, 2], "batch_size")
+        M = 2
+        N = 2
+        K = 1
+
+        X0 = test_utils.gen_input_tensor([batch_dim, 1], dtype=dtype, name="x0")
+        X1 = test_utils.gen_input_tensor([batch_dim, 1], dtype=dtype, name="x1")
+        A = test_utils.gen_input_tensor([batch_dim, K, M], dtype=dtype, name="a")
+        B = test_utils.gen_input_tensor([batch_dim, K, N], dtype=dtype, name="b")
+        D = test_utils.gen_input_tensor([N], dtype=dtype, name="d")
+
+        start_indices = [0, 0, 0]
+        end_indices = [None, None, 1]
+        reshape_to = [-1, M * (N - 1)]
+        cat_dim = 1
+
+        Y0 = ops.bmm_crr_add()(A, B, D)
+        Y1 = ops.dynamic_slice()(Y0, start_indices, end_indices)
+        Y2 = ops.reshape()(Y1, reshape_to)
+        Y3 = ops.concatenate()([Y2, X0], dim=cat_dim)
+        Y = ops.concatenate()([Y3, X1], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model([Y], target, "./tmp", test_name)
+
+        # Prepare PyTorch tensors.
+        for batch_size in batch_dim._attrs["values"]:
+            # Run PyTorch baseline.
+            x0_pt = get_random_torch_tensor([batch_size, 1], dtype)
+            x1_pt = get_random_torch_tensor([batch_size, 1], dtype)
+            a_pt = get_random_torch_tensor([batch_size, K, M], dtype)
+            b_pt = get_random_torch_tensor([batch_size, K, N], dtype)
+            d_pt = get_random_torch_tensor([N], dtype)
+            slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+            y0_pt = torch.bmm(a_pt.permute([0, 2, 1]), b_pt)
+            y0_pt = y0_pt + d_pt
+            y1_pt = y0_pt[slice_indices]
+            y2_pt = y1_pt.reshape(*reshape_to)
+            y3_pt = torch.cat([y2_pt, x0_pt], dim=cat_dim)
+            y_pt = torch.cat([y3_pt, x1_pt], dim=cat_dim)
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
+
+            # Run AITemplate module.
+            module.run_with_tensors(
+                {
+                    "x0": x0_pt,
+                    "x1": x1_pt,
+                    "a": a_pt,
+                    "b": b_pt,
+                    "d": d_pt,
+                },
+                [y],
+            )
+
+            # Do comparisons.
+            self.assertTrue(
+                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
+                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
+            )
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_slice_reshape_concat_fusible_2(self, dtype):
+        if dtype == "float" and SKIP_FLOAT:
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        test_name = "slice_reshape_concat_fusible_{dtype}_2"
+        batch_dim = IntVar([1, 8], "batch_size")
+        M = 8
+        N = 64
+        K = 4
+
+        K1_0 = 32 * 8
+        K1_1 = 3
+
+        # K1 = 259: need padding
+        K1 = K1_0 + K1_1
+        N1 = 256
+
+        X0 = test_utils.gen_input_tensor([batch_dim, M, K], dtype=dtype, name="x0")
+        W0 = test_utils.gen_input_tensor([N, K], dtype=dtype, name="w0")
+        X1 = test_utils.gen_input_tensor([batch_dim, K1_1], dtype=dtype, name="x1")
+        W1 = test_utils.gen_input_tensor([N1, K1], dtype=dtype, name="w1")
+
+        start_indices = [0, 0, 32]
+        end_indices = [None, None, 64]
+        reshape_to = [-1, K1_0]
+        cat_dim = 1
+
+        Y0 = ops.gemm_rcr()(X0, W0)
+        Y1 = ops.dynamic_slice()(Y0, start_indices, end_indices)
+        Y2 = ops.reshape()(Y1, reshape_to)
+        Y3 = ops.concatenate()([Y2, X1], dim=cat_dim)
+        Y = ops.gemm_rcr()(Y3, W1)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model([Y], target, "./tmp", test_name)
+
+        # Prepare PyTorch tensors.
+        for batch_size in batch_dim._attrs["values"]:
+            # Run PyTorch baseline.
+            x0_pt = get_random_torch_tensor([batch_size, M, K], dtype)
+            w0_pt = get_random_torch_tensor([N, K], dtype)
+            x1_pt = get_random_torch_tensor([batch_size, K1_1], dtype)
+            w1_pt = get_random_torch_tensor([N1, K1], dtype)
+            slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+            y0_pt = torch.nn.functional.linear(x0_pt, w0_pt)
+            y1_pt = y0_pt[slice_indices]
+            y2_pt = y1_pt.reshape(*reshape_to)
+            y3_pt = torch.cat([y2_pt, x1_pt], dim=cat_dim)
+            y_pt = torch.nn.functional.linear(y3_pt, w1_pt)
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
+
+            # Run AITemplate module.
+            module.run_with_tensors(
+                {
+                    "x0": x0_pt,
+                    "w0": w0_pt,
+                    "x1": x1_pt,
+                    "w1": w1_pt,
+                },
+                [y],
+            )
+
+            # Do comparisons.
+            self.assertTrue(
+                torch.allclose(y, y_pt, atol=5e-2, rtol=5e-2),
+                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
+            )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_split_bmm_fusion.py b/tests/unittest/compiler/test_split_bmm_fusion.py
index 0a07920b0..c3921cbd8 100644
--- a/tests/unittest/compiler/test_split_bmm_fusion.py
+++ b/tests/unittest/compiler/test_split_bmm_fusion.py
@@ -20,7 +20,11 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import has_op
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    has_op,
+)
 from aitemplate.utils import graph_utils
 
 
@@ -37,9 +41,8 @@ def _test_split_bmm_rcr_fusion(
         split_dim,
         testname,
         with_padding=False,
+        dtype="float16",
     ):
-        dtype = "float16"
-
         T_A = Tensor(
             shape=[B, M, K],
             dtype=dtype,
@@ -67,8 +70,8 @@ def _test_split_bmm_rcr_fusion(
         Y._attrs["name"] = "output"
         Y._attrs["is_output"] = True
 
-        a = torch.randn(B, M, K).cuda().half()
-        b = torch.randn(B, N, K).cuda().half()
+        a = get_random_torch_tensor([B, M, K], dtype)
+        b = get_random_torch_tensor([B, N, K], dtype)
         xs = a.split(split_size_or_sections, split_dim)
         ys = b.split(split_size_or_sections, split_dim)
         cs = []
@@ -88,7 +91,7 @@ def _test_split_bmm_rcr_fusion(
                 f"The final graph should have only 3 tensors. "
                 f"But it has {len(graph)} tensors now."
             )
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors({"input0": a, "input1": b}, [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
@@ -158,8 +161,8 @@ def _test_split_bmm_rcr_fusion_dynamic_M(
         split_size_or_sections,
         split_dim,
         testname,
+        dtype="float16",
     ):
-        dtype = "float16"
         assert isinstance(Ms, (list, tuple))
 
         T_A = Tensor(
@@ -199,8 +202,8 @@ def _test_split_bmm_rcr_fusion_dynamic_M(
         )
 
         for M in Ms:
-            a = torch.randn(B, M, K).cuda().half()
-            b = torch.randn(B, N, K).cuda().half()
+            a = get_random_torch_tensor([B, M, K], dtype)
+            b = get_random_torch_tensor([B, N, K], dtype)
             xs = a.split(split_size_or_sections, split_dim)
             ys = b.split(split_size_or_sections, split_dim)
             cs = []
@@ -211,7 +214,7 @@ def _test_split_bmm_rcr_fusion_dynamic_M(
                 cs.append(c)
             y_pt = torch.cat(cs, dim=split_dim)
 
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors({"input0": a, "input1": b}, [y])
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
@@ -249,17 +252,16 @@ def _test_split_bmm_rcr_fusion_qkv(
         split_size_or_sections,
         split_dim=0,
         testname="test_split_qkv",
+        dtype="float16",
         should_fail=False,
     ):
-        dtype = "float16"
-
         X = Tensor(
             shape=[B, M, K],
             dtype=dtype,
             name="input0",
             is_input=True,
         )
-        scale = Tensor(shape=[], dtype="float16", name="scale", value=K**-0.5)
+        scale = Tensor(shape=[], dtype=dtype, name="scale", value=K**-0.5)
 
         (Q, KK, V) = ops.split()(X, split_size_or_sections, split_dim)
         QK = ops.bmm_rcr()(Q, KK)
@@ -269,7 +271,7 @@ def _test_split_bmm_rcr_fusion_qkv(
         Y._attrs["name"] = "output"
         Y._attrs["is_output"] = True
 
-        a = torch.randn(B, M, K).cuda().half()
+        a = get_random_torch_tensor([B, M, K], dtype)
         (q, k, v) = a.split(split_size_or_sections, split_dim)
         qk = torch.bmm(q, k.permute(0, 2, 1)) * K**-0.5
         qk = torch.softmax(qk, -1)
@@ -286,7 +288,7 @@ def _test_split_bmm_rcr_fusion_qkv(
         else:
             assert not has_op(sorted_ops, "split"), "The final graph has split op!"
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors({"input0": a}, [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
@@ -294,6 +296,88 @@ def test_split_bmm_rcr_fusion_qkv(self):
         self._test_split_bmm_rcr_fusion_qkv(3, 4096, 4096, 512, 1, 1)
         self._test_split_bmm_rcr_fusion_qkv(3 * 16, 1024, 1024, 256, 16, 16)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_split_bmm_fusion_float(self):
+        # bmm_rcr (K with an odd value) with padding:
+        # in this case, split and bmm_rcr are not going to be fused actually because
+        # of the padding applied to bmm_rcr.
+        self._test_split_bmm_rcr_fusion(
+            ops.bmm_rcr,
+            1,
+            10000,
+            3,
+            5,
+            [2, 3],
+            2,
+            "test_split_bmm_rcr",
+            with_padding=True,
+            dtype="float",
+        )
+        # bmm_rcr_n1, split_dim = 2
+        self._test_split_bmm_rcr_fusion(
+            ops.bmm_rcr_n1,
+            1,
+            10000,
+            1,
+            5,
+            [2, 3],
+            2,
+            "test_split_bmm_rcr_float",
+            dtype="float",
+        )
+        # bmm_rcr
+        self._test_split_bmm_rcr_fusion(
+            ops.bmm_rcr,
+            10,
+            8,
+            32,
+            16 * 2,
+            16,
+            2,
+            "test_split_bmm_rcr_float",
+            dtype="float",
+        )
+        # bmm_rcr, split_dim = 0, can only be static
+        self._test_split_bmm_rcr_fusion(
+            ops.bmm_rcr,
+            10,
+            8,
+            32,
+            16 * 2,
+            32,
+            0,
+            "test_split_bmm_rcr_float",
+            dtype="float",
+        )
+        # bmm_rcr_n1
+        self._test_split_bmm_rcr_fusion_dynamic_M(
+            ops.bmm_rcr_n1,
+            1,
+            [100, 160],
+            1,
+            32,
+            8,
+            2,
+            "test_split_bmm_rcr_n1_dynamic_M_float",
+            dtype="float",
+        )
+        # bmm_rcr
+        self._test_split_bmm_rcr_fusion_dynamic_M(
+            ops.bmm_rcr,
+            10,
+            [8, 16],
+            32,
+            16 * 2,
+            16,
+            2,
+            "test_split_bmm_rcr_dynamic_M_float",
+            dtype="float",
+        )
+        self._test_split_bmm_rcr_fusion_qkv(3 * 16, 10, 10, 8, 16, 16, dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_split_bmm_softmax_bmm.py b/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
index 519a035b2..2e3d846de 100644
--- a/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
+++ b/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
@@ -23,13 +23,17 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "cuda", "Only supported by ROCM.")
 class SplitBMMTestCase(unittest.TestCase):
     def _test_split_reshape_bmm_permute(
-        self, bs, nheads, seq_len, hidden_size, test_name
+        self, bs, nheads, seq_len, hidden_size, test_name, dtype="float16"
     ):
         target = detect_target()
         head_dim = hidden_size // nheads
@@ -37,7 +41,7 @@ def _test_split_reshape_bmm_permute(
 
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         input_shape = [3, batch_dim, nheads, seq_len, head_dim]
-        X = Tensor(shape=input_shape, dtype="float16", name="input_0", is_input=True)
+        X = Tensor(shape=input_shape, dtype=dtype, name="input_0", is_input=True)
         (Q, K, V) = ops.split()(X, 1, dim=0)
 
         OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=scale)
@@ -60,7 +64,7 @@ def _test_split_reshape_bmm_permute(
 
         for b in bs:
             input_shape = [3, b, nheads, seq_len, head_dim]
-            x_pt = torch.randn(*input_shape).cuda().half()
+            x_pt = get_random_torch_tensor(input_shape, dtype)
             (q_pt, k_pt, v_pt) = torch.split(x_pt, 1, dim=0)
             q_pt = q_pt.reshape(-1, seq_len, head_dim)
             k_pt = k_pt.reshape(-1, seq_len, head_dim)
@@ -74,7 +78,7 @@ def _test_split_reshape_bmm_permute(
             y_r = y_l.reshape(b, nheads, seq_len, head_dim)
             y_pt = torch.permute(y_r, [0, 2, 1, 3])
 
-            y = torch.empty([b, seq_len, nheads, head_dim]).cuda().half()
+            y = get_torch_empty_tensor([b, seq_len, nheads, head_dim], dtype)
             module.run_with_tensors([x_pt], [y])
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-1, rtol=1e-1))
 
diff --git a/tests/unittest/compiler/test_split_large_concat.py b/tests/unittest/compiler/test_split_large_concat.py
index 10639a38b..682fed083 100644
--- a/tests/unittest/compiler/test_split_large_concat.py
+++ b/tests/unittest/compiler/test_split_large_concat.py
@@ -22,9 +22,14 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class SplitLargeConcatTestCase(unittest.TestCase):
@@ -32,16 +37,15 @@ def __init__(self, *args, **kwargs):
         super(SplitLargeConcatTestCase, self).__init__(*args, **kwargs)
         self.test_count = 0
 
-    def _make_tensors(self, num_inputs, input_shape, input_names=None):
+    def _make_tensors(self, num_inputs, input_shape, dtype, input_names=None):
         if input_names is not None:
             assert num_inputs == len(input_names)
         input_tensors = []
-        input_type = "float16"
         for i in range(num_inputs):
             name = input_names[i] if input_names is not None else f"input_{i}"
             t = Tensor(
                 shape=input_shape,
-                dtype=input_type,
+                dtype=dtype,
                 name=name,
                 is_input=True,
             )
@@ -49,12 +53,12 @@ def _make_tensors(self, num_inputs, input_shape, input_names=None):
         return input_tensors
 
     def _test_split_large_concat_simple(
-        self, cat_dim, num_inputs, input_shape, split_count, test_name
+        self, cat_dim, num_inputs, input_shape, split_count, test_name, dtype="float16"
     ):
         # a simple test: a concat takes num_inputs and the output of the concat
         # is a model output
-        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
-        input_tensors = self._make_tensors(num_inputs, input_shape)
+        _LOGGER.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
         concat_op = ops.concatenate()
         Y = concat_op(input_tensors, cat_dim)
         Y._attrs["name"] = "output_0"
@@ -70,7 +74,9 @@ def _test_split_large_concat_simple(
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), split_count)
 
-        inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(num_inputs)]
+        inputs_pt = [
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
+        ]
         y_pt = torch.cat(inputs_pt, cat_dim)
 
         # run ait
@@ -80,7 +86,7 @@ def _test_split_large_concat_simple(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -109,17 +115,17 @@ def test_split_large_concat_simple(self):
         )
 
     def _test_split_large_concat_with_add(
-        self, cat_dim, num_inputs, input_shape, test_name
+        self, cat_dim, num_inputs, input_shape, test_name, dtype="float16"
     ):
         # make a model like below:
         # y1 = concat(x1,x2...)
         # y = add(y1, x_n) where x_n is not used by concat
-        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
-        input_tensors = self._make_tensors(num_inputs, input_shape)
+        _LOGGER.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
         concat_op = ops.concatenate()
         Y1 = concat_op(input_tensors, cat_dim)
         x_n_shape = [1]
-        X_ns = self._make_tensors(1, x_n_shape, ["input_x_n"])
+        X_ns = self._make_tensors(1, x_n_shape, dtype, ["input_x_n"])
         X_n = X_ns[0]
         Y = ops.elementwise(FuncEnum.ADD)(Y1, X_n)
         Y._attrs["name"] = "output_0"
@@ -129,8 +135,10 @@ def _test_split_large_concat_with_add(
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
 
-        inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(num_inputs)]
-        x_n_pt = torch.randn(x_n_shape).cuda().half()
+        inputs_pt = [
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
+        ]
+        x_n_pt = get_random_torch_tensor(x_n_shape, dtype)
         y1_pt = torch.cat(inputs_pt, cat_dim)
         inputs_pt.append(x_n_pt)
         y_pt = y1_pt + x_n_pt
@@ -142,7 +150,7 @@ def _test_split_large_concat_with_add(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -156,22 +164,22 @@ def test_split_large_concat_with_add(self):
         )
 
     def _test_split_large_concat_with_strided_add(
-        self, cat_dim, num_inputs, input_shape, test_name
+        self, cat_dim, num_inputs, input_shape, test_name, dtype="float16"
     ):
         # make a model like below:
         # y1 = add(x1, x2)
         # y2 = concat(y1, x3, ...)
         # y = add(y1, x_n) where x_n is not used by concat
-        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        _LOGGER.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
         add_input_tensors = self._make_tensors(
-            2, input_shape, ["add_input_0", "add_input_1"]
+            2, input_shape, dtype, ["add_input_0", "add_input_1"]
         )
         Y1 = ops.elementwise(FuncEnum.ADD)(add_input_tensors[0], add_input_tensors[1])
-        concat_input_tensors = self._make_tensors(num_inputs, input_shape)
+        concat_input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
         concat_op = ops.concatenate()
         Y2 = concat_op([Y1] + concat_input_tensors, cat_dim)
         x_n_shape = [1]
-        X_ns = self._make_tensors(1, x_n_shape, ["input_x_n"])
+        X_ns = self._make_tensors(1, x_n_shape, dtype, ["input_x_n"])
         X_n = X_ns[0]
         Y = ops.elementwise(FuncEnum.ADD)(Y2, X_n)
         Y._attrs["name"] = "output_0"
@@ -181,12 +189,12 @@ def _test_split_large_concat_with_strided_add(
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
 
-        add_inputs_pt = [torch.randn(input_shape).cuda().half() for _ in range(2)]
+        add_inputs_pt = [get_random_torch_tensor(input_shape, dtype) for _ in range(2)]
         y1_pt = add_inputs_pt[0] + add_inputs_pt[1]
         concat_inputs_pt = [
-            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
         ]
-        x_n_pt = torch.randn(x_n_shape).cuda().half()
+        x_n_pt = get_random_torch_tensor(x_n_shape, dtype)
         y2_pt = torch.cat([y1_pt] + concat_inputs_pt, cat_dim)
         y_pt = y2_pt + x_n_pt
 
@@ -199,7 +207,7 @@ def _test_split_large_concat_with_strided_add(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -213,17 +221,17 @@ def test_split_large_concat_with_strided_add(self):
         )
 
     def _test_split_large_concat_with_strided_add_complex(
-        self, cat_dim, num_inputs, input_shape, test_name
+        self, cat_dim, num_inputs, input_shape, test_name, dtype="float16"
     ):
         # make a model like below:
         # a1 = add(x1, x2)
         # a2 = add(x3, x4)
         # ...
         # y = concat(a1, x1_1, a2, x1_2, ...)
-        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        _LOGGER.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
         add_input_tensor_names = [f"add_input_{i}" for i in range(num_inputs * 2)]
         add_input_tensors = self._make_tensors(
-            num_inputs * 2, input_shape, add_input_tensor_names
+            num_inputs * 2, input_shape, dtype, add_input_tensor_names
         )
         add_output_tensors = []
         for i in range(num_inputs):
@@ -231,7 +239,7 @@ def _test_split_large_concat_with_strided_add_complex(
                 add_input_tensors[i * 2], add_input_tensors[i * 2 + 1]
             )
             add_output_tensors.append(a)
-        other_input_tensors = self._make_tensors(num_inputs, input_shape)
+        other_input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
         concat_op = ops.concatenate()
         concat_input_tensors = []
         for i in range(num_inputs):
@@ -246,13 +254,13 @@ def _test_split_large_concat_with_strided_add_complex(
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
 
         add_inputs_pt = [
-            torch.randn(input_shape).cuda().half() for _ in range(num_inputs * 2)
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs * 2)
         ]
         add_outputs_pt = []
         for i in range(num_inputs):
             add_outputs_pt.append(add_inputs_pt[i * 2] + add_inputs_pt[i * 2 + 1])
         other_inputs_pt = [
-            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
         ]
         concat_inputs_pt = []
         for i in range(num_inputs):
@@ -269,7 +277,7 @@ def _test_split_large_concat_with_strided_add_complex(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -283,7 +291,7 @@ def test_split_large_concat_with_strided_add_complex(self):
         )
 
     def _test_split_large_concat_with_reuse(
-        self, cat_dim, num_inputs, input_shape, test_name
+        self, cat_dim, num_inputs, input_shape, test_name, dtype="float16"
     ):
         # make a model like below:
         # a1 = add(x1, x2)
@@ -293,10 +301,10 @@ def _test_split_large_concat_with_reuse(
         # other_inputs = [o1, o2...]
         # concat_input = shuffle([a1, a2...] + add_inputs[0:10] + other_inputs)
         # y = concat(concat_input)
-        logger.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
+        _LOGGER.info(f"test_split_large_concat with {num_inputs=}, {input_shape=}")
         add_input_tensor_names = [f"add_input_{i}" for i in range(num_inputs * 2)]
         add_input_tensors = self._make_tensors(
-            num_inputs * 2, input_shape, add_input_tensor_names
+            num_inputs * 2, input_shape, dtype, add_input_tensor_names
         )
         add_output_tensors = []
         for i in range(num_inputs):
@@ -304,7 +312,7 @@ def _test_split_large_concat_with_reuse(
                 add_input_tensors[i * 2], add_input_tensors[i * 2 + 1]
             )
             add_output_tensors.append(a)
-        other_input_tensors = self._make_tensors(num_inputs, input_shape)
+        other_input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
         add_inputs_shuffle = list(range(len(add_input_tensors)))
         random.shuffle(add_inputs_shuffle)
         add_inputs_for_concat = [add_input_tensors[i] for i in add_inputs_shuffle[0:10]]
@@ -326,14 +334,14 @@ def _test_split_large_concat_with_reuse(
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
 
         add_inputs_pt = [
-            torch.randn(input_shape).cuda().half() for _ in range(num_inputs * 2)
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs * 2)
         ]
         add_outputs_pt = []
         for i in range(num_inputs):
             add_outputs_pt.append(add_inputs_pt[i * 2] + add_inputs_pt[i * 2 + 1])
         add_inputs_for_concat_pt = [add_inputs_pt[i] for i in add_inputs_shuffle[0:10]]
         other_inputs_pt = [
-            torch.randn(input_shape).cuda().half() for _ in range(num_inputs)
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
         ]
         concat_inputs_pt = add_outputs_pt + other_inputs_pt + add_inputs_for_concat_pt
         real_concat_inputs_pt = [concat_inputs_pt[i] for i in concat_inputs_shuffle]
@@ -348,7 +356,7 @@ def _test_split_large_concat_with_reuse(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -371,6 +379,7 @@ def _test_split_large_concat_with_slice(
         num_add_inputs,
         add_input_shape,
         test_name,
+        dtype="float16",
     ):
         # make a model like below:
         # s1 = t1[:, 0:10]
@@ -383,7 +392,7 @@ def _test_split_large_concat_with_slice(
         # y = concat(concat_input)
         slice_input_tensor_names = [f"slice_input_{i}" for i in range(num_slice_inputs)]
         slice_input_tensors = self._make_tensors(
-            num_slice_inputs, slice_input_shape, slice_input_tensor_names
+            num_slice_inputs, slice_input_shape, dtype, slice_input_tensor_names
         )
         slice_output_tensors = []
         for slice_input_tensor in slice_input_tensors:
@@ -394,7 +403,7 @@ def _test_split_large_concat_with_slice(
 
         add_input_tensor_names = [f"add_input_{i}" for i in range(num_add_inputs * 2)]
         add_input_tensors = self._make_tensors(
-            num_add_inputs * 2, add_input_shape, add_input_tensor_names
+            num_add_inputs * 2, add_input_shape, dtype, add_input_tensor_names
         )
         add_output_tensors = []
         for i in range(num_add_inputs):
@@ -414,14 +423,14 @@ def _test_split_large_concat_with_slice(
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
 
         slice_inputs_pt = [
-            torch.randn(slice_input_shape).cuda().half()
+            get_random_torch_tensor(slice_input_shape, dtype)
             for _ in range(num_slice_inputs)
         ]
         slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
         slice_outputs_pt = [inp_pt[slice_indices] for inp_pt in slice_inputs_pt]
 
         add_inputs_pt = [
-            torch.randn(add_input_shape).cuda().half()
+            get_random_torch_tensor(add_input_shape, dtype)
             for _ in range(num_add_inputs * 2)
         ]
         add_outputs_pt = []
@@ -439,7 +448,7 @@ def _test_split_large_concat_with_slice(
         for x_name, x_pt in zip(input_names, inputs_pt):
             inputs[input_name_to_index[x_name]] = x_pt
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -456,6 +465,121 @@ def test_split_large_concat_with_slice(self):
             test_name="split_large_concat_with_dynamic_slice",
         )
 
+    def _test_split_large_concat_with_reshape(
+        self,
+        num_inputs,
+        input_shape,
+        reshape_shape,
+        cat_dim,
+        test_name,
+        dtype="float16",
+    ):
+        # make a model like below:
+        # x = Tensor([10, 2, 20])
+        # reshape_output = reshape(t1, [10, -1])
+        # t1 = Tensor([10, 40])
+        # ...
+        # tn = Tensor([10, 40])
+        # y = concat([x, t1, ..., tn])
+        X = Tensor(
+            shape=reshape_shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        reshape_output = ops.reshape()(X, input_shape)
+        normal_input_tensors = self._make_tensors(num_inputs, input_shape, dtype)
+        concat_input_tensors = [reshape_output] + normal_input_tensors
+        concat_op = ops.concatenate()
+        Y = concat_op(concat_input_tensors, cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+
+        x_pt = get_random_torch_tensor(reshape_shape, dtype)
+        reshape_output_pt = torch.reshape(x_pt, input_shape)
+        normal_inputs_pt = [
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_inputs)
+        ]
+        concat_inputs_pt = [reshape_output_pt] + normal_inputs_pt
+        y_pt = torch.cat(concat_inputs_pt, cat_dim)
+
+        # run ait
+        input_name_to_index = module.get_input_name_to_index_map()
+        inputs = [0 for i in range(len(concat_inputs_pt))]
+        input_names = [X._attrs["name"]] + [
+            i._attrs["name"] for i in normal_input_tensors
+        ]
+        for i_name, i_pt in zip(input_names, [x_pt] + normal_inputs_pt):
+            inputs[input_name_to_index[i_name]] = i_pt
+
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_split_large_concat_with_reshape(self):
+        self._test_split_large_concat_with_reshape(
+            num_inputs=180,
+            input_shape=(10, 40),
+            reshape_shape=(10, 2, 20),
+            cat_dim=1,
+            test_name="split_large_concat_with_reshape",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_large_concat_float(self):
+        self._test_split_large_concat_simple(
+            cat_dim=1,
+            num_inputs=35,
+            input_shape=(2, 3),
+            split_count=2,
+            test_name="split_large_concat_simple_float",
+            dtype="float",
+        )
+        self._test_split_large_concat_with_add(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3, 4),
+            test_name="split_large_concat_with_add_float",
+            dtype="float",
+        )
+        self._test_split_large_concat_with_strided_add(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_strided_add_float",
+            dtype="float",
+        )
+        self._test_split_large_concat_with_strided_add_complex(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_strided_add_complex_float",
+            dtype="float",
+        )
+        self._test_split_large_concat_with_reuse(
+            cat_dim=1,
+            num_inputs=136,
+            input_shape=(2, 3),
+            test_name="split_large_concat_with_reuse_float",
+            dtype="float",
+        )
+        self._test_split_large_concat_with_slice(
+            cat_dim=1,
+            num_slice_inputs=161,
+            slice_input_shape=(20, 20),
+            start_indices=[0, 0],
+            end_indices=[None, 10],
+            num_add_inputs=5,
+            add_input_shape=(20, 161 * 10),
+            test_name="split_large_concat_with_dynamic_slice_float",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_split_large_split.py b/tests/unittest/compiler/test_split_large_split.py
new file mode 100644
index 000000000..8788e3803
--- /dev/null
+++ b/tests/unittest/compiler/test_split_large_split.py
@@ -0,0 +1,156 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import unittest
+
+import numpy as np
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class SplitLargeSplitTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SplitLargeSplitTestCase, self).__init__(*args, **kwargs)
+
+    def _run_split(
+        self,
+        *,
+        input_shape,
+        split_size_or_sections,
+        dim=None,
+        input_type="float16",
+        testname=None,
+    ):
+        logging.info(
+            f"Test input shape {input_shape}, "
+            f"split_size_or_sections={split_size_or_sections}, dim={dim}"
+        )
+
+        split_op = ops.split()
+        # generate torch reference result
+        X_pt = get_random_torch_tensor(input_shape, input_type)
+        Ys_pt = (
+            torch.split(X_pt, split_size_or_sections)
+            if dim is None
+            else torch.split(X_pt, split_size_or_sections, dim)
+        )
+        target = detect_target()
+        X = Tensor(shape=input_shape, dtype=input_type, name="input_0", is_input=True)
+        Ys = (
+            split_op(X, split_size_or_sections)
+            if dim is None
+            else split_op(X, split_size_or_sections, dim)
+        )
+        np.testing.assert_equal(len(Ys_pt), len(Ys))
+
+        y_shapes = []
+        for idx, Y in enumerate(Ys):
+            Y._attrs["name"] = f"output_{idx}"
+            Y._attrs["is_output"] = True
+            y_shape = [d._attrs["values"][0] for d in Y._attrs["shape"]]
+            logging.info(f"AITemplate output_{idx} shape: {y_shape}")
+            y_shapes.append(y_shape)
+
+        module = compile_model(Ys, target, "./tmp", testname)
+
+        outputs = {
+            f"output_{idx}": get_torch_empty_tensor(y_shape, input_type)
+            for idx, y_shape in enumerate(y_shapes)
+        }
+        module.run_with_tensors([X_pt], outputs)
+
+        for idx, y_pt in enumerate(Ys_pt):
+            y = outputs[f"output_{idx}"]
+            self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_split(self):
+        self._run_split(
+            input_shape=[4096, 128, 64],
+            split_size_or_sections=32,
+            dim=0,
+            testname="split_0",
+        )
+        self._run_split(
+            input_shape=[128, 2048, 64],
+            split_size_or_sections=3,
+            dim=1,
+            testname="split_1",
+        )
+        self._run_split(
+            input_shape=[64, 128, 1024],
+            split_size_or_sections=2,
+            dim=2,
+            testname="split_2",
+        )
+        self._run_split(
+            input_shape=[64, 128, 1024],
+            split_size_or_sections=7,
+            dim=2,
+            testname="split_3",
+        )
+
+    def test_split_with_strided_op(self):
+        input_shape = [64, 128, 1024]
+        split_size_or_sections = 3
+        split_dim = 2
+        strided_op_idx = [100, 200, 300]
+
+        split_op = ops.split()
+        # generate torch reference result
+        X_pt = get_random_torch_tensor(input_shape)
+        Ys_pt = list(torch.split(X_pt, split_size_or_sections, split_dim))
+        for idx in strided_op_idx:
+            Ys_pt[idx] = torch.relu(Ys_pt[idx])
+        target = detect_target()
+        X = Tensor(shape=input_shape, name="input_0", is_input=True)
+        Ys = list(split_op(X, split_size_or_sections, split_dim))
+        np.testing.assert_equal(len(Ys_pt), len(Ys))
+
+        y_shapes = []
+        for idx, Y in enumerate(Ys):
+            if idx in strided_op_idx:
+                Y = ops.elementwise(FuncEnum.RELU)(Y)
+                Ys[idx] = Y
+            Y._attrs["name"] = f"output_{idx}"
+            Y._attrs["is_output"] = True
+
+            y_shape = [d._attrs["values"][0] for d in Y._attrs["shape"]]
+            logging.info(f"AITemplate output_{idx} shape: {y_shape}")
+            y_shapes.append(y_shape)
+
+        module = compile_model(Ys, target, "./tmp", "split_with_strided_ops")
+
+        outputs = {
+            f"output_{idx}": get_torch_empty_tensor(y_shape)
+            for idx, y_shape in enumerate(y_shapes)
+        }
+        module.run_with_tensors([X_pt], outputs)
+
+        for idx, y_pt in enumerate(Ys_pt):
+            y = outputs[f"output_{idx}"]
+            self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/compiler/test_split_view_strided.py b/tests/unittest/compiler/test_split_view_strided.py
index a0a96cdc4..62bcd6e7f 100644
--- a/tests/unittest/compiler/test_split_view_strided.py
+++ b/tests/unittest/compiler/test_split_view_strided.py
@@ -20,6 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -38,14 +42,17 @@ def _test_split_view_bmm_rcr(
         expected_num_tensors,
         expected_num_ops,
         testname,
+        dtype="float16",
     ):
         T_A = Tensor(
             shape=input_A_shape,
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         T_B = Tensor(
             shape=input_B_shape,
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
@@ -75,19 +82,13 @@ def _test_split_view_bmm_rcr(
                 "batch_size": B,
                 "emb_pool_size": M,
             }
-            a = (
-                torch.randn(
-                    *test_utils.get_shape(T_A._attrs["shape"], dim_to_value_dict)
-                )
-                .cuda()
-                .half()
+            a = get_random_torch_tensor(
+                test_utils.get_shape(T_A._attrs["shape"], dim_to_value_dict),
+                dtype,
             )
-            b = (
-                torch.randn(
-                    *test_utils.get_shape(T_B._attrs["shape"], dim_to_value_dict)
-                )
-                .cuda()
-                .half()
+            b = get_random_torch_tensor(
+                test_utils.get_shape(T_B._attrs["shape"], dim_to_value_dict),
+                dtype,
             )
             xs = a.split(split_size_or_sections, split_dim)
             ys = b.split(split_size_or_sections, split_dim)
@@ -98,7 +99,7 @@ def _test_split_view_bmm_rcr(
                 c = torch.bmm(x, y.permute(0, 2, 1))
                 cs.append(c)
 
-            ys = [torch.empty(y_pt.size()).cuda().half() for y_pt in cs]
+            ys = [get_torch_empty_tensor(y_pt.size(), dtype) for y_pt in cs]
             module.run_with_tensors({"input0": a, "input1": b}, ys)
 
             for y, y_pt in zip(ys, cs):
@@ -175,6 +176,48 @@ def test_split_view_bmm_rcr_fusion(self):
             testname="test_split_bmm_rcr_dynamic_bm_non_fusible",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_split_view_bmm_rcr_fusion_float(self):
+        b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
+        m_dim = shape_utils.gen_int_var([100, 200], "emb_pool_size")
+
+        # bmm_rcr dynamic M fusible
+        self._test_split_view_bmm_rcr(
+            ops.bmm_rcr,
+            Bs=[1],
+            Ms=[100, 105, 160],
+            input_A_shape=[1, m_dim, 10, 2],
+            input_B_shape=[1, 6, 10, 2],
+            split_size_or_sections=10,
+            split_dim=2,
+            reshape_A=[1, -1, 20],
+            reshape_B=[1, 6, 20],
+            expected_num_tensors=3,
+            expected_num_ops=1,
+            testname="test_split_bmm_rcr_dynamic_m_fusible_float",
+            dtype="float",
+        )
+        # bmm_rcr dynamic M, B unfusible
+        self._test_split_view_bmm_rcr(
+            ops.bmm_rcr,
+            Bs=[2, 4, 5, 10],
+            Ms=[100, 200],
+            input_A_shape=[b_dim, m_dim, 10, 8],
+            input_B_shape=[b_dim, m_dim, 10, 8],
+            split_size_or_sections=2,
+            split_dim=2,
+            reshape_A=[-1, 10, 16],
+            reshape_B=[-1, 10, 16],
+            expected_num_tensors=27,
+            expected_num_ops=17,
+            testname="test_split_bmm_rcr_dynamic_bm_non_fusible_float",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_group_gemm.py b/tests/unittest/compiler/test_strided_group_gemm.py
index 6824984d3..61267d475 100644
--- a/tests/unittest/compiler/test_strided_group_gemm.py
+++ b/tests/unittest/compiler/test_strided_group_gemm.py
@@ -23,14 +23,22 @@
 from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class StridedGroupGemmTestCase(unittest.TestCase):
-    def _test_strided_group_gemm(self, M, N1, K1, N2, K2, N3, test_name):
+    def _test_strided_group_gemm(
+        self, M, N1, K1, N2, K2, N3, test_name, dtype="float16"
+    ):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         M1 = M
@@ -40,15 +48,15 @@ def _test_strided_group_gemm(self, M, N1, K1, N2, K2, N3, test_name):
         dim = 1
 
         X1 = Tensor(
-            shape=[IntImm(M1), IntImm(K1)], dtype="float16", name="x1", is_input=True
+            shape=[IntImm(M1), IntImm(K1)], dtype=dtype, name="x1", is_input=True
         )
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
         X2 = Tensor(
-            shape=[IntImm(M2), IntImm(K2)], dtype="float16", name="x2", is_input=True
+            shape=[IntImm(M2), IntImm(K2)], dtype=dtype, name="x2", is_input=True
         )
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
 
-        X3 = Tensor(shape=[M3, N3], dtype="float16", name="x3", is_input=True)
+        X3 = Tensor(shape=[M3, N3], dtype=dtype, name="x3", is_input=True)
 
         group_gemm_op = ops.group_gemm_rcr()
         Y1, Y2 = group_gemm_op(operand_groups=[[X1, W1], [X2, W2]])
@@ -67,17 +75,17 @@ def _test_strided_group_gemm(self, M, N1, K1, N2, K2, N3, test_name):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X1_pt = torch.randn(M1, K1).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        X2_pt = torch.randn(M2, K2).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        X3_pt = torch.randn(M3, N3).cuda().half()
+        X1_pt = get_random_torch_tensor([M1, K1], dtype)
+        W1_pt = get_random_torch_tensor([N1, K1], dtype)
+        X2_pt = get_random_torch_tensor([M2, K2], dtype)
+        W2_pt = get_random_torch_tensor([N2, K2], dtype)
+        X3_pt = get_random_torch_tensor([M3, N3], dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt, X3_pt], dim=dim)
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
         inputs = {
@@ -87,7 +95,7 @@ def _test_strided_group_gemm(self, M, N1, K1, N2, K2, N3, test_name):
             "w2": W2_pt,
             "x3": X3_pt,
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -106,13 +114,13 @@ def test_strided_group_gemm(self):
         )
 
     def _test_strided_group_gemm_bias(
-        self, M, N1, K1, N2, K2, N3, test_name, input_first
+        self, M, N1, K1, N2, K2, N3, test_name, input_first, dtype="float16"
     ):
         # input_first determines if we place input tensor (X3) to be the first
         # concatenated tensor or not
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
         M1 = M
         M2 = M
@@ -121,17 +129,17 @@ def _test_strided_group_gemm_bias(
         dim = 1
 
         X1 = Tensor(
-            shape=[IntImm(M1), IntImm(K1)], dtype="float16", name="x1", is_input=True
+            shape=[IntImm(M1), IntImm(K1)], dtype=dtype, name="x1", is_input=True
         )
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[N1], dtype="float16", name="b1", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[N1], dtype=dtype, name="b1", is_input=True)
         X2 = Tensor(
-            shape=[IntImm(M2), IntImm(K2)], dtype="float16", name="x2", is_input=True
+            shape=[IntImm(M2), IntImm(K2)], dtype=dtype, name="x2", is_input=True
         )
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
-        B2 = Tensor(shape=[N2], dtype="float16", name="b2", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
+        B2 = Tensor(shape=[N2], dtype=dtype, name="b2", is_input=True)
 
-        X3 = Tensor(shape=[M3, N3], dtype="float16", name="x3", is_input=True)
+        X3 = Tensor(shape=[M3, N3], dtype=dtype, name="x3", is_input=True)
 
         group_gemm_op = ops.group_gemm_rcr_bias()
         Y1, Y2 = group_gemm_op(operand_groups=[[X1, W1, B1], [X2, W2, B2]])
@@ -158,13 +166,13 @@ def _test_strided_group_gemm_bias(
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X1_pt = torch.randn(M1, K1).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        B1_pt = torch.randn(N1).cuda().half()
-        X2_pt = torch.randn(M2, K2).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        B2_pt = torch.randn(N2).cuda().half()
-        X3_pt = torch.randn(M3, N3).cuda().half()
+        X1_pt = get_random_torch_tensor([M1, K1], dtype)
+        W1_pt = get_random_torch_tensor([N1, K1], dtype)
+        B1_pt = get_random_torch_tensor([N1], dtype)
+        X2_pt = get_random_torch_tensor([M2, K2], dtype)
+        W2_pt = get_random_torch_tensor([N2, K2], dtype)
+        B2_pt = get_random_torch_tensor([N2], dtype)
+        X3_pt = get_random_torch_tensor([M3, N3], dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         if input_first:
@@ -173,7 +181,7 @@ def _test_strided_group_gemm_bias(
             Y_pt = torch.cat([Y1_pt, Y2_pt, X3_pt], dim=dim)
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
         inputs = {
@@ -185,7 +193,7 @@ def _test_strided_group_gemm_bias(
             "b2": B2_pt,
             "x3": X3_pt,
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -211,8 +219,32 @@ def test_strided_group_gemm_bias(self):
             input_first=False,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_strided_group_gemm_float(self):
+        self._test_strided_group_gemm(
+            M=8,
+            N1=32,
+            K1=32,
+            N2=4,
+            K2=4,
+            N3=3,
+            test_name="strided_group_gemm_rcr_cat_float2",
+            dtype="float",
+        )
+        self._test_strided_group_gemm_bias(
+            M=128,
+            N1=32,
+            K1=32,
+            N2=64,
+            K2=16,
+            N3=8,
+            test_name="strided_group_gemm_rcr_bias_cat_float1",
+            input_first=False,
+            dtype="float",
+        )
+
     # test if we update epilogue alignment values correctly
-    def test_strided_group_gemm_epilogue_alignment(self):
+    def _test_strided_group_gemm_epilogue_alignment(self, dtype="float16"):
         # Note that we have to force profiling in ci. Otherwise, we would not
         # be able to fetch cached config.
         target = detect_target()
@@ -228,8 +260,9 @@ def test_strided_group_gemm_epilogue_alignment(self):
             N2=62,
             K2=16,
             N3=2,
-            test_name="strided_group_gemm_rcr_epilogue_alignment1",
+            test_name=f"strided_group_gemm_rcr_epilogue_alignment_{dtype}_1",
             input_first=True,
+            dtype=dtype,
         )
         # a bigger epilogue alignment value 4
         self._test_strided_group_gemm_bias(
@@ -239,8 +272,9 @@ def test_strided_group_gemm_epilogue_alignment(self):
             N2=62,
             K2=16,
             N3=4,
-            test_name="strided_group_gemm_rcr_epilogue_alignment2",
+            test_name=f"strided_group_gemm_rcr_epilogue_alignment_{dtype}_2",
             input_first=True,
+            dtype=dtype,
         )
 
         # restore old env
@@ -250,6 +284,13 @@ def test_strided_group_gemm_epilogue_alignment(self):
             else:
                 os.environ["FORCE_PROFILE"] = old_force_ci
 
+    def test_strided_group_gemm_epilogue_alignment(self):
+        self._test_strided_group_gemm_epilogue_alignment()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_strided_group_gemm_epilogue_alignment_float(self):
+        self._test_strided_group_gemm_epilogue_alignment(dtype="float")
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_group_layernorm.py b/tests/unittest/compiler/test_strided_group_layernorm.py
index b3b7ecd5a..02f24a795 100644
--- a/tests/unittest/compiler/test_strided_group_layernorm.py
+++ b/tests/unittest/compiler/test_strided_group_layernorm.py
@@ -14,14 +14,13 @@
 #
 import itertools
 import unittest
-import uuid
 from typing import List
 
 import torch
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import shape_utils
+from aitemplate.utils import shape_utils, torch_utils
 
 
 def build_ait_module(
@@ -35,9 +34,10 @@ def build_ait_module(
     beta_is_none,
     fuse_sigmoid_mul,
     eps,
+    test_id,
     ait_dtype="float16",
     workdir="./tmp",
-    test_name="slice_group_layernorm",
+    test_name="strided_group_layernorm",
 ):
     target = detect_target()
     inputs = [
@@ -84,11 +84,13 @@ def build_ait_module(
     for i, output in enumerate(outputs):
         output._attrs["is_output"] = True
         output._attrs["name"] = f"output_{i}"
+    dll_name = f"test_{test_id}.so"
     return compile_model(
         outputs,
         target,
         workdir,
         test_name,
+        dll_name=dll_name,
     )
 
 
@@ -164,6 +166,10 @@ def eval_pt(
 
 
 class SliceGroupLayerNormTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SliceGroupLayerNormTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _test_slice_group_layer_norm(
         self,
         *,
@@ -176,6 +182,7 @@ def _test_slice_group_layer_norm(
         eps=1e-5,
         start_indices: List[int] = (0,),
         end_indices: List[int] = (None,),
+        dtype: str = "float16",
     ):
         input_rank = 1 + len(input_nonbatch_shapes[0])
         if 1 == len(start_indices) and len(start_indices) != input_rank:
@@ -196,13 +203,15 @@ def _test_slice_group_layer_norm(
 
         ait_module = build_ait_module(
             batch_sizes=batch_sizes,
-            workdir=uuid.uuid4().hex,
             **_layernorm_common_params,
+            test_id=self._test_id,
+            ait_dtype=dtype,
         )
+        self._test_id += 1
+        pt_dtype = torch_utils.string_to_torch_dtype(dtype)
         for batch_size in batch_sizes:
             pt_tensors = eval_pt(
-                batch_size=batch_size,
-                **_layernorm_common_params,
+                batch_size=batch_size, **_layernorm_common_params, dtype=pt_dtype
             )
             ait_inputs = {
                 k: v
@@ -329,6 +338,37 @@ def test_middle_slice_group_layer_norm_fuse_sigmoid_mul_float16(self):
                 fuse_sigmoid_mul=True,
             )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_slice_group_layer_norm_float(self):
+        self._test_slice_group_layer_norm_kernels(
+            n_normalize_over_last_dims=3,
+            gamma_is_none=True,
+            beta_is_none=True,
+            fuse_sigmoid_mul=False,
+            dtype="float32",
+        )
+        self._test_middle_slice_group_layer_norm_kernels(
+            n_normalize_over_last_dims=2,
+            gamma_is_none=True,
+            beta_is_none=False,
+            fuse_sigmoid_mul=False,
+            dtype="float32",
+        )
+        self._test_slice_group_layer_norm_kernels(
+            n_normalize_over_last_dims=1,
+            gamma_is_none=False,
+            beta_is_none=True,
+            fuse_sigmoid_mul=True,
+            dtype="float32",
+        )
+        self._test_middle_slice_group_layer_norm_kernels(
+            n_normalize_over_last_dims=3,
+            gamma_is_none=False,
+            beta_is_none=False,
+            fuse_sigmoid_mul=True,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_layernorm.py b/tests/unittest/compiler/test_strided_layernorm.py
index 43b270669..c119c85d9 100644
--- a/tests/unittest/compiler/test_strided_layernorm.py
+++ b/tests/unittest/compiler/test_strided_layernorm.py
@@ -14,7 +14,6 @@
 #
 import itertools
 import unittest
-import uuid
 from typing import List
 
 import torch
@@ -22,7 +21,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import shape_utils
+from aitemplate.utils import shape_utils, torch_utils
 
 
 def build_ait_module(
@@ -36,9 +35,10 @@ def build_ait_module(
     beta_is_none,
     fuse_sigmoid_mul,
     eps,
+    test_id,
     ait_dtype="float16",
     workdir="./tmp",
-    test_name="slice_layernorm",
+    test_name="strided_layernorm",
 ):
     target = detect_target()
     X0 = Tensor(
@@ -83,11 +83,13 @@ def build_ait_module(
         output = ops.layernorm()(X1, X2, X3, layernorm_weight_shape, eps)
     output._attrs["is_output"] = True
     output._attrs["name"] = "output"
+    dll_name = f"test_{test_id}.so"
     return compile_model(
         output,
         target,
         workdir,
         test_name,
+        dll_name=dll_name,
     )
 
 
@@ -132,6 +134,10 @@ def eval_pt(
 
 
 class SliceLayerNormTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SliceLayerNormTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _test_slice_layer_norm(
         self,
         *,
@@ -144,6 +150,7 @@ def _test_slice_layer_norm(
         eps=1e-5,
         start_indices: List[int] = (0,),
         end_indices: List[int] = (None,),
+        dtype: str = "float16",
     ):
 
         input_rank = 1 + len(input_nonbatch_shape)
@@ -165,13 +172,15 @@ def _test_slice_layer_norm(
 
         ait_module = build_ait_module(
             batch_sizes=batch_sizes,
-            workdir=uuid.uuid4().hex,
             **_layernorm_common_params,
+            test_id=self._test_id,
+            ait_dtype=dtype,
         )
+        self._test_id += 1
+        pt_dtype = torch_utils.string_to_torch_dtype(dtype)
         for batch_size in batch_sizes:
             pt_tensors = eval_pt(
-                batch_size=batch_size,
-                **_layernorm_common_params,
+                batch_size=batch_size, **_layernorm_common_params, dtype=pt_dtype
             )
             ait_inputs = {
                 k: v for k, v in pt_tensors.items() if v is not None and k != "output"
@@ -291,6 +300,39 @@ def test_middle_slice_layer_norm_fuse_sigmoid_mul_float16(self):
                 fuse_sigmoid_mul=True,
             )
 
+    @unittest.skipIf(
+        detect_target().name() != "cuda", "fp32 is only supported in CUDA backend"
+    )
+    def test_slice_layer_norm_float32(self):
+        self._test_slice_layer_norm_kernels(
+            n_normalize_over_last_dims=1,
+            gamma_is_none=True,
+            beta_is_none=True,
+            fuse_sigmoid_mul=False,
+            dtype="float32",
+        )
+        self._test_middle_slice_layer_norm_kernels(
+            n_normalize_over_last_dims=2,
+            gamma_is_none=True,
+            beta_is_none=False,
+            fuse_sigmoid_mul=False,
+            dtype="float32",
+        )
+        self._test_slice_layer_norm_kernels(
+            n_normalize_over_last_dims=3,
+            gamma_is_none=False,
+            beta_is_none=True,
+            fuse_sigmoid_mul=True,
+            dtype="float32",
+        )
+        self._test_middle_slice_layer_norm_kernels(
+            n_normalize_over_last_dims=2,
+            gamma_is_none=False,
+            beta_is_none=False,
+            fuse_sigmoid_mul=True,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_layernorm_reshape.py b/tests/unittest/compiler/test_strided_layernorm_reshape.py
index 2cc725c4a..3854d5ef0 100644
--- a/tests/unittest/compiler/test_strided_layernorm_reshape.py
+++ b/tests/unittest/compiler/test_strided_layernorm_reshape.py
@@ -13,22 +13,22 @@
 #  limitations under the License.
 #
 import unittest
-import uuid
 
 import torch
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import shape_utils
+from aitemplate.utils import shape_utils, torch_utils
 
 
 def build_ait_module(
     *,
     batch_sizes,
     eps,
+    test_id,
     ait_dtype="float16",
     workdir="./tmp",
-    test_name="slice_layernorm_reshape",
+    test_name="strided_layernorm_reshape",
 ):
     input_nonbatch_shape = [6912]
     target = detect_target()
@@ -67,15 +67,11 @@ def build_ait_module(
 
     output._attrs["is_output"] = True
     output._attrs["name"] = "output"
+    dll_name = f"test_{test_id}.so"
     return (
         inputs,
         output,
-        compile_model(
-            output,
-            target,
-            workdir,
-            test_name,
-        ),
+        compile_model(output, target, workdir, test_name, dll_name=dll_name),
     )
 
 
@@ -106,9 +102,14 @@ def eval_pt(
 
 
 class SliceLayerNormReshapeTestCase(unittest.TestCase):
-    def test_slice_layer_norm_reshape(
+    def __init__(self, *args, **kwargs):
+        super(SliceLayerNormReshapeTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_slice_layer_norm_reshape(
         self,
         *,
+        dtype="float16",
         batch_sizes=(3, 4),
         eps=1e-5,
         atol=1e-3,
@@ -116,9 +117,11 @@ def test_slice_layer_norm_reshape(
     ):
         ait_in_node, ait_out_node, ait_module = build_ait_module(
             batch_sizes=batch_sizes,
-            workdir=uuid.uuid4().hex,
             eps=eps,
+            test_id=self._test_id,
+            ait_dtype=dtype,
         )
+        self._test_id += 1
 
         for op_name in (
             next(iter(ait_in_node._attrs["dst_ops"]))._attrs["name"],
@@ -126,11 +129,9 @@ def test_slice_layer_norm_reshape(
         ):
             self.assertRegex(op_name, "layernorm")
 
+        pt_dtype = torch_utils.string_to_torch_dtype(dtype)
         for batch_size in batch_sizes:
-            pt_tensors = eval_pt(
-                batch_size=batch_size,
-                eps=eps,
-            )
+            pt_tensors = eval_pt(batch_size=batch_size, eps=eps, dtype=pt_dtype)
             ait_inputs = {k: v for k, v in pt_tensors.items() if k != "output"}
             ait_outputs = {"output": torch.empty_like(pt_tensors["output"])}
             ait_module.run_with_tensors(ait_inputs, ait_outputs)
@@ -141,6 +142,15 @@ def test_slice_layer_norm_reshape(
                 )
             )
 
+    def test_slice_layer_norm_reshape_float16(self):
+        self._test_slice_layer_norm_reshape()
+
+    @unittest.skipIf(
+        detect_target().name() != "cuda", "fp32 is only supported in CUDA backend"
+    )
+    def test_slice_layer_norm_reshape_float32(self):
+        self._test_slice_layer_norm_reshape(dtype="float32")
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_op_cat_pattern.py b/tests/unittest/compiler/test_strided_op_cat_pattern.py
index 38ae403d4..fd7a21dfe 100644
--- a/tests/unittest/compiler/test_strided_op_cat_pattern.py
+++ b/tests/unittest/compiler/test_strided_op_cat_pattern.py
@@ -27,7 +27,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -44,6 +47,7 @@ def _fused_elementwise_e2e_helper(
         m2: int,
         m3: int,
         k: int,
+        dtype: str = "float16",
     ):
         # Construct one graph with 2 fused_elementwises + 1 cat.
         batch0_dim = shape_utils.gen_int_var_min_max(batch0_sizes, "batch_0")
@@ -51,25 +55,25 @@ def _fused_elementwise_e2e_helper(
 
         X1 = Tensor(
             shape=[batch0_dim, batch1_dim, IntImm(m1), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             value=3.0,
         )
         X3 = Tensor(
             shape=[batch0_dim, batch1_dim, IntImm(m2), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
         X9 = Tensor(
             shape=[batch0_dim, batch1_dim, IntImm(m3), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="input2",
             is_input=True,
         )
@@ -88,7 +92,7 @@ def _fused_elementwise_e2e_helper(
             [X8],
             target,
             "./tmp",
-            "fused_elementwise_cat_m1_{}_m2_{}_m3_{}_k_{}".format(m1, m2, m3, k),
+            f"fused_elementwise_cat_m1_{m1}_m2_{m2}_m3_{m3}_k_{k}_{dtype}",
         ) as module:
             # Verify the generated graph.
             sorted_graph = module.debug_sorted_graph
@@ -98,9 +102,9 @@ def _fused_elementwise_e2e_helper(
 
             # Run PyTorch baseline.
             for sizes in itertools.product(batch0_sizes, batch1_sizes):
-                x1_pt = torch.randn(sizes[0], sizes[1], m1, k).cuda().half()
-                x3_pt = torch.randn(sizes[0], sizes[1], m2, k).cuda().half()
-                x9_pt = torch.randn(sizes[0], sizes[1], m3, k).cuda().half()
+                x1_pt = get_random_torch_tensor([sizes[0], sizes[1], m1, k], dtype)
+                x3_pt = get_random_torch_tensor([sizes[0], sizes[1], m2, k], dtype)
+                x9_pt = get_random_torch_tensor([sizes[0], sizes[1], m3, k], dtype)
                 x5_pt = torch.tanh(x1_pt + 3.0)
                 x6_pt = torch.tanh(x3_pt)
                 x7_pt = torch.cat([x5_pt, x6_pt, x9_pt], dim=2)
@@ -108,8 +112,8 @@ def _fused_elementwise_e2e_helper(
 
                 # Run AITemplate module.
                 inputs = [x1_pt, x3_pt, x9_pt]
-                x8 = (
-                    torch.empty([sizes[0] * sizes[1], (m1 + m2 + m3) * k]).cuda().half()
+                x8 = get_torch_empty_tensor(
+                    [sizes[0] * sizes[1], (m1 + m2 + m3) * k], dtype
                 )
                 module.run_with_tensors(inputs, [x8])
 
@@ -172,20 +176,116 @@ def test_elementwise(self):
             batch0_sizes=[2, 59, 88], batch1_sizes=[20], m1=12, m2=16, m3=4, k=1
         )
 
-    def test_elementwise_cat_1(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_elementwise_float(self):
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[1024], batch1_sizes=[2], m1=8, m2=16, m3=8, k=1, dtype="float"
+        )
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2, 59, 88],
+            batch1_sizes=[20],
+            m1=6,
+            m2=8,
+            m3=2,
+            k=1,
+            dtype="float",
+        )
+        # float v.s. float
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[100, 30],
+            batch1_sizes=[2],
+            m1=1,
+            m2=1,
+            m3=8,
+            k=1,
+            dtype="float",
+        )
+        # float2 v.s. float
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[30],
+            batch1_sizes=[2, 88, 99],
+            m1=2,
+            m2=3,
+            m3=8,
+            k=1,
+            dtype="float",
+        )
+        # float v.s. float2
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[77, 89, 188],
+            batch1_sizes=[1, 2, 4],
+            m1=3,
+            m2=2,
+            m3=8,
+            k=1,
+            dtype="float",
+        )
+        # float4 v.s. float
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2],
+            batch1_sizes=[1, 3, 1024],
+            m1=4,
+            m2=5,
+            m3=8,
+            k=1,
+            dtype="float",
+        )
+        # float4 v.s. float2
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2],
+            batch1_sizes=[1, 3, 1024],
+            m1=4,
+            m2=6,
+            m3=8,
+            k=1,
+            dtype="float",
+        )
+        # Offset alignment tests.
+        # offset alignment = 1
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2, 59, 88],
+            batch1_sizes=[20],
+            m1=3,
+            m2=4,
+            m3=5,
+            k=1,
+            dtype="float",
+        )
+        # offset alignment = 2
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2, 59, 88],
+            batch1_sizes=[20],
+            m1=6,
+            m2=8,
+            m3=2,
+            k=1,
+            dtype="float",
+        )
+        # offset alignment = 4
+        self._fused_elementwise_e2e_helper(
+            batch0_sizes=[2, 59, 88],
+            batch1_sizes=[20],
+            m1=12,
+            m2=16,
+            m3=4,
+            k=1,
+            dtype="float",
+        )
+
+    def _test_elementwise_cat_1(self, dtype="float16"):
         BATCH_SIZE = 1024
         NUM_FLOAT_FEATURES = 1456
 
         X1 = Tensor(
             shape=[IntImm(BATCH_SIZE), IntImm(NUM_FLOAT_FEATURES)],
-            dtype="float16",
+            dtype=dtype,
             name="float_features",
             is_input=True,
         )
         X2 = ops.elementwise(FuncEnum.SIGN)(X1)  # Sign
         X3 = ops.elementwise(FuncEnum.ABS)(X1)  # Abs
         X4 = ops.elementwise(FuncEnum.LOGE)(
-            ops.elementwise(FuncEnum.ADD)(X3, Tensor(shape=[], value=1.0))
+            ops.elementwise(FuncEnum.ADD)(X3, Tensor(shape=[], dtype=dtype, value=1.0))
         )  # Log1p
         X5 = ops.elementwise(FuncEnum.MUL)(X2, X4)  # Mul
         X6 = ops.concatenate()([X5, X1], dim=1)  # Concat
@@ -198,9 +298,11 @@ def test_elementwise_cat_1(self):
             [X6],
             target,
             "./tmp",
-            "test_elementwise_cat_1",
+            f"test_elementwise_cat_1_{dtype}",
         ) as module:
-            float_features = torch.randn(BATCH_SIZE, NUM_FLOAT_FEATURES).cuda().half()
+            float_features = get_random_torch_tensor(
+                [BATCH_SIZE, NUM_FLOAT_FEATURES], dtype
+            )
             x1_pt = torch.sign(float_features)  # Sign
             x2_pt = torch.abs(float_features)  # Abs
             x3_pt = torch.log1p(x2_pt)  # Log1p
@@ -208,12 +310,65 @@ def test_elementwise_cat_1(self):
             x5_pt = torch.cat([x4_pt, float_features], dim=1)  # Concat
 
             # Run AITemplate module.
-            x6 = torch.empty(x5_pt.size()).cuda().half()
+            x6 = get_torch_empty_tensor(x5_pt.size(), dtype)
             module.run_with_tensors([float_features], [x6])
 
             # Do comparisons.
             self.assertTrue(torch.allclose(x6, x5_pt, atol=1e-2, rtol=1e-2))
 
+    def test_elementwise_cat_1(self):
+        self._test_elementwise_cat_1()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_elementwise_cat_1_float(self):
+        self._test_elementwise_cat_1(dtype="float")
+
+    def test_elementwise_cat_non_fusion(self):
+        BATCH_SIZE = 1024
+        NUM_FLOAT_FEATURES = 1456
+
+        X1 = Tensor(
+            shape=[IntImm(BATCH_SIZE), IntImm(NUM_FLOAT_FEATURES)],
+            name="float_features",
+            is_input=True,
+        )
+        X2 = ops.elementwise(FuncEnum.SIGN)(X1)  # Sign
+        X3 = ops.elementwise(FuncEnum.ABS)(X1)  # Abs
+        X4 = ops.elementwise(FuncEnum.LOGE)(
+            ops.elementwise(FuncEnum.ADD)(X3, Tensor(shape=[], value=1.0))
+        )  # Log1p
+        X5 = ops.elementwise(FuncEnum.MUL)(X2, X4)  # Mul
+        X5._attrs["name"] = "intermediate_out"
+        X5._attrs["is_output"] = True
+        X6 = ops.concatenate()([X5, X1], dim=1)  # Concat
+        X6._attrs["name"] = "output0"
+        X6._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        with compile_model(
+            [X5, X6],
+            target,
+            "./tmp",
+            "test_elementwise_cat_1_non_fusion",
+        ) as module:
+            float_features = get_random_torch_tensor([BATCH_SIZE, NUM_FLOAT_FEATURES])
+            x1_pt = torch.sign(float_features)  # Sign
+            x2_pt = torch.abs(float_features)  # Abs
+            x3_pt = torch.log1p(x2_pt)  # Log1p
+            x4_pt = x1_pt * x3_pt  # Mul
+            x5_pt = torch.cat([x4_pt, float_features], dim=1)  # Concat
+
+            # Run AITemplate module.
+            x6 = get_torch_empty_tensor(x5_pt.size())
+            x5 = get_torch_empty_tensor(x4_pt.size())
+            module.run_with_tensors(
+                [float_features], {"output0": x6, "intermediate_out": x5}
+            )
+
+            # Do comparisons.
+            self.assertTrue(torch.allclose(x6, x5_pt, atol=1e-2, rtol=1e-2))
+
     def _fused_gemm_e2e_helper(
         self,
         m: int,
@@ -224,81 +379,82 @@ def _fused_gemm_e2e_helper(
         m2: int = -1,
         cat_dim: int = 1,
         no_fuse: bool = False,
+        dtype: str = "float16",
     ):
         # Construct one graph with 3 gemms + 1 cat.
         nd_gemm = m2 > 0
         if nd_gemm:
             X1 = Tensor(
                 shape=[IntImm(m), IntImm(m2), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X1",
                 is_input=True,
             )
             X2 = Tensor(
                 shape=[IntImm(m), IntImm(m2), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X2",
                 is_input=True,
             )
             X3 = Tensor(
                 shape=[IntImm(m), IntImm(m2), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X3",
                 is_input=True,
             )
             X4 = Tensor(
                 shape=[IntImm(m), IntImm(m2), IntImm(n2)],
-                dtype="float16",
+                dtype=dtype,
                 name="X4",
                 is_input=True,
             )
         else:
             X1 = Tensor(
                 shape=[IntImm(m), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X1",
                 is_input=True,
             )
             X2 = Tensor(
                 shape=[IntImm(m), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X2",
                 is_input=True,
             )
             X3 = Tensor(
                 shape=[IntImm(m), IntImm(k)],
-                dtype="float16",
+                dtype=dtype,
                 name="X3",
                 is_input=True,
             )
             X4 = Tensor(
                 shape=[IntImm(m), IntImm(n2)],
-                dtype="float16",
+                dtype=dtype,
                 name="X4",
                 is_input=True,
             )
 
         W1 = Tensor(
             shape=[IntImm(n1), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="W1",
             is_input=True,
         )
         W2 = Tensor(
             shape=[IntImm(n2), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="W2",
             is_input=True,
         )
         B2 = Tensor(
             shape=[IntImm(n2)],
-            dtype="float16",
+            dtype=dtype,
             name="B2",
             is_input=True,
         )
         W3 = Tensor(
             shape=[IntImm(k), IntImm(n3)],
-            dtype="float16",
+            dtype=dtype,
             name="W3",
             is_input=True,
         )
@@ -317,7 +473,7 @@ def _fused_gemm_e2e_helper(
             [X9],
             target,
             "./tmp",
-            "fused_gemm_m_{}_k_{}_n1_{}_n2_{}_n3_{}".format(m, k, n1, n2, n3),
+            f"fused_gemm_m_{m}_k_{k}_n1_{n1}_n2_{n2}_n3_{n3}_{dtype}",
         ) as module:
 
             if not no_fuse:
@@ -329,21 +485,21 @@ def _fused_gemm_e2e_helper(
 
             if nd_gemm:
                 # Run PyTorch baseline.
-                x1_pt = torch.randn(m, m2, k).cuda().half()
-                x2_pt = torch.randn(m, m2, k).cuda().half()
-                x3_pt = torch.randn(m, m2, k).cuda().half()
-                x4_pt = torch.randn(m, m2, n2).cuda().half()
+                x1_pt = get_random_torch_tensor([m, m2, k], dtype)
+                x2_pt = get_random_torch_tensor([m, m2, k], dtype)
+                x3_pt = get_random_torch_tensor([m, m2, k], dtype)
+                x4_pt = get_random_torch_tensor([m, m2, n2], dtype)
             else:
                 # Run PyTorch baseline.
-                x1_pt = torch.randn(m, k).cuda().half()
-                x2_pt = torch.randn(m, k).cuda().half()
-                x3_pt = torch.randn(m, k).cuda().half()
-                x4_pt = torch.randn(m, n2).cuda().half()
+                x1_pt = get_random_torch_tensor([m, k], dtype)
+                x2_pt = get_random_torch_tensor([m, k], dtype)
+                x3_pt = get_random_torch_tensor([m, k], dtype)
+                x4_pt = get_random_torch_tensor([m, n2], dtype)
 
-            w1_pt = torch.randn(n1, k).cuda().half()
-            w2_pt = torch.randn(n2, k).cuda().half()
-            b2_pt = torch.randn(n2).cuda().half()
-            w3_pt = torch.randn(k, n3).cuda().half()
+            w1_pt = get_random_torch_tensor([n1, k], dtype)
+            w2_pt = get_random_torch_tensor([n2, k], dtype)
+            b2_pt = get_random_torch_tensor([n2], dtype)
+            w3_pt = get_random_torch_tensor([k, n3], dtype)
 
             x5_pt = torch.nn.functional.linear(x1_pt, w1_pt)
             x6_pt = torch.nn.functional.linear(x2_pt, w2_pt, b2_pt)
@@ -368,7 +524,7 @@ def _fused_gemm_e2e_helper(
 
             inputs[name_to_idx["B2"]] = b2_pt
 
-            x9 = torch.empty(x9_pt.shape).cuda().half()
+            x9 = get_torch_empty_tensor(x9_pt.shape, dtype)
             module.run_with_tensors(inputs, [x9])
 
             # Do comparisons.
@@ -385,31 +541,53 @@ def test_gemm(self):
             m=1024, k=256, n1=32, n2=32, n3=32, m2=8, cat_dim=1, no_fuse=True
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_float(self):
+        self._fused_gemm_e2e_helper(m=1024, k=256, n1=5, n2=32, n3=4, dtype="float")
+        self._fused_gemm_e2e_helper(
+            m=1024, k=256, n1=8, n2=16, n3=32, m2=8, cat_dim=2, dtype="float"
+        )
+        self._fused_gemm_e2e_helper(
+            m=1024,
+            k=256,
+            n1=32,
+            n2=32,
+            n3=32,
+            m2=8,
+            cat_dim=1,
+            no_fuse=True,
+            dtype="float",
+        )
+
     def _fused_gemm_alignment_e2e_helper(
-        self, gemm_op, input_n: int, m: int, k: int, n: int
+        self, gemm_op, input_n: int, m: int, k: int, n: int, dtype: str = "float16"
     ):
         # Construct one graph with 1 input + 1 gemm_bias_add + 1 cat.
         Input1 = Tensor(
             shape=[IntImm(m), IntImm(input_n)],
-            dtype="float16",
+            dtype=dtype,
             name="Input1",
             is_input=True,
         )
         X1 = Tensor(
             shape=[IntImm(m), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
         W1 = Tensor(
             shape=[IntImm(n), IntImm(k)],
-            dtype="float16",
+            dtype=dtype,
             name="W1",
             is_input=True,
         )
         B1 = Tensor(
             shape=[IntImm(n)],
-            dtype="float16",
+            dtype=dtype,
             name="B1",
             is_input=True,
         )
@@ -419,7 +597,7 @@ def _fused_gemm_alignment_e2e_helper(
             num_inputs = 5
             X2 = Tensor(
                 shape=[IntImm(m), IntImm(n)],
-                dtype="float16",
+                dtype=dtype,
                 name="X2",
                 is_input=True,
             )
@@ -440,7 +618,7 @@ def _fused_gemm_alignment_e2e_helper(
             [Y],
             target,
             "./tmp",
-            f"fused_{gemm_op_kind}_alignment_input_n_{input_n}_m_{m}_n_{n}_k_{k}",
+            f"fused_{gemm_op_kind}_alignment_input_n_{input_n}_m_{m}_n_{n}_k_{k}_{dtype}",
         ) as module:
 
             # Verify the generated graph.
@@ -462,15 +640,15 @@ def _fused_gemm_alignment_e2e_helper(
             np.testing.assert_equal(concat_op._attrs["input_masks"], [True, False])
 
             # Run PyTorch baseline.
-            input_pt = torch.randn(m, input_n).cuda().half()
-            x1_pt = torch.randn(m, k).cuda().half()
-            w1_pt = torch.randn(n, k).cuda().half()
-            b1_pt = torch.randn(n).cuda().half()
+            input_pt = get_random_torch_tensor([m, input_n], dtype)
+            x1_pt = get_random_torch_tensor([m, k], dtype)
+            w1_pt = get_random_torch_tensor([n, k], dtype)
+            b1_pt = get_random_torch_tensor([n], dtype)
 
             y1_pt = torch.nn.functional.linear(x1_pt, w1_pt)
             y1_pt = torch.nn.functional.linear(x1_pt, w1_pt, b1_pt)
             if gemm_op_kind == "gemm_rcr_bias_add":
-                x2_pt = torch.randn(m, n).cuda().half()
+                x2_pt = get_random_torch_tensor([m, n], dtype)
                 y1_pt += x2_pt
 
             y_pt = torch.cat([input_pt, y1_pt], dim=1)
@@ -485,7 +663,7 @@ def _fused_gemm_alignment_e2e_helper(
             inputs[name_to_idx["W1"]] = w1_pt
             inputs[name_to_idx["B1"]] = b1_pt
 
-            y = torch.empty([m, input_n + n]).cuda().half()
+            y = get_torch_empty_tensor([m, input_n + n], dtype)
             module.run_with_tensors(inputs, [y])
 
             # Do comparisons.
@@ -505,6 +683,19 @@ def test_gemm_alignment(self):
             gemm_op=ops.gemm_rcr_bias_add(), input_n=7, m=4, k=4, n=8
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_alignment_float(self):
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias_add(), input_n=1, m=2, k=2, n=4, dtype="float"
+        )
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias_add(), input_n=4, m=4, k=4, n=2, dtype="float"
+        )
+
     # Tests to ensure that we correctly update epilogue alignment values
     def test_gemm_update_epilogue_alignment(self):
         # Note that we have to force profiling in ci. Otherwise, we would not
@@ -539,6 +730,45 @@ def test_gemm_update_epilogue_alignment(self):
             else:
                 os.environ["FORCE_PROFILE"] = old_force_ci
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    # Tests to ensure that we correctly update epilogue alignment values
+    def test_gemm_update_epilogue_alignment_float(self):
+        # Note that we have to force profiling in ci. Otherwise, we would not
+        # be able to fetch cached config.
+        target = detect_target()
+        old_force_ci = os.environ.get("FORCE_PROFILE", None)
+        if target.in_ci_env():
+            os.environ["FORCE_PROFILE"] = "1"
+
+        # a smaller epilogue alignment 1
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias(), input_n=1, m=2, k=2, n=4, dtype="float"
+        )
+        # a larger epilogue alignment 4
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias(), input_n=4, m=2, k=2, n=4, dtype="float"
+        )
+
+        # a smaller epilogue alignment 1
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias_add(), input_n=2, m=3, k=2, n=4, dtype="float"
+        )
+        # a larger epilogue alignment 4
+        self._fused_gemm_alignment_e2e_helper(
+            gemm_op=ops.gemm_rcr_bias_add(), input_n=4, m=3, k=2, n=4, dtype="float"
+        )
+
+        # restore old env
+        if target.in_ci_env():
+            if old_force_ci is None:
+                del os.environ["FORCE_PROFILE"]
+            else:
+                os.environ["FORCE_PROFILE"] = old_force_ci
+
     def _fused_layernorm_e2e_helper(
         self,
         m: int,
@@ -548,6 +778,7 @@ def _fused_layernorm_e2e_helper(
         batch_size: Optional[IntVar] = None,
         gamma_is_none: bool = False,
         beta_is_none: bool = False,
+        dtype: str = "float16",
     ):
         logging.info(
             f"_fused_layernorm_e2e: m={m}, n1={n1}, n2={n2}, cat_dim={cat_dim}, batch_size={batch_size}"
@@ -560,7 +791,7 @@ def _maybe_add_batch_size_ait(shape: List[IntVar]) -> List[IntVar]:
         # Construct one graph with 2 layernorms + 1 cat.
         X1 = Tensor(
             shape=_maybe_add_batch_size_ait([IntImm(m), IntImm(n1)]),
-            dtype="float16",
+            dtype=dtype,
             name="X1",
             is_input=True,
         )
@@ -569,7 +800,7 @@ def _maybe_add_batch_size_ait(shape: List[IntVar]) -> List[IntVar]:
         else:
             GAMMA1 = Tensor(
                 shape=[IntImm(n1)],
-                dtype="float16",
+                dtype=dtype,
                 name="gamma1",
                 is_input=True,
             )
@@ -578,13 +809,13 @@ def _maybe_add_batch_size_ait(shape: List[IntVar]) -> List[IntVar]:
         else:
             BETA1 = Tensor(
                 shape=[IntImm(n1)],
-                dtype="float16",
+                dtype=dtype,
                 name="beta1",
                 is_input=True,
             )
         X2 = Tensor(
             shape=_maybe_add_batch_size_ait([IntImm(m), IntImm(n2)]),
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             is_input=True,
         )
@@ -593,7 +824,7 @@ def _maybe_add_batch_size_ait(shape: List[IntVar]) -> List[IntVar]:
         else:
             GAMMA2 = Tensor(
                 shape=[IntImm(n2)],
-                dtype="float16",
+                dtype=dtype,
                 name="gamma2",
                 is_input=True,
             )
@@ -602,7 +833,7 @@ def _maybe_add_batch_size_ait(shape: List[IntVar]) -> List[IntVar]:
         else:
             BETA2 = Tensor(
                 shape=[IntImm(n2)],
-                dtype="float16",
+                dtype=dtype,
                 name="beta2",
                 is_input=True,
             )
@@ -623,8 +854,10 @@ def _maybe_add_batch_size_pt(shape: List[int]) -> List[int]:
             [X7],
             target,
             "./tmp",
-            "fused_layernorm",
+            f"fused_layernorm_{dtype}",
+            dll_name=f"test_{self._test_id}.so",
         ) as module:
+            self._test_id += 1
             # Verify the generated graph.
             sorted_graph = module.debug_sorted_graph
             num_tensors = 7
@@ -637,24 +870,24 @@ def _maybe_add_batch_size_pt(shape: List[int]) -> List[int]:
             self.assertEqual(len(sorted_ops), 2)
 
             # Run PyTorch baseline.
-            x1_pt = torch.randn(_maybe_add_batch_size_pt([m, n1])).cuda().half()
+            x1_pt = get_random_torch_tensor(_maybe_add_batch_size_pt([m, n1]), dtype)
             if gamma_is_none:
                 gamma1_pt = None
             else:
-                gamma1_pt = torch.randn(n1).cuda().half()
+                gamma1_pt = get_random_torch_tensor([n1], dtype)
             if beta_is_none:
                 beta1_pt = None
             else:
-                beta1_pt = torch.randn(n1).cuda().half()
-            x2_pt = torch.randn(_maybe_add_batch_size_pt([m, n2])).cuda().half()
+                beta1_pt = get_random_torch_tensor([n1], dtype)
+            x2_pt = get_random_torch_tensor(_maybe_add_batch_size_pt([m, n2]), dtype)
             if gamma_is_none:
                 gamma2_pt = None
             else:
-                gamma2_pt = torch.randn(n2).cuda().half()
+                gamma2_pt = get_random_torch_tensor([n2], dtype)
             if beta_is_none:
                 beta2_pt = None
             else:
-                beta2_pt = torch.randn(n2).cuda().half()
+                beta2_pt = get_random_torch_tensor([n2], dtype)
 
             x3_pt = torch.nn.functional.layer_norm(
                 x1_pt, x1_pt.size()[-1:], gamma1_pt, beta1_pt
@@ -676,7 +909,7 @@ def _maybe_add_batch_size_pt(shape: List[int]) -> List[int]:
                 inputs.append(gamma2_pt)
             if not beta_is_none:
                 inputs.append(beta2_pt)
-            x7 = torch.empty(x7_pt.size()).cuda().half()
+            x7 = get_torch_empty_tensor(x7_pt.size(), dtype)
             module.run_with_tensors(inputs, [x7])
 
             # Do comparisons.
@@ -722,6 +955,40 @@ def test_layernorm(self):
             batch_size=IntVar([1, 10], name="batch_size"),
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_layernorm_float(self):
+        self._fused_layernorm_e2e_helper(
+            m=1024, n1=256, n2=256, cat_dim=1, dtype="float"
+        )
+        self._fused_layernorm_e2e_helper(m=1, n1=256, n2=256, cat_dim=0, dtype="float")
+        self._fused_layernorm_e2e_helper(
+            m=1024,
+            n1=256,
+            n2=256,
+            cat_dim=1,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float",
+        )
+        self._fused_layernorm_e2e_helper(m=2, n1=128, n2=5, cat_dim=1, dtype="float")
+        self._fused_layernorm_e2e_helper(
+            m=2,
+            n1=3,
+            n2=128,
+            cat_dim=1,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float",
+        )
+        self._fused_layernorm_e2e_helper(
+            m=1024,
+            n1=256,
+            n2=256,
+            cat_dim=1,
+            batch_size=IntVar([1, 10], name="batch_size"),
+            dtype="float",
+        )
+
     def _test_group_layernorm_sigmoid_mul_cat_fusion(
         self,
         input_shapes,
@@ -731,6 +998,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
         fuse_sigmoid_mul=True,
         use_group_ops=True,
         num_cat_ops=1,
+        dtype="float16",
     ):
         assert num_cat_ops in (1, 2), "Only supports testing with num_cat_ops in (1, 2)"
         testname = (
@@ -754,7 +1022,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                         IntImm(shape[0]),
                         IntImm(shape[1]),
                     ],
-                    dtype="float16",
+                    dtype=dtype,
                     name="X_" + str(i),
                     is_input=True,
                 )
@@ -764,7 +1032,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 if gamma_is_none
                 else Tensor(
                     shape=[IntImm(shape[1])],
-                    dtype="float16",
+                    dtype=dtype,
                     name="gamma_" + str(i),
                     is_input=True,
                 )
@@ -775,7 +1043,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                 if beta_is_none
                 else Tensor(
                     shape=[IntImm(shape[1])],
-                    dtype="float16",
+                    dtype=dtype,
                     name="beta_" + str(i),
                     is_input=True,
                 )
@@ -850,12 +1118,16 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
             gammas_pt = []
             betas_pt = []
             for shape in input_shapes:
-                xs_pt.append(torch.randn(shape).cuda().half())
+                xs_pt.append(get_random_torch_tensor(shape, dtype))
                 gamma_pt = (
-                    None if gamma_is_none else torch.randn(shape[1]).cuda().half()
+                    None
+                    if gamma_is_none
+                    else get_random_torch_tensor([shape[1]], dtype)
                 )
                 gammas_pt.append(gamma_pt)
-                beta_pt = None if beta_is_none else torch.randn(shape[1]).cuda().half()
+                beta_pt = (
+                    None if beta_is_none else get_random_torch_tensor([shape[1]], dtype)
+                )
                 betas_pt.append(beta_pt)
 
             y0s_pt = []
@@ -888,7 +1160,7 @@ def _test_group_layernorm_sigmoid_mul_cat_fusion(
                     inputs[input_name_to_index[f"beta_{i}"]] = betas_pt[i]
             ys = []
             for y_pt in ys_pt:
-                ys.append(torch.empty(y_pt.size()).cuda().half())
+                ys.append(get_torch_empty_tensor(y_pt.size(), dtype))
             module.run_with_tensors(inputs, ys)
             for y_pt, y in zip(ys_pt, ys):
                 self.assertTrue(
@@ -961,10 +1233,64 @@ def test_group_layernorm_sigmoid_mul_cat_fusion(self):
                 use_group_ops=False,
             )
 
-    def _test_bmm_cat_fusion(self, B, M, Ns, Ks, cat_dim, testname):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_group_layernorm_sigmoid_mul_cat_fusion_float(self):
+        for fuse_sigmoid_mul in (True, False):
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 256]] * 4, 0, fuse_sigmoid_mul=fuse_sigmoid_mul, dtype="float"
+            )
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 64], [128, 256], [128, 125]],
+                1,
+                fuse_sigmoid_mul=fuse_sigmoid_mul,
+                dtype="float",
+            )
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 256]],
+                0,
+                gamma_is_none=True,
+                beta_is_none=True,
+                fuse_sigmoid_mul=fuse_sigmoid_mul,
+                dtype="float",
+            )
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 256]] * 6,
+                0,
+                fuse_sigmoid_mul=fuse_sigmoid_mul,
+                num_cat_ops=2,
+                dtype="float",
+            )
+            # test group layernorm fusion (horizontal fusion)
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 256]] * 6,
+                1,
+                fuse_sigmoid_mul=fuse_sigmoid_mul,
+                use_group_ops=False,
+                num_cat_ops=2,
+                dtype="float",
+            )
+            self._test_group_layernorm_sigmoid_mul_cat_fusion(
+                [[128, 256]] * 6,
+                1,
+                fuse_sigmoid_mul=fuse_sigmoid_mul,
+                use_group_ops=False,
+                dtype="float",
+            )
+
+    def _test_bmm_rcr_cat_fusion(
+        self,
+        B,
+        M,
+        Ns,
+        Ks,
+        cat_dim,
+        test_name,
+        expected_num_tensors,
+        expected_num_ops,
+        dtype="float16",
+    ):
         n = len(Ns)
         Cs = []
-        dtype = "float16"
 
         Xs_pt = []
         Ys_pt = []
@@ -990,8 +1316,8 @@ def _test_bmm_cat_fusion(self, B, M, Ns, Ks, cat_dim, testname):
                 C = ops.bmm_rcr_n1()(X, Y)
             Cs.append(C)
 
-            x = torch.randn(B, M, K).cuda().half()
-            y = torch.randn(B, N, K).cuda().half()
+            x = get_random_torch_tensor([B, M, K], dtype)
+            y = get_random_torch_tensor([B, N, K], dtype)
             c = torch.bmm(x, y.permute([0, 2, 1]))
             Xs_pt.append(x)
             Ys_pt.append(y)
@@ -1004,28 +1330,308 @@ def _test_bmm_cat_fusion(self, B, M, Ns, Ks, cat_dim, testname):
 
         # Gen module.
         target = detect_target()
-        with compile_model(Y, target, "./tmp", testname) as module:
+        with compile_model(Y, target, "./tmp", test_name) as module:
+            input_name_to_index = module.get_input_name_to_index_map()
+            inputs = [0 for i in range(2 * n)]
+            for i in range(n):
+                inputs[input_name_to_index[f"X{i}"]] = Xs_pt[i]
+                inputs[input_name_to_index[f"Y{i}"]] = Ys_pt[i]
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            module.run_with_tensors(inputs, [y])
+
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), expected_num_tensors)
+            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+            self.assertEqual(len(sorted_ops), expected_num_ops)
+
+            self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+
+    def test_bmm_rcr_cat_fusion(self):
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=8,
+            Ns=[2, 2, 2],
+            Ks=[4, 5, 32],
+            cat_dim=2,
+            test_name="test_bmm_rcr_cat_1",
+            expected_num_tensors=11,
+            expected_num_ops=5,
+        )
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=16,
+            Ns=[1, 1, 1],
+            Ks=[32, 16, 32],
+            cat_dim=1,
+            test_name="test_bmm_rcr_cat_2",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=16,
+            Ns=[1, 1, 1],
+            Ks=[32, 16, 32],
+            cat_dim=2,
+            test_name="test_bmm_rcr_cat_3",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=16,
+            Ns=[1, 1, 1],
+            Ks=[32, 16, 32],
+            cat_dim=-1,
+            test_name="test_bmm_rcr_cat_4",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+
+    def _test_bmm_crr_cat_fusion(
+        self,
+        B,
+        M,
+        Ns,
+        Ks,
+        cat_dim,
+        test_name,
+        expected_num_tensors,
+        expected_num_ops,
+        dtype="float16",
+    ):
+        n = len(Ns)
+        Cs = []
+
+        Xs_pt = []
+        Ys_pt = []
+        Cs_pt = []
+        for i in range(n):
+            N = Ns[i]
+            K = Ks[i]
+            X = Tensor(
+                shape=[B, K, M],
+                dtype=dtype,
+                name=f"X{i}",
+                is_input=True,
+            )
+            Y = Tensor(
+                shape=[B, K, N],
+                dtype=dtype,
+                name=f"Y{i}",
+                is_input=True,
+            )
+            C = ops.bmm_crr()(X, Y)
+            Cs.append(C)
+
+            x = get_random_torch_tensor([B, K, M], dtype)
+            y = get_random_torch_tensor([B, K, N], dtype)
+            c = torch.bmm(x.permute([0, 2, 1]), y)
+            Xs_pt.append(x)
+            Ys_pt.append(y)
+            Cs_pt.append(c)
+
+        Y = ops.concatenate()(Cs, dim=cat_dim)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+        y_pt = torch.cat(Cs_pt, dim=cat_dim)
+
+        # Gen module.
+        target = detect_target()
+        with compile_model(Y, target, "./tmp", test_name) as module:
             input_name_to_index = module.get_input_name_to_index_map()
             inputs = [0 for i in range(2 * n)]
             for i in range(n):
                 inputs[input_name_to_index[f"X{i}"]] = Xs_pt[i]
                 inputs[input_name_to_index[f"Y{i}"]] = Ys_pt[i]
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            module.run_with_tensors(inputs, [y])
+
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), expected_num_tensors)
+            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+            self.assertEqual(len(sorted_ops), expected_num_ops)
+
+            self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+
+    def test_bmm_crr_cat_fusion(self):
+        # [B, K, M] x [B, K, N] = [B, M, N]
+        self._test_bmm_crr_cat_fusion(
+            B=1,
+            M=8,
+            Ns=[2, 4, 10],
+            Ks=[4, 5, 32],
+            cat_dim=2,
+            test_name="test_bmm_crr_cat_1",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+        self._test_bmm_crr_cat_fusion(
+            B=8,
+            M=16,
+            Ns=[4, 4, 4],
+            Ks=[3, 16, 9],
+            cat_dim=1,
+            test_name="test_bmm_crr_cat_2",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+
+    def _test_bmm_crr_add_cat_fusion(
+        self,
+        B,
+        M,
+        Ns,
+        Ks,
+        cat_dim,
+        test_name,
+        expected_num_tensors,
+        expected_num_ops,
+        dtype="float16",
+    ):
+        n = len(Ns)
+        Cs = []
+
+        Xs_pt = []
+        Ys_pt = []
+        Ds_pt = []
+        Cs_pt = []
+        for i in range(n):
+            N = Ns[i]
+            K = Ks[i]
+            X = Tensor(
+                shape=[B, K, M],
+                dtype=dtype,
+                name=f"X{i}",
+                is_input=True,
+            )
+            Y = Tensor(
+                shape=[B, K, N],
+                dtype=dtype,
+                name=f"Y{i}",
+                is_input=True,
+            )
+            D = Tensor(
+                shape=[B, M, N],
+                dtype=dtype,
+                name=f"D{i}",
+                is_input=True,
+            )
+            C = ops.bmm_crr_add()(X, Y, D)
+            Cs.append(C)
+
+            x = get_random_torch_tensor([B, K, M], dtype)
+            y = get_random_torch_tensor([B, K, N], dtype)
+            d = get_random_torch_tensor([B, M, N], dtype)
+            c = torch.bmm(x.permute([0, 2, 1]), y)
+            c = c + d
+            Xs_pt.append(x)
+            Ys_pt.append(y)
+            Ds_pt.append(d)
+            Cs_pt.append(c)
+
+        Y = ops.concatenate()(Cs, dim=cat_dim)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+        y_pt = torch.cat(Cs_pt, dim=cat_dim)
+
+        # Gen module.
+        target = detect_target()
+        with compile_model(Y, target, "./tmp", test_name) as module:
+            input_name_to_index = module.get_input_name_to_index_map()
+            inputs = [0 for i in range(3 * n)]
+            for i in range(n):
+                inputs[input_name_to_index[f"X{i}"]] = Xs_pt[i]
+                inputs[input_name_to_index[f"Y{i}"]] = Ys_pt[i]
+                inputs[input_name_to_index[f"D{i}"]] = Ds_pt[i]
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
+
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), expected_num_tensors)
+            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+            self.assertEqual(len(sorted_ops), expected_num_ops)
+
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
-    def test_bmm_cat_fusion(self):
-        self._test_bmm_cat_fusion(1, 8, [2, 2, 2], [4, 5, 32], 2, "test_bmm_cat_1")
-        self._test_bmm_cat_fusion(1, 16, [1, 1, 1], [32, 16, 32], 1, "test_bmm_cat_2")
-        self._test_bmm_cat_fusion(1, 16, [1, 1, 1], [32, 16, 32], 2, "test_bmm_cat_3")
-        self._test_bmm_cat_fusion(1, 16, [1, 1, 1], [32, 16, 32], -1, "test_bmm_cat_4")
+    def test_bmm_crr_add_cat_fusion(self):
+        self._test_bmm_crr_add_cat_fusion(
+            B=7,
+            M=10,
+            Ns=[2, 12, 8],
+            Ks=[4, 5, 6],
+            cat_dim=2,
+            test_name="test_bmm_crr_add_cat_1",
+            expected_num_tensors=10,
+            expected_num_ops=3,
+        )
+        self._test_bmm_crr_add_cat_fusion(
+            B=8,
+            M=4,
+            Ns=[10, 10, 10],
+            Ks=[4, 5, 6],
+            cat_dim=1,
+            test_name="test_bmm_crr_add_cat_2",
+            expected_num_tensors=10,
+            expected_num_ops=3,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_cat_fusion_float(self):
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=8,
+            Ns=[2, 2, 2],
+            Ks=[4, 5, 32],
+            cat_dim=2,
+            test_name="test_bmm_rcr_cat_float_1",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+            dtype="float",
+        )
+        self._test_bmm_rcr_cat_fusion(
+            B=1,
+            M=16,
+            Ns=[1, 1, 1],
+            Ks=[32, 16, 32],
+            cat_dim=2,
+            test_name="test_bmm_rcr_cat_float_3",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+            dtype="float",
+        )
+        self._test_bmm_crr_cat_fusion(
+            B=8,
+            M=16,
+            Ns=[4, 4, 4],
+            Ks=[3, 16, 9],
+            cat_dim=1,
+            test_name="test_bmm_crr_cat_float_2",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+            dtype="float",
+        )
+        self._test_bmm_crr_add_cat_fusion(
+            B=7,
+            M=10,
+            Ns=[2, 12, 8],
+            Ks=[4, 5, 6],
+            cat_dim=2,
+            test_name="test_bmm_crr_add_cat_float_1",
+            expected_num_tensors=10,
+            expected_num_ops=3,
+            dtype="float",
+        )
 
     def _test_bmm_rcr_update_epilogue_alignment(
-        self, bmm_op, input_N, B, M, N, K, testname
+        self, bmm_op, input_N, B, M, N, K, testname, dtype="float16"
     ):
         # create a graph with 1 input + 1 bmm + 1 concat
         cat_dim = -1
-        dtype = "float16"
 
         bmm_op_kind = bmm_op._attrs["op"]
         Input1 = Tensor(
@@ -1058,7 +1664,7 @@ def _test_bmm_rcr_update_epilogue_alignment(
             num_inputs += 1
             X2 = Tensor(
                 shape=[IntImm(B), IntImm(M), IntImm(N)],
-                dtype="float16",
+                dtype=dtype,
                 name="X2",
                 is_input=True,
             )
@@ -1066,11 +1672,11 @@ def _test_bmm_rcr_update_epilogue_alignment(
         else:
             C = bmm_op(X, W)
 
-        input1_pt = torch.randn(B, M, input_N).cuda().half()
-        x_pt = torch.randn(B, M, K).cuda().half()
-        w_pt = torch.randn(*w_shape).cuda().half()
+        input1_pt = get_random_torch_tensor([B, M, input_N], dtype)
+        x_pt = get_random_torch_tensor([B, M, K], dtype)
+        w_pt = get_random_torch_tensor(w_shape, dtype)
         if num_inputs == 4:
-            x2_pt = torch.randn(B, M, N).cuda().half()
+            x2_pt = get_random_torch_tensor([B, M, N], dtype)
 
         if "rcr" in bmm_op_kind:
             c_pt = torch.bmm(x_pt, w_pt.permute([0, 2, 1]))
@@ -1096,12 +1702,12 @@ def _test_bmm_rcr_update_epilogue_alignment(
         inputs[input_name_to_index["W"]] = w_pt
         if num_inputs == 4:
             inputs[input_name_to_index["X2"]] = x2_pt
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
     # Test to ensure we update epilogue alignment values
-    def test_bmm_rcr_update_epilogue_alignment(self):
+    def _test_bmm_rcr_update_epilogue_alignment_common(self, dtype="float16"):
         # Note that we have to force profiling in ci. Otherwise, we would not
         # be able to fetch cached config.
         target = detect_target()
@@ -1118,6 +1724,7 @@ def test_bmm_rcr_update_epilogue_alignment(self):
             N=5,
             K=8,
             testname="test_bmm_rcr_epilogue_3",
+            dtype=dtype,
         )
         # a larger epilogue value 4
         self._test_bmm_rcr_update_epilogue_alignment(
@@ -1128,6 +1735,7 @@ def test_bmm_rcr_update_epilogue_alignment(self):
             N=5,
             K=8,
             testname="test_bmm_rcr_epilogue_4",
+            dtype=dtype,
         )
 
         # a smaller epilogue value 2
@@ -1139,6 +1747,7 @@ def test_bmm_rcr_update_epilogue_alignment(self):
             N=4,
             K=8,
             testname="test_bmm_rcr_epilogue_1",
+            dtype=dtype,
         )
         # a larger epilogue value 4
         self._test_bmm_rcr_update_epilogue_alignment(
@@ -1149,6 +1758,7 @@ def test_bmm_rcr_update_epilogue_alignment(self):
             N=4,
             K=8,
             testname="test_bmm_rcr_epilogue_2",
+            dtype=dtype,
         )
 
         # restore old env
@@ -1158,6 +1768,17 @@ def test_bmm_rcr_update_epilogue_alignment(self):
             else:
                 os.environ["FORCE_PROFILE"] = old_force_ci
 
+    def test_bmm_rcr_update_epilogue_alignment(self):
+        self._test_bmm_rcr_update_epilogue_alignment_common()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_rcr_update_epilogue_alignment_float(self):
+        self._test_bmm_rcr_update_epilogue_alignment_common(dtype="float")
+
     def _test_reduce_cat_fusion_1(
         self,
         input_shape,
@@ -1166,7 +1787,7 @@ def _test_reduce_cat_fusion_1(
         cat_dim,
         new_cat_dim_val,
         test_name,
-        input_type="float16",
+        dtype="float16",
     ):
         torch.manual_seed(0)
         logging.info(
@@ -1175,7 +1796,7 @@ def _test_reduce_cat_fusion_1(
         )
         target = detect_target()
 
-        X1 = Tensor(shape=input_shape, dtype=input_type, name="input_1", is_input=True)
+        X1 = Tensor(shape=input_shape, dtype=dtype, name="input_1", is_input=True)
 
         x2_shape = []
         for idx in range(len(input_shape)):
@@ -1186,7 +1807,7 @@ def _test_reduce_cat_fusion_1(
                 x2_shape.append(input_shape[idx])
         # set concat_dim to a new value for testing
         x2_shape[cat_dim] = new_cat_dim_val
-        X2 = Tensor(shape=x2_shape, dtype=input_type, name="input_2", is_input=True)
+        X2 = Tensor(shape=x2_shape, dtype=dtype, name="input_2", is_input=True)
 
         reduce_op = ops.reduce_mean(reduction_dim, keepdim=keepdim, dtype=None)
         Y1 = reduce_op(X1)
@@ -1210,8 +1831,8 @@ def _test_reduce_cat_fusion_1(
                 np.testing.assert_equal(Y_src_ops[0], reduce_op)
             np.testing.assert_equal(concat_op._attrs["input_masks"], [False, True])
 
-            X1_pt = get_random_torch_tensor(input_shape, input_type)
-            X2_pt = get_random_torch_tensor(x2_shape, input_type)
+            X1_pt = get_random_torch_tensor(input_shape, dtype)
+            X2_pt = get_random_torch_tensor(x2_shape, dtype)
             Y1_pt = torch.mean(X1_pt, dim=reduction_dim, keepdim=keepdim)
             Y_pt = torch.cat([Y1_pt, X2_pt], dim=cat_dim)
 
@@ -1265,7 +1886,7 @@ def _test_reduce_cat_fusion_2(
         cat_dim,
         new_cat_dim_val,
         test_name,
-        input_type="float16",
+        dtype="float16",
     ):
         torch.manual_seed(0)
         logging.info(
@@ -1274,7 +1895,7 @@ def _test_reduce_cat_fusion_2(
         )
         target = detect_target()
 
-        X1 = Tensor(shape=input_shape, dtype=input_type, name="input_1", is_input=True)
+        X1 = Tensor(shape=input_shape, dtype=dtype, name="input_1", is_input=True)
 
         x2_shape = []
         for idx in range(len(input_shape)):
@@ -1285,7 +1906,7 @@ def _test_reduce_cat_fusion_2(
                 x2_shape.append(input_shape[idx])
         # set concat_dim to a new value for testing
         x2_shape[cat_dim] = new_cat_dim_val
-        X2 = Tensor(shape=x2_shape, dtype=input_type, name="input_2", is_input=True)
+        X2 = Tensor(shape=x2_shape, dtype=dtype, name="input_2", is_input=True)
 
         reduce_mean_op = ops.reduce_mean(reduction_dim, keepdim=keepdim, dtype=None)
         Y1 = reduce_mean_op(X1)
@@ -1296,7 +1917,7 @@ def _test_reduce_cat_fusion_2(
         Y3 = ops.concatenate()([X2, Y1, Y2], dim=cat_dim)
 
         x3_shape = [d._attrs["values"][0] for d in Y3._attrs["shape"]]
-        X3 = Tensor(shape=x3_shape, dtype=input_type, name="input_3", is_input=True)
+        X3 = Tensor(shape=x3_shape, dtype=dtype, name="input_3", is_input=True)
 
         add_op = ops.elementwise(FuncEnum.ADD)
         Y = add_op(Y3, X3)
@@ -1329,9 +1950,9 @@ def _test_reduce_cat_fusion_2(
                 concat_op._attrs["input_masks"], [True, False, False]
             )
 
-            X1_pt = get_random_torch_tensor(input_shape, input_type)
-            X2_pt = get_random_torch_tensor(x2_shape, input_type)
-            X3_pt = get_random_torch_tensor(x3_shape, input_type)
+            X1_pt = get_random_torch_tensor(input_shape, dtype)
+            X2_pt = get_random_torch_tensor(x2_shape, dtype)
+            X3_pt = get_random_torch_tensor(x3_shape, dtype)
             Y1_pt = torch.mean(X1_pt, dim=reduction_dim, keepdim=keepdim)
             Y2_pt = torch.var(X1_pt, dim=reduction_dim, unbiased=True, keepdim=keepdim)
             Y3_pt = torch.cat([X2_pt, Y1_pt, Y2_pt], dim=cat_dim)
@@ -1393,7 +2014,7 @@ def _test_reduce_cat_fusion_3(
         cat_dim,
         new_cat_dim_val,
         test_name,
-        input_type="float16",
+        dtype="float16",
     ):
         torch.manual_seed(0)
         logging.info(
@@ -1402,7 +2023,7 @@ def _test_reduce_cat_fusion_3(
         )
         target = detect_target()
 
-        X1 = Tensor(shape=input_shape, dtype=input_type, name="input_1", is_input=True)
+        X1 = Tensor(shape=input_shape, dtype=dtype, name="input_1", is_input=True)
 
         x2_shape = []
         for idx in range(len(input_shape)):
@@ -1413,7 +2034,7 @@ def _test_reduce_cat_fusion_3(
                 x2_shape.append(input_shape[idx])
         # set concat_dim to a new value for testing
         x2_shape[cat_dim] = new_cat_dim_val
-        X2 = Tensor(shape=x2_shape, dtype=input_type, name="input_2", is_input=True)
+        X2 = Tensor(shape=x2_shape, dtype=dtype, name="input_2", is_input=True)
 
         reduce_op = ops.reduce_mean(reduction_dim, keepdim=keepdim, dtype=None)
         Y1 = reduce_op(X1)
@@ -1439,8 +2060,8 @@ def _test_reduce_cat_fusion_3(
                 concat_op._attrs["input_masks"], [True, False, True]
             )
 
-            X1_pt = get_random_torch_tensor(input_shape, input_type)
-            X2_pt = get_random_torch_tensor(x2_shape, input_type)
+            X1_pt = get_random_torch_tensor(input_shape, dtype)
+            X2_pt = get_random_torch_tensor(x2_shape, dtype)
             Y1_pt = torch.mean(X1_pt, dim=reduction_dim, keepdim=keepdim)
             Y_pt = torch.cat([X2_pt, Y1_pt, X2_pt], dim=cat_dim)
 
@@ -1479,7 +2100,7 @@ def _test_reduce_cat_fusion_batch(
         cat_dim,
         new_cat_dim_val,
         test_name,
-        input_type="float16",
+        dtype="float16",
     ):
         torch.manual_seed(0)
         logging.info(
@@ -1494,7 +2115,7 @@ def _test_reduce_cat_fusion_batch(
 
         X1 = Tensor(
             shape=[batch_dim, *input_shape],
-            dtype=input_type,
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -1513,7 +2134,7 @@ def _test_reduce_cat_fusion_batch(
         x2_shape[cat_dim - 1] = new_cat_dim_val
         X2 = Tensor(
             shape=[batch_dim, *x2_shape],
-            dtype=input_type,
+            dtype=dtype,
             name="input_2",
             is_input=True,
         )
@@ -1542,8 +2163,8 @@ def _test_reduce_cat_fusion_batch(
             np.testing.assert_equal(concat_op._attrs["input_masks"], [False, True])
 
             for batch in batch_sizes:
-                X1_pt = get_random_torch_tensor([batch, *input_shape], input_type)
-                X2_pt = get_random_torch_tensor([batch, *x2_shape], input_type)
+                X1_pt = get_random_torch_tensor([batch, *input_shape], dtype)
+                X2_pt = get_random_torch_tensor([batch, *x2_shape], dtype)
                 Y1_pt = torch.linalg.vector_norm(
                     X1_pt, ord=ord_kind, dim=reduction_dim, keepdim=keepdim
                 )
@@ -1564,21 +2185,20 @@ def test_reduce_cat_fusion_batch(self):
             keepdim=True,
             cat_dim=2,
             new_cat_dim_val=5,
-            test_name="test_reduce_cat_1_0",
+            test_name="test_reduce_cat_fusion_batch",
         )
 
-    def test_col_reduce_cat_fusion(self):
+    def _test_col_reduce_cat_fusion(self, dtype="float16"):
         torch.manual_seed(0)
         input_a_shape = [1, 4096]
         input_b_shape = [1, 250, 256]
-        input_type = "float16"
         reduction_dim = 1
         cat_dim = -1
-        test_name = "test_col_reduce_sum_cat"
+        test_name = f"test_col_reduce_sum_cat_{dtype}"
 
         target = detect_target()
-        A = Tensor(shape=input_a_shape, dtype=input_type, name="input_a", is_input=True)
-        B = Tensor(shape=input_b_shape, dtype=input_type, name="input_b", is_input=True)
+        A = Tensor(shape=input_a_shape, dtype=dtype, name="input_a", is_input=True)
+        B = Tensor(shape=input_b_shape, dtype=dtype, name="input_b", is_input=True)
 
         X = ops.reduce_sum(dim=reduction_dim)(B)
         Y = ops.concatenate()([A, X], dim=cat_dim)
@@ -1592,31 +2212,33 @@ def test_col_reduce_cat_fusion(self):
         concat_op = sorted_ops[1]
         np.testing.assert_equal(concat_op._attrs["input_masks"], [True, True])
 
-        a_pt = get_random_torch_tensor(input_a_shape, input_type)
-        b_pt = get_random_torch_tensor(input_b_shape, input_type)
+        a_pt = get_random_torch_tensor(input_a_shape, dtype)
+        b_pt = get_random_torch_tensor(input_b_shape, dtype)
         x_pt = torch.sum(b_pt, dim=reduction_dim)
         y_pt = torch.cat([a_pt, x_pt], dim=cat_dim)
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         inputs = {"input_a": a_pt, "input_b": b_pt}
         module.run_with_tensors(inputs, [y])
         y_pt = y_pt.cpu().numpy()
 
         torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
 
-    def test_strided_op_multiple_cats(self):
+    def test_col_reduce_cat_fusion(self):
+        self._test_col_reduce_cat_fusion()
+
+    def _test_strided_op_multiple_cats(self, dtype="float16"):
         # y1 = concat(x0, x1) # [4, 30]
         # y2 = slice(y1) # [4, 6]
         # y = concat(y1, y2) # [4, 36]
         x0_shape = [4, 10]
         x1_shape = [4, 20]
-        input_type = "float16"
         cat_dim = 1
-        test_name = "test_strided_op_multiple_cats"
+        test_name = f"test_strided_op_multiple_cats_{dtype}"
 
         target = detect_target()
-        X0 = Tensor(shape=x0_shape, dtype=input_type, name="x0", is_input=True)
-        X1 = Tensor(shape=x1_shape, dtype=input_type, name="x1", is_input=True)
+        X0 = Tensor(shape=x0_shape, dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=x1_shape, dtype=dtype, name="x1", is_input=True)
 
         Y1 = ops.concatenate()([X0, X1], dim=cat_dim)
         slice_start_indices = [0, 0]
@@ -1631,8 +2253,8 @@ def test_strided_op_multiple_cats(self):
 
         module = compile_model(Y, target, "./tmp", test_name)
 
-        x0_pt = get_random_torch_tensor(x0_shape, input_type)
-        x1_pt = get_random_torch_tensor(x1_shape, input_type)
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        x1_pt = get_random_torch_tensor(x1_shape, dtype)
         y1_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -1640,14 +2262,14 @@ def test_strided_op_multiple_cats(self):
         y2_pt = y1_pt[slice_indices]
         y_pt = torch.cat([y1_pt, y2_pt], dim=cat_dim)
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         inputs = {"x0": x0_pt, "x1": x1_pt}
         module.run_with_tensors(inputs, [y])
         y_pt = y_pt.cpu().numpy()
 
         torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
 
-    def test_strided_op_multiple_cats_2(self):
+    def _test_strided_op_multiple_cats_2(self, dtype="float16"):
         # y1 = x0 + x1
         # y2 = slice(y1)
         # y3 = concat(x2, y2)
@@ -1655,14 +2277,13 @@ def test_strided_op_multiple_cats_2(self):
         x0_shape = [4, 10]
         x1_shape = [4, 10]
         x2_shape = [4, 20]
-        input_type = "float16"
         cat_dim = 1
-        test_name = "test_strided_op_multiple_cats_2"
+        test_name = f"test_strided_op_multiple_cats_2_{dtype}"
 
         target = detect_target()
-        X0 = Tensor(shape=x0_shape, dtype=input_type, name="x0", is_input=True)
-        X1 = Tensor(shape=x1_shape, dtype=input_type, name="x1", is_input=True)
-        X2 = Tensor(shape=x2_shape, dtype=input_type, name="x2", is_input=True)
+        X0 = Tensor(shape=x0_shape, dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=x1_shape, dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=x2_shape, dtype=dtype, name="x2", is_input=True)
 
         Y1 = ops.elementwise(FuncEnum.ADD)(X0, X1)
         slice_start_indices = [0, 0]
@@ -1682,9 +2303,9 @@ def test_strided_op_multiple_cats_2(self):
         self.assertEqual(len(sorted_ops), 2)
         self.assertEqual(sorted_ops[1]._attrs["op"], "concatenate")
 
-        x0_pt = get_random_torch_tensor(x0_shape, input_type)
-        x1_pt = get_random_torch_tensor(x1_shape, input_type)
-        x2_pt = get_random_torch_tensor(x2_shape, input_type)
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        x1_pt = get_random_torch_tensor(x1_shape, dtype)
+        x2_pt = get_random_torch_tensor(x2_shape, dtype)
         y1_pt = x0_pt + x1_pt
         slice_indices = [
             slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
@@ -1693,13 +2314,60 @@ def test_strided_op_multiple_cats_2(self):
         y3_pt = torch.cat([x2_pt, y2_pt], dim=cat_dim)
         y_pt = torch.cat([y3_pt, y3_pt], dim=cat_dim)
 
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
         module.run_with_tensors(inputs, [y])
         y_pt = y_pt.cpu().numpy()
 
         torch.testing.assert_close(y_pt, y.cpu().numpy(), atol=0.05, rtol=0.05)
 
+    def test_strided_op_multiple_cats(self):
+        self._test_strided_op_multiple_cats()
+        self._test_strided_op_multiple_cats_2()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_reduce_cat_float(self):
+        self._test_reduce_cat_fusion_1(
+            input_shape=[4, 2],
+            reduction_dim=1,
+            keepdim=True,
+            cat_dim=1,
+            new_cat_dim_val=5,
+            test_name="test_reduce_cat_1_0_float",
+            dtype="float",
+        )
+        self._test_reduce_cat_fusion_2(
+            input_shape=[10, 22, 16],
+            reduction_dim=1,
+            keepdim=False,
+            cat_dim=1,
+            new_cat_dim_val=5,
+            test_name="test_reduce_cat_2_1_float",
+            dtype="float",
+        )
+        self._test_reduce_cat_fusion_3(
+            input_shape=[3, 11, 16],
+            reduction_dim=2,
+            keepdim=False,
+            cat_dim=0,
+            new_cat_dim_val=10,
+            test_name="test_reduce_cat_3_1_float",
+            dtype="float",
+        )
+        self._test_reduce_cat_fusion_batch(
+            batch_sizes=[5, 20],
+            input_shape=[4, 2],
+            reduction_dim=2,
+            keepdim=True,
+            cat_dim=2,
+            new_cat_dim_val=5,
+            test_name="test_reduce_cat_fusion_batch_float",
+            dtype="float",
+        )
+        self._test_col_reduce_cat_fusion(dtype="float")
+        self._test_strided_op_multiple_cats(dtype="float")
+        self._test_strided_op_multiple_cats_2(dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_reshape_cat.py b/tests/unittest/compiler/test_strided_reshape_cat.py
index 345cd3d2b..fec28a2c8 100644
--- a/tests/unittest/compiler/test_strided_reshape_cat.py
+++ b/tests/unittest/compiler/test_strided_reshape_cat.py
@@ -21,7 +21,13 @@
 from aitemplate.compiler.stable_set import StableSet
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class StridedReshapeCatTestCase(unittest.TestCase):
@@ -29,10 +35,10 @@ def __init__(self, *args, **kwargs):
         super(StridedReshapeCatTestCase, self).__init__(*args, **kwargs)
         self.test_count = 1
 
-    def _test_strided_reshape_cat(self, num_cat_ops=1):
+    def _test_strided_reshape_cat(self, num_cat_ops=1, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         M1 = 128
@@ -54,21 +60,21 @@ def _test_strided_reshape_cat(self, num_cat_ops=1):
         dim = 1
 
         X1 = Tensor(
-            shape=[IntImm(M1), IntImm(K1)], dtype="float16", name="x1", is_input=True
+            shape=[IntImm(M1), IntImm(K1)], dtype=dtype, name="x1", is_input=True
         )
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
         X2 = Tensor(
-            shape=[IntImm(M2), IntImm(K2)], dtype="float16", name="x2", is_input=True
+            shape=[IntImm(M2), IntImm(K2)], dtype=dtype, name="x2", is_input=True
         )
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
 
         X3 = Tensor(
-            shape=[IntImm(M3), IntImm(K3)], dtype="float16", name="x3", is_input=True
+            shape=[IntImm(M3), IntImm(K3)], dtype=dtype, name="x3", is_input=True
         )
-        W3 = Tensor(shape=[N3, K3], dtype="float16", name="w3", is_input=True)
+        W3 = Tensor(shape=[N3, K3], dtype=dtype, name="w3", is_input=True)
 
         Input = Tensor(
-            shape=[BS, Input_M, Input_N], dtype="float16", name="input", is_input=True
+            shape=[BS, Input_M, Input_N], dtype=dtype, name="input", is_input=True
         )
 
         group_gemm_op = ops.group_gemm_rcr()
@@ -115,13 +121,13 @@ def _test_strided_reshape_cat(self, num_cat_ops=1):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X1_pt = torch.randn(M1, K1).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        X2_pt = torch.randn(M2, K2).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        X3_pt = torch.randn(M3, K3).cuda().half()
-        W3_pt = torch.randn(N3, K3).cuda().half()
-        Input_pt = torch.randn(BS, Input_M, Input_N).cuda().half()
+        X1_pt = get_random_torch_tensor([M1, K1], dtype)
+        W1_pt = get_random_torch_tensor([N1, K1], dtype)
+        X2_pt = get_random_torch_tensor([M2, K2], dtype)
+        W2_pt = get_random_torch_tensor([N2, K2], dtype)
+        X3_pt = get_random_torch_tensor([M3, K3], dtype)
+        W3_pt = get_random_torch_tensor([N3, K3], dtype)
+        Input_pt = get_random_torch_tensor([BS, Input_M, Input_N], dtype)
         Y1_orig_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_orig_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y3_orig_pt = torch.nn.functional.linear(X3_pt, W3_pt)
@@ -131,7 +137,7 @@ def _test_strided_reshape_cat(self, num_cat_ops=1):
         Y_pt = torch.cat([Y1_pt, Y2_pt, Input_pt, Y3_pt], dim=dim)
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
         inputs = {
@@ -144,19 +150,15 @@ def _test_strided_reshape_cat(self, num_cat_ops=1):
             "input": Input_pt,
         }
 
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
         self.test_count += 1
 
-    def test_strided_reshape_cat(self, num_cat_ops=1):
-        self._test_strided_reshape_cat(1)
-        self._test_strided_reshape_cat(2)
-
-    def test_strided_reshape_cat_bias(self):
+    def _test_strided_reshape_cat_bias(self, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         M1 = 128
@@ -174,18 +176,18 @@ def test_strided_reshape_cat_bias(self):
         dim = 1
 
         X1 = Tensor(
-            shape=[IntImm(M1), IntImm(K1)], dtype="float16", name="x1", is_input=True
+            shape=[IntImm(M1), IntImm(K1)], dtype=dtype, name="x1", is_input=True
         )
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        B1 = Tensor(shape=[N1], dtype="float16", name="b1", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        B1 = Tensor(shape=[N1], dtype=dtype, name="b1", is_input=True)
         X2 = Tensor(
-            shape=[IntImm(M2), IntImm(K2)], dtype="float16", name="x2", is_input=True
+            shape=[IntImm(M2), IntImm(K2)], dtype=dtype, name="x2", is_input=True
         )
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
-        B2 = Tensor(shape=[N2], dtype="float16", name="b2", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
+        B2 = Tensor(shape=[N2], dtype=dtype, name="b2", is_input=True)
 
         Input = Tensor(
-            shape=[BS, Input_M, Input_N], dtype="float16", name="input", is_input=True
+            shape=[BS, Input_M, Input_N], dtype=dtype, name="input", is_input=True
         )
 
         group_gemm_op = ops.group_gemm_rcr_bias()
@@ -211,13 +213,13 @@ def test_strided_reshape_cat_bias(self):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X1_pt = torch.randn(M1, K1).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        B1_pt = torch.randn(N1).cuda().half()
-        X2_pt = torch.randn(M2, K2).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        B2_pt = torch.randn(N2).cuda().half()
-        Input_pt = torch.randn(BS, Input_M, Input_N).cuda().half()
+        X1_pt = get_random_torch_tensor([M1, K1], dtype)
+        W1_pt = get_random_torch_tensor([N1, K1], dtype)
+        B1_pt = get_random_torch_tensor([N1], dtype)
+        X2_pt = get_random_torch_tensor([M2, K2], dtype)
+        W2_pt = get_random_torch_tensor([N2, K2], dtype)
+        B2_pt = get_random_torch_tensor([N2], dtype)
+        Input_pt = get_random_torch_tensor([BS, Input_M, Input_N], dtype)
         Y1_orig_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_orig_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         Y1_pt = torch.reshape(Y1_orig_pt, [BS, -1, Input_N])
@@ -225,7 +227,7 @@ def test_strided_reshape_cat_bias(self):
         Y_pt = torch.cat([Y1_pt, Y2_pt, Input_pt], dim=dim)
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
         inputs = {
@@ -237,11 +239,25 @@ def test_strided_reshape_cat_bias(self):
             "b2": B2_pt,
             "input": Input_pt,
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
         self.test_count += 1
 
+    def test_strided_reshape_cat(self):
+        self._test_strided_reshape_cat()
+        self._test_strided_reshape_cat(num_cat_ops=2)
+        self._test_strided_reshape_cat_bias()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_strided_reshape_cat_float(self):
+        self._test_strided_reshape_cat(num_cat_ops=2, dtype="float")
+        self._test_strided_reshape_cat_bias(dtype="float")
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_scatter.py b/tests/unittest/compiler/test_strided_scatter.py
index f98aa5211..64187ef24 100644
--- a/tests/unittest/compiler/test_strided_scatter.py
+++ b/tests/unittest/compiler/test_strided_scatter.py
@@ -22,9 +22,14 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils, shape_utils
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class StridedScatterTestCase(unittest.TestCase):
@@ -85,7 +90,7 @@ def _make_slice_ops(
                 X = input_tensor
             else:
                 X_name = f"input_{idx}"
-                X = self._make_tensor(input_shape, X_name)
+                X = self._make_tensor(input_shape, X_name, input_type)
             Y = slice_op(X, start_indices=s_indices, end_indices=e_indices)
             Ys.append(Y)
         return Ys
@@ -98,8 +103,9 @@ def _test_strided_scatter_basic(
         end_indices,
         scatter_dim,
         test_name,
+        dtype="float16",
     ):
-        logger.info(
+        _LOGGER.info(
             f"test_strided_scatter_basic with {input_shapes=}, "
             f"{start_indices=}, {end_indices=}"
         )
@@ -110,6 +116,7 @@ def _test_strided_scatter_basic(
             input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_op = ops.concatenate()
         Y = concat_op(slice_outputs, scatter_dim)
@@ -135,7 +142,7 @@ def _test_strided_scatter_basic(
         for input_shape, s_indices, e_indices in zip(
             input_shapes, start_indices, end_indices
         ):
-            x_pt = torch.randn(input_shape).cuda().half()
+            x_pt = get_random_torch_tensor(input_shape, dtype)
             xs_pt.append(x_pt)
             slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
             slice_output_pt = x_pt[slice_indices]
@@ -147,7 +154,7 @@ def _test_strided_scatter_basic(
         inputs = [0 for i in range(len(xs_pt))]
         for i, x_pt in enumerate(xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = x_pt
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -190,8 +197,9 @@ def _test_strided_scatter_dynamic(
         scatter_dim,
         test_name,
         make_slices=None,
+        dtype="float16",
     ):
-        logger.info(
+        _LOGGER.info(
             f"test_strided_scatter_dynamic with {input_shapes=}, "
             f"{start_indices=}, {end_indices=}"
         )
@@ -207,12 +215,15 @@ def _test_strided_scatter_dynamic(
             ):
                 if not make_slice:
                     input_name = f"input_{idx}"
-                    input_tensors[idx] = self._make_tensor(input_shape, input_name)
+                    input_tensors[idx] = self._make_tensor(
+                        input_shape, input_name, dtype
+                    )
         slice_outputs = self._make_slice_ops(
             input_shapes,
             input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_op = ops.concatenate()
         Y = concat_op(slice_outputs, scatter_dim)
@@ -247,7 +258,7 @@ def _test_strided_scatter_dynamic(
                 input_shape_pt = [
                     d[idx] if isinstance(d, list) else d for d in input_shape
                 ]
-                x_pt = torch.randn(*input_shape_pt).cuda().half()
+                x_pt = get_random_torch_tensor(input_shape_pt, dtype)
                 xs_pt.append(x_pt)
                 slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
                 slice_output_pt = x_pt[slice_indices]
@@ -259,7 +270,7 @@ def _test_strided_scatter_dynamic(
             inputs = [0 for i in range(len(xs_pt))]
             for i, x_pt in enumerate(xs_pt):
                 inputs[input_name_to_index[f"input_{i}"]] = x_pt
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -321,6 +332,7 @@ def _make_test_graph_multi_dsts_2(
         start_indices,
         end_indices,
         scatter_dim,
+        dtype="float16",
     ):
         """Make a graph where (1) a tensor is sliced twice and both slices are
         fed into the same concat op, and (2) another sliced output (i.e not
@@ -332,7 +344,7 @@ def _make_test_graph_multi_dsts_2(
             input_tensors,
             start_indices,
             end_indices,
-            scatter_dim,
+            dtype,
         )
         slice_op_0 = list(Ys[0].src_ops())[0]
         X0 = slice_op_0._attrs["inputs"][0]
@@ -359,8 +371,9 @@ def _test_strided_scatter_multi_dsts_2(
         end_indices,
         scatter_dim,
         test_name,
+        dtype="float16",
     ):
-        logger.info(
+        _LOGGER.info(
             f"strided_scatter_multi_dsts_2 with input_shapes: {input_shapes}, "
             f"start_indices: {start_indices}, end_indices: {end_indices}"
         )
@@ -371,7 +384,7 @@ def _test_strided_scatter_multi_dsts_2(
         for input_shape, s_indices, e_indices in zip(
             input_shapes, start_indices, end_indices
         ):
-            X_pt = torch.randn(input_shape).cuda().half()
+            X_pt = get_random_torch_tensor(input_shape, dtype)
             Xs_pt.append(X_pt)
             slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
             Y_pt = X_pt[slice_indices]
@@ -388,7 +401,7 @@ def _test_strided_scatter_multi_dsts_2(
 
         input_tensors = [None] * len(input_shapes)
         Y = self._make_test_graph_multi_dsts_2(
-            input_shapes, input_tensors, start_indices, end_indices, scatter_dim
+            input_shapes, input_tensors, start_indices, end_indices, scatter_dim, dtype
         )
 
         test_name = "strided_scatter_multi_dsts_2"
@@ -406,7 +419,7 @@ def _test_strided_scatter_multi_dsts_2(
         inputs = [0 for i in range(len(Xs_pt))]
         for i, X_pt in enumerate(Xs_pt):
             inputs[input_name_to_index[f"input_{i}"]] = X_pt
-        y = torch.empty(Y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(Y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -431,26 +444,27 @@ def _test_strided_scatter_input_masks(
         scatter_dim,
         test_name,
         make_slices,
+        dtype="float16",
     ):
         # make a graph with 1 gemm_rcr_bias + 1 elemwise + multiple slices -> cat
-        logger.info(
+        _LOGGER.info(
             f"test_strided_scatter_input_masks with {input_shapes=}, "
             f"{start_indices=}, {end_indices=}"
         )
 
         input_A_name = "input_a"
-        input_A = self._make_tensor([list(Ms), K], input_A_name)
+        input_A = self._make_tensor([list(Ms), K], input_A_name, dtype)
         input_B_name = "input_b"
-        input_B = self._make_tensor([N, K], input_B_name)
+        input_B = self._make_tensor([N, K], input_B_name, dtype)
         input_Bias_name = "input_bias"
-        input_Bias = self._make_tensor([N], input_Bias_name)
+        input_Bias = self._make_tensor([N], input_Bias_name, dtype)
         gemm_output = ops.gemm_rcr_bias()(input_A, input_B, input_Bias)
         gemm_output._attrs["name"] = "gemm_output"
 
         input_Add_0_name = "input_add_0"
         input_Add_1_name = "input_add_1"
         add_output = self._make_add(
-            [list(Ms), N], input_Add_0_name, input_Add_1_name, "add_output"
+            [list(Ms), N], input_Add_0_name, input_Add_1_name, "add_output", dtype
         )
         # A, B, bias, add_0 and add_1
         num_extra_inputs = 5
@@ -466,12 +480,15 @@ def _test_strided_scatter_input_masks(
             ):
                 if not make_slice:
                     input_name = f"input_{idx}"
-                    input_tensors[idx] = self._make_tensor(input_shape, input_name)
+                    input_tensors[idx] = self._make_tensor(
+                        input_shape, input_name, dtype
+                    )
         slice_outputs = self._make_slice_ops(
             input_shapes,
             input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_inputs = [gemm_output] + slice_outputs + [add_output]
         concat_op = ops.concatenate()
@@ -501,13 +518,13 @@ def _test_strided_scatter_input_masks(
 
         for idx, M in enumerate(Ms):
             # Run PyTorch
-            a_pt = torch.randn(M, K).cuda().half()
-            b_pt = torch.randn(N, K).cuda().half()
-            bias_pt = torch.randn(N).cuda().half()
+            a_pt = get_random_torch_tensor([M, K], dtype)
+            b_pt = get_random_torch_tensor([N, K], dtype)
+            bias_pt = get_random_torch_tensor([N], dtype)
             gemm_output_pt = torch.nn.functional.linear(a_pt, b_pt, bias=bias_pt)
 
-            add_0_pt = torch.randn(M, N).cuda().half()
-            add_1_pt = torch.randn(M, N).cuda().half()
+            add_0_pt = get_random_torch_tensor([M, N], dtype)
+            add_1_pt = get_random_torch_tensor([M, N], dtype)
             add_output_pt = add_0_pt + add_1_pt
 
             slice_outputs_pt = []
@@ -518,7 +535,7 @@ def _test_strided_scatter_input_masks(
                 input_shape_pt = [
                     d[idx] if isinstance(d, list) else d for d in input_shape
                 ]
-                x_pt = torch.randn(*input_shape_pt).cuda().half()
+                x_pt = get_random_torch_tensor(input_shape_pt, dtype)
                 xs_pt.append(x_pt)
                 slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
                 slice_output_pt = x_pt[slice_indices]
@@ -536,7 +553,7 @@ def _test_strided_scatter_input_masks(
             inputs[input_name_to_index["input_bias"]] = bias_pt
             inputs[input_name_to_index[input_Add_0_name]] = add_0_pt
             inputs[input_name_to_index[input_Add_1_name]] = add_1_pt
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -575,13 +592,16 @@ def _test_strided_scatter_basic_2(
         end_indices,
         scatter_dim,
         test_name,
+        dtype="float16",
     ):
-        logger.info(f"test_strided_scatter_basic with {start_indices=}, {end_indices=}")
+        _LOGGER.info(
+            f"test_strided_scatter_basic with {start_indices=}, {end_indices=}"
+        )
 
         input_name_0 = "input_0"
-        input_0 = self._make_tensor(input_shape_0, input_name_0)
+        input_0 = self._make_tensor(input_shape_0, input_name_0, dtype)
         input_name_2 = "input_2"
-        input_2 = self._make_tensor(input_shape_2, input_name_2)
+        input_2 = self._make_tensor(input_shape_2, input_name_2, dtype)
 
         input_tensors = [input_2, input_0, input_2]
         input_shapes = [None] * len(input_tensors)
@@ -590,6 +610,7 @@ def _test_strided_scatter_basic_2(
             input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_op = ops.concatenate()
         Y = concat_op(slice_outputs, scatter_dim)
@@ -611,8 +632,8 @@ def _test_strided_scatter_basic_2(
 
         # Run PyTorch
         slice_outputs_pt = []
-        x0_pt = torch.randn(input_shape_0).cuda().half()
-        x2_pt = torch.randn(input_shape_2).cuda().half()
+        x0_pt = get_random_torch_tensor(input_shape_0, dtype)
+        x2_pt = get_random_torch_tensor(input_shape_2, dtype)
         xs_pt = [x2_pt, x0_pt, x2_pt]
         for x_pt, s_indices, e_indices in zip(xs_pt, start_indices, end_indices):
             slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
@@ -622,7 +643,7 @@ def _test_strided_scatter_basic_2(
 
         # run ait
         inputs = {"input_0": x0_pt, "input_2": x2_pt}
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -655,21 +676,30 @@ def _test_strided_scatter_input_masks_2(
         end_indices,
         scatter_dim,
         test_name,
+        dtype="float16",
     ):
         # make a graph with 2 elemwise + 3 slices where 1 elemwise is sliced twice
-        logger.info(
+        _LOGGER.info(
             f"test_strided_scatter_input_masks {start_indices=}, {end_indices=}"
         )
 
         add_0_input_name_0 = "add_0_input_0"
         add_0_input_name_1 = "add_0_input_1"
         add_output0 = self._make_add(
-            [list(Ms0), N0], add_0_input_name_0, add_0_input_name_1, "add_0_output"
+            [list(Ms0), N0],
+            add_0_input_name_0,
+            add_0_input_name_1,
+            "add_0_output",
+            dtype,
         )
         add_1_input_name_0 = "add_1_input_0"
         add_1_input_name_1 = "add_1_input_1"
         add_output1 = self._make_add(
-            [list(Ms1), N1], add_1_input_name_0, add_1_input_name_1, "add_1_output"
+            [list(Ms1), N1],
+            add_1_input_name_0,
+            add_1_input_name_1,
+            "add_1_output",
+            dtype,
         )
 
         input_tensors = [add_output0, add_output1, add_output0]
@@ -679,6 +709,7 @@ def _test_strided_scatter_input_masks_2(
             input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_op = ops.concatenate()
         Y = concat_op(slice_outputs, scatter_dim)
@@ -707,11 +738,11 @@ def _test_strided_scatter_input_masks_2(
 
         for M0, M1 in zip(Ms0, Ms1):
             # Run PyTorch
-            add_0_0_pt = torch.randn(M0, N0).cuda().half()
-            add_0_1_pt = torch.randn(M0, N0).cuda().half()
+            add_0_0_pt = get_random_torch_tensor([M0, N0], dtype)
+            add_0_1_pt = get_random_torch_tensor([M0, N0], dtype)
             add_0_output_pt = add_0_0_pt + add_0_1_pt
-            add_1_0_pt = torch.randn(M1, N1).cuda().half()
-            add_1_1_pt = torch.randn(M1, N1).cuda().half()
+            add_1_0_pt = get_random_torch_tensor([M1, N1], dtype)
+            add_1_1_pt = get_random_torch_tensor([M1, N1], dtype)
             add_1_output_pt = add_1_0_pt + add_1_1_pt
 
             slice_outputs_pt = []
@@ -729,7 +760,7 @@ def _test_strided_scatter_input_masks_2(
                 add_1_input_name_0: add_1_0_pt,
                 add_1_input_name_1: add_1_1_pt,
             }
-            y = torch.empty(y_pt.size()).cuda().half()
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
             self.test_count += 1
@@ -764,8 +795,9 @@ def _test_strided_scatter_with_split(
         end_indices,
         scatter_dim,
         test_name,
+        dtype="float16",
     ):
-        logger.info(
+        _LOGGER.info(
             f"test_strided_scatter_with_split with {start_indices=}, {end_indices=}"
         )
 
@@ -773,12 +805,12 @@ def _test_strided_scatter_with_split(
         input_Add_0_name = "input_add_0"
         input_Add_1_name = "input_add_1"
         add_output = self._make_add(
-            add_input_shape, input_Add_0_name, input_Add_1_name, "add_output"
+            add_input_shape, input_Add_0_name, input_Add_1_name, "add_output", dtype
         )
 
         # make split
         split_input_name = "split_input"
-        split_input = self._make_tensor(split_input_shape, split_input_name)
+        split_input = self._make_tensor(split_input_shape, split_input_name, dtype)
         split_dim_size = split_input_shape[scatter_dim]
         split_outputs = ops.split()(
             split_input, int(split_dim_size / 2), dim=scatter_dim
@@ -790,6 +822,7 @@ def _test_strided_scatter_with_split(
             slice_input_tensors,
             start_indices,
             end_indices,
+            dtype,
         )
         concat_inputs = [add_output] + slice_outputs + list(split_outputs)
         concat_op = ops.concatenate()
@@ -818,11 +851,11 @@ def _test_strided_scatter_with_split(
         np.testing.assert_equal(concat_op._attrs["input_masks"], input_masks)
 
         # Run PyTorch
-        input_add_0_pt = torch.randn(add_input_shape).cuda().half()
-        input_add_1_pt = torch.randn(add_input_shape).cuda().half()
+        input_add_0_pt = get_random_torch_tensor(add_input_shape, dtype)
+        input_add_1_pt = get_random_torch_tensor(add_input_shape, dtype)
         add_output_pt = input_add_0_pt + input_add_1_pt
 
-        split_input_pt = torch.randn(split_input_shape).cuda().half()
+        split_input_pt = get_random_torch_tensor(split_input_shape, dtype)
         split_outputs_pt = torch.split(
             split_input_pt, int(split_dim_size / 2), dim=scatter_dim
         )
@@ -832,7 +865,7 @@ def _test_strided_scatter_with_split(
         for input_shape, s_indices, e_indices in zip(
             slice_input_shapes, start_indices, end_indices
         ):
-            x_pt = torch.randn(input_shape).cuda().half()
+            x_pt = get_random_torch_tensor(input_shape, dtype)
             xs_pt.append(x_pt)
             slice_indices = [slice(i, j) for i, j in zip(s_indices, e_indices)]
             slice_output_pt = x_pt[slice_indices]
@@ -849,7 +882,7 @@ def _test_strided_scatter_with_split(
             inputs[f"input_{i}"] = x_pt
 
         # run ait
-        y = torch.empty(y_pt.size()).cuda().half()
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
@@ -865,6 +898,114 @@ def test_strided_scatter_with_split(self):
             test_name="strided_scatter_with_split",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_strided_scatter_float(self):
+        self._test_strided_scatter_basic(
+            input_shapes=([2], [3]),
+            start_indices=([1], [2]),
+            end_indices=([2], [-1]),
+            scatter_dim=0,
+            test_name="strided_scatter_basic_float",
+            dtype="float",
+        )
+        self._test_strided_scatter_basic(
+            input_shapes=([10, 30, 20], [10, 8, 20], [10, 10, 20]),
+            start_indices=([0, 5, 0], [0, 6, 0], [0, 1, 0]),
+            end_indices=([None, 6, None], [None, 8, None], [None, 4, None]),
+            scatter_dim=1,
+            test_name="strided_scatter_basic_float",
+            dtype="float",
+        )
+        self._test_strided_scatter_dynamic(
+            input_shapes=([[5, 16], [10, 20], 4], [[5, 16], [10, 20], 10]),
+            start_indices=([0, 0, 2], [0, 0, 2]),
+            end_indices=([None, None, 4], [None, None, 10]),
+            scatter_dim=2,
+            test_name="strided_scatter_dynamic_float",
+            dtype="float",
+        )
+        self._test_strided_scatter_dynamic(
+            input_shapes=(
+                [[5, 7], [1, 10], 4],
+                [[5, 7], [1, 10], 6],
+                [[5, 7], [1, 10], 8],
+            ),
+            start_indices=([0, 0, 2], [0, 0, 4], [0, 0, 6]),
+            end_indices=([None, None, 4], [None, None, 6], [None, None, 8]),
+            scatter_dim=2,
+            test_name="strided_scatter_partial_float",
+            make_slices=[True, False, True],
+            dtype="float",
+        )
+        self._test_strided_scatter_multi_dsts_2(
+            input_shapes=[[3, 3, 10], [3, 7, 10]],
+            start_indices=[[0, 1, 0], [0, 1, 0]],
+            end_indices=[[None, 2, None], [None, 7, None]],
+            scatter_dim=1,
+            test_name="strided_scatter_partial_float",
+            dtype="float",
+        )
+        self._test_strided_scatter_basic_2(
+            input_shape_0=(1, 10),
+            input_shape_2=(1, 8),
+            start_indices=(
+                [0, 0],
+                [0, 0],
+                [0, 0],
+            ),
+            end_indices=(
+                [None, 2],  # input_2
+                [None, 8],  # input_0
+                [None, 4],  # input_2
+            ),
+            scatter_dim=1,
+            test_name="strided_scatter_basic_float_2",
+            dtype="float",
+        )
+        self._test_strided_scatter_input_masks_2(
+            Ms0=(4, 10),
+            N0=6,
+            Ms1=(4, 10),
+            N1=7,
+            start_indices=(
+                [0, 0],
+                [0, 0],
+                [0, 0],
+            ),
+            end_indices=(
+                [None, 2],  # input0
+                [None, 5],  # input1
+                [None, 4],  # input0
+            ),
+            scatter_dim=1,
+            test_name="strided_scatter_input_masks_float_2",
+            dtype="float",
+        )
+        self._test_strided_scatter_with_split(
+            add_input_shape=(4, 10),
+            split_input_shape=(4, 9),
+            slice_input_shapes=([4, 6], [4, 12]),
+            start_indices=([0, 2], [0, 8]),
+            end_indices=([None, 4], [None, 12]),
+            scatter_dim=1,
+            test_name="strided_scatter_with_split_float",
+            dtype="float",
+        )
+        target = detect_target()
+        if int(target._arch) >= 80:
+            self._test_strided_scatter_input_masks(
+                Ms=(5, 16),
+                N=4,
+                K=10,
+                input_shapes=([[5, 16], 5], [[5, 16], 10]),
+                start_indices=([0, 1], [0, 2]),
+                end_indices=([None, 2], [None, 10]),
+                scatter_dim=1,
+                test_name="strided_scatter_input_masks_float",
+                make_slices=[True, True],
+                dtype="float",
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_split_group_gemm.py b/tests/unittest/compiler/test_strided_split_group_gemm.py
index dcafe5231..a822179ec 100644
--- a/tests/unittest/compiler/test_strided_split_group_gemm.py
+++ b/tests/unittest/compiler/test_strided_split_group_gemm.py
@@ -21,14 +21,20 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class StridedSplitGroupGemmTestCase(unittest.TestCase):
-    def test_split_group_gemm(self):
+    def _test_split_group_gemm(self, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         K1 = 32
@@ -43,13 +49,13 @@ def test_split_group_gemm(self):
 
         X = Tensor(
             shape=[IntImm(M), IntImm(K)],
-            dtype="float16",
+            dtype=dtype,
             name="x",
             is_input=True,
         )
-        W1 = Tensor(shape=[N, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N, K2], dtype="float16", name="w2", is_input=True)
-        W3 = Tensor(shape=[N, K3], dtype="float16", name="w3", is_input=True)
+        W1 = Tensor(shape=[N, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N, K2], dtype=dtype, name="w2", is_input=True)
+        W3 = Tensor(shape=[N, K3], dtype=dtype, name="w3", is_input=True)
 
         split_op = ops.split()
         X1, X2, X3 = split_op(X, [K1, K2, K3], dim)
@@ -72,10 +78,10 @@ def test_split_group_gemm(self):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W1_pt = torch.randn(N, K1).cuda().half()
-        W2_pt = torch.randn(N, K2).cuda().half()
-        W3_pt = torch.randn(N, K3).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W1_pt = get_random_torch_tensor([N, K1], dtype)
+        W2_pt = get_random_torch_tensor([N, K2], dtype)
+        W3_pt = get_random_torch_tensor([N, K3], dtype)
         X1_pt, X2_pt, X3_pt = torch.split(X_pt, [K1, K2, K3], dim)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
@@ -84,7 +90,7 @@ def test_split_group_gemm(self):
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = [
@@ -93,14 +99,14 @@ def test_split_group_gemm(self):
             W2_pt,
             W3_pt,
         ]
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_split_group_gemm_bias(self):
+    def _test_split_group_gemm_bias(self, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         K1 = 32
@@ -113,15 +119,13 @@ def test_split_group_gemm_bias(self):
 
         dim = 1
 
-        X = Tensor(
-            shape=[IntImm(M), IntImm(K)], dtype="float16", name="x", is_input=True
-        )
-        W1 = Tensor(shape=[N, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N, K2], dtype="float16", name="w2", is_input=True)
-        W3 = Tensor(shape=[N, K3], dtype="float16", name="w3", is_input=True)
-        B1 = Tensor(shape=[N], dtype="float16", name="b1", is_input=True)
-        B2 = Tensor(shape=[N], dtype="float16", name="b2", is_input=True)
-        B3 = Tensor(shape=[N], dtype="float16", name="b3", is_input=True)
+        X = Tensor(shape=[IntImm(M), IntImm(K)], dtype=dtype, name="x", is_input=True)
+        W1 = Tensor(shape=[N, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N, K2], dtype=dtype, name="w2", is_input=True)
+        W3 = Tensor(shape=[N, K3], dtype=dtype, name="w3", is_input=True)
+        B1 = Tensor(shape=[N], dtype=dtype, name="b1", is_input=True)
+        B2 = Tensor(shape=[N], dtype=dtype, name="b2", is_input=True)
+        B3 = Tensor(shape=[N], dtype=dtype, name="b3", is_input=True)
 
         split_op = ops.split()
         X1, X2, X3 = split_op(X, [K1, K2, K3], dim)
@@ -149,13 +153,13 @@ def test_split_group_gemm_bias(self):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W1_pt = torch.randn(N, K1).cuda().half()
-        W2_pt = torch.randn(N, K2).cuda().half()
-        W3_pt = torch.randn(N, K3).cuda().half()
-        B1_pt = torch.randn(N).cuda().half()
-        B2_pt = torch.randn(N).cuda().half()
-        B3_pt = torch.randn(N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W1_pt = get_random_torch_tensor([N, K1], dtype)
+        W2_pt = get_random_torch_tensor([N, K2], dtype)
+        W3_pt = get_random_torch_tensor([N, K3], dtype)
+        B1_pt = get_random_torch_tensor([N], dtype)
+        B2_pt = get_random_torch_tensor([N], dtype)
+        B3_pt = get_random_torch_tensor([N], dtype)
         X1_pt, X2_pt, X3_pt = torch.split(X_pt, [K1, K2, K3], dim)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
@@ -164,7 +168,7 @@ def test_split_group_gemm_bias(self):
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         input_name_to_index = module.get_input_name_to_index_map()
@@ -176,14 +180,14 @@ def test_split_group_gemm_bias(self):
         inputs[input_name_to_index["b1"]] = B1_pt
         inputs[input_name_to_index["b2"]] = B2_pt
         inputs[input_name_to_index["b3"]] = B3_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_split_group_gemm_reorder(self):
+    def _test_split_group_gemm_reorder(self, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         K1 = 32
@@ -196,12 +200,10 @@ def test_split_group_gemm_reorder(self):
 
         dim = 1
 
-        X = Tensor(
-            shape=[IntImm(M), IntImm(K)], dtype="float16", name="x", is_input=True
-        )
-        W1 = Tensor(shape=[N, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N, K2], dtype="float16", name="w2", is_input=True)
-        W3 = Tensor(shape=[N, K3], dtype="float16", name="w3", is_input=True)
+        X = Tensor(shape=[IntImm(M), IntImm(K)], dtype=dtype, name="x", is_input=True)
+        W1 = Tensor(shape=[N, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N, K2], dtype=dtype, name="w2", is_input=True)
+        W3 = Tensor(shape=[N, K3], dtype=dtype, name="w3", is_input=True)
 
         split_op = ops.split()
         X1, X2, X3 = split_op(X, [K1, K2, K3], dim)
@@ -224,10 +226,10 @@ def test_split_group_gemm_reorder(self):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W1_pt = torch.randn(N, K1).cuda().half()
-        W2_pt = torch.randn(N, K2).cuda().half()
-        W3_pt = torch.randn(N, K3).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W1_pt = get_random_torch_tensor([N, K1], dtype)
+        W2_pt = get_random_torch_tensor([N, K2], dtype)
+        W3_pt = get_random_torch_tensor([N, K3], dtype)
         X1_pt, X2_pt, X3_pt = torch.split(X_pt, [K1, K2, K3], dim)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
@@ -236,7 +238,7 @@ def test_split_group_gemm_reorder(self):
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = [0 for i in range(4)]
@@ -245,14 +247,14 @@ def test_split_group_gemm_reorder(self):
         inputs[name_to_idx["w1"]] = W1_pt
         inputs[name_to_idx["w2"]] = W2_pt
         inputs[name_to_idx["w3"]] = W3_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_split_group_gemm_bias_reorder(self):
+    def _test_split_group_gemm_bias_reorder(self, dtype="float16"):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         K1 = 32
@@ -265,15 +267,13 @@ def test_split_group_gemm_bias_reorder(self):
 
         dim = 1
 
-        X = Tensor(
-            shape=[IntImm(M), IntImm(K)], dtype="float16", name="x", is_input=True
-        )
-        W1 = Tensor(shape=[N, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N, K2], dtype="float16", name="w2", is_input=True)
-        W3 = Tensor(shape=[N, K3], dtype="float16", name="w3", is_input=True)
-        B1 = Tensor(shape=[N], dtype="float16", name="b1", is_input=True)
-        B2 = Tensor(shape=[N], dtype="float16", name="b2", is_input=True)
-        B3 = Tensor(shape=[N], dtype="float16", name="b3", is_input=True)
+        X = Tensor(shape=[IntImm(M), IntImm(K)], dtype=dtype, name="x", is_input=True)
+        W1 = Tensor(shape=[N, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N, K2], dtype=dtype, name="w2", is_input=True)
+        W3 = Tensor(shape=[N, K3], dtype=dtype, name="w3", is_input=True)
+        B1 = Tensor(shape=[N], dtype=dtype, name="b1", is_input=True)
+        B2 = Tensor(shape=[N], dtype=dtype, name="b2", is_input=True)
+        B3 = Tensor(shape=[N], dtype=dtype, name="b3", is_input=True)
 
         split_op = ops.split()
         X1, X2, X3 = split_op(X, [K1, K2, K3], dim)
@@ -301,13 +301,13 @@ def test_split_group_gemm_bias_reorder(self):
             expected_inputs_group_gemm_op, group_gemm_op._attrs["inputs"]
         )
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W1_pt = torch.randn(N, K1).cuda().half()
-        W2_pt = torch.randn(N, K2).cuda().half()
-        W3_pt = torch.randn(N, K3).cuda().half()
-        B1_pt = torch.randn(N).cuda().half()
-        B2_pt = torch.randn(N).cuda().half()
-        B3_pt = torch.randn(N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W1_pt = get_random_torch_tensor([N, K1], dtype)
+        W2_pt = get_random_torch_tensor([N, K2], dtype)
+        W3_pt = get_random_torch_tensor([N, K3], dtype)
+        B1_pt = get_random_torch_tensor([N], dtype)
+        B2_pt = get_random_torch_tensor([N], dtype)
+        B3_pt = get_random_torch_tensor([N], dtype)
         X1_pt, X2_pt, X3_pt = torch.split(X_pt, [K1, K2, K3], dim)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
@@ -316,7 +316,7 @@ def test_split_group_gemm_bias_reorder(self):
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         input_name_to_index = module.get_input_name_to_index_map()
@@ -328,10 +328,23 @@ def test_split_group_gemm_bias_reorder(self):
         inputs[input_name_to_index["b1"]] = B1_pt
         inputs[input_name_to_index["b2"]] = B2_pt
         inputs[input_name_to_index["b3"]] = B3_pt
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_split_group_gemm_float16(self):
+        self._test_split_group_gemm()
+        self._test_split_group_gemm_bias()
+        self._test_split_group_gemm_reorder()
+        self._test_split_group_gemm_bias_reorder()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_group_gemm_float(self):
+        self._test_split_group_gemm(dtype="float")
+        self._test_split_group_gemm_bias(dtype="float")
+        self._test_split_group_gemm_reorder(dtype="float")
+        self._test_split_group_gemm_bias_reorder(dtype="float")
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index 9b1636767..52a79fd0b 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -18,8 +18,13 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.compiler.base import IntVar
+from aitemplate.compiler.base import IntImm, IntVar, Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 from parameterized import param, parameterized
 
@@ -36,40 +41,40 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2, 2],
                 cat_dim=2,
-                expected_num_tensors=12,
-                expected_num_ops=10,
+                expected_num_tensors=11,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_fusible_expand_1",
                 n=2,
                 new_shape=[-1, 2, 1, 2],
                 cat_dim=3,
-                expected_num_tensors=12,
-                expected_num_ops=10,
+                expected_num_tensors=11,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_fusible_expand_2",
                 n=4,
                 new_shape=[-1, 4, 4, 1],
                 cat_dim=2,
-                expected_num_tensors=12,
-                expected_num_ops=10,
+                expected_num_tensors=11,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_fusible_expand_3",
                 n=2,
                 new_shape=[-1, 2, 2, 1],
                 cat_dim=2,
-                expected_num_tensors=12,
-                expected_num_ops=10,
+                expected_num_tensors=11,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_fusible_expand_4",
                 n=4,
                 new_shape=[-1, 4, 2, 2],
                 cat_dim=2,
-                expected_num_tensors=12,
-                expected_num_ops=10,
+                expected_num_tensors=11,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_non_fusible_dynamic_dim",
@@ -84,16 +89,34 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2 * 2],
                 cat_dim=1,
-                expected_num_tensors=15,
-                expected_num_ops=10,
+                expected_num_tensors=14,
+                expected_num_ops=9,
             ),
             param(
                 "gemm_reshape_cat_non_fusible_expand",
                 n=4,
                 new_shape=[-1, 4, 2, 2],
                 cat_dim=3,
-                expected_num_tensors=17,
-                expected_num_ops=10,
+                expected_num_tensors=16,
+                expected_num_ops=9,
+            ),
+            param(
+                "gemm_reshape_cat_fusible_expand_float_1",
+                n=2,
+                new_shape=[-1, 2, 1, 2],
+                cat_dim=3,
+                expected_num_tensors=11,
+                expected_num_ops=9,
+                dtype="float",
+            ),
+            param(
+                "gemm_reshape_cat_non_fusible_expand_float",
+                n=4,
+                new_shape=[-1, 4, 2, 2],
+                cat_dim=3,
+                expected_num_tensors=16,
+                expected_num_ops=9,
+                dtype="float",
             ),
         ],
         name_func=custom_name_func,
@@ -106,15 +129,28 @@ def test_strided_gemm_view_cat_fusible(
         cat_dim: int,
         expected_num_tensors: int,
         expected_num_ops: int,
+        dtype: str = "float16",
     ):
+        target = detect_target()
+        if dtype == "float" and (target.name() != "cuda" or int(target._arch) < 80):
+            self.skipTest("Only supported with CUDA >= 80")
+
         batch_dim = IntVar([1, 2, 3], "batch_size")
-        input0 = test_utils.gen_input_tensor([batch_dim, n, n], name="input0")
-        input1 = test_utils.gen_input_tensor([n, n], name="input1")
-        input2 = test_utils.gen_input_tensor([batch_dim, n, n], name="input2")
-        input3 = test_utils.gen_input_tensor([n], name="input3")
-        input4 = test_utils.gen_input_tensor([batch_dim, n, n], name="input4")
-        input5 = test_utils.gen_input_tensor([batch_dim, n, n], name="input5")
-        input6 = test_utils.gen_input_tensor([n, n, n], name="input6")
+        input0 = test_utils.gen_input_tensor(
+            [batch_dim, n, n], name="input0", dtype=dtype
+        )
+        input1 = test_utils.gen_input_tensor([n, n], name="input1", dtype=dtype)
+        input2 = test_utils.gen_input_tensor(
+            [batch_dim, n, n], name="input2", dtype=dtype
+        )
+        input3 = test_utils.gen_input_tensor([n], name="input3", dtype=dtype)
+        input4 = test_utils.gen_input_tensor(
+            [batch_dim, n, n], name="input4", dtype=dtype
+        )
+        input5 = test_utils.gen_input_tensor(
+            [batch_dim, n, n], name="input5", dtype=dtype
+        )
+        input6 = test_utils.gen_input_tensor([n, n, n], name="input6", dtype=dtype)
 
         X0 = ops.gemm_rcr()(input0, input1)
         X1 = ops.gemm_rcr_bias()(input0, input1, input3)
@@ -138,7 +174,6 @@ def test_strided_gemm_view_cat_fusible(
         Z._attrs["is_output"] = True
 
         # Gen module.
-        target = detect_target()
         module = compile_model([Z], target, "./tmp", test_name)
 
         # Verify the generated graph.
@@ -149,13 +184,13 @@ def test_strided_gemm_view_cat_fusible(
 
         # Prepare PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input0_pt = torch.randn([batch_size, n, n]).cuda().half()
-            input1_pt = torch.randn([n, n]).cuda().half()
-            input2_pt = torch.randn([batch_size, n, n]).cuda().half()
-            input3_pt = torch.randn([n]).cuda().half()
-            input4_pt = torch.randn([batch_size, n, n]).cuda().half()
-            input5_pt = torch.randn([batch_size, n, n]).cuda().half()
-            input6_pt = torch.randn([n, n, n]).cuda().half()
+            input0_pt = get_random_torch_tensor([batch_size, n, n], dtype)
+            input1_pt = get_random_torch_tensor([n, n], dtype)
+            input2_pt = get_random_torch_tensor([batch_size, n, n], dtype)
+            input3_pt = get_random_torch_tensor([n], dtype)
+            input4_pt = get_random_torch_tensor([batch_size, n, n], dtype)
+            input5_pt = get_random_torch_tensor([batch_size, n, n], dtype)
+            input6_pt = get_random_torch_tensor([n, n, n], dtype)
 
             # Run PyTorch baseline.
             x0_pt = torch.nn.functional.linear(input0_pt, input1_pt)
@@ -179,7 +214,7 @@ def test_strided_gemm_view_cat_fusible(
             ys_pt = [torch.reshape(x, new_shape) for x in xs_pt]
             ys_pt.insert(2, torch.reshape(input5_pt, new_shape))
             z_pt = torch.cat(ys_pt, dim=cat_dim)
-            z = torch.empty(z_pt.shape).cuda().half()
+            z = get_torch_empty_tensor(z_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors(
@@ -201,6 +236,121 @@ def test_strided_gemm_view_cat_fusible(
                 f"batch_size: {batch_size}, z: {z}, z_pt: {z_pt}, input5_pt: {input5_pt}",
             )
 
+    def _test_strided_layernorm_view_cat_fusible(self, dtype="float16"):
+        def _create_layernorm_sigmoid_mul(
+            input: Tensor,
+            normalized_shape: List[int],
+            gamma: Tensor = None,
+            beta: Tensor = None,
+        ) -> Tensor:
+            X1 = ops.layernorm([IntImm(s) for s in normalized_shape])(
+                input, gamma, beta
+            )
+            X2 = ops.elementwise(FuncEnum.SIGMOID)(X1)
+            X3 = ops.elementwise(FuncEnum.MUL)(X2, input)
+            return X3
+
+        batch_dim = IntVar([1, 2, 3], "batch_size")
+        m = 5
+        n = 10
+        new_shape = [-1, m, n * 2]
+        cat_dim = 1
+        # layernorm + reshape
+        input0 = test_utils.gen_input_tensor(
+            [batch_dim, m, 2, n], name="input0", dtype=dtype
+        )
+        # group layernorm + reshape
+        gamma = test_utils.gen_input_tensor([m * n], name="g", dtype=dtype)
+        beta = test_utils.gen_input_tensor([m * n], name="b", dtype=dtype)
+        input1 = test_utils.gen_input_tensor(
+            [batch_dim, 2, m * n], name="input1", dtype=dtype
+        )
+        input2 = test_utils.gen_input_tensor(
+            [batch_dim, 2, m * n], name="input2", dtype=dtype
+        )
+        # layernorm + nop reshape
+        input3 = test_utils.gen_input_tensor(
+            [batch_dim, m, n * 2], name="input3", dtype=dtype
+        )
+
+        X0 = _create_layernorm_sigmoid_mul(input0, [n])
+        X1 = _create_layernorm_sigmoid_mul(input1, [m * n], gamma, beta)
+        X2 = _create_layernorm_sigmoid_mul(input2, [m * n], gamma, beta)
+        X3 = _create_layernorm_sigmoid_mul(input3, [n * 2])
+        Xs = [X0, X1, X2, X3]
+        Ys = [ops.reshape()(X, new_shape) for X in Xs]
+        Z = ops.concatenate()(Ys, dim=cat_dim)
+
+        Z._attrs["name"] = "output0"
+        Z._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(
+            Z, target, "./tmp", f"strided_layernorm_view_cat_fusion_{dtype}"
+        )
+
+        # Verify the generated graph.
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), 7)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+
+        # Prepare PyTorch tensors.
+        for batch_size in batch_dim._attrs["values"]:
+            input0_pt = get_random_torch_tensor([batch_size, m, 2, n], dtype)
+            input1_pt = get_random_torch_tensor([batch_size, 2, m * n], dtype)
+            input2_pt = get_random_torch_tensor([batch_size, 2, m * n], dtype)
+            gamma_pt = get_random_torch_tensor([m * n], dtype)
+            beta_pt = get_random_torch_tensor([m * n], dtype)
+            input3_pt = get_random_torch_tensor([batch_size, m, n * 2], dtype)
+
+            # Run PyTorch baseline.
+            x0_pt = torch.nn.functional.layer_norm(input0_pt, [n])
+            x0_pt = torch.mul(input0_pt, torch.sigmoid(x0_pt))
+            x1_pt = torch.nn.functional.layer_norm(
+                input1_pt, [m * n], weight=gamma_pt, bias=beta_pt
+            )
+            x1_pt = torch.mul(input1_pt, torch.sigmoid(x1_pt))
+            x2_pt = torch.nn.functional.layer_norm(
+                input2_pt, [m * n], weight=gamma_pt, bias=beta_pt
+            )
+            x2_pt = torch.mul(input2_pt, torch.sigmoid(x2_pt))
+            x3_pt = torch.nn.functional.layer_norm(input3_pt, [n * 2])
+            x3_pt = torch.mul(input3_pt, torch.sigmoid(x3_pt))
+
+            xs_pt = [x0_pt, x1_pt, x2_pt, x3_pt]
+            ys_pt = [torch.reshape(x, new_shape) for x in xs_pt]
+            z_pt = torch.cat(ys_pt, dim=cat_dim)
+            z = get_torch_empty_tensor(z_pt.shape, dtype)
+
+            # Run AITemplate module.
+            module.run_with_tensors(
+                {
+                    "input0": input0_pt,
+                    "input1": input1_pt,
+                    "input2": input2_pt,
+                    "input3": input3_pt,
+                    "g": gamma_pt,
+                    "b": beta_pt,
+                },
+                [z],
+            )
+
+            # Do comparisons.
+            for x, x_pt in zip(z, z_pt):
+                self.assertTrue(
+                    torch.allclose(x, x_pt, atol=1e-2, rtol=1e-2),
+                    f"batch_size: {batch_size}, z: {z}, z_pt: {z_pt}",
+                )
+
+    def test_strided_layernorm_view_cat_fusible(self):
+        self._test_strided_layernorm_view_cat_fusible()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_strided_layernorm_view_cat_fusible_float(self):
+        self._test_strided_layernorm_view_cat_fusible(dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_view_op.py b/tests/unittest/compiler/test_strided_view_op.py
index 38bf9e7a0..d413b7a0f 100644
--- a/tests/unittest/compiler/test_strided_view_op.py
+++ b/tests/unittest/compiler/test_strided_view_op.py
@@ -15,7 +15,7 @@
 import unittest
 from functools import partial
 
-from typing import Callable, Dict, List, Tuple
+from typing import Callable, List, Tuple
 
 import torch
 
@@ -24,6 +24,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
 from parameterized import param, parameterized
@@ -31,33 +35,61 @@
 
 def _gen_simple_strided_ops(
     batch_dim: IntVar, n1: int, n2: int
-) -> List[Tuple[Tensor, Callable[[torch.Tensor], torch.Tensor]]]:
-    return [
+) -> List[Tuple[str, Tensor, str, Callable[[torch.Tensor], torch.Tensor]]]:
+    test_cases = [
         (
             "tanh",
             ops.elementwise(FuncEnum.TANH)(
-                test_utils.gen_input_tensor([batch_dim, n1, n2])
+                test_utils.gen_input_tensor([batch_dim, n1, n2], dtype="float16")
             ),
+            "float16",
             torch.tanh,
         ),
         (
             "layernorm",
             ops.layernorm(normalized_shape=[IntImm(n2)])(
-                test_utils.gen_input_tensor([batch_dim, n1, n2])
+                test_utils.gen_input_tensor([batch_dim, n1, n2], dtype="float16")
             ),
+            "float16",
             partial(torch.nn.functional.layer_norm, normalized_shape=[n2]),
         ),
         (
             "sum",
             ops.reduce_sum(2, keepdim=True)(
-                test_utils.gen_input_tensor([batch_dim, n1, n2])
+                test_utils.gen_input_tensor([batch_dim, n1, n2], dtype="float16")
             ),
+            "float16",
             partial(torch.sum, dim=2, keepdim=True),
         ),
     ]
+    target = detect_target()
+    if target.name() == "cuda":
+        test_cases.append(
+            (
+                "tanh",
+                ops.elementwise(FuncEnum.TANH)(
+                    test_utils.gen_input_tensor([batch_dim, n1, n2], dtype="float")
+                ),
+                "float",
+                torch.tanh,
+            )
+        )
+        test_cases.append(
+            (
+                "sum",
+                ops.reduce_sum(2, keepdim=True)(
+                    test_utils.gen_input_tensor([batch_dim, n1, n2], dtype="float")
+                ),
+                "float",
+                partial(torch.sum, dim=2, keepdim=True),
+            )
+        )
+    return test_cases
 
 
-def _gen_fusible_view_ops_after_strided_op() -> Dict[str, Callable[[Tensor], Tensor]]:
+def _gen_fusible_view_ops_after_strided_op() -> List[
+    Tuple[str, Callable[[Tensor], Tensor], str]
+]:
     def reshape_op(input_tensor: Tensor):
         shape = input_tensor._attrs["shape"]
         return ops.reshape()(
@@ -68,11 +100,18 @@ def reshape_op(input_tensor: Tensor):
     def flatten_op(input_tensor: Tensor):
         return ops.flatten(start_dim=1, end_dim=-1)(input_tensor)
 
-    return {"reshape": reshape_op, "flatten": flatten_op}
+    test_cases = [
+        ("reshape", reshape_op, "float16"),
+        ("flatten", flatten_op, "float16"),
+    ]
+    target = detect_target()
+    if target.name() == "cuda" and int(target._arch) >= 80:
+        test_cases.append(("reshape", reshape_op, "float"))
+    return test_cases
 
 
-def _gen_non_fusible_view_ops_after_strided_op() -> Dict[
-    str, Callable[[Tensor], Tensor]
+def _gen_non_fusible_view_ops_after_strided_op() -> List[
+    Tuple[str, Callable[[Tensor], Tensor], str]
 ]:
     def reshape_op(input_tensor: Tensor):
         n2 = input_tensor._attrs["shape"][2].value()
@@ -81,11 +120,18 @@ def reshape_op(input_tensor: Tensor):
     def flatten_op(input_tensor: Tensor):
         return ops.flatten(start_dim=0, end_dim=1)(input_tensor)
 
-    return {"reshape": reshape_op, "flatten": flatten_op}
+    test_cases = [
+        ("reshape", reshape_op, "float16"),
+        ("flatten", flatten_op, "float16"),
+    ]
+    target = detect_target()
+    if target.name() == "cuda":
+        test_cases.append(("flatten", flatten_op, "float"))
+    return test_cases
 
 
-def _gen_multiple_fusible_view_ops_after_strided_op() -> Dict[
-    str, Callable[[Tensor], Tensor]
+def _gen_multiple_fusible_view_ops_after_strided_op() -> List[
+    Tuple[str, Callable[[Tensor], Tensor], str]
 ]:
     def _get_shape(input_tensor: Tensor):
         return (
@@ -107,31 +153,39 @@ def squeeze_unsqueeze(input_tensor: Tensor):
         n1, n2 = _get_shape(input_tensor)
         return ops.squeeze(dim=1)(ops.unsqueeze(dim=1)(input_tensor))
 
-    return {
-        "multi_reshape": multi_reshape,
-        "squeeze_unsqueeze": squeeze_unsqueeze,
-    }
+    test_cases = [
+        ("multi_reshape", multi_reshape, "float16"),
+        ("squeeze_unsqueeze", squeeze_unsqueeze, "float16"),
+    ]
+    target = detect_target()
+    if target.name() == "cuda" and int(target._arch) >= 80:
+        test_cases.append(("multi_reshape", multi_reshape, "float"))
+    return test_cases
 
 
 def custom_name_func(testcase_func, param_num, param):
-    return f"{testcase_func.__name__}_{param_num}_{param.args[0]}"
+    return f"{testcase_func.__name__}_{param_num}_{param.args[0]}_{param.args[2]}"
 
 
 class StridedViewOpTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(StridedViewOpTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     @parameterized.expand(
         [
-            param(f"single_gemm_{name}_fusion", func)
-            for (name, func) in _gen_fusible_view_ops_after_strided_op().items()
+            param(f"single_gemm_{name}_fusion_{dtype}", func, dtype)
+            for (name, func, dtype) in _gen_fusible_view_ops_after_strided_op()
         ],
         name_func=custom_name_func,
     )
-    def test_single_gemm_and_view_fusible(self, test_name, func):
+    def test_single_gemm_and_view_fusible(self, test_name, func, dtype):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N1 = 8
         N2 = 6
         K = 10
-        input0 = test_utils.gen_input_tensor([batch_dim, N1, K])
-        input1 = test_utils.gen_input_tensor([N2, K])
+        input0 = test_utils.gen_input_tensor([batch_dim, N1, K], dtype=dtype)
+        input1 = test_utils.gen_input_tensor([N2, K], dtype=dtype)
         X0 = ops.gemm_rcr()(input0, input1)
         Y = ops.elementwise(FuncEnum.TANH)(func(X0))
         Y._attrs["name"] = "output0"
@@ -139,7 +193,8 @@ def test_single_gemm_and_view_fusible(self, test_name, func):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", test_name)
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -149,8 +204,8 @@ def test_single_gemm_and_view_fusible(self, test_name, func):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input0_pt = torch.randn(batch_size, N1, K).cuda().half()
-            input1_pt = torch.randn(N2, K).cuda().half()
+            input0_pt = get_random_torch_tensor([batch_size, N1, K], dtype)
+            input1_pt = get_random_torch_tensor([N2, K], dtype)
 
             # Run PyTorch baseline.
             x0_pt = torch.matmul(input0_pt, input1_pt.transpose(0, 1))
@@ -160,31 +215,33 @@ def test_single_gemm_and_view_fusible(self, test_name, func):
                     x0_pt, test_utils.get_shape(Y._attrs["shape"], dim_to_value_dict)
                 )
             )
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([input0_pt, input1_pt], [y])
 
             # Do comparisons.
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+            self._test_id += 1
 
     @parameterized.expand(
         [
-            param(f"single_bmm_{name}_fusion", func)
+            param(f"single_bmm_{name}_fusion_{dtype}", func, dtype)
             for (
                 name,
                 func,
-            ) in _gen_multiple_fusible_view_ops_after_strided_op().items()
+                dtype,
+            ) in _gen_multiple_fusible_view_ops_after_strided_op()
         ],
         name_func=custom_name_func,
     )
-    def test_single_bmm_and_multi_view_fusible(self, test_name, func):
+    def test_single_bmm_and_multi_view_fusible(self, test_name, func, dtype):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N1 = 8
         N2 = 6
         K = 10
-        input0 = test_utils.gen_input_tensor([batch_dim, N1, K])
-        input1 = test_utils.gen_input_tensor([batch_dim, K, N2])
+        input0 = test_utils.gen_input_tensor([batch_dim, N1, K], dtype)
+        input1 = test_utils.gen_input_tensor([batch_dim, K, N2], dtype)
         X0 = ops.bmm_rrr()(input0, input1)
         Y = ops.elementwise(FuncEnum.COS)(func(X0))
         Y._attrs["name"] = "output0"
@@ -192,7 +249,8 @@ def test_single_bmm_and_multi_view_fusible(self, test_name, func):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", test_name)
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -202,8 +260,8 @@ def test_single_bmm_and_multi_view_fusible(self, test_name, func):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input0_pt = torch.randn(batch_size, N1, K).cuda().half()
-            input1_pt = torch.randn(batch_size, K, N2).cuda().half()
+            input0_pt = get_random_torch_tensor([batch_size, N1, K], dtype)
+            input1_pt = get_random_torch_tensor([batch_size, K, N2], dtype)
 
             # Run PyTorch baseline.
             x0_pt = torch.matmul(input0_pt, input1_pt)
@@ -213,24 +271,32 @@ def test_single_bmm_and_multi_view_fusible(self, test_name, func):
                     x0_pt, test_utils.get_shape(Y._attrs["shape"], dim_to_value_dict)
                 )
             )
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([input0_pt, input1_pt], [y])
 
             # Do comparisons.
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+            self._test_id += 1
 
     @parameterized.expand(
         [
-            param(f"single_{op_name}_reshape_fusion", input_tensor, torch_func)
-            for (op_name, input_tensor, torch_func) in _gen_simple_strided_ops(
+            param(
+                f"single_{op_name}_reshape_fusion_{dtype}",
+                input_tensor,
+                dtype,
+                torch_func,
+            )
+            for (op_name, input_tensor, dtype, torch_func) in _gen_simple_strided_ops(
                 IntVar([1, 128, 256], "batch_size"), n1=10, n2=8
             )
         ],
         name_func=custom_name_func,
     )
-    def test_single_op_and_view_fusible(self, test_name, input_tensor, torch_func):
+    def test_single_op_and_view_fusible(
+        self, test_name, input_tensor, dtype, torch_func
+    ):
         src_input = test_utils.get_src_input(input_tensor)
         batch_dim = src_input._attrs["shape"][0]
         n1 = src_input._attrs["shape"][1].value()
@@ -241,7 +307,8 @@ def test_single_op_and_view_fusible(self, test_name, input_tensor, torch_func):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", test_name)
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -251,30 +318,31 @@ def test_single_op_and_view_fusible(self, test_name, input_tensor, torch_func):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input_pt = torch.randn(batch_size, n1, n2).cuda().half()
+            input_pt = get_random_torch_tensor([batch_size, n1, n2], dtype)
 
             # Run PyTorch baseline.
             y_pt = torch.tanh(torch.reshape(torch_func(input_pt), [batch_size, -1]))
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([input_pt], [y])
 
             # Do comparisons.
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+            self._test_id += 1
 
     @parameterized.expand(
         [
-            param(f"single_op_{name}_non_fusion", func)
-            for (name, func) in _gen_non_fusible_view_ops_after_strided_op().items()
+            param(f"single_op_{name}_non_fusion_{dtype}", func, dtype)
+            for (name, func, dtype) in _gen_non_fusible_view_ops_after_strided_op()
         ],
         name_func=custom_name_func,
     )
-    def test_single_op_and_view_non_fusible(self, test_name, func):
+    def test_single_op_and_view_non_fusible(self, test_name, func, dtype):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N1 = 8
         N2 = 6
-        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2])
+        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2], dtype=dtype)
         X1 = ops.elementwise(FuncEnum.TANH)(X0)
         Y = ops.elementwise(FuncEnum.TANH)(func(X1))
         Y._attrs["name"] = "output"
@@ -282,7 +350,8 @@ def test_single_op_and_view_non_fusible(self, test_name, func):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y], target, "./tmp", test_name)
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -292,23 +361,24 @@ def test_single_op_and_view_non_fusible(self, test_name, func):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            x0_pt = torch.randn(batch_size, N1, N2).cuda().half()
+            x0_pt = get_random_torch_tensor([batch_size, N1, N2], dtype)
 
             # Run PyTorch baseline.
             y_pt = torch.tanh(torch.reshape(torch.tanh(x0_pt), [-1, N2]))
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(y_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([x0_pt], [y])
 
             # Do comparisons.
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+            self._test_id += 1
 
-    def test_two_serial_view_outputs(self):
+    def _test_two_serial_view_outputs(self, dtype="float16"):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N1 = 8
         N2 = 6
-        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2])
+        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2], dtype)
         X1 = ops.elementwise(FuncEnum.TANH)(X0)
         Y1 = ops.reshape()(X1, [-1, N1 * N2])
         Y2 = ops.reshape()(Y1, [-1, N1, N2])
@@ -319,7 +389,7 @@ def test_two_serial_view_outputs(self):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y1, Y2], target, "./tmp", "two_view_outputs")
+        module = compile_model([Y1, Y2], target, "./tmp", f"two_view_outputs_{dtype}")
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -329,13 +399,13 @@ def test_two_serial_view_outputs(self):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input_pt = torch.randn(batch_size, N1, N2).cuda().half()
+            input_pt = get_random_torch_tensor([batch_size, N1, N2], dtype)
 
             # Run PyTorch baseline.
             y1_pt = torch.reshape(torch.tanh(input_pt), [batch_size, N1 * N2])
             y2_pt = torch.reshape(y1_pt, [batch_size, N1, N2])
-            y1 = torch.empty(y1_pt.shape).cuda().half()
-            y2 = torch.empty(y2_pt.shape).cuda().half()
+            y1 = get_torch_empty_tensor(y1_pt.shape, dtype)
+            y2 = get_torch_empty_tensor(y2_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([input_pt], [y1, y2])
@@ -344,11 +414,11 @@ def test_two_serial_view_outputs(self):
             self.assertTrue(torch.allclose(y1, y1_pt, atol=1e-2, rtol=1e-2))
             self.assertTrue(torch.allclose(y2, y2_pt, atol=1e-2, rtol=1e-2))
 
-    def test_two_parallel_views(self):
+    def _test_two_parallel_views(self, dtype="float16"):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N1 = 8
         N2 = 6
-        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2])
+        X0 = test_utils.gen_input_tensor([batch_dim, N1, N2], dtype)
         X1 = ops.elementwise(FuncEnum.TANH)(X0)
         Y1 = ops.elementwise(FuncEnum.TANH)(ops.reshape()(X1, [-1, N1 * N2]))
         Y2 = ops.elementwise(FuncEnum.TANH)(ops.reshape()(X1, [-1, N1, N2]))
@@ -359,7 +429,9 @@ def test_two_parallel_views(self):
 
         # Gen module.
         target = detect_target()
-        module = compile_model([Y1, Y2], target, "./tmp", "two_parallel_view_outputs")
+        module = compile_model(
+            [Y1, Y2], target, "./tmp", f"two_parallel_view_outputs_{dtype}"
+        )
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -369,14 +441,14 @@ def test_two_parallel_views(self):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            input_pt = torch.randn(batch_size, N1, N2).cuda().half()
+            input_pt = get_random_torch_tensor([batch_size, N1, N2], dtype)
             x1_pt = torch.tanh(input_pt)
 
             # Run PyTorch baseline.
             y1_pt = torch.tanh(torch.reshape(x1_pt, [batch_size, N1 * N2]))
             y2_pt = torch.tanh(torch.reshape(x1_pt, [batch_size, N1, N2]))
-            y1 = torch.empty(y1_pt.shape).cuda().half()
-            y2 = torch.empty(y2_pt.shape).cuda().half()
+            y1 = get_torch_empty_tensor(y1_pt.shape, dtype)
+            y2 = get_torch_empty_tensor(y2_pt.shape, dtype)
 
             # Run AITemplate module.
             module.run_with_tensors([input_pt], [y1, y2])
@@ -385,6 +457,15 @@ def test_two_parallel_views(self):
             self.assertTrue(torch.allclose(y1, y1_pt, atol=1e-2, rtol=1e-2))
             self.assertTrue(torch.allclose(y2, y2_pt, atol=1e-2, rtol=1e-2))
 
+    def test_two_views(self):
+        self._test_two_parallel_views()
+        self._test_two_serial_view_outputs()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_two_views_float(self):
+        self._test_two_parallel_views(dtype="float")
+        self._test_two_serial_view_outputs(dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_transform_memory_ops.py b/tests/unittest/compiler/test_transform_memory_ops.py
index 4e3eebf54..59132feb4 100644
--- a/tests/unittest/compiler/test_transform_memory_ops.py
+++ b/tests/unittest/compiler/test_transform_memory_ops.py
@@ -19,8 +19,14 @@
 from aitemplate.compiler import compile_model, ops, transform
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
+from parameterized import parameterized
+
 
 class MemoryOpTransformationTestCase(unittest.TestCase):
     BATCH_SIZE = 1024
@@ -28,8 +34,7 @@ class MemoryOpTransformationTestCase(unittest.TestCase):
     N = 128
     USE_DYNAMIC_BATCH = False
 
-    def _prepare_cat_elimination_graph(self):
-        dtype = "float16"
+    def _prepare_cat_elimination_graph(self, dtype="float16"):
         X0 = Tensor(
             shape=[
                 IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
@@ -58,20 +63,22 @@ def test_cat_elimination_graph_transformation(self):
         graph = transform.transform_memory_ops(graph)
         self.assertEqual(len(graph), 2)
 
-    def test_cat_elimination_e2e(self):
-        OUTPUT = self._prepare_cat_elimination_graph()
+    @parameterized.expand([("float16"), ("float")])
+    def test_cat_elimination_e2e(self, dtype):
         target = detect_target()
-        module = compile_model(OUTPUT, target, "./tmp", "cat_elimination")
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_cat_elimination_graph(dtype)
+        module = compile_model(OUTPUT, target, "./tmp", f"cat_elimination_{dtype}")
 
-        x0_pt = torch.randn([self.BATCH_SIZE, self.M, self.N]).cuda().half()
+        x0_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
         out_pt = torch.cat([x0_pt, x0_pt], dim=1)
 
-        out = torch.empty(out_pt.size()).cuda().half()
+        out = get_torch_empty_tensor(out_pt.size(), dtype)
         module.run_with_tensors([x0_pt], [out])
         self.assertTrue(torch.allclose(out_pt, out, atol=1e-1, rtol=1e-2))
 
-    def _prepare_split_cat_elimination_graph(self):
-        dtype = "float16"
+    def _prepare_split_cat_elimination_graph(self, dtype="float16"):
         X0 = Tensor(
             shape=[
                 IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
@@ -131,20 +138,25 @@ def test_split_cat_elimination_graph_transformation(self):
         graph = transform.transform_memory_ops(graph)
         self.assertEqual(len(graph), 7)
 
-    def test_split_cat_elimination_e2e(self):
-        OUTPUT = self._prepare_split_cat_elimination_graph()
+    @parameterized.expand([("float16"), ("float")])
+    def test_split_cat_elimination_e2e(self, dtype):
         target = detect_target()
-        module = compile_model(OUTPUT, target, "./tmp", "split_cat_elimination")
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_split_cat_elimination_graph(dtype)
+        module = compile_model(
+            OUTPUT, target, "./tmp", f"split_cat_elimination_{dtype}"
+        )
 
-        x0_pt = torch.randn([self.BATCH_SIZE, self.M, self.N]).cuda().half()
+        x0_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
         x4_pt, x5_pt = torch.split(x0_pt, int(self.N / 2), dim=2)
         out_pt0 = torch.cat([x4_pt, x5_pt], dim=1)
-        y0_pt = torch.randn([self.BATCH_SIZE, self.M, self.N]).cuda().half()
-        y1_pt = torch.randn([self.BATCH_SIZE, self.M, self.N]).cuda().half()
+        y0_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
+        y1_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
         out_pt1 = torch.cat([y1_pt, y0_pt, y0_pt], dim=1)
 
-        out0 = torch.empty(out_pt0.size()).cuda().half()
-        out1 = torch.empty(out_pt1.size()).cuda().half()
+        out0 = get_torch_empty_tensor(out_pt0.size(), dtype)
+        out1 = get_torch_empty_tensor(out_pt1.size(), dtype)
         module.run_with_tensors(
             {"input0": x0_pt, "input1": y0_pt, "input2": y1_pt},
             {"output0": out0, "output1": out1},
@@ -152,8 +164,7 @@ def test_split_cat_elimination_e2e(self):
         self.assertTrue(torch.allclose(out_pt0, out0, atol=1e-1, rtol=1e-2))
         self.assertTrue(torch.allclose(out_pt1, out1, atol=1e-1, rtol=1e-2))
 
-    def _prepare_cat_cat_elimination_graph(self):
-        dtype = "float16"
+    def _prepare_cat_cat_elimination_graph(self, dtype="float16"):
         X0 = Tensor(
             shape=[
                 IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
@@ -223,19 +234,26 @@ def test_cat_cat_elimination_graph_transformation(self):
         self.assertEqual(len(graph), 6)
         self.assertEqual(len(graph_utils.get_sorted_ops(graph)), 2)
 
-    def test_cat_cat_elimination_e2e(self):
-        OUTPUT = self._prepare_cat_cat_elimination_graph()
+    @parameterized.expand([("float16"), ("float")])
+    def test_cat_cat_elimination_e2e(self, dtype):
         target = detect_target()
-        module = compile_model(OUTPUT, target, "./tmp", "cat_cat_elimination")
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_cat_cat_elimination_graph(dtype)
+        module = compile_model(OUTPUT, target, "./tmp", f"cat_cat_elimination_{dtype}")
 
-        x0_pt = torch.randn([self.BATCH_SIZE, int(self.M / 2), self.N]).cuda().half()
-        x1_pt = torch.randn([self.BATCH_SIZE, int(self.M / 2), self.N]).cuda().half()
-        x2_pt = torch.randn([self.BATCH_SIZE, self.M, self.N + 4]).cuda().half()
-        x3_pt = torch.randn([self.BATCH_SIZE, self.M, self.N * 2]).cuda().half()
+        x0_pt = get_random_torch_tensor(
+            [self.BATCH_SIZE, int(self.M / 2), self.N], dtype
+        )
+        x1_pt = get_random_torch_tensor(
+            [self.BATCH_SIZE, int(self.M / 2), self.N], dtype
+        )
+        x2_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N + 4], dtype)
+        x3_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N * 2], dtype)
         x5_pt = torch.cat([x0_pt, x1_pt], dim=1)
         out_pt0 = torch.cat([x3_pt, x5_pt, x2_pt, x2_pt], dim=2)
 
-        out0 = torch.empty(out_pt0.size()).cuda().half()
+        out0 = get_torch_empty_tensor(out_pt0.size(), dtype)
         module.run_with_tensors(
             {"input0": x0_pt, "input1": x1_pt, "input2": x2_pt, "input3": x3_pt},
             [out0],
diff --git a/tests/unittest/compiler/test_transform_special_op.py b/tests/unittest/compiler/test_transform_special_op.py
index 9e5efcaba..aed1a89b8 100644
--- a/tests/unittest/compiler/test_transform_special_op.py
+++ b/tests/unittest/compiler/test_transform_special_op.py
@@ -22,14 +22,20 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 from aitemplate.utils.graph_utils import get_sorted_ops
 
+from parameterized import parameterized
+
 
 class GemmRrrSmallNkTestCase(unittest.TestCase):
-    def _create_gemm_rrr_graph(self, M, K, N):
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[K, N], dtype="float16", name="input_1", is_input=True)
+    def _create_gemm_rrr_graph(self, M, K, N, dtype):
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "gemm_rrr_tensor"
@@ -37,15 +43,15 @@ def _create_gemm_rrr_graph(self, M, K, N):
 
         return X, W, Y
 
-    def _test_small_nk(self, Ms, N, K, testname=None):
+    def _test_small_nk(self, Ms, N, K, testname=None, dtype="float16"):
         if testname is None:
-            testname = "gemm_rrr_small_nk_{}_{}_{}".format(Ms, N, K)
+            testname = f"gemm_rrr_small_nk_{Ms}_{N}_{K}_{dtype}"
             testname = testname.replace(" ", "")
             testname = testname.replace("[", "")
             testname = testname.replace("]", "")
 
         X, W, gemm_tensor = self._create_gemm_rrr_graph(
-            shape_utils.gen_int_var_min_max(Ms), K, N
+            shape_utils.gen_int_var_min_max(Ms), K, N, dtype
         )
 
         output = ops.elementwise(FuncEnum.COS)(gemm_tensor)
@@ -77,12 +83,12 @@ def _test_small_nk(self, Ms, N, K, testname=None):
         )
 
         for m in Ms:
-            X_pt = torch.randn(m, K).cuda().half()
-            W_pt = torch.randn(K, N).cuda().half()
+            X_pt = get_random_torch_tensor([m, K], dtype)
+            W_pt = get_random_torch_tensor([K, N], dtype)
             mm_pt = torch.matmul(X_pt, W_pt)
             Y_pt = torch.cos(mm_pt)
-            y = torch.empty([m, N]).cuda().half()
-            gemm_tensor_pt = torch.empty([m, N]).cuda().half()
+            y = get_torch_empty_tensor([m, N], dtype)
+            gemm_tensor_pt = get_torch_empty_tensor([m, N], dtype)
             module.run_with_tensors(
                 {"input_0": X_pt, "input_1": W_pt},
                 {"output_0": y, "gemm_rrr_tensor": gemm_tensor_pt},
@@ -104,13 +110,27 @@ def test_small_nk_alignment(self):
         self._test_small_nk([100, 200], 6, 3)
         self._test_small_nk([105], 7, 1)
 
-    def test_small_nk_no_transform(self):
-        M, K, N = 8, 8, 16
-        _, _, output = self._create_gemm_rrr_graph(M, K, N)
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_small_nk_fp32(self):
+        self._test_small_nk([10], 8, 4, "test_small_nk_fp32", dtype="float32")
+        self._test_small_nk(
+            [10, 30, 50], 6, 4, "test_small_kn_dynamic1_fp32", dtype="float32"
+        )
+        self._test_small_nk(
+            [100, 200], 6, 3, "test_small_nk_alignment_fp32", dtype="float32"
+        )
 
+    @parameterized.expand([("float16"), ("float32")])
+    def test_small_nk_no_transform(self, dtype):
         target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
+        M, K, N = 8, 8, 16
+        _, _, output = self._create_gemm_rrr_graph(M, K, N, dtype)
+
         module = compile_model(
-            output, target, "./tmp", "test_small_nk_fail_{}_{}_{}".format(M, K, N)
+            output, target, "./tmp", f"test_small_nk_fail_{M}_{K}_{N}_{dtype}"
         )
 
         for tensor in module.debug_sorted_graph:
@@ -130,36 +150,38 @@ def test_small_nk_no_transform(self):
         src_op = list(output_tensor._attrs["src_ops"])[0]
         self.assertEqual(src_op._attrs["op"], "gemm_rrr", "output op type incorrect")
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(K, N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([K, N], dtype)
         Y_pt = torch.matmul(X_pt, W_pt)
 
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
 
 class BmmRcrN1TestCase(unittest.TestCase):
-    def _create_bmm_rcr_graph(self, B, M, N, K):
-        X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
+    def _create_bmm_rcr_graph(self, B, M, N, K, dtype):
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "bmm_rcr_tensor"
 
         return X, W, Y
 
-    def _test_n1_k8(self, B, M, N, K, testname=None):
+    def _test_n1_k8(self, B, M, N, K, testname=None, dtype="float16"):
         if testname is None:
-            testname = "bmm_rcr_n1_{}_{}_{}_{}".format(B, M, N, K)
+            testname = f"bmm_rcr_n1_{B}_{M}_{N}_{K}_{dtype}"
             testname = testname.replace(" ", "")
             testname = testname.replace("[", "")
             testname = testname.replace("]", "")
 
         X, W, bmm_tensor = self._create_bmm_rcr_graph(
-            B, shape_utils.gen_int_var_min_max(M), N, K
+            B, shape_utils.gen_int_var_min_max(M), N, K, dtype
+        )
+        mul = ops.elementwise(FuncEnum.MUL)(
+            bmm_tensor, Tensor(shape=[], dtype=dtype, value=1.0)
         )
-        mul = ops.elementwise(FuncEnum.MUL)(bmm_tensor, Tensor(shape=[], value=1.0))
         output = ops.elementwise(FuncEnum.COS)(mul)
         output._attrs["name"] = "output_0"
         output._attrs["is_output"] = True
@@ -180,8 +202,8 @@ def _test_n1_k8(self, B, M, N, K, testname=None):
         assert src_op._attrs["op"] == "bmm_rcr_n1"
 
         for m in M:
-            X_pt = torch.randn(B, m, K).cuda().half()
-            W_pt = torch.randn(B, N, K).cuda().half()
+            X_pt = get_random_torch_tensor([B, m, K], dtype)
+            W_pt = get_random_torch_tensor([B, N, K], dtype)
 
             def pt_bmm(X_pt, W_pt):
                 WT = torch.transpose(W_pt, 2, 1)
@@ -190,7 +212,7 @@ def pt_bmm(X_pt, W_pt):
 
             Y_pt = torch.cos(pt_bmm(X_pt, W_pt))
 
-            y = torch.empty([B, m, N]).cuda().half()
+            y = get_torch_empty_tensor([B, m, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -201,13 +223,22 @@ def test_n1_k8(self):
     def test_n1_k8_dynamic(self):
         self._test_n1_k8(10, [8, 16], 1, 8)
 
-    def test_n_non1_fail(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_n1_k8_fp32(self):
+        self._test_n1_k8(10, [8], 1, 8, dtype="float32")
+        self._test_n1_k8(10, [8, 16], 1, 8, dtype="float32")
+
+    @parameterized.expand([("float16"), ("float32")])
+    def test_n_non1_fail(self, dtype):
+        target = detect_target()
+        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
+            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
+
         B, M, K, N = 8, 8, 8, 8
-        _, _, output = self._create_bmm_rcr_graph(B, M, K, N)
+        _, _, output = self._create_bmm_rcr_graph(B, M, K, N, dtype)
         output._attrs["is_output"] = True
 
-        target = detect_target()
-        module = compile_model(output, target, "./tmp", "bmm_rcr_n_non1")
+        module = compile_model(output, target, "./tmp", f"bmm_rcr_n_non1_{dtype}")
 
         output_tensor = None
         for tensor in module.debug_sorted_graph:
@@ -238,18 +269,20 @@ def _assert_has_gemm(self, sorted_graph: List[Tensor]):
         raise AssertionError("Did not find gemm_rcr in graph")
 
     def _test_simple_1x1_conv(
-        self, batch, CO, HH, WW, CI, activation=None, with_bias=False
+        self, batch, CO, HH, WW, CI, activation=None, with_bias=False, dtype="float16"
     ):
         if isinstance(batch, int):
             batch = (batch,)
         batch_var = shape_utils.gen_int_var_min_max(batch, name="batch_size")
         X = Tensor(
             shape=[batch_var, HH, WW, CI],
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
             shape=[CO, 1, 1, CI],
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -257,6 +290,7 @@ def _test_simple_1x1_conv(
         if with_bias:
             bias = Tensor(
                 shape=[CO],
+                dtype=dtype,
                 name="bias",
                 is_input=True,
             )
@@ -293,11 +327,11 @@ def _test_simple_1x1_conv(
             self._assert_has_gemm(module.debug_sorted_graph)
 
             for batch_pt in batch:
-                X_pt = torch.randn(batch_pt, CI, HH, WW).half().cuda()
-                W_pt = torch.randn(CO, CI, 1, 1).half().cuda()
+                X_pt = get_random_torch_tensor([batch_pt, CI, HH, WW], dtype)
+                W_pt = get_random_torch_tensor([CO, CI, 1, 1], dtype)
 
                 if with_bias:
-                    B_pt = torch.randn(CO).half().cuda()
+                    B_pt = get_random_torch_tensor([CO], dtype)
                 else:
                     B_pt = None
 
@@ -314,7 +348,7 @@ def _test_simple_1x1_conv(
                 elif activation is not None:
                     raise NotImplementedError(f"Unsupported activation {activation}")
 
-                Y_ait = torch.empty(batch_pt, HH, WW, CO).half().cuda()
+                Y_ait = get_torch_empty_tensor(batch_pt, HH, WW, CO, dtype)
                 inputs = {
                     "input_0": X_pt.permute(0, 2, 3, 1).contiguous(),
                     "input_1": W_pt.permute(0, 2, 3, 1).contiguous(),
diff --git a/tests/unittest/compiler/test_transform_toposort.py b/tests/unittest/compiler/test_transform_toposort.py
new file mode 100644
index 000000000..918a5e40c
--- /dev/null
+++ b/tests/unittest/compiler/test_transform_toposort.py
@@ -0,0 +1,51 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.testing import detect_target
+
+
+class TestTopoSort(unittest.TestCase):
+    def test_very_deep_toposort(self):
+        x = Tensor(
+            [2, 10],
+            is_input=True,
+            name="x",
+        )
+
+        for _ in range(1000):
+            x = ops.elementwise(FuncEnum.RELU)(x)
+
+        x._attrs["is_output"] = True
+        x._attrs["name"] = "output"
+
+        target = detect_target()
+        module = compile_model(x, target, "./tmp", "test_very_deep_toposort")
+
+        x_pt = torch.randn((2, 10)).half().cuda()
+        out_pt = torch.relu(x_pt)
+
+        out_ait = torch.empty_like(out_pt)
+        module.run_with_tensors({"x": x_pt}, {"output": out_ait})
+
+        self.assertTrue(torch.equal(out_ait, out_pt))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_view_strided_op.py b/tests/unittest/compiler/test_view_strided_op.py
index 3589449c7..d5170a079 100644
--- a/tests/unittest/compiler/test_view_strided_op.py
+++ b/tests/unittest/compiler/test_view_strided_op.py
@@ -22,6 +22,10 @@
 from aitemplate.compiler.base import IntVar
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
 from parameterized import param, parameterized
@@ -31,30 +35,60 @@ def _gen_fusible_view_ops_before_strided_op(
     name: str, batch_dim: Optional[IntVar], n1: int, n2: int
 ) -> List[Tensor]:
     assert n2 % 2 == 0, f"n2 must be even! n2: {n2}"
+    target = detect_target()
+    support_float = target.name() == "cuda" and int(target._arch) >= 80
     if batch_dim is not None:
-        return [
+        test_ops = [
             ops.reshape()(
-                test_utils.gen_input_tensor([batch_dim, n1 * n2], name),
+                test_utils.gen_input_tensor(
+                    [batch_dim, n1 * n2], name=name, dtype="float16"
+                ),
                 [-1, n1, n2],
             ),
             ops.flatten(start_dim=2, end_dim=-1)(
-                test_utils.gen_input_tensor([batch_dim, n1, int(n2 / 2), 2], name)
+                test_utils.gen_input_tensor(
+                    [batch_dim, n1, int(n2 / 2), 2], name=name, dtype="float16"
+                )
             ),
             ops.squeeze(dim=1)(
-                test_utils.gen_input_tensor([batch_dim, 1, n1, n2], name)
+                test_utils.gen_input_tensor(
+                    [batch_dim, 1, n1, n2], name=name, dtype="float16"
+                )
             ),
         ]
+        if support_float:
+            test_ops.append(
+                ops.reshape()(
+                    test_utils.gen_input_tensor(
+                        [batch_dim, n1 * n2], name=name, dtype="float"
+                    ),
+                    [-1, n1, n2],
+                )
+            )
     else:
-        return [
+        test_ops = [
             ops.reshape()(
-                test_utils.gen_input_tensor([n1 * n2], name),
+                test_utils.gen_input_tensor([n1 * n2], name=name, dtype="float16"),
                 [n1, n2],
             ),
             ops.flatten(start_dim=1, end_dim=-1)(
-                test_utils.gen_input_tensor([n1, int(n2 / 2), 2], name)
+                test_utils.gen_input_tensor(
+                    [n1, int(n2 / 2), 2], name=name, dtype="float16"
+                )
+            ),
+            ops.squeeze(dim=0)(
+                test_utils.gen_input_tensor([1, n1, n2], name=name, dtype="float16")
             ),
-            ops.squeeze(dim=0)(test_utils.gen_input_tensor([1, n1, n2], name)),
         ]
+        if support_float:
+            test_ops.append(
+                ops.flatten(start_dim=1, end_dim=-1)(
+                    test_utils.gen_input_tensor(
+                        [n1, int(n2 / 2), 2], name=name, dtype="float"
+                    )
+                ),
+            )
+    return test_ops
 
 
 def _gen_non_fusible_view_ops_before_strided_op(
@@ -64,34 +98,65 @@ def _gen_non_fusible_view_ops_before_strided_op(
         name=batch_dim._attrs["name"],
         values=[int(value / 2) for value in batch_dim._attrs["values"]],
     )
-    return [
+    test_ops = [
         ops.reshape()(
-            test_utils.gen_input_tensor([new_batch_dim, n1, n2 * 2], name),
+            test_utils.gen_input_tensor(
+                [new_batch_dim, n1, n2 * 2], name=name, dtype="float16"
+            ),
             [-1, n1, n2],
         ),
         ops.flatten(start_dim=0, end_dim=1)(
-            test_utils.gen_input_tensor([new_batch_dim, 2, n1, n2], name)
+            test_utils.gen_input_tensor(
+                [new_batch_dim, 2, n1, n2], name=name, dtype="float16"
+            )
         ),
     ]
+    target = detect_target()
+    if target.name() == "cuda" and int(target._arch) >= 80:
+        test_ops.append(
+            ops.reshape()(
+                test_utils.gen_input_tensor(
+                    [new_batch_dim, n1, n2 * 2], name=name, dtype="float"
+                ),
+                [-1, n1, n2],
+            )
+        )
+    return test_ops
 
 
 def _gen_multiple_fusible_view_ops_before_strided_op(
     name: str, batch_dim: IntVar, n1: int, n2: int
 ) -> List[Tensor]:
-    return [
+    test_ops = [
         ops.reshape()(
             ops.reshape()(
-                test_utils.gen_input_tensor([batch_dim, n1, n2], name),
+                test_utils.gen_input_tensor(
+                    [batch_dim, n1, n2], name=name, dtype="float16"
+                ),
                 [-1, n1 * n2],
             ),
             [-1, n1, n2],
         ),
         ops.squeeze(dim=1)(
             ops.unsqueeze(dim=1)(
-                test_utils.gen_input_tensor([batch_dim, n1, n2], name)
+                test_utils.gen_input_tensor(
+                    [batch_dim, n1, n2], name=name, dtype="float16"
+                )
             ),
         ),
     ]
+    target = detect_target()
+    if target.name() == "cuda" and int(target._arch) >= 80:
+        test_ops.append(
+            ops.squeeze(dim=1)(
+                ops.unsqueeze(dim=1)(
+                    test_utils.gen_input_tensor(
+                        [batch_dim, n1, n2], name=name, dtype="float16"
+                    )
+                )
+            )
+        )
+    return test_ops
 
 
 def custom_name_func(testcase_func, param_num, param):
@@ -104,6 +169,7 @@ def _gen_view_bmm_module(
         input0: Tensor,
         input1: Tensor,
         test_name: str,
+        dtype: str,
         expected_num_tensors: int,
         expected_num_ops: int,
         num_bmms: int = 1,
@@ -117,7 +183,7 @@ def _gen_view_bmm_module(
 
         # Gen module.
         target = detect_target()
-        module = compile_model(Ys, target, "./tmp", test_name)
+        module = compile_model(Ys, target, "./tmp", f"{test_name}_{dtype}")
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -157,6 +223,7 @@ def _test_view_and_bmm(
                 f"{test_utils.get_src_op_name(tensor0)}_bmm_fusion",
                 tensor0,
                 tensor1,
+                tensor0._attrs["dtype"],
             )
             for tensor0, tensor1 in zip(
                 _gen_fusible_view_ops_before_strided_op(
@@ -173,30 +240,26 @@ def _test_view_and_bmm(
         name_func=custom_name_func,
     )
     def test_single_view_and_bmm_fusible(
-        self, test_name: str, input0: Tensor, input1: Tensor
+        self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
         orig_a_shape = test_utils.get_src_input(input0)._attrs["shape"]
         orig_b_shape = test_utils.get_src_input(input1)._attrs["shape"]
 
         # Gen module.
         module = self._gen_view_bmm_module(
-            input0, input1, test_name, expected_num_tensors=3, expected_num_ops=1
+            input0, input1, test_name, dtype, expected_num_tensors=3, expected_num_ops=1
         )
 
         # Prepae PyTorch tensors.
         a_shape = input0._attrs["shape"]
         b_shape = input1._attrs["shape"]
         for batch_size in a_shape[0]._attrs["values"]:
-            x0_pt = (
-                torch.randn(batch_size, a_shape[1].value(), a_shape[2].value())
-                .cuda()
-                .half()
+            x0_pt = get_random_torch_tensor(
+                [batch_size, a_shape[1].value(), a_shape[2].value()], dtype
             )
-            x1_pt = torch.randn([dim.value() for dim in b_shape]).cuda().half()
-            y = (
-                torch.empty([batch_size, a_shape[1].value(), b_shape[1].value()])
-                .cuda()
-                .half()
+            x1_pt = get_random_torch_tensor([dim.value() for dim in b_shape], dtype)
+            y = get_torch_empty_tensor(
+                [batch_size, a_shape[1].value(), b_shape[1].value()], dtype
             )
             dim_to_value_dict = {"batch_size": batch_size}
             self._test_view_and_bmm(
@@ -215,6 +278,7 @@ def test_single_view_and_bmm_fusible(
                 f"{test_utils.get_src_op_name(tensor0)}_multi_bmm_fusion",
                 tensor0,
                 tensor1,
+                tensor0._attrs["dtype"],
             )
             for tensor0, tensor1 in zip(
                 _gen_fusible_view_ops_before_strided_op(
@@ -231,7 +295,7 @@ def test_single_view_and_bmm_fusible(
         name_func=custom_name_func,
     )
     def test_single_view_and_multi_bmm_fusible(
-        self, test_name: str, input0: Tensor, input1: Tensor
+        self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
         orig_a_shape = test_utils.get_src_input(input0)._attrs["shape"]
         orig_b_shape = test_utils.get_src_input(input1)._attrs["shape"]
@@ -241,6 +305,7 @@ def test_single_view_and_multi_bmm_fusible(
             input0,
             input1,
             test_name,
+            dtype,
             expected_num_tensors=4,
             expected_num_ops=2,
             num_bmms=2,
@@ -250,16 +315,12 @@ def test_single_view_and_multi_bmm_fusible(
         a_shape = input0._attrs["shape"]
         b_shape = input1._attrs["shape"]
         for batch_size in a_shape[0]._attrs["values"]:
-            x0_pt = (
-                torch.randn(batch_size, a_shape[1].value(), a_shape[2].value())
-                .cuda()
-                .half()
+            x0_pt = get_random_torch_tensor(
+                [batch_size, a_shape[1].value(), a_shape[2].value()], dtype
             )
-            x1_pt = torch.randn([dim.value() for dim in b_shape]).cuda().half()
-            y0 = (
-                torch.empty([batch_size, a_shape[1].value(), b_shape[1].value()])
-                .cuda()
-                .half()
+            x1_pt = get_random_torch_tensor([dim.value() for dim in b_shape], dtype)
+            y0 = get_torch_empty_tensor(
+                [batch_size, a_shape[1].value(), b_shape[1].value()], dtype
             )
             y1 = y0.clone()
             dim_to_value_dict = {"batch_size": batch_size}
@@ -272,13 +333,15 @@ def test_single_view_and_multi_bmm_fusible(
                 test_utils.get_shape(orig_b_shape, dim_to_value_dict),
             )
 
-    def test_multi_view_and_multi_bmm_fusible(self):
+    def _test_multi_view_and_multi_bmm_fusible(self, dtype="float16"):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N0 = 13
         N1 = 46
         N2 = 5
-        X0 = test_utils.gen_input_tensor([batch_dim, N0 * N1], "input0")
-        X1 = test_utils.gen_input_tensor([1, N2 * N1], "input1")
+        X0 = test_utils.gen_input_tensor(
+            [batch_dim, N0 * N1], name="input0", dtype=dtype
+        )
+        X1 = test_utils.gen_input_tensor([1, N2 * N1], name="input1", dtype=dtype)
         X2 = ops.reshape()(X0, [-1, N0, N1])
         X3 = ops.reshape()(X0, [-1, N0, N1])
         X4 = ops.reshape()(X1, [-1, N2, N1])
@@ -297,7 +360,9 @@ def test_multi_view_and_multi_bmm_fusible(self):
 
         # Gen module.
         target = detect_target()
-        module = compile_model(Ys, target, "./tmp", "multi_view_multi_bmm_fusion")
+        module = compile_model(
+            Ys, target, "./tmp", f"multi_view_multi_bmm_fusion_{dtype}"
+        )
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -309,16 +374,12 @@ def test_multi_view_and_multi_bmm_fusible(self):
         a_shape = X2._attrs["shape"]
         b_shape = X4._attrs["shape"]
         for batch_size in a_shape[0]._attrs["values"]:
-            x0_pt = (
-                torch.randn(batch_size, a_shape[1].value(), a_shape[2].value())
-                .cuda()
-                .half()
+            x0_pt = get_random_torch_tensor(
+                [batch_size, a_shape[1].value(), a_shape[2].value()], dtype
             )
-            x1_pt = torch.randn([dim.value() for dim in b_shape]).cuda().half()
-            y0 = (
-                torch.empty([batch_size, a_shape[1].value(), b_shape[1].value()])
-                .cuda()
-                .half()
+            x1_pt = get_random_torch_tensor([dim.value() for dim in b_shape], dtype)
+            y0 = get_torch_empty_tensor(
+                [batch_size, a_shape[1].value(), b_shape[1].value()], dtype
             )
             y1 = y0.clone()
             dim_to_value_dict = {"batch_size": batch_size}
@@ -331,6 +392,17 @@ def test_multi_view_and_multi_bmm_fusible(self):
                 test_utils.get_shape(orig_b_shape, dim_to_value_dict),
             )
 
+    def test_multi_view_and_multi_bmm_fusible(self):
+        self._test_multi_view_and_multi_bmm_fusible()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_multi_view_and_multi_bmm_fusible_float(self):
+        self._test_multi_view_and_multi_bmm_fusible(dtype="float")
+
     @parameterized.expand(
         [
             param(
@@ -338,6 +410,7 @@ def test_multi_view_and_multi_bmm_fusible(self):
                 f"{test_utils.get_src_op_name(tensor0)}_bmm_fusion",
                 tensor0,
                 tensor1,
+                tensor0._attrs["dtype"],
             )
             for tensor0, tensor1 in zip(
                 _gen_multiple_fusible_view_ops_before_strided_op(
@@ -354,7 +427,7 @@ def test_multi_view_and_multi_bmm_fusible(self):
         name_func=custom_name_func,
     )
     def test_multiple_view_and_bmm_fusible(
-        self, test_name: str, input0: Tensor, input1: Tensor
+        self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
         orig_a_shape = test_utils.get_src_input(
             test_utils.get_src_input(input0)
@@ -365,23 +438,19 @@ def test_multiple_view_and_bmm_fusible(
 
         # Gen module.
         module = self._gen_view_bmm_module(
-            input0, input1, test_name, expected_num_tensors=3, expected_num_ops=1
+            input0, input1, test_name, dtype, expected_num_tensors=3, expected_num_ops=1
         )
 
         # Prepae PyTorch tensors.
         a_shape = input0._attrs["shape"]
         b_shape = input1._attrs["shape"]
         for batch_size in a_shape[0]._attrs["values"]:
-            x0_pt = (
-                torch.randn(batch_size, a_shape[1].value(), a_shape[2].value())
-                .cuda()
-                .half()
+            x0_pt = get_random_torch_tensor(
+                [batch_size, a_shape[1].value(), a_shape[2].value()], dtype
             )
-            x1_pt = torch.randn([dim.value() for dim in b_shape]).cuda().half()
-            y = (
-                torch.empty([batch_size, a_shape[1].value(), b_shape[1].value()])
-                .cuda()
-                .half()
+            x1_pt = get_random_torch_tensor([dim.value() for dim in b_shape], dtype)
+            y = get_torch_empty_tensor(
+                [batch_size, a_shape[1].value(), b_shape[1].value()], dtype
             )
             dim_to_value_dict = {"batch_size": batch_size}
             self._test_view_and_bmm(
@@ -396,9 +465,11 @@ def test_multiple_view_and_bmm_fusible(
     @parameterized.expand(
         [
             param(
-                f"non_fusible_{test_utils.get_src_op_name(tensor0)}_{test_utils.get_src_op_name(tensor0)}_bmm_fusion",
+                f"non_fusible_{test_utils.get_src_op_name(tensor0)}_"
+                f"{test_utils.get_src_op_name(tensor0)}_bmm_fusion",
                 tensor0,
                 tensor1,
+                tensor0._attrs["dtype"],
             )
             for tensor0, tensor1 in zip(
                 _gen_non_fusible_view_ops_before_strided_op(
@@ -418,34 +489,28 @@ def test_multiple_view_and_bmm_fusible(
         name_func=custom_name_func,
     )
     def test_non_fusible_view_and_bmm(
-        self, test_name: str, input0: Tensor, input1: Tensor
+        self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
         orig_a_shape = test_utils.get_src_input(input0)._attrs["shape"]
         orig_b_shape = test_utils.get_src_input(input1)._attrs["shape"]
 
         # Gen module.
         module = self._gen_view_bmm_module(
-            input0, input1, test_name, expected_num_tensors=5, expected_num_ops=3
+            input0, input1, test_name, dtype, expected_num_tensors=5, expected_num_ops=3
         )
 
         # Prepae PyTorch tensors.
         a_shape = input0._attrs["shape"]
         b_shape = input1._attrs["shape"]
         for batch_size in a_shape[0]._attrs["values"]:
-            x0_pt = (
-                torch.randn(batch_size, a_shape[1].value(), a_shape[2].value())
-                .cuda()
-                .half()
+            x0_pt = get_random_torch_tensor(
+                [batch_size, a_shape[1].value(), a_shape[2].value()], dtype
             )
-            x1_pt = (
-                torch.randn(batch_size, b_shape[1].value(), b_shape[2].value())
-                .cuda()
-                .half()
+            x1_pt = get_random_torch_tensor(
+                [batch_size, b_shape[1].value(), b_shape[2].value()], dtype
             )
-            y = (
-                torch.empty([batch_size, a_shape[1].value(), b_shape[1].value()])
-                .cuda()
-                .half()
+            y = get_torch_empty_tensor(
+                [batch_size, a_shape[1].value(), b_shape[1].value()], dtype
             )
             dim_to_value_dict = {"batch_size": int(batch_size / 2)}
             self._test_view_and_bmm(
@@ -457,14 +522,16 @@ def test_non_fusible_view_and_bmm(
                 test_utils.get_shape(orig_b_shape, dim_to_value_dict),
             )
 
-    def test_single_view_and_gemm_fusible(self):
+    def _test_single_view_and_gemm_fusible(self, dtype="float16"):
         batch_dim = IntVar([1, 128, 256], "batch_size")
         N0 = 13
         N1 = 46
         N2 = 6
-        X0 = test_utils.gen_input_tensor([batch_dim, N0 * N1], "input0")
-        X1 = test_utils.gen_input_tensor([1, N2 * N1], "input1")
-        X2 = test_utils.gen_input_tensor([N2], "input2")
+        X0 = test_utils.gen_input_tensor(
+            [batch_dim, N0 * N1], name="input0", dtype=dtype
+        )
+        X1 = test_utils.gen_input_tensor([1, N2 * N1], name="input1", dtype=dtype)
+        X2 = test_utils.gen_input_tensor([N2], name="input2", dtype=dtype)
         X3 = ops.reshape()(X0, [-1, N0, N1])
         X4 = ops.reshape()(X1, [N2, N1])
         X5 = ops.reshape()(X1, [N1, N2])
@@ -480,7 +547,7 @@ def test_single_view_and_gemm_fusible(self):
 
         # Gen module.
         target = detect_target()
-        module = compile_model(Ys, target, "./tmp", "single_view_gemm_fusion")
+        module = compile_model(Ys, target, "./tmp", f"single_view_gemm_fusion_{dtype}")
 
         # Verify the generated graph.
         sorted_graph = module.debug_sorted_graph
@@ -490,9 +557,9 @@ def test_single_view_and_gemm_fusible(self):
 
         # Prepae PyTorch tensors.
         for batch_size in batch_dim._attrs["values"]:
-            x0_pt = torch.randn(batch_size, N0 * N1).cuda().half()
-            x1_pt = torch.randn(1, N2 * N1).cuda().half()
-            x2_pt = torch.randn(N2).cuda().half()
+            x0_pt = get_random_torch_tensor([batch_size, N0 * N1], dtype)
+            x1_pt = get_random_torch_tensor([1, N2 * N1], dtype)
+            x2_pt = get_random_torch_tensor([N2], dtype)
             x3_pt = torch.reshape(x0_pt, [-1, N0, N1])
             x4_pt = torch.reshape(x1_pt, [N2, N1])
             x5_pt = torch.reshape(x1_pt, [N1, N2])
@@ -501,9 +568,9 @@ def test_single_view_and_gemm_fusible(self):
             y2_pt = torch.nn.functional.linear(x3_pt, x5_pt.transpose(0, 1))
             y_pts = [y0_pt, y1_pt, y2_pt]
             ys = [
-                torch.empty(batch_size, N0, N2).cuda().half(),
-                torch.empty(batch_size, N0, N2).cuda().half(),
-                torch.empty(batch_size, N0, N2).cuda().half(),
+                get_torch_empty_tensor([batch_size, N0, N2], dtype),
+                get_torch_empty_tensor([batch_size, N0, N2], dtype),
+                get_torch_empty_tensor([batch_size, N0, N2], dtype),
             ]
 
             # Run AITemplate module.
@@ -514,6 +581,17 @@ def test_single_view_and_gemm_fusible(self):
             for y, y_pt in zip(ys, y_pts):
                 self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
+    def test_single_view_and_gemm_fusible(self):
+        self._test_single_view_and_gemm_fusible()
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_single_view_and_gemm_fusible_float(self):
+        self._test_single_view_and_gemm_fusible(dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index 085304905..a9528c1c4 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -24,157 +24,243 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import torch_dtype_to_string
+
+
+TORCH_EQUIVALENTS = {
+    FuncEnum.TANH: torch.tanh,
+    FuncEnum.COS: torch.cos,
+    FuncEnum.SIN: torch.sin,
+    FuncEnum.SIGN: torch.sign,
+    FuncEnum.ABS: torch.abs,
+    FuncEnum.LOGE: torch.log,
+    FuncEnum.EXP: torch.exp,
+    FuncEnum.SQRT: torch.sqrt,
+    FuncEnum.SIGMOID: torch.sigmoid,
+    FuncEnum.RELU: torch.relu,
+}
+
+TORCH_FP_DTYPES = [torch.float16]
+if detect_target().name() != "rocm":
+    TORCH_FP_DTYPES.append(torch.float32)
+    if int(detect_target()._arch) >= 80:
+        TORCH_FP_DTYPES.append(torch.bfloat16)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FusedElementwiseTestCase(unittest.TestCase):
     def _test_leaky_relu(
-        self, input_size, negative_slope=0.01, test_name="leaky_relu", copy_op=False
+        self,
+        input_size,
+        negative_slope=0.01,
+        test_name="leaky_relu",
+        copy_op=False,
     ):
-        assert len(input_size) == 2
-        X1 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
-            name="input0",
-            is_input=True,
-        )
-        slope = Tensor(
-            shape=[],
-            dtype="float16",
-            name="slope",
-            value=negative_slope,
-        )
-        X2_op = ops.elementwise(FuncEnum.LRELU)
-        if copy_op:
-            X2_op = ops.elementwise(**X2_op._get_op_attributes())
-        X2 = X2_op(X1, slope)
-        X2._attrs["is_output"] = True
-        X2._attrs["name"] = "output0"
-
-        target = detect_target()
-        module = compile_model(X2, target, "./tmp", test_name)
-
-        x1_pt = torch.randn(input_size).cuda().half()
-        OP_pt = torch.nn.LeakyReLU(negative_slope)
-        x2_pt = OP_pt(x1_pt)
-
-        x2 = torch.empty_like(x2_pt)
-        module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
-
-    def _test_relu(self, input_size, test_name="relu", copy_op=False):
-        assert len(input_size) == 2
-        X1 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
-            name="input0",
-            is_input=True,
-        )
-        X2_op = ops.elementwise(FuncEnum.RELU)
-        if copy_op:
-            X2_op = ops.elementwise(**X2_op._get_op_attributes())
-        X2 = X2_op(X1)
-        X2._attrs["is_output"] = True
-        X2._attrs["name"] = "output0"
-
-        target = detect_target()
-        module = compile_model(X2, target, "./tmp", test_name)
-
-        x1_pt = torch.randn(input_size).cuda().half()
-        x2_pt = torch.relu(x1_pt)
-
-        x2 = torch.empty_like(x2_pt)
-        module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            slope = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="slope",
+                value=negative_slope,
+            )
+            X2_op = ops.elementwise(FuncEnum.LRELU)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, slope)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.LeakyReLU(negative_slope)
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     def _test_hardtanh(
-        self, input_size, min_val=-1, max_val=1, test_name="hard_tanh", copy_op=False
+        self,
+        input_size,
+        min_val=-1,
+        max_val=1,
+        test_name="hard_tanh",
+        copy_op=False,
     ):
-        assert len(input_size) == 2
-        X1 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
-            name="input0",
-            is_input=True,
-        )
-        X_min = Tensor(
-            shape=[],
-            dtype="float16",
-            name="min_val",
-            value=min_val,
-            is_input=True,
-        )
-        X_max = Tensor(
-            shape=[],
-            dtype="float16",
-            name="max_val",
-            value=max_val,
-            is_input=True,
-        )
-        X2_op = ops.elementwise(FuncEnum.HARDTANH)
-        if copy_op:
-            X2_op = ops.elementwise(**X2_op._get_op_attributes())
-        X2 = X2_op(X1, X_min, X_max)
-        X2._attrs["is_output"] = True
-        X2._attrs["name"] = "output0"
-
-        target = detect_target()
-        module = compile_model(X2, target, "./tmp", test_name)
-
-        x1_pt = torch.randn(input_size).cuda().half()
-        OP_pt = torch.nn.Hardtanh(min_val=min_val, max_val=max_val)
-        x2_pt = OP_pt(x1_pt)
-
-        x2 = torch.empty_like(x2_pt)
-        module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            X_min = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="min_val",
+                value=min_val,
+                is_input=True,
+            )
+            X_max = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="max_val",
+                value=max_val,
+                is_input=True,
+            )
+            X2_op = ops.elementwise(FuncEnum.HARDTANH)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, X_min, X_max)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.Hardtanh(min_val=min_val, max_val=max_val)
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     def _test_softplus(
-        self, input_size, beta=1.0, threshold=20.0, test_name="softplus", copy_op=False
+        self,
+        input_size,
+        beta=1.0,
+        threshold=20.0,
+        test_name="softplus",
+        copy_op=False,
     ):
-        assert len(input_size) == 2
-        X1 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype="float16",
-            name="input0",
-            is_input=True,
-        )
-        X_beta = Tensor(
-            shape=[],
-            dtype="float16",
-            name="beta",
-            value=beta,
-            is_input=True,
-        )
-        X_threshold = Tensor(
-            shape=[],
-            dtype="float16",
-            name="threshold",
-            value=threshold,
-            is_input=True,
-        )
-        X2_op = ops.elementwise(FuncEnum.SOFTPLUS)
-        if copy_op:
-            X2_op = ops.elementwise(**X2_op._get_op_attributes())
-        X2 = X2_op(X1, X_beta, X_threshold)
-        X2._attrs["is_output"] = True
-        X2._attrs["name"] = "output0"
-
-        target = detect_target()
-        module = compile_model(X2, target, "./tmp", test_name)
-
-        x1_pt = torch.randn(input_size).cuda().half()
-        OP_pt = torch.nn.Softplus(beta=beta, threshold=threshold)
-        x2_pt = OP_pt(x1_pt)
-
-        x2 = torch.empty_like(x2_pt)
-        module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            X_beta = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="beta",
+                value=beta,
+                is_input=True,
+            )
+            X_threshold = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="threshold",
+                value=threshold,
+                is_input=True,
+            )
+            X2_op = ops.elementwise(FuncEnum.SOFTPLUS)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, X_beta, X_threshold)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.Softplus(beta=beta, threshold=threshold)
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
+    def _test_simple_function(self, input_size, function, test_name, copy_op=False):
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            X2_op = ops.elementwise(function)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            x2_pt = TORCH_EQUIVALENTS[function](x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(
+                torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
+            )
+
+    def _test_elu(
+        self,
+        input_size,
+        alpha=1.0,
+        test_name="elu",
+        copy_op=False,
+    ):
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            X_alpha = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="alpha",
+                value=alpha,
+            )
+            X2_op = ops.elementwise(FuncEnum.ELU)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, X_alpha)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.ELU(alpha=alpha)
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     def test_lrelu(self):
         self._test_leaky_relu([512, 512], test_name="leaky_relu_1")
         self._test_leaky_relu(
-            [1024, 1024], negative_slope=0.5, test_name="leaky_relu_2"
+            [1024, 1024],
+            negative_slope=0.5,
+            test_name="leaky_relu_2",
         )
         self._test_leaky_relu(
             [1024, 1024],
@@ -196,10 +282,6 @@ def test_htanh(self):
             copy_op=True,
         )
 
-    def test_relu(self):
-        self._test_relu([512, 512], test_name="relu_1")
-        self._test_relu([512, 512], test_name="relu_1_copy_op", copy_op=True)
-
     def test_softplus(self):
         self._test_softplus([64, 64], test_name="softplus_1")
         self._test_softplus([128, 128], beta=1.0, threshold=1.5, test_name="softplus_2")
@@ -212,6 +294,78 @@ def test_softplus(self):
             copy_op=True,
         )
 
+    def test_cos(self):
+        self._test_simple_function([512, 512], FuncEnum.COS, test_name="cos_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.COS, test_name="cos_1_copy_op", copy_op=True
+        )
+
+    def test_sin(self):
+        self._test_simple_function([512, 512], FuncEnum.SIN, test_name="sin_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.SIN, test_name="sin_1_copy_op", copy_op=True
+        )
+
+    def test_tanh(self):
+        self._test_simple_function([512, 512], FuncEnum.TANH, test_name="tanh_1")
+        self._test_simple_function([1, 1], FuncEnum.TANH, test_name="tanh_2")
+        self._test_simple_function(
+            [512, 512], FuncEnum.TANH, test_name="tanh_1_copy_op", copy_op=True
+        )
+
+    def test_sign(self):
+        self._test_simple_function([512, 512], FuncEnum.SIGN, test_name="sign_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.SIGN, test_name="sign_1_copy_op", copy_op=True
+        )
+
+    def test_abs(self):
+        self._test_simple_function([512, 512], FuncEnum.ABS, test_name="abs_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.ABS, test_name="abs_1_copy_op", copy_op=True
+        )
+
+    def test_loge(self):
+        self._test_simple_function([512, 512], FuncEnum.LOGE, test_name="loge_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.LOGE, test_name="loge_1_copy_op", copy_op=True
+        )
+
+    def test_exp(self):
+        self._test_simple_function([512, 512], FuncEnum.EXP, test_name="exp_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.EXP, test_name="exp_1_copy_op", copy_op=True
+        )
+
+    def test_sqrt(self):
+        self._test_simple_function([512, 512], FuncEnum.SQRT, test_name="sqrt_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.SQRT, test_name="sqrt_1_copy_op", copy_op=True
+        )
+
+    def test_sigmoid(self):
+        self._test_simple_function([512, 512], FuncEnum.SIGMOID, test_name="sigmoid_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.SIGMOID, test_name="sigmoid_1_copy_op", copy_op=True
+        )
+
+    def test_relu(self):
+        self._test_simple_function([512, 512], FuncEnum.RELU, test_name="relu_1")
+        self._test_simple_function(
+            [512, 512], FuncEnum.RELU, test_name="relu_1_copy_op", copy_op=True
+        )
+
+    def test_elu(self):
+        self._test_elu([64, 64], test_name="elu_1")
+        self._test_elu([128, 128], alpha=4.0, test_name="elu_2")
+        self._test_elu([128, 256], alpha=0.4, test_name="elu_3")
+        self._test_elu(
+            [256, 128],
+            alpha=1.0,
+            test_name="elu_3_copy_op",
+            copy_op=True,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_argmax.py b/tests/unittest/ops/test_argmax.py
index 6dc91dc01..4c47a5f97 100644
--- a/tests/unittest/ops/test_argmax.py
+++ b/tests/unittest/ops/test_argmax.py
@@ -21,18 +21,24 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class argmaxTestCase(unittest.TestCase):
     def _test_argmax(
-        self, batch_size=1, shape=(2, 6), dim=0, test_name="argmax", copy_op=False
+        self,
+        batch_size=1,
+        shape=(2, 6),
+        dim=0,
+        test_name="argmax",
+        copy_op=False,
+        dtype="float16",
     ):
-
         o_shape = list(shape)[:-1]
 
         X1 = Tensor(
             shape=shape,
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -46,7 +52,7 @@ def _test_argmax(
         target = detect_target()
         module = compile_model(X4, target, "./tmp", test_name)
 
-        scores = torch.rand(shape).cuda().half()
+        scores = get_random_torch_tensor(shape, dtype=dtype)
         y_pt = torch.argmax(scores, dim=dim)
         y = torch.empty_like(y_pt, dtype=torch.int64)
 
@@ -54,10 +60,35 @@ def _test_argmax(
         y_reshape = y.reshape(o_shape)
         self.assertTrue(torch.allclose(y_pt, y_reshape, atol=1e-2, rtol=1e-2))
 
-    def test_argmax(self):
-        self._test_argmax(shape=(300, 80), dim=1, test_name="argmax")
+    def test_fp16(self):
+        self._test_argmax(
+            shape=(300, 80),
+            dim=1,
+            test_name="argmax_fp16",
+            dtype="float16",
+        )
+        self._test_argmax(
+            shape=(300, 80),
+            dim=1,
+            test_name="argmax_fp16_copy_op",
+            copy_op=True,
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float32 not supported in ROCm")
+    def test_fp32(self):
+        self._test_argmax(
+            shape=(300, 80),
+            dim=1,
+            test_name="argmax_fp32",
+            dtype="float32",
+        )
         self._test_argmax(
-            shape=(300, 80), dim=1, test_name="argmax_copy_op", copy_op=True
+            shape=(300, 80),
+            dim=1,
+            test_name="argmax_fp32_copy_op",
+            copy_op=True,
+            dtype="float32",
         )
 
 
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 8088db64c..2d4a7f72d 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -15,6 +15,7 @@
 """
 Unittests for flash_attenion Operator.
 """
+import logging
 import math
 import os
 import unittest
@@ -26,10 +27,15 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import benchmark_pt, detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+
 from einops import rearrange, repeat
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def unpad_input(hidden_states, attention_mask):
     """
     Arguments:
@@ -105,7 +111,7 @@ def attention_pt(X_pt, W_pt, B_pt, nheads, d, seqlen):
     v_pt = torch.reshape(v_pt, [nheads, -1, d])  # [12, 4096, 64]
     y_pt = torch.bmm(attn_pt, v_pt)  # [12, 4096, 64]
     y_pt = torch.reshape(y_pt, [1, nheads, seqlen, d])
-    Y_pt = torch.permute(y_pt, [0, 2, 1, 3]).cuda().half()  # [1,4096,12,64]
+    Y_pt = torch.permute(y_pt, [0, 2, 1, 3])  # [1,4096,12,64]
     return Y_pt
 
 
@@ -140,6 +146,10 @@ def T(t):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class attentionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
     def _test_flash_attention(
         self,
         batch_size=16,
@@ -148,20 +158,30 @@ def _test_flash_attention(
         n=1024,
         dropout_p=0.0,
         causal=False,
-        dtype=torch.float16,
+        dtype="float16",
         device="cuda",
-        test_name="attention",
+        test_name="flash_attention",
         rebuild=True,
         benchmark_pt=False,
         copy_op=False,
     ):
-
+        torch_dtype = string_to_torch_dtype(dtype)
         d = n // nheads
 
         x = torch.randn(
-            batch_size, seqlen, n, device="cuda", dtype=dtype, requires_grad=True
+            batch_size,
+            seqlen,
+            n,
+            device="cuda",
+            dtype=torch_dtype,
+            requires_grad=True,
+        )
+        Wqkv = torch.nn.Linear(
+            nheads * d,
+            3 * nheads * d,
+            device=device,
+            dtype=torch_dtype,
         )
-        Wqkv = torch.nn.Linear(nheads * d, 3 * nheads * d, device=device, dtype=dtype)
 
         lengths = torch.tensor(
             [seqlen] * batch_size, dtype=torch.int, device="cuda"
@@ -170,7 +190,12 @@ def _test_flash_attention(
             repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
             < lengths
         )
-        attention_mask = torch.zeros(batch_size, seqlen, device="cuda", dtype=dtype)
+        attention_mask = torch.zeros(
+            batch_size,
+            seqlen,
+            device="cuda",
+            dtype=torch_dtype,
+        )
         attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
 
         x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
@@ -193,7 +218,7 @@ def _test_flash_attention(
 
         X1 = Tensor(
             shape=[total, 3, num_heads, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="qkv",
             is_input=True,
         )
@@ -224,10 +249,14 @@ def _test_flash_attention(
         else:
             module = Model(os.path.join("./tmp", test_name, "test.so"))
 
-        x1 = qkv_unpad.detach().half().cuda()
+        x1 = qkv_unpad.detach().to(torch_dtype).cuda()
         x2 = cu_seqlens.detach().to(torch.int32).cuda()
         inputs = {"qkv": x1, "cu_seqlens": x2}
-        y = torch.empty([total, num_heads, head_size]).cuda().half()
+        y = torch.empty(
+            [total, num_heads, head_size],
+            dtype=torch_dtype,
+            device="cuda",
+        )
         module.run_with_tensors(inputs, [y])
 
         # Warm up.
@@ -239,17 +268,17 @@ def _test_flash_attention(
             [y],
             count=100,
         )
-        logger.info(__file__, "benchmark flash-attn time: {0}".format(time_per_iter_ms))
+        _LOGGER.info(f"benchmark flash-attn time: {time_per_iter_ms}")
 
         y = y.reshape((batch_size, -1, nheads, d))
-        self.assertTrue(torch.allclose(y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(y, y_pt, atol=1e-3, rtol=1e-3)
 
         if benchmark_pt:
             from aitemplate.testing.benchmark_pt import benchmark_torch_function
 
             func = attention_ref
             args = (
-                qkv.cuda().half(),
+                qkv.to(torch_dtype).cuda(),
                 attention_mask_bool.cuda(),
                 dropout_p,
                 False,
@@ -262,29 +291,58 @@ def _test_flash_attention(
 
     def test_flash_attention(self):
         if detect_target().name() == "cuda":
-            self._test_flash_attention(test_name="flash_attention")
             self._test_flash_attention(
-                test_name="flash_attention_copy_op", copy_op=True
+                test_name="flash_attention_fp16",
+                dtype="float16",
+            )
+            self._test_flash_attention(
+                test_name="flash_attention_fp16_copy_op",
+                copy_op=True,
+                dtype="float16",
             )
 
-    def _test_attention(self, test_name, rebuild=True, benchmark=False):
+    def _test_attention(
+        self,
+        test_name="attention",
+        rebuild=True,
+        benchmark=False,
+        dtype="float16",
+    ):
         target = detect_target()
         nheads = 12
         d = 64  # head_dim
         seqlen = 4096
         dim = 768
         token_emb_init_range = 0.001
-        X = Tensor(shape=[seqlen, dim], dtype="float16", name="input_0", is_input=True)
+        X = Tensor(
+            shape=[seqlen, dim],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
         qkv_w = Tensor(
-            shape=[dim * 3, dim], dtype="float16", name="input_1", is_input=True
+            shape=[dim * 3, dim],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[dim * 3],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[dim * 3], dtype="float16", name="input_2", is_input=True)
 
         qkv = ops.gemm_rcr_bias_permute(shape=(seqlen, 3, nheads), layout="m2n3")(
             X, qkv_w, B
         )
         (q, k, v) = ops.split()(qkv, 1, dim=0)
-        scale = Tensor(shape=[], dtype="float16", name="input_3", value=(d**-0.5))
+        scale = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="input_3",
+            value=(d**-0.5),
+        )
         q = ops.elementwise(FuncEnum.MUL)(q, scale)
         attn = ops.bmm_rcr()(
             (ops.reshape()(q, [nheads, -1, d])),
@@ -302,24 +360,26 @@ def _test_attention(self, test_name, rebuild=True, benchmark=False):
         else:
             module = Model(os.path.join("./tmp", test_name, "test.so"))
 
-        X_pt = torch.randn(seqlen, dim).cuda().half() * token_emb_init_range
-        W_pt = torch.randn(dim * 3, dim).cuda().half()
-        B_pt = torch.randn(dim * 3).cuda().half()
+        X_pt = get_random_torch_tensor([seqlen, dim], dtype=dtype)
+        X_pt *= token_emb_init_range
+        W_pt = get_random_torch_tensor([dim * 3, dim], dtype=dtype)
+        B_pt = get_random_torch_tensor([dim * 3], dtype=dtype)
         Y_pt = attention_pt(X_pt, W_pt, B_pt, nheads, d, seqlen)
         inputs = {
-            "input_0": X_pt.half(),
-            "input_1": W_pt.half(),
-            "input_2": B_pt.half(),
+            "input_0": X_pt,
+            "input_1": W_pt,
+            "input_2": B_pt,
         }
-        y = torch.empty(Y_pt.shape).cuda().half()
+        torch_dtype = string_to_torch_dtype(dtype)
+        y = torch.empty_like(Y_pt, dtype=torch_dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(y, Y_pt, atol=1e-1, rtol=1e-1)
 
         if benchmark:
             pt_time = benchmark_pt.benchmark_torch_function(
                 100, attention_pt, X_pt, W_pt, B_pt, nheads, d, seqlen
             )
-            logger.info(__file__, "benchmark compiler model time: {0}".format(pt_time))
+            _LOGGER.info(f"benchmark compiler model time: {pt_time}")
 
             # Warm up.
             for _ in range(5):
@@ -330,13 +390,14 @@ def _test_attention(self, test_name, rebuild=True, benchmark=False):
                 [y],
                 count=100,
             )
-            logger.info(
-                __file__, "benchmark compiler model time: {0}".format(time_per_iter_ms)
-            )
+            _LOGGER.info(f"benchmark compiler model time: {time_per_iter_ms}")
 
     def test_attention(self):
         if detect_target().name() == "rocm":
-            self._test_attention(test_name="attention")
+            self._test_attention(
+                test_name="attention_fp16",
+                dtype="float16",
+            )
 
     def _test_mem_eff_attention(
         self,
@@ -346,21 +407,32 @@ def _test_mem_eff_attention(
         n=1024,
         dropout_p=0.0,
         causal=False,
-        dtype=torch.float16,
+        dtype="float16",
         device="cuda",
-        test_name="attention",
+        test_name="mem_eff_attention",
         rebuild=True,
         benchmark_ait=False,
         benchmark_pt=False,
         copy_op=False,
         use_perm=True,
     ):
+        torch_dtype = string_to_torch_dtype(dtype)
         d = n // nheads
 
         x = torch.randn(
-            batch_size, seqlen, n, device="cuda", dtype=dtype, requires_grad=True
+            batch_size,
+            seqlen,
+            n,
+            device="cuda",
+            dtype=torch_dtype,
+            requires_grad=True,
+        )
+        Wqkv = torch.nn.Linear(
+            nheads * d,
+            3 * nheads * d,
+            device=device,
+            dtype=torch_dtype,
         )
-        Wqkv = torch.nn.Linear(nheads * d, 3 * nheads * d, device=device, dtype=dtype)
 
         lengths = torch.tensor(
             [seqlen] * batch_size, dtype=torch.int, device="cuda"
@@ -369,7 +441,12 @@ def _test_mem_eff_attention(
             repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
             < lengths
         )
-        attention_mask = torch.zeros(batch_size, seqlen, device="cuda", dtype=dtype)
+        attention_mask = torch.zeros(
+            batch_size,
+            seqlen,
+            device="cuda",
+            dtype=torch_dtype,
+        )
         attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
 
         x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
@@ -393,32 +470,32 @@ def _test_mem_eff_attention(
 
         Q = Tensor(
             shape=[batch_size, num_heads, seqlen, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="q",
             is_input=True,
         )
         K = Tensor(
             shape=[batch_size, num_heads, seqlen, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="k",
             is_input=True,
         )
         V = Tensor(
             shape=[batch_size, num_heads, seqlen, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="v",
             is_input=True,
         )
 
-        flash_attention_op = ops.mem_eff_attention(
+        mem_eff_attention_op = ops.mem_eff_attention(
             causal=causal,
         )
         if copy_op:
-            flash_attention_op = ops.mem_eff_attention(
-                **flash_attention_op._get_op_attributes()
+            mem_eff_attention_op = ops.mem_eff_attention(
+                **mem_eff_attention_op._get_op_attributes()
             )
 
-        Y = flash_attention_op(Q, K, V)
+        Y = mem_eff_attention_op(Q, K, V)
 
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
@@ -440,12 +517,16 @@ def _test_mem_eff_attention(
         )
 
         inputs = {
-            "q": q.detach().half().cuda().contiguous(),
-            "k": k.detach().half().cuda().contiguous(),
-            "v": v.detach().half().cuda().contiguous(),
+            "q": q.detach().to(torch_dtype).cuda().contiguous(),
+            "k": k.detach().to(torch_dtype).cuda().contiguous(),
+            "v": v.detach().to(torch_dtype).cuda().contiguous(),
         }
 
-        y = torch.empty([batch_size, seqlen, num_heads, head_size]).cuda().half()
+        y = torch.empty(
+            [batch_size, seqlen, num_heads, head_size],
+            dtype=torch_dtype,
+            device="cuda",
+        )
         module.run_with_tensors(inputs, [y])
 
         if benchmark_ait:
@@ -458,18 +539,16 @@ def _test_mem_eff_attention(
                 [y],
                 count=100,
             )
-            logger.info(
-                __file__, "benchmark eff-mem-attn time: {0}".format(time_per_iter_ms)
-            )
+            _LOGGER.info(f"benchmark eff-mem-attn time: {time_per_iter_ms}")
 
-        self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=1e-3, rtol=1e-3)
 
         if benchmark_pt:
             from aitemplate.testing.benchmark_pt import benchmark_torch_function
 
             func = attention_ref
             args = (
-                qkv.cuda().half(),
+                qkv.to(torch_dtype).cuda(),
                 attention_mask_bool.cuda(),
                 dropout_p,
                 False,
@@ -480,14 +559,19 @@ def _test_mem_eff_attention(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
-    def test_mem_eff_attention(self):
+    def test_mem_eff_attention_fp16(self):
         if detect_target().name() == "cuda":
             for use_perm in [False, True]:
                 self._test_mem_eff_attention(
-                    use_perm=use_perm, test_name="mem_eff_attention"
+                    use_perm=use_perm,
+                    test_name=f"mem_eff_attention_fp16_{use_perm}",
+                    dtype="float16",
                 )
                 self._test_mem_eff_attention(
-                    causal=True, test_name="mem_eff_attention_causal"
+                    use_perm=use_perm,
+                    causal=True,
+                    test_name=f"mem_eff_attention_fp16_{use_perm}_causal",
+                    dtype="float16",
                 )
                 # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
                 # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
@@ -495,6 +579,25 @@ def test_mem_eff_attention(self):
                 # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
                 # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_mem_eff_attention_fp32(self):
+        if detect_target().name() == "cuda":
+            for use_perm in [False, True]:
+                self._test_mem_eff_attention(
+                    use_perm=use_perm,
+                    test_name=f"mem_eff_attention_fp32_{use_perm}",
+                    dtype="float32",
+                )
+                self._test_mem_eff_attention(
+                    use_perm=use_perm,
+                    causal=True,
+                    test_name=f"mem_eff_attention_fp32_{use_perm}_causal",
+                    dtype="float32",
+                )
+
     def _test_cross_attention(
         self,
         batch_size=16,
@@ -505,21 +608,23 @@ def _test_cross_attention(
         head_size_v=64,
         dropout_p=0.0,
         causal=False,
-        dtype=torch.float16,
+        dtype="float16",
         device="cuda",
-        test_name="attention",
+        test_name="cross_attention",
         rebuild=True,
         benchmark_ait=False,
         benchmark_pt=False,
         copy_op=False,
     ):
+        torch_dtype = string_to_torch_dtype(dtype)
+
         q = torch.randn(
             batch_size,
             seqlen,
             num_heads,
             head_size,
             device="cuda",
-            dtype=dtype,
+            dtype=torch_dtype,
         )
         k = torch.randn(
             batch_size,
@@ -527,7 +632,7 @@ def _test_cross_attention(
             num_heads,
             head_size,
             device="cuda",
-            dtype=dtype,
+            dtype=torch_dtype,
         )
         v = torch.randn(
             batch_size,
@@ -535,7 +640,7 @@ def _test_cross_attention(
             num_heads,
             head_size_v,
             device="cuda",
-            dtype=dtype,
+            dtype=torch_dtype,
         )
 
         output = ref_cross_attention(q, k, v)
@@ -543,31 +648,31 @@ def _test_cross_attention(
 
         Q = Tensor(
             shape=[batch_size, num_heads, seqlen, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="q",
             is_input=True,
         )
         K = Tensor(
             shape=[batch_size, num_heads, seqlen_kv, head_size],
-            dtype="float16",
+            dtype=dtype,
             name="k",
             is_input=True,
         )
         V = Tensor(
             shape=[batch_size, num_heads, seqlen_kv, head_size_v],
-            dtype="float16",
+            dtype=dtype,
             name="v",
             is_input=True,
         )
 
-        flash_attention_op = ops.mem_eff_attention(
+        mem_eff_attention_op = ops.mem_eff_attention(
             causal=causal,
         )
         if copy_op:
-            flash_attention_op = ops.flash_attention(
-                **flash_attention_op._get_op_attributes()
+            mem_eff_attention_op = ops.mem_eff_attention(
+                **mem_eff_attention_op._get_op_attributes()
             )
-        Y = flash_attention_op(Q, K, V)
+        Y = mem_eff_attention_op(Q, K, V)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
 
@@ -582,11 +687,15 @@ def _test_cross_attention(
         v = torch.permute(v, (0, 2, 1, 3))
 
         inputs = {
-            "q": q.detach().half().cuda().contiguous(),
-            "k": k.detach().half().cuda().contiguous(),
-            "v": v.detach().half().cuda().contiguous(),
+            "q": q.detach().to(torch_dtype).cuda().contiguous(),
+            "k": k.detach().to(torch_dtype).cuda().contiguous(),
+            "v": v.detach().to(torch_dtype).cuda().contiguous(),
         }
-        y = torch.empty([batch_size, seqlen, num_heads, head_size_v]).cuda().half()
+        y = torch.empty(
+            [batch_size, seqlen, num_heads, head_size_v],
+            dtype=torch_dtype,
+            device="cuda",
+        )
         module.run_with_tensors(inputs, [y])
 
         if benchmark_ait:
@@ -599,24 +708,44 @@ def _test_cross_attention(
                 [y],
                 count=100,
             )
-            logger.info(
-                __file__, "benchmark cross-attn time: {0}".format(time_per_iter_ms)
-            )
+            _LOGGER.info(f"benchmark cross-attn time: {time_per_iter_ms}")
 
-        self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=1e-3, rtol=1e-3)
 
-    def test_cross_attention(self):
+    def test_cross_attention_fp16(self):
         if detect_target().name() == "cuda":
-            self._test_cross_attention(test_name="cross_attention")
+            self._test_cross_attention(
+                test_name="cross_attention_fp16",
+                dtype="float16",
+            )
+            self._test_cross_attention(
+                seqlen=1024,
+                seqlen_kv=768,
+                head_size=64,
+                head_size_v=64,
+                test_name="cross_attention2_fp16",
+                dtype="float16",
+            )
+
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_cross_attention_fp32(self):
+        if detect_target().name() == "cuda":
+            self._test_cross_attention(
+                test_name="cross_attention_fp32",
+                dtype="float32",
+            )
             self._test_cross_attention(
                 seqlen=1024,
                 seqlen_kv=768,
                 head_size=64,
                 head_size_v=64,
-                test_name="cross_attention2",
+                test_name="cross_attention2_fp32",
+                dtype="float32",
             )
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_avg_pool2d.py b/tests/unittest/ops/test_avg_pool2d.py
index fc92d39f4..ad09576a2 100644
--- a/tests/unittest/ops/test_avg_pool2d.py
+++ b/tests/unittest/ops/test_avg_pool2d.py
@@ -19,15 +19,16 @@
 
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class AvgPoolTestCase(unittest.TestCase):
-    def test_fp16(self):
-        target = detect_target()
+    def _test_avg_pool_2d(self, dtype="float16"):
         batch_size = [1, 3]
+        target = detect_target()
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), 7, 7, 2048],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -36,16 +37,24 @@ def test_fp16(self):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "avg_pool2d")
-        for b in batch_size:
-            X_pt = torch.randn(b, 2048, 7, 7).cuda().half()
+        for batch in batch_size:
+            X_pt = get_random_torch_tensor([batch, 2048, 7, 7], dtype=dtype)
             OP_pt = torch.nn.AvgPool2d(kernel_size=7, stride=1, padding=0)
             Y_pt = OP_pt(X_pt)
-            y = torch.empty([b, 1, 1, 2048]).cuda().half()
             x = torch.permute(X_pt, (0, 2, 3, 1)).contiguous()
+            y = torch.empty_like(Y_pt).permute(0, 2, 3, 1).contiguous()
             module.run_with_tensors([x], [y])
             y_transpose = torch.permute(y, (0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_avg_pool_2d_fp16(self):
+        self._test_avg_pool_2d(dtype="float16")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_avg_pool_2d_fp32(self):
+        self._test_avg_pool_2d(dtype="float32")
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_batch_gather.py b/tests/unittest/ops/test_batch_gather.py
index 9e90c1fbb..4c210af1a 100644
--- a/tests/unittest/ops/test_batch_gather.py
+++ b/tests/unittest/ops/test_batch_gather.py
@@ -22,21 +22,29 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 class gatherTestCase(unittest.TestCase):
-    def _create_tensors(self, N):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _create_tensors(self, N, dtype):
         scores = torch.randperm(N) / N
-        return scores.cuda().half()
+        return scores.cuda().to(dtype=string_to_torch_dtype(dtype))
 
 
 class batchGatherTestCase(gatherTestCase):
-    def _create_tensors(self, N):
-        scores = torch.randperm(N) / N
-        return scores.cuda().half()
-
     def _test_batch_gather(
-        self, shape=(3, 2, 2), ind_shape=(3,), dim=0, max_ind=2, test_name="gather"
+        self,
+        shape=(3, 2, 2),
+        ind_shape=(3,),
+        dim=0,
+        max_ind=2,
+        test_name="gather",
+        dtype="float16",
     ):
 
         in_shape = shape
@@ -47,7 +55,7 @@ def _test_batch_gather(
 
         X1 = Tensor(
             shape=in_shape,
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -62,9 +70,9 @@ def _test_batch_gather(
         X4._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(X4, target, "./tmp", test_name)
+        module = compile_model(X4, target, "./tmp", f"{test_name}_{self.test_count}")
 
-        input_x = torch.rand(in_shape).cuda().half()
+        input_x = get_random_torch_tensor(in_shape, dtype)
         init_index = torch.randint(max_ind, size=ind_shape, dtype=torch.int64).cuda()
 
         reshaped_shape = list(ind_shape)
@@ -80,10 +88,10 @@ def _test_batch_gather(
 
         indices = init_index.reshape(ind_shape).contiguous()
 
-        y = torch.empty(o_shape).cuda().half()
+        y = torch.empty_like(y_pt)
         module.run_with_tensors({"X": x, "indices": indices}, [y])
 
-        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
 
     def test_batch_gather(self):
         self._test_batch_gather(
@@ -103,10 +111,27 @@ def test_batch_gather(self):
             test_name="batch_gather4",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    def test_float32(self):
+        self._test_batch_gather(
+            shape=(8, 2, 2),
+            ind_shape=(2,),
+            dim=0,
+            max_ind=8,
+            test_name="batch_gather_f32",
+            dtype="float32",
+        )
+
 
 class batchGatherTopkTestCase(gatherTestCase):
     def _test_batch_gather_topk(
-        self, shape=(2, 2, 2), batch_size=1, N=1000, topK=100, test_name="topk"
+        self,
+        shape=(2, 2, 2),
+        batch_size=1,
+        N=1000,
+        topK=100,
+        test_name="topk",
+        dtype="float16",
     ):
 
         m_shape = (N,) + shape
@@ -114,13 +139,13 @@ def _test_batch_gather_topk(
 
         X1 = Tensor(
             shape=m_shape,
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
         X2 = Tensor(
             shape=[N],
-            dtype="float16",
+            dtype=dtype,
             name="scores",
             is_input=True,
         )
@@ -130,10 +155,10 @@ def _test_batch_gather_topk(
         X4._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(X4, target, "./tmp", test_name)
+        module = compile_model(X4, target, "./tmp", f"{test_name}_{self.test_count}")
 
-        input_x = torch.rand(m_shape).cuda().half()
-        scores = self._create_tensors(N)
+        input_x = get_random_torch_tensor(m_shape, dtype)
+        scores = self._create_tensors(N, dtype)
 
         (_, init_index) = torch.topk(scores, k=topK, dim=0)
 
@@ -150,7 +175,7 @@ def _test_batch_gather_topk(
 
         x_scores = scores.reshape((N,)).contiguous()
 
-        y = torch.empty(n_shape).cuda().half()
+        y = torch.empty_like(y_pt)
         module.run_with_tensors({"X": x, "scores": x_scores}, [y])
 
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
@@ -160,6 +185,16 @@ def test_batch_gather_topk(self):
             shape=(4, 1, 1), N=2000, topK=300, test_name="batch_gather_topk"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    def test_float32(self):
+        self._test_batch_gather_topk(
+            shape=(4, 1, 1),
+            N=2000,
+            topK=300,
+            test_name="batch_gather_topk_f32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_bert_embeddings.py b/tests/unittest/ops/test_bert_embeddings.py
index 39dc2315d..f1656ea0e 100644
--- a/tests/unittest/ops/test_bert_embeddings.py
+++ b/tests/unittest/ops/test_bert_embeddings.py
@@ -20,9 +20,15 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
-def get_ait_inputs(batch_size=1, seq_len=512, dtype="int64"):
+def get_ait_inputs(
+    batch_size=1,
+    seq_len=512,
+    dtype="int64",
+):
     input_ids = Tensor(
         shape=[batch_size, seq_len],
         name="input_ids",
@@ -45,7 +51,11 @@ def get_ait_inputs(batch_size=1, seq_len=512, dtype="int64"):
 
 
 def get_ait_params(
-    hidden_size, vocab_size, max_position_embeddings, type_vocab_size, dtype="float16"
+    hidden_size,
+    vocab_size,
+    max_position_embeddings,
+    type_vocab_size,
+    dtype="float16",
 ):
     word_embeddings = Tensor(
         shape=[vocab_size, hidden_size],
@@ -94,14 +104,21 @@ def _test_bert_embeddings(
         vocab_size,
         max_position_embeddings,
         type_vocab_size,
+        test_name="bert_embeddings",
         indices_type="int64",
+        input_type="float16",
     ):
-        inputs = get_ait_inputs(batch_size, seq_len, indices_type)
+        inputs = get_ait_inputs(
+            batch_size,
+            seq_len,
+            dtype=indices_type,
+        )
         params = get_ait_params(
             hidden_size,
             vocab_size,
             max_position_embeddings,
             type_vocab_size,
+            dtype=input_type,
         )
         y = ops.bert_embeddings()(*(inputs + params), 1e-5)
         y._attrs["is_output"] = True
@@ -109,17 +126,27 @@ def _test_bert_embeddings(
 
         target = detect_target()
         with compile_model(
-            y, target, "./tmp", f"test_bert_embeddings_{self._test_id}"
+            y,
+            target,
+            "./tmp",
+            f"{test_name}_{self._test_id}",
         ) as module:
-            dtype = torch.long
+            self._test_id += 1
+            torch_indices_type = string_to_torch_dtype(indices_type)
             input_ids = torch.randint(
-                0, vocab_size, (batch_size, seq_len), dtype=dtype
+                0,
+                vocab_size,
+                (batch_size, seq_len),
+                dtype=torch_indices_type,
             ).cuda()
             token_type_ids = torch.randint(
-                0, type_vocab_size, input_ids.size(), dtype=dtype
+                0,
+                type_vocab_size,
+                input_ids.size(),
+                dtype=torch_indices_type,
             ).cuda()
             position_ids = (
-                torch.arange(seq_len, dtype=dtype)
+                torch.arange(seq_len, dtype=torch_indices_type)
                 .reshape((1, -1))
                 .expand(batch_size, -1)
                 .contiguous()
@@ -133,7 +160,7 @@ def _test_bert_embeddings(
             for param in params:
                 name = param._attrs["name"]
                 shape = [shape.value() for shape in param.shape()]
-                w = torch.randn(shape).cuda().half()
+                w = get_random_torch_tensor(shape, dtype=input_type)
                 inputs[name] = w
 
             word_embedding = torch.nn.functional.embedding(
@@ -151,17 +178,77 @@ def _test_bert_embeddings(
                 pt_embedding, [hidden_size], inputs["gamma"], inputs["beta"], eps=1e-5
             )
 
-            embedding = torch.empty(pt_embedding.shape).cuda().half()
+            embedding = torch.empty_like(pt_embedding)
             module.run_with_tensors(inputs, [embedding])
             self.assertTrue(
                 torch.allclose(embedding, pt_embedding, atol=1e-3, rtol=1e-3)
             )
 
-    def test_bert_embeddings(self):
-        self._test_bert_embeddings(15, 17, 264, 10000, 512, 2)
-        self._test_bert_embeddings(1, 13, 264, 10000, 512, 2)
-        self._test_bert_embeddings(8, 512, 512, 10000, 512, 2)
+    def test_bert_embeddings_fp16(self):
+        self._test_bert_embeddings(
+            batch_size=15,
+            seq_len=17,
+            hidden_size=264,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp16",
+            input_type="float16",
+        )
+        self._test_bert_embeddings(
+            batch_size=1,
+            seq_len=13,
+            hidden_size=264,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp16",
+            input_type="float16",
+        )
+        self._test_bert_embeddings(
+            batch_size=8,
+            seq_len=512,
+            hidden_size=512,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp16",
+            input_type="float16",
+        )
+
+    def test_bert_embeddings_fp32(self):
+        self._test_bert_embeddings(
+            batch_size=15,
+            seq_len=17,
+            hidden_size=264,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp32",
+            input_type="float32",
+        )
+        self._test_bert_embeddings(
+            batch_size=1,
+            seq_len=13,
+            hidden_size=264,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp32",
+            input_type="float32",
+        )
+        self._test_bert_embeddings(
+            batch_size=8,
+            seq_len=512,
+            hidden_size=512,
+            vocab_size=10000,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            test_name="bert_embeddings_fp32",
+            input_type="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 0c14c348c..347a0711b 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -20,20 +20,22 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
 class BMMTestCase(unittest.TestCase):
-    def _test_rcr(self, bs, ms, N, K, test_name):
+    def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
         X = Tensor(
-            shape=[batch_dim, m_dim, K], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, N, K], dtype="float16", name="input_1", is_input=True
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
         )
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -41,13 +43,13 @@ def _test_rcr(self, bs, ms, N, K, test_name):
         module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
 
         for (b, m) in itertools.product(bs, ms):
-            X_pt = torch.randn(b, m, K).cuda().half()
-            W_pt = torch.randn(b, N, K).cuda().half()
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype)
 
             WT = torch.transpose(W_pt, 2, 1)
             Y_pt = torch.bmm(X_pt, WT)
 
-            y = torch.empty([b, m, N]).cuda().half()
+            y = get_torch_empty_tensor([b, m, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             if X_pt.nelement() == 0 or Y_pt.nelement() == 0:
                 pass
@@ -66,17 +68,15 @@ def test_rcr(self):
             self._test_rcr([1], [128], N=512, K=0, test_name="zero_k")
             self._test_rcr([1], [128], N=0, K=8, test_name="zero_n")
 
-    def _test_crr(self, bs, ks, test_name):
-        M = 256
-        N = 512
+    def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         k_dim = shape_utils.gen_int_var_min_max(ks, name="k")
         X = Tensor(
-            shape=[batch_dim, k_dim, M], dtype="float16", name="input_0", is_input=True
+            shape=[batch_dim, k_dim, M], dtype=dtype, name="input_0", is_input=True
         )
         W = Tensor(
-            shape=[batch_dim, k_dim, N], dtype="float16", name="input_1", is_input=True
+            shape=[batch_dim, k_dim, N], dtype=dtype, name="input_1", is_input=True
         )
         OP = ops.bmm_crr()
         Y = OP(X, W)
@@ -85,35 +85,31 @@ def _test_crr(self, bs, ks, test_name):
         module = compile_model(Y, target, "./tmp", "bmm_crr_{}".format(test_name))
 
         for (b, k) in itertools.product(bs, ks):
-            X_pt = torch.randn(b, k, M).cuda().half()
-            W_pt = torch.randn(b, k, N).cuda().half()
+            X_pt = get_random_torch_tensor([b, k, M], dtype)
+            W_pt = get_random_torch_tensor([b, k, N], dtype)
 
             XT = torch.transpose(X_pt, 2, 1)
             Y_pt = torch.bmm(XT, W_pt)
 
-            y = torch.empty([b, M, N]).cuda().half()
+            y = get_torch_empty_tensor([b, M, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     def test_crr(self):
-        self._test_crr([1024], [128], "static")
+        self._test_crr([1024], [128], M=256, N=512, test_name="static")
         if detect_target().name() == "cuda":
-            self._test_crr([3, 977, 1024], [128], "dynamic_b")
-            self._test_crr([5], [45, 56, 78], "dynamic_k")
-            self._test_crr([1, 2, 5], [3, 6, 8], "dynamic_bk")
+            self._test_crr([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
+            self._test_crr([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
+            self._test_crr([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
 
-    def _test_rrr(self, bs, ms, test_name):
-        K = 256
-        N = 512
+    def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
         X = Tensor(
-            shape=[batch_dim, m_dim, K], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, K, N], dtype="float16", name="input_1", is_input=True
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
         )
+        W = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -121,34 +117,27 @@ def _test_rrr(self, bs, ms, test_name):
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
         for (b, m) in itertools.product(bs, ms):
-            X_pt = torch.randn(b, m, K).cuda().half()
-            W_pt = torch.randn(b, K, N).cuda().half()
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, K, N], dtype)
 
             Y_pt = torch.bmm(X_pt, W_pt)
 
-            y = torch.empty([b, m, N]).cuda().half()
+            y = get_torch_empty_tensor([b, m, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     def test_rrr(self):
-        self._test_rrr([87], [23], "static")
+        self._test_rrr([87], [23], K=256, N=512, test_name="static")
         if detect_target().name() == "cuda":
-            self._test_rrr([2, 5, 99], [23], "dynamic_b")
-            self._test_rrr([77], [4, 7, 9], "dynamic_m")
-            self._test_rrr([2, 5, 7], [1, 7, 9], "dynamic_bm")
-
-    def _test_ccr(self, bs, test_name):
-        M = 256
-        N = 64
-        K = 128
+            self._test_rrr([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
+            self._test_rrr([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
+            self._test_rrr([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+
+    def _test_ccr(self, bs, M, N, K, test_name, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
-        X = Tensor(
-            shape=[batch_dim, K, M], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, N, K], dtype="float16", name="input_1", is_input=True
-        )
+        X = Tensor(shape=[batch_dim, K, M], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_ccr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -156,33 +145,61 @@ def _test_ccr(self, bs, test_name):
         module = compile_model(Y, target, "./tmp", "bmm_ccr_{}".format(test_name))
 
         for b in bs:
-            X_pt = torch.randn(b, K, M).cuda().half()
-            W_pt = torch.randn(b, N, K).cuda().half()
+            X_pt = get_random_torch_tensor([b, K, M], dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype)
 
             XT = torch.transpose(X_pt, 2, 1)
             Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
-            y = torch.empty([b, M, N]).cuda().half()
+            y = get_torch_empty_tensor([b, M, N], dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     def test_ccr(self):
-        self._test_ccr([77], "static")
+        self._test_ccr([77], M=256, N=64, K=128, test_name="static")
         if detect_target().name() == "cuda":
-            self._test_ccr([1, 9, 101], "dynamic_b")
+            self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_float(self):
+        self._test_rcr([128], [64], N=8, K=64, test_name="static_float", dtype="float")
+        self._test_rcr(
+            [1, 5, 77, 128],
+            [32],
+            N=16,
+            K=64,
+            test_name="dynamic_b_float",
+            dtype="float",
+        )
+        self._test_crr(
+            [1, 2, 5],
+            [3, 6, 8],
+            M=24,
+            N=64,
+            test_name="dynamic_bk_float",
+            dtype="float",
+        )
+        self._test_rrr(
+            [8], [4, 7, 9], K=64, N=32, test_name="dynamic_m_float", dtype="float"
+        )
+        self._test_ccr(
+            [1, 9, 11], M=64, N=32, K=16, test_name="dynamic_b_float", dtype="float"
+        )
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):
-    def test_rcr_with_accessors(self):
+    def _test_rcr_with_accessors(self, dtype="float16"):
         A_shape = [2, 2, 4]
         B_shape = [2, 8, 4]
         C_shape = [2, 2, 8]
 
-        X_expanded = Tensor(
-            shape=A_shape, dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        C = Tensor(shape=C_shape, dtype="float16", name="input_2", is_input=True)
+        X_expanded = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        C = Tensor(shape=C_shape, dtype=dtype, name="input_2", is_input=True)
 
         X, _ = ops.split()(X_expanded, [1, 1], 0)
         OP = ops.bmm_rcr()
@@ -191,12 +208,13 @@ def test_rcr_with_accessors(self):
         out._attrs["name"] = "output_0"
         out._attrs["is_output"] = True
 
+        test_name = f"bmm_rcr_with_accessor_{dtype}"
         target = detect_target()
-        module = compile_model(out, target, "./tmp", "bmm_rcr_with_accessor")
+        module = compile_model(out, target, "./tmp", test_name)
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
-        C_pt = torch.randn(*C_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        C_pt = get_random_torch_tensor(C_shape, dtype)
 
         X_feed, _ = torch.split(X_pt, [1, 1], 0)
 
@@ -209,20 +227,16 @@ def test_rcr_with_accessors(self):
         inputs[input_name_to_index["input_0"]] = X_pt
         inputs[input_name_to_index["input_1"]] = W_pt
         inputs[input_name_to_index["input_2"]] = C_pt
-        y = torch.empty([4, 2, 8]).cuda().half()
+        y = get_torch_empty_tensor([4, 2, 8], dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(out_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_rcr_merge_with_accessors(self):
+    def _test_rcr_merge_with_accessors(self, dtype="float16"):
         A_shape = [2, 2, 4]
         B_shape = [4, 8, 4]
 
-        X_expanded = Tensor(
-            shape=A_shape, dtype="float16", name="input_0", is_input=True
-        )
-        W_expanded = Tensor(
-            shape=B_shape, dtype="float16", name="input_1", is_input=True
-        )
+        X_expanded = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W_expanded = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
 
         X1, X2 = ops.split()(X_expanded, [1, 1], 0)
         W1, W2 = ops.split()(W_expanded, [2, 2], 0)
@@ -233,10 +247,11 @@ def test_rcr_merge_with_accessors(self):
         out._attrs["is_output"] = True
 
         target = detect_target()
-        module = compile_model(out, target, "./tmp", "bmm_rcr_merge_with_accessor")
+        test_name = f"bmm_rcr_merge_with_accessor_{dtype}"
+        module = compile_model(out, target, "./tmp", test_name)
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
 
         X1_pt, X2_pt = torch.split(X_pt, [1, 1], 0)
 
@@ -250,11 +265,15 @@ def test_rcr_merge_with_accessors(self):
         inputs = [0, 0]
         inputs[input_name_to_index["input_0"]] = X_pt
         inputs[input_name_to_index["input_1"]] = W_pt
-        y = torch.empty([4, 2, 8]).cuda().half()
+        y = get_torch_empty_tensor([4, 2, 8], dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(out_pt, y, atol=1e-1, rtol=1e-1))
 
-    def _test_rcr(self, A_shape, B_shape, test_name):
+    def test_with_accessors(self):
+        self._test_rcr_with_accessors()
+        self._test_rcr_merge_with_accessors()
+
+    def _test_rcr(self, A_shape, B_shape, test_name, dtype="float16"):
         M, N = A_shape[-2], B_shape[-2]
         if len(A_shape) == 2:
             B = B_shape[0]
@@ -263,8 +282,8 @@ def _test_rcr(self, A_shape, B_shape, test_name):
         else:
             B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -273,13 +292,13 @@ def _test_rcr(self, A_shape, B_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
 
         WT = torch.transpose(W_pt, -2, -1)
         Y_pt = torch.matmul(X_pt, WT)
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -289,7 +308,7 @@ def test_rcr(self):
         self._test_rcr([16, 8], [8, 32, 8], "2d_broadcastable_a")
         self._test_rcr([8, 16, 8], [32, 8], "2d_broadcastable_b")
 
-    def _test_crr(self, A_shape, B_shape, test_name):
+    def _test_crr(self, A_shape, B_shape, test_name, dtype="float16"):
         M, N = A_shape[-1], B_shape[-1]
         if len(A_shape) == 2:
             B = B_shape[0]
@@ -298,8 +317,8 @@ def _test_crr(self, A_shape, B_shape, test_name):
         else:
             B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_crr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -308,13 +327,13 @@ def _test_crr(self, A_shape, B_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_crr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
 
         XT = torch.transpose(X_pt, -2, -1)
         Y_pt = torch.matmul(XT, W_pt)
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -324,7 +343,7 @@ def test_crr(self):
         self._test_crr([8, 16], [8, 8, 32], "2d_broadcastable_a")
         self._test_crr([8, 8, 16], [8, 32], "2d_broadcastable_b")
 
-    def _test_rrr(self, A_shape, B_shape, test_name):
+    def _test_rrr(self, A_shape, B_shape, test_name, dtype="float16"):
         M, N = A_shape[-2], B_shape[-1]
         if len(A_shape) == 2:
             B = B_shape[0]
@@ -333,8 +352,8 @@ def _test_rrr(self, A_shape, B_shape, test_name):
         else:
             B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -343,12 +362,12 @@ def _test_rrr(self, A_shape, B_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
 
         Y_pt = torch.matmul(X_pt, W_pt)
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -358,7 +377,7 @@ def test_rrr(self):
         self._test_rrr([16, 8], [8, 8, 32], "2d_broadcastable_a")
         self._test_rrr([8, 16, 8], [8, 32], "2d_broadcastable_b")
 
-    def _test_ccr(self, A_shape, B_shape, test_name):
+    def _test_ccr(self, A_shape, B_shape, test_name, dtype="float16"):
         M, N = A_shape[-1], B_shape[-2]
         if len(A_shape) == 2:
             B = B_shape[0]
@@ -367,8 +386,8 @@ def _test_ccr(self, A_shape, B_shape, test_name):
         else:
             B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_ccr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -377,14 +396,14 @@ def _test_ccr(self, A_shape, B_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_ccr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
 
         XT = torch.transpose(X_pt, -2, -1)
         WT = torch.transpose(W_pt, -2, -1)
         Y_pt = torch.matmul(XT, WT)
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -394,6 +413,22 @@ def test_ccr(self):
         self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_broadcast_float(self):
+        self._test_rcr_with_accessors(dtype="float")
+        self._test_rcr_merge_with_accessors(dtype="float")
+        self._test_rcr([2, 16, 8], [1, 32, 8], "broadcastable_b", dtype="float")
+        self._test_rcr([16, 8], [8, 32, 8], "2d_broadcastable_a", dtype="float")
+        self._test_crr([1, 8, 16], [2, 8, 32], "broadcastable_a", dtype="float")
+        self._test_crr([8, 8, 16], [8, 32], "2d_broadcastable_b", dtype="float")
+        self._test_rrr([2, 16, 8], [1, 8, 32], "broadcastable_b", dtype="float")
+        self._test_rrr([16, 8], [8, 8, 32], "2d_broadcastable_a", dtype="float")
+        self._test_ccr([1, 8, 16], [2, 32, 8], "broadcastable_a", dtype="float")
+        self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b", dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 4f188a421..45fc2fd90 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -19,56 +19,65 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMAddTestCase(unittest.TestCase):
-    def test_rrr(self):
-        B = 32
-        M = 256
-        K = 256
-        N = 512
+    def __init__(self, *args, **kwargs):
+        super(BMMAddTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_rrr(self, B, M, K, N, dtype="float16"):
         target = detect_target()
-        X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, N], dtype="float16", name="input_1", is_input=True)
-        D = Tensor(shape=[B, M, N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, M, N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.bmm_rrr_add()
         Y = OP(X, W, D)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "bmm_rrr_add")
-        X_pt = torch.randn(B, M, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-        D_pt = torch.randn(B, M, N).cuda().half()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(
+            Y, target, "./tmp", f"bmm_rrr_add_{dtype}", dll_name=dll_name
+        )
+        X_pt = get_random_torch_tensor([B, M, K], dtype)
+        W_pt = get_random_torch_tensor([B, K, N], dtype)
+        D_pt = get_random_torch_tensor([B, M, N], dtype)
 
         Y_pt = torch.bmm(X_pt, W_pt)
         Y_pt = Y_pt + D_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
         )
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        self.test_count += 1
 
-    def _test_ccr(self, B, M, N, K, test_name):
+    def _test_ccr(self, B, M, N, K, test_name, dtype="float16"):
         target = detect_target()
-        X = Tensor(shape=[B, K, M], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
-        D = Tensor(shape=[B, M, N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[B, K, M], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, M, N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.bmm_ccr_add()
         Y = OP(X, W, D)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", test_name)
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-        D_pt = torch.randn(B, M, N).cuda().half()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        D_pt = get_random_torch_tensor([B, M, N], dtype)
 
         XT = torch.transpose(X_pt, 2, 1)
         Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
         Y_pt = Y_pt + D_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
         )
@@ -76,6 +85,52 @@ def _test_ccr(self, B, M, N, K, test_name):
             pass
         else:
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def _test_crr(self, B, M, K, N, dtype="float16"):
+        target = detect_target()
+        X = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[B, K, N],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        D = Tensor(
+            shape=[B, M, N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+        OP = ops.bmm_crr_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        test_name = f"bmm_crr_add_{dtype}"
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, K, N], dtype)
+        D_pt = get_random_torch_tensor([B, M, N], dtype)
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt)
+        Y_pt = Y_pt + D_pt
+
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_rrr(self):
+        self._test_rrr(B=32, M=256, K=256, N=512)
 
     def test_ccr(self):
         self._test_ccr(B=32, M=256, N=256, K=512, test_name="bmm_ccr_add")
@@ -83,63 +138,30 @@ def test_ccr(self):
         self._test_ccr(B=1, M=0, N=256, K=512, test_name="bmm_ccr_zero_m")
         self._test_ccr(B=1, M=256, N=256, K=0, test_name="bmm_ccr_zero_k")
 
-    # def test_crr(self):
-    #     B = 32
-    #     M = 256
-    #     K = 256
-    #     N = 512
-    #     target = detect_target()
-    #     X = Tensor(
-    #         shape=[B, K, M],
-    #         dtype="float16",
-    #         name="input_0"
-    #     )
-    #     W = Tensor(
-    #         shape=[B, K, N],
-    #         dtype="float16",
-    #         name="input_1"
-    #     )
-    #     D = Tensor(
-    #         shape=[B, M, N],
-    #         dtype="float16",
-    #         name="input_2"
-    #     )
-    #     OP = ops.bmm_crr_add()
-    #     Y = OP(X, W, D)
-    #     Y._attrs["name"] = "output_0"
-    #     Y._attrs["is_output"] = True
-    #     module = compile_model(Y, target, "./tmp", "bmm_crr_add")
-    #     X_pt = torch.randn(B, K, M).cuda().half()
-    #     W_pt = torch.randn(B, K, N).cuda().half()
-    #     D_pt = torch.randn(B, M, N).cuda().half()
-
-    #     XT = torch.transpose(X_pt, 2, 1)
-    #     Y_pt = torch.bmm(XT, W_pt)
-    #     Y_pt = Y_pt + D_pt
-    #     Y_np = Y_pt.cpu().numpy()
-
-    #     x = X_pt.cpu().numpy()
-    #     w = W_pt.cpu().numpy()
-    #     d = D_pt.cpu().numpy()
-    #     module.SetInput("input_0", x)
-    #     module.SetInput("input_1", w)
-    #     module.SetInput("input_2", d)
-    #     module.benchmark()
-    #     y = module.GetOutput("output_0", [B, M, N])
-    #     np.testing.assert_allclose(Y_np,
-    #                                y,
-    #                                atol=1e-2, rtol=1e-2)
+    def test_crr(self):
+        self._test_crr(B=32, M=256, K=256, N=512)
+
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_add_float(self):
+        self._test_rrr(B=8, M=32, K=8, N=64, dtype="float")
+        self._test_ccr(
+            B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_float", dtype="float"
+        )
+        self._test_crr(B=8, M=32, K=16, N=64, dtype="float")
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):
-    def _test_crr(self, A_shape, B_shape, bias_shape, test_name):
+    def _test_crr(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
         M, N = A_shape[-1], B_shape[-1]
         B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        bias = Tensor(shape=bias_shape, dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
         Y = ops.bmm_crr_add()(X, W, bias)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -147,14 +169,14 @@ def _test_crr(self, A_shape, B_shape, bias_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_crr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
-        bias_pt = torch.randn(*bias_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
 
         XT = torch.transpose(X_pt, -2, -1)
         Y_pt = torch.matmul(XT, W_pt) + bias_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
         )
@@ -183,13 +205,13 @@ def test_crr(self):
             test_name="broadcastable_bias3d",
         )
 
-    def _test_rrr(self, A_shape, B_shape, bias_shape, test_name):
+    def _test_rrr(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
         M, N = A_shape[-2], B_shape[-1]
         B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        bias = Tensor(shape=bias_shape, dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
         Y = ops.bmm_rrr_add()(X, W, bias)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -197,13 +219,13 @@ def _test_rrr(self, A_shape, B_shape, bias_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
-        bias_pt = torch.randn(*bias_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
 
         Y_pt = torch.matmul(X_pt, W_pt) + bias_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
         )
@@ -232,13 +254,13 @@ def test_rrr(self):
             test_name="broadcastable_bias3d",
         )
 
-    def _test_ccr(self, A_shape, B_shape, bias_shape, test_name):
+    def _test_ccr(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
         M, N = A_shape[-1], B_shape[-2]
         B = max(A_shape[0], B_shape[0])
 
-        X = Tensor(shape=A_shape, dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=B_shape, dtype="float16", name="input_1", is_input=True)
-        bias = Tensor(shape=bias_shape, dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
         Y = ops.bmm_ccr_add()(X, W, bias)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
@@ -246,15 +268,15 @@ def _test_ccr(self, A_shape, B_shape, bias_shape, test_name):
         target = detect_target()
         module = compile_model(Y, target, "./tmp", "bmm_ccr_{}".format(test_name))
 
-        X_pt = torch.randn(*A_shape).cuda().half()
-        W_pt = torch.randn(*B_shape).cuda().half()
-        bias_pt = torch.randn(*bias_shape).cuda().half()
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
 
         XT = torch.transpose(X_pt, -2, -1)
         WT = torch.transpose(W_pt, -2, -1)
         Y_pt = torch.matmul(XT, WT) + bias_pt
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
         )
@@ -283,6 +305,33 @@ def test_ccr(self):
             test_name="broadcastable_bias3d",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_broadcast_float(self):
+        self._test_crr(
+            [1, 8, 16],
+            [2, 8, 32],
+            bias_shape=[16, 32],
+            test_name="broadcastable_bias2d_float",
+            dtype="float",
+        )
+        self._test_rrr(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 32],
+            test_name="broadcastable_bias1d_2",
+            dtype="float",
+        )
+        self._test_ccr(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name="broadcastable_bias3d",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_alpha.py b/tests/unittest/ops/test_bmm_alpha.py
index 4eff3f11a..39f4adcea 100644
--- a/tests/unittest/ops/test_bmm_alpha.py
+++ b/tests/unittest/ops/test_bmm_alpha.py
@@ -21,6 +21,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import graph_utils
 
 
@@ -41,26 +45,27 @@ def _test_bmm_alpha(
         expected_num_ops,
         use_fp16_acc=False,
         with_add=False,
+        dtype="float16",
     ):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         if X_trans:
-            X = Tensor(shape=[B, K, M], dtype="float16", name="input_0", is_input=True)
+            X = Tensor(shape=[B, K, M], dtype=dtype, name="input_0", is_input=True)
         else:
-            X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
+            X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
         if W_trans:
-            W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
+            W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
         else:
-            W = Tensor(shape=[B, K, N], dtype="float16", name="input_1", is_input=True)
+            W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
         if with_add:
-            D = Tensor(shape=[B, M, N], dtype="float16", name="input_2", is_input=True)
+            D = Tensor(shape=[B, M, N], dtype=dtype, name="input_2", is_input=True)
         BMM_OP = bmm_op()
         Y1 = BMM_OP(X, W, D) if with_add else BMM_OP(X, W)
         elem_func_type = FuncEnum.DIV if is_div else FuncEnum.MUL
-        Y2 = ops.elementwise(elem_func_type)(Y1, Tensor([], value=cst_val))
+        Y2 = ops.elementwise(elem_func_type)(Y1, Tensor([], value=cst_val, dtype=dtype))
         Y2._attrs["name"] = "output_0"
         Y2._attrs["is_output"] = True
         module = compile_model(
-            Y2, target, "./tmp", f"bmm_alpha_{B}_{M}_{N}_{K}_{use_fp16_acc}"
+            Y2, target, "./tmp", f"bmm_alpha_{B}_{M}_{N}_{K}_{use_fp16_acc}_{dtype}"
         )
         expected_cst_val = 1.0 / float(cst_val) if is_div else float(cst_val)
 
@@ -84,15 +89,15 @@ def _test_bmm_alpha(
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), expected_num_ops)
         if X_trans:
-            X_pt = torch.randn(B, K, M).cuda().half()
+            X_pt = get_random_torch_tensor([B, K, M], dtype)
         else:
-            X_pt = torch.randn(B, M, K).cuda().half()
+            X_pt = get_random_torch_tensor([B, M, K], dtype)
         if W_trans:
-            W_pt = torch.randn(B, N, K).cuda().half()
+            W_pt = get_random_torch_tensor([B, N, K], dtype)
         else:
-            W_pt = torch.randn(B, K, N).cuda().half()
+            W_pt = get_random_torch_tensor([B, K, N], dtype)
         if with_add:
-            D_pt = torch.randn(B, M, N).cuda().half()
+            D_pt = get_random_torch_tensor([B, M, N], dtype)
 
         def pt_bmm():
             XT = torch.transpose(X_pt, 2, 1) if X_trans else X_pt
@@ -108,7 +113,7 @@ def pt_bmm():
         inputs = {"input_0": X_pt, "input_1": W_pt}
         if with_add:
             inputs["input_2"] = D_pt
-        y = torch.empty([B, M, N]).cuda().half()
+        y = get_torch_empty_tensor([B, M, N], dtype)
         module.run_with_tensors(inputs, [y])
 
         if X_pt.nelement() == 0 or W_pt.nelement() == 0:
@@ -277,6 +282,43 @@ def test_bmm_alpha(self):
             use_fp16_acc=False,
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_alpha_float(self):
+        self._test_bmm_alpha(
+            bmm_op=ops.bmm_rcr,
+            is_div=False,
+            X_trans=False,
+            W_trans=True,
+            B=1,
+            M=1000000,
+            N=3,
+            K=32,
+            expected_num_tensors=3,
+            expected_num_ops=1,
+            cst_val=2.3,
+            use_fp16_acc=False,
+            dtype="float",
+        )
+        self._test_bmm_alpha(
+            bmm_op=ops.bmm_rrr_add,
+            is_div=False,
+            X_trans=False,
+            W_trans=False,
+            B=2,
+            M=12,
+            N=8,
+            K=4,
+            cst_val=0.32,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            use_fp16_acc=False,
+            with_add=True,
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index df7420811..42cc684e5 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -20,21 +20,23 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMPermuteTestCase(unittest.TestCase):
-    def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False):
+    def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
         X = Tensor(
-            shape=[batch_dim, m_dim, K], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, K, N], dtype="float16", name="input_1", is_input=True
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
         )
+        W = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rrr_permute(shape=(d1,))
         if copy_op:
             OP = ops.bmm_rrr_permute(**OP._get_op_attributes())
@@ -44,14 +46,14 @@ def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False):
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
         for (b, m) in itertools.product(bs, ms):
-            X_pt = torch.randn(b, m, K).cuda().half()
-            W_pt = torch.randn(b, K, N).cuda().half()
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, K, N], dtype)
 
             Y_l = torch.bmm(X_pt, W_pt)
             Y_r = Y_l.reshape(b // d1, d1, m, N)
             Y_pt = torch.permute(Y_r, [0, 2, 1, 3])
 
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
@@ -69,16 +71,14 @@ def test_rrr(self):
             self._test_rrr([24], [80], N=0, K=96, d1=12, test_name="permute1_zero_n")
             self._test_rrr([24], [0], N=32, K=96, d1=12, test_name="permute1_zero_m")
 
-    def _test_rcr(self, bs, ms, N, K, d1, test_name, copy_op=False):
+    def _test_rcr(self, bs, ms, N, K, d1, test_name, copy_op=False, dtype="float16"):
         target = detect_target()
         batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
         m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
         X = Tensor(
-            shape=[batch_dim, m_dim, K], dtype="float16", name="input_0", is_input=True
-        )
-        W = Tensor(
-            shape=[batch_dim, N, K], dtype="float16", name="input_1", is_input=True
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
         )
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rcr_permute(shape=(d1,))
         if copy_op:
             OP = ops.bmm_rcr_permute(**OP._get_op_attributes())
@@ -88,15 +88,15 @@ def _test_rcr(self, bs, ms, N, K, d1, test_name, copy_op=False):
         module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
 
         for (b, m) in itertools.product(bs, ms):
-            X_pt = torch.randn(b, m, K).cuda().half()
-            W_pt = torch.randn(b, N, K).cuda().half()
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype)
 
             WT = torch.transpose(W_pt, 2, 1)
             Y_l = torch.bmm(X_pt, WT)
             Y_r = Y_l.reshape(b // d1, d1, m, N)
             Y_pt = torch.permute(Y_r, [0, 2, 1, 3])
 
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
@@ -116,6 +116,28 @@ def test_rcr(self):
             )
             self._test_rcr([24], [80], N=96, K=0, d1=12, test_name="permute1_zero_k")
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_permute_float(self):
+        self._test_rrr(
+            [10], [8], N=88, K=64, d1=10, test_name="permute3_float", dtype="float"
+        )
+        self._test_rrr(
+            [10],
+            [8],
+            N=88,
+            K=64,
+            d1=10,
+            test_name="permute3_copy_op_float",
+            copy_op=True,
+            dtype="float",
+        )
+        self._test_rcr(
+            [10], [8], N=64, K=88, d1=10, test_name="permute3_float", dtype="float"
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_bmm_rcr_n1.py b/tests/unittest/ops/test_bmm_rcr_n1.py
index f6a32c6a0..ee7e60122 100644
--- a/tests/unittest/ops/test_bmm_rcr_n1.py
+++ b/tests/unittest/ops/test_bmm_rcr_n1.py
@@ -22,24 +22,29 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMRcrN1TestCase(unittest.TestCase):
-    def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name, dtype="float16"):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         BDim = shape_utils.gen_int_var_min_max(Bs, name="batch")
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
         X = Tensor(
             shape=[BDim, MDim, IntImm(K)],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
             shape=[BDim, IntImm(N), IntImm(K)],
-            dtype="float16",
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -48,22 +53,26 @@ def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", f"bmm_rcr_n1_{use_fp16_acc}_{test_name}"
+            Y,
+            target,
+            "./tmp",
+            f"bmm_rcr_n1_{use_fp16_acc}_{test_name}_{self.test_count}",
         )
         for B, M in itertools.product(Bs, Ms):
             logging.info(f"Testing {B=} {M=}")
-            X_pt = torch.randn(B, M, K).cuda().half()
-            W_pt = torch.randn(B, N, K).cuda().half()
+            X_pt = get_random_torch_tensor((B, M, K), dtype)
+            W_pt = get_random_torch_tensor((B, N, K), dtype)
 
             Y_pt = torch.bmm(X_pt, torch.transpose(W_pt, 2, 1))
 
-            y = torch.empty([B, M, N]).half().cuda()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
 
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
             else:
                 self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        self.test_count += 1
 
     def test_rcr_n1(self):
         self._test_rcr_n1([1], [1000000], 1, 32, True, "static")
@@ -84,6 +93,20 @@ def test_rcr_n1(self):
         self._test_rcr_n1([1], [100], 1, 0, False, "zero_k")
         self._test_rcr_n1([1], [0], 1, 3, False, "zero_m")
 
+    def test_float32(self):
+        self._test_rcr_n1(
+            [1], [1000000], 1, 32, True, "static_float32", dtype="float32"
+        )
+        self._test_rcr_n1(
+            [1], [1000000], 1, 32, False, "static_float32", dtype="float32"
+        )
+        self._test_rcr_n1(
+            [1, 5, 8], [100], 1, 7, True, "static_float32", dtype="float32"
+        )
+        self._test_rcr_n1(
+            [1, 5, 8], [100], 1, 123, False, "static_float32", dtype="float32"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
index 5bf158dc2..a50e67111 100644
--- a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
+++ b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
@@ -19,26 +19,31 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMRrrK1TanhTestCase(unittest.TestCase):
-    def _test_rrr(self, B, M, K, N, test_name):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_rrr(self, B, M, K, N, test_name, dtype="float16"):
         target = detect_target()
-        X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, N], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rrr_k1_tanh()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", test_name)
-        X_pt = torch.randn(B, M, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self.test_count}")
+        X_pt = get_random_torch_tensor([B, M, K], dtype)
+        W_pt = get_random_torch_tensor([B, K, N], dtype)
 
         Y_pt = torch.bmm(X_pt, W_pt)
         Y_pt = torch.tanh(Y_pt)
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
         if X_pt.nelement() == 0 or W_pt.nelement() == 0:
             pass
@@ -50,6 +55,9 @@ def test_rrr(self):
         self._test_rrr(B=1024, M=0, K=1, N=32, test_name="bmm_rrr_k1_zero_m")
         self._test_rrr(B=1024, M=32, K=0, N=32, test_name="bmm_rrr_k1_zero_k")
 
+    def test_float32(self):
+        self._test_rrr(B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1", dtype="float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_softmax.py b/tests/unittest/ops/test_bmm_softmax.py
index 3b8350528..0aeaa6fe2 100644
--- a/tests/unittest/ops/test_bmm_softmax.py
+++ b/tests/unittest/ops/test_bmm_softmax.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 import torch
@@ -19,7 +20,9 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -38,10 +41,10 @@ def _test_bmm_rcr_softmax(
 
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Skip this test on SM75")
+            _LOGGER.warning("Skip this test on SM75")
             return
         if type(target).__name__ == "FBCUDA":
-            logger.warning(__file__, "Skip this test for special profiling requirement")
+            _LOGGER.warning("Skip this test for special profiling requirement")
             return
         module = compile_model(Y, target, "./tmp", test_name)
         X_pt = torch.randn(B, M, K).cuda().half()
diff --git a/tests/unittest/ops/test_chunk.py b/tests/unittest/ops/test_chunk.py
index 7da51ee6d..fe1f9cdcf 100644
--- a/tests/unittest/ops/test_chunk.py
+++ b/tests/unittest/ops/test_chunk.py
@@ -26,9 +26,6 @@
 
 
 class ChunkTestCase(unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super(ChunkTestCase, self).__init__(*args, **kwargs)
-
     def _run_chunk(
         self,
         *,
@@ -41,7 +38,12 @@ def _run_chunk(
 
         chunk_op = ops.chunk()
         target = detect_target()
-        X = Tensor(shape=input_shape, dtype=input_type, name="input_0", is_input=True)
+        X = Tensor(
+            shape=input_shape,
+            dtype=input_type,
+            name="input_0",
+            is_input=True,
+        )
         Ys = chunk_op(X, chunks, dim)
         for idx, Y in enumerate(Ys):
             Y._attrs["name"] = "output_{}".format(idx)
@@ -52,13 +54,13 @@ def _run_chunk(
         for batch_size in input_shape[0]._attrs["values"]:
             logging.info(f"Testing {batch_size=}")
             x_pt = get_random_torch_tensor(
-                [batch_size, *[v.value() for v in input_shape[1:]]], input_type
+                [batch_size, *[v.value() for v in input_shape[1:]]],
+                input_type,
             )
             ys_pt = torch.chunk(x_pt, chunks, dim)
-            y_shapes = [Y_pt.size() for Y_pt in ys_pt]
             outputs = {
-                f"output_{idx}": torch.empty(y_shape).cuda().half()
-                for idx, y_shape in enumerate(y_shapes)
+                f"output_{idx}": torch.empty_like(Y_pt)
+                for idx, Y_pt in enumerate(ys_pt)
             }
 
             module.run_with_tensors([x_pt], outputs)
@@ -68,7 +70,7 @@ def _run_chunk(
                     torch.allclose(y_pt, outputs[f"output_{idx}"], atol=1e-2, rtol=1e-2)
                 )
 
-    def test_chunk(self):
+    def test_chunk_fp16(self):
         self._run_chunk(
             input_shape=[IntImm(17), IntImm(5), IntImm(29)],
             chunks=2,
@@ -88,7 +90,28 @@ def test_chunk(self):
             input_type="float16",
         )
 
-    def test_dynamic_chunk(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_chunk_fp32(self):
+        self._run_chunk(
+            input_shape=[IntImm(17), IntImm(5), IntImm(29)],
+            chunks=2,
+            dim=0,
+            input_type="float32",
+        )
+        self._run_chunk(
+            input_shape=[IntImm(17), IntImm(5), IntImm(29)],
+            chunks=7,
+            dim=1,
+            input_type="float32",
+        )
+        self._run_chunk(
+            input_shape=[IntImm(17), IntImm(5), IntImm(29)],
+            chunks=11,
+            dim=2,
+            input_type="float32",
+        )
+
+    def test_dynamic_chunk_fp16(self):
         self._run_chunk(
             input_shape=[
                 IntVar(values=[13, 17], name="batch_dim"),
@@ -99,7 +122,10 @@ def test_dynamic_chunk(self):
             dim=1,
             input_type="float16",
         )
-        with self.assertRaises(RuntimeError) as context:
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "Not implemented: chunk along dynamic axes",
+        ):
             self._run_chunk(
                 input_shape=[
                     IntVar(values=[13, 17], name="batch_dim"),
@@ -110,8 +136,18 @@ def test_dynamic_chunk(self):
                 dim=0,
                 input_type="float16",
             )
-        self.assertTrue(
-            "Not implemented: chunk along dynamic axes" in str(context.exception)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dynamic_chunk_fp32(self):
+        self._run_chunk(
+            input_shape=[
+                IntVar(values=[13, 17], name="batch_dim"),
+                IntImm(5),
+                IntImm(29),
+            ],
+            chunks=2,
+            dim=1,
+            input_type="float32",
         )
 
 
diff --git a/tests/unittest/ops/test_clamp_nan_to_num.py b/tests/unittest/ops/test_clamp_nan_to_num.py
index 0dc6e5415..9e065acd9 100644
--- a/tests/unittest/ops/test_clamp_nan_to_num.py
+++ b/tests/unittest/ops/test_clamp_nan_to_num.py
@@ -23,17 +23,33 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ClampTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _create_shape_from_list(self, shape: List[int]) -> IntVar:
         if len(shape) > 1:
             return IntVar(shape)
         return IntImm(shape[0])
 
-    def _float_to_tensor(self, name: str, value: float) -> Tensor:
-        return Tensor(shape=[], dtype="float16", name=name, value=value)
+    def _float_to_tensor(
+        self,
+        name: str,
+        value: float,
+        dtype="float16",
+    ) -> Tensor:
+        return Tensor(
+            shape=[],
+            dtype=dtype,
+            name=name,
+            value=value,
+        )
 
     def _test_helper(
         self,
@@ -46,133 +62,218 @@ def _test_helper(
         test_name: str,
         func: FuncEnum,
         get_expected: Callable[[torch.Tensor], torch.Tensor],
+        dtype="float16",
     ):
         self.assertGreater(len(input_shape), 0)
         X = Tensor(
             shape=[self._create_shape_from_list(shape) for shape in input_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input",
             is_input=True,
         )
-        a_tensor = self._float_to_tensor("a", arg_a)
-        b_tensor = self._float_to_tensor("b", arg_b)
-        c_tensor = self._float_to_tensor("c", arg_c)
+        a_tensor = self._float_to_tensor("a", arg_a, dtype=dtype)
+        b_tensor = self._float_to_tensor("b", arg_b, dtype=dtype)
+        c_tensor = self._float_to_tensor("c", arg_c, dtype=dtype)
 
         result = ops.elementwise(func)(X, a_tensor, b_tensor, c_tensor)
         result._attrs["is_output"] = True
         result._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(result, target, "./tmp", test_name)
+        module = compile_model(result, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
+        torch_dtype = string_to_torch_dtype(dtype)
         for shape in itertools.product(*input_shape):
-            X_pt = torch.randn(shape, dtype=torch.half).cuda()
+            X_pt = get_random_torch_tensor(shape, dtype=dtype)
             if add_nans:
                 X_pt[0].fill_(float("nan"))
             if add_infs:
                 X_pt[1].fill_(float("inf"))
                 X_pt[2].fill_(-float("inf"))
 
-            actual = torch.empty(shape).cuda().half()
+            actual = torch.empty_like(X_pt)
             module.run_with_tensors([X_pt], [actual])
-
-            expected = get_expected(X_pt).cuda()
-            self.assertTrue(torch.equal(expected, actual))
+            expected = get_expected(X_pt, torch_dtype)
+            torch.testing.assert_close(expected, actual)
 
     def _test_nan_to_num(
         self,
-        test_num: int,
         input_shape: List[List[int]],
         nan_replacement: float,
         inf_replacement: float,
         neginf_replacement: float,
         add_nans: bool = False,
         add_infs: bool = False,
+        test_name: str = "nan_to_num",
+        dtype="float16",
     ):
         nan_to_num_pt = (
-            lambda x: x.to(torch.float)
+            lambda x, torch_dtype: x.to(torch.float)
             .nan_to_num(
-                posinf=inf_replacement, neginf=neginf_replacement, nan=nan_replacement
+                posinf=inf_replacement,
+                neginf=neginf_replacement,
+                nan=nan_replacement,
             )
-            .to(torch.half)
+            .to(torch_dtype)
         )
         self._test_helper(
-            input_shape,
-            nan_replacement,
-            inf_replacement,
-            neginf_replacement,
-            add_nans,
-            add_infs,
-            f"nan_to_num_{test_num}",
-            FuncEnum.NAN_TO_NUM,
-            nan_to_num_pt,
+            input_shape=input_shape,
+            arg_a=nan_replacement,
+            arg_b=inf_replacement,
+            arg_c=neginf_replacement,
+            add_nans=add_nans,
+            add_infs=add_infs,
+            test_name=test_name,
+            func=FuncEnum.NAN_TO_NUM,
+            get_expected=nan_to_num_pt,
+            dtype=dtype,
         )
 
     def _test_clamp_nan_to_num(
         self,
-        test_num: int,
         input_shape: List[List[int]],
         clamp_min: float,
         clamp_max: float,
         nan_replacement: float,
         add_nans: bool = False,
+        test_name: str = "clamp_nan_to_num",
+        dtype="float16",
     ):
         clamp_nan_to_num_pt = (
-            lambda x: x.to(torch.float)
+            lambda x, torch_dtype: x.to(torch.float)
             .clamp(clamp_min, clamp_max)
             .nan_to_num(nan=nan_replacement)
-            .to(torch.half)
+            .to(torch_dtype)
         )
         self._test_helper(
-            input_shape,
-            clamp_min,
-            clamp_max,
-            nan_replacement,
-            add_nans,
-            False,
-            f"clamp_nan_to_num_{test_num}",
-            FuncEnum.CLAMP_NAN_TO_NUM,
-            clamp_nan_to_num_pt,
+            input_shape=input_shape,
+            arg_a=clamp_min,
+            arg_b=clamp_max,
+            arg_c=nan_replacement,
+            add_nans=add_nans,
+            add_infs=False,
+            test_name=test_name,
+            func=FuncEnum.CLAMP_NAN_TO_NUM,
+            get_expected=clamp_nan_to_num_pt,
+            dtype=dtype,
         )
 
-    def test_clamp_nan_to_num(self):
+    def test_clamp_nan_to_num_fp16(self):
         clamp_arg_sets = [(-1.0, 2.0, 0.0), (-42.0, 2.0, 43.0)]
-        test_num = 0
-        for clamp_args in clamp_arg_sets:
+        for clamp_min, clamp_max, nan_replacement in clamp_arg_sets:
             self._test_clamp_nan_to_num(
-                test_num,
-                [[40, 2], [40], [40]],
-                *clamp_args,
+                input_shape=[[40, 2], [40], [40]],
+                clamp_min=clamp_min,
+                clamp_max=clamp_max,
+                nan_replacement=nan_replacement,
                 add_nans=False,
+                test_name="clamp_nan_to_num_fp16",
+                dtype="float16",
             )
             self._test_clamp_nan_to_num(
-                test_num + 1,
-                [[40, 3], [3], [3]],
-                *clamp_args,
+                input_shape=[[40, 3], [3], [3]],
+                clamp_min=clamp_min,
+                clamp_max=clamp_max,
+                nan_replacement=nan_replacement,
+                add_nans=True,
+                test_name="clamp_nan_to_num_fp16",
+                dtype="float16",
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_clamp_nan_to_num_fp32(self):
+        clamp_arg_sets = [(-1.0, 2.0, 0.0), (-42.0, 2.0, 43.0)]
+        for clamp_min, clamp_max, nan_replacement in clamp_arg_sets:
+            self._test_clamp_nan_to_num(
+                input_shape=[[40, 2], [40], [40]],
+                clamp_min=clamp_min,
+                clamp_max=clamp_max,
+                nan_replacement=nan_replacement,
+                add_nans=False,
+                test_name="clamp_nan_to_num_fp32",
+                dtype="float32",
+            )
+            self._test_clamp_nan_to_num(
+                input_shape=[[40, 3], [3], [3]],
+                clamp_min=clamp_min,
+                clamp_max=clamp_max,
+                nan_replacement=nan_replacement,
+                add_nans=True,
+                test_name="clamp_nan_to_num_fp32",
+                dtype="float32",
+            )
+
+    def test_nan_to_num_fp16(self):
+        clamp_arg_sets = [(-1.0, 2.0, 0.0), (-42.0, 2.0, 43.0)]
+        for nan_replacement, inf_replacement, neginf_replacement in clamp_arg_sets:
+            self._test_nan_to_num(
+                input_shape=[[40, 2], [40], [40]],
+                nan_replacement=nan_replacement,
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
+                add_nans=False,
+                add_infs=False,
+                test_name="nan_to_num_fp16",
+                dtype="float16",
+            )
+            self._test_nan_to_num(
+                input_shape=[[40, 3], [3], [3]],
+                nan_replacement=nan_replacement,
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
                 add_nans=True,
+                add_infs=True,
+                test_name="nan_to_num_fp16",
+                dtype="float16",
+            )
+            self._test_nan_to_num(
+                input_shape=[[40, 3], [3], [3]],
+                nan_replacement=float("inf"),
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
+                add_nans=True,
+                add_infs=True,
+                test_name="nan_to_num_fp16",
+                dtype="float16",
             )
-            test_num += 2
 
-    def test_nan_to_num(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_nan_to_num_fp32(self):
         clamp_arg_sets = [(-1.0, 2.0, 0.0), (-42.0, 2.0, 43.0)]
-        test_num = 0
-        for clamp_args in clamp_arg_sets:
+        for nan_replacement, inf_replacement, neginf_replacement in clamp_arg_sets:
             self._test_nan_to_num(
-                test_num,
-                [[40, 2], [40], [40]],
-                *clamp_args,
+                input_shape=[[40, 2], [40], [40]],
+                nan_replacement=nan_replacement,
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
                 add_nans=False,
                 add_infs=False,
+                test_name="nan_to_num_fp32",
+                dtype="float32",
+            )
+            self._test_nan_to_num(
+                input_shape=[[40, 3], [3], [3]],
+                nan_replacement=nan_replacement,
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
+                add_nans=True,
+                add_infs=True,
+                test_name="nan_to_num_fp32",
+                dtype="float32",
             )
             self._test_nan_to_num(
-                test_num + 1,
-                [[40, 3], [3], [3]],
-                *clamp_args,
+                input_shape=[[40, 3], [3], [3]],
+                nan_replacement=float("inf"),
+                inf_replacement=inf_replacement,
+                neginf_replacement=neginf_replacement,
                 add_nans=True,
                 add_infs=True,
+                test_name="nan_to_num_fp32",
+                dtype="float32",
             )
-            test_num += 2
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index 0a5eec59e..d9bc93a6e 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -12,10 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -23,6 +21,7 @@
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
 
 
 class ConcatenateTestCase(unittest.TestCase):
@@ -33,12 +32,6 @@ def __init__(self, *args, **kwargs):
     def _run_concatenate(
         self, *, concatenate_op, input_shapes, dim=None, input_type="float16"
     ):
-        logging.info(
-            "Test input shapes {input_shapes}, dim={dim}".format(
-                input_shapes=input_shapes, dim=dim
-            )
-        )
-
         # generate torch reference result
         input_tensors_pt = [
             get_random_torch_tensor(shape, input_type)
@@ -60,9 +53,6 @@ def _run_concatenate(
         Y = concatenate_op(inputs) if dim is None else concatenate_op(inputs, dim)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        y_shape = [d._attrs["values"][0] for d in Y._attrs["shape"]]
-
-        logging.info("AITemplate output_shape: {}".format(y_shape))
 
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", "concatenate", dll_name=dll_name)
@@ -70,19 +60,15 @@ def _run_concatenate(
         input_tensors_ait = {
             f"input_{idx}": input_tensors_pt[idx] for idx in range(len(inputs))
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(input_tensors_ait, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.equal(Y_pt, y))
+
         self.test_count += 1
 
     def _run_batch_concatenate(
         self, *, batch_sizes, concatenate_op, input_shapes, dim=0, input_type="float16"
     ):
-        logging.info(
-            "Batch test input shapes {input_shapes}, dim={dim}".format(
-                input_shapes=input_shapes, dim=dim
-            )
-        )
         target = detect_target()
         BATCH_DIM_NAME = "input_batch"
         batch_dim = shape_utils.gen_int_var_min_max(
@@ -110,7 +96,6 @@ def _run_batch_concatenate(
             Y, target, "./tmp", f"concatenate_batched_{batch_tag}", dll_name=dll_name
         )
         for batch in batch_sizes:
-            logging.info("checking batch: {}".format(batch))
             input_tensors_pt = [
                 get_random_torch_tensor([batch, *shape], input_type)
                 for i, shape in enumerate(input_shapes)
@@ -123,9 +108,9 @@ def _run_batch_concatenate(
             input_tensors_ait = {
                 f"input_{idx}": input_tensors_pt[idx] for idx in range(len(inputs))
             }
-            y = torch.empty_like(Y_pt).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors(input_tensors_ait, [y])
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.equal(Y_pt, y))
             self.test_count += 1
 
     def _run_masked_concatenate(
@@ -137,23 +122,16 @@ def _run_masked_concatenate(
         dim=None,
         input_type="float16",
     ):
-        logging.info(
-            "Test input shapes {input_shapes}, input_masks={input_masks}, dim={dim}".format(
-                input_shapes=input_shapes, input_masks=input_masks, dim=dim
-            )
-        )
-
         # generate torch reference result
         input_tensors_pt = [
             get_random_torch_tensor(shape, input_type)
             for i, shape in enumerate(input_shapes)
         ]
-        Y_pt = (
+        y_pt = (
             torch.cat(input_tensors_pt)
             if dim is None
             else torch.cat(input_tensors_pt, dim)
         )
-        y_pt = Y_pt.cpu().numpy()
 
         target = detect_target()
         inputs = [
@@ -165,7 +143,6 @@ def _run_masked_concatenate(
         Y = concatenate_op(inputs) if dim is None else concatenate_op(inputs, dim)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        y_shape = [d._attrs["values"][0] for d in Y._attrs["shape"]]
 
         # setup new input_masks, inputs and input_accessors
         inputs = [i for mask, i in zip(input_masks, inputs) if mask is True]
@@ -178,8 +155,6 @@ def _run_masked_concatenate(
         concatenate_op._attrs["inputs"] = inputs
         concatenate_op._attrs["input_accessors"] = input_accessors
 
-        logging.info("AITemplate output_shape: {}".format(y_shape))
-
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(
             Y, target, "./tmp", "concatenate_masked", dll_name=dll_name
@@ -189,20 +164,16 @@ def _run_masked_concatenate(
         for i, x_tensor_pt in enumerate(input_tensors_pt):
             if input_masks[i]:
                 inputs.append(x_tensor_pt)
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(y_pt)
         module.run_with_tensors(inputs, [y])
 
-        split_sections = []
-        split_offset = 0
-        for shape in input_shapes[:-1]:
-            split_offset = split_offset + shape[dim]
-            split_sections.append(split_offset)
+        split_sections = [shape[dim] for shape in input_shapes]
 
-        ys_pt = np.split(y_pt, split_sections, axis=dim)
-        ys = np.split(y.cpu().numpy(), split_sections, axis=dim)
+        ys_pt = torch.split(y_pt, split_sections, dim=dim)
+        ys = torch.split(y, split_sections, dim=dim)
         for mask, pt, actual in zip(input_masks, ys_pt, ys):
             if mask is True:
-                np.testing.assert_allclose(actual, pt, atol=1e-2, rtol=1e-2)
+                self.assertTrue(torch.equal(pt, actual))
         self.test_count += 1
 
     def test_batch_cat(self):
@@ -248,12 +219,6 @@ def test_batch_cat(self):
             input_shapes=([2, 1, 4], [2, 3, 4]),
             dim=2,
         )
-        self._run_batch_concatenate(
-            batch_sizes=[3, 5, 9],
-            concatenate_op=ops.concatenate(),
-            input_shapes=([2, 3, 4], [2, 3, 2]),
-            dim=3,
-        )
 
     def test_cat(self):
         self._run_concatenate(
@@ -321,11 +286,6 @@ def test_cat(self):
             dim=3,
         )
 
-        self._run_concatenate(
-            concatenate_op=ops.concatenate(),
-            input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]),
-        )
-
         self._run_concatenate(
             concatenate_op=ops.concatenate(),
             input_shapes=([12, 3, 4, 5], [3, 3, 4, 5], [7, 3, 4, 5]),
@@ -346,10 +306,6 @@ def test_cat(self):
             input_shapes=([2, 3, 4, 5], [2, 3, 4, 3], [2, 3, 4, 5]),
             dim=3,
         )
-        self._run_concatenate(
-            concatenate_op=ops.concatenate(),
-            input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]),
-        )
         self._run_concatenate(
             concatenate_op=ops.concatenate(),
             input_shapes=([2, 3, 4, 5], [2, 3, 4, 3], [2, 3, 4, 5]),
@@ -388,11 +344,31 @@ def test_masked_cat(self):
             input_masks=[False, True, False],
             dim=2,
         )
+
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_floats(self, dtype):
+        if detect_target().name() != "cuda" and dtype != "float16":
+            self.skipTest(
+                f"{detect_target().name()} backend is not supported for {dtype} input type"
+            )
+        self._run_concatenate(
+            concatenate_op=ops.concatenate(),
+            input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]),
+            input_type=dtype,
+        )
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate(),
             input_shapes=([2, 3, 4], [2, 3, 8], [2, 3, 16]),
             input_masks=[False, True, False],
             dim=2,
+            input_type=dtype,
+        )
+        self._run_batch_concatenate(
+            batch_sizes=[3, 5, 9],
+            concatenate_op=ops.concatenate(),
+            input_shapes=([2, 3, 4], [2, 3, 2]),
+            dim=3,
+            input_type=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_concatenate_tanh.py b/tests/unittest/ops/test_concatenate_tanh.py
index 2c24436a6..ef775f148 100644
--- a/tests/unittest/ops/test_concatenate_tanh.py
+++ b/tests/unittest/ops/test_concatenate_tanh.py
@@ -21,22 +21,28 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
-class ConcatenateTestCase(unittest.TestCase):
+class ConcatenateTanhTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
-        super(ConcatenateTestCase, self).__init__(*args, **kwargs)
+        super(ConcatenateTanhTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
 
     def _run_concatenate(
-        self, *, concatenate_op, input_shapes, dim=None, input_type="float16"
+        self,
+        *,
+        concatenate_op,
+        input_shapes,
+        dim=None,
+        test_name="concatenate_tanh_cat",
+        input_type="float16",
     ):
-        logging.info(
-            "Test input shapes {input_shapes}, dim={dim}".format(
-                input_shapes=input_shapes, dim=dim
-            )
-        )
+        logging.info(f"Test input shapes {input_shapes}, dim={dim}")
 
         # generate torch reference result
         input_tensors_pt = [
@@ -53,7 +59,10 @@ def _run_concatenate(
         target = detect_target()
         inputs = [
             Tensor(
-                shape=shape, dtype=input_type, name="input_{}".format(i), is_input=True
+                shape=shape,
+                dtype=input_type,
+                name=f"input_{i}",
+                is_input=True,
             )
             for i, shape in enumerate(input_shapes)
         ]
@@ -62,25 +71,29 @@ def _run_concatenate(
         Y._attrs["is_output"] = True
         y_shape = [d._attrs["values"][0] for d in Y._attrs["shape"]]
 
-        logging.info("AITemplate output_shape: {}".format(y_shape))
+        logging.info(f"AITemplate output_shape: {y_shape}")
 
-        module = compile_model(Y, target, "./tmp", "concatenate_tanh")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         input_tensors_ait = {
             f"input_{idx}": input_tensors_pt[idx] for idx in range(len(inputs))
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(input_tensors_ait, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     def _run_batch_concatenate(
-        self, *, batch_sizes, concatenate_op, input_shapes, dim=0, input_type="float16"
+        self,
+        *,
+        batch_sizes,
+        concatenate_op,
+        input_shapes,
+        dim=0,
+        test_name="concatenate_tanh_batch_cat",
+        input_type="float16",
     ):
-        logging.info(
-            "Batch test input shapes {input_shapes}, dim={dim}".format(
-                input_shapes=input_shapes, dim=dim
-            )
-        )
+        logging.info(f"Batch test input shapes {input_shapes}, dim={dim}")
         batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
         target = detect_target()
         inputs = [
@@ -90,7 +103,7 @@ def _run_batch_concatenate(
                     *shape,
                 ],
                 dtype=input_type,
-                name="input_{}".format(i),
+                name=f"input_{i}",
                 is_input=True,
             )
             for i, shape in enumerate(input_shapes)
@@ -98,10 +111,10 @@ def _run_batch_concatenate(
         Y = concatenate_op(inputs) if dim is None else concatenate_op(inputs, dim)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        batch_tag = "_".join([str(b) for b in batch_sizes])
-        module = compile_model(Y, target, "./tmp", f"concatenate_tanh_{batch_tag}")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
         for batch in batch_sizes:
-            logging.info("checking batch: {}".format(batch))
+            logging.info(f"checking batch: {batch}")
             input_tensors_pt = [
                 get_random_torch_tensor([batch, *shape], input_type)
                 for i, shape in enumerate(input_shapes)
@@ -127,12 +140,11 @@ def _run_masked_concatenate(
         input_shapes,
         input_masks,
         dim=None,
+        test_name="concatenate_tanh_masked_cat",
         input_type="float16",
     ):
         logging.info(
-            "Test input shapes {input_shapes}, input_masks={input_masks}, dim={dim}".format(
-                input_shapes=input_shapes, input_masks=input_masks, dim=dim
-            )
+            f"Test input shapes {input_shapes}, input_masks={input_masks}, dim={dim}"
         )
 
         # generate torch reference result
@@ -150,7 +162,10 @@ def _run_masked_concatenate(
         target = detect_target()
         inputs = [
             Tensor(
-                shape=shape, dtype=input_type, name="input_{}".format(i), is_input=True
+                shape=shape,
+                dtype=input_type,
+                name=f"input_{i}",
+                is_input=True,
             )
             for i, shape in enumerate(input_shapes)
         ]
@@ -170,15 +185,17 @@ def _run_masked_concatenate(
         concatenate_op._attrs["inputs"] = inputs
         concatenate_op._attrs["input_accessors"] = input_accessors
 
-        logging.info("AITemplate output_shape: {}".format(y_shape))
+        logging.info(f"AITemplate output_shape: {y_shape}")
 
-        module = compile_model(Y, target, "./tmp", "concatenate_tanh")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         inputs = []
         for i, x_tensor_pt in enumerate(input_tensors_pt):
             if input_masks[i]:
                 inputs.append(x_tensor_pt)
-        y = torch.empty(y_shape).cuda().half()
+
+        y = get_torch_empty_tensor(y_shape, dtype=input_type)
         module.run_with_tensors(inputs, [y])
 
         split_sections = []
@@ -193,131 +210,225 @@ def _run_masked_concatenate(
             if mask is True:
                 np.testing.assert_allclose(actual, pt, atol=1e-2, rtol=1e-2)
 
-    def test_batch_cat(self):
+    def test_batch_cat_fp16(self):
         self._run_batch_concatenate(
             batch_sizes=[1, 1],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([1], [1]),
             dim=0,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[1, 1],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([1], [1]),
             dim=1,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=0,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=1,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=2,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=3,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 1, 4], [2, 3, 4]),
             dim=2,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
         )
         self._run_batch_concatenate(
             batch_sizes=[3, 5, 9],
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 2]),
             dim=3,
+            test_name="concatenate_tanh_batch_cat_fp16",
+            input_type="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_batch_cat_fp32(self):
+        self._run_batch_concatenate(
+            batch_sizes=[1, 1],
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1], [1]),
+            dim=0,
+            test_name="concatenate_tanh_batch_cat_fp32",
+            input_type="float32",
+        )
+        self._run_batch_concatenate(
+            batch_sizes=[1, 1],
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1], [1]),
+            dim=1,
+            test_name="concatenate_tanh_batch_cat_fp32",
+            input_type="float32",
+        )
+        self._run_batch_concatenate(
+            batch_sizes=[3, 5, 9],
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 3, 4], [2, 3, 4]),
+            dim=0,
+            test_name="concatenate_tanh_batch_cat_fp32",
+            input_type="float32",
+        )
+        self._run_batch_concatenate(
+            batch_sizes=[3, 5, 9],
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 3, 4], [2, 3, 4]),
+            dim=1,
+            test_name="concatenate_tanh_batch_cat_fp32",
+            input_type="float32",
         )
 
-    def test_cat(self):
+    def test_cat_fp16(self):
         self._run_concatenate(
-            concatenate_op=ops.concatenate_tanh(), input_shapes=([1], [1]), dim=0
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1], [1]),
+            dim=0,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
-            concatenate_op=ops.concatenate_tanh(), input_shapes=([1, 1], [1, 1]), dim=0
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1, 1], [1, 1]),
+            dim=0,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
-            concatenate_op=ops.concatenate_tanh(), input_shapes=([1, 1], [1, 1]), dim=1
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1, 1], [1, 1]),
+            dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
-            concatenate_op=ops.concatenate_tanh(), input_shapes=([2, 1], [2, 1]), dim=1
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 1], [2, 1]),
+            dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
-            concatenate_op=ops.concatenate_tanh(), input_shapes=[[2, 3, 4]], dim=1
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=[[2, 3, 4]],
+            dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=0,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 4]),
             dim=2,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [3, 3, 4], [4, 3, 4]),
             dim=0,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 4, 4], [2, 5, 4]),
             dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 6], [2, 3, 5], [2, 3, 4]),
             dim=2,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([1024, 32, 32], [1024, 16, 32], [1024, 8, 32]),
             dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([12, 3, 4, 5], [3, 3, 4, 5], [7, 3, 4, 5]),
             dim=0,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4, 5], [2, 3, 4, 5], [2, 3, 4, 5]),
             dim=1,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 9, 5], [2, 3, 4, 5], [2, 3, 1, 5]),
             dim=2,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4, 5], [2, 3, 4, 3], [2, 3, 4, 5]),
             dim=3,
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
-
         self._run_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]),
+            test_name="concatenate_tanh_cat_fp16",
+            input_type="float16",
         )
 
         # self._run_concatenate(concatenate_op=ops.concatenate(),
@@ -331,36 +442,120 @@ def test_cat(self):
         # self._run_concatenate(concatenate_op=ops.concatenate(),
         #                       input_shapes=([1, 3, 1], [2, 3, 1], [3, 3, 1]))
 
-    def test_masked_cat(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_cat_fp32(self):
+        self._run_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1], [1]),
+            dim=0,
+            test_name="concatenate_tanh_cat_fp32",
+            input_type="float32",
+        )
+        self._run_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1, 1], [1, 1]),
+            dim=0,
+            test_name="concatenate_tanh_cat_fp32",
+            input_type="float32",
+        )
+        self._run_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1, 1], [1, 1]),
+            dim=1,
+            test_name="concatenate_tanh_cat_fp32",
+            input_type="float32",
+        )
+        self._run_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 1], [2, 1]),
+            dim=1,
+            test_name="concatenate_tanh_cat_fp32",
+            input_type="float32",
+        )
+
+    def test_masked_cat_fp16(self):
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2], [2]),
             input_masks=[True, False],
             dim=0,
+            test_name="concatenate_tanh_masked_cat_fp16",
+            input_type="float16",
         )
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3], [5, 3], [3, 3]),
             input_masks=[False, True, True],
             dim=0,
+            test_name="concatenate_tanh_masked_cat_fp16",
+            input_type="float16",
         )
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 11, 4], [2, 5, 4], [2, 2, 4]),
             input_masks=[True, False, True],
             dim=1,
+            test_name="concatenate_tanh_masked_cat_fp16",
+            input_type="float16",
         )
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([1, 1, 1], [1, 1, 2], [1, 1, 4]),
             input_masks=[False, True, False],
             dim=2,
+            test_name="concatenate_tanh_masked_cat_fp16",
+            input_type="float16",
         )
         self._run_masked_concatenate(
             concatenate_op=ops.concatenate_tanh(),
             input_shapes=([2, 3, 4], [2, 3, 8], [2, 3, 16]),
             input_masks=[False, True, False],
             dim=2,
+            test_name="concatenate_tanh_masked_cat_fp16",
+            input_type="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_masked_cat_fp32(self):
+        self._run_masked_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2], [2]),
+            input_masks=[True, False],
+            dim=0,
+            test_name="concatenate_tanh_masked_cat_fp32",
+            input_type="float32",
+        )
+        self._run_masked_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 3], [5, 3], [3, 3]),
+            input_masks=[False, True, True],
+            dim=0,
+            test_name="concatenate_tanh_masked_cat_fp32",
+            input_type="float32",
+        )
+        self._run_masked_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 11, 4], [2, 5, 4], [2, 2, 4]),
+            input_masks=[True, False, True],
+            dim=1,
+            test_name="concatenate_tanh_masked_cat_fp32",
+            input_type="float32",
+        )
+        self._run_masked_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([1, 1, 1], [1, 1, 2], [1, 1, 4]),
+            input_masks=[False, True, False],
+            dim=2,
+            test_name="concatenate_tanh_masked_cat_fp32",
+            input_type="float32",
+        )
+        self._run_masked_concatenate(
+            concatenate_op=ops.concatenate_tanh(),
+            input_shapes=([2, 3, 4], [2, 3, 8], [2, 3, 32]),
+            input_masks=[False, True, False],
+            dim=2,
+            test_name="concatenate_tanh_masked_cat_fp32",
+            input_type="float32",
         )
 
 
diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index 7a0a3881c..ee92173cb 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -19,19 +19,29 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ConvTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d",
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
+            shape=[256, 3, 3, 128],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
         )
         OP = ops.conv2d(stride=1, pad=1, dilate=1)
         if copy_op:
@@ -39,24 +49,50 @@ def _test_fp16(self, batch=4, copy_op=False):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"conv2d_{copy_op}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 128, 3, 3).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors({"input_0": x, "input_1": w}, [y])
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv(
+            test_name="conv2d_fp16",
+            dtype="float16",
+        )
+        self._test_conv(
+            copy_op=True,
+            test_name="conv2d_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv(
+            test_name="conv2d_fp32",
+            dtype="float32",
+        )
+        self._test_conv(
+            copy_op=True,
+            test_name="conv2d_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv2d_bias_add.py b/tests/unittest/ops/test_conv2d_bias_add.py
index 4501d1ca8..7ebe7a185 100644
--- a/tests/unittest/ops/test_conv2d_bias_add.py
+++ b/tests/unittest/ops/test_conv2d_bias_add.py
@@ -15,28 +15,46 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasAddTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv_bias_add(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_add",
+        dtype="float16",
+    ):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
             shape=[IntImm(batch), HH, WW, CI],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
 
-        W = Tensor(shape=[CO, 3, 3, CI], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
+        W = Tensor(
+            shape=[CO, 3, 3, CI],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[CO],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         R = Tensor(
             shape=[IntImm(batch), HH, WW, CO],
-            dtype="float16",
+            dtype=dtype,
             name="input_3",
             is_input=True,
         )
@@ -46,12 +64,12 @@ def _test_fp16(self, batch=4, copy_op=False):
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv2d_bias_add")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, CI, HH, WW).cuda().half()
-        W_pt = torch.randn(CO, CI, 3, 3).cuda().half()
-        B_pt = torch.randn(1, CO, 1, 1).cuda().half()
-        R_pt = torch.randn(batch, CO, HH, WW).cuda().half()
+        X_pt = get_random_torch_tensor([batch, CI, HH, WW], dtype=dtype)
+        W_pt = get_random_torch_tensor([CO, CI, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
+        R_pt = get_random_torch_tensor([batch, CO, HH, WW], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt + R_pt
 
@@ -59,17 +77,43 @@ def _test_fp16(self, batch=4, copy_op=False):
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         r = R_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze(), "input_3": r}
-        y = torch.empty([batch, HH, WW, CO]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute(0, 3, 1, 2)
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_add(
+            test_name="conv2d_bias_add_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_add(
+            copy_op=True,
+            test_name="conv2d_bias_add_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_add(
+            test_name="conv2d_bias_add_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_add(
+            copy_op=True,
+            test_name="conv2d_bias_add_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
index ea192241d..a0141cc7e 100644
--- a/tests/unittest/ops/test_conv3d.py
+++ b/tests/unittest/ops/test_conv3d.py
@@ -14,18 +14,17 @@
 #
 import unittest
 
-import numpy as np
-
 import torch
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
-class ConvTestCase(unittest.TestCase):
-    def _test_fp16(
+class Conv3dTestCase(unittest.TestCase):
+    def _test_conv3d(
         self,
         tt,
         hh,
@@ -38,50 +37,166 @@ def _test_fp16(
         stride=(1, 1, 1),
         pad=(1, 1, 1),
         batch=4,
-        test_case="",
+        test_name="conv3d",
+        dtype="float16",
     ):
         target = detect_target()
 
         X = Tensor(
             shape=[IntImm(batch), tt, hh, ww, ci],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[co, kt, kh, kw, ci], dtype="float16", name="input_1", is_input=True
+            shape=[co, kt, kh, kw, ci],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
         )
         OP = ops.conv3d(stride=stride, pad=pad, dilate=1)
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"conv3d_{test_case}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, ci, tt, hh, ww).cuda().half()
-        W_pt = torch.randn(co, ci, kt, kh, kw).cuda().half()
+        X_pt = get_random_torch_tensor([batch, ci, tt, hh, ww], dtype=dtype)
+        W_pt = get_random_torch_tensor([co, ci, kt, kh, kw], dtype=dtype)
         Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, stride=stride, padding=pad)
         x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
-
-        Y_pt_transpose = Y_pt.permute(0, 2, 3, 4, 1)
-        y_shape = list(Y_pt_transpose.shape)
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
         module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+        y_transpose = y.permute((0, 4, 1, 2, 3))
 
-        np.testing.assert_allclose(
-            Y_pt_transpose.cpu().numpy(), y.cpu().numpy(), atol=1e-2, rtol=1e-2
-        )
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_fp16(
-            4, 224, 224, 8, 96, 3, 5, 5, stride=(2, 4, 4), pad=(1, 2, 2), test_case=1
+        self._test_conv3d(
+            4,
+            224,
+            224,
+            8,
+            96,
+            3,
+            5,
+            5,
+            stride=(2, 4, 4),
+            pad=(1, 2, 2),
+            test_name="conv3d_fp16_1",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            64,
+            256,
+            1,
+            1,
+            1,
+            test_name="conv3d_fp16_2",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            64,
+            64,
+            1,
+            1,
+            1,
+            test_name="conv3d_fp16_3",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            64,
+            64,
+            3,
+            3,
+            3,
+            test_name="conv3d_fp16_4",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            256,
+            64,
+            1,
+            1,
+            1,
+            test_name="conv3d_fp16_5",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            256,
+            512,
+            1,
+            1,
+            1,
+            stride=(2, 2, 2),
+            test_name="conv3d_fp16_6",
+            dtype="float16",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            128,
+            128,
+            3,
+            3,
+            3,
+            stride=(2, 2, 2),
+            test_name="conv3d_fp16_7",
+            dtype="float16",
+        )
+
+    @unittest.skip("no fp32 kernels are available for conv3d")
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv3d(
+            4,
+            224,
+            224,
+            8,
+            96,
+            3,
+            5,
+            5,
+            stride=(2, 4, 4),
+            pad=(1, 2, 2),
+            test_name="conv3d_fp32_1",
+            dtype="float32",
+        )
+        self._test_conv3d(
+            56,
+            56,
+            56,
+            64,
+            256,
+            1,
+            1,
+            1,
+            test_name="conv3d_fp32_2",
+            dtype="float32",
         )
-        self._test_fp16(56, 56, 56, 64, 256, 1, 1, 1, test_case=2)
-        self._test_fp16(56, 56, 56, 64, 64, 1, 1, 1, test_case=3)
-        self._test_fp16(56, 56, 56, 64, 64, 3, 3, 3, test_case=4)
-        self._test_fp16(56, 56, 56, 256, 64, 1, 1, 1, test_case=5)
-        self._test_fp16(56, 56, 56, 256, 512, 1, 1, 1, stride=(2, 2, 2), test_case=6)
-        self._test_fp16(56, 56, 56, 128, 128, 3, 3, 3, stride=(2, 2, 2), test_case=7)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv3d_profiler_cache.py b/tests/unittest/ops/test_conv3d_profiler_cache.py
new file mode 100644
index 000000000..816b04b3f
--- /dev/null
+++ b/tests/unittest/ops/test_conv3d_profiler_cache.py
@@ -0,0 +1,201 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+import unittest
+from unittest.mock import patch
+
+from aitemplate.backend.profiler_cache import ProfileCacheDB
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import DynamicProfileStrategy
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+
+
+@unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
+class Conv3DProfilerCacheTestCase(unittest.TestCase):
+    def _test(
+        self,
+        first_dim,
+        logger,
+        test_name="conv3d",
+        tt=56,
+        hh=56,
+        ww=56,
+        ci=64,
+        co=256,
+        kt=1,
+        kh=1,
+        kw=1,
+        stride=(1, 1, 1),
+        pad=(1, 1, 1),
+    ):
+        target = detect_target()
+
+        X = Tensor(
+            shape=[first_dim, tt, hh, ww, ci],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[co, kt, kh, kw, ci],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        OP = ops.conv3d(stride=stride, pad=pad, dilate=1)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        with self.assertLogs(
+            logger=logger,
+            level="INFO",
+        ) as logs:
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                "conv3d",
+                dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+            )
+
+        return "\n".join(logs.output)
+
+    def _run_test(
+        self,
+        first_dim,
+        test_name,
+        logger,
+    ):
+        old_trick = os.environ.get("TRICK_CI_ENV", None)
+        old_cache = os.environ.get("CACHE_DIR", None)
+        try:
+            os.environ["TRICK_CI_ENV"] = "1"
+            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            return self._test(
+                first_dim=first_dim,
+                logger=logger,
+                test_name=test_name,
+            )
+        finally:
+            if old_trick is not None:
+                os.environ["TRICK_CI_ENV"] = old_trick
+            else:
+                os.environ.pop("TRICK_CI_ENV")
+            if old_cache is not None:
+                os.environ["CACHE_DIR"] = old_cache
+            else:
+                os.environ.pop("CACHE_DIR")
+
+    def test_conv3d_profiler_cache(self):
+        first_dim = IntImm(4)
+        test_name = "conv3d_profiler_cache"
+        logger = "aitemplate.compiler.transform.profile"
+
+        run1_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run1_logs)
+
+        run2_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 0 profilers", run2_logs)
+
+    def test_conv3d_profiler_cache_versioning(self):
+        first_dim = IntImm(4)
+        test_name = "conv3d_profiler_cache_versioning"
+        logger = "aitemplate.backend.profiler_cache"
+        cache_version_property = "conv3d_cache_version"
+        target_name = detect_target().name()
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=1,  # version
+        ):
+            run1_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv3d_1' does not exist in the db",
+                run1_before_version_change_logs,
+            )
+
+            run2_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv3d_1' exists in the db",
+                run2_before_version_change_logs,
+            )
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=2,  # version
+        ):
+            run1_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv3d_2' does not exist in the db",
+                run1_after_version_change_logs,
+            )
+
+            run2_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv3d_2' exists in the db",
+                run2_after_version_change_logs,
+            )
+
+    def test_conv3d_profiler_cache_dynamic(self):
+        first_dim = IntVar([2, 8])
+        test_name = "conv3d_profiler_cache_dynamic"
+        logger = "aitemplate.compiler.transform.profile"
+
+        run1_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run1_logs)
+
+        run2_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run2_logs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index c1b0d16a0..15193f057 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -19,48 +19,89 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ConvBiasTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv_bias(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias",
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
+            shape=[256, 3, 3, 128],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[256],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias(stride=1, pad=1, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv2d_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 128, 3, 3).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias(
+            test_name="conv2d_bias_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias(
+            copy_op=True,
+            test_name="conv2d_bias_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias(
+            test_name="conv2d_bias_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias(
+            copy_op=True,
+            test_name="conv2d_bias_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_act_few_channels.py b/tests/unittest/ops/test_conv_bias_act_few_channels.py
index f284d1111..7c91ad840 100644
--- a/tests/unittest/ops/test_conv_bias_act_few_channels.py
+++ b/tests/unittest/ops/test_conv_bias_act_few_channels.py
@@ -19,6 +19,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 def hard_swish(x):
@@ -27,88 +28,176 @@ def hard_swish(x):
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class ConvBiasReluTestCase(unittest.TestCase):
-    def _test_relu(self, HH=224, WW=224, CI=4, CO=64, batch=1, copy_op=False):
+class ConvBiasActFewChannelsTestCase(unittest.TestCase):
+    def _test_conv_bias_relu_few_channels(
+        self,
+        HH=224,
+        WW=224,
+        CI=4,
+        CO=64,
+        batch=1,
+        copy_op=False,
+        test_name="conv2d_bias_relu_few_channels",
+        dtype="float16",
+    ):
         KK = 7
         stride = 2
         pad = 3
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), HH, WW, CI],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[CO, KK, KK, CI], dtype="float16", name="input_1", is_input=True
+            shape=[CO, KK, KK, CI],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[CO],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_relu_few_channels(stride=stride, pad=pad, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias_relu_few_channels(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv_bias_relu_few_channels")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, CI, HH, WW).cuda().half()
-        W_pt = torch.randn(CO, CI, KK, KK).cuda().half()
-        B_pt = torch.randn(1, CO, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, CI, HH, WW], dtype=dtype)
+        W_pt = get_random_torch_tensor([CO, CI, KK, KK], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=pad, stride=stride)
         Y_pt = Y_pt + B_pt
         Y_pt = torch.nn.functional.relu(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}
-        y = torch.empty([batch, HH // stride, WW // stride, CO]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
-        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_relu(self):
-        self._test_relu()
-        self._test_relu(copy_op=True)
+    def test_relu_fp16(self):
+        self._test_conv_bias_relu_few_channels(
+            test_name="conv_bias_relu_few_channels_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_relu_few_channels(
+            copy_op=True,
+            test_name="conv_bias_relu_few_channels_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_relu_fp32(self):
+        self._test_conv_bias_relu_few_channels(
+            test_name="conv_bias_relu_few_channels_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_relu_few_channels(
+            copy_op=True,
+            test_name="conv_bias_relu_few_channels_fp32_copy_op",
+            dtype="float32",
+        )
 
-    def _test_hardswish(self, HH=224, WW=224, CI=4, CO=64, batch=1, copy_op=False):
+    def _test_conv_bias_hardswish_few_channels(
+        self,
+        HH=224,
+        WW=224,
+        CI=4,
+        CO=64,
+        batch=1,
+        copy_op=False,
+        test_name="conv2d_bias_hardswish_few_channels",
+        dtype="float16",
+    ):
         KK = 7
         stride = 2
         pad = 3
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), HH, WW, CI],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[CO, KK, KK, CI], dtype="float16", name="input_1", is_input=True
+            shape=[CO, KK, KK, CI],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[CO],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_hardswish_few_channels(stride=stride, pad=pad, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias_hardswish_few_channels(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv_bias_hardswish_few_channels")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, CI, HH, WW).cuda().half()
-        W_pt = torch.randn(CO, CI, KK, KK).cuda().half()
-        B_pt = torch.randn(1, CO, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, CI, HH, WW], dtype=dtype)
+        W_pt = get_random_torch_tensor([CO, CI, KK, KK], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=pad, stride=stride)
         Y_pt = Y_pt + B_pt
         Y_pt = hard_swish(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}
-        y = torch.empty([batch, HH // stride, WW // stride, CO]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
-        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_hardswish(self):
-        self._test_hardswish()
-        self._test_hardswish(copy_op=True)
+    def test_hardswish_fp16(self):
+        self._test_conv_bias_hardswish_few_channels(
+            test_name="conv_bias_hardswish_few_channels_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_hardswish_few_channels(
+            copy_op=True,
+            test_name="conv_bias_hardswish_few_channels_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_hardswish_fp32(self):
+        self._test_conv_bias_hardswish_few_channels(
+            test_name="conv_bias_hardswish_few_channels_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_hardswish_few_channels(
+            copy_op=True,
+            test_name="conv_bias_hardswish_few_channels_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_add_hardswish.py b/tests/unittest/ops/test_conv_bias_add_hardswish.py
index d390d4303..ff7ce64a0 100644
--- a/tests/unittest/ops/test_conv_bias_add_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_add_hardswish.py
@@ -15,9 +15,11 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 def hard_swish(x):
@@ -26,22 +28,38 @@ def hard_swish(x):
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class ConvBiasHardswishAddTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+class ConvBiasAddHardswishTestCase(unittest.TestCase):
+    def _test_conv_bias_add_hardswish(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_add_hardswish",
+        dtype="float16",
+    ):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
             shape=[IntImm(batch), HH, WW, CI],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
 
-        W = Tensor(shape=[CO, 3, 3, CI], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
+        W = Tensor(
+            shape=[CO, 3, 3, CI],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[CO],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         R = Tensor(
             shape=[IntImm(batch), HH, WW, CO],
-            dtype="float16",
+            dtype=dtype,
             name="input_3",
             is_input=True,
         )
@@ -51,12 +69,12 @@ def _test_fp16(self, batch=4, copy_op=False):
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv2d_bias_add_hardswish")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, CI, HH, WW).cuda().half()
-        W_pt = torch.randn(CO, CI, 3, 3).cuda().half()
-        B_pt = torch.randn(1, CO, 1, 1).cuda().half()
-        R_pt = torch.randn(batch, CO, HH, WW).cuda().half()
+        X_pt = get_random_torch_tensor([batch, CI, HH, WW], dtype=dtype)
+        W_pt = get_random_torch_tensor([CO, CI, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
+        R_pt = get_random_torch_tensor([batch, CO, HH, WW], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt + R_pt
         Y_pt = hard_swish(Y_pt)
@@ -65,14 +83,40 @@ def _test_fp16(self, batch=4, copy_op=False):
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         r = R_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze(), "input_3": r}
-        y = torch.empty([batch, HH, WW, CO]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute(0, 3, 1, 2)
-        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_add_hardswish(
+            test_name="conv2d_bias_add_hardswish_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_add_hardswish(
+            copy_op=True,
+            test_name="conv2d_bias_add_hardswish_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_add_hardswish(
+            test_name="conv2d_bias_add_hardswish_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_add_hardswish(
+            copy_op=True,
+            test_name="conv2d_bias_add_hardswish_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index bace7be14..5128e4908 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -15,27 +15,46 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
-class ConvBiasReluAddTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class ConvBiasAddReluTestCase(unittest.TestCase):
+    def _test_conv_bias_add_relu(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_add_relu",
+        dtype="float16",
+    ):
         target = detect_target()
         CO, HH, WW, CI = 256, 28, 28, 128
         X = Tensor(
             shape=[IntImm(batch), HH, WW, CI],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
 
-        W = Tensor(shape=[CO, 3, 3, CI], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[CO], dtype="float16", name="input_2", is_input=True)
+        W = Tensor(
+            shape=[CO, 3, 3, CI],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[CO],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         R = Tensor(
             shape=[IntImm(batch), HH, WW, CO],
-            dtype="float16",
+            dtype=dtype,
             name="input_3",
             is_input=True,
         )
@@ -45,12 +64,12 @@ def _test_fp16(self, batch=4, copy_op=False):
         Y = OP(X, W, B, R)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv2d_bias_add_relu")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, CI, HH, WW).cuda().half()
-        W_pt = torch.randn(CO, CI, 3, 3).cuda().half()
-        B_pt = torch.randn(1, CO, 1, 1).cuda().half()
-        R_pt = torch.randn(batch, CO, HH, WW).cuda().half()
+        X_pt = get_random_torch_tensor([batch, CI, HH, WW], dtype=dtype)
+        W_pt = get_random_torch_tensor([CO, CI, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
+        R_pt = get_random_torch_tensor([batch, CO, HH, WW], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt + R_pt
         Y_pt = torch.nn.functional.relu(Y_pt)
@@ -59,17 +78,43 @@ def _test_fp16(self, batch=4, copy_op=False):
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         r = R_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze(), "input_3": r}
-        y = torch.empty([batch, HH, WW, CO]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute(0, 3, 1, 2)
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_add_relu(
+            test_name="conv2d_bias_add_relu_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_add_relu(
+            copy_op=True,
+            test_name="conv2d_bias_add_relu_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_add_relu(
+            test_name="conv2d_bias_add_relu_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_add_relu(
+            copy_op=True,
+            test_name="conv2d_bias_add_relu_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_hardswish.py b/tests/unittest/ops/test_conv_bias_hardswish.py
index 6a424b1af..a56b9995f 100644
--- a/tests/unittest/ops/test_conv_bias_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_hardswish.py
@@ -19,6 +19,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 def hard_swish(x):
@@ -28,29 +29,43 @@ def hard_swish(x):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvBiasHardswishTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv_bias_hardswish(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_hardswish",
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
+            shape=[256, 3, 3, 128],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[256],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_hardswish(stride=1, pad=1, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias_hardswish(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv_bias_hardswish")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 128, 3, 3).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt
         Y_pt = hard_swish(Y_pt)
@@ -62,14 +77,40 @@ def _test_fp16(self, batch=4, copy_op=False):
         # np.savetxt("x.txt", x.flatten())
         # np.savetxt("w.txt", w.flatten())
         # np.savetxt("b.txt", b.flatten())
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
-        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_hardswish(
+            test_name="conv2d_bias_hardswish_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_hardswish(
+            copy_op=True,
+            test_name="conv2d_bias_hardswish_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_hardswish(
+            test_name="conv2d_bias_hardswish_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_hardswish(
+            copy_op=True,
+            test_name="conv2d_bias_hardswish_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_relu.py b/tests/unittest/ops/test_conv_bias_relu.py
index 1ab18b4ff..28364e1e6 100644
--- a/tests/unittest/ops/test_conv_bias_relu.py
+++ b/tests/unittest/ops/test_conv_bias_relu.py
@@ -19,49 +19,90 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ConvBiasReluTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv_bias_relu(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_relu",
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
+            shape=[256, 3, 3, 128],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[256],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_relu(stride=1, pad=1, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias_relu(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv_bias_relu")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 128, 3, 3).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt
         Y_pt = torch.nn.functional.relu(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_relu(
+            test_name="conv2d_bias_relu_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_relu(
+            copy_op=True,
+            test_name="conv2d_bias_relu_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_relu(
+            test_name="conv2d_bias_relu_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_relu(
+            copy_op=True,
+            test_name="conv2d_bias_relu_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_bias_sigmoid.py b/tests/unittest/ops/test_conv_bias_sigmoid.py
index c9e3ad3f6..9cd3b38dc 100644
--- a/tests/unittest/ops/test_conv_bias_sigmoid.py
+++ b/tests/unittest/ops/test_conv_bias_sigmoid.py
@@ -19,49 +19,90 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ConvBiasSigmoidTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+    def _test_conv_bias_sigmoid(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="conv2d_bias_sigmoid",
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 128],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 3, 3, 128], dtype="float16", name="input_1", is_input=True
+            shape=[256, 3, 3, 128],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[256],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.conv2d_bias_sigmoid(stride=1, pad=1, dilate=1)
         if copy_op:
             OP = ops.conv2d_bias_sigmoid(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "conv_bias_sigmoid")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 128, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 128, 3, 3).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
         Y_pt = Y_pt + B_pt
         Y_pt = torch.sigmoid(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         inputs = {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_conv_bias_sigmoid(
+            test_name="conv2d_bias_sigmoid_fp16",
+            dtype="float16",
+        )
+        self._test_conv_bias_sigmoid(
+            copy_op=True,
+            test_name="conv2d_bias_sigmoid_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_bias_sigmoid(
+            test_name="conv2d_bias_sigmoid_fp32",
+            dtype="float32",
+        )
+        self._test_conv_bias_sigmoid(
+            copy_op=True,
+            test_name="conv2d_bias_sigmoid_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_depthwise.py b/tests/unittest/ops/test_conv_depthwise.py
index 8f8708a78..799dc698e 100644
--- a/tests/unittest/ops/test_conv_depthwise.py
+++ b/tests/unittest/ops/test_conv_depthwise.py
@@ -50,9 +50,9 @@ def test_fp16(self, batch=4):
         self.assertFalse(y_transpose.isnan().any())
         self.assertFalse(y_transpose.isinf().any())
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_profiler_cache.py b/tests/unittest/ops/test_conv_profiler_cache.py
new file mode 100644
index 000000000..039f54053
--- /dev/null
+++ b/tests/unittest/ops/test_conv_profiler_cache.py
@@ -0,0 +1,190 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+import unittest
+from unittest.mock import patch
+
+from aitemplate.backend.profiler_cache import ProfileCacheDB
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import DynamicProfileStrategy
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+
+
+class ConvProfilerCacheTestCase(unittest.TestCase):
+    def _test(
+        self,
+        first_dim,
+        logger,
+        test_name="conv2d",
+    ):
+        target = detect_target()
+
+        X = Tensor(
+            shape=[first_dim, 28, 28, 128],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[256, 3, 3, 128],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        OP = ops.conv2d(stride=1, pad=1, dilate=1)
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        with self.assertLogs(
+            logger=logger,
+            level="INFO",
+        ) as logs:
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name,
+                dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+            )
+
+        return "\n".join(logs.output)
+
+    def _run_test(
+        self,
+        first_dim,
+        test_name,
+        logger,
+    ):
+        old_trick = os.environ.get("TRICK_CI_ENV", None)
+        old_cache = os.environ.get("CACHE_DIR", None)
+        try:
+            os.environ["TRICK_CI_ENV"] = "1"
+            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            return self._test(
+                first_dim=first_dim,
+                logger=logger,
+                test_name=test_name,
+            )
+        finally:
+            if old_trick is not None:
+                os.environ["TRICK_CI_ENV"] = old_trick
+            else:
+                os.environ.pop("TRICK_CI_ENV")
+            if old_cache is not None:
+                os.environ["CACHE_DIR"] = old_cache
+            else:
+                os.environ.pop("CACHE_DIR")
+
+    def test_conv_profiler_cache(self):
+        first_dim = IntImm(4)
+        test_name = "conv2d_profiler_cache"
+        logger = "aitemplate.compiler.transform.profile"
+
+        run1_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run1_logs)
+
+        run2_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 0 profilers", run2_logs)
+
+    def test_conv_profiler_cache_versioning(self):
+        first_dim = IntImm(4)
+        test_name = "conv2d_profiler_cache_versioning"
+        logger = "aitemplate.backend.profiler_cache"
+        cache_version_property = "conv_cache_version"
+        target_name = detect_target().name()
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=1,  # version
+        ):
+            run1_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv_1' does not exist in the db",
+                run1_before_version_change_logs,
+            )
+
+            run2_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv_1' exists in the db",
+                run2_before_version_change_logs,
+            )
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=2,  # version
+        ):
+            run1_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv_2' does not exist in the db",
+                run1_after_version_change_logs,
+            )
+
+            run2_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_conv_2' exists in the db",
+                run2_after_version_change_logs,
+            )
+
+    def test_conv_profiler_cache_dynamic(self):
+        first_dim = IntVar([2, 8])
+        test_name = "conv2d_profiler_cache_dynamic"
+        logger = "aitemplate.compiler.transform.profile"
+
+        run1_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run1_logs)
+
+        run2_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run2_logs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/ops/test_cross_attention.py b/tests/unittest/ops/test_cross_attention.py
index 13f2f0eff..675a1ed3e 100644
--- a/tests/unittest/ops/test_cross_attention.py
+++ b/tests/unittest/ops/test_cross_attention.py
@@ -33,6 +33,14 @@ def mark_output(y):
 
 
 class crossattentionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_id = 0
+
     def _test_mha(
         self,
         batch_sizes,
@@ -94,7 +102,10 @@ def _test_mha(
         Y = Y + inputs_ait
         mark_output(Y)
         target = detect_target(use_fp16_acc=False)
-        exe_module = compile_model(Y, target, "./tmp", "cross_attn_dynamic")
+        exe_module = compile_model(
+            Y, target, "./tmp", f"cross_attn_dynamic_{self.test_id}"
+        )
+        self.test_id += 1
         for name, weight in params_ait.items():
             exe_module.set_constant_with_tensor(name, weight)
 
@@ -131,8 +142,8 @@ def test_cross_attn(self):
         self._test_mha(
             batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
         )
+        self._test_mha(batch_sizes=[128], seqlen=1, seqlen_kv=4, dim=16, num_heads=2)
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_depthwise_conv3d.py b/tests/unittest/ops/test_depthwise_conv3d.py
index c9d46d943..73cadeec3 100644
--- a/tests/unittest/ops/test_depthwise_conv3d.py
+++ b/tests/unittest/ops/test_depthwise_conv3d.py
@@ -19,21 +19,31 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class DepthwiseConv3dTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+class Conv3dDepthwiseTestCase(unittest.TestCase):
+    def _test_depthwise_conv3d(
+        self,
+        batch=4,
+        copy_op=False,
+        test_name="depthwise_conv3d",
+        dtype="float16",
+    ):
         target = detect_target()
         tt, hh, ww, ci, co, groups = 28, 28, 28, 128, 128, 128
         X = Tensor(
             shape=[IntImm(batch), tt, hh, ww, ci],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[co, 3, 3, 3, 1], dtype="float16", name="input_1", is_input=True
+            shape=[co, 3, 3, 3, 1],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
         )
         OP = ops.depthwise_conv3d(stride=1, pad=1, dilate=1, group=groups)
         if copy_op:
@@ -41,22 +51,41 @@ def _test_fp16(self, batch=4, copy_op=False):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"depthwise_conv3d_{copy_op}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, ci, tt, hh, ww).cuda().half()
-        W_pt = torch.randn(co, 1, 3, 3, 3).cuda().half()
+        X_pt = get_random_torch_tensor([batch, ci, tt, hh, ww], dtype=dtype)
+        W_pt = get_random_torch_tensor([co, 1, 3, 3, 3], dtype=dtype)
         Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, padding=1, groups=groups)
         x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
-        y = torch.empty([batch, tt, hh, ww, co]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
         module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+        y_transpose = y.permute((0, 4, 1, 2, 3))
 
-        Y_pt_transpose = Y_pt.permute(0, 2, 3, 4, 1)
-        self.assertTrue(torch.allclose(Y_pt_transpose, y, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_depthwise_conv3d(
+            test_name="depthwise_conv3d_fp16",
+            dtype="float16",
+        )
+        self._test_depthwise_conv3d(
+            copy_op=True,
+            test_name="depthwise_conv3d_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_fp32(self):
+        self._test_depthwise_conv3d(
+            test_name="depthwise_conv3d_fp32",
+            dtype="float32",
+        )
+        self._test_depthwise_conv3d(
+            copy_op=True,
+            test_name="depthwise_conv3d_fp32",
+            dtype="float32",
+        )
 
     def _test_mvit_shape(
         self,
diff --git a/tests/unittest/ops/test_dual_bmm.py b/tests/unittest/ops/test_dual_bmm.py
new file mode 100644
index 000000000..90292d315
--- /dev/null
+++ b/tests/unittest/ops/test_dual_bmm.py
@@ -0,0 +1,231 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@unittest.skipIf(detect_target()._arch == "75", "DualGemm not supported on sm75.")
+class DUALBMMTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_dual_bmm_rrr_div(
+        self,
+        B=256,
+        M=256,
+        N=512,
+        K=512,
+        broadcast_b1=False,
+        benchmark=False,
+        use_fp16_acc=False,
+        test_name="dual_bmm",
+        dtype="float16",
+    ):
+        B1_shape = [B, K, 1] if broadcast_b1 else [B, K, N]
+        target = detect_target(use_fp16_acc=use_fp16_acc)
+        X = Tensor(
+            shape=[B, M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        B0 = Tensor(
+            shape=[B, K, N],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B1 = Tensor(
+            shape=B1_shape,
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+        OP = ops.dual_bmm_rrr_div()
+        Y = OP(X, B0, B1)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+
+        X_pt = get_random_torch_tensor([B, M, K], dtype=dtype) + 1.0
+        B0_pt = get_random_torch_tensor([B, K, N], dtype=dtype) + 1.0
+        B1_pt = get_random_torch_tensor(B1_shape, dtype=dtype) + 1.0
+
+        def pt_func(X_pt, W_pt, B_pt):
+            Y_pt1 = torch.bmm(X_pt, W_pt)
+            Y_pt2 = torch.bmm(X_pt, B_pt)
+            Y_pt = Y_pt1 / Y_pt2
+            return Y_pt
+
+        Y_pt = pt_func(X_pt, B0_pt, B1_pt)
+
+        inputs = {"input_0": X_pt, "input_1": B0_pt, "input_2": B1_pt}
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors(inputs, [y])
+
+        torch.testing.assert_close(Y_pt, y, atol=1e-2, rtol=1e-2)
+
+        if benchmark:
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark AIT
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
+            )
+            _LOGGER.info(f"[{M}, {N}, {K}] AIT BMMxBMM time: {time_per_iter_ms:.5f}ms")
+            # Benchmark PT
+            from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+            func = pt_func
+            args = (X_pt, B0_pt, B1_pt)
+            duration = benchmark_torch_function(100, func, *args)
+            _LOGGER.info(f"PT BMMxBMM Time: {duration:.5f}ms")
+
+    def test_dual_bmm_rrr_div_fp16(self):
+        self._test_dual_bmm_rrr_div(
+            B=37,
+            M=63,
+            N=64,
+            K=128,
+            broadcast_b1=False,
+            test_name="dual_bmm_rrr_div_fp16",
+            dtype="float16",
+        )
+        self._test_dual_bmm_rrr_div(
+            B=512,
+            M=256,
+            N=512,
+            K=512,
+            broadcast_b1=False,
+            test_name="dual_bmm_rrr_div_fp16",
+            dtype="float16",
+        )
+        self._test_dual_bmm_rrr_div(
+            B=64,
+            M=1024,
+            N=1024,
+            K=2048,
+            broadcast_b1=False,
+            test_name="dual_bmm_rrr_div_fp16",
+            dtype="float16",
+        )
+
+    def test_dual_bmm_rrr_div_broadcast_b1_fp16(self):
+        self._test_dual_bmm_rrr_div(
+            B=37,
+            M=63,
+            N=64,
+            K=128,
+            broadcast_b1=True,
+            test_name="dual_bmm_rrr_div_fp16",
+            dtype="float16",
+        )
+        # self._test_dual_bmm_rrr_div(
+        #     B=512,
+        #     M=256,
+        #     N=512,
+        #     K=512,
+        #     broadcast_b1=True,
+        #     test_name="dual_bmm_rrr_div_fp16",
+        #     dtype="float16",
+        # )
+        # self._test_dual_bmm_rrr_div(
+        #     B=64,
+        #     M=1024,
+        #     N=1024,
+        #     K=2048,
+        #     broadcast_b1=True,
+        #     test_name="dual_bmm_rrr_div_fp16",
+        #     dtype="float16",
+        # )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_bmm_rrr_div_fp32(self):
+        self._test_dual_bmm_rrr_div(
+            B=37,
+            M=63,
+            N=64,
+            K=128,
+            broadcast_b1=False,
+            test_name="dual_bmm_rrr_div_fp32",
+            dtype="float32",
+        )
+        # self._test_dual_bmm_rrr_div(
+        #     B=512,
+        #     M=256,
+        #     N=512,
+        #     K=512,
+        #     broadcast_b1=False,
+        #     test_name="dual_bmm_rrr_div_fp32",
+        #     dtype="float32",
+        # )
+        # self._test_dual_bmm_rrr_div(
+        #     B=64,
+        #     M=1024,
+        #     N=1024,
+        #     K=2048,
+        #     broadcast_b1=False,
+        #     test_name="dual_bmm_rrr_div_fp32",
+        #     dtype="float32",
+        # )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_bmm_rrr_div_broadcast_b1_fp32(self):
+        self._test_dual_bmm_rrr_div(
+            B=37,
+            M=63,
+            N=64,
+            K=128,
+            broadcast_b1=True,
+            test_name="dual_bmm_rrr_div_fp32",
+            dtype="float32",
+        )
+        # self._test_dual_bmm_rrr_div(
+        #     B=512,
+        #     M=256,
+        #     N=512,
+        #     K=512,
+        #     broadcast_b1=True,
+        #     test_name="dual_bmm_rrr_div_fp32",
+        #     dtype="float32",
+        # )
+        # self._test_dual_bmm_rrr_div(
+        #     B=64,
+        #     M=1024,
+        #     N=1024,
+        #     K=2048,
+        #     broadcast_b1=True,
+        #     test_name="dual_bmm_rrr_div_fp32",
+        #     dtype="float32",
+        # )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_dual_gemm.py b/tests/unittest/ops/test_dual_gemm.py
index 28b25bfda..2d6ce76d5 100644
--- a/tests/unittest/ops/test_dual_gemm.py
+++ b/tests/unittest/ops/test_dual_gemm.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import math
 import unittest
 
@@ -21,7 +22,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import nn, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger, shape_utils
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils import shape_utils
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 class NewGELUActivation(torch.nn.Module):
@@ -71,16 +77,46 @@ def mark_output(y):
         y[i]._attrs["is_output"] = True
         y[i]._attrs["name"] = "output_%d" % (i)
         y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
-        print("output_{} shape: {}".format(i, y_shape))
+        print(f"output_{i} shape: {y_shape}")
 
 
 @unittest.skipIf(detect_target()._arch == "75", "DualGemm not supported on sm75.")
 class DUALGEMMTestCase(unittest.TestCase):
-    def _test_dual_gemm(self, M=4096, N=4096, K=8192, fast_gelu=False, benchmark=False):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_dual_gemm(
+        self,
+        M=4096,
+        N=4096,
+        K=8192,
+        fast_gelu=False,
+        benchmark=False,
+        broadcast_b1=False,
+        test_name="dual_gemm",
+        dtype="float16",
+    ):
+        B_shape = [1, K] if broadcast_b1 else [N, K]
         target = detect_target(use_fp16_acc=False)
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N, K], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=B_shape,
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         if fast_gelu:
             OP = ops.dual_gemm_rcr_fast_gelu()
         else:
@@ -88,10 +124,11 @@ def _test_dual_gemm(self, M=4096, N=4096, K=8192, fast_gelu=False, benchmark=Fal
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "dual_gemm")
-        X_pt = torch.randn(M, K).cuda().half() * 0.01
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N, K).cuda().half()
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+        X_pt = get_random_torch_tensor([M, K], dtype=dtype) * 0.01
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor(B_shape, dtype=dtype)
 
         def pt_func(X_pt, W_pt, B_pt):
             Y_pt1 = torch.nn.functional.linear(X_pt, W_pt)
@@ -106,7 +143,7 @@ def pt_func(X_pt, W_pt, B_pt):
         Y_pt = pt_func(X_pt, W_pt, B_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
@@ -121,20 +158,142 @@ def pt_func(X_pt, W_pt, B_pt):
                 [y],
                 count=100,
             )
-            logger.info(__file__, f"AIT GEMMxGEMM time: {time_per_iter_ms:.5f}ms")
+            _LOGGER.info(f"AIT GEMMxGEMM time: {time_per_iter_ms:.5f}ms")
             # Benchmark PT
             from aitemplate.testing.benchmark_pt import benchmark_torch_function
 
             func = pt_func
             args = (X_pt, W_pt, B_pt)
             duration = benchmark_torch_function(100, func, *args)
-            logger.info(__file__, f"PT GEMMxGEMM Time: {duration:.5f}ms")
+            _LOGGER.info(f"PT GEMMxGEMM Time: {duration:.5f}ms")
 
-    def test_dual_gemm(self):
-        for fast_gelu in [True, False]:
-            self._test_dual_gemm(M=128, N=128, K=256, fast_gelu=fast_gelu)
-            self._test_dual_gemm(M=1024, N=1024, K=2048, fast_gelu=fast_gelu)
-            self._test_dual_gemm(M=4096, N=4096, K=8192, fast_gelu=fast_gelu)
+    def test_dual_gemm_silu_fp16(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=False,
+            broadcast_b1=False,
+            test_name="dual_gemm_silu_fp16",
+            dtype="float16",
+        )
+        self._test_dual_gemm(
+            M=1024,
+            N=1024,
+            K=2048,
+            fast_gelu=False,
+            broadcast_b1=False,
+            test_name="dual_gemm_silu_fp16",
+            dtype="float16",
+        )
+        self._test_dual_gemm(
+            M=4096,
+            N=4096,
+            K=8192,
+            fast_gelu=False,
+            broadcast_b1=False,
+            test_name="dual_gemm_silu_fp16",
+            dtype="float16",
+        )
+
+    def test_dual_gemm_silu_broadcast_b1_fp16(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=False,
+            broadcast_b1=True,
+            test_name="dual_gemm_silu_broadcast_b1_fp16",
+            dtype="float16",
+        )
+
+    def test_dual_gemm_fast_gelu_fp16(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=True,
+            broadcast_b1=False,
+            test_name="dual_gemm_fast_gelu_fp16",
+            dtype="float16",
+        )
+        self._test_dual_gemm(
+            M=1024,
+            N=1024,
+            K=2048,
+            fast_gelu=True,
+            broadcast_b1=False,
+            test_name="dual_gemm_fast_gelu_fp16",
+            dtype="float16",
+        )
+        self._test_dual_gemm(
+            M=4096,
+            N=4096,
+            K=8192,
+            fast_gelu=True,
+            broadcast_b1=False,
+            test_name="dual_gemm_fast_gelu_fp16",
+            dtype="float16",
+        )
+
+    def test_dual_gemm_fast_gelu_broadcast_b1_fp16(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=True,
+            broadcast_b1=True,
+            test_name="dual_gemm_fast_gelu_broadcast_b1_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_gemm_silu_fp32(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=False,
+            broadcast_b1=False,
+            test_name="dual_gemm_silu_fp32",
+            dtype="float32",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_gemm_silu_broadcast_b1_fp32(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=False,
+            broadcast_b1=True,
+            test_name="dual_gemm_silu_broadcast_b1_fp32",
+            dtype="float32",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_gemm_fast_gelu_fp32(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=True,
+            broadcast_b1=False,
+            test_name="dual_gemm_fast_gelu_fp32",
+            dtype="float32",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dual_gemm_fast_gelu_broadcast_b1_fp32(self):
+        self._test_dual_gemm(
+            M=128,
+            N=128,
+            K=256,
+            fast_gelu=True,
+            broadcast_b1=True,
+            test_name="dual_gemm_fast_gelu_broadcast_b1_fp32",
+            dtype="float32",
+        )
 
     def _test_t5block(
         self,
@@ -142,9 +301,13 @@ def _test_t5block(
         d_model=1024,
         d_ff=2048,
         use_fp16_acc=False,
+        test_name="t5block",
+        dtype="float16",
     ):
-
-        pt_mod = T5DenseGatedGeluDense(d_model=d_model, d_ff=d_ff).cuda().half()
+        torch_dtype = string_to_torch_dtype(dtype)
+        pt_mod = (
+            T5DenseGatedGeluDense(d_model=d_model, d_ff=d_ff).to(torch_dtype).cuda()
+        )
         pt_mod = pt_mod.eval()
 
         pt_params = dict(pt_mod.named_parameters())
@@ -156,25 +319,32 @@ def _test_t5block(
         ait_mod = nn.T5DenseGatedGeluDense(
             in_channels=d_model,
             out_channels=d_ff,
+            dtype=dtype,
         )
         ait_mod.name_parameter_tensor()
 
         M_dim = shape_utils.gen_int_var_min_max(Ms, name="Mdim")
-        inputs_ait = Tensor([M_dim, d_model], name="input0", is_input=True)
+        inputs_ait = Tensor(
+            [M_dim, d_model],
+            name="input0",
+            is_input=True,
+            dtype=dtype,
+        )
         Y = ait_mod(inputs_ait)
         mark_output(Y)
         target = detect_target(use_fp16_acc=False)
-        exe_module = compile_model(Y, target, "./tmp", "t5block")
+        exe_module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
         for name, weight in params_ait.items():
             exe_module.set_constant_with_tensor(name, weight)
 
         for m in Ms:
-            input_pt = torch.randn([m, d_model]).cuda().half()
+            input_pt = get_random_torch_tensor([m, d_model], dtype)
             pt_ys = pt_mod(input_pt)
             print("pt output:", pt_ys.shape)
 
             inputs = [input_pt]
-            ys = [torch.empty(pt_ys.shape).cuda().half()]
+            ys = [torch.empty_like(pt_ys)]
             exe_module.run_with_tensors(inputs, ys)
             eps = 1e-2
             np.testing.assert_allclose(
@@ -183,11 +353,24 @@ def _test_t5block(
                 atol=eps,
                 rtol=eps,
             )
-            print("M = {} t5 verification pass".format(m))
+            print(f"M = {m} t5 verification pass")
+
+    def test_t5block_fp16(self):
+        self._test_t5block(
+            Ms=[1024, 2048, 4096],
+            test_name="t5block_fp16",
+            dtype="float16",
+        )
 
-    def test_t5block(self):
-        self._test_t5block(Ms=[1024, 2048, 4096])
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_t5block_fp32(self):
+        self._test_t5block(
+            Ms=[1024, 2048, 4096],
+            test_name="t5block_fp32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_dynamic_conv.py b/tests/unittest/ops/test_dynamic_conv.py
index b42378f4f..92dc686f1 100644
--- a/tests/unittest/ops/test_dynamic_conv.py
+++ b/tests/unittest/ops/test_dynamic_conv.py
@@ -21,20 +21,30 @@
 from aitemplate.compiler.base import DynamicProfileStrategy
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class ConvTestCase(unittest.TestCase):
-    def test_fp16(self):
+class ConvDynamicTestCase(unittest.TestCase):
+    def _test_conv_dynamic(
+        self,
+        test_name="conv_dynamic",
+        dtype="float16",
+    ):
         target = detect_target()
         batch_size = [2, 32]
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), 24, 24, 4],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[36, 3, 3, 4], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(
+            shape=[36, 3, 3, 4],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
         OP = ops.conv2d(stride=2, pad=1, dilate=1)
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
@@ -43,21 +53,38 @@ def test_fp16(self):
             Y,
             target,
             "./tmp",
-            "dynamic_conv",
+            test_name,
             dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
         )
         for batch in batch_size:
             print("Test batch: %d" % batch)
-            X_pt = torch.randn(batch, 4, 24, 24).cuda().half()
-            W_pt = torch.randn(36, 4, 3, 3).cuda().half()
+            X_pt = get_random_torch_tensor([batch, 4, 24, 24], dtype=dtype)
+            W_pt = get_random_torch_tensor([36, 4, 3, 3], dtype=dtype)
             Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, stride=2, padding=1)
             x = X_pt.permute((0, 2, 3, 1)).contiguous()
             w = W_pt.permute((0, 2, 3, 1)).contiguous()
-            y = torch.empty([batch, 12, 12, 36]).cuda().half()
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
             module.run_with_tensors({"input_0": x, "input_1": w}, [y])
             y_transpose = y.permute((0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_fp16(self):
+        self._test_conv_dynamic(
+            test_name="conv_dynamic_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_conv_dynamic(
+            test_name="conv_dynamic_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index 1960b8483..ce2962dd5 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -27,6 +27,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 try:
     from torchvision.ops import boxes as box_ops
@@ -61,7 +62,7 @@ def mark_output(y):
         print("output_{} shape: {}".format(i, y_shape))
 
 
-def create_tensors(N):
+def create_tensors(N, dtype="float16"):
     dets = np.array(
         [
             [1.5862e02, 1.6100e02, 4.2800e02, 3.9400e02, 7.7100e-01],
@@ -95,7 +96,7 @@ def create_tensors(N):
             [1.4962e02, 1.6250e02, 4.3650e02, 3.9800e02, 7.9492e-01],
             [1.4850e02, 1.5975e02, 4.3250e02, 3.9275e02, 2.7051e-01],
         ],
-        dtype="float16",
+        dtype=dtype,
     )
     return dets[:N, :4], dets[:N, -1]
 
@@ -108,13 +109,16 @@ def op_gflop(bz, N, max_out):
 
 @skipIfNoTorchVision
 class nmsTestCase(unittest.TestCase):
-    def _create_tensors(self, N, rand=False):
+    def _create_tensors(self, N, rand=False, dtype="float16"):
         if rand:
             boxes = random_boxes(N, 200)
             scores = torch.rand(N)
-            return boxes.numpy().astype("float16"), scores.numpy().astype("float16")
+            return (
+                boxes.numpy().astype(dtype),
+                scores.numpy().astype(dtype),
+            )
         else:
-            boxes, scores = create_tensors(N)
+            boxes, scores = create_tensors(N, dtype=dtype)
             return boxes, scores
 
     def _test_nms(
@@ -133,17 +137,18 @@ def _test_nms(
         test_name="efficient_nms",
         benchmark_shapes=False,
         copy_op=False,
+        dtype="float16",
     ):
         X1 = Tensor(
             shape=[batch_size, N, num_classes, 4],
-            dtype="float16",
+            dtype=dtype,
             name="boxes",
             is_input=True,
         )
 
         X2 = Tensor(
             shape=[batch_size, N, num_classes],
-            dtype="float16",
+            dtype=dtype,
             name="scores",
             is_input=True,
         )
@@ -159,24 +164,24 @@ def _test_nms(
         Y = OP(X1, X2)
         mark_output(Y)
 
-        boxes, scores = self._create_tensors(N, rand=rand_box)
-        idxs = torch.randint(0, num_classes, (N,)).cuda().half()
-        iou = iouThreshold
-        boxes_pt = torch.tensor(boxes).cuda().half()
+        torch_dtype = string_to_torch_dtype(dtype)
+        boxes, scores = self._create_tensors(N, rand=rand_box, dtype=dtype)
+        idxs = torch.randint(0, num_classes, (N,)).cuda().to(dtype=torch_dtype)
+        boxes_pt = torch.tensor(boxes).cuda().to(dtype=torch_dtype)
         kept = nonempty(boxes_pt, threshold=minBoxSize)
-        score_pt = torch.tensor(scores).cuda().half()
+        score_pt = torch.tensor(scores).cuda().to(dtype=torch_dtype)
         score_pt[kept] = -1
 
         if bench_pt:
             func = box_ops.batched_nms
-            args = (boxes_pt, score_pt, idxs, iou)
+            args = (boxes_pt, score_pt, idxs, iouThreshold)
             batch_size = 1
             duration = benchmark_torch_function(100, func, *args)
             print(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
-        keep = box_ops.batched_nms(boxes_pt, score_pt, idxs, iou)
+        keep = box_ops.batched_nms(boxes_pt, score_pt, idxs, iouThreshold)
 
         if keep.shape[0] >= nmsMaxOut:
             keep = keep[:nmsMaxOut]
@@ -186,6 +191,7 @@ def _test_nms(
             ref_box[
                 : keep.shape[0],
             ] = boxes_pt[keep].cpu()
+        ref_box = ref_box.cuda().to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 1, 4)).copy()
         x_scores = scores.reshape((1, N, 1)).copy()
@@ -213,9 +219,9 @@ def _test_nms(
         inputs = {"boxes": x_reshaped, "scores": scores_reshaped}
 
         y0 = torch.empty([batch_size, 1]).cuda().to(torch.int64)
-        y1 = torch.empty([batch_size, nmsMaxOut, 4]).cuda().half()
-        y2 = torch.empty([batch_size, nmsMaxOut]).cuda().half()
-        y3 = torch.empty([batch_size, nmsMaxOut]).cuda().to(torch.int64)
+        y1 = torch.empty([batch_size, nmsMaxOut, 4]).cuda().to(dtype=torch_dtype)
+        y2 = torch.empty([batch_size, nmsMaxOut]).cuda().to(dtype=torch_dtype)
+        y3 = torch.empty([batch_size, nmsMaxOut]).cuda().to(dtype=torch.int64)
         outputs = {"output_0": y0, "output_1": y1, "output_2": y2, "output_3": y3}
         module.run_with_tensors(inputs, outputs)
 
@@ -235,11 +241,9 @@ def _test_nms(
                     torch.allclose(y[idx1, :], y[idx2, :], atol=1e-2, rtol=1e-2)
                 )
         else:
-            self.assertTrue(
-                torch.allclose(y1[0, :], ref_box.cuda().half(), atol=1e-2, rtol=1e-2)
-            )
+            self.assertTrue(torch.allclose(y1[0, :], ref_box, atol=1e-2, rtol=1e-2))
 
-    def test_nms(self):
+    def test_nms_fp16(self):
         # self._test_nms(
         #     N=15000,
         #     preNmsTop=6000,
@@ -273,7 +277,36 @@ def test_nms(self):
             batch_size=2,
             num_classes=4,
             rand_box=False,
-            test_name="nms2",
+            test_name="nms2_fp16",
+            dtype="float16",
+        )
+        self._test_nms(
+            N=30,
+            preNmsTop=30,
+            nmsMaxOut=10,
+            iouThreshold=0.5,
+            minBoxSize=0,
+            batch_size=2,
+            num_classes=4,
+            rand_box=False,
+            test_name="nms2_copy_op_fp16",
+            copy_op=True,
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float32 not supported in ROCm")
+    def test_nms_fp32(self):
+        self._test_nms(
+            N=30,
+            preNmsTop=30,
+            nmsMaxOut=10,
+            iouThreshold=0.5,
+            minBoxSize=0,
+            batch_size=2,
+            num_classes=4,
+            rand_box=False,
+            test_name="nms2_fp32",
+            dtype="float32",
         )
         self._test_nms(
             N=30,
@@ -284,8 +317,9 @@ def test_nms(self):
             batch_size=2,
             num_classes=4,
             rand_box=False,
-            test_name="nms2_copy_op",
+            test_name="nms2_copy_op_fp32",
             copy_op=True,
+            dtype="float32",
         )
 
     @unittest.skip("manually enable it for benchmarking")
diff --git a/tests/unittest/ops/test_expand.py b/tests/unittest/ops/test_expand.py
index d259ee961..e17500058 100644
--- a/tests/unittest/ops/test_expand.py
+++ b/tests/unittest/ops/test_expand.py
@@ -15,13 +15,12 @@
 import unittest
 
 import torch
-from aitemplate.compiler import compile_model, ops
 
+from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import IntVar, Tensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import graph_has_op
+from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -40,44 +39,98 @@ def test_expand_fails_non_singleton_dim(self):
         expand_shape = [20]
         self.assertRaises(ValueError, ops.expand().__call__, x, expand_shape)
 
-    def test_no_op_expands_removed_static_shapes(self):
-        x = Tensor([1, 2, 3], name="input_0", is_input=True)
+    def _test_no_op_expands_removed_static_shapes(
+        self,
+        test_name="no_op_expands_removed_static_shapes",
+        dtype="float16",
+    ):
+        x = Tensor(
+            [1, 2, 3],
+            name="input_0",
+            is_input=True,
+            dtype=dtype,
+        )
         y = ops.expand()(x, [1, -1, -1])
         z = ops.elementwise(FuncEnum.MUL)(y, y)
         z._attrs["is_output"] = True
         z._attrs["name"] = "output_0"
 
-        x_pt = torch.randn((1, 2, 3)).half().cuda()
+        x_pt = get_random_torch_tensor([1, 2, 3], dtype=dtype)
         z_pt = x_pt * x_pt
         z_ait = torch.empty_like(z_pt)
-        with compile_model(
-            z, detect_target(), "./tmp", "test_no_op_expands_removed_static_shapes"
-        ) as module:
+        with compile_model(z, detect_target(), "./tmp", test_name) as module:
             module.run_with_tensors({"input_0": x_pt}, {"output_0": z_ait})
             self.assertFalse(graph_has_op(module.debug_sorted_graph, "expand"))
             self.assertTrue(torch.equal(z_ait, z_pt))
 
-    def test_no_op_expands_removed_dynamic_shapes(self):
+    def test_no_op_expands_removed_static_shapes_fp16(self):
+        self._test_no_op_expands_removed_static_shapes(
+            test_name="no_op_expands_removed_static_shapes_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_no_op_expands_removed_static_shapes_fp32(self):
+        self._test_no_op_expands_removed_static_shapes(
+            test_name="no_op_expands_removed_static_shapes_fp32",
+            dtype="float32",
+        )
+
+    def _test_no_op_expands_removed_dynamic_shapes(
+        self,
+        test_name="no_op_expands_removed_dynamic_shapes",
+        dtype="float16",
+    ):
         dynamic_dim = IntVar([1, 5], name="dynamic_dim")
-        x = Tensor([1, dynamic_dim, 3], name="input_0", is_input=True)
+        x = Tensor(
+            [1, dynamic_dim, 3],
+            name="input_0",
+            is_input=True,
+            dtype=dtype,
+        )
         y = ops.expand()(x, [IntVar([1, 1]), -1, -1])
         z = ops.elementwise(FuncEnum.MUL)(y, y)
         z._attrs["is_output"] = True
         z._attrs["name"] = "output_0"
 
-        x_pt = torch.randn((1, 2, 3)).half().cuda()
+        x_pt = get_random_torch_tensor([1, 2, 3], dtype=dtype)
         z_pt = x_pt * x_pt
         z_ait = torch.empty_like(z_pt)
-        with compile_model(
-            z, detect_target(), "./tmp", "test_no_op_expands_removed_dynamic_shapes"
-        ) as module:
+        with compile_model(z, detect_target(), "./tmp", test_name) as module:
             module.run_with_tensors({"input_0": x_pt}, {"output_0": z_ait})
             self.assertFalse(graph_has_op(module.debug_sorted_graph, "expand"))
             self.assertTrue(torch.equal(z_ait, z_pt))
 
-    def test_no_op_expands_removed_size_op(self):
-        x = Tensor([1, 2, 3], name="input_0", is_input=True)
-        y = Tensor([IntVar([1, 1]), 2, 3], name="input_1", is_input=True)
+    def test_no_op_expands_removed_dynamic_shapes_fp16(self):
+        self._test_no_op_expands_removed_dynamic_shapes(
+            test_name="no_op_expands_removed_dynamic_shapes_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_no_op_expands_removed_dynamic_shapes_fp32(self):
+        self._test_no_op_expands_removed_dynamic_shapes(
+            test_name="no_op_expands_removed_dynamic_shapes_fp32",
+            dtype="float32",
+        )
+
+    def _test_no_op_expands_removed_size_op(
+        self,
+        test_name="no_op_expands_removed_size_op",
+        dtype="float16",
+    ):
+        x = Tensor(
+            [1, 2, 3],
+            name="input_0",
+            is_input=True,
+            dtype=dtype,
+        )
+        y = Tensor(
+            [IntVar([1, 1]), 2, 3],
+            name="input_1",
+            is_input=True,
+            dtype=dtype,
+        )
         x_size = ops.size()(x, 0)
         y_size = ops.size()(y, 0)
         x_expand = ops.expand()(x, [x_size, -1, -1])
@@ -86,19 +139,31 @@ def test_no_op_expands_removed_size_op(self):
         z._attrs["is_output"] = True
         z._attrs["name"] = "output_0"
 
-        x_pt = torch.randn((1, 2, 3)).half().cuda()
-        y_pt = torch.randn((1, 2, 3)).half().cuda()
+        x_pt = get_random_torch_tensor([1, 2, 3], dtype=dtype)
+        y_pt = get_random_torch_tensor([1, 2, 3], dtype=dtype)
         z_pt = x_pt * y_pt
         z_ait = torch.empty_like(z_pt)
-        with compile_model(
-            z, detect_target(), "./tmp", "test_no_op_expands_removed_dynamic_shapes"
-        ) as module:
+        with compile_model(z, detect_target(), "./tmp", test_name) as module:
             module.run_with_tensors(
                 {"input_0": x_pt, "input_1": y_pt}, {"output_0": z_ait}
             )
             self.assertFalse(graph_has_op(module.debug_sorted_graph, "expand"))
             self.assertTrue(torch.equal(z_ait, z_pt))
 
+    def test_no_op_expands_removed_size_op_fp16(self):
+        self._test_no_op_expands_removed_size_op(
+            test_name="no_op_expands_removed_size_op_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_no_op_expands_removed_size_op_fp32(self):
+        self._test_no_op_expands_removed_size_op(
+            test_name="no_op_expands_removed_size_op_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_flatten.py b/tests/unittest/ops/test_flatten.py
index 2a7057d3e..cbf1920f2 100644
--- a/tests/unittest/ops/test_flatten.py
+++ b/tests/unittest/ops/test_flatten.py
@@ -16,22 +16,28 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model
 from aitemplate.compiler.base import IntImm, IntVar
-
 from aitemplate.frontend import nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FlattenTestCase(unittest.TestCase):
-    def _test_fp16_single_op(
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_single_op(
         self,
         X_shape,
         start_dim=0,
         end_dim=-1,
         test_name="flatten",
         check_name_retention=False,
+        dtype="float16",
     ):
         target = detect_target()
         dynamic_dim_names = [
@@ -41,7 +47,7 @@ def _test_fp16_single_op(
         X_shape = [dim if isinstance(dim, IntVar) else IntImm(dim) for dim in X_shape]
         X = Tensor(
             shape=X_shape,
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -52,13 +58,14 @@ def _test_fp16_single_op(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         x_shape_values = [var._attrs["values"] for var in X_shape]
         x_shapes = itertools.product(*x_shape_values)
 
         for x_shape in x_shapes:
-            X_pt = torch.randn(x_shape).cuda().half()
+            X_pt = get_random_torch_tensor(x_shape, dtype=dtype)
             Y_pt = torch.flatten(X_pt, start_dim, end_dim)
             y = torch.empty_like(Y_pt)
             in_x = X_pt.clone()
@@ -73,63 +80,102 @@ def _test_fp16_single_op(
                     )
                 )
 
-    def test_flatten(self):
-        self._test_fp16_single_op(
-            X_shape=(IntVar(values=[1, 3]), 16, 32, 64), test_name="flatten0"
+    def test_flatten_fp16(self):
+        self._test_single_op(
+            X_shape=(IntVar(values=[1, 3]), 16, 32, 64),
+            test_name="flatten_fp16",
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(IntVar(values=[2, 5]), 16, 32, 64),
             start_dim=0,
             end_dim=1,
-            test_name="flatten1",
+            test_name="flatten_fp16",
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(IntVar(values=[2, 5]), 16, 32, 64),
             start_dim=0,
             end_dim=0,
-            test_name="flatten2",
+            test_name="flatten_fp16",
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(IntVar(values=[3, 4]), 16, 32, 64),
             start_dim=1,
             end_dim=-2,
-            test_name="flatten3",
+            test_name="flatten_fp16",
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(IntVar(values=[3, 4], name="input_batch"), 16, 32, 2, 64),
             start_dim=1,
             end_dim=-2,
-            test_name="flatten_name",
+            test_name="flatten_fp16_name",
             check_name_retention=True,
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(16, 32, IntVar(values=[3, 4], name="input_batch"), 2, 64),
             start_dim=1,
             end_dim=-1,
-            test_name="flatten_dynamic_nonbatch",
+            test_name="flatten_fp16_dynamic_nonbatch",
+            dtype="float16",
         )
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(32, 16, 4, IntVar(values=[3, 4], name="input_batch"), 16),
             start_dim=0,
             end_dim=2,
-            test_name="flatten_dynamic_nonbatch_name",
+            test_name="flatten_fp16_dynamic_nonbatch_name",
             check_name_retention=True,
+            dtype="float16",
         )
-
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(32, 16, 4, 3, 16),
             start_dim=0,
             end_dim=2,
-            test_name="flatten_static_1",
+            test_name="flatten_fp16_static",
+            dtype="float16",
         )
-
-        self._test_fp16_single_op(
+        self._test_single_op(
             X_shape=(32, 3, 16, 4, 16),
             start_dim=0,
             end_dim=-1,
-            test_name="flatten_static_2",
+            test_name="flatten_fp16_static",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_flatten_fp32(self):
+        self._test_single_op(
+            X_shape=(IntVar(values=[1, 3]), 16, 32, 64),
+            test_name="flatten_fp32",
+            dtype="float32",
+        )
+        self._test_single_op(
+            X_shape=(IntVar(values=[3, 4], name="input_batch"), 16, 32, 2, 64),
+            start_dim=1,
+            end_dim=-2,
+            test_name="flatten_fp32_name",
+            check_name_retention=True,
+            dtype="float32",
+        )
+        self._test_single_op(
+            X_shape=(16, 32, IntVar(values=[3, 4], name="input_batch"), 2, 64),
+            start_dim=1,
+            end_dim=-1,
+            test_name="flatten_fp32_dynamic_nonbatch",
+            dtype="float32",
+        )
+        self._test_single_op(
+            X_shape=(32, 16, 4, 3, 16),
+            start_dim=0,
+            end_dim=2,
+            test_name="flatten_fp32_static",
+            dtype="float32",
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_fpn_roi_align.py b/tests/unittest/ops/test_fpn_roi_align.py
index 22e6ab870..6a6461fbd 100644
--- a/tests/unittest/ops/test_fpn_roi_align.py
+++ b/tests/unittest/ops/test_fpn_roi_align.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 try:
     from detectron2.modeling.poolers import ROIPooler
@@ -30,15 +31,20 @@
 skipIfNoD2 = skipIf(not HAS_D2, "no detectron2")
 
 
-def random_boxes(num_boxes, max_coord=512):
+def random_boxes(num_boxes, max_coord=512, dtype="float16"):
     boxes = torch.rand(num_boxes, 4) * (max_coord * 0.5)
     boxes.clamp_(min=1.0)
     boxes[:, 2:] += boxes[:, :2]
-    return boxes.cuda().half()
+    torch_dtype = string_to_torch_dtype(dtype)
+    return boxes.cuda().to(dtype=torch_dtype)
 
 
 @skipIfNoD2
 class RoiAlignTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        torch.manual_seed(0)
+
     def _test_fpn_roi_align(
         self,
         boxes,
@@ -54,24 +60,26 @@ def _test_fpn_roi_align(
         rebuild=True,
         bench=False,
         copy_op=False,
+        dtype="float16",
+        eps=1e-2,
     ):
         HH, WW = im_shape
         target = detect_target()
 
         P2 = Tensor(
-            shape=[1, HH // 4, WW // 4, CC], dtype="float16", name="P2", is_input=True
+            shape=[1, HH // 4, WW // 4, CC], dtype=dtype, name="P2", is_input=True
         )
 
         P3 = Tensor(
-            shape=[1, HH // 8, WW // 8, CC], dtype="float16", name="P3", is_input=True
+            shape=[1, HH // 8, WW // 8, CC], dtype=dtype, name="P3", is_input=True
         )
         P4 = Tensor(
-            shape=[1, HH // 16, WW // 16, CC], dtype="float16", name="P4", is_input=True
+            shape=[1, HH // 16, WW // 16, CC], dtype=dtype, name="P4", is_input=True
         )
         P5 = Tensor(
-            shape=[1, HH // 32, WW // 32, CC], dtype="float16", name="P5", is_input=True
+            shape=[1, HH // 32, WW // 32, CC], dtype=dtype, name="P5", is_input=True
         )
-        R = Tensor(shape=[num_rois, 5], dtype="float16", name="ROI", is_input=True)
+        R = Tensor(shape=[num_rois, 5], dtype=dtype, name="ROI", is_input=True)
 
         OP = ops.multi_level_roi_align(
             num_rois=num_rois,
@@ -131,16 +139,15 @@ def fpn_roialign_pt(boxes, features, device="cuda"):
 
         rois = torch.zeros(num_rois, 5)
         rois[:, 1:] = boxes
-        rois = rois.cuda().half()
-        X_p2 = features[0].half()
-        X_p3 = features[1].half()
-        X_p4 = features[2].half()
-        X_p5 = features[3].half()
+        rois = rois.cuda()
+
+        torch_dtype = string_to_torch_dtype(dtype)
+        rois = rois.to(dtype=torch_dtype)
+        features = [f.to(dtype=torch_dtype) for f in features]
 
-        x_p2 = X_p2.permute((0, 2, 3, 1)).contiguous()
-        x_p3 = X_p3.permute((0, 2, 3, 1)).contiguous()
-        x_p4 = X_p4.permute((0, 2, 3, 1)).contiguous()
-        x_p5 = X_p5.permute((0, 2, 3, 1)).contiguous()
+        x_p2, x_p3, x_p4, x_p5 = [
+            f.permute((0, 2, 3, 1)).contiguous() for f in features
+        ]
 
         inputs = {
             "P2": x_p2,
@@ -149,13 +156,14 @@ def fpn_roialign_pt(boxes, features, device="cuda"):
             "P5": x_p5,
             "ROI": rois,
         }
-        y = torch.empty([num_rois, pooled_size, pooled_size, CC]).cuda().half()
+        y = torch.empty_like(y_pt).permute((0, 2, 3, 1)).contiguous()
+        y = y.to(dtype=torch_dtype)
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
-        eps = 1e-2
-        self.assertTrue(torch.allclose(y_pt.half(), y_transpose, atol=eps, rtol=eps))
+        y_transpose = y_transpose.to(dtype=y_pt.dtype)
+        self.assertTrue(torch.allclose(y_pt, y_transpose, atol=eps, rtol=eps))
 
-    def test_fpn_roi_align(self):
+    def _runner(self, dtype="float16", eps=1e-2):
         N, C, H, W = 1, 16, 512, 512
         std = 11
         mean = 0
@@ -172,21 +180,7 @@ def test_fpn_roi_align(self):
             feature5.cuda(),
         ]
 
-        boxes = torch.tensor(
-            [
-                [100.0, 120.0, 152.0, 152.0],
-                [2.0, 2.0, 52.0, 52.0],
-                [1.0, 1.0, 100.0, 100.0],
-                [110.0, 110.0, 300.0, 300.0],
-                [1.0, 1.0, 150.0, 150.0],
-                [10.0, 10.0, 300.0, 300.0],
-                [10.0, 10.0, 400.0, 400.0],
-                [110.0, 110.0, 400.0, 400.0],
-                [110.0, 110.0, 350.0, 350.0],
-                [10.0, 10.0, 510.0, 510.0],
-            ]
-        ).cuda()
-        boxes = random_boxes(100)
+        boxes = random_boxes(100, dtype=dtype)
         self._test_fpn_roi_align(
             boxes,
             features,
@@ -195,7 +189,9 @@ def test_fpn_roi_align(self):
             im_shape=(H, W),
             pooled_size=7,
             rebuild=1,
-            test_name="fpn_roi_align",
+            test_name=f"fpn_roi_align_{dtype}",
+            dtype=dtype,
+            eps=eps,
         )
         self._test_fpn_roi_align(
             boxes,
@@ -205,11 +201,18 @@ def test_fpn_roi_align(self):
             im_shape=(H, W),
             pooled_size=7,
             rebuild=1,
-            test_name="fpn_roi_align_copy_op",
+            test_name=f"fpn_roi_align_copy_op_{dtype}",
             copy_op=True,
+            dtype=dtype,
+            eps=eps,
         )
 
+    def test_fpn_roi_align_fp16(self):
+        self._runner(dtype="float16", eps=1e-1)
+
+    def test_fpn_roi_align_fp32(self):
+        self._runner(dtype="float32", eps=1e-2)
+
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index f611ecf1b..1ac2fbe90 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -15,22 +15,31 @@
 """
 Unittests for fused_elementwise Operator.
 """
-import math
+
 import unittest
 from typing import List
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops, transform
 from aitemplate.compiler.base import IntImm
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.transform.fuse_ops import _get_inputs_outputs
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    get_torch_full_tensor,
+)
 from aitemplate.utils import shape_utils
 
 ait_dtype_to_pytorch = {"float16": torch.float16}
+if detect_target().name() != "rocm":
+    ait_dtype_to_pytorch["float32"] = torch.float32
+    if int(detect_target()._arch) >= 80:
+        ait_dtype_to_pytorch["bfloat16"] = torch.bfloat16
 
 
 class FusedElementwiseTestCase(unittest.TestCase):
@@ -65,8 +74,17 @@ def _test_fused_elementwise_constructor(self, ait_dtype):
         transform.name_graph(graph)
         transform.mark_param_tensor(graph)
         transform.refine_graph(graph)
-
-        fused_op = ops.fused_elementwise([op1, op2])
+        inputs, outputs, external_inputs, external_outputs = _get_inputs_outputs(
+            {op1, op2}, {op1, op2}
+        )
+        for tensor in inputs | outputs:
+            tensor._attrs["src_ops"] = tensor._attrs["src_ops"] - {op1, op2}
+            tensor._attrs["dst_ops"] = tensor._attrs["dst_ops"] - {op1, op2}
+        fused_op = ops.fused_elementwise(
+            [op1, op2],
+            external_inputs,
+            external_outputs,
+        )
         fused_op._attrs["name"] = "fused_elementwise0"
 
         self.assertEqual(fused_op._attrs["inputs"], [X1])
@@ -166,7 +184,6 @@ def test_fused_elementwise_e2e(self):
     def _test_fused_elementwise_kernel1(self, ait_dtype):
         BATCH_SIZE = 1024
         M = 1496
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(BATCH_SIZE), IntImm(2), IntImm(M)],
             dtype=ait_dtype,
@@ -177,7 +194,7 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
             shape=[],
             dtype=ait_dtype,
             name="constant_number",
-            value=1.0,
+            value=2.0,
         )
         X3 = Tensor(
             shape=[IntImm(2), IntImm(M)],
@@ -199,14 +216,14 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
             X9, target, "./tmp", f"fused_elementwise_kernel1_{ait_dtype}"
         )
 
-        x1_pt = torch.randn(BATCH_SIZE, 2, M).cuda().to(dtype=torch_dtype)
-        x3_pt = torch.randn(2, M).cuda().to(dtype=torch_dtype)
-        x9_pt = torch.sign(x1_pt) * torch.log1p(torch.abs(x1_pt)) * x3_pt
+        x1_pt = get_random_torch_tensor((BATCH_SIZE, 2, M), ait_dtype)
+        x3_pt = get_random_torch_tensor((2, M), ait_dtype)
+        x9_pt = torch.sign(x1_pt) * torch.log1p(torch.abs(x1_pt) + 1) * x3_pt
 
         inputs = {"input0": x1_pt, "constant_matrix": x3_pt}
-        x9 = torch.empty([BATCH_SIZE, 2, M]).cuda().to(dtype=torch_dtype)
+        x9 = get_torch_empty_tensor([BATCH_SIZE, 2, M], ait_dtype)
         module.run_with_tensors(inputs, [x9])
-        self.assertTrue(torch.allclose(x9, x9_pt, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(x9, x9_pt, atol=1e-2, rtol=1e-2)
 
     def test_fused_elementwise_kernel1(self):
         for ait_dtype in ait_dtype_to_pytorch.keys():
@@ -227,12 +244,17 @@ def _test_sigmoid(self, input_size, test_name, ait_dtype):
         target = detect_target()
         module = compile_model(X2, target, "./tmp", test_name)
 
-        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x1_pt = (
+            (torch.rand(input_size, device="cuda", dtype=torch_dtype) - 0.5) * 2.0
+        ) * torch.finfo(torch_dtype).max
         x2_pt = torch.sigmoid(x1_pt)
 
-        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
+        x2 = torch.empty_like(x2_pt)
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        # sanity checks
+        self.assertEqual(torch.sum(x2 < 0), 0)
+        self.assertEqual(torch.sum(x2 > 1), 0)
 
     def test_sigmoid(self):
         for ait_dtype in ait_dtype_to_pytorch.keys():
@@ -363,7 +385,6 @@ def _test_min_max(
         ait_dtype,
     ) -> None:
         assert len(input_size) == 2
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X0 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
             dtype=ait_dtype,
@@ -387,8 +408,8 @@ def _test_min_max(
         target = detect_target()
         module = compile_model(result, target, "./tmp", test_name)
 
-        x0_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
-        x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x0_pt = get_random_torch_tensor(input_size, ait_dtype)
+        x1_pt = get_random_torch_tensor(input_size, ait_dtype)
         if add_nans:
             x1_pt[0].fill_(float("nan"))
 
@@ -396,19 +417,17 @@ def _test_min_max(
             x2_pt = torch.min(x0_pt, x1_pt)
         else:
             x2_pt = torch.max(x0_pt, x1_pt)
-        x2_np = x2_pt.cpu().numpy()
 
         inputs = {"input0": x0_pt, "input1": x1_pt}
-        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
+        x2 = get_torch_empty_tensor(input_size, ait_dtype)
         module.run_with_tensors(inputs, [x2])
-        x2 = x2.cpu().numpy()
 
         if add_nans:
-            nans = np.full(x2_np[0].shape, np.nan)
-            np.testing.assert_allclose(nans, x2_np[0], equal_nan=True)
-            np.testing.assert_allclose(nans, x2[0], equal_nan=True)
+            nans = get_torch_full_tensor(x2_pt[0].shape, float("nan"), ait_dtype)
+            torch.testing.assert_close(nans, x2_pt[0], equal_nan=True)
+            torch.testing.assert_close(nans, x2[0], equal_nan=True)
 
-        np.testing.assert_allclose(x2, x2_np, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
 
     def test_min(self):
         for ait_dtype in ait_dtype_to_pytorch.keys():
@@ -452,10 +471,10 @@ def _test_clamp(
         test_name: str,
         ait_dtype,
     ) -> None:
-        assert len(input_size) == 2
+        assert len(input_size) == 2 or len(input_size) == 0
         torch_dtype = ait_dtype_to_pytorch[ait_dtype]
         X0 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])] if input_size else [],
             dtype=ait_dtype,
             name="input0",
             is_input=True,
@@ -482,6 +501,7 @@ def test_clamp(self):
             self._test_clamp([128, 46], None, 1, f"clamp_1_{ait_dtype}", ait_dtype)
             self._test_clamp([56, 265], -1, None, f"clamp_2_{ait_dtype}", ait_dtype)
             self._test_clamp([17, 123], 1, -1, f"clamp_3_{ait_dtype}", ait_dtype)
+            self._test_clamp([], 1, -1, f"clamp_4_{ait_dtype}", ait_dtype)
 
     def _test_operator_overload(self, ait_dtype):
         input_size = [4, 2]
@@ -526,7 +546,7 @@ def _test_operator_overload_with_constant_number(self, ait_dtype):
             name="input0",
             is_input=True,
         )
-        OUTPUT = 10 / ops.tanh(X1 + 5) - ops.cos(10)
+        OUTPUT = 10 / ops.tanh(X1 + 5)
         OUTPUT._attrs["is_output"] = True
         OUTPUT._attrs["name"] = "output"
 
@@ -534,7 +554,7 @@ def _test_operator_overload_with_constant_number(self, ait_dtype):
         module = compile_model(OUTPUT, target, "./tmp", f"test_op_overload_{ait_dtype}")
 
         x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
-        output_pt = 10 / torch.tanh(x1_pt + 5) - math.cos(10)
+        output_pt = 10 / torch.tanh(x1_pt + 5)
         output = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt], [output])
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
diff --git a/tests/unittest/ops/test_fused_elementwise_broadcast.py b/tests/unittest/ops/test_fused_elementwise_broadcast.py
index 5de8ea4b6..b8784b9e0 100644
--- a/tests/unittest/ops/test_fused_elementwise_broadcast.py
+++ b/tests/unittest/ops/test_fused_elementwise_broadcast.py
@@ -25,6 +25,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import graph_utils, shape_utils
 
 
@@ -38,6 +39,7 @@ def _test_different_dim(
         expected_read_t,
         expected_op_t,
         expected_data_t,
+        dtype="float16",
     ):
         """
         Tests tanh(A(B, M, K) + B(M, K)).
@@ -49,13 +51,13 @@ def _test_different_dim(
 
         X1 = Tensor(
             shape=[batch_dim, m_dim, k_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[m_dim, k_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
@@ -66,12 +68,8 @@ def _test_different_dim(
         self.assertEqual(X4._attrs["shape"], [batch_dim, m_dim, k_dim])
 
         target = detect_target()
-        module = compile_model(
-            X4,
-            target,
-            "./tmp",
-            "fused_elementwise_different_dims_{}".format(test_name),
-        )
+        module = compile_model(X4, target, "./tmp", test_name)
+
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
@@ -81,59 +79,117 @@ def _test_different_dim(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         for batch_size, m, k in itertools.product(batch_sizes, ms, ks):
-            x1_pt = torch.randn(batch_size, m, k).cuda().half()
-            x2_pt = torch.randn(m, k).cuda().half()
+            x1_pt = get_random_torch_tensor([batch_size, m, k], dtype=dtype)
+            x2_pt = get_random_torch_tensor([m, k], dtype=dtype)
             x4_pt = torch.tanh(x1_pt + x2_pt)
             inputs = {"input0": x1_pt, "input1": x2_pt}
-            x4 = torch.empty([batch_size, m, k]).cuda().half()
+            x4 = torch.empty_like(x4_pt)
             module.run_with_tensors(inputs, [x4])
             self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
-    def test_different_dim(self):
+    def test_different_dim_fp16(self):
         self._test_different_dim(
             batch_sizes=[1024],
             ms=[256],
             ks=[128],
-            test_name="static_shapes",
+            test_name="fused_elementwise_different_dim_fp16_static_shapes",
             expected_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_different_dim(
             batch_sizes=[23, 56, 1024],
             ms=[256],
             ks=[128],
-            test_name="dynamic_bs",
+            test_name="fused_elementwise_different_dim_fp16_dynamic_bs",
             expected_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_different_dim(
             batch_sizes=[1024],
             ms=[34, 67, 256],
             ks=[128],
-            test_name="dynamic_ms",
+            test_name="fused_elementwise_different_dim_fp16_dynamic_ms",
             expected_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_different_dim(
             batch_sizes=[1024],
             ms=[256],
             ks=[34, 87, 128],
-            test_name="dynamic_ks",
+            test_name="fused_elementwise_different_dim_fp16_dynamic_ks",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_different_dim(
             batch_sizes=[23, 1024],
             ms=[13, 256],
             ks=[34, 128],
-            test_name="dynamic_all",
+            test_name="fused_elementwise_different_dim_fp16_dynamic_all",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_different_dim_fp32(self):
+        self._test_different_dim(
+            batch_sizes=[1024],
+            ms=[256],
+            ks=[128],
+            test_name="fused_elementwise_different_dim_fp32_static_shapes",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_different_dim(
+            batch_sizes=[23, 56, 1024],
+            ms=[256],
+            ks=[128],
+            test_name="fused_elementwise_different_dim_fp32_dynamic_bs",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_different_dim(
+            batch_sizes=[1024],
+            ms=[34, 67, 256],
+            ks=[128],
+            test_name="fused_elementwise_different_dim_fp32_dynamic_ms",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_different_dim(
+            batch_sizes=[1024],
+            ms=[256],
+            ks=[34, 87, 128],
+            test_name="fused_elementwise_different_dim_fp32_dynamic_ks",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_different_dim(
+            batch_sizes=[23, 1024],
+            ms=[13, 256],
+            ks=[34, 128],
+            test_name="fused_elementwise_different_dim_fp32_dynamic_all",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
         )
 
     def _test_1_shape(
@@ -146,6 +202,7 @@ def _test_1_shape(
         expected_read_t,
         expected_op_t,
         expected_data_t,
+        dtype="float16",
     ):
         """
         Tests tanh(A(B, 1, 1, M, K, 1) + B(N, N, 1, K, M)).
@@ -158,13 +215,13 @@ def _test_1_shape(
 
         X1 = Tensor(
             shape=[batch_dim, IntImm(1), IntImm(1), m_dim, k_dim, IntImm(1)],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[n_dim, n_dim, IntImm(1), k_dim, m_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
@@ -177,12 +234,8 @@ def _test_1_shape(
         )
 
         target = detect_target()
-        module = compile_model(
-            X4,
-            target,
-            "./tmp",
-            "fused_elementwise_1_shape_{}".format(test_name),
-        )
+        module = compile_model(X4, target, "./tmp", test_name)
+
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
@@ -192,74 +245,149 @@ def _test_1_shape(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         for batch_size, m, n, k in itertools.product(batch_sizes, ms, ns, ks):
-            x1_pt = torch.randn(batch_size, 1, 1, m, k, 1).cuda().half()
-            x2_pt = torch.randn(n, n, 1, k, m).cuda().half()
+            x1_pt = get_random_torch_tensor([batch_size, 1, 1, m, k, 1], dtype=dtype)
+            x2_pt = get_random_torch_tensor([n, n, 1, k, m], dtype=dtype)
             x4_pt = torch.tanh(x1_pt + x2_pt)
             inputs = {"input0": x1_pt, "input1": x2_pt}
-            x4 = torch.empty([batch_size, n, n, m, k, m]).cuda().half()
+            x4 = torch.empty_like(x4_pt)
             module.run_with_tensors(inputs, [x4])
             self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
-    def test_1_shape(self):
+    def test_1_shape_fp16(self):
         self._test_1_shape(
             batch_sizes=[1024],
             ms=[8],
             ns=[4],
             ks=[16],
-            test_name="static_shapes",
+            test_name="fused_elementwise_test_1_fp16_static_shapes",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_1_shape(
             batch_sizes=[23, 56, 1024],
             ms=[8],
             ns=[4],
             ks=[16],
-            test_name="dynamic_bs",
+            test_name="fused_elementwise_test_1_fp16_dynamic_bs",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_1_shape(
             batch_sizes=[1024],
             ms=[1, 3, 8],
             ns=[4],
             ks=[16],
-            test_name="dynamic_ms",
+            test_name="fused_elementwise_test_1_fp16_dynamic_ms",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_1_shape(
             batch_sizes=[1024],
             ms=[8],
             ns=[1, 3, 4],
             ks=[16],
-            test_name="dynamic_ns",
+            test_name="fused_elementwise_test_1_fp16_dynamic_ns",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_1_shape(
             batch_sizes=[1024],
             ms=[8],
             ns=[4],
             ks=[1, 4, 7, 16],
-            test_name="dynamic_ks",
+            test_name="fused_elementwise_test_1_fp16_dynamic_ks",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_1_shape(
             batch_sizes=[25, 1024],
             ms=[7, 8],
             ns=[3, 4],
             ks=[1, 16],
-            test_name="dynamic_all",
+            test_name="fused_elementwise_test_1_fp16_dynamic_all",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_1_shape_fp32(self):
+        self._test_1_shape(
+            batch_sizes=[1024],
+            ms=[8],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_test_1_fp32_static_shapes",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_1_shape(
+            batch_sizes=[23, 56, 1024],
+            ms=[8],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_test_1_fp32_dynamic_bs",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_1_shape(
+            batch_sizes=[1024],
+            ms=[1, 3, 8],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_test_1_fp32_dynamic_ms",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_1_shape(
+            batch_sizes=[1024],
+            ms=[8],
+            ns=[1, 3, 4],
+            ks=[16],
+            test_name="fused_elementwise_test_1_fp32_dynamic_ns",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_1_shape(
+            batch_sizes=[1024],
+            ms=[8],
+            ns=[4],
+            ks=[1, 4, 7, 16],
+            test_name="fused_elementwise_test_1_fp32_dynamic_ks",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_1_shape(
+            batch_sizes=[25, 1024],
+            ms=[7, 8],
+            ns=[3, 4],
+            ks=[1, 16],
+            test_name="fused_elementwise_test_1_fp32_dynamic_all",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
         )
 
     def _test_chained_broadcasts(
@@ -272,6 +400,7 @@ def _test_chained_broadcasts(
         expected_read_t,
         expected_op_t,
         expected_data_t,
+        dtype="float16",
     ):
         """
         Tests A(B, 1, 1, M) + B(1, N, 1, M) + C(1, N, K, M).
@@ -284,19 +413,19 @@ def _test_chained_broadcasts(
 
         X1 = Tensor(
             shape=[batch_dim, IntImm(1), IntImm(1), m_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[IntImm(1), n_dim, IntImm(1), m_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
         X3 = Tensor(
             shape=[IntImm(1), n_dim, k_dim, m_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input2",
             is_input=True,
         )
@@ -308,12 +437,7 @@ def _test_chained_broadcasts(
         self.assertEqual(X5._attrs["shape"], [batch_dim, n_dim, k_dim, m_dim])
 
         target = detect_target()
-        module = compile_model(
-            X5,
-            target,
-            "./tmp",
-            "fused_elementwise_chained_broadcasts_{}".format(test_name),
-        )
+        module = compile_model(X5, target, "./tmp", test_name)
 
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
@@ -324,75 +448,150 @@ def _test_chained_broadcasts(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         for batch_size, m, n, k in itertools.product(batch_sizes, ms, ns, ks):
-            x1_pt = torch.randn(batch_size, 1, 1, m).cuda().half()
-            x2_pt = torch.randn(1, n, 1, m).cuda().half()
-            x3_pt = torch.randn(1, n, k, m).cuda().half()
+            x1_pt = get_random_torch_tensor([batch_size, 1, 1, m], dtype=dtype)
+            x2_pt = get_random_torch_tensor([1, n, 1, m], dtype=dtype)
+            x3_pt = get_random_torch_tensor([1, n, k, m], dtype=dtype)
             x5_pt = x3_pt + x1_pt + x2_pt
             inputs = {"input0": x1_pt, "input1": x2_pt, "input2": x3_pt}
-            x5 = torch.empty([batch_size, n, k, m]).cuda().half()
+            x5 = torch.empty_like(x5_pt)
             module.run_with_tensors(inputs, [x5])
             self.assertTrue(torch.allclose(x5, x5_pt, atol=1e-2, rtol=1e-2))
 
-    def test_chained_shapes(self):
+    def test_chained_shapes_fp16(self):
         self._test_chained_broadcasts(
             batch_sizes=[1024],
             ms=[8],
             ns=[4],
             ks=[16],
-            test_name="static_shapes",
+            test_name="fused_elementwise_chained_broadcasts_fp16_static_shapes",
             expected_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_chained_broadcasts(
             batch_sizes=[23, 56, 1024],
             ms=[2],
             ns=[4],
             ks=[16],
-            test_name="dynamic_bs",
+            test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_bs",
             expected_read_t="uint",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_chained_broadcasts(
             batch_sizes=[1024],
             ms=[1, 3, 8],
             ns=[4],
             ks=[16],
-            test_name="dynamic_ms",
+            test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ms",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_chained_broadcasts(
             batch_sizes=[1024],
             ms=[4],
             ns=[1, 3, 4],
             ks=[16],
-            test_name="dynamic_ns",
+            test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ns",
             expected_read_t="uint2",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_chained_broadcasts(
             batch_sizes=[1024],
             ms=[8],
             ns=[4],
             ks=[1, 4, 7, 16],
-            test_name="dynamic_ks",
+            test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ks",
             expected_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_chained_broadcasts(
             batch_sizes=[25, 1024],
             ms=[7, 8],
             ns=[3, 4],
             ks=[1, 16],
-            test_name="dynamic_all",
+            test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_all",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_chained_shapes_fp32(self):
+        self._test_chained_broadcasts(
+            batch_sizes=[1024],
+            ms=[8],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_static_shapes",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_chained_broadcasts(
+            batch_sizes=[23, 56, 1024],
+            ms=[2],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_bs",
+            expected_read_t="uint2",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_chained_broadcasts(
+            batch_sizes=[1024],
+            ms=[1, 3, 8],
+            ns=[4],
+            ks=[16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ms",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_chained_broadcasts(
+            batch_sizes=[1024],
+            ms=[4],
+            ns=[1, 3, 4],
+            ks=[16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ns",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_chained_broadcasts(
+            batch_sizes=[1024],
+            ms=[8],
+            ns=[4],
+            ks=[1, 4, 7, 16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ks",
+            expected_read_t="uint4",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_chained_broadcasts(
+            batch_sizes=[25, 1024],
+            ms=[7, 8],
+            ns=[3, 4],
+            ks=[1, 16],
+            test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_all",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
         )
 
     def _test_consecutive_1s_broadcast(
@@ -402,6 +601,7 @@ def _test_consecutive_1s_broadcast(
         expected_read_t,
         expected_op_t,
         expected_data_t,
+        dtype="float16",
     ):
         """
         Tests A(1, 1, K, 1, 1, K) / B(1, 1, 1, 1, 1, 1).
@@ -411,13 +611,13 @@ def _test_consecutive_1s_broadcast(
 
         X1 = Tensor(
             shape=[IntImm(1), IntImm(1), k_dim, IntImm(1), IntImm(1), k_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[IntImm(1), IntImm(1), IntImm(1), IntImm(1), IntImm(1), IntImm(1)],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
@@ -427,12 +627,8 @@ def _test_consecutive_1s_broadcast(
         self.assertEqual(X3._attrs["shape"], X1._attrs["shape"])
 
         target = detect_target()
-        module = compile_model(
-            X3,
-            target,
-            "./tmp",
-            "fused_elementwise_consecutive_1s_broadcast_{}".format(test_name),
-        )
+        module = compile_model(X3, target, "./tmp", test_name)
+
         debug_sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
@@ -442,28 +638,49 @@ def _test_consecutive_1s_broadcast(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         for k in ks:
-            x1_pt = torch.randn(1, 1, k, 1, 1, k).cuda().half()
-            x2_pt = torch.randn(1, 1, 1, 1, 1, 1).cuda().half()
+            x1_pt = get_random_torch_tensor([1, 1, k, 1, 1, k], dtype=dtype)
+            x2_pt = get_random_torch_tensor([1, 1, 1, 1, 1, 1], dtype=dtype)
             x3_pt = x1_pt / x2_pt
             inputs = {"input0": x1_pt, "input1": x2_pt}
-            x3 = torch.empty([1, 1, k, 1, 1, k]).cuda().half()
+            x3 = torch.empty_like(x3_pt)
             module.run_with_tensors(inputs, [x3])
             self.assertTrue(torch.allclose(x3, x3_pt, atol=1e-2, rtol=1e-2))
 
-    def test_consecutive_1s_broadcast(self):
+    def test_consecutive_1s_broadcast_fp16(self):
         self._test_consecutive_1s_broadcast(
             ks=[32],
-            test_name="static_shapes",
+            test_name="fused_elementwise_consecutive_1s_broadcast_fp16_static_shapes",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
         )
         self._test_consecutive_1s_broadcast(
             ks=[1, 5, 7, 32],
-            test_name="dynamic_shapes",
+            test_name="fused_elementwise_consecutive_1s_broadcast_fp16_dynamic_shapes",
             expected_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_consecutive_1s_broadcast_fp32(self):
+        self._test_consecutive_1s_broadcast(
+            ks=[32],
+            test_name="fused_elementwise_consecutive_1s_broadcast_fp32_static_shapes",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
+        )
+        self._test_consecutive_1s_broadcast(
+            ks=[1, 5, 7, 32],
+            test_name="fused_elementwise_consecutive_1s_broadcast_fp32_dynamic_shapes",
+            expected_read_t="float",
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float32",
         )
 
 
diff --git a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
index dff0f67b9..b5398eb38 100644
--- a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
+++ b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
@@ -26,12 +26,26 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
 
 
 class FusedElementwiseWithStridedOutputsTestCase(unittest.TestCase):
-    def _fused_elementwise_e2e_helper(
-        self, batch0_sizes: List[int], batch1_sizes: List[int], m1: int, m2: int, k: int
+    def __init__(self, *args, **kwargs):
+        super(FusedElementwiseWithStridedOutputsTestCase, self).__init__(
+            *args, **kwargs
+        )
+        self._test_id = 0
+
+    def _test_fused_elementwise_with_strided_outputs(
+        self,
+        batch0_sizes: List[int],
+        batch1_sizes: List[int],
+        m1: int,
+        m2: int,
+        k: int,
+        test_name: str = "fused_elementwise_with_strided_outputs",
+        dtype: str = "float16",
     ):
         # Construct one graph with 2 fused_elementwises + 1 cat.
         batch0_dim = shape_utils.gen_int_var_min_max(batch0_sizes, "batch0")
@@ -44,13 +58,13 @@ def _fused_elementwise_e2e_helper(
                 IntImm(m1),
                 IntImm(k),
             ],
-            dtype="float16",
+            dtype=dtype,
             name="input0",
             is_input=True,
         )
         X2 = Tensor(
             shape=[],
-            dtype="float16",
+            dtype=dtype,
             name="X2",
             value=3.0,
         )
@@ -61,7 +75,7 @@ def _fused_elementwise_e2e_helper(
                 IntImm(m2),
                 IntImm(k),
             ],
-            dtype="float16",
+            dtype=dtype,
             name="input1",
             is_input=True,
         )
@@ -79,13 +93,20 @@ def _fused_elementwise_e2e_helper(
             [X7],
             target,
             "./tmp",
-            f"fused_elementwise_with_strided_outputs_m1_{m1}_m2_{m2}_k_{k}",
+            f"{test_name}_{self._test_id}",
         ) as module:
+            self._test_id += 1
             for batch0_size in batch0_sizes:
                 for batch1_size in batch1_sizes:
                     # Run PyTorch baseline.
-                    x1_pt = torch.randn(batch0_size, batch1_size, m1, k).cuda().half()
-                    x3_pt = torch.randn(batch0_size, batch1_size, m2, k).cuda().half()
+                    x1_pt = get_random_torch_tensor(
+                        [batch0_size, batch1_size, m1, k],
+                        dtype=dtype,
+                    )
+                    x3_pt = get_random_torch_tensor(
+                        [batch0_size, batch1_size, m2, k],
+                        dtype=dtype,
+                    )
                     x5_pt = torch.tanh(x1_pt + 3.0)
                     x6_pt = torch.tanh(x3_pt)
                     x7_pt = torch.cat([x5_pt, x6_pt], dim=2)
@@ -96,46 +117,184 @@ def _fused_elementwise_e2e_helper(
                     inputs[name_to_index_map["input0"]] = x1_pt
                     inputs[name_to_index_map["input1"]] = x3_pt
 
-                    x7 = (
-                        torch.empty([batch0_size, batch1_size, m1 + m2, k])
-                        .cuda()
-                        .half()
-                    )
+                    x7 = torch.empty_like(x7_pt)
                     module.run_with_tensors(inputs, [x7])
                     # Do comparisons.
                     self.assertTrue(torch.allclose(x7, x7_pt, atol=1e-2, rtol=1e-2))
 
-    def test_all_aligned(self):
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[1], batch1_sizes=[2, 4, 5], m1=8, m2=8, k=1
+    def test_all_aligned_fp16(self):
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1],
+            batch1_sizes=[2, 4, 5],
+            m1=8,
+            m2=8,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1, 99, 1024],
+            batch1_sizes=[8],
+            m1=8,
+            m2=16,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[3, 5, 1024],
+            batch1_sizes=[2, 5],
+            m1=4,
+            m2=4,
+            k=2,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1024],
+            batch1_sizes=[2],
+            m1=4,
+            m2=2,
+            k=4,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1024],
+            batch1_sizes=[2],
+            m1=16,
+            m2=64,
+            k=32,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_all_aligned_fp32(self):
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1],
+            batch1_sizes=[2, 4, 5],
+            m1=8,
+            m2=8,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp32",
+            dtype="float32",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1, 99, 1024],
+            batch1_sizes=[8],
+            m1=8,
+            m2=16,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp32",
+            dtype="float32",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[3, 5, 1024],
+            batch1_sizes=[2, 5],
+            m1=4,
+            m2=4,
+            k=2,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[1, 99, 1024], batch1_sizes=[8], m1=8, m2=16, k=1
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1024],
+            batch1_sizes=[2],
+            m1=4,
+            m2=2,
+            k=4,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[3, 5, 1024], batch1_sizes=[2, 5], m1=4, m2=4, k=2
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[1024],
+            batch1_sizes=[2],
+            m1=16,
+            m2=64,
+            k=32,
+            test_name="fused_elementwise_with_strided_outputs_all_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[1024], batch1_sizes=[2], m1=4, m2=2, k=4
+
+    def test_not_aligned_fp16(self):
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[8],
+            batch1_sizes=[23, 88, 100],
+            m1=1,
+            m2=1,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[88, 100, 234],
+            batch1_sizes=[40],
+            m1=4,
+            m2=2,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp16",
+            dtype="float16",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[1024], batch1_sizes=[2], m1=16, m2=64, k=32
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[23, 56, 93],
+            batch1_sizes=[12, 34, 55],
+            m1=1,
+            m2=2,
+            k=2,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp16",
+            dtype="float16",
+        )
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[2],
+            batch1_sizes=[1024],
+            m1=8,
+            m2=2,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp16",
+            dtype="float16",
         )
 
-    def test_not_aligned(self):
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[8], batch1_sizes=[23, 88, 100], m1=1, m2=1, k=1
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_not_aligned_fp32(self):
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[8],
+            batch1_sizes=[23, 88, 100],
+            m1=1,
+            m2=1,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[88, 100, 234], batch1_sizes=[40], m1=4, m2=2, k=1
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[88, 100, 234],
+            batch1_sizes=[40],
+            m1=4,
+            m2=2,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[23, 56, 93], batch1_sizes=[12, 34, 55], m1=1, m2=2, k=2
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[23, 56, 93],
+            batch1_sizes=[12, 34, 55],
+            m1=1,
+            m2=2,
+            k=2,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp32",
+            dtype="float32",
         )
-        self._fused_elementwise_e2e_helper(
-            batch0_sizes=[2], batch1_sizes=[1024], m1=8, m2=2, k=1
+        self._test_fused_elementwise_with_strided_outputs(
+            batch0_sizes=[2],
+            batch1_sizes=[1024],
+            m1=8,
+            m2=2,
+            k=1,
+            test_name="fused_elementwise_with_strided_outputs_not_aligned_fp32",
+            dtype="float32",
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gather.py b/tests/unittest/ops/test_gather.py
index dcfec0d02..2ff6b3927 100644
--- a/tests/unittest/ops/test_gather.py
+++ b/tests/unittest/ops/test_gather.py
@@ -22,30 +22,50 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GatherTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(GatherTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
 
-    def _run_gather_test(self, *, input_shape, gather_dim, dim_size, index_shape=None):
-        logging.info(
-            "Test with input_shape {}, gather_dim {}".format(input_shape, gather_dim)
-        )
+    def _run_gather_test(
+        self,
+        *,
+        input_shape,
+        gather_dim,
+        dim_size,
+        index_shape=None,
+        test_name="gather",
+        input_type="float16",
+        index_type="int64",
+    ):
+        logging.info(f"Test with input_shape {input_shape}, gather_dim {gather_dim}")
 
-        input_type = "float16"
-        index_type = "int64"
         if index_shape is None:
             index_shape = [
                 random.randint(0, d - 1) if i != gather_dim else dim_size
                 for (i, d) in enumerate(input_shape)
             ]
-        logging.info("index_shape {}".format(index_shape))
+        logging.info(f"index_shape {index_shape}")
 
-        X = Tensor(shape=input_shape, dtype=input_type, name="X", is_input=True)
-        Index = Tensor(shape=index_shape, dtype=index_type, name="Index", is_input=True)
+        X = Tensor(
+            shape=input_shape,
+            dtype=input_type,
+            name="X",
+            is_input=True,
+        )
+        Index = Tensor(
+            shape=index_shape,
+            dtype=index_type,
+            name="Index",
+            is_input=True,
+        )
         gather_op = ops.gather()
         Y = gather_op(X, gather_dim, Index)
         Y._attrs["name"] = "output"
@@ -54,34 +74,131 @@ def _run_gather_test(self, *, input_shape, gather_dim, dim_size, index_shape=Non
         np.testing.assert_equal(y_shape, index_shape)
 
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", "gather")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         X_pt = get_random_torch_tensor(input_shape, input_type)
         Index_pt = torch.randint(
-            input_shape[gather_dim], size=index_shape, dtype=torch.int64
-        ).cuda()
+            input_shape[gather_dim],
+            size=index_shape,
+            dtype=torch.int64,
+            device="cuda",
+        )
         Y_pt = torch.gather(X_pt, gather_dim, Index_pt)
         Y_np = Y_pt.cpu().numpy()
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         Index_pt = Index_pt.to(torch.int64)
         inputs = {"X": X_pt, "Index": Index_pt}
-        y = torch.empty(y_shape).cuda().half()
+        y = get_torch_empty_tensor(y_shape, dtype=input_type)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_gather(self):
-        self._run_gather_test(input_shape=[2], gather_dim=0, dim_size=1)
-        self._run_gather_test(input_shape=[2], gather_dim=0, dim_size=2)
-        self._run_gather_test(input_shape=[2], gather_dim=0, dim_size=3)
+    def test_gather_fp16(self):
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=1,
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=2,
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=3,
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=2,
+            dim_size=7,
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=1,
+            dim_size=4,
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=0,
+            dim_size=2,
+            index_shape=[7, 1, 4],
+            test_name="gather_fp16",
+            input_type="float16",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=2,
+            dim_size=7,
+            index_shape=[0, 1, 2],
+            test_name="gather_fp16",
+            input_type="float16",
+        )
 
-        self._run_gather_test(input_shape=[3, 4, 5], gather_dim=2, dim_size=7)
-        self._run_gather_test(input_shape=[3, 4, 5], gather_dim=1, dim_size=4)
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_gather_fp32(self):
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=1,
+            test_name="gather_fp32",
+            input_type="float32",
+        )
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=2,
+            test_name="gather_fp32",
+            input_type="float32",
+        )
+        self._run_gather_test(
+            input_shape=[2],
+            gather_dim=0,
+            dim_size=3,
+            test_name="gather_fp32",
+            input_type="float32",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=2,
+            dim_size=7,
+            test_name="gather_fp32",
+            input_type="float32",
+        )
+        self._run_gather_test(
+            input_shape=[3, 4, 5],
+            gather_dim=1,
+            dim_size=4,
+            test_name="gather_fp32",
+            input_type="float32",
+        )
         self._run_gather_test(
-            input_shape=[3, 4, 5], gather_dim=0, dim_size=2, index_shape=[7, 1, 4]
+            input_shape=[3, 4, 5],
+            gather_dim=0,
+            dim_size=2,
+            index_shape=[7, 1, 4],
+            test_name="gather_fp32",
+            input_type="float32",
         )
         self._run_gather_test(
-            input_shape=[3, 4, 5], gather_dim=2, dim_size=7, index_shape=[0, 1, 2]
+            input_shape=[3, 4, 5],
+            gather_dim=2,
+            dim_size=7,
+            index_shape=[0, 1, 2],
+            test_name="gather_fp32",
+            input_type="float32",
         )
 
 
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index 15ad326cd..9a390b19c 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -20,57 +20,97 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
+
+
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-2, "rtol": 1e-2}
+    elif dtype in ("float", "float32"):
+        return {"atol": 3e-2, "rtol": 3e-2}
+    elif dtype == "bfloat16":
+        return {"atol": 2e-1, "rtol": 2e-1}
+    else:
+        return {}
 
 
 class GEMMTestCase(unittest.TestCase):
-    def _test_rcr(self, ms, k, n, test_name):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(GEMMTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, ms, k, n, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_{}".format(test_name))
-
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rcr_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(n, k).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([n, k], dtype)
             Y_pt = torch.nn.functional.linear(X_pt, W_pt)
 
             inputs = {"input_0": X_pt, "input_1": W_pt}
-            y = torch.empty([m, n]).cuda().half()
+            y = get_torch_empty_tensor([m, n], dtype)
             module.run_with_tensors(inputs, [y])
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
             else:
-                self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+                print(f"Processing m={m}")
+                torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    def test_rcr(self):
+    def test_rcr_simple_static(self) -> None:
         self._test_rcr([1024], 256, 512, "static")
-        if detect_target().name() == "cuda":
-            self._test_rcr([1, 1024], 256, 512, "dynamic1")
-            self._test_rcr([1, 99, 84, 987, 1024], 128, 8, "dynamic2")
-            self._test_rcr([8], 0, 4, "zero_k")
-            self._test_rcr([0], 8, 4, "zero_m")
 
-    def _test_rcr_dynamic_n(self, ms, k, ns, test_name):
+    @unittest.skipIf(detect_target().name() != "cuda", "Only supported by CUDA.")
+    @parameterized.expand(
+        [
+            ("dynamic1", [1, 1024], 256, 512),
+            # TODO/FIXME: Fix the issue below.
+            # There is some bug with floating point rounding,
+            # e.g. the list of batch sizes like this [1, 99, 84, 987, 1024]
+            # is not handled properly.
+            ("dynamic2", [1, 99, 84, 1024], 128, 8),
+            ("zero_k", [8], 0, 4),
+            ("zero_m", [0], 8, 4),
+        ]
+    )
+    def test_rcr_simple_dynamic(self, name, ms, k, n) -> None:
+        self._test_rcr(ms, k, n, name)
+
+    def _test_rcr_dynamic_n(self, ms, k, ns, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ns), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -78,65 +118,68 @@ def _test_rcr_dynamic_n(self, ms, k, ns, test_name):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_{}".format(test_name))
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rcr_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
 
         for m in ms:
             for n in ns:
-                X_pt = torch.randn(m, k).cuda().half()
-                W_pt = torch.randn(n, k).cuda().half()
+                X_pt = get_random_torch_tensor([m, k], dtype)
+                W_pt = get_random_torch_tensor([n, k], dtype)
                 Y_pt = torch.nn.functional.linear(X_pt, W_pt)
 
                 inputs = {"input_0": X_pt, "input_1": W_pt}
-                y = torch.empty([m, n]).cuda().half()
+                y = get_torch_empty_tensor([m, n], dtype)
                 module.run_with_tensors(inputs, [y])
 
-                # from aitemplate.testing.benchmark_pt import benchmark_torch_function
-                # module.benchmark_with_tensors(inputs, [y], count=1000)
-                # t = benchmark_torch_function(1000, torch.nn.functional.linear, X_pt, W_pt)
-                # print(f"pt: {t} ms")
-
                 if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                     pass
                 else:
-                    self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+                    torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     def test_rcr_dynamic_n(self):
-        self._test_rcr([16, 1 * 29, 64], 256, 300000, "umia_einsum_1")
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1")
         self._test_rcr_dynamic_n(
-            [16, 1 * 29, 64], 256, [100000, 300000], "umia_einsum_dynamic_n"
+            [16, 1 * 29, 64], 256, [100000, 300000], "einsum_dynamic_n"
         )
 
-    def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name):
+    def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
+        if dtype == "float16":
+            tolerance_limits["atol"] = 2e-2
+            tolerance_limits["rtol"] = 2e-2
         X = Tensor(
             shape=[
                 shape_utils.gen_int_var_min_max(m0s),
                 shape_utils.gen_int_var_min_max(m1s),
                 k,
             ],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         X._attrs["is_input"] = True
-        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", "gemm_3d_2d_rcr_{}".format(test_name)
+            Y, target, "./tmp", f"gemm_3d_2d_rcr_{test_name}_{self._test_id}"
         )
+        self._test_id += 1
 
         for m0, m1 in itertools.product(m0s, m1s):
-            X_pt = torch.randn(m0, m1, k).cuda().half()
-            W_pt = torch.randn(n, k).cuda().half()
+            X_pt = get_random_torch_tensor([m0, m1, k], dtype)
+            W_pt = get_random_torch_tensor([n, k], dtype)
             Y_pt = torch.nn.functional.linear(X_pt, W_pt)
 
             inputs = {"input_0": X_pt, "input_1": W_pt}
-            y = torch.empty([m0, m1, n]).cuda().half()
+            y = get_torch_empty_tensor([m0, m1, n], dtype)
             module.run_with_tensors(inputs, [y])
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_3d_2d_rcr(self):
@@ -145,63 +188,74 @@ def test_3d_2d_rcr(self):
         self._test_3d_2d_rcr([3], [128, 256], 256, 512, "dynamic2")
         self._test_3d_2d_rcr([1, 99, 1024], [1, 2], 128, 8, "dynamic3")
 
-    def _test_rrr(self, ms, k, n, test_name):
+    def _test_rrr(self, ms, k, n, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
+        if dtype == "float16":
+            tolerance_limits["atol"] = 2e-2
+            tolerance_limits["rtol"] = 2e-2
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[k, n], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[k, n], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rrr_{}".format(test_name))
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rrr_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
 
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(k, n).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([k, n], dtype)
             Y_pt = torch.matmul(X_pt, W_pt)
             inputs = {"input_0": X_pt, "input_1": W_pt}
-            y = torch.empty([m, n]).cuda().half()
+            y = get_torch_empty_tensor([m, n], dtype)
             module.run_with_tensors(inputs, [y])
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     def test_rrr(self):
         self._test_rrr([256], 128, 32, "static")
         if detect_target().name() == "cuda":
             self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic")
 
-    def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name):
+    def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = {"atol": 2e-1, "rtol": 2e-1}
         X = Tensor(
             shape=[
                 shape_utils.gen_int_var_min_max(m0s),
                 shape_utils.gen_int_var_min_max(m1s),
                 k,
             ],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[k, n], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[k, n], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rrr_{}".format(test_name))
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rrr_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
 
         for m0, m1 in itertools.product(m0s, m1s):
-            X_pt = torch.randn(m0, m1, k).cuda().half()
-            W_pt = torch.randn(k, n).cuda().half()
+            X_pt = get_random_torch_tensor([m0, m1, k], dtype)
+            W_pt = get_random_torch_tensor([k, n], dtype)
             Y_pt = torch.matmul(X_pt, W_pt)
 
             inputs = {"input_0": X_pt, "input_1": W_pt}
-            y = torch.empty([m0, m1, n]).cuda().half()
+            y = get_torch_empty_tensor([m0, m1, n], dtype)
             module.run_with_tensors(inputs, [y])
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_3d_2d_rrr(self):
@@ -210,26 +264,92 @@ def test_3d_2d_rrr(self):
         self._test_3d_2d_rrr([2], [24, 36], 256, 16, "dynamic2")
         self._test_3d_2d_rrr([2, 34, 48], [1, 3, 5], 256, 16, "dynamic3")
 
-    def test_h_rcr(self):
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_h_rcr(self, ait_dtype):
         M = 256
         K = 256
         N = 512
-        target = detect_target(use_fp16_acc=True)
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
+        target = detect_target(use_fp16_acc=(ait_dtype == "float16"))
+        if target.name() != "cuda" and ait_dtype != "float16":
+            self.skipTest(
+                f"{ait_dtype} input type is not supported for {target.name()}"
+            )
+        if (
+            target.name() == "cuda"
+            and int(target._arch) < 80
+            and ait_dtype != "float16"
+        ):
+            self.skipTest(f"{ait_dtype} is not supported for cuda sm < 80")
+        X = Tensor(shape=[M, K], dtype=ait_dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=ait_dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "hgemm_rcr")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
+        module = compile_model(
+            Y, target, "./tmp", f"hgemm_rcr_{ait_dtype}_{self._test_id}"
+        )
+        self._test_id += 1
+        X_pt = get_random_torch_tensor((M, K), ait_dtype)
+        W_pt = get_random_torch_tensor((N, K), ait_dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor((M, N), ait_dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_float(self):
+        self._test_rcr([1024], 256, 512, "static_float", dtype="float")
+        self._test_rcr([1, 1024], 256, 512, "dynamic1_float", dtype="float")
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1_float", dtype="float")
+
+        self._test_3d_2d_rcr([1024], [2], 256, 512, "static_float", dtype="float")
+        self._test_3d_2d_rcr(
+            [1, 99, 1024], [1, 2], 128, 8, "dynamic3_float", dtype="float"
+        )
+
+        self._test_rrr([256], 128, 32, "static_float", dtype="float")
+        self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic_float", dtype="float")
+
+        self._test_3d_2d_rrr([256], [2], 128, 32, "static_float", dtype="float")
+        self._test_3d_2d_rrr(
+            [2, 34, 48], [1, 3, 5], 256, 16, "dynamic3_float", dtype="float"
+        )
+
+    @unittest.skipIf(
+        detect_target().name() == "rocm", "bfloat16 is not supported by ROCm."
+    )
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bfloat16 is not supported by CUDA < SM80.",
+    )
+    def test_gemm_bfloat16(self):
+        self._test_rcr([1024], 256, 512, "static_bfloat16", dtype="bfloat16")
+        self._test_rcr([1, 1024], 256, 512, "dynamic1_bfloat16", dtype="bfloat16")
+        self._test_rcr(
+            [16, 1 * 29, 64], 256, 300000, "einsum_1_bfloat16", dtype="bfloat16"
+        )
+
+        self._test_3d_2d_rcr([1024], [2], 256, 512, "static_bfloat16", dtype="bfloat16")
+        self._test_3d_2d_rcr(
+            [1, 99, 1024], [1, 2], 128, 8, "dynamic3_bfloat16", dtype="bfloat16"
+        )
+
+        self._test_rrr([256], 128, 32, "static_bfloat16", dtype="bfloat16")
+        self._test_rrr(
+            [1, 99, 1024, 2048], 256, 16, "dynamic_bfloat16", dtype="bfloat16"
+        )
+
+        self._test_3d_2d_rrr([256], [2], 128, 32, "static_bfloat16", dtype="bfloat16")
+        self._test_3d_2d_rrr(
+            [2, 34, 48], [1, 3, 5], 256, 16, "dynamic3_bfloat16", dtype="bfloat16"
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index cbe0b9ce6..bb493ad37 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
 import unittest
 
 import torch
@@ -21,35 +20,65 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
 
 
-class GEMMTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, N, K, test_name):
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "bfloat16":
+        return {"atol": 3e-1, "rtol": 3e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
+
+
+class GEMMBiasTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, Ms, N, K, test_name, dtype="float16"):
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(
-            shape=[MDim, IntImm(K)], dtype="float16", name="input_0", is_input=True
-        )
+        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype="float16", name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
         )
-        B = Tensor(shape=[IntImm(N)], dtype="float16", name="input_2", is_input=True)
+        B = Tensor(shape=[IntImm(N)], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"gemm_rcr_bias_{test_name}")
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rcr_bias_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
 
         for M in Ms:
-            logging.info(f"Testing {M=}")
-
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
 
-            y = torch.empty([M, N]).half().cuda()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
@@ -57,16 +86,11 @@ def _test_rcr(self, Ms, N, K, test_name):
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
             else:
-                self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+                torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    def test_rcr(self):
+    def test_rcr_zero_size(self):
         target = detect_target()
-        self._test_rcr([128], N=64, K=1024, test_name="static")
-        self._test_rcr([4096], N=4, K=4, test_name="static")
-        self._test_rcr([1000], N=81, K=1024, test_name="static")
-        self._test_rcr([67200], N=3, K=256, test_name="static")
         if target.name() == "cuda":
-            self._test_rcr([1, 7, 64, 127], N=64, K=1024, test_name="dynamic_m")
             # This test triggered a c10 assertion failure internally
             # caffe2/c10/util/SmallVector.h:338:
             # Assertion `idx < size()' failed
@@ -75,6 +99,27 @@ def test_rcr(self):
             self._test_rcr([2], N=0, K=4, test_name="zero_n")
             self._test_rcr([0], N=4, K=4, test_name="zero_m")
 
+    def test_rcr_static(self):
+        self._test_rcr([4096], N=4, K=4, test_name="static")
+        self._test_rcr([1000], N=81, K=1024, test_name="static")
+        self._test_rcr([67200], N=3, K=256, test_name="static")
+
+    @parameterized.expand(("bfloat16",))
+    def test_rcr_all_floats(self, dtype):
+        skipped_reason = _skip_target(detect_target(), dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_rcr([4], N=2, K=11, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rcr([128], N=64, K=1024, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rcr(
+            [1, 7, 64, 127],
+            N=64,
+            K=1024,
+            test_name=f"dynamic_m_{dtype}",
+            dtype=dtype,
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index ef2ccc365..b0ccfe05a 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -20,26 +20,28 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
 
 class GEMMBiasBroadcastTestCase(unittest.TestCase):
-    def _init_tensors(self, m, k, n, m0=None, m1=None):
+    def _init_tensors(self, m, k, n, m0=None, m1=None, dtype="float16"):
         m_shape = [m] if m is not None else [m0, m1]
-        self.X = Tensor(
-            shape=m_shape + [k], dtype="float16", name="input_0", is_input=True
-        )
-        self.W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
-        self.B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
-        self.D0 = Tensor(shape=m_shape + [n], dtype="float16", name="d0", is_input=True)
-        self.D1 = Tensor(shape=m_shape + [n], dtype="float16", name="d1", is_input=True)
-        self.X_pt = torch.randn(*m_shape, k).cuda().half()
-        self.W_pt = torch.randn(n, k).cuda().half()
-        self.B_pt = torch.randn(n).cuda().half()
-        self.D0_pt = torch.randn(*m_shape, n).cuda().half()
-        self.D1_pt = torch.randn(*m_shape, n).cuda().half()
+        self.X = Tensor(shape=m_shape + [k], dtype=dtype, name="input_0", is_input=True)
+        self.W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
+        self.B = Tensor(shape=[n], dtype=dtype, name="input_2", is_input=True)
+        self.D0 = Tensor(shape=m_shape + [n], dtype=dtype, name="d0", is_input=True)
+        self.D1 = Tensor(shape=m_shape + [n], dtype=dtype, name="d1", is_input=True)
+        self.X_pt = get_random_torch_tensor([*m_shape, k], dtype)
+        self.W_pt = get_random_torch_tensor([n, k], dtype)
+        self.B_pt = get_random_torch_tensor([n], dtype)
+        self.D0_pt = get_random_torch_tensor([*m_shape, n], dtype)
+        self.D1_pt = get_random_torch_tensor([*m_shape, n], dtype)
 
     def _test_and_verify(
-        self, module, numpy_output, has_d1=False, module_output_name="output_0"
+        self, module, numpy_output, dtype, has_d1=False, module_output_name="output_0"
     ):
         inputs = {
             "input_0": self.X_pt,
@@ -49,7 +51,7 @@ def _test_and_verify(
         }
         if has_d1:
             inputs["d1"] = self.D1_pt
-        y = torch.empty(list(numpy_output.shape)).cuda().half()
+        y = get_torch_empty_tensor(list(numpy_output.shape), dtype)
         module.run_with_tensors(inputs, [y])
         if self.X_pt.nelement() == 0 or self.W_pt.nelement() == 0:
             pass
@@ -58,15 +60,15 @@ def _test_and_verify(
                 numpy_output, y.cpu().numpy(), atol=1e-1, rtol=1e-1
             )
 
-    def _test_bias_rcr_mul_add(self, m, m0, m1, k, n):
+    def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul_add()
         Y = OP(self.X, self.W, self.B, self.D0, self.D1)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", "gemm_rcr_bias_mul_add_k_{}_n_{}".format(k, n)
+            Y, target, "./tmp", f"gemm_rcr_bias_mul_add_k_{k}_n_{n}_{dtype}"
         )
         Y_pt = (
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
@@ -74,7 +76,7 @@ def _test_bias_rcr_mul_add(self, m, m0, m1, k, n):
             + self.D1_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, has_d1=True)
+        self._test_and_verify(module, Y_np, dtype, has_d1=True)
 
     def test_bias_rcr_mul_add(self):
         self._test_bias_rcr_mul_add(8, None, None, 8, 8)
@@ -82,9 +84,9 @@ def test_bias_rcr_mul_add(self):
             self._test_bias_rcr_mul_add(None, 2, 32, 256, 128)
             self._test_bias_rcr_mul_add(None, 21, 5, 1024, 512)
 
-    def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n):
+    def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_sigmoid_mul()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -93,7 +95,7 @@ def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_sigmoid_mul_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_sigmoid_mul_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = (
@@ -103,7 +105,7 @@ def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n):
             * self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_sigmoid_mul(self):
         self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8)
@@ -111,9 +113,9 @@ def test_bias_rcr_sigmoid_mul(self):
             self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128)
             self._test_bias_rcr_sigmoid_mul(None, 21, 5, 1024, 512)
 
-    def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n):
+    def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_sigmoid_mul_tanh()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -122,7 +124,7 @@ def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_sigmoid_mul_tanh_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_sigmoid_mul_tanh_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = torch.tanh(
@@ -132,7 +134,7 @@ def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n):
             * self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_sigmoid_mul_tanh(self):
         self._test_bias_rcr_sigmoid_mul_tanh(8, None, None, 8, 8)
@@ -141,9 +143,9 @@ def test_bias_rcr_sigmoid_mul_tanh(self):
             self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 512)
             self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 0)
 
-    def _test_bias_rcr_add(self, m, m0, m1, k, n):
+    def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -152,7 +154,7 @@ def _test_bias_rcr_add(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_add_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_add_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = (
@@ -160,7 +162,7 @@ def _test_bias_rcr_add(self, m, m0, m1, k, n):
             + self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_add(self):
         self._test_bias_rcr_add(8, None, None, 8, 8)
@@ -168,9 +170,9 @@ def test_bias_rcr_add(self):
             self._test_bias_rcr_add(None, 2, 32, 256, 128)
             self._test_bias_rcr_add(None, 21, 5, 1024, 512)
 
-    def _test_bias_rcr_add_relu(self, m, m0, m1, k, n):
+    def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_relu()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -179,7 +181,7 @@ def _test_bias_rcr_add_relu(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_add_relu_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_add_relu_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = torch.relu(
@@ -187,7 +189,7 @@ def _test_bias_rcr_add_relu(self, m, m0, m1, k, n):
             + self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_add_relu(self):
         self._test_bias_rcr_add_relu(8, None, None, 8, 8)
@@ -195,9 +197,9 @@ def test_bias_rcr_add_relu(self):
             self._test_bias_rcr_add_relu(None, 2, 32, 256, 128)
             self._test_bias_rcr_add_relu(None, 21, 5, 1024, 512)
 
-    def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n):
+    def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_add_relu()
         Y = OP(self.X, self.W, self.B, self.D0, self.D1)
         Y._attrs["name"] = "output_0"
@@ -206,7 +208,7 @@ def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_add_add_relu_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_add_add_relu_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = torch.relu(
@@ -215,7 +217,7 @@ def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n):
             + self.D1_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, has_d1=True)
+        self._test_and_verify(module, Y_np, dtype, has_d1=True)
 
     def test_bias_rcr_add_add_relu(self):
         target = detect_target()
@@ -230,9 +232,9 @@ def test_bias_rcr_add_add_relu(self):
             if type(target).__name__ != "FBCUDA":
                 self._test_bias_rcr_add_add_relu(21, None, None, 0, 512)
 
-    def _test_bias_rcr_mul(self, m, m0, m1, k, n):
+    def _test_bias_rcr_mul(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -241,7 +243,7 @@ def _test_bias_rcr_mul(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_mul_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_mul_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = (
@@ -249,7 +251,7 @@ def _test_bias_rcr_mul(self, m, m0, m1, k, n):
             * self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_mul(self):
         self._test_bias_rcr_mul(8, None, None, 8, 8)
@@ -257,9 +259,9 @@ def test_bias_rcr_mul(self):
             self._test_bias_rcr_mul(None, 2, 32, 256, 128)
             self._test_bias_rcr_mul(None, 21, 5, 1024, 512)
 
-    def _test_bias_rcr_add_add(self, m, m0, m1, k, n):
+    def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_add()
         Y = OP(self.X, self.W, self.B, self.D0, self.D1)
         Y._attrs["name"] = "output_0"
@@ -268,7 +270,7 @@ def _test_bias_rcr_add_add(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_add_add_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_add_add_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = (
@@ -277,7 +279,7 @@ def _test_bias_rcr_add_add(self, m, m0, m1, k, n):
             + self.D1_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, has_d1=True)
+        self._test_and_verify(module, Y_np, dtype, has_d1=True)
 
     def test_bias_rcr_add_add(self):
         self._test_bias_rcr_add_add(8, None, None, 8, 8)
@@ -286,9 +288,9 @@ def test_bias_rcr_add_add(self):
             self._test_bias_rcr_add_add(None, 21, 5, 1024, 512)
             self._test_bias_rcr_add_add(None, 0, 5, 1024, 512)
 
-    def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n):
+    def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
-        self._init_tensors(m, k, n, m0, m1)
+        self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul_tanh()
         Y = OP(self.X, self.W, self.B, self.D0)
         Y._attrs["name"] = "output_0"
@@ -297,7 +299,7 @@ def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n):
             Y,
             target,
             "./tmp",
-            "gemm_rcr_bias_mul_tanh_k_{}_n_{}".format(k, n),
+            f"gemm_rcr_bias_mul_tanh_k_{k}_n_{n}_{dtype}",
         )
 
         Y_pt = torch.tanh(
@@ -305,7 +307,7 @@ def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n):
             * self.D0_pt
         )
         Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np)
+        self._test_and_verify(module, Y_np, dtype)
 
     def test_bias_rcr_mul_tanh(self):
         self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
@@ -313,6 +315,23 @@ def test_bias_rcr_mul_tanh(self):
             self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128)
             self._test_bias_rcr_mul_tanh(None, 21, 5, 1024, 512)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bias_broadcast_float(self):
+        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="float")
+        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_hardswish.py b/tests/unittest/ops/test_gemm_bias_hardswish.py
index c0e55201e..cef3443f8 100644
--- a/tests/unittest/ops/test_gemm_bias_hardswish.py
+++ b/tests/unittest/ops/test_gemm_bias_hardswish.py
@@ -18,6 +18,32 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from parameterized import parameterized
+
+
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "bfloat16":
+        return {"atol": 3e-1, "rtol": 3e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
 
 
 def hard_swish(x):
@@ -26,29 +52,41 @@ def hard_swish(x):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMBiasHardSwishTestCase(unittest.TestCase):
-    def test_rcr(self):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasHardSwishTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, dtype="float16"):
         M = 128
         K = 1024
         N = 64
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_hardswish()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_hardswish")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        test_name = f"gemm_rcr_bias_hardswish_{dtype}_{self._test_id}"
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_pt = hard_swish(Y_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        self.assertTrue(torch.allclose(Y_pt, y, **_tolerance_limits(dtype)))
+
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_rcr(self, dtype):
+        skipped_reason = _skip_target(detect_target(), dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_rcr(dtype)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias_permute.py b/tests/unittest/ops/test_gemm_bias_permute.py
index a98718d38..48637a18e 100644
--- a/tests/unittest/ops/test_gemm_bias_permute.py
+++ b/tests/unittest/ops/test_gemm_bias_permute.py
@@ -19,11 +19,17 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "cuda", "Not supported by CUDA.")
 class GEMMBiasPermuteTestCase(unittest.TestCase):
-    def _test_gemm_rcr_bias_permute_m2n3(self, copy_op=False):
+    def _test_gemm_rcr_bias_permute_m2n3(
+        self,
+        copy_op=False,
+        test_name="gemm_rcr_bias_permute_m2n3",
+        dtype="float16",
+    ):
         M0 = 4
         M1 = 256
         N0 = 4
@@ -34,35 +40,62 @@ def _test_gemm_rcr_bias_permute_m2n3(self, copy_op=False):
         K = 256
         shape = (M1, N0, N1)
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         OP = ops.gemm_rcr_bias_permute(shape, layout="m2n3")
         if copy_op:
             OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_permute_m2n3")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor([N], dtype=dtype)
 
         Y_l = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_r = Y_l.reshape(M0, M1, N0, N1, N2)
         Y_pt = torch.permute(Y_r, [2, 0, 3, 1, 4])
 
         inputs = [X_pt, W_pt, B_pt]
-        y = torch.empty(Y_pt.shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_permute_m2n3(self):
-        self._test_gemm_rcr_bias_permute_m2n3()
-        self._test_gemm_rcr_bias_permute_m2n3(copy_op=True)
+    def test_gemm_rcr_bias_permute_m2n3_fp16(self):
+        self._test_gemm_rcr_bias_permute_m2n3(
+            test_name="gemm_rcr_bias_permute_m2n3_fp16",
+            dtype="float16",
+        )
+        self._test_gemm_rcr_bias_permute_m2n3(
+            copy_op=True,
+            test_name="gemm_rcr_bias_permute_m2n3_fp16_copy_op",
+            dtype="float16",
+        )
 
-    def _test_gemm_rcr_bias_permute_m3n2(self, copy_op=False):
+    def _test_gemm_rcr_bias_permute_m3n2(
+        self,
+        copy_op=False,
+        test_name="gemm_rcr_bias_permute_m3n2",
+        dtype="float16",
+    ):
         M0 = 4
         M1 = 16
         M2 = 32
@@ -73,9 +106,24 @@ def _test_gemm_rcr_bias_permute_m3n2(self, copy_op=False):
         K = 256
         shape = (M1, M2, N0)
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         OP = ops.gemm_rcr_bias_permute(shape, layout="m3n2")
         if copy_op:
             OP = ops.gemm_rcr_bias_permute(**OP._get_op_attributes())
@@ -83,24 +131,36 @@ def _test_gemm_rcr_bias_permute_m3n2(self, copy_op=False):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_permute_m3n2")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor([N], dtype=dtype)
         Y_l = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_r = Y_l.reshape(M0, M1, M2, N0, N1)
         Y_pt = torch.permute(Y_r, [2, 0, 3, 1, 4])
 
         inputs = [X_pt, W_pt, B_pt]
-        y = torch.empty(Y_pt.shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_permute_m3n2(self):
-        self._test_gemm_rcr_bias_permute_m3n2()
-        self._test_gemm_rcr_bias_permute_m3n2(copy_op=True)
+    def test_gemm_rcr_bias_permute_m3n2_fp16(self):
+        self._test_gemm_rcr_bias_permute_m3n2(
+            test_name="gemm_rcr_bias_permute_m3n2_fp16",
+            dtype="float16",
+        )
+        self._test_gemm_rcr_bias_permute_m3n2(
+            copy_op=True,
+            test_name="gemm_rcr_bias_permute_m3n2_fp16_copy_op",
+            dtype="float16",
+        )
 
-    def _test_gemm_rcr_permute_m2n3(self, copy_op=False):
+    def _test_gemm_rcr_permute_m2n3(
+        self,
+        copy_op=False,
+        test_name="gemm_rcr_permute_m2n3",
+        dtype="float16",
+    ):
         M0 = 4
         M1 = 256
         N0 = 4
@@ -111,8 +171,18 @@ def _test_gemm_rcr_permute_m2n3(self, copy_op=False):
         K = 256
         shape = (M1, N0, N1)
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
         OP = ops.gemm_rcr_permute(shape, layout="m2n3")
         if copy_op:
             OP = ops.gemm_rcr_permute(**OP._get_op_attributes())
@@ -120,22 +190,29 @@ def _test_gemm_rcr_permute_m2n3(self, copy_op=False):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "gemm_rcr_permute_m2n3")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
 
         Y_l = torch.nn.functional.linear(X_pt, W_pt)
         Y_r = Y_l.reshape(M0, M1, N0, N1, N2)
         Y_pt = torch.permute(Y_r, [2, 0, 3, 1, 4])
 
         inputs = [X_pt, W_pt]
-        y = torch.empty(Y_pt.shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_permute_m2n3(self):
-        self._test_gemm_rcr_permute_m2n3()
-        self._test_gemm_rcr_permute_m2n3(copy_op=True)
+    def test_gemm_rcr_permute_m2n3_fp16(self):
+        self._test_gemm_rcr_permute_m2n3(
+            test_name="test_gemm_rcr_permute_m2n3_fp16",
+            dtype="float16",
+        )
+        self._test_gemm_rcr_permute_m2n3(
+            copy_op=True,
+            test_name="test_gemm_rcr_permute_m2n3_fp16_copy_op",
+            dtype="float16",
+        )
 
     # ========== enable them after fix profiler =========
     # def test_gemm_rcr_bias_relu(self):
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index 8def037ea..9f43fed0d 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -18,59 +18,110 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from parameterized import parameterized
+
+
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "bfloat16":
+        return {"atol": 2e-1, "rtol": 2e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
 
 
 class GEMMBiasReluTestCase(unittest.TestCase):
-    def test_gemm_rcr_bias_relu(self):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasReluTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         M = 128
         K = 1024
         N = 64
-        target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        tolerance_limits = _tolerance_limits(dtype)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_relu()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_relu")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        test_name = f"gemm_rcr_bias_relu_{dtype}_{self._test_id}"
+        self._test_id += 1
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_pt = torch.relu(Y_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    def test_gemm_rcr_bias_add_relu(self):
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_gemm_rcr_bias_relu(self, ait_dtype):
+        target = detect_target()
+        skipped_reason = _skip_target(target, ait_dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_gemm_rcr_bias_relu(ait_dtype, target)
+
+    def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         M = 128
         K = 1024
         N = 64
-        target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
-        D = Tensor(shape=[M, N], dtype="float16", name="input_3", is_input=True)
+        tolerance_limits = _tolerance_limits(dtype)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
+        D = Tensor(shape=[M, N], dtype=dtype, name="input_3", is_input=True)
         OP = ops.gemm_rcr_bias_add_relu()
         Y = OP(X, W, B, D)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_add_relu")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
-        D_pt = torch.randn(M, N).cuda().half()
+        test_name = f"gemm_rcr_bias_add_relu_{dtype}_{self._test_id}"
+        self._test_id += 1
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
+        D_pt = get_random_torch_tensor([M, N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt) + D_pt
         Y_pt = torch.relu(Y_pt)
 
         inputs = [X_pt, W_pt, B_pt, D_pt]
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, **tolerance_limits)
+
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_gemm_rcr_bias_add_relu(self, ait_dtype):
+        target = detect_target()
+        skipped_reason = _skip_target(target, ait_dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_gemm_rcr_bias_add_relu(ait_dtype, target)
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_sigmoid.py b/tests/unittest/ops/test_gemm_bias_sigmoid.py
index 48f57b030..2985f2da3 100644
--- a/tests/unittest/ops/test_gemm_bias_sigmoid.py
+++ b/tests/unittest/ops/test_gemm_bias_sigmoid.py
@@ -18,33 +18,73 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from parameterized import parameterized
+
+
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "bfloat16":
+        return {"atol": 3e-1, "rtol": 3e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
 
 
 class GEMMBiasSigmoidTestCase(unittest.TestCase):
-    def test_rcr(self):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasSigmoidTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, dtype="float16"):
         M = 128
         K = 1024
         N = 64
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_sigmoid()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_sigmoid")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        test_name = f"gemm_rcr_bias_sigmoid_{dtype}_{self._test_id}"
+        self._test_id += 1
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_pt = torch.sigmoid(Y_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, **_tolerance_limits(dtype))
+
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_rcr(self, dtype):
+        skipped_reason = _skip_target(detect_target(), dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_rcr(dtype)
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_softmax.py b/tests/unittest/ops/test_gemm_bias_softmax.py
index 62fd90727..5bee4c91a 100644
--- a/tests/unittest/ops/test_gemm_bias_softmax.py
+++ b/tests/unittest/ops/test_gemm_bias_softmax.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
 import unittest
 
@@ -20,42 +21,49 @@
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 @unittest.skip("GEMM + Softmax is disabled for now")
 class GEMMTestCase(unittest.TestCase):
     def _test_gemm_rcr_bias_softmax(
-        self, M=16, K=64, N=24, rebuild=True, test_name="gemm_bias_softmax"
+        self, M=16, K=64, N=24, rebuild=True, dtype="float16"
     ):
         target = detect_target()
         if type(target).__name__ == "FBCUDA":
-            logger.warning(__file__, "Skip this test for special profiling requirement")
+            _LOGGER.warning("Skip this test for special profiling requirement")
             return
 
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_softmax()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_pt = torch.softmax(Y_pt, dim=1)
         Y_np = Y_pt.cpu().numpy()
 
+        test_name = f"gemm_bias_softmax_{dtype}"
         if rebuild:
             target = detect_target()
             module = compile_model(Y, target, "./tmp", test_name)
         else:
             module = Model(os.path.join("./tmp", test_name, "test.so"))
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -69,6 +77,13 @@ def _test_gemm_rcr_bias_softmax(
     def test_gemm_bias_softmax(self):
         self._test_gemm_rcr_bias_softmax(N=81)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_bias_softmax_float(self):
+        self._test_gemm_rcr_bias_softmax(N=81, dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_swish.py b/tests/unittest/ops/test_gemm_bias_swish.py
index bbffb1e3a..0ce13bf91 100644
--- a/tests/unittest/ops/test_gemm_bias_swish.py
+++ b/tests/unittest/ops/test_gemm_bias_swish.py
@@ -18,37 +18,76 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from parameterized import parameterized
 
 
 def swish(x):
     return x * torch.sigmoid(x)
 
 
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "bfloat16":
+        return {"atol": 3e-1, "rtol": 3e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
+
+
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMBiasSwishTestCase(unittest.TestCase):
-    def test_rcr(self):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasSwishTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, dtype="float16"):
         M = 128
         K = 1024
         N = 64
         target = detect_target()
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_swish()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rcr_bias_swish")
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.randn(N).cuda().half()
+        test_name = f"gemm_rcr_bias_swish_{dtype}_{self._test_id}"
+        self._test_id += 1
+        module = compile_model(Y, target, "./tmp", test_name)
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
         Y_pt = swish(Y_pt)
 
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        self.assertTrue(torch.allclose(Y_pt, y, **_tolerance_limits(dtype)))
+
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_rcr(self, dtype):
+        skipped_reason = _skip_target(detect_target(), dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
+        self._test_rcr(dtype)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias_tanh.py b/tests/unittest/ops/test_gemm_bias_tanh.py
index 27ef27c0b..089fbf10c 100644
--- a/tests/unittest/ops/test_gemm_bias_tanh.py
+++ b/tests/unittest/ops/test_gemm_bias_tanh.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
 import unittest
 
 import torch
@@ -21,46 +20,79 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
+
+
+def _tolerance_limits(dtype):
+    if dtype == "float16":
+        return {"atol": 1e-1, "rtol": 1e-1}
+    elif dtype == "float32":
+        return {"atol": 3e-2, "rtol": 2e-2}
+    elif dtype == "bfloat16":
+        return {"atol": 2e-1, "rtol": 2e-1}
+    else:
+        return {}
+
+
+def _skip_target(target, ait_dtype):
+    if ait_dtype == "float16":
+        return None
+    if target.name() != "cuda":
+        return "Not supported for non-CUDA target"
+    if int(target._arch) < 80:
+        return "Not supported for CUDA SM<80."
+    return None
 
 
 class GEMMBiasTanhTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, test_name):
+    def __init__(self, *args, **kwargs):
+        super(GEMMBiasTanhTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_rcr(self, Ms, test_name, dtype="float16"):
         K = 1024
         N = 64
         target = detect_target()
+        tolerance_limits = _tolerance_limits(dtype)
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(
-            shape=[MDim, IntImm(K)], dtype="float16", name="input_0", is_input=True
-        )
+        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype="float16", name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
         )
-        B = Tensor(shape=[IntImm(N)], dtype="float16", name="input_2", is_input=True)
+        B = Tensor(shape=[IntImm(N)], dtype=dtype, name="input_2", is_input=True)
         OP = ops.gemm_rcr_bias_tanh()
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"gemm_rcr_bias_tanh_{test_name}")
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rcr_bias_tanh_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
 
         for M in Ms:
-            logging.info(f"Testing {M=}")
-
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.tanh(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt))
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
             )
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    def test_rcr(self):
+    @parameterized.expand(("float16", "float32", "bfloat16"))
+    def test_rcr_bias_tanh_floats(self, dtype):
+        skipped_reason = _skip_target(detect_target(), dtype)
+        if skipped_reason is not None:
+            self.skipTest(skipped_reason)
         self._test_rcr([128], "static")
-        if detect_target().name() == "cuda":
-            self._test_rcr([1, 7, 64, 127], "dynamic_m")
+        self._test_rcr([1, 7, 64, 127], f"dynamic_m_{dtype}", dtype=dtype)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_permute.py b/tests/unittest/ops/test_gemm_permute.py
index e961c359f..3f1ccc1d0 100644
--- a/tests/unittest/ops/test_gemm_permute.py
+++ b/tests/unittest/ops/test_gemm_permute.py
@@ -19,21 +19,27 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMTestCase(unittest.TestCase):
-    def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False, copy_op=False):
+    def _test_rcr(
+        self, ms, k, n, shape, test_name, has_bias=False, copy_op=False, dtype="float16"
+    ):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
+        W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[n], dtype=dtype, name="input_2", is_input=True)
         if has_bias:
             OP = ops.gemm_rcr_bias_permute(shape)
             if copy_op:
@@ -49,9 +55,9 @@ def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False, copy_op=False):
         module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
 
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(n, k).cuda().half()
-            B_pt = torch.randn(n).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([n, k], dtype)
+            B_pt = get_random_torch_tensor([n], dtype)
             if has_bias:
                 Y_l = torch.nn.functional.linear(X_pt, W_pt, B_pt)
             else:
@@ -62,7 +68,7 @@ def _test_rcr(self, ms, k, n, shape, test_name, has_bias=False, copy_op=False):
             inputs = {"input_0": X_pt, "input_1": W_pt}
             if has_bias:
                 inputs["input_2"] = B_pt
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -89,17 +95,26 @@ def test_rcr(self):
                 )
 
     def _test_rcr_0213(
-        self, ms, k, n, shape, test_name, has_bias=False, copy_op=False, layout="0213"
+        self,
+        ms,
+        k,
+        n,
+        shape,
+        test_name,
+        has_bias=False,
+        copy_op=False,
+        layout="0213",
+        dtype="float16",
     ):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[n, k], dtype="float16", name="input_1", is_input=True)
-        B = Tensor(shape=[n], dtype="float16", name="input_2", is_input=True)
+        W = Tensor(shape=[n, k], dtype=dtype, name="input_1", is_input=True)
+        B = Tensor(shape=[n], dtype=dtype, name="input_2", is_input=True)
         if has_bias:
             OP = ops.gemm_rcr_bias_permute(shape, layout)
             if copy_op:
@@ -115,9 +130,9 @@ def _test_rcr_0213(
         module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
 
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(n, k).cuda().half()
-            B_pt = torch.randn(n).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([n, k], dtype)
+            B_pt = get_random_torch_tensor([n], dtype)
 
             def torch_f(x, w, b, has_bias, shape):
                 if has_bias:
@@ -135,7 +150,7 @@ def torch_f(x, w, b, has_bias, shape):
             inputs = {"input_0": X_pt, "input_1": W_pt}
             if has_bias:
                 inputs["input_2"] = B_pt
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -169,15 +184,15 @@ def test_rcr_0213(self):
             layout="0213",
         )
 
-    def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False):
+    def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False, dtype="float16"):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
-        W = Tensor(shape=[k, n], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(shape=[k, n], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rrr_permute(shape)
         if copy_op:
             OP = ops.gemm_rrr_permute(**OP._get_op_attributes())
@@ -187,13 +202,13 @@ def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False):
         module = compile_model(Y, target, "./tmp", "gemm_rrr_{}".format(test_name))
 
         for m in ms:
-            X_pt = torch.randn(m, k).cuda().half()
-            W_pt = torch.randn(k, n).cuda().half()
+            X_pt = get_random_torch_tensor([m, k], dtype)
+            W_pt = get_random_torch_tensor([k, n], dtype)
             Y_l = torch.matmul(X_pt, W_pt)
             Y_r = Y_l.reshape(16, *shape, 16)
             Y_pt = torch.permute(Y_r, [2, 0, 3, 1, 4])
             inputs = {"input_0": X_pt, "input_1": W_pt}
-            y = torch.empty(Y_pt.shape).cuda().half()
+            y = get_torch_empty_tensor(Y_pt.shape, dtype)
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
@@ -202,6 +217,45 @@ def test_rrr(self):
         self._test_rrr([128], 64, 256, (8, 4, 4), "permute2")
         self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_copy_op", copy_op=True)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_permute_float(self):
+        for has_bias in (True, False):
+            for copy_op in (True, False):
+                self._test_rcr(
+                    [80],
+                    32,
+                    96,
+                    (5, 3, 2),
+                    "permute1_float",
+                    has_bias=has_bias,
+                    copy_op=copy_op,
+                    dtype="float",
+                )
+        self._test_rcr_0213(
+            [29, 29 * 8],
+            256,
+            300000,
+            [29, 100000],
+            "permute_0213_2_float",
+            has_bias=False,
+            copy_op=False,
+            layout="0213",
+            dtype="float",
+        )
+        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_float", dtype="float")
+        self._test_rrr(
+            [128],
+            64,
+            256,
+            (8, 4, 4),
+            "permute2_copy_op_float",
+            copy_op=True,
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_profiler_cache.py b/tests/unittest/ops/test_gemm_profiler_cache.py
new file mode 100644
index 000000000..0f20cc7c2
--- /dev/null
+++ b/tests/unittest/ops/test_gemm_profiler_cache.py
@@ -0,0 +1,167 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+import unittest
+from unittest.mock import patch
+
+from aitemplate.backend.profiler_cache import ProfileCacheDB
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+
+
+class GemmProfilerCacheTestCase(unittest.TestCase):
+    def _test(
+        self,
+        first_dim,
+        logger,
+        test_name="gemm_rcr",
+        k=128,
+        n=8,
+    ):
+        target = detect_target()
+
+        X = Tensor(
+            shape=[first_dim, k],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[n, k],
+            dtype="float16",
+            name="input_1",
+            is_input=True,
+        )
+        OP = ops.gemm_rcr()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        with self.assertLogs(
+            logger=logger,
+            level="INFO",
+        ) as logs:
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name,
+            )
+
+        return "\n".join(logs.output)
+
+    def _run_test(
+        self,
+        first_dim,
+        test_name,
+        logger,
+    ):
+        old_trick = os.environ.get("TRICK_CI_ENV", None)
+        old_cache = os.environ.get("CACHE_DIR", None)
+        try:
+            os.environ["TRICK_CI_ENV"] = "1"
+            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            return self._test(
+                first_dim=first_dim,
+                logger=logger,
+                test_name=test_name,
+            )
+        finally:
+            if old_trick is not None:
+                os.environ["TRICK_CI_ENV"] = old_trick
+            else:
+                os.environ.pop("TRICK_CI_ENV")
+            if old_cache is not None:
+                os.environ["CACHE_DIR"] = old_cache
+            else:
+                os.environ.pop("CACHE_DIR")
+
+    def test_gemm_profiler_cache(self):
+        first_dim = IntImm(4)
+        test_name = "gemm_rcr_profiler_cache"
+        logger = "aitemplate.compiler.transform.profile"
+
+        run1_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 1 profilers", run1_logs)
+
+        run2_logs = self._run_test(
+            first_dim=first_dim,
+            test_name=test_name,
+            logger=logger,
+        )
+        self.assertIn("generated 0 profilers", run2_logs)
+
+    def test_gemm_profiler_cache_versioning(self):
+        first_dim = IntImm(4)
+        test_name = "gemm_rcr_profiler_cache_versioning"
+        logger = "aitemplate.backend.profiler_cache"
+        cache_version_property = "gemm_cache_version"
+        target_name = detect_target().name()
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=1,  # version
+        ):
+            run1_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_gemm_1' does not exist in the db",
+                run1_before_version_change_logs,
+            )
+
+            run2_before_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_gemm_1' exists in the db",
+                run2_before_version_change_logs,
+            )
+
+        with patch.object(
+            target=ProfileCacheDB,
+            attribute=cache_version_property,
+            new=2,  # version
+        ):
+            run1_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_gemm_2' does not exist in the db",
+                run1_after_version_change_logs,
+            )
+
+            run2_after_version_change_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+            )
+            self.assertIn(
+                f"table_name='{target_name}_gemm_2' exists in the db",
+                run2_after_version_change_logs,
+            )
diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index ce9eb31ba..c3bbf2e6b 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -21,22 +21,24 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
 class GEMMRcrBiasFastGeluTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
+    def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
         K = 1024
         N = 64
         target = detect_target()
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(
-            shape=[MDim, IntImm(K)], dtype="float16", name="input_0", is_input=True
-        )
+        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype="float16", name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
         )
-        B = Tensor(shape=[IntImm(N)], dtype="float16", name="input_2", is_input=True)
+        B = Tensor(shape=[IntImm(N)], dtype=dtype, name="input_2", is_input=True)
         OP = (
             ops.gemm_rcr_bias_fast_gelu() if use_fast_gelu else ops.gemm_rcr_bias_gelu()
         )
@@ -56,11 +58,11 @@ def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
         for M in Ms:
             logging.info(f"Testing {M=}")
 
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
-            B_pt = torch.randn(N).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
             Y_pt = torch.nn.GELU()(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt))
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
@@ -74,6 +76,19 @@ def test_rcr(self):
             self._test_rcr([128], "static", use_fast_gelu=False)
             self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rcr_float(self):
+        self._test_rcr(
+            [1, 7, 64, 127], "fast_dynamic_m_float", use_fast_gelu=True, dtype="float"
+        )
+        self._test_rcr(
+            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=False, dtype="float"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
index a95397f60..f56820ab5 100644
--- a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -22,6 +22,10 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
@@ -46,16 +50,14 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class GEMMRcrFastGeluTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
+    def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
         K = 1024
         N = 64
         target = detect_target()
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(
-            shape=[MDim, IntImm(K)], dtype="float16", name="input_0", is_input=True
-        )
+        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype="float16", name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
         )
 
         OP = ops.gemm_rcr_fast_gelu()
@@ -69,10 +71,10 @@ def _test_rcr(self, Ms, test_name, use_fast_gelu=True):
         for M in Ms:
             logging.info(f"Testing {M=}")
 
-            X_pt = torch.randn(M, K).cuda().half()
-            W_pt = torch.randn(N, K).cuda().half()
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
             Y_pt = NewGELUActivation()(torch.nn.functional.linear(X_pt, W_pt))
-            y = torch.empty([M, N]).cuda().half()
+            y = get_torch_empty_tensor([M, N], dtype)
             module.run_with_tensors(
                 {"input_0": X_pt, "input_1": W_pt},
                 [y],
@@ -86,6 +88,17 @@ def test_rcr(self):
             self._test_rcr([128], "static", use_fast_gelu=False)
             self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_rcr_float(self):
+        self._test_rcr([128], "static_float", use_fast_gelu=True, dtype="float")
+        self._test_rcr(
+            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="float"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index b8279891f..2b3a5df99 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -20,30 +20,48 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMRrrSmallNKTestCase(unittest.TestCase):
-    def _test_rrr(self, M, N, K, use_fp16_acc=True):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_rrr(
+        self, M, N, K, use_fp16_acc=True, dtype="float16", atol=1e-1, rtol=1e-1
+    ):
         target = detect_target(use_fp16_acc=use_fp16_acc)
-        X = Tensor(shape=[*M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[K, N], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(
+            shape=[shape_utils.gen_int_var_min_max(M, name="batch_dim"), K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(shape=[K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rrr_small_nk()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rrr_small_nk")
-        X_pt = torch.randn(*M, K).cuda().half()
-        W_pt = torch.randn(K, N).cuda().half()
-        Y_pt = torch.matmul(X_pt, W_pt)
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rrr_small_nk_{self.test_count}"
+        )
+
+        for m in M:
+            X_pt = get_random_torch_tensor([m, K], dtype)
+            W_pt = get_random_torch_tensor([K, N], dtype)
+            Y_pt = torch.matmul(X_pt, W_pt)
 
-        inputs = {"input_0": X_pt, "input_1": W_pt}
-        y = torch.empty([*M, N]).cuda().half()
-        module.run_with_tensors(inputs, [y])
-        if X_pt.nelement() == 0 or W_pt.nelement() == 0:
-            pass
-        else:
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            inputs = {"input_0": X_pt, "input_1": W_pt}
+            y = torch.empty_like(Y_pt)
+            module.run_with_tensors(inputs, [y])
+            if X_pt.nelement() == 0 or W_pt.nelement() == 0:
+                pass
+            else:
+                torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
+        self.test_count += 1
 
         # from aitemplate.testing.benchmark_pt import benchmark_torch_function
         # t = benchmark_torch_function(100, torch.matmul, X_pt, W_pt)
@@ -63,6 +81,10 @@ def test_rrr(self):
         # self._test_rrr([1000000], 8, 16)
         # self._test_rrr([1000000], 6, 3, False)
 
+    def test_float32(self):
+        self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
+        self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_softmax.py b/tests/unittest/ops/test_gemm_softmax.py
index 5bf34a3f5..f189a39e6 100644
--- a/tests/unittest/ops/test_gemm_softmax.py
+++ b/tests/unittest/ops/test_gemm_softmax.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
 import unittest
 
@@ -20,40 +21,45 @@
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 @unittest.skip("GEMM + Softmax is disabled for now")
 class GEMMSoftmaxTestCase(unittest.TestCase):
-    def _test_gemm_rcr_softmax(
-        self, M=16, K=64, N=24, rebuild=True, test_name="gemm_softmax"
-    ):
+    def _test_gemm_rcr_softmax(self, M=16, K=64, N=24, rebuild=True, dtype="float16"):
         target = detect_target()
         if type(target).__name__ == "FBCUDA":
-            logger.warning(__file__, "Skip this test for special profiling requirement")
+            _LOGGER.warning("Skip this test for special profiling requirement")
             return
 
-        X = Tensor(shape=[M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[N, K], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr_softmax()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        X_pt = torch.randn(M, K).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
         Y_pt = torch.nn.functional.linear(X_pt, W_pt)
         Y_pt = torch.softmax(Y_pt, dim=1)
         Y_np = Y_pt.cpu().numpy()
 
+        test_name = f"gemm_softmax_{dtype}"
         if rebuild:
             target = detect_target()
             module = compile_model(Y, target, "./tmp", test_name)
         else:
             module = Model(os.path.join("./tmp", test_name, "test.so"))
         inputs = {"input_0": X_pt, "input_1": W_pt}
-        y = torch.empty([M, N]).cuda().half()
+        y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
         y_ait_np = y.cpu().numpy()
         np.testing.assert_allclose(Y_np, y_ait_np, atol=1e-1, rtol=1e-1)
@@ -67,6 +73,9 @@ def _test_gemm_rcr_softmax(
     def test_gemm_softmax(self):
         self._test_gemm_rcr_softmax()
 
+    def test_gemm_softmax_float(self):
+        self._test_gemm_rcr_softmax(dtype="float")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index 7b07fcbc3..fbd22f5de 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 import torch
@@ -19,19 +20,23 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from parameterized import param, parameterized
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GroupGEMMRcrTestCase(unittest.TestCase):
     @parameterized.expand(
         [
-            param(False, "group_gemm_rcr_run_once"),
-            param(True, "group_gemm_rcr_run_twice"),
+            param(False, "group_gemm_rcr_run_once", "float16"),
+            param(True, "group_gemm_rcr_run_twice", "float16"),
+            param(False, "group_gemm_rcr_run_once_fp32", "float32"),
         ]
     )
-    def test_rcr(self, run_twice: bool, test_name: str):
+    def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
         M = 256
         K1 = 128
         N1 = 60
@@ -39,12 +44,12 @@ def test_rcr(self, run_twice: bool, test_name: str):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
         OP = ops.group_gemm_rcr()
         Y1, Y2 = OP(operand_groups=[[X1, W1], [X2, W2]])
         Y1._attrs["name"] = "y1"
@@ -61,10 +66,10 @@ def test_rcr(self, run_twice: bool, test_name: str):
             graph_outputs.append(Y3)
 
         module = compile_model(graph_outputs, target, "./tmp", test_name)
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
+        X1_pt = get_random_torch_tensor(shape=(M, K1), dtype=dtype)
+        X2_pt = get_random_torch_tensor(shape=(M, K2), dtype=dtype)
+        W1_pt = get_random_torch_tensor(shape=(N1, K1), dtype=dtype)
+        W2_pt = get_random_torch_tensor(shape=(N2, K2), dtype=dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
 
@@ -74,11 +79,11 @@ def test_rcr(self, run_twice: bool, test_name: str):
             "x2": X2_pt,
             "w2": W2_pt,
         }
-        y1 = torch.empty([M, N1]).cuda().half()
-        y2 = torch.empty([M, N2]).cuda().half()
+        y1 = torch.empty_like(Y1_pt)
+        y2 = torch.empty_like(Y2_pt)
         outputs = {"y1": y1, "y2": y2}
         if run_twice:
-            outputs["y3"] = torch.empty([M, N1]).cuda().half()
+            outputs["y3"] = torch.empty_like(y1)
 
         module.run_with_tensors(inputs, outputs)
         self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index 159f3b0ff..91a601744 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -20,13 +20,23 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
-logger = logging.getLogger(__name__)
+from parameterized import param, parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GroupGEMMRcrBiasTestCase(unittest.TestCase):
-    def test_rcr(self):
+    @parameterized.expand(
+        [
+            param("group_gemm_rcr_bias_fp16", "float16"),
+            param("group_gemm_rcr_bias_fp32", "float32"),
+        ]
+    )
+    def test_rcr(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -34,27 +44,27 @@ def test_rcr(self):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning("Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
-        B1 = Tensor(shape=[N1], dtype="float16", name="b1", is_input=True)
-        B2 = Tensor(shape=[N2], dtype="float16", name="b2", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
+        B1 = Tensor(shape=[N1], dtype=dtype, name="b1", is_input=True)
+        B2 = Tensor(shape=[N2], dtype=dtype, name="b2", is_input=True)
         OP = ops.group_gemm_rcr_bias()
         Y1, Y2 = OP(operand_groups=[[X1, W1, B1], [X2, W2, B2]])
         Y1._attrs["name"] = "y1"
         Y1._attrs["is_output"] = True
         Y2._attrs["name"] = "y2"
         Y2._attrs["is_output"] = True
-        module = compile_model([Y1, Y2], target, "./tmp", "group_gemm_rcr_bias")
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        B1_pt = torch.randn(N1).cuda().half()
-        B2_pt = torch.randn(N2).cuda().half()
+        module = compile_model([Y1, Y2], target, "./tmp", test_name)
+        X1_pt = get_random_torch_tensor(shape=(M, K1), dtype=dtype)
+        X2_pt = get_random_torch_tensor(shape=(M, K2), dtype=dtype)
+        W1_pt = get_random_torch_tensor(shape=(N1, K1), dtype=dtype)
+        W2_pt = get_random_torch_tensor(shape=(N2, K2), dtype=dtype)
+        B1_pt = get_random_torch_tensor(shape=(N1,), dtype=dtype)
+        B2_pt = get_random_torch_tensor(shape=(N2,), dtype=dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
 
@@ -66,11 +76,11 @@ def test_rcr(self):
             "w2": W2_pt,
             "b2": B2_pt,
         }
-        y1 = torch.empty([M, N1]).cuda().half()
-        y2 = torch.empty([M, N2]).cuda().half()
+        y1 = torch.empty_like(Y1_pt)
+        y2 = torch.empty_like(Y2_pt)
         module.run_with_tensors(inputs, {"y1": y1, "y2": y2})
-        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
-        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y1_pt, y1, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index a533410a6..ae27c3d78 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 import torch
@@ -19,12 +20,24 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
-    def test_rcr_relu(self):
+    @parameterized.expand(
+        [
+            param("group_gemm_rcr_bias_relu_fp16", "float16", "relu"),
+            param("group_gemm_rcr_bias_relu_fp32", "float32", "relu"),
+            param("group_gemm_rcr_bias_sigmoid_fp16", "float16", "sigmoid"),
+            param("group_gemm_rcr_bias_sigmoid_fp32", "float32", "sigmoid"),
+        ]
+    )
+    def test_rcr_activation(self, test_name, dtype, activation):
         M = 256
         K1 = 128
         N1 = 60
@@ -32,31 +45,36 @@ def test_rcr_relu(self):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
-        B1 = Tensor(shape=[N1], dtype="float16", name="b1", is_input=True)
-        B2 = Tensor(shape=[N2], dtype="float16", name="b2", is_input=True)
-        OP = ops.group_gemm_rcr_bias_relu()
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
+        B1 = Tensor(shape=[N1], dtype=dtype, name="b1", is_input=True)
+        B2 = Tensor(shape=[N2], dtype=dtype, name="b2", is_input=True)
+        OP = (
+            ops.group_gemm_rcr_bias_relu()
+            if activation == "relu"
+            else ops.group_gemm_rcr_bias_sigmoid()
+        )
+        act_pt = torch.relu if activation == "relu" else torch.sigmoid
         Y1, Y2 = OP(operand_groups=[[X1, W1, B1], [X2, W2, B2]])
         Y1._attrs["name"] = "y1"
         Y1._attrs["is_output"] = True
         Y2._attrs["name"] = "y2"
         Y2._attrs["is_output"] = True
-        module = compile_model([Y1, Y2], target, "./tmp", "group_gemm_rcr_bias_relu")
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        B1_pt = torch.randn(N1).cuda().half()
-        B2_pt = torch.randn(N2).cuda().half()
+        module = compile_model([Y1, Y2], target, "./tmp", test_name)
+        X1_pt = get_random_torch_tensor(shape=(M, K1), dtype=dtype)
+        X2_pt = get_random_torch_tensor(shape=(M, K2), dtype=dtype)
+        W1_pt = get_random_torch_tensor(shape=(N1, K1), dtype=dtype)
+        W2_pt = get_random_torch_tensor(shape=(N2, K2), dtype=dtype)
+        B1_pt = get_random_torch_tensor(shape=(N1,), dtype=dtype)
+        B2_pt = get_random_torch_tensor(shape=(N2,), dtype=dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
-        Y1_pt = torch.relu(Y1_pt)
+        Y1_pt = act_pt(Y1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
-        Y2_pt = torch.relu(Y2_pt)
+        Y2_pt = act_pt(Y2_pt)
 
         inputs = {
             "x1": X1_pt,
@@ -66,11 +84,11 @@ def test_rcr_relu(self):
             "w2": W2_pt,
             "b2": B2_pt,
         }
-        y1 = torch.empty([M, N1]).cuda().half()
-        y2 = torch.empty([M, N2]).cuda().half()
+        y1 = torch.empty_like(Y1_pt)
+        y2 = torch.empty_like(Y2_pt)
         module.run_with_tensors(inputs, {"y1": y1, "y2": y2})
-        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
-        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y1_pt, y1, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index bf8af1bb4..72343721d 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -21,12 +21,22 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GroupGEMMRcrCatTestCase(unittest.TestCase):
-    def test_rcr_bias_cat(self):
+class GroupGEMMRcrBiasCatTestCase(unittest.TestCase):
+    @parameterized.expand(
+        [
+            param("group_gemm_rcr_bias_cat_fp16", "float16"),
+            param("group_gemm_rcr_bias_cat_fp32", "float32"),
+        ]
+    )
+    def test_rcr_bias_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -34,32 +44,32 @@ def test_rcr_bias_cat(self):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
-        B1 = Tensor(shape=[N1], dtype="float16", name="b1", is_input=True)
-        B2 = Tensor(shape=[N2], dtype="float16", name="b2", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
+        B1 = Tensor(shape=[N1], dtype=dtype, name="b1", is_input=True)
+        B2 = Tensor(shape=[N2], dtype=dtype, name="b2", is_input=True)
         OP = ops.group_gemm_rcr_bias()
         Y = OP(operand_groups=[[X1, W1, B1], [X2, W2, B2]], output_stride_dim=1)
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
-        module = compile_model([Y], target, "./tmp", "group_gemm_rcr_bias_cat")
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
-        B1_pt = torch.randn(N1).cuda().half()
-        B2_pt = torch.randn(N2).cuda().half()
+        module = compile_model([Y], target, "./tmp", test_name)
+        X1_pt = get_random_torch_tensor(shape=(M, K1), dtype=dtype)
+        X2_pt = get_random_torch_tensor(shape=(M, K2), dtype=dtype)
+        W1_pt = get_random_torch_tensor(shape=(N1, K1), dtype=dtype)
+        W2_pt = get_random_torch_tensor(shape=(N2, K2), dtype=dtype)
+        B1_pt = get_random_torch_tensor(shape=(N1,), dtype=dtype)
+        B2_pt = get_random_torch_tensor(shape=(N2,), dtype=dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = {
@@ -70,9 +80,9 @@ def test_rcr_bias_cat(self):
             "w2": W2_pt,
             "b2": B2_pt,
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index cb4ff4986..69717a440 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -21,12 +21,22 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GroupGEMMRcrCatTestCase(unittest.TestCase):
-    def test_rcr_cat(self):
+    @parameterized.expand(
+        [
+            param("group_gemm_rcr_cat_fp16", "float16"),
+            param("group_gemm_rcr_cat_fp32", "float32"),
+        ]
+    )
+    def test_rcr_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -34,29 +44,29 @@ def test_rcr_cat(self):
         N2 = 64
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
-        X1 = Tensor(shape=[M, K1], dtype="float16", name="x1", is_input=True)
-        X2 = Tensor(shape=[M, K2], dtype="float16", name="x2", is_input=True)
-        W1 = Tensor(shape=[N1, K1], dtype="float16", name="w1", is_input=True)
-        W2 = Tensor(shape=[N2, K2], dtype="float16", name="w2", is_input=True)
+        X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
+        W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
+        W2 = Tensor(shape=[N2, K2], dtype=dtype, name="w2", is_input=True)
         OP = ops.group_gemm_rcr()
         Y = OP(operand_groups=[[X1, W1], [X2, W2]], output_stride_dim=1)
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
-        module = compile_model([Y], target, "./tmp", "group_gemm_rcr_cat")
+        module = compile_model([Y], target, "./tmp", test_name)
 
-        X1_pt = torch.randn(M, K1).cuda().half()
-        X2_pt = torch.randn(M, K2).cuda().half()
-        W1_pt = torch.randn(N1, K1).cuda().half()
-        W2_pt = torch.randn(N2, K2).cuda().half()
+        X1_pt = get_random_torch_tensor(shape=(M, K1), dtype=dtype)
+        X2_pt = get_random_torch_tensor(shape=(M, K2), dtype=dtype)
+        W1_pt = get_random_torch_tensor(shape=(N1, K1), dtype=dtype)
+        W2_pt = get_random_torch_tensor(shape=(N2, K2), dtype=dtype)
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
         Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
-        logging.info("AITemplate y_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = {
@@ -65,9 +75,9 @@ def test_rcr_cat(self):
             "x2": X2_pt,
             "w2": W2_pt,
         }
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_groupnorm.py b/tests/unittest/ops/test_groupnorm.py
index 484db1d25..881eecc9a 100644
--- a/tests/unittest/ops/test_groupnorm.py
+++ b/tests/unittest/ops/test_groupnorm.py
@@ -15,6 +15,7 @@
 """
 Unittests for group norm Operator.
 """
+import logging
 import unittest
 
 import torch
@@ -22,7 +23,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target()._arch == "75", "Skip GN on sm75.")
@@ -41,27 +45,26 @@ def _test_groupnorm(
         eps=1e-5,
         use_swish=False,
         copy_op=False,
+        dtype="float16",
     ):
         test_name = "group_norm_swish" if use_swish else "group_norm"
-        logger.info(
-            __file__, f"Testing {test_name}: {x_shape}, num_groups: {num_groups}"
-        )
+        _LOGGER.info(f"Testing {test_name}: {x_shape}, num_groups: {num_groups}")
         num_channels = x_shape[-1]
         X1 = Tensor(
             shape=x_shape,
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
         X2 = Tensor(
             shape=[num_channels],
-            dtype="float16",
+            dtype=dtype,
             name="gamma",
             is_input=True,
         )
         X3 = Tensor(
             shape=[num_channels],
-            dtype="float16",
+            dtype=dtype,
             name="beta",
             is_input=True,
         )
@@ -78,10 +81,10 @@ def _test_groupnorm(
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(X4, target, "./tmp", op_name, dll_name=dll_name)
 
-        x1_nhwc_pt = torch.randn(*x_shape).cuda().half()
+        x1_nhwc_pt = get_random_torch_tensor(x_shape, dtype)
         x1_nchw_pt = x1_nhwc_pt.permute(0, 3, 1, 2).contiguous()
-        gamma_pt = torch.randn(num_channels).cuda().half()
-        beta_pt = torch.randn(num_channels).cuda().half()
+        gamma_pt = get_random_torch_tensor((num_channels,), dtype)
+        beta_pt = torch.randn_like(gamma_pt)
 
         x4_pt = torch.nn.functional.group_norm(
             x1_nchw_pt, num_groups, gamma_pt, beta_pt, eps=eps
@@ -92,7 +95,7 @@ def _test_groupnorm(
         inputs = {"X": x1_nhwc_pt}
         inputs["gamma"] = gamma_pt
         inputs["beta"] = beta_pt
-        x4 = torch.empty(x_shape).cuda().half()
+        x4 = torch.empty_like(x1_nhwc_pt)
         module.run_with_tensors(inputs, [x4])
 
         # from aitemplate.testing.benchmark_pt import benchmark_torch_function
@@ -108,10 +111,8 @@ def _test_groupnorm(
         # )
         # print("pt: ", t)
 
-        self.assertTrue(
-            torch.allclose(
-                x4, x4_pt.permute(0, 2, 3, 1).contiguous(), atol=1e-2, rtol=1e-2
-            )
+        torch.testing.assert_close(
+            x4, x4_pt.permute(0, 2, 3, 1).contiguous(), atol=1e-2, rtol=1e-2
         )
         self.test_count += 1
 
@@ -161,6 +162,25 @@ def test_groupnorm_swish(self):
                 x_shape=shape, num_groups=32, eps=1e-5, use_swish=True, copy_op=True
             )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_float32(self):
+        # H % 8 != 0
+        self._test_groupnorm(
+            x_shape=[7, 13, 9, 12],
+            num_groups=4,
+            eps=1e-5,
+            dtype="float32",
+            use_swish=True,
+        )
+        # H % 8 == 0
+        self._test_groupnorm(
+            x_shape=[2, 16, 16, 640],
+            num_groups=32,
+            eps=1e-5,
+            dtype="float32",
+            use_swish=True,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
index f3d2aaff2..d1e393f2a 100644
--- a/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
+++ b/tests/unittest/ops/test_int_elementwise_dynamic_reshape.py
@@ -16,29 +16,30 @@
 
 import torch
 from aitemplate.compiler import compile_model, ops
-
+from aitemplate.compiler.base import IntImm, IntVarTensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class IntElementwiseReshapeOpTestCase(unittest.TestCase):
-    def test_int_elementwise_reshape_op(
+    def _test_int_elementwise_reshape_op(
         self,
         batch_size=(1, 3),
         x1_size=(2, 3),
         X_shape=(32, 64),
-        test_name="elementwise_reshape_op",
+        test_name="int_elementwise_reshape_op",
+        dtype="float16",
     ):
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         x1_dim = shape_utils.gen_int_var_min_max(x1_size, name="input_size")
         X = Tensor(
             shape=[b_dim, x1_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -56,23 +57,39 @@ def test_int_elementwise_reshape_op(
 
         for b, x1 in zip(batch_size, x1_size):
             X_shape_pt = (b, x1, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             Y_pt = X_pt.reshape(
-                X_shape_pt[1] * X_shape_pt[0], X_shape_pt[2], X_shape_pt[3]
+                X_shape_pt[1] * X_shape_pt[0],
+                X_shape_pt[2],
+                X_shape_pt[3],
             )
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_int_elementwise_reshape_op2(
+    def test_int_elementwise_reshape_op_fp16(self):
+        self._test_int_elementwise_reshape_op(
+            test_name="int_elementwise_reshape_op_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_int_elementwise_reshape_op_fp32(self):
+        self._test_int_elementwise_reshape_op(
+            test_name="int_elementwise_reshape_op_fp32",
+            dtype="float32",
+        )
+
+    def _test_int_elementwise_reshape_op2(
         self,
         batch_size=(1, 3),
         x1_size=(2, 3),
         x2_size=(10, 32),
         x3_size=(48, 64),
-        test_name="elementwise_reshape_op2",
+        test_name="int_elementwise_reshape_op2",
+        dtype="float16",
     ):
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
@@ -81,7 +98,7 @@ def test_int_elementwise_reshape_op2(
         x3_dim = shape_utils.gen_int_var_min_max(x3_size, name="x3_size")
         X = Tensor(
             shape=[b_dim, x1_dim, x2_dim, x3_dim],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -101,14 +118,146 @@ def test_int_elementwise_reshape_op2(
 
         for b, x1, x2, x3 in zip(batch_size, x1_size, x2_size, x3_size):
             X_shape_pt = (b, x1, x2, x3)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             Y_pt = X_pt.reshape(-1, X_shape_pt[3])
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
+    def test_int_elementwise_reshape_op2_fp16(self):
+        self._test_int_elementwise_reshape_op2(
+            test_name="int_elementwise_reshape_op2_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_int_elementwise_reshape_op2_fp32(self):
+        self._test_int_elementwise_reshape_op2(
+            test_name="int_elementwise_reshape_op2_fp32",
+            dtype="float32",
+        )
+
+    def _test_int_elementwise_reshape_op_imm(
+        self,
+        batch_size=(3, 5),
+        x1_size=(2, 3),
+        X_shape=(32, 64),
+        test_name="int_elementwise_reshape_op_imm",
+        dtype="float16",
+    ):
+        target = detect_target()
+        b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
+        x1_dim = shape_utils.gen_int_var_min_max(x1_size, name="input_size")
+        X = Tensor(
+            shape=[b_dim, x1_dim, *X_shape],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+
+        Y1 = ops.size()(X)
+        Y2 = ops.getitem()(Y1, 0)
+        Y3 = ops.getitem()(Y1, 1)
+        Y4 = ops.getitem()(Y1, 2)
+        Y5 = ops.getitem()(Y1, 3)
+        f1 = ops.int_elementwise(FuncEnum.MUL)(Y4, Y5)
+        f2 = IntVarTensor(IntImm(12))
+
+        Y = ops.reshape()(X, [Y2 * Y3 * f1 / f2, f2])
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        for b, x1 in zip(batch_size, x1_size):
+            X_shape_pt = (b, x1, *X_shape)
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
+            Y_pt = X_pt.reshape(
+                int(X_shape_pt[0] * X_shape_pt[1] * X_shape_pt[2] * X_shape_pt[3] / 12),
+                12,
+            )
+
+            y = torch.empty_like(Y_pt)
+            module.run_with_tensors([X_pt], [y])
+
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_int_elementwise_reshape_op_imm_fp16(self):
+        self._test_int_elementwise_reshape_op_imm(
+            test_name="int_elementwise_reshape_op_imm_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_int_elementwise_reshape_op_imm_fp32(self):
+        self._test_int_elementwise_reshape_op_imm(
+            test_name="int_elementwise_reshape_op_imm_fp32",
+            dtype="float32",
+        )
+
+    def _test_int_elementwise_reshape_op_add(
+        self,
+        batch_size=(3, 5),
+        X_shape=(4, 8),
+        test_name="int_elementwise_reshape_op_add",
+        dtype="float16",
+    ):
+        target = detect_target()
+        last_dim_val = X_shape[0] * X_shape[1]
+        b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
+        X0 = Tensor(
+            shape=[b_dim, *X_shape],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[b_dim, last_dim_val],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+
+        Y1 = ops.size()(X0)
+        Y2 = ops.getitem()(Y1, 0)
+        Y3 = ops.getitem()(Y1, 1)
+        Y4 = ops.getitem()(Y1, 2)
+        f1 = ops.int_elementwise(FuncEnum.MUL)(Y3, Y4)
+
+        Y5 = ops.reshape()(X0, [Y2, f1])
+        Y = ops.elementwise(FuncEnum.ADD)(Y5, X1)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        for b in batch_size:
+            X_shape_pt = (b, *X_shape)
+            X0_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
+            X1_pt = get_random_torch_tensor([b, last_dim_val], dtype=dtype)
+            Y1_pt = X0_pt.reshape(b, last_dim_val)
+            Y_pt = Y1_pt + X1_pt
+
+            inputs = {"input_0": X0_pt, "input_1": X1_pt}
+            y = torch.empty_like(Y_pt)
+            module.run_with_tensors(inputs, [y])
+
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_int_elementwise_reshape_op_add_fp16(self):
+        self._test_int_elementwise_reshape_op_add(
+            test_name="int_elementwise_reshape_op_add_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_int_elementwise_reshape_op_add_fp32(self):
+        self._test_int_elementwise_reshape_op_add(
+            test_name="int_elementwise_reshape_op_add_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_layernorm.py b/tests/unittest/ops/test_layernorm.py
index d05eaafa6..54440b6ed 100644
--- a/tests/unittest/ops/test_layernorm.py
+++ b/tests/unittest/ops/test_layernorm.py
@@ -24,6 +24,7 @@
 from aitemplate.compiler.base import IntImm, IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 class LayernormTestCase(unittest.TestCase):
@@ -39,20 +40,23 @@ def _test_layernorm(
         beta_is_none=False,
         use_size_op=False,
         eps=1e-5,
+        dtype="float16",
     ):
+        torch_dtype = string_to_torch_dtype(dtype)
         BS = [1, 1024]
         input_shapes = ((BS), *MS, *NS)
         logging.info(
             f"input shapes: {input_shapes}"
             f"gamma_is_none: {gamma_is_none}, beta_is_none: {beta_is_none}, "
             f"use_size_op: {use_size_op}"
+            f"dtype: {dtype}"
         )
         assert isinstance(MS, (list, tuple))
         assert isinstance(NS, (list, tuple))
 
         X1 = Tensor(
             shape=[IntVar(name="input_batch", values=BS), *MS, *NS],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -61,7 +65,7 @@ def _test_layernorm(
         else:
             X2 = Tensor(
                 shape=NS,
-                dtype="float16",
+                dtype=dtype,
                 name="gamma",
                 is_input=True,
             )
@@ -70,7 +74,7 @@ def _test_layernorm(
         else:
             X3 = Tensor(
                 shape=NS,
-                dtype="float16",
+                dtype=dtype,
                 name="beta",
                 is_input=True,
             )
@@ -94,15 +98,15 @@ def _test_layernorm(
         module = compile_model(X4, target, "./tmp", "layernorm", dll_name=dll_name)
 
         for batch_size in [50, 900, 1024]:
-            x1_pt = torch.randn(batch_size, *MS, *NS).cuda().half()
+            x1_pt = torch.randn(batch_size, *MS, *NS, dtype=torch_dtype).cuda()
             if gamma_is_none:
                 x2_pt = None
             else:
-                x2_pt = torch.randn(NS).cuda().half()
+                x2_pt = torch.randn(NS, dtype=torch_dtype).cuda()
             if beta_is_none:
                 x3_pt = None
             else:
-                x3_pt = torch.randn(NS).cuda().half()
+                x3_pt = torch.randn(NS, dtype=torch_dtype).cuda()
             x4_pt = torch.nn.functional.layer_norm(x1_pt, NS, x2_pt, x3_pt, eps=eps)
 
             inputs = {"X": x1_pt}
@@ -110,9 +114,9 @@ def _test_layernorm(
                 inputs["gamma"] = x2_pt
             if not beta_is_none:
                 inputs["beta"] = x3_pt
-            x4 = torch.empty([batch_size, *MS, *NS]).cuda().half()
+            x4 = torch.empty([batch_size, *MS, *NS], dtype=torch_dtype).cuda()
             module.run_with_tensors(inputs, [x4])
-            self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-3, rtol=1e-3))
+            torch.testing.assert_close(x4, x4_pt, atol=1e-3, rtol=1e-3)
             self.test_count += 1
 
     def test_layernorm(self):
@@ -141,6 +145,18 @@ def test_layernorm(self):
                     MS=(16, 8, 4), NS=(2, 4, 32), use_size_op=use_size_op
                 )
 
+    @unittest.skipIf(
+        detect_target().name() == "rocm", "fp32 layer norm is not supported on ROCm"
+    )
+    def test_layernorm_fp32(self):
+        self._test_layernorm(dtype="float32")
+        self._test_layernorm(gamma_is_none=True, dtype="float32")
+        self._test_layernorm(beta_is_none=True, dtype="float32")
+        self._test_layernorm(gamma_is_none=True, beta_is_none=True, dtype="float32")
+        self._test_layernorm(eps=0.1, dtype="float32")
+        self._test_layernorm(MS=(16, 64), NS=(4, 32), dtype="float32")
+        self._test_layernorm(MS=(16, 8, 4), NS=(2, 4, 32), dtype="float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_layernorm_sigmoid_mul.py b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
index 672da2bf7..abd22835f 100644
--- a/tests/unittest/ops/test_layernorm_sigmoid_mul.py
+++ b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
@@ -25,15 +25,17 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FusedLayernormSigmoidMulTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(FusedLayernormSigmoidMulTestCase, self).__init__(*args, **kwargs)
-        torch.manual_seed(0)
         self._atol = 1e-2
         self._rtol = 1e-3
+        self._test_id = 0
 
     def _test_fused_layernorm_sigmoid_mul(
         self,
@@ -43,17 +45,19 @@ def _test_fused_layernorm_sigmoid_mul(
         beta_is_none=False,
         use_size_op=False,
         eps=1e-5,
+        dtype="float16",
     ):
         logging.info(
             f"_test_fused_layernorm_sigmoid_mul: M={MS}, N={NS}, "
             f"gamma_is_none={gamma_is_none}, beta_is_none={beta_is_none}"
+            f"dtype={dtype}"
         )
         assert isinstance(MS, (list, tuple))
         assert isinstance(NS, (list, tuple))
 
         X1 = Tensor(
             shape=[IntVar(name="input_batch", values=[1, 1024]), *MS, *NS],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -62,7 +66,7 @@ def _test_fused_layernorm_sigmoid_mul(
         else:
             X2 = Tensor(
                 shape=NS,
-                dtype="float16",
+                dtype=dtype,
                 name="gamma",
                 is_input=True,
             )
@@ -71,7 +75,7 @@ def _test_fused_layernorm_sigmoid_mul(
         else:
             X3 = Tensor(
                 shape=NS,
-                dtype="float16",
+                dtype=dtype,
                 name="beta",
                 is_input=True,
             )
@@ -93,21 +97,25 @@ def _test_fused_layernorm_sigmoid_mul(
 
         target = detect_target()
         with compile_model(
-            X6, target, "./tmp", "fused_layernorm_sigmoid_mul_test"
+            X6,
+            target,
+            "./tmp",
+            f"fused_layernorm_sigmoid_mul_test_{self._test_id}",
         ) as module:
+            self._test_id += 1
             for batch_size in [50, 900, 1024]:
                 logging.info(
                     f"Run test layernorm_sigmoid_mul. Problem size {[batch_size,] + list(MS) + list(NS)}"
                 )
-                x1_pt = torch.randn(batch_size, *MS, *NS).cuda().half()
+                x1_pt = get_random_torch_tensor([batch_size, *MS, *NS], dtype=dtype)
                 if gamma_is_none:
                     x2_pt = None
                 else:
-                    x2_pt = torch.randn(NS).cuda().half()
+                    x2_pt = get_random_torch_tensor(NS, dtype=dtype)
                 if beta_is_none:
                     x3_pt = None
                 else:
-                    x3_pt = torch.randn(NS).cuda().half()
+                    x3_pt = get_random_torch_tensor(NS, dtype=dtype)
 
                 x4_pt = torch.nn.functional.layer_norm(x1_pt, NS, x2_pt, x3_pt, eps=eps)
                 x6_pt = torch.mul(x1_pt, torch.sigmoid(x4_pt))
@@ -117,59 +125,183 @@ def _test_fused_layernorm_sigmoid_mul(
                     inputs["gamma"] = x2_pt
                 if not beta_is_none:
                     inputs["beta"] = x3_pt
-                x6 = torch.empty([batch_size, *MS, *NS]).cuda().half()
+                x6 = torch.empty_like(x6_pt)
                 module.run_with_tensors(inputs, [x6])
-                self.assertTrue(
-                    torch.allclose(x6, x6_pt, atol=self._atol, rtol=self._rtol),
-                    f"max diff: {torch.max(x6 - x6_pt) if x6_pt.numel() > 0 else 0}, "
-                    f"min diff: {torch.min(x6 - x6_pt) if x6_pt.numel() > 0 else 0}",
-                )
+                torch.testing.assert_close(x6, x6_pt, atol=self._atol, rtol=self._rtol),
 
-    def test_fused_layernorm_sigmoid_mul(self):
+    def test_fused_layernorm_sigmoid_mul_fp16(self):
         for eps in (1e-5, 1e-1):
             # half4 kernel
-            self._test_fused_layernorm_sigmoid_mul(NS=(1496,), eps=eps)
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                eps=eps,
+                dtype="float16",
+            )
             # block_size = n kernel
-            self._test_fused_layernorm_sigmoid_mul(NS=(515,), eps=eps)
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(515,),
+                eps=eps,
+                dtype="float16",
+            )
             # block_size = 512 kernel
-            self._test_fused_layernorm_sigmoid_mul(NS=(1055,), eps=eps)
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1055,),
+                eps=eps,
+                dtype="float16",
+            )
 
         # test ND inputs
         eps = 1e-5
         # half4 kernel
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 2), NS=(64, 8), eps=eps)
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(64, 8),
+            eps=eps,
+            dtype="float16",
+        )
         # block_size = n kernel
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 2), NS=(213, 2), eps=eps)
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 2), NS=(3, 2), eps=eps)
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 2), NS=(1, 1), eps=eps)
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 2), NS=(0, 1), eps=eps)
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(213, 2),
+            eps=eps,
+            dtype="float16",
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(3, 2),
+            eps=eps,
+            dtype="float16",
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(1, 1),
+            eps=eps,
+            dtype="float16",
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(0, 1),
+            eps=eps,
+            dtype="float16",
+        )
         # block_size = 512 kernel
-        self._test_fused_layernorm_sigmoid_mul(MS=(2, 4), NS=(1055, 5), eps=eps)
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 4),
+            NS=(1055, 5),
+            eps=eps,
+            dtype="float16",
+        )
 
         self._test_fused_layernorm_sigmoid_mul(
-            NS=(1496,), gamma_is_none=True, beta_is_none=True
+            NS=(1496,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float16",
         )
         self._test_fused_layernorm_sigmoid_mul(
-            NS=(515,), gamma_is_none=True, beta_is_none=True
+            NS=(515,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float16",
         )
         for use_size_op in (True, False):
-            self._test_fused_layernorm_sigmoid_mul(NS=(1055,), use_size_op=use_size_op)
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1055,),
+                use_size_op=use_size_op,
+                dtype="float16",
+            )
             self._test_fused_layernorm_sigmoid_mul(
                 NS=(1055,),
                 gamma_is_none=True,
                 beta_is_none=True,
                 use_size_op=use_size_op,
+                dtype="float16",
             )
             self._test_fused_layernorm_sigmoid_mul(
-                NS=(1496,), gamma_is_none=True, use_size_op=use_size_op
+                NS=(1496,),
+                gamma_is_none=True,
+                use_size_op=use_size_op,
+                dtype="float16",
             )
             self._test_fused_layernorm_sigmoid_mul(
-                NS=(515,), beta_is_none=True, use_size_op=use_size_op
+                NS=(515,),
+                beta_is_none=True,
+                use_size_op=use_size_op,
+                dtype="float16",
             )
 
+    def test_fused_layernorm_sigmoid_mul_fp32(self):
+        for eps in (1e-5, 1e-1):
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                eps=eps,
+                dtype="float32",
+            )
+            # block_size = n kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(515,),
+                eps=eps,
+                dtype="float32",
+            )
+            # block_size = 512 kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1055,),
+                eps=eps,
+                dtype="float32",
+            )
+
+        # test ND inputs
+        eps = 1e-5
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(64, 8),
+            eps=eps,
+            dtype="float32",
+        )
+        # block_size = n kernel
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(213, 2),
+            eps=eps,
+            dtype="float32",
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(3, 2),
+            eps=eps,
+            dtype="float32",
+        )
+        # block_size = 512 kernel
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 4),
+            NS=(1055, 5),
+            eps=eps,
+            dtype="float32",
+        )
+
+        self._test_fused_layernorm_sigmoid_mul(
+            NS=(1496,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float32",
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            NS=(515,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="float32",
+        )
+
     # dim0 is batch size
     def _test_batch_fused_layernorm_sigmoid_mul(
-        self, M, N, gamma_is_none=False, beta_is_none=False, use_size_op=False, eps=1e-5
+        self,
+        M,
+        N,
+        gamma_is_none=False,
+        beta_is_none=False,
+        use_size_op=False,
+        eps=1e-5,
+        dtype="float16",
     ):
         logging.info(
             f"_test_batch_fused_layernorm_sigmoid_mul: M={M}, N={N}, "
@@ -177,7 +309,7 @@ def _test_batch_fused_layernorm_sigmoid_mul(
         )
         X1 = Tensor(
             shape=[IntVar(name="input_batch", values=[2, 32]), IntImm(M), IntImm(N)],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -186,7 +318,7 @@ def _test_batch_fused_layernorm_sigmoid_mul(
         else:
             X2 = Tensor(
                 shape=[IntVar(name="input_batch", values=[2, 32]), IntImm(N)],
-                dtype="float16",
+                dtype=dtype,
                 name="gamma",
                 is_input=True,
             )
@@ -195,7 +327,7 @@ def _test_batch_fused_layernorm_sigmoid_mul(
         else:
             X3 = Tensor(
                 shape=[IntVar(name="input_batch", values=[2, 32]), IntImm(N)],
-                dtype="float16",
+                dtype=dtype,
                 name="beta",
                 is_input=True,
             )
@@ -211,25 +343,34 @@ def _test_batch_fused_layernorm_sigmoid_mul(
 
         target = detect_target()
         with compile_model(
-            X4, target, "./tmp", f"batch_fused_layernorm_sigmoid_mul_{M}_{N}_test"
+            X4,
+            target,
+            "./tmp",
+            f"batch_fused_layernorm_sigmoid_mul_{M}_{N}_test_{self._test_id}",
         ) as module:
+            self._test_id += 1
             for batch_size in [2, 16, 32]:
                 logging.info(
-                    "Run test batch_layernorm_sigmoid_mul. Problem size [{}, {}, {}]".format(
-                        batch_size, M, N
-                    )
+                    f"Run test batch_layernorm_sigmoid_mul. Problem size [{batch_size}, {M}, {N}]"
                 )
-                xs_pt = [torch.randn(M, N).cuda().half() for i in range(batch_size)]
+                xs_pt = [
+                    get_random_torch_tensor([M, N], dtype=dtype)
+                    for i in range(batch_size)
+                ]
                 if gamma_is_none:
                     gammas_pt = [None] * batch_size
                 else:
                     gammas_pt = [
-                        torch.randn(N).cuda().half() for i in range(batch_size)
+                        get_random_torch_tensor([N], dtype=dtype)
+                        for i in range(batch_size)
                     ]
                 if beta_is_none:
                     betas_pt = [None] * batch_size
                 else:
-                    betas_pt = [torch.randn(N).cuda().half() for i in range(batch_size)]
+                    betas_pt = [
+                        get_random_torch_tensor([N], dtype=dtype)
+                        for i in range(batch_size)
+                    ]
 
                 ys_pt = []
                 for i in range(batch_size):
@@ -255,7 +396,7 @@ def _test_batch_fused_layernorm_sigmoid_mul(
                     inputs["gamma"] = gamma_pt
                 if not beta_is_none:
                     inputs["beta"] = beta_pt
-                x4 = torch.empty([batch_size, M, N]).cuda().half()
+                x4 = torch.empty_like(y_t)
                 module.run_with_tensors(inputs, [x4])
                 self.assertTrue(
                     torch.allclose(x4, y_t, atol=self._atol, rtol=self._rtol),
@@ -265,7 +406,12 @@ def _test_batch_fused_layernorm_sigmoid_mul(
 
     # dim1 is the batch size
     def _test_batch_fused_layernorm_sigmoid_mul_dim1(
-        self, B, N, gamma_is_none=False, beta_is_none=False
+        self,
+        B,
+        N,
+        gamma_is_none=False,
+        beta_is_none=False,
+        dtype="float16",
     ):
         logging.info(
             f"_test_batch_fused_layernorm_sigmoid_mul_dim1: M={B}, N={N}, "
@@ -277,7 +423,7 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
                 IntVar(name="input_batch", values=[128, 1024]),
                 IntImm(N),
             ],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -286,7 +432,7 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
         else:
             X2 = Tensor(
                 shape=[IntImm(B), IntImm(N)],
-                dtype="float16",
+                dtype=dtype,
                 name="gamma",
                 is_input=True,
             )
@@ -295,7 +441,7 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
         else:
             X3 = Tensor(
                 shape=[IntImm(B), IntImm(N)],
-                dtype="float16",
+                dtype=dtype,
                 name="beta",
                 is_input=True,
             )
@@ -308,23 +454,26 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
             X4,
             target,
             "./tmp",
-            f"batch_fused_layernorm_sigmoid_mul_dim1_{B}_{N}_test",
+            f"batch_fused_layernorm_sigmoid_mul_dim1_{B}_{N}_test_{self._test_id}",
         ) as module:
+            self._test_id += 1
             for M in [128, 1024]:
                 logging.info(
-                    "Run test batch_layernorm_sigmoid_mul. Problem size [{}, {}, {}]".format(
-                        B, M, N
-                    )
+                    f"Run test batch_layernorm_sigmoid_mul. Problem size [{B}, {M}, {N}]"
                 )
-                xs_pt = [torch.randn(M, N).cuda().half() for i in range(B)]
+                xs_pt = [get_random_torch_tensor([M, N], dtype=dtype) for i in range(B)]
                 if gamma_is_none:
                     gammas_pt = [None] * B
                 else:
-                    gammas_pt = [torch.randn(N).cuda().half() for i in range(B)]
+                    gammas_pt = [
+                        get_random_torch_tensor([N], dtype=dtype) for i in range(B)
+                    ]
                 if beta_is_none:
                     betas_pt = [None] * B
                 else:
-                    betas_pt = [torch.randn(N).cuda().half() for i in range(B)]
+                    betas_pt = [
+                        get_random_torch_tensor([N], dtype=dtype) for i in range(B)
+                    ]
 
                 ys_pt = []
                 for i in range(B):
@@ -346,7 +495,7 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
                     inputs["gamma"] = gamma_pt
                 if not beta_is_none:
                     inputs["beta"] = beta_pt
-                x4 = torch.empty([B, M, N]).cuda().half()
+                x4 = torch.empty_like(y_t)
                 module.run_with_tensors(inputs, [x4])
                 self.assertTrue(
                     torch.allclose(x4, y_t, atol=self._atol, rtol=self._rtol),
@@ -354,23 +503,54 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
                     f"min diff: {torch.min(x4 - y_t) if y_t.numel() > 0 else 0}",
                 )
 
-    def test_batch_fused_layernorm_sigmoid_mul(self):
+    @parameterized.expand(
+        [
+            param("float16"),
+            param("float32"),
+        ]
+    )
+    def test_batch_fused_layernorm_sigmoid_mul(self, dtype: str):
         for eps in (1e-5, 1e-1):
-            self._test_batch_fused_layernorm_sigmoid_mul(512, 1024, eps=eps)
-            self._test_batch_fused_layernorm_sigmoid_mul(512, 64, eps=eps)
+            self._test_batch_fused_layernorm_sigmoid_mul(
+                512,
+                1024,
+                eps=eps,
+                dtype=dtype,
+            )
+            self._test_batch_fused_layernorm_sigmoid_mul(
+                512,
+                64,
+                eps=eps,
+                dtype=dtype,
+            )
 
         self._test_batch_fused_layernorm_sigmoid_mul(
-            512, 1024, gamma_is_none=True, beta_is_none=True
+            512,
+            1024,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype=dtype,
         )
         self._test_batch_fused_layernorm_sigmoid_mul(
-            512, 64, gamma_is_none=True, beta_is_none=True
+            512,
+            64,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype=dtype,
         )
         for use_size_op in (True, False):
             self._test_batch_fused_layernorm_sigmoid_mul(
-                1024, 1055, use_size_op=use_size_op, eps=1e-1
+                1024,
+                1055,
+                use_size_op=use_size_op,
+                eps=1e-1,
+                dtype=dtype,
             )
             self._test_batch_fused_layernorm_sigmoid_mul(
-                1024, 1055, use_size_op=use_size_op
+                1024,
+                1055,
+                use_size_op=use_size_op,
+                dtype=dtype,
             )
             self._test_batch_fused_layernorm_sigmoid_mul(
                 1024,
@@ -378,22 +558,47 @@ def test_batch_fused_layernorm_sigmoid_mul(self):
                 gamma_is_none=True,
                 beta_is_none=True,
                 use_size_op=use_size_op,
+                dtype=dtype,
             )
             self._test_batch_fused_layernorm_sigmoid_mul(
-                512, 1024, gamma_is_none=True, use_size_op=use_size_op
+                512,
+                1024,
+                gamma_is_none=True,
+                use_size_op=use_size_op,
+                dtype=dtype,
             )
             self._test_batch_fused_layernorm_sigmoid_mul(
-                512, 1024, beta_is_none=True, use_size_op=use_size_op
+                512,
+                1024,
+                beta_is_none=True,
+                use_size_op=use_size_op,
+                dtype=dtype,
             )
 
-        self._test_batch_fused_layernorm_sigmoid_mul_dim1(1, 512)
-        self._test_batch_fused_layernorm_sigmoid_mul_dim1(16, 512)
+        self._test_batch_fused_layernorm_sigmoid_mul_dim1(
+            1,
+            512,
+            dtype=dtype,
+        )
+        self._test_batch_fused_layernorm_sigmoid_mul_dim1(
+            16,
+            512,
+            dtype=dtype,
+        )
 
         self._test_batch_fused_layernorm_sigmoid_mul_dim1(
-            1, 512, gamma_is_none=True, beta_is_none=True
+            1,
+            512,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype=dtype,
         )
         self._test_batch_fused_layernorm_sigmoid_mul_dim1(
-            16, 512, gamma_is_none=True, beta_is_none=True
+            16,
+            512,
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype=dtype,
         )
 
     def _test_group_fused_layernorm_sigmoid_mul(
@@ -405,16 +610,18 @@ def _test_group_fused_layernorm_sigmoid_mul(
         use_size_op=False,
         eps=1e-5,
         fuse_sigmoid_mul=True,
+        dtype="float16",
     ):
         testname = (
-            "group_fused_layernorm_sigmoid_mul_test"
+            f"group_fused_layernorm_sigmoid_mul_test_{dtype}_{self._test_id}"
             if fuse_sigmoid_mul
-            else "group_layernorm_test"
+            else f"group_layernorm_test_{dtype}_{self._test_id}"
         )
+        self._test_id += 1
         logging.info(
             f"{testname}: input_shapes={input_shapes}, "
             f"gamma_is_none={gamma_is_none}, beta_is_none={beta_is_none}, "
-            f"use_size_op={use_size_op}"
+            f"use_size_op={use_size_op}, dtype={dtype}"
         )
         inputs = []
         gammas = []
@@ -425,7 +632,7 @@ def _test_group_fused_layernorm_sigmoid_mul(
             inputs.append(
                 Tensor(
                     shape=[IntImm(n) for n in shape],
-                    dtype="float16",
+                    dtype=dtype,
                     name="X_" + str(i),
                     is_input=True,
                 )
@@ -435,7 +642,7 @@ def _test_group_fused_layernorm_sigmoid_mul(
                 if gamma_is_none
                 else Tensor(
                     shape=[IntImm(n) for n in shape[batch_ndim:]],
-                    dtype="float16",
+                    dtype=dtype,
                     name="gamma_" + str(i),
                     is_input=True,
                 )
@@ -446,7 +653,7 @@ def _test_group_fused_layernorm_sigmoid_mul(
                 if beta_is_none
                 else Tensor(
                     shape=[IntImm(n) for n in shape[batch_ndim:]],
-                    dtype="float16",
+                    dtype=dtype,
                     name="beta_" + str(i),
                     is_input=True,
                 )
@@ -489,14 +696,18 @@ def _test_group_fused_layernorm_sigmoid_mul(
             gammas_pt = []
             betas_pt = []
             for shape in input_shapes:
-                xs_pt.append(torch.randn(shape).cuda().half())
+                xs_pt.append(get_random_torch_tensor(shape, dtype=dtype))
                 norm_shape = shape[batch_ndim:]
                 gamma_pt = (
-                    None if gamma_is_none else torch.randn(norm_shape).cuda().half()
+                    None
+                    if gamma_is_none
+                    else get_random_torch_tensor(norm_shape, dtype=dtype)
                 )
                 gammas_pt.append(gamma_pt)
                 beta_pt = (
-                    None if beta_is_none else torch.randn(norm_shape).cuda().half()
+                    None
+                    if beta_is_none
+                    else get_random_torch_tensor(norm_shape, dtype=dtype)
                 )
                 betas_pt.append(beta_pt)
 
@@ -531,7 +742,7 @@ def _test_group_fused_layernorm_sigmoid_mul(
             # module.benchmark_with_tensors(inputs, outputs)
 
         for i in range(B):
-            logging.debug("output: {}".format(str(i)))
+            logging.debug(f"output: {i}")
             y = outputs[i]
             self.assertTrue(
                 torch.allclose(ys_pt[i], y, atol=self._atol, rtol=self._rtol),
@@ -539,16 +750,28 @@ def _test_group_fused_layernorm_sigmoid_mul(
                 f"min diff: {torch.min(ys_pt[i] - y) if y.numel() > 0 else 0}",
             )
 
-    def test_group_fused_layernorm_sigmoid_mul(self):
+    @parameterized.expand(
+        [
+            param("float16"),
+            param("float32"),
+        ]
+    )
+    def test_group_fused_layernorm_sigmoid_mul(self, dtype: str):
         # half4 kernel
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256], [1024, 128]], eps=1e-1
+            [[1024, 256], [1024, 128]],
+            eps=1e-1,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256], [1024, 128]], use_size_op=False
+            [[1024, 256], [1024, 128]],
+            use_size_op=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256]] * 4, use_size_op=True
+            [[1024, 256]] * 4,
+            use_size_op=True,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [
@@ -558,109 +781,149 @@ def test_group_fused_layernorm_sigmoid_mul(self):
                 [1024, 256],
                 [1024, 128],
                 [1024, 256],
-            ]
+            ],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [
                 [2048, 2048],
                 [2048, 1024],
-            ]
+            ],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 256], [1024, 128]],
             gamma_is_none=True,
             beta_is_none=True,
             use_size_op=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256]] * 4, gamma_is_none=True, use_size_op=False
+            [[1024, 256]] * 4,
+            gamma_is_none=True,
+            use_size_op=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256]] * 4, gamma_is_none=True, use_size_op=True
+            [[1024, 256]] * 4,
+            gamma_is_none=True,
+            use_size_op=True,
+            dtype=dtype,
         )
 
         # Make sure we test the boundary between being able to fit the arguments in constant memory vs not.
         for num_groups in range(38, 41):
             self._test_group_fused_layernorm_sigmoid_mul(
-                [[1024, 256]] * num_groups, use_size_op=True
+                [[1024, 256]] * num_groups,
+                use_size_op=True,
+                dtype=dtype,
             )
 
         # < 1024 kernel
         self._test_group_fused_layernorm_sigmoid_mul(
             [[4, 16]],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 64], [1024, 256], [1024, 125]], eps=1e-1
+            [[1024, 64], [1024, 256], [1024, 125]],
+            eps=1e-1,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 64], [1024, 256], [1024, 125]]
+            [[1024, 64], [1024, 256], [1024, 125]],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 64], [1024, 256], [1024, 125]],
             gamma_is_none=True,
             beta_is_none=True,
             use_size_op=True,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 64], [1024, 256], [1024, 125]],
             beta_is_none=True,
             use_size_op=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 64], [1024, 256], [1024, 125]],
             beta_is_none=True,
             use_size_op=True,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1, 1]],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1, 1], [1, 0], [1, 1]],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 256], [1024, 128], [1024, 0]]
+            [[1024, 256], [1024, 128], [1024, 0]],
+            dtype=dtype,
         )
 
         # fallback kernel
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 1025], [1024, 1276], [1024, 1023]], eps=1e-1
+            [[1024, 1025], [1024, 1276], [1024, 1023]],
+            eps=1e-1,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[1024, 1025], [1024, 1276], [1024, 1023]]
+            [[1024, 1025], [1024, 1276], [1024, 1023]],
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 1025], [1024, 1276], [1024, 1023]],
             gamma_is_none=True,
             beta_is_none=True,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[128, 1025], [128, 0], [128, 1023]]
+            [[128, 1025], [128, 0], [128, 1023]],
+            dtype=dtype,
         )
         # Ditto boundary test
         for num_groups_divided_by_3 in range(12, 15):
             self._test_group_fused_layernorm_sigmoid_mul(
-                [[1024, 1025], [1024, 1276], [1024, 1023]] * num_groups_divided_by_3
+                [[1024, 1025], [1024, 1276], [1024, 1023]] * num_groups_divided_by_3,
+                dtype=dtype,
             )
 
         # ND
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[2, 512, 256, 16], [2, 512, 128, 4]], 2, use_size_op=False
+            [[2, 512, 256, 16], [2, 512, 128, 4]],
+            2,
+            use_size_op=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
-            [[3, 256, 64], [3, 256, 256], [3, 256, 125]], 1
+            [[3, 256, 64], [3, 256, 256], [3, 256, 125]],
+            1,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[4, 16, 3, 1025], [4, 16, 2, 1276], [4, 16, 1, 1023]],
             2,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[4, 16, 1025], [4, 16, 1276], [4, 16, 1023]],
             1,
             gamma_is_none=True,
             beta_is_none=True,
+            dtype=dtype,
         )
 
-    def test_group_layernorm(self):
+    @parameterized.expand(
+        [
+            param("float16"),
+            param("float32"),
+        ]
+    )
+    def test_group_layernorm(self, dtype: str):
         self._test_group_fused_layernorm_sigmoid_mul(
             [
                 [1024, 256],
@@ -671,6 +934,7 @@ def test_group_layernorm(self):
                 [1024, 256],
             ],
             fuse_sigmoid_mul=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 64], [1024, 256], [1024, 125]],
@@ -678,23 +942,28 @@ def test_group_layernorm(self):
             beta_is_none=True,
             use_size_op=True,
             fuse_sigmoid_mul=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 1025], [1024, 1276], [1024, 1023]],
             eps=1e-1,
             fuse_sigmoid_mul=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1, 1], [1, 0], [1, 1]],
             fuse_sigmoid_mul=False,
+            dtype=dtype,
         )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[2, 512, 256, 16], [2, 512, 128, 4]],
             2,
             use_size_op=False,
             fuse_sigmoid_mul=False,
+            dtype=dtype,
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_max_pool2d.py b/tests/unittest/ops/test_max_pool2d.py
index f605d810d..b1f16601a 100644
--- a/tests/unittest/ops/test_max_pool2d.py
+++ b/tests/unittest/ops/test_max_pool2d.py
@@ -19,15 +19,16 @@
 
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class MaxPool2dTestCase(unittest.TestCase):
-    def test_max_pool_2d_fp16(self):
+    def _test_max_pool_2d(self, dtype="float16"):
         batch_size = [1, 3]
         target = detect_target()
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), 112, 112, 64],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -37,15 +38,23 @@ def test_max_pool_2d_fp16(self):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "max_pool2d")
         for batch in batch_size:
-            X_pt = torch.randn(batch, 64, 112, 112).cuda().half()
+            X_pt = get_random_torch_tensor([batch, 64, 112, 112], dtype=dtype)
             OP_pt = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
             Y_pt = OP_pt(X_pt)
             x = X_pt.permute((0, 2, 3, 1)).contiguous()
-            y = torch.empty([batch, 56, 56, 64]).cuda().half()
+            y = torch.empty_like(Y_pt).permute(0, 2, 3, 1).contiguous()
             module.run_with_tensors([x], [y])
             y_transpose = y.permute((0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
+    def test_max_pool_2d_fp16(self):
+        self._test_max_pool_2d(dtype="float16")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_max_pool_2d_fp32(self):
+        self._test_max_pool_2d(dtype="float32")
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_nhwc3to4.py b/tests/unittest/ops/test_nhwc3to4.py
index d0dbbe402..76e910c72 100644
--- a/tests/unittest/ops/test_nhwc3to4.py
+++ b/tests/unittest/ops/test_nhwc3to4.py
@@ -24,14 +24,14 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Nhcw3To4TestCase(unittest.TestCase):
-    def test_nhcw3to8_fp16(self):
+    def _test_nhcw3to4(self, dtype="float16"):
         target = detect_target()
         batch_size = [1, 3]
         if target.name() == "rocm":
             return True
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), 224, 224, 3],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -41,17 +41,23 @@ def test_nhcw3to8_fp16(self):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "nhwc3to4")
         for batch in batch_size:
-            X_np = np.random.uniform(-1, 1, (batch, 224, 224, 3)).astype("float16")
-            Y_np = np.zeros((batch, 224, 224, 4)).astype("float16")
+            X_np = np.random.uniform(-1, 1, (batch, 224, 224, 3)).astype(dtype)
+            Y_np = np.zeros((batch, 224, 224, 4)).astype(dtype)
             Y_np[:, :, :, 0] = X_np[:, :, :, 0]
             Y_np[:, :, :, 1] = X_np[:, :, :, 1]
             Y_np[:, :, :, 2] = X_np[:, :, :, 2]
             Y_pt = torch.from_numpy(Y_np).cuda()
             X_pt = torch.from_numpy(X_np).cuda()
-            y = torch.empty([batch, 224, 224, 4]).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
+    def test_nhcw3to4_f16(self):
+        self._test_nhcw3to4()
+
+    def test_nhcw3to4_f32(self):
+        self._test_nhcw3to4(dtype="float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index 2f81e9b02..430967af3 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -24,6 +24,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 try:
     from torchvision.ops import boxes as box_ops
@@ -41,7 +42,7 @@ def nonempty(box, threshold=0.0):
     return keep
 
 
-def create_tensors(N):
+def create_tensors(N, dtype="float16"):
     dets = np.array(
         [
             [1.5862e02, 1.6100e02, 4.2800e02, 3.9400e02, 7.7100e-01],
@@ -75,7 +76,7 @@ def create_tensors(N):
             [1.4962e02, 1.6250e02, 4.3650e02, 3.9800e02, 7.9492e-01],
             [1.4850e02, 1.5975e02, 4.3250e02, 3.9275e02, 2.7051e-01],
         ],
-        dtype="float16",
+        dtype=dtype,
     )
 
     return dets[:N, :4], dets[:N, -1]
@@ -83,10 +84,10 @@ def create_tensors(N):
 
 @skipIfNoTorchVision
 class nmsTestCase(unittest.TestCase):
-    def _create_tensors(self, N):
+    def _create_tensors(self, N, dtype="float16"):
         boxes, scores = create_tensors(N)
-
-        return torch.tensor(boxes).cuda().half(), torch.tensor(scores).cuda().half()
+        torch_dtype = string_to_torch_dtype(dtype)
+        return [torch.tensor(x).cuda().to(dtype=torch_dtype) for x in (boxes, scores)]
 
     def _test_nms(
         self,
@@ -98,19 +99,20 @@ def _test_nms(
         num_classes=1,
         test_name="proposal_nms",
         copy_op=False,
+        dtype="float16",
     ):
         target = detect_target()
 
         X1 = Tensor(
             shape=[1, N, 4],
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
 
         X2 = Tensor(
             shape=[1, N],
-            dtype="float16",
+            dtype=dtype,
             name="kernel",
             is_input=True,
         )
@@ -127,55 +129,79 @@ def _test_nms(
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
-        module = compile_model(X4, target, "./tmp", test_name + str(copy_op))
+        module = compile_model(X4, target, "./tmp", test_name)
 
-        boxes, scores = self._create_tensors(N)
-        idxs = torch.randint(0, num_classes, (N,)).cuda().half()
-        iou = iouThreshold
+        torch_dtype = string_to_torch_dtype(dtype)
+        boxes, scores = self._create_tensors(N, dtype=dtype)
+        idxs = torch.randint(0, num_classes, (N,)).cuda().to(dtype=torch_dtype)
         kept = nonempty(boxes, threshold=minBoxSize)
         score_pt = scores.clone()
         score_pt[kept] = -1
-        keep = box_ops.batched_nms(boxes, score_pt, idxs, iou)
+        keep = box_ops.batched_nms(boxes, score_pt, idxs, iouThreshold)
 
         if keep.shape[0] >= nmsMaxOut:
             keep = keep[:nmsMaxOut]
             ref_box = boxes[keep]
         else:
-            ref_box = torch.zeros(nmsMaxOut, 4).half()
+            ref_box = torch.zeros(nmsMaxOut, 4)
             ref_box[
                 : keep.shape[0],
             ] = boxes[keep]
+        ref_box = ref_box.to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 4)).contiguous()
         x_scores = scores.reshape((1, N)).contiguous()
         inputs = [x, x_scores]
-        y = torch.empty([1, nmsMaxOut, 4]).cuda().half()
+        y = torch.empty([1, nmsMaxOut, 4]).cuda().to(dtype=torch_dtype)
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(ref_box.cuda(), y, atol=1e-2, rtol=1e-2))
 
-    def test_nms(self):
-        self._test_nms()
-        self._test_nms(copy_op=True)
+    def test_nms_fp16(self):
+        self._test_nms(
+            test_name="proposal_nms_fp16",
+            dtype="float16",
+        )
+        self._test_nms(
+            test_name="proposal_nms_copy_op_fp16",
+            copy_op=True,
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
+    def test_nms_fp32(self):
+        self._test_nms(
+            test_name="proposal_nms_fp32",
+            dtype="float32",
+        )
+        self._test_nms(
+            test_name="proposal_nms_copy_op_fp32",
+            copy_op=True,
+            dtype="float32",
+        )
 
     def _test_topk_nms(
-        self, batch_size=1, N=30, topK=30, iou=0.5, test_name="topk_nms", copy_op=False
+        self,
+        batch_size=1,
+        N=30,
+        topK=30,
+        iou=0.5,
+        test_name="topk_nms",
+        copy_op=False,
+        dtype="float16",
     ):
-
         target = detect_target()
-        if target.name() == "rocm":
-            return
         m_shape = (N, 4)
 
         def model():
             X_boxes = Tensor(
                 shape=m_shape,
-                dtype="float16",
+                dtype=dtype,
                 name="X",
                 is_input=True,
             )
             X_scores = Tensor(
                 shape=[N],
-                dtype="float16",
+                dtype=dtype,
                 name="scores",
                 is_input=True,
             )
@@ -195,8 +221,9 @@ def model():
 
         module = compile_model(Y, target, "./tmp", test_name)
 
-        boxes, scores = self._create_tensors(N)
-        idxs = torch.randint(0, 1, (N,)).cuda().half()
+        torch_dtype = string_to_torch_dtype(dtype)
+        boxes, scores = self._create_tensors(N, dtype=dtype)
+        idxs = torch.randint(0, 1, (N,)).cuda().to(dtype=torch_dtype)
         y_pt = box_ops.batched_nms(boxes, scores, idxs, iou)
         y_np = y_pt.cpu().numpy()
 
@@ -215,9 +242,29 @@ def model():
         y = score_inds[index]
         np.testing.assert_allclose(y_np, y, atol=1e-2, rtol=1e-2)
 
-    def test_topk_nms(self):
-        self._test_topk_nms()
-        self._test_topk_nms(copy_op=True)
+    @unittest.skipIf(detect_target().name() == "rocm", "not supported in ROCm")
+    def test_topk_nms_fp16(self):
+        self._test_topk_nms(
+            test_name="topk_nms_fp16",
+            dtype="float16",
+        )
+        self._test_topk_nms(
+            test_name="topk_nms_copy_op_fp16",
+            copy_op=True,
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
+    def test_topk_nms_fp32(self):
+        self._test_topk_nms(
+            test_name="topk_nms_fp32",
+            dtype="float32",
+        )
+        self._test_topk_nms(
+            test_name="topk_nms_copy_op_fp32",
+            copy_op=True,
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_norm.py b/tests/unittest/ops/test_norm.py
index 1d6abeba6..99951d4b6 100644
--- a/tests/unittest/ops/test_norm.py
+++ b/tests/unittest/ops/test_norm.py
@@ -15,20 +15,17 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import dtype_to_torch_dtype, get_random_torch_tensor
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class VectorNormTestCase(unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super(VectorNormTestCase, self).__init__(*args, **kwargs)
-
     def _run_vector_norm(
         self,
         *,
@@ -40,6 +37,8 @@ def _run_vector_norm(
         input_type="float16",
         output_type=None,
         copy_op=False,
+        atol=1e-2,
+        rtol=1e-2,
     ):
         torch.manual_seed(0)
         logging.info(
@@ -66,21 +65,31 @@ def _run_vector_norm(
 
         module = compile_model(Y, target, "./tmp", test_name)
         X_pt = get_random_torch_tensor(input_shape, input_type)
-        dtype_pt = dtype_to_torch_dtype(output_type)
+        output_dtype_pt = (
+            string_to_torch_dtype(output_type)
+            if output_type is not None
+            else string_to_torch_dtype(input_type)
+        )
         Y_pt = torch.linalg.vector_norm(
-            X_pt, ord=ord_kind, dim=dim, keepdim=keepdim, dtype=dtype_pt
+            X_pt, ord=ord_kind, dim=dim, keepdim=keepdim, dtype=output_dtype_pt
         )
-
-        y = torch.empty(y_shape).half().cuda()
+        y = torch.empty(y_shape, dtype=output_dtype_pt).cuda()
         module.run_with_tensors([X_pt], [y])
-        y_pt = Y_pt.cpu().numpy()
 
-        np.testing.assert_equal(y_shape, y_pt.shape)
-        np.testing.assert_equal(dtype_to_torch_dtype(y_dtype), Y_pt.dtype)
-        np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=1e-2, rtol=1e-2)
+        self.assertEqual(y_shape, list(Y_pt.shape))
+        self.assertEqual(string_to_torch_dtype(y_dtype), Y_pt.dtype)
+        torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
 
     def _run_l2_norm(
-        self, *, dim, input_shape, keepdim, input_type="float16", output_type=None
+        self,
+        *,
+        dim,
+        input_shape,
+        keepdim,
+        input_type="float16",
+        output_type=None,
+        atol=1e-2,
+        rtol=1e-2,
     ):
         self._run_vector_norm(
             test_name="l2_norm",
@@ -90,6 +99,8 @@ def _run_l2_norm(
             keepdim=keepdim,
             input_type=input_type,
             output_type=output_type,
+            atol=atol,
+            rtol=rtol,
         )
         self._run_vector_norm(
             test_name="l2_norm_copy_op",
@@ -100,6 +111,8 @@ def _run_l2_norm(
             input_type=input_type,
             output_type=output_type,
             copy_op=True,
+            atol=atol,
+            rtol=rtol,
         )
 
     def test_l2_norm(self):
@@ -112,6 +125,76 @@ def test_l2_norm(self):
         self._run_l2_norm(dim=-1, input_shape=[4, 1230, 1237], keepdim=True)
         self._run_l2_norm(dim=-1, input_shape=[1, 1000000, 6], keepdim=True)
 
+    def test_l2_norm_fp32(self):
+        self._run_l2_norm(
+            dim=0,
+            input_shape=[1],
+            keepdim=True,
+            input_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[3, 2, 2048],
+            keepdim=False,
+            input_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=1,
+            input_shape=[3, 1234, 4],
+            keepdim=True,
+            input_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=1,
+            input_shape=[5, 60, 34, 4],
+            keepdim=False,
+            input_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=0,
+            input_shape=[5, 60, 34, 4],
+            keepdim=False,
+            input_type="float16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=2,
+            input_shape=[5, 1, 34, 4],
+            keepdim=False,
+            input_type="float16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[4, 1230, 1237],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[1, 1000000, 6],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+
     def _run_batched_vector_norm(
         self,
         *,
@@ -147,7 +230,11 @@ def _run_batched_vector_norm(
 
         logging.info("AITemplate output_type: {}".format(y_dtype))
 
-        dtype_pt = dtype_to_torch_dtype(output_type)
+        output_dtype_pt = (
+            string_to_torch_dtype(output_type)
+            if output_type is not None
+            else string_to_torch_dtype(input_type)
+        )
         module = compile_model(Y, target, "./tmp", test_name)
 
         for B in [5, 128, 1024, 1237, 2002]:
@@ -156,15 +243,13 @@ def _run_batched_vector_norm(
 
             X_pt = get_random_torch_tensor(input_shape, input_type)
             Y_pt = torch.linalg.vector_norm(
-                X_pt, ord=ord_kind, dim=dim, keepdim=keepdim, dtype=dtype_pt
+                X_pt, ord=ord_kind, dim=dim, keepdim=keepdim, dtype=output_dtype_pt
             )
-            y_pt = Y_pt.cpu().numpy()
-
-            y = torch.empty(y_pt.shape).cuda().half()
+            y = torch.empty(Y_pt.shape, dtype=output_dtype_pt).cuda()
             module.run_with_tensors([X_pt], [y])
 
-            np.testing.assert_equal(dtype_to_torch_dtype(y_dtype), Y_pt.dtype)
-            np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=1e-2, rtol=1e-2)
+            self.assertEqual(string_to_torch_dtype(y_dtype), Y_pt.dtype)
+            torch.testing.assert_close(Y_pt, y, atol=1e-2, rtol=1e-2)
 
     def _run_batched_l2_norm(
         self, *, dim, keepdim, input_type="float16", output_type=None
diff --git a/tests/unittest/ops/test_pad_last_dim.py b/tests/unittest/ops/test_pad_last_dim.py
index 61f778d59..7bf5d2b26 100644
--- a/tests/unittest/ops/test_pad_last_dim.py
+++ b/tests/unittest/ops/test_pad_last_dim.py
@@ -19,17 +19,31 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_zeros_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class PadLastDim(unittest.TestCase):
-    def _test_static_shape_4d(self, copy_op=False):
+    def _test_static_shape_4d(
+        self,
+        copy_op=False,
+        test_name="static_shape_4d",
+        dtype="float16",
+    ):
         NN = 2
         HH = 7
         WW = 7
         CI = 262
         CO = 264
-        X = Tensor(shape=[NN, HH, WW, CI], name="X", is_input=True)
+        X = Tensor(
+            shape=[NN, HH, WW, CI],
+            name="X",
+            is_input=True,
+            dtype=dtype,
+        )
         op = ops.pad_last_dim(4, CO)
         if copy_op:
             op = ops.pad_last_dim(**op._get_op_attributes())
@@ -37,40 +51,95 @@ def _test_static_shape_4d(self, copy_op=False):
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", f"pad_last_dim4d_{copy_op}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(NN, HH, WW, CI).cuda().half()
-        Pad_pt = torch.zeros(NN, HH, WW, CO - CI).cuda().half()
+        X_pt = get_random_torch_tensor([NN, HH, WW, CI], dtype=dtype)
+        Pad_pt = get_torch_zeros_tensor([NN, HH, WW, CO - CI], dtype=dtype)
         Y_pt = torch.cat([X_pt, Pad_pt], dim=3)
 
-        y = torch.empty([NN, HH, WW, CO]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors([X_pt], [y])
         self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
 
-    def test_static_shape_4d(self):
-        self._test_static_shape_4d()
-        self._test_static_shape_4d(copy_op=True)
+    def test_static_shape_4d_fp16(self):
+        self._test_static_shape_4d(
+            test_name="static_shape_4d_fp16",
+            dtype="float16",
+        )
+        self._test_static_shape_4d(
+            copy_op=True,
+            test_name="static_shape_4d_fp16_copy_op",
+            dtype="float16",
+        )
 
-    def test_static_shape_2d(self):
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_static_shape_4d_fp32(self):
+        self._test_static_shape_4d(
+            test_name="static_shape_4d_fp32",
+            dtype="float32",
+        )
+        self._test_static_shape_4d(
+            copy_op=True,
+            test_name="static_shape_4d_fp32_copy_op",
+            dtype="float32",
+        )
+
+    def _test_static_shape_2d(
+        self,
+        copy_op=False,
+        test_name="static_shape_2d",
+        dtype="float16",
+    ):
         NN = 32
         CI = 259
         CO = 264
-        X = Tensor(shape=[NN, CI], name="X", is_input=True)
+        X = Tensor(
+            shape=[NN, CI],
+            name="X",
+            is_input=True,
+            dtype=dtype,
+        )
         op = ops.pad_last_dim(2, CO)
+        if copy_op:
+            op = ops.pad_last_dim(**op._get_op_attributes())
         Y = op(X)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", "pad_last_dim2d")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(NN, CI).cuda().half()
-        Pad_pt = torch.zeros(NN, CO - CI).cuda().half()
+        X_pt = get_random_torch_tensor([NN, CI], dtype=dtype)
+        Pad_pt = get_torch_zeros_tensor([NN, CO - CI], dtype=dtype)
         Y_pt = torch.cat([X_pt, Pad_pt], dim=1)
 
-        y = torch.empty([NN, CO]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors([X_pt], [y])
         self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
 
+    def test_static_shape_2d_fp16(self):
+        self._test_static_shape_2d(
+            test_name="static_shape_2d_fp16",
+            dtype="float16",
+        )
+        self._test_static_shape_2d(
+            copy_op=True,
+            test_name="static_shape_2d_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_static_shape_2d_fp32(self):
+        self._test_static_shape_2d(
+            test_name="static_shape_2d_fp32",
+            dtype="float32",
+        )
+        self._test_static_shape_2d(
+            copy_op=True,
+            test_name="static_shape_2d_fp32_copy_op",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_ccr.py b/tests/unittest/ops/test_perm021fc_ccr.py
index 9e20ddc71..18c843916 100644
--- a/tests/unittest/ops/test_perm021fc_ccr.py
+++ b/tests/unittest/ops/test_perm021fc_ccr.py
@@ -18,7 +18,6 @@
 # _3308 = torch.nn.functional.linear(_3307, self._1184, bias=self._1185)  # FC
 """
 
-
 import unittest
 
 import torch
@@ -26,37 +25,70 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class BMMTestCase(unittest.TestCase):
-    def test_ccr(self):
+class Perm021FCCCRTestCase(unittest.TestCase):
+    def _test_perm021fc_ccr(
+        self,
+        test_name="perm021fc_ccr",
+        dtype="float16",
+    ):
         B = 1024
         M = 128
         K = 745
         # K = 752
         N = 30
         target = detect_target()
-        X = Tensor(shape=[B, K, M], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[1, N, K], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[1, N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
         OP = ops.perm021fc_ccr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm021_fc")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
+        X_pt = get_random_torch_tensor([B, K, M], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
 
         XT = X_pt.permute(0, 2, 1)
         XT = torch.reshape(XT, (-1, K))
         Y_pt = torch.nn.functional.linear(XT, W_pt)
         Y_pt = torch.reshape(Y_pt, (B, M, N))
-        y = torch.empty([B, M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt.unsqueeze(0)}, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_perm021fc_ccr_fp16(self):
+        self._test_perm021fc_ccr(
+            test_name="perm021fc_ccr_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_fp32(self):
+        self._test_perm021fc_ccr(
+            test_name="perm021fc_ccr_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias.py b/tests/unittest/ops/test_perm021fc_ccr_bias.py
index 2b5916bb0..196e11f73 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias.py
@@ -25,11 +25,16 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm021FCCCRBiasTestCase(unittest.TestCase):
-    def test_ccr(self):
+    def _test_perm021fc_ccr_bias(
+        self,
+        test_name="perm021fc_ccr_bias",
+        dtype="float16",
+    ):
         B = 1024
         M = 128
         # K = 745
@@ -37,32 +42,64 @@ def test_ccr(self):
         # N = 30
         N = 64
         target = detect_target()
-        X = Tensor(shape=[B, K, M], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[1, N, K], dtype="float16", name="input_1", is_input=True)
-        BIAS = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[1, N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        BIAS = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         OP = ops.perm021fc_ccr_bias()
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm021_fc_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        # B_pt = torch.randn(N).cuda().half()
-        B_pt = torch.ones(N).cuda().half() * 0.5
+        X_pt = get_random_torch_tensor([B, K, M], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor([N], dtype=dtype) * 0.5
 
         XT = X_pt.permute(0, 2, 1)
         XT = torch.reshape(XT, (-1, K))
         Y_pt = torch.nn.functional.linear(XT, W_pt, bias=B_pt)
         Y_pt = torch.reshape(Y_pt, (B, M, N))
 
-        y = torch.empty([B, M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt.unsqueeze(0), "input_2": B_pt}, [y]
         )
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_perm021fc_ccr_bias_fp16(self):
+        self._test_perm021fc_ccr_bias(
+            test_name="perm021fc_ccr_bias_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_bias_fp32(self):
+        self._test_perm021fc_ccr_bias(
+            test_name="perm021fc_ccr_bias_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
index f9fbc8cf4..f0b6a2f16 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
@@ -25,12 +25,17 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skip("Re-enable after cutlass fix")
 # @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm021FCCCRBiasTestCase(unittest.TestCase):
-    def test_ccr(self):
+class Perm021FCCCRBiasPerm021TestCase(unittest.TestCase):
+    def _test_perm021fc_ccr_bias_perm021(
+        self,
+        test_name="perm021fc_ccr_bias_perm021",
+        dtype="float16",
+    ):
         B = 1024
         M = 128
         # K = 745
@@ -38,32 +43,64 @@ def test_ccr(self):
         # N = 30
         N = 64
         target = detect_target()
-        X = Tensor(shape=[B, K, M], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[1, N, K], dtype="float16", name="input_1", is_input=True)
-        BIAS = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[1, N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        BIAS = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         OP = ops.perm021fc_ccr_bias_permute()
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm021_fc_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        # B_pt = torch.randn(N).cuda().half()
-        B_pt = torch.ones(N).cuda().half() * 0.5
+        X_pt = get_random_torch_tensor([B, K, M], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor([N], dtype=dtype) * 0.5
 
         XT = X_pt.permute(0, 2, 1)
         XT = torch.reshape(XT, (-1, K))
         Y_pt = torch.nn.functional.linear(XT, W_pt, bias=B_pt)
         Y_pt = torch.reshape(Y_pt, (B, M, N))
         Y_pt = Y_pt.permute(0, 2, 1)
-        y = torch.empty([B, N, M]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt.unsqueeze(0), "input_2": B_pt}, [y]
         )
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_perm021fc_ccr_bias_perm021_fp16(self):
+        self._test_perm021fc_ccr_bias_perm021(
+            test_name="perm021fc_ccr_bias_perm021_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_bias_perm021_fp32(self):
+        self._test_perm021fc_ccr_bias_perm021(
+            test_name="perm021fc_ccr_bias_perm021_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_crc.py b/tests/unittest/ops/test_perm021fc_crc.py
index 30faa3cc4..5f99f3275 100644
--- a/tests/unittest/ops/test_perm021fc_crc.py
+++ b/tests/unittest/ops/test_perm021fc_crc.py
@@ -25,27 +25,42 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm021BMMTestCase(unittest.TestCase):
-    def test_crc(self):
+class Perm021FCCRCTestCase(unittest.TestCase):
+    def _test_perm021fc_crc(
+        self,
+        test_name="perm021fc_crc",
+        dtype="float16",
+    ):
         B = 1024
         M = 128
         K = 742
         # K = 752
         N = 64
         target = detect_target()
-        X = Tensor(shape=[1, K, N], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, M], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(
+            shape=[1, K, N],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
         OP = ops.perm021fc_crc()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm021_fc_crc")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
+        X_pt = get_random_torch_tensor([B, K, M], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
 
         XT = X_pt.permute(0, 2, 1)
         XT = torch.reshape(XT, (-1, K))
@@ -53,11 +68,29 @@ def test_crc(self):
         Y_pt = torch.reshape(Y_pt, (B, M, N))
 
         WT = W_pt.transpose(0, 1).contiguous()
-        y = torch.empty([B, M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors({"input_0": WT.unsqueeze(0), "input_1": X_pt}, [y])
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_perm021fc_crc_fp16(self):
+        self._test_perm021fc_crc(
+            test_name="perm021fc_crc_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_crc_fp32(self):
+        self._test_perm021fc_crc(
+            test_name="perm021fc_crc_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_crc_bias.py b/tests/unittest/ops/test_perm021fc_crc_bias.py
index e880018c9..cab5f5bb3 100644
--- a/tests/unittest/ops/test_perm021fc_crc_bias.py
+++ b/tests/unittest/ops/test_perm021fc_crc_bias.py
@@ -25,42 +25,80 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm021BMMTestCase(unittest.TestCase):
-    def test_crc(self):
+class Perm021FCCRCBiasTestCase(unittest.TestCase):
+    def _test_perm021fc_crc_bias(
+        self,
+        test_name="perm021fc_crc_bias",
+        dtype="float16",
+    ):
         B = 1024
         M = 128
         K = 742
         # K = 752
         N = 64
         target = detect_target()
-        X = Tensor(shape=[1, K, N], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, M], dtype="float16", name="input_1", is_input=True)
-        BIAS = Tensor(shape=[N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(
+            shape=[1, K, N],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        BIAS = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
         OP = ops.perm021fc_crc_bias()
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm021_fc_crc_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(N, K).cuda().half()
-        B_pt = torch.ones(N).cuda().half() * 0.5
+        X_pt = get_random_torch_tensor([B, K, M], dtype=dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype=dtype)
+        B_pt = get_random_torch_tensor([N], dtype=dtype) * 0.5
 
         XT = X_pt.permute(0, 2, 1)
         XT = torch.reshape(XT, (-1, K))
         Y_pt = torch.nn.functional.linear(XT, W_pt, bias=B_pt)
         Y_pt = torch.reshape(Y_pt, (B, M, N)).contiguous()
         WT = W_pt.transpose(0, 1).contiguous()
-        y = torch.empty([B, M, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(
             {"input_0": WT.unsqueeze(0), "input_1": X_pt, "input_2": B_pt}, [y]
         )
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
+    def test_perm021fc_crc_bias_fp16(self):
+        self._test_perm021fc_crc_bias(
+            test_name="perm021fc_crc_bias_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_crc_bias_fp32(self):
+        self._test_perm021fc_crc_bias(
+            test_name="perm021fc_crc_bias_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm102_bmm_rcr.py b/tests/unittest/ops/test_perm102_bmm_rcr.py
index e7c44e1f3..0e70baa99 100644
--- a/tests/unittest/ops/test_perm102_bmm_rcr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rcr.py
@@ -28,68 +28,82 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import parameterized
+
+
+def cuda_skip_condition(dtype, arch):
+    return dtype == "float32" and int(arch) < 80
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_TestCase(unittest.TestCase):
-    def test_perm102_bmm_rrr(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_perm102_bmm_rrr(self, dtype):
+        arch_ = detect_target()._arch
+        if cuda_skip_condition(dtype, arch_):
+            self.skipTest(f"BMM with float32 inputs not supported on CUDA SM{arch_}")
         B = 25
         M = 128
         K = 256
         N = 100
         target = detect_target()
-        X = Tensor(shape=[M, B, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.perm102_bmm_rcr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr")
+        module = compile_model(Y, target, "./tmp", f"perm102_bmm_rcr_{dtype}")
 
-        X_pt = torch.randn(M, B, K).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
+        X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
+        W_pt = get_random_torch_tensor(shape=(B, N, K), dtype=dtype)
 
         XT = X_pt.permute(1, 0, 2)
         Y_pt = torch.bmm(XT, W_pt.permute([0, 2, 1]))
-        Y_pt = Y_pt.permute(1, 0, 2)
-        y = torch.empty([M, B, N]).cuda().half()
+        Y_pt = Y_pt.permute(1, 0, 2).contiguous()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
 
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_BiasTestCase(unittest.TestCase):
-    def test_perm102_bmm_rrr_bias(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_perm102_bmm_rrr_bias(self, dtype):
+        arch_ = detect_target()._arch
+        if cuda_skip_condition(dtype, arch_):
+            self.skipTest(f"BMM with float32 inputs not supported on CUDA SM{arch_}")
         B = 25
         M = 128
         K = 256
         N = 100
         target = detect_target()
-        X = Tensor(shape=[M, B, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
-        BIAS = Tensor(shape=[B, N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
+        BIAS = Tensor(shape=[B, N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.perm102_bmm_rcr_bias()
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr_bias")
+        module = compile_model(Y, target, "./tmp", f"perm102_bmm_rcr_bias_{dtype}")
 
-        X_pt = torch.randn(M, B, K).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-        B_pt = torch.randn(B, N).cuda().half()
+        X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
+        W_pt = get_random_torch_tensor(shape=(B, N, K), dtype=dtype)
+        B_pt = get_random_torch_tensor(shape=(B, N), dtype=dtype)
 
         XT = X_pt.permute(1, 0, 2)
         Bias = B_pt.unsqueeze(1)
         Y_pt = torch.baddbmm(Bias, XT, W_pt.permute([0, 2, 1]))
-        Y_pt = Y_pt.permute(1, 0, 2)
+        Y_pt = Y_pt.permute(1, 0, 2).contiguous()
 
-        y = torch.empty([M, B, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}, [y]
         )
 
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_perm102_bmm_rrr.py b/tests/unittest/ops/test_perm102_bmm_rrr.py
index b45c46a34..f6c75110e 100644
--- a/tests/unittest/ops/test_perm102_bmm_rrr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rrr.py
@@ -28,68 +28,76 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMTestCase(unittest.TestCase):
-    def test_perm102_bmm_rrr(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_perm102_bmm_rrr(self, dtype="float16"):
+        if dtype == "float32" and int(detect_target()._arch) < 80:
+            self.skipTest(f"fp32 BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256
         N = 100
         target = detect_target()
-        X = Tensor(shape=[M, B, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, N], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
         OP = ops.perm102_bmm_rrr()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr")
 
-        X_pt = torch.randn(M, B, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
+        X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
+        W_pt = get_random_torch_tensor(shape=(B, K, N), dtype=dtype)
 
-        XT = X_pt.permute(1, 0, 2)
+        XT = X_pt.permute(1, 0, 2).contiguous()
         Y_pt = torch.bmm(XT, W_pt)
-        Y_pt = Y_pt.permute(1, 0, 2)
-        y = torch.empty([M, B, N]).cuda().half()
+        Y_pt = Y_pt.permute(1, 0, 2).contiguous()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
 
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMBiasTestCase(unittest.TestCase):
-    def test_perm102_bmm_rrr_bias(self):
+    @parameterized.expand([("float16"), ("float32")])
+    def test_perm102_bmm_rrr_bias(self, dtype="float16"):
+        if dtype == "float32" and int(detect_target()._arch) < 80:
+            self.skipTest(f"fp32 BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256
         N = 100
         target = detect_target()
-        X = Tensor(shape=[M, B, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, K, N], dtype="float16", name="input_1", is_input=True)
-        BIAS = Tensor(shape=[B, N], dtype="float16", name="input_2", is_input=True)
+        X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
+        BIAS = Tensor(shape=[B, N], dtype=dtype, name="input_2", is_input=True)
         OP = ops.perm102_bmm_rrr_bias()
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr_bias")
 
-        X_pt = torch.randn(M, B, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-        B_pt = torch.randn(B, N).cuda().half()
+        X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
+        W_pt = get_random_torch_tensor(shape=(B, K, N), dtype=dtype)
+        B_pt = get_random_torch_tensor(shape=(B, N), dtype=dtype)
 
-        XT = X_pt.permute(1, 0, 2)
+        XT = X_pt.permute(1, 0, 2).contiguous()
         Bias = B_pt.unsqueeze(1)
         Y_pt = torch.baddbmm(Bias, XT, W_pt)
-        Y_pt = Y_pt.permute(1, 0, 2)
+        Y_pt = Y_pt.permute(1, 0, 2).contiguous()
 
-        y = torch.empty([M, B, N]).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors(
             {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}, [y]
         )
 
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_permute.py b/tests/unittest/ops/test_permute.py
index f1e5b7456..1578e8bd8 100644
--- a/tests/unittest/ops/test_permute.py
+++ b/tests/unittest/ops/test_permute.py
@@ -13,48 +13,44 @@
 #  limitations under the License.
 #
 import unittest
+from typing import Sequence
 
 import torch
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import torch_dtype_to_string
 from parameterized import param, parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class PermuteTest(unittest.TestCase):
-    @parameterized.expand(
-        [
-            param((80, 300, 2), (0, 2, 1), "permute_1"),
-            param((80, 300, 2), (1, 0, 2), "permute_2"),
-            param((80, 300, 2), (2, 1, 0), "permute_3"),
-            param((5, 113, 15, 31), (0, 2, 1, 3), "permute_4"),
-            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4), "permute_5"),
-            param((8, 29, 100000, 3), (0, 2, 1, 3), "permute_6"),
-            param((32, 12, 4096, 64), (0, 2, 1, 3), "permute_7"),
-            param((1, 12, 128, 64), (0, 2, 1, 3), "permute_8"),
-            param((2, 3, 4, 5), (3, 2, 1, 0), "permute_9"),
-            param((3, 5, 128, 514), (2, 3, 0, 1), "permute_10"),
-            param((128, 512), (1, 0), "permute_11"),
-        ]
-    )
-    def test_static_shape_3d(self, input_shapes, dims, testname):
-        X = Tensor(shape=input_shapes, name="X", is_input=True)
+class GenericPermuteTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(GenericPermuteTest, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_generic_permute(
+        self,
+        input_shapes: Sequence[int],
+        dims: Sequence[int],
+        torch_dtype: torch.dtype,
+        testname: str,
+    ) -> None:
+        ait_dtype = torch_dtype_to_string(torch_dtype)
+        X = Tensor(shape=input_shapes, name="X", dtype=ait_dtype, is_input=True)
         op = ops.permute()
         Y = op(X, dims)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", testname)
+        module = compile_model(Y, target, "./tmp", f"{testname}_{self._test_id}")
+        self._test_id += 1
 
-        count = 1
-        for dim in input_shapes:
-            count *= dim
-        X_pt = torch.randn(input_shapes).cuda().half()
+        X_pt = torch.randn(input_shapes, dtype=torch_dtype).cuda()
         Y_pt = torch.permute(X_pt, dims)
 
-        y = torch.empty(Y_pt.size()).cuda().half()
+        y = torch.empty(Y_pt.size(), dtype=torch_dtype).cuda()
         module.run_with_tensors([X_pt], [y])
 
         # mean, _, _ = module.benchmark_with_tensors([X_pt], [y], count=1000)
@@ -64,8 +60,60 @@ def test_static_shape_3d(self, input_shapes, dims, testname):
         # bw = 2 * 2 * mem / (mean * 1e-3 * 1e9)  # GB/s
         # print(f"bw: {bw} GB/s")
 
-        self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.equal(y, Y_pt))
+
+    @parameterized.expand(
+        [
+            param((80, 300, 2), (0, 2, 1)),
+            param((80, 300, 2), (1, 0, 2)),
+            param((80, 300, 2), (2, 1, 0)),
+            param((5, 113, 15, 31), (0, 2, 1, 3)),
+            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4)),
+            param((8, 29, 100000, 3), (0, 2, 1, 3)),
+            param((32, 12, 4096, 64), (0, 2, 1, 3)),
+            param((1, 12, 128, 64), (0, 2, 1, 3)),
+            param((2, 3, 4, 5), (3, 2, 1, 0)),
+            param((3, 5, 128, 514), (2, 3, 0, 1)),
+            param((128, 512), (1, 0)),
+            param((5, 113, 15, 31), (0, 1, 3, 2)),
+            param((3, 1, 113, 15, 64), (0, 1, 2, 4, 3)),
+        ]
+    )
+    def test_generic_permute_fp16(self, input_shapes, dims):
+        self._test_generic_permute(
+            input_shapes=input_shapes,
+            dims=dims,
+            torch_dtype=torch.float16,
+            testname="test_generic_permute_fp16",
+        )
+
+    @parameterized.expand(
+        [
+            param((80, 300, 2), (0, 2, 1)),
+            param((80, 300, 2), (1, 0, 2)),
+            param((80, 300, 2), (2, 1, 0)),
+            param((5, 113, 15, 31), (0, 2, 1, 3)),
+            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4)),
+            param((8, 29, 100000, 3), (0, 2, 1, 3)),
+            param((32, 12, 4096, 64), (0, 2, 1, 3)),
+            param((1, 12, 128, 64), (0, 2, 1, 3)),
+            param((2, 3, 4, 5), (3, 2, 1, 0)),
+            param((3, 5, 128, 514), (2, 3, 0, 1)),
+            param((128, 512), (1, 0)),
+            param((5, 113, 15, 31), (0, 1, 3, 2)),
+            param((3, 1, 113, 15, 64), (0, 1, 2, 4, 3)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported by ROCm.")
+    def test_generic_permute_fp32(self, input_shapes, dims):
+        self._test_generic_permute(
+            input_shapes=input_shapes,
+            dims=dims,
+            torch_dtype=torch.float32,
+            testname="test_generic_permute_fp32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_permute021.py b/tests/unittest/ops/test_permute021.py
index 4d040657d..ac2dfa27f 100644
--- a/tests/unittest/ops/test_permute021.py
+++ b/tests/unittest/ops/test_permute021.py
@@ -17,29 +17,87 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
 
 
-class Permute021(unittest.TestCase):
-    def test_static_shape_3d(self):
-        NN = 2
-        WW = 384
-        CI = 262
-        X = Tensor(shape=[NN, WW, CI], name="X", is_input=True)
+class Permute021Test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(Permute021Test, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_permute_021(
+        self,
+        input_shape,
+        dims,
+        test_name="permute021",
+        dtype="float16",
+    ):
+        X = Tensor(
+            shape=input_shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
         op = ops.permute021()
         Y = op(X)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", "perm021")
+        module = compile_model(Y, target, "./tmp", f"perm021_{self._test_id}")
+        self._test_id += 1
+
+        batch_dim = input_shape[0]
+        if isinstance(batch_dim, IntVar):
+            input_shapes = [(d, *input_shape[1:]) for d in batch_dim._attrs["values"]]
+        else:
+            input_shapes = [input_shape]
+
+        for shape in input_shapes:
+            X_pt = get_random_torch_tensor(shape, dtype=dtype)
+            Y_pt = torch.permute(X_pt, dims)
+            y = torch.empty_like(Y_pt).contiguous()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.equal(y, Y_pt))
+
+    @parameterized.expand(
+        [
+            param((2, 384, 262), (0, 2, 1)),
+            param((2, 3, 384, 262), (0, 1, 3, 2)),
+            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+        ]
+    )
+    def test_permute021_fp16(self, input_shape, dims):
+        self._test_permute_021(
+            input_shape=input_shape,
+            dims=dims,
+            test_name="permute021_fp16",
+            dtype="float16",
+        )
 
-        X_pt = torch.randn(NN, WW, CI).cuda().half()
-        Y_pt = torch.permute(X_pt, [0, 2, 1])
-        y = torch.empty([NN, CI, WW]).cuda().half()
-        module.run_with_tensors([X_pt], [y])
-        self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
+    @parameterized.expand(
+        [
+            param((2, 384, 262), (0, 2, 1)),
+            param((2, 3, 384, 262), (0, 1, 3, 2)),
+            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported on ROCm")
+    def test_permute021_fp32(self, input_shape, dims):
+        self._test_permute_021(
+            input_shape=input_shape,
+            dims=dims,
+            test_name="permute021_fp32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_permute0213.py b/tests/unittest/ops/test_permute0213.py
new file mode 100644
index 000000000..8c4d889cb
--- /dev/null
+++ b/tests/unittest/ops/test_permute0213.py
@@ -0,0 +1,104 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+class Permute0213Test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(Permute0213Test, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_permute_0213(
+        self,
+        input_shape,
+        test_name="permute0213",
+        dtype="float16",
+    ):
+        X = Tensor(
+            shape=input_shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+        op = ops.permute0213()
+        Y = op(X)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", f"perm0213_{self._test_id}")
+        self._test_id += 1
+
+        batch_dim = input_shape[0]
+        if isinstance(batch_dim, IntVar):
+            input_shapes = [(d, *input_shape[1:]) for d in batch_dim._attrs["values"]]
+        else:
+            input_shapes = [input_shape]
+
+        for shape in input_shapes:
+            X_pt = get_random_torch_tensor(shape, dtype=dtype)
+            Y_pt = torch.permute(X_pt, [0, 2, 1, 3])
+            y = torch.empty_like(Y_pt).contiguous()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.equal(y, Y_pt))
+
+    @parameterized.expand(
+        [
+            param((1, 80, 300, 2)),
+            param((5, 31, 7, 3)),
+            param((4, 256, 128, 7)),
+            param((7, 128, 256, 8)),
+            param((32, 128, 128, 63)),
+            param((33, 256, 256, 64)),
+            param((IntVar([2, 3]), 33, 256, 64)),
+        ]
+    )
+    def test_permute0213_fp16(self, input_shape):
+        self._test_permute_0213(
+            input_shape=input_shape,
+            test_name="permute0213_fp16",
+            dtype="float16",
+        )
+
+    @parameterized.expand(
+        [
+            param((1, 80, 300, 2)),
+            param((5, 31, 7, 3)),
+            param((4, 256, 128, 7)),
+            param((7, 128, 256, 8)),
+            param((32, 128, 128, 63)),
+            param((33, 256, 256, 64)),
+            param((IntVar([2, 3]), 33, 256, 64)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported on ROCm")
+    def test_permute0213_fp32(self, input_shape):
+        self._test_permute_0213(
+            input_shape=input_shape,
+            test_name="permute0213_fp32",
+            dtype="float32",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_permute102.py b/tests/unittest/ops/test_permute102.py
index c2901fa0b..0069f783e 100644
--- a/tests/unittest/ops/test_permute102.py
+++ b/tests/unittest/ops/test_permute102.py
@@ -17,29 +17,88 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
 
 
-class Permute102(unittest.TestCase):
-    def test_static_shape_3d(self):
-        NN = 80
-        WW = 300
-        CI = 2
-        X = Tensor(shape=[NN, WW, CI], name="X", is_input=True)
+class Permute102Test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(Permute102Test, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_permute_102(
+        self,
+        input_shape,
+        test_name="permute102",
+        dtype="float16",
+    ):
+        X = Tensor(
+            shape=input_shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
         op = ops.permute102()
         Y = op(X)
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", "perm102")
+        module = compile_model(Y, target, "./tmp", f"perm102_{self._test_id}")
+        self._test_id += 1
+
+        batch_dim = input_shape[0]
+        if isinstance(batch_dim, IntVar):
+            input_shapes = [(d, *input_shape[1:]) for d in batch_dim._attrs["values"]]
+        else:
+            input_shapes = [input_shape]
+
+        for shape in input_shapes:
+            X_pt = get_random_torch_tensor(shape, dtype=dtype)
+            Y_pt = torch.permute(X_pt, [1, 0, 2])
+            y = torch.empty_like(Y_pt).contiguous()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.equal(y, Y_pt))
+
+    @parameterized.expand(
+        [
+            param((80, 300, 2)),
+            param((31, 7, 3)),
+            param((256, 128, 7)),
+            param((128, 256, 8)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    def test_permute102_fp16(self, input_shape):
+        self._test_permute_102(
+            input_shape=input_shape,
+            test_name="permute102_fp16",
+            dtype="float16",
+        )
 
-        X_pt = torch.randn(NN, WW, CI).cuda().half()
-        Y_pt = torch.permute(X_pt, [1, 0, 2])
-        y = torch.empty([WW, NN, CI]).cuda().half()
-        module.run_with_tensors([X_pt], [y])
-        self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
+    @parameterized.expand(
+        [
+            param((80, 300, 2)),
+            param((31, 7, 3)),
+            param((256, 128, 7)),
+            param((128, 256, 8)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported on ROCm")
+    def test_permute102_fp32(self, input_shape):
+        self._test_permute_102(
+            input_shape=input_shape,
+            test_name="permute102_fp32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_permute210.py b/tests/unittest/ops/test_permute210.py
index 78c480e19..aaa68f502 100644
--- a/tests/unittest/ops/test_permute210.py
+++ b/tests/unittest/ops/test_permute210.py
@@ -12,37 +12,93 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import itertools
 import unittest
 
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
 
 
-class Permute210(unittest.TestCase):
-    def test_static_shape_3d(self):
-        for NWC in itertools.product([2, 80, 300], [2, 80, 300], [2, 80, 300]):
-            with self.subTest(NWC=NWC):
-                NN, WW, CI = NWC
-                X = Tensor(shape=[NN, WW, CI], name="X", is_input=True)
-                op = ops.permute210()
-                Y = op(X)
-                Y._attrs["is_output"] = True
-                Y._attrs["name"] = "output"
-                target = detect_target()
-                module = compile_model(
-                    Y, target, "./tmp", "perm210_{}_{}_{}".format(NN, WW, CI)
-                )
-
-                X_pt = torch.randn(NN, WW, CI).cuda().half()
-                Y_pt = torch.permute(X_pt, [2, 1, 0])
-                y = torch.empty([CI, WW, NN]).cuda().half()
-                module.run_with_tensors([X_pt], [y])
-                self.assertTrue(torch.allclose(y, Y_pt, atol=1e-2, rtol=1e-2))
+class Permute210Test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(Permute210Test, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_permute_210(
+        self,
+        input_shape,
+        test_name="permute210",
+        dtype="float16",
+    ):
+        X = Tensor(
+            shape=input_shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+        op = ops.permute210()
+        Y = op(X)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", f"perm210_{self._test_id}")
+        self._test_id += 1
+
+        batch_dim = input_shape[0]
+        if isinstance(batch_dim, IntVar):
+            input_shapes = [(d, *input_shape[1:]) for d in batch_dim._attrs["values"]]
+        else:
+            input_shapes = [input_shape]
+
+        for shape in input_shapes:
+            X_pt = get_random_torch_tensor(shape, dtype=dtype)
+            Y_pt = torch.permute(X_pt, [2, 1, 0])
+            y = torch.empty_like(Y_pt).contiguous()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.equal(y, Y_pt))
+
+    @parameterized.expand(
+        [
+            param((2, 80, 300)),
+            param((80, 300, 2)),
+            param((300, 2, 80)),
+            param((31, 7, 3)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    def test_permute210_fp16(self, input_shape):
+        self._test_permute_210(
+            input_shape=input_shape,
+            test_name="permute210_fp16",
+            dtype="float16",
+        )
+
+    @parameterized.expand(
+        [
+            param((2, 80, 300)),
+            param((80, 300, 2)),
+            param((300, 2, 80)),
+            param((31, 7, 3)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported on ROCm")
+    def test_permute210_fp32(self, input_shape):
+        self._test_permute_210(
+            input_shape=input_shape,
+            test_name="permute210_fp32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_proposal.py b/tests/unittest/ops/test_proposal.py
index 30862be85..cd06b9b3f 100644
--- a/tests/unittest/ops/test_proposal.py
+++ b/tests/unittest/ops/test_proposal.py
@@ -23,6 +23,7 @@
 
 from aitemplate.frontend import nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 DEBUG = False
 
@@ -409,7 +410,11 @@ def mark_output(y):
 
 
 class ProposalTestCase(unittest.TestCase):
-    def test_fp16_single_op(self, test_name="proposal"):
+    def _test_single_op(
+        self,
+        test_name="proposal",
+        dtype="float16",
+    ):
         target = detect_target()
         feat_stride = 16
         scales = [128, 256, 512]
@@ -447,20 +452,27 @@ def test_fp16_single_op(self, test_name="proposal"):
         scores = np.repeat(scores, repeats=batch_size, axis=0)
 
         bbox_deltas_ait = np.transpose(
-            bbox_deltas.astype("float16"), (0, 2, 3, 1)
+            bbox_deltas.astype(dtype),
+            (0, 2, 3, 1),
+        ).copy()
+        scores_ait = np.transpose(
+            scores.astype(dtype),
+            (0, 2, 3, 1),
         ).copy()
-        scores_ait = np.transpose(scores.astype("float16"), (0, 2, 3, 1)).copy()
 
         X_bbox_deltas = Tensor(
             shape=bbox_deltas_ait.shape,
             name="X_bbox_deltas",
-            dtype="float16",
+            dtype=dtype,
             is_input=True,
         )
-
         X_scores = Tensor(
-            shape=scores_ait.shape, name="X_scores", dtype="float16", is_input=True
+            shape=scores_ait.shape,
+            name="X_scores",
+            dtype=dtype,
+            is_input=True,
         )
+
         OP = nn.Proposal(
             im_shape=im_info[:2],
             scales=scales,
@@ -472,6 +484,7 @@ def test_fp16_single_op(self, test_name="proposal"):
             iou_threshold=threshold,
             rpn_min_size=rpn_min_size,
             batch_size=batch_size,
+            dtype=dtype,
         )
 
         y = OP(X_bbox_deltas, X_scores)
@@ -482,18 +495,33 @@ def test_fp16_single_op(self, test_name="proposal"):
         batch_inds = torch.from_numpy(OP._batch_inds.copy()).cuda()
         module.set_constant_with_tensor("anchors", anchors)
         module.set_constant_with_tensor("batch_inds", batch_inds)
+        torch_dtype = string_to_torch_dtype(dtype)
         inputs_pt = [
-            torch.from_numpy(bbox_deltas_ait).cuda().half(),
-            torch.from_numpy(scores_ait).cuda().half(),
+            torch.from_numpy(bbox_deltas_ait).cuda().to(torch_dtype),
+            torch.from_numpy(scores_ait).cuda().to(torch_dtype),
         ]
         out0_shape = module.get_output_maximum_shape(0)
-        out0 = torch.empty(out0_shape).cuda().half()
+        out0 = torch.empty(out0_shape, dtype=torch_dtype, device="cuda")
         y_ait_shape = module.get_output_maximum_shape(1)
-        y_ait = torch.empty(y_ait_shape).cuda().half()
+        y_ait = torch.empty(y_ait_shape, dtype=torch_dtype, device="cuda")
         module.run_with_tensors(inputs_pt, [out0, y_ait])
         y_ait = y_ait.reshape(2, -1, 4)
         self.assertTrue(torch.allclose(y_ait[0, :], y_ait[1, :], atol=1e-2, rtol=1e-2))
 
+    def test_proposal_fp16(self):
+        self._test_single_op(
+            test_name="proposal_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_proposal_fp32(self):
+        self._test_single_op(
+            test_name="proposal_fp32",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_reduce.py b/tests/unittest/ops/test_reduce.py
index b091c81c9..8d617cb26 100644
--- a/tests/unittest/ops/test_reduce.py
+++ b/tests/unittest/ops/test_reduce.py
@@ -21,10 +21,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import dtype_to_torch_dtype, get_random_torch_tensor
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
-logger = logging.getLogger(__name__)
+
+_LOGGER = logging.getLogger(__name__)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -44,9 +46,11 @@ def _run_reduce(
         keepdim,
         input_type="float16",
         output_type=None,
+        rtol=1e-2,
+        atol=1e-2,
     ):
         torch.manual_seed(0)
-        logger.info(
+        _LOGGER.info(
             "Test input_shape={input_shape}, reduction_axes={dim}".format(
                 input_shape=input_shape, dim=dim
             )
@@ -64,29 +68,37 @@ def _run_reduce(
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         y_dtype = Y._attrs["dtype"]
 
-        logger.info("AITemplate output_shape: {}".format(y_shape))
-        logger.info("AITemplate output_type: {}".format(y_dtype))
+        _LOGGER.info("AITemplate output_shape: {}".format(y_shape))
+        _LOGGER.info("AITemplate output_type: {}".format(y_dtype))
 
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         X_pt = get_random_torch_tensor(input_shape, input_type)
-        dtype_pt = dtype_to_torch_dtype(output_type)
+        dtype_pt = string_to_torch_dtype(output_type)
         if keepdim is None:
             Y_pt = torch_reduce_op(X_pt, dim, dtype=dtype_pt)
         else:
             Y_pt = torch_reduce_op(X_pt, dim, keepdim=keepdim, dtype=dtype_pt)
 
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors([X_pt], [y])
         y_pt = Y_pt.cpu().numpy()
 
         np.testing.assert_equal(y_shape, y_pt.shape)
-        np.testing.assert_equal(dtype_to_torch_dtype(y_dtype), Y_pt.dtype)
-        np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=1e-2, rtol=1e-2)
+        np.testing.assert_equal(string_to_torch_dtype(y_dtype), Y_pt.dtype)
+        np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=atol, rtol=rtol)
         self.test_count += 1
 
     def _run_reduce_sum(
-        self, *, dim, input_shape, keepdim, input_type="float16", output_type=None
+        self,
+        *,
+        dim,
+        input_shape,
+        keepdim,
+        input_type="float16",
+        output_type=None,
+        rtol=1e-2,
+        atol=1e-2,
     ):
         self._run_reduce(
             test_name="reduce_sum",
@@ -97,6 +109,8 @@ def _run_reduce_sum(
             keepdim=keepdim,
             input_type=input_type,
             output_type=output_type,
+            rtol=rtol,
+            atol=atol,
         )
 
     def test_reduce_sum(self):
@@ -198,7 +212,13 @@ def test_reduce_sum(self):
         )
 
     def _run_reduce_mean(
-        self, *, dim, input_shape, keepdim, input_type="float16", output_type=None
+        self,
+        *,
+        dim,
+        input_shape,
+        keepdim,
+        input_type="float16",
+        output_type=None,
     ):
         self._run_reduce(
             test_name="reduce_mean",
@@ -377,7 +397,7 @@ def _run_batched_reduce(
         output_type=None,
     ):
         torch.manual_seed(0)
-        logger.info(f"Test {batch_sizes=}, {non_batch_shape=}, {dim=}")
+        _LOGGER.info(f"Test {batch_sizes=}, {non_batch_shape=}, {dim=}")
         target = detect_target()
 
         batch0_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
@@ -400,13 +420,17 @@ def _run_batched_reduce(
         for batch_size in batch_sizes:
             input_shape = [batch_size] + non_batch_shape
             X_pt = get_random_torch_tensor(input_shape, input_type)
-            dtype_pt = dtype_to_torch_dtype(output_type)
+            dtype_pt = (
+                X_pt.dtype
+                if output_type is None
+                else string_to_torch_dtype(output_type)
+            )
             if keepdim is None:
                 Y_pt = torch_reduce_op(X_pt, dim, dtype=dtype_pt)
             else:
                 Y_pt = torch_reduce_op(X_pt, dim, keepdim=keepdim, dtype=dtype_pt)
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
             y_pt = Y_pt.cpu().numpy()
 
@@ -445,6 +469,66 @@ def test_batched_reduce_sum(self):
             output_type=None,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_reduce_sum_float32(self):
+        # reduce_smallaxis
+        self._run_reduce_sum(
+            dim=1,
+            input_shape=[1, 4],
+            keepdim=True,
+            input_type="float32",
+            output_type=None,
+            rtol=1.3e-6,
+            atol=1e-5,
+        )
+        self._run_reduce_sum(
+            dim=1,
+            input_shape=[1, 4],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            rtol=1.3e-6,
+            atol=1e-5,
+        )
+        # reduce_3d
+        self._run_reduce_sum(
+            dim=-2,
+            input_shape=[3, 2048, 4],
+            keepdim=False,
+            input_type="float32",
+            output_type=None,
+            rtol=4e-6,
+            atol=2e-5,
+        )
+        self._run_reduce_sum(
+            dim=1,
+            input_shape=[11, 4096, 2],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            rtol=1.3e-6,
+            atol=1e-5,
+        )
+        # reduce (common) 2d
+        self._run_reduce_sum(
+            dim=-1,
+            input_shape=[1270, 1223],
+            keepdim=False,
+            input_type="float32",
+            output_type=None,
+            rtol=1.3e-6,
+            atol=1e-5,
+        )
+        self._run_reduce_sum(
+            dim=0,
+            input_shape=[1231, 1234],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            rtol=1.3e-6,
+            atol=1e-5,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_reshape.py b/tests/unittest/ops/test_reshape.py
index cfff48af1..a7252b635 100644
--- a/tests/unittest/ops/test_reshape.py
+++ b/tests/unittest/ops/test_reshape.py
@@ -21,21 +21,23 @@
 
 from aitemplate.frontend import IntImm, IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ReshapeTestCase(unittest.TestCase):
-    def _test_fp16(
+    def _test_reshape(
         self,
         batch_size=(1, 3),
         X_shape=(16, 32, 64),
         Y_shape=(-1, 16, 16, 128),
         test_name="reshape",
+        input_type="float16",
     ):
         target = detect_target()
         # N, H, W, C
         X = Tensor(
             shape=[IntVar(values=list(batch_size), name="input_batch"), *X_shape],
-            dtype="float16",
+            dtype=input_type,
             name="input_0",
             is_input=True,
         )
@@ -57,30 +59,31 @@ def _test_fp16(
         for b in batch_size:
             # C, H, W
             X_shape_pt = (X_shape[2], X_shape[0], X_shape[1])
-            X_pt = torch.randn(b, *X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(shape=(b, *X_shape_pt), dtype=input_type)
             OP_pt = torch.nn.AvgPool2d(kernel_size=3, stride=1, padding=1)
             Y1_pt = OP_pt(X_pt).permute([0, 2, 3, 1])
             Y2_pt = torch.reshape(Y1_pt, shape)  # reshape 1
             Y_pt = torch.reshape(Y2_pt, shape + [1])  # reshape 2
 
             x = X_pt.permute((0, 2, 3, 1)).contiguous()
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([x], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def _test_fp16_single_op(
+    def _test_reshape_single_op(
         self,
-        X_shape,
-        Y_shape,
+        X_shape=(16, 32, 64),
+        Y_shape=(-1, 16, 16, 128),
         test_name="reshape",
         check_name_retention=False,
+        input_type="float16",
     ):
         target = detect_target()
         X_shape = [dim if isinstance(dim, IntVar) else IntImm(dim) for dim in X_shape]
         Y_shape = [dim if isinstance(dim, IntVar) else IntImm(dim) for dim in Y_shape]
         X = Tensor(
             shape=X_shape,
-            dtype="float16",
+            dtype=input_type,
             name="input_0",
             is_input=True,
         )
@@ -106,9 +109,9 @@ def _test_fp16_single_op(
         ]
 
         for x_shape, y_shape in zip(x_shapes, y_shapes):
-            X_pt = torch.randn(x_shape).cuda().half()
+            X_pt = get_random_torch_tensor(x_shape, input_type)
             Y_pt = torch.reshape(X_pt, y_shape)
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         if check_name_retention:
@@ -118,51 +121,55 @@ def _test_fp16_single_op(
             )
 
     def test_reshape(self):
-        self._test_fp16(test_name="reshape0")
-        self._test_fp16([4, 2], (4, 8, 8), (-1,), "reshape1")
-        self._test_fp16([3, 1], (5, 4, 16), (-1, 8), "reshape2")
-        self._test_fp16_single_op(
+        self._test_reshape(test_name="reshape0")
+        self._test_reshape([4, 2], (4, 8, 8), (-1,), "reshape1")
+        self._test_reshape([3, 1], (5, 4, 16), (-1, 8), "reshape2")
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(1, 3), name="input_batch"), 16, 32, 64),
             Y_shape=(-1, 16, 16, 128),
             test_name="reshape3",
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(1, 16, 32, 64), Y_shape=[1, 64, 16, 32], test_name="reshape4"
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 0, 8),
             Y_shape=(0, 2, 4),
             test_name="reshape1",
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
             Y_shape=(5, 4, -1, 3, 2),
             test_name="reshape_name",
             check_name_retention=True,
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
             Y_shape=(5, 4, IntVar(values=(2, 4)), 3, -1),
             test_name="reshape_name_unknown_static_dim",
             check_name_retention=True,
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
             Y_shape=(5, IntVar(values=(2, 4)), 3, 4, 2),
             test_name="reshape_name_no_unknown_dims",
             check_name_retention=True,
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
             Y_shape=(IntVar(values=(10, 20)), 4, 2, 3, -1),
             test_name="reshape_squeeze_intvar_dim",
         )
-        self._test_fp16_single_op(
+        self._test_reshape_single_op(
             X_shape=(IntVar(values=(20, 40), name="input_batch"), 1, 12),
             Y_shape=(4, 2, IntVar(values=(2, 4)), 3, 5),
             test_name="reshape_unsqueeze_intvar_dim",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_reshape_float32(self):
+        self._test_reshape_single_op(input_type="float32", test_name="reshape_float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_roi_align.py b/tests/unittest/ops/test_roi_align.py
index 979897181..53c3911e0 100644
--- a/tests/unittest/ops/test_roi_align.py
+++ b/tests/unittest/ops/test_roi_align.py
@@ -21,6 +21,8 @@
 
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 try:
     import torchvision
@@ -40,7 +42,7 @@ def random_boxes(num_boxes, max_coord=100):
 
 @skipIfNoTorchVision
 class RoiAlignTestCase(unittest.TestCase):
-    def _create_tensors(self, num_rois, b, rand=False):
+    def _create_tensors(self, num_rois, b, rand=False, dtype="float16"):
         if rand:
             boxes = random_boxes(num_rois, 200)
             inds = np.arange(b)
@@ -48,25 +50,22 @@ def _create_tensors(self, num_rois, b, rand=False):
             rois = torch.cat(
                 (torch.tensor(batch_inds).reshape(b, -1, 1), boxes.reshape(b, -1, 4)), 2
             )
-            rois = rois.reshape(-1, 5).cuda().half()
+            rois = rois.reshape(-1, 5).cuda()
         else:
-            rois = (
-                torch.tensor(
-                    [
-                        [0, -2.0, -2.0, 22.0, 22.0],
-                        [0, 10.0, 10.0, 30.0, 30.0],
-                        [0, 1.0, 1.0, 10.0, 10.0],
-                        [1, -2.0, -2.0, 22.0, 22.0],
-                        [1, 10.0, 10.0, 30.0, 30.0],
-                        [1, 1.0, 1.0, 10.0, 10.0],
-                    ]
-                )
-                .cuda()
-                .half()
-            )
-        return rois
-
-    def _test_fp16_single_op(
+            rois = torch.tensor(
+                [
+                    [0, -2.0, -2.0, 22.0, 22.0],
+                    [0, 10.0, 10.0, 30.0, 30.0],
+                    [0, 1.0, 1.0, 10.0, 10.0],
+                    [1, -2.0, -2.0, 22.0, 22.0],
+                    [1, 10.0, 10.0, 30.0, 30.0],
+                    [1, 1.0, 1.0, 10.0, 10.0],
+                ]
+            ).cuda()
+        torch_dtype = string_to_torch_dtype(dtype)
+        return rois.to(dtype=torch_dtype)
+
+    def _test_single_op(
         self,
         HH,
         WW,
@@ -77,20 +76,21 @@ def _test_fp16_single_op(
         sampling_ratio=2,
         batch_size=(1, 1),
         rand=False,
-        test_name="roi_align",
+        test_name="roi_align_fp16",
+        dtype="float16",
     ):
         target = detect_target()
 
         X = Tensor(
             shape=[IntVar(values=list(batch_size), name="input_batch"), HH, WW, CC],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
 
         R = Tensor(
             shape=[IntVar(values=[num_rois, num_rois], name="roi_batch"), 5],
-            dtype="float16",
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -110,8 +110,8 @@ def _test_fp16_single_op(
         module = compile_model(Y, target, "./tmp", test_name)
 
         for b in batch_size:
-            X_pt = torch.randn(b, CC, WW, HH).cuda().half()
-            rois = self._create_tensors(num_rois, b, rand)
+            X_pt = get_random_torch_tensor([b, CC, WW, HH], dtype=dtype)
+            rois = self._create_tensors(num_rois, b, rand, dtype=dtype)
 
             if b == 1:
                 rois = rois[:num_rois, :]
@@ -121,16 +121,31 @@ def _test_fp16_single_op(
             )
             Y_pt = OP_pt(X_pt, rois)
             x = X_pt.permute((0, 2, 3, 1)).contiguous()
-            inputs = [x, rois]
-            y = torch.empty([num_rois, pooled_size, pooled_size, CC]).cuda().half()
-            module.run_with_tensors(inputs, [y])
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
+            module.run_with_tensors([x, rois], [y])
             y_transpose = y.permute((0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-1, rtol=1e-1))
 
-    def test_roi_align(self):
-        self._test_fp16_single_op(HH=56, WW=56, CC=256, test_name="roi_align1")
+    def test_roi_align_fp16(self):
+        self._test_single_op(
+            HH=56,
+            WW=56,
+            CC=256,
+            test_name="roi_align1",
+            dtype="float16",
+        )
         # self._test_fp16_single_op(HH=16, WW=16, CC=32, num_rois=6, batch_size=(2, 2), rand=True, test_name="roi_align2")
 
+    def test_roi_align_fp32(self):
+        self._test_single_op(
+            HH=56,
+            WW=56,
+            CC=256,
+            test_name="roi_align1",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_size_getitem_ops.py b/tests/unittest/ops/test_size_getitem_ops.py
index c224d09f5..fdb08f346 100644
--- a/tests/unittest/ops/test_size_getitem_ops.py
+++ b/tests/unittest/ops/test_size_getitem_ops.py
@@ -15,28 +15,34 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class SizeOpTestCase(unittest.TestCase):
+class SizeGetItemTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SizeGetItemTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _test_size_op(
         self,
         batch_size=(1, 3),
         X_shape=(16, 32, 64),
         Y_shape=(-1, 16, 16, 128),
         test_name="size_op",
+        dtype="float16",
     ):
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         X = Tensor(
             shape=[b_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -48,30 +54,66 @@ def _test_size_op(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             Y_pt = X_pt.reshape(b, -1, X_shape_pt[-1])
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def _test_size_op_2(
+    def test_size_op_fp16(self):
+        self._test_size_op(
+            test_name="size_op_fp16",
+            dtype="float16",
+        )
+        self._test_size_op(
+            [1],
+            (4, 8, 8),
+            (-1,),
+            test_name="size_op_fp16",
+            dtype="float16",
+        )
+        self._test_size_op(
+            [4, 2],
+            (4, 8, 8),
+            (-1,),
+            test_name="size_op_fp16",
+            dtype="float16",
+        )
+        self._test_size_op(
+            [3, 1],
+            (5, 4, 16),
+            (-1, 8),
+            test_name="size_op_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_size_op_fp32(self):
+        self._test_size_op(
+            test_name="size_op_fp32",
+            dtype="float32",
+        )
+
+    def _test_tensor_size_op(
         self,
         batch_size=(1, 3),
         X_shape=(16, 32, 64),
         Y_shape=(-1, 16, 16, 128),
         test_name="tensor_size_op",
         copy_op=False,
+        dtype="float16",
     ):
         target = detect_target()
         X1 = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -90,30 +132,41 @@ def _test_size_op_2(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         self.assertEqual(len(module.debug_sorted_graph), 6)
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             Y2_pt = X_pt * X_pt
             Y_pt = Y2_pt.reshape(2 * b, -1)
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_size_op(self):
-        self._test_size_op(test_name="size_op_0")
-        self._test_size_op([1], (4, 8, 8), (-1,), "size_op_static")
-        self._test_size_op([4, 2], (4, 8, 8), (-1,), "size_op_1")
-        self._test_size_op([3, 1], (5, 4, 16), (-1, 8), "size_op_2")
+    def test_tensor_size_op_fp16(self):
+        self._test_tensor_size_op(
+            test_name="tensor_size_op_fp16",
+            dtype="float16",
+        )
+        self._test_tensor_size_op(
+            copy_op=True,
+            test_name="tensor_size_op_fp16_copy_op",
+            dtype="float16",
+        )
 
-        self._test_size_op_2(test_name="size_op_3")
-        self._test_size_op_2(test_name="size_op_3_copy_op", copy_op=True)
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_tensor_size_op_fp32(self):
+        self._test_tensor_size_op(
+            test_name="tensor_size_op_fp32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_slice.py b/tests/unittest/ops/test_slice.py
index 82ee06d45..06e0d50c9 100644
--- a/tests/unittest/ops/test_slice.py
+++ b/tests/unittest/ops/test_slice.py
@@ -26,9 +26,18 @@
 from aitemplate.utils import shape_utils
 
 
-class SliceTestCase(unittest.TestCase):
+class DynamicSliceTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(DynamicSliceTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
     def _run_dynamic_slice(
-        self, *, input_shape, start_indices, end_indices, input_type="float16"
+        self,
+        *,
+        input_shape,
+        start_indices,
+        end_indices,
+        input_type="float16",
     ):
         logging.info(
             "Test with start_indices {}, end_indices {}".format(
@@ -58,11 +67,104 @@ def _run_dynamic_slice(
         logging.info("AITemplate output_0 shape: {}".format(y_shape))
         np.testing.assert_equal(y_shape, Y_pt.size())
 
-        module = compile_model(Y, target, "./tmp", "dynamic_slice")
+        module = compile_model(Y, target, "./tmp", f"dynamic_slice_{self.test_count}")
+
+        y_ait = torch.empty_like(Y_pt)
+        module.run_with_tensors([X_pt], [y_ait])
+        self.assertTrue(torch.allclose(Y_pt, y_ait, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def test_dynamic_slice(self):
+        self._run_dynamic_slice(
+            input_shape=[1],
+            start_indices=[0],
+            end_indices=[1],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2],
+            start_indices=[0],
+            end_indices=[-1],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3],
+            start_indices=[0, 0],
+            end_indices=[2, 2],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 5],
+            start_indices=[0, 0, 0],
+            end_indices=[2, 2, -1],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[1, 0, 1],
+            end_indices=[2, 2, 4],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 0, 4],
+            start_indices=[0, 1, 0],
+            end_indices=[1, 3, 4],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[0, 1, 0],
+            end_indices=[1, 3, 4],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[0, 0, 0],
+            end_indices=[1, 3, 4],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[0, 1, 0],
+            end_indices=[1, 3, -1],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[0, 1, 1],
+            end_indices=[-11, 3, 2],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[0, -3, -4],
+            end_indices=[9, -1, 2],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 4],
+            start_indices=[4, 0, 1],
+            end_indices=[1, 1, 2],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2048, 256, 64],
+            start_indices=[256, 32, 0],
+            end_indices=[1024, 193, 65],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 5],
+            start_indices=[None, 0, 0],
+            end_indices=[2, None, -1],
+        )
+        self._run_dynamic_slice(
+            input_shape=[2, 3],
+            start_indices=[IntVar([1, 1]), IntImm(1)],
+            end_indices=[IntVarTensor(IntImm(2)), IntVarTensor(IntImm(2))],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dynamic_slice_float32(self):
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 5],
+            start_indices=[None, 0, 0],
+            end_indices=[2, None, -1],
+            input_type="float32",
+        )
+
 
-        y = torch.empty(y_shape).cuda().half()
-        module.run_with_tensors([X_pt], [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+class DynamicSliceBatchedTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(DynamicSliceBatchedTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
 
     def _run_batch_dynamic_slice(
         self,
@@ -96,7 +198,9 @@ def _run_batch_dynamic_slice(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", "dynamic_slice_batched")
+        module = compile_model(
+            Y, target, "./tmp", f"dynamic_slice_batched_{self.test_count}"
+        )
 
         for batch in batch_sizes:
             logging.info("checking batch: {}".format(batch))
@@ -105,58 +209,10 @@ def _run_batch_dynamic_slice(
             X_pt = get_random_torch_tensor([batch, *input_shape], input_type)
             slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
             Y_pt = X_pt[slice_indices]
-            y_pt = Y_pt.cpu().numpy()
-
-            y = torch.empty(y_pt.shape).cuda().half()
-            module.run_with_tensors([X_pt], [y])
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
-
-    def test_dynamic_slice(self):
-        self._run_dynamic_slice(input_shape=[1], start_indices=[0], end_indices=[1])
-        self._run_dynamic_slice(input_shape=[2], start_indices=[0], end_indices=[-1])
-        self._run_dynamic_slice(
-            input_shape=[2, 3], start_indices=[0, 0], end_indices=[2, 2]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 5], start_indices=[0, 0, 0], end_indices=[2, 2, -1]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[1, 0, 1], end_indices=[2, 2, 4]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 0, 4], start_indices=[0, 1, 0], end_indices=[1, 3, 4]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[0, 1, 0], end_indices=[1, 3, 4]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[0, 0, 0], end_indices=[1, 3, 4]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[0, 1, 0], end_indices=[1, 3, -1]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[0, 1, 1], end_indices=[-11, 3, 2]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[0, -3, -4], end_indices=[9, -1, 2]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 4], start_indices=[4, 0, 1], end_indices=[1, 1, 2]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2048, 256, 64],
-            start_indices=[256, 32, 0],
-            end_indices=[1024, 193, 65],
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3, 5], start_indices=[None, 0, 0], end_indices=[2, None, -1]
-        )
-        self._run_dynamic_slice(
-            input_shape=[2, 3],
-            start_indices=[IntVar([1, 1]), IntImm(1)],
-            end_indices=[IntVarTensor(IntImm(2)), IntVarTensor(IntImm(2))],
-        )
+            y_ait = torch.empty_like(Y_pt)
+            module.run_with_tensors([X_pt], [y_ait])
+            self.assertTrue(torch.allclose(Y_pt, y_ait, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
 
     def test_batch_dynamic_slice(self):
         self._run_batch_dynamic_slice(
@@ -214,6 +270,16 @@ def test_batch_dynamic_slice(self):
             end_indices=[None, None, -1, 0],
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_batch_dynamic_slice_float32(self):
+        self._run_batch_dynamic_slice(
+            batch_sizes=[5, 3, 9],
+            input_shape=[2, 4, 3],
+            start_indices=[None, 1, None, -1],
+            end_indices=[None, None, -1, 0],
+            input_type="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_softmax.py b/tests/unittest/ops/test_softmax.py
index 70b6f8e44..fe29658eb 100644
--- a/tests/unittest/ops/test_softmax.py
+++ b/tests/unittest/ops/test_softmax.py
@@ -23,6 +23,7 @@
 from aitemplate.compiler.base import IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 class SoftmaxTestCase(unittest.TestCase):
@@ -34,7 +35,7 @@ def _test_softmax(
         dtype="float16",
         testname="softmax",
     ):
-
+        torch_dtype = string_to_torch_dtype(dtype)
         X = Tensor(
             shape=[IntVar(name="input_batch", values=list(batch_sizes)), *input_shapes],
             dtype=dtype,
@@ -49,25 +50,42 @@ def _test_softmax(
         module = compile_model(Y, target, "./tmp", testname)
 
         for batch_size in batch_sizes:
-            x_pt = torch.randn(batch_size, *input_shapes).cuda().half()
+            x_pt = torch.randn(batch_size, *input_shapes, dtype=torch_dtype).cuda()
             y_pt = torch.nn.functional.softmax(x_pt, dim=dim)
 
-            y = torch.empty([batch_size, *input_shapes]).cuda().half()
+            y = torch.empty([batch_size, *input_shapes], dtype=torch_dtype).cuda()
             module.run_with_tensors([x_pt], [y])
-            self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+            torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
 
     def test_softmax(self):
-        self._test_softmax()
-        self._test_softmax(dim=1)
-        self._test_softmax((1, 13), (7,))
-        self._test_softmax((10, 1025), (16,))
-        self._test_softmax((1, 17), (9, 8))
-        self._test_softmax((2, 64), (9, 1, 6))
-        self._test_softmax((1, 4096), (33,))
-        self._test_softmax((2, 21), (34,))
-        self._test_softmax((2, 17), (36,))
-        self._test_softmax((1, 64), (128,))
-        self._test_softmax((2, 31), (513,))
+        self._test_softmax(testname="softmax_0")
+        self._test_softmax(dim=1, testname="softmax_1")
+        self._test_softmax((1, 13), (7,), testname="softmax_2")
+        self._test_softmax((10, 1025), (16,), testname="softmax_3")
+        self._test_softmax((1, 17), (9, 8), testname="softmax_4")
+        self._test_softmax((2, 64), (9, 1, 6), testname="softmax_5")
+        self._test_softmax((1, 4096), (33,), testname="softmax_6")
+        self._test_softmax((2, 21), (34,), testname="softmax_7")
+        self._test_softmax((2, 17), (36,), testname="softmax_8")
+        self._test_softmax((1, 64), (128,), testname="softmax_9")
+        self._test_softmax((2, 31), (513,), testname="softmax_10")
+
+    def test_softmax_fp32(self):
+        self._test_softmax(dtype="float32", testname="softmax_fp32_0")
+        self._test_softmax(dim=1, dtype="float32", testname="softmax_fp32_1")
+        self._test_softmax((1, 13), (7,), dtype="float32", testname="softmax_fp32_2")
+        self._test_softmax(
+            (10, 1025), (16,), dtype="float32", testname="softmax_fp32_3"
+        )
+        self._test_softmax((1, 17), (9, 8), dtype="float32", testname="softmax_fp32_4")
+        self._test_softmax(
+            (2, 64), (9, 1, 6), dtype="float32", testname="softmax_fp32_5"
+        )
+        self._test_softmax((1, 4096), (33,), dtype="float32", testname="softmax_fp32_6")
+        self._test_softmax((2, 21), (34,), dtype="float32", testname="softmax_fp32_7")
+        self._test_softmax((2, 17), (36,), dtype="float32", testname="softmax_fp32_8")
+        self._test_softmax((1, 64), (128,), dtype="float32", testname="softmax_fp32_9")
+        self._test_softmax((2, 31), (513,), dtype="float32", testname="softmax_fp32_10")
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_split.py b/tests/unittest/ops/test_split.py
index cc96a043e..a6c828cd5 100644
--- a/tests/unittest/ops/test_split.py
+++ b/tests/unittest/ops/test_split.py
@@ -21,15 +21,25 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
 
 class SplitTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(SplitTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
 
     def _run_split(
-        self, *, input_shape, split_size_or_sections, dim=None, input_type="float16"
+        self,
+        *,
+        input_shape,
+        split_size_or_sections,
+        output_masks=None,
+        dim=None,
+        input_type="float16",
     ):
         logging.info(
             f"Test input shape {input_shape}, "
@@ -44,13 +54,27 @@ def _run_split(
             if dim is None
             else torch.split(X_pt, split_size_or_sections, dim)
         )
+        if output_masks is not None:
+            Ys_pt = [y_pt for idx, y_pt in enumerate(Ys_pt) if output_masks[idx]]
         target = detect_target()
         X = Tensor(shape=input_shape, dtype=input_type, name="input_0", is_input=True)
         Ys = (
-            split_op(X, split_size_or_sections)
+            split_op(
+                X,
+                split_size_or_sections,
+            )
             if dim is None
-            else split_op(X, split_size_or_sections, dim)
+            else split_op(
+                X,
+                split_size_or_sections,
+                dim,
+            )
         )
+        if output_masks is not None:
+            split_op.remove_output_at(
+                [idx for idx, mask in enumerate(output_masks) if not mask]
+            )
+            Ys = split_op._attrs["outputs"]
         np.testing.assert_equal(len(Ys_pt), len(Ys))
 
         y_shapes = []
@@ -61,10 +85,11 @@ def _run_split(
             logging.info(f"AITemplate output_{idx} shape: {y_shape}")
             y_shapes.append(y_shape)
 
-        module = compile_model(Ys, target, "./tmp", "split")
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Ys, target, "./tmp", "split", dll_name=dll_name)
 
         outputs = {
-            f"output_{idx}": torch.empty(y_shape).cuda().half()
+            f"output_{idx}": get_torch_empty_tensor(y_shape, input_type)
             for idx, y_shape in enumerate(y_shapes)
         }
         module.run_with_tensors([X_pt], outputs)
@@ -72,6 +97,7 @@ def _run_split(
         for idx, y_pt in enumerate(Ys_pt):
             y = outputs[f"output_{idx}"]
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
 
     def _run_batch_split(
         self,
@@ -106,7 +132,8 @@ def _run_batch_split(
             Y._attrs["name"] = f"output_{idx}"
             Y._attrs["is_output"] = True
 
-        module = compile_model(Ys, target, "./tmp", "split")
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Ys, target, "./tmp", "split", dll_name=dll_name)
 
         for batch in batch_sizes:
             logging.info(f"checking batch: {batch}")
@@ -123,7 +150,7 @@ def _run_batch_split(
 
             y_shapes = [Y_pt.size() for Y_pt in Ys_pt]
             outputs = {
-                f"output_{idx}": torch.empty(y_shape).cuda().half()
+                f"output_{idx}": get_torch_empty_tensor(y_shape, input_type)
                 for idx, y_shape in enumerate(y_shapes)
             }
             module.run_with_tensors(
@@ -134,6 +161,7 @@ def _run_batch_split(
             for idx, y_pt in enumerate(Ys_pt):
                 y = outputs[f"output_{idx}"]
                 self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+            self.test_count += 1
 
     def test_split(self):
         self._run_split(input_shape=[1], split_size_or_sections=1, dim=0)
@@ -157,6 +185,26 @@ def test_split(self):
         self._run_split(input_shape=[2, 0, 4], split_size_or_sections=2, dim=-1)
         self._run_split(input_shape=[2, 0, 7], split_size_or_sections=[2, 3, 2], dim=-1)
 
+    def test_split_with_mask(self):
+        self._run_split(
+            input_shape=[8, 6, 4],
+            split_size_or_sections=[2, 3, 3],
+            output_masks=[True, False, True],
+            dim=0,
+        )
+        self._run_split(
+            input_shape=[8, 6, 4],
+            split_size_or_sections=(5, 1),
+            output_masks=[True, False],
+            dim=1,
+        )
+        self._run_split(
+            input_shape=[8, 6, 4],
+            split_size_or_sections=(2, 2),
+            output_masks=[False, True],
+            dim=2,
+        )
+
     def test_batch_split(self):
         self._run_batch_split(
             batch_sizes=[1, 1], input_shape=[2, 1], split_size_or_sections=1, dim=1
@@ -187,6 +235,38 @@ def test_batch_split(self):
             dim=3,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_float(self):
+        self._run_split(
+            input_shape=[2, 3], split_size_or_sections=2, dim=1, input_type="float"
+        )
+        self._run_split(
+            input_shape=[4097, 128, 64],
+            split_size_or_sections=1024,
+            dim=0,
+            input_type="float",
+        )
+        self._run_split(
+            input_shape=[8, 6, 4],
+            split_size_or_sections=(2, 2),
+            dim=2,
+            input_type="float",
+        )
+        self._run_batch_split(
+            batch_sizes=[1, 1],
+            input_shape=[2, 1],
+            split_size_or_sections=1,
+            dim=1,
+            input_type="float",
+        )
+        self._run_batch_split(
+            batch_sizes=[3, 4],
+            input_shape=[2, 3, 4],
+            split_size_or_sections=2,
+            dim=2,
+            input_type="float",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_split_getitem.py b/tests/unittest/ops/test_split_getitem.py
index a229063a4..770f1e273 100644
--- a/tests/unittest/ops/test_split_getitem.py
+++ b/tests/unittest/ops/test_split_getitem.py
@@ -15,16 +15,21 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils import shape_utils
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class SplitGetItemTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SplitGetItemTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _test_split_getitem(
         self,
         batch_size=(1, 3),
@@ -33,15 +38,14 @@ def _test_split_getitem(
         split_dim=1,
         item_idx=0,
         test_name="split_getitem",
+        dtype="float16",
     ):
-        assert len(X_shape) == 2, "expected X_shape to be 2 but got {}".format(
-            len(X_shape)
-        )
+        assert len(X_shape) == 2, f"expected X_shape to be 2 but got {X_shape}"
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         X = Tensor(
             shape=[b_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -53,7 +57,12 @@ def _test_split_getitem(
         else:
             assert 0, f"expected split_dim to be either 1 or 2 but got {split_dim}"
 
-        W = Tensor(shape=[b_dim, N, K], dtype="float16", name="input_1", is_input=True)
+        W = Tensor(
+            shape=[b_dim, N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
 
         Y1 = ops.split()(X, split_sections, split_dim)
         Y2 = ops.getitem()(Y1, item_idx)
@@ -62,12 +71,13 @@ def _test_split_getitem(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
-            W_pt = torch.randn(b, N, K).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype=dtype)
             WT = torch.transpose(W_pt, 2, 1)
 
             Y1_pt = torch.split(X_pt, split_sections, split_dim)
@@ -77,15 +87,39 @@ def _test_split_getitem(
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_getitem(self):
-        self._test_split_getitem(test_name="split_getitem_0")
+    def test_split_getitem_fp16(self):
+        self._test_split_getitem(
+            test_name="split_getitem_fp16",
+            dtype="float16",
+        )
+        self._test_split_getitem(
+            batch_size=[5],
+            X_shape=(16, 32),
+            split_sections=[8, 20, 4],
+            split_dim=2,
+            item_idx=1,
+            test_name="split_getitem_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"fp32 BMM not supported in {detect_target()._arch}",
+    )
+    def test_split_getitem_fp32(self):
+        self._test_split_getitem(
+            test_name="split_getitem_fp32",
+            dtype="float32",
+        )
         self._test_split_getitem(
             batch_size=[5],
             X_shape=(16, 32),
             split_sections=[8, 20, 4],
             split_dim=2,
             item_idx=1,
-            test_name="split_getitem_1",
+            test_name="split_getitem_fp32",
+            dtype="float32",
         )
 
     def _test_split_getitem_output(
@@ -95,16 +129,15 @@ def _test_split_getitem_output(
         split_sections=(4, 8, 2, 2),
         split_dim=1,
         item_idx=0,
-        test_name="split_getitem",
+        test_name="split_getitem_output",
+        dtype="float16",
     ):
-        assert len(X_shape) == 2, "expected X_shape to be 2 but got {}".format(
-            len(X_shape)
-        )
+        assert len(X_shape) == 2, f"expected X_shape to be 2 but got {X_shape}"
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         X = Tensor(
             shape=[b_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -115,11 +148,12 @@ def _test_split_getitem_output(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
 
             Y1_pt = torch.split(X_pt, split_sections, split_dim)
             Y_pt = Y1_pt[item_idx]
@@ -127,15 +161,35 @@ def _test_split_getitem_output(
             module.run_with_tensors([X_pt], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_getitem_output(self):
-        self._test_split_getitem_output(test_name="split_getitem_output_0")
+    def test_split_getitem_output_fp16(self):
+        self._test_split_getitem_output(
+            test_name="split_getitem_output",
+            dtype="float16",
+        )
+        self._test_split_getitem_output(
+            batch_size=[10],
+            X_shape=(16, 31),
+            split_sections=[9, 19, 3],
+            split_dim=2,
+            item_idx=1,
+            test_name="split_getitem_output",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_getitem_output_fp32(self):
+        self._test_split_getitem_output(
+            test_name="split_getitem_output_fp32",
+            dtype="float32",
+        )
         self._test_split_getitem_output(
             batch_size=[10],
             X_shape=(16, 31),
             split_sections=[9, 19, 3],
             split_dim=2,
             item_idx=1,
-            test_name="split_getitem_output_1",
+            test_name="split_getitem_output_fp32",
+            dtype="float32",
         )
 
     def _test_split_multiple_getitems(
@@ -144,21 +198,18 @@ def _test_split_multiple_getitems(
         X_shape=(16, 32),
         split_sections=(4, 4, 6, 2),
         split_dim=1,
-        test_name="split_getitem",
+        test_name="split_multiple_getitems",
+        dtype="float16",
     ):
-        assert len(X_shape) == 2, "expected X_shape to be 2 but got {}".format(
-            len(X_shape)
-        )
+        assert len(X_shape) == 2, f"expected X_shape to be 2 but got {X_shape}"
         assert (
             len(split_sections) >= 2
-        ), "expected split_sections to have at least 2 values, but got {}".format(
-            split_sections
-        )
+        ), f"expected split_sections to have at least 2 values, but got {split_sections}"
         target = detect_target()
         b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
         X = Tensor(
             shape=[b_dim, *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -173,7 +224,7 @@ def _test_split_multiple_getitems(
         X2_shape[split_dim - 1] = split_sections[item_idx0]
         X2 = Tensor(
             shape=[b_dim, *X2_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_2",
             is_input=True,
         )
@@ -188,13 +239,14 @@ def _test_split_multiple_getitems(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             X2_shape_pt = (b, *X2_shape)
-            X2_pt = torch.randn(X2_shape_pt).cuda().half()
+            X2_pt = get_random_torch_tensor(X2_shape_pt, dtype=dtype)
 
             Y1_pt = torch.split(X_pt, split_sections, split_dim)
             Y2_pt = Y1_pt[item_idx0]
@@ -207,16 +259,36 @@ def _test_split_multiple_getitems(
             module.run_with_tensors({"input_0": X_pt, "input_2": X2_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_mutiple_getitems(self):
-        self._test_split_multiple_getitems(test_name="split_multiple_getitems_0")
+    def test_split_mutiple_getitems_fp16(self):
+        self._test_split_multiple_getitems(
+            test_name="split_multiple_getitems_fp16",
+            dtype="float16",
+        )
+        self._test_split_multiple_getitems(
+            batch_size=[10],
+            X_shape=(16, 31),
+            split_sections=[9, 9, 13],
+            split_dim=2,
+            test_name="split_multiple_getitems_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_mutiple_getitems_fp32(self):
+        self._test_split_multiple_getitems(
+            test_name="split_multiple_getitems_fp32",
+            dtype="float32",
+        )
         self._test_split_multiple_getitems(
             batch_size=[10],
             X_shape=(16, 31),
             split_sections=[9, 9, 13],
             split_dim=2,
-            test_name="split_multiple_getitems_1",
+            test_name="split_multiple_getitems_fp32",
+            dtype="float32",
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_squeeze.py b/tests/unittest/ops/test_squeeze.py
index d20d01bea..90a1c2a85 100644
--- a/tests/unittest/ops/test_squeeze.py
+++ b/tests/unittest/ops/test_squeeze.py
@@ -23,6 +23,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 def _construct_shape(
@@ -44,17 +45,19 @@ def _construct_shape(
 
 
 class SqueezeTestCase(unittest.TestCase):
-    def _test_helper(self, dim, shape, expected_shape, test_name, do_squeeze):
+    def _test_helper(
+        self, dim, shape, expected_shape, test_name, do_squeeze, input_type="float16"
+    ):
         target = detect_target()
 
         shape_vars, input_0_names = _construct_shape(shape, 0)
         expected_shape_vars, input_1_names = _construct_shape(expected_shape, 0)
 
         input_0 = Tensor(
-            shape=shape_vars, dtype="float16", name="input_0", is_input=True
+            shape=shape_vars, dtype=input_type, name="input_0", is_input=True
         )
         input_1 = Tensor(
-            shape=expected_shape_vars, dtype="float16", name="input_1", is_input=True
+            shape=expected_shape_vars, dtype=input_type, name="input_1", is_input=True
         )
 
         if do_squeeze:
@@ -76,8 +79,8 @@ def _test_helper(self, dim, shape, expected_shape, test_name, do_squeeze):
         all_input_1_shapes = itertools.product(*expected_shape)
 
         for input_0_shape, input_1_shape in zip(all_input_0_shapes, all_input_1_shapes):
-            input_0_pt = torch.randn(input_0_shape).cuda().half()
-            input_1_pt = torch.randn(input_1_shape).cuda().half()
+            input_0_pt = get_random_torch_tensor(input_0_shape, input_type)
+            input_1_pt = get_random_torch_tensor(input_1_shape, input_type)
             if do_squeeze:
                 # For some reason, torch.squeeze(X_pt, dim) fails when
                 # dim is None (even though the docs say dim is Optional[int])!
@@ -91,8 +94,9 @@ def _test_helper(self, dim, shape, expected_shape, test_name, do_squeeze):
             output_pt = torch.mul(Y_pt, input_1_pt)
             inputs = [input_0_pt, input_1_pt]
 
-            output = torch.empty(input_1_shape).cuda().half()
+            output = torch.empty_like(input_1_pt)
             module.run_with_tensors(inputs, [output])
+            # outputs from view ops must be exactly equal
             self.assertTrue(torch.equal(output, output_pt))
 
     def test_squeeze(self):
@@ -128,6 +132,28 @@ def test_unsqueeze(self):
             False,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_squeeze_float32(self):
+        self._test_helper(
+            2,
+            [[4, 2], [4], [1], [8]],
+            [[4, 2], [4], [8]],
+            "squeeze_float32",
+            True,
+            input_type="float32",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_unsqueeze_float32(self):
+        self._test_helper(
+            0,
+            [[4, 3], [1], [2], [1]],
+            [[1], [4, 3], [1], [2], [1]],
+            "unsqueeze_float32",
+            False,
+            input_type="float32",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_topk.py b/tests/unittest/ops/test_topk.py
index c963d1bd2..01f65c026 100644
--- a/tests/unittest/ops/test_topk.py
+++ b/tests/unittest/ops/test_topk.py
@@ -23,13 +23,18 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 class topkTestCase(unittest.TestCase):
-    def _create_tensors(self, shape):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _create_tensors(self, shape, dtype):
         N = np.prod(shape)
         scores = torch.randperm(N) / N
-        return scores.reshape(shape).cuda().half()
+        return scores.reshape(shape).cuda().to(dtype=string_to_torch_dtype(dtype))
 
     def _test_topk(
         self,
@@ -39,6 +44,7 @@ def _test_topk(
         topK=100,
         test_name="topk",
         copy_op=False,
+        dtype="float16",
     ):
 
         o_shape = list(shape)
@@ -46,7 +52,7 @@ def _test_topk(
 
         X1 = Tensor(
             shape=shape,
-            dtype="float16",
+            dtype=dtype,
             name="X",
             is_input=True,
         )
@@ -58,15 +64,16 @@ def _test_topk(
         X4._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(X4, target, "./tmp", test_name)
+        module = compile_model(X4, target, "./tmp", f"{test_name}_{self.test_count}")
 
-        scores = self._create_tensors(shape)
+        scores = self._create_tensors(shape, dtype)
         (values, y_pt) = torch.topk(scores, k=topK, dim=dim)
 
         x = scores.reshape(shape).contiguous()
         y = torch.empty(o_shape).cuda().to(torch.int64)
         module.run_with_tensors([x], [y])
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
 
     def test_topk_heap(self):
         self._test_topk(shape=(2000,), topK=100, test_name="topk_heap")
@@ -96,6 +103,41 @@ def test_topk_sort(self):
             copy_op=True,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCm.")
+    def test_float32(self):
+        self._test_topk(
+            shape=(4, 500),
+            topK=200,
+            dim=1,
+            test_name="topk_sort_copy_op_f32",
+            copy_op=True,
+            dtype="float32",
+        )
+        self._test_topk(
+            shape=(4, 500),
+            topK=100,
+            dim=1,
+            test_name="topk_heap_copy_op_f32",
+            copy_op=True,
+            dtype="float32",
+        )
+        self._test_topk(
+            shape=(4, 500),
+            topK=200,
+            dim=1,
+            test_name="topk_sort_f32",
+            copy_op=False,
+            dtype="float32",
+        )
+        self._test_topk(
+            shape=(4, 500),
+            topK=100,
+            dim=1,
+            test_name="topk_heap_f32",
+            copy_op=False,
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(1024)
diff --git a/tests/unittest/ops/test_transpose.py b/tests/unittest/ops/test_transpose.py
new file mode 100644
index 000000000..a480fe63b
--- /dev/null
+++ b/tests/unittest/ops/test_transpose.py
@@ -0,0 +1,98 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+from typing import Sequence
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class TransposeTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(TransposeTest, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_transpose_static_shape(
+        self,
+        input_shape: Sequence[int],
+        dim0: int,
+        dim1: int,
+        dtype: str = "float16",
+        test_name: str = "transpose_static_shape",
+    ) -> None:
+        X = Tensor(
+            shape=input_shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+        op = ops.transpose()
+        Y = op(X, dim0, dim1)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+
+        X_pt = get_random_torch_tensor(input_shape, dtype=dtype)
+        Y_pt = torch.transpose(X_pt, dim0, dim1).contiguous()
+
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors([X_pt], [y])
+
+        torch.testing.assert_close(y, Y_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        [
+            param((80, 300, 2), 1, 2),
+            param((80, 300, 2), 2, -2),
+            param((32, 12, 4096, 64), 2, 1),
+            param((128, 512), -1, -2),
+            param((128, 512), 0, 0),
+        ]
+    )
+    def test_transpose_static_shape_fp16(self, input_shape, dim0, dim1):
+        self._test_transpose_static_shape(
+            input_shape=input_shape,
+            dim0=dim0,
+            dim1=dim1,
+            test_name="test_transpose_static_shape_fp16",
+            dtype="float16",
+        )
+
+    @parameterized.expand(
+        [
+            param((80, 300, 2), 1, 2),
+        ]
+    )
+    def test_transpose_static_shape_fp32(self, input_shape, dim0, dim1):
+        self._test_transpose_static_shape(
+            input_shape=input_shape,
+            dim0=dim0,
+            dim1=dim1,
+            test_name="test_transpose_static_shape_fp32",
+            dtype="float32",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_transpose_conv2d.py b/tests/unittest/ops/test_transpose_conv2d.py
index e36878b21..2bb23d2bf 100644
--- a/tests/unittest/ops/test_transpose_conv2d.py
+++ b/tests/unittest/ops/test_transpose_conv2d.py
@@ -19,21 +19,33 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
-class conv2dTransposeTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=32, copy_op=False):
+@unittest.skipIf(
+    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
+    "Not supported by CUDA arch < 80.",
+)
+class Conv2dTransposeTestCase(unittest.TestCase):
+    def _test_transpose_conv2d(
+        self,
+        batch=32,
+        copy_op=False,
+        test_name="transpose_conv2d",
+        dtype="float16",
+    ):
         target = detect_target()
-        if target.name() == "cuda" and int(target._arch) < 80:
-            return
         X = Tensor(
             shape=[IntImm(batch), 28, 28, 256],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 2, 2, 256], dtype="float16", name="input_1", is_input=True
+            shape=[256, 2, 2, 256],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
         )
         OP = ops.transposed_conv2d(stride=2, pad=0, dilate=1)
         if copy_op:
@@ -41,22 +53,48 @@ def _test_fp16(self, batch=32, copy_op=False):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "transpose_conv2d")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 256, 28, 28).cuda().half()
-        W_pt = torch.randn(256, 256, 2, 2).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 256, 28, 28], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 256, 2, 2], dtype=dtype)
         Y_pt = torch.nn.functional.conv_transpose2d(X_pt, W_pt, padding=0, stride=2)
 
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
-        y = torch.empty([batch, 56, 56, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors({"input_0": x, "input_1": w}, [y])
         y_transpose = y.permute((0, 3, 1, 2))
-        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+        if dtype == "float32":
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        self._test_transpose_conv2d(
+            test_name="transpose_conv2d_fp16",
+            dtype="float16",
+        )
+        self._test_transpose_conv2d(
+            copy_op=True,
+            test_name="transpose_conv2d_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_transpose_conv2d(
+            test_name="transpose_conv2d_fp32",
+            dtype="float32",
+        )
+        self._test_transpose_conv2d(
+            copy_op=True,
+            test_name="transpose_conv2d_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_transpose_conv2d_bias.py b/tests/unittest/ops/test_transpose_conv2d_bias.py
index 5ab0b6f70..8cd97bde0 100644
--- a/tests/unittest/ops/test_transpose_conv2d_bias.py
+++ b/tests/unittest/ops/test_transpose_conv2d_bias.py
@@ -19,55 +19,106 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class conv2dTransposeTestCase(unittest.TestCase):
-    def _test_fp16(self, batch=4, copy_op=False):
+@unittest.skipIf(
+    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
+    "Not supported by CUDA arch < 80.",
+)
+class Conv2dTransposeBiasTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _test_transpose_conv2d_bias(
+        self,
+        batch=32,
+        c_in=256,
+        c_out=256,
+        copy_op=False,
+        test_name="transpose_conv2d_bias",
+        dtype="float16",
+    ):
         target = detect_target()
-        if int(target._arch) < 80:
-            return
         X = Tensor(
-            shape=[IntImm(batch), 14, 14, 256],
-            dtype="float16",
+            shape=[IntImm(batch), 14, 14, c_in],
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 2, 2, 256], dtype="float16", name="input_1", is_input=True
+            shape=[c_in, 2, 2, c_out],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[c_out],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.transposed_conv2d_bias(stride=2, pad=0, dilate=1)
         if copy_op:
             OP = ops.transposed_conv2d_bias(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "transpose_conv2d_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 256, 14, 14).cuda().half()
-        W_pt = torch.randn(256, 256, 2, 2).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, c_in, 14, 14], dtype=dtype)
+        W_pt = get_random_torch_tensor([c_in, c_out, 2, 2], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, c_out, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv_transpose2d(X_pt, W_pt, padding=0, stride=2)
         Y_pt = Y_pt + B_pt
 
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(
             {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}, [y]
         )
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=2e-1, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
     def test_fp16(self):
-        self._test_fp16()
-        self._test_fp16(copy_op=True)
+        for c_out in [192, 256]:
+            self._test_transpose_conv2d_bias(
+                c_out=c_out,
+                test_name="transpose_conv2d_bias_fp16",
+                dtype="float16",
+            )
+        self._test_transpose_conv2d_bias(
+            copy_op=True,
+            test_name="transpose_conv2d_bias_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_transpose_conv2d_bias(
+            test_name="transpose_conv2d_bias_fp32",
+            dtype="float32",
+        )
+        self._test_transpose_conv2d_bias(
+            copy_op=True,
+            test_name="transpose_conv2d_bias_fp32_copy_op",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_transpose_conv2d_bias_relu.py b/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
index d5489f0bd..95d7f6102 100644
--- a/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
+++ b/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
@@ -19,49 +19,98 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class conv2dTransposeTestCase(unittest.TestCase):
-    def test_fp16(self, batch=4):
+@unittest.skipIf(
+    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
+    "Not supported by CUDA arch < 80.",
+)
+class Conv2dTransposeBiasReluTestCase(unittest.TestCase):
+    def _test_transpose_conv2d_bias_relu(
+        self,
+        batch=32,
+        copy_op=False,
+        test_name="transpose_conv2d_bias_relu",
+        dtype="float16",
+    ):
         target = detect_target()
-        if int(target._arch) < 80:
-            return
         X = Tensor(
             shape=[IntImm(batch), 14, 14, 256],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
         W = Tensor(
-            shape=[256, 2, 2, 256], dtype="float16", name="input_1", is_input=True
+            shape=[256, 2, 2, 256],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[256],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[256], dtype="float16", name="input_2", is_input=True)
         OP = ops.transposed_conv2d_bias_relu(stride=2, pad=0, dilate=1)
+        if copy_op:
+            OP = ops.transposed_conv2d_bias_relu(**OP._get_op_attributes())
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "transpose_conv2d_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
-        X_pt = torch.randn(batch, 256, 14, 14).cuda().half()
-        W_pt = torch.randn(256, 256, 2, 2).cuda().half()
-        B_pt = torch.randn(1, 256, 1, 1).cuda().half()
+        X_pt = get_random_torch_tensor([batch, 256, 14, 14], dtype=dtype)
+        W_pt = get_random_torch_tensor([256, 256, 2, 2], dtype=dtype)
+        B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
         Y_pt = torch.nn.functional.conv_transpose2d(X_pt, W_pt, padding=0, stride=2)
         Y_pt = Y_pt + B_pt
         Y_pt = torch.relu(Y_pt)
 
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
-        y = torch.empty([batch, 28, 28, 256]).cuda().half()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
         module.run_with_tensors(
             {"input_0": x, "input_1": w, "input_2": B_pt.squeeze()}, [y]
         )
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            if dtype == "float32":
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
 
+    def test_fp16(self):
+        self._test_transpose_conv2d_bias_relu(
+            test_name="transpose_conv2d_bias_relu_fp16",
+            dtype="float16",
+        )
+        self._test_transpose_conv2d_bias_relu(
+            copy_op=True,
+            test_name="transpose_conv2d_bias_relu_fp16_copy_op",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_fp32(self):
+        self._test_transpose_conv2d_bias_relu(
+            test_name="transpose_conv2d_bias_relu_fp32",
+            dtype="float32",
+        )
+        self._test_transpose_conv2d_bias_relu(
+            copy_op=True,
+            test_name="transpose_conv2d_bias_relu_fp32_copy_op",
+            dtype="float32",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_tuple_list_construct.py b/tests/unittest/ops/test_tuple_list_construct.py
index a4a4b539a..406df8ff7 100644
--- a/tests/unittest/ops/test_tuple_list_construct.py
+++ b/tests/unittest/ops/test_tuple_list_construct.py
@@ -15,25 +15,27 @@
 import unittest
 
 import torch
-from aitemplate.compiler import compile_model, ops
 
+from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class TupleConstructTestCase(unittest.TestCase):
-    def _test_tuple_construct(
+class TupleListConstructTestCase(unittest.TestCase):
+    def _test_construct(
         self,
         batch_size=(1, 3),
         X_shape=(16, 32, 64),
         test_op=ops.tuple_construct,
-        test_name="tuple",
+        test_name="tuple_construct",
+        dtype="float16",
     ):
         target = detect_target()
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), *X_shape],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -57,15 +59,15 @@ def _test_tuple_construct(
 
         for b in batch_size:
             X_shape_pt = (b, *X_shape)
-            X_pt = torch.randn(X_shape_pt).cuda().half()
+            X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
             Y1_pt = X_pt.reshape(-1, X_shape_pt[-1])
             Y2_pt = X_pt.flatten()
             Y3_pt = Y2_pt.unsqueeze(1)
 
             outputs = [
-                torch.empty(Y1_pt.size()).cuda().half(),
-                torch.empty(Y2_pt.size()).cuda().half(),
-                torch.empty(Y3_pt.size()).cuda().half(),
+                torch.empty_like(Y1_pt),
+                torch.empty_like(Y2_pt),
+                torch.empty_like(Y3_pt),
             ]
             module.run_with_tensors([X_pt], outputs)
 
@@ -73,10 +75,19 @@ def _test_tuple_construct(
             self.assertTrue(torch.allclose(Y2_pt, outputs[1], atol=1e-2, rtol=1e-2))
             self.assertTrue(torch.allclose(Y3_pt, outputs[2], atol=1e-2, rtol=1e-2))
 
-    def test_tuple_construct(self):
-        self._test_tuple_construct(test_op=ops.tuple_construct, test_name="tuple_0")
-        self._test_tuple_construct(test_op=ops.list_construct, test_name="list_0")
+    def test_construct_fp16(self):
+        self._test_construct(
+            test_op=ops.tuple_construct,
+            test_name="construct_fp16_tuple",
+            dtype="float16",
+        )
+        self._test_construct(
+            test_op=ops.list_construct,
+            test_name="construct_fp16_list",
+            dtype="float16",
+        )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_upsamping2d.py b/tests/unittest/ops/test_upsamping2d.py
index 41cd32883..2c4e88660 100644
--- a/tests/unittest/ops/test_upsamping2d.py
+++ b/tests/unittest/ops/test_upsamping2d.py
@@ -19,24 +19,27 @@
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
 
 _DEFAULT_BATCH_SIZE = [1, 3]
 
 
 class UpsamplingTestCase(unittest.TestCase):
-    def _test_fp16_single_op(
+    def _test_single_op(
         self,
         scale_factor=2.0,
         mode="bilinear",
         batch_size=_DEFAULT_BATCH_SIZE,
-        test_name="bilinear_upsampling2d",
+        test_name="bilinear_upsampling2d_fp16",
+        dtype="float16",
     ):
         channels = 1024
         HH, WW = 8, 8
         target = detect_target()
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), HH, WW, channels],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -47,32 +50,51 @@ def _test_fp16_single_op(
         module = compile_model(Y, target, "./tmp", test_name)
 
         for b in batch_size:
-            X_pt = torch.randn(b, channels, HH, WW).cuda().half()
+            X_pt = get_random_torch_tensor([b, channels, HH, WW], dtype=dtype)
             Y_pt = torch.nn.functional.interpolate(
                 X_pt, scale_factor=scale_factor, mode=mode
             )
             x = torch.permute(X_pt, (0, 2, 3, 1)).contiguous()
-            y = (
-                torch.empty(
-                    [b, int(HH * scale_factor), int(WW * scale_factor), channels]
-                )
-                .cuda()
-                .half()
-            )
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
             module.run_with_tensors([x], [y])
             y_transpose = torch.permute(y, (0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_bilinear_upsample(self):
-        self._test_fp16_single_op(
-            scale_factor=3.5, mode="bilinear", test_name="bilinear_upsampling2d"
+    def test_bilinear_upsample_fp16(self):
+        self._test_single_op(
+            scale_factor=3.5,
+            mode="bilinear",
+            test_name="bilinear_upsampling2d_fp16",
+            dtype="float16",
+        )
+
+    def test_nearest_upsample_fp16(self):
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            test_name="nearest_upsampling2d_fp16",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_bilinear_upsample_fp32(self):
+        self._test_single_op(
+            scale_factor=3.5,
+            mode="bilinear",
+            test_name="bilinear_upsampling2d_fp32",
+            dtype="float32",
         )
 
-    def test_nearest_upsample(self):
-        self._test_fp16_single_op(
-            scale_factor=2.0, mode="nearest", test_name="nearest_upsampling2d"
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_nearest_upsample_fp32(self):
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            test_name="nearest_upsampling2d_fp32",
+            dtype="float32",
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_upsamping2d_add.py b/tests/unittest/ops/test_upsamping2d_add.py
index f4e23105f..51eabc454 100644
--- a/tests/unittest/ops/test_upsamping2d_add.py
+++ b/tests/unittest/ops/test_upsamping2d_add.py
@@ -19,25 +19,27 @@
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 _DEFAULT_BATCH_SIZE = [1, 16]
 
 
 class UpsamplingAddTestCase(unittest.TestCase):
-    def _test_fp16_single_op(
+    def _test_single_op(
         self,
         scale_factor=2.0,
         mode="bilinear",
         channels=1024,
         batch_size=_DEFAULT_BATCH_SIZE,
-        test_name="upsampling2d_add",
+        test_name="bilinear_upsampling2d_add_fp16",
+        dtype="float16",
     ):
         HH, WW = 32, 32
         target = detect_target()
         X = Tensor(
             shape=[IntVar(values=batch_size, name="input_batch"), HH, WW, channels],
-            dtype="float16",
+            dtype=dtype,
             name="input_0",
             is_input=True,
         )
@@ -49,7 +51,7 @@ def _test_fp16_single_op(
                 int(WW * scale_factor),
                 channels,
             ],
-            dtype="float16",
+            dtype=dtype,
             name="input_1",
             is_input=True,
         )
@@ -61,11 +63,15 @@ def _test_fp16_single_op(
         module = compile_model(Y, target, "./tmp", test_name)
 
         for b in batch_size:
-            X_pt = torch.randn(b, channels, HH, WW).cuda().half()
-            R_pt = (
-                torch.randn(b, channels, int(HH * scale_factor), int(WW * scale_factor))
-                .cuda()
-                .half()
+            X_pt = get_random_torch_tensor([b, channels, HH, WW], dtype=dtype)
+            R_pt = get_random_torch_tensor(
+                [
+                    b,
+                    channels,
+                    int(HH * scale_factor),
+                    int(WW * scale_factor),
+                ],
+                dtype=dtype,
             )
 
             Y_pt = (
@@ -77,33 +83,72 @@ def _test_fp16_single_op(
 
             x = torch.permute(X_pt, (0, 2, 3, 1)).contiguous()
             r = torch.permute(R_pt, (0, 2, 3, 1)).contiguous()
-            y = (
-                torch.empty(
-                    [b, int(HH * scale_factor), int(WW * scale_factor), channels]
-                )
-                .cuda()
-                .half()
-            )
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
             module.run_with_tensors({"input_0": x, "input_1": r}, [y])
             y_tranpose = torch.permute(y, (0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_tranpose, atol=1e-2, rtol=1e-2))
 
-    def test_bilinear_upsample_add(self):
-        self._test_fp16_single_op(
-            scale_factor=2.0, test_name="bilinear_upsampling2d_add"
+    def test_bilinear_upsample_add_fp16(self):
+        self._test_single_op(
+            scale_factor=2.0,
+            test_name="bilinear_upsampling2d_add_fp16",
+            dtype="float16",
+        )
+
+    def test_nearest_upsample_add_fp16(self):
+        self._test_single_op(
+            scale_factor=3.0,
+            mode="nearest",
+            test_name="nearest_upsampling2d_add_fp16_1",
+            dtype="float16",
+        )
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            channels=514,
+            test_name="nearest_upsampling2d_add_fp16_2",
+            dtype="float16",
+        )
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            channels=231,
+            test_name="nearest_upsampling2d_add_fp16_3",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_bilinear_upsample_add_fp32(self):
+        self._test_single_op(
+            scale_factor=2.0,
+            test_name="bilinear_upsampling2d_add_fp32",
+            dtype="float32",
         )
 
-    def test_nearest_upsample_add(self):
-        self._test_fp16_single_op(
-            scale_factor=3.0, mode="nearest", test_name="nearest_add1"
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_nearest_upsample_add_fp32(self):
+        self._test_single_op(
+            scale_factor=3.0,
+            mode="nearest",
+            test_name="nearest_upsampling2d_add_fp32_1",
+            dtype="float32",
         )
-        self._test_fp16_single_op(
-            scale_factor=2.0, mode="nearest", channels=514, test_name="nearest_add2"
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            channels=514,
+            test_name="nearest_upsampling2d_add_fp32_2",
+            dtype="float32",
         )
-        self._test_fp16_single_op(
-            scale_factor=2.0, mode="nearest", channels=231, test_name="nearest_add3"
+        self._test_single_op(
+            scale_factor=2.0,
+            mode="nearest",
+            channels=231,
+            test_name="nearest_upsampling2d_add_fp32_3",
+            dtype="float32",
         )
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
index 1d7d7d72f..293bde6ad 100644
--- a/tests/unittest/ops/test_vanilla_attention.py
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -15,6 +15,7 @@
 """
 Unittests for vanilla_attenion.
 """
+import logging
 import math
 import os
 import unittest
@@ -26,10 +27,13 @@
 from aitemplate.frontend import nn, Tensor
 from aitemplate.frontend.nn.vanilla_attention import vanilla_attention
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger, shape_utils
+from aitemplate.utils import shape_utils
 from einops import rearrange
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def mark_output(y):
     if type(y) is not tuple:
         y = (y,)
@@ -199,9 +203,7 @@ def _test_vanilla_attention(
                 [y],
                 count=100,
             )
-            logger.info(
-                __file__, "benchmark vanilla-attn time: {0}".format(time_per_iter_ms)
-            )
+            _LOGGER.info("benchmark vanilla-attn time: {0}".format(time_per_iter_ms))
 
         self.assertTrue(torch.allclose(y_pt.half(), y, atol=1e-1, rtol=1e-1))
 
@@ -317,9 +319,7 @@ def _test_mha(
                     ys,
                     count=100,
                 )
-                logger.info(
-                    __file__, "benchmark cross-attn time: {0}".format(time_per_iter_ms)
-                )
+                _LOGGER.info("benchmark cross-attn time: {0}".format(time_per_iter_ms))
 
     def test_cross_attn(self):
         self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
diff --git a/tests/unittest/ops/test_var.py b/tests/unittest/ops/test_var.py
index edf92b362..9524c4b6a 100644
--- a/tests/unittest/ops/test_var.py
+++ b/tests/unittest/ops/test_var.py
@@ -21,13 +21,15 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import dtype_to_torch_dtype, get_random_torch_tensor
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class VarTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(VarTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
 
     def _run_var(
         self,
@@ -39,6 +41,8 @@ def _run_var(
         input_type="float16",
         output_type=None,
         copy_op=False,
+        atol=1e-2,
+        rtol=1e-2,
     ):
         torch.manual_seed(0)
         logging.info(
@@ -61,17 +65,23 @@ def _run_var(
         logging.info("AITemplate output_shape: {}".format(y_shape))
         logging.info("AITemplate output_type: {}".format(y_dtype))
 
-        test_name = "var"
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"var_{self.test_count}")
         X_pt = get_random_torch_tensor(input_shape, input_type)
-        Y_pt = torch.var(X_pt, dim=dim, unbiased=unbiased, keepdim=keepdim)
+        if output_type is None:
+            torch_dtype = None
+        else:
+            torch_dtype = string_to_torch_dtype(output_type)
+        Y_pt = torch.var(
+            X_pt.to(dtype=torch_dtype), dim=dim, unbiased=unbiased, keepdim=keepdim
+        )
 
-        y = torch.empty(y_shape).cuda().half()
+        y = torch.empty_like(Y_pt)
         module.run_with_tensors([X_pt], [y])
 
         np.testing.assert_equal(y_shape, Y_pt.size())
-        np.testing.assert_equal(dtype_to_torch_dtype(y_dtype), Y_pt.dtype)
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2, equal_nan=True))
+        np.testing.assert_equal(string_to_torch_dtype(y_dtype), Y_pt.dtype)
+        self.assertTrue(torch.allclose(Y_pt, y, atol=atol, rtol=rtol, equal_nan=True))
+        self.test_count += 1
 
     def test_var(self):
         self._run_var(dim=-1, unbiased=True, input_shape=[1, 1], keepdim=False)
@@ -128,11 +138,12 @@ def _run_batched_var(
             X_pt = get_random_torch_tensor(input_shape, input_type)
             Y_pt = torch.var(X_pt, dim=dim, unbiased=unbiased, keepdim=keepdim)
 
-            y = torch.empty(Y_pt.size()).cuda().half()
+            y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
 
-            np.testing.assert_equal(dtype_to_torch_dtype(y_dtype), Y_pt.dtype)
+            np.testing.assert_equal(string_to_torch_dtype(y_dtype), Y_pt.dtype)
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
 
     def test_batched_var(self):
         self._run_batched_var(dim=0, unbiased=False, keepdim=True)
@@ -140,6 +151,66 @@ def test_batched_var(self):
         self._run_batched_var(dim=1, unbiased=False, keepdim=True)
         self._run_batched_var(dim=2, unbiased=True, keepdim=False)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_float32(self):
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[2, 8],
+            keepdim=False,
+            input_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[2, 8],
+            keepdim=False,
+            input_type="float16",
+            output_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[3, 2, 2050],
+            keepdim=False,
+            input_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[3, 2, 2050],
+            keepdim=False,
+            input_type="float16",
+            output_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+        self._run_var(
+            dim=1,
+            unbiased=True,
+            input_shape=[1025, 2047],
+            keepdim=True,
+            input_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+        self._run_var(
+            dim=1,
+            unbiased=True,
+            input_shape=[1025, 2047],
+            keepdim=True,
+            input_type="float16",
+            output_type="float32",
+            atol=1.3e-6,
+            rtol=1e-5,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/util/test_debug_utils.py b/tests/unittest/util/test_debug_utils.py
index c0c4e63c1..a8e31c6b5 100644
--- a/tests/unittest/util/test_debug_utils.py
+++ b/tests/unittest/util/test_debug_utils.py
@@ -25,6 +25,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.utils.debug_settings import AITDebugSettings
 
 
 def _test_inf_and_nan(
@@ -43,8 +44,9 @@ def _test_inf_and_nan(
     X2._attrs["check_nan_and_inf"] = check_tensor
 
     target = detect_target()
+    debug_settings = AITDebugSettings(check_all_nan_and_inf=check_all)
     module = compile_model(
-        X2, target, "./tmp", test_name, check_all_nan_and_inf=check_all
+        X2, target, "./tmp", test_name, debug_settings=debug_settings
     )
 
     x1_pt = torch.Tensor([[1.0, -2.0, 0.0]]).cuda().half()
@@ -78,7 +80,10 @@ def _test_outputs(
     X2._attrs["check_outputs"] = check_tensor
 
     target = detect_target()
-    module = compile_model(X2, target, "./tmp", test_name, check_all_outputs=check_all)
+    debug_settings = AITDebugSettings(check_all_outputs=check_all)
+    module = compile_model(
+        X2, target, "./tmp", test_name, debug_settings=debug_settings
+    )
 
     x1_pt = torch.Tensor([[1.0, 1.5, 2.0]]).cuda().half()
     x2 = torch.empty_like(x1_pt)
@@ -86,9 +91,10 @@ def _test_outputs(
 
     out, _ = capfd.readouterr()
     output_str = "Tensor (output0) output:"
-    assert out.find(output_str) != -1
+    idx = out.find(output_str)
+    assert idx != -1
 
-    out = out[len(output_str) :].strip()
+    out = out[idx + len(output_str) :].strip()
     values = out.split(", ")
     assert len(values) == 3, f"Got {len(values)} outputs, expected 3"
 
@@ -121,7 +127,10 @@ def _test_special_outputs(
     X2._attrs["check_outputs"] = check_tensor
 
     target = detect_target()
-    module = compile_model(X2, target, "./tmp", test_name, check_all_outputs=check_all)
+    debug_settings = AITDebugSettings(check_all_outputs=check_all)
+    module = compile_model(
+        X2, target, "./tmp", test_name, debug_settings=debug_settings
+    )
 
     x1_pt = torch.Tensor([[1.0, -2.0, 0.0]]).cuda().half()
     x2 = torch.empty_like(x1_pt)
diff --git a/tests/unittest/util/test_serdes.py b/tests/unittest/util/test_serdes.py
index c800e4711..617bb599b 100644
--- a/tests/unittest/util/test_serdes.py
+++ b/tests/unittest/util/test_serdes.py
@@ -15,6 +15,7 @@
 """
 Unittests for special activation Operator.
 """
+import logging
 import unittest
 
 import torch
@@ -24,7 +25,6 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils import logger
 from aitemplate.utils.serialization.serdes_code import (
     dump_program,
     get_inputs_from_graph,
@@ -32,6 +32,9 @@
 )
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class SerDesTestCase(unittest.TestCase):
     def test_get_inputs(self):
         X1 = Tensor(
@@ -209,7 +212,7 @@ def test_reshape(self):
     def test_group_gemm_rcr(self):
         target = detect_target()
         if int(target._arch) < 80:
-            logger.warning(__file__, "Group Gemm need SM80 HW")
+            _LOGGER.warning("Group Gemm need SM80 HW")
             return
 
         M = 256
@@ -288,3 +291,8 @@ def test_dynamic_slice(self):
             y = torch.empty(y_pt.shape).cuda().half()
             module.run_with_tensors([X_pt], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 745e17238db099f964754376ace8e5e5c47c691d Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Sun, 29 Jan 2023 19:17:28 -0800
Subject: [PATCH 022/638] fbshipit-source-id:
 1d8e20873714b390309c90b01098d2afea2734f0


From 35bd5a47ab057527c294c8d08d2b151e1475e60b Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Mon, 30 Jan 2023 19:27:10 +0800
Subject: [PATCH 023/638] remove useless code

---
 python/aitemplate/backend/profiler_runner.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index ec3c4e8e8..e0cb2f103 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -97,9 +97,6 @@ def process_task(task: Task) -> None:
             task._failed = True
             return
         cmd = task._cmd
-        if Target.current().name() == "rocm":
-            cmd = " ".join(cmd)
-        cmd = task._cmd
         if Target.current().name() == "rocm":
             cmd = " ".join(cmd)
         logger.debug(

From b5d9c94dc269842390b4efdb31c1753875eedc0c Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Tue, 31 Jan 2023 03:20:33 +0800
Subject: [PATCH 024/638] fix a bug

---
 python/aitemplate/compiler/ops/conv/conv2d.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 2e49edb52..9019c8b3e 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -465,12 +465,12 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
                 "To bypass, you need to make it available in the db table.",
             )
         if target.name() == "rocm":
+            runner = backend.profiler_runner.Runner(
+                devices, self._attrs["name"], timeout=180
+            )
             op_type = self._attrs["op"]
             all_op_names = list(self._attrs["op_instance"].keys())
             for op_name in all_op_names:
-                runner = backend.profiler_runner.Runner(
-                    devices, self._attrs["name"], timeout=180
-                )
                 x_shape = self._invert_exec_key(exec_key)
                 command = self._gen_profile_cmd(profiler_prefix, op_name, x_shape)
                 runner.push(op_name, command)

From 7e346b286aa662dc5482d0478379e1b635e70dad Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Mon, 30 Jan 2023 12:09:50 -0800
Subject: [PATCH 025/638] Back out "Revert D42767227: Multisect successfully
 blamed D42767227 for test or build failures" (#155)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/155

as titled
Fixed the unit test failure issue. Now local test is good.

Reviewed By: wushirong

Differential Revision: D42826998

fbshipit-source-id: 61082707766c97b726f3c2630ea067536c7e3d0e
---
 .../aitemplate/compiler/transform/fuse_ops.py | 23 ++++-
 tests/unittest/compiler/test_fuse_ops.py      | 90 +++++++++++++++++++
 2 files changed, 111 insertions(+), 2 deletions(-)
 create mode 100644 tests/unittest/compiler/test_fuse_ops.py

diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index 05e7b8b0c..ba9c9d8bd 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -24,10 +24,13 @@
 from aitemplate.compiler.transform.toposort import toposort
 
 from ..base import Tensor
-from ..ops.common import fused_elementwise
+from ..ops.common import elementwise, fused_elementwise
 from ..ops.common.epilogue import FuncEnum
+from ..ops.groupnorm.groupnorm import group_norm
+from ..ops.groupnorm.groupnorm_swish import group_norm_swish
 from ..ops.layernorm import layernorm_sigmoid_mul
 from . import transform_utils
+from .fuse_utils import transform_simple_fusion_patterns
 
 # pylint: disable=C0103,W0612
 
@@ -376,10 +379,26 @@ def _fuse_layernorm_sigmoid_mul(sorted_graph: List[Tensor]) -> List[Tensor]:
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
 
+def _fuse_groupnorm_sigmoid_mul(sorted_graph: List[Tensor]) -> List[Tensor]:
+    fusion_patterns = [
+        (
+            (
+                group_norm(num_groups=2, num_channels=4),
+                elementwise(FuncEnum.SIGMOID),
+                elementwise(FuncEnum.MUL),
+            ),
+            group_norm_swish,
+        )
+    ]
+    graph = transform_simple_fusion_patterns(sorted_graph, fusion_patterns)
+    return graph
+
+
 def fuse_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
     funcs = [
         _fuse_layernorm_sigmoid_mul,
-        _fuse_elementwise,
+        _fuse_groupnorm_sigmoid_mul,
+        _fuse_elementwise,  # this pass should be left in the last one
     ]
     for func in funcs:
         sorted_graph = func(sorted_graph)
diff --git a/tests/unittest/compiler/test_fuse_ops.py b/tests/unittest/compiler/test_fuse_ops.py
new file mode 100644
index 000000000..329c78b1a
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_ops.py
@@ -0,0 +1,90 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+class TestFuseGroupnormSwish(unittest.TestCase):
+    def test_fused(self):
+        x_shape = [3, 3, 1, 4]
+        num_groups = 2
+        num_channels = x_shape[-1]
+        dtype = "float16"
+        eps = 1e-5
+
+        X1 = Tensor(
+            shape=x_shape,
+            dtype=dtype,
+            name="X",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[num_channels],
+            dtype=dtype,
+            name="gamma",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[num_channels],
+            dtype=dtype,
+            name="beta",
+            is_input=True,
+        )
+
+        op_name = "group_norm"
+        OP = getattr(ops, op_name)(num_groups, num_channels)
+
+        X4 = OP(X1, X2, X3, eps)
+        X5 = ops.elementwise(FuncEnum.SIGMOID)(X4)
+        X6 = ops.elementwise(FuncEnum.MUL)(X5)
+        X6._attrs["is_output"] = True
+        X6._attrs["name"] = "output"
+
+        target = detect_target()
+        dll_name = "test_0.so"
+        module = compile_model(X6, target, "./tmp", op_name, dll_name=dll_name)
+
+        x1_nhwc_pt = get_random_torch_tensor(x_shape, dtype)
+        x1_nchw_pt = x1_nhwc_pt.permute(0, 3, 1, 2).contiguous()
+        gamma_pt = get_random_torch_tensor((num_channels,), dtype)
+        beta_pt = torch.randn_like(gamma_pt)
+
+        x6_pt = torch.nn.functional.group_norm(
+            x1_nchw_pt, num_groups, gamma_pt, beta_pt, eps=eps
+        )
+
+        x6_pt = torch.nn.SiLU()(x6_pt)
+
+        inputs = {"X": x1_nhwc_pt}
+        inputs["gamma"] = gamma_pt
+        inputs["beta"] = beta_pt
+        x6 = torch.empty_like(x1_nhwc_pt)
+        module.run_with_tensors(inputs, [x6])
+
+        torch.testing.assert_close(
+            x6, x6_pt.permute(0, 2, 3, 1).contiguous(), atol=1e-2, rtol=1e-2
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From ebd2f23789858920f6fdd8336c95d9c45b300729 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 30 Jan 2023 12:15:01 -0800
Subject: [PATCH 026/638] Fix aitemplate setuptools and stable_diffusion
 example run (#152)

Summary:
ATT, several fixes.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/152

Reviewed By: chenyang78

Differential Revision: D42838942

Pulled By: ipiszy

fbshipit-source-id: b865e4eb74fe240a0bb8cb20d08ffd1685c3de67
---
 .../05_stable_diffusion/scripts/compile.py    |  4 +++
 examples/05_stable_diffusion/scripts/demo.py  |  6 +++++
 .../scripts/demo_img2img.py                   |  5 ++++
 python/aitemplate/utils/__init__.py           |  1 +
 python/aitemplate/utils/import_path.py        | 27 +++++++++++++++++++
 python/setup.py                               |  8 ++++--
 6 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 python/aitemplate/utils/import_path.py

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 8c7a5be98..4a38d3bc4 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -19,9 +19,13 @@
 import torch
 
 from aitemplate.testing import detect_target
+from aitemplate.utils.import_path import import_parent
 
 from diffusers import StableDiffusionPipeline
 
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
 from src.compile_lib.compile_clip import compile_clip
 from src.compile_lib.compile_unet import compile_unet
 from src.compile_lib.compile_vae import compile_vae
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index 77d58cde2..d4f5dbb99 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -12,11 +12,17 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
 import click
 import torch
 
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.utils.import_path import import_parent
 from diffusers import EulerDiscreteScheduler
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
 from src.pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
 
 
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index 46c53cfd9..e4d96d865 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -19,7 +19,12 @@
 import torch
 
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.utils.import_path import import_parent
 from PIL import Image
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
 from src.pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
 
 
diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index 44c1a6b98..6f57327ed 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -18,6 +18,7 @@
 from . import (
     alignment,
     graph_utils,
+    import_path,
     markdown_table,
     misc,
     shape_utils,
diff --git a/python/aitemplate/utils/import_path.py b/python/aitemplate/utils/import_path.py
new file mode 100644
index 000000000..caaccd9f2
--- /dev/null
+++ b/python/aitemplate/utils/import_path.py
@@ -0,0 +1,27 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import sys
+from pathlib import Path
+
+
+def import_parent(filepath: str, level: int) -> None:
+    r_filepath = Path(filepath).resolve()
+    parent, top = r_filepath.parent, r_filepath.parents[level]
+
+    sys.path.append(str(top))
+    try:
+        sys.path.remove(str(parent))
+    except ValueError:  # Already removed
+        pass
diff --git a/python/setup.py b/python/setup.py
index df01212e3..53eaa8063 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -79,7 +79,11 @@ def gen_cutlass_list():
         "aitemplate/3rdparty/cutlass/examples",
         "aitemplate/3rdparty/cutlass/tools/util/include",
     ]
-    f_cond = lambda x: True if x.endswith(".h") or x.endswith(".cuh") else False
+    f_cond = (
+        lambda x: True
+        if x.endswith(".h") or x.endswith(".cuh") or x.endswith(".hpp")
+        else False
+    )
     return gen_file_list(srcs, f_cond)
 
 
@@ -128,7 +132,7 @@ def gen_utils_file_list():
 
 
 def gen_backend_common_file_list():
-    srcs = ["aitemplate/backend/common"]
+    srcs = ["aitemplate/backend"]
     f_cond = lambda x: True if x.endswith(".py") or x.endswith(".cuh") else False
     return gen_file_list(srcs, f_cond)
 

From e16047450f91c40e9066e7f91403b328ef97f2a0 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 30 Jan 2023 20:42:27 -0800
Subject: [PATCH 027/638] fbshipit-source-id:
 28c84b50b0a6699b88bfd075f7fdcff94f1be102

---
 .circleci/config.yml                          |  86 +-
 .flake8                                       | 106 +--
 .github/workflows/docs.yaml                   |  39 +
 .github/workflows/{docs.yml => pages.yaml}    |   4 +-
 .github/workflows/{lint.yml => pylint.yaml}   |   6 +-
 examples/05_stable_diffusion/benchmark.py     | 302 -------
 examples/05_stable_diffusion/benchmark_pt.py  |  47 --
 examples/05_stable_diffusion/compile.py       | 379 ---------
 examples/05_stable_diffusion/demo.py          |  54 --
 examples/05_stable_diffusion/demo_img2img.py  |  69 --
 .../05_stable_diffusion/modeling/attention.py | 105 ---
 examples/05_stable_diffusion/modeling/clip.py | 588 --------------
 .../modeling/embeddings.py                    | 101 ---
 .../05_stable_diffusion/modeling/resnet.py    | 238 ------
 .../modeling/unet_2d_condition.py             | 256 ------
 .../modeling/unet_blocks.py                   | 761 ------------------
 examples/05_stable_diffusion/modeling/vae.py  | 152 ----
 .../pipeline_stable_diffusion_ait.py          | 410 ----------
 .../pipeline_stable_diffusion_img2img_ait.py  | 402 ---------
 fx2ait/fx2ait/TARGETS                         |  41 -
 fx2ait/fx2ait/csrc/TARGETS                    |  29 -
 fx2ait/fx2ait/test/TARGETS                    |  78 --
 .../cuda/gemm_universal/gemm_rcr_bias_add.py  |  99 ---
 .../gemm_universal/gemm_rcr_bias_add_add.py   |  99 ---
 .../gemm_rcr_bias_add_add_relu.py             |  99 ---
 .../gemm_universal/gemm_rcr_bias_add_relu.py  |  99 ---
 .../cuda/gemm_universal/gemm_rcr_bias_mul.py  |  99 ---
 .../gemm_universal/gemm_rcr_bias_mul_add.py   |  99 ---
 .../gemm_universal/gemm_rcr_bias_mul_tanh.py  |  99 ---
 .../gemm_rcr_bias_sigmoid_mul.py              |  99 ---
 .../gemm_rcr_bias_sigmoid_mul_tanh.py         |  99 ---
 .../compiler/transform/fuse_permute_bmm.py    | 224 ------
 python/aitemplate/utils/logger.py             |  58 --
 33 files changed, 159 insertions(+), 5267 deletions(-)
 create mode 100644 .github/workflows/docs.yaml
 rename .github/workflows/{docs.yml => pages.yaml} (96%)
 rename .github/workflows/{lint.yml => pylint.yaml} (91%)
 delete mode 100644 examples/05_stable_diffusion/benchmark.py
 delete mode 100644 examples/05_stable_diffusion/benchmark_pt.py
 delete mode 100644 examples/05_stable_diffusion/compile.py
 delete mode 100644 examples/05_stable_diffusion/demo.py
 delete mode 100644 examples/05_stable_diffusion/demo_img2img.py
 delete mode 100644 examples/05_stable_diffusion/modeling/attention.py
 delete mode 100644 examples/05_stable_diffusion/modeling/clip.py
 delete mode 100644 examples/05_stable_diffusion/modeling/embeddings.py
 delete mode 100644 examples/05_stable_diffusion/modeling/resnet.py
 delete mode 100644 examples/05_stable_diffusion/modeling/unet_2d_condition.py
 delete mode 100644 examples/05_stable_diffusion/modeling/unet_blocks.py
 delete mode 100644 examples/05_stable_diffusion/modeling/vae.py
 delete mode 100644 examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
 delete mode 100644 examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
 delete mode 100644 fx2ait/fx2ait/TARGETS
 delete mode 100644 fx2ait/fx2ait/csrc/TARGETS
 delete mode 100644 fx2ait/fx2ait/test/TARGETS
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
 delete mode 100644 python/aitemplate/compiler/transform/fuse_permute_bmm.py
 delete mode 100644 python/aitemplate/utils/logger.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 19c2d377a..c86323420 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,25 +18,45 @@ setup_env: &setup_env
   - run:
       name: Setup environment
       command: |
-        python3.8 --version
-        python3.8 -m pip install --upgrade pip
-        cd python
-        python3.8 setup.py bdist_wheel
-        sudo python3.8 -m pip install --no-input dist/*.whl
-        cd ..
-        python3.8 -m pip install pytest
-        python3.8 -m pip install torch
-        python3.8 -m pip install numpy
-        python3.8 -m pip install jinja2
-        python3.8 -m pip install recordtype
-        python3.8 -m pip install parameterized
-        python3.8 -m pip install einops
-        git submodule sync
-        git submodule update --init
-        echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV
-        echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV
-        echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV
-        echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV
+        for i in {1..3}; do
+          python3.8 --version &&
+          python3.8 -m pip install --upgrade pip &&
+          cd /home/circleci/project/python &&
+          python3.8 setup.py bdist_wheel &&
+          sudo python3.8 -m pip install --no-input dist/*.whl &&
+          cd /home/circleci/project &&
+          python3.8 -m pip install pytest &&
+          python3.8 -m pip install torch &&
+          python3.8 -m pip install numpy &&
+          python3.8 -m pip install jinja2 &&
+          python3.8 -m pip install recordtype &&
+          python3.8 -m pip install parameterized &&
+          python3.8 -m pip install einops &&
+          git submodule sync &&
+          git submodule update --init &&
+          echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV &&
+          echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV &&
+          echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV &&
+          echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV &&
+          break || sleep 5;
+        done
+
+
+setup_fx2ait_env: &setup_fx2ait_env
+  - run:
+      name: Setup fx2ait environment
+      command: |
+        for i in {1..3}; do
+          wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
+          tar -xvf cudnn-*-archive.tar.xz
+          sudo cp cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include
+          sudo cp -P cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64
+          sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
+          python3.8 -m pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+          python3.8 fx2ait/setup.py install --prefix=/home/circleci/
+          echo 'export PYTHONPATH=$PWD/fx2ait:$PYTHONPATH' >> $BASH_ENV
+          break || sleep 5;
+        done
 
 basic_tests: &basic_tests
   - run:
@@ -44,19 +64,38 @@ basic_tests: &basic_tests
       command: |
         set -e
         TEST_FILES=$(circleci tests glob "tests/unittest/**/test_*.py" | grep -v benchmark | circleci tests split --split-by=timings)
-        mkdir test-results
-        python3.8 -m pytest $TEST_FILES --junitxml=test-results/junit.xml --verbose --continue-on-collection-errors -rA
+        mkdir ~/test-results
+        python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-results/junit.xml --verbose --continue-on-collection-errors -rA
 
+fx2ait_tests: &fx2ait_tests
+  - run:
+      name: Run fx2ait tests
+      command: |
+        source $BASH_ENV
+        mkdir -p ~/test-fx2ait-results
+        TEST_FILES=$(circleci tests glob "fx2ait/fx2ait/test/test_*.py" "fx2ait/fx2ait/test/converters/**/test_*.py")
+        python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-fx2ait-results/junit.xml --verbose --continue-on-collection-errors -rA
 
 # Define a job to be invoked later in a workflow.
 # See: https://circleci.com/docs/2.0/configuration-reference/#jobs
 jobs:
+  fx2ait-test:
+    machine:
+      image: ubuntu-2004-cuda-11.4:202110-01
+      resource_class: gpu.nvidia.medium
+    steps:
+      - checkout
+      - <<: *setup_env
+      - <<: *setup_fx2ait_env
+      - <<: *fx2ait_tests
+      - store_test_results:
+          path: ~/test-fx2ait-results
+
   build-and-test:
     machine:
       image: ubuntu-2004-cuda-11.4:202110-01
       # Check T101565170 for multi-gpu use cases.
       resource_class: gpu.nvidia.medium
-
     parallelism: 10
 
     # Checkout the code as the first step. This is a dedicated CircleCI step.
@@ -69,7 +108,7 @@ jobs:
       - <<: *setup_env
       - <<: *basic_tests
       - store_test_results:
-          path: test-results
+          path: ~/test-results
 
 # Invoke jobs via workflows
 # See: https://circleci.com/docs/2.0/configuration-reference/#workflows
@@ -77,4 +116,5 @@ workflows:
   unittest: # This is the name of the workflow, feel free to change it to better match your workflow.
     # Inside the workflow, you define the jobs you want to run.
     jobs:
+      - fx2ait-test
       - build-and-test
diff --git a/.flake8 b/.flake8
index 71a5883ed..9ef66bc0d 100644
--- a/.flake8
+++ b/.flake8
@@ -7,111 +7,111 @@ ignore =
   # Found in https://github.com/psf/black/issues/429
   # Line too long.
   B950,
-  # Indentation is not a multiple of four. 
-  E111, 
+  # Indentation is not a multiple of four.
+  E111,
   # Expected an indented block (comment).
-  E115, 
+  E115,
   # Over-indented.
   E117,
-  # Continuation line under-indented for hanging indent. 
+  # Continuation line under-indented for hanging indent.
   E121,
-  # Continuation line missing indentation or outdented. 
+  # Continuation line missing indentation or outdented.
   E122,
-  # Closing bracket does not match indentation of opening bracket's line. 
+  # Closing bracket does not match indentation of opening bracket's line.
   E123,
-  # Closing bracket does not match visual indentation. 
+  # Closing bracket does not match visual indentation.
   E124,
-  # Continuation line with same indent as next logical line. 
+  # Continuation line with same indent as next logical line.
   E125,
-  # Continuation line over-indented for hanging indent. 
+  # Continuation line over-indented for hanging indent.
   E126,
-  # Continuation line over-indented for visual indent. 
+  # Continuation line over-indented for visual indent.
   E127,
-  # Continuation line under-indented for visual indent. 
+  # Continuation line under-indented for visual indent.
   E128,
-  # Visually indented line with same indent as next logical line. 
+  # Visually indented line with same indent as next logical line.
   E129,
-  # Continuation line unaligned for hanging indent. 
+  # Continuation line unaligned for hanging indent.
   E131,
-  # Whitespace after '('. 
+  # Whitespace after '('.
   E201,
-  # Whitespace before ')'. 
+  # Whitespace before ')'.
   E202,
-  # Whitespace before ':'. 
+  # Whitespace before ':'.
   E203,
-  # Multiple spaces before operator. 
+  # Multiple spaces before operator.
   E221,
-  # Multiple spaces after operator. 
+  # Multiple spaces after operator.
   E222,
-  # Missing whitespace around operator. 
+  # Missing whitespace around operator.
   E225,
-  # Missing whitespace around arithmetic operator. 
+  # Missing whitespace around arithmetic operator.
   E226,
-  # Missing whitespace around bitwise or shift operator. 
+  # Missing whitespace around bitwise or shift operator.
   E227,
-  # Missing whitespace after ',', ';', or ':'. 
+  # Missing whitespace after ',', ';', or ':'.
   E231,
-  # Multiple spaces after ','. 
+  # Multiple spaces after ','.
   E241,
-  # Unexpected spaces around keyword / parameter equals. 
+  # Unexpected spaces around keyword / parameter equals.
   E251,
-  # Missing whitespace around parameter equals. 
+  # Missing whitespace around parameter equals.
   E252,
-  # At least two spaces before inline comment. 
-  E261, 
+  # At least two spaces before inline comment.
+  E261,
   # Inline comment should start with '# '.
-  E262, 
+  E262,
   # Block comment should start with '# '.
   E265,
-  # Multiple spaces after keyword. 
+  # Multiple spaces after keyword.
   E271,
-  # Multiple spaces before keyword. 
+  # Multiple spaces before keyword.
   E272,
-  # Expected 1 blank line, found 0. 
+  # Expected 1 blank line, found 0.
   E301,
-  # Expected 2 blank lines, found 0. 
+  # Expected 2 blank lines, found 0.
   E302,
-  # Too many blank lines (3). 
+  # Too many blank lines (3).
   E303,
-  # Expected 2 blank lines after end of function or class. 
+  # Expected 2 blank lines after end of function or class.
   E305,
-  # Expected 1 blank line before a nested definition. 
+  # Expected 1 blank line before a nested definition.
   E306,
-  # Line too long (82 > 79 characters). 
+  # Line too long (82 > 79 characters).
   E501,
-  # The backslash is redundant between brackets. 
+  # The backslash is redundant between brackets.
   E502,
-  # Multiple statements on one line (colon). 
+  # Multiple statements on one line (colon).
   E701,
-  # Multiple statements on one line (semicolon). 
+  # Multiple statements on one line (semicolon).
   E702,
-  # Statement ends with a semicolon. 
+  # Statement ends with a semicolon.
   E703,
-  # Multiple statements on one line (def). 
+  # Multiple statements on one line (def).
   E704,
-  # Trailing whitespace. 
+  # Trailing whitespace.
   W291,
-  # No newline at end of file. 
+  # No newline at end of file.
   W292,
-  # Blank line contains whitespace. 
+  # Blank line contains whitespace.
   W293,
-  # Blank line at end of file. 
+  # Blank line at end of file.
   W391,
-  # Line break occurred after a binary operator. 
-  W504, 
+  # Line break occurred after a binary operator.
+  W504,
 
   # Too opinionated.
   # Block comment should start with '# '.
   E265,
-  # Too many leading '#' for block comment. 
+  # Too many leading '#' for block comment.
   E266,
-  # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) 
-  E402, 
+  # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports)
+  E402,
   # Do not use bare except, specify exception instead. (Duplicate of B001)
-  E722, 
+  E722,
   # (Duplicate of B003)
-  P207, 
+  P207,
   # (Duplicate of C403)
   P208,
   # Line break occurred before a binary operator.
-  W503  
+  W503
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
new file mode 100644
index 000000000..6c1bd8ba9
--- /dev/null
+++ b/.github/workflows/docs.yaml
@@ -0,0 +1,39 @@
+name: Docs
+
+on:
+  push:
+    branches:
+      - main
+
+  pull_request:
+    branches:
+      - main
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install autodocsumm
+        pip install sphinx_rtd_theme
+        pip install sphinx_gallery
+        pip install sphinxcontrib-inlinesyntaxhighlight
+        pip install sphinx_toolbox
+        cd python
+        python setup.py develop
+        cd ..
+        pip install numpy
+    - name: Build documents with Sphinx
+      run: |
+        cd docs
+        make html
+        cd ..
diff --git a/.github/workflows/docs.yml b/.github/workflows/pages.yaml
similarity index 96%
rename from .github/workflows/docs.yml
rename to .github/workflows/pages.yaml
index 208bd1f77..d9074b8b3 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/pages.yaml
@@ -1,5 +1,5 @@
 # Simple workflow for deploying static content to GitHub Pages
-name: Documentation
+name: Deploy docs to Pages
 
 on:
   # Runs on pushes targeting the default branch
@@ -29,7 +29,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.8"]
     steps:
       - name: Checkout
         uses: actions/checkout@v3
diff --git a/.github/workflows/lint.yml b/.github/workflows/pylint.yaml
similarity index 91%
rename from .github/workflows/lint.yml
rename to .github/workflows/pylint.yaml
index dbd4beb83..be97139fa 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/pylint.yaml
@@ -23,9 +23,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ufmt
-        pip install click
-        pip install flake8
+        pip install ufmt==2.0.1 click==8.1.3 black==22.12.0 flake8==5.0.4
     - name: Analyzing the code with flake8
       run: |
         echo "::add-matcher::tests/lint/flake8_problem_matcher.json"
@@ -38,4 +36,4 @@ jobs:
     - name: Check Meta copyright header
       run: |
         python tests/lint/check_meta_header.py --path=./tests --fixit=False
-        python tests/lint/check_meta_header.py --path=./python --fixit=False
\ No newline at end of file
+        python tests/lint/check_meta_header.py --path=./python --fixit=False
diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
deleted file mode 100644
index 6f0e3f695..000000000
--- a/examples/05_stable_diffusion/benchmark.py
+++ /dev/null
@@ -1,302 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import logging
-
-import click
-
-import numpy as np
-import torch
-from aitemplate.compiler import Model
-from aitemplate.testing import detect_target
-from aitemplate.testing.benchmark_pt import benchmark_torch_function
-from diffusers import StableDiffusionPipeline
-
-from torch import autocast
-from transformers import CLIPTokenizer
-
-USE_CUDA = detect_target().name() == "cuda"
-
-access_token = True
-pipe = None
-
-
-def get_int_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def mark_output(y):
-    if type(y) is not tuple:
-        y = (y,)
-    for i in range(len(y)):
-        y[i]._attrs["is_output"] = True
-        y[i]._attrs["name"] = "output_%d" % (i)
-        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
-        print("AIT output_{} shape: {}".format(i, y_shape))
-
-
-def benchmark_unet(
-    batch_size=2,
-    hh=64,
-    ww=64,
-    dim=320,
-    hidden_size=1024,
-    benchmark_pt=False,
-    verify=False,
-):
-
-    exe_module = Model("./tmp/UNet2DConditionModel/test.so")
-    if exe_module is None:
-        print("Error!! Cannot find compiled module for UNet2DConditionModel.")
-        exit(-1)
-
-    # run PT unet model
-    pt_mod = pipe.unet
-    pt_mod = pt_mod.eval()
-
-    latent_model_input_pt = torch.randn(batch_size, 4, hh, ww).cuda().half()
-    text_embeddings_pt = torch.randn(batch_size, 64, hidden_size).cuda().half()
-    timesteps_pt = torch.Tensor([1, 1]).cuda().half()
-
-    with autocast("cuda"):
-        pt_ys = pt_mod(
-            latent_model_input_pt,
-            timesteps_pt,
-            encoder_hidden_states=text_embeddings_pt,
-        ).sample
-
-        # PT benchmark
-        if benchmark_pt:
-            args = (latent_model_input_pt, 1, text_embeddings_pt)
-            pt_time = benchmark_torch_function(100, pt_mod, *args)
-            print(f"PT batch_size: {batch_size}, {pt_time} ms")
-            with open("sd_pt_benchmark.txt", "a") as f:
-                f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n")
-
-    print("pt output:", pt_ys.shape)
-
-    # run AIT unet model
-    inputs = {
-        "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(),
-        "input1": timesteps_pt,
-        "input2": text_embeddings_pt,
-    }
-
-    ys = []
-    num_ouputs = len(exe_module.get_output_name_to_index_map())
-    for i in range(num_ouputs):
-        shape = exe_module.get_output_maximum_shape(i)
-        ys.append(torch.empty(shape).cuda().half())
-    exe_module.run_with_tensors(inputs, ys)
-
-    # verification
-    y_transpose = ys[0].permute((0, 3, 1, 2))
-
-    if verify:
-        eps = 1e-1
-        np.testing.assert_allclose(
-            pt_ys.detach().cpu().numpy(),
-            y_transpose.cpu().numpy(),
-            atol=eps,
-            rtol=eps,
-        )
-        print("UNet2DCondition verification pass")
-
-    # AIT benchmark
-    # warmup
-    exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
-    # benchmark
-    t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
-    with open("sd_ait_benchmark.txt", "a") as f:
-        f.write(f"unet batch_size: {batch_size}, latency: {t} ms\n")
-
-
-def benchmark_clip(
-    batch_size=1,
-    seqlen=64,
-    benchmark_pt=False,
-    verify=False,
-):
-    mask_seq = 0
-    version = "openai/clip-vit-large-patch14"
-
-    exe_module = Model("./tmp/CLIPTextModel/test.so")
-    if exe_module is None:
-        print("Error!! Cannot find compiled module for CLIPTextModel.")
-        exit(-1)
-
-    # run PT clip
-    pt_mod = pipe.text_encoder
-    pt_mod = pt_mod.eval()
-
-    tokenizer = CLIPTokenizer.from_pretrained(version)
-    text_input = tokenizer(
-        ["a photo of an astronaut riding a horse on mars"],
-        padding="max_length",
-        max_length=seqlen,
-        truncation=True,
-        return_tensors="pt",
-    )
-    input_ids = text_input["input_ids"].cuda()
-
-    attention_mask = torch.ones((batch_size, seqlen))
-    attention_mask[-1, -mask_seq:] = 0
-    attention_mask = None
-
-    position_ids = torch.arange(seqlen).expand((batch_size, -1)).cuda()
-    pt_ys = pt_mod(input_ids, attention_mask, position_ids)
-    print("pt output:", pt_ys[0].shape)
-
-    # PT benchmark
-    if benchmark_pt:
-        args = (input_ids, attention_mask, position_ids)
-        pt_time = benchmark_torch_function(100, pt_mod, *args)
-        print(f"PT batch_size: {batch_size}, {pt_time} ms")
-        with open("sd_pt_benchmark.txt", "a") as f:
-            f.write(f"clip batch_size: {batch_size}, latency: {pt_time} ms\n")
-
-    # run AIT clip
-    inputs = {
-        "input0": input_ids,
-        "input1": position_ids,
-    }
-    ys = []
-    num_ouputs = len(exe_module.get_output_name_to_index_map())
-    for i in range(num_ouputs):
-        shape = exe_module.get_output_maximum_shape(i)
-        ys.append(torch.empty(shape).cuda().half())
-    exe_module.run_with_tensors(inputs, ys)
-
-    # verification
-    if verify:
-        eps = 1e-1
-        pt_np = pt_ys[0].detach().cpu().numpy()
-        np.testing.assert_allclose(
-            pt_np,
-            ys[0].cpu().numpy(),
-            atol=eps,
-            rtol=eps,
-        )
-        print("CLIPTextTransformer verification pass")
-
-    # AIT benchmark
-    # warmup
-    exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
-    # benchmark
-    t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4)
-    with open("sd_ait_benchmark.txt", "a") as f:
-        f.write(f"clip batch_size: {batch_size}, latency: {t} ms\n")
-
-
-def benchmark_vae(batch_size=1, height=64, width=64, benchmark_pt=False, verify=False):
-
-    latent_channels = 4
-
-    exe_module = Model("./tmp/AutoencoderKL/test.so")
-    if exe_module is None:
-        print("Error!! Cannot find compiled module for AutoencoderKL.")
-        exit(-1)
-
-    # run PT vae
-    pt_vae = pipe.vae
-    pt_vae = pt_vae.cuda().half()
-    pt_vae.eval()
-
-    pt_input = torch.rand([batch_size, latent_channels, height, width]).cuda().half()
-    print("pt_input shape", pt_input.shape)
-    with autocast("cuda"):
-        pt_output = pt_vae.decode(pt_input).sample
-        pt_output = pt_output.half()
-
-        # PT benchmark
-        if benchmark_pt:
-            args = (pt_input,)
-            pt_time = benchmark_torch_function(100, pt_vae.decode, *args)
-            print(f"PT batch_size: {batch_size}, {pt_time} ms")
-            with open("sd_pt_benchmark.txt", "a") as f:
-                f.write(f"vae batch_size: {batch_size}, latency: {pt_time} ms\n")
-
-    # run AIT vae
-    y = (
-        torch.empty(
-            pt_output.size(0),
-            pt_output.size(2),
-            pt_output.size(3),
-            pt_output.size(1),
-        )
-        .cuda()
-        .half()
-    )
-    ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous()
-    print("input pt tensor size: ", ait_input_pt_tensor.shape)
-    print("output pt tensor size: ", y.shape)
-    exe_module.run_with_tensors([ait_input_pt_tensor], [y])
-
-    # verification
-    if verify:
-        y_pt = torch.permute(y, (0, 3, 1, 2))
-        eps = 1e-1
-        np.testing.assert_allclose(
-            pt_output.detach().cpu().numpy(),
-            y_pt.cpu().numpy(),
-            atol=eps,
-            rtol=eps,
-        )
-        logging.info("VAE Verification done!")
-
-    # AIT benchmark:
-    # warmup
-    exe_module.benchmark_with_tensors([ait_input_pt_tensor], [y], count=100, repeat=4)
-    # benchmark
-    t, _, _ = exe_module.benchmark_with_tensors(
-        [ait_input_pt_tensor], [y], count=100, repeat=4
-    )
-    with open("sd_ait_benchmark.txt", "a") as f:
-        f.write(f"vae batch_size: {batch_size}, latency: {t} ms\n")
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option("--batch-size", default=1, help="batch size")
-@click.option("--verify", type=bool, default=False, help="verify correctness")
-@click.option("--benchmark-pt", type=bool, default=False, help="run pt benchmark")
-def benchmark_diffusers(token, batch_size, verify, benchmark_pt):
-    assert batch_size == 1, "batch size must be 1 for submodule verification"
-    logging.getLogger().setLevel(logging.INFO)
-    np.random.seed(0)
-    torch.manual_seed(4896)
-
-    global access_token, pipe
-    if token != "":
-        access_token = token
-
-    pipe = StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
-        revision="fp16",
-        torch_dtype=torch.float16,
-        use_auth_token=access_token,
-    ).to("cuda")
-
-    # CLIP
-    benchmark_clip(batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify)
-    # UNet
-    benchmark_unet(batch_size=batch_size * 2, benchmark_pt=benchmark_pt, verify=verify)
-    # VAE
-    benchmark_vae(batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify)
-
-
-if __name__ == "__main__":
-    benchmark_diffusers()
diff --git a/examples/05_stable_diffusion/benchmark_pt.py b/examples/05_stable_diffusion/benchmark_pt.py
deleted file mode 100644
index aa9af8596..000000000
--- a/examples/05_stable_diffusion/benchmark_pt.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import click
-import torch
-
-from aitemplate.testing.benchmark_pt import benchmark_torch_function
-from diffusers import StableDiffusionPipeline
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
-@click.option(
-    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
-)
-def run(token, prompt, benchmark):
-    pipe = StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
-        revision="fp16",
-        torch_dtype=torch.float16,
-        use_auth_token=token,
-    ).to("cuda")
-
-    with torch.autocast("cuda"):
-        image = pipe(prompt).images[0]
-        if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt)
-            print(f"sd pt e2e: {t} ms")
-
-    image.save("example_pt.png")
-
-
-if __name__ == "__main__":
-    run()
diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py
deleted file mode 100644
index f9f5224df..000000000
--- a/examples/05_stable_diffusion/compile.py
+++ /dev/null
@@ -1,379 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import logging
-from collections import OrderedDict
-
-import click
-import numpy as np
-
-import torch
-
-from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
-from aitemplate.testing import detect_target
-from diffusers import StableDiffusionPipeline
-
-from modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
-
-from modeling.unet_2d_condition import UNet2DConditionModel as ait_UNet2DConditionModel
-
-from modeling.vae import AutoencoderKL as ait_AutoencoderKL
-
-
-USE_CUDA = detect_target().name() == "cuda"
-
-access_token = True
-pipe = None
-
-
-def mark_output(y):
-    if type(y) is not tuple:
-        y = (y,)
-    for i in range(len(y)):
-        y[i]._attrs["is_output"] = True
-        y[i]._attrs["name"] = "output_%d" % (i)
-        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
-        print("AIT output_{} shape: {}".format(i, y_shape))
-
-
-def map_unet_params(pt_mod, dim):
-    pt_params = dict(pt_mod.named_parameters())
-    params_ait = {}
-    for key, arr in pt_params.items():
-        if len(arr.shape) == 4:
-            arr = arr.permute((0, 2, 3, 1)).contiguous()
-        elif key.endswith("ff.net.0.proj.weight"):
-            w1, w2 = arr.chunk(2, dim=0)
-            params_ait[key.replace(".", "_")] = w1
-            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
-            continue
-        elif key.endswith("ff.net.0.proj.bias"):
-            w1, w2 = arr.chunk(2, dim=0)
-            params_ait[key.replace(".", "_")] = w1
-            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
-            continue
-        params_ait[key.replace(".", "_")] = arr
-
-    params_ait["arange"] = (
-        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
-    )
-    return params_ait
-
-
-def map_vae_params(ait_module, pt_module, batch_size, seq_len):
-    pt_params = dict(pt_module.named_parameters())
-    mapped_pt_params = OrderedDict()
-    for name, _ in ait_module.named_parameters():
-        ait_name = name.replace(".", "_")
-        if name in pt_params:
-            if (
-                "conv" in name
-                and "norm" not in name
-                and name.endswith(".weight")
-                and len(pt_params[name].shape) == 4
-            ):
-                mapped_pt_params[ait_name] = torch.permute(
-                    pt_params[name], [0, 2, 3, 1]
-                ).contiguous()
-            else:
-                mapped_pt_params[ait_name] = pt_params[name]
-        elif name.endswith("attention.qkv.weight"):
-            prefix = name[: -len("attention.qkv.weight")]
-            q_weight = pt_params[prefix + "query.weight"]
-            k_weight = pt_params[prefix + "key.weight"]
-            v_weight = pt_params[prefix + "value.weight"]
-            qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
-            mapped_pt_params[ait_name] = qkv_weight
-        elif name.endswith("attention.qkv.bias"):
-            prefix = name[: -len("attention.qkv.bias")]
-            q_bias = pt_params[prefix + "query.bias"]
-            k_bias = pt_params[prefix + "key.bias"]
-            v_bias = pt_params[prefix + "value.bias"]
-            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
-            mapped_pt_params[ait_name] = qkv_bias
-        elif name.endswith("attention.proj.weight"):
-            prefix = name[: -len("attention.proj.weight")]
-            pt_name = prefix + "proj_attn.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj.bias"):
-            prefix = name[: -len("attention.proj.bias")]
-            pt_name = prefix + "proj_attn.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.cu_length"):
-            cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32")
-            mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda()
-        else:
-            pt_param = pt_module.get_parameter(name)
-            mapped_pt_params[ait_name] = pt_param
-
-    return mapped_pt_params
-
-
-def map_clip_params(pt_mod, batch_size, seqlen, depth):
-
-    params_pt = list(pt_mod.named_parameters())
-
-    params_ait = {}
-    pt_params = {}
-    for key, arr in params_pt:
-        pt_params[key.replace("text_model.", "")] = arr
-
-    pt_params = dict(pt_mod.named_parameters())
-    for key, arr in pt_params.items():
-        name = key.replace("text_model.", "")
-        ait_name = name.replace(".", "_")
-        if name.endswith("out_proj.weight"):
-            ait_name = ait_name.replace("out_proj", "proj")
-        elif name.endswith("out_proj.bias"):
-            ait_name = ait_name.replace("out_proj", "proj")
-        elif name.endswith("q_proj.weight"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.weight")]
-            q = pt_params[prefix + "q_proj.weight"]
-            k = pt_params[prefix + "k_proj.weight"]
-            v = pt_params[prefix + "v_proj.weight"]
-            qkv_weight = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_weight
-            continue
-        elif name.endswith("q_proj.bias"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.bias")]
-            q = pt_params[prefix + "q_proj.bias"]
-            k = pt_params[prefix + "k_proj.bias"]
-            v = pt_params[prefix + "v_proj.bias"]
-            qkv_bias = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_bias
-            continue
-        elif name.endswith("k_proj.weight"):
-            continue
-        elif name.endswith("k_proj.bias"):
-            continue
-        elif name.endswith("v_proj.weight"):
-            continue
-        elif name.endswith("v_proj.bias"):
-            continue
-        params_ait[ait_name] = arr
-
-        if USE_CUDA:
-            for i in range(depth):
-                prefix = "encoder_layers_%d_self_attn_cu_length" % (i)
-                cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
-                params_ait[prefix] = torch.from_numpy(cu_len).cuda()
-
-    return params_ait
-
-
-def compile_unet(
-    batch_size=2,
-    hh=64,
-    ww=64,
-    dim=320,
-    hidden_dim=1024,
-    use_fp16_acc=False,
-    convert_conv_to_gemm=False,
-):
-
-    ait_mod = ait_UNet2DConditionModel(
-        sample_size=64,
-        cross_attention_dim=hidden_dim,
-        attention_head_dim=[5, 10, 20, 20],
-    )
-    ait_mod.name_parameter_tensor()
-
-    # set AIT parameters
-    pt_mod = pipe.unet
-    pt_mod = pt_mod.eval()
-    params_ait = map_unet_params(pt_mod, dim)
-
-    latent_model_input_ait = Tensor(
-        [batch_size, hh, ww, 4], name="input0", is_input=True
-    )
-    timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
-    text_embeddings_pt_ait = Tensor(
-        [batch_size, 64, hidden_dim], name="input2", is_input=True
-    )
-
-    Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
-    mark_output(Y)
-
-    target = detect_target(
-        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
-    )
-    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
-
-
-def compile_clip(
-    batch_size=1,
-    seqlen=64,
-    dim=768,
-    num_heads=12,
-    use_fp16_acc=False,
-    convert_conv_to_gemm=False,
-):
-    mask_seq = 0
-    causal = True
-    depth = 23
-
-    ait_mod = ait_CLIPTextTransformer(
-        num_hidden_layers=depth,
-        hidden_size=dim,
-        num_attention_heads=num_heads,
-        batch_size=batch_size,
-        seq_len=seqlen,
-        causal=causal,
-        mask_seq=mask_seq,
-    )
-    ait_mod.name_parameter_tensor()
-
-    pt_mod = pipe.text_encoder
-    pt_mod = pt_mod.eval()
-    params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
-
-    input_ids_ait = Tensor(
-        [batch_size, seqlen], name="input0", dtype="int64", is_input=True
-    )
-    position_ids_ait = Tensor(
-        [batch_size, seqlen], name="input1", dtype="int64", is_input=True
-    )
-    Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait)
-    mark_output(Y)
-
-    target = detect_target(
-        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
-    )
-    compile_model(Y, target, "./tmp", "CLIPTextModel", constants=params_ait)
-
-
-def compile_vae(
-    batch_size=1, height=64, width=64, use_fp16_acc=False, convert_conv_to_gemm=False
-):
-    in_channels = 3
-    out_channels = 3
-    down_block_types = [
-        "DownEncoderBlock2D",
-        "DownEncoderBlock2D",
-        "DownEncoderBlock2D",
-        "DownEncoderBlock2D",
-    ]
-    up_block_types = [
-        "UpDecoderBlock2D",
-        "UpDecoderBlock2D",
-        "UpDecoderBlock2D",
-        "UpDecoderBlock2D",
-    ]
-    block_out_channels = [128, 256, 512, 512]
-    layers_per_block = 2
-    act_fn = "silu"
-    latent_channels = 4
-    sample_size = 512
-
-    ait_vae = ait_AutoencoderKL(
-        batch_size,
-        height,
-        width,
-        in_channels=in_channels,
-        out_channels=out_channels,
-        down_block_types=down_block_types,
-        up_block_types=up_block_types,
-        block_out_channels=block_out_channels,
-        layers_per_block=layers_per_block,
-        act_fn=act_fn,
-        latent_channels=latent_channels,
-        sample_size=sample_size,
-    )
-    ait_input = Tensor(
-        shape=[batch_size, height, width, latent_channels],
-        name="vae_input",
-        is_input=True,
-    )
-    ait_vae.name_parameter_tensor()
-
-    pt_mod = pipe.vae
-    pt_mod = pt_mod.eval()
-    params_ait = map_vae_params(ait_vae, pt_mod, batch_size, height * width)
-
-    Y = ait_vae.decode(ait_input)
-    mark_output(Y)
-    target = detect_target(
-        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
-    )
-    compile_model(
-        Y,
-        target,
-        "./tmp",
-        "AutoencoderKL",
-        constants=params_ait,
-    )
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option("--width", default=512, help="Width of generated image")
-@click.option("--height", default=512, help="Height of generated image")
-@click.option("--batch-size", default=1, help="batch size")
-@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
-@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
-def compile_diffusers(
-    token, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
-):
-    logging.getLogger().setLevel(logging.INFO)
-    np.random.seed(0)
-    torch.manual_seed(4896)
-
-    if detect_target().name() == "rocm":
-        convert_conv_to_gemm = False
-
-    global access_token, pipe
-    if token != "":
-        access_token = token
-
-    pipe = StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
-        revision="fp16",
-        torch_dtype=torch.float16,
-        use_auth_token=access_token,
-    ).to("cuda")
-
-    ww = width // 8
-    hh = height // 8
-
-    # CLIP
-    compile_clip(
-        batch_size=batch_size,
-        dim=1024,
-        num_heads=16,
-        use_fp16_acc=use_fp16_acc,
-        convert_conv_to_gemm=convert_conv_to_gemm,
-    )
-    # UNet
-    compile_unet(
-        batch_size=batch_size * 2,
-        ww=ww,
-        hh=hh,
-        use_fp16_acc=use_fp16_acc,
-        convert_conv_to_gemm=convert_conv_to_gemm,
-    )
-    # VAE
-    compile_vae(
-        batch_size=batch_size,
-        width=ww,
-        height=hh,
-        use_fp16_acc=use_fp16_acc,
-        convert_conv_to_gemm=convert_conv_to_gemm,
-    )
-
-
-if __name__ == "__main__":
-    compile_diffusers()
diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
deleted file mode 100644
index 1a2fca835..000000000
--- a/examples/05_stable_diffusion/demo.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import click
-import torch
-
-from aitemplate.testing.benchmark_pt import benchmark_torch_function
-from diffusers import EulerDiscreteScheduler
-from pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option("--width", default=512, help="Width of generated image")
-@click.option("--height", default=512, help="Height of generated image")
-@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
-@click.option(
-    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
-)
-def run(token, width, height, prompt, benchmark):
-
-    model_id = "stabilityai/stable-diffusion-2"
-    scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
-
-    pipe = StableDiffusionAITPipeline.from_pretrained(
-        model_id,
-        scheduler=scheduler,
-        revision="fp16",
-        torch_dtype=torch.float16,
-        use_auth_token=token,
-    ).to("cuda")
-
-    with torch.autocast("cuda"):
-        image = pipe(prompt, height, width).images[0]
-        if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
-            print(f"sd e2e: {t} ms")
-
-    image.save("example_ait.png")
-
-
-if __name__ == "__main__":
-    run()
diff --git a/examples/05_stable_diffusion/demo_img2img.py b/examples/05_stable_diffusion/demo_img2img.py
deleted file mode 100644
index 569a713ed..000000000
--- a/examples/05_stable_diffusion/demo_img2img.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from io import BytesIO
-
-import click
-import requests
-import torch
-
-from aitemplate.testing.benchmark_pt import benchmark_torch_function
-from PIL import Image
-from pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option("--width", default=512, help="Width of generated image")
-@click.option("--height", default=512, help="Height of generated image")
-@click.option(
-    "--prompt", default="A fantasy landscape, trending on artstation", help="prompt"
-)
-@click.option(
-    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
-)
-def run(token, width, height, prompt, benchmark):
-
-    # load the pipeline
-    device = "cuda"
-    model_id_or_path = "runwayml/stable-diffusion-v1-5"
-    pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
-        model_id_or_path,
-        revision="fp16",
-        torch_dtype=torch.float16,
-        use_auth_token=token,
-    )
-    pipe = pipe.to(device)
-
-    # let's download an initial image
-    url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
-
-    response = requests.get(url)
-    init_image = Image.open(BytesIO(response.content)).convert("RGB")
-    init_image = init_image.resize((height, width))
-
-    with torch.autocast("cuda"):
-        images = pipe(
-            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5
-        ).images
-        if benchmark:
-            args = (prompt, init_image)
-            t = benchmark_torch_function(10, pipe, *args)
-            print(f"sd e2e: {t} ms")
-
-    images[0].save("fantasy_landscape_ait.png")
-
-
-if __name__ == "__main__":
-    run()
diff --git a/examples/05_stable_diffusion/modeling/attention.py b/examples/05_stable_diffusion/modeling/attention.py
deleted file mode 100644
index 14993e6d9..000000000
--- a/examples/05_stable_diffusion/modeling/attention.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
-"""
-
-from typing import Optional
-
-from aitemplate.compiler.ops import reshape
-
-from aitemplate.frontend import nn, Tensor
-
-
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
-    to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    Uses three q, k, v linear layers to compute attention.
-    Parameters:
-        batch_size (:obj:`int`): The number of examples per batch.
-        height (:obj:`int`): Height of each image example.
-        width (:obj:`int`): Width of each image example.
-        channels (:obj:`int`): The number of channels in the input and output.
-        num_head_channels (:obj:`int`, *optional*):
-            The number of channels in each head. If None, then `num_heads` = 1.
-        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
-        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
-    """
-
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        channels: int,
-        num_head_channels: Optional[int] = None,
-        num_groups: int = 32,
-        rescale_output_factor: float = 1.0,
-        eps: float = 1e-5,
-    ):
-        super().__init__()
-        self.batch_size = batch_size
-        self.height = height
-        self.width = width
-        self.channels = channels
-        self.num_heads = (
-            channels // num_head_channels if num_head_channels is not None else 1
-        )
-        self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.MultiheadAttention(
-            channels,
-            batch_size,
-            height * width,
-            self.num_heads,
-            qkv_bias=True,
-            has_residual=True,
-            use_mem_eff=True,
-        )
-        self.rescale_output_factor = rescale_output_factor
-
-    def forward(self, hidden_states) -> Tensor:
-        """
-        input hidden_states shape: [batch, height, width, channel]
-        output shape: [batch, height, width, channel]
-        """
-        residual = hidden_states
-
-        # norm
-        hidden_states = self.group_norm(hidden_states)
-
-        hidden_states = reshape()(
-            hidden_states, [self.batch_size, self.height * self.width, self.channels]
-        )
-
-        batch, hw, channel = hidden_states.shape()
-        if (
-            batch.value() != self.batch_size
-            or hw.value() != self.width * self.height
-            or channel.value() != self.channels
-        ):
-            raise RuntimeError(
-                "nchw params do not match! "
-                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
-                f"actual: {batch}, {channel}, {hw}."
-            )
-
-        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
-        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
-
-        return res
diff --git a/examples/05_stable_diffusion/modeling/clip.py b/examples/05_stable_diffusion/modeling/clip.py
deleted file mode 100644
index 8d6079988..000000000
--- a/examples/05_stable_diffusion/modeling/clip.py
+++ /dev/null
@@ -1,588 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from inspect import isfunction
-from typing import Optional
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-
-# pylint: disable=W0102
-
-USE_CUDA = detect_target().name() == "cuda"
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-class CrossAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        dtype="float16",
-    ):
-        super().__init__()
-        inner_dim = dim_head * heads
-        context_dim = default(context_dim, query_dim)
-
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        self.dim_head = dim_head
-
-        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
-        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
-        )
-
-    def forward(self, x, context=None, mask=None, residual=None):
-        nheads = self.heads
-        d = self.dim_head
-
-        layout = "20314" if USE_CUDA else "m2n3"
-
-        bs, seqlen, _ = get_shape(x)
-        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
-        )
-        context = default(context, x)
-
-        seqlen = get_shape(context)[1]
-        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
-        )
-        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
-        )
-
-        if USE_CUDA:
-            attn_op = ops.mem_eff_attention(causal=False)
-            out = attn_op(
-                (ops.reshape()(q, [bs, nheads, -1, d])),
-                (ops.reshape()(k, [bs, nheads, -1, d])),
-                (ops.reshape()(v, [bs, nheads, -1, d])),
-            )
-        else:
-            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
-            out = OP(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
-                (ops.reshape()(v, [bs * nheads, -1, d])),
-            )
-        out = ops.reshape()(out, [bs, -1, nheads * d])
-        proj = self.to_out(out)
-        proj = ops.reshape()(proj, [bs, -1, nheads * d])
-        if residual is not None:
-            return proj + residual
-        else:
-            return proj
-
-
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
-        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
-
-    def forward(self, x):
-        return self.proj(x, self.gate(x))
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = (
-            nn.Sequential(
-                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
-            )
-            if not glu
-            else GEGLU(dim, inner_dim)
-        )
-
-        self.net = nn.Sequential(
-            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x, residual=None):
-        shape = ops.size()(x)
-        x = self.net(x)
-        x = ops.reshape()(x, shape)
-        if residual is not None:
-            return x + residual
-        else:
-            return x
-
-
-class BasicTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        dropout=0.0,
-        context_dim=None,
-        gated_ff=True,
-        checkpoint=True,
-    ):
-        super().__init__()
-        self.attn1 = CrossAttention(
-            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
-        )  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(
-            query_dim=dim,
-            context_dim=context_dim,
-            heads=n_heads,
-            dim_head=d_head,
-            dropout=dropout,
-        )
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
-
-        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
-
-    def forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), residual=x)
-        x = self.attn2(self.norm2(x), context=context, residual=x)
-        x = self.ff(self.norm3(x), residual=x)
-        return x
-
-
-def Normalize(in_channels):
-    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class SpatialTransformer(nn.Module):
-    """
-    Transformer block for image-like data.
-    First, project the input (aka embedding)
-    and reshape to b, t, d.
-    Then apply standard transformer action.
-    Finally, reshape to image
-    """
-
-    def __init__(
-        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)  # Group Norm
-
-        self.proj_in = nn.Conv2dBias(
-            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
-        )
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock(
-                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
-                )
-                for d in range(depth)
-            ]
-        )
-
-        self.proj_out = nn.Conv2dBias(
-            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def forward(self, x, context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = get_shape(x)
-        x_in = x
-        x = self.norm(x)
-        x = self.proj_in(x)
-        x = ops.reshape()(x, [b, -1, c])
-        for block in self.transformer_blocks:
-            x = block(x, context=context)
-        x = ops.reshape()(x, [b, h, w, c])
-        x = self.proj_out(x)
-        return x + x_in
-
-
-class CLIPAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        batch_size=1,
-        seq_len=16,
-        layer_norm_eps=1e-5,
-        hidden_dropout_prob=0.0,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        self.attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=hidden_dropout_prob,
-            has_residual=False,
-            causal=causal,
-            mask_seq=mask_seq,
-        )
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        residual: Optional[Tensor] = None,
-    ):
-        if residual is not None:
-            self_output = self.attn(hidden_states, residual)
-        else:
-            self_output = self.attn(hidden_states)
-        return self_output
-
-
-class QuickGELUActivation(nn.Module):
-    """
-    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
-    """
-
-    def forward(self, x):
-        x1 = x * 1.702
-        x1 = ops.sigmoid(x1)
-        x = x * x1
-        return x
-
-
-class CLIPMLP(nn.Module):
-    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
-
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        act_layer="GELU",
-        drop=0,
-    ):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-
-        self.fc1 = nn.Linear(
-            in_features,
-            hidden_features,
-            specialization="gelu",
-        )
-        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
-
-    def forward(self, x, res):
-        shape = get_shape(x)
-        x = self.fc1(x)
-        x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
-
-
-class CLIPEncoderLayer(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        mlp_ratio=4.0,
-        batch_size=1,
-        seq_len=16,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        self.embed_dim = hidden_size
-        self.self_attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=0,
-            has_residual=True,
-            causal=causal,
-            mask_seq=mask_seq,
-            use_mem_eff=True,
-        )
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
-        self.mlp = CLIPMLP(hidden_size, int(hidden_size * mlp_ratio))
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        output_attentions: Optional[bool] = False,
-    ):
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-                `(config.encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-
-        hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states, residual)
-
-        residual = hidden_states
-        hidden_states = self.layer_norm2(hidden_states)
-        hidden_states = self.mlp(hidden_states, residual)
-
-        return hidden_states
-
-
-class CLIPEncoder(nn.Module):
-    """
-    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
-    [`CLIPEncoderLayer`].
-    Args:
-        config: CLIPConfig
-    """
-
-    def __init__(
-        self,
-        num_hidden_layers=12,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        hidden_size=768,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [
-                CLIPEncoderLayer(
-                    hidden_size=hidden_size,
-                    num_attention_heads=num_attention_heads,
-                    batch_size=batch_size,
-                    seq_len=seq_len,
-                    causal=causal,
-                    mask_seq=mask_seq,
-                )
-                for _ in range(num_hidden_layers)
-            ]
-        )
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        inputs_embeds,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Args:
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Causal mask for the text model. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        encoder_states = () if output_hidden_states else None
-        # all_attentions = () if output_attentions else None
-
-        hidden_states = inputs_embeds
-        for _, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            layer_outputs = encoder_layer(hidden_states)
-            hidden_states = layer_outputs
-
-        return hidden_states
-
-
-class CLIPTextEmbeddings(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        vocab_size=49408,
-        max_position_embeddings=77,
-        dtype="float16",
-    ):
-        super().__init__()
-        embed_dim = hidden_size
-
-        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
-        self.position_embedding = nn.Embedding(
-            shape=[max_position_embeddings, embed_dim], dtype=dtype
-        )
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        inputs_embeds: Optional[Tensor] = None,
-    ) -> Tensor:
-
-        input_shape = ops.size()(input_ids)
-
-        # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-
-        position_ids = ops.reshape()(position_ids, [-1])
-
-        if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
-
-        position_embeddings = ops.batch_gather()(
-            self.position_embedding.tensor(), position_ids
-        )
-
-        embeddings = inputs_embeds + position_embeddings
-
-        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
-
-        return embeddings
-
-
-class CLIPTextTransformer(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        embed_dim = hidden_size
-        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
-        self.encoder = CLIPEncoder(
-            num_hidden_layers=num_hidden_layers,
-            hidden_size=hidden_size,
-            num_attention_heads=num_attention_heads,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            causal=causal,
-            mask_seq=mask_seq,
-        )
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
-
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        position_ids: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Returns:
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        if input_ids is None:
-            raise ValueError("You have to specify either input_ids")
-
-        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
-
-        encoder_outputs = self.encoder(
-            inputs_embeds=hidden_states,
-        )
-
-        last_hidden_state = encoder_outputs
-        last_hidden_state = self.final_layer_norm(last_hidden_state)
-        return last_hidden_state
diff --git a/examples/05_stable_diffusion/modeling/embeddings.py b/examples/05_stable_diffusion/modeling/embeddings.py
deleted file mode 100644
index 36b96a4fb..000000000
--- a/examples/05_stable_diffusion/modeling/embeddings.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import math
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def get_timestep_embedding(
-    timesteps: Tensor,
-    embedding_dim: int,
-    flip_sin_to_cos: bool = False,
-    downscale_freq_shift: float = 1,
-    scale: float = 1,
-    max_period: int = 10000,
-):
-    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
-
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    embeddings. :return: an [N x dim] Tensor of positional embeddings.
-    """
-    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
-
-    half_dim = embedding_dim // 2
-
-    exponent = (-math.log(max_period)) * Tensor(
-        shape=[half_dim], dtype="float16", name="arange"
-    )
-
-    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
-
-    emb = ops.exp(exponent)
-    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
-
-    # scale embeddings
-    emb = scale * emb
-
-    # concat sine and cosine embeddings
-    if flip_sin_to_cos:
-        emb = ops.concatenate()(
-            [ops.cos(emb), ops.sin(emb)],
-            dim=-1,
-        )
-    else:
-        emb = ops.concatenate()(
-            [ops.sin(emb), ops.cos(emb)],
-            dim=-1,
-        )
-    return emb
-
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
-        super().__init__()
-
-        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
-
-    def forward(self, sample):
-        sample = self.linear_1(sample)
-        sample = self.linear_2(sample)
-        return sample
-
-
-class Timesteps(nn.Module):
-    def __init__(
-        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
-    ):
-        super().__init__()
-        self.num_channels = num_channels
-        self.flip_sin_to_cos = flip_sin_to_cos
-        self.downscale_freq_shift = downscale_freq_shift
-
-    def forward(self, timesteps):
-        t_emb = get_timestep_embedding(
-            timesteps,
-            self.num_channels,
-            flip_sin_to_cos=self.flip_sin_to_cos,
-            downscale_freq_shift=self.downscale_freq_shift,
-        )
-        return t_emb
diff --git a/examples/05_stable_diffusion/modeling/resnet.py b/examples/05_stable_diffusion/modeling/resnet.py
deleted file mode 100644
index 03e4f8023..000000000
--- a/examples/05_stable_diffusion/modeling/resnet.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-class Upsample2D(nn.Module):
-    """
-    An upsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 upsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self,
-        channels,
-        use_conv=False,
-        use_conv_transpose=False,
-        out_channels=None,
-        name="conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-
-        conv = None
-        if use_conv_transpose:
-            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.conv = conv
-        else:
-            self.Conv2d_0 = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(x)
-
-        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if self.use_conv:
-            if self.name == "conv":
-                x = self.conv(x)
-            else:
-                x = self.Conv2d_0(x)
-
-        return x
-
-
-class Downsample2D(nn.Module):
-    """
-    A downsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 downsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-
-        if use_conv:
-            conv = nn.Conv2dBias(
-                self.channels, self.out_channels, 3, stride=stride, padding=padding
-            )
-        else:
-            assert self.channels == self.out_channels
-            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.Conv2d_0 = conv
-            self.conv = conv
-        elif name == "Conv2d_0":
-            self.conv = conv
-        else:
-            self.conv = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        x = self.conv(x)
-
-        return x
-
-
-class ResnetBlock2D(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout=0.0,
-        temb_channels=512,
-        groups=32,
-        groups_out=None,
-        pre_norm=True,
-        eps=1e-6,
-        non_linearity="swish",
-        time_embedding_norm="default",
-        kernel=None,
-        output_scale_factor=1.0,
-        use_nin_shortcut=None,
-        up=False,
-        down=False,
-    ):
-        super().__init__()
-        self.pre_norm = pre_norm
-        self.pre_norm = True
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-        self.time_embedding_norm = time_embedding_norm
-        self.up = up
-        self.down = down
-        self.output_scale_factor = output_scale_factor
-
-        if groups_out is None:
-            groups_out = groups
-
-        self.norm1 = nn.GroupNorm(
-            num_groups=groups,
-            num_channels=in_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-
-        self.conv1 = nn.Conv2dBias(
-            in_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        if temb_channels is not None:
-            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
-        else:
-            self.time_emb_proj = None
-
-        self.norm2 = nn.GroupNorm(
-            num_groups=groups_out,
-            num_channels=out_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.conv2 = nn.Conv2dBias(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        self.upsample = self.downsample = None
-
-        self.use_nin_shortcut = (
-            self.in_channels != self.out_channels
-            if use_nin_shortcut is None
-            else use_nin_shortcut
-        )
-
-        if self.use_nin_shortcut:
-            self.conv_shortcut = nn.Conv2dBias(
-                in_channels, out_channels, 1, 1, 0
-            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
-        else:
-            self.conv_shortcut = None
-
-    def forward(self, x, temb=None):
-        hidden_states = x
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm1(
-            hidden_states
-        )  # .float()).type(hidden_states.dtype) # fused swish
-        # hidden_states = self.nonlinearity(hidden_states)
-
-        if self.upsample is not None:
-            x = self.upsample(x)
-            hidden_states = self.upsample(hidden_states)
-        elif self.downsample is not None:
-            x = self.downsample(x)
-            hidden_states = self.downsample(hidden_states)
-
-        hidden_states = self.conv1(hidden_states)
-
-        if temb is not None:
-            temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = get_shape(temb)
-            temb = ops.reshape()(temb, [bs, 1, 1, dim])
-            hidden_states = hidden_states + temb
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm2(hidden_states)
-
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.conv_shortcut is not None:
-            x = self.conv_shortcut(x)
-
-        out = hidden_states + x
-
-        return out
diff --git a/examples/05_stable_diffusion/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/modeling/unet_2d_condition.py
deleted file mode 100644
index a21879dea..000000000
--- a/examples/05_stable_diffusion/modeling/unet_2d_condition.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from typing import Optional, Tuple, Union
-
-from aitemplate.frontend import nn
-
-from modeling.embeddings import TimestepEmbedding, Timesteps
-from modeling.unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
-
-
-class UNet2DConditionModel(nn.Module):
-    r"""
-    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
-    and returns sample shaped output.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
-    implements for all the model (such as downloading or saving, etc.)
-
-    Parameters:
-        sample_size (`int`, *optional*): The size of the input sample.
-        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
-        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
-            The tuple of upsample blocks to use.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
-        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
-    """
-
-    def __init__(
-        self,
-        sample_size: Optional[int] = None,
-        in_channels: int = 4,
-        out_channels: int = 4,
-        center_input_sample: bool = False,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        up_block_types: Tuple[str] = (
-            "UpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-        ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: int = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 1280,
-        attention_head_dim: Union[int, Tuple[int]] = 8,
-    ):
-        super().__init__()
-        self.center_input_sample = center_input_sample
-        self.sample_size = sample_size
-        time_embed_dim = block_out_channels[0] * 4
-
-        # input
-        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
-        # time
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-        timestep_input_dim = block_out_channels[0]
-
-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
-
-        self.down_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
-        # down
-        output_channel = block_out_channels[0]
-        for i, down_block_type in enumerate(down_block_types):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final_block = i == len(block_out_channels) - 1
-
-            down_block = get_down_block(
-                down_block_type,
-                num_layers=layers_per_block,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                temb_channels=time_embed_dim,
-                add_downsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim[i],
-                downsample_padding=downsample_padding,
-            )
-            self.down_blocks.append(down_block)
-
-        # mid
-        self.mid_block = UNetMidBlock2DCrossAttn(
-            in_channels=block_out_channels[-1],
-            temb_channels=time_embed_dim,
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            output_scale_factor=mid_block_scale_factor,
-            resnet_time_scale_shift="default",
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim[-1],
-            resnet_groups=norm_num_groups,
-        )
-
-        # up
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        reversed_attention_head_dim = list(reversed(attention_head_dim))
-
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[
-                min(i + 1, len(block_out_channels) - 1)
-            ]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=layers_per_block + 1,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                prev_output_channel=prev_output_channel,
-                temb_channels=time_embed_dim,
-                add_upsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=reversed_attention_head_dim[i],
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=norm_num_groups,
-            eps=norm_eps,
-            use_swish=True,
-        )
-
-        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
-
-    def forward(
-        self,
-        sample,
-        timesteps,
-        encoder_hidden_states,
-        return_dict: bool = True,
-    ):
-        """r
-        Args:
-            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
-            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is the sample tensor.
-        """
-
-        # 1. time
-        t_emb = self.time_proj(timesteps)
-        emb = self.time_embedding(t_emb)
-
-        # 2. pre-process
-        sample = self.conv_in(sample)
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if (
-                hasattr(downsample_block, "attentions")
-                and downsample_block.attentions is not None
-            ):
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-            down_block_res_samples += res_samples
-
-        # 4. mid
-        sample = self.mid_block(
-            sample, emb, encoder_hidden_states=encoder_hidden_states
-        )
-
-        # 5. up
-        for upsample_block in self.up_blocks:
-            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-            down_block_res_samples = down_block_res_samples[
-                : -len(upsample_block.resnets)
-            ]
-
-            if (
-                hasattr(upsample_block, "attentions")
-                and upsample_block.attentions is not None
-            ):
-                sample = upsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    res_hidden_states_tuple=res_samples,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample = upsample_block(
-                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
-                )
-
-        # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-        return sample
diff --git a/examples/05_stable_diffusion/modeling/unet_blocks.py b/examples/05_stable_diffusion/modeling/unet_blocks.py
deleted file mode 100644
index 75de2e0c8..000000000
--- a/examples/05_stable_diffusion/modeling/unet_blocks.py
+++ /dev/null
@@ -1,761 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-# flake8: noqa
-from aitemplate.compiler import ops
-
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-from modeling.attention import AttentionBlock
-
-from modeling.clip import SpatialTransformer
-from modeling.resnet import Downsample2D, ResnetBlock2D, Upsample2D
-
-# pylint: disable=W0102
-
-
-def get_down_block(
-    down_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    temb_channels,
-    add_downsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-    downsample_padding=None,
-):
-    down_block_type = (
-        down_block_type[7:]
-        if down_block_type.startswith("UNetRes")
-        else down_block_type
-    )
-    if down_block_type == "DownBlock2D":
-        return DownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnDownBlock2D":
-        return AttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "CrossAttnDownBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
-            )
-        return CrossAttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "SkipDownBlock2D":
-        return SkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnSkipDownBlock2D":
-        return AttnSkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "DownEncoderBlock2D":
-        return DownEncoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-
-
-def get_up_block(
-    up_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    prev_output_channel,
-    temb_channels,
-    add_upsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-):
-    up_block_type = (
-        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
-    )
-    if up_block_type == "UpBlock2D":
-        return UpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "CrossAttnUpBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
-            )
-        return CrossAttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "AttnUpBlock2D":
-        return AttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "SkipUpBlock2D":
-        return SkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "AttnSkipUpBlock2D":
-        return AttnSkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "UpDecoderBlock2D":
-        return UpDecoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    raise ValueError(f"{up_block_type} does not exist.")
-
-
-class UNetMidBlock2DCrossAttn(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        cross_attention_dim=1280,
-        **kwargs,
-    ):
-        super().__init__()
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                SpatialTransformer(
-                    in_channels,
-                    attn_num_head_channels,
-                    in_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states, encoder_hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
-
-
-class CrossAttnDownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_downsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        output_states = ()
-
-        for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class DownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_downsample=True,
-        downsample_padding=1,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None):
-        output_states = ()
-
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class CrossAttnUpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        prev_output_channel: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_upsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(
-        self,
-        hidden_states,
-        res_hidden_states_tuple,
-        temb=None,
-        encoder_hidden_states=None,
-    ):
-        for resnet, attn in zip(self.resnets, self.attentions):
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb=temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        prev_output_channel: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
-        for resnet in self.resnets:
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpDecoderBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            input_channels = in_channels if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=input_channels,
-                    out_channels=out_channels,
-                    temb_channels=None,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states):
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UNetMidBlock2D(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        **kwargs,
-    ):
-        super().__init__()
-
-        if attention_type != "default":
-            raise NotImplementedError(
-                f"attention_type must be default! current value: {attention_type}"
-            )
-
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                AttentionBlock(
-                    batch_size,
-                    height,
-                    width,
-                    in_channels,
-                    num_head_channels=attn_num_head_channels,
-                    rescale_output_factor=output_scale_factor,
-                    eps=resnet_eps,
-                    num_groups=resnet_groups,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
diff --git a/examples/05_stable_diffusion/modeling/vae.py b/examples/05_stable_diffusion/modeling/vae.py
deleted file mode 100644
index 6a239f233..000000000
--- a/examples/05_stable_diffusion/modeling/vae.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""
-Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
-"""
-
-from typing import Tuple
-
-from aitemplate.frontend import nn, Tensor
-from modeling.unet_blocks import get_up_block, UNetMidBlock2D
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels=3,
-        out_channels=3,
-        up_block_types=("UpDecoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        act_fn="silu",
-    ):
-        super().__init__()
-        self.layers_per_block = layers_per_block
-
-        self.conv_in = nn.Conv2dBias(
-            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
-        )
-
-        # mid
-        self.mid_block = UNetMidBlock2D(
-            batch_size,
-            height,
-            width,
-            in_channels=block_out_channels[-1],
-            resnet_eps=1e-6,
-            resnet_act_fn=act_fn,
-            output_scale_factor=1,
-            resnet_time_scale_shift="default",
-            attn_num_head_channels=None,
-            resnet_groups=32,
-            temb_channels=None,
-        )
-
-        # up
-        self.up_blocks = nn.ModuleList([])
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=self.layers_per_block + 1,
-                in_channels=prev_output_channel,
-                out_channels=output_channel,
-                prev_output_channel=None,
-                add_upsample=not is_final_block,
-                resnet_eps=1e-6,
-                resnet_act_fn=act_fn,
-                attn_num_head_channels=None,
-                temb_channels=None,
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        num_groups_out = 32
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=num_groups_out,
-            eps=1e-6,
-            use_swish=True,
-        )
-        self.conv_out = nn.Conv2dBias(
-            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
-        )
-
-    def forward(self, z) -> Tensor:
-        sample = z
-        sample = self.conv_in(sample)
-
-        # middle
-        sample = self.mid_block(sample)
-
-        # up
-        for up_block in self.up_blocks:
-            sample = up_block(sample)
-
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-
-        return sample
-
-
-class AutoencoderKL(nn.Module):
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        in_channels: int = 3,
-        out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
-        layers_per_block: int = 1,
-        act_fn: str = "silu",
-        latent_channels: int = 4,
-        sample_size: int = 32,
-    ):
-        super().__init__()
-        self.decoder = Decoder(
-            batch_size,
-            height,
-            width,
-            in_channels=latent_channels,
-            out_channels=out_channels,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-        )
-        self.post_quant_conv = nn.Conv2dBias(
-            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def decode(self, z: Tensor, return_dict: bool = True):
-
-        z = self.post_quant_conv(z)
-        dec = self.decoder(z)
-        return dec
-
-    def forward(self):
-        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
deleted file mode 100644
index 3a14debcc..000000000
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py
+++ /dev/null
@@ -1,410 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import inspect
-
-import os
-import warnings
-from typing import List, Optional, Union
-
-import torch
-from aitemplate.compiler import Model
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerAncestralDiscreteScheduler,
-    EulerDiscreteScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-
-from diffusers.pipelines.stable_diffusion import (
-    StableDiffusionPipelineOutput,
-    StableDiffusionSafetyChecker,
-)
-
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-
-class StableDiffusionAITPipeline(StableDiffusionPipeline):
-    r"""
-    Pipeline for text-to-image generation using Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-        ],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            requires_safety_checker=requires_safety_checker,
-        )
-
-        workdir = "tmp/"
-        self.clip_ait_exe = self.init_ait_module(
-            model_name="CLIPTextModel", workdir=workdir
-        )
-        self.unet_ait_exe = self.init_ait_module(
-            model_name="UNet2DConditionModel", workdir=workdir
-        )
-        self.vae_ait_exe = self.init_ait_module(
-            model_name="AutoencoderKL", workdir=workdir
-        )
-
-    def init_ait_module(
-        self,
-        model_name,
-        workdir,
-    ):
-        mod = Model(os.path.join(workdir, model_name, "test.so"))
-        return mod
-
-    def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
-        exe_module = self.unet_ait_exe
-        timesteps_pt = timesteps.expand(latent_model_input.shape[0])
-        inputs = {
-            "input0": latent_model_input.permute((0, 2, 3, 1))
-            .contiguous()
-            .cuda()
-            .half(),
-            "input1": timesteps_pt.cuda().half(),
-            "input2": encoder_hidden_states.cuda().half(),
-        }
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
-        return noise_pred
-
-    def clip_inference(self, input_ids, seqlen=64):
-        exe_module = self.clip_ait_exe
-        bs = input_ids.shape[0]
-        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
-        inputs = {
-            "input0": input_ids,
-            "input1": position_ids,
-        }
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        return ys[0].float()
-
-    def vae_inference(self, vae_input):
-        exe_module = self.vae_ait_exe
-        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        vae_out = ys[0].permute((0, 3, 1, 2)).float()
-        return vae_out
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        eta: Optional[float] = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-
-        if "torch_device" in kwargs:
-            device = kwargs.pop("torch_device")
-            warnings.warn(
-                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
-                " Consider using `pipe.to(torch_device)` instead."
-            )
-
-            # Set device as before (to be removed in 0.3.0)
-            if device is None:
-                device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.to(device)
-
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(
-                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
-            )
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
-            )
-
-        # get prompt text embeddings
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=64,  # self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            max_length = text_input.input_ids.shape[-1]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.clip_inference(
-                uncond_input.input_ids.to(self.device)
-            )
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # get the initial random noise unless the user supplied it
-
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        latents_device = "cpu" if self.device.type == "mps" else self.device
-        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
-        if latents is None:
-            latents = torch.randn(
-                latents_shape,
-                generator=generator,
-                device=latents_device,
-            )
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(
-                    f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
-                )
-        latents = latents.to(self.device)
-
-        # set timesteps
-        accepts_offset = "offset" in set(
-            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
-        )
-        extra_set_kwargs = {}
-        if accepts_offset:
-            extra_set_kwargs["offset"] = 1
-
-        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
-
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(
-            inspect.signature(self.scheduler.step).parameters.keys()
-        )
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = (
-                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            )
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                sigma = self.scheduler.sigmas[i]
-                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
-                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-            # predict the noise residual
-            noise_pred = self.unet_inference(
-                latent_model_input, t, encoder_hidden_states=text_embeddings
-            )
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (
-                    noise_pred_text - noise_pred_uncond
-                )
-
-            # compute the previous noisy sample x_t -> x_t-1
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(
-                    noise_pred, i, latents, **extra_step_kwargs
-                ).prev_sample
-            else:
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, **extra_step_kwargs
-                ).prev_sample
-
-        # scale and decode the image latents with vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae_inference(latents)
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-
-        # run safety checker
-        if self.safety_checker is not None:
-            safety_cheker_input = self.feature_extractor(
-                self.numpy_to_pil(image), return_tensors="pt"
-            ).to(self.device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_cheker_input.pixel_values
-            )
-        else:
-            has_nsfw_concept = None
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(
-            images=image, nsfw_content_detected=has_nsfw_concept
-        )
diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
deleted file mode 100644
index 7380aeebd..000000000
--- a/examples/05_stable_diffusion/pipeline_stable_diffusion_img2img_ait.py
+++ /dev/null
@@ -1,402 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-# flakes8: noqa
-import inspect
-import os
-from typing import List, Optional, Union
-
-import numpy as np
-
-import PIL
-import torch
-from aitemplate.compiler import Model
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    StableDiffusionImg2ImgPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.pipelines.stable_diffusion import (
-    StableDiffusionPipelineOutput,
-    StableDiffusionSafetyChecker,
-)
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-
-def preprocess(image):
-    w, h = image.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
-    image = np.array(image).astype(np.float32) / 255.0
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image)
-    return 2.0 * image - 1.0
-
-
-class StableDiffusionImg2ImgAITPipeline(StableDiffusionImg2ImgPipeline):
-    r"""
-    Pipeline for text-guided image to image generation using Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
-    ):
-        # super().__init__()
-        super().__init__(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            requires_safety_checker=requires_safety_checker,
-        )
-        scheduler = scheduler.set_format("pt")
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-        workdir = "tmp/"
-        self.clip_ait_exe = self.init_ait_module(
-            model_name="CLIPTextModel", workdir=workdir
-        )
-        self.unet_ait_exe = self.init_ait_module(
-            model_name="UNet2DConditionModel", workdir=workdir
-        )
-        self.vae_ait_exe = self.init_ait_module(
-            model_name="AutoencoderKL", workdir=workdir
-        )
-
-    def init_ait_module(
-        self,
-        model_name,
-        workdir,
-    ):
-        mod = Model(os.path.join(workdir, model_name, "test.so"))
-        return mod
-
-    def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
-        exe_module = self.unet_ait_exe
-        timesteps_pt = timesteps.expand(latent_model_input.shape[0])
-        inputs = {
-            "input0": latent_model_input.permute((0, 2, 3, 1))
-            .contiguous()
-            .cuda()
-            .half(),
-            "input1": timesteps_pt.cuda().half(),
-            "input2": encoder_hidden_states.cuda().half(),
-        }
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
-        return noise_pred
-
-    def clip_inference(self, input_ids, seqlen=64):
-        exe_module = self.clip_ait_exe
-        bs = input_ids.shape[0]
-        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
-        inputs = {
-            "input0": input_ids,
-            "input1": position_ids,
-        }
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        return ys[0].float()
-
-    def vae_inference(self, vae_input):
-        exe_module = self.vae_ait_exe
-        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
-        ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
-            shape = exe_module.get_output_maximum_shape(i)
-            ys.append(torch.empty(shape).cuda().half())
-        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
-        vae_out = ys[0].permute((0, 3, 1, 2)).float()
-        return vae_out
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
-        strength: float = 0.8,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        eta: Optional[float] = 0.0,
-        generator: Optional[torch.Generator] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, that will be used as the starting point for the
-                process.
-            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference. This parameter will be modulated by `strength`.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(
-                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
-            )
-
-        if strength < 0 or strength > 1:
-            raise ValueError(
-                f"The value of strength should in [0.0, 1.0] but is {strength}"
-            )
-
-        # set timesteps
-        accepts_offset = "offset" in set(
-            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
-        )
-        extra_set_kwargs = {}
-        offset = 0
-        if accepts_offset:
-            offset = 1
-            extra_set_kwargs["offset"] = 1
-
-        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
-
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
-
-        # encode the init image into latents and scale the latents
-        init_latent_dist = self.vae.encode(init_image.to(self.device)).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
-
-        # expand init_latents for batch_size
-        init_latents = torch.cat([init_latents] * batch_size)
-
-        # get the original timestep using init_timestep
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            timesteps = torch.tensor(
-                [num_inference_steps - init_timestep] * batch_size,
-                dtype=torch.long,
-                device=self.device,
-            )
-        else:
-            timesteps = self.scheduler.timesteps[-init_timestep]
-            timesteps = torch.tensor(
-                [timesteps] * batch_size, dtype=torch.long, device=self.device
-            )
-
-        # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=self.device)
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps).to(
-            self.device
-        )
-
-        # get prompt text embeddings
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=64,  # self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                [""] * batch_size,
-                padding="max_length",
-                max_length=max_length,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.clip_inference(
-                uncond_input.input_ids.to(self.device)
-            )
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(
-            inspect.signature(self.scheduler.step).parameters.keys()
-        )
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        latents = init_latents
-
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
-        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps[t_start:])):
-            t_index = t_start + i
-
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = (
-                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            )
-
-            # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                sigma = self.scheduler.sigmas[t_index]
-                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
-                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-                latent_model_input = latent_model_input.to(self.unet.dtype)
-                t = t.to(self.unet.dtype)
-
-            # predict the noise residual
-            noise_pred = self.unet_inference(
-                latent_model_input, t, encoder_hidden_states=text_embeddings
-            )
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (
-                    noise_pred_text - noise_pred_uncond
-                )
-
-            # compute the previous noisy sample x_t -> x_t-1
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(
-                    noise_pred, t_index, latents, **extra_step_kwargs
-                ).prev_sample
-            else:
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, **extra_step_kwargs
-                ).prev_sample
-
-        # scale and decode the image latents with vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae_inference(latents)
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-
-        # run safety checker
-        safety_cheker_input = self.feature_extractor(
-            self.numpy_to_pil(image), return_tensors="pt"
-        ).to(self.device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_cheker_input.pixel_values
-        )
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(
-            images=image, nsfw_content_detected=has_nsfw_concept
-        )
diff --git a/fx2ait/fx2ait/TARGETS b/fx2ait/fx2ait/TARGETS
deleted file mode 100644
index c247fbd11..000000000
--- a/fx2ait/fx2ait/TARGETS
+++ /dev/null
@@ -1,41 +0,0 @@
-# @noautodeps
-load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
-load("@fbsource//tools/build_defs:glob_defs.bzl", "glob")
-
-oncall("aitemplate")
-
-# Note that we exclude common acc_tracer python files here and will reuse
-# those in torch_tensorrt/fx/tracer/acc_tracer/
-python_library(
-    name = "fx2ait",
-    srcs = glob(
-        [
-            "converters/*.py",
-            "*.py",
-            "passes/*.py",
-            "tools/*.py",
-        ] + [
-            "acc_tracer/ait_acc_normalizer.py",
-            "acc_tracer/ait_acc_ops_registry.py",
-            "acc_tracer/ait_acc_ops.py",
-        ],
-        exclude = [
-            "cache.py",
-        ],
-    ),
-    base_module = "fx2ait",
-    deps = [
-        "fbsource//third-party/pypi/graphviz:graphviz",
-        "fbsource//third-party/pypi/numpy:numpy",
-        "fbsource//third-party/pypi/pydot:pydot",
-        "//aitemplate/AITemplate/fx2ait/fx2ait/fb:acc_import_helper",
-        "//aitemplate/AITemplate/fx2ait/fx2ait/fb/lower:ait_lowering_setting",
-        "//aitemplate/AITemplate/python/aitemplate:aitemplate",
-        "//caffe2:torch",
-        "//deeplearning/ait:AITModel",
-        "//executorch/exir:graph_module",
-        "//executorch/exir:lib",
-        "//executorch/exir:tracer",
-        "//pytorch/vision:torchvision",
-    ],
-)
diff --git a/fx2ait/fx2ait/csrc/TARGETS b/fx2ait/fx2ait/csrc/TARGETS
deleted file mode 100644
index 88893b1f7..000000000
--- a/fx2ait/fx2ait/csrc/TARGETS
+++ /dev/null
@@ -1,29 +0,0 @@
-load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
-
-oncall("aitemplate")
-
-cpp_library(
-    name = "AITModelImpl",
-    srcs = ["AITModelImpl.cpp"],
-    headers = ["AITModelImpl.h"],
-    propagated_pp_flags = [
-        "-DFBCODE_AIT",
-        "-Iaitemplate/AITemplate/static/include",
-    ],
-    supports_python_dlopen = True,
-    deps = [
-        "//caffe2:ATen-cu",
-        "//caffe2/c10:c10",
-        "//caffe2/c10:c10_cuda",
-        "//folly:map_util",
-    ],
-    exported_deps = [
-        "//aitemplate/AITemplate/static/include:aitemplate",  # @manual
-        "//caffe2:ATen-cu",
-        "//caffe2:torch-cpp",
-        "//folly/container:f14_hash",
-    ],
-    exported_external_deps = [
-        ("glibc", None, "dl"),
-    ],
-)
diff --git a/fx2ait/fx2ait/test/TARGETS b/fx2ait/fx2ait/test/TARGETS
deleted file mode 100644
index 465522f7d..000000000
--- a/fx2ait/fx2ait/test/TARGETS
+++ /dev/null
@@ -1,78 +0,0 @@
-load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
-load("@fbsource//tools/build_defs:glob_defs.bzl", "glob")
-
-oncall("aitemplate")
-
-[
-    python_unittest(
-        name = test_file.split("/")[-1][:-3],
-        srcs = [
-            test_file,
-        ],
-        env = {
-            "NUM_BUILDERS": "12",
-        },
-        par_style = "xar",
-        tags = [
-            "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"A100\"}",
-            "serialize_test_cases",
-            "supports_remote_execution",
-        ],
-        deps = [
-            "fbsource//third-party/pypi/numpy:numpy",
-            "fbsource//third-party/pypi/parameterized:parameterized",
-            "//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait",
-            "//aitemplate/AITemplate/fx2ait/fx2ait/fb/converters:internal_converters",
-            "//caffe2:test-lib",
-            "//caffe2:torch",
-            "//deeplearning/trt/torch_tensorrt/py/torch_tensorrt:acc_tracer",
-            "//deeplearning/trt/torch_tensorrt/py/torch_tensorrt/fb:internal_passes",
-            "//glow/fb/fx/acc_tracer:acc_tracer",
-        ],
-    )
-    for test_file in glob(
-        [
-            "fb/converters/test*.py",
-            "converters/test*.py",
-            "converters/*/test*.py",
-            "test*.py",
-        ],
-        exclude = [
-            "test_fx2ait.py",
-            "test_ait_lower.py",
-        ],
-    )
-]
-
-[
-    python_unittest(
-        name = test_file.split("/")[-1][:-3],
-        srcs = [
-            test_file,
-        ],
-        env = {
-            "NUM_BUILDERS": "12",
-        },
-        par_style = "xar",
-        tags = [
-            "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"A100\"}",
-            "serialize_test_cases",
-            "supports_remote_execution",
-        ],
-        deps = [
-            "fbsource//third-party/pypi/numpy:numpy",
-            "fbsource//third-party/pypi/parameterized:parameterized",
-            "fbsource//third-party/pypi/transformers:transformers",
-            "//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait",
-            "//aitemplate/AITemplate/fx2ait/fx2ait/fb/converters:internal_converters_aten",
-            "//caffe2:test-lib",
-            "//caffe2:torch",
-            "//caffe2/functorch:functorch",
-        ],
-    )
-    for test_file in glob(
-        [
-            "converters_aten/test*.py",
-        ],
-    )
-]
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
deleted file mode 100644
index c556485f1..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = ADD(GeMM(A, B) + bias, D0)
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::plus"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::Identity"
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
deleted file mode 100644
index bd2988abf..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = RELU(ADD(ADD(GeMM(A, B) + bias, D0), D1))
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::plus"
-BINARY_OP2 = "cutlass::plus"
-UNARY_OP2 = "cutlass::epilogue::thread::Identity"
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
deleted file mode 100644
index 5d262712e..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = RELU(ADD(ADD(GeMM(A, B) + bias, D0), D1))
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::plus"
-BINARY_OP2 = "cutlass::plus"
-UNARY_OP2 = "cutlass::epilogue::thread::ReLu"
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_add_relu.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
deleted file mode 100644
index 212b01a74..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = RELU(ADD(GeMM(A, B) + bias, D0))
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::plus"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::ReLu"
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_add_relu.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
deleted file mode 100644
index 1b2dea303..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = ADD(GeMM(A, B) + bias, D0)
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::multiplies"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::Identity"
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
deleted file mode 100644
index 12bce07ae..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = Add(Mul(GeMM(A, B) + bias, D0), D1),
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::multiplies"
-BINARY_OP2 = "cutlass::plus"
-UNARY_OP2 = "cutlass::epilogue::thread::Identity"
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
deleted file mode 100644
index c8be43f28..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = TANH(Mul((GeMM(A, B) + bias), D0))
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Identity"
-BINARY_OP1 = "cutlass::multiplies"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::Tanh"
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_mul_tanh.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
deleted file mode 100644
index 2828d379d..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = Mul(Sigmoid(GeMM(A, B) + bias), D0)
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Sigmoid"
-BINARY_OP1 = "cutlass::multiplies"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::Identity"
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
deleted file mode 100644
index b3d721d6c..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-GEMM Specialization for
-C = TANH(Mul(Sigmoid(GeMM(A, B) + bias), D0))
-where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N]
-bias[RowMajor][N], D0[RowMajor][M, N]
-"""
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-UNARY_OP1 = "cutlass::epilogue::thread::Sigmoid"
-BINARY_OP1 = "cutlass::multiplies"
-BINARY_OP2 = None
-UNARY_OP2 = "cutlass::epilogue::thread::Tanh"
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.config")
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return common_bias_broadcast.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    return common_bias_broadcast.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        RCR,
-        UNARY_OP1,
-        BINARY_OP1,
-        BINARY_OP2,
-        UNARY_OP2,
-    )
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.func_decl")
-def gen_function_decl(func_attrs):
-    return common_bias_broadcast.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return common_bias_broadcast.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/compiler/transform/fuse_permute_bmm.py b/python/aitemplate/compiler/transform/fuse_permute_bmm.py
deleted file mode 100644
index 22a3ee036..000000000
--- a/python/aitemplate/compiler/transform/fuse_permute_bmm.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Perform fusions for permute+bmm operators.
-"""
-from typing import Callable, List, Optional, Set, Tuple, Type, Union
-
-from .. import ops
-from ..base import IntImm, Operator, Tensor
-from ..ops.gemm_universal import (
-    bmm_ccr,
-    bmm_crr,
-    bmm_rcr,
-    bmm_rrr,
-    gemm_rcr,
-    gemm_rcr_bias,
-    gemm_rrr,
-    gemm_rrr_bias,
-)
-from ..ops.tensor import permute021
-from .fuse_utils import extract_only_one_op
-from .transform_utils import (
-    copy_src_op_attributes,
-    copy_tensor_attributes,
-    remove_dst_op_from_tensor,
-    remove_tensor_from_sorted_graph,
-    replace_tensor,
-    sanitize_sorted_graph,
-)
-
-# pylint: disable=C0103,W0612
-
-
-def _try_extract_one_mm_op(ops: Set[Union[None, Operator]]) -> Union[None, Operator]:
-    """
-    Helper function that returns the matmul op from src_ops() or dst_ops() call.
-    Return None if there's no bmm ops
-    """
-    if ops is None:
-        return None
-
-    for op in ops:
-        if op._attrs["op"].startswith("bmm") or op._attrs["op"].startswith("gemm"):
-            return op
-
-    return None
-
-
-def _fuse_permute_bmm_ops(
-    sorted_graph: List[Tensor],
-    source: List[Type[Operator]],
-    targets: List[Union[None, Type[Operator]]],
-    condition: Optional[Callable],
-) -> Tuple[bool, List[Tensor]]:
-    """
-    Function that fuses [permute021 + bmm] into corresponding bmm op.
-
-    Parameters
-    ----------
-    sorted_graph : List[Tensor]
-        AIT graph to run fusion
-    source: List[Type[Operator]]
-        Combination of permute+bmm ops to be fused.
-        This should be of len-2
-    targets: List[Type[Operator]]
-        To be fused bmm that matches the source.
-        This should be of len 2, which corresponds to the operator that does
-        permute A and permute B respectively
-    condition: Optional[Callable]
-        If not None, we apply on the gemm op to check whether it requires fusion.
-    """
-    assert len(source) == 2, "Source should have 2 elements, got {} instead".format(
-        len(source)
-    )
-
-    new_sorted_graph = []
-    fused = False
-    to_replace = {}
-    for tensor in sorted_graph:
-        if tensor in to_replace:
-            new_sorted_graph.append(to_replace[tensor])
-            replace_tensor(tensor, to_replace[tensor])
-            del to_replace[tensor]
-            continue
-        new_sorted_graph.append(tensor)
-
-        if fused:
-            continue
-        if tensor._attrs["is_output"]:
-            continue
-
-        permute_op = extract_only_one_op(tensor._attrs["src_ops"])
-        bmm_op = _try_extract_one_mm_op(tensor._attrs["dst_ops"])
-        if permute_op is None or bmm_op is None:
-            continue
-
-        if permute_op._attrs["op"] != source[0]()._attrs["op"]:
-            continue
-        if bmm_op._attrs["op"] != source[1]()._attrs["op"]:
-            continue
-        if condition is not None and not condition(bmm_op):
-            continue
-
-        assert len(permute_op._attrs["inputs"]) == 1
-        assert len(bmm_op._attrs["outputs"]) == 1
-
-        inputs = list(bmm_op._attrs["inputs"])
-        if targets[0] is None and inputs[0] == tensor:
-            continue
-        if targets[1] is None and inputs[1] == tensor:
-            continue
-
-        input_tensor = permute_op._attrs["inputs"][0]
-        output_tensor = bmm_op._attrs["outputs"][0]
-
-        # TODO: Check whether the input is weight to have better compile time
-        #       optimization on preprocessing of pad etc.
-        permute_shape = tensor.shape()
-        prepermute_shape = input_tensor.shape()
-
-        if (
-            isinstance(prepermute_shape[-1], IntImm)
-            and prepermute_shape[-1].value() % 2 == 1
-            and isinstance(permute_shape[-1], IntImm)
-            and permute_shape[-1].value() % 2 == 0
-        ):
-            # We don't run the permute+bmm fusion if the permute op could
-            # turn an odd alignment into even alignment.
-            continue
-
-        fused = True
-
-        remove_dst_op_from_tensor(bmm_op._attrs["inputs"], bmm_op)
-
-        target = None
-        if inputs[0] == tensor:
-            target = targets[0]
-            inputs[0] = input_tensor
-        elif inputs[1] == tensor:
-            target = targets[1]
-            inputs[1] = input_tensor
-        else:
-            raise RuntimeError(
-                "bmm inputs are {}, not matching permute's output tensor {}".format(
-                    inputs, tensor
-                )
-            )
-
-        if not tensor.dst_ops():
-            # Remove permute configs if this is the last bmm consuming the tensor
-            remove_dst_op_from_tensor(input_tensor, permute_op)
-            remove_tensor_from_sorted_graph(tensor)
-
-        new_tensor = target()(*inputs)
-        copy_tensor_attributes(new_tensor, output_tensor)
-        copy_src_op_attributes(new_tensor, output_tensor)
-        to_replace[output_tensor] = new_tensor
-
-    return (fused, sanitize_sorted_graph(new_sorted_graph))
-
-
-def fuse_permute_bmm(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
-    """Fuse [permute021 + bmm] into corresponding bmm op.
-
-    Parameters
-    ----------
-    sorted_graph : List[Tensor]
-        Input graph
-    workdir : str, optional
-        working dir, by default None
-
-    Returns
-    -------
-    List[Tensor]
-        Fused graph
-    """
-
-    def _need_broadcast_gemm(op: Operator):
-        if not op._attrs["op"].startswith("gemm"):
-            return False
-        inputs = op._attrs["inputs"]
-        return len(inputs[0].shape()) != 2 or len(inputs[1].shape()) != 2
-
-    permute_mm_patterns = (
-        ([permute021, bmm_ccr], [bmm_rcr, bmm_crr], None),
-        ([permute021, bmm_crr], [bmm_rrr, bmm_ccr], None),
-        ([permute021, bmm_rcr], [bmm_ccr, bmm_rrr], None),
-        ([permute021, bmm_rrr], [bmm_crr, bmm_rcr], None),
-        ([permute021, gemm_rcr], [bmm_ccr, bmm_rrr], _need_broadcast_gemm),
-        ([permute021, gemm_rrr], [bmm_crr, bmm_rcr], _need_broadcast_gemm),
-        (
-            [permute021, gemm_rcr_bias],
-            [ops.gemm_universal.bmm_ccr_add, ops.gemm_universal.bmm_rrr_add],
-            _need_broadcast_gemm,
-        ),
-        (
-            [permute021, gemm_rrr_bias],
-            [ops.gemm_universal.bmm_crr_add, None],
-            _need_broadcast_gemm,
-        ),
-    )
-
-    graph_transformed = True
-    while graph_transformed:
-        graph_transformed = False
-        for source, targets, condition in permute_mm_patterns:
-            fused, sorted_graph = _fuse_permute_bmm_ops(
-                sorted_graph, source, targets, condition
-            )
-            graph_transformed |= fused
-
-    return sorted_graph
diff --git a/python/aitemplate/utils/logger.py b/python/aitemplate/utils/logger.py
deleted file mode 100644
index 7dfdba771..000000000
--- a/python/aitemplate/utils/logger.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-default logger
-"""
-import logging
-import os
-
-
-def info(name, message):
-    logger = logging.getLogger(name)
-    logger.info(message)
-
-
-def debug(name, message):
-    logger = logging.getLogger(name)
-    logger.debug(message)
-
-
-def warning(name, message):
-    logger = logging.getLogger(name)
-    logger.warning(message)
-
-
-def is_debug():
-    logger = logging.getLogger("aitemplate")
-    return logger.level == logging.DEBUG
-
-
-def setup_logger(name):
-    root_logger = logging.getLogger(name)
-    info_handle = logging.StreamHandler()
-    formatter = logging.Formatter("%(asctime)s %(levelname)s <%(name)s> %(message)s")
-    info_handle.setFormatter(formatter)
-    root_logger.addHandler(info_handle)
-    root_logger.propagate = False
-
-    DEFAULT_LOGLEVEL = logging.getLogger().level
-    log_level_str = os.environ.get("LOGLEVEL", None)
-    LOG_LEVEL = (
-        getattr(logging, log_level_str.upper())
-        if log_level_str is not None
-        else DEFAULT_LOGLEVEL
-    )
-    root_logger.setLevel(LOG_LEVEL)
-    return root_logger

From 76eeab19b6f7b04e5d05e8fbed0f82af933c761c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 31 Jan 2023 09:29:20 -0800
Subject: [PATCH 028/638] Generate name strings for NoneType (#160)

Summary:
We enabled to dump a virtual graph after each transformation with LOGLEVEL=debug. One side-effect of this change was that we may encounter cases where nodes haven't been assigned any names, e.g. for transformations before the naming pass. When we plot such a graph, we would hit assertion failures if we "add" NoneType names to another string.

This PR fixed the issue by forcing NoneType names to have string representation.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/160

Reviewed By: aakhundov

Differential Revision: D42881728

Pulled By: chenyang78

fbshipit-source-id: fe68a3363472eb516cec847460f2fc8998c97dea
---
 python/aitemplate/utils/visualization/plot.py |  8 +++----
 .../aitemplate/utils/visualization/pydot.py   | 23 ++++---------------
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index cd55eb081..9ed1101aa 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -71,8 +71,8 @@ def _gen_tensor_modal(tensor) -> str:
     content["dtype"] = str(tensor._attrs["dtype"])
     table_src = TABLE_TEMPLATE.render(table_data=content)
     modal_src = MODAL_TEMPLATE.render(
-        modal_id=tensor._attrs["name"] + "_modal",
-        modal_label=tensor._attrs["name"] + "_label",
+        modal_id=f'{tensor._attrs["name"]}_modal',
+        modal_label=f'{tensor._attrs["name"]}_label',
         modal_title=tensor._attrs["name"],
         modal_content=table_src,
     )
@@ -83,8 +83,8 @@ def _gen_op_modal(op) -> str:
     content = op_attr_factory.op_to_content(op)
     table_src = TABLE_TEMPLATE.render(table_data=content)
     modal_src = MODAL_TEMPLATE.render(
-        modal_id=op._attrs["name"] + "_modal",
-        modal_label=op._attrs["name"] + "_label",
+        modal_id=f'{op._attrs["name"]}_modal',
+        modal_label=f'{op._attrs["name"]}_label',
         modal_title=op._attrs["name"],
         modal_content=table_src,
     )
diff --git a/python/aitemplate/utils/visualization/pydot.py b/python/aitemplate/utils/visualization/pydot.py
index adcec96d1..6e33aec91 100644
--- a/python/aitemplate/utils/visualization/pydot.py
+++ b/python/aitemplate/utils/visualization/pydot.py
@@ -439,6 +439,9 @@ def quote_if_necessary(s):
             return "True"
         return "False"
 
+    if s is None:
+        return f"{s}"
+
     if not isinstance(s, str):
         return s
 
@@ -451,7 +454,7 @@ def quote_if_necessary(s):
             "\n": r"\n",
             "\r": r"\r",
         }
-        for (a, b) in replace.items():
+        for a, b in replace.items():
             s = s.replace(a, b)
 
         return '"' + s + '"'
@@ -505,7 +508,6 @@ def graph_from_edges(edge_list, node_prefix="", directed=False):
         graph = Dot(graph_type="graph")
 
     for edge in edge_list:
-
         if isinstance(edge[0], str):
             src = node_prefix + edge[0]
         else:
@@ -729,11 +731,9 @@ def __init__(self, name="", obj_dict=None, **attrs):
         # as if they were Node definitions
         #
         if obj_dict is not None:
-
             self.obj_dict = obj_dict
 
         else:
-
             self.obj_dict = dict()
 
             # Copy the attributes
@@ -893,7 +893,6 @@ def __eq__(self, edge):
             raise pydot.Error("Can not compare an edge to a non-edge object.")
 
         if self.get_parent_graph().get_top_graph_type() == "graph":
-
             # If the graph is undirected, the edge has neither
             # source nor destination.
             #
@@ -920,7 +919,6 @@ def parse_node_ref(self, node_str):
             return node_str
 
         if node_str.startswith('"') and node_str.endswith('"'):
-
             return node_str
 
         node_port_idx = node_str.rfind(":")
@@ -1041,7 +1039,6 @@ def __init__(
             self.obj_dict = obj_dict
 
         else:
-
             self.obj_dict = dict()
 
             self.obj_dict["attributes"] = dict(attrs)
@@ -1236,7 +1233,6 @@ def del_node(self, name, index=None):
             name = name.get_name()
 
         if name in self.obj_dict["nodes"]:
-
             if index is not None and index < len(self.obj_dict["nodes"][name]):
                 del self.obj_dict["nodes"][name][index]
                 return True
@@ -1259,7 +1255,6 @@ def get_node(self, name):
         match = list()
 
         if name in self.obj_dict["nodes"]:
-
             match.extend(
                 [Node(obj_dict=obj_dict) for obj_dict in self.obj_dict["nodes"][name]]
             )
@@ -1409,7 +1404,6 @@ def add_subgraph(self, sgraph):
             )
 
         if sgraph.get_name() in self.obj_dict["subgraphs"]:
-
             sgraph_list = self.obj_dict["subgraphs"][sgraph.get_name()]
             sgraph_list.append(sgraph.obj_dict)
 
@@ -1432,7 +1426,6 @@ def get_subgraph(self, name):
         match = list()
 
         if name in self.obj_dict["subgraphs"]:
-
             sgraphs_obj_dict = self.obj_dict["subgraphs"].get(name)
 
             for obj_dict_list in sgraphs_obj_dict:
@@ -1484,9 +1477,7 @@ def to_string(self):
         graph = list()
 
         if self.obj_dict.get("strict", None) is not None:
-
             if self == self.get_parent_graph() and self.obj_dict["strict"]:
-
                 graph.append("strict ")
 
         graph_type = self.obj_dict["type"]
@@ -1496,9 +1487,7 @@ def to_string(self):
         graph.append(s)
 
         for attr in sorted(self.obj_dict["attributes"]):
-
             if self.obj_dict["attributes"].get(attr, None) is not None:
-
                 val = self.obj_dict["attributes"].get(attr)
                 if val == "":
                     val = '""'
@@ -1538,12 +1527,10 @@ def to_string(self):
         obj_list.sort(key=lambda x: x[0])
 
         for idx, obj in obj_list:
-
             if obj["type"] == "node":
                 node = Node(obj_dict=obj)
 
                 if self.obj_dict.get("suppress_disconnected", False):
-
                     if (
                         node.get_name() not in edge_src_set
                         and node.get_name() not in edge_dst_set
@@ -1623,7 +1610,6 @@ def __init__(
         )
 
         if obj_dict is None:
-
             self.obj_dict["type"] = "subgraph"
 
 
@@ -1677,7 +1663,6 @@ def __init__(
         )
 
         if obj_dict is None:
-
             self.obj_dict["type"] = "subgraph"
             self.obj_dict["name"] = quote_if_necessary("cluster_" + graph_name)
 

From 75e855ca05738c279e2697dc812f4f2ca524e59d Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Wed, 1 Feb 2023 01:51:23 +0800
Subject: [PATCH 029/638] revert some changes

---
 examples/03_bert/benchmark_mi250.sh                 | 2 +-
 examples/05_stable_diffusion/benchmark.py           | 2 +-
 tests/unittest/compiler/test_fuse_mm_elementwise.py | 3 ---
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/03_bert/benchmark_mi250.sh b/examples/03_bert/benchmark_mi250.sh
index ac6be56a6..4bacb3407 100644
--- a/examples/03_bert/benchmark_mi250.sh
+++ b/examples/03_bert/benchmark_mi250.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #profile
-#HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py
+HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py
 
 #1GCD
 HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$1"
diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
index d2a08d51f..78f9be47a 100644
--- a/examples/05_stable_diffusion/benchmark.py
+++ b/examples/05_stable_diffusion/benchmark.py
@@ -274,7 +274,7 @@ def benchmark_vae(batch_size=1, height=64, width=64, benchmark_pt=False, verify=
 @click.option("--verify", type=bool, default=False, help="verify correctness")
 @click.option("--benchmark-pt", type=bool, default=False, help="run pt benchmark")
 def benchmark_diffusers(token, batch_size, verify, benchmark_pt):
-    #assert batch_size == 1, "batch size must be 1 for submodule verification"
+    assert batch_size == 1, "batch size must be 1 for submodule verification"
     logging.getLogger().setLevel(logging.INFO)
     np.random.seed(0)
     torch.manual_seed(4896)
diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index ad85434a2..db1bc941c 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -1297,9 +1297,6 @@ def test_gemm_rcr_swish_float(self):
             dtype="float",
             use_add=True,
         )
-        self._test_gemm_rcr_bias_swish(
-            [8], 16, 3, "gemm_rcr_add_silu_basic", True, True
-        )
 
 
 class FuseBmmCcrAddCase(unittest.TestCase):

From 05eda49755819e5dd8656397fcc7d27094ba8691 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 31 Jan 2023 10:37:08 -0800
Subject: [PATCH 030/638] bump gemm/conv2/conv3 cache versions to 2 (#159)

Summary:
We upgraded to cutlass 3 recently, which introduced cache conflicts. Let's bump our cache versions to 2.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/159

Reviewed By: aakhundov

Differential Revision: D42881346

Pulled By: chenyang78

fbshipit-source-id: 3f5e82dd63bf31625771e744626c890edc338303
---
 python/aitemplate/backend/profiler_cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 2e027a59c..6818110c6 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -492,9 +492,9 @@ def __init__(
         #     leave some content from the failing version in the db. How are we
         #     going to update the db if we update the version again, and so on.
         # TODO: add similar version control for norm
-        self._gemm_cache_version = 1
-        self._conv_cache_version = 1
-        self._conv3d_cache_version = 1
+        self._gemm_cache_version = 2
+        self._conv_cache_version = 2
+        self._conv3d_cache_version = 2
         if uri is not None:
             self._mode = CacheMode.REMOTE
         if self._mode == CacheMode.LOCAL:

From bcb13944c93b40261929a43eedb417d98c2cd761 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 31 Jan 2023 10:48:23 -0800
Subject: [PATCH 031/638] Added a facility to generate a standalone executable
 for the model (#156)

Summary:
Sometimes, it would be more convenient to debug with a standalone executable without going through the python world.

This change added an option, gen_standalone, to AITDebugSettings, which can be passed to compile_model method. When gen_standalone is True, a standalone executable will be generated in the same folder as test.so, along with suitable changes to the Makefile.

Note that because we aim for assisting debugging, we make a number of simplifications to the entry standalone.cpp:
  * we use the maximum input shapes;
  * we only generate random inputs with a fixed seed;
  * we assume that outputs exist on the host;
  * we disable graph_mode;
  * etc... Once the file is copied into the intemediate working dir (e.g., ./tmp/gen_standalone_fp16) along with other files, users are free to make any changes to the code and run "make" to re-generate the executable.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/156

Reviewed By: ipiszy

Differential Revision: D42855268

Pulled By: chenyang78

fbshipit-source-id: 08357b8c98e4cb78ef2c0649905b5b1d81112a0b
---
 python/aitemplate/backend/builder.py          |  64 +++-
 python/aitemplate/compiler/compiler.py        |   4 +-
 python/aitemplate/utils/debug_settings.py     |   3 +
 static/csrc/model_container.cpp               |  14 +
 static/csrc/model_interface.cpp               |  20 ++
 static/csrc/standalone.cpp                    | 298 ++++++++++++++++++
 static/include/cuda_device_functions.h        |  10 +
 static/include/model_container.h              |   4 +
 static/include/model_interface.h              |  17 +
 static/include/rocm_device_functions.h        |   8 +
 tests/unittest/backend/test_gen_standalone.py | 176 +++++++++++
 11 files changed, 609 insertions(+), 9 deletions(-)
 create mode 100644 static/csrc/standalone.cpp
 create mode 100644 tests/unittest/backend/test_gen_standalone.py

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index a4a2fbd3b..1e735117e 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -32,6 +32,8 @@
 
 import jinja2
 
+from aitemplate.utils.debug_settings import AITDebugSettings
+
 from ..utils.misc import is_debug
 from .target import Target
 from .task_runner import BaseRunner, Task
@@ -40,6 +42,7 @@
 
 
 _LOGGER = logging.getLogger(__name__)
+_DEBUG_SETTINGS = AITDebugSettings()
 
 
 def _augment_for_trace(cmd):
@@ -358,7 +361,7 @@ def build_so(self, target: Target, objs: list[str]):
         self._runner.join()
         self._runner.pull()
 
-    def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
+    def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings):
 
         makefile_template = jinja2.Template(
             """
@@ -374,20 +377,43 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
     {{bfile_cmd}}
 
 .PHONY: all clean clean_constants
-all: {{target}}
+all: {{targets}}
 
-{{target}}: $(obj_files)
+{{dll_target}}: $(obj_files)
     {{build_so_cmd}}
 
+{{build_standalone_rules}}
+
 clean:
-    rm -f *.obj {{target}} test.so
+    rm -f *.obj {{targets}}
 
 clean_constants:
     rm -f constants.bin
 """
         )
+
+        standalone_rules_template = jinja2.Template(
+            """
+{{standalone_src}}: {{standalone_obj}}
+    {{cfile_cmd}}
+
+{{exe_target}}: {{exe_target_deps}}
+    {{build_exe_cmd}}
+"""
+        )
+
         build_so_cmd = "$(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)"
-        obj_files = [pair[1].split("/")[-1] for pair in file_pairs]
+        standalone_src = "standalone.cu"
+        standalone_obj = "standalone.obj"
+        obj_files = []
+        # standalone.cu is an AITemplate internal file that is used for generating
+        # standalone executables. We only want to compile it when the relevant
+        # debug option is enabled.
+        obj_files = [
+            pair[1].split("/")[-1]
+            for pair in file_pairs
+            if not pair[1].endswith(standalone_obj)
+        ]
         obj_files = " ".join(obj_files)
 
         cc = Target.current().cc()
@@ -410,16 +436,36 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name):
             bfile_cmd = _augment_for_trace(bfile_cmd)
             build_so_cmd = _augment_for_trace(build_so_cmd)
 
+        build_exe_cmd = "$(CC) $(CFLAGS) -o $@ $(obj_files)"
+        targets = f"{dll_name}"
+
+        build_standalone_rules = ""
+        if debug_settings.gen_standalone:
+            build_exe_cmd = f"$(CC) $(CFLAGS) -o $@ {standalone_obj} {dll_name}"
+            exe_name = os.path.splitext(dll_name)[0] + ".exe"
+            exe_target_deps = f"{dll_name} {standalone_obj}"
+            build_standalone_rules = standalone_rules_template.render(
+                standalone_src=standalone_src,
+                standalone_obj=standalone_obj,
+                cfile_cmd=cfile_cmd,
+                exe_target=exe_name,
+                exe_target_deps=exe_target_deps,
+                build_exe_cmd=build_exe_cmd,
+            )
+            targets += f" {exe_name}"
+
         makefile_str = makefile_template.render(
             cc=cc,
             cpp=cpp,
             CFLAGS=compile_options,
             fPIC=fpic,
             obj_files=obj_files,
-            target=dll_name,
+            dll_target=dll_name,
+            targets=targets,
             cfile_cmd=cfile_cmd,
             bfile_cmd=bfile_cmd,
             build_so_cmd=build_so_cmd,
+            build_standalone_rules=build_standalone_rules,
         )
 
         dumpfile = os.path.join(workdir, test_name, "Makefile")
@@ -726,8 +772,10 @@ def make_profilers(self, generated_profilers, workdir):
         cmds = [make_clean_cmd, make_all_cmd]
         _run_make_cmds(cmds, self._timeout, build_dir)
 
-    def make(self, file_pairs, dll_name, workdir, test_name):
-        self.gen_makefile(file_pairs, dll_name, workdir, test_name)
+    def make(
+        self, file_pairs, dll_name, workdir, test_name, debug_settings=_DEBUG_SETTINGS
+    ):
+        self.gen_makefile(file_pairs, dll_name, workdir, test_name, debug_settings)
         make_path = shlex.quote(Target.current().make())
         build_dir = shlex.quote(os.path.join(workdir, test_name))
         make_flags = " ".join(
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index f99436de3..ca87e0a65 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -248,7 +248,9 @@ def compile_model(
 
             start_t = datetime.now()
             compile_engine = backend.builder.Builder()
-            compile_engine.make(file_pairs, dll_name, workdir, test_name)
+            compile_engine.make(
+                file_pairs, dll_name, workdir, test_name, debug_settings
+            )
             _LOGGER.info(
                 f"compiled the final .so file elapsed time: {elapsed_dt_sec(start_t)}",
             )
diff --git a/python/aitemplate/utils/debug_settings.py b/python/aitemplate/utils/debug_settings.py
index d614a0be6..50446f654 100644
--- a/python/aitemplate/utils/debug_settings.py
+++ b/python/aitemplate/utils/debug_settings.py
@@ -34,9 +34,12 @@ class AITDebugSettings:
         (e.g. NVTX for CUDA and rocTX for AMD) Currently only supports NVIDIA.
     dump_ait_to_py: str, optional
         The path where the AIT graph is dumped into a .py file.
+    gen_standalone : bool (default: False)
+        Generate a standalone executable for the model
     """
 
     check_all_nan_and_inf: bool = False
     check_all_outputs: bool = False
     gen_profiler_annotation: bool = False
     dump_ait_to_py: Optional[str] = None
+    gen_standalone: bool = False
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 3fc3bc7dc..1c4f7f19f 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -324,10 +324,24 @@ size_t ModelContainer::NumInputs() const {
 }
 
 const char* ModelContainer::InputName(size_t input_idx) const {
+  CHECK(input_idx < num_inputs_);
   CHECK_VECTOR_ACCESS(param_names_, input_idx)
   return param_names_[input_idx];
 }
 
+AITemplateParamShape ModelContainer::MaxInputShape(size_t input_idx) const {
+  CHECK(input_idx < num_inputs_);
+  CHECK_VECTOR_ACCESS(max_param_shapes_, input_idx)
+  auto& input_shape = max_param_shapes_[input_idx];
+  return AITemplateParamShape{input_shape.data(), input_shape.size()};
+}
+
+AITemplateDtype ModelContainer::InputDtype(size_t input_idx) const {
+  CHECK(input_idx < num_inputs_);
+  CHECK_VECTOR_ACCESS(param_dtypes_, input_idx)
+  return param_dtypes_[input_idx];
+}
+
 size_t ModelContainer::NumOutputs() const {
   return num_outputs_;
 }
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index 6501eaa8e..e24634629 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -233,6 +233,26 @@ AITemplateError AITemplateModelContainerGetInputName(
       { *input_name_out = m->InputName(input_idx); })
 }
 
+AITemplateError AITemplateModelContainerGetMaximumInputShape(
+    AITemplateModelHandle handle,
+    size_t input_idx,
+    AITemplateParamShape* shape) {
+  RETURN_ERROR_IF_NULL(handle)
+  RETURN_ERROR_IF_NULL(shape)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *shape = m->MaxInputShape(input_idx); })
+}
+
+AITemplateError AITemplateModelContainerGetInputDtype(
+    AITemplateModelHandle handle,
+    size_t input_idx,
+    AITemplateDtype* input_dtype) {
+  RETURN_ERROR_IF_NULL(handle)
+  RETURN_ERROR_IF_NULL(input_dtype)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *input_dtype = m->InputDtype(input_idx); })
+}
+
 AITemplateError AITemplateModelContainerGetNumOutputs(
     AITemplateModelHandle handle,
     size_t* num_outputs_out) {
diff --git a/static/csrc/standalone.cpp b/static/csrc/standalone.cpp
new file mode 100644
index 000000000..876e7045c
--- /dev/null
+++ b/static/csrc/standalone.cpp
@@ -0,0 +1,298 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// This file is used for generating a standalone executable for a model.
+// It only invokes the C++ model interface. We can directly invoke the
+// generated executable without going through Python bindings. Because it
+// aims for assisting debugging, we make a number of simplifications:
+//   * we use the maximum input shapes;
+//   * we only generate random inputs with a fixed seed;
+//   * we assume that outputs exist on the host;
+//   * we disable graph_mode;
+//   * etc...
+// Once the file is copied into the intemediate working dir (e.g.,
+// ./tmp/test_gemm_rcr) along with other files, users are free to make any
+// changes to the code. We do not try to predict users' actions.
+
+#include <functional>
+#include <iostream>
+#include <map>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "macros.h"
+#include "model_interface.h"
+#include "raii_wrapper.h"
+
+using namespace ait;
+
+template <typename T>
+static void make_random_integer_values(
+    std::mt19937& rnd_generator,
+    T* h_data,
+    size_t numel,
+    T lb,
+    T ub) {
+  std::uniform_int_distribution<> dist(lb, ub);
+  for (size_t i = 0; i < numel; i++) {
+    h_data[i] = static_cast<T>(dist(rnd_generator));
+  }
+}
+
+static void make_random_float_values(
+    std::mt19937& rnd_generator,
+    float* h_data,
+    size_t numel,
+    float lb,
+    float ub) {
+  std::uniform_real_distribution<> dist(lb, ub);
+  for (size_t i = 0; i < numel; i++) {
+    h_data[i] = static_cast<float>(dist(rnd_generator));
+  }
+}
+
+static void make_random_float16_values(
+    std::mt19937& rnd_generator,
+    half* h_data,
+    size_t numel,
+    float lb,
+    float ub) {
+  std::uniform_real_distribution<> dist(lb, ub);
+  for (size_t i = 0; i < numel; i++) {
+    float v = static_cast<float>(dist(rnd_generator));
+    h_data[i] = __float2half_rn(v);
+  }
+}
+
+static void make_random_bfloat16_values(
+    std::mt19937& rnd_generator,
+    bfloat16* h_data,
+    size_t numel,
+    float lb,
+    float ub) {
+  std::uniform_real_distribution<> dist(lb, ub);
+  for (size_t i = 0; i < numel; i++) {
+    float v = static_cast<float>(dist(rnd_generator));
+    h_data[i] = __float2bfloat16_rn(v);
+  }
+}
+
+static GPUPtr make_random_data(
+    AITemplateAllocator& allocator,
+    std::mt19937& rnd_generator,
+    const AITemplateParamShape& shape,
+    const AITemplateDtype& dtype) {
+  size_t numel = shape.Numel();
+  size_t num_bytes = numel * AITemplateDtypeSizeBytes(dtype);
+  void* h_data;
+  DEVICE_CHECK(DeviceMallocHost(&h_data, num_bytes));
+  switch (dtype) {
+    case AITemplateDtype::kInt:
+      make_random_integer_values<int>(
+          rnd_generator,
+          static_cast<int*>(h_data),
+          numel,
+          /*lb*/ -10,
+          /*ub*/ 10);
+      break;
+    case AITemplateDtype::kLong:
+      make_random_integer_values<int64_t>(
+          rnd_generator,
+          static_cast<int64_t*>(h_data),
+          numel,
+          /*lb*/ -10,
+          /*ub*/ 10);
+      break;
+    case AITemplateDtype::kFloat:
+      make_random_float_values(
+          rnd_generator,
+          static_cast<float*>(h_data),
+          numel,
+          /*lb*/ 1.0,
+          /*ub*/ 2.0);
+      break;
+    case AITemplateDtype::kBFloat16:
+      make_random_bfloat16_values(
+          rnd_generator,
+          static_cast<bfloat16*>(h_data),
+          numel,
+          /*lb*/ 1.0,
+          /*ub*/ 2.0);
+      break;
+    case AITemplateDtype::kHalf:
+      make_random_float16_values(
+          rnd_generator,
+          static_cast<half*>(h_data),
+          numel,
+          /*lb*/ 1.0,
+          /*ub*/ 2.0);
+      break;
+    case AITemplateDtype::kBool:
+      make_random_integer_values<bool>(
+          rnd_generator, static_cast<bool*>(h_data), numel, /*lb*/ 0, /*ub*/ 1);
+      break;
+    default:
+      throw std::runtime_error("unsupported dtype for making random data");
+  }
+
+  GPUPtr d_ptr = RAII_DeviceMalloc(num_bytes, allocator);
+  DEVICE_CHECK(CopyToDevice(d_ptr.get(), h_data, num_bytes));
+
+  // free memory
+  DEVICE_CHECK(FreeDeviceHostMemory(h_data));
+
+  return d_ptr;
+}
+
+using OutputDataPtr = std::unique_ptr<void, std::function<void(void*)>>;
+
+struct OutputData {
+  OutputData(
+      OutputDataPtr& data_in,
+      std::unique_ptr<int64_t[]>& shape_ptr_in,
+      int shape_size_in,
+      int index_in,
+      AITemplateDtype dtype_in,
+      const char* name_in)
+      : data(std::move(data_in)),
+        shape_ptr(std::move(shape_ptr_in)),
+        shape_size(shape_size_in),
+        index(index_in),
+        dtype(dtype_in),
+        name(name_in) {}
+
+  OutputData(OutputData&& other) noexcept
+      : data(std::move(other.data)),
+        shape_ptr(std::move(other.shape_ptr)),
+        shape_size(other.shape_size),
+        index(other.index),
+        dtype(other.dtype),
+        name(std::move(other.name)) {}
+
+  OutputDataPtr data;
+  std::unique_ptr<int64_t[]> shape_ptr;
+  int shape_size;
+  int index;
+  AITemplateDtype dtype;
+  std::string name;
+};
+
+static AITemplateError run(
+    AITemplateModelHandle handle,
+    AITemplateAllocator& allocator,
+    std::vector<OutputData>& outputs) {
+  size_t num_outputs = 0;
+  AITemplateModelContainerGetNumOutputs(handle, &num_outputs);
+
+  outputs.reserve(num_outputs);
+  std::vector<AITData> ait_outputs;
+  ait_outputs.reserve(num_outputs);
+  std::vector<int64_t*> ait_output_shapes_out;
+  ait_output_shapes_out.reserve(num_outputs);
+
+  for (unsigned i = 0; i < num_outputs; i++) {
+    const char* name;
+    AITemplateModelContainerGetOutputName(handle, i, &name);
+    AITemplateParamShape shape;
+    AITemplateModelContainerGetMaximumOutputShape(handle, i, &shape);
+    AITemplateDtype dtype;
+    AITemplateModelContainerGetOutputDtype(handle, i, &dtype);
+
+    std::unique_ptr<int64_t[]> shape_ptr =
+        std::make_unique<int64_t[]>(shape.size);
+    ait_output_shapes_out.push_back(shape_ptr.get());
+    size_t num_bytes = shape.Numel() * AITemplateDtypeSizeBytes(dtype);
+    void* h_data;
+    DEVICE_CHECK(DeviceMallocHost(&h_data, num_bytes));
+    ait_outputs.emplace_back(h_data, shape, dtype);
+    auto deleter = [](void* data) { FreeDeviceHostMemory(data); };
+    OutputDataPtr h_output_ptr(h_data, deleter);
+    outputs.emplace_back(
+        h_output_ptr, shape_ptr, (int)shape.size, (int)i, dtype, name);
+  }
+
+  size_t num_inputs = 0;
+  AITemplateModelContainerGetNumInputs(handle, &num_inputs);
+  // Holding unique_ptr(s) that will be auto-released.
+  std::vector<GPUPtr> input_ptrs;
+  input_ptrs.reserve(num_inputs);
+
+  std::map<std::string, unsigned> input_name_to_index;
+  std::vector<AITData> inputs(num_inputs);
+  std::mt19937 rnd_generator(1234);
+  // set up the name-to-index map each input
+  for (unsigned i = 0; i < num_inputs; i++) {
+    const char* name;
+    AITemplateModelContainerGetInputName(handle, i, &name);
+    input_name_to_index.insert({name, i});
+    std::cout << "input: " << name << ", at idx: " << i << "\n";
+
+    AITemplateParamShape shape;
+    AITemplateModelContainerGetMaximumInputShape(handle, i, &shape);
+    AITemplateDtype dtype;
+    AITemplateModelContainerGetInputDtype(handle, i, &dtype);
+    // This file aims for helping debugging so we make the code logic
+    // simple. Instead of asking the user to pass input names along with
+    // shapes, we just use the shape with the largest dimension values
+    // to make a random input. Once this code is copied into the test's
+    // tmp folder, the person who will be diagnosing the issue could make any
+    // changes to the code. We don't force us to predict the user's behavior.
+    input_ptrs.emplace_back(
+        make_random_data(allocator, rnd_generator, shape, dtype));
+    inputs[i] = AITData(input_ptrs.back().get(), shape, dtype);
+  }
+
+  bool graph_mode = false;
+  auto stream = RAII_StreamCreate(/*non_blocking=*/true);
+  return AITemplateModelContainerRunWithOutputsOnHost(
+      handle,
+      inputs.data(),
+      num_inputs,
+      ait_outputs.data(),
+      num_outputs,
+      reinterpret_cast<AITemplateStreamHandle>(stream.get()),
+      graph_mode,
+      ait_output_shapes_out.data());
+}
+
+int main() {
+  AITemplateModelHandle handle;
+  AITemplateModelContainerCreate(&handle, /*num_runtimes*/ 1);
+  AITemplateAllocator* allocator;
+  AIT_ERROR_CHECK(
+      AITemplateAllocatorCreate(&allocator, AITemplateAllocatorType::kDefault));
+
+  auto deleter = [](void* data) { FreeDeviceHostMemory(data); };
+
+  std::vector<OutputData> outputs;
+  AIT_ERROR_CHECK(run(handle, *allocator, outputs));
+
+  // print out something
+  for (const auto& output : outputs) {
+    std::cout << "output: " << output.name << " at idx: " << output.index
+              << " with shape: ";
+    for (int i = 0; i < output.shape_size; i++) {
+      std::cout << output.shape_ptr[i] << ",";
+    }
+    std::cout << "\n";
+  }
+
+  AIT_ERROR_CHECK(AITemplateAllocatorDelete(allocator));
+  // We are done and delete the handle.
+  AITemplateModelContainerDelete(handle);
+  return 0;
+}
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 57f309bbc..4f72d80b9 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -37,6 +37,8 @@ using GraphType = cudaGraph_t;
 using GraphExecType = cudaGraphExec_t;
 using Handle = void*;
 
+using bfloat16 = __nv_bfloat16;
+
 inline DeviceError GetDevice(int* device_idx) {
   return cudaGetDevice(device_idx);
 }
@@ -119,6 +121,10 @@ inline DeviceError FreeDeviceMemory(Handle src) {
   return cudaFree(src);
 }
 
+inline DeviceError FreeDeviceHostMemory(Handle src) {
+  return cudaFreeHost(src);
+}
+
 inline DeviceError FreeDeviceMemoryAsync(Handle src, StreamType stream = 0) {
   return cudaFreeAsync(src, stream);
 }
@@ -127,6 +133,10 @@ inline DeviceError DeviceMalloc(Handle* dst, size_t size) {
   return cudaMalloc(dst, size);
 }
 
+inline DeviceError DeviceMallocHost(Handle* dst, size_t size) {
+  return cudaMallocHost(dst, size);
+}
+
 inline DeviceError DeviceMallocAsync(
     Handle* dst,
     size_t size,
diff --git a/static/include/model_container.h b/static/include/model_container.h
index 4a1ba9c55..594d26e7f 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -162,8 +162,12 @@ class ModelContainer : ModelContainerBase {
   const char* InputName(size_t input_idx) const;
   const char* OutputName(size_t output_idx) const;
 
+  AITemplateParamShape MaxInputShape(size_t input_idx) const;
   AITemplateParamShape MaxOutputShape(size_t output_idx) const;
+
+  AITemplateDtype InputDtype(size_t input_idx) const;
   AITemplateDtype OutputDtype(size_t output_idx) const;
+
   size_t MaxOutputStorageBytes(size_t output_idx) const;
 
   size_t GetNumRuntimes() const {
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 198f90534..0633c01c3 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -43,6 +43,13 @@ enum class AITemplateError : int {
   AITemplateFailure = 1,
 };
 
+#define AIT_ERROR_CHECK(call)                                             \
+  if ((call) != AITemplateError::AITemplateSuccess) {                     \
+    throw std::runtime_error(                                             \
+        std::string(#call " API call failed at ") + __FILE__ + ", line" + \
+        std::to_string(__LINE__));                                        \
+  }
+
 struct AITemplateParamShape {
   AITemplateParamShape() : shape_data(nullptr), size(0) {}
   AITemplateParamShape(const int64_t* shape_data_in, size_t size_in)
@@ -201,6 +208,16 @@ AIT_EXPORT AITemplateError AITemplateModelContainerGetInputName(
     size_t input_idx,
     const char** input_name_out);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerGetMaximumInputShape(
+    AITemplateModelHandle handle,
+    size_t input_idx,
+    AITemplateParamShape* shape);
+
+AIT_EXPORT AITemplateError AITemplateModelContainerGetInputDtype(
+    AITemplateModelHandle handle,
+    size_t input_idx,
+    AITemplateDtype* input_dtype);
+
 AIT_EXPORT AITemplateError AITemplateModelContainerGetNumOutputs(
     AITemplateModelHandle handle,
     size_t* num_outputs_out);
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index f6809d9d7..710fe3867 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -120,6 +120,10 @@ inline DeviceError FreeDeviceMemory(Handle src) {
   return hipFree(src);
 }
 
+inline DeviceError FreeDeviceHostMemory(Handle src) {
+  return hipHostFree(src);
+}
+
 inline DeviceError FreeDeviceMemoryAsync(
     Handle src,
     StreamType /*stream*/ = 0) {
@@ -131,6 +135,10 @@ inline DeviceError DeviceMalloc(Handle* dst, size_t size) {
   return hipMalloc(dst, size);
 }
 
+inline DeviceError DeviceMallocHost(Handle* dst, size_t size) {
+  return hipHostMalloc(dst, size, hipHostMallocDefault);
+}
+
 inline DeviceError DeviceMallocAsync(
     Handle* dst,
     size_t size,
diff --git a/tests/unittest/backend/test_gen_standalone.py b/tests/unittest/backend/test_gen_standalone.py
new file mode 100644
index 000000000..746213b2c
--- /dev/null
+++ b/tests/unittest/backend/test_gen_standalone.py
@@ -0,0 +1,176 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import os
+import re
+import subprocess
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils.debug_settings import AITDebugSettings
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class StridedOpCatPatternTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _test_gen_standalone(self, test_name, dtype):
+        M = 8
+        N = 16
+        K = 32
+        X1 = Tensor(
+            shape=[IntImm(M), IntImm(K)],
+            dtype=dtype,
+            name="X1",
+            is_input=True,
+        )
+        W1 = Tensor(
+            shape=[IntImm(N), IntImm(K)],
+            dtype=dtype,
+            name="W1",
+            is_input=True,
+        )
+        B1 = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="B1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[IntImm(M), IntImm(N)],
+            dtype=dtype,
+            name="X2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[IntImm(M), IntImm(N)],
+            dtype=dtype,
+            name="X3",
+            is_input=True,
+        )
+        Y1 = ops.gemm_rcr_bias()(X1, W1, B1)
+        Y2 = ops.elementwise(FuncEnum.ADD)(Y1, X2)
+        cat_dim = 1
+        Y = ops.concatenate()([X3, Y2], dim=cat_dim)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        debug_settings = AITDebugSettings(gen_standalone=True)
+        dll_name = "test.so"
+        module = compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name,
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+
+        x1_pt = get_random_torch_tensor([M, K], dtype)
+        w1_pt = get_random_torch_tensor([N, K], dtype)
+        b1_pt = get_random_torch_tensor([N], dtype)
+        x2_pt = get_random_torch_tensor([M, N], dtype)
+        x3_pt = get_random_torch_tensor([M, N], dtype)
+
+        y1_pt = torch.nn.functional.linear(x1_pt, w1_pt, b1_pt)
+        y2_pt = y1_pt + x2_pt
+        y_pt = torch.cat([x3_pt, y2_pt], dim=cat_dim)
+        y = get_torch_empty_tensor(y_pt.shape, dtype)
+
+        module.run_with_tensors(
+            {
+                "X1": x1_pt,
+                "W1": w1_pt,
+                "B1": b1_pt,
+                "X2": x2_pt,
+                "X3": x3_pt,
+            },
+            [y],
+        )
+        self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+
+        # Now we run the generate executable
+        cwd = os.getcwd()
+        workdir = os.path.join(cwd, "tmp", test_name)
+        working_env = os.environ.copy()
+        if "LD_LIBRARY_PATH" in working_env:
+            working_env["LD_LIBRARY_PATH"] = (
+                working_env["LD_LIBRARY_PATH"] + ":" + workdir
+            )
+        else:
+            working_env["LD_LIBRARY_PATH"] = workdir
+        _LOGGER.info(f"work dir: {workdir}")
+        with subprocess.Popen(
+            ["./test.exe"],
+            shell=True,
+            cwd=workdir,
+            env=working_env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        ) as proc:
+            try:
+                timeout = 10
+                out, err = proc.communicate(timeout)
+            except subprocess.TimeoutExpired as e:
+                proc.kill()
+                out, err = proc.communicate()
+                raise e
+            finally:
+                stdout = out.decode()
+                stderr = err.decode()
+                if proc.returncode != 0:
+                    _LOGGER.info(f"stdout:\n\n{stdout}")
+                    _LOGGER.info(f"stderr:\n\n{stderr}")
+                    raise RuntimeError("failed to execute test.exe")
+                else:
+                    _LOGGER.info(f"stdout:\n\n{stdout}")
+                    all_output_lines = stdout.split("\n")
+                    output_lines = [
+                        line for line in all_output_lines if "output_0" in line
+                    ]
+                    self.assertTrue(len(output_lines) == 1)
+                    m = re.search("with shape: +([0-9,]+)", output_lines[0])
+                    self.assertTrue(m is not None)
+                    shape = m.group(1).split(",")
+                    self.assertTrue(int(shape[0]) == 8)
+                    self.assertTrue(int(shape[1]) == 32)
+
+    def test_gen_standalone_f16(self):
+        self._test_gen_standalone("gen_standalone_f16", "float16")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gen_standalone_f32(self):
+        self._test_gen_standalone("gen_standalone_f32", "float32")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 74e234f0d26ba4e93a34b816489a8aa0fac6e5ed Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 31 Jan 2023 13:45:25 -0800
Subject: [PATCH 032/638] add test for OSS resnet50 to internal CI (#158)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/158

using a random input and untrained model seems sufficient here

Reviewed By: chenyang78

Differential Revision: D42872904

fbshipit-source-id: ad58d83fc0610b8018f129ff23863ff625785252
---
 examples/01_resnet-50/test_correctness.py | 93 +++++++++++++++++++++++
 examples/01_resnet-50/weight_utils.py     |  4 +-
 2 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 examples/01_resnet-50/test_correctness.py

diff --git a/examples/01_resnet-50/test_correctness.py b/examples/01_resnet-50/test_correctness.py
new file mode 100644
index 000000000..8c46ec769
--- /dev/null
+++ b/examples/01_resnet-50/test_correctness.py
@@ -0,0 +1,93 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.testing import detect_target
+
+from .modeling.resnet import build_resnet_backbone
+from .weight_utils import timm_export
+
+
+def mark_output(y):
+    """Different to PyTorch, we need to explicit mark output tensor for optimization,
+
+    Parameters
+    ----------
+    y : List[Tensor]
+        List of output tensors
+    """
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+class ResNet50Verification(unittest.TestCase):
+    def test_resnet50(self):
+        target = detect_target()
+        batch_size = 1
+        torch_dtype = torch.float16
+        ait_dtype = "float16"
+        # Create input tensor, need to specify the shape, dtype and is_input flag
+        x = Tensor(
+            shape=[batch_size, 224, 224, 3],
+            dtype=ait_dtype,
+            name="input0",
+            is_input=True,
+        )
+        model = build_resnet_backbone(50, activation="ReLU")
+        # Mark all parameters with name same to PyTorch name convention
+        model.name_parameter_tensor()
+        # Forward the input tensor to the model, get output tensor
+        y = model(x)
+        # Mark output tensor
+        mark_output(y)
+
+        timm_exporter = timm_export("resnet50", pretrained=False)
+        ait_params = timm_exporter.export_model(half=torch_dtype == torch.float16)
+        pt_model = timm_exporter.pt_model.to(dtype=torch_dtype, device="cuda")
+        pt_model.eval()
+        module = compile_model(y, target, "./tmp", "resnet50")
+        for name, param in ait_params.items():
+            module.set_constant_with_tensor(name, param)
+
+        # ait model expects NHWC format
+        x_ait = torch.rand([batch_size, 224, 224, 3], dtype=torch_dtype, device="cuda")
+        # center the input wrt the training data for numerical stability
+        x_ait -= torch.tensor([0.485, 0.456, 0.406]).cuda()
+        x_ait /= torch.tensor([0.229, 0.224, 0.225]).cuda()
+        # torch model expects NCHW format
+        x_pt = torch.transpose(x_ait, 1, 3).contiguous()
+        with torch.no_grad():
+            y_pt = pt_model(x_pt)
+        y_ait = torch.zeros([batch_size, 1, 1, 1000], dtype=torch_dtype, device="cuda")
+        module.run_with_tensors([x_ait], [y_ait])
+
+        torch.testing.assert_close(
+            y_pt, y_ait.reshape([batch_size, 1000]), rtol=1e-1, atol=1e-1
+        )
+
+
+if __name__ == "__main__":
+    torch.cuda.manual_seed(0)
+    unittest.main()
diff --git a/examples/01_resnet-50/weight_utils.py b/examples/01_resnet-50/weight_utils.py
index beaebd330..252583192 100644
--- a/examples/01_resnet-50/weight_utils.py
+++ b/examples/01_resnet-50/weight_utils.py
@@ -31,14 +31,14 @@
 
 
 class timm_export(object):
-    def __init__(self, model_name):
+    def __init__(self, model_name, pretrained=True):
         self.model_name = model_name
         if model_name != "resnet50":
             raise NotImplementedError
 
         with torch.no_grad():
             self.pt_model = timm.create_model(
-                model_name, pretrained=True, num_classes=1000
+                model_name, pretrained=pretrained, num_classes=1000
             )
         self.pt_state = self.pt_model.state_dict()
 

From cb0e67cff234133a5304e8c44c476836022d7d27 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 31 Jan 2023 15:41:16 -0800
Subject: [PATCH 033/638] Update infer_shapes

Summary:
Update infer_shapes to make them more "accurate".
This is for preperation for the upcoming symbolic shape support.

Reviewed By: terrychenism

Differential Revision: D42845965

fbshipit-source-id: e4fdf0ea9e0b9af166cc0511733d5a1da3cf975e
---
 python/aitemplate/compiler/ops/conv/conv2d.py           | 2 +-
 python/aitemplate/compiler/ops/conv/conv3d.py           | 2 +-
 python/aitemplate/compiler/ops/conv/depthwise_conv3d.py | 2 +-
 python/aitemplate/compiler/ops/pool/pool2d.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 6265e57c7..38316ce25 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -228,7 +228,7 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x._attrs["shape"][0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index a3a6d00cf..d876948c1 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -200,7 +200,7 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x._attrs["shape"][0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 6d2737c01..9e6f6d9e2 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -186,7 +186,7 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x._attrs["shape"][0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
diff --git a/python/aitemplate/compiler/ops/pool/pool2d.py b/python/aitemplate/compiler/ops/pool/pool2d.py
index f3164e38b..f523f576a 100644
--- a/python/aitemplate/compiler/ops/pool/pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/pool2d.py
@@ -127,7 +127,7 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x._attrs["shape"][0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),

From a81e793923fbd7305601b37bb63f41c9e6cf42d3 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Tue, 31 Jan 2023 20:19:42 -0800
Subject: [PATCH 034/638] Depthwise conv with bias (#165)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/165

Reviewed By: frank-wei

Differential Revision: D42908803

Pulled By: terrychenism

fbshipit-source-id: 8e77c350e3a30be4ebb018f68ef40a8bf1dc0e0b
---
 .../backend/cuda/conv3d/__init__.py           |   4 +-
 .../backend/cuda/conv3d/common_bias.py        | 297 +++++++++++++
 .../cuda/conv3d/depthwise_conv3d_bias.py      | 396 ++++++++++++++++++
 .../compiler/ops/conv/depthwise_conv3d.py     |  11 +-
 tests/unittest/ops/test_depthwise_conv3d.py   |  41 +-
 5 files changed, 739 insertions(+), 10 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/conv3d/common_bias.py
 create mode 100644 python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py

diff --git a/python/aitemplate/backend/cuda/conv3d/__init__.py b/python/aitemplate/backend/cuda/conv3d/__init__.py
index ba1388ae4..84e693cc3 100644
--- a/python/aitemplate/backend/cuda/conv3d/__init__.py
+++ b/python/aitemplate/backend/cuda/conv3d/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA conv3d module init
 """
-from . import conv3d, depthwise_conv3d
+from . import conv3d, depthwise_conv3d, depthwise_conv3d_bias
 
-__all__ = ["conv3d", "depthwise_conv3d"]
+__all__ = ["conv3d", "depthwise_conv3d", "depthwise_conv3d_bias"]
diff --git a/python/aitemplate/backend/cuda/conv3d/common_bias.py b/python/aitemplate/backend/cuda/conv3d/common_bias.py
new file mode 100644
index 000000000..929d46fd2
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/common_bias.py
@@ -0,0 +1,297 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA conv3d common functions
+"""
+import re
+from hashlib import sha1
+from typing import List
+
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ....utils import alignment
+from ..conv2d.common import extract_config as conv2d_extract_config
+from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  {% if has_bias %}
+  void*,
+  {% endif %}
+  void*,
+  int64_t*, // kernel size
+  int64_t*,
+  int64_t*,
+  int, // strides
+  int,
+  int,
+  int, // padding
+  int,
+  int,
+  int, // dilation
+  int,
+  int,
+  int64_t*, // in_batch
+  int64_t*, // in_ch
+  int64_t*, // in_t
+  int64_t*, // in_h
+  int64_t*, // in_w
+  int64_t*, // out_ch
+  int64_t*, // out_t
+  int64_t*, // out_h
+  int64_t*, // out_w
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{% if has_bias %}
+{{indent}}    {{bias_ptr}},
+{% endif %}
+{{indent}}    {{out_ptr}},
+{{indent}}    {{p_kernel_t}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{stride_t}},
+{{indent}}    {{stride_h}},
+{{indent}}    {{stride_w}},
+{{indent}}    {{padding_t}},
+{{indent}}    {{padding_h}},
+{{indent}}    {{padding_w}},
+{{indent}}    {{dilation_t}},
+{{indent}}    {{dilation_h}},
+{{indent}}    {{dilation_w}},
+{{indent}}    {{p_in_batch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_in_t}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_out_t}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+def gen_function_decl(func_name):
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name, has_bias=True)
+
+
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    b = func_attrs["inputs"][2]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        has_bias=True,
+        bias_ptr=b._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_in_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[4]._attrs["name"],
+        p_kernel_t="&" + wshape[1]._attrs["name"],
+        p_kernel_h="&" + wshape[2]._attrs["name"],
+        p_kernel_w="&" + wshape[3]._attrs["name"],
+        p_in_t="&" + xshape[1]._attrs["name"],
+        p_in_h="&" + xshape[2]._attrs["name"],
+        p_in_w="&" + xshape[3]._attrs["name"],
+        p_out_t="&" + yshape[1]._attrs["name"],
+        p_out_h="&" + yshape[2]._attrs["name"],
+        p_out_w="&" + yshape[3]._attrs["name"],
+        stride_t=func_attrs["stride"][0],
+        stride_h=func_attrs["stride"][1],
+        stride_w=func_attrs["stride"][2],
+        padding_t=func_attrs["pad"][0],
+        padding_h=func_attrs["pad"][1],
+        padding_w=func_attrs["pad"][2],
+        dilation_t=func_attrs["dilate"][0],
+        dilation_h=func_attrs["dilate"][1],
+        dilation_w=func_attrs["dilate"][2],
+        indent=indent,
+    )
+
+
+KERNEL_KEY_TEMPLATE = jinja2.Template(
+    """
+cutlass{{opcode_class}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}}
+"""
+)
+
+
+def emit_instance(op):
+    """emit instance"""
+    import cutlass_lib
+
+    emiter = cutlass_lib.conv3d_operation.EmitConv3dInstance()
+    op_def = emiter.emit(op)
+    return op_def
+
+
+def extract_config(func_attrs, dtype="float16"):
+    """Extracts cutlass config for conv kernels."""
+    import cutlass_lib
+
+    return conv2d_extract_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        op_kind=cutlass_lib.library.OperationKind.Conv3d,
+        op_layout="ndhwc",
+    )
+
+
+def extract_config_name(config):
+    """Extracts config name from a given config."""
+    pattern = re.compile(r"\s*using\s(.*?)\s=")
+    decl = config.split("\n")[2]
+    match = pattern.match(decl)
+    if match is None:
+        raise RuntimeError("Invalid config: \n" + config)
+    return match.groups()[0]
+
+
+def gen_function(
+    func_attrs,
+    instance_template,
+    exec_template,
+    src_template,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+    f_emit_instance=emit_instance,
+    extra_header="",
+):
+    """Function definition codegen."""
+    func_name = func_attrs["name"]
+    exec_path = func_attrs["exec_path"]
+    op_instance = func_attrs["op_instance"]
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    inst_def_flag = set()
+    instances = {}
+    instance_decl = ""
+    for key, value in exec_path.items():
+        fname = "f" + sha1(key.encode()).hexdigest()
+        if value not in inst_def_flag:
+            config = f_emit_instance(op_instance[value])
+            inst_def_flag.add(value)
+        else:
+            continue
+        inst = instance_template.render(
+            config=config, name=fname, config_name=extract_config_name(config)
+        )
+        instances[key] = inst
+        instance_decl += inst
+    shape_eval_func = shape_eval_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        x_dim0="*batch",
+        x_dim1="*in_d",
+        x_dim2="*in_h",
+        x_dim3="*in_w",
+        x_dim4="*in_ch",
+        w_dim0="*out_ch",
+        w_dim1="*kernel_d",
+        w_dim2="*kernel_h",
+        w_dim3="*kernel_w",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilate_d="dilation_d",
+        dilate_h="dilation_h",
+        dilate_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+        div="/",
+    )
+    shape_save_func = shape_save_template.render(
+        indent="  ",
+        y_dim0="*out_batch",
+        y_dim1="*out_d",
+        y_dim2="*out_h",
+        y_dim3="*out_w",
+        y_dim4="*out_ch",
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = ""
+    for key in instances:
+        fname = "f" + sha1(key.encode()).hexdigest()
+        program = exec_template.render(indent="    ", instance=fname, dtype=dtype)
+        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_paths += exec_inst
+    return src_template.render(
+        instances=instance_decl,
+        function_name=func_name,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+        extra_header=extra_header,
+    )
+
+
+def cal_align_ab(x_shape: List[int], dtype="float16") -> int:
+    """Returns input alignment."""
+    k = x_shape[4]  # CI
+    return alignment.find_max_alignment(k, dtype)
+
+
+def function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    dtype = func_attrs["inputs"][0]._attrs["dtype"]
+    ab_alignment = cal_align_ab(x_shape, dtype=dtype)
+    tmp = cfg.split("_")
+    align_c = int(tmp[-1])
+    align_ab = int(tmp[-2])
+    if align_c != func_attrs["epilogue_alignment"]:
+        return False
+    if align_ab != ab_alignment:
+        return False
+    return True
diff --git a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py
new file mode 100644
index 000000000..1f6e682bc
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py
@@ -0,0 +1,396 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for depthwise_conv3d_bias.
+"""
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ... import registry
+from . import common_bias
+
+# pylint: disable=C0103,C0415,W0613,C0301,W0612
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/util/host_tensor.h"
+
+#include <algorithm>
+#include <limits>
+#include <assert.h>
+
+namespace {
+#define CUDA_KERNEL_LOOP(i, n)                                                                          \\
+    int64_t _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x;                                         \\
+    for (int64_t i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
+
+template <typename scalar_t, typename accscalar_t, typename Telement, int element_in_Tio, int kernel_k, int dil_d>
+__global__ void conv_depthwise3d_cuda_kernel(
+    const scalar_t * input,
+    const {{dtype}}* kernel,
+    {% if has_bias %}
+    const {{dtype}}* bias,
+    {% endif %}
+    scalar_t * output,
+    int _kT, int _kH, int _kW,
+    int strideT, int strideH, int strideW,
+    int paddingT, int paddingH, int paddingW,
+    int _dilationT, int _dilationH, int _dilationW,
+    int iC, int iT, int iH, int iW,
+    int oT, int oH, int oW,
+    int num_outputs)
+{
+  int kT = kernel_k > 0? kernel_k: _kT;
+  int kH = kernel_k > 0? kernel_k: _kH;
+  int kW = kernel_k > 0? kernel_k: _kW;
+
+  int dilationT = dil_d > 0? dil_d: _dilationT;
+  int dilationH = dil_d > 0? dil_d: _dilationH;
+  int dilationW = dil_d > 0? dil_d: _dilationW;
+
+  const int oC = iC;
+  const int channel_multiplier = 1;
+
+  CUDA_KERNEL_LOOP(index, num_outputs) {
+    const int out_channel = index  % oC;
+    const int out_col = (index / oC) % oW;
+    const int out_row = (index / oC / oW) % oH;
+    const int out_frame = (index / oC / oW / oH) % oT;
+    const int batch = index / oC / oW / oH / oT;
+
+    const int in_channel = out_channel / channel_multiplier;
+
+    const int in_col_start = out_col * strideW - paddingW;
+    const int in_row_start = out_row * strideH - paddingH;
+    const int in_frame_start = out_frame * strideT - paddingT;
+
+    const int in_offset = in_channel + iC * (in_col_start + iW * (in_row_start + iH * (in_frame_start + iT* batch)));
+    const int out_offset = out_channel + oC * (out_col + oW * (out_row + oH * (out_frame + oT* batch)));
+
+    accscalar_t sum[element_in_Tio];
+    for (int tk = 0; tk < element_in_Tio; tk++){
+        sum[tk] = 0;
+    }
+    const {{dtype}} *kernel_ptr = kernel + out_channel * element_in_Tio * kT * kH * kW;
+    const scalar_t *input_ptr = input + in_offset;
+    for (int k_frame = 0; k_frame < kT; ++k_frame) {
+      const int in_frame = in_frame_start + k_frame * dilationT;
+      for (int k_row = 0; k_row < kH; ++k_row) {
+        const int in_row = in_row_start + k_row * dilationH;
+        for (int k_col = 0; k_col < kW; ++k_col) {
+          const int in_col = in_col_start + k_col * dilationW;
+          if (in_frame >= 0 && in_row >= 0 && in_col >= 0 &&
+              in_frame < iT && in_row < iH && in_col < iW) {
+            scalar_t input_val = __ldg(input_ptr);
+            Telement* pack_input = reinterpret_cast<Telement*>(&input_val);
+
+            for (int tk = 0; tk < element_in_Tio; tk++){
+              {% if dtype == "half" %}
+                accscalar_t op1 = __half2float(pack_input[tk]);
+                sum[tk] += op1 * __half2float(kernel_ptr[tk*kT*kH*kW]);
+              {% elif dtype == "float" %}
+                accscalar_t op1 = pack_input[tk];
+                sum[tk] += op1 * kernel_ptr[tk*kT*kH*kW];
+              {% endif %}
+            }
+          }
+          kernel_ptr += 1;
+          input_ptr += dilationW * iC;
+        }
+        input_ptr += iC * (iW * dilationH - kW * dilationW);
+      }
+      input_ptr += iC * iW * (iH * dilationT - kH * dilationH);
+    }
+
+    {% if has_bias %}
+      const {{dtype}} *bias_ptr = bias + out_channel * element_in_Tio;
+    {% endif %}
+
+
+    scalar_t output_val;
+    Telement* pack_output = reinterpret_cast<Telement*>(&output_val);
+    for (int tk = 0; tk < element_in_Tio; tk++){
+      {% if dtype == "half" %}
+        {% if has_bias %}
+          pack_output[tk] = __float2half(sum[tk]) + bias_ptr[tk];
+        {% else %}
+          pack_output[tk] = __float2half(sum[tk]);
+        {% endif %}
+      {% elif dtype == "float" %}
+        {% if has_bias %}
+          pack_output[tk] = sum[tk] + bias_ptr[tk];
+        {% else %}
+          pack_output[tk] = sum[tk];
+        {% endif %}
+      {% endif %}
+    }
+    output[out_offset] = output_val;
+  }
+}
+
+#define NODEF_OR_EQUAL(x, y) ((y) < 0 || (x) == (y))
+#define NODEF_OR_EQUAL_3(x, y1, y2, y3) \\
+  (NODEF_OR_EQUAL(x, y1) && \\
+   NODEF_OR_EQUAL(x, y2) && \\
+   NODEF_OR_EQUAL(x, y3))
+
+
+#define DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(kernel_k, dil_d)                       \\
+  if (NODEF_OR_EQUAL_3(kernel_k, (kernel_t), (kernel_h), (kernel_w)) &&                 \\
+      NODEF_OR_EQUAL_3(dil_d, (dilation_t), (dilation_h), (dilation_w))) {              \\
+    conv_depthwise3d_cuda_kernel                                        \\
+    <scalar_t, accscalar_t, Telement, element_in_Tio, kernel_k, dil_d>  \\
+    <<<grid, block, (smem), stream>>>(                                  \\
+      (const scalar_t *)input,                                          \\
+      weight,                                                           \\
+      {% if has_bias %}                                                 \\
+      bias,                                                             \\
+      {% endif %}                                                       \\
+      (scalar_t *)output,                                               \\
+      kernel_t, kernel_h, kernel_w,                                     \\
+      stride_t, stride_h, stride_w,                                     \\
+      padding_t, padding_h, padding_w,                                  \\
+      dilation_t, dilation_h, dilation_w,                               \\
+      c, t, h, w,                                                       \\
+      to, ho, wo,                                                       \\
+      num_outputs);                                                     \\
+  } else                                                                \\
+
+#define DWCONV3D_FORWARD_DISPATCH_OTHERS                                \\
+  {                                                                     \\
+    conv_depthwise3d_cuda_kernel                                        \\
+    <scalar_t, accscalar_t, Telement, element_in_Tio, -1, -1>           \\
+    <<<grid, block, (smem), stream>>>(                                  \\
+      (const scalar_t *)input,                                          \\
+      weight,                                                           \\
+      {% if has_bias %}                                                 \\
+      bias,                                                             \\
+      {% endif %}                                                       \\
+      (scalar_t *)output,                                               \\
+      kernel_t, kernel_h, kernel_w,                                     \\
+      stride_t, stride_h, stride_w,                                     \\
+      padding_t, padding_h, padding_w,                                  \\
+      dilation_t, dilation_h, dilation_w,                               \\
+      c, t, h, w,                                                       \\
+      to, ho, wo,                                                       \\
+      num_outputs);}                                                    \\
+
+
+void conv_depthwise3d_launcher(
+    const {{dtype}} * input,
+    const {{dtype}} * weight,
+    {% if has_bias %}
+    const {{dtype}} * bias,
+    {% endif %}
+    {{dtype}} * output,
+    int kernel_t,
+    int kernel_h,
+    int kernel_w,
+    int stride_t,
+    int stride_h,
+    int stride_w,
+    int padding_t,
+    int padding_h,
+    int padding_w,
+    int dilation_t,
+    int dilation_h,
+    int dilation_w,
+    int n,
+    int c,
+    int t,
+    int h,
+    int w,
+    int to,
+    int ho,
+    int wo,
+    cudaStream_t stream
+    ) {
+
+  assert(to > 0);
+  assert(ho > 0);
+  assert(wo > 0);
+
+  int64_t num_outputs = n * to * ho * wo * c;
+  int64_t block = 256;
+  int64_t grid = std::min((num_outputs - 1) / block + 1, (int64_t)65536);
+
+  int64_t num_inputs = n * t * h * w * c;
+  int64_t num_weights = c * kernel_t * kernel_h * kernel_w;
+  int64_t smem = 0;
+
+  // Range check to avoid overflow in CUDA kernels.
+  assert((num_inputs <= std::numeric_limits<int32_t>::max()) &&
+              "Input tensor is too large.");
+  assert((num_outputs <= std::numeric_limits<int32_t>::max()) &&
+              "Output tensor is too large.");
+  assert((num_weights <= 1024*8) &&
+              "Weight tensor is too large.");
+
+  assert((padding_t * 2 + t <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+  assert((padding_h * 2 + h <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+  assert((padding_w * 2 + w <= std::numeric_limits<int32_t>::max()) &&
+                "Padded input tensor is too large.");
+
+
+  using accscalar_t = float;
+{% if dtype == "half" %}
+  using Telement = half;
+  {% if csize == 0 %}
+    using scalar_t = float4;
+    c = c/8;
+    num_outputs = num_outputs/8;
+    #define element_in_Tio 8
+  {% elif csize == 2 %}
+    using scalar_t = half2;
+    c =c/2;
+    num_outputs = num_outputs/2;
+    #define element_in_Tio 2
+  {% else %}
+    using scalar_t = half;
+    #define element_in_Tio 1
+  {% endif %}
+{% elif dtype == "float" %}
+  using Telement = float;
+  {% if csize == 2 %}
+    using scalar_t = float2;
+    c =c/2;
+    num_outputs = num_outputs/2;
+    #define element_in_Tio 2
+  {% else %}
+    using scalar_t = float;
+    #define element_in_Tio 1
+  {% endif %}
+{% endif %}
+
+  DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(3, 1)
+  DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION(-1, 1)
+  DWCONV3D_FORWARD_DISPATCH_OTHERS
+}
+
+#undef DWCONV3D_FORWARD_DISPATCH_SPECIALIZATION
+#undef DWCONV3D_FORWARD_DISPATCH_OTHERS
+#undef CUDA_KERNEL_LOOP
+} // namespace
+
+void {{function_name}} (
+    void* in_ptr,
+    void* weight_ptr,
+{% if has_bias %}
+    void* bias_ptr,
+{% endif %}
+    void* out_ptr,
+    int64_t* p_kt,
+    int64_t* p_kh,
+    int64_t* p_kw,
+    int stride_t,
+    int stride_h,
+    int stride_w,
+    int padding_t,
+    int padding_h,
+    int padding_w,
+    int dilation_t,
+    int dilation_h,
+    int dilation_w,
+    int64_t* p_batch,
+    int64_t* p_in_ch,
+    int64_t* p_in_t,
+    int64_t* p_in_h,
+    int64_t* p_in_w,
+    int64_t* p_out_ch,
+    int64_t* p_out_t,
+    int64_t* p_out_h,
+    int64_t* p_out_w,
+    cudaStream_t stream
+) {
+  int kt = *p_kt;
+  int kh = *p_kh;
+  int kw = *p_kw;
+  int batch = *p_batch;
+  int in_ch = *p_in_ch;
+  int in_t = *p_in_t;
+  int in_h = *p_in_h;
+  int in_w = *p_in_w;
+  int out_ch = *p_out_ch;
+  int out_t = *p_out_t;
+  int out_h = *p_out_h;
+  int out_w = *p_out_w;
+
+  conv_depthwise3d_launcher(
+    (const {{dtype}}*)in_ptr,
+    (const {{dtype}}*)weight_ptr,
+    {% if has_bias %}
+    (const {{dtype}}*)bias_ptr,
+    {% endif %}
+    ({{dtype}}*)out_ptr,
+    kt,
+    kh,
+    kw,
+    stride_t,
+    stride_h,
+    stride_w,
+    padding_t,
+    padding_h,
+    padding_w,
+    dilation_t,
+    dilation_h,
+    dilation_w,
+    batch,
+    in_ch,
+    in_t,
+    in_h,
+    in_w,
+    out_t,
+    out_h,
+    out_w,
+    stream
+  );
+
+  return;
+}
+"""
+)
+
+
+@registry.reg("cuda.depthwise_conv3d_bias.gen_function")
+def gen_function(func_attrs):
+    func_name = func_attrs["name"]
+    has_bias = func_attrs["bias"]
+    csize = func_attrs["group"] % 8
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        csize=csize,
+        has_bias=has_bias,
+        dtype=dtype,
+    )
+
+
+@registry.reg("cuda.depthwise_conv3d_bias.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return common_bias.gen_function_decl(func_name)
+
+
+@registry.reg("cuda.depthwise_conv3d_bias.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    return common_bias.gen_function_call(func_attrs, indent)
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 9e6f6d9e2..57c9f5ba9 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -94,7 +94,7 @@
 class depthwise_conv3d(Operator):
     r"""depthwise_conv3d"""
 
-    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+    def __init__(self, stride, pad, dilate=1, group=1, bias=False) -> None:
         """Conv3d constructor.
 
         Parameters
@@ -110,7 +110,7 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
             channels to output channels, by default 1
         """
         super().__init__()
-        self._attrs["op"] = "depthwise_conv3d"
+        self._attrs["op"] = "depthwise_conv3d_bias" if bias else "depthwise_conv3d"
         self._attrs["stride"] = stride
         if isinstance(stride, int):
             self._attrs["stride"] = (stride, stride, stride)
@@ -126,6 +126,7 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         self._attrs["epilogue"] = "LinearCombination"
         self._attrs["workspace"] = 0
         self._attrs["split_k"] = None
+        self._attrs["bias"] = bias
         self.shape_eval_template = SHAPE_FUNC_TEMPLATE
         self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
         self.exec_key_template = EXEC_KEY_TEMPLATE
@@ -247,7 +248,7 @@ def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
         elif shape % 2 == 0:
             self._attrs["epilogue_alignment"] = 2
 
-    def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
+    def __call__(self, x: Tensor, w: Tensor, bias: Tensor = None) -> List[Tensor]:
         """Call depthwise_conv3d with tensors x, w
 
         Parameters
@@ -263,6 +264,8 @@ def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
             includes the output tensor in shape (N, T_out, H_out, W_out, C_out)
         """
         self._attrs["inputs"] = [x, w]
+        if self._attrs["bias"]:
+            self._attrs["inputs"].append(bias)
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)
@@ -272,7 +275,7 @@ def __call__(self, x: Tensor, w: Tensor) -> List[Tensor]:
         return output
 
     def _get_op_attributes(self) -> Dict[str, Any]:
-        target_attrs = ["dilate", "group", "pad", "stride"]
+        target_attrs = ["dilate", "group", "pad", "stride", "bias"]
         attr = {}
 
         for target_attr in target_attrs:
diff --git a/tests/unittest/ops/test_depthwise_conv3d.py b/tests/unittest/ops/test_depthwise_conv3d.py
index 73cadeec3..cabd697df 100644
--- a/tests/unittest/ops/test_depthwise_conv3d.py
+++ b/tests/unittest/ops/test_depthwise_conv3d.py
@@ -27,6 +27,7 @@ class Conv3dDepthwiseTestCase(unittest.TestCase):
     def _test_depthwise_conv3d(
         self,
         batch=4,
+        bias=False,
         copy_op=False,
         test_name="depthwise_conv3d",
         dtype="float16",
@@ -45,21 +46,40 @@ def _test_depthwise_conv3d(
             name="input_1",
             is_input=True,
         )
-        OP = ops.depthwise_conv3d(stride=1, pad=1, dilate=1, group=groups)
+        if bias:
+            B = Tensor(
+                shape=[co],
+                dtype=dtype,
+                name="input_2",
+                is_input=True,
+            )
+
+        OP = ops.depthwise_conv3d(stride=1, pad=1, dilate=1, group=groups, bias=bias)
         if copy_op:
             OP = ops.depthwise_conv3d(**OP._get_op_attributes())
-        Y = OP(X, W)
+        Y = OP(X, W, B) if bias else OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", test_name)
 
         X_pt = get_random_torch_tensor([batch, ci, tt, hh, ww], dtype=dtype)
         W_pt = get_random_torch_tensor([co, 1, 3, 3, 3], dtype=dtype)
-        Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, padding=1, groups=groups)
+        if bias:
+            bias_pt = get_random_torch_tensor([co], dtype=dtype)
+            Y_pt = torch.nn.functional.conv3d(
+                X_pt, W_pt, bias_pt, padding=1, groups=groups
+            )
+        else:
+            Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, padding=1, groups=groups)
         x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
         y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
-        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+        if bias:
+            module.run_with_tensors(
+                {"input_0": x, "input_1": w, "input_2": bias_pt}, [y]
+            )
+        else:
+            module.run_with_tensors({"input_0": x, "input_1": w}, [y])
         y_transpose = y.permute((0, 4, 1, 2, 3))
 
         self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
@@ -75,6 +95,19 @@ def test_fp16(self):
             dtype="float16",
         )
 
+    def test_fp16_bias(self):
+        self._test_depthwise_conv3d(
+            test_name="depthwise_conv3d_bias_fp16",
+            bias=True,
+            dtype="float16",
+        )
+        self._test_depthwise_conv3d(
+            copy_op=True,
+            bias=True,
+            test_name="depthwise_conv3d_bias_fp16",
+            dtype="float16",
+        )
+
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
     def test_fp32(self):
         self._test_depthwise_conv3d(

From b6c3b94c9b6e0416ccad77a06775d9930b24b88c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 31 Jan 2023 21:28:09 -0800
Subject: [PATCH 035/638] enforce Meta copyright headers for fx2ait (#161)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/161

Reviewed By: terrychenism, wushirong

Differential Revision: D42894350

Pulled By: chenyang78

fbshipit-source-id: dc8a70ccc764432080a5ca17db717ef3b708b383
---
 .github/workflows/pylint.yaml                 |  1 +
 fx2ait/fx2ait/acc_tracer/acc_normalizer.py    | 14 +++++++++++++
 fx2ait/fx2ait/acc_tracer/acc_op_properties.py | 14 +++++++++++++
 fx2ait/fx2ait/acc_tracer/acc_ops.py           | 16 +++++++++++++--
 fx2ait/fx2ait/acc_tracer/acc_shape_prop.py    | 14 +++++++++++++
 fx2ait/fx2ait/acc_tracer/acc_tracer.py        | 15 +++++++++++++-
 fx2ait/fx2ait/acc_tracer/acc_utils.py         | 14 +++++++++++++
 .../fx2ait/acc_tracer/ait_acc_normalizer.py   | 14 +++++++++++++
 fx2ait/fx2ait/acc_tracer/ait_acc_ops.py       | 14 +++++++++++++
 .../fx2ait/acc_tracer/ait_acc_ops_registry.py | 20 ++++++++++++++++---
 fx2ait/fx2ait/ait_module.py                   | 14 +++++++++++++
 fx2ait/fx2ait/ait_splitter.py                 | 14 +++++++++++++
 fx2ait/fx2ait/cache.py                        | 14 +++++++++++++
 fx2ait/fx2ait/converters/__init__.py          | 14 +++++++++++++
 fx2ait/fx2ait/converters/ait_converters.py    | 14 +++++++++++++
 .../converters/ait_module_converters.py       | 14 +++++++++++++
 .../fx2ait/converters/aten2ait_converters.py  | 14 +++++++++++++
 .../fx2ait/converters/converter_registry.py   | 14 +++++++++++++
 fx2ait/fx2ait/converters/utils.py             | 14 +++++++++++++
 fx2ait/fx2ait/example/benchmark_utils.py      | 15 +++++++++++++-
 fx2ait/fx2ait/fx2ait.py                       | 15 +++++++++++++-
 fx2ait/fx2ait/lower/lower.py                  | 15 +++++++++++++-
 fx2ait/fx2ait/lower/lower_settings.py         | 14 +++++++++++++
 fx2ait/fx2ait/passes/lower_basic_pass_aten.py | 14 +++++++++++++
 fx2ait/fx2ait/tensor_spec.py                  | 14 +++++++++++++
 .../test_ait_transformer_model.py             | 14 +++++++++++++
 .../converters_model/test_ait_vision_model.py | 14 +++++++++++++
 .../test_ait_multihead_attention.py           | 14 +++++++++++++
 .../test_ait_adaptive_avg_pool2d.py           | 14 +++++++++++++
 .../test/converters/test_ait_avg_pool2d.py    | 14 +++++++++++++
 .../test/converters/test_ait_batch_norm.py    | 14 +++++++++++++
 .../test/converters/test_ait_binary_op.py     | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_chunk.py  | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_clamp.py  | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_common.py | 14 +++++++++++++
 .../test/converters/test_ait_contiguous.py    | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_conv2d.py | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_expand.py | 14 +++++++++++++
 .../test/converters/test_ait_flatten.py       | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_gelu.py   | 14 +++++++++++++
 .../test/converters/test_ait_layer_norm.py    | 14 +++++++++++++
 .../test/converters/test_ait_leaky_relu.py    | 14 +++++++++++++
 .../test/converters/test_ait_linalg_norm.py   | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_linear.py | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_matmul.py | 14 +++++++++++++
 .../test/converters/test_ait_max_pool2d.py    | 14 +++++++++++++
 .../test/converters/test_ait_nan2num.py       | 14 +++++++++++++
 fx2ait/fx2ait/test/converters/test_ait_pow.py | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_reduce.py | 14 +++++++++++++
 .../test/converters/test_ait_sigmoid.py       | 14 +++++++++++++
 .../test/converters/test_ait_slice_tensor.py  | 14 +++++++++++++
 .../test/converters/test_ait_softmax.py       | 14 +++++++++++++
 .../test/converters/test_ait_squeeze.py       | 14 +++++++++++++
 .../fx2ait/test/converters/test_ait_tile.py   | 14 +++++++++++++
 .../test/converters/test_ait_unary_ops.py     | 14 +++++++++++++
 fx2ait/fx2ait/test/converters/test_ait_var.py | 14 +++++++++++++
 .../test_ait_adaptive_avg_pool2d_aten.py      | 14 +++++++++++++
 .../test_ait_avg_pool2d_aten.py               | 14 +++++++++++++
 .../test_ait_batch_norm_aten.py               | 14 +++++++++++++
 .../test_ait_binary_op_aten.py                | 14 +++++++++++++
 .../test/converters_aten/test_ait_cat_aten.py | 14 +++++++++++++
 .../converters_aten/test_ait_chunk_aten.py    | 14 +++++++++++++
 .../converters_aten/test_ait_clamp_aten.py    | 14 +++++++++++++
 .../converters_aten/test_ait_conv2d_aten.py   | 14 +++++++++++++
 .../converters_aten/test_ait_flatten_aten.py  | 14 +++++++++++++
 .../test_ait_layer_norm_aten.py               | 14 +++++++++++++
 .../converters_aten/test_ait_linear_aten.py   | 14 +++++++++++++
 .../converters_aten/test_ait_matmul_aten.py   | 14 +++++++++++++
 .../test_ait_max_pool2d_aten.py               | 14 +++++++++++++
 .../converters_aten/test_ait_model_aten.py    | 14 +++++++++++++
 .../converters_aten/test_ait_nan2num_aten.py  | 14 +++++++++++++
 .../converters_aten/test_ait_permute_aten.py  | 14 +++++++++++++
 .../test/converters_aten/test_ait_pow_aten.py | 14 +++++++++++++
 .../converters_aten/test_ait_reduce_aten.py   | 14 +++++++++++++
 .../converters_aten/test_ait_relu_aten.py     | 16 +++++++++++++--
 .../converters_aten/test_ait_reshape_aten.py  | 14 +++++++++++++
 .../converters_aten/test_ait_size_aten.py     | 14 +++++++++++++
 .../test_ait_slice_tensor_aten.py             | 14 +++++++++++++
 .../converters_aten/test_ait_split_aten.py    | 14 +++++++++++++
 .../converters_aten/test_ait_squeeze_aten.py  | 14 +++++++++++++
 .../test_ait_unary_ops_aten.py                | 14 +++++++++++++
 fx2ait/fx2ait/test/test_ait_lower.py          | 14 +++++++++++++
 fx2ait/fx2ait/test/test_tensor_spec.py        | 14 +++++++++++++
 fx2ait/fx2ait/tools/ait_minimizer.py          | 14 +++++++++++++
 fx2ait/fx2ait/tools/common_aten2ait.py        | 16 +++++++++++++--
 fx2ait/fx2ait/tools/common_fx2ait.py          | 15 +++++++++++++-
 fx2ait/fx2ait/utils.py                        | 14 +++++++++++++
 fx2ait/setup.py                               | 15 ++++++++++++++
 88 files changed, 1223 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml
index be97139fa..91f0018eb 100644
--- a/.github/workflows/pylint.yaml
+++ b/.github/workflows/pylint.yaml
@@ -37,3 +37,4 @@ jobs:
       run: |
         python tests/lint/check_meta_header.py --path=./tests --fixit=False
         python tests/lint/check_meta_header.py --path=./python --fixit=False
+        python tests/lint/check_meta_header.py --path=./fx2ait --fixit=False
diff --git a/fx2ait/fx2ait/acc_tracer/acc_normalizer.py b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
index 55cb39d4a..f1a96d0c4 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import inspect
 import logging
 import re
diff --git a/fx2ait/fx2ait/acc_tracer/acc_op_properties.py b/fx2ait/fx2ait/acc_tracer/acc_op_properties.py
index 8160cfe9f..895ad9a97 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_op_properties.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_op_properties.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from collections import defaultdict
 from enum import auto, Flag
 from typing import Callable, DefaultDict, Set
diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index d625643e3..7f6395353 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # encoding: utf-8
 import operator
 
@@ -1999,7 +2013,6 @@ def linalg_norm(*, input, ord, dim, keepdim):
     ],
 )
 def norm_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
-
     input_node = node.kwargs["input"]
     p = node.kwargs["p"]
     dim = node.kwargs["dim"]
@@ -3042,7 +3055,6 @@ def xl_weight(weight_id: str, metadata: TensorMetadata, proxy_shape, dtype):
 )
 def log_softmax_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
     with node.graph.inserting_after(node):
-
         softmax_kwargs = {
             "input": node.kwargs["input"],
             "dim": node.kwargs["dim"],
diff --git a/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py b/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
index ddea8c847..21fbd8415 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_shape_prop.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import os
 import sys
 from typing import Any
diff --git a/fx2ait/fx2ait/acc_tracer/acc_tracer.py b/fx2ait/fx2ait/acc_tracer/acc_tracer.py
index bcbd7c1c2..b5899f89b 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_tracer.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_tracer.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import ast
 import builtins
 import copy
@@ -56,7 +70,6 @@ def __init__(self):
     def rewrite(
         self, fn: FunctionType
     ) -> Tuple[FunctionType, Set[Type[Exception]], Set[Type[Exception]]]:
-
         # Normalize the source lines
         sourcelines, _ = inspect.getsourcelines(fn)
         sourcelines = normalize_source_lines(sourcelines)
diff --git a/fx2ait/fx2ait/acc_tracer/acc_utils.py b/fx2ait/fx2ait/acc_tracer/acc_utils.py
index 586c20b6c..21646af9e 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_utils.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_utils.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import inspect
 import logging
 import os
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py b/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
index 915505491..cd963547e 100644
--- a/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_normalizer.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from .acc_ops import *  # isort:skip # noqa: F403 F401
 from .ait_acc_ops import *  # noqa: F403 F401
 import logging
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
index b291ebdc6..f162520e5 100644
--- a/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 
 from fx2ait.acc_tracer.acc_normalizer import register_acc_op
diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py b/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
index c7f96b7ed..6417d8431 100644
--- a/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_ops_registry.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
 
 
@@ -86,7 +100,7 @@ def get_ait_acc_op_mappers() -> Dict[Tuple[str, Union[str, Callable]], AitAccOpM
     return _AIT_ACC_OP_MAPPERS
 
 
-def get_custom_ait_acc_op_mappers() -> Dict[
-    Tuple[str, Union[str, Callable]], CustomAitAccOpMapper
-]:
+def get_custom_ait_acc_op_mappers() -> (
+    Dict[Tuple[str, Union[str, Callable]], CustomAitAccOpMapper]
+):
     return _CUSTOM_AIT_ACC_OP_MAPPERS
diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
index ba1376605..b214a6bb2 100644
--- a/fx2ait/fx2ait/ait_module.py
+++ b/fx2ait/fx2ait/ait_module.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import List
 
 import torch
diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index e05315118..d946a2416 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Any, Dict, Iterable, Mapping, Sequence
 
 import torch
diff --git a/fx2ait/fx2ait/cache.py b/fx2ait/fx2ait/cache.py
index 88fd2b3db..e87d286ad 100644
--- a/fx2ait/fx2ait/cache.py
+++ b/fx2ait/fx2ait/cache.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import os.path as path
 
 
diff --git a/fx2ait/fx2ait/converters/__init__.py b/fx2ait/fx2ait/converters/__init__.py
index d463837ae..8f62fbc1e 100644
--- a/fx2ait/fx2ait/converters/__init__.py
+++ b/fx2ait/fx2ait/converters/__init__.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from .ait_converters import *  # noqa: F401 F403
 from .aten2ait_converters import *  # noqa: F401 F403
 from .ait_module_converters import *  # noqa: F401 F403
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index cd62f4378..5405f5046 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import logging
 import math
 import operator
diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index e4bba9013..a869b18ae 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Any, Dict, OrderedDict, Tuple
 
 import numpy as np
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 986ae5196..1f8cd02ee 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import logging
 import torch  # isort:skip
 import operator
diff --git a/fx2ait/fx2ait/converters/converter_registry.py b/fx2ait/fx2ait/converters/converter_registry.py
index 0b4902cde..11663efa0 100644
--- a/fx2ait/fx2ait/converters/converter_registry.py
+++ b/fx2ait/fx2ait/converters/converter_registry.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Any, Callable, Dict
 
 from torch.fx.node import Target
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index f4de0e387..3573c102f 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import math
 import operator
 from typing import Any, Callable, Dict, List, Tuple, Union
diff --git a/fx2ait/fx2ait/example/benchmark_utils.py b/fx2ait/fx2ait/example/benchmark_utils.py
index bd2b8379f..25ea436b5 100644
--- a/fx2ait/fx2ait/example/benchmark_utils.py
+++ b/fx2ait/fx2ait/example/benchmark_utils.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import time
 
 import uuid
@@ -102,7 +116,6 @@ def benchmark_function(
     permute_inputs: Optional[List[int]] = None,
     ait_mod: torch.nn.Module = None,
 ) -> float:
-
     mod.eval()
     original_inputs = inputs
     if permute_inputs:
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index 50b4fcf9a..75b3a2324 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import io
 import logging
 import os
@@ -312,7 +326,6 @@ def output(self, target, args, kwargs):
             outputs = (args[0],)
 
         for i, output in enumerate(outputs):
-
             name = f"output_{i}"
             output._attrs["name"] = name
             output._attrs["is_output"] = True
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
index 047535a48..2b2ca18d5 100644
--- a/fx2ait/fx2ait/lower/lower.py
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import dataclasses as dc
 import datetime
 import logging
@@ -48,7 +62,6 @@ def __call__(
         mod: fx.GraphModule,
         inputs: List[torch.Tensor],
     ) -> AITInterpreterResult:
-
         (additional_inputs,) = self.lower_settings.additional_inputs
         if additional_inputs is None:
             input_specs = TensorSpec.from_input_list_with_batch_size(
diff --git a/fx2ait/fx2ait/lower/lower_settings.py b/fx2ait/fx2ait/lower/lower_settings.py
index 75b152dc6..685f6c19a 100644
--- a/fx2ait/fx2ait/lower/lower_settings.py
+++ b/fx2ait/fx2ait/lower/lower_settings.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import dataclasses as dc
 from enum import Enum
 from typing import Any, List, Optional, Set, Type
diff --git a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
index f03d9f9f0..a66a28704 100644
--- a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
+++ b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import logging
 import operator
 from typing import Any, NamedTuple
diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index b9805842a..8827e38af 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from functools import reduce
 from typing import Any, List
 
diff --git a/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py b/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
index e209e5c45..afca07a91 100644
--- a/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
+++ b/fx2ait/fx2ait/test/converters/converters_model/test_ait_transformer_model.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tools.common_fx2ait import AITTestCase
 
diff --git a/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
index 8453a97f4..4afa00218 100644
--- a/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
+++ b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 import torchvision
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
index 1cf935379..163d1f2e3 100644
--- a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
index 8c29ddbfb..db535e1d2 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 #!/usr/bin/env fbpython
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
index 298c5f9de..846ec3987 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.acc_tracer import acc_ops
diff --git a/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
index b6751d0a8..221511f06 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
index 54c54b840..95a1ac4a2 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import operator
 from typing import Callable, List, Tuple, Union
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_chunk.py b/fx2ait/fx2ait/test/converters/test_ait_chunk.py
index 905d0ebf4..6849df4f2 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_chunk.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_chunk.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_clamp.py b/fx2ait/fx2ait/test/converters/test_ait_clamp.py
index b7858a9cd..9d6cd8a71 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_clamp.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_clamp.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_common.py b/fx2ait/fx2ait/test/converters/test_ait_common.py
index 11f65c160..fc3598647 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_common.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_common.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Callable, List, Union
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters/test_ait_contiguous.py b/fx2ait/fx2ait/test/converters/test_ait_contiguous.py
index aff2acfad..a58816747 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_contiguous.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_contiguous.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
index a1b5603e3..eaaa7bf30 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters/test_ait_expand.py b/fx2ait/fx2ait/test/converters/test_ait_expand.py
index b1e7509a9..cab5d3ca9 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_expand.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_expand.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_flatten.py b/fx2ait/fx2ait/test/converters/test_ait_flatten.py
index 698e42edd..405f65307 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_flatten.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_flatten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_gelu.py b/fx2ait/fx2ait/test/converters/test_ait_gelu.py
index 8239d873f..444d885d2 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_gelu.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_gelu.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
index 99a7028f6..4829634d2 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py b/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
index f0bda2c40..9250099d9 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_leaky_relu.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.acc_tracer import acc_ops
diff --git a/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
index a8b70d263..90d5d5b4a 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_linear.py b/fx2ait/fx2ait/test/converters/test_ait_linear.py
index 5112283d2..1fed2bbef 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_linear.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_linear.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_matmul.py b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
index a96bc0eef..5aed7240c 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_matmul.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
index 4dfc745b3..aca28f8b5 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.acc_tracer import acc_ops
diff --git a/fx2ait/fx2ait/test/converters/test_ait_nan2num.py b/fx2ait/fx2ait/test/converters/test_ait_nan2num.py
index 9a2ae5b01..d2d1c12dc 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_nan2num.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_nan2num.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_pow.py b/fx2ait/fx2ait/test/converters/test_ait_pow.py
index 148c45bdd..a6f75b28b 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_pow.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_pow.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_reduce.py b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
index e9160e9cc..b79a2c9e7 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_reduce.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py b/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
index fcbc80600..bd61e7e3c 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_sigmoid.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index 32d304e2e..ef62c6d41 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_softmax.py b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
index 8c03e5eb1..aca9dc12a 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_softmax.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_squeeze.py b/fx2ait/fx2ait/test/converters/test_ait_squeeze.py
index b8b3c0183..e0f87a95e 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_squeeze.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_squeeze.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_tile.py b/fx2ait/fx2ait/test/converters/test_ait_tile.py
index 8e1f53db4..40840671e 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_tile.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_tile.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index fb12d24d5..12eac27b6 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import math
 from typing import Callable
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_var.py b/fx2ait/fx2ait/test/converters/test_ait_var.py
index af4073073..757b0fa04 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_var.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_var.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
index f334fd215..2d1e8a635 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_adaptive_avg_pool2d_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 #!/usr/bin/env fbpython
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
index 4a380de64..681f8b407 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_avg_pool2d_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.tensor_spec import TensorSpec
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
index 10caaded3..d94b066d3 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
index 6580574f9..d2a9f7611 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_binary_op_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import operator
 from typing import Callable, List, Tuple
 
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
index 5b2ba8dad..b971f735b 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_cat_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
index f629da19d..110d6a280 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.fx2ait import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
index 77ac1b973..31d74189e 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_clamp_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
index 2bcd2b241..b15dae226 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_conv2d_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
index ce6496fcb..47c8e38c2 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 import torch.nn as nn
 from fx2ait.tensor_spec import TensorSpec
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
index 1cbe433b3..7a6036dd3 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_layer_norm_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
index 0754a1d61..e7fbc678f 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
index 74cfe4566..da93658ba 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
index ba9861798..a391617b1 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_max_pool2d_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.tensor_spec import TensorSpec
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
index baa9cc384..383d96e4e 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_model_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 import torchvision
 from fx2ait.passes.lower_basic_pass_aten import nchw2nhwc_pass, replace_inplace_ops
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
index 74633505c..5b84e3f2d 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_nan2num_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
index 2c20c4105..25215b557 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_permute_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 import torch
 from fx2ait.tensor_spec import TensorSpec
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
index c716cf1f8..695fe94db 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_pow_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
index e1fbf9cfb..2e627480c 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_reduce_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
index 9ee3a0522..b7f1ce91b 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_relu_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
@@ -21,14 +35,12 @@ class TestATenReluConverter(DispatchTestCase):
         ]
     )
     def test_relu(self, name, size):
-
         model = TestModule().cuda().half()
         inputs = (torch.randn(size).half().cuda(),)
 
         self.run_test(model, inputs, expected_ops={torch.ops.aten.relu.default})
 
     def test_relu_with_dynamic_shape(self):
-
         model = TestModule().cuda().half()
         inputs_spec = TensorSpec.create_spec_from_shapes(
             inputs_min=[
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
index 0ae0d8289..5a1febf44 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_reshape_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import unittest
 
 import torch
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
index 561caf381..bd076b77a 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_size_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 
 # from fx2ait.tensor_spec import TensorSpec
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
index f7d28e7eb..19ce54269 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.passes.lower_basic_pass_aten import (
     aten_compose_getitem_slice,
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
index 9268597db..ad4897dec 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_split_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
index 7822fb74b..c205f3cf6 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_squeeze_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
index 25f90c94f..c5a50048e 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from typing import Callable
 
 import torch
diff --git a/fx2ait/fx2ait/test/test_ait_lower.py b/fx2ait/fx2ait/test/test_ait_lower.py
index a78bad897..eeb9a3610 100644
--- a/fx2ait/fx2ait/test/test_ait_lower.py
+++ b/fx2ait/fx2ait/test/test_ait_lower.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import unittest
 
 import torch
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index 13d5f42cf..8f02116e2 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import unittest
 
 import torch
diff --git a/fx2ait/fx2ait/tools/ait_minimizer.py b/fx2ait/fx2ait/tools/ait_minimizer.py
index 08b8e60de..255248973 100644
--- a/fx2ait/fx2ait/tools/ait_minimizer.py
+++ b/fx2ait/fx2ait/tools/ait_minimizer.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import logging
 from typing import Any, Callable, List, Tuple
 
diff --git a/fx2ait/fx2ait/tools/common_aten2ait.py b/fx2ait/fx2ait/tools/common_aten2ait.py
index 4c0409288..aabedf79f 100644
--- a/fx2ait/fx2ait/tools/common_aten2ait.py
+++ b/fx2ait/fx2ait/tools/common_aten2ait.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import unittest
 
 import uuid
@@ -112,7 +126,6 @@ def run_test(
         permute_outputs: Optional[List[int]] = None,
         customized_passes: List[Callable] = None,
     ):
-
         mod.eval()
         original_inputs = inputs
         if permute_inputs:
@@ -306,7 +319,6 @@ def benchmark_function(
         permute_inputs: Optional[List[int]] = None,
         customized_passes: Optional[List[int]] = None,
     ) -> float:
-
         mod.eval()
         original_inputs = inputs
         if permute_inputs:
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 382942a66..af38ab68c 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import time
 import unittest
 
@@ -300,7 +314,6 @@ def benchmark_function(
     inputs: List[torch.Tensor],
     permute_inputs: Optional[List[int]] = None,
 ) -> float:
-
     mod.eval()
     mod = acc_tracer.trace(
         mod,
diff --git a/fx2ait/fx2ait/utils.py b/fx2ait/fx2ait/utils.py
index ef7fb3e7a..a87e2abaf 100644
--- a/fx2ait/fx2ait/utils.py
+++ b/fx2ait/fx2ait/utils.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from aitemplate.utils.torch_utils import torch_dtype_to_string
 
 
diff --git a/fx2ait/setup.py b/fx2ait/setup.py
index 7d6ba9de0..cb1420a9b 100644
--- a/fx2ait/setup.py
+++ b/fx2ait/setup.py
@@ -1,3 +1,18 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
 import os
 import subprocess
 import sys

From eb7ef21c1da8e305d8ae1037ad454d6969077149 Mon Sep 17 00:00:00 2001
From: Ying Zhang <ipiszy@users.noreply.github.com>
Date: Tue, 31 Jan 2023 22:01:28 -0800
Subject: [PATCH 036/638] Update README.md (#166)

Summary:
ATT, add a section about fx2ait, update release policy and maintainer list.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/166

Reviewed By: terrychenism

Differential Revision: D42912963

Pulled By: ipiszy

fbshipit-source-id: e6056920be6a8ac9a0415d7ece6465253eccc290
---
 README.md | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index d08a04f0e..13f7580a7 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # AITemplate
 
 [![License](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE) |
-[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yml/badge.svg)](https://facebookincubator.github.io/AITemplate) |
+[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yaml/badge.svg)](https://facebookincubator.github.io/AITemplate) |
 [![CircleCI](https://circleci.com/gh/facebookincubator/AITemplate.svg?style=svg)](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate)
 
 
@@ -37,6 +37,16 @@ The AITemplate-generated Python runtime can take PyTorch tensors as inputs and o
 
 AITemplate provides a straightforward approach for making an extension in codegen. To add a new operator or a new fused kernel into AITemplate, most of the time one only needs to add two Python files: one for a graph node definition and another for the backend codegen. The CUDA/HIP kernel in a text header file can be directly utilized in the codegen.
 
+## FX2AIT
+FX2AIT is a Python-based tool that converts PyTorch models into AITemplate (AIT) engine for lightning-fast inference serving. Using FX2AIT's built-in AITLowerer, partial AIT acceleration can be achieved for models with unsupported operators in AITemplate.
+
+Key features of FX2AIT include:
+
+* Easy Conversion: FX2AIT requires only a PyTorch model and input for conversion, generating an "AITModule" output for inference serving.
+* Expanded Support: AITemplate does not support all PyTorch operators. FX2AIT's AITLowerer offers a solution for partial AIT conversion for models with unsupported operators. Check the example/03_lowering_split for more information.
+
+More info can be found from https://github.com/facebookincubator/AITemplate/tree/main/fx2ait.
+
 ## Installation
 
 **Hardware requirement:**
@@ -93,19 +103,17 @@ AITemplate provides the following model templates & reference performance data o
 
 ## Release
 
-AITemplate has a 90 days release cycle.
-In the next one or two releases, we will focus on:
-- Deprecating FlashAttention: Unify CUDA Attention computation to Composable Kernel (AMD GPU) style back-to-back fusion to improve performance and increase flexibility for NVIDIA GPU Transformer users.
-- Remove kernel profiling requirement.
-- GEMM + LayerNorm fusion, GEMM + GEMM fusion, Conv + Conv fusion.
-- Better dynamic shape support: Focus on the dynamic sequence in Transformers.
-- More model templates:  Provide model templates with control flow and containers.
+All current development updates can be seen in the AITemplate repository. Releases are not on a set schedule and will only be tagged for significant feature releases.
+
+Mid-term plan:
+- Better dynamic shape support: Focus on the dynamic sequence in Transformers. Add symbolic shape support.
 - More automatic graph passes: Relief manual rewrite models to obtain the best performance.
-- Enable more fusions on AMD backend.
+- Quantization: fp8/int8/int4.
+- Sparsity pruning for Gemm.
+- PT2 integration: Aten2AIT is under active development.
 
-Some ongoing/potential work that won't appear in the next short-term release:
-- Automatic Pytorch-FX, ONNX, Open-XLA and other format model conversion.
-- Quantized model (int8/fp8/int4) support.
+Long-term plan:
+- Automatic ONNX, Open-XLA and other format model conversion.
 - Composable Kernel CPU extension on AVX2/AVX-512 for AMD Epyc CPU.
 
 ## Contributing
@@ -113,9 +121,9 @@ Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute
 
 ## The Team
 
-AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang.
+AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mu-Chu Lee](https://github.com/muchulee8), [Max Podkorytov](https://github.com/tenpercent), [Adnan Akhundov](https://github.com/aakhundov).
 
-AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine) and [Mu-Chu Lee](https://github.com/muchulee8).
+AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang.
 
 
 ## Acknowledgement

From 53cc6aa8b0d11b5855c73aa43266fee959c88326 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 31 Jan 2023 22:28:07 -0800
Subject: [PATCH 037/638] test detectron2 correctness in CI (#157)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/157

* AIT model definition: done
* float16 comparison: done, mismatch (match with a caveat; results are ordered by scores, so the sets of bounded boxes and classes need to be compared as sets, not as lists. Also numerical precision is ~1e0 for boxes comparison which is probably fine for pixels)
* model checkpoints and sample inputs on manifold: done

* masks comparison: done (comparing boolean arrays, ~99% values are same)

Reviewed By: wushirong

Differential Revision: D42791155

fbshipit-source-id: f0e88b58b14c233dfbb5f2983b93aac1d3ada4ce
---
 examples/02_detectron2/test_correctness.py    | 387 ++++++++++++++++++
 .../02_detectron2/tools/convert_pt2ait.py     |  11 +-
 2 files changed, 393 insertions(+), 5 deletions(-)
 create mode 100644 examples/02_detectron2/test_correctness.py

diff --git a/examples/02_detectron2/test_correctness.py b/examples/02_detectron2/test_correctness.py
new file mode 100644
index 000000000..8a0e19d92
--- /dev/null
+++ b/examples/02_detectron2/test_correctness.py
@@ -0,0 +1,387 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import functools
+import logging
+import os
+import unittest
+
+import cv2
+import numpy as np
+
+import torch
+
+from aitemplate.compiler import compile_model
+
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+from detectron2.config import CfgNode
+from detectron2.engine import DefaultPredictor
+
+try:
+    from libfb.py.asyncio.await_utils import await_sync
+    from manifold.clients.python import ManifoldClient
+except ImportError:
+    ManifoldClient = None
+    import requests
+
+from detectron2.model_zoo import get_checkpoint_url
+from parameterized import parameterized
+from PIL import Image
+
+from .configs.config import get_cfg_defaults
+from .modeling.meta_arch import GeneralizedRCNN
+from .tools.convert_pt2ait import detectron2_export
+
+logger = logging.getLogger(__name__)
+
+
+def mark_output(y):
+    if type(y) is not tuple:
+        y = (y,)
+    for i in range(len(y)):
+        y[i]._attrs["is_output"] = True
+        y[i]._attrs["name"] = "output_%d" % (i)
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        print("output_{} shape: {}".format(i, y_shape))
+
+
+def extract_params_meta(ait_model):
+    ret = []
+    for name, p in ait_model.named_parameters():
+        name = name.replace(".", "_")
+        shape = [x._attrs["values"][0] for x in p.tensor()._attrs["shape"]]
+        ret.append([name, shape])
+    return ret
+
+
+def get_output_shape(oldh: int, oldw: int, short_edge_length: int, max_size: int):
+    """
+    Compute the output size given input size and target short edge length.
+    """
+    h, w = oldh, oldw
+    size = short_edge_length * 1.0
+    scale = size / min(h, w)
+    if h < w:
+        newh, neww = size, scale * w
+    else:
+        newh, neww = scale * h, size
+    if max(newh, neww) > max_size:
+        scale = max_size * 1.0 / max(newh, neww)
+        newh = newh * scale
+        neww = neww * scale
+    neww = int(neww + 0.5)
+    newh = int(newh + 0.5)
+    return (newh, neww)
+
+
+def apply_transform(cfg, img):
+    """
+    Resize the image while keeping the aspect ratio unchanged.
+    It attempts to scale the shorter edge to the given `short_edge_length`,
+    as long as the longer edge does not exceed `max_size`.
+    If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
+    """
+    h, w = img.shape[:2]
+    new_h, new_w = get_output_shape(
+        h, w, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST
+    )
+    if len(img.shape) > 2 and img.shape[2] == 1:
+        pil_image = Image.fromarray(img[:, :, 0], mode="L")
+    else:
+        pil_image = Image.fromarray(img)
+    pil_image = pil_image.resize((new_w, new_h), Image.BILINEAR)
+    ret = np.asarray(pil_image)
+    if len(img.shape) > 2 and img.shape[2] == 1:
+        ret = np.expand_dims(ret, -1)
+    return ret
+
+
+def preprocess(cfg, ori_img, pad_value: float = 0.0, dtype="float16"):
+    """
+    Image preprocess: resize the image (see `apply_transform`), normalize the pixels,
+    and add padding.
+    """
+    # HH, WW = self.im_shape
+    ori_shape = ori_img.shape
+    if ori_shape[0] > ori_shape[1]:
+        img = np.rot90(ori_img, k=1)
+    else:
+        img = ori_img
+    inputs = apply_transform(cfg, img)
+    resize_scale = img.shape[0] / inputs.shape[0]
+    pixel_mean = np.array(cfg.MODEL.PIXEL_MEAN).reshape(1, 1, -1)
+    pixel_std = np.array(cfg.MODEL.PIXEL_STD).reshape(1, 1, -1)
+    inputs = (inputs - pixel_mean) / pixel_std
+    padding_size = (
+        (0, cfg.INPUT.MIN_SIZE_TEST - inputs.shape[0]),
+        (0, cfg.INPUT.MAX_SIZE_TEST - inputs.shape[1]),
+        (0, 0),
+    )
+    inputs = np.pad(inputs, padding_size, constant_values=pad_value)
+    inputs = inputs[np.newaxis, :]
+    return inputs.astype(dtype), ori_img, ori_shape, resize_scale
+
+
+def apply_bbox(bbox, im_w, im_h):
+    if im_h > im_w:
+        x0 = bbox[:, 0][..., np.newaxis]
+        y0 = bbox[:, 1][..., np.newaxis]
+        x1 = bbox[:, 2][..., np.newaxis]
+        y1 = bbox[:, 3][..., np.newaxis]
+        bbox = np.hstack((im_w - y1, x0, im_w - y0, x1))
+    return bbox
+
+
+def postprocess_ait_results(
+    ret,
+    mask_on,
+    batch_size,
+    score_thresh,
+    images,
+    image_list,
+    image_shapes,
+    image_scales,
+):
+    batched_boxes, batched_scores, batched_classes = ret[1:4]
+    if mask_on:
+        batched_masks = ret[-1]
+    results = {}
+    for i in range(batch_size):
+        boxes, scores, classes = (
+            batched_boxes[i, :],
+            batched_scores[i, :],
+            batched_classes[i, :],
+        )
+
+        filter_inds = (scores > score_thresh).nonzero().squeeze()
+        scores = scores[filter_inds]
+        boxes = boxes[filter_inds, :] * image_scales[i]
+        boxes = apply_bbox(boxes, image_shapes[i][1], image_shapes[i][0])
+        classes = classes[filter_inds]
+
+        results[image_list[i]] = {
+            "boxes": boxes,
+            "scores": scores,
+            "classes": classes,
+            "image_height": image_shapes[i][0],
+            "image_width": image_shapes[i][1],
+            "num_instances": boxes.shape[0],
+            "image": images[i],
+        }
+        if mask_on:
+            mask_pred = batched_masks[i, filter_inds, :, :]
+            im_height, im_width = image_shapes[i][:2]
+            masks = []
+            for pred_box, mask in zip(
+                boxes,
+                mask_pred,
+            ):
+                mask = mask.cpu().numpy().astype(np.float32)
+                if im_height > im_width:
+                    mask = np.rot90(mask, k=-1)
+                box = pred_box.cpu().numpy().astype("int")
+                det_width = box[2] - box[0]
+                det_height = box[3] - box[1]
+                small_mask = Image.fromarray(mask)
+                mask = small_mask.resize(
+                    (det_width, det_height), resample=Image.BILINEAR
+                )
+                mask = np.array(mask, copy=False)
+                MASK_THRESHOLD = 0.5
+                mask = np.array(mask > MASK_THRESHOLD, dtype=np.uint8)
+                padded_mask = np.zeros((im_height, im_width), dtype=np.uint8)
+                x_0 = max(box[0], 0)
+                x_1 = min(box[2], im_width)
+                y_0 = max(box[1], 0)
+                y_1 = min(box[3], im_height)
+                padded_mask[y_0:y_1, x_0:x_1] = mask[
+                    (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+                ]
+                masks.append(padded_mask)
+            results[image_list[i]]["masks"] = torch.tensor(masks)
+    return results
+
+
+class Detectron2Verification(unittest.TestCase):
+    @parameterized.expand(
+        ["faster_rcnn_R_50", "faster_rcnn_R_101", "mask_rcnn_R_50", "mask_rcnn_R_101"]
+    )
+    def test_detectron2(self, config):
+        cfg = get_cfg_defaults()
+        cfg.merge_from_file(
+            os.path.join(os.path.dirname(__file__), "configs", f"{config}_FPN.yaml")
+        )
+        cfg.SOLVER.IMS_PER_BATCH = 1
+        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8
+        cfg.freeze()
+
+        ait_dtype = "float16"
+        torch_dtype = torch.float16
+
+        model = GeneralizedRCNN(cfg)
+        model.name_parameter_tensor()
+
+        x = Tensor(
+            shape=[
+                cfg.SOLVER.IMS_PER_BATCH,
+                cfg.INPUT.MIN_SIZE_TEST,
+                cfg.INPUT.MAX_SIZE_TEST,
+                3,
+            ],
+            dtype=ait_dtype,
+            name="input_0",
+            is_input=True,
+        )
+        y = model(x)
+        mark_output(y)
+
+        checkpoint_path = f"/tmp/detectron2/{config}_FPN_3x.pkl"
+        sample_input_filename = "000000001268.jpg"
+        sample_input_path = f"/tmp/detectron2/{sample_input_filename}"
+
+        torch_cfg = CfgNode(cfg)
+        torch_cfg.MODEL.WEIGHTS = checkpoint_path
+        if not os.path.exists(checkpoint_path):
+            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
+            if ManifoldClient is not None:
+                with ManifoldClient.get_client("aitemplate") as client:
+                    await_sync(
+                        client.get(
+                            f"tree/detectron2/pickles/{config}_FPN_3x.pkl",
+                            checkpoint_path,
+                        )
+                    )
+            else:
+                torch_cfg.MODEL.WEIGHTS = get_checkpoint_url(
+                    f"COCO-{'InstanceSegmentation' if 'mask' in config else 'Detection'}/{config}_FPN_3x.yaml"
+                )
+
+        torch_predictor = DefaultPredictor(torch_cfg)
+
+        if not os.path.exists(sample_input_path):
+            if ManifoldClient is not None:
+                with ManifoldClient.get_client("aitemplate") as client:
+                    await_sync(
+                        client.get(
+                            f"tree/detectron2/datasets/coco/val2017/{sample_input_filename}",
+                            sample_input_path,
+                        )
+                    )
+            else:
+                img_url = (
+                    f"http://images.cocodataset.org/val2017/{sample_input_filename}"
+                )
+                img_data = requests.get(img_url).content
+                with open(sample_input_path, "wb") as f:
+                    f.write(img_data)
+
+        sample_img = cv2.imread(sample_input_path)
+        sample_input, original_image, shape, scale = preprocess(
+            cfg, sample_img, dtype=ait_dtype
+        )
+        x_ait = torch.tensor(sample_input).cuda()
+
+        with torch.no_grad():
+            ait_params = detectron2_export("").export_model(
+                {
+                    k: v.cpu().numpy()
+                    for k, v in torch_predictor.model.state_dict().items()
+                },
+                extract_params_meta(model),
+            )
+            pt_instance = torch_predictor(sample_img)["instances"]
+
+        ait_module = compile_model(y, detect_target(), "./tmp", cfg.MODEL.NAME)
+        for name, param in ait_params.items():
+            ait_module.set_constant_with_tensor(
+                name, param.contiguous().to(dtype=torch_dtype).cuda()
+            )
+        model.set_anchors(ait_module)
+        topk = cfg.POSTPROCESS.TOPK
+        BS = cfg.SOLVER.IMS_PER_BATCH
+        outputs = [
+            torch.empty([BS, 1], dtype=torch.int64).cuda(),
+            torch.empty([BS, topk, 4], dtype=torch_dtype).cuda(),
+            torch.empty([BS, topk], dtype=torch_dtype).cuda(),
+            torch.empty([BS, topk], dtype=torch.int64).cuda(),
+        ]
+        if cfg.MODEL.MASK_ON:
+            mask_size = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION * 2
+            outputs.append(
+                torch.empty([BS, topk, mask_size, mask_size], dtype=torch_dtype).cuda()
+            )
+
+        ait_module.run_with_tensors([x_ait], outputs)
+
+        ait_results = postprocess_ait_results(
+            outputs,
+            cfg.MODEL.MASK_ON,
+            BS,
+            cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
+            [original_image],
+            [sample_input_path],
+            [shape],
+            [scale],
+        )
+
+        result = ait_results[sample_input_path]
+
+        compare_floats = functools.partial(
+            torch.testing.assert_close, atol=1e-1, rtol=1e-1
+        )
+        compare_ints = functools.partial(torch.testing.assert_close, atol=0, rtol=0)
+
+        compare_ints(len(pt_instance), result["num_instances"])
+
+        # Boxes precision is tricky.
+        # Practically, these are pixel values, so any difference around 1e0 can be disregarded
+        compare_boxes_floats = functools.partial(
+            torch.testing.assert_close, atol=5e-0, rtol=1e-1
+        )
+        # Keep in mind that we are comparing sets here,
+        # not lists because all items are sorted by score and
+        # a small difference in score can result in a wrong items order.
+        # We do our best to estabilish 1:1 mapping for comparison
+        pt_boxes = pt_instance.pred_boxes.tensor.to(dtype=result["boxes"].dtype).sort(
+            dim=0
+        )
+        ait_boxes = result["boxes"].sort(dim=0)
+        compare_boxes_floats(
+            ait_boxes,
+            pt_boxes,
+        )
+        compare_floats(
+            pt_instance.scores.to(dtype=result["scores"].dtype),
+            result["scores"],
+        )
+        # also comparing sets
+        compare_ints(
+            pt_instance.pred_classes.sort().values, result["classes"].sort().values
+        )
+        # homebrew similarity match between boolean arrays
+        if cfg.MODEL.MASK_ON:
+            pt_masks = pt_instance.pred_masks.to(
+                dtype=result["masks"].dtype, device="cpu"
+            )
+            ait_masks = result["masks"]
+            self.assertLess(
+                (pt_masks != ait_masks).sum() / (pt_masks == ait_masks).sum(), 1e-2
+            )
+
+
+if __name__ == "__main__":
+    torch.cuda.manual_seed(1337)
+    unittest.main()
diff --git a/examples/02_detectron2/tools/convert_pt2ait.py b/examples/02_detectron2/tools/convert_pt2ait.py
index 584e14560..810fc8d3a 100644
--- a/examples/02_detectron2/tools/convert_pt2ait.py
+++ b/examples/02_detectron2/tools/convert_pt2ait.py
@@ -33,7 +33,7 @@ class detectron2_export:
     def __init__(self, model_name):
         self.model_name = model_name
 
-    def export_model(self, model):
+    def export_model(self, model, ait_param_map=None):
         fuse_model = {}
         bn_keys = set()
         for k, _ in model.items():
@@ -55,12 +55,13 @@ def export_model(self, model):
         if detect_target().name() == "cuda":
             self.export_conv0(ait_model, fuse_model)
 
-        self.check_model(ait_model)
+        self.check_model(ait_model, ait_param_map)
         return ait_model
 
-    def check_model(self, ait_model):
-        with open(os.path.join("./tmp", self.model_name, "params.json")) as fi:
-            param_map = json.load(fi)
+    def check_model(self, ait_model, param_map=None):
+        if param_map is None:
+            with open(os.path.join("./tmp", self.model_name, "params.json")) as fi:
+                param_map = json.load(fi)
         for name, shape in param_map:
             assert ait_model[name].shape == tuple(
                 shape

From 391853c266a99f58c2ccf0afa580c223c6e42783 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Wed, 1 Feb 2023 18:47:32 +0800
Subject: [PATCH 038/638] update ci

---
 .github/workflows/ait_ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index 0f598865f..4bdc67e38 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -124,8 +124,9 @@ jobs:
         git show --summary | grep commit >> sdiff.log
         /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> sdiff.log
         # profiling
-        HIP_VISIBLE_DEVICES=0,1 python3 compile.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log
-        HIP_VISIBLE_DEVICES=0 python3 demo.py --token ${{ secrets.HF_TOKEN }} --benchmark 1 2>&1 | tee -a sdiff.log
+        python3 scripts/download_pipeline.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log
+        HIP_VISIBLE_DEVICES=0,1 python3 scripts/compile.py 2>&1 | tee -a sdiff.log
+        HIP_VISIBLE_DEVICES=0 python3 scripts/demo.py --benchmark 1 2>&1 | tee -a sdiff.log
     - name: Archive logs
       uses: actions/upload-artifact@v3
       with:

From 02cd37fe93c6b8ce132e1a57fc2e051f011fd750 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Wed, 1 Feb 2023 23:56:09 +0800
Subject: [PATCH 039/638] support fx2ait on rocm

---
 .github/workflows/ait_ci.yml                  |   3 -
 docker/Dockerfile.rocm                        |   4 +-
 fx2ait/CMakeLists.txt                         |   6 +-
 fx2ait/fx2ait/csrc/AITModelImpl.cpp           |  20 ++
 fx2ait/setup.py                               |   1 +
 python/aitemplate/backend/rocm/__init__.py    |   1 +
 .../backend/rocm/padding/__init__.py          |  20 ++
 .../backend/rocm/padding/nhwc3to4.py          | 225 +++++++++++++++
 .../backend/rocm/padding/nhwc3to8.py          | 237 ++++++++++++++++
 .../backend/rocm/padding/pad_last_dim.py      | 261 ++++++++++++++++++
 10 files changed, 771 insertions(+), 7 deletions(-)
 create mode 100644 python/aitemplate/backend/rocm/padding/__init__.py
 create mode 100644 python/aitemplate/backend/rocm/padding/nhwc3to4.py
 create mode 100644 python/aitemplate/backend/rocm/padding/nhwc3to8.py
 create mode 100644 python/aitemplate/backend/rocm/padding/pad_last_dim.py

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index 4bdc67e38..f3037119c 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -47,8 +47,6 @@ jobs:
         pip3 install dist/*.whl
         #install necessary python modules
         pip3 install timm
-        pip3 uninstall -y torch 
-        pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
         python3 -m pip install transformers click
         python3 -c "import torch; print(torch.__version__)"
         #run examples
@@ -141,4 +139,3 @@ jobs:
         export dbuser=${{ secrets.DBUSER }}
         export dbpassword=${{ secrets.DBPASSWORD }}
         python3 process_results.py 
-
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 8146b506c..e6231a5e1 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -15,7 +15,7 @@
 # ROCM Docker Image for AITemplate
 FROM ubuntu:20.04
 
-ARG ROCMVERSION=5.3
+ARG ROCMVERSION=5.4.2
 
 RUN set -xe
 
@@ -44,9 +44,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libpthread-stubs0-dev \
     llvm-amdgpu \
     pkg-config \
-    python \
     python3 \
-    python-dev \
     python3-dev \
     python3-pip \
     software-properties-common \
diff --git a/fx2ait/CMakeLists.txt b/fx2ait/CMakeLists.txt
index b0de91aa7..0b581aeb3 100644
--- a/fx2ait/CMakeLists.txt
+++ b/fx2ait/CMakeLists.txt
@@ -3,8 +3,12 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 project(ait_model)
 find_package(Torch REQUIRED)
 
+if(${AIT_USE_ROCM})
+  add_compile_definitions(AIT_USE_ROCM)
+endif()
+
 include_directories(
-   ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/picojson
+  ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/picojson
 )
 
 # Define our library target
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index a859f37d0..e0d9032b2 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -6,9 +6,15 @@
 #include <sstream>
 
 #include "ATen/Context.h" // @manual
+#ifdef AIT_USE_ROCM
+#include "ATen/hip/HIPContext.h"
+#include "c10/core/CPUAllocator.h"
+#include "c10/hip/HIPStream.h"
+#else
 #include "ATen/cuda/CUDAContext.h"
 #include "c10/core/CPUAllocator.h"
 #include "c10/cuda/CUDAStream.h"
+#endif
 
 #ifdef FBCODE_AIT
 #include "folly/MapUtil.h"
@@ -17,7 +23,9 @@
 namespace torch::aitemplate {
 
 AITemplatePyTorchCachingAllocator::AITemplatePyTorchCachingAllocator() {
+  #ifndef AIT_USE_ROCM
   at::globalContext().lazyInitCUDA();
+  #endif
   cuda_allocator_ = at::cuda::getCUDADeviceAllocator();
   TORCH_CHECK(cuda_allocator_ != nullptr);
 }
@@ -290,7 +298,11 @@ std::vector<torch::Tensor> AITModelImpl::processOutputs(
 
     auto output = at::detail::make_tensor_base<c10::TensorImpl>(
         std::move(output_index_to_output_storage_impl.at(output_idx)),
+        #ifdef AIT_USE_ROCM
         c10::DispatchKeySet(c10::DispatchKey::CUDA),
+        #else
+        c10::DispatchKeySet(c10::DispatchKey::HIP),
+        #endif
         scalarTypeToTypeMeta(dtype));
     const auto& size = output_shapes.at(output_idx);
     if (size.size() != 1 || size[0] != 0) {
@@ -368,7 +380,11 @@ std::vector<torch::Tensor> AITModelImpl::forward(
 
   std::vector<torch::Tensor> outputs;
   {
+    #ifdef AIT_USE_ROCM
+    const auto& cuda_stream = at::hip::getCurrentHIPStream(device.index());
+    #else
     const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
+    #endif
     const auto stream_id = cuda_stream.stream();
     // TODO: remove casting after fixing API
     AITemplateStreamHandle stream_handle =
@@ -427,7 +443,11 @@ void AITModelImpl::profile(
       device);
 
   {
+    #ifdef AIT_USE_ROCM
+    const auto& cuda_stream = at::hip::getCurrentHIPStream(device.index());
+    #else
     const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
+    #endif
     const auto stream_id = cuda_stream.stream();
     // TODO: remove casting after fixing API
     AITemplateStreamHandle stream_handle =
diff --git a/fx2ait/setup.py b/fx2ait/setup.py
index 7d6ba9de0..5bc05ca47 100644
--- a/fx2ait/setup.py
+++ b/fx2ait/setup.py
@@ -36,6 +36,7 @@ def run(self):
             "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + build_directory,
             "-DPYTHON_EXECUTABLE=" + sys.executable,
             "-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
+            "-DAIT_USE_ROCM=" + "1" if torch.cuda.is_available() and torch.version.hip else "0",
         ]
 
         cfg = "Debug" if self.debug else "Release"
diff --git a/python/aitemplate/backend/rocm/__init__.py b/python/aitemplate/backend/rocm/__init__.py
index 76dc9290b..d0d3d1af8 100644
--- a/python/aitemplate/backend/rocm/__init__.py
+++ b/python/aitemplate/backend/rocm/__init__.py
@@ -28,4 +28,5 @@
 from .normalization import softmax
 from .upsample import *
 from .vision_ops import *
+from .padding import *
 from .normalization import groupnorm, groupnorm_swish, layernorm
diff --git a/python/aitemplate/backend/rocm/padding/__init__.py b/python/aitemplate/backend/rocm/padding/__init__.py
new file mode 100644
index 000000000..455e327d6
--- /dev/null
+++ b/python/aitemplate/backend/rocm/padding/__init__.py
@@ -0,0 +1,20 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA padding init
+"""
+from . import nhwc3to4, nhwc3to8, pad_last_dim
+
+__all__ = ["nhwc3to8", "pad_last_dim", "nhwc3to4"]
diff --git a/python/aitemplate/backend/rocm/padding/nhwc3to4.py b/python/aitemplate/backend/rocm/padding/nhwc3to4.py
new file mode 100644
index 000000000..8714a216e
--- /dev/null
+++ b/python/aitemplate/backend/rocm/padding/nhwc3to4.py
@@ -0,0 +1,225 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA codegen for nhwc3to4 op
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+
+# pylint: disable=C0301,W0613,W0612
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  hipStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}nhwc3to4_launcher<{{elem_input_type}}>(
+{{indent}}    static_cast<const {{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
+{{indent}}    NI,
+{{indent}}    HI,
+{{indent}}    WI,
+{{indent}}    stream
+{{indent}});
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+// fast kernel for c_in = 3 & c_out = 4
+template <typename Tio, typename Telement, int element_in_Tio>
+__global__ void nhwc_padding_channel_3To4_kernel(const int32_t n,
+                                                 const int32_t h,
+                                                 const int32_t w,
+                                                 const Tio *input,
+                                                 Tio *output,
+                                                 const int32_t max_output_element,
+                                                 const int32_t max_input_element,
+                                                 const Tio zero_io,
+                                                 const Telement zero_element){
+  __shared__ Tio shm[192];
+  const int tidx = blockIdx.x * 192 + threadIdx.x;
+  const int threadidx = threadIdx.x;
+
+  shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx];
+  __syncthreads();
+
+  const int ouput_offset = blockIdx.x * 256;
+  const int lower_bound = max_output_element < ouput_offset + 256 ? max_output_element : ouput_offset + 256;
+  for (int i = ouput_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192)
+  {
+    const Telement* shm_element = (const Telement*)shm + j*3*element_in_Tio/4;
+    Telement array[element_in_Tio];
+    #pragma unroll
+    for (int k = 0 ; k < element_in_Tio ; k++)
+      array[k] = ((k+1)%4 == 0) ? zero_element : shm_element[(k > 3) ? (k - 1) : k];
+    output[i] = *((const Tio *)array);
+  }
+}
+
+template <typename ElemT>
+void nhwc3to4_launcher(const ElemT* in_ptr,
+                       ElemT* out_ptr,
+                       int NI,
+                       int HI,
+                       int WI,
+                       hipStream_t stream) {
+  dim3 block(192);
+  const int nhw = NI * HI * WI;
+  const int nhwc = nhw * 3;
+  CHECK_EQ(nhw % 8, 0);
+  const int element_in_Tio = sizeof(int4) / sizeof(ElemT);
+  const int max_input_element = nhwc / element_in_Tio;
+  const int max_output_element = nhw * 4 / element_in_Tio;
+  const int4 zero_io = {0, 0, 0, 0};
+  const ElemT zero_element = static_cast<ElemT>(0.0f);
+  dim3 grid((nhwc + 192 * element_in_Tio - 1)/(192 * element_in_Tio));
+  nhwc_padding_channel_3To4_kernel<int4, ElemT, element_in_Tio><<<grid, block, 0, stream>>>
+          (NI, HI, WI,
+          (const int4 *)in_ptr,
+          (int4 *)out_ptr,
+          max_output_element,
+          max_input_element,
+          zero_io,
+          zero_element);
+}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* out_ptr,
+    int64_t* batch,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_h,
+    int64_t* out_w,
+    hipStream_t stream
+) {
+  {{shape_function}}
+  {{exec_paths}}
+}
+
+"""
+)
+
+
+@registry.reg("rocm.nhwc3to4.gen_function")
+def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+    """
+
+    Parameters
+    ----------
+    func_attrs : [type]
+        [description]
+    template_path : [type]
+        [description]
+    shape_eval_template : [type]
+        [description]
+    shape_save_template : [type]
+        [description]
+
+    Returns
+    -------
+    [type]
+        [description]
+    """
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    shape_eval_func = shape_eval_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        x_dim0="*batch",
+        x_dim1="*in_h",
+        x_dim2="*in_w",
+    )
+    shape_save_func = shape_save_template.render(
+        indent="  ",
+        y_dim0="*out_batch",
+        y_dim1="*out_h",
+        y_dim2="*out_w",
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+    )
+
+
+@registry.reg("rocm.nhwc3to4.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("rocm.nhwc3to4.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_in_h="&" + xshape[1]._attrs["name"],
+        p_in_w="&" + xshape[2]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_h="&" + yshape[1]._attrs["name"],
+        p_out_w="&" + yshape[2]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/padding/nhwc3to8.py b/python/aitemplate/backend/rocm/padding/nhwc3to8.py
new file mode 100644
index 000000000..302317c4f
--- /dev/null
+++ b/python/aitemplate/backend/rocm/padding/nhwc3to8.py
@@ -0,0 +1,237 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA codegen for nhwc3to8 op
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+
+# pylint: disable=C0301,W0613,W0612
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  hipStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}nhwc3to8_launcher<{{elem_input_type}}>(
+{{indent}}    static_cast<const {{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
+{{indent}}    NI,
+{{indent}}    HI,
+{{indent}}    WI,
+{{indent}}    stream
+{{indent}});
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+// load 128 bit every time (n ElemT = 4 float)
+// use as many as thread with factor of 3:
+// each time load num_thread * n ElemT = num_thread / 3 * n ElemT * 3ch ->
+// num_thread / 3 * n ElemT * n ElemT ch
+
+template<typename ElemT, int num_thread>
+__global__ void nhwc3to8_kernel(const float4* input,
+                                float4* output,
+                                const int NI,
+                                const int HI,
+                                const int WI,
+                                const int max_in_elements,
+                                const int max_out_elements) {
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
+  __shared__ float4 shared_mem[num_thread];
+  const int out_offset = num_thread * num_elem_t_in_float4 / 3;
+  const float4 zero4 = {0.0f, 0.0f, 0.0f, 0.0f};
+  const ElemT zero = static_cast<ElemT>(0.f);
+  const int in_idx = blockIdx.x * num_thread + threadIdx.x;
+  const int tid = threadIdx.x;
+
+  shared_mem[tid] = in_idx >= max_in_elements ? zero4 : __ldg(input + in_idx);
+  __syncthreads();
+
+  const int out_start_idx = blockIdx.x * out_offset;
+  const int boundary = out_start_idx + out_offset > max_out_elements ? max_out_elements : out_start_idx + out_offset;
+  for (int i = out_start_idx + tid, j = tid; i < boundary; i += num_thread, j += num_thread) {
+    const ElemT* smem_element = (const ElemT*)shared_mem + j * 3;
+    ElemT tmp[num_elem_t_in_float4];
+
+    #pragma unroll
+    for (int k = 0; k < num_elem_t_in_float4; ++k) {
+      tmp[k] = k < 3 ? smem_element[k] : zero;
+    }
+    output[i] = *((const float4*)tmp);
+  }
+}
+
+template <typename ElemT>
+void nhwc3to8_launcher(const ElemT* in_ptr,
+                       ElemT* out_ptr,
+                       int NI,
+                       int HI,
+                       int WI,
+                       hipStream_t stream) {
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
+  constexpr int nthread = 240;
+  const int NHW = NI * HI * WI;
+  if (NHW % num_elem_t_in_float4 != 0) {
+    throw std::runtime_error(
+        "NHW (" + std::to_string(NHW) + ") mod num_elem_t_in_float4 (" +
+        std::to_string(num_elem_t_in_float4) + ") is not 0"
+    );
+  }
+  static_assert(nthread % 3 == 0);
+  const int max_in_elements = NHW * 3 / num_elem_t_in_float4;
+  const int max_out_elements = NHW * num_elem_t_in_float4 / num_elem_t_in_float4;
+  dim3 thread_block(nthread);
+  dim3 grid((NHW * 3 + nthread * num_elem_t_in_float4 -1) / (nthread * num_elem_t_in_float4));
+  nhwc3to8_kernel<ElemT, nthread><<<grid, thread_block, 0, stream>>>(
+    (const float4*)in_ptr,
+    (float4*) out_ptr,
+    NI,
+    HI,
+    WI,
+    max_in_elements,
+    max_out_elements
+  );
+}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* out_ptr,
+    int64_t* batch,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_h,
+    int64_t* out_w,
+    hipStream_t stream
+) {
+  {{shape_function}}
+  {{exec_paths}}
+}
+
+"""
+)
+
+
+@registry.reg("rocm.nhwc3to8.gen_function")
+def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+    """
+
+    Parameters
+    ----------
+    func_attrs : [type]
+        [description]
+    template_path : [type]
+        [description]
+    shape_eval_template : [type]
+        [description]
+    shape_save_template : [type]
+        [description]
+
+    Returns
+    -------
+    [type]
+        [description]
+    """
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    shape_eval_func = shape_eval_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        x_dim0="*batch",
+        x_dim1="*in_h",
+        x_dim2="*in_w",
+    )
+    shape_save_func = shape_save_template.render(
+        indent="  ",
+        y_dim0="*out_batch",
+        y_dim1="*out_h",
+        y_dim2="*out_w",
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+    )
+
+
+@registry.reg("rocm.nhwc3to8.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("rocm.nhwc3to8.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_in_h="&" + xshape[1]._attrs["name"],
+        p_in_w="&" + xshape[2]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_h="&" + yshape[1]._attrs["name"],
+        p_out_w="&" + yshape[2]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/padding/pad_last_dim.py b/python/aitemplate/backend/rocm/padding/pad_last_dim.py
new file mode 100644
index 000000000..f28f96a4d
--- /dev/null
+++ b/python/aitemplate/backend/rocm/padding/pad_last_dim.py
@@ -0,0 +1,261 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for pad_last_dim.
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+
+# pylint: disable=C0301,W0613,W0612
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  {%for i in range(ndim)%}
+  int64_t*,
+  {% endfor %}
+  {%for i in range(ndim)%}
+  int64_t*,
+  {% endfor %}
+  int out_dim,
+  hipStream_t stream
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
+{% for dim in xshape %}
+{{indent}}{{dim}},
+{% endfor %}
+{% for dim in yshape %}
+{{indent}}{{dim}},
+{% endfor %}
+{{indent}}  {{out_dim}},
+{{indent}}  stream
+{{indent}});
+"""
+)
+
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}padding4d_launcher<{{elem_input_type}}, {{elem_input_type2}}>(
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
+{%for i in range(4 - ndim)%}
+1,
+{% endfor %}
+{%for i in range(ndim)%}
+{{indent}}    *x_dim{{i}},
+{% endfor %}
+{{indent}}    out_dim,
+{{indent}}    stream
+{{indent}});
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+namespace {
+template <typename T>
+__global__ void padding4d_kernel(const T* input,
+                                 T* output,
+                                 const int32_t x_dim0,
+                                 const int32_t x_dim1,
+                                 const int32_t x_dim2,
+                                 const int32_t x_dim3,
+                                 const int32_t out_dim,
+                                 const T zero){
+
+  const int32_t idx_jump       = blockDim.x * gridDim.x;
+  const int32_t total_elements = x_dim0 * x_dim1 * x_dim2 * out_dim;
+
+  int32_t dim3_idx = 0;
+  int32_t dim2_idx = 0;
+  int32_t dim1_idx = 0;
+  int32_t dim0_idx = 0;
+  int32_t residual = 0;
+
+  T value;
+  for (int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_elements; idx += idx_jump) {
+
+    dim3_idx = idx % out_dim;
+    if (dim3_idx >= x_dim3){
+      value = zero;
+    }
+    else{
+      residual = idx / out_dim;
+      dim2_idx = residual % x_dim2;
+      residual = residual / x_dim2;
+      dim1_idx = residual % x_dim1;
+      dim0_idx = residual / x_dim1;
+      residual = ((dim0_idx * x_dim1 + dim1_idx) * x_dim2 + dim2_idx) * x_dim3 + dim3_idx;
+      value = input[residual];
+    }
+    output[idx] = value;
+  }
+}
+
+
+template <typename ElemT, typename ElemT2>
+void padding4d_launcher(ElemT* in_ptr,
+                        ElemT* out_ptr,
+                        const int32_t x_dim0,
+                        const int32_t x_dim1,
+                        const int32_t x_dim2,
+                        const int32_t x_dim3,
+                        const int32_t out_dim,
+                        hipStream_t stream) {
+  static_assert(sizeof(ElemT2) % sizeof(ElemT) == 0);
+  const int block_size = 256;
+  if ((out_dim % 2) == 0 && (x_dim3 % 2) == 0) {
+    int32_t total_elements = x_dim0 * x_dim1 * x_dim2 * x_dim3 / 2;
+    dim3 grid((total_elements + 255) /  block_size);
+    dim3 block(block_size);
+    const ElemT2 zero  = {0.0f, 0.0f};
+    padding4d_kernel<ElemT2><<<grid, block, 0, stream>>>(
+        reinterpret_cast<const ElemT2*>(in_ptr), reinterpret_cast<ElemT2*>(out_ptr),
+        x_dim0, x_dim1, x_dim2, x_dim3 / 2,
+        out_dim / 2,
+        zero
+    );
+  } else {
+    int32_t total_elements = x_dim0 * x_dim1 * x_dim2 * x_dim3;
+    dim3 grid((total_elements + 255) /  block_size);
+    dim3 block(block_size);
+    const ElemT zero = static_cast<ElemT>(0.f);
+    padding4d_kernel<ElemT><<<grid, block, 0, stream>>>(
+        in_ptr, out_ptr,
+        x_dim0, x_dim1, x_dim2, x_dim3,
+        out_dim,
+        zero
+    );
+  }
+}
+
+} // namespace
+
+void {{function_name}} (
+    void* in_ptr,
+    void* out_ptr,
+    {%for i in range(ndim)%}
+    int64_t* x_dim{{i}},
+    {% endfor %}
+    {%for i in range(ndim)%}
+    int64_t* y_dim{{i}},
+    {% endfor %}
+    int out_dim,
+    hipStream_t stream
+) {
+  {{shape_function}}
+  {{exec_paths}}
+}
+
+"""
+)
+
+
+@registry.reg("rocm.pad_last_dim.gen_function")
+def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+    """
+
+    Parameters
+    ----------
+    func_attrs : [type]
+        [description]
+    template_path : [type]
+        [description]
+    shape_eval_template : [type]
+        [description]
+    shape_save_template : [type]
+        [description]
+
+    Returns
+    -------
+    [type]
+        [description]
+    """
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_input_type2 = None
+    if elem_input_type == "half":
+        elem_input_type2 = "half2"
+    elif elem_input_type == "float":
+        elem_input_type2 = "float2"
+    else:
+        raise NotImplementedError(f"unsupported {elem_input_type=}")
+    ndim = func_attrs["ndim"]
+    xshape = ["*x_dim%d" % i for i in range(ndim)]
+    shape_eval_func = shape_eval_template.render(
+        indent="  ", dtype="int64_t ", shape=xshape, out_dim="out_dim"
+    )
+    yshape = ["*y_dim%d" % i for i in range(ndim - 1)]
+    shape_save_func = shape_save_template.render(
+        indent="  ", shape=yshape, last_dim="*y_dim%d" % (ndim - 1)
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = EXEC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_input_type2=elem_input_type2,
+        ndim=func_attrs["ndim"],
+        indent="  ",
+    )
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+        ndim=func_attrs["ndim"],
+    )
+
+
+@registry.reg("rocm.pad_last_dim.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name, ndim=func_attrs["ndim"])
+
+
+@registry.reg("rocm.pad_last_dim.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    xshape_args = ["&" + dim._attrs["name"] for dim in xshape]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    yshape_args = ["&" + dim._attrs["name"] for dim in yshape]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        out_ptr=y._attrs["name"],
+        xshape=xshape_args,
+        yshape=yshape_args,
+        out_dim=func_attrs["out_dim"],
+        indent=indent,
+    )

From 2eaed6cd171eaf4c8aeec931e74bb8bfb21cbe24 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Thu, 2 Feb 2023 00:16:38 +0800
Subject: [PATCH 040/638] fix a bug

---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index e0d9032b2..3590e9bed 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -299,9 +299,9 @@ std::vector<torch::Tensor> AITModelImpl::processOutputs(
     auto output = at::detail::make_tensor_base<c10::TensorImpl>(
         std::move(output_index_to_output_storage_impl.at(output_idx)),
         #ifdef AIT_USE_ROCM
-        c10::DispatchKeySet(c10::DispatchKey::CUDA),
-        #else
         c10::DispatchKeySet(c10::DispatchKey::HIP),
+        #else
+        c10::DispatchKeySet(c10::DispatchKey::CUDA),
         #endif
         scalarTypeToTypeMeta(dtype));
     const auto& size = output_shapes.at(output_idx);

From b71c23f3f5d757e44a6c0f475eb080b0498ed3e1 Mon Sep 17 00:00:00 2001
From: Wei <wwei6@meta.com>
Date: Wed, 1 Feb 2023 11:19:40 -0800
Subject: [PATCH 041/638] Update README.md (#167)

Summary:
Add fx2ait/aten2ait maintainers

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/167

Reviewed By: wushirong

Differential Revision: D42929880

Pulled By: frank-wei

fbshipit-source-id: 4a085a9f90612dde1b2757005cf58ddf7e756a8c
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 13f7580a7..c98b98db7 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,8 @@ AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://githu
 
 AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang.
 
+FX2AIT and Aten2AIT are co-created and maintained by Meta engineers: [Wei Wei](https://github.com/frank-wei), [Shirong Wu](https://github.com/wushirong) and [Zhijing Li](https://github.com/tissue3).
+
 
 ## Acknowledgement
 

From 30e3bc6ccfb8332f4aa33a53b213f701f134c445 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Wed, 1 Feb 2023 13:23:31 -0800
Subject: [PATCH 042/638] transposed_conv2d (#163)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/163

as titled

Reviewed By: terrychenism, wushirong

Differential Revision: D42768682

fbshipit-source-id: 3e863174ac950f0afb113aebc3ea10f0f80d6a1d
---
 fx2ait/fx2ait/converters/ait_converters.py    | 133 +++++++++++++++++-
 .../fx2ait/converters/aten2ait_converters.py  |  34 ++++-
 .../converters/test_ait_convtranspose2d.py    |  96 +++++++++++++
 .../test_ait_convtranspose2d_aten.py          |  95 +++++++++++++
 fx2ait/fx2ait/tools/common_fx2ait.py          |   2 +-
 python/aitemplate/compiler/public/__init__.py |   2 +
 6 files changed, 353 insertions(+), 9 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 5405f5046..5e13179e6 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -15,7 +15,7 @@
 import logging
 import math
 import operator
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Sequence, Tuple, Union
 
 import numpy as np
 
@@ -52,6 +52,8 @@
     squeeze,
     Tensor as AITTensor,
     topk,
+    transposed_conv2d,
+    transposed_conv2d_bias,
     tuple_construct,
     unsqueeze,
     var,
@@ -586,6 +588,120 @@ def acc_ops_tuple_construct(
     return tuple_construct()(*tensors)
 
 
+@ait_converter(acc_ops.conv_transpose2d)
+def acc_ops_conv_transpose2d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    output_padding = identical_elem_tuple_to_int(kwargs["output_padding"])
+    assert output_padding == 0, "output_padding is not 0!"
+
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    weight = kwargs["weight"]
+    assert isinstance(weight, AITTensor)
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    weight._attrs["shape"] = nchw2nhwc(weight._attrs["shape"])
+    w_last_dim = weight._attrs["data"].tensor.shape[-1]
+
+    bias = kwargs["bias"]
+    assert bias is None or isinstance(bias, AITTensor)
+
+    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    padding = identical_elem_tuple_to_int(kwargs["padding"])
+    dilation = identical_elem_tuple_to_int(kwargs["dilation"])
+    assert dilation == 1, "dilation {dilation} does not equal to 1!"
+    assert all(
+        isinstance(x, int) for x in [stride, padding, dilation]
+    ), "Expected int stride, padding, and dilation"
+
+    if kwargs["groups"] is None or kwargs["groups"] == 1:
+        assert (
+            w_last_dim % 8 == 0
+        ), f"cutlass needs weight output channel={w_last_dim} is not divisble by 8! This restriction may be not valid in newer version"
+
+        if bias:
+            result = transposed_conv2d_bias(
+                stride=stride, pad=padding, dilate=dilation
+            )(input_val, weight, bias)
+        else:
+            result = transposed_conv2d(stride=stride, pad=padding, dilate=dilation)(
+                input_val, weight
+            )
+    else:
+        # Grouped conv doesn't currently work on AIT CUDA, manually map
+        groups = kwargs["groups"]
+        assert (
+            w_last_dim * groups
+        ) % 8 == 0, f"cutlass needs weight output channel={w_last_dim*groups} is not divisble by 8! This restriction may be not valid in newer version"
+
+        group_size = input_val.shape()[3]._attrs["values"][0] // groups
+        w_group_size = weight.shape()[0]._attrs["values"][0] // groups
+
+        def get_channel_dim_slice_idx(start, end, step):
+            all_none_slice = slice(None, None, None)
+            return (
+                all_none_slice,
+                all_none_slice,
+                all_none_slice,
+                slice(start, end, step),
+            )
+
+        def get_batch_dim_slice_idx(start, end, step):
+            return (slice(start, end, step),)
+
+        def make_slice(x, slice_idx, name):
+            return acc_ops_slice(
+                target,
+                args,
+                {
+                    "input": x,
+                    "idx": slice_idx,
+                },
+                name,
+            )
+
+        conv_groups = [
+            _choose_conv2d_op(
+                stride,
+                padding,
+                dilation,
+                make_slice(  # input_val[:,:,:,gs*i:gs*i + gs]
+                    input_val,
+                    get_channel_dim_slice_idx(
+                        i * group_size, i * group_size + group_size, 1
+                    ),
+                    f"{name}.slice_{i}",
+                ),
+                make_slice(  # weights[wgs*i:wgs*i + wgs,]
+                    weight,
+                    get_batch_dim_slice_idx(
+                        i * w_group_size, i * w_group_size + w_group_size, 1
+                    ),
+                    f"{name}.weight.slice_{i}",
+                ),
+                None
+                if bias is None
+                else make_slice(  # bias[wgs*i:wgs*i + wgs,]
+                    bias,
+                    get_batch_dim_slice_idx(
+                        i * w_group_size, i * w_group_size + w_group_size, 1
+                    ),
+                    f"{name}.bias.slice_{i}",
+                ),
+                transposed=True,
+            )
+            for i in range(groups)
+        ]
+        result = concatenate()(conv_groups, dim=3)
+
+    return result
+
+
 @ait_converter(acc_ops.nan_to_num)
 def acc_ops_nan_to_num(
     target: Target,
@@ -826,12 +942,20 @@ def _choose_conv2d_op(
     dilate: int,
     x: AITTensor,
     weight: AITTensor,
-    bias: Optional[AITTensor],
+    bias: [AITTensor],
+    transposed: [bool] = False,
 ) -> ConverterOutput:
     """
     Helper to choose conv2d vs. conv2d_bias op based on existence of bias
     and pad channel input dim to 4/8
     """
+    if transposed:
+        if bias:
+            return transposed_conv2d_bias(stride=stride, pad=pad, dilate=dilate)(
+                x, weight, bias
+            )
+        else:
+            return transposed_conv2d(stride=stride, pad=pad, dilate=dilate)(x, weight)
     last_dim = x._attrs["shape"][-1]._attrs["values"][0]
     # CUDA conv channel dim weights need to align w/ a multiple of 2/4/8
     # if CI < 4, pad to 4; if 5 < CI < 8, pad to 8;
@@ -930,13 +1054,16 @@ def make_slice(x, slice_idx, name):
                     ),
                     f"{name}.weight.slice_{i}",
                 ),
-                make_slice(  # bias[wgs*i:wgs*i + wgs,]
+                None
+                if bias is None
+                else make_slice(  # bias[wgs*i:wgs*i + wgs,]
                     bias,
                     get_batch_dim_slice_idx(
                         i * w_group_size, i * w_group_size + w_group_size, 1
                     ),
                     f"{name}.bias.slice_{i}",
                 ),
+                transposed=False,
             )
             for i in range(groups)
         ]
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 1f8cd02ee..8fb0dee33 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -15,7 +15,7 @@
 import logging
 import torch  # isort:skip
 import operator
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import numpy
 
@@ -50,6 +50,8 @@
     split,
     squeeze,
     Tensor as AITTensor,
+    transposed_conv2d,
+    transposed_conv2d_bias,
     unsqueeze,
 )
 from fx2ait.converters.utils import (
@@ -239,12 +241,21 @@ def _choose_conv2d_op(
     dilate: int,
     x: AITTensor,
     weight: AITTensor,
-    bias: Optional[AITTensor],
+    bias: [AITTensor],
+    transposed: [bool] = False,
 ) -> ConverterOutput:
     """
     Helper to choose conv2d vs. conv2d_bias op based on existence of bias
     and pad channel input dim to 4/8
     """
+    if transposed:
+        if bias:
+            return transposed_conv2d_bias(stride=stride, pad=pad, dilate=dilate)(
+                x, weight, bias
+            )
+        else:
+            return transposed_conv2d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+
     last_dim = x._attrs["shape"][-1]._attrs["values"][0]
     # CUDA conv channel dim weights need to align w/ a multiple of 2/4/8
     # if CI < 4, pad to 4; if 5 < CI < 8, pad to 8;
@@ -292,7 +303,8 @@ def aten_ops_conv2d(
     padding = identical_elem_tuple_to_int(padding)
     dilation = args[5]
     dilation = identical_elem_tuple_to_int(dilation)
-    # TODO transposed=args[6], output_padding=args[7]
+    transposed = args[6]
+    # output_padding = args[7]
     groups = args[8]
 
     assert all(
@@ -300,10 +312,21 @@ def aten_ops_conv2d(
     ), "Expected int stride, padding, and dilation"
 
     if groups is None or groups == 1:
-        result = _choose_conv2d_op(stride, padding, dilation, input_val, weight, bias)
+        if transposed:
+            if bias:
+                result = transposed_conv2d_bias(
+                    stride=stride, pad=padding, dilate=dilation
+                )(input_val, weight, bias)
+            else:
+                result = transposed_conv2d(stride=stride, pad=padding, dilate=dilation)(
+                    input_val, weight
+                )
+        else:
+            result = _choose_conv2d_op(
+                stride, padding, dilation, input_val, weight, bias, transposed
+            )
     else:
         # Grouped conv doesn't currently work on AIT CUDA, manually map
-        # groups = kwargs["groups"]
         group_size = input_val.shape()[3]._attrs["values"][0] // groups
         w_group_size = weight.shape()[0]._attrs["values"][0] // groups
 
@@ -352,6 +375,7 @@ def make_slice(x, dim, start, end, step, name):
                     1,
                     f"{name}.bias.slice_{i}",
                 ),
+                transposed=transposed,
             )
             for i in range(groups)
         ]
diff --git a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
new file mode 100644
index 000000000..a4fcb4f35
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
@@ -0,0 +1,96 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestConvtTranspose2dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("default", 1),
+            param("no_bias", 2, bias=False),
+            param("tuple_parameters", 1, (1, 1), (1, 1)),
+            param("non_zero_padding", 1, padding=1),
+            param("non_unary_params", 3, 2, padding=1, bias=False),
+        ]
+    )
+    def test_convtranspose(
+        self,
+        name,
+        kernel_size,
+        stride=2,
+        padding=0,
+        dilation=1,  # only support dilation = 1
+        groups=1,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.convtranspose = torch.nn.ConvTranspose2d(
+                    192,
+                    256,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    output_padding=0,
+                    groups=groups,
+                    bias=bias,
+                    dilation=dilation,
+                )
+
+            def forward(self, x):
+                return self.convtranspose(x)
+
+        model = TestModule().cuda().half().eval()
+        inputs = [torch.randn(1, 192, 28, 28).cuda().half()]
+        _ = model(*inputs)
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.conv_transpose2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    # only works when in_ch == out_ch
+    def test_convtranspose_multi_group(
+        self,
+        name="multi_group",
+        kernel_size=2,
+        stride=2,
+        padding=0,
+        dilation=1,  # only support dilation = 1
+        groups=2,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.convtranspose = torch.nn.ConvTranspose2d(
+                    192,
+                    192,  # must to divisblce by 8
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    output_padding=0,
+                    groups=groups,
+                    bias=bias,
+                    dilation=dilation,
+                )
+
+            def forward(self, x):
+                return self.convtranspose(x)
+
+        model = TestModule().cuda().half().eval()
+        inputs = [torch.randn(1, 192, 28, 28).cuda().half()]
+        _ = model(*inputs)
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.conv_transpose2d},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
new file mode 100644
index 000000000..dbad335a0
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
@@ -0,0 +1,95 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestConvtTranspose2dConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default", 1),
+            param("no_bias", 2, bias=False),
+            param("tuple_parameters", 1, (1, 1), (1, 1)),
+            param("non_zero_padding", 1, padding=1),
+            param("non_unary_params", 3, 2, padding=1, bias=False),
+        ]
+    )
+    def test_convtranspose(
+        self,
+        name,
+        kernel_size,
+        stride=2,
+        padding=0,
+        dilation=1,  # only support dilation = 1
+        groups=1,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.convtranspose = torch.nn.ConvTranspose2d(
+                    192,
+                    256,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    output_padding=0,
+                    groups=groups,
+                    bias=bias,
+                    dilation=dilation,
+                )
+
+            def forward(self, x):
+                return self.convtranspose(x)
+
+        model = TestModule().cuda().half().eval()
+        inputs = [torch.randn(1, 192, 28, 28).cuda().half()]
+        _ = model(*inputs)
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={torch.ops.aten.convolution.default},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    # # only works when in_ch == out_ch
+    def test_convtranspose_multi_group(
+        self,
+        name="multi_group",
+        kernel_size=2,
+        stride=2,
+        padding=0,
+        dilation=1,  # only support dilation = 1
+        groups=2,
+        bias=True,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.convtranspose = torch.nn.ConvTranspose2d(
+                    192,
+                    192,  # must to divisblce by 8
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    output_padding=0,
+                    groups=groups,
+                    bias=bias,
+                    dilation=dilation,
+                )
+
+            def forward(self, x):
+                return self.convtranspose(x)
+
+        model = TestModule().cuda().half().eval()
+        inputs = [torch.randn(1, 192, 28, 28).cuda().half()]
+        _ = model(*inputs)
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={torch.ops.aten.convolution.default},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index af38ab68c..2c6b6d19b 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -93,7 +93,7 @@ def run_test(
                 torch.nn.MultiheadAttention if transformer_mode else None
             ],
         )
-        print(mod)
+        print(mod.graph)
 
         original_inputs = inputs
         if permute_inputs:
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index d0b664a3b..4dcb936f5 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -57,6 +57,8 @@
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
 from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
+from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
+from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
 from aitemplate.compiler.ops.layernorm.group_layernorm import group_layernorm
 from aitemplate.compiler.ops.layernorm.group_layernorm_sigmoid_mul import (
     group_layernorm_sigmoid_mul,

From 57a0a86f57acff047575052dc2775c82d0d55f1a Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Wed, 1 Feb 2023 13:23:31 -0800
Subject: [PATCH 043/638] sin,cos,sqrt,clone support (#164)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/164

as titled

Reviewed By: terrychenism

Differential Revision: D42898874

fbshipit-source-id: 2df64af6bcf102a0ce40f1c5ab8472370d012904
---
 fx2ait/fx2ait/converters/ait_converters.py    | 47 +++++++++++++++++++
 .../fx2ait/converters/aten2ait_converters.py  | 47 +++++++++++++++++++
 .../test/converters/test_ait_unary_ops.py     |  6 ++-
 .../test_ait_unary_ops_aten.py                | 13 +++--
 4 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 5e13179e6..9157a8a4d 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import copy
 import logging
 import math
 import operator
@@ -143,6 +144,52 @@ def acc_ops_tanh(
     return elementwise(FuncEnum.TANH)(input_val)
 
 
+@ait_converter(acc_ops.sin)
+def acc_ops_sin(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    return elementwise(FuncEnum.SIN)(input_val)
+
+
+@ait_converter(acc_ops.cos)
+def acc_ops_cos(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    return elementwise(FuncEnum.COS)(input_val)
+
+
+@ait_converter(acc_ops.sqrt)
+def acc_ops_sqrt(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    return elementwise(FuncEnum.SQRT)(input_val)
+
+
+@ait_converter(acc_ops.clone)
+def acc_ops_clone(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    res = copy.deepcopy(input_val)
+    res._attrs["dst_ops"].clear()
+    return res
+
+
 @ait_converter(acc_ops.sum)
 def acc_ops_sum(
     target: Target,
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 8fb0dee33..ab04c07b0 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -14,6 +14,7 @@
 #
 import logging
 import torch  # isort:skip
+import copy
 import operator
 from typing import Dict, List, Tuple, Union
 
@@ -384,6 +385,30 @@ def make_slice(x, dim, start, end, step, name):
     return result
 
 
+@ait_converter(torch.ops.aten.clone.default)
+def aten_unary_ops_clone(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    res = copy.deepcopy(input_val)
+    res._attrs["dst_ops"].clear()
+    return res
+
+
+@ait_converter(torch.ops.aten.cos.default)
+def aten_unary_ops_cos(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    return elementwise(FuncEnum.COS)(input_val)
+
+
 @ait_converter(aten_compose_chunk)
 @ait_converter(torch.ops.aten.chunk.default)
 def aten_ops_chunk(
@@ -1108,6 +1133,28 @@ def aten_unary_ops_sign(
     return elementwise(FuncEnum.SIGN)(input_val)
 
 
+@ait_converter(torch.ops.aten.sin.default)
+def aten_unary_ops_sin(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    return elementwise(FuncEnum.SIN)(input_val)
+
+
+@ait_converter(torch.ops.aten.sqrt.default)
+def aten_unary_ops_sqrt(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = args[0]
+    return elementwise(FuncEnum.SQRT)(input_val)
+
+
 @ait_converter(torch.ops.aten.tanh.default)
 def aten_unary_ops_tanh(
     target: Target,
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index 12eac27b6..0b75f85da 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -26,6 +26,10 @@
     (torch.sign, acc_ops.sign),
     (torch.log, acc_ops.log),
     (torch.relu, acc_ops.relu),
+    (torch.sin, acc_ops.sin),
+    (torch.cos, acc_ops.cos),
+    (torch.sqrt, acc_ops.sqrt),
+    (torch.clone, acc_ops.clone),
 ]
 
 
@@ -34,7 +38,7 @@ class TestUnaryOpsConverter(AITTestCase):
     def test_unary_ops(self, name, orig_op: Callable, expected_op):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return orig_op(x)
+                return orig_op(x) * 2
 
         model = TestModule().cuda().half()
         inputs = [
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
index c5a50048e..09a4d2bbf 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
@@ -26,6 +26,13 @@
     (torch.sigmoid, torch.ops.aten.sigmoid.default),
     (torch.sign, torch.ops.aten.sign.default),
     (torch.tanh, torch.ops.aten.tanh.default),
+    (torch.sin, torch.ops.aten.sin.default),
+    (torch.cos, torch.ops.aten.cos.default),
+    (torch.sqrt, torch.ops.aten.sqrt.default),
+    (
+        torch.clone,
+        torch.ops.aten.clone.default,
+    ),  # clone op can not be the output directly
 ]
 
 
@@ -34,20 +41,20 @@ class TestUnaryOpsConverter(DispatchTestCase):
     def test_unary_ops(self, name, orig_op: Callable, expected_op):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return orig_op(x)
+                return orig_op(x) * 2
 
         model = TestModule().cuda().half()
         inputs = [
             torch.randn(1, 2, 3).half().cuda(),
         ]
-
+        _ = model(*inputs)
         self.run_test(model, inputs, expected_ops={expected_op})
 
     @parameterized.expand([(op[1].__name__, op[0], op[1]) for op in unary_ops])
     def test_dynamic_unary_ops(self, name, orig_op: Callable, expected_op):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return orig_op(x)
+                return orig_op(x) * 2
 
         model = TestModule().cuda().half()
         inputs_spec = TensorSpec.create_spec_from_shapes(

From 94822fec86884664e96e9cad6854bb0bf51e62df Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 1 Feb 2023 19:34:00 -0800
Subject: [PATCH 044/638] add bfloat16 test coverage for bmm (#172)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/172

it works out of the box

Reviewed By: ipiszy

Differential Revision: D42949236

fbshipit-source-id: 00419ce80bd45e5ecdb8a182e5f221946580f6a8
---
 tests/unittest/ops/test_bmm.py | 69 ++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 347a0711b..117c7a30a 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -189,6 +189,43 @@ def test_bmm_float(self):
             [1, 9, 11], M=64, N=32, K=16, test_name="dynamic_b_float", dtype="float"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_bfloat16(self):
+        self._test_rcr(
+            [128], [64], N=8, K=64, test_name="static_bfloat16", dtype="bfloat16"
+        )
+        self._test_rcr(
+            [1, 5, 77, 128],
+            [32],
+            N=16,
+            K=64,
+            test_name="dynamic_b_bfloat16",
+            dtype="bfloat16",
+        )
+        self._test_crr(
+            [1, 2, 5],
+            [3, 6, 8],
+            M=24,
+            N=64,
+            test_name="dynamic_bk_bfloat16",
+            dtype="bfloat16",
+        )
+        self._test_rrr(
+            [8], [4, 7, 9], K=64, N=32, test_name="dynamic_m_bfloat16", dtype="bfloat16"
+        )
+        self._test_ccr(
+            [1, 9, 11],
+            M=64,
+            N=32,
+            K=16,
+            test_name="dynamic_b_bfloat16",
+            dtype="bfloat16",
+        )
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):
@@ -429,6 +466,38 @@ def test_bmm_broadcast_float(self):
         self._test_ccr([1, 8, 16], [2, 32, 8], "broadcastable_a", dtype="float")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b", dtype="float")
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_broadcast_bfloat16(self):
+        self._test_rcr_with_accessors(dtype="bfloat16")
+        self._test_rcr_merge_with_accessors(dtype="bfloat16")
+        self._test_rcr(
+            [2, 16, 8], [1, 32, 8], "broadcastable_b_bfloat16", dtype="bfloat16"
+        )
+        self._test_rcr(
+            [16, 8], [8, 32, 8], "2d_broadcastable_a_bfloat16", dtype="bfloat16"
+        )
+        self._test_crr(
+            [1, 8, 16], [2, 8, 32], "broadcastable_a_bfloat16", dtype="bfloat16"
+        )
+        self._test_crr(
+            [8, 8, 16], [8, 32], "2d_broadcastable_b_bfloat16", dtype="bfloat16"
+        )
+        self._test_rrr(
+            [2, 16, 8], [1, 8, 32], "broadcastable_b_bfloat16", dtype="bfloat16"
+        )
+        self._test_rrr(
+            [16, 8], [8, 8, 32], "2d_broadcastable_a_bfloat16", dtype="bfloat16"
+        )
+        self._test_ccr(
+            [1, 8, 16], [2, 32, 8], "broadcastable_a_bfloat16", dtype="bfloat16"
+        )
+        self._test_ccr(
+            [8, 8, 16], [32, 8], "2d_broadcastable_b_bfloat16", dtype="bfloat16"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From e5d4e96e7d4cb6d90dda4449602d3b94c75f8ff3 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 1 Feb 2023 19:51:31 -0800
Subject: [PATCH 045/638] add bfloat16 test coverage for bmm_add (#173)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/173

it works out of the box

Reviewed By: ipiszy

Differential Revision: D42950381

fbshipit-source-id: 8eb407c2dd1289603588f647afe24614bf04310b
---
 tests/unittest/ops/test_bmm_add.py | 44 ++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 45fc2fd90..d1d391353 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -152,6 +152,17 @@ def test_bmm_add_float(self):
         )
         self._test_crr(B=8, M=32, K=16, N=64, dtype="float")
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_add_bfloat16(self):
+        self._test_rrr(B=8, M=32, K=8, N=64, dtype="bfloat16")
+        self._test_ccr(
+            B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_bfloat16", dtype="bfloat16"
+        )
+        self._test_crr(B=8, M=32, K=16, N=64, dtype="bfloat16")
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):
@@ -309,7 +320,7 @@ def test_ccr(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_bmm_broadcast_float(self):
+    def test_bmm_add_broadcast_float(self):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
@@ -321,17 +332,44 @@ def test_bmm_broadcast_float(self):
             [1, 16, 8],
             [2, 8, 32],
             bias_shape=[1, 32],
-            test_name="broadcastable_bias1d_2",
+            test_name="broadcastable_bias1d_2_float",
             dtype="float",
         )
         self._test_ccr(
             [1, 8, 16],
             [2, 32, 8],
             bias_shape=[1, 16, 32],
-            test_name="broadcastable_bias3d",
+            test_name="broadcastable_bias3d_float",
             dtype="float",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_add_broadcast_bfloat16(self):
+        self._test_crr(
+            [1, 8, 16],
+            [2, 8, 32],
+            bias_shape=[16, 32],
+            test_name="broadcastable_bias2d_bfloat16",
+            dtype="bfloat16",
+        )
+        self._test_rrr(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 32],
+            test_name="broadcastable_bias1d_2_bfloat16",
+            dtype="bfloat16",
+        )
+        self._test_ccr(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name="broadcastable_bias3d_bfloat16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From cc98029f826b45dd08b22d7388d6a1f8fad78db3 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Wed, 1 Feb 2023 21:06:54 -0800
Subject: [PATCH 046/638] Tensor idx list concatenation (#171)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/171

Reviewed By: wushirong

Differential Revision: D42949594

fbshipit-source-id: 0649d738a2d87dec1c87c323c43425acdad45559
---
 fx2ait/fx2ait/converters/ait_converters.py    | 23 +++++++++++++++++++
 .../converters/test_ait_convtranspose2d.py    | 14 +++++++++++
 .../test/converters/test_ait_slice_tensor.py  | 16 +++++++++++++
 .../test_ait_convtranspose2d_aten.py          | 14 +++++++++++
 4 files changed, 67 insertions(+)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 9157a8a4d..96b36468f 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -466,6 +466,29 @@ def acc_ops_getitem(
 ) -> ConverterOutput:
     input_val = kwargs["input"]
     idx = kwargs["idx"]
+    if isinstance(idx, Sequence) and any(isinstance(x, Sequence) for x in idx):
+        count = 0
+        dim = None
+        s = None
+        for d, x in enumerate(idx):
+            if isinstance(x, Sequence):
+                count += 1
+                dim = d
+                s = x
+        # TODO: Because multi-list concatenations e.g. x[[0,1],[0,2]] have broadcast implications
+        # which requires careful pre-conditions and complicated calculations,
+        # we ignore the situation for now and may add support per request.
+        assert count == 1, "Expected only one dimension with list concatenation."
+
+        # For list concatenations, we first take slices and then concate them back
+        # In terms of performance, AIT backend will take care of fusing these ops.
+        groups = []
+        kw = {"input": input_val}
+        for x in s:
+            kw["idx"] = idx[:dim] + (x,) + idx[dim + 1 :]
+            groups.append(unsqueeze(dim)(acc_ops_slice(target, args, kw, name)))
+        return concatenate()(groups, dim=dim)
+
     if isinstance(idx, slice) or (
         isinstance(idx, Sequence) and any(isinstance(x, slice) for x in idx)
     ):
diff --git a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
index a4fcb4f35..59d761698 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
@@ -1,4 +1,18 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
 from fx2ait.acc_tracer import acc_ops
diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index ef62c6d41..6b95bd608 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -53,6 +53,22 @@ class TestSliceTensor(AITTestCase):
                 "slice_zero_slice",
                 (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
             ),
+            (
+                "slice_list_slice",
+                (slice(0, 1, None), [2], slice(0, 10, None)),
+            ),
+            (
+                "zero_list_zero",
+                (slice(0, 1, None), [0, 7, 5, 3, 1, 9], slice(0, 0, None)),
+            ),
+            (
+                "all_list_all",
+                (slice(None, None, None), [2, 2, 2, 2], slice(None, None, None)),
+            ),
+            (
+                "slice_zero_list",
+                (slice(0, 1, None), slice(0, 0, None), [0, 1, 3]),
+            ),
         ]
     )
     def test_slice_tensor(self, name, idx):
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
index dbad335a0..ebf6dd83c 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
@@ -1,4 +1,18 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 import torch
 from fx2ait.tools.common_aten2ait import DispatchTestCase

From c70dfde071bcd277c3712441c904f184dd5f655e Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Wed, 1 Feb 2023 23:02:19 -0800
Subject: [PATCH 047/638] clone op synced to acc_ops.py (#176)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/176

Reviewed By: ipiszy

Differential Revision: D42952270

fbshipit-source-id: 8dc52ca17a1301637a9d2629a32f093c67044791
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 7f6395353..047e1d887 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -3220,6 +3220,12 @@ def baddbmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
         return add_node
 
 
+@register_acc_op_mapping(op_and_target=("call_function", torch.clone))
+@register_acc_op
+def clone(*, input):
+    return torch.clone(input)
+
+
 ###############################################################################
 
 # Set ops as side-effectul, this prevents them from being optimized away or

From a9f6b25e20bc8e529f10df980284d81103b4edbe Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 1 Feb 2023 23:28:18 -0800
Subject: [PATCH 048/638] add bfloat16 test coverage for {bmm,gemm}_permute
 (#175)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/175

works out of the box

Reviewed By: chenyang78

Differential Revision: D42951480

fbshipit-source-id: 028432004cc47041ef1d333ea45ad28ed960ff06
---
 tests/unittest/ops/test_bmm_permute.py  | 34 +++++++++++++++++++++
 tests/unittest/ops/test_gemm_permute.py | 39 +++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index 42cc684e5..c989fdeae 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -138,6 +138,40 @@ def test_bmm_permute_float(self):
             [10], [8], N=64, K=88, d1=10, test_name="permute3_float", dtype="float"
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_permute_bfloat16(self):
+        self._test_rrr(
+            [10],
+            [8],
+            N=88,
+            K=64,
+            d1=10,
+            test_name="permute3_bfloat16",
+            dtype="bfloat16",
+        )
+        self._test_rrr(
+            [10],
+            [8],
+            N=88,
+            K=64,
+            d1=10,
+            test_name="permute3_copy_op_bfloat16",
+            copy_op=True,
+            dtype="bfloat16",
+        )
+        self._test_rcr(
+            [10],
+            [8],
+            N=64,
+            K=88,
+            d1=10,
+            test_name="permute3_bfloat16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_gemm_permute.py b/tests/unittest/ops/test_gemm_permute.py
index 3f1ccc1d0..13fcd34bf 100644
--- a/tests/unittest/ops/test_gemm_permute.py
+++ b/tests/unittest/ops/test_gemm_permute.py
@@ -256,6 +256,45 @@ def test_permute_float(self):
             dtype="float",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_permute_bfloat16(self):
+        for has_bias in (True, False):
+            for copy_op in (True, False):
+                self._test_rcr(
+                    [80],
+                    32,
+                    96,
+                    (5, 3, 2),
+                    "permute1_bfloat16",
+                    has_bias=has_bias,
+                    copy_op=copy_op,
+                    dtype="bfloat16",
+                )
+        self._test_rcr_0213(
+            [29, 29 * 8],
+            256,
+            300000,
+            [29, 100000],
+            "permute_0213_2_bfloat16",
+            has_bias=False,
+            copy_op=False,
+            layout="0213",
+            dtype="bfloat16",
+        )
+        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_bfloat16", dtype="bfloat16")
+        self._test_rrr(
+            [128],
+            64,
+            256,
+            (8, 4, 4),
+            "permute2_copy_op_bfloat16",
+            copy_op=True,
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From d70d45a236f7d751f37f222de9049ac0da4f59ec Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Thu, 2 Feb 2023 10:53:42 -0800
Subject: [PATCH 049/638] Incorporate op disallow list in AIT splitter settings
 (#169)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/169

Add disallow list to AIT splitting settings to exclude certain nodes for lowering.

Reviewed By: amateurcoffee, wushirong

Differential Revision: D42901736

fbshipit-source-id: 7252504067cc39a49888fd372503ca17baf15ca1
---
 fx2ait/fx2ait/ait_splitter.py           | 12 +++-
 fx2ait/fx2ait/test/test_ait_splitter.py | 89 +++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 fx2ait/fx2ait/test/test_ait_splitter.py

diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index d946a2416..3b78fc065 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -102,6 +102,7 @@ class AITSplitterSettings(splitter_base._SplitterSettingBase):
     def __init__(self, min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE):
         super().__init__()
         self.min_acc_module_size = min_acc_module_size
+        self.exclude_support_node_name: set = set()
 
 
 class AITSplitter(splitter_base._SplitterBase):
@@ -115,7 +116,16 @@ def __init__(
         if not settings:
             settings = AITSplitterSettings()
         if not operator_support:
-            operator_support = create_ait_operator_support()
+            operator_support = create_ait_operator_support(
+                op_lowering_disallow_list=settings.exclude_support_node_name
+            )
+        else:
+            operator_support = ops.chain(
+                operator_support,
+                ops.OpSupports.decline_if_node_in_names(
+                    settings.exclude_support_node_name
+                ),
+            )
         super().__init__(
             module,
             sample_input,
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
new file mode 100644
index 000000000..36d0e9006
--- /dev/null
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -0,0 +1,89 @@
+import torch
+from fx2ait.acc_tracer import acc_tracer
+from fx2ait.ait_splitter import (  # @manual=//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait
+    AITSplitter,
+    AITSplitterSettings,
+)
+from fx2ait.tools.common_fx2ait import AITTestCase
+from torch.fx.passes import operator_support as op_support
+
+
+class TestSplit(AITTestCase):
+    def test_exclude_support_node_by_name(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, a):
+                b = torch.sin(a)
+                c = torch.relu(b)
+                d = torch.cos(c)
+                e = torch.sigmoid(d)
+                f = torch.tanh(e)
+                return f
+
+        # Support all ops
+        _support_dict = {
+            "acc_ops.sin": None,
+            "acc_ops.cos": None,
+            "acc_ops.relu": None,
+            "acc_ops.sigmoid": None,
+            "acc_ops.tanh": None,
+        }
+        custom_op_support = op_support.OperatorSupport(_support_dict)
+
+        # With no ops excluded, the entire module should be lowered
+        # into one acc graph
+        mod = acc_tracer.trace(TestModule(), [torch.randn(2, 3)])
+        settings = AITSplitterSettings(min_acc_module_size=0)
+        splitter = AITSplitter(
+            mod,
+            (torch.randn(2, 3),),
+            custom_op_support,
+            settings,
+        )
+
+        res_no_exclusion = splitter.generate_split_results()
+        split_named_mods = dict(res_no_exclusion.split_module.named_children())
+        self.assertEqual(len(split_named_mods), 1)
+        self.assertIn("_run_on_acc_0", split_named_mods)
+
+        # Add "relu" to exclude_support_node_name
+        # The graph should be split into 3 parts now(_run_on_acc_0, _run_on_gpu_1, _run_on_acc_2)
+        mod = acc_tracer.trace(TestModule(), [torch.randn(2, 3)])
+        settings.exclude_support_node_name.add("relu_1")
+        splitter = AITSplitter(
+            mod,
+            (torch.randn(2, 3),),
+            custom_op_support,
+            settings,
+        )
+        res_post_exclusion = splitter.generate_split_results()
+        split_named_mods = dict(res_post_exclusion.split_module.named_children())
+        self.assertEqual(len(split_named_mods), 3)
+        self.assertIn("_run_on_acc_0", split_named_mods)
+        self.assertIn("_run_on_gpu_1", split_named_mods)
+        self.assertIn("_run_on_acc_2", split_named_mods)
+
+        run_on_acc_0_nodes = [
+            n
+            for n in split_named_mods["_run_on_acc_0"].graph.nodes
+            if n.op == "call_function"
+        ]
+        self.assertEqual(len(run_on_acc_0_nodes), 1)
+        self.assertEqual(acc_tracer.acc_ops.sin, run_on_acc_0_nodes[0].target)
+
+        run_on_gpu_1_nodes = [
+            n
+            for n in split_named_mods["_run_on_gpu_1"].graph.nodes
+            if n.op == "call_function"
+        ]
+        self.assertEqual(len(run_on_gpu_1_nodes), 1)
+        self.assertEqual(acc_tracer.acc_ops.relu, run_on_gpu_1_nodes[0].target)
+
+        run_on_acc_2_nodes = [
+            n
+            for n in split_named_mods["_run_on_acc_2"].graph.nodes
+            if n.op == "call_function"
+        ]
+        self.assertEqual(len(run_on_acc_2_nodes), 3)
+        self.assertEqual(acc_tracer.acc_ops.cos, run_on_acc_2_nodes[0].target)
+        self.assertEqual(acc_tracer.acc_ops.sigmoid, run_on_acc_2_nodes[1].target)
+        self.assertEqual(acc_tracer.acc_ops.tanh, run_on_acc_2_nodes[2].target)

From b469e30ad62addbe716b55f65118eed38d65f2eb Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Thu, 2 Feb 2023 10:53:42 -0800
Subject: [PATCH 050/638] Split nodes w/ float64 inputs from lowering (#170)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/170

Lowering fails on input nodes where the input dtype is float64 since it isn't supported where the input is float64 (ex. fails https://fburl.com/code/40ha9aegin TRT). This removes all such node types from the supported nodes for lowering.

Reviewed By: houseroad, wushirong

Differential Revision: D42901735

fbshipit-source-id: 9fa6469ccb9d00320d78684d748fe1a7e5c3cf60
---
 fx2ait/fx2ait/ait_splitter.py           |  1 +
 fx2ait/fx2ait/test/test_ait_splitter.py | 45 +++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index 3b78fc065..742a0062a 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -86,6 +86,7 @@ def create_ait_operator_support(
         # 1. We only support subgraphs with torch.Tensor inputs for now
         ops.OpSupports.decline_if_input_dtype(torch.int64),
         ops.OpSupports.decline_if_input_dtype(torch.int32),
+        ops.OpSupports.decline_if_input_dtype(torch.float64),
         ops.OpSupports.decline_if_input_dtype(dict),
         # 2. Node is supported if it has AIT converter:
         supported_if_converter_registered,
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
index 36d0e9006..e6aa468ba 100644
--- a/fx2ait/fx2ait/test/test_ait_splitter.py
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -3,6 +3,7 @@
 from fx2ait.ait_splitter import (  # @manual=//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait
     AITSplitter,
     AITSplitterSettings,
+    create_ait_operator_support,
 )
 from fx2ait.tools.common_fx2ait import AITTestCase
 from torch.fx.passes import operator_support as op_support
@@ -87,3 +88,47 @@ def forward(self, a):
         self.assertEqual(acc_tracer.acc_ops.cos, run_on_acc_2_nodes[0].target)
         self.assertEqual(acc_tracer.acc_ops.sigmoid, run_on_acc_2_nodes[1].target)
         self.assertEqual(acc_tracer.acc_ops.tanh, run_on_acc_2_nodes[2].target)
+
+    def test_decline_if_input_dtype(self):
+        operator_support = create_ait_operator_support()
+
+        class TestModule(torch.nn.Module):
+            def forward(self, a):
+                b = torch.relu(a)
+                return b
+
+        test_mod = TestModule().cuda().eval()
+        x = torch.randn(2, 3)
+        mod = acc_tracer.trace(test_mod, [x])
+        settings = AITSplitterSettings()
+        settings.min_acc_module_size = 0
+        # nodes w/ float16 input should be lowered
+        splitter = AITSplitter(
+            mod,
+            (x.half().cuda(),),
+            operator_support,
+            settings,
+        )
+        split_results_half = splitter.generate_split_results()
+        self.assertTrue(len(split_results_half), 1)
+        self.assertEqual(
+            dict(split_results_half.split_module.named_children()).keys(),
+            {"_run_on_acc_0"},
+        )
+
+        # nodes w/ float64 input should not be lowered
+        mod = acc_tracer.trace(test_mod, [x])
+        splitter = AITSplitter(
+            mod,
+            (x.double().cuda(),),
+            operator_support,
+            settings,
+        )
+
+        split_results_double = splitter.generate_split_results()
+
+        self.assertTrue(len(split_results_double), 1)
+        self.assertEqual(
+            dict(split_results_double.split_module.named_children()).keys(),
+            {"_run_on_gpu_0"},
+        )

From c973f207d1e3e1cf4d2c0dafebbf90e6a9cd0034 Mon Sep 17 00:00:00 2001
From: Bing Tian <bingt@meta.com>
Date: Thu, 2 Feb 2023 11:24:45 -0800
Subject: [PATCH 051/638] T137322293: [aten2ait] Add unittest for
 torch.ops.aten.hardtanh (#177)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/177

Reviewed By: tissue3

Differential Revision: D42934395

fbshipit-source-id: 796f757e0c463ce73c3b1e8a8ae632c0789ebccb
---
 .../fx2ait/converters/aten2ait_converters.py  |  2 +-
 .../converters_aten/test_ait_hardtanh_aten.py | 64 +++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py

diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index ab04c07b0..12f9ada77 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -917,7 +917,7 @@ def aten_ops_hardtanh(
     input_val = args[0]
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Unexpected input for {name}: {input_val}")
-    result = elementwise(FuncEnum.TANH)(input_val)
+    result = input_val
     minimal = args[1] if len(args) > 1 else -1
     maximum = args[2] if len(args) > 2 else 1
     if minimal is not None:
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py
new file mode 100644
index 000000000..a94b6bbab
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py
@@ -0,0 +1,64 @@
+import torch
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_aten2ait import DispatchTestCase
+from parameterized import param, parameterized
+
+
+class TestHardTanhConverter(DispatchTestCase):
+    @parameterized.expand(
+        [
+            param("default", min=-1.5, max=3),
+            param("min", min=-1.5),
+            param("max", max=3),
+        ]
+    )
+    def test_hardtanh(self, name, min=-1, max=1):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.nn.Hardtanh(min, max)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.op(x)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={torch.ops.aten.hardtanh.default})
+
+    @parameterized.expand(
+        [
+            param("default", min=-1.2, max=2),
+            param("min", min=-1.2),
+            param("max", max=2),
+        ]
+    )
+    def test_dynamic_hardtanh(self, name, min=-1, max=1):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.nn.Hardtanh(min, max)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.op(x)
+
+        model = TestModule().cuda().half()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 8, 10],
+            ],
+            inputs_max=[
+                [20, 12, 32],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={torch.ops.aten.hardtanh.default}
+        )

From 3aa84a0a4c922f81cfff3b097044682c8df6c726 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 2 Feb 2023 13:54:34 -0800
Subject: [PATCH 052/638] add bfloat16 test coverage for gemm_bias_broadcast
 (#182)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/182

works out of the box

Reviewed By: chenyang78

Differential Revision: D42971435

fbshipit-source-id: da210bd8c010cfff170f3c11f32998849153482b
---
 .../unittest/ops/test_gemm_bias_broadcast.py  | 55 ++++++++++---------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index b0ccfe05a..a802b5eb4 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -14,7 +14,6 @@
 #
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -41,7 +40,7 @@ def _init_tensors(self, m, k, n, m0=None, m1=None, dtype="float16"):
         self.D1_pt = get_random_torch_tensor([*m_shape, n], dtype)
 
     def _test_and_verify(
-        self, module, numpy_output, dtype, has_d1=False, module_output_name="output_0"
+        self, module, torch_output, dtype, has_d1=False, module_output_name="output_0"
     ):
         inputs = {
             "input_0": self.X_pt,
@@ -51,14 +50,12 @@ def _test_and_verify(
         }
         if has_d1:
             inputs["d1"] = self.D1_pt
-        y = get_torch_empty_tensor(list(numpy_output.shape), dtype)
+        y = get_torch_empty_tensor(list(torch_output.shape), dtype)
         module.run_with_tensors(inputs, [y])
         if self.X_pt.nelement() == 0 or self.W_pt.nelement() == 0:
             pass
         else:
-            np.testing.assert_allclose(
-                numpy_output, y.cpu().numpy(), atol=1e-1, rtol=1e-1
-            )
+            torch.testing.assert_close(torch_output, y, atol=1e-1, rtol=1e-1)
 
     def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -75,8 +72,7 @@ def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
             * self.D0_pt
             + self.D1_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype, has_d1=True)
+        self._test_and_verify(module, Y_pt, dtype, has_d1=True)
 
     def test_bias_rcr_mul_add(self):
         self._test_bias_rcr_mul_add(8, None, None, 8, 8)
@@ -104,8 +100,7 @@ def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
             )
             * self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_sigmoid_mul(self):
         self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8)
@@ -133,8 +128,7 @@ def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
             )
             * self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_sigmoid_mul_tanh(self):
         self._test_bias_rcr_sigmoid_mul_tanh(8, None, None, 8, 8)
@@ -161,8 +155,7 @@ def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
             + self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_add(self):
         self._test_bias_rcr_add(8, None, None, 8, 8)
@@ -188,8 +181,7 @@ def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
             + self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_add_relu(self):
         self._test_bias_rcr_add_relu(8, None, None, 8, 8)
@@ -216,8 +208,7 @@ def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
             + self.D0_pt
             + self.D1_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype, has_d1=True)
+        self._test_and_verify(module, Y_pt, dtype, has_d1=True)
 
     def test_bias_rcr_add_add_relu(self):
         target = detect_target()
@@ -250,8 +241,7 @@ def _test_bias_rcr_mul(self, m, m0, m1, k, n, dtype="float16"):
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
             * self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_mul(self):
         self._test_bias_rcr_mul(8, None, None, 8, 8)
@@ -278,8 +268,7 @@ def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
             + self.D0_pt
             + self.D1_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype, has_d1=True)
+        self._test_and_verify(module, Y_pt, dtype, has_d1=True)
 
     def test_bias_rcr_add_add(self):
         self._test_bias_rcr_add_add(8, None, None, 8, 8)
@@ -306,8 +295,7 @@ def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
             * self.D0_pt
         )
-        Y_np = Y_pt.cpu().numpy()
-        self._test_and_verify(module, Y_np, dtype)
+        self._test_and_verify(module, Y_pt, dtype)
 
     def test_bias_rcr_mul_tanh(self):
         self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
@@ -320,7 +308,7 @@ def test_bias_rcr_mul_tanh(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_bias_broadcast_float(self):
+    def test_gemm_bias_broadcast_float(self):
         self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="float")
         self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="float")
         self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="float")
@@ -332,6 +320,23 @@ def test_bias_broadcast_float(self):
         self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="float")
         self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="float")
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_bias_broadcast_bfloat16(self):
+        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="bfloat16")
+        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
+
 
 if __name__ == "__main__":
     unittest.main()

From 1d16e9920a68f6bb74fa1c823f2758e64c90adc1 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 2 Feb 2023 15:10:33 -0800
Subject: [PATCH 053/638] Added missing copyright headers (#184)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/184

Reviewed By: jianyuh

Differential Revision: D42974871

Pulled By: chenyang78

fbshipit-source-id: d447590815efba9d3ddd6cf8dac67ab94ecf3696
---
 .../test/converters_aten/test_ait_hardtanh_aten.py | 14 ++++++++++++++
 fx2ait/fx2ait/test/test_ait_splitter.py            | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py
index a94b6bbab..f974ebbb3 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_hardtanh_aten.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
index e6aa468ba..990c37cc3 100644
--- a/fx2ait/fx2ait/test/test_ait_splitter.py
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -1,3 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import torch
 from fx2ait.acc_tracer import acc_tracer
 from fx2ait.ait_splitter import (  # @manual=//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait

From c2ebb9f5ec05dd8b732ab6a84e74dda85cfb84e8 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Thu, 2 Feb 2023 15:18:03 -0800
Subject: [PATCH 054/638] cover the bert example with the internal CI (#179)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/179

Included the bert example into the internal test-suites

Reviewed By: tenpercent

Differential Revision: D42958030

fbshipit-source-id: db788ecc68952e29b5c1a3d5a866238c26ebad9d
---
 examples/03_bert/benchmark_ait.py        |  4 +-
 examples/03_bert/demo.py                 | 20 +++++---
 examples/03_bert/modeling/torch_model.py |  8 ++--
 examples/03_bert/test_correctness.py     | 58 ++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 14 deletions(-)
 create mode 100644 examples/03_bert/test_correctness.py

diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 9847cb910..54e3c8e9a 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -25,8 +25,8 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
-from modeling.bert import BertBaseEncodersOnly, BertBaseUncased
-from modeling.torch_model import BertBaseUncased as BertPt
+from .modeling.bert import BertBaseEncodersOnly, BertBaseUncased
+from .modeling.torch_model import BertBaseUncased as BertPt
 
 
 def mark_output(y: Tensor) -> None:
diff --git a/examples/03_bert/demo.py b/examples/03_bert/demo.py
index d783b6423..f23dcf9d7 100644
--- a/examples/03_bert/demo.py
+++ b/examples/03_bert/demo.py
@@ -16,13 +16,14 @@
 
 import torch
 
-from benchmark_ait import compile_module
-from modeling.torch_model import BertBaseUncased as BertPt
 from transformers import BertTokenizer
 
+from .benchmark_ait import compile_module
+from .modeling.torch_model import BertBaseUncased as BertPt
 
-def prepare_data(prompt: str):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+def prepare_data(prompt: str, model_path: str):
+    tokenizer = BertTokenizer.from_pretrained(model_path)
     result = tokenizer(prompt, return_attention_mask=False, return_tensors="pt")
     target_size = result["input_ids"].size()
     if target_size[1] > 512:
@@ -38,13 +39,18 @@ def prepare_data(prompt: str):
 
 
 def run_model(
-    prompt: str, activation: str, graph_mode: bool, use_fp16_acc: bool, verify: bool
+    prompt: str,
+    activation: str,
+    graph_mode: bool,
+    use_fp16_acc: bool,
+    verify: bool,
+    model_path="bert-base-uncased",
 ):
-    inputs = prepare_data(prompt)
+    inputs = prepare_data(prompt, model_path)
     inputs_pt = {name: data.cuda() for name, data in inputs.items()}
     batch_size, seq_len = inputs["input_ids"].size()
 
-    pt_model = BertPt(pretrained=True)._model
+    pt_model = BertPt(model_path=model_path, pretrained=True)._model
     pt_model.eval()
     hidden_size = pt_model.config.hidden_size
 
diff --git a/examples/03_bert/modeling/torch_model.py b/examples/03_bert/modeling/torch_model.py
index cbc965c70..7e5ae83f0 100644
--- a/examples/03_bert/modeling/torch_model.py
+++ b/examples/03_bert/modeling/torch_model.py
@@ -17,14 +17,12 @@
 
 
 class BertBaseUncased:
-    def __init__(self, pretrained=True):
+    def __init__(self, model_path="bert-base-uncased", pretrained=True):
         if not pretrained:
-            pretrained = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+            pretrained = AutoModelForMaskedLM.from_pretrained(model_path)
             self._model = BertForMaskedLM(pretrained.config).cuda().half()
         else:
-            self._model = (
-                AutoModelForMaskedLM.from_pretrained("bert-base-uncased").cuda().half()
-            )
+            self._model = AutoModelForMaskedLM.from_pretrained(model_path).cuda().half()
         self._vocab_size = 30522
 
     def forward(self, *args, **kwargs):
diff --git a/examples/03_bert/test_correctness.py b/examples/03_bert/test_correctness.py
new file mode 100644
index 000000000..c80902e8c
--- /dev/null
+++ b/examples/03_bert/test_correctness.py
@@ -0,0 +1,58 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+import unittest
+
+import torch
+
+from .demo import run_model
+
+try:
+    from libfb.py.asyncio.await_utils import await_sync
+    from manifold.clients.python import ManifoldClient
+except ImportError:
+    ManifoldClient = None
+
+
+class BertBaseUncasedTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def test_bert_base_uncased(self):
+        model_path = "bert-base-uncased"
+        if ManifoldClient is not None:
+            model_path = "/tmp/aitemplate_bert/bert-base-uncased"
+            os.makedirs(model_path, exist_ok=True)
+            with ManifoldClient.get_client(bucket="aitemplate") as client:
+                await_sync(
+                    client.getRecursive(
+                        manifold_path="tree/bert/bert-base-uncased",
+                        local_path=model_path,
+                    )
+                )
+        run_model(
+            prompt="The quick brown fox jumps over the lazy dog.",
+            activation="fast_gelu",
+            graph_mode=True,
+            use_fp16_acc=True,
+            verify=True,
+            model_path=model_path,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From fb1c4bb669e7d24aed2496f4ef7ca848b7f2fed4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 3 Feb 2023 09:26:51 -0800
Subject: [PATCH 055/638] add bfloat16 test coverage for
 gemm_rcr{,_bias}_fast_gelu (#183)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/183

works out of the box, precision needs to be relaxed a bit

Reviewed By: chenyang78

Differential Revision: D42972970

fbshipit-source-id: 4c87c8d7c1db3c89d14bf4588ee0db66e6ab0090
---
 .../ops/test_gemm_rcr_bias_fast_gelu.py       | 24 +++++++++++++++--
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py | 26 ++++++++++++++++---
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index c3bbf2e6b..027cd6cb7 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -29,7 +29,9 @@
 
 
 class GEMMRcrBiasFastGeluTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
+    def _test_rcr(
+        self, Ms, test_name, use_fast_gelu=True, dtype="float16", atol=1e-1, rtol=1e-1
+    ):
         K = 1024
         N = 64
         target = detect_target()
@@ -67,7 +69,7 @@ def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
             )
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
 
     def test_rcr(self):
         self._test_rcr([128], "static", use_fast_gelu=True)
@@ -89,6 +91,24 @@ def test_rcr_float(self):
             [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=False, dtype="float"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_rcr_bias_fast_gelu_bfloat16(self):
+        self._test_rcr(
+            [1, 7, 64, 127],
+            "fast_dynamic_m_bfloat16",
+            use_fast_gelu=True,
+            dtype="bfloat16",
+            atol=2e-1,
+            rtol=2e-1,
+        )
+        self._test_rcr(
+            [1, 7, 64, 127], "dynamic_m_bfloat16", use_fast_gelu=False, dtype="bfloat16"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
index f56820ab5..51ef92ac5 100644
--- a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -50,7 +50,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class GEMMRcrFastGeluTestCase(unittest.TestCase):
-    def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
+    def _test_rcr(
+        self, Ms, test_name, use_fast_gelu=True, atol=1e-1, rtol=1e-1, dtype="float16"
+    ):
         K = 1024
         N = 64
         target = detect_target()
@@ -79,7 +81,7 @@ def _test_rcr(self, Ms, test_name, use_fast_gelu=True, dtype="float16"):
                 {"input_0": X_pt, "input_1": W_pt},
                 [y],
             )
-            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
 
     def test_rcr(self):
         self._test_rcr([128], "static", use_fast_gelu=True)
@@ -93,12 +95,30 @@ def test_rcr(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_rcr_float(self):
+    def test_gemm_rcr_fast_gelu_float(self):
         self._test_rcr([128], "static_float", use_fast_gelu=True, dtype="float")
         self._test_rcr(
             [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="float"
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_gemm_rcr_fast_gelu_bfloat16(self):
+        self._test_rcr(
+            [128],
+            "static_float",
+            use_fast_gelu=True,
+            atol=3e-1,
+            rtol=3e-1,
+            dtype="bfloat16",
+        )
+        self._test_rcr(
+            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="bfloat16"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From d218701ccd1d9f06210643fd3cb12d1e9ddeb526 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 3 Feb 2023 13:47:49 -0800
Subject: [PATCH 056/638] support bfloat16 for gemm_rrr_small_nk (#185)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/185

caveat: use_fp16_acc would do accumulation in ElemT=bf16

Reviewed By: chenyang78

Differential Revision: D42976383

fbshipit-source-id: e65aa4508806c3658bd3474f5be865ed3f69761f
---
 .../cuda/gemm_special/gemm_rrr_small_nk.py       | 16 ++++++++++------
 tests/unittest/ops/test_gemm_rrr_small_nk.py     |  6 +++++-
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index cd7167149..5b2b02e76 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -100,11 +100,15 @@
 #include <iostream>
 #include <type_traits>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 
 namespace {
 
+using bfloat16 = __nv_bfloat16;
+
+
 // For each thread, read
 // A tile: 8 x K
 // B matrix: K x N
@@ -151,8 +155,8 @@
     for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if (USE_FP16_ACC) {
-          half sum = 0;
+        if constexpr (USE_FP16_ACC) {
+          TElem sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
             sum = __hfma(a_tile[i * K + k], b[k][j], sum);
@@ -202,8 +206,8 @@
     for (int i = 0; i < m; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if (USE_FP16_ACC) {
-          half sum = 0;
+        if constexpr (USE_FP16_ACC) {
+          TElem sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
             sum = __hfma(a_tile[i][k], b[k][j], sum);
@@ -243,7 +247,7 @@
 
 // N <= 8, K <= 8
 template<typename ElemT, int N, int K,
-         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half>, void>>
+         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>, void>>
 void gemm_rrr_small_nk_launcher(ElemT* a_ptr,
                          ElemT* b_ptr,
                          ElemT* c_ptr,
@@ -255,7 +259,7 @@
   dim3 thread_block(nthread);
   constexpr int n_element_per_t = nthread * num_elems_in_float4;
   dim3 grid((M + n_element_per_t - 1) / n_element_per_t);
-  if (use_fp16_acc && std::is_same_v<ElemT, half>) {
+  if (use_fp16_acc && (std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>)) {
     gemm_rrr_small_nk_kernel<ElemT, nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
       reinterpret_cast<const float4*>(a_ptr),
       reinterpret_cast<const float4*>(b_ptr),
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index 2b3a5df99..e7ec6dcef 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -81,10 +81,14 @@ def test_rrr(self):
         # self._test_rrr([1000000], 8, 16)
         # self._test_rrr([1000000], 6, 3, False)
 
-    def test_float32(self):
+    def test_gemm_rrr_small_nk_float32(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
+    def test_gemm_rrr_small_nk_bfloat16(self):
+        self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
+        self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
+
 
 if __name__ == "__main__":
     unittest.main()

From e02d03ab3d20258e47589095ba20a4a950950ef6 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Fri, 3 Feb 2023 20:37:38 -0800
Subject: [PATCH 057/638] Add passes as option to AITTestCase.run_test

Summary:
We had issues with dper pass
like we encountered in D42983806, but currently AITTestCase.run_test does not support taking in dper passes.

This diff add passes as input to AITTestCase.run_test to test the correctness of dper passes.

Reviewed By: frank-wei, amateurcoffee

Differential Revision: D42984527

fbshipit-source-id: 5008c6d200f2a9ca035547204b47eb5e1704ce88
---
 fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py | 6 +++++-
 fx2ait/fx2ait/tools/common_fx2ait.py                   | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index 6b95bd608..29449baa9 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -13,6 +13,9 @@
 #  limitations under the License.
 #
 import torch
+from deeplearning.trt.torch_tensorrt.py.torch_tensorrt.fb.passes.dper_pass import (
+    push_down_split_ops,
+)
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
 from parameterized import parameterized
@@ -83,7 +86,8 @@ def forward(self, x):
 
         mod = SliceTensor(idx).half().cuda()
         inputs = [torch.randn(2, 10, 10, 10).half().cuda()]
-        self.run_test(mod, inputs, expected_ops={acc_ops.getitem})
+        passes = [push_down_split_ops]
+        self.run_test(mod, inputs, expected_ops={acc_ops.getitem}, passes=passes)
 
     @parameterized.expand([("default", 1), ("neg", -2)])
     def test_get_item(self, _, idx):
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 2c6b6d19b..7c10f3249 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -82,6 +82,7 @@ def run_test(
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
         transformer_mode: Optional[bool] = False,
+        passes: List[Callable] = [],
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -93,6 +94,9 @@ def run_test(
                 torch.nn.MultiheadAttention if transformer_mode else None
             ],
         )
+        for p in passes:
+            mod = p(mod, inputs)
+
         print(mod.graph)
 
         original_inputs = inputs

From 5173b284ebfef102ad1ab4a46ec2b9604f1f3275 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 3 Feb 2023 21:13:19 -0800
Subject: [PATCH 058/638] add bfloat16 test coverage for group gemms (#193)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/193

works out of the box, mostly

Reviewed By: chenyang78

Differential Revision: D42977698

fbshipit-source-id: 04a712b334f85d7e8053d6c1697feecd2a28731d
---
 .../backend/cuda/gemm_universal/group_common.py          | 2 ++
 tests/unittest/ops/test_gemm_rrr_small_nk.py             | 4 ++++
 tests/unittest/ops/test_group_gemm_rcr.py                | 9 +++++----
 tests/unittest/ops/test_group_gemm_rcr_bias.py           | 3 ++-
 .../unittest/ops/test_group_gemm_rcr_bias_activation.py  | 2 ++
 tests/unittest/ops/test_group_gemm_rcr_bias_cat.py       | 7 +++----
 tests/unittest/ops/test_group_gemm_rcr_cat.py            | 7 +++----
 7 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 1185ab1ab..41cb8f444 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -153,6 +153,8 @@
 #include "cutlass/util/reference/host/tensor_fill.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 
+using bfloat16 = nv_bfloat16;
+
 #define CUTLASS_CHECK(status)                                                         \\
   {                                                                                   \\
     cutlass::Status error = status;                                                   \\
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index e7ec6dcef..94f35a00b 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -85,6 +85,10 @@ def test_gemm_rrr_small_nk_float32(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_gemm_rrr_small_nk_bfloat16(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
         self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index fbd22f5de..4c1775f75 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -34,9 +34,10 @@ class GroupGEMMRcrTestCase(unittest.TestCase):
             param(False, "group_gemm_rcr_run_once", "float16"),
             param(True, "group_gemm_rcr_run_twice", "float16"),
             param(False, "group_gemm_rcr_run_once_fp32", "float32"),
+            param(False, "group_gemm_rcr_run_once_bf16", "bfloat16"),
         ]
     )
-    def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
+    def test_group_gemm_rcr(self, run_twice: bool, test_name: str, dtype: str):
         M = 256
         K1 = 128
         N1 = 60
@@ -86,10 +87,10 @@ def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
             outputs["y3"] = torch.empty_like(y1)
 
         module.run_with_tensors(inputs, outputs)
-        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
-        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y1_pt, y1, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
         if run_twice:
-            self.assertTrue(torch.allclose(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index 91a601744..a29a7b841 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -34,9 +34,10 @@ class GroupGEMMRcrBiasTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_fp16", "float16"),
             param("group_gemm_rcr_bias_fp32", "float32"),
+            param("group_gemm_rcr_bias_bf16", "bfloat16"),
         ]
     )
-    def test_rcr(self, test_name, dtype):
+    def test_group_gemm_rcr_bias(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index ae27c3d78..6da1ed164 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -33,8 +33,10 @@ class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_relu_fp16", "float16", "relu"),
             param("group_gemm_rcr_bias_relu_fp32", "float32", "relu"),
+            param("group_gemm_rcr_bias_relu_bf16", "bfloat16", "relu"),
             param("group_gemm_rcr_bias_sigmoid_fp16", "float16", "sigmoid"),
             param("group_gemm_rcr_bias_sigmoid_fp32", "float32", "sigmoid"),
+            param("group_gemm_rcr_bias_sigmoid_bf16", "bfloat16", "sigmoid"),
         ]
     )
     def test_rcr_activation(self, test_name, dtype, activation):
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index 72343721d..5c5a0773d 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -34,9 +33,10 @@ class GroupGEMMRcrBiasCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_cat_fp16", "float16"),
             param("group_gemm_rcr_bias_cat_fp32", "float32"),
+            param("group_gemm_rcr_bias_cat_bf16", "bfloat16"),
         ]
     )
-    def test_rcr_bias_cat(self, test_name, dtype):
+    def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -66,11 +66,10 @@ def test_rcr_bias_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
-        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        np.testing.assert_equal(y_shape, Y_np.shape)
+        torch.testing.assert_close(y_shape, list(Y_pt.shape))
 
         inputs = {
             "x1": X1_pt,
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index 69717a440..c3ff44a93 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -34,9 +33,10 @@ class GroupGEMMRcrCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_cat_fp16", "float16"),
             param("group_gemm_rcr_cat_fp32", "float32"),
+            param("group_gemm_rcr_cat_bf16", "bfloat16"),
         ]
     )
-    def test_rcr_cat(self, test_name, dtype):
+    def test_group_gemm_rcr_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -63,11 +63,10 @@ def test_rcr_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
-        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        np.testing.assert_equal(y_shape, Y_np.shape)
+        torch.testing.assert_close(y_shape, list(Y_pt.shape))
 
         inputs = {
             "x1": X1_pt,

From 6eb2b9f70db13ef1507159b62272ca093d92f613 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005232357 <generatedunixname89002005232357@fb.com>
Date: Fri, 3 Feb 2023 23:52:56 -0800
Subject: [PATCH 059/638] Revert D42977698: Multisect successfully blamed
 D42976383 for test or build failures

Summary:
This diff is reverting D42977698 (https://github.com/facebookincubator/AITemplate/commit/5173b284ebfef102ad1ab4a46ec2b9604f1f3275)
D42976383 (https://github.com/facebookincubator/AITemplate/commit/d218701ccd1d9f06210643fd3cb12d1e9ddeb526): [AITemplate] support bfloat16 for gemm_rrr_small_nk by tenpercent has been identified to be causing the following test or build failures:

Tests affected:
- [aitemplate/AITemplate:aitemplate_test_gpu - test_small_nk_fp32 (aitemplate.AITemplate.tests.unittest.compiler.test_transform_special_op.GemmRrrSmallNkTestCase)](https://www.internalfb.com/intern/test/562950033245771/)

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1555464
Here are the tasks that are relevant to this breakage:
T143403841: 7 tests started failing for oncall aitemplate in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Reviewed By: ipiszy

Differential Revision: D43019093

fbshipit-source-id: 30ae92de82cca60d8854dda16832799f9f6929e9
---
 .../backend/cuda/gemm_universal/group_common.py          | 2 --
 tests/unittest/ops/test_gemm_rrr_small_nk.py             | 4 ----
 tests/unittest/ops/test_group_gemm_rcr.py                | 9 ++++-----
 tests/unittest/ops/test_group_gemm_rcr_bias.py           | 3 +--
 .../unittest/ops/test_group_gemm_rcr_bias_activation.py  | 2 --
 tests/unittest/ops/test_group_gemm_rcr_bias_cat.py       | 7 ++++---
 tests/unittest/ops/test_group_gemm_rcr_cat.py            | 7 ++++---
 7 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 41cb8f444..1185ab1ab 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -153,8 +153,6 @@
 #include "cutlass/util/reference/host/tensor_fill.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 
-using bfloat16 = nv_bfloat16;
-
 #define CUTLASS_CHECK(status)                                                         \\
   {                                                                                   \\
     cutlass::Status error = status;                                                   \\
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index 94f35a00b..e7ec6dcef 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -85,10 +85,6 @@ def test_gemm_rrr_small_nk_float32(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_gemm_rrr_small_nk_bfloat16(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
         self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index 4c1775f75..fbd22f5de 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -34,10 +34,9 @@ class GroupGEMMRcrTestCase(unittest.TestCase):
             param(False, "group_gemm_rcr_run_once", "float16"),
             param(True, "group_gemm_rcr_run_twice", "float16"),
             param(False, "group_gemm_rcr_run_once_fp32", "float32"),
-            param(False, "group_gemm_rcr_run_once_bf16", "bfloat16"),
         ]
     )
-    def test_group_gemm_rcr(self, run_twice: bool, test_name: str, dtype: str):
+    def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
         M = 256
         K1 = 128
         N1 = 60
@@ -87,10 +86,10 @@ def test_group_gemm_rcr(self, run_twice: bool, test_name: str, dtype: str):
             outputs["y3"] = torch.empty_like(y1)
 
         module.run_with_tensors(inputs, outputs)
-        torch.testing.assert_close(Y1_pt, y1, atol=1e-1, rtol=1e-1)
-        torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
+        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
+        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
         if run_twice:
-            torch.testing.assert_close(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1)
+            self.assertTrue(torch.allclose(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1))
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index a29a7b841..91a601744 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -34,10 +34,9 @@ class GroupGEMMRcrBiasTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_fp16", "float16"),
             param("group_gemm_rcr_bias_fp32", "float32"),
-            param("group_gemm_rcr_bias_bf16", "bfloat16"),
         ]
     )
-    def test_group_gemm_rcr_bias(self, test_name, dtype):
+    def test_rcr(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index 6da1ed164..ae27c3d78 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -33,10 +33,8 @@ class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_relu_fp16", "float16", "relu"),
             param("group_gemm_rcr_bias_relu_fp32", "float32", "relu"),
-            param("group_gemm_rcr_bias_relu_bf16", "bfloat16", "relu"),
             param("group_gemm_rcr_bias_sigmoid_fp16", "float16", "sigmoid"),
             param("group_gemm_rcr_bias_sigmoid_fp32", "float32", "sigmoid"),
-            param("group_gemm_rcr_bias_sigmoid_bf16", "bfloat16", "sigmoid"),
         ]
     )
     def test_rcr_activation(self, test_name, dtype, activation):
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index 5c5a0773d..72343721d 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -15,6 +15,7 @@
 import logging
 import unittest
 
+import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -33,10 +34,9 @@ class GroupGEMMRcrBiasCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_cat_fp16", "float16"),
             param("group_gemm_rcr_bias_cat_fp32", "float32"),
-            param("group_gemm_rcr_bias_cat_bf16", "bfloat16"),
         ]
     )
-    def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
+    def test_rcr_bias_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -66,10 +66,11 @@ def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
+        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        torch.testing.assert_close(y_shape, list(Y_pt.shape))
+        np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = {
             "x1": X1_pt,
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index c3ff44a93..69717a440 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -15,6 +15,7 @@
 import logging
 import unittest
 
+import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -33,10 +34,9 @@ class GroupGEMMRcrCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_cat_fp16", "float16"),
             param("group_gemm_rcr_cat_fp32", "float32"),
-            param("group_gemm_rcr_cat_bf16", "bfloat16"),
         ]
     )
-    def test_group_gemm_rcr_cat(self, test_name, dtype):
+    def test_rcr_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -63,10 +63,11 @@ def test_group_gemm_rcr_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
+        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        torch.testing.assert_close(y_shape, list(Y_pt.shape))
+        np.testing.assert_equal(y_shape, Y_np.shape)
 
         inputs = {
             "x1": X1_pt,

From 7bfe522d71fe28aea661ab4eafeb121115e1c598 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005232357 <generatedunixname89002005232357@fb.com>
Date: Sat, 4 Feb 2023 00:30:40 -0800
Subject: [PATCH 060/638] Revert D42976383: Multisect successfully blamed
 D42976383 for test or build failures (#197)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/197

This diff is reverting D42976383 (https://github.com/facebookincubator/AITemplate/commit/d218701ccd1d9f06210643fd3cb12d1e9ddeb526)
Depends on D43019093 (https://github.com/facebookincubator/AITemplate/commit/6eb2b9f70db13ef1507159b62272ca093d92f613)
D42976383 (https://github.com/facebookincubator/AITemplate/commit/d218701ccd1d9f06210643fd3cb12d1e9ddeb526): [AITemplate] support bfloat16 for gemm_rrr_small_nk by tenpercent has been identified to be causing the following test or build failures:

Tests affected:
- [aitemplate/AITemplate:aitemplate_test_gpu - test_small_nk_fp32 (aitemplate.AITemplate.tests.unittest.compiler.test_transform_special_op.GemmRrrSmallNkTestCase)](https://www.internalfb.com/intern/test/562950033245771/)

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1555464
Here are the tasks that are relevant to this breakage:
T143403841: 7 tests started failing for oncall aitemplate in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Reviewed By: ipiszy

Differential Revision: D43019097

fbshipit-source-id: 2733b2f03a704fe685fe6124e22f4d8e660be193
---
 .../cuda/gemm_special/gemm_rrr_small_nk.py       | 16 ++++++----------
 tests/unittest/ops/test_gemm_rrr_small_nk.py     |  6 +-----
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index 5b2b02e76..cd7167149 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -100,15 +100,11 @@
 #include <iostream>
 #include <type_traits>
 #include <cuda_fp16.h>
-#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 
 namespace {
 
-using bfloat16 = __nv_bfloat16;
-
-
 // For each thread, read
 // A tile: 8 x K
 // B matrix: K x N
@@ -155,8 +151,8 @@
     for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if constexpr (USE_FP16_ACC) {
-          TElem sum = 0;
+        if (USE_FP16_ACC) {
+          half sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
             sum = __hfma(a_tile[i * K + k], b[k][j], sum);
@@ -206,8 +202,8 @@
     for (int i = 0; i < m; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if constexpr (USE_FP16_ACC) {
-          TElem sum = 0;
+        if (USE_FP16_ACC) {
+          half sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
             sum = __hfma(a_tile[i][k], b[k][j], sum);
@@ -247,7 +243,7 @@
 
 // N <= 8, K <= 8
 template<typename ElemT, int N, int K,
-         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>, void>>
+         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half>, void>>
 void gemm_rrr_small_nk_launcher(ElemT* a_ptr,
                          ElemT* b_ptr,
                          ElemT* c_ptr,
@@ -259,7 +255,7 @@
   dim3 thread_block(nthread);
   constexpr int n_element_per_t = nthread * num_elems_in_float4;
   dim3 grid((M + n_element_per_t - 1) / n_element_per_t);
-  if (use_fp16_acc && (std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>)) {
+  if (use_fp16_acc && std::is_same_v<ElemT, half>) {
     gemm_rrr_small_nk_kernel<ElemT, nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
       reinterpret_cast<const float4*>(a_ptr),
       reinterpret_cast<const float4*>(b_ptr),
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index e7ec6dcef..2b3a5df99 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -81,14 +81,10 @@ def test_rrr(self):
         # self._test_rrr([1000000], 8, 16)
         # self._test_rrr([1000000], 6, 3, False)
 
-    def test_gemm_rrr_small_nk_float32(self):
+    def test_float32(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
-    def test_gemm_rrr_small_nk_bfloat16(self):
-        self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
-        self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
-
 
 if __name__ == "__main__":
     unittest.main()

From dd1b03262aaa09490b816653be9c6ae4d4a5ea82 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Sat, 4 Feb 2023 00:45:46 -0800
Subject: [PATCH 061/638] Back out "Add passes as option to
 AITTestCase.run_test" (#199)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/199

Fix fx2ait test failures.

Reviewed By: chenyang78

Differential Revision: D43019639

fbshipit-source-id: f7bc0c543b553ca2f80149995b4c28599a6ea396
---
 fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py | 6 +-----
 fx2ait/fx2ait/tools/common_fx2ait.py                   | 4 ----
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index 29449baa9..6b95bd608 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -13,9 +13,6 @@
 #  limitations under the License.
 #
 import torch
-from deeplearning.trt.torch_tensorrt.py.torch_tensorrt.fb.passes.dper_pass import (
-    push_down_split_ops,
-)
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
 from parameterized import parameterized
@@ -86,8 +83,7 @@ def forward(self, x):
 
         mod = SliceTensor(idx).half().cuda()
         inputs = [torch.randn(2, 10, 10, 10).half().cuda()]
-        passes = [push_down_split_ops]
-        self.run_test(mod, inputs, expected_ops={acc_ops.getitem}, passes=passes)
+        self.run_test(mod, inputs, expected_ops={acc_ops.getitem})
 
     @parameterized.expand([("default", 1), ("neg", -2)])
     def test_get_item(self, _, idx):
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 7c10f3249..2c6b6d19b 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -82,7 +82,6 @@ def run_test(
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
         transformer_mode: Optional[bool] = False,
-        passes: List[Callable] = [],
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -94,9 +93,6 @@ def run_test(
                 torch.nn.MultiheadAttention if transformer_mode else None
             ],
         )
-        for p in passes:
-            mod = p(mod, inputs)
-
         print(mod.graph)
 
         original_inputs = inputs

From 2aa7cd12bc37b01996b2907e9f147b1ff81f97a7 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Sat, 4 Feb 2023 18:16:06 +0800
Subject: [PATCH 062/638] fix bugs

---
 fx2ait/fx2ait/converters/ait_converters.py    | 13 ++++++--
 .../converters/ait_module_converters.py       | 30 ++++++++++++++-----
 python/aitemplate/backend/main_templates.py   |  4 +--
 python/aitemplate/backend/profiler_runner.py  |  2 +-
 python/aitemplate/compiler/ops/conv/conv2d.py |  2 +-
 python/aitemplate/compiler/ops/conv/conv3d.py |  2 +-
 .../aitemplate/compiler/transform/profile.py  |  2 +-
 static/include/cuda_device_functions.h        |  2 +-
 static/include/macros.h                       |  2 +-
 static/include/rocm_device_functions.h        |  2 +-
 10 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index cd62f4378..530b8e162 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -44,6 +44,8 @@
     vector_norm,
 )
 
+from aitemplate.testing import detect_target
+
 from fx2ait.acc_tracer import acc_ops, ait_acc_ops
 from torch.fx.node import Argument, Target
 
@@ -59,6 +61,8 @@
     unify_dynamic_shape_name,
 )
 
+USE_ROCM = detect_target().name() == "rocm"
+
 logger: logging.Logger = logging.getLogger(__name__)
 ConverterOutput = Union[AITTensor, Tuple[AITTensor, ...], List[IntVar], IntVar]
 
@@ -155,17 +159,20 @@ def acc_ops_linear(
     name: str,
 ) -> ConverterOutput:
     input_val = kwargs["input"]
-
+    if USE_ROCM:
+        shape = input_val._attrs["shape"]
+        input_val = input_val if len(shape) == 2 else reshape()(input_val, [-1, shape[-1].value()])
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
-
+    
     result = gemm_rcr()(input_val, weight)
 
     bias = kwargs["bias"]
     if bias is not None:
         assert isinstance(bias, AITTensor)
         result = elementwise(FuncEnum.ADD)(result, bias)
-
+    if USE_ROCM:
+        result = result if len(shape) == 2 else reshape()(result, [shape[0].value(), -1, result._attrs["shape"][-1].value()])
     return result
 
 
diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index e4bba9013..388f53220 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -5,12 +5,15 @@
 import torch
 from aitemplate.backend.target import Target
 from aitemplate.compiler.base import _TorchConstantTensorData
+from aitemplate.testing import detect_target
 from aitemplate.frontend import nn
 from torch.fx.node import Argument
 
 from .ait_converters import ConverterOutput
 from .converter_registry import ait_converter
 
+USE_CUDA = detect_target().name() == "cuda"
+
 
 @ait_converter(torch.nn.modules.activation.MultiheadAttention)
 def multi_head_attention_module(
@@ -26,14 +29,25 @@ def multi_head_attention_module(
     value = kwargs["value"] if "value" in kwargs else args[2]
     bsz, seq_len_q, dim = query.shape()
     _, seq_len, _ = key.shape()
-    attn = nn.CrossAttention(
-        dim=submod.embed_dim,
-        seq_len=seq_len_q.value(),
-        seq_len_kv=seq_len.value(),
-        num_heads=submod.num_heads,
-        qkv_bias=True,
-        has_residual=False,
-    )
+    if USE_CUDA:
+        attn = nn.CrossAttention(
+            dim=submod.embed_dim,
+            seq_len=seq_len_q.value(),
+            seq_len_kv=seq_len.value(),
+            num_heads=submod.num_heads,
+            qkv_bias=True,
+            has_residual=False,
+        )
+    else:
+        attn = nn.MultiheadAttention(
+            dim=submod.embed_dim,
+            batch_size=bsz.value(),
+            seq_len=seq_len_q.value(),
+            num_heads=submod.num_heads,
+            qkv_bias=True,
+            has_residual=False,
+            use_mem_eff=True
+        )
 
     # Bind constant tensor for MHA module
     mapped_params = _map_ait_pt_params(attn, submod)
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index f694711df..1f64f632d 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -44,8 +44,8 @@
 namespace {
 void DeviceCheckLastError(const char* file, int line) {
   auto device_error = GetLastError();
-  if (device_error != GetDeviceSuccess()) {
-    std::string msg = std::string("Got error: ") + GetLastErrorString() +
+  if (device_error != GetDeviceSuccess() && device_error != GetDeviceNotReady()) {
+    std::string msg = std::string("Got error: ") + GetErrorString(device_error) +
                       " enum: " + std::to_string(device_error) +
                       " at " + file + ": " + std::to_string(line);
     LOG(ERROR) << msg;
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 189ed647f..2be76a39a 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -322,7 +322,7 @@ def join(self):
             f.cancel()
         # block until each done_callback completes,
         # or raise Empty exception after 3 minutes of waiting
-        block_timeout = 360 if Target.current().name() == "rocm" else 180
+        block_timeout = None if Target.current().name() == "rocm" else 180
         for _ in self._futures:
             self._done_queue.get(timeout=block_timeout)
         self._postprocessing_delegate.postprocess_results()
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 249eb20bb..6bb9447f6 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -466,7 +466,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
             )
         if target.name() == "rocm":
             runner = backend.profiler_runner.Runner(
-                devices, self._attrs["name"], timeout=180
+                devices, self._attrs["name"], timeout=None
             )
             op_type = self._attrs["op"]
             all_op_names = list(self._attrs["op_instance"].keys())
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index f5adbc516..4341cfa64 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -477,7 +477,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
             all_op_names = list(self._attrs["op_instance"].keys())
             for op_name in all_op_names:
                 runner = backend.profiler_runner.Runner(
-                    devices, self._attrs["name"], timeout=180
+                    devices, self._attrs["name"], timeout=None
                 )
                 x_shape = self._invert_exec_key(exec_key)
                 command = self._gen_profile_cmd(profiler_prefix, op_name, x_shape)
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 120fa41f4..33edf2a36 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -106,7 +106,7 @@ def profile(
             workdir=profiler_dir,
             devices=devices,
         )
-    timeout = 360 if Target.current().name() == "rocm" else 240
+    timeout = None if Target.current().name() == "rocm" else 240
     profiler_runner = ProfilerRunner(
         devices,
         timeout=timeout,
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 4f72d80b9..c03b16f86 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -188,7 +188,7 @@ inline DeviceError QueryEvent(EventType event) {
   return cudaEventQuery(event);
 }
 
-inline const char* GetErrorString(DeviceError err) {
+inline std::string GetErrorString(DeviceError err) {
   return cudaGetErrorString(err);
 }
 
diff --git a/static/include/macros.h b/static/include/macros.h
index 59fcde94b..462b25255 100644
--- a/static/include/macros.h
+++ b/static/include/macros.h
@@ -22,7 +22,7 @@
 #define DEVICE_CHECK(call)                                           \
   if ((call) != GetDeviceSuccess()) {                                \
     throw std::runtime_error(                                        \
-        #call " API call failed: " + GetLastErrorString() + " at " + \
+        #call " API call failed: " + GetErrorString(call) + " at " + \
         __FILE__ + ", line" + std::to_string(__LINE__));             \
   }
 
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 710fe3867..774a71b03 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -191,7 +191,7 @@ inline DeviceError QueryEvent(EventType event) {
   return hipEventQuery(event);
 }
 
-inline const char* GetErrorString(DeviceError err) {
+inline std::string GetErrorString(DeviceError err) {
   return hipGetErrorString(err);
 }
 

From b587d4d1958a9880ad9bf20595c89db746d9db16 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Sat, 4 Feb 2023 18:19:56 +0800
Subject: [PATCH 063/638] fix ci

---
 .github/workflows/ait_ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index f3037119c..c111bb404 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -47,6 +47,8 @@ jobs:
         pip3 install dist/*.whl
         #install necessary python modules
         pip3 install timm
+        pip3 uninstall -y torch 
+        pip3 install torch --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
         python3 -m pip install transformers click
         python3 -c "import torch; print(torch.__version__)"
         #run examples

From 1321520e0f783bfb5ae1c9309060ec319b60468e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 6 Feb 2023 11:08:49 -0800
Subject: [PATCH 064/638] print the device features on model container
 initialization (#174)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/174

retrieve and parse struct DeviceProp

the motivation comes from accidentally hitting the device's shared memory limit without realizing it

more info in logs should help in future with debugging

it follows the struct doc for cuda 12 at https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1d5909d1563000146dd3f12646f3c98c7

removing (1) deprecated fields and (2) fields causing compilation error with nvcc 11.4

(with the exception of sm/memory clock rate, no idea why it's marked deprecated and couldn't find an alternative)

and reordering so that similar properties appear together

did my best for AMD side but testing env is not readily available

Reviewed By: chenyang78

Differential Revision: D42913584

fbshipit-source-id: ac31ed00f3a1356e2532b9f55bd762de3b8a208c
---
 static/csrc/model_container.cpp        |  16 ++
 static/include/cuda_device_functions.h | 208 +++++++++++++++++++++++++
 static/include/rocm_device_functions.h | 119 ++++++++++++++
 3 files changed, 343 insertions(+)

diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 1c4f7f19f..f5e119690 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -45,6 +45,22 @@ ModelContainer::ModelContainer(
   DEVICE_CHECK(GetRuntimeVersion(&runtime_version));
   LOG(INFO) << "Device Runtime Version: " << runtime_version
             << "; Driver Version: " << driver_version;
+
+  int dev_id;
+  DevicePropertyType prop;
+  DEVICE_CHECK(GetDevice(&dev_id));
+  DEVICE_CHECK(GetDeviceProperties(&prop, dev_id));
+
+  bool useDebugLogging = false;
+  if (auto var = std::getenv("LOGLEVEL")) {
+    if (var[0] == 'd' || var[0] == 'D') {
+      useDebugLogging = true;
+    }
+  }
+  LOG(INFO)
+      << (useDebugLogging ? PrintDebugDeviceProperties(prop)
+                          : PrintInfoDeviceProperties(prop));
+
   LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency";
   models_.reserve(num_models);
   available_models_.reserve(num_models);
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 4f72d80b9..4db230ee4 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -49,6 +49,214 @@ inline DeviceError GetDeviceProperties(
   return cudaGetDeviceProperties(prop, device_idx);
 }
 
+inline std::string GetUUIDToString(const char bytes[16]) {
+  std::vector<std::tuple<int, int>> groups = {
+      {0, 4}, {4, 6}, {6, 8}, {8, 10}, {10, 16}};
+  char const hex_chars[16] = {
+      '0',
+      '1',
+      '2',
+      '3',
+      '4',
+      '5',
+      '6',
+      '7',
+      '8',
+      '9',
+      'a',
+      'b',
+      'c',
+      'd',
+      'e',
+      'f'};
+
+  std::string result = "GPU";
+  for (auto g : groups) {
+    result += "-";
+    for (size_t i = std::get<0>(g); i < std::get<1>(g); ++i) {
+      result += hex_chars[(bytes[i] & 0xF0) >> 4];
+      result += hex_chars[(bytes[i] & 0x0F)];
+    }
+  }
+  return result;
+}
+
+inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
+  std::ostringstream oss;
+  oss << "Hardware accelerator device properties: "
+      << "\n  Device: "
+      << "\n     ASCII string identifying device: " << prop.name
+      << "\n     Major compute capability: " << prop.major
+      << "\n     Minor compute capability: " << prop.minor
+      << "\n     UUID: " << GetUUIDToString(prop.uuid.bytes)
+      << "\n     Unique identifier for a group of devices on the same multi-GPU board: "
+      << prop.multiGpuBoardGroupID
+      << "\n     PCI bus ID of the device: " << prop.pciBusID
+      << "\n     PCI device ID of the device: " << prop.pciDeviceID
+      << "\n     PCI domain ID of the device: " << prop.pciDomainID
+
+      << "\n  Memory limits: "
+      << "\n     Constant memory available on device in bytes: "
+      << prop.totalConstMem
+      << "\n     Global memory available on device in bytes: "
+      << prop.totalGlobalMem
+      << "\n     Global memory bus width in bits: " << prop.memoryBusWidth
+      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
+      << "\n     Device's maximum L2 persisting lines capacity in bytes: "
+      << prop.persistingL2CacheMaxSize
+      << "\n     Shared memory reserved by CUDA driver per block in bytes: "
+      << prop.reservedSharedMemPerBlock
+      << "\n     Shared memory available per block in bytes: "
+      << prop.sharedMemPerBlock
+      << "\n     Per device maximum shared memory per block usable by special opt in: "
+      << prop.sharedMemPerBlockOptin
+      << "\n     Shared memory available per multiprocessor in bytes: "
+      << prop.sharedMemPerMultiprocessor
+      << "\n     The maximum value of cudaAccessPolicyWindow::num_bytes: "
+      << prop.accessPolicyMaxWindowSize
+      << "\n     Max global memory clock frequency in khz: "
+      << prop.memoryClockRate
+      << "\n     Peak global memory bandwidth (GByte/s): "
+      << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
+
+      << "\n  Thread limits: "
+      << "\n     Warp size in threads: " << prop.warpSize
+      << "\n     Maximum size of each dimension of a grid: "
+      << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
+      << prop.maxGridSize[2]
+      << "\n     Maximum size of each dimension of a block: "
+      << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " "
+      << prop.maxThreadsDim[2]
+      << "\n     Number of asynchronous engines: " << prop.asyncEngineCount
+      << "\n     Maximum number of resident blocks per multiprocessor: "
+      << prop.maxBlocksPerMultiProcessor
+      << "\n     Maximum number of threads per block: "
+      << prop.maxThreadsPerBlock
+      << "\n     Maximum resident threads per multiprocessor: "
+      << prop.maxThreadsPerMultiProcessor
+      << "\n     Maximum pitch in bytes allowed by memory copies: "
+      << prop.memPitch << "\n     Number of multiprocessors on device: "
+      << prop.multiProcessorCount
+      << "\n     32-bit registers available per block: " << prop.regsPerBlock
+      << "\n     32-bit registers available per multiprocessor: "
+      << prop.regsPerMultiprocessor
+      << "\n     Max clock frequency of the multiProcessors in khz: "
+      << prop.clockRate
+
+      << "\n  Device features: "
+      << "\n     Device has ECC support enabled: "
+      << (prop.ECCEnabled ? "yes" : "no")
+      << "\n     Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
+      << (prop.canMapHostMemory ? "yes" : "no")
+      << "\n     Device can access host registered memory at the same virtual address as the CPU: "
+      << (prop.canUseHostPointerForRegisteredMem ? "yes" : "no")
+      << "\n     Device supports Compute Preemption: "
+      << (prop.computePreemptionSupported ? "yes" : "no")
+      << "\n     Device can possibly execute multiple kernels concurrently: "
+      << (prop.concurrentKernels ? "yes" : "no")
+      << "\n     Device can coherently access managed memory concurrently with the CPU: "
+      << (prop.concurrentManagedAccess ? "yes" : "no")
+      << "\n     Device supports launching cooperative kernels via cudaLaunchCooperativeKernel: "
+      << (prop.cooperativeLaunch ? "yes" : "no")
+      << "\n     Host can directly access managed memory on the device without migration: "
+      << (prop.directManagedMemAccessFromHost ? "yes" : "no")
+      << "\n     Device supports caching globals in L1: "
+      << (prop.globalL1CacheSupported ? "yes" : "no")
+      << "\n     Link between the device and the host supports native atomic operations: "
+      << (prop.hostNativeAtomicSupported ? "yes" : "no")
+      << "\n     Device is integrated as opposed to discrete: "
+      << (prop.integrated ? "yes" : "no")
+      << "\n     Device is on a multi-GPU board: "
+      << (prop.isMultiGpuBoard ? "yes" : "no")
+      << "\n     Device supports caching locals in L1: "
+      << (prop.localL1CacheSupported ? "yes" : "no")
+      << "\n     Device supports allocating managed memory on this system: "
+      << (prop.managedMemory ? "yes" : "no")
+      << "\n     Device supports coherently accessing pageable memory without calling cudaHostRegister on it: "
+      << (prop.pageableMemoryAccess ? "yes" : "no")
+      << "\n     Device accesses pageable memory via the host's page tables: "
+      << (prop.pageableMemoryAccessUsesHostPageTables ? "yes" : "no")
+      << "\n     Device supports stream priorities: "
+      << (prop.streamPrioritiesSupported ? "yes" : "no")
+      << "\n     Device is a Tesla device using TCC driver: "
+      << (prop.tccDriver ? "yes" : "no")
+      << "\n     Device shares a unified address space with the host: "
+      << (prop.unifiedAddressing ? "yes" : "no")
+
+      << "\n  Texture limits: "
+      << "\n     Maximum 1D surface size: " << prop.maxSurface1D
+      << "\n     Maximum 1D layered surface dimensions: "
+      << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
+      << "\n     Maximum 2D surface dimensions: " << prop.maxSurface2D[0] << " "
+      << prop.maxSurface2D[1]
+      << "\n     Maximum 2D layered surface dimensions: "
+      << prop.maxSurface2DLayered[0] << " " << prop.maxSurface2DLayered[1]
+      << " " << prop.maxSurface2DLayered[2]
+      << "\n     Maximum 3D surface dimensions: " << prop.maxSurface3D[0] << " "
+      << prop.maxSurface3D[1] << " " << prop.maxSurface3D[2]
+      << "\n     Maximum Cubemap surface dimensions: " << prop.maxSurfaceCubemap
+      << "\n     Maximum Cubemap layered surface dimensions: "
+      << prop.maxSurfaceCubemapLayered[0] << " "
+      << prop.maxSurfaceCubemapLayered[1]
+      << "\n     Maximum 1D texture size: " << prop.maxTexture1D
+      << "\n     Maximum 1D layered texture dimensions "
+      << prop.maxTexture1DLayered[0] << " " << prop.maxTexture1DLayered[1]
+      << "\n     Maximum 1D mipmapped texture size: " << prop.maxTexture1DMipmap
+      << "\n     Maximum 2D texture dimensions: " << prop.maxTexture2D[0] << " "
+      << prop.maxTexture2D[1]
+      << "\n     Maximum 2D texture dimensions if texture gather operations have to be performed: "
+      << prop.maxTexture2DGather[0] << " " << prop.maxTexture2DGather[1]
+      << "\n     Maximum 2D layered texture dimensions: "
+      << prop.maxTexture2DLayered[0] << " " << prop.maxTexture2DLayered[1]
+      << " " << prop.maxTexture2DLayered[2]
+      << "\n     Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory: "
+      << prop.maxTexture2DLinear[0] << " " << prop.maxTexture2DLinear[1] << " "
+      << prop.maxTexture2DLinear[2]
+      << "\n     Maximum 2D mipmapped texture dimensions: "
+      << prop.maxTexture2DMipmap[0] << " " << prop.maxTexture2DMipmap[1]
+      << "\n     Maximum 3D texture dimensions: " << prop.maxTexture3D[0] << " "
+      << prop.maxTexture3D[1] << " " << prop.maxTexture3D[2]
+      << "\n     Maximum alternate 3D texture dimensions: "
+      << prop.maxTexture3DAlt[0] << " " << prop.maxTexture3DAlt[1] << " "
+      << prop.maxTexture3DAlt[2]
+      << "\n     Maximum Cubemap texture dimensions: " << prop.maxTextureCubemap
+      << "\n     Maximum Cubemap layered texture dimensions: "
+      << prop.maxTextureCubemapLayered[0] << " "
+      << prop.maxTextureCubemapLayered[1]
+      << "\n     Alignment requirements for surfaces: " << prop.surfaceAlignment
+      << "\n     Alignment requirement for textures: " << prop.textureAlignment
+      << "\n     Pitch alignment requirement for texture references bound to pitched memory: "
+      << prop.texturePitchAlignment;
+  return oss.str();
+}
+
+inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
+  std::ostringstream oss;
+  oss << "Hardware accelerator device properties: "
+      << "\n  Device: "
+      << "\n     ASCII string identifying device: " << prop.name
+      << "\n     Major compute capability: " << prop.major
+      << "\n     Minor compute capability: " << prop.minor
+      << "\n     UUID: " << GetUUIDToString(prop.uuid.bytes)
+      << "\n     Unique identifier for a group of devices on the same multi-GPU board: "
+      << prop.multiGpuBoardGroupID
+      << "\n     PCI bus ID of the device: " << prop.pciBusID
+      << "\n     PCI device ID of the device: " << prop.pciDeviceID
+      << "\n     PCI domain ID of the device: " << prop.pciDomainID
+
+      << "\n  Memory limits: "
+      << "\n     Constant memory available on device in bytes: "
+      << prop.totalConstMem
+      << "\n     Global memory available on device in bytes: "
+      << prop.totalGlobalMem
+      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
+      << "\n     Shared memory available per block in bytes: "
+      << prop.sharedMemPerBlock
+      << "\n     Shared memory available per multiprocessor in bytes: "
+      << prop.sharedMemPerMultiprocessor;
+  return oss.str();
+}
+
 inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) {
   auto flags = non_blocking ? cudaStreamNonBlocking : cudaStreamDefault;
   return cudaStreamCreateWithFlags(stream, flags);
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 710fe3867..db06c2351 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -48,6 +48,125 @@ inline DeviceError GetDeviceProperties(
   return hipGetDeviceProperties(prop, device_idx);
 }
 
+inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) {
+  std::ostringstream oss;
+  oss << "\n     Has 32-bit integer atomics for global memory: "
+      << (arch.hasGlobalInt32Atomics ? "yes" : "no")
+      << "\n     Has 32-bit float atomic exch for global memory: "
+      << (arch.hasGlobalFloatAtomicExch ? "yes" : "no")
+      << "\n     Has 32-bit integer atomics for shared memory: "
+      << (arch.hasSharedInt32Atomics ? "yes" : "no")
+      << "\n     Has 32-bit float atomic exch for shared memory: "
+      << (arch.hasSharedFloatAtomicExch ? "yes" : "no"
+      << "\n     Has 32-bit float atomic add in global and shared memory: "
+      << (arch.hasFloatAtomicAdd ? "yes" : "no")
+      << "\n     Has 64-bit integer atomics for global memory: "
+      << (arch.hasGlobalInt64Atomics ? "yes" : "no")
+      << "\n     Has 64-bit integer atomics for shared memory: "
+      << (arch.hasSharedInt64Atomics ? "yes" : "no")
+      << "\n     Has double-precision floating point: "
+      << (arch.hasDoubles ? "yes" : "no")
+      << "\n     Has warp vote instructions (__any, __all): "
+      << (arch.hasWarpVote: ? "yes" : "no")
+      << "\n     Has warp ballot instructions (__ballot): "
+      << (arch.hasWarpBallot: ? "yes" : "no")
+      << "\n     Has warp shuffle operations. (__shfl_*): "
+      << (arch.hasWarpShuffle ? "yes" : "no")
+      << "\n     Has funnel two words into one with shift&mask caps: "
+      << (arch.hasFunnelShift ? "yes" : "no")
+      << "\n     Has __threadfence_system: "
+      << (arch.hasThreadFenceSystem ? "yes" : "no")
+      << "\n     Has __syncthreads_count, syncthreads_and, syncthreads_or: "
+      << (arch.hasSyncThreadsExt ? "yes" : "no")
+      << "\n     Has surface functions: "
+      << (arch.hasSurfaceFuncs ? "yes" : "no")
+      << "\n     Grid and group dims are 3D (rather than 2D): "
+      << (arch.has3dGrid ? "yes" : "no")
+      << "\n     Has dynamic parallelism: "
+      << (arch.hasDynamicParallelism ? "yes" : "no");
+      return oss.str();
+}
+
+inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
+  std::ostringstream oss;
+  oss << "Hardware accelerator device properties: "
+      << "\n  Device: "
+      << "\n     ASCII string identifying device: " << prop.name
+      << "\n     Major compute capability: " << prop.major
+      << "\n     Minor compute capability: " << prop.minor
+      << "\n     AMD GCN Arch Value: " << prop.gcnArch
+      << "\n     PCI bus ID of the device: " << prop.pciBusID
+      << "\n     PCI device ID of the device: " << prop.pciDeviceID
+      << "\n  Memory limits: "
+      << "\n     Constant memory available on device in bytes: "
+      << prop.totalConstMem
+      << "\n     Global memory available on device in bytes: "
+      << prop.totalGlobalMem
+      << "\n     Global memory bus width in bits: " << prop.memoryBusWidth
+      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
+      << "\n     Shared memory available per block in bytes: "
+      << prop.sharedMemPerBlock
+      << "\n     Maximum Shared Memory Per Multiprocessor in bytes: "
+      << prop.maxSharedMemoryPerMultiProcessor;
+  return oss.str();
+}
+
+inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
+  std::ostringstream oss;
+  oss << "Hardware accelerator device properties: "
+      << "\n  Device: "
+      << "\n     ASCII string identifying device: " << prop.name
+      << "\n     Major compute capability: " << prop.major
+      << "\n     Minor compute capability: " << prop.minor
+      << "\n     AMD GCN Arch Value: " << prop.gcnArch
+      << "\n     PCI bus ID of the device: " << prop.pciBusID
+      << "\n     PCI device ID of the device: " << prop.pciDeviceID
+
+      << "\n  Memory limits: "
+      << "\n     Constant memory available on device in bytes: "
+      << prop.totalConstMem
+      << "\n     Global memory available on device in bytes: "
+      << prop.totalGlobalMem
+      << "\n     Global memory bus width in bits: " << prop.memoryBusWidth
+      << "\n     Size of L2 cache in bytes: " << prop.l2CacheSize
+      << "\n     Shared memory available per block in bytes: "
+      << prop.sharedMemPerBlock
+      << "\n     Maximum Shared Memory Per Multiprocessor in bytes: "
+      << prop.maxSharedMemoryPerMultiProcessor
+      << "\n     Max global memory clock frequency in khz: "
+      << prop.memoryClockRate
+      << "\n     Peak global memory bandwidth (GByte/s): "
+      << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
+
+      << "\n  Thread limits: "
+      << "\n     Warp size in threads: " << prop.warpSize
+      << "\n     Maximum size of each dimension of a grid: "
+      << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
+      << prop.maxGridSize[2]
+      << "\n     Maximum size of each dimension of a block: "
+      << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " "
+      << prop.maxThreadsDim[2] << "\n     Maximum number of threads per block: "
+      << prop.maxThreadsPerBlock
+      << "\n     Registers available per block: " << prop.regsPerBlock
+      << "\n     Number of multiprocessors on device: "
+      << prop.multiProcessorCount
+      << "\n     Maximum resident threads per multiprocessor: "
+      << prop.maxThreadsPerMultiProcessor
+      << "\n     Max clock frequency of the multiProcessors in khz: "
+      << prop.clockRate
+
+      << "\n  Device features: "
+      << "\n     Device can possibly execute multiple kernels concurrently: "
+      << (prop.concurrentKernels ? "yes" : "no")
+      << "\n     Device is on a multi-GPU board: "
+      << (prop.isMultiGpuBoard ? "yes" : "no")
+      << "\n     HIP can map host memory: "
+      << (prop.canMapHostMemory ? "yes" : "no")
+      << PrintArchFeatureFlags(prop.arch);
+
+  return oss.str();
+}
+
 inline DeviceError StreamCreate(StreamType* stream, bool non_blocking = false) {
   auto flags = non_blocking ? hipStreamNonBlocking : hipStreamDefault;
   return hipStreamCreateWithFlags(stream, flags);

From f1fba3690794a9938a2b11a482421f0fdbace2d1 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Mon, 6 Feb 2023 13:40:53 -0800
Subject: [PATCH 065/638] Relaunch add passes as option to AITTestCase.run_test
 (#200)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/200

We had issues with dper pass
like we encountered in D42983806, but currently AITTestCase.run_test does not support taking in dper passes.
This diff add passes as input to AITTestCase.run_test to test the correctness of dper passes.

Previous diff D42984527 (https://github.com/facebookincubator/AITemplate/commit/e02d03ab3d20258e47589095ba20a4a950950ef6) has the issue that OSS don't know deeplearning

Reviewed By: frank-wei

Differential Revision: D43019947

fbshipit-source-id: 22d4044c66720e0e656f41538c81a3e90ef1a433
---
 fx2ait/fx2ait/tools/common_fx2ait.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 2c6b6d19b..067eb54eb 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -82,6 +82,7 @@ def run_test(
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
         transformer_mode: Optional[bool] = False,
+        passes: List[Callable] = [],  # noqa: B006
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -93,6 +94,9 @@ def run_test(
                 torch.nn.MultiheadAttention if transformer_mode else None
             ],
         )
+        for p in passes:
+            mod = p(mod, inputs)
+
         print(mod.graph)
 
         original_inputs = inputs

From 00d0b98d7bd3697dc534eaf4438e58336ab8304e Mon Sep 17 00:00:00 2001
From: hemildesai <hemil.desai10@gmail.com>
Date: Mon, 6 Feb 2023 13:57:37 -0800
Subject: [PATCH 066/638] Make Stable Diffusion Example dynamic to support both
 1.x and 2.x versions (#187)

Summary:
Currently, the stable diffusion example only supports version 2.0 and above. This change adds support for both 1.x and 2.x versions, and dynamically compiles according to the provided pre-trained pipeline.

Tested with both version 2, 1.5 and a custom trained 1.5 version on 3090. Let me know if there are any other tests to run.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/187

Reviewed By: tissue3

Differential Revision: D43051875

Pulled By: terrychenism

fbshipit-source-id: 3ccf6859c499af9cff6fbc23ca4ede2aa837decc
---
 .../05_stable_diffusion/scripts/compile.py    |  8 +++-
 examples/05_stable_diffusion/src/benchmark.py |  6 ++-
 .../src/compile_lib/compile_clip.py           |  4 +-
 .../src/compile_lib/compile_unet.py           |  3 +-
 .../05_stable_diffusion/src/modeling/clip.py  | 43 ++++++++++++++++++-
 .../src/test_correctness.py                   | 13 ++++--
 6 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 4a38d3bc4..db0a9ae93 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -64,10 +64,12 @@ def compile_diffusers(
     compile_clip(
         pipe.text_encoder,
         batch_size=batch_size,
-        dim=1024,
-        num_heads=16,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
+        depth=pipe.text_encoder.config.num_hidden_layers,
+        num_heads=pipe.text_encoder.config.num_attention_heads,
+        dim=pipe.text_encoder.config.hidden_size,
+        act_layer=pipe.text_encoder.config.hidden_act,
     )
     # UNet
     compile_unet(
@@ -77,6 +79,8 @@ def compile_diffusers(
         height=hh,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
+        hidden_dim=pipe.unet.config.cross_attention_dim,
+        attention_head_dim=pipe.unet.config.attention_head_dim,
     )
     # VAE
     compile_vae(
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index 5a99b1f48..c278dcf1e 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -297,7 +297,11 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     )
     # UNet
     benchmark_unet(
-        pipe.unet, batch_size=batch_size * 2, benchmark_pt=benchmark_pt, verify=verify
+        pipe.unet,
+        batch_size=batch_size * 2,
+        benchmark_pt=benchmark_pt,
+        verify=verify,
+        hidden_dim=pipe.text_encoder.config.hidden_size,
     )
     # VAE
     benchmark_vae(
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index 5cc57077f..cfda48607 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -82,12 +82,13 @@ def compile_clip(
     seqlen=64,
     dim=768,
     num_heads=12,
+    depth=12,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
+    act_layer="gelu",
 ):
     mask_seq = 0
     causal = True
-    depth = 23
 
     ait_mod = ait_CLIPTextTransformer(
         num_hidden_layers=depth,
@@ -97,6 +98,7 @@ def compile_clip(
         seq_len=seqlen,
         causal=causal,
         mask_seq=mask_seq,
+        act_layer=act_layer,
     )
     ait_mod.name_parameter_tensor()
 
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index f1f4acab8..7cc2b41e4 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -57,12 +57,13 @@ def compile_unet(
     hidden_dim=1024,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
+    attention_head_dim=[5, 10, 20, 20],  # noqa: B006
 ):
 
     ait_mod = ait_UNet2DConditionModel(
         sample_size=64,
         cross_attention_dim=hidden_dim,
-        attention_head_dim=[5, 10, 20, 20],
+        attention_head_dim=attention_head_dim,
     )
     ait_mod.name_parameter_tensor()
 
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 874050eb2..1a95314d4 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -318,7 +318,41 @@ def forward(self, x, res):
         return ops.reshape()(x, shape)
 
 
+class CLIPMLPQuickGelu(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+        )
+        self.activation_fn = QuickGELUActivation()
+
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.activation_fn(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
 class CLIPEncoderLayer(nn.Module):
+    ACT_LAYER_TO_CLIP_MLP_MAP = {
+        "gelu": CLIPMLP,
+        "quick_gelu": CLIPMLPQuickGelu,
+    }
+
     def __init__(
         self,
         hidden_size=768,
@@ -329,6 +363,7 @@ def __init__(
         seq_len=16,
         causal=False,
         mask_seq=0,
+        act_layer="gelu",
     ):
         super().__init__()
         self.embed_dim = hidden_size
@@ -346,7 +381,9 @@ def __init__(
             use_mem_eff=True,
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
-        self.mlp = CLIPMLP(hidden_size, int(hidden_size * mlp_ratio))
+        self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
+            hidden_size, int(hidden_size * mlp_ratio)
+        )
         self.layer_norm2 = nn.LayerNorm(self.embed_dim)
 
     def forward(
@@ -396,6 +433,7 @@ def __init__(
         seq_len=64,
         causal=False,
         mask_seq=0,
+        act_layer="gelu",
     ):
         super().__init__()
         self.layers = nn.ModuleList(
@@ -407,6 +445,7 @@ def __init__(
                     seq_len=seq_len,
                     causal=causal,
                     mask_seq=mask_seq,
+                    act_layer=act_layer,
                 )
                 for _ in range(num_hidden_layers)
             ]
@@ -531,6 +570,7 @@ def __init__(
         seq_len=64,
         causal=False,
         mask_seq=0,
+        act_layer="gelu",
     ):
         super().__init__()
         self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
@@ -542,6 +582,7 @@ def __init__(
             seq_len=seq_len,
             causal=causal,
             mask_seq=mask_seq,
+            act_layer=act_layer,
         )
         self.final_layer_norm = nn.LayerNorm(hidden_size)
 
diff --git a/examples/05_stable_diffusion/src/test_correctness.py b/examples/05_stable_diffusion/src/test_correctness.py
index d16f5fcfa..01b93f24d 100644
--- a/examples/05_stable_diffusion/src/test_correctness.py
+++ b/examples/05_stable_diffusion/src/test_correctness.py
@@ -77,19 +77,25 @@ def __init__(self, *args, **kwargs):
         self.unet_config = {
             "batch_size": 2,
             "dim": 320,
-            "hidden_dim": 1024,
+            "hidden_dim": pipe.unet.config.cross_attention_dim,
             "width": 64,
             "height": 64,
         }
 
+        self.unet_compile_extra_config = {
+            "attention_head_dim": pipe.unet.config.attention_head_dim,
+        }
+
         self.clip_config = {
             "batch_size": 1,
             "seqlen": 64,
         }
 
         self.clip_compile_extra_config = {
-            "dim": 1024,
-            "num_heads": 16,
+            "depth": pipe.text_encoder.config.num_hidden_layers,
+            "num_heads": pipe.text_encoder.config.num_attention_heads,
+            "dim": pipe.text_encoder.config.hidden_size,
+            "act_layer": pipe.text_encoder.config.hidden_act,
         }
 
     def test_vae(self):
@@ -112,6 +118,7 @@ def test_unet(self):
             use_fp16_acc=False,
             convert_conv_to_gemm=True,
             **self.unet_config,
+            **self.unet_compile_extra_config,
         )
         benchmark_unet(
             self.pt_unet,

From 83042a5da8264c03a1a32348a8e3cbd5101e1e48 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 6 Feb 2023 20:36:06 -0800
Subject: [PATCH 067/638] Reland support bfloat16 in gemm_rrr_small_nk (#205)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/205

att

Reviewed By: wushirong

Differential Revision: D43035940

fbshipit-source-id: fe1d6bd1616b416d30ecdc7bad3d7095a4a32ac1
---
 .../cuda/gemm_special/gemm_rrr_small_nk.py    | 33 ++++++++++++++-----
 tests/unittest/ops/test_gemm_rrr_small_nk.py  | 10 +++++-
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index cd7167149..b30034e56 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -100,11 +100,28 @@
 #include <iostream>
 #include <type_traits>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 
 namespace {
 
+using bfloat16 = __nv_bfloat16;
+
+__device__ float fma(float a, float b, float c) {
+  return __fmaf_rn(a, b, c);
+}
+
+__device__ half fma(half a, half b, half c) {
+  return __hfma(a, b, c);
+}
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+__device__ bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c) {
+  return __hfma(a, b, c);
+}
+#endif
+
 // For each thread, read
 // A tile: 8 x K
 // B matrix: K x N
@@ -151,11 +168,11 @@
     for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if (USE_FP16_ACC) {
-          half sum = 0;
+        if constexpr (USE_FP16_ACC) {
+          TElem sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
-            sum = __hfma(a_tile[i * K + k], b[k][j], sum);
+            sum = fma(a_tile[i * K + k], b[k][j], sum);
           }
           c_tile[i][j] = sum;
         } else {
@@ -202,11 +219,11 @@
     for (int i = 0; i < m; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < N; ++j) {
-        if (USE_FP16_ACC) {
-          half sum = 0;
+        if constexpr (USE_FP16_ACC) {
+          TElem sum = 0;
           CUTLASS_PRAGMA_UNROLL
           for (int k = 0; k < K; ++k) {
-            sum = __hfma(a_tile[i][k], b[k][j], sum);
+            sum = fma(a_tile[i][k], b[k][j], sum);
           }
           c_tile[i][j] = sum;
         } else {
@@ -243,7 +260,7 @@
 
 // N <= 8, K <= 8
 template<typename ElemT, int N, int K,
-         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half>, void>>
+         typename = std::enable_if_t<std::is_same_v<ElemT, float> || std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>, void>>
 void gemm_rrr_small_nk_launcher(ElemT* a_ptr,
                          ElemT* b_ptr,
                          ElemT* c_ptr,
@@ -255,7 +272,7 @@
   dim3 thread_block(nthread);
   constexpr int n_element_per_t = nthread * num_elems_in_float4;
   dim3 grid((M + n_element_per_t - 1) / n_element_per_t);
-  if (use_fp16_acc && std::is_same_v<ElemT, half>) {
+  if (use_fp16_acc && (std::is_same_v<ElemT, half> || std::is_same_v<ElemT, bfloat16>)) {
     gemm_rrr_small_nk_kernel<ElemT, nthread, N, K, true><<<grid, thread_block, 0, stream>>>(
       reinterpret_cast<const float4*>(a_ptr),
       reinterpret_cast<const float4*>(b_ptr),
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index 2b3a5df99..94f35a00b 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -81,10 +81,18 @@ def test_rrr(self):
         # self._test_rrr([1000000], 8, 16)
         # self._test_rrr([1000000], 6, 3, False)
 
-    def test_float32(self):
+    def test_gemm_rrr_small_nk_float32(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
+    def test_gemm_rrr_small_nk_bfloat16(self):
+        self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
+        self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
+
 
 if __name__ == "__main__":
     unittest.main()

From d94e50b72c77668c06016d2a3c3d86ed2e9476ea Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 6 Feb 2023 20:36:06 -0800
Subject: [PATCH 068/638] Reland bfloat16 unit tests for group gemms (#208)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/208

att

Reviewed By: jianyuh

Differential Revision: D43057865

fbshipit-source-id: 5f39d952a18e67370f3425e906d6ae21b34bac23
---
 .../backend/cuda/gemm_universal/group_common.py          | 2 ++
 tests/unittest/ops/test_group_gemm_rcr.py                | 9 +++++----
 tests/unittest/ops/test_group_gemm_rcr_bias.py           | 3 ++-
 .../unittest/ops/test_group_gemm_rcr_bias_activation.py  | 2 ++
 tests/unittest/ops/test_group_gemm_rcr_bias_cat.py       | 7 +++----
 tests/unittest/ops/test_group_gemm_rcr_cat.py            | 7 +++----
 6 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 1185ab1ab..41cb8f444 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -153,6 +153,8 @@
 #include "cutlass/util/reference/host/tensor_fill.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 
+using bfloat16 = nv_bfloat16;
+
 #define CUTLASS_CHECK(status)                                                         \\
   {                                                                                   \\
     cutlass::Status error = status;                                                   \\
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index fbd22f5de..4c1775f75 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -34,9 +34,10 @@ class GroupGEMMRcrTestCase(unittest.TestCase):
             param(False, "group_gemm_rcr_run_once", "float16"),
             param(True, "group_gemm_rcr_run_twice", "float16"),
             param(False, "group_gemm_rcr_run_once_fp32", "float32"),
+            param(False, "group_gemm_rcr_run_once_bf16", "bfloat16"),
         ]
     )
-    def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
+    def test_group_gemm_rcr(self, run_twice: bool, test_name: str, dtype: str):
         M = 256
         K1 = 128
         N1 = 60
@@ -86,10 +87,10 @@ def test_rcr(self, run_twice: bool, test_name: str, dtype: str):
             outputs["y3"] = torch.empty_like(y1)
 
         module.run_with_tensors(inputs, outputs)
-        self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
-        self.assertTrue(torch.allclose(Y2_pt, y2, atol=1e-1, rtol=1e-1))
+        torch.testing.assert_close(Y1_pt, y1, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
         if run_twice:
-            self.assertTrue(torch.allclose(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1))
+            torch.testing.assert_close(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index 91a601744..a29a7b841 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -34,9 +34,10 @@ class GroupGEMMRcrBiasTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_fp16", "float16"),
             param("group_gemm_rcr_bias_fp32", "float32"),
+            param("group_gemm_rcr_bias_bf16", "bfloat16"),
         ]
     )
-    def test_rcr(self, test_name, dtype):
+    def test_group_gemm_rcr_bias(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index ae27c3d78..6da1ed164 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -33,8 +33,10 @@ class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_relu_fp16", "float16", "relu"),
             param("group_gemm_rcr_bias_relu_fp32", "float32", "relu"),
+            param("group_gemm_rcr_bias_relu_bf16", "bfloat16", "relu"),
             param("group_gemm_rcr_bias_sigmoid_fp16", "float16", "sigmoid"),
             param("group_gemm_rcr_bias_sigmoid_fp32", "float32", "sigmoid"),
+            param("group_gemm_rcr_bias_sigmoid_bf16", "bfloat16", "sigmoid"),
         ]
     )
     def test_rcr_activation(self, test_name, dtype, activation):
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index 72343721d..5c5a0773d 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -34,9 +33,10 @@ class GroupGEMMRcrBiasCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_bias_cat_fp16", "float16"),
             param("group_gemm_rcr_bias_cat_fp32", "float32"),
+            param("group_gemm_rcr_bias_cat_bf16", "bfloat16"),
         ]
     )
-    def test_rcr_bias_cat(self, test_name, dtype):
+    def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -66,11 +66,10 @@ def test_rcr_bias_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt, bias=B1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt, bias=B2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
-        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        np.testing.assert_equal(y_shape, Y_np.shape)
+        torch.testing.assert_close(y_shape, list(Y_pt.shape))
 
         inputs = {
             "x1": X1_pt,
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index 69717a440..c3ff44a93 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -34,9 +33,10 @@ class GroupGEMMRcrCatTestCase(unittest.TestCase):
         [
             param("group_gemm_rcr_cat_fp16", "float16"),
             param("group_gemm_rcr_cat_fp32", "float32"),
+            param("group_gemm_rcr_cat_bf16", "bfloat16"),
         ]
     )
-    def test_rcr_cat(self, test_name, dtype):
+    def test_group_gemm_rcr_cat(self, test_name, dtype):
         M = 256
         K1 = 128
         N1 = 60
@@ -63,11 +63,10 @@ def test_rcr_cat(self, test_name, dtype):
         Y1_pt = torch.nn.functional.linear(X1_pt, W1_pt)
         Y2_pt = torch.nn.functional.linear(X2_pt, W2_pt)
         Y_pt = torch.cat([Y1_pt, Y2_pt], dim=1)
-        Y_np = Y_pt.cpu().numpy()
 
         y_shape = [var._attrs["values"][0] for var in Y._attrs["shape"]]
         _LOGGER.info("AITemplate y_shape: {}".format(y_shape))
-        np.testing.assert_equal(y_shape, Y_np.shape)
+        torch.testing.assert_close(y_shape, list(Y_pt.shape))
 
         inputs = {
             "x1": X1_pt,

From e0343ef1213d9b98f01fe95b446db0fbe33399ae Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 069/638] Foundations for linking the constant folding code
 into the .so (#1179)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1179

* Implement a modified `constant_folding` pass that links the code into the .so instead of doing it all at compile time.
* Add all codegen & C++ changes to support load-time constant folding and constant updating.

This diff is ready for review. However, it is **not to be landed without additional stacked diffs**. I'm splitting up the changes to avoid huge reviews. But we don't want to have two versions of constant folding around, we should delete the old pass when everything is ready.

Reviewed By: chenyang78

Differential Revision: D41806880

fbshipit-source-id: 7884d274df02b24494af7e49f2d16f29695d40ea
---
 python/aitemplate/backend/codegen.py          | 260 ++++++++++-----
 python/aitemplate/backend/main_templates.py   |  23 +-
 python/aitemplate/compiler/base.py            |   2 +
 python/aitemplate/compiler/compiler.py        |  13 +-
 .../aitemplate/compiler/transform/__init__.py |   2 +-
 .../compiler/transform/constant_folding.py    | 310 +++++++++++++++++-
 .../compiler/transform/memory_planning.py     |  10 +-
 static/csrc/model_container.cpp               |  99 +++++-
 static/include/model.h                        |  14 +
 static/include/model_container.h              |  34 ++
 10 files changed, 663 insertions(+), 104 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 1436cd887..e2c597065 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -27,6 +27,8 @@
 import os
 from typing import Any, Dict, List, Optional, Tuple
 
+import jinja2
+
 from aitemplate.backend.main_templates import MODEL_CONTAINER_TEMPLATE, MODEL_TEMPLATE
 from aitemplate.compiler.base import Operator
 from aitemplate.compiler.dtype import dtype_to_enumerator, get_dtype_size
@@ -54,6 +56,10 @@
 }
 
 
+CONSTANT_FOLDER_MODEL_NAME = "ConstantFolder"
+MODEL_NAME = "Model"
+
+
 def gen_profiler(sorted_graph: list[Tensor], workdir: str, dynamic_profiling_strategy):
     """Generate operator profiler source code files for the given graph
 
@@ -255,17 +261,44 @@ def device_copy(dst_tensor: Tensor, src_tensor: Tensor, dst_idx: int) -> str:
     return f"DEVICE_CHECK(DeviceToDeviceCopy({dst_ptr}, {src_name}, {size}, stream));"
 
 
+def _construct_output_name_to_index_map(
+    sorted_graph: List[Tensor], output_tensors: List[Tensor]
+) -> Dict[str, int]:
+    """
+    Use the given output ordering to construct a name -> index map
+    to be used for constructing an internal ordering during codegen.
+
+    The indices in the map are propagated to an output's entire alias set.
+    If two outputs are part of the same alias set, only one of them propagates
+    its output index.
+    """
+    result = {tensor._attrs["name"]: i for i, tensor in enumerate(output_tensors)}
+
+    # Mark alias sets
+    for tensor in reversed(sorted_graph):
+        name = tensor._attrs["name"]
+        orig = tensor._attrs["is_view_of"]
+        if orig is None:
+            continue
+        orig_name = orig._attrs["name"]
+        if name in result and orig_name not in result:
+            result[orig_name] = result[name]
+
+    return result
+
+
 class ModelContainerGenerator:
     def __init__(
         self,
         max_blob_size: int,
         max_constant_blob_size: int,
         workspace: Workspace,
-        num_inputs: int,
-        num_outputs: int,
-        constants_data_file: io.BytesIO,
-        output_name_to_idx: Dict[str, int],
-        debug_settings: AITDebugSettings,
+        constants_data_file: Optional[io.BytesIO],
+        graph: List[Tensor],
+        output_tensors: List[Tensor],
+        model_name: str = MODEL_NAME,
+        additional_unbound_constants: Optional[list[Tensor]] = None,
+        debug_settings: Optional[AITDebugSettings] = None,
     ):
         self.target = Target.current()
         self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl")
@@ -299,29 +332,43 @@ def __init__(
         self.constants_data_size = 0
         self.owned_constants_init = []
 
+        self.set_up_constant_folding_outputs_offsets = []
+
         self.input_idx = 0
         self.unbound_constant_idx = 0
-        self.output_name_to_idx = output_name_to_idx
-
-        (
-            self.max_blob_size,
-            self.max_constant_blob_size,
-            self.workspace,
-            self.num_inputs,
-            self.num_outputs,
-        ) = (
+        self.output_name_to_idx = _construct_output_name_to_index_map(
+            graph, output_tensors
+        )
+        self.graph = graph
+
+        self.num_inputs, self.num_outputs = count_inputs_outputs(graph)
+        (self.max_blob_size, self.max_constant_blob_size, self.workspace,) = (
             max_blob_size,
             max_constant_blob_size,
             workspace,
-            num_inputs,
-            num_outputs,
         )
 
-        self.debug_settings = debug_settings
+        self.debug_settings = (
+            AITDebugSettings() if debug_settings is None else debug_settings
+        )
 
         # This records whether or not we should debug header.
         self.debug_header = False
 
+        self.model_name = model_name
+
+        # additional_unbound_constants stores tensors that are used in constant folding
+        # but are not used in the main graph. We need this info so we can codegen SetConstant
+        # correctly; when we call SetConstant for one of these special names, we want to forward
+        # to constant_folder_->SetConstant().
+        self.additional_unbound_constants = additional_unbound_constants
+        self.set_up_constant_folding_inputs = []
+
+        # This is used to handle a corner case; if we have an owned tensor that is used as an input
+        # for constant folding, we need to allocate space for it in our constant buffer, but its
+        # size won't be found during memory planning.
+        self.extra_owned_constant_size = 0
+
     def _tensor_slice_func(
         self,
         node: Tensor,
@@ -363,6 +410,28 @@ def max_value(var_or_imm):
             )
         )
 
+    def _add_owned_constant(self, tensor: Tensor) -> None:
+        """
+        Add an owned constant, e.g. one with a bound "data" attribute.
+        Here, we codegen some extra logic to load it into memory from the .so.
+        """
+        assert (
+            self.constants_data_file is not None
+        ), "Cannot add owned constants without a data file"
+
+        name = tensor._attrs["name"]
+        data = tensor._attrs["data"]
+        assert (
+            tensor._attrs["offset"] >= 0
+        ), f"Constant node '{name}' must have non-negative offset"
+        num_bytes = len(data)
+        self.constants_data_file.write(data.to_bytes())
+
+        constant_info = f'ConstantInfo{{"{name}", {self.constants_data_size}, {tensor._attrs["offset"]}, {num_bytes}}}'
+        self.owned_constants_init.append(constant_info)
+        self.constants_data_size += num_bytes
+        self.num_constants += 1
+
     def _codegen_param_setup(
         self,
         tensor: Tensor,
@@ -374,17 +443,19 @@ def _codegen_param_setup(
         data = tensor._attrs["data"]
         if data is not None:
             # Owned constant. Set up logic for copying the constant in from *.so.
-            assert (
-                tensor._attrs["offset"] >= 0
-            ), f"Constant node '{name}' must have non-negative offset"
             self.set_up_constants.append(self._tensor_slice_func(tensor, "constants"))
-            num_bytes = len(data)
-            self.constants_data_file.write(data.to_bytes())
+            if self.constants_data_file is not None:
+                self._add_owned_constant(tensor)
+
+        elif tensor._attrs["constant_folding_output_idx"] is not None:
+            self.set_up_constant_folding_outputs_offsets.append(
+                set_value(
+                    f'constant_folding_outputs_offsets_[{tensor._attrs["constant_folding_output_idx"]}]',
+                    tensor._attrs["offset"],
+                )
+            )
+            self.tensor_slice.append(self._tensor_slice_func(tensor, "constants"))
 
-            constant_info = f'ConstantInfo{{"{name}", {self.constants_data_size}, {tensor._attrs["offset"]}, {num_bytes}}}'
-            self.owned_constants_init.append(constant_info)
-            self.constants_data_size += num_bytes
-            self.num_constants += 1
         elif not isinstance(tensor, IntVarTensor):
             # Unbound constant. We will expect the user to set this via SetConstant.
             self.set_up_constant_names.append(
@@ -636,25 +707,15 @@ def append_tensor(self, node: Tensor) -> None:
         self._process_dims_for_tensor(node)
         self._process_src_ops(node)
 
-    def generate_source(self) -> Dict[str, str]:
-        """
-        Perform the codegen after adding all tensors.
-        The dictionary returned is a map from filename -> contents.
-        """
-        device_functions_header_name = f"{self.target.name()}_device_functions.h"
-        result = {}
-        result[
-            "device_functions-generated.h"
-        ] = f'#include "{device_functions_header_name}"'
-
+    def generate_model(self) -> str:
         # Disable graph mode on ROCM because the updating operations
         # are not supported
         target_has_graph_mode = "true" if self.target.name() == "cuda" else "false"
 
         func_pair_seq = zip(self.func_name_seq, self.func_seq)
-        model_def = MODEL_TEMPLATE.render(
+        return MODEL_TEMPLATE.render(
+            model_name=self.model_name,
             function_decl="\n".join(self.func_decl),
-            device_functions_header=device_functions_header_name,
             set_inputs="\n".join(self.set_inputs),
             tensor_slice="\n".join(self.tensor_slice),
             tensor_map_set="\n".join(self.tensor_map_set),
@@ -673,18 +734,53 @@ def generate_source(self) -> Dict[str, str]:
             workspace_size=self.workspace.total_size(),
             num_inputs=self.num_inputs,
             num_outputs=self.num_outputs,
-            param_size=self.max_constant_blob_size,
+            param_size=self.max_constant_blob_size + self.extra_owned_constant_size,
             num_unbound_constants=self.unbound_constant_idx,
             profiler_annotation=self.debug_settings.gen_profiler_annotation,
         )
 
-        result["model-generated.h"] = model_def
+    def _create_set_up_constant_folding_outputs_offsets(self) -> str:
+        """
+        constant_folding_outputs_offsets_ stores a map from each output of constant folding
+        to its offset inside the constant buffer.
+
+        When the model is loaded, we use these offsets to wire up the constant folding output
+        pointers to the outputs of the constant folder.
+        """
+        if not self.set_up_constant_folding_outputs_offsets:
+            return ""
+
+        return jinja2.Template(
+            """
+constant_folding_outputs_offsets_.resize({{num_constant_folding_outputs}});
+{{set_up_statements}}
+"""
+        ).render(
+            num_constant_folding_outputs=len(
+                self.set_up_constant_folding_outputs_offsets
+            ),
+            set_up_statements="\n".join(self.set_up_constant_folding_outputs_offsets),
+        )
+
+    def generate_source(self) -> Dict[str, str]:
+        """
+        Perform the codegen after adding all tensors.
+        The dictionary returned is a map from filename -> contents.
+        """
+        device_functions_header_name = f"{self.target.name()}_device_functions.h"
+        result = {}
+        result[
+            "device_functions-generated.h"
+        ] = f'#include "{device_functions_header_name}"'
+
+        result["model-generated.h"] = self.generate_model()
 
         model_container_src_fname = f"model_container_base{self.target.src_extension()}"
+
         model_container_base_src = MODEL_CONTAINER_TEMPLATE.render(
             num_inputs=self.num_inputs,
             num_outputs=self.num_outputs,
-            param_size=self.max_constant_blob_size,
+            param_size=self.max_constant_blob_size + self.extra_owned_constant_size,
             set_up_constant_names="\n".join(self.set_up_constant_names),
             set_up_param_dtypes="\n".join(self.set_up_param_dtypes),
             set_up_output_shapes="\n".join(self.set_up_output_shapes),
@@ -692,35 +788,56 @@ def generate_source(self) -> Dict[str, str]:
             num_constants=self.num_constants,
             num_unbound_constants=self.unbound_constant_idx,
             owned_constants_init=",".join(self.owned_constants_init),
+            set_up_constant_folding_outputs_offsets=self._create_set_up_constant_folding_outputs_offsets(),
+            set_up_constant_folding_inputs="\n".join(
+                self.set_up_constant_folding_inputs
+            ),
         )
         result[model_container_src_fname] = model_container_base_src
         return result
 
+    def add_constant_folding_input(self, tensor: Tensor):
+        """
+        Handle an input to constant fold
+        Handle an input to constant folding, e.g. a constant that is
+        no longer part of the main graph
+        """
+        name = tensor._attrs["name"]
 
-def _construct_output_name_to_index_map(
-    sorted_graph: List[Tensor], output_tensors: List[Tensor]
-) -> Dict[str, int]:
-    """
-    Use the given output ordering to construct a name -> index map
-    to be used for constructing an internal ordering during codegen.
+        if tensor._attrs["data"] is None:
+            self.set_up_constant_names.append(
+                set_value(
+                    f'unbound_constant_name_to_idx_["{name}"]',
+                    self.unbound_constant_idx,
+                )
+            )
+            self._record_param_tensor_info(
+                tensor,
+                self.unbound_constant_idx + self.num_inputs + self.num_outputs,
+            )
+            self.unbound_constant_idx += 1
+            self.set_up_constant_folding_inputs.append(
+                f'constant_folding_inputs_.insert("{name}");'
+            )
+        else:
+            self._add_owned_constant(tensor)
 
-    The indices in the map are propagated to an output's entire alias set.
-    If two outputs are part of the same alias set, only one of them propagates
-    its output index.
-    """
-    result = {tensor._attrs["name"]: i for i, tensor in enumerate(output_tensors)}
+        self._process_dims_for_tensor(tensor)
 
-    # Mark alias sets
-    for tensor in reversed(sorted_graph):
-        name = tensor._attrs["name"]
-        orig = tensor._attrs["is_view_of"]
-        if orig is None:
-            continue
-        orig_name = orig._attrs["name"]
-        if name in result and orig_name not in result:
-            result[orig_name] = result[name]
+    def append_all_tensors(self) -> None:
+        if self.additional_unbound_constants is not None:
+            for tensor in self.additional_unbound_constants:
+                self.add_constant_folding_input(tensor)
+                self.extra_owned_constant_size += tensor.size_bytes(alignment=64)
 
-    return result
+        for tensor in self.graph:
+            if tensor._attrs["is_param"] and tensor._attrs["offset"] is not None:
+                # Make sure we leave room for the tensors that constant folding
+                # needs. These have been excluded from the final graph, so
+                # the memory planning pass will not have known about them.
+                tensor._attrs["offset"] += self.extra_owned_constant_size
+
+            self.append_tensor(tensor)
 
 
 _DEBUG_SETTINGS = AITDebugSettings()
@@ -735,6 +852,7 @@ def gen_library_src(  # noqa: C901
     output_tensors: List[Tensor],
     model_name: str = "",
     debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
+    additional_unbound_constants: Optional[list[Tensor]] = None,
 ) -> list[Tuple[str, str]]:
     """Generate model driver source code files for the given graph
 
@@ -764,27 +882,21 @@ def to_obj_name(name: str):
         name, _ = os.path.splitext(name)
         return f"{name}.obj"
 
-    num_inputs, num_outputs = count_inputs_outputs(sorted_graph)
     prefix = os.path.join(workdir, model_name)
     constants_fname = os.path.join(prefix, "constants.bin")
     constants_data_file = open(constants_fname, "wb")
 
-    output_name_to_index = _construct_output_name_to_index_map(
-        sorted_graph, output_tensors
-    )
-
     model_container_generator = ModelContainerGenerator(
         max_blob_size,
         max_constant_blob_size,
         workspace,
-        num_inputs,
-        num_outputs,
         constants_data_file,
-        output_name_to_index,
-        debug_settings,
+        sorted_graph,
+        output_tensors,
+        additional_unbound_constants=additional_unbound_constants,
+        debug_settings=debug_settings,
     )
-    for node in sorted_graph:
-        model_container_generator.append_tensor(node)
+    model_container_generator.append_all_tensors()
     constants_data_file.close()
 
     files = model_container_generator.generate_source()
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 3aa881631..a8359c7c1 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -41,19 +41,6 @@
 {{ function_decl }}
 
 namespace ait {
-namespace {
-void DeviceCheckLastError(const char* file, int line) {
-  auto device_error = GetLastError();
-  if (device_error != GetDeviceSuccess()) {
-    std::string msg = std::string("Got error: ") + GetLastErrorString() +
-                      " enum: " + std::to_string(device_error) +
-                      " at " + file + ": " + std::to_string(line);
-    LOG(ERROR) << msg;
-    throw std::runtime_error(msg);
-  }
-}
-
-} // namespace
 
 // Model is the class that actually performs inference. It owns memory for
 // intermediate tensors and dynamic dimensions. Constants are owned by
@@ -61,9 +48,9 @@
 // by the user.
 // Once an inference run has started, it is not safe to re-use the Model
 // until the run has finished!
-class Model : public ModelBase<Model> {
+class {{model_name}} : public ModelBase<{{model_name}}> {
   public:
-    Model(
+    {{model_name}}(
         size_t blob_size,
         size_t workspace_size,
         size_t unique_workspace_size,
@@ -142,11 +129,11 @@ class Model : public ModelBase<Model> {
       std::cout << "AIT per op profiling finished." << std::endl;
     }
 
-    static std::unique_ptr<Model> Create(
+    static std::unique_ptr<{{model_name}}> Create(
       AITemplateAllocator& allocator,
       uint8_t* constants
     ) {
-      return std::make_unique<Model>(
+      return std::make_unique<{{model_name}}>(
           {{ blob_size }},
           {{ workspace_size }},
           {{ unique_workspace_size }},
@@ -206,6 +193,8 @@ class Model : public ModelBase<Model> {
     );
     max_param_storage_bytes_[i] = max_param_numel_[i] * AITemplateDtypeSizeBytes(param_dtypes_[i]);
   }
+{{ set_up_constant_folding_outputs_offsets }}
+{{ set_up_constant_folding_inputs }}
 
   auto* constants_ptr = static_cast<uint8_t*>(constants_.get());
   const auto binary_constants_bin_size = static_cast<size_t>(_binary_constants_bin_end - _binary_constants_bin_start);
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index df03965ca..d690315cd 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -413,6 +413,8 @@ def __init__(
         # Data to be bound for constant folding. See _bind_data.
         self._attrs["data"] = None
 
+        self._attrs["constant_folding_output_idx"] = None
+
         self._attrs["check_nan_and_inf"] = check_nan_and_inf
         self._attrs["check_outputs"] = check_outputs
 
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index ca87e0a65..20e29e100 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -206,20 +206,30 @@ def compile_model(
             constant_folding_workdir = os.path.join(workdir, test_name)
             os.makedirs(constant_folding_workdir, exist_ok=True)
             graph = compiler.transform.constant_folding(graph, constant_folding_workdir)
+            # TODO: enable and delete the call above.
+            # They can't be enabled at the same time because contant folding mutates tensors
+            # in the graph.
+            # (
+            #    graph,
+            #    constant_folding_file_pairs,
+            #    constant_folding_inputs,
+            # ) = compiler.transform.constant_folding_v2(graph, workdir, test_name)
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "constant_folding"
             )
             _LOGGER.info(f"folded constants elapsed time: {elapsed_dt_sec(start_t)}")
 
-            _verify_outputs_still_in_graph(graph, output_tensors)
             (
                 max_blob,
                 max_constant_blob,
                 workspace,
             ) = compiler.transform.memory_planning(graph)
+            _verify_outputs_still_in_graph(graph, output_tensors)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "memory_planning")
 
             file_pairs = backend.codegen.gen_function_src(graph, workdir, test_name)
+            # TODO: uncomment when V2 constant folding is enabled.
+            # file_pairs.extend(constant_folding_file_pairs)
 
             # It's possible that the original output tensor has been replaced with a new tensor.
             # Preserve original output tensors' orders but use the new tensors.
@@ -242,6 +252,7 @@ def compile_model(
                 workdir,
                 output_tensors,
                 test_name,
+                # additional_unbound_constants=constant_folding_inputs,
                 debug_settings,
             )
             file_pairs.extend(main_pairs)
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index ca9bf77e4..9e61f5889 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -14,7 +14,7 @@
 #
 # flake8: noqa
 from .bind_constants import bind_constants
-from .constant_folding import constant_folding
+from .constant_folding import constant_folding, constant_folding_v2
 from .fuse_conv_elementwise import fuse_conv_elementwise
 from .fuse_group_ops import (
     fuse_group_gemm_ops,
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index b8f9e2194..0e1752a9a 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -14,7 +14,7 @@
 #
 import logging
 import os
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 import numpy as np
 
@@ -23,12 +23,27 @@
 from aitemplate.compiler.base import _NumpyConstantTensorData, IntVarTensor, Tensor
 from aitemplate.compiler.dtype import normalize_dtype
 from aitemplate.compiler.model import AITData, Model
+from aitemplate.compiler.transform.memory_planning import Workspace
 from aitemplate.compiler.transform.transform_utils import replace_tensor
+from aitemplate.utils import graph_utils
 
 
 _LOGGER = logging.getLogger(__name__)
 
 
+def _create_dummy_constant_folder():
+    model_container_generator = backend.codegen.ModelContainerGenerator(
+        max_blob_size=0,
+        max_constant_blob_size=0,
+        workspace=Workspace(0, 0),
+        constants_data_file=None,
+        graph=[],
+        output_tensors=[],
+        model_name=backend.codegen.CONSTANT_FOLDER_MODEL_NAME,
+    )
+    return model_container_generator.generate_model()
+
+
 def _output_from_tensor(tensor: Tensor) -> Tensor:
     new_tensor = Tensor(
         shape=tensor._attrs["shape"],
@@ -94,9 +109,42 @@ def _extract_foldable_subgraph(
     return subgraph
 
 
+def _make_op_names_unique(graph: List[Tensor]) -> Dict[str, str]:
+    """
+    To avoid ODR issues, we rename all ops in the constant folding subgraph.
+    ODR issues can arise if two ops end up sharing the same name & implementation (which
+    can actualy happen, e.g. in the proposal op).
+    """
+    new_name_to_old = {}
+    for tensor in graph:
+        for op in tensor._attrs["src_ops"]:
+            if op._attrs["name"] not in new_name_to_old:
+                new_name = f"{op._attrs['name']}_constant_folding"
+                new_name_to_old[new_name] = op._attrs["name"]
+                op._attrs["name"] = new_name
+    return new_name_to_old
+
+
+def _rename_ops(graph: List[Tensor], new_name_to_old: Dict[str, str]) -> None:
+    for tensor in graph:
+        for op in tensor._attrs["src_ops"]:
+            if op._attrs["name"] in new_name_to_old:
+                op._attrs["name"] = new_name_to_old[op._attrs["name"]]
+
+
 def _constant_folding_impl(
     sorted_graph: List[Tensor], workdir: str
 ) -> Dict[str, Tensor]:
+    constant_folding_workdir = os.path.join(workdir, "constant_folding")
+    os.makedirs(constant_folding_workdir, exist_ok=True)
+    # Just write a dummy constant folder. It's unused in this path, and we're removing this function soon.
+    with open(
+        os.path.join(constant_folding_workdir, "constant_folder-generated.h"), "w"
+    ) as f:
+        f.write(_create_dummy_constant_folder())
+
+    with open(os.path.join(workdir, "constant_folder-generated.h"), "w") as f:
+        f.write(_create_dummy_constant_folder())
 
     # Collect the set of output names before we do any transformations. We'll need this
     # if we end up turning outputs into constants. _extract_foldable_subgraph marks *all*
@@ -114,8 +162,6 @@ def _constant_folding_impl(
 
     blob, constant_blob, workspace = compiler.transform.memory_planning(subgraph)
 
-    constant_folding_workdir = os.path.join(workdir, "constant_folding")
-    os.makedirs(constant_folding_workdir, exist_ok=True)
     file_pairs = backend.codegen.gen_function_src(subgraph, workdir, "constant_folding")
     main_pairs = backend.codegen.gen_library_src(
         subgraph,
@@ -187,3 +233,261 @@ def constant_folding(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
     # Eliminate constants that are no longer used
     compiler.transform.remove_unused_ops(sorted_graph)
     return compiler.transform.transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
+def _non_output_from_tensor(tensor: Tensor) -> Tensor:
+    new_tensor = Tensor(
+        shape=tensor._attrs["shape"],
+        name=tensor._attrs["name"],
+        src_ops=tensor._attrs["src_ops"].copy(),
+        dst_ops=tensor._attrs["dst_ops"].copy(),
+        dtype=tensor._attrs["dtype"],
+        is_view_of=tensor._attrs["is_view_of"],
+    )
+    new_tensor._attrs["is_param"] = tensor._attrs["is_param"]
+    new_tensor._attrs["data"] = tensor._attrs["data"]
+    return new_tensor
+
+
+def _output_from_tensor_v2(tensor: Tensor) -> Tensor:
+    new_tensor = _non_output_from_tensor(tensor)
+    new_tensor._attrs["is_output"] = True
+    return new_tensor
+
+
+def _fix_op_inputs_outputs(
+    subgraph: List[Tensor], name_to_new_tensor: Dict[str, Tensor]
+) -> None:
+    """
+    This is an unfortunate hack made necessary by the following:
+
+    1) When constructing the constant folding subgraph, the most understandable
+       thing to do is create *new* tensors so we can modify their attributes without
+       affecting the original graph.
+    2) However, the inputs of each tensor's src and dst ops need to be wired up to
+       the new tensors since the memory planning pass will traverse the graph through those attributes.
+
+    So, we store the mapping from tensor name to its corresponding subgraph tensor and the tensor in
+    original graph.
+
+    Before we do memory planning for constant folding, we call:
+      _fix_op_inputs_outputs(subgraph, name_to_constant_folding_tensor)
+
+    And then afterwards we restore everything with:
+      _fix_op_inputs_outputs(subgraph, name_to_original_tensor)
+
+    It would be nice if we could deep copy the src and dst ops when we create new tensors so we can
+    skip the restoration step. But this is not implemented and not trivial. Thankfully, this function
+    is not too hard to understand once the rationale behind it is understood.
+    """
+    ops = graph_utils.get_sorted_ops(subgraph)
+    for op in ops:
+        op._attrs["inputs"] = [
+            name_to_new_tensor[tensor._attrs["name"]] for tensor in op._attrs["inputs"]
+        ]
+
+        op._attrs["outputs"] = [
+            name_to_new_tensor[tensor._attrs["name"]] for tensor in op._attrs["outputs"]
+        ]
+
+
+def _extract_foldable_subgraph_v2(
+    sorted_graph: List[Tensor],
+) -> List[Tensor]:
+    """
+    Extract a list of foldable nodes. A node is foldable if:
+    * It has bound data, or
+    * All of its inputs are foldable.
+
+    The subgraph returned is just a list of Tensors. All foldable
+    tensors that do not have bound data are marked as outputs in
+    the subgraph. The original graph is not modified.
+
+    All tensors that do not have bound data are marked as outputs.
+    This is because we want to execute the subgraph and get all
+    of the new constants. Only the ones that are actually needed are put
+    back into the final graph.
+    """
+    foldable_node_names = set()
+    foldable_ops = set()
+    subgraph = []
+
+    for tensor in sorted_graph:
+        if tensor._attrs["is_input"]:
+            continue
+
+        name = tensor._attrs["name"]
+        if tensor._attrs["data"] is not None or tensor._attrs["is_param"]:
+            foldable_node_names.add(name)
+            subgraph.append(tensor)
+            continue
+        elif isinstance(tensor, IntVarTensor):
+            continue
+        foldable = all(
+            inp._attrs["name"] in foldable_node_names
+            for op in tensor._attrs["src_ops"]
+            for inp in op._attrs["inputs"]
+        )
+
+        if foldable:
+            foldable_node_names.add(name)
+            subgraph.append(tensor)
+            for op in tensor._attrs["src_ops"]:
+                foldable_ops.add(op)
+
+    def _is_used_by_non_foldable_op(tensor: Tensor) -> bool:
+        for op in tensor._attrs["dst_ops"]:
+            if op not in foldable_ops:
+                return True
+        return False
+
+    def _is_used_by_foldable_op(tensor: Tensor) -> bool:
+        for op in tensor._attrs["dst_ops"]:
+            if op in foldable_ops:
+                return True
+        return False
+
+    filtered_subgraph = []
+    views_of_constants = set()
+    name_to_new_tensor = {}
+    name_to_old_tensor = {}
+    constant_folding_inputs = []
+
+    for tensor in subgraph:
+        # Mark views of constants. This helps us avoid extra copying, see below.
+        view = tensor._attrs["is_view_of"]
+        is_view_of_constant = tensor._attrs["is_param"] or (
+            view is not None and view in views_of_constants
+        )
+        if is_view_of_constant:
+            views_of_constants.add(tensor)
+
+        name = tensor._attrs["name"]
+        new_tensor = None
+
+        if not is_view_of_constant and (
+            _is_used_by_non_foldable_op(tensor) or tensor._attrs["is_output"]
+        ):
+            # Tensor is required outside of the subgraph, make it an output.
+            # We only need to do this if it's not a (view of) a constant, else
+            # we'll get wasteful D2D copies.
+            new_tensor = _output_from_tensor_v2(tensor)
+
+        elif _is_used_by_foldable_op(tensor):
+            # No need to append constants that are not used by foldable ops.
+            new_tensor = _non_output_from_tensor(tensor)
+            if new_tensor._attrs["is_param"]:
+                constant_folding_inputs.append(new_tensor)
+
+        if new_tensor is not None:
+            name_to_new_tensor[name] = new_tensor
+            name_to_old_tensor[name] = tensor
+            filtered_subgraph.append(new_tensor)
+
+    _fix_op_inputs_outputs(filtered_subgraph, name_to_new_tensor)
+    return filtered_subgraph, name_to_old_tensor, constant_folding_inputs
+
+
+def _constant_folding_impl_v2(
+    sorted_graph: List[Tensor],
+    workdir: str,
+    model_name: str,
+) -> Tuple[Dict[str, Tensor], List[Tuple[str, str]], List[Tensor]]:
+    model_dir = os.path.join(workdir, model_name)
+
+    # Collect the set of output names before we do any transformations. We'll need this
+    # if we end up turning outputs into constants. _extract_foldable_subgraph marks *all*
+    # folded constants as outputs, so we can't just query attrs["is_output"] (see
+    # extract_foldable_subgraph for more info on why that happens)
+    original_output_tensors = {
+        tensor._attrs["name"] for tensor in sorted_graph if tensor._attrs["is_output"]
+    }
+
+    (
+        subgraph,
+        name_to_old_tensor,
+        constant_folding_inputs,
+    ) = _extract_foldable_subgraph_v2(sorted_graph)
+    output_tensors = [tensor for tensor in subgraph if tensor._attrs["is_output"]]
+    if not output_tensors:
+        _LOGGER.info("No constants to fold, skipping constant folding.")
+        # Write a dummy constant folder so everything still compiles.
+        with open(os.path.join(model_dir, "constant_folder-generated.h"), "w") as f:
+            f.write(_create_dummy_constant_folder())
+        _fix_op_inputs_outputs(subgraph, name_to_old_tensor)
+        return {}, [], []
+
+    blob, constant_blob, workspace = compiler.transform.memory_planning(subgraph)
+    new_name_to_old = _make_op_names_unique(subgraph)
+    file_pairs = backend.codegen.gen_function_src(subgraph, workdir, model_name)
+    model_container_generator = backend.codegen.ModelContainerGenerator(
+        blob,
+        constant_blob,
+        workspace,
+        constants_data_file=None,
+        graph=subgraph,
+        output_tensors=output_tensors,
+        model_name=backend.codegen.CONSTANT_FOLDER_MODEL_NAME,
+    )
+    model_container_generator.append_all_tensors()
+    constant_folding_model_def = model_container_generator.generate_model()
+    with open(os.path.join(model_dir, "constant_folder-generated.h"), "w") as f:
+        f.write(constant_folding_model_def)
+
+    _fix_op_inputs_outputs(subgraph, name_to_old_tensor)
+    _rename_ops(subgraph, new_name_to_old)
+    new_tensors = {}
+    for tensor in subgraph:
+        if not tensor._attrs["is_param"]:
+            name = tensor._attrs["name"]
+            new_tensor = Tensor(
+                shape=tensor._attrs["shape"],
+                name=name,
+                dtype=tensor._attrs["dtype"],
+                is_output=name in original_output_tensors,
+            )
+            if name in model_container_generator.output_name_to_idx:
+                new_tensor._attrs[
+                    "constant_folding_output_idx"
+                ] = model_container_generator.output_name_to_idx[name]
+            new_tensors[name] = new_tensor
+
+    return new_tensors, file_pairs, constant_folding_inputs
+
+
+def constant_folding_v2(
+    sorted_graph: List[Tensor],
+    workdir: str,
+    model_name: str,
+) -> Tuple[List[Tensor], Tuple[str, str]]:
+    """
+    Fold and propagate constants.
+
+    This pass looks for ops that have inputs which can be determined
+    at compile time. It evaluates them, then puts the new constants
+    back into the graph with bound data. The old ops are eliminated.
+
+    This pass actually compiles and runs an AIT runtime. If there are
+    any problems (e.g. due to buggy ops), the constant folding is
+    aborted and the graph is returned unchanged. All generated code
+    is stored in workdir/constant_folding.
+    """
+    new_constants, file_pairs, constant_folding_inputs = _constant_folding_impl_v2(
+        sorted_graph, workdir, model_name
+    )
+
+    # Replace ops with their folded values.
+    for idx, tensor in enumerate(sorted_graph):
+        name = tensor._attrs["name"]
+        if name in new_constants:
+            new_tensor = new_constants[name]
+            replace_tensor(tensor, new_tensor)
+            sorted_graph[idx] = new_tensor
+
+    # Eliminate constants that are no longer used
+    compiler.transform.remove_unused_ops(sorted_graph)
+    return (
+        compiler.transform.transform_utils.sanitize_sorted_graph(sorted_graph),
+        file_pairs,
+        constant_folding_inputs,
+    )
diff --git a/python/aitemplate/compiler/transform/memory_planning.py b/python/aitemplate/compiler/transform/memory_planning.py
index 7b53a0a80..2abe94a8b 100644
--- a/python/aitemplate/compiler/transform/memory_planning.py
+++ b/python/aitemplate/compiler/transform/memory_planning.py
@@ -250,7 +250,10 @@ def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
     # now we assign blobs for weights and inputs
     constant_offset = 0
     for node in sorted_graph:
-        if node._attrs["data"] is not None:
+        if (
+            node._attrs["data"] is not None
+            or node._attrs["constant_folding_output_idx"] is not None
+        ):
             node._attrs["offset"] = constant_offset
             constant_offset += node.size_bytes(alignment=64)
 
@@ -270,7 +273,10 @@ def naive_memory_planning(sorted_graph: List[Tensor]):
     offset = 0
     constant_offset = 0
     for node in sorted_graph:
-        if node._attrs["data"] is not None:
+        if (
+            node._attrs["data"] is not None
+            or node._attrs["constant_folding_output_idx"] is not None
+        ):
             node._attrs["offset"] = constant_offset
             constant_offset += node.size_bytes(alignment=64)
         elif not node._attrs["is_view_of"]:
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index f5e119690..86f3cd038 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -65,11 +65,19 @@ ModelContainer::ModelContainer(
   models_.reserve(num_models);
   available_models_.reserve(num_models);
 
+  auto* constants_ptr = static_cast<uint8_t*>(constants_.get());
   for (size_t i = 0; i < num_models; ++i) {
-    models_.push_back(
-        Model::Create(allocator, static_cast<uint8_t*>(constants_.get())));
+    models_.push_back(Model::Create(allocator, constants_ptr));
     available_models_.push_back(models_.back().get());
   }
+
+  constant_folder_ = ConstantFolder::Create(allocator, constants_ptr);
+
+  // Wire up the constant folder's outputs to our constant buffer.
+  size_t constant_idx = 0;
+  for (auto offset : constant_folding_outputs_offsets_) {
+    constant_folder_->SetOutput(constants_ptr + offset, constant_idx++);
+  }
 }
 
 void ModelContainer::Run(
@@ -81,6 +89,20 @@ void ModelContainer::Run(
     bool sync,
     bool graph_mode,
     int64_t** output_shapes_out) {
+  std::shared_lock constants_lk(constants_sync_mutex_);
+  if (!constant_folded_once_) {
+    // We don't require users to manually call FoldConstants the first time.
+    // Note that if this throws (due to an unset constant, for example)
+    // constant_folded_once_ will not be set.
+    constants_lk.unlock();
+    std::unique_lock constants_unique_lk(constants_sync_mutex_);
+    // Check again, another thread may have updated after we unlocked.
+    if (!constant_folded_once_) {
+      FoldConstantsImpl(stream);
+    }
+    constants_unique_lk.unlock();
+    constants_lk.lock();
+  }
   auto* model = GetAvailableModel();
   try {
     PrepareForRun(model, inputs, num_inputs, outputs, num_outputs);
@@ -192,6 +214,18 @@ float ModelContainer::Benchmark(
     num_threads = std::thread::hardware_concurrency();
   }
 
+  std::shared_lock constants_lk(constants_sync_mutex_);
+  if (!constant_folded_once_) {
+    constants_lk.unlock();
+    std::unique_lock constants_unique_lk(constants_sync_mutex_);
+    // Check again, another thread may have updated after we unlocked.
+    if (!constant_folded_once_) {
+      FoldConstantsImpl(stream);
+    }
+    constants_unique_lk.unlock();
+    constants_lk.lock();
+  }
+
   if (num_threads == 1) {
     return BenchmarkImpl(
                inputs,
@@ -307,10 +341,20 @@ float ModelContainer::Benchmark(
 }
 
 void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
+  // Prevent constant folding/inferences from starting while we update
+  // constants.
+  std::lock_guard lk(constants_sync_mutex_);
+  // Wait for any ongoing inferences + foldings to finish.
+  WaitForAllModels();
+  try {
+    constant_folder_->WaitForCompletion();
+  } catch (...) {
+    LOG(WARNING)
+        << "Constant folder threw exception while waiting for completion, ignoring.";
+  }
+
   auto it = unbound_constant_name_to_idx_.find(name);
   if (it == unbound_constant_name_to_idx_.end()) {
-    // TODO make this an exception after we fix the CMF benchmarks
-    LOG(ERROR) << "Constant " << name << " not found";
     return;
   }
   auto constant_idx = it->second + num_inputs_ + num_outputs_;
@@ -330,8 +374,12 @@ void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
   }
 
   auto* src = tensor.ptr;
-  for (auto& model : models_) {
-    model->SetConstant(name, src);
+  if (constant_folding_inputs_.find(name) == constant_folding_inputs_.end()) {
+    for (auto& model : models_) {
+      model->SetConstant(name, src);
+    }
+  } else {
+    constant_folder_->SetConstant(name, src);
   }
 }
 
@@ -387,6 +435,45 @@ size_t ModelContainer::MaxOutputStorageBytes(size_t output_idx) const {
   return max_param_storage_bytes_[idx];
 }
 
+void ModelContainer::WaitForAllModels() {
+  // Wait for all on-going inferences to finish.
+  for (auto* model : pending_models_) {
+    try {
+      model->WaitForCompletion();
+      // Something has gone horribly wrong if we hit these catch cases, but
+      // there's not much we can do about it. Just put the model back into the
+      // pool and carry on with folding.
+    } catch (std::exception& e) {
+      LOG(WARNING)
+          << "Model threw exception when waiting for inference to finish: "
+          << e.what() << ". Ignoring and continuing constant folding.";
+    } catch (...) {
+      LOG(WARNING)
+          << "Model threw unknown exception when waiting for inference to finish. Ignoring and continuing constant foldng.";
+    }
+    available_models_.push_back(model);
+  }
+}
+
+void ModelContainer::FoldConstantsImpl(StreamType stream) {
+  // NB: No need to acquire models_mutex_ here. We're guaranteed that nothing
+  // will be concurrently messing with the Model vectors while we hold the
+  // constants_sync_mutex_ in unique mode. See model_container.h for the full
+  // explanation.
+  WaitForAllModels();
+  // We might have already started constant folding, make sure it finishes
+  // first. It's OK if we throw here, there's no state to restore.
+  // We just won't finish the folding and will need to do it again.
+  constant_folder_->WaitForCompletion();
+  constant_folder_->Run(stream, /*graph_mode=*/false);
+  constant_folded_once_ = true;
+}
+
+void ModelContainer::FoldConstants(StreamType stream) {
+  std::lock_guard constant_folding_lk(constants_sync_mutex_);
+  FoldConstantsImpl(stream);
+}
+
 void ModelContainer::PrepareForRun(
     Model* model,
     const AITData* inputs,
diff --git a/static/include/model.h b/static/include/model.h
index b2fcde66e..51f00713a 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -14,8 +14,22 @@
 //
 #pragma once
 
+#include <stdexcept>
+#include <string>
+
 namespace ait {
 
+inline void DeviceCheckLastError(const char* file, int line) {
+  auto device_error = GetLastError();
+  if (device_error != GetDeviceSuccess()) {
+    std::string msg = std::string("Got error: ") + GetLastErrorString() +
+        " enum: " + std::to_string(device_error) + " at " + file + ": " +
+        std::to_string(line);
+    LOG(ERROR) << msg;
+    throw std::runtime_error(msg);
+  }
+}
+
 // This serves as a base class for AIT runtime objects, e.g. the compiled
 // model and the constant folder. It uses CRTP as a mechanism to call into
 // a few base class methods (dynamic dispatch is not needed in ModelContainer,
diff --git a/static/include/model_container.h b/static/include/model_container.h
index 594d26e7f..bc07c049c 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -14,6 +14,7 @@
 //
 #pragma once
 
+#include "constant_folder-generated.h"
 #include "model-generated.h"
 #include "model_interface.h"
 #include "raii_wrapper.h"
@@ -23,7 +24,9 @@
 #include <future>
 #include <mutex>
 #include <numeric>
+#include <shared_mutex>
 #include <unordered_map>
+#include <unordered_set>
 
 namespace ait {
 
@@ -46,6 +49,15 @@ class ModelContainerBase {
   // They must be set via SetConstant prior to inference.
   std::unordered_map<std::string, size_t> unbound_constant_name_to_idx_;
 
+  // The names of all tensors that are required for constant folding, but are
+  // not necessarily in the final graph.
+  std::unordered_set<std::string> constant_folding_inputs_;
+
+  // Offsets here correspond to the offsets of constants that were the outputs
+  // of constant folding. The indices are guaranteed to map to the correct
+  // indices in constant_folder_.
+  std::vector<size_t> constant_folding_outputs_offsets_;
+
   // a single piece of memory for all constants
   GPUPtr constants_;
 
@@ -174,7 +186,12 @@ class ModelContainer : ModelContainerBase {
     return models_.size();
   }
 
+  void FoldConstants(StreamType stream);
+
  private:
+  void WaitForAllModels();
+  void FoldConstantsImpl(StreamType stream);
+
   void PrepareForRun(
       Model* model,
       const AITData* inputs,
@@ -199,6 +216,7 @@ class ModelContainer : ModelContainerBase {
   AITemplateAllocator& allocator_;
 
   std::vector<std::unique_ptr<Model>> models_;
+  std::unique_ptr<ConstantFolder> constant_folder_;
   std::vector<Model*> available_models_;
   std::deque<Model*> pending_models_;
 
@@ -206,9 +224,25 @@ class ModelContainer : ModelContainerBase {
   std::mutex models_mutex_;
   // Notified whenever a model is put into pending_models_.
   std::condition_variable pending_models_available_;
+  // Prevents constant folding or SetConstants on main models from starting
+  // while there are ongoing inferences (and vice versa). FoldConstants() and
+  // SetConstants acquires in unique mode, Run()/Benchmark() acquire in shared
+  // mode.
+  //
+  // Since constants_sync_mutex_ is acquired in shared mode for the entire
+  // duration of Run()/Benchmark(), there is no need to acquire models_mutex_
+  // while constants_sync_mutex_ is acquired in unique mode.
+  // Why complicate things with two locks? The system is designed with the
+  // assumption that concurrent inferences are common. We don't want to acquire
+  // models_mutex_ uniquely for the entire duration of Run(), because that
+  // prevents concurrent inferences from happening while kernels are being
+  // queued.
+  std::shared_mutex constants_sync_mutex_;
 
   size_t num_inputs_;
   size_t num_outputs_;
+
+  bool constant_folded_once_ = false;
 };
 
 } // namespace ait

From df51f149bbec5dbe6bca4b699c7c27a809c29631 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 070/638] Load-time constant folding API (#1180)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1180

Add the new AIT core APIs for load time constant folding.

* `fold_constants()` to manually trigger constant folding once all constants have been set

* `get_constant_names()` to get the set of all constant names

* `get_constant_folding_input_names()` to get the subset of constant names that are required for constant folding.

Like the previous diff, this is **not to be landed** without additional stacked changes. I'm trying to avoid super long code reviews. The only thing really missing here is test coverage. But there are lots of corner cases, so the `fold_constants` testing is getting its own diff.

Reviewed By: chenyang78

Differential Revision: D41837748

fbshipit-source-id: 93fa11ee011d1c0302e157db47c85b07646e6b8f
---
 python/aitemplate/compiler/model.py      | 23 +++++++++++++
 static/csrc/model_container.cpp          | 31 ++++++++++++++++-
 static/csrc/model_interface.cpp          | 40 ++++++++++++++++++++++
 static/include/model_container.h         | 13 ++++++-
 static/include/model_interface.h         | 15 +++++++++
 tests/unittest/backend/test_model_api.py | 43 ++++++++++++++++++++++++
 6 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index d0ffe94d3..772f72def 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -906,3 +906,26 @@ def ait_data_to_numpy(
             stream_ptr=stream_ptr,
         )
         return arr
+
+    def fold_constants(self, stream_ptr: Optional[int] = None, sync: bool = True):
+        self.DLL.AITemplateModelContainerFoldConstants(
+            self.handle, ctypes.c_void_p(stream_ptr), ctypes.c_bool(sync)
+        )
+
+    def _get_constant_names_impl(self, constant_folding_only: bool) -> List[str]:
+        num_constants = ctypes.c_size_t()
+        constant_folding_inputs_only = ctypes.c_bool(constant_folding_only)
+        self.DLL.AITemplateModelContainerGetNumConstants(
+            self.handle, constant_folding_inputs_only, ctypes.byref(num_constants)
+        )
+        names = (ctypes.c_char_p * num_constants.value)()
+        self.DLL.AITemplateModelContainerGetConstantNames(
+            self.handle, constant_folding_inputs_only, names
+        )
+        return [name.decode("utf-8") for name in names]
+
+    def get_constant_names(self) -> List[str]:
+        return self._get_constant_names_impl(False)
+
+    def get_constant_folding_input_names(self) -> List[str]:
+        return self._get_constant_names_impl(True)
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 86f3cd038..0a4173d7f 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -469,9 +469,38 @@ void ModelContainer::FoldConstantsImpl(StreamType stream) {
   constant_folded_once_ = true;
 }
 
-void ModelContainer::FoldConstants(StreamType stream) {
+void ModelContainer::FoldConstants(StreamType stream, bool sync) {
   std::lock_guard constant_folding_lk(constants_sync_mutex_);
   FoldConstantsImpl(stream);
+  if (sync) {
+    DEVICE_CHECK(StreamSynchronize(stream));
+  }
+}
+
+size_t ModelContainer::GetNumConstants() const {
+  return unbound_constant_name_to_idx_.size();
+}
+
+size_t ModelContainer::GetNumConstantFoldingInputs() const {
+  return constant_folding_inputs_.size();
+}
+
+void ModelContainer::WriteAllConstantNamesTo(
+    const char** constant_names_out,
+    bool constant_folding_inputs_only) const {
+  size_t num_to_write = constant_folding_inputs_only
+      ? GetNumConstants()
+      : GetNumConstantFoldingInputs();
+  if (constant_names_out == nullptr && num_to_write != 0) {
+    throw std::runtime_error("constant_names_out cannot be nullptr.");
+  }
+  size_t idx = 0;
+  for (auto& [name, _] : unbound_constant_name_to_idx_) {
+    if (!constant_folding_inputs_only ||
+        constant_folding_inputs_.find(name) != constant_folding_inputs_.end()) {
+      constant_names_out[idx++] = name.c_str();
+    }
+  }
 }
 
 void ModelContainer::PrepareForRun(
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index e24634629..1ec93d645 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -112,6 +112,36 @@ AITemplateError AITemplateModelContainerSetConstant(
   CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SetConstant(name, *tensor); })
 }
 
+AITemplateError AITemplateModelContainerGetNumConstants(
+    AITemplateModelHandle handle,
+    bool constant_folding_inputs_only,
+    size_t* num_constants_out) {
+  RETURN_ERROR_IF_NULL(handle)
+  RETURN_ERROR_IF_NULL(num_constants_out)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    if (constant_folding_inputs_only) {
+      *num_constants_out = m->GetNumConstantFoldingInputs();
+    } else {
+      *num_constants_out = m->GetNumConstants();
+    }
+  })
+}
+
+AITemplateError AITemplateModelContainerGetConstantNames(
+    AITemplateModelHandle handle,
+    bool constant_folding_inputs_only,
+    const char** constant_names_out) {
+  RETURN_ERROR_IF_NULL(handle)
+  // WriteAllConstantNamesTo() will handle nullptr checks on constant_names_out.
+  // Passing nullptr is allowed if there are 0 constants!
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    m->WriteAllConstantNamesTo(
+        constant_names_out, constant_folding_inputs_only);
+  })
+}
+
 AITemplateError AITemplateModelContainerRun(
     AITemplateModelHandle handle,
     const AITData* inputs,
@@ -302,6 +332,16 @@ AITemplateError AITemplateModelContainerGetNumRuntimes(
   CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_runtimes_out = m->GetNumRuntimes(); })
 }
 
+AITemplateError AITemplateModelContainerFoldConstants(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    bool sync) {
+  RETURN_ERROR_IF_NULL(handle)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->FoldConstants(stream, sync); })
+}
+
 AITemplateError AITemplateAllocatorCreate(
     AITemplateAllocator** allocator_out,
     AITemplateAllocatorType allocator_type) {
diff --git a/static/include/model_container.h b/static/include/model_container.h
index bc07c049c..e3f1f1c4f 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -186,7 +186,18 @@ class ModelContainer : ModelContainerBase {
     return models_.size();
   }
 
-  void FoldConstants(StreamType stream);
+  void FoldConstants(StreamType stream, bool sync);
+
+  size_t GetNumConstants() const;
+  size_t GetNumConstantFoldingInputs() const;
+
+  // Write all constant names to the array pointed to by names_out.
+  // This function assumes that names_out has enough space to hold
+  // at least GetNumConstants() pointers. The strings written
+  // are guaranteed to live as long as their owning ModelContainer.
+  void WriteAllConstantNamesTo(
+      const char** names_out,
+      bool constant_folding_inputs_only) const;
 
  private:
   void WaitForAllModels();
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 0633c01c3..3240379fc 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -149,6 +149,16 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetConstant(
     const char* name,
     const AITData* tensor);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerGetNumConstants(
+    AITemplateModelHandle handle,
+    bool constant_folding_inputs_only,
+    size_t* num_constants_out);
+
+AIT_EXPORT AITemplateError AITemplateModelContainerGetConstantNames(
+    AITemplateModelHandle handle,
+    bool constant_folding_inputs_only,
+    const char** constant_names_out);
+
 AIT_EXPORT AITemplateError AITemplateModelContainerRun(
     AITemplateModelHandle handle,
     const AITData* inputs,
@@ -241,6 +251,11 @@ AIT_EXPORT AITemplateError AITemplateModelContainerGetNumRuntimes(
     AITemplateModelHandle handle,
     size_t* num_runtimes_out);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerFoldConstants(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    bool sync);
+
 AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
     AITemplateAllocator** allocator_out,
     AITemplateAllocatorType allocator_type);
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index 270118683..55933e34f 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -1469,6 +1469,49 @@ def test_custom_allocator(self):
                 module.run_with_tensors([x_pt], [z_ait])
                 self.assertTrue(z_ait.equal(z_pt))
 
+    def test_get_constant_names(self):
+        target = detect_target()
+
+        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
+        output = ops.elementwise(FuncEnum.MUL)(x, constant_2)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        module = compile_model(output, target, "./tmp", "test_get_constant_names")
+        names = module.get_constant_names()
+        self.assertEqual(len(names), 2)
+        self.assertIn("constant_1", names)
+        self.assertIn("constant_2", names)
+
+    def test_get_constant_folding_input_names(self):
+        target = detect_target()
+
+        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_3")
+        # constant 1 is not folded.
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
+        # constants 2 and 3 are
+        y = ops.elementwise(FuncEnum.MUL)(constant_2, constant_2)
+
+        output = ops.elementwise(FuncEnum.MUL)(x, y)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        module = compile_model(
+            output, target, "./tmp", "test_get_constant_folding_input_names"
+        )
+        names = module.get_constant_folding_input_names()
+        self.assertEqual(names, [])
+        # TODO: uncomment when the new constant folding pass is enabled.
+        # self.assertEqual(len(names), 2)
+        # self.assertIn("constant_2", names)
+        # self.assertIn("constant_3", names)
+
 
 if __name__ == "__main__":
     unittest.main()

From 7998f67c80f090cba64b30b2e93c2ed0e08da32d Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 071/638] Add an API for bulk-setting constants (#1185)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1185

If you don't create a new `ModelContainer`, stopping all inferences and acquiring the lock that blocks new inferences is unavoidable when updating constants.

Add a more efficient API that lets you set many constants at once to avoid acquiring the lock many times.

Reviewed By: chenyang78

Differential Revision: D42035260

fbshipit-source-id: 81ae9361f8df37a7fb01887813bdd262260b0e3c
---
 python/aitemplate/compiler/model.py      | 29 ++++++++++++
 static/csrc/model_container.cpp          | 56 ++++++++++++++++++------
 static/csrc/model_interface.cpp          | 11 +++++
 static/include/model_container.h         |  7 ++-
 static/include/model_interface.h         |  6 +++
 tests/unittest/backend/test_model_api.py | 24 ++++++++++
 6 files changed, 118 insertions(+), 15 deletions(-)

diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 772f72def..a24c3dd85 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -741,6 +741,35 @@ def set_constant(self, name: str, tensor: AITData):
             self.handle, c_name, ctypes.byref(c_tensor)
         )
 
+    def set_many_constants(self, tensors: Dict[str, AITData]):
+        """
+        Bulk set many constants at once. More efficient than set_constant()
+        since it only has to acquire the lock once.
+        """
+        c_names = (ctypes.c_char_p * len(tensors))()
+        c_tensors = (_CFormatAITData * len(tensors))()
+        ait_tensors = {
+            name.encode("utf-8"): self._convert_single_param_to_c_format(tensor)
+            for name, tensor in tensors.items()
+        }
+        for i, (name_bytes, tensor) in enumerate(ait_tensors.items()):
+            c_names[i] = ctypes.c_char_p(name_bytes)
+            c_tensors[i] = tensor
+
+        num_tensors = ctypes.c_size_t(len(tensors))
+        self.DLL.AITemplateModelContainerSetManyConstants(
+            self.handle, c_names, c_tensors, num_tensors
+        )
+
+    def set_many_constants_with_tensors(self, tensors: Dict[str, AITData]):
+        ait_tensors = {}
+        for name, tensor in tensors.items():
+            if not tensor.is_contiguous() or not tensor.is_cuda:
+                raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
+            self.torch_constant_tensors[name] = tensor
+            ait_tensors[name] = torch_to_ait_data(tensor)
+        self.set_many_constants(ait_tensors)
+
     def set_constant_with_tensor(self, name: str, tensor: TorchTensor):
         """
         Set a constant with a PyTorch tensor.
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 0a4173d7f..6c95a17c9 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -340,19 +340,7 @@ float ModelContainer::Benchmark(
   return max_time / total_num_iters;
 }
 
-void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
-  // Prevent constant folding/inferences from starting while we update
-  // constants.
-  std::lock_guard lk(constants_sync_mutex_);
-  // Wait for any ongoing inferences + foldings to finish.
-  WaitForAllModels();
-  try {
-    constant_folder_->WaitForCompletion();
-  } catch (...) {
-    LOG(WARNING)
-        << "Constant folder threw exception while waiting for completion, ignoring.";
-  }
-
+void ModelContainer::SetConstantImpl(const char* name, const AITData& tensor) {
   auto it = unbound_constant_name_to_idx_.find(name);
   if (it == unbound_constant_name_to_idx_.end()) {
     return;
@@ -383,6 +371,37 @@ void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
   }
 }
 
+void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
+  std::lock_guard lk(constants_sync_mutex_);
+  WaitForAllModels(/*include_constant_folder=*/true);
+  SetConstantImpl(name, tensor);
+}
+
+void ModelContainer::SetManyConstants(
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors) {
+  if (num_tensors == 0) {
+    return;
+  }
+
+  if (tensors == nullptr) {
+    throw std::runtime_error("Tensor array cannot be null");
+  }
+
+  std::lock_guard lk(constants_sync_mutex_);
+  WaitForAllModels(/*include_constant_folder=*/true);
+
+  for (size_t i = 0; i < num_tensors; ++i) {
+    const char* name = names[i];
+    if (name == nullptr) {
+      throw std::runtime_error("Constant name cannot be null");
+    }
+    const auto& tensor = tensors[i];
+    SetConstantImpl(names[i], tensor);
+  }
+}
+
 size_t ModelContainer::NumInputs() const {
   return num_inputs_;
 }
@@ -435,7 +454,7 @@ size_t ModelContainer::MaxOutputStorageBytes(size_t output_idx) const {
   return max_param_storage_bytes_[idx];
 }
 
-void ModelContainer::WaitForAllModels() {
+void ModelContainer::WaitForAllModels(bool include_constant_folder) {
   // Wait for all on-going inferences to finish.
   for (auto* model : pending_models_) {
     try {
@@ -453,6 +472,15 @@ void ModelContainer::WaitForAllModels() {
     }
     available_models_.push_back(model);
   }
+
+  if (include_constant_folder) {
+    try {
+      constant_folder_->WaitForCompletion();
+    } catch (...) {
+      LOG(WARNING)
+          << "Constant folder threw exception while waiting for completion, ignoring.";
+    }
+  }
 }
 
 void ModelContainer::FoldConstantsImpl(StreamType stream) {
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index 1ec93d645..dbf03378c 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -112,6 +112,17 @@ AITemplateError AITemplateModelContainerSetConstant(
   CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SetConstant(name, *tensor); })
 }
 
+AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
+    AITemplateModelHandle handle,
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors) {
+  RETURN_ERROR_IF_NULL(handle)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { m->SetManyConstants(names, tensors, num_tensors); })
+}
+
 AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
     bool constant_folding_inputs_only,
diff --git a/static/include/model_container.h b/static/include/model_container.h
index e3f1f1c4f..d9247d8d9 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -167,6 +167,10 @@ class ModelContainer : ModelContainerBase {
       int64_t** output_shapes_out);
 
   void SetConstant(const char* name, const AITData& tensor);
+  void SetManyConstants(
+      const char** names,
+      const AITData* tensors,
+      size_t num_tensors);
 
   size_t NumInputs() const;
   size_t NumOutputs() const;
@@ -200,8 +204,9 @@ class ModelContainer : ModelContainerBase {
       bool constant_folding_inputs_only) const;
 
  private:
-  void WaitForAllModels();
+  void WaitForAllModels(bool include_constant_folder = false);
   void FoldConstantsImpl(StreamType stream);
+  void SetConstantImpl(const char* name, const AITData& tensor);
 
   void PrepareForRun(
       Model* model,
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 3240379fc..c64133753 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -149,6 +149,12 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetConstant(
     const char* name,
     const AITData* tensor);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
+    AITemplateModelHandle handle,
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors);
+
 AIT_EXPORT AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
     bool constant_folding_inputs_only,
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index 55933e34f..83b8161f8 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -1512,6 +1512,30 @@ def test_get_constant_folding_input_names(self):
         # self.assertIn("constant_2", names)
         # self.assertIn("constant_3", names)
 
+    def test_set_many_constants(self):
+        target = detect_target()
+
+        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
+        output = ops.elementwise(FuncEnum.MUL)(x, constant_2)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        module = compile_model(output, target, "./tmp", "test_get_constant_names")
+
+        input_0_pt = torch.randn((1, 2)).cuda().half()
+        constant_1_pt = torch.randn((1, 2)).cuda().half()
+        constant_2_pt = torch.randn((1, 2)).cuda().half()
+        module.set_many_constants_with_tensors(
+            {"constant_1": constant_1_pt, "constant_2": constant_2_pt}
+        )
+        output_pt = input_0_pt * constant_1_pt * constant_2_pt
+        output_ait = torch.empty_like(input_0_pt)
+        module.run_with_tensors([input_0_pt], [output_ait])
+        self.assertTrue(torch.equal(output_pt, output_ait))
+
 
 if __name__ == "__main__":
     unittest.main()

From a8e7250b6fbc2330f5ed6990ce09ba873cebaac9 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 072/638] Enable constant folding v2, add tests, delete v1
 (#1187)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1187

Add a few tests for new constant folding & enable it.

Reviewed By: ipiszy, chenyang78

Differential Revision: D41875062

fbshipit-source-id: 7ad2c1f2624a3933b53037a832ed6880a5907fce
---
 python/aitemplate/compiler/compiler.py        |  21 +-
 python/aitemplate/compiler/model.py           |   4 +-
 .../aitemplate/compiler/transform/__init__.py |   2 +-
 .../compiler/transform/constant_folding.py    | 207 ++----------------
 static/include/model_interface.h              |   4 +-
 tests/unittest/backend/test_model_api.py      |  94 ++++----
 .../compiler/test_constant_folding.py         | 193 +++++++++++++---
 7 files changed, 237 insertions(+), 288 deletions(-)

diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 20e29e100..f9ea0f8ef 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -205,15 +205,11 @@ def compile_model(
             start_t = datetime.now()
             constant_folding_workdir = os.path.join(workdir, test_name)
             os.makedirs(constant_folding_workdir, exist_ok=True)
-            graph = compiler.transform.constant_folding(graph, constant_folding_workdir)
-            # TODO: enable and delete the call above.
-            # They can't be enabled at the same time because contant folding mutates tensors
-            # in the graph.
-            # (
-            #    graph,
-            #    constant_folding_file_pairs,
-            #    constant_folding_inputs,
-            # ) = compiler.transform.constant_folding_v2(graph, workdir, test_name)
+            (
+                graph,
+                constant_folding_file_pairs,
+                constant_folding_inputs,
+            ) = compiler.transform.constant_folding(graph, workdir, test_name)
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "constant_folding"
             )
@@ -228,8 +224,7 @@ def compile_model(
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "memory_planning")
 
             file_pairs = backend.codegen.gen_function_src(graph, workdir, test_name)
-            # TODO: uncomment when V2 constant folding is enabled.
-            # file_pairs.extend(constant_folding_file_pairs)
+            file_pairs.extend(constant_folding_file_pairs)
 
             # It's possible that the original output tensor has been replaced with a new tensor.
             # Preserve original output tensors' orders but use the new tensors.
@@ -252,8 +247,8 @@ def compile_model(
                 workdir,
                 output_tensors,
                 test_name,
-                # additional_unbound_constants=constant_folding_inputs,
-                debug_settings,
+                additional_unbound_constants=constant_folding_inputs,
+                debug_settings=debug_settings,
             )
             file_pairs.extend(main_pairs)
 
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index a24c3dd85..5024039f5 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -554,7 +554,7 @@ def _run_with_outputs_on_host(
         the stream will always be synchronized after copying the outputs to the host.
 
         Warning: don't use this! It's not optimal with respect to performance.
-        It's here for use by internal constant folding passes.
+        It's here for use if you need it for debugging purpose.
         """
         return self._run_impl(
             inputs, outputs, stream_ptr, graph_mode=graph_mode, outputs_on_host=True
@@ -571,7 +571,7 @@ def _run_with_tensors_outputs_on_host(
         Like RunWithTensors(), but takes host memory tensors
 
         Warning: don't use this! It's not optimal with respect to performance.
-        It's here for use by internal constant folding passes.
+        It's here for use if you need it for debugging.
         """
         _check_tensors_contiguous_and_on_gpu(
             inputs,
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index 9e61f5889..ca9bf77e4 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -14,7 +14,7 @@
 #
 # flake8: noqa
 from .bind_constants import bind_constants
-from .constant_folding import constant_folding, constant_folding_v2
+from .constant_folding import constant_folding
 from .fuse_conv_elementwise import fuse_conv_elementwise
 from .fuse_group_ops import (
     fuse_group_gemm_ops,
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index 0e1752a9a..3243d76f0 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -16,13 +16,9 @@
 import os
 from typing import Dict, List, Tuple
 
-import numpy as np
-
 from aitemplate import backend, compiler
 
-from aitemplate.compiler.base import _NumpyConstantTensorData, IntVarTensor, Tensor
-from aitemplate.compiler.dtype import normalize_dtype
-from aitemplate.compiler.model import AITData, Model
+from aitemplate.compiler.base import IntVarTensor, Tensor
 from aitemplate.compiler.transform.memory_planning import Workspace
 from aitemplate.compiler.transform.transform_utils import replace_tensor
 from aitemplate.utils import graph_utils
@@ -44,71 +40,6 @@ def _create_dummy_constant_folder():
     return model_container_generator.generate_model()
 
 
-def _output_from_tensor(tensor: Tensor) -> Tensor:
-    new_tensor = Tensor(
-        shape=tensor._attrs["shape"],
-        name=tensor._attrs["name"],
-        src_ops=tensor._attrs["src_ops"].copy(),
-        dst_ops=tensor._attrs["dst_ops"].copy(),
-        dtype=tensor._attrs["dtype"],
-        is_output=True,
-        is_view_of=tensor._attrs["is_view_of"],
-    )
-    if new_tensor._attrs["is_view_of"] is not None:
-        # If this tensor is a view, we need to set external_tensor
-        # so codegen handles the "output is view of output" case
-        # correctly.
-        new_tensor._attrs["external_tensor"] = new_tensor._attrs["is_view_of"]
-    return new_tensor
-
-
-def _extract_foldable_subgraph(
-    sorted_graph: List[Tensor],
-) -> List[Tensor]:
-    """
-    Extract a list of foldable nodes. A node is foldable if:
-    * It has bound data, or
-    * All of its inputs are foldable.
-
-    The subgraph returned is just a list of Tensors. All foldable
-    tensors that do not have bound data are marked as outputs in
-    the subgraph. The original graph is not modified.
-
-    All tensors that do not have bound data are marked as outputs.
-    This is because we want to execute the subgraph and get all
-    of the new constants. Only the ones that are actually needed are put
-    back into the final graph.
-    """
-    foldable_node_names = set()
-    subgraph = []
-
-    for tensor in sorted_graph:
-        if tensor._attrs["is_input"]:
-            continue
-
-        name = tensor._attrs["name"]
-        if tensor._attrs["data"] is not None:
-            foldable_node_names.add(name)
-            subgraph.append(tensor)
-            continue
-        elif tensor._attrs["is_param"]:
-            # Params that do not have bound data cannot be folded.
-            continue
-        elif isinstance(tensor, IntVarTensor):
-            continue
-        foldable = all(
-            inp._attrs["name"] in foldable_node_names
-            for op in tensor._attrs["src_ops"]
-            for inp in op._attrs["inputs"]
-        )
-
-        if foldable:
-            foldable_node_names.add(name)
-            subgraph.append(_output_from_tensor(tensor))
-
-    return subgraph
-
-
 def _make_op_names_unique(graph: List[Tensor]) -> Dict[str, str]:
     """
     To avoid ODR issues, we rename all ops in the constant folding subgraph.
@@ -132,109 +63,6 @@ def _rename_ops(graph: List[Tensor], new_name_to_old: Dict[str, str]) -> None:
                 op._attrs["name"] = new_name_to_old[op._attrs["name"]]
 
 
-def _constant_folding_impl(
-    sorted_graph: List[Tensor], workdir: str
-) -> Dict[str, Tensor]:
-    constant_folding_workdir = os.path.join(workdir, "constant_folding")
-    os.makedirs(constant_folding_workdir, exist_ok=True)
-    # Just write a dummy constant folder. It's unused in this path, and we're removing this function soon.
-    with open(
-        os.path.join(constant_folding_workdir, "constant_folder-generated.h"), "w"
-    ) as f:
-        f.write(_create_dummy_constant_folder())
-
-    with open(os.path.join(workdir, "constant_folder-generated.h"), "w") as f:
-        f.write(_create_dummy_constant_folder())
-
-    # Collect the set of output names before we do any transformations. We'll need this
-    # if we end up turning outputs into constants. _extract_foldable_subgraph marks *all*
-    # folded constants as outputs, so we can't just query attrs["is_output"] (see
-    # extract_foldable_subgraph for more info on why that happens)
-    original_output_tensors = {
-        tensor._attrs["name"] for tensor in sorted_graph if tensor._attrs["is_output"]
-    }
-
-    subgraph = _extract_foldable_subgraph(sorted_graph)
-    output_tensors = [tensor for tensor in subgraph if tensor._attrs["is_output"]]
-    if not output_tensors:
-        _LOGGER.info("No constants to fold, skipping constant folding.")
-        return {}
-
-    blob, constant_blob, workspace = compiler.transform.memory_planning(subgraph)
-
-    file_pairs = backend.codegen.gen_function_src(subgraph, workdir, "constant_folding")
-    main_pairs = backend.codegen.gen_library_src(
-        subgraph,
-        blob,
-        constant_blob,
-        workspace,
-        workdir,
-        output_tensors,
-        "constant_folding",
-    )
-    file_pairs.extend(main_pairs)
-    compile_engine = backend.builder.Builder()
-    so_name = os.path.join(constant_folding_workdir, "constant_folding.so")
-    compile_engine.make(file_pairs, "constant_folding.so", workdir, "constant_folding")
-    module = Model(so_name, num_runtimes=1)
-
-    outputs = {}
-    new_tensors = {}
-    for tensor in subgraph:
-        if tensor._attrs["data"] is None:
-            name = tensor._attrs["name"]
-            shape = module.get_output_maximum_shape(tensor._attrs["name"])
-            arr = np.empty(shape, dtype=normalize_dtype(tensor._attrs["dtype"]))
-            new_tensor = Tensor(
-                shape=tensor._attrs["shape"],
-                name=name,
-                # copy dst_ops so we can modify the original tensor without affecting this one.
-                dst_ops=tensor._attrs["dst_ops"].copy(),
-                dtype=tensor._attrs["dtype"],
-                is_output=name in original_output_tensors,
-            )
-            new_tensor._bind_data(_NumpyConstantTensorData(arr))
-            new_tensors[name] = new_tensor
-            outputs[name] = AITData(arr.ctypes.data, shape, tensor._attrs["dtype"])
-
-    module._run_with_outputs_on_host({}, outputs)
-    return new_tensors
-
-
-def constant_folding(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
-    """
-    Fold and propagate constants.
-
-    This pass looks for ops that have inputs which can be determined
-    at compile time. It evaluates them, then puts the new constants
-    back into the graph with bound data. The old ops are eliminated.
-
-    This pass actually compiles and runs an AIT runtime. If there are
-    any problems (e.g. due to buggy ops), the constant folding is
-    aborted and the graph is returned unchanged. All generated code
-    is stored in workdir/constant_folding.
-    """
-    try:
-        new_constants = _constant_folding_impl(sorted_graph, workdir)
-    except Exception as e:
-        _LOGGER.warning(
-            f"Constant folding encountered an error: {e}. The graph will not be modified.",
-        )
-        return sorted_graph
-
-    # Replace ops with their folded values.
-    for idx, tensor in enumerate(sorted_graph):
-        name = tensor._attrs["name"]
-        if name in new_constants:
-            new_tensor = new_constants[name]
-            replace_tensor(tensor, new_tensor)
-            sorted_graph[idx] = new_tensor
-
-    # Eliminate constants that are no longer used
-    compiler.transform.remove_unused_ops(sorted_graph)
-    return compiler.transform.transform_utils.sanitize_sorted_graph(sorted_graph)
-
-
 def _non_output_from_tensor(tensor: Tensor) -> Tensor:
     new_tensor = Tensor(
         shape=tensor._attrs["shape"],
@@ -246,10 +74,11 @@ def _non_output_from_tensor(tensor: Tensor) -> Tensor:
     )
     new_tensor._attrs["is_param"] = tensor._attrs["is_param"]
     new_tensor._attrs["data"] = tensor._attrs["data"]
+    new_tensor._attrs["external_tensor"] = tensor._attrs["external_tensor"]
     return new_tensor
 
 
-def _output_from_tensor_v2(tensor: Tensor) -> Tensor:
+def _output_from_tensor(tensor: Tensor) -> Tensor:
     new_tensor = _non_output_from_tensor(tensor)
     new_tensor._attrs["is_output"] = True
     return new_tensor
@@ -291,7 +120,7 @@ def _fix_op_inputs_outputs(
         ]
 
 
-def _extract_foldable_subgraph_v2(
+def _extract_foldable_subgraph(
     sorted_graph: List[Tensor],
 ) -> List[Tensor]:
     """
@@ -347,34 +176,26 @@ def _is_used_by_foldable_op(tensor: Tensor) -> bool:
                 return True
         return False
 
+    # Now figure out which tensors can be marked as outputs.
     filtered_subgraph = []
-    views_of_constants = set()
     name_to_new_tensor = {}
     name_to_old_tensor = {}
     constant_folding_inputs = []
 
     for tensor in subgraph:
-        # Mark views of constants. This helps us avoid extra copying, see below.
-        view = tensor._attrs["is_view_of"]
-        is_view_of_constant = tensor._attrs["is_param"] or (
-            view is not None and view in views_of_constants
-        )
-        if is_view_of_constant:
-            views_of_constants.add(tensor)
-
         name = tensor._attrs["name"]
         new_tensor = None
 
-        if not is_view_of_constant and (
+        if not tensor._attrs["is_param"] and (
             _is_used_by_non_foldable_op(tensor) or tensor._attrs["is_output"]
         ):
             # Tensor is required outside of the subgraph, make it an output.
-            # We only need to do this if it's not a (view of) a constant, else
-            # we'll get wasteful D2D copies.
-            new_tensor = _output_from_tensor_v2(tensor)
+            # Parameters don't need to be marked as outputs in the
+            # subgraph, we already know their values.
+            new_tensor = _output_from_tensor(tensor)
 
         elif _is_used_by_foldable_op(tensor):
-            # No need to append constants that are not used by foldable ops.
+            # No need to append constants that are not used by any foldable ops.
             new_tensor = _non_output_from_tensor(tensor)
             if new_tensor._attrs["is_param"]:
                 constant_folding_inputs.append(new_tensor)
@@ -388,7 +209,7 @@ def _is_used_by_foldable_op(tensor: Tensor) -> bool:
     return filtered_subgraph, name_to_old_tensor, constant_folding_inputs
 
 
-def _constant_folding_impl_v2(
+def _constant_folding_impl(
     sorted_graph: List[Tensor],
     workdir: str,
     model_name: str,
@@ -407,7 +228,7 @@ def _constant_folding_impl_v2(
         subgraph,
         name_to_old_tensor,
         constant_folding_inputs,
-    ) = _extract_foldable_subgraph_v2(sorted_graph)
+    ) = _extract_foldable_subgraph(sorted_graph)
     output_tensors = [tensor for tensor in subgraph if tensor._attrs["is_output"]]
     if not output_tensors:
         _LOGGER.info("No constants to fold, skipping constant folding.")
@@ -455,7 +276,7 @@ def _constant_folding_impl_v2(
     return new_tensors, file_pairs, constant_folding_inputs
 
 
-def constant_folding_v2(
+def constant_folding(
     sorted_graph: List[Tensor],
     workdir: str,
     model_name: str,
@@ -472,7 +293,7 @@ def constant_folding_v2(
     aborted and the graph is returned unchanged. All generated code
     is stored in workdir/constant_folding.
     """
-    new_constants, file_pairs, constant_folding_inputs = _constant_folding_impl_v2(
+    new_constants, file_pairs, constant_folding_inputs = _constant_folding_impl(
         sorted_graph, workdir, model_name
     )
 
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index c64133753..9485c60ad 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -178,8 +178,8 @@ AIT_EXPORT AITemplateError AITemplateModelContainerRun(
 
 // Like AITemplateModelContainerRun, but expects outputs to be allocated on the
 // host. Does an extra sync/copy at the end to copy them over. Warning: don't
-// use this! It's not optimal with respect to performance. It's here for use by
-// internal constant folding passes.
+// use this! It's not optimal with respect to performance. It's here for use if
+// you need it for debugging.
 AIT_EXPORT AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
     AITemplateModelHandle handle,
     const AITData* inputs,
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index 83b8161f8..146913378 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -890,7 +890,7 @@ def test_use_internal_constant_tensors_host(self):
     def test_use_internal_constant_tensors_gpu(self):
         self._test_use_constant_tensor(
             lambda tensor: _TorchConstantTensorData(tensor),
-            "test_use_internal_constant_tensors_host",
+            "test_use_internal_constant_tensors_gpu",
         )
 
     def test_use_internal_constant_tensors_huge(self):
@@ -1139,46 +1139,6 @@ def test_error_duplicate_output_in_output_tensors_list(self):
             "test_error_duplicate_output_in_output_tensors_list",
         )
 
-    def test_run_with_outputs_on_host(self):
-        (
-            module,
-            (in0_pt, in1_pt),
-            (out_pt, out_storage),
-        ) = self._get_simple_graph_and_output("test_run_with_outputs_on_host")
-        out_host = out_storage.cpu()
-        out_pt_host = out_pt.cpu()
-        module._run_with_outputs_on_host(
-            [
-                torch_to_ait_data(in0_pt),
-                torch_to_ait_data(in1_pt),
-            ],
-            [torch_to_ait_data(out_host)],
-        )
-
-        self.assertTrue(torch.equal(out_pt_host, out_host))
-        out_host.zero_()
-
-        module._run_with_tensors_outputs_on_host(
-            {"input_0": in0_pt, "input_1": in1_pt}, {"output": out_host}
-        )
-        self.assertTrue(torch.equal(out_pt_host, out_host))
-
-    def test_run_with_outputs_on_host_fails_with_outputs_on_device(self):
-        (
-            module,
-            (in0_pt, in1_pt),
-            (_, out_storage),
-        ) = self._get_simple_graph_and_output(
-            "test_run_with_outputs_on_host_fails_with_outputs_on_device"
-        )
-
-        self.assertRaises(
-            ValueError,
-            module._run_with_tensors_outputs_on_host,
-            {"input_0": in0_pt, "input_1": in1_pt},
-            {"output": out_storage},
-        )
-
     def test_cannot_use_closed_model(self):
         (
             module,
@@ -1239,10 +1199,12 @@ def test_run_fails_with_unbound_constants(self):
         self.assertTrue(torch.allclose(output_data, expected))
 
     def test_set_constant_fails_wrong_dtype(self):
-        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
-        output = ops.elementwise(FuncEnum.MUL)(constant_1, constant_1)
-        output._attrs["name"] = "output"
-        output._attrs["is_output"] = True
+        def _create_graph():
+            constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+            output = ops.elementwise(FuncEnum.MUL)(constant_1, constant_1)
+            output._attrs["name"] = "output"
+            output._attrs["is_output"] = True
+            return output
 
         for wrong_tensor in (
             torch.zeros([1, 2]).long().cuda(),
@@ -1251,7 +1213,7 @@ def test_set_constant_fails_wrong_dtype(self):
         ):
             target = detect_target()
             with compile_model(
-                output, target, "./tmp", "test_set_constant_fails_wrong_dtype"
+                _create_graph(), target, "./tmp", "test_set_constant_fails_wrong_dtype"
             ) as module:
                 self.assertRaises(
                     RuntimeError,
@@ -1261,10 +1223,12 @@ def test_set_constant_fails_wrong_dtype(self):
                 )
 
     def test_set_constant_fails_wrong_shape(self):
-        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
-        output = ops.elementwise(FuncEnum.MUL)(constant_1, constant_1)
-        output._attrs["name"] = "output"
-        output._attrs["is_output"] = True
+        def _create_graph():
+            constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+            output = ops.elementwise(FuncEnum.MUL)(constant_1, constant_1)
+            output._attrs["name"] = "output"
+            output._attrs["is_output"] = True
+            return output
 
         for wrong_shape in (
             [2, 2],
@@ -1273,6 +1237,7 @@ def test_set_constant_fails_wrong_shape(self):
         ):
             wrong_tensor = torch.randn(wrong_shape).half().cuda()
             target = detect_target()
+            output = _create_graph()
             with compile_model(
                 output, target, "./tmp", "test_set_constant_fails_wrong_shape"
             ) as module:
@@ -1536,6 +1501,35 @@ def test_set_many_constants(self):
         module.run_with_tensors([input_0_pt], [output_ait])
         self.assertTrue(torch.equal(output_pt, output_ait))
 
+    def test_async_fold_constants(self):
+        target = detect_target()
+
+        input_0 = Tensor(
+            shape=[10000, 2000], dtype="float16", name="input_0", is_input=True
+        )
+        constant_1 = Tensor(shape=[10000, 2000], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[10000, 2000], dtype="float16", name="constant_2")
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
+        output = ops.elementwise(FuncEnum.MUL)(x, constant_2)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        module = compile_model(output, target, "./tmp", "test_get_constant_names")
+
+        input_0_pt = torch.randn((10000, 2000)).cuda().half()
+        constant_1_pt = torch.randn((10000, 2000)).cuda().half()
+        constant_2_pt = torch.randn((10000, 2000)).cuda().half()
+        output_pt = input_0_pt * constant_1_pt * constant_2_pt
+        output_ait = torch.empty_like(input_0_pt)
+
+        module.set_many_constants_with_tensors(
+            {"constant_1": constant_1_pt, "constant_2": constant_2_pt}
+        )
+        module.fold_constants(sync=False)
+        module.run_with_tensors([input_0_pt], [output_ait])
+
+        self.assertTrue(torch.equal(output_pt, output_ait))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index ae5729bf4..0ca1d804c 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -17,11 +17,7 @@
 import torch
 from aitemplate.compiler import compile_model, Model, ops
 
-from aitemplate.compiler.base import (
-    _create_host_zero_tensor,
-    _TorchConstantTensorData,
-    Tensor,
-)
+from aitemplate.compiler.base import _create_host_zero_tensor, Tensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.compiler.public import IntImm
 from aitemplate.compiler.transform.transform_utils import check_graph_validity
@@ -43,13 +39,7 @@ def _verify_graph(
         graph_size = len(mod.debug_sorted_graph)
         self.assertEqual(graph_size, expected_num_nodes)
 
-        num_constants = sum(
-            1 for tensor in mod.debug_sorted_graph if tensor._attrs["data"] is not None
-        )
-        # Make sure the extra constants are deleted.
-        self.assertEqual(num_constants, expected_num_constants)
-
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand([("float16")])
     def test_simple_constant_fold(self, dtype):
         target = detect_target()
         if dtype == "float" and target.name == "rocm":
@@ -62,9 +52,7 @@ def test_simple_constant_fold(self, dtype):
         y_pt = (inp2_pt + x_pt).flatten()
 
         inp0_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp0")
-        inp0_ait._bind_data(_TorchConstantTensorData(inp0_pt))
         inp1_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp1")
-        inp1_ait._bind_data(_TorchConstantTensorData(inp1_pt))
         inp2_ait = Tensor(shape=[3, 3], dtype=dtype, name="inp2", is_input=True)
 
         x_ait = ops.elementwise(FuncEnum.MUL)(inp0_ait, inp1_ait)
@@ -79,6 +67,9 @@ def test_simple_constant_fold(self, dtype):
         mod = compile_model(
             y_ait, target, "./tmp", f"test_constant_folding_simple_{dtype}"
         )
+        mod.set_constant_with_tensor("inp0", inp0_pt)
+        mod.set_constant_with_tensor("inp1", inp1_pt)
+        mod.fold_constants()
 
         y = get_torch_empty_tensor((9,), dtype)
         mod.run_with_tensors({"inp2": inp2_pt}, {"y": y})
@@ -97,15 +88,15 @@ def test_pad_constant_weight(self, dtype):
 
         M, N, K = 16, 32, 3
         w_pt = get_random_torch_tensor((K, N), dtype)
-        weight_data = _TorchConstantTensorData(w_pt)
         input_0 = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[K, N], dtype=dtype, name="weight")
-        W._bind_data(weight_data)
         Y = ops.gemm_rrr()(input_0, W)
         Y._attrs["name"] = "y"
         Y._attrs["is_output"] = True
 
         mod = compile_model(Y, target, "./tmp", f"test_pad_constant_weight_{dtype}")
+        mod.set_constant_with_tensor("weight", w_pt)
+        mod.fold_constants()
 
         input_0_pt = get_random_torch_tensor((M, K), dtype)
         y_pt = torch.matmul(input_0_pt, w_pt)
@@ -143,36 +134,33 @@ def test_fold_long_chain(self, dtype):
             self.skipTest("gemm with float tensors requires CUDA sm >= 80")
         M, N, K = 16, 32, 3
         w1_pt = get_random_torch_tensor((K, N), dtype)
-        w1_data = _TorchConstantTensorData(w1_pt)
 
         w2_pt = get_random_torch_tensor((K, N), dtype)
-        w2_data = _TorchConstantTensorData(w2_pt)
 
         w3_pt = w1_pt * w2_pt
         x_pt = get_random_torch_tensor((M, K), dtype)
-        x_pt_data = _TorchConstantTensorData(x_pt)
 
         y_pt = torch.matmul(x_pt, w3_pt)
         w4_pt = get_random_torch_tensor((M, N), dtype)
-        w4_data = _TorchConstantTensorData(w4_pt)
         z_pt = y_pt * w4_pt
 
         w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1")
-        w1_ait._bind_data(w1_data)
         w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
-        w2_ait._bind_data(w2_data)
         w3_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
         x_ait = Tensor(shape=[M, K], dtype=dtype, name="x")
-        x_ait._bind_data(x_pt_data)
         y_ait = ops.gemm_rrr()(x_ait, w3_ait)
         w4_ait = Tensor(shape=[M, N], dtype=dtype, name="w4")
-        w4_ait._bind_data(w4_data)
         z_ait = ops.elementwise(FuncEnum.MUL)(y_ait, w4_ait)
         z_ait._attrs["name"] = "z"
         z_ait._attrs["is_output"] = True
 
         target = detect_target()
         mod = compile_model(z_ait, target, "./tmp", f"test_fold_long_chain_{dtype}")
+        mod.set_constant_with_tensor("w1", w1_pt)
+        mod.set_constant_with_tensor("w2", w2_pt)
+        mod.set_constant_with_tensor("x", x_pt)
+        mod.set_constant_with_tensor("w4", w4_pt)
+        mod.fold_constants()
 
         z = get_torch_empty_tensor((M, N), dtype)
         mod.run_with_tensors({}, {"z": z})
@@ -193,9 +181,7 @@ def test_constant_folding_through_views(self, dtype):
         y_pt = (inp0_pt * inp1_pt).flatten()
 
         inp0_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp0")
-        inp0_ait._bind_data(_TorchConstantTensorData(inp0_pt))
         inp1_ait = Tensor(shape=(3, 3), dtype=dtype, name="inp1")
-        inp1_ait._bind_data(_TorchConstantTensorData(inp1_pt))
         inp0_view = ops.flatten()(inp0_ait)
         inp1_view = ops.flatten()(inp1_ait)
         y_ait = ops.elementwise(FuncEnum.MUL)(inp0_view, inp1_view)
@@ -205,6 +191,9 @@ def test_constant_folding_through_views(self, dtype):
         mod = compile_model(
             y_ait, target, "./tmp", f"test_constant_folding_through_views_{dtype}"
         )
+        mod.set_constant_with_tensor("inp0", inp0_pt)
+        mod.set_constant_with_tensor("inp1", inp1_pt)
+        mod.fold_constants()
 
         y = get_torch_empty_tensor((9,), dtype)
         mod.run_with_tensors({}, {"y": y})
@@ -247,8 +236,12 @@ def test_late_binding(self, dtype):
             target,
             "./tmp",
             f"test_late_binding_{dtype}",
-            constants={"w1": w1_pt, "w2": w2_pt, "x": x_pt, "w4": w4_pt},
         )
+        mod.set_constant_with_tensor("w1", w1_pt)
+        mod.set_constant_with_tensor("w2", w2_pt)
+        mod.set_constant_with_tensor("x", x_pt)
+        mod.set_constant_with_tensor("w4", w4_pt)
+        mod.fold_constants()
 
         z = get_torch_empty_tensor((M, N), dtype)
         mod.run_with_tensors({}, {"z": z})
@@ -359,6 +352,152 @@ def test_late_binding_fails_wrong_dtype(self):
                     constants={"w1": w1_pt},
                 )
 
+    def test_constant_folding_manual_call(self):
+        dtype = "float16"
+
+        N, K = IntImm(16), IntImm(32)
+        w1_ait = Tensor(shape=[K, N], dtype=dtype, name="w1")
+        w2_ait = Tensor(shape=[K, N], dtype=dtype, name="w2")
+        y_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, w2_ait)
+        y_ait._attrs["name"] = "y"
+        y_ait._attrs["is_output"] = True
+
+        shape = (K.value(), N.value())
+        w1_pt = get_random_torch_tensor(shape, dtype)
+        w2_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = w1_pt * w2_pt
+        y = torch.empty_like(y_pt)
+
+        with compile_model(
+            y_ait, detect_target(), "./tmp", "test_constant_folding_manual_call"
+        ) as mod:
+            # Unset constants
+            self.assertRaises(RuntimeError, mod.run_with_tensors, {}, [y])
+            self.assertRaises(RuntimeError, mod.fold_constants)
+
+            mod.set_many_constants_with_tensors({"w1": w1_pt, "w2": w2_pt})
+            mod.fold_constants()
+            mod.run_with_tensors({}, [y])
+            self.assertTrue(torch.equal(y_pt, y))
+
+    def test_constant_folding_mixed_usage(self):
+        """
+        Test a mix of all the ways to use constants:
+        - Unbound constant that is not folded
+        - Unbound constant folding input
+        - Bound constant folding input
+        - Bound constant that is not folded
+        """
+        dtype = "float16"
+
+        N, K = IntImm(13), IntImm(33)
+        input_0 = Tensor(shape=[N, N], dtype=dtype, name="input_0", is_input=True)
+
+        # Unbound, unfolded constant
+        w1_ait = Tensor(shape=[N, N], dtype=dtype, name="w1")
+
+        x1_ait = ops.elementwise(FuncEnum.MUL)(w1_ait, input_0)
+
+        # Unbound folded constant
+        w2_ait = Tensor(shape=[N, K], dtype=dtype, name="w2")
+
+        # Bound folded constants
+        w3_ait = Tensor(shape=[N, K], dtype=dtype, name="w3")
+        w4_ait = Tensor(shape=[N, K], dtype=dtype, name="w4")
+
+        x2_ait = ops.elementwise(FuncEnum.MUL)(w2_ait, w3_ait)
+        x3_ait = ops.gemm_rcr()(x2_ait, w4_ait)
+
+        x4_ait = ops.elementwise(FuncEnum.MUL)(x3_ait, x1_ait)
+
+        # Bound unfolded constant
+        w5_ait = Tensor(shape=[N, N], dtype=dtype, name="w5")
+        output = ops.elementwise(FuncEnum.MUL)(w5_ait, x4_ait)
+        output._attrs["is_output"] = True
+        output._attrs["name"] = "output"
+
+        input_pt = get_random_torch_tensor((N.value(), N.value()), dtype)
+        w1_pt = get_random_torch_tensor((N.value(), N.value()), dtype)
+        w2_pt = get_random_torch_tensor((N.value(), K.value()), dtype)
+        w3_pt = get_random_torch_tensor((N.value(), K.value()), dtype)
+        w4_pt = get_random_torch_tensor((N.value(), K.value()), dtype)
+        w5_pt = get_random_torch_tensor((N.value(), N.value()), dtype)
+
+        x1_pt = w1_pt * input_pt
+        x2_pt = w2_pt * w3_pt
+        x3_pt = torch.nn.functional.linear(x2_pt, w4_pt)
+        x4_pt = x3_pt * x1_pt
+        output_pt = w5_pt * x4_pt
+
+        mod = compile_model(
+            output,
+            detect_target(),
+            "./tmp",
+            "test_constant_folding_mixed_usage",
+            constants={"w3": w3_pt, "w4": w4_pt, "w5": w5_pt},
+        )
+
+        self.assertSetEqual(
+            set(mod.get_constant_folding_input_names()),
+            # This is not the only input, but it's the only one we can set.
+            {"w2"},
+        )
+
+        self.assertSetEqual(set(mod.get_constant_names()), {"w1", "w2"})
+
+        output = torch.empty_like(output_pt)
+        # Unset constant W2
+        self.assertRaises(RuntimeError, mod.run_with_tensors, [input_pt], [output])
+        mod.set_constant_with_tensor("w2", w2_pt)
+        # Unset constant W1
+        self.assertRaises(RuntimeError, mod.run_with_tensors, [input_pt], [output])
+        mod.set_constant_with_tensor("w1", w1_pt)
+
+        mod.run_with_tensors([input_pt], [output])
+        torch.testing.assert_close(output, output_pt, atol=1e-1, rtol=1e-1)
+
+    def test_constant_folding_output_in_middle_of_chain(self):
+        dtype = "float16"
+        N, K = IntImm(13), IntImm(33)
+        x = Tensor(shape=[N, K], dtype=dtype, name="x")
+        y = Tensor(shape=[N.value() * K.value(), 1], dtype=dtype, name="y")
+
+        x2 = ops.reshape()(x, [N.value() * K.value(), 1])
+        x2._attrs["name"] = "x2"
+        # Special case: view of constant needed outside of constant folding
+        # subgraph.
+        x2._attrs["is_output"] = True
+
+        x3 = ops.elementwise(FuncEnum.MUL)(x2, y)
+        x3._attrs["name"] = "x3"
+        x3._attrs["is_output"] = True
+
+        x4 = ops.elementwise(FuncEnum.ADD)(x3, x3)
+        x4._attrs["name"] = "x4"
+        x4._attrs["is_output"] = True
+
+        mod = compile_model(
+            [x2, x3, x4],
+            detect_target(),
+            "./tmp",
+            "test_constant_folding_output_in_middle_of_chain",
+        )
+
+        x_pt = get_random_torch_tensor((N.value(), K.value()), dtype)
+        y_pt = get_random_torch_tensor((N.value() * K.value(), 1), dtype)
+        x2_pt = x_pt.reshape(N.value() * K.value(), 1)
+        x3_pt = x2_pt * y_pt
+        x4_pt = x3_pt + x3_pt
+
+        x2_ait, x3_ait, x4_ait = (
+            torch.empty_like(x2_pt),
+            torch.empty_like(x3_pt),
+            torch.empty_like(x4_pt),
+        )
+
+        mod.set_many_constants_with_tensors({"x": x_pt, "y": y_pt})
+        mod.run_with_tensors([], {"x2": x2_ait, "x3": x3_ait, "x4": x4_ait})
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 13084600c6a76c0e5fb296cbdbebead464625cc6 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 073/638] Make examples call fold_constants() before benchmarks
 (#1192)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1192

Load-time constant folding enables folding for many models that could not do it before, such as BERT.

Make all of the examples call `fold_constants()` before running the benchmarks. This is technically not necessary since folding will happen lazily on the first inference if you don't don't do this. But it's recommended for optimal performance.

Reviewed By: ipiszy

Differential Revision: D42070998

fbshipit-source-id: 6f8c5f030575e1f1fea9a1de9766173c179286eb
---
 examples/01_resnet-50/benchmark_ait.py        | 4 ++--
 examples/01_resnet-50/infer_with_torch.py     | 4 ++--
 examples/02_detectron2/compile_model.py       | 1 +
 examples/02_detectron2/predictor/predictor.py | 5 ++---
 examples/03_bert/benchmark_ait.py             | 4 ++--
 examples/04_vit/benchmark_ait.py              | 4 ++--
 examples/04_vit/verification.py               | 4 ++--
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/01_resnet-50/benchmark_ait.py b/examples/01_resnet-50/benchmark_ait.py
index 577a4472d..3e84681c9 100644
--- a/examples/01_resnet-50/benchmark_ait.py
+++ b/examples/01_resnet-50/benchmark_ait.py
@@ -76,8 +76,8 @@ def benchmark(model_name, batch_size, mod=None, graph_mode=True):
         mod = Model(os.path.join("./tmp", model_name, "test.so"))
 
     # Set params
-    for k, v in cuda_params.items():
-        mod.set_constant_with_tensor(k, v)
+    mod.set_many_constants_with_tensors(cuda_params)
+    mod.fold_constants(sync=True)
 
     # prepare input/output tensor
     x_input = torch.randn([batch_size, 224, 224, 3]).cuda().half()
diff --git a/examples/01_resnet-50/infer_with_torch.py b/examples/01_resnet-50/infer_with_torch.py
index 23269b2e4..5639897a8 100644
--- a/examples/01_resnet-50/infer_with_torch.py
+++ b/examples/01_resnet-50/infer_with_torch.py
@@ -98,8 +98,8 @@ def inference(model_name, mod=None):
         mod = Model(os.path.join("./tmp", model_name, "test.so"))
 
     # Set torch tensor params to runtime
-    for k, v in cuda_params.items():
-        mod.set_constant_with_tensor(k, v)
+    mod.set_many_constants_with_tensors(cuda_params)
+    mod.fold_constants(sync=True)
 
     # prepare input/output tensor
     x_input = prepare_data()
diff --git a/examples/02_detectron2/compile_model.py b/examples/02_detectron2/compile_model.py
index 4bf5d4d25..58d97924f 100644
--- a/examples/02_detectron2/compile_model.py
+++ b/examples/02_detectron2/compile_model.py
@@ -99,6 +99,7 @@ def benchmark(cfg, mod=None):
         mask_size = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION * 2
         outputs.append(torch.empty([BS, topk, mask_size, mask_size]).cuda().half())
 
+    mod.fold_constants(sync=True)
     mod.benchmark_with_tensors([x], outputs, count=100, repeat=2, graph_mode=True)
 
 
diff --git a/examples/02_detectron2/predictor/predictor.py b/examples/02_detectron2/predictor/predictor.py
index 324a138c2..ce3f85f24 100644
--- a/examples/02_detectron2/predictor/predictor.py
+++ b/examples/02_detectron2/predictor/predictor.py
@@ -176,9 +176,8 @@ def init_modules(self, detection_model_name, workdir):
         Load the AIT module of the detection model, and set the weights.
         """
         mod = Model(os.path.join(workdir, detection_model_name, "test.so"))
-        for name, weight in self.weights.items():
-            mod.set_constant_with_tensor(name, weight)
-
+        mod.set_many_constants_with_tensors(self.weights)
+        mod.fold_constants(sync=True)
         return mod
 
     def run_batch(self, batch_data, graph_mode=False):
diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 54e3c8e9a..624588d18 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -209,8 +209,8 @@ def compile_module(
 
     mod = compile_model(y, target, "./tmp", model_name)
 
-    for k, v in params.items():
-        mod.set_constant_with_tensor(k, v)
+    mod.set_many_constants_with_tensors(params)
+    mod.fold_constants(sync=True)
 
     return mod
 
diff --git a/examples/04_vit/benchmark_ait.py b/examples/04_vit/benchmark_ait.py
index c302d297d..3d3eba043 100644
--- a/examples/04_vit/benchmark_ait.py
+++ b/examples/04_vit/benchmark_ait.py
@@ -126,8 +126,8 @@ def benchmark(model_name, batch_size, mod=None, graph_mode=True):
             params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda()
 
     # set weights
-    for name, weight in params_ait.items():
-        mod.set_constant_with_tensor(name, weight)
+    mod.set_many_constants_with_tensors(params_ait)
+    mod.fold_constants(sync=True)
 
     # prepare input/output tensor
     inputs = [torch.randn([batch_size, img_size, img_size, 3]).cuda().half()]
diff --git a/examples/04_vit/verification.py b/examples/04_vit/verification.py
index 0584707bf..5220b1213 100644
--- a/examples/04_vit/verification.py
+++ b/examples/04_vit/verification.py
@@ -129,8 +129,8 @@ def verification(
             params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda()
 
     # set weights
-    for name, weight in params_ait.items():
-        ait_mod.set_constant_with_tensor(name, weight)
+    ait_mod.set_many_constants_with_tensors(params_ait)
+    ait_mod.fold_constants(sync=True)
 
     inputs = [input_pt.permute((0, 2, 3, 1)).contiguous()]
     ys = []

From a82fc474dc6752986128440adee33f7eda152d66 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 6 Feb 2023 22:41:18 -0800
Subject: [PATCH 074/638] Load-time constant folding in AITModel (#1194)

Summary:
X-link: https://github.com/fairinternal/AITemplate/pull/1194

Call `FoldConstants()` at model-load time. Technically this can also just happen during the first inference. But we probably don't want the model to have to stop traffic to do extra work once it's been constructed.

Reviewed By: chenyang78

Differential Revision: D42099379

fbshipit-source-id: e0a6c73c9340ec47952bdc1f5f754f03e98e2528
---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index a859f37d0..9325d0011 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -135,6 +135,19 @@ AITModelImpl::AITModelImpl(
   TORCH_CHECK(handle_, "could not dlopen ", model_path, ": ", dlerror());
   TORCH_CHECK(num_runtimes > 0, "num_runtimes must be positive");
 
+  // It's not clear what stream we want to use yet. Create a new one.
+  // We could alternatively use the default stream, but that could cause extra
+  // synchronization.
+  cudaStream_t creation_stream;
+  TORCH_CHECK(
+      cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
+      cudaSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<cudaStream_t>,
+      decltype(&cudaStreamDestroy)>;
+  StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
+
 #define LOAD_SYMBOL(var, name_str)                                       \
   var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
   TORCH_CHECK(var, "could not dlsym " name_str);
@@ -170,8 +183,22 @@ AITModelImpl::AITModelImpl(
   LOAD_SYMBOL(getNumInputsFunc, "AITemplateModelContainerGetNumInputs");
   LOAD_SYMBOL(getNumOutputsFunc, "AITemplateModelContainerGetNumOutputs");
 #undef LOAD_SYMBOL
+  // TODO: this load is optional so we don't break backwards comptability.
+  // Once all relevant packages have been updated, we can just use
+  // LOAD_SYMBOL.
+  auto* foldConstantsFunc =
+      reinterpret_cast<decltype(&AITemplateModelContainerFoldConstants)>(
+          dlsym(handle_.get(), "AITemplateModelContainerFoldConstants"));
 
   AITCallCreate(createFunc, &model_handle_, num_runtimes, &allocator_);
+
+  if (foldConstantsFunc != nullptr) {
+    AIT_CHECK(foldConstantsFunc(
+        model_handle_,
+        /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(creation_stream),
+        /*sync=*/true));
+  }
+
   const auto num_inputs = AITCall(getNumInputsFunc, model_handle_);
   const auto num_outputs = AITCall(getNumOutputsFunc, model_handle_);
 

From 1e6374f5b1807033789494b07dbe6dfcfb96e3d1 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 7 Feb 2023 19:03:30 +0800
Subject: [PATCH 075/638] fix bugs

---
 fx2ait/CMakeLists.txt                      |  4 ----
 fx2ait/fx2ait/converters/ait_converters.py |  4 ++--
 fx2ait/fx2ait/csrc/AITModelImpl.cpp        | 10 +++++-----
 fx2ait/setup.py                            |  1 -
 python/aitemplate/compiler/base.py         |  1 -
 5 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/fx2ait/CMakeLists.txt b/fx2ait/CMakeLists.txt
index 0b581aeb3..bf6ef6211 100644
--- a/fx2ait/CMakeLists.txt
+++ b/fx2ait/CMakeLists.txt
@@ -3,10 +3,6 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
 project(ait_model)
 find_package(Torch REQUIRED)
 
-if(${AIT_USE_ROCM})
-  add_compile_definitions(AIT_USE_ROCM)
-endif()
-
 include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/picojson
 )
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 530b8e162..7d90c7963 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -161,7 +161,7 @@ def acc_ops_linear(
     input_val = kwargs["input"]
     if USE_ROCM:
         shape = input_val._attrs["shape"]
-        input_val = input_val if len(shape) == 2 else reshape()(input_val, [-1, shape[-1].value()])
+        input_val = input_val if len(shape) == 2 else reshape()(input_val, [-1, shape[-1]])
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
     
@@ -172,7 +172,7 @@ def acc_ops_linear(
         assert isinstance(bias, AITTensor)
         result = elementwise(FuncEnum.ADD)(result, bias)
     if USE_ROCM:
-        result = result if len(shape) == 2 else reshape()(result, [shape[0].value(), -1, result._attrs["shape"][-1].value()])
+        result = result if len(shape) == 2 else reshape()(result, [shape[0], -1, result._attrs["shape"][-1]])
     return result
 
 
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 3590e9bed..ba0162364 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -6,7 +6,7 @@
 #include <sstream>
 
 #include "ATen/Context.h" // @manual
-#ifdef AIT_USE_ROCM
+#ifdef __HIP_PLATFORM_HCC__
 #include "ATen/hip/HIPContext.h"
 #include "c10/core/CPUAllocator.h"
 #include "c10/hip/HIPStream.h"
@@ -23,7 +23,7 @@
 namespace torch::aitemplate {
 
 AITemplatePyTorchCachingAllocator::AITemplatePyTorchCachingAllocator() {
-  #ifndef AIT_USE_ROCM
+  #ifndef __HIP_PLATFORM_HCC__
   at::globalContext().lazyInitCUDA();
   #endif
   cuda_allocator_ = at::cuda::getCUDADeviceAllocator();
@@ -298,7 +298,7 @@ std::vector<torch::Tensor> AITModelImpl::processOutputs(
 
     auto output = at::detail::make_tensor_base<c10::TensorImpl>(
         std::move(output_index_to_output_storage_impl.at(output_idx)),
-        #ifdef AIT_USE_ROCM
+        #ifdef __HIP_PLATFORM_HCC__
         c10::DispatchKeySet(c10::DispatchKey::HIP),
         #else
         c10::DispatchKeySet(c10::DispatchKey::CUDA),
@@ -380,7 +380,7 @@ std::vector<torch::Tensor> AITModelImpl::forward(
 
   std::vector<torch::Tensor> outputs;
   {
-    #ifdef AIT_USE_ROCM
+    #ifdef __HIP_PLATFORM_HCC__
     const auto& cuda_stream = at::hip::getCurrentHIPStream(device.index());
     #else
     const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
@@ -443,7 +443,7 @@ void AITModelImpl::profile(
       device);
 
   {
-    #ifdef AIT_USE_ROCM
+    #ifdef __HIP_PLATFORM_HCC__
     const auto& cuda_stream = at::hip::getCurrentHIPStream(device.index());
     #else
     const auto& cuda_stream = at::cuda::getCurrentCUDAStream(device.index());
diff --git a/fx2ait/setup.py b/fx2ait/setup.py
index 5bc05ca47..7d6ba9de0 100644
--- a/fx2ait/setup.py
+++ b/fx2ait/setup.py
@@ -36,7 +36,6 @@ def run(self):
             "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + build_directory,
             "-DPYTHON_EXECUTABLE=" + sys.executable,
             "-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
-            "-DAIT_USE_ROCM=" + "1" if torch.cuda.is_available() and torch.version.hip else "0",
         ]
 
         cfg = "Debug" if self.debug else "Release"
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index df03965ca..82c919a40 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -133,7 +133,6 @@ def __eq__(self, another: Any) -> bool:
         return (
             isinstance(another, IntVar)
             and self._attrs["values"] == another._attrs["values"]
-            and self._attrs["name"] == another._attrs["name"]
         )
 
     def __hash__(self) -> int:

From 919d126b590481d74ec9a990fb278af241289119 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Tue, 7 Feb 2023 12:38:01 -0800
Subject: [PATCH 076/638] add conv3d support in fx2ait (#213)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/213

Add the conv3d support in fx2ait, so we can use it to transform the xrayvideo model.
Add unit tests to make sure the support is good.

Reviewed By: wushirong

Differential Revision: D43079351

fbshipit-source-id: d9857aa95f5a6b44b37076d321d3574e4357091a
---
 fx2ait/fx2ait/converters/ait_converters.py    |  63 ++++++++++
 fx2ait/fx2ait/converters/utils.py             |   3 +
 .../fx2ait/test/converters/test_ait_conv3d.py | 112 ++++++++++++++++++
 python/aitemplate/compiler/ops/conv/conv3d.py |   2 +-
 .../compiler/ops/conv/depthwise_conv3d.py     |   4 +-
 python/aitemplate/compiler/public/__init__.py |   2 +
 6 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_conv3d.py

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 96b36468f..6d3b665d4 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -28,6 +28,8 @@
     concatenate,
     conv2d,
     conv2d_bias,
+    conv3d,
+    depthwise_conv3d,
     dynamic_slice,
     elementwise,
     expand,
@@ -72,6 +74,7 @@
     create_unary_op,
     get_positive_dim,
     identical_elem_tuple_to_int,
+    ncdhw2ndhwc,
     nchw2nhwc,
     unify_dynamic_shape_name,
 )
@@ -1142,6 +1145,66 @@ def make_slice(x, slice_idx, name):
     return result
 
 
+def _choose_conv3d_op(
+    stride: int,
+    pad: int,
+    dilate: int,
+    x: AITTensor,
+    weight: AITTensor,
+    bias: [AITTensor],
+    groups: int = 1,
+) -> ConverterOutput:
+    """
+    Helper to choose conv3d vs. depthwise_conv3d op based on existence of bias
+    and groups
+    """
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 4, 1)
+    weight._attrs["shape"] = ncdhw2ndhwc(weight._attrs["shape"])
+
+    if bias is not None:
+        assert (
+            groups == weight._attrs["shape"][0]
+        ), "Currently only support channel == groups"
+        return depthwise_conv3d(
+            stride=stride, pad=pad, dilate=dilate, group=groups, bias=bias
+        )(x, weight)
+    else:
+        assert (
+            groups is None or groups == 1
+        ), "Currently only support non-bias conv3d without groups"
+        return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+
+
+@ait_converter(acc_ops.conv3d)
+def acc_ops_conv3d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    weight = kwargs["weight"]
+    assert isinstance(weight, AITTensor)
+
+    bias = kwargs["bias"]
+    assert bias is None or isinstance(bias, AITTensor)
+
+    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    padding = identical_elem_tuple_to_int(kwargs["padding"])
+    dilation = identical_elem_tuple_to_int(kwargs["dilation"])
+
+    assert all(
+        isinstance(x, int) for x in [stride, padding, dilation]
+    ), "Expected int stride, padding, and dilation"
+
+    groups = kwargs["groups"]
+
+    return _choose_conv3d_op(stride, padding, dilation, input_val, weight, bias, groups)
+
+
 @ait_converter(acc_ops.max_pool2d)
 def acc_ops_max_pool2d(
     target: Target,
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index 3573c102f..26d89f704 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -138,6 +138,9 @@ def nchw2nhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
     return [shape[0], shape[2], shape[3], shape[1]]
 
 
+def ncdhw2ndhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
+    return [shape[0], shape[2], shape[3], shape[4], shape[1]]
+
 # TODO:  This is a hack to workaround AIT's dynamic shape requirement.
 # Detailed explanation can be found in D41743385 (aten2ait) D41974191(fx2ait).
 # We will throw this one after AIT provides vanilla support.
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
new file mode 100644
index 000000000..603c61535
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -0,0 +1,112 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestAitConv3d(AITTestCase):
+    @parameterized.expand(
+        [
+            param("conv3d", 3, bias=False),
+            param(
+                name="conv3d_tuple_parameters",
+                kernel_size=3,
+                stride=(4, 4, 4),
+                padding=(2, 2, 2),
+                dilation=2,
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="depthwise_conv3d",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=56,
+                w=56,
+                bias=True,
+            ),
+            param(
+                name="depthwise_conv3d_2",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=28,
+                w=28,
+                bias=True,
+            ),
+            param(
+                name="depthwise_conv3d_3",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=7,
+                w=7,
+                bias=True,
+            ),
+        ]
+    )
+    def test_conv3d(
+        self,
+        name,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        ci=8,
+        co=8,
+        groups=1,
+        d=4,
+        h=224,
+        w=224,
+        bias=False,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv3d(
+                    ci,
+                    co,
+                    kernel_size,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                    bias,
+                )
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(4, ci, d, h, w).cuda().half()]
+
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.conv3d},
+            permute_inputs=[0, 2, 3, 4, 1],  # inputs should be NDHWC
+            permute_outputs=[0, 4, 1, 2, 3],
+        )
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index d876948c1..ecacb385e 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -555,7 +555,7 @@ def _profile_static(self, workdir, devices):
 
         workloads = list(self._attrs["exec_path"].keys())
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
-        if "op_instance" not in self._attrs:
+        if "op_instance" not in self._attrs or len(self._attrs["op_instance"]) == 0:
             target = backend.target.Target.current()
             # init candidate ops
             func_key = "{target}.{op}.config".format(
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 57c9f5ba9..1d234988c 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -264,8 +264,10 @@ def __call__(self, x: Tensor, w: Tensor, bias: Tensor = None) -> List[Tensor]:
             includes the output tensor in shape (N, T_out, H_out, W_out, C_out)
         """
         self._attrs["inputs"] = [x, w]
-        if self._attrs["bias"]:
+        if bias:
             self._attrs["inputs"].append(bias)
+        elif self._attrs["bias"]:
+            self._attrs["inputs"].append(self._attrs["bias"])
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index 4dcb936f5..425965be2 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -57,6 +57,8 @@
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
 from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
+from aitemplate.compiler.ops.conv.conv3d import conv3d
+from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
 from aitemplate.compiler.ops.layernorm.group_layernorm import group_layernorm

From e9fc1c584ad70499628c178a73f8ec0be4c7741f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 077/638] support bfloat16 in special bmms (#189)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/189

att

Reviewed By: chenyang78

Differential Revision: D42979181

fbshipit-source-id: a27b0ff5cd5522b263e0e5906532c34bba971a2e
---
 .../backend/cuda/gemm_special/bmm_rcr_n1.py   | 64 ++++++++++++++++++-
 .../cuda/gemm_special/bmm_rrr_k1_tanh.py      | 59 +++++++++++++++--
 tests/unittest/ops/test_bmm_alpha.py          | 37 +++++++++++
 tests/unittest/ops/test_bmm_rcr_n1.py         | 42 +++++++++++-
 tests/unittest/ops/test_bmm_rrr_k1_tanh.py    | 12 +++-
 5 files changed, 202 insertions(+), 12 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
index 56f43cbf5..b407b431e 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
@@ -112,9 +112,13 @@
 SRC_TEMPLATE = jinja2.Template(
     """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 
+using bfloat16 = __nv_bfloat16;
+using bfloat16_2 =  __nv_bfloat162;
+
 namespace {
 
 {{tensor_accessor_libs}}
@@ -287,7 +291,65 @@
       return __high2float(a);
     }
   };
-}
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 800)
+  template<>
+  struct InputHelper<bfloat16> {
+    typedef bfloat16 scalar_type;
+    typedef bfloat16_2 vec2_type;
+
+    static
+    __inline__ __device__ vec2_type fma2(vec2_type a, vec2_type b, vec2_type c) {
+      return __hfma2(a, b, c);
+    }
+
+    static
+    __inline__ __device__ scalar_type fma(scalar_type a, scalar_type b, scalar_type c) {
+      return __hfma(a, b, c);
+    }
+
+    static
+    __inline__ __device__ vec2_type mul2(vec2_type a, vec2_type b) {
+      return __hmul2(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type mul(scalar_type a, scalar_type b) {
+      return __hmul(a, b);
+    }
+
+    static
+    __inline__ __device__ vec2_type add2(vec2_type a, vec2_type b) {
+      return __hadd2(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type add(scalar_type a, scalar_type b) {
+      return __hadd(a, b);
+    }
+
+    static
+    __inline__ __device__ scalar_type low(vec2_type a) {
+      return __low2bfloat16(a);
+    }
+
+    static
+    __inline__ __device__ scalar_type high(vec2_type a) {
+      return __high2bfloat16(a);
+    }
+
+    static
+    __inline__ __device__ float lowf(vec2_type a) {
+      return __low2float(a);
+    }
+
+    static
+    __inline__ __device__ float highf(vec2_type a) {
+      return __high2float(a);
+    }
+  }; // struct InputHelper<bfloat16>
+#endif
+} // namespace detail
 
 // Each thread reads one row from "a" and one column from "b",
 // computes dot_product(a_row, b_col), and writes the result to "c".
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
index eb5cfe109..7f40abc66 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
@@ -89,24 +89,27 @@
     """
 #include <iostream>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
 #include "cutlass/fast_math.h"
 
-#ifndef __HALF_TO_US
-#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+using bfloat16 = __nv_bfloat16;
+
+#ifndef REINTERPRET_AS_U16
+#define REINTERPRET_AS_U16(var) *(reinterpret_cast<unsigned short *>(&(var)))
 #endif
 
 namespace {
 
 template <typename T>
-__device__ T fast_tanh(T x);
+__device__ __inline__ T fast_tanh(T x);
 
 template <>
-__device__ half fast_tanh(half x) {
+__device__ __inline__ half fast_tanh(half x) {
   #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
 
-  asm volatile ( "tanh.approx.f16 %0, %1;" : "=h"(__HALF_TO_US(x)) : "h"(__HALF_TO_US(x)));
+  asm volatile ( "tanh.approx.f16 %0, %1;" : "=h"(REINTERPRET_AS_U16(x)) : "h"(REINTERPRET_AS_U16(x)));
   return x;
 
   #else
@@ -114,6 +117,49 @@
   #endif
 }
 
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 800)
+
+template <>
+__device__ __inline__ bfloat16 fast_tanh(bfloat16 x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDA_ARCH__ >= 900)
+
+  asm volatile ( "tanh.approx.bf16 %0, %1;" : "=h"(REINTERPRET_AS_U16(x)) : "h"(REINTERPRET_AS_U16(x)));
+  return x;
+
+#else
+  return bfloat16(cutlass::fast_tanh(float(x)));
+#endif
+}
+
+#endif // (__CUDA_ARCH__ >= 800)
+
+template <>
+__device__ __inline__ float fast_tanh(float x) {
+  return cutlass::fast_tanh(x);
+}
+
+template<typename ElemT>
+__device__ __inline__ ElemT intrinsic_mul(ElemT x, ElemT y);
+
+template<>
+__device__ __inline__ float intrinsic_mul(float x, float y) {
+  return __fmul_rn(x, y);
+}
+
+template<>
+__device__ __inline__ half intrinsic_mul(half x, half y) {
+  return __hmul(x, y);
+}
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 800)
+
+template<>
+__device__ __inline__ bfloat16 intrinsic_mul(bfloat16 x, bfloat16 y) {
+  return __hmul(x, y);
+}
+
+#endif
+
 template<typename ElemT, int num_thread>
 __global__ void bmm_rrr_k1_tanh_kernel(const float4* a_ptr,
                                   const float4* b_ptr,
@@ -137,7 +183,7 @@
     for (int i = 0; i < num_elems_in_float4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < num_elems_in_float4; ++j) {
-        tmp[i * num_elems_in_float4 + j] = fast_tanh(__hmul(a_vec_ptr[i], b_vec_ptr[j]));
+        tmp[i * num_elems_in_float4 + j] = fast_tanh(intrinsic_mul(a_vec_ptr[i], b_vec_ptr[j]));
       }
     }
     CUTLASS_PRAGMA_UNROLL
@@ -207,6 +253,7 @@
   {{exec_paths}}
 }
 
+#undef REINTERPRET_AS_U16
 """
 )
 
diff --git a/tests/unittest/ops/test_bmm_alpha.py b/tests/unittest/ops/test_bmm_alpha.py
index 39f4adcea..aa3440ebb 100644
--- a/tests/unittest/ops/test_bmm_alpha.py
+++ b/tests/unittest/ops/test_bmm_alpha.py
@@ -319,6 +319,43 @@ def test_bmm_alpha_float(self):
             dtype="float",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_bmm_alpha_bfloat16(self):
+        self._test_bmm_alpha(
+            bmm_op=ops.bmm_rcr,
+            is_div=False,
+            X_trans=False,
+            W_trans=True,
+            B=1,
+            M=1000000,
+            N=3,
+            K=32,
+            expected_num_tensors=3,
+            expected_num_ops=1,
+            cst_val=2.3,
+            use_fp16_acc=False,
+            dtype="bfloat16",
+        )
+        self._test_bmm_alpha(
+            bmm_op=ops.bmm_rrr_add,
+            is_div=False,
+            X_trans=False,
+            W_trans=False,
+            B=2,
+            M=12,
+            N=8,
+            K=4,
+            cst_val=0.32,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            use_fp16_acc=False,
+            with_add=True,
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_rcr_n1.py b/tests/unittest/ops/test_bmm_rcr_n1.py
index ee7e60122..8497892bc 100644
--- a/tests/unittest/ops/test_bmm_rcr_n1.py
+++ b/tests/unittest/ops/test_bmm_rcr_n1.py
@@ -32,7 +32,18 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.test_count = 0
 
-    def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name, dtype="float16"):
+    def _test_rcr_n1(
+        self,
+        Bs,
+        Ms,
+        N,
+        K,
+        use_fp16_acc,
+        test_name,
+        atol=1e-1,
+        rtol=1e-1,
+        dtype="float16",
+    ):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         BDim = shape_utils.gen_int_var_min_max(Bs, name="batch")
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
@@ -71,7 +82,7 @@ def _test_rcr_n1(self, Bs, Ms, N, K, use_fp16_acc, test_name, dtype="float16"):
             if X_pt.nelement() == 0 or W_pt.nelement() == 0:
                 pass
             else:
-                self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+                torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
         self.test_count += 1
 
     def test_rcr_n1(self):
@@ -93,7 +104,7 @@ def test_rcr_n1(self):
         self._test_rcr_n1([1], [100], 1, 0, False, "zero_k")
         self._test_rcr_n1([1], [0], 1, 3, False, "zero_m")
 
-    def test_float32(self):
+    def test_bmm_rcr_n1_float32(self):
         self._test_rcr_n1(
             [1], [1000000], 1, 32, True, "static_float32", dtype="float32"
         )
@@ -107,6 +118,31 @@ def test_float32(self):
             [1, 5, 8], [100], 1, 123, False, "static_float32", dtype="float32"
         )
 
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80, "bf16 is supported with CUDA sm80+"
+    )
+    def test_bmm_rcr_n1_bfloat16(self):
+        self._test_rcr_n1(
+            [1],
+            [1000000],
+            1,
+            32,
+            True,
+            "static_bfloat16",
+            atol=2e-1,
+            rtol=2e-1,
+            dtype="bfloat16",
+        )
+        self._test_rcr_n1(
+            [1], [1000000], 1, 32, False, "static_bfloat16", dtype="bfloat16"
+        )
+        self._test_rcr_n1(
+            [1, 5, 8], [100], 1, 7, True, "static_bfloat16", dtype="bfloat16"
+        )
+        self._test_rcr_n1(
+            [1, 5, 8], [100], 1, 123, False, "static_bfloat16", dtype="bfloat16"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
index a50e67111..15ad23632 100644
--- a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
+++ b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
@@ -50,14 +50,22 @@ def _test_rrr(self, B, M, K, N, test_name, dtype="float16"):
         else:
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_rrr(self):
+    def test_bmm_rrr_k1_tanh_float16(self):
         self._test_rrr(B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1")
         self._test_rrr(B=1024, M=0, K=1, N=32, test_name="bmm_rrr_k1_zero_m")
         self._test_rrr(B=1024, M=32, K=0, N=32, test_name="bmm_rrr_k1_zero_k")
 
-    def test_float32(self):
+    def test_bmm_rrr_k1_tanh_float32(self):
         self._test_rrr(B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1", dtype="float32")
 
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80, "bf16 is supported with CUDA sm80+"
+    )
+    def test_bmm_rrr_k1_tanh_bfloat16(self):
+        self._test_rrr(
+            B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1", dtype="bfloat16"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 3122cca90510b6e64104426bdf028271c18323ae Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 078/638] add bfloat16 test coverage for perm+fc fused ops
 (#190)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/190

att

Reviewed By: chenyang78

Differential Revision: D42980393

fbshipit-source-id: c596b92a419033fa8ea4f9d1e02343e95f834ef6
---
 tests/unittest/ops/test_perm021fc_ccr.py             | 11 +++++++++++
 tests/unittest/ops/test_perm021fc_ccr_bias.py        | 11 +++++++++++
 .../unittest/ops/test_perm021fc_ccr_bias_perm021.py  | 11 +++++++++++
 tests/unittest/ops/test_perm021fc_crc.py             | 11 +++++++++++
 tests/unittest/ops/test_perm021fc_crc_bias.py        | 11 +++++++++++
 tests/unittest/ops/test_perm102_bmm_rcr.py           |  6 +++---
 tests/unittest/ops/test_perm102_bmm_rrr.py           | 12 ++++++------
 7 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/tests/unittest/ops/test_perm021fc_ccr.py b/tests/unittest/ops/test_perm021fc_ccr.py
index 18c843916..17daa41b5 100644
--- a/tests/unittest/ops/test_perm021fc_ccr.py
+++ b/tests/unittest/ops/test_perm021fc_ccr.py
@@ -88,6 +88,17 @@ def test_perm021fc_ccr_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"bf16 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_bf16(self):
+        self._test_perm021fc_ccr(
+            test_name="perm021fc_ccr_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias.py b/tests/unittest/ops/test_perm021fc_ccr_bias.py
index 196e11f73..dbf91b245 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias.py
@@ -99,6 +99,17 @@ def test_perm021fc_ccr_bias_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"bf16 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_bias_bf16(self):
+        self._test_perm021fc_ccr_bias(
+            test_name="perm021fc_ccr_bias_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
index f0b6a2f16..3f11392b9 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
@@ -100,6 +100,17 @@ def test_perm021fc_ccr_bias_perm021_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"bf16 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_ccr_bias_perm021_bf16(self):
+        self._test_perm021fc_ccr_bias_perm021(
+            test_name="perm021fc_ccr_bias_perm021_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm021fc_crc.py b/tests/unittest/ops/test_perm021fc_crc.py
index 5f99f3275..aac752c33 100644
--- a/tests/unittest/ops/test_perm021fc_crc.py
+++ b/tests/unittest/ops/test_perm021fc_crc.py
@@ -90,6 +90,17 @@ def test_perm021fc_crc_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"bf16 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_crc_bf16(self):
+        self._test_perm021fc_crc(
+            test_name="perm021fc_crc_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm021fc_crc_bias.py b/tests/unittest/ops/test_perm021fc_crc_bias.py
index cab5f5bb3..8c9e719e6 100644
--- a/tests/unittest/ops/test_perm021fc_crc_bias.py
+++ b/tests/unittest/ops/test_perm021fc_crc_bias.py
@@ -98,6 +98,17 @@ def test_perm021fc_crc_bias_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80,
+        f"bf16 BMM not supported in {detect_target()._arch}",
+    )
+    def test_perm021fc_crc_bias_bf16(self):
+        self._test_perm021fc_crc_bias(
+            test_name="perm021fc_crc_bias_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm102_bmm_rcr.py b/tests/unittest/ops/test_perm102_bmm_rcr.py
index 0e70baa99..62f143035 100644
--- a/tests/unittest/ops/test_perm102_bmm_rcr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rcr.py
@@ -33,12 +33,12 @@
 
 
 def cuda_skip_condition(dtype, arch):
-    return dtype == "float32" and int(arch) < 80
+    return dtype != "float16" and int(arch) < 80
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_TestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
     def test_perm102_bmm_rrr(self, dtype):
         arch_ = detect_target()._arch
         if cuda_skip_condition(dtype, arch_):
@@ -70,7 +70,7 @@ def test_perm102_bmm_rrr(self, dtype):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_BiasTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
     def test_perm102_bmm_rrr_bias(self, dtype):
         arch_ = detect_target()._arch
         if cuda_skip_condition(dtype, arch_):
diff --git a/tests/unittest/ops/test_perm102_bmm_rrr.py b/tests/unittest/ops/test_perm102_bmm_rrr.py
index f6c75110e..026e5333a 100644
--- a/tests/unittest/ops/test_perm102_bmm_rrr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rrr.py
@@ -34,10 +34,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
     def test_perm102_bmm_rrr(self, dtype="float16"):
-        if dtype == "float32" and int(detect_target()._arch) < 80:
-            self.skipTest(f"fp32 BMM not supported in {detect_target()._arch}")
+        if dtype != "float16" and int(detect_target()._arch) < 80:
+            self.skipTest(f"{dtype} BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256
@@ -65,10 +65,10 @@ def test_perm102_bmm_rrr(self, dtype="float16"):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMBiasTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
     def test_perm102_bmm_rrr_bias(self, dtype="float16"):
-        if dtype == "float32" and int(detect_target()._arch) < 80:
-            self.skipTest(f"fp32 BMM not supported in {detect_target()._arch}")
+        if dtype != "float16" and int(detect_target()._arch) < 80:
+            self.skipTest(f"{dtype} BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256

From 820fad56ba10f5667fd3bcee2d821e7756d4f835 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 079/638] support bfloat16 dtype for reductions (#188)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/188

att

Reviewed By: chenyang78

Differential Revision: D42982520

fbshipit-source-id: 4066350d3b1a56731def5598f66de39033ac15f6
---
 .../backend/cuda/reduce/reduce_3d.py          |  2 +-
 .../backend/cuda/reduce/reduce_small_axis.py  | 34 ++++++---
 python/aitemplate/backend/cuda/reduce/var.py  |  2 +-
 tests/unittest/ops/test_norm.py               | 70 +++++++++++++++++++
 tests/unittest/ops/test_reduce.py             | 44 ++++++++++--
 tests/unittest/ops/test_var.py                | 34 ++++++++-
 6 files changed, 167 insertions(+), 19 deletions(-)

diff --git a/python/aitemplate/backend/cuda/reduce/reduce_3d.py b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
index 58563d6d5..08557c484 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_3d.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
@@ -860,7 +860,7 @@ def gen_function(
     # FIXME: these alignments values are only for half_t type.
     # make it adjustable to other types such as float.
     alignments = [8, 4, 2, 1]
-    if x._attrs["dtype"] in ("float16",):
+    if x._attrs["dtype"] in ("float16", "bfloat16"):
         alignments.append(16)
     # This is ugly. Ideally, we should have templated code like below:
     # template <typename Alignment>
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
index a8c711706..72d54661b 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
@@ -223,6 +223,9 @@
     if constexpr (std::is_same_v<ElemOutputType, cutlass::half_t>) {
       HANDLE_ONE_WRITE_VEC(2, cutlass::half_t)
     }
+    else if constexpr (std::is_same_v<ElemOutputType, cutlass::bfloat16_t>) {
+      HANDLE_ONE_WRITE_VEC(2, cutlass::bfloat16_t)
+    }
     throw std::runtime_error("unsupported vector size for write");
   } else {
     throw std::runtime_error("unsupported num_row_per_threads");
@@ -272,6 +275,8 @@ def _get_read_vector_type(input_shape, input_type, force_min_vec_type=False) ->
     type_to_size_in_bit = {
         "half": 16,
         "cutlass::half_t": 16,
+        "bfloat16": 16,
+        "cutlass::bfloat16_t": 16,
         "float": 32,
     }
 
@@ -280,16 +285,29 @@ def _get_read_vector_type(input_shape, input_type, force_min_vec_type=False) ->
     # (2) the input type is inherited from reduce_3d, so we still
     #     use cutlass::half_t for fp16. We will replace it to half once we
     #     unify our half representation
-    vector_types = [
-        ("uint4", 16),
-        ("uint2", 8),
-        ("unsigned", 4),
-        ("cutlass::half_t", 2),
-    ]
+    vector_types = {
+        "cutlass::half_t": [
+            ("uint4", 16),
+            ("uint2", 8),
+            ("unsigned", 4),
+            ("cutlass::half_t", 2),
+        ],
+        "cutlass::bfloat16_t": [
+            ("uint4", 16),
+            ("uint2", 8),
+            ("unsigned", 4),
+            ("cutlass::bfloat16_t", 2),
+        ],
+        "float": [
+            ("uint4", 16),
+            ("uint2", 8),
+            ("unsigned", 4),
+        ],
+    }
 
     def _size_to_vector_type(sz_in_byte) -> str:
         """return vector_type for the given size"""
-        for vec_type, sz in vector_types:
+        for vec_type, sz in vector_types[input_type]:
             if sz_in_byte % sz == 0:
                 return vec_type
         raise NotImplementedError("Unsupported vector size: {}".format(sz_in_byte))
@@ -338,7 +356,7 @@ def _valid_vector_type(vec_type, sz_in_byte):
             return False
         return True
 
-    for vec_type, sz in vector_types:
+    for vec_type, sz in vector_types[input_type]:
         if _valid_vector_type(vec_type, sz):
             return vec_type
 
diff --git a/python/aitemplate/backend/cuda/reduce/var.py b/python/aitemplate/backend/cuda/reduce/var.py
index 0cd6fc3e7..25ed40eb7 100644
--- a/python/aitemplate/backend/cuda/reduce/var.py
+++ b/python/aitemplate/backend/cuda/reduce/var.py
@@ -66,7 +66,7 @@
       int new_count = new_data.count + count;
       ElementT nb_over_n = ElementT(new_data.count) / ElementT(new_count);
       mean = mean + delta * nb_over_n;
-      m2 =  m2 + new_data.m2 + delta * delta * count * nb_over_n;
+      m2 =  m2 + new_data.m2 + delta * delta * nb_over_n * ElementT(count);
       return WelfordData(new_count, mean, m2);
     }
 
diff --git a/tests/unittest/ops/test_norm.py b/tests/unittest/ops/test_norm.py
index 99951d4b6..848c2615c 100644
--- a/tests/unittest/ops/test_norm.py
+++ b/tests/unittest/ops/test_norm.py
@@ -195,6 +195,76 @@ def test_l2_norm_fp32(self):
             rtol=1e-1,
         )
 
+    def test_l2_norm_bf16(self):
+        self._run_l2_norm(
+            dim=0,
+            input_shape=[1],
+            keepdim=True,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[3, 2, 2048],
+            keepdim=False,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=1,
+            input_shape=[3, 1234, 4],
+            keepdim=True,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=1,
+            input_shape=[5, 60, 34, 4],
+            keepdim=False,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=0,
+            input_shape=[5, 60, 34, 4],
+            keepdim=False,
+            input_type="bfloat16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=2,
+            input_shape=[5, 1, 34, 4],
+            keepdim=False,
+            input_type="bfloat16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[4, 1230, 1237],
+            keepdim=True,
+            input_type="bfloat16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_l2_norm(
+            dim=-1,
+            input_shape=[1, 1000000, 6],
+            keepdim=True,
+            input_type="bfloat16",
+            output_type="float32",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+
     def _run_batched_vector_norm(
         self,
         *,
diff --git a/tests/unittest/ops/test_reduce.py b/tests/unittest/ops/test_reduce.py
index 8d617cb26..00d9af551 100644
--- a/tests/unittest/ops/test_reduce.py
+++ b/tests/unittest/ops/test_reduce.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -82,11 +81,10 @@ def _run_reduce(
 
         y = torch.empty_like(Y_pt)
         module.run_with_tensors([X_pt], [y])
-        y_pt = Y_pt.cpu().numpy()
 
-        np.testing.assert_equal(y_shape, y_pt.shape)
-        np.testing.assert_equal(string_to_torch_dtype(y_dtype), Y_pt.dtype)
-        np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=atol, rtol=rtol)
+        torch.testing.assert_close(y_shape, Y_pt.shape)
+        self.assertEqual(string_to_torch_dtype(y_dtype), Y_pt.dtype)
+        torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
         self.test_count += 1
 
     def _run_reduce_sum(
@@ -432,9 +430,8 @@ def _run_batched_reduce(
 
             y = torch.empty_like(Y_pt)
             module.run_with_tensors([X_pt], [y])
-            y_pt = Y_pt.cpu().numpy()
 
-            np.testing.assert_allclose(y_pt, y.cpu().numpy(), atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(Y_pt, y, atol=1e-2, rtol=1e-2)
             self.test_count += 1
 
     def _run_batched_reduce_sum(
@@ -529,6 +526,39 @@ def test_reduce_sum_float32(self):
             atol=1e-5,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_reduce_sum_bfloat16(self):
+        # reduce_smallaxis
+        self._run_reduce_sum(
+            dim=1,
+            input_shape=[1, 4],
+            keepdim=True,
+            input_type="bfloat16",
+            output_type=None,
+            rtol=1e-1,
+            atol=1e-1,
+        )
+        # reduce_3d
+        self._run_reduce_sum(
+            dim=-2,
+            input_shape=[3, 2048, 4],
+            keepdim=False,
+            input_type="bfloat16",
+            output_type=None,
+            rtol=1e-1,
+            atol=1e-1,
+        )
+        # reduce (common) 2d
+        self._run_reduce_sum(
+            dim=-1,
+            input_shape=[1270, 1223],
+            keepdim=False,
+            input_type="bfloat16",
+            output_type=None,
+            rtol=1e-0,
+            atol=1e-0,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_var.py b/tests/unittest/ops/test_var.py
index 9524c4b6a..58c999876 100644
--- a/tests/unittest/ops/test_var.py
+++ b/tests/unittest/ops/test_var.py
@@ -83,7 +83,7 @@ def _run_var(
         self.assertTrue(torch.allclose(Y_pt, y, atol=atol, rtol=rtol, equal_nan=True))
         self.test_count += 1
 
-    def test_var(self):
+    def test_var_float16(self):
         self._run_var(dim=-1, unbiased=True, input_shape=[1, 1], keepdim=False)
         self._run_var(dim=-1, unbiased=False, input_shape=[1, 1], keepdim=False)
         self._run_var(dim=-1, unbiased=True, input_shape=[1, 5], keepdim=False)
@@ -152,7 +152,7 @@ def test_batched_var(self):
         self._run_batched_var(dim=2, unbiased=True, keepdim=False)
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    def test_float32(self):
+    def test_var_float32(self):
         self._run_var(
             dim=-1,
             unbiased=False,
@@ -211,6 +211,36 @@ def test_float32(self):
             rtol=1e-5,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    def test_var_bfloat16(self):
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[2, 8],
+            keepdim=False,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_var(
+            dim=-1,
+            unbiased=False,
+            input_shape=[3, 2, 2050],
+            keepdim=False,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+        self._run_var(
+            dim=1,
+            unbiased=True,
+            input_shape=[1025, 2047],
+            keepdim=True,
+            input_type="bfloat16",
+            atol=1e-1,
+            rtol=1e-1,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 14797cbb548e8dcfd928990211ae5f3efd238caf Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 080/638] support bfloat16 for layernorm (#191)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/191

att

Reviewed By: jianyuh

Differential Revision: D42983945

fbshipit-source-id: cb6f72607df2d697976d492c9cedf265d2ec474a
---
 .../batch_layernorm_sigmoid_mul.py            |   3 +
 .../group_layernorm_sigmoid_mul.py            |   3 +
 .../layernorm_sigmoid_mul.py                  |   3 +
 .../layernorm_sigmoid_mul_kernel.cuh          | 484 +++++++++++++++++-
 tests/unittest/ops/test_layernorm.py          |  26 +-
 .../ops/test_layernorm_sigmoid_mul.py         | 113 +++-
 6 files changed, 609 insertions(+), 23 deletions(-)

diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
index 6292898b1..62419c299 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
@@ -32,10 +32,13 @@
 FUNC_TEMPLATE = jinja2.Template(
     """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "logging.h"
 
+using bfloat16 = __nv_bfloat16;
+
 namespace {
 
 {{gamma_beta_const_defs}}
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
index 7c6b34ec4..7937338b1 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
@@ -45,10 +45,13 @@
 FUNC_TEMPLATE = jinja2.Template(
     """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "logging.h"
 
+using bfloat16 = __nv_bfloat16;
+
 namespace {
 
 {{gamma_beta_const_defs}}
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
index 99140521f..9a1452822 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
@@ -32,10 +32,13 @@
 FUNC_TEMPLATE = jinja2.Template(
     """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "logging.h"
 
+using bfloat16 = __nv_bfloat16;
+
 {{gamma_beta_const_defs}}
 
 namespace {
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
index 1ce0bd6f3..7e677c305 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
@@ -40,6 +40,10 @@ struct half4 {
   half x, y, z, w;
 };
 
+struct bfloat16_4 {
+  bfloat16 x, y, z, w;
+};
+
 template <typename T, int NUM>
 __inline__ __device__ T warpReduceSum(T* val) {
 #pragma unroll
@@ -407,6 +411,136 @@ __global__ void layernorm_sigmoid_mul_stored_locally(
   }
 }
 
+// output [m, n] row-major
+// input [m, n] row-major
+// gamma [n]
+// beta [n]
+// grid [m]
+// block [block_size] -- each threadblock deals with block_size elements;
+// block_size = n / 4
+// block_size: round up to multiples of 32
+template <bool FuseSigmoidMul>
+__global__ void layernorm_sigmoid_mul_stored_locally(
+    bfloat16_4* output,
+    const bfloat16_4* input,
+    const bfloat16_4* gamma,
+    const bfloat16_4* beta,
+    const int n,
+    const float eps,
+    TensorAccessor input_accessor,
+    TensorAccessor output_accessor) {
+  const uint64_t m_idx = blockIdx.x;
+  const uint64_t tid = threadIdx.x;
+  __shared__ float s_mean, s_variance;
+
+  const uint64_t quarter_n = n >> 2;
+  const uint64_t offset = m_idx * quarter_n;
+
+  float local_sums[1] = {0.0f};
+  bfloat16_4 local_val_half{0.0f, 0.0f, 0.0f, 0.0f};
+  float4 local_val{0.0f, 0.0f, 0.0f, 0.0f};
+
+  if (tid < quarter_n) {
+    local_val_half = *input_accessor.get<const bfloat16, const bfloat16_4>(
+        input, offset + tid);
+
+    local_val = {
+        static_cast<float>(local_val_half.x),
+        static_cast<float>(local_val_half.y),
+        static_cast<float>(local_val_half.z),
+        static_cast<float>(local_val_half.w)};
+    local_sums[0] = local_val.x + local_val.y + local_val.z + local_val.w;
+  }
+
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_mean = local_sums[0] / n;
+  }
+  __syncthreads();
+
+  local_sums[0] = 0.0f;
+  if (tid < quarter_n) {
+    local_sums[0] = (local_val.x - s_mean) * (local_val.x - s_mean) +
+        (local_val.y - s_mean) * (local_val.y - s_mean) +
+        (local_val.z - s_mean) * (local_val.z - s_mean) +
+        (local_val.w - s_mean) * (local_val.w - s_mean);
+  }
+
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(local_sums[0] / n + eps);
+  }
+  __syncthreads();
+
+  if (tid < quarter_n) {
+#ifdef AIT_LAYERNORM_CONST_GAMMA
+    const float4 gamma_val = {
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA};
+#else
+    const bfloat16_4 gamma_val_half = gamma[tid];
+    const float4 gamma_val = {
+        static_cast<float>(gamma_val_half.x),
+        static_cast<float>(gamma_val_half.y),
+        static_cast<float>(gamma_val_half.z),
+        static_cast<float>(gamma_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_GAMMA
+
+#ifdef AIT_LAYERNORM_CONST_BETA
+    const float4 beta_val = {
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA};
+#else
+    const bfloat16_4 beta_val_half = beta[tid];
+    const float4 beta_val = {
+        static_cast<float>(beta_val_half.x),
+        static_cast<float>(beta_val_half.y),
+        static_cast<float>(beta_val_half.z),
+        static_cast<float>(beta_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_BETA
+
+    if (FuseSigmoidMul) {
+      local_val.x *= sigmoid(
+          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x));
+      local_val.y *= sigmoid(
+          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y));
+      local_val.z *= sigmoid(
+          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z));
+      local_val.w *= sigmoid(
+          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w));
+    } else {
+      local_val.x =
+          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x);
+      local_val.y =
+          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y);
+      local_val.z =
+          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z);
+      local_val.w =
+          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w);
+    }
+
+    local_val_half.x = __float2bfloat16_rn(local_val.x);
+    local_val_half.y = __float2bfloat16_rn(local_val.y);
+    local_val_half.z = __float2bfloat16_rn(local_val.z);
+    local_val_half.w = __float2bfloat16_rn(local_val.w);
+
+    *(output_accessor.get<bfloat16, bfloat16_4>(output, offset + tid)) =
+        local_val_half;
+  }
+}
+
 // output [m, n] row-major
 // input [m, n] row-major
 // gamma [n]
@@ -593,7 +727,7 @@ cudaError_t invokeLayernormSigmoidMul(
       input_accessor.is_valid_alignment(4) &&
       output_accessor.is_valid_alignment(4)) {
     block.x = (block.x / 4 + 31) / 32 * 32;
-    if constexpr (std::is_same<T, float>::value) {
+    if constexpr (std::is_same_v<T, float>) {
       layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
           <<<grid, block, 0, stream>>>(
               (float4*)output,
@@ -605,7 +739,7 @@ cudaError_t invokeLayernormSigmoidMul(
               input_accessor,
               output_accessor);
       LAYER_NORM_CUDA_CHECK_LAUNCH();
-    } else {
+    } else if constexpr (std::is_same_v<T, half>) {
       layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
           <<<grid, block, 0, stream>>>(
               (half4*)output,
@@ -617,6 +751,22 @@ cudaError_t invokeLayernormSigmoidMul(
               input_accessor,
               output_accessor);
       LAYER_NORM_CUDA_CHECK_LAUNCH();
+    } else if constexpr (std::is_same_v<T, bfloat16>) {
+      layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
+          <<<grid, block, 0, stream>>>(
+              (bfloat16_4*)output,
+              (const bfloat16_4*)input,
+              (const bfloat16_4*)gamma,
+              (const bfloat16_4*)beta,
+              n,
+              eps,
+              input_accessor,
+              output_accessor);
+      LAYER_NORM_CUDA_CHECK_LAUNCH();
+    } else {
+      static_assert(
+          std::is_same_v<T, half> || std::is_same_v<T, float> ||
+          std::is_same_v<T, bfloat16>);
     }
   } else if (n < 1024) {
     block.x = (block.x + 31) / 32 * 32;
@@ -878,6 +1028,140 @@ __global__ void batch_layernorm_sigmoid_mul_stored_locally(
   }
 }
 
+// output [b, m, n] row-major
+// input [b, m, n] row-major
+// gamma [b, n]
+// beta [b, n]
+// grid(b, m)
+// block(block_size) -- each threadblock deals with block_size elements
+// block_size = n / 4
+// block_size: round up to multiples of 32
+template <bool FuseSigmoidMul>
+__global__ void batch_layernorm_sigmoid_mul_stored_locally(
+    bfloat16_4* output,
+    const bfloat16_4* input,
+    const bfloat16_4* gamma,
+    const bfloat16_4* beta,
+    const int m,
+    const int n,
+    const float eps) {
+  const int b_idx = blockIdx.x;
+  const int m_idx = blockIdx.y;
+  const int tid = threadIdx.x;
+  __shared__ float s_mean, s_variance;
+
+  const int quarter_n = n >> 2;
+  const int offset = (m_idx + b_idx * m) * quarter_n;
+  const int gamma_beta_offset = b_idx * quarter_n;
+
+  input += offset;
+  output += offset;
+
+  gamma += gamma_beta_offset;
+  beta += gamma_beta_offset;
+
+  float local_sums[1] = {0.0f};
+  bfloat16_4 local_val_half{0.0f, 0.0f, 0.0f, 0.0f};
+  float4 local_val{0.0f, 0.0f, 0.0f, 0.0f};
+
+  if (tid < quarter_n) {
+    local_val_half = input[tid];
+    local_val = {
+        static_cast<float>(local_val_half.x),
+        static_cast<float>(local_val_half.y),
+        static_cast<float>(local_val_half.z),
+        static_cast<float>(local_val_half.w)};
+    local_sums[0] = local_val.x + local_val.y + local_val.z + local_val.w;
+  }
+
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_mean = local_sums[0] / n;
+  }
+  __syncthreads();
+
+  local_sums[0] = 0.0f;
+  if (tid < quarter_n) {
+    local_sums[0] = (local_val.x - s_mean) * (local_val.x - s_mean) +
+        (local_val.y - s_mean) * (local_val.y - s_mean) +
+        (local_val.z - s_mean) * (local_val.z - s_mean) +
+        (local_val.w - s_mean) * (local_val.w - s_mean);
+  }
+
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(local_sums[0] / n + eps);
+  }
+  __syncthreads();
+
+  if (tid < quarter_n) {
+#ifdef AIT_LAYERNORM_CONST_GAMMA
+    const float4 gamma_val = {
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA,
+        AIT_LAYERNORM_CONST_GAMMA};
+#else
+    const bfloat16_4 gamma_val_half = gamma[tid];
+    const float4 gamma_val = {
+        static_cast<float>(gamma_val_half.x),
+        static_cast<float>(gamma_val_half.y),
+        static_cast<float>(gamma_val_half.z),
+        static_cast<float>(gamma_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_GAMMA
+
+#ifdef AIT_LAYERNORM_CONST_BETA
+    const float4 beta_val = {
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA,
+        AIT_LAYERNORM_CONST_BETA};
+#else
+    const bfloat16_4 beta_val_half = beta[tid];
+    const float4 beta_val = {
+        static_cast<float>(beta_val_half.x),
+        static_cast<float>(beta_val_half.y),
+        static_cast<float>(beta_val_half.z),
+        static_cast<float>(beta_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_BETA
+
+    if (FuseSigmoidMul) {
+      local_val.x *= sigmoid(
+          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x));
+      local_val.y *= sigmoid(
+          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y));
+      local_val.z *= sigmoid(
+          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z));
+      local_val.w *= sigmoid(
+          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w));
+    } else {
+      local_val.x =
+          normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x);
+      local_val.y =
+          normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y);
+      local_val.z =
+          normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z);
+      local_val.w =
+          normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w);
+    }
+
+    local_val_half.x = __float2bfloat16_rn(local_val.x);
+    local_val_half.y = __float2bfloat16_rn(local_val.y);
+    local_val_half.z = __float2bfloat16_rn(local_val.z);
+    local_val_half.w = __float2bfloat16_rn(local_val.w);
+
+    output[tid] = local_val_half;
+  }
+}
+
 // output [b, m, n] row-major
 // input [b, m, n] row-major
 // gamma [b, n]
@@ -1197,7 +1481,7 @@ void invokeBatchLayernormSigmoidMul(
   dim3 block(n);
   if ((n % 4 == 0) && (n >= 128) && (n <= 4096)) {
     block.x = (block.x / 4 + 31) / 32 * 32;
-    if (std::is_same<T, float>::value) {
+    if constexpr (std::is_same<T, float>::value) {
       batch_layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
           <<<grid, block, 0, stream>>>(
               (float4*)output,
@@ -1207,7 +1491,7 @@ void invokeBatchLayernormSigmoidMul(
               m,
               n,
               eps);
-    } else {
+    } else if constexpr (std::is_same<T, half>::value) {
       batch_layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
           <<<grid, block, 0, stream>>>(
               (half4*)output,
@@ -1217,6 +1501,20 @@ void invokeBatchLayernormSigmoidMul(
               m,
               n,
               eps);
+    } else if constexpr (std::is_same<T, bfloat16>::value) {
+      batch_layernorm_sigmoid_mul_stored_locally<FuseSigmoidMul>
+          <<<grid, block, 0, stream>>>(
+              (bfloat16_4*)output,
+              (const bfloat16_4*)input,
+              (const bfloat16_4*)gamma,
+              (const bfloat16_4*)beta,
+              m,
+              n,
+              eps);
+    } else {
+      static_assert(
+          std::is_same_v<T, half> || std::is_same_v<T, float> ||
+          std::is_same_v<T, bfloat16>);
     }
   } else if (n < 1024) {
     block.x = (block.x + 31) / 32 * 32;
@@ -1411,6 +1709,162 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
   }
 }
 
+// output b * [m, n] row-major
+// input  b * [m, n] row-major
+// gamma b * [n]
+// beta  b * [n]
+// grid [b, m]
+// block [block_size] -- each thread deals with 4 elements
+// block_size = n / 4
+template <bool FuseSigmoidMul, int NumInputs>
+__device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
+    const Arguments<bfloat16_4, float, NumInputs>& args) {
+  const int b_idx = blockIdx.x;
+  const int m_idx = blockIdx.y;
+  const int tid = threadIdx.x;
+  __shared__ float s_mean, s_variance;
+  float local_sums[1] = {0.0f};
+
+  bfloat16_4* output = args.outputs[b_idx];
+  const bfloat16_4* input = args.inputs[b_idx];
+  const bfloat16_4* gamma = args.gammas[b_idx];
+  const bfloat16_4* beta = args.betas[b_idx];
+  const TensorAccessor& input_accessor = args.input_accessors[b_idx];
+  const TensorAccessor& output_accessor = args.output_accessors[b_idx];
+
+  const int n = args.N[b_idx];
+  const int quarter_n = n >> 2;
+  const int offset = m_idx * quarter_n;
+
+  const int block_size = blockDim.x;
+  const int num_iters =
+      ceil(static_cast<float>(quarter_n) / static_cast<float>(block_size));
+
+  bfloat16_4 local_val_half{0, 0, 0, 0};
+  float4 local_val{0.0f, 0.0f, 0.0f, 0.0f};
+
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+
+    if (elem_no < quarter_n) {
+      local_val_half = *input_accessor.get<const bfloat16, const bfloat16_4>(
+          input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
+      local_sums[0] += local_val.x + local_val.y + local_val.z + local_val.w;
+    }
+  }
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_mean = local_sums[0] / n;
+  }
+  __syncthreads();
+  local_sums[0] = 0.0f;
+
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+    if (elem_no < quarter_n) {
+      local_val_half = *input_accessor.get<const bfloat16, const bfloat16_4>(
+          input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
+      local_sums[0] += (local_val.x - s_mean) * (local_val.x - s_mean) +
+          (local_val.y - s_mean) * (local_val.y - s_mean) +
+          (local_val.z - s_mean) * (local_val.z - s_mean) +
+          (local_val.w - s_mean) * (local_val.w - s_mean);
+    }
+  }
+  if (blockDim.x <= 32) {
+    warpReduceSum<float, 1>(local_sums);
+  } else {
+    blockReduceSum<float, 1>(local_sums);
+  }
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(local_sums[0] / n + args.eps);
+  }
+  __syncthreads();
+
+  for (size_t i = 0; i < num_iters; ++i) {
+    int elem_no = tid + block_size * i;
+    if (elem_no < quarter_n) {
+      local_val_half = *input_accessor.get<const bfloat16, const bfloat16_4>(
+          input, offset + elem_no);
+      local_val = {
+          static_cast<float>(local_val_half.x),
+          static_cast<float>(local_val_half.y),
+          static_cast<float>(local_val_half.z),
+          static_cast<float>(local_val_half.w)};
+#ifdef AIT_LAYERNORM_CONST_GAMMA
+      const float4 gamma_val = {
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA,
+          AIT_LAYERNORM_CONST_GAMMA};
+#else
+      const bfloat16_4 gamma_val_half = gamma[elem_no];
+      const float4 gamma_val = {
+          static_cast<float>(gamma_val_half.x),
+          static_cast<float>(gamma_val_half.y),
+          static_cast<float>(gamma_val_half.z),
+          static_cast<float>(gamma_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_GAMMA
+
+#ifdef AIT_LAYERNORM_CONST_BETA
+      const float4 beta_val = {
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA,
+          AIT_LAYERNORM_CONST_BETA};
+#else
+      const bfloat16_4 beta_val_half = beta[elem_no];
+      const float4 beta_val = {
+          static_cast<float>(beta_val_half.x),
+          static_cast<float>(beta_val_half.y),
+          static_cast<float>(beta_val_half.z),
+          static_cast<float>(beta_val_half.w)};
+#endif // AIT_LAYERNORM_CONST_BETA
+
+      if constexpr (FuseSigmoidMul) {
+        local_val.x *= sigmoid(normalize(
+            local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x));
+        local_val.y *= sigmoid(normalize(
+            local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y));
+        local_val.z *= sigmoid(normalize(
+            local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z));
+        local_val.w *= sigmoid(normalize(
+            local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w));
+      } else {
+        local_val.x =
+            normalize(local_val.x, s_mean, s_variance, gamma_val.x, beta_val.x);
+        local_val.y =
+            normalize(local_val.y, s_mean, s_variance, gamma_val.y, beta_val.y);
+        local_val.z =
+            normalize(local_val.z, s_mean, s_variance, gamma_val.z, beta_val.z);
+        local_val.w =
+            normalize(local_val.w, s_mean, s_variance, gamma_val.w, beta_val.w);
+      }
+
+      local_val_half.x = __float2bfloat16_rn(local_val.x);
+      local_val_half.y = __float2bfloat16_rn(local_val.y);
+      local_val_half.z = __float2bfloat16_rn(local_val.z);
+      local_val_half.w = __float2bfloat16_rn(local_val.w);
+
+      *(output_accessor.get<bfloat16, bfloat16_4>(output, offset + elem_no)) =
+          local_val_half;
+    }
+  }
+}
+
 #define GROUP_LAYER_NORM_MAX_INLINE_INPUTS 39
 
 template <
@@ -1435,6 +1889,28 @@ __global__ void group_layernorm_sigmoid_mul_stored_locally_half(
       *args);
 }
 
+template <
+    bool FuseSigmoidMul,
+    int NumInputs,
+    std::enable_if_t<NumInputs <= GROUP_LAYER_NORM_MAX_INLINE_INPUTS, bool> =
+        true>
+__global__ void group_layernorm_sigmoid_mul_stored_locally_bfloat16(
+    Arguments<bfloat16_4, float, NumInputs> args) {
+  group_layernorm_sigmoid_mul_stored_locally_impl<FuseSigmoidMul, NumInputs>(
+      args);
+}
+
+template <
+    bool FuseSigmoidMul,
+    int NumInputs,
+    std::enable_if_t<(NumInputs > GROUP_LAYER_NORM_MAX_INLINE_INPUTS), bool> =
+        true>
+__global__ void group_layernorm_sigmoid_mul_stored_locally_bfloat16(
+    const Arguments<bfloat16_4, float, NumInputs>* args) {
+  group_layernorm_sigmoid_mul_stored_locally_impl<FuseSigmoidMul, NumInputs>(
+      *args);
+}
+
 // output b * [m, n] row-major
 // input  b * [m, n] row-major
 // gamma b * [n]
diff --git a/tests/unittest/ops/test_layernorm.py b/tests/unittest/ops/test_layernorm.py
index 54440b6ed..bf0f283e7 100644
--- a/tests/unittest/ops/test_layernorm.py
+++ b/tests/unittest/ops/test_layernorm.py
@@ -40,6 +40,8 @@ def _test_layernorm(
         beta_is_none=False,
         use_size_op=False,
         eps=1e-5,
+        atol=1e-3,
+        rtol=1e-3,
         dtype="float16",
     ):
         torch_dtype = string_to_torch_dtype(dtype)
@@ -116,7 +118,7 @@ def _test_layernorm(
                 inputs["beta"] = x3_pt
             x4 = torch.empty([batch_size, *MS, *NS], dtype=torch_dtype).cuda()
             module.run_with_tensors(inputs, [x4])
-            torch.testing.assert_close(x4, x4_pt, atol=1e-3, rtol=1e-3)
+            torch.testing.assert_close(x4, x4_pt, atol=atol, rtol=rtol)
             self.test_count += 1
 
     def test_layernorm(self):
@@ -157,6 +159,28 @@ def test_layernorm_fp32(self):
         self._test_layernorm(MS=(16, 64), NS=(4, 32), dtype="float32")
         self._test_layernorm(MS=(16, 8, 4), NS=(2, 4, 32), dtype="float32")
 
+    @unittest.skipIf(
+        detect_target().name() == "rocm", "fp32 layer norm is not supported on ROCm"
+    )
+    def test_layernorm_bf16(self):
+        self._test_layernorm(dtype="bfloat16", atol=1e-2, rtol=1e-2)
+        self._test_layernorm(gamma_is_none=True, dtype="bfloat16", atol=1e-2, rtol=1e-2)
+        self._test_layernorm(beta_is_none=True, dtype="bfloat16", atol=1e-2, rtol=1e-2)
+        self._test_layernorm(
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        self._test_layernorm(eps=0.1, dtype="bfloat16", atol=1e-2, rtol=1e-2)
+        self._test_layernorm(
+            MS=(16, 64), NS=(4, 32), dtype="bfloat16", atol=1e-2, rtol=1e-2
+        )
+        self._test_layernorm(
+            MS=(16, 8, 4), NS=(2, 4, 32), dtype="bfloat16", atol=1e-2, rtol=1e-2
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_layernorm_sigmoid_mul.py b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
index abd22835f..1c383393f 100644
--- a/tests/unittest/ops/test_layernorm_sigmoid_mul.py
+++ b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
@@ -33,8 +33,6 @@
 class FusedLayernormSigmoidMulTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(FusedLayernormSigmoidMulTestCase, self).__init__(*args, **kwargs)
-        self._atol = 1e-2
-        self._rtol = 1e-3
         self._test_id = 0
 
     def _test_fused_layernorm_sigmoid_mul(
@@ -44,6 +42,8 @@ def _test_fused_layernorm_sigmoid_mul(
         gamma_is_none=False,
         beta_is_none=False,
         use_size_op=False,
+        atol=1e-2,
+        rtol=1e-2,
         eps=1e-5,
         dtype="float16",
     ):
@@ -127,7 +127,7 @@ def _test_fused_layernorm_sigmoid_mul(
                     inputs["beta"] = x3_pt
                 x6 = torch.empty_like(x6_pt)
                 module.run_with_tensors(inputs, [x6])
-                torch.testing.assert_close(x6, x6_pt, atol=self._atol, rtol=self._rtol),
+                torch.testing.assert_close(x6, x6_pt, atol=atol, rtol=rtol),
 
     def test_fused_layernorm_sigmoid_mul_fp16(self):
         for eps in (1e-5, 1e-1):
@@ -292,6 +292,86 @@ def test_fused_layernorm_sigmoid_mul_fp32(self):
             dtype="float32",
         )
 
+    def test_fused_layernorm_sigmoid_mul_bf16(self):
+        for eps in (1e-5, 1e-1):
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                eps=eps,
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+            )
+            # block_size = n kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(515,),
+                eps=eps,
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+            )
+            # block_size = 512 kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1055,),
+                eps=eps,
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+            )
+
+        # test ND inputs
+        eps = 1e-5
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(64, 8),
+            eps=eps,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        # block_size = n kernel
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(213, 2),
+            eps=eps,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 2),
+            NS=(3, 2),
+            eps=eps,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        # block_size = 512 kernel
+        self._test_fused_layernorm_sigmoid_mul(
+            MS=(2, 4),
+            NS=(1055, 5),
+            eps=eps,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+
+        self._test_fused_layernorm_sigmoid_mul(
+            NS=(1496,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        self._test_fused_layernorm_sigmoid_mul(
+            NS=(515,),
+            gamma_is_none=True,
+            beta_is_none=True,
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+
     # dim0 is batch size
     def _test_batch_fused_layernorm_sigmoid_mul(
         self,
@@ -301,6 +381,8 @@ def _test_batch_fused_layernorm_sigmoid_mul(
         beta_is_none=False,
         use_size_op=False,
         eps=1e-5,
+        atol=1e-2,
+        rtol=1e-2,
         dtype="float16",
     ):
         logging.info(
@@ -398,11 +480,7 @@ def _test_batch_fused_layernorm_sigmoid_mul(
                     inputs["beta"] = beta_pt
                 x4 = torch.empty_like(y_t)
                 module.run_with_tensors(inputs, [x4])
-                self.assertTrue(
-                    torch.allclose(x4, y_t, atol=self._atol, rtol=self._rtol),
-                    f"max diff: {torch.max(x4 - y_t) if y_t.numel() > 0 else 0}, "
-                    f"min diff: {torch.min(x4 - y_t) if y_t.numel() > 0 else 0}",
-                )
+                torch.testing.assert_close(x4, y_t, atol=atol, rtol=rtol)
 
     # dim1 is the batch size
     def _test_batch_fused_layernorm_sigmoid_mul_dim1(
@@ -411,6 +489,8 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
         N,
         gamma_is_none=False,
         beta_is_none=False,
+        atol=1e-2,
+        rtol=1e-2,
         dtype="float16",
     ):
         logging.info(
@@ -497,16 +577,13 @@ def _test_batch_fused_layernorm_sigmoid_mul_dim1(
                     inputs["beta"] = beta_pt
                 x4 = torch.empty_like(y_t)
                 module.run_with_tensors(inputs, [x4])
-                self.assertTrue(
-                    torch.allclose(x4, y_t, atol=self._atol, rtol=self._rtol),
-                    f"max diff: {torch.max(x4 - y_t) if y_t.numel() > 0 else 0}, "
-                    f"min diff: {torch.min(x4 - y_t) if y_t.numel() > 0 else 0}",
-                )
+                torch.testing.assert_close(x4, y_t, atol=atol, rtol=rtol)
 
     @parameterized.expand(
         [
             param("float16"),
             param("float32"),
+            param("bfloat16"),
         ]
     )
     def test_batch_fused_layernorm_sigmoid_mul(self, dtype: str):
@@ -610,6 +687,8 @@ def _test_group_fused_layernorm_sigmoid_mul(
         use_size_op=False,
         eps=1e-5,
         fuse_sigmoid_mul=True,
+        atol=1e-2,
+        rtol=1e-2,
         dtype="float16",
     ):
         testname = (
@@ -744,16 +823,13 @@ def _test_group_fused_layernorm_sigmoid_mul(
         for i in range(B):
             logging.debug(f"output: {i}")
             y = outputs[i]
-            self.assertTrue(
-                torch.allclose(ys_pt[i], y, atol=self._atol, rtol=self._rtol),
-                f"max diff: {torch.max(ys_pt[i]- y) if y.numel() > 0 else 0}, "
-                f"min diff: {torch.min(ys_pt[i] - y) if y.numel() > 0 else 0}",
-            )
+            torch.testing.assert_close(ys_pt[i], y, atol=atol, rtol=rtol)
 
     @parameterized.expand(
         [
             param("float16"),
             param("float32"),
+            param("bfloat16"),
         ]
     )
     def test_group_fused_layernorm_sigmoid_mul(self, dtype: str):
@@ -921,6 +997,7 @@ def test_group_fused_layernorm_sigmoid_mul(self, dtype: str):
         [
             param("float16"),
             param("float32"),
+            param("bfloat16"),
         ]
     )
     def test_group_layernorm(self, dtype: str):

From 63f4e9d6d37f2c657c59ae943f8b6ab41767a486 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 081/638] add bfloat16 test coverage for permute ops (#192)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/192

att

Reviewed By: jianyuh

Differential Revision: D43007558

fbshipit-source-id: 71a7857d58d7611cc5911a20bac6130aff0905bc
---
 .../common/tensor/permute0213_common.py       | 42 +++++++++++++++++++
 .../common/tensor/permute102_common.py        | 41 ++++++++++++++++++
 .../aitemplate/backend/cuda/tensor/permute.py |  4 ++
 .../backend/cuda/tensor/permute0213.py        |  3 ++
 .../backend/cuda/tensor/permute102.py         |  2 +
 tests/unittest/ops/test_permute.py            | 26 ++++++++++++
 tests/unittest/ops/test_permute021.py         | 18 ++++++++
 tests/unittest/ops/test_permute102.py         | 19 +++++++++
 tests/unittest/ops/test_permute210.py         | 19 +++++++++
 9 files changed, 174 insertions(+)

diff --git a/python/aitemplate/backend/common/tensor/permute0213_common.py b/python/aitemplate/backend/common/tensor/permute0213_common.py
index 7dceb9cd9..cde9bda01 100644
--- a/python/aitemplate/backend/common/tensor/permute0213_common.py
+++ b/python/aitemplate/backend/common/tensor/permute0213_common.py
@@ -134,6 +134,48 @@
 {{indent}}      stream
 {{indent}}  );
 {{indent}}}
+{% elif dtype == "bfloat16" %}
+{{indent}}if (x_dim3 % 8 == 0) {
+{{indent}}  permute0213_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 8,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim3 % 4 == 0) {
+{{indent}}  permute0213_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim3 % 2 == 0) {
+{{indent}}  permute0213_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute0213_launcher<bfloat16>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      x_dim3,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
 {% endif %}
 {{indent}}return;
 """
diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py
index ba8200abd..cd705c9ce 100644
--- a/python/aitemplate/backend/common/tensor/permute102_common.py
+++ b/python/aitemplate/backend/common/tensor/permute102_common.py
@@ -138,6 +138,46 @@
 {{indent}}      stream
 {{indent}}  );
 {{indent}}}
+{% elif dtype == "bfloat16" %}
+{{indent}}if (x_dim2 % 8 == 0) {
+{{indent}}  permute102_launcher<float4>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 8,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim2 % 4 == 0) {
+{{indent}}  permute102_launcher<float2>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 4,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else if (x_dim2 % 2 == 0) {
+{{indent}}  permute102_launcher<float>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2 / 2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}} else {
+{{indent}}  permute102_launcher<bfloat16>(
+{{indent}}      in_ptr,
+{{indent}}      out_ptr,
+{{indent}}      x_dim0,
+{{indent}}      x_dim1,
+{{indent}}      x_dim2,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+{% else %}
+{{indent}} static_assert(std::is_same_v<T, half> || std::is_same_v<T, float> || std::is_same_v<T, bfloat16>, "Unsupported dtype");
 {% endif %}
 {{indent}}return;
 """
@@ -153,6 +193,7 @@
 #define DIRECT_BLOCK_Z 2
 
 namespace {
+using bfloat16 = __nv_bfloat16;
 
 template<typename T>
 __global__ void permute102_tiled_kernel(T* output,
diff --git a/python/aitemplate/backend/cuda/tensor/permute.py b/python/aitemplate/backend/cuda/tensor/permute.py
index 6c7746f5b..e23aef5ff 100644
--- a/python/aitemplate/backend/cuda/tensor/permute.py
+++ b/python/aitemplate/backend/cuda/tensor/permute.py
@@ -62,10 +62,13 @@
 #include <limits>
 #include <stdexcept>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/arch/memory_sm80.h"
 #include "cutlass/cutlass.h"
 #include "logging.h"
 
+using bfloat16 = __nv_bfloat16;
+
 namespace {
 
 {{custom_libs}}
@@ -119,6 +122,7 @@ def gen_function(func_attrs: Dict[str, Any]) -> str:
     dtype = x.dtype()
     assert dtype in (
         "float16",
+        "bfloat16",
         "float32",
         "float",
     ), "permute is only tested for floating point type"
diff --git a/python/aitemplate/backend/cuda/tensor/permute0213.py b/python/aitemplate/backend/cuda/tensor/permute0213.py
index b277eff87..2d84d3299 100644
--- a/python/aitemplate/backend/cuda/tensor/permute0213.py
+++ b/python/aitemplate/backend/cuda/tensor/permute0213.py
@@ -24,9 +24,12 @@
 
 Header_files = """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/util/host_tensor.h"
+
+using bfloat16 = __nv_bfloat16;
 """
 
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute102.py b/python/aitemplate/backend/cuda/tensor/permute102.py
index ddee78a74..d28b36aa8 100644
--- a/python/aitemplate/backend/cuda/tensor/permute102.py
+++ b/python/aitemplate/backend/cuda/tensor/permute102.py
@@ -24,9 +24,11 @@
 
 Header_files = """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/util/host_tensor.h"
+
 """
 
 
diff --git a/tests/unittest/ops/test_permute.py b/tests/unittest/ops/test_permute.py
index 1578e8bd8..8543cfff1 100644
--- a/tests/unittest/ops/test_permute.py
+++ b/tests/unittest/ops/test_permute.py
@@ -113,6 +113,32 @@ def test_generic_permute_fp32(self, input_shapes, dims):
             testname="test_generic_permute_fp32",
         )
 
+    @parameterized.expand(
+        [
+            param((80, 300, 2), (0, 2, 1)),
+            param((80, 300, 2), (1, 0, 2)),
+            param((80, 300, 2), (2, 1, 0)),
+            param((5, 113, 15, 31), (0, 2, 1, 3)),
+            param((3, 1, 113, 15, 64), (2, 0, 3, 1, 4)),
+            param((8, 29, 100000, 3), (0, 2, 1, 3)),
+            param((32, 12, 4096, 64), (0, 2, 1, 3)),
+            param((1, 12, 128, 64), (0, 2, 1, 3)),
+            param((2, 3, 4, 5), (3, 2, 1, 0)),
+            param((3, 5, 128, 514), (2, 3, 0, 1)),
+            param((128, 512), (1, 0)),
+            param((5, 113, 15, 31), (0, 1, 3, 2)),
+            param((3, 1, 113, 15, 64), (0, 1, 2, 4, 3)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported by ROCm.")
+    def test_generic_permute_bf16(self, input_shapes, dims):
+        self._test_generic_permute(
+            input_shapes=input_shapes,
+            dims=dims,
+            torch_dtype=torch.bfloat16,
+            testname="test_generic_permute_bf16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_permute021.py b/tests/unittest/ops/test_permute021.py
index ac2dfa27f..8fe7c56b7 100644
--- a/tests/unittest/ops/test_permute021.py
+++ b/tests/unittest/ops/test_permute021.py
@@ -97,6 +97,24 @@ def test_permute021_fp32(self, input_shape, dims):
             dtype="float32",
         )
 
+    @parameterized.expand(
+        [
+            param((2, 384, 262), (0, 2, 1)),
+            param((2, 3, 384, 262), (0, 1, 3, 2)),
+            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported on ROCm")
+    def test_permute021_bf16(self, input_shape, dims):
+        self._test_permute_021(
+            input_shape=input_shape,
+            dims=dims,
+            test_name="permute021_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_permute102.py b/tests/unittest/ops/test_permute102.py
index 0069f783e..588d1edd1 100644
--- a/tests/unittest/ops/test_permute102.py
+++ b/tests/unittest/ops/test_permute102.py
@@ -98,6 +98,25 @@ def test_permute102_fp32(self, input_shape):
             dtype="float32",
         )
 
+    @parameterized.expand(
+        [
+            param((80, 300, 2)),
+            param((31, 7, 3)),
+            param((256, 128, 7)),
+            param((128, 256, 8)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported on ROCm")
+    def test_permute102_bf16(self, input_shape):
+        self._test_permute_102(
+            input_shape=input_shape,
+            test_name="permute102_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_permute210.py b/tests/unittest/ops/test_permute210.py
index aaa68f502..c9135e3f6 100644
--- a/tests/unittest/ops/test_permute210.py
+++ b/tests/unittest/ops/test_permute210.py
@@ -98,6 +98,25 @@ def test_permute210_fp32(self, input_shape):
             dtype="float32",
         )
 
+    @parameterized.expand(
+        [
+            param((2, 80, 300)),
+            param((80, 300, 2)),
+            param((300, 2, 80)),
+            param((31, 7, 3)),
+            param((128, 128, 63)),
+            param((256, 256, 64)),
+            param((IntVar([2, 3]), 256, 64)),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported on ROCm")
+    def test_permute210_bf16(self, input_shape):
+        self._test_permute_210(
+            input_shape=input_shape,
+            test_name="permute210_bf16",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 61b0ceb483d4fd679db65e0feaf9618aa457e549 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 082/638] support bfloat16 dtype in slice op (#195)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/195

att

Reviewed By: chenyang78

Differential Revision: D43008045

fbshipit-source-id: 446eec154e35e65565b9e8a1ea15fd6d38ca7287
---
 .../backend/common/tensor/slice_common.py     |  5 +++++
 tests/unittest/ops/test_slice.py              | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py
index e3c89a09f..4db0adf5e 100644
--- a/python/aitemplate/backend/common/tensor/slice_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_common.py
@@ -239,6 +239,7 @@
 
 enum class LoadVecType {
   VT_HALF = 0,
+  VT_BFLOAT16 = 0,
   VT_FLOAT,
   VT_FLOAT2,
   VT_FLOAT4
@@ -262,6 +263,8 @@
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
   if constexpr (std::is_same_v<ELEM_T, half>) {
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  } else if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
   }
 
 #undef HANDLE_ONE_VEC_TYPE
@@ -444,6 +447,8 @@
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
     if constexpr (std::is_same_v<ELEM_T, half>) {
       HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+    } else if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
     }
 
   throw std::runtime_error("Invalid LoadVecType\\n");
diff --git a/tests/unittest/ops/test_slice.py b/tests/unittest/ops/test_slice.py
index 06e0d50c9..b3d5789c3 100644
--- a/tests/unittest/ops/test_slice.py
+++ b/tests/unittest/ops/test_slice.py
@@ -160,6 +160,15 @@ def test_dynamic_slice_float32(self):
             input_type="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_dynamic_slice_bfloat16(self):
+        self._run_dynamic_slice(
+            input_shape=[2, 3, 5],
+            start_indices=[None, 0, 0],
+            end_indices=[2, None, -1],
+            input_type="bfloat16",
+        )
+
 
 class DynamicSliceBatchedTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
@@ -280,6 +289,16 @@ def test_batch_dynamic_slice_float32(self):
             input_type="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_batch_dynamic_slice_bfloat16(self):
+        self._run_batch_dynamic_slice(
+            batch_sizes=[5, 3, 9],
+            input_shape=[2, 4, 3],
+            start_indices=[None, 1, None, -1],
+            end_indices=[None, None, -1, 0],
+            input_type="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 79c96dd1971e658698d86044047e1b5544304e0b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 083/638] add bfloat16 coverage for split op (#196)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/196

att

Reviewed By: chenyang78

Differential Revision: D43008342

fbshipit-source-id: e68d936f227b6f573a57d3a83a880d94ef51015c
---
 .../aitemplate/backend/common/split_common.py |  5 +++
 tests/unittest/ops/test_split.py              | 39 ++++++++-----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/python/aitemplate/backend/common/split_common.py b/python/aitemplate/backend/common/split_common.py
index 44c0d040c..694b401e4 100644
--- a/python/aitemplate/backend/common/split_common.py
+++ b/python/aitemplate/backend/common/split_common.py
@@ -160,6 +160,7 @@
 
 enum class LoadVecType {
   VT_HALF = 0,
+  VT_BFLOAT16,
   VT_FLOAT,
   VT_FLOAT2,
   VT_FLOAT4
@@ -189,6 +190,8 @@
   HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
   if constexpr (std::is_same_v<ELEM_T, half>) {
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+  } else if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+    HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
   }
 
 #undef HANDLE_ONE_VEC_TYPE
@@ -272,6 +275,8 @@
     HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float)
     if constexpr (std::is_same_v<ELEM_T, half>) {
       HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half)
+    } else if constexpr (std::is_same_v<ELEM_T, bfloat16>) {
+      HANDLE_ONE_VEC_TYPE(LoadVecType::VT_BFLOAT16, bfloat16)
     }
 
   throw std::runtime_error("Invalid LoadVecType\\n");
diff --git a/tests/unittest/ops/test_split.py b/tests/unittest/ops/test_split.py
index a6c828cd5..30073da3c 100644
--- a/tests/unittest/ops/test_split.py
+++ b/tests/unittest/ops/test_split.py
@@ -15,7 +15,6 @@
 import logging
 import unittest
 
-import numpy as np
 import torch
 
 from aitemplate.compiler import compile_model, ops
@@ -75,7 +74,7 @@ def _run_split(
                 [idx for idx, mask in enumerate(output_masks) if not mask]
             )
             Ys = split_op._attrs["outputs"]
-        np.testing.assert_equal(len(Ys_pt), len(Ys))
+        self.assertEqual(len(Ys_pt), len(Ys))
 
         y_shapes = []
         for idx, Y in enumerate(Ys):
@@ -96,7 +95,7 @@ def _run_split(
 
         for idx, y_pt in enumerate(Ys_pt):
             y = outputs[f"output_{idx}"]
-            self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.equal(y_pt, y))
         self.test_count += 1
 
     def _run_batch_split(
@@ -146,7 +145,7 @@ def _run_batch_split(
                 else torch.split(X_pt, split_size_or_sections, dim)
             )
 
-            np.testing.assert_equal(len(Ys_pt), len(Ys))
+            self.assertEqual(len(Ys_pt), len(Ys))
 
             y_shapes = [Y_pt.size() for Y_pt in Ys_pt]
             outputs = {
@@ -160,7 +159,7 @@ def _run_batch_split(
 
             for idx, y_pt in enumerate(Ys_pt):
                 y = outputs[f"output_{idx}"]
-                self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+                self.assertTrue(torch.equal(y_pt, y))
             self.test_count += 1
 
     def test_split(self):
@@ -237,34 +236,30 @@ def test_batch_split(self):
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_split_float(self):
-        self._run_split(
-            input_shape=[2, 3], split_size_or_sections=2, dim=1, input_type="float"
-        )
-        self._run_split(
-            input_shape=[4097, 128, 64],
-            split_size_or_sections=1024,
-            dim=0,
-            input_type="float",
-        )
         self._run_split(
             input_shape=[8, 6, 4],
-            split_size_or_sections=(2, 2),
-            dim=2,
+            split_size_or_sections=(2, 4),
+            dim=1,
             input_type="float",
         )
         self._run_batch_split(
-            batch_sizes=[1, 1],
-            input_shape=[2, 1],
-            split_size_or_sections=1,
-            dim=1,
-            input_type="float",
+            batch_sizes=[11, 5, 9],
+            input_shape=[2, 9, 4],
+            split_size_or_sections=[2, 4, 3],
+            dim=2,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_split_bfloat16(self):
+        self._run_split(
+            input_shape=[2, 3], split_size_or_sections=2, dim=1, input_type="bfloat16"
         )
         self._run_batch_split(
             batch_sizes=[3, 4],
             input_shape=[2, 3, 4],
             split_size_or_sections=2,
             dim=2,
-            input_type="float",
+            input_type="bfloat16",
         )
 
 
From f37cd17944aac7f0dd17ad22c979694cbaaa0def Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 14:14:31 -0800
Subject: [PATCH 084/638] support bfloat16 for groupnorm (#194)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/194

att

Reviewed By: chenyang78

Differential Revision: D43013584

fbshipit-source-id: 62d81bec1e3766b0b0841248b46dcbc43ab86a44
---
 .../cuda/groupnorm/groupnorm_common.py        |   3 +
 .../cuda/groupnorm/groupnorm_kernel.cuh       | 109 +++++++++++++++---
 .../backend/cuda/groupnorm/layer_norm.cuh     |   8 ++
 tests/unittest/ops/test_groupnorm.py          |  31 ++++-
 4 files changed, 135 insertions(+), 16 deletions(-)

diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 1213743b5..06db17cc3 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -61,6 +61,7 @@
     """
 #include <cuda.h>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cuda_runtime.h>
 
 #include <cub/cub.cuh>
@@ -71,6 +72,8 @@
 #include <math_constants.h>
 #include <assert.h>
 
+using bfloat16 = __nv_bfloat16;
+using bfloat16_2 = __nv_bfloat162;
 
 {{gamma_beta_const_defs}}
 
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index 78566b9e4..a99d9ff54 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -33,45 +33,102 @@ constexpr uint32_t kFinalMask = 0xffffffff;
 #define GROUP_NORM_CUDA_CHECK_LAUNCH() GROUP_NORM_CUDA_CHECK(cudaGetLastError())
 #endif
 
+#ifndef __HALF_TO_US
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
+#endif
+
+#define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
+
 __device__ half fast_tanh(half x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 750)
+
+  asm volatile("tanh.approx.f16 %0, %1;"
+               : "=h"(__HALF_TO_US(x))
+               : "h"(__HALF_TO_US(x)));
+  return x;
+
+#else
   return half(cutlass::fast_tanh(float(x)));
+#endif
 }
 
-__inline__ __device__ float sigmoid(float val) {
-  return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f;
+__device__ bfloat16 fast_tanh(bfloat16 x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 900)
+  asm volatile("tanh.approx.bf16 %0, %1;"
+               : "=h"(__HALF_TO_US(x))
+               : "h"(__HALF_TO_US(x)));
+  return x;
+
+#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return cutlass::fast_tanh(float(x));
+#else
+  NOT_IMPLEMENTED();
+#endif
 }
 
-__inline__ __device__ half constant_half() {
-  const uint16_t bits = 0x3800u;
-  return reinterpret_cast<half const&>(bits);
+#define CUDA_FP16_ONE_HALF \
+  __half_raw {             \
+    0x3800u                \
+  }
+#define CUDA_FP16_ONE \
+  __half_raw {        \
+    0x3c00u           \
+  }
+#define CUDA_BF16_ONE_HALF \
+  __nv_bfloat16_raw {      \
+    0x3f00u                \
+  }
+#define CUDA_BF16_ONE \
+  __nv_bfloat16_raw { \
+    0x3f80u           \
+  }
+
+__device__ float sigmoid(const float a) {
+  return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;
 }
 
-__inline__ __device__ half one() {
-  const uint16_t bits = 0x3c00u;
-  return reinterpret_cast<half const&>(bits);
+__device__ half hsigmoid(const half a) {
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
+      CUDA_FP16_ONE_HALF);
 }
 
-__inline__ __device__ half hsigmoid(half a) {
-  const half half_val = constant_half();
-  const half one_val = one();
-  return __hmul((__hadd(fast_tanh(__hmul(a, half_val)), one_val)), half_val);
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+__device__ bfloat16 bf16sigmoid(const bfloat16 a) {
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
+      CUDA_BF16_ONE_HALF);
 }
+#endif
 
 template <typename T>
 struct FSigmoid {
-  __inline__ __device__ T operator()(T input);
+  __inline__ __device__ T operator()(const T input) const;
 };
 
 template <>
 struct FSigmoid<half> {
-  __inline__ __device__ half operator()(half a) {
+  __inline__ __device__ half operator()(const half a) const {
     return hsigmoid(a);
   }
 };
 
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+template <>
+struct FSigmoid<bfloat16> {
+  __inline__ __device__ bfloat16 operator()(const bfloat16 a) const {
+    return bf16sigmoid(a);
+  }
+};
+#endif
+
 template <>
 struct FSigmoid<float> {
-  __inline__ __device__ float operator()(float a) {
+  __inline__ __device__ float operator()(const float a) const {
     return sigmoid(a);
   }
 };
@@ -129,6 +186,14 @@ __forceinline__ __device__ half Rsqrt<half>(half x) {
   return hrsqrt(x);
 }
 
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+template <>
+__forceinline__ __device__ bfloat16 Rsqrt<bfloat16>(bfloat16 x) {
+  return hrsqrt(x);
+}
+#endif
+
 #undef __AIT_GN_USE_FAST_MATH
 
 template <typename T>
@@ -393,6 +458,20 @@ struct TInputHelper<float> {
   }
 };
 
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+template <>
+struct TInputHelper<bfloat16> {
+  typedef bfloat16_2 vec2_type;
+  static __inline__ __device__ float2 to_float2(vec2_type a) {
+    return __bfloat1622float2(a);
+  }
+  static __inline__ __device__ vec2_type to_vec2(float2 a) {
+    return __float22bfloat162_rn(a);
+  }
+};
+#endif
+
 } // namespace detail
 
 template <
diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index 386fd69ae..e6c4595c3 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -152,6 +152,14 @@ struct DefaultComputeType<half> {
   using type = float;
 };
 
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+template <>
+struct DefaultComputeType<bfloat16> {
+  using type = float;
+};
+#endif
+
 // #if CUDA_VERSION >= 11000
 // template<>
 // struct DefaultComputeType<nv_bfloat16> {
diff --git a/tests/unittest/ops/test_groupnorm.py b/tests/unittest/ops/test_groupnorm.py
index 881eecc9a..549ea357d 100644
--- a/tests/unittest/ops/test_groupnorm.py
+++ b/tests/unittest/ops/test_groupnorm.py
@@ -45,6 +45,8 @@ def _test_groupnorm(
         eps=1e-5,
         use_swish=False,
         copy_op=False,
+        atol=1e-2,
+        rtol=1e-2,
         dtype="float16",
     ):
         test_name = "group_norm_swish" if use_swish else "group_norm"
@@ -112,7 +114,7 @@ def _test_groupnorm(
         # print("pt: ", t)
 
         torch.testing.assert_close(
-            x4, x4_pt.permute(0, 2, 3, 1).contiguous(), atol=1e-2, rtol=1e-2
+            x4, x4_pt.permute(0, 2, 3, 1).contiguous(), atol=atol, rtol=rtol
         )
         self.test_count += 1
 
@@ -181,6 +183,33 @@ def test_float32(self):
             use_swish=True,
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bf16 is supported with CUDA sm80+",
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    def test_groupnorm_bfloat16(self):
+        # H % 8 != 0
+        self._test_groupnorm(
+            x_shape=[7, 13, 9, 12],
+            num_groups=4,
+            eps=1e-5,
+            atol=1e-1,
+            rtol=1e-1,
+            dtype="bfloat16",
+            use_swish=True,
+        )
+        # H % 8 == 0
+        self._test_groupnorm(
+            x_shape=[2, 16, 16, 640],
+            num_groups=32,
+            eps=1e-5,
+            atol=1e-1,
+            rtol=1e-1,
+            dtype="bfloat16",
+            use_swish=True,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From b1197a7a7b38894ffadf789d79e0a9dc11a513b0 Mon Sep 17 00:00:00 2001
From: tissue3 <173666635@qq.com>
Date: Tue, 7 Feb 2023 15:52:44 -0800
Subject: [PATCH 085/638] Fix flaky tests with fixed random seed (#214)

Summary:
Gemm_rcr_bias_fast_gelu and gemm_rcr_fast_gelu tests suffers from non-deterministic results, i.e. some numerical mismatch (e.g. 1/4096 with maximum atol of 0.12). This PR fixes flaky tests with fixed random seed.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/214

Reviewed By: hl475, chenyang78

Differential Revision: D43096632

Pulled By: tissue3

fbshipit-source-id: 9389a7b8fc1fdadb5b51fb876ac4bc3b7bbe2097
---
 tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py | 4 ++++
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py      | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index 027cd6cb7..abfc65e88 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -29,6 +29,10 @@
 
 
 class GEMMRcrBiasFastGeluTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_rcr(
         self, Ms, test_name, use_fast_gelu=True, dtype="float16", atol=1e-1, rtol=1e-1
     ):
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
index 51ef92ac5..b86cce950 100644
--- a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -50,6 +50,10 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class GEMMRcrFastGeluTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(10)
+
     def _test_rcr(
         self, Ms, test_name, use_fast_gelu=True, atol=1e-1, rtol=1e-1, dtype="float16"
     ):

From 919b345e8d628f16cebfe803b991cc91609bbc34 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 7 Feb 2023 17:04:31 -0800
Subject: [PATCH 086/638] fix oss lint (#215)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/215

fix lint

Reviewed By: wushirong

Differential Revision: D43103377

fbshipit-source-id: 904fee259921a11c925c5a0b3b67c10ecc576654
---
 fx2ait/fx2ait/test/converters/test_ait_conv3d.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index 603c61535..034201556 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -1,4 +1,17 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 
 import torch
 from fx2ait.acc_tracer import acc_ops
@@ -100,6 +113,7 @@ def __init__(self):
 
             def forward(self, x):
                 return self.relu(self.conv(x))
+
         model = TestModule().cuda().half()
         inputs = [torch.randn(4, ci, d, h, w).cuda().half()]
 

From 8292caf713f9842878409fed2e94a4ded02d03d4 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 7 Feb 2023 23:05:25 -0800
Subject: [PATCH 087/638] place slice_reshape_scatter's output with a suitable
 offset (#207)

Summary:
This PR actually fixed two issues:

(1) Previously, there was an issue related to place the generated slice_reshape_scatter into a wrong postion in the output, because we did not take the relevant offset into account. This PR fixed the problem by simply introducing an output_offset argument to the relevant kernel. Please check the comment in the gen_function body of slice_reshape_scatter_common for the rationales.

(2) When we update the newly-created slice_reshape_scatter op, we should add it into the "dst_ops" set of its "inputs" rather than overwriting the entire set.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/207

Reviewed By: tissue3

Differential Revision: D43058204

Pulled By: chenyang78

fbshipit-source-id: 65937290b9614a1ca2d48893cae5fa1453272a20
---
 .../backend/common/tensor/slice_common.py     | 10 ++-
 .../tensor/slice_reshape_scatter_common.py    | 10 +++
 .../ops/tensor/slice_reshape_scatter.py       | 34 ++++++-
 .../compiler/test_slice_reshape_scatter.py    | 90 ++++++++++++++++++-
 4 files changed, 136 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py
index 4db0adf5e..a20f7dd3e 100644
--- a/python/aitemplate/backend/common/tensor/slice_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_common.py
@@ -369,6 +369,7 @@
           {{index_type}}  ElemsPerThread, {{index_type}}  ThreadsPerBlock>
 void slice_scatter_kernel_launcher(
     ELEM_T *output,
+    {{index_type}} output_offset,
     const int64_t *output_shape,
     const ELEM_T *inputs[],
     const int64_t *input_shapes[],
@@ -400,7 +401,7 @@
     scatter_dim_offset += slice_meta_data.dim_sizes[i];
   }
 
-  LoadVecType min_vec_type = LoadVecType::VT_FLOAT4;
+  LoadVecType min_vec_type = get_vec_type<ELEM_T>(output_offset);
   for ({{index_type}}  i = 0; i < NumInputs; i++) {
     LoadVecType vec_type = get_input_vec_type<ELEM_T, Rank>(
         scatter_meta_data.output_strides,
@@ -435,7 +436,7 @@
       }                                                                       \\
       slice_scatter_kernel<vec_type, ELEM_T, Rank, NumInputs, ElemsPerThread> \\
         <<<grid_config, ThreadsPerBlock, 0, stream>>>(                        \\
-            output,                                                           \\
+            output + output_offset,                                           \\
             slice_meta_data,                                                  \\
             scatter_meta_data);                                               \\
       LAUNCH_CHECK_SLICE();                                                   \\
@@ -516,7 +517,8 @@
 {{indent}}                                {{num_inputs}}/*NumInputs*/,
 {{indent}}                                {{elems_per_thread}}/*ElemsPerThread*/,
 {{indent}}                                {{threads_per_block}}/*ThreadsPerBlock*/>(
-{{indent}}      static_cast<{{elem_type}}*>(output), local_output_shape, reinterpret_cast<const {{elem_type}}**>(inputs), input_shapes,
+{{indent}}      static_cast<{{elem_type}}*>(output), {{output_offset}}, local_output_shape,
+{{indent}}      reinterpret_cast<const {{elem_type}}**>(inputs), input_shapes,
 {{indent}}      slice_start_indices, slice_end_indices, scatter_dim, stream);
 {{indent}}  return;
 {{indent}}}
@@ -698,6 +700,7 @@ def gen_function(
     func_attrs,
     backend_spec,
     elems_per_thread=8,
+    output_offset=0,
     update_output_shape=True,
     element_func=None,
     element_func_def=None,
@@ -748,6 +751,7 @@ def gen_function(
         elem_type=input_type,
         elems_per_thread=elems_per_thread,
         threads_per_block=128,
+        output_offset=output_offset,
     )
 
     shape_func = SHAPE_UPDATE_FUNC.render(
diff --git a/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
index b8901a062..0162eab33 100644
--- a/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
@@ -80,9 +80,19 @@ def gen_function(
     # TODO: consider to profile elems_per_thread
     elems_per_thread = 8 if len(func_attrs["inputs"]) == 1 else 256
     element_func_def = None if element_func is None else tanh_def.render()
+    # slice_reshape_scatter is a temporary solution for a special fusion pattern.
+    # It will be replaced with a more general slice + concat pass once it's
+    # ready. Second, the constrains of slice_reshape_scatter ensure that its
+    # output_accessor's stride is actually linear offset in the output tensor.
+    # So, let's not to pollute a common slice kernel with output TensorAccessors
+    # at the moment since we do not support output TensorAccessors for slice
+    # op yet, which may have perf implication to the kernel as well.
+    output_accessor = func_attrs["output_accessors"][0]
+    output_offset = output_accessor.offset
     return slice_common.gen_function(
         func_attrs,
         backend_spec=backend_spec,
+        output_offset=output_offset,
         elems_per_thread=elems_per_thread,
         update_output_shape=False,
         element_func=element_func,
diff --git a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
index cb8b34819..9c3facda3 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
@@ -15,6 +15,7 @@
 """
 Slice_reshape_scatter.
 """
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 from .... import backend
 from ....backend import registry
@@ -85,15 +86,44 @@ def _update_inputs_outputs(self, cat_op, reshape_op, cat_op_2):
                 idx = i
                 break
         assert idx >= 0
+        # The original output of this slice_reshape_scatter op is the output
+        # of the reshape op.
+        self._attrs["output_accessors"] = [
+            TensorAccessor(reshape_op._attrs["outputs"][0])
+        ]
         cat_op_2.remove_input_at(idx)
         transform_utils.remove_single_tensor_op_from_sorted_graph(reshape_op)
 
         self._attrs["inputs"] = [
             op._attrs["inputs"][0] for op in self._attrs["slice_ops"]
         ]
-        self._attrs["outputs"] = cat_op_2._attrs["outputs"]
+        cat_op_2_outputs = cat_op_2._attrs["outputs"]
+        assert len(cat_op_2_outputs) == 1, (
+            f'{cat_op_2._attrs["name"]=} may only have one output, but got more '
+            f"{cat_op_2_outputs=}"
+        )
+        self._attrs["outputs"] = cat_op_2_outputs
+
+        # setup output TensorAccessor
+        offset = 0
+        cat_dim = cat_op_2._attrs["concat_dim"]
+        orig_idx = -1
+        for i, input_tensor in enumerate(cat_op_2._attrs["original_inputs"]):
+            if input_tensor == reshape_op._attrs["outputs"][0]:
+                orig_idx = i
+                break
+            input_tensor_shape = input_tensor._attrs["shape"]
+            offset += input_tensor_shape[cat_dim].value()
+        assert orig_idx >= 0, (
+            f'could not find {input_tensor._attrs["name"]=} in the original_inputs'
+            "of cat_op_2"
+        )
+        self._attrs["output_accessors"][0].update_base_tensor(
+            cat_op_2_outputs[0], cat_dim, offset
+        )
+
         for x in self._attrs["inputs"]:
-            x._attrs["dst_ops"] = {self}
+            x._attrs["dst_ops"].add(self)
         for y in self._attrs["outputs"]:
             y._attrs["src_ops"].add(self)
 
diff --git a/tests/unittest/compiler/test_slice_reshape_scatter.py b/tests/unittest/compiler/test_slice_reshape_scatter.py
index d60779452..f202ceb09 100644
--- a/tests/unittest/compiler/test_slice_reshape_scatter.py
+++ b/tests/unittest/compiler/test_slice_reshape_scatter.py
@@ -63,7 +63,7 @@ def _run_one_test(
             Ys_pt.append(Y_pt)
         Y1_pt = torch.cat(Ys_pt, dim)
         Y2_pt = torch.reshape(Y1_pt, reshape_to)
-        Y_pt = torch.cat([Y2_pt, input_X_pt], dim=dim)
+        Y_pt = torch.cat([input_X_pt, Y2_pt, input_X_pt], dim=dim)
         if add_tanh:
             Y_pt = torch.tanh(Y_pt)
 
@@ -85,7 +85,7 @@ def _run_one_test(
         concat_op_2 = ops.concatenate()
         if add_tanh:
             concat_op_2 = ops.concatenate_tanh()
-        Y = concat_op_2([Y2, input_X], dim)
+        Y = concat_op_2([input_X, Y2, input_X], dim)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
@@ -102,7 +102,7 @@ def _run_one_test(
         Y_src_ops = Y._attrs["src_ops"]
         np.testing.assert_equal(len(Y_src_ops), 2)
         np.testing.assert_equal(concat_op_2 in Y_src_ops, True)
-        np.testing.assert_equal(concat_op_2._attrs["input_masks"], [False, True])
+        np.testing.assert_equal(concat_op_2._attrs["input_masks"], [True, False, True])
         Y_src_ops_list = list(Y_src_ops)
         slice_reshape_scatter_op = (
             Y_src_ops_list[1] if concat_op_2 == Y_src_ops_list[0] else Y_src_ops_list[0]
@@ -122,6 +122,22 @@ def _run_one_test(
         self.test_count += 1
 
     def test_slice_scatter_reshape(self):
+        self._run_one_test(
+            input_shapes=[[1, 2], [1, 2]],
+            input_start_indices=[[0, 0], [0, 0]],
+            input_end_indices=[[1, 2], [1, 2]],
+            reshape_to=[1, 2, 2],
+            input_x_shape=[1, 1, 2],
+            dim=1,
+        )
+        self._run_one_test(
+            input_shapes=[[10, 20], [15, 44]],
+            input_start_indices=[[1, 5], [2, 10]],
+            input_end_indices=[[4, 15], [5, 22]],
+            reshape_to=[3, 2, 11],
+            input_x_shape=[3, 1, 11],
+            dim=1,
+        )
         self._run_one_test(
             input_shapes=[[8, 16], [20, 30]],
             input_start_indices=[[0, 4], [12, 2]],
@@ -152,6 +168,74 @@ def test_slice_scatter_reshape_float(self):
             dtype="float",
         )
 
+    def test_slice_scatter_reshape_float16_2(self):
+        dtype = "float16"
+        input_shape = [2, 6]
+        input0 = Tensor(shape=input_shape, dtype=dtype, name="input0", is_input=True)
+        input1 = Tensor(shape=input_shape, dtype=dtype, name="input1", is_input=True)
+        input2_shape = [2, 3, 2]
+        input2 = Tensor(shape=input2_shape, dtype=dtype, name="input2", is_input=True)
+
+        start_indices = [0, 0]
+        end_indices = [None, 2]
+        slice_0 = ops.dynamic_slice()(
+            input0, start_indices=start_indices, end_indices=end_indices
+        )
+        slice_1 = ops.dynamic_slice()(
+            input0, start_indices=start_indices, end_indices=end_indices
+        )
+        concat_dim = 1
+        concat_2 = ops.concatenate()([slice_0, slice_1], concat_dim)
+        reshape_to = [-1, 2, 2]
+        reshape_3 = ops.reshape()(concat_2, reshape_to)
+
+        slice_4 = ops.dynamic_slice()(
+            input1, start_indices=start_indices, end_indices=end_indices
+        )
+        slice_5 = ops.dynamic_slice()(
+            input1, start_indices=start_indices, end_indices=end_indices
+        )
+        concat_6 = ops.concatenate()([slice_4, slice_5], concat_dim)
+        reshape_7 = ops.reshape()(concat_6, reshape_to)
+
+        Y = ops.concatenate()([input2, reshape_3, reshape_7], concat_dim)
+        Y._attrs["name"] = "y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = "test.so"
+        test_name = "slice_scatter_reshape_cat_float16_2"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        Y_src_ops = Y._attrs["src_ops"]
+        self.assertEqual(len(Y_src_ops), 3)
+        slice_reshape_scatter_cnt = 0
+        for op in Y_src_ops:
+            if op._attrs["op"] == "slice_reshape_scatter":
+                slice_reshape_scatter_cnt += 1
+        self.assertEqual(slice_reshape_scatter_cnt, 2)
+
+        slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+        input0_pt = get_random_torch_tensor(input_shape, dtype)
+        slice_0_pt = input0_pt[slice_indices]
+        slice_1_pt = input0_pt[slice_indices]
+        concat_2_pt = torch.cat([slice_0_pt, slice_1_pt], concat_dim)
+        reshape_3_pt = torch.reshape(concat_2_pt, reshape_to)
+
+        input1_pt = get_random_torch_tensor(input_shape, dtype)
+        slice_4_pt = input1_pt[slice_indices]
+        slice_5_pt = input1_pt[slice_indices]
+        concat_6_pt = torch.cat([slice_4_pt, slice_5_pt], concat_dim)
+        reshape_7_pt = torch.reshape(concat_6_pt, reshape_to)
+
+        input2_pt = get_random_torch_tensor(input2_shape, dtype)
+        y_pt = torch.cat([input2_pt, reshape_3_pt, reshape_7_pt], concat_dim)
+
+        inputs = {"input0": input0_pt, "input1": input1_pt, "input2": input2_pt}
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 6e987d4dbfdffde433df2e883a5eedd270fecd9d Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 7 Feb 2023 23:07:56 -0800
Subject: [PATCH 088/638] Added an interface to query the current profiler
 cache version (#217)

Summary:
It also unified the cache version for all tables.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/217

Reviewed By: wushirong

Differential Revision: D43106719

Pulled By: chenyang78

fbshipit-source-id: bd14071439220ec780fe16bd23f4a1071ef5d6d8
---
 python/aitemplate/backend/profiler_cache.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 6818110c6..e76ba38a0 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -457,6 +457,13 @@ class CacheMode(enum.Enum):
 )
 
 
+__AIT_CACHE_VERSION__ = 2
+
+
+def ait_cache_version() -> int:
+    return __AIT_CACHE_VERSION__
+
+
 class ProfileCacheDB(object):
     r"""Local SQLite profile cache database."""
 
@@ -481,8 +488,12 @@ def __init__(
         self._mode = CacheMode.LOCAL
         self._db_commit_flag = False
         # Some design rationales:
-        #   * Each table maintains it own version number. This can avoid re-creating
-        #     tables that are not involved with the breaking changes.
+        #   * All tables share the version number, because we are exposing the
+        #     the cache version. Using a single version number seems to make it
+        #     more clean. One caveat is that we are going to re-create all tables
+        #     even if some of them are not involved with the breaking changes.
+        #     It seems to be fine as we expect the frequency of cache version
+        #     updated to be quite low.
         #   * We only keep a single table (i.e. version) for each category (
         #     gemm, conv and norm) to simplify how we handle breaking changes
         #     and rollbacks caused by failures in the updated version.
@@ -492,9 +503,9 @@ def __init__(
         #     leave some content from the failing version in the db. How are we
         #     going to update the db if we update the version again, and so on.
         # TODO: add similar version control for norm
-        self._gemm_cache_version = 2
-        self._conv_cache_version = 2
-        self._conv3d_cache_version = 2
+        self._gemm_cache_version = ait_cache_version()
+        self._conv_cache_version = ait_cache_version()
+        self._conv3d_cache_version = ait_cache_version()
         if uri is not None:
             self._mode = CacheMode.REMOTE
         if self._mode == CacheMode.LOCAL:

From 9d4bf052c137f7b8bf983f1336f8554839c94937 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005367269 <generatedunixname89002005367269@fb.com>
Date: Wed, 8 Feb 2023 01:21:17 -0800
Subject: [PATCH 089/638] Daily `arc lint --take BLACK`

Reviewed By: ivanmurashko

Differential Revision: D43114752

fbshipit-source-id: fa61bb776336663d84f4dd8ea8ae993562e55f5c
---
 fx2ait/fx2ait/converters/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index 26d89f704..696df2d60 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -141,6 +141,7 @@ def nchw2nhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
 def ncdhw2ndhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
     return [shape[0], shape[2], shape[3], shape[4], shape[1]]
 
+
 # TODO:  This is a hack to workaround AIT's dynamic shape requirement.
 # Detailed explanation can be found in D41743385 (aten2ait) D41974191(fx2ait).
 # We will throw this one after AIT provides vanilla support.

From e07a45996e43b8c4461be60e4c4e41656bd48652 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Wed, 8 Feb 2023 10:06:29 -0800
Subject: [PATCH 090/638] Skip some AITemplate unittests if not running on A100
 (#221)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/221

Reviewed By: wushirong

Differential Revision: D43115032

fbshipit-source-id: 6314ecda215c87fbc2833a85ddcafa02e5bc951b
---
 .../test_fused_elementwise_complex_dependency.py | 16 ++++++++++++++++
 tests/unittest/compiler/test_refine_graph.py     |  4 ++++
 .../compiler/test_slice_elemwise_fusion.py       |  8 ++++++++
 .../compiler/test_slice_reshape_scatter.py       |  4 ++++
 tests/unittest/compiler/test_strided_view_op.py  | 16 ++++++++++++++++
 tests/unittest/ops/test_cross_attention.py       |  4 ++++
 .../ops/test_fused_elementwise_broadcast.py      |  4 ++++
 ...est_fused_elementwise_with_strided_outputs.py |  8 ++++++++
 8 files changed, 64 insertions(+)

diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 45b4ab545..9fb7a0cab 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -286,6 +286,10 @@ def test_fused_elementwise_non_elementwise_ops(self, dtype):
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_indirect_input_dependency(self, dtype):
         r"""
@@ -365,6 +369,10 @@ def test_fused_elementwise_indirect_input_dependency(self, dtype):
         module.run_with_tensors(inputs, [r3])
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype):
         r"""
@@ -460,6 +468,10 @@ def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype)
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_multi_dependency(self, dtype):
         r"""
@@ -567,6 +579,10 @@ def test_fused_elementwise_multi_dependency(self, dtype):
         module.run_with_tensors(inputs, [r7])
         self.assertTrue(torch.allclose(r7, r7_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_find_fusable_graph(self, dtype):
         r"""
diff --git a/tests/unittest/compiler/test_refine_graph.py b/tests/unittest/compiler/test_refine_graph.py
index 685b8438c..2fddd99cd 100644
--- a/tests/unittest/compiler/test_refine_graph.py
+++ b/tests/unittest/compiler/test_refine_graph.py
@@ -116,6 +116,10 @@ def test_elementwise_ops_single_input_no_refine(self):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_elementwise_ops_single_input(self):
         dtype = "float16"
         M = 10
diff --git a/tests/unittest/compiler/test_slice_elemwise_fusion.py b/tests/unittest/compiler/test_slice_elemwise_fusion.py
index fa961e4a7..f80a28668 100644
--- a/tests/unittest/compiler/test_slice_elemwise_fusion.py
+++ b/tests/unittest/compiler/test_slice_elemwise_fusion.py
@@ -310,6 +310,10 @@ def _test_slice_elemwise_fusion_dynamic(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self.test_count += 1
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_slice_elemwise_fusion_dynamic(self):
         self._test_slice_elemwise_fusion_dynamic(
             slice_input_shape=([5, 16], 10),
@@ -339,6 +343,10 @@ def test_slice_elemwise_fusion_dynamic(self):
             expected_data_t="half",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_slice_elemwise_fusion_dynamic_broadcast(self):
         # slice_output broadcasts to input_x2
         self._test_slice_elemwise_fusion_dynamic(
diff --git a/tests/unittest/compiler/test_slice_reshape_scatter.py b/tests/unittest/compiler/test_slice_reshape_scatter.py
index f202ceb09..4c5b9f591 100644
--- a/tests/unittest/compiler/test_slice_reshape_scatter.py
+++ b/tests/unittest/compiler/test_slice_reshape_scatter.py
@@ -121,6 +121,10 @@ def _run_one_test(
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_slice_scatter_reshape(self):
         self._run_one_test(
             input_shapes=[[1, 2], [1, 2]],
diff --git a/tests/unittest/compiler/test_strided_view_op.py b/tests/unittest/compiler/test_strided_view_op.py
index d413b7a0f..6f841592a 100644
--- a/tests/unittest/compiler/test_strided_view_op.py
+++ b/tests/unittest/compiler/test_strided_view_op.py
@@ -172,6 +172,10 @@ def __init__(self, *args, **kwargs):
         super(StridedViewOpTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand(
         [
             param(f"single_gemm_{name}_fusion_{dtype}", func, dtype)
@@ -280,6 +284,10 @@ def test_single_bmm_and_multi_view_fusible(self, test_name, func, dtype):
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self._test_id += 1
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand(
         [
             param(
@@ -331,6 +339,10 @@ def test_single_op_and_view_fusible(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self._test_id += 1
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     @parameterized.expand(
         [
             param(f"single_op_{name}_non_fusion_{dtype}", func, dtype)
@@ -457,6 +469,10 @@ def _test_two_parallel_views(self, dtype="float16"):
             self.assertTrue(torch.allclose(y1, y1_pt, atol=1e-2, rtol=1e-2))
             self.assertTrue(torch.allclose(y2, y2_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_two_views(self):
         self._test_two_parallel_views()
         self._test_two_serial_view_outputs()
diff --git a/tests/unittest/ops/test_cross_attention.py b/tests/unittest/ops/test_cross_attention.py
index 675a1ed3e..86a8e6e23 100644
--- a/tests/unittest/ops/test_cross_attention.py
+++ b/tests/unittest/ops/test_cross_attention.py
@@ -134,6 +134,10 @@ def _test_mha(
             )
             print("Batch {} MHA verification pass".format(batch_size))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_cross_attn(self):
         self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
         self._test_mha(
diff --git a/tests/unittest/ops/test_fused_elementwise_broadcast.py b/tests/unittest/ops/test_fused_elementwise_broadcast.py
index b8784b9e0..e7943d42f 100644
--- a/tests/unittest/ops/test_fused_elementwise_broadcast.py
+++ b/tests/unittest/ops/test_fused_elementwise_broadcast.py
@@ -87,6 +87,10 @@ def _test_different_dim(
             module.run_with_tensors(inputs, [x4])
             self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_different_dim_fp16(self):
         self._test_different_dim(
             batch_sizes=[1024],
diff --git a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
index b5398eb38..2419b3eb1 100644
--- a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
+++ b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
@@ -122,6 +122,10 @@ def _test_fused_elementwise_with_strided_outputs(
                     # Do comparisons.
                     self.assertTrue(torch.allclose(x7, x7_pt, atol=1e-2, rtol=1e-2))
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_all_aligned_fp16(self):
         self._test_fused_elementwise_with_strided_outputs(
             batch0_sizes=[1],
@@ -217,6 +221,10 @@ def test_all_aligned_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by cuda sm<80",
+    )
     def test_not_aligned_fp16(self):
         self._test_fused_elementwise_with_strided_outputs(
             batch0_sizes=[8],

From a72ce799cc585917858d908da4ad3a992a134393 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 8 Feb 2023 10:48:51 -0800
Subject: [PATCH 091/638] fix oss docs build (#218)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/218

docs builds are failing e.g. https://github.com/facebookincubator/AITemplate/actions/runs/4119426179/jobs/7113074011

Reviewed By: chenyang78

Differential Revision: D43109649

fbshipit-source-id: ab0c1b7333e88263d7607429edf2ef0850f33574
---
 .github/workflows/docs.yaml | 9 ++-------
 docs/source/conf.py         | 4 ++--
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 6c1bd8ba9..3ebf6640d 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -22,16 +22,11 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        pip install autodocsumm
-        pip install sphinx_rtd_theme
-        pip install sphinx_gallery
-        pip install sphinxcontrib-inlinesyntaxhighlight
-        pip install sphinx_toolbox
+        python3.9 -m pip install --upgrade pip
+        python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox
         cd python
         python setup.py develop
         cd ..
-        pip install numpy
     - name: Build documents with Sphinx
       run: |
         cd docs
diff --git a/docs/source/conf.py b/docs/source/conf.py
index bf239d5d1..51fbf50db 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,11 +18,11 @@
 # -- Project information -----------------------------------------------------
 
 project = "AITemplate"
-copyright = "2022, Meta Platforms"
+copyright = "2022-2023, Meta Platforms"
 author = "Meta Platforms"
 
 # The full version, including alpha/beta/rc tags
-release = "0.1"
+release = "0.2"
 
 
 # -- General configuration ---------------------------------------------------

From 73a860c69568e133f9f67b7c930ac88098dfb6e0 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Wed, 8 Feb 2023 20:33:07 -0800
Subject: [PATCH 092/638] skip conv3d test on Github CI (#225)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/225

The tests were not working on SM75 and throwing test fails (see [the test](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate/335/workflows/24e491ea-49d3-4ecc-bc3a-fc22e81e30c9/jobs/479/tests)) So this diff skips the test on SM75 hardware.

Reviewed By: terrychenism

Differential Revision: D43132815

fbshipit-source-id: 0b8b1c81e4d040d43723d90443d0b0618a3e3453
---
 .../fx2ait/test/converters/test_ait_conv3d.py |  52 ++------
 .../converters/test_ait_conv3d_depthwise.py   | 114 ++++++++++++++++++
 2 files changed, 124 insertions(+), 42 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py

diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index 034201556..bd88f0952 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -12,13 +12,23 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import unittest
 
 import torch
+
+from aitemplate.testing import detect_target
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
 from parameterized import param, parameterized
 
 
+@unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class TestAitConv3d(AITTestCase):
     @parameterized.expand(
         [
@@ -37,48 +47,6 @@ class TestAitConv3d(AITTestCase):
                 w=224,
                 bias=False,
             ),
-            param(
-                name="depthwise_conv3d",
-                kernel_size=(1, 1, 1),
-                stride=(1, 1, 1),
-                padding=0,
-                dilation=1,
-                ci=96,
-                co=96,
-                groups=96,
-                d=2,
-                h=56,
-                w=56,
-                bias=True,
-            ),
-            param(
-                name="depthwise_conv3d_2",
-                kernel_size=(1, 1, 1),
-                stride=(1, 1, 1),
-                padding=0,
-                dilation=1,
-                ci=96,
-                co=96,
-                groups=96,
-                d=2,
-                h=28,
-                w=28,
-                bias=True,
-            ),
-            param(
-                name="depthwise_conv3d_3",
-                kernel_size=(1, 1, 1),
-                stride=(1, 1, 1),
-                padding=0,
-                dilation=1,
-                ci=96,
-                co=96,
-                groups=96,
-                d=2,
-                h=7,
-                w=7,
-                bias=True,
-            ),
         ]
     )
     def test_conv3d(
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
new file mode 100644
index 000000000..9dbd5ef97
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
@@ -0,0 +1,114 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+
+import torch
+
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestAitDepthwiseConv3d(AITTestCase):
+    @parameterized.expand(
+        [
+            param(
+                name="depthwise_conv3d",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=56,
+                w=56,
+                bias=True,
+            ),
+            param(
+                name="depthwise_conv3d_2",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=28,
+                w=28,
+                bias=True,
+            ),
+            param(
+                name="depthwise_conv3d_3",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=2,
+                h=7,
+                w=7,
+                bias=True,
+            ),
+        ]
+    )
+    def test_depthwise_conv3d(
+        self,
+        name,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        ci=8,
+        co=8,
+        groups=1,
+        d=4,
+        h=224,
+        w=224,
+        bias=False,
+    ):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv3d(
+                    ci,
+                    co,
+                    kernel_size,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                    bias,
+                )
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+
+        model = TestModule().cuda().half()
+        inputs = [torch.randn(4, ci, d, h, w).cuda().half()]
+
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.conv3d},
+            permute_inputs=[0, 2, 3, 4, 1],  # inputs should be NDHWC
+            permute_outputs=[0, 4, 1, 2, 3],
+        )

From b05dff119b2337628243051e108071eed008e69a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Wed, 8 Feb 2023 22:49:21 -0800
Subject: [PATCH 093/638] split slice_reshape_scatter into multiple ones if it
 has too many inputs (#219)

Summary:
The CUDA compiler has a hard requirement on the maximum total size of kernel parameters. Similar to our concatenate and split kernels, slice_reshape_scatter may also hit such limitation in some cases. This PR splits a slice_reshape_scatter op into multiple ones if the original op reaches the limintation.

Note that the slice_scatter op also has a similar issue, which will be fixed in a separate PR.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/219

Reviewed By: ipiszy

Differential Revision: D43112730

Pulled By: chenyang78

fbshipit-source-id: 7fcd9a41834e45f9f9f541b6ee9bb80492047ecd
---
 .../ops/tensor/slice_reshape_scatter.py       |  32 ++--
 .../compiler/transform/optimize_graph.py      |   2 +
 .../split_large_slice_scatter_ops.py          | 160 ++++++++++++++++++
 .../transform/transform_strided_ops.py        |   2 +-
 .../test_split_large_slice_scatter.py         | 111 ++++++++++++
 5 files changed, 293 insertions(+), 14 deletions(-)
 create mode 100644 python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
 create mode 100644 tests/unittest/compiler/test_split_large_slice_scatter.py

diff --git a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
index 9c3facda3..1aad49a71 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
@@ -15,6 +15,8 @@
 """
 Slice_reshape_scatter.
 """
+from typing import Optional
+
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 from .... import backend
@@ -138,29 +140,33 @@ def _update_inputs_outputs(self, cat_op, reshape_op, cat_op_2):
             y._attrs["src_ops"] = StableSet()
             y._attrs["dst_ops"] = StableSet()
 
-    def __init__(
-        self, cat_op: Operator, reshape_op: Operator, cat_op_2: Operator
-    ) -> None:
+    def __init__(self, scatter_dim: int, element_func: Optional[str] = None) -> None:
         super().__init__()
-        if cat_op_2._attrs["op"] == "concatenate_tanh":
-            self._attrs["element_func"] = "fast_tanh"
-        else:
-            self._attrs["element_func"] = None
-        assert slice_reshape_scatter.is_valid(cat_op, reshape_op, cat_op_2)
-
+        self._attrs["element_func"] = element_func
         self._attrs["op"] = "slice_reshape_scatter"
         self._attrs["has_profiler"] = False
-        self._attrs["scatter_dim"] = cat_op._attrs["concat_dim"]
+        self._attrs["scatter_dim"] = scatter_dim
+
+    @staticmethod
+    def make_op(cat_op: Operator, reshape_op: Operator, cat_op_2: Operator) -> Operator:
+        assert slice_reshape_scatter.is_valid(cat_op, reshape_op, cat_op_2)
+        element_func = None
+        if cat_op_2._attrs["op"] == "concatenate_tanh":
+            element_func = "fast_tanh"
+        scatter_dim = cat_op._attrs["concat_dim"]
+        new_op = slice_reshape_scatter(scatter_dim, element_func)
+
         slice_ops = []
         for x in cat_op._attrs["inputs"]:
             src_ops = x.src_ops()
             assert len(src_ops) == 1
             slice_op = list(src_ops)[0]
             slice_ops.append(slice_op)
-        self._attrs["slice_ops"] = slice_ops
+        new_op._attrs["slice_ops"] = slice_ops
 
-        self._update_inputs_outputs(cat_op, reshape_op, cat_op_2)
-        self._set_depth()
+        new_op._update_inputs_outputs(cat_op, reshape_op, cat_op_2)
+        new_op._set_depth()
+        return new_op
 
     def __call__(self):
         raise RuntimeError("op {} cannot be called directly".format(self._attrs["op"]))
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index c4d2f817b..72071c20b 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -29,6 +29,7 @@
 from .fuse_parallel_gemms import fuse_parallel_gemms
 from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
 from .split_large_concat_ops import split_large_concat_ops
+from .split_large_slice_scatter_ops import split_large_slice_scatter_ops
 from .split_large_split_ops import split_large_split_ops
 from .transform_memory_ops import transform_memory_ops
 from .transform_odd_alignment import transform_odd_alignment
@@ -82,6 +83,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         transform_special_ops,
         apply_padding,
         transform_strided_ops,
+        split_large_slice_scatter_ops,
         split_large_concat_ops,
         split_large_split_ops,
         transform_memory_ops,
diff --git a/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
new file mode 100644
index 000000000..a25810375
--- /dev/null
+++ b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
@@ -0,0 +1,160 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This transformation splits a slice_scatter or slice_reshape_scatter with a large
+number of inputs into multiple slice_scatter or slice_reshape_scatter ops.
+"""
+import copy
+import logging
+
+from typing import List
+
+from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
+
+from ...utils import graph_utils, shape_utils
+from .. import ops
+from ..base import Operator, Tensor
+from . import transform_utils
+
+
+_LOGGER = logging.getLogger(__name__)
+
+# slice_scatter and slice_reshape_scatter use the same kernel implementation
+SLICE_SCATTER_INPUT_META_SIZE = 64  # bytes per input
+SLICE_SCATTER_OUTPUT_META_SIZE = 16  # bytes per rank
+MAX_CUDA_PARAM_BYTES = 4096  # bytes
+
+
+def _slice_scatter_kernel_single_input_output_param_size(op: Operator):
+    """
+    Return the total size (in bytes) of the slice_scatter's params.
+    We need to adjust this if we change its params.
+    """
+    inputs = op._attrs["inputs"]
+    rank = inputs[0]._rank()
+    size_of_output_meta = SLICE_SCATTER_OUTPUT_META_SIZE * rank
+    # There are one more params, which takes 8 bytes.
+    total_params_size = SLICE_SCATTER_INPUT_META_SIZE + size_of_output_meta + 8
+    _LOGGER.debug(f'slice_scatter op {op._attrs["name"]}: {total_params_size=}')
+    return total_params_size
+
+
+def split_large_slice_scatter_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
+    """
+    Our slice_scatter CUDA kernel takes an input meta argument whose size
+    is proportional to the number of inputs. In extreme cases, the total size
+    of the kernel function params may exceed the limit imposed by the CUDA
+    compiler. In such cases, we split the slice_scatter op into separate
+    ones, each of which takes the original output and inputs with correct
+    input_masks values.
+    """
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in sorted_ops:
+        # TODO: enable slice_scatter later
+        if not op._attrs["op"].startswith("slice_reshape_scatter"):
+            continue
+        slice_scatter_op = op
+        # We create InputMeta for inputs that need to copy data.
+        inputs = slice_scatter_op._attrs["inputs"]
+        num_inputs = len(inputs)
+        if num_inputs == 0:
+            continue
+        params_size = _slice_scatter_kernel_single_input_output_param_size(
+            slice_scatter_op
+        )
+        if params_size > MAX_CUDA_PARAM_BYTES:
+            raise RuntimeError(
+                f"cannot handle cases: {params_size=} > {MAX_CUDA_PARAM_BYTES=}"
+            )
+        total_params_size = params_size * num_inputs
+        if total_params_size <= MAX_CUDA_PARAM_BYTES:
+            continue
+        num_inputs_per_split = MAX_CUDA_PARAM_BYTES // params_size
+        num_splits = (num_inputs + num_inputs_per_split - 1) // num_inputs_per_split
+        split_sizes = [num_inputs_per_split] * num_splits
+        if num_inputs % num_inputs_per_split:
+            split_sizes[num_splits - 1] = num_inputs % num_inputs_per_split
+
+        inputs_offset = 0
+        all_new_slice_scatter_ops = []
+        outputs = slice_scatter_op._attrs["outputs"]
+        output_accessors = slice_scatter_op._attrs["output_accessors"]
+        scatter_dim = slice_scatter_op._attrs["scatter_dim"]
+        has_profiler = slice_scatter_op._attrs["has_profiler"]
+        local_output_offset = 0
+        orig_name = slice_scatter_op._attrs["name"]
+        element_func = slice_scatter_op._attrs["element_func"]
+        slice_ops = slice_scatter_op._attrs["slice_ops"]
+        for split_idx, new_inputs_size in enumerate(split_sizes):
+            new_slice_scatter_op = ops.slice_reshape_scatter(scatter_dim, element_func)
+            new_name = f"{orig_name}_split_{split_idx}"
+            new_slice_scatter_op._attrs["name"] = new_name
+            new_slice_scatter_op._attrs["original_name"] = new_name
+            new_slice_scatter_op._attrs["has_profiler"] = has_profiler
+            new_slice_scatter_op._attrs["outputs"] = outputs
+            new_slice_scatter_op._attrs["output_accessors"] = copy.deepcopy(
+                output_accessors
+            )
+            new_slice_scatter_op._set_depth()
+
+            # import pdb; pdb.set_trace()
+            new_inputs = list(inputs[inputs_offset : (inputs_offset + new_inputs_size)])
+            new_slice_scatter_op._attrs["inputs"] = new_inputs
+            new_slice_ops = slice_ops[inputs_offset : (inputs_offset + new_inputs_size)]
+            new_slice_scatter_op._attrs["slice_ops"] = new_slice_ops
+
+            # We also need to update the offset of the output tensor accessor.
+            # Note that the strided information remains the same because the output
+            # remains the same and we just shift the head offset for each new
+            # slice scatter op.
+            new_slice_scatter_op._attrs["output_accessors"][
+                0
+            ].offset += local_output_offset
+            for input_tensor, slice_op in zip(new_inputs, new_slice_ops):
+                input_tensor_shape = input_tensor._attrs["shape"]
+                # This is enforced by slice_scatter op. Just ensure we didn't
+                # violate the assumption somewhere.
+                assert shape_utils.all_static_dimensions(
+                    input_tensor_shape, scatter_dim
+                ), (
+                    f"Expected input_tensor_shape[{scatter_dim}:] are all static dimensions, "
+                    f"but got: {input_tensor_shape}"
+                )
+                start_indices = slice_op._attrs["start_indices"]
+                end_indices = slice_op._attrs["end_indices"]
+                strided_dim_offset = 1
+                for dim, start, end in zip(
+                    input_tensor_shape[scatter_dim:],
+                    start_indices[scatter_dim:],
+                    end_indices[scatter_dim:],
+                ):
+                    n_start, n_end = dynamic_slice.normalize_start_end_indices(
+                        dim.value(), start, end
+                    )
+                    assert n_start <= n_end, (
+                        f"expected normalized {n_start=} <= {n_end=} for "
+                        f"{dim=}, {start=}, {end=}"
+                    )
+                    strided_dim_offset *= n_end - n_start
+                local_output_offset += strided_dim_offset
+                input_tensor._attrs["dst_ops"].update([new_slice_scatter_op])
+                input_tensor._attrs["dst_ops"].discard(slice_scatter_op)
+            all_new_slice_scatter_ops.append(new_slice_scatter_op)
+            inputs_offset += new_inputs_size
+        output = outputs[0]
+        output._attrs["src_ops"].update(all_new_slice_scatter_ops)
+        output._attrs["src_ops"].remove(slice_scatter_op)
+    sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 2e05fee5c..862af2329 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -80,7 +80,7 @@ def _fuse_slices_concat_reshape_concat(sorted_graph: List[Tensor]) -> List[Tenso
 
         concat_op_2 = next_op
         if slice_reshape_scatter.is_valid(concat_op, reshape_op, concat_op_2):
-            slice_reshape_scatter(concat_op, reshape_op, concat_op_2)
+            slice_reshape_scatter.make_op(concat_op, reshape_op, concat_op_2)
 
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
diff --git a/tests/unittest/compiler/test_split_large_slice_scatter.py b/tests/unittest/compiler/test_split_large_slice_scatter.py
new file mode 100644
index 000000000..582091eb4
--- /dev/null
+++ b/tests/unittest/compiler/test_split_large_slice_scatter.py
@@ -0,0 +1,111 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class SliceScatterLargeInputsTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SliceScatterLargeInputsTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 1
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _test_slice_scatter_reshape_float16(
+        self,
+        input0_shape,
+        input1_shape,
+        start_indices,
+        end_indices,
+    ):
+        dtype = "float16"
+
+        input0 = Tensor(shape=input0_shape, dtype=dtype, name="input0", is_input=True)
+        input1 = Tensor(shape=input1_shape, dtype=dtype, name="input1", is_input=True)
+
+        num_slices = 139
+        slice_outputs = [
+            ops.dynamic_slice()(
+                input0, start_indices=start_indices, end_indices=end_indices
+            )
+            for _ in range(num_slices)
+        ]
+
+        concat_dim = 1
+        concat_2 = ops.concatenate()(slice_outputs, concat_dim)
+        reshape_to = [-1, num_slices, 2]
+        reshape_3 = ops.reshape()(concat_2, reshape_to)
+
+        Y = ops.concatenate()([reshape_3, input1], concat_dim)
+        Y._attrs["name"] = "y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        test_name = "slice_scatter_large_inputs"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        Y_src_ops = Y._attrs["src_ops"]
+        # We have a single concat op. All the rest are slice_reshape_scatter ops
+        concat_cnt = 0
+        for op in Y_src_ops:
+            if op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+                continue
+            self.assertEqual(op._attrs["op"], "slice_reshape_scatter")
+        self.assertEqual(concat_cnt, 1)
+
+        input0_pt = get_random_torch_tensor(input0_shape, dtype)
+        input1_pt = get_random_torch_tensor(input1_shape, dtype)
+        slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+
+        slice_outputs_pt = [input0_pt[slice_indices] for _ in range(num_slices)]
+        concat_2_pt = torch.cat(slice_outputs_pt, concat_dim)
+        reshape_3_pt = torch.reshape(concat_2_pt, reshape_to)
+        y_pt = torch.cat([reshape_3_pt, input1_pt], concat_dim)
+
+        inputs = {"input0": input0_pt, "input1": input1_pt}
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_slice_scatter_reshape_float16(self):
+        self._test_slice_scatter_reshape_float16(
+            input0_shape=[6, 2],
+            input1_shape=[2, 4, 2],
+            start_indices=[1, 0],
+            end_indices=[3, None],
+        )
+        self._test_slice_scatter_reshape_float16(
+            input0_shape=[2, 6],
+            input1_shape=[2, 4, 2],
+            start_indices=[0, 0],
+            end_indices=[None, 2],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From b626aabb461ea745a1dbf16d537b6cf6955dc8fb Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Thu, 9 Feb 2023 00:34:21 -0800
Subject: [PATCH 094/638] Amend get_item with list of indices to use ait fusion
 pass (#229)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/229

Current implementation of get_item over a list of indices first perform tensor_slice on pieces of index, squeeze, unsqueeze, amd then concatenate.
This is because tensor_slice will squeeze tensor when encounter int, while it cannot handle list of int.

It doesn't work well with AIT's fusion pass because of extra squeeze and unsqueeze.

To avoid this, we want tensor_slice to skip squeeze when inputs is
```
a=torch.randn([10,8])
a[:, [2]]
```
This diff change tensor_slice to be able to handle list of index when `len(list)==1`. It does the same thing as non index version; the only difference is instead of giving a `int`, now we get a `slice(2,3,None).`
Then AIT fusion pass can handle this dynamic shape issue.

Another thing is, in practice, there are quite some dynamic_slice context that can be merged
e.g. `a[0, 2, 3, 4]`, we want to slice twice, one with `slice(0,1,None)`, the other with `slice(2,5,None)`. This fix the issue of subsequent large slices, e.g. https://fburl.com/phabricator/t26ylhbw

Reviewed By: wushirong

Differential Revision: D43071233

fbshipit-source-id: 828a93c9411ddbf19b0717d60469307ec7629c39
---
 fx2ait/fx2ait/converters/ait_converters.py    | 24 ++++++++++++++++---
 .../test/converters/test_ait_slice_tensor.py  | 12 ++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 6d3b665d4..2e580fc26 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -487,9 +487,27 @@ def acc_ops_getitem(
         # In terms of performance, AIT backend will take care of fusing these ops.
         groups = []
         kw = {"input": input_val}
-        for x in s:
-            kw["idx"] = idx[:dim] + (x,) + idx[dim + 1 :]
-            groups.append(unsqueeze(dim)(acc_ops_slice(target, args, kw, name)))
+        start_idx = 0
+        end_idx = 1
+        while end_idx < len(s):
+            if s[end_idx] - s[start_idx] == end_idx - start_idx:
+                end_idx += 1
+                continue
+            else:
+                kw["idx"] = (
+                    idx[:dim]
+                    + (slice(s[start_idx], s[end_idx - 1] + 1, None),)
+                    + idx[dim + 1 :]
+                )
+                groups.append(acc_ops_slice(target, args, kw, name))
+                start_idx = end_idx
+                end_idx += 1
+        kw["idx"] = (
+            idx[:dim]
+            + (slice(s[start_idx], s[end_idx - 1] + 1, None),)
+            + idx[dim + 1 :]
+        )
+        groups.append(acc_ops_slice(target, args, kw, name))
         return concatenate()(groups, dim=dim)
 
     if isinstance(idx, slice) or (
diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index 6b95bd608..e14126046 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -53,6 +53,18 @@ class TestSliceTensor(AITTestCase):
                 "slice_zero_slice",
                 (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
             ),
+            (
+                "slice_start_seq_slice",
+                (slice(0, 1, None), [0, 1, 2], slice(0, 10, None)),
+            ),
+            (
+                "slice_end_seq_slice",
+                (slice(0, 1, None), [0, 6, 7, 8, 9], slice(0, 10, None)),
+            ),
+            (
+                "slice_long_seq_slice",
+                (slice(0, 1, None), [0, 5, 6, 7, 2, 3, 4, 5], slice(0, 10, None)),
+            ),
             (
                 "slice_list_slice",
                 (slice(0, 1, None), [2], slice(0, 10, None)),

From b408aa239b02a4e0be9592731974f902cf6108c8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 9 Feb 2023 12:29:05 -0800
Subject: [PATCH 095/638] fix cat/view/cat fusion (#230)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/230

```
input_1  input_2
    \     /
     \   /
      cat_1
       \
        \
       view  input_3
          \    /
           \  /
           cat_2
```

Reviewed By: chenyang78

Differential Revision: D43144257

fbshipit-source-id: b04bf61b4c0081d5ae0dbf87367130cd4182ab56
---
 .../transform/transform_memory_ops.py         |  2 +-
 .../compiler/test_fuse_cat_view_cat.py        | 92 +++++++++++++++++++
 2 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 tests/unittest/compiler/test_fuse_cat_view_cat.py

diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index a585393e7..3c426f97e 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -60,7 +60,7 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     #     y = cat(y1, y2)
     # In such a case, we cannot merge those two concat ops.
     if not all(
-        accessor.stride_dim is None for accessor in cat._attrs["input_accessors"]
+        accessor.actual_shapes is None for accessor in cat._attrs["input_accessors"]
     ):
         return False
     first_op_inputs = first_op._attrs["inputs"]
diff --git a/tests/unittest/compiler/test_fuse_cat_view_cat.py b/tests/unittest/compiler/test_fuse_cat_view_cat.py
new file mode 100644
index 000000000..ea11dde2f
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_cat_view_cat.py
@@ -0,0 +1,92 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.public import IntImm, IntVar
+
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+class FuseCatViewCatTestCase(unittest.TestCase):
+    def test_fuse_cat_view_cat(self):
+        dtype = "float16"
+        B = IntVar([1, 2048], name="batch_size")
+        M1 = IntImm(16)
+        M2 = IntImm(48)
+        N = IntImm(18)
+        K = IntImm(9)
+
+        input_1 = Tensor(
+            shape=[B, M1, N],
+            name="input_1",
+            is_input=True,
+        )
+        input_2 = Tensor(
+            shape=[B, M2, N],
+            name="input_2",
+            is_input=True,
+        )
+        input_3 = Tensor(
+            shape=[B, K],
+            name="input_3",
+            is_input=True,
+        )
+        concatenate_4 = ops.concatenate()([input_1, input_2], 1)
+        reshape_5 = ops.reshape()(
+            concatenate_4, [-1, (M1.value() + M2.value()) * N.value()]
+        )
+        concatenate_6 = ops.concatenate()([input_3, reshape_5], 1)
+
+        # Set outputs
+        concatenate_6._attrs["name"] = "output_0"
+        concatenate_6._attrs["is_output"] = True
+        # Compile
+        mod = compile_model(
+            concatenate_6,
+            detect_target(),
+            "./tmp",
+            "test_fuse_cat_view_cat",
+        )
+        # Compare
+        input_1_pt = get_random_torch_tensor((1024, M1.value(), N.value()), dtype)
+        input_2_pt = get_random_torch_tensor((1024, M2.value(), N.value()), dtype)
+        input_3_pt = get_random_torch_tensor((1024, K.value()), dtype)
+
+        y_pt = torch.cat(
+            [
+                input_3_pt,
+                torch.reshape(
+                    torch.cat([input_1_pt, input_2_pt], dim=1),
+                    (-1, (M1.value() + M2.value()) * N.value()),
+                ),
+            ],
+            dim=1,
+        )
+        y_ait = torch.empty_like(y_pt)
+        mod.run_with_tensors(
+            {"input_1": input_1_pt, "input_2": input_2_pt, "input_3": input_3_pt},
+            [y_ait],
+        )
+        torch.testing.assert_close(y_ait, y_pt, atol=0, rtol=0)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From ad8a33e21e76191540ad0726b61bf00fc2458bd1 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Thu, 9 Feb 2023 16:31:02 -0800
Subject: [PATCH 096/638] call depthwise_conv3d with bias instead of use it in
 the initializer (#231)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/231

depthwise_conv3d should accpet bias in the call function instead of in the initializer. Fix the issue in this diff.

Reviewed By: wushirong

Differential Revision: D43152843

fbshipit-source-id: e1f8d3a37ed5cb722fe70f93510d2bcb07df0fc6
---
 fx2ait/fx2ait/converters/ait_converters.py              | 4 ++--
 python/aitemplate/compiler/ops/conv/depthwise_conv3d.py | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 2e580fc26..c094c3faa 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1184,8 +1184,8 @@ def _choose_conv3d_op(
             groups == weight._attrs["shape"][0]
         ), "Currently only support channel == groups"
         return depthwise_conv3d(
-            stride=stride, pad=pad, dilate=dilate, group=groups, bias=bias
-        )(x, weight)
+            stride=stride, pad=pad, dilate=dilate, group=groups, bias=True
+        )(x, weight, bias)
     else:
         assert (
             groups is None or groups == 1
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 1d234988c..3fff25a99 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -266,8 +266,6 @@ def __call__(self, x: Tensor, w: Tensor, bias: Tensor = None) -> List[Tensor]:
         self._attrs["inputs"] = [x, w]
         if bias:
             self._attrs["inputs"].append(bias)
-        elif self._attrs["bias"]:
-            self._attrs["inputs"].append(self._attrs["bias"])
         self._set_depth()
         output_shape = self._infer_shapes(x, w)
         self._extract_exec_path(x)

From 62ef87e3aa4e8dcb1460aeb617daf9ed16beee93 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Thu, 9 Feb 2023 16:58:12 -0800
Subject: [PATCH 097/638] AITemplate tests clean up (#235)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/235

Reviewed By: houseroad

Differential Revision: D43149403

fbshipit-source-id: 9a5e0ac0f8b2d7c19187b6d8073dcadb9a97309d
---
 .../compiler/test_fused_elementwise_complex_dependency.py     | 4 ----
 tests/unittest/ops/test_fused_elementwise.py                  | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 9fb7a0cab..902d77ee9 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -286,10 +286,6 @@ def test_fused_elementwise_non_elementwise_ops(self, dtype):
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_indirect_input_dependency(self, dtype):
         r"""
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 1ac2fbe90..162a21144 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -43,6 +43,10 @@
 
 
 class FusedElementwiseTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_fused_elementwise_constructor(self, ait_dtype):
         BATCH_SIZE = 1024
         M = 256

From b888133202aa838a13406127a03ff510b6e4eb0e Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Thu, 9 Feb 2023 19:23:25 -0800
Subject: [PATCH 098/638] conv3d_bias (#238)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/238

Reviewed By: frank-wei, frankgt40

Differential Revision: D43177048

Pulled By: terrychenism

fbshipit-source-id: 75b81803cd0b2764f54d309ab449ef3b4c86e4f2
---
 .../backend/cuda/conv3d/__init__.py           |   4 +-
 .../backend/cuda/conv3d/conv3d_bias.py        | 624 +++++++++++++++
 .../aitemplate/compiler/ops/conv/__init__.py  |   1 +
 .../compiler/ops/conv/conv3d_bias.py          | 717 ++++++++++++++++++
 tests/unittest/ops/test_conv3d.py             |  60 +-
 5 files changed, 1385 insertions(+), 21 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
 create mode 100644 python/aitemplate/compiler/ops/conv/conv3d_bias.py

diff --git a/python/aitemplate/backend/cuda/conv3d/__init__.py b/python/aitemplate/backend/cuda/conv3d/__init__.py
index 84e693cc3..6187336e2 100644
--- a/python/aitemplate/backend/cuda/conv3d/__init__.py
+++ b/python/aitemplate/backend/cuda/conv3d/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA conv3d module init
 """
-from . import conv3d, depthwise_conv3d, depthwise_conv3d_bias
+from . import conv3d, conv3d_bias, depthwise_conv3d, depthwise_conv3d_bias
 
-__all__ = ["conv3d", "depthwise_conv3d", "depthwise_conv3d_bias"]
+__all__ = ["conv3d", "conv3d_bias", "depthwise_conv3d", "depthwise_conv3d_bias"]
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
new file mode 100644
index 000000000..57c455354
--- /dev/null
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
@@ -0,0 +1,624 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Codegen for conv3d.
+"""
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+
+from ... import registry
+from . import common
+
+# pylint: disable=C0103,C0415,W0613,C0301
+
+INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{config}}
+using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>;
+"""
+)
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute;
+{{indent}}//  TODO: cast to right dtype
+{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}    problem_size,                                            // ConvProblemSize const & problem_size
+{{indent}}    {static_cast<{{dtype}}*>(in_ptr), layout_A},             // TensorRefA const & ref_A
+{{indent}}    {static_cast<{{dtype}}*>(weight_ptr), layout_B},         // TensorRefB const & ref_B
+{{indent}}    {static_cast<{{dtype}}*>(bias_ptr), cutlass::layout::TensorNDHWC::Stride(0)},
+{{indent}}    {static_cast<{{dtype}}*>(out_ptr), layout_C},            // TensorRefC const & ref_D
+{{indent}}    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params const & output_op
+{{indent}}};
+{% if is_profiler %}
+{{indent}}size_t workspace_size = conv_op.get_workspace_size(arguments);
+{{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
+{{indent}}workspace = local_workspace.get();
+{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} conv_op;
+{% endif %}
+{{indent}}auto status = conv_op.can_implement(arguments);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = conv_op.initialize(arguments, workspace);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}status = conv_op(stream);
+{{indent}}CUTLASS_CHECK(status);
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <iostream>
+#include <string>
+#include <stdexcept>
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv3d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+{{extra_header}}
+
+#define CUTLASS_CHECK(status)                                                         \\
+  {                                                                                   \\
+    cutlass::Status error = status;                                                   \\
+    if (error != cutlass::Status::kSuccess) {                                         \\
+      auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +              \\
+          cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);         \\
+      std::cerr << msg << std::endl;                                                  \\
+      throw std::runtime_error(msg);                                                  \\
+    }                                                                                 \\
+  }
+
+{{instances}}
+
+{{instances_def}}
+
+{% if is_profiler %}
+template <typename {{instance_name_base}}>
+void {{function_name}} (
+    {{instance_name_base}}& conv_op,
+{% else %}
+void {{function_name}} (
+{% endif %}
+    void* in_ptr,
+    void* weight_ptr,
+    void* bias_ptr,
+    void* out_ptr,
+    uint8_t* workspace,
+    int64_t* batch,
+    int64_t* out_ch,
+    int64_t* in_ch,
+    int64_t* kernel_d,
+    int64_t* kernel_h,
+    int64_t* kernel_w,
+    int64_t* in_d,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_d,
+    int64_t* out_h,
+    int64_t* out_w,
+    int stride_d,
+    int stride_h,
+    int stride_w,
+    int dilation_d,
+    int dilation_h,
+    int dilation_w,
+    int pad_d,
+    int pad_h,
+    int pad_w,
+    cudaStream_t stream
+  ) {
+
+  {{shape_function}}
+  int i32_batch = *batch;
+  int i32_in_d = *in_d;
+  int i32_in_h = *in_h;
+  int i32_in_w = *in_w;
+  int i32_in_ch = *in_ch;
+  int i32_out_ch = *out_ch;
+  int i32_kernel_d = *kernel_d;
+  int i32_kernel_h = *kernel_h;
+  int i32_kernel_w = *kernel_w;
+  int i32_out_batch = *out_batch;
+  int i32_out_d = *out_d;
+  int i32_out_h = *out_h;
+  int i32_out_w = *out_w;
+
+  using cutlass::layout::TensorNDHWC;
+  TensorNDHWC layout_A(TensorNDHWC::packed(cutlass::make_Coord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch)));
+  TensorNDHWC layout_B(TensorNDHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch)));
+  TensorNDHWC layout_C(TensorNDHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_d, i32_out_h, i32_out_w, i32_out_ch)));
+
+  cutlass::conv::Conv3dProblemSize problem_size(
+    cutlass::Tensor5DCoord(i32_batch, i32_in_d, i32_in_h, i32_in_w, i32_in_ch),               // cutlass::Tensor5DCoord input_size
+    cutlass::Tensor5DCoord(i32_out_ch, i32_kernel_d, i32_kernel_h, i32_kernel_w, i32_in_ch),  // cutlass::Tensor5DCoord filter_size
+    cutlass::make_Coord(pad_d, pad_h, pad_w),                                                 // Coord3D padding
+    cutlass::make_Coord(stride_d, stride_h, stride_w),                                        // Coord3D stride
+    cutlass::make_Coord(dilation_d, dilation_h, dilation_w),                                  // Coord3D dilation
+    cutlass::conv::Mode::kCrossCorrelation,                                                   // cutlass::conv::Mode mode
+    1,                                                                                        // int split_k_slices
+    1                                                                                         // int groups
+  );
+
+  {{exec_paths}}
+  throw std::runtime_error(
+      "Unsupported workload for this conv3d specialization."
+  );
+}
+"""
+)
+
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  {{instance_name}} {{conv_op}};
+{{indent}}  const char *conv_op_name = "{{conv_op_name}}";
+{{indent}}  int ret = 0;
+{{indent}}  try {
+{{indent}}    ret = {{func_name}}(
+{{indent}}      {{conv_op}},
+{{indent}}      conv_op_name,
+{{indent}}      {{ni}},
+{{indent}}      {{di}},
+{{indent}}      {{hi}},
+{{indent}}      {{wi}},
+{{indent}}      {{ci}},
+{{indent}}      {{co}},
+{{indent}}      {{kd}},
+{{indent}}      {{kh}},
+{{indent}}      {{kw}},
+{{indent}}      {{no}},
+{{indent}}      {{do}},
+{{indent}}      {{ho}},
+{{indent}}      {{wo}},
+{{indent}}      {{stride_d}},
+{{indent}}      {{stride_h}},
+{{indent}}      {{stride_w}},
+{{indent}}      {{dilation_d}},
+{{indent}}      {{dilation_h}},
+{{indent}}      {{dilation_w}},
+{{indent}}      {{pad_d}},
+{{indent}}      {{pad_h}},
+{{indent}}      {{pad_w}},
+{{indent}}      global_workspace_,
+{{indent}}      stream
+{{indent}}    );
+{{indent}}  } catch (...) {}
+{{indent}}  if (ret != 0)
+{{indent}}    return ret;
+{{indent}}}
+"""
+)
+
+PROFILER_TEMPLATE = jinja2.Template(
+    """
+size_t GLOBAL_WORKSPACE_SIZE = 0;
+
+{{op_func}}
+
+template <typename {{instance_name_base}}>
+int benchmark_{{function_name}} (
+  {{instance_name_base}} &conv_op,
+  const char *conv_op_name,
+  int64_t NI,
+  int64_t DI,
+  int64_t HI,
+  int64_t WI,
+  int64_t CI,
+  int64_t CO,
+  int64_t KD,
+  int64_t KH,
+  int64_t KW,
+  int64_t NO,
+  int64_t DO,
+  int64_t HO,
+  int64_t WO,
+  int stride_d,
+  int stride_h,
+  int stride_w,
+  int dilation_d,
+  int dilation_h,
+  int dilation_w,
+  int pad_d,
+  int pad_h,
+  int pad_w,
+  uint8_t* global_workspace_,
+  cudaStream_t stream
+) {
+  using ElementOutput = typename {{instance_name_base}}::ElementC;
+  using ElementInputA = typename {{instance_name_base}}::ElementA;
+  using ElementInputB = typename {{instance_name_base}}::ElementB;
+
+  cutlass::HostTensor<ElementInputA, typename {{instance_name_base}}::LayoutA> x({NI, DI, HI, WI, CI});
+  cutlass::HostTensor<ElementInputB, typename {{instance_name_base}}::LayoutB> w({CO, KD, KH, KW, CI});
+  cutlass::HostTensor<ElementOutput, typename {{instance_name_base}}::LayoutC> y({NO, DO, HO, WO, CO});
+  cutlass::HostTensor<ElementInputB, typename {{instance_name_base}}::LayoutB> b({int(CO), 1, 1, 1, 1});
+
+  // warmup
+{{func_call}}
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 5; ++i) {
+{{func_call}}
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "OP:" << conv_op_name << ",";
+  std::cout << "TIME:" << runtime_ms << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  return 0;
+}
+
+int main(int argc, char** argv) {
+  int64_t batch = std::stoi(argv[1]);
+  int64_t in_d = std::stoi(argv[2]);
+  int64_t in_h = std::stoi(argv[3]);
+  int64_t in_w = std::stoi(argv[4]);
+  int64_t in_ch = std::stoi(argv[5]);
+  int64_t kernel_d = std::stoi(argv[6]);
+  int64_t kernel_h = std::stoi(argv[7]);
+  int64_t kernel_w = std::stoi(argv[8]);
+  int64_t out_ch = std::stoi(argv[9]);
+  int stride_d = std::stoi(argv[10]);
+  int stride_h = std::stoi(argv[11]);
+  int stride_w = std::stoi(argv[12]);
+  int pad_d = std::stoi(argv[13]);
+  int pad_h = std::stoi(argv[14]);
+  int pad_w = std::stoi(argv[15]);
+  int dilation_d = std::stoi(argv[16]);
+  int dilation_h = std::stoi(argv[17]);
+  int dilation_w = std::stoi(argv[18]);
+
+{{shape_func}}
+
+  uint8_t* global_workspace_ = nullptr;
+  cudaStream_t stream = nullptr;
+
+{{benchmark_instances}}
+
+  return 0;
+}
+"""
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  void*,
+  void*,
+  uint8_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  int,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    conv_op,
+{% endif %}
+{{indent}}    {{in_ptr}},
+{{indent}}    {{weight_ptr}},
+{{indent}}    {{bias_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    global_workspace_,
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_out_ch}},
+{{indent}}    {{p_in_ch}},
+{{indent}}    {{p_kernel_d}},
+{{indent}}    {{p_kernel_h}},
+{{indent}}    {{p_kernel_w}},
+{{indent}}    {{p_in_d}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_d}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    {{stride_d}},
+{{indent}}    {{stride_h}},
+{{indent}}    {{stride_w}},
+{{indent}}    {{dilation_d}},
+{{indent}}    {{dilation_h}},
+{{indent}}    {{dilation_w}},
+{{indent}}    {{pad_d}},
+{{indent}}    {{pad_h}},
+{{indent}}    {{pad_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+@registry.reg("cuda.conv3d_bias.config")
+def conv3d_config(func_attrs, dtype="float16"):
+    """Populates conv3d cutlass configs into 'op_instance' field."""
+    func_attrs["op_instance"] = common.extract_config(func_attrs, dtype=dtype)
+
+
+@registry.reg("cuda.conv3d_bias.gen_profiler")
+def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
+    """Codegen for conv3d profiler."""
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+
+    # shape func
+    shape_func = shape_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        div="/",
+        x_dim0="batch",
+        x_dim1="in_d",
+        x_dim2="in_h",
+        x_dim3="in_w",
+        x_dim4="in_ch",
+        w_dim0="out_ch",
+        w_dim1="kernel_d",
+        w_dim2="kernel_h",
+        w_dim3="kernel_w",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilate_d="dilation_d",
+        dilate_h="dilation_h",
+        dilate_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+    )
+
+    backend_spec = CUDASpec()
+    dtype = backend_spec.dtype_to_lib_type(func_attrs["inputs"][0]._attrs["dtype"])
+    instance_name_base = "DeviceConvFwdInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        is_profiler=True,
+        instance=instance_name_base,
+        dtype=dtype,
+    )
+
+    function_name = "conv"
+    instances = []
+    benchmark_instances = []
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
+        config = common.emit_instance(op)
+        config_name = common.extract_config_name(config)
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        conv_op = f"conv_op_{instance_idx}"
+        instance = INSTANCE_TEMPLATE.render(
+            config_name=config_name,
+            name=instance_name,
+            config=config,
+        )
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
+            indent="  ",
+            instance_name=instance_name,
+            conv_op=conv_op,
+            conv_op_name=op_name,
+            func_name=f"benchmark_{function_name}",
+            ni="NI",
+            di="DI",
+            hi="HI",
+            wi="WI",
+            ci="CI",
+            co="CO",
+            kd="KD",
+            kh="KH",
+            kw="KW",
+            no="NO",
+            do="DO",
+            ho="HO",
+            wo="WO",
+            stride_d="stride_d",
+            stride_h="stride_h",
+            stride_w="stride_w",
+            dilation_d="dilation_d",
+            dilation_h="dilation_h",
+            dilation_w="dilation_w",
+            pad_d="pad_d",
+            pad_h="pad_h",
+            pad_w="pad_w",
+        )
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+
+    op_func = SRC_TEMPLATE.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        instance_name_base=instance_name_base,
+        function_name=function_name,
+        shape_function="",
+        exec_paths=exec_program,
+    )
+    func_call = FUNC_CALL_TEMPLATE.render(
+        indent="  ",
+        is_profiler=True,
+        func_name=function_name,
+        in_ptr="x.device_data()",
+        weight_ptr="w.device_data()",
+        bias_ptr="b.device_data()",
+        out_ptr="y.device_data()",
+        p_batch="&NI",
+        p_out_ch="&CO",
+        p_in_ch="&CI",
+        p_kernel_d="&KD",
+        p_kernel_h="&KH",
+        p_kernel_w="&KW",
+        p_in_d="&DI",
+        p_in_h="&HI",
+        p_in_w="&WI",
+        p_out_batch="&NO",
+        p_out_d="&DO",
+        p_out_h="&HO",
+        p_out_w="&WO",
+        stride_d="stride_d",
+        stride_h="stride_h",
+        stride_w="stride_w",
+        dilation_d="dilation_d",
+        dilation_h="dilation_h",
+        dilation_w="dilation_w",
+        pad_d="pad_d",
+        pad_h="pad_h",
+        pad_w="pad_w",
+    )
+    code = PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        shape_func=shape_func,
+        instance_name_base=instance_name_base,
+        function_name=function_name,
+        func_call=func_call,
+        benchmark_instances="\n".join(benchmark_instances),
+    )
+
+    # FIXME: remove file_pairs once we have make -j ready for building
+    # an entire graph
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
+    # build
+    return common.build_profiler(file_pairs)
+
+
+@registry.reg("cuda.conv3d_bias.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    """Codegen for conv3d_bias function."""
+    return common.gen_function(
+        func_attrs,
+        INSTANCE_TEMPLATE,
+        EXEC_TEMPLATE,
+        SRC_TEMPLATE,
+        exec_cond_remplate,
+        shape_eval_template,
+        shape_save_template,
+    )
+
+
+@registry.reg("cuda.conv3d_bias.func_decl")
+def conv3d_gen_function_decl(func_attrs):
+    """Codegen for conv3d function declaration."""
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("cuda.conv3d_bias.func_call")
+def conv3d_gen_function_call(func_attrs, indent="  "):
+    """Codegen for conv3d function call."""
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    w = func_attrs["inputs"][1]
+    wshape = w._attrs["shape"]
+    b = func_attrs["inputs"][2]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        weight_ptr=w._attrs["name"],
+        bias_ptr=b._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_out_ch="&" + wshape[0]._attrs["name"],
+        p_in_ch="&" + xshape[4]._attrs["name"],
+        p_kernel_d="&" + wshape[1]._attrs["name"],
+        p_kernel_h="&" + wshape[2]._attrs["name"],
+        p_kernel_w="&" + wshape[3]._attrs["name"],
+        p_in_d="&" + xshape[1]._attrs["name"],
+        p_in_h="&" + xshape[2]._attrs["name"],
+        p_in_w="&" + xshape[3]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_d="&" + yshape[1]._attrs["name"],
+        p_out_h="&" + yshape[2]._attrs["name"],
+        p_out_w="&" + yshape[3]._attrs["name"],
+        stride_d=func_attrs["stride"][0],
+        stride_h=func_attrs["stride"][1],
+        stride_w=func_attrs["stride"][2],
+        dilation_d=func_attrs["dilate"][0],
+        dilation_h=func_attrs["dilate"][1],
+        dilation_w=func_attrs["dilate"][2],
+        pad_d=func_attrs["pad"][0],
+        pad_h=func_attrs["pad"][1],
+        pad_w=func_attrs["pad"][2],
+        indent=indent,
+    )
+
+
+@registry.reg("cuda.conv3d_bias.filter")
+def conv3d_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, x_shape)
diff --git a/python/aitemplate/compiler/ops/conv/__init__.py b/python/aitemplate/compiler/ops/conv/__init__.py
index 1233111c6..3744274a8 100644
--- a/python/aitemplate/compiler/ops/conv/__init__.py
+++ b/python/aitemplate/compiler/ops/conv/__init__.py
@@ -30,6 +30,7 @@
 from .conv2d_depthwise import conv2d_depthwise
 from .conv2d_depthwise_bias import conv2d_depthwise_bias
 from .conv3d import conv3d
+from .conv3d_bias import conv3d_bias
 from .depthwise_conv3d import depthwise_conv3d
 from .transposed_conv2d import transposed_conv2d
 from .transposed_conv2d_bias import transposed_conv2d_bias
diff --git a/python/aitemplate/compiler/ops/conv/conv3d_bias.py b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
new file mode 100644
index 000000000..3437e6cc1
--- /dev/null
+++ b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
@@ -0,0 +1,717 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Base class for conv3d.
+"""
+import itertools
+import logging
+import os
+import re
+from collections import OrderedDict
+from hashlib import sha1
+from operator import itemgetter
+from typing import Any, Dict, List
+
+import jinja2
+
+from .... import backend
+from ....backend import registry
+from ....backend.target import Target
+from ....utils import alignment, shape_utils
+from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
+from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
+from .conv_common import (
+    filter_op_instances,
+    generate_profiler_sources,
+    get_profiler_filename,
+)
+
+# pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
+
+
+_LOGGER = logging.getLogger(__name__)
+
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}DI = {{x_dim1}};
+{{indent}}{{dtype}}HI = {{x_dim2}};
+{{indent}}{{dtype}}WI = {{x_dim3}};
+{{indent}}{{dtype}}CI = {{x_dim4}};
+{{indent}}{{dtype}}CO = {{w_dim0}};
+{{indent}}{{dtype}}KD = {{w_dim1}};
+{{indent}}{{dtype}}KH = {{w_dim2}};
+{{indent}}{{dtype}}KW = {{w_dim3}};
+{{indent}}{{dtype}}SD = {{stride_d}};
+{{indent}}{{dtype}}SH = {{stride_h}};
+{{indent}}{{dtype}}SW = {{stride_w}};
+{{indent}}{{dtype}}DD = {{dilate_d}};
+{{indent}}{{dtype}}DH = {{dilate_h}};
+{{indent}}{{dtype}}DW = {{dilate_w}};
+{{indent}}{{dtype}}PD = {{pad_d}};
+{{indent}}{{dtype}}PH = {{pad_h}};
+{{indent}}{{dtype}}PW = {{pad_w}};
+{{indent}}{{dtype}}KDEff = (KD - 1) * DD + 1;
+{{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
+{{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}DO = (DI + PD + PD - KDEff) {{div}} SD + 1;
+{{indent}}{{dtype}}HO = (HI + PH + PH - KHEff) {{div}} SH + 1;
+{{indent}}{{dtype}}WO = (WI + PW + PW - KWEff) {{div}} SW + 1;
+"""
+)
+
+SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{y_dim0}} = NO;
+{{indent}}{{y_dim1}} = DO;
+{{indent}}{{y_dim2}} = HO;
+{{indent}}{{y_dim3}} = WO;
+{{indent}}{{y_dim4}} = CO;
+"""
+)
+
+EXEC_KEY_TEMPLATE = jinja2.Template(
+    """
+NI == {{x_dim0}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
+    """
+NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+"""
+)
+
+EXEC_COND_TEMPLATE = jinja2.Template(
+    """
+{{indent}}if ({{cond}}) {
+{{indent}}  {{program}}
+{{indent}}}
+"""
+)
+
+
+class conv3d_bias(Operator):
+    r"""conv3d_bias"""
+
+    def __init__(self, stride, pad, dilate=1, group=1) -> None:
+        """Conv3d constructor.
+
+        Parameters
+        ----------
+        stride : int or tuple
+            Stride of the convolution
+        pad : int or tuple
+            Size of padding to add to the input
+        dilate : int ot tuple, optional
+            Size of spacing between kernel elements, by default 1
+        group : int, optional
+           Number of blocked connections from input
+            channels to output channels, by default 1
+        """
+        super().__init__()
+        self._attrs["op"] = "conv3d_bias"
+        self._attrs["stride"] = stride
+        if isinstance(stride, int):
+            self._attrs["stride"] = (stride, stride, stride)
+        self._attrs["pad"] = pad
+        if isinstance(pad, int):
+            self._attrs["pad"] = (pad, pad, pad)
+        self._attrs["dilate"] = dilate
+        if isinstance(dilate, int):
+            self._attrs["dilate"] = (dilate, dilate, dilate)
+        self._attrs["group"] = group
+        self._attrs["has_profiler"] = True
+        self._attrs["epilogue_alignment"] = 1
+        self._attrs["epilogue"] = "LinearCombination"
+        self._attrs["workspace"] = 0
+        self._attrs["split_k"] = None
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
+        self.exec_key_template = EXEC_KEY_TEMPLATE
+        self.exec_dyn_key_template = EXEC_DYN_KEY_TEMPLATE
+        self.exec_cond_template = EXEC_COND_TEMPLATE
+
+    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
+        if x[4] != w[4] * self._attrs["group"]:
+            raise RuntimeError("X/W Shape mismatch for conv3d")
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+            x_dim4=x[4],
+            w_dim0=w[0],
+            w_dim1=w[1],
+            w_dim2=w[2],
+            w_dim3=w[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["DO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor, w: Tensor) -> List[int]:
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
+        self._attrs["CO"] = w_shape[0]
+        self._attrs["KD"] = w_shape[1]
+        self._attrs["KH"] = w_shape[2]
+        self._attrs["KW"] = w_shape[3]
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape, w_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            x._attrs["shape"][0],
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[4] for d in y_shapes])),
+        ]
+        return output_shape
+
+    def _invert_exec_key(self, key):
+        tmp = re.findall(r"(\d+)", key)
+        return [int(x) for x in tmp]
+
+    def _gen_exec_key(self, shape: List[int]):
+        return self.exec_key_template.render(
+            x_dim0=shape[0],
+            x_dim1=shape[1],
+            x_dim2=shape[2],
+            x_dim3=shape[3],
+            x_dim4=shape[4],
+        ).replace("\n", "")
+
+    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3, dim4):
+        return self.exec_dyn_key_template.render(
+            x_dim0_lb=dim0_lb,
+            x_dim0_ub=dim0_ub,
+            x_dim1=dim1,
+            x_dim2=dim2,
+            x_dim3=dim3,
+            x_dim4=dim4,
+        ).replace("\n", "")
+
+    def _extract_exec_path(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        self._attrs["exec_path"] = OrderedDict()
+        for x_shape in x_shapes:
+            key = self._gen_exec_key(x_shape)
+            self._attrs["exec_path"][key] = ""
+
+    def _signature(self):
+        signature = "conv3d: K=[{kd}, {kh}, {kw}], S=[{sd}, {sh}, {sw}], P=[{pd}, {ph}, {pw}], CO=[{co}]".format(
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            sd=self._attrs["stride"][0],
+            sh=self._attrs["stride"][1],
+            sw=self._attrs["stride"][2],
+            pd=self._attrs["pad"][0],
+            ph=self._attrs["pad"][1],
+            pw=self._attrs["pad"][2],
+            co=self._attrs["CO"],
+        )
+        return signature
+
+    def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
+        epilogue_dim = output_shape[-1]
+        if not isinstance(epilogue_dim, IntImm):
+            raise RuntimeError("Conv output last dimension must be static!")
+        self._attrs["epilogue_alignment"] = alignment.find_max_alignment(
+            number=epilogue_dim._attrs["values"][0],
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
+
+    def __call__(self, x: Tensor, w: Tensor, b: Tensor) -> List[Tensor]:
+        """Call conv3d with tensors x, w
+
+        Parameters
+        ----------
+        x : Tensor
+            in shape (N, D, H, W, C_in)
+        w : Tensor
+            in shape (C_out, K_d, K_h, K_w, C_in)
+
+        Returns
+        -------
+        List[Tensor]
+            includes the output tensor in shape (N, D_out, H_out, W_out, C_out)
+        """
+        self._attrs["inputs"] = [x, w, b]
+        self._set_depth()
+        output_shape = self._infer_shapes(x, w)
+        self._extract_exec_path(x)
+        self._extract_epilogue_alignment(output_shape)
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self) -> Dict[str, Any]:
+        target_attrs = ["dilate", "group", "pad", "stride"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def _should_build_profiler(self) -> bool:
+        """
+        Check if we should build profilers. If we have a cached
+        entry for this gemm instance, we update this gemm op's
+        relevant attributes with the cached result and return False.
+        """
+        if self._has_dynamic_input_dims():
+            # If there are dynamic dims, we'll have to generate and build the
+            # profilers, as the binaries will be needed for dynamic profiling.
+            return True
+
+        target = backend.target.Target.current()
+        workloads = list(self._attrs["exec_path"].keys())
+
+        build_profiler = True
+        # Now, let's query if all of our workloads have cache entries. If that
+        # is the case, it is safely to skip generating and building profilers.
+        if not target.use_dummy_profiling_results():
+            tmp_key = next(iter(self._attrs["op_instance"].keys()))
+            tmp_op = self._attrs["op_instance"][tmp_key]
+            build_profiler = False
+            for wkl in workloads:
+                exec_entry_sha1 = sha1(wkl.encode("utf-8")).hexdigest()
+                split_k = (
+                    1 if self._attrs["split_k"] is None else self._attrs["split_k"]
+                )
+                query = Conv3dQueryEntry(
+                    dtype_a=tmp_op.A.element.value,
+                    dtype_b=tmp_op.B.element.value,
+                    dtype_c=tmp_op.C.element.value,
+                    dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+                    major_a=tmp_op.A.layout.value,
+                    major_b=tmp_op.B.layout.value,
+                    major_c=tmp_op.C.layout.value,
+                    kd=self._attrs["KD"],
+                    kh=self._attrs["KH"],
+                    kw=self._attrs["KW"],
+                    co=self._attrs["CO"],
+                    stride_d=self._attrs["stride"][0],
+                    stride_h=self._attrs["stride"][1],
+                    stride_w=self._attrs["stride"][2],
+                    pad_d=self._attrs["pad"][0],
+                    pad_h=self._attrs["pad"][1],
+                    pad_w=self._attrs["pad"][2],
+                    dilate_d=self._attrs["dilate"][0],
+                    dilate_h=self._attrs["dilate"][1],
+                    dilate_w=self._attrs["dilate"][2],
+                    op_type=self._attrs["op"],
+                    device=target._arch,
+                    epilogue=tmp_op.epilogue_functor.value,
+                    split_k=split_k,
+                    exec_entry_sha1=exec_entry_sha1,
+                )
+                cache_value = target.query_profile_cache("conv3d", query.__dict__)
+                if cache_value is not None and not target.force_profile():
+                    _LOGGER.info(
+                        f'Load profiling result for {self._attrs["name"]} '
+                        f"from cache: {cache_value}",
+                    )
+                    best_algo, workspace = cache_value
+                    self._attrs["exec_path"][wkl] = best_algo
+                    self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+                else:
+                    # cache miss - we will have to generate and build profilers
+                    build_profiler = True
+        return build_profiler
+
+    def gen_profiler(
+        self,
+        workdir: str = None,
+        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+    ) -> None:
+        """Profiler generator.
+
+        Parameters
+        ----------
+        workdir : str, optional, by default None
+        dynamic_profiling_strategy: DynamicProfileStrategy, optional
+            A dynamic profiling strategy, used to filter generated profiles at compile time.
+            See also: :func:`~aitemplate.compiler.transform.profile.profile`
+        """
+        target = backend.target.Target.current()
+
+        func_key = "{target}.{op}.config".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
+
+        if self._should_build_profiler():
+            x_shapes = [
+                self._invert_exec_key(exec_key) for exec_key in self._attrs["exec_path"]
+            ]
+            self._attrs["op_instance"] = filter_op_instances(
+                func_attrs=self._attrs,
+                x_shapes=x_shapes,
+            )
+            return generate_profiler_sources(
+                func_attrs=self._attrs,
+                op_class="conv3d",
+                workdir=workdir,
+                shape_template=self.shape_eval_template,
+            )
+
+    def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
+        exe_path = os.path.join(profiler_prefix, cfg)
+        if not os.access(exe_path, os.X_OK):
+            raise RuntimeError("Profiler %s is not executable" % exe_path)
+        cmd = [exe_path]
+        cmd.append(x_shape[0])
+        cmd.append(x_shape[1])
+        cmd.append(x_shape[2])
+        cmd.append(x_shape[3])
+        cmd.append(x_shape[4])
+        cmd.append(self._attrs["KD"])
+        cmd.append(self._attrs["KH"])
+        cmd.append(self._attrs["KW"])
+        cmd.append(self._attrs["CO"])
+        cmd.append(self._attrs["stride"][0])
+        cmd.append(self._attrs["stride"][1])
+        cmd.append(self._attrs["stride"][2])
+        cmd.append(self._attrs["pad"][0])
+        cmd.append(self._attrs["pad"][1])
+        cmd.append(self._attrs["pad"][2])
+        cmd.append(self._attrs["dilate"][0])
+        cmd.append(self._attrs["dilate"][1])
+        cmd.append(self._attrs["dilate"][2])
+        cmd.append(self._attrs["group"])
+        command = [str(x) for x in cmd]
+        return command
+
+    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+        target = backend.target.Target.current()
+        # query cache
+        tmp_key = next(iter(self._attrs["op_instance"].keys()))
+        tmp_op = self._attrs["op_instance"][tmp_key]
+        exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
+        split_k = 1 if self._attrs["split_k"] is None else self._attrs["split_k"]
+        query = Conv3dQueryEntry(
+            dtype_a=tmp_op.A.element.value,
+            dtype_b=tmp_op.B.element.value,
+            dtype_c=tmp_op.C.element.value,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            major_a=tmp_op.A.layout.value,
+            major_b=tmp_op.B.layout.value,
+            major_c=tmp_op.C.layout.value,
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            co=self._attrs["CO"],
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            op_type=self._attrs["op"],
+            device=target._arch,
+            epilogue=tmp_op.epilogue_functor.value,
+            split_k=split_k,
+            exec_entry_sha1=exec_entry_sha1,
+        )
+        cache_value = target.query_profile_cache("conv3d", query.__dict__)
+        if cache_value is not None and not target.force_profile():
+            _LOGGER.info("Load profiling result from cache.")
+            return cache_value
+        if target.use_dummy_profiling_results():
+            op_type = self._attrs["op"]
+            raise Exception(
+                "This is a CI run but we could not find the following cache ",
+                f"available on device {target._arch}\n",
+                f"{op_type} {exec_entry_sha1}.\n",
+                "To bypass, you need to make it available in the db table.",
+            )
+
+        profiler_filename = get_profiler_filename(self._attrs, "conv3d")
+        runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
+        x_shape = self._invert_exec_key(exec_key)
+        command = self._gen_profile_cmd(profiler_prefix, profiler_filename, x_shape)
+        runner.push(profiler_filename, command)
+
+        runner.join()
+        result = runner.pull()
+        if len(result) == 0:
+            raise RuntimeError(
+                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
+            )
+        out = min(result, key=itemgetter(1))
+        best_algo = out[1].op_config
+        workspace = out[1].workspace
+        ## cache
+        cache_record = Conv3dRecordEntry(
+            exec_entry=exec_key,
+            exec_entry_sha1=exec_entry_sha1,
+            dtype_a=tmp_op.A.element.value,
+            dtype_b=tmp_op.B.element.value,
+            dtype_c=tmp_op.C.element.value,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            major_a=tmp_op.A.layout.value,
+            major_b=tmp_op.B.layout.value,
+            major_c=tmp_op.C.layout.value,
+            kd=self._attrs["KD"],
+            kh=self._attrs["KH"],
+            kw=self._attrs["KW"],
+            co=self._attrs["CO"],
+            stride_d=self._attrs["stride"][0],
+            stride_h=self._attrs["stride"][1],
+            stride_w=self._attrs["stride"][2],
+            pad_d=self._attrs["pad"][0],
+            pad_h=self._attrs["pad"][1],
+            pad_w=self._attrs["pad"][2],
+            dilate_d=self._attrs["dilate"][0],
+            dilate_h=self._attrs["dilate"][1],
+            dilate_w=self._attrs["dilate"][2],
+            op_type=self._attrs["op"],
+            epilogue=tmp_op.epilogue_functor.value,
+            device=target._arch,
+            algo=best_algo,
+            workspace=workspace,
+            split_k=split_k,  # todo add into profile
+        )
+        Target.current().insert_profile_cache("conv3d", cache_record.__dict__)
+        return (best_algo, workspace)
+
+    def _has_dynamic_input_dims(self):
+        for input_tensor in self._attrs["inputs"]:
+            for dim in input_tensor._attrs["shape"]:
+                if not isinstance(dim, IntImm):
+                    return True
+        return False
+
+    def profile(
+        self,
+        workdir="./",
+        devices=None,
+        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+    ):
+        if devices is None:
+            devices = [0]
+        self._profile_static(workdir, devices)
+
+        if self._has_dynamic_input_dims():
+            if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
+                raise NotImplementedError(
+                    "conv3d only supports HINTS dynamic profiling strategy for now! Current strategy: {}".format(
+                        dynamic_profiling_strategy
+                    )
+                )
+            self._profile_dynamic_dim(workdir)
+
+    def _profile_static(self, workdir, devices):
+        """Profiles with static shapes."""
+
+        workloads = list(self._attrs["exec_path"].keys())
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        if "op_instance" not in self._attrs or len(self._attrs["op_instance"]) == 0:
+            target = backend.target.Target.current()
+            # init candidate ops
+            func_key = "{target}.{op}.config".format(
+                target=target.name(), op=self._attrs["op"]
+            )
+            func = registry.get(func_key)
+            func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
+
+        for wkl in workloads:
+            _LOGGER.info(
+                "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
+            )
+            target = backend.target.Target.current()
+            # if in CI just choose minimal configs
+            # workspace is a hack just provides 102400 Byte
+            if target.use_dummy_profiling_results():
+                algo = target.select_minimal_algo(
+                    list(self._attrs["op_instance"].keys())
+                )
+                _LOGGER.info(f"Select minimal algo {algo} for CI")
+                self._attrs["exec_path"][wkl] = algo
+                self._attrs["workspace"] = 102400
+            elif self._attrs["exec_path"][wkl] == "":
+                best_algo, workspace = self._profile_single_workload(
+                    profiler_prefix, wkl, devices
+                )
+                self._attrs["exec_path"][wkl] = best_algo
+                self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
+
+    def _profile_dynamic_dim(self, workdir):
+        """Profiles with dynamic shapes."""
+
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
+        # extract dynamic dim from exec_path
+        if len(self._attrs["exec_path"]) <= 1:
+            return
+        if len(set(self._attrs["exec_path"].values())) <= 1:
+            # all exec paths point to the same algo
+            return
+
+        def _extract_dynamic_dim(exec_keys):
+            _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
+            var_dims = [[], [], [], [], []]
+            for key in exec_keys:
+                dims = self._invert_exec_key(key)
+                for i, v in enumerate(dims):
+                    var_dims[i].append(v)
+            return var_dims
+
+        dims = _extract_dynamic_dim(self._attrs["exec_path"].keys())
+        dim1 = dims[1][0]
+        dim2 = dims[2][0]
+        dim3 = dims[3][0]
+        dim4 = dims[4][0]
+        algos = list(self._attrs["exec_path"].values())
+        # generate region
+        regions = []  # lb, ub, lb_algos, ub_algos
+        for i in range(len(dims[0]) - 1):
+            regions.append([dims[0][i], dims[0][i + 1], algos[i], algos[i + 1]])
+        # for each region,
+        #   binary search to find cutting point
+        #   generate new exec
+        special_cases = OrderedDict()
+        new_exec_paths = OrderedDict()
+        for lb, ub, lb_algo, ub_algo in regions:
+            mid = (lb + ub) // 2
+            origin_lb = lb
+            origin_ub = ub
+            last_mid = mid
+            while mid > lb and mid < ub:
+                mid = (lb + ub) // 2
+                mid_shape = [mid, dim1, dim2, dim3, dim4]
+                _LOGGER.info(
+                    "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
+                        lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
+                    ),
+                )
+
+                # run the profiler binary with all ops on the mid_shape
+                # and fetch the results only for the lb_algo and ub_algo
+                profiler_filename = get_profiler_filename(self._attrs, "conv3d")
+                profiler_cmd = self._gen_profile_cmd(
+                    profiler_prefix, profiler_filename, mid_shape
+                )
+                runner.push(
+                    idx=profiler_filename,
+                    cmd=profiler_cmd,
+                    return_ops=[str(lb_algo), str(ub_algo)],
+                )
+                runner.join()
+                result = runner.pull()
+                result_dict = {res.op_config: res for res in result[0][1]}
+
+                assert len(result_dict) >= 1
+                # if there is only one result, assume ub algo failed.
+                if len(result_dict) == 1:
+                    assert str(ub_algo) not in result_dict
+                    # last_lb = lb
+                    lb = mid + 1
+                # if there are two result, compare to decide new lb/ub
+                else:
+                    lb_time = result_dict[str(lb_algo)].duration
+                    ub_time = result_dict[str(ub_algo)].duration
+                    if lb_time < ub_time:
+                        # lb algo can work with larger batch
+                        # last_lb = lb
+                        lb = mid + 1
+                    else:
+                        # ub algo can work with smaller batch
+                        # last_ub = ub
+                        ub = mid - 1
+                last_mid = mid
+                mid = (lb + ub) // 2
+            lo_region_key = self._gen_dyn_exec_key(
+                origin_lb, last_mid, dim1, dim2, dim3, dim4
+            )
+            up_region_key = self._gen_dyn_exec_key(
+                last_mid, origin_ub, dim1, dim2, dim3, dim4
+            )
+            new_exec_paths[lo_region_key] = lb_algo
+            new_exec_paths[up_region_key] = ub_algo
+            # find special cases
+            # This code is kept in case need fully tested dynamic code
+            # So far I find binary search works well.
+            # def _find_special_case(lb, ub, algo):
+            #     for i in range(lb + 1, ub + 1):
+            #         x_shape = [i, dim1, dim2, dim3, dim4]
+            #         cmd = self._gen_profile_cmd(profiler_prefix, str(algo), x_shape)
+            #         runner.push(0, cmd)
+            #         runner.join()
+            #         out = runner.pull()
+            #         if len(out) == 0:
+            #             _LOGGER.info(Find specail case: batch=%d" % i)
+            #             algo = self._profile_single_workload(profiler_prefix, x_shape, [0])
+            #             special_cases[self._gen_exec_key(x_shape)] = algo
+
+            # _LOGGER.info(
+            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=origin_lb,
+            #         ub=last_mid))
+            # _find_special_case(origin_lb, last_mid, lb_algo)
+            # _LOGGER.info(
+            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=last_mid + 1,
+            #         ub=origin_ub))
+            # _find_special_case(last_mid, origin_ub, ub_algo)
+        special_cases.update(new_exec_paths)
+        self._attrs["exec_path"] = special_cases
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        op_name = self._attrs["op"]
+        func_key = "{target}.{op}.gen_function".format(target=target.name(), op=op_name)
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
+            self.exec_cond_template,
+            self.shape_eval_template,
+            self.shape_save_template,
+        )
diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
index a0141cc7e..bc88b367c 100644
--- a/tests/unittest/ops/test_conv3d.py
+++ b/tests/unittest/ops/test_conv3d.py
@@ -37,6 +37,7 @@ def _test_conv3d(
         stride=(1, 1, 1),
         pad=(1, 1, 1),
         batch=4,
+        has_bias=False,
         test_name="conv3d",
         dtype="float16",
     ):
@@ -54,19 +55,38 @@ def _test_conv3d(
             name="input_1",
             is_input=True,
         )
-        OP = ops.conv3d(stride=stride, pad=pad, dilate=1)
-        Y = OP(X, W)
+        if has_bias:
+            B = Tensor(
+                shape=[co],
+                dtype=dtype,
+                name="input_2",
+                is_input=True,
+            )
+
+        if has_bias:
+            OP = ops.conv3d_bias(stride=stride, pad=pad, dilate=1)
+            Y = OP(X, W, B)
+        else:
+            OP = ops.conv3d(stride=stride, pad=pad, dilate=1)
+            Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{has_bias}")
 
         X_pt = get_random_torch_tensor([batch, ci, tt, hh, ww], dtype=dtype)
         W_pt = get_random_torch_tensor([co, ci, kt, kh, kw], dtype=dtype)
-        Y_pt = torch.nn.functional.conv3d(X_pt, W_pt, stride=stride, padding=pad)
+        B_pt = get_random_torch_tensor([co], dtype=dtype) if has_bias else None
+
+        Y_pt = torch.nn.functional.conv3d(
+            X_pt, W_pt, bias=B_pt, stride=stride, padding=pad
+        )
         x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
         y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
-        module.run_with_tensors({"input_0": x, "input_1": w}, [y])
+        inputs = {"input_0": x, "input_1": w}
+        if has_bias:
+            inputs["input_2"] = B_pt
+        module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 4, 1, 2, 3))
 
         if dtype == "float32":
@@ -75,20 +95,22 @@ def _test_conv3d(
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_fp16(self):
-        self._test_conv3d(
-            4,
-            224,
-            224,
-            8,
-            96,
-            3,
-            5,
-            5,
-            stride=(2, 4, 4),
-            pad=(1, 2, 2),
-            test_name="conv3d_fp16_1",
-            dtype="float16",
-        )
+        for has_bias in [True, False]:
+            self._test_conv3d(
+                4,
+                224,
+                224,
+                8,
+                96,
+                3,
+                5,
+                5,
+                stride=(2, 4, 4),
+                pad=(1, 2, 2),
+                test_name="conv3d_fp16_1",
+                dtype="float16",
+                has_bias=has_bias,
+            )
         self._test_conv3d(
             56,
             56,

From 09d7d24cda01be0e47aa4daf979c5e73e60b29af Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Fri, 10 Feb 2023 09:21:01 -0800
Subject: [PATCH 099/638] let conv3d accept tuple-like stride, padding, and
 dilation arguments (#237)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/237

We should let conv3d accept tuple-like `stride`, `padding`, and `dilation` arguments. This diff relaxed the restrictions in fx2ait.

Reviewed By: terrychenism

Differential Revision: D43174953

fbshipit-source-id: 374902b19dc705189eabe426b6d8601d1a52054d
---
 fx2ait/fx2ait/converters/ait_converters.py    | 14 ++--
 .../fx2ait/test/converters/test_ait_conv3d.py | 84 +++++++++++++++++++
 2 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index c094c3faa..975c15822 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1181,8 +1181,8 @@ def _choose_conv3d_op(
 
     if bias is not None:
         assert (
-            groups == weight._attrs["shape"][0]
-        ), "Currently only support channel == groups"
+            groups == weight._attrs["shape"][0].value()
+        ), f"Currently only support channel == groups, but got channel: {weight._attrs['shape'][0].value()} and groups: {groups}"
         return depthwise_conv3d(
             stride=stride, pad=pad, dilate=dilate, group=groups, bias=True
         )(x, weight, bias)
@@ -1210,13 +1210,9 @@ def acc_ops_conv3d(
     bias = kwargs["bias"]
     assert bias is None or isinstance(bias, AITTensor)
 
-    stride = identical_elem_tuple_to_int(kwargs["stride"])
-    padding = identical_elem_tuple_to_int(kwargs["padding"])
-    dilation = identical_elem_tuple_to_int(kwargs["dilation"])
-
-    assert all(
-        isinstance(x, int) for x in [stride, padding, dilation]
-    ), "Expected int stride, padding, and dilation"
+    stride = kwargs["stride"]
+    padding = kwargs["padding"]
+    dilation = kwargs["dilation"]
 
     groups = kwargs["groups"]
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index bd88f0952..d587a9eaf 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -47,6 +47,90 @@ class TestAitConv3d(AITTestCase):
                 w=224,
                 bias=False,
             ),
+            param(
+                name="conv3d_mvit_0",
+                kernel_size=3,
+                stride=(2, 4, 4),
+                padding=(1, 2, 2),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="conv3d_mvit_1",
+                kernel_size=3,
+                stride=(1, 1, 1),
+                padding=(1, 2, 2),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="conv3d_mvit_2",
+                kernel_size=3,
+                stride=(2, 8, 8),
+                padding=(1, 1, 1),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="conv3d_mvit_3",
+                kernel_size=3,
+                stride=(1, 4, 4),
+                padding=(1, 1, 1),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="conv3d_mvit_4",
+                kernel_size=3,
+                stride=(1, 2, 2),
+                padding=(1, 1, 1),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
+            param(
+                name="conv3d_mvit_5",
+                kernel_size=3,
+                stride=(1, 1, 1),
+                padding=(1, 1, 1),
+                dilation=(1, 1, 1),
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
         ]
     )
     def test_conv3d(

From f302c96b9593dcde5bb9997c13245eb6e3e2faf3 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Fri, 10 Feb 2023 09:35:02 -0800
Subject: [PATCH 100/638] add max_pool3d in fx2ait (#234)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/234

Add the max_pool3d support in fx2ait.

Reviewed By: frank-wei, terrychenism

Differential Revision: D43163131

fbshipit-source-id: c36cbe343f6fd13f3ba550d69a91904e9ccc526a
---
 fx2ait/fx2ait/converters/ait_converters.py    | 76 +++++++++++++++++++
 .../test/converters/test_ait_max_pool3d.py    | 44 +++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_max_pool3d.py

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 975c15822..5288cef15 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1219,6 +1219,82 @@ def acc_ops_conv3d(
     return _choose_conv3d_op(stride, padding, dilation, input_val, weight, bias, groups)
 
 
+@ait_converter(acc_ops.max_pool3d)
+def acc_ops_max_pool3d(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+
+    if (
+        isinstance(kwargs["kernel_size"], tuple)
+        and isinstance(kwargs["stride"], tuple)
+        and isinstance(kwargs["padding"], tuple)
+    ):
+        kernel_size_tuple = kwargs["kernel_size"]
+        stride_tuple = kwargs["stride"]
+        padding_tuple = kwargs["padding"]
+
+        assert kernel_size_tuple[0] == 1, "max_pool3d only supports kT == 1 currently"
+        assert stride_tuple[0] == 1, "max_pool3d only supports sT == 1 currently"
+        assert (
+            padding_tuple[0] == 0
+        ), "max_pool3d only supports T_padding == 0 currently"
+
+        kernel_size = identical_elem_tuple_to_int(kernel_size_tuple[1:])
+        stride = identical_elem_tuple_to_int(stride_tuple[1:])
+        padding = identical_elem_tuple_to_int(padding_tuple[1:])
+    elif (
+        isinstance(kwargs["kernel_size"], int)
+        and isinstance(kwargs["stride"], int)
+        and isinstance(kwargs["padding"], int)
+    ):
+        kernel_size = kwargs["kernel_size"]
+        stride = kwargs["stride"]
+        padding = kwargs["padding"]
+    else:
+        raise RuntimeError("Only int or tuple types are supported")
+
+    ceil_mode = kwargs["ceil_mode"]
+    return_indices = kwargs["return_indices"]
+    if ceil_mode or return_indices:
+        raise RuntimeError(
+            "Non-default ceil_mode/count_include_pad/divisor_override not supported yet"
+        )
+
+    N = input_val.shape()[0].value()
+    C = input_val.shape()[1].value()
+    D = input_val.shape()[2].value()
+    H = input_val.shape()[3].value()
+    W = input_val.shape()[4].value()
+
+    reshape_op_0 = reshape()
+    shape_0 = (N, C * D, H, W)
+    input_val = reshape_op_0(input_val, shape_0)
+
+    permute_op_0 = permute()
+    permutation_0 = [0, 2, 3, 1]
+    input_val = permute_op_0(input_val, permutation_0)
+
+    output = max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+
+    permute_op_1 = permute()
+    permutation_1 = [0, 3, 1, 2]
+    output = permute_op_1(output, permutation_1)
+
+    H_o = output.shape()[2].value()
+    W_o = output.shape()[3].value()
+    reshape_op_1 = reshape()
+    shape_1 = (N, C, D, H_o, W_o)
+
+    output = reshape_op_1(output, shape_1)
+    return output
+
+
 @ait_converter(acc_ops.max_pool2d)
 def acc_ops_max_pool2d(
     target: Target,
diff --git a/fx2ait/fx2ait/test/converters/test_ait_max_pool3d.py b/fx2ait/fx2ait/test/converters/test_ait_max_pool3d.py
new file mode 100644
index 000000000..f598bf2b0
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_max_pool3d.py
@@ -0,0 +1,44 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestMaxPool3dConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            (1, 1, 0),
+            ((1, 3, 3), (1, 2, 2), (0, 1, 1)),
+        ]
+    )
+    def test_avgpool3d(self, kernel_size, stride, padding):
+        class TestModule(torch.nn.Module):
+            def __init__(self, kernel_size, stride, padding):
+                super().__init__()
+                self.pool = torch.nn.MaxPool3d(kernel_size, stride, padding)
+
+            def forward(self, x):
+                return self.pool(x)
+
+        model = TestModule(kernel_size, stride, padding).half().cuda()
+        inputs = [torch.randn(1, 4, 8, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.max_pool3d},
+        )

From 256d08bb845de90f3c9b2ffeae88cebf1fc56b1b Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 10 Feb 2023 11:05:09 -0800
Subject: [PATCH 101/638] attention module (#222)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/222

Reviewed By: terrychenism, wushirong

Differential Revision: D43109561

fbshipit-source-id: a4f1e54300cdec374fe4f48e94f2288f7a8912ff
---
 .../test_ait_multihead_attention.py                |  4 ++--
 fx2ait/fx2ait/tools/common_fx2ait.py               | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
index 163d1f2e3..cdea5b34f 100644
--- a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -47,7 +47,7 @@ def forward(self, x):
                 acc_ops.unsqueeze,
                 acc_ops.getitem,
             },
-            transformer_mode=True,
+            leaf_module=torch.nn.MultiheadAttention,
         )
 
     def test_multihead_attention(self):
@@ -75,5 +75,5 @@ def forward(self, x):
             model,
             [x],
             expected_ops={torch.nn.MultiheadAttention},
-            transformer_mode=True,
+            leaf_module=torch.nn.MultiheadAttention,
         )
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 067eb54eb..de9058841 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -81,18 +81,21 @@ def run_test(
         precision: LowerPrecision = LowerPrecision.FP16,
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
-        transformer_mode: Optional[bool] = False,
         passes: List[Callable] = [],  # noqa: B006
+        leaf_module: Callable = None,  # one leaf module
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
         mod.eval()
+
+        leaf_module_list = []
+        if leaf_module:
+            leaf_module_list.append(leaf_module)
+
         mod = acc_tracer.trace(
             mod,
             inputs,
-            leaf_module_list=[
-                torch.nn.MultiheadAttention if transformer_mode else None
-            ],
+            leaf_module_list=leaf_module_list,
         )
         for p in passes:
             mod = p(mod, inputs)
@@ -155,10 +158,9 @@ def run_test(
             end_event.record()
             torch.cuda.synchronize()
             print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
-
             # PyTorch Transformer model would yield 2 output tensors, of which the second one is
             # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
-            if transformer_mode:
+            if leaf_module == torch.nn.MultiheadAttention:
                 ref_outputs = ref_outputs[0]
             if isinstance(outputs, torch.Tensor):
                 ref_outputs = [ref_outputs]

From 543b19ee936648611c40390a0b00e57ae1f3e312 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 10 Feb 2023 11:55:36 -0800
Subject: [PATCH 102/638] simplify test_power (#236)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/236

motivation: it's timing out on circleci which is a false alert.

https://app.circleci.com/pipelines/github/facebookincubator/AITemplate/397/workflows/f8ffcb15-64ef-4d7d-8d9b-659e04132578/jobs/610

also consumes a bit too much resources internally

Reviewed By: ipiszy, wushirong

Differential Revision: D43173676

fbshipit-source-id: 8ca3f2140a9a6ab019d1479ecdf2c39137482d0d
---
 tests/unittest/ops/test_fused_elementwise.py | 66 +++++++++++++-------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 162a21144..8c25bdcf6 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -16,6 +16,7 @@
 Unittests for fused_elementwise Operator.
 """
 
+import itertools
 import unittest
 from typing import List
 
@@ -34,6 +35,7 @@
     get_torch_full_tensor,
 )
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
 
 ait_dtype_to_pytorch = {"float16": torch.float16}
 if detect_target().name() != "rocm":
@@ -354,31 +356,47 @@ def _test_power(self, input_size, exp, test_name, ait_dtype):
         # t, _, _ = module.benchmark_with_tensors([x1_pt], [x2], count=1000)
         # bw = input_size[0] * input_size[1] * 2 * 2 / (t * 1e9 * 1e-3)
         # print(f"BW: {bw} GB/s")
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True))
+        torch.testing.assert_close(x2, x2_pt, atol=1e-3, rtol=1e-3, equal_nan=True)
 
-    def test_power(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            for i, exp in enumerate(
-                [0.0, 1.0, 2.0, 3.0, -2.0, 0.5, -0.5, -1.0, 2.5, -2.5]
-            ):
-                input_sizes = [1024, 22400]
-                self._test_power(
-                    input_sizes,
-                    exp,
-                    f"pow_{input_sizes[0]}_{input_sizes[1]}_{i}_{ait_dtype}",
-                    ait_dtype,
-                )
-
-            for i, exp in enumerate(
-                [0.0, 1.0, 2.0, 3.0, -2.0, 0.5, -0.5, -1.0, 2.5, -2.5]
-            ):
-                input_sizes = [1025, 22401]
-                self._test_power(
-                    input_sizes,
-                    exp,
-                    f"pow_{input_sizes[0]}_{input_sizes[1]}_{i}_{ait_dtype}",
-                    ait_dtype,
-                )
+    @parameterized.expand(
+        itertools.product(
+            (0, 1, -1, 0.5, -0.5, 2, -2, 1.4, 3),
+            ([1024, 1024], [1025, 1025]),
+        )
+    )
+    def test_power_float16(self, exp, shape):
+        dtype = "float16"
+        self._test_power(
+            shape,
+            exp,
+            f"pow_{shape[0]}_{shape[1]}_{exp}_{dtype}",
+            dtype,
+        )
+
+    @unittest.skipIf(
+        detect_target().name() != "cuda", "float32 dtype only supported in CUDA"
+    )
+    def test_power_float32(self):
+        self._test_power(
+            (1024, 1024),
+            2.5,
+            "pow_float32",
+            "float32",
+        )
+
+    @unittest.skipIf(
+        detect_target().name() != "cuda", "bfloat16 dtype only supported in CUDA"
+    )
+    @unittest.skipIf(
+        int(detect_target()._arch) < 80, "bfloat16 dtype only supported in CUDA sm80+"
+    )
+    def test_power_bfloat16(self):
+        self._test_power(
+            (1024, 1024),
+            1.2,
+            "pow_bfloat16",
+            "bfloat16",
+        )
 
     def _test_min_max(
         self,

From 22b916c2f72b20c43507a830f7908593d12408f5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 10 Feb 2023 13:22:14 -0800
Subject: [PATCH 103/638] Add deploy status to README (#242)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/242

Test Plan: {F869587146} Imported from GitHub, without a `Test Plan:` line.

Reviewed By: chenyang78

Differential Revision: D43194914

Pulled By: tenpercent

fbshipit-source-id: b987143b004d88e7080ef98c65c58c26bcfcf536
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c98b98db7..1184b0312 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![License](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE) |
 [![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yaml/badge.svg)](https://facebookincubator.github.io/AITemplate) |
 [![CircleCI](https://circleci.com/gh/facebookincubator/AITemplate.svg?style=svg)](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate)
-
+[![Deploy docs to Pages](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml/badge.svg)](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml)
 
 
From 807bf14b955fbc8163fcc3204af58020487b094e Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Fri, 10 Feb 2023 13:42:08 -0800
Subject: [PATCH 104/638] tweak env variable names (#240)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/240

Moving to industry-standard convensions of naming the env variables :)

Reviewed By: chenyang78, wushirong

Differential Revision: D43189636

fbshipit-source-id: c8e5f45163da2d508b6d5d55f4fe33a690d22507
---
 python/aitemplate/compiler/compiler.py     | 2 +-
 python/aitemplate/testing/detect_target.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index f9ea0f8ef..874254ba9 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -141,7 +141,7 @@ def compile_model(
     if constants is None:
         constants = {}
 
-    recompile = os.getenv("RECOMPILE", "1")
+    recompile = os.getenv("AIT_RECOMPILE", "1")
     graph = None
     # Super important: we cannot have commas in the test name.
     # We want to add a -Iworkdir/test_name flag to nvcc, but
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index e85a46217..38e251f20 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -81,7 +81,7 @@ def detect_target(**kwargs):
             return CUDA(arch=FLAG, **kwargs)
         else:
             return ROCM(arch=FLAG, **kwargs)
-    doc_flag = os.getenv("BUILD_DOCS", None)
+    doc_flag = os.getenv("AIT_BUILD_DOCS", None)
     if doc_flag is not None:
         return CUDA(arch="80", **kwargs)
     flag = _detect_cuda()

From 2774672c9765c477071247eedcd91f42b76c8696 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 10 Feb 2023 17:48:39 -0800
Subject: [PATCH 105/638] fix export pages workflow (#241)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/241

sphinx auto-upgrade broke docs build and pages export on GH; docs are already fixed

Reviewed By: ipiszy

Differential Revision: D43194515

fbshipit-source-id: 4716244be7f6b7bc10953ff97c7ce56929761b41
---
 .github/workflows/pages.yaml | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/pages.yaml b/.github/workflows/pages.yaml
index d9074b8b3..815cfd887 100644
--- a/.github/workflows/pages.yaml
+++ b/.github/workflows/pages.yaml
@@ -29,7 +29,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8"]
+        python-version: ["3.9"]
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -39,15 +39,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install autodocsumm
-          pip install sphinx_rtd_theme
-          pip install sphinx_gallery
-          pip install sphinxcontrib-inlinesyntaxhighlight
-          pip install sphinx_toolbox
-          pip install numpy
-          pip install jinja2
-          pip install torch
+          python3.9 -m pip install --upgrade pip
+          python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox jinja2 torch
           cd python
           python setup.py develop
           cd ..

From 262815bcf2bf76cc1addfc0fdbd555fccfba1689 Mon Sep 17 00:00:00 2001
From: Dongki Lee <123964440+asynclee@users.noreply.github.com>
Date: Fri, 10 Feb 2023 17:59:15 -0800
Subject: [PATCH 106/638] fix negative prompt error (#210)

Summary:
In the current code, if negative prompt comes in more than the set maximum length, there is no truncation option, causing errors in subsequent pipelines. Modify that part.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/210

Reviewed By: terrychenism

Differential Revision: D43175389

Pulled By: ipiszy

fbshipit-source-id: 155b143637d15af07bc92f6dd75c407a6ba04943
---
 .../05_stable_diffusion/src/pipeline_stable_diffusion_ait.py     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index ce744bff8..9e5f1e5c6 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -293,6 +293,7 @@ def __call__(
                 uncond_tokens,
                 padding="max_length",
                 max_length=max_length,
+                truncation=True,
                 return_tensors="pt",
             )
             uncond_embeddings = self.clip_inference(

From 0aa839cd1abdf58880b7029fee36cffd5811119e Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Fri, 10 Feb 2023 18:37:54 -0800
Subject: [PATCH 107/638] Add neg (#244)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/244

Support lowering torch.neg w/ fx2ait

Reviewed By: frank-wei, wushirong

Differential Revision: D43196238

fbshipit-source-id: 20a84535fec8e2b79c96286485326204a8de3a43
---
 fx2ait/fx2ait/converters/ait_converters.py          | 11 +++++++++++
 fx2ait/fx2ait/test/converters/test_ait_unary_ops.py |  1 +
 2 files changed, 12 insertions(+)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 5288cef15..99823b16b 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1453,3 +1453,14 @@ def math_sqrt(
     target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
 ) -> ConverterOutput:
     return create_unary_op(FuncEnum.SQRT, args, kwargs, name)
+
+
+@ait_converter(acc_ops.neg)
+def acc_ops_neg(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    neg_one = AITTensor(shape=[], dtype="float16", name="neg_one", value=-1.0)
+    return elementwise(FuncEnum.MUL)(input_val, neg_one)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index 0b75f85da..a743000c1 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -30,6 +30,7 @@
     (torch.cos, acc_ops.cos),
     (torch.sqrt, acc_ops.sqrt),
     (torch.clone, acc_ops.clone),
+    (torch.neg, acc_ops.neg),
 ]
 
 
From 0525e8ee5f61866f7253418a4982f68c1349941f Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Fri, 10 Feb 2023 20:33:29 -0800
Subject: [PATCH 108/638] allow users to pick the optimization level for the
 host code (#239)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/239

It is typical in our situation that an option --optimize <level> (-Ox) in a Makefile is for a HOST compiler. And the default -O3 does literally nothing except for the enormous compilation time.

So, it is safe to allow users to override this option in order to **significantly speedup** the compilation / testing, especially for very large models.

Use AIT_COMPILER_OPT environment variable, for example "AIT_COMPILER_OPT=-O1".

Reviewed By: frankgt40, chenyang78

Differential Revision: D43189546

fbshipit-source-id: 4d67cf295bf31b1053fe10816fb5646c4dda1f15
---
 python/aitemplate/backend/cuda/target_def.py |  7 +++--
 python/aitemplate/backend/rocm/target_def.py |  6 ++--
 python/aitemplate/utils/__init__.py          |  1 +
 python/aitemplate/utils/environ.py           | 29 ++++++++++++++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)
 create mode 100644 python/aitemplate/utils/environ.py

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index ee1998202..bbed5e0dd 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -31,6 +31,7 @@
 
 from aitemplate.backend.target import TargetType
 
+from ...utils import environ
 from ...utils.misc import is_debug
 
 from .. import registry
@@ -103,6 +104,7 @@ def _build_compile_options(self):
             ),
             os.path.join(self._template_path, "../cub"),
         ]
+
         options = [
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
             "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
@@ -113,7 +115,7 @@ def _build_compile_options(self):
             "-Xcompiler=-Wconversion",
             "-Xcompiler=-fno-strict-aliasing",
             "-Xcompiler -fvisibility=hidden",
-            "-O3",
+            environ.get_compiler_opt_level(),
             "-std=c++17",
             "--expt-relaxed-constexpr",
             "--use_fast_math",
@@ -252,6 +254,7 @@ def _build_compile_options(self):
             with open(fb_include_path, "w") as fb_include:
                 for arg in pp_args:
                     fb_include.write(pipes.quote(arg) + "\n")
+
             options = self.nvcc_options_json["args"] + [
                 "-I" + cutlass_path[0],
                 "-I" + cutlass_path[1],
@@ -276,7 +279,7 @@ def _build_compile_options(self):
                 "-gencode=arch=compute_%s,code=[sm_%s,compute_%s]"
                 % (self._arch, self._arch, self._arch),
                 "-Xcompiler=-Wconversion",
-                "-O3",
+                environ.get_compiler_opt_level(),
                 "-std=c++17",
             ]
             if self._ndebug == 1:
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index 9a7b2dd83..884eceebf 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -27,6 +27,8 @@
 
 from aitemplate.backend.target import AIT_STATIC_FILES_PATH
 
+from ...utils import environ
+
 from .. import registry
 from ..target import COMPOSABLE_KERNEL_PATH, Target
 
@@ -154,7 +156,7 @@ def _build_compile_options(self):
 
         ck_paths = self._get_ck_paths()
         options = [
-            "-O3",
+            environ.get_compiler_opt_level(),
             "-fPIC",
             "-fvisibility=hidden",
             "-std=c++17",
@@ -329,7 +331,7 @@ def _build_compile_options(self):
 
         ck_paths = self._get_ck_paths()
         options = self.hipcc_options_json["args"] + [
-            "-O3",
+            environ.get_compiler_opt_level(),
             "-fPIC",
             "-fvisibility=hidden",
             "-std=c++17",
diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index 6f57327ed..f527e0a88 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -17,6 +17,7 @@
 
 from . import (
     alignment,
+    environ,
     graph_utils,
     import_path,
     markdown_table,
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
new file mode 100644
index 000000000..d4ec9f193
--- /dev/null
+++ b/python/aitemplate/utils/environ.py
@@ -0,0 +1,29 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+
+
+def get_compiler_opt_level() -> str:
+    # The reason: it is typical in our situation that an option
+    # --optimize <level> (-Ox) is for a HOST compiler. And -O3 does
+    # literally nothing except for the enormous compilation time.
+    #
+    # So, it is safe to allow users to override this option in order
+    # to significantly speedup the computations / testing, especially
+    # for very large models.
+    compiler_opt = os.getenv("AIT_COMPILER_OPT", "-O3")
+
+    return compiler_opt

From 8385da310cefa9c2313eb3897bfafd9afec89d2c Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 10 Feb 2023 21:05:34 -0800
Subject: [PATCH 109/638] Fix the seed for test_bmm_rcr_n1 (#250)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/250

This test is flaky - I ran it twice, the first time succeed, but the second time failed

Reviewed By: tenpercent

Differential Revision: D43210593

fbshipit-source-id: 09e7b3b3b387168fc6daa90081d81f60fba66946
---
 tests/unittest/ops/test_bmm_rcr_n1.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/unittest/ops/test_bmm_rcr_n1.py b/tests/unittest/ops/test_bmm_rcr_n1.py
index 8497892bc..0e67119cf 100644
--- a/tests/unittest/ops/test_bmm_rcr_n1.py
+++ b/tests/unittest/ops/test_bmm_rcr_n1.py
@@ -28,6 +28,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMRcrN1TestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.test_count = 0

From 3d03931df72b19a038aad3bb3bf218021bc9c6b6 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Sat, 11 Feb 2023 21:39:39 -0800
Subject: [PATCH 110/638] support torch.unbind, torch.group_norm (#246)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/246

as titled

Reviewed By: terrychenism

Differential Revision: D43181992

fbshipit-source-id: b641713bd774cb7c7bf903f514bff5c87a6f3a33
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py           | 23 ++++++++++
 fx2ait/fx2ait/converters/ait_converters.py    | 46 +++++++++++++++++++
 .../test/converters/test_ait_group_norm.py    | 45 ++++++++++++++++++
 .../fx2ait/test/converters/test_ait_unbind.py | 37 +++++++++++++++
 .../compiler/ops/groupnorm/groupnorm.py       | 17 +++----
 python/aitemplate/compiler/public/__init__.py |  1 +
 6 files changed, 161 insertions(+), 8 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_group_norm.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_unbind.py

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 047e1d887..9b0316dcf 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -3226,6 +3226,29 @@ def clone(*, input):
     return torch.clone(input)
 
 
+@register_acc_op_mapping(op_and_target=("call_function", torch.unbind))
+@register_acc_op
+def unbind(*, input, dim=0):
+    return torch.unbind(input, dim=dim)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_function", torch.nn.functional.group_norm),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("num_groups", "num_groups"),
+        ("weight", "weight"),
+        ("bias", "bias"),
+        ("eps", "eps"),
+    ],
+)
+@register_acc_op
+def group_norm(*, input, num_groups, weight=None, bias=None, eps=1e-05):
+    return torch.nn.functional.group_norm(
+        input, num_groups, weight=weight, bias=bias, eps=eps
+    )
+
+
 ###############################################################################
 
 # Set ops as side-effectul, this prevents them from being optimized away or
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 99823b16b..2cc8c1a55 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -38,6 +38,7 @@
     gemm_rcr,
     gemm_rrr,
     getitem,
+    group_norm,
     IntImm,
     IntVar,
     IntVarTensor,
@@ -460,6 +461,32 @@ def acc_ops_size(
     return size()(input_val)
 
 
+@ait_converter(acc_ops.unbind)
+def acc_ops_unbind(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    dim = kwargs["dim"]
+    shape = input_val.shape()
+    res = []
+    for cnt in range(shape[dim].value()):
+        idx = []
+        for i in range(len(shape)):
+            if i != dim:
+                idx.append(slice(None, None, None))
+            else:
+                idx.append(cnt)
+        kwargs_new = {
+            "input": input_val,
+            "idx": tuple(idx),
+        }
+        res.append(acc_ops_getitem(target, args, kwargs_new, name))
+    return res
+
+
 @ait_converter(acc_ops.getitem)
 def acc_ops_getitem(
     target: Target,
@@ -826,6 +853,25 @@ def _get_dtype(dtype: str):
     )
 
 
+@ait_converter(acc_ops.group_norm)
+def acc_ops_group_norm(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    num_groups = kwargs["num_groups"]
+    weight_val = kwargs["weight"]
+    bias_val = kwargs["bias"]
+    eps_val = kwargs["eps"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    num_channels = input_val.shape()[-1].value()
+    op = group_norm(num_groups, num_channels)
+    return op(input_val, weight_val, bias_val, eps_val)
+
+
 @ait_converter(acc_ops.layer_norm)
 def acc_ops_layer_norm(
     target: Target,
diff --git a/fx2ait/fx2ait/test/converters/test_ait_group_norm.py b/fx2ait/fx2ait/test/converters/test_ait_group_norm.py
new file mode 100644
index 000000000..783cb28a5
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_group_norm.py
@@ -0,0 +1,45 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+from torch import nn
+
+
+class TestGroupNormTensor(AITTestCase):
+    @parameterized.expand(
+        [
+            [True],
+            [False],
+        ]
+    )
+    def test_group_norm(self, affine):
+        class GN(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.gn = torch.nn.GroupNorm(3, 6, affine=affine)
+
+            def forward(self, x):
+                return self.gn(x)
+
+        mod = GN().half().cuda()
+        inputs = [torch.randn(2, 6, 4, 5).half().cuda()]
+        self.run_test(
+            mod,
+            inputs,
+            expected_ops={},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unbind.py b/fx2ait/fx2ait/test/converters/test_ait_unbind.py
new file mode 100644
index 000000000..fc5735c92
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_unbind.py
@@ -0,0 +1,37 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.tools.common_fx2ait import AITTestCase
+from torch import nn
+
+
+class TestUnbindTensor(AITTestCase):
+    def test_unbind(self):
+        class GetItem(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                y = torch.unbind(x, dim=2)
+                z = y[0]
+                return z
+
+        mod = GetItem().half().cuda()
+        inputs = [torch.randn(2, 3, 4).half().cuda()]
+        self.run_test(
+            mod,
+            inputs,
+            expected_ops={},
+        )
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 6aa31a68f..119c34f76 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -66,14 +66,15 @@ def __init__(self, num_groups: int, num_channels: int) -> None:
     def check_shapes(x_shapes, gamma_shapes, beta_shapes, num_groups):
         # check last dim can be divided by num_groups
         # minimal group: 8
-        if len(gamma_shapes) != len(beta_shapes):
-            raise RuntimeError(
-                f"Gamma and beta must have the same number of dimensions, but got {len(gamma_shapes)} and {len(beta_shapes)}"
-            )
-        if x_shapes[-1].value() != gamma_shapes[0].value():
-            raise RuntimeError(
-                f"Input last dim {x_shapes[-1]} must be equal to gamma dim {gamma_shapes[0]}"
-            )
+        if gamma_shapes is not None and beta_shapes is not None:
+            if len(gamma_shapes) != len(beta_shapes):
+                raise RuntimeError(
+                    f"Gamma and beta must have the same number of dimensions, but got {len(gamma_shapes)} and {len(beta_shapes)}"
+                )
+            if x_shapes[-1].value() != gamma_shapes[0].value():
+                raise RuntimeError(
+                    f"Input last dim {x_shapes[-1]} must be equal to gamma dim {gamma_shapes[0]}"
+                )
         if x_shapes[-1].value() % num_groups != 0:
             raise RuntimeError(
                 f"Channel dim {gamma_shapes[0]} must be divisible by num_groups {num_groups}"
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index 425965be2..a6caf8b35 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -61,6 +61,7 @@
 from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
+from aitemplate.compiler.ops.groupnorm.groupnorm import group_norm
 from aitemplate.compiler.ops.layernorm.group_layernorm import group_layernorm
 from aitemplate.compiler.ops.layernorm.group_layernorm_sigmoid_mul import (
     group_layernorm_sigmoid_mul,

From 36412b6f842d280f6851a9677ee5d8f9c14030f4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 13 Feb 2023 09:46:17 -0800
Subject: [PATCH 111/638] serde with torch tensors (#204)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/204

numpy doesn't recognize bfloat16

Reviewed By: chenyang78

Differential Revision: D43016527

fbshipit-source-id: 67b99dd006ad3edb5b84ca9f87a19def0f3ec896
---
 python/aitemplate/compiler/base.py                | 15 ++++++++++++++-
 .../aitemplate/utils/serialization/ait_program.py |  3 ++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index d690315cd..62db1e4b1 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -299,7 +299,20 @@ def __init__(self, tensor):
         self.tensor = tensor
 
     def to_bytes(self) -> bytes:
-        return self.tensor.cpu().detach().numpy().tobytes()
+        if self.size() == 0:
+            return b""
+
+        import ctypes
+
+        t = self.tensor.contiguous().cpu().detach()
+        # We used to do tensor().numpy().tobytes() here,
+        # but numpy doesn't support bfloat16 natively,
+        # so we obtain the underlying C array.
+        # Results are flaky when tensor is not bound to a local variable.
+        raw_array = ctypes.cast(
+            t.data_ptr(), ctypes.POINTER(ctypes.c_ubyte * self.size())
+        )
+        return bytes(raw_array.contents)
 
     def size(self) -> int:
         """
diff --git a/python/aitemplate/utils/serialization/ait_program.py b/python/aitemplate/utils/serialization/ait_program.py
index 2ccb0addc..12e3068a4 100644
--- a/python/aitemplate/utils/serialization/ait_program.py
+++ b/python/aitemplate/utils/serialization/ait_program.py
@@ -23,6 +23,7 @@
     _TorchConstantTensorData,
 )
 from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 def convert_to_ait_const(const):
@@ -79,7 +80,7 @@ def set_all_random_constants(self, dtype="float16"):
         const_infos = self.get_constants()
         for k, v in const_infos.items():
             getattr(self, k)._bind_data(
-                _NumpyConstantTensorData(np.random.randn(*v).astype(dtype))
+                _TorchConstantTensorData(get_random_torch_tensor(v, dtype))
             )
 
     def model(self) -> Union[Tensor, Tuple[Tensor]]:

From 08bccc376840958bd2da17760470607e2254b880 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 13 Feb 2023 10:00:50 -0800
Subject: [PATCH 112/638] fix groupnorm with bfloat16 (#223)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/223

att

Reviewed By: chenyang78

Differential Revision: D43131198

fbshipit-source-id: af5c953da571f22470d52d330caded27f64f9fd5
---
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  3 +--
 .../backend/cuda/groupnorm/layer_norm.cuh     | 25 -------------------
 tests/unittest/ops/test_groupnorm.py          |  4 +--
 3 files changed, 3 insertions(+), 29 deletions(-)

diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index a99d9ff54..59bbf792e 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -894,7 +894,6 @@ void GroupNormForwardGpu(
     ComputeType* mean,
     ComputeType* inv_variance,
     bool channels_first) {
-  // using ComputeType = typename layer_norm::DefaultComputeType<T>::type;
   if (channels_first) {
     layer_norm::DirectLoad<T, ComputeType> load(x_ptr, norm_size);
     AffineStore<ComputeType, T, affine, FuseSwish> store(
@@ -950,7 +949,7 @@ void DispatchGroupNormForwardGpu(
     T2* mean,
     T2* inv_variance,
     bool channels_first) {
-  using ComputeType = typename layer_norm::DefaultComputeType<T>::type;
+  using ComputeType = T2;
   if (gamma_ptr != nullptr && beta_ptr != nullptr) {
     GroupNormForwardGpu<T, ComputeType, true, FuseSwish>(
         stream,
diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index e6c4595c3..af64e2a76 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -142,31 +142,6 @@ inline cudaError_t GetNumBlocks(
   return cudaSuccess;
 }
 
-template <typename T>
-struct DefaultComputeType {
-  using type = T;
-};
-
-template <>
-struct DefaultComputeType<half> {
-  using type = float;
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
-    (__CUDA_ARCH__ >= 800)
-template <>
-struct DefaultComputeType<bfloat16> {
-  using type = float;
-};
-#endif
-
-// #if CUDA_VERSION >= 11000
-// template<>
-// struct DefaultComputeType<nv_bfloat16> {
-//   using type = float;
-// };
-// #endif  // CUDA_VERSION >= 11000
-
 template <typename T>
 class HasCanPackAs {
   typedef char one;
diff --git a/tests/unittest/ops/test_groupnorm.py b/tests/unittest/ops/test_groupnorm.py
index 549ea357d..4b4258443 100644
--- a/tests/unittest/ops/test_groupnorm.py
+++ b/tests/unittest/ops/test_groupnorm.py
@@ -118,7 +118,7 @@ def _test_groupnorm(
         )
         self.test_count += 1
 
-    def test_groupnorm(self):
+    def test_groupnorm_float16(self):
         self._test_groupnorm()
         self._test_groupnorm(x_shape=[3, 3, 1, 4], num_groups=2, eps=1e-5)
         self._test_groupnorm(x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5)
@@ -165,7 +165,7 @@ def test_groupnorm_swish(self):
             )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    def test_float32(self):
+    def test_groupnorm_float32(self):
         # H % 8 != 0
         self._test_groupnorm(
             x_shape=[7, 13, 9, 12],

From 048385a81e535827ef273d9594929dfa9e9a8205 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang22@meta.com>
Date: Mon, 13 Feb 2023 10:53:16 -0800
Subject: [PATCH 113/638] Add elementwise SoftSign op (#253)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/253

A new elementwise SoftSign operation is added to AITemplate.

Reviewed By: aakhundov

Differential Revision: D43156082

fbshipit-source-id: 2c1608e19e750846f0540e609681a642e2a17c78
---
 python/aitemplate/backend/backend_spec.py     |  7 ++++
 .../backend/cuda/elementwise/custom_math.cuh  | 28 ++++++++++++++
 .../compiler/ops/common/epilogue.py           |  1 +
 python/aitemplate/compiler/ops/common/math.py |  4 ++
 tests/unittest/ops/test_activation.py         | 38 +++++++++++++++++++
 5 files changed, 78 insertions(+)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 7ca35a719..b5551f51d 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -258,6 +258,13 @@ class GPUBackendSpec(BackendSpec):
                 "bfloat16": "helu",
                 "float": "felu",
             },
+            FuncEnum.SOFTSIGN: {
+                "float": "fsoftsign",
+                "half": "hsoftsign",
+                "half2": "h2softsign",
+                "bfloat16": "hsoftsign",
+                "bfloat16_2": "h2softsign",
+            },
         }
     )
 
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 376506e73..71c7c68f4 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -920,4 +920,32 @@ __device__ bfloat16_2 h2elu(const bfloat16_2 op_input, const bfloat16_2 alpha) {
 #endif
 }
 
+__device__ half hsoftsign(const half a) {
+  return __hdiv(a, __hadd(CUDA_FP16_ONE, __habs(a)));
+}
+
+__device__ half2 h2softsign(const half2 a) {
+  return __h2div(a, __hadd2(half2(1.0, 1.0), __habs2(a)));
+}
+
+__device__ float fsoftsign(const float a) {
+  return a / (1.0f + fabsf(a));
+}
+
+__device__ bfloat16 hsoftsign(const bfloat16 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hdiv(a, __hadd(CUDA_BF16_ONE, __habs(a)));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ bfloat16_2 h2softsign(const bfloat16_2 a) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __h2div(a, __hadd2(bfloat16_2(1.0, 1.0), __habs2(a)));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 #endif
diff --git a/python/aitemplate/compiler/ops/common/epilogue.py b/python/aitemplate/compiler/ops/common/epilogue.py
index 7cac4fdee..4436fbfcf 100644
--- a/python/aitemplate/compiler/ops/common/epilogue.py
+++ b/python/aitemplate/compiler/ops/common/epilogue.py
@@ -62,3 +62,4 @@ class FuncEnum(Enum):
     FASTGELU = 24
     SOFTPLUS = 25
     ELU = 26
+    SOFTSIGN = 27
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index 4d40952f3..ddb6e4e94 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -105,3 +105,7 @@ def softplus(tensor: Any) -> Tensor:
 
 def elu(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("ELU")(tensor)
+
+
+def softsign(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("SOFTSIGN")(tensor)
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index a9528c1c4..d925ebd3c 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -255,6 +255,38 @@ def _test_elu(
             module.run_with_tensors([x1_pt], [x2])
             self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
+    def _test_softsign(
+        self,
+        input_shape,
+        test_name="softsign",
+        copy_op=False,
+    ):
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            X1 = Tensor(
+                shape=[IntImm(dim) for dim in input_shape],
+                dtype=dtype,
+                name="input",
+                is_input=True,
+            )
+            X2_op = ops.elementwise(FuncEnum.SOFTSIGN)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_shape, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.Softsign()
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
     def test_lrelu(self):
         self._test_leaky_relu([512, 512], test_name="leaky_relu_1")
         self._test_leaky_relu(
@@ -366,6 +398,12 @@ def test_elu(self):
             copy_op=True,
         )
 
+    def test_softsign(self):
+        self._test_softsign([61], test_name="softsign_1")
+        self._test_softsign([128], test_name="softsign_2")
+        self._test_softsign([128], test_name="softsign_3", copy_op=True)
+        self._test_softsign([121, 128], test_name="softsign_4")
+
 
 if __name__ == "__main__":
     unittest.main()

From 7b62fd60e8faa1772aec898f2eb20ed3ba4bcfab Mon Sep 17 00:00:00 2001
From: Chun-Wei Liu <chunwei@meta.com>
Date: Mon, 13 Feb 2023 13:21:34 -0800
Subject: [PATCH 114/638] Set ProfilerRunner timeout to default value 300
 (#248)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/248

This is the first diff to refactor timeout parameters in ProfilerRunner.

1/ Remove the hardcode 240 input argument and add a default value 300 in ProfilerRunner.
2/ Remove the hardcode 180 value by reusing the self._timeout value in backend/profiler_runner.py: https://fburl.com/code/u4vevwwx
3/ Add an env valuable to control the default timeout value

Reviewed By: ipiszy, wushirong

Differential Revision: D43204970

fbshipit-source-id: 7f1b04cb333dae1ff3b0dad6ea3d7c50b6f114a9
---
 python/aitemplate/backend/profiler_runner.py    | 6 +++---
 python/aitemplate/compiler/transform/profile.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 1be2adae7..0271790a0 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -236,16 +236,16 @@ class ProfilerRunner:
     however, the results are empirically better compared to the previous runner.
     """
 
-    def __init__(self, devices: List[str], timeout: int, postprocessing_delegate):
+    def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 300):
         """
         Parameters
         ----------
         devices : List[str]
             device identifiers (contents of {CUDA,HIP}_VISIBLE_DEVICES)
-        timeout : int
-            timeout to wait for all profilers completion in seconds
         postprocessing_delegate :
             object responsible for postprocessing results after futures completion
+        timeout : int
+            timeout to wait for all profilers completion in seconds
         """
         if devices is None:
             devices = [0]
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 2bc00b1d1..a87c5c190 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -107,7 +107,6 @@ def profile(
         )
     profiler_runner = ProfilerRunner(
         devices,
-        timeout=240,
         postprocessing_delegate=GemmProfilerPostprocessingDelegate(),
     )
     for f in gemms:

From cf588d6e6defd3120dd05fb794ea558a2fe5fa30 Mon Sep 17 00:00:00 2001
From: Chun-Wei Liu <chunwei@meta.com>
Date: Mon, 13 Feb 2023 13:21:34 -0800
Subject: [PATCH 115/638] Remove hardcode block_timeout in ProfilerRunner
 (#254)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/254

This is the second diff to refactor timeout parameters in ProfilerRunner.

1/ Remove the hardcode 240 input argument and add a default value 300 in ProfilerRunner.
2/ Remove the hardcode 180 value by reusing the self._timeout value in backend/profiler_runner.py: https://fburl.com/code/u4vevwwx
3/ Add an env valuable to control the default timeout value

Reviewed By: ipiszy

Differential Revision: D43205325

fbshipit-source-id: aed1d8c6640791b3c29c0113c14248ca3fbf9660
---
 python/aitemplate/backend/profiler_runner.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 0271790a0..3a4e11236 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -313,9 +313,6 @@ def join(self):
         done, not_done = concurrent.futures.wait(self._futures, self._timeout)
         for f in not_done:
             f.cancel()
-        # block until each done_callback completes,
-        # or raise Empty exception after 3 minutes of waiting
-        block_timeout = 3 * 60
         for _ in self._futures:
-            self._done_queue.get(timeout=block_timeout)
+            self._done_queue.get(timeout=self._timeout)
         self._postprocessing_delegate.postprocess_results()

From db67a1b24d0419405a2f218d293ecd0b28044567 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Mon, 13 Feb 2023 15:24:59 -0800
Subject: [PATCH 116/638] Revert D43109561: Multisect successfully blamed
 D43109561 for test or build failures (#257)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/257

This diff is reverting D43109561 (https://github.com/facebookincubator/AITemplate/commit/256d08bb845de90f3c9b2ffeae88cebf1fc56b1b)
D43109561 (https://github.com/facebookincubator/AITemplate/commit/256d08bb845de90f3c9b2ffeae88cebf1fc56b1b): [fx2ait] attention module by frank-wei has been identified to be causing the following test or build failures:

Tests affected:
- [ai_demos/server_model_zoo/models/uncrop:uncrop_push_blocking - testIndividualModel (ai_demos.server_model_zoo.models.uncrop.uncrop_test_model_instantiation.TestModelInstantiation)](https://www.internalfb.com/intern/test/844425009864384/)

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1568617
Here are the tasks that are relevant to this breakage:
T145356499: 1 test started failing for oncall aiacu in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Reviewed By: TianXieUSC

Differential Revision: D43204945

fbshipit-source-id: ca29233243e3b6568349913fc31594fd5c835d85
---
 .../test_ait_multihead_attention.py                |  4 ++--
 fx2ait/fx2ait/tools/common_fx2ait.py               | 14 ++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
index cdea5b34f..163d1f2e3 100644
--- a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -47,7 +47,7 @@ def forward(self, x):
                 acc_ops.unsqueeze,
                 acc_ops.getitem,
             },
-            leaf_module=torch.nn.MultiheadAttention,
+            transformer_mode=True,
         )
 
     def test_multihead_attention(self):
@@ -75,5 +75,5 @@ def forward(self, x):
             model,
             [x],
             expected_ops={torch.nn.MultiheadAttention},
-            leaf_module=torch.nn.MultiheadAttention,
+            transformer_mode=True,
         )
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index de9058841..067eb54eb 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -81,21 +81,18 @@ def run_test(
         precision: LowerPrecision = LowerPrecision.FP16,
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
+        transformer_mode: Optional[bool] = False,
         passes: List[Callable] = [],  # noqa: B006
-        leaf_module: Callable = None,  # one leaf module
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
         mod.eval()
-
-        leaf_module_list = []
-        if leaf_module:
-            leaf_module_list.append(leaf_module)
-
         mod = acc_tracer.trace(
             mod,
             inputs,
-            leaf_module_list=leaf_module_list,
+            leaf_module_list=[
+                torch.nn.MultiheadAttention if transformer_mode else None
+            ],
         )
         for p in passes:
             mod = p(mod, inputs)
@@ -158,9 +155,10 @@ def run_test(
             end_event.record()
             torch.cuda.synchronize()
             print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+
             # PyTorch Transformer model would yield 2 output tensors, of which the second one is
             # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
-            if leaf_module == torch.nn.MultiheadAttention:
+            if transformer_mode:
                 ref_outputs = ref_outputs[0]
             if isinstance(outputs, torch.Tensor):
                 ref_outputs = [ref_outputs]

From 2d8668d92937975461569194e3ff78475d766d19 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Mon, 13 Feb 2023 15:40:59 -0800
Subject: [PATCH 117/638] Switch Manifold path for AITemplate
 test_stable_diffusion (#261)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/261

Reviewed By: tenpercent

Differential Revision: D43255278

fbshipit-source-id: bd4abd78282c501a328977c8a735947a16b64925
---
 examples/05_stable_diffusion/src/test_correctness.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/05_stable_diffusion/src/test_correctness.py b/examples/05_stable_diffusion/src/test_correctness.py
index 01b93f24d..ab872149e 100644
--- a/examples/05_stable_diffusion/src/test_correctness.py
+++ b/examples/05_stable_diffusion/src/test_correctness.py
@@ -43,10 +43,10 @@ def __init__(self, *args, **kwargs):
             ).to("cuda")
         except OSError:
             if ManifoldClient is not None:
-                with ManifoldClient.get_client(bucket="aitemplate") as client:
+                with ManifoldClient.get_client(bucket="glow_test_data") as client:
                     await_sync(
                         client.getRecursive(
-                            manifold_path="tree/stable_diffusion/v2",
+                            manifold_path="tree/aitemplate/stable_diffusion/v2",
                             local_path=self.local_path,
                         )
                     )

From f951dd7336fc6244d7ecb40475d7afc8b5ae687c Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Mon, 13 Feb 2023 19:44:05 -0800
Subject: [PATCH 118/638] move around model files in OSS demos (#264)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/264

att

Reviewed By: hl475

Differential Revision: D43258981

fbshipit-source-id: b5841fe984e7526fe43a13c04d37b64282485135
---
 examples/02_detectron2/test_correctness.py | 4 ++--
 examples/03_bert/test_correctness.py       | 4 ++--
 examples/04_vit/test_correctness.py        | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/02_detectron2/test_correctness.py b/examples/02_detectron2/test_correctness.py
index 8a0e19d92..52d6174c2 100644
--- a/examples/02_detectron2/test_correctness.py
+++ b/examples/02_detectron2/test_correctness.py
@@ -257,10 +257,10 @@ def test_detectron2(self, config):
         if not os.path.exists(checkpoint_path):
             os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
             if ManifoldClient is not None:
-                with ManifoldClient.get_client("aitemplate") as client:
+                with ManifoldClient.get_client("glow_test_data") as client:
                     await_sync(
                         client.get(
-                            f"tree/detectron2/pickles/{config}_FPN_3x.pkl",
+                            f"tree/aitemplate/detectron2/pickles/{config}_FPN_3x.pkl",
                             checkpoint_path,
                         )
                     )
diff --git a/examples/03_bert/test_correctness.py b/examples/03_bert/test_correctness.py
index c80902e8c..7cf6d4201 100644
--- a/examples/03_bert/test_correctness.py
+++ b/examples/03_bert/test_correctness.py
@@ -37,10 +37,10 @@ def test_bert_base_uncased(self):
         if ManifoldClient is not None:
             model_path = "/tmp/aitemplate_bert/bert-base-uncased"
             os.makedirs(model_path, exist_ok=True)
-            with ManifoldClient.get_client(bucket="aitemplate") as client:
+            with ManifoldClient.get_client(bucket="glow_test_data") as client:
                 await_sync(
                     client.getRecursive(
-                        manifold_path="tree/bert/bert-base-uncased",
+                        manifold_path="tree/aitemplate/bert/bert-base-uncased",
                         local_path=model_path,
                     )
                 )
diff --git a/examples/04_vit/test_correctness.py b/examples/04_vit/test_correctness.py
index 745ab2d8e..8d30af6bb 100644
--- a/examples/04_vit/test_correctness.py
+++ b/examples/04_vit/test_correctness.py
@@ -93,7 +93,7 @@ def test_vit(self, model_name):
             num_heads = 12
             global_pool = "token"
             vit_pt_def = vit_base_patch16_224
-            path = "tree/vit-pt/vit_base_patch16_224.pt"
+            path = "tree/aitemplate/vit-pt/vit_base_patch16_224.pt"
 
         elif model_name == "vit_large_patch16_384":
             img_size = 384
@@ -101,12 +101,12 @@ def test_vit(self, model_name):
             embed_dim = 1024
             num_heads = 16
             vit_pt_def = vit_large_patch16_384
-            path = "tree/vit-pt/vit_large_patch16_384.pt"
+            path = "tree/aitemplate/vit-pt/vit_large_patch16_384.pt"
         if ManifoldClient is None:
             vit_pt = vit_pt_def(pretrained=True)
         else:
             stream = io.BytesIO()
-            with ManifoldClient.get_client(bucket="aitemplate") as client:
+            with ManifoldClient.get_client(bucket="glow_test_data") as client:
                 await_sync(
                     client.get(
                         path,

From 867beef1d7e3837a2fc34dac43d2562291bc303c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Mon, 13 Feb 2023 22:03:46 -0800
Subject: [PATCH 119/638] make a number of ops with respect to PyTorch tensor
 layouts (#227)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/227

PyTorch takes channel-first tensors for a number of ops such as conv/pooling/batch_norm, whereas AIT assumes channel-last tensors.

This change unified how we convert tensor layouts back and forth between PyTorch and AIT by applying permute ops to input values, weights and results appropriately.

Later, we will add a pass to remove redundant permute op pairs, e.g. permute([0, 3, 1, 2]) followed by permute([0, 2, 3, 1]), to improve performance.

We may need similar changes to the aten2ait converter.

Reviewed By: frank-wei, qxy11

Differential Revision: D43135925

fbshipit-source-id: acf68fa5e25108a59c91c5e57a0810ade58e7fa6
---
 fx2ait/fx2ait/converters/ait_converters.py    | 97 +++++++++++++------
 fx2ait/fx2ait/converters/utils.py             | 41 +++++++-
 .../converters_model/test_ait_vision_model.py |  1 -
 .../test_ait_adaptive_avg_pool2d.py           |  2 -
 .../test/converters/test_ait_avg_pool2d.py    |  2 -
 .../test/converters/test_ait_batch_norm.py    | 47 ++++++++-
 .../fx2ait/test/converters/test_ait_conv2d.py |  2 -
 .../fx2ait/test/converters/test_ait_conv3d.py |  2 -
 .../converters/test_ait_conv3d_depthwise.py   |  2 -
 .../converters/test_ait_convtranspose2d.py    |  4 -
 .../test/converters/test_ait_max_pool2d.py    |  2 -
 11 files changed, 151 insertions(+), 51 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 2cc8c1a55..72925d08e 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -70,6 +70,12 @@
 from .converter_registry import ait_converter
 
 from .utils import (
+    ait_ncdhw2ndhwc,
+    ait_nchw2nhwc,
+    ait_ncl2nlc,
+    ait_ndhwc2ncdhw,
+    ait_nhwc2nchw,
+    ait_nlc2ncl,
     create_binary_op,
     create_reduce_op,
     create_unary_op,
@@ -78,6 +84,8 @@
     ncdhw2ndhwc,
     nchw2nhwc,
     unify_dynamic_shape_name,
+    weight_ncdhw2ndhwc,
+    weight_nchw2nhwc,
 )
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -716,13 +724,13 @@ def acc_ops_conv_transpose2d(
     output_padding = identical_elem_tuple_to_int(kwargs["output_padding"])
     assert output_padding == 0, "output_padding is not 0!"
 
-    input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
-    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    weight = weight_nchw2nhwc(weight)
     weight._attrs["shape"] = nchw2nhwc(weight._attrs["shape"])
     w_last_dim = weight._attrs["data"].tensor.shape[-1]
 
@@ -817,7 +825,7 @@ def make_slice(x, slice_idx, name):
         ]
         result = concatenate()(conv_groups, dim=3)
 
-    return result
+    return ait_nhwc2nchw(result)
 
 
 @ait_converter(acc_ops.nan_to_num)
@@ -1055,10 +1063,21 @@ def acc_ops_batch_norm(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # TODO @qxy11: Update channels-last assumption once AIT backend is updated
     input_val = kwargs["input"]
+    input_shape = input_val._attrs["shape"]
+    input_rank = len(input_shape)
+    assert 2 <= input_rank <= 5, f"expected {input_rank=} to be within [2, 5]"
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    if input_rank == 3:
+        # BatchNorm1d
+        input_val = ait_ncl2nlc(input_val)
+    elif input_rank == 4:
+        # BatchNorm2d
+        input_val = ait_nchw2nhwc(input_val)
+    elif input_rank == 5:
+        # BatchNorm3d
+        input_val = ait_ncdhw2ndhwc(input_val)
 
     scale = elementwise(FuncEnum.DIV)(
         kwargs["weight"],
@@ -1068,8 +1087,30 @@ def acc_ops_batch_norm(
         ),
     )
     bias = elementwise(FuncEnum.SUB)(kwargs["bias"], kwargs["running_mean"])
-    matmul_result = elementwise(FuncEnum.MUL)(input_val, scale)
-    result = elementwise(FuncEnum.ADD)(matmul_result, bias)
+
+    scale_dim_val = scale._attrs["shape"][0].value()
+
+    # input is channel-last after permute
+    input_shape = input_val._attrs["shape"]
+    channel_dim = -1
+    assert isinstance(
+        input_shape[channel_dim], IntImm
+    ), f"expected channel at {channel_dim=} in {input_shape=} to be static"
+    channel_dim_val = input_shape[channel_dim].value()
+    assert (
+        channel_dim_val == scale_dim_val
+    ), f"expected {channel_dim_val=} to be the same as {scale_dim_val=}"
+    mul_result = elementwise(FuncEnum.MUL)(input_val, scale)
+    result = elementwise(FuncEnum.ADD)(mul_result, bias)
+    if input_rank == 3:
+        # BatchNorm1d
+        result = ait_nlc2ncl(result)
+    elif input_rank == 4:
+        # BatchNorm2d
+        result = ait_nhwc2nchw(result)
+    elif input_rank == 5:
+        # BatchNorm3d
+        result = ait_ndhwc2ncdhw(result)
     return result
 
 
@@ -1120,14 +1161,13 @@ def acc_ops_conv2d(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # TODO: qxy11: Update once channels-first format is supported
-    input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
-    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    weight = weight_nchw2nhwc(weight)
     weight._attrs["shape"] = nchw2nhwc(weight._attrs["shape"])
 
     bias = kwargs["bias"]
@@ -1206,6 +1246,7 @@ def make_slice(x, slice_idx, name):
         ]
         result = concatenate()(conv_groups, dim=3)
 
+    result = ait_nhwc2nchw(result)
     return result
 
 
@@ -1222,9 +1263,6 @@ def _choose_conv3d_op(
     Helper to choose conv3d vs. depthwise_conv3d op based on existence of bias
     and groups
     """
-    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 4, 1)
-    weight._attrs["shape"] = ncdhw2ndhwc(weight._attrs["shape"])
-
     if bias is not None:
         assert (
             groups == weight._attrs["shape"][0].value()
@@ -1246,12 +1284,14 @@ def acc_ops_conv3d(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    input_val = kwargs["input"]
+    input_val = ait_ncdhw2ndhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
+    weight = weight_ncdhw2ndhwc(weight)
+    weight._attrs["shape"] = ncdhw2ndhwc(weight._attrs["shape"])
 
     bias = kwargs["bias"]
     assert bias is None or isinstance(bias, AITTensor)
@@ -1262,7 +1302,10 @@ def acc_ops_conv3d(
 
     groups = kwargs["groups"]
 
-    return _choose_conv3d_op(stride, padding, dilation, input_val, weight, bias, groups)
+    result = _choose_conv3d_op(
+        stride, padding, dilation, input_val, weight, bias, groups
+    )
+    return ait_ndhwc2ncdhw(result)
 
 
 @ait_converter(acc_ops.max_pool3d)
@@ -1322,15 +1365,11 @@ def acc_ops_max_pool3d(
     shape_0 = (N, C * D, H, W)
     input_val = reshape_op_0(input_val, shape_0)
 
-    permute_op_0 = permute()
-    permutation_0 = [0, 2, 3, 1]
-    input_val = permute_op_0(input_val, permutation_0)
+    input_val = ait_nchw2nhwc(input_val)
 
     output = max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
 
-    permute_op_1 = permute()
-    permutation_1 = [0, 3, 1, 2]
-    output = permute_op_1(output, permutation_1)
+    output = ait_nhwc2nchw(output)
 
     H_o = output.shape()[2].value()
     W_o = output.shape()[3].value()
@@ -1348,8 +1387,7 @@ def acc_ops_max_pool2d(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # TODO: @qxy11 Update once NCHW supported
-    input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
@@ -1362,7 +1400,8 @@ def acc_ops_max_pool2d(
         raise RuntimeError(
             "Non-default ceil_mode/count_include_pad/divisor_override not supported yet"
         )
-    return max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+    result = max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+    return ait_nhwc2nchw(result)
 
 
 @ait_converter(acc_ops.avg_pool2d)
@@ -1372,8 +1411,7 @@ def acc_ops_avg_pool2d(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # TODO: @qxy11 Update once NCHW supported
-    input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
@@ -1387,7 +1425,8 @@ def acc_ops_avg_pool2d(
         raise RuntimeError(
             "Non-default ceil_mode/count_include_pad/divisor_override not supported yet"
         )
-    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+    result = avg_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
+    return ait_nhwc2nchw(result)
 
 
 @ait_converter(acc_ops.adaptive_avg_pool2d)
@@ -1397,8 +1436,7 @@ def acc_ops_adaptive_avg_pool2d(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # TODO: @qxy11 Update once NCHW supported
-    input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     output_size = identical_elem_tuple_to_int(kwargs["output_size"])
@@ -1416,7 +1454,8 @@ def acc_ops_adaptive_avg_pool2d(
     stride = HI // output_size
     kernel_size = HI - (output_size - 1) * stride
 
-    return avg_pool2d(kernel_size=kernel_size, stride=stride, pad=0)(input_val)
+    result = avg_pool2d(kernel_size=kernel_size, stride=stride, pad=0)(input_val)
+    return ait_nhwc2nchw(result)
 
 
 @ait_converter(acc_ops.contiguous)
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index 696df2d60..96f0fbd42 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -18,7 +18,12 @@
 
 from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor
 
-from aitemplate.compiler.public import elementwise, FuncEnum, Tensor as AITTensor
+from aitemplate.compiler.public import (
+    elementwise,
+    FuncEnum,
+    permute,
+    Tensor as AITTensor,
+)
 from torch.fx.node import Argument
 
 
@@ -142,6 +147,40 @@ def ncdhw2ndhwc(shape: List[Union[int, IntVar]]) -> List[Union[int, IntVar]]:
     return [shape[0], shape[2], shape[3], shape[4], shape[1]]
 
 
+def weight_nchw2nhwc(weight: AITTensor) -> None:
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 1)
+    return weight
+
+
+def weight_ncdhw2ndhwc(weight: AITTensor) -> None:
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 4, 1)
+    return weight
+
+
+def ait_ncl2nlc(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 2, 1])
+
+
+def ait_nlc2ncl(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 2, 1])
+
+
+def ait_nchw2nhwc(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 2, 3, 1])
+
+
+def ait_nhwc2nchw(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 3, 1, 2])
+
+
+def ait_ncdhw2ndhwc(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 2, 3, 4, 1])
+
+
+def ait_ndhwc2ncdhw(ait_tensor: AITTensor) -> AITTensor:
+    return permute()(ait_tensor, [0, 4, 1, 2, 3])
+
+
 # TODO:  This is a hack to workaround AIT's dynamic shape requirement.
 # Detailed explanation can be found in D41743385 (aten2ait) D41974191(fx2ait).
 # We will throw this one after AIT provides vanilla support.
diff --git a/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
index 4afa00218..3a0a72d02 100644
--- a/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
+++ b/fx2ait/fx2ait/test/converters/converters_model/test_ait_vision_model.py
@@ -35,6 +35,5 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={},
-            permute_inputs=[0, 2, 3, 1],
             permute_outputs=None,
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
index db535e1d2..3fab2361f 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_adaptive_avg_pool2d.py
@@ -47,6 +47,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.adaptive_avg_pool2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
index 846ec3987..781ed8ce5 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_avg_pool2d.py
@@ -42,6 +42,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.avg_pool2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
index 221511f06..017efcfd5 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_batch_norm.py
@@ -19,8 +19,32 @@
 from fx2ait.tools.common_fx2ait import AITTestCase
 
 
-class TestAdaptiveAvgPool2dConverter(AITTestCase):
-    def test_batch_norm(self):
+class TestBatchNormConverter(AITTestCase):
+    def test_batch_norm1d(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm1d(3)
+
+            def forward(self, x):
+                return self.bn(x)
+
+        model = TestModule().half().cuda()
+        inputs1 = [torch.randn(5, 3).cuda().half()]
+        self.run_test(
+            model,
+            inputs1,
+            expected_ops={acc_ops.batch_norm},
+        )
+
+        inputs2 = [torch.randn(5, 3, 234).cuda().half()]
+        self.run_test(
+            model,
+            inputs2,
+            expected_ops={acc_ops.batch_norm},
+        )
+
+    def test_batch_norm2d(self):
         class TestModule(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -35,6 +59,21 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.batch_norm},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
+        )
+
+    def test_batch_norm3d(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm3d(6)
+
+            def forward(self, x):
+                return self.bn(x)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(4, 6, 24, 24, 11).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={acc_ops.batch_norm},
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
index eaaa7bf30..b1eb84704 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
@@ -59,6 +59,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.conv2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index d587a9eaf..0d94f4b09 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -173,6 +173,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.conv3d},
-            permute_inputs=[0, 2, 3, 4, 1],  # inputs should be NDHWC
-            permute_outputs=[0, 4, 1, 2, 3],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
index 9dbd5ef97..54e1b5cd3 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
@@ -109,6 +109,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.conv3d},
-            permute_inputs=[0, 2, 3, 4, 1],  # inputs should be NDHWC
-            permute_outputs=[0, 4, 1, 2, 3],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
index 59d761698..33b9e9d6e 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
@@ -65,8 +65,6 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.conv_transpose2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )
 
     # only works when in_ch == out_ch
@@ -105,6 +103,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.conv_transpose2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
index aca28f8b5..81ac49e4c 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_max_pool2d.py
@@ -42,6 +42,4 @@ def forward(self, x):
             model,
             inputs,
             expected_ops={acc_ops.max_pool2d},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )

From eea8699d8f83b6bdd3b8b8dd6b9319a5be52bd03 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Mon, 13 Feb 2023 22:35:07 -0800
Subject: [PATCH 120/638] Add check for MHA converter (#260)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/260

Add check to avoid kernel execution be ignore silently

Reviewed By: frank-wei

Differential Revision: D43248480

fbshipit-source-id: 684ddd890cd3daa3233c31fb35976a4f4830c802
---
 fx2ait/fx2ait/converters/ait_module_converters.py            | 5 +++++
 .../converters_module/test_ait_multihead_attention.py        | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index a869b18ae..792444be0 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -40,6 +40,11 @@ def multi_head_attention_module(
     value = kwargs["value"] if "value" in kwargs else args[2]
     bsz, seq_len_q, dim = query.shape()
     _, seq_len, _ = key.shape()
+    # TODO update check condition once AIT backend ease kAlignment check condition
+    if submod.num_heads % 8 != 0:
+        raise ValueError(
+            f"The number of heads for MHA module is not supported:{submod.num_heads}"
+        )
     attn = nn.CrossAttention(
         dim=submod.embed_dim,
         seq_len=seq_len_q.value(),
diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
index 163d1f2e3..05bd3be2c 100644
--- a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -35,7 +35,7 @@ def forward(self, x):
 
                 return self.attn(query=unsqueeze, key=layer_norm, value=layer_norm)
 
-        seq_len_q, dim, nheads = 4, 16, 2
+        seq_len_q, dim, nheads = 4, 256, 16
         model = TestModule(dim, nheads).half().cuda()
         input_q = torch.randn(128, seq_len_q, dim).cuda().half()
         self.run_test(

From f45e68a5d965604e9e4d8fd98ad0b2ed3606a6c6 Mon Sep 17 00:00:00 2001
From: Oleg Khabinov <khabinov@meta.com>
Date: Mon, 13 Feb 2023 23:03:43 -0800
Subject: [PATCH 121/638] Re-enable main fx2ait tests (#266)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/266

Reviewed By: yinghai, wushirong

Differential Revision: D43263441

fbshipit-source-id: bb4dd6f8abfae15049db5fb3328d0d5ff4a66128
---
 fx2ait/fx2ait/test/test_fx2ait.py | 42 +++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
index d0d2a80b3..6dd503979 100644
--- a/fx2ait/fx2ait/test/test_fx2ait.py
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -14,6 +14,7 @@
 #
 import io
 import os
+import tempfile
 import unittest
 
 import torch
@@ -21,34 +22,49 @@
 from fx2ait.ait_module import AITModule
 from fx2ait.fx2ait import AITInterpreter
 
-torch.ops.load_library("build/libait_model.so")
+OSS_AIT_MODEL = False
+try:
+    torch.ops.load_library("//deeplearning/ait:AITModel")
+except Exception:
+    torch.ops.load_library("build/libait_model.so")
+    OSS_AIT_MODEL = True
+
+AIT_MODEL_CLASS = (
+    torch.classes.ait.AITModel if OSS_AIT_MODEL else torch.classes.fb.AITModel
+)
 
 
 class TestAITModule(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_fx2ait_impl(self, test_serialization=False, test_cuda_graph=False):
         class TestModule(torch.nn.Module):
-            def forward(self, x):
-                sigmoid = torch.sigmoid(x)
-                add = sigmoid * sigmoid
-                return add
+            def forward(self, x, y):
+                add = x + y
+                mul = add * add
+                return mul
 
-        inputs = [torch.ones(2, 2).cuda().half()]
-        mod = TestModule().cuda().half()
+        inputs = [torch.randn(2, 2).half().cuda(), torch.randn(2, 2).half().cuda()]
+        mod = TestModule().half().cuda()
         ref_output = mod(*inputs)
 
         traced = acc_tracer.trace(mod, inputs)
 
-        interp = AITInterpreter(traced, inputs, "./tmp", "test")
+        ait_dump_dir = tempfile.mkdtemp(prefix="test_fx2ait_", dir="/tmp")
+
+        interp = AITInterpreter(traced, inputs, ait_dump_dir, "test")
         interp_result = interp.run()
         ait_mod = AITModule(
-            torch.classes.ait.AITModel(
+            AIT_MODEL_CLASS(
                 interp_result.engine.lib_path,
                 interp_result.input_names,
                 interp_result.output_names,
                 torch.float16,
                 torch.float16,
                 1,  # num_runtimes
-            ),
+            )
         )
         ait_mod.engine.use_cuda_graph = test_cuda_graph
         if test_serialization:
@@ -56,10 +72,10 @@ def forward(self, x):
             # Have to JIT-ify the module before we can save/load it.
             ait_mod = torch.jit.trace(ait_mod, inputs)
             script_output = ait_mod(*inputs)
-            torch.testing.assert_close(script_output, ref_output, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(script_output, ref_output, atol=0.1, rtol=0.1)
             torch.jit.save(ait_mod, buf)
             buf.seek(0)
-            torch.classes.ait.AITModel.register_library_name_to_path_map(
+            AIT_MODEL_CLASS.register_library_name_to_path_map(
                 {
                     os.path.basename(
                         interp_result.engine.lib_path
@@ -68,7 +84,7 @@ def forward(self, x):
             )
             ait_mod = torch.jit.load(buf)
         ait_output = ait_mod(*inputs)
-        torch.testing.assert_close(ait_output, ref_output, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
 
     def test_fx2ait(self):
         self._test_fx2ait_impl(test_serialization=False)

From 8ec06a253a6ce499ce0baba04881304562a1934b Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Mon, 13 Feb 2023 23:34:18 -0800
Subject: [PATCH 122/638] Ban merge split concat pass when tensor is output
 (#249)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/249

Currently when we merge split+concat and concat+split, we don't consider if the first op is output. If it's output, actually, we should keep that op and perform no fusion.

This diff add the is_output check to that pass to avoid output tensor get eliminated.

Reviewed By: chenyang78, wushirong

Differential Revision: D43163853

fbshipit-source-id: 8a5ebffd145b1c597dd655efc01cbfbb50a7567b
---
 .../transform/transform_memory_ops.py         |   9 +
 .../compiler/test_transform_memory_ops.py     | 210 ++++++++++++++++++
 2 files changed, 219 insertions(+)

diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index 3c426f97e..34e1c68a4 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -46,6 +46,11 @@ def _eliminate_cat(sorted_graph: List[Tensor]) -> List[Tensor]:
             single_input_cat_ops.append(op)
 
     for op in single_input_cat_ops:
+        input_tensor = op._attrs["inputs"][0]
+        output_tensor = op._attrs["outputs"][0]
+        # tensor can not be input and output
+        if output_tensor._attrs["is_output"] and input_tensor._attrs["is_input"]:
+            continue
         transform_utils.remove_single_tensor_op_from_sorted_graph(op)
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
@@ -145,6 +150,10 @@ def _merge_split_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noqa: C
             if len(output_t._attrs["dst_ops"]) > 1:
                 found_cat_op = False
                 break
+            # If first op is output, it can't be fused.
+            if output_t._attrs["is_output"]:
+                found_cat_op = False
+                continue
             next_ops = output_t._attrs["dst_ops"]
             if len(next_ops) != 1:
                 break
diff --git a/tests/unittest/compiler/test_transform_memory_ops.py b/tests/unittest/compiler/test_transform_memory_ops.py
index 59132feb4..e069c26e1 100644
--- a/tests/unittest/compiler/test_transform_memory_ops.py
+++ b/tests/unittest/compiler/test_transform_memory_ops.py
@@ -260,6 +260,216 @@ def test_cat_cat_elimination_e2e(self, dtype):
         )
         self.assertTrue(torch.allclose(out_pt0, out0, atol=1e-1, rtol=1e-2))
 
+    def _prepare_skip_cat_elimination_graph(self, dtype="float16"):
+        X0 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=self.M),
+                IntImm(value=self.N),
+            ],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X1 = ops.concatenate()([X0], dim=1)
+        X2 = ops.concatenate()([X1], dim=2)
+        X3 = ops.concatenate()([X2, X1], dim=1)
+        X1._attrs["name"] = "output0"
+        X1._attrs["is_output"] = True
+        X3._attrs["name"] = "output1"
+        X3._attrs["is_output"] = True
+        return X1, X3
+
+    def test_skip_cat_elimination_graph_transformation(self):
+        OUTPUT = self._prepare_skip_cat_elimination_graph()
+        graph = transform.toposort(OUTPUT)
+        transform.name_graph(graph)
+        transform.mark_param_tensor(graph)
+        self.assertEqual(len(graph), 4)
+        graph = transform.transform_memory_ops(graph)
+        print(graph)
+        self.assertEqual(len(graph), 3)
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_skip_cat_elimination_e2e(self, dtype):
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_skip_cat_elimination_graph(dtype)
+        module = compile_model(OUTPUT, target, "./tmp", f"skip_cat_elimination_{dtype}")
+
+        x0_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
+        out0_pt = torch.cat([x0_pt], dim=1)
+        out1_pt = torch.cat([x0_pt, x0_pt], dim=1)
+
+        out0 = get_torch_empty_tensor(out0_pt.size(), dtype)
+        out1 = get_torch_empty_tensor(out1_pt.size(), dtype)
+        module.run_with_tensors([x0_pt], [out0, out1])
+        self.assertTrue(torch.allclose(out0_pt, out0, atol=1e-1, rtol=1e-2))
+        self.assertTrue(torch.allclose(out1_pt, out1, atol=1e-1, rtol=1e-2))
+
+    def _prepare_skip_split_cat_elimination_graph(self, dtype="float16"):
+        X0 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=self.M),
+                IntImm(value=self.N),
+            ],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        [X1, X2] = ops.split()(X0, int(self.M / 2), dim=1)
+        X3 = ops.concatenate()([X1, X2], dim=1)
+        [X4, X5] = ops.split()(X3, int(self.N / 2), dim=2)
+        X6 = ops.concatenate()([X4, X5], dim=1)
+        X3._attrs["name"] = "output0"
+        X3._attrs["is_output"] = True
+        X6._attrs["name"] = "output1"
+        X6._attrs["is_output"] = True
+
+        return [X3, X6]
+
+    def test_skip_split_cat_elimination_graph_transformation(self):
+        OUTPUT = self._prepare_skip_split_cat_elimination_graph()
+        graph = transform.toposort(OUTPUT)
+        transform.name_graph(graph)
+        transform.mark_param_tensor(graph)
+        self.assertEqual(len(graph), 7)
+        graph = transform.transform_memory_ops(graph)
+        self.assertEqual(len(graph), 5)
+
+    @parameterized.expand([("float16")])  # , ("float")])
+    def test_skip_split_cat_elimination_e2e(self, dtype):
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_skip_split_cat_elimination_graph(dtype)
+        module = compile_model(
+            OUTPUT, target, "./tmp", f"skip_split_cat_elimination_{dtype}"
+        )
+
+        x0_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N], dtype)
+        out_pt0 = x0_pt
+        x4_pt, x5_pt = torch.split(x0_pt, int(self.N / 2), dim=2)
+        out_pt1 = torch.cat([x4_pt, x5_pt], dim=1)
+
+        out0 = get_torch_empty_tensor(out_pt0.size(), dtype)
+        out1 = get_torch_empty_tensor(out_pt1.size(), dtype)
+        module.run_with_tensors(
+            {"input0": x0_pt},
+            {"output0": out0, "output1": out1},
+        )
+        self.assertTrue(torch.allclose(out_pt0, out0, atol=1e-1, rtol=1e-2))
+        self.assertTrue(torch.allclose(out_pt1, out1, atol=1e-1, rtol=1e-2))
+
+    def _prepare_skip_cat_cat_elimination_graph(self, dtype="float16"):
+        X0 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch0")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=int(self.M / 2)),
+                IntImm(value=self.N),
+            ],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch1")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=int(self.M / 2)),
+                IntImm(value=self.N),
+            ],
+            dtype=dtype,
+            name="input1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch2")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=self.M),
+                IntImm(value=self.N + 4),
+            ],
+            dtype=dtype,
+            name="input2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[
+                IntVar(values=[1, self.BATCH_SIZE], name="input_batch3")
+                if self.USE_DYNAMIC_BATCH
+                else IntImm(value=self.BATCH_SIZE),
+                IntImm(value=self.M),
+                IntImm(value=self.N * 2),
+            ],
+            dtype=dtype,
+            name="input3",
+            is_input=True,
+        )
+
+        X5 = ops.concatenate()([X0, X1], dim=1)
+        X6 = ops.concatenate()([X5, X2], dim=2)
+        X7 = ops.concatenate()([X3, X6], dim=2)
+        X8 = ops.concatenate()([X7, X2], dim=2)
+        X6._attrs["name"] = "output0"
+        X6._attrs["is_output"] = True
+        X8._attrs["name"] = "output1"
+        X8._attrs["is_output"] = True
+
+        return [X6, X8]
+
+    def test_skip_cat_cat_elimination_graph_transformation(self):
+        OUTPUT = self._prepare_skip_cat_cat_elimination_graph()
+        graph = transform.toposort(OUTPUT)
+        transform.name_graph(graph)
+        transform.mark_param_tensor(graph)
+        self.assertEqual(len(graph), 8)
+        self.assertEqual(len(graph_utils.get_sorted_ops(graph)), 4)
+        graph = transform.transform_memory_ops(graph)
+        self.assertEqual(len(graph), 7)
+        self.assertEqual(len(graph_utils.get_sorted_ops(graph)), 3)
+
+    @parameterized.expand([("float16"), ("float")])
+    def test_skip_cat_cat_elimination_e2e(self, dtype):
+        target = detect_target()
+        if dtype == "float" and target.name == "rocm":
+            self.skipTest("float tensors not supported by ROCM")
+        OUTPUT = self._prepare_skip_cat_cat_elimination_graph(dtype)
+        module = compile_model(
+            OUTPUT, target, "./tmp", f"skip_cat_cat_elimination_{dtype}"
+        )
+
+        x0_pt = get_random_torch_tensor(
+            [self.BATCH_SIZE, int(self.M / 2), self.N], dtype
+        )
+        x1_pt = get_random_torch_tensor(
+            [self.BATCH_SIZE, int(self.M / 2), self.N], dtype
+        )
+        x2_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N + 4], dtype)
+        x3_pt = get_random_torch_tensor([self.BATCH_SIZE, self.M, self.N * 2], dtype)
+        x5_pt = torch.cat([x0_pt, x1_pt], dim=1)
+        out_pt0 = torch.cat([x5_pt, x2_pt], dim=2)
+        out_pt1 = torch.cat([x3_pt, x5_pt, x2_pt, x2_pt], dim=2)
+
+        out0 = get_torch_empty_tensor(out_pt0.size(), dtype)
+        out1 = get_torch_empty_tensor(out_pt1.size(), dtype)
+        module.run_with_tensors(
+            {"input0": x0_pt, "input1": x1_pt, "input2": x2_pt, "input3": x3_pt},
+            [out0, out1],
+        )
+        self.assertTrue(torch.allclose(out_pt0, out0, atol=1e-1, rtol=1e-2))
+        self.assertTrue(torch.allclose(out_pt1, out1, atol=1e-1, rtol=1e-2))
+
 
 if __name__ == "__main__":
     unittest.main()

From f36a67d78c14169dc91629396c9d0169d3afebe1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 14 Feb 2023 02:45:53 -0800
Subject: [PATCH 123/638] support bf16 for common conv2d (#209)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/209

att

Reviewed By: chenyang78

Differential Revision: D43067435

fbshipit-source-id: 84688e14911689f0268aaa536e95ba60db3c1d3a
---
 .../aitemplate/backend/cuda/conv2d/common.py  | 11 ++++++-
 tests/unittest/ops/test_conv.py               | 32 +++++++++++++++----
 tests/unittest/ops/test_upsamping2d_add.py    |  2 +-
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 2ea880f08..992229faa 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -503,13 +503,22 @@ def extract_config(
     if lib_dtype == "float":
         data_type = cutlass_lib.library.DataType.f32
         acc_type = cutlass_lib.library.DataType.f32
-    else:
+    elif "half" in lib_dtype:
         data_type = cutlass_lib.library.DataType.f16
         acc_type = cutlass_lib.library.DataType.f32
         # check target use fp16 acc
         if "use_fp16_acc" in Target.current()._kwargs:
             if Target.current()._kwargs["use_fp16_acc"]:
                 acc_type = cutlass_lib.library.DataType.f16
+    elif "bfloat16" in lib_dtype:
+        data_type = cutlass_lib.library.DataType.bf16
+        acc_type = cutlass_lib.library.DataType.f32
+        # check target use fp16 acc
+        if "use_fp16_acc" in Target.current()._kwargs:
+            if Target.current()._kwargs["use_fp16_acc"]:
+                acc_type = cutlass_lib.library.DataType.bf16
+    else:
+        raise RuntimeError(f"Unsupported dtype {lib_dtype}")
 
     def f_proc_op(op):
         ret = []
diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index ee92173cb..7cbd70a75 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -53,7 +53,9 @@ def _test_conv(
 
         X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
         W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
         y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
@@ -61,13 +63,13 @@ def _test_conv(
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-1, rtol=1e-1)
             else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_fp16(self):
         self._test_conv(
             test_name="conv2d_fp16",
             dtype="float16",
@@ -81,9 +83,9 @@ def test_fp16(self):
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
     @unittest.skipIf(
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
+        "fp32 is not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_fp32(self):
         self._test_conv(
             test_name="conv2d_fp32",
             dtype="float32",
@@ -94,6 +96,22 @@ def test_fp32(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bf16 is not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bf16(self):
+        self._test_conv(
+            test_name="conv2d_bf16",
+            dtype="bfloat16",
+        )
+        self._test_conv(
+            copy_op=True,
+            test_name="conv2d_bf16_copy_op",
+            dtype="bfloat16",
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_upsamping2d_add.py b/tests/unittest/ops/test_upsamping2d_add.py
index 51eabc454..4bb5d2b11 100644
--- a/tests/unittest/ops/test_upsamping2d_add.py
+++ b/tests/unittest/ops/test_upsamping2d_add.py
@@ -22,7 +22,7 @@
 from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
-_DEFAULT_BATCH_SIZE = [1, 16]
+_DEFAULT_BATCH_SIZE = [1, 3]
 
 
 class UpsamplingAddTestCase(unittest.TestCase):

From 0ab49f9ff165839a4e726073d7ef3ba6eacf6fdd Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 14 Feb 2023 07:29:59 -0800
Subject: [PATCH 124/638] Modify documentation to reflect the recent changes
 (#256)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/256

* Rename RECOMPILE to AIT_RECOMPILE and BUILD_DOCS to AIT_BUILD_DOCS in documentation
* Add the documentation for AIT_COMPILER_OPT

Reviewed By: chenyang78

Differential Revision: D43244254

fbshipit-source-id: f4e67238ed2c5b5a640c533653a9b1bfc14688c8
---
 docs/source/reference/env.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 4c7cf2eb9..50847a6ae 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -7,10 +7,12 @@ Codegen
 
 **NUM_BUILDERS**: The number of CPU jobs running in parallel during codegen. It controls both the profiler codegen and the final .so codegen. It's set to 12 in NIGHTLY jobs. Internally, it's set to 12 for normal tests and 24 for heavy tests. By default, the builder uses all the available CPUs for building.
 
-**RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile.
+**AIT_RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile.
 
 **AIT_NDEBUG**: If set to "1", compile with `NDEBUG`, disabling debug assertions. Recommended for production builds. "1" by default.
 
+**AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler to do time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default.
+
 Profiling
 ---------
 
@@ -31,7 +33,7 @@ OSS CI
 
 **CI_FLAG**: It is set to "CIRCLECI" in OSS CI to indicate we're in OSS CI environment. The behavior of the profiler and codegen is different in CI to speed up testing. Profiling itself for gemm/conv ops is disabled in CI. But we still compiles two random profilers to make sure the profiler codegen is not broken.
 
-**BUILD_DOCS**: If set to "1", it will create a fake CUDA target to enable doc building in Github Actions.
+**AIT_BUILD_DOCS**: If set to "1", it will create a fake CUDA target to enable doc building in Github Actions.
 
 Miscellaneous
 -------------

From cef1098792698e3430a21caafc0f226b8447d554 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 14 Feb 2023 08:31:43 -0800
Subject: [PATCH 125/638] add unit tests for bfloat16 inputs to cutlass-based
 conv2d_bias_* (#268)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/268

att

Reviewed By: chenyang78

Differential Revision: D43265649

fbshipit-source-id: 3e53b6fe5a9cf63ccd1bf3d00a890bd1c9d8c91d
---
 tests/unittest/ops/test_conv2d_bias_add.py    | 30 ++++++++++------
 tests/unittest/ops/test_conv_bias.py          | 26 +++++++++-----
 tests/unittest/ops/test_conv_bias_add_relu.py | 30 ++++++++++------
 .../unittest/ops/test_conv_bias_hardswish.py  | 28 ++++++++++-----
 tests/unittest/ops/test_conv_bias_relu.py     | 34 +++++++++++++------
 tests/unittest/ops/test_conv_bias_sigmoid.py  | 26 +++++++++-----
 6 files changed, 117 insertions(+), 57 deletions(-)

diff --git a/tests/unittest/ops/test_conv2d_bias_add.py b/tests/unittest/ops/test_conv2d_bias_add.py
index 7ebe7a185..69adeafa5 100644
--- a/tests/unittest/ops/test_conv2d_bias_add.py
+++ b/tests/unittest/ops/test_conv2d_bias_add.py
@@ -70,7 +70,9 @@ def _test_conv_bias_add(
         W_pt = get_random_torch_tensor([CO, CI, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
         R_pt = get_random_torch_tensor([batch, CO, HH, WW], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt + R_pt
 
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
@@ -82,13 +84,15 @@ def _test_conv_bias_add(
         y_transpose = y.permute(0, 3, 1, 2)
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
-            else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
+            elif dtype == "float16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
+            elif dtype == "bfloat16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-1, rtol=5e-1)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_add_fp16(self):
         self._test_conv_bias_add(
             test_name="conv2d_bias_add_fp16",
             dtype="float16",
@@ -104,15 +108,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_add_fp32(self):
         self._test_conv_bias_add(
             test_name="conv2d_bias_add_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bf16 is not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_add_bf16(self):
         self._test_conv_bias_add(
-            copy_op=True,
-            test_name="conv2d_bias_add_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_add_bf16",
+            dtype="bfloat16",
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index 15193f057..3f4e61e0f 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -60,7 +60,9 @@ def _test_conv_bias(
         X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
         W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
         w = W_pt.permute((0, 2, 3, 1)).contiguous()
@@ -70,13 +72,13 @@ def _test_conv_bias(
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
             else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_fp16(self):
         self._test_conv_bias(
             test_name="conv2d_bias_fp16",
             dtype="float16",
@@ -92,15 +94,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_fp32(self):
         self._test_conv_bias(
             test_name="conv2d_bias_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_bf16(self):
         self._test_conv_bias(
-            copy_op=True,
-            test_name="conv2d_bias_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_bf16",
+            dtype="bfloat16",
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index 5128e4908..8fd22026b 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -70,7 +70,9 @@ def _test_conv_bias_add_relu(
         W_pt = get_random_torch_tensor([CO, CI, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, CO, 1, 1], dtype=dtype)
         R_pt = get_random_torch_tensor([batch, CO, HH, WW], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt + R_pt
         Y_pt = torch.nn.functional.relu(Y_pt)
 
@@ -83,13 +85,15 @@ def _test_conv_bias_add_relu(
         y_transpose = y.permute(0, 3, 1, 2)
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
-            else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
+            elif dtype == "float16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
+            elif dtype == "bfloat16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-1, rtol=5e-1)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_add_relu_fp16(self):
         self._test_conv_bias_add_relu(
             test_name="conv2d_bias_add_relu_fp16",
             dtype="float16",
@@ -105,15 +109,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_add_relu_fp32(self):
         self._test_conv_bias_add_relu(
             test_name="conv2d_bias_add_relu_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bf16 is not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_add_relu_bf16(self):
         self._test_conv_bias_add_relu(
-            copy_op=True,
-            test_name="conv2d_bias_add_relu_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_add_relu_bf16",
+            dtype="bfloat16",
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_hardswish.py b/tests/unittest/ops/test_conv_bias_hardswish.py
index a56b9995f..4c138a2fc 100644
--- a/tests/unittest/ops/test_conv_bias_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_hardswish.py
@@ -66,7 +66,9 @@ def _test_conv_bias_hardswish(
         X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
         W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt
         Y_pt = hard_swish(Y_pt)
         # np.savetxt("y.txt", Y_np.flatten())
@@ -81,11 +83,13 @@ def _test_conv_bias_hardswish(
         module.run_with_tensors(inputs, [y])
         y_transpose = y.permute((0, 3, 1, 2))
         if dtype == "float32":
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
-        else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
+        elif dtype == "float16":
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
+        elif dtype == "bfloat16":
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1, rtol=1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_hardswish_fp16(self):
         self._test_conv_bias_hardswish(
             test_name="conv2d_bias_hardswish_fp16",
             dtype="float16",
@@ -101,15 +105,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_hardswish_fp32(self):
         self._test_conv_bias_hardswish(
             test_name="conv2d_bias_hardswish_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_hardswish_bf16(self):
         self._test_conv_bias_hardswish(
-            copy_op=True,
-            test_name="conv2d_bias_hardswish_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_hardswish_bf16",
+            dtype="bfloat16",
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_relu.py b/tests/unittest/ops/test_conv_bias_relu.py
index 28364e1e6..f01063f6d 100644
--- a/tests/unittest/ops/test_conv_bias_relu.py
+++ b/tests/unittest/ops/test_conv_bias_relu.py
@@ -23,6 +23,10 @@
 
 
 class ConvBiasReluTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_conv_bias_relu(
         self,
         batch=4,
@@ -60,7 +64,9 @@ def _test_conv_bias_relu(
         X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
         W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt
         Y_pt = torch.nn.functional.relu(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
@@ -71,13 +77,15 @@ def _test_conv_bias_relu(
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
-            else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
+            elif dtype == "float16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
+            elif dtype == "bfloat16":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=2e-2, rtol=1e-2)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_relu_fp16(self):
         self._test_conv_bias_relu(
             test_name="conv2d_bias_relu_fp16",
             dtype="float16",
@@ -93,15 +101,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_relu_fp32(self):
         self._test_conv_bias_relu(
             test_name="conv2d_bias_relu_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bf16 is not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_relu_bf16(self):
         self._test_conv_bias_relu(
-            copy_op=True,
-            test_name="conv2d_bias_relu_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_relu_bf16",
+            dtype="bfloat16",
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_sigmoid.py b/tests/unittest/ops/test_conv_bias_sigmoid.py
index 9cd3b38dc..a9ff2be1d 100644
--- a/tests/unittest/ops/test_conv_bias_sigmoid.py
+++ b/tests/unittest/ops/test_conv_bias_sigmoid.py
@@ -60,7 +60,9 @@ def _test_conv_bias_sigmoid(
         X_pt = get_random_torch_tensor([batch, 128, 28, 28], dtype=dtype)
         W_pt = get_random_torch_tensor([256, 128, 3, 3], dtype=dtype)
         B_pt = get_random_torch_tensor([1, 256, 1, 1], dtype=dtype)
-        Y_pt = torch.nn.functional.conv2d(X_pt, W_pt, padding=1)
+        Y_pt = torch.nn.functional.conv2d(X_pt.float(), W_pt.float(), padding=1).to(
+            dtype=X_pt.dtype
+        )
         Y_pt = Y_pt + B_pt
         Y_pt = torch.sigmoid(Y_pt)
         x = X_pt.permute((0, 2, 3, 1)).contiguous()
@@ -71,13 +73,13 @@ def _test_conv_bias_sigmoid(
         y_transpose = y.permute((0, 3, 1, 2))
         if target.name() == "cuda":
             if dtype == "float32":
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=5e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=5e-2, rtol=1e-2)
             else:
-                self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
         else:
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1))
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_fp16(self):
+    def test_conv2d_bias_sigmoid_fp16(self):
         self._test_conv_bias_sigmoid(
             test_name="conv2d_bias_sigmoid_fp16",
             dtype="float16",
@@ -93,15 +95,21 @@ def test_fp16(self):
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv2d_bias_sigmoid_fp32(self):
         self._test_conv_bias_sigmoid(
             test_name="conv2d_bias_sigmoid_fp32",
             dtype="float32",
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_conv2d_bias_sigmoid_bf16(self):
         self._test_conv_bias_sigmoid(
-            copy_op=True,
-            test_name="conv2d_bias_sigmoid_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_sigmoid_bf16",
+            dtype="bfloat16",
         )
 
 
From e80cfd30e2e9aa8a5eb5f7df7232377453d7cc3d Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Tue, 14 Feb 2023 10:27:29 -0800
Subject: [PATCH 126/638] Match AITModule on ait inputs and fx inputs (#262)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/262

Currently AITModule doesn't
check if python's input and AIT input matches, but directly give AIT the corresponding input.

This could cause us some trouble when AIT optimize the graph and eliminate some inputs.
e.g. https://www.internalfb.com/phabricator/paste/view/P619446970?lines=37
Here,  add_14 can be deduced at a very early stage and therefore the source of add_14: cat_37 actually isn't used in AIT graph anywhere!
Therefore, we would got complaint: User passed 6 inputs, but the model expects 5. This is actually because cat_37 is not longer needed in AIT's graph!

This diff put checker in AITinterpreter to insure eeach AIT input can find corresponding python's input.
It also change AITModule by checking both list of python/fx input and ait input, and find corresponding fx input for ait input.

Reviewed By: frank-wei, chenyang78

Differential Revision: D43256832

fbshipit-source-id: 369bfddfc0bfba20e62d794b3ed4fdc2797ce9ec
---
 fx2ait/fx2ait/ait_module.py            | 18 +++++++++++++++---
 fx2ait/fx2ait/ait_splitter.py          |  3 ++-
 fx2ait/fx2ait/fx2ait.py                | 22 +++++++++++++++++++---
 fx2ait/fx2ait/lower/lower.py           |  1 +
 fx2ait/fx2ait/tools/ait_minimizer.py   |  1 +
 fx2ait/fx2ait/tools/common_aten2ait.py |  9 ++++++---
 fx2ait/fx2ait/tools/common_fx2ait.py   | 15 ++++++++++-----
 7 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
index b214a6bb2..54d212685 100644
--- a/fx2ait/fx2ait/ait_module.py
+++ b/fx2ait/fx2ait/ait_module.py
@@ -21,12 +21,24 @@ class AITModule(torch.nn.Module):
     def __init__(
         self,
         engine=None,
+        interp_result=None,
     ):
         super(AITModule, self).__init__()
         self.engine = engine
+        self.interp_result = interp_result
 
     def forward(self, *inputs):
-        outputs = self.engine.forward(inputs)
+        python_inputs = []
+        if self.interp_result:
+            inputs = list(inputs)
+            for name, inp in zip(self.interp_result.fx_input_names, inputs):
+                if name in self.interp_result.input_names:
+                    python_inputs.append(inp)
+            assert len(python_inputs) == len(self.interp_result.input_names)
+        else:
+            python_inputs = inputs
+
+        outputs = self.engine.forward(python_inputs)
         if len(outputs) == 1:
             return outputs[0]
         return tuple(outputs)
@@ -44,12 +56,12 @@ def profile(
         self.engine.profile(inputs, filename, num_iters)
 
     @staticmethod
-    def create_ait_module_wrapper(engine, trace_ait_module, *inputs):
+    def create_ait_module_wrapper(engine, interp_result, trace_ait_module, *inputs):
         """
         Some use cases need to torch.jit.script a model with AITModules in
         it, but TorchScript does not support variadic inputs. We can get
         around this by scripting the AITModule with some sample inputs.
         This is turned in by passing allow_scripting=True.
         """
-        mod = AITModule(engine)
+        mod = AITModule(engine, interp_result)
         return torch.jit.trace(mod, inputs) if trace_ait_module else mod
diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index 742a0062a..908cbfcca 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -153,7 +153,8 @@ def _lower_model_to_backend(
                 torch.float16,
                 torch.float,
                 1,  # num_runtimes
-            )
+            ),
+            interpreter_result,
         )
 
     # TODO add _find_culprit once minimizer completed
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index 75b3a2324..a0fb2224f 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -47,6 +47,7 @@ class AITInterpreterResult(NamedTuple):
     engine: Any
     input_names: Sequence[str]
     output_names: Sequence[str]
+    fx_input_names: Sequence[str] = []
 
 
 class AITInterpreter(torch.fx.Interpreter):
@@ -115,6 +116,7 @@ def __init__(
 
         self._input_names: List[str] = []
         self._output_names: List[str] = []
+        self._fx_input_names: List[str] = []
         self._loaded_params: Dict[str, AITTensor] = {}
 
         self.dump_ait_dir = dump_ait_dir
@@ -224,8 +226,21 @@ def run(self) -> AITInterpreterResult:
             args["tensor"] = output_tensors
 
         self.engine = compile_model(**args)
-
-        for i, input_name in enumerate(self._input_names):
+        ait_input_names = [
+            n._attrs["name"]
+            for n in self.engine.debug_sorted_graph
+            if n._attrs["is_input"]
+        ]
+        for name in ait_input_names:
+            assert (
+                self._fx_input_names.count(name) == 1
+            ), f"Cannot find AIT's compiled input: {name} in fx graph!"
+
+        for name in self._fx_input_names:
+            if name in ait_input_names:
+                self._input_names.append(name)
+
+        for i, input_name in enumerate(self._fx_input_names):
             _LOGGER.info("Set input{}: {}".format(i, input_name))
 
         if self.engine is None:
@@ -238,6 +253,7 @@ def run(self) -> AITInterpreterResult:
             self.engine,
             self._input_names,
             self._output_names,
+            self._fx_input_names,
         )
 
     def run_node(self, n):
@@ -245,7 +261,7 @@ def run_node(self, n):
         return super().run_node(n)
 
     def placeholder(self, target, args, kwargs):
-        self._input_names.append(target)
+        self._fx_input_names.append(target)
         input_spec = self.input_specs[self.input_specs_iter]
         self.input_specs_iter += 1
 
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
index 2b2ca18d5..c4187de42 100644
--- a/fx2ait/fx2ait/lower/lower.py
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -131,6 +131,7 @@ def lower_pass(
                 _precision_to_torch_type(lower_settings.output_precision),
                 1,  # num_runtimes
             ),
+            interp_res,
             lower_settings.trace_ait_module,
             *input,
         )
diff --git a/fx2ait/fx2ait/tools/ait_minimizer.py b/fx2ait/fx2ait/tools/ait_minimizer.py
index 255248973..95b9e961c 100644
--- a/fx2ait/fx2ait/tools/ait_minimizer.py
+++ b/fx2ait/fx2ait/tools/ait_minimizer.py
@@ -43,6 +43,7 @@ def lower_mod_default(
             torch.float16,
             1,  # num_runtimes
         ),
+        interpreter_result,
     )
     return res_mod
 
diff --git a/fx2ait/fx2ait/tools/common_aten2ait.py b/fx2ait/fx2ait/tools/common_aten2ait.py
index aabedf79f..2fce7ff7e 100644
--- a/fx2ait/fx2ait/tools/common_aten2ait.py
+++ b/fx2ait/fx2ait/tools/common_aten2ait.py
@@ -150,7 +150,8 @@ def run_test(
                 torch.float16,
                 torch.float,
                 1,  #  num_runtimes
-            )
+            ),
+            interp_result,
         )
 
         # Inference run and results comparison
@@ -242,7 +243,8 @@ def run_test_with_dynamic_shape(
                 torch.float16,
                 torch.float,
                 1,  #  num_runtimes
-            )
+            ),
+            interp_result,
         )
 
         for inputs in inputs_list:
@@ -360,7 +362,8 @@ def benchmark(f, args):
                     torch.float16,
                     torch.float,
                     1,  #  num_runtimes
-                )
+                ),
+                interp_result,
             )
             # Benchmark Pytorch Eager
             # warmup
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 067eb54eb..cd2ccf8fd 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -131,7 +131,8 @@ def run_test(
                         torch.float16,
                         torch.float,
                         1,  #  num_runtimes
-                    )
+                    ),
+                    interp_result,
                 )
             else:
                 ait_mod = AITModule(
@@ -142,7 +143,8 @@ def run_test(
                         torch.float16,
                         torch.float,
                         1,  #  num_runtimes
-                    )
+                    ),
+                    interp_result,
                 )
 
             ref_outputs = mod(*original_inputs)
@@ -233,7 +235,8 @@ def run_test_with_dynamic_shape(
                         torch.float16,
                         torch.float,
                         1,  #  num_runtimes
-                    )
+                    ),
+                    interp_result,
                 )
             else:
                 ait_mod = AITModule(
@@ -244,7 +247,8 @@ def run_test_with_dynamic_shape(
                         torch.float16,
                         torch.float,
                         1,  #  num_runtimes
-                    )
+                    ),
+                    interp_result,
                 )
 
             ref_outputs = mod(*original_inputs)
@@ -358,7 +362,8 @@ def benchmark(f, args):
                 torch.float16,
                 torch.float,
                 1,  #  num_runtimes
-            )
+            ),
+            interp_result,
         )
         # Benchmark Pytorch Eager
         # warmup

From 6320e1c20f448a87d11a719e7a45bf5abdfdd00b Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Tue, 14 Feb 2023 21:26:27 -0800
Subject: [PATCH 127/638] attention module (#267)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/267

Reviewed By: terrychenism, TianXieUSC

Differential Revision: D43265476

fbshipit-source-id: cda492410851922a1a1950f192e61c1dd3a78d7a
---
 .../test_ait_multihead_attention.py                |  4 ++--
 fx2ait/fx2ait/tools/common_fx2ait.py               | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
index 05bd3be2c..4641745ed 100644
--- a/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
+++ b/fx2ait/fx2ait/test/converters/converters_module/test_ait_multihead_attention.py
@@ -47,7 +47,7 @@ def forward(self, x):
                 acc_ops.unsqueeze,
                 acc_ops.getitem,
             },
-            transformer_mode=True,
+            leaf_module=torch.nn.MultiheadAttention,
         )
 
     def test_multihead_attention(self):
@@ -75,5 +75,5 @@ def forward(self, x):
             model,
             [x],
             expected_ops={torch.nn.MultiheadAttention},
-            transformer_mode=True,
+            leaf_module=torch.nn.MultiheadAttention,
         )
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index cd2ccf8fd..e1a934347 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -81,18 +81,21 @@ def run_test(
         precision: LowerPrecision = LowerPrecision.FP16,
         permute_inputs: Optional[List[int]] = None,
         permute_outputs: Optional[List[int]] = None,
-        transformer_mode: Optional[bool] = False,
         passes: List[Callable] = [],  # noqa: B006
+        leaf_module: Callable = None,  # one leaf module
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
         mod.eval()
+
+        leaf_module_list = []
+        if leaf_module:
+            leaf_module_list.append(leaf_module)
+
         mod = acc_tracer.trace(
             mod,
             inputs,
-            leaf_module_list=[
-                torch.nn.MultiheadAttention if transformer_mode else None
-            ],
+            leaf_module_list=leaf_module_list,
         )
         for p in passes:
             mod = p(mod, inputs)
@@ -157,10 +160,9 @@ def run_test(
             end_event.record()
             torch.cuda.synchronize()
             print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
-
             # PyTorch Transformer model would yield 2 output tensors, of which the second one is
             # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
-            if transformer_mode:
+            if leaf_module == torch.nn.MultiheadAttention:
                 ref_outputs = ref_outputs[0]
             if isinstance(outputs, torch.Tensor):
                 ref_outputs = [ref_outputs]

From b63e9d1e668836003ddeb9190099511709d430bb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 14 Feb 2023 23:09:02 -0800
Subject: [PATCH 128/638] fix backend type for softmax (#272)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/272

use native type instead of cutlass

Reviewed By: wushirong

Differential Revision: D43294434

fbshipit-source-id: a6976d9b24917919bca62751c464e628e66eac9c
---
 .../backend/cuda/softmax/softmax.cuh          | 22 ++-----------------
 .../backend/cuda/softmax/softmax.py           |  2 +-
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/softmax/softmax.cuh b/python/aitemplate/backend/cuda/softmax/softmax.cuh
index f93b186c4..d5be2751a 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.cuh
+++ b/python/aitemplate/backend/cuda/softmax/softmax.cuh
@@ -120,24 +120,6 @@ __inline__ __device__ T blockReduceMax(T* val) {
   return (T)0.0f;
 }
 
-namespace detail {
-template <typename T>
-struct numeric_limits_helper {
-  __device__ __host__ static constexpr T lowest() {
-    return platform::numeric_limits<T>::lowest();
-  }
-};
-
-// Cutlass doesn't have `lowest` in their specialization for float,
-// so we define our own helper struct here.
-template <>
-struct numeric_limits_helper<float> {
-  __device__ __host__ static constexpr float lowest() {
-    return std::numeric_limits<float>::lowest();
-  }
-};
-} // namespace detail
-
 // input size: [M, K]
 // Currently the softmax kernel only supports 2D input with dim=1.
 // For input with more dimensions, reshape first.
@@ -185,7 +167,7 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
 
     CUTLASS_PRAGMA_UNROLL
     for (size_t i = 0; i < m; i++) {
-      T max = detail::numeric_limits_helper<T>::lowest();
+      T max = std::numeric_limits<T>::lowest();
       // find max
       CUTLASS_PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
@@ -232,7 +214,7 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
         input_tile[j] = input[i * K + j];
       }
 
-      T max = detail::numeric_limits_helper<T>::lowest();
+      T max = std::numeric_limits<T>::lowest();
       // find max
       CUTLASS_PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index 68a0eec7e..50282542c 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -299,7 +299,7 @@ def softmax_gen_function(func_attrs: Dict[str, Any]) -> str:
     k = shapes[dim].value()
 
     backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
+    elem_input_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
     return FUNC_TEMPLATE.render(

From e947942ed7fd360b6435198ed1fa5cec3228f509 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Wed, 15 Feb 2023 00:30:03 -0800
Subject: [PATCH 129/638] Add floor_div op (#265)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/265

Reviewed By: frank-wei, wushirong

Differential Revision: D43278032

Pulled By: terrychenism

fbshipit-source-id: 1d58ec61949ba9c6b9e1da037497d6bbbc3e5c65
---
 python/aitemplate/backend/backend_spec.py     |  7 +++
 .../backend/cuda/elementwise/custom_math.cuh  | 29 +++++++++++
 .../compiler/ops/common/epilogue.py           |  1 +
 python/aitemplate/compiler/ops/common/math.py |  4 ++
 tests/unittest/ops/test_activation.py         | 49 +++++++++++++++++++
 5 files changed, 90 insertions(+)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index b5551f51d..1298a6234 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -265,6 +265,13 @@ class GPUBackendSpec(BackendSpec):
                 "bfloat16": "hsoftsign",
                 "bfloat16_2": "h2softsign",
             },
+            FuncEnum.FLOOR_DIV: {
+                "float": "floor_div",
+                "half": "floor_div",
+                "half2": "floor_div",
+                "bfloat16": "floor_div",
+                "bfloat16_2": "floor_div",
+            },
         }
     )
 
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 71c7c68f4..60140aeb2 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -948,4 +948,33 @@ __device__ bfloat16_2 h2softsign(const bfloat16_2 a) {
 #endif
 }
 
+__device__ float floor_div(const float a, const float b) {
+  return floor(a / b);
+}
+
+__device__ half floor_div(const half a, const half b) {
+  return hfloor(__hdiv(a, b));
+}
+
+__device__ bfloat16 floor_div(const bfloat16 a, const bfloat16 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return hfloor(__hdiv(a, b));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ half2 floor_div(const half2 a, const half2 b) {
+  return half2(floor_div(a.x, b.x), floor_div(a.y, b.y));
+}
+
+__device__ bfloat16_2 floor_div(const bfloat16_2 a, const bfloat16_2 b) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(floor_div(a.x, b.x), floor_div(a.y, b.y));
+
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 #endif
diff --git a/python/aitemplate/compiler/ops/common/epilogue.py b/python/aitemplate/compiler/ops/common/epilogue.py
index 4436fbfcf..423b30626 100644
--- a/python/aitemplate/compiler/ops/common/epilogue.py
+++ b/python/aitemplate/compiler/ops/common/epilogue.py
@@ -63,3 +63,4 @@ class FuncEnum(Enum):
     SOFTPLUS = 25
     ELU = 26
     SOFTSIGN = 27
+    FLOOR_DIV = 28
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index ddb6e4e94..d79597682 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -109,3 +109,7 @@ def elu(tensor: Any) -> Tensor:
 
 def softsign(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("SOFTSIGN")(tensor)
+
+
+def floor_div(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("FLOOR_DIV")(tensor)
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index d925ebd3c..a962c36d8 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -89,6 +89,46 @@ def _test_leaky_relu(
             module.run_with_tensors([x1_pt], [x2])
             self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
+    def _test_floor_div(
+        self,
+        input_size,
+        test_name="floor_div",
+        dividend=2,
+        copy_op=False,
+    ):
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            slope = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="input1",
+                value=dividend,
+            )
+            X2_op = ops.elementwise(FuncEnum.FLOOR_DIV)
+
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, slope)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            x2_pt = torch.div(x1_pt, dividend, rounding_mode="floor")
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
     def _test_hardtanh(
         self,
         input_size,
@@ -404,6 +444,15 @@ def test_softsign(self):
         self._test_softsign([128], test_name="softsign_3", copy_op=True)
         self._test_softsign([121, 128], test_name="softsign_4")
 
+    def test_floor_div(self):
+        self._test_floor_div([512, 512], test_name="floor_div_1")
+        self._test_floor_div(
+            [1024, 1024],
+            dividend=3,
+            test_name="test_floor_div_2_copy_op",
+            copy_op=True,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 92a35068e366a660d4f3b066545cd952de0ee8cc Mon Sep 17 00:00:00 2001
From: Yanxing-Shi <yanxishi@amd.com>
Date: Wed, 15 Feb 2023 12:14:09 +0000
Subject: [PATCH 130/638] fix graph bug

---
 .../transform/transform_strided_op_and_view_op.py         | 8 +++-----
 .../compiler/transform/transform_strided_ops.py           | 1 +
 python/aitemplate/utils/graph_utils.py                    | 5 ++++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index 3d2604582..d97e07e12 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -32,10 +32,7 @@ def _is_supported_strided_op(op: Operator) -> bool:
     from ...backend.target import Target
 
     op_kind = op._attrs["op"]
-    if Target.current().name() == "rocm":
-        return op_kind == "bmm_softmax_bmm_permute"
-    else:
-        return not op_kind.startswith("group_gemm")
+    return not op_kind.startswith("group_gemm")
 
 
 def _is_supported_view_op(op: Operator, tensor: Tensor) -> bool:
@@ -86,7 +83,8 @@ def _fuse_strided_op_and_view_op_single_pass(
                     tensor._attrs["is_view_of"] = None
                     src_op._attrs["outputs"][idx] = tensor
                     tensor._attrs["src_ops"] = StableSet({src_op})
-                    transform_utils.remove_tensor_from_sorted_graph(view_input_tensor)
+                    for view_op_input in view_op._attrs["inputs"]:
+                        transform_utils.remove_tensor_from_sorted_graph(view_op_input)
                     break
             assert (
                 found_tensor
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 965e288fb..4e1882015 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -482,4 +482,5 @@ def transform_strided_ops(
         ]
     for func in funcs:
         sorted_graph = func(sorted_graph)
+        graph_utils.dump_graph_debug_str_to_file(sorted_graph, workdir, func.__name__)
     return sorted_graph
diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index d3dcf6f52..6c7fd2320 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -70,7 +70,10 @@ def sorted_op_pseudo_code(ops, with_shape=True) -> str:
 def dump_graph_debug_str_to_file(tensors, workdir, name):
     if is_debug():
         # Dump graph and pseudo code for debug only
-        prefix = os.path.join(workdir, name)
+        debug_path = workdir + "/debug"
+        if not os.path.exists(debug_path):
+            os.makedirs(debug_path)
+        prefix = os.path.join(debug_path, name)
         graph_path = prefix + "_graph.txt"
         pseudo_code_path = prefix + "_pseudo_code.txt"
         graph_visual_path = prefix + "_graph_vis.html"

From 446cb508afb64070587e34b5610812a030dda130 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang22@meta.com>
Date: Wed, 15 Feb 2023 10:39:35 -0800
Subject: [PATCH 131/638] Add odd shapes to activation unit tests (#276)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/276

This diff adds test cases for `half` and `float16` versions of the respective device functions in [`custom_math.cuh`](fbcode/aitemplate/AITemplate/python/aitemplate/backend/cuda/elementwise/custom_math.cuh).

Reviewed By: tenpercent, aakhundov

Differential Revision: D43277017

fbshipit-source-id: b0db5d9bc9cbfe3d5bd130a76c571af4aaa9704f
---
 tests/unittest/ops/test_activation.py | 29 ++++++++++++++-------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index a962c36d8..73b78e629 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -340,9 +340,10 @@ def test_lrelu(self):
             test_name="leaky_relu_2_copy_op",
             copy_op=True,
         )
+        self._test_leaky_relu([63, 63], test_name="leaky_relu_3")
 
     def test_htanh(self):
-        self._test_hardtanh([512, 512], test_name="hard_tanh_1")
+        self._test_hardtanh([511, 511], test_name="hard_tanh_1")
         self._test_hardtanh(
             [1024, 1024], min_val=-2, max_val=2, test_name="hard_tanh_2"
         )
@@ -355,7 +356,7 @@ def test_htanh(self):
         )
 
     def test_softplus(self):
-        self._test_softplus([64, 64], test_name="softplus_1")
+        self._test_softplus([63, 63], test_name="softplus_1")
         self._test_softplus([128, 128], beta=1.0, threshold=1.5, test_name="softplus_2")
         self._test_softplus([128, 256], beta=2.0, threshold=0.5, test_name="softplus_3")
         self._test_softplus(
@@ -367,13 +368,13 @@ def test_softplus(self):
         )
 
     def test_cos(self):
-        self._test_simple_function([512, 512], FuncEnum.COS, test_name="cos_1")
+        self._test_simple_function([511, 511], FuncEnum.COS, test_name="cos_1")
         self._test_simple_function(
             [512, 512], FuncEnum.COS, test_name="cos_1_copy_op", copy_op=True
         )
 
     def test_sin(self):
-        self._test_simple_function([512, 512], FuncEnum.SIN, test_name="sin_1")
+        self._test_simple_function([511, 511], FuncEnum.SIN, test_name="sin_1")
         self._test_simple_function(
             [512, 512], FuncEnum.SIN, test_name="sin_1_copy_op", copy_op=True
         )
@@ -386,49 +387,49 @@ def test_tanh(self):
         )
 
     def test_sign(self):
-        self._test_simple_function([512, 512], FuncEnum.SIGN, test_name="sign_1")
+        self._test_simple_function([511, 511], FuncEnum.SIGN, test_name="sign_1")
         self._test_simple_function(
             [512, 512], FuncEnum.SIGN, test_name="sign_1_copy_op", copy_op=True
         )
 
     def test_abs(self):
-        self._test_simple_function([512, 512], FuncEnum.ABS, test_name="abs_1")
+        self._test_simple_function([511, 511], FuncEnum.ABS, test_name="abs_1")
         self._test_simple_function(
             [512, 512], FuncEnum.ABS, test_name="abs_1_copy_op", copy_op=True
         )
 
     def test_loge(self):
-        self._test_simple_function([512, 512], FuncEnum.LOGE, test_name="loge_1")
+        self._test_simple_function([511, 511], FuncEnum.LOGE, test_name="loge_1")
         self._test_simple_function(
             [512, 512], FuncEnum.LOGE, test_name="loge_1_copy_op", copy_op=True
         )
 
     def test_exp(self):
-        self._test_simple_function([512, 512], FuncEnum.EXP, test_name="exp_1")
+        self._test_simple_function([511, 511], FuncEnum.EXP, test_name="exp_1")
         self._test_simple_function(
             [512, 512], FuncEnum.EXP, test_name="exp_1_copy_op", copy_op=True
         )
 
     def test_sqrt(self):
-        self._test_simple_function([512, 512], FuncEnum.SQRT, test_name="sqrt_1")
+        self._test_simple_function([511, 511], FuncEnum.SQRT, test_name="sqrt_1")
         self._test_simple_function(
             [512, 512], FuncEnum.SQRT, test_name="sqrt_1_copy_op", copy_op=True
         )
 
     def test_sigmoid(self):
-        self._test_simple_function([512, 512], FuncEnum.SIGMOID, test_name="sigmoid_1")
+        self._test_simple_function([511, 511], FuncEnum.SIGMOID, test_name="sigmoid_1")
         self._test_simple_function(
             [512, 512], FuncEnum.SIGMOID, test_name="sigmoid_1_copy_op", copy_op=True
         )
 
     def test_relu(self):
-        self._test_simple_function([512, 512], FuncEnum.RELU, test_name="relu_1")
+        self._test_simple_function([511, 511], FuncEnum.RELU, test_name="relu_1")
         self._test_simple_function(
             [512, 512], FuncEnum.RELU, test_name="relu_1_copy_op", copy_op=True
         )
 
     def test_elu(self):
-        self._test_elu([64, 64], test_name="elu_1")
+        self._test_elu([63, 63], test_name="elu_1")
         self._test_elu([128, 128], alpha=4.0, test_name="elu_2")
         self._test_elu([128, 256], alpha=0.4, test_name="elu_3")
         self._test_elu(
@@ -439,13 +440,13 @@ def test_elu(self):
         )
 
     def test_softsign(self):
-        self._test_softsign([61], test_name="softsign_1")
+        self._test_softsign([63, 63], test_name="softsign_1")
         self._test_softsign([128], test_name="softsign_2")
         self._test_softsign([128], test_name="softsign_3", copy_op=True)
         self._test_softsign([121, 128], test_name="softsign_4")
 
     def test_floor_div(self):
-        self._test_floor_div([512, 512], test_name="floor_div_1")
+        self._test_floor_div([511, 511], test_name="floor_div_1")
         self._test_floor_div(
             [1024, 1024],
             dividend=3,

From 660b8f0151260908bb01a5bba133dd0be6b9fbd3 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 15 Feb 2023 11:12:09 -0800
Subject: [PATCH 132/638] Introduce access to both bounded and unbounded
 constants in model. (#252)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/252

We add 2 variables:
1. `bound_constant_name_to_idx_`
2. `constant_folding_optional_inputs_`

To track bounded constants in ModelContainer.

Reviewed By: khabinov, chenyang78

Differential Revision: D43228920

fbshipit-source-id: 5ba34335458b82d125d5a7141c4cd92c87a1c296
---
 python/aitemplate/backend/codegen.py        | 34 ++++++++++
 python/aitemplate/backend/main_templates.py |  7 +-
 python/aitemplate/compiler/model.py         | 26 +++++--
 static/csrc/model_container.cpp             | 35 ++++++++--
 static/csrc/model_interface.cpp             | 11 ++-
 static/include/model_container.h            | 20 +++++-
 static/include/model_interface.h            |  2 +
 tests/unittest/backend/test_model_api.py    | 75 +++++++++++++--------
 8 files changed, 162 insertions(+), 48 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index e2c597065..7b80ec293 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -320,6 +320,8 @@ def __init__(
         self.set_up_constants = []
         self.set_up_param_names = []
         self.set_up_param_dtypes = []
+        self.set_up_bound_constant_dtypes = []
+        self.set_up_bound_constant_size = []
         self.set_up_output_shapes = []
         self.set_up_param_dynamic_shapes = []
         self.state_record = set()
@@ -335,6 +337,7 @@ def __init__(
         self.set_up_constant_folding_outputs_offsets = []
 
         self.input_idx = 0
+        self.bound_constant_idx = 0
         self.unbound_constant_idx = 0
         self.output_name_to_idx = _construct_output_name_to_index_map(
             graph, output_tensors
@@ -432,6 +435,28 @@ def _add_owned_constant(self, tensor: Tensor) -> None:
         self.constants_data_size += num_bytes
         self.num_constants += 1
 
+    def _codegen_bound_constant(self, tensor: Tensor) -> None:
+        name = tensor._attrs["name"]
+        self.set_up_constant_names.append(
+            set_value(
+                f'bound_constant_name_to_idx_["{name}"]',
+                self.bound_constant_idx,
+            )
+        )
+        self.set_up_bound_constant_dtypes.append(
+            set_value(
+                f"bound_constant_dtypes_[{self.bound_constant_idx}]",
+                dtype_to_enumerator(tensor.dtype()),
+            )
+        )
+        self.set_up_bound_constant_size.append(
+            set_value(
+                f"bound_constant_size_[{self.bound_constant_idx}]",
+                len(tensor._attrs["data"]),
+            )
+        )
+        self.bound_constant_idx += 1
+
     def _codegen_param_setup(
         self,
         tensor: Tensor,
@@ -444,6 +469,8 @@ def _codegen_param_setup(
         if data is not None:
             # Owned constant. Set up logic for copying the constant in from *.so.
             self.set_up_constants.append(self._tensor_slice_func(tensor, "constants"))
+            self._codegen_bound_constant(tensor)
+            self.bound_constant_idx += 1
             if self.constants_data_file is not None:
                 self._add_owned_constant(tensor)
 
@@ -783,9 +810,12 @@ def generate_source(self) -> Dict[str, str]:
             param_size=self.max_constant_blob_size + self.extra_owned_constant_size,
             set_up_constant_names="\n".join(self.set_up_constant_names),
             set_up_param_dtypes="\n".join(self.set_up_param_dtypes),
+            set_up_bound_constant_dtypes="\n".join(self.set_up_bound_constant_dtypes),
+            set_up_bound_constant_size="\n".join(self.set_up_bound_constant_size),
             set_up_output_shapes="\n".join(self.set_up_output_shapes),
             set_up_param_names="\n".join(self.set_up_param_names),
             num_constants=self.num_constants,
+            num_bound_constants=self.bound_constant_idx,
             num_unbound_constants=self.unbound_constant_idx,
             owned_constants_init=",".join(self.owned_constants_init),
             set_up_constant_folding_outputs_offsets=self._create_set_up_constant_folding_outputs_offsets(),
@@ -821,6 +851,10 @@ def add_constant_folding_input(self, tensor: Tensor):
             )
         else:
             self._add_owned_constant(tensor)
+            self._codegen_bound_constant(tensor)
+            self.set_up_constant_folding_inputs.append(
+                f'constant_folding_optional_inputs_.insert("{name}");'
+            )
 
         self._process_dims_for_tensor(tensor)
 
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index a8359c7c1..0da33e22e 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -170,10 +170,13 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 ModelContainerBase::ModelContainerBase(
     size_t num_inputs,
     size_t num_outputs,
+    size_t num_bound_constants,
     size_t num_unbound_constants,
     size_t params_size,
     AITemplateAllocator& allocator)
     : constants_(RAII_DeviceMalloc(params_size, allocator)),
+      bound_constant_size_(num_bound_constants),
+      bound_constant_dtypes_(num_bound_constants),
       num_params_(num_inputs + num_outputs + num_unbound_constants),
       param_names_(num_params_),
       param_dtypes_(num_params_),
@@ -183,6 +186,8 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 {{ set_up_constant_names }}
 {{ set_up_param_names }}
 {{ set_up_param_dtypes }}
+{{ set_up_bound_constant_dtypes }}
+{{ set_up_bound_constant_size }}
 {{ set_up_output_shapes }}
   for (size_t i = 0; i < num_params_; ++i) {
     max_param_numel_[i] = std::accumulate(
@@ -209,7 +214,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 
 ModelContainer* CreateModelContainer(size_t num_runtimes, AITemplateAllocator& allocator) {
   // num_runtimes, blob_size, workspace_size, num_inputs, num_outputs, num_unbound_constants, param_size, allocator
-  return new ModelContainer(num_runtimes, {{num_inputs}}, {{num_outputs}}, {{num_unbound_constants}}, {{param_size}}, allocator);
+  return new ModelContainer(num_runtimes, {{num_inputs}}, {{num_outputs}}, {{num_bound_constants}}, {{num_unbound_constants}}, {{param_size}}, allocator);
 }
 } // namespace ait
 """
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 5024039f5..f0d41e6c2 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -941,20 +941,32 @@ def fold_constants(self, stream_ptr: Optional[int] = None, sync: bool = True):
             self.handle, ctypes.c_void_p(stream_ptr), ctypes.c_bool(sync)
         )
 
-    def _get_constant_names_impl(self, constant_folding_only: bool) -> List[str]:
+    def _get_constant_names_impl(
+        self, unbound_constants_only: bool, constant_folding_only: bool
+    ) -> List[str]:
         num_constants = ctypes.c_size_t()
         constant_folding_inputs_only = ctypes.c_bool(constant_folding_only)
+        unbound_constants_only_ = ctypes.c_bool(unbound_constants_only)
         self.DLL.AITemplateModelContainerGetNumConstants(
-            self.handle, constant_folding_inputs_only, ctypes.byref(num_constants)
+            self.handle,
+            unbound_constants_only_,
+            constant_folding_inputs_only,
+            ctypes.byref(num_constants),
         )
         names = (ctypes.c_char_p * num_constants.value)()
         self.DLL.AITemplateModelContainerGetConstantNames(
-            self.handle, constant_folding_inputs_only, names
+            self.handle, unbound_constants_only_, constant_folding_inputs_only, names
         )
         return [name.decode("utf-8") for name in names]
 
-    def get_constant_names(self) -> List[str]:
-        return self._get_constant_names_impl(False)
+    def get_constant_names(
+        self, unbound_constants_only: bool = True, constant_folding_only: bool = False
+    ) -> List[str]:
+        return self._get_constant_names_impl(
+            unbound_constants_only, constant_folding_only
+        )
 
-    def get_constant_folding_input_names(self) -> List[str]:
-        return self._get_constant_names_impl(True)
+    def get_constant_folding_input_names(
+        self, unbound_constants_only: bool = True
+    ) -> List[str]:
+        return self._get_constant_names_impl(unbound_constants_only, True)
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index 6c95a17c9..b20565646 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -23,12 +23,14 @@ ModelContainer::ModelContainer(
     size_t num_models,
     size_t num_inputs,
     size_t num_outputs,
+    size_t num_bound_constants,
     size_t num_unbound_constants,
     size_t params_size,
     AITemplateAllocator& allocator)
     : ModelContainerBase(
           num_inputs,
           num_outputs,
+          num_bound_constants,
           num_unbound_constants,
           params_size,
           allocator),
@@ -505,20 +507,32 @@ void ModelContainer::FoldConstants(StreamType stream, bool sync) {
   }
 }
 
-size_t ModelContainer::GetNumConstants() const {
-  return unbound_constant_name_to_idx_.size();
+size_t ModelContainer::GetNumConstants(bool unbound_constants_only) const {
+  if (unbound_constants_only) {
+    return unbound_constant_name_to_idx_.size();
+  } else {
+    return unbound_constant_name_to_idx_.size() +
+        bound_constant_name_to_idx_.size();
+  }
 }
 
-size_t ModelContainer::GetNumConstantFoldingInputs() const {
-  return constant_folding_inputs_.size();
+size_t ModelContainer::GetNumConstantFoldingInputs(
+    bool unbound_constants_only) const {
+  if (unbound_constants_only) {
+    return constant_folding_inputs_.size();
+  } else {
+    return constant_folding_inputs_.size() +
+        constant_folding_optional_inputs_.size();
+  }
 }
 
 void ModelContainer::WriteAllConstantNamesTo(
     const char** constant_names_out,
+    bool unbound_constants_only,
     bool constant_folding_inputs_only) const {
   size_t num_to_write = constant_folding_inputs_only
-      ? GetNumConstants()
-      : GetNumConstantFoldingInputs();
+      ? GetNumConstants(unbound_constants_only)
+      : GetNumConstantFoldingInputs(unbound_constants_only);
   if (constant_names_out == nullptr && num_to_write != 0) {
     throw std::runtime_error("constant_names_out cannot be nullptr.");
   }
@@ -529,6 +543,15 @@ void ModelContainer::WriteAllConstantNamesTo(
       constant_names_out[idx++] = name.c_str();
     }
   }
+  if (!unbound_constants_only) {
+    for (auto& [name, _] : bound_constant_name_to_idx_) {
+      if (!constant_folding_inputs_only ||
+          constant_folding_optional_inputs_.find(name) !=
+              constant_folding_optional_inputs_.end()) {
+        constant_names_out[idx++] = name.c_str();
+      }
+    }
+  }
 }
 
 void ModelContainer::PrepareForRun(
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index dbf03378c..4b86c78c5 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -125,6 +125,7 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
 
 AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
+    bool unbound_constants_only,
     bool constant_folding_inputs_only,
     size_t* num_constants_out) {
   RETURN_ERROR_IF_NULL(handle)
@@ -132,15 +133,17 @@ AITemplateError AITemplateModelContainerGetNumConstants(
   auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
   CONVERT_EXCEPTION_TO_ERROR_CODE({
     if (constant_folding_inputs_only) {
-      *num_constants_out = m->GetNumConstantFoldingInputs();
+      *num_constants_out =
+          m->GetNumConstantFoldingInputs(unbound_constants_only);
     } else {
-      *num_constants_out = m->GetNumConstants();
+      *num_constants_out = m->GetNumConstants(unbound_constants_only);
     }
   })
 }
 
 AITemplateError AITemplateModelContainerGetConstantNames(
     AITemplateModelHandle handle,
+    bool unbound_constants_only,
     bool constant_folding_inputs_only,
     const char** constant_names_out) {
   RETURN_ERROR_IF_NULL(handle)
@@ -149,7 +152,9 @@ AITemplateError AITemplateModelContainerGetConstantNames(
   auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
   CONVERT_EXCEPTION_TO_ERROR_CODE({
     m->WriteAllConstantNamesTo(
-        constant_names_out, constant_folding_inputs_only);
+        constant_names_out,
+        unbound_constants_only,
+        constant_folding_inputs_only);
   })
 }
 
diff --git a/static/include/model_container.h b/static/include/model_container.h
index d9247d8d9..4b6e90a4e 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -39,11 +39,18 @@ class ModelContainerBase {
   ModelContainerBase(
       size_t num_inputs,
       size_t num_outputs,
+      size_t num_bound_constants,
       size_t num_unbound_constants,
       size_t params_size,
       AITemplateAllocator& allocator);
 
  protected:
+  // The set of bounded constants/weights/parameters. These are constants which
+  // have value during compile time. We maintain it's size, and unlike unbound
+  // constants, we do not need to check whether they are set via SetConstant
+  // prior to inference.
+  std::unordered_map<std::string, size_t> bound_constant_name_to_idx_;
+
   // The set of unbound constants/weights/parameters. These are constants which
   // have no value at compile time and do not participate in constant folding.
   // They must be set via SetConstant prior to inference.
@@ -51,7 +58,10 @@ class ModelContainerBase {
 
   // The names of all tensors that are required for constant folding, but are
   // not necessarily in the final graph.
+  // constant_folding_optional_inputs_ are those that has initial value during
+  // compile time.
   std::unordered_set<std::string> constant_folding_inputs_;
+  std::unordered_set<std::string> constant_folding_optional_inputs_;
 
   // Offsets here correspond to the offsets of constants that were the outputs
   // of constant folding. The indices are guaranteed to map to the correct
@@ -70,6 +80,10 @@ class ModelContainerBase {
   std::vector<std::vector<int64_t>> max_param_shapes_;
   std::vector<AITemplateDtype> param_dtypes_;
 
+  // These are entries used for bound constants.
+  std::vector<size_t> bound_constant_size_;
+  std::vector<AITemplateDtype> bound_constant_dtypes_;
+
   // NB: technically these could be derived from both the max shape and
   // the dytpe, but it's easier to just cache them.
   std::vector<size_t> max_param_storage_bytes_;
@@ -122,6 +136,7 @@ class ModelContainer : ModelContainerBase {
       size_t num_models,
       size_t num_inputs,
       size_t num_outputs,
+      size_t num_bound_constants,
       size_t num_unbound_constants,
       size_t params_size,
       AITemplateAllocator& allocator);
@@ -192,8 +207,8 @@ class ModelContainer : ModelContainerBase {
 
   void FoldConstants(StreamType stream, bool sync);
 
-  size_t GetNumConstants() const;
-  size_t GetNumConstantFoldingInputs() const;
+  size_t GetNumConstants(bool unbound_constants_only = true) const;
+  size_t GetNumConstantFoldingInputs(bool unbound_constants_only = true) const;
 
   // Write all constant names to the array pointed to by names_out.
   // This function assumes that names_out has enough space to hold
@@ -201,6 +216,7 @@ class ModelContainer : ModelContainerBase {
   // are guaranteed to live as long as their owning ModelContainer.
   void WriteAllConstantNamesTo(
       const char** names_out,
+      bool unbound_constants_only,
       bool constant_folding_inputs_only) const;
 
  private:
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 9485c60ad..7af1aff50 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -157,11 +157,13 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
 
 AIT_EXPORT AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
+    bool unbound_constants_only,
     bool constant_folding_inputs_only,
     size_t* num_constants_out);
 
 AIT_EXPORT AITemplateError AITemplateModelContainerGetConstantNames(
     AITemplateModelHandle handle,
+    bool unbound_constants_only,
     bool constant_folding_inputs_only,
     const char** constant_names_out);
 
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index 146913378..7012d3c24 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -44,6 +44,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
 class ModelAPITestCase(unittest.TestCase):
@@ -1438,44 +1439,60 @@ def test_get_constant_names(self):
         target = detect_target()
 
         input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_0 = Tensor(shape=[1, 2], dtype="float16", name="constant_0")
         constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
         constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
-        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
-        output = ops.elementwise(FuncEnum.MUL)(x, constant_2)
+        constant_3 = Tensor(shape=[1, 2], dtype="float16", name="constant_3")
+        constant_4 = Tensor(shape=[1, 2], dtype="float16", name="constant_4")
+        constants = {}
+
+        # constant 0 and constant 1 are not folded.
+        # constant 0 is unbounded, constant 1 is bounded.
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_0)
+        x1 = ops.concatenate()([x, x, constant_1])
+        constants["constant_1"] = get_random_torch_tensor((1, 2), "float16")
+
+        # constants 2 and 3 and 4 are folded.
+        # constants 2 and 4 are unbounded, constants 3 is bounded.
+        y = ops.concatenate()([constant_2, constant_3, constant_4])
+        constants["constant_3"] = get_random_torch_tensor((1, 2), "float16")
+
+        output = ops.elementwise(FuncEnum.MUL)(x1, y)
         output._attrs["name"] = "output"
         output._attrs["is_output"] = True
 
-        module = compile_model(output, target, "./tmp", "test_get_constant_names")
-        names = module.get_constant_names()
-        self.assertEqual(len(names), 2)
-        self.assertIn("constant_1", names)
-        self.assertIn("constant_2", names)
+        module = compile_model(
+            output, target, "./tmp", "test_get_constant_names", constants=constants
+        )
 
-    def test_get_constant_folding_input_names(self):
-        target = detect_target()
+        names_0 = module.get_constant_names(
+            unbound_constants_only=True, constant_folding_only=False
+        )
+        self.assertEqual(set(names_0), {"constant_0", "constant_2", "constant_4"})
 
-        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
-        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
-        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
-        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_3")
-        # constant 1 is not folded.
-        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_1)
-        # constants 2 and 3 are
-        y = ops.elementwise(FuncEnum.MUL)(constant_2, constant_2)
+        names_1 = module.get_constant_names(
+            unbound_constants_only=False, constant_folding_only=False
+        )
+        self.assertEqual(
+            set(names_1),
+            {"constant_0", "constant_1", "constant_2", "constant_3", "constant_4"},
+        )
 
-        output = ops.elementwise(FuncEnum.MUL)(x, y)
-        output._attrs["name"] = "output"
-        output._attrs["is_output"] = True
+        names_2 = module.get_constant_names(
+            unbound_constants_only=True, constant_folding_only=True
+        )
+        self.assertEqual(set(names_2), {"constant_2", "constant_4"})
 
-        module = compile_model(
-            output, target, "./tmp", "test_get_constant_folding_input_names"
-        )
-        names = module.get_constant_folding_input_names()
-        self.assertEqual(names, [])
-        # TODO: uncomment when the new constant folding pass is enabled.
-        # self.assertEqual(len(names), 2)
-        # self.assertIn("constant_2", names)
-        # self.assertIn("constant_3", names)
+        names_3 = module.get_constant_names(
+            unbound_constants_only=False, constant_folding_only=True
+        )
+        self.assertEqual(set(names_3), {"constant_2", "constant_3", "constant_4"})
+
+        names_4 = module.get_constant_folding_input_names(unbound_constants_only=True)
+        self.assertEqual(set(names_4), {"constant_2", "constant_4"})
+
+        names_5 = module.get_constant_folding_input_names(unbound_constants_only=False)
+        self.assertEqual(set(names_5), {"constant_2", "constant_3", "constant_4"})
 
     def test_set_many_constants(self):
         target = detect_target()

From 76f186b3dc6e162b591aeca644087709642876f3 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 15 Feb 2023 11:12:09 -0800
Subject: [PATCH 133/638] Do not allow users to modify constants set by AIT
 internal. (#258)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/258

We add a modifiable_constant in Tensor to make sure users don't accidentally modify constants that are generated by AIT internally.

Reviewed By: khabinov, chenyang78

Differential Revision: D43232224

fbshipit-source-id: 671aafc98e4a505f3436048e9a1450803979aa66
---
 python/aitemplate/backend/codegen.py          |  3 ++
 python/aitemplate/compiler/base.py            |  6 +++
 .../compiler/transform/constant_folding.py    |  1 +
 tests/unittest/backend/test_model_api.py      | 44 +++++++++++++++++++
 4 files changed, 54 insertions(+)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 7b80ec293..91ae6e6e2 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -436,6 +436,9 @@ def _add_owned_constant(self, tensor: Tensor) -> None:
         self.num_constants += 1
 
     def _codegen_bound_constant(self, tensor: Tensor) -> None:
+        if tensor._attrs.get("is_internal_constant", False):
+            return
+
         name = tensor._attrs["name"]
         self.set_up_constant_names.append(
             set_value(
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 62db1e4b1..d0c388497 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -351,6 +351,7 @@ def __init__(
         is_output: bool = False,
         value: Any = None,
         is_view_of: Any = None,
+        is_internal_constant: bool = False,
         check_nan_and_inf: bool = False,
         check_outputs: bool = False,
     ) -> None:
@@ -381,6 +382,8 @@ def __init__(
             empty list, this Tensor is used to represent a number.
         is_view_of : Any, optional
             Whether this Tensor is a view of another Tensor.
+        is_internal_constant: bool, optional
+            Whether this constant tensor could be modified.
         check_nan_and_inf : bool, optional
             Whether or not to check this tensor is nan or inf during runtime.
         check_outputs : bool, optional
@@ -399,6 +402,7 @@ def __init__(
         self._attrs["is_output"] = is_output
         self._attrs["is_input"] = is_input
         self._attrs["is_param"] = False
+        self._attrs["is_internal_constant"] = is_internal_constant
 
         # True if this is an internal tensor that aliases an output through
         # a view. Set up in mark_param_tensor
@@ -575,6 +579,7 @@ def _create_host_zero_tensor(
     dst_ops: Set[Node] = None,
     dtype: str = "float16",
     is_output: bool = False,
+    is_internal_constant: bool = True,
 ):
     """
     Create a zero tensor stored on the host machine.
@@ -584,6 +589,7 @@ def _create_host_zero_tensor(
         b"\x00" * get_aligned_size(shape, dtype, alignment=1), dtype=dtype
     )
     tensor = Tensor(shape, name, dst_ops=dst_ops, dtype=dtype, is_output=is_output)
+    tensor._attrs["is_internal_constant"] = is_internal_constant
     tensor._bind_data(zeros)
     return tensor
 
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index 3243d76f0..20266ad5e 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -71,6 +71,7 @@ def _non_output_from_tensor(tensor: Tensor) -> Tensor:
         dst_ops=tensor._attrs["dst_ops"].copy(),
         dtype=tensor._attrs["dtype"],
         is_view_of=tensor._attrs["is_view_of"],
+        is_internal_constant=tensor._attrs["is_internal_constant"],
     )
     new_tensor._attrs["is_param"] = tensor._attrs["is_param"]
     new_tensor._attrs["data"] = tensor._attrs["data"]
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index 7012d3c24..d984ba5ad 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -28,6 +28,7 @@
 from aitemplate.compiler import AIT_DEFAULT_NUM_RUNTIMES, compile_model, ops
 from aitemplate.compiler.base import (
     _ConstantTensorData,
+    _create_host_zero_tensor,
     _HostConstantTensorData,
     _NumpyConstantTensorData,
     _TorchConstantTensorData,
@@ -1494,6 +1495,49 @@ def test_get_constant_names(self):
         names_5 = module.get_constant_folding_input_names(unbound_constants_only=False)
         self.assertEqual(set(names_5), {"constant_2", "constant_3", "constant_4"})
 
+    def test_get_constant_names_with_ait_generated(self):
+        target = detect_target()
+
+        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_0 = Tensor(shape=[1, 2], dtype="float16", name="constant_0")
+        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
+        constant_3 = _create_host_zero_tensor(
+            shape=[1, 2], name="constant_3", dtype="float16"
+        )
+        constant_4 = Tensor(shape=[1, 2], dtype="float16", name="constant_4")
+        constants = {}
+
+        # constant 0 and constant 1 are not folded.
+        # constant 0 is unbounded, constant 1 is bounded.
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_0)
+        x1 = ops.concatenate()([x, x, constant_1])
+        constants["constant_1"] = get_random_torch_tensor((1, 2), "float16")
+
+        # constants 2 and 3 and 4 are folded.
+        # constants 2 and 4 are unbounded, constants 3 is bounded.
+        y = ops.concatenate()([constant_2, constant_3, constant_4])
+
+        output = ops.elementwise(FuncEnum.MUL)(x1, y)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        module = compile_model(
+            output,
+            target,
+            "./tmp",
+            "test_get_constant_names_with_ait_generated",
+            constants=constants,
+        )
+
+        names = module.get_constant_names(
+            unbound_constants_only=False, constant_folding_only=False
+        )
+        self.assertEqual(
+            set(names),
+            {"constant_0", "constant_1", "constant_2", "constant_4"},
+        )
+
     def test_set_many_constants(self):
         target = detect_target()
 

From 7764db1d15d60247012d1270f62e4d31959fee2e Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 15 Feb 2023 11:12:09 -0800
Subject: [PATCH 134/638] Double Buffering with AITemplate (#259)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/259

Add double buffering for AITemplate.

Reviewed By: khabinov, chenyang78

Differential Revision: D43232225

fbshipit-source-id: e2bfde524521f622f09acd138f55dedffdc3066f
---
 python/aitemplate/backend/codegen.py          |  72 +++--
 python/aitemplate/backend/main_templates.py   |  18 +-
 python/aitemplate/compiler/model.py           |  87 +++++-
 static/csrc/model_container.cpp               | 272 ++++++++++++++----
 static/csrc/model_interface.cpp               |  45 ++-
 static/include/model_container.h              |  52 +++-
 static/include/model_interface.h              |  21 ++
 .../compiler/test_constant_folding.py         | 136 +++++++++
 8 files changed, 615 insertions(+), 88 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 91ae6e6e2..904a278ca 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -333,7 +333,9 @@ def __init__(
         self.num_constants = 0
         self.constants_data_size = 0
         self.owned_constants_init = []
+        self.reset_constants = []
 
+        self.set_up_bound_constant_offsets = []
         self.set_up_constant_folding_outputs_offsets = []
 
         self.input_idx = 0
@@ -458,6 +460,12 @@ def _codegen_bound_constant(self, tensor: Tensor) -> None:
                 len(tensor._attrs["data"]),
             )
         )
+        self.set_up_bound_constant_offsets.append(
+            set_value(
+                f"bound_constant_offsets_[{self.bound_constant_idx}]",
+                tensor._attrs["offset"],
+            )
+        )
         self.bound_constant_idx += 1
 
     def _codegen_param_setup(
@@ -469,14 +477,20 @@ def _codegen_param_setup(
         """
         name = tensor._attrs["name"]
         data = tensor._attrs["data"]
+        const_slice = self._tensor_slice_func(tensor, "constants")
         if data is not None:
             # Owned constant. Set up logic for copying the constant in from *.so.
-            self.set_up_constants.append(self._tensor_slice_func(tensor, "constants"))
+            self.set_up_constants.append(const_slice)
+            self.set_up_constants.append(
+                set_value(
+                    f'constant_name_to_ptr_["{name}"]',
+                    f"const_cast<const void**>(reinterpret_cast<void**>(&{name}))",
+                )
+            )
             self._codegen_bound_constant(tensor)
-            self.bound_constant_idx += 1
+            self.reset_constants.append(const_slice)
             if self.constants_data_file is not None:
                 self._add_owned_constant(tensor)
-
         elif tensor._attrs["constant_folding_output_idx"] is not None:
             self.set_up_constant_folding_outputs_offsets.append(
                 set_value(
@@ -484,8 +498,8 @@ def _codegen_param_setup(
                     tensor._attrs["offset"],
                 )
             )
-            self.tensor_slice.append(self._tensor_slice_func(tensor, "constants"))
-
+            self.tensor_slice.append(const_slice)
+            self.reset_constants.append(const_slice)
         elif not isinstance(tensor, IntVarTensor):
             # Unbound constant. We will expect the user to set this via SetConstant.
             self.set_up_constant_names.append(
@@ -766,31 +780,47 @@ def generate_model(self) -> str:
             num_outputs=self.num_outputs,
             param_size=self.max_constant_blob_size + self.extra_owned_constant_size,
             num_unbound_constants=self.unbound_constant_idx,
+            reset_constants="\n".join(self.reset_constants),
             profiler_annotation=self.debug_settings.gen_profiler_annotation,
         )
 
-    def _create_set_up_constant_folding_outputs_offsets(self) -> str:
+    def _create_set_up_constant_offsets(self) -> str:
         """
+        bound_constant_offsets_ stores a map for each constant to the offset in constant buffer,
         constant_folding_outputs_offsets_ stores a map from each output of constant folding
         to its offset inside the constant buffer.
 
+
         When the model is loaded, we use these offsets to wire up the constant folding output
         pointers to the outputs of the constant folder.
         """
-        if not self.set_up_constant_folding_outputs_offsets:
-            return ""
-
-        return jinja2.Template(
-            """
-constant_folding_outputs_offsets_.resize({{num_constant_folding_outputs}});
-{{set_up_statements}}
-"""
-        ).render(
-            num_constant_folding_outputs=len(
-                self.set_up_constant_folding_outputs_offsets
-            ),
-            set_up_statements="\n".join(self.set_up_constant_folding_outputs_offsets),
-        )
+        constant_offsets = ""
+        if self.set_up_constant_folding_outputs_offsets:
+            constant_offsets = jinja2.Template(
+                """
+    constant_folding_outputs_offsets_.resize({{num_constant_folding_outputs}});
+    {{set_up_statements}}
+    """
+            ).render(
+                num_constant_folding_outputs=len(
+                    self.set_up_constant_folding_outputs_offsets
+                ),
+                set_up_statements="\n".join(
+                    self.set_up_constant_folding_outputs_offsets
+                ),
+            )
+            constant_offsets += "\n"
+        if self.set_up_bound_constant_offsets:
+            constant_offsets += jinja2.Template(
+                """
+    bound_constant_offsets_.resize({{num_bound_constant_offsets}});
+    {{set_up_statements}}
+    """
+            ).render(
+                num_bound_constant_offsets=len(self.set_up_bound_constant_offsets),
+                set_up_statements="\n".join(self.set_up_bound_constant_offsets),
+            )
+        return constant_offsets
 
     def generate_source(self) -> Dict[str, str]:
         """
@@ -821,7 +851,7 @@ def generate_source(self) -> Dict[str, str]:
             num_bound_constants=self.bound_constant_idx,
             num_unbound_constants=self.unbound_constant_idx,
             owned_constants_init=",".join(self.owned_constants_init),
-            set_up_constant_folding_outputs_offsets=self._create_set_up_constant_folding_outputs_offsets(),
+            set_up_constant_offsets=self._create_set_up_constant_offsets(),
             set_up_constant_folding_inputs="\n".join(
                 self.set_up_constant_folding_inputs
             ),
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 0da33e22e..332522035 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -79,6 +79,14 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
         {{ set_inputs }}
     }
 
+    void ResetConstants(uint8_t* constants) {
+        /*
+         * This can be called if we want to use a different piece of memory
+         * for the constants to be consumed.
+         */
+        {{ reset_constants }}
+    }
+
     void DeviceToDeviceCopies(StreamType stream) {
   {{ device_to_device_copies }}
     }
@@ -174,7 +182,11 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
     size_t num_unbound_constants,
     size_t params_size,
     AITemplateAllocator& allocator)
-    : constants_(RAII_DeviceMalloc(params_size, allocator)),
+    : constants_size_(params_size),
+      constants_primary_(RAII_DeviceMalloc(constants_size_, allocator)),
+      constants_secondary_(nullptr),
+      use_constants_primary_buffer_(true),
+      buffer_state_(BufferState::CLEAN),
       bound_constant_size_(num_bound_constants),
       bound_constant_dtypes_(num_bound_constants),
       num_params_(num_inputs + num_outputs + num_unbound_constants),
@@ -198,10 +210,10 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
     );
     max_param_storage_bytes_[i] = max_param_numel_[i] * AITemplateDtypeSizeBytes(param_dtypes_[i]);
   }
-{{ set_up_constant_folding_outputs_offsets }}
+{{ set_up_constant_offsets }}
 {{ set_up_constant_folding_inputs }}
 
-  auto* constants_ptr = static_cast<uint8_t*>(constants_.get());
+  auto* constants_ptr = static_cast<uint8_t*>(constants_primary_.get());
   const auto binary_constants_bin_size = static_cast<size_t>(_binary_constants_bin_end - _binary_constants_bin_start);
   for (auto& constant_info : owned_constants) {
     auto* dst = constants_ptr + constant_info.internal_offset;
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index f0d41e6c2..6ee7fce41 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -761,6 +761,44 @@ def set_many_constants(self, tensors: Dict[str, AITData]):
             self.handle, c_names, c_tensors, num_tensors
         )
 
+    def set_double_buffer_constant(
+        self, name: str, tensor: AITData, stream_ptr: Optional[int] = None
+    ):
+        """
+        Set a constant. All constants must have values before calling run().
+
+        Note that the pointer inside tensor must be valid for the entire
+        duration of run().
+        """
+        b_name = name.encode("utf-8")
+        c_name = ctypes.c_char_p(b_name)
+        c_tensor = self._convert_single_param_to_c_format(tensor)
+        self.DLL.AITemplateModelContainerSetDoubleBufferConstant(
+            self.handle, ctypes.c_void_p(stream_ptr), c_name, ctypes.byref(c_tensor)
+        )
+
+    def set_many_double_buffer_constants(
+        self, tensors: Dict[str, AITData], stream_ptr: Optional[int] = None
+    ):
+        """
+        Bulk set many constants at once. More efficient than set_constant()
+        since it only has to acquire the lock once.
+        """
+        c_names = (ctypes.c_char_p * len(tensors))()
+        c_tensors = (_CFormatAITData * len(tensors))()
+        ait_tensors = {
+            name.encode("utf-8"): self._convert_single_param_to_c_format(tensor)
+            for name, tensor in tensors.items()
+        }
+        for i, (name_bytes, tensor) in enumerate(ait_tensors.items()):
+            c_names[i] = ctypes.c_char_p(name_bytes)
+            c_tensors[i] = tensor
+
+        num_tensors = ctypes.c_size_t(len(tensors))
+        self.DLL.AITemplateModelContainerSetManyDoubleBufferConstants(
+            self.handle, ctypes.c_void_p(stream_ptr), c_names, c_tensors, num_tensors
+        )
+
     def set_many_constants_with_tensors(self, tensors: Dict[str, AITData]):
         ait_tensors = {}
         for name, tensor in tensors.items():
@@ -770,6 +808,30 @@ def set_many_constants_with_tensors(self, tensors: Dict[str, AITData]):
             ait_tensors[name] = torch_to_ait_data(tensor)
         self.set_many_constants(ait_tensors)
 
+    def set_double_buffer_constant_with_tensor(
+        self, name: str, tensor: TorchTensor, stream_ptr: Optional[int] = None
+    ):
+        """
+        Set a constant with a PyTorch tensor.
+        Model will store a reference to the given tensor in
+        torch_constant_tensors until it is explicitly deleted or replaced.
+        """
+        if not tensor.is_contiguous() or not tensor.is_cuda:
+            raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
+        self.torch_constant_tensors[name] = tensor
+        self.set_double_buffer_constant(name, torch_to_ait_data(tensor), stream_ptr)
+
+    def set_many_double_buffer_constants_with_tensors(
+        self, tensors: Dict[str, AITData], stream_ptr: Optional[int] = None
+    ):
+        ait_tensors = {}
+        for name, tensor in tensors.items():
+            if not tensor.is_contiguous() or not tensor.is_cuda:
+                raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
+            self.torch_constant_tensors[name] = tensor
+            ait_tensors[name] = torch_to_ait_data(tensor)
+        self.set_many_double_buffer_constants(ait_tensors, stream_ptr)
+
     def set_constant_with_tensor(self, name: str, tensor: TorchTensor):
         """
         Set a constant with a PyTorch tensor.
@@ -936,10 +998,27 @@ def ait_data_to_numpy(
         )
         return arr
 
-    def fold_constants(self, stream_ptr: Optional[int] = None, sync: bool = True):
-        self.DLL.AITemplateModelContainerFoldConstants(
-            self.handle, ctypes.c_void_p(stream_ptr), ctypes.c_bool(sync)
-        )
+    def fold_constants(
+        self,
+        stream_ptr: Optional[int] = None,
+        sync: bool = True,
+        double_buffer: bool = False,
+    ):
+        if double_buffer:
+            self.DLL.AITemplateModelContainerFoldConstantsInDoubleBuffer(
+                self.handle,
+                ctypes.c_void_p(stream_ptr),
+                ctypes.c_bool(sync),
+            )
+        else:
+            self.DLL.AITemplateModelContainerFoldConstants(
+                self.handle,
+                ctypes.c_void_p(stream_ptr),
+                ctypes.c_bool(sync),
+            )
+
+    def swap_constants(self):
+        self.DLL.AITemplateModelContainerSwapConstants(self.handle)
 
     def _get_constant_names_impl(
         self, unbound_constants_only: bool, constant_folding_only: bool
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index b20565646..add4ea980 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -17,6 +17,25 @@
 #include "device_functions-generated.h"
 #include "raii_wrapper.h"
 
+namespace {
+std::string GetEnumString(AITemplateDtype dtype) {
+  switch (dtype) {
+    case AITemplateDtype::kUnset:
+      return "kUnset";
+    case AITemplateDtype::kHalf:
+      return "kHalf";
+    case AITemplateDtype::kFloat:
+      return "kFloat";
+    case AITemplateDtype::kInt:
+      return "kInt";
+    case AITemplateDtype::kLong:
+      return "kLong";
+    default:
+      return "unknown";
+  }
+}
+} // namespace
+
 namespace ait {
 
 ModelContainer::ModelContainer(
@@ -67,7 +86,7 @@ ModelContainer::ModelContainer(
   models_.reserve(num_models);
   available_models_.reserve(num_models);
 
-  auto* constants_ptr = static_cast<uint8_t*>(constants_.get());
+  auto* constants_ptr = static_cast<uint8_t*>(constants_primary_.get());
   for (size_t i = 0; i < num_models; ++i) {
     models_.push_back(Model::Create(allocator, constants_ptr));
     available_models_.push_back(models_.back().get());
@@ -342,35 +361,90 @@ float ModelContainer::Benchmark(
   return max_time / total_num_iters;
 }
 
-void ModelContainer::SetConstantImpl(const char* name, const AITData& tensor) {
-  auto it = unbound_constant_name_to_idx_.find(name);
-  if (it == unbound_constant_name_to_idx_.end()) {
-    return;
-  }
-  auto constant_idx = it->second + num_inputs_ + num_outputs_;
-  ValidateDtype(tensor.dtype, constant_idx);
-
-  CHECK_VECTOR_ACCESS(max_param_storage_bytes_, constant_idx)
-  auto expected_num_bytes = max_param_storage_bytes_[constant_idx];
-  auto actual_num_bytes =
-      tensor.shape.Numel() * AITemplateDtypeSizeBytes(tensor.dtype);
-  if (expected_num_bytes != actual_num_bytes) {
+void ModelContainer::SetConstantImpl(
+    const char* name,
+    const AITData& tensor,
+    bool double_buffer,
+    StreamType stream) {
+  auto unbound_it = unbound_constant_name_to_idx_.find(name);
+  auto bound_it = bound_constant_name_to_idx_.find(name);
+  if (unbound_it != unbound_constant_name_to_idx_.end()) {
+    auto constant_idx = unbound_it->second + num_inputs_ + num_outputs_;
+    ValidateParamDtype(tensor.dtype, constant_idx);
+
+    CHECK_VECTOR_ACCESS(max_param_storage_bytes_, constant_idx)
+    auto expected_num_bytes = max_param_storage_bytes_[constant_idx];
+    auto actual_num_bytes =
+        tensor.shape.Numel() * AITemplateDtypeSizeBytes(tensor.dtype);
+    if (expected_num_bytes != actual_num_bytes) {
+      throw std::runtime_error(
+          std::string(
+              "SetConstant did not receive correct number of bytes for unbound constant ") +
+          name + ": expected " + std::to_string(expected_num_bytes) +
+          " but got " + std::to_string(actual_num_bytes) +
+          ". Check that the provided tensor's shape is correct.");
+    }
+  } else if (bound_it != bound_constant_name_to_idx_.end()) {
+    auto constant_idx = bound_it->second;
+    ValidateBoundConstantDtype(tensor.dtype, constant_idx);
+
+    CHECK_VECTOR_ACCESS(bound_constant_size_, constant_idx)
+    auto expected_num_bytes = bound_constant_size_[constant_idx];
+    auto actual_num_bytes =
+        tensor.shape.Numel() * AITemplateDtypeSizeBytes(tensor.dtype);
+    if (expected_num_bytes != actual_num_bytes) {
+      throw std::runtime_error(
+          std::string(
+              "SetConstant did not receive correct number of bytes for bound constant ") +
+          name + ": expected " + std::to_string(expected_num_bytes) +
+          " but got " + std::to_string(actual_num_bytes) +
+          ". Check that the provided tensor's shape is correct.");
+    }
+  } else {
     throw std::runtime_error(
-        std::string(
-            "SetConstant did not receive correct number of bytes for constant ") +
-        name + ": expected " + std::to_string(expected_num_bytes) +
-        " but got " + std::to_string(actual_num_bytes) +
-        ". Check that the provided tensor's shape is correct.");
+        std::string("Called SetConstant on ") + name +
+        std::string(" but can't find in either bound or unbound constant set"));
   }
 
   auto* src = tensor.ptr;
-  if (constant_folding_inputs_.find(name) == constant_folding_inputs_.end()) {
-    for (auto& model : models_) {
-      model->SetConstant(name, src);
+  bool is_constant_folder_ =
+      constant_folding_inputs_.find(name) != constant_folding_inputs_.end() ||
+      constant_folding_optional_inputs_.find(name) !=
+          constant_folding_optional_inputs_.end();
+
+  if (!double_buffer) {
+    // If we don't use double_buffer, we can just SetConstant.
+    if (!is_constant_folder_) {
+      for (auto& model : models_) {
+        model->SetConstant(name, src);
+      }
+    } else {
+      constant_folder_->SetConstant(name, src);
     }
   } else {
-    constant_folder_->SetConstant(name, src);
+    // If we use double buffer, we identify whether it's a bounded constant or
+    // not. If it's unbounded, just hold the pointer. It it's bounded, we copy
+    // it into the constant buffer.
+    if (unbound_it != unbound_constant_name_to_idx_.end()) {
+      if (is_constant_folder_) {
+        constant_folder_->SetConstant(name, src);
+      } else {
+        model_constants_[std::string(name)] = src;
+      }
+    } else {
+      // Constant to be set is bounded, preload into constant buffer.
+      uint8_t* constants_ptr = GetInactiveConstantsBuffer();
+      size_t idx = bound_it->second;
+      // TODO: check whether src is host or device memory.
+      DEVICE_CHECK(DeviceToDeviceCopy(
+          constants_ptr + bound_constant_offsets_[idx],
+          src,
+          bound_constant_size_[idx],
+          stream));
+    }
   }
+
+  buffer_state_ = BufferState::CONSTANTS_UPDATED;
 }
 
 void ModelContainer::SetConstant(const char* name, const AITData& tensor) {
@@ -404,6 +478,60 @@ void ModelContainer::SetManyConstants(
   }
 }
 
+void ModelContainer::SwapConstantFolderBuffer() {
+  uint8_t* constants_ptr = GetInactiveConstantsBuffer();
+  constant_folder_->ResetConstants(constants_ptr);
+  size_t constant_idx = 0;
+  for (auto offset : constant_folding_outputs_offsets_) {
+    constant_folder_->SetOutput(constants_ptr + offset, constant_idx++);
+  }
+}
+
+uint8_t* ModelContainer::GetInactiveConstantsBuffer() {
+  uint8_t* constants_ptr{nullptr};
+  if (use_constants_primary_buffer_) {
+    if (constants_secondary_ == nullptr) {
+      constants_secondary_ = RAII_DeviceMalloc(constants_size_, allocator_);
+    }
+    constants_ptr = static_cast<uint8_t*>(constants_secondary_.get());
+  } else {
+    constants_ptr = static_cast<uint8_t*>(constants_primary_.get());
+  }
+  return constants_ptr;
+}
+
+void ModelContainer::SetDoubleBufferConstant(
+    const char* name,
+    const AITData& tensor,
+    StreamType stream) {
+  std::lock_guard lk(constants_double_buffer_mutex_);
+  SetConstantImpl(name, tensor, /* double_buffer */ true, stream);
+}
+
+void ModelContainer::SetManyDoubleBufferConstants(
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors,
+    StreamType stream) {
+  if (num_tensors == 0) {
+    return;
+  }
+
+  if (tensors == nullptr) {
+    throw std::runtime_error("Tensor array cannot be null");
+  }
+
+  std::lock_guard lk(constants_double_buffer_mutex_);
+  for (size_t i = 0; i < num_tensors; ++i) {
+    const char* name = names[i];
+    if (name == nullptr) {
+      throw std::runtime_error("Constant name cannot be null");
+    }
+    const auto& tensor = tensors[i];
+    SetConstantImpl(names[i], tensor, /* double_buffer */ true, stream);
+  }
+}
+
 size_t ModelContainer::NumInputs() const {
   return num_inputs_;
 }
@@ -485,28 +613,71 @@ void ModelContainer::WaitForAllModels(bool include_constant_folder) {
   }
 }
 
-void ModelContainer::FoldConstantsImpl(StreamType stream) {
-  // NB: No need to acquire models_mutex_ here. We're guaranteed that nothing
-  // will be concurrently messing with the Model vectors while we hold the
-  // constants_sync_mutex_ in unique mode. See model_container.h for the full
-  // explanation.
-  WaitForAllModels();
+void ModelContainer::FoldConstantsImpl(StreamType stream, bool double_buffer) {
+  if (constant_folded_once_) {
+    // We do not set the buffer state if this is the initial constant folding.
+    buffer_state_ = BufferState::CONSTANTS_FOLDED;
+  }
+
+  if (double_buffer) {
+    SwapConstantFolderBuffer();
+  } else {
+    // NB: No need to acquire models_mutex_ here. We're guaranteed that nothing
+    //     will be concurrently messing with the Model vectors while we hold
+    //     the constants_sync_mutex_ in unique mode. See model_container.h for
+    //     the full explanation.
+    WaitForAllModels();
+  }
   // We might have already started constant folding, make sure it finishes
   // first. It's OK if we throw here, there's no state to restore.
   // We just won't finish the folding and will need to do it again.
   constant_folder_->WaitForCompletion();
-  constant_folder_->Run(stream, /*graph_mode=*/false);
+  if (double_buffer) {
+    std::lock_guard constants_unique_lk(constants_double_buffer_mutex_);
+    constant_folder_->Run(stream, /*graph_mode=*/false);
+  } else {
+    constant_folder_->Run(stream, /*graph_mode=*/false);
+  }
   constant_folded_once_ = true;
 }
 
-void ModelContainer::FoldConstants(StreamType stream, bool sync) {
-  std::lock_guard constant_folding_lk(constants_sync_mutex_);
-  FoldConstantsImpl(stream);
+void ModelContainer::FoldConstants(
+    StreamType stream,
+    bool sync,
+    bool double_buffer) {
+  if (double_buffer) {
+    FoldConstantsImpl(stream, double_buffer);
+  } else {
+    std::lock_guard constant_folding_lk(constants_sync_mutex_);
+    FoldConstantsImpl(stream);
+  }
   if (sync) {
     DEVICE_CHECK(StreamSynchronize(stream));
   }
 }
 
+void ModelContainer::SwapConstants() {
+  if (buffer_state_ != BufferState::CONSTANTS_FOLDED) {
+    LOG(WARNING) << "Called SwapConstants without calling FoldConstants().";
+    return;
+  }
+  std::unique_lock constants_unique_lk(constants_double_buffer_mutex_);
+  uint8_t* constants_ptr = GetInactiveConstantsBuffer();
+  use_constants_primary_buffer_ = !use_constants_primary_buffer_;
+
+  for (auto& model : models_) {
+    model->ResetConstants(constants_ptr);
+  }
+  for (auto& [name, src] : model_constants_) {
+    for (auto& model : models_) {
+      model->SetConstant(name.c_str(), src);
+    }
+  }
+
+  model_constants_.clear();
+  buffer_state_ = BufferState::CLEAN;
+}
+
 size_t ModelContainer::GetNumConstants(bool unbound_constants_only) const {
   if (unbound_constants_only) {
     return unbound_constant_name_to_idx_.size();
@@ -578,13 +749,13 @@ void ModelContainer::PrepareForRun(
   }
   for (size_t i = 0; i < num_inputs_; ++i) {
     auto& input = inputs[i];
-    ValidateDtype(input.dtype, i);
+    ValidateParamDtype(input.dtype, i);
     model->SetInput(input.ptr, input.shape, i);
   }
 
   for (size_t i = 0; i < num_outputs_; ++i) {
     auto& output = outputs[i];
-    ValidateDtype(output.dtype, i + num_inputs_);
+    ValidateParamDtype(output.dtype, i + num_inputs_);
     model->SetOutput(output.ptr, i);
   }
 }
@@ -631,31 +802,28 @@ void ModelContainer::ReclaimFinishedModels(std::unique_lock<std::mutex>& lk) {
   available_models_.push_back(model);
 }
 
-void ModelContainer::ValidateDtype(AITemplateDtype dtype, size_t idx) const {
+void ModelContainer::ValidateParamDtype(AITemplateDtype dtype, size_t idx)
+    const {
   CHECK_VECTOR_ACCESS(param_dtypes_, idx)
   if (dtype != param_dtypes_[idx]) {
-    auto GetEnumString = [](auto dtype) {
-      switch (dtype) {
-        case AITemplateDtype::kUnset:
-          return "kUnset";
-        case AITemplateDtype::kHalf:
-          return "kHalf";
-        case AITemplateDtype::kFloat:
-          return "kFloat";
-        case AITemplateDtype::kInt:
-          return "kInt";
-        case AITemplateDtype::kLong:
-          return "kLong";
-        default:
-          return "unknown";
-      }
-    };
     throw std::runtime_error(
         "Got wrong dtype for param " + std::to_string(idx) + "; expected " +
         GetEnumString(param_dtypes_[idx]) + ", got " + GetEnumString(dtype));
   }
 }
 
+void ModelContainer::ValidateBoundConstantDtype(
+    AITemplateDtype dtype,
+    size_t idx) const {
+  CHECK_VECTOR_ACCESS(bound_constant_dtypes_, idx)
+  if (dtype != bound_constant_dtypes_[idx]) {
+    throw std::runtime_error(
+        "Got wrong dtype for param " + std::to_string(idx) + "; expected " +
+        GetEnumString(bound_constant_dtypes_[idx]) + ", got " +
+        GetEnumString(dtype));
+  }
+}
+
 float ModelContainer::BenchmarkImpl(
     const AITData* inputs,
     size_t num_inputs,
diff --git a/static/csrc/model_interface.cpp b/static/csrc/model_interface.cpp
index 4b86c78c5..3a7460155 100644
--- a/static/csrc/model_interface.cpp
+++ b/static/csrc/model_interface.cpp
@@ -123,6 +123,32 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
       { m->SetManyConstants(names, tensors, num_tensors); })
 }
 
+AITemplateError AITemplateModelContainerSetDoubleBufferConstant(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    const char* name,
+    const AITData* tensor) {
+  RETURN_ERROR_IF_NULL(handle)
+  RETURN_ERROR_IF_NULL(tensor)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { m->SetDoubleBufferConstant(name, *tensor, stream); })
+}
+
+AIT_EXPORT AITemplateError AITemplateModelContainerSetManyDoubleBufferConstants(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors) {
+  RETURN_ERROR_IF_NULL(handle)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { m->SetManyDoubleBufferConstants(names, tensors, num_tensors, stream); })
+}
+
 AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
     bool unbound_constants_only,
@@ -355,7 +381,24 @@ AITemplateError AITemplateModelContainerFoldConstants(
   RETURN_ERROR_IF_NULL(handle)
   auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
   auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->FoldConstants(stream, sync); })
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->FoldConstants(stream, sync, false); })
+}
+
+AITemplateError AITemplateModelContainerFoldConstantsInDoubleBuffer(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    bool sync) {
+  RETURN_ERROR_IF_NULL(handle)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->FoldConstants(stream, sync, true); })
+}
+
+AITemplateError AITemplateModelContainerSwapConstants(
+    AITemplateModelHandle handle) {
+  RETURN_ERROR_IF_NULL(handle)
+  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SwapConstants(); })
 }
 
 AITemplateError AITemplateAllocatorCreate(
diff --git a/static/include/model_container.h b/static/include/model_container.h
index 4b6e90a4e..56be22df4 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -30,6 +30,12 @@
 
 namespace ait {
 
+enum class BufferState {
+  CLEAN = 0,
+  CONSTANTS_UPDATED = 1,
+  CONSTANTS_FOLDED = 2
+};
+
 // ModelContainer inherits from this class; its implementation is
 // generated at compilation time. Most of the ModelContainer
 // logic does not need codegen; anything that does should be put
@@ -67,9 +73,20 @@ class ModelContainerBase {
   // of constant folding. The indices are guaranteed to map to the correct
   // indices in constant_folder_.
   std::vector<size_t> constant_folding_outputs_offsets_;
-
-  // a single piece of memory for all constants
-  GPUPtr constants_;
+  // Offsets here correspond to the offsets of constants for bounded constants.
+  std::vector<size_t> bound_constant_offsets_;
+
+  // size for constants_ GPUPtr
+  size_t constants_size_;
+  // Pieces of memory for holding all constants, controled by
+  // use_constants_primary_buffer_
+  GPUPtr constants_primary_;
+  GPUPtr constants_secondary_;
+  bool use_constants_primary_buffer_;
+  // State of whether SetConstants/FoldConstants was called.
+  BufferState buffer_state_;
+  // Mapping for constant names to pointer
+  std::unordered_map<std::string, const void*> model_constants_;
 
   // size of the containers below: # inputs + # outputs + # unbound constants.
   size_t num_params_;
@@ -187,6 +204,17 @@ class ModelContainer : ModelContainerBase {
       const AITData* tensors,
       size_t num_tensors);
 
+  uint8_t* GetInactiveConstantsBuffer();
+  void SetDoubleBufferConstant(
+      const char* name,
+      const AITData& tensor,
+      StreamType stream = 0);
+  void SetManyDoubleBufferConstants(
+      const char** names,
+      const AITData* tensors,
+      size_t num_tensors,
+      StreamType stream = 0);
+
   size_t NumInputs() const;
   size_t NumOutputs() const;
 
@@ -205,7 +233,8 @@ class ModelContainer : ModelContainerBase {
     return models_.size();
   }
 
-  void FoldConstants(StreamType stream, bool sync);
+  void FoldConstants(StreamType stream, bool sync, bool double_buffer = false);
+  void SwapConstants();
 
   size_t GetNumConstants(bool unbound_constants_only = true) const;
   size_t GetNumConstantFoldingInputs(bool unbound_constants_only = true) const;
@@ -221,8 +250,13 @@ class ModelContainer : ModelContainerBase {
 
  private:
   void WaitForAllModels(bool include_constant_folder = false);
-  void FoldConstantsImpl(StreamType stream);
-  void SetConstantImpl(const char* name, const AITData& tensor);
+  void FoldConstantsImpl(StreamType stream, bool double_buffer = false);
+  void SetConstantImpl(
+      const char* name,
+      const AITData& tensor,
+      bool use_secondary_buffer = false,
+      StreamType stream = 0);
+  void SwapConstantFolderBuffer();
 
   void PrepareForRun(
       Model* model,
@@ -233,7 +267,8 @@ class ModelContainer : ModelContainerBase {
 
   Model* GetAvailableModel();
   void ReclaimFinishedModels(std::unique_lock<std::mutex>& lk);
-  void ValidateDtype(AITemplateDtype dtype, size_t idx) const;
+  void ValidateParamDtype(AITemplateDtype dtype, size_t idx) const;
+  void ValidateBoundConstantDtype(AITemplateDtype dtype, size_t idx) const;
 
   float BenchmarkImpl(
       const AITData* inputs,
@@ -270,6 +305,9 @@ class ModelContainer : ModelContainerBase {
   // prevents concurrent inferences from happening while kernels are being
   // queued.
   std::shared_mutex constants_sync_mutex_;
+  // constants_double_buffer_mutex_ is separate from constants_sync_mutex since
+  // when we use double buffer, it won't affect the main model.
+  std::shared_mutex constants_double_buffer_mutex_;
 
   size_t num_inputs_;
   size_t num_outputs_;
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 7af1aff50..4cee8cf02 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -155,6 +155,19 @@ AIT_EXPORT AITemplateError AITemplateModelContainerSetManyConstants(
     const AITData* tensors,
     size_t num_tensors);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerSetDoubleBufferConstant(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    const char* name,
+    const AITData* tensor);
+
+AIT_EXPORT AITemplateError AITemplateModelContainerSetManyDoubleBufferConstants(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    const char** names,
+    const AITData* tensors,
+    size_t num_tensors);
+
 AIT_EXPORT AITemplateError AITemplateModelContainerGetNumConstants(
     AITemplateModelHandle handle,
     bool unbound_constants_only,
@@ -264,6 +277,14 @@ AIT_EXPORT AITemplateError AITemplateModelContainerFoldConstants(
     AITemplateStreamHandle stream_handle,
     bool sync);
 
+AIT_EXPORT AITemplateError AITemplateModelContainerFoldConstantsInDoubleBuffer(
+    AITemplateModelHandle handle,
+    AITemplateStreamHandle stream_handle,
+    bool sync);
+
+AIT_EXPORT AITemplateError
+AITemplateModelContainerSwapConstants(AITemplateModelHandle handle);
+
 AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
     AITemplateAllocator** allocator_out,
     AITemplateAllocatorType allocator_type);
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index 0ca1d804c..2c99d417b 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import itertools
 import unittest
 
 import torch
@@ -498,6 +499,141 @@ def test_constant_folding_output_in_middle_of_chain(self):
         mod.set_many_constants_with_tensors({"x": x_pt, "y": y_pt})
         mod.run_with_tensors([], {"x2": x2_ait, "x3": x3_ait, "x4": x4_ait})
 
+    @parameterized.expand(
+        list(
+            itertools.product(
+                [True, False],
+                [True, False],
+                [True, False],
+                [True, False],
+                [True, False],
+            )
+        )
+    )
+    def test_constant_folding_with_update(
+        self,
+        update_model_bound: bool = False,
+        update_model_unbound: bool = False,
+        update_const_folder_bound: bool = False,
+        update_const_folder_unbound: bool = False,
+        double_buffer: bool = False,
+    ):
+        input_0 = Tensor(shape=[1, 2], dtype="float16", name="input_0", is_input=True)
+        constant_0 = Tensor(shape=[1, 2], dtype="float16", name="constant_0")
+        constant_1 = Tensor(shape=[1, 2], dtype="float16", name="constant_1")
+        constant_2 = Tensor(shape=[1, 2], dtype="float16", name="constant_2")
+        constant_3 = Tensor(shape=[1, 2], dtype="float16", name="constant_3")
+        constant_4 = Tensor(shape=[1, 2], dtype="float16", name="constant_4")
+        constant_5 = Tensor(shape=[1, 2], dtype="float16", name="constant_5")
+        constant_6 = Tensor(shape=[1, 2], dtype="float16", name="constant_6")
+        model_constants = {}
+        model_unbound_constants = {}
+        const_folder_constants = {}
+        const_folder_unbound_constants = {}
+
+        # constant 0/1/2 are not folded.
+        # constant 0 is unbounded, constant 1/2 is bounded.
+        x = ops.elementwise(FuncEnum.MUL)(input_0, constant_0)
+        x1 = ops.concatenate()([x, constant_1, constant_2])
+        model_constants["constant_1"] = get_random_torch_tensor((1, 2), "float16")
+        model_constants["constant_2"] = get_random_torch_tensor((1, 2), "float16")
+        model_unbound_constants["constant_0"] = get_random_torch_tensor(
+            (1, 2), "float16"
+        )
+
+        # constants 3/4/5/6 are folded.
+        # constants 3/4 are unbounded, constants 5/6 is bounded.
+        y = ops.elementwise(FuncEnum.MUL)(constant_3, constant_4)
+        y1 = ops.concatenate()([y, constant_5, constant_6])
+        const_folder_unbound_constants["constant_3"] = get_random_torch_tensor(
+            (1, 2), "float16"
+        )
+        const_folder_unbound_constants["constant_4"] = get_random_torch_tensor(
+            (1, 2), "float16"
+        )
+        const_folder_constants["constant_5"] = get_random_torch_tensor(
+            (1, 2), "float16"
+        )
+        const_folder_constants["constant_6"] = get_random_torch_tensor(
+            (1, 2), "float16"
+        )
+
+        output = ops.elementwise(FuncEnum.MUL)(x1, y1)
+        output._attrs["name"] = "output"
+        output._attrs["is_output"] = True
+
+        bound_constants = dict(model_constants, **const_folder_constants)
+        unbound_constants = dict(
+            model_unbound_constants, **const_folder_unbound_constants
+        )
+        mod = compile_model(
+            output,
+            detect_target(),
+            "./tmp",
+            f"test_constant_folding_{update_model_bound}_{update_model_unbound}_{update_const_folder_bound}_{update_const_folder_unbound}_{double_buffer}",
+            constants=bound_constants,
+        )
+
+        inp0_pt = get_random_torch_tensor((1, 2), "float16")
+
+        def _get_output(new_bound_constants, new_unbound_constants):
+            x_pt = inp0_pt * new_unbound_constants["constant_0"]
+            x1_pt = torch.cat(
+                (
+                    x_pt,
+                    new_bound_constants["constant_1"],
+                    new_bound_constants["constant_2"],
+                )
+            )
+            y = (
+                new_unbound_constants["constant_3"]
+                * new_unbound_constants["constant_4"]
+            )
+            y1_pt = torch.cat(
+                (
+                    y,
+                    new_bound_constants["constant_5"],
+                    new_bound_constants["constant_6"],
+                )
+            )
+            output_pt = x1_pt * y1_pt
+            return output_pt
+
+        output_pt = _get_output(bound_constants, unbound_constants)
+        output_ait = torch.empty_like(output_pt)
+        mod.set_many_constants_with_tensors(unbound_constants)
+        mod.run_with_tensors({"input_0": inp0_pt}, {"output": output_ait})
+        self.assertTrue(torch.equal(output_pt, output_ait))
+
+        new_bound_constants = bound_constants
+        new_unbound_constants = unbound_constants
+        if update_model_bound:
+            for k in model_constants.keys():
+                new_bound_constants[k] = get_random_torch_tensor((1, 2), "float16")
+        if update_model_unbound:
+            for k in model_unbound_constants.keys():
+                new_unbound_constants[k] = get_random_torch_tensor((1, 2), "float16")
+
+        if update_const_folder_bound:
+            for k in const_folder_constants.keys():
+                new_bound_constants[k] = get_random_torch_tensor((1, 2), "float16")
+        if update_const_folder_unbound:
+            for k in const_folder_unbound_constants.keys():
+                new_unbound_constants[k] = get_random_torch_tensor((1, 2), "float16")
+
+        if double_buffer:
+            mod.set_many_double_buffer_constants_with_tensors(new_bound_constants)
+            mod.set_many_double_buffer_constants_with_tensors(new_unbound_constants)
+        else:
+            mod.set_many_constants_with_tensors(new_bound_constants)
+            mod.set_many_constants_with_tensors(new_unbound_constants)
+        mod.fold_constants(double_buffer=double_buffer)
+        if double_buffer:
+            mod.swap_constants()
+        mod.run_with_tensors({"input_0": inp0_pt}, {"output": output_ait})
+        output_pt = _get_output(new_bound_constants, new_unbound_constants)
+        self.assertTrue(torch.equal(output_pt, output_ait))
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 2f8eec7993119383004ac7c63ce73ce4653f7679 Mon Sep 17 00:00:00 2001
From: Zhiwei Zhao <zhiweiz@meta.com>
Date: Wed, 15 Feb 2023 11:59:56 -0800
Subject: [PATCH 135/638] Add set and swap constants functionality to
 AITModelImpl (#263)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/263

Reviewed By: khabinov

Differential Revision: D42790812

fbshipit-source-id: 004716dbea5afc4c95a556e924d4a9f7dabed02d
---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 127 ++++++++++++++++++++++++++--
 fx2ait/fx2ait/csrc/AITModelImpl.h   |  20 +++++
 2 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 9325d0011..2bd041d7b 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -166,6 +166,23 @@ AITModelImpl::AITModelImpl(
       "AITemplateModelContainerGetMaximumOutputShape");
   LOAD_SYMBOL(getOutputDtypeFunc_, "AITemplateModelContainerGetOutputDtype");
 
+  // It's possible that these functions are not loaded in .so file.
+  // Making these function possible to load as nullptr and check when using.
+  // Once all relevant packages have been updated, we can just use
+  // LOAD_SYMBOL.
+  LOAD_SYMBOL_WARN(
+      setManyConstantsDoubleBufferFunc_,
+      "AITemplateModelContainerSetManyDoubleBufferConstants");
+  LOAD_SYMBOL_WARN(foldConstantsFunc_, "AITemplateModelContainerFoldConstants");
+  LOAD_SYMBOL_WARN(
+      getConstantNamesFunc_, "AITemplateModelContainerGetConstantNames");
+  LOAD_SYMBOL_WARN(
+      getNumConstantsFunc_, "AITemplateModelContainerGetNumConstants");
+  LOAD_SYMBOL_WARN(swapConstantsFunc_, "AITemplateModelContainerSwapConstants");
+  LOAD_SYMBOL_WARN(
+      foldConstantsDoubleBufferFunc_,
+      "AITemplateModelContainerFoldConstantsInDoubleBuffer");
+
   // It's possible that we have new field added in AITemplateModelContainer,
   // But we can be using a new AITModel to load an old AITemplateModelContainer.
   // The newly added method are usually non-critical, so we issue warning
@@ -183,17 +200,14 @@ AITModelImpl::AITModelImpl(
   LOAD_SYMBOL(getNumInputsFunc, "AITemplateModelContainerGetNumInputs");
   LOAD_SYMBOL(getNumOutputsFunc, "AITemplateModelContainerGetNumOutputs");
 #undef LOAD_SYMBOL
-  // TODO: this load is optional so we don't break backwards comptability.
-  // Once all relevant packages have been updated, we can just use
-  // LOAD_SYMBOL.
-  auto* foldConstantsFunc =
-      reinterpret_cast<decltype(&AITemplateModelContainerFoldConstants)>(
-          dlsym(handle_.get(), "AITemplateModelContainerFoldConstants"));
 
   AITCallCreate(createFunc, &model_handle_, num_runtimes, &allocator_);
 
-  if (foldConstantsFunc != nullptr) {
-    AIT_CHECK(foldConstantsFunc(
+  // TODO: this check is optional so we don't break backwards comptability.
+  // Once all relevant packages have been updated, we can just use
+  // LOAD_SYMBOL.
+  if (foldConstantsFunc_ != nullptr) {
+    AIT_CHECK(foldConstantsFunc_(
         model_handle_,
         /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(creation_stream),
         /*sync=*/true));
@@ -250,6 +264,14 @@ AITemplateDtype TorchDtypeToAITemplateDtype(at::ScalarType torch_dtype) {
       TORCH_CHECK(false, "Unknown or unsupported torch dtype");
   }
 }
+
+AITData torchToAitData(const torch::Tensor& tensor) {
+  return AITData{
+      tensor.data_ptr(),
+      AITemplateParamShape{tensor.sizes().data(), tensor.sizes().size()},
+      TorchDtypeToAITemplateDtype(tensor.scalar_type())};
+}
+
 } // namespace
 
 void AITModelImpl::allocateOutputs(
@@ -481,6 +503,8 @@ void AITModelImpl::profile(
 thread_local std::unordered_map<std::string, std::string>
     AITModelImpl::name_to_path_map_;
 
+thread_local bool AITModelImpl::deserialize_pickled_model_{true};
+
 void AITModelImpl::registerLibraryNameToPathMap(
     std::unordered_map<std::string, std::string> map) {
   std::ostringstream ss;
@@ -519,4 +543,91 @@ const std::string& AITModelImpl::getFullPathForLibraryName(
       ss.str());
   return *path;
 }
+
+bool AITModelImpl::getDeserializePickledModel() {
+  return deserialize_pickled_model_;
+}
+
+// Set thread local boolean to disable real loading from .so file
+// for reusing the same module later on
+void AITModelImpl::setDeserializePickledModel(bool deserializePickledModel) {
+  deserialize_pickled_model_ = deserializePickledModel;
+}
+
+// Function to update constants in place with double buffering as well as fold
+// constants. The weights supplied must be the exact same number of the current
+// contants loaded in the AITModel. This call should only set the unused buffer
+// in the model for both direct used constants and folded constants. The weights
+// will not take effect until swapConstants is being called
+void AITModelImpl::updateConstantsWithWeights(
+    const std::unordered_map<std::string, torch::Tensor>& weights) {
+  TORCH_CHECK(
+      getNumConstantsFunc_,
+      "getNumConstantsFunc_ not loaded, can not do in place update");
+  TORCH_CHECK(
+      getConstantNamesFunc_,
+      "getConstantNamesFunc_ not loaded, can not do in place update");
+  TORCH_CHECK(
+      setManyConstantsDoubleBufferFunc_,
+      "setManyConstantsDoubleBufferFunc_ not loaded, can not do in place update");
+  TORCH_CHECK(
+      foldConstantsDoubleBufferFunc_,
+      "foldConstantsDoubleBufferFunc_ not loaded, can not do in place update");
+  VLOG(1) << "AITModelImpl in place update for weights";
+  const auto numConstants =
+      AITCall(getNumConstantsFunc_, model_handle_, false, false);
+  TORCH_CHECK(
+      numConstants == weights.size(),
+      "Number of constants loaded ",
+      numConstants,
+      " mismatched with number of new constants provided ",
+      weights.size());
+  std::vector<const char*> constantNames(numConstants, nullptr);
+  AIT_CHECK(
+      getConstantNamesFunc_(model_handle_, false, false, constantNames.data()));
+  std::vector<AITData> constants;
+  // TODO: Add check from caller side to make sure the weights are matched with
+  // loaded constants for sizes and shapes
+  for (const auto& name : constantNames) {
+    auto it = weights.find(name);
+    TORCH_CHECK(
+        it != weights.end(),
+        "could not find the constant named ",
+        name,
+        " in predictor supplied weights, ",
+        "failing this round of weight update");
+    constants.emplace_back(torchToAitData(it->second));
+  }
+  cudaStream_t constants_stream;
+  TORCH_CHECK(
+      cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
+      cudaSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<cudaStream_t>,
+      decltype(&cudaStreamDestroy)>;
+  StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
+  AIT_CHECK(setManyConstantsDoubleBufferFunc_(
+      model_handle_,
+      /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(constants_stream),
+      constantNames.data(),
+      constants.data(),
+      numConstants));
+  VLOG(1) << "Completed on setting constants in double buffers";
+  AIT_CHECK(foldConstantsDoubleBufferFunc_(
+      model_handle_,
+      /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(constants_stream),
+      /*sync=*/true));
+  VLOG(1) << "Completed the constants folding process in double buffering";
+}
+
+// Swap the constants stored in the double bufferings for both model level and
+// folded constants, this will take effect immediately to make this AITModel run
+// with new weights
+void AITModelImpl::swapConstants() {
+  TORCH_CHECK(
+      swapConstantsFunc_,
+      "swapConstantsFunc_ not loaded, can not do in place update");
+  AIT_CHECK(swapConstantsFunc_(model_handle_));
+}
 } // namespace torch::aitemplate
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.h b/fx2ait/fx2ait/csrc/AITModelImpl.h
index 14f992422..6b78d735f 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.h
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.h
@@ -57,6 +57,10 @@ class AITModelImpl {
 
   static const std::string& getFullPathForLibraryName(const std::string& name);
 
+  static bool getDeserializePickledModel();
+
+  static void setDeserializePickledModel(bool deserializePickledModel);
+
   /*
    * Returns a path to .so file (either relative or absolute).
    */
@@ -92,10 +96,16 @@ class AITModelImpl {
     return floating_point_output_dtype_;
   }
 
+  void updateConstantsWithWeights(
+      const std::unordered_map<std::string, torch::Tensor>& weights);
+
+  void swapConstants();
+
  private:
   // @lint-ignore CLANGTIDY facebook-hte-NonPodStaticDeclaration
   static thread_local std::unordered_map<std::string, std::string>
       name_to_path_map_;
+  static thread_local bool deserialize_pickled_model_;
 
   struct DlcloseDeleter {
     void operator()(void* p) const {
@@ -133,6 +143,16 @@ class AITModelImpl {
       getMaximumOutputShapeFunc_ = nullptr;
   decltype(&AITemplateModelContainerGetOutputDtype) getOutputDtypeFunc_ =
       nullptr;
+  decltype(&AITemplateModelContainerSetManyDoubleBufferConstants)
+      setManyConstantsDoubleBufferFunc_ = nullptr;
+  decltype(&AITemplateModelContainerFoldConstants) foldConstantsFunc_ = nullptr;
+  decltype(&AITemplateModelContainerGetConstantNames) getConstantNamesFunc_ =
+      nullptr;
+  decltype(&AITemplateModelContainerGetNumConstants) getNumConstantsFunc_ =
+      nullptr;
+  decltype(&AITemplateModelContainerSwapConstants) swapConstantsFunc_ = nullptr;
+  decltype(&AITemplateModelContainerFoldConstantsInDoubleBuffer)
+      foldConstantsDoubleBufferFunc_ = nullptr;
 
   const std::string library_basename_;
   const std::string library_path_;

From a4f2a9d0a20156b08c310601de90931491ecc9e3 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Wed, 15 Feb 2023 15:16:36 -0800
Subject: [PATCH 136/638] add ndhwc3to8 padding (#269)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/269

Add ndhwc3to8 op so the conv3d can use it directly.

Reviewed By: frank-wei, terrychenism

Differential Revision: D43266195

fbshipit-source-id: ec45cb3466af26f912d34679f1f171fe90651b62
---
 .../backend/cuda/padding/__init__.py          |   4 +-
 .../backend/cuda/padding/ndhwc3to8.py         | 252 ++++++++++++++++++
 .../compiler/ops/padding/__init__.py          |   3 +-
 .../compiler/ops/padding/ndhwc3to8.py         | 133 +++++++++
 python/aitemplate/frontend/nn/padding.py      |  15 +-
 tests/unittest/ops/test_ndhwc3to8.py          |  57 ++++
 6 files changed, 460 insertions(+), 4 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/padding/ndhwc3to8.py
 create mode 100644 python/aitemplate/compiler/ops/padding/ndhwc3to8.py
 create mode 100644 tests/unittest/ops/test_ndhwc3to8.py

diff --git a/python/aitemplate/backend/cuda/padding/__init__.py b/python/aitemplate/backend/cuda/padding/__init__.py
index 455e327d6..37bb6eedb 100644
--- a/python/aitemplate/backend/cuda/padding/__init__.py
+++ b/python/aitemplate/backend/cuda/padding/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA padding init
 """
-from . import nhwc3to4, nhwc3to8, pad_last_dim
+from . import ndhwc3to8, nhwc3to4, nhwc3to8, pad_last_dim
 
-__all__ = ["nhwc3to8", "pad_last_dim", "nhwc3to4"]
+__all__ = ["ndhwc3to8", "nhwc3to8", "pad_last_dim", "nhwc3to4"]
diff --git a/python/aitemplate/backend/cuda/padding/ndhwc3to8.py b/python/aitemplate/backend/cuda/padding/ndhwc3to8.py
new file mode 100644
index 000000000..6aaca1218
--- /dev/null
+++ b/python/aitemplate/backend/cuda/padding/ndhwc3to8.py
@@ -0,0 +1,252 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA codegen for ndhwc3to8 op
+"""
+import jinja2
+
+from ... import registry
+from ...backend_spec import CUDASpec
+
+# pylint: disable=C0301,W0613,W0612
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  void*,
+  void*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  int64_t*,
+  cudaStream_t
+);
+"""
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{in_ptr}},
+{{indent}}    {{out_ptr}},
+{{indent}}    {{p_batch}},
+{{indent}}    {{p_in_d}},
+{{indent}}    {{p_in_h}},
+{{indent}}    {{p_in_w}},
+{{indent}}    {{p_out_batch}},
+{{indent}}    {{p_out_d}},
+{{indent}}    {{p_out_h}},
+{{indent}}    {{p_out_w}},
+{{indent}}    stream
+{{indent}});
+"""
+)
+
+
+EXEC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}ndhwc3to8_launcher<{{elem_input_type}}>(
+{{indent}}    static_cast<const {{elem_input_type}}*>(in_ptr),
+{{indent}}    static_cast<{{elem_input_type}}*>(out_ptr),
+{{indent}}    NI,
+{{indent}}    DI,
+{{indent}}    HI,
+{{indent}}    WI,
+{{indent}}    stream
+{{indent}});
+{{indent}}return;
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/util/host_tensor.h"
+
+// load 128 bit every time (n ElemT = 4 float)
+// use as many as thread with factor of 3:
+// each time load num_thread * n ElemT = num_thread / 3 * n ElemT * 3ch ->
+// num_thread / 3 * n ElemT * n ElemT ch
+
+template<typename ElemT, int num_thread>
+__global__ void ndhwc3to8_kernel(const float4* input,
+                                float4* output,
+                                const int NI,
+                                const int DI,
+                                const int HI,
+                                const int WI,
+                                const int max_in_elements,
+                                const int max_out_elements) {
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
+  __shared__ float4 shared_mem[num_thread];
+  const int out_offset = num_thread * num_elem_t_in_float4 / 3;
+  const float4 zero4 = {0.0f, 0.0f, 0.0f, 0.0f};
+  const ElemT zero = static_cast<ElemT>(0.f);
+  const int in_idx = blockIdx.x * num_thread + threadIdx.x;
+  const int tid = threadIdx.x;
+
+  shared_mem[tid] = in_idx >= max_in_elements ? zero4 : __ldg(input + in_idx);
+  __syncthreads();
+
+  const int out_start_idx = blockIdx.x * out_offset;
+  const int boundary = out_start_idx + out_offset > max_out_elements ? max_out_elements : out_start_idx + out_offset;
+  for (int i = out_start_idx + tid, j = tid; i < boundary; i += num_thread, j += num_thread) {
+    const ElemT* smem_element = (const ElemT*)shared_mem + j * 3;
+    ElemT tmp[num_elem_t_in_float4];
+
+    #pragma unroll
+    for (int k = 0; k < num_elem_t_in_float4; ++k) {
+      tmp[k] = k < 3 ? smem_element[k] : zero;
+    }
+    output[i] = *((const float4*)tmp);
+  }
+}
+
+template <typename ElemT>
+void ndhwc3to8_launcher(const ElemT* in_ptr,
+                       ElemT* out_ptr,
+                       int NI,
+                       int DI,
+                       int HI,
+                       int WI,
+                       cudaStream_t stream) {
+  constexpr int num_elem_t_in_float4 = sizeof(float4) / sizeof(ElemT);
+  constexpr int nthread = 240;
+  const int NDHW = NI * DI * HI * WI;
+  if (NDHW % num_elem_t_in_float4 != 0) {
+    throw std::runtime_error(
+        "NDHW (" + std::to_string(NDHW) + ") mod num_elem_t_in_float4 (" +
+        std::to_string(num_elem_t_in_float4) + ") is not 0"
+    );
+  }
+  static_assert(nthread % 3 == 0);
+  const int max_in_elements = NDHW * 3 / num_elem_t_in_float4;
+  const int max_out_elements = NDHW * num_elem_t_in_float4 / num_elem_t_in_float4;
+  dim3 thread_block(nthread);
+  dim3 grid((NDHW * 3 + nthread * num_elem_t_in_float4 -1) / (nthread * num_elem_t_in_float4));
+  ndhwc3to8_kernel<ElemT, nthread><<<grid, thread_block, 0, stream>>>(
+    (const float4*)in_ptr,
+    (float4*) out_ptr,
+    NI,
+    DI,
+    HI,
+    WI,
+    max_in_elements,
+    max_out_elements
+  );
+}
+
+void {{function_name}} (
+    void* in_ptr,
+    void* out_ptr,
+    int64_t* batch,
+    int64_t* in_d,
+    int64_t* in_h,
+    int64_t* in_w,
+    int64_t* out_batch,
+    int64_t* out_d,
+    int64_t* out_h,
+    int64_t* out_w,
+    cudaStream_t stream
+) {
+  {{shape_function}}
+  {{exec_paths}}
+}
+
+"""
+)
+
+
+@registry.reg("cuda.ndhwc3to8.gen_function")
+def gen_function(func_attrs, template_path, shape_eval_template, shape_save_template):
+    """
+
+    Parameters
+    ----------
+    func_attrs : [type]
+        [description]
+    template_path : [type]
+        [description]
+    shape_eval_template : [type]
+        [description]
+    shape_save_template : [type]
+        [description]
+
+    Returns
+    -------
+    [type]
+        [description]
+    """
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    shape_eval_func = shape_eval_template.render(
+        indent="  ",
+        dtype="int64_t ",
+        x_dim0="*batch",
+        x_dim1="*in_d",
+        x_dim2="*in_h",
+        x_dim3="*in_w",
+    )
+    shape_save_func = shape_save_template.render(
+        indent="  ",
+        y_dim0="*out_batch",
+        y_dim1="*out_d",
+        y_dim2="*out_h",
+        y_dim3="*out_w",
+    )
+    shape_func = shape_eval_func + shape_save_func
+    exec_paths = EXEC_TEMPLATE.render(elem_input_type=elem_input_type)
+    return SRC_TEMPLATE.render(
+        function_name=func_name,
+        elem_input_type=elem_input_type,
+        shape_function=shape_func,
+        exec_paths=exec_paths,
+    )
+
+
+@registry.reg("cuda.ndhwc3to8.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return FUNC_DECL_TEMPLATE.render(func_name=func_name)
+
+
+@registry.reg("cuda.ndhwc3to8.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    x = func_attrs["inputs"][0]
+    xshape = x._attrs["shape"]
+    y = func_attrs["outputs"][0]
+    yshape = y._attrs["shape"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        in_ptr=x._attrs["name"],
+        out_ptr=y._attrs["name"],
+        p_batch="&" + xshape[0]._attrs["name"],
+        p_in_d="&" + xshape[1]._attrs["name"],
+        p_in_h="&" + xshape[2]._attrs["name"],
+        p_in_w="&" + xshape[3]._attrs["name"],
+        p_out_batch="&" + yshape[0]._attrs["name"],
+        p_out_d="&" + yshape[1]._attrs["name"],
+        p_out_h="&" + yshape[2]._attrs["name"],
+        p_out_w="&" + yshape[3]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/padding/__init__.py b/python/aitemplate/compiler/ops/padding/__init__.py
index c3b9b2f3c..10518e995 100644
--- a/python/aitemplate/compiler/ops/padding/__init__.py
+++ b/python/aitemplate/compiler/ops/padding/__init__.py
@@ -15,9 +15,10 @@
 """
 Padding ops module init.
 """
+from .ndhwc3to8 import ndhwc3to8
 from .nhwc3to4 import nhwc3to4
 from .nhwc3to8 import nhwc3to8
 from .pad_last_dim import pad_last_dim
 
 
-__all__ = ["nhwc3to8", "nhwc3to4", "pad_last_dim"]
+__all__ = ["ndhwc3to8", "nhwc3to8", "nhwc3to4", "pad_last_dim"]
diff --git a/python/aitemplate/compiler/ops/padding/ndhwc3to8.py b/python/aitemplate/compiler/ops/padding/ndhwc3to8.py
new file mode 100644
index 000000000..908f3858f
--- /dev/null
+++ b/python/aitemplate/compiler/ops/padding/ndhwc3to8.py
@@ -0,0 +1,133 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Common NDHWC3to8 padding op
+"""
+import itertools
+from typing import List
+
+import jinja2
+
+from .... import backend
+from ....backend import registry
+from ....utils import shape_utils
+from ...base import Operator, Tensor
+
+# pylint: disable=C0103,W0221
+
+
+SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{y_dim0}} = NO;
+{{indent}}{{y_dim1}} = DO;
+{{indent}}{{y_dim2}} = HO;
+{{indent}}{{y_dim3}} = WO;
+"""
+)
+
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}DI = {{x_dim1}};
+{{indent}}{{dtype}}HI = {{x_dim2}};
+{{indent}}{{dtype}}WI = {{x_dim3}};
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}DO = DI;
+{{indent}}{{dtype}}HO = HI;
+{{indent}}{{dtype}}WO = WI;
+{{indent}}{{dtype}}CO = 8;
+"""
+)
+
+
+class ndhwc3to8(Operator):
+    """
+    Pad the 3-channel input data to 8-channel.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "ndhwc3to8"
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
+        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
+
+    def _infer_shape(self, x: List[int]):
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+            x_dim4=x[4],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["DO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[4] for d in y_shapes])),
+        ]
+        return output_shape
+
+    def __call__(self, x: Tensor) -> List[Tensor]:
+        self._attrs["inputs"] = [x]
+        self._set_depth()
+        output_shape = self._infer_shapes(x)
+        output = Tensor(output_shape, src_ops={self}, dtype=x.dtype())
+        self._attrs["outputs"] = [output]
+        return output
+
+    def _get_op_attributes(self):
+        return {
+            "padded_channels": self._attrs["op"].split("to")[-1],
+            "shape_func_template": self.shape_eval_template,
+        }
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        template_path = target.template_path()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
+            template_path,
+            self.shape_eval_template,
+            self.shape_save_template,
+        )
diff --git a/python/aitemplate/frontend/nn/padding.py b/python/aitemplate/frontend/nn/padding.py
index c1a6efb6d..c5294f5ad 100644
--- a/python/aitemplate/frontend/nn/padding.py
+++ b/python/aitemplate/frontend/nn/padding.py
@@ -15,7 +15,7 @@
 """
 Padding related modules.
 """
-from ...compiler.ops import nhwc3to8
+from ...compiler.ops import ndhwc3to8, nhwc3to8
 from .module import Module
 
 
@@ -30,3 +30,16 @@ def forward(self, *args):
         assert len(args) == 1
         x = args[0]
         return self.op(x)
+
+
+class Ndhwc3to8(Module):
+    r"""Pads the input data with ndhwc dimensions from 3 channels to 8 channels"""
+
+    def __init__(self):
+        super().__init__()
+        self.op = ndhwc3to8()
+
+    def forward(self, *args):
+        assert len(args) == 1
+        x = args[0]
+        return self.op(x)
diff --git a/tests/unittest/ops/test_ndhwc3to8.py b/tests/unittest/ops/test_ndhwc3to8.py
new file mode 100644
index 000000000..dc13948bb
--- /dev/null
+++ b/tests/unittest/ops/test_ndhwc3to8.py
@@ -0,0 +1,57 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import numpy as np
+import torch
+from aitemplate.compiler import compile_model
+
+from aitemplate.frontend import IntVar, nn, Tensor
+from aitemplate.testing import detect_target
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class Ndhcw3To8TestCase(unittest.TestCase):
+    def test_ndhcw3to8_fp16(self):
+        target = detect_target()
+        batch_size = [1, 3]
+        if target.name() == "rocm":
+            return True
+        X = Tensor(
+            shape=[IntVar(values=batch_size, name="input_batch"), 4, 224, 224, 3],
+            dtype="float16",
+            name="input_0",
+            is_input=True,
+        )
+        OP = nn.Ndhwc3to8()
+        Y = OP(X)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "ndhwc3to8")
+        for batch in batch_size:
+            X_np = np.random.uniform(-1, 1, (batch, 4, 224, 224, 3)).astype("float16")
+            Y_np = np.zeros((batch, 4, 224, 224, 8)).astype("float16")
+            Y_np[:, :, :, :, 0] = X_np[:, :, :, :, 0]
+            Y_np[:, :, :, :, 1] = X_np[:, :, :, :, 1]
+            Y_np[:, :, :, :, 2] = X_np[:, :, :, :, 2]
+            Y_pt = torch.from_numpy(Y_np).cuda()
+            X_pt = torch.from_numpy(X_np).cuda()
+            y = torch.empty([batch, 4, 224, 224, 8]).cuda().half()
+            module.run_with_tensors([X_pt], [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+
+if __name__ == "__main__":
+    unittest.main()

From e037455704f8327f20ce31b0f7333061f12a110e Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Wed, 15 Feb 2023 15:16:36 -0800
Subject: [PATCH 137/638] add a unit test for conv3d with bias padding input
 channels from 3 to 8 (#270)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/270

Proof the concept that we can use zero paddings (pad input channels from index `3` to `7`) for conv3d with bias.

Reviewed By: frank-wei, terrychenism

Differential Revision: D43267930

fbshipit-source-id: b6f4bbc77650464cd0f984a2b4ef18772f490f50
---
 tests/unittest/ops/test_conv3d.py | 62 +++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
index bc88b367c..974428de9 100644
--- a/tests/unittest/ops/test_conv3d.py
+++ b/tests/unittest/ops/test_conv3d.py
@@ -24,6 +24,68 @@
 
 @unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
 class Conv3dTestCase(unittest.TestCase):
+    def test_conv3d_bias_padding(
+        self,
+    ):
+        target = detect_target()
+        tt = 4
+        hh = 224
+        ww = 224
+        ci = 3
+        co = 96
+        kt = 3
+        kh = 5
+        kw = 5
+        stride = (2, 4, 4)
+        pad = (1, 2, 2)
+        dtype = "float16"
+
+        X = Tensor(
+            shape=[IntImm(4), tt, hh, ww, ci],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[co, kt, kh, kw, ci],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+
+        B = Tensor(
+            shape=[co],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+
+        Y = ops.conv3d_bias(stride=stride, pad=pad, dilate=1)(
+            ops.ndhwc3to8()(X), ops.ndhwc3to8()(W), B
+        )
+
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "conv3d_has_bias")
+
+        X_pt = get_random_torch_tensor([4, ci, tt, hh, ww], dtype=dtype)
+        W_pt = get_random_torch_tensor([co, ci, kt, kh, kw], dtype=dtype)
+        B_pt = get_random_torch_tensor([co], dtype=dtype)
+
+        Y_pt = torch.nn.functional.conv3d(
+            X_pt, W_pt, bias=B_pt, stride=stride, padding=pad
+        )
+        x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        w = W_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
+        inputs = {"input_0": x, "input_1": w}
+        inputs["input_2"] = B_pt
+
+        module.run_with_tensors(inputs, [y])
+        y_transpose = y.permute((0, 4, 1, 2, 3))
+
+        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+
     def _test_conv3d(
         self,
         tt,

From 034300db7007cd6e7a7535aab4a7d8f4091a632d Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Wed, 15 Feb 2023 15:16:36 -0800
Subject: [PATCH 138/638] add conv3d_bias (#245)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/245

Add the conv3d_bias in fx2ait. Now it only supports `groups == 1`

Reviewed By: mortzur, terrychenism

Differential Revision: D43200517

fbshipit-source-id: 2174215b183b94d596449fc32bc1c7910bc30a30
---
 fx2ait/fx2ait/converters/ait_converters.py    | 34 ++++++++++++++-----
 .../fx2ait/test/converters/test_ait_conv3d.py | 28 +++++++++++++++
 python/aitemplate/backend/codegen.py          |  2 +-
 python/aitemplate/compiler/public/__init__.py |  3 +-
 4 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 72925d08e..acd2f77e4 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -29,6 +29,7 @@
     conv2d,
     conv2d_bias,
     conv3d,
+    conv3d_bias,
     depthwise_conv3d,
     dynamic_slice,
     elementwise,
@@ -44,6 +45,7 @@
     IntVarTensor,
     layernorm,
     max_pool2d,
+    ndhwc3to8,
     nhwc3to8,
     pad_last_dim,
     permute,
@@ -1263,18 +1265,34 @@ def _choose_conv3d_op(
     Helper to choose conv3d vs. depthwise_conv3d op based on existence of bias
     and groups
     """
-    if bias is not None:
-        assert (
-            groups == weight._attrs["shape"][0].value()
-        ), f"Currently only support channel == groups, but got channel: {weight._attrs['shape'][0].value()} and groups: {groups}"
+    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 4, 1)
+    weight._attrs["shape"] = ncdhw2ndhwc(weight._attrs["shape"])
+
+    if groups is None or groups == 1:
+        if bias is not None:
+            C_in = x.shape()[-1].value()
+
+            if 3 == C_in:
+                x = ndhwc3to8()(x)
+                weight = ndhwc3to8()(weight)
+            elif 8 != C_in:
+                raise RuntimeError(
+                    "When having bias, conv3d currently only supports C_in == 3 or C_in == 8"
+                )
+
+            return conv3d_bias(stride=stride, pad=pad, dilate=dilate, group=groups)(
+                x, weight, bias
+            )
+        else:
+            return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+    elif groups == weight._attrs["shape"][0].value():
         return depthwise_conv3d(
             stride=stride, pad=pad, dilate=dilate, group=groups, bias=True
         )(x, weight, bias)
     else:
-        assert (
-            groups is None or groups == 1
-        ), "Currently only support non-bias conv3d without groups"
-        return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+        raise RuntimeError(
+            "When having bias, currently either support channel == groups or groups == 1"
+        )
 
 
 @ait_converter(acc_ops.conv3d)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index 0d94f4b09..79843e597 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -131,6 +131,34 @@ class TestAitConv3d(AITTestCase):
                 w=224,
                 bias=False,
             ),
+            param(
+                name="conv3d_bias",
+                kernel_size=(3, 5, 5),
+                stride=(2, 4, 4),
+                padding=(1, 2, 2),
+                dilation=1,
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=True,
+            ),
+            param(
+                name="conv3d_bias_ndhwc3to8",
+                kernel_size=(5, 5, 5),
+                stride=(2, 4, 4),
+                padding=(1, 2, 2),
+                dilation=1,
+                ci=3,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=True,
+            ),
         ]
     )
     def test_conv3d(
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 904a278ca..476ed2c5a 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -553,7 +553,7 @@ def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(set_value(name, view._attrs["name"]))
             return
         is_view = view is not None
-        if is_view:
+        if is_view and len(self.param_name_to_ptr_idx) > 0:
             ptr_idx = self.param_name_to_ptr_idx[view._attrs["name"]]
             self.set_inputs.append(set_value(name, view._attrs["name"]))
         else:
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index a6caf8b35..3f5e77761 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -58,6 +58,7 @@
 from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
 from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
 from aitemplate.compiler.ops.conv.conv3d import conv3d
+from aitemplate.compiler.ops.conv.conv3d_bias import conv3d_bias
 from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
@@ -67,7 +68,7 @@
     group_layernorm_sigmoid_mul,
 )
 from aitemplate.compiler.ops.layernorm.layernorm import layernorm
-from aitemplate.compiler.ops.padding import nhwc3to8, pad_last_dim
+from aitemplate.compiler.ops.padding import ndhwc3to8, nhwc3to8, pad_last_dim
 from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
 from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 from aitemplate.compiler.ops.softmax.softmax import softmax

From 03597e4187162cfe5882fb7255900fcba4bc7b8c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Wed, 15 Feb 2023 15:26:48 -0800
Subject: [PATCH 139/638] Added an environment variable
 AIT_FORCE_PROFILER_CACHE (#255)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/255

When we update our cache versions or make some changes, we may generate
a different cache key for a problem size from the previous version.
Such cache misses can result in re-compiling profilers and re-performing
profiling.

We introduce a new environment variable AIT_FORCE_PROFILER_CACHE, which
may be used to ensure such cache missed to not be overlooked.

Reviewed By: aakhundov

Differential Revision: D43242228

fbshipit-source-id: 038b221fb9fbcde17f7973c3ea4d6b13a32b0e85
---
 python/aitemplate/compiler/ops/conv/conv2d.py |  29 ++-
 python/aitemplate/compiler/ops/conv/conv3d.py |  27 ++-
 .../compiler/ops/conv/conv3d_bias.py          |  27 ++-
 .../ops/gemm_universal/gemm_common.py         |  27 ++-
 python/aitemplate/utils/environ.py            |  24 +-
 .../ops/test_conv3d_profiler_cache.py         | 212 ++++++++++++------
 .../unittest/ops/test_conv_profiler_cache.py  | 212 ++++++++++++------
 .../unittest/ops/test_gemm_profiler_cache.py  | 192 +++++++++++-----
 8 files changed, 524 insertions(+), 226 deletions(-)

diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 38316ce25..549cbc454 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -29,7 +29,7 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import alignment, shape_utils
+from ....utils import alignment, environ, shape_utils
 from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
 from .cache_entry import ConvQueryEntry, ConvRecordEntry
 from .conv_common import (
@@ -316,10 +316,19 @@ def _should_build_profiler(self) -> bool:
         entry for this conv instance, we update this conv op's
         relevant attributes with the cached result and return False.
         """
+        force_cache = environ.force_profiler_cache()
         if self._has_dynamic_input_dims():
+            if force_cache:
+                raise RuntimeError(
+                    "We cannot force to use the cache as dynamic dims require "
+                    "us to generate and build the profilers"
+                )
             # If there are dynamic dims, we'll have to generate and build the
             # profilers, as the binaries will be needed for dynamic profiling.
             return True
+        # We are forced to use the cache so we skip building profilers.
+        if force_cache:
+            return False
 
         target = backend.target.Target.current()
         workloads = list(self._attrs["exec_path"].keys())
@@ -426,7 +435,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         command = [str(x) for x in cmd]
         return command
 
-    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+    def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cache):
         target = backend.target.Target.current()
         # query cache
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
@@ -457,13 +466,19 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         if cache_value is not None and not target.force_profile():
             _LOGGER.info("Load profiling result from cache.")
             return cache_value
+        if cache_value is None and force_cache:
+            op_type = self._attrs["op"]
+            raise RuntimeError(
+                "force_cache is enabled but we could not find the following cache ",
+                f"available on device {target._arch=}, {op_type=}, {exec_entry_sha1=}",
+            )
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
             raise Exception(
                 "This is a CI run but we could not find the following cache ",
                 f"available on device {target._arch}\n",
                 f"{op_type} {exec_entry_sha1}.\n",
-                "To bypass, you need to make it available in the db table.",
+                "Please adjust target.select_minimal_algo function.",
             )
 
         profiler_filename = get_profiler_filename(self._attrs, "conv")
@@ -544,8 +559,8 @@ def _profile_static(self, workdir, devices):
 
         workloads = list(self._attrs["exec_path"].keys())
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        target = backend.target.Target.current()
         if "op_instance" not in self._attrs:
-            target = backend.target.Target.current()
             # init candidate ops
             func_key = "{target}.{op}.config".format(
                 target=target.name(), op=self._attrs["op"]
@@ -553,14 +568,14 @@ def _profile_static(self, workdir, devices):
             func = registry.get(func_key)
             func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
+        force_cache = environ.force_profiler_cache()
         for wkl in workloads:
             _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            target = backend.target.Target.current()
             # if in CI just choose minimal configs
             # workspace is a hack just provides 102400 Byte
-            if target.use_dummy_profiling_results():
+            if target.use_dummy_profiling_results() and not force_cache:
                 algo = target.select_minimal_algo(
                     list(self._attrs["op_instance"].keys())
                 )
@@ -569,7 +584,7 @@ def _profile_static(self, workdir, devices):
                 self._attrs["workspace"] = 102400
             elif self._attrs["exec_path"][wkl] == "":
                 best_algo, workspace = self._profile_single_workload(
-                    profiler_prefix, wkl, devices
+                    profiler_prefix, wkl, devices, force_cache
                 )
                 self._attrs["exec_path"][wkl] = best_algo
                 self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index ecacb385e..fc7e7159b 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -30,7 +30,7 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import alignment, shape_utils
+from ....utils import alignment, environ, shape_utils
 from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
 from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
 from .conv_common import (
@@ -303,10 +303,19 @@ def _should_build_profiler(self) -> bool:
         entry for this gemm instance, we update this gemm op's
         relevant attributes with the cached result and return False.
         """
+        force_cache = environ.force_profiler_cache()
         if self._has_dynamic_input_dims():
+            if force_cache:
+                raise RuntimeError(
+                    "We cannot force to use the cache as dynamic dims require "
+                    "us to generate and build the profilers"
+                )
             # If there are dynamic dims, we'll have to generate and build the
             # profilers, as the binaries will be needed for dynamic profiling.
             return True
+        # We are forced to use the cache so we skip building profilers.
+        if force_cache:
+            return False
 
         target = backend.target.Target.current()
         workloads = list(self._attrs["exec_path"].keys())
@@ -428,7 +437,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         command = [str(x) for x in cmd]
         return command
 
-    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+    def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cache):
         target = backend.target.Target.current()
         # query cache
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
@@ -466,6 +475,12 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         if cache_value is not None and not target.force_profile():
             _LOGGER.info("Load profiling result from cache.")
             return cache_value
+        if cache_value is None and force_cache:
+            op_type = self._attrs["op"]
+            raise RuntimeError(
+                "force_cache is enabled but we could not find the following cache ",
+                f"available on device {target._arch=}, {op_type=}, {exec_entry_sha1=}",
+            )
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
             raise Exception(
@@ -555,8 +570,8 @@ def _profile_static(self, workdir, devices):
 
         workloads = list(self._attrs["exec_path"].keys())
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        target = backend.target.Target.current()
         if "op_instance" not in self._attrs or len(self._attrs["op_instance"]) == 0:
-            target = backend.target.Target.current()
             # init candidate ops
             func_key = "{target}.{op}.config".format(
                 target=target.name(), op=self._attrs["op"]
@@ -564,14 +579,14 @@ def _profile_static(self, workdir, devices):
             func = registry.get(func_key)
             func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
+        force_cache = environ.force_profiler_cache()
         for wkl in workloads:
             _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            target = backend.target.Target.current()
             # if in CI just choose minimal configs
             # workspace is a hack just provides 102400 Byte
-            if target.use_dummy_profiling_results():
+            if target.use_dummy_profiling_results() and not force_cache:
                 algo = target.select_minimal_algo(
                     list(self._attrs["op_instance"].keys())
                 )
@@ -580,7 +595,7 @@ def _profile_static(self, workdir, devices):
                 self._attrs["workspace"] = 102400
             elif self._attrs["exec_path"][wkl] == "":
                 best_algo, workspace = self._profile_single_workload(
-                    profiler_prefix, wkl, devices
+                    profiler_prefix, wkl, devices, force_cache
                 )
                 self._attrs["exec_path"][wkl] = best_algo
                 self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
diff --git a/python/aitemplate/compiler/ops/conv/conv3d_bias.py b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
index 3437e6cc1..7d3e827e3 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
@@ -30,7 +30,7 @@
 from .... import backend
 from ....backend import registry
 from ....backend.target import Target
-from ....utils import alignment, shape_utils
+from ....utils import alignment, environ, shape_utils
 from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
 from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
 from .conv_common import (
@@ -303,10 +303,19 @@ def _should_build_profiler(self) -> bool:
         entry for this gemm instance, we update this gemm op's
         relevant attributes with the cached result and return False.
         """
+        force_cache = environ.force_to_use_cache()
         if self._has_dynamic_input_dims():
+            if force_cache:
+                raise RuntimeError(
+                    "We cannot force to use the cache as dynamic dims require "
+                    "us to generate and build the profilers"
+                )
             # If there are dynamic dims, we'll have to generate and build the
             # profilers, as the binaries will be needed for dynamic profiling.
             return True
+        # We are forced to use the cache so we skip building profilers.
+        if force_cache:
+            return False
 
         target = backend.target.Target.current()
         workloads = list(self._attrs["exec_path"].keys())
@@ -428,7 +437,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         command = [str(x) for x in cmd]
         return command
 
-    def _profile_single_workload(self, profiler_prefix, exec_key, devices):
+    def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cache):
         target = backend.target.Target.current()
         # query cache
         tmp_key = next(iter(self._attrs["op_instance"].keys()))
@@ -466,6 +475,12 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         if cache_value is not None and not target.force_profile():
             _LOGGER.info("Load profiling result from cache.")
             return cache_value
+        if cache_value is None and force_cache:
+            op_type = self._attrs["op"]
+            raise RuntimeError(
+                "force_cache is enabled but we could not find the following cache ",
+                f"available on device {target._arch=}, {op_type=}, {exec_entry_sha1=}",
+            )
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
             raise Exception(
@@ -555,8 +570,8 @@ def _profile_static(self, workdir, devices):
 
         workloads = list(self._attrs["exec_path"].keys())
         profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        target = backend.target.Target.current()
         if "op_instance" not in self._attrs or len(self._attrs["op_instance"]) == 0:
-            target = backend.target.Target.current()
             # init candidate ops
             func_key = "{target}.{op}.config".format(
                 target=target.name(), op=self._attrs["op"]
@@ -564,14 +579,14 @@ def _profile_static(self, workdir, devices):
             func = registry.get(func_key)
             func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
+        force_cache = environ.force_to_use_cache()
         for wkl in workloads:
             _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            target = backend.target.Target.current()
             # if in CI just choose minimal configs
             # workspace is a hack just provides 102400 Byte
-            if target.use_dummy_profiling_results():
+            if target.use_dummy_profiling_results() and not force_cache:
                 algo = target.select_minimal_algo(
                     list(self._attrs["op_instance"].keys())
                 )
@@ -580,7 +595,7 @@ def _profile_static(self, workdir, devices):
                 self._attrs["workspace"] = 102400
             elif self._attrs["exec_path"][wkl] == "":
                 best_algo, workspace = self._profile_single_workload(
-                    profiler_prefix, wkl, devices
+                    profiler_prefix, wkl, devices, force_cache
                 )
                 self._attrs["exec_path"][wkl] = best_algo
                 self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index b8a6ebe19..329c3cda0 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -33,7 +33,7 @@
 
 from .... import backend
 from ....backend import registry
-from ....utils import alignment
+from ....utils import alignment, environ
 from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
 from ...dtype import is_same_dtype
 from ...tensor_accessor import TensorAccessor
@@ -399,6 +399,9 @@ def _should_build_profiler(
         entry for this gemm instance, we update this gemm op's
         relevant attributes with the cached result and return False.
         """
+        # We are forced to use the cache so we skip building profilers.
+        if environ.force_profiler_cache():
+            return False
         target = backend.target.Target.current()
 
         build_profiler = True
@@ -568,7 +571,9 @@ def _get_ab_alignment(self, exec_key):
                 )
         return ab_alignment
 
-    def _profile_single_workload(self, profiler_prefix, exec_key, profiler_runner):
+    def _profile_single_workload(
+        self, profiler_prefix, exec_key, profiler_runner, force_cache
+    ):
         """
         Schedule profilers for given profiler path and gemm shape (exec_key)
         or get the result from cache
@@ -611,14 +616,21 @@ def _profile_single_workload(self, profiler_prefix, exec_key, profiler_runner):
             self._attrs["workspace"] = max(self._attrs["workspace"], cache_value[1])
             self._attrs["split_k"] = cache_value[2]
             return
+        if cache_value is None and force_cache:
+            op_type = self._attrs["op"]
+            raise RuntimeError(
+                "force_cache is enabled but we could not find the following cache ",
+                f"available on device {target._arch=}, {op_type=}, {exec_entry_sha1=}",
+            )
         if target.use_dummy_profiling_results():
             op_type = self._attrs["op"]
             raise Exception(
                 "This is a CI run but we could not find the following cache ",
                 f"available on device {target._arch}\n",
                 f"{op_type} {exec_entry_sha1}.\n",
-                "To bypass, you need to make it available in the db table.",
+                "Please adjust target.select_minimal_algo function.",
             )
+
         profiler_filename = self._get_profiler_filename()
 
         def _gen_callback(split_k):
@@ -671,14 +683,15 @@ def profile(
             )
             func = registry.get(func_key)
             func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
+        target = backend.target.Target.current()
+        force_cache = environ.force_profiler_cache()
         for wkl in workloads:
             _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
             )
-            target = backend.target.Target.current()
             # if in CI just choose minimal configs
             # workspace is a hack just provides 102400 Byte
-            if target.use_dummy_profiling_results():
+            if target.use_dummy_profiling_results() and not force_cache:
                 algo = target.select_minimal_algo(
                     list(self._attrs["op_instance"].keys())
                 )
@@ -689,7 +702,9 @@ def profile(
                 # we have cached best algo
                 return
             else:
-                self._profile_single_workload(profiler_prefix, wkl, profiler_runner)
+                self._profile_single_workload(
+                    profiler_prefix, wkl, profiler_runner, force_cache
+                )
 
     def gen_function(self) -> str:
         """Generates the function code for the gemm op for the current target.
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index d4ec9f193..55cf8e739 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -12,10 +12,16 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
+"""
+A common place for holding AIT-related env control variables
+"""
+import logging
 import os
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def get_compiler_opt_level() -> str:
     # The reason: it is typical in our situation that an option
     # --optimize <level> (-Ox) is for a HOST compiler. And -O3 does
@@ -27,3 +33,19 @@ def get_compiler_opt_level() -> str:
     compiler_opt = os.getenv("AIT_COMPILER_OPT", "-O3")
 
     return compiler_opt
+
+
+def force_profiler_cache() -> bool:
+    """
+    Force the profiler to use the cached results. The profiler will throw
+    a runtime exception if it cannot find cached results. This env may be
+    useful to capture any cache misses due to cache version updates or
+    other relevant code changes.
+    """
+    force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None) == "1"
+    if force_cache:
+        assert (
+            os.environ.get("FORCE_PROFILE", None) != "1"
+        ), "cannot specify both AIT_FORCE_PROFILER_CACHE and FORCE_PROFILE"
+    _LOGGER.info(f"{force_cache=}")
+    return force_cache
diff --git a/tests/unittest/ops/test_conv3d_profiler_cache.py b/tests/unittest/ops/test_conv3d_profiler_cache.py
index 816b04b3f..a10c20eb1 100644
--- a/tests/unittest/ops/test_conv3d_profiler_cache.py
+++ b/tests/unittest/ops/test_conv3d_profiler_cache.py
@@ -12,7 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
+import tempfile
 import unittest
 from unittest.mock import patch
 
@@ -24,6 +26,9 @@
 from aitemplate.testing import detect_target
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 @unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
 class Conv3DProfilerCacheTestCase(unittest.TestCase):
     def _test(
@@ -80,12 +85,13 @@ def _run_test(
         first_dim,
         test_name,
         logger,
+        cache_dir,
     ):
         old_trick = os.environ.get("TRICK_CI_ENV", None)
         old_cache = os.environ.get("CACHE_DIR", None)
         try:
             os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
@@ -106,19 +112,23 @@ def test_conv3d_profiler_cache(self):
         test_name = "conv3d_profiler_cache"
         logger = "aitemplate.compiler.transform.profile"
 
-        run1_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run1_logs)
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            run1_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+                cache_dir=tmp_dirname,
+            )
+            self.assertIn("generated 1 profilers", run1_logs)
 
-        run2_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 0 profilers", run2_logs)
+            run2_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+                cache_dir=tmp_dirname,
+            )
+            self.assertIn("generated 0 profilers", run2_logs)
 
     def test_conv3d_profiler_cache_versioning(self):
         first_dim = IntImm(4)
@@ -127,74 +137,136 @@ def test_conv3d_profiler_cache_versioning(self):
         cache_version_property = "conv3d_cache_version"
         target_name = detect_target().name()
 
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=1,  # version
-        ):
-            run1_before_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_conv3d_1' does not exist in the db",
-                run1_before_version_change_logs,
-            )
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                run1_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv3d_1' does not exist in the db",
+                    run1_before_version_change_logs,
+                )
 
-            run2_before_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_conv3d_1' exists in the db",
-                run2_before_version_change_logs,
-            )
+                run2_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv3d_1' exists in the db",
+                    run2_before_version_change_logs,
+                )
+
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=2,  # version
+            ):
+                run1_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv3d_2' does not exist in the db",
+                    run1_after_version_change_logs,
+                )
+
+                run2_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv3d_2' exists in the db",
+                    run2_after_version_change_logs,
+                )
 
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=2,  # version
-        ):
-            run1_after_version_change_logs = self._run_test(
+    def test_conv3d_profiler_force_cache(self):
+        first_dim = IntImm(4)
+        test_name = "conv3d_profiler_force_cache"
+        cache_version_property = "conv3d_cache_version"
+
+        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
+        logger = "aitemplate.backend.profiler_cache"
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                _LOGGER.info("force cache with no cache 1")
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                with self.assertRaisesRegex(
+                    RuntimeError, "force_cache is enabled but we could not find"
+                ):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
+
+                del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                _LOGGER.info("make cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                _LOGGER.info("force cache with no cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+        if old_force_cache is not None:
+            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
+        else:
+            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+
+    def test_conv3d_profiler_cache_dynamic(self):
+        first_dim = IntVar([2, 8])
+        test_name = "conv3d_profiler_cache_dynamic"
+        logger = "aitemplate.compiler.transform.profile"
+
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            run1_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_conv3d_2' does not exist in the db",
-                run1_after_version_change_logs,
-            )
+            self.assertIn("generated 1 profilers", run1_logs)
 
-            run2_after_version_change_logs = self._run_test(
+            run2_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_conv3d_2' exists in the db",
-                run2_after_version_change_logs,
-            )
-
-    def test_conv3d_profiler_cache_dynamic(self):
-        first_dim = IntVar([2, 8])
-        test_name = "conv3d_profiler_cache_dynamic"
-        logger = "aitemplate.compiler.transform.profile"
-
-        run1_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run1_logs)
-
-        run2_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run2_logs)
+            self.assertIn("generated 1 profilers", run2_logs)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_conv_profiler_cache.py b/tests/unittest/ops/test_conv_profiler_cache.py
index 039f54053..406a2a442 100644
--- a/tests/unittest/ops/test_conv_profiler_cache.py
+++ b/tests/unittest/ops/test_conv_profiler_cache.py
@@ -12,7 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
+import tempfile
 import unittest
 from unittest.mock import patch
 
@@ -24,6 +26,9 @@
 from aitemplate.testing import detect_target
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class ConvProfilerCacheTestCase(unittest.TestCase):
     def _test(
         self,
@@ -69,12 +74,13 @@ def _run_test(
         first_dim,
         test_name,
         logger,
+        cache_dir,
     ):
         old_trick = os.environ.get("TRICK_CI_ENV", None)
         old_cache = os.environ.get("CACHE_DIR", None)
         try:
             os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
@@ -95,19 +101,23 @@ def test_conv_profiler_cache(self):
         test_name = "conv2d_profiler_cache"
         logger = "aitemplate.compiler.transform.profile"
 
-        run1_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run1_logs)
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            run1_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+                cache_dir=tmp_dirname,
+            )
+            self.assertIn("generated 1 profilers", run1_logs)
 
-        run2_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 0 profilers", run2_logs)
+            run2_logs = self._run_test(
+                first_dim=first_dim,
+                test_name=test_name,
+                logger=logger,
+                cache_dir=tmp_dirname,
+            )
+            self.assertIn("generated 0 profilers", run2_logs)
 
     def test_conv_profiler_cache_versioning(self):
         first_dim = IntImm(4)
@@ -116,74 +126,136 @@ def test_conv_profiler_cache_versioning(self):
         cache_version_property = "conv_cache_version"
         target_name = detect_target().name()
 
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=1,  # version
-        ):
-            run1_before_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_conv_1' does not exist in the db",
-                run1_before_version_change_logs,
-            )
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                run1_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv_1' does not exist in the db",
+                    run1_before_version_change_logs,
+                )
 
-            run2_before_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_conv_1' exists in the db",
-                run2_before_version_change_logs,
-            )
+                run2_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv_1' exists in the db",
+                    run2_before_version_change_logs,
+                )
+
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=2,  # version
+            ):
+                run1_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv_2' does not exist in the db",
+                    run1_after_version_change_logs,
+                )
+
+                run2_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_conv_2' exists in the db",
+                    run2_after_version_change_logs,
+                )
 
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=2,  # version
-        ):
-            run1_after_version_change_logs = self._run_test(
+    def test_conv_profiler_force_cache(self):
+        first_dim = IntImm(4)
+        test_name = "conv2d_profiler_force_cache"
+        cache_version_property = "conv_cache_version"
+
+        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
+        logger = "aitemplate.backend.profiler_cache"
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                _LOGGER.info("force cache with no cache 1")
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                with self.assertRaisesRegex(
+                    RuntimeError, "force_cache is enabled but we could not find"
+                ):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
+
+                del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                _LOGGER.info("make cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                _LOGGER.info("force cache with no cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+        if old_force_cache is not None:
+            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
+        else:
+            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+
+    def test_conv_profiler_cache_dynamic(self):
+        first_dim = IntVar([2, 8])
+        test_name = "conv2d_profiler_cache_dynamic"
+        logger = "aitemplate.compiler.transform.profile"
+
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            run1_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_conv_2' does not exist in the db",
-                run1_after_version_change_logs,
-            )
+            self.assertIn("generated 1 profilers", run1_logs)
 
-            run2_after_version_change_logs = self._run_test(
+            run2_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_conv_2' exists in the db",
-                run2_after_version_change_logs,
-            )
-
-    def test_conv_profiler_cache_dynamic(self):
-        first_dim = IntVar([2, 8])
-        test_name = "conv2d_profiler_cache_dynamic"
-        logger = "aitemplate.compiler.transform.profile"
-
-        run1_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run1_logs)
-
-        run2_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run2_logs)
+            self.assertIn("generated 1 profilers", run2_logs)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_profiler_cache.py b/tests/unittest/ops/test_gemm_profiler_cache.py
index 0f20cc7c2..597e2d2f8 100644
--- a/tests/unittest/ops/test_gemm_profiler_cache.py
+++ b/tests/unittest/ops/test_gemm_profiler_cache.py
@@ -12,7 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import os
+import tempfile
 import unittest
 from unittest.mock import patch
 
@@ -23,6 +25,9 @@
 from aitemplate.testing import detect_target
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class GemmProfilerCacheTestCase(unittest.TestCase):
     def _test(
         self,
@@ -69,12 +74,13 @@ def _run_test(
         first_dim,
         test_name,
         logger,
+        cache_dir,
     ):
         old_trick = os.environ.get("TRICK_CI_ENV", None)
         old_cache = os.environ.get("CACHE_DIR", None)
         try:
             os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"/tmp/aitemplate/{test_name}"
+            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
@@ -95,73 +101,139 @@ def test_gemm_profiler_cache(self):
         test_name = "gemm_rcr_profiler_cache"
         logger = "aitemplate.compiler.transform.profile"
 
-        run1_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 1 profilers", run1_logs)
-
-        run2_logs = self._run_test(
-            first_dim=first_dim,
-            test_name=test_name,
-            logger=logger,
-        )
-        self.assertIn("generated 0 profilers", run2_logs)
-
-    def test_gemm_profiler_cache_versioning(self):
-        first_dim = IntImm(4)
-        test_name = "gemm_rcr_profiler_cache_versioning"
-        logger = "aitemplate.backend.profiler_cache"
-        cache_version_property = "gemm_cache_version"
-        target_name = detect_target().name()
-
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=1,  # version
-        ):
-            run1_before_version_change_logs = self._run_test(
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            run1_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_gemm_1' does not exist in the db",
-                run1_before_version_change_logs,
-            )
+            self.assertIn("generated 1 profilers", run1_logs)
 
-            run2_before_version_change_logs = self._run_test(
+            run2_logs = self._run_test(
                 first_dim=first_dim,
                 test_name=test_name,
                 logger=logger,
+                cache_dir=tmp_dirname,
             )
-            self.assertIn(
-                f"table_name='{target_name}_gemm_1' exists in the db",
-                run2_before_version_change_logs,
-            )
+            self.assertIn("generated 0 profilers", run2_logs)
 
-        with patch.object(
-            target=ProfileCacheDB,
-            attribute=cache_version_property,
-            new=2,  # version
-        ):
-            run1_after_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_gemm_2' does not exist in the db",
-                run1_after_version_change_logs,
-            )
+    def test_gemm_profiler_cache_versioning(self):
+        first_dim = IntImm(4)
+        test_name = "gemm_rcr_profiler_cache_versioning"
+        logger = "aitemplate.backend.profiler_cache"
+        cache_version_property = "gemm_cache_version"
+        target_name = detect_target().name()
 
-            run2_after_version_change_logs = self._run_test(
-                first_dim=first_dim,
-                test_name=test_name,
-                logger=logger,
-            )
-            self.assertIn(
-                f"table_name='{target_name}_gemm_2' exists in the db",
-                run2_after_version_change_logs,
-            )
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                run1_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_gemm_1' does not exist in the db",
+                    run1_before_version_change_logs,
+                )
+
+                run2_before_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_gemm_1' exists in the db",
+                    run2_before_version_change_logs,
+                )
+
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=2,  # version
+            ):
+                run1_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_gemm_2' does not exist in the db",
+                    run1_after_version_change_logs,
+                )
+
+                run2_after_version_change_logs = self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+                self.assertIn(
+                    f"table_name='{target_name}_gemm_2' exists in the db",
+                    run2_after_version_change_logs,
+                )
+
+    def test_gemm_profiler_force_cache(self):
+        first_dim = IntImm(4)
+        test_name = "gemm_rcr_profiler_force_cache"
+        cache_version_property = "gemm_cache_version"
+
+        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
+        logger = "aitemplate.backend.profiler_cache"
+        _LOGGER.info(f"running {test_name=}")
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            _LOGGER.info(f"{tmp_dirname=}")
+            with patch.object(
+                target=ProfileCacheDB,
+                attribute=cache_version_property,
+                new=1,  # version
+            ):
+                _LOGGER.info("force cache with no cache 1")
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                with self.assertRaisesRegex(
+                    RuntimeError, "force_cache is enabled but we could not find"
+                ):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
+
+                del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                _LOGGER.info("make cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
+                _LOGGER.info("force cache with no cache 1")
+                self._run_test(
+                    first_dim=first_dim,
+                    test_name=test_name,
+                    logger=logger,
+                    cache_dir=tmp_dirname,
+                )
+
+        if old_force_cache is not None:
+            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
+        else:
+            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4131d6c7ad4a680339b5090d773abc7ec2799292 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 15 Feb 2023 18:14:12 -0800
Subject: [PATCH 140/638] support bfloat16 for softmax (#273)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/273

also classify test cases

and move cuda code to .cuh file

Reviewed By: chenyang78

Differential Revision: D43296258

fbshipit-source-id: ac895aeb57d86fdf88671674e85f0d41166f3237
---
 .../backend/cuda/softmax/softmax.cuh          | 355 +++++++++++++++++-
 .../backend/cuda/softmax/softmax.py           | 196 +++-------
 tests/unittest/ops/test_softmax.py            |  95 +++--
 3 files changed, 449 insertions(+), 197 deletions(-)

diff --git a/python/aitemplate/backend/cuda/softmax/softmax.cuh b/python/aitemplate/backend/cuda/softmax/softmax.cuh
index d5be2751a..2b2c33811 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.cuh
+++ b/python/aitemplate/backend/cuda/softmax/softmax.cuh
@@ -16,6 +16,84 @@
 #ifndef CUDA_SOFTMAX
 #define CUDA_SOFTMAX
 
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <math_constants.h>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+
+using bfloat16 = nv_bfloat16;
+
+#define SOFTMAX_DEVICE_CHECK(call)                                   \
+  if ((call) != cudaSuccess) {                                       \
+    throw std::runtime_error(                                        \
+        std::string("softmax kernel call failed: ") +                \
+        cudaGetErrorString(cudaGetLastError()) + " at " + __FILE__ + \
+        ", line" + std::to_string(__LINE__));                        \
+  }
+
+#define SOFTMAX_LAUNCH_CHECK() SOFTMAX_DEVICE_CHECK(cudaGetLastError())
+
+// unroll directives copied from CUTLASS
+#if defined(__CUDA_ARCH__)
+#if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
+#define PRAGMA_UNROLL _Pragma("unroll")
+#else
+#define PRAGMA_UNROLL #pragma unroll
+#endif // __CUDACC_RTC__
+
+#else
+#define PRAGMA_UNROLL
+#endif // __CUDA_ARCH__
+
+namespace {
+
+template <typename T>
+__inline__ __device__ T fast_max(const T a, const T b);
+
+template <typename T>
+__inline__ __device__ T fast_exp(const T a);
+
+template <>
+__inline__ __device__ half fast_max(const half a, const half b) {
+#if (__CUDA_ARCH__ >= 800)
+  return __hmax(a, b);
+#else
+  return a > b ? a : b;
+#endif
+}
+
+template <>
+__inline__ __device__ float fast_max(const float a, const float b) {
+  return fmaxf(a, b);
+}
+
+template <>
+__inline__ __device__ half fast_exp(const half a) {
+  return hexp(a);
+}
+
+template <>
+__inline__ __device__ float fast_exp(const float a) {
+  return __expf(a);
+}
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+template <>
+__inline__ __device__ bfloat16 fast_exp(const bfloat16 a) {
+  return hexp(a);
+}
+
+template <>
+__inline__ __device__ bfloat16 fast_max(const bfloat16 a, const bfloat16 b) {
+  return __hmax(a, b);
+}
+
+#endif
+
 template <typename T>
 __inline__ __device__ T Inf();
 
@@ -31,7 +109,7 @@ __inline__ __device__ double Inf<double>() {
 
 template <typename T>
 struct Arguments {
-  T* input;
+  const T* input;
   T* output;
 };
 
@@ -120,6 +198,8 @@ __inline__ __device__ T blockReduceMax(T* val) {
   return (T)0.0f;
 }
 
+} // namespace
+
 // input size: [M, K]
 // Currently the softmax kernel only supports 2D input with dim=1.
 // For input with more dimensions, reshape first.
@@ -146,7 +226,7 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
   constexpr bool can_use_vector_load = ((m * K) % vector_len) == 0;
   // read input
   if (can_use_vector_load && m_idx + m < M) {
-    VECTORIZED_TYPE* input = reinterpret_cast<VECTORIZED_TYPE*>(args.input);
+    auto input = reinterpret_cast<const VECTORIZED_TYPE*>(args.input);
     VECTORIZED_TYPE* output = reinterpret_cast<VECTORIZED_TYPE*>(args.output);
 
     const size_t offset = (m_idx * K) / vector_len;
@@ -160,42 +240,42 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
     VECTORIZED_TYPE input_tile_vec[n_tile];
     T* input_tile = reinterpret_cast<T*>(&input_tile_vec);
 
-    CUTLASS_PRAGMA_UNROLL
+    PRAGMA_UNROLL
     for (size_t i = 0; i < n_tile; i++) {
       input_tile_vec[i] = input[i];
     }
 
-    CUTLASS_PRAGMA_UNROLL
+    PRAGMA_UNROLL
     for (size_t i = 0; i < m; i++) {
       T max = std::numeric_limits<T>::lowest();
       // find max
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
-        max = cutlass::fast_max(input_tile[i * K + j], max);
+        max = fast_max(input_tile[i * K + j], max);
       }
       // get sum
       float sum = 0;
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         const int tile_idx = i * K + j;
-        input_tile[tile_idx] = cutlass::fast_exp(input_tile[tile_idx] - max);
+        input_tile[tile_idx] = fast_exp(input_tile[tile_idx] - max);
         sum += static_cast<float>(input_tile[tile_idx]);
       }
       // normalize
       const float sum_inverse = 1.0 / sum;
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         const int tile_idx = i * K + j;
         input_tile[tile_idx] = static_cast<T>(
             static_cast<float>(input_tile[tile_idx]) * sum_inverse);
       }
     }
-    CUTLASS_PRAGMA_UNROLL
+    PRAGMA_UNROLL
     for (size_t i = 0; i < n_tile; i++) {
       output[i] = input_tile_vec[i];
     }
   } else {
-    T* input = args.input;
+    const T* input = args.input;
     T* output = args.output;
 
     const size_t offset = m_idx * K;
@@ -209,34 +289,34 @@ __global__ void softmax_small_k(Arguments<T> args, size_t M) {
       T input_tile[K];
 
       // read input
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         input_tile[j] = input[i * K + j];
       }
 
       T max = std::numeric_limits<T>::lowest();
       // find max
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
-        max = cutlass::fast_max(input_tile[j], max);
+        max = fast_max(input_tile[j], max);
       }
       // get sum
       float sum = 0;
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         const int tile_idx = i * K + j;
-        input_tile[j] = cutlass::fast_exp(input_tile[j] - max);
+        input_tile[j] = fast_exp(input_tile[j] - max);
         sum += static_cast<float>(input_tile[j]);
       }
       // normalize
       float sum_inverse = 1.0 / sum;
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         input_tile[j] =
             static_cast<T>(static_cast<float>(input_tile[j]) * sum_inverse);
       }
       // write output
-      CUTLASS_PRAGMA_UNROLL
+      PRAGMA_UNROLL
       for (size_t j = 0; j < K; j++) {
         output[i * K + j] = input_tile[j];
       }
@@ -535,4 +615,243 @@ inline cudaError_t LaunchSoftmaxBlockAll(
   return cudaSuccess;
 }
 
+template <typename T, int K, size_t TileSize>
+void LaunchSoftmaxSmallK(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  const int n_threads = 128;
+  const int tile_size_by_n_threads = TileSize * n_threads;
+  dim3 block(n_threads);
+  dim3 grid((batch_size + tile_size_by_n_threads - 1) / tile_size_by_n_threads);
+  softmax_small_k<T, float4, n_threads, K, TileSize>
+      <<<grid, block, 0, stream>>>({input, output}, batch_size);
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T>
+struct VecTFor;
+
+template <>
+struct VecTFor<half> {
+  using vec8 = float4;
+  using vec4 = float2;
+  using vec2 = float;
+};
+
+template <>
+struct VecTFor<float> {
+  using vec8 = float8;
+  using vec4 = float4;
+  using vec2 = float2;
+};
+
+template <>
+struct VecTFor<bfloat16> {
+  using vec8 = float4;
+  using vec4 = float2;
+  using vec2 = float;
+};
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK8Small(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_width = -1;
+  for (auto i : {1, 8, 16, 32}) {
+    if (8 * i >= NElements) {
+      thread_group_width = i;
+      break;
+    }
+  }
+  int thread_group_per_block = 128 / thread_group_width;
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(thread_group_width, thread_group_per_block);
+  using vec8 = typename VecTFor<T>::vec8;
+  softmax_stored_locally_multi_dim<vec8, T, 8><<<grid, block, 0, stream>>>(
+      reinterpret_cast<const vec8*>(input),
+      reinterpret_cast<vec8*>(output),
+      batch_size,
+      NElements);
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK8Middle(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_per_block = 128 / 32; // 4
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(32, thread_group_per_block);
+  const int num_packs = (int((NElements + 31) / 32) + 7) / 8;
+  const int cols_per_thread = num_packs * 8;
+  using vec8 = typename VecTFor<T>::vec8;
+  softmax_stored_locally_multi_dim<vec8, T, cols_per_thread>
+      <<<grid, block, 0, stream>>>(
+          reinterpret_cast<const vec8*>(input),
+          reinterpret_cast<vec8*>(output),
+          batch_size,
+          NElements);
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK4Small(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_width = -1;
+  for (auto i : {1, 4, 8, 16, 32}) {
+    if (4 * i >= NElements) {
+      thread_group_width = i;
+      break;
+    }
+  }
+  int thread_group_per_block = 128 / thread_group_width;
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(thread_group_width, thread_group_per_block);
+  using vec4 = typename VecTFor<T>::vec4;
+  softmax_stored_locally_multi_dim<vec4, T, 8><<<grid, block, 0, stream>>>(
+      reinterpret_cast<const vec4*>(input),
+      reinterpret_cast<vec4*>(output),
+      batch_size,
+      NElements);
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK4Middle(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_per_block = 128 / 32; // 4
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(32, thread_group_per_block);
+  const int num_packs = (int((NElements + 31) / 32) + 3) / 4;
+  const int cols_per_thread = num_packs * 8;
+  using vec4 = typename VecTFor<T>::vec4;
+
+  softmax_stored_locally_multi_dim<vec4, T, cols_per_thread>
+      <<<grid, block, 0, stream>>>(
+          reinterpret_cast<const vec4*>(input),
+          reinterpret_cast<vec4*>(output),
+          batch_size,
+          NElements);
+
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK2Small(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_width = -1;
+  for (auto i : {1, 2, 4, 8, 16, 32}) {
+    if (2 * i >= NElements) {
+      thread_group_width = i;
+      break;
+    }
+  }
+  int thread_group_per_block = 128 / thread_group_width;
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(thread_group_width, thread_group_per_block);
+  using vec2 = typename VecTFor<T>::vec2;
+
+  softmax_stored_locally_multi_dim<vec2, T, 8><<<grid, block, 0, stream>>>(
+      reinterpret_cast<const vec2*>(input),
+      reinterpret_cast<vec2*>(output),
+      batch_size,
+      NElements);
+
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK2Middle(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_per_block = 128 / 32; // 4
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(32, thread_group_per_block);
+  const int num_packs = (int((NElements + 31) / 32) + 1) / 2;
+  const int cols_per_thread = num_packs * 2;
+  using vec2 = typename VecTFor<T>::vec2;
+
+  softmax_stored_locally_multi_dim<vec2, T, cols_per_thread>
+      <<<grid, block, 0, stream>>>(
+          reinterpret_cast<const vec2*>(input),
+          reinterpret_cast<vec2*>(output),
+          batch_size,
+          NElements);
+
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK1Small(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_width = -1;
+  for (auto i : {1, 2, 4, 8, 16, 32}) {
+    if (i >= NElements) {
+      thread_group_width = i;
+      break;
+    }
+  }
+  int thread_group_per_block = 128 / thread_group_width;
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(thread_group_width, thread_group_per_block);
+
+  softmax_stored_locally_multi_dim<T, T, 8>
+      <<<grid, block, 0, stream>>>(input, output, batch_size, NElements);
+
+  SOFTMAX_LAUNCH_CHECK();
+}
+
+template <typename T, size_t NElements>
+void LaunchSoftmaxK1Middle(
+    const T* input,
+    T* output,
+    size_t batch_size,
+    cudaStream_t stream) {
+  int thread_group_per_block = 128 / 32; // 4
+  int grid_dim_x =
+      (batch_size + thread_group_per_block - 1) / thread_group_per_block;
+  dim3 grid(grid_dim_x);
+  dim3 block(32, thread_group_per_block);
+  const int cols_per_thread = (NElements + 31) / 32;
+
+  softmax_stored_locally_multi_dim<T, T, cols_per_thread>
+      <<<grid, block, 0, stream>>>(input, output, batch_size, NElements);
+
+  SOFTMAX_LAUNCH_CHECK();
+}
+
 #endif
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index 50282542c..d1219f912 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -39,20 +39,8 @@
 # and this experiment log: https://docs.google.com/spreadsheets/d/1bl3GCLQ67p27kXOSVJikEob38fojqaZIS--mPdQxeo0/edit#gid=931264442
 FUNC_TEMPLATE = jinja2.Template(
     """
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/platform/platform.h"
-#include <math_constants.h>
-#include <assert.h>
-#include <cuda.h>
-namespace {
-
 {{custom_libs}}
 
-}  // namespace
-
 {{func_signature}}
 {
   {{shape_functions}}
@@ -61,159 +49,65 @@
   size_t m = M;
   bool success = true;
 
+  // For threshold K, please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
   {% if K <= 32 and K % 4 == 0 or K <= 8 %}
-    const int n_threads = 128;
-    const int m0_by_n_threads = m0 * n_threads;
-    dim3 block(n_threads);
-    dim3 grid((m + m0_by_n_threads - 1) / m0_by_n_threads);
-    Arguments<{{dtype}}> args = {
-      static_cast<{{dtype}}*>(input), static_cast<{{dtype}}*>(output)
-    };
-    softmax_small_k<{{dtype}}, float4, n_threads, {{K}}, {{m}}>
-        <<<grid, block, 0, stream>>>(args, m);
+    // K <= 32 and K % 4 == 0 or K <= 8
+    LaunchSoftmaxSmallK<{{dtype}}, {{K}}, {{m}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
   {% elif K % 8 == 0 %}
+    // K % 8 == 0: vector8 kernels
     {% if K/8 <=32 %}
-      int thread_group_width = -1;
-      for(auto i: {1, 8, 16, 32}){
-        if (8*i >= n){
-          thread_group_width = i;
-          break;
-        }
-      }
-      int thread_group_per_block = 128/thread_group_width;
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(thread_group_width, thread_group_per_block);
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float8,{{dtype}},8><<<grid, block, 0, stream>>>( (const float8*)input, (float8*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float4,{{dtype}},8><<<grid, block, 0, stream>>>( (const float4*)input, (float4*)output, m, n);
-      {% endif %}
-    {% elif K <= 3840 %} // For threshold K, please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
-      int thread_group_per_block = 128/32;//4
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(32,thread_group_per_block);
-      const int num_packs = (int(({{K}}+31)/32)+7)/8;
-      const int cols_per_thread = num_packs * 8;
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float8,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float8*)input, (float8*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float4,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float4*)input, (float4*)output, m, n);
-      {% endif %}
-    {% elif dtype=="float" and K > 3840 %}
-        LaunchSoftmaxBlockAll<float8,{{dtype}},{{K}}>( (const float8*) input, (float8*) output, m, stream, &success);
-    {% elif "half" in dtype and K > 3840 %}
-        LaunchSoftmaxBlockAll<float4,{{dtype}},{{K}}>( (const float4*) input, (float4*) output, m, stream, &success);
+      // K/8 <= 32
+      LaunchSoftmaxK8Small<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K <= 3840 %}
+      // 32 < K/8 <= 480
+      LaunchSoftmaxK8Middle<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K > 3840 %}
+      // K/8 > 480
+      using vec8 = VecTFor<{{dtype}}>::vec8;
+      LaunchSoftmaxBlockAll<vec8, {{dtype}},{{K}}>(reinterpret_cast<const vec8*>(input), reinterpret_cast<vec8*>(output), M, stream, &success);
     {% endif %}
   {% elif K % 4 == 0 %}
+    // K % 4 == 0: vector4 kernels
     {% if K/4 <=32 %}
-      int thread_group_width = -1;
-      for(auto i: {1, 4, 8, 16, 32}){
-        if (4*i >= n){
-          thread_group_width = i;
-          break;
-        }
-      }
-      int thread_group_per_block = 128/thread_group_width;
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(thread_group_width, thread_group_per_block);
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float4,{{dtype}},8><<<grid, block, 0, stream>>>( (const float4*)input, (float4*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float2,{{dtype}},8><<<grid, block, 0, stream>>>( (const float2*)input, (float2*)output, m, n);
-      {% endif %}
-    {% elif K <= 1920 %} // For threshold K, please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
-      int thread_group_per_block = 128/32;//4
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(32,thread_group_per_block);
-      const int num_packs = (int(({{K}}+31)/32)+3)/4;
-      const int cols_per_thread = num_packs * 8;
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float4,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float4*)input, (float4*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float2,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float2*)input, (float2*)output, m, n);
-      {% endif %}
-    {% elif dtype=="float" and K > 1920 %}
-        LaunchSoftmaxBlockAll<float4,{{dtype}},{{K}}>( (const float4*) input, (float4*) output, m, stream, &success);
-    {% elif "half" in dtype and K > 1920 %}
-        LaunchSoftmaxBlockAll<float2,{{dtype}},{{K}}>( (const float2*) input, (float2*) output, m, stream, &success);
+      // K/4 <= 32
+      LaunchSoftmaxK4Small<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K <= 1920 %}
+      // 32 < K/4 <= 480
+      LaunchSoftmaxK4Middle<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K > 1920 %}
+      // K/4 > 480
+      using vec4 = VecTFor<{{dtype}}>::vec4;
+      LaunchSoftmaxBlockAll<vec4,{{dtype}},{{K}}>(reinterpret_cast<const vec4*>(input), reinterpret_cast<vec4*>(output), M, stream, &success);
     {% endif %}
   {% elif K % 2 == 0 %}
+    // K % 2 == 0: vector2 kernels
     {% if K/2 <=32 %}
-      int thread_group_width = -1;
-      for(auto i: {1, 2, 4, 8, 16, 32}){
-        if (2*i >= n){
-          thread_group_width = i;
-          break;
-        }
-      }
-      int thread_group_per_block = 128/thread_group_width;
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(thread_group_width, thread_group_per_block);
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float2,{{dtype}},8><<<grid, block, 0, stream>>>( (const float2*)input, (float2*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float,{{dtype}},8><<<grid, block, 0, stream>>>( (const float*)input, (float*)output, m, n);
-      {% endif %}
-    {% elif K <= 1152 %} // For threshold K, please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
-      int thread_group_per_block = 128/32;//4
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(32,thread_group_per_block);
-      const int num_packs = (int(({{K}}+31)/32)+1)/2;
-      const int cols_per_thread = num_packs * 2;
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float2,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float2*)input, (float2*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<float,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float*)input, (float*)output, m, n);
-      {% endif %}
-    {% elif dtype=="float" and K > 1152 %}
-        LaunchSoftmaxBlockAll<float2,{{dtype}},{{K}}>( (const float2*) input, (float2*) output, m, stream, &success);
-    {% elif "half" in dtype and K > 1152 %}
-        LaunchSoftmaxBlockAll<float,{{dtype}},{{K}}>( (const float*) input, (float*) output, m, stream, &success);
+      // K/2 <= 32
+      LaunchSoftmaxK2Small<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K <= 1152 %}
+      // 32 < K/2 <= 576
+      LaunchSoftmaxK2Middle<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K > 1152 %}
+      // K/2 > 576
+      using vec2 = VecTFor<{{dtype}}>::vec2;
+      LaunchSoftmaxBlockAll<vec2,{{dtype}},{{K}}>(reinterpret_cast<const vec2*>(input), reinterpret_cast<vec2*>(output), M, stream, &success);
     {% endif %}
   {% else %}
-    {% if K <=32 %}
-      int thread_group_width = -1;
-      for(auto i: {1, 2, 4, 8, 16, 32}){
-        if (i >= n){
-          thread_group_width = i;
-          break;
-        }
-      }
-      int thread_group_per_block = 128/thread_group_width;
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(thread_group_width, thread_group_per_block);
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float,{{dtype}},8><<<grid, block, 0, stream>>>( (const float*)input, (float*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<half,{{dtype}},8><<<grid, block, 0, stream>>>( (const half*)input, (half*)output, m, n);
-      {% endif %}
-    {% elif K <= 1408 %} // For threshold K, please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
-      int thread_group_per_block = 128/32;//4
-      int grid_dim_x = (m+thread_group_per_block-1)/thread_group_per_block;
-      dim3 grid(grid_dim_x);
-      dim3 block(32,thread_group_per_block);
-      const int cols_per_thread = ({{K}}+31)/32;
-      {% if dtype=="float" %}
-        softmax_stored_locally_multi_dim<float,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const float*)input, (float*)output, m, n);
-      {% elif "half" in dtype %}
-        softmax_stored_locally_multi_dim<half,{{dtype}},cols_per_thread><<<grid, block, 0, stream>>>((const half*)input, (half*)output, m, n);
-      {% endif %}
-    {% elif dtype=="float" and K > 1408 %}
-        LaunchSoftmaxBlockAll<float,{{dtype}},{{K}}>( (const float*) input, (float*) output, m, stream, &success);
-    {% elif "half" in dtype and K > 1408 %}
-        LaunchSoftmaxBlockAll<half,{{dtype}},{{K}}>( (const half*) input, (half*) output, m, stream, &success);
+    // odd K
+    {% if K <= 32 %}
+      // K <= 32
+      LaunchSoftmaxK1Small<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K <= 1408 %}
+      // 32 < K <= 1408
+      LaunchSoftmaxK1Middle<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
+    {% elif K > 1408 %}
+      // K > 1408
+      LaunchSoftmaxBlockAll<{{dtype}},{{dtype}},{{K}}>( (const {{dtype}}*) input, ({{dtype}}*) output, m, stream, &success);
     {% endif %}
   {% endif %}
 
-  if(!success){
-    softmaxBlockNocache<half><<<m, 1024, 0, stream>>>((half*)input, (half*)output, m, n);
+  if (!success) {
+    softmaxBlockNocache<{{dtype}}><<<m, 1024, 0, stream>>>(({{dtype}}*)input, ({{dtype}}*)output, m, n);
   }
 }
     """
diff --git a/tests/unittest/ops/test_softmax.py b/tests/unittest/ops/test_softmax.py
index fe29658eb..4ce3a7dc3 100644
--- a/tests/unittest/ops/test_softmax.py
+++ b/tests/unittest/ops/test_softmax.py
@@ -24,6 +24,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import parameterized
 
 
 class SoftmaxTestCase(unittest.TestCase):
@@ -35,6 +36,11 @@ def _test_softmax(
         dtype="float16",
         testname="softmax",
     ):
+        target = detect_target()
+        if target.name() == "rocm" and dtype != "float16":
+            self.skipTest(f"Rocm doesn't support {dtype}")
+        if target.name() == "cuda" and dtype == "bfloat16" and int(target._arch) < 80:
+            self.skipTest(f"CUDA SM{target._arch} doesn't support {dtype}")
         torch_dtype = string_to_torch_dtype(dtype)
         X = Tensor(
             shape=[IntVar(name="input_batch", values=list(batch_sizes)), *input_shapes],
@@ -46,7 +52,6 @@ def _test_softmax(
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
 
-        target = detect_target()
         module = compile_model(Y, target, "./tmp", testname)
 
         for batch_size in batch_sizes:
@@ -57,35 +62,69 @@ def _test_softmax(
             module.run_with_tensors([x_pt], [y])
             torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
 
-    def test_softmax(self):
-        self._test_softmax(testname="softmax_0")
-        self._test_softmax(dim=1, testname="softmax_1")
-        self._test_softmax((1, 13), (7,), testname="softmax_2")
-        self._test_softmax((10, 1025), (16,), testname="softmax_3")
-        self._test_softmax((1, 17), (9, 8), testname="softmax_4")
-        self._test_softmax((2, 64), (9, 1, 6), testname="softmax_5")
-        self._test_softmax((1, 4096), (33,), testname="softmax_6")
-        self._test_softmax((2, 21), (34,), testname="softmax_7")
-        self._test_softmax((2, 17), (36,), testname="softmax_8")
-        self._test_softmax((1, 64), (128,), testname="softmax_9")
-        self._test_softmax((2, 31), (513,), testname="softmax_10")
-
-    def test_softmax_fp32(self):
-        self._test_softmax(dtype="float32", testname="softmax_fp32_0")
-        self._test_softmax(dim=1, dtype="float32", testname="softmax_fp32_1")
-        self._test_softmax((1, 13), (7,), dtype="float32", testname="softmax_fp32_2")
-        self._test_softmax(
-            (10, 1025), (16,), dtype="float32", testname="softmax_fp32_3"
-        )
-        self._test_softmax((1, 17), (9, 8), dtype="float32", testname="softmax_fp32_4")
+    @parameterized.expand(
+        [
+            ("dim_1_fp16", "float16", (1, 1024), (6,), 1),
+            ("odd_small_fp16", "float16", (1, 13), (11,)),
+            ("odd_mid_fp16", "float16", (1, 4096), (33,)),
+            ("odd_large_fp16", "float16", (2, 31), (1409,)),
+            ("k2_small_fp16", "float16", (1, 1024), (18,)),
+            ("k2_mid_fp16", "float16", (2, 21), (66,)),
+            ("k2_large_fp16", "float16", (2, 21), (1154,)),
+            ("k4_small_fp16", "float16", (10, 1025), (124,)),
+            ("k4_mid_fp16", "float16", (1, 17), (132,)),
+            ("k4_large_fp16", "float16", (1, 17), (1924,)),
+            ("k8_small_fp16", "float16", (10, 1025), (72,)),
+            ("k8_mid_fp16", "float16", (1, 17), (264,)),
+            ("k8_large_fp16", "float16", (1, 17), (3848,)),
+            ("no_smem_fp16", "float16", (1, 2), (12500,)),
+            ("2d", "float16", (1, 2), (100, 100)),
+            ("3d", "float16", (1, 2), (24, 2, 64)),
+            ("dim_1_fp32", "float32", (1, 2), (6,), 1),
+            ("odd_small_fp32", "float32", (1, 2), (11,)),
+            ("odd_mid_fp32", "float32", (1, 2), (33,)),
+            ("odd_large_fp32", "float32", (1, 2), (1409,)),
+            ("k2_small_fp32", "float32", (1, 2), (18,)),
+            ("k2_mid_fp32", "float32", (1, 2), (66,)),
+            ("k2_large_fp32", "float32", (1, 2), (1154,)),
+            ("k4_small_fp32", "float32", (1, 2), (124,)),
+            ("k4_mid_fp32", "float32", (1, 2), (132,)),
+            ("k4_large_fp32", "float32", (1, 2), (1924,)),
+            ("k8_small_fp32", "float32", (1, 2), (72,)),
+            ("k8_mid_fp32", "float32", (1, 2), (264,)),
+            ("k8_large_fp32", "float32", (1, 2), (3848,)),
+            ("no_smem_fp32", "float32", (1, 2), (12500,)),
+            ("dim_1_bf16", "bfloat16", (1, 2), (6,), 1),
+            ("odd_small_bf16", "bfloat16", (1, 2), (11,)),
+            ("odd_mid_bf16", "bfloat16", (1, 2), (33,)),
+            ("odd_large_bf16", "bfloat16", (1, 2), (1409,)),
+            ("k2_small_bf16", "bfloat16", (1, 2), (18,)),
+            ("k2_mid_bf16", "bfloat16", (1, 2), (66,)),
+            ("k2_large_bf16", "bfloat16", (1, 2), (1154,)),
+            ("k4_small_bf16", "bfloat16", (1, 2), (124,)),
+            ("k4_mid_bf16", "bfloat16", (1, 2), (132,)),
+            ("k4_large_bf16", "bfloat16", (1, 2), (1924,)),
+            ("k8_small_bf16", "bfloat16", (1, 2), (72,)),
+            ("k8_mid_bf16", "bfloat16", (1, 2), (264,)),
+            ("k8_large_bf16", "bfloat16", (1, 2), (3848,)),
+            ("no_smem_bf16", "bfloat16", (1, 2), (12500,)),
+        ]
+    )
+    def test_softmax(
+        self,
+        testname="softmax",
+        dtype="float16",
+        batch_sizes=(1, 1024),
+        input_shapes=(6,),
+        dim=-1,
+    ):
         self._test_softmax(
-            (2, 64), (9, 1, 6), dtype="float32", testname="softmax_fp32_5"
+            dtype=dtype,
+            testname=f"{testname}_{dtype}",
+            batch_sizes=batch_sizes,
+            input_shapes=input_shapes,
+            dim=dim,
         )
-        self._test_softmax((1, 4096), (33,), dtype="float32", testname="softmax_fp32_6")
-        self._test_softmax((2, 21), (34,), dtype="float32", testname="softmax_fp32_7")
-        self._test_softmax((2, 17), (36,), dtype="float32", testname="softmax_fp32_8")
-        self._test_softmax((1, 64), (128,), dtype="float32", testname="softmax_fp32_9")
-        self._test_softmax((2, 31), (513,), dtype="float32", testname="softmax_fp32_10")
 
 
 if __name__ == "__main__":

From 0be3ddac8d8470986aa11d617390e9be3a2fa519 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Wed, 15 Feb 2023 19:59:53 -0800
Subject: [PATCH 141/638] multiscale attention nn module (#281)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/281

add multiscale nn module and benchmark scripts

Reviewed By: frank-wei, frankgt40

Differential Revision: D43332879

fbshipit-source-id: cbe6ba5a281bf9d79488b19a6acde663b0db9c4c
---
 python/aitemplate/frontend/nn/__init__.py     |   1 +
 .../frontend/nn/multiscale_attention.py       | 724 ++++++++++++++++++
 2 files changed, 725 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/multiscale_attention.py

diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index 0d9c00905..9a3500043 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -28,6 +28,7 @@
 from .view_ops import *
 from .attention import CrossAttention, FlashAttention, MultiheadAttention
 from .identity import Identity
+from .multiscale_attention import MultiScaleBlock
 from .vanilla_attention import (
     vanilla_attention,
     VanillaCrossAttention,
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
new file mode 100644
index 000000000..544f0c5c9
--- /dev/null
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -0,0 +1,724 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Frontend for multi-scale attention module
+AIT implementation for MViT:
+https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/models/vision_transformers.py
+"""
+
+from typing import List, Optional, Tuple
+
+import numpy
+
+from ...compiler import ops
+from ...compiler.ops.common.epilogue import FuncEnum
+from .. import Tensor
+from .conv3d import Conv3d
+from .dropout import Dropout, DropPath
+from .identity import Identity
+from .linear import Linear
+from .module import Module
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+class Mlp(Module):
+    """
+    A MLP block that contains two linear layers with a normalization layer. The MLP
+    block is used in a transformer model after the attention block.
+
+    ::
+
+                         Linear (in_features, hidden_features)
+                                           ↓
+                                 Normalization (act_layer)
+                                           ↓
+                                Dropout (p=dropout_rate)
+                                           ↓
+                         Linear (hidden_features, out_features)
+                                           ↓
+                                Dropout (p=dropout_rate)
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: str = "gelu",
+        dropout_rate: float = 0.0,
+        bias_on: bool = True,
+    ) -> None:
+        """
+        Args:
+            in_features (int): Input feature dimension.
+            hidden_features (Optional[int]): Hidden feature dimension. By default,
+                hidden feature is set to input feature dimension.
+            out_features (Optional[int]): Output feature dimension. By default, output
+                features dimension is set to input feature dimension.
+            act_layer (Callable): Activation layer used after the first linear layer.
+            dropout_rate (float): Dropout rate after each linear layer. Dropout is not used
+                by default.
+        """
+        super().__init__()
+        self.dropout_rate = dropout_rate
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        # TODO fc1 bias is set to zeros; unset if bias_on is True
+
+        self.fc1 = Linear(
+            in_features, hidden_features, bias=True, specialization=act_layer
+        )
+        self.fc2 = Linear(hidden_features, out_features, bias=bias_on)
+
+        if self.dropout_rate > 0.0:
+            self.dropout = Dropout(dropout_rate)
+        else:
+            self.dropout = Identity()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (tensor): Input tensor.
+        """
+        x = self.fc1(x)
+
+        assert self.dropout_rate == 0.0
+
+        if self.dropout_rate > 0.0:
+            x = self.dropout(x)
+
+        x = self.fc2(x)
+
+        if self.dropout_rate > 0.0:
+            x = self.dropout(x)
+
+        return x
+
+
+class _AttentionPool(Module):
+    def __init__(
+        self,
+        pool: Optional[Module],
+        has_cls_embed: bool,
+        norm: Optional[str],
+    ) -> None:
+        """Apply pool to a flattened input (given pool operation and the unflattened shape).
+
+
+                                         Input
+                                           ↓
+                                        Reshape
+                                           ↓
+                                          Pool
+                                           ↓
+                                        Reshape
+                                           ↓
+                                          Norm
+
+
+        Params:
+            pool (Optional[Callable]): Pool operation that is applied to the input tensor.
+                If pool is none, return the input tensor.
+            has_cls_embed (bool): Whether the input tensor contains cls token. Pool
+                operation excludes cls token.
+            norm: (Optional[Callable]): Optional normalization operation applied to
+            tensor after pool.
+        """
+        super().__init__()
+        self.has_pool = pool is not None
+        self.pool = pool if pool is not None else Identity()
+
+        self.has_cls_embed = has_cls_embed
+        if norm is not None:
+            self.norm_before_pool = norm == "BatchNorm3d" or norm == "Identity"
+            self.has_norm = True
+            self.norm = norm
+        else:
+            self.norm_before_pool = False
+            self.has_norm = False
+            self.norm = "Identity"
+
+    def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
+        """
+        Args:
+            tensor (Tensor): Input tensor.
+            thw_shape (List): The shape of the input tensor (before flattening).
+
+        Returns:
+            tensor (Tensor): Input tensor after pool.
+            thw_shape (List[int]): Output tensor shape (before flattening).
+        """
+        if not self.has_pool:
+            return tensor, thw_shape
+
+        assert not self.has_cls_embed
+
+        if self.has_cls_embed:
+            # TODO: enable has_cls_embed
+
+            # cls_tok: Tensor = torch.tensor(0)  # For typing/torchscriptability
+            # if self.has_cls_embed:
+            #    cls_tok, tensor = tensor[:, :, :1, :], tensor[:, :, 1:, :]
+            raise NotImplementedError("Unsupported the input tensor contains cls token")
+
+        # input shape: B, num_heads, seqlen, head_dim
+        B, N, L, C = get_shape(tensor)
+        T, H, W = thw_shape
+        tensor = ops.permute()(
+            ops.reshape()(tensor, [B * N, T, H, W, C]), [0, 4, 1, 2, 3]
+        )
+
+        if self.norm_before_pool:
+            # TODO: add batchnorm3d
+            # # If use BN, we apply norm before pooling instead of after pooling.
+            # tensor = self.norm(tensor)
+            # # We also empirically find that adding a GELU here is beneficial.
+            # tensor = nn.functional.gelu(tensor)
+            raise NotImplementedError(
+                f"Unsupport batchnorm3d when {self.norm_before_pool}"
+            )
+
+        tensor = self.pool(ops.permute()(tensor, [0, 2, 3, 4, 1]))
+
+        shape = get_shape(tensor)
+        thw_shape = [shape[1], shape[2], shape[3]]
+        L_pooled = shape[1] * shape[2] * shape[3]
+        tensor = ops.reshape()(tensor, [B, N, L_pooled, C])
+
+        assert self.norm_before_pool
+        if self.has_norm and not self.norm_before_pool:
+
+            # TODO: add support for norm before pool
+            # tensor = self.norm(tensor)
+
+            raise NotImplementedError("Unsupport norm before pool")
+
+        return tensor, thw_shape
+
+
+class MultiScaleAttention(Module):
+    """
+    Implementation of a multiscale attention block. Compare to a conventional attention
+    block, a multiscale attention block optionally supports pooling (either
+    before or after qkv projection). If pooling is not used, a multiscale attention
+    block is equivalent to a conventional attention block.
+
+    ::
+                                   Input
+                                     |
+                    |----------------|-----------------|
+                    ↓                ↓                 ↓
+                  Linear           Linear            Linear
+                    &                &                 &
+                 Pool (Q)         Pool (K)          Pool (V)
+                    → -------------- ←                 |
+                             ↓                         |
+                       MatMul & Scale                  |
+                             ↓                         |
+                          Softmax                      |
+                             → ----------------------- ←
+                                         ↓
+                                   MatMul & Scale
+                                         ↓
+                                      DropOut
+    """
+
+    _version = 2
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        batch_size: int = 1,
+        qkv_bias: bool = False,
+        dropout_rate: float = 0.0,
+        kernel_q=(1, 1, 1),
+        kernel_kv=(1, 1, 1),
+        stride_q=(1, 1, 1),
+        stride_kv=(1, 1, 1),
+        norm_layer: str = "LayerNorm",
+        has_cls_embed: bool = True,
+        pool_mode: str = "conv",
+        pool_first: bool = False,
+        residual_pool: bool = True,
+        depthwise_conv: bool = True,
+        bias_on: bool = True,
+        separate_qkv: bool = False,
+        max_seq_len: int = 6272,
+    ) -> None:
+        """
+        Args:
+            dim (int): Input feature dimension.
+            num_heads (int): Number of heads in the attention layer.
+            qkv_bias (bool): If set to False, the qkv layer will not learn an additive
+                bias. Default: False.
+            dropout_rate (float): Dropout rate.
+            kernel_q (_size_3_t): Pooling kernel size for q. If both pooling kernel
+                size and pooling stride size are 1 for all the dimensions, pooling is
+                disabled.
+            kernel_kv (_size_3_t): Pooling kernel size for kv. If both pooling kernel
+                size and pooling stride size are 1 for all the dimensions, pooling is
+                disabled.
+            stride_q (_size_3_t): Pooling kernel stride for q.
+            stride_kv (_size_3_t): Pooling kernel stride for kv.
+            norm_layer (Module): Normalization layer used after pooling.
+            has_cls_embed (bool): If set to True, the first token of the input tensor
+                should be a cls token. Otherwise, the input tensor does not contain a
+                cls token. Pooling is not applied to the cls token.
+            pool_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
+                (average pooling), and "max" (max pooling).
+            pool_first (bool): If set to True, pool is applied before qkv projection.
+                Otherwise, pool is applied after qkv projection. Default: False.
+            residual_pool (bool): If set to True, use Improved Multiscale Vision
+                Transformer's pooling residual connection.
+            depthwise_conv (bool): Whether use depthwise or full convolution for pooling.
+            bias_on (bool): Whether use biases for linear layers.
+            separate_qkv (bool): Whether to use separate or one layer for qkv projections.
+        """
+
+        super().__init__()
+        assert pool_mode in ["conv", "avg", "max"]
+
+        self.pool_first = pool_first
+        self.dropout_rate = dropout_rate
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.has_cls_embed = has_cls_embed
+        self.residual_pool = residual_pool
+        self.separate_qkv = separate_qkv
+        self.max_seq_len = max_seq_len
+        padding_q = [int(q // 2) for q in kernel_q]
+        padding_kv = [int(kv // 2) for kv in kernel_kv]
+
+        # Set placeholders for torchscriptability, may not be actually used
+        self.q = self.k = self.v = self.qkv = Identity()
+        if self.pool_first or self.separate_qkv:
+            self.q = Linear(dim, dim, bias=qkv_bias)
+            self.k = Linear(dim, dim, bias=qkv_bias)
+            self.v = Linear(dim, dim, bias=qkv_bias)
+        else:
+            self.qkv = Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = Linear(dim, dim, bias=True if bias_on else False)
+
+        assert dropout_rate == 0.0
+        if dropout_rate > 0.0:
+            self.proj_drop = Dropout(dropout_rate)
+        else:
+            self.proj_drop = Identity()
+
+        # Skip pooling with kernel and stride size of (1, 1, 1).
+        if (
+            kernel_q is not None
+            and self._prod(kernel_q) == 1
+            and self._prod(stride_q) == 1
+        ):
+            kernel_q = None
+        if (
+            kernel_kv is not None
+            and self._prod(kernel_kv) == 1
+            and self._prod(stride_kv) == 1
+        ):
+            kernel_kv = None
+
+        if pool_mode in ["max", "avg"]:
+            raise NotImplementedError(f"Unsupported input dimension {pool_mode}")
+
+        ## TODO: add pool mode support for {"max", "avg"}
+
+        elif pool_mode == "conv":
+
+            self.pool_q = (
+                Conv3d(
+                    head_dim,
+                    head_dim,
+                    kernel_q,
+                    stride=stride_q,
+                    padding=padding_q,
+                    groups=head_dim if depthwise_conv else 1,
+                    bias=False,
+                )
+                if kernel_q is not None
+                else None
+            )
+
+            self.norm_q = norm_layer if kernel_q is not None else None
+            self.pool_k = (
+                Conv3d(
+                    head_dim,
+                    head_dim,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=head_dim if depthwise_conv else 1,
+                    bias=False,
+                )
+                if kernel_kv is not None
+                else None
+            )
+            self.norm_k = norm_layer if kernel_kv is not None else None
+            self.pool_v = (
+                Conv3d(
+                    head_dim,
+                    head_dim,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=head_dim if depthwise_conv else 1,
+                    bias=False,
+                )
+                if kernel_kv is not None
+                else None
+            )
+
+            self.norm_v = norm_layer if kernel_kv is not None else None
+        else:
+            raise NotImplementedError(f"Unsupported model {pool_mode}")
+
+        # Will not be used if `separate_qkv == True`
+        self._attention_pool_q = _AttentionPool(
+            self.pool_q,
+            has_cls_embed=self.has_cls_embed,
+            norm=self.norm_q if hasattr(self, "norm_q") else None,
+        )
+        self._attention_pool_k = _AttentionPool(
+            self.pool_k,
+            has_cls_embed=self.has_cls_embed,
+            norm=self.norm_k if hasattr(self, "norm_k") else None,
+        )
+        self._attention_pool_v = _AttentionPool(
+            self.pool_v,
+            has_cls_embed=self.has_cls_embed,
+            norm=self.norm_v if hasattr(self, "norm_v") else None,
+        )
+
+    def _qkv_proj(
+        self,
+        q: Tensor,
+        q_size: int,
+        k: Tensor,
+        k_size: int,
+        v: Tensor,
+        v_size: int,
+        batch_size: int,
+        chan_size: int,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        q = ops.permute()(
+            ops.reshape()(
+                self.q(q)[
+                    batch_size, q_size, self.num_heads, chan_size // self.num_heads
+                ]
+            ),
+            [0, 2, 1, 3],
+        )
+        k = ops.permute()(
+            ops.reshape()(
+                self.k(k)[
+                    batch_size, k_size, self.num_heads, chan_size // self.num_heads
+                ]
+            ),
+            [0, 2, 1, 3],
+        )
+        v = ops.permute()(
+            ops.reshape()(
+                self.v(v)[
+                    batch_size, v_size, self.num_heads, chan_size // self.num_heads
+                ]
+            ),
+            [0, 2, 1, 3],
+        )
+        return q, k, v
+
+    def _qkv_pool(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        thw_shape: List[int],
+    ) -> Tuple[Tensor, List[int], Tensor, List[int], Tensor, List[int]]:
+        q, q_shape = self._attention_pool_q(q, thw_shape)
+        k, k_shape = self._attention_pool_k(k, thw_shape)
+        v, v_shape = self._attention_pool_v(v, thw_shape)
+        return q, q_shape, k, k_shape, v, v_shape
+
+    def _get_qkv_length(
+        self,
+        q_shape: List[int],
+        k_shape: List[int],
+        v_shape: List[int],
+    ) -> Tuple[int, int, int]:
+        q_N = self._prod(q_shape) + 1 if self.has_cls_embed else self._prod(q_shape)
+        k_N = self._prod(k_shape) + 1 if self.has_cls_embed else self._prod(k_shape)
+        v_N = self._prod(v_shape) + 1 if self.has_cls_embed else self._prod(v_shape)
+        return q_N, k_N, v_N
+
+    def _prod(self, shape: List[int]) -> int:
+        """Torchscriptable version of `numpy.prod`. Note that `_prod([]) == 1`"""
+        p: int = 1
+        for dim in shape:
+            p *= dim
+        return p
+
+        def _reshape_qkv_to_seq(
+            self,
+            q: Tensor,
+            k: Tensor,
+            v: Tensor,
+            q_N: int,
+            v_N: int,
+            k_N: int,
+            B: int,
+            C: int,
+        ) -> Tuple[Tensor, Tensor, Tensor]:
+            q = q.permute(0, 2, 1, 3).reshape(B, q_N, C)
+            v = v.permute(0, 2, 1, 3).reshape(B, v_N, C)
+            k = k.permute(0, 2, 1, 3).reshape(B, k_N, C)
+            return q, k, v
+
+    def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
+        """
+        Args:
+            x (Tensor): Input tensor.
+            thw_shape (List): The shape of the input tensor (before flattening).
+        """
+
+        B, N, C = get_shape(x)
+        if self.pool_first:
+            x = ops.reshape()(x, [B, N, self.num_heads, C // self.num_heads])
+            x = ops.permute()(x, [0, 2, 1, 3])
+            q = k = v = x
+            pass
+            q, q_shape, k, k_shape, v, v_shape = self._qkv_pool(q, k, v, thw_shape)
+            q_N, k_N, v_N = self._get_qkv_length(q_shape, k_shape, v_shape)
+            q, k, v = self._reshape_qkv_to_seq(q, k, v, q_N, v_N, k_N, B, C)
+            q, k, v = self._qkv_proj(q, q_N, k, k_N, v, v_N, B, C)
+        else:
+
+            if self.separate_qkv:
+                q = k = v = x
+                pass
+                # TODO: implement when separate_qkv
+                # q, k, v = self._qkv_proj(q, N, k, N, v, N, B, C)
+            else:
+                # compute q, k, v and perform pooling
+                qkv = ops.permute()(
+                    ops.reshape()(self.qkv(x), [B, N, 3, self.num_heads, -1]),
+                    [2, 0, 3, 1, 4],
+                )
+                # input shape: 3, B, num_heads, seqlen, head_dim
+                shape = get_shape(qkv)
+                # obtain q, k, v from qkv
+                qkv = ops.reshape()(qkv, [3 * B, self.num_heads, N, shape[-1]])
+                (q, k, v) = ops.split()(qkv, B, dim=0)
+            q, q_thw_shape, k, k_thw_shape, v, v_thw_shape = self._qkv_pool(
+                q, k, v, thw_shape
+            )
+
+        # attention
+        # q, k, v shape: B, num_heads, seqlen, head_dim
+        q_shape, k_shape, v_shape = get_shape(q), get_shape(k), get_shape(v)
+
+        q = ops.reshape()(q, [q_shape[0], -1, q_shape[-1]])
+        k = ops.reshape()(k, [k_shape[0], -1, k_shape[-1]])
+        v = ops.reshape()(v, [v_shape[0], -1, v_shape[-1]])
+        qk = ops.bmm_rcr()(q, k)
+        attn = ops.elementwise(FuncEnum.MUL)(qk, self.scale)
+        attn = ops.softmax()(attn, -1)
+        score = ops.bmm_rrr()(attn, v)
+
+        if self.residual_pool:
+            score = ops.elementwise(FuncEnum.ADD)(score, q)
+
+        score = ops.reshape()(
+            ops.permute()(
+                ops.reshape()(score, [B, self.num_heads, q_shape[-2], -1]), [0, 2, 1, 3]
+            ),
+            [B, q_shape[-2], -1],
+        )
+
+        score = self.proj(score)
+        assert self.dropout_rate == 0.0
+        if self.dropout_rate > 0.0:
+            score = self.proj_drop(score)
+
+        return score, q_thw_shape
+
+
+class MultiScaleBlock(Module):
+    """
+    Implementation of a multiscale vision transformer block. Each block contains a
+    multiscale attention layer and a Mlp layer.
+
+    ::
+
+
+                                      Input
+                                        |-------------------+
+                                        ↓                   |
+                                       Norm                 |
+                                        ↓                   |
+                                MultiScaleAttention        Pool
+                                        ↓                   |
+                                     DropPath               |
+                                        ↓                   |
+                                    Summation ←-------------+
+                                        |
+                                        |-------------------+
+                                        ↓                   |
+                                       Norm                 |
+                                        ↓                   |
+                                       Mlp                 Proj
+                                        ↓                   |
+                                     DropPath               |
+                                        ↓                   |
+                                    Summation  ←------------+
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: int,
+        num_heads: int,
+        seq_len: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        dropout_rate: float = 0.0,
+        droppath_rate: float = 0.0,
+        act_layer: str = "gelu",
+        norm_layer: str = "LayerNorm",
+        attn_norm_layer: str = "LayerNorm",
+        kernel_q=(1, 1, 1),
+        kernel_kv=(1, 1, 1),
+        stride_q=(1, 1, 1),
+        stride_kv=(1, 1, 1),
+        pool_mode: str = "conv",
+        has_cls_embed: bool = True,
+        pool_first: bool = False,
+        residual_pool: bool = False,
+        depthwise_conv: bool = True,
+        bias_on: bool = True,
+        separate_qkv: bool = False,
+    ) -> None:
+        """
+        Args:
+            dim (int): Input feature dimension.
+            dim_out (int): Output feature dimension.
+            num_heads (int): Number of heads in the attention layer.
+            mlp_ratio (float): Mlp ratio which controls the feature dimension in the
+                hidden layer of the Mlp block.
+            qkv_bias (bool): If set to False, the qkv layer will not learn an additive
+                bias. Default: False.
+            dropout_rate (float): DropOut rate. If set to 0, DropOut is disabled.
+            droppath_rate (float): DropPath rate. If set to 0, DropPath is disabled.
+            act_layer (Module): Activation layer used in the Mlp layer.
+            norm_layer (Module): Normalization layer.
+            attn_norm_layer (Module): Normalization layer in the attention module.
+            kernel_q (_size_3_t): Pooling kernel size for q. If pooling kernel size is
+                1 for all the dimensions, pooling is not used (by default).
+            kernel_kv (_size_3_t): Pooling kernel size for kv. If pooling kernel size
+                is 1 for all the dimensions, pooling is not used. By default, pooling
+                is disabled.
+            stride_q (_size_3_t): Pooling kernel stride for q.
+            stride_kv (_size_3_t): Pooling kernel stride for kv.
+            pool_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
+                (average pooling), and "max" (max pooling).
+            has_cls_embed (bool): If set to True, the first token of the input tensor
+                should be a cls token. Otherwise, the input tensor does not contain a
+                cls token. Pooling is not applied to the cls token.
+            pool_first (bool): If set to True, pool is applied before qkv projection.
+                Otherwise, pool is applied after qkv projection. Default: False.
+            residual_pool (bool): If set to True, use Improved Multiscale Vision
+                Transformer's pooling residual connection.
+            depthwise_conv (bool): Whether use depthwise or full convolution for pooling.
+            bias_on (bool): Whether use biases for linear layers.
+            separate_qkv (bool): Whether to use separate or one layer for qkv projections.
+        """
+        super().__init__()
+        self.dim = dim
+        self.dim_out = dim_out
+        self.norm1 = norm_layer
+        stride_skip = stride_q
+        self.attn = MultiScaleAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            dropout_rate=dropout_rate,
+            kernel_q=kernel_q,
+            kernel_kv=kernel_kv,
+            stride_q=stride_q,
+            stride_kv=stride_kv,
+            norm_layer=attn_norm_layer,
+            has_cls_embed=has_cls_embed,
+            pool_mode=pool_mode,
+            pool_first=pool_first,
+            residual_pool=residual_pool,
+            bias_on=bias_on,
+            depthwise_conv=depthwise_conv,
+            separate_qkv=separate_qkv,
+            max_seq_len=seq_len,
+        )
+        assert droppath_rate == 0.0
+        self.drop_path = DropPath(droppath_rate) if droppath_rate > 0.0 else Identity()
+
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.has_cls_embed = has_cls_embed
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            out_features=dim_out,
+            act_layer=act_layer,
+            dropout_rate=dropout_rate,
+            bias_on=bias_on,
+        )
+
+        # TODO: Add maxpool3d
+        assert numpy.prod(stride_skip) == 1
+        self.pool_skip = None
+        self._attention_pool = _AttentionPool(
+            self.pool_skip, has_cls_embed=self.has_cls_embed, norm=None
+        )
+
+    def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
+        """
+        Args:
+            x (Tensor): Input tensor.
+            thw_shape (List): The shape of the input tensor (before flattening).
+        """
+
+        x = ops.permute021()(x)
+
+        # TODO: ADD/Fuse batchnorm1d
+
+        x = ops.permute021()(x)
+        x_block, thw_shape_new = self.attn(x, thw_shape)
+
+        x_res, _ = self._attention_pool(x, thw_shape)
+        x = x_res + self.drop_path(x_block)
+
+        # TODO: batchnorm 1d
+
+        x_norm = x
+        x_mlp = self.mlp(x_norm)
+        if self.dim != self.dim_out:
+            x = self.proj(x_norm)
+        x = x + self.drop_path(x_mlp)
+
+        return x, thw_shape_new

From 3b42e84c79b037a4c21b48bdf363e611e246a925 Mon Sep 17 00:00:00 2001
From: matteo serva <matteo.serva@gmail.com>
Date: Wed, 15 Feb 2023 21:17:18 -0800
Subject: [PATCH 142/638] fix generator passing to schedulers (#212)

Summary:
This fix allows passing the generator to schedulers that support it.
Without the fix, schedulers like EulerAncestral will always use a random seed even if the user wanted to use a fixed one.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/212

Reviewed By: terrychenism

Differential Revision: D43175342

Pulled By: ipiszy

fbshipit-source-id: 2a673b7934f10ee3310d7eca8aec2852fc0935bf
---
 .../05_stable_diffusion/src/pipeline_stable_diffusion_ait.py  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 9e5f1e5c6..1043d5372 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -347,6 +347,10 @@ def __call__(
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
+            # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
 
         for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             # expand the latents if we are doing classifier free guidance

From 4e2479f9af160a6fb1722b4847ecf192ec021187 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 15 Feb 2023 21:35:50 -0800
Subject: [PATCH 143/638] fix flaky tests (#282)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/282

fix random seed so as not to accidentally exceed tolerance threshold during stress tests

Reviewed By: khabinov, wushirong

Differential Revision: D43335333

fbshipit-source-id: b186739651d8a8665a79daad3a3351ffd8d3588b
---
 examples/05_stable_diffusion/src/test_correctness.py | 4 ++++
 tests/unittest/ops/test_bmm_add.py                   | 4 ++++
 tests/unittest/ops/test_conv_bias.py                 | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/examples/05_stable_diffusion/src/test_correctness.py b/examples/05_stable_diffusion/src/test_correctness.py
index ab872149e..cc6a27852 100644
--- a/examples/05_stable_diffusion/src/test_correctness.py
+++ b/examples/05_stable_diffusion/src/test_correctness.py
@@ -31,6 +31,10 @@
 
 
 class StableDiffusionVerification(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def __init__(self, *args, **kwargs):
         super(StableDiffusionVerification, self).__init__(*args, **kwargs)
 
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index d1d391353..1bd77cd82 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -27,6 +27,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMAddTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def __init__(self, *args, **kwargs):
         super(BMMAddTestCase, self).__init__(*args, **kwargs)
         self.test_count = 0
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index 3f4e61e0f..94d541961 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -23,6 +23,10 @@
 
 
 class ConvBiasTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(1)
+
     def _test_conv_bias(
         self,
         batch=4,

From f0b9187dea47f9188364decc3c1603235c29ce8b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 16 Feb 2023 02:50:34 -0800
Subject: [PATCH 144/638] Fix mixed-rank read_t in fused elementwise (#280)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/280

In the `fused_elementwise` backend (`elementwise_common.py`), `num_rightmost_non_broadcast_elements` in the function `_get_types_and_sizes` is initialized to `len(input_shape)`. If the `extended_input_shape` is longer than the `input_shape`, [these lines](https://github.com/facebookincubator/AITemplate/blob/main/python/aitemplate/backend/common/elementwise_common.py#L388-L390) decrements more than necessary:

```
            for i in reversed(range(len(extended_input_shape))):
                if extended_input_shape[i] != output_shape[i]:
                    num_rightmost_non_broadcast_elements -= i + 1
```

As a consequence, even when the last dim of an elementwise input with a lower rank than the output shape allows vectorization (i.e., using `uint4` as a `read_t`), it doesn't happen and the values are read / written to GMEM in `half`s. As `fused_elementwise` is I/O-bound, this may cause inefficiency.

Initializing `num_rightmost_non_broadcast_elements` to `len(output_shape)` resolves the issue, as in both the following `if` and `else` branches it is processed correctly.

Reviewed By: ipiszy, tenpercent

Differential Revision: D43331365

fbshipit-source-id: d74ebd2c15f0b976a420bf0536467d0cb5fda6d1
---
 python/aitemplate/backend/common/elementwise_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 632f2ef4c..1d8b230a5 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -377,7 +377,7 @@ def _get_types_and_sizes(
                     input_shape, output_shape
                 )
             )
-        num_rightmost_non_broadcast_elements = len(input_shape)
+        num_rightmost_non_broadcast_elements = len(output_shape)
         extended_input_shape = list(input_shape)
         if input_shape == output_shape:
             input_broadcast_sizes.append(None)

From d65e824bf214344ddd1d441db7fbd557482f86fc Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Fri, 17 Feb 2023 01:18:20 +0800
Subject: [PATCH 145/638] fix timeout bugs

---
 python/aitemplate/backend/profiler_runner.py    | 7 ++++---
 python/aitemplate/compiler/ops/conv/conv2d.py   | 2 +-
 python/aitemplate/compiler/ops/conv/conv3d.py   | 2 +-
 python/aitemplate/compiler/transform/profile.py | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 2be76a39a..feccf63b9 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -307,7 +307,7 @@ def callback_when_done(fut):
                     )
             finally:
                 # unblock one future in `join()`
-                if stdout is not None:
+                if not err:
                     self._done_queue.put(stdout)
 
         future.add_done_callback(callback_when_done)
@@ -322,7 +322,8 @@ def join(self):
             f.cancel()
         # block until each done_callback completes,
         # or raise Empty exception after 3 minutes of waiting
-        block_timeout = None if Target.current().name() == "rocm" else 180
+        block_timeout = self._timeout if Target.current().name() == "rocm" else 180
         for _ in self._futures:
-            self._done_queue.get(timeout=block_timeout)
+            if not self._done_queue.empty():
+                self._done_queue.get(timeout=block_timeout)
         self._postprocessing_delegate.postprocess_results()
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 6bb9447f6..26482afa2 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -466,7 +466,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
             )
         if target.name() == "rocm":
             runner = backend.profiler_runner.Runner(
-                devices, self._attrs["name"], timeout=None
+                devices, self._attrs["name"], timeout=1800
             )
             op_type = self._attrs["op"]
             all_op_names = list(self._attrs["op_instance"].keys())
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 4341cfa64..b48bb9700 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -477,7 +477,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
             all_op_names = list(self._attrs["op_instance"].keys())
             for op_name in all_op_names:
                 runner = backend.profiler_runner.Runner(
-                    devices, self._attrs["name"], timeout=None
+                    devices, self._attrs["name"], timeout=1800
                 )
                 x_shape = self._invert_exec_key(exec_key)
                 command = self._gen_profile_cmd(profiler_prefix, op_name, x_shape)
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 33edf2a36..5b42a6bd0 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -106,7 +106,7 @@ def profile(
             workdir=profiler_dir,
             devices=devices,
         )
-    timeout = None if Target.current().name() == "rocm" else 240
+    timeout = 2400 if Target.current().name() == "rocm" else 240
     profiler_runner = ProfilerRunner(
         devices,
         timeout=timeout,

From 057ef66ab41de24b7c641af07150cc2b89c50ade Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 16 Feb 2023 09:20:53 -0800
Subject: [PATCH 146/638] correctly invoking force_profiler_cache (#289)

Summary:
There was a mistake by calling force_to_use_cache, which didn't exist. Fixed it by invoking force_profiler_cache correctly.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/289

Reviewed By: hl475

Differential Revision: D43345002

Pulled By: chenyang78

fbshipit-source-id: 7c44d9b28cd4595436a21ffb2665e9bc0aee7594
---
 python/aitemplate/compiler/ops/conv/conv3d_bias.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/ops/conv/conv3d_bias.py b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
index 7d3e827e3..2fb0450b8 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
@@ -303,7 +303,7 @@ def _should_build_profiler(self) -> bool:
         entry for this gemm instance, we update this gemm op's
         relevant attributes with the cached result and return False.
         """
-        force_cache = environ.force_to_use_cache()
+        force_cache = environ.force_profiler_cache()
         if self._has_dynamic_input_dims():
             if force_cache:
                 raise RuntimeError(
@@ -579,7 +579,7 @@ def _profile_static(self, workdir, devices):
             func = registry.get(func_key)
             func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
 
-        force_cache = environ.force_to_use_cache()
+        force_cache = environ.force_profiler_cache()
         for wkl in workloads:
             _LOGGER.info(
                 "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),

From c168196aa4d3df7768f3d640b50c0301e8f6fbb7 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 16 Feb 2023 10:30:33 -0800
Subject: [PATCH 147/638] test attention with bfloat16 inputs (#277)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/277

att

Reviewed By: chenyang78

Differential Revision: D43325122

fbshipit-source-id: c586a13dda5c68baa2db3b9e5e755a72c7eb68d7
---
 tests/unittest/ops/test_attention.py | 201 +++++++++++++++++----------
 1 file changed, 125 insertions(+), 76 deletions(-)

diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 2d4a7f72d..f6a3bec08 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -144,7 +144,6 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class attentionTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -289,17 +288,17 @@ def _test_flash_attention(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_flash_attention(self):
-        if detect_target().name() == "cuda":
-            self._test_flash_attention(
-                test_name="flash_attention_fp16",
-                dtype="float16",
-            )
-            self._test_flash_attention(
-                test_name="flash_attention_fp16_copy_op",
-                copy_op=True,
-                dtype="float16",
-            )
+        self._test_flash_attention(
+            test_name="flash_attention_fp16",
+            dtype="float16",
+        )
+        self._test_flash_attention(
+            test_name="flash_attention_fp16_copy_op",
+            copy_op=True,
+            dtype="float16",
+        )
 
     def _test_attention(
         self,
@@ -392,12 +391,12 @@ def _test_attention(
             )
             _LOGGER.info(f"benchmark compiler model time: {time_per_iter_ms}")
 
-    def test_attention(self):
-        if detect_target().name() == "rocm":
-            self._test_attention(
-                test_name="attention_fp16",
-                dtype="float16",
-            )
+    @unittest.skipIf(detect_target().name() == "cuda", "Not supported by CUDA.")
+    def test_rocm_attention(self):
+        self._test_attention(
+            test_name="attention_fp16",
+            dtype="float16",
+        )
 
     def _test_mem_eff_attention(
         self,
@@ -415,6 +414,8 @@ def _test_mem_eff_attention(
         benchmark_pt=False,
         copy_op=False,
         use_perm=True,
+        atol=1e-3,
+        rtol=1e-3,
     ):
         torch_dtype = string_to_torch_dtype(dtype)
         d = n // nheads
@@ -541,7 +542,7 @@ def _test_mem_eff_attention(
             )
             _LOGGER.info(f"benchmark eff-mem-attn time: {time_per_iter_ms}")
 
-        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
         if benchmark_pt:
             from aitemplate.testing.benchmark_pt import benchmark_torch_function
@@ -559,44 +560,67 @@ def _test_mem_eff_attention(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_mem_eff_attention_fp16(self):
-        if detect_target().name() == "cuda":
-            for use_perm in [False, True]:
-                self._test_mem_eff_attention(
-                    use_perm=use_perm,
-                    test_name=f"mem_eff_attention_fp16_{use_perm}",
-                    dtype="float16",
-                )
-                self._test_mem_eff_attention(
-                    use_perm=use_perm,
-                    causal=True,
-                    test_name=f"mem_eff_attention_fp16_{use_perm}_causal",
-                    dtype="float16",
-                )
-                # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
-                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
-                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
-                # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
-                # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
+        for use_perm in [False, True]:
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                test_name=f"mem_eff_attention_fp16_{use_perm}",
+                dtype="float16",
+            )
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                causal=True,
+                test_name=f"mem_eff_attention_fp16_{use_perm}_causal",
+                dtype="float16",
+            )
+            # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
+            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
+            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
+            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
+            # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     @unittest.skipIf(
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
     def test_mem_eff_attention_fp32(self):
-        if detect_target().name() == "cuda":
-            for use_perm in [False, True]:
-                self._test_mem_eff_attention(
-                    use_perm=use_perm,
-                    test_name=f"mem_eff_attention_fp32_{use_perm}",
-                    dtype="float32",
-                )
-                self._test_mem_eff_attention(
-                    use_perm=use_perm,
-                    causal=True,
-                    test_name=f"mem_eff_attention_fp32_{use_perm}_causal",
-                    dtype="float32",
-                )
+        for use_perm in [False, True]:
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                test_name=f"mem_eff_attention_fp32_{use_perm}",
+                dtype="float32",
+            )
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                causal=True,
+                test_name=f"mem_eff_attention_fp32_{use_perm}_causal",
+                dtype="float32",
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_mem_eff_attention_bf16(self):
+        for use_perm in [False, True]:
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                test_name=f"mem_eff_attention_bf16_{use_perm}",
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+            )
+            self._test_mem_eff_attention(
+                use_perm=use_perm,
+                causal=True,
+                test_name=f"mem_eff_attention_bf16_{use_perm}_causal",
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+            )
 
     def _test_cross_attention(
         self,
@@ -615,6 +639,8 @@ def _test_cross_attention(
         benchmark_ait=False,
         benchmark_pt=False,
         copy_op=False,
+        atol=1e-3,
+        rtol=1e-3,
     ):
         torch_dtype = string_to_torch_dtype(dtype)
 
@@ -710,41 +736,64 @@ def _test_cross_attention(
             )
             _LOGGER.info(f"benchmark cross-attn time: {time_per_iter_ms}")
 
-        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_cross_attention_fp16(self):
-        if detect_target().name() == "cuda":
-            self._test_cross_attention(
-                test_name="cross_attention_fp16",
-                dtype="float16",
-            )
-            self._test_cross_attention(
-                seqlen=1024,
-                seqlen_kv=768,
-                head_size=64,
-                head_size_v=64,
-                test_name="cross_attention2_fp16",
-                dtype="float16",
-            )
+        self._test_cross_attention(
+            test_name="cross_attention_fp16",
+            dtype="float16",
+        )
+        self._test_cross_attention(
+            seqlen=1024,
+            seqlen_kv=768,
+            head_size=64,
+            head_size_v=64,
+            test_name="cross_attention2_fp16",
+            dtype="float16",
+        )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     @unittest.skipIf(
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
     def test_cross_attention_fp32(self):
-        if detect_target().name() == "cuda":
-            self._test_cross_attention(
-                test_name="cross_attention_fp32",
-                dtype="float32",
-            )
-            self._test_cross_attention(
-                seqlen=1024,
-                seqlen_kv=768,
-                head_size=64,
-                head_size_v=64,
-                test_name="cross_attention2_fp32",
-                dtype="float32",
-            )
+        self._test_cross_attention(
+            test_name="cross_attention_fp32",
+            dtype="float32",
+        )
+        self._test_cross_attention(
+            seqlen=1024,
+            seqlen_kv=768,
+            head_size=64,
+            head_size_v=64,
+            test_name="cross_attention2_fp32",
+            dtype="float32",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_cross_attention_bf16(self):
+        self._test_cross_attention(
+            test_name="cross_attention_bf16",
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
+        self._test_cross_attention(
+            seqlen=1024,
+            seqlen_kv=768,
+            head_size=64,
+            head_size_v=64,
+            test_name="cross_attention2_bf16",
+            dtype="bfloat16",
+            atol=1e-2,
+            rtol=1e-2,
+        )
 
 
 if __name__ == "__main__":

From 5667d8d3d51aacd261a2c03954c353d8ae672506 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 16 Feb 2023 10:40:26 -0800
Subject: [PATCH 148/638] test argmax with bfloat16 inputs (#278)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/278

att

Reviewed By: chenyang78

Differential Revision: D43327784

fbshipit-source-id: b7faaeca68f7ccee2cb0b6c03e2a0ed9e6007f40
---
 .../backend/common/tensor/argmax_common.py    |  2 +-
 .../aitemplate/backend/cuda/tensor/argmax.py  | 26 +++++++
 tests/unittest/ops/test_argmax.py             |  2 +-
 tests/unittest/ops/test_argmax_sm80.py        | 77 +++++++++++++++++++
 4 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 tests/unittest/ops/test_argmax_sm80.py

diff --git a/python/aitemplate/backend/common/tensor/argmax_common.py b/python/aitemplate/backend/common/tensor/argmax_common.py
index acdffd824..2bc8b0038 100644
--- a/python/aitemplate/backend/common/tensor/argmax_common.py
+++ b/python/aitemplate/backend/common/tensor/argmax_common.py
@@ -260,7 +260,7 @@ class MultiplyFunctor final {
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(int64_t* output,
-                   const {{dtype}}* input,
+                   const void* input,
                    const {{index_type}} elem_cnt,
                    const {{index_type}} instance_size,
                    const {{index_type}} instance_num,
diff --git a/python/aitemplate/backend/cuda/tensor/argmax.py b/python/aitemplate/backend/cuda/tensor/argmax.py
index 9f38f2102..9f82e584d 100644
--- a/python/aitemplate/backend/cuda/tensor/argmax.py
+++ b/python/aitemplate/backend/cuda/tensor/argmax.py
@@ -26,9 +26,35 @@
 
 header_files = """
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include <cub/cub.cuh>
+
+using bfloat16 = nv_bfloat16;
+
+namespace cub {
+    template <>
+    struct FpLimits<bfloat16>
+    {
+        static __host__ __device__ __forceinline__ bfloat16 Max() {
+            unsigned short max_word = 0x7F7F;
+            return reinterpret_cast<bfloat16&>(max_word);
+        }
+
+        static __host__ __device__ __forceinline__ bfloat16 Lowest() {
+            unsigned short lowest_word = 0xFF7F;
+            return reinterpret_cast<bfloat16&>(lowest_word);
+        }
+    };
+
+    template <> struct NumericTraits<bfloat16>
+      : BaseTraits<FLOATING_POINT, true, false, unsigned short, bfloat16> {};
+
+    template<> struct Traits<bfloat16>
+      : NumericTraits<bfloat16> {};
+}
+
 """
 
 
diff --git a/tests/unittest/ops/test_argmax.py b/tests/unittest/ops/test_argmax.py
index 4c47a5f97..3aa93c604 100644
--- a/tests/unittest/ops/test_argmax.py
+++ b/tests/unittest/ops/test_argmax.py
@@ -24,7 +24,7 @@
 from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
-class argmaxTestCase(unittest.TestCase):
+class ArgmaxTestCase(unittest.TestCase):
     def _test_argmax(
         self,
         batch_size=1,
diff --git a/tests/unittest/ops/test_argmax_sm80.py b/tests/unittest/ops/test_argmax_sm80.py
new file mode 100644
index 000000000..aeb4e498c
--- /dev/null
+++ b/tests/unittest/ops/test_argmax_sm80.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for argmax Operator.
+"""
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+class ArgmaxSM80TestCase(unittest.TestCase):
+    def _test_argmax(
+        self,
+        batch_size=1,
+        shape=(2, 6),
+        dim=0,
+        test_name="argmax",
+        copy_op=False,
+        dtype="float16",
+    ):
+        o_shape = list(shape)[:-1]
+
+        X1 = Tensor(
+            shape=shape,
+            dtype=dtype,
+            name="X",
+            is_input=True,
+        )
+        X4_op = ops.argmax(dim=dim)
+        X4 = X4_op(X1)
+        X4._attrs["is_output"] = True
+        X4._attrs["name"] = "output"
+
+        target = detect_target()
+        module = compile_model(X4, target, "./tmp", test_name)
+
+        scores = get_random_torch_tensor(shape, dtype=dtype)
+        y_pt = torch.argmax(scores, dim=dim)
+        y = torch.empty_like(y_pt, dtype=torch.int64)
+
+        module.run_with_tensors([scores], [y])
+        y_reshape = y.reshape(o_shape)
+        torch.testing.assert_close(y_pt, y_reshape, atol=0, rtol=0)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "bfloat16 not supported in ROCm")
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "bfloat16 is not supported by CUDA < SM80.",
+    )
+    def test_argmax_bf16(self):
+        self._test_argmax(
+            shape=(300, 80),
+            dim=1,
+            test_name="argmax_bf16",
+            dtype="bfloat16",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(1024)
+    unittest.main()

From 9fd39af53571a3249ca55bfd0d1abc597a73a6f7 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Thu, 16 Feb 2023 11:59:21 -0800
Subject: [PATCH 149/638] Speed up multiscale attention (#293)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/293

use fmha for attention
eager:
INFO:__main__:multiscale pt: batch_size: 128, 63.3606591796875 ms

no-optimized AIT:
INFO:__main__:multiscale ait: batch_size: 128, 39.77083110809326 ms

optimized AIT:
INFO:__main__:multiscale ait: batch_size: 128, 30.06564426422119 ms

Reviewed By: henryhu6, frankgt40

Differential Revision: D43364600

fbshipit-source-id: 11cfdac6d4b3cb58534392fe5bad4ee4793aed8c
---
 .../frontend/nn/multiscale_attention.py       | 26 +++----------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 544f0c5c9..a095408f7 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -301,7 +301,6 @@ def __init__(
         self.num_heads = num_heads
         head_dim = dim // num_heads
         self.scale = head_dim**-0.5
-
         self.has_cls_embed = has_cls_embed
         self.residual_pool = residual_pool
         self.separate_qkv = separate_qkv
@@ -533,27 +532,13 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
             )
 
         # attention
-        # q, k, v shape: B, num_heads, seqlen, head_dim
-        q_shape, k_shape, v_shape = get_shape(q), get_shape(k), get_shape(v)
-
-        q = ops.reshape()(q, [q_shape[0], -1, q_shape[-1]])
-        k = ops.reshape()(k, [k_shape[0], -1, k_shape[-1]])
-        v = ops.reshape()(v, [v_shape[0], -1, v_shape[-1]])
-        qk = ops.bmm_rcr()(q, k)
-        attn = ops.elementwise(FuncEnum.MUL)(qk, self.scale)
-        attn = ops.softmax()(attn, -1)
-        score = ops.bmm_rrr()(attn, v)
+        B, num_heads, seqlen, head_dim = get_shape(q)
+        score = ops.mem_eff_attention(causal=False)(q, k, v)
+        score = ops.reshape()(score, [B, seqlen, -1])
 
         if self.residual_pool:
             score = ops.elementwise(FuncEnum.ADD)(score, q)
 
-        score = ops.reshape()(
-            ops.permute()(
-                ops.reshape()(score, [B, self.num_heads, q_shape[-2], -1]), [0, 2, 1, 3]
-            ),
-            [B, q_shape[-2], -1],
-        )
-
         score = self.proj(score)
         assert self.dropout_rate == 0.0
         if self.dropout_rate > 0.0:
@@ -703,11 +688,6 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
             thw_shape (List): The shape of the input tensor (before flattening).
         """
 
-        x = ops.permute021()(x)
-
-        # TODO: ADD/Fuse batchnorm1d
-
-        x = ops.permute021()(x)
         x_block, thw_shape_new = self.attn(x, thw_shape)
 
         x_res, _ = self._attention_pool(x, thw_shape)

From 231dcc6935b72d2570c11aa892904bbcab62524b Mon Sep 17 00:00:00 2001
From: generatedunixname89002005367269 <generatedunixname89002005367269@fb.com>
Date: Fri, 17 Feb 2023 05:43:09 -0800
Subject: [PATCH 150/638] Daily `arc lint --take BLACK`

Reviewed By: 0x1eaf

Differential Revision: D43390280

fbshipit-source-id: b9a35ddbdd62c66b7e17b98b9be28d0acf0df8f7
---
 .../05_stable_diffusion/src/pipeline_stable_diffusion_ait.py  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 1043d5372..5d560b206 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -348,7 +348,9 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
             # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
 

From 43ae67233720b1eabfe49a92532867a58837f4ab Mon Sep 17 00:00:00 2001
From: generatedunixname89002005232357 <generatedunixname89002005232357@fb.com>
Date: Fri, 17 Feb 2023 10:18:47 -0800
Subject: [PATCH 151/638] Revert D43200517: Multisect successfully blamed
 D43200517 for test or build failures (#296)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/296

This diff is reverting D43200517 (https://github.com/facebookincubator/AITemplate/commit/034300db7007cd6e7a7535aab4a7d8f4091a632d)
D43200517 (https://github.com/facebookincubator/AITemplate/commit/034300db7007cd6e7a7535aab4a7d8f4091a632d): [fx2ait] add conv3d_bias by frankgt40 has been identified to be causing the following test or build failures:

Tests affected:
- [aitemplate/AITemplate/fx2ait/fx2ait/test:test_ait_conv3d - test_conv3d_2 (aitemplate.AITemplate.fx2ait.fx2ait.test.converters.test_ait_conv3d.TestAitConv3d)](https://www.internalfb.com/intern/test/281475061569445/)
- [aitemplate/AITemplate/fx2ait/fx2ait/test:test_ait_conv3d - test_conv3d_6 (aitemplate.AITemplate.fx2ait.fx2ait.test.converters.test_ait_conv3d.TestAitConv3d)](https://www.internalfb.com/intern/test/562950038766204/)

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1581357
Here are the tasks that are relevant to this breakage:
T143207728: 10 tests started failing for oncall gpu_enablement in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Reviewed By: wushirong

Differential Revision: D43347563

fbshipit-source-id: f2c2b9ad01314ef83ca093bbfab032af5e1b8990
---
 fx2ait/fx2ait/converters/ait_converters.py    | 34 +++++--------------
 .../fx2ait/test/converters/test_ait_conv3d.py | 28 ---------------
 python/aitemplate/backend/codegen.py          |  2 +-
 python/aitemplate/compiler/public/__init__.py |  3 +-
 4 files changed, 10 insertions(+), 57 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index acd2f77e4..72925d08e 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -29,7 +29,6 @@
     conv2d,
     conv2d_bias,
     conv3d,
-    conv3d_bias,
     depthwise_conv3d,
     dynamic_slice,
     elementwise,
@@ -45,7 +44,6 @@
     IntVarTensor,
     layernorm,
     max_pool2d,
-    ndhwc3to8,
     nhwc3to8,
     pad_last_dim,
     permute,
@@ -1265,34 +1263,18 @@ def _choose_conv3d_op(
     Helper to choose conv3d vs. depthwise_conv3d op based on existence of bias
     and groups
     """
-    weight._attrs["data"].tensor = weight._attrs["data"].tensor.permute(0, 2, 3, 4, 1)
-    weight._attrs["shape"] = ncdhw2ndhwc(weight._attrs["shape"])
-
-    if groups is None or groups == 1:
-        if bias is not None:
-            C_in = x.shape()[-1].value()
-
-            if 3 == C_in:
-                x = ndhwc3to8()(x)
-                weight = ndhwc3to8()(weight)
-            elif 8 != C_in:
-                raise RuntimeError(
-                    "When having bias, conv3d currently only supports C_in == 3 or C_in == 8"
-                )
-
-            return conv3d_bias(stride=stride, pad=pad, dilate=dilate, group=groups)(
-                x, weight, bias
-            )
-        else:
-            return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
-    elif groups == weight._attrs["shape"][0].value():
+    if bias is not None:
+        assert (
+            groups == weight._attrs["shape"][0].value()
+        ), f"Currently only support channel == groups, but got channel: {weight._attrs['shape'][0].value()} and groups: {groups}"
         return depthwise_conv3d(
             stride=stride, pad=pad, dilate=dilate, group=groups, bias=True
         )(x, weight, bias)
     else:
-        raise RuntimeError(
-            "When having bias, currently either support channel == groups or groups == 1"
-        )
+        assert (
+            groups is None or groups == 1
+        ), "Currently only support non-bias conv3d without groups"
+        return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
 
 
 @ait_converter(acc_ops.conv3d)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index 79843e597..0d94f4b09 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -131,34 +131,6 @@ class TestAitConv3d(AITTestCase):
                 w=224,
                 bias=False,
             ),
-            param(
-                name="conv3d_bias",
-                kernel_size=(3, 5, 5),
-                stride=(2, 4, 4),
-                padding=(1, 2, 2),
-                dilation=1,
-                ci=8,
-                co=96,
-                groups=1,
-                d=4,
-                h=224,
-                w=224,
-                bias=True,
-            ),
-            param(
-                name="conv3d_bias_ndhwc3to8",
-                kernel_size=(5, 5, 5),
-                stride=(2, 4, 4),
-                padding=(1, 2, 2),
-                dilation=1,
-                ci=3,
-                co=96,
-                groups=1,
-                d=4,
-                h=224,
-                w=224,
-                bias=True,
-            ),
         ]
     )
     def test_conv3d(
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 476ed2c5a..904a278ca 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -553,7 +553,7 @@ def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(set_value(name, view._attrs["name"]))
             return
         is_view = view is not None
-        if is_view and len(self.param_name_to_ptr_idx) > 0:
+        if is_view:
             ptr_idx = self.param_name_to_ptr_idx[view._attrs["name"]]
             self.set_inputs.append(set_value(name, view._attrs["name"]))
         else:
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index 3f5e77761..a6caf8b35 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -58,7 +58,6 @@
 from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
 from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
 from aitemplate.compiler.ops.conv.conv3d import conv3d
-from aitemplate.compiler.ops.conv.conv3d_bias import conv3d_bias
 from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
@@ -68,7 +67,7 @@
     group_layernorm_sigmoid_mul,
 )
 from aitemplate.compiler.ops.layernorm.layernorm import layernorm
-from aitemplate.compiler.ops.padding import ndhwc3to8, nhwc3to8, pad_last_dim
+from aitemplate.compiler.ops.padding import nhwc3to8, pad_last_dim
 from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
 from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 from aitemplate.compiler.ops.softmax.softmax import softmax

From 65f306ca362262392f099c733d19d9d352906da4 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Fri, 17 Feb 2023 13:07:56 -0800
Subject: [PATCH 152/638] Hack on bmm dynamic shape (#294)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/294

This is a hack on bmm's batch size; otherwise a lot of production model would be blocked.

AIT has a strict checker on batch size to be equal for bmm: https://fburl.com/code/xagsi9yq
It is fine if the model is written in AIT. However, the fx2ait lowered module may not be follow the strict naming requirement. For instance, the model owner would write something like
```
a1 = reshape(a0, [-1, M, K])
b1 = reshape(b0, [-1, K, N])
c0 = matmul(a1, b1)
```
Here first dimensions of both `a1` and `b1` are the same, i.e. batch_size, but AIT doesn't know that!
AIT will treat `a1.shape[0]` and `b1.shape[0]` as different symbols, name them differently, and at bmm, we got trouble.
A concrete example can be found at: https://fburl.com/phabricator/fkrwxz6c
which raises the issue: P618427861

The solution is to trust fx2ait converter. The rationale behind is models given to fx2ait converter has been successfully run by pytorch Eager mode, so users shouldn't be wrong at shape. Therefore, at fx2ait level, we directly unify the shape names.

========================================
side note:
Originally, I added runtime checker at bmm gen_function，but then I found it unnecessary, because if the names are the same, the emit shape name would also be the same, and the runtime checker would therefore, always pass.
e.g. A test P625543843 would generate model-generated.h: https://fburl.com/phabricator/dbx9of86, which simply checks `batch_size!=batch_size`.

But this is fine, because before we reach that line, P625543843 would raise runtime error P625545240, coming from model interface: https://fburl.com/code/rbm6jxwz
Therefore we are good.

========================================

**Notice all of these are just temporary fix and we are waiting on Mu-chu's proper fix on symbolic shape: D42295538

Reviewed By: wushirong

Differential Revision: D43176013

fbshipit-source-id: 985a4b732290c6df1e7c81380a4b4153f830592a
---
 fx2ait/fx2ait/converters/ait_converters.py    | 60 ++++++++++++++-----
 .../fx2ait/test/converters/test_ait_matmul.py | 41 +++++++++++++
 tests/unittest/ops/test_bmm.py                | 56 +++++++++++++++++
 3 files changed, 142 insertions(+), 15 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 72925d08e..c24542245 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -924,6 +924,25 @@ def acc_ops_flatten(
     return flatten(start_dim=start_dim, end_dim=end_dim)(input_val)
 
 
+def acc_ops_bmm(name: str, lhs: AITTensor, rhs: AITTensor) -> ConverterOutput:
+    lhs_shape = lhs.shape()
+    rhs_shape = rhs.shape()
+    if (
+        lhs_shape[0] == rhs_shape[0]
+        and lhs_shape[0]._attrs["name"] is None
+        and rhs_shape[0]._attrs["name"] is None
+    ):
+        lhs_shape[0]._attrs["name"] = f"acc_{name}_batch_size"
+        rhs_shape[0]._attrs["name"] = f"acc_{name}_batch_size"
+    elif lhs_shape[0] != rhs_shape[0]:
+        if lhs_shape[0]._attrs["values"] == rhs_shape[0]._attrs["values"]:
+            if lhs_shape[0]._attrs["name"] is None:
+                lhs_shape[0] = rhs_shape[0]
+            else:
+                rhs_shape[0] = lhs_shape[0]
+    return bmm_rrr()(lhs, rhs)
+
+
 @ait_converter(acc_ops.matmul)
 def acc_ops_matmul(
     target: Target,
@@ -950,30 +969,41 @@ def acc_ops_matmul(
     if len(rhs_shape) == 2:
         return gemm_rrr()(lhs, rhs)
     elif len(lhs_shape) <= 3 and len(rhs_shape) <= 3:
-        return bmm_rrr()(lhs, rhs)
+        return acc_ops_bmm(name, lhs, rhs)
     elif len(lhs_shape) == 4 and len(rhs_shape) == 4 and lhs_shape[1] == rhs_shape[1]:
-        assert all(isinstance(i, IntImm) for i in lhs_shape)
-        assert all(isinstance(i, IntImm) for i in rhs_shape)
+        assert all(isinstance(i, IntImm) for i in lhs_shape[1:])
+        assert all(isinstance(i, IntImm) for i in rhs_shape[1:])
         # Current AIT bmm only supports 3-dim. Use reshape to workaround.
-        reshape_op_0 = reshape()
-        batch_size = lhs_shape[0].value()
+        channel = lhs_shape[1].value()
         M = lhs_shape[2].value()
         K = lhs_shape[3].value()
-        channel = lhs_shape[1].value()
-        shape_0 = (batch_size * channel, M, K)
-        reshape_op_1 = reshape()
         N = rhs_shape[3].value()
         if K != rhs_shape[2].value():
             raise ValueError(
                 f"K dim mismatch on matmaul. Expected: [N, K] X [K, M]. Found: : [{M}, {K}] X [{rhs_shape[2].value()}, {N}]"
             )
-
-        shape_1 = (batch_size * channel, K, N)
-        reshape_op_2 = reshape()
-        shape_2 = (batch_size, channel, M, N)
-        return reshape_op_2(
-            bmm_rrr()(reshape_op_0(lhs, shape_0), reshape_op_1(rhs, shape_1)), shape_2
-        )
+        if isinstance(lhs_shape[0], IntImm) and (rhs_shape[0], IntImm):
+            batch_size = lhs_shape[0].value()
+            shape_0 = (batch_size * channel, M, K)
+            shape_1 = (batch_size * channel, K, N)
+            shape_2 = (batch_size, channel, M, N)
+        elif isinstance(lhs_shape[0], IntVar) and isinstance(rhs_shape[0], IntVar):
+            if lhs_shape[0]._attrs["values"] != rhs_shape[0]._attrs["values"]:
+                raise ValueError(
+                    f"Batch size mismatch on matmul. Expected: {lhs_shape[0]} == {rhs_shape[0]}"
+                )
+            lhs_size = size()(lhs)
+            new_size = getitem()(lhs_size, 0) * getitem()(lhs_size, 1)
+            shape_0 = (new_size, M, K)
+            shape_1 = (new_size, K, N)
+            shape_2 = (getitem()(lhs_size, 0), channel, M, N)
+        else:
+            raise NotImplementedError(
+                f"Expected all dimension except for the batch dim to be static. Got {lhs_shape} vs. {rhs_shape}"
+            )
+        reshape_op_0 = reshape()(lhs, shape_0)
+        reshape_op_1 = reshape()(rhs, shape_1)
+        return reshape()(acc_ops_bmm(name, reshape_op_0, reshape_op_1), shape_2)
     else:
         raise NotImplementedError(
             f"This case is unsupported in {name}: {len(lhs_shape)} and {len(rhs_shape)}"
diff --git a/fx2ait/fx2ait/test/converters/test_ait_matmul.py b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
index 5aed7240c..95f65c961 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_matmul.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_matmul.py
@@ -14,6 +14,7 @@
 #
 import torch
 from fx2ait.acc_tracer import acc_ops
+from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_fx2ait import AITTestCase
 
 from parameterized import parameterized
@@ -90,3 +91,43 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
             torch.randn(*rhs_shape).half().cuda(),
         ]
         self.run_test(model, inputs, expected_ops={acc_ops.matmul})
+
+    def test_reshape_bmm(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                x = torch.reshape(x, [-1, 3, 4])
+                y = torch.reshape(y, [-1, 4, 6])
+                return torch.bmm(x, y)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[[2, 3, 4], [2, 4, 6]],
+            inputs_max=[[20, 3, 4], [20, 4, 6]],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={acc_ops.matmul}
+        )
+
+    def test_reshape_4d_bmm(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                x = torch.reshape(x, [-1, 1, 3, 4])
+                y = torch.reshape(y, [-1, 1, 4, 6])
+                return torch.matmul(x, y)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[[2, 3, 4], [2, 4, 6]],
+            inputs_max=[[20, 3, 4], [20, 4, 6]],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={acc_ops.matmul}
+        )
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 117c7a30a..c0ec115ab 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -498,6 +498,62 @@ def test_bmm_broadcast_bfloat16(self):
             [8, 8, 16], [32, 8], "2d_broadcastable_b_bfloat16", dtype="bfloat16"
         )
 
+    def test_rcr_fail(self, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max([1, 16], name="batch_size")
+        m_dim = shape_utils.gen_int_var_min_max([1, 10], name="m")
+        K = 3
+        N = 8
+        X = Tensor(
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rcr()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_rcr_should_fail")
+
+        X_pt = get_random_torch_tensor([2, 10, K], dtype)
+        W_pt = get_random_torch_tensor([16, 8, K], dtype)
+        y = get_torch_empty_tensor([2, 10, 8], dtype)
+
+        try:
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            raise AssertionError(
+                "Shouldn't be able to run be imcompatible tensor shape!"
+            )
+        except RuntimeError:
+            pass
+
+    def test_rrr_fail(self, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max([1, 16], name="batch_size")
+        m_dim = shape_utils.gen_int_var_min_max([1, 10], name="m")
+        K = 3
+        N = 8
+        X = Tensor(
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rrr()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_rrr_should_fail")
+
+        X_pt = get_random_torch_tensor([2, 10, K], dtype)
+        W_pt = get_random_torch_tensor([16, K, 8], dtype)
+        y = get_torch_empty_tensor([2, 10, 8], dtype)
+
+        try:
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            raise AssertionError(
+                "Shouldn't be able to run be imcompatible tensor shape!"
+            )
+        except RuntimeError:
+            pass
+
 
 if __name__ == "__main__":
     unittest.main()

From e6cd290ca0823e4ec545138cadd5f8ff578669eb Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@meta.com>
Date: Fri, 17 Feb 2023 15:10:13 -0800
Subject: [PATCH 153/638] forward fix for new
 _native_batch_norm_legit_no_training op (#295)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/295

forward fix for D43357221

Reviewed By: frank-wei, weiwangmeta

Differential Revision: D43376240

fbshipit-source-id: b8bdde86f0bae6010062c33aec03a4e13a87a6ab
---
 fx2ait/fx2ait/passes/lower_basic_pass_aten.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
index a66a28704..f49cb7d5e 100644
--- a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
+++ b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
@@ -221,6 +221,7 @@ def replace_aten_op_with_indices(module: torch.fx.GraphModule) -> torch.fx.Graph
             torch.ops.aten.max_pool3d_with_indices.default,
             torch.ops.aten.native_batch_norm.default,
             torch.ops.aten._native_batch_norm_legit.default,
+            torch.ops.aten._native_batch_norm_legit_no_training.default,
         ):
             modified = True
             if len(n.users) != 1:
@@ -241,6 +242,16 @@ def replace_aten_op_with_indices(module: torch.fx.GraphModule) -> torch.fx.Graph
                 new_args = list(n.args)
                 new_args.append(False)
                 new_args = tuple(new_args)
+            elif (
+                n.target == torch.ops.aten._native_batch_norm_legit_no_training.default
+            ):
+                new_op = torch.ops.aten.batch_norm
+                new_args = list(n.args)
+                new_args.append(False)
+                # _native_batch_norm_legit_no_training doesn't take in a training arg (assumed to be false)
+                # but batchnorm takes in a training arg at position 5.
+                new_args.insert(5, False)
+                new_args = tuple(new_args)
 
             getitem_node = next(iter(n.users))
             with module.graph.inserting_after(getitem_node):

From 6e6cf3958647c0fb8c42783590fd7d1854b5f761 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Fri, 17 Feb 2023 15:42:59 -0800
Subject: [PATCH 154/638] Fix reshape op (#297)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/297

Reviewed By: frank-wei, khabinov

Differential Revision: D43400539

fbshipit-source-id: e8f4cbd46402e5603cc48d24395db3f0e010581a
---
 fx2ait/fx2ait/test/converters/test_ait_cat.py |  80 ++++
 .../fx2ait/test/converters/test_ait_common.py | 382 ------------------
 .../test/converters/test_ait_reshape.py       | 144 +++++++
 .../fx2ait/test/converters/test_ait_split.py  |  86 ++++
 .../fx2ait/test/converters/test_ait_topk.py   |  80 ++++
 .../test/converters/test_ait_unsqueeze.py     |  61 +++
 6 files changed, 451 insertions(+), 382 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_cat.py
 delete mode 100644 fx2ait/fx2ait/test/converters/test_ait_common.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_reshape.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_split.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_topk.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_unsqueeze.py

diff --git a/fx2ait/fx2ait/test/converters/test_ait_cat.py b/fx2ait/fx2ait/test/converters/test_ait_cat.py
new file mode 100644
index 000000000..e739ac22e
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_cat.py
@@ -0,0 +1,80 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Callable
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestCatConverter(AITTestCase):
+    combo = [
+        ["default", 0, torch.cat],
+        ["positive_dim", 1, torch.cat],
+        ["negative_dim", -1, torch.cat],
+        ["default", 0, torch.concat],
+        ["positive_dim", 1, torch.concat],
+        ["negative_dim", -1, torch.concat],
+    ]
+
+    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
+    def test_cat(self, name: str, dim: int, op: Callable):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return op([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.cat})
+
+    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
+    def test_cat_dynamic_shape(self, name: str, dim: int, op: Callable):
+        class TestModule(torch.nn.Module):
+            def forward(
+                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+            ) -> torch.Tensor:
+                return op([x, y, z], dim=dim)
+
+        model = TestModule().cuda()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+                [2, 3, 4],
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+                [20, 3, 4],
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+                torch.float16,
+                torch.float16,
+            ],
+        )
+
+        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={acc_ops.cat})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_common.py b/fx2ait/fx2ait/test/converters/test_ait_common.py
deleted file mode 100644
index fc3598647..000000000
--- a/fx2ait/fx2ait/test/converters/test_ait_common.py
+++ /dev/null
@@ -1,382 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from typing import Callable, List, Union
-
-import torch
-from fx2ait.acc_tracer import acc_ops, ait_acc_ops
-
-from fx2ait.tensor_spec import TensorSpec
-from fx2ait.tools.common_fx2ait import AITTestCase
-
-from parameterized import parameterized
-
-
-class TestUnsqueezeConverter(AITTestCase):
-    @parameterized.expand(
-        [
-            ["default", 1],
-            ["negative_dim", -1],
-        ]
-    )
-    def test_simple(self, name: str, dim: int):
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.unsqueeze(x, dim)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(2, 3, 4).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={acc_ops.unsqueeze})
-
-    def test_simple_dynamic_shape(self):
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.unsqueeze(x, 1)
-
-        model = TestModule().cuda()
-        inputs_spec = TensorSpec.create_spec_from_shapes(
-            inputs_min=[
-                [2, 3, 4],
-            ],
-            inputs_max=[
-                [20, 3, 4],
-            ],
-            dtype_list=[
-                torch.float16,
-            ],
-        )
-        self.run_test_with_dynamic_shape(
-            model, inputs_spec, expected_ops={acc_ops.unsqueeze}
-        )
-
-
-class TestPermuteConverter(AITTestCase):
-    def test_permute021(self):
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.permute(x, [0, 2, 1])
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(2, 3, 4).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={acc_ops.permute})
-
-    def test_permute021_dynamic_shape(self):
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.permute(x, [0, 2, 1])
-
-        model = TestModule().cuda()
-        inputs_spec = TensorSpec.create_spec_from_shapes(
-            inputs_min=[
-                [2, 3, 4],
-            ],
-            inputs_max=[
-                [20, 3, 4],
-            ],
-            dtype_list=[
-                torch.float16,
-            ],
-        )
-        self.run_test_with_dynamic_shape(
-            model, inputs_spec, expected_ops={acc_ops.permute}
-        )
-
-
-class TestCatConverter(AITTestCase):
-    combo = [
-        ["default", 0, torch.cat],
-        ["positive_dim", 1, torch.cat],
-        ["negative_dim", -1, torch.cat],
-        ["default", 0, torch.concat],
-        ["positive_dim", 1, torch.concat],
-        ["negative_dim", -1, torch.concat],
-    ]
-
-    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
-    def test_cat(self, name: str, dim: int, op: Callable):
-        class TestModule(torch.nn.Module):
-            def forward(
-                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
-            ) -> torch.Tensor:
-                return op([x, y, z], dim=dim)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(2, 3, 4).half().cuda(),
-            torch.randn(2, 3, 4).half().cuda(),
-            torch.randn(2, 3, 4).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={acc_ops.cat})
-
-    @parameterized.expand([(name, dim, op) for name, dim, op in combo])
-    def test_cat_dynamic_shape(self, name: str, dim: int, op: Callable):
-        class TestModule(torch.nn.Module):
-            def forward(
-                self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
-            ) -> torch.Tensor:
-                return op([x, y, z], dim=dim)
-
-        model = TestModule().cuda()
-
-        inputs_spec = TensorSpec.create_spec_from_shapes(
-            inputs_min=[
-                [2, 3, 4],
-                [2, 3, 4],
-                [2, 3, 4],
-            ],
-            inputs_max=[
-                [20, 3, 4],
-                [20, 3, 4],
-                [20, 3, 4],
-            ],
-            dtype_list=[
-                torch.float16,
-                torch.float16,
-                torch.float16,
-            ],
-        )
-
-        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={acc_ops.cat})
-
-
-class TestReshapeConverter(AITTestCase):
-    @parameterized.expand(
-        [
-            [[2, 3, 4], [6, 4]],
-            [[2, 3, 4], [2, 12]],
-            [[2, 3, 4], [24]],
-            [[2, 3, 4], [-1, 4]],
-            [[2, 3, 4], [2, -1]],
-            [[2, 3, 4], [-1]],
-        ]
-    )
-    def test_simple(self, original_shape: List[int], final_shape: List[int]) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.reshape(x, final_shape)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(*original_shape).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={acc_ops.reshape})
-
-    def test_with_getitem_size(self) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-                d0 = y.size(dim=0)
-                d1 = y.size(dim=1)
-                return x.reshape(d0, d1)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(2, 3, 4).half().cuda(),
-            torch.randn(6, 4).half().cuda(),
-        ]
-        self.run_test(
-            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
-        )
-
-    def test_with_getitem_reshape_dim0(self) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                d0 = x.size(dim=0)
-                d1 = x.size(dim=1)
-                d2 = x.size(dim=2)
-                d = d1 * d2
-                return x.reshape(d0, d)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(2, 3, 4).half().cuda(),
-        ]
-        self.run_test(
-            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
-        )
-
-    def test_with_getitem_reshape_dim0_dynamic(self) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                d0 = x.size(dim=0)
-                d1 = x.size(dim=1)
-                d2 = x.size(dim=2)
-                d = d1 * d2
-                return x.reshape(d0, d)
-
-        model = TestModule().cuda()
-        inputs_spec = TensorSpec.create_spec_from_shapes(
-            inputs_min=[
-                [2, 3, 4],
-            ],
-            inputs_max=[
-                [20, 3, 4],
-            ],
-            dtype_list=[
-                torch.float16,
-            ],
-        )
-        self.run_test_with_dynamic_shape(
-            model,
-            inputs_spec,
-            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
-        )
-
-    ###TODO dim=0,1 dynamic has problem due to output size is not IntVar for dim1(P537903486).
-    # def test_with_getitem_reshape_dim01_dynamic(self) -> None:
-    #     class TestModule(torch.nn.Module):
-    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
-    #             d0 = x.size(dim=0)
-    #             d1 = x.size(dim=1)
-    #             d2 = x.size(dim=2)
-    #             d = d1 * d2
-    #             return x.reshape(d0, d)
-
-    #     model = TestModule().cuda()
-    #     inputs = [
-    #         [
-    #             torch.randn(2, 3, 4).half().cuda(),
-    #         ],
-    #         [
-    #             torch.randn(20, 30, 4).half().cuda(),
-    #         ],
-    #     ]
-    #     self.run_test_with_dynamic_shape(
-    #         model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
-    #     )
-
-
-class TestTopkConverter(AITTestCase):
-    @parameterized.expand(
-        [
-            [[4], 1],
-            [[6], 3],
-            [[6], 6],
-        ]
-    )
-    def test_simple(self, input: List[int], k: int) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                values, indices = torch.topk(x, k)
-                return indices
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(*input).half().cuda(),
-        ]
-
-        self.run_test(model, inputs, expected_ops={acc_ops.topk})
-
-    @parameterized.expand(
-        [
-            [[2, 4], 1],
-            [[2, 4], 2],
-            [[3, 3], 3],
-        ]
-    )
-    def test_multi_dimensional(self, input: List[int], k: int) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                values, indices = torch.topk(x, k)
-                return indices
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(*input).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={acc_ops.topk})
-
-    ##TODO results mismatch.(P537992074)
-    # def test_multi_dimensional_dynamic_shape(self) -> None:
-    #     class TestModule(torch.nn.Module):
-    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
-    #             values, indices = torch.topk(x, 1)
-    #             return indices
-
-    #     model = TestModule().cuda()
-    #     inputs = [
-    #         [
-    #             torch.randn((2, 4)).half().cuda(),
-    #         ],
-    #         [
-    #             torch.randn((20, 4)).half().cuda(),
-    #         ],
-    #     ]
-    #     self.run_test_with_dynamic_shape(model, inputs, expected_ops={acc_ops.topk})
-
-
-class TestSplitConverter(AITTestCase):
-    @parameterized.expand(
-        [
-            [[2, 10], [2, 3, 5]],
-            [[2, 10], 2],
-            [[2, 10], 3],
-        ]
-    )
-    def test_with_dim(
-        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
-    ) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.split(x, split_size_or_sections, dim=1)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(*input_shape).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
-
-    @parameterized.expand(
-        [
-            [[10], [2, 3, 5]],
-            [[10], 2],
-            [[10], 3],
-        ]
-    )
-    def test_without_dim(
-        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
-    ) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.split(x, split_size_or_sections)
-
-        model = TestModule().cuda()
-        inputs = [
-            torch.randn(*input_shape).half().cuda(),
-        ]
-        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
-
-    def test_with_dim_dynamic_shape(self) -> None:
-        class TestModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.split(x, 2, dim=1)
-
-        model = TestModule().cuda()
-        inputs_spec = TensorSpec.create_spec_from_shapes(
-            inputs_min=[
-                [2, 10],
-            ],
-            inputs_max=[
-                [20, 10],
-            ],
-            dtype_list=[
-                torch.float16,
-            ],
-        )
-        self.run_test_with_dynamic_shape(
-            model, inputs_spec, expected_ops={ait_acc_ops.split}
-        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_reshape.py b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
new file mode 100644
index 000000000..49cb6ccda
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
@@ -0,0 +1,144 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import List
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestReshapeConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[2, 3, 4], [6, 4]],
+            [[2, 3, 4], [2, 12]],
+            [[2, 3, 4], [24]],
+            [[2, 3, 4], [-1, 4]],
+            [[2, 3, 4], [2, -1]],
+            [[2, 3, 4], [-1]],
+        ]
+    )
+    def test_simple(self, original_shape: List[int], final_shape: List[int]) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.reshape(x, final_shape)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*original_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.reshape})
+
+    def test_with_getitem_size(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                d0 = y.size(dim=0)
+                d1 = y.size(dim=1)
+                return x.reshape(d0, d1)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+            torch.randn(6, 4).half().cuda(),
+        ]
+        self.run_test(
+            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+        )
+
+    def test_with_getitem_reshape_dim0(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * d2
+                return x.reshape(d0, d)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(
+            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+        )
+
+    def test_reshape_with_non_int_param(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=1)
+                return x.reshape(d0 * 8)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 4, 4).half().cuda(),
+            torch.randn(4, 8).half().cuda(),
+        ]
+        self.run_test(
+            model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+        )
+
+    def test_with_getitem_reshape_dim0_dynamic(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * d2
+                return x.reshape(d0, d)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
+        )
+
+    ###TODO dim=0,1 dynamic has problem due to output size is not IntVar for dim1(P537903486).
+    # def test_with_getitem_reshape_dim01_dynamic(self) -> None:
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             d0 = x.size(dim=0)
+    #             d1 = x.size(dim=1)
+    #             d2 = x.size(dim=2)
+    #             d = d1 * d2
+    #             return x.reshape(d0, d)
+
+    #     model = TestModule().cuda()
+    #     inputs = [
+    #         [
+    #             torch.randn(2, 3, 4).half().cuda(),
+    #         ],
+    #         [
+    #             torch.randn(20, 30, 4).half().cuda(),
+    #         ],
+    #     ]
+    #     self.run_test_with_dynamic_shape(
+    #         model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
+    #     )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_split.py b/fx2ait/fx2ait/test/converters/test_ait_split.py
new file mode 100644
index 000000000..6930d5425
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_split.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import List, Union
+
+import torch
+from fx2ait.acc_tracer import ait_acc_ops
+
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestSplitConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[2, 10], [2, 3, 5]],
+            [[2, 10], 2],
+            [[2, 10], 3],
+        ]
+    )
+    def test_with_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, split_size_or_sections, dim=1)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
+    @parameterized.expand(
+        [
+            [[10], [2, 3, 5]],
+            [[10], 2],
+            [[10], 3],
+        ]
+    )
+    def test_without_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, split_size_or_sections)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
+    def test_with_dim_dynamic_shape(self) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.split(x, 2, dim=1)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 10],
+            ],
+            inputs_max=[
+                [20, 10],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={ait_acc_ops.split}
+        )
diff --git a/fx2ait/fx2ait/test/converters/test_ait_topk.py b/fx2ait/fx2ait/test/converters/test_ait_topk.py
new file mode 100644
index 000000000..0da25aac0
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_topk.py
@@ -0,0 +1,80 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import List
+
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestTopkConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            [[4], 1],
+            [[6], 3],
+            [[6], 6],
+        ]
+    )
+    def test_simple(self, input: List[int], k: int) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                values, indices = torch.topk(x, k)
+                return indices
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.topk})
+
+    @parameterized.expand(
+        [
+            [[2, 4], 1],
+            [[2, 4], 2],
+            [[3, 3], 3],
+        ]
+    )
+    def test_multi_dimensional(self, input: List[int], k: int) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                values, indices = torch.topk(x, k)
+                return indices
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.topk})
+
+    ##TODO results mismatch.(P537992074)
+    # def test_multi_dimensional_dynamic_shape(self) -> None:
+    #     class TestModule(torch.nn.Module):
+    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
+    #             values, indices = torch.topk(x, 1)
+    #             return indices
+
+    #     model = TestModule().cuda()
+    #     inputs = [
+    #         [
+    #             torch.randn((2, 4)).half().cuda(),
+    #         ],
+    #         [
+    #             torch.randn((20, 4)).half().cuda(),
+    #         ],
+    #     ]
+    #     self.run_test_with_dynamic_shape(model, inputs, expected_ops={acc_ops.topk})
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unsqueeze.py b/fx2ait/fx2ait/test/converters/test_ait_unsqueeze.py
new file mode 100644
index 000000000..bfdda4cfe
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_unsqueeze.py
@@ -0,0 +1,61 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.acc_tracer import acc_ops
+
+from fx2ait.tensor_spec import TensorSpec
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+from parameterized import parameterized
+
+
+class TestUnsqueezeConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            ["default", 1],
+            ["negative_dim", -1],
+        ]
+    )
+    def test_simple(self, name: str, dim: int):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.unsqueeze(x, dim)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3, 4).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.unsqueeze})
+
+    def test_simple_dynamic_shape(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.unsqueeze(x, 1)
+
+        model = TestModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={acc_ops.unsqueeze}
+        )

From 1fd9d0bfd6d02cb5a5855117a154b4a4847d117c Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Fri, 17 Feb 2023 18:10:59 -0800
Subject: [PATCH 155/638] Add A10G to detect_target (#279)

Summary:
A10G has compute capability 8.6
AWS instance g5.2xlarge is A10G

Look like we still need to return 80 for A10G.
If `_detect_cuda` returns 86 for A10G then I got - `Arch 86 is not supported by current cutlass lib`. error.
That is  bit strange because cutlas 3.0 (and even 2.11) support arch 8.6 - [doc](https://github.com/NVIDIA/cutlass)
```
  File "/usr/local/lib/python3.8/dist-packages/aitemplate/backend/cuda/target_def.py", line 154, in __enter__
    self._operators = f_gen_ops(self._arch)
  File "/usr/local/lib/python3.8/dist-packages/aitemplate/backend/cuda/utils.py", line 60, in gen_ops
    raise NotImplementedError(
NotImplementedError: Arch 86 is not supported by current cutlass lib.
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/279

Reviewed By: wushirong

Differential Revision: D43414385

Pulled By: chenyang78

fbshipit-source-id: 4e97a7a48201dc2e40098f8fdf92b176a007688d
---
 python/aitemplate/testing/detect_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 38e251f20..9df75a1bd 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -41,7 +41,7 @@ def _detect_cuda():
         stdout = stdout.decode("utf-8")
         if "H100" in stdout:
             return "90"
-        if "A100" in stdout or "RTX 30" in stdout or "A30" in stdout:
+        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30"]):
             return "80"
         if "V100" in stdout:
             return "70"

From a94d417fb8dd3a8d4ddcc0fe7c10db01226aa40b Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Fri, 17 Feb 2023 19:17:58 -0800
Subject: [PATCH 156/638] Add floor_div converter (#284)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/284

ATT

Reviewed By: frank-wei, khabinov

Differential Revision: D43282624

fbshipit-source-id: a12f894d315a2eb1417c51565ea91980ebf431e0
---
 fx2ait/fx2ait/converters/ait_converters.py          | 10 ++++++++++
 fx2ait/fx2ait/converters/utils.py                   |  2 ++
 fx2ait/fx2ait/test/converters/test_ait_binary_op.py | 12 +++++++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index c24542245..29f517fdd 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -125,6 +125,16 @@ def acc_ops_div(
     return create_binary_op(FuncEnum.DIV, args, kwargs, name)
 
 
+@ait_converter(acc_ops.floor_div)
+def acc_ops_floor_div(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return create_binary_op(FuncEnum.FLOOR_DIV, args, kwargs, name)
+
+
 @ait_converter(acc_ops.add)
 def acc_ops_add(
     target: Target,
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index 96f0fbd42..cad21e444 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -122,6 +122,8 @@ def get_python_op_from_ait_constant_elementwise_op(
         return operator.truediv
     elif op_type == FuncEnum.SQRT:
         return math.sqrt
+    elif op_type == FuncEnum.FLOOR_DIV:
+        return operator.floordiv
     else:
         raise RuntimeError(f"{op_type} is not supported yet!")
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
index 95a1ac4a2..e75f1b861 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
@@ -26,10 +26,10 @@
     (torch.randn(3, 4), torch.randn(2, 3, 4)),
     (torch.randn(2, 3, 4), torch.randn(3, 4)),
     (torch.randn(1, 1, 1), torch.randn(2, 3, 4)),
-    (torch.randn(1), torch.randn(2, 3, 4)),
     (torch.randn(2, 3, 4), torch.randn(1)),
     (torch.randn(2, 3, 4), torch.randn(1, 1, 1)),
     (torch.randn(1, 3, 4), torch.randn(5, 1, 4)),
+    (torch.randn(1), torch.randn(2, 3, 4)),
 ]
 
 
@@ -61,6 +61,16 @@ class TestBinaryOpConverter(AITTestCase):
                 acc_ops.div,
                 [(lhs, rhs.clamp(min=0.01)) for lhs, rhs in TWO_TENSOR_INPUTS],
             ],
+            # TODO enable full list of test when OSS python version upgrade to include pyhton floordiv fix
+            # [
+            #     "floor_div",
+            #     operator.floordiv,
+            #     acc_ops.floor_div,
+            #     [
+            #         (TWO_TENSOR_INPUTS[i][0], TWO_TENSOR_INPUTS[i][1].clamp(min=0.01))
+            #         for i in range(0, 2)
+            #     ],
+            # ],
         ]
     )
     def test_two_tensors(

From 6bf56e3edcb4101f82c7c21c869e6efcc753f94c Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Mon, 20 Feb 2023 03:21:12 -0800
Subject: [PATCH 157/638] Add masked_select op (#275)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/275

Closes T144613508.
Adds implementation and test for `masked_select` op, similar to `torch.masked_select`.

We use `cub::DeviceSelect::Flagged`, as `torch.nonzero` does here: https://github.com/pytorch/pytorch/blob/e844120b2f44c363590e9b6eee4f13726fa930cf/aten/src/ATen/native/cuda/Nonzero.cu#L60-L111

Reviewed By: khabinov, chenyang78

Differential Revision: D43125310

fbshipit-source-id: 3497791edf1fa4b9eff6828e3ad3d6a5e195884e
---
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../backend/cuda/tensor/masked_select.py      | 241 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/masked_select.py      |  96 +++++++
 tests/unittest/ops/test_masked_select.py      | 235 +++++++++++++++++
 5 files changed, 575 insertions(+)
 create mode 100644 python/aitemplate/backend/cuda/tensor/masked_select.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/masked_select.py
 create mode 100644 tests/unittest/ops/test_masked_select.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index 381f2c010..6372aff54 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -23,6 +23,7 @@
     dynamic_slice,
     expand,
     gather,
+    masked_select,
     permute,
     permute021,
     permute0213,
@@ -42,6 +43,7 @@
     "dynamic_slice",
     "expand",
     "gather",
+    "masked_select",
     "permute",
     "permute021",
     "permute0213",
diff --git a/python/aitemplate/backend/cuda/tensor/masked_select.py b/python/aitemplate/backend/cuda/tensor/masked_select.py
new file mode 100644
index 000000000..dce3ca88d
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/masked_select.py
@@ -0,0 +1,241 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define masked_select codegen and CUDA kernel
+"""
+import jinja2
+
+from ... import registry
+
+from ...backend_spec import CUDASpec
+from .. import cuda_common
+
+
+header_files = """
+#include <cuda_fp16.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include <cub/cub.cuh>
+"""
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    {{input_type}}* /*output*/,
+    const {{input_type}}* /*input*/,
+    const bool* /*mask*/,
+    {{index_type}} /*num_elems*/,
+    {{index_type}}* /*output size*/,
+    void* workspace /*workspace*/,
+    cudaStream_t /*stream*/
+    );
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+void {{func_name}}(
+    {{input_type}}* output,
+    const {{input_type}}* input,
+    const bool* mask,
+    {{index_type}} num_elems,
+    {{index_type}}* num_nonmasked,
+    void* workspace,
+    cudaStream_t stream
+    ) {
+
+    // Make sure input, output, mask, and workspace are valid
+    if (!input) {
+        throw std::runtime_error("input is NULL!");
+    }
+    if (!output) {
+        throw std::runtime_error("output is NULL!");
+    }
+    if (!mask) {
+        throw std::runtime_error("mask is NULL!");
+    }
+    if (!workspace) {
+        throw std::runtime_error("workspace is NULL!");
+    }
+    size_t allocated_storage = {{workspace_size}};
+
+    // Keep the number of nonmasked elements at the beginning of the workspace
+    const size_t NUM_NONMASKED_SIZE = sizeof({{index_type}});
+    {{index_type}}* num_nonmasked_device = static_cast<{{index_type}}*>(workspace);
+
+    // Get needed temporary storage size and reallocate if necessary
+    void* d_temp_storage = nullptr;
+    size_t temp_storage_bytes = 0;
+    cudaError_t err = cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, input, mask, output, num_nonmasked_device, num_elems, stream);
+    if (err != cudaSuccess) {
+        std::cerr << "Error when checking the required buffer size!" << std::endl;
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+    cudaStreamSynchronize(stream);
+    if (allocated_storage < temp_storage_bytes + NUM_NONMASKED_SIZE) {
+        auto msg = "Got pre-allocated buffer of size " + std::to_string(allocated_storage) + ", but need " + std::to_string(temp_storage_bytes)
+                + ". Allocating a new buffer, expect performance degradation.";
+        std::cerr << msg << std::endl;
+        // Allocate temporary storage
+        temp_storage_bytes += NUM_NONMASKED_SIZE;
+        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
+        if (err != cudaSuccess) {
+            std::cerr << "Error when trying to allocate a new buffer!" << std::endl;
+            throw std::runtime_error(cudaGetErrorString(err));
+        }
+        workspace = d_temp_storage;
+        allocated_storage = temp_storage_bytes;
+    }
+    allocated_storage -= NUM_NONMASKED_SIZE;  // First NUM_NONMASKED_SIZE bytes are reserved
+
+    // Select nonmasked elements. First NUM_NONMASKED_SIZE bytes of workspace are reserved for num_nonmasked_device
+    err = cub::DeviceSelect::Flagged(workspace + NUM_NONMASKED_SIZE, allocated_storage, input, mask, output,
+        num_nonmasked_device, num_elems, stream);
+    if (err != cudaSuccess) {
+        std::cerr << "Error when selecting nonmasked elements!" << std::endl;
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    // Extract number of nonmasked elements (size of the output)
+    err = cudaMemcpy(num_nonmasked, num_nonmasked_device, NUM_NONMASKED_SIZE, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) {
+        std::cerr << "Error when copying the number of nonmasked elements from device to host!" << std::endl;
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+    if (d_temp_storage != nullptr) {
+        cudaFree(d_temp_storage);
+    }
+}
+"""
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}
+{{indent}}  const {{index_type}} input_dims[] = {{input_dims}};
+{{indent}}  int64_t num_elems = 1;
+{{indent}}  for ({{index_type}} i = 0; i < {{rank}}; i++) {
+{{indent}}        num_elems *= input_dims[i];
+{{indent}}  }
+{{indent}}  {{func_name}}(
+{{indent}}      {{output_ptr}},
+{{indent}}      {{input_ptr}},
+{{indent}}      {{mask_ptr}},
+{{indent}}      num_elems,
+{{indent}}      {{num_nonmasked}},
+{{indent}}      global_workspace_,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+"""
+)
+
+
+@registry.reg("cuda.masked_select.gen_function")
+def gen_function(func_attrs) -> str:
+    """
+    Generate function body
+
+    Returns
+    -------
+    str
+        The function body string
+    """
+    backend_spec = CUDASpec()
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+
+    input_type = cuda_common.dtype_to_cuda_type(x._attrs["dtype"])
+    output_type = cuda_common.dtype_to_cuda_type(y._attrs["dtype"])
+
+    if input_type != output_type:
+        raise TypeError("input type must equal to output type")
+
+    return SRC_TEMPLATE.render(
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+        func_name=func_attrs["name"],
+        header_files=header_files,
+        workspace_size=func_attrs["workspace"],
+    )
+
+
+@registry.reg("cuda.masked_select.func_decl")
+def gen_function_decl(func_attrs) -> str:
+    """
+    Generate function declaration.
+
+    Returns
+    -------
+    str
+        The function declaration string
+    """
+    backend_spec = CUDASpec()
+    x = func_attrs["inputs"][0]
+    input_type = cuda_common.dtype_to_cuda_type(x._attrs["dtype"])
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("cuda.masked_select.func_call")
+def gen_function_call(func_attrs, indent="  ") -> str:
+    """
+    Generate function call.
+
+    Returns
+    -------
+    str
+        The function call string
+    """
+    backend_spec = CUDASpec()
+    x, mask = func_attrs["inputs"]
+    y = func_attrs["outputs"][0]
+
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    input_ptr = backend_spec.cast_to_ptr_template.render(
+        name=x._attrs["name"],
+        dtype=dtype,
+    )
+    output_ptr = backend_spec.cast_to_ptr_template.render(
+        name=y._attrs["name"],
+        dtype=dtype,
+    )
+    mask_ptr = backend_spec.cast_to_ptr_template.render(
+        name=mask._attrs["name"],
+        dtype="bool",
+    )
+    # Number of nonmasked elements, i.e. size of the output
+    num_nonmasked_ptr = "&" + y._attrs["shape"][0]._attrs["name"]
+    input_dims = "{" + ",".join([dim._attrs["name"] for dim in x._attrs["shape"]]) + "}"
+    return FUNC_CALL_TEMPLATE.render(
+        indent=indent,
+        func_name=func_attrs["name"],
+        input_name=x._attrs["name"],
+        num_nonmasked=num_nonmasked_ptr,
+        input_dims=input_dims,
+        rank=len(x._attrs["shape"]),
+        output_ptr=output_ptr,
+        input_ptr=input_ptr,
+        mask_ptr=mask_ptr,
+        index_type=backend_spec.index_type,
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index b3e32e846..7ba9bf237 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -24,6 +24,7 @@
 from .dynamic_slice import dynamic_slice
 from .expand import expand
 from .gather import gather
+from .masked_select import masked_select
 from .permute import permute
 from .permute021 import permute021
 from .permute0213 import permute0213
diff --git a/python/aitemplate/compiler/ops/tensor/masked_select.py b/python/aitemplate/compiler/ops/tensor/masked_select.py
new file mode 100644
index 000000000..bc994f4fb
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/masked_select.py
@@ -0,0 +1,96 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define masked_select op
+"""
+import logging
+from typing import List
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class masked_select(Operator):
+    """
+    Returns a 1D tensor containing elements of the input tensor selected by the boolean mask,
+    similar to `torch.masked_select`.
+
+    Args:
+        input (Tensor): the source tensor.
+        mask (Tensor, boolean): has to be of same shape as input.
+
+    Returns:
+        output: 1D tensor of length equal to the total number of elements in `input`. The result
+            is contained in the first `num_nonmasked` elements of output. The rest of the output
+            tensor is not meaningful.
+        num_nonmasked: number of the non-masked elements in the input, i.e. the length of the
+            significant part of output.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "masked_select"
+        self._attrs["workspace"] = 0
+
+    def _infer_shape(self, x: Tensor, mask: Tensor) -> List[IntVar]:
+        input_shape = x._attrs["shape"]
+        mask_shape = mask._attrs["shape"]
+        if input_shape != mask_shape:
+            raise RuntimeError(
+                "Tensor shapes of input and mask are not equal! Shape1: {}, shape2: {}".format(
+                    input_shape, mask_shape
+                )
+            )
+
+        numel = 1
+        for dim in input_shape:
+            numel *= dim.upper_bound()
+        # Output size can range from 0 (when all mask elements are False) to the total number of
+        # elements in the input (when all mask elements are True).
+        return [IntVar(values=(0, numel))]
+
+    def __call__(
+        self,
+        x: Tensor,
+        mask: Tensor,
+    ) -> List[Tensor]:
+        dtype = mask._attrs["dtype"]
+        if dtype != "bool":
+            raise RuntimeError("Expected mask of dtype bool, but got {}".format(dtype))
+        self._attrs["inputs"] = [x, mask]
+        self._set_depth()
+        output_shape = self._infer_shape(x, mask)
+        output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
+
+        self._attrs["outputs"] = [output]
+        # Allocate temporary buffer. This empirical formula for size is deduced by looking at buffer sizes
+        # requested by cub::DeviceSelect::Flagged for differen input sizes. Required buffer size depends on
+        # the number of input elements and on the GPU architecture, but not on the input data type.
+        self._attrs["workspace"] = output_shape[0].upper_bound() // 128 + 1024
+        _LOGGER.debug(
+            f'Allocating {self._attrs["workspace"]} bytes for temporary buffer of masked_select op'
+        )
+        return output
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_masked_select.py b/tests/unittest/ops/test_masked_select.py
new file mode 100644
index 000000000..22192328a
--- /dev/null
+++ b/tests/unittest/ops/test_masked_select.py
@@ -0,0 +1,235 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for masked_select Operator.
+"""
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import parameterized
+
+
+@unittest.skipIf(
+    detect_target().name() == "rocm", "masked_select is not implemented for ROCm"
+)
+class maskedSelectTestCase(unittest.TestCase):
+    def _test_masked_select(
+        self,
+        batch_size=1,
+        shape=(2, 6),
+        test_name="masked_select",
+        copy_op=False,
+        dtype="float16",
+        zero_mask=False,
+        benchmark=False,
+    ):
+        X1 = Tensor(
+            shape=shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=shape,
+            dtype="bool",
+            name="mask",
+            is_input=True,
+        )
+        X4_op = ops.masked_select()
+        if copy_op:
+            X4_op = ops.masked_select(**X4_op._get_op_attributes())
+
+        X4 = X4_op(X1, X2)
+        X4._attrs["is_output"] = True
+        X4._attrs["name"] = "output_values"
+
+        target = detect_target()
+        module = compile_model([X4], target, "./tmp", test_name)
+        x = get_random_torch_tensor(shape, dtype=dtype)
+        if zero_mask:
+            mask = torch.zeros_like(x)
+        else:
+            mask = get_random_torch_tensor(shape, dtype="float16") > 0
+        y_pt = torch.masked_select(x, mask)
+        y = torch.empty((x.numel(),), dtype=x.dtype, device=x.device)
+        y_ait = module.run_with_tensors([x, mask], [y])["output_values"]
+        # y_ait contains the correct result. It points to the same memory blob as y, but has the correct shape
+        self.assertTrue(torch.allclose(y_pt, y_ait, atol=1e-10, rtol=0))
+        # y retained the original shape (x.numel(),), so needs to be cut before comparison
+        self.assertTrue(torch.allclose(y_pt, y[: y_ait.shape[0]], atol=1e-10, rtol=0))
+
+        if benchmark:
+            print(f"Benchmarking with shape={shape}, dtype={dtype}")
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors([x, mask], [y])
+            # Benchmark.
+            num_benchmark_iter = 1000
+
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                [x, mask], [y], count=num_benchmark_iter
+            )
+            print(f"AITemplate time: {time_per_iter_ms:.2f}ms")
+
+            func = torch.masked_select
+            args = (x, mask)
+            # Warm up.
+            for _ in range(5):
+                func(*args)
+            # Benchmark.
+            torch_time_per_iter_ms = benchmark_torch_function(
+                num_benchmark_iter, func, *args
+            )
+            print(f"PyTorch time: {torch_time_per_iter_ms:.2f}ms")
+
+            print(f"Speedup: {torch_time_per_iter_ms / time_per_iter_ms:.2f}x")
+
+    @parameterized.expand(
+        [
+            [(2, 6), False],
+            [(20, 6), False],
+            [(300, 80), False],
+            # Uncomment to benchmark
+            # [(300, 80), True],
+            # [(1024, 128, 256), True],
+            # [(1024, 1024, 100), True],
+            # [(1, 1), True],
+            # [(10, 1), True],
+            # [(100, 1), True],
+            # [(1000, 1), True],
+            # [(10000, 1), True],
+            # [(100000, 1), True],
+            # [(1000000, 1), True],
+            # [(10000000, 1), True],
+            # [(100000000, 1), True],
+            # [(10000, 10000), True],
+            # [(10, 10, 10, 10, 10, 10, 10, 10), True],
+        ]
+    )
+    def test_fp16(self, shape, benchmark):
+        self._test_masked_select(
+            shape=shape,
+            test_name="masked_select_fp16",
+            dtype="float16",
+            benchmark=benchmark,
+        )
+        if not benchmark:
+            self._test_masked_select(
+                shape=shape,
+                test_name="masked_select_fp16_copy_op",
+                copy_op=True,
+                dtype="float16",
+                benchmark=benchmark,
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float32 not supported in ROCm")
+    @parameterized.expand(
+        [
+            [(2, 6), False],
+            [(20, 6), False],
+            [(300, 80), False],
+            # Uncomment to benchmark
+            # [(300, 80), True],
+            # [(1024, 128, 256), True],
+            # [(1024, 1024, 100), True],
+            # [(1, 1), True],
+            # [(10, 1), True],
+            # [(100, 1), True],
+            # [(1000, 1), True],
+            # [(10000, 1), True],
+            # [(100000, 1), True],
+            # [(1000000, 1), True],
+            # [(10000000, 1), True],
+            # [(100000000, 1), True],
+            # [(10000, 10000), True],
+            # [(10, 10, 10, 10, 10, 10, 10, 10), True],
+        ]
+    )
+    def test_fp32(self, shape, benchmark):
+        self._test_masked_select(
+            shape=shape,
+            test_name="masked_select_fp32",
+            dtype="float32",
+            benchmark=benchmark,
+        )
+        if not benchmark:
+            self._test_masked_select(
+                shape=shape,
+                test_name="masked_select_fp32_copy_op",
+                copy_op=True,
+                dtype="float32",
+                benchmark=benchmark,
+            )
+
+    def test_input_dynamic_shape(
+        self,
+        batch_size=1,
+        shape=(2, 6),
+        test_name="masked_select_dynamic",
+        dtype="float16",
+        benchmark=False,
+    ):
+        """
+        Check that dynamic input shape is handled correctly.
+        """
+        dyn_shape = (IntVar(values=(1, 10)), IntVar(values=(1, 10)))
+        X1 = Tensor(
+            shape=dyn_shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=dyn_shape,
+            dtype="bool",
+            name="mask",
+            is_input=True,
+        )
+        X4_op = ops.masked_select()
+        X4 = X4_op(X1, X2)
+        X4._attrs["is_output"] = True
+        X4._attrs["name"] = "output_values"
+
+        target = detect_target()
+        module = compile_model([X4], target, "./tmp", test_name)
+
+        x = get_random_torch_tensor(shape, dtype=dtype)
+        mask = get_random_torch_tensor(shape, dtype="float16") > 0
+        y_pt = torch.masked_select(x, mask)
+        y = torch.empty((x.numel(),), dtype=x.dtype, device=x.device)
+        y_ait = module.run_with_tensors([x, mask], [y])["output_values"]
+        # y_ait contains the correct result. It points to the same memory blob as y, but has the correct shape
+        self.assertTrue(torch.allclose(y_pt, y_ait, atol=1e-10, rtol=0))
+        # y retained the original shape (x.numel(),), so needs to be cut before comparison
+        self.assertTrue(torch.allclose(y_pt, y[: y_ait.shape[0]], atol=1e-10, rtol=0))
+
+    def test_empty_output(self, shape=(2, 6)):
+        """
+        The case when the mask is zero and the output is an empty tensor.
+        """
+        self._test_masked_select(
+            shape=shape,
+            test_name="masked_select_zero_mask",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(1024)
+    unittest.main()

From 5e3f1ba9a20feb4519a048a73c808479bb1eefbd Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Mon, 20 Feb 2023 22:32:44 -0800
Subject: [PATCH 158/638] Config the mem_align for the Attention kernel (#301)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/301

Let's not force the mem_align to be true for the attention kernel.
Instead, we set it to be true when the head_size meets the alignment
requirement and false otherwise. This change allows us to keep
the mem_align benefit while extending the kernel for more cases,
e.g. we just use the minimal alignment value for "ill-aligned"
head_size.

This change also replaced the simple return with a runtime exception
when the alignment check failed.

The temporary alignment check in fx2ait was removed since we
can ensure the kernel safety now.

Reviewed By: tenpercent

Differential Revision: D43415811

fbshipit-source-id: 243565f0334cc7b3755a269fb1e7d0df463ee253
---
 .../converters/ait_module_converters.py       |  5 ----
 .../cuda/attention/mem_eff_attention.py       | 30 +++++++++++++++++--
 tests/unittest/ops/test_attention.py          | 21 +++++++++++++
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index 792444be0..a869b18ae 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -40,11 +40,6 @@ def multi_head_attention_module(
     value = kwargs["value"] if "value" in kwargs else args[2]
     bsz, seq_len_q, dim = query.shape()
     _, seq_len, _ = key.shape()
-    # TODO update check condition once AIT backend ease kAlignment check condition
-    if submod.num_heads % 8 != 0:
-        raise ValueError(
-            f"The number of heads for MHA module is not supported:{submod.num_heads}"
-        )
     attn = nn.CrossAttention(
         dim=submod.embed_dim,
         seq_len=seq_len_q.value(),
diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 6a473d7d9..9a3e39d47 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -29,12 +29,16 @@
 #include <iostream>
 #include <cuda_fp16.h>
 #include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
 // TODO: this include should be removed. There's a bug in CUTLASS, the
 // header containing cutlass::gemm::warp::WarpSize is not being included.
 // Until the fix is upstreamed, just inject it here instead.
 #include "cutlass/gemm/warp/mma.h"
+#include "gemm_kernel_utils.h"
 #include "kernel_forward.h"
 
+using namespace gemm_kernel_utils;
+
 {{func_signature}}
 {
 
@@ -80,10 +84,28 @@
         std::cerr << "WARNING: you will get better performance with `kSingleValueIteration=true` (keeps the output in RF rather than GMEM)";
     }
 
+    using GemmType = DefaultGemmType<ArchTag, {{elem_input_type}}>;
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            {{elem_input_type}},
+            {{elem_input_type}},
+            {{elem_input_type}}, // ElementC
+            float // ElementAccumulator
+            >;
+
+    // If the head_size already meets the alignment requirement, then
+    // it's safe to mark mem_align to be true to maximize the alignment
+    // benefit. Otherwise, assign false to it to use the minimal alignment.
+    constexpr const bool mem_align =
+        ({{head_size}} % DefaultConfig::kAlignmentA == 0) &&
+        ({{head_size}} % DefaultConfig::kAlignmentB == 0);
     using Attention = AttentionKernel<
         {{elem_input_type}}, // scalar_t
         ArchTag,
-        true, // memory is aligned
+        mem_align, // memory is aligned
         kQueriesPerBlock,
         kKeysPerBlock,
         kSingleValueIteration
@@ -133,8 +155,9 @@
       cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
     }
     if (!Attention::check_supported(p)) {
-      std::cerr << "Kernel does not support these inputs" << std::endl;
-      return;
+      std::string error_msg = std::string("Got error: kernel does not support these inputs") +
+           " at " + __FILE__ + ": " + std::to_string(__LINE__);          
+      throw std::runtime_error(error_msg);
     }
     kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
 
@@ -205,6 +228,7 @@ def mem_eff_attention_gen_function(func_attrs: Dict[str, Any]) -> str:
     )
     return FUNC_TEMPLATE.render(
         elem_input_type=elem_input_type,
+        head_size=func_attrs["head_size"],
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         kIs64x64="true" if func_attrs["head_size"] <= 64 else "false",
         kSingleValueIteration="true" if func_attrs["head_size"] <= 128 else "false",
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index f6a3bec08..7099d2560 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -574,12 +574,33 @@ def test_mem_eff_attention_fp16(self):
                 test_name=f"mem_eff_attention_fp16_{use_perm}_causal",
                 dtype="float16",
             )
+            self._test_mem_eff_attention(
+                batch_size=16,
+                nheads=4,
+                seqlen=8,
+                n=80,
+                use_perm=use_perm,
+                test_name="mem_eff_attention_fp16_nheads_20",
+                dtype="float16",
+            )
             # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
             # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
             # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
             # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
             # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.expectedFailure
+    def test_mem_eff_attention_invalid_head_size_fp16(self):
+        self._test_mem_eff_attention(
+            batch_size=16,
+            nheads=8,
+            seqlen=8,
+            n=80,
+            test_name="mem_eff_attention_fp16_invalid_head_size",
+            dtype="float16",
+        )
+
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     @unittest.skipIf(
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,

From cc51fc2c83f8acfa4beb59a67ddba8efa843a862 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Tue, 21 Feb 2023 19:08:50 +0800
Subject: [PATCH 159/638] fix bugs

---
 python/aitemplate/backend/rocm/padding/nhwc3to4.py     | 4 ++--
 python/aitemplate/backend/rocm/padding/nhwc3to8.py     | 4 ++--
 python/aitemplate/backend/rocm/padding/pad_last_dim.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/rocm/padding/nhwc3to4.py b/python/aitemplate/backend/rocm/padding/nhwc3to4.py
index 8714a216e..f652d8b75 100644
--- a/python/aitemplate/backend/rocm/padding/nhwc3to4.py
+++ b/python/aitemplate/backend/rocm/padding/nhwc3to4.py
@@ -18,7 +18,7 @@
 import jinja2
 
 from ... import registry
-from ...backend_spec import CUDASpec
+from ...backend_spec import ROCMSpec
 
 # pylint: disable=C0301,W0613,W0612
 
@@ -172,7 +172,7 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
-    backend_spec = CUDASpec()
+    backend_spec = ROCMSpec()
     elem_input_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
diff --git a/python/aitemplate/backend/rocm/padding/nhwc3to8.py b/python/aitemplate/backend/rocm/padding/nhwc3to8.py
index 302317c4f..01e508a2c 100644
--- a/python/aitemplate/backend/rocm/padding/nhwc3to8.py
+++ b/python/aitemplate/backend/rocm/padding/nhwc3to8.py
@@ -18,7 +18,7 @@
 import jinja2
 
 from ... import registry
-from ...backend_spec import CUDASpec
+from ...backend_spec import ROCMSpec
 
 # pylint: disable=C0301,W0613,W0612
 
@@ -184,7 +184,7 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
-    backend_spec = CUDASpec()
+    backend_spec = ROCMSpec()
     elem_input_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
diff --git a/python/aitemplate/backend/rocm/padding/pad_last_dim.py b/python/aitemplate/backend/rocm/padding/pad_last_dim.py
index f28f96a4d..5c5936e77 100644
--- a/python/aitemplate/backend/rocm/padding/pad_last_dim.py
+++ b/python/aitemplate/backend/rocm/padding/pad_last_dim.py
@@ -18,7 +18,7 @@
 import jinja2
 
 from ... import registry
-from ...backend_spec import CUDASpec
+from ...backend_spec import ROCMSpec
 
 # pylint: disable=C0301,W0613,W0612
 
@@ -200,7 +200,7 @@ def gen_function(func_attrs, template_path, shape_eval_template, shape_save_temp
         [description]
     """
     func_name = func_attrs["name"]
-    backend_spec = CUDASpec()
+    backend_spec = ROCMSpec()
     elem_input_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )

From 25adfde4c82453a25afbba9b1b49bf6c215e224c Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 21 Feb 2023 09:44:21 -0800
Subject: [PATCH 160/638] Reduce unittest runtime. (#306)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/306

ATT, remove unnecessary unittests to reduce test runtime.

Reviewed By: chenyang78

Differential Revision: D43450380

fbshipit-source-id: abc17c06b1c735b6b322f89c4b2434112eeb4a13
---
 tests/unittest/ops/test_groupnorm.py | 58 +++++++++++++++-------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/tests/unittest/ops/test_groupnorm.py b/tests/unittest/ops/test_groupnorm.py
index 4b4258443..fb65ddfd1 100644
--- a/tests/unittest/ops/test_groupnorm.py
+++ b/tests/unittest/ops/test_groupnorm.py
@@ -120,14 +120,12 @@ def _test_groupnorm(
 
     def test_groupnorm_float16(self):
         self._test_groupnorm()
-        self._test_groupnorm(x_shape=[3, 3, 1, 4], num_groups=2, eps=1e-5)
         self._test_groupnorm(x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 16, 16, 8192], num_groups=32, eps=1e-3)
         self._test_groupnorm(x_shape=[3, 64, 64, 128], num_groups=16, eps=1e-5)
         self._test_groupnorm(x_shape=[3, 33, 64, 120], num_groups=10, eps=1e-5)
         self._test_groupnorm(x_shape=[8, 34, 10, 72], num_groups=6, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 8, 1, 64], num_groups=32, eps=1e-5)
-        self._test_groupnorm(x_shape=[1, 8, 1, 4], num_groups=2, eps=1e-5)
         self._test_groupnorm(x_shape=[1, 8, 1, 4], num_groups=2, eps=1e-5, copy_op=True)
 
     def test_groupnorm_swish(self):
@@ -136,33 +134,39 @@ def test_groupnorm_swish(self):
             x_shape=[3, 3, 1, 4], num_groups=2, eps=1e-5, use_swish=True
         )
         self._test_groupnorm(
-            x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5, use_swish=True
+            x_shape=[7, 13, 9, 12], num_groups=4, eps=1e-5, use_swish=True, copy_op=True
+        )
+        self._test_groupnorm(
+            x_shape=[2, 8, 8, 1280], num_groups=32, eps=1e-5, use_swish=True
+        )
+        self._test_groupnorm(
+            x_shape=[2, 32, 32, 320], num_groups=32, eps=1e-5, use_swish=True
+        )
+        self._test_groupnorm(
+            x_shape=[1, 512, 512, 256], num_groups=32, eps=1e-5, use_swish=True
         )
 
-        shapes = [
-            (2, 16, 16, 1280),
-            (2, 16, 16, 1920),
-            (2, 16, 16, 2560),
-            (2, 16, 16, 640),
-            (2, 32, 32, 1280),
-            (2, 32, 32, 1920),
-            (2, 32, 32, 320),
-            (2, 32, 32, 640),
-            (2, 32, 32, 960),
-            (2, 64, 64, 320),
-            (2, 8, 8, 1280),
-            (2, 8, 8, 2560),
-            (2, 64, 64, 640),
-            (2, 64, 64, 960),
-            (1, 256, 256, 128),
-            (1, 512, 512, 256),
-        ]
-
-        for shape in shapes:
-            self._test_groupnorm(x_shape=shape, num_groups=32, eps=1e-5, use_swish=True)
-            self._test_groupnorm(
-                x_shape=shape, num_groups=32, eps=1e-5, use_swish=True, copy_op=True
-            )
+        # For benchmark only.
+        # shapes = [
+        #     (2, 16, 16, 1280),
+        #     (2, 16, 16, 1920),
+        #     (2, 16, 16, 2560),
+        #     (2, 16, 16, 640),
+        #     (2, 32, 32, 1280),
+        #     (2, 32, 32, 1920),
+        #     (2, 32, 32, 320),
+        #     (2, 32, 32, 640),
+        #     (2, 32, 32, 960),
+        #     (2, 64, 64, 320),
+        #     (2, 8, 8, 1280),
+        #     (2, 8, 8, 2560),
+        #     (2, 64, 64, 640),
+        #     (2, 64, 64, 960),
+        #     (1, 256, 256, 128),
+        #     (1, 512, 512, 256),
+        # ]
+        # for shape in shapes:
+        #     self._test_groupnorm(x_shape=shape, num_groups=32, eps=1e-5, use_swish=True)
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
     def test_groupnorm_float32(self):

From f436d4dbb2e4498a47f9fb16509747329f261822 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 21 Feb 2023 10:43:30 -0800
Subject: [PATCH 161/638] use TensorAccessor facilities in concatenate_fast
 kernel (#298)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/298

Reviewed By: chenyang78

Differential Revision: D42723049

fbshipit-source-id: 4d1ed089d3467b47b447cf23ea25a298f12a1ef7
---
 .../backend/cuda/tensor/concatenate_fast.cuh  | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
index 387d508d2..da8b26996 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
@@ -425,24 +425,35 @@ __global__ void ConcatKernelGeneralized(
   // Allocate a temporary buffer and perform all these read ops
   ChunkInputT inputValues[N_READ_OPS];
 
+  // don't merge these two branches, it is slower
   if (actualTEValue != originalTEValue) {
+    TensorAccessor inputTA{0, false, 0, originalTEValue, actualTEValue};
+
 #pragma unroll N_READ_OPS
     for (int32_t i = 0; i < N_READ_OPS; i++) {
-      // do remapping according to a TensorAccessor logic
-      // the remapping is expensive.
-      const IndexT iInputRow = (readPositionContiguous + i * (IndexT)ChunkInputT::NElements) / originalTEValue;
-      const IndexT iInputPos = (readPositionContiguous + i * (IndexT)ChunkInputT::NElements) % originalTEValue;
-      const IndexT readPosition = iInputRow * actualTEValue + iInputPos;
-
       // each read op reads ChunkInputT::NElements elements from an input tensor
-      inputValues[i].load(inputData + readPosition);
+      const input_data_type* const __restrict srcp =
+        inputTA.template get<const input_data_type, const input_data_type>(
+          inputData,
+          readPositionContiguous + i * ChunkInputT::NElements
+        );
+
+      inputValues[i].load(srcp);
     }
   }
   else {
+    TensorAccessor inputTA{0, true, 0, 0, 0};
+
 #pragma unroll N_READ_OPS
     for (int32_t i = 0; i < N_READ_OPS; i++) {
       // each read op reads ChunkInputT::NElements elements from an input tensor
-      inputValues[i].load(inputData + readPositionContiguous + i * (IndexT)ChunkInputT::NElements);
+      const input_data_type* const __restrict srcp =
+        inputTA.template get<const input_data_type, const input_data_type>(
+          inputData,
+          readPositionContiguous + i * ChunkInputT::NElements
+        );
+
+      inputValues[i].load(srcp);
     }
   }
 

From e49c153d5ebac2eb65891dca4fede044cb44ce34 Mon Sep 17 00:00:00 2001
From: Zhiwei Zhao <zhiweiz@meta.com>
Date: Tue, 21 Feb 2023 14:33:54 -0800
Subject: [PATCH 162/638] In place update unit test (#309)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/309

Reviewed By: khabinov

Differential Revision: D43476294

fbshipit-source-id: c0e18f7d55697fb31ec6ddcf4fa21e0a1d1693d1
---
 fx2ait/fx2ait/test/test_fx2ait.py | 39 ++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
index 6dd503979..0154faa8f 100644
--- a/fx2ait/fx2ait/test/test_fx2ait.py
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -40,14 +40,24 @@ def setUpClass(cls) -> None:
         torch.manual_seed(0)
 
     def _test_fx2ait_impl(self, test_serialization=False, test_cuda_graph=False):
-        class TestModule(torch.nn.Module):
-            def forward(self, x, y):
-                add = x + y
-                mul = add * add
-                return mul
-
-        inputs = [torch.randn(2, 2).half().cuda(), torch.randn(2, 2).half().cuda()]
-        mod = TestModule().half().cuda()
+        mod = (
+            torch.nn.Sequential(
+                torch.nn.Linear(3, 4),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+                torch.nn.ReLU(),
+            )
+            .half()
+            .cuda()
+        )
+        inputs = [torch.randn(5, 3).half().cuda()]
         ref_output = mod(*inputs)
 
         traced = acc_tracer.trace(mod, inputs)
@@ -85,6 +95,19 @@ def forward(self, x, y):
             ait_mod = torch.jit.load(buf)
         ait_output = ait_mod(*inputs)
         torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
+        if not OSS_AIT_MODEL:
+            weights = {
+                "_0_weight": torch.ones(3, 4).cuda().half(),
+                "_0_bias": torch.randn(4).cuda().half(),
+            }
+            ait_mod.engine.update_constants_with_weights(weights)
+            ait_output = ait_mod(*inputs)
+            torch.testing.assert_close(ait_output, ref_output, atol=1e-2, rtol=1e-2)
+            ait_mod.engine.swap_constants()
+            ait_output = ait_mod(*inputs)
+            self.assertFalse(
+                torch.allclose(ait_output, ref_output, atol=1e-2, rtol=1e-2)
+            )
 
     def test_fx2ait(self):
         self._test_fx2ait_impl(test_serialization=False)

From 8d94b80a7b3006795920c5dd3a2773d71f8a569a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 21 Feb 2023 18:09:52 -0800
Subject: [PATCH 163/638] clean L2 cache in per op profiling (#299)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/299

supposedly doing memset to a large slab of gpu memory will clear the cache

also make profiling device generic

Reviewed By: alexanderguzhva

Differential Revision: D43410829

fbshipit-source-id: 3d099d5bf06bf5e3e8876acafa213694ee91a708
---
 python/aitemplate/backend/main_templates.py | 27 +++++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 332522035..f2233fa82 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -107,23 +107,33 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       if (!ss) {
         throw std::runtime_error(std::string("Could not open file ") + filename);
       }
+
+      int deviceId;
+      char* L2CacheSlab = nullptr;
+      DevicePropertyType deviceProperties;
+      GetDevice(&deviceId);
+      GetDeviceProperties(&deviceProperties, deviceId);
+      const size_t L2SizeInBytes = deviceProperties.l2CacheSize;
+      DeviceMalloc((void**) &L2CacheSlab, L2SizeInBytes);
+
       ss << "{\\n";
       {% for func_name, func in function_pair_seq %}
       {
         std::cout << "Profiling: " << "{{ func_name }}" << " (" << iters << " iterations)" << std::endl;
-        cudaEvent_t start, stop;
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);
-        cudaEventRecord(start);
+        EventType start, stop;
+        CreateEvent(&start);
+        CreateEvent(&stop);
+        EventRecord(start, stream);
         for (size_t i = 0; i < iters; ++i) {
+          DeviceMemset(L2CacheSlab, 0x73, L2SizeInBytes);
             {{ func }}
           DeviceCheckLastError(__FILE__, __LINE__);
         }
-        cudaEventRecord(stop);
-        cudaEventSynchronize(stop);
+        EventRecord(stop, stream);
+        EventSynchronize(stop);
         float milliseconds = 0.0;
-        cudaEventElapsedTime(&milliseconds, start, stop);
-        ss << "\\"" << "{{ func_name }}" << "\\": " <<  (milliseconds/iters);
+        EventElapsedTime(&milliseconds, start, stop);
+        ss << "\\"" << "{{ func_name }}" << "\\": " << std::setprecision(4) << (milliseconds/iters);
         {% if loop.last %}
           ss << "\\n";
         {% else %}
@@ -135,6 +145,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 
       DeviceToDeviceCopies(stream);
       std::cout << "AIT per op profiling finished." << std::endl;
+      FreeDeviceMemory(L2CacheSlab);
     }
 
     static std::unique_ptr<{{model_name}}> Create(

From 2c1b574fc8134ac0a0ec37413d995f8601ee4d4d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 21 Feb 2023 18:09:52 -0800
Subject: [PATCH 164/638] update per op benchmark with input/output sizes
 (#300)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/300

Reviewed By: chenyang78

Differential Revision: D43416618

fbshipit-source-id: b7c88ea24226afe44fb2e72fe11c1fb9e2c56c37
---
 python/aitemplate/backend/codegen.py        | 39 +++++++++++++++++++--
 python/aitemplate/backend/main_templates.py | 10 ++++--
 tests/unittest/backend/test_model_api.py    |  4 +--
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 904a278ca..7666511db 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -247,6 +247,30 @@ def check_not_null(
     """
 
 
+def extract_input_output_shapes(func_attrs):
+    if "input_accessors" in func_attrs:
+        input_shape = [
+            [v.pseudo_code() for v in acc.original_shapes]
+            for acc in func_attrs["input_accessors"]
+        ]
+    else:
+        input_shape = [
+            [v.pseudo_code() for v in t.shape()] for t in func_attrs["inputs"]
+        ]
+
+    if "output_accessors" in func_attrs:
+        output_shape = [
+            [v.pseudo_code() for v in acc.original_shapes]
+            for acc in func_attrs["output_accessors"]
+        ]
+
+    else:
+        output_shape = [
+            [v.pseudo_code() for v in t.shape()] for t in func_attrs["outputs"]
+        ]
+    return input_shape, output_shape
+
+
 def device_copy(dst_tensor: Tensor, src_tensor: Tensor, dst_idx: int) -> str:
     src_name = src_tensor._attrs["name"]
     dst_ptr = f"params_[{dst_idx}].ptr"
@@ -313,6 +337,8 @@ def __init__(
         self.set_inputs = []
         self.func_name_seq = []
         self.func_seq = []
+        self._input_shape_seq = []
+        self._output_shape_seq = []
         self.tensor_decl = []
         self.dim_decl = []
         self.device_to_device_copies = []
@@ -666,6 +692,10 @@ def _process_src_ops(self, node: Tensor) -> None:
                     seq = f'  {{\n  RAII_ProfilerRange _raiiOpProfilerRange("{func._attrs["outputs"][0]._attrs["name"]}");\n{seq}\n  }}'
                 self.func_name_seq.append(func._attrs["original_name"])
                 self.func_seq.append(seq)
+                input_shape, output_shape = extract_input_output_shapes(func._attrs)
+                self._input_shape_seq.append(input_shape)
+                self._output_shape_seq.append(output_shape)
+
             if "int_state_flag" in func._attrs:
                 if func._attrs["name"] not in self.state_record:
                     self.function_state.append(
@@ -756,7 +786,12 @@ def generate_model(self) -> str:
         # are not supported
         target_has_graph_mode = "true" if self.target.name() == "cuda" else "false"
 
-        func_pair_seq = zip(self.func_name_seq, self.func_seq)
+        per_op_profiler_seq = zip(
+            self.func_name_seq,
+            self.func_seq,
+            self._input_shape_seq,
+            self._output_shape_seq,
+        )
         return MODEL_TEMPLATE.render(
             model_name=self.model_name,
             function_decl="\n".join(self.func_decl),
@@ -767,7 +802,7 @@ def generate_model(self) -> str:
             device_to_device_copies="\n".join(self.device_to_device_copies),
             set_up_param_dynamic_shapes="\n".join(self.set_up_param_dynamic_shapes),
             function_seq=self.func_seq,
-            function_pair_seq=func_pair_seq,
+            per_op_profiler_seq=per_op_profiler_seq,
             tensor_decl="\n".join(self.tensor_decl),
             dim_decl="\n".join(self.dim_decl),
             function_state="\n".join(self.function_state),
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index f2233fa82..6f6b3c6d1 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -17,7 +17,6 @@
 """
 import jinja2
 
-
 MODEL_TEMPLATE = jinja2.Template(
     """
 #pragma once
@@ -117,7 +116,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       DeviceMalloc((void**) &L2CacheSlab, L2SizeInBytes);
 
       ss << "{\\n";
-      {% for func_name, func in function_pair_seq %}
+      {% for func_name, func, input_sizes, output_sizes in per_op_profiler_seq %}
       {
         std::cout << "Profiling: " << "{{ func_name }}" << " (" << iters << " iterations)" << std::endl;
         EventType start, stop;
@@ -133,7 +132,12 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
         EventSynchronize(stop);
         float milliseconds = 0.0;
         EventElapsedTime(&milliseconds, start, stop);
-        ss << "\\"" << "{{ func_name }}" << "\\": " << std::setprecision(4) << (milliseconds/iters);
+        ss << "\\"" << "{{ func_name }}" << "\\": { \\"ms_per_iter\\": "
+           << std::setprecision(4) << (milliseconds/iters)
+           << ", \\"qps\\": " << 1000 * iters / milliseconds
+           << ", \\"input_sizes\\": " << "{{ input_sizes | replace("'", '\\\\"') }}"
+           << ", \\"output_sizes\\": " << "{{ output_sizes | replace("'", '\\\\"') }}"
+           << " } ";
         {% if loop.last %}
           ss << "\\n";
         {% else %}
diff --git a/tests/unittest/backend/test_model_api.py b/tests/unittest/backend/test_model_api.py
index d984ba5ad..b99733738 100644
--- a/tests/unittest/backend/test_model_api.py
+++ b/tests/unittest/backend/test_model_api.py
@@ -528,10 +528,10 @@ def test_profile(self):
                 profile_name,
             )
             with open(profile_name) as f:
-                report = json.loads(f.read())
+                report = json.load(f)
                 self.assertTrue(len(report), 1)
                 for _, elapsed in report.items():
-                    self.assertGreater(elapsed, 0)
+                    self.assertGreater(elapsed["ms_per_iter"], 0)
 
     def test_get_output_dtype(self):
         module, inputs, output_np = self._get_simple_graph_and_output(

From f2540d530601233c48d07f38ee89534337951d11 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 21 Feb 2023 18:09:52 -0800
Subject: [PATCH 165/638] adjust for memset measurement in per op profiling
 (#302)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/302

create a pair of events for each memset

Reviewed By: alexanderguzhva

Differential Revision: D43420630

fbshipit-source-id: c41de935855150f90c8592c4c8911d17a293c2ab
---
 python/aitemplate/backend/main_templates.py | 24 ++++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 6f6b3c6d1..a1299b801 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -119,19 +119,27 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       {% for func_name, func, input_sizes, output_sizes in per_op_profiler_seq %}
       {
         std::cout << "Profiling: " << "{{ func_name }}" << " (" << iters << " iterations)" << std::endl;
-        EventType start, stop;
-        CreateEvent(&start);
-        CreateEvent(&stop);
-        EventRecord(start, stream);
-        for (size_t i = 0; i < iters; ++i) {
+        std::vector<std::pair<EventType, EventType>> call_events(iters);
+        for (auto& [call_start, call_end] : call_events) {
+          CreateEvent(&call_start);
+          CreateEvent(&call_end);
+        }
+        for (auto& [call_start, call_end]: call_events) {
           DeviceMemset(L2CacheSlab, 0x73, L2SizeInBytes);
+          EventRecord(call_start, stream);
             {{ func }}
+          EventRecord(call_end, stream);
           DeviceCheckLastError(__FILE__, __LINE__);
         }
-        EventRecord(stop, stream);
-        EventSynchronize(stop);
+        EventSynchronize(std::get<1>(call_events.back()));
         float milliseconds = 0.0;
-        EventElapsedTime(&milliseconds, start, stop);
+        for (auto& [call_start, call_end] : call_events) {
+          float call_milliseconds = 0.0;
+          EventElapsedTime(&call_milliseconds, call_start, call_end);
+          DestroyEvent(call_start);
+          DestroyEvent(call_end);
+          milliseconds += call_milliseconds;
+        }
         ss << "\\"" << "{{ func_name }}" << "\\": { \\"ms_per_iter\\": "
            << std::setprecision(4) << (milliseconds/iters)
            << ", \\"qps\\": " << 1000 * iters / milliseconds

From 8a0e2b82ad05a2640c76f760b27f2ea689263ad1 Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Tue, 21 Feb 2023 21:15:56 -0800
Subject: [PATCH 166/638] lower to ait (#292)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/292

* workaround clone bug: multiply by 1.
* apply specific fix for slice_tensor in the case of 2 x unsqueeze on the most inner dimension.

Reviewed By: frank-wei, ktnag

Differential Revision: D43158762

fbshipit-source-id: a768c82a51a058e56a64ff82f90e619795611b66
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py             |  3 +++
 fx2ait/fx2ait/converters/__init__.py            |  1 +
 fx2ait/fx2ait/converters/ait_converters.py      | 16 +++++++++-------
 fx2ait/fx2ait/converters/utils.py               | 17 +++++++++++++++--
 .../test/converters/test_ait_slice_tensor.py    | 11 +++++++++++
 fx2ait/fx2ait/tools/common_fx2ait.py            | 11 ++++++++---
 6 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 9b0316dcf..e361c0ff4 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -2254,6 +2254,7 @@ def embedding_bag_4bit_rowwise_offsets(
 
 @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
 @register_acc_op_mapping(op_and_target=("call_function", torch.sin))
+@register_acc_op_mapping(op_and_target=("call_method", "sin"))
 @register_acc_op
 def sin(*, input):
     return torch.sin(input=input)
@@ -2261,6 +2262,7 @@ def sin(*, input):
 
 @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
 @register_acc_op_mapping(op_and_target=("call_function", torch.cos))
+@register_acc_op_mapping(op_and_target=("call_method", "cos"))
 @register_acc_op
 def cos(*, input):
     return torch.cos(input=input)
@@ -3221,6 +3223,7 @@ def baddbmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
 
 
 @register_acc_op_mapping(op_and_target=("call_function", torch.clone))
+@register_acc_op_mapping(op_and_target=("call_method", "clone"))
 @register_acc_op
 def clone(*, input):
     return torch.clone(input)
diff --git a/fx2ait/fx2ait/converters/__init__.py b/fx2ait/fx2ait/converters/__init__.py
index 8f62fbc1e..990e3a43b 100644
--- a/fx2ait/fx2ait/converters/__init__.py
+++ b/fx2ait/fx2ait/converters/__init__.py
@@ -15,3 +15,4 @@
 from .ait_converters import *  # noqa: F401 F403
 from .aten2ait_converters import *  # noqa: F401 F403
 from .ait_module_converters import *  # noqa: F401 F403
+from .utils import set_tensor_layout_policy
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 29f517fdd..332bb4912 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -207,9 +207,11 @@ def acc_ops_clone(
     name: str,
 ) -> ConverterOutput:
     input_val = kwargs["input"]
-    res = copy.deepcopy(input_val)
-    res._attrs["dst_ops"].clear()
-    return res
+    # deepcopy results with an error. replace with Idnetity multiplication by 1.
+    # TODO: implement __deepcopy__ / clone for AITTensor.
+    one_const = AITTensor(shape=[], dtype="float16", name="one_const", value=1.0)
+    identity_mul_result = elementwise(FuncEnum.MUL)(input_val, one_const)
+    return identity_mul_result
 
 
 @ait_converter(acc_ops.sum)
@@ -642,6 +644,10 @@ def num_slice_types(slices):
 
     output = op(input_val, start, end)
     for dim, squeeze_func in reversed(squeezable_indices):
+        # TODO: fix None for a more general case.
+        # unsqueeze(dim=-1) to unsqueeze the most inner dimension
+        if dim > rank and squeeze_func == unsqueeze:
+            dim = -1
         output = squeeze_func(dim)(output)
     return output
 
@@ -1180,10 +1186,6 @@ def _choose_conv2d_op(
     if last_dim < 4:
         weight = pad_last_dim(len(weight._attrs["shape"]), 4)(weight)
         x = pad_last_dim(len(x._attrs["shape"]), 4)(x)
-    elif last_dim in range(5, 8):
-        to_8 = nhwc3to8()
-        weight = to_8(weight)
-        x = to_8(x)
     elif last_dim % 2 != 0:
         return RuntimeError(
             f"Conv2d is not implemented for input channel dim {last_dim}: it needs to be aligned to a multiple of 2/4/8"
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index cad21e444..ea8894a0a 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -26,6 +26,13 @@
 )
 from torch.fx.node import Argument
 
+OPS_FOLLOW_PT_TENSOR_LAYOUT = True
+
+
+def set_tensor_layout_policy(follow_pt_layout: bool):
+    global OPS_FOLLOW_PT_TENSOR_LAYOUT
+    OPS_FOLLOW_PT_TENSOR_LAYOUT = follow_pt_layout
+
 
 def get_positive_dim(dim: int, dim_size: int) -> int:
     if dim < 0:
@@ -168,11 +175,17 @@ def ait_nlc2ncl(ait_tensor: AITTensor) -> AITTensor:
 
 
 def ait_nchw2nhwc(ait_tensor: AITTensor) -> AITTensor:
-    return permute()(ait_tensor, [0, 2, 3, 1])
+    if OPS_FOLLOW_PT_TENSOR_LAYOUT:
+        return permute()(ait_tensor, [0, 2, 3, 1])
+    else:
+        return ait_tensor
 
 
 def ait_nhwc2nchw(ait_tensor: AITTensor) -> AITTensor:
-    return permute()(ait_tensor, [0, 3, 1, 2])
+    if OPS_FOLLOW_PT_TENSOR_LAYOUT:
+        return permute()(ait_tensor, [0, 3, 1, 2])
+    else:
+        return ait_tensor
 
 
 def ait_ncdhw2ndhwc(ait_tensor: AITTensor) -> AITTensor:
diff --git a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
index e14126046..8236e5077 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_slice_tensor.py
@@ -48,6 +48,17 @@ class TestSliceTensor(AITTestCase):
                 (slice(None, None, None), slice(0, 3, 1), slice(1, -1, 1)),
             ),
             ("none", (slice(None, None, None), None, slice(1, -1, 1), 1)),
+            (
+                "unsqueeze_inner_dim_twice",
+                (
+                    slice(None, None, None),
+                    slice(None, None, None),
+                    slice(None, None, None),
+                    slice(None, None, None),
+                    None,
+                    None,
+                ),
+            ),
             ("with_squeeze", (slice(None, None, None), 1, slice(1, -1, 1), None)),
             (
                 "slice_zero_slice",
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index e1a934347..a2cb261e4 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import copy
 import time
 import unittest
 
@@ -83,6 +84,7 @@ def run_test(
         permute_outputs: Optional[List[int]] = None,
         passes: List[Callable] = [],  # noqa: B006
         leaf_module: Callable = None,  # one leaf module
+        apply_passes_to_lowered_module_only=False,
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -92,6 +94,7 @@ def run_test(
         if leaf_module:
             leaf_module_list.append(leaf_module)
 
+        orig_mod = copy.deepcopy(mod)
         mod = acc_tracer.trace(
             mod,
             inputs,
@@ -102,7 +105,7 @@ def run_test(
 
         print(mod.graph)
 
-        original_inputs = inputs
+        original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
             inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
         interp = AITInterpreter(
@@ -117,6 +120,10 @@ def run_test(
                 cuda_inputs.append(i.cuda())
 
             mod.eval()
+            if apply_passes_to_lowered_module_only:
+                ref_outputs = orig_mod(*original_inputs)
+            else:
+                ref_outputs = mod(*original_inputs)
             if len(expected_ops):
                 self.assert_has_op(mod, expected_ops)
             if unexpected_ops:
@@ -150,8 +157,6 @@ def run_test(
                     interp_result,
                 )
 
-            ref_outputs = mod(*original_inputs)
-
             torch.cuda.synchronize()
             start_event = torch.cuda.Event(enable_timing=True)
             end_event = torch.cuda.Event(enable_timing=True)

From 3d098f975dd6d0c0976e0ffb0f7f12821489a297 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Tue, 21 Feb 2023 21:54:30 -0800
Subject: [PATCH 167/638] Update SD model to version 2.1 base (#286)

Summary:
Stable Diffusion Example provides scripts to download, compile and demo SD 2 model.
One issue is that default resolution in compile and demo scripts is 512x512 but download script downloads [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) which uses 768x768 resolution.

To fix this issue we can download "base" model (it has resolution 512x512). Also we can update the model to the latest version 2.1 base - [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)

ipiszy

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/286

Reviewed By: khabinov

Differential Revision: D43449192

Pulled By: ipiszy

fbshipit-source-id: 36268294e2780f358da2573e1a16567b297dee87
---
 examples/05_stable_diffusion/scripts/download_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index e5ffe56f0..1128769da 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -26,7 +26,7 @@
 )
 def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
+        "stabilityai/stable-diffusion-2-1-base",
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``

From ff5615cb0683b590101eadc6b0f9d48710a22016 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 21 Feb 2023 22:03:53 -0800
Subject: [PATCH 168/638] Time each make command (#307)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/307

ATT, get detailed timing info for each nvcc / ld commandline.

Tested a simple case test_gemm_rcr_bias_add_float.
Most time is spent on compiling the profiler: 1m24s.
For .so compilation: 54.37s is spent on obj compilation. 0.18s is spent on linking. obj compilation is done by multiple threads in parallel so the actual compilation time is only 13s.

We could reduce profiler building time by separating compilation and linking stage, and parallelizing profiling compilation in multiple threads.

Reviewed By: khabinov, chenyang78

Differential Revision: D43451031

fbshipit-source-id: fa282d0975cc5052e9689298500d24975264de6b
---
 python/aitemplate/backend/builder.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 1e735117e..64da0f252 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -53,6 +53,10 @@ def _augment_for_trace(cmd):
     ).format(cmd)
 
 
+def _time_cmd(cmd):
+    return f"exec time -f 'exit_status=%x elapsed_sec=%e argv=\"%C\"' {cmd}"
+
+
 def _log_error_context(
     stderr,
     build_dir,
@@ -328,6 +332,7 @@ def build_objs(
             else:
                 cmd = cc_cmd.format(target=target, src=src)
 
+            cmd = _time_cmd(cmd)
             _LOGGER.debug(f"The cmd for building {target} is : {cmd}")
             self._runner.push(idx, cmd, target)
         self._runner.join()
@@ -356,6 +361,7 @@ def build_so(self, target: Target, objs: list[str]):
             + compile_options
             + " -o {target} {objs}".format(target=target, objs=" ".join(objs))
         )
+        cmd = _time_cmd(cmd)
         _LOGGER.debug(f"The cmd for building {target} is {cmd}")
         self._runner.push(0, cmd, target)
         self._runner.join()
@@ -435,8 +441,12 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings)
             cfile_cmd = _augment_for_trace(cfile_cmd)
             bfile_cmd = _augment_for_trace(bfile_cmd)
             build_so_cmd = _augment_for_trace(build_so_cmd)
+        else:
+            cfile_cmd = _time_cmd(cfile_cmd)
+            bfile_cmd = _time_cmd(bfile_cmd)
+            build_so_cmd = _time_cmd(build_so_cmd)
 
-        build_exe_cmd = "$(CC) $(CFLAGS) -o $@ $(obj_files)"
+        build_exe_cmd = _time_cmd("$(CC) $(CFLAGS) -o $@ $(obj_files)")
         targets = f"{dll_name}"
 
         build_standalone_rules = ""
@@ -734,6 +744,8 @@ def _gen_makefile_for_profilers(self, file_pairs, profiler_dir):
             )
             if self._do_trace:
                 cmd_line = _augment_for_trace(cmd_line)
+            else:
+                cmd_line = _time_cmd(cmd_line)
 
             command = f"{dep_line}\n\t{cmd_line}\n"
             commands.append(command)

From 588f8b22ac6d105ec7b622030a2f3092be4a256f Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 21 Feb 2023 22:28:26 -0800
Subject: [PATCH 169/638] Print param name info in SetInputShape(). (#305)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/305

ATT, makes debugging easier.

Reviewed By: khabinov, chenyang78

Differential Revision: D43449721

fbshipit-source-id: ac17d94f76ee5c088593efe336bc5ae4487cecb0
---
 static/include/model.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/static/include/model.h b/static/include/model.h
index 51f00713a..ac9b1c30e 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -195,7 +195,7 @@ class ModelBase {
           std::to_string(shape.size));
     }
     for (size_t i = 0; i < param.shape_ptrs.size(); ++i) {
-      param.shape_ptrs[i].SetValue(shape.shape_data[i]);
+      param.shape_ptrs[i].SetValue(shape.shape_data[i], param.name);
     }
   }
 
@@ -277,12 +277,13 @@ class ModelBase {
     ParamDim(int64_t lower_bound, int64_t upper_bound, int64_t* value)
         : lower_bound_(lower_bound), upper_bound_(upper_bound), value_(value) {}
 
-    void SetValue(int64_t new_value) {
+    void SetValue(int64_t new_value, const char* name = nullptr) {
       if (new_value < lower_bound_ || new_value > upper_bound_) {
         throw std::out_of_range(
             "[SetValue] Dimension got value out of bounds; expected value to be in [" +
             std::to_string(lower_bound_) + ", " + std::to_string(upper_bound_) +
-            "], but got " + std::to_string(new_value));
+            "], but got " + std::to_string(new_value) +
+            (name ? ". Variable name: " + std::string(name) : "") + ".");
       }
       *value_ = new_value;
     }

From adf6c4b4b1584bf638a77a9a518645020a0fd46c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Wed, 22 Feb 2023 10:40:45 -0800
Subject: [PATCH 170/638] fixed a couple of lint issues for ait_converters
 (#311)

Summary:
fixed a couple of "F401 'xxx' imported but unused" issues

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/311

Reviewed By: alexanderguzhva

Differential Revision: D43494219

Pulled By: chenyang78

fbshipit-source-id: 5f1ce44dac24d0136db72afff69acc859a5af52e
---
 fx2ait/fx2ait/converters/__init__.py       | 2 +-
 fx2ait/fx2ait/converters/ait_converters.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/fx2ait/fx2ait/converters/__init__.py b/fx2ait/fx2ait/converters/__init__.py
index 990e3a43b..c2c99763d 100644
--- a/fx2ait/fx2ait/converters/__init__.py
+++ b/fx2ait/fx2ait/converters/__init__.py
@@ -15,4 +15,4 @@
 from .ait_converters import *  # noqa: F401 F403
 from .aten2ait_converters import *  # noqa: F401 F403
 from .ait_module_converters import *  # noqa: F401 F403
-from .utils import set_tensor_layout_policy
+from .utils import set_tensor_layout_policy  # noqa: F401
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 332bb4912..0aee6b664 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import copy
 import logging
 import math
 import operator
@@ -44,7 +43,6 @@
     IntVarTensor,
     layernorm,
     max_pool2d,
-    nhwc3to8,
     pad_last_dim,
     permute,
     reduce_mean,

From 0d44313638920e922ee101f72373272a26e8001a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Wed, 22 Feb 2023 15:47:27 -0800
Subject: [PATCH 171/638] make ProfilerMemoryPool work with low-memory cases
 (#310)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/310

Previously, the minimal number of copies in a ProfilerMemoryPool
instance is 2. It does not work for edge cases where we are
under GPU memory pressure. For example, for some models with large
weights being loaded into the GPU global memory, there may be
a very small amount of memory left for the profiler memory pool.
For example, the free memory could only hold for a single copy
or even less.

This PR mitigate the issue by allocating a single shared blob
that is large enough to hold the largest input of the op
being profiled.

Reviewed By: alexanderguzhva

Differential Revision: D43490681

fbshipit-source-id: 3723077aba1591c024223efeac08e53899ee744c
---
 .../gemm_epilogue_vistor/common_dual_gemm.py  | 10 +--
 .../backend/cuda/gemm_universal/bmm_common.py | 13 ++--
 .../backend/cuda/gemm_universal/common.py     | 71 ++++++++++++++++---
 .../gemm_universal/common_bias_broadcast.py   | 10 +--
 4 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 0ab286363..ef20701d2 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -71,13 +71,15 @@
 
   // The value 1 is used to force ptr_max_sz to be non-zero
   int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b0_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+  size_t one_copy_sz = a_ptr_sz + b0_ptr_sz + c_ptr_sz;
+{% if has_bias %}
+  one_copy_sz += b1_ptr_sz;
+{%endif%}
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b0_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz, /*is_output*/true);  // c_ptr: index 2
 
 {% if has_bias %}
   memory_pool->AllocateTensor(b1_ptr_sz, mem_pool_sz);  // b_ptr: index 3
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 4f4bba5d4..4f388af7a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -135,13 +135,18 @@
 
   // The value 1 is used to force ptr_max_sz to be non-zero
   int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+  size_t one_copy_sz = a_ptr_sz + b_ptr_sz + c_ptr_sz;
+{% if has_bias %}
+  one_copy_sz += c_dim2;
+{%endif%}
+{% if has_d %}
+  one_copy_sz += c_ptr_sz;
+{%endif%}
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz, /*is_output*/true);  // c_ptr: index 2
 {% if has_bias %}
   memory_pool->AllocateTensor(c_dim2, mem_pool_sz);  // bias_ptr: index 3
 {% endif %}
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 1de5a3de2..d6ef0ecc9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -353,18 +353,20 @@
 
   // The value 1 is used to force ptr_max_sz to be non-zero
   int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+
+  size_t one_copy_sz = a_ptr_sz + b_ptr_sz + c_ptr_sz;
+{% if has_bias %}
+  one_copy_sz += c_dim1;
+{%endif%}
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz, /*is_output*/true);  // c_ptr: index 2
 
 {% if has_bias %}
   memory_pool->AllocateTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
 {% endif %}
-
 """
 )
 
@@ -467,7 +469,7 @@
 
 template <typename DType>
 struct ProfilerMemoryPool {
-  ProfilerMemoryPool() {
+  ProfilerMemoryPool() : shared_input_tensor(false) {
     std::random_device rd;
     gen = std::mt19937(rd());
     uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
@@ -479,6 +481,50 @@
   }
   ~ProfilerMemoryPool() {}
 
+  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz) {
+    // TODO: special pool size for A100 L2 cache 40M
+    // need to tune it for other devices
+    int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+    size_t free_global_mem = 0;
+    size_t total_global_mem = 0;
+    cudaError_t cuda_error = cudaMemGetInfo(&free_global_mem, &total_global_mem);
+    if (cuda_error != cudaSuccess) {
+      auto error_msg = std::string("Failed to invoke cudaMemGetInfo: ") +
+          cudaGetErrorName(cuda_error) + ", at " + __FILE__;
+      throw std::runtime_error(error_msg);
+    }
+    size_t single_copy_nbytes = one_copy_sz * sizeof(DType);
+    while (mem_pool_sz > 0) {
+      size_t nbytes = single_copy_nbytes * mem_pool_sz;
+      if (nbytes < free_global_mem) {
+        break;
+      }
+      mem_pool_sz--;
+    }
+
+    if (mem_pool_sz <= 1) {
+      size_t minimal_required_nbytes = ptr_max_sz * sizeof(DType);
+      if (minimal_required_nbytes > free_global_mem) {
+        // We absolutely run out of memory
+        auto error_msg = std::string("no enough GPU memory: requested ") +
+            std::to_string(minimal_required_nbytes) + ", available: " +
+            std::to_string(free_global_mem) + ", ptr_max_sz: " +
+            std::to_string(ptr_max_sz) + ", at " + __FILE__;
+        throw std::runtime_error(error_msg);
+      } else {
+        // Let's try to allocate a single blob that is large enough to hold
+        // all input tensors. Note that this is still an approximation, because
+        // we may still hit cudaErrorMemoryAllocation error while allocating
+        // memory for the output. We will rely on cudaMalloc to throw out
+        // an exception in such a case.
+        shared_input_tensor = true;
+        AllocateGaussianTensor(ptr_max_sz);
+      }
+      return 1;
+    }
+    return mem_pool_sz;
+  }
+
   DType* AllocateGaussianTensor(int64_t size) {
     size_t length = size * sizeof(DType);
     blobs.emplace_back(length);
@@ -494,12 +540,16 @@
     return ptr;
   }
 
-
-  int AllocateTensor(int64_t size, int64_t copy) {
+  int AllocateTensor(int64_t size, int64_t copy, bool is_output = false) {
     offsets.push_back(0);
     strides.push_back(size);
     copies.push_back(copy);
-    auto ptr = AllocateGaussianTensor(size * copy);
+    DType *ptr;
+    if (!is_output && shared_input_tensor) {
+      ptr = reinterpret_cast<DType*>(blobs.back().get());
+    } else {
+      ptr = AllocateGaussianTensor(size * copy);
+    }
     ptrs.push_back(reinterpret_cast<void*>(ptr));
     return ptrs.size() - 1;
   }
@@ -525,6 +575,9 @@
   std::vector<cutlass::DeviceAllocation<uint8_t> > blobs;
   std::mt19937 gen;
   std::uniform_int_distribution<int64_t> uniform_dist;
+  // make a shared blob to hold all inputs in cases we do not have
+  // enough GPU memory
+  bool shared_input_tensor;
 };
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 20c2c402f..2e5f9de6d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -318,13 +318,15 @@
   int64_t c_ptr_sz = c_dim0 * c_dim1;
   // The value 1 is used to force ptr_max_sz to be non-zero
   int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+  size_t one_copy_sz = a_ptr_sz + b_ptr_sz + c_ptr_sz + c_dim1 + c_ptr_sz;
+{% if has_d1 %}
+  one_copy_sz += c_ptr_sz;
+{%endif%}
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz, /*is_output*/true);  // c_ptr: index 2
   memory_pool->AllocateTensor(c_dim1, mem_pool_sz);  // bias_ptr: index 3
   memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz);  // d0 ptr: index 4
 {% if has_d1 %}

From 79395ef43679e43839a7d5b88cb3f84a13b2e87c Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Wed, 22 Feb 2023 18:38:30 -0800
Subject: [PATCH 172/638] op support (#315)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/315

1. neg op support
2. fix for unbind converter neg dim case
3. add pass to replace the bmm module with replace_hstu_module_func
4. skip const folding for dtype op
5. add pass to remove pattern of dtype+to

Reviewed By: ipiszy

Differential Revision: D43298909

fbshipit-source-id: 8eb52426aaca586ae50fde75cccca6a0827a8328
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py           | 28 +++++++++++++++----
 fx2ait/fx2ait/converters/ait_converters.py    | 24 ++++++++++++++--
 .../fx2ait/test/converters/test_ait_unbind.py | 11 ++++++--
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index e361c0ff4..c64771f68 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -377,9 +377,10 @@ def custom_getattr_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
             getitem_node.meta = node.meta.copy()
             return getitem_node
 
-    assert (
-        input_obj_type == torch.Tensor
-    ), f"Expected torch.Tensor type for {input_obj_type}"
+    assert input_obj_type in [
+        torch.Tensor,
+        torch.nn.parameter.Parameter,
+    ], f"Expected torch.Tensor type for {input_obj_type}"
     assert (
         attr_name == "shape" or attr_name == "device" or attr_name == "dtype"
     ), f"Only supporting shape, device and dtype getattr for now, not {attr_name}"
@@ -430,7 +431,10 @@ def tensor_size_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
 @register_acc_op_mapping(op_and_target=("call_method", "add"))
 @register_acc_op
 def add(*, input, other):
-    return input + other
+    if not (isinstance(input, torch.Tensor) or isinstance(other, torch.Tensor)):
+        return operator.add(input, other)
+    else:
+        return input + other
 
 
 @register_acc_op_properties(AccOpProperty.unary)
@@ -1040,7 +1044,10 @@ def rescale_quantize_per_channel(*, input, acc_out_ty=None):
 @register_acc_op_mapping(op_and_target=("call_method", "sub"))
 @register_acc_op
 def sub(*, input, other):
-    return input - other
+    if not (isinstance(input, torch.Tensor) or isinstance(other, torch.Tensor)):
+        return operator.sub(input, other)
+    else:
+        return input - other
 
 
 @register_acc_op_properties(AccOpProperty.pointwise)
@@ -1746,7 +1753,10 @@ def abs(*, input):
 @register_acc_op_mapping(op_and_target=("call_function", torch.neg))
 @register_acc_op
 def neg(*, input):
-    return torch.neg(input=input)
+    if not isinstance(input, torch.Tensor):
+        return operator.neg(input)
+    else:
+        return torch.neg(input=input)
 
 
 @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
@@ -3252,6 +3262,12 @@ def group_norm(*, input, num_groups, weight=None, bias=None, eps=1e-05):
     )
 
 
+@register_acc_op_mapping(op_and_target=("call_method", "long"))
+@register_acc_op
+def long(*, input):
+    return input.long()
+
+
 ###############################################################################
 
 # Set ops as side-effectul, this prevents them from being optimized away or
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 0aee6b664..886da477f 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -490,6 +490,8 @@ def acc_ops_unbind(
     dim = kwargs["dim"]
     shape = input_val.shape()
     res = []
+    if dim < 0:
+        dim = len(shape) + dim
     for cnt in range(shape[dim].value()):
         idx = []
         for i in range(len(shape)):
@@ -1508,6 +1510,16 @@ def acc_ops_contiguous(
     return kwargs["input"]
 
 
+@ait_converter(acc_ops.to_dtype)
+def acc_ops_to_dtype(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    return kwargs["input"]
+
+
 @ait_converter(acc_ops.gelu)
 def acc_ops_gelu(
     target: Target,
@@ -1587,5 +1599,13 @@ def acc_ops_neg(
     input_val = kwargs["input"]
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
-    neg_one = AITTensor(shape=[], dtype="float16", name="neg_one", value=-1.0)
-    return elementwise(FuncEnum.MUL)(input_val, neg_one)
+    new_kwargs = kwargs.copy()
+    dt = new_kwargs["input"]._attrs["dtype"]
+    if dt == "float16" or dt == "float32":
+        new_kwargs["other"] = float(-1)
+    elif dt == "int32" or dt == "int64":
+        new_kwargs["other"] = int(-1)
+    else:
+        raise ValueError(f"Unexpected input dtype {dt}")
+
+    return create_binary_op(FuncEnum.MUL, args, new_kwargs, name)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unbind.py b/fx2ait/fx2ait/test/converters/test_ait_unbind.py
index fc5735c92..a10d27c53 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unbind.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unbind.py
@@ -14,17 +14,24 @@
 #
 import torch
 from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
 from torch import nn
 
 
 class TestUnbindTensor(AITTestCase):
-    def test_unbind(self):
+    @parameterized.expand(
+        [
+            ("positive_dim", 2),
+            ("negative_dim", -1),
+        ]
+    )
+    def test_unbind(self, name, dim):
         class GetItem(nn.Module):
             def __init__(self):
                 super().__init__()
 
             def forward(self, x):
-                y = torch.unbind(x, dim=2)
+                y = torch.unbind(x, dim=dim)
                 z = y[0]
                 return z
 

From 210d790f61fd2eb9249b7b8e46a2d37befd2d7eb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 22 Feb 2023 18:57:53 -0800
Subject: [PATCH 173/638] fix topk kernel (#314)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/314

att

Reviewed By: khabinov, alexanderguzhva

Differential Revision: D43517038

fbshipit-source-id: c9f7640ceaf909828210721f2135b7ac8b8dd49f
---
 .../backend/common/tensor/topk_common.py      | 75 +++++++++++++------
 tests/unittest/ops/test_topk.py               | 28 ++-----
 2 files changed, 60 insertions(+), 43 deletions(-)

diff --git a/python/aitemplate/backend/common/tensor/topk_common.py b/python/aitemplate/backend/common/tensor/topk_common.py
index 044833bc0..f546795d1 100644
--- a/python/aitemplate/backend/common/tensor/topk_common.py
+++ b/python/aitemplate/backend/common/tensor/topk_common.py
@@ -119,32 +119,65 @@
 }
 
 template <typename T>
-T GetZeroVal() {
-  return static_cast<T>(0);
-}
+struct NumericTraits;
 
-template <typename T>
-T GetOneVal() {
-  return static_cast<T>(1);
-}
+template<>
+struct NumericTraits<half> {
+  __host__ __device__
+  static half zero() {
+    return 0;
+  }
 
-template <typename T>
-T GetMinVal() {
-  uint16_t ret = 0xfbff;
-  return *(T*)&ret;
-}
+  __host__ __device__
+  static half one() {
+    uint16_t ret = 0x3c00;
+    return *reinterpret_cast<half*>(&ret);
+  }
 
-template <typename T>
-T GetMaxVal() {
-  uint16_t ret = 0x7bff;
-  return *(T*)&ret;
-}
+  __host__ __device__
+  static half min() {
+    uint16_t ret = 0xfbff;
+    return *reinterpret_cast<half*>(&ret);
+  }
+
+  __host__ __device__
+  static half max() {
+    uint16_t ret = 0x7bff;
+    return *reinterpret_cast<half*>(&ret);
+  }
+};
+
+template<>
+struct NumericTraits<float> {
+
+  __host__ __device__
+  static float zero() {
+    return 0.0;
+  }
+
+  __host__ __device__
+  static float one() {
+    return 1.0;
+  }
+
+  __host__ __device__
+  static float min() {
+    uint32_t ret = 0xff7fffff;
+    return *reinterpret_cast<float*>(&ret);
+  }
+
+  __host__ __device__
+  static float max() {
+    uint32_t ret = 0x7f7fffff;
+    return *reinterpret_cast<float*>(&ret);
+  }
+};
 
 template <typename T>
 T PowOf2Floor(T val, int64_t max_power) {
   T max_floor = static_cast<T>(std::pow(2, max_power));
   val = std::min(val, max_floor);
-  T ret = GetOneVal<T>();
+  T ret = (T) 1;
   while (true) {
     ret *= 2;
     if (ret >= val) {
@@ -157,7 +190,7 @@
 T PowOf2Ceil(T val, int64_t max_power) {
   T max_ceil = static_cast<T>(std::pow(2, max_power));
   val = std::min(val, max_ceil);
-  T ret = GetOneVal<T>();
+  T ret = (T) 1;
   while (true) {
     ret *= 2;
     if (ret >= val) {
@@ -558,8 +591,8 @@ class TmpBufferManager final {
             instance_size,
             k,
             heap_size,
-            GetMaxVal<int64_t>(),
-            GetMinVal<T>(),
+            std::numeric_limits<int64_t>::max(),
+            NumericTraits<T>::min(),
             (int64_t*)output);
 
   } else {
diff --git a/tests/unittest/ops/test_topk.py b/tests/unittest/ops/test_topk.py
index 01f65c026..3a3353d02 100644
--- a/tests/unittest/ops/test_topk.py
+++ b/tests/unittest/ops/test_topk.py
@@ -72,18 +72,18 @@ def _test_topk(
         x = scores.reshape(shape).contiguous()
         y = torch.empty(o_shape).cuda().to(torch.int64)
         module.run_with_tensors([x], [y])
-        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(y_pt, y, atol=0, rtol=0)
         self.test_count += 1
 
     def test_topk_heap(self):
-        self._test_topk(shape=(2000,), topK=100, test_name="topk_heap")
+        self._test_topk(shape=(2000,), topK=30, test_name="topk_heap")
         self._test_topk(
-            shape=(2000,), topK=100, test_name="topk_heap_copy_op", copy_op=True
+            shape=(2000,), topK=40, test_name="topk_heap_copy_op", copy_op=True
         )
-        self._test_topk(shape=(4, 500), topK=100, dim=1, test_name="topk_heap2")
+        self._test_topk(shape=(4, 500), topK=50, dim=1, test_name="topk_heap2")
         self._test_topk(
             shape=(4, 500),
-            topK=100,
+            topK=2,
             dim=1,
             test_name="topk_heap2_copy_op",
             copy_op=True,
@@ -105,22 +105,6 @@ def test_topk_sort(self):
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCm.")
     def test_float32(self):
-        self._test_topk(
-            shape=(4, 500),
-            topK=200,
-            dim=1,
-            test_name="topk_sort_copy_op_f32",
-            copy_op=True,
-            dtype="float32",
-        )
-        self._test_topk(
-            shape=(4, 500),
-            topK=100,
-            dim=1,
-            test_name="topk_heap_copy_op_f32",
-            copy_op=True,
-            dtype="float32",
-        )
         self._test_topk(
             shape=(4, 500),
             topK=200,
@@ -131,7 +115,7 @@ def test_float32(self):
         )
         self._test_topk(
             shape=(4, 500),
-            topK=100,
+            topK=30,
             dim=1,
             test_name="topk_heap_f32",
             copy_op=False,

From 979d79ea6e1bb90b94349fe61ff2f6e37c72ff86 Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Thu, 23 Feb 2023 07:49:34 -0800
Subject: [PATCH 174/638] add conv3d_bias (#308)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/308

Add the conv3d_bias in fx2ait. Now it only supports groups == 1

Reviewed By: qxy11

Differential Revision: D43474419

fbshipit-source-id: 04ced1ebc422e14050cdc0ae622c7878baec652b
---
 fx2ait/fx2ait/converters/ait_converters.py    | 34 ++++++++++++++-----
 .../fx2ait/test/converters/test_ait_conv3d.py | 28 +++++++++++++++
 .../converters/test_ait_conv3d_depthwise.py   | 28 +++++++++++++++
 python/aitemplate/backend/codegen.py          |  2 +-
 python/aitemplate/compiler/public/__init__.py |  3 +-
 tests/unittest/ops/test_depthwise_conv3d.py   |  1 +
 6 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 886da477f..22a50345a 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -28,6 +28,7 @@
     conv2d,
     conv2d_bias,
     conv3d,
+    conv3d_bias,
     depthwise_conv3d,
     dynamic_slice,
     elementwise,
@@ -43,6 +44,7 @@
     IntVarTensor,
     layernorm,
     max_pool2d,
+    ndhwc3to8,
     pad_last_dim,
     permute,
     reduce_mean,
@@ -1305,18 +1307,32 @@ def _choose_conv3d_op(
     Helper to choose conv3d vs. depthwise_conv3d op based on existence of bias
     and groups
     """
-    if bias is not None:
-        assert (
-            groups == weight._attrs["shape"][0].value()
-        ), f"Currently only support channel == groups, but got channel: {weight._attrs['shape'][0].value()} and groups: {groups}"
+    has_bias = bias is not None
+    if groups is None or groups == 1:
+        if has_bias:
+            C_in = x.shape()[-1].value()
+
+            if 3 == C_in:
+                x = ndhwc3to8()(x)
+                weight = ndhwc3to8()(weight)
+            elif 8 != C_in:
+                raise RuntimeError(
+                    f"When having bias, conv3d currently only supports C_in == 3 or C_in == 8, but got C_in: {C_in}"
+                )
+
+            return conv3d_bias(stride=stride, pad=pad, dilate=dilate, group=1)(
+                x, weight, bias
+            )
+        else:
+            return conv3d(stride=stride, pad=pad, dilate=dilate, group=1)(x, weight)
+    elif groups == weight._attrs["shape"][0].value():
         return depthwise_conv3d(
-            stride=stride, pad=pad, dilate=dilate, group=groups, bias=True
+            stride=stride, pad=pad, dilate=dilate, group=groups, bias=has_bias
         )(x, weight, bias)
     else:
-        assert (
-            groups is None or groups == 1
-        ), "Currently only support non-bias conv3d without groups"
-        return conv3d(stride=stride, pad=pad, dilate=dilate)(x, weight)
+        raise RuntimeError(
+            f"Currently NOT support groups != channels when groups enabled. Got C_in: {C_in} | groups: {groups} | has bias: {has_bias}"
+        )
 
 
 @ait_converter(acc_ops.conv3d)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
index 0d94f4b09..e47a74d88 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d.py
@@ -131,6 +131,34 @@ class TestAitConv3d(AITTestCase):
                 w=224,
                 bias=False,
             ),
+            param(
+                name="conv3d_bias",
+                kernel_size=(3, 5, 5),
+                stride=(2, 4, 4),
+                padding=(1, 2, 2),
+                dilation=1,
+                ci=8,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=True,
+            ),
+            param(
+                name="conv3d_bias_ndhwc3to8",
+                kernel_size=(3, 5, 5),
+                stride=(2, 4, 4),
+                padding=(1, 2, 2),
+                dilation=1,
+                ci=3,
+                co=96,
+                groups=1,
+                d=4,
+                h=224,
+                w=224,
+                bias=True,
+            ),
         ]
     )
     def test_conv3d(
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
index 54e1b5cd3..809d5abb4 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv3d_depthwise.py
@@ -67,6 +67,34 @@ class TestAitDepthwiseConv3d(AITTestCase):
                 w=7,
                 bias=True,
             ),
+            param(
+                name="depthwise_conv3d_4",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=3,
+                co=3,
+                groups=3,
+                d=4,
+                h=224,
+                w=224,
+                bias=True,
+            ),
+            param(
+                name="depthwise_conv3d_no_bias",
+                kernel_size=(1, 1, 1),
+                stride=(1, 1, 1),
+                padding=0,
+                dilation=1,
+                ci=96,
+                co=96,
+                groups=96,
+                d=4,
+                h=224,
+                w=224,
+                bias=False,
+            ),
         ]
     )
     def test_depthwise_conv3d(
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 7666511db..764db7421 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -579,7 +579,7 @@ def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(set_value(name, view._attrs["name"]))
             return
         is_view = view is not None
-        if is_view:
+        if is_view and len(self.param_name_to_ptr_idx) > 0:
             ptr_idx = self.param_name_to_ptr_idx[view._attrs["name"]]
             self.set_inputs.append(set_value(name, view._attrs["name"]))
         else:
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index a6caf8b35..3f5e77761 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -58,6 +58,7 @@
 from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
 from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
 from aitemplate.compiler.ops.conv.conv3d import conv3d
+from aitemplate.compiler.ops.conv.conv3d_bias import conv3d_bias
 from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
@@ -67,7 +68,7 @@
     group_layernorm_sigmoid_mul,
 )
 from aitemplate.compiler.ops.layernorm.layernorm import layernorm
-from aitemplate.compiler.ops.padding import nhwc3to8, pad_last_dim
+from aitemplate.compiler.ops.padding import ndhwc3to8, nhwc3to8, pad_last_dim
 from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
 from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 from aitemplate.compiler.ops.softmax.softmax import softmax
diff --git a/tests/unittest/ops/test_depthwise_conv3d.py b/tests/unittest/ops/test_depthwise_conv3d.py
index cabd697df..d0948a9da 100644
--- a/tests/unittest/ops/test_depthwise_conv3d.py
+++ b/tests/unittest/ops/test_depthwise_conv3d.py
@@ -178,6 +178,7 @@ def test_mvit(self):
         self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (3, 3, 3), (1, 4, 4), "5")
         self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (3, 3, 3), (2, 8, 8), "6")
         self._test_mvit_shape(128, 2, 56, 56, 96, 96, 96, (1, 3, 3), (2, 8, 8), "7")
+        self._test_mvit_shape(128, 2, 56, 56, 3, 3, 3, (1, 3, 3), (2, 8, 8), "7")
 
 
 if __name__ == "__main__":

From b11b5e989fe79262ba29a9390003a6c9a210ca5d Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Thu, 23 Feb 2023 08:45:10 -0800
Subject: [PATCH 175/638] Improve batch size detection (#316)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/316

Reviewed By: frank-wei

Differential Revision: D43197422

fbshipit-source-id: 85db98f2161261dcd2a1da7e15b7ab78b5ff4b4c
---
 fx2ait/fx2ait/tensor_spec.py           | 23 +++++++++++------
 fx2ait/fx2ait/test/test_tensor_spec.py | 34 ++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index 8827e38af..5aa4f1665 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from functools import reduce
 from typing import Any, List
 
 import torch
@@ -195,20 +194,28 @@ def find_batch_size_dim(cls, inputs: Any) -> []:
         if isinstance(inputs, torch.Tensor) or len(inputs) <= 1:
             return [0]
         shapes = [i.shape for i in inputs]
-        batch_size = list(reduce(lambda i, j: i & j, (set(x) for x in shapes)))
-        if len(batch_size) != 1:
-            # Unable to find unified batch_size value among input tensors, default batch_size dim=0
-            return [0] * len(inputs)
+        frequency_map = {}
+        for shape in shapes:
+            if len(shape) < 2:
+                # By pass for rank-1 tensors. MRS model has rank-1 tensor carry no batch_size info
+                continue
+            # Dedup shape value for single tensor
+            shape = set(shape)
+            for i in shape:
+                frequency_map[i] = frequency_map.get(i, 0) + 1
+        sorted_frequency = sorted(frequency_map.items(), key=lambda x: -x[1])
+        batch_size = sorted_frequency[0][0]
 
         bs_dim = []
         for i in inputs:
-            # Default batch size dim = 0
-            dim = 0
+            # Default batch size dim = -1, indicate no batch_size
+            dim = -1
             for index, val in enumerate(i.shape):
-                if val == batch_size[0]:
+                if val == batch_size:
                     dim = index
                     break
             bs_dim.append(dim)
+
         return bs_dim
 
     @classmethod
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index 8f02116e2..e1838b025 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -117,3 +117,37 @@ def test_input_list_with_batch_size_non_default_dim(self):
             ),
             specs[2],
         )
+
+    def test_input_with_no_bs_tensor(self):
+        inputs = [
+            torch.empty([2, 10, 4], dtype=torch.float16),
+            torch.empty([20], dtype=torch.int32),
+            torch.empty([7, 10, 9], dtype=torch.float16),
+            torch.empty([20, 7, 10, 9], dtype=torch.float16),
+        ]
+
+        specs = TensorSpec.from_input_list_with_batch_size(inputs, 32, 1)
+        self.assertEqual(4, len(specs))
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(2), IntVar([1, 32], "batch_size"), IntImm(4)], torch.float16
+            ),
+            specs[0],
+        )
+        self.assertEqual(
+            TensorSpec([IntImm(20)], torch.int32),
+            specs[1],
+        )
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(7), IntVar([1, 32], "batch_size"), IntImm(9)], torch.float16
+            ),
+            specs[2],
+        )
+        self.assertEqual(
+            TensorSpec(
+                [IntImm(20), IntImm(7), IntVar([1, 32], "batch_size"), IntImm(9)],
+                torch.float16,
+            ),
+            specs[3],
+        )

From 921d43913548cc5120dddcc78edad23d8b1f325f Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Thu, 23 Feb 2023 11:14:51 -0800
Subject: [PATCH 176/638] Fix xrayvideo blocks (#317)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/317

Fix xrayvideo blocks

Reviewed By: zj00377

Differential Revision: D43527802

fbshipit-source-id: eca99ba5a40d89d8f6f6df91361bc1375e6a519a
---
 python/aitemplate/frontend/nn/conv3d.py       |  6 +++---
 .../frontend/nn/multiscale_attention.py       | 21 +++++++++++--------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/python/aitemplate/frontend/nn/conv3d.py b/python/aitemplate/frontend/nn/conv3d.py
index f105c717e..69b6f8e97 100644
--- a/python/aitemplate/frontend/nn/conv3d.py
+++ b/python/aitemplate/frontend/nn/conv3d.py
@@ -15,7 +15,7 @@
 """
 conv3d Module.
 """
-from ...compiler.ops import conv3d, depthwise_conv3d
+from ...compiler.ops import conv3d, conv3d_bias, depthwise_conv3d
 from .module import Module
 from .parameter import Parameter
 
@@ -105,8 +105,8 @@ def __init__(
 
         if groups == 1:
             if bias:
-                raise AttributeError(
-                    "conv3d with groups==1 does not support bias for now."
+                self.op = conv3d_bias(
+                    stride=stride, pad=padding, dilate=dilation, group=groups
                 )
             self.op = conv3d(stride=stride, pad=padding, dilate=dilation, group=groups)
         else:
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index a095408f7..eab3bc378 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -18,6 +18,7 @@
 https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/models/vision_transformers.py
 """
 
+import logging
 from typing import List, Optional, Tuple
 
 import numpy
@@ -31,6 +32,8 @@
 from .linear import Linear
 from .module import Module
 
+_LOGGER = logging.getLogger(__name__)
+
 
 def get_shape(x):
     shape = [it.value() for it in x._attrs["shape"]]
@@ -83,7 +86,9 @@ def __init__(
         # TODO fc1 bias is set to zeros; unset if bias_on is True
 
         self.fc1 = Linear(
-            in_features, hidden_features, bias=True, specialization=act_layer
+            in_features,
+            hidden_features,
+            bias=bias_on,
         )
         self.fc2 = Linear(hidden_features, out_features, bias=bias_on)
 
@@ -104,6 +109,8 @@ def forward(self, x: Tensor) -> Tensor:
         if self.dropout_rate > 0.0:
             x = self.dropout(x)
 
+        x = ops.elementwise(FuncEnum.GELU)(x)
+
         x = self.fc2(x)
 
         if self.dropout_rate > 0.0:
@@ -182,7 +189,7 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         B, N, L, C = get_shape(tensor)
         T, H, W = thw_shape
         tensor = ops.permute()(
-            ops.reshape()(tensor, [B * N, T, H, W, C]), [0, 4, 1, 2, 3]
+            ops.reshape()(tensor, [B * N, -1, H, W, C]), [0, 4, 1, 2, 3]
         )
 
         if self.norm_before_pool:
@@ -190,10 +197,8 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
             # # If use BN, we apply norm before pooling instead of after pooling.
             # tensor = self.norm(tensor)
             # # We also empirically find that adding a GELU here is beneficial.
-            # tensor = nn.functional.gelu(tensor)
-            raise NotImplementedError(
-                f"Unsupport batchnorm3d when {self.norm_before_pool}"
-            )
+            tensor = ops.elementwise(FuncEnum.GELU)(tensor)
+            _LOGGER.warning(f"Unsupport batchnorm3d when {self.norm_before_pool}")
 
         tensor = self.pool(ops.permute()(tensor, [0, 2, 3, 4, 1]))
 
@@ -202,13 +207,11 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         L_pooled = shape[1] * shape[2] * shape[3]
         tensor = ops.reshape()(tensor, [B, N, L_pooled, C])
 
-        assert self.norm_before_pool
         if self.has_norm and not self.norm_before_pool:
 
             # TODO: add support for norm before pool
             # tensor = self.norm(tensor)
-
-            raise NotImplementedError("Unsupport norm before pool")
+            _LOGGER.warning(f"Unsupport norm before pool")
 
         return tensor, thw_shape
 

From 0ab78501f0cd242185b5eba8df77742bb9e15e30 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Thu, 23 Feb 2023 14:22:50 -0800
Subject: [PATCH 177/638] fix flake8 (#319)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/319

fix flake8

Reviewed By: henryhu6

Differential Revision: D43551065

fbshipit-source-id: 3a24cd0dfce6e9d9fbeedcc957a8fd64e3e94fe1
---
 python/aitemplate/backend/cuda/pool2d/max_pool2d.py   | 2 ++
 python/aitemplate/frontend/nn/multiscale_attention.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
index 1ec46cd10..ca4a0c20c 100644
--- a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
@@ -165,6 +165,8 @@
   dim3 grid(NI, (HO + block_h - 1) / block_h,
             (WO + block_w - 1) / block_w);
   dim3 block(CI / 2, block_ch);
+  auto kernel_func = max_pool_nhwc_kernel<kernel_size, stride, pad, 4, 4, 4>;
+  cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
   max_pool_nhwc_kernel<kernel_size, stride, pad, 4, 4, 4>
       <<<grid, block, shm_size, stream>>>(input, output, NI, HI,
                                           WI, CI / 2, HO, WO);
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index eab3bc378..2ef5c11f4 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -211,7 +211,7 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
 
             # TODO: add support for norm before pool
             # tensor = self.norm(tensor)
-            _LOGGER.warning(f"Unsupport norm before pool")
+            _LOGGER.warning("Unsupport norm before pool")
 
         return tensor, thw_shape
 

From 0e87b5822ee36ffda4ddf3a79b25629a107280c4 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 23 Feb 2023 15:25:57 -0800
Subject: [PATCH 178/638] Add jagged Tensor foundations (#313)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/313

This diff introduces jagged Tensor foundations in AIT: to allow creating and interpreting jagged Tensors in the front-end / IR, as well as processing the jagged Tensor metadata in a unified way in the back-end / runtime. In the following diffs, new jagged-aware ops or jagged-aware extensions of some existing ops will follow.

## What Is Jagged Tensor?

Jagged Tensor is a compact representation of (potentially nested) variable-length data. Conventionally, variable-length sequences are padded to represent them in a form of a normal rectangular Tensor, but this comes at memory and performance costs. Jagged Tensor's compact representation, together with its interpretation, provide a more efficient approach.

In this diff, we adopt fbgemm jagged Tensor semantics. More specifically:

fbgemm introduces a number of specific jagged ops that take dense Tensor inputs and return dense Tensor outputs. But the jagged ops treat some of those dense Tensors as (potentially) nested jagged Tensors along the first dynamic dimension. Some examples of fbgemm jagged ops:

- [jagged_dense_dense_elementwise_add_jagged_output](https://pytorch.org/FBGEMM/python-api/jagged_tensor_ops.html#torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output)
- [jagged_dense_elementwise_mul](https://pytorch.org/FBGEMM/python-api/jagged_tensor_ops.html#torch.ops.fbgemm.jagged_dense_elementwise_mul)
- [batched_dense_vec_jagged_2d_mul](https://pytorch.org/FBGEMM/python-api/jagged_tensor_ops.html#torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul)

Where an fbgemm jagged op expects a jagged Tensor as input, it normally takes two arguments:

1. A rank-2 dense tensor `x_values` with the shape `[sum_B(N_B), D]`, where `sum_B(N_B)` is a dynamic dimension and `D` is a fixed dimension (e.g., embedding dimension).
2. A list of rank-1 dense tensors `x_offsets`.

The first dimension `sum_B(N_B)` in `x_values` is interpreted as a collection of B groups of different lengths `N_1`, `N_2`, …, `N_B`, with all those variable lengths summing to `sum_B(N_B)`. If there is a single jagged dimension in the jagged Tensor, each of those `B` groups contains a variable-sized fraction of the `sum_B(N_B)` `D`-sized vectors in the `x_values`. However, there can be more than one jagged dimension, in which case each of the `B` groups contains variable-sized sub-groups, each of which in turn contains a variable-sized fraction of the `sum_B(N_B)` `D`-sized vectors in `x_values`. Again, the numbers of vectors in each sub-group in each group should sum to `sum_B(N_B)`. Theoretically, the level of nesting of the jagged dimensions can be arbitrary: groups can contain sub-groups, sub-groups can contain sub-sub-groups, etc. until the lowest-level sub-...-sub-groups contain contiguous fractions of the `sum_B(N_B)` `D`-sized vectors in `x_values`. Ultimately, the sizes of all the lowest-level sub-...-sub-groups should sum to `sum_B(N_B)`.

fbgemm encodes the nested group information about the single `sum_B(N_B)` dimension in the second argument: `x_offsets`. It is a list of rank-1 `int32` Tensors (vectors), each representing the offsets at a progressively deeper level of group nesting. Consider an example of an `x_offsets` list with two rank-1 offset Tensors: representing the offsets at the first and second levels of group nesting. `lengths[i]` below shows the size of each group at a given level, whereas `offsets[i]` is cumsum over `lengths[i]`, with a prepended zero. `offsets[i]` is an actual item in the `x_offsets` list representing a jagged Tensor (`lengths[i]` are shown here for clarity).

```
lengths[0]: tensor([ 3,  2,  5, 1])
offsets[0]: tensor([ 0,  3,  5, 10, 11])

lengths[1]: tensor([ 0,  1,  6, 17, 12,  5,  8, 17, 10, 15,  4])
offsets[1]: tensor([ 0,  0,  1,  7, 24, 36, 41, 49, 66, 76, 91, 95])
```

At the first level, there are 4 groups in the jagged Tensor, with the sizes represented by the `lengths[0]`: 3, 2, 5, and 1 sub-groups in each. In total, there are 3+2+5+1=11 sub-groups. The sizes of those 11 sub-groups, in turn, are represented by the `lengths[1]`. Each sub-group contains the specified number of `D`-sized vectors in `x_values`. The shares of each sub-group in `x_values` are contiguous along the first dimension `sum_B(N_B)` of `x_values`, with the sum of the sub-group sizes being equal to `sum_B(N_B)` (in this case: 95). Therefore, the shape of `x_values` corresponding to this particular  `x_offsets` list is `[95, D]`.

## Front-End Implementation Details

In this diff, we extend the fbgemm jagged Tensor semantics by allowing arbitrary number of dimensions following the `sum_B(N_B)` (not just one dimension `D`). Otherwise, the suggested jagged Tensor API follows the above. It also aims at making it more convenient to combine the separate pieces like `x_values` and `x_offsets`, as well as some IR-level metadata, into a single jagged Tensor entity.

The new front-end components are:

- `JaggedDim`: a class for representing a single jagged dimension encoded in the first `sum_B(N_B)` dimension of the jagged Tensor. With min/max value and the associated rank-1 `offsets` Tensor.

- `JaggedIntVar`: a specification of `IntVar` that contains jagged Tensor-related metadata: `batch_dim` (`B`), `jagged_dims` (as many as there are levels of nesting in the jagged Tensor), and `total_length` (the actual dynamic IntVar dim this JaggedIntVar is based on, representing `sum_B(N_B)`). The name and `values` of the `JaggedIntVar` are the same as those of the `total_length`. Basically, a jagged Tensor is a normal Tensor with the first dimension in the shape set to a `JaggedIntVar`. That's why a jagged Tensor can be transparently interpreted as a normal tensor with the shape `[sum_B(N_B), D1, ... Dn]` by non-jagged-aware ops in AIT (e.g., gemm or bmm with the jagged Tensor as the first argument). Another important consequence is that, due to the fact that the majority of ops reuse the input dynamic dim in the output, the jagged Tensor input to non-jagged-aware op will normally result in the jagged Tensor output, as the `JaggedIntVar` will propagate to the output shape (with all the metadata preserved).

- `make_jagged` is a new op that takes the normal "source" Tensor with jagged Tensor's data (`x_values` in fbgemm), the list of rank-1 offset Tensors (`x_offsets` in fbgemm), as well as the metadata (`batch_dim` and `jagged_dims`) and spits out a jagged Tensor as the output. Importantly, the jagged Tensor is a view of the source Tensor, just with the fully specified JaggedIntVar in `_attrs["shape"][0]`. Other than being convenient, one important reason why `make_jagged` should be used instead of "constructing" the jagged Tensor directly is that, if it isn't, the rank-1 offsets Tensors may otherwise remain "hanging in the air" and optimized out by the graph transformation passes.

## Back-End Implementation Details

In the back-end, the offsets of each jagged Tensor are represented by a new structure `ait::JaggedOffsets<N>` templated by the number `N` of jagged dims (or, equivalently, offset tensors) in the jagged Tensor's representation. The structure packs the lengths (on host) and data (on device) of the rank-1 offset Tensors, making it convenient to pass all of those by value to CUDA kernels. In the runtime, the `JaggedOffsets` variable is given a name formed from the name of the `JaggedIntVar`, hence fully predictable by the back-end codegen of the jagged-aware ops.

The back-end codegen of the `make_jagged` ops solves two tasks:

1. Associate the lenghts and data of the rank-1 offset Tensors with the corresponding members of the `JaggedOffsets` structure of the jagged Tensor.

2. Check the validity of the offset contents (there are a few things to check here: monotonicity, end values, matching the total group numbers or the batch_dim with the number of offsets in the next array). With the offset validity determined by `make_jagged`, the subsequent jagged-aware ops (e.g., elementwise) can rely on the established invariant. This should simplify the implementation of those ops.

The docstrings of the individual components provide more details of each.

Reviewed By: ipiszy

Differential Revision: D43225824

fbshipit-source-id: 660e9e37f7fe29a7db158737bf6f8a49b3b5f80a
---
 python/aitemplate/backend/codegen.py          |  36 +++
 .../backend/cuda/view_ops/__init__.py         |   7 +-
 .../backend/cuda/view_ops/make_jagged.py      | 260 ++++++++++++++++++
 python/aitemplate/backend/main_templates.py   |   2 +
 python/aitemplate/compiler/base.py            | 230 +++++++++++++++-
 .../compiler/ops/common/view_ops.py           | 177 +++++++++++-
 .../compiler/transform/name_graph.py          |  34 ++-
 static/include/jagged.h                       |  36 +++
 tests/unittest/ops/test_make_jagged.py        | 115 ++++++++
 9 files changed, 891 insertions(+), 6 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/view_ops/make_jagged.py
 create mode 100644 static/include/jagged.h
 create mode 100644 tests/unittest/ops/test_make_jagged.py

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 764db7421..17249e62d 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -341,6 +341,7 @@ def __init__(
         self._output_shape_seq = []
         self.tensor_decl = []
         self.dim_decl = []
+        self.jagged_decl = []
         self.device_to_device_copies = []
         self.function_state = []
         self.set_up_constants = []
@@ -651,6 +652,37 @@ def _process_dims(self, shape: List[IntVar]) -> None:
             self.dim_decl.append(self.f_var_decl(dim._attrs["name"], intimm))
             self.visited_dims.add(dim._attrs["name"])
 
+    def _process_jagged_dims(self, node: Tensor) -> None:
+        # JaggedIntVars are processed separately here (besides being processed
+        # like normal IntVars in _process_dims above), as they require adding
+        # the offset structure declaration into the Model codegen, as well as
+        # the batch_dim if it's not set when processing other tensors that
+        # directly contain the batch_dim it in their shapes
+        jagged_int_var = node._attrs["shape"][0]
+        name = jagged_int_var._attrs["name"]
+
+        # we use the key with a prefix here, as the JaggedIntVar's name
+        # is identical to the name of the total_length it is based on,
+        # which might have been traversed already
+        key = f"jagged_int_var_{name}"
+        if key not in self.visited_dims:
+            for i, jagged_dim in enumerate(jagged_int_var.jagged_dims()):
+                if jagged_dim.offsets() is None:
+                    raise RuntimeError(
+                        f"No offsets Tensor is associated with the JaggedDim {i} in "
+                        f"the JaggedIntVar {name}: can't generate offset-related code."
+                    )
+            self.jagged_decl.append(
+                f"   {jagged_int_var.offsets_struct_type()} "
+                f"{jagged_int_var.offsets_var_name()};"
+            )
+            self.visited_dims.add(key)
+
+        batch_dim_name = jagged_int_var.batch_dim()._attrs["name"]
+        if batch_dim_name not in self.visited_dims:
+            self.dim_decl.append(self.f_var_decl(batch_dim_name, 0))
+            self.visited_dims.add(batch_dim_name)
+
     def _process_dims_for_tensor(self, node: Tensor) -> None:
         self._process_dims(node._attrs["shape"])
 
@@ -781,6 +813,9 @@ def append_tensor(self, node: Tensor) -> None:
         self._process_dims_for_tensor(node)
         self._process_src_ops(node)
 
+        if node.is_jagged():
+            self._process_jagged_dims(node)
+
     def generate_model(self) -> str:
         # Disable graph mode on ROCM because the updating operations
         # are not supported
@@ -805,6 +840,7 @@ def generate_model(self) -> str:
             per_op_profiler_seq=per_op_profiler_seq,
             tensor_decl="\n".join(self.tensor_decl),
             dim_decl="\n".join(self.dim_decl),
+            jagged_decl="\n".join(self.jagged_decl),
             function_state="\n".join(self.function_state),
             target_has_graph_mode=target_has_graph_mode,
             unique_workspace_size=self.workspace.unique_size,
diff --git a/python/aitemplate/backend/cuda/view_ops/__init__.py b/python/aitemplate/backend/cuda/view_ops/__init__.py
index bc232e36a..1b7b20efc 100644
--- a/python/aitemplate/backend/cuda/view_ops/__init__.py
+++ b/python/aitemplate/backend/cuda/view_ops/__init__.py
@@ -15,6 +15,9 @@
 """
 CUDA view_ops module init
 """
-from . import view_ops
+from . import make_jagged, view_ops
 
-__all__ = ["view_ops"]
+__all__ = [
+    "view_ops",
+    "make_jagged",
+]
diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
new file mode 100644
index 000000000..b9e17d17b
--- /dev/null
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -0,0 +1,260 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for the make_jagged op.
+
+The main responsibilities of the make_jagged backend are:
+
+  1. Associate the offsets structure members (lengths and data)
+  with the corresponding rank-1 offsets Tensors' first dimension
+  and data pointer, respectively.
+
+  2. Check the validity of the offset content (non-strict
+  monotonicity, first and last values in each array). Offset
+  contents are on the device, hence are checked by a simple
+  CUDA kernel doing an assertion for each constraint. Some
+  of the constraints can be checked on the device, in which
+  case an std::runtime_error is thrown on violation.
+"""
+import jinja2
+
+from ....backend import registry
+
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <assert.h>
+#include <stdexcept>
+
+#include "jagged.h"
+
+
+namespace {
+
+struct OffsetBounds {
+  {{offsets_type}} min_values[{{num_offsets}}]{0};
+  {{offsets_type}} max_values[{{num_offsets}}]{0};
+  {{offsets_type}} last_values[{{num_offsets}}]{0};
+};
+
+__global__ void check_offsets(
+  {{offsets_struct_type}} offsets,
+  OffsetBounds bounds
+) {
+  int64_t length = offsets.lengths[blockIdx.x];
+  const {{offsets_type}}* data = offsets.data[blockIdx.x];
+
+  if (threadIdx.x >= length - 1) {
+    // out of bounds of the offset array
+    return;
+  }
+
+  {{offsets_type}} group_size = data[threadIdx.x + 1] - data[threadIdx.x];
+  if (group_size < bounds.min_values[blockIdx.x] || group_size > bounds.max_values[blockIdx.x]) {
+    printf(
+      "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+      "Error: the offset difference %d is out of bounds of the jagged dimension %d (min: %d, max: %d).",
+      (int32_t)blockIdx.x,
+      (int32_t)threadIdx.x,
+      (int32_t)group_size,
+      (int32_t)blockIdx.x,
+      (int32_t)bounds.min_values[blockIdx.x],
+      (int32_t)bounds.max_values[blockIdx.x]
+    );
+    __trap();
+  }
+
+  if (threadIdx.x == 0) {
+    {{offsets_type}} first_offset = data[0];
+    if (first_offset != 0)
+    {
+      printf(
+        "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+        "Error: the first offset of the jagged dimension %d is non-zero: %d.",
+        (int32_t)blockIdx.x,
+        (int32_t)threadIdx.x,
+        (int32_t)blockIdx.x,
+        (int32_t)first_offset
+      );
+      __trap();
+    }
+  }
+
+  if (threadIdx.x == length - 2) {
+    {{offsets_type}} last_offset = data[length - 1];
+    if (last_offset != bounds.last_values[blockIdx.x])
+    {
+      printf(
+        "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+        "Error: the last offset of the jagged dimension %d is incorrect: %d (must be %d).",
+        (int32_t)blockIdx.x,
+        (int32_t)threadIdx.x,
+        (int32_t)blockIdx.x,
+        (int32_t)last_offset,
+        (int32_t)bounds.last_values[blockIdx.x]
+      );
+      __trap();
+    }
+  }
+}
+
+} // namespace
+
+
+void {{func_name}}(
+{% for idx in range(num_offsets) %}
+  int64_t offsets_length_{{idx}},
+  const void* offsets_data_{{idx}},
+{% endfor %}
+  {{offsets_struct_type}}& offsets,
+  int64_t* batch_dim,
+  int64_t total_length
+) {
+{% for idx in range(num_offsets) %}
+    offsets.lengths[{{idx}}] = offsets_length_{{idx}};
+    offsets.data[{{idx}}] = reinterpret_cast<const {{offsets_type}}*>(offsets_data_{{idx}});
+{% endfor %}
+
+{% if set_batch_dim %}
+    // batch_dim must be set by this code
+    *batch_dim = offsets.lengths[0] - 1;
+{% else %}
+    // batch_dim must have been set before this code
+    if (*batch_dim != offsets.lengths[0] - 1) {
+      throw std::runtime_error("batch_dim != len(offsets[0]) - 1");
+    }
+{% endif %}
+
+    int64_t max_offset_length = 0;
+    for (int i = 0; i < {{num_offsets}}; ++i) {
+        if (offsets.lengths[i] <= 1) {
+            throw std::runtime_error("offset array's length must be at least 2");
+        }
+        if (offsets.lengths[i] > max_offset_length) {
+            max_offset_length = offsets.lengths[i];
+        }
+    }
+
+    OffsetBounds bounds;
+{% for idx in range(num_offsets) %}
+    bounds.min_values[{{idx}}] = {{jagged_dim_min_values[idx]}};
+    bounds.max_values[{{idx}}] = {{jagged_dim_max_values[idx]}};
+    bounds.last_values[{{idx}}] = {{ "offsets.lengths[" + ((idx + 1) | string) + "] - 1" if idx < num_offsets - 1 else "total_length" }};
+{% endfor %}
+
+    check_offsets<<<{{num_offsets}}, max_offset_length - 1, 0, 0>>>(offsets, bounds);
+}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+{% for idx in range(num_offsets) %}
+  int64_t,
+  const void*,
+{% endfor %}
+  {{offsets_struct_type}}&,
+  int64_t*,
+  int64_t
+);
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{% for idx in range(num_offsets) %}
+{{indent}}  {{offsets_first_dim_names[idx]}},
+{{indent}}  {{offsets_data_names[idx]}},
+{% endfor %}
+{{indent}}  {{offsets_var_name}},
+{{indent}}  &{{batch_dim_name}},
+{{indent}}  {{source_first_dim_name}}
+{{indent}});
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+
+@registry.reg("cuda.make_jagged.gen_function")
+def make_jagged_gen_function(func_attrs):
+    func_name = func_attrs["name"]
+    offsets_list = func_attrs["inputs"][1:]
+
+    output = func_attrs["outputs"][0]
+    jagged_int_var = output._attrs["shape"][0]
+    set_batch_dim = jagged_int_var.batch_dim()._attrs.get("isolated", False)
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+    jagged_dim_min_values = [dim.min_value() for dim in jagged_int_var.jagged_dims()]
+    jagged_dim_max_values = [dim.max_value() for dim in jagged_int_var.jagged_dims()]
+
+    return SRC_TEMPLATE.render(
+        func_name=func_name,
+        num_offsets=len(offsets_list),
+        set_batch_dim=set_batch_dim,
+        offsets_struct_type=offsets_struct_type,
+        jagged_dim_min_values=jagged_dim_min_values,
+        jagged_dim_max_values=jagged_dim_max_values,
+        offsets_type=jagged_int_var.offsets_type(),
+    )
+
+
+@registry.reg("cuda.make_jagged.func_decl")
+def make_jagged_gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    offsets_list = func_attrs["inputs"][1:]
+
+    output = func_attrs["outputs"][0]
+    jagged_int_var = output._attrs["shape"][0]
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        num_offsets=len(offsets_list),
+        offsets_struct_type=offsets_struct_type,
+    )
+
+
+@registry.reg("cuda.make_jagged.func_call")
+def make_jagged_gen_function_call(func_attrs, indent="  "):
+    func_name = func_attrs["name"]
+    source = func_attrs["inputs"][0]
+    offsets_list = func_attrs["inputs"][1:]
+    output = func_attrs["outputs"][0]
+    jagged_int_var = output._attrs["shape"][0]
+
+    offsets_first_dim_names = [
+        offsets._attrs["shape"][0]._attrs["name"] for offsets in offsets_list
+    ]
+    offsets_data_names = [offsets._attrs["name"] for offsets in offsets_list]
+    batch_dim_name = jagged_int_var.batch_dim()._attrs["name"]
+    source_first_dim_name = source._attrs["shape"][0]._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        indent="      ",
+        func_name=func_name,
+        num_offsets=len(offsets_list),
+        offsets_var_name=jagged_int_var.offsets_var_name(),
+        offsets_first_dim_names=offsets_first_dim_names,
+        offsets_data_names=offsets_data_names,
+        batch_dim_name=batch_dim_name,
+        source_first_dim_name=source_first_dim_name,
+    )
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index a1299b801..366bc411d 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -29,6 +29,7 @@
 #include "raii_wrapper.h"
 #include "model.h"
 #include "macros.h"
+#include "jagged.h"
 #include <algorithm>
 #include <deque>
 #include <fstream>
@@ -179,6 +180,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
   private:
 {{ tensor_decl }}
 {{ dim_decl }}
+{{ jagged_decl }}
 {{ function_state }}
 };
 } // namespace ait
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index d0c388497..95105783e 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -22,7 +22,7 @@
 from enum import Enum
 from functools import reduce
 from pprint import pformat
-from typing import Any, Dict, List, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union
 
 import numpy as np
 
@@ -206,6 +206,228 @@ def pseudo_code(self, with_shape=False) -> str:
         return str(self.value())
 
 
+class JaggedDim(Node):
+    """
+    A class representing a single jagged dimension encoded within a JaggedIntVar.
+    Each instance contains the min and max value for the variable-length jagged
+    dimension. It is also associated with the rank-1 offsets Tensor representing
+    the layout of the jagged dimension within the JaggedIntVar. The offsets are
+    associated with the JaggedDim instances after creation, while creating
+    a jagged tensor with the make_jagged op.
+
+    See the docstring of the JaggedIntVar class for details.
+    """
+
+    def __init__(
+        self,
+        min_value: int,
+        max_value: int,
+    ):
+        """Initializes a JaggedDim.
+
+        Parameters
+        ----------
+        min_value : int
+            Minimum possible value of the jagged dimension.
+        max_value : int
+            Maximum possible value of the jagged dimension.
+        """
+        if min_value < 0:
+            raise ValueError(f"{min_value=}, but must be non-negative.")
+        if min_value > max_value:
+            raise ValueError(f"{min_value=} can't be larger than {max_value=}.")
+
+        super().__init__()
+
+        self._attrs["values"] = [min_value, max_value]
+        self._attrs["offsets"] = None
+
+    def __eq__(self, another: JaggedDim) -> bool:
+        return (
+            isinstance(another, JaggedDim)
+            and self.min_value() == another.min_value()
+            and self.max_value() == another.max_value()
+            and self.offsets() == another.offsets()
+        )
+
+    def __str__(self) -> str:
+        attrs = dict(self._attrs)
+        if self._attrs["offsets"] is not None:
+            attrs["offsets"] = {"name": self._attrs["offsets"]._attrs["name"]}
+        return str(attrs)
+
+    def min_value(self) -> int:
+        """The minimum possible value of the JaggedDim."""
+        return self._attrs["values"][0]
+
+    def max_value(self) -> int:
+        """The maximum possible value of the JaggedDim."""
+        return self._attrs["values"][1]
+
+    def offsets(self) -> Optional[Tensor]:
+        """The rank-1 offsets Tensor associated with the JaggedDim"""
+        return self._attrs["offsets"]
+
+    def pseudo_code(self, with_shape=False) -> str:
+        return f"JaggedDim({str(self._attrs['values'])})"
+
+
+class JaggedIntVar(IntVar):
+    """
+    JaggedIntVar is a specific case of IntVar that encodes one or more jagged
+    dimensions within itself. JaggedIntVar is used as the first dimension in
+    jagged Tensors' shape (this is, basically, what makes a Tensor jagged).
+    E.g., a JaggedIntVar with a single JaggedDim represents a single dynamic
+    dimension encoding a batch of variable sequence length. For the batch
+    size of B, in some sources this is indicated as sum_B(N_B): the sum of
+    individual sequence lengths: N_1, N_2, ..., N_B of B sequences. This sum
+    is represented as a single dynamic dimension: total_length, with B being
+    defined by the batch_dim.
+
+    Because JaggedIntVar is an IntVar, it can be treated so by the AIT ops
+    that are unaware of the jagged Tensor semantics. But the ops that are
+    aware can interpet the JaggedIntVar as the first dimension of the jagged
+    Tensor by specifically processing the underlying batch_dim and jagged_dims.
+
+    If there is more than one JaggedDim in a JaggedIntVar, those jagged dimensions
+    are nested within the single dynamic dimension. E.g., if there are two JaggedDims,
+    the JaggedIntVar represents a batch of B (batch_dim) variable-length sequences,
+    each in turn consisting of variable-length sequences. In principle, the nesting
+    can be arbitrarily deep, but in practice it's usually just a single JaggedDim.
+
+    JaggedIntVar should not be created directly. Please use the make_jagged op
+    for creating a jagged Tensor from a normal Tensor, the offsets, and the
+    metadata (like batch_dim and jagged_dims). The make_jagged op creates the
+    corresponding JaggedIntVar under the hood.
+    """
+
+    def __init__(
+        self,
+        total_length: IntVar,
+        batch_dim: IntVar,
+        jagged_dims: List[JaggedDim],
+    ):
+        """Initializes a JaggedIntVar.
+
+        Parameters
+        ----------
+        total_length : IntVar
+            The existing IntVar defining the total length sum_B(N_B) of the
+            JaggedIntVar. The "name" and "values" attributes of the JaggedIntVar
+            are the same as those of the total_length. This allows transparent
+            treatment of the jagged Tensor as dense by non-jagged-aware ops.
+            Must be a dynamic dim (IntVar, not IntImm).
+        batch_dim : IntVar
+            The batch dimension B in the sum_B(N_B) representation of the
+            JaggedIntVar. Specifies the number of (outermost) variable-length
+            sequences encoded within the JaggedIntVar. Must be a dynamic dim
+            (IntVar, not IntImm).
+        jagged_dims : List[JaggedDim]
+            One or more jagged dimension encoded in the JaggedIntVar. Each
+            JaggedDim specifies the bounds of one level of nested jaggedness
+            of the JaggedIntVar. See the class docstring for details.
+            The list must contain at least one JaggedDim. All JaggedDims
+            in the list must have their offsets already set to the
+            corresponding rank-1 Tensors.
+        """
+        if total_length is None or type(total_length) != IntVar:
+            raise TypeError(
+                "total_length must be dynamic (IntVar), "
+                f"but given {type(total_length).__name__}."
+            )
+        if batch_dim is None or type(batch_dim) != IntVar:
+            raise TypeError(
+                "batch_dim must be dynamic (IntVar), "
+                f"but given {type(batch_dim).__name__}."
+            )
+        if not jagged_dims or not all(
+            isinstance(dim, JaggedDim) for dim in jagged_dims
+        ):
+            raise TypeError(
+                "jagged_dims must be a non-empty list of JaggedDims, "
+                f"but given {jagged_dims}."
+            )
+        offsets_types = set()
+        for i, dim in enumerate(jagged_dims):
+            if dim.offsets() is None:
+                raise ValueError(
+                    f"JaggedDim {i} in the jagged_dims list has no associated offsets. "
+                    "This probably means that the JaggedIntVar is instantiated directly. "
+                    "Instead, jagged Tensor must be created by calling the make_jagged op."
+                )
+            else:
+                offsets_type = dim.offsets()._attrs["dtype"]
+                if offsets_type not in ["int32", "int64"]:
+                    raise TypeError(
+                        "The offsets Tensors can be either int32 or int64, "
+                        f"but given the Tensor of type {offsets_type}."
+                    )
+                offsets_types.add(offsets_type)
+        if len(offsets_types) > 1:
+            raise TypeError(
+                "All offsets Tensors must be of the same type,"
+                f" but given the Tensors of different types: {offsets_types}."
+            )
+
+        super().__init__(
+            values=total_length._attrs["values"],
+            name=total_length._attrs["name"],
+        )
+
+        self._attrs["batch_dim"] = batch_dim
+        self._attrs["jagged_dims"] = jagged_dims
+        self._attrs["offsets_type"] = f"{offsets_types.pop()}_t"
+        self._total_length = total_length
+
+    def __eq__(self, another: JaggedIntVar) -> bool:
+        return (
+            isinstance(another, JaggedIntVar)
+            and self.total_length() == another.total_length()
+            and self.batch_dim() == another.batch_dim()
+            and self.jagged_dims() == another.jagged_dims()
+        )
+
+    def total_length(self) -> IntVar:
+        """The total_length dimension the JaggedIntVar is based on."""
+        return self._total_length
+
+    def batch_dim(self) -> IntVar:
+        """The batch_dim of the JaggedIntVar."""
+        return self._attrs["batch_dim"]
+
+    def jagged_dims(self) -> List[JaggedDim]:
+        """The jagged_dims of the JaggedIntVar."""
+        return self._attrs["jagged_dims"]
+
+    def offsets_type(self) -> str:
+        """The type of the offsets of the JaggedIntVar's jagged_dims."""
+        return self._attrs["offsets_type"]
+
+    def offsets_var_name(self) -> str:
+        """The name of the offsets struct variable in runtime."""
+        name = self._attrs["name"]
+        if name is None:
+            raise RuntimeError("The JaggedIntVar is not named yet")
+        return f"{name}_jagged_offsets"
+
+    def offsets_struct_type(self) -> str:
+        """The type of the offsets struct variable used in runtime."""
+        num_jagged_dims = len(self.jagged_dims())
+        return f"ait::JaggedOffsets<{self.offsets_type()}, {num_jagged_dims}>"
+
+    def get_max_dense_shape(self) -> List[IntVar]:
+        """
+        Returns a list of IntVars representing the maximum dense shape
+        (rectangular volume) that the JaggedIntVar can correspond to.
+        The result has the batch_dim as the first item and the IntImm
+        with the max_value of each JaggedDim that follows.
+        """
+        result = [self.batch_dim()]
+        for dim in self.jagged_dims():
+            result.append(IntImm(dim.max_value()))
+        return result
+
+
 def get_aligned_size(shape: List[IntVar], dtype: str, alignment: int = 64) -> int:
     """Returns aligned size (in bytes) of given shape and dtype.
 
@@ -496,6 +718,12 @@ def is_a_const_num(self) -> bool:
         """Returns whether this Tensor represents a constant number."""
         return len(self._attrs["shape"]) == 0 and self._attrs["value"] is not None
 
+    def is_jagged(self) -> bool:
+        """Whether the Tensor is jagged (the first dim is JaggedIntVar)."""
+        return len(self._attrs["shape"]) > 0 and isinstance(
+            self._attrs["shape"][0], JaggedIntVar
+        )
+
     def size_bytes(self, alignment: int = 1) -> int:
         """Returns acutal size (in bytes) of this Tensor."""
         return get_aligned_size(self._attrs["shape"], self.dtype(), alignment)
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index f61394e43..b47fa6ff5 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -25,7 +25,15 @@
 
 from aitemplate import backend
 from aitemplate.backend import registry
-from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.compiler.base import (
+    IntImm,
+    IntVar,
+    IntVarTensor,
+    JaggedDim,
+    JaggedIntVar,
+    Operator,
+    Tensor,
+)
 from aitemplate.utils.shape_utils import convert_shape_to_IntVar
 
 from ....utils.tensor_utils import wrap_dim
@@ -548,3 +556,170 @@ def _infer_shapes(self, x: Tensor) -> List[IntVar]:
 
         self._attrs["out_dim_to_in"] = out_dim_to_in
         return y_shapes
+
+
+class make_jagged(_view):
+    """
+    Creates a jagged Tensor from a normal Tensor, offsets, and metadata.
+
+    Jagged Tensors are normal Tensors with the first dynamic dimensions
+    represented with a JaggedIntVar instance (as opposed to a vanilla
+    IntVar). The purpose of this op is to take a normal AIT Tensor "source"
+    that contains the jagged Tensor's data and return a jagged Tensor with
+    the same data as source (with the is_view_of attribute set to source)
+    and the first dimension set to a JaggedIntVar. The jagged Tensor resulting
+    from this op can then be treated as jagged by other ops aware of the
+    jagged Tensor semantics (e.g., elementwise). Importantly, the source
+    Tensor is not sufficient for that, as it doesn't carry the necessary
+    jagged Tensor metadata (which the jagged Tensor does, in the first
+    JaggedIntVar dimension of its shape).
+
+    *Important*: this op is the only right way to create a jagged Tensor.
+    The reason is that the offsets Tensors passed to this op get registered
+    in the graph and, as a result, can't be optimized out. This wouldn't
+    be the case if the jagged Tensor would be "constructed" manually.
+
+    See the docstring of the JaggedIntVar class for more details on the
+    jagged Tensor semantics and representation.
+
+    In the backend, the purpose of the make_jagged op is to setup the
+    unified offsets representation for the jagged Tensor and to check
+    the contents of the rank-1 offsets Tensors for consistency.
+
+    __init__ Args:
+        batch_dim : IntVar
+            The batch dimension of the jagged Tensor.
+            Importantly, this is different from the first dimension of the
+            soruce Tensor, as it logically represents the number of variable-
+            length sequences encoded by the JaggedIntVar. I.e., the batch_dim
+            is B in the sum_B(N_B) representation of the JaggedIntVar.
+        jagged_dims : List[JaggedDim]
+            The list of jagged dimensions encoded in the JaggedIntVar of the
+            resulting jagged Tensor. See the JaggedDim and JaggedIntVar class
+            docstrings for the details.
+
+    __call__ Args:
+        source : Tensor
+            The source Tensor of the jagged Tensor created by this op.
+            The jagged Tensor is a view of the source Tensor. The main
+            difference is that the resulting jagged Tensor's first
+            dimension is set to a JaggedIntVar, constructed from the
+            batch_dim, jagged_dims, and the offsets_list.
+        offsets_list : List[Tensor]
+            The list of rank-1 offsets Tensors describing the variable-length
+            layout of each of the jagged_dims. There must be exactly as many
+            offsets Tensors in the offsets_list as there are JaggedDims in
+            the jagged_dims list. Each offsets Tensor is associated with the
+            corresponding JaggedDim before constructing a JaggedIntVar from
+            them for the resulting jagged Tensor.
+    """
+
+    def __init__(
+        self,
+        batch_dim: IntVar,
+        jagged_dims: List[JaggedDim],
+    ) -> None:
+        if type(batch_dim) != IntVar:
+            raise TypeError(
+                "batch_dim must be dynamic (IntVar), "
+                f"but given {type(batch_dim).__name__}."
+            )
+        if not jagged_dims or not all(
+            isinstance(dim, JaggedDim) for dim in jagged_dims
+        ):
+            raise TypeError(
+                "jagged_dim must be a non-empty list of JaggedDims, "
+                f"but given {jagged_dims}."
+            )
+
+        super().__init__()
+
+        self._attrs["op"] = "make_jagged"
+        self._attrs["batch_dim"] = batch_dim
+        self._attrs["jagged_dims"] = list(jagged_dims)
+
+    def _set_jagged_dim_offsets(self, offsets_list: List[Tensor]):
+        jagged_dims = self._attrs["jagged_dims"]
+        for i, (jagged_dim, offsets) in enumerate(zip(jagged_dims, offsets_list)):
+            if jagged_dim.offsets() is not None:
+                if jagged_dim.offsets() == offsets:
+                    continue
+                else:
+                    raise ValueError(
+                        f"JaggedDim {i} in the jagged_dims already has associated "
+                        "offsets != the offsets passed to the make_jagged.__call__."
+                    )
+            jagged_dim._attrs["offsets"] = offsets
+
+    def _infer_shapes(self, source: Tensor) -> List[IntVar]:
+        jagged_int_var = JaggedIntVar(
+            batch_dim=self._attrs["batch_dim"],
+            jagged_dims=self._attrs["jagged_dims"],
+            total_length=source._attrs["shape"][0],
+        )
+
+        return [jagged_int_var] + source._attrs["shape"][1:]
+
+    def __call__(self, source: Tensor, offsets_list: List[Tensor]) -> Tensor:
+        jagged_dims = self._attrs["jagged_dims"]
+        if len(offsets_list) != len(jagged_dims):
+            raise ValueError(
+                f"{len(offsets_list)=} must be equal to {len(jagged_dims)=}"
+            )
+        for offsets in offsets_list:
+            if len(offsets._attrs["shape"]) != 1:
+                raise ValueError(
+                    "The offsets Tensors must be rank-1, "
+                    f"but given shape {offsets._attrs['shape']}."
+                )
+            if offsets._attrs["dtype"] not in ["int32", "int64"]:
+                raise TypeError(
+                    "The offsets Tensors can be either int32 or int64, "
+                    f"but given the Tensor of type {offsets._attrs['dtype']}."
+                )
+        if len(source._attrs["shape"]) == 0:
+            raise ValueError(
+                "The source Tensor must be at least rank-1, but given rank-0."
+            )
+        if type(source._attrs["shape"][0]) != IntVar:
+            raise ValueError(
+                "The source Tensor's first dim (total_length) must be dynamic (IntVar), "
+                f"but given {type(source._attrs['shape'][0]).__name__}."
+            )
+
+        self._attrs["inputs"] = [source, *offsets_list]
+        self._set_depth()
+        self._set_jagged_dim_offsets(offsets_list)
+        output_shape = self._infer_shapes(source)
+        output = Tensor(
+            shape=output_shape,
+            src_ops={self},
+            is_view_of=source,
+        )
+        self._attrs["outputs"] = [output]
+
+        return output
+
+    def _get_op_attributes(self):
+        return {
+            "batch_dim": self._attrs["batch_dim"],
+            "jagged_dims": self._attrs["jagged_dims"],
+        }
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
+
+    def _args_for_pseudo_code(self):
+        batch_dim = self._attrs["batch_dim"].pseudo_code()
+        jagged_dims = ", ".join(
+            [dim.pseudo_code() for dim in self._attrs["jagged_dims"]]
+        )
+        return [
+            f"batch_dim={batch_dim}",
+            f"jagged_dims={jagged_dims}",
+        ]
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index b67b61d64..2d7f09648 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,7 @@
 import re
 from typing import List
 
-from ..base import IntImm, IntVarTensor, Tensor
+from ..base import IntImm, IntVarTensor, JaggedIntVar, Tensor
 
 # pylint: disable=C0103
 
@@ -95,6 +95,36 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
 
         tensor_name = node._attrs["name"]
         for i, dim in enumerate(node._attrs["shape"]):
-            if dim._attrs["name"] is None:
+            if dim._attrs["name"] is None and not isinstance(dim, JaggedIntVar):
                 dim_name = "{tname}_dim_{idx}".format(tname=tensor_name, idx=i)
                 dim._attrs["name"] = dim_name
+
+    dim_names_in_shapes = set()
+    for tensor in sorted_graph:
+        for dim in tensor._attrs["shape"]:
+            dim_names_in_shapes.add(dim._attrs["name"])
+
+    for tensor in sorted_graph:
+        if tensor.is_jagged():
+            jagged_int_var = tensor._attrs["shape"][0]
+            # JaggedIntVar's name must be the same as the name of the total_length IntVar
+            # that it is based on. Due to the fact that IntVar's _attrs["name"] is accessed
+            # directly throughout the code, we can't enforce this constrain by overloading
+            # the name in the JaggedIntVar class. as a result, we must resort to a hack here
+            # to reset the name of the JaggedIntVar to the name of the total_length after
+            # the latter might have been changed (e.g., from None) by the code above.
+            # TODO: wrap _attrs["name"] (and other frequently used _attrs members) in
+            # @properties and override the "name" property in the JaggedIntVar to return
+            # total_length().name.
+            jagged_int_var._attrs["name"] = jagged_int_var.total_length()._attrs["name"]
+
+            batch_dim = jagged_int_var.batch_dim()
+            if batch_dim._attrs["name"] not in dim_names_in_shapes:
+                # The batch_dim set inside the jagged_int_var is not present in any other
+                # Tensor's shape directly. We mark it as isolated batch dim here to set
+                # the dim to "offsets.length[0] - 1" in the make_jagged backend code.
+                batch_dim._attrs["isolated"] = True
+                if batch_dim._attrs["name"] is None:
+                    # the batch_dim wasn't named above, so we name it here
+                    jagged_int_var_name = jagged_int_var._attrs["name"]
+                    batch_dim._attrs["name"] = f"{jagged_int_var_name}_jagged_batch_dim"
diff --git a/static/include/jagged.h b/static/include/jagged.h
new file mode 100644
index 000000000..99e94f467
--- /dev/null
+++ b/static/include/jagged.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#pragma once
+
+namespace ait {
+
+// This structure is used to pack the offset metadata related to a
+// jagged Tensor's first dimension: JaggedIntVar. The offsets are not
+// available in compile time, as they are coming in a rank-1 Tensor.
+// In runtime, the members of the structure are set by the make_jagged
+// op's back-end, from the corresponding rank-1 offset Tensors' length
+// and data. The OFFSET_TYPE can be either int32 or int64. The number
+// of offset arrays is known in compile time, hence specified as the
+// NUM_OFFSET_ARRAYS template argument here.
+template <typename OFFSET_TYPE, int32_t NUM_OFFSET_ARRAYS>
+struct JaggedOffsets {
+  // the lengths the individual offset arrays
+  int64_t lengths[NUM_OFFSET_ARRAYS]{0};
+  // the data in each of the offset arrays
+  // (i.e., the offsets of the JaggedIntVar)
+  const OFFSET_TYPE* data[NUM_OFFSET_ARRAYS]{nullptr};
+};
+
+} // namespace ait
diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
new file mode 100644
index 000000000..63dd7a148
--- /dev/null
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -0,0 +1,115 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import JaggedDim, JaggedIntVar
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class MakeJaggedTestCase(unittest.TestCase):
+    def test_make_jagged(
+        self,
+    ):
+        offsets1 = Tensor(
+            shape=[
+                IntVar(values=[1, 16]),
+            ],
+            name="off1",
+            dtype="int32",
+            is_input=True,
+        )
+        offsets2 = Tensor(
+            shape=[
+                IntVar(values=[1, 16]),
+            ],
+            name="off2",
+            dtype="int32",
+            is_input=True,
+        )
+
+        X = Tensor(
+            shape=[
+                IntVar(values=[1, 1024]),
+                IntImm(value=128),
+            ],
+            name="X",
+            dtype="float16",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[
+                IntImm(value=128),
+                IntImm(value=64),
+            ],
+            name="W",
+            dtype="float16",
+            is_input=True,
+        )
+
+        batch_dim = IntVar(values=[1, 128])
+        jd0 = JaggedDim(min_value=0, max_value=10)
+        jd1 = JaggedDim(min_value=0, max_value=15)
+        Y = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[jd0, jd1],
+        )(X, [offsets1, offsets2])
+        Z = ops.gemm_rrr()(Y, W)
+
+        assert Y.is_jagged()
+        assert Z.is_jagged()
+
+        Y_dim_0 = Y._attrs["shape"][0]
+        assert isinstance(Y_dim_0, JaggedIntVar)
+        assert Y_dim_0.jagged_dims() == [jd0, jd1]
+        assert jd0.offsets() == offsets1
+        assert jd1.offsets() == offsets2
+
+        Z_dim_0 = Z._attrs["shape"][0]
+        assert Z_dim_0 == Y_dim_0
+
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+        Z._attrs["name"] = "Z"
+        Z._attrs["is_output"] = True
+
+        model = compile_model([Y, Z], detect_target(), "./tmp", "test_make_jagged")
+
+        offsets1_pt = torch.tensor([0, 1, 3, 5], dtype=torch.int32).cuda()
+        offsets2_pt = torch.tensor([0, 2, 4, 4, 9, 10], dtype=torch.int32).cuda()
+        x_pt = get_random_torch_tensor([10, 128], "float16")
+        w_pt = get_random_torch_tensor([128, 64], "float16")
+        z_pt = torch.matmul(x_pt, w_pt)
+
+        y = get_torch_empty_tensor([10, 128], "float16")
+        z = get_torch_empty_tensor([10, 64], "float16")
+
+        inputs = {"X": x_pt, "off1": offsets1_pt, "off2": offsets2_pt, "W": w_pt}
+        model.run_with_tensors(inputs, [y, z])
+
+        torch.testing.assert_close(y, x_pt)
+        torch.testing.assert_close(z, z_pt)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 00bbc3462d6f31bc600d51d81400d62b4a5ea9b6 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 24 Feb 2023 00:58:09 -0800
Subject: [PATCH 179/638] temp solution to set correct dynamic batch size for
 jagged tensor (#323)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/323

as titled
Use special value as tag value to
1) tag the tensor that needs change
2) change the value to max_batch * max_seq

Reviewed By: frankgt40

Differential Revision: D43515878

fbshipit-source-id: 55d95ffa096d9de7952a6a1c4628efd67e554d82
---
 fx2ait/fx2ait/tensor_spec.py | 52 ++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index 5aa4f1665..5d349aaa9 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -12,11 +12,14 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 from typing import Any, List
 
 import torch
 from aitemplate.compiler.public import IntImm, IntVar
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 class TensorSpec:
     def __init__(self, shape: List[IntVar], dtype: torch.dtype) -> None:
@@ -188,6 +191,55 @@ def from_input_list_with_batch_size(
 
         return result
 
+    @classmethod
+    def from_input_list_with_batch_size_jagged_tensor(
+        cls,
+        inputs: List[torch.Tensor],
+        max_batch_size: int,
+        max_batch_size_jagged_tensor: int,
+        tag_val=None,
+    ) -> List["TensorSpec"]:
+        """
+        Most of the recommendation models will work fine using this function.
+
+        We make an assumption that inferred lowerable subgraph inputs will have
+        a single batch dimension with the same max batch size.
+        """
+        result: List = []
+        result_unsorted: List = []
+        left_inputs: List = []
+        left_inputs_ind: List = []
+        for ind, t in enumerate(inputs):
+            if t.shape[0] == tag_val:
+                shape: List[IntVar] = []
+                for i, d in enumerate(t.shape):
+                    if i == 0:
+                        shape.append(
+                            IntVar(
+                                [1, max_batch_size_jagged_tensor],
+                                "batch_size_jagged_tensor",
+                            )
+                        )
+                    else:
+                        shape.append(IntImm(d))
+                result_unsorted.append((ind, TensorSpec(shape, t.dtype)))
+            else:
+                left_inputs.append(t)
+                left_inputs_ind.append(ind)
+
+        bs_dim = cls.find_batch_size_dim(left_inputs)
+        for index, t in enumerate(left_inputs):
+            shape: List[IntVar] = []
+            for i, d in enumerate(t.shape):
+                if i == bs_dim[index]:
+                    shape.append(IntVar([1, max_batch_size], "batch_size"))
+                else:
+                    shape.append(IntImm(d))
+            result_unsorted.append((left_inputs_ind[index], TensorSpec(shape, t.dtype)))
+        result = sorted(result_unsorted, key=lambda num: num[0])
+        result = [r[1] for r in result]
+        return result
+
     @classmethod
     # pyre-ignore [2]: Parameter `sample_input` must have a type other than `Any`
     def find_batch_size_dim(cls, inputs: Any) -> []:

From ba4e326e7e9e3e7676b09affaaeae72eab634a76 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Sat, 25 Feb 2023 21:21:06 -0800
Subject: [PATCH 180/638] Fix flaky ATen reshape (#331)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/331

Torch dynamo did some udpates.
It use to trace `x.reshape(x.size(0), -1)` as `x.size()/x.size(0)` for the `-1` dimension, but now `-1` is simply represented as `-1`, so aten2ait have to deduce the correct representation for AIT. (Because AIT cannot deduce more than 1 dynamic shape during reshape op)

As a result, Aten2ait's reshape op on  becomes flaky after pytorch updates.  https://www.internalfb.com/intern/testinfra/diagnostics/562950242769806.562950040185593.1677314979/

This diff implemented the fix described above.

Reviewed By: wushirong

Differential Revision: D43596790

fbshipit-source-id: 9fb9d714816d4e34a5dc2a4e39fd6f5862004a74
---
 .../fx2ait/converters/aten2ait_converters.py  | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 12f9ada77..5fd96a7a5 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -745,8 +745,29 @@ def aten_ops_reshape(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Unexpected input for {name}: {input_val}")
     shape = args[1]
-
-    return reshape()(input_val, shape)
+    new_shape = []
+    for s in shape:
+        if isinstance(s, IntVarTensor) or s == -1:
+            new_shape.append(s)
+        elif isinstance(s, int):
+            new_shape.append(IntVarTensor(IntImm(s)))
+        else:
+            raise RuntimeError(f"Unexpected shape type for {name}: {s} in {shape}")
+
+    if new_shape.count(-1):
+        assert new_shape.count(-1) == 1
+        input_shape = size()(input_val)
+        unkown_dim = input_shape[0]
+        for i in range(1, len(input_shape)):
+            unkown_dim = unkown_dim * input_shape[i]
+        idx = new_shape.index(-1)
+
+        for s in new_shape:
+            if s != -1:
+                unkown_dim = unkown_dim / s
+        new_shape[idx] = unkown_dim
+
+    return reshape()(input_val, new_shape)
 
 
 @ait_converter(torch.ops.aten.sym_size)

From b983110b68bbf5754dadd5cb3d6cbb27763fa8fe Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Sun, 26 Feb 2023 09:36:31 -0800
Subject: [PATCH 181/638] Pass fixing and refactoring (#333)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/333

1. fix some lib related path issues
2. add composition test for bmm
3. adopt the customize the `replace_pattern` from fx. The considerations are 1) simply some cases where we need to do pattern replacement. For ex., there are some MHA implementations which we want to recognize them and map to our MHA converter. 2) avoid sym_size comparison. For ex. we hope these two patterns are the same. One pattern is
`a = view(b, [sym_size_1, sym_size_2]) `
while the other pattern is
`a = view(b, [sym_size_1, 60]) `
5. getitem(slice) changed the trace behavior compared with our test in Dec.. It has more than 1 user in internmediate slice.Tensor. For ex. slice_5 has 2 users while it used to have 1.
```
slice_5 = torch.ops.aten.slice.Tensor(permute_pooled_embs_auto_grad, 0, 0, 9223372036854775807);  permute_pooled_embs_auto_grad = None
slice_6 = torch.ops.aten.slice.Tensor(slice_5, 1, 188, 2636)
sym_size = torch.ops.aten.sym_size(slice_5, 0);
```
After this diff,
```
aten_compose_getitem_slice_2 = fx2ait_passes_lower_basic_pass_aten_aten_compose_getitem_slice(permute_pooled_embs_auto_grad, [(0, 0, 9223372036854775807), (1, 188, 2636)]);
sym_size = torch.ops.aten.sym_size(aten_compose_getitem_slice_2, 0);
```

Reviewed By: wushirong, tissue3

Differential Revision: D43016248

fbshipit-source-id: 3c0de021fb7d35c4abf22749063965dce337aff3
---
 .../fx2ait/converters/aten2ait_converters.py  |   2 +
 fx2ait/fx2ait/passes/lower_basic_pass_aten.py | 212 ++++++--
 .../test_ait_batch_norm_aten.py               |  29 +-
 .../converters_aten/test_ait_chunk_aten.py    |   7 +-
 .../converters_aten/test_ait_linear_aten.py   |  18 +-
 .../converters_aten/test_ait_matmul_aten.py   |  16 +-
 .../test_ait_slice_tensor_aten.py             |  16 +-
 .../test_ait_unary_ops_aten.py                |   4 +-
 fx2ait/fx2ait/tools/ait_subgraph_rewriter.py  | 481 ++++++++++++++++++
 fx2ait/fx2ait/tools/common_aten2ait.py        |  17 +-
 10 files changed, 719 insertions(+), 83 deletions(-)
 create mode 100644 fx2ait/fx2ait/tools/ait_subgraph_rewriter.py

diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 5fd96a7a5..aef1c91de 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -67,6 +67,7 @@
     aten_compose_bmm_3d,
     aten_compose_chunk,
     aten_compose_getitem_slice,
+    aten_compose_mm_2d,
     aten_operator_getitem,
 )
 from torch.fx.node import Argument, Target
@@ -555,6 +556,7 @@ def aten_ops_max_pool2d(
     return max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(input_val)
 
 
+@ait_converter(aten_compose_mm_2d)
 @ait_converter(aten_compose_bmm_3d)
 @ait_converter(aten_compose_bmm_2d)
 @ait_converter(torch.ops.aten.addmm.default)
diff --git a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
index f49cb7d5e..be34c513d 100644
--- a/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
+++ b/fx2ait/fx2ait/passes/lower_basic_pass_aten.py
@@ -18,7 +18,8 @@
 
 import torch
 import torch.fx
-from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.ait_subgraph_rewriter import replace_pattern
+
 from torch.fx.experimental.const_fold import split_const_subgraphs
 from torch.fx.passes.infra.pass_base import PassResult
 from torch.fx.passes.shape_prop import TensorMetadata
@@ -29,6 +30,34 @@
 # throughout the file.
 Input = Any
 
+from fx2ait.acc_tracer import acc_ops
+from torch.fx import symbolic_trace
+
+
+def replacement_pattern_abstract(replacement):
+    """
+    Replace the pattern graph by a node of call_function of this `replacement`
+    """
+    traced = symbolic_trace(replacement)
+    replacement_placeholders = [
+        node for node in traced.graph.nodes if node.op == "placeholder"
+    ]
+    for n in traced.graph.nodes:
+        if n.op == "output":
+            before_output = n.all_input_nodes[0]
+            with traced.graph.inserting_after(before_output):
+                new_args = tuple(replacement_placeholders)
+                new_node = traced.graph.create_node(
+                    "call_function",
+                    replacement,
+                    args=new_args,
+                    kwargs=None,
+                )
+                before_output.replace_all_uses_with(new_node)
+    traced.graph.eliminate_dead_code()
+    traced.recompile()
+    return traced
+
 
 def run_const_fold(traced_mod: torch.fx.GraphModule) -> torch.fx.GraphModule:
     # Now we do constant folding on traced module.
@@ -99,6 +128,7 @@ def nchw2nhwc_pass(
     return PassResult(module, modified)
 
 
+# TODO: delete in future
 def replace_inplace_ops(
     module: torch.fx.GraphModule,
 ) -> torch.fx.GraphModule:
@@ -213,6 +243,49 @@ def replace_transpose_mm_op_with_linear(
     return PassResult(module, modified)
 
 
+def replace_batch_norm(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    Current exir.capture enable_aot and it captures bwd needed nodes in output P619801318
+    This pass removes those unused node and replace with classic aten.batch_norm
+    """
+    batch_node_list = []
+    for n in module.graph.nodes:
+        if n.target == torch.ops.aten._native_batch_norm_legit_functional.default:
+            batch_node_list.append(n)
+        if n.target == "output":
+            output_node = n
+
+    if len(batch_node_list) > 0:
+        modified = True
+    else:
+        modified = False
+    for n in batch_node_list:
+        new_op = torch.ops.aten.batch_norm
+        new_args = list(n.args)
+        new_args.append(False)
+        new_args = tuple(new_args)
+        user_list = [x for x in n.users]
+        user_list_copy_node = []
+        user_list_copy_node.append(next(iter(user_list[1].users)))
+        user_list_copy_node.append(next(iter(user_list[2].users)))
+        getitem_node = user_list[0]
+        with module.graph.inserting_after(getitem_node):
+            new_node = module.graph.create_node(
+                "call_function",
+                new_op,
+                args=new_args,
+                kwargs=n.kwargs,
+            )
+            getitem_node.replace_all_uses_with(new_node)
+
+        output_args = output_node.args[0]
+        new_output_args = [x for x in output_args if x not in user_list_copy_node]
+        output_node.args = (new_output_args,)
+        module.graph.eliminate_dead_code()
+        module.recompile()
+    return PassResult(module, modified)
+
+
 def replace_aten_op_with_indices(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
     modified = False
     for n in module.graph.nodes:
@@ -416,17 +489,35 @@ def match_pattern(module, node):
         if node.op == "call_function" and node.target == torch.ops.aten.slice.Tensor:
             holder = []
             holder.append(node)
-            while (
-                len(node.users.keys()) == 1
-                and next(iter(node.users)).target == torch.ops.aten.slice.Tensor
-                and node.args[1] + 1 == next(iter(node.users)).args[1]
-            ):
-                node = next(iter(node.users))
-                holder.append(node)
+            qualified = True
+            user_change_input = []
+
+            while qualified:
+                next_user = None
+                for user in node.users:
+                    if (
+                        user.target == torch.ops.aten.slice.Tensor
+                        and node.args[1] + 1 == user.args[1]
+                    ):
+                        next_user = user
+                    elif (
+                        user.target == torch.ops.aten.sym_size
+                        and user.args[1] == node.args[1]
+                    ):
+                        user_change_input.append(user)
+                    else:
+                        qualified = False
+                        break
+                if qualified and next_user:
+                    node = next_user
+                    holder.append(node)
+                else:
+                    qualified = False
+
             if len(holder) == 1:
                 return (False,)
             else:
-                return (True, holder)
+                return (True, holder, user_change_input)
         return (False,)
 
     modified = False
@@ -435,6 +526,7 @@ def match_pattern(module, node):
         if res[0]:
             modified = True
             holder = res[1]
+            user_change_input = res[2]
             input_n = holder[0].args[0]
             last_n = holder[-1]
             list_args = []
@@ -446,15 +538,32 @@ def match_pattern(module, node):
                 new_node = module.graph.create_node(
                     "call_function",
                     aten_compose_getitem_slice,
-                    args=new_args,
+                    args=tuple(new_args),
                     kwargs=None,
                 )
             last_n.replace_all_uses_with(new_node)
+            for n in user_change_input:
+                new_args = list(n.args)
+                new_args[0] = new_node
+                n.args = tuple(new_args)
+
     module.graph.eliminate_dead_code()
     module.recompile()
     return PassResult(module, modified)
 
 
+def aten_compose_mm_2d(arg0_1, arg1_1):
+    sym_size = torch.ops.aten.sym_size(arg0_1, 0)
+    sym_size_1 = torch.ops.aten.sym_size(arg0_1, 1)
+    mul = sym_size * sym_size_1
+    sym_size_2 = torch.ops.aten.sym_size(arg0_1, 2)
+    view = torch.ops.aten.view.default(arg0_1, [mul, sym_size_2])
+    mm = torch.ops.aten.mm.default(view, arg1_1)
+    sym_size_3 = torch.ops.aten.sym_size(arg1_1, 1)
+    view_1 = torch.ops.aten.view.default(mm, [sym_size, sym_size_1, sym_size_3])
+    return view_1
+
+
 def aten_compose_bmm_2d(flat_args_1, flat_args_2):
     sym_size = torch.ops.aten.sym_size(flat_args_1, 0)
     sym_size_1 = torch.ops.aten.sym_size(flat_args_1, 1)
@@ -500,48 +609,49 @@ def compose_bmm(
     combine decomposed bmm (matmul)
     """
     modified = False
-    for n in module.graph.nodes:
-        if n.op == "call_function" and n.target in (torch.ops.aten.bmm.default,):
-            modified = True
-            node = n
-            input_n = node.all_input_nodes[0]
-            other_n = node.all_input_nodes[1]
-            output = next(iter(node.users))
-            input_input_n = input_n.all_input_nodes[0]
-            if (
-                input_input_n.target != torch.ops.aten.expand.default
-                and input_n.target != torch.ops.aten.view.default
-            ):
-                raise RuntimeError(
-                    "Bmm is addressed in fixed pattern. A new pattern is met!"
-                )
-            real_input = input_input_n.all_input_nodes[0]
-            input_other_n = other_n.all_input_nodes[0]
-            if (
-                input_other_n.target != torch.ops.aten.expand.default
-                and other_n.target != torch.ops.aten.view.default
-            ):
-                raise RuntimeError(
-                    "Bmm is addressed in fixed pattern. A new pattern is met!"
-                )
-            real_other = input_other_n.all_input_nodes[0]
-            if len(real_other.meta["val"].size()) == 2:
-                new_func = aten_compose_bmm_2d
-            if len(real_other.meta["val"].size()) == 3:
-                new_func = aten_compose_bmm_3d
+    # pattern replacement for aten_compose_mm_2d
+    _LOGGER.info("compose_bmm: pattern matching for aten_compose_mm_2d...")
+    aten_compose_mm_2d_replacement = replacement_pattern_abstract(aten_compose_mm_2d)
+    res = replace_pattern(module, aten_compose_mm_2d, aten_compose_mm_2d_replacement)
+    if len(res) > 0:
+        modified = True
+    # pattern replacement for aten_compose_bmm_2d
+    _LOGGER.info("compose_bmm: pattern matching for aten_compose_bmm_3d...")
 
-            with module.graph.inserting_after(node):
-                new_args = (real_input, real_other)
-                new_node = module.graph.create_node(
-                    "call_function",
-                    new_func,
-                    args=new_args,
-                    kwargs=None,
-                )
-            output.replace_all_uses_with(new_node)
+    def match_filter_aten_compose_bmm_2d(match, original_graph, pattern_graph):
+        if len(match.placeholder_nodes[1].meta["val"].shape) == 2:
+            return True
+        else:
+            return False
+
+    aten_compose_bmm_2d_replacement = replacement_pattern_abstract(aten_compose_bmm_2d)
+    res = replace_pattern(
+        module,
+        aten_compose_bmm_2d,
+        aten_compose_bmm_2d_replacement,
+        [match_filter_aten_compose_bmm_2d],
+    )
+    if len(res) > 0:
+        modified = True
+    # pattern replacement for aten_compose_bmm_3d
+    _LOGGER.info("compose_bmm: pattern matching for aten_compose_bmm_2d...")
+
+    def match_filter_aten_compose_bmm_3d(match, original_graph, pattern_graph):
+        if len(match.placeholder_nodes[1].meta["val"].shape) == 3:
+            return True
+        else:
+            return False
+
+    aten_compose_bmm_3d_replacement = replacement_pattern_abstract(aten_compose_bmm_3d)
+    res = replace_pattern(
+        module,
+        aten_compose_bmm_3d,
+        aten_compose_bmm_3d_replacement,
+        [match_filter_aten_compose_bmm_3d],
+    )
+    if len(res) > 0:
+        modified = True
 
-    module.graph.eliminate_dead_code()
-    module.recompile()
     return PassResult(module, modified)
 
 
@@ -615,12 +725,12 @@ def match_pattern(module, node):
     return PassResult(module, modified)
 
 
+## TODO: we will remove this pass once dynamo fixed the bug
 def acc_replace_mul_ops(
     module: torch.fx.GraphModule,
 ) -> torch.fx.GraphModule:
     """
     Put constant at the end of multiplicaiton, i.e change 15*x.size(1) to x.size(1)*15.
-    TODO: we will remove this pass once dynamo fixed the bug
     """
     for n in module.graph.nodes:
         if n.op == "call_function" and n.target == acc_ops.mul:
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
index d94b066d3..6450643fb 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_batch_norm_aten.py
@@ -27,14 +27,39 @@ def __init__(self):
                 self.bn = torch.nn.BatchNorm2d(3)
 
             def forward(self, x):
-                return self.bn(x)
+                y = self.bn(x)
+                y = y.mul(1)
+                return y
 
         model = TestModule().half().cuda()
         inputs = [torch.randn(1, 3, 244, 244).cuda().half()]
         self.run_test(
             model,
             inputs,
-            expected_ops={torch.ops.aten.batch_norm},
+            expected_ops={},
+            permute_inputs=[0, 2, 3, 1],
+            permute_outputs=[0, 3, 1, 2],
+        )
+
+    def test_batch_norm_2layers(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(3)
+                self.bn2 = torch.nn.BatchNorm2d(3)
+
+            def forward(self, x):
+                y = self.bn(x)
+                y = y.mul(1)
+                y = self.bn2(y)
+                return y
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 3, 244, 244).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={},
             permute_inputs=[0, 2, 3, 1],
             permute_outputs=[0, 3, 1, 2],
         )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
index 110d6a280..3acdb6ad6 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_chunk_aten.py
@@ -14,6 +14,7 @@
 #
 import torch
 from fx2ait.fx2ait import TensorSpec
+from fx2ait.passes.lower_basic_pass_aten import aten_compose_chunk
 from fx2ait.tools.common_aten2ait import DispatchTestCase
 from parameterized import param, parameterized
 
@@ -40,7 +41,7 @@ def forward(self, x):
 
         model = TestModule().cuda().half()
         inputs = [torch.randn(shape).half().cuda()]
-        self.run_test(model, inputs, expected_ops={})
+        self.run_test(model, inputs, expected_ops={aten_compose_chunk})
 
     def test_chunk_dynamic(self):
         class TestModule(torch.nn.Module):
@@ -57,4 +58,6 @@ def forward(self, x):
             ],
         )
 
-        self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={})
+        self.run_test_with_dynamic_shape(
+            model, inputs_spec, expected_ops={aten_compose_chunk}
+        )
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
index e7fbc678f..618669f06 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_linear_aten.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 import torch
+from fx2ait.passes.lower_basic_pass_aten import aten_compose_mm_2d
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
 from parameterized import parameterized
@@ -21,22 +22,19 @@
 class TestLinearConverter(DispatchTestCase):
     @parameterized.expand(
         [
-            ("default", [1, 512], True),
-            ("matrix", [5, 512], True),
-            ("no_bias", [1, 512], False),
-            (
-                "multi_dim_matrix",
-                [4, 5, 512],
-                True,
-            ),
+            ("default", [1, 512], True, torch.ops.aten.linear),
+            ("matrix", [5, 512], True, torch.ops.aten.linear),
+            ("no_bias", [1, 512], False, torch.ops.aten.linear),
+            ("multi_dim_matrix", [4, 5, 512], True, torch.ops.aten.linear),
             (
                 "multi_dim_matrix",
                 [4, 5, 512],
                 False,
+                aten_compose_mm_2d,
             ),
         ]
     )
-    def test_linear(self, test_name, shape, bias):
+    def test_linear(self, test_name, shape, bias, expected):
         class TestModule(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -47,7 +45,7 @@ def forward(self, x):
 
         model = TestModule().cuda().half()
         inputs = [torch.randn(shape).half().cuda()]
-        self.run_test(model, inputs, expected_ops={torch.ops.aten.linear})
+        self.run_test(model, inputs, expected_ops={expected})
 
     @parameterized.expand(
         [
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
index da93658ba..e652f1a07 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 import torch
+from fx2ait.passes.lower_basic_pass_aten import aten_compose_bmm_3d, aten_compose_mm_2d
 from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_aten2ait import DispatchTestCase
 
@@ -23,14 +24,21 @@ class TestMatMulConverter(DispatchTestCase):
     @parameterized.expand(
         [
             [[2, 3], [3, 4], torch.ops.aten.mm.default],
-            [[2, 3, 4], [4, 6], torch.ops.aten.mm.default],
-            [[2, 3, 4], [2, 4, 6], torch.ops.aten.bmm.default],
+            # TODO check again in future since there is a diff about not decompose https://fburl.com/nysuuf7q
+            [
+                [2, 3, 4],
+                [4, 6],
+                aten_compose_mm_2d,
+            ],
+            [[2, 3, 4], [2, 4, 6], aten_compose_bmm_3d],
             [[2, 2, 2, 3, 4], [4, 6], torch.ops.aten.mm.default],
         ]
     )
     def test_simple(self, lhs_shape, rhs_shape, op):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                x = x.mul(1)
+                y = y.mul(1)
                 return torch.matmul(x, y)
 
         model = TestModule().cuda()
@@ -56,11 +64,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         [
             # Only M can be dynamic: https://github.com/fairinternal/AITemplate/blob/main/tests/unittest/ops/test_gemm.py
             [[[2, 3], [3, 3], [6, 6]], torch.ops.aten.mm.default],
-            [[[2, 3], [2, 3], [3, 3], [6, 6]], torch.ops.aten.mm.default],
+            [[[2, 3], [2, 3], [3, 3], [6, 6]], aten_compose_mm_2d],
             [[[1, 3], [2, 3], [6, 8], [3, 3], [6, 6]], torch.ops.aten.mm.default],
             # FIXME: batch_size cannot be dynamic because the permutation of shape change the names: P544607056
             # b, m, k, n
-            [[[2, 2], [6, 8], [3, 3], [6, 6]], torch.ops.aten.bmm.default, True],
+            [[[2, 2], [6, 8], [3, 3], [6, 6]], aten_compose_bmm_3d, True],
         ]
     )
     def test_dynamic(self, shape, op, bmm=False):
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
index 19ce54269..724d7d2c1 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_slice_tensor_aten.py
@@ -47,7 +47,7 @@ class TestSliceTensor(DispatchTestCase):
                 "slice_basic",
                 (slice(None, None, None), slice(0, 3, 1)),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -76,7 +76,7 @@ class TestSliceTensor(DispatchTestCase):
                     slice(None, None, None),
                 ),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -88,7 +88,7 @@ class TestSliceTensor(DispatchTestCase):
                     slice(None, 2, 1),
                 ),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -97,7 +97,7 @@ class TestSliceTensor(DispatchTestCase):
                 "slice_end_none",
                 (slice(None, None, None), slice(1, None, 1)),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -109,7 +109,7 @@ class TestSliceTensor(DispatchTestCase):
                     slice(0, 3, None),
                 ),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -127,7 +127,7 @@ class TestSliceTensor(DispatchTestCase):
                 "slice_neg_slice",
                 (slice(None, None, None), slice(-8, -2, 1)),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -145,7 +145,7 @@ class TestSliceTensor(DispatchTestCase):
                 "slice_multi_dim",
                 (slice(None, None, None), slice(0, 3, 1), slice(1, -1, 1)),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
@@ -172,7 +172,7 @@ class TestSliceTensor(DispatchTestCase):
                 "slice_zero_slice",
                 (slice(None, None, None), slice(None, None, None), slice(0, 0, None)),
                 {
-                    torch.ops.aten.slice.Tensor,
+                    aten_compose_getitem_slice,
                     torch.ops.aten.add.Tensor,
                 },
                 None,
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
index 09a4d2bbf..884e85312 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_unary_ops_aten.py
@@ -31,8 +31,8 @@
     (torch.sqrt, torch.ops.aten.sqrt.default),
     (
         torch.clone,
-        torch.ops.aten.clone.default,
-    ),  # clone op can not be the output directly
+        torch.ops.aten.mul.Tensor,
+    ),  # clone op can not be the output directly so expected is the op after it(aten.mul)
 ]
 
 
diff --git a/fx2ait/fx2ait/tools/ait_subgraph_rewriter.py b/fx2ait/fx2ait/tools/ait_subgraph_rewriter.py
new file mode 100644
index 000000000..e3fa593e2
--- /dev/null
+++ b/fx2ait/fx2ait/tools/ait_subgraph_rewriter.py
@@ -0,0 +1,481 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import copy
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Union
+
+import torch
+
+from torch.fx._symbolic_trace import symbolic_trace
+from torch.fx.graph import Graph
+from torch.fx.graph_module import GraphModule
+from torch.fx.node import Node
+from torch.fx.passes.utils.matcher_utils import InternalMatch, SubgraphMatcher
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class AITSubgraphMatcher(SubgraphMatcher):
+    def __init__(
+        self,
+        pattern: Graph,
+        match_output: bool = False,
+        match_placeholder: bool = False,
+        remove_overlapping_matches: bool = True,
+    ):
+        super(AITSubgraphMatcher, self).__init__(
+            pattern, match_output, match_placeholder, remove_overlapping_matches
+        )
+
+    def _match_args(self, pn: Any, gn: Any, match: InternalMatch) -> bool:
+        logger.info(f"  matching arguments: {pn} to {gn}")
+        assert not (
+            isinstance(pn, Node) and isinstance(gn, Node)
+        ), "pn and gn cannot both be Node"
+        if isinstance(pn, Node) and not isinstance(gn, Node):
+            if pn.op == "placeholder":
+                # Check if we've already matched these nodes in the current
+                # traversal
+                if pn in match.nodes_map:
+                    return match.nodes_map[pn] == gn
+
+                match.nodes_map[pn] = gn
+                return True
+            elif pn.op == "call_function" and pn.target == torch.ops.aten.sym_size:
+                return True
+            else:
+                return False
+        elif not isinstance(pn, Node) and isinstance(gn, Node):
+            return False
+        else:
+            return type(gn) == type(pn) and gn == pn
+
+    def _match_nodes(self, pn: Node, gn: Node, match: InternalMatch) -> bool:
+        logger.info(f"  matching node: {pn} to {gn}")
+        # breakpoint()
+        assert isinstance(pn, Node) and isinstance(gn, Node), str(
+            f"pn and gn must be Node, pn: {pn}, gn: {gn}"
+        )
+        if (
+            pn.target == torch.ops.aten.sym_size
+            and gn.target == torch.ops.aten.sym_size
+        ):
+            return True
+        # Check if we've already matched these nodes in the current
+        # traversal
+        if pn in match.nodes_map:
+            return match.nodes_map[pn] == gn
+
+        # TODO: use a more efficienty way to check if gn is matched before: two-way dict
+        if gn in match.nodes_map.values():
+            return False
+
+        if not self._nodes_are_equal(pn, gn):
+            return False
+
+        # Optimistically mark `pn` as a match for `gn`, and save a local copy of match
+        saved_match = copy.copy(match)
+        match.nodes_map[pn] = gn
+
+        if pn.op == "placeholder":
+            return True
+
+        # Recursively traverse upwards to check if `pn` is a true
+        # match for `gn`
+        match_found = True
+
+        def flatten_args(args) -> List[Any]:
+            # Recursively flatten args
+            result: List[Any] = []
+            for arg in args:
+                # flatten the list, if only it's a list/tuple of nodes
+                if (
+                    isinstance(arg, (list, tuple))
+                    and len(arg) > 0
+                    and isinstance(arg[0], Node)
+                ):
+                    result.extend(flatten_args(arg))
+                else:
+                    result.append(arg)
+
+            return result
+
+        pn_flatten_args = flatten_args(pn.args)
+        gn_flatten_args = flatten_args(gn.args)
+
+        if pn.kwargs.keys() == gn.kwargs.keys():
+            for key in pn.kwargs.keys():
+                pn_flatten_args.append(pn.kwargs[key])
+                gn_flatten_args.append(gn.kwargs[key])
+        else:
+            match_found = False
+
+        if match_found and len(pn_flatten_args) == len(gn_flatten_args):
+            for pn_, gn_ in zip(pn_flatten_args, gn_flatten_args):
+                if isinstance(gn_, Node) and isinstance(pn_, Node):
+                    matched = self._match_nodes(pn_, gn_, match)
+                else:
+                    matched = self._match_args(pn_, gn_, match)
+                if not matched:
+                    match_found = False
+                    break
+        else:
+            match_found = False
+
+        if not match_found:
+            # revert to saved_match before matching with current node
+            match = copy.copy(saved_match)
+            return False
+
+        return True
+
+
+__all__ = [
+    "Match",
+    "replace_pattern",
+    "ReplacedPatterns",
+]
+
+
+class Match(NamedTuple):
+    # Node from which the match was found
+    anchor: Node
+    # Maps nodes in the pattern subgraph to nodes in the larger graph
+    nodes_map: Dict[Node, Node]
+
+
+@dataclass
+class ReplacedPatterns:
+    # Node from which the match was found
+    anchor: Node
+    # Maps nodes in the pattern subgraph to nodes in the larger graph
+    nodes_map: Dict[Node, Node]
+    # List of nodes that were added into the graph
+    replacements: List[Node]
+
+
+def _replace_submodules(gm: GraphModule, replacement: torch.nn.Module) -> None:
+    gm.delete_all_unused_submodules()
+
+    if isinstance(replacement, GraphModule):
+        replacement.graph.lint()
+
+    def try_get_submodule(
+        mod: torch.nn.Module, target: str
+    ) -> Optional[torch.nn.Module]:
+        try:
+            mod_match = mod.get_submodule(target)
+            return mod_match
+        except AttributeError:
+            return None
+
+    for node in gm.graph.nodes:
+        if node.op == "call_module" or node.op == "get_attr":
+            gm_submod = try_get_submodule(gm, node.target)
+
+            replacement_submod = try_get_submodule(replacement, node.target)
+
+            # CASE 1: This target already exists as a submodule in our
+            # result GraphModule. Whether or not it exists in
+            # `replacement`, the existing submodule takes precedence.
+            if gm_submod is not None:
+                continue
+
+            # CASE 2: The target exists as a submodule in `replacement`
+            # only, so we need to copy it over.
+            elif replacement_submod is not None:
+                new_submod = copy.deepcopy(getattr(replacement, node.target))
+                gm.add_submodule(node.target, new_submod)
+
+            # CASE 3: The target doesn't exist as a submodule in `gm`
+            # or `replacement`
+            else:
+                raise RuntimeError(
+                    'Attempted to create a "',
+                    node.op,
+                    '" node during subgraph rewriting '
+                    f"with target {node.target}, but "
+                    "the referenced submodule does not "
+                    "exist in either the original "
+                    "GraphModule `gm` or the replacement"
+                    " GraphModule `replacement`",
+                )
+
+    gm.graph.lint()
+
+
+def replace_pattern(
+    gm: GraphModule,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule],
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None,  # type: ignore[name-defined]
+) -> List[Match]:
+    """
+    Matches all possible non-overlapping sets of operators and their
+    data dependencies (``pattern``) in the Graph of a GraphModule
+    (``gm``), then replaces each of these matched subgraphs with another
+    subgraph (``replacement``).
+
+    Args:
+        ``gm``: The GraphModule that wraps the Graph to operate on
+        ``pattern``: The subgraph to match in ``gm`` for replacement
+        ``replacement``: The subgraph to replace ``pattern`` with
+
+    Returns:
+        List[Match]: A list of ``Match`` objects representing the places
+        in the original graph that ``pattern`` was matched to. The list
+        is empty if there are no matches. ``Match`` is defined as:
+
+        .. code-block:: python
+
+            class Match(NamedTuple):
+                # Node from which the match was found
+                anchor: Node
+                # Maps nodes in the pattern subgraph to nodes in the larger graph
+                nodes_map: Dict[Node, Node]
+
+    Examples:
+
+    .. code-block:: python
+
+        import torch
+        from torch.fx import symbolic_trace, subgraph_rewriter
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, w1, w2):
+                m1 = torch.cat([w1, w2]).sum()
+                m2 = torch.cat([w1, w2]).sum()
+                return x + torch.max(m1) + torch.max(m2)
+
+        def pattern(w1, w2):
+            return torch.cat([w1, w2]).sum()
+
+        def replacement(w1, w2):
+            return torch.stack([w1, w2])
+
+        traced_module = symbolic_trace(M())
+
+        subgraph_rewriter.replace_pattern(traced_module, pattern, replacement)
+
+    The above code will first match ``pattern`` in the ``forward``
+    method of ``traced_module``. Pattern-matching is done based on
+    use-def relationships, not node names. For example, if you had
+    ``p = torch.cat([a, b])`` in ``pattern``, you could match
+    ``m = torch.cat([a, b])`` in the original ``forward`` function,
+    despite the variable names being different (``p`` vs ``m``).
+
+    The ``return`` statement in ``pattern`` is matched based on its
+    value only; it may or may not match to the ``return`` statement in
+    the larger graph. In other words, the pattern doesn't have to extend
+    to the end of the larger graph.
+
+    When the pattern is matched, it will be removed from the larger
+    function and replaced by ``replacement``. If there are multiple
+    matches for ``pattern`` in the larger function, each non-overlapping
+    match will be replaced. In the case of a match overlap, the first
+    found match in the set of overlapping matches will be replaced.
+    ("First" here being defined as the first in a topological ordering
+    of the Nodes' use-def relationships. In most cases, the first Node
+    is the parameter that appears directly after ``self``, while the
+    last Node is whatever the function returns.)
+
+    One important thing to note is that the parameters of the
+    ``pattern`` Callable must be used in the Callable itself,
+    and the parameters of the ``replacement`` Callable must match
+    the pattern. The first rule is why, in the above code block, the
+    ``forward`` function has parameters ``x, w1, w2``, but the
+    ``pattern`` function only has parameters ``w1, w2``. ``pattern``
+    doesn't use ``x``, so it shouldn't specify ``x`` as a parameter.
+    As an example of the second rule, consider replacing
+
+    .. code-block:: python
+
+        def pattern(x, y):
+            return torch.neg(x) + torch.relu(y)
+
+    with
+
+    .. code-block:: python
+
+        def replacement(x, y):
+            return torch.relu(x)
+
+    In this case, ``replacement`` needs the same number of parameters
+    as ``pattern`` (both ``x`` and ``y``), even though the parameter
+    ``y`` isn't used in ``replacement``.
+
+    After calling ``subgraph_rewriter.replace_pattern``, the generated
+    Python code looks like this:
+
+    .. code-block:: python
+
+        def forward(self, x, w1, w2):
+            stack_1 = torch.stack([w1, w2])
+            sum_1 = stack_1.sum()
+            stack_2 = torch.stack([w1, w2])
+            sum_2 = stack_2.sum()
+            max_1 = torch.max(sum_1)
+            add_1 = x + max_1
+            max_2 = torch.max(sum_2)
+            add_2 = add_1 + max_2
+            return add_2
+    """
+    match_and_replacements = _replace_pattern(gm, pattern, replacement, match_filters)
+    return [
+        Match(anchor=m.anchor, nodes_map=m.nodes_map) for m in match_and_replacements
+    ]
+
+
+def _replace_pattern(
+    gm: GraphModule,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule],
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None,  # type: ignore[name-defined]
+) -> List[ReplacedPatterns]:
+
+    if match_filters is None:
+        match_filters = []
+
+    # Get the graphs for `gm`, `pattern`, `replacement`
+    original_graph: Graph = gm.graph
+
+    if isinstance(pattern, GraphModule):
+        pattern_graph = pattern.graph
+    else:
+        pattern_graph = symbolic_trace(pattern).graph
+
+    if isinstance(replacement, GraphModule):
+        replacement_graph = replacement.graph
+    else:
+        replacement_graph = symbolic_trace(replacement).graph
+    matcher = AITSubgraphMatcher(
+        pattern_graph,
+        match_output=False,
+        match_placeholder=False,
+        remove_overlapping_matches=True,
+    )
+
+    _matches: List[InternalMatch] = matcher.match(original_graph)
+    logger.info(f"matches = {_matches}")
+    # Filter out matches that don't match the filter
+    _matches = [
+        m
+        for m in _matches
+        if all(
+            match_filter(m, original_graph, pattern_graph)
+            for match_filter in match_filters
+        )
+    ]
+
+    replacement_placeholders = [
+        n for n in replacement_graph.nodes if n.op == "placeholder"
+    ]
+
+    # As we progressively replace nodes, we'll need to keep track of how the match results should change
+    match_changed_node: Dict[Node, Node] = {}
+
+    match_and_replacements = []
+    for match in _matches:
+
+        # Build connecting between replacement graph's input and original graph input producer node
+
+        # Initialize `val_map` with mappings from placeholder nodes in
+        # `replacement` to their corresponding node in `original_graph`
+        assert len(match.placeholder_nodes) == len(replacement_placeholders)
+        val_map: Dict[Node, Node] = {}
+        for rn, gn in zip(replacement_placeholders, match.placeholder_nodes):
+            if isinstance(gn, Node):
+                val_map[rn] = match_changed_node.get(gn, gn)
+            else:
+                val_map[rn] = gn
+
+        # Copy the replacement graph over
+        user_nodes: Set[Node] = set()
+        for n in match.returning_nodes:
+            for user in n.users:
+                user_nodes.add(user)
+        assert user_nodes, "The returning_nodes should have at least one user node"
+
+        if len(user_nodes) == 1:
+            first_user_node = list(user_nodes)[0]
+        else:
+            # If there are multiple user nodes, we need to find the first user node
+            # in the current execution order of the `original_graph`
+            for n in original_graph.nodes:
+                if n in user_nodes:
+                    first_user_node = n
+                    break
+
+        with original_graph.inserting_before(first_user_node):
+            copied_returning_nodes = original_graph.graph_copy(
+                replacement_graph, val_map
+            )
+
+        if isinstance(copied_returning_nodes, Node):
+            copied_returning_nodes = (copied_returning_nodes,)
+
+        # Get a list of nodes that have been replaced into the graph
+        replacement_nodes = []
+
+        def get_replacement_nodes(curr_node: Node):
+            nonlocal replacement_nodes
+            for arg in curr_node.args:
+                if isinstance(arg, Node):
+                    if arg not in val_map.values():
+                        get_replacement_nodes(arg)
+            replacement_nodes.append(curr_node)
+
+        for ret_node in copied_returning_nodes:
+            get_replacement_nodes(ret_node)
+
+        # Hook the output Node of the replacement subgraph in to the
+        # original Graph at the correct location
+        assert len(match.returning_nodes) == len(copied_returning_nodes)
+        for gn, copied_node in zip(match.returning_nodes, copied_returning_nodes):
+            gn.replace_all_uses_with(copied_node)
+            match_changed_node[gn] = copied_node
+        # Remove the original nodes
+        logger.info(f"Remove pattern node from original graph, match={match}")
+        for node in reversed(pattern_graph.nodes):
+            if (
+                node.op != "placeholder"
+                and node.op != "output"
+                and node.target != torch.ops.aten.sym_size
+            ):
+
+                gn = match.nodes_map[node]
+                gm.graph.erase_node(gn)
+        match_and_replacements.append(
+            ReplacedPatterns(
+                anchor=match.anchors[0],
+                nodes_map=match.nodes_map,
+                replacements=replacement_nodes,
+            )
+        )
+
+    # Update the passed-in GraphModule to reflect the new state of
+    # `original_graph`
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+    # If `replacement` was an nn.Module, we'll need to make sure that
+    # all the submodules have been copied over correctly
+    # if isinstance(replacement, torch.nn.Module):
+    #     _replace_submodules(gm, replacement)
+
+    return match_and_replacements
diff --git a/fx2ait/fx2ait/tools/common_aten2ait.py b/fx2ait/fx2ait/tools/common_aten2ait.py
index 2fce7ff7e..bd350c0c5 100644
--- a/fx2ait/fx2ait/tools/common_aten2ait.py
+++ b/fx2ait/fx2ait/tools/common_aten2ait.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import logging
 import unittest
 
 import uuid
@@ -29,9 +30,13 @@
 from fx2ait.fx2ait import AITInterpreter
 
 from fx2ait.passes.lower_basic_pass_aten import (
+    compose_bmm,
     compose_chunk,
+    compose_getitem_slice,
+    remove_ops,
     replace_aten_op_with_indices,
     replace_aten_reshape_alias_with_replace,
+    # replace_batch_norm,  # it is needed if enable_aot=True in tracer
     replace_builtin_ops,
     replace_native_layernorm_with_layernorm,
     replace_transpose_mm_op_with_linear,
@@ -39,6 +44,7 @@
 )
 from fx2ait.tensor_spec import TensorSpec
 
+_LOGGER = logging.getLogger(__name__)
 torch.ops.load_library("//deeplearning/ait:AITModel")
 
 
@@ -82,12 +88,15 @@ def generate_graph(
         # Torchdynamo+aot proxytensor tracer
         # Below are common passes
         passes_list = [
+            compose_bmm,
+            compose_chunk,
+            compose_getitem_slice,
             replace_aten_reshape_alias_with_replace,
             replace_aten_op_with_indices,
-            replace_transpose_mm_op_with_linear,
+            replace_transpose_mm_op_with_linear,  # after compose_bmm
             replace_native_layernorm_with_layernorm,
-            compose_chunk,
-            replace_builtin_ops,
+            remove_ops,
+            replace_builtin_ops,  # after replace_native_layernorm_with_layernorm
         ]
         # Combine with customized passes specific to any model
         if customized_passes:
@@ -104,7 +113,7 @@ def generate_graph(
         )._to_server(ServerCompileConfig(passes=passes_list))
 
         fx_module = run_const_fold(fx_module)
-        print(fx_module.graph)
+        _LOGGER.info(f"aten fx graph: {fx_module.graph}")
 
         if len(expected_ops):
             self.assert_has_op(fx_module, expected_ops)

From bdefae8f5ed86d9e17232dafbb998c374a5ab892 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Sun, 26 Feb 2023 22:55:38 -0800
Subject: [PATCH 182/638] do not run detect_target() in global scope (#324)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/324

It broke someone's import

Reviewed By: ipiszy

Differential Revision: D43576720

fbshipit-source-id: 939604b7dfd022fb7e0037756aeb3c43c9775d59
---
 fx2ait/fx2ait/tools/common_fx2ait.py         |  2 +
 python/aitemplate/backend/cuda/target_def.py |  4 +-
 python/aitemplate/backend/profiler_runner.py |  8 ++-
 python/aitemplate/frontend/nn/attention.py   | 19 +++----
 python/aitemplate/frontend/nn/embedding.py   |  9 ++--
 python/aitemplate/frontend/nn/linear.py      | 10 ++--
 tests/unittest/backend/test_profiler.py      | 27 +++++-----
 tests/unittest/compiler/test_tensor.py       | 53 +++++++++++---------
 8 files changed, 71 insertions(+), 61 deletions(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index a2cb261e4..90d18fcd3 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -22,6 +22,7 @@
 from unittest import TestCase
 
 import torch
+from aitemplate.testing import detect_target
 from fx2ait.acc_tracer import acc_tracer
 from fx2ait.acc_tracer.ait_acc_normalizer import update_acc_op_mappers_for_ait
 from fx2ait.ait_module import AITModule
@@ -70,6 +71,7 @@ class AITTestCase(TestCase):
     def setUp(self):
         super().setUp()
         torch.manual_seed(3)
+        detect_target()
 
     def run_test(
         self,
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index bbed5e0dd..dcadb16a3 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -286,13 +286,11 @@ def _build_compile_options(self):
                 options.append("-DNDEBUG")
             FBCUDA.compile_options_ = " ".join(options)
         compile_options = FBCUDA.compile_options_
-        _LOGGER.debug(f"The compile options are: {compile_options}")
+        _LOGGER.info(f"The compile options are: {compile_options}")
         return compile_options
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
-        if not is_debug() and self._include_path:
-            shutil.rmtree(self._include_path)
 
     def binary_compile_cmd(self):
         """
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 3a4e11236..8456f7a05 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -28,6 +28,8 @@
 from queue import Queue
 from typing import Callable, List, Tuple
 
+from aitemplate.testing import detect_target
+
 from .target import Target
 from .task_runner import BaseRunner, Task
 
@@ -260,7 +262,11 @@ def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 3
         self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(devices))
         self._futures = []
         self._postprocessing_delegate = postprocessing_delegate
-        self._dev_select_flag = Target.current().dev_select_flag()
+        try:
+            target = Target.current()
+        except RuntimeError:
+            target = detect_target()
+        self._dev_select_flag = target.dev_select_flag()
 
     def push(self, cmds: List[str], process_result_callback: Callable):
         """
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index c297538f6..f2272218e 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -26,10 +26,6 @@
 from .module import Module
 from .parameter import Parameter
 
-# pylint: disable=C0103
-
-USE_CUDA = detect_target().name() == "cuda"
-
 
 class FlashAttention(Module):
     r"""FlashAttention provides an implementation for fused
@@ -95,6 +91,8 @@ class MultiheadAttention(Module):
         mask_seq: sequence mask, default: ``0``.
     """
 
+    USE_CUDA = None
+
     def __init__(
         self,
         dim,
@@ -113,6 +111,9 @@ def __init__(
         assert (
             dim % num_heads == 0
         ), f"dim {dim} should be divisible by num_heads {num_heads}"
+        if MultiheadAttention.USE_CUDA is None:
+            MultiheadAttention.USE_CUDA = detect_target().name() == "cuda"
+
         self.num_heads = num_heads
         head_dim = dim // num_heads
         self.scale = head_dim**-0.5
@@ -149,7 +150,7 @@ def __init__(
                 shape=[mask_seq, num_heads, head_dim], dtype="float16"
             )
 
-        if USE_CUDA:
+        if self.USE_CUDA:
             # on CUDA flash_attention needs packed QKV as input,
             # then do split + permute inside flash_attn
             # input: (B, S, H)
@@ -187,7 +188,7 @@ def get_shape(self, x):
         return shape
 
     def qkv_proj(self, x):
-        if USE_CUDA:
+        if self.USE_CUDA:
             if self.use_flash:
                 batch, seq, hidden = self.get_shape(x)
                 out = self.qkv(x)
@@ -204,11 +205,11 @@ def qkv_proj(self, x):
     def attention(self, x):
         # fused attention
         # output: (B, Seqlen, num_heads, head_dim)
-        if USE_CUDA and self.use_flash:
+        if self.USE_CUDA and self.use_flash:
             # input(x): (B*seqlen, 3, num_heads, head_dim)
             # output: (B, Seqlen, num_heads, head_dim)
             return self.op(x, self.cu_length.tensor())
-        elif USE_CUDA and self.use_mem_eff:
+        elif self.USE_CUDA and self.use_mem_eff:
             (q, k, v) = ops.split()(x, 1, dim=0)
             _, b, num_heads, seqlen, d = self.get_shape(q)
             return self.op(
@@ -223,7 +224,7 @@ def attention(self, x):
             # attn@v: (B, S, S) * (B, S, H) = (B, S, H) #RRR
             # reshape: (B, num_head, seqlen, head_dim)
             # permute: (B, Seqlen, num_heads, head_dim)
-            if USE_CUDA:
+            if self.USE_CUDA:
                 scale = Tensor(
                     shape=[], dtype="float16", name="scale", value=self.scale
                 )
diff --git a/python/aitemplate/frontend/nn/embedding.py b/python/aitemplate/frontend/nn/embedding.py
index 018a597a1..e5ee72c69 100644
--- a/python/aitemplate/frontend/nn/embedding.py
+++ b/python/aitemplate/frontend/nn/embedding.py
@@ -46,12 +46,11 @@ def tensor(self):
         return self.weight.tensor()
 
 
-USE_CUDA = detect_target().name() == "cuda"
-
-
 class BertEmbeddings(Module):
     """Construct the embeddings from word, position and token_type embeddings."""
 
+    USE_CUDA = None
+
     def __init__(
         self,
         hidden_size,
@@ -63,6 +62,8 @@ def __init__(
         dtype="float16",
     ):
         super().__init__()
+        if BertEmbeddings.USE_CUDA is None:
+            BertEmbeddings.USE_CUDA = detect_target().name() == "cuda"
         assert (
             hidden_dropout_prob == 0.0
         ), "Dropout rate larger than 0 is not supported yet."
@@ -85,7 +86,7 @@ def forward(
         token_type_ids,  # [B, S]
         position_ids,  # [B, S]
     ):
-        if USE_CUDA:
+        if self.USE_CUDA:
             embeddings = ops.bert_embeddings()(
                 input_ids,
                 token_type_ids,
diff --git a/python/aitemplate/frontend/nn/linear.py b/python/aitemplate/frontend/nn/linear.py
index a50f27847..fa0bac88b 100644
--- a/python/aitemplate/frontend/nn/linear.py
+++ b/python/aitemplate/frontend/nn/linear.py
@@ -21,10 +21,6 @@
 from .module import Module
 from .parameter import Parameter
 
-# pylint: disable=C0103
-
-USE_CUDA = detect_target().name() == "cuda"
-
 
 class Linear(Module):
     r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
@@ -64,6 +60,8 @@ class Linear(Module):
         Tensor(shape=[128, 30])
     """
 
+    USE_CUDA = None
+
     def __init__(
         self,
         in_channels,
@@ -74,6 +72,8 @@ def __init__(
         **kwargs,
     ):
         super().__init__()
+        if Linear.USE_CUDA is None:
+            Linear.USE_CUDA = detect_target().name() == "cuda"
         self.weight = Parameter(shape=[out_channels, in_channels], dtype=dtype)
         op_name = "gemm_rcr_bias" if bias else "gemm_rcr"
         if specialization is not None:
@@ -89,7 +89,7 @@ def __init__(
     def forward(self, *args):
         assert len(args) >= 1
         x = args[0]
-        if not USE_CUDA:
+        if not self.USE_CUDA:
             shape = x._attrs["shape"]
             x = x if len(shape) == 2 else ops.reshape()(x, [-1, self.in_channels])
         if len(args) == 2:
diff --git a/tests/unittest/backend/test_profiler.py b/tests/unittest/backend/test_profiler.py
index f2fb16597..438df946d 100644
--- a/tests/unittest/backend/test_profiler.py
+++ b/tests/unittest/backend/test_profiler.py
@@ -17,8 +17,6 @@
 from time import sleep
 from unittest.mock import patch
 
-from aitemplate.backend.cuda.target_def import CUDA as CUDATarget
-
 from aitemplate.backend.profiler_runner import ProfilerRunner
 
 
@@ -53,21 +51,20 @@ def test_profiler_runner(self):
             "aitemplate.backend.profiler_runner.extract_profile_result"
         ) as mock_extract_profile_result:
             mock_extract_profile_result.return_value = ("", False)
-            with CUDATarget() as _:
-                pr = ProfilerRunner(
-                    devices=[str(i) for i in range(12)],
-                    timeout=60,
-                    postprocessing_delegate=Delegate(test_instance=self),
+            pr = ProfilerRunner(
+                devices=[str(i) for i in range(12)],
+                timeout=60,
+                postprocessing_delegate=Delegate(test_instance=self),
+            )
+
+            for i, _ in enumerate(pr._postprocessing_delegate.results):
+                sleep_for = 0
+                pr.push(
+                    cmds=["sleep", f"{sleep_for}"],
+                    process_result_callback=delegate_cb_wrapper(i, sleep_for),
                 )
 
-                for i, _ in enumerate(pr._postprocessing_delegate.results):
-                    sleep_for = 0
-                    pr.push(
-                        cmds=["sleep", f"{sleep_for}"],
-                        process_result_callback=delegate_cb_wrapper(i, sleep_for),
-                    )
-
-                pr.join()
+            pr.join()
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/compiler/test_tensor.py b/tests/unittest/compiler/test_tensor.py
index 76047a78f..f29a3e18b 100644
--- a/tests/unittest/compiler/test_tensor.py
+++ b/tests/unittest/compiler/test_tensor.py
@@ -21,33 +21,38 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target
+from parameterized import parameterized
 
 
 class TensorTestCase(unittest.TestCase):
-    def test_tensor_size(self):
-        to_torch_dtype = {
-            "bool": torch.bool,
-            "int": torch.int32,
-            "int32": torch.int32,
-            "int64": torch.int64,
-            "float16": torch.float16,
-            "float": torch.float,
-            "float32": torch.float,
-        }
-        for dtype, torch_dtype in to_torch_dtype.items():
-            x = Tensor([3], dtype=dtype, is_input=True, is_output=True)
-            x_pt = torch.randn(3).to(torch_dtype).cuda()
-
-            expected_bytes = x_pt.numel() * x_pt.element_size()
-            self.assertEqual(x.size_bytes(), expected_bytes)
-
-            mod = compile_model(
-                x, detect_target(), "./tmp", f"test_tensor_size_{dtype}"
-            )
-
-            out = torch.empty_like(x_pt)
-            mod.run_with_tensors([x_pt], [out])
-            self.assertTrue(torch.equal(out, x_pt))
+    @classmethod
+    def setUpClass(cls):
+        cls.target = detect_target()
+
+    @parameterized.expand(
+        [
+            ("bool", torch.bool),
+            ("int", torch.int32),
+            ("int32", torch.int32),
+            ("int64", torch.int64),
+            ("float16", torch.float16),
+            ("float", torch.float),
+            ("float32", torch.float),
+            ("bfloat16", torch.bfloat16),
+        ]
+    )
+    def test_tensor_size(self, dtype, torch_dtype):
+        x = Tensor([3], dtype=dtype, is_input=True, is_output=True, name="X")
+        x_pt = torch.randn(3).to(torch_dtype).cuda()
+
+        expected_bytes = x_pt.numel() * x_pt.element_size()
+        self.assertEqual(x.size_bytes(), expected_bytes)
+
+        mod = compile_model(x, self.target, "./tmp", f"test_tensor_size_{dtype}")
+
+        out = torch.empty_like(x_pt)
+        mod.run_with_tensors([x_pt], [out])
+        self.assertTrue(torch.equal(out, x_pt))
 
 
 if __name__ == "__main__":

From 24de3cffb2a1503851aceab103e616d08a12b0eb Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 27 Feb 2023 01:11:09 -0800
Subject: [PATCH 183/638] Improve OSS documentation (#334)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/334

While reading the OSS documentation [here](https://facebookincubator.github.io/AITemplate/index.html), I've made some content improvements.

Reviewed By: chenyang78

Differential Revision: D43609113

fbshipit-source-id: 9698ed86d64e315bb1d1fa33084c7ae17f667a9f
---
 README.md                                 | 30 ++++++++-----
 docs/source/arch/philosophy.rst           | 13 ++++--
 docs/source/debughints.rst                | 11 ++---
 docs/source/index.rst                     |  4 +-
 docs/source/install/index.rst             | 31 +++++++------
 docs/source/reference/env.rst             |  9 ++--
 docs/source/runtime/cxx_design.rst        | 25 ++++++-----
 docs/source/runtime/py_design.rst         | 25 ++++++-----
 docs/source/tutorial/how_to_add_op.rst    | 42 +++++++++--------
 docs/source/tutorial/how_to_infer_pt.rst  | 55 ++++++++++++-----------
 docs/source/tutorial/how_to_visualize.rst |  8 ++--
 11 files changed, 137 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index 1184b0312..0d6e1686d 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,11 @@
 [![Deploy docs to Pages](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml/badge.svg)](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml)
 
 
-
 AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include:
 
 - High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc.
-- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
+- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
+
 
 ## More about AITemplate
 
@@ -24,42 +24,48 @@ AITemplate provides unique advanced horizontal fusion. AITemplate can fuse paral
 
 ### Vertical Fusion
 
-AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reduction operations, and layout permutation operations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion.
+AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reductions, and layout permutations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion.
 
 ### Memory Fusion
 
 AITemplate provides innovative memory fusions. AITemplate can fuse GEMM, LayerNorm, and other operators, followed by memory operations such as concatenation, split, and slice into a single operator.
 
 ### Working w/wo PyTorch
+
 The AITemplate-generated Python runtime can take PyTorch tensors as inputs and outputs without an extra copy. For environments without PyTorch, the AITemplate Python/C++ runtime is self-contained.
 
 ### Extensions without suffering
 
 AITemplate provides a straightforward approach for making an extension in codegen. To add a new operator or a new fused kernel into AITemplate, most of the time one only needs to add two Python files: one for a graph node definition and another for the backend codegen. The CUDA/HIP kernel in a text header file can be directly utilized in the codegen.
 
+
 ## FX2AIT
+
 FX2AIT is a Python-based tool that converts PyTorch models into AITemplate (AIT) engine for lightning-fast inference serving. Using FX2AIT's built-in AITLowerer, partial AIT acceleration can be achieved for models with unsupported operators in AITemplate.
 
 Key features of FX2AIT include:
 
 * Easy Conversion: FX2AIT requires only a PyTorch model and input for conversion, generating an "AITModule" output for inference serving.
-* Expanded Support: AITemplate does not support all PyTorch operators. FX2AIT's AITLowerer offers a solution for partial AIT conversion for models with unsupported operators. Check the example/03_lowering_split for more information.
+* Expanded Support: AITemplate does not support all PyTorch operators. FX2AIT's AITLowerer offers a solution for partial AIT conversion for models with unsupported operators. Check the `fx2ait/fx2ait/example/03_lowering_split` for more information.
 
 More info can be found from https://github.com/facebookincubator/AITemplate/tree/main/fx2ait.
 
+
 ## Installation
 
-**Hardware requirement:**
+**Hardware requirements:**
   - **NVIDIA**: AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
   - **AMD**:  AIT is only tested on CDNA2 (MI-210/250) GPUs. There may be compiler issues for old CDNA1 (MI-100) GPUs.
 
-## Clone the code
+### Clone the code
+
 When cloning the code, please use the following command to also clone the submodules:
 ```
 git clone --recursive https://github.com/facebookincubator/AITemplate
 ```
 
 ### Docker Image
+
 We highly recommend using AITemplate with Docker to avoid accidentally using a wrong version of NVCC or HIPCC.
 - CUDA: `./docker/build.sh cuda`
 - ROCM: `DOCKER_BUILDKIT=1 ./docker/build.sh rocm`
@@ -67,6 +73,7 @@ We highly recommend using AITemplate with Docker to avoid accidentally using a w
 This will build a docker image with tag `ait:latest`.
 
 ### From Source
+
 The following command will create a Python wheel for AITemplate. Please ensure you have correct CUDA/ROCm compiler installed.
 - CUDA: CUDA 11.6
 - ROCm: We tested on ROCm 5.2.3 with a customized build HIPCC with the command in docker/Dockerfile.rocm#L87-L96
@@ -93,7 +100,8 @@ There are a few tutorials for onboarding:
 
 
 ## Examples & Performance
-AITemplate provides the following model templates & reference performance data on A100/MI-250
+
+AITemplate provides the following model templates & reference performance data on A100/MI-250:
 
 - [01_ResNet-50](examples/01_resnet-50/) with PyTorch Image Models (TIMM)
 - [02_MaskRCNN-FPN](examples/02_detectron2/) with Detectron2
@@ -117,21 +125,23 @@ Long-term plan:
 - Composable Kernel CPU extension on AVX2/AVX-512 for AMD Epyc CPU.
 
 ## Contributing
+
 Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute to the project.
 
 ## The Team
 
 AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mu-Chu Lee](https://github.com/muchulee8), [Max Podkorytov](https://github.com/tenpercent), [Adnan Akhundov](https://github.com/aakhundov).
 
-AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang.
+AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank Andrew Tulloch, Yinghai Lu, Lu Fang for the valuable discussions.
 
 FX2AIT and Aten2AIT are co-created and maintained by Meta engineers: [Wei Wei](https://github.com/frank-wei), [Shirong Wu](https://github.com/wushirong) and [Zhijing Li](https://github.com/tissue3).
 
 
-## Acknowledgement
+## Acknowledgements
 
-AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (Led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (Led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration.
+AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration.
 
 
 ## License
+
 AITemplate is licensed under the [Apache 2.0 License](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE).
diff --git a/docs/source/arch/philosophy.rst b/docs/source/arch/philosophy.rst
index 2eefb8f5d..d1ac35db4 100644
--- a/docs/source/arch/philosophy.rst
+++ b/docs/source/arch/philosophy.rst
@@ -5,12 +5,17 @@ Design  Philosophy
 KISS (Keep it simple and stupid)
 --------------------------------
 
-AITemplate avoids deep IR lowering stacks to reduce the system's complexity. A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference.
+AITemplate avoids deep IR lowering stacks to reduce the system's complexity.
+A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference.
 
 Pragmatism
 ----------
 
-AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN.
+AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization.
+Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN.
 
-
-We believe most of the neural network workload can be decoupled. For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. For encoder and decoder, it is a computation bounded problem. For decoder logic, it may involve more control flows. By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack to play as the silver bullet.
\ No newline at end of file
+We believe most of the neural network workload can be decoupled.
+For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics.
+For encoder and decoder, it is a computation-bounded problem.
+For decoder logic, it may involve more control flows.
+By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack as a silver bullet.
diff --git a/docs/source/debughints.rst b/docs/source/debughints.rst
index 074254a75..0bd07d3c1 100644
--- a/docs/source/debughints.rst
+++ b/docs/source/debughints.rst
@@ -1,14 +1,15 @@
 Debug Hints
 ===========
 
-AITemplate is a new project under active development. We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected.
+AITemplate is a new project under active development.
+We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected.
 
-Here are some helpful tips when we learned during the development AITemplate:
+Here are some helpful tips we learned during the development of AITemplate:
 
-1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting ~/.aitemplate or setting environment variable FLUSH_PROFILE_CACHE=1
+1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting `~/.aitemplate` or setting the environment variable `FLUSH_PROFILE_CACHE=1`.
 
-2. Check the pseudo code/visualization generated by each optimization pass if some optimization is harmful.
+2. Check the pseudo code/visualization generated by each optimization pass if some optimization behaves in unexpected way.
 
 3. Always do the numerical test, from small to large, to make sure the entire model is correct.
 
-4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph.
\ No newline at end of file
+4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 775d33792..9dbcdcc9a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,11 +1,11 @@
 
 AITemplate Documentation
-======================================
+========================
 
 AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include:
 
 * High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc.
-* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
+* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
 
 
 .. toctree::
diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst
index 48244cfa7..2528bd036 100644
--- a/docs/source/install/index.rst
+++ b/docs/source/install/index.rst
@@ -7,11 +7,11 @@ Using Docker
 The easiest way to get started is to use Docker.  Using docker is able to avoid performance regression caused by incorrect version of NVCC and HIPCC.
 To use docker, we provide a bash script to build the docker image.
 
-- CUDA: 
+- CUDA:
     .. code-block:: bash
 
         ./docker/build.sh cuda
-- ROCM: 
+- ROCM:
     .. code-block:: bash
 
         DOCKER_BUILDKIT=1 ./docker/build.sh rocm
@@ -31,13 +31,13 @@ To launch the docker container
 
         docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ait:latest
 
-AITemplate will be installed in as a Python package to Python 3.8. There will be also a copy of source code and examples at `/AITemplate`
+AITemplate will be installed as a Python package in Python 3.8. There will be also a copy of the source code and examples at `/AITemplate`.
 
 
-Install as standard Python package
-----------------------------------
+Installing as a Standard Python Package
+---------------------------------------
 
-Before start installing AITemplate, first make sure you have correct hardware and software environment.
+Before installing AITemplate, first make sure you have correct hardware and software environment.
 
 - Hardware
     - NVIDIA: AIT is only tested on SM80+ GPUs (Ampere etc).
@@ -52,24 +52,23 @@ Before start installing AITemplate, first make sure you have correct hardware an
     - AMD: ROCm 5.2, with HIPCC 10736 (commit `b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb`)
 
 .. warning::
-    - Incorrect compiler version will lead performance regression.
-    - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm`
+    - Incorrect compiler version may lead to performance regression.
+    - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm`.
 
 
-When clone the code, please use the following command to clone the submodules:
-```
-git clone --recursive https://github.com/facebookincubator/AITemplate
-```
+When cloning the code, please use the following command to clone the submodules:
+
+    .. code-block:: bash
+
+        git clone --recursive https://github.com/facebookincubator/AITemplate
 
 .. warning::
-    Please check all submodules are cloned correctly before go to next step.
+    Please check that all submodules are cloned correctly before the next step.
 
-Then build Python wheel package and install.
+Then build the Python wheel package and install it:
 
     .. code-block:: bash
 
         cd python
         python setup.py bdist_wheel
         pip install dist/aitemplate-0.0.1-py3-none-any.whl
-
-
diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 50847a6ae..9e9f7769a 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -1,6 +1,7 @@
 Environment Variables
 =====================
-AITemplate uses environment variables to control the behavior of codegen and profiling. All the environment variables used in AITemplate are listed here.
+AITemplate uses environment variables to control the behavior of codegen and profiling.
+The environment variables used in AITemplate are listed here.
 
 Codegen
 -------
@@ -11,7 +12,7 @@ Codegen
 
 **AIT_NDEBUG**: If set to "1", compile with `NDEBUG`, disabling debug assertions. Recommended for production builds. "1" by default.
 
-**AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler to do time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default.
+**AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler perform time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default.
 
 Profiling
 ---------
@@ -31,11 +32,11 @@ Profiling
 OSS CI
 ------
 
-**CI_FLAG**: It is set to "CIRCLECI" in OSS CI to indicate we're in OSS CI environment. The behavior of the profiler and codegen is different in CI to speed up testing. Profiling itself for gemm/conv ops is disabled in CI. But we still compiles two random profilers to make sure the profiler codegen is not broken.
+**CI_FLAG**: It is set to "CIRCLECI" in OSS CI to indicate we're in OSS CI environment. The behavior of the profiler and codegen is different in CI to speed up testing. Profiling itself for gemm/conv ops is disabled in CI. But we still compile two random profilers to make sure the profiler codegen is not broken.
 
 **AIT_BUILD_DOCS**: If set to "1", it will create a fake CUDA target to enable doc building in Github Actions.
 
 Miscellaneous
 -------------
 
-**LOGLEVEL**: It is used to control the logging level in python. It's default to "INFO". "DEBUG" is useful for debugging.
+**LOGLEVEL**: It is used to control the logging level in Python. The default value is "INFO". "DEBUG" is useful for debugging.
diff --git a/docs/source/runtime/cxx_design.rst b/docs/source/runtime/cxx_design.rst
index 5ef18f889..d4608409f 100644
--- a/docs/source/runtime/cxx_design.rst
+++ b/docs/source/runtime/cxx_design.rst
@@ -1,29 +1,30 @@
-==================
+================
 C++ Runtime Note
-==================
+================
 
 `Model` v.s. `ModelContainer`
-==============================
+=============================
 
-These are the two main classes involved in the C++ runtime implementation.
+These are the two main classes involved in the C++ runtime implementation:
 
-* The bulk of the runtime implementation is in `Model`.
-* `ModelContainer` stores a set of shared constants and a collection of `Model`s. Almost all functions in `model_interface.h` forward to a method on `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one is available (see the section on asynchronous predictions). It then forwards the run request to the runtime.
+* The bulk of the runtime implementation is in the `Model` class.
+* The `ModelContainer` class stores a set of shared constants and a collection of `Model` instances. Almost all functions in `model_interface.h` forward to a method in `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one becomes available (see the section on asynchronous predictions). It then forwards the run request to the runtime.
 
 Code Structure
 ==============
 
 Some important files:
 
-1. `include/model_interface.h`: The interface that we expose in the compiled .so
+1. `include/model_interface.h`: The interface that we expose in the compiled `.so`.
 2. `include/model_container.h`: The bulk of the `ModelContainer` implementation.
 
 Some files are generated at compile time. These include:
 
-* `model-generated.h`: The implementation for `Model`.
-* `model_container_base.cu`: A small part of the implementation for `ModelContainer` needs to be codegened. So `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details.
+* `model-generated.h`: The implementation of the `Model`.
+* `model_container_base.cu`: A small part of the implementation for `ModelContainer` that needs to be generated. `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details.
 
-All codegen templates can be found in `backend/main_templates.py`. The codegen implementation is in `backend/codegen.py`.
-
-Note that many of the headers in this directory rely on generated code and thus cannot be `#include`d in external projects. The exception is `model_interface.h`.
+All codegen templates can be found in `backend/main_templates.py`.
+The codegen implementation is in `backend/codegen.py`.
 
+Note that many of the headers in this directory rely on generated code and thus cannot be `#include` -d in external projects.
+`model_interface.h` is an exception.
diff --git a/docs/source/runtime/py_design.rst b/docs/source/runtime/py_design.rst
index c143123de..5c9d630e0 100644
--- a/docs/source/runtime/py_design.rst
+++ b/docs/source/runtime/py_design.rst
@@ -1,6 +1,6 @@
-=====================
+===================
 Python Runtime Note
-=====================
+===================
 
 Python `Model`
 ==============
@@ -16,7 +16,7 @@ This class represents a contiguous blob of memory that AIT will use as a tensor.
 * `shape: List[int]`: The shape of the tensor.
 * `dtype: str`: The tensor's dtype; one of `"float32", "float16", "int32", "int64"`. Note that most ops only support float16 at this stage.
 
-If using AITemplate with PyTorch, `AITData`s can be constructed with the `torch_to_ait_data` utility:
+When using AITemplate with PyTorch, `AITData` can be constructed with the `torch_to_ait_data` utility:
 
 .. code-block:: python
 
@@ -30,7 +30,7 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al
 `run`
 -----
 
-`run` takes a set of inputs and outputs as `AITData`s. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor).
+`run` takes inputs and outputs as collections of `AITData` instances. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor).
 
 .. code-block:: python
 
@@ -55,9 +55,9 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al
       outputs[output_name_to_idx[name]] = ait_outputs[name]
 
     module.run(inputs, outputs)
-      
 
-One important caveat is that the output must be its **maximum** size. This is because of dynamic shapes - the size of the output may vary, but its shape is not inferred until inference time. The maximum shape can be queried with the `get_output_maximum_shape()`:
+
+One important caveat is that the output must have the **maximum** possible size. This is because of dynamic shapes: the size of the output may vary, but its shape is not inferred until inference time. The maximum shape can be queried with the `get_output_maximum_shape()`:
 
 .. code-block:: python
 
@@ -67,7 +67,7 @@ One important caveat is that the output must be its **maximum** size. This is be
     max_shape = module.get_output_maximum_shape("output")
 
 
-`Model.run` returns a dictionary of output `AITData`s with (possibly dynamic) shapes that the runtime inferred.
+`Model.run` returns a dictionary of output `AITData` instances with (possibly dynamic) shapes that inferred in the runtime.
 
 Nullptr Inputs/Outputs
 ----------------------
@@ -102,7 +102,7 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`.
 `run_with_tensors`
 ------------------
 
-`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists of `torch.Tensor`s:
+`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists (or dicts) of `torch.Tensor` instances:
 
 .. code-block:: python
 
@@ -115,9 +115,14 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`.
 Streams and Asynchronous Predictions
 ------------------------------------
 
-A pointer to a stream can optionally be passed to `run`. If none is given, the prediction happens on the default stream 0. If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. `sync` is `True` by default.
+A pointer to a stream can optionally be passed to `run`.
+If none is given, the prediction happens on the default stream 0.
+If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns.
+`sync` is `True` by default.
 
-Multiple predictions can happen at the same time (on the same or different streams). Under the hood, there is a fixed-size pool of runtime objects. When all the runtimes are used, `run()` blocks until one is available.
+Multiple predictions can happen at the same time (on the same or different streams).
+Under the hood, there is a fixed-size pool of runtime objects.
+When all the runtimes are used, `run()` blocks until one becomes available.
 The size of this pool can be configured with the `num_runtimes` option in `Model`'s constructor.
 
 CUDA Graph
diff --git a/docs/source/tutorial/how_to_add_op.rst b/docs/source/tutorial/how_to_add_op.rst
index 160745336..988f5375e 100644
--- a/docs/source/tutorial/how_to_add_op.rst
+++ b/docs/source/tutorial/how_to_add_op.rst
@@ -1,17 +1,17 @@
 How to add an operator to the AIT codegen
-========================================= 
+=========================================
 
 This tutorial will demonstrate how to add a new operator to the AIT codegen.
-Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`
+Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`.
 
 
 0. Prerequisites
------------------
+----------------
 
-We need to import necessary Python modules
+We need to import necessary Python modules:
 
 .. code-block:: python
-  
+
   from typing import Any, Dict, List
 
   import jinja2
@@ -26,9 +26,9 @@ We need to import necessary Python modules
 
 
 1. Define the operator graph node
-----------------------------------
+---------------------------------
 
-Graph node is usually defined at `aitemplate/compiler/ops`.
+Graph nodes are usually defined at `aitemplate/compiler/ops`.
 
 .. code-block:: python
 
@@ -72,15 +72,15 @@ Graph node is usually defined at `aitemplate/compiler/ops`.
 .. note::
 
   - `_attrs` in Operator is the most important data structure for codegen.
-  - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in backend, must be **unique**.
+  - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in the backend; must be **unique**.
 
 2. Define the necessary templates for Codegen
-----------------------------------------------
+---------------------------------------------
 
 In AIT, there are 4 important templates for codegen:
 
 - `FUNC_TEMPLATE`: the template for generating the function body of the operator, and invoke GPU kernel in the body.
-- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defined name, and arguments of the function.
+- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defines the name and arguments of the function.
 - `FUNC_CALL_TEMPLATE`: the template for generating the function call of the operator. The call will be used during inference to invoke the GPU kernel with given arguments.
 - `FUNC_DECL`: the template for forward declaration of the operator function. This is usually an alias of `FUNC_SIGNATURE_TEMPLATE`.
 
@@ -128,7 +128,7 @@ In AIT, there are 4 important templates for codegen:
   )
 
 3. Create the GPU kernels
---------------------------
+-------------------------
 
 In this example we use a simplest add one kernel. The kernel can be written by hand (as what programmer is expected to do), or generated by other tools.
 
@@ -166,10 +166,10 @@ In this example we use a simplest add one kernel. The kernel can be written by h
   )
 
 4. Define the codegen function
--------------------------------
+------------------------------
 
-The codegen function is the function that render the templates we defined into valid C++ code string.
-The codegen function will take `func_attrs` from graph node, and fill into the jinja2 template.
+The codegen function is the function that renders the templates we defined into valid C++ code string.
+The codegen function will take `func_attrs` from the graph node, and fill in the jinja2 template.
 
 .. code-block:: python
 
@@ -213,10 +213,10 @@ The codegen function will take `func_attrs` from graph node, and fill into the j
           ).strip()
     )
 
-5.1 Register the codegen function to CUDA backend
----------------------------------------------------
+5.1 Register the codegen function in CUDA backend
+-------------------------------------------------
 
-CUDA backend functions is usually defined at `aitemplate/backend/cuda/`.
+CUDA backend functions are usually defined at `aitemplate/backend/cuda/`.
 
 .. code-block:: python
 
@@ -240,10 +240,9 @@ CUDA backend functions is usually defined at `aitemplate/backend/cuda/`.
     return gen_function_call(func_attrs, indent, is_cuda=True)
 
 5.2 (Optional) Register the codegen function to ROCm backend
---------------------------------------------------------------
-
-ROCm backend functions is usually defined at `aitemplate/backend/rocm/`.
+------------------------------------------------------------
 
+ROCm backend functions are usually defined at `aitemplate/backend/rocm/`.
 
 .. code-block:: python
 
@@ -269,7 +268,7 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`.
 
 
 6. Compile and verify the results with PyTorch
-------------------------------------------------
+----------------------------------------------
 
 .. code-block:: python
 
@@ -299,4 +298,3 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`.
       outputs = {"Y": y}
       module.run_with_tensors(inputs, outputs)
       print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
-
diff --git a/docs/source/tutorial/how_to_infer_pt.rst b/docs/source/tutorial/how_to_infer_pt.rst
index 67891c46a..8aa68c9c7 100644
--- a/docs/source/tutorial/how_to_infer_pt.rst
+++ b/docs/source/tutorial/how_to_infer_pt.rst
@@ -1,16 +1,16 @@
 How to inference a PyTorch model with AIT
-==========================================
+=========================================
 
 This tutorial will demonstrate how to inference a PyTorch model with AIT.
-Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`
+Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`.
 
 0. Prerequisites
------------------
+----------------
 
-We need to import necessary Python modules
+We need to import necessary Python modules:
 
 .. code-block:: python
-  
+
   from collections import OrderedDict
 
   import torch
@@ -23,9 +23,9 @@ We need to import necessary Python modules
 
 
 1. Define a PyTorch module
----------------------------
+--------------------------
 
-Here we define a PyTorch model which is commonly seen in Transformers.
+Here we define a PyTorch model which is commonly seen in Transformers:
 
 .. code-block:: python
 
@@ -46,7 +46,7 @@ Here we define a PyTorch model which is commonly seen in Transformers.
       return hidden_states
 
 2. Define an AIT module
-------------------------
+-----------------------
 
 We can define a similar AIT module as follows:
 
@@ -69,15 +69,16 @@ We can define a similar AIT module as follows:
 .. warning::
   The `nn.Module` API in AIT looks similar to PyTorch, but it is not the same.
 
-  The fundamental difference is that AIT module is a container to build graph, while PyTorch module is a container to store parameters for eager.
-  Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. If you want to share parameters, needs to call `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch.
+  The fundamental difference is that AIT module is a container to build a graph, while PyTorch module is a container to store parameters for eager.
+  Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call.
+  If you want to share parameters, you need to use the `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch.
+
+  AITemplate supports automatic fusion of linear followed by other operators. However in many cases, especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with the `fast_gelu` operator.
 
-  AITemplate supports automatically fusion on linear followed by other operators. However in many case especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with `fast_gelu` operator.
-  
 3. Define a helper function to map PyTorch parameters to AIT parameters
--------------------------------------------------------------------------
+-----------------------------------------------------------------------
 
-In AIT, all names must follow C variable naming standard because the name will be used in codegen process.
+In AIT, all names must follow the C variable naming standard, because the names will be used in the codegen process.
 
 .. code-block:: python
 
@@ -93,12 +94,12 @@ In AIT, all names must follow C variable naming standard because the name will b
 
 .. warning::
 
-  - Different to PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter a name with direct map to PyTorch.
-  - Because all names in AIT must follow C variable naming standard, you can easier replace `.` to `_` or use a regular expression to make sure the name in valid.
-  - For network with conv + bn subgraph, we currently haven't provide automatic pass to fold it. Refer our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding.
+  - Different to PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter with a name with a direct map to PyTorch.
+  - Because all names in AIT must follow the C variable naming standard, you can easily replace `.` by `_` or use a regular expression to make sure the name in valid.
+  - For networks with conv + bn subgraph, we currently don't provide an automatic pass to fold it. Please refer to our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding.
 
 4. Create PyTorch module, inputs/outputs
------------------------------------------
+----------------------------------------
 
 .. code-block:: python
 
@@ -115,7 +116,7 @@ In AIT, all names must follow C variable naming standard because the name will b
   y_pt = pt_model(x)
 
 5. Create AIT module, inputs/outputs
--------------------------------------
+------------------------------------
 
 .. code-block:: python
 
@@ -139,12 +140,12 @@ In AIT, all names must follow C variable naming standard because the name will b
 .. warning::
 
   - Similar to MetaTensor, LazyTensor and a lot of other lazy evaluation frameworks, AIT's Tensor records the computation graph, and the graph is built when the Tensor is compiled.
-  - For input tensor, it is required to set the attribute **is_input=True**
-  - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True**
-  - For input and output tensors, it is better to provide **name** attributes to use in runtime
+  - For input tensor, it is required to set the attribute **is_input=True**.
+  - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True**.
+  - For input and output tensors, it is better to provide the **name** attributes to use in runtime.
 
-6. Compile AIT module in to runtime, and do verification
---------------------------------------------------------
+6. Compile AIT module into runtime and do verification
+------------------------------------------------------
 
 .. code-block:: python
 
@@ -180,9 +181,9 @@ In AIT, all names must follow C variable naming standard because the name will b
     print(f"PyTorch eager time: {pt_t} ms/iter")
 
 
-In this example, AIT will automatically fuse GELU and elementwise add into TensorCore/MatrixCore gemm operation. On RTX-3080 for this example, AIT is about 1.15X fast than PyTorch Eager in this example.
+In this example, AIT will automatically fuse GELU and elementwise addition into the TensorCore/MatrixCore gemm operation. On RTX-3080, in the example AIT is about 1.15X faster than PyTorch Eager.
 
 .. note::
 
-  - In this example, we fold parameters (weights) into AIT runtime, which the final dynamic library will contains parameters.
-  - If during compile we don't provide parameters, for example the total parameters size is greater than 2GB, we can always call `set_constant` function in runtime. Check runtime API for details.
\ No newline at end of file
+  - In this example, we fold the parameters (`weights`) into AIT runtime. The final dynamic library will contain them as parameters.
+  - If during the compile time we don't provide the parameters (for example, because the total parameters size is greater than 2GB), we can always call `set_constant` function in the runtime. Please check the runtime API for the details.
diff --git a/docs/source/tutorial/how_to_visualize.rst b/docs/source/tutorial/how_to_visualize.rst
index 5af7c89a5..b1d646118 100644
--- a/docs/source/tutorial/how_to_visualize.rst
+++ b/docs/source/tutorial/how_to_visualize.rst
@@ -1,5 +1,5 @@
 How to visualize an AIT model
-==============================
+=============================
 
 Visualization is important for understanding the behavior of a model optimization.
 In AIT, we modify the codegen a little bit, from generating CUDA/HIP C++ code to HTML/Javascript code,
@@ -9,7 +9,7 @@ then we can generate a visualization of the model.
 The following code will generate a visualization of our first example.
 
 1. Define the AIT Model
-------------------------
+-----------------------
 
 .. code-block:: python
 
@@ -71,7 +71,7 @@ The following code will generate a visualization of our first example.
   graph = apply_optimizations(output_tensor)
 
 3. Generate visualization
---------------------------
+-------------------------
 
 .. code-block:: python
 
@@ -82,4 +82,4 @@ The visualization will be generated in the "ait_model.html" file. This file can
 
 .. raw:: html
 
-  <iframe src="ait_model.html" width="100%" height="600px"></iframe>
\ No newline at end of file
+  <iframe src="ait_model.html" width="100%" height="600px"></iframe>

From 3511ba6cd1b28aba074e6457d9e6b22a85644629 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 27 Feb 2023 09:45:14 -0800
Subject: [PATCH 184/638] Fix MultiScaleAttention _reshape_qkv_to_seq
 indentation (#336)

Summary:
Seems that `MultiScaleAttention._reshape_qkv_to_seq` code had wrong indentation

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/336

Reviewed By: aakhundov

Differential Revision: D43614354

Pulled By: chenyang78

fbshipit-source-id: abc9341a5fb835bbb2818920be9dccb868eabb11
---
 .../frontend/nn/multiscale_attention.py       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 2ef5c11f4..5b982b906 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -479,21 +479,21 @@ def _prod(self, shape: List[int]) -> int:
             p *= dim
         return p
 
-        def _reshape_qkv_to_seq(
-            self,
-            q: Tensor,
-            k: Tensor,
-            v: Tensor,
-            q_N: int,
-            v_N: int,
-            k_N: int,
-            B: int,
-            C: int,
-        ) -> Tuple[Tensor, Tensor, Tensor]:
-            q = q.permute(0, 2, 1, 3).reshape(B, q_N, C)
-            v = v.permute(0, 2, 1, 3).reshape(B, v_N, C)
-            k = k.permute(0, 2, 1, 3).reshape(B, k_N, C)
-            return q, k, v
+    def _reshape_qkv_to_seq(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        q_N: int,
+        v_N: int,
+        k_N: int,
+        B: int,
+        C: int,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        q = q.permute(0, 2, 1, 3).reshape(B, q_N, C)
+        v = v.permute(0, 2, 1, 3).reshape(B, v_N, C)
+        k = k.permute(0, 2, 1, 3).reshape(B, k_N, C)
+        return q, k, v
 
     def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
         """

From 4a0122bbbb27caf01aef6d0bf35cf09cce8a9c14 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 27 Feb 2023 09:45:50 -0800
Subject: [PATCH 185/638] Fix fx2ait readme (#335)

Summary:
Updated FX2AIT description: "FX2AIT is a Python-based tool..." - the description is taken from https://github.com/facebookincubator/AITemplate#fx2ait for consistency.

Plus fixed some typos

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/335

Reviewed By: aakhundov

Differential Revision: D43614388

Pulled By: chenyang78

fbshipit-source-id: 1158005a37f1aa6aab7dc061d945498dfc5f2a1d
---
 fx2ait/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fx2ait/README.md b/fx2ait/README.md
index 9443aa197..6f7e1e223 100644
--- a/fx2ait/README.md
+++ b/fx2ait/README.md
@@ -1,18 +1,18 @@
 # FX2AIT for AITemplate
 
 
-FX2AIT is an python based tool that transforms PyTorch model into AITempate(AIT) engine for lightning-fast inference serving.
+FX2AIT is a Python-based tool that converts PyTorch models into AITemplate (AIT) engine for lightning-fast inference serving.
 AITLowerer built on top of FX2AIT is able to perform AIT conversion on PyTorch model with AIT unsupported operators. Model can enjoy partial AIT acceleration using AITLowerer.
 
 FX2AIT highlights include:
 
 - Automatic Conversion: FX2AIT only need PyTorch model and input as input for conversion. The output can be used for inference serving directly.
-- Expanded Support: AITemplate doesn't cover all operators PyTorch provides. FX2AIT provided AITLowerer as solution to support patial AIT conversion for models with AIT unsupportted operators. For more information, please check example/03_lowering_split.
+- Expanded Support: AITemplate doesn't cover all operators PyTorch provides. FX2AIT provided AITLowerer as solution to support partial AIT conversion for models with AIT unsupported operators. For more information, please check example/03_lowering_split.
 
-## Installalation
+## Installation
 
 **Hardware requirement:**
-  - **NVIDIA**: FX2AIT is base on AIT, thus the hardware requirement is same as AIT. AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
+  - **NVIDIA**: FX2AIT is based on AIT, thus the hardware requirement is same as AIT. AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
 ### From Source
 The following command will create a Python wheel for AITemplate. Please ensure you have correct CUDA compiler installed.
 - CUDA: CUDA 11.6

From 58ebaa575b9864626ee4bc3bc7e3a499d3acc563 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 27 Feb 2023 09:46:08 -0800
Subject: [PATCH 186/638] Fixes in Visualize AIT Model doc (#327)

Summary:
Fixes in Doc:
- `plot_graph()` does not have `network_name` param. Removed `network_name` from the tutorial.
- Updated generated ait_model.html file

Fixes in `plot_graph()`:
- Currently `plot_graph()` code tries to get `file_path` parent directory and create it. If  `file_path` string does not have parent directory it still tries to create empty string `''` dir and fails. Fixed it. Now it creates dir only if  `file_path` has dir

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/327

Reviewed By: aakhundov

Differential Revision: D43614342

Pulled By: chenyang78

fbshipit-source-id: c8f6e80ff9cd70a03591519aa537be592d5e2877
---
 docs/source/tutorial/how_to_visualize.rst     |  2 +-
 docs/static/ait_model.html                    | 46 +++++++++----------
 python/aitemplate/utils/visualization/plot.py |  4 +-
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/docs/source/tutorial/how_to_visualize.rst b/docs/source/tutorial/how_to_visualize.rst
index b1d646118..1b6856699 100644
--- a/docs/source/tutorial/how_to_visualize.rst
+++ b/docs/source/tutorial/how_to_visualize.rst
@@ -76,7 +76,7 @@ The following code will generate a visualization of our first example.
 .. code-block:: python
 
   # Plot the graph
-  plot_graph(graph, file_path="ait_model.html", network_name="ait_sample_net")
+  plot_graph(graph, file_path="ait_model.html")
 
 The visualization will be generated in the "ait_model.html" file. This file can be opened in Chrome without any web server.
 
diff --git a/docs/static/ait_model.html b/docs/static/ait_model.html
index 18c56089d..9a93d717f 100644
--- a/docs/static/ait_model.html
+++ b/docs/static/ait_model.html
@@ -4,7 +4,7 @@
   <meta charset="utf-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
   <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.3.1/dist/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
-  <title>ait_sample_net</title>
+  <title>ait_model</title>
 </head>
 
 <style>
@@ -83,7 +83,7 @@
 
 <nav id="nav_bar" class="navbar fixed-top bg-light">
   <div class="container-fluid">
-    <a onclick="back_to_head()" class="navbar-brand">ait_sample_net</a>
+    <a onclick="back_to_head()" class="navbar-brand">ait_model</a>
     <div class="navbar-right">
         <div class="autocomplete" style="width:300px;">
         <input id="name_input" class="form-control me-2" type="search" placeholder="Search" aria-label="Search">
@@ -98,7 +98,7 @@
   src="https://code.jquery.com/jquery-3.6.0.js"
   integrity="sha256-H+K7U5CnXl1h5ywQfKtSj8PCmoN9aaq30gDh27Xc0jk="
   crossorigin="anonymous"></script>
-  
+
   <script src="https://cdn.jsdelivr.net/npm/popper.js@1.12.9/dist/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
   <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
   <script src="https://d3js.org/d3.v5.min.js"></script>
@@ -108,7 +108,7 @@
 
 
   <div id="graph" style="text-align: center;"></div>
-  
+
 <div class="modal fade" id="X_modal" tabindex="-1" role="dialog" aria-labelledby="X_label" aria-hidden="true">
   <div class="modal-dialog" role="document">
     <div class="modal-content">
@@ -119,7 +119,7 @@ <h5 class="modal-title" id="X_label">X</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -162,7 +162,7 @@ <h5 class="modal-title" id="gemm_rcr_bias_fast_gelu_0_label">gemm_rcr_bias_fast_
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -193,7 +193,7 @@ <h5 class="modal-title" id="gemm_rcr_bias_add_4_label">gemm_rcr_bias_add_4</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -224,7 +224,7 @@ <h5 class="modal-title" id="dense1_weight_label">dense1_weight</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -267,7 +267,7 @@ <h5 class="modal-title" id="dense1_bias_label">dense1_bias</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -310,7 +310,7 @@ <h5 class="modal-title" id="gemm_rcr_bias_fast_gelu_0_0_label">gemm_rcr_bias_fas
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -353,7 +353,7 @@ <h5 class="modal-title" id="dense2_weight_label">dense2_weight</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -396,7 +396,7 @@ <h5 class="modal-title" id="dense2_bias_label">dense2_bias</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -439,7 +439,7 @@ <h5 class="modal-title" id="elementwise_2_0_label">elementwise_2_0</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -482,7 +482,7 @@ <h5 class="modal-title" id="layernorm_3_label">layernorm_3</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -513,7 +513,7 @@ <h5 class="modal-title" id="layernorm_weight_label">layernorm_weight</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -556,7 +556,7 @@ <h5 class="modal-title" id="layernorm_bias_label">layernorm_bias</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -599,7 +599,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
         </button>
       </div>
       <div class="modal-body">
-        
+
 <table class="table">
   <thead class="thead-dark">
     <tr>
@@ -631,7 +631,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
     </div>
   </div>
 </div>
-  
+
 
   <script>
   items = ["X", "gemm_rcr_bias_fast_gelu_0", "gemm_rcr_bias_add_4", "dense1_weight", "dense1_bias", "gemm_rcr_bias_fast_gelu_0_0", "dense2_weight", "dense2_bias", "elementwise_2_0", "layernorm_3", "layernorm_weight", "layernorm_bias", "Y"];
@@ -798,7 +798,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
 layernorm_3 -> Y;
 }
 `;
-    var popover_data = {"X": "shape: [512, 1024]", "gemm_rcr_bias_fast_gelu_0": "op: gemm_rcr_bias_fast_gelu", "gemm_rcr_bias_add_4": "op: gemm_rcr_bias_add", "dense1_weight": "shape: [4096, 1024]", "dense1_bias": "shape: [4096]", "gemm_rcr_bias_fast_gelu_0_0": "shape: [512, 4096]", "dense2_weight": "shape: [1024, 4096]", "dense2_bias": "shape: [1024]", "elementwise_2_0": "shape: [512, 1024]", "layernorm_3": "op: layernorm", "layernorm_weight": "shape: [1024]", "layernorm_bias": "shape: [1024]", "Y": "shape: [512, 1024]"};
+    var popover_data = {"X": "shape: [512, 1024]", "gemm_rcr_bias_fast_gelu_0": "op_type: gemm_rcr_bias_fast_gelu", "gemm_rcr_bias_add_4": "op_type: gemm_rcr_bias_add", "dense1_weight": "shape: [4096, 1024]", "dense1_bias": "shape: [4096]", "gemm_rcr_bias_fast_gelu_0_0": "shape: [512, 4096]", "dense2_weight": "shape: [1024, 4096]", "dense2_bias": "shape: [1024]", "elementwise_2_0": "shape: [512, 1024]", "layernorm_3": "op_type: layernorm", "layernorm_weight": "shape: [1024]", "layernorm_bias": "shape: [1024]", "Y": "shape: [512, 1024]"};
     var graphviz = d3.select("#graph").graphviz();
     var pop_finish = 0;
     // var dotSrcLines;
@@ -815,7 +815,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
         obj.popover();
       }
     }
-  
+
 
     function render() {
       // console.log('DOT source =', dotSrc);
@@ -849,7 +849,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
         var shape = obj.find("polygon:first");
         var color = shape.attr("stroke");
         shape.attr("fill", color);
-        
+
       });
       nodes.on("mouseout", function() {
         var id = d3.select(this).attr("id");
@@ -862,5 +862,5 @@ <h5 class="modal-title" id="Y_label">Y</h5>
     }
     render(dotSrc);
   </script>
-  
-</body>
\ No newline at end of file
+
+</body>
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 9ed1101aa..00ebbfe7b 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -189,7 +189,9 @@ def plot_graph(tensors, file_path: str) -> None:
                 modal_set.append(_gen_op_modal(dst_op))
             dot_graph.add_edge(pydot.Edge(tensor_node, op_node))
 
-    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    file_dir = os.path.dirname(file_path)
+    if file_dir:
+        os.makedirs(file_dir, exist_ok=True)
 
     if ext == "html":
         basename = os.path.splitext(os.path.basename(file_path))[0]

From 0068e22aaa293ee79b93a970f6695c91a984d6dc Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 27 Feb 2023 09:46:31 -0800
Subject: [PATCH 187/638] Add RTX 40 to detect_target (#328)

Summary:
RTX 40 series has Compute Capability 8.9 - map it to 80 since cutlass [generator.py](https://github.com/AITemplate/cutlass/blob/master/tools/library/scripts/generator.py) has GenerateSM functions for arch 50, 60, 61, 70, 75, 80, 90.

Alternatively, we can use `nvidia-smi` or `pycuda` to get GPU Compute Capability and convert it to supported SM number - https://github.com/facebookincubator/AITemplate/pull/330

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/328

Reviewed By: aakhundov

Differential Revision: D43614336

Pulled By: chenyang78

fbshipit-source-id: ab9baae96cd95519fac03bb350e6c74506e160e3
---
 python/aitemplate/testing/detect_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 9df75a1bd..7c250d673 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -41,7 +41,7 @@ def _detect_cuda():
         stdout = stdout.decode("utf-8")
         if "H100" in stdout:
             return "90"
-        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30"]):
+        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30", "RTX 40"]):
             return "80"
         if "V100" in stdout:
             return "70"

From e8f17d1f9c8116b85ac945fa926f01da6d8c1536 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 27 Feb 2023 13:47:30 -0800
Subject: [PATCH 188/638] Fix typos (#332)

Summary:
Fixes:
- ouputs -> ou**t**puts
- attenion -> atten**t**ion
- vanillaAttentionTestCase -> **V**anillaAttentionTestCase

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/332

Reviewed By: khabinov, wushirong

Differential Revision: D43612459

Pulled By: tenpercent

fbshipit-source-id: 0411f6c284e4cb9f09ea978d6ab5f9233dcccf50
---
 examples/04_vit/benchmark_ait.py              |  4 +--
 examples/04_vit/verification.py               |  4 +--
 examples/05_stable_diffusion/src/benchmark.py |  8 ++---
 .../src/pipeline_stable_diffusion_ait.py      | 12 +++----
 .../pipeline_stable_diffusion_img2img_ait.py  | 12 +++----
 python/aitemplate/compiler/base.py            |  4 +--
 .../compiler/ops/attention/flash_attention.py |  2 +-
 .../ops/attention/mem_eff_attention.py        |  2 +-
 python/aitemplate/frontend/nn/attention.py    | 10 +++---
 .../nn/conv2d/common_conv2d_bias_act.py       |  2 +-
 .../nn/conv2d/transposed_conv2d_bias_act.py   |  2 +-
 python/aitemplate/frontend/nn/dual_gemm.py    |  2 +-
 .../frontend/nn/vanilla_attention.py          |  4 +--
 .../aitemplate/utils/mk_ck_lib/generator.py   | 36 +++++++++----------
 static/include/model_container.h              |  4 +--
 static/include/model_interface.h              |  2 +-
 tests/unittest/ops/test_attention.py          |  2 +-
 tests/unittest/ops/test_upsamping2d_add.py    |  4 +--
 tests/unittest/ops/test_vanilla_attention.py  |  4 +--
 19 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/examples/04_vit/benchmark_ait.py b/examples/04_vit/benchmark_ait.py
index 3d3eba043..f01475b98 100644
--- a/examples/04_vit/benchmark_ait.py
+++ b/examples/04_vit/benchmark_ait.py
@@ -132,8 +132,8 @@ def benchmark(model_name, batch_size, mod=None, graph_mode=True):
     # prepare input/output tensor
     inputs = [torch.randn([batch_size, img_size, img_size, 3]).cuda().half()]
     ys = []
-    num_ouputs = len(mod.get_output_name_to_index_map())
-    for i in range(num_ouputs):
+    num_outputs = len(mod.get_output_name_to_index_map())
+    for i in range(num_outputs):
         shape = mod.get_output_maximum_shape(i)
         ys.append(torch.empty(shape).cuda().half())
     # warm up
diff --git a/examples/04_vit/verification.py b/examples/04_vit/verification.py
index 5220b1213..8a84444e8 100644
--- a/examples/04_vit/verification.py
+++ b/examples/04_vit/verification.py
@@ -134,8 +134,8 @@ def verification(
 
     inputs = [input_pt.permute((0, 2, 3, 1)).contiguous()]
     ys = []
-    num_ouputs = len(ait_mod.get_output_name_to_index_map())
-    for i in range(num_ouputs):
+    num_outputs = len(ait_mod.get_output_name_to_index_map())
+    for i in range(num_outputs):
         shape = ait_mod.get_output_maximum_shape(i)
         ys.append(torch.empty(shape).cuda().half())
     ait_mod.run_with_tensors(inputs, ys)
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index c278dcf1e..5cac6a465 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -93,8 +93,8 @@ def benchmark_unet(
     }
 
     ys = []
-    num_ouputs = len(exe_module.get_output_name_to_index_map())
-    for i in range(num_ouputs):
+    num_outputs = len(exe_module.get_output_name_to_index_map())
+    for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
@@ -172,8 +172,8 @@ def benchmark_clip(
         "input1": position_ids,
     }
     ys = []
-    num_ouputs = len(exe_module.get_output_name_to_index_map())
-    for i in range(num_ouputs):
+    num_outputs = len(exe_module.get_output_name_to_index_map())
+    for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 5d560b206..7dace1275 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -129,8 +129,8 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
             "input2": encoder_hidden_states.cuda().half(),
         }
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
@@ -146,8 +146,8 @@ def clip_inference(self, input_ids, seqlen=64):
             "input1": position_ids,
         }
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
@@ -157,8 +157,8 @@ def vae_inference(self, vae_input):
         exe_module = self.vae_ait_exe
         inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
index 592260981..ad2885086 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
@@ -140,8 +140,8 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
             "input2": encoder_hidden_states.cuda().half(),
         }
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
@@ -157,8 +157,8 @@ def clip_inference(self, input_ids, seqlen=64):
             "input1": position_ids,
         }
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
@@ -168,8 +168,8 @@ def vae_inference(self, vae_input):
         exe_module = self.vae_ait_exe
         inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
         ys = []
-        num_ouputs = len(exe_module.get_output_name_to_index_map())
-        for i in range(num_ouputs):
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 95105783e..1a10d866a 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -286,7 +286,7 @@ class JaggedIntVar(IntVar):
 
     Because JaggedIntVar is an IntVar, it can be treated so by the AIT ops
     that are unaware of the jagged Tensor semantics. But the ops that are
-    aware can interpet the JaggedIntVar as the first dimension of the jagged
+    aware can interpret the JaggedIntVar as the first dimension of the jagged
     Tensor by specifically processing the underlying batch_dim and jagged_dims.
 
     If there is more than one JaggedDim in a JaggedIntVar, those jagged dimensions
@@ -890,7 +890,7 @@ def __rtruediv__(self, other: Any) -> Tensor:
 
 
 class DynamicProfileStrategy(Enum):
-    """Dynamic profiling stategy enum.
+    """Dynamic profiling strategy enum.
     Instances are used to select profiling strategy when there are dynamic dims.
     """
 
diff --git a/python/aitemplate/compiler/ops/attention/flash_attention.py b/python/aitemplate/compiler/ops/attention/flash_attention.py
index f81a59543..d0b8db9ed 100644
--- a/python/aitemplate/compiler/ops/attention/flash_attention.py
+++ b/python/aitemplate/compiler/ops/attention/flash_attention.py
@@ -60,7 +60,7 @@ class flash_attention(Operator):
     """
 
     def __init__(self, batch_size, dropout, max_seq_len, causal) -> None:
-        """initilize attention module"""
+        """Initialize attention module"""
         super().__init__()
         assert dropout == 0
         self._attrs["op"] = "flash_attention"
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index bc910e6b2..6703984a7 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -59,7 +59,7 @@ class mem_eff_attention(Operator):
     """
 
     def __init__(self, causal, dropout=0) -> None:
-        """initilize attention module"""
+        """Initialize attention module"""
         super().__init__()
         assert dropout == 0
         self._attrs["op"] = "mem_eff_attention"
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index f2272218e..db82bad1a 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -48,7 +48,7 @@ def __init__(
         causal=False,
         dtype="float16",
     ):
-        """Initilize attention module, create a tensor for seqlen"""
+        """Initialize attention module, create a tensor for seqlen"""
         super().__init__()
         self.cu_length = Parameter(shape=[batch_size + 1], dtype="int32")
         self.op = flash_attention(
@@ -79,7 +79,7 @@ class MultiheadAttention(Module):
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
 
     Args:
-        dim: toal dimension of the model
+        dim: total dimension of the model
         batch_size: batch size
         seq_len: sequence length
         num_heads: Number of parallel attention heads. Default: 8
@@ -166,7 +166,7 @@ def __init__(
                 )
         else:
             # on ROCM ck attention (bmm_softmax_bmm) takes three inputs (Q, K, V)
-            # here we generate packed QKV for spliting
+            # here we generate packed QKV for splitting
             # input: (B, seqlen, dim) -> (B*seqlen, dim)
             # gemm: (B*seqlen, 3*dim)
             # reshape to: (B, seqlen, 3, num_heads, head_dim)
@@ -218,7 +218,7 @@ def attention(self, x):
                 ops.reshape()(v, [b, -1, seqlen, d]),
             )
         else:
-            # intput(q/k/v): (B*num_heads, seqlen, head_dim)
+            # input(q/k/v): (B*num_heads, seqlen, head_dim)
             # attn = (B, S, H) * (B, S, H) = (B, S, S) #RCR
             # softmax on dim -1 (B, S, S)
             # attn@v: (B, S, S) * (B, S, H) = (B, S, H) #RRR
@@ -296,7 +296,7 @@ class CrossAttention(Module):
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
 
     Args:
-        dim: toal dimension of the model
+        dim: total dimension of the model
         batch_size: batch size
         seq_len: sequence length
         num_heads: Number of parallel attention heads. Default: 8
diff --git a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
index 276a360a7..96e5efe1b 100644
--- a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
@@ -37,7 +37,7 @@ def __init__(
         groups=1,
         dtype="float16",
     ):
-        """initilize the Conv2dBiasAct class
+        """Initialize the Conv2dBiasAct class
 
         Parameters
         ----------
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
index 368c64922..2dc54ab4e 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
@@ -37,7 +37,7 @@ def __init__(
         groups=1,
         dtype="float16",
     ):
-        """initilize the ConvTranspose2dBiasAct class
+        """Initialize the ConvTranspose2dBiasAct class
 
         Parameters
         ----------
diff --git a/python/aitemplate/frontend/nn/dual_gemm.py b/python/aitemplate/frontend/nn/dual_gemm.py
index 2ddf59d30..109aa7450 100644
--- a/python/aitemplate/frontend/nn/dual_gemm.py
+++ b/python/aitemplate/frontend/nn/dual_gemm.py
@@ -33,7 +33,7 @@ def __init__(
         fast_gelu=True,
         dtype="float16",
     ):
-        """Initilize dual gemm module, create a tensor for weights"""
+        """Initialize dual gemm module, create a tensor for weights"""
         super().__init__()
         self.w1 = Parameter(shape=[out_channels, in_channels], dtype=dtype)
         self.w2 = Parameter(shape=[out_channels, in_channels], dtype=dtype)
diff --git a/python/aitemplate/frontend/nn/vanilla_attention.py b/python/aitemplate/frontend/nn/vanilla_attention.py
index cf7e62592..b7ca8c309 100644
--- a/python/aitemplate/frontend/nn/vanilla_attention.py
+++ b/python/aitemplate/frontend/nn/vanilla_attention.py
@@ -97,7 +97,7 @@ class VanillaMultiheadAttention(Module):
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
 
     Args:
-        dim: toal dimension of the model
+        dim: total dimension of the model
         batch_size: batch size
         seq_len: sequence length
         num_heads: Number of parallel attention heads. Default: 8
@@ -212,7 +212,7 @@ class VanillaCrossAttention(Module):
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
 
     Args:
-        dim: toal dimension of the model
+        dim: total dimension of the model
         batch_size: batch size
         seq_len: sequence length
         num_heads: Number of parallel attention heads. Default: 8
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index 5a59cc185..bdc086ae8 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -600,7 +600,7 @@ def CreateGemmRCROperator(manifest):
     return operations
 
 
-def CreateGemmRCRBillinearOperator(manifest, c_element_op):
+def CreateGemmRCRBilinearOperator(manifest, c_element_op):
     operation_kind = library.GemmKind.Gemm
     a_element_desc = library.TensorDesc(
         library.DataType.f16, library.LayoutType.RowMajor
@@ -2075,14 +2075,14 @@ def GenerateTensorOp(manifest):
         library.TensorOperation.AddSigmoid,
         library.MemoryDataOperation.MemorySet,
     )
-    # TranposedConv2d
+    # TransposedConv2d
     CreateConv2dBwdOperator(
         manifest,
         library.Conv2dKind.TransposedConv2d,
         library.TensorOperation.PassThrough,
         library.MemoryDataOperation.MemorySet,
     )
-    # TranposedConv2dBiasRelu
+    # TransposedConv2dBiasRelu
     CreateConv2dBwdBiasOperator(
         manifest,
         library.Conv2dKind.TransposedConv2dBiasRelu,
@@ -2094,35 +2094,35 @@ def GenerateTensorOp(manifest):
     # GemmRCR
     CreateGemmRCROperator(manifest)
     # GemmRCRBias
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.Add)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.Add)
     # GemmRCRBiasRelu
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddRelu)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddRelu)
     # GemmRCRBiasTanh
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddTanh)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddTanh)
     # GemmRCRBiasTanh
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddFastGelu)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddFastGelu)
     # GemmRCRBiasSwish
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddHardswish)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddHardswish)
     # GemmRCRBiasSigmoid
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddSigmoid)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddSigmoid)
     # GemmRCRBiasAdd
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddAdd)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddAdd)
     # GemmRCRBiasMul
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddMul)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddMul)
     # GemmRCRBiasMul
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddMulTanh)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddMulTanh)
     # GemmRCRBiasAddRelu
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddAddRelu)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddAddRelu)
     # GemmRCRBiasAddAddRelu
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddAddAdd)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddAddAdd)
     # GemmRCRBiasAddAddRelu
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddAddAddRelu)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddAddAddRelu)
     # GemmRCRBiasSigmoidMul
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddSigmoidMul)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddSigmoidMul)
     # GemmRCRBiasSigmoidMulTanh
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddSigmoidMulTanh)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddSigmoidMulTanh)
     # GemmRCRBiasMulAdd
-    CreateGemmRCRBillinearOperator(manifest, library.TensorOperation.AddMulAdd)
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddMulAdd)
     # BmmRCR
     CreateBmmRCROperator(manifest)
     # BmmRRR
diff --git a/static/include/model_container.h b/static/include/model_container.h
index 56be22df4..04c026476 100644
--- a/static/include/model_container.h
+++ b/static/include/model_container.h
@@ -133,8 +133,8 @@ ModelContainer* CreateModelContainer(
 // to start up two inferences on different streams concurrently,
 // we can do this:
 //
-// model_container.Run(inputs0, num_inputs, outputs0, num_ouputs, stream0, ...);
-// model_container.Run(inputs1, num_inputs, outputs1, num_ouputs, stream1, ...);
+// model_container.Run(inputs0, n_inputs, outputs0, n_outputs, stream0, ...);
+// model_container.Run(inputs1, n_inputs, outputs1, n_outputs, stream1, ...);
 // StreamSynchronize(stream0);
 // StreamSynchronize(stream1);
 //
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 4cee8cf02..1415c5f2f 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -220,7 +220,7 @@ AIT_EXPORT AITemplateError AITemplateModelContainerBenchmark(
     AITemplateModelHandle handle,
     const AITData* inputs,
     size_t num_inputs,
-    AITData* ouputs,
+    AITData* outputs,
     size_t num_outputs,
     AITemplateStreamHandle stream_handle,
     bool graph_mode,
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 7099d2560..ecd77e2c3 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-Unittests for flash_attenion Operator.
+Unittests for flash_attention Operator.
 """
 import logging
 import math
diff --git a/tests/unittest/ops/test_upsamping2d_add.py b/tests/unittest/ops/test_upsamping2d_add.py
index 4bb5d2b11..a53fa51e4 100644
--- a/tests/unittest/ops/test_upsamping2d_add.py
+++ b/tests/unittest/ops/test_upsamping2d_add.py
@@ -85,8 +85,8 @@ def _test_single_op(
             r = torch.permute(R_pt, (0, 2, 3, 1)).contiguous()
             y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
             module.run_with_tensors({"input_0": x, "input_1": r}, [y])
-            y_tranpose = torch.permute(y, (0, 3, 1, 2))
-            self.assertTrue(torch.allclose(Y_pt, y_tranpose, atol=1e-2, rtol=1e-2))
+            y_transpose = torch.permute(y, (0, 3, 1, 2))
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     def test_bilinear_upsample_add_fp16(self):
         self._test_single_op(
diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
index 293bde6ad..bf1012edd 100644
--- a/tests/unittest/ops/test_vanilla_attention.py
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-Unittests for vanilla_attenion.
+Unittests for vanilla_attention.
 """
 import logging
 import math
@@ -96,7 +96,7 @@ def attention_pt(X_pt, W_pt, B_pt, nheads, d, seqlen):
     return Y_pt
 
 
-class vanillaAttentionTestCase(unittest.TestCase):
+class VanillaAttentionTestCase(unittest.TestCase):
     def _test_vanilla_attention(
         self,
         batch_size=16,

From 210edef0d702047932040691c28598eac68076da Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Tue, 28 Feb 2023 19:30:14 +0800
Subject: [PATCH 189/638] fix bugs

---
 fx2ait/fx2ait/converters/ait_converters.py | 6 +++---
 static/include/rocm_device_functions.h     | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index e225b4d4e..2826f60a1 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1175,8 +1175,8 @@ def _choose_conv2d_op(
     dilate: int,
     x: AITTensor,
     weight: AITTensor,
-    bias: [AITTensor],
-    transposed: [bool] = False,
+    bias: AITTensor,
+    transposed: bool = False,
 ) -> ConverterOutput:
     """
     Helper to choose conv2d vs. conv2d_bias op based on existence of bias
@@ -1307,7 +1307,7 @@ def _choose_conv3d_op(
     dilate: int,
     x: AITTensor,
     weight: AITTensor,
-    bias: [AITTensor],
+    bias: AITTensor,
     groups: int = 1,
 ) -> ConverterOutput:
     """
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 96817eef0..0e787a628 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -57,7 +57,7 @@ inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) {
       << "\n     Has 32-bit integer atomics for shared memory: "
       << (arch.hasSharedInt32Atomics ? "yes" : "no")
       << "\n     Has 32-bit float atomic exch for shared memory: "
-      << (arch.hasSharedFloatAtomicExch ? "yes" : "no"
+      << (arch.hasSharedFloatAtomicExch ? "yes" : "no")
       << "\n     Has 32-bit float atomic add in global and shared memory: "
       << (arch.hasFloatAtomicAdd ? "yes" : "no")
       << "\n     Has 64-bit integer atomics for global memory: "

From bd9aad7a444abd9580e70caa1e0367e5b98c51e3 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@gmail.com>
Date: Tue, 28 Feb 2023 20:20:20 +0800
Subject: [PATCH 190/638] fix fx2ait

---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 7558bff00..b75ec49cc 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -146,6 +146,17 @@ AITModelImpl::AITModelImpl(
   // It's not clear what stream we want to use yet. Create a new one.
   // We could alternatively use the default stream, but that could cause extra
   // synchronization.
+#ifdef __HIP_PLATFORM_HCC__
+  hipStream_t creation_stream;
+  TORCH_CHECK(
+      hipStreamCreateWithFlags(&creation_stream, hipStreamNonBlocking) ==
+      hipSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<hipStream_t>,
+      decltype(&hipStreamDestroy)>;
+  StreamGuard creation_stream_guard{creation_stream, hipStreamDestroy};
+#else
   cudaStream_t creation_stream;
   TORCH_CHECK(
       cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
@@ -155,6 +166,7 @@ AITModelImpl::AITModelImpl(
       std::remove_pointer_t<cudaStream_t>,
       decltype(&cudaStreamDestroy)>;
   StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
+#endif
 
 #define LOAD_SYMBOL(var, name_str)                                       \
   var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
@@ -618,6 +630,17 @@ void AITModelImpl::updateConstantsWithWeights(
         "failing this round of weight update");
     constants.emplace_back(torchToAitData(it->second));
   }
+#ifdef __HIP_PLATFORM_HCC__
+  hipStream_t constants_stream;
+  TORCH_CHECK(
+      hipStreamCreateWithFlags(&constants_stream, hipStreamNonBlocking) ==
+      hipSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<hipStream_t>,
+      decltype(&hipStreamDestroy)>;
+  StreamGuard constants_stream_guard{constants_stream, hipStreamDestroy};
+#else
   cudaStream_t constants_stream;
   TORCH_CHECK(
       cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
@@ -627,6 +650,7 @@ void AITModelImpl::updateConstantsWithWeights(
       std::remove_pointer_t<cudaStream_t>,
       decltype(&cudaStreamDestroy)>;
   StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
+#endif
   AIT_CHECK(setManyConstantsDoubleBufferFunc_(
       model_handle_,
       /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(constants_stream),

From c33cbd1e2a1248555846acc3bb21b878ef059d32 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 28 Feb 2023 13:57:53 +0800
Subject: [PATCH 191/638] fix bugs

---
 static/include/rocm_device_functions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 0e787a628..049f25070 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -67,9 +67,9 @@ inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) {
       << "\n     Has double-precision floating point: "
       << (arch.hasDoubles ? "yes" : "no")
       << "\n     Has warp vote instructions (__any, __all): "
-      << (arch.hasWarpVote: ? "yes" : "no")
+      << (arch.hasWarpVote ? "yes" : "no")
       << "\n     Has warp ballot instructions (__ballot): "
-      << (arch.hasWarpBallot: ? "yes" : "no")
+      << (arch.hasWarpBallot ? "yes" : "no")
       << "\n     Has warp shuffle operations. (__shfl_*): "
       << (arch.hasWarpShuffle ? "yes" : "no")
       << "\n     Has funnel two words into one with shift&mask caps: "

From 120ef41723b41139b2ee240f1302490970634c5f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 28 Feb 2023 14:29:15 +0800
Subject: [PATCH 192/638] fix a bug

---
 python/aitemplate/backend/profiler_runner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index f268c2ae3..f73b67567 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -313,8 +313,7 @@ def callback_when_done(fut):
                     )
             finally:
                 # unblock one future in `join()`
-                if not err:
-                    self._done_queue.put(stdout)
+                self._done_queue.put(stdout)
 
         future.add_done_callback(callback_when_done)
         self._futures.append(future)

From 312b34008f0a71ca4267eeeee6e7976c3ab90d38 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 28 Feb 2023 15:06:32 +0800
Subject: [PATCH 193/638] fix bbert bugs

---
 examples/03_bert/benchmark_ait.py | 4 ++--
 static/csrc/model_container.cpp   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 00257a3bf..6d711c805 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -25,8 +25,8 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
-from .modeling.bert import BertBaseEncodersOnly, BertBaseUncased
-from .modeling.torch_model import BertBaseUncased as BertPt
+from modeling.bert import BertBaseEncodersOnly, BertBaseUncased
+from modeling.torch_model import BertBaseUncased as BertPt
 
 
 def mark_output(y: Tensor) -> None:
diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index add4ea980..bf684b81a 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -401,9 +401,9 @@ void ModelContainer::SetConstantImpl(
           ". Check that the provided tensor's shape is correct.");
     }
   } else {
-    throw std::runtime_error(
-        std::string("Called SetConstant on ") + name +
-        std::string(" but can't find in either bound or unbound constant set"));
+    LOG(WARNING) << "Called SetConstant on " << name
+       << " but can't find in either bound or unbound constant set";
+    return;
   }
 
   auto* src = tensor.ptr;

From 41d57f5edd14aaf26a9d69cb4541753d4951866a Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Tue, 28 Feb 2023 09:43:21 -0800
Subject: [PATCH 194/638] Fix pip install wheel command in doc (#326)

Summary:
1. Doc says to install `dist/aitemplate-0.0.1-py3-none-any.whl` - this file name does not exist. Current wheel version is `0.1.dev1` - the file name is `aitemplate-0.1.dev1-py3-none-any.whl`.

It is probably better to use wildcard wheel name in pip install command.

~~2. Use RST syntax for git clone command. (Currently this command is mistakenly written in Markdown syntax and rendered incorrectly)~~ That was done in [PR334](https://github.com/facebookincubator/AITemplate/pull/334) already

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/326

Reviewed By: aakhundov

Differential Revision: D43614325

Pulled By: chenyang78

fbshipit-source-id: 8e1821998dc79484131d2da52556dd42912f8e88
---
 docs/source/install/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst
index 2528bd036..6e684ab03 100644
--- a/docs/source/install/index.rst
+++ b/docs/source/install/index.rst
@@ -71,4 +71,4 @@ Then build the Python wheel package and install it:
 
         cd python
         python setup.py bdist_wheel
-        pip install dist/aitemplate-0.0.1-py3-none-any.whl
+        pip install dist/aitemplate-*.whl

From 492006a59c2c646d93b7286c13876d76d6955e13 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Tue, 28 Feb 2023 10:07:50 -0800
Subject: [PATCH 195/638] Consolidate bmm_xxx and bmm_xxx_add codegens (#341)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/341

As a part of T144417069, we need to add `bmm_xxc` and `bmm_xxc_add` ops. Currently codegens for each of existing 4 `bmm_xxr` ops are defined separately. In order not to have 8 separate almost identical files after adding `bmm_xxc`, this diff consolidates `bmm_xxx` codegens: now they are all defined in `bmm_xxx.py` in a loop over 8 possible layout configurations. Same is done for `bmm_xxx_add` in `bmm_xxx_add.py`.

A follow-up diff would do the same on the ops side.

Reviewed By: aakhundov

Differential Revision: D43621471

fbshipit-source-id: d830bdcf281218c15335f28627649ab3981ecc3a
---
 .../backend/cuda/gemm_universal/__init__.py   |  10 +-
 .../backend/cuda/gemm_universal/bmm_ccr.py    | 152 -----------------
 .../cuda/gemm_universal/bmm_ccr_add.py        | 127 --------------
 .../backend/cuda/gemm_universal/bmm_crr.py    | 124 --------------
 .../cuda/gemm_universal/bmm_crr_add.py        | 116 -------------
 .../backend/cuda/gemm_universal/bmm_rcr.py    | 113 -------------
 .../backend/cuda/gemm_universal/bmm_rrr.py    | 155 ------------------
 .../cuda/gemm_universal/bmm_rrr_add.py        | 130 ---------------
 .../backend/cuda/gemm_universal/bmm_xxx.py    | 144 ++++++++++++++++
 .../cuda/gemm_universal/bmm_xxx_add.py        | 136 +++++++++++++++
 10 files changed, 282 insertions(+), 925 deletions(-)
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
 delete mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py

diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
index 13e978dcd..cebb32746 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
@@ -14,16 +14,10 @@
 #
 # flake8: noqa
 from . import (
-    bmm_ccr,
-    bmm_ccr_add,
-    bmm_crr,
-    bmm_crr_add,
-    bmm_rcr,
     bmm_rcr_permute,
-    bmm_rrr,
-    bmm_rrr_add,
     bmm_rrr_permute,
-    gemm_rcr,
+    bmm_xxx,
+    bmm_xxx_add,
     gemm_rcr_bias,
     gemm_rcr_bias_elementwise,
     gemm_rcr_bias_fast_gelu,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
deleted file mode 100644
index c75243fee..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Codegen for bmm_ccr, which computes A @ B + bias.
-A[ColMajor], B[ColMajor], bias[RowMajor]
-"""
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-def _get_problem_info(**kwargs):
-    problem_args = {
-        "bias_ptr": "c_ptr",
-        "a_batch_stride": "M * K",
-        "b_batch_stride": "N * K",
-        "bias_batch_stride": "M * N",
-        "c_batch_stride": "M * N",
-        "lda": "M",
-        "ldb": "K",
-        "ldbias": "N",
-        "ldc": "N",
-    }
-    for k, v in kwargs.items():
-        problem_args[k] = v
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args)
-    return bmm_problem_info
-
-
-@registry.reg("cuda.bmm_ccr.config")
-def bmm_ccr_config(func_attrs, dtype="float16"):
-    def fproc(op):
-        import cutlass_lib
-
-        return common.default_fproc(
-            op=op,
-            a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
-            b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
-            c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
-        )
-
-    func_attrs["op_instance"] = common.extract_config(fproc)
-
-
-@registry.reg("cuda.bmm_ccr.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = _get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
-    )
-
-
-@registry.reg("cuda.bmm_ccr.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    mm_info = _get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-    )
-
-
-@registry.reg("cuda.bmm_ccr.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_ccr.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_ccr.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
deleted file mode 100644
index fe8e605f0..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Codegen for bmm_ccr_add, which computes A @ B + bias + C.
-A[ColMajor], B[ColMajor], bias / C[RowMajor]
-"""
-from ... import registry
-from ...common import gemm_common
-from . import bmm_ccr, bmm_common, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-@registry.reg("cuda.bmm_ccr_add.config")
-def bmm_ccr_add_config(func_attrs, dtype="float16"):
-    return bmm_ccr.bmm_ccr_config(func_attrs, dtype)
-
-
-@registry.reg("cuda.bmm_ccr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = bmm_ccr._get_problem_info(
-        bias_ptr="(d_ptr)",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
-    )
-
-
-@registry.reg("cuda.bmm_ccr_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    mm_info = bmm_ccr._get_problem_info(
-        bias_ptr="(d_ptr)",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-    )
-
-
-@registry.reg("cuda.bmm_ccr_add.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_ccr_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_ccr_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
deleted file mode 100644
index 5f85b482a..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Codegen for bmm_crr, which computes A @ B + bias.
-A[ColMajor], B[RowMajor], bias[RowMajor]
-"""
-
-from ... import registry
-from . import bmm_common, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-PROBLEM_ARGS = {
-    "bias_ptr": "c_ptr",
-    "a_batch_stride": "M * K",
-    "b_batch_stride": "N * K",
-    "bias_batch_stride": "M * N",
-    "c_batch_stride": "M * N",
-    "lda": "M",
-    "ldb": "N",
-    "ldbias": "N",
-    "ldc": "N",
-}
-
-
-@registry.reg("cuda.bmm_crr.config")
-def bmm_crr_config(func_attrs, dtype="float16"):
-    def fproc(op):
-        import cutlass_lib
-
-        return common.default_fproc(
-            op=op,
-            a_layout=cutlass_lib.library.LayoutType.ColumnMajor,
-            b_layout=cutlass_lib.library.LayoutType.RowMajor,
-            c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
-        )
-
-    func_attrs["op_instance"] = common.extract_config(fproc)
-
-
-@registry.reg("cuda.bmm_crr.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return bmm_common.default_gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS,
-    )
-
-
-@registry.reg("cuda.bmm_crr.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    default_mm_info = bmm_common.get_default_problem_info(
-        PROBLEM_ARGS,
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    (
-        problem_args,
-        input_addr_calculator,
-        output_addr_calculator,
-    ) = bmm_common.make_function_strided_args(
-        func_attrs, dim_info_dict, default_mm_info, is_permute=False
-    )
-
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-        input_addr_calculator=input_addr_calculator,
-        output_addr_calculator=output_addr_calculator,
-    )
-
-
-@registry.reg("cuda.bmm_crr.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_crr.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_crr.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
deleted file mode 100644
index 23f818a50..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Codegen for bmm_crr_add, which computes A @ B + bias + C.
-A[ColMajor], B[RowMajor], bias / C[RowMajor]
-"""
-
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, bmm_crr, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-@registry.reg("cuda.bmm_crr_add.config")
-def bmm_crr_add_config(func_attrs, dtype="float16"):
-    return bmm_crr.bmm_crr_config(func_attrs, dtype)
-
-
-@registry.reg("cuda.bmm_crr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    default_mm_info = bmm_common.get_default_problem_info(
-        bmm_crr.PROBLEM_ARGS,
-        bias_ptr="d_ptr",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(default_mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=default_mm_info,
-    )
-
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
-    )
-
-
-@registry.reg("cuda.bmm_crr_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    default_mm_info = bmm_common.get_default_problem_info(
-        bmm_crr.PROBLEM_ARGS,
-        bias_ptr="d_ptr",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    (
-        problem_args,
-        input_addr_calculator,
-        output_addr_calculator,
-    ) = bmm_common.make_function_strided_args(
-        func_attrs, dim_info_dict, default_mm_info, is_permute=False
-    )
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-        input_addr_calculator=input_addr_calculator,
-        output_addr_calculator=output_addr_calculator,
-    )
-
-
-@registry.reg("cuda.bmm_crr_add.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_crr_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_crr_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
deleted file mode 100644
index 866ca7921..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Codegen for bmm_rcr, which computes A @ B + bias.
-A[RowMajor], B[ColMajor], bias[RowMajor]
-"""
-
-from ... import registry
-from . import bmm_common, common
-from .layout import RCR
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-PROBLEM_ARGS = {
-    "bias_ptr": "c_ptr",
-    "a_batch_stride": "M * K",
-    "b_batch_stride": "N * K",
-    "bias_batch_stride": "M * N",
-    "c_batch_stride": "M * N",
-    "lda": "K",
-    "ldb": "K",
-    "ldbias": "N",
-    "ldc": "N",
-}
-
-
-@registry.reg("cuda.bmm_rcr.config")
-def bmm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc(func_attrs, RCR)
-
-
-@registry.reg("cuda.bmm_rcr.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    return bmm_common.default_gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS,
-    )
-
-
-@registry.reg("cuda.bmm_rcr.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    default_mm_info = bmm_common.get_default_problem_info(
-        PROBLEM_ARGS,
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    (
-        problem_args,
-        input_addr_calculator,
-        output_addr_calculator,
-    ) = bmm_common.make_function_strided_args(
-        func_attrs, dim_info_dict, default_mm_info, is_permute=False
-    )
-
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-        input_addr_calculator,
-        output_addr_calculator,
-    )
-
-
-@registry.reg("cuda.bmm_rcr.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_rcr.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_rcr.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
deleted file mode 100644
index ded795721..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py
+++ /dev/null
@@ -1,155 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Codegen for bmm_rrr, which computes A @ B + bias.
-A[RowMajor], B[RowMajor], bias / C[RowMajor]
-"""
-
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-def _get_problem_info(**kwargs):
-    problem_args = {
-        "bias_ptr": "c_ptr",
-        "a_batch_stride": "M * K",
-        "b_batch_stride": "N * K",
-        "bias_batch_stride": "M * N",
-        "c_batch_stride": "M * N",
-        "lda": "K",
-        "ldb": "N",
-        "ldbias": "N",
-        "ldc": "N",
-    }
-    for k, v in kwargs.items():
-        problem_args[k] = v
-
-    bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args)
-    return bmm_problem_info
-
-
-@registry.reg("cuda.bmm_rrr.config")
-def bmm_rrr_config(func_attrs, dtype="float16"):
-    def fproc(op):
-        import cutlass_lib
-
-        return common.default_fproc(
-            op=op,
-            a_layout=cutlass_lib.library.LayoutType.RowMajor,
-            b_layout=cutlass_lib.library.LayoutType.RowMajor,
-            c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
-        )
-
-    func_attrs["op_instance"] = common.extract_config(fproc)
-
-
-@registry.reg("cuda.bmm_rrr.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = _get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
-    )
-
-
-@registry.reg("cuda.bmm_rrr.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    mm_info = _get_problem_info(
-        alpha_value=func_attrs.get("alpha", 1),
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-    )
-
-
-@registry.reg("cuda.bmm_rrr.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_rrr.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_rrr.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
deleted file mode 100644
index 44fbda070..000000000
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Codegen for bmm_rrr_add, which computes A @ B + bias + C.
-A[RowMajor], B[RowMajor], bias / C[RowMajor]
-"""
-
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, bmm_rrr, common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-
-@registry.reg("cuda.bmm_rrr_add.config")
-def bmm_rrr_add_config(func_attrs, dtype="float16"):
-    return bmm_rrr.bmm_rrr_config(func_attrs, dtype)
-
-
-@registry.reg("cuda.bmm_rrr_add.gen_profiler")
-def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
-    a_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 0
-    )
-    b_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.INPUT, 1
-    )
-    c_dims = bmm_common.reverse_dim_info_mapping(
-        dim_info_dict, gemm_common.Source.OUTPUT, 0
-    )
-
-    args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
-        a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
-    )
-
-    mm_info = bmm_rrr._get_problem_info(
-        bias_ptr="d_ptr",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
-    )
-
-
-@registry.reg("cuda.bmm_rrr_add.gen_function")
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-):
-    mm_info = bmm_rrr._get_problem_info(
-        bias_ptr="d_ptr",
-        alpha_value=func_attrs.get("alpha", 1),
-        beta_value=1,
-    )
-    a_shapes = func_attrs["input_accessors"][0].original_shapes
-    b_shapes = func_attrs["input_accessors"][1].original_shapes
-    d_shapes = func_attrs["input_accessors"][2].original_shapes
-    bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes)
-
-    problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
-        mm_info=mm_info,
-    )
-
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-    )
-
-
-@registry.reg("cuda.bmm_rrr_add.func_decl")
-def gen_function_decl(func_attrs):
-    return bmm_common.gen_function_decl(func_attrs)
-
-
-@registry.reg("cuda.bmm_rrr_add.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    return bmm_common.gen_function_call(func_attrs, indent)
-
-
-@registry.reg("cuda.bmm_rrr_add.filter")
-def function_filter(cfg, func_attrs, ab_alignment):
-    """Generates function filter.
-
-    Parameters
-    ----------
-    cfg: str
-        The filename generated for profiler.
-    func_attrs : Dict
-        Stores the operation attributes.
-    ab_alignment:
-        Input alignments.
-
-    Returns
-    -------
-    bool
-        If input cfg should be filtered.
-    """
-    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
new file mode 100644
index 000000000..7e9497433
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
@@ -0,0 +1,144 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from ... import registry
+from . import bmm_common, common
+
+"""
+Codegen for 8 bmm_xxx ops, which compute A @ B + bias. The ops differ in
+layouts of A, B, and bias: each can be column-major or row-major,
+8 combinations in total.
+
+This module registers functions config, gen_profiler, gen_function, func_decl,
+func_call, and filter for each layout combination under names like
+"cuda.bmm_rcr.func_call".
+"""
+
+
+def _get_problem_args(a_layout, b_layout, c_layout):
+    return {
+        "bias_ptr": "c_ptr",
+        "a_batch_stride": "M * K",
+        "b_batch_stride": "N * K",
+        "bias_batch_stride": "M * N",
+        "c_batch_stride": "M * N",
+        "lda": "M" if a_layout == "c" else "K",
+        "ldb": "K" if b_layout == "c" else "N",
+        "ldbias": "M" if c_layout == "c" else "N",
+        "ldc": "M" if c_layout == "c" else "N",
+    }
+
+
+def get_config(a_layout, b_layout, c_layout):
+    """
+    Return config function for given layouts of A, B, and bias.
+    """
+
+    def config(func_attrs, dtype="float16"):
+        import cutlass_lib
+
+        layout_choice = {
+            "c": cutlass_lib.library.LayoutType.ColumnMajor,
+            "r": cutlass_lib.library.LayoutType.RowMajor,
+        }
+
+        def fproc(op):
+            return common.default_fproc(
+                op=op,
+                a_layout=layout_choice[a_layout],
+                b_layout=layout_choice[b_layout],
+                c_layout=layout_choice[c_layout],
+                dtype=func_attrs["inputs"][0].dtype(),
+                epiligue_name=func_attrs["epilogue"],
+            )
+
+        func_attrs["op_instance"] = common.extract_config(fproc)
+
+    return config
+
+
+def get_gen_profiler(a_layout, b_layout, c_layout):
+    """
+    Return gen_profiler for given layouts of A, B, and bias.
+    """
+
+    def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+        problem_args = _get_problem_args(a_layout, b_layout, c_layout)
+        return bmm_common.default_gen_profiler(
+            func_attrs,
+            workdir,
+            profiler_filename,
+            dim_info_dict,
+            problem_args,
+        )
+
+    return gen_profiler
+
+
+def get_gen_function(a_layout, b_layout, c_layout):
+    """
+    Return gen_function for given layouts of A, B, and bias.
+    """
+
+    def gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+    ):
+        problem_args = _get_problem_args(a_layout, b_layout, c_layout)
+
+        default_mm_info = bmm_common.get_default_problem_info(
+            problem_args,
+            alpha_value=func_attrs.get("alpha", 1),
+        )
+        (
+            problem_args,
+            input_addr_calculator,
+            output_addr_calculator,
+        ) = bmm_common.make_function_strided_args(
+            func_attrs, dim_info_dict, default_mm_info, is_permute=False
+        )
+
+        return bmm_common.gen_function(
+            func_attrs,
+            exec_cond_template,
+            problem_args,
+            dim_info_dict,
+            input_addr_calculator=input_addr_calculator,
+            output_addr_calculator=output_addr_calculator,
+        )
+
+    return gen_function
+
+
+# Register functions for each of 8 layout combinations
+for a_layout in ["c", "r"]:
+    for b_layout in ["c", "r"]:
+        for c_layout in ["c", "r"]:
+            prefix = f"cuda.bmm_{a_layout}{b_layout}{c_layout}."
+
+            config = get_config(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "config")(config)
+
+            gen_profiler = get_gen_profiler(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "gen_profiler")(gen_profiler)
+
+            gen_function = get_gen_function(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "gen_function")(gen_function)
+
+            # The remaining 3 functions don't depend on the layout
+            registry.reg(prefix + "func_decl")(bmm_common.gen_function_decl)
+            registry.reg(prefix + "func_call")(bmm_common.gen_function_call)
+            registry.reg(prefix + "filter")(common.function_filter)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
new file mode 100644
index 000000000..aae526481
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
@@ -0,0 +1,136 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Codegen for 8 bmm_xxx_add ops, which compute A @ B + bias + C. The ops differ
+in layouts of A, B, and C/bias: each can be column-major or row-major,
+8 combinations in total.
+
+This module registers functions config, gen_profiler, gen_function, func_decl,
+func_call, and filter for each layout combination under names like
+"cuda.bmm_rcr_add.func_call".
+"""
+
+
+from ... import registry
+from ...common import gemm_common
+from . import bmm_common, common
+from .bmm_xxx import _get_problem_args, get_config
+
+
+def get_gen_function(a_layout, b_layout, c_layout):
+    """
+    Return gen_function for given layouts of A, B, and C/bias.
+    """
+
+    def gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+    ):
+        problem_args = _get_problem_args(a_layout, b_layout, c_layout)
+        default_mm_info = bmm_common.get_default_problem_info(
+            problem_args,
+            bias_ptr="d_ptr",
+            alpha_value=func_attrs.get("alpha", 1),
+            beta_value=1,
+        )
+        (
+            problem_args,
+            input_addr_calculator,
+            output_addr_calculator,
+        ) = bmm_common.make_function_strided_args(
+            func_attrs, dim_info_dict, default_mm_info, is_permute=False
+        )
+        return bmm_common.gen_function(
+            func_attrs,
+            exec_cond_template,
+            problem_args,
+            dim_info_dict,
+            input_addr_calculator=input_addr_calculator,
+            output_addr_calculator=output_addr_calculator,
+        )
+
+    return gen_function
+
+
+def get_gen_profiler(a_layout, b_layout, c_layout):
+    """
+    Return gen_profiler for given layouts of A, B, and C/bias.
+    """
+
+    def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+        a_dims = bmm_common.reverse_dim_info_mapping(
+            dim_info_dict, gemm_common.Source.INPUT, 0
+        )
+        b_dims = bmm_common.reverse_dim_info_mapping(
+            dim_info_dict, gemm_common.Source.INPUT, 1
+        )
+        c_dims = bmm_common.reverse_dim_info_mapping(
+            dim_info_dict, gemm_common.Source.OUTPUT, 0
+        )
+
+        args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render(
+            a_dims=a_dims, b_dims=b_dims, c_dims=c_dims
+        )
+
+        problem_args = _get_problem_args(a_layout, b_layout, c_layout)
+        default_mm_info = bmm_common.get_default_problem_info(
+            problem_args,
+            bias_ptr="d_ptr",
+            alpha_value=func_attrs.get("alpha", 1),
+            beta_value=1,
+        )
+        a_shapes = func_attrs["input_accessors"][0].original_shapes
+        b_shapes = func_attrs["input_accessors"][1].original_shapes
+        d_shapes = func_attrs["input_accessors"][2].original_shapes
+        bmm_common._update_stride_info(default_mm_info, a_shapes, b_shapes, d_shapes)
+
+        problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
+            mm_info=default_mm_info,
+        )
+
+        return bmm_common.gen_profiler(
+            func_attrs,
+            workdir,
+            profiler_filename,
+            dim_info_dict,
+            common.SRC_TEMPLATE,
+            problem_args,
+            args_parser,
+        )
+
+    return gen_profiler
+
+
+# Register functions for each of 8 layout combinations
+for a_layout in ["c", "r"]:
+    for b_layout in ["c", "r"]:
+        for c_layout in ["c", "r"]:
+            prefix = f"cuda.bmm_{a_layout}{b_layout}{c_layout}_add."
+
+            config = get_config(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "config")(config)
+
+            gen_profiler = get_gen_profiler(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "gen_profiler")(gen_profiler)
+
+            gen_function = get_gen_function(a_layout, b_layout, c_layout)
+            registry.reg(prefix + "gen_function")(gen_function)
+
+            # The remaining 3 functions don't depend on the layout
+            registry.reg(prefix + "func_decl")(bmm_common.gen_function_decl)
+            registry.reg(prefix + "func_call")(bmm_common.gen_function_call)
+            registry.reg(prefix + "filter")(common.function_filter)

From b97e7ac8b27c0f36aeba78d71f904ac2225c9b99 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Feb 2023 11:09:47 -0800
Subject: [PATCH 196/638] Set version to 0.3.dev0 (#344)

Summary:
AIT 0.2 was released last month.

Current dev version should be 0.3.dev0

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/344

Reviewed By: khabinov

Differential Revision: D43666688

Pulled By: tenpercent

fbshipit-source-id: fe87d0ca284cb09134c67e54cb28966e6e7e606c
---
 python/aitemplate/_libinfo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/_libinfo.py b/python/aitemplate/_libinfo.py
index ca4d89ecb..c324afca9 100644
--- a/python/aitemplate/_libinfo.py
+++ b/python/aitemplate/_libinfo.py
@@ -14,4 +14,4 @@
 #
 # current version
 # We use the version of the incoming release for code
-__version__ = "0.1.dev1"
+__version__ = "0.3.dev0"

From a7ebe3c2222da1bd8f55df6312901939e8a808af Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 28 Feb 2023 20:03:30 -0800
Subject: [PATCH 197/638] Fix test_strided_scatter.py to use correct IntVar
 (#321)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/321

Fix test_strided_scatter.py to use IntVar with explicit names

Reviewed By: chenyang78

Differential Revision: D42967155

fbshipit-source-id: 2924ef7fb7c6921b10247727090070b8452e26ed
---
 .../unittest/compiler/test_strided_scatter.py | 89 ++++++++++++-------
 1 file changed, 55 insertions(+), 34 deletions(-)

diff --git a/tests/unittest/compiler/test_strided_scatter.py b/tests/unittest/compiler/test_strided_scatter.py
index 64187ef24..9b7284d1f 100644
--- a/tests/unittest/compiler/test_strided_scatter.py
+++ b/tests/unittest/compiler/test_strided_scatter.py
@@ -20,13 +20,14 @@
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-from aitemplate.frontend import IntImm, Tensor
+from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
-from aitemplate.utils import graph_utils, shape_utils
+from aitemplate.utils import graph_utils
+from aitemplate.utils.shape_utils import gen_int_var_min_max as gen_IntVar
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -43,10 +44,7 @@ def _make_tensor(
         input_name,
         input_type="float16",
     ):
-        x_shape = [
-            shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
-            for d in input_shape
-        ]
+        x_shape = [d if isinstance(d, IntVar) else IntImm(d) for d in input_shape]
         X = Tensor(shape=x_shape, dtype=input_type, name=input_name, is_input=True)
         return X
 
@@ -54,8 +52,7 @@ def _make_add(
         self, input_shape, input_0_name, input_1_name, output_name, input_type="float16"
     ):
         input_add_shape = [
-            shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
-            for d in input_shape
+            d if isinstance(d, IntVar) else IntImm(d) for d in input_shape
         ]
         input_Add_0 = Tensor(
             shape=input_add_shape,
@@ -244,8 +241,8 @@ def _test_strided_scatter_dynamic(
         self.assertEqual(fused_op._attrs["op"], "concatenate")
 
         for d in input_shapes[0]:
-            if isinstance(d, list):
-                Ms = d
+            if isinstance(d, IntVar):
+                Ms = d._attrs["values"]
                 break
         assert Ms is not None, "expected to have at least one dynamic dim"
         for idx in range(len(Ms)):
@@ -256,7 +253,8 @@ def _test_strided_scatter_dynamic(
                 input_shapes, start_indices, end_indices
             ):
                 input_shape_pt = [
-                    d[idx] if isinstance(d, list) else d for d in input_shape
+                    d._attrs["values"][idx] if isinstance(d, IntVar) else d
+                    for d in input_shape
                 ]
                 x_pt = get_random_torch_tensor(input_shape_pt, dtype)
                 xs_pt.append(x_pt)
@@ -276,15 +274,20 @@ def _test_strided_scatter_dynamic(
             self.test_count += 1
 
     def test_strided_scatter_dynamic(self):
+        dynamic_dim_1 = gen_IntVar([5, 16], name="dynamic_dim_1")
         self._test_strided_scatter_dynamic(
-            input_shapes=([[5, 16], 5], [[5, 16], 10]),
+            input_shapes=([dynamic_dim_1, 5], [dynamic_dim_1, 10]),
             start_indices=([0, 1], [0, 2]),
             end_indices=([None, 3], [None, 10]),
             scatter_dim=1,
             test_name="strided_scatter_dynamic",
         )
+        dynamic_dim_2 = gen_IntVar([10, 20], name="dynamic_dim_2")
         self._test_strided_scatter_dynamic(
-            input_shapes=([[5, 16], [10, 20], 4], [[5, 16], [10, 20], 10]),
+            input_shapes=(
+                [dynamic_dim_1, dynamic_dim_2, 4],
+                [dynamic_dim_1, dynamic_dim_2, 10],
+            ),
             start_indices=([0, 0, 2], [0, 0, 2]),
             end_indices=([None, None, 4], [None, None, 10]),
             scatter_dim=2,
@@ -292,19 +295,22 @@ def test_strided_scatter_dynamic(self):
         )
 
     def test_strided_scatter_partial(self):
+        dynamic_dim_1 = gen_IntVar([5, 16], name="dynamic_dim_1")
         self._test_strided_scatter_dynamic(
-            input_shapes=([[5, 16], 5], [[5, 16], 10]),
+            input_shapes=([dynamic_dim_1, 5], [dynamic_dim_1, 10]),
             start_indices=([0, 1], [0, 2]),
             end_indices=([None, 3], [None, 10]),
             scatter_dim=1,
             test_name="strided_scatter_partial",
             make_slices=[True, False],
         )
+        dynamic_dim_2 = gen_IntVar([5, 7], name="dynamic_dim_2")
+        dynamic_dim_3 = gen_IntVar([1, 10], name="dynamic_dim_3")
         self._test_strided_scatter_dynamic(
             input_shapes=(
-                [[5, 7], [1, 10], 4],
-                [[5, 7], [1, 10], 6],
-                [[5, 7], [1, 10], 8],
+                [dynamic_dim_2, dynamic_dim_3, 4],
+                [dynamic_dim_2, dynamic_dim_3, 6],
+                [dynamic_dim_2, dynamic_dim_3, 8],
             ),
             start_indices=([0, 0, 2], [0, 0, 4], [0, 0, 6]),
             end_indices=([None, None, 4], [None, None, 6], [None, None, 8]),
@@ -314,14 +320,14 @@ def test_strided_scatter_partial(self):
         )
         self._test_strided_scatter_dynamic(
             input_shapes=(
-                [[5, 7], [1, 10], 4],
-                [[5, 7], [1, 10], 6],
-                [[5, 7], [1, 10], 8],
+                [dynamic_dim_2, dynamic_dim_3, 4],
+                [dynamic_dim_2, dynamic_dim_3, 6],
+                [dynamic_dim_2, dynamic_dim_3, 8],
             ),
             start_indices=([0, 0, 2], [0, 0, 4], [0, 0, 6]),
             end_indices=([None, None, 4], [None, None, 6], [None, None, 8]),
             scatter_dim=2,
-            test_name="strided_scatter_partial",
+            test_name="strided_scatter_partial_1",
             make_slices=[False, False, True],
         )
 
@@ -430,7 +436,7 @@ def test_strided_scatter_multi_dsts_2(self):
             start_indices=[[0, 1, 0], [0, 1, 0]],
             end_indices=[[None, 2, None], [None, 7, None]],
             scatter_dim=1,
-            test_name="strided_scatter_partial",
+            test_name="strided_scatter_multi_dsts_2",
         )
 
     def _test_strided_scatter_input_masks(
@@ -452,8 +458,9 @@ def _test_strided_scatter_input_masks(
             f"{start_indices=}, {end_indices=}"
         )
 
+        Ms_IntVar = gen_IntVar(Ms, name="Ms")
         input_A_name = "input_a"
-        input_A = self._make_tensor([list(Ms), K], input_A_name, dtype)
+        input_A = self._make_tensor([Ms_IntVar, K], input_A_name, dtype)
         input_B_name = "input_b"
         input_B = self._make_tensor([N, K], input_B_name, dtype)
         input_Bias_name = "input_bias"
@@ -464,7 +471,7 @@ def _test_strided_scatter_input_masks(
         input_Add_0_name = "input_add_0"
         input_Add_1_name = "input_add_1"
         add_output = self._make_add(
-            [list(Ms), N], input_Add_0_name, input_Add_1_name, "add_output", dtype
+            [Ms_IntVar, N], input_Add_0_name, input_Add_1_name, "add_output", dtype
         )
         # A, B, bias, add_0 and add_1
         num_extra_inputs = 5
@@ -533,7 +540,8 @@ def _test_strided_scatter_input_masks(
                 input_shapes, start_indices, end_indices
             ):
                 input_shape_pt = [
-                    d[idx] if isinstance(d, list) else d for d in input_shape
+                    d._attrs["values"][idx] if isinstance(d, IntVar) else d
+                    for d in input_shape
                 ]
                 x_pt = get_random_torch_tensor(input_shape_pt, dtype)
                 xs_pt.append(x_pt)
@@ -560,11 +568,13 @@ def _test_strided_scatter_input_masks(
 
     def test_strided_scatter_input_masks(self):
         # gemm_output[Ms, N]
+        # This dynamic_dim_1 is actually the same as Ms..
+        dynamic_dim_1 = gen_IntVar([5, 16], name="Ms")
         self._test_strided_scatter_input_masks(
             Ms=(5, 16),
             N=4,
             K=10,
-            input_shapes=([[5, 16], 5], [[5, 16], 10]),
+            input_shapes=([dynamic_dim_1, 5], [dynamic_dim_1, 10]),
             start_indices=([0, 1], [0, 2]),
             end_indices=([None, 3], [None, 10]),
             scatter_dim=1,
@@ -575,7 +585,7 @@ def test_strided_scatter_input_masks(self):
             Ms=(5, 16),
             N=4,
             K=10,
-            input_shapes=([[5, 16], 5], [[5, 16], 10]),
+            input_shapes=([dynamic_dim_1, 5], [dynamic_dim_1, 10]),
             start_indices=([0, 1], [0, 2]),
             end_indices=([None, 2], [None, 10]),
             scatter_dim=1,
@@ -685,8 +695,11 @@ def _test_strided_scatter_input_masks_2(
 
         add_0_input_name_0 = "add_0_input_0"
         add_0_input_name_1 = "add_0_input_1"
+        Ms0_IntVar = gen_IntVar(Ms0, name="Ms0")
+        # This is not ideal, we should have "check" against the start/end/scatter_dim when we construct the test case instead.
+        Ms1_IntVar = Ms0_IntVar if Ms0 == Ms1 else gen_IntVar(Ms1, name="Ms1")
         add_output0 = self._make_add(
-            [list(Ms0), N0],
+            [Ms0_IntVar, N0],
             add_0_input_name_0,
             add_0_input_name_1,
             "add_0_output",
@@ -695,7 +708,7 @@ def _test_strided_scatter_input_masks_2(
         add_1_input_name_0 = "add_1_input_0"
         add_1_input_name_1 = "add_1_input_1"
         add_output1 = self._make_add(
-            [list(Ms1), N1],
+            [Ms1_IntVar, N1],
             add_1_input_name_0,
             add_1_input_name_1,
             "add_1_output",
@@ -916,19 +929,26 @@ def test_strided_scatter_float(self):
             test_name="strided_scatter_basic_float",
             dtype="float",
         )
+        dynamic_dim_1 = gen_IntVar([5, 16], name="dynamic_dim_1")
+        dynamic_dim_2 = gen_IntVar([10, 20], name="dynamic_dim_2")
         self._test_strided_scatter_dynamic(
-            input_shapes=([[5, 16], [10, 20], 4], [[5, 16], [10, 20], 10]),
+            input_shapes=(
+                [dynamic_dim_1, dynamic_dim_2, 4],
+                [dynamic_dim_1, dynamic_dim_2, 10],
+            ),
             start_indices=([0, 0, 2], [0, 0, 2]),
             end_indices=([None, None, 4], [None, None, 10]),
             scatter_dim=2,
             test_name="strided_scatter_dynamic_float",
             dtype="float",
         )
+        dynamic_dim_3 = gen_IntVar([5, 7], name="dynamic_dim_3")
+        dynamic_dim_4 = gen_IntVar([1, 10], name="dynamic_dim_4")
         self._test_strided_scatter_dynamic(
             input_shapes=(
-                [[5, 7], [1, 10], 4],
-                [[5, 7], [1, 10], 6],
-                [[5, 7], [1, 10], 8],
+                [dynamic_dim_3, dynamic_dim_4, 4],
+                [dynamic_dim_3, dynamic_dim_4, 6],
+                [dynamic_dim_3, dynamic_dim_4, 8],
             ),
             start_indices=([0, 0, 2], [0, 0, 4], [0, 0, 6]),
             end_indices=([None, None, 4], [None, None, 6], [None, None, 8]),
@@ -993,11 +1013,12 @@ def test_strided_scatter_float(self):
         )
         target = detect_target()
         if int(target._arch) >= 80:
+            dynamic_dim_1 = gen_IntVar([5, 16], name="Ms")
             self._test_strided_scatter_input_masks(
                 Ms=(5, 16),
                 N=4,
                 K=10,
-                input_shapes=([[5, 16], 5], [[5, 16], 10]),
+                input_shapes=([dynamic_dim_1, 5], [dynamic_dim_1, 10]),
                 start_indices=([0, 1], [0, 2]),
                 end_indices=([None, 2], [None, 10]),
                 scatter_dim=1,

From 75358a7ead0bde231bb12f82c46084c824b0849c Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 28 Feb 2023 21:15:49 -0800
Subject: [PATCH 198/638] Add a util function to filter test cases by test
 environment. (#343)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/343

See test_softmax for example use cases. By calling
filter_test_cases_by_test_env(), we can define both A100 and V100 test cases in
the same file, and define separate buck targets to filter test cases by test
env.

Reviewed By: tenpercent

Differential Revision: D43562998

fbshipit-source-id: d38b9aba050c6def5e4ef37ae252709ab461e9c3
---
 python/aitemplate/testing/test_utils.py | 73 +++++++++++++++++-
 tests/unittest/ops/test_layernorm.py    |  3 +
 tests/unittest/ops/test_softmax.py      | 99 +++++++++++++------------
 3 files changed, 128 insertions(+), 47 deletions(-)

diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 643c948f7..b871792b4 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -15,16 +15,87 @@
 """
 Utils for unit tests.
 """
-from typing import Any, Dict, List, Optional
+import itertools
+import unittest
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import torch
 
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
 from aitemplate.compiler.dtype import normalize_dtype
+from aitemplate.testing.detect_target import detect_target
 from aitemplate.utils.graph_utils import get_sorted_ops
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
+class TestEnv(Enum):
+    CUDA_LESS_THAN_SM80 = 1
+    CUDA_SM80 = 2
+    ROCM = 100
+
+
+def _ROCM_filter(method_name: str) -> bool:
+    return method_name.endswith("rocm")
+
+
+def _SM80_filter(method_name: str) -> bool:
+    return method_name.endswith("bf16") or method_name.endswith("sm80")
+
+
+_TEST_ENV_TO_FILTER_METHOD: Dict[str, Callable[[str], bool]] = {
+    TestEnv.CUDA_LESS_THAN_SM80: (
+        lambda method_name: not (_SM80_filter(method_name) or _ROCM_filter(method_name))
+    ),
+    TestEnv.CUDA_SM80: _SM80_filter,
+    TestEnv.ROCM: _ROCM_filter,
+}
+
+
+def _get_test_env(target) -> str:
+    test_env = ""
+    if target.name() == "cuda":
+        if int(target._arch) < 80:
+            test_env = TestEnv.CUDA_LESS_THAN_SM80
+        elif int(target._arch) == 80:
+            test_env = TestEnv.CUDA_SM80
+        else:
+            raise RuntimeError(
+                f"Unknown test env, target: {target.name}, {target._arch}"
+            )
+    elif target.name() == "rocm":
+        test_env = TestEnv.ROCM
+    else:
+        raise RuntimeError(f"Unknown test env, target: {target.name}, {target._arch}")
+    if test_env not in _TEST_ENV_TO_FILTER_METHOD:
+        raise RuntimeError(f"{test_env=} not defined in _TEST_ENV_TO_FILTER_METHOD")
+    return test_env
+
+
+def filter_test_cases_by_params(params: Dict[TestEnv, List[Tuple[Any]]]):
+    """Filters test cases to run by given params. Only takes effect in CI env."""
+    target = detect_target()
+    test_env = _get_test_env(target)
+    return (
+        params.get(test_env, [])
+        if target.in_ci_env()
+        else list(itertools.chain.from_iterable(params.values()))
+    )
+
+
+def filter_test_cases_by_test_env(cls: Type[unittest.TestCase]):
+    """Filters test cases to run by test case names implicitly. Only takes effect in CI env."""
+    target = detect_target()
+    test_env = _get_test_env(target)
+    for attr in list(cls.__dict__.keys()):
+        if (
+            attr.startswith("test_")
+            and target.in_ci_env()
+            and (not _TEST_ENV_TO_FILTER_METHOD.get(test_env)(attr))
+        ):
+            delattr(cls, attr)
+
+
 def _get_torch_tensor(torch_fn, shape, dtype):
     dtype = normalize_dtype(dtype)
     return torch_fn(shape, device="cuda", dtype=string_to_torch_dtype(dtype))
diff --git a/tests/unittest/ops/test_layernorm.py b/tests/unittest/ops/test_layernorm.py
index bf0f283e7..9dbf2e118 100644
--- a/tests/unittest/ops/test_layernorm.py
+++ b/tests/unittest/ops/test_layernorm.py
@@ -24,6 +24,7 @@
 from aitemplate.compiler.base import IntImm, IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import filter_test_cases_by_test_env
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
@@ -182,5 +183,7 @@ def test_layernorm_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(LayernormTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_softmax.py b/tests/unittest/ops/test_softmax.py
index 4ce3a7dc3..9462d82dd 100644
--- a/tests/unittest/ops/test_softmax.py
+++ b/tests/unittest/ops/test_softmax.py
@@ -23,6 +23,7 @@
 from aitemplate.compiler.base import IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import filter_test_cases_by_params, TestEnv
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 from parameterized import parameterized
 
@@ -63,52 +64,58 @@ def _test_softmax(
             torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
 
     @parameterized.expand(
-        [
-            ("dim_1_fp16", "float16", (1, 1024), (6,), 1),
-            ("odd_small_fp16", "float16", (1, 13), (11,)),
-            ("odd_mid_fp16", "float16", (1, 4096), (33,)),
-            ("odd_large_fp16", "float16", (2, 31), (1409,)),
-            ("k2_small_fp16", "float16", (1, 1024), (18,)),
-            ("k2_mid_fp16", "float16", (2, 21), (66,)),
-            ("k2_large_fp16", "float16", (2, 21), (1154,)),
-            ("k4_small_fp16", "float16", (10, 1025), (124,)),
-            ("k4_mid_fp16", "float16", (1, 17), (132,)),
-            ("k4_large_fp16", "float16", (1, 17), (1924,)),
-            ("k8_small_fp16", "float16", (10, 1025), (72,)),
-            ("k8_mid_fp16", "float16", (1, 17), (264,)),
-            ("k8_large_fp16", "float16", (1, 17), (3848,)),
-            ("no_smem_fp16", "float16", (1, 2), (12500,)),
-            ("2d", "float16", (1, 2), (100, 100)),
-            ("3d", "float16", (1, 2), (24, 2, 64)),
-            ("dim_1_fp32", "float32", (1, 2), (6,), 1),
-            ("odd_small_fp32", "float32", (1, 2), (11,)),
-            ("odd_mid_fp32", "float32", (1, 2), (33,)),
-            ("odd_large_fp32", "float32", (1, 2), (1409,)),
-            ("k2_small_fp32", "float32", (1, 2), (18,)),
-            ("k2_mid_fp32", "float32", (1, 2), (66,)),
-            ("k2_large_fp32", "float32", (1, 2), (1154,)),
-            ("k4_small_fp32", "float32", (1, 2), (124,)),
-            ("k4_mid_fp32", "float32", (1, 2), (132,)),
-            ("k4_large_fp32", "float32", (1, 2), (1924,)),
-            ("k8_small_fp32", "float32", (1, 2), (72,)),
-            ("k8_mid_fp32", "float32", (1, 2), (264,)),
-            ("k8_large_fp32", "float32", (1, 2), (3848,)),
-            ("no_smem_fp32", "float32", (1, 2), (12500,)),
-            ("dim_1_bf16", "bfloat16", (1, 2), (6,), 1),
-            ("odd_small_bf16", "bfloat16", (1, 2), (11,)),
-            ("odd_mid_bf16", "bfloat16", (1, 2), (33,)),
-            ("odd_large_bf16", "bfloat16", (1, 2), (1409,)),
-            ("k2_small_bf16", "bfloat16", (1, 2), (18,)),
-            ("k2_mid_bf16", "bfloat16", (1, 2), (66,)),
-            ("k2_large_bf16", "bfloat16", (1, 2), (1154,)),
-            ("k4_small_bf16", "bfloat16", (1, 2), (124,)),
-            ("k4_mid_bf16", "bfloat16", (1, 2), (132,)),
-            ("k4_large_bf16", "bfloat16", (1, 2), (1924,)),
-            ("k8_small_bf16", "bfloat16", (1, 2), (72,)),
-            ("k8_mid_bf16", "bfloat16", (1, 2), (264,)),
-            ("k8_large_bf16", "bfloat16", (1, 2), (3848,)),
-            ("no_smem_bf16", "bfloat16", (1, 2), (12500,)),
-        ]
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    ("dim_1_fp16", "float16", (1, 1024), (6,), 1),
+                    ("odd_small_fp16", "float16", (1, 13), (11,)),
+                    ("odd_mid_fp16", "float16", (1, 4096), (33,)),
+                    ("odd_large_fp16", "float16", (2, 31), (1409,)),
+                    ("k2_small_fp16", "float16", (1, 1024), (18,)),
+                    ("k2_mid_fp16", "float16", (2, 21), (66,)),
+                    ("k2_large_fp16", "float16", (2, 21), (1154,)),
+                    ("k4_small_fp16", "float16", (10, 1025), (124,)),
+                    ("k4_mid_fp16", "float16", (1, 17), (132,)),
+                    ("k4_large_fp16", "float16", (1, 17), (1924,)),
+                    ("k8_small_fp16", "float16", (10, 1025), (72,)),
+                    ("k8_mid_fp16", "float16", (1, 17), (264,)),
+                    ("k8_large_fp16", "float16", (1, 17), (3848,)),
+                    ("no_smem_fp16", "float16", (1, 2), (12500,)),
+                    ("2d", "float16", (1, 2), (100, 100)),
+                    ("3d", "float16", (1, 2), (24, 2, 64)),
+                    ("dim_1_fp32", "float32", (1, 2), (6,), 1),
+                    ("odd_small_fp32", "float32", (1, 2), (11,)),
+                    ("odd_mid_fp32", "float32", (1, 2), (33,)),
+                    ("odd_large_fp32", "float32", (1, 2), (1409,)),
+                    ("k2_small_fp32", "float32", (1, 2), (18,)),
+                    ("k2_mid_fp32", "float32", (1, 2), (66,)),
+                    ("k2_large_fp32", "float32", (1, 2), (1154,)),
+                    ("k4_small_fp32", "float32", (1, 2), (124,)),
+                    ("k4_mid_fp32", "float32", (1, 2), (132,)),
+                    ("k4_large_fp32", "float32", (1, 2), (1924,)),
+                    ("k8_small_fp32", "float32", (1, 2), (72,)),
+                    ("k8_mid_fp32", "float32", (1, 2), (264,)),
+                    ("k8_large_fp32", "float32", (1, 2), (3848,)),
+                    ("no_smem_fp32", "float32", (1, 2), (12500,)),
+                ],
+                TestEnv.CUDA_SM80: [
+                    ("dim_1_bf16", "bfloat16", (1, 2), (6,), 1),
+                    ("odd_small_bf16", "bfloat16", (1, 2), (11,)),
+                    ("odd_mid_bf16", "bfloat16", (1, 2), (33,)),
+                    ("odd_large_bf16", "bfloat16", (1, 2), (1409,)),
+                    ("k2_small_bf16", "bfloat16", (1, 2), (18,)),
+                    ("k2_mid_bf16", "bfloat16", (1, 2), (66,)),
+                    ("k2_large_bf16", "bfloat16", (1, 2), (1154,)),
+                    ("k4_small_bf16", "bfloat16", (1, 2), (124,)),
+                    ("k4_mid_bf16", "bfloat16", (1, 2), (132,)),
+                    ("k4_large_bf16", "bfloat16", (1, 2), (1924,)),
+                    ("k8_small_bf16", "bfloat16", (1, 2), (72,)),
+                    ("k8_mid_bf16", "bfloat16", (1, 2), (264,)),
+                    ("k8_large_bf16", "bfloat16", (1, 2), (3848,)),
+                    ("no_smem_bf16", "bfloat16", (1, 2), (12500,)),
+                ],
+            }
+        )
     )
     def test_softmax(
         self,

From f2e94b13e260a6fc70ab6da4975e2e09b7c540ab Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 28 Feb 2023 23:36:51 -0800
Subject: [PATCH 199/638] make better vectorization for "broadcastable"
 fused_elementwise ops (#339)

Summary:
Previously, we were quite conservative in terms of determining a vector size for reading operands in the fused_elementwise kernel. We always based on the minimal alignment value among all the input operands. This particularly hurted the kernel performance when the last dimension was "broadcastable". For example, for two input shapes [batch_size, 10, 1] and [batch_size, 10, 8], we set read_t (i.e. vector read type) to "half", assuming the input type is float16. It would be more effecient to have a seperate read_t for each input. We could load the first input with "half" and the second with "uint4". Furthermore, we could make a vector that contained eight copies of the first input and add this vector with the uint4 vector value loaded from the second operand.

This PR implemented such an optimization. Performance results showed that it may result in upto ~2X performance improvements for some cases.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/339

Reviewed By: ipiszy

Differential Revision: D43613248

Pulled By: chenyang78

fbshipit-source-id: 787b1507a218c2a5fe1c61b3df08ea9d144c0d90
---
 .../backend/common/elementwise_common.py      | 228 ++++++++---
 python/aitemplate/utils/alignment.py          |  11 +
 .../backend/test_fused_elementwise_backend.py |  15 +-
 .../compiler/test_slice_elemwise_fusion.py    |  74 ++--
 .../ops/test_fused_elementwise_broadcast.py   | 355 +++++++++++++++---
 5 files changed, 549 insertions(+), 134 deletions(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 1d8b230a5..3997c8c9c 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -25,7 +25,7 @@
 
 from ...compiler.base import IntImm, IntVar, Operator, Tensor
 from ...compiler.tensor_accessor import TensorAccessor
-from ...utils import shape_utils
+from ...utils import alignment as alignment_utils, shape_utils
 from . import tensor_accessor_codegen
 
 CONSTANT_TEMPLATE = jinja2.Template(
@@ -68,9 +68,14 @@
 KERNEL_READ_INPUT_TEMPLATE = jinja2.Template(
     """
   {{read_t}} *{{input_name}} = const_cast<{{read_t}}*>(input{{input_idx}});
+  constexpr int vec_size{{input_idx}} =  sizeof({{max_read_t}}) / sizeof({{read_t}});
   {{get_strided_address}}
-  {{read_t}} tmp_i{{input_idx}} = *{{input_name}};
-  const {{op_t}}* p_tmp_i{{input_idx}} = reinterpret_cast<const {{op_t}}*>(&tmp_i{{input_idx}});
+  {{read_t}} tmp_i{{input_idx}}[vec_size{{input_idx}}];
+  #pragma unroll
+  for (int i = 0; i < vec_size{{input_idx}}; i++) {
+    tmp_i{{input_idx}}[i] = *{{input_name}};
+  }
+  const {{op_t}}* p_tmp_i{{input_idx}} = reinterpret_cast<const {{op_t}}*>(tmp_i{{input_idx}});
 
     """
 )
@@ -193,7 +198,10 @@ class FusedElementwiseMetaData:
     original_inputs: List[Tensor]
     original_outputs: List[Tensor]
 
-    read_t: str
+    # holding the largest read type for the fused kernel
+    max_read_t: str
+    # holding the read_t for each fused input
+    read_types: List[str]
     op_t: str
     data_t: str
     input_broadcast_sizes: List[List[IntVar]]
@@ -351,21 +359,111 @@ def _get_sub_func_metadata(
     return (sub_func_metadata, op_t)
 
 
-def _get_types_and_sizes(
+def _get_alignments(
+    extended_input_shapes: List[List[IntVar]],
+    input_broadcast_sizes: List[int],
+    num_rightmost_non_broadcast_dims: List[int],
+    rightmost_broadcast_dim: int,
+    output_rank: int,
+    dtype: str,
+) -> Tuple[List[int], List[int]]:
+    """
+    A helper function that returns two alignments lists, where the first list
+    is the alignments for inputs and the second one contains the alignments
+    for those non-broadcasted inputs
+    """
+    # We track alignment for each input
+    alignments = []
+    non_broadcast_alignments = []
+    for extended_input_shape, input_broadcast_sz, num_rightmost_non_br_dims in zip(
+        extended_input_shapes,
+        input_broadcast_sizes,
+        num_rightmost_non_broadcast_dims,
+    ):
+        # make sure we are not going to wrongfully generate an larger vector read type
+        if input_broadcast_sz is None and rightmost_broadcast_dim is not None:
+            num_rightmost_non_br_dims = output_rank - rightmost_broadcast_dim
+        num_elements_for_alignments = shape_utils.get_num_rightmost_static_elements(
+            extended_input_shape, num_rightmost_non_br_dims
+        )
+        if num_elements_for_alignments > 1 or input_broadcast_sz is None:
+            non_broadcast_alignments.append(num_elements_for_alignments)
+        alignment = alignment_utils.find_max_alignment(
+            num_elements_for_alignments, dtype
+        )
+        alignments.append(alignment)
+    return (alignments, non_broadcast_alignments)
+
+
+def _refine_alignments_with_tensor_accessors(
+    non_broadcast_alignments: List[int],
+    alignments: List[int],
+    dtype: str,
+    input_accessors: List[TensorAccessor],
+    output_accessors: List[TensorAccessor],
+) -> List[int]:
+    """
+    This helper function returns the valid alignments based on the constrains
+    imposed on non_broadcast_alignments, input_accessors and output_accessors.
+    """
+    max_non_broadcast_alignment = None
+    if len(non_broadcast_alignments) > 1:
+        max_non_broadcast_alignment = alignment_utils.find_max_alignment_from(
+            non_broadcast_alignments, dtype
+        )
+    alignments = [
+        align
+        if align == 1 or max_non_broadcast_alignment is None
+        else max_non_broadcast_alignment
+        for align in alignments
+    ]
+    max_input_accessor_alignment = (
+        tensor_accessor_codegen.find_max_alignment_for_accessors(dtype, input_accessors)
+    )
+    # Note that we use the same alignment for accessing inputs and outputs, although
+    # they may have different alignment requirements. We may lose perf a little bit,
+    # but reduce the complexity of our jinja template. We can do some perf
+    # experiments later to determine if we want to chase more perf gains.
+    max_accessor_alignment = tensor_accessor_codegen.find_max_alignment(
+        max_input_accessor_alignment, dtype, output_accessors
+    )
+    # all alignments are capped by the max_accessor_alignment
+    alignments = [
+        align if align <= max_accessor_alignment else max_accessor_alignment
+        for align in alignments
+    ]
+    return alignments
+
+
+def _get_alignments_and_sizes_and_dtype(
     inputs: List[Tensor],
     input_accessors: List[TensorAccessor],
     output_accessors: List[TensorAccessor],
     backend_spec: BackendSpec,
-) -> Tuple[int, List[List[IntVar]], str]:
+) -> Tuple[List[int], List[List[IntVar]], str]:
     """
-    Returns Tuple(alignment, input_broadcast_sizes, dtype)
+    Returns Tuple(alignments, input_broadcast_sizes, dtype)
     """
 
     # Handle input broadcast.
     output_shape = output_accessors[0].original_shapes
     dtype = inputs[0]._attrs["dtype"]
+
+    # Determine the rightmost broadcast dim among all inputs.
+    # This value prevents us from wrongfully generating a larger alignment
+    # for cases such as X1[2, 2], X2[2, 1], where [2, 2] and [2, 1] are shapes.
+    # If we do not have a rightmost_broadcast_dim guard, we would
+    # end up generating alignment = 4 for X1. But, this would be wrong, because
+    # in the kernel, we might have a single effective thread that loads four
+    # elements from X1 and only one element from X2. Potentially, we could
+    # make this thread load two elements from X2, but it would make address
+    # indexing templates fairly complicated in general. Let's make simple
+    # cases work and extend it later if we had to, e.g. we saw large perf penalty
+    # without doing it.
+    rightmost_broadcast_dim = None
+    num_rightmost_non_broadcast_dims = []
     input_broadcast_sizes = []
-    min_num_elements = None
+    extended_input_shapes = []
     for input_accessor in input_accessors:
         input_shape = input_accessor.original_shapes
         broadcastable, _ = shape_utils.get_broadcast_max_shape(
@@ -377,8 +475,8 @@ def _get_types_and_sizes(
                     input_shape, output_shape
                 )
             )
-        num_rightmost_non_broadcast_elements = len(output_shape)
         extended_input_shape = list(input_shape)
+        num_rightmost_non_br_dims = len(output_shape)
         if input_shape == output_shape:
             input_broadcast_sizes.append(None)
         else:
@@ -387,26 +485,30 @@ def _get_types_and_sizes(
             input_broadcast_sizes.append(extended_input_shape)
             for i in reversed(range(len(extended_input_shape))):
                 if extended_input_shape[i] != output_shape[i]:
-                    num_rightmost_non_broadcast_elements -= i + 1
+                    num_rightmost_non_br_dims -= i + 1
+                    if rightmost_broadcast_dim is None:
+                        rightmost_broadcast_dim = i
+                    else:
+                        rightmost_broadcast_dim = max(i, rightmost_broadcast_dim)
                     break
-        num_elements_for_alignments = shape_utils.get_num_rightmost_static_elements(
-            extended_input_shape, num_rightmost_non_broadcast_elements
-        )
-        if not min_num_elements:
-            min_num_elements = num_elements_for_alignments
-        else:
-            min_num_elements = min(min_num_elements, num_elements_for_alignments)
-    alignment = tensor_accessor_codegen.find_max_alignment(
-        min_num_elements, dtype, output_accessors
+        extended_input_shapes.append(extended_input_shape)
+        num_rightmost_non_broadcast_dims.append(num_rightmost_non_br_dims)
+    (alignments, non_broadcast_alignments) = _get_alignments(
+        extended_input_shapes,
+        input_broadcast_sizes,
+        num_rightmost_non_broadcast_dims,
+        rightmost_broadcast_dim,
+        len(output_shape),
+        dtype,
     )
-    # Note that we use the same alignment for accessing inputs and outputs, although
-    # they may have different alignment requirements. We may lose perf a little bit,
-    # but reduce the complexity of our jinja template. We can do some perf
-    # experiments later to determine if we want to chase more perf gains.
-    alignment = tensor_accessor_codegen.find_max_alignment(
-        alignment, dtype, input_accessors
+    alignments = _refine_alignments_with_tensor_accessors(
+        non_broadcast_alignments,
+        alignments,
+        dtype,
+        input_accessors,
+        output_accessors,
     )
-    return alignment, input_broadcast_sizes, dtype
+    return alignments, input_broadcast_sizes, dtype
 
 
 def _get_dynamic_dims(output_accessors: List[TensorAccessor]) -> List[IntVar]:
@@ -428,11 +530,20 @@ def _parse_func_metadata(
     original_outputs: List[Tensor],
     backend_spec: BackendSpec,
 ) -> FusedElementwiseMetaData:
-    alignment, input_broadcast_sizes, dtype = _get_types_and_sizes(
+    alignments, input_broadcast_sizes, dtype = _get_alignments_and_sizes_and_dtype(
         inputs, input_accessors, output_accessors, backend_spec
     )
-    read_type = backend_spec.get_elementwise_read_backend_type(alignment, dtype)
-    op_type = backend_spec.get_elementwise_op_backend_type(alignment, dtype)
+    max_read_type = backend_spec.get_elementwise_read_backend_type(
+        max(alignments), dtype
+    )
+    read_types = [
+        backend_spec.get_elementwise_read_backend_type(alignment, dtype)
+        for alignment in alignments
+    ]
+    # It's safe to use the maximum alignment for determine op_type, because
+    # smaller inputs (i.e. those being broadcasted) will be placed into a
+    # larger tmp variable which is valid for selected op_type.
+    op_type = backend_spec.get_elementwise_op_backend_type(max(alignments), dtype)
     data_type = backend_spec.dtype_to_backend_type(dtype)
     sub_func_metadata, op_type = _get_sub_func_metadata(
         ops, data_type, op_type, backend_spec
@@ -446,7 +557,8 @@ def _parse_func_metadata(
         output_accessors,
         original_inputs,
         original_outputs,
-        read_type,
+        max_read_type,
+        read_types,
         op_type,
         data_type,
         input_broadcast_sizes,
@@ -497,12 +609,13 @@ def _gen_input_broadcast_calculator_str(
         output_num_elements.append(output_shape[start_idx:])
 
     res = []
-    for (output_num_element, output_stride, input_stride) in zip(
+    for output_num_element, output_stride, input_stride in zip(
         output_num_elements, output_strides, input_strides
     ):
+        idx_str = "idx * N_ELEMENTS_PER_THREAD"
         res.append(
             "{} % ({}) / ({}) * ({})".format(
-                "idx * N_ELEMENTS_PER_THREAD",
+                idx_str,
                 _gen_int_var_product_str(output_num_element),
                 _gen_int_var_product_str(output_stride),
                 _gen_int_var_product_str(input_stride),
@@ -538,31 +651,45 @@ def _gen_dynamic_dim_str(
 
 
 def _gen_read_inputs_str(
-    fused_elementwise_metadata: FusedElementwiseMetaData, broadcast_sizes: List[str]
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+    broadcast_sizes: List[str],
 ):
     read_inputs = []
-    for input_idx, (input_accessor, broadcast_size) in enumerate(
-        zip(fused_elementwise_metadata.input_accessors, broadcast_sizes)
+    for input_idx, (input_accessor, read_t, broadcast_size) in enumerate(
+        zip(
+            fused_elementwise_metadata.input_accessors,
+            fused_elementwise_metadata.read_types,
+            broadcast_sizes,
+        )
     ):
         input_name = f"input_tmp{input_idx}"
+
+        # When broadcasting an input, we are reading a different number of elements
+        # from this input based on the "ratio" of its read_t to the max_read_t
+        n_elems_per_thread = (
+            f"(N_ELEMENTS_PER_THREAD / "
+            f"(sizeof({fused_elementwise_metadata.max_read_t}) / sizeof({read_t})))"
+        )
         data_idx = (
             "idx"
             if not broadcast_size
-            else f"({broadcast_size}) / N_ELEMENTS_PER_THREAD"
+            else f"({broadcast_size}) / {n_elems_per_thread}"
         )
         get_strided_addr_str = GET_STRIDED_ADDRESS_TEMPLATE.render(
             tensor_accessor=input_accessor,
             data_ptr=input_name,
             data_t=fused_elementwise_metadata.data_t,
-            read_t=fused_elementwise_metadata.read_t,
+            read_t=read_t,
             data_idx=data_idx,
         )
         read_input = KERNEL_READ_INPUT_TEMPLATE.render(
             get_strided_address=get_strided_addr_str,
             input_name=input_name,
             input_idx=input_idx,
-            read_t=fused_elementwise_metadata.read_t,
+            max_read_t=fused_elementwise_metadata.max_read_t,
+            read_t=read_t,
             op_t=fused_elementwise_metadata.op_t,
+            data_t=fused_elementwise_metadata.data_t,
         )
         read_inputs.append(read_input)
     read_inputs_str = "\n".join(read_inputs)
@@ -579,7 +706,7 @@ def _gen_write_outputs_str(fused_elementwise_metadata: FusedElementwiseMetaData)
             tensor_accessor=output_accessor,
             data_ptr=output_name,
             data_t=fused_elementwise_metadata.data_t,
-            read_t=fused_elementwise_metadata.read_t,
+            read_t=fused_elementwise_metadata.max_read_t,
             data_idx="idx",
         )
         write_out = KERNEL_WRITE_OUTPUT_TEMPLATE.render(
@@ -601,7 +728,7 @@ def _gen_kernel_function(
     output_params_decl = ",".join(
         [
             KERNEL_DECL_OUTPUT_PARAM_TEMPLATE.render(
-                read_t=fused_elementwise_metadata.read_t, idx=i
+                read_t=fused_elementwise_metadata.max_read_t, idx=i
             )
             for i, _ in enumerate(fused_elementwise_metadata.outputs)
         ]
@@ -609,7 +736,7 @@ def _gen_kernel_function(
     input_params_decl = ",".join(
         [
             KERNEL_DECL_INPUT_PARAM_TEMPLATE.render(
-                read_t=fused_elementwise_metadata.read_t, idx=i
+                read_t=fused_elementwise_metadata.read_types[i], idx=i
             )
             for i, _ in enumerate(fused_elementwise_metadata.inputs)
         ]
@@ -622,7 +749,7 @@ def _gen_kernel_function(
     read_inputs_str = _gen_read_inputs_str(fused_elementwise_metadata, broadcast_sizes)
 
     define_outputs = KERNEL_DEFINE_OUTPUTS_TEMPLATE.render(
-        read_t=fused_elementwise_metadata.read_t,
+        read_t=fused_elementwise_metadata.max_read_t,
         op_t=fused_elementwise_metadata.op_t,
         indexes=list(range(len(fused_elementwise_metadata.outputs))),
     )
@@ -685,7 +812,16 @@ def fused_elementwise_gen_function(
         backend_spec,
     )
     # Dump data types into func_attr for testing purpose.
-    func_attrs["read_t"] = fused_elementwise_metadata.read_t
+    func_attrs["max_read_t"] = fused_elementwise_metadata.max_read_t
+    # Fused inputs may not be in the same order as the inputs passed to each
+    # elementwise op, so we save a tuple. Note that this attribute is different
+    # from the read_types field of FusedElementwiseMetaData, where each "read_t"
+    # maps to the input at the same index. The "read_types" attribute is only
+    # used for testing purpose.
+    func_attrs["read_types"] = [
+        (inp._attrs["name"], read_t)
+        for (inp, read_t) in zip(inputs, fused_elementwise_metadata.read_types)
+    ]
     func_attrs["op_t"] = fused_elementwise_metadata.op_t
     func_attrs["data_t"] = fused_elementwise_metadata.data_t
 
@@ -713,7 +849,7 @@ def fused_elementwise_gen_function(
     kernel_call_output_params = ",".join(
         [
             KERNEL_CALL_OUTPUT_PARAM_TEMPLATE.render(
-                read_t=fused_elementwise_metadata.read_t, idx=i
+                read_t=fused_elementwise_metadata.max_read_t, idx=i
             )
             for i, _ in enumerate(fused_elementwise_metadata.outputs)
         ]
@@ -721,13 +857,13 @@ def fused_elementwise_gen_function(
     kernel_call_input_params = ",".join(
         [
             KERNEL_CALL_INPUT_PARAM_TEMPLATE.render(
-                read_t=fused_elementwise_metadata.read_t, idx=i
+                read_t=fused_elementwise_metadata.read_types[i], idx=i
             )
             for i, _ in enumerate(fused_elementwise_metadata.inputs)
         ]
     )
     constant = CONSTANT_TEMPLATE.render(
-        read_t=fused_elementwise_metadata.read_t,
+        read_t=fused_elementwise_metadata.max_read_t,
         op_t=fused_elementwise_metadata.op_t,
         data_t=fused_elementwise_metadata.data_t,
     )
diff --git a/python/aitemplate/utils/alignment.py b/python/aitemplate/utils/alignment.py
index c5aad47a7..014d3e8ec 100644
--- a/python/aitemplate/utils/alignment.py
+++ b/python/aitemplate/utils/alignment.py
@@ -48,6 +48,17 @@ def find_max_alignment(number: int, dtype: str) -> int:
     return 1
 
 
+def find_max_alignment_from(numbers: List[int], dtype: str) -> int:
+    """
+    Return the max alignment value that is valid for all the numbers.
+    """
+    alignments = get_alignments(dtype)
+    for alignment in alignments:
+        if all(number % alignment == 0 for number in numbers):
+            return alignment
+    return 1
+
+
 def valid_alignment(align: int, dtype: str) -> bool:
     """
     Return True if the given align value is legitimate for the dtype.
diff --git a/tests/unittest/backend/test_fused_elementwise_backend.py b/tests/unittest/backend/test_fused_elementwise_backend.py
index 1e7bf3416..978522e46 100644
--- a/tests/unittest/backend/test_fused_elementwise_backend.py
+++ b/tests/unittest/backend/test_fused_elementwise_backend.py
@@ -59,7 +59,8 @@ def test_unary(self):
             output_accessors=[TensorAccessor(X3)],
             original_inputs=[X1],
             original_outputs=[X3],
-            read_t="uint4",
+            max_read_t="uint4",
+            read_types=["uint4", "uint4"],
             op_t="half2",
             data_t="half",
             input_broadcast_sizes=None,
@@ -134,7 +135,8 @@ def test_multi_inputs(self):
             output_accessors=[TensorAccessor(X6)],
             original_inputs=[X1, X2, X4],
             original_outputs=[X6],
-            read_t="uint4",
+            max_read_t="uint4",
+            read_types=["uint4", "uint4", "uint4"],
             op_t="half2",
             data_t="half",
             input_broadcast_sizes=None,
@@ -207,7 +209,8 @@ def test_constant(self):
             output_accessors=[TensorAccessor(X5)],
             original_inputs=[X1],
             original_outputs=[X5],
-            read_t="uint4",
+            max_read_t="uint4",
+            read_types=["uint4"],
             op_t="half2",
             data_t="half",
             input_broadcast_sizes=None,
@@ -275,7 +278,8 @@ def test_converter(self):
             output_accessors=[TensorAccessor(X5)],
             original_inputs=[X1],
             original_outputs=[X5],
-            read_t="uint4",
+            max_read_t="uint4",
+            read_types=["uint4"],
             op_t="half",
             data_t="half",
             input_broadcast_sizes=None,
@@ -360,7 +364,8 @@ def test_multi_outputs(self):
             output_accessors=[TensorAccessor(X6), TensorAccessor(X7)],
             original_inputs=[X1],
             original_outputs=[X6, X7],
-            read_t="uint4",
+            max_read_t="uint4",
+            read_types=["uint4"],
             op_t="half",
             data_t="half",
             input_broadcast_sizes=None,
diff --git a/tests/unittest/compiler/test_slice_elemwise_fusion.py b/tests/unittest/compiler/test_slice_elemwise_fusion.py
index f80a28668..f7e8ac3e5 100644
--- a/tests/unittest/compiler/test_slice_elemwise_fusion.py
+++ b/tests/unittest/compiler/test_slice_elemwise_fusion.py
@@ -33,13 +33,19 @@ def __init__(self, *args, **kwargs):
         super(SliceElemwiseFusionTestCase, self).__init__(*args, **kwargs)
         self.test_count = 0
 
+    # "read_types" attribute contains a list of tuples like
+    # [("input0", "uint4"), ("input1", "half")]. This helper function returns
+    # the list of the second elements, i.e. read_t types for all inputs.
+    def _get_read_types(self, op):
+        return list({t for _, t in op._attrs["read_types"]})
+
     def _test_slice_elemwise_fusion(
         self,
         slice_input_shape,
         slice_start_indices,
         slice_end_indices,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
         expected_op_t,
         expected_data_t,
         input_x2_shape=None,
@@ -81,7 +87,9 @@ def _test_slice_elemwise_fusion(
         self.assertEqual(len(sorted_graph), 3)
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        # import pdb; pdb.set_trace()
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(self._get_read_types(sorted_ops[0]), [expected_max_read_t])
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -111,7 +119,7 @@ def test_slice_elemwise_fusion(self):
             slice_start_indices=(2,),
             slice_end_indices=(None,),
             test_name="slice_elemwise_fusion",
-            expected_read_t="uint",
+            expected_max_read_t="uint",
             expected_op_t="half2",
             expected_data_t="half",
         )
@@ -120,7 +128,7 @@ def test_slice_elemwise_fusion(self):
             slice_start_indices=(0, 3),
             slice_end_indices=(None, 8),
             test_name="slice_elemwise_fusion",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
         )
@@ -129,7 +137,7 @@ def test_slice_elemwise_fusion(self):
             slice_start_indices=(0, 3, 0),
             slice_end_indices=(None, 5, None),
             test_name="slice_elemwise_fusion",
-            expected_read_t="uint",
+            expected_max_read_t="uint",
             expected_op_t="half2",
             expected_data_t="half",
         )
@@ -141,7 +149,7 @@ def test_slice_elemwise_fusion_broadcast(self):
             slice_start_indices=(2, 0),
             slice_end_indices=(3, None),
             test_name="slice_elemwise_fusion_broadcast",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
             input_x2_shape=(4, 16),
@@ -152,7 +160,7 @@ def test_slice_elemwise_fusion_broadcast(self):
             slice_start_indices=(0, 3),
             slice_end_indices=(None, 4),
             test_name="slice_elemwise_fusion_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(10, 3),
@@ -164,7 +172,7 @@ def test_slice_elemwise_fusion_broadcast(self):
             slice_start_indices=(0, 0, 2),
             slice_end_indices=(None, None, 7),
             test_name="slice_elemwise_fusion_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(10, 3, 1),
@@ -176,7 +184,7 @@ def test_slice_elemwise_fusion_broadcast(self):
             slice_start_indices=(0, 3),
             slice_end_indices=(None, 4),
             test_name="slice_elemwise_fusion_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(4, 10, 3),
@@ -186,7 +194,7 @@ def test_slice_elemwise_fusion_broadcast(self):
             slice_start_indices=(0, 0, 3),
             slice_end_indices=(None, None, 4),
             test_name="slice_elemwise_fusion_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(20, 3),
@@ -198,7 +206,7 @@ def _test_slice_elemwise_fusion_dynamic(
         slice_start_indices,
         slice_end_indices,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
         expected_op_t,
         expected_data_t,
         input_x2_shape=None,
@@ -266,7 +274,8 @@ def _test_slice_elemwise_fusion_dynamic(
         self.assertEqual(len(sorted_graph), 3)
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(self._get_read_types(sorted_ops[0]), [expected_max_read_t])
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -320,7 +329,7 @@ def test_slice_elemwise_fusion_dynamic(self):
             slice_start_indices=(0, 3),
             slice_end_indices=(None, 7),
             test_name="slice_elemwise_fusion_dynamic",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
         )
@@ -329,7 +338,7 @@ def test_slice_elemwise_fusion_dynamic(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 16),
             test_name="slice_elemwise_fusion_dynamic",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
             expected_op_t="half2",
             expected_data_t="half",
         )
@@ -338,7 +347,7 @@ def test_slice_elemwise_fusion_dynamic(self):
             slice_start_indices=(0, 0, 7, 0),
             slice_end_indices=(None, None, 10, None),
             test_name="slice_elemwise_fusion_dynamic",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
         )
@@ -354,7 +363,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 4, 0),
             slice_end_indices=(None, 5, None),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="half2",
             expected_data_t="half",
             input_x2_shape=([5, 16], 4, 16),
@@ -365,7 +374,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 5),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(1, 10, 15),
@@ -375,7 +384,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 5),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(10, 1, 15),
@@ -386,7 +395,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 0, 0),
             slice_end_indices=(None, None, 8),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="uint",
+            expected_max_read_t="uint",
             expected_op_t="half2",
             expected_data_t="half",
             input_x2_shape=(10, 8),
@@ -396,7 +405,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 5),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             input_x2_shape=(3, [5, 16], 10, 15),
@@ -406,7 +415,7 @@ def test_slice_elemwise_fusion_dynamic_broadcast(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 12),
             test_name="slice_elemwise_fusion_dynamic_broadcast",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
             expected_op_t="half2",
             expected_data_t="half",
             input_x2_shape=([3, 7], [5, 16], 10, 8),
@@ -419,7 +428,7 @@ def _test_two_slice_elemwise_fusion_dynamic(
         slice_end_indices1,
         slice_start_indices2,
         slice_end_indices2,
-        expected_read_t,
+        expected_max_read_t,
         expected_op_t,
         expected_data_t,
         test_name,
@@ -461,7 +470,8 @@ def _test_two_slice_elemwise_fusion_dynamic(
         self.assertEqual(len(sorted_graph), 2)
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(self._get_read_types(sorted_ops[0]), [expected_max_read_t])
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -503,7 +513,7 @@ def test_two_slice_elemwise_fusion_dynamic(self):
             slice_end_indices1=(None, 8),
             slice_start_indices2=(0, 16),
             slice_end_indices2=(None, 20),
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
             expected_op_t="half2",
             expected_data_t="half",
             test_name="two_slice_elemwise_fusion_dynamic",
@@ -514,7 +524,7 @@ def test_two_slice_elemwise_fusion_dynamic(self):
             slice_end_indices1=(None, 7),
             slice_start_indices2=(0, 4),
             slice_end_indices2=(None, 8),
-            expected_read_t="half",
+            expected_max_read_t="half",
             expected_op_t="half",
             expected_data_t="half",
             test_name="two_slice_elemwise_fusion_dynamic",
@@ -527,7 +537,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(0, 3, 0),
             slice_end_indices=(None, 5, None),
             test_name="slice_elemwise_fusion_float",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
             expected_op_t="float",
             expected_data_t="float",
             dtype="float",
@@ -537,7 +547,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(2, 0),
             slice_end_indices=(3, None),
             test_name="slice_elemwise_fusion_broadcast_float",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="float",
             expected_data_t="float",
             input_x2_shape=(4, 16),
@@ -548,7 +558,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(0, 0, 2),
             slice_end_indices=(None, None, 7),
             test_name="slice_elemwise_fusion_broadcast_float_2",
-            expected_read_t="float",
+            expected_max_read_t="float",
             expected_op_t="float",
             expected_data_t="float",
             input_x2_shape=(10, 3, 1),
@@ -559,7 +569,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 16),
             test_name="slice_elemwise_fusion_dynamic_float",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="float",
             expected_data_t="float",
             dtype="float",
@@ -569,7 +579,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(0, 0, 4),
             slice_end_indices=(None, None, 5),
             test_name="slice_elemwise_fusion_dynamic_broadcast_float",
-            expected_read_t="float",
+            expected_max_read_t="float",
             expected_op_t="float",
             expected_data_t="float",
             input_x2_shape=(1, 10, 15),
@@ -580,7 +590,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_start_indices=(0, 0, 0),
             slice_end_indices=(None, None, 8),
             test_name="slice_elemwise_fusion_dynamic_broadcast_float",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
             expected_op_t="float",
             expected_data_t="float",
             input_x2_shape=(10, 8),
@@ -592,7 +602,7 @@ def test_slice_elemwise_fusion_float(self):
             slice_end_indices1=(None, 8),
             slice_start_indices2=(0, 16),
             slice_end_indices2=(None, 20),
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
             expected_op_t="float",
             expected_data_t="float",
             test_name="two_slice_elemwise_fusion_dynamic_float",
diff --git a/tests/unittest/ops/test_fused_elementwise_broadcast.py b/tests/unittest/ops/test_fused_elementwise_broadcast.py
index e7943d42f..ace6989b7 100644
--- a/tests/unittest/ops/test_fused_elementwise_broadcast.py
+++ b/tests/unittest/ops/test_fused_elementwise_broadcast.py
@@ -30,13 +30,22 @@
 
 
 class FusedElementwiseBroadcastTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _get_sorted_read_types(self, op):
+        read_types = list(op._attrs["read_types"])
+        return [t for _, t in sorted(read_types, key=lambda x: x[0])]
+
     def _test_different_dim(
         self,
         batch_sizes,
         ms,
         ks,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
+        expected_read_types,
         expected_op_t,
         expected_data_t,
         dtype="float16",
@@ -74,7 +83,10 @@ def _test_different_dim(
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
         self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(
+            self._get_sorted_read_types(sorted_ops[0]), expected_read_types
+        )
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -97,7 +109,8 @@ def test_different_dim_fp16(self):
             ms=[256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp16_static_shapes",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -107,7 +120,8 @@ def test_different_dim_fp16(self):
             ms=[256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp16_dynamic_bs",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -117,7 +131,8 @@ def test_different_dim_fp16(self):
             ms=[34, 67, 256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp16_dynamic_ms",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -127,7 +142,8 @@ def test_different_dim_fp16(self):
             ms=[256],
             ks=[34, 87, 128],
             test_name="fused_elementwise_different_dim_fp16_dynamic_ks",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -137,7 +153,8 @@ def test_different_dim_fp16(self):
             ms=[13, 256],
             ks=[34, 128],
             test_name="fused_elementwise_different_dim_fp16_dynamic_all",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -150,7 +167,8 @@ def test_different_dim_fp32(self):
             ms=[256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp32_static_shapes",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -160,7 +178,8 @@ def test_different_dim_fp32(self):
             ms=[256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp32_dynamic_bs",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -170,7 +189,8 @@ def test_different_dim_fp32(self):
             ms=[34, 67, 256],
             ks=[128],
             test_name="fused_elementwise_different_dim_fp32_dynamic_ms",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -180,7 +200,8 @@ def test_different_dim_fp32(self):
             ms=[256],
             ks=[34, 87, 128],
             test_name="fused_elementwise_different_dim_fp32_dynamic_ks",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -190,7 +211,8 @@ def test_different_dim_fp32(self):
             ms=[13, 256],
             ks=[34, 128],
             test_name="fused_elementwise_different_dim_fp32_dynamic_all",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -203,7 +225,8 @@ def _test_1_shape(
         ns,
         ks,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
+        expected_read_types,
         expected_op_t,
         expected_data_t,
         dtype="float16",
@@ -244,7 +267,10 @@ def _test_1_shape(
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
         self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(
+            self._get_sorted_read_types(sorted_ops[0]), expected_read_types
+        )
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -264,8 +290,9 @@ def test_1_shape_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp16_static_shapes",
-            expected_read_t="half",
-            expected_op_t="half",
+            expected_max_read_t="uint4",
+            expected_read_types=["half", "uint4"],
+            expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
         )
@@ -275,8 +302,9 @@ def test_1_shape_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp16_dynamic_bs",
-            expected_read_t="half",
-            expected_op_t="half",
+            expected_max_read_t="uint4",
+            expected_read_types=["half", "uint4"],
+            expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
         )
@@ -286,7 +314,8 @@ def test_1_shape_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp16_dynamic_ms",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -297,8 +326,9 @@ def test_1_shape_fp16(self):
             ns=[1, 3, 4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp16_dynamic_ns",
-            expected_read_t="half",
-            expected_op_t="half",
+            expected_max_read_t="uint4",
+            expected_read_types=["half", "uint4"],
+            expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
         )
@@ -308,8 +338,9 @@ def test_1_shape_fp16(self):
             ns=[4],
             ks=[1, 4, 7, 16],
             test_name="fused_elementwise_test_1_fp16_dynamic_ks",
-            expected_read_t="half",
-            expected_op_t="half",
+            expected_max_read_t="uint4",
+            expected_read_types=["half", "uint4"],
+            expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
         )
@@ -319,7 +350,8 @@ def test_1_shape_fp16(self):
             ns=[3, 4],
             ks=[1, 16],
             test_name="fused_elementwise_test_1_fp16_dynamic_all",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -333,7 +365,8 @@ def test_1_shape_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp32_static_shapes",
-            expected_read_t="float",
+            expected_max_read_t="uint4",
+            expected_read_types=["float", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -344,7 +377,8 @@ def test_1_shape_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp32_dynamic_bs",
-            expected_read_t="float",
+            expected_max_read_t="uint4",
+            expected_read_types=["float", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -355,7 +389,8 @@ def test_1_shape_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp32_dynamic_ms",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -366,7 +401,8 @@ def test_1_shape_fp32(self):
             ns=[1, 3, 4],
             ks=[16],
             test_name="fused_elementwise_test_1_fp32_dynamic_ns",
-            expected_read_t="float",
+            expected_max_read_t="uint4",
+            expected_read_types=["float", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -377,7 +413,8 @@ def test_1_shape_fp32(self):
             ns=[4],
             ks=[1, 4, 7, 16],
             test_name="fused_elementwise_test_1_fp32_dynamic_ks",
-            expected_read_t="float",
+            expected_max_read_t="uint4",
+            expected_read_types=["float", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -388,7 +425,8 @@ def test_1_shape_fp32(self):
             ns=[3, 4],
             ks=[1, 16],
             test_name="fused_elementwise_test_1_fp32_dynamic_all",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -401,7 +439,8 @@ def _test_chained_broadcasts(
         ns,
         ks,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
+        expected_read_types,
         expected_op_t,
         expected_data_t,
         dtype="float16",
@@ -447,7 +486,10 @@ def _test_chained_broadcasts(
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
         self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(
+            self._get_sorted_read_types(sorted_ops[0]), expected_read_types
+        )
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -468,7 +510,8 @@ def test_chained_shapes_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp16_static_shapes",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -479,7 +522,8 @@ def test_chained_shapes_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_bs",
-            expected_read_t="uint",
+            expected_max_read_t="uint",
+            expected_read_types=["uint", "uint", "uint"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -490,7 +534,8 @@ def test_chained_shapes_fp16(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ms",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -501,7 +546,8 @@ def test_chained_shapes_fp16(self):
             ns=[1, 3, 4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ns",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "uint2"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -512,7 +558,8 @@ def test_chained_shapes_fp16(self):
             ns=[4],
             ks=[1, 4, 7, 16],
             test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_ks",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
             expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
@@ -523,7 +570,8 @@ def test_chained_shapes_fp16(self):
             ns=[3, 4],
             ks=[1, 16],
             test_name="fused_elementwise_chained_broadcasts_fp16_dynamic_all",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -537,7 +585,8 @@ def test_chained_shapes_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp32_static_shapes",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -548,7 +597,8 @@ def test_chained_shapes_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_bs",
-            expected_read_t="uint2",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "uint2"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -559,7 +609,8 @@ def test_chained_shapes_fp32(self):
             ns=[4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ms",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -570,7 +621,8 @@ def test_chained_shapes_fp32(self):
             ns=[1, 3, 4],
             ks=[16],
             test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ns",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -581,7 +633,8 @@ def test_chained_shapes_fp32(self):
             ns=[4],
             ks=[1, 4, 7, 16],
             test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_ks",
-            expected_read_t="uint4",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -592,7 +645,8 @@ def test_chained_shapes_fp32(self):
             ns=[3, 4],
             ks=[1, 16],
             test_name="fused_elementwise_chained_broadcasts_fp32_dynamic_all",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -602,7 +656,8 @@ def _test_consecutive_1s_broadcast(
         self,
         ks,
         test_name,
-        expected_read_t,
+        expected_max_read_t,
+        expected_read_types,
         expected_op_t,
         expected_data_t,
         dtype="float16",
@@ -637,7 +692,10 @@ def _test_consecutive_1s_broadcast(
         sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
         self.assertEqual(len(sorted_ops), 1)
         self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
-        self.assertEqual(sorted_ops[0]._attrs["read_t"], expected_read_t)
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(
+            self._get_sorted_read_types(sorted_ops[0]), expected_read_types
+        )
         self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
@@ -654,15 +712,17 @@ def test_consecutive_1s_broadcast_fp16(self):
         self._test_consecutive_1s_broadcast(
             ks=[32],
             test_name="fused_elementwise_consecutive_1s_broadcast_fp16_static_shapes",
-            expected_read_t="half",
-            expected_op_t="half",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "half"],
+            expected_op_t="half2",
             expected_data_t="half",
             dtype="float16",
         )
         self._test_consecutive_1s_broadcast(
             ks=[1, 5, 7, 32],
             test_name="fused_elementwise_consecutive_1s_broadcast_fp16_dynamic_shapes",
-            expected_read_t="half",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half"],
             expected_op_t="half",
             expected_data_t="half",
             dtype="float16",
@@ -673,7 +733,8 @@ def test_consecutive_1s_broadcast_fp32(self):
         self._test_consecutive_1s_broadcast(
             ks=[32],
             test_name="fused_elementwise_consecutive_1s_broadcast_fp32_static_shapes",
-            expected_read_t="float",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
@@ -681,12 +742,204 @@ def test_consecutive_1s_broadcast_fp32(self):
         self._test_consecutive_1s_broadcast(
             ks=[1, 5, 7, 32],
             test_name="fused_elementwise_consecutive_1s_broadcast_fp32_dynamic_shapes",
-            expected_read_t="float",
+            expected_max_read_t="float",
+            expected_read_types=["float", "float"],
             expected_op_t="float",
             expected_data_t="float",
             dtype="float32",
         )
 
+    def _test_vectorization(
+        self,
+        batch_sizes,
+        ms,
+        ks,
+        test_name,
+        expected_max_read_t,
+        expected_read_types,
+        expected_op_t,
+        expected_data_t,
+        dtype="float16",
+    ):
+        """
+        Test add(add(X0(B, M0, K0), X1(B, M1, K1)), X2(B, M2, K2))
+        """
+
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_dim")
+
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(ms[0]), IntImm(ks[0])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(ms[1]), IntImm(ks[1])],
+            dtype=dtype,
+            name="input1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(ms[2]), IntImm(ks[2])],
+            dtype=dtype,
+            name="input2",
+            is_input=True,
+        )
+        add_1 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        output = ops.elementwise(FuncEnum.ADD)(add_1, X2)
+        output._attrs["name"] = "output0"
+        output._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(output, target, "./tmp", test_name)
+
+        debug_sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(debug_sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
+        self.assertEqual(sorted_ops[0]._attrs["max_read_t"], expected_max_read_t)
+        self.assertEqual(
+            self._get_sorted_read_types(sorted_ops[0]), expected_read_types
+        )
+        self.assertEqual(sorted_ops[0]._attrs["op_t"], expected_op_t)
+        self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
+
+        for batch_size in batch_sizes:
+            x0_pt = get_random_torch_tensor([batch_size, ms[0], ks[0]], dtype=dtype)
+            x1_pt = get_random_torch_tensor([batch_size, ms[1], ks[1]], dtype=dtype)
+            x2_pt = get_random_torch_tensor([batch_size, ms[2], ks[2]], dtype=dtype)
+            output_pt = (x0_pt + x1_pt) + x2_pt
+            inputs = {"input0": x0_pt, "input1": x1_pt, "input2": x2_pt}
+            output = torch.empty_like(output_pt)
+            module.run_with_tensors(inputs, [output])
+            self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
+
+    def test_vectorization_fp16(self):
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 1, 2],
+            ks=[2, 2, 1],
+            test_name="fused_elementwise_vectorization_fp16_1",
+            expected_max_read_t="uint",
+            expected_read_types=["uint", "uint", "half"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[4, 1024],
+            ms=[1, 15, 1],
+            ks=[4, 4, 1],
+            test_name="fused_elementwise_vectorization_fp16_2",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "half"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[10, 12],
+            ms=[1, 1, 1],
+            ks=[16, 1, 16],
+            test_name="fused_elementwise_vectorization_fp16_3",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "half", "uint4"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[8],
+            ms=[8, 1, 8],
+            ks=[127, 127, 1],
+            test_name="fused_elementwise_vectorization_fp16_4",
+            expected_max_read_t="half",
+            expected_read_types=["half", "half", "half"],
+            expected_op_t="half",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[8],
+            ms=[8, 1, 8],
+            ks=[1, 1, 1],
+            test_name="fused_elementwise_vectorization_fp16_5",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "half", "uint4"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 2, 1],
+            ks=[6, 6, 6],
+            test_name="fused_elementwise_vectorization_fp16_6",
+            expected_max_read_t="uint",
+            expected_read_types=["uint", "uint", "uint"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 1, 1],
+            ks=[12, 12, 12],
+            test_name="fused_elementwise_vectorization_fp16_7",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "uint2"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_vectorization_fp32(self):
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 1, 2],
+            ks=[4, 1, 1],
+            test_name="fused_elementwise_vectorization_fp32_1",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "float", "float"],
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+        self._test_vectorization(
+            batch_sizes=[1, 128],
+            ms=[2, 1, 1],
+            ks=[2, 2, 1],
+            test_name="fused_elementwise_vectorization_fp32_2",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "float"],
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 2, 2],
+            ks=[8, 8, 8],
+            test_name="fused_elementwise_vectorization_fp32_3",
+            expected_max_read_t="uint4",
+            expected_read_types=["uint4", "uint4", "uint4"],
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[2, 2, 1],
+            ks=[2, 2, 2],
+            test_name="fused_elementwise_vectorization_fp32_4",
+            expected_max_read_t="uint2",
+            expected_read_types=["uint2", "uint2", "uint2"],
+            expected_op_t="float",
+            expected_data_t="float",
+            dtype="float",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From b23c555692422ad6762c9f77b32abe2e2f5c711a Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 1 Mar 2023 00:53:41 -0800
Subject: [PATCH 200/638] Add jagged Tensor support to elementwise front-end
 (#322)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/322

Jagged Tensor support is added to the `elementwise` front-end. In cases when there are mixed jagged and dense inputs (with the dense input's shape overlapping with at least one jagged dimension in the jagged inputs), the inputs are treated in a special way.

This imposes the following rules:

1. All jagged inputs must have the same `JaggedIntVar` and rank.

2. The output of a mixed jagged / dense elementwise is always a jagged Tensor, with the `JaggedIntVar` and rank equal to those of the jagged inputs (the inner dimensions of the jagged output may be broadcasted from the input shapes).

3. Broadcasting is possible in the static dims of the dense inputs and inner dims (after the `JaggedIntVar`) of the jagged inputs.

4. The rank of the dense inputs cannot be higher than the rank of the "expanded" jagged inputs: i.e., with every jagged dim treated as a separate dim. E.g., we can't add `[sum_B(N_B),D]`-shaped jagged input to a `[B,N1,N2,D]`-shaped dense input.

5. If the rank of a dense input is equal to the rank of the "expanded" jagged input, the leftmost (IntVar) dimension in the dense shape must be equal to the `batch_dim` encoded within the jagged input's `JaggedIntVar`.

6. The dimensions of the dense inputs corresponding to the jagged dimensions of the jagged inputs (encoded within the `JaggedIntVar`) must be static (IntImm) with the value either equal to 1 (then broadcasted) or equal to the `jagged_dim.max_value()`. (Otherwise the semantics of the mixed jagged / dense elementwise operation is unclear.)

7. In the `fused_elementwise` front-end, two new flags are added to the `self._attrs`: `mixed_jagged_dense_inputs: bool` and `output_volume: List[IntVar]`. This is intended to facilitate the back-end code generation in the following diff.

8. In cases where all inputs are jagged, all inputs are dense, or the highest rank of the dense input is less than or equal to the number of inner dims of the jagged inputs (i.e., the largest dense input shape doesn't overlap with the jagged dimensions of the jagged inputs), the elementwise falls back to the traditional way: the jagged inputs are then treated as dense inputs with the `JaggedIntVar` interpreted as a normal `IntVar` (equal to `total_length`). (Now that the implementation-specific code fragment from has been moved from the `fused_elementwise` front-end to the `elementwise_common` back-end in V2, this is not apparent from this diff and will be visible in the following one; stay tuned.)

Reviewed By: ipiszy

Differential Revision: D43400183

fbshipit-source-id: 25b76f6650013643c8ce5705a2d482c2ed53be1d
---
 .../compiler/ops/common/elementwise.py        | 127 +++++++++++++++---
 1 file changed, 110 insertions(+), 17 deletions(-)

diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index c236675b2..dfea0367c 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -27,6 +27,103 @@
 # pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
 
 
+def _broadcast_dense_shapes(shapes: List[List[IntVar]]) -> List[IntVar]:
+    if len(shapes) == 1:
+        return list(shapes[0])
+
+    max_shape = None
+    for shape in shapes:
+        if max_shape is None:
+            max_shape = list(shape)
+        broadcastable, new_max_shape = shape_utils.get_broadcast_max_shape(
+            max_shape, shape
+        )
+        if not broadcastable:
+            raise ValueError(
+                "Input shapes of the elementwise op are not compatible! "
+                f"Shape1: {max_shape}, shape2: {shape}"
+            )
+        max_shape = new_max_shape
+
+    return max_shape
+
+
+def _broadcast_jagged_shapes(shapes: List[List[IntVar]]) -> List[IntVar]:
+    if len(shapes) == 1:
+        return list(shapes[0])
+
+    rank = len(shapes[0])
+    first_dim = shapes[0][0]
+    for shape in shapes[1:]:
+        other_first_dim = shape[0]
+        if other_first_dim != first_dim:
+            raise ValueError(
+                "All jagged inputs of an elementwise op must "
+                "have the same first dim (JaggedIntVar), but got "
+                f"{first_dim} != {other_first_dim}"
+            )
+        other_rank = len(shape)
+        if other_rank != rank:
+            raise ValueError(
+                "All jagged inputs of an elementwise op "
+                "must have the same rank, but got "
+                f"{rank} != {other_rank}"
+            )
+
+    suffix_shapes = [shape[1:] for shape in shapes]
+    max_suffix_shape = suffix_shapes[0]
+    for suffix_shape in suffix_shapes[1:]:
+        broadcastable, new_max_shape = shape_utils.get_broadcast_max_shape(
+            max_suffix_shape, suffix_shape
+        )
+        if not broadcastable:
+            raise ValueError(
+                "Jagged input suffix shapes of the elementwise op are not compatible! "
+                f"Shape1: {max_suffix_shape}, shape2: {suffix_shape}"
+            )
+        max_suffix_shape = new_max_shape
+
+    return [first_dim] + max_suffix_shape
+
+
+def _broadcast_dense_and_jagged_shape(
+    dense_shape: List[IntVar],
+    jagged_shape: List[IntVar],
+) -> List[IntVar]:
+    jagged_first_dim = jagged_shape[0]
+    jagged_suffix_shape = jagged_shape[1:]
+    dense_suffix_shape = dense_shape[-len(jagged_suffix_shape) :]
+    broadcastable, max_suffix_shape = shape_utils.get_broadcast_max_shape(
+        jagged_suffix_shape, dense_suffix_shape
+    )
+    if not broadcastable:
+        raise ValueError(
+            "The suffix shapes of jagged and dense inputs of the elementwise op are not compatible! "
+            f"Jagged suffix shape: {jagged_suffix_shape}, dense suffix shape: {dense_suffix_shape}"
+        )
+
+    if len(dense_shape) >= len(jagged_shape):
+        dense_prefix_shape = dense_shape[: -len(dense_suffix_shape)]
+        jagged_max_dense_prefix_shape = jagged_first_dim.get_max_dense_shape()
+        if len(dense_prefix_shape) > len(jagged_max_dense_prefix_shape):
+            raise ValueError(
+                "The rank of dense inputs of an elementwise op can't be "
+                "higher than the rank of the jagged inputs (when treating "
+                "the jagged dims as separate dims)."
+            )
+
+        broadcastable, _ = shape_utils.get_broadcast_max_shape(
+            jagged_max_dense_prefix_shape, dense_prefix_shape
+        )
+        if not broadcastable:
+            raise ValueError(
+                f"JaggedIntVar of the jagged inputs ({jagged_first_dim}) is not compatible "
+                f"with the broadcasted prefix shape of the dense inputs ({dense_prefix_shape})."
+            )
+
+    return [jagged_first_dim] + max_suffix_shape
+
+
 class elementwise(Operator):
     """elementwise operator definition."""
 
@@ -58,22 +155,19 @@ def _infer_shapes(self, *args: Tensor) -> List[IntVar]:
             raise RuntimeError(
                 "Elementwise op {} doesn't have inputs!".format(self._attrs["func"])
             )
-        max_shape = None
-        for tensor in args:
-            shape = tensor._attrs["shape"]
-            if max_shape is None:
-                max_shape = list(shape)
-            broadcastable, new_max_shape = shape_utils.get_broadcast_max_shape(
-                max_shape, shape
-            )
-            if not broadcastable:
-                raise RuntimeError(
-                    "Tensor shapes of elementwise ops are not compatible! Shape1: {}, shape2: {}".format(
-                        max_shape, shape
-                    )
-                )
-            max_shape = new_max_shape
-        return max_shape
+
+        dense_shapes = [arg._attrs["shape"] for arg in args if not arg.is_jagged()]
+        jagged_shapes = [arg._attrs["shape"] for arg in args if arg.is_jagged()]
+
+        max_dense_shape = _broadcast_dense_shapes(dense_shapes)
+        if not jagged_shapes:
+            return max_dense_shape
+
+        max_jagged_shape = _broadcast_jagged_shapes(jagged_shapes)
+        if not dense_shapes:
+            return max_jagged_shape
+
+        return _broadcast_dense_and_jagged_shape(max_dense_shape, max_jagged_shape)
 
     def __call__(self, *args: Tensor) -> Tensor:
         converted_args = []
@@ -95,7 +189,6 @@ def __call__(self, *args: Tensor) -> Tensor:
                     raise NotImplementedError(
                         f"Type promotions are not supported; got dtype {arg.dtype()}, but expected {common_dtype}"
                     )
-
             else:
                 raise RuntimeError(
                     f"Unsupported data type {arg} in elementwise {self}!"

From fb4e65a00ea2e244e901119ef5569fed173ab6fb Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 1 Mar 2023 01:06:43 -0800
Subject: [PATCH 201/638] Fix make_jagged op's back-end (#337)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/337

This is a follow-up diff to D43225824 (https://github.com/facebookincubator/AITemplate/commit/0e87b5822ee36ffda4ddf3a79b25629a107280c4), fixing the following two issues left in the `make_jagged` back-end:

1. The block size in the `check_offsets` kernel launch is made fixed, not depending on the input size. Previously, as the block size was dependent on the maximum offsets length, the kernel launch failed when the offsets were longer than max. threads per block (1024 on A100).

2. The `stream` parameter is added to the `make_jagged` function signature and used for the kernel launch.

Reviewed By: ipiszy, chenyang78

Differential Revision: D43610268

fbshipit-source-id: e5715404681cd52460d42fddf412c2d3c3849aa6
---
 .../backend/cuda/view_ops/make_jagged.py      | 64 +++++++++++++------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index b9e17d17b..de0adbe59 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -41,6 +41,9 @@
 #include "jagged.h"
 
 
+#define THREADS_PER_BLOCK 128
+
+
 namespace {
 
 struct OffsetBounds {
@@ -53,57 +56,72 @@
   {{offsets_struct_type}} offsets,
   OffsetBounds bounds
 ) {
-  int64_t length = offsets.lengths[blockIdx.x];
-  const {{offsets_type}}* data = offsets.data[blockIdx.x];
+  int64_t dim_id = blockIdx.y;
+  int64_t offset_id = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
 
-  if (threadIdx.x >= length - 1) {
+  int64_t length = offsets.lengths[dim_id];
+  const {{offsets_type}}* data = offsets.data[dim_id];
+
+  if (offset_id >= length - 1) {
     // out of bounds of the offset array
     return;
   }
 
-  {{offsets_type}} group_size = data[threadIdx.x + 1] - data[threadIdx.x];
-  if (group_size < bounds.min_values[blockIdx.x] || group_size > bounds.max_values[blockIdx.x]) {
+  {{offsets_type}} group_size = data[offset_id + 1] - data[offset_id];
+  if (group_size < bounds.min_values[dim_id] || group_size > bounds.max_values[dim_id]) {
     printf(
-      "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+      "\\n[func name: {{func_name}}, block: [%d, %d, %d], thread: [%d, %d, %d]]: "
       "Error: the offset difference %d is out of bounds of the jagged dimension %d (min: %d, max: %d).",
       (int32_t)blockIdx.x,
+      (int32_t)blockIdx.y,
+      (int32_t)blockIdx.z,
       (int32_t)threadIdx.x,
+      (int32_t)threadIdx.y,
+      (int32_t)threadIdx.z,
       (int32_t)group_size,
-      (int32_t)blockIdx.x,
-      (int32_t)bounds.min_values[blockIdx.x],
-      (int32_t)bounds.max_values[blockIdx.x]
+      (int32_t)dim_id,
+      (int32_t)bounds.min_values[dim_id],
+      (int32_t)bounds.max_values[dim_id]
     );
     __trap();
   }
 
-  if (threadIdx.x == 0) {
+  if (offset_id == 0) {
     {{offsets_type}} first_offset = data[0];
     if (first_offset != 0)
     {
       printf(
-        "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+      "\\n[func name: {{func_name}}, block: [%d, %d, %d], thread: [%d, %d, %d]]: "
         "Error: the first offset of the jagged dimension %d is non-zero: %d.",
         (int32_t)blockIdx.x,
+        (int32_t)blockIdx.y,
+        (int32_t)blockIdx.z,
         (int32_t)threadIdx.x,
-        (int32_t)blockIdx.x,
+        (int32_t)threadIdx.y,
+        (int32_t)threadIdx.z,
+        (int32_t)dim_id,
         (int32_t)first_offset
       );
       __trap();
     }
   }
 
-  if (threadIdx.x == length - 2) {
+  if (offset_id == length - 2) {
     {{offsets_type}} last_offset = data[length - 1];
-    if (last_offset != bounds.last_values[blockIdx.x])
+    if (last_offset != bounds.last_values[dim_id])
     {
       printf(
-        "\\n[func name: {{func_name}}, blockIdx.x: %d, threadIdx.x: %d]: "
+      "\\n[func name: {{func_name}}, block: [%d, %d, %d], thread: [%d, %d, %d]]: "
         "Error: the last offset of the jagged dimension %d is incorrect: %d (must be %d).",
         (int32_t)blockIdx.x,
+        (int32_t)blockIdx.y,
+        (int32_t)blockIdx.z,
         (int32_t)threadIdx.x,
-        (int32_t)blockIdx.x,
+        (int32_t)threadIdx.y,
+        (int32_t)threadIdx.z,
+        (int32_t)dim_id,
         (int32_t)last_offset,
-        (int32_t)bounds.last_values[blockIdx.x]
+        (int32_t)bounds.last_values[dim_id]
       );
       __trap();
     }
@@ -120,7 +138,8 @@
 {% endfor %}
   {{offsets_struct_type}}& offsets,
   int64_t* batch_dim,
-  int64_t total_length
+  int64_t total_length,
+  cudaStream_t stream
 ) {
 {% for idx in range(num_offsets) %}
     offsets.lengths[{{idx}}] = offsets_length_{{idx}};
@@ -154,7 +173,8 @@
     bounds.last_values[{{idx}}] = {{ "offsets.lengths[" + ((idx + 1) | string) + "] - 1" if idx < num_offsets - 1 else "total_length" }};
 {% endfor %}
 
-    check_offsets<<<{{num_offsets}}, max_offset_length - 1, 0, 0>>>(offsets, bounds);
+    dim3 grid_size((max_offset_length - 1 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, {{num_offsets}});
+    check_offsets<<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(offsets, bounds);
 }
 """,
     trim_blocks=True,
@@ -170,7 +190,8 @@
 {% endfor %}
   {{offsets_struct_type}}&,
   int64_t*,
-  int64_t
+  int64_t,
+  cudaStream_t
 );
 """,
     trim_blocks=True,
@@ -186,7 +207,8 @@
 {% endfor %}
 {{indent}}  {{offsets_var_name}},
 {{indent}}  &{{batch_dim_name}},
-{{indent}}  {{source_first_dim_name}}
+{{indent}}  {{source_first_dim_name}},
+{{indent}}  stream
 {{indent}});
 """,
     trim_blocks=True,

From 6a559c18e44f352d963520f85d367df5cbb8d31c Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Wed, 1 Mar 2023 08:34:17 -0800
Subject: [PATCH 202/638] Use pycuda to get GPU compute capability in
 detect_target (#330)

Summary:
New version of `nvidia-smi` allows to get GPU Compute Capability. Command example
```
$ nvidia-smi --query-gpu=compute_cap --format=csv,noheader
8.6
8.6
8.6
8.6
```
Hovewer, circleCI pipeline use image: "ubuntu-2004-cuda-11.4:202110-01" - it has driver-470, cuda-11.4 which do not support `--query-gpu=compute_cap`.

Alternative solution is to use `pycuda` python package.
It provides API to get Major and Minor Compute Capability of a particular Device.

Once we get the number we can convert it to SM number supported by cutlass [generator.py](https://github.com/AITemplate/cutlass/blob/master/tools/library/scripts/generator.py#L4824).

Currently it supports SM 50, 60, 61, 70, 75, 80, 90.

Not sure if we want to support SM less than 70. So, we can focus on 70, 75, 80 and 90 only.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/330

Reviewed By: alexanderguzhva

Differential Revision: D43670764

Pulled By: tenpercent

fbshipit-source-id: 58f03d611f4d1755c6d72180ed07b75e950abed7
---
 .circleci/config.yml                       |  4 +++-
 docker/Dockerfile.cuda                     |  3 +++
 python/aitemplate/testing/detect_target.py | 25 +++++++++++-----------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c86323420..a9c258eff 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -19,12 +19,15 @@ setup_env: &setup_env
       name: Setup environment
       command: |
         for i in {1..3}; do
+          echo 'export PATH=/usr/local/cuda/bin:$PATH' >> $BASH_ENV &&
+          source "$BASH_ENV"
           python3.8 --version &&
           python3.8 -m pip install --upgrade pip &&
           cd /home/circleci/project/python &&
           python3.8 setup.py bdist_wheel &&
           sudo python3.8 -m pip install --no-input dist/*.whl &&
           cd /home/circleci/project &&
+          python3.8 -m pip install pycuda &&
           python3.8 -m pip install pytest &&
           python3.8 -m pip install torch &&
           python3.8 -m pip install numpy &&
@@ -35,7 +38,6 @@ setup_env: &setup_env
           git submodule sync &&
           git submodule update --init &&
           echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV &&
-          echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV &&
           echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV &&
           echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV &&
           break || sleep 5;
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 0461f45bf..4f75bf741 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -40,6 +40,9 @@ RUN bash /Install/install_doc_dep.sh
 # install Pytorch
 RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
 
+# install Pycuda
+RUN pip3 install pycuda
+
 # for detection
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
 RUN bash /Install/install_detection_deps.sh
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 7c250d673..df8d1159a 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -32,22 +32,21 @@
 
 def _detect_cuda():
     try:
-        proc = Popen(
-            ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv"],
-            stdout=PIPE,
-            stderr=PIPE,
-        )
-        stdout, stderr = proc.communicate()
-        stdout = stdout.decode("utf-8")
-        if "H100" in stdout:
+        import pycuda.driver as drv
+
+        drv.init()
+        major, minor = drv.Device(0).compute_capability()
+        comp_cap = major * 10 + minor
+        if comp_cap >= 90:
             return "90"
-        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30", "RTX 40"]):
+        elif comp_cap >= 80:
             return "80"
-        if "V100" in stdout:
-            return "70"
-        if "T4" in stdout:
+        elif comp_cap >= 75:
             return "75"
-        return None
+        elif comp_cap >= 70:
+            return "70"
+        else:
+            return None
     except Exception:
         return None
 

From 95f3b2bedd052edf5b5ce82872e3dee06dc9f361 Mon Sep 17 00:00:00 2001
From: tissue3 <173666635@qq.com>
Date: Wed, 1 Mar 2023 12:53:10 -0800
Subject: [PATCH 203/638] Remove noop split (#346)

Summary:
Remove split op when split covers the entire dim, i.e. the split is actually a noop

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/346

Test Plan:
```
python tests/unittest/compiler/test_split_getitem.py
```

Reviewed By: chenyang78, wushirong

Differential Revision: D43684207

Pulled By: tissue3

fbshipit-source-id: f7cf1e8d8ea50f82aedbe0e5cf1a319bd38cf10f
---
 .../transform/transform_memory_ops.py         |  32 ++-
 .../unittest/compiler/test_split_full_idx.py  | 193 ++++++++++++++++++
 2 files changed, 224 insertions(+), 1 deletion(-)
 create mode 100644 tests/unittest/compiler/test_split_full_idx.py

diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index 34e1c68a4..312328bef 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -20,7 +20,7 @@
 
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...utils import graph_utils
+from ...utils import graph_utils, shape_utils
 from ..base import Operator, Tensor
 from . import transform_utils
 
@@ -188,6 +188,35 @@ def _merge_split_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noqa: C
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
 
+def _eliminate_split_full_idx(sorted_graph: List[Tensor]) -> List[Tensor]:
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) != 1:
+            continue
+        src_op = list(src_ops)[0]
+        if src_op._attrs["op"] != "split":
+            continue
+        split_op = src_op
+        dim = split_op._attrs["split_dim"]
+        split_sizes = split_op._attrs["split_sizes"]
+        assert len(split_op._attrs["inputs"]) == 1
+        shape = split_op._attrs["inputs"][0]._attrs["shape"]
+        if (
+            len(split_sizes) == 1
+            and shape_utils.is_static_dimension(shape, dim)
+            and shape[dim]._attrs["values"][0] == split_sizes[0]
+        ):
+            input_tensor = split_op._attrs["inputs"][0]
+            output_tensor = split_op._attrs["outputs"][0]
+            # tensor can not be input and output
+            if output_tensor._attrs["is_output"] and input_tensor._attrs["is_input"]:
+                continue
+            transform_utils.remove_single_tensor_op_from_sorted_graph(split_op)
+
+    sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
 def transform_memory_ops(
     sorted_graph: List[Tensor], workdir: str = None
 ) -> List[Tensor]:
@@ -196,6 +225,7 @@ def transform_memory_ops(
     """
 
     funcs = [
+        _eliminate_split_full_idx,
         _merge_split_and_cat,
         _eliminate_cat,
     ]
diff --git a/tests/unittest/compiler/test_split_full_idx.py b/tests/unittest/compiler/test_split_full_idx.py
new file mode 100644
index 000000000..8984f6d0c
--- /dev/null
+++ b/tests/unittest/compiler/test_split_full_idx.py
@@ -0,0 +1,193 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+class SplitGetItemTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SplitGetItemTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_split_getitem(
+        self,
+        shape,
+        split_sections,
+        split_dim,
+        test_name="split_full_idx",
+        dtype="float16",
+    ):
+        assert len(shape) == 3, f"expected shape to be 3 but got {shape}"
+        target = detect_target()
+        M, N, K = shape
+
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+        D = Tensor(
+            shape=[M, N],
+            dtype=dtype,
+            name="input_3",
+            is_input=True,
+        )
+        Y1 = ops.split()(D, split_sections, split_dim)
+        Y2 = ops.getitem()(Y1, 0)
+        Y = ops.gemm_rcr_bias_sigmoid_mul_tanh()(X, W, B, Y2)
+
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+        src_ops = set()
+        for tensor in module.debug_sorted_graph:
+            src_ops |= set(tensor.src_ops())
+            for src_op in tensor.src_ops():
+                assert not src_op._attrs["op"].startswith("split"), (
+                    f"Ecountered split op {src_op}."
+                    "Shouldn't have split op after graph optmizaiton"
+                )
+        assert len(src_ops) == 1
+
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
+        D_pt = get_random_torch_tensor([M, N], dtype)
+        Y_pt = torch.tanh(
+            torch.sigmoid(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)) * D_pt
+        )
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt, "input_3": D_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_split_getitem_to_noop(self):
+        self._test_split_getitem(
+            shape=(16, 32, 10),
+            split_sections=[16],
+            split_dim=0,
+        )
+        self._test_split_getitem(
+            shape=(16, 32, 10),
+            split_sections=[32],
+            split_dim=1,
+        )
+
+    def _test_split_getitem_remove_output(
+        self,
+        shape,
+        split_sections,
+        split_dim,
+        test_name="split_remove_output",
+        dtype="float16",
+    ):
+        assert len(shape) == 3, f"expected shape to be 3 but got {shape}"
+        target = detect_target()
+        M, N, K = shape
+
+        X = Tensor(
+            shape=[M, K],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[N, K],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[N],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+        D = Tensor(
+            shape=[M, N],
+            dtype=dtype,
+            name="input_3",
+            is_input=True,
+        )
+        Y1 = ops.gemm_rcr_bias_sigmoid_mul_tanh()(X, W, B, D)
+        Y2 = ops.split()(Y1, split_sections, split_dim)
+        Y = ops.getitem()(Y2, 0)
+
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+        src_ops = set()
+        for tensor in module.debug_sorted_graph:
+            src_ops |= set(tensor.src_ops())
+            for src_op in tensor.src_ops():
+                assert not src_op._attrs["op"].startswith("split"), (
+                    f"Ecountered split op {src_op}."
+                    "Shouldn't have split op after graph optmizaiton"
+                )
+        assert len(src_ops) == 1
+
+        X_pt = get_random_torch_tensor([M, K], dtype)
+        W_pt = get_random_torch_tensor([N, K], dtype)
+        B_pt = get_random_torch_tensor([N], dtype)
+        D_pt = get_random_torch_tensor([M, N], dtype)
+        Y_pt = torch.tanh(
+            torch.sigmoid(torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)) * D_pt
+        )
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt, "input_3": D_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_split_getitem_remove_output(self):
+        self._test_split_getitem_remove_output(
+            shape=(16, 32, 10),
+            split_sections=[16],
+            split_dim=0,
+        )
+        self._test_split_getitem_remove_output(
+            shape=(16, 32, 10),
+            split_sections=[32],
+            split_dim=1,
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 721107783cc56cf2b8715970b7f64338f8ca4e68 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Wed, 1 Mar 2023 14:29:56 -0800
Subject: [PATCH 204/638] Add time to Dockerfiles (#338)

Summary:
Changes:
- Add `apt install -y time` to `install_basic_dep.sh`.

Issue: https://github.com/facebookincubator/AITemplate/issues/329
Related PR: [Time each make command PR307](https://github.com/facebookincubator/AITemplate/pull/307)
ipiszy

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/338

Reviewed By: khabinov, wushirong

Differential Revision: D43687222

Pulled By: ipiszy

fbshipit-source-id: fced78a9591f4745f9be33ac097bc148bb1a4cc8
---
 docker/install/install_basic_dep.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/install/install_basic_dep.sh b/docker/install/install_basic_dep.sh
index 801ef53ef..18f37f628 100644
--- a/docker/install/install_basic_dep.sh
+++ b/docker/install/install_basic_dep.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
 
+apt install -y time
 pip3 install numpy
 pip3 install jinja2

From 042465dcca664b2437749a56e4360c401f883f44 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 1 Mar 2023 15:34:34 -0800
Subject: [PATCH 205/638] Add jagged Tensor support to fused_elementwise
 back-end (#342)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/342

In this diff, the `fused_elementwise` back-end is extended to support mixed jagged Tensor / dense Tensor inputs, along with all the pre-existing features of the back-end (broadcasting, vectorized I/O, nested expression generation, etc.). The added functionality should be orthogonal to the existing one. The diff also adds a few Python / PyTorch-based tools in the `testing/jagged_utils.py` that can be used for reference computation in future jagged-aware ops' unit tests.

## Implementation Details

When `fused_elementwise` has are only jagged inputs, only dense inputs, or the highest-rank dense input shape does not overlap with the jagged dimensions of the jagged inputs (i.e., with the `JaggedIntVar` in the jagged input shape), all jagged inputs are treated as dense (with the first `total_length` dimension followed by non-jagged dimensions), and the implementation relies on the pre-existing back-end implementation for simplicity and efficiency.

For the cases when the shape of at least one of the dense inputs overlaps with the jagged dimensions of the jagged inputs, the semantics of the elementwise operation is that the elements of the jagged inputs are used in computation together with the elements of the dense inputs at the corresponding positions. Consequently, the elements in the dense inputs not having the counterparts in the jagged inputs are ignored. (It is assumed that the dense inputs can either be broadcasted to or always have elements where the jagged inputs do.)

This suggests at least two possible approaches to the kernel implementation, both of which are included in this diff:

- In the **dense space indexing**, the kernel is launched on the `num_elements` in the dense output volume: the smallest rectangular volume that fits the jagged outputs (as well as all the dense and jagged inputs, by assumption of the front-end). The flat `dense_idx` for accessing the dense inputs is readily available from the CUDA indices (`blockIdx` / `threadIdx`). The flat `jagged_idx` for accessing the jagged inputs and outputs is computed from the `dense_idx` using the `offsets` of the jagged inputs / outputs (see the `KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE` for the details of the index computation). The `jagged_idx` computation requires accessing the `offsets` only once (at two consecutive positions), which leads to `O(1)` reading of the `offsets`. If it turns out that the jagged output does not have an element corresponding to a certain `dense_idx`, the thread returns prematurely.

- In the **jagged space indexing**, the kernel is launched on the `num_elements` in the jagged output shape: the `total_length` dimension followed by the dense dimensions in the jagged output (there can be more than one). The flat `jagged_idx` is readily available from the CUDA indices. The flat `dense_idx` is computed from the `jagged_idx` using the `offsets` of the jagged inputs / outputs (see the `KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE` for the details of the index computation). The `dense_idx` is computed using binary search within the `offsets` array, hence the `offsets` are read `O(log n)` times in each thread (with `n` being the length of the `offsets`).

The two approaches offer a trade-off:

- The dense space indexing runs in the dense output volume. Therefore, the number of threads may, in general, be much larger than the number of elements in the jagged output Tensors; and there may be many threads that return prematurely without doing any work. The index computation, on the other hand, is simple and lightweight.

- The jagged space indexing runs in the jagged output shape. As a result, the number of threads equals (up to the fixed gap of `FUSED_ELE_THREAD_SIZE`) to the number of elements in the jagged output Tensors; and (almost) each thread does meaningful work. But the binary search-based index computation may be heavy.

Reviewed By: ipiszy

Differential Revision: D43482363

fbshipit-source-id: 8194edda7145a839d53e0a581f24e1336dfdeb60
---
 .../backend/common/elementwise_common.py      | 469 +++++++++++++-
 python/aitemplate/testing/jagged_utils.py     | 314 ++++++++++
 tests/unittest/ops/test_jagged_elementwise.py | 591 ++++++++++++++++++
 3 files changed, 1345 insertions(+), 29 deletions(-)
 create mode 100644 python/aitemplate/testing/jagged_utils.py
 create mode 100644 tests/unittest/ops/test_jagged_elementwise.py

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 3997c8c9c..1655d40f8 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -18,14 +18,15 @@
 
 import math
 from dataclasses import dataclass
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import jinja2
 from aitemplate.backend.backend_spec import BackendSpec
 
-from ...compiler.base import IntImm, IntVar, Operator, Tensor
+from ...compiler.base import IntImm, IntVar, JaggedIntVar, Operator, Tensor
 from ...compiler.tensor_accessor import TensorAccessor
 from ...utils import alignment as alignment_utils, shape_utils
+from ..target import Target
 from . import tensor_accessor_codegen
 
 CONSTANT_TEMPLATE = jinja2.Template(
@@ -65,6 +66,97 @@
 )
 
 
+KERNEL_COMPUTE_IDX_TEMPLATE = jinja2.Template(
+    """
+  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
+  if (dense_idx_elem >= n_elements) {
+    return;
+  }
+    """
+)
+
+
+KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE = jinja2.Template(
+    """
+  // first compute the dense_idx from the blockIdx and threadIdx
+  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
+  if (dense_idx_elem >= n_elements) {
+    return;
+  }
+
+  // then compute the jagged_idx from the dense_idx_elem
+  {{index_type}} jagged_idx;
+  {
+    // dense_coord is along consecutive dense dimensions
+    // jagged_coord is along the total_length of the jagged Tensor
+    {{index_type}} dense_coord = dense_idx_elem / ({{strides[0]}});
+    {{index_type}} running_idx = dense_idx_elem % ({{strides[0]}});
+    {{offsets_type}} jagged_coord = 0, prev_offset, next_offset;
+
+{% for i in range(num_offsets) %}
+    prev_offset = offsets.data[{{i}}][jagged_coord + dense_coord];
+    next_offset = offsets.data[{{i}}][jagged_coord + dense_coord + 1];
+    dense_coord = running_idx / ({{strides[i+1]}});
+    running_idx = running_idx % ({{strides[i+1]}});
+    if (dense_coord >= next_offset - prev_offset) {
+        // this element of the dense volume is
+        // out of bounds of the jagged Tensor
+        return;
+    }
+    jagged_coord = prev_offset;
+
+{% endfor %}
+    jagged_coord += dense_coord;
+    jagged_idx = (jagged_coord * ({{strides[num_offsets]}}) + running_idx) / N_ELEMENTS_PER_THREAD;
+  }
+    """
+)
+
+
+KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE = jinja2.Template(
+    """
+  // first compute the jagged_idx from the blockIdx and threadIdx
+  const {{index_type}} jagged_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} jagged_idx_elem = jagged_idx * N_ELEMENTS_PER_THREAD;
+  if (jagged_idx_elem >= n_elements) {
+    return;
+  }
+
+  // then compute the dense_idx from the jagged_idx_elem
+  {{index_type}} dense_idx = jagged_idx_elem % ({{strides[num_offsets]}});
+  {
+    {{offsets_type}} left, right, mid, tmp_value, offset_idx, offset_value;
+    {{index_type}} running_idx = jagged_idx_elem / ({{strides[num_offsets]}});
+
+    // binary search to determine the dense coord along the current jagged dimension
+    // the goal is to find the index of the maximum offset value in offsets.data[{{i}}]
+    // which is <= the running_idx. the (running_idx - offset_value) will then indicate
+    // the dense cooord along the current jagged dimension.
+{% for i in range(num_offsets - 1, -1, -1) %}
+    left = 0;
+    right = offsets.lengths[{{i}}] - 1;
+    while (left <= right) {
+        mid = (left + right) >> 1;
+        tmp_value = offsets.data[{{i}}][mid];
+        if (tmp_value <= running_idx) {
+            offset_idx = mid;
+            offset_value = tmp_value;
+            left = mid + 1;
+        } else {
+            right = mid - 1;
+        }
+    }
+    dense_idx += (running_idx - offset_value) * ({{strides[i+1]}});
+    running_idx = offset_idx;
+
+{% endfor %}
+    dense_idx = (dense_idx + running_idx * ({{strides[0]}})) / N_ELEMENTS_PER_THREAD;
+  }
+    """
+)
+
 KERNEL_READ_INPUT_TEMPLATE = jinja2.Template(
     """
   {{read_t}} *{{input_name}} = const_cast<{{read_t}}*>(input{{input_idx}});
@@ -102,14 +194,8 @@
 KERNEL_TEMPLATE = jinja2.Template(
     """
 __global__ void
-{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{index_type}} n_elements) {
-  const int bid = blockIdx.x;
-  const int tid = threadIdx.x;
-  const {{index_type}} idx = bid * FUSED_ELE_THREAD_SIZE + tid;
-  const {{index_type}} idx_elem = idx * N_ELEMENTS_PER_THREAD;
-  if (idx_elem >= n_elements) {
-    return;
-  }
+{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements) {
+  {{compute_idx}}
   {{read_inputs}}
   {{define_outputs}}
 #pragma unroll
@@ -134,6 +220,8 @@
     """
 {{head}}
 
+#include "jagged.h"
+
 namespace {
 
 {{constant}}
@@ -146,7 +234,7 @@
 
 }  // namespace
 
-void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
+void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims_decl}} {{offsets_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
     if (n_elements == 0) {
       return;
     }
@@ -155,6 +243,7 @@
         {{kernel_call_output_params}},
         {{kernel_call_input_params}},
         {{dynamic_dims_call}}
+        {{offsets_call}}
         n_elements
     );
 }
@@ -163,7 +252,7 @@
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
-void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
+void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
     """
 )
 
@@ -171,7 +260,7 @@
     """
 {{indent}}{
     {{indent}}{{index_type}} {{func_name}}_n_elements = {{calculate_n}};
-    {{indent}}invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{func_name}}_n_elements, {{stream}});
+    {{indent}}invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{offsets}} {{func_name}}_n_elements, {{stream}});
 {{indent}}}
     """
 )
@@ -208,6 +297,33 @@ class FusedElementwiseMetaData:
     dynamic_dims: List[IntVar]
     sub_funcs: List[ElementwiseMetaData]
 
+    # this flag specifies if the jagged and mixed inputs need
+    # separate indexing logic within the generated kernel code.
+    # this typically happens when the shape of at least one of
+    # the dense inputs overlaps with one or more jagged dimensions
+    # of the jagged inputs (all jagged inputs are assume to have
+    # the same rank and JaggedIntVar / jagged dimensions).
+    mixed_jagged_dense_indexing: bool = False
+
+    # this attribute is relevant only when mixed_jagged_dense_indexing
+    # is True. it specifies the smallest rectangular volume that fits
+    # all inputs (jagged and dense) and outputs (jagged): i.e., the maximum
+    # rectangular volume that the jagged output Tensor can fit in.
+    # the output_volume list, therefore, can't contain a JaggedIntVar, as
+    # the latter in the jagged output Tensor shape is "expanded" to the
+    # list with `batch_dim` followed by an IntImm for each jagged dim.
+    output_volume: Optional[List[IntVar]] = None
+
+    # this attribute is relevant only when mixed_jagged_dense_indexing
+    # is True. wether the jagged index space implementation (as opposed
+    # to the dense index space implementation) should be use to compute
+    # the dense_idx and jagged_idx separately in the mixed jagged /
+    # dense indexing cases. the dense space indexing runs over the
+    # (dense) output volume and computes jagged_idx from dense_idx.
+    # the jagged space indexing runs over the jagged output shape
+    # and computes the dense_inx from jagged_idx (with binary search).
+    use_jagged_space_indexing: bool = False
+
 
 def gen_function_single_thread(
     fused_func_metadata,
@@ -359,6 +475,11 @@ def _get_sub_func_metadata(
     return (sub_func_metadata, op_t)
 
 
+def _is_jagged_shape(shape: List[IntVar]) -> bool:
+    """Whether the given shape is a shape of a jagged Tensor."""
+    return len(shape) > 0 and isinstance(shape[0], JaggedIntVar)
+
+
 def _get_alignments(
     extended_input_shapes: List[List[IntVar]],
     input_broadcast_sizes: List[int],
@@ -440,13 +561,13 @@ def _get_alignments_and_sizes_and_dtype(
     input_accessors: List[TensorAccessor],
     output_accessors: List[TensorAccessor],
     backend_spec: BackendSpec,
+    mixed_jagged_dense_indexing: bool,
+    output_volume: Optional[List[IntVar]],
 ) -> Tuple[List[int], List[List[IntVar]], str]:
     """
     Returns Tuple(alignments, input_broadcast_sizes, dtype)
     """
-
     # Handle input broadcast.
-    output_shape = output_accessors[0].original_shapes
     dtype = inputs[0]._attrs["dtype"]
 
     # Determine the rightmost broadcast dim among all inputs.
@@ -466,6 +587,21 @@ def _get_alignments_and_sizes_and_dtype(
     extended_input_shapes = []
     for input_accessor in input_accessors:
         input_shape = input_accessor.original_shapes
+
+        if mixed_jagged_dense_indexing:
+            if _is_jagged_shape(input_shape):
+                # broadcast the jagged input shape against the output_shape:
+                # in a mixed jagged / dense op the output_shape is the shape
+                # of the output jagged Tensor
+                output_shape = output_accessors[0].original_shapes
+            else:
+                # broadcast the dense input shape against the output_volume,
+                # as the dense indexing will be done in the output_volume
+                output_shape = output_volume
+        else:
+            # treat all outputs as dense: use output_shape for broadcasting
+            output_shape = output_accessors[0].original_shapes
+
         broadcastable, _ = shape_utils.get_broadcast_max_shape(
             output_shape, input_shape
         )
@@ -491,6 +627,22 @@ def _get_alignments_and_sizes_and_dtype(
                     else:
                         rightmost_broadcast_dim = max(i, rightmost_broadcast_dim)
                     break
+
+        if mixed_jagged_dense_indexing:
+            # in the mixed jagged / dense indexing case, the number of the
+            # rightmost non-broadcated static dimensions of the dense inputs
+            # to be considered for vectorization can't be larger than the
+            # number of the jagged output's inner dimensions (i.e., the
+            # dimensions following the JaggedIntVar). otherwise, there may
+            # be an overlap with the jagged dimensions, in which case the
+            # vectorization can break.
+            jagged_output_shape = output_accessors[0].original_shapes
+            num_inner_dims_in_jagged_shape = len(jagged_output_shape) - 1
+            num_rightmost_non_br_dims = min(
+                num_rightmost_non_br_dims,
+                num_inner_dims_in_jagged_shape,
+            )
+
         extended_input_shapes.append(extended_input_shape)
         num_rightmost_non_broadcast_dims.append(num_rightmost_non_br_dims)
     (alignments, non_broadcast_alignments) = _get_alignments(
@@ -517,7 +669,63 @@ def _get_dynamic_dims(output_accessors: List[TensorAccessor]) -> List[IntVar]:
         for dim in output_accessor.original_shapes:
             if not isinstance(dim, IntImm):
                 res[dim._attrs["name"]] = dim
-    return res.values()
+                if isinstance(dim, JaggedIntVar):
+                    # the batch_dim within the JaggedIntVar may not be present directly
+                    # in other input / output shapes, so we're adding it here separately
+                    batch_dim = dim.batch_dim()
+                    res[batch_dim._attrs["name"]] = batch_dim
+    return list(res.values())
+
+
+def _get_mixed_jagged_dense_config(
+    input_accessors: List[TensorAccessor],
+    output_accessors: List[TensorAccessor],
+) -> Tuple[bool, List[IntVar]]:
+    """
+    Returns Tuple(
+        mixed_jagged_dense_indexing,
+        output_volume,
+        use_jagged_space_indexing,
+    )
+    """
+    # all output shapes are assumed to be the same
+    output_shape = output_accessors[0].original_shapes
+    input_shapes = [acc.original_shapes for acc in input_accessors]
+    jagged_input_shapes = [s for s in input_shapes if _is_jagged_shape(s)]
+    dense_input_shapes = [s for s in input_shapes if not _is_jagged_shape(s)]
+
+    if not jagged_input_shapes or not dense_input_shapes:
+        # there are either only dense inputs or only jagged inputs:
+        # in both cases all inputs will be treated as dense, because
+        # the JaggedIntVars and ranks of all the jagged inputs are
+        # assumed to be the same
+        return False, None, False
+
+    jagged_rank = len(jagged_input_shapes[0])
+    max_dense_rank = max(len(s) for s in dense_input_shapes)
+
+    if max_dense_rank <= jagged_rank - 1:
+        # the longest dense shape does not overlap with the jagged dims:
+        # the jagged inputs can be treated as dense, meaning that the
+        # total_length of the jagged inputs (not overlapping with the
+        # dense inputs' shapes) will be treated as a single dense dim
+        return False, None, False
+
+    jagged_int_var = output_shape[0]
+    jagged_max_dense_prefix_shape = jagged_int_var.get_max_dense_shape()
+    jagged_suffix_shape = output_shape[1:]
+    output_volume = jagged_max_dense_prefix_shape + jagged_suffix_shape
+
+    use_jagged_space_indexing = Target.current()._kwargs.get(
+        "use_jagged_space_indexing", False
+    )
+
+    # because at least one of the dense inputs overlap with the
+    # JaggedIntVar of the jagged inputs, jagged and dense inputs
+    # will need different indexing in the generated kernel.
+    # output_volume is the smallest rectangular volume fitting
+    # all the input (jagged and dense) and outputs (jagged).
+    return True, output_volume, use_jagged_space_indexing
 
 
 def _parse_func_metadata(
@@ -530,8 +738,21 @@ def _parse_func_metadata(
     original_outputs: List[Tensor],
     backend_spec: BackendSpec,
 ) -> FusedElementwiseMetaData:
+    (
+        mixed_jagged_dense_indexing,
+        output_volume,
+        use_jagged_space_indexing,
+    ) = _get_mixed_jagged_dense_config(
+        input_accessors,
+        output_accessors,
+    )
     alignments, input_broadcast_sizes, dtype = _get_alignments_and_sizes_and_dtype(
-        inputs, input_accessors, output_accessors, backend_spec
+        inputs,
+        input_accessors,
+        output_accessors,
+        backend_spec,
+        mixed_jagged_dense_indexing,
+        output_volume,
     )
     max_read_type = backend_spec.get_elementwise_read_backend_type(
         max(alignments), dtype
@@ -564,6 +785,9 @@ def _parse_func_metadata(
         input_broadcast_sizes,
         dynamic_dims,
         sub_func_metadata,
+        mixed_jagged_dense_indexing,
+        output_volume,
+        use_jagged_space_indexing,
     )
 
 
@@ -586,6 +810,7 @@ def _gen_int_var_product_str(
 def _gen_input_broadcast_calculator_str(
     input_shape: List[IntVar],
     output_shape: List[IntVar],
+    mixed_jagged_dense_indexing: bool,
 ) -> str:
     output_num_elements = []
     output_strides = []
@@ -608,11 +833,15 @@ def _gen_input_broadcast_calculator_str(
         output_strides.append([IntImm(1)])
         output_num_elements.append(output_shape[start_idx:])
 
+    index_variable = "dense_idx"
+    if mixed_jagged_dense_indexing and _is_jagged_shape(input_shape):
+        index_variable = "jagged_idx"
+
     res = []
     for output_num_element, output_stride, input_stride in zip(
         output_num_elements, output_strides, input_strides
     ):
-        idx_str = "idx * N_ELEMENTS_PER_THREAD"
+        idx_str = f"{index_variable} * N_ELEMENTS_PER_THREAD"
         res.append(
             "{} % ({}) / ({}) * ({})".format(
                 idx_str,
@@ -628,15 +857,36 @@ def _gen_input_broadcast_calculator_str(
 def _gen_input_broadcast_size_str(
     input_broadcast_sizes: List[List[IntVar]],
     output_shape: List[IntVar],
+    mixed_jagged_dense_indexing: bool,
+    output_volume: Optional[List[IntVar]],
 ) -> List[str]:
     res = []
     for input_broadcast_size in input_broadcast_sizes:
         if input_broadcast_size is None:
             res.append("")
         else:
+            if mixed_jagged_dense_indexing:
+                if _is_jagged_shape(input_broadcast_size):
+                    # broadcast the dense input shape in the jagged
+                    # index space: i.e., against the output_shape
+                    output_broadcast_size = output_shape
+                else:
+                    # broadcast the dense input shape in the dense
+                    # index space: i.e., against the output_volume
+                    output_broadcast_size = output_volume
+            else:
+                # broadcast all input shapes in the dense index space
+                # all inputs are treated as dense ==> output_shape
+                output_broadcast_size = output_shape
+
             res.append(
-                _gen_input_broadcast_calculator_str(input_broadcast_size, output_shape)
+                _gen_input_broadcast_calculator_str(
+                    input_broadcast_size,
+                    output_broadcast_size,
+                    mixed_jagged_dense_indexing,
+                )
             )
+
     return res
 
 
@@ -650,6 +900,56 @@ def _gen_dynamic_dim_str(
     return res
 
 
+def _gen_offsets_str(
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+    has_type: bool,
+    const_ref: bool,
+    name: Optional[str] = None,
+) -> str:
+    offsets = ""
+    if fused_elementwise_metadata.mixed_jagged_dense_indexing:
+        inputs = fused_elementwise_metadata.inputs
+        jagged_input = [t for t in inputs if t.is_jagged()][0]
+        jagged_int_var = jagged_input._attrs["shape"][0]
+        offsets_var_name = jagged_int_var.offsets_var_name()
+        offsets_struct_type = jagged_int_var.offsets_struct_type()
+
+        ref_prefix = "const " if const_ref else ""
+        ref_suffix = "&" if const_ref else ""
+        arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
+        arg_name = name if name is not None else offsets_var_name
+        offsets = f"{arg_type}{arg_name}, "
+
+    return offsets
+
+
+def _gen_num_elements_calculator(
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+) -> str:
+    if fused_elementwise_metadata.mixed_jagged_dense_indexing:
+        if fused_elementwise_metadata.use_jagged_space_indexing:
+            # for the jagged space indexing, the num_elements
+            # is the number of elements in the output jagged Tensor, hence
+            # the usage of the output shape here, not the output volume
+            return _gen_int_var_product_str(
+                fused_elementwise_metadata.output_accessors[0].original_shapes,
+            )
+        else:
+            # for the dense space indexing, the num_elements
+            # is the number of elements in the output volume: the smallest
+            # rectangular volume that fits the output jagged Tensor, hence
+            # the usage of the output volume here, not the output shape
+            return _gen_int_var_product_str(
+                fused_elementwise_metadata.output_volume,
+            )
+    else:
+        # all inputs and outputs are treated as dense:
+        # use the output shape for computing num_elements
+        return _gen_int_var_product_str(
+            fused_elementwise_metadata.output_accessors[0].original_shapes,
+        )
+
+
 def _gen_read_inputs_str(
     fused_elementwise_metadata: FusedElementwiseMetaData,
     broadcast_sizes: List[str],
@@ -662,6 +962,12 @@ def _gen_read_inputs_str(
             broadcast_sizes,
         )
     ):
+        index_variable = "dense_idx"
+        if fused_elementwise_metadata.mixed_jagged_dense_indexing:
+            input_shape = input_accessor.original_shapes
+            if _is_jagged_shape(input_shape):
+                index_variable = "jagged_idx"
+
         input_name = f"input_tmp{input_idx}"
 
         # When broadcasting an input, we are reading a different number of elements
@@ -671,7 +977,7 @@ def _gen_read_inputs_str(
             f"(sizeof({fused_elementwise_metadata.max_read_t}) / sizeof({read_t})))"
         )
         data_idx = (
-            "idx"
+            index_variable
             if not broadcast_size
             else f"({broadcast_size}) / {n_elems_per_thread}"
         )
@@ -696,18 +1002,26 @@ def _gen_read_inputs_str(
     return read_inputs_str
 
 
-def _gen_write_outputs_str(fused_elementwise_metadata: FusedElementwiseMetaData):
+def _gen_write_outputs_str(
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+):
     write_outputs = []
     for output_idx, output_accessor in enumerate(
         fused_elementwise_metadata.output_accessors
     ):
+        index_variable = "dense_idx"
+        if fused_elementwise_metadata.mixed_jagged_dense_indexing:
+            # the output of a mixed jagged / dense
+            # elementwise operation is always jagged
+            index_variable = "jagged_idx"
+
         output_name = f"output{output_idx}"
         get_strided_addr_str = GET_STRIDED_ADDRESS_TEMPLATE.render(
             tensor_accessor=output_accessor,
             data_ptr=output_name,
             data_t=fused_elementwise_metadata.data_t,
             read_t=fused_elementwise_metadata.max_read_t,
-            data_idx="idx",
+            data_idx=index_variable,
         )
         write_out = KERNEL_WRITE_OUTPUT_TEMPLATE.render(
             get_strided_address=get_strided_addr_str,
@@ -719,6 +1033,62 @@ def _gen_write_outputs_str(fused_elementwise_metadata: FusedElementwiseMetaData)
     return write_outputs_str
 
 
+def _get_output_volume_strides(
+    output_volume: List[IntVar],
+) -> List[str]:
+    """
+    Generate the stride expressions for each of the dimensions
+    of the output volume. A stride expression here means the
+    product of all dimensions following the given dimension.
+    The order of the stride expressions in the returned list
+    is the same as of the dimensions of the output volume.
+    """
+    strides = []
+    for dim in reversed(output_volume[1:]):
+        str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
+        if strides:
+            strides.append(f"{strides[-1]} * {str_dim}")
+        else:
+            strides.append(str_dim)
+    strides.reverse()
+    return strides
+
+
+def _gen_compute_idx(
+    index_type: str,
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+) -> str:
+    if fused_elementwise_metadata.mixed_jagged_dense_indexing:
+        # generate the index computation code computing both
+        # dense_idx and jagged_idx, to be used for the dense
+        # and jagged inputs / outptus, respectively
+        inputs = fused_elementwise_metadata.inputs
+        jagged_input = [t for t in inputs if t.is_jagged()][0]
+        jagged_int_var = jagged_input._attrs["shape"][0]
+        num_offsets = len(jagged_int_var.jagged_dims())
+
+        compute_idx_template = (
+            KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE
+            if fused_elementwise_metadata.use_jagged_space_indexing
+            else KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE
+        )
+
+        return compute_idx_template.render(
+            index_type=index_type,
+            num_offsets=num_offsets,
+            strides=_get_output_volume_strides(
+                fused_elementwise_metadata.output_volume,
+            ),
+            offsets_type=jagged_int_var.offsets_type(),
+        )
+    else:
+        # no need for the mixed jagged / dense indexing:
+        # use dense_idx for all inputs and outputs
+        return KERNEL_COMPUTE_IDX_TEMPLATE.render(
+            index_type=index_type,
+        )
+
+
 def _gen_kernel_function(
     func_attrs: Dict[str, Any],
     index_type: str,
@@ -742,9 +1112,16 @@ def _gen_kernel_function(
         ]
     )
 
+    compute_idx_str = _gen_compute_idx(
+        index_type,
+        fused_elementwise_metadata,
+    )
+
     broadcast_sizes = _gen_input_broadcast_size_str(
         fused_elementwise_metadata.input_broadcast_sizes,
         fused_elementwise_metadata.output_accessors[0].original_shapes,
+        fused_elementwise_metadata.mixed_jagged_dense_indexing,
+        fused_elementwise_metadata.output_volume,
     )
     read_inputs_str = _gen_read_inputs_str(fused_elementwise_metadata, broadcast_sizes)
 
@@ -776,8 +1153,19 @@ def _gen_kernel_function(
         output_params=output_params_decl,
         input_params=input_params_decl,
         dynamic_dims=_gen_dynamic_dim_str(
-            index_type, fused_elementwise_metadata.dynamic_dims, has_type=True
+            index_type,
+            fused_elementwise_metadata.dynamic_dims,
+            has_type=True,
         ),
+        offsets=_gen_offsets_str(
+            fused_elementwise_metadata,
+            has_type=True,
+            # the offsets are passed
+            # by value to the kernel
+            const_ref=False,
+            name="offsets",
+        ),
+        compute_idx=compute_idx_str,
         read_inputs=read_inputs_str,
         define_outputs=define_outputs,
         write_outputs=write_outputs_str,
@@ -889,6 +1277,20 @@ def fused_elementwise_gen_function(
             fused_elementwise_metadata.dynamic_dims,
             has_type=False,
         ),
+        offsets_decl=_gen_offsets_str(
+            fused_elementwise_metadata,
+            has_type=True,
+            # the offsets are passed
+            # by const reference to the function
+            const_ref=True,
+            name="offsets",
+        ),
+        offsets_call=_gen_offsets_str(
+            fused_elementwise_metadata,
+            has_type=False,
+            const_ref=False,
+            name="offsets",
+        ),
         kernel_call_output_params=kernel_call_output_params,
         kernel_call_input_params=kernel_call_input_params,
     )
@@ -943,6 +1345,12 @@ def fused_elementwise_gen_function_decl(
             fused_elementwise_metadata.dynamic_dims,
             has_type=True,
         ),
+        offsets=_gen_offsets_str(
+            fused_elementwise_metadata,
+            has_type=True,
+            const_ref=True,
+            name="offsets",
+        ),
     )
     return function_decl
 
@@ -953,6 +1361,7 @@ def fused_elementwise_gen_function_call(
     backend_spec: BackendSpec,
 ):
     """Generates fused_elementwise function call."""
+
     ops = func_attrs["elementwise_ops"]
     inputs = func_attrs["inputs"]
     outputs = func_attrs["outputs"]
@@ -972,18 +1381,15 @@ def fused_elementwise_gen_function_call(
     )
 
     output_params = ",".join([output._attrs["name"] for output in outputs])
-
     input_params = ",".join([input._attrs["name"] for input in inputs])
 
-    num_elements_calculator = _gen_int_var_product_str(
-        output_accessors[0].original_shapes
-    )
-
     return FUNC_CALL_TEMPLATE.render(
         stream=backend_spec.stream,
         func_name=func_attrs["name"],
         index_type=backend_spec.index_type,
-        calculate_n=num_elements_calculator,
+        calculate_n=_gen_num_elements_calculator(
+            fused_elementwise_metadata,
+        ),
         output_params=output_params,
         input_params=input_params,
         dynamic_dims=_gen_dynamic_dim_str(
@@ -991,5 +1397,10 @@ def fused_elementwise_gen_function_call(
             fused_elementwise_metadata.dynamic_dims,
             has_type=False,
         ),
+        offsets=_gen_offsets_str(
+            fused_elementwise_metadata,
+            has_type=False,
+            const_ref=False,
+        ),
         indent=indent,
     )
diff --git a/python/aitemplate/testing/jagged_utils.py b/python/aitemplate/testing/jagged_utils.py
new file mode 100644
index 000000000..e3bac12ed
--- /dev/null
+++ b/python/aitemplate/testing/jagged_utils.py
@@ -0,0 +1,314 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import random
+from itertools import product
+from typing import List, Tuple
+
+import torch
+
+from aitemplate.testing.test_utils import get_torch_full_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype, torch_dtype_to_string
+
+
+def _check_offsets(
+    offsets_list: List[List[int]],
+) -> None:
+    offsets_len = len(offsets_list[0])
+    for offsets in offsets_list:
+        assert offsets[0] == 0
+        assert len(offsets) == offsets_len
+        for j in range(1, len(offsets)):
+            assert offsets[j] >= offsets[j - 1]
+        offsets_len = offsets[-1] + 1
+
+
+def _get_preceding_offset_idx(
+    idx: int,
+    offsets: List[int],
+) -> Tuple[int, int]:
+    result = None
+    left, right = 0, len(offsets) - 1
+    while left <= right:
+        mid = (left + right) // 2
+        offset = offsets[mid]
+        if offset <= idx:
+            result = mid
+            left = mid + 1
+        else:
+            right = mid - 1
+
+    return result, offsets[result]
+
+
+def _jagged_idx_to_dense_idx(
+    jagged_idx: int,
+    offsets_list: List[List[int]],
+) -> List[int]:
+    assert jagged_idx < offsets_list[-1][-1]
+
+    result = []
+    for offsets in reversed(offsets_list):
+        offset_idx, offset = _get_preceding_offset_idx(
+            idx=jagged_idx,
+            offsets=offsets,
+        )
+        result.append(jagged_idx - offset)
+        jagged_idx = offset_idx
+    result.append(jagged_idx)
+
+    return list(reversed(result))
+
+
+def jagged_to_dense(
+    jagged: torch.Tensor,
+    offsets_list: List[torch.Tensor],
+    dense_shape: List[int],
+    padding_value: float = 0.0,
+) -> torch.Tensor:
+    """
+    Convert a jagged Tensor (with the offsets) into a dense Tensor.
+
+    The function converts a jagged Tensor (and the offsets) to
+    a rectangular dense Tensor, using the padding_value at the
+    positions of the resulting dense Tensor where the input
+    jagged Tensor doesn't have elements.
+
+    Parameters
+    ----------
+    jagged : torch.Tensor
+        The jagged Tensor with the shape `[total_length, D1, ..., Dm]`,
+        The first dimension, `total_length`, encodes the `batch_dim` and
+        the jagged dims of the jagged Tensor. The following m dimensions,
+        `D1, ..., Dm`, are regular dense dimensions following the jagged
+        dimensions of the jagged Tensor.
+
+    offsets_list : List[torch.Tensor]
+        A list of rank-1 Tensors, each representing the offsets along one
+        of the jagged dimensions of the jagged Tensor. The number of offsets
+        Tensors in the list must correspond to the number of jagged dimensions
+        encoded in the first `total_length` dimension of `jagged`. The offsets
+        Tensors must be consistent with the offset specification:
+
+            - batch_dim == len(offsets[0]) - 1
+            - offsets[i][-1] == len(offsets[i+1])) - 1
+            - offsets[-1][-1] == total_length
+
+    dense_shape : List[int]
+        The shape of the resulting dense Tensor. The last m dimensions in
+        the `dense_shape` must be equal to `[D1, ..., Dm]` in the jagged
+        Tensor shape. The first dimension must be the `batch_dim`. The
+        following n dimensions must correspond to the n jagged dimensions
+        of the jagged Tensor, with the values equal to the maximum possible
+        values of the jagged dimensions.
+
+    padding_value : float
+        The value to fill the dense Tensor with at the positions where
+        there are no elements in the jagged Tensor. Default: 0.0.
+
+    Returns
+    -------
+    torch.Tensor
+        The dense tensor with the `dense_shape` converted from the
+        `jagged` Tensor, with the `padding_value` at other positions.
+    """
+    assert all(t.dim() == 1 for t in offsets_list)
+    offsets_list = [list(t.cpu().numpy()) for t in offsets_list]
+
+    _check_offsets(offsets_list)
+    assert len(dense_shape) - len(jagged.shape) == len(offsets_list)
+    assert jagged.shape[1:] == tuple(dense_shape[1 + len(offsets_list) :])
+    for i, offsets in enumerate(offsets_list):
+        dense_dim = dense_shape[i + 1]
+        for j in range(1, len(offsets)):
+            assert offsets[j] - offsets[j - 1] <= dense_dim
+
+    dtype = torch_dtype_to_string(jagged.dtype)
+    result = get_torch_full_tensor(
+        shape=dense_shape,
+        fill_value=padding_value,
+        dtype=dtype,
+    )
+
+    total_length = jagged.shape[0]
+    for jagged_idx in range(total_length):
+        dense_idx = _jagged_idx_to_dense_idx(
+            jagged_idx=jagged_idx,
+            offsets_list=offsets_list,
+        )
+        result[tuple(dense_idx)] = jagged[jagged_idx]
+
+    return result
+
+
+def _dense_idx_to_jagged_idx(
+    dense_idx: List[int],
+    offsets_list: List[List[int]],
+) -> int:
+    assert len(dense_idx) == 1 + len(offsets_list)
+
+    offset = 0
+    for i, (d, offsets) in enumerate(zip(dense_idx, offsets_list)):
+        prev_offset, next_offset = offsets[offset + d : offset + d + 2]
+        group_size = next_offset - prev_offset
+        if dense_idx[i + 1] >= group_size:
+            return -1
+        offset = prev_offset
+    offset += dense_idx[-1]
+
+    return offset
+
+
+def dense_to_jagged(
+    dense: torch.Tensor,
+    offsets_list: List[torch.Tensor],
+    padding_value: float = 0.0,
+) -> torch.Tensor:
+    """
+    Convert a dense Tensor into a jagged Tensor (using the offsets).
+
+    The function converts a rectangular dense Tensor to a compactly
+    represented subset of its values: a jagged Tensor, using the offsets.
+    The padding_value is used at the positions of the resulting jagged
+    Tensor where the input dense Tensor doesn't have elements.
+
+    Parameters
+    ----------
+    dense : torch.Tensor
+        A Tensor with the shape `[batch_dim, N1, ..., Nn, D1, ..., Dm]`.
+        The first n+1 dimensions of the dense Tensor are encoded into
+        the first `total_length` dimension of the resulting jagged
+        Tensor, using the specified offsets. Importantly, the values in
+        the dense Tensor outside of what the offsets specify are omitted
+        in the resulting jagged Tensor.
+
+    offsets_list : List[torch.Tensor]
+        A list of rank-1 Tensors, each representing the offsets along one
+        of the jagged dimensions of the jagged Tensor. The number of offsets
+        Tensors in the list must correspond to the number of jagged dimensions
+        encoded in the first `total_length` dimension of the resulting jagged
+        Tensor. The offsets Tensors must be consistent with the offset
+        specification:
+
+            - batch_dim == len(offsets[0]) - 1
+            - offsets[i][-1] == len(offsets[i+1])) - 1
+            - offsets[-1][-1] == total_length
+
+    padding_value : float
+        The value to fill the jagged Tensor with at the positions where
+        there are no elements in the dense Tensor (e.g., the consecutive
+        offset difference is longer than the corresponding N dimension
+        in the dense Tensor input). Default: 0.0.
+
+    Returns
+    -------
+    torch.Tensor
+        The jagged tensor converted from the `dense` Tensor using
+        the offsets, with the `padding_value` at the positions
+        not available in the `dense` Tensor.
+    """
+    assert all(t.dim() == 1 for t in offsets_list)
+    offsets_list = [list(t.cpu().numpy()) for t in offsets_list]
+
+    _check_offsets(offsets_list)
+    assert len(offsets_list) < len(dense.shape)
+
+    total_length = offsets_list[-1][-1]
+    inner_shape = dense.shape[1 + len(offsets_list) :]
+    jagged_shape = [total_length, *inner_shape]
+
+    dtype = torch_dtype_to_string(dense.dtype)
+    result = get_torch_full_tensor(
+        shape=jagged_shape,
+        fill_value=padding_value,
+        dtype=dtype,
+    )
+
+    for dense_idx in product(*[range(d) for d in dense.shape[: 1 + len(offsets_list)]]):
+        jagged_idx = _dense_idx_to_jagged_idx(
+            dense_idx=dense_idx,
+            offsets_list=offsets_list,
+        )
+        if jagged_idx != -1:
+            result[jagged_idx] = dense[tuple(dense_idx)]
+
+    return result
+
+
+def generate_offsets(
+    batch_size: int,
+    max_seq_len: int,
+    load_factor: float,
+    offsets_dtype: str,
+    spread_radius: float = 0.1,
+) -> torch.Tensor:
+    """
+    Generate a rank-1 Tensor of offsets for the given load factor.
+
+    This function generates a single linear offset Tensor for a
+    single jagged dimension in a jagged Tensor with the batch_dim
+    equal to `batch_size` and maximum value along the jagged
+    dimension equal to `max_seq_len`. The `load_factor` in [0, 1]
+    specifies how "full" should the jagged Tensor described by
+    the resulting offsets should be, compared to the corresponding
+    dense Tensor with a rectangular shape [batch_size, max_seq_len,
+    D1, ..., Dm]. The offset differences (== the lengths along the
+    jagged dimensions) are sampled randomly, to arrive close (but not
+    necessarily equal) to the specified `load_factor` in total.
+
+    When sampled out of the [0, N] interval, the offset differences
+    are clamed to stay within the [0, N] interval.
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch_dim of the jagged Tensor specified by the offsets.
+    max_seq_len : int
+        The maximum length along the jagged dimension specified by
+        the offsets.
+    load_factor : float
+        The fraction of the [batch_size, max_seq_len, D1, ..., Dm]-
+        shaped dense Tensor that the total (compactly represented)
+        jagged Tensor data should correspond to.
+    offsets_dtype : str
+        The type of the resulting offsets Tensor.
+    spread_radius : float
+        The radius of the spread around int(max_seq_len * load_factor)
+        that the offset differences should be randomly sampled from.
+        Default: 0.1.
+
+    Returns
+    -------
+    torch.Tensor
+        The resulting rank-1 Tensor of offsets.
+    """
+    assert 0 <= load_factor <= 1
+    assert 0 <= spread_radius <= 1
+
+    if load_factor < 1:
+        spread = int(max_seq_len * spread_radius)
+        mean = int(max_seq_len * load_factor)
+        lengths = [
+            mean + random.randint(-spread, spread + 1) for _ in range(batch_size)
+        ]
+        lengths = [max(min(L, max_seq_len), 0) for L in lengths]
+    else:
+        lengths = [max_seq_len] * batch_size
+
+    offsets = [0]
+    for length in lengths:
+        offsets.append(offsets[-1] + length)
+
+    torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+    return torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
diff --git a/tests/unittest/ops/test_jagged_elementwise.py b/tests/unittest/ops/test_jagged_elementwise.py
new file mode 100644
index 000000000..42e50eeda
--- /dev/null
+++ b/tests/unittest/ops/test_jagged_elementwise.py
@@ -0,0 +1,591 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import json
+import random
+import tempfile
+import unittest
+from typing import List
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import JaggedDim
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.jagged_utils import (
+    dense_to_jagged,
+    generate_offsets,
+    jagged_to_dense,
+)
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+def _add_jagged_dense_ref(
+    jagged: torch.Tensor,
+    offsets_list: List[torch.Tensor],
+    dense: torch.Tensor,
+    jagged_max_shape: List[int] = None,
+) -> torch.Tensor:
+    """The reference function for jagged / dense elementwise add."""
+    if jagged_max_shape is None:
+        jagged_max_shape = dense.shape
+
+    assert len(jagged.shape) + len(offsets_list) >= len(dense.shape)
+    assert len(jagged_max_shape) == len(jagged.shape) + len(offsets_list)
+
+    return dense_to_jagged(
+        dense=(
+            dense
+            + jagged_to_dense(
+                jagged=jagged,
+                offsets_list=offsets_list,
+                dense_shape=jagged_max_shape,
+                padding_value=0.0,
+            )
+        ),
+        offsets_list=offsets_list,
+        padding_value=-1.0,
+    )
+
+
+class JaggedElementwiseTestCase(unittest.TestCase):
+    def _test_jagged_dense_elementwise_add(
+        self,
+        jagged_max_shape: List[int],
+        offsets_list: List[List[int]],
+        dense_shape: List[int],
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        use_jagged_space_indexing: bool = False,
+        test_suffix: str = "",
+    ):
+        batch_size = jagged_max_shape[0]
+        batch_dim = IntVar(values=[1, batch_size * 2], name="batch_size")
+
+        jagged_dims_max_values = jagged_max_shape[1 : 1 + len(offsets_list)]
+        jagged_dims = [
+            JaggedDim(min_value=0, max_value=max_value)
+            for max_value in jagged_dims_max_values
+        ]
+
+        total_length = offsets_list[-1][-1]
+        total_length_dim = IntVar(values=[1, total_length * 2], name="total_length")
+
+        jagged_inner_shape = jagged_max_shape[1 + len(offsets_list) :]
+        jagged_inner_dims = [IntImm(dim) for dim in jagged_inner_shape]
+        jagged_input_shape = [total_length] + jagged_inner_shape
+
+        offsets_dims = [
+            IntVar(values=[2, len(offsets) * 2]) for offsets in offsets_list
+        ]
+
+        assert len(dense_shape) <= len(jagged_max_shape)
+        dense_dims = [IntImm(dim) for dim in dense_shape]
+        if len(dense_shape) == len(jagged_max_shape):
+            assert dense_shape[0] == jagged_max_shape[0]
+            dense_dims[0] = batch_dim
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                *jagged_inner_dims,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name=f"offsets{i}",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+            for i, offsets_dim in enumerate(offsets_dims)
+        ]
+        DENSE = Tensor(
+            shape=dense_dims,
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=jagged_dims,
+        )(SOURCE, OFFSETS_LIST)
+
+        RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, DENSE)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        assert not SOURCE.is_jagged()
+        assert not DENSE.is_jagged()
+        assert JAGGED.is_jagged()
+        assert RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
+            "./tmp",
+            f"test_jagged_dense_elementwise_add_{test_suffix}",
+        )
+
+        torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = {
+            f"offsets{i}": torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+            for i, offsets in enumerate(offsets_list)
+        }
+        source_pt = get_random_torch_tensor(jagged_input_shape, dtype)
+        dense_pt = get_random_torch_tensor(dense_shape, dtype)
+        result_pt = _add_jagged_dense_ref(
+            jagged=source_pt,
+            offsets_list=list(offsets_pt.values()),
+            jagged_max_shape=jagged_max_shape,
+            dense=dense_pt,
+        )
+        result = torch.empty_like(result_pt)
+
+        inputs = {"source": source_pt, "dense": dense_pt, **offsets_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt)
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [4, 3, 4], [4, 3, 4]),
+            param(2, "int32", [4, 3, 2], [4, 3, 1]),
+            param(3, "int32", [4, 3, 1], [4, 3, 2]),
+            param(4, "int32", [4, 3, 2], [4, 1, 1]),
+            param(5, "int32", [4, 3, 2], [3, 1]),
+            param(6, "int64", [4, 3, 1], [2]),
+            param(7, "int64", [4, 3, 5, 6, 8], [4, 3, 5, 6, 8]),
+            param(8, "int64", [4, 3, 1, 6, 1], [4, 3, 5, 1, 8]),
+            param(9, "int64", [4, 3, 1, 6, 1], [4, 1, 1, 1, 1]),
+            param(10, "int64", [4, 3, 1, 1, 2], [3, 5, 6, 2]),
+        ]
+    )
+    def test_jagged_dense_elementise_add_single_offsets_fp16(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dense_shape,
+    ):
+        for use_jagged_space_indexing in [False, True]:
+            self._test_jagged_dense_elementwise_add(
+                jagged_max_shape=jagged_max_shape,
+                offsets_list=[[0, 1, 4, 6, 7]],
+                dense_shape=dense_shape,
+                dtype="float16",
+                offsets_dtype=offsets_dtype,
+                use_jagged_space_indexing=use_jagged_space_indexing,
+                test_suffix=f"single_offsets_fp16_{i}_{use_jagged_space_indexing}",
+            )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [3, 4, 5, 150, 3, 4], [3, 4, 5, 150, 3, 4]),
+            param(2, "int32", [3, 4, 5, 150, 1, 4], [3, 4, 5, 150, 3, 1]),
+            param(3, "int32", [3, 4, 5, 150, 3, 4], [1]),
+            param(4, "int64", [3, 4, 5, 150, 1, 1], [150, 3, 4]),
+            param(5, "int64", [3, 4, 5, 150, 3, 4], [3, 1, 1, 1, 1, 1]),
+        ]
+    )
+    def test_jagged_dense_elementise_add_multiple_offsets_fp16(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dense_shape,
+    ):
+        for use_jagged_space_indexing in [False, True]:
+            self._test_jagged_dense_elementwise_add(
+                jagged_max_shape=jagged_max_shape,
+                offsets_list=[
+                    [0, 1, 3, 5],
+                    [0, 2, 4, 7, 9, 10],
+                    [0, 6, 8, 19, 23, 45, 67, 98, 123, 256, 321],
+                ],
+                dense_shape=dense_shape,
+                dtype="float16",
+                offsets_dtype=offsets_dtype,
+                use_jagged_space_indexing=use_jagged_space_indexing,
+                test_suffix=f"multiple_offsets_fp16_{i}_{use_jagged_space_indexing}",
+            )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [4, 3, 2], [4, 3, 2]),
+            param(2, "int64", [4, 3, 5, 6, 7], [4, 3, 5, 6, 7]),
+            param(3, "int64", [4, 3, 1, 1, 1], [3, 5, 6, 7]),
+        ]
+    )
+    def test_jagged_dense_elementise_add_single_offsets_fp32(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dense_shape,
+    ):
+        self._test_jagged_dense_elementwise_add(
+            jagged_max_shape=jagged_max_shape,
+            offsets_list=[[0, 1, 4, 6, 7]],
+            dense_shape=dense_shape,
+            dtype="float32",
+            offsets_dtype=offsets_dtype,
+            use_jagged_space_indexing=False,
+            test_suffix=f"single_offsets_fp32_{i}",
+        )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [3, 4, 5, 150, 3, 4], [3, 4, 5, 150, 3, 4]),
+            param(2, "int64", [3, 4, 5, 150, 1, 1], [150, 3, 4]),
+        ]
+    )
+    def test_jagged_dense_elementise_add_multiple_offsets_fp32(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dense_shape,
+    ):
+        self._test_jagged_dense_elementwise_add(
+            jagged_max_shape=jagged_max_shape,
+            offsets_list=[
+                [0, 1, 3, 5],
+                [0, 2, 4, 7, 9, 10],
+                [0, 6, 8, 19, 23, 45, 67, 98, 123, 256, 321],
+            ],
+            dense_shape=dense_shape,
+            dtype="float32",
+            offsets_dtype=offsets_dtype,
+            use_jagged_space_indexing=False,
+            test_suffix=f"multiple_offsets_fp32_{i}",
+        )
+
+    def _test_jagged_jagged_elementwise_add(
+        self,
+        jagged_max_prefix_shape: List[int],
+        jagged1_inner_shape: List[int],
+        jagged2_inner_shape: List[int],
+        offsets_list: List[List[int]],
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        test_suffix: str = "",
+    ):
+        assert len(jagged1_inner_shape) == len(jagged2_inner_shape)
+
+        batch_size = jagged_max_prefix_shape[0]
+        batch_dim = IntVar(values=[1, batch_size * 2], name="batch_size")
+
+        jagged_dims_max_values = jagged_max_prefix_shape[1 : 1 + len(offsets_list)]
+        jagged_dims = [
+            JaggedDim(min_value=0, max_value=max_value)
+            for max_value in jagged_dims_max_values
+        ]
+
+        total_length = offsets_list[-1][-1]
+        total_length_dim = IntVar(values=[1, total_length * 2], name="total_length")
+
+        jagged1_inner_dims = [IntImm(dim) for dim in jagged1_inner_shape]
+        jagged1_input_shape = [total_length] + jagged1_inner_shape
+        jagged2_inner_dims = [IntImm(dim) for dim in jagged2_inner_shape]
+        jagged2_input_shape = [total_length] + jagged2_inner_shape
+
+        offsets_dims = [
+            IntVar(values=[2, len(offsets) * 2]) for offsets in offsets_list
+        ]
+
+        SOURCE1 = Tensor(
+            shape=[
+                total_length_dim,
+                *jagged1_inner_dims,
+            ],
+            name="source1",
+            dtype=dtype,
+            is_input=True,
+        )
+        SOURCE2 = Tensor(
+            shape=[
+                total_length_dim,
+                *jagged2_inner_dims,
+            ],
+            name="source2",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name=f"offsets{i}",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+            for i, offsets_dim in enumerate(offsets_dims)
+        ]
+
+        JAGGED1 = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=jagged_dims,
+        )(SOURCE1, OFFSETS_LIST)
+        JAGGED2 = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=jagged_dims,
+        )(SOURCE2, OFFSETS_LIST)
+
+        RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED1, JAGGED2)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        assert not SOURCE1.is_jagged()
+        assert not SOURCE2.is_jagged()
+        assert JAGGED1.is_jagged()
+        assert JAGGED2.is_jagged()
+        assert RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            f"test_jagged_jagged_elementwise_add_{test_suffix}",
+        )
+
+        torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = {
+            f"offsets{i}": torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+            for i, offsets in enumerate(offsets_list)
+        }
+        source1_pt = get_random_torch_tensor(jagged1_input_shape, dtype)
+        source2_pt = get_random_torch_tensor(jagged2_input_shape, dtype)
+        result_pt = source1_pt + source2_pt  # jagged inputs are treated as dense
+        result = torch.empty_like(result_pt)
+
+        inputs = {"source1": source1_pt, "source2": source2_pt, **offsets_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt)
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [4, 3], [5], [5]),
+            param(2, "int32", [4, 3], [5], [1]),
+            param(3, "int64", [4, 3], [1], [5]),
+            param(4, "int64", [4, 3], [5, 1, 7], [1, 6, 1]),
+        ]
+    )
+    def test_jagged_jagged_elementise_add_single_offsets_fp16(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_prefix_shape,
+        jagged1_inner_shape,
+        jagged2_inner_shape,
+    ):
+        self._test_jagged_jagged_elementwise_add(
+            jagged_max_prefix_shape=jagged_max_prefix_shape,
+            jagged1_inner_shape=jagged1_inner_shape,
+            jagged2_inner_shape=jagged2_inner_shape,
+            offsets_list=[[0, 1, 4, 6, 7]],
+            dtype="float16",
+            offsets_dtype=offsets_dtype,
+            test_suffix=f"single_offsets_fp16_{i}",
+        )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [3, 4, 5, 200], [10], [10]),
+            param(2, "int32", [3, 4, 5, 200], [1, 2], [2, 1]),
+            param(3, "int64", [3, 4, 5, 150], [6, 7, 8], [6, 7, 8]),
+            param(4, "int64", [3, 4, 5, 150], [6, 1, 8], [1, 7, 1]),
+        ]
+    )
+    def test_jagged_jagged_elementise_add_multiple_offsets_fp16(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_prefix_shape,
+        jagged1_inner_shape,
+        jagged2_inner_shape,
+    ):
+        self._test_jagged_jagged_elementwise_add(
+            jagged_max_prefix_shape=jagged_max_prefix_shape,
+            jagged1_inner_shape=jagged1_inner_shape,
+            jagged2_inner_shape=jagged2_inner_shape,
+            offsets_list=[
+                [0, 1, 3, 5],
+                [0, 2, 4, 7, 9, 10],
+                [0, 6, 8, 19, 23, 45, 67, 98, 123, 256, 321],
+            ],
+            dtype="float16",
+            offsets_dtype=offsets_dtype,
+            test_suffix=f"multiple_offsets_fp16_{i}",
+        )
+
+    def _benchmark_jagged_dense_elementwise_add(
+        self,
+        B: int,
+        N: int,
+        D: int,
+        num_dense_inputs: int,
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        use_jagged_space_indexing: bool = False,
+        test_suffix: str = "",
+        num_iters: int = 1000,
+    ):
+        batch_dim = IntVar(values=[1, B], name="batch_size")
+        jagged_dim = JaggedDim(min_value=0, max_value=N)
+        total_length_dim = IntVar(values=[1, B * N], name="total_length")
+        sequence_dim = IntImm(value=N, name="sequence_dim")
+        embedding_dim = IntImm(value=D, name="embedding_dim")
+        offsets_dim = IntVar(values=[2, B + 1], name="offsets_dim")
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                embedding_dim,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        DENSE_INPUTS = [
+            Tensor(
+                shape=[
+                    batch_dim,
+                    sequence_dim,
+                    embedding_dim,
+                ],
+                name=f"dense_{i}",
+                dtype=dtype,
+                is_input=True,
+            )
+            for i in range(num_dense_inputs)
+        ]
+
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[jagged_dim],
+        )(SOURCE, OFFSETS_LIST)
+
+        RESULT = JAGGED
+        for DENSE in DENSE_INPUTS:
+            RESULT = ops.elementwise(FuncEnum.ADD)(RESULT, DENSE)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        model = compile_model(
+            [RESULT],
+            detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
+            "./tmp",
+            f"benchmark_jagged_dense_elementwise_add_{test_suffix}",
+        )
+
+        random.seed(0)
+        load_factors = [i / 20 for i in range(1, 21)]
+        offset_tensors = [
+            generate_offsets(
+                batch_size=B,
+                max_seq_len=N,
+                load_factor=load_factor,
+                offsets_dtype=offsets_dtype,
+            )
+            for load_factor in load_factors
+        ]
+
+        results = []
+        for load_factor, offsets_pt in zip(load_factors, offset_tensors):
+            total_length = offsets_pt[-1].item()
+            dense_inputs_pt = {
+                f"dense_{i}": get_random_torch_tensor([B, N, D], dtype)
+                for i in range(num_dense_inputs)
+            }
+            source_pt = get_random_torch_tensor([total_length, D], dtype)
+            inputs = {"source": source_pt, **dense_inputs_pt, "offsets": offsets_pt}
+            outputs = [torch.empty_like(source_pt)]
+
+            with tempfile.NamedTemporaryFile("r") as f:
+                model.profile_with_tensors(
+                    inputs=inputs,
+                    outputs=outputs,
+                    num_iters=num_iters,
+                    filename=f.name,
+                )
+                profiling_data = json.loads(f.read())
+                fused_elementwise_records = [
+                    profiling_data[func_name]
+                    for func_name in profiling_data
+                    if func_name.startswith("fused_elementwise")
+                ]
+                assert len(fused_elementwise_records) == 1
+                runtime_ms = fused_elementwise_records[0]["ms_per_iter"]
+
+            items = total_length * D  # total items to read / write: the jagged volume
+            size = 2 if dtype == "float16" else 4  # size of individual data value
+            io_num = num_dense_inputs + 2  # num_dense_inputs + 1 inputs, 1 output
+            bandwidth = io_num * items * size / (runtime_ms * 1e-3 * 1e9)  # GB/s
+            results.append([load_factor, runtime_ms, bandwidth])
+
+        print()
+        print(f"{B=}, {N=}, {D=}, {num_dense_inputs=}, {dtype=}:")
+        print()
+
+        for load_factor, runtime_ms, bandwidth in results:
+            print(
+                f"load factor: {int(load_factor * 100)}%, "
+                f"runtime: {round(runtime_ms, 6)} ms, "
+                f"bandwidth: {round(bandwidth, 3)} GB/s"
+            )
+
+    def _test_benchmark_jagged_dense_elementise_add(self):
+        # ESUHM use case: "jagged + dense + dense = jagged",
+        # with dtype=float16; https://fburl.com/code/1e9z83fb
+        self._benchmark_jagged_dense_elementwise_add(
+            B=1024,
+            N=260,
+            D=256,
+            num_dense_inputs=2,
+            dtype="float16",
+            offsets_dtype="int32",
+            use_jagged_space_indexing=False,
+            test_suffix="esuhm",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 4e6e41645df42c8c0f29958e78e9083c1f4e2883 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Thu, 2 Mar 2023 01:37:00 -0800
Subject: [PATCH 206/638] Added permute ops before and after group_norm (#351)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/351

Similar to what we have done for conv/pooling/batch_norm ops which
take different tensor layout from pytorch, we need to do adjust
layouts for group_norm, too.

Reviewed By: yinghai

Differential Revision: D43726397

fbshipit-source-id: 87643682cd05b548b163ca31ed34b4040b06b57b
---
 fx2ait/fx2ait/converters/ait_converters.py           | 4 +++-
 fx2ait/fx2ait/test/converters/test_ait_group_norm.py | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 22a50345a..dcfdcc706 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -887,6 +887,7 @@ def acc_ops_group_norm(
     name: str,
 ) -> ConverterOutput:
     input_val = kwargs["input"]
+    input_val = ait_nchw2nhwc(kwargs["input"])
     num_groups = kwargs["num_groups"]
     weight_val = kwargs["weight"]
     bias_val = kwargs["bias"]
@@ -895,7 +896,8 @@ def acc_ops_group_norm(
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     num_channels = input_val.shape()[-1].value()
     op = group_norm(num_groups, num_channels)
-    return op(input_val, weight_val, bias_val, eps_val)
+    result = op(input_val, weight_val, bias_val, eps_val)
+    return ait_nhwc2nchw(result)
 
 
 @ait_converter(acc_ops.layer_norm)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_group_norm.py b/fx2ait/fx2ait/test/converters/test_ait_group_norm.py
index 783cb28a5..0b3534444 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_group_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_group_norm.py
@@ -40,6 +40,4 @@ def forward(self, x):
             mod,
             inputs,
             expected_ops={},
-            permute_inputs=[0, 2, 3, 1],
-            permute_outputs=[0, 3, 1, 2],
         )

From 9fdf6f2cd380209dbaa6c26c358d9949f2a899dd Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 2 Mar 2023 02:47:50 -0800
Subject: [PATCH 207/638] Use first dim as batch size if same in every input
 (#349)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/349

If all input tensors have the same shape (in particular, the same first batch dimension), the current frequency calculation code can pick arbitrary dimension as `batch_size`.

This diff adds a special handling of the case when all inputs' first dimensions are the same. This guarantees that the first dimension is the most frequent one (or at least one of the most frequent ones). In such a case it is probably reasonable to declare the first input dimension as `batch_size` (because it typically is).

Reviewed By: amateurcoffee, tissue3

Differential Revision: D43716722

fbshipit-source-id: 33d2f21301afe5c779f7cfcbd5267324cfa1becd
---
 fx2ait/fx2ait/tensor_spec.py           | 15 +++++++++++++--
 fx2ait/fx2ait/test/test_tensor_spec.py |  8 ++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index 5d349aaa9..7d68f6c72 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -247,16 +247,27 @@ def find_batch_size_dim(cls, inputs: Any) -> []:
             return [0]
         shapes = [i.shape for i in inputs]
         frequency_map = {}
+        first_dims = set()
         for shape in shapes:
             if len(shape) < 2:
                 # By pass for rank-1 tensors. MRS model has rank-1 tensor carry no batch_size info
                 continue
             # Dedup shape value for single tensor
+            first_dims.add(shape[0])
             shape = set(shape)
             for i in shape:
                 frequency_map[i] = frequency_map.get(i, 0) + 1
-        sorted_frequency = sorted(frequency_map.items(), key=lambda x: -x[1])
-        batch_size = sorted_frequency[0][0]
+
+        if len(first_dims) == 1:
+            # first dim is the same in every input: we use it as batch_size
+            batch_size = first_dims.pop()
+        elif frequency_map:
+            # first dims are different: we use the most frequent dim as batch_size
+            sorted_frequency = sorted(frequency_map.items(), key=lambda x: -x[1])
+            batch_size = sorted_frequency[0][0]
+        else:
+            # no dims to sort: no batch_size
+            batch_size = -1
 
         bs_dim = []
         for i in inputs:
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index e1838b025..340e91005 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -69,6 +69,14 @@ def test_two_input_lists(self):
                     ([4, 10, 9], torch.float16),
                 ],
             ),
+            (
+                "same_shapes",
+                [
+                    ([10, 3, 40, 5], torch.float16),
+                    ([10, 3, 40, 5], torch.float16),
+                    ([10, 3, 40, 5], torch.float32),
+                ],
+            ),
         ]
     )
     def test_input_list_with_batch_size(self, _, settings):

From 387d45045ab21aecc2e83f06a662a3e7f047c607 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Thu, 2 Mar 2023 13:07:29 -0800
Subject: [PATCH 208/638] Add an env to guard collecting compilation time
 breakdowns (#347)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/347

ATT, to fix issue https://github.com/facebookincubator/AITemplate/issues/329.

Reviewed By: houseroad

Differential Revision: D43687047

fbshipit-source-id: 8ea06a812ede0430181b156c8f2bb1d9edffd4b3
---
 python/aitemplate/backend/builder.py | 9 +++++++--
 python/aitemplate/utils/environ.py   | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 64da0f252..bef7346ef 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -32,6 +32,8 @@
 
 import jinja2
 
+from aitemplate.utils import environ
+
 from aitemplate.utils.debug_settings import AITDebugSettings
 
 from ..utils.misc import is_debug
@@ -54,7 +56,11 @@ def _augment_for_trace(cmd):
 
 
 def _time_cmd(cmd):
-    return f"exec time -f 'exit_status=%x elapsed_sec=%e argv=\"%C\"' {cmd}"
+    return (
+        f"exec time -f 'exit_status=%x elapsed_sec=%e argv=\"%C\"' {cmd}"
+        if environ.time_compilation()
+        else cmd
+    )
 
 
 def _log_error_context(
@@ -368,7 +374,6 @@ def build_so(self, target: Target, objs: list[str]):
         self._runner.pull()
 
     def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings):
-
         makefile_template = jinja2.Template(
             """
 CC = {{cc}}
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 55cf8e739..24165ac46 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -49,3 +49,12 @@ def force_profiler_cache() -> bool:
         ), "cannot specify both AIT_FORCE_PROFILER_CACHE and FORCE_PROFILE"
     _LOGGER.info(f"{force_cache=}")
     return force_cache
+
+
+def time_compilation() -> bool:
+    """
+    When enabled, time each make command at compilation time.
+    This helps us doing compilation time analysis.
+    Requires to install "time".
+    """
+    return os.getenv("AIT_TIME_COMPILATION", "0") == "1"

From 324f7ef4c972bfc28004db2552d01e87476f1e6f Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 2 Mar 2023 15:50:01 -0800
Subject: [PATCH 209/638] Add AIT_TIME_COMPILATION description to env.rst
 (#356)

Summary:
Add `AIT_TIME_COMPILATION` description to [env.rst](https://facebookincubator.github.io/AITemplate/reference/env.html)

Follow up change for https://github.com/facebookincubator/AITemplate/pull/347

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/356

Reviewed By: alexanderguzhva

Differential Revision: D43752096

Pulled By: tenpercent

fbshipit-source-id: f521248d661b87ce82d954a40a17a333ffbcc5b2
---
 docs/source/reference/env.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 9e9f7769a..86fcf78ac 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -14,6 +14,8 @@ Codegen
 
 **AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler perform time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default.
 
+**AIT_TIME_COMPILATION**: If set to "1", time each make command at the compilation time. This helps us to do compilation time analysis. Requires to install `time <https://man7.org/linux/man-pages/man1/time.1.html>`_ package.
+
 Profiling
 ---------
 

From 81b2cc1aba783095a84c3446ac6ceb465e6498d3 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 2 Mar 2023 22:00:46 -0800
Subject: [PATCH 210/638] Fix code in py_design.rst (#352)

Summary:
Fixes:
-  fix typo `"output1": out0_ait` -> `"output1": out1_ait`
- `outputs` array is created based on `len(input_name_to_idx)`. but it should use `len(output_name_to_idx)` instead.
- simpler code to create fixed size array

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/352

Reviewed By: tenpercent, houseroad

Differential Revision: D43756996

Pulled By: muchulee8

fbshipit-source-id: 9e8dcea10d9a22c161a2a497823725d653e67350
---
 docs/source/runtime/py_design.rst | 6 +++---
 static/README.md                  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/runtime/py_design.rst b/docs/source/runtime/py_design.rst
index 5c9d630e0..55093b8df 100644
--- a/docs/source/runtime/py_design.rst
+++ b/docs/source/runtime/py_design.rst
@@ -37,7 +37,7 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al
     # Arguments as a dictionary
     module.run(
       {"input0": in0_ait, "input1": in1_ait},
-      {"output0": out0_ait, "output1": out0_ait},
+      {"output0": out0_ait, "output1": out1_ait},
     )
 
     # Arguments as an ordered list. Note that you might need to query
@@ -45,8 +45,8 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al
     input_name_to_idx = module.get_input_name_to_index_map()
     output_name_to_idx = module.get_output_name_to_index_map()
 
-    inputs = [None for i in range(len(input_name_to_idx))]
-    outputs = [None for i in range(len(input_name_to_idx))]
+    inputs = [None] * len(input_name_to_idx)
+    outputs = [None] * len(output_name_to_idx)
 
     for name in input_name_to_idx:
       inputs[input_name_to_idx[name]] = ait_inputs[name]
diff --git a/static/README.md b/static/README.md
index 97c3f1b81..d1ffafaaa 100644
--- a/static/README.md
+++ b/static/README.md
@@ -55,7 +55,7 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al
 # Arguments as a dictionary
 module.run(
   {"input0": in0_ait, "input1": in1_ait},
-  {"output0": out0_ait, "output1": out0_ait},
+  {"output0": out0_ait, "output1": out1_ait},
 )
 
 # Arguments as an ordered list. Note that you might need to query
@@ -63,8 +63,8 @@ module.run(
 input_name_to_idx = module.get_input_name_to_index_map()
 output_name_to_idx = module.get_output_name_to_index_map()
 
-inputs = [None for i in range(len(input_name_to_idx))]
-outputs = [None for i in range(len(input_name_to_idx))]
+inputs = [None] * len(input_name_to_idx)
+outputs = [None] * len(output_name_to_idx)
 
 for name in input_name_to_idx:
   inputs[input_name_to_idx[name]] = ait_inputs[name]

From 35ec3e6cf66307820b8cfa8b506f2e39d8abb5aa Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 3 Mar 2023 09:48:26 -0800
Subject: [PATCH 211/638] fix profiler timeout message (#358)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/358

as titled

Reviewed By: chenyang78

Differential Revision: D43762214

fbshipit-source-id: 4f5457ac90f53e98ffcf4bdf3b0b88459c8f2d4b
---
 python/aitemplate/backend/profiler_runner.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 8456f7a05..612c514a1 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -261,6 +261,7 @@ def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 3
         self._timeout = timeout
         self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(devices))
         self._futures = []
+        self._cmds = []
         self._postprocessing_delegate = postprocessing_delegate
         try:
             target = Target.current()
@@ -311,6 +312,7 @@ def callback_when_done(fut):
 
         future.add_done_callback(callback_when_done)
         self._futures.append(future)
+        self._cmds.append(cmds)
 
     def join(self):
         """
@@ -318,7 +320,19 @@ def join(self):
         """
         done, not_done = concurrent.futures.wait(self._futures, self._timeout)
         for f in not_done:
+            # attempts cancelling, will fail if call is being executed or has finished
             f.cancel()
-        for _ in self._futures:
+        cancelled_cmds = [
+            cmd for cmd, f in zip(self._cmds, self._futures) if f.cancelled()
+        ]
+        if cancelled_cmds:
+            raise RuntimeError(
+                f"Profiler timed out after {self._timeout} sec. "
+                "Try increasing the timeout. "
+                f"Cancelled profilers: {cancelled_cmds}"
+            )
+        for _ in [f for f in self._futures if f.done() or f.running()]:
+            # sync point between futures and queue.
+            # wait for callbacks to finish
             self._done_queue.get(timeout=self._timeout)
         self._postprocessing_delegate.postprocess_results()

From c12674d70fc376c6e2c3032c150d6ef0cc54bffe Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Fri, 3 Mar 2023 11:31:51 -0800
Subject: [PATCH 212/638] resort to the old detect_target approach if pycuda is
 not present (#360)

Summary:
In some dev environment with constraints, installing pycuda is not feasible.

This PR enables us to first check if pycuda is available. If not, we go back with the old approach to detecting the target.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/360

Reviewed By: tenpercent

Differential Revision: D43775402

Pulled By: chenyang78

fbshipit-source-id: d9ab747110e41cba8f993ede83a6794db34bfb66
---
 python/aitemplate/testing/detect_target.py | 25 ++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index df8d1159a..1031d6386 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -30,6 +30,28 @@
 FLAG = ""
 
 
+def _detect_cuda_with_nvidia_smi():
+    try:
+        proc = Popen(
+            ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv"],
+            stdout=PIPE,
+            stderr=PIPE,
+        )
+        stdout, stderr = proc.communicate()
+        stdout = stdout.decode("utf-8")
+        if "H100" in stdout:
+            return "90"
+        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30", "RTX 40"]):
+            return "80"
+        if "V100" in stdout:
+            return "70"
+        if "T4" in stdout:
+            return "75"
+        return None
+    except Exception:
+        return None
+
+
 def _detect_cuda():
     try:
         import pycuda.driver as drv
@@ -47,6 +69,9 @@ def _detect_cuda():
             return "70"
         else:
             return None
+    except ImportError:
+        # go back to old way to detect the CUDA arch
+        return _detect_cuda_with_nvidia_smi()
     except Exception:
         return None
 

From 0a5b3b635dbcc4bf21c2566accfdc9ec022c6d9a Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Fri, 3 Mar 2023 13:06:41 -0800
Subject: [PATCH 213/638] Implement expand op (#312)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/312

### Implement expand operator CUDA backend

Adding CUDA backend implementation for expand: https://fburl.com/code/nb2mcsmg.

The operator semantic should be the same as the pytorch version https://fburl.com/fljywh6p.

#### Implementation

The previous expand operator was a no-op version, which only worked under very limited conditions, namely when it expanded just a single, already existing direction, and could be merged into a following elementwise op that supports tensor broadcasting.

This new version actually expands the tensor, supporting multiple expansion dimensions, dynamic shapes and adding dimensions, just like the pytorch version. There are three CUDA kernels implemented, one dealing with the general case, and two which are specialized in order to be faster in certain scenarios.

The pytorch version is in principle more effective, nevertheless, because in pytorch it just needs to create a new view on a source tensor with different read strides. As AIT has no general notion of strides for tensor dimensions, this is not a real option at the moment, unless we add that support to tensors and operators on them.

#### Further possible optimizations (not part of this PR )

 * When adding leading dimensions, this can be decomposed into writing an upper part of the tensor ( requiring strided reads or writes ) and then repeatedly copying that tensor ( which can be accomplished using effective sequential reads and writes and can utilize shared memory )
 * Further operator fusions should be possible
 * With all immutable dimensions, a more efficient implementation would be possible via loop unrolling and precalculation of strides etc.

Reviewed By: chenyang78

Differential Revision: D43419041

fbshipit-source-id: 84ec2c4716c3e21860d1d55807cf649ed543ba2e
---
 python/aitemplate/backend/backend_spec.py     |   1 +
 .../aitemplate/backend/cuda/tensor/expand.py  | 401 +++++++++++++++++-
 .../cuda/tensor/expand_static_shape.py        | 394 +++++++++++++++++
 .../aitemplate/backend/cuda/tensor/repeat.cuh | 161 +++++++
 .../aitemplate/compiler/ops/tensor/expand.py  | 105 +++--
 .../compiler/transform/remove_no_ops.py       |  16 +-
 tests/unittest/compiler/test_fuse_expand.py   |   1 -
 tests/unittest/ops/test_expand.py             | 249 +++++++++++
 8 files changed, 1288 insertions(+), 40 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/tensor/expand_static_shape.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/repeat.cuh

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 1298a6234..fb9394b8b 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -57,6 +57,7 @@ class GPUBackendSpec(BackendSpec):
             "float32": "float",
             "float": "float",
             "int64": "int64_t",
+            "int32": "int32_t",
         }
     )
 
diff --git a/python/aitemplate/backend/cuda/tensor/expand.py b/python/aitemplate/backend/cuda/tensor/expand.py
index cdd50a486..45c4d7f08 100644
--- a/python/aitemplate/backend/cuda/tensor/expand.py
+++ b/python/aitemplate/backend/cuda/tensor/expand.py
@@ -13,19 +13,410 @@
 #  limitations under the License.
 #
 
-from ... import registry
+
+from typing import Any, Dict
+
+import jinja2
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.tensor import expand_static_shape  # noqa: F401
+
+
+def _to_cuda_dtype(dtype):
+    dtype = CUDASpec().dtype_to_backend_dtype.get(dtype, None)
+    return dtype
 
 
 @registry.reg("cuda.expand.func_decl")
 def gen_function_decl(func_attrs):
-    raise NotImplementedError("Expand copying kernel is not implemented.")
+    if func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]:
+        func = registry.get("cuda.expand.static.func_decl")
+        return func(func_attrs)
+    x = func_attrs["inputs"][0]
+    func_name = func_attrs["name"]
+    index_type = _to_cuda_dtype(func_attrs.get("index_type", "int64"))
+    dt = x.dtype()
+    dtype = _to_cuda_dtype(dt)
+    assert (
+        dtype is not None
+    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,  # name of the function
+        dtype=dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float ))
+        index_type=index_type,
+    )
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  const void* src,
+  const {{index_type}}* input_dims,
+  const {{index_type}} input_rank,
+  void* dst,
+  {{index_type}}* output_dims, // written to ( runtime shape inference )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types,
+  cudaStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "logging.h"
+
+using bfloat16 = __nv_bfloat16;
+
+{% if index_type=="int64_t" %}
+#define DIM_TYPE_ADD 0l
+#define DIM_TYPE_EXPAND 1l
+#define DIM_TYPE_KEEP 2l
+
+#define MAX_THREADS_PER_BLOCK 1024l
+#define MAX_BLOCKS 65535l
+#define MAX_X_BLOCKS 2147483647l
+{% else %}
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024
+#define MAX_BLOCKS 65535
+#define MAX_X_BLOCKS 2147483647
+{% endif %}
+
+// integer ceil division
+#define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
+
+/**
+ * Sequential write expand kernel for single block case.
+ *
+ * This kernel is optimized for small inputs, where we can load
+ * the entire  input into shared memory more or less at once
+ */
+__global__ void {{func_name}}_sequential_write_single_block_kernel(
+  // Implementation for small inputs where the entire src can be read into shared memory,
+  // and we have just one thread block
+  const {{dtype}}* src,
+  const {{index_type}} src_numel,
+  {{dtype}}* dst,
+  const {{index_type}} dst_numel
+  {% for i in range(output_rank) %}
+        ,const {{index_type}} output_strides_{{i}}
+        ,const {{index_type}} read_strides_{{i}}
+  {% endfor %}
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_idx = threadIdx.x;
+    extern __shared__ {{dtype}} src_shared[]; // dynamic shared memory
+    if (write_idx<src_numel) {
+        src_shared[write_idx] = src[write_idx];
+    }
+    __syncthreads();
+    {{index_type}} read_idx = 0;
+    {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
+    {% for i in range(output_rank) %}
+        read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
+        remaining_idx %= output_strides_{{i}};
+    {% endfor %}
+    if (write_idx<dst_numel) {
+        dst[write_idx] = src_shared[read_idx];
+    }
+}
+
+/**
+ * Sequential write expand kernel with batched read/writes on trailing
+ * dimensions.
+ *
+ * This kernel is optimized for the case that trailing dimensions
+ * are kept between input and output, in which case we can do block-wise
+ * reads and writes.
+ */
+__global__ void {{func_name}}_sequential_write_batch_kernel(
+
+  const {{dtype}}* src,
+  {{dtype}}* dst,
+  const {{index_type}} dst_numel,
+  const {{index_type}} batch_size
+  {% for i in range(output_rank) %}
+        ,const {{index_type}} output_strides_{{i}}
+        ,const {{index_type}} read_strides_{{i}}
+  {% endfor %}
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_idx = (blockDim.x * blockIdx.x + blockDim.y * blockIdx.y + blockDim.z * blockIdx.z + threadIdx.x) * batch_size;
+    {{index_type}} read_idx = 0;
+    {{index_type}} i = write_idx; // Used to calculate remainder
+    {% for i in range(output_rank) %}
+        read_idx += (i / output_strides_{{i}}) * read_strides_{{i}};
+        i %= output_strides_{{i}};
+    {% endfor %}
+    if (write_idx+batch_size-1<dst_numel) {
+        dst[write_idx] = src[read_idx];
+        for (i = 1; i < batch_size; i++) {
+            dst[write_idx+i] = src[read_idx+i];
+        }
+    }
+}
+
+/**
+ * Sequential write expand kernel.
+ * This kernel deals with the general case. It relies heavily on L2 cache
+ * for scattered read optimization and does sequential writes.
+ * This was benchmarked against an alternative implementation that tried
+ * to minimize overall memory accesses, doing sequential reads and scattered
+ * writes. But this implementation is faster.
+ */
+__global__ void {{func_name}}_sequential_write_kernel(
+
+  const {{dtype}}* src, // source tensor
+  {{dtype}}* dst, // destination tensor
+  const {{index_type}} dst_numel // number of elements in dst
+  {% for i in range(output_rank) %}
+        ,const {{index_type}} output_strides_{{i}} // Stride for writing dimension {{i}} to dst
+        ,const {{index_type}} read_strides_{{i}} // Stride for reading dimension {{i}} from src
+  {% endfor %}
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_idx = blockDim.x * blockIdx.x + blockDim.y * blockIdx.y + blockDim.z * blockIdx.z + threadIdx.x;
+    {{index_type}} read_idx = 0;
+    {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
+    {% for i in range(output_rank) %}
+        read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
+        remaining_idx %= output_strides_{{i}};
+    {% endfor %}
+    if (write_idx<dst_numel) {
+        dst[write_idx] = src[read_idx];
+    }
+}
+
+/**
+ * Expand Operator entry point with support for dynamic shapes
+ */
+void {{func_name}} (
+  const void* src, // input tensor
+  const {{index_type}}* input_dims, // input dimensions ( passed by value )
+  const {{index_type}} input_rank,
+  void* dst, // output tensor
+  {{index_type}}* output_dims, // output dimensions ( passed by value )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types, // Output dim types ( length=output_rank ). 2 = keep dimension, 1 = expand dimension, 0 = add dimension
+  cudaStream_t stream)
+{
+  // Calculate number of input elements
+  {{index_type}} input_numel = 1;
+  {{index_type}} i;
+  for (i = 0; i < input_rank; ++i) {
+    input_numel *= input_dims[i];
+  }
+  {{index_type}} input_dim_pos = 0;
+
+  // Calculate number of output dimensions
+  {{index_type}} output_numel = 1;
+  for (i = 0; i < output_rank; ++i) {
+    output_numel *= output_dims[i];
+  }
+
+  // Determine stride for each input dimension
+  {{index_type}} input_strides[input_rank];
+  input_strides[input_rank-1] = 1;
+  for (i=input_rank-2;i>=0;--i) {
+    input_strides[i] = input_strides[i+1]*input_dims[i+1];
+  }
+  // Determine stride for each output dimension
+  {{index_type}} output_strides[output_rank];
+  output_strides[output_rank-1] = 1;
+  for (i=output_rank-2;i>=0;--i) {
+    output_strides[i] = output_strides[i+1]*(output_dims[i+1]);
+  }
+
+  // Determine read strides for each output dimension
+  // (0 for expand or add dims, otherwise the stride of
+  // of the corresponding input dim)
+  {{index_type}} read_strides[output_rank];
+
+  input_dim_pos = 0;
+  for (i = 0; i < output_rank; ++i) {
+    {{index_type}} dim_type =  output_dim_types[i];
+    if (dim_type == DIM_TYPE_KEEP ) { // keep
+      read_strides[i] = input_strides[input_dim_pos++];
+    } else {
+      read_strides[i] = 0;
+      if (dim_type==DIM_TYPE_EXPAND) {
+        input_dim_pos++;
+      }
+    }
+  }
+  assert(input_dim_pos==input_rank);
+
+  // Calculating tail dimension in order to determine whether we can do sequential batching
+  {{index_type}} tail_dim = 1;
+  for (i = output_rank-1; i >= 0; --i) {
+      if (output_dim_types[i]!=DIM_TYPE_KEEP) {
+         break;
+      }
+      tail_dim *= output_dims[i];
+  }
+
+  {{index_type}} batch_size = 1; // sequential batch len
+
+  if (output_numel>MAX_THREADS_PER_BLOCK) {
+    // If the input/output is so small that we can read it all into shared mem,
+    // sequential batching makes no sense
+    batch_size = 7; // Determined experimentally via benchmark.
+                    // Should be reevaluated after algorithmic changes.
+    for (;batch_size>1;--batch_size) {
+      if ((tail_dim % batch_size)==0) {
+          break;
+      }
+    }
+  }
+  assert ((output_numel % batch_size)==0);
+
+  // determine CUDA kernel grid layout
+  {{index_type}} output_batches = output_numel / batch_size;
+
+  {{index_type}} block_size = INT_CEIL_DIV(output_batches, MAX_THREADS_PER_BLOCK);
+  {{index_type}} thread_size_x = min(MAX_THREADS_PER_BLOCK, output_batches);
+
+  {{index_type}} block_size_x = block_size;
+  {{index_type}} block_size_y = 1;
+  {{index_type}} block_size_z = 1;
+
+  // for very large dimensions, we need to split into x,y,z grid blocks
+  if (block_size_x>MAX_X_BLOCKS) {
+      block_size_y = INT_CEIL_DIV(block_size_x, MAX_X_BLOCKS);
+      block_size_x = MAX_X_BLOCKS;
+      if (block_size_y > MAX_BLOCKS) {
+        block_size_z = INT_CEIL_DIV(block_size_y, MAX_BLOCKS);
+        block_size_y = MAX_BLOCKS;
+      }
+  }
+  dim3 dimGrid(block_size_x, block_size_y, block_size_z);
+  dim3 dimBlock(thread_size_x, 1, 1);
+  // Select the right kernel to call and call it
+  if (batch_size==1) {
+    if (block_size_x>1) {
+      {{func_name}}_sequential_write_kernel<<<dimGrid,dimBlock,0,stream>>>(
+          static_cast<const {{dtype}}*>(src),
+          static_cast<{{dtype}}*>(dst),
+          output_numel
+          {% for i in range(output_rank) %}
+            ,output_strides[{{i}}]
+            ,read_strides[{{i}}]
+          {% endfor %}
+      );
+    } else {
+      {{func_name}}_sequential_write_single_block_kernel<<<dimGrid,dimBlock,sizeof({{dtype}})*input_numel,stream>>>(
+          static_cast<const {{dtype}}*>(src),
+          input_numel,
+          static_cast<{{dtype}}*>(dst),
+          output_numel
+          {% for i in range(output_rank) %}
+            ,output_strides[{{i}}]
+            ,read_strides[{{i}}]
+          {% endfor %}
+      );
+    }
+  } else {  // batch_size>1, asserting (thread_size_x % batch_size)==0
+      {{func_name}}_sequential_write_batch_kernel<<<dimGrid,dimBlock,0,stream>>>(
+          static_cast<const {{dtype}}*>(src),
+          static_cast<{{dtype}}*>(dst),
+          output_numel,
+          batch_size
+          {% for i in range(output_rank) %}
+            ,output_strides[{{i}}]
+            ,read_strides[{{i}}]
+          {% endfor %}
+      );
+  }
+}
+"""
+)
+
+
+def create_template_args(func_attrs: Dict[str, Any], indent="  "):
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    dtype = _to_cuda_dtype(x.dtype())
+    assert (
+        dtype is not None
+    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    index_type = _to_cuda_dtype(func_attrs.get("index_type", "int64"))
+    assert index_type is not None
+
+    input_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in xshape]
+    )
+    output_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in yshape]
+    )
+    input_rank = len(xshape)
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "input_dims": input_dims,  # list of input dimensions (as string of comma-separated variable names )
+        "output_dims": output_dims,  # output dimensions (as string of comma-separated variable names)
+        "input_rank": input_rank,  # number of input dimensions
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float ))
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+    }
 
 
 @registry.reg("cuda.expand.gen_function")
 def gen_function(func_attrs):
-    raise NotImplementedError("Expand copying kernel is not implemented.")
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+    else:
+        func = registry.get("cuda.expand.static.gen_function")
+        return func(func_attrs)
 
 
 @registry.reg("cuda.expand.func_call")
-def gen_function_call(func_attrs, indent="  "):
-    raise NotImplementedError("Expand copying kernel is not implemented.")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+    else:
+        func = registry.get("cuda.expand.static.func_call")
+        return func(func_attrs, indent)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}const {{index_type}} input_dims[] = { {{input_dims}} };
+    {{indent}}{{index_type}} output_dims[] = { {{output_dims}} };
+    {{indent}}const {{index_type}} output_dim_types[] = { {{dim_types}} };
+    {{indent}}{{func_name}}(
+    {{indent}}    {{src}},
+    {{indent}}    input_dims,
+    {{indent}}    {{input_rank}},
+    {{indent}}    {{dst}},
+    {{indent}}    output_dims,
+    {{indent}}    {{output_rank}},
+    {{indent}}    output_dim_types,
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/cuda/tensor/expand_static_shape.py b/python/aitemplate/backend/cuda/tensor/expand_static_shape.py
new file mode 100644
index 000000000..06683070d
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/expand_static_shape.py
@@ -0,0 +1,394 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import math
+import os
+from itertools import accumulate
+from operator import mul
+from typing import Any, Dict, List
+
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
+
+"""
+Specialized and optimized CUDA kernel declarations for the `expand` operator
+dealing with the most common case that the input and target shapes are known at compile time,
+with the possible exception of leading dimensions.
+
+"""
+
+
+@registry.reg("cuda.expand.static.func_decl")
+def gen_function_decl(func_attrs):
+    return FUNC_DECL_TEMPLATE.render(create_template_args(func_attrs))
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  cudaStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <cuda_pipeline.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "logging.h"
+
+
+using bfloat16 = __nv_bfloat16;
+
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024l
+// integer ceil division
+#define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
+
+{{custom_libs}}
+
+/**
+ * Get read base offset (e.g. excluding tail offset) in the middle part, given a write offset
+ * into the middle part
+ */
+__forceinline__ __device__ {{index_type}} {{func_name}}_get_read_offset(const {{index_type}} write_offset) {
+    {{index_type}} read_idx = 0;
+    {{index_type}} remaining_write_idx = write_offset; // assert < {{mid_size*tail_size}} ( i.e. < mid_size*tail_size)
+    {% for i in range(head_dim_count, head_dim_count+mid_dim_count-1) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+        remaining_write_idx %= {{output_strides[i]}}l;
+    {% endfor %}
+    {% for i in range(head_dim_count+mid_dim_count-1, head_dim_count+mid_dim_count) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+    {% endfor %}
+    return read_idx;
+}
+
+/**
+ *  Copies tail elements from a contiguous source memory region into a contiguous target memory region
+ *  Using a grid-stride loop and the vectorized dtype
+ *
+ * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ */
+__forceinline__ __device__ void {{func_name}}_tail_copy(
+        const {{dtype}} * const src, // base src tensor memory pointer
+        const {{index_type}} read_offset, // base offset into src, via {{dtype}}-typed indexing
+        {{dtype}} * const dst,  // base destination tensor memory pointer
+        const {{index_type}} write_offset, // Base offset into dst via {{dtype}}-typed indexing
+        const {{index_type}} block_thread_index,
+        const {{index_type}} block_thread_count,
+        const {{index_type}} copy_numel
+    ) {
+    for ({{index_type}} i=block_thread_index;i<copy_numel;i+=block_thread_count) {
+        dst[write_offset+i] = src[read_offset+i];
+    }
+}
+
+
+/**
+ * Implement the "middle" part of the kernel, where we have to deal with non-contiguous reads/writes.
+ *
+ * Also utilizes grid-stride loop for efficiency and flexibility
+ * see  * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ */
+__global__ void {{func_name}}_mid_kernel(
+
+  const {{dtype}}* const src, // source tensor
+  {{dtype}}* const dst // destination tensor
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_offset = (blockDim.x * blockIdx.x + threadIdx.x) * {{tail_size}}l;
+    const {{index_type}} read_offset = {{func_name}}_get_read_offset(write_offset);
+    const {{index_type}} grid_size_x = gridDim.x*blockDim.x;
+    const {{index_type}} grid_size_y = gridDim.y*blockDim.y;
+    const {{index_type}} thread_idx_y = blockDim.y * blockIdx.y + threadIdx.y;
+    for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=grid_size_x) {
+        {{func_name}}_tail_copy(src, read_offset, dst, write_offset, thread_idx_y, grid_size_y, {{tail_size}}l);
+    }
+
+}
+
+
+__global__ void {{func_name}}_mid_kernel2(
+
+  const {{dtype}}* const src, // source tensor
+  {{dtype}}* const dst // destination tensor
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_offset = (blockDim.y * blockIdx.y + threadIdx.y) * {{tail_size}}l;
+    const {{index_type}} read_offset = {{func_name}}_get_read_offset(write_offset);
+    const {{index_type}} grid_size_x = gridDim.x*blockDim.x;
+    const {{index_type}} grid_size_y = gridDim.y*blockDim.y;
+    const {{index_type}} step_size_y = grid_size_y * {{tail_size}}l;
+    const {{index_type}} thread_idx_x = blockDim.x * blockIdx.x + threadIdx.x;
+    for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=step_size_y) {
+        {{func_name}}_tail_copy(src, read_offset, dst, write_offset, thread_idx_x, grid_size_x, {{tail_size}}l);
+    }
+
+}
+
+/**
+ * Expand Operator entry point, optimized for static shapes. Only the head dimension may be dynamic.
+ */
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  cudaStream_t stream)
+{
+  {% if mid_dim_count>0 %}
+  // we have middle dimensions which involve non-contiguous reads
+  // so we need to invoke the middle kernel
+  dim3 dimGrid({{mid_grid_blocks_x}}, {{mid_grid_blocks_y}});
+  dim3 dimBlock({{mid_grid_threads_x}}, {{mid_grid_threads_y}});
+  {{func_name}}_mid_kernel2<<<dimGrid,dimBlock,0,stream>>>(src, dst);
+  if (head_size>1l) {
+     // now repeat copy what we already built once, multiple times into the rest of the output tensor
+     cuda_repeat_head(dst, {{mid_size*tail_size}}l*sizeof({{dtype}}),head_size-1, stream);
+  }
+  {% else %}
+    // we have no middle dimensions, so all we need to do is repeatedly copy the source multiple times
+    // repeat the entire thing a dynamic number of times ( e.g. head_size times )
+    cuda_repeat_src(src, dst, {{mid_size*tail_size}}l*sizeof({{dtype}}), head_size, stream);
+  {% endif %}
+}
+"""
+)
+
+_dtype_sizes = {
+    "half": 2,
+    "bfloat16": 2,
+    "float32": 4,
+    "int64_t": 8,
+    "int32_t": 4,
+    "float": 4,
+}
+
+_size_dtypes = {
+    2: "half",
+    4: "float",
+    8: "int64_t",
+    16: "int4",
+}
+
+
+def _ceil(num):
+    return int(math.ceil(num))
+
+
+def create_template_args(func_attrs: Dict[str, Any], indent="  "):
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    custom_libs = Target.current().get_custom_libs(
+        os.path.dirname(__file__), "repeat.cuh"
+    )
+    dtype = CUDASpec().dtype_to_backend_dtype[x.dtype()]
+    assert (
+        dtype is not None
+    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+    dtype2 = _size_dtypes.get(_dtype_sizes[dtype] * 2, None)
+    dtype4 = _size_dtypes.get(_dtype_sizes[dtype] * 4, None)
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    dim_types: List[ExpandDimensionType] = func_attrs["dim_types"]
+    index_type = "int64_t"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in xshape
+    ), "All input shapes need to be fixed"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in yshape
+    ), "All output shapes need to be fixed"
+
+    # Calculate number of times we can repeatedly copy the entire result, based on how many add, expand and singleton dimensions
+    # we have at the start
+    head_size_lower = 1  # Number of times we can batch-repeat the entire result in an efficient batch-copying manner
+    head_size_upper = 1
+    head_dim_count = 0  # Number of head dimensions
+
+    for dim_type, dim in zip(func_attrs["dim_types"], yshape):
+        if dim_type == ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        head_size_lower *= dim.lower_bound()
+        head_size_upper *= dim.upper_bound()
+        head_dim_count += 1
+
+    # Create a symbolic term for calculating head size ( e.g. repeat count )
+    if head_size_lower == head_size_upper:
+        head_size_symbolic = f"{head_size_upper}l"
+    else:
+        head_size_symbolic = "*".join(
+            [
+                f"static_cast<{index_type}>(" + dim._attrs["name"] + ")"
+                for dim in yshape[:head_dim_count]
+            ]
+        )
+
+    # Calculate number of tail elements, e.g. number of elements we can batch-copy in the inner loop
+    # via effective sequential reads & writes
+    tail_dim_count = 0  # number of tail dimensions
+    tail_size = 1  # Number of the elements in all these  tail dimensions
+    for dim_type, dim in reversed(
+        list(zip(dim_types[head_dim_count:], yshape[head_dim_count:]))
+    ):
+        if dim_type != ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        tail_dim_count += 1
+        tail_size *= dim.lower_bound()
+
+    input_strides = list(
+        reversed(
+            list(accumulate([1] + [d.lower_bound() for d in reversed(xshape)], mul))
+        )
+    )
+    output_strides = list(
+        reversed(
+            list(
+                accumulate(
+                    [1] + [d.lower_bound() for d in reversed(yshape[head_dim_count:])],
+                    mul,
+                )
+            )
+        )
+    )
+
+    output_numel = output_strides[
+        0
+    ]  # this does not include the number of elements obtained from head repetitions
+    # since we have excluded head dimensions above
+    input_numel = input_strides[0]
+
+    mid_size = output_numel // tail_size
+    mid_dim_count = len(yshape) - tail_dim_count - head_dim_count
+
+    mid_expansion_rate = mid_size * tail_size // input_numel
+
+    # remove the first dimension, which is the total number of elements
+    # and prepend the head_dims with stride 0
+    output_strides = [0] * head_dim_count + output_strides[1:]
+    input_strides = input_strides[1:]
+
+    input_stride_pos = 0
+    read_strides = [0] * len(yshape)
+    for i in range(len(yshape)):
+        if dim_types[i] == ExpandDimensionType.ADD_DIM:
+            continue
+        if dim_types[i] == ExpandDimensionType.KEEP_DIM:
+            read_strides[i] = input_strides[input_stride_pos]
+        # For keep dim, read stride remains at zero
+        input_stride_pos += 1
+
+    assert input_stride_pos == len(
+        xshape
+    ), "Incorrect number of keep and expand dims. Something went wrong."
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+
+    # If tail size is aligned to 2 or 4 elements, we can vectorize reads/writes
+    # Note: Further vectorization not easily possible, given that it could happen that
+    # the read offset and the write offset can get different alignments within the expand op
+    #
+    if (tail_size % 4 == 0) and (dtype4 is not None):
+        dtype = dtype4
+        tail_size = tail_size // 4
+        output_strides = [s // 4 for s in output_strides]
+        read_strides = [s // 4 for s in read_strides]
+    elif tail_size % 2 == 0:
+        dtype = dtype2
+        tail_size = tail_size // 2
+        output_strides = [s // 2 for s in output_strides]
+        read_strides = [s // 2 for s in read_strides]
+
+    mid_grid_blocks_x = 1
+    mid_grid_threads_x = min(tail_size, 32)
+    mid_max_y_threads = 1024 // mid_grid_threads_x  # guaranteed to be >= 1
+    mid_grid_threads_y = min(
+        mid_max_y_threads, mid_size
+    )  # so that  mid_grid_threads_x*max_x_threads <= 1024
+    mid_grid_blocks_y = _ceil(mid_size / mid_grid_threads_y)
+
+    if dtype == "bfloat16":
+        # bfloat16 is not available in model-generated.h as a type,
+        # so we can either just declare the input to be void*
+        # or  just use the fact that we don't care about how to interpret the value
+        # and just treat it like every other 16 bit type.
+        dtype = "half"
+
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "output_strides": output_strides,  # list of output stride values
+        "read_strides": read_strides,  # list of read stride values
+        "tail_dim_count": tail_dim_count,  # number of tail dimensions
+        "tail_size": tail_size,  # number of elements in all these tail dimensions
+        "head_dim_count": head_dim_count,  # number of head dimensions
+        "head_size": head_size_symbolic,  # number of elements in all these head dimensions
+        "mid_dim_count": mid_dim_count,
+        "mid_size": mid_size,
+        "mid_expansion_rate": mid_expansion_rate,  # How many times do we read the input for the middle
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float )
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+        "mid_grid_blocks_y": mid_grid_blocks_y,
+        "mid_grid_blocks_x": mid_grid_blocks_x,
+        "mid_grid_threads_y": mid_grid_threads_y,
+        "mid_grid_threads_x": mid_grid_threads_x,
+        "custom_libs": custom_libs,
+    }
+
+
+@registry.reg("cuda.expand.static.gen_function")
+def gen_function(func_attrs):
+    return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+
+
+@registry.reg("cuda.expand.static.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}{{func_name}}(
+    {{indent}}    static_cast<{{dtype}}*>({{src}}),
+    {{indent}}    static_cast<{{dtype}}*>({{dst}}),
+    {{indent}}    {{head_size}},
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/cuda/tensor/repeat.cuh b/python/aitemplate/backend/cuda/tensor/repeat.cuh
new file mode 100644
index 000000000..cffb6602c
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/repeat.cuh
@@ -0,0 +1,161 @@
+/*
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+Functions for repeating parts of a CUDA source tensor onto itself
+or into a target tensor.
+
+Used by expand_static_shape.py ( expand operator )
+
+*/
+
+#define INT_CEIL_DIV(a, b) (((a) + (b)-1) / (b))
+#define SHM_MAX 1024 * 44
+
+__global__ void repeat_head_kernel(
+    const int64_t* const src,
+    int64_t* data,
+    size_t head_mem_num_elements,
+    size_t num_repeat_copies) {
+  extern __shared__ int64_t shared[];
+  const size_t stride_y = blockDim.y * gridDim.y;
+  const size_t stride_x = blockDim.x * gridDim.x;
+
+  // outer grid-stride loop
+  for (size_t ri = blockDim.x * blockIdx.x + threadIdx.x;
+       ri < head_mem_num_elements;
+       ri += stride_x) {
+    // read only with one thread per y dim
+    if (threadIdx.y == 0) {
+      // in y direction: thread 0 reads, all threads write
+      // repeatedly direct async copy from global to shared memory, see
+      // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#optimizing-cuda-applications
+      // and
+      // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memcpy-async-primitiv
+      __pipeline_memcpy_async(&shared[threadIdx.x], &src[ri], sizeof(int64_t));
+      __pipeline_commit();
+      __pipeline_wait_prior(0);
+    }
+    __syncthreads(); // wait for shared memory to be populated
+    // inner grid-stride loop, write with all threads out of shared memory
+    size_t wi = threadIdx.y + blockDim.y * blockIdx.y;
+    for (; wi < num_repeat_copies; wi += stride_y) {
+      data[ri + head_mem_num_elements * wi] = shared[threadIdx.x];
+    }
+  }
+}
+
+__host__ cudaError_t cuda_repeat_head_vectorized(
+    const int64_t* const src,
+    int64_t* data,
+    size_t head_mem_num_elements,
+    size_t num_repeat_copies,
+    cudaStream_t stream) {
+  size_t threads_x = 32;
+  size_t threads_y = 1024 / threads_x;
+  size_t blocks_x = INT_CEIL_DIV(head_mem_num_elements, threads_x);
+  size_t blocks_y = INT_CEIL_DIV(num_repeat_copies, threads_y);
+  size_t serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks if necessary, so we do not exceed available shared
+  // memory
+  blocks_y = INT_CEIL_DIV(
+      blocks_y, serialization_level); // reduce thread count in y dimension
+                                      // first, e.g. sequentialized writes
+  serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks in x direction if this is not sufficient yet
+  blocks_x = INT_CEIL_DIV(blocks_x, serialization_level);
+  dim3 dimGrid(blocks_x, blocks_y);
+  dim3 dimBlock(threads_x, threads_y);
+  repeat_head_kernel<<<
+      dimGrid,
+      dimBlock,
+      threads_x * sizeof(int64_t),
+      stream>>>(src, data, head_mem_num_elements, num_repeat_copies);
+  return cudaPeekAtLastError();
+}
+
+__host__ cudaError_t cuda_repeat_head(
+    void* data,
+    const size_t head_mem_bytes,
+    size_t num_repeat_copies,
+    cudaStream_t stream) {
+  cudaError_t res = cudaSuccess;
+  if (num_repeat_copies == 0)
+    return res;
+  if ((head_mem_bytes % 8) == 0) {
+    // no need to double memory any further if it is 64-bit aligned
+    res = cuda_repeat_head_vectorized(
+        static_cast<const int64_t* const>(data),
+        static_cast<int64_t*>(data) + (head_mem_bytes / 8),
+        head_mem_bytes / 8,
+        num_repeat_copies,
+        stream);
+    if (res != cudaSuccess) {
+      return res;
+    }
+  } else {
+    res = cudaMemcpyAsync(
+        static_cast<void*>(static_cast<uint8_t*>(data) + head_mem_bytes),
+        data,
+        head_mem_bytes,
+        cudaMemcpyDeviceToDevice,
+        stream);
+    if (res != cudaSuccess) {
+      return res;
+    }
+    if (num_repeat_copies >= 2) {
+      // recurse
+      // we have already repeated 1 time, therefore the (num_repeat_copies-1)
+      res = cuda_repeat_head(
+          data, head_mem_bytes * 2, (num_repeat_copies - 1) / 2, stream);
+      if (res != cudaSuccess) {
+        return res;
+      }
+      // deal with possible remainder
+      if (((num_repeat_copies - 1) % 2) == 1) {
+        res = cudaMemcpyAsync(
+            static_cast<void*>(
+                static_cast<uint8_t*>(data) +
+                num_repeat_copies * head_mem_bytes),
+            data,
+            head_mem_bytes,
+            cudaMemcpyDeviceToDevice,
+            stream);
+      }
+    }
+  }
+  return res;
+}
+
+__host__ cudaError_t cuda_repeat_src(
+    const void* const src,
+    void* data,
+    const size_t head_mem_bytes,
+    size_t num_repeat_copies,
+    cudaStream_t stream) {
+  cudaError_t res = cudaSuccess;
+  if (num_repeat_copies == 0) {
+    return res;
+  }
+
+  res = cudaMemcpyAsync(
+      data, src, head_mem_bytes, cudaMemcpyDeviceToDevice, stream);
+  if ((res != cudaSuccess) || (num_repeat_copies == 1)) {
+    return res;
+  }
+  return cuda_repeat_head(data, head_mem_bytes, num_repeat_copies - 1, stream);
+}
diff --git a/python/aitemplate/compiler/ops/tensor/expand.py b/python/aitemplate/compiler/ops/tensor/expand.py
index b4a436a87..0dda00166 100644
--- a/python/aitemplate/compiler/ops/tensor/expand.py
+++ b/python/aitemplate/compiler/ops/tensor/expand.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+from enum import IntEnum
 from typing import List, Union
 
 from aitemplate.backend import registry
@@ -35,6 +36,12 @@ def _dim_has_value(dim: IntVar, value: int) -> bool:
     return isinstance(dim, IntImm) and dim.value() == value
 
 
+class ExpandDimensionType(IntEnum):
+    ADD_DIM = 0
+    EXPAND_DIM = 1
+    KEEP_DIM = 2
+
+
 class expand(Operator):
     """
     Expands a tensor's singleton dimensions.
@@ -44,12 +51,17 @@ class expand(Operator):
     The output shape may be dynamic.
 
     The other dimensions in the input must match the input shape exactly,
-    or be set to -1.
+    or be set to -1, in which case the output shape is unchanged for that dimension.
+
+    Tensor can be also expanded to a larger number of dimensions, and the new ones will
+    be appended at the front. For the new dimensions, the size cannot be set to -1.
 
     Args:
         input (Tensor) : the source tensor
-        dim (List[Union[IntImm, IntVar, int]]) : the target dim
-
+        shape (List[Union[IntImm, IntVar, int]]) : target shape ( dimensions with size -1 will be kept, excess dimensions are added at the front )
+        index_type (str): Native type used for indices, may be "int64" (default) or "int32".
+                          Pick "int32" only if the total number of elements is lower than 2^31
+        optimize_fixed_dims (bool) : if True, and if the conditions are given, allow to apply optimizatins assuming mostly fixed shapes.
     Returns:
         Tensor : the destination tensor
 
@@ -75,55 +87,92 @@ def __init__(self):
     def _should_reuse_input_dim(dim_tensor: IntVar, dim_arg: IntVar) -> bool:
         return _dim_has_value(dim_arg, -1) or dim_tensor == dim_arg
 
-    def _infer_shape(self, tensor: Tensor, shape: List[IntVar]) -> List[IntVar]:
+    def _infer_shape(self, tensor: Tensor, target_shape: List[IntVar]) -> List[IntVar]:
         output_shape = []
         input_shape = tensor._attrs["shape"]
+        assert len(input_shape) > 0, "Input tensor must have a shape of length > 0"
+        for i, dim in enumerate(input_shape):
+            if dim.lower_bound() <= 0:
+                raise ValueError(
+                    f"Dimension {i} of expand input tensor shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes zero or negative values."
+                )
+        for i, dim in enumerate(target_shape):
+            if dim.lower_bound() <= 0 and dim.lower_bound() != -1:
+                raise ValueError(
+                    f"Dimension {i} of expand target shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes zero or negative values."
+                )
 
-        if len(shape) != len(input_shape):
+        if len(target_shape) < len(input_shape):
             raise ValueError(
-                f"Input shape ndim ({len(shape)}) must match tensor's ndim ({len(input_shape)})"
+                f"Target shape length ({len(target_shape)}) must be greater or equal to input tensor's shape length ({len(input_shape)})"
             )
-
-        for i, dim_tensor in enumerate(input_shape):
-            dim_arg = shape[i]
+        add_ndims = len(target_shape) - len(input_shape)
+        for i, dim_to_add in enumerate(target_shape[:add_ndims]):
+            if dim_to_add.lower_bound() <= 0:
+                raise ValueError(
+                    f"Output shape dimension {i} to be added has value range [{dim_to_add.lower_bound()}:{dim_to_add.upper_bound()}], but violates constraint that it must be greater or equal to 1."
+                )
+            output_shape.append(dim_to_add)
+        self._attrs["dim_types"] = [
+            ExpandDimensionType.ADD_DIM
+        ] * add_ndims  # 0 meaning, dimension is added
+        for i, dim_input in enumerate(input_shape):
+            dim_target = target_shape[i + add_ndims]
 
             # Convert IntVars with the same upper and lower bounds to IntImm's.
             # This lets us tell that expanding IntImm(1) into IntVar([1, 1]) is
             # actually a no-op.
-            dim_tensor = _normalize_dim(dim_tensor)
-            dim_arg = _normalize_dim(dim_arg)
+            dim_input = _normalize_dim(dim_input)
+            dim_target = _normalize_dim(dim_target)
 
-            if self._should_reuse_input_dim(dim_tensor, dim_arg):
-                output_shape.append(
-                    gen_int_var(
-                        dim_tensor._attrs["values"], name=dim_tensor._attrs["name"]
-                    )
-                )
-            elif _dim_has_value(dim_tensor, 1):
-                if self._attrs["expand_dim"] is not None:
-                    raise NotImplementedError(
-                        f"Expand only supports expanding one dim. Tried to expand dim {i}, but already expanded dim {self._attrs['expand_dim']}."
-                    )
-                self._attrs["expand_dim"] = i
+            if self._should_reuse_input_dim(dim_input, dim_target):
                 output_shape.append(
-                    gen_int_var(dim_arg._attrs["values"], name=dim_arg._attrs["name"])
-                )
+                    dim_input
+                )  # no deepcopy, dim symbol should be identical
+                self._attrs["dim_types"].append(
+                    ExpandDimensionType.KEEP_DIM
+                )  # 2 meaning, dimension is kept as is
+            elif _dim_has_value(dim_input, 1):
+                output_shape.append(dim_target)
+                self._attrs["dim_types"].append(
+                    ExpandDimensionType.EXPAND_DIM
+                )  # 1 meaning, dimension is expanded
             else:
                 raise ValueError(
-                    f"Tried to expand non-singleton dimension {i}. Input tensor dim: {dim_tensor}, target shape dim: {dim_arg}"
+                    f"Tried to expand non-singleton dimension {i}. Input tensor dim: {dim_input}, target shape dim: {dim_target}"
                 )
-
+        head_dim_count = 0
+        head_size = 1
+        for dim_type, dim in zip(self._attrs["dim_types"], output_shape):
+            if dim_type == ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+                break
+            head_size *= dim.lower_bound()
+            head_dim_count += 1
+        self._attrs["head_dim_count"] = head_dim_count
+        self._attrs["head_size"] = head_size
+        self._attrs["non_head_dims_are_fixed"] = all(
+            dim.lower_bound() == dim.upper_bound() for dim in output_shape[add_ndims:]
+        )
         return output_shape
 
     def __call__(
-        self, tensor: Tensor, shape: List[Union[int, IntVar, IntVarTensor]]
+        self,
+        tensor: Tensor,
+        shape: List[Union[int, IntVar, IntVarTensor]],
+        index_type="int64",
+        optimize_fixed_dims=True,
     ) -> Tensor:
         self._attrs["inputs"] = [tensor]
+        self._attrs["index_type"] = index_type
+        self._attrs["optimize_fixed_dims"] = optimize_fixed_dims
         for dim in shape:
             if isinstance(dim, IntVarTensor):
                 self._attrs["inputs"].append(dim)
         shape = convert_shape_to_IntVar(shape)
+        if index_type not in ["int64", "int32"]:
+            raise ValueError("index_type for expand op has to be int64_t or int32_t")
         self._set_depth()
+
         output_shape = self._infer_shape(tensor, shape)
         output = Tensor(output_shape, src_ops={self}, dtype=tensor._attrs["dtype"])
         self._attrs["outputs"] = [output]
diff --git a/python/aitemplate/compiler/transform/remove_no_ops.py b/python/aitemplate/compiler/transform/remove_no_ops.py
index 0bf586e40..cd0a8b81f 100644
--- a/python/aitemplate/compiler/transform/remove_no_ops.py
+++ b/python/aitemplate/compiler/transform/remove_no_ops.py
@@ -31,15 +31,14 @@
 """
 from typing import List
 
-from aitemplate.compiler.base import IntVar, Operator
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
 
 from aitemplate.compiler.transform import transform_utils
 
 from aitemplate.utils import graph_utils
 from aitemplate.utils.shape_utils import is_singleton_dimension
 
-from ..base import Tensor
-
 
 def _remove_no_op_expands(sorted_graph: List[Tensor]) -> List[Tensor]:
     """
@@ -56,9 +55,6 @@ def _remove_no_op_expands(sorted_graph: List[Tensor]) -> List[Tensor]:
         if op._attrs["op"] != "expand":
             continue
 
-        if op._attrs["expand_dim"] is not None:
-            continue
-
         outputs = op._attrs["outputs"]
         assert len(outputs) == 1, "expand must only have 1 output"
         expand_output = outputs[0]
@@ -70,6 +66,14 @@ def _remove_no_op_expands(sorted_graph: List[Tensor]) -> List[Tensor]:
         assert len(inputs) >= 1, "expand must have at least 1 input"
         expand_input = inputs[0]
 
+        assert len(op._attrs["dim_types"]) == len(
+            expand_output._attrs["shape"]
+        ), "expand must have dim_type for every output dimension"
+
+        # If we just keep every dimension as-is, it is a no-op
+        if any(dt != ExpandDimensionType.KEEP_DIM for dt in op._attrs["dim_types"]):
+            continue
+
         # This expand is a no-op, so we know that these shapes should
         # be the same. However, the shape inference system may not be aware
         # of that due to different IntVar names.
diff --git a/tests/unittest/compiler/test_fuse_expand.py b/tests/unittest/compiler/test_fuse_expand.py
index dcf3f67a9..64d8ac73e 100644
--- a/tests/unittest/compiler/test_fuse_expand.py
+++ b/tests/unittest/compiler/test_fuse_expand.py
@@ -55,7 +55,6 @@ def test_fuse_expand_elementwise(self, exact_match: bool, name: str):
 
                 z_ait = torch.empty_like(z_pt)
                 mod.run_with_tensors({"x": x_pt, "y": y_pt}, {"z": z_ait})
-
                 self.assertTrue(torch.equal(z_ait, z_pt))
 
 
diff --git a/tests/unittest/ops/test_expand.py b/tests/unittest/ops/test_expand.py
index e17500058..ca2aa8c6c 100644
--- a/tests/unittest/ops/test_expand.py
+++ b/tests/unittest/ops/test_expand.py
@@ -12,6 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import math
+import sys
 import unittest
 
 import torch
@@ -21,6 +23,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op
+from parameterized import param, parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -163,6 +166,252 @@ def test_no_op_expands_removed_size_op_fp32(self):
             dtype="float32",
         )
 
+    @parameterized.expand(
+        [
+            param("fp32_small_noadd_1", "float32", [10, 1, 5], [-1, 10, 5]),
+            param("fp32_small_noadd_2", "float32", [10, 1, 8], [-1, 10, 8]),
+            param("fp32_small_noadd_3", "float32", [10, 1, 2], [-1, 10, 2]),
+            param("fp32_small_noadd_4", "float32", [10, 1, 5], [10, 10, 5]),
+            param("fp32_small_1", "float32", [10, 1, 5], [3, 10, 10, 5]),
+            param("fp32_small_2", "float32", [3, 1, 5], [3, 3, 3, -1]),
+            param("fp32_small_3", "float32", [2, 1, 4, 1, 6], [-1, 10, 4, 5, 6]),
+            param("fp32_small_var_1", "float32", [10, 1, 5], [3, 10, 10, 5], False),
+            param("fp32_small_var_2", "float32", [1, 1, 5], [3, 3, 10, -1], False),
+            param(
+                "fp32_small_var_3", "float32", [2, 1, 4, 1, 6], [-1, 10, 4, 5, 6], False
+            ),
+            param("float16_small_1", "float16", [2, 3, 1, 5], [2, -1, 3, 10, 5]),
+            param("float16_small_2", "float16", [1, 2, 10], [10, 2, 10]),
+            param("bfloat16_small_1", "bfloat16", [2, 3, 1, 5], [2, -1, 3, 10, 5]),
+            param("int64_small_1", "int64", [2, 3, 1, 5], [2, -1, 3, 10, 5]),
+            param(
+                "fp32_large_1",
+                "float32",
+                [100, 1, 9, 3],
+                [2, 20, -1, 100, 9, -1],
+                "int32",
+            ),
+            param(
+                "fp32_large_2",
+                "float32",
+                [101, 1, 91, 3],
+                [-1, 100, 91, -1],
+                "int64",
+            ),
+            param(
+                "fp32_large_3",
+                "float32",
+                [100, 1, 9, 3],
+                [2, 20, -1, 100, 9, -1],
+                "int64",
+            ),
+            # Largest tests commented out, as these lead to GPU OOM failures on Github CircleCI Hardware
+            # param(
+            #    "fp32_large_4",
+            #    "float32",
+            #    [100, 1, 91, 3],
+            #    [2, 20, -1, 100, 91, -1],
+            #    "int64",
+            # ),
+            # param(
+            #     "fp32_large_5",
+            #     "float32",
+            #     [101, 1, 91, 7],
+            #     [3, 21, -1, 103, 91, -1],
+            #     "int64",
+            # ),
+            # param(
+            #     "fp32_large_repeat",
+            #     "float32",
+            #     [101, 1, 91, 8],
+            #     [1000, -1, -1, -1, -1],
+            #     "int64",
+            # ),
+            # param(
+            #     "fp32_large_var_2",
+            #     "float32",
+            #     [100, 1, 9, 3],
+            #     [2, 20, -1, 100, 9, -1],
+            #     False,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_1",
+            #     "float32",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     True,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_2",
+            #     "int64",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     True,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_3",
+            #     "float16",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     True,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_var_1",
+            #     "float32",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     False,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_var_2",
+            #     "int64",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     False,
+            #     "int64",
+            # ),
+            # param(
+            #     "benchmark_var_3",
+            #     "float16",
+            #     [100, 1, 9, 4],
+            #     [20, 20, 100, 100, 9, -1],
+            #     False,
+            #     "int64",
+            # ),
+            param("fp32_m_1", "float32", [5, 1, 3, 2], [2, 2, -1, 5, 3, -1]),
+            param("fp32_m_2", "float32", [5, 1, 3, 5], [2, 2, -1, 5, 3, -1]),
+            param("edge_case_shapes_1", "float32", [1, 1, 1, 1], [1, 1, -1, 1, -1, 1]),
+            param("edge_case_shapes_2", "float32", [1], [-1]),
+            param("edge_case_shapes_3", "float32", [3], [-1]),
+            param("edge_case_shapes_4", "float32", [1], [1]),
+            param(
+                "edge_case_shapes_var_1",
+                "float32",
+                [1, 1, 1, 1],
+                [1, 1, -1, 1, -1, 1],
+                False,
+            ),
+            param("edge_case_shapes_var_2", "float32", [1], [-1], False),
+            param("edge_case_shapes_var_3", "float32", [3], [-1], False),
+            param("edge_case_shapes_var_4", "float32", [1], [1], False),
+        ]
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_expand_op(
+        self,
+        name,
+        dtype,
+        src_shape,
+        expand_shape,
+        optimize_fixed_dims=True,
+        index_type="int64",
+    ):
+        x = Tensor(
+            src_shape,
+            name="X",
+            is_input=True,
+            dtype=dtype,
+        )
+        y = ops.expand()(
+            x,
+            expand_shape,
+            optimize_fixed_dims=optimize_fixed_dims,
+            index_type=index_type,
+        )
+        y._attrs["is_output"] = True
+        y._attrs["name"] = "Y"
+        if dtype != "int64":
+            x_pt = get_random_torch_tensor(src_shape, dtype=dtype)
+        else:
+            x_pt = torch.arange(
+                1, math.prod(src_shape) + 1, 1, dtype=torch.int64, device="cuda"
+            ).view(src_shape)
+        y_pt = x_pt.expand(expand_shape)
+        y_ait = torch.zeros_like(y_pt)
+        stream = torch.cuda.default_stream()
+        start_event_pt = torch.cuda.Event(enable_timing=True)
+        end_event_pt = torch.cuda.Event(enable_timing=True)
+        num_iters = 20
+        with compile_model(
+            y, detect_target(), "./tmp", "test_expand_codegen_" + name
+        ) as module:
+            module.run_with_tensors({"X": x_pt}, {"Y": y_ait})
+            self.assertTrue(graph_has_op(module.debug_sorted_graph, "expand"))
+            time_mean_ms, time_std_ms, result_tensors = module.benchmark_with_tensors(
+                {"X": x_pt}, {"Y": y_ait}, count=num_iters
+            )
+        print(
+            f"Write GB/sec:{1000*y_pt.numel()*y_pt.element_size()/time_mean_ms/(1024*1024*1024)}"
+        )
+        self.assertTrue(torch.equal(y_ait, y_pt))
+        # measure time against torch.contiguous()
+        cache_trasher = torch.zeros(1000, 1000, 42, device="cuda", requires_grad=False)
+        sum_elapsed_pt = 0.0
+        for _ in range(num_iters):
+            # trash the L2 cache, just like the benchmark code of AIT does
+            cache_trasher.normal_()
+            start_event_pt = torch.cuda.Event(enable_timing=True)
+            end_event_pt = torch.cuda.Event(enable_timing=True)
+            torch.cuda.synchronize()
+            start_event_pt.record(stream=stream)
+            _ = y_pt.contiguous()
+            end_event_pt.record(stream=stream)
+            torch.cuda.synchronize()
+            sum_elapsed_pt += start_event_pt.elapsed_time(end_event_pt)
+
+        pt_time = sum_elapsed_pt / num_iters
+        ait_throughput_write = (
+            1000
+            * y_pt.numel()
+            * y_pt.element_size()
+            / time_mean_ms
+            / (1024 * 1024 * 1024)
+        )
+        ait_throughput_read_once = (
+            1000
+            * x_pt.numel()
+            * x_pt.element_size()
+            / time_mean_ms
+            / (1024 * 1024 * 1024)
+        )
+        ait_throughput_total_lower_bound = (
+            ait_throughput_write + ait_throughput_read_once
+        )  # Assuming we just read the input once
+        ait_throughput_total_upper_bound = (
+            ait_throughput_write * 2
+        )  # Assuming every byte written has been read as well
+
+        pt_throughput_write = (
+            1000 * y_pt.numel() * y_pt.element_size() / pt_time / (1024 * 1024 * 1024)
+        )
+        pt_throughput_read = (  # Assuming we just read the input once
+            1000 * x_pt.numel() * x_pt.element_size() / pt_time / (1024 * 1024 * 1024)
+        )
+
+        pt_throughput_total_lower_bound = (
+            pt_throughput_write + pt_throughput_read
+        )  # Assuming we just read the input once
+        pt_throughput_total_upper_bound = (
+            pt_throughput_write * 2
+        )  # Assuming every byte written has been read as well
+
+        # ait_speedup_percent = round(100.0 * pt_time / time_mean_ms - 100.0)
+        ait_speedup_factor = f"{pt_time/time_mean_ms:.2f}"
+        ait_expand_variant = "general"
+        if optimize_fixed_dims:
+            ait_expand_variant = "optimized"
+        print(
+            f"""Benchmark Summary (test_expand_op:{name}) - {src_shape} => {expand_shape}: dtype={dtype}, variant={ait_expand_variant}. AIT speedup={ait_speedup_factor}x. Throughputs in GB/sec.: Write: pt={pt_throughput_write:.1f}, ait={ait_throughput_write:.1f}, Total (lower): pt={pt_throughput_total_lower_bound:.1f}, ait={ait_throughput_total_lower_bound:.1f} Total (upper): pt={pt_throughput_total_upper_bound:.1f}, ait=={ait_throughput_total_upper_bound:.1f} ]
+Benchmark note: Total throughput (lower) assumes the input is read once, Total throughput (upper) assumes every byte written has been read as well. The truth is inbetween due to caching of repeated reads.""",
+            file=sys.stdout,
+            flush=True,
+        )
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 75f54510d8e02114e013200a66ea9a5d433e5f81 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sat, 4 Mar 2023 02:38:08 -0800
Subject: [PATCH 214/638] enabled bmm_ccr/bmm_rrr + concat fusion (#359)

Summary:
This PR enabled bmm_ccr/bmm_rrr and concat fusion. It also clean-up the relevant unittests a bit.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/359

Reviewed By: tenpercent

Differential Revision: D43775333

Pulled By: chenyang78

fbshipit-source-id: 7ce94b00066f7f5142388eee397d6959cde183e0
---
 .../transform/transform_strided_ops.py        |   2 +-
 .../compiler/test_strided_op_cat_pattern.py   | 236 ++++++++++--------
 .../compiler/test_strided_view_cat.py         |  15 +-
 3 files changed, 144 insertions(+), 109 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 862af2329..3be187323 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -129,7 +129,7 @@ def _group_gemm_cat_checker(
 
 def _is_bmm(op_type: str) -> bool:
     # TODO: support cutlass bmm ops
-    return op_type.startswith(("bmm_rcr", "bmm_crr"))
+    return op_type.startswith(("bmm_rcr", "bmm_crr", "bmm_ccr", "bmm_rrr"))
 
 
 def _bmm_checker(bmm_op: Operator, cat_op: Operator) -> bool:
diff --git a/tests/unittest/compiler/test_strided_op_cat_pattern.py b/tests/unittest/compiler/test_strided_op_cat_pattern.py
index fd7a21dfe..36de11705 100644
--- a/tests/unittest/compiler/test_strided_op_cat_pattern.py
+++ b/tests/unittest/compiler/test_strided_op_cat_pattern.py
@@ -475,7 +475,6 @@ def _fused_gemm_e2e_helper(
             "./tmp",
             f"fused_gemm_m_{m}_k_{k}_n1_{n1}_n2_{n2}_n3_{n3}_{dtype}",
         ) as module:
-
             if not no_fuse:
                 # Verify the generated graph.
                 sorted_graph = module.debug_sorted_graph
@@ -620,7 +619,6 @@ def _fused_gemm_alignment_e2e_helper(
             "./tmp",
             f"fused_{gemm_op_kind}_alignment_input_n_{input_n}_m_{m}_n_{n}_k_{k}_{dtype}",
         ) as module:
-
             # Verify the generated graph.
             sorted_graph = module.debug_sorted_graph
             if gemm_op_kind == "gemm_rcr_bias_add":
@@ -1277,13 +1275,55 @@ def test_group_layernorm_sigmoid_mul_cat_fusion_float(self):
                 dtype="float",
             )
 
-    def _test_bmm_rcr_cat_fusion(
+    def _bmm_parameters(self, bmm_op_name, B, M, N, K):
+        """
+        Return a dict of parameters used for constructing bmm ops
+        """
+        bmm_op_name = bmm_op_name[:7]
+        bmm_rcr_dict = {
+            "a_shape": [B, M, K],
+            "b_shape": [B, N, K],
+            "c_shape": [B, M, N],
+            "a_permute": None,
+            "b_permute": [0, 2, 1],
+        }
+        bmm_crr_dict = {
+            "a_shape": [B, K, M],
+            "b_shape": [B, K, N],
+            "c_shape": [B, M, N],
+            "a_permute": [0, 2, 1],
+            "b_permute": None,
+        }
+        bmm_ccr_dict = {
+            "a_shape": [B, K, M],
+            "b_shape": [B, N, K],
+            "c_shape": [B, M, N],
+            "a_permute": [0, 2, 1],
+            "b_permute": [0, 2, 1],
+        }
+        bmm_rrr_dict = {
+            "a_shape": [B, M, K],
+            "b_shape": [B, K, N],
+            "c_shape": [B, M, N],
+            "a_permute": None,
+            "b_permute": None,
+        }
+        bmm_permutes = {
+            "bmm_rcr": bmm_rcr_dict,
+            "bmm_crr": bmm_crr_dict,
+            "bmm_ccr": bmm_ccr_dict,
+            "bmm_rrr": bmm_rrr_dict,
+        }
+        return bmm_permutes.get(bmm_op_name)
+
+    def _test_bmm_xxx_cat_fusion(
         self,
         B,
         M,
         Ns,
         Ks,
         cat_dim,
+        bmm_op_maker,
         test_name,
         expected_num_tensors,
         expected_num_ops,
@@ -1298,27 +1338,36 @@ def _test_bmm_rcr_cat_fusion(
         for i in range(n):
             N = Ns[i]
             K = Ks[i]
+            bmm_op = bmm_op_maker()
+            bmm_params = self._bmm_parameters(bmm_op._attrs["op"], B, M, N, K)
+            x_shape = bmm_params["a_shape"]
+            y_shape = bmm_params["b_shape"]
             X = Tensor(
-                shape=[B, M, K],
+                shape=x_shape,
                 dtype=dtype,
                 name=f"X{i}",
                 is_input=True,
             )
             Y = Tensor(
-                shape=[B, N, K],
+                shape=y_shape,
                 dtype=dtype,
                 name=f"Y{i}",
                 is_input=True,
             )
-            if N > 1:
-                C = ops.bmm_rcr()(X, Y)
-            else:
-                C = ops.bmm_rcr_n1()(X, Y)
+            C = bmm_op(X, Y)
             Cs.append(C)
 
-            x = get_random_torch_tensor([B, M, K], dtype)
-            y = get_random_torch_tensor([B, N, K], dtype)
-            c = torch.bmm(x, y.permute([0, 2, 1]))
+            x = get_random_torch_tensor(x_shape, dtype)
+            bmm_x = x
+            x_permute = bmm_params["a_permute"]
+            if x_permute is not None:
+                bmm_x = x.permute(x_permute)
+            y = get_random_torch_tensor(y_shape, dtype)
+            bmm_y = y
+            y_permute = bmm_params["b_permute"]
+            if y_permute is not None:
+                bmm_y = y.permute(y_permute)
+            c = torch.bmm(bmm_x, bmm_y)
             Xs_pt.append(x)
             Ys_pt.append(y)
             Cs_pt.append(c)
@@ -1347,142 +1396,111 @@ def _test_bmm_rcr_cat_fusion(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
     def test_bmm_rcr_cat_fusion(self):
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=8,
             Ns=[2, 2, 2],
             Ks=[4, 5, 32],
             cat_dim=2,
+            bmm_op_maker=ops.bmm_rcr,
             test_name="test_bmm_rcr_cat_1",
             expected_num_tensors=11,
             expected_num_ops=5,
         )
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=16,
             Ns=[1, 1, 1],
             Ks=[32, 16, 32],
+            bmm_op_maker=ops.bmm_rcr_n1,
             cat_dim=1,
             test_name="test_bmm_rcr_cat_2",
             expected_num_tensors=7,
             expected_num_ops=3,
         )
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=16,
             Ns=[1, 1, 1],
             Ks=[32, 16, 32],
+            bmm_op_maker=ops.bmm_rcr_n1,
             cat_dim=2,
             test_name="test_bmm_rcr_cat_3",
             expected_num_tensors=7,
             expected_num_ops=3,
         )
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=16,
             Ns=[1, 1, 1],
             Ks=[32, 16, 32],
+            bmm_op_maker=ops.bmm_rcr_n1,
             cat_dim=-1,
             test_name="test_bmm_rcr_cat_4",
             expected_num_tensors=7,
             expected_num_ops=3,
         )
 
-    def _test_bmm_crr_cat_fusion(
-        self,
-        B,
-        M,
-        Ns,
-        Ks,
-        cat_dim,
-        test_name,
-        expected_num_tensors,
-        expected_num_ops,
-        dtype="float16",
-    ):
-        n = len(Ns)
-        Cs = []
-
-        Xs_pt = []
-        Ys_pt = []
-        Cs_pt = []
-        for i in range(n):
-            N = Ns[i]
-            K = Ks[i]
-            X = Tensor(
-                shape=[B, K, M],
-                dtype=dtype,
-                name=f"X{i}",
-                is_input=True,
-            )
-            Y = Tensor(
-                shape=[B, K, N],
-                dtype=dtype,
-                name=f"Y{i}",
-                is_input=True,
-            )
-            C = ops.bmm_crr()(X, Y)
-            Cs.append(C)
-
-            x = get_random_torch_tensor([B, K, M], dtype)
-            y = get_random_torch_tensor([B, K, N], dtype)
-            c = torch.bmm(x.permute([0, 2, 1]), y)
-            Xs_pt.append(x)
-            Ys_pt.append(y)
-            Cs_pt.append(c)
-
-        Y = ops.concatenate()(Cs, dim=cat_dim)
-        Y._attrs["name"] = "output"
-        Y._attrs["is_output"] = True
-        y_pt = torch.cat(Cs_pt, dim=cat_dim)
-
-        # Gen module.
-        target = detect_target()
-        with compile_model(Y, target, "./tmp", test_name) as module:
-            input_name_to_index = module.get_input_name_to_index_map()
-            inputs = [0 for i in range(2 * n)]
-            for i in range(n):
-                inputs[input_name_to_index[f"X{i}"]] = Xs_pt[i]
-                inputs[input_name_to_index[f"Y{i}"]] = Ys_pt[i]
-            y = get_torch_empty_tensor(y_pt.size(), dtype)
-            module.run_with_tensors(inputs, [y])
-
-            sorted_graph = module.debug_sorted_graph
-            self.assertEqual(len(sorted_graph), expected_num_tensors)
-            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-            self.assertEqual(len(sorted_ops), expected_num_ops)
-
-            self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
-
     def test_bmm_crr_cat_fusion(self):
         # [B, K, M] x [B, K, N] = [B, M, N]
-        self._test_bmm_crr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=8,
             Ns=[2, 4, 10],
             Ks=[4, 5, 32],
+            bmm_op_maker=ops.bmm_crr,
             cat_dim=2,
             test_name="test_bmm_crr_cat_1",
             expected_num_tensors=7,
             expected_num_ops=3,
         )
-        self._test_bmm_crr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=8,
             M=16,
             Ns=[4, 4, 4],
             Ks=[3, 16, 9],
+            bmm_op_maker=ops.bmm_crr,
             cat_dim=1,
             test_name="test_bmm_crr_cat_2",
             expected_num_tensors=7,
             expected_num_ops=3,
         )
 
-    def _test_bmm_crr_add_cat_fusion(
+    def test_bmm_ccr_cat_fusion(self):
+        # [B, K, M] x [B, N, K] = [B, M, N]
+        self._test_bmm_xxx_cat_fusion(
+            B=1,
+            M=8,
+            Ns=[2, 4, 10],
+            Ks=[4, 8, 14],
+            bmm_op_maker=ops.bmm_ccr,
+            cat_dim=2,
+            test_name="test_bmm_ccr_cat_1",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+
+    def test_bmm_rrr_cat_fusion(self):
+        # [B, M, K] x [B, K, N] = [B, M, N]
+        self._test_bmm_xxx_cat_fusion(
+            B=1,
+            M=8,
+            Ns=[2, 4, 10],
+            Ks=[4, 8, 14],
+            bmm_op_maker=ops.bmm_rrr,
+            cat_dim=2,
+            test_name="test_bmm_rrr_cat_1",
+            expected_num_tensors=7,
+            expected_num_ops=3,
+        )
+
+    def _test_bmm_xxx_add_cat_fusion(
         self,
         B,
         M,
         Ns,
         Ks,
+        bmm_op_maker,
         cat_dim,
         test_name,
         expected_num_tensors,
@@ -1499,31 +1517,45 @@ def _test_bmm_crr_add_cat_fusion(
         for i in range(n):
             N = Ns[i]
             K = Ks[i]
+            bmm_op = bmm_op_maker()
+            bmm_params = self._bmm_parameters(bmm_op._attrs["op"], B, M, N, K)
+            x_shape = bmm_params["a_shape"]
+            y_shape = bmm_params["b_shape"]
+            d_shape = bmm_params["c_shape"]
             X = Tensor(
-                shape=[B, K, M],
+                shape=x_shape,
                 dtype=dtype,
                 name=f"X{i}",
                 is_input=True,
             )
             Y = Tensor(
-                shape=[B, K, N],
+                shape=y_shape,
                 dtype=dtype,
                 name=f"Y{i}",
                 is_input=True,
             )
             D = Tensor(
-                shape=[B, M, N],
+                shape=d_shape,
                 dtype=dtype,
                 name=f"D{i}",
                 is_input=True,
             )
-            C = ops.bmm_crr_add()(X, Y, D)
+            C = bmm_op(X, Y, D)
             Cs.append(C)
 
-            x = get_random_torch_tensor([B, K, M], dtype)
-            y = get_random_torch_tensor([B, K, N], dtype)
-            d = get_random_torch_tensor([B, M, N], dtype)
-            c = torch.bmm(x.permute([0, 2, 1]), y)
+            x = get_random_torch_tensor(x_shape, dtype)
+            y = get_random_torch_tensor(y_shape, dtype)
+            d = get_random_torch_tensor(d_shape, dtype)
+            bmm_x = x
+            x_permute = bmm_params["a_permute"]
+            if x_permute is not None:
+                bmm_x = x.permute(x_permute)
+            y = get_random_torch_tensor(y_shape, dtype)
+            bmm_y = y
+            y_permute = bmm_params["b_permute"]
+            if y_permute is not None:
+                bmm_y = y.permute(y_permute)
+            c = torch.bmm(bmm_x, bmm_y)
             c = c + d
             Xs_pt.append(x)
             Ys_pt.append(y)
@@ -1555,21 +1587,23 @@ def _test_bmm_crr_add_cat_fusion(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
     def test_bmm_crr_add_cat_fusion(self):
-        self._test_bmm_crr_add_cat_fusion(
+        self._test_bmm_xxx_add_cat_fusion(
             B=7,
             M=10,
             Ns=[2, 12, 8],
             Ks=[4, 5, 6],
+            bmm_op_maker=ops.bmm_crr_add,
             cat_dim=2,
             test_name="test_bmm_crr_add_cat_1",
             expected_num_tensors=10,
             expected_num_ops=3,
         )
-        self._test_bmm_crr_add_cat_fusion(
+        self._test_bmm_xxx_add_cat_fusion(
             B=8,
             M=4,
             Ns=[10, 10, 10],
             Ks=[4, 5, 6],
+            bmm_op_maker=ops.bmm_crr_add,
             cat_dim=1,
             test_name="test_bmm_crr_add_cat_2",
             expected_num_tensors=10,
@@ -1582,44 +1616,48 @@ def test_bmm_crr_add_cat_fusion(self):
         "Not supported by CUDA < SM80.",
     )
     def test_bmm_cat_fusion_float(self):
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=8,
             Ns=[2, 2, 2],
             Ks=[4, 5, 32],
+            bmm_op_maker=ops.bmm_rcr,
             cat_dim=2,
             test_name="test_bmm_rcr_cat_float_1",
             expected_num_tensors=7,
             expected_num_ops=3,
             dtype="float",
         )
-        self._test_bmm_rcr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=1,
             M=16,
             Ns=[1, 1, 1],
             Ks=[32, 16, 32],
+            bmm_op_maker=ops.bmm_rcr_n1,
             cat_dim=2,
             test_name="test_bmm_rcr_cat_float_3",
             expected_num_tensors=7,
             expected_num_ops=3,
             dtype="float",
         )
-        self._test_bmm_crr_cat_fusion(
+        self._test_bmm_xxx_cat_fusion(
             B=8,
             M=16,
             Ns=[4, 4, 4],
             Ks=[3, 16, 9],
+            bmm_op_maker=ops.bmm_crr,
             cat_dim=1,
             test_name="test_bmm_crr_cat_float_2",
             expected_num_tensors=7,
             expected_num_ops=3,
             dtype="float",
         )
-        self._test_bmm_crr_add_cat_fusion(
+        self._test_bmm_xxx_add_cat_fusion(
             B=7,
             M=10,
             Ns=[2, 12, 8],
             Ks=[4, 5, 6],
+            bmm_op_maker=ops.bmm_crr_add,
             cat_dim=2,
             test_name="test_bmm_crr_add_cat_float_1",
             expected_num_tensors=10,
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index 52a79fd0b..ccf6ab607 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -41,7 +41,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2, 2],
                 cat_dim=2,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
             ),
             param(
@@ -49,7 +49,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2, 1, 2],
                 cat_dim=3,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
             ),
             param(
@@ -57,7 +57,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=4,
                 new_shape=[-1, 4, 4, 1],
                 cat_dim=2,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
             ),
             param(
@@ -65,7 +65,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2, 2, 1],
                 cat_dim=2,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
             ),
             param(
@@ -73,7 +73,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=4,
                 new_shape=[-1, 4, 2, 2],
                 cat_dim=2,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
             ),
             param(
@@ -105,7 +105,7 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 n=2,
                 new_shape=[-1, 2, 1, 2],
                 cat_dim=3,
-                expected_num_tensors=11,
+                expected_num_tensors=10,
                 expected_num_ops=9,
                 dtype="float",
             ),
@@ -157,9 +157,6 @@ def test_strided_gemm_view_cat_fusible(
         X2 = ops.gemm_rcr_bias_add()(input0, input1, input3, input4)
         X3 = ops.gemm_rcr_bias_add_add()(input0, input1, input3, input4, input4)
         X4 = ops.bmm_rcr()(input0, input2)
-
-        # For now these ops do not support output_accessors yet.
-        # TODO: enable these checks once these ops support output_accessors.
         X5 = ops.bmm_rrr_add()(input0, input2, input3)
 
         # [m, b, k] x [b, n, k] -> [m, b, n] b = n, k = n

From b677e71c9a60c20abf2412b789d3aeac7cb95840 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sat, 4 Mar 2023 13:48:37 -0800
Subject: [PATCH 215/638] fixed a couple of linter errors (#362)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/362

Reviewed By: alexanderguzhva

Differential Revision: D43813375

Pulled By: chenyang78

fbshipit-source-id: d9c65bf2b15e6362343b6d4e77a510853fad5613
---
 python/aitemplate/backend/cuda/tensor/expand.py | 1 +
 python/aitemplate/compiler/ops/tensor/expand.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/tensor/expand.py b/python/aitemplate/backend/cuda/tensor/expand.py
index 45c4d7f08..be535af9b 100644
--- a/python/aitemplate/backend/cuda/tensor/expand.py
+++ b/python/aitemplate/backend/cuda/tensor/expand.py
@@ -17,6 +17,7 @@
 from typing import Any, Dict
 
 import jinja2
+
 from aitemplate.backend import registry
 
 from aitemplate.backend.backend_spec import CUDASpec
diff --git a/python/aitemplate/compiler/ops/tensor/expand.py b/python/aitemplate/compiler/ops/tensor/expand.py
index 0dda00166..bb77919c4 100644
--- a/python/aitemplate/compiler/ops/tensor/expand.py
+++ b/python/aitemplate/compiler/ops/tensor/expand.py
@@ -20,7 +20,7 @@
 from aitemplate.backend.target import Target
 
 from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
-from aitemplate.utils.shape_utils import convert_shape_to_IntVar, gen_int_var
+from aitemplate.utils.shape_utils import convert_shape_to_IntVar
 
 
 def _normalize_dim(dim: IntVar) -> IntVar:

From da1cd1e21005efbed6ee8c542fe3b6760625f2c4 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Mon, 6 Mar 2023 00:06:46 -0800
Subject: [PATCH 216/638] enabled dynamic h/w for conv (#363)

Summary:
This PR enabled dynamic h/w for conv2d and d/h/w for conv3d. The profiling strategy is not optimal as we only profile with the max d/h/w values. We will implement some better strategy (e.g. bucketing) later.

We also removed duplicate codes in conv3d_bias.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/363

Reviewed By: terrychenism

Differential Revision: D43821796

Pulled By: chenyang78

fbshipit-source-id: 8f91b9193becf1727b704573a9bdca5a036d8b8d
---
 python/aitemplate/compiler/ops/conv/conv2d.py |  78 +-
 python/aitemplate/compiler/ops/conv/conv3d.py | 112 ++-
 .../compiler/ops/conv/conv3d_bias.py          | 680 +-----------------
 tests/unittest/ops/test_dynamic_conv.py       | 159 +++-
 4 files changed, 316 insertions(+), 713 deletions(-)

diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 549cbc454..5cfd441f3 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -83,7 +83,10 @@
 
 EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
     """
-NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && HI == {{x_dim1}} && WI == {{x_dim2}} && CI == {{x_dim3}}
+NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} &&
+ HI >= {{x_dim1_lb}} && HI <= {{x_dim1_ub}} &&
+ WI >= {{x_dim2_lb}} && WI <= {{x_dim2_ub}} &&
+ CI == {{x_dim3}}
 """
 )
 
@@ -244,13 +247,29 @@ def _gen_exec_key(self, shape: List[int]):
             x_dim0=shape[0], x_dim1=shape[1], x_dim2=shape[2], x_dim3=shape[3]
         ).replace("\n", "")
 
-    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3):
+    def _gen_dyn_exec_key(
+        self, dim0_lb, dim0_ub, dim1_lb, dim1_ub, dim2_lb, dim2_ub, dim3
+    ):
         return self.exec_dyn_key_template.render(
-            x_dim0_lb=dim0_lb, x_dim0_ub=dim0_ub, x_dim1=dim1, x_dim2=dim2, x_dim3=dim3
+            x_dim0_lb=dim0_lb,
+            x_dim0_ub=dim0_ub,
+            x_dim1_lb=dim1_lb,
+            x_dim1_ub=dim1_ub,
+            x_dim2_lb=dim2_lb,
+            x_dim2_ub=dim2_ub,
+            x_dim3=dim3,
         ).replace("\n", "")
 
     def _extract_exec_path(self, x: Tensor):
         x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        # FIXME: we take the max height and weight for profiling at the moment.
+        # Let's figure out a better profiling strategy later.
+        # The following attribute is temporarily used to hold the lower bounds of
+        # all dimensions. We will remove them later once we have a better profiling
+        # strategy.
+        self._attrs["dim_lower_bounds"] = [min(vals) for vals in x_shape_values]
+        x_shape_values = [x_shape_values[0]] + [[max(vs)] for vs in x_shape_values[1:]]
+
         x_shapes = itertools.product(*x_shape_values)
         self._attrs["exec_path"] = OrderedDict()
         for x_shape in x_shapes:
@@ -541,10 +560,6 @@ def profile(
             devices = [0]
         self._profile_static(workdir, devices)
 
-        target = backend.target.Target.current()
-        if target.use_dummy_profiling_results():
-            return
-
         if self._has_dynamic_input_dims():
             if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
                 raise NotImplementedError(
@@ -592,17 +607,8 @@ def _profile_static(self, workdir, devices):
     def _profile_dynamic_dim(self, workdir):
         """Profiles with dynamic shapes."""
 
-        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
-        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
         # extract dynamic dim from exec_path
-        if len(self._attrs["exec_path"]) <= 1:
-            return
-        if len(set(self._attrs["exec_path"].values())) <= 1:
-            # all exec paths point to the same algo
-            return
-
         def _extract_dynamic_dim(exec_keys):
-            _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
             var_dims = [[], [], [], []]
             for key in exec_keys:
                 dims = self._invert_exec_key(key)
@@ -610,11 +616,41 @@ def _extract_dynamic_dim(exec_keys):
                     var_dims[i].append(v)
             return var_dims
 
+        dim_lbs = self._attrs["dim_lower_bounds"]
         dims = _extract_dynamic_dim(self._attrs["exec_path"].keys())
-        dim1 = dims[1][0]
-        dim2 = dims[2][0]
+        dim0_lb = dim_lbs[0]
+        dim1_lb = dim_lbs[1]
+        dim2_lb = dim_lbs[2]
+        # dims' upper bounds are the same except the batch dimension
+        dim1_ub = dims[1][0]
+        dim2_ub = dims[2][0]
         dim3 = dims[3][0]
+
+        num_exec_path = len(self._attrs["exec_path"])
+        if num_exec_path < 1:
+            return
         algos = list(self._attrs["exec_path"].values())
+        if num_exec_path == 1 or len(set(algos)) <= 1:
+            # all exec paths point to the same algo
+            new_exec_paths = OrderedDict()
+            # Because we have a single algo, it's safe to just take the upper
+            # bound of dim0 (i.e. batch dim) values.
+            dim0_ub = max(dims[0])
+            # we need to generate new exec paths that ensure the ranges of
+            # likely dynamic heights and weights
+            new_key = self._gen_dyn_exec_key(
+                dim0_lb, dim0_ub, dim1_lb, dim1_ub, dim2_lb, dim2_ub, dim3
+            )
+            new_exec_paths[new_key] = algos[0]
+            self._attrs["exec_path"] = new_exec_paths
+            return
+
+        target = backend.target.Target.current()
+        if target.use_dummy_profiling_results():
+            return
+
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
         # generate region
         regions = []  # lb, ub, lb_algos, ub_algos
         for i in range(len(dims[0]) - 1):
@@ -631,7 +667,7 @@ def _extract_dynamic_dim(exec_keys):
             last_mid = mid
             while mid > lb and mid < ub:
                 mid = (lb + ub) // 2
-                mid_shape = [mid, dim1, dim2, dim3]
+                mid_shape = [mid, dim1_ub, dim2_ub, dim3]
                 _LOGGER.info(
                     "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
                         lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
@@ -674,10 +710,10 @@ def _extract_dynamic_dim(exec_keys):
                 last_mid = mid
                 mid = (lb + ub) // 2
             lo_region_key = self._gen_dyn_exec_key(
-                origin_lb, last_mid, dim1, dim2, dim3
+                origin_lb, last_mid, dim1_lb, dim1_ub, dim2_lb, dim2_ub, dim3
             )
             up_region_key = self._gen_dyn_exec_key(
-                last_mid, origin_ub, dim1, dim2, dim3
+                last_mid, origin_ub, dim1_lb, dim1_ub, dim2_lb, dim2_ub, dim3
             )
             new_exec_paths[lo_region_key] = lb_algo
             new_exec_paths[up_region_key] = ub_algo
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index fc7e7159b..13be89072 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -92,7 +92,11 @@
 
 EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
     """
-NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
+NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} &&
+ DI >= {{x_dim1_lb}} && DI <= {{x_dim1_ub}} &&
+ HI >= {{x_dim2_lb}} && HI <= {{x_dim2_ub}} &&
+ WI >= {{x_dim3_lb}} && WI <= {{x_dim3_ub}} &&
+ CI == {{x_dim4}}
 """
 )
 
@@ -221,18 +225,40 @@ def _gen_exec_key(self, shape: List[int]):
             x_dim4=shape[4],
         ).replace("\n", "")
 
-    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3, dim4):
+    def _gen_dyn_exec_key(
+        self,
+        dim0_lb,
+        dim0_ub,
+        dim1_lb,
+        dim1_ub,
+        dim2_lb,
+        dim2_ub,
+        dim3_lb,
+        dim3_ub,
+        dim4,
+    ):
         return self.exec_dyn_key_template.render(
             x_dim0_lb=dim0_lb,
             x_dim0_ub=dim0_ub,
-            x_dim1=dim1,
-            x_dim2=dim2,
-            x_dim3=dim3,
+            x_dim1_lb=dim1_lb,
+            x_dim1_ub=dim1_ub,
+            x_dim2_lb=dim2_lb,
+            x_dim2_ub=dim2_ub,
+            x_dim3_lb=dim3_lb,
+            x_dim3_ub=dim3_ub,
             x_dim4=dim4,
         ).replace("\n", "")
 
     def _extract_exec_path(self, x: Tensor):
         x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        # FIXME: we take the max height and weight for profiling at the moment.
+        # Let's figure out a better profiling strategy later.
+        # The following attribute is temporarily used to hold the lower bounds of
+        # all dimensions. We will remove them later once we have a better profiling
+        # strategy.
+        self._attrs["dim_lower_bounds"] = [min(vals) for vals in x_shape_values]
+        x_shape_values = [x_shape_values[0]] + [[max(vs)] for vs in x_shape_values[1:]]
+
         x_shapes = itertools.product(*x_shape_values)
         self._attrs["exec_path"] = OrderedDict()
         for x_shape in x_shapes:
@@ -603,15 +629,7 @@ def _profile_static(self, workdir, devices):
     def _profile_dynamic_dim(self, workdir):
         """Profiles with dynamic shapes."""
 
-        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
-        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
         # extract dynamic dim from exec_path
-        if len(self._attrs["exec_path"]) <= 1:
-            return
-        if len(set(self._attrs["exec_path"].values())) <= 1:
-            # all exec paths point to the same algo
-            return
-
         def _extract_dynamic_dim(exec_keys):
             _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
             var_dims = [[], [], [], [], []]
@@ -621,12 +639,52 @@ def _extract_dynamic_dim(exec_keys):
                     var_dims[i].append(v)
             return var_dims
 
+        dim_lbs = self._attrs["dim_lower_bounds"]
         dims = _extract_dynamic_dim(self._attrs["exec_path"].keys())
-        dim1 = dims[1][0]
-        dim2 = dims[2][0]
-        dim3 = dims[3][0]
+        dim0_lb = dim_lbs[0]
+        dim1_lb = dim_lbs[1]
+        dim2_lb = dim_lbs[2]
+        dim3_lb = dim_lbs[3]
+        # dims' upper bounds are the same except the batch dimension
+        dim1_ub = dims[1][0]
+        dim2_ub = dims[2][0]
+        dim3_ub = dims[3][0]
         dim4 = dims[4][0]
+
+        num_exec_path = len(self._attrs["exec_path"])
+        if num_exec_path < 1:
+            return
+
         algos = list(self._attrs["exec_path"].values())
+        if num_exec_path == 1 or len(set(algos)) <= 1:
+            # all exec paths point to the same algo
+            new_exec_paths = OrderedDict()
+            # Because we have a single algo, it's safe to just take the upper
+            # bound of dim0 (i.e. batch dim) values.
+            dim0_ub = max(dims[0])
+            # We need to generate new exec paths that ensure the ranges of
+            # likely dynamic heights and weights.
+            new_key = self._gen_dyn_exec_key(
+                dim0_lb,
+                dim0_ub,
+                dim1_lb,
+                dim1_ub,
+                dim2_lb,
+                dim2_ub,
+                dim3_lb,
+                dim3_ub,
+                dim4,
+            )
+            new_exec_paths[new_key] = algos[0]
+            self._attrs["exec_path"] = new_exec_paths
+            return
+
+        target = backend.target.Target.current()
+        if target.use_dummy_profiling_results():
+            return
+
+        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
+        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
         # generate region
         regions = []  # lb, ub, lb_algos, ub_algos
         for i in range(len(dims[0]) - 1):
@@ -643,7 +701,7 @@ def _extract_dynamic_dim(exec_keys):
             last_mid = mid
             while mid > lb and mid < ub:
                 mid = (lb + ub) // 2
-                mid_shape = [mid, dim1, dim2, dim3, dim4]
+                mid_shape = [mid, dim1_ub, dim2_ub, dim3_ub, dim4]
                 _LOGGER.info(
                     "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
                         lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
@@ -686,10 +744,26 @@ def _extract_dynamic_dim(exec_keys):
                 last_mid = mid
                 mid = (lb + ub) // 2
             lo_region_key = self._gen_dyn_exec_key(
-                origin_lb, last_mid, dim1, dim2, dim3, dim4
+                origin_lb,
+                last_mid,
+                dim1_lb,
+                dim1_ub,
+                dim2_lb,
+                dim2_ub,
+                dim3_lb,
+                dim3_ub,
+                dim4,
             )
             up_region_key = self._gen_dyn_exec_key(
-                last_mid, origin_ub, dim1, dim2, dim3, dim4
+                last_mid,
+                origin_ub,
+                dim1_lb,
+                dim1_ub,
+                dim2_lb,
+                dim2_ub,
+                dim3_lb,
+                dim3_ub,
+                dim4,
             )
             new_exec_paths[lo_region_key] = lb_algo
             new_exec_paths[up_region_key] = ub_algo
diff --git a/python/aitemplate/compiler/ops/conv/conv3d_bias.py b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
index 2fb0450b8..57623840a 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
@@ -14,98 +14,15 @@
 #
 
 """
-Base class for conv3d.
+Conv3d with bias.
 """
-import itertools
-import logging
-import os
-import re
-from collections import OrderedDict
-from hashlib import sha1
-from operator import itemgetter
-from typing import Any, Dict, List
+from typing import List
 
-import jinja2
+from ...base import Tensor
+from .conv3d import conv3d
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
-from ....utils import alignment, environ, shape_utils
-from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
-from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
-from .conv_common import (
-    filter_op_instances,
-    generate_profiler_sources,
-    get_profiler_filename,
-)
 
-# pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
-
-
-_LOGGER = logging.getLogger(__name__)
-
-SHAPE_FUNC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{dtype}}NI = {{x_dim0}};
-{{indent}}{{dtype}}DI = {{x_dim1}};
-{{indent}}{{dtype}}HI = {{x_dim2}};
-{{indent}}{{dtype}}WI = {{x_dim3}};
-{{indent}}{{dtype}}CI = {{x_dim4}};
-{{indent}}{{dtype}}CO = {{w_dim0}};
-{{indent}}{{dtype}}KD = {{w_dim1}};
-{{indent}}{{dtype}}KH = {{w_dim2}};
-{{indent}}{{dtype}}KW = {{w_dim3}};
-{{indent}}{{dtype}}SD = {{stride_d}};
-{{indent}}{{dtype}}SH = {{stride_h}};
-{{indent}}{{dtype}}SW = {{stride_w}};
-{{indent}}{{dtype}}DD = {{dilate_d}};
-{{indent}}{{dtype}}DH = {{dilate_h}};
-{{indent}}{{dtype}}DW = {{dilate_w}};
-{{indent}}{{dtype}}PD = {{pad_d}};
-{{indent}}{{dtype}}PH = {{pad_h}};
-{{indent}}{{dtype}}PW = {{pad_w}};
-{{indent}}{{dtype}}KDEff = (KD - 1) * DD + 1;
-{{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
-{{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
-{{indent}}{{dtype}}NO = NI;
-{{indent}}{{dtype}}DO = (DI + PD + PD - KDEff) {{div}} SD + 1;
-{{indent}}{{dtype}}HO = (HI + PH + PH - KHEff) {{div}} SH + 1;
-{{indent}}{{dtype}}WO = (WI + PW + PW - KWEff) {{div}} SW + 1;
-"""
-)
-
-SHAPE_ASSIGNMENT_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{y_dim0}} = NO;
-{{indent}}{{y_dim1}} = DO;
-{{indent}}{{y_dim2}} = HO;
-{{indent}}{{y_dim3}} = WO;
-{{indent}}{{y_dim4}} = CO;
-"""
-)
-
-EXEC_KEY_TEMPLATE = jinja2.Template(
-    """
-NI == {{x_dim0}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
-"""
-)
-
-EXEC_DYN_KEY_TEMPLATE = jinja2.Template(
-    """
-NI >= {{x_dim0_lb}} && NI <= {{x_dim0_ub}} && DI == {{x_dim1}} && HI == {{x_dim2}} && WI == {{x_dim3}} && CI == {{x_dim4}}
-"""
-)
-
-EXEC_COND_TEMPLATE = jinja2.Template(
-    """
-{{indent}}if ({{cond}}) {
-{{indent}}  {{program}}
-{{indent}}}
-"""
-)
-
-
-class conv3d_bias(Operator):
+class conv3d_bias(conv3d):
     r"""conv3d_bias"""
 
     def __init__(self, stride, pad, dilate=1, group=1) -> None:
@@ -123,148 +40,11 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
            Number of blocked connections from input
             channels to output channels, by default 1
         """
-        super().__init__()
+        super().__init__(stride, pad, dilate=dilate, group=group)
         self._attrs["op"] = "conv3d_bias"
-        self._attrs["stride"] = stride
-        if isinstance(stride, int):
-            self._attrs["stride"] = (stride, stride, stride)
-        self._attrs["pad"] = pad
-        if isinstance(pad, int):
-            self._attrs["pad"] = (pad, pad, pad)
-        self._attrs["dilate"] = dilate
-        if isinstance(dilate, int):
-            self._attrs["dilate"] = (dilate, dilate, dilate)
-        self._attrs["group"] = group
-        self._attrs["has_profiler"] = True
-        self._attrs["epilogue_alignment"] = 1
-        self._attrs["epilogue"] = "LinearCombination"
-        self._attrs["workspace"] = 0
-        self._attrs["split_k"] = None
-        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
-        self.shape_save_template = SHAPE_ASSIGNMENT_TEMPLATE
-        self.exec_key_template = EXEC_KEY_TEMPLATE
-        self.exec_dyn_key_template = EXEC_DYN_KEY_TEMPLATE
-        self.exec_cond_template = EXEC_COND_TEMPLATE
-
-    def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
-        if x[4] != w[4] * self._attrs["group"]:
-            raise RuntimeError("X/W Shape mismatch for conv3d")
-        eval_func = self.shape_eval_template.render(
-            indent="",
-            dtype="",
-            div="//",
-            stride_d=self._attrs["stride"][0],
-            stride_h=self._attrs["stride"][1],
-            stride_w=self._attrs["stride"][2],
-            pad_d=self._attrs["pad"][0],
-            pad_h=self._attrs["pad"][1],
-            pad_w=self._attrs["pad"][2],
-            dilate_d=self._attrs["dilate"][0],
-            dilate_h=self._attrs["dilate"][1],
-            dilate_w=self._attrs["dilate"][2],
-            x_dim0=x[0],
-            x_dim1=x[1],
-            x_dim2=x[2],
-            x_dim3=x[3],
-            x_dim4=x[4],
-            w_dim0=w[0],
-            w_dim1=w[1],
-            w_dim2=w[2],
-            w_dim3=w[3],
-        )
-        output = {}
-        exec(eval_func, output)  # noqa: P204
-        return [
-            int(output["NO"]),
-            int(output["DO"]),
-            int(output["HO"]),
-            int(output["WO"]),
-            int(output["CO"]),
-        ]
-
-    def _infer_shapes(self, x: Tensor, w: Tensor) -> List[int]:
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        w_shape = [var._attrs["values"][0] for var in w._attrs["shape"]]
-        self._attrs["CO"] = w_shape[0]
-        self._attrs["KD"] = w_shape[1]
-        self._attrs["KH"] = w_shape[2]
-        self._attrs["KW"] = w_shape[3]
-        # run infershape for each
-        y_shapes = []
-        for x_shape in x_shapes:
-            y_shape = self._infer_shape(x_shape, w_shape)
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
-        output_shape = [
-            x._attrs["shape"][0],
-            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[4] for d in y_shapes])),
-        ]
-        return output_shape
-
-    def _invert_exec_key(self, key):
-        tmp = re.findall(r"(\d+)", key)
-        return [int(x) for x in tmp]
-
-    def _gen_exec_key(self, shape: List[int]):
-        return self.exec_key_template.render(
-            x_dim0=shape[0],
-            x_dim1=shape[1],
-            x_dim2=shape[2],
-            x_dim3=shape[3],
-            x_dim4=shape[4],
-        ).replace("\n", "")
-
-    def _gen_dyn_exec_key(self, dim0_lb, dim0_ub, dim1, dim2, dim3, dim4):
-        return self.exec_dyn_key_template.render(
-            x_dim0_lb=dim0_lb,
-            x_dim0_ub=dim0_ub,
-            x_dim1=dim1,
-            x_dim2=dim2,
-            x_dim3=dim3,
-            x_dim4=dim4,
-        ).replace("\n", "")
-
-    def _extract_exec_path(self, x: Tensor):
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        self._attrs["exec_path"] = OrderedDict()
-        for x_shape in x_shapes:
-            key = self._gen_exec_key(x_shape)
-            self._attrs["exec_path"][key] = ""
-
-    def _signature(self):
-        signature = "conv3d: K=[{kd}, {kh}, {kw}], S=[{sd}, {sh}, {sw}], P=[{pd}, {ph}, {pw}], CO=[{co}]".format(
-            kd=self._attrs["KD"],
-            kh=self._attrs["KH"],
-            kw=self._attrs["KW"],
-            sd=self._attrs["stride"][0],
-            sh=self._attrs["stride"][1],
-            sw=self._attrs["stride"][2],
-            pd=self._attrs["pad"][0],
-            ph=self._attrs["pad"][1],
-            pw=self._attrs["pad"][2],
-            co=self._attrs["CO"],
-        )
-        return signature
-
-    def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
-        epilogue_dim = output_shape[-1]
-        if not isinstance(epilogue_dim, IntImm):
-            raise RuntimeError("Conv output last dimension must be static!")
-        self._attrs["epilogue_alignment"] = alignment.find_max_alignment(
-            number=epilogue_dim._attrs["values"][0],
-            dtype=self._attrs["inputs"][0]._attrs["dtype"],
-        )
 
     def __call__(self, x: Tensor, w: Tensor, b: Tensor) -> List[Tensor]:
-        """Call conv3d with tensors x, w
+        """Call conv3d_bias with tensors x, w, b
 
         Parameters
         ----------
@@ -272,6 +52,8 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor) -> List[Tensor]:
             in shape (N, D, H, W, C_in)
         w : Tensor
             in shape (C_out, K_d, K_h, K_w, C_in)
+        b : Tensor
+            in shape (C_out)
 
         Returns
         -------
@@ -286,447 +68,3 @@ def __call__(self, x: Tensor, w: Tensor, b: Tensor) -> List[Tensor]:
         output = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [output]
         return output
-
-    def _get_op_attributes(self) -> Dict[str, Any]:
-        target_attrs = ["dilate", "group", "pad", "stride"]
-        attr = {}
-
-        for target_attr in target_attrs:
-            if target_attr in self._attrs:
-                attr[target_attr] = self._attrs[target_attr]
-
-        return attr
-
-    def _should_build_profiler(self) -> bool:
-        """
-        Check if we should build profilers. If we have a cached
-        entry for this gemm instance, we update this gemm op's
-        relevant attributes with the cached result and return False.
-        """
-        force_cache = environ.force_profiler_cache()
-        if self._has_dynamic_input_dims():
-            if force_cache:
-                raise RuntimeError(
-                    "We cannot force to use the cache as dynamic dims require "
-                    "us to generate and build the profilers"
-                )
-            # If there are dynamic dims, we'll have to generate and build the
-            # profilers, as the binaries will be needed for dynamic profiling.
-            return True
-        # We are forced to use the cache so we skip building profilers.
-        if force_cache:
-            return False
-
-        target = backend.target.Target.current()
-        workloads = list(self._attrs["exec_path"].keys())
-
-        build_profiler = True
-        # Now, let's query if all of our workloads have cache entries. If that
-        # is the case, it is safely to skip generating and building profilers.
-        if not target.use_dummy_profiling_results():
-            tmp_key = next(iter(self._attrs["op_instance"].keys()))
-            tmp_op = self._attrs["op_instance"][tmp_key]
-            build_profiler = False
-            for wkl in workloads:
-                exec_entry_sha1 = sha1(wkl.encode("utf-8")).hexdigest()
-                split_k = (
-                    1 if self._attrs["split_k"] is None else self._attrs["split_k"]
-                )
-                query = Conv3dQueryEntry(
-                    dtype_a=tmp_op.A.element.value,
-                    dtype_b=tmp_op.B.element.value,
-                    dtype_c=tmp_op.C.element.value,
-                    dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
-                    major_a=tmp_op.A.layout.value,
-                    major_b=tmp_op.B.layout.value,
-                    major_c=tmp_op.C.layout.value,
-                    kd=self._attrs["KD"],
-                    kh=self._attrs["KH"],
-                    kw=self._attrs["KW"],
-                    co=self._attrs["CO"],
-                    stride_d=self._attrs["stride"][0],
-                    stride_h=self._attrs["stride"][1],
-                    stride_w=self._attrs["stride"][2],
-                    pad_d=self._attrs["pad"][0],
-                    pad_h=self._attrs["pad"][1],
-                    pad_w=self._attrs["pad"][2],
-                    dilate_d=self._attrs["dilate"][0],
-                    dilate_h=self._attrs["dilate"][1],
-                    dilate_w=self._attrs["dilate"][2],
-                    op_type=self._attrs["op"],
-                    device=target._arch,
-                    epilogue=tmp_op.epilogue_functor.value,
-                    split_k=split_k,
-                    exec_entry_sha1=exec_entry_sha1,
-                )
-                cache_value = target.query_profile_cache("conv3d", query.__dict__)
-                if cache_value is not None and not target.force_profile():
-                    _LOGGER.info(
-                        f'Load profiling result for {self._attrs["name"]} '
-                        f"from cache: {cache_value}",
-                    )
-                    best_algo, workspace = cache_value
-                    self._attrs["exec_path"][wkl] = best_algo
-                    self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
-                else:
-                    # cache miss - we will have to generate and build profilers
-                    build_profiler = True
-        return build_profiler
-
-    def gen_profiler(
-        self,
-        workdir: str = None,
-        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
-    ) -> None:
-        """Profiler generator.
-
-        Parameters
-        ----------
-        workdir : str, optional, by default None
-        dynamic_profiling_strategy: DynamicProfileStrategy, optional
-            A dynamic profiling strategy, used to filter generated profiles at compile time.
-            See also: :func:`~aitemplate.compiler.transform.profile.profile`
-        """
-        target = backend.target.Target.current()
-
-        func_key = "{target}.{op}.config".format(
-            target=target.name(), op=self._attrs["op"]
-        )
-        func = registry.get(func_key)
-        func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
-
-        if self._should_build_profiler():
-            x_shapes = [
-                self._invert_exec_key(exec_key) for exec_key in self._attrs["exec_path"]
-            ]
-            self._attrs["op_instance"] = filter_op_instances(
-                func_attrs=self._attrs,
-                x_shapes=x_shapes,
-            )
-            return generate_profiler_sources(
-                func_attrs=self._attrs,
-                op_class="conv3d",
-                workdir=workdir,
-                shape_template=self.shape_eval_template,
-            )
-
-    def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
-        exe_path = os.path.join(profiler_prefix, cfg)
-        if not os.access(exe_path, os.X_OK):
-            raise RuntimeError("Profiler %s is not executable" % exe_path)
-        cmd = [exe_path]
-        cmd.append(x_shape[0])
-        cmd.append(x_shape[1])
-        cmd.append(x_shape[2])
-        cmd.append(x_shape[3])
-        cmd.append(x_shape[4])
-        cmd.append(self._attrs["KD"])
-        cmd.append(self._attrs["KH"])
-        cmd.append(self._attrs["KW"])
-        cmd.append(self._attrs["CO"])
-        cmd.append(self._attrs["stride"][0])
-        cmd.append(self._attrs["stride"][1])
-        cmd.append(self._attrs["stride"][2])
-        cmd.append(self._attrs["pad"][0])
-        cmd.append(self._attrs["pad"][1])
-        cmd.append(self._attrs["pad"][2])
-        cmd.append(self._attrs["dilate"][0])
-        cmd.append(self._attrs["dilate"][1])
-        cmd.append(self._attrs["dilate"][2])
-        cmd.append(self._attrs["group"])
-        command = [str(x) for x in cmd]
-        return command
-
-    def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cache):
-        target = backend.target.Target.current()
-        # query cache
-        tmp_key = next(iter(self._attrs["op_instance"].keys()))
-        tmp_op = self._attrs["op_instance"][tmp_key]
-        exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
-        split_k = 1 if self._attrs["split_k"] is None else self._attrs["split_k"]
-        query = Conv3dQueryEntry(
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
-            major_a=tmp_op.A.layout.value,
-            major_b=tmp_op.B.layout.value,
-            major_c=tmp_op.C.layout.value,
-            kd=self._attrs["KD"],
-            kh=self._attrs["KH"],
-            kw=self._attrs["KW"],
-            co=self._attrs["CO"],
-            stride_d=self._attrs["stride"][0],
-            stride_h=self._attrs["stride"][1],
-            stride_w=self._attrs["stride"][2],
-            pad_d=self._attrs["pad"][0],
-            pad_h=self._attrs["pad"][1],
-            pad_w=self._attrs["pad"][2],
-            dilate_d=self._attrs["dilate"][0],
-            dilate_h=self._attrs["dilate"][1],
-            dilate_w=self._attrs["dilate"][2],
-            op_type=self._attrs["op"],
-            device=target._arch,
-            epilogue=tmp_op.epilogue_functor.value,
-            split_k=split_k,
-            exec_entry_sha1=exec_entry_sha1,
-        )
-        cache_value = target.query_profile_cache("conv3d", query.__dict__)
-        if cache_value is not None and not target.force_profile():
-            _LOGGER.info("Load profiling result from cache.")
-            return cache_value
-        if cache_value is None and force_cache:
-            op_type = self._attrs["op"]
-            raise RuntimeError(
-                "force_cache is enabled but we could not find the following cache ",
-                f"available on device {target._arch=}, {op_type=}, {exec_entry_sha1=}",
-            )
-        if target.use_dummy_profiling_results():
-            op_type = self._attrs["op"]
-            raise Exception(
-                "This is a CI run but we could not find the following cache ",
-                f"available on device {target._arch}\n",
-                f"{op_type} {exec_entry_sha1}.\n",
-                "To bypass, you need to make it available in the db table.",
-            )
-
-        profiler_filename = get_profiler_filename(self._attrs, "conv3d")
-        runner = backend.profiler_runner.Runner(devices, self._attrs["name"])
-        x_shape = self._invert_exec_key(exec_key)
-        command = self._gen_profile_cmd(profiler_prefix, profiler_filename, x_shape)
-        runner.push(profiler_filename, command)
-
-        runner.join()
-        result = runner.pull()
-        if len(result) == 0:
-            raise RuntimeError(
-                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
-            )
-        out = min(result, key=itemgetter(1))
-        best_algo = out[1].op_config
-        workspace = out[1].workspace
-        ## cache
-        cache_record = Conv3dRecordEntry(
-            exec_entry=exec_key,
-            exec_entry_sha1=exec_entry_sha1,
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
-            major_a=tmp_op.A.layout.value,
-            major_b=tmp_op.B.layout.value,
-            major_c=tmp_op.C.layout.value,
-            kd=self._attrs["KD"],
-            kh=self._attrs["KH"],
-            kw=self._attrs["KW"],
-            co=self._attrs["CO"],
-            stride_d=self._attrs["stride"][0],
-            stride_h=self._attrs["stride"][1],
-            stride_w=self._attrs["stride"][2],
-            pad_d=self._attrs["pad"][0],
-            pad_h=self._attrs["pad"][1],
-            pad_w=self._attrs["pad"][2],
-            dilate_d=self._attrs["dilate"][0],
-            dilate_h=self._attrs["dilate"][1],
-            dilate_w=self._attrs["dilate"][2],
-            op_type=self._attrs["op"],
-            epilogue=tmp_op.epilogue_functor.value,
-            device=target._arch,
-            algo=best_algo,
-            workspace=workspace,
-            split_k=split_k,  # todo add into profile
-        )
-        Target.current().insert_profile_cache("conv3d", cache_record.__dict__)
-        return (best_algo, workspace)
-
-    def _has_dynamic_input_dims(self):
-        for input_tensor in self._attrs["inputs"]:
-            for dim in input_tensor._attrs["shape"]:
-                if not isinstance(dim, IntImm):
-                    return True
-        return False
-
-    def profile(
-        self,
-        workdir="./",
-        devices=None,
-        dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
-    ):
-        if devices is None:
-            devices = [0]
-        self._profile_static(workdir, devices)
-
-        if self._has_dynamic_input_dims():
-            if dynamic_profiling_strategy != DynamicProfileStrategy.HINTS:
-                raise NotImplementedError(
-                    "conv3d only supports HINTS dynamic profiling strategy for now! Current strategy: {}".format(
-                        dynamic_profiling_strategy
-                    )
-                )
-            self._profile_dynamic_dim(workdir)
-
-    def _profile_static(self, workdir, devices):
-        """Profiles with static shapes."""
-
-        workloads = list(self._attrs["exec_path"].keys())
-        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
-        target = backend.target.Target.current()
-        if "op_instance" not in self._attrs or len(self._attrs["op_instance"]) == 0:
-            # init candidate ops
-            func_key = "{target}.{op}.config".format(
-                target=target.name(), op=self._attrs["op"]
-            )
-            func = registry.get(func_key)
-            func(self._attrs, dtype=self._attrs["inputs"][0]._attrs["dtype"])
-
-        force_cache = environ.force_profiler_cache()
-        for wkl in workloads:
-            _LOGGER.info(
-                "Profile: {name}: {wkl}".format(name=self._attrs["name"], wkl=wkl),
-            )
-            # if in CI just choose minimal configs
-            # workspace is a hack just provides 102400 Byte
-            if target.use_dummy_profiling_results() and not force_cache:
-                algo = target.select_minimal_algo(
-                    list(self._attrs["op_instance"].keys())
-                )
-                _LOGGER.info(f"Select minimal algo {algo} for CI")
-                self._attrs["exec_path"][wkl] = algo
-                self._attrs["workspace"] = 102400
-            elif self._attrs["exec_path"][wkl] == "":
-                best_algo, workspace = self._profile_single_workload(
-                    profiler_prefix, wkl, devices, force_cache
-                )
-                self._attrs["exec_path"][wkl] = best_algo
-                self._attrs["workspace"] = max(self._attrs["workspace"], workspace)
-
-    def _profile_dynamic_dim(self, workdir):
-        """Profiles with dynamic shapes."""
-
-        profiler_prefix = os.path.join(workdir, "profiler", self._attrs["op"])
-        runner = backend.profiler_runner.Runner([0], self._attrs["name"])
-        # extract dynamic dim from exec_path
-        if len(self._attrs["exec_path"]) <= 1:
-            return
-        if len(set(self._attrs["exec_path"].values())) <= 1:
-            # all exec paths point to the same algo
-            return
-
-        def _extract_dynamic_dim(exec_keys):
-            _LOGGER.info("ONLY SUPPORT DYNAMIC BATCH (dim0)!")
-            var_dims = [[], [], [], [], []]
-            for key in exec_keys:
-                dims = self._invert_exec_key(key)
-                for i, v in enumerate(dims):
-                    var_dims[i].append(v)
-            return var_dims
-
-        dims = _extract_dynamic_dim(self._attrs["exec_path"].keys())
-        dim1 = dims[1][0]
-        dim2 = dims[2][0]
-        dim3 = dims[3][0]
-        dim4 = dims[4][0]
-        algos = list(self._attrs["exec_path"].values())
-        # generate region
-        regions = []  # lb, ub, lb_algos, ub_algos
-        for i in range(len(dims[0]) - 1):
-            regions.append([dims[0][i], dims[0][i + 1], algos[i], algos[i + 1]])
-        # for each region,
-        #   binary search to find cutting point
-        #   generate new exec
-        special_cases = OrderedDict()
-        new_exec_paths = OrderedDict()
-        for lb, ub, lb_algo, ub_algo in regions:
-            mid = (lb + ub) // 2
-            origin_lb = lb
-            origin_ub = ub
-            last_mid = mid
-            while mid > lb and mid < ub:
-                mid = (lb + ub) // 2
-                mid_shape = [mid, dim1, dim2, dim3, dim4]
-                _LOGGER.info(
-                    "current: lb_algo: {lb_algo}, LB:{lb} MID:{mid} UB:{ub}".format(
-                        lb_algo=lb_algo, lb=lb, mid=mid, ub=ub
-                    ),
-                )
-
-                # run the profiler binary with all ops on the mid_shape
-                # and fetch the results only for the lb_algo and ub_algo
-                profiler_filename = get_profiler_filename(self._attrs, "conv3d")
-                profiler_cmd = self._gen_profile_cmd(
-                    profiler_prefix, profiler_filename, mid_shape
-                )
-                runner.push(
-                    idx=profiler_filename,
-                    cmd=profiler_cmd,
-                    return_ops=[str(lb_algo), str(ub_algo)],
-                )
-                runner.join()
-                result = runner.pull()
-                result_dict = {res.op_config: res for res in result[0][1]}
-
-                assert len(result_dict) >= 1
-                # if there is only one result, assume ub algo failed.
-                if len(result_dict) == 1:
-                    assert str(ub_algo) not in result_dict
-                    # last_lb = lb
-                    lb = mid + 1
-                # if there are two result, compare to decide new lb/ub
-                else:
-                    lb_time = result_dict[str(lb_algo)].duration
-                    ub_time = result_dict[str(ub_algo)].duration
-                    if lb_time < ub_time:
-                        # lb algo can work with larger batch
-                        # last_lb = lb
-                        lb = mid + 1
-                    else:
-                        # ub algo can work with smaller batch
-                        # last_ub = ub
-                        ub = mid - 1
-                last_mid = mid
-                mid = (lb + ub) // 2
-            lo_region_key = self._gen_dyn_exec_key(
-                origin_lb, last_mid, dim1, dim2, dim3, dim4
-            )
-            up_region_key = self._gen_dyn_exec_key(
-                last_mid, origin_ub, dim1, dim2, dim3, dim4
-            )
-            new_exec_paths[lo_region_key] = lb_algo
-            new_exec_paths[up_region_key] = ub_algo
-            # find special cases
-            # This code is kept in case need fully tested dynamic code
-            # So far I find binary search works well.
-            # def _find_special_case(lb, ub, algo):
-            #     for i in range(lb + 1, ub + 1):
-            #         x_shape = [i, dim1, dim2, dim3, dim4]
-            #         cmd = self._gen_profile_cmd(profiler_prefix, str(algo), x_shape)
-            #         runner.push(0, cmd)
-            #         runner.join()
-            #         out = runner.pull()
-            #         if len(out) == 0:
-            #             _LOGGER.info(Find specail case: batch=%d" % i)
-            #             algo = self._profile_single_workload(profiler_prefix, x_shape, [0])
-            #             special_cases[self._gen_exec_key(x_shape)] = algo
-
-            # _LOGGER.info(
-            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=origin_lb,
-            #         ub=last_mid))
-            # _find_special_case(origin_lb, last_mid, lb_algo)
-            # _LOGGER.info(
-            #     "Searching for specail cases between [{lb}, {ub}]".format(lb=last_mid + 1,
-            #         ub=origin_ub))
-            # _find_special_case(last_mid, origin_ub, ub_algo)
-        special_cases.update(new_exec_paths)
-        self._attrs["exec_path"] = special_cases
-
-    def gen_function(self) -> str:
-        target = backend.target.Target.current()
-        op_name = self._attrs["op"]
-        func_key = "{target}.{op}.gen_function".format(target=target.name(), op=op_name)
-        func = registry.get(func_key)
-        return func(
-            self._attrs,
-            self.exec_cond_template,
-            self.shape_eval_template,
-            self.shape_save_template,
-        )
diff --git a/tests/unittest/ops/test_dynamic_conv.py b/tests/unittest/ops/test_dynamic_conv.py
index 92dc686f1..b553c2616 100644
--- a/tests/unittest/ops/test_dynamic_conv.py
+++ b/tests/unittest/ops/test_dynamic_conv.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 
+import itertools
 import unittest
 
 import torch
@@ -26,6 +27,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ConvDynamicTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_conv_dynamic(
         self,
         test_name="conv_dynamic",
@@ -66,7 +71,7 @@ def _test_conv_dynamic(
             y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
             module.run_with_tensors({"input_0": x, "input_1": w}, [y])
             y_transpose = y.permute((0, 3, 1, 2))
-            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-1, rtol=1e-1))
 
     def test_fp16(self):
         self._test_conv_dynamic(
@@ -85,7 +90,157 @@ def test_fp32(self):
             dtype="float32",
         )
 
+    def _test_conv2d_dynamic(
+        self,
+        test_name,
+        dtype="float16",
+    ):
+        target = detect_target()
+        batch_size = [2, 32]
+        h_size = [3, 24]
+        w_size = [3, 24]
+        X = Tensor(
+            shape=[
+                IntVar(values=batch_size, name="input_batch"),
+                IntVar(values=h_size, name="input_height"),
+                IntVar(values=w_size, name="input_width"),
+                4,
+            ],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W1 = Tensor(
+            shape=[12, 3, 3, 4],
+            dtype=dtype,
+            name="weight_1",
+            is_input=True,
+        )
+        W2 = Tensor(
+            shape=[36, 3, 3, 12],
+            dtype=dtype,
+            name="weight_2",
+            is_input=True,
+        )
+        conv_op1 = ops.conv2d(stride=2, pad=1, dilate=1)
+        Y1 = conv_op1(X, W1)
+        conv_op2 = ops.conv2d(stride=2, pad=1, dilate=1)
+        Y = conv_op2(Y1, W2)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name,
+            dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+        )
+        batches = [2, 5, 32]
+        heights = [3, 11, 24]
+        widths = [3, 8, 24]
+        test_items = itertools.product(batches, heights, widths)
+        for batch, height, width in test_items:
+            print(f"Test {batch=}, {height=}, {width=}")
+            X_pt = get_random_torch_tensor([batch, 4, height, width], dtype=dtype)
+            W1_pt = get_random_torch_tensor([12, 4, 3, 3], dtype=dtype)
+            W2_pt = get_random_torch_tensor([36, 12, 3, 3], dtype=dtype)
+            Y1_pt = torch.nn.functional.conv2d(X_pt, W1_pt, stride=2, padding=1)
+            Y_pt = torch.nn.functional.conv2d(Y1_pt, W2_pt, stride=2, padding=1)
+            x = X_pt.permute((0, 2, 3, 1)).contiguous()
+            w1 = W1_pt.permute((0, 2, 3, 1)).contiguous()
+            w2 = W2_pt.permute((0, 2, 3, 1)).contiguous()
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 1)).contiguous()
+            module.run_with_tensors({"input_0": x, "weight_1": w1, "weight_2": w2}, [y])
+            y_transpose = y.permute((0, 3, 1, 2))
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+
+    def test_conv2d_fp16(self):
+        self._test_conv2d_dynamic(
+            test_name="conv2d_dynamic_fp16",
+            dtype="float16",
+        )
+
+    def _test_conv3d_dynamic(
+        self,
+        test_name,
+        dtype="float16",
+    ):
+        target = detect_target()
+        batch_size = [1, 4]
+        d_size = [1, 4]
+        h_size = [3, 224]
+        w_size = [3, 224]
+        stride = (2, 4, 4)
+        pad = (1, 2, 2)
+        channel = 8
+        X = Tensor(
+            shape=[
+                IntVar(values=batch_size, name="input_batch"),
+                IntVar(values=d_size, name="input_depth"),
+                IntVar(values=h_size, name="input_height"),
+                IntVar(values=w_size, name="input_width"),
+                channel,
+            ],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W1 = Tensor(
+            shape=[16, 3, 5, 5, channel],
+            dtype=dtype,
+            name="weight_1",
+            is_input=True,
+        )
+        W2 = Tensor(
+            shape=[36, 3, 5, 5, 16],
+            dtype=dtype,
+            name="weight_2",
+            is_input=True,
+        )
+        conv_op1 = ops.conv3d(stride=stride, pad=pad, dilate=1)
+        Y1 = conv_op1(X, W1)
+        conv_op2 = ops.conv3d(stride=stride, pad=pad, dilate=1)
+        Y = conv_op2(Y1, W2)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name,
+            dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+        )
+        depths = [1, 4]
+        heights = [3, 78]
+        widths = [3, 8, 224]
+        test_items = itertools.product(batch_size, depths, heights, widths)
+        for batch, depth, height, width in test_items:
+            print(f"Test {batch=}, {depth=}, {height=}, {width=}")
+            X_pt = get_random_torch_tensor(
+                [batch, channel, depth, height, width], dtype=dtype
+            )
+            W1_pt = get_random_torch_tensor([16, channel, 3, 5, 5], dtype=dtype)
+            W2_pt = get_random_torch_tensor([36, 16, 3, 5, 5], dtype=dtype)
+            Y1_pt = torch.nn.functional.conv3d(X_pt, W1_pt, stride=stride, padding=pad)
+            Y_pt = torch.nn.functional.conv3d(Y1_pt, W2_pt, stride=stride, padding=pad)
+            x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+            w1 = W1_pt.permute((0, 2, 3, 4, 1)).contiguous()
+            w2 = W2_pt.permute((0, 2, 3, 4, 1)).contiguous()
+            y = torch.empty_like(Y_pt).permute((0, 2, 3, 4, 1)).contiguous()
+            module.run_with_tensors({"input_0": x, "weight_1": w1, "weight_2": w2}, [y])
+            y_transpose = y.permute((0, 4, 1, 2, 3))
+            self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=0.05, rtol=0.05))
+
+    @unittest.skipIf(
+        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+        "Not supported by CUDA < SM80.",
+    )
+    def test_conv3d_fp16(self):
+        self._test_conv3d_dynamic(
+            test_name="conv3d_dynamic_fp16",
+            dtype="float16",
+        )
+
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()

From ba3abc8cc06d09dde7a878b4089c1dacf589da7c Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 6 Mar 2023 08:52:47 -0800
Subject: [PATCH 217/638] Set the leftmost most frequent dim as batch size
 (#357)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/357

If there is more than one most frequent dimension in the input shapes, the leftmost one: the one with the lowest position score (sum of position indices in the shapes) is picked as the batch size.

If there are multiple most frequent dimensions with the same position score, the choice is still arbitrary.

Reviewed By: wushirong

Differential Revision: D43755669

fbshipit-source-id: a8c10bbd2977e953ce44a22b0ee2df8e7c976963
---
 fx2ait/fx2ait/tensor_spec.py           | 17 +++++++++++++----
 fx2ait/fx2ait/test/test_tensor_spec.py |  8 ++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index 7d68f6c72..b8337f163 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -247,6 +247,7 @@ def find_batch_size_dim(cls, inputs: Any) -> []:
             return [0]
         shapes = [i.shape for i in inputs]
         frequency_map = {}
+        position_scores = {}
         first_dims = set()
         for shape in shapes:
             if len(shape) < 2:
@@ -254,16 +255,24 @@ def find_batch_size_dim(cls, inputs: Any) -> []:
                 continue
             # Dedup shape value for single tensor
             first_dims.add(shape[0])
-            shape = set(shape)
-            for i in shape:
-                frequency_map[i] = frequency_map.get(i, 0) + 1
+            seen_dims = set()
+            for i, dim in enumerate(shape):
+                if dim not in seen_dims:
+                    frequency_map[dim] = frequency_map.get(dim, 0) + 1
+                    position_scores[dim] = position_scores.get(dim, 0) + i
+                    seen_dims.add(dim)
 
         if len(first_dims) == 1:
             # first dim is the same in every input: we use it as batch_size
             batch_size = first_dims.pop()
         elif frequency_map:
             # first dims are different: we use the most frequent dim as batch_size
-            sorted_frequency = sorted(frequency_map.items(), key=lambda x: -x[1])
+            # if there is more than 1 most frequent dim, we choose the one with the
+            # lowest position score (i.e., the leftmost of the most frequent ones)
+            sorted_frequency = sorted(
+                frequency_map.items(),
+                key=lambda x: (-x[1], position_scores[x[0]]),
+            )
             batch_size = sorted_frequency[0][0]
         else:
             # no dims to sort: no batch_size
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index 340e91005..c2f32c909 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -77,6 +77,14 @@ def test_two_input_lists(self):
                     ([10, 3, 40, 5], torch.float32),
                 ],
             ),
+            (
+                "leftmost_bs_dim",
+                [
+                    ([10, 20, 30], torch.float16),
+                    ([10, 30, 20], torch.float16),
+                    ([20, 10, 30], torch.float32),
+                ],
+            ),
         ]
     )
     def test_input_list_with_batch_size(self, _, settings):

From a7115fcb619d933cef4121d060a12136972140a5 Mon Sep 17 00:00:00 2001
From: Mengchi Zhang <mengchi@meta.com>
Date: Mon, 6 Mar 2023 14:13:46 -0800
Subject: [PATCH 218/638] Need to justify None before compare (#366)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/366

Cannot compare int with NoneType, so need to justify None before compare.

Reviewed By: aakhundov

Differential Revision: D43815532

fbshipit-source-id: 384561e43bd51007b6c93530e5087a110758df12
---
 python/aitemplate/utils/shape_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index 75a857e07..7816b81fe 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -99,7 +99,7 @@ def get_num_rightmost_static_elements(shape, num_rightmost_dims: int = None) ->
     res = 1
 
     for idx, dim in enumerate(reversed(shape)):
-        if idx >= num_rightmost_dims:
+        if num_rightmost_dims is not None and idx >= num_rightmost_dims:
             break
         if not isinstance(dim, IntImm):
             break

From b5e4dec4f6a6b1caa80f84b802da5019550accb3 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 7 Mar 2023 04:35:29 -0800
Subject: [PATCH 219/638] Set batch_dim in make_jagged if zero (#364)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/364

Previously, `make_jagged`'s back-end was relying on whether the `batch_dim` is present in any Tesnor's `_attrs["shape"]`, to decide if the `batch_dim` must be set (to `offsets.lengths[0] - 1`) or validated (to be equal to that).

This is problematic for the cases, where the `batch_dim` is present in a Tensor shape in the downstream graph, hence is supposed to be set by `make_jagged` instead of being validated. One such case arises in the `jagged_to_dense` op, where the output dense Tensor's first `batch_dim` dimension is not known to the runtime until the input jagged Tensor is "unwrapped". In this case, `make_jagged` must assign the `batch_dim` present inside jagged Tensor's `JaggedIntVar`, instead of validating it, so that it gets the value by the time the output dense Tensor with the `batch_dim` in its `_attrs["shape"]` is processed further.

To mitigate this, in this diff the `make_jagged`'s condition to set vs. validate the `batch_dim` is changed to whether `batch_dim` is equal zero or not in the runtime. Being equal to zero means that the `batch_dim` has not yet been initialized (dynamic dimensions in the runtime are set to zero on declaration), which, in turn, means it must be set. If the `batch_dim` is not equal to zero, this means it has already been set, hence must be validated.

Reviewed By: ipiszy

Differential Revision: D43712183

fbshipit-source-id: a4570729bf6ebfba21034330b68c0362f685c72c
---
 .../backend/cuda/view_ops/make_jagged.py      | 14 ++++-------
 .../compiler/transform/name_graph.py          | 24 ++++++-------------
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index de0adbe59..16a09d23f 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -146,15 +146,13 @@
     offsets.data[{{idx}}] = reinterpret_cast<const {{offsets_type}}*>(offsets_data_{{idx}});
 {% endfor %}
 
-{% if set_batch_dim %}
-    // batch_dim must be set by this code
-    *batch_dim = offsets.lengths[0] - 1;
-{% else %}
-    // batch_dim must have been set before this code
-    if (*batch_dim != offsets.lengths[0] - 1) {
+    if (*batch_dim == 0) {
+      // batch_dim must be set by this code
+      *batch_dim = offsets.lengths[0] - 1;
+    } else if (*batch_dim != offsets.lengths[0] - 1) {
+      // batch_dim must have been set before this code
       throw std::runtime_error("batch_dim != len(offsets[0]) - 1");
     }
-{% endif %}
 
     int64_t max_offset_length = 0;
     for (int i = 0; i < {{num_offsets}}; ++i) {
@@ -223,7 +221,6 @@ def make_jagged_gen_function(func_attrs):
 
     output = func_attrs["outputs"][0]
     jagged_int_var = output._attrs["shape"][0]
-    set_batch_dim = jagged_int_var.batch_dim()._attrs.get("isolated", False)
     offsets_struct_type = jagged_int_var.offsets_struct_type()
     jagged_dim_min_values = [dim.min_value() for dim in jagged_int_var.jagged_dims()]
     jagged_dim_max_values = [dim.max_value() for dim in jagged_int_var.jagged_dims()]
@@ -231,7 +228,6 @@ def make_jagged_gen_function(func_attrs):
     return SRC_TEMPLATE.render(
         func_name=func_name,
         num_offsets=len(offsets_list),
-        set_batch_dim=set_batch_dim,
         offsets_struct_type=offsets_struct_type,
         jagged_dim_min_values=jagged_dim_min_values,
         jagged_dim_max_values=jagged_dim_max_values,
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 2d7f09648..512a3af0f 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -99,11 +99,6 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                 dim_name = "{tname}_dim_{idx}".format(tname=tensor_name, idx=i)
                 dim._attrs["name"] = dim_name
 
-    dim_names_in_shapes = set()
-    for tensor in sorted_graph:
-        for dim in tensor._attrs["shape"]:
-            dim_names_in_shapes.add(dim._attrs["name"])
-
     for tensor in sorted_graph:
         if tensor.is_jagged():
             jagged_int_var = tensor._attrs["shape"][0]
@@ -113,18 +108,13 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
             # the name in the JaggedIntVar class. as a result, we must resort to a hack here
             # to reset the name of the JaggedIntVar to the name of the total_length after
             # the latter might have been changed (e.g., from None) by the code above.
-            # TODO: wrap _attrs["name"] (and other frequently used _attrs members) in
-            # @properties and override the "name" property in the JaggedIntVar to return
-            # total_length().name.
+            # TODO (T146653032): wrap _attrs["name"] (and other frequently used _attrs
+            # members) in @properties and override the "name" property in the JaggedIntVar
+            # to return total_length().name.
             jagged_int_var._attrs["name"] = jagged_int_var.total_length()._attrs["name"]
 
             batch_dim = jagged_int_var.batch_dim()
-            if batch_dim._attrs["name"] not in dim_names_in_shapes:
-                # The batch_dim set inside the jagged_int_var is not present in any other
-                # Tensor's shape directly. We mark it as isolated batch dim here to set
-                # the dim to "offsets.length[0] - 1" in the make_jagged backend code.
-                batch_dim._attrs["isolated"] = True
-                if batch_dim._attrs["name"] is None:
-                    # the batch_dim wasn't named above, so we name it here
-                    jagged_int_var_name = jagged_int_var._attrs["name"]
-                    batch_dim._attrs["name"] = f"{jagged_int_var_name}_jagged_batch_dim"
+            if batch_dim._attrs["name"] is None:
+                # the batch_dim wasn't named above, so we name it here
+                jagged_int_var_name = jagged_int_var._attrs["name"]
+                batch_dim._attrs["name"] = f"{jagged_int_var_name}_jagged_batch_dim"

From 086d73af0e28af44d4e31d5c70ff57423207b355 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Tue, 7 Mar 2023 14:18:19 -0800
Subject: [PATCH 220/638] AIT Splitter with allow_int_inputs option (#371)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/371

Reviewed By: wushirong

Differential Revision: D43859425

fbshipit-source-id: ed4e568d44a81c52769bf0c43ec775c4ddc88503
---
 fx2ait/fx2ait/ait_splitter.py           | 26 ++++++++++++++-----
 fx2ait/fx2ait/lower/lower.py            |  3 ++-
 fx2ait/fx2ait/lower/lower_settings.py   |  2 ++
 fx2ait/fx2ait/test/test_ait_splitter.py | 34 +++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index 908cbfcca..b96ddf3e6 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -67,7 +67,7 @@ def _any_supported(nodes: Sequence[torch.fx.Node]) -> bool:
 
 
 def create_ait_operator_support(
-    use_implicit_batch_dim=True, op_lowering_disallow_list=None
+    use_implicit_batch_dim=True, op_lowering_disallow_list=None, allow_int_inputs=False
 ) -> ops.OperatorSupportBase:
     """Creates an `OperatorSupportBase` instance used for AIT splitting purpose."""
     # Create an `OperatorSupport` that declares a node supported if it
@@ -81,11 +81,17 @@ def create_ait_operator_support(
     op_lowering_disallow_set = (
         set() if op_lowering_disallow_list is None else set(op_lowering_disallow_list)
     )
-    return ops.chain(
+    chained_not_supported_ops = (
+        []
+        if allow_int_inputs
+        else [
+            ops.OpSupports.decline_if_input_dtype(torch.int64),
+            ops.OpSupports.decline_if_input_dtype(torch.int32),
+        ]
+    )
+    chained_not_supported_ops += [
         ops.OpSupports.decline_if_node_in_names(op_lowering_disallow_set),
         # 1. We only support subgraphs with torch.Tensor inputs for now
-        ops.OpSupports.decline_if_input_dtype(torch.int64),
-        ops.OpSupports.decline_if_input_dtype(torch.int32),
         ops.OpSupports.decline_if_input_dtype(torch.float64),
         ops.OpSupports.decline_if_input_dtype(dict),
         # 2. Node is supported if it has AIT converter:
@@ -95,15 +101,20 @@ def create_ait_operator_support(
         # Note that this is not required for correctness, it is merely an
         # optimization.
         _decline_if_would_trigger_extra_copies(supported_if_converter_registered),
-    )
+    ]
+
+    return ops.chain(*chained_not_supported_ops)
 
 
 class AITSplitterSettings(splitter_base._SplitterSettingBase):
     # TODO: Fix this once pytorch nightly is updated
-    def __init__(self, min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE):
+    def __init__(
+        self, min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE, allow_int_inputs=False
+    ):
         super().__init__()
         self.min_acc_module_size = min_acc_module_size
         self.exclude_support_node_name: set = set()
+        self.allow_int_inputs: bool = allow_int_inputs
 
 
 class AITSplitter(splitter_base._SplitterBase):
@@ -118,7 +129,8 @@ def __init__(
             settings = AITSplitterSettings()
         if not operator_support:
             operator_support = create_ait_operator_support(
-                op_lowering_disallow_list=settings.exclude_support_node_name
+                op_lowering_disallow_list=settings.exclude_support_node_name,
+                allow_int_inputs=settings.allow_int_inputs,
             )
         else:
             operator_support = ops.chain(
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
index c4187de42..c006c9806 100644
--- a/fx2ait/fx2ait/lower/lower.py
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -100,7 +100,8 @@ def default_split_function(
     model: fx.GraphModule, inputs: Input, lower_settings: LowerSettings
 ) -> SplitResult:
     settings = AITSplitterSettings(
-        min_acc_module_size=lower_settings.min_acc_module_size
+        min_acc_module_size=lower_settings.min_acc_module_size,
+        allow_int_inputs=lower_settings.allow_int_inputs,
     )
     splitter = AITSplitter(model, inputs, settings=settings)
     splitter.node_support_preview()
diff --git a/fx2ait/fx2ait/lower/lower_settings.py b/fx2ait/fx2ait/lower/lower_settings.py
index 685f6c19a..80d5d55d8 100644
--- a/fx2ait/fx2ait/lower/lower_settings.py
+++ b/fx2ait/fx2ait/lower/lower_settings.py
@@ -48,6 +48,7 @@ class LowerSettings:
         use_fp16_acc=False uses fp32 accumulation for gemm ops.
         Set use_fp16_acc=True for better perf; set use_fp16_acc=False for better accuracy.
         For LowerPrecision.FP32, use_fp16_acc is invalid.
+    allow_int_inputs: If AIT acc subgraph accept integer inputs.
     leaf_module_list: The list of modules that acc_tracer will not trace into.
     output_precision: The AITemplate output precision level.
     additional_inputs: The additional input to help determine input batch_size dimension range.
@@ -68,6 +69,7 @@ class LowerSettings:
     # If None, infer the dtypes from the sample inputs.
     precision: Optional[LowerPrecision] = LowerPrecision.FP16
     use_fp16_acc: bool = True  # only valid for precision == FP16
+    allow_int_inputs: bool = False  # If AIT acc subgraph accept integer inputs
     ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None
     leaf_module_list: Optional[Set[Type[nn.Module]]] = None
     # If None, infer the dtypes from the sample inputs.
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
index 990c37cc3..77e3955c3 100644
--- a/fx2ait/fx2ait/test/test_ait_splitter.py
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -146,3 +146,37 @@ def forward(self, a):
             dict(split_results_double.split_module.named_children()).keys(),
             {"_run_on_gpu_0"},
         )
+
+        # nodes w/ integer input should not be lowered
+        mod = acc_tracer.trace(test_mod, [x])
+        splitter = AITSplitter(
+            mod,
+            (x.int().cuda(),),
+            operator_support,
+            settings,
+        )
+
+        split_results_int = splitter.generate_split_results()
+
+        self.assertTrue(len(split_results_int), 1)
+        self.assertEqual(
+            dict(split_results_int.split_module.named_children()).keys(),
+            {"_run_on_gpu_0"},
+        )
+
+        # nodes w/ integer input should be lowered
+        mod = acc_tracer.trace(test_mod, [x])
+        settings.allow_int_inputs = True
+        splitter = AITSplitter(
+            mod,
+            (x.int().cuda(),),
+            settings=settings,
+        )
+
+        split_results_int_allowed = splitter.generate_split_results()
+
+        self.assertTrue(len(split_results_int_allowed), 1)
+        self.assertEqual(
+            dict(split_results_int_allowed.split_module.named_children()).keys(),
+            {"_run_on_acc_0"},
+        )

From 46e9004ea590fbc83a2557a54005673f8e8caf05 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Tue, 7 Mar 2023 15:10:45 -0800
Subject: [PATCH 221/638] transformer converter

Reviewed By: frank-wei

Differential Revision: D43677477

fbshipit-source-id: b916f43bc7170de8bfcfb5468941d7dc82f26524
---
 python/aitemplate/frontend/nn/__init__.py  | 7 ++++++-
 python/aitemplate/frontend/nn/attention.py | 9 +++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index 9a3500043..5ee80550b 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -26,7 +26,12 @@
 from .roi_ops import *
 from .upsample import *
 from .view_ops import *
-from .attention import CrossAttention, FlashAttention, MultiheadAttention
+from .attention import (
+    CrossAttention,
+    FlashAttention,
+    MultiheadAttention,
+    ScaledDotProductAttention,
+)
 from .identity import Identity
 from .multiscale_attention import MultiScaleBlock
 from .vanilla_attention import (
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index db82bad1a..9bf9f9726 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -390,3 +390,12 @@ def forward(self, *args):
         x = self.proj_drop(x)
         x = ops.reshape()(x, [-1, seq, self.dim])
         return x
+
+
+class ScaledDotProductAttention(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, q, k, v):
+        attn = ops.mem_eff_attention(causal=False)(q, k, v)
+        return attn

From 6019b735ed6fd805717eec3373c6e8a091bc360a Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Tue, 7 Mar 2023 19:33:42 -0800
Subject: [PATCH 222/638] Remove noop dropout op with acc tracer (#377)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/377

Dropout is a noop at inference. Removed with acc tracer.

Reviewed By: frank-wei, wushirong

Differential Revision: D43881227

fbshipit-source-id: 0246365e6facc6dfb13843fa9854802f35c0938a
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index c64771f68..b653e64cc 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -835,6 +835,18 @@ def matmul(*, input, other):
     op_and_target=("call_function", nn.functional.dropout),
     arg_replacement_tuples=[("input", "input")],
 )
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.dropout1d),
+    arg_replacement_tuples=[("input", "input")],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.dropout2d),
+    arg_replacement_tuples=[("input", "input")],
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", nn.functional.dropout3d),
+    arg_replacement_tuples=[("input", "input")],
+)
 @register_custom_acc_mapper_fn(
     op_and_target=("call_method", "detach"), arg_replacement_tuples=[("input", "input")]
 )

From c7a8881fa36d360366afffb567ec09bfb54397af Mon Sep 17 00:00:00 2001
From: Stax124 <60222162+Stax124@users.noreply.github.com>
Date: Tue, 7 Mar 2023 20:32:37 -0800
Subject: [PATCH 223/638] Check if target resolution can be divided by 64
 (#355)

Summary:
If the target resolution cannot be divided by 64, the compilation process fails on the UNet step.
This PR asserts the width and height immediately instead of compiling the CLIP model and failing a few minutes in.

closes https://github.com/facebookincubator/AITemplate/issues/345

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/355

Reviewed By: tenpercent

Differential Revision: D43784017

Pulled By: muchulee8

fbshipit-source-id: 7ab7581f80c4e649e1afa4a22b53da3aac959c13
---
 examples/05_stable_diffusion/scripts/compile.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index db0a9ae93..0018dafda 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -15,12 +15,9 @@
 import logging
 
 import click
-
 import torch
-
 from aitemplate.testing import detect_target
 from aitemplate.utils.import_path import import_parent
-
 from diffusers import StableDiffusionPipeline
 
 if __name__ == "__main__":
@@ -57,6 +54,10 @@ def compile_diffusers(
         torch_dtype=torch.float16,
     ).to("cuda")
 
+    assert (
+        height % 64 == 0 and width % 64 == 0
+    ), "Height and Width must be multiples of 64, otherwise, the compilation process will fail."
+
     ww = width // 8
     hh = height // 8
 

From 810a5c41cdc3e8a6dc058ffacb5a1e38e75e7cfc Mon Sep 17 00:00:00 2001
From: Zhijing Li <tissue030@fb.com>
Date: Tue, 7 Mar 2023 22:19:15 -0800
Subject: [PATCH 224/638] Updated the softmax wiki link and add images (#379)

Summary:
Updated the softmax wiki link and add images so that the wiki will have link to refer to

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/379

Reviewed By: muchulee8

Differential Revision: D43890078

Pulled By: tissue3

fbshipit-source-id: 5893e904c14b684b16fe8601419cba74bf0d50d7
---
 docs/image/gpu_grid_block.png                  | Bin 0 -> 166428 bytes
 docs/image/pack_size_1.png                     | Bin 0 -> 143113 bytes
 docs/image/pack_size_2.png                     | Bin 0 -> 140996 bytes
 docs/image/pack_size_4.png                     | Bin 0 -> 145439 bytes
 docs/image/pack_size_8.png                     | Bin 0 -> 137856 bytes
 docs/image/softmax.png                         | Bin 0 -> 21003 bytes
 docs/image/vs_oneflow.png                      | Bin 0 -> 393445 bytes
 .../aitemplate/backend/cuda/softmax/softmax.py |   4 ++--
 8 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 docs/image/gpu_grid_block.png
 create mode 100644 docs/image/pack_size_1.png
 create mode 100644 docs/image/pack_size_2.png
 create mode 100644 docs/image/pack_size_4.png
 create mode 100644 docs/image/pack_size_8.png
 create mode 100644 docs/image/softmax.png
 create mode 100644 docs/image/vs_oneflow.png

diff --git a/docs/image/gpu_grid_block.png b/docs/image/gpu_grid_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..a486a5bf9a33c7a1f91ac9ab7580725990cb5861
GIT binary patch
literal 166428
zcmcG#V|XQDlRtc7+qSI<Cbl(6Cbn(cw(W^Ev5kpsb7C8BX6D)5|L*tq!@0Vz?ml(b
zT~%HDb(p-Y7#uV<GynjAlMokH1OPx9001CANZ`*aU*v!;0RUJ&Ga(^)2_Ydud3zgU
zGfN`?Ks+qzJA~@D5%g>gMWXttqQDJMBr2f7z$svH2LF;kQhrJ%aR2ZQTIyzWCE=dn
zs&Go@bTE-q+VBnrbfrY6THWs(g}zlUza9==4kmm)IB#-TOb(KP_}@6b(@u!60fte~
z#Px-MF%1&FVHQ&O!YTm)xcCD`M=^fp7m#e>VOoA@Z)ib;R&?x8&OV!b_|T<$3h*KT
zNU?Vuy?GfBb^-uf%G|Yu06>0ZQ>r3ivYx76BPclJ6*1O?(iSzA?Dwq>`;14`Fa1`4
zSmhQ?337mW3gIN>o{=D218-=Sh-V%ozzMv+)6Fg65%GtBC5%-+;^=MKZ@weLcNrEh
z@r2QF+}`k(VHi&NRECLt*%!Xqt1c<+D#xi1wu&;f_$k%pOuY^318L|Ky=HsyYDykv
zXnnd+KFB9~k5C2kWL8!Iy%_AP>t#k0ehhsRUUo)YeUi`sk(M|lNnRcy%rQUAb416o
z8ebHITHgK$&?f;q$qboziCRe=Q?KB_am8zs7UUM4wCb0cOweS-52s|`g1Mo&@VcD@
z!eT*7HA?BoKvg*XAFr9swngJd`Lxb^xlzK{G!dJ~JF0$^r2Kg?XvX6H21)m1ZchO>
z17D1wIO8PSy9jA`mEs60qK1-&5DxvdkV93W0=WENYmovRwqbw+P#uP{p@gCAB(o59
z{C!miR5Ph3qEuFP5B%2<v25ULExh$I?P;@B0qli>1e1su!b3DW{SkF7deh;qm-~dk
zH3EdX0IG`Ly2!)ez0n)N+O9v2pujZ=fztQ^xJhGMp`>mxklMTaNRKGQK*I#s7=#yS
zzJkL(c=~AH#bSvr=n`Xwf#e7$K(+}nHl+3ogoQScdklslKen@AXb=W7V9F~#Hg5t>
zU4p^VAjwGZZinRw>Pf+oK*I9+b@TD8Ltl0_TtVBP`;Prt>?V8MJew%Ei5!vfk*kne
zA)Cmd4<)?A?O~7npL>337Jj6{tYMqCZUZ$VD24oLrniQ~wflY)67(W4X&q}svzu%b
zbd_+wX?)pdPI*Q-ftr{vcJ*WFom+8Sd)!J~B>9BkrXZBfi*oZcFFWV$r_V4iLE|=`
zhJHj1$P(5ms5AUme>MR4f<K%7*^&Yl{)$%2ew)RVMg?~^ZVAb2NLNo%k`eP}%R<Z0
zQ$&6>Kcn(}H2A5+jv2C^u?kQCSQjKPxLCgf_JpmSvmFrsy_Z6J=SKf<7fz`4i;ML;
z0pPJ778VxweIlp<P%{a_`!2J|sX_kgSOTzZ45LUwrRLCdChBeq<no+>^@XYdG}`jg
zcjLK&_#lGz@}q`AjOYSK^1_@uK&1j3`?lm_Ci?H|vZ+FR@u$;;Sc1C`0JjCr1pMd$
zvxU+GaoIZm4bH$%;0nP@0x2kH3W>+WFO5Xg4`e5h8p?niC`?!@&Kw_XNK7moKE{7Q
zQY4O52frLSNB9y;z6WX#|6ACa2q6}KkL?<2Kweo1hfTC{8i^9cC~w=8n+tzjn4&=B
z2-uNI-A|>UZOZ)?)E%fJ*g213${SXfI+flym{VUmGD<{`N)?>E$5@X{l~W~ZUT@qs
zt6En}mEj(VV#hJl?`tm}ZaBpjw<j|%jE;YGpUc*rD~>mqkMG0wCM2srQjg!3$QI%t
zGBeaS$bmrn-Y>mGy;P61PAD5lVZvJZaMK{X5hMMUJB~ZZJMKI9J9hdMS4g!v!o*my
zxZ=Epl*lPc{M6)?WM{+;vGfV85(LB4hTrO7I8!qP(c=!|Mn}Gm3=OA@$c^MsVWZN7
zV~~aS{*W%9EwC(N{YJ0YN|jTZu7p#Dz?PjZlO<J|yPzPaaG|(aQc{vy?poGg0;zmZ
zqEvFK%vpZTzh6+6T-HIY!KSQS)~e*xc5FAVQD$Sxn$9+!lPpM+OII{MB|CFI%Ud!%
zpE<{2?%A+xo@9P!o-(gp_HC|mPVE?FE@EzU9-%}>>0&<DRMnz}tr`0#D>Yj>tF`H^
z#SEL6xv1HJ8U6HHshjLg8oI=&I*oWlbpyXe#DYYnm}8{1fXS@e?bB`L?YCQNtNg~O
zFJfOc+LSz+oI{**qLZV`W%DPLvLv$#xU^bRs~YIV*uT4^oqMmJwC3DK-l@6+cMP~Y
zxu@K1oFL3TEMgp`99>Lz6@QHQ>LD7&8r1aW_H1PnS)1Bm8gLBQ25eLHZRoA^wTD~<
zC&8}roi-ww8`_{+u&$d%D6}bb)rQ%H)-JhLUP<g~qiCV{Bv>XWlMhL|OaCtJEEb$q
zE~YD<#NvWQCrMH0F*UeN!%7QEml*$&;gst2rHM(7F~_8H=rtQ`-rp=vul85|eE#Ij
zoPFdz?5O(astPJiv2yY}b~%g6MWt0mLIs98yDHz$>89-^)aALAwT7bQ=VjT(%Ek_J
zv3i4*{iVvLHp7^ewq=v$?<+cv{CI14f$%4ITp=jI9`G1&)%f0h3iEOMsz<6PkKGcB
zRJ~@|-vVYwk&;OiMJq7vjQiP5*bg-iD<@n;&7)v(IBg7Xh8OAY>A6!RQ_2fW3;e~#
z@#i^?lgb0iyPX0~WY}9*F`I&{E3I|bdDh6zoz8jilkv^)mGRkASH?^akPp=4{7rlh
zEhkB`2|Cs~pxf^~-uBOqvUWRWUzSZXrv0Zi#Q3h69=4w??|)w#Jr+M%U!PsG-0VIp
zzwq2PT^_#J-1z*eGcfw;T&oSv0U-_y57uttQEO5c1EKFn<VWU*C!j6>C2-<@4v_(2
z0=@|T5Y!wbjlhX72?>Xwh2$P!Ft%-H&f%qpugB82(yQFJ+q)RV7g-Y%E6f@rFccW2
z#(7{}s9Z={M+7Zwm#-qa9?=+<jqAk`jEREx4eN%~+3+-SiN)@=In3_s?#2K_KSM-U
zf^G((jH!&0oU}}Zgl7(Fs#5YCn{)qCngIhT4mnH$&Np$YFsfC>=OM5t4W6p6^K=ay
zch9aFSKo$mGITkJTzaR&j!{}r5DJlI;%A(Ty|zW|%3I!kXWo|G^5SPS1wZ&-OzI3=
zAU#6<0+nO&Wd6mRZS-tdHFi33JxV=!j8+$P5?Ea)^poAVYclNDayCnBMq(R9OlC^*
zBIykBe&`B$nsho>>9<otdUO|=`}B5q`y5iuIy!<8f}Bm&TsWV-)uP53ok9d?FBlpa
zK52<ed)=}FrRlkQv-??eIS+MI&D&K+Pc9qx6WE_v4kp~GBk3(}=*%=;Dkqi^%P__+
zlTc|t4d|V$C)PF(IVWi{9-O^rUJt+qVGd|rZ>5GA8V1Y<;&*UhIxws?0~wI%k5#$N
zuT};&B_GKSrLv?(rMhUh={P@5hIMGX@l?}m>kZOg>flDhMny)et1mTb3#5Nml$=ig
zVxl?u&ZLr5;oj!HbL_BRwU4tXzbL-ws4T3+TGgQ0=6ROlq;^uJ46bxu%3P+ddFRdJ
z^R^H7j&uA4+lF@i(JaAsc6^*D<D|T@e6$I%3Dw%Ij?L~QZ8NoN-jZcarS<H@dHTKz
zulDPTIk&mFGxnLLO2mSsg@<#|d29VwTNi^YgY^TRRu9Ps|BjiC$%mq|woX1zpA(=1
zzt#X1h;sO8grqr=yV>Ix>^kgL>;#;z8NKQ1866o2c+OnyZ=uU^zY4g-Qt%4!HEb<&
zi%N<<njc*+!p2Zy)F~P_I63e4<{MKRqrYwcn8=3b^}2>Vksejc`6)KxmSw>;$-QaY
z^Csxkrty-ux$ez3x4og@Grg<7OExJro>lNQM(e)Kb4PM3Y4$tkcLvRf_60Yd?dNfv
zQubEsYa8CSB#)E%v)<L5i?J4|W(|9}UF|2Yf|q%prMKE2qa7zl0o@R12)p<jZv7s&
zJ9h884C^r2Q`w1pY+mX&WBbc-*<0CClS4cg-bCJ4ElrPd<9y}XE1lerDIYp#UIs6%
zIuhD&?b_FaTjTdlRW=$g1uqv}qaF`WULz0z2w!=b`Ldr9Z-4(jq|y=Bq3&GqvU*2<
zKRf5&==M#e+5Y7N#;7_S2@BA+f$&X+1>j{*mh7S)^0^71ny7FBe?ZYpgkJB!^W(kx
z#QYBOVP^*{wV~m6<#xJL1EyI4J3K&K=5z9R+~p&aNp*sOhrTk?I<q%3JzZ4cEQbho
zA8WvAWi*d1Zy58Do&p}<&ZRyp3JxPx31b-<0QKiNBmgMH3;_Ch2K4#E{`>&|z`4Nx
zbp@m$7v#Urf&Bj5Y|RyA0s!~{62byXu0W?9W|3syo$)bWo|8RWRE!pSS1{_zniB?g
z7Dzo((${AqE5K>hR7R~P0lmY%L*U?pn3$172H@F1ppPLuB)Y7PVk{jJ<_VQ5E)Hh#
zGR_VrXlfN?7CH_dX*LmFX%}m*3@jEzZH=c@6<jF;!>&l|#MYh{11&n9519`8+>>PV
zkpPlzNJilQIS?YsV?v1fBI$x+0{zb+Rm}J4iW{}P_kTUx4Jker^Cb4a33Nl!5_Z43
z0>Nk-{4Wx^ps55mHvgBfD*q>8VD$FE|3jFO?_l%)vR2~@xY-4xb&dKz%;|cMb^SlQ
z{^eX32<Tu4M)eZ=|7@@OHPr{?e|Gzq^=@F`u^5b+Ew=yJ@4wwYF@1plzf$<T>Ie3@
z4F=sFi~mEz*BOLxm`oy>l(yxsG;0bWyXS15iXg{U_j;LKQkM6lztQV_SG|RaKA`*i
z-`8hE#DkM-(Np{6GjPi`aBt7$d#$;|Y})a{Ct}@4l267pDS)Np_`tf|5~OG_mX9Nq
zU<GYD<0BiS6?Ll>l2ZPXS*wTamK-rGaX+9*Y5R35!Xp^{^(ZOA`R<F(whrf{2S!^j
zNr_hwotEigZ{B(WojJk1))^YQ?9iVb*h-PuHr&J9Ro};IV0T5>oC6!c2=VB+p9qH?
z!thdV4c+9p1^tn>Uj_;g*egc}-iaEcBWUihNE`-Mtqwb1Ct`=S1x7>lQq0-<;Dhu{
z(9jjdUny200~MJlS6w%+7P?nos5Wao5#O{ul4R1**RH~dUH?-HI}EVDtUlr?FwBCL
zrT`lT%dWmy44=*_k7j_+0Gj3zFiw^@B9%DQA=zIoE(Y-Hd8W>2`5|>$q8;<m^UEbm
zL<vyzO8s5Y+8A`WP|LEXJ4tT}o#kK@yVGLL)#@D$L;F;5qSLi2Q(SSSEo4X5b>xq!
zb+ZuyM)S4%FVS&uv@e{|a;y^={KJ-+{w~DM2aJJLCHJv}$1cR+aJC<Oy$ifK5Y>YY
z^@@2xs`=L>nq@(C<#jKrM)$N#V630}f)eeOUmuIR$gBzf()b?I9kn&I_*;jFi<-En
z->>bL=Y_$Av29q9VwXqCR@bNWYow8yx#~1jQ;gipyIPHPR@iGqugZmTs{21VlnUbC
zF8AnJo~EH~v<bSSc{~EUt9tHpy52p%E3wMo6`X~kmwu?3N-aMD#)#hiXS?o@HF&>?
zdVH|{ybjIdh=@JN%_dW$`I@njhjqG|1X7*GAJE5T=--?@BatO$TXEjR>)_0723MGR
zy8r083K&Rogyz>=A|I<E#z|*N3a)UXT?;&D;JoZNi69ae1;VCof`I=R-W;U3K>P`I
z?M^<%(H$}m+dZ-5N`noLkjtHq4J_Ze`C$Csi(<S~^g7vNR<x3Hb*F+S=%;Ab$RRZs
z@+A~;rH03>G2;&<qJ>nY<xKhl!rgni=dApvqguvUW%kcUgW^fU^h}ooFNm1y!tH~J
z{%P47l$1=GMP{~uRKbXB6j_Kpo@0g;j$lMmyOF&GUSJ>m@w+c>fieQ`?b1ZsMOKzo
zpu*bXA_I*&X0QMvDN_7*TfIMSX-g55Q5U@JOEoa95+f|hDD$F|iLo*NK@Bj;<NC#G
zo7=QQ9(6UH;Fdj}unnZ#2#j~;eq33fDMeZ$WgUP0k>G#KXgq?S+~QJPT#4>`k+j0R
z<}X$)ls7TGeRL->#-M>HTIQJVZSH5U&5ZqEoP){)uBH!sFNYt3i;*1Tj^onTwBg9J
ztX6&f+6xr_tP0alt`XLa+Ud1Mk(o~H7qXfC$_K&6epl~hB7OZ{Ih#Y1*V`DvE3KRs
zPM=%a%-I)3nQ;`V*0UZxz$XD+H$p-fCa`*rB*V=IDQ7C#UDk2jK5AzzTbxCixN1KF
z6rnDqA`VAWO=Z?qI<-37A*$Y6LlGCt^&7#u;g<N3cEdly(PaWoT~{5~U81i=Ak!H2
zui#ybQ;#pwNJ?M7S}m5@?xH2qnSywYM*u_$!>$T)K&r(5MEOCJ1PxA_H$+qCybe9e
z(87IT{8!Mygn$<`$xg{UlsPDlMkG=zji18jthS1=vZ-7}QgYQ9xVy-U-7y&FsN;0d
zDw**XE72IjQ;`K6Y%-|^%QqxiLs_v%^wwejtnL=Ed$ch>+;ndPo!L`0$I989tw|{!
zv5%2uR`#oU>LYIYB3twKHwbjD(IUqx&KH}HvkzvjMtytxnVXfeJPNbu2bAp{f38Qa
z|59KmKfTGmxtX<f73X5_iC5)H4*65p)1r?$^Js799r$3?3b*$T(BeQ0M$UYrm573W
z_fJ|DSdwv8C9SIhVHNx9s;$3Kv0DTZk%c_mbK2N*j;>+DRCa>2tV})HXtntFQQmN@
zgPGAr9VXDNWdt;D+>^|;c*4AS5$(y-cb^xvTyw$XCM~|m{u3w`gMVi349ZOP4XFT5
zSww}aa-!?w;UmO?%X`iaPJRzZ2Qbuts)#F$l>Y^5%eef^Loa(2F?yFnws&}nG@*ZY
z0L*VD<xLE=^|7h>UgYG)sV4tG?Mg25(mn)(>N<QsA4O}<SHy!}dq7$KuN0($d^00h
z5@dV<HLhW`KDIl4>YH1-^UPx^<sbo&3BCI%#DI*>BLbBX^&h2-Ch^Q)zkX$8l>bdb
z(n<#Vwy_MY=oK{Dd%QbEQ?c1LjP}-X?SRf^H<OQ{(cWIx_!(|mct2nKS9e75cuFyS
z5=V5&D)V1yEI(?ju^LTf@)}XWtDD-1Gj76?syh3q#%R;lfOC7PCCc=v(|?;f7sLq^
ziO+Jt15?S_qFh2MlrRroQ9?l!BT}_$OjB~<uB9du3qfokgwu>&WK~NKDe_5XqW$Ko
z1$`Ni2veVhl1If|<3Y%Z%FOjRBP9v%L!&}-NZmr2cHgcghVL*`z#aJ$Kvbzgb+L}j
z`s1$)XB9!!VvG2FvkOH9VvSU=i-OIAvPxbOb*@XUgWW(6D#@&f+FQd4Wul00Vt@A9
zmFp!^w<7nW^3fH!iu5dc1d$bBq!SAr4~S_#?@yU|oKex+k&Q-7zDV&N9`)LqO`GNF
zVPMo2EkYhMmS{e`X?t?LkA1rQE_%={ojkMQ2SSn`8N`Q(@fL4hx{=|95jCiAD!CTH
zmmw8EchrkF?K&uY^kYbDg{f~K^r#{N-%7M3;e{f3;bnyI-J>Go8G9sU*H8z|-vL};
zcR|D4fvb6xc&Es>X%pI*Jg)=r&!O=V?};jxT^vc>xxN4jQ69&pB`9FJFX&Q0=vjg4
z%BoU&)FUq-cfZFXBrjOlz$uVMtux%RJ_Cd`Ca`P!(SF*P=+#shOCRnOdArh&C77n=
zouu!eQn%SMN?Z17pwQ8jw!o1!<rt|NmSQEWdi0{Fw!uZBj_<DnzZkU8&$>=4wOWx(
zXlUCx%8tlObo%c{vmlBI!t$E`MCLkSK!vMar!M#IO-zemOh@QkfCynZAzhVQl#9pR
z>mxQp{iAHRqQFzB1!-$|*Oc!7^g>UbKs8LcjC#<L0F2&CNipM*zjDFV<5vX4ENHNh
zh*@;<IupvEOKFUvQ8b?c2<^W)+Gm1=1l*e$Pj$Y05>&uDQlf2{gAv`#WZ?>>dMve;
z8howx*QXL)(rgWZK?IY4VAWM|X{NGX<?FbL#Rj2ZBX0l#e~ZZcniW81sx^o_LGx_2
z2ny;Bv~?C4h`Z@CACR2P67ob5j4Wnr1-^BA-F%TdJp}ukA4GO15ff6{f6vX*>6!^h
zsl&g6g7NB6)DD{*|1^ar5MZf^7y)}0$n}0VjE;R2IkFt+#GShwR;}R?r5`7g{PpUl
z127>_&#SXBzjfnh2R3_YkwF-$i0AaVzQPtWX55Nd3cpDY4xrD$Z{ujq4a7G-REeL!
z+#Qr(LjPr;i2>NKe57!afe$V`o9(ThYq8}*(b#iaBqF<b6-D!~;nDU_5}}2tYVHmD
zRhiT(nDcCfHoqqyQdSxtIk$a}wL;vyxG0EOfwJdj2yr|p5VkJ-xL(krgsRuQf20O%
zRf3LQ|Ixi2Qw%xkTyna<;7Y=ez)>OU_r=+@)Zxo#r13iu^Ub7@H?@tCd?--uaunbi
zF4C~sN|xF!XD}SZ&?+PRg<-W9^Ec1cg!d~t7!v-bC;xjsD^MF%F?s-+j%XjcQBoTX
zOk{#IiZ@m*D2AC1OoF_eLdpzDBMyeZN^J~1c640>5gu!&-;t>&Y=DBcRWxzM*TEtP
zOeB|pl6fIwAv7Qm1RPIfwYzs<P>!uxDGlbIctC&|Begu1HPX6Hq)rX<?55tOoW}_q
z(~@>xQ<krwGUz?7N39I)U!fqcjsFOW_K(KezMrl9s#->tWQ6MW$0eZ&AsmXw6*>Wn
zDo(YKw4sQ!t5RHuD5QDl70dsDHT#%dLC7ac9ZE~q*!|nK7`H#|czlylOE%Vi4mC3K
ze(M-aB)d8o4RL2c1@#1`-ouVZ+vIOd{UVafSZV0%0nqFftB%8cm~e6Ov2(gbyBRJ2
znrgxFr<}7@1FgC@daI*&BEu2j3WJ1E#t5z;9GdbnBHd=KSZ)b6D^DXnX-qU^ZELI+
z@1zY1PwX}k44MRmWF-qr_&GN<&k9w<S_h{#p3g1A<*iz3?u|%1&CF4&#~Hc|BPeZ`
zB_cs>VN8xgWPJ5Aau(&M&sENQ*T<#LyH0WWud?()EM%FJR(L$FC=|sC7`UgDHMsvI
z1Ybn>Ne{)Y-I|-7V1fZxNULZ`N>Qv41ssTGuW>q5Fnu*Dl}2j8D-!Xl$@At7;R$Pq
zpiENEEsDK&PC_?t@XTDh;p}^3p2&ZsK8~cS{)QQdLL4xKez17~WvdwNu#BKTM%<=9
z?^`A{{(G-q?8l_^cy`t)tGX59X6^R770yBh9$r-vtkid>d1=9Bp*^yu$a?hDd<2sk
zOEKqhgCFUeFB44#-K+K9ZzwUmDwT=d5iit5#uVYUR+q+V7Qr(kBg+9p6PU#dmI9CB
zOn51_(FN?F_LlVh2G3IJpos_B1dv}szaqA|_WZ3itReuW^l+Cw{^;8iB3z1`FoHUL
z6(p_JnCbbLy$^7|uYL+E^kN2Q^h4qXxm8-N4kBw<g9fMVt_UDHwm0fwVpX&j`}*(h
z1wE`sHG}2^{BgwJK$DXRFmjzu9&)!icJdgtR3*sRWQjTZ6jz#jjxjFUJuau>+-c-1
zp!;>HB%%p;ZqL6~SF3T}KK3D;y4kiZO0YuqNnj6}Ws27Lh668i)V_?%rsDFr70;SS
zMpm$DEmrC$$Jxh9Ai0_H&&KMD^eS_e{x&i%U;Z6fYjNBg1OKzipOgWcjI>yKA?M#a
z6Zx3~`4R;Do=E8(07myRG^7kCT-V>57}CZKN+&Fhb<#UzVuerud1aF@kQrLiqIJC&
zIUF3R>bV!pzh{#HVn6^!_#m`KQ3y_tvhRT^ec1k&S)mcv%)Pp52Z2$X%V-ME8kjL=
z6)Vrkm|cXJc)#w*ZF<@0IO#U~*qWYbwXw>4yD>aoOA8?AbUB*xYoH0aSa0B&s8d{q
z?^}x?mY9|qa?~`dqRZmOz>an}oX|=|H{4wqS0K;f#Suj^go!e5Va;;HmBJ!dWc#IO
zZLVI7#YIllpLNJ|@E~QTs1g5Fe3Fuz))o@FF`uSc5Vz%DDci&a9P9TDb`8p#w)ltt
z3r<q|ym@aW@BwN<XRogTOrWvy{7kAQtvpCN6REZWP&u^<eE{0&g|fOP5!&I~n^&v8
zDAEB50&Jfk2z0b}n$#uvr*(f8WUzm{o~Z~C)a}MP5m5vor<wwsmif!fr1r<a70O|z
z<k^kR<8`r21~~W#$`WfK?TXv|b{2|>?49!HS@ic(j!M;ZE7ARQD1}E&sXEdTgreh`
zWfRF5-ls$hY#LE%t)v6vG4=UCTg0OH6bY-C8STWf_#ODb@JgZah2cElUZ$|An%$c9
zM7(q6d+ZjLx$^qO@8Dk-_5aohPecHMo&4!UXvo2=%orpE!9cY|<sl}^(SGNcLQTFH
zPc6F{RuH`)U08K)HnyOS2K7pgkvj$=9Ci%{fg^z%Ul#~fc6u-_{Go_5x9W$M_meBE
z&yc&-D6sXtJI2pwl6yLEMz>gZOw~VI@`aD<xu-LRuE}Py?_u*}>p+=?w2Ndv4)h_!
zURiyaeL=GW7=77pj4+Ou*YmnSNQ~WbIn8{5VZ)v88>_N}&kMVY3+(SGo%W(%PZMXW
zqGDnNgsfUD;YA3%*{`mzkJ7UaG(r|%EVHt$q(rI0xgXVq6S>j^!PyMQ2uszbcCC$J
zXIg4+8lT`UqnQ7?*iUPR2!Km4T!R@U5y~#Uv=DN|bv4Iyn+o3L2vpakPTI}I0fDo_
zSlx6x#GAZnv|ewu+Z~i^#vi<@>b>`O$l-D__w>X~@i?8Pn1a2<5G-2Y+kS0N`&2xp
zAB~{y$P?4;@)Lg(u|p7}kG=n*hnoAvo0m6YROB7s>y%I|2{*xqP#xU0usH<k1#G^#
zMx(Wa!`<uY+-bSZ`MOJ4DdxOaNDNnzIx>6L^QkpoEIdBG&Da8wl;sLDG!V%|QJ35K
z9sB00i*swK`u-Y+(FI>IrYwzkSvW4qkUce)KUSZN%*gh5ySQaDVT1^gw+>RGX4qfu
zG;n}o4IrUETSS;XAX>7>kDX+EFU#fLQ$1WthNwdgDpi`CUs|!Pt%89DfaXI9E;c`i
zNcTEBe+@>RFhI;HGw$x#-Xh&S-)?kx5aC~VyCPtNzej0N`*;59pV{I;($et-D@u2^
z?-c=*2?MlYPQqy|TMC220+lQeix12}Xv`b@=roEFD9Ett=M(q(#k<vzbhj}%KsWld
zNkrhASEg)Z)gWRr@o~*<G`}`mzh%uHMz3OJQ7BiWsT@OumrC5J{T^E$VgK|W`5SBP
zMza%HE^=8p?jHqxcgCyCw93p>=5k3&SS)cRx<hW6MH2p&4*k0@ZW*dq^COsxS7?`c
ze^V1$1W-`+!Bm_g*Miiqmm8$LT^KsgP`C@yTC*~6@I%>~qGb(!D_e^n8w5(!Z{Ysd
zj&Nx-JFEi7dFGV}E)g4kuTc!D`oK!c(gDy>>EuEkwiiWrRPER)fsOHnEoWTKlH^6v
z--x>m>kcu<>vixk9cVrqJX^*5x@J#I?WnQF!0cs)rO9IHiNumIg)6wr^M$gI-fPpe
z4yJ}Rv+*jhl3B+0S%=!tJCt$iaeJ&cY-;hm?%zK!cz-%OtMo#yE0N&iX|#AReFSWJ
z)g32Bd3L0;!nN6Ze;Jpuz;j{!EuO^s(Q{J$Xs52hbgRSnlGer!dvvA@;T-+8tkdD(
zUaZAT<c4^AdCp%pK`O=g0jV!3?3#V~#y=`g*~RmoS6ctCL<W!mRA4{AkN^;?xM{`o
z<%$kvK=u`BD^*%9IE{Y%9cgL5V|bs%Z_+<5n%=KtJ|1(BW1{upqERHZ%bM?6u#~^&
zlYW6Km^X`og_T?Q(Va*1LQNfvj<U*1{(}O-{B3JC+DdL*IwRP4xpC194i>IA;C!s*
zrw~ZMWlhA)nxJi%@pIm$4xVX<m2iSEln5wxdYXwnqP)6%tP=;NfdR6Ev*A11XZqBQ
zMpRS(iYeo3C=Ex%1W>dm6{r_EhFq3J;#`b#EXO5765ZM&nO<a|A_EJ*E_FCG*7(uz
zYsrnTI&riQ&F980ayW^I`xc=>cw(0Vf1^G>F^P4=#$9q_Vpp<wB~2ZkB`g0t-}U;{
zjkIOy=JLE&8AM`hRpG7bjzq`vpw#AV3yn(MkU+qnh!GJ1oql>CUMU2hliYbts?AQs
z)t>BcY_N6%MIQw>q%)EMGux>18hwUyapayv;J(~Kc<_o`=9dr8>?iEEt?kDBp<9mV
z-Y1L0*uwyPkb?U~3C!C%bq?wrE7>SAy2|&5aI!%$#k}*aVrd}Fqkx6}4sK@TVwF}U
zm3qfAtBgk@Q;H!~|8D64Yze11G6H-7BRr$x5l3V3@5}UP62_NU!CIK@-Q<t(NAN#(
z`HZM)?+v>uhU(ZU4~rYzsn}c^_QZ+YtT4v5nmX7r9Y@UPbH8lz!`NiCN7x+xosSQ5
z$Wc0aw?bXnYJx148cHlkuf}2#UL;?LQ6k%DNdS5gjwfLb5N%tQ_<OKl^Q+2BW+%|8
zB-~dOTE1b!Q7do(K3Y2$<}T}uVVd-t=5Zf^e{+g%WnzFNG#AY7aXS4ePvK^R#bU?D
zMrXIh#Kc5oWTbCO_?oRag4nK;xKd;^5O?$JZ%;?t&7J^=?WAezrk588q7=)e8ojfX
z2Ew+IQuDg(^4;B#t!B^jHBQ=>mzTv#4R#KWX8U=}VQa+3?SvmVTuUq^EV3R)Q#}DM
z2s=s^-#3|B?a>~Y>e0q1!H~U^ng-_o2<To!xAciAGHEJ?>X7LTWX6^@pnyr!#|p>0
zJM>y!HisvqmhBNsth8NU7zYze@iuQM_Wc?H;)Lbf-EROAMysd{9Emi(zazRLYsgf`
zQs9@|<5W@xfeLZ>kSIz8vhg@%X%(&G2t{diIx2E%m6{?nkupEd3XNWp^mM^~Elg>o
z_^4#P3{$%I?uM$ZJ5wt=n5K)Ez>Cbko?T9mU)AXdr7WFPEs^|E_eRjh+8SQAJINev
z+9Ni38b$XDIjIf|aA8|$PcU8xdFxl#%iTq7kGIq1S$Y5MAy~7PWlQVT#<OgXb0Dck
zS_~j!lZj0C?H~-<Ob(!TIG;m<p=ct6!E`o@&Z}`B)B*g707`*W8`5DHtCzkv0^Wz$
zo1vi?p4+MOd4qCWsHt;QJl|+gThZ_l$v^FZ&&meuPiO!i?e9)D1&>p#FfJr8zzObx
z9f&_)>%59$-^|FCM8LDYSkk6eE_8;;n_x|Wyaium`}uA^VyEXTkXxHP7#FZ+1$8e-
z2Pp<FCGG|kV!R~NW4e!c>_|_RK7ZZUB05AV@e^)DFntecWRt-n8EhSkO~5O{*o#Y8
ztM2u8k}Jc%A`t_aF()DfYy`NCZ0O-_WJH3J5CsM0k}vq8U(3a+`tHH6SCpyP3YzhQ
z<<?+H=Mlu<ZOad&y7T_mw$ScsqfLjyi&iQvw(wJy%LbRjy=(k)`C?>i9owR5=hFhi
zr^JFUc|kQ0U}0G^&b(gk8`NX7zqS&!khyYugdg);Vez>iiSa#eU+s#q+bp+xy{Lpv
z)fIZ+lVz~LqF`SqrM|76WbCyWeWnv~f1?FGG;_GWrg^GGfhq`Run$-<w#eQUN(|ri
zecJ`e<+;uKs6+76{4`SesdyvRK)hoo{ea%k$(Ww#3~i6Fi)~hSn0}Lz1PRpklt84T
zpdun!UEVQs9T5qn5QvkkeyI{wPL|YX&Bw~qVeQ1nJa|GeDM4`hRx@v&xCOPY95!9o
zJBc}3mB94Hub*TD+<4x=p0<<mUpK~<3OIEb>PsDC0yOH5yVrS9iZ(D*Fi?;LpVz@$
z`n*M4EPfkeG#nlVYMOt6QOPj&LDrzA3&Gh3;&R0IYPGu{22UVObWrxl#^hfI88)46
zul>_EC_l>QulxJ4#`42KF=M!|hd~gyK_B#wrec$~O$9bEs9m}AKRiXoWAF%kt_D)>
zeO|Y#vK~C&(<Wo0s{F9m&56xQ4AtPPV4u%(UH+>YA*|UWjZIY7&)OPc<%HC~<>!fu
z8e%jOKCZ<6hVQl4^RB)*O)~Up|6&Xsw>@Df+qaQQIU~{0a(iwi615g-MmYFbko70J
z+J&rsaYe3t(kScq24kmsX7Sh__H)ciw$@M%k~XW2MkOj%vT|K_H+_ebQa>i@wY~Sx
z5-6gNs8wyksjj9-amDQ$b1y)Zz8`8ws`X@J*lauf3k)Wp2b@=6guaix48WkM+QTd?
zESC+KM+CyG8*wCJ(fD{aFEWqc<uM7|)cRv%w;lbSgeS7T20&dB-dZo!z<&$Zh3Wl$
z-u8jN%9Oy$ZEuJ>O%<EJ9q9+jH%1L?A1TETt=(k3zTS2olpmQ5py-#_ntHv->Eh+}
zwsmyepPlps-@pi^5lHCQB%)3<kawQChQS^u_)P45ksb{}lU4YFC1{llghdHO(}=fC
z!wz&+4l%#X*+}TT_Xc261-!YxoHczNUuM^SiZhBk;@&m-saSce9-Pi*te$G5E-5HA
zx?2+@QMfN%nJjgw&jQOcBqLlCfJb+R8O|!7biUm3w%^1X8-E%^z}<9V;(X42=s#Cg
zfq*iT55|d(ij_jOLIFTNpT6w{P#{`nn@r?H;$Qrm3Vjg<I4h1_f<md43}14C6TN3`
zKCR!KT<|}C0afhF?t0vbuxY;nFS$9MFW<%h(uT=E=#{LsXRQ?nhJZ31NnH$k2Hvcm
zFGEQ<REde~AprTsq{$YWi|;))6}DXl#0Lj}ONZG`Q?0Ys!>gs(S}WPKobd9@HCrBh
zd)P#9!t$OHKs*UwUtA1vy)*{d_LVGBgz}V3@A)@`)d}yOIT`98m!~&`dba!SEq}uu
z!90{$)_8mOam><nqoMP1$|46Lu<8`a4NpADuzIhhvEcRo^Yi`Q@(O6M$A0!Qf+?lb
zlB$$@dGq>px)zx@)L1n?Ibp4;FkjG8hxRp{)r}qphyG*M=DjDp&a6=2pdnxOrgX^J
z=glGq$JSv;X{Wn7u}I}7Pyy`Elb1ZBwvLuJ_wk~x3IYbvfW<%2tx=3$ItK!Ei@}d<
zEjq3R_#}uQrZ^&Mn88oCc?jRbawr;am=_cZ>)574JETLT1UHpw4L`iEC+P~Q9?e7K
z7^+2+zE1?cA46@M->N51;YUeif-m|mX)~pgc_!AOUp;Ja)T};Un>>Mo49U32G-D<%
znk~4};CAkA)7!%k;7<!ViMXczxz@H)Jgyg*7mXCBFP`|RR-7+QFEL6ORA`|gorcHO
z#%QRDVrDPwW{3h7bkhtTS$(hy6qv_<_z3l8)yM<CwXM+&L$=Nv^%VVCM6n!RIwx(d
z+4!;RrC^hf<(m%Mg3U>M#gC%!I{PKer(K^^3R_v2f?Sq%^KMG1)$_-WiRCc1a?8D6
zjI0BJ_tGR@SQT^!k0=u}H@!?FLyz*0aynF*nlb;1DQqC=R-g-3tQ&tsi9yuaRB;BK
z4s)xY%3>3Gpp#hW<4k8u=flIlcK1;Tt0^yox<%!+5!Jc_oGgL#28LKnCb(TM0_bv>
z$xec2$j>$EtPIT)OpK2Q$huQIqGKSfE0)kpd$)ecRP!=3-`=;`=!2_-&gyvFmv25V
zG|A@hzG3)WVQjwk`P7*^!r9{saf%{Gk_LYj16<2YZ~kZ84GiiH_RN%@#DpxSFGOvY
zs)jv6wPm0;QdUkw-)x%>VKqh=<;Fv*71wBmzg%Z$`iAW?4(TSDirE>F|Jgp+MeH;v
z58g);ieF4t{~3jwF0yAf6$@fkLTrV5;xfm5<Z8EckKHp3G8@oXpHG@{>sc}hx7zc>
zqL2f&7=a}G(vShPuM>=dD_9mp2y_@Of<HW`9&7Ml_>~(-U+98ZpG;|RWb*ncU%vN8
z0}klEO$^T_l`1{8oJu)4xdLp_-?=P=UK&4k*|E2}vr0>`WLPe3TwO(Qe~884=VX@`
z_~)CVHMt*cJ(ebzw99gEea;OH9d`$#<b8-h2sms35WDu-awEIN#T2nBE%T!C;o&HP
zVc_K(wY%r`Ey!Y2#9)(Id<b%|sE}inrj;jLH|)JMe=9^!E?}wgn7Ct(<UPpuURGti
z%oqv2g<Q%Z?-KoU1hR6>+?plmRvI$D#2~Xd*%i1_g_Ig;!4dw@?5bqx4xaBJ1a?bt
z`a1S>9wlA=2D-!@#tzQ*T-V2Wi%NU_K@h>2@JW^48vcrURQ8pY%2F!JV>HQgY2zlv
zUvmemtcVFg7k*xbx8X{k>ez8Ax#E20E^C93sHiQNwj@s+5NZp$Awz>I0qA{wdW~!z
zr%O<Ul)RpgkdRlul4vZv-i(@HMX>Y(%S^@1ksgWqcV8|%0HE$Fq@AX0tGcbbVliDe
z_b;6o^UvVts%_H?Ym!9-YwR+04;vmAr^`QW_k0CkuSXg9K3;DQCuDrJt2JxO7T8UX
zz9h$-yLrwI(kUe3@_8eB;}Oo9rdqUx`t$@5ac&N7;%st#GOCFOEOn|QP-h2B_9dku
zGn0lg3Onex(WTYY9_Yx*s<?TcsXJ7FHF>dQe135CXqmP7uw-7w)_nZ*L=tIle?e;n
ztTNr$1JNX*G)TgMksT^43tMD@<T4Yk!Ggt5sL_Ix%ye8vzVbtCIX;FB8&_G;vO0P_
zw!1Y>nWD4R{BXe+k&teP7L><y^M_ax472Eolf3@`ERKi)T&q%MG2Hj8bSRC4;ULfF
z(q6}Hn{U=X=fw$~AQ7N#_vtd4He5En$eL!(2?M!e3P8=adI6=Ry1@-}8jX4f?H~~F
zZ{s>XA6tHQq#}c)gdAwNDeP7&tQ?Fpw_I-xejx-aZ_g)uJ{KJb<mAQtJw)dGX~ESj
znUQ#H+;*MteBm7gr`4PwG7Vv70^}4YV(`&jm%UPDmTN8-LQxBrs59IZ94yb^@ARLp
z`LkV)44Rxu@C|(%mvcRkfZ@6fZ46~}Y_6XB4(S)}zFBsu1o#6ZDQ9#!mnWkmXVVBp
zzzlha+m4AHfTm*jvg8u6qGEul{*d)P&YpJyZhJ*~s8<Vk*GmACMomd!oHDwsh}xb+
zj&;Qh{o9^Sd=eZ85Kd|fYYfsOZ65@@LICFVY%FysE^Ubw2Jny@Eq#9gAJeC|J_`qs
zY8z$)Sc0!;8ReCw+X8p?kQGIRw>ncxsx<AkPut8b$GyH@sQo;EPt~8&H#E^`Fo#R6
zO(lU=aQ_l|XTta5wn#^k55L0Lzu(C_hQrhB@qmBS8NBVP?O|&>fFkD!o{aU5HjEpR
zt~)5MSo_(BQzj`__lBtTjsCXL>a@DI2CL52YJX?rb=K7NaCXsC)`x{{*gAW)wL87I
zxM@vd8T7Bi{XMW7`Np7LK$KfpfJ=F&O~1;xq_<LZc92#w{Q%q8yu>fPT(_OcmBk)+
zG!-b&vILjPT)$@p+gAS@)8H2zH?vgO-d8RzPgdk_i?{EPDBr}!3B#0TYcy%3%$#{q
zwF0VH>L5oD!**fS{J6O4evu95Mn0M%gn!u)X9T1emKq6glb=ug%?3V0Z33dNVT4vJ
zQ8-#fU#maf$4Jg=SibS4a)4cs1(TcF*mxvtwKRyTlj<^Ro>M?W4}<sBs6Xg~Ut{mL
ze9fvVjV7Q%EDGU7eE4b=N)T{ieN5ZPcL96rv4-w;zMHfkOq`*^ai>CafcaodPj7c*
z-l3RNZ_oK9yo@i_{0r=RQo%9Iy?37_myK`S!vmaBxw1ITk&HaSz#wYbe@BhvOw?Yl
z{M>AL93>9bSRCdC7!=BFpU{47)Fz~eX(*%;li#Aa@!%Cl@m3OK(oob?dWmCBy`{^+
z6_{S=<R~K1yY1^&h^e>~GpCvmesis6J-~tj@O>PDg9K^{+c5`RR>JnI8pKU8Nr6mz
z(5k8&j@|FQHG(6)gT6kG;&rvdx|>!64Yj6j-^ugf{=-!i5@qod01z9ipi9ccj*q&A
zTzo5kHXL(_KoTsj8i{(h1dIdY4|Ac@tXjWfHR`AmQ+37FBmcqJo=i8udwE5q#pcO?
zi>1!Q!3ZnI^|&emL~6b~H5R`LVZy2PQ*V`R-j>$I(Gl(Q!wvn8u9cYfQMLGLd<_;h
zwHT%%^FAG(8tkVN<pEqRreLAtJGD$iSp0sx`;YkTp4G>;J&n&9pP!tFZrGiHbVf>_
zj$j;_+NiFCh&E)n_k`kS-u?^vT0FQ)x(bdptiJG<bvS}r;*hwN-?LlUN{f+xR|-G)
za9lSWI#R*~l|9<6I*T&wu|SciBnR4|Kx1TPF>cVTD2=c!LGhO)PiZ%5iPi(rJYn96
zX7wsE4mW6Y?cOR>Q7j*5IcVB2<Xz1HLn*0uXuk-%e+lw|7-5(Y$PG*0UMW#=0Z2g*
zPmpM&Rcee7J1tfI8^}JIHcKE;W#Y5$qktPy@|h}`aKKV=1lhtN_BdPl`Q>U}o|-&G
ztlBU$ZH+aUHm*JTGf^urSU@!EZfSU-T}eQm2(9vR;2r#%su+SlX||$vqo%p9`9g*a
zE6B<4p6HESL7ZANR*BwRDa(IgH%UY_{jZqCQJ6I|<HXXS^(X5vLxvph+H>Mc1=NiC
z{e>(Xw7aS`Cc{7?GPObXGx$A~zZsN^zd2P4X7KxA99p1c9GuK*TedtN?{JHZkDg_1
z_Wg_6jKDmg{GJ}=Igcsehq`6e`Vxoj@y|J~5*z=<dQOCwqh~8lnVATm1@<5J`VTew
zc}T|kM3km1V+@Y}?>nlBf6${;?Vk4k@6(umM!}!RR-^kMJBPn+`hV!4exg{Vpc!)h
zwr2i&3EecJPlRonm)@G@|E5DG@`<L6?;WN1J0|{bhE$Cq8Tqu0buige{)fju&;|)4
zUGMh7%j=Q<v+V!x?B~*%c{r%`@=5$k#FeV0Hi{Q6<W4aatBr<cR!(IYTRF_zLKD?}
zB8;Uet&p^@5)UISV&*w@otxKGDbyUf>>pj)pp$H@AN=8h_dfb(#`@V7qJxAy5cQ0#
zaH*yxw}|+H(H1m*F|T><lhPg`33EV0VV+&e$&17{K}Hsbs>bBcG3TU}R7eVKsWr;%
z3B%PHu@6tvl!Md^ha^r$H?H)9wm}0=2#$?Y>YJ_6ceY;w@!Q*zS4X2z(*E&%*Od0M
zA6GY>72eb-7z~%{^(C~JmoPywBFe5wpAxpV22IqAlNc`)!?#GAV%2NEVh=K!bWodM
zQ%<VmmOz$?W0Q&a`KVHNC^Z_RM-D{v20ldxJkI?M^g)J<OqOMKj;;@A!VbB#HOk9x
z^9$|n!TY%F$)Ca4HDtGMa2P4XRG5+4h#uz+ZB#EM99R18K}`WE(JmfGNT&3UZbexJ
zANmb<2V!|oyK7(&^zpGB*LRP0-S063yC;Lr+MmO#?m7W}A_0_25e3~=y^UH@Z(516
zypweB77`hQCHGmSSxV~$^<;~%=cZx(&mn{FSX)8K5fYf;5h)p40vlgw04=*EmFzz2
z7n<lmrX|vN2fTt>4AmtKP_s@r(XC`L!tZcG;V2Ty<COPGCH<{uOKfTi2kljl@~&!r
zU+z+8=@5R=c?Sfwe7PKeIRkN_>C|U_IFdbl`F=3;smhvmMEUFN?_l8Xp#-yv72;X4
ztE@7Y2B8_n=BmjgX6DJ8iaSiSOEcF7dC1eN;Ps@mBM?Y1mj)xM4j~?=6{tg??C)UV
zs78vG>VA*nNq;6v-F1S#ls2SAwXBW3r>v)x=k+Ua$`ke@%#udPIH4>(*s-9~`$M0w
zqy8W(we?%8jU~a+;dszZI=h{XCN;|t7&vewQ%m%hzj(P(NT3=wrY*TidM5TrDE$P+
zMXv_tc0C8~ySj8Ei6IN3b76thkxC=@lvDx~7I57<*vz1_z&mL&5by8{@1>{FE_d%@
zEK_4DSfc4s1=Q4HJfd_l$j2u12~YOO?#dXzZE*9vDvRRNTi3^OoOh)Pp7_|3qLdkZ
z0}qvQPckGZZBRKH5;f?Roth{l<kt<9@WYRtugX^<&Q?6uKSrY?Hjs6inuY!<`dW~G
z<m%SgH8UuO*3&R7Q;%~oQ#PUBSEs_tMB><qTT5BWnoJUdxF#h|?WM4oo|CWhv6pnN
z{0LR-!nQ*-+pMU#5R}N1dafI0G#zK~k=c8{vb8zs_0N#93LA(nrcnGYze2OzG#GWz
z+o7uTz%89pcA0EHRaOGu%AXh`@VKnH7h=s2GPYj$BTz?EbkrYT$Q;=p*^_olB_#VV
zioM1Yu{$7BVUQ>srw2vWkEH*6B)rXtN6JZP7H@szT1ml>b6}S~7}AFM#*t@S&7M4b
z8Wb2+G=cehEFa9fAAv`nlFd_=Fm!%9ie#gO3&5+mpR+Um$lW?D=1=SMvs*2itt^!U
zDc4F32DRaa_DJwOi+OO72lNa|U6WQ%KaB28U3~<j!y5E*J+C|7K!MTVnR&#qFD73q
zae&@)^0F=WSOjDAFHDdgatjIF&cJn$>L+Hn>-mTjI+8nOrL#w!m&*4B$s&XinythN
zEMtM((qGpMQe!b~>cGGRdSoMDa9V|`3QT+9Q+Q(Y#LLlwn~ZCazc=+qrr0FoCH>AA
zoBv>M{fq+miw$`*qvJ-S9EH@B8v<X8>nO)q1N^f_I-$R|z0y%^_5cGzPQT3Zy)6ny
zn`!1A$$;4uQ{Q@#D4^b7$NvdFNWi(~Hw)^dJDT#WTh7}YMN=?Lr(Iq*jgve#Pfiw#
z6cRmG=(K(IIvV=kk?;yc#<ctLzr*nR;t!ZH8Czwk#QO*(yd<Wzn&2=Dhy0de4~F&p
z7>Z)A-TolwN5TD2<*M>~d7yWXFNI)wRr?&GDRnDrSX|}MXF#xk5sP;^liQ#=I#xHy
z`lnJZ6$52iMgsGt?d>1h&>htx(dA#iQvc)g6JcdVtiY4TaixjkgK`glj%wbvK4qx9
zxQ{{+g8;kG%P3u>MCExTT%@5}JNKFzN%FqEFT|nl`{4gVEK){f2-b^jL-`1zSqrjT
zXD;jJuj^z7VE7DWx>!WMi@KlRInPY7MBl1JDmcqixF4Qb4Sb7<{Blb?X4FDV2>X>S
zTO|~CG%Pk2X$C$(4(AFE!=Cwl#%27knt36+?Z263=e7@^tp7Z#*-Sl!lH=$d38%n2
z$ST{=H~Jjll+gE3v`6O=7u@t`K)6dxYXECQuOsRR_|e=ObKNIQbYJ2A@nsc=@}zqf
zw;0l-*<RSnA$-g#pnSv=px44SeP&4Q4e}X_$q>K;Fd9vj>l?L4B(<c1k@PG9LhG;e
z(UTe~a!e2Hk^Fvujw$`j7$vxPjw@jU4~2eF@l@A6oBg`aA*HF}7*~e{z-((tBq29A
zpea=WfhIPkA3ca0RYH=y^e=Au)1gxC*c=i#TwzCw@%pL4Ds!R@CeqSkfwi*K!5Ts}
z(_(<OVm+g?r-*_}aEoq2NkA#2$;Xw_pz(ny^4AE4(f;#$a6sJ4f<%LJL$|^h1<$Up
zEWga)3uJpka@`_idMkf+ezIdjkP0V^OtrI}9vnkB`WT`#lS%?QdsN!u?>Gd&lZw-2
zh(iNy-BhHyway}d*@Pmw{0+%ZCAvk2C4+69;J|2rDTh=qfWYIbSWEv}N=V|enZoy0
z*#1B+rkf3oL30ZNkV3Z+6B!kVZetcF3cmbn@j$>eQ~t@@Oys}gE8Lh8z*HhA+~dsh
z+q0U~)%3k#lPK9sLK2ag{P2qkeOX~mdtwISDVa@Kk}-n{WNz_DM=1vryqR#b{va<6
z3BoQvdop_DFpZ?e$|P(^lV7YT-Y|naX-(hI>q~dut?xu0+UTtlJR^wkF|yGlQ*dD>
zxm<tsT;8O3mVe+rzFSLxg6<dn$gpI?#DR5n`DNBT5HHd-KdN|RKqxuG*x@zBcTBi1
z4I(gT5tdH%7(ySf63Q-Qm*3o>In5HMko^bfUK5EZ&%_tH_l;RB9%G9VY*t-WIaSeJ
zfL_{)NQ4lIOB&?cp3&L><YpMmYRpA4m%_2+6#_%lu(2TOU;H91A7~&6?sQ{*x9_Zw
zPny5W7R*K(!#5)=YjtyPT1J$s&rbqAQ(JhNC29S5Uya*>deK(oID?A3hxWUaPyiV4
zj;z^{T5+3b7qidB<p<S*lLADjE`xV|KM#G+j}jK2Q(*U<anRK(f|>yu$OR;?6&?yG
zzI=oNEqTxIcgsAuo5QrODqUVjH#~|$pSiLf`l+gcvmCz0RcfVkUR{wl{mVChk7+9A
z##PzOpDCO#%x5iQS=#<ki%;`Jgu{qC7IFDVd`6GodTP-55Za&p8K+YM=<H?raQ;64
z&_FN0ddTr|ES6$BFK&JWik`C95mIkS{i2-XAw}u-qh!>{l@6b~;&E}n?XX<@72<v`
zC_;@B<0rj6v_+tThlQpH5GGW-Nn)MGVd#+|R7r-<sk0V`7dUs(#zs+!Qx)r%t<bI4
zt33Y^l|>uoH_+v<1*N+k53hK$EuLQyU@zhME$S36z*PI92GJ%_s9v}7;`Y<oFH>kB
zZhmvmXb6izTgi;Xy-UHihz^Lh7XVV%gHJ`fu^XqN1OUR$h$Wr8$hlC|0m5W~&;b$g
zDCm&54;j8;#r;x!(f*>n<RsQRKi%>zyxs+z{-6LK7;FMZ@H4TETwxcz{dNsPvFlb?
zE-gPM=k7e#;AAXCSO;MAq=|<d;!><g3I4%jT}@K_<mP!3o`Kt=fTX74C4n-fkm&#`
z#FEwp0Qeo^dB&0q2ADLeVG<P?0Z(LZE=7qRWEA4QROo|%9V$JZY9g9qJAUZkOvqAO
z)FcB3Ur}L4@HNmCY4oI53S%X{9Bf>$1`zzghsd{tI!PQ=xR%0_7RPpt3r+*uQ=AMZ
z={;Ixr_dgH8KBM-k6G`EWTZ?ob48{<r6UYtuQmMlCPsjM3Yn>7BE*(nZ0HAuiGVQl
zj-UekAn+^04@T5c@iP{;B7QbE@lA2lE<k`Bi(v$M|GG~(eb;0n$k`0}hy-uW^ui3`
zS3nQGP@Zo2rE2JnMx$gqNC#^gS`<YCeUutj(G1@YR7Fpd%&<&A)ND2f2S!2<v-B#n
ztusTzrB=HMQ*RoT*?P+~bzPHnGf`l-Q<^^@PfeM`_!timCi8{OMb7Co#niq?FK+l=
zq$;MW>cXhPq*DG6Fj@RV!S{uNoV8d_k|a%HT}Mrdn<v{Xm|!M~j-g!GV}u6DT9zV(
zdDuTZFx#0`bScha%eGsMwwh_RQZ4awJi~G3A;o!^#N~2%a(bdxt3^qIYsGPD7)BB%
zxKI+s$YE10m1pK=Rnth)7*nq~riEVxkdd+kACnlZg?T{VgTMI2kiQPhhN?#7xD%UM
zO7Z-%5@}|lYMJl*AjhiIr)#dNqRa84q#nhdCfN!`p>b2_4D(D?GBgr*n8rX7e^o`+
z$O@qb^*|}O2VFOTRHUe?6$CWTXSw3UNjptE9AfE)oNGyt=9vzlGX+`QX&MZCiJ&Y|
z65}pRgYNOx`)$*MU-ae$VxYqGTOun@rX2<xzaHE_*#vCC<BpBDR9RYn3>;NrS`^<b
zm>;l1R37~@!Se9yrzJ&n3Q{H0vrVjNo+{l^P=CVBbRFJt<HZlI_~l`VB7TcxzBHgD
zIvWBL{{UU_CMW6$%&0zg^|~Z{Aeu{CMZs^n_z|OyMsx)XoBmLJY{DWVlG^3P_(rg)
zdoFPj*r-KcjW_}pMZ7-tE?GP_IId_mowQqu2I40`E7X<tnu_y~^`no1zEhk#PMe9C
zPSh%P6%7=J@#5x206#em4?Q)KesA=L0&(d#3}Xs)2liA^L=Hz4$>8K9MVHTI*&rYu
zeDOSrN0q)!dfvr@EuL1o6Y<Ur+e3x{y(euadxe$TV3{U{0Uo7js-?*ghGf6vHJfeM
zDQTMSI*uR4WbeRj<1`H8I8t59_S&6}Lwr|Jb<nj+rGjOec|i!4=EGA-m26YCLE)kp
zv^3Uq@K=g$m+@esJIGVVal#-4YXmp7Y)#jpIf2Oex~78<$8i9%i*xF_iN;x$1rm%l
z#JGMKVg<WEXR*&T!GK%Mxz^gzHJGd`jkES#D@p=g)xnRA+%RoJ&rC3LzZF=9u4oF@
z@q}k_QY}~L&OjBRZKwtqXrnQ^a%40~p^ho-rf(azZtGr{rBU2Y0$7MNjWzI6ybXZ0
z((E!xlwy8*V#dVW)dbTOhhX24p~$XjnyRKHx#tJXYF{<*WWSyIUK_V;n5k|{mT5;G
zF>D&VpuP<4$Ml_G=ecd$xXm=B1$N+7f!-F-spVkvs&2cjR;mGF%T+bG(`m>`h&PBV
z4I9lCa?UzN1(3&4teKKVj2jjWY@ODDVc<tV_161^(`^zXpg3yWE<MrIBcpf5=lOmd
zwQkwMozt)LtHl#VFX<@sv%^18#M9SZrSu;tpc){Kppx`Kj0E*aDIGPxPJ8D|@yiz4
zGk)LQ?ut67u44mrSM)<Kj;9szfj0D7LhIeWPc%S$?k&2N6pVIJ>9(!-06|);h?eyP
zLLnd#%O>#~Tp6kemIbSZNu(pF%|xMMx+gK<fIr14dC3Sc0*t^-jzAdsj_JU5MPbtR
zL)UgJ({YDub8~YbW}XzywVE)MZn?bfj@7$%-v8(WpG)KL&;R59-u9-V!~=P5c^(KO
zD6g!fs-_yYOI$OvmDEmi#V%K~JV=r-%OR}7gr;c%YYS(9L{kjIv4{ZT+EIkC9$i&}
zP6x!d;zDHAuq%p^DD?eyM{s4`q$At{Qm{;Dt5Fb}s$E@C`~Bbhhv28jP8|LD+i%o6
zYA6NZ!Qg^m{vgF^kb)78jjo!WnSkjAfu5eI_YG9QrwvWV4eF`^G27HFuhpqlhA|5<
z`kRi|iSkY=#dv=(3`<AvM=eM8M}?TZs#}&JLE)7_?$apE(k$^i9aS-G6I@lreZ`S4
zYaw1hI?a|M+pb+v9f#f}@^%R5lfY2Q(9|<1NYI)U#WYF^n6M1W+d+E_o!(t=e!-Q2
zU5Ng%-DsA|b`F+r<{iJ|x^@_}-4du#rUC!54SuXZo=N~I$a;XmSDX+^Gi2|gpl80x
zf0o?5YeRtf@wMT0V~)ZA5MHwBT+oLFK@>EV-kwQ!5v9<Hg%XUXp2#s=GKf7Ew3uoV
z%bc&@J*@Yqc;^$9!8?mH_LO7dsl6huE<T_=i?;Xn<8Q%$X#-JMw1t)!VH12B4J^)n
z(Xj<j#!gzHGvK~RDG*obUbLu#1H}q9QXQ-pZ>V%ZE{p&pzzE#@2tYhFN^0OIp%-Y1
zxpDorEt|KkTXV<ORogpKr{12C`5^TG06+jqL_t*D_tCz$-hFHQ=wVe(O;uUxj6_m6
zbGF_I>eYU??6_W@>!t(RD~x6=XXFo&#(v^Ks?41Vg4cr><NaBHS=*rm!;6EG(}J(6
zZ~%ZW6PO;TEOZCRxT$Sf04)llH1$CoLF^zQJC;r^AF#MNsuH~h@rB<-lEC55R3Rlh
z_V(2~u;25%M!n(3zSHjn;GYqV0_kwJ$fVH~E1r7t=`#4bl%6_u;zuw3U77~qsJ3l}
zaS$c}r09XZet__m-}ow8S}oTO9NGWFpZ*n8VY2m5m2qovj1{yQZWtNJHT*~@JPCph
zwn4GX;W~mdU{s1YDXyK%s%9#x0x=tJ2c)KFbW`xVfa`>*o3`n|wF9pKI5o}y#!s{#
zhe{@ng%bzeB<W2M-Ql?iua8Qr-qbUxY&ni)HR`jHqgP6<swb^>1L|_6REAXCZqF&X
zRVtV9iUQ^jNC6<6XV7dAZfg9ZS<Y_$Un#fn3K3v_e1$+QO*ITf!;X<9z)}`&o8*K*
z<WaCq!Fs8V0#_7-s>z5NKm{7_m@nk}dm*Lmxn2;57G4n`vM{zdqoBw_zJ-NaREUg=
z3&JkZ!@od<=>yd*yeqImg$7l{5}Ej<h`20ytbumZJ2!^X@R4eC>2Ei+IChDnu)t1|
z!@JiMTj>RzT7Wu*qKutj-gtdLd&K3a8W%=@5nu#vUIf6F;8Ov=R?BejzvrRPJpR;%
zku9+9F+6osd39;c$V0=1rq0ex1VIPB6q=+Nl3{C(VpTLSNBBrI!qig1l;bppheMph
zu~jow!>ROv0fuoC{=S)^CJ9tmxWtkJV+2yJn}P2)1MHP83&JoY*aV&lV9#KOAaSw?
z#s+Yt6Qkrh=pEEh+6%^PI*Luk*3|m~FX}Y>Fb_c`J53+|O8r$ZX-L9QtjRq=NtHw%
z=^uP($Ag;W!j9iDy8giagGWys%A(vTo6uG@MGxV1klT+vy655Tdla|@OKP>R|Hapy
zk7e8$m^M<r;&P_xpotWz1c0IhJ|XbA%Hf_uNU9KSlfX{{uc62$n75&8@Kp9Zuy9*e
zbOlb#l0h8=fFy|qv>AL~f)_P8V?cM8HFU1vuH+1jcbAxyb-Y192?iu!wn-;;wG!O9
zp{oaZhxkwe_h{KrHTb;%LV$qA;c#7nGOo%AC_5Ua0zv?YX5+2n=KVQx%dQUr=Ev8E
z-}N0$Pc*$Hi?e_ok`gR@Az)y^56QVgh^c~E;v-&UiwhR~Reb6)V8LI(O)qK?=w@LT
zi9*v^iR=m{++EJs7sRyp31_~5BKNlAB)#o|vyulnt_;^U;j}?S8O}+9{^AfcMo?a~
zikc_rGEsIq3+?LOjo{yO1v)h>9rg=f)#7;BqQ6`i0Y-okxY-fN;GGNF1ZHb!?qiQU
zzHwwLjN^wNe|YB1IgI>myKBqvaDTZ}g4abHwRBl(dCi(LfM77)GQxHMCR;7{_YVx6
zIe$jgGte^frA^`0OEGXDYRz_|<ib-}RdXHQwUjCbx=9E^3IANzF`$n|;F;j5j&1i>
zYW0~Wq~rv;EETAn&}_j#BiD6dS4|Lag%nWux1r5YbAzm<f(;`4OooQ4_E!f>cF8o%
zhCidi*BDM3Ndoqn!Tke)x02BcgOXWwjOvE9n@^oSVJH@SDoizi3-#=56HXFOKK?W)
ztSaf}>oe%<qA*iTP0&r80rw`ye(J%4*Y|b!kysXxffAhL)K2hOQ!#K~Wz$8_FPfbv
zk*T|2uB}#!6qX#m%%}_fIMs|)(csXG8644z=pes%2Y_N81!)+eP5`oPnVOBbIvGKd
zh^+I2Fg0^yY;aY_!<Pon<#c3d*bhDU9nYMfF4s!H5Xw5vr#n_0W|NbXaF*@_KH3lE
z9!u;6q~OXo`wx@bcTET|KfWf|Zs<XHo-uetqdoA)BGTBChd~!HoJ5qgNR?Ia%>`Eq
z96}B0F;>XB2$Ptn$b}4zV+1z^AH_oq>e-{?f^`x@6%-bq=r3_okq!*2#7gP>^tV^$
zFNJ#D0wYJKrJ59KD9AB6h|m_?7}R7mp0<(?G_{GM1F_z1CrsqRAvgjIz97d)`zySz
zd)MK=i~u9R2;BS#kc)+GWQnwTY#jnj#*dAE=Uab0H93Q5xjWt*8(L9qdNcLONy^`l
zW&=)*qP#M#TB`YBC)bo#D>-@elvA?dF$@-rK)5gp(-@2|$A&&<z;#X7$!0JO9&Xu4
z2w@wp3f@V1%;0^Xf?w)s7B=dw$)G+myebUgZ$LUR7#%n$00c>fdjP6pW0GWE3oH|v
zB2-<sRn36_jj%b8$BI<}DfHSw!<&P~s@Y%*2}r3yDJdYcA_0|LH3!gTn3A$*_mc+?
z?LT?;SlAAsx(1y9?!|ZBxof}}MiK~FGFSGmZiaPTf$mGOcyJj(?*N$?A`lWjR)+^D
z8ApbAK{y^Ggb_D`4nvJ8xfOa0xGck!!9?d6`K?YfD3xl^hNawu`*jg@r{oE=blb_{
z2NJ<U8=fgd{ozXj7Y$?uF;ZMJ!A?KR{7jDEv5l?=>9ydfa{Ab*!Qr8@Sw;^ue&Wo^
zRl_5LV{IRCl^$Lm&<Da}+tjR)!IhQ%@|lS<$UOoWNyDa5S(4=0zvAY9Ft4G7%#W`D
zv>SGapp3#fp-Uf$CBrVj3g8-`k%BbC?O1qQ(342y7@{)qPfDyt6~tDd$kJ9Ej+`M2
zEu>A|kKo<~>+S6pg6^WKUDAt7(3vhQI0+gChv{M$U6tC5g0tXPq{2lE>y*BL$UgQL
z|B$|~NF;~P8bom-{J4Iv3F17E#VHz_xHUPLVF@EV&%m)lHk}J2zz8q`H#q{(A>mqp
zWM_UWbtM-@4}<v#OB?JTYPRN(9Ie%^XELROD7hx=97rux+2MhqFMR3AH5<mN<$=Q|
zPk*p?|7&l&LJDdc!J-ci3~gL@*RH!B+pun{t5{*yId**f?RQ>3cyPbxx8s0}^}qZd
z{-+~Hj!jNXjgF4qdFN&XZ~W}%FTL{0YZK?1YuB#3hsv&@%D}8Y^U+5ij*lOD|NZyT
zs|eo%x$F#V*s$Tgdmq@cWgFZW&YnE~<{NL~+SI=!X{~QSv9eiz8vY#cVn(B|0-p?W
zBN2ki<ivFUP-#VV)V0fyKrpGiZMtx-=qvX>{lsTODbgkT)bSG=@7Q|!!~`;_#TlF&
z5?$3+kFLJ^?z?y1wPUz%7%AjVoH+6R2k(A-@Z)-`j@N^+(UGD46<__*?;O~F==7<x
zD@R9g4;9<}=<vRG-g#^9-o4xI+PP=XW1H7*0nx_CpS}L-frI;Oi0TPq{`6XPX#Kiv
z+qUjpzi#tDWq7(Ved5H?5B9!;It^&>6@7Gc)ky!ym!9~V*9hKz_uZ#H|MWorz?sR@
zFTVQHlTSYpC*kvd``kw#ePml@^i8d(wRzKBU-<GD8l4#gEPnXmhamI#_UJ336njN~
zc$5D&xp`NO0Q2LkhU~^3pj)LNK)i8`w2XM6hpmJg7DA9Qe1p_9utp%Of;M7Qio$!<
zxZ?)_{LB2n*C9cu3Ou$zU6JXr$AcF)kTRwDQD{5JI+^0VA6hSz15z6BNDEivf^(Ct
zTPR&Wj)Dpcnoj#*1L$R1p;%*_1gS2)Fp@P3o?c$t$LO0O!}Bac;!NZTLKX7jLIzL@
zmPjoZF$^kR$1xU#&ZFI&vmM*-v~0(1%zCy}GN3_#K*PA<6#!<Nyh3O+kjBcmFanGK
zBd{D0fH6c02+D2TH+p>X`1;|kYuB#*%5Qw-ufFqLx9oa;!>~ztEtlP()5cI4yjtgG
zXaB{2|IZx@ql6)nA#Pu@ee;?fUg&|~4h&U-u=VV3{??rv?y;p2NS3h_RFr}3>)l=J
zcmDZb{)fGL-^WltZ1b=?*1vNl+!zFH#mHMaHgd<r>F?jYbI;em{`aeHC6OXS(uQrT
z@BYE3?*Hs_!_WQjhd+Ry!SL|t`gI$A_wW5K0<ImoguZ0k9b32Gu{}t9_y@!9VQ6sh
z!^0ogH3LKt=^QXXiA2tnS_4VuB;;gOUw--Jul?3<fFpk8%U}Cn|F`{`rF(v>?AA=f
z*|KUIoPl3^^|dV<cOZ5Tjw6QRL@{0utW6s?{^LLV$Bu;T85nj{R=Abn?W^zE{^0li
z_B(I9`9>R>wwDd8U%9fp&e&Fa`k`7cTsymS)7|%Mdhn&SFFm#UsYpWC8J4x$gFAO`
z-MH(2{x|<BmZl9#A!|JInWvwA=u47hU^r5fjKS*Q*v^%Y?0oR2Z~yo|{pUZ2cS@t)
z+<nhu>sD-=HfI09*YAgyN;_-bF}$JGcz$i)dZ;oF-rGEK@Lc3UWJb(h?V<Z0+c>x-
z8T7yZgTKKq5nUlVLAa?PGYN!b^oh&i=gDonIs}*>UmadIwl=iVso6QlaotiGiFbUj
z19A)=sTq3cH86IA@i6d8h^jD2Q?s+N-x(XJXnCxrl0}&pLkxo<4^X+<7Y3NxcfNpA
zlEWFOEuw*4N{$mpo)>wE?I1iE3>G6%umGqwFVl)+3lVqzG_*n$AREItQaFN?!6K9q
z&4J|T$ng_^YMaPTNr^k*nuaiB6)_Ad;-O2(-h;V@L*I6hYaJw6LRukYmrOD>^3puY
zGzS(5k;cBMN-mPdBLO>PiA;oQpz*m5&X15qD9IO4cWM$BMt~7u1a3wIASl|d<+bCf
z`qZh@C)W>eE*Z|wojZ`m;<@L(zjyz8uI<jX+VBw2EvOZsUU^@whB3qs550Zt%y_A<
z_P{+)Kz06&-}uT0d*42O{P4hF1%YuGY21I}(4mh`)TirPHg4LvYg@lt{p>Se8b5pt
z4iZox6iiY<_FS3uW{#gYg8ahg&YxVrZp)Xx`0Ri?iUGTKKltDmZ@#g1?W$)!`&1yu
zdmefG!+jr*j~{I|I$!+aZ(34ClGQi&{-R!=9v&LH|IP;-%?7@ZD$bodQ>|7b%p3m+
zWv)JR?AWnQo7asDjtmX0=yaNfZg<+j=Rfxq5s#vM@ZS5|x7>}46$zz$NJmCi`JLd)
zpMN&=^OCLq_1}JfVq#*&$k3yYKRV#no_gYQ2M-*^#2~Fkr>_sr8`e-|71Bk#{_f8_
zuRS<8c=y)(aGOu<{v!IOBgYO-PftT2Uo*U+=2RZK@6i`t{jT9?dmee>o_kPD8b3RJ
z?C9ynTx<RM^<%46j#T=$Zr!?L$1XsG*hEaKg7}=_{?(ZjA31g8$jQUiS_$7^AjNpo
z*s^)2>-@OW=s>D*9e2y7o#<0fG)}bJZJZgs^L)V}pMVW-Ufu{WKmIj;fAI5%5$SZ=
zaJn*ex!-jl@gn{gu@K<Oap1*1rL`=VOa1-*V8&77oG!=Bxdwt3YUMu2x1A7q9w^X{
z?xg3pFn?EhLRs7~4rcec$WjR>Ga3;BOO-K)lQ3V!fyJRjMaj=WP|}{GGtG&WBPK#}
ztuVx_MVO@sE?i_0U=Hi;%!eBsoWmeU$0@D_r!YV0n27&?RSP0pFH1IPF%r2am9nkG
zauE8Ml*loRNQ1l`;$=X~;dDk9g>Fo7Clo&c+E3vi7ZLdR`{ETwfDvE>ZZ-rkSO+#q
zsSw)jpS<{Yx@O(K^PxdZW|4;9{Kg+0I`Q!j{`UKg)=V4)Fuv9508*h@x_bEN;qU(S
zcW0W@CCqzK8{2xvj*3~SR{PR?q}u1c`toZp$3H(dKHHj$kz4KTsq=lcYI$3E&CvR0
z(}P#8p*!#gHYBT)_&@&f3m@(OKzJ>;H*MUyeryZ;-~Q^qeEY>0f6{8tjgOCGWb$*5
zKVw;?JMY|4uQykXtr@9|VWy6E-hb!Y-}$y`NT#j^p8v=L4<r0;aCo5AX;%gvBzEhT
zLPJ?lQXpZ(Z@=@4t((`EjLJg~?)k|JKh#u5G4jWDKM9%j?DY9VhYp32=h!PSWeHra
zC(q5?yX&4!V_V@y_>)&(eCdT(Kr{~@I9#g@?0I<ipffx+wrZ+A(`qA9P{(DF5_$a8
z_>X?{Bg~dkDfeyKxZTyNXv|Bm|M-<xUuZOESFc+44}SlT`rTD)*KCFhwW-PX-+TXn
zF^InGM?d_#sYy6{Pmdp7y<x+;-}%aKuPCkDap%tC$BtmyK1D|7OZkyLI(*=-pZoT?
ziIYy*LipWlZ@l)*?l1N0rOoTNz4z`1=$JOIzq4laBS`Yyx8K9y0K{Xoxc~&hIrHQ0
zZ{wEx!-Lnq=C2gFV(6a=Ux(0;0wZhZohfhnxa?24l2^0yJ{umpVW&BJ{`|R<CyzI0
z&aWP>jSW^TC2P%0dac=3b9QEG0ut~1od}Xl-z<1-*+f|M+_@9S5TC3|Y2>w1Xu6g`
zp>s5_bqS%Svrxbw5oP+H$T4*2#%$ed&Umf5E+vC~rJ=r3t3KIj%po9HLg=t+Bh~?Q
zUe1CfYAKS}Zk+d8lXB+I&YZQOY{-xb92vYkrP)6*wN&KKW>GVbno`oJPo31{0L&W^
zE8?Y`d_Pife%v}2Mt~7u1eOT`h&T&kkjbPr(0B68iSIxEy}$g{|9x)uq@`-@xUqTd
z#{cG@{I`4dJmv_+W2WBFbfX!xe)y9g96Nm+qiqwD=k_1?7@{&<0zkr`4YwQZ_ut!l
z;@D{<Djpsfu2yOoQKY%|V$8S!+nhs80Y5{=38auab?Ov6anGGQ+r$ibEfP&p2hnQn
zwzqQS$~}AbU|Fx%Ln&+}jZz7&eZgIK-G$aduYK*c*N}4!PSQwb{PItK((KG(K0B~S
zq%wmu9+IRL?soVd#7dDj38^(gtE9jG!P}j<g&ydUM;?_G!|Q~1-F3GiLF>_9dF2(1
z1Y$-yNs1B1h9txAuUNf$H5})nG&pwT_}I$Pt(&)u46mG>sRywiO8)AxRhUN){RKuC
zfkP|kKzg5?I*la6$4`tyY=*Kk-I|79$C0B4=jNtQo*EYpzzV>Dj_%myvEjAoJ>Ea`
z{=tJE&ovMk=^Z(G==~4gX{VUMJ72kS4B0Rc7Kz9`1mz$*$g8ja^u(#72*~RMm@LqL
z@4a_%n_lkScke?gQi@3W{r5i5i2~^T2M!!S&I)j1AP@Jy!ugnk^OniedDYEi{rIXO
zyP*eUopz_zS8~cm*lEYDX=LW<v-K4#on~`ZicMLvqfC!e%-O4#-BM*>$neuY{^sZ4
zVD-jZ)6f6pr+ymuSIVm8G+RE*6D(Ix4yI;;<oc~SQ%>)_YwMr<vnP=L=0`8gy!_Iu
zv$Hd{jX^!s#X-b_dz6~QWBQ2m&<Uu39D^nIS6w871h0Pb(cMo!wGv1E*`GZ>IU9+L
z>r^d?L!_Wbx|=-qM+Oj-4VgOE4SWBA@v>`%ow$EsxG~qpq(_)|6m=SN=iE{@GTaA-
zJw1uZjZ!Z_W_NO;21Uja?iLggNayCl2rvSSz|D>T%%)*mm_;K9!@&Uy8Gk!L^R=J9
zbng7=%^NrG*s){t>aB1B{>oRLJ#+Tl-hCf9c7=?vmK`|qv0b*1c>?5ec5X(D2Idfk
zkv!A)!b++7^b=2S+q`@A*hWXUe5nIT6$Tc0+u)=Da%?)7;DXY))f<!b<`gE-KtP8a
zCL^OGffU$A>G!_=d+^`wqz$k^7qfdvNSy4zcQ{OZyKH)CyQKCVK6<cJc9l#kl?Uo`
z)5ysT-h6K2EathxNFu2(u*E_-f$obtr6oc{aE`2lpu@M{{>5XvpQxJE4ePhOzi;p3
zPd<a5W!9V9x9<bPP%tesd>D{|4X(+n$3{vRjigz#vv2;vAA%?&IIrY7G=i!i?j&+t
z({?oE+(5cHa`030{fGAT^_!R#1u|00CyooJ;MrEaI^dX=hMX5MUK8K}q9VIR<XVHa
z1m_TW;_Mj=M}o?hkdp(mLx-&>@X#4y+!ZoBqWB882;lb7!M$c#S5w8U=wZDzF>~fj
z^VEvUDugXoDt(wnWZk-TmZ6=VIX*KpQ;5f4#-PXrKQ8F<&Hn4;_FX*!%#W`gvm1Iu
zwc<oUD{gfvM!sqNitQU#ZCyXQVz__vT63Z)9Xk^rIX(5k!BhK>o@sU(nAB798tX>z
z)-HW$lQ0}3{vd}@2o;UyNkXCg4@H&<GGJL>#c9q=Oi!M1pfDh~E<;kDv{H7VOu&4T
z)w_%b11K_03XMD%Fp2m8iri^=nD7ayj?0d|rjHtctHYC6F0VkIb@&`ZU_i1VEss<y
zx#!*uckkNPYWZn`^h5Ggy^|-t4$U|WOiKDLWB3m<p+2@}*VZlTYn7Vk%isI%3&<Le
zJe45mibw!N)<<OSSe%Z7+RKFzU<4R}n-T$HtCpo2@#OS`mLr3(DQoH3$rHzq!?oey
zH@@+Y2Hl~F$!XWE-gW0avvciKN-WI)?ZadjZLeAB8>DbBL{UpL_XMK-kiKpA1A9LE
z%oj?^O86R}KHTGc*>e=VmCouA8xUfWM}gFa*Rmh9F^4pGr{@K-f<$v248TE!J#p?R
zbc;&WhF@|+MkEO)>B|os_^^MVoaJFjMc)yDea<xl%gG^F%+?zTVoES32p0s`nr==h
z4&W9EI)!3Ew4Tz5qIbx>pivvBy!!g9kM4c~ea@3lJsqdg`jwjy>i5<=Z_|{}k?)qQ
zAZ=xGoTOf-qg4CsRPt@fJTrZ=zpuX`BwpNgt37L2%HH?i!hmWcXvhj&2NPE}F`FHF
zjk)HGWB1_<;Lw;-8uwzmd7MD;&1M_tZF2~^3g$Wyap#E8iJ^|8ndoZ_lY$EoA_EX`
z8_-|iA)TcG5+`HcY=|((O3^>;wvn~Xk2+yfHT0K%`s1(v<{vt$v3bM)&)$1S$&p=W
zo{^c65$SDti$cK`fPy#e(2i}{bfPGUlt@t$rL!x|YG(i0(U?7R&dkx7`D1oxG)Frc
zjpocqltd$vBE@d9y*Kb4DA+2Xyva&$BO@bne=nk{P-w&MmPDXY$mr@oW@NZ`-;4LY
zxbM5)T@4!dQ4f;z+N&>%`c+jCIc~`wb6RV{Q^b^iBHLr*rv5<|XyD_2koo?B2ZK99
zfk1jb<#*S2ZCU%sJv#^2$szhm3zateq}Gk@?HfDx-{0|fFU-Ah=oI7%0^_K&lFr1J
zP8v*yCs;P+EZ-MkuJHaYUuqfwaCNWZida=m*c?&E2xj1^SW-f^_or(UNElj`1&gnr
z1Sdw_^n2BSM>Y*jM3q<<L*f!W$l0<`>cN}gsOw&@%dBc{S8d0(wcGmHi%t-c$Vb2V
zS3iJaT-EY!*+h>X5qEm<&hEC)e)_IX6eq3ar5EKQz6Ag`A>B;q38gm!7|xW|!XAx(
z8Ww0+pkaYewgo`1<Qn$*J&3sxWI+OC7zJqXk`ZM1+NGy|^5b9l!k2nm`d|!S9zMHy
z%Whq&p??WDhKNj12O((4{K5uSB@dDa$@x@IcgJIoKJJ%-!=odA_jk{pI)BU?aDV2r
zk3V+bg8`QhXkDpRW)kyE@!=N6Rl-yxc`lGl_W9j;(#{dwV&uN}gYRCtJk-CYXLfF4
zc5br0qXnfPc(k!tQ&}qjpgDB&){W!i*YbrVf!4v0SJO*%S;S()aRU_M_O`^_EbtB+
z<9aO+C{iO|$@I+RWOA}8-nOo1gYl5?tKbzra`+e#FZAXWy(GJ$<isZ2aAs~6vsCLv
zbp8M6KmW%%J`v_QD@8ok;Mexf=8IR(lA9wCR8>fXKw*jbQ9SW!$%X%<RI5YX)^JFZ
z&W|mXaxN5(Ib1cb%kA+h1iR-lMSRkgn%2?YCNgI;?McFtYAG0NQr$AcNtVPVLSfm3
z^oNyp&xkdL^m@+iC7!*43)!tYhR&bXK2!8bkzKpD<1AvgLN$#v9uOQtBvm6ow#dmR
zc-akbe7!lWtWOQ_aVzT+S8itRTFj&0zjwoze{t`&jW}YZ3zNo+2S@+nM@PQ@^vPpq
zMd#sHq(IPZ%47m!;!ea78wfl&>jhm(<SNB-L6!}<ZYa!8nMIG8_tf(q19cv-CW({E
z%-Ts+k*1kmP_N{j2I@Q-@LH}{TvFMM8iq(rA~QwjHj#}m+;t5%wJhO#AdJ^Qfn7zh
zvYagzQ<-$KR;l1hAU>k1l^}%a5Qyu=D)VMcht*wq{?T$RanTI{$k}t}02=fJk<_4e
zxxA|B^VH<JDwVn;-ql3z0kx}QH$PDoZdr#VE$MW)ZmH_26+N}A*C@*poe>$LG|2(1
z*9cOiq?FeMSy_~ko8&7Sxm81{Yf8P$A2EL-v7eBL7IYE*EL0C=Lmb5+?5Ub%il&-6
zZW&UuAnL)NY&!YO3HjrM{E9I>X3^SH9Ky*t!_qm$rR$t$;fic$#IP^NN9&@uvW?y3
zYhz!^&RUq;THkO3mgKgjOd@`@CT(#=S<@KH<}nry7h>M*c6}|*dBf?hf8jA%{)gT9
z69cc<N+=Y^v3~#kuWz;Y7Z-WozZ-VEQ@_||wo1QLao0&*cKz7aTV^4PwPka+R`R;;
z^tt`{RDnLMyR~Q6_C4`PvsN+YW>bZHIp7Zw5*KpAsetC23WWm1tARR^hOpd_u6V4K
zgmdT@S=<S{o-$&_M6G!wA82m)>g7~=wymWZ$m{cXdA9h=q_QGUUIacH^<tzm^IEmY
z6H}M)nAE4H#@Og`Ndp_#21CAcXHVnxT)%c*E}J)N_14z*a#3h?qAkR<U1xinMwoQn
z6$~`(+H-fn8^dFgE9Q#DVl)<}hV4Hl3I#^yiNb>+k~!5-Lqe>B`NLx;j)Jy%b~g5{
zFV~By)ZEnc1Wajv(38$2xZ7HtE0p{`b@=jmzr%|Z-F^G-!3)BlHy|X^0G+5HlY5VI
z2Ymkd<eXEg70nE#BZEm>OQ+N6qWB`C2pIxlHz0sit@*tH19rBgr!zBXWN=;XAt3v9
zZ`}>Ynj2-ZG!O_uagL}FZXE^-?;*NgX1bI6*CmZa)XC(WjO(YaWB)=fuW9<J(`P{W
z>-#sZU%MVN!m(y(v%SS6>Ek@8-Pwi^_;{tZa7ws*v5UEU%ZmQo@E_mG>cy!lSJUx;
z<F~)EOVh-d{M_N}cYgBH^nAhX3Wa=;%-?>0a7+L0`_^P@soByjzGYX<R|0o>C5Nic
zRaMRD-P9kwaO!BYkEl6st}sP%B}K1mHJ28M!;n&Hy3YZZ%MC&hgg6t7O-57ENAt2y
z%z>`E9o2}>6LQn-IdnM0o`6f!lgVtlxp`eVSLzITOZjZ7npXVL8fj47D#{dEMS`S}
zGi$1|z${qBaH+vSQDY*k#U&G<QV2)^IuCjgVl+N=_s-`J9-A&y)L2W(P=anHkWBvO
zr+<xEJ^ot3!nm2;%5-|x+uTccok`C%`;54mk!qxck`bUJyVPth-P_e&D{yN?zc*Z|
znEq%uRnF4K1w3kIZa(I1W=5^0GkkXFR7-oC;*X5Z%(is*k4z<cdU}f4lt=MolJm0D
zjE4L!LocLeqCOw!TvfxVB1}FnGoi#HDlShxpLV(O@WK<LGogSd6qNFXtXqv1%O#)B
z>GB7rONOZg;gv958B(f%W35t(cl4YcO-P<dOIJ@elXB9T8a1ceFS^n@V*$>R0HL!v
zRIW;4W*an(@FQwh_@6|;H<K1LGAHUJWZ+eKO;Qzxw<X(Hu-F=V7ofBCVnZv!0R?dq
zx0sbG=6&sH<1%78#bSX4KGv&sFJObFZ=(s;4AN+$@D{|4EiTr0FLn{_70f5Vw7o2z
z&i-Z{ZcXi6N7@V977tkY|1MzSrE)LXiVt)kV&dI00$=&FUlaugK-g`iukkG^b$7%a
za#UP9_wD<_r@olhvM;{yN@6B~u-V|?Ac7JYr)1`}YOY+GtD89{^E^Benkh1FI8=|~
zo1dLjEOpL8W*&mC6J>iO93*M;)^#Ebu2}np&wX|8jy(_qV9cpblG=G>73VQSxyhIT
z<3iJlKA*3ol~12O`=w7;ypr$1`}c#z-+JS<Ql%0LweH$^&(3XI|L{Njd%qGpe&p<<
z`=8_zpL_BPg?x1`k-GQ(2R3iqAWJ^M<v{v3=m&HA+$xNVLau;59f%xUj`z1l$Q{mI
zuixo%6M}v0*rB`c-s6<~pn1Q;`QyL)ZZzi0WKul$p-6~kfpZ-d|6)En#xj4cr>XzG
z{STDNI_Wmx^={g{t*5tR?fU*7{^)yS<HN8>LVki|iUFru*PVDjszo`MuExSKxDx<m
zA++xbIsLx$9L^vFN2^jDu0^`FGy2M*=N{bqFocsY{^FP4dFP#UIt!Ed^Pm4LSpw-w
zUU}^$Z@^X2knhg{wM9#<)tck+QYr2As+D5ZD@UZN{Mrj|-81+oDZHKV0VVD5@we+v
z9Z;@S>!?oA+AGy6qImoVI$n`&+sgCxL0uY##B$3P;S*xxidq`@_*PaY57w%-?)!Fh
zhXDj0>FGCT{`$vn4d+VH_BFMtD^I>mS-(1#n0<PrA~R3q3;2As%p{Md01A#8^LGx#
zGd_B8^P2YVaOYzW?v1ocv7i*LOQ+A~o;iH-<hhG(Z-8W*5US`Ha3S}!`9A-|{`KoO
zHML0Ttdz(Uo_pc#R5CO%HXn}ptL1D{M0x0;ZCkc?bhk=HNtzy$h7S+F@!E?D2~!QN
zCX(by%xek6Qg>BFMpNOz1y)$ev2h3U*-2JZCi{|e$(CTe=92bq-}LI?qb}J|)QTlo
z9p&WbcW-MB30?b(=V$h;0a6QzoKREKY&PeGj^`*n_0WTZJ<-8|0I%qyDe1t)`R8AM
z_0omY0k^Lw*t~c9#z!C7c>G-Yg#)iV_SEB>)(5)1(vf!xCyyREc<j&<_w9P(GoR|}
zm59?mab9}!;JFuHc>{gTTFu!Ma-|dVo3{3~H%0E=@%g?^gf^tvS?R*H+D~74kyNIm
z;}aYDcawSa_kZsz(Xe#n_0iS6J-hbu-XOj4RvK?aYutTpF84=&{fvoIM^FJvifa7E
zSH3`Y(u-FUiPE9zY%QJ1l3q2YIu*BCrmF6wB_5B%AOwR}b=`x5-zhM$M{%Yy2{j;N
z5k_31HUoU9nfX#ql^Oe&x>^CPEZ4h*RILtJxR7aH0`l58_GejBXap{H+kA8*(^4g4
zxnY4{uo|R$bh>40uZ2t4rxROjT*n77-mzinJ0Gkg1kSc^b)^jhwh{iGYYSjl$J*m1
zYs)@tzr(yuG*GRG$^l)STrCSxh?v^?<<ERoG+-v-g*nXe%-ErK-n=+G<oCLLAtJEx
zUrK1MRZU$hGC$^k0SCVp-3UCsGn3O-OQR<)o>;$rpgkJ@C;#+!kUXPtf?QRJV@+I`
zv(vL~)eU4667!V;++vY+Pl(YA7w@yLJTtg?XJ1S2=N|pc=O2AaBj{V={Eo6&Y;Ebx
zXS4H()XCGQHxCZ_CGXdN`8RRCAlIPPODHZ=r;v}~CnS@JK+ucwy&MejJnI_Ss_XtR
zUhR5SLV{God5rc2{J6_kFZnllP@0>{j3={mRkB#=rJ&D`yd79S5Dt;>&yuq!T)2Fx
zrKuz63H{RNe))5s`vtDz6O#CP-r$|V#e)hWN2vpgF{Y4!L(A)=l7^R3D#0eiP<5Ca
znRFf%)TU?*PB6Qnq@jA9^O@OGr;oRFc5Lk0v~A;#9UJ%9EfUl|m!0Ac5$GT`kw87C
zc<(TaRpCXiRP(xCMA^xs5+4|<6x^tAOwZ4DG(r2WYf>I{yc#duq6fHB7ONM;cr15N
zsh8Jo10OH9#!dfRS0=_X8-Hadt*>fIuf1?uFMA>$pT=xlvE->cJEIYoX`-uEsL6`A
zw_K>i=s;m>(R*n*zoVoovXM)E;r{!c+TRBf!xsk#hQIc}p3ts6`~Uixwx?e?oHO-S
z)zIsrL%nN94_%JOa<FxVq*`|%6nx+>zID*w7Mdv+o7=<x;om+2$ORE_wn9Z(+avAl
z>EC~U|G)W9|9u{@45(_5LXgH$*uh;RIDueklT*4J4mXk5csoethddE?lFjB5V~&l3
zu?>AvfA6aC(<8E64LOSrJ#+UxpTY}WG^IBW9Nx43vmy$*T1zAniZ_HG`?vr1ukG3(
zHF+fBC6ubPCMI>=)wI6-OMms9A0%gI6f+-Dv;8g7>bB@-AO0d2;%>=Y^5ort&)ogs
z;Zyxvw)WtV2HbZ+;#)86TC+*cjgL*u5$5c7Dg&FlfAxvo8+t@tbD%#aHN~V=Yvq>K
z$De!Sr@68tmAwZ0V$Eu)Ns*eK=#7VkRp}L--rDLl?yjiRwkq_UXG3#EP4ReP8#YD#
z`}YK?!O>|SYz~Gx208j}p$E;FCFtUJ&SGY>=_rx!dR6v0orVLO%ZA4-yL)@vnO0!F
zLPUNVhNg)oub^-Jo&|KuVy$lYjqNtBUD~?*D;IJ)WI{Xnm9S3Mq6YXT^FZ(?5Q;&F
z|00>g+8}!Ug_GQHJ+|?SJurwvEm&-QXsm9qz>QYA;U}t)U+o1;+Iz0>wU_{L*3s)E
zant4DORpSyrKh(q=nuC9+siQ7B$*5#M~)x5d})Z?fS&1W3ebj*m`in*Gli^HZf<RE
zYDKuWp3oBHSPq1PM6?ZEJnK-LoBB2bZE>2;WM?KOCOW&j0zPl7Ibu4hekR03mN*xw
zNuy11#NFT<z>Cl-`U$}b(DZGk?)DyL7E!{7&XKL;&JPWhL4@^6Nh`em_A80Z?DlQj
zyr|B}EfeYSk*k+u@lbn5Q-#pFpwFmBbNRIDRd8;Qy%}wKka0Gbo5+s$guBUyLRnHX
z(<n}yJbj|OyR$9Se(30-vQ|<QcfjS3MPsn5lhq{Cmk|CC#2~}-n{U67NF=vy-`Uf(
zN^zi90XQ~ibF)*k6Zvx9LqMEQ%@=c+B5P(8uTq%J*J@RNFhH&;kEC9n8x2Qc3PVZO
zqfJrr?IpYkg6qiLS}v5%Upi0LfXvQZ@2b_2Kunc<i4tOZ$pde{H8y#zYF7P<*QK~7
zr^gvH-HNNNv#n5a6moUQ)}?Z(Qcl)u!BV9@H8;`T)LfNHue|ZnY+}kF#i1nBKkktB
zd_x~-Kg!}|n2y+5FFf6JoouoG7A&*y!wUH4ea5F1P^#Og2%&Ib{d$20Mn|N%=~OV%
zmMJB|;ZTs&(a;u36P0|PxHqR7thr;VTUW}L8SNQov{KP+R28Fe|Gq7c-_r+Zp3O=>
ze*5Co^yCvecWho02?V4setO+&=Pu7^v!+Y1Hx!oy=WD)LJ9OsK)oWvS-Mzc7C%$!+
zx^3&4?D4CkdcCW;8BxvYN$J$-lbK>QQ>;Jsz}{WGQamoTcl4&N&fvO0RfxL<JzG39
zFfsiFe=f!aA|9BL5h$540yneldHT7Rw(R&NZup*kd)_#HG3X*gX*t#!=<5xdvUL1R
zZYq(cvcxQ2C~O#$j#6ghhJi=6Nq7TBrloh@I_9jHeSLj*-`li%l{B_<?=#OlW11C@
z%LrM6?^sE?nn<5GJM_@q+xwz0M5=oS1voxEQlFZgA6VPm)g)}y88GIZWq8&BxBivS
zJ-W3=Y<&5^<gs%@-f);X!S$We{vD2zIXF2tF+Z1z_~d3Q8&zvqf=~DM*}UTOrgPa>
zUZ4E>=eL1{@7=TQ2S0t8kY+@mw{IEXY}f>8%yc@fL^`0%B3??22qEDTF3dY>A?V)i
zj)XmNziK!$<xJVAR2&{hHeD&|IlQ;tV5Fv**-TD#o3VHVC(Zlx!tZM_(J6n>Vu~a<
zVxb-cB(^fBTN|kA#TsCm!tU3ZDRz^6=3NA74rNcVGo(<~RtvMw!NUXETXqABr(F2?
zj{ZmBVB1#9Ew*G5yVmZEwQwB3+;QuA>FW4MHk<VZf*#rD4~0^RdF+(Sq==al87$=b
zd@AwE>o38+h&8t`yVB9!3GpCb%0K)3PhWiPg;hO0Ll-XLMW$zc^^KR0o;v1n^XlzG
zUV+y>TnXS{$N<-_jX($1>g9j(FMrqL_wjZL=7!k}P!2~zvP-=-esv}>bNI+R07tLl
zB@j@r=v;*qsMCq5kU!)J$ir7J&84T096jLi1uB}Em`|mWiD1~@))pm|NpgM${WwLD
z;ri+tZXAbC4Zx<jHaha%?|o--d<w9>dey4Vo(_B@C(oR?dhJR!pO1#a*+Pb{AXg})
z(ur^X&9~q%z%zCt>cQMmRUe-oL&^B;#dEPx6YsKknrn4K2>HwP#O#zm<SovnUw!lC
z=bwKu6p4|0wXL(&qbP}F@_+oV|1K+vS5+sbrour`Yz2kmzy8i&@p6n%29^g**K6j5
zE9bc}?C)2DvZ`j1DF`RobP_glBo?CIE>()a=->Mnzv~P5&z(Qq(iZf3NVnya$&XTZ
znK~1$4u4QBO9es!4<9?odqM#x`9ddfN3>^a>dq=DrM}()+FNW)%x8vod$Db@_-#e}
z+rY=Sv>F}N?%uGQmwHK>PtCe<BBUoAa^?T{y)P+>)DuNNQ%r^_s`Sd+iSIl;QqALX
zrI(R#jMc#!jPT|BFFyHHjJMv3^rwIJ?D5(8nxj5?ZupU1n}72wcM;OI`|jP>QZKR}
z9tJ9#nBpJ(_~6TL93wT|(3PwI`k()f-|hP4Pj~+0jU#TKcRrK<%WqzpyEK%XC&;8i
z__Cv<tpDovU_k0>AGml8<sgk|JEqDI!NBuD+%*w>AYlO)1YSm$!Wt~=us)$#sAtZc
zo12z`ozk|=-mVtFP+qMT_V3yg2?&J!;#-I6q!+=N?U3@Na#vSZdg|J*e{ox#KB*-A
z*FXN#iJ2tgmR+qa>-v7TwNo10(DnTv$X*vSU{Rwa=G?0p=}-UYY11En>-q0|^WXhu
zJmBVOefEuu-+BJfwegAVYdZejzxp}_f9#&^$If1mWY5654TEdYdzapR``q9D^v#(9
zX{_{fmxlh$KmEGTCq1;Q`%nH?A`)NYsaJ|6O@;s$iahhv%inzVg-of`*Mg*}{I$<-
z<M2oJcRl@dLCaL+y1D<ZH57I>Es>>3DAEz6T`mf$t%n;8pdoZiTiDIMIAb$u&qqfm
z=ADUJHdD=4&4QPS-r=#K{tbNtn>V3#f)~n5(nPW!prZn!<-65@Z2kn(uZws~z^-?Z
zD)>z_ts9C(-_2&ug9oa{$jy%;&XX7#mum1H21VTx^G9vVK91V4K>4z&vAxz}yDT8~
zyBBC8Y+AbP(t%4`-hIG|{Vve3{TA^-MWk;+&St?f!fbREcgzCgiV!!5v9Rp(_=L3*
zB%}J26pKa+g?yz_@`u!DIK)hSF`uIgl|8O}F$<A5nVHLHGadxYngmrhl}!=W=JWU+
zE+^WWM7ia1BBMt(l_AHn(_<FOscgXr#C*u{Vqtu@4wo5<_{pw}1Dh7EX<9)qg`!~^
zf2CHWUxIysawgIZc!g1rp!0SR^^Ld$LElvg6zlT`5wv%?)kxe&@{ZZWBx#z$u^_Ho
zQ)q9R%m_IcITX4*GoLHWpF7>&*4EM<$1$ABCIFPgVML+<blix>j>Ln!EYM%Gy`<#{
zdLxe%a-VRo$f6Pmd23Z;ZhmHNZVn0-vY#RkR=FIE_*y%f17Tk-lLo>fgvYTY;wWYd
z<Wt2Ov@GaZ$wVR?3&f*QKe|ZSTp~3CcIW)q-K}yNnF&b<9!45v4_<dUP9*aNe2Ym|
zmsu?5>jwO361KvqaIIOlIv7?e<qY2OlBS`;gxYaud#B=aRcg6X9a|&G=O9-LFCkWw
zzGE8n4Fz558hN7azXksG;#YQ9QC%XBR%5wk762F|vs3oyQAAkB=?oJbrzr8D)aesG
zR8y7WvY_-L2boV^jmO%wtWcEUPx1sM1mulcZxgdt($QmM=|W|uEM-cX%dZ?dHT2kh
z15MrG!GV_V|4<_x2M%T```GadZydUus*=StdTDg(^vR2RcMf=+Qj<@aNal*w{5zvZ
zeAQ|s(i01KTmd<snba!VqXCH&srZJG>LEB<P)IF49a?PAj;+xWJ{X>33%ghk5rrzF
z+U{^2I&}5XU+9atCB*AqdHMOcyL|t)wcP%6K{_@xJlNk=0@AppqR0YewuC*fDdKf$
zWNI=L4)%mNt`bt{Ohp~_bW83)BwL(CPFeLyrM&dV|Mh$GSvg;jqiX!hwNz+h?AmPk
znU~)j&k+$FnMq}cdUTN-8&Nz}O_hvwTL!r?p5SMmdu}YnSt1U%F*-hb{NS1U_ir&3
zslBaxK1mO3wlTpcsfW+!e(<B`{oValax0bcdM0yqyxtUdbOoht8+t~DuSJ5Pz7}zB
zXHKV583wSRQ&DlVz^@?L2}TiDyJI?3uq<Uw%uXLVkS(N!<cbzmLy{I`64#K5+jb7b
zVv$Ov3|oj74;;R@yPypT^I3jh>w~>wrkYx!uB^r+5c3BaaS<HXsuJNc0AN6{fXhM}
zki%UrKQb76A|eneRLI9-s{@vh%~iH8$GHWh6W8K@vG;qHrC$HW`(WRtU*2=f3f>@`
zNp|rlKCim=LZ`s?rLXqJ_ciC@5i9Lq)N)Le(a0O;@hO5LD>M;YI`7QYQngkgngSG?
z%p?#Uf_bcZd~T<MOn7ccYQ&rYHVEA?Yp@aMPWb?g)+_Xnb@FC7WRxPzNRwYP3jF9$
z*jKy=va=O&YG{Ru%`d7}4KyX3u$o+GDrdyII;~x}EKqCU;h<LrL_(9}xzfl6r+6Hy
z*TuNvQaF>&EkmCJZW2O3v>*L?u8`q%78L9DDtG`&$S1=aQ)Rr-Su(1+-9om)NpQ17
zK@7y0tHn|&M<5^j@qGHdKAw_b*hc_Oxs-!<!5kw|eQvi0sZmM_QH>NM2-|u5ZqnDm
zl`4{2)vP*@nsd~uwJH*#1mx9>nk<WaDEXp+<uG+RI}a;aD1k!nQ;8A;XM?{XlxUO^
zDzTfg5mjd(Q7XafAQVB<2@bAQYZ?;E-hi5BimQNy4U{@6r99X1_<U+OluKzlb`N&N
zo3Yvp2VTw=Je^~7Bw@R?C!W|H+qOAz(y?vZ>~LaZ;z=^GZF^$dPA0bX<$cdN-|FAB
zR^3$(s_$p-YtKfTmqoJui}l`54Oxpk$_mS37|XGofvgr9P{;CqKRy2}4UBVeBIt9g
z3)^p6tkS*@K8c}pNf&yZf)VNU9#DyEAK!YNc(n{5U@+=#K6L!Uc}Af^r9(UUZ5Oll
z`wz6fEPl=`Za*j06;}-h@{CT9y*`lS^4k`gdTO`!y@m~Ed5hxJgogI7!;g+{ttVGf
z54{`Tv`U%q`Gq*q9XycT`3RRAO&o7Uuft_4B$U|N)uy@q0y?TF^a4CU-|u4ismbYn
zl*Gh=Ylkz_E~-ENn9mlz3ZVFB>_7y1-K2VnPwDX33QdB^?)LGW5;lO0X-nP09kIH{
zoxQUhmR2q(y<guiOq-IWDx6GfZ7P38{;00*u`~REg#kH5BF)@9B&$w)Uy?Z>6RLp=
zoGP3sx;nWh!$L}3pQH`k(!W&e_?F-Y@lNP#&S_F*<FZT6EHI-iu9D9JH&`wLz7*L`
z;qW99iT~hJrHk)#qN2n7x%zRQ8aB0a5I_x#Eq=D*1`$@G$scA-B;q4K<$R2|biV)<
zf_-Npe3~cXV)$Jjkz|!v3!0JerN%Ah9kEcOJmtc%W+J`XTuEo`o#5MBQ;!x%vHPmg
zX^OM7TQ#Amxxq_fo7kI80Dqj+f&*?PiXjTG^k!00+BBj0?H5me4$3jzw}ZR?OkKHz
zz<PR`KEe5vlKBH%U}bBJ4yh6rQ<g~-8gCU7*j5GD#jrO+BHife_F9iUNLF=xi{0{}
z@sUOGp=W4#__8|BHt`+*OzUrRs%emMn1IsENvSZyyNs<t%m&UXx6DLKGNWUdi_~c-
zLiObXqsV%26G4UKteYfiU>{^gw9=99`VIsSsbetMl?nW~XOUq5BT-6dU}24rt_NDa
z5}l2ge@~{IxuG^nDl{{u7VVW9w%bDCukvui$P$hQ)@1@W{SF-KfgyH8-2ULZ^xx=l
zjaN)RqoSzUs2MF<6T{4uIDnR};Dj@9qscP1Ft2DbA~}-`EQKm;G-3p?Fz<LD=DHMM
zrgXpU*#(_ovFe^RU6S+zuMoJasRcdAv0?I4Z^(srX1!bUuHW)d;hG`6I3b*-X(`-K
zhZUt5m9I(#1GkS!!;3NNg^CW5d%&&%^%Ul=o|wX?(CW}Xq2@Wn#GGUuD`lkj$CIQJ
zcbyZZk)7FQSU3=g1bu>TdAk2@>^=WwJ9k1+(hCs;vmE-6)ps%}Q&?<>+^tg4NZU9l
zXcGeu64peN=rGlr86OP+T7XScb3ut18CPTRMeyYb_zRPydc<=Y>!(k7LJ;{2L$uIj
z;>b8t6wDSRQX<;I&FF|Iny}iqJ8y=+FvjXz?u~rjzkU}qkpQ}<)`iI6ry-c`qk!Ox
zR@T>3s--FkVV&gSg!q-Hv1XH{l;V7XUyx+A@~H>kHd_jo;I7~c6iX^EeoG%nwIAPL
z_ckpGilVn4r>nXM$GqrvUnd(7T3VIPi$@Ffr7_U9<Rg@dj91SLd6^9ly6X!Z7K<%W
z6;-T+<tkHYPQeQQXixmWSQPJ(v;eu;x|%lB7>WMr7PXFpx!9=3M7k3L1A%&u5f^mz
zoydDZgUZbKNM%(Q++AI8GInf(nyU<bV3>WR4c@GJV~NMA5m?qPYN<?0j2vFRE-TKq
z3n#U<D|7QH3*t+WdCql~i7J@>u432Zw`Lk)-PXL$({Y_Y7zbSVf11{xEu`%uHvn#E
z23<?iiWr#%PG<f5*N+w<2t{TJuWMT^J3P{UQ+1hl8&AsT@M+<G3Vk+HVLvaqH{*ot
zcV+1?t5y+HD2L#(B@GW_#s5JEFWiT9ZQF2N;VPu@Wo6uDj-Wp`*s8O6J*>&-aILFI
z=2(%gqr-~+n=grc1h;Yzkx81s4Yn6uB=#n3(M-@hAKfzwp#Iz8?m_p4y0n4G>0zNI
zo$76M-Rn#~N1Wu+FIvoGz^y}ri8Ix<h~)ZE3G42*n}KeaMnLaZC!T3COc_$}Z4=np
zKG9QiA@c39(CnCv=t_!VJvm_btPT<~)!m?%$aak$pqSQ2s=BTvp-KPY^8uCJXAxjf
zL@P5W_6yun*Chzdtij!ly2cwdwG!|>K4U`8AaJT7ly6SVQTa#{>`=C&z}QLimc)_h
zB=_5;8zWsKS_G#qp#?W?T6U(JDnXsB1cHHFxihtO4`XT=#C1(jCgKMS)@g7#oN2TT
zPCTLrG$5Y`aw&X7c$q*V+rtwY{^!*2*bbn|tlac{4+#~PaySm;4fjhowOlorRMyxU
zYKp~fGhQ};HPH3<_ahq@9hcl97QTT~CTH|?P$*sB^)_3!?QGQc-=b{<k33ubcmZTI
zl0CRZW4_-Qow{Y<8$0IZ&WQeeiRsZX<M-IcF#6DrZfppo`9H!5Aqc22tn<r8;b8|b
zp-F6^N-VCSAjqG$d*HmJPQ?0$|IxYR9E-q0k?RAlAX8EN7o|9Y@U2!(MdRef;9aKF
zDpQt}>*hZD#)JrPdHDncvZSfLtur=zID3yCiElL2uG9QSyQHH%=avRh=|GV6IbTKD
z7DXlKb6L+sls;Lc+TH!DT0+#RRCBz5!vA-!Hl5m`@q|F52y70+&km78g%z)Z-RBYu
zKItHEeer4x@Q!SBWaWEERf6C#Me~1+#)5NVDe~2-i-j^IaqWSA0oknkx4V0)S$4*7
z@t%+>K37%6iwzCal=?orPIu$eD+z3EoR9CH&kiGUdexr0xlNHbKYsro3!qg5z`j$j
z9h0vgjI3O`s8)?k`lIl7F$^hEdL|f^$)a+ly4V#f+S>>6=Yzp}RJ9YE!N(1g?Rm^(
zF`Py*BDRBd35}Y_?ho1}a*<-T<B>F?6<9;r_w8IV|EG5o?Gmmm8n+?sU$&)+?ifM7
zup4vp9VYIJnCoQtM>Ccl&?JhTgh-IAtsOLt`(uez7)DLiJ@^@C;e+qXa99w8sUkRo
zG9(!@sA$mSbTp1Bca;aKk$<j1`pP05a5Xq(*~?&XlEi2eSk=e9X~26*l{q~*25wNc
z-}M<5;mZ2L$;Y?}a(>Xz&17J$ZD|>Fo*HDHp8bpG6;V&F#1_47H1HqrtfLMql%+3`
zG-0^OZf%&+?O?p{x%9cIN&e9OE2jG|<=?MXM^}-9<3t7rK_!bUm_s5KEOZTTeP?vK
ztl0Mr*mP+{-ClC2fE}bu6b6jp)TU#5v3%em)T^>xqse85*YSF3uiUA&$tix_Ud&I2
zqQk0<b6>?sjtG>`4$9{dRKr)n17)0O!ar5l1PKygj^jj^=DR8Dq_4b|n(WDC`<;xb
z9d&e9WGR!Z+jYKH$Huj)<IGHi{(`rB_9|WUZ`56k^m$bp>p?k~V$|?~oZe`p<O9*&
z3#rp<H8F>XOo7c#{G$P8Xe2-K0yYEw<!(=qjWO7o`!fH-BKTc^78xu(Lm%7dUL?J1
zA$Ks`-!V<vigFZ;4R#5*fK*f(DEd<i+dmwkh-;M`OoUrpT7rDKlPe-T4U%{OyN9*x
z%A<ws!kE{=xoox~4(jDh=E1SVp}$HTnX-)pU$rp8bAvTsurcpwrAZod?!PKtG1#}D
zFwFchny*YP(<@Q?9k&~O+g?C3FQ-{BN<#yGZCM5qC0;z9xQF5)=?_v=s`@0Qj*C<H
z^evr%I7E5UhBo>5u83wp@uDDG9^ftFl^lyMuEtSW`1?{sEqan_EHCA~rxH<Ku?I8z
zwl@|suZP7=-;paFEmj&(wm$~o!GMFWS*6<Z(qMY+{gw@-Qe!vR=%TBaxA0v>h1<qX
zC#@8g<kv`W2sIB;#E)Hu@*)(u#-J2v!LH=wkue)u4MFdl-L?-Y1y#R)wF^ndrDxZB
zyo68?MFo5)67<m;4%1LuCoZ>hf_f$5@s5+?%DkfR%s7w-vqqcs^&Z;?v25Oii`C8g
ztF>kVH=U(<JRCyRd|8o9J^Yx_s{-wd9Zrn?yE1SHvxM%O^%e~0>Gsd-2e-;P3god3
z@xP{|tGixKLUxq;b-FLJeee43EY7ABOzF$6%&URx_&VSJaQ<5(C<@eNyV2%n8ur5F
zs1skvFqSn#wW-vQbExG`Ptnd-c5wKWo!t^?@ugmS7U9vOsXVcjOD>Gp$jF*{3d*%L
zXVi-C^ZShLd=xjx{5`W@WVlY>Z&?o2A&10DTbLM*-zc%?j-|E$EGj4FM=-~%B~wbJ
zl(}|ZK6Sz^AfP|}=gwTOS!sTF$e+8I0N)>nbmwxW5+}g&lC+l&T0Lde*2#<Wb+4ad
zn8^RZ0@}p;CqFUeltWq%nLAD$0oN*r#e^k}2xWQOvxB~E!kXlFc>Q{PIrrH8Fxi{Y
zKS4x2c9=aq5(xtJIeW~CXmYFMNAuLt;m1^KH;&RTRvk+V-(vYcN4b<9)8+T~lYK=%
z5s;AyOZs$_(tPC-ztVrFebZ4lW-KBU{>!m0#7YkwXM$XHJn?}a#J?>EZd5NgFxA^-
zKPPxQSyw1CA|Tl=yDr(oR%iBu`l!0Ue2=#pOzcoiaroScm6mIlKEt~gQA?6c=!9;e
zUJ1>KWwaVL0|(6F@<5Bk3@N(Z{Z@VviU(3cNI@2ioAK}&5roLEyFT7(mWId5jbH%#
z3gLt||E4KPF3)@iC$a1MQ^S%l4@yFiS!^6*`A}EMhWLiGci<$?zDGHaf@w0WV=4M_
zW6N3#{+8<D<M)s4<VSXXDrU8`bp7Q%I!@g^?Y_+S3&}y=UYKgg$DHgjk|LX`CB#XK
z+N}0k{F&W)xBjy}PC@^H`}>F6&6}=jUx3H!^KXL=itxc$HuR>#=n5mLs0no(ap~4J
zPmiW^k_YX#4nq>YRa1&47dR=C0H76~1n5gLONv;9=)9Rx1_rDDQ<UgC%2q4=7JB>U
z1rXv()rD*NJw3BpelmGduM2;p9yR=-+@RBAVsp_&6pJZ`Kx=EIz0rdhflDOuuZ4wU
zRGEyFT;VklM8bP)UAgu!03BwF_inh8FVF9|+`#jV2`L*tE8WL)Xwkq6M5fR>bqRVn
zZ$8<%T4<G~A$H`<{oDPiv*!QSrv^yWCVP+aIcpT%+ux!GbNDF_&7~l9vSI1HCOv}X
zvpdtiOrpb?1?T<f)h`{Kv~~(;E=W{BYLL<({VNVJ$<THe-))MC`bb<-yQ?TmVakkE
zw6rPc{Kgh{u|1iKQOq`*4zIzV%a|JI91BzG&bYeP{xTlrU~6ULbI?;bDY<wYc{yhA
z9@G8Ss{nVKUmIWNCwjRWNMItqglvF2%XaJ>sTvUU6qQ(1T0{!jhArMb`vXQ|Q4sA1
zR+P(l8_fk=;?y_%qHp<xL8BA3Q5H55Lh?T)FLXSt^Vk!|trp%=g&wg7!dvTHx1ED}
zuC7PQGu0cFgj0BD;hhJW(IVL@JYjuGTNkblYDXe?N}Hu4pP+6%St;aXr<Oo=+a@y(
zt9w{Ok@hJ*1AZQ}yT1}%hEn$sU)sFYy~wLi((?TWrRSEq+v=-5=2FHN=#ujD_ijhh
zzkYjEQ&sw%-eV~YC8^BEBT{3;)gmnOWzmm)TojwCj-Panh+h(s_AnooP#m<)1MmCY
z=iK^Y`R1XY-Yg4qGl0``X1AaaR%f{Zs~aPCV|4?e^NafgDZnH9bB&?S^fjA`Yz>C>
zZUaGDYJ(MclA)t=dYdpLk?PN=<o6%l)#K;DLln7neQX9f3pH|VbSV+TQYesaWv&77
zrY``gjuQ3+S_Se_1p9Ijzj*^ND&Je_T-yX^3swPzxAOV@wPxAIC#h3^^|MV43Rl0W
zd!aBKNN=p-^yi*CAnr?e9x1ATt<4TZKxBv&Bf0coc#XX5KBWDPt~&9|oe-ais&n=8
zdo#o3=c4;qh)qraUjypCq*j#ZSshdI<KHz+_02)(9$%2_+dIpoLaQx^9<&h^W!Y++
zyVX0nI~#Rh+p*i?)unInGmhnL)aGb$&sHOCfjHRAGD%kQRqRGMA!x+p#IeP#SgE(M
z=E|7TT?h8R(@0w^(bMqiVrNE02y|%2U)Szo3z=f4)$D%M)_8Li;s@nNS3`#9#&+iD
zp^ukyfdSh)kV&u0^dh$JsVJ?}UDRAzXt(yK+hIuH?2;PDpl`*<<a>LJRV_Pdkd@{G
zy8VkUCEPMjp4QlXU)Ytmh$Y6PpGc`ZNYwdORNd-QrtEXRechWv;Olpc*L_R(P1k%Q
zZPxq{77k}5Wc03OdT~BHhJc_<h$Ra>FT)Qx4f>gz8|B`+l>=}CxgqPFfVHaFhp#am
z1rvvSzTTWqHz0AWqd??LiZ?}{!WUo)T~S@ZXm&eznDXH2_p&Agx*ma9%(bp@%Qx9J
z5E>*{b>GgL$|><YKx>{xGm6wrgSv<C^nmMU2p_*XN&^nCyXB{)afHuB(u`ZrPT};;
zosS&d&~!eX)gR@wc#u&&eTjo1Dojvfw^c_H@6L%n><SOVxtb>n6Ti>ZcM2ofOIM$2
z^$rjt)2A1HSGyb3_dVa{^0Uf88B#3JFc2bp6~PRn6&d{zl(*DyC>`?oH0jK&?0G00
zF~G#qe=;LbR@_;xQ_9tDrXM`5P??LkwzvXy>T9SXn1HQ7b;P9G!p_6v^?YL%yfX~e
z?tZ?S(xlEELKvcZP#0PfAvIY141%jvSax5Lq9oLNrDI@74NHa9bG|K#@qy-wK{UYT
z_2G3qm!VCC%8<;+OkdXA>NYnvrsd$9TkS04^*$ILVe8mF_kU8N<`Wd{*JD^7p%w-k
zA=!QUwjTThEWrC741U>mR(ZxDsI>%5s>mQqJyPNyNC9{o>O+Jv`Fm^xfca%kLVsO-
z9}rOchku3IHl)&uD({N}0?b+NxJ%zkX)J=`9&UmHqT$tgwm=OJAOWeh@FN`jo>dIH
z8ENgAvTcwsP!_d&tow2ln9B;c>O3cr-!*i9-Ks<mHlFh|HHD}37M{CHr>35YDoi4c
zjrwg++UojcKW-f5|9&&cG3bB{KX_TN^U=OHS}{obmA&(Vbh&0Mt@yP8>GK&3457FJ
z-iPvjjQ+E&mqrznY1oG?Kmh;65tF7AI+0vYFAzyrEY++e?O2P?<6g+I$}W)k>EQp;
z*V}uRl(V)pM-f2tYY5v|xXAUn&h_lAF*O!p;z+yi`XbE(2{djn<LJFhu&FtwBU*j%
zvA>q%^G3&aiDs&4jZt>JpmjXeeQ%;!TUYJ1)Wxu08Nd?1_;q~mc@X$Y2#$o-IrO&D
zSk9&(^m*c#mpK1EDg5s$A;Cls;{HDS0RYR&)SAGgiwW+cx1NZq+Gp4MN4do7(D^FQ
ziESvqh)c&%vN#uy8teyt$i}xun}5!2Ud%G|PHM5H94eKmHZFy_(z887NXKWYBY5HZ
zggKlUqQT$5|6RkYk2ad}I+()gylTAP`hUJt5&47e4Tar-f|Ol~SXEM~L*?Y-ll!BU
zJC3v@iauR3(=r~kg~<p!C?}3>tAR!;O9h4ntq^KvVPt1+6MzC?=?vO<?D-g7r5W~Y
z|7V8XKQJ&be2d{#8Rah7&9P=Veh<Zr{`?#QAs|H^etIdpr{kX!43+2Fj0YteGkmlu
z+haWH;`vllx)q<Zf}60|4Ep{@;rht`<kch-aEpju)W3mspl~PAPwmxZFYq2m>a+Ay
zZzfA!hNorxSVj7Byw`!}F~^8-L`9;raDX{PcNhfzSzTRib3cD+Jx(9fVATJbIZxMn
zpnqp)w)S?%2A!N}5xp{xo`wx`T)+Vm-++T)tRc)u6sBKNMQ@CAo%VX4ZCxKL(lkkg
zZLA{L<roeWXd9?@yO{R&&ZCz+_YZ-hC4!Y(4J+vGnRESQ8j|Yw_jGoZ09hmBpvjzK
zG2`9oBeg*&d;Ddr%l+80wLy_6e>>(OZzJ#EW-RIg;ow2A<S=+YC<6xj((ed+N+u6P
z!<6s|Qh@@tstt*YDRC3-?u^ZgBGRdxS`&w%8_Oc8fdX1*_p_64LAA?2!Xb^++)@6b
zx5&NOaQ)bM`2&81!^ZblqW2YG{afsHg;^c{DghAa@Fyd3%`1mP%Gc>tB=_vpzd$7_
zOR?Gzl|s8~+0M`^%1wR^_j1$4DhK)L%4rfg%ydT{@@?>}?*<?UzyQA(@xA~X>dUhK
zwB~SlwRg~*yR64LG%z|}_hYKD8H+5ewcRz8&v4ZZ+S4CF#fyZQn9|gvH_e^Zl0a|L
z0sHUXBeM^?R_E(d%6ut8y@-R$n<MDFJA~AY5nfjNr#WM43<{ljh`#&qI}g8Hsj=Bz
z-xb%#woW`2+OsxAO@yCfl1@?dmH%tzQG<hYhBaq1CxztS&fet?#tEB!A$J8?1dp&U
zo%E-dPBj8v;XE6YuC>GQU#2h=1zN63zK)0r10;#lj2(^qt+xlDkmEwX+@^735hSk!
zlio4haBsZDBzu)CRT<J}T(<A|?3!uloV*<QX)pGUT+glu&#BsPegB?c>!sso^-LWf
zLZ7TVDr;!476y+J&3ta>qa&Ypl3NqWHf9>G=GNYJNqD^=N#GcW_I>U1ER$R&i#hK+
zE9>@tby$}1nDgUl>QjR4_fLq^+YxE*iSLFfvKAR{;*1KEm!8iPY}Fm^>WCpjbIFP}
zR2kAED=VLm3I^_cWf$SRu6CsOn0W(pDy?tL2ILI0Z*QQ%om(+c@6vc`^{&S^zG|ZO
z@!tZuL}fBBB`n2k!j41h?66gtysL{xbtBd9^`EGBZMR+?ta|Z3Cl%TAy!$xBCl6>t
ztO@wD8Kfq_tSnnHMgZf62#)kmnDPYY%#3njeL4ShEA$`KIqNcg?DUCs?RRxl!_eOg
zai1}r$^tU$o;;+<@V+PN1;}9*yoaAjGAX(vBOusrvI<j=57J?Ug@rj$#l^;E3$V@4
z&PsT1K0H)ek|XtZR942M^8SEnf}w2Jg$GbJNm&ZHxjl@>>Lpg`C%?26q;RhPg2>rO
z5%8CW<BZ6k)e=dTVsU?}<Pz1ttkUz@z~hoKzurw)n#?9Dvw-=z<4LPweW^uY(5}B7
zuRM4FFLDgZ;l)@w`1~70kfKOH1lYdG)B7G-_ny8C=P0YuOGlPIDU9}z4IFYFka>z8
z`|Vy^1E_(MrPGcKH$_WJGN)cmYBb9wiSiw&wms5Ze6zpoKMUEmV!DVEg)kk546~!A
zH{Su2yRqs}zNH#N?AQiPkm?Q<cuJaD?zBBD_%C|bE)I0N=lf4-hmUJ^J5$)l4brBz
zxRhfM)#=km*%cx;P6bg2ji#Ub3T{5Lpb?^tg$Ee`Bj|VGwl$XxxE0Qjcc~QXpi&?^
z^?$wS+3-@^XnCElv&K;~0cvCz?ySiNWFgJk&DC#sMV1FeR$%btI+Q9)3cpU=tlW6V
zs2DqzH_;-?7HqZ4Z3rFy5Y&#=)~1iV$Vw&m^$}RePspEc9@cS@hLQ1c`Pn<~L$0b#
z3YQc9ql|K%UgBa*IIUdAk0Fm#d0w&QUrZ_+Nu_Jlv?*?mx3W5wrKCmYKbiCt{4#Cw
z7D>p+8uj1F9=&C9UjCji73qpSc~i?J-dCbCC*k2cgAKW>P<CpkMe_1Cj!ffL;I9~F
zapsAMwm6phB^-qomFxnA<>XJjH0OiP#o*&M=j%rQBLhKFZt;}50>>?X|A8}^eRx8!
ze{Hq1R!>HkT&6XC^cNik)bl~x7sxBDT34^RNhs&Hn!uMgTf@6OL*aK9HZq6k`Ec6_
z1<P}#7^&PvTOth_5kpdW<8EsG0ZsBg5X?n=u5VM3RxcsnSB8DO1GItDmf@k6gozgT
z=RS~=fEDNBo3$8J<u>~qPXYg(acyC%4XqVq#$KS}@DXxc3}(h2!#GkZg-G9f<wWT5
zC|R*$84oe>E16yL?pt=d2YQ(B)XS(%b@`^D+;v81-+Bkdfo)^SsS$5;L*T|)6nc&%
z5v<lUu8pJBwe!zWj2CkKYS^qa!v|%kW15CvZ>I3!(89Utri`gmHT}Mi8_iz;DPSt2
zUaPYi$k^J}Hd?j0c3IOygQ*ec1_tiAO~@8;^pXK9rlzWTi#tB8>tXIlf4SMlrNSY3
zex8(P{_1aKw2QJ}CSm3cl$5%N#zotyi-L5?d#0I{Ge4JC;ffvg!SikE6m2f&?5zFV
zeG|hPxrj<-c`}fqTH#+IdrFAWjX-iw6mL|3Y|u{diWoe~)nDd+9{)7H8vY-=!PPs#
z*I(#~T@Q)%<OTpmvt<uB4E$FYPEVJqe~9Z14$XuJo~VBq4FMD0D9Se+1;FT)>p=ft
zq~QLO0!9`!X2zVvL`=}cLYT#YkQ5vZh-W%)5;WlI(-CFqgUZL4>v9@BcEwe^Q&eeg
zPex+X-)G$$2ehgn5UXtwzZfu2a58bH@q1Wp8o6oYp4qK^aL?R&tMc{JKN`Z$>x^tR
z`e<peTf^-|R%gg%vgR<YP2Elj-)K2_Y|z<+2pRG_Xmn8$nh)952&Y9gV1}A%L)YN3
z%I3}JPt3hKiRzPP#?IT&;;W2T`|3ojMpmGEsSi&k=&cY%w{#?k<eF2P1A(bpz{~~P
zBHi#{en$UkBAdRgOeRW&oC)Z+uEPt&qzL}9vk$|Xa<ALQT$8NclmZD=9dwdCm1EKi
zWr}Itsx-e;7=dT00tkr`;30pYsS+}W8|r0rhgpMf|G`WF(`u-nx=IC4R<>(7HS*J#
z^lp|2`VtzLf@X}w??xG<82M(DrQkqy6_6*TaD2KcG_EWAHrXDVuP$LDarN$_Bxd58
z8Zp2}I$kD6c`;gwxM)G9s6SJjR)v)pKavY$IR8zH_R$VMn2t8Y=33L%+<X!$Sb4rl
z%_hohcw(pGa8{4H9{tcvH4-Y1*tHn%*{AvBz&SyxMRO#`M}UySU<8)1L5P?yS+_KD
z5_K38%9uO?C6KDxx+?zBOJIdxveA6*c1KutwmespJiYSons>Fl+>rLqq`A_El&<Hy
z4yJ%0XDAEiON7`qkzXVZkY#zK<0Fu>n=l!^@>W&Eoh~nMFEbb^vXzVx`f3P-7+y}Y
zb$=6e)&}e^7Mw`FgGJ)br{c;5!x`mDn!N>(PMW7b$<bNp7pjh0jGmp{dSz&@PJ8#k
zf=S|-4~E8M-aNmp)PD&%^-1E{Ge=To&h$k{syJppF2E!7GnK}rXP2)s5aXWT!#2I2
zw~u#rGbQ;0w@(5YG6_K~ye#h96I}HpUfoh#iftZS?*-O9VOH<bp<pZA>mHBqzPTm8
zhW#%2w%Q$c4c9XKM({#?Wr@p>Yc<GCV0Qsp;@Di*Bw#YZIg<}T+St8SHZ0k22)%2+
zHG7Czb1FKq{$dHPb&sgq{0Iep!?yu7-JD#N|4U&IN1UXIG=Kky(2US&43k!(R5I=2
zk^4fx^?vGkpwx;Mu>yO3aEpom<s^iDft!KGM=I)5>_(kkYTKMOF3iF(t20?0C@u{7
z=@o9@-stwJZU)O-dw0g<yZvaU)><=_C^Y*pweq*oqaL}y;^61Iq%$Rd7NDk)(Y%!>
zWEygcz%|m$y=vLr-ugf-C#UlDnfkPlcf0@QebD{YucFb-sCEK8cDH{>8!&25WRkP^
z@uvR@s<>smcuSaKq6X0SaoOhTD~a0n9?nNCBt+$$=M1PsWP)~3YJH7zbB+DWv-0Oi
z*)OsAMk!*n7E@A+)@mu3HtO|``wn{dESu%ZX;^9VVhQe`_#^$Fb{m_dPVaP;|3LKI
z!u^lWi&DiZ<kw;RFvdC4ZBb;>JQH*>ct))3Oe=`7n@UbZVNiwoPo{okynqThedU3&
zxR5u(y=*6KGG*g~qU^rT?n<ThrMq=_g{wTs#2iO4@5yXJ3XYK2IW81dDUDwiL+3hg
zOaBn0#&)2XyCbqUxMFAy7rM5VAfLWtm{x2|n`ESc8a(+tCf+=!1*rTxpb=QoHdhF6
z(YlDjl3WNgr9a8MupV)g79@~1<jO|q*oC8s8Fr8uLjQ%xi#Kr>>FE)M10NPn{Hs4)
zC|p-L#l9{nyAzzR?Sc{zi-)3_uiIyb7N7)E@K=U8vphtz$6!hWVx2eW(rp*1e_sq!
zC7!$L)co6l5(Jhbj*~eItApDd@VmI&qt`F%R<Ut_$dIX{s5QFXnkfxVG_`jlK3vJc
zXvGy~B4K>lwpoA{=MLK7*<BC?Hg3Cz0|ScI@vm$?Wkx8UXj+zC#=+@|7Mz*2-90eA
z8X*Hf+b^A!QTxqC9I7jHGzH0%A0T~%Wi>e!45GkjcA>w>w5Ve}>O=Pw=yQdh;?NSy
zBDgB)f2_m{Y8!PYcPwqEHrpRA+Nlzxvbp^t-3lxGFMYON6I9n*?QUcB`1Y>3<>C8&
zl<1cS%P0JdnscvgJpz{BM6BpCN=8|qUoJ4}dF~7z4T~_hnBA7y{hazd$cc-NML@7m
zi8M>x%ocF6V!eEs)#xVW@R0i6$O+YCkB^PROG^=2I2l8#<7Q3i?-tB%mdJ#=Sp$?c
z(3nYdIpuJG`Xlm`{v(ut^&SusT6Y5GmTyRREanN0>LqlS&=#Pa9}<`W$NSB39vw46
z5(7j(d(q+>f9z#%bqZHt5ttzUit2*;%}EF2;Ba9lOgAFX1(hm7V36w~`5&-9ERkM_
z0V&TGOH-0d8c(c~R;-1L99?C>;eDHLoDg(hZZMG=8Q&3nh}TZqE04yOZ4&h`V#RZ6
z*Id*AKZf{Z>$KPY;d?2VGPv28`qcKlu{I*U3QM9Hi&y$O5K-#cSVR?-fzKx|`_SU^
zfzMUD`=PPdm?q32&OGWfo1?exNippRwYc%p=-GQ`d{Ky~LGO*4m)Axs)Dnx!_o@3K
z-T%Rl$Sen$w_lQHadoJU$f8w05^BHO(4{?XvHDD*Ubh0?>0odQKS;o-PK3Vyqs7M9
z`nI|2li=^7!Ru-@F>6e}aD1d^*bf`b)ELJr6|m+X@K}SOtwEzu%x+OhZt;`jGX?Xc
z$oN51Eo<ycpJltI+OLmPyO0%rsvSQxMhaAq+W8JgxgCZUKc4_Ty5HlQq?>cPZonm9
zBdk;5Yfh%gzj?ya^#GdzZC-{<z)!TZoI#BkU)!8T5d|X!UM;?P&l;_N@`Gs%7bqoT
zLDd)r)lUsI^jRsfhSKPg(U0`sWZUWM2p-4}EeFm>gtY1`jfq~%pBilaPw}wOG6AU}
zQH=Ht7v|T^2Oe&F=L{DllVp=MuE0ftRek(ZLj0CMtke){#8)%~S?JBTTMPO3R3Vzg
zs@=0^_=whDLfGIwt?Pt4;x#W?ig~{LDvD85Z7Lsw-=hH!HFnhw)UDV_PipKwMk+HW
zw#|^CeJ7fV9U<57WV}cwtmhmY90GUK#ow9Vnf&iQ?OGkS8i7=sUx8o!l5wsc9uAuw
zIGynBC}B5#2U@ar2dX~_$|26(BwMM~Ze1dNF=4NK!4pjlD?JwyHe5#LzeU5xDYS8B
zkEHs%E|PpO5cB6PhN?4~1}QS_qLylBaJyPQoz?Olx4-S=9oysPXg6Eg{9%BmrpyFO
zOSh2y&VI`%v#AR|mdBKzNRKcg9BA()i86&O9yGp(@;qKvl0ZlJ>V)35G0Z&s1jxkK
z8c8#3#x2nVOV|K*Zy1i~yot}x%#vQD4W6(A;EnD649CZd{Yp!sUY-0>n^dElNM~p5
z!6;O3)GU(0gs&h#jx^At@=9|?QrubO96e|=;h2u#a@l%arkZV>`S^EqnK^M)kSH$S
zXDg-+{dGaY&FhDf>i;qdt%hV352<{_KR)HE*l^v^)rnLIj?0^n+u`r?qRN@)JanF(
zV(FL6g$SfNe1`e!)6Y-w&PlEu{fzyj9=aL!JL~sa#lkB0F=!Cd=3?ALy59|Yx4=46
z;2ZkrI(;b4f`4T~NXU5ubZM?4zdwcw?ub5iF}Q!pFxK)iaJinPYXgKZey9I7Jjv-T
zQY@=^fB_#bs_g07Ny+%lBU40=DNep}|B4w`>}G9!GMD40>}UMB!ZxIk#Vd}mH9Y)%
z7Q3^vvs}G$wZ+MVo)T>Ur4PBr-0vG>{QkX^S^evUWZR{$qTz%{Vk#>K3R`-Pv_+wm
zE9eQ%vZa(@1x}83_kfbH-O<gJ{P0>vR$SI#GzJ+aXfq4hAG$X+V&-^`D$cunFec6~
zLu198!s^RmBgE?1raF-a83}FpgAKndYK16=E?6FakHVf5+6dHNWlh+`$K#Y>a~+8o
z5Qoe3`wzi&`MWW#gf5GtwkP`IrrS0MXtQ_e??i|ancL4}aWl$3l_R7;8|cU_yJ5H4
z#)AQeS#IW9B+VKYRF_qGe(7J!_at>DOPw=pRX;vSqR~@#D?Ch;7l-D(>6Y*W4RVLc
zJULRn3w`%zH1AxXvilOa=t0DsZpi~2zfLt%^1X9K^6%m9O!6Q}i<lgyeKDwy+-Vo7
zF7=_>#C>$waF$?;O`pK-L;I=SVID5G6tIaLpVwPmu$c7madAZipSOE|J|!mf2@}R<
zor7C3gXZH8Iw}0C8KE+rWRWkrPUmxfE<FE2i=ax9#<v=r?=V%BDuThy67a)4KH$)4
zf+&_$AebDF4O`_c@Wiin(U4qPUl=<c8B-=9gUU}H7+ATr@bEuT0ryIsOQa*^Nw-lt
z(<1y9{(Z3)^1~7l9I5>(nXNN$t17fut0nhpN6Sh04UV!2Uxgj77i&jh+;`2-w4J&6
z2$zs2ahVY&l9F~Cy9JsklvlhYIFf9FJ3OX(A5gTYs#cn!t_qrxD`!7LXa$F!n`&Z<
zOE$BpqhdG&by+pH<T1_Hd`4DmLJR6AEUZ$`PfdW!i@^ITngPx5jQ~Fj8QTSD^)m~B
zaE@e-iV&ns>H8-BJxfsWqvrK3*o~2>2)twoQk+1)fs84-{1Ai&W3X*f_a6vEHpHxO
zqo#ZPsnIGpL3NR)tRuY~$}`l&OUktful$Sc54|6ame4X0VzkL{Fz<UN8vRV&%e7@g
z)3irO1lBRlu4n6HJa;w`P6MgH(z{lN_iusS7a#ih>_=CS!_(hgnPhb*^?2Xqt^X3;
z4f<9U#p)|e0SuObf@VJ}4&@2t1?i)BbAoBz5o_q*dV@mp%aPRfs{_~NgKW_4^M;+x
z8JN)kPuRP3<S6AVE8QEhHu{8^aVL*2rPEkBzJV5@aAwXumybV_QG4t!kH-&ZQ*M`&
zMod8i51c7+4q#^$W%?CmEel6Ez9UU8jp|x;E*ctG3g};yNGQJgk0&fE{Odq~<?<(l
zoD1LQ^q1kt<(%W<y+}t`iGQ0j6sM%XIJ!5TUNw1|<CVC3iS(xRxY=9B0Ob>q&AfY!
zIjAO_L4H)%*{8pA5OSty0;9-EoMzS^iYT^6g5a^!2L&2=N>1<HYOtE?DXJ`g)ku#N
zv&7!iO8Wxn{_rUI6<*x|GM6e}AX@j(*q$A<$=_QlDylo4FDp7E9Ej`eot>xVF?qaL
zzBtxS#ZLbX9M|Rbpyc41YxfiW-j7+QlAn6-T$S=jI${VoNv(=zuarg>eA9|Md;In%
z0ueJI%kpO#<bt}*u%;%%kLvl53?AM|?zlm`UrIj{Y@v#7pxnT1@=N%7XDq_jUK!|c
zWAdcIpPfOVxBZbMRZ?LbCQ6QQ`1GZ;*iFV!xci_XIMt<|p}Z5JkD#m>VGMV|OwBqr
zaJokAl_oZj?_`_l5wzgP6X%DPB>UNCeTB=~$KuxmwOin@e=E}Y_GrLMrwqBmQ?@Rj
zzOtiSN>5#I;i2-95{9>V$JcqzWLZxhI2<B^BSUfR75#oick>MQVd9hFSsMSx2!PU)
zE#R|F!R8bv`TTDte@Nl?kK3zUNy3dzkIbpD3t@R?2$r`<K@PW%6U~a&iCX))ziyc?
z5@EMyZ)1gOnK0jm5_j%Zb?;fN@%>7xtG(0=T-3^SyRg+yxcNuK^Wx$r3yw(3y&rv#
zPfHx63I~8&?~CKZ)7yGJ6kLRgqGhOjHd6N3Bue3c?}AH(Y5CgRPFE;M^B;145!eEO
zz68G}LwkBGJh5c?t^xsOBUZQ94}XWYw{-<!KGT}9*f%^2M<$LugO+0(pvZ9?*<LtV
z69IMO<w(EJ3$5)yPyYZY?vt3?L`*9@S)(Im_2!msr3}j}d%ZVJSu3CUIUTh}VcLI;
zK9M<Q&RaykdK0mYi!Dbw=3vEV4=@l?-pT3rTnPKOX~@XX;AVBd*;D-a;kUQINl$sL
zkjgp*oY<jrl^tIUry=Z8p*+Y8o|8EkrC&#bPj@&amcV(5SwK_@S#+2&srpaHb5kJy
z+Z06Pv+iOFKs5AQA~0(znLoEk(WSm%0e0`!wd#TT!d$a=8@d>r+pVcd<JVP0xy$QA
ztr85o52wSm?J_xa@ALRTBWl1|tgi=rAXkp0GD7oNnWcE{uDT3bI@jG>T=0`o3tx@3
zisQmp-wW|-4!F9LPo;71>iSUeFd!{Rit!6o2n>8JEzL#+rH3V|kczAY(9F>vWDz{h
zBw%OPvC02%l}|1Sx)_=QqwKnlari?vHV_rmys`HeX6j4`P3a0)G0UY+GzH$2)y}5_
z_q<DnEEJpNe>3hvyyx%+?H&raEC6_}4Tv(Gbv$)|I&9b<J`CZ%hHY<CYmC3u*Y4kP
zgWK#k$FTiN@4~p0)VW9d31Nwtu*+=w>EYpF!_~)c8_Et6p&NY2ku)&03gXP>(vHmJ
z_0~ty`&GleYe~EAwzV0VklXKOxe+MpFUXH^=LN%6wbkjX<Ma6<)i83NNMRrwpv(nJ
z44Z=<jFW&4$z`wN`L5Fa^kBm@^|8`?x!OiS2-`wRQd$lS&(0`i>pvpxYMHH7pOq!C
z`8|c}t@bbE6m$C~)Bhv&!rJ^j6wx7@J6A9BV%gYsy_W0HqrG#q+V{SE!FdxsO6q$U
ztsgZ$`Fat3VPe(Gx}mY>DRpmcKw{v0_FDH3r3mu%<KyGfV~)iEt=+z0ESn=O;a;R^
zXxhJxT9UcVfpO<4@7GIxlcjiPvJxI+lkio>Miwk{Az9}dHF5qie^H5S14G}1mN}bG
znGivs_C?*jE{6yxdKRmm+Lo5irh6ubkqLIj7ZN}T&~M>+-vn9kdEn<>5_(uHA&<|+
zI3El$a+UzmH4IUQ!{=(CoHweIARBhvE^^hPD^#ZuQ(`|<S+m9Mh2vV;NIuLaw<jc;
zQ%Aecsp!N<(bf1BY&Fw#r#V@WeacI%I(BP?%|TbMU63$;W?BZ&{=zO|)8*5md6Ny~
zRad}mEEuNxTgZ1`*hk0WG(!-Go6foKNJp4puUHNZ+NTPup(kg&qYdx8P!%g6^fLo4
z?TW&!K7Fx7LZSWx-`gfy0!IYB#o0{#T~V)t`ZmCgv!&JtmE~b`>Re&#-^6m)j${Su
z)B-S@<<Q<><2F7VK4A%drb`mOe>ujB<?;EP>_8{gU9Ufdw+FKq-q#+^F5-rIi-w~U
z(59$(%&3vw!giH+C$cKz_?z|ueUm{|<}Gg%zE8eW1cE;!r3?H8y0~eCm|CspOvp@i
zY|podX@1b0S!s7tXl17!vMLXieQcaBxT>+%AH&1UeY4RbE!>)4Ny^oXYIbWJBTKTK
ziWpih*--@SfRLm)X&Gm{I|BARnwv7sTK>BIFJ6#u{i{Y$7*~MAwNMij{V))kdm3o+
zJfI`2n)Idh{8ln{j0Cy<xY_Q1TU2>0cCsHam&$}}&;dFaKcGt_t=_6Ld(?a7&B$lY
z_-eqAwn$U9lA+I<-a2}MNJfMU2ed{oWH?$q)T)Ixq2p8D^l4bq$BHsDbPBQ4-`16h
z=Uct|8<QNmK6dH&9@a6}TeKoxDe}`=ycfqQGDHPS(}b=kPX*pjn;Y(}27}7}Bu3+8
z>91(HeJ6oWXPAyVO|<{+?`mT8A5h|dURujD!1At(YwIwEZu2jogs%dj#qB6@4ZDwe
z>_&GK@!#7>sbPH{PbY1K@yh9##wdBit&eYlB|mONwSK*N_5AleKF_!1r@Q$GGMK3x
zo)!G)v+mc6taeT%NQ~8rK^iqPG#Ox>Z3}+(;?tK|TIV7F%YrE_bGzr3YD#{*$dC_M
zHYHn_<#ZW3k-0nBw~~LUi^JR5*(z;)LCj#lk>kE7Q~RMmX_A;|`WCs*N|0Pu;Or--
zV5a*Kj%JPZcId2(!W{lgrW_wljYxGPBF_Htzopq{Lw%8t4p-Wo3Nekq%s0h(h_yA(
zrZJU@Fm=8c&+X0099VBCLi|`j&pWxILeDNpr4w{T=fA1y<nuUR)1t;gCKPb~*Yl%T
z-|t0o>vKlIuf^}SXS1_i!{a`Z-@Ptbw$~`bdnC6dEBCO?bl(CE4$OC^M@-$}bPa{-
zKtK2D;k4Rrt&wYaVU@q7>Y#5Yy%bft4Y`KT{{zIDNr1gf{^NIv#t4-%>57}4##^K*
z;Aj%x_kLmN((h^Hq7o8*_q<9*TbdOk@d4I(IFa-wva(2`_kYKr!tehShps^pgZDzN
zBm9TX_>3XE<KME0j>)r__P6XHAT$mcUT87aZm>}X`A*2ispzj5#}1j5%HeoAaKh(6
zA9sK2&#wBR*errJS1aR>(U64n+_|!o3%q5X78j=34#7cKoitZas0@7;+x)5spYX2d
z*t<d_lpO3etZIhEU*vA~dOJIhj)lEaa{k6hj2o#?r<?V`vK{*kpXvfNiuw4M$dg+#
zh%g&XqAq6Vi-LA_TX-MxdgYAtD;%1Ud2$Wu6UbDqwtotybJpvs{osi36|V4a>wH_{
zM0olrET|KA>>o*me<)5hSU6%LE9ClI5~QTD32+H(@@wrix06i4*pa@^m~<{P(Ssih
zUM_aCJ8M_xUsSMZ_=Pp@h+82Ox-Ga`A{l{gJOuE{rkV+qPxWHKn_?c;%b6~*#tCjf
zNGGoh*FCAlJ5}oUu70hJZTex8HU`in-??OJ?l?n7vbZ<u)()Dh2F2xYD!E41`CGhh
zCk3D;yw0X`fszJe=4{22zXknvlqDsHLKnUO?winj*b)0O69_)P2aRWrJ%(2KuEBsS
zS~Qyv|B`Ze-2Y|m6;IA9@7cV1^J#qT{`;myd6_#Y_B7a7NyvrKr8u2>mC7u1$Bk^o
zaTG;Ef}j6PCSw<OuhMfq_J{n)Ft6I=Gzxq-7VnXT%o;h%pYrssEffst(~Q#fZ1a8T
zZ`s}L|8H7*`oq70uCKo^w3*N&38s%(2~OtwS#A&^uU!s*rjB{MV+j*}?<i_#u#=l5
zMo`XGEk-vb&X8wRO1~O;G%kg4tSw}ivg+;s#t6el;t-vj*<d?6K8F8_>c!*2jcHf(
zILJ!CL2d0mN=R4Ze(7O{)B8576doNN{S{{R@%Frx)BT7qI-Z7`seQJWU*H{(-((IJ
zxDq(d>XtvkjTwdW)X%O3VU`o3-rrBFfEd$h5(f^90Vk&3UIQsVrHM3=z;72P8G`(D
ztq%;=Z%{OnJA=`$J+UoesY*FS$u{Z7qt{rdb_yD&k#s1H@=0WY4$$7{)+aj?uSjrU
zA%8^{;<D+l-I7qOHqHv3Axp|RAsCwd!}CgNH-Hrj2zp~L`zmol_sfFr9(>McPwBUp
z`aGYLE6uT8cW?Et4+P6?HV%xbx4vWvvOboHE3~{R;e{#J8Bc8+3eomua&b2IdBA@N
z{12{s2V_FD>YZO-jRrHLZZYq2^n6!#IeJ(oF{%DH??8>M=R2zGX)cGE$!zcYFrkw;
zJO$qe9JZZU0zQ}B360DjHbY0<C2DxMndzDKg~|Yh-lmwiU!9_*iu5eOh(0^pg1k$2
z&i|DDNCpx@0er8KxcqbLEZ2>DMF<qvXO}w~S1ji<a_3TwJ80^$w6-&4deKXzy@v>)
zQm~;!8plyf`4_GQI0hU$!Hr~gng^^&%#!TAn&T3!#vMG9jepoIciDNmkIELFH)NK6
zEsc2V#N|@Fr{bQYBS=o;A#UNk3KGOyL0S^*IH*x6_EtQtdjoULfia0xG^FCMzwKi4
z*<xAfx~w<;&McfR7hZlTim(`Uxjin{k#l@6p0WHU1+l(}@ylH%sS#eG^FgD_M)_83
z$8i=LGYtkAbGo3X3^BH<SB5dUL1i8W%%wM0k0Y2b$bk0ySAN5U`~{DrqvI5zo%P|<
zt<c0!j6e=Z1P^zbHQh|Y+1vID1wk=FZf{Aw-Z;&INZg6sV2{<MvdqbyQn#*|KZ)2m
zMF{1M@!x01m)rAZ=t!lo(c8&@lcreXN&;0BP&cq*$;yZWFbN_0J56yMxcTKu<$dj^
zd-TVVPX2RA;PZ9u_QxM#?57o3)*O!4o3l)#CQ$Lh&GEc8`h0r)G0XHLrJ)B*-M;?1
zjMWNExq!5!bLqfl$Lr(e)|cmHe}A8kR<GOdop0-TG6!@tg(UFsl)+)sWWQ>=CNa(`
zGsyG%;kW6!teIF);3eWIdd`b%t>(P4R;B*TZWX7XfQFr{f?Pvn2xjJzo%fk^bOA|R
zH6Gi+tik8mksH;+GZRQ2xZmo;co9WRE|@h1^OwNZuyWxPammi1)R!rI)<-f9TmbHl
z`szx_u=bh-KCmN*M9Ebg?h@Bnp%+joH(_bj^2-DYJHvdXTq`$6rj>rW%<&IDxY))2
z)ZUf*ZQ9ZM#9LXRE}=4*@>U1zRFYw%Mm9QrAzGu<X$H`&sfDI<5aM+GEBq-9*;Zi!
zx?@A+sf!4bv>ks)njEcM4V=!P)BFDGFSGCg7gG1bERYFl|Mg3=+32$ro7?Lb@wNo_
zuD#*9HzBhkLlM1hD9A<Z)pSB#h(}_{ZDJPmp*GfZxk059iE-F*n`&LT99r2fu~g0L
zeDT!WeQ)rh4QNjq2=7|oMAXFRvY)~80Iu>K|IcLc#%RgJ&@=_ni~i9(8nOOU5-WyS
zwAVb3)*tKbRHeOOEWgMatzH;v<H>7<Ri3P_;If`C90&eL0u@w~9)VZ&lT>XSEXaJ0
zRlCXVi|2ZSFs-Pb04T7GvFr_@aD+qB*Mw7{sAA9j*CU5e!827B%<J{l<y_VoV$y4K
zeV#4M5&S%}sqX499)wf8Nr}O8-C#B;N}iP7Hsy@3S|S){NYtuSs+U|k^X^ci0l17o
zSM=g9<8^nA+(>Us97;;WxZxq&uX)#CPIkV%Byh#r30(zabLZ2e06Bia8#nB63mjbc
zsh?y$QcwTqzCCilqoU&qdxl+loXEFCzZ|3qy10bXbrGJ8>aaf}^R;m5wU9F;1X5UQ
zn!_(N%9CUk$a}P%|NZw(8y+k7Ec$k8QDHz5Pm;J+U#zPCTZk%DF2$^O-Db)5B1JN_
z`z|R-fjj?5;evuCnoKHQm$6a~o)1uuU|L7lgQ4<&naeSyWai-U+Z~AbN{y>o8(v{6
z4h;=`I9q{GCnO}4FP!rv<x3Is66M)$2F=x=pK?;snlXeGXH}~K#WHnjFK3PnJe8&j
zkk2+5dU=D)?Js+$@IlHrmTYSQrsm`_Gz4%VmPKGV6cHzZ$!&O5;@x+#{~6r>t-c!{
z>H<s<uv5uCB-YVK6Nag_NH=k7Q6758aH^1U5|gIRvCFW2+o#U5u()-FvD@Ww_McJm
z8VW8!=W3U#mp<mwW#JX!X*gszHC?OBOm2-hOc)+rkS9?<4JV4XS4s!monFB40E#}h
zV*3ti#06`yN{l2>K<kG+gR<Xm3H{%k&KUK-LOm332I6v{L?IHC*c{dyLas)>3zQFX
z`dO6<Orfl5dNicXusW)np3}FRUFOO(-t7rJUtImWs><SHpIVy5VaQ5F8HNt$A=PeH
z>`eOq%a?dT59ywX7JBr9Kb)g2Rd7hSm&DJMCOezm%lE~t^%e>ZY8%YBVet$(*S9qF
zaz<l2AFcn4i=JJjZzY*P%Tz{A&MJ-}s{SGk_@tD;u)shUusM^%k?|OD4_O>j8fows
z#!3vDP#SdW)p*S_ZXotR12d1LUGx^=At||ic$K4$D@EV*>0+hMmu$4!?|5mG`-YN)
ziIS3v>FuA^k%iD!&Hn?;Kr_E#0|v_5w{L+2R6J%mn(c(mrfylWct$n~(OL(8_19<K
zJ^RBS{bl)`E2mDs)2P&b`+xW+vTg>T#!3xYN>EIRU!ey<%P^R2NRsJB`&51J9eNb#
zQJ_bG9tG|O1r`|}Q$!gN`2!rYO#mlig~&iTJEg50%~eVg-XiVD9qbrIN-}`-6eU85
zUg}Zc{-Xf<B99E(Q_}+EEZ42q8nGC7j)}2}-+}0l>D?V@$Au-EqR189;&mfh+;jz;
z8o_-N{|SNq)Gn4vrk;HAiG2qTjg5_+IC%<BzBiA(#YsAy-Mf3w+O=ypZd}EUhGqGl
zC&nnI6V$4PYxS+)^mqQzMy*t+mWp=V5pc{F(=jEbQ8_~{Kssny-Xn(_3{;}Y#u6hb
zcH52{AIJ}W|3}|XyD4T7=-q3z+S_lxJy)8G$K#KG@=5HguZ~^G_w|*^g@m0D8urp1
zQUE%~<5qvR|NDRSgQS~@=v?*6iq138`4gY`1U!80+O<CDTq<$p4&Ttb{rxD=qyPI+
z=f3@IvGFm=V>Wgrp+6Q391V>iU}b2*b%PTYUUAt|mdTpI&Q@INr5*)(6u93gz{Y{}
zbGk3fL`;M67?4;r?cUr`cM3R`u;^-+K>(k^l=oXrKd`*i+>Kl7)~;H+cGVM4JaP8y
zS&+)bi<e(`;e}TZzcw&3yk+aAotw7|<+Fl%u|G*Bm^_4DL&%ZLBob-S6G&J$;4orS
zP&k1=B77{5s6ZeeSd5ST27-Rf6$_PO9ix<=y!ex3GTE$)r{k-yzKSU}R@yT&Gqz*m
z!O0{ll}d$;h)+KDg5PHafX;<Nsa&o?=a&?n>(JSX=-g=2S$WDvbOu1*XEj{f#qH65
zX}8ZWS-Qo>$4VJ23Sp#P1diK?S?#{-<FdmKm@R^yTN;tw5c|YG?&0HKvN?N?^(gQ$
zQb273WGa9IIGzVOvI&SP7PwiItl~+gi_tCc_yOtb^C+JRMLJx>UZc@8Y%8Bl9y+iG
zlj6DA!r_-+xpIB%op;Z@dE(fMeFJ;9?>x9`*U0J>UWlT-WxGk)Km|JroHPgmLS7QQ
zBBEX7&M4neCfrE2=<V|aZ$GjGK8Oc`+7heuLx&C>JMtzwa_5tI%0E6n-apj8a^*_O
z=y_hgKUXLg<GrBHzig8!I#=Rxq4UurZ$alwI?v_;<Kx%*hw@!IqdX=QcTaiz%aZv$
zpX<^8JxOyv|FG!zcmZ(`;!1*dG)8TAJ-LHXlj*DkOC>0BXE0sxXz#Q~fgS~xJO$W8
zJ{G}V!NU@VMxkNs03ztt1#l7OTe_(3BT`x*vY0_=whE;jVcCRZCP(^5zw|deKRkWy
z(%UDFUp#*D)t7#8;`JlRTx!pgj}j$$^|Dby!i$z5rh^&-BbJaHWOYrf(j-aI+8-aW
zl=tLu&=^b4>4Z6$9o)HdCrWe_oKy`}@z$+d*@m#G1?5T+btW*H%~@=To83!yOo52b
zsgyZ5GSsDWy;<$j8FTq^x!k35*WdJx-_(2Hb`<E*|8`Wl@2@R7K1NS0>J$;{C`Nc4
zV<fADSROJ)nrbkO^vVpDpkzsr_9Z&+9eWh$QQ-cefarvyN$>*W*eFf~m&NgbBokoJ
zqnKFmz4OHVQ^^}k8ln7!*MtCUB`8}}Q$cnMX|=$JT(pQ@tP|@^->_!Q`Za63#~;6N
z{PgiR-aLEp-M{+rk7ACsa$tDRrp<ddZnbhbJDy;VQPc)ZE^rRS42>K>#^6YWj}@7P
zdfn8ebN9-}e*z9eTL~tf^e4A(-+uAp#qvy<Vr4QJd>L8mXB?nLK(}lzS7cM}Uizgd
zFi+>%Hk}J|Wq7Yk=ZJHla}GMQn`1BCRSNXze^<%5R5Dv^e2n-;YDYt02ONt`=c1{#
z1Src8A^^GrqWLLuG+8Pn>XoBMfgS}u6a@%P6VrSJ*|4(U6Zp&Ddrf!+EGv}!p)$X}
zZUP(!z7H(MgiHclU@Q?Ouigk6RWr1mcwa8v_voR0J9f^NO5gp(bJwOO&b@p7(wTRT
zW?x;ias7&2TSiu`O4)H7^Gsc2rqB!I9foKDt!{=<w$br@h`sQ(dw33Ra1Cr}OfVTi
zJT`9J_~vVG0*KeFSwnDW*?Zj5xu@n~ULw%{9v1C^dKEO(iIA=7#X>2f^O4uzG)${Y
z=O_#d1n<##F&e%{|HYu{$DZe6<6{O#gAMaSO?I0V)sk!w03Ze&vu-G$7<ID9{TAV$
zOi_}^=F>eX`LQ!buK+y?+%W~({Z=q99=CdAU=iga!H*+fJTX4*B$*z!SB*NC?(5G5
zRSN@Pp+O9a_Vl3T<1kgNyP2GAxhb{d2RdfG(PAf+d~)ckzxlVPyyB(P=ifPY;`+&V
zU-;`6jTyn6BLDzE07*naRI?}gmMweoz#}O)k<aBanUoDu1_9|h5V=F9=w#^%%tE0M
zGxTIUL$T2oqccA9_8A}=HUI>&0It;<ULqNHbr%3_5Cf!JvrY|rpC5(-&>4K3NTj-S
zK6&B{-tJvGqr%6B44Mm_@%!whyGnr`{qHI{mr7=fj*r!vpU6CXccRr*;4iN0kuHSm
z3q(7oL>zrjb<<Lz*<CJ&5FnZ;EXcS!O0<dA{Yl-~{WZGGJzNp$b#LpPETDiQiz1vL
zH_@^2A94xhqZD2LFvrdl^A9c{`+SDHq5ZChsZk)bp%#l|-YYzbH-Vg~6(JEdm{`g7
z^0H<`f=GDfKA23)UPp$4v(!2C#kc}H0~6cghFDAh8O6ldFhy?w0mKSJ%R8CDUHcx}
zwXZrpckJ!AUOaL1?8#H_o;<T`aA^Jd^_w@W9~~WKxm?uAvRTP{&`nWxp$xrMdmn|>
zgFm@f(SM$ArCB~yTKEXY9~6e{q(}>1C-=A+x6$xTM{{HLba7(!y3th|mSb_zH<GE;
z3o$cRs}|jinMx%tT)r?eI20M8sDerH-XkrnyVRz8|J^<Byq(ukmm6N%`Kj|MY)?5N
zQm9j|sEf+oyi2Z|^65PO;Xa{rOK+;FX6FGya;wvL)&T*>;w@V@rpjY0H;$~_IE=qz
zek4`)W)0J*Rtm0ZCR5I(%h%Z7u}x<s4;RWkm07p6owwYR&js}D7C`y%-u~@!yI5~Z
zA|4LZUUblZ8~XEZ;dnisvl^Zsi-&FcZ&<!m>E9)Dm&l5Q(Lt0Nk!nMZQIB5m^TN}Q
zf&L3$e{aqg8y}-ECiFdo7FhvB1BI92Gx{V+juHoo$QATdU%n#h9w?R4`?h1tP(55%
zeM~W~8e<m9Vjx8n35$6JBve%FvUodl3>0NtYCjt}x8Hov`<COu`LVP3AY-c<IRRnf
z`l;d!5<3t|!SfjQ*opxq&=}N4R0wqY;G;}eqK4ox6~as&=l^_CB3tee=%L2Thlrmc
zAx;ItibOna#R$Xz#h@_+cK`|zjxl5hO@2`U&&3bnc9MFYp!d<)d_@d1LcZ{tE;u+!
z)ZdY)6rJBx)kWgz62qs3I6z?PiJ{!1pMG-tqkGHM`uD&07ubxx{pQg(UV9nK(aoDT
z@7}#z+}fH!w0oc0rX+|of*3jn=%h)Qs7Vi)Pk!hVXqjs$aRE|s>_6l_J;Va|CI+5=
z=r(6kL*a0i1W7ch^dA~Jh<QLavSNHyJqh-gNS%&l+eWJv6a&o<?Qo`6Ir!<_l(VYO
z$qs^AnCRE*t+{%2F5B<a!UF6-&b%Nx%1hcwUKkl+cR;w@BB&g@4Eh>Y?RWdA-YjdL
zGL=&`ZI^|wlA`8TN1+M#1e)0_S4$~3>)MHPXU^^4d|+&P!qN%iVN<;55w)%&7Uk5k
zgYYD~4@^+_fBHZESxbgg5<@9`MXqENm86@2&?+kM4`2E(opB+$pnj5H<pjsio9L$e
zf^N!9GP_%8Ar+%M6}QM3*C`p7g*1xfU|vB5ZH|hlF03I@U|T_G#>~)<*}D7BfB%0Z
zNFXyYSo&@i_11iZ{{Q<QwDb=BrIe8ZOTDzs6<qC>AzW(oR~;*=S$qE0j<XqEQ^-v8
zZSbXP07lQHL;t9Z_wv{g+>7>30m2fyE#i3L8^{iWW0EG)K>!5oFH>TElj|H6!dSEt
zEv<}|(3B!U29a6a)o1yRTmZW1XrN%?&e9`9lb<B14hb`pNk^#={Uo$icc_WAI(rxn
ziXOsgGWS&p2n*{ewu_`LKe0G1wmsPMh*TEnEVo7vy#F!c(5NB0;JDo!^6obOA~f6i
zr~zByiKsXa_))O1%rzoXD<%}`h{Y6mO`&AP)3KJo!8VI7y4L68BYMOIH@;sn8;5qa
zIa62gQj>T-Y|<x01Zl<Mwgb_A_aFRy7R1?Mgv936hYufSTRv9FpMLsj>~{wT2LS}Y
zP0<}H`(344o_`}hMap_nSSx~?pml|f1xz>Cv*~t)oX_CqGbNTH0YtfZR&XA|=YRd*
z{^t;55k$QLUSQT%%i<18FqqsT@SiOS>2h=mEwD^e1fH-%=h%iCk>HBnEBOn8RC#t7
zMsx%U)iG?4nHs{PdluoS1PG<>mVpvS6zd$@UxgY}zZsT-SdewIiFi5|*vpqMx9tR5
z<!G{m(0QcI`8U({Lwsj!B&vz0BtQ6uxlKfYC}$yyqLRY&j2+Rq+sRjrHC^(`y#$=$
z$H>AdkfsErF7ABQP^Li(F_{|qsZhI(l?<2aD8_*DfsAfKI%?W&1=vjHbpc!FgnFo3
z!gc(<=n48a0;8syQqf#>*Y0Z<q8RRhQI$Puo90V_{%!UUdcJ4*1NkZBDCI|`xrFH7
zrSv^IS#*3X^?>E2-?ozgi)c2kdIVhEWo%h{4K3CWJNG{LpQ`TCe`MCs?cma@qQs{l
zye^48G)08_8CvvE;zNDtr|$b3m5Oqn+thM}DJo<oPeu5)FmgmmWyyLRUxl*WG2;WO
zFJfbU0u!}YyP-NLnXKflPE$gEbcGK~>M*m=Q&E9WK{9Hb5I{KJFb@k!?x3CkT$I72
zNEwWJm_#Tv_SAUfkw^CK-8(Tcaqir?)2C0fE#J4k^)1xNnDg$~v4dl@<uxy>R7${K
z+l~v!hTsA_M5&A#yGwlXk8D497nUA(P(-;k>MtEW?|da_vkF1iHkbhu3z{JUv}~PC
zCj&W1iGa-%L%|&G6n>aLiW=pnB=1r^Yhe#1XUU!4<fD!5M9CtV-R+}b1NAC7v{Qs9
z`3~tgkA8@Hy=+2`@gP&0>(?gZW}2^T(t*!hnq9>2N>cdo#+>Gf!|<u*5{eE287W;`
zO~@m1UzZ8!l~VBeXVt12H{2;d+*p<m{&q|L3I-^qkcSmZv`g9+svn%^J-&k~bZQ}m
z@0`mSUm@$lqiO<Gz;><Lh5fkb&+G57{P&o;OX+osjzpvG932@xX&D5bh)WW>l7+O8
zZ;`%J)B-&65FAxsYFnEqnm`c1@w^lhG6|8C5J94SIFTV_R^31-$-@r;CEMH*S-f_1
zJQ3X(julRpBb4O`+bh<CI5&`5DE$GcEzP-0waabpZi_*AmCXHk(ZfQ2D$H^RyvGu|
zq;~j5lqiZ8f@a!Ny)K8#jqgQBmB+(#06z+ZyfJkzf`C`7Sb;ur+qP}Mua{qb`SRt<
zFTVKV(W6J#tXV@`2-3*N2w+-e5JRIZ;0#r<f%U1`2z^;PX%YIum#AvMBZ(9D%hA5u
zV4NsGCY>TeW}J|@s)gv1VTGoEh6tPsL`D=z2Z^${_1LMs>YmY^s>dyqU_2Z#l)71R
zTW}J6y?aNcs-V;qRjf&F7?oskRIaG%sPizMOc-V?){Gf3+c8o!hppMMShZ8gyYVTN
zkLW{gh?wqdTzZ@;OSdh!p?jx#^@1T+(MWz31!2cJMGYM4&c!)I1iW3|(JH9?BJ@-o
zq_~7)-dKg(J$$dOQIhCy;R_>XTX<#R`HhADxaog8V!mhjh0g@4-B^W%m3z?77abqN
z>GO*zO47-+6<N|gNatMG0JI0Ibu>uSED!uqDk>C^)D{ZJ2SuXJWksUMhE*g-1oG4a
zij3;))<fqu0pAZ5JymCp$R#9&gZR6F<vK<z=#K$%>epN8cl#>{aWx?EYe&*`nn_9Y
z=pdJ5?Cfx{2$%<+lNH@Ls6WwTsc~1dQjtL_R^TKI?v=O|La8Uz)!reZfWoYbhtzlU
zyOClTunVqLieVe)DiSfgu3x{-w$-Ey7cO9QF)%Q|HffJP{wSC%pYQK#l>IR9yjt8%
zfLw751T>3|S}k*<M)uoNnGe?j!qck1sP>g-I)IEd0x<-mdr*7<L6KCGrEewe+r6ay
z!E?9F1Ug+L>RH_z<sI_>28~JEHiJi`MDm;DxEM!<FR4xRcHY)K@0LfEuU(3mRw&L=
zwYVNP44c-h)oPfU&~QSlyOVD2H43^%LX1*y68&*54Iv-%3>B5&=I=Z}y`n5ctf^r-
zx^*#*yz9oK-mETCx2PDZ+)GkGbi{tQ3aa33;&m@}PgG@ZYO>qZ|6bC+Tej%r{mXyT
z%@5nxyV!4UPr92rLtAV{`eP(&#BHh@w^>X_{V?BQ+I>6Xs0Z_?Rvg=8Qs-?#M)Xi(
zt^@Lv)VQg$-AeX|#?fuSl!u6(s?P0-qY;s$Wc>mm#jh9~N5d*VGZ?7bKKvmT;k;;6
zjNT<#%ufO(tB8T4(Mb#sB|Vtjdqcg%c(H{&No49OR9CZKuowj48&g3R*xQdN&W=da
zJ4U7g^Qf_lVu^qT%)-#3i+3Sbgq6`2lOB8QvGeE8vo9UTQ>RWHJ$i(TTefW5vSkbU
zW+Fvkd5pbsL~YO=S&;)OWCfAM=;fD5Z3qSb)3OMemXGCatI-P48Ph@Vf~TFOr|2=%
z*wP)Sk-#;TxC}J!yjOLW&KG`CfAinFXH2XVPDQ>&{Z3#j{sRiyMJ2h*M`qV3Yv5z?
zR)THC*i>Gsj<><e)H0vX(_~i5GIhJvY*njOi8ey~j=Rb3QS}v&ZIeRf-X12TIXWUp
z5#;7qM~_2S5lK*ATwEfew^uT|B)@kjimJj1QQ;8Kow2Z6+Iv@+4|r2V^L7`AxK|w`
z&bawu)dV+O?AGYxpns?DbY?;CSAHeEyQRIM4iCwf#l^?S5v46^vxp)$y)tT&E`^nX
zxupk;12{{@Fn-64L*&Ura74?Cf<<CIDbs%8#kMq4dS7TI3QG~q{0G=|vF`#J&W}}c
zqarK$L~M<$6sMVofRe&3lIZ_xoQV8{;J`LvMJ_P7waEhA+e1T0gxc;<9cA%8N0lGX
z;$WsIjfW6#6mYEMPKWxCo#Jq$$&^$7L&DE6b3{-2w4fpu)TLQug<a?)S2`@p<%nAE
zL+yv_r>;sh0+D<LL`HBnA_TCQ6T~EdD+0LO$qk!Snw*>*A0L15#TUS27+sL?-s6`o
z+qRAj4^`^bgal*O$djCv23d?{RnAeTh+gu(M`=@OP}Ppn3C$=MZQB%GvFMMn>_>PZ
z>=|6>eA4bpt4wpVIv6$j!mU7}GPK`|>n3MW=W1W-d`MmrN?-C(*Z4|yghUcVt_pVd
zkK``vnetxI<|sRa=qS~u+p8;)eS#3AR;v(N1j71A_E^GkoP?8zX=YHDUq5n!s`3}*
zBcxRQi%%I_qt+3W)_sq7>_w7LU6KbEltaF9uL`r%X;)-q_vytrQF5|dY~Dc0DI}$k
zF^;4@gzBoAOKp8$Cioo!bv|K97(b$ayJ@5%>b@}MckdT|`MBx7pfKHre!ucdMIPd`
zxcFERHkxyTT96pg+H}-I1yU$@!D5+#p9l;V6E70<0a5&vhoM>MR-$t2z%1!;iVpJ0
zD28@~4ko>-GqaCm$O6(V5O>j41W*UAwPI!<>e2<bKj^2*{#9hW8W<v>uZXO|x+&QU
zfqXpZVMtvJk?$Zu$ViB*AxF3u5TIBWn#(T{rDYNmO>AOB6`+A0o(8~?9DatR*kHoF
zoHc}KQ9JNWenJWe)2cyHjq7*N0uSu1w}Rv73Jdh2e>9<|B<e3}1a*v5umEMQa}hL*
z$KH+|+pxQM^Ub$ToH%jm(xq$Ht`SO^-9<k6nNJ6~bUNvFFtJdQ=TwWx`G-!teWA>e
z79};CUZ_imw|3W%jzSm161^sKe;Jl}5HsXxE=oW+9*pu)IYc<GB|$vYouakv+#vZ2
zw+OV7KYEv(aI+Q~n(R<0cPv6uGDIMzdNnVI3P98cDV(H897sh}8DgPr5Sha^0wZWL
zm>J+>?jk~;YhvHBcqyX5JRREQ;)M`abl!sBWa&)m(5;aKtl&yi8(_E8YSD;E-tUzt
zTqF&;H&Fhl#N8`P=|mVpl}EMvJP&Y<D?(CPT!<3C%GkX$=r7nHD%giBKgI6Uqg|bc
z?BmYx!7o9ao9Nr^w_da1xT#hE?-2e;qJ<VqS|UN*)MP9sGUZ`Ns_V3&RR=`0>m4_V
z6=XF1Fm728tzN4R59dq8N=E^66L~}*q@31u?#bnI#bP~~jMG`YCIeE`8E$&yLHx>`
zLb^2dX883HOVLmovRPwb;JMWFv{IqAaz(aStU;A#(<dD59VNs|I+Ge7pN+c?i`5>Z
zpox}GtBY!{oy6}6LtBw7L<evpoqy-Y2l<oP0wXX)cP!0`+jG+uJ?1;6=9qDb`ysO6
zj?7&jWLmvf-4qO-ACxeS=-wFt;^=FHl{sH{Q7M~4>??+cmp%IEqgSq6A>PH=cg}&8
zPoFy*k6WL3>M@Ws&l1rBdu5QcDUmKjqs(;@8AhsQvgIq+QCgECMKzmR)4MnaST%?Z
zr|GFoqNV%e)6?8ZlF=6=fVx<?AF<Pl+ZNft63qq{9h4SaON_4&B`dZc;C>Zw19J-t
zvxHPK(X4}<V{z9-5gP<e5P!K^rZ8fpA=yZt^l^nic!g+3`PU`U-96z2jb_t!<Cf`A
zV2r>qAfYgLJgRAiKu-mU5HnIRzDN)|D!@Pa6L(0<5HHEQS1{_%A0mz<#|U{@>^;cL
zwAEtc56ElCUegG{!1p}cMY(U5suhZ%n?}9nrBW%jE#&671D$I)Y(nQsr6O_Y1VS%n
ziY7w0yus38fGTXmjDjNB4!mYHWx+&vaw#W8e^J3lLoj`?R4O}`@J!Ic_`lxp>mkNR
zdQ2GJwg{ZY2JyNjYdCJ)X6{75MXaZYR6`$$TM`xgUZ$x0O|L=mEDJtwO4pXaHthV4
z)^f2K;$GJEj^A|``|a)CErN_rOwSegAi_i@isG;1#*-GEK-Vr_tyrA%D-^D2m%
zMJ|;TV@V_cgSgJMC2OQ35qW^HovmmgT-t2rS0?-#<43hLTS9OE%hZ$uRNZA<k&*~I
zoOn#rVx+;e&voNa``rsuL(2!c_brJ!FLEJ-$He%7j%`C1hOB5LX*48i1^gDq+?hG8
zT#5}0^jSs&v&lOWz>Fg3^cf*_)3IG+{Bi~3P=*x?3&TiFst^LauDBiDnRui<Bnb-G
zj#jU0P`e&zgUj+kBaAiSW0o;sL*R|lUCCM0IEqLQ33cvaHNhEKl~%7_z3;&OYPI}}
zS6-f)8Yk)n8{z)sCqG%SQX*aK+PSONXoBZ}ag>gy5F;c;ZulHH#gGYJ0iy%@5HHng
zop`er`&=6~T+dKiC=?C5l}@K`sT}P~;|g2@i-Qw1-IlRGW`Te!b#HiJu*v?~^=2zH
zY{St_T?{J%&0;FWq|>M42%M9bRw`nAZL`QGJxp~bx@^>~Z+Y$AzQCT$rAn<DR4oUX
zjK+b${Vf2n%)+Io3qPyUFw{jxM0dLcA1*(kGgFIzUCni9rN_*s=T+*pfuVky4)jB@
zZ7{THhS)%bp<`M=<dBZZ@}*^RN2Lm?bEJ*bsQuwydsprf`Z4!_4^5N#laXZqb;F=W
zl~OgCPTiGv+_g;9=&N{JWr=WC90@G0)jZ~Sq0g=*jvlkaW~=5k5^0A?rRHdYH5sF2
zvMD?)p~nq&T=yAC0>4@DCE5(kCOzd|(7z*iJLTuql}fE#uEy=SBWoS9K?R$Dib&|(
zd)H`q|73J8ktkd-FVsB0S*Qiodh6?d{O6U1HdldN;tlq~L3u3v5lO%Y{fKd}QN@CS
z^=N@tYGOd0S!9e`dfW=Las9V`^GmMj=Ta%nujX?3dbH`-EgXhSWd4b`LmO19p{;3o
zUhv#=FQ2|rX~a-LJ?NAa){=;l%6^f_I<S|eUKy6LV2OqCu)(~+N!sydy<nU5!2$Pg
ze&y+byln~b1sr_H)R)aS8eTk}oGVsqGv>Fx^B1LRtLE!<Gz7ActP*(&C>HyTSdi$1
zlamsvF7O~9gRmJit8U!yOY48{w|*mO1qoA`5E1oC4J5P&7F4~oNEBcSE$U5Ef+z3_
zG=)h-hZe3{yXMlRcTb&udwOPqeMYWay+AZ&^vYYe@62ZNU}>IWbZ7KNk^r3t`UmP>
z9aCoXTV%zrjNbrgnfIYFW|L#XbX~hzF9E;OnRz9*)Jt@P@^;$Uh{-;E91PPjb(UOA
zCt)VsSgqW^yc)!tbR3M98&!|Z9z4O)qQC}Iu(X&+rdgQ@)1jMmDTd@MvM~ZA(YNR*
z?^>jUB2Y`}mSJl#qf)HF+%6WymgzT$TSo<%U5lBAI;RNjiZ4=OqHGl|zxfTSxtV1!
zd9i6M)&rQ<B;0hpRAX$3X;#vSL+wU|2}$Vro<M0CWgIMEl1X-+Hp5IN=_aeCyBdn_
z=3=6rXv;``X?y3(J$l@>bD2CSvp83%R%<Yu3d1tk@ZV#=yCvs)cHuqRSe$5$7;52|
z8^?+X{VTH++ppJ}?29SuCI)*|26zL3mfbk$pVg4U%$yZ>(&-ePu3E3rKw=NM$mzD_
zXYrX<V2Cs{qg<*X2V6Ua{<0pz5A#j7$VFRn+21>Stkn43`s<lY+H7ccMw^%|RRd?L
zRF5a!N;NbQtja|L(W>r#kdGo(B`6Y>+0<2R0U1#Q(6LPB%_Ph)?P!KgXRD!WFP3H;
zM=icEe?ZEBr$vR_=3N=Ncvcb$JW-b7N`Cy*`RQQL!z5JFLvR%PMOZp)8Q5*J3@ZaC
zQMQp;2{Y9!hb{{lUdyY@VM3V5Sjmj$mF^6zLFY=np6koAd`i>2dug)dX;qfXoGi9N
z3|-QaQM-u2y>5WL9~yp!eRU=yjaD)Y8Q6wL(^Nmf)^M$`UaANW!mh9_G*T9MkgJ#O
zE(Icc42EY=W*gC}USON{hIJd(tX=Wa%Rk@0fB%Jx=a3ZV&YgSv#Hkf4R^i6Sp4Fga
z6vl8kgLi~Sxkna-mG!eADvS$SxKPJD^uD^E&E%8Gbfs2cW-j9E)+AtnUG(UPCi`9&
zE7iIe#O*{Xoz()_#Dwmuo3To%#MC~Q%~I@Qp)%aRERl%6b?sQAj%|R6UbN{q1Yt<m
z5_zYz$i}_lotc>#sunlH1`$6RT7F>AF`X;du4Pgwg)^8E<H9FPe-v-A`T2ctia2wM
zp$sZX4Yex`gx+8D`T+zd05fzpXj5h&nQR_w077UF4)vim96Rw=v);rUKs1{kyK_l+
zlY5$!y0kD?Y`9J4Au}@tq7%~Gq*AG1397%R#d)8X-G=^BIQUYBcqr#FB!hvWd9hGs
zC4`QXNTym1jM3Q+UIu&qn}+3D?&R2|P1`ojS#ve7TAVF0=VC6}@ceux|2_ryh(Fzi
zlAZFy=MXw>wftra)A|0sVak8?`eip8!G@3c=KJ;WdxMYX`ATO0$^&$28dz*3^NHzV
zJ#^xsnb8yd=uLwb`s0VCNl<gL>?lX6vS2$w4nW_QpU?G~LDh-Xvwd1nabe?fwUJ1G
zwr;;48@eCKAx8WrlaZ0VX|owt+TvQ64fMOTb<=&o<f!4HBwAx!g~VG(#xNCBq3AJ4
z$P_>6+O1eMh_M;2c70mQ-=3WBQPoxqYqDykk?)H+M3hZxP9l#HnPVhMHT1{0@XLcL
z#1%jg5N>IHQjQNvim~_}V`GGtS*zBk0ZqlCUuJd}Hj=gwS}=sLX;(?2AkRZWoO@VJ
z<NzspVMJoD>9M?ml`6o99Y#i13<G=#oc#8QGjp@W=brs3Pzhs<?j|E~NBqe^*nuB3
zJY<UsQ-*>p5M`Kc8>|%9>mFvVVCZ^1n*6~10*2+i?U%E5-12HoOOGYfnM{AYTE}+6
zuyoh71y$6_C9E6+vM~|~JAM87#BY4*H+SvY#q{al{=5ITS}W-`O1F5uSz+L4Pd+7g
zHN}!o)xYIrQ4*CH85{w)VCu$_(XK;#zwpej6z57``^MMjW@Z69pyDXfg0Ac=sQGOm
z$f6YOrc8mw(76t}%_Q@gzP@^`5r!x*9fL_oORtpKRZ$P?cn)TgR{q-f*sp)(H+Syd
z!C>=m{@uT!f-J5v`@saZ-EKE!f0w^Xt97%KH@0H^`2jE_MP0jU<KOw)UuCNH)?07=
z@JByrX!nQyQd!uyFca}={)(ZIxKYz%76$i<CG2L)!)22h4B7)$9*T8usDH&j{QLiy
zwY1lcy!Py~&jPRW=|N|}DVGXe%G}G7p`l@xbcnjbTFvh5`=0s2uNLOYfAq~ioGe}H
z^yPb5oTc)vMaIVq7E^AhaSO+eB$2NYDdy&>dT2KdXQt{`JZzzmo1j4WD(XcK@{!?(
zwHKv;z_$W?EVimBk4+HVbjc3NS;w!|Gy|a3(y*O(SpUKhg%qif+cbql%wYqZEke|$
zoCDCBnv1cYZq@?7^3c#zlc-8egCU47{&~u90!u*%i<dv^{4F#kDcwkzX3|La31Pz;
z*2--<ewYu}O<Gn3y}ci3UQH{M>IDz&mKJl;KG8zNMpD++BwC9^%mAb?Vm>VVjCpan
z+|XM=Hg2Fw5v8JS#*Jhg`#W*xig1l^s5+0j_J^T*@174!0aU*V8Ou~zokWNYiv^sX
zgGNxr;sRuBJA){g*OJz69=|@xCL`~>^A3mvjq>v4%L#?CbLUPhkGT{0OX6ly;K%SV
zVk+*i*q1a*r4pV-@gyofI>bEUQDx<;{NWkq1<hnKb71eGJv$G$iFCaZcn!SP_-9s1
z<@`X_6g9H%)oXwL-S5TC)T+L9q1LEZo26pec5xgeig1nbg7r1w7{xyHvl}BVEY^4d
zaWF^#>sU=D2KJCMoEz?!4eCXm5fulR+(yL|iqdiQ9w**wnRYGkl8MZ|eFyjM+>d1i
z=CH&RF%4S4tdz_7{u}@e*;w_e-~Ha7Ic9ogdOZ@-s|4jj9sMz_T&dJE>4Y+mS)>|3
zn;Xf)Kqkb~QJ$hb#iI}xhW`HHL3i+C>Ke8QI({_#zO)nVZ(B~q^I*=wfn{I*(%(#=
zch#M0nP9b|vBD@E%A=iOwWL@nPR~yL=db;zk@za8k+4tKN|-Cj)|RzuLkzIr3pFho
z5h(w}^~o?0KN=r*bKMzCF3%>x5)5JVtVK8ZShSjb)QlGyAKzTOcHdy_yV;<l#<V5@
zf9ytJScz-}1&0e}hF2aMJRwLyu^=cph8cha%~$qyMm(-rp-#vJ1O`TUO-LeZXQFhu
znW*YJ(#>(KQnAhhXz=Rw7NPQLjatKMdM3SevBr-jma_ye68WOnTk<EiYs6fVs3>k9
zn_878CuXtW_OILJpy-83-D5d9LTk6B>a7>ivd|DT2aV##v}C%kiiFpbb<GicOk@Dj
zps6{Yh&lAs;6hFw3i_k|u}z|cpeymLc(&>Zb2E{s2fhs(x91`44`?ecO>MKETPfYU
z^2<<wF<3n|FC`*!W7f+h6ognhas!wNts%x3H7{T*J~Ya+v$JQ;oMDHNv9Yo9=g+_K
z#v8=JK%Km9-8$eh*@C?BmZQC}TFzwqHf-J^Rue(M)+K&3GFny2W58M`(GP4BTjECA
z%`O`n-MDg-fiZ8tMgLTLQiw(vaVJqMHNV~}W3S)z1Zo?ajbm%t&9J&=yPDy|h#t|&
zRX(GI2z5o!6}@~>j>6A2fFa?s1joFO*MK2x9LQ!{bLx%V%2EaacZ;9sGW;z}($+L6
zM(3Ejj0`VdziN}ExfE3s;}-Y<*d}yVe_=H&;*a6iDXXc|L0e8Tkwc5mbO;rUA1to@
zv6Q<#w~=CZl8$mGg_nLHrtN6!n{L4RekRjbZcKFUS*o9^)e&!){l_ir{j+P=Y)G5^
zit;Q7!`{$YqfWl&TbfzzD>`;MVQ0Caq?Mq5Cf$!^23?5-6Qvxyw>4GeC)$QVv78*M
zop=Jk8<wWnzO=BJ?rp)Az}prXAB#>@&G_lls=o?$fnoR>qI!rmJ+WH{_Bn>-1q}>%
zP`oL<2c21v%Lg764WlL$DWC+UJjH86G=?J8cr$-Ax%yQOoHr?nU{ippsB3u^1j=&}
znvv%uek+_UrA2`t*IV){H$_B~NfiKJjv{$5kCW@seoLH44()ffOVuq~yCkYSXifwo
z$PqURbjOs7f)ThvU9zoapgB5`$oML|REc^L!G}dHxrzkrq(JmWjb@$MDw<<JGA<?Y
zL=t*S_@)m*$><*PR|Ha1Mzq^PD_$%)P;BZc6qRe#PJUIQS~pt~rJuS(XsHMWhw_6I
zSzHc)i7nk?bYT}5gDUWwY+0h{z)`&`8eMopx1|()7SgJ7&Li@g+ug~?OKu#Qf>CPK
zQkcZXw8_HQNPZAg3(Fy*RbSk~?wH9L&Br|p$f)vtp94)mt!NGiK-1{96_lCl@hTHz
zV|W~kOF*)!JJ~=X&Z(4KQPiX4?K&#j-cVnNY_u@8(Sg;fm3fM56pu^=cuV60&t(pY
zbtync+C{Yox-6+hyUOhv$rZ&X@;pCsPn&z`Wl{|31yTNdX;0Opv%oYovhs`(o1bWW
zm3O^=e<Ta$X{=p9RlK0_7LD5khz9?N?vK(-7$j*PwveDeT;a}Mx|<ZBQ<9w$5;qVx
z5`gJMdUARSl#G}~H;j!$CX*?bOEWV~H{l#OaA5bIoma1nUA=bsjU#VPO-~R2?)4+D
zEnl``-TF1FSFL4pxLT$9+N($0WGb5<#DfZ(B>+tc<8s<kL{z6Pd{tt%iI@+Y#aiL~
zyXRDVG=zf<@I7$gkfS*l#x9QGE_m$%9)6g^PEB36b{r%MoGkvZ0u)1|S+3SKR%qf0
zLIA)<(O!%&t03%d&s9=Xy*lzakW(dK)CrwMKIJIAK+Z{Gh$S7QlhnLgzC?A9>hLqB
z(sR>u#ahL6(>m6<nnh@qS~&ud>JiB{de?%?+nGk!qhsei3kue~^dV28;?jq2NjlLV
z2!~g!mEL*xoCgx46Srb%xBt-ICtW>x{_45w6W6YcUBne^aIhab+vW~cQ$6N^-ePsM
z%~;9$RTnkA2&<^fbXyr8m5}6})R$R`3{g@5A>ge^3dhSA6tc?`{DK;Y#ZdB*qxg!b
z{As>Qy~0FG#?Y3PO7_`W1~GzBti6=NR|x&vthTTXppw?`n*iiM56363zy8+i^-9CG
z5(E`JwD%J^H&<#DPMtY1H9Ky*62^{hg>AhdyT-JPrp@fKR<1Ejn?%cow216BQ;F(D
znde0<58d$mrgc=hnf~)dx{?0T-R)Kgv456!l&%E-OKn)S*0fB<otrMq)rzi@Cn}?+
zIbN$S!I~cKqyWUCkpQ~W)@tOIu`ud$N=DHSV)=xDE|0ZtBE;4!W!p)z5@#_AtKJp0
z@dq4<nqs>+ePFtj0u}lVP#T7Gg{oKfwEp2?P<U}>Dq+Twt|d{1hz$w+&>BuMbX;p<
z66b5Tzi(yT^TV*L8CBMK*bo(oz+#9V5RKU}h(I4}hK7$ZYhW}1E@~u@_?NarbOM=%
zR|~a){EF&q%d+DQy(0Q=g-Vo+fES~Bw`)(0qND&&Xiy;Fjs^-rWJq(!<PAFk{PtP_
z(U%a?Fn;JaGpS?{daT`xxJPzV5=mGz;o*lC$AOy@2^R>Bjh|s^HR=(gg!DU+Rv$n~
zbq}?x2TfLB!$uWl5&maH(!|@3Zj_WcFo;lM8-aCklXkF0jA=FcGJeouwds{Yy%Eo6
zh)H75sy-SZ!uPXlK!DV+d^=s|ie9&?K4uNFr5_R;geMD6B2g)E%p?{kP9*C$tzk2;
znBhne9K}MQE~HO05FZl5auUfn26hlOqOcG<BC-H3dOT(moS2~&R|uY01j)2nYnDyN
zt=5`xGf$xtgyn747RYf$&JQ{;3egZXych_ggezpTszz!*AQXuvwm~v65cG)vk{Oq3
zZN&|0*pNn-6CKqZp;h5ess=B~vbf$N=#tr}1~8+ZjNw&dC^vJ@CZd!r5rjiGMgSmM
zX4Y|SJ)WsmoBRfvAY1YTSg3u=BF2!K(C3&R3O-e{M8aqJdV;t@t!By4L-tccSTmR=
zQ~foM(8UfAStg1x77FVIo0|Y?@wJmi`cV8seS@&f!~s5S)rFg3YnaBANeFW%CV5OO
zFem~agSlxWSrF-^yGQ}1g(^2XSNk){T;XH6h9;P<gr|DBT192(c{PbND_ZBUCT7Ip
z@W|lsNdNZj+b><ZaPHiBcIA8b!aK)Lyg4$me9!KE$U{t(-aUW$g%^Ht=+L2TK22CC
zH%@#vxK90nen_emysj43%#2lUm0myo(vi1brORUljdb02U^nB!#LU%i{PBN6>i8Nl
zERho~fp-jz;W;Rl<_xPfI5ILdSGKJT5oG-qX0jf7El~t`foo?QO$nF<OgCeuSMz}B
z!-GRJlT+Y$VsE&vL+s>6vpO)`KQlXlg+`@X%x5zw$kEt2PLkz&z&o>rOePh!nzd>P
z!2)p*q=X-xnVlQ#8>Sx&Wk$geg+j#)6S{^v+h*Nd^={0mWTv4cm=ZixSo%7$P0k|9
ztE0L&|6L^B{I@$fBhv->M#2QT;||-1ryLe#UqAlx>&IVJ5(kPphxR?f_%b<r>1%)T
z?=_>&9^eEM0hor(ilbTR>1#7hl*c2>R!$eD4V!A!0%AOy0qSUUq6`9YYp~f6|FdMu
z)=fh56-uR<p`nqf$x=Qy(5TfBWyFT7HyT6316U5Q2~D|L$z*dv=S(hcyJ8HD?_a%<
z$)?M-ve&2%42qG@r3+J|%h#1lO|&J6xQN7Zv8iU)$w0Fi)w9gUnv-`zo<+YGktmbT
z_Lv^gU*=IFgJUfZ?Yk5E_MiV@grbeWEftDi`QqPl?BKsX_pR$w7a6@9^=8~kq>^GU
zG#DR@5tP-pF|*9}rAn0o;btyeIJIv5Xt^@S=t2CqS|ygt4l?docB|f)0d>$N`}+r3
zlZ(gQM!kmKAC4vP1#Dc$SD;?avH`+e0q@MYx#G~kApK%`Vw#1ZVzHdC6PL#>Y}~wj
z{MuF9N)HXLc<0;|LY;<9uRq_Xg>GS%iK)<^6~D6RvwulOUG$c?Cod2{cFz(*g3jFr
z3#y-U&d^Ku?t*8&ws}kLZm-EkE3>yAQ6YF$4-XRLqbW1dkbBGk*kvLIt7Sx==a_cF
ziB*ep*cu>7vx#)YEBe(kk<%Sk?~Eh?4XWj4oG?@(;Nb=UhXSC2IN)b+o1kL&50JsQ
z9*BOFn1;Gnt~e%6+!?p!_%LYQPS}<Yp5`$re8gB_OQ{Gc!b*Z>l!Rk=kjg3zG#3RH
z%RTfOfz%Szf)PQAAf2y6HGV>~C)k2Jl*U@#LXQ<0EuB`-vvk+!DY=I#NH@NfdbeKT
zA!MRpH6LoCeFZfeIw6;%!0~iZs?B1j<<MR0<zlf~O%5!pmkN|;Fr8`GjRG+sJw4@Q
z^Z;3lzEO0LqC#e~I@E>%9&-kT667qz#vV@KKa{SQ>$a0fSc#_TR|}P9wUNrC%%lq?
z>a_})hK58z#VHO<=~a;OAOMiR5En9blhB~eV^E%0(E<!Y2@-<lQdCkh9hdn}oI>!6
zqI&n~4?f`$$wVdfuu_6jN_jEJl1fu6RbMg6q-@fHQHhks3${R>>Xl-J`5GfV;}Ao#
zh4&0i>NRn$bW9hbR{)u{GMyuZ%T3a)`(un-Ev9K~c0zWP0*h(`eFK#e9w1RR;jYeC
zb%jIwSLtV{;DS~%9{0m~wK8W1T0WIS+f**sV`d7@if&eH30#-^;U<`XTyEu-Td&CD
zg3r{GVo5JpxXu5lxoK)9b3zQdDnF5JM;F2idg-oEU_oDq@{{(sF=rSO(jy|G#Uk}%
z#3b{_p`iir<;s;SnXg{Hd>QLx;=w)p+>eu1+OY{4mp=aXNwzb1>eEjyUpY{yum~<p
zvLMmeas|vQ@0UJ+l|GVFctm==uw~S10;bUHAELZziA;xn%bfFy2M+Atwqx_^^_xs9
zd-=-P$y0BiIQ_<4=@KCtV9ot|_8i*(bgk;Yed_H!d-kkexrTMfum9(7o;q=A`2e9f
zRvp;CcX(u=zdt)UGyclqm)?B)jSCmf<pwgd)6*-Kt=PV8&+1hhwr$;$F|y@WnV5Gk
z{^F;{jvbkuo5nUf8BZ4GDjU{r+PiE2#x<K#iA+p$PM<oH&h%lC$)pYyj8`m(GEdb)
z5m;n`BB>JKLWp@{$~WYLG(%KzxuBkkPCn{U1d+&}Zt8R^X@ZE(2oC&}Yk}-dYqLR%
zWr~wTrBZ!l?;~4wZd<cqRm?T6TswFA<gt^dkIWUuFbW4o@7%rr(Eev?6%C`BJ$tvV
zT9-jo|Iyd}!^x9pmJP02y<+XbeFuk!2mAZ_CuS#KJ^aeiw~k)8a6aFkpPni&U$%C~
zww-HNt=+zL2Xw9m<>~3^*WWz+v!DOitIiU%B$w`Q)>?aaJhFbn+Lfzw$yDO{wb@fA
zF11ij8ARQXkPhrtzO)i_zJb)j<U)Va{Pl(O7mmk#h}|VagIIeY;YNsCYk*de-MjpZ
zn|=CIpWnD`L;q-Bp)h^=)Tw2I1E<fvQ7X@7GyMa7L%;ra|L(}p>aTzOn@@f6k+mE8
zvT^&!$=AR4z3*SXKC^no(3ijb>+9C78R{FFtIeG}ed6__N8Ww+T{dP)rTS*33u{+z
zc=X^S2lpS!S$&k@`pm>T=guE~^QFsIkGoiKhW6BWar@@IJ9lkey>>8}a?u){K5;dZ
zTERS;y>SR5Q)`D+R26*q4)+uv-*xiBVX1_x_Qe44G@6Y%7^o5uOtYCyq#CVeEUde6
z=4!Uta_f34ov=Kw2;Rk12)$d}Nw#z%5!P*VXUcX0$R&l8Ba2RG-axHr2nbpjS-&Y{
z96GI*la8mVm=t-lQ<IYj8s8&g-H4Oa2qh0piE1jyAnG|vbVe5=aIDTnazz)H$dSID
za0UAc{6fuMt2Kf$rW^q<3Cj%zFXCm=sg*oHdp=Ac9y8;x$XXq*)~aEGnCMzM*@tS!
zui_quA(QSKA%EyFJPTcN%h#D;5TYa!d$NUxkeXQyST|%lR&4tc087jk30~~;5Sw~{
z(=m(?3ojYiacF0fObAJNREC@-ofN4$OwcJQoRv-{7=Lie02X5&%>;*SmYVe{){b~V
zYgUsWtI0%?)d0UzpP4K;ZmA_}AMB(+93hu2H~c1+y_PJ*h-s;4HPlhgqX`4Amgy&E
z>vBYR)4Wamt4+U7Z$bYYc&NV!nTm4`)}0dZBbua<fq>btv$T10@jKP%eLlDpU;#&#
zO0WP`Y3tUlQ&Ur?PMtaZ)>(q}!LFczOP4Ra_+o4KzU`~mkGv-(U?jl}0wdaKfduSa
zn*U^tU*xZ0eD$keO<M_+5uxS|Y#v#&b`_@luOFS1MYX!Wa@p!#Yjy+m4s6@6q^!*1
z@E2sl`Q<Nv`M}Qo$P`cWY)xO$w=(zH%=R5yzxBu8KqqH7sV{!vS03N_DORS`Y6%;j
z#76s9e)_4W9n1dN&z@&1jSQO7{^T>C`y2bV?ZZe-%_hvz;T24O=~8t~G^Vq~`7%dR
z{(Y;p@aB7Y4%s_@Ooo`2AReSUi&YFqBr0^qY8mj<)V#rsd)KXAjj(_HjWTf-u-02K
zI=XZHHa6}$v}-?bj({{Q36zUf76E?!*M4LFwkMe7HnfJVSu66H?9+o=x9$1%x4&Jk
z6k~Sci=X?-le-^BIShSt)&v57`Ov!j(}Q)d{KAWWiO)2?&fB)^{^Tb<vu0$qh9xd*
z7wcE8TE01HB`6~kApw^6eWEI=31qIK=1y`#2sq}1t%T+7-M^o0TWA$u^ZEue>ju}4
z?bz|}|KkL@C2ZeuVBNlHo2j|K_mBU3LV-i2iZ<gn{K8E6@kgHeyT9|hDV;vjWPxtL
zO+UG3bjPlPue|cg4}S3dMlIa9ZpZKccmLGTtO5!_%^07UTsgFU#i6wWgZV%F`gs;&
z(Ran;>CZp?`EA>_#AV_{{_e_8Y(|9wq@nvH-1u}4AK%X$7)CMd)2;(P)-^AePGfn>
z>htXEwRj?<TditoMq>pDtm@ZF1r~RFf(fNEIfSFnY|dx%nlh@&Ive-N%AgHsk;Py1
zo07!ZRG_m`5XSHZ29L<<ur+>VvcDfIJZ>)&->bnsVPT8;cPgLdu{NTRj~jaUl9ZuR
z&u$7?x%Al+82iz<KsV-?7&5cs78Av-nh0YIjmlnh_87yAU%a#|KQPii=r^nu;rv<P
zG1AvAP7Y;;)x1K-Ypnr7LpvnK)gWf9D`YWO-iRtl0IraU^(Hh5#EEo3;aC9J2^3ep
zPp6exY}}1w*U@C7PO+Pn4Wf8(o|y9!cQU(y;PRFwkZy$WQo^~JDS$3~)?)eqKiNit
zUCF4aY50LbB+PnU41Ppo?zjnJ0K$=$OfT^Jk2wh}PY5MTP*YUlkP%FcRhijpVDn3a
zTG3jImYanSGav+W3{A1YT!o9N9QygPWhU}{{ncWvS|CJ#mCCzw^;tL=rbm~L3%Sy=
zt2xZPMg9_?W3LGJiUL4Dw0Df+V$$UMBf}#YElf?#T+dF-%*+uOhwZ7k;rQ|6n&IaM
zB^cc8l0bpZBv#>rn{N{FDw<}|D1#p4e#AlAWQA8BD_(i_r!TBpx%I%_$NTL5y}S3F
zI`ul5U6~9rW!G%k23OOLoj+Qy)?GqfXyNCc{tVhMIREU$Ge?geEfi-qZCU?`gO6`n
zzU7gFkA34Cf0WIo#V}Me&b)i(#PM^O-J)#%)FV$W&#wODQ`T#*9meL>^ZX|s|HQ#<
zk5R14Q<qP?eKM8IZQ8tbFf&3&t~Tl&BNkOTkzP@v@Jmkks8u5bOhjI_UWas`O;oWl
zBXKqT>ddugfA*v0>y{nbeIRcS?b-dv$y0C5&P_0Fk2cr?x}m0c^Yk0F#-s|tuRr~n
zXZCD+6!VO8mrfr&dZbX8+q7ll69=EzGP>!}Lx;Zh&94!l4NCy%a`yb0Q^!x$tI&D%
zryl#1rRP5J<kK(y;wQ;eF4nTY_{^^jrB^b}zI5!Bxx&=)k+nNFKH6SqL&<Gf7~XWq
zdp~{(*`tL!#pe;I5I)C02rEDc<eE-htG#&S7c-?vW}zF0whiT1E?>5qoq3ogRx3V>
z;V9xjz?0?iD`Tf{pky1Lk>QmG4?L09bErI?{pnB884ryN9(d%yXy5XK`}RNo{Bz6|
zv7%y-oGeegarg~73gfBsluP)P?Q8d}TfOPj+2d@%bKt<E`?norE%M6rg||;0OS^-c
zHtiis=NYL%Rtx$3ex&|KFZ2DaKc@afD~g4+T*{~w>V8l+G>5LpUX4i*o)y@b;j)Ow
zvUJ$UC9S^x0iPYMFI{EwMjV1BD|^JPZ<fV8Ato_C(Ojr~PDM;3?3B1hlqhtemeuj9
zM$L)++)#EX^ea~{UP@(RUR<)urp??yJY+{*@#<5OTjkP$aqT?SH917zyI+^W31DCJ
zmrN~jH3))QX%0w6U#^zQiFAtL5Ra>LBAHDxdSN@>sF!Aj(>XV&c!g=tn`5uIvGLh~
zHT^^RWd_Ep1TV#;B8Hk%XYSI#E4>9)Lm?y`SCVoyQ0f_3X-9SBql^GDHc^tA*d}k>
zu%_V=?4D)7DyGlqKiJq7s{;N2lt630VK}I;_56S&6Pu++RB3&w?0jphvv6pzkI=GO
z;DvZKV+P$oQ%+b}`}(!(nd|_@^iDtk4vbP5nTVLy2%81d(&q};3+ryb8;d6}N3m16
znF{eTQMrMAF@Pp-Rzd*Bk=j~bB`RbA$F?tint|6u@J{1W72qYM$45u{n_=VH*cCU&
za$896h^!{m=oyun<2;e2)q3e6pa86h1OTA0I){~7u~Y;&64zpE>}o2Ni@QlKvk^6x
zi|D>ioOoL?$!+;0;-W6eZu%Ns2O`69{7DprM!-^4`0AUlJpb1}Id%5J$jGJT%hs$J
z!31Da+;V4{S|;6R>n=UV(cSB{Yd`qG_b>>qmFv}#_uGHxt00M*U;XwszCAHLQ7jd%
zPmJ&0y*q6upMK(VKmMy9G9~@lbASEv3rDY9o|v62R?6P!=x9EZ+p~51aAG;Z&f;c*
z<KCSIaK)Rc&HU+~{^{wHrvP(%_gwk(r$0N=H|*Gs3O3)Zk(@;G?1oEA<qHNb&>Y#z
zSPwEoGuWXsemI)-@{yOG{p;sWy>nvdOyBa+k@W*xH>}zoH<Rr0!|DNBQhQp-)pC=?
z>woow?^kNqE48^wx%S(?`44T)tp@dPee(~ermq%CbJwTN?cBdEt*1Zz<Rd@)@t<Ks
z_p_h=<tx8<{>qgrh3Ntfv|{<nTrR$I>%o!SD8n>k=*Zx*k@P5K`}u1>|BD}dZ*p>M
z{km<fFT{6jdju)ZCWWX|76{IzLeM)ZM|*xEju^B8^oRH;)Ww}9%4d9b?4{R!@$3sf
z^1>Ri$v6JmUQ5&0tk}S;!mHv9g&A;wCo4Dq^tqoLJ@yN)!N%|Qisc&*Y{#-(|LKcA
zdhx{{Umd%`M&hAS{>tZnz0Zm7+q;>a<gZ;l`)~i#|4p7(FI{0wV4?7xbEkg)pZp6`
zvsaC-JAL}B8H*p>_b4VqI0^jOcfNb_^zn3JV9%bZ&pef&X@~)dak=QB?x?~K*3I`j
zJ`N|wujxTGGqB9G!*Y}6BvyR^NIX0|T&zrbek+~IN;vya-@9e))1TdjGr+(9!|zRx
zv$_#yZ3rCF45m?AqK?3Kgib4Rq#YfCrWOkU<aQ*~0U8K$Lu?NS!@#<KYsZ$2zxPic
zN9O$1kIujG`YGJCu-X-C0_h5plx*sLQ<oMTU_N;*|CdY>49n6ETlV47TrOn$(*v30
zRKT|EMk<~qNG5f!l%~qnn(3zdb3?U4$&DLNKe_v}pII9wwSW14K0j6Ed9h`M#5#M?
zhiuKN1`sjP6=5xxmBW@8b#qOOq*?fH#BGaZ0Dv#fzgSuf_NBl2-|V$)?WLD5z4Gee
zqE~m5Y2p~M2X4~?>_rwDONMZFP!KVEs8z}^D$y!(DK`~2FlEtwi~@sIE0z%^->((e
z#VDE3EXQErFV0N-@z?%At*-s`3*NIYzCK<@)7~nVi$*;#9OxkPF^<eE<WtPe;ZbOg
z8kxpK*UnC-ETy7q)^geYKF1}{7p?Nho}FL*)#2%B?ce=}=f(>&$}R(Vj1f_L@k9*U
zm!v-a4v5{m^_QZ++}s@am`BkeVQvdhMx(rD&6+Liw&Zes1jfO9ib-rbn_!2tMp*j(
z55M!???Gs&A8(d)ZLz9;;~x3S5*0Fr^p}72{VUfl;~_UYJ#*~XTPsFZIG8$c4@)TG
zgYTe1Ccxa+|L~9Ae(UIlwHujpuUav(c4U<o)-jkS2P|KkzK4bskEpTo`0~DCYzX3U
z>-4F2vF}eM^Q%{Ftk;Mts*hcp+`Sd!tr(pWYg*!Lj0~@$LtVLam4MJp1DGKke*M*C
zI{l^3e<`lT>kUzABIctCsmgWh5>-{Iumlo4p1+8;N>ATWXX5NY@lfOZ<zIgP(%7Z`
z!7MI}$Bw<Z`ZL=dErZaY9nf(?9A?e9E&S2f{^MJ3zp-w^Fz$UTMpv&H-cS#ND;KXL
zE7q=B$A;DHD}gcbm_NBZJA(JJqVqX08OADWR&8MK64o1`^Pa5)M(`XfRVi0?edYiQ
zi?rt9!$)WcyfAR-``OQaHrT&r<;W;AH1UMF-w}!c0R<dHu5u3DrA63sl8GFqDaVh$
zb>zt5QfV$Qn{OOF@;kpqL@#S}`5KHnnTC9jv_k*@KmbWZK~&k6UI|Jm!#I8M?BT<&
zmTNP}ajzb1+^~ghstc9k#fw*yPG;NYU1{`lTxnL47LoF@0GDRdKXvjf>nb?BvNk!?
zKZIearD2cJpf>}Ki3d0`w48b7_}B!VxG*$){`zZ&)9$h_{f#fhbsOhh8K58HbboUn
zJwB64tX{V4tH1J8($)x_IXFZlFU@temwtZc;>C;aT)bW_&#=mzN!rT?GTTOFwT2aJ
zEwmEsO9KBHM9wG97aC`Rq9I2xjTQD4S3Jx@MTAI9b&jYfaqTzQEK(zGa~NQd5frc)
z9dRG$Hnx&PzY+%U9oupU<`d~j-w^hSdW908Y7B^YPN_ea7VA;-TB*pY8oMqrA`UDc
z{@hcC9@{fg#5Rm2cL~MEmhr)vw~xJb^2~G<hcUb(CyQ0DWoAZ(*FxLbnJcr^Y}^?(
z#Y;Uj!<ZGYMn+(IJO`OtXg&^TYB3cWDw}V1^Z?;uXyFr>m94mJ(RFfvla1K+Y((#&
zjb9o(W}-)Bqf`ll+9IqPie&N=!rqW%vP&J7R8s7ulm`?`WR02>G57?rpgBB{{p90^
zc5Pc`Q%_&BT&?VD*T=NeM~@#raSV_0cq;4K!wL3K2(=W>ZuR2i#P!)$V4=2m9el%^
z4PV(6igE~}KuvStMpo_=4b%h#7eK)iFr-*wj9;9#&8FTgI5E$OuN_JaXH%M2nZOUV
zrMayT`*OwbsP^?Jh!rKfna)d!z`epO1_i*x5k6+B3=YP`0=z<)<dl<SYea%arZO2>
z|Mg0&+F}0mo@IEy4jFMNkHTQ>ziO>)N^rw)V&Xb$^x0fyc5Y^T>^f`i^lXD@`IcKL
z5@jcJtt1SMM_vCw7GaF-2Yz(~UyJLht!p>_%Rl%R3aEy2ULoV8$dKNbN#(}JuMZ83
zJoe}(H*VNAIItYkTL1^zO4h0AHv~2Yo!|nIh!M6W#uj4KG0s+2nvF9ua|U*kmf}Vb
z{>D11B^$v$OE01CQHlw%=oM2IXu*mcVgZ^=OkU6ClG$8px^ngU^=rxjkyS&CBV3GM
z1Ha+ws4)Gx*{Om4tXHc|PfaA843;gfp5DA}*T4AZ{})mQSCc}0E*;M>@(G>Y)#Kyi
zLxY>2c=QQEXb<&|U_Qs*ekKm_V#1g(b@a#HVE+&$L7_cyeIk{>Bf}wn|D}r;xsTC7
zJS7Mswe+ZY=Ot3S4UAA0Zm<tT@D;egl8{YYaK$5V3QCOe@v&=D69d@++(8)N;6@zd
z(?$}KUb}j^R;|Pxhv^EYhNH_?&{CP?;NSo1@3Et+WfD3`!ba0UxH(~sYRx<Y=$e&l
z_U}KqdE@4h{$(_GwSh=?xC0|fG}DA|agB7PQNMQWy6|~C9n<2)xron2%W_*jzw_$f
z$4}hf_*e)Z^N0F!d$(&a5yq-;UOGn%dTsmP+B8wv^uy=hdG;5tU7alCQ~lzEr*4>@
z9t)bwiH1}3W@}}(8!s9T+yH(<dCVQcy)YT)upq&`6`(^R5*&cZZcQAh0UQ`_xWscQ
z2|sFDUp9lso>xamvhL3uk^4mqp}`YpHv|*8-Uq3bbWx}SW-Y>qnNDlfP=3pD856`1
zSUI&i>x|DI*!%Ju+yCrY0tp>=Q>K$5)L7e-f^C}<F2=Nk)g<zp?}Oy1KSHntaFeKK
z;(fzHKRRO~8^mfzUZ_b(3(Rl>AHB>BLF(gHzEN*XO^rh^#+$<QHLo%Q9mP68T*!=c
zGKUMMA}76Ak^-I4>HTqxJC+v<QgM6Z+GYDTD5<8{Zr0a!tkHh;vF)!P+y19N{QmJX
z7aA=)pR}?vt`TkmTX&*lOF*s=Po{b)iT#VPmXJs3=GlA!mQ)Ggj5Zh}4R$htbz#dG
zJ67V1&vseOq!mj!L|)S>w5wk-L$RBug>j%{79*j)+wS?y8`Uqp!^1=YRCh{u7)n41
z!>HA2z~yW<qk`eYC9oMmlmV5*T>~D;z9IDTn-YDK?i=;c1;4_S3a#FBcXX#rfQ8}k
zTlLuu`O?)Z&8ERlB}$wkD-5Dm5v&SbBYS;S!t&hQjKr8L5>PC;YSjw0YSm=Z(3z<j
zLedhjs8lT%tvSF+p)iX=WO#Vw!2UyD{K6Meq@fhCVh*68(k!Kj0IJ1MjZaTca-`%)
zeoQuP+bS04blpN_p2?)qtT5G--q%@Li8@CFj&G^alDM#Ng`9Fmt-^#SDjXKZNZiX;
zFEsty@UkHp@2GpsPN%bzQ`cyEq)A-^Gfgp9DODy2B8J5_c``iK8_k)St3k7h#<E;3
z73=~&424305M!+05>ESzU-*KhIp8)MyGO5HYE&$?UbIn6qQZ3M<_flEqwv56z>ecv
zk0npTa1%oVLl~Vwb|Q_E*ODOOM<|4vo3)=u0!KXoJ0j7dF&OHtTCGx#RdYnL$Y+?B
zGVw#Z4}NCxu38r*1!`tUN*p(g4pBTaQ-w0K<=K2{Xl(LI=+|vG?%~{BuO`w7dIUBt
ztiuq{?JHmY%G%}YnMT6TrCNziK)k>s8Wr}X+@RW4OWH}Camlk-oY7;x>*gffA>%TN
zD*^!C_vHTwEr%}kBR#zY9%lJ3Xx5ng+m`m)8#8AwT}L(i=p$=3^=thJ?F&z@fA!5b
z`&)e;{+x}fY_W=L_UqYnYG!JxFP-pX)2W<2SC}N01pyZws}(1Lce6$(&t&@vcN{bD
zOCu;_0WHP!<mF5{+o)7RqPQ6$VK-Yf5BW&8#p!|@vx)48ZWtLsTxLvY@tXoCgXfSq
zL|^X8&^D$M6Xi&{E&)2KB7n47=y6S_*dUafSuReGE+bfnRxD~iJv=s3s%=@f>XDri
zYU0Vg_KAzD&s@3`I<ZPHS7?+Fw+xur0wmL`2~3uCwAs1qi9S5e?6K)GwiClCw^S|=
z=f+9&)yp*}j>58*PC1v)Um5NnRY66<T2PTWNGs_#s;R7w2{Ibdq~Y6P({@}!UlhEm
zn+84OIv@d=7mT_~g@*4_4%mWOR&;}ybUSvfS{&*ZJX@)1Km5U~Rx*<sUjN8Gj7qe<
zTeT~b+b>)hYY@v0PZ`B#j?QMV)2WfsM&Kunmer`)Ua3EuX@=%(mF-PkPb|D0JCT&N
z5yrxhKrW_Vnlf9>locyC{IZ`k35~%#rpk6@4al3r0fdQOy<iiP*{fPE;loSul$pzA
zP?)_>b$UNNz!Zp7M-fD3C<P^xz{FynuDpvPO)5S>a*vp_E=(N>x9}RWj=RK3K>h%-
z9Gh555x6&QA(KIDtOc_wf@27R5l4eYvxa*~I+>X(3Llbb(+52V+{9iOJZ>UjCHN+x
zi|UzV{+)Nu67B`lE~1Qm^K0Lv*o+RQBXP>vqlEjB%GirM{khKqO=fFzfA**U6)Q}j
z|1-~g_L-+Y*CYymwVKMLy+(x5L)NUxkif|*_%U7?%o^F&fw*J0-!}o;+(d{Q2=Fm{
zf!soE=*+~!Nmzjp+)qfH%(!X-kiYKvwN%<=mw-ZX7A%GKhp7d&TI|UJ-GFxtW3q^<
zqo1e~-uT@4vs@>T^|7}Of8(3~T5DC{2n(TN$b5+rP-jOgtM>FWdjf45_2hT|^v}<n
zI_*{KpMT~{U--;#K&3=7d2Q?}uEw16qUEEbm#@7wIFt)pRoJOenad}$xFgqU>`1^Z
z_i-ZLvoIcRopqvjo0aNpe{R4lH%bx|Rca5r62oqIU|_m<%`~#iOSx)dS0U~w4zZ&G
z*>%r?2^+pQSDq$3dFb0`&YgbdlSCA0{y+cU|1vv$HJM2_**n7F8G~(o@_l`}Q(Ln!
zyLvfVkD0^AkNm}-f3L}GreXZ6fAz1Srg%@<wi53&m_ItYjLwqIq?kft<7b$-!h0+l
zv6yoolm2cL{l3S?^?HrP;~X~aE$yWjo<H&KHBiE<FB*UF-~P_J71;P|BLn#>GuIsa
z%m{0Re8ywcB5bUwC!JblQZovsSschF{kqP2Llti)%P$wlu$lstIC^4krohe>>?%~R
z6vJk9cBaDgj@S;B@?3SY=C~m%i{yzlT&wEk;z`dhA(ABEE+JfeJWUBDjPAoQ?HiUL
zm#n^%g3(pjxPqSMIs`kFjY^A6?a#maCu37{EBZ6Xwr~1Jzx73;<Zs`y`lpBAo~*DP
zND@rOaL3p{ufsqYQpb#@u1(cTe%#1y->_<Ka$;`c9mYbZZ-mijcD%N7)kwWLGdEQq
z%n#azPw2<N;Z%Y^zwAovHS&GwWU3(=sz4hZ&S}jGaP8EE^BcD9nJCuqw-o)3`Y)sP
z-RKGTCJzQHiVKlZL9xfWL@!29plMUrTW=hGwNS3kwcKN`@Be4N`*^FSZCJC4*#-ND
zVP^(nW~>XUB|^3Mj#Xs`uUai*hE6bDpS<R=tyj~^E?ZkS>?)?owORBNE#D)E+~n2M
zEAp;a7|RdknWdTO)Lf-dEM8mQ=j2la#iR(>&cp_?&b8_5PI3^-k9BKD-#&f9F!Oj%
zyAo=tmwFUfKml10qW5v^CUq{?yNMfao47~*6ucHwVjwK4Vyrk2J51{O25`yeR|3*I
zwrfvK!~4ax>o#%`iOSYXm{Ne=l!+R+4H#|IyfUy1{D&5^TCXJBB;!4cP~?fu8;4Ym
zEwwExXe6CUFhRr_n25-)9e(-x#PrDEKz@M0#zZR81M+F$VjWD?rUu+W;$D68+m*il
z9BPt*q0voS34`TQ5-bJS=8`1pz)t_57capi*2BcaSAi>tCMk4AI(yar{(g!N9o&Rv
zCt?$GQw48o?b@v({{MgW-UG~z<GS<hn{&Q*BJv=U07-xmj3Oxpks_5W$+9GKZAsR8
zWqa2f9QO0B_kHhu`@Y@xdA;km`($fv*~%7WiKIx1L83^CU<8Ov<TQEa<{Y~F-uFAz
zGlKyLi6Dg`2I3ShdiwV5uCDr5)u~gbPEZYmSRJlfWIaOI?Jl?M&J&;t$w?#-sW!`g
zpV_ylN5mX97w(L}Stni#+#b};w8KV+Y<`@(?b^5F)om}OCo<h#U5P{t;tscmX?Hjl
zMpREXK<{Gx#&zSF{l|xnp+J7+RoBM?=xI<e{F~WYQ)2FU2<<r)iD?VzZ&RzfiMC*K
z=1MJ?PDEu(mMx_twb0m<=)GpKoGVuH;UGG}XtwDr3}CE%Y(o>*L7J&kEENc^pgFu%
z#Ke_isa8Z=m}TBoS6%Vq%LUer$U;UZ$HUQZBo@gQa>;nSy9f0Zv#OPU^yH%l4(<1O
z{Oi|UhRZ3M2ip}Qx%k|%vR(?gqRUn+t=G%3WF)FNQ>osyYgbba<Xz&6oT>IboGyC)
z7}sC;RiGNU+iup%C@nQ@or%G*7oYv{qL1BW7iD1FA{e+p_Th9~$s=j|cD2@S8eXTq
za?O%e>tZWcu3WVo{|wsbxOV7hZr8r8W24z>1t!{&im<3{_V%Z)x@y&y8<KISmaS_i
zj+)Q><i{sY9&|e-!W;7AxEHO?hS@YO-7s*)HJiKow0#E~TVH-{bRtK5K&K0n4hi-_
z-(*zG=l;forL&X}{b|G{f*5CJtBzQ`ufN-EG!7rwlMLOBmczhOEj?Lmxw}zXYIpDm
zBmA+?6%x0+?PkE+4EtTTeBkyK>(+(hno-jTNWXXQ{(pS(<;h}gSzn5vEf_ub_9d^n
z?CK4d46Gc`8ZB)sUER5D_X|HgJ~lSiw@8$7k-gXQXvfww6NA@m-gM*bcSX9i1IM-R
zJowlz(rlck5YI3T;u(Nho|&6VQcXhIVGfCJDR+3+wimsh-{QqcCM==#G^Z<>V1F6^
z9#G0w!0XC28#cYZDDL09s^_lj@9*#RW4L&-sO>(gKmOcHBbjooR_+jK#K0rHwsf)g
z?whV$vpBVKQ4~$KpX@V+(xn%F{IpRj=v4<%Xu?h{?DdT0Ck9q5S+-{Njkm2I8rK5P
z`i`F%k3}QO6lq@f*m>I2zp}-$Is-?mGjsBmGpM=;PD^NnWs+g~WM>G?W`cd-&|Gp5
zr3KSAk(sD#gwv~Fq>3Vz(_@b&Lw!rS;h5QHMgppp+wf#A^jC2jgy&B7KJ<py@5Mxn
zlfC=*-n{93{i&W$fA-g3du{vn?XO`;al=hFtXRE#aB%Ro?XUVmo?<%}u_xAVSexvQ
zuU^0A#v5)}v2uA;t9Ws_>%uX4z|_s-gGVB3LQ#MC_y6Ea-}=@!mn>iM+rRlmj~2+c
za?r$J&|j`)XSFcm)pLMh)&uX>DFI;7Ox0D;L5L0BfuKCZFY16Zy4naQ5e%~=c6#mc
zU}Vwa9))w;jWD^^VU2-E5_Vbca@tUC^Z31Z7ik!keFt{kbj6as?w(J7`g5;sduiM2
z+g;wkE!SN$uxiDLA%(LqRxq`wBX-H!Ym=#M>(*Rz<MlTT46H)kqQgYpD*1!{14rJt
z^^US%i@yJkTg$bR6PY0#`EA;~g*R4ku!FTlvnCV3MVw^kSOvv#xZpNwb+$5KI?cAI
z*W+?tX|u3;K^NGO@*xw7)>`F=FXHyI%jz=Q4cWBObQeL-kOQIWf(s%CqokF|jb&@&
z34h=HANciRsrknK{VXP9@ntvNa$`?_-`{`jtEels+d-Qa4rtNk%dZIf`}%tN(JU%9
zN+B<fr#*qNHyVr_I&xssx+@c**zf(zfAP(4e{JdV#lQ8yKlf_MJcfYID%u)3kx{*O
z)5XmnLzO%}pSUwH@FS>2!x&p+F)UcKX3d7REAd!`U768pvYWvE9orcUHFyzX8+Y_Z
zM>nq7a>@H{TX)I8RP#r|T2EM8wluzC;O0ku`1Hx6Xs0<^<%!$xxc<gl5=*+}Hl(B|
zzFm5$W66>`zx+Q-lM_ckkfm0;QP67X#Y<P+v}N<AO)?P-k9f<)LaWvAxV(5Ql=#Ur
zNv!nd<FJRwi<&FeDs~q&W!I8*DeiRHH_IX6bO#Z7b|u4X+_MRE>=>d3<R<})Ih;0e
zfXd@&Wz_5VFh-BL{e{neX33(!NCPUV*+QZqe*3lSl1o<n?|*oF<YYOMD_pXE@dN+t
zH+oZU^u<Z|mF^A%E?;(CJhbt_?><~_6cD)w(F)Y7K4-hHr~Bg{xqJC?QG*_t<Ym13
zw7oaJfGI1G`Oi1AQfd&hRTfN6wligJHSsg;#H|||lgVfZ3sNTk9dD#F`9i}M%oQ*_
zg@@J%(1^-JrI73G?Yi@t%RYE(FKe?(1KUyrz}n55Tmx%vdg0adBab~sOFCTbu-Ea?
z`|i5-vQ#&Ar>gtia5+ZPagTn}^N&4IE7lk_U|-JXjb?Gx^5kbfu^xE)lG=A4ob-DA
z*eBUxV;0f^0#gVu!i32^g~yD{SS|feT|eVYZnG$!b;RxpB?|N+>IgX7cx-XC*(xy&
zyTqc0`e<&WqmN8XqS@j1x-*4IQBKvW_{zYMx|48ESX2uU!pEHOc(F(@moDT>`RBJj
z+tt;zZqX$hR$sb)^}0{~#>bEfqd<g<*~#oE5#|pb9-hdK`cwF*3;*#S{~MP^s2!1L
z;(ro3K!pv$a+>Y4=U#Xw5(y3TtXQ>Z^?&+z|FMP~t|`&!qwZ)$t5_6<qa|LIuR_x2
zBUM)c^T4@^l!uV9cBc*93R!DlqAuQX2&HF^jZGT%8rE649KoD*0-(5k=*$?XxFG6K
z@=F>IoXuFopUzDd%9-c3{<tf-bj{+88`dIZU;Xjl_$0y<aIS0h@$3YP>Vt<TCNpFH
zZg0#N{o_CSBjoPjj0Y0Rqic1y*IsXxj~+k#+KyK@uD>Ge34H29A4k#|&?0TkQ)jJW
z_xJ+tmf3Pb^)A}P+GBiT5J*LzzS9ylPQQm$S1D6z>U9j}uzI2cqiOBJd#R_D%^s__
ziiVc(_#HGsx!AP1oJieVjsPJm>F0F>|B)z6#i3*Sw(WS~hAr<8+WepS<fqwArJ2S{
zqp5I;CSwCPo6GFpyN4+8QAhNXAO0lK^%A}qT~uOuGq*ShIX<2*R)6^T!|_ydak6{m
zqJjVTZ~sF>Gt142!-Fn#xTe)8|3!=a`6>7;rtSP>_-^M?t2LradnM?nuJv~JEUR`J
z&fpbm2d=wvC3QP?v^seFL?^tgT&?@u>^O@{ZY0Nq*!6~d_ujQ`V2RNCZM$->ZXF3n
zQdeBpvv!3RUmb4Uaoykl&C5ic_qyxv+mcw`E!6#?pB`%J_5`c6E1huCWwdVd71%yr
z)Tw8dc6&ec!J97KBux3%SIsAW^b`?UQI<sup8XQIA9Aqv)@LvhmTuNuGjsg{p0LX)
z5!&oNwr`NTvx@M!oUxGq+H0@QRn5h5-@P|niPl)l)Lwo0rT)IfV;PyOQ2B?9Q5E)M
z@adaOg#O@5A6t|X!Z0#2{_w8hfXDm6E$jPZ_SJo!`)<GGM?c&#k#4P7*3}hpV|g}`
z)}DOwc*DqDb;F9wSH=?YKro!p(J)ZSVKWGP__+r@yF@X^Pd-=u@e8kD&^HzAjWKMo
zeisn6pZ{uBWwT2MOU#5LA@cBNN6&E*&_iZsI!Rw&-{NIUL*0G1-FQ7V#yqxd`)kSW
z9>cS|JbH|K5-!9CDBIVes<>vm^M>ApS1Yx(*R~%U8=nlt5_jLadPzXLd2{NGSAC<S
zV_vW4=4-Ei|J5m)=0sL|=m(E#-q6MyZeFvDm1G!$B)oR9T;$hQy>!QIH~!kEZ=(eZ
zy7rCl>?6`xCTE1gkko~=fWR3D(4!HIopJHaU+McwS}b!n#E(aKkmVrf22M1g6omdE
zkTt9)UUjiRs@8}yEndO%W+oI1;5@x!W9`kJatUs@sj<J`fXTqNBVR`RhHbP+H;J<;
zHmbZzqEQ^)`qB%TRhi9~Uxn9LuZF_}QOqDp!gzWZsatPP_v24IeEoH|T(R<+s=>k=
zcENn=XOG@`<Bc$I$f}_xM$4SYjO^I6t!Y*k^>jrXA@VsieB`0+k3R762mG2hlVO2Q
zElx{nz0P1_B|C2t5v@dDqJcR<swZA)6>xq|__`)uPO%!-I=NbgWk83>rcQWQ9jysS
zd#hd7+l_oFjiE`tm}}q;OvB+lp|l7{ga}hAc;xsSFTDJd$?W)5mtTsqB7upOn7N&<
z4Nv3_q>D!pEkFMBx3_G$Ve^Vj)pp5d7xwO{=byjpru$1~E|v-(J8`sLvA^=#3m9Ck
zSh+Z)p{eLTo<Dl>WI7pLvSQhC6bX<qvnMcb(0uy^v9*`4SOR@DwMHoH^#)jv5T~P*
z$>*2$tSGmO0egsjk60{X_Zr|&D2A?7w53(X(u3JTx>>J<VsVMBiUM9frx3=DNo-8U
z(#N0O`ed$@`M{lbh3!F>CA42Ej81v+jp5y6nS=NX7|Xn|ZTB;)239W)FY){Ie0%Wt
z@u5^=Q8Y?hpvV+<IzsgG-TSt;)(s?5u81wbkU2PV_@P&ydf<}}_%tsb#NXV{-#!W#
z@e>z0f84QE>kV{XndkBHci+9&-|G?f6rL4YnVT#fJ8~48l3cN5)SIxxey9cFUN$N^
z#*)6SRZHZyt=o=&@4JudwH}xI<dI|bKl!s?1C$##^nIOpzRlWw_g%eyu^4gw_-A{c
zc=X9~0~3eXj~*MiYSR_v3f^S9$brwLt9myq?caP=fcTqRU#dRx*b|w2DV2;HHdoWs
zu{Ld?b#PizL4fDq`pnw#N5c-IFjNC^wd1mBv5@nhf8hf_N5tWf1Q@M1P502y$^CB>
zQazg;1o5`F08M6$R^8clEL*xL5f{em(VxEZPmev{viBLa)`=Z^{^$>WJrdHc+Z6lq
ze?1UQhCg!8ohibOYT94?mxmpgGgn5B554-r;+2{4RK42vhCJw=q2<z*2z}s=+tx03
z(+aQd(qDdQ>xmP?-XQvOB;Lt;)Xb~Sqzv;cA!edP;z)ZsA#lRn5Y?`xfl7`DuU0Ix
zefra%>gmx+rWST+74$3uo?I?7GMcN{yj^I#tEZ715KJW5)px@+{k>uKDYbw9U%xey
zD-#+a8H?t!m5+a9)AF!(-_6(m?ce^bP50b!{YIX{1p4p3^%RmboO10t`Q-8?OB>nA
zMy*5$2;p07+O0QV_le)Qlf~OuUHi)aetKxKlFp=i7Y#5~>Jo=`AuS+qIs#LK%j(kS
z)<dSaWb)xD`|tkI7l~uXK)?}LJQZadJrE!wLEw=`AAWV)D~(oqA~%E%D^3+KDT77$
z-GBeZNYr1ijuV3yjny5yx9#3}cs!G~hmrQ`I8Q)I5ebD2qp;_Vy@w7R`o_1u&VrK&
z7x{7yw`Gl16^k?i7ac!&q}HnJ+qb`~`;k<<KVK;2a~Uk3UV8Z%Z@@J?dNPg<o!vP!
zGBG+j{Nrc*eO)Qm{<$oSqZ5091BVV(>lI|NXWds37Ws6XF7&td@BZ80@#3w<Ss%-s
zh;>C+NQjY#3!ER>{>qLf0r8OXMzJg}V^oc7<X`;h??l3}YHci$iXyw+xqH{{oku6r
z1)s-_oWD|QVT2b6Go6+9?tlH@;RE0N&R_TSb%i5AY$(SjM(eFQ7M6*w(6PaTwN`P@
z{++#Dz4&n}!f0m;*q%N1<P&HP*lgukBAiNw_aE9fI5=FaWS1_(Ye?wiv9V&Y84mO$
zlF5n5Lv=lmDvL$oFEo<x@crNT);~P<*u%P`kuHwlkkHMhUfY#S#g800^#A<De{DBA
zShSaGg*I$V7%L^)zxk8jkH@>*Zk+wQySo$g&;Rz{|8i(}qC1s@SR;O}*2_3C4#)i2
z!q^KhJ@evAKP8eShCQ?kv1>=hM?GFw&!WWG$SB5qC%=7SVA*mejd(ORISG$l_Ie_L
zpr>VI^5t|_(qF3O9(nBHM;>qW^!xD-n#&TL(yuqY2aX)5H6~20e%5~e4zaihAG^r;
zV;m2A{pcT~!HxivBZ(@(SE1b5l?rdXbi;>?(BD6`z2kIYr;JZ~Zo@@!wOCobdKq*b
z)rcLhy_%a$2i>VmZenQc_|LX3+j#Zjq^qa9E191t(`O8AIaF`ozU??nH=FiQq_<jj
zJo51K0WY@BwuUJhRU6kYi+FHkqdoKd(95sBk<Vr%x-~viiq*Pd^M`oP#gWi&LK{$I
z3vB*l?@eNV2p>G?<{@mv+ge3i62QT<_L<-KWbW%f&15UKcGe9etzhq%MX$@hW?&i1
zm23@#;~|?pna#r9#RiAU&ptQw{@WLaz1n4K*Xw4ZHzcgv?p^I_p;Fh2gm@btpUf38
z*CnKK#EBOmP3sD3H(Yxe4a&a%t~aXR`N!|&Y8_VL7YCY<_BU-2sNGl%`#Hm`i%4d=
zDpo)xKkl`gTFUEUxAN}0?><)FdT7Lx>PcBbXag%eB7H{!EBZu#^Oe_!2h;7Ev#;8m
z@YcP@4vjf<ag*GG<9BxexBiP`+C)`*eZThnYlE66>UTt2xyHocuDDlkRWj{Pj%?X`
z`Oy7$;oXYm$KQSVXJcdK<0B)>Rt=1%2MLv^+0e{b(6P8!y4&2BbBZ%owBTId2BB3v
z^whHmlH0hk!bbws8~Je{?8_A=t5qx)Y@twBCZEyG0`b5QV_=ieHoTR36}3zDG<5U?
z=&>D#-YEH91_AC_!MQwa#A8OpzKqQu^i`{sT)mY_r6w|y!{bA+6shjG-4(~b8#1|p
z)ywmxB1UB8T5)nB+cMkO=QT<-964u7)j-(OFlsQA;W++!-F`1ljcPpRbo%4**x<<#
zC$bn9ilIoIAxQH-fwLC_9KzOvvp?$17P8e^g#e|YP>kg_?m&{M6oeP4y~Eyy^+&e_
zo=Mk<wT1Bl9^8t4SBf1){Pp4qL^n|jCIo877jk2v-l!M4lZkXXKQcBB&RwZqyb7`6
z$mR<JE7#%!3Jr5Yu1roG#9wuHs;5{eJ3WqaEpNA1(47$Bz0DSm2HipX@aVB-CvJCY
ziLPWRR~wrgLtC17G)h!I|8f3#C2TKV+>dD;)^EvFjQx-b-Uh1$uPeBG`EtAmR`S(A
z#1~H{i?vCYt3wD?TzBS*Y11q;j4Gyd4V_2m{Eo4S@pxAd?V`Hgh=vK?;H#C)R-+yX
zB*w=wC_<nqj&C@^KV|Z1XR+Shvlv$xHJrMYYur#NH_$@Gkss6f==iY5UneTCjtlDU
z1WHZVN7b7I$V|JO!FV(|c%ndVXfd(<fTOpcSGV`V^^2ZA#=(2NhL)13)%?X@eQVdr
zoHN!J^=O~DamAgtT)wE^`To?(Z#}o)>tbw+s0=26ResCV*Ij~>KaJsj_~;Sh1c^?Y
z&9-dW07USX>o?Q(nA>JX$JVV`1TV-_hYuWdc^6}5mvje*hYN{VgaC87d>w{{jdJ$*
z*!9NE)(by-da{U-FithgxZ1*HJwZ^#!j~JSSRpn735r2s6gw|-#(WVv$j-kQcN4{~
zWq0Dt*{F9qWz&`}*8anP``XFT+^W9hy2Xh<eBfjFdRxEBchwaupL}Yk&0rOg!f%Dl
zdnUA$g)dTO5sOYtlqWNW)7u@3^`wt(n=I6WUKwypm-TMnv7Ksz?Y7r;_ySR*Qul?z
zRkrq;b_7E8ddqJ!0XA3GdL%Qg+SDF><Of9rpQfj$Ygw^IHf$Y=hallX5z>kZAh>c=
zld)XC(CL(zO<ep?EZJd}za6$U(453{6TNh1+QWyNU-_T^qfjnIx_T~IvG`9ub03Z`
zH*ZMYJaJv++2Nt%$86eWazLo>aYIFAT?dDPgPum)?~5!(ogRhG;ge(al16}<o><Ij
zVgMfCo=`wTyExvpqGCE*9UC_OW+m+pdkNx;Y=AZ*^`z`9^Nt<UjvPK%?YQ7A&|Ze;
za1q2wdE2<qI4-<@z968+*HmGYdZG;asqGulrhulQy2=fTtnpQ>1eB(|@(NG?$0C&{
z&@-+OVF_&`*_t;|<#OUb$Va@J7LGBUUN3r?W0Ps0Hxvy=v)Ll5`NTy)Db>eL3O2j-
zD(=|uMObSYzEA)i3yHH(tq>=ScwzovME2#0mxJdx&CKL74V)J_aakCj%x3V>S!h%$
z)lxjsQz^qjlpx5&-^B-LPfu65T!wYSzjD1<m-uaMGnR^%%k^9_;|{pJ0q6MS7#rd(
zT>MBi)%a1AN4^MEXJZgjq<r41(QZ+FBr{lc$PLp33QrGwdE1G1O}|c_1xkVa>=chT
z;K&?jV(zB+trq7MI8FE3aFD0t4U$ON9*1E!(&-VlRzry(-a?qJ48!#JJ>r<6)2TH|
zMh6Fc;?fSeBf4OC;w@E(dyH9)o7TXN0v9A?#6mxv%{NcsoQsAc&1Qj{<MAZ!EGzg$
zwc|}g$K$Wl<Mq3PcqDVTJB3Q7SS&;np-P#hjUJ7-nRnvRNV!_V`!&11RFJ7z0cBOw
z0#1;(btt3-h=Nw#G`EMhgFkeBEIkTKikT@Ibk#=P)*&*i!|!D$q~>xE%iNBGD-&;O
zhVC%%p9oQIQY$ZeDfl^c*mZn)`}BAsM8H#Q_=-g1VSz$uy=uMQYzG4g@}vi(x>944
z6D<6C`nr?$E*{Jm%X*`UXc_)EmQ0jO<zzfnuQOyE_F5;AOf{PIYPIHe+34GNQEb&p
ziBuQySrGKoHuQ&ybUqdjG}Qj#Ji>o&TldV|)$l&^;zjh`UMp7o?l7CwuuI<P(uOmc
zse6u$Rlf7kPhQ%7k{A^@MOi+urc!HqeIbZ73j(p}aM*GczDaO|a<l1+1j>5ea9}19
zGR%lBJ{w>_^ja%ZZ<SzQ<f%HAXDqu@sJ*)UPNJB`QeCC0gsI@wb`jT!1=VLi_3>oH
z*Qyo+IP=DJkxu`&)vHYwcnoHVk7^~ENO*$JJW9mOHeAV??(k~0h_hC06q{~a)uH9v
zfl|J<tZPYjqIURb@!$R5f8+0hv&F?*%xSOT3(sNr3^iT2T|J|dm0$!qsztoo#Au%I
zw9Y^rXZ=;X;f#2JtAZk-UWj!Ogh`Yu+HE%`8%|GTe6nijL9-KPdclKIxq=CfMx${X
z%@OoyAH4Uj-b6eZ3RJTbemlz&LV?TFj0&&bw!!@Id1sT(T($ud6VM17AhB`fLk}YQ
zN)1QwOwwrd(GT505B<;`iWbel&vqR9KmW`BW?o=<u&gKHXjBqjJ8*Ye6=D+FJGgLn
zRYhDTUR|*;XHC*<;H0JPaydn?j{*!*gL1yBTU1c!a3@a=;TgVBHz0d$L~mMkcyinq
ziIy8U7!qu87Kg8pRm-(2Hw{E0wpzWMEmR_1%j%|t)0<0h7V^Hh5P%mEr)sLubR5`U
zi~b3{RDF%^W|26UW;iE=0$iv2W2AsA%3~5w(`Gl=3#RJ^-G;+3oo1<Cb^DQ4;C2@(
z(rnu?+9P~gClZLe9iCdH3{Ol962xikj<Z(R@Brx!h6r6>H*oIhZm}=e(&*vhm=fO;
z7(lQyL#RzVI&BD+C447QDe#4X|1p;<RBzw`0~WwjsTD=0Ax<#xBc|g8rc`Fez;8Wl
zn++ed5|Q2QEYwS;gSD*T@R`MW+7}|keHF_*;XZ{Wh6qWmmH0y~6@)DY5LWMEE6iw@
z*gX;ykZDCeEEr}!3b(g$U|SDn<<y4)h{rC;{7oO|PN&hSxq?h=Sa3P2m6i{;IK*r-
zi2h>ec7khHYi7w83EJH5a;53<_#r9c=B`6T7#vl!eg2@0*rjIC;V~L`a3kKcGi;h(
zg0@gQ^im+b7S7^%+R1h<kz}1_%ka89c<bXUexuNSfyJdo5Ilo6Zh$enB<_c9O1xtz
zcPL848Nj3$pt;fX1Ult9u2(y>nyJCT%ey4(mH<naW732S;tUGU5OhWGmnKk{ybUDZ
z*{yF6e|uX9Hz5S<wt7Ry^weeZvb3l-TBhBkX*3V5&}?-KqR&X27AFdf_;kaA0G^9*
z+1Uw%f;!H&S{V~QwubK%mXL{7i>~hTcp?CZVv^m{VPnMQvEw6|2?Za8s3gfYW5A0q
zIdMvDHqdD)HwuQWRcRDqz6+&BgUQnAL=?r6k1=9*`b(829%E=Jr;j0-_60q4H1Tco
z3;%c4YVY_di<Z6P$1m`M2n~H+R{%$ePBCV#87-ExP-WI=lf^vCGeyF*bb1UrUMS~E
z4T)ia;}d@{kj^qBg~>?v^pqP_U(joJbq*Xn+SRi(Q<Qq;^65w<(B0LKtS(#8mMvL>
zw*lg9=JR82mtLu)v$+xU+sL76Xxnxi+P427=WCV+@44$vcAemgSR*E)36^mLiex;i
zb-#!ZWSEmg?ji$UeVG>rC@dmy_<tX9bR2pBVXsFJE7XPh=~cAL+T)p0w$>ReDy|O0
zjt=WrkQ4(wsg}dz4+V!uWLW~~>hHm|XrNr4D3`_)DR0MCK@tzAJCZ($>%Brl%hk2Z
zHg5oVOwj`&f;I*56KS_Majr?^SD=A+{QeWq93Mw7SKD-H^8WkphfaFghca}$`QjUj
zI97;NQ47ICh;kBQNgCz_o~IQ;T3tSrkB^Uz>xP~6dC{*EnkWj&%<7|A9BbP~(?zCL
zV$UQZP6NmKl_JwP;vVYhCVoeyN-nZ&bJ@%J;aXuR=r!XJcQhHK>(XA1fH&Ng7@8OX
zv~okc`s(GaTH0f)6w}9Aja)Pq2!{ME-3){i6o`Ai{>|@FBt%PJ_}u#fey0zql3}nV
z7mFrO6IPv8pM@{)O$21hP|{Ec89btJgs1G>rFRRbMiTDvG~zakfm09i6q~&2Lk@g+
zmOL>Xv)mg?PAVfqSR(ldsZ<P#@S>&zp|4yN(mIudoKEFw$CylgB<JbhW#r8y-UYKY
z{ZZ6w<~Ma>l}a5YD{Kd6e;7Bao~NFX_gHnGxi*`)t72FMpulX*NIK*GnFr79#45l{
zF{JvlK8#%hPX&A|pj4iMl^{R;ll7}=K{Z1OAXcfRkm?bxOo=k&t4w6pl0c@gDUk6;
zA`%};a<&hxJl!1Aw^^B6O+a(=td(Z2@J5(v)!cHFGK)9~X%m75VU?wjYE~5>Rf%aS
z^>-?(cj3=l;V<|I_|pZOhE(CtPYQh1I#SrP;?MYE#ez`TY67rO_$$#6&zuJAS(`u_
zbOwK`1B+6htfm+m)9o^S#mcbRvGY+3k{ZVI75_PJVru}*`D^Cv1^7l8(sjJjAS2L1
zA)?<k?6#uKQ409mk(emOGI|`2a!-#x6tZE5OpnIjX3+1;q$j9+!iUW4DtIfsStrI@
ze{atX*R4pPb*X7+xVKuy_rCih9*G3;>9(;{z<FFN5-_7-{oZ@73;0@Ym+YI@8`w=x
zyzs)l(QF%q+JV)Hd++}cni!F2?AWnmY|>$xEu;DjYIusQ&lx;#Gh@~pcC*!PR_di(
zc6_v6E)bd!DEwXr@sQb{V?7Xy#Wrl%5LMBLTSnU(@wUC4hOHqB5AkGEiX~j*V~4Uu
z6aIMJ%9uw>cZKUd`}mSBbIVn$;0D1H>x5D{i|91w)7GxX<<GdwQ>v7YmWqcX;bw0S
z0hS1ciRQQD*|Gcn;;;TXT@#M*x@(dfuGma`I5iCthEv*;Vp}2=rIyT<d6g8^qeRdc
z!IF=pR!e6;k&ul<&V#7ag;-<YBRkdI)3<2x(iN-Wj~lv{DHv|QA7_<x7Gz{Kqg`&a
zMkdm-D8Y*K=F8SD3M@-Bg3ipu=$^hMvHnHEAOHCESI3&wv04$DN}pe|5>fPc+%>r*
zZWy)fs%488_p!4~<Xs5^2435C?E3Bxe&+XzPV<+3_ftI>Q3p`KC|3))nSg())C-6A
zIs$62%?5$)O_!cNT64U`S1G|JTO~?_%5$9XQ5i@q(Kml&UqwEO`*|D3vt)Hf=1PWZ
zNpj!%<};ps8(*jSG^o+c#5!{_8?<+Rt{|GJ&P>^6VLQ8mZyQM~AMz*XS$Y24_OkAr
zDaLs|QCn@(m6Zt;W>GQX)={y`inq3wPfQmw9lUdJ&*8}h{7*H(g~0zDn&^Dq0t0yw
zCk6~txj*Qrvq5j`gd*;NfyK697hKmp8}C}$-^KiYXvlzfKq0tPO;J^Z!FtQgWz*ZX
zy?XauA4-O_yWf9RyXDxkb0`puUw^~Z$)Jp|XJ4BnB9q_c4@di}s2w=8kKezc+AuMV
z9$2;dj=MI7!`hymZFJhXIUGd4BL34Co~h_he(`tjCeO`R_WtC#zLw_o2aRZ|CtDCk
zL=D}k(K{tVhK$Z~YoPAAu!oM<h3FRx=PsK>(8Ekj&_HW!`SRsv)0Rkd5fJF6Th{=i
znAYt(Fo_X{)-IU+uxs$Ir5ak-+V&kUe*UAYsmUjP?cSbO4?p(YfxblacRqFNnr>I7
zf<fM~RCgcR8#{I$z45Ans89RDfBm_yee)kzthxNo4_vo)apdV2C;$FyUvFzg-XeK6
z(8q9o?b}a%@we}Q1b*UEn}+|gS{|O1NYqma(uq|ey2vtD&!a^9qKYURprq-UQY0UC
z^yabk#FL&fEYb+@v1aW?)9!A#V?E)}M{Zul?BsK6d)_$0b_iBPR2*j49ZGaO0mKon
z@9lK0m-XHke)GdWc=S<Q-Ms#q>p%UO_rV+w6}3YXx$vTu!y|(aJ%8+;n^q+K+VA}4
zM_zm7<%wMN^Pl^j<pbL5FPA1y9&x&Szzbrfs`2cL+m^1qV&ytaPPIF3z3r>t`9T-9
z|D~#1He?sl0s?14K**O6v>9rq{5MGV#_KTGVV6`w;3wyZ{nYWz=hhXD3}oTOrthD*
z{`0T!jEIoW;m*0gM^SiLs!vmOQL2~%X9gufL_o9BG(^+i-{LP1%I9qLosI1*G~PCz
zv-A0R=d<#kV^qOIbt1(aL*5a4f&iB~2t$BEF~oF25uM6r!nE?wTj2Oke_p`<!s7o<
z+wGk`cX9K_)H;{Tl}ZHl(R^Op$L_!3R;AL5kdkN~jF=N+^}TzKCgNS?LeAj`z+uqA
z{eiH}3?LgG8PPl|wUAT0=YyL*a{nfBgW+Pp9XP4|<j2pO=)!qDKY4D)`VH5vzf|*k
zw2yx5@?Upc&e+5;FKR^rzg8>ZEM5!PG=FSqtrI^oT7U5I<M(}FIV|$;{j2-_<lleC
z;|YzAkGC;kz@ftoha|1V{K1Ff%%cRwsn<%_78*6w_a~yEHUZh4N$-k9+8_Pvd#%Y1
zMvZdgCHIxr4n6n6YmwC2hTb5yF^UtyY<U7Mw-@2&*B<=gZ~fW_lOg9Fx2}8tZR^<O
zC(;KHV2<|suEA{1$QLr(cN|)O*?`-nty-b|Uw`(ATuTcQ=~#k{XrZ82H=E?iyYdZ(
zH`f3C$9@{^Ui0C5m*Q#RfiK+n=YRgNnv5}oA;BEu51FE|OoFMNM+xQ_7Fp4FrhWjT
z#i>&V$32d=9tpV62cn}ARrfOw+#LujCYWsvO-q-wmyT3leSLSK8u5F4)Gd#LR=cm*
zFm2)3D?9dY>{-9IOWV9AaK)NW@GFi5Nd)zJd2i;K?T0Wh^18x@htq}5il|MyWpme!
zSKP;Hr=s|5R4h?kHcb0OUh9o%iN2NTZ1$Vqd-O|x@JXWNynhSf!S4Ug_kR%ahjk^Q
zpGSZ$6lLBaApLoonsF{Hu2Ko*j1&1VsD)Fsj%eR1W>Joc)tf#}-!t>8Fhjy${-WdT
zoKHI!Ci1lRp7!hXyUy!9w2VlOg(rqfCMKq;q7<JYZY6+v)7kWubNMT1$O_`^l>aR;
zoQ<&Le0u(G*<R-r!_!+09RT`|Bw1#tg%d~4`3RK48S@8tEf#s^2YF;J(gOY$7XP_u
z(_fJ1#myh11{d&0TZVfy-SjUel$eBiCNXjbGupwOdtcabVC&8!4S$c<<FPrC3^lf#
zQ@x9aPh?IWJO01^`kQOlFaOAgw=7CdJvExs_8n-v^y>42Bbl0Ew>1pI=KI$7estB~
zWp~}vhnF@Ok&1!!wD#q1KYH}&(d8@FBIko$&ea?hEqrh&pLy!pXix8!P2N;QW2Zb*
ztS5XC+zD5zFnf$)8SU0kmsbkuoL?!Gh@#?;X-eDhpq?@@nJ=v9B>)~22tovddu&)M
zR<-Rrcj6*GQ)rco`EWeZvc&*_r9z^6#mL}jZanQVGkWWxd+z-Z$|~^9#4@C5ezNuG
zSN`GYa?R!QL?Ws1jy*$_zkcHG58QR>fXt#{w^nIud-i31^yIU|URkknB`<~T>sxl|
zGh6qpTDSR$XI@x)#nMnj>xyV?Y#}=YAEH#&l!GF&Ac-Z+`~sBTC=`6Up=HXX#O-h>
z?y?&}zbli~CR5_0!ylHLs04iQp4Y1*Cyze4ZCAeBuIOHD84(N@>RQX;*X>@s`9JaF
zt@ixnjT;8;ec$4IK?}#FX%0<lfBlW09vjKVde%D}&XK`mKYo69v(morw(B+zXg<4^
zs}hodR;WGvgNI6mYN~&EJf>xJ37K9sy%UAzHy(Wbw|{$6k;P0fi1I#*52h<NI2Y0a
z0<#dHpCYIeK1J9Rx`(iX!u4{7nVtMP^-<T<lWH}qm>E=Wt)Z$mQmoFUWP#SN9LXH@
z+;cceK2wUk)A`8Drq3yc%IDoWs#nV{78<&uA}s@25a7R6WhvY*kpyH$q|lHv>Lr;@
z*ZPcenB8+g(z?fL7gfTy(_TDqMtRN`@~3cv5Lu#d5jRX#QHdVp+`6JJ&*fnOf9Vw$
z4*$7q+q36=ar4J8Y&J)1*WR)JeC(UJD;cTUZJ}<>6ArrUq57z!-7@U{aAI9Fx;U4~
zeD!NjKl!6b!{wMP6vO`b^7WUL0E|F$zv`vecOTxh_b?GgBH?bF{uT;(Y?dd|<NhGV
zV*VDpIv&?VrSX%OUi;xQMO&w}bXgAp8cOAYEaRPleTN4B<o|fASkzm(!=31hM*NM^
z;8(u-jRzk}4vwUQk*-3y9*Os+(>Y9{rU)36Xeu$PXU?auUEsfMaXi_q;h@bQj8<EP
z??3v~gWuONV>O#d=p!C~pjxZhT%CL|<8T_j=%P^Q5?mcU{`8L5UOQ0E<8aY%*%OPG
zEgnhlsqYwn>bbwvO>aE85?>9O@qJFW-|1f&35FZBy2FoM^Vmz<hmH<C48Mrl24>2|
zLbgTFs*>sTF5SB)^N0WTA%X_sXl!__6pJ}R;po5l)302*;j$qN03BGwBRQ7;V%Q^-
zHF#Qc#03U%_Un~sMrexSlKF*RMk(#(as|E2$%)c8zx|MAHiBMPwNi4p{n<u4f_+?Z
zq$?Rq7weJal1jdvDVIO>yI+n6y=_;5sIOgp%W=E2=kRbPS9t7^CzA1m-5WkR$+E>;
zGVL~ZPXlMBrZW;><1(8sy>jft{$ZciT+|iCCT|4Ccn+6?klr5Se|-6C|I6)0r#xGC
zhGI*Kt>%w@`toagj$N{0!%v@keqdlAQ?R7cvn#Z4zJP$W<g;+FSQFug<hw9f!WSxj
z7q$$E0AE?V@so8YXEP^qd5Q+kUe=0H0j$qC9C=!@oyqWYa!Q{3`feVj=2CUL9TyGa
z&kfZNB}KO6$I~dOYQNJ1iiDVgoL;Xp^O^;SRR0{1wC<Um&$+i3Pn}Vo^MpAxlW2XG
z2ziQMmo*iM#TYn4`4Qxxp@^v7@?}9~=XwfQ!2iPHuiF3JI$YfRF}`(-4nBjTI-|oI
z^%0!L2Un++4ZJ#rLw*l_OzL*8o-@Lc#dUnVIYaq+72Ez|wc)pUDveHGZ*M-Ib(pql
ztAy7ZV0X4%zG%PRHVKYU$d@9Kcs4WX>o~)~Sh<w1>z=`r!x9!)Ysb4*7RrrWrJX4?
z{fGm7!Gl9X0j-*d`iM?L@Q^?#)atmh*>oz^TdN}hl*I652+KdADO@<6rmK4{;GwI#
zJ3Tp$yEMERbZj^sL-`nf%1?a4ddHc|*O&pk9<L*qXm@n4KRG&@k0!ef!#_INY*fru
zOV^BzjE`pxzrTN?QXs5oqH9sUqybT}<RUgzqlL@Nj?Wj$RPaR~4ke=5O2stwVzuaY
z;{XzGBuSSSZ=`;^(_?fZh8=__R<6Awo6o`8b@whgF*GhSt{A96JEvv>fl?f$*{v8c
z7ZAX+zIqnXrob{y;&40Yb0ecC<argxI=BYyxJ=WAt9C+3hl7!kY^m)I6)VkPINCvj
zr5SA5eG{2tL3efvs0Bw#`EsV#Zr8o#j9F{us)o%G^mOdrYDFS%W3C$XvJNTu{9U<P
z)nm8SM(~4WXl_?7U&eon!xzJ6Y#UE671QoZ<XXt4g5mB+v08inrS1Jo296#Zj-^u5
zQM}Jut%Zy4Wdu}DR*aG5M!@1=)PNMn<dzDu(zQ_6rp5s0>gyD8j0Hsir{t?LG~OPj
zVuWBlqVhSFEu@y!v#gutnmWw+QQm|o9gdDt$=ly%W#su^m|(aqm*ul+bc(plylv)0
z!Ta`P*V9e#cH3(X5EfYi$!bG*Zb_)ksM?9%8X$<(xH<wjOu+0upl(0UhXwpEEdJ-Y
zIo~q7i=98lB^Ch@BAG@-!gDx%NQ~PKe3ctGImYR>>5bZKgy--TYpp=2D_?K6oWTx$
zOS~@J2H+As4Rhd(GOBd^2f1)4X4mm3h_ugcZxI{A=fi`8BZ5VgR%l|d78x9`+1$NE
z>x1;=%XZ!FYZ!J{*w?VxtF4A7h9gHi0i}t8gLlDPNurYC?~KrW5b-H8rPj9$ZP84H
zvU8@cyk%3p3pY?H%Ix&|YAi79e!N8kBYda$gTW0Z)ZQoCTxgicZmO+au4{q)P0)q_
z06+jqL_t(QK(E(2P6tjy{bA2|CdXf#T$VaIAVaf;OZddV?XK668puP_oE~_$Dtfja
z`ksxIjOO;CwhY3!YqB{*<t7Tmj)v}Z`4^S&E9cZ*fkd{{XgkIAMY%~6DRC5VfSZDD
z3O~`5r6s3-eHZXLw^xAJ6caFO?#GE;quCOszKu8dsDTy(L8|ByHa8C7>nJb!A~pvB
zc=3>_i!jX*D%MQ9cwr&LC<{5eKbR=uRo<JbGBJ2*4m<$iE#Kx9uPri#7|n(gC1d=s
zdRa!|36-sH1CQLUpsR)95HYiyo+h4A5x#K01seuAJ~v@O@|9K~mcp(0{OO$M2KK^T
zbBn-?KIo~}sCE9I{pp`!Cnpz$%puw1uYC3S0@-YSQF8HcdX&J$#EVR$>YmN6U%#FZ
zIqkN=s+a*y*a-$NK_#*JIF<9PGr}uyzxX$FyTv<ltHlT+pvW8h4kZ!^BHC~R0oxXJ
z^$kxBCz3Hma><H;aP|ym<$eB6Ro#U|@?POD#o;~CA0Up?#9s<y6=;gjdt1fp*#GQL
z{(K7OQ>B){j;F|MCcmg_5!H`Gz}ZX-(CPJ;tZg>MpES6E7`U<FCOG3d_iV}Gv<uD{
zoMT2J;Xs%OoZSD$8*eC_32JC7mhy{IeZ!d{uGDMj4bEW#MSQs_NYz^q7f;JXWceZI
zKrK@VA+78<`&HC-t_pPF@IP?iK%6)$$vDd|@L$~BKav?vB~!%sn5)XFeCHSbb6VB2
zWqxt<$JB`7K*WWn1OgKe@@NB_vboc4;taRrgppNOICNTY05&H<ce%kX-UJOtTgDZ%
zfQMw+&%z((7;d9Y#Ap_(*u(O>&=JO=qwOL$SVj?5GAto{@|e61f<;g$HH_Q3NDkx)
zBq*>QaC7P_n{Ddvsl&TVxKF+JEzTGZN+m#o7-3>EC4G_7D=vx}sfrL0Q0Tl3zF6eq
zvbqo}a{QXxsWYC?Sv9cLZ|@Kzg{FbG)(oeI$rZh38y@>{=r}D2ko<%K@?s~9vV?Hq
zk|4}0K*=pS6w=U2kxr(QnNrQ3`4rA%GM|zHQ}xlQQcV?DXt-<-vdBO_OOHZO!VF4B
zqEiXZ$b#9OH8rn_J|IsSHugynj&naX<tFw#L_@e?Cxi#ej<iMFK?aNla%VPHP;fBh
zesocQkJlMEB;<8;OPO3`EXbOWb2Wd<i}+P|cKWrHa!P{3*+N=C;A{vm;uxyZpQYmo
zQIP&bM-=&S$5zhQqJh-tL?#?e5Z1U+YrL}cbwf8-FTXV4i?VT~6EM@}aNC`nXu3=2
zfFvOSryUtAZJW#G<_O1$j~z7Zi9m5+?@_07==R%h_xqh;e`0hz9Sx_-`3lj-R6eRq
zayB*e)I|;q3MuLNKSBxBRG>1KX+qUv=0(%*e@B0rqO`Uf8RqoUWJ?mn%DGlFm>M0=
zhJz`EbM2L_I}M|?dSG3^3%lIb^_JZQkL=dj88o06Jajt37Up=!#a|QACfW{St@zym
zEPWgG=7BxOoUW6%-+n6rs|DwYjNqKB!FEd#<^oBTr*tGsemvD)>TOd8>0k<B3x^pZ
zHiOUXo6p1f1;l^<?xRlEiQ8^_pAXxeKyqv%8;f)ih?^0iDmK%>&U3GxSNNZYCVZP&
zT;%+*jZll|2or=xrgM|nOTx~!kR(djMqC=e1jrR4Ex<U4Diij0VoW0Zv54F>7&zzx
zJcMv~xIvI@ZVd%net`@@dE{q?CO3%U5g|*J0Ses_K9B{rylUEaH<S_6UnkDE6}?bT
zsrFDb|4oCI+fGrsx7luUb2}(89)#$S1P4V#6$n!oXZT|!Z$z?ew^fw`b9|0bUmaE$
ze`K6DMOlEElAwfyRYFXuHJvyFX}j7@0{Yl&B6Tp^cmV+fd@L)e00`MUJR%taj<u!`
zDZ7IyLD)Nsp7X<0f^tqLtIR@D=UWn{b#|I6qChJ>gnBBLR}>eyhQT3UTe$gPp0ZX5
zf(9!wRt~fdj^f*5mPmbJk?mStq`z>qhNDB2E{-~(Q!Ivs_Y%?yRTT+~Xh)Hc$S=Yn
z%j5EDMge^o)>$>P6h{^w!V#+p$pTlEeFkv3X(25jFdG438t7Nk)QtYjYL`x~<i`wL
zd)OD994o}aey788_`uMKqoef0k-_xJRRe^_t5<ZF!-YmX#6cG1)A9xBp)#DOjsyUr
zce?C8SZboS1&9RM)<y=0$cH_Xk<}9$H*O@TO`}>^)RQFmw5<)_Kv$M^qblv`+e%hx
z&;OArfTXCIoG1oPPON0rW%|qMWMTTAU;I~yhoEF7$}h>fXv3&R;p}rd-G>hx7o2S!
z!FlC!x5HVlv<N6KI75CR4bp0-A#9<|svO5@3TN4b4S0hJXM9Dbp)(_^(;GKlYT#L@
znqSl%mAJ4%qd;2CW#{|I6fD$D<`Jr`w8`aVGs$9zslI~Jdi%_eb8>RQ@E<-o+3uKY
zSEVnzYy<dbtA+mVaJ5`VAx|oLzLB@ko`3kyNu&OPOsxU-3vQU_KfxTs>6#`2OVlD@
zg>CG?H4smlcDA)7=n>IMZT1?Uj=fHpP{@K3m_awKh{zC8;YH{W!Vn9|kSl!Ti{jg4
zL#Ly}Y$9(Hk+72U358+Y!d!5~9D`kH8OCyt)B1>rPBA@Jw(}fHDV~ME6k9B{7VQeD
zyHI546dy?v+mjCq2887B#Ds*U2>_o+Nazemupg^MIJWI@!)(~e(uL*_$+5kz*@^Dm
zP_lVz0#jkM$N|x&1lO#$9aTO=jF#4zhEEEh-akbUt#g5xV$qXX?5TNr>YjP?$C{yM
zA63z;g43fA;>rhG6vW2Tn?!zW*j#P-3fs(r0re9WW<m<t<UmeL!;n-@CPWrJS->ag
zktkVMtstB<$V-dJD6kRB6@PAOqeED5)-ckORjVSrx5~l}3Ioc*m|tn`i=93wpcf8v
zjsOFYVJM?dQA^P)u*Rd$=dZV#kx;aWFQAsWd)GdnJ3tqHWADLSwkUf@kX_*_VX0l%
zmZ}FGiswG<h#|I3Boy>x>G$|>$Y=BW_U<PKuP3l)*FL?~CVFZ(97ovc^M+JD@P`yh
z6=ten@as@}>&p2*3Q25JY+j+Fa0^|Ms;T4oIj9<0AgiP7JAu5z?;&os9tp*pO=7&Z
zr*LlD-`Kl9mjh=(tS!1Lq8DzryWWhjx?*?G1VGK=+>(R0?fFb`-@b#aTc&W<%~rz*
zhoi(cLV0Eu!gGPA)RE2GQ*V*_Lg%dyzzT&FWTn5o8%dSKDvvt8Tlll>YT-`^cD-q$
zLepsIUa$WYGIP+Y$(+x;!vEc9<5}-`mwMH#H=fT^#h)p*O|c8GPYBN(Xi{|G(iGz*
z5;eo2HTcNZGXk{^0tzH;aBMJkh&XvfcmnurhbHWWb&t9YQCu6%0GEkkvK@|uo4G-S
zUaZ?}@^NT&m)3NfrpIi1bq&=~#Z$Tz+f3;!*3GK2;y0##kZvd3^LZtmzdW*(nCh#K
zVwI${6cv^Ykj0yt5>J{6WWh}q4T_86udrM~s^mIA$x=a-Qaa9#?(Wn*?Yg_u<lNad
z_^3u<MZ*nF3tM?uUZe_Oui5P-J3O?bCUX6=B-P%+J44lZNY*n`bJAIEzL1v?C5aO(
zVUDMd!hp(ROnsCf7;x31YYG72UuB6U3MV3pLuG<TWVu33fX>NB7+vNB`Zo4@N+d6R
zTh+9~(oO*)kH8|(DpR+)+B#oBjP5NxT!w~pdsk=XL!YO8;kNlqfqWt1TX^@pMu5>j
z`y(53iegwtp>p)}V!q(_5)$6_#@@Xrjvx1XL|K05#L4l|v9@mbF!p8J&D5oncj(nq
zbb#K+z@9nEaG~q6#M5l#fDn0NbZqeCNrcMeaPrvkeS7!1><-RLg#s^~dUhr^p-74@
zsb#(hxrOGR=cAA>1}wfO6<0BRl!}}sy+fAHqOKeT`YC+mMB(i5qRsuro@tzIXW-0I
z1cL)1KU4V3LfE>3s-WPExWm?=)l5U-d}0>Pd%)S|^SVp<+*CeufaG-Rs5w_PlT5y<
zzH%buOX@B})%pm%V)o@T*XcsyKY0AazG?i+h5UOG|G948)AGN_^(&weqECrH*MY1T
z29A-VcoBFI(Ro!A5WZn*Br$rp2hCc$Fwjs6mb|i7M)?9A3{mnVgN}zx)B~cr>@?Z&
zCv&9&YeNnvBUT{f#ZW{3iVRiW$#6xlxy5$2ENJ-!`U@{2Bl+S<BF}-!@E`4MiVd@?
ziqC->PIF+^B3>9Q&_JOA!LQo1M@FG27ko3C6mWAq&YHo4+(IZbR${aY>b6Srf{i(o
zc;9X&J#rz5vD(y}(NaQCY?FbT8`OTkVk-q`)tU-YHAl!JeAKyIRjs3bo(BgiZk1ws
z|53gvM5lkSIun&p{gCb?%E3zGf+rF7(Bov)1;SL5?Pt0ls}A}ax;>m@i^?CF+Yk%2
zVT<K%WWOBA8RW>&X(ASqSp}#WENYCXmXd2$i%*qCnprq%Sq-Y*O=;&9o)(x3$N7l>
z!;n5lhhbr+j%){+Is?Ab(S4r4&YiEjFiUMTV8XDS9vdF%@9jz?BQ4!9(T+uKp*J0N
zk0>q+>6S4)bHv9a7I@sp(d}_#;WsimtT)hIcDEWeF7Dp3W7B1qHCuJDfDl_PDFnS&
zl}T9PsoW@>8h>+fl!~h&&uX5x96mhqmiL^>Pby*|W7R^=k?=uXTTW3xGktE~&Ye40
zDm0okw9=Z**4W5!e{T=AbQtHEs3O`i85A&Qu$_f73znwfY&0AWkK66aW(p%CBRXE7
z?9Nu5i}u|+U*B}aCCz44h!HqrJ~3x_O3u_l+EJ*Is`u1UeUtN3$=aYh%MYh+`Q^_p
z6#fYH!5<66T|2g4arycNJ)haqWIq<2PpQ^i9p@STzkCb6tsE|H{uq|R0cRm{HFkwW
z3lnC7-DlVW-Hx^;d(#NRAOvs=FlBNZj0JJatZit~v%-V2P&AY@__W{&L_G^v3Nj|e
z%*<u_VLQ-rMo++$4eOTD;t<Ay2YEaBJ547iydPU%l)R>RJjk9%rd1NY+d5&ePhFD}
zYe-x7%!8xiF~Jc2OKJT}nYjiS%r4*~wN}Mc24PbhI`~+?Fb`z_AU16#S}i8*o5OUm
z5^)*u$GTTJ&cjfQIZ$jOaat)7Q<tVkM9UdEYnOhAZa0e)UMLK(#RqdXGu;YS)6B8_
zIt`-~h-aV=a)rXng?W@@Vpc#3LNiH8-sK1SDv7Vc6inkQby2zv{Kz%y9yRsSqv+Bs
zTBM@<YLP`aX|yi{Qg%`0T!AB9RFslM3N0%@Q?;MyLn|4-<imBvISYKk84LJ`pDhL*
zBxi#>Q}%_E_cQ_)B?E%l{n=XGP2(cnM(7~=qYi9%o$zl*4<E{BGXcN12|Ek7<#3!h
zezd129*?aT4Sej*T+V8%?zGc~r?Z*$w{5^-yPJ4aVd&!!eemRoV@KT(+qTxM*8@IZ
zK9@Oq<WPTquh%VV^u*MZQ6(ReBIi@YR*1{2jL-L}5-X@otziokvni5w{*H$gk26Ls
z>StAuSCO0GiR2h$00Aff1XxhnhG_<<o~~3pN>C+a!)>?IRmHQ-EZdD!5WozXEuS}l
zjs`ai&O;|@CbzO#py6mf8j+^#U)1dt2Sku(iZNGss##OyP}77+(6bP<UWxcg6@GS)
zVV0k}biwck0ln1#|7?2l@WBI%78CN!i946J=o`Iv@SnSuJ$ugYc%Ys=yLb62oEf%9
z9nFn^*NI&LHW?C@ojo|+B@$w&q|fYxv<`M)+(Ji^t!j&<Kyr*NKekJZI7p7<gXFmF
z;XdK6J1rN!hs2Sa;$UR&2H6`7hEx%9Zr}ml20?10IpFJ*+?}er&ft{J%575&@hJ{N
zUM~+zQlolmY+G06$x$><M7|5#O3j3If~i8jOM#VK5&aXn0qZ&df!sPUipI49I$=td
z47MU~ruI~yH3Qv?6SzIu({IBZBfMlx<A%)Frco5%2jDJKequu9PFcoCL%3a{(%rze
z5B+9g;}mO7<0$@b7S8etUPuY8&pbhLrWmrzOv9@RDi;$1kOhpuVyS7$cQz~9hR@X9
zYRbW!4wY^UNfE1Y-CE;yh(gE`LZo7%)va2J4G^>m>1V<XtGB`ry8%$NA$hN8#qeIb
zx(7`mm|wmsy>R&J7P=W8lMnC$JEK8%KbUn?SEo>X7SaL&XCg40mYw<ARG(2j2MXYD
z=%`G#+go3H0baR<%@<=CU1D2%cx-TFWQgeWZE<KLj@uM*g&NZ{&vJCZ5U>)*Y^+ru
z9vvDP9fHar4TZ`h^@P^E_~HvVj8igZmYR}-toNV}a#U1E2$5W&%yV-T62vQ1{%->2
zj0-}tNY?R;-+s|oMNU<Xrm7}z;dYIRQ++@H3IG8XS##TL7FLZ`UE$nud+>X6c7)}+
zrs3s+rxeZ&_3R)`Dl%#tt|pr(fxr0D3+!594N2p2|J)#{`ofto2<e=@CQz&MnJd;W
zZyV3)4A1rN1;HPr!5=q>h?BOy_%j2(7>g38TB6KZe|{4p^8)`jm1+(zT-@?wwxM0&
zaC$twDiPD`jbf{#cbZFk-N%mZS+#QAo`Zw^t8Olr^+sjF<<M<D{J$ambzs2Ja%nw@
z?%Ht4i2|nEaQTCky6JOz-G&oWU)=*8s5|RTya?$YN48e+6RpBwFJ)@JK+qHRkLB|k
zF_Jr4)X{2Yjz&W*Z=`$IIFaI(YMnX)Y-uq?YgE^j-9}+xtfZ*t^w^&B!94ZSbNmBi
zgf|I7uC&8#4gNUS0XLxsjz+l?3;Tj0TeX(MCuS~}Te))ciQz1wbAQOCRkE61jYI>j
zS~cHjMw2P{>wKXA)9VceTBi8j#MF+t$!Tlqrr{1o)06pByi>_d1RZ)f=&d$&e>{Hh
zWV&ZjUrTQ~^`chK``vKN1Z?rvaeM6Yd;F17xmIho;*luEI7;v-5>Q$BC>C7!xihHc
zIaYkGZey=kjOx^?%$hf4o@NySadS9bEwk-%xo}w3Zq)+_1oTFvW9Dj&wl7|_8&QII
zI5i_*a}YYj(X7@Dmn-NaIAo{PF^hOCY$20(_K0T>8^KoU(%Epa(hk@=#0Gb^^oq;L
zJ~qaw@E*}t-HqJN>ufbaw$llQgV~{jOCvZ##^A%QyP_?xPj{Q`PQzt3WNm||7nv~8
zbvdgF_*_A@knem(K>C=a?sR4;&m?7PRBAd)HoL3Y(C@kL{tm&MeEz@s%fCV?>z_UF
zn=FS}!0D!mEYpSob-hJ+PTZ#RAjAHQqu1xFHgI$U--hx~d&`YCY}vAf{`s}9eg{V3
zOTYJf^?H>p!=};n5Ob%kQ_SfCy;-0L(O{cpE{^c|vOfc@N8j&Egid|`1wWm->1<~d
z`-~(PcpX(nuz>60d+)oaZQ!^+@ZbOPubdsnXCL@1wstHmh<L@(<^boq$L~Xv`wWBy
zb4~O5g4H@h9!qw<n_F%K<}EDKzy1&Zpttoeed+hAjhf40YqU@tcj>GJL|{Py<_J&n
zesl9p&!qH3q4t8ntaR$vSr^aO>4n6<Ww!p+?@!^+Bntiz_xBk7=c^shHJ^){KZfI|
zp@M_LM5D~&)@Jwky{=69_)S|j8hU5divHu18A7sk2icl4?E#a>aLc=r*-TTS#Ep)x
z>{&8CmJY?ECv)j|s=HEzEkGKo<%+rPRKTZs^%?;xYyoelQBU)lWH1TctrbhrP_*R?
z7(|4#w=dmriCsUw@Az?NU?XM=CXth^5i3J?Ce7)foq5YIe*&$6-GuYUtFFfs`2({E
zT$ckM{93DWVE^t-SFQ_(yRvz6GLsKQf{l(*D@}x5W+dQjH*-!+k41^nQpQ%jzbjeD
z=Afs3ufNXTKh8HXJTsg4mqwz5vpv7p7U>P!Yq>&}c+qZ~9t?)O)kYQnO-X+s>eAOP
z?`5z5wViui-bk%Q=sseLIQ>4+*MMb*D^_X)B6b$1*Rt;@LhD~1h4YsK{JO}Gl^TPx
zjFE{Gvd(oV8&zOBZF`U+Gp`~WXjK~3^bOZsf6|%UGc<0RrSZv;B|!&NITB&fLYQx3
zVm#XyfupVjBOZ6#Q)R0|?TwIuDA0?2q(s;<nl-nBD26B$YV}&B+Gsi3abF;un;cCn
z?ItQtwN+cQc4fqDg`3*wNTucU)HG+^5U+g7D4m^F=1065o<A@aa{84(KrtofS`*=X
zS!y|5Yz8yrMe7xBji^#d@E~@BNDO0WC4yo0KZM~TYsSbKM?-BNaf=9y@u@D%HBTwd
z+Cju2L}{Y5fpA)0C!WG(FGcOo@W-MF9IcG4KUl4B%lSQ8IiKm$Q7Utuk{BXOQcR&@
zP1#t37k8ou3{6CgtQc1Qf&=gn+5@?x!Wm*C6{ZzLf$mKRTWBEa=bjFHv{<6xP!F6%
z`whHhHb7SfplM4077a$#DpRnYUnHk;le)e|dRGbl0{(AmjdvCQH@$ssFI?QSpQvUU
z4Ii7ydJTnH9QKAhP9pkz`0iVKd$etPw7>n{F5Pekh~S$a4Y?c<ua+L%k5`o3*cf|A
zBgc2fV_j9V2sf;`wU%Br-A;ljBc$y2#;du4&8UTgu5@lZne_U-zRAHs9P|bf!O3)9
z_ryak&)PMs{`3pWt0nEff90PFBc5C?8}i9c-o=~5rI$=6P)SjiP-K|ewCm-9&)r_O
zsP|uf>Hefo`|&H<m;dI$U@&ZNmKH_4wfsaiU+e4bs@GeiV-vn`OmB7ksdyQOV(?ff
zAflu#Qz0}KI&8S?Ml$HB=ErK;@xG9ISt>y6x6$0F(;kBG`rVF5(7P-i`K<@8%;dBT
zuvQGc+{9{xNY~7Ql}aVxMT=m%bg;(rI58Emeeh;62zW4`VPoN77sKH|^zPQ0@4Ip1
zZ+`yDW2W}U|2bF9YaxHc-7dlHj-K2bO7v?!PuOd7G_laHRVT|XUlN50<&Q~}Qbd`a
zwiEsZVSN)Ls8X)tbQl2_I)@Iuo-JiZ5!JViW~o$+CBvV4;MV04?c_H7|NEP7I=u<B
zHPG4=+j~)l0aD=oRi=IOxXb)X6^(u_OciV^i#Nr<aYe<IknxL^0*gdC9{K=$6Y2EX
zlKNGC5Vnt-6su?P$JRLF6%Gc2sCJRV#Y}?FR{e9iM#U|hfpRv^mX}5gK#tbQ&&zP;
zSHT%NAYcmV;)F!DyuyiCNG)yNn>hbG=)h+I|DOlWMSA_>=8pj#{=$PwL<`X`VzfF-
z<;|C_+=!ZkreU{Rm>i79dd$*j*xB5?;j(q>*Iv2SAMt2=$FxdOd-9v#FO};@hX$id
zS2Xdu3xAClTD++H+%>&fEM+z>Uc9V7^}f%2sB4ktc4-l}_S$ysdrv%WnDw%)<*_^R
zxwNJ&kB2qFc>e6z!BDWrZp>}G%{lI8!W>Vlg2)`h7YHPx1h>;>G|N|Cy|&*ck{LYn
zyEG%>Wy)*x`JH!u@M9Y<jp}%~^J$ZL?Wv#c+_v{%VPeSb^mkmImSK3jfoelye(Uv;
z-{&GL+)txre94BDAH3<Rfn~mQLHmFI;bE<T*c#r<F)=yjZrY<hE!MByyyfZ#A9;!(
z0y2+^yj7g{iO%sENmxT9A3tY;J>%vJ{S{(L5txJwlq6J0EG&?sQO}3&y*XghqE4-r
zOWSqbO$3l!?y?PQKd|Yl%dSZwJPP}@eJ_^_lj-N4f2l-YYvbXVinCf^5@R^SbHH$5
zVA{Cqs*P7{T)l3!qbDTRdB+ZFFTK3`)opKN3h7`VfGoe5*WxM7w#>D-H-5C-#6h16
zHU?%ImIX6Zq;!a&$$ud&An;BRfZw7ck?87ELQUa`<>E9etW-9TNWOVQzv-pwsVQDf
z{W$fM${dlWD9LjDT(Wvo-WT_U**L?{0)>TxK(9#6%-3@P$9hCPI@L_pQ`0yr#pfxU
z1@MKmfd8*#AGo;rV}y}R<v3G70B6&i9bHEO`QF<;f-E^x*Is`98M9$`7~WLacH{Ne
z-SWY;VYjdk4CwW}TBAqn`Sp9g|Inl3Eia)M3e|Ejh*qo49yY3(<ytuq^xl5^mXF>V
zZz0=MM%9J1cI^hOtLp<_{ii4Pjg$znFm&Shww+gR+Mr#1<t2aj*c)!2yIP*xc<6K8
z3q{`-38c(f>aV!_5HrNE&sC__ZoKwNnB&QUw)54O;~}53SzO)Y|J<iPymToc@-UMW
z);Z+We&;in_dF5%+IJrbr@}3pBUi3PlVPJ=cRRIU5EU3LTPSsRb!}R`;-h!0TGK58
ziWU9ZSG6kEXa<VjE_bcbm}u6HjA(27wQDvdzxD0B-5GYmx!K#OUIlzKq$=SoIMmEI
z>-#)96*-xeC?w^UZDPW6wA+Z);9N!RL}con70ddT_u0#a_VD9_h<T%-NOAbcmP;@B
z?5FPx^;?{;NOW(%HdHBx+SSHW&%UlZQO#!kLTAL#5x=Ie%Y<Ul@4f$yyKj%h9THOn
z)o+$|SFh17U$b%9(*CbM{1a9x)nf7SA3k~8zrHgO(KcQ>aNzm9G@a8fVS8~^g7J;5
zqoc(<OGrOmhlRfj2%IwlbaG)^74OT@N?b%~SgqyAXi5n}@nmON_FAu+Ve<IO<J0eF
zhYQ*BRFb?@k{jM*J0Dc}Y&he{7(HX4mNdilat==F+L?!+gR@{hg|a*-%Mub7ktv^<
z#e4Gm0{*{}&3@5S5q!`PmkBAC-XW}XGam77TI+{=j*l6{5n8$^i9yoUm#_Hn2i7Lt
z+HhKX`l-XuzA*C2?p)ZWZCdZS=Z>2qK1a2bfkDByy4h~j8zqm|)iI1MS6_C=tuPoO
zX@2_IgWrF6yI$8KK5gw%?XH`z)tco_tHw^#(9pQwp)E>kYnJv^vSUavF5ZrU<QRen
z!=|P#vC0zpv3R0rw`#>i)OYn-hhNj`1>w11kI_8-<KO@2`lTqRXnS`~e)%g;{_wHg
z?Cpc?2k%(Ae&sTc@{y_Qx-!qx*i`lTd}v2>cP0Pu_wW4TXIAwlg@G&QFgDYgl?)ap
zMyt--lU*raH1@Nthg+uB7tz+QTIzG*mPA5V(5xsiB0{Ek!p2Xf`Pq|Cbp}d0&B2Sr
zg~^PLD<>kc@1Q~N_d2h*>{4zEIJJX&clz<C(x{}ufiHaaP8JVdP22kV#6v$h@U;hD
zpig#%w9kBE{T1t1IJLT6Ybk|halk@!bYYim?d2<DA%|9KYI_fAue><4ecKRNF;RZt
zmPBtV4thbaCo?%t0ipqI<A&8f6!a14n@tRNaL`BDh+>L4{1mzie-;pUrwGjOQr5}L
z*V9?9bIpAw6CR(P1J`H1o@-veTKCNWI|Gr0$qX!fJ@*>VWFjcfVvemmtzXYQxA(%G
z7A9}TYW9_L$6*2gbH{4#?!36=$Kc{{xH@pPhKW&<%VS@+el^?!vWXu({&cm*_H)&t
zH?G@!Yuu-0H0?Xz{o(G{_oTXdinY?fpZ`mL1i5c2{HPCgVsTs1!Yd^H?3)hL8+P08
zxouN`=bPGBzwy}X`*-k`m!Ez8pMU=I>(*;mT^?P&sO#v-jNW!{-MaPm8$X(iYIogo
z?ZG48V~bjaa+td&o7<d~{Q>MJjGYxo0_IrQP^Vqrc*#nz1=FXWc)@HoOsDgTOV?Bj
zrak0%=4VI#>Cq>N<-O<Nq4DwYFaGu&VTX3>jn{nhk!S05EgDTVYE=)OQ0?sk%H7q*
zidCtl?CfjW_HDWKJ;8w`Zk$dB{GPDgM?9=DX1Tb3YulfH@x{;HzY4MRJ$K#q<-h;O
zO1<WC`C*Q6FM|EaG!VjWl3?X6O*`xBygG_CjGCK77g<iRSh5)nw1iQuzyY4g`Yssu
zZoVv4Z;CENxm>K(v|2iU|E<?A?$fF*?e#;&XMXbh$+1zrTIq4S@4x-(SU_8{DEi8g
z!&uRD+CH(T#O?`|d^>LBwb%EZ*uT%V=cVn9T*lX~u3j2i(Rc6SzMkQV7EgpD%{Go&
zY9r&{e`L?+?%QzrMptiFq*>`yq)5Vx<4%=1v5hx@g|vXcFM$B^$Qh9&Q$9KImBpdD
zIP;S*12eb%g161&Fn!NVR;L7_R?gy8^1YM<%2|38RGqI1X7$6YpJs0R1@N4X^VE9Y
zdQkl&SJc-j+WQN#dXM~Mp=?!l2E7IRFZR}-VKOexiV&W|;ld4Z9ha_dcqjMj)hn6d
ziCB8**q|?xK;#$<xv#jyhidH66Rq9*4mthtVuM&9f!BA95ZI~N()#=QJT5|>I+_(s
z{<VPH?$GtH%eiR@e2cbYUwtT7soDItK(b=k4jw+n#y+-TSgPTDz#k0c3Ptv>GkNXS
zD;++qMU0Y*6H2SpN}9>mzjB5k5>Kp_B?bzT#O7riR?}T@C4J=JA*>ZyPF#KU<|T_A
zrK+~;z|rHA#qmmOBwHPw%%FnI!fVqdT}GqMZQZGap|@~S4Hsy4db;|Tz=u7*Q~NJ}
z_5445|G^U{k2Q&1sA<_;x=<|J-5x`86e}&eFNBlna#5R@)HZMMQGiCR0(0#5-~f0k
z3Ez22E;}Xdp8j!u{1qf6Da{JY_EEmVBFGjq+^h{}#5&$9d@g$`LhZCi9($fugx~Lp
zL>$*#dr7gZp-%GM2Ol{(oJnV!-F*YkJ^ymOA*+j9Zds2ug-PTZBH4(CWW4baqy{I|
z_Gg}Z;pJ^R#wH4We-hKZqeq60965$|cQmN=_xDYVPo^hx(QxeWf&H{8jn&;93HVvR
zv`vF1$K3$B75qZsr^+vdUPucFyiEiwR(Li~W^oc&dXcPi>sPo6i?KNC8)kl;d1Q7D
zGmlum^3s_r?~#)QoG)xcFW`S+@t>o&U-WdO(KI>^Z^!L#vU_Z^uU+qFuYWwJRa$Ju
z`!G{VbR}BEL~v+3b|2`t!*xfvW{cE2;q7~lqn71$Ydzgd^g3R#JT8aF=XSKJC2zaE
zI1+*p48muh7(G!iYu;qp87>&kgM(x6WbhNjeL`EHSZUyy;lP1|v54pigb3+yN;htf
znrv>fop!W<*9kZ-ksPCpjA+(uRj%CRXYG+K5(c(iuQu@R)72e^d#ZJ`m-d`+#Fmsg
zAz#<hq4DgAp&Zhi-h^n@h5~*JB~c!T_}wkc8tv|4%@|D=zVi3~bl}*DLbZX*JeOBi
z6>eY1;q@0QP4vBe;bgw58y)8x`;Yc_iGs4<<BWv;#bUwd_u%H!;*8&f#e8SxEd67S
zGLgeA8#oo!@qI-kunp^1!SW)W&lEdNLv(<hUNhO_4TS!G_TDr|k|aIPi;RrOJL|6Q
zo<3%Jx@V?mcXoI71{Q?43oe!fAP|R^kN^o1Ofrh1QAY8HBr}RM&?Ga-^g}Yqh#%4q
znM5-gD3b&cf{2?q7PG(}n3>%Lc6N4W=jgfm?mDybj>rf-@8_9WS=~KdSu<Td)m7o0
z9Tnl>;o<)J_4*f|f9`Fzy+&)$jjFZTbJte}{Wyg9lK0~vTuLGzlg8MX#=xm|4FS(I
zR{*-Lc3Q0;??(C6_1?M1o{st@R{K#d-~0Z{g+irNtTmgR)s+=|c<%?FeDuckOS8--
z!!YX7GW?J)A+f~CnadD(*9eS1E{|h?>Eh{}W4(8w#p9>OC*wy(aY!%rmb=^<`y7!I
zvh0u<=a<V#h;F$%4)#tbF8nQ#j03y7M-b;R0=JxG_}>D^;kbOz`LP6YYz-FW0_+p}
z#e4`m_Pn+As2At?hU^te_4!$70dmLdD;upS?6OUWUypLLtDD;On0~ZcnJJg+STjRV
z%GOM!ASkB(VyTL91(97|Z6Lpk$_u^1>{b+By1qIv3R$bwuo3C83xnt1SYO4q7GI@L
zEze4d_ux9QPLpxU?6Ec@HHxh5-|Dq8@<j5uxd}7I3|<L(H!~}P6Dtj`o3GyJ`bmBH
zW-IBXY?>1BAhU=N5fRD+h;xujc3Vi?nW|tSBxcJz7DeXRpit)&1gK%QV!$?t{JaoR
zs5VT_`HjW~QNfRAW^3RYhIy=E2zhDQVi%YbIU+XA1$TA8jzbYLC_1M!)1M*$!#rUC
zLIZX(j}~WZgkY#&-(rtv_I->8UawSE6@?#zXqulbluvrWLc8ZRnyn5Dns}hapqn4C
zjuJ>@--A8(m?;M3788ubuhkb9mQTO@+Lh0K{^MJHPpe0L*0&e?k&nrielPBJ1>($X
ziA`h?Y5UrWE5Otn#%N`dE0Z4&OD+48A&?=EA&?<(gb*0giw8HyviS<-%K5BODin*X
z9%D$qeq|+2OC9#*%>{)Lq)rUr`pTvk)_d&w;bTL0ZmWr@3gw$^*;4jZ$Q5#V2`0<M
z5=42nT4SLTw2Tk^om{EMgZbJZtgN>>$W3twkJ&wl-Qfm)aP`I-egr<g#LBb>Ty&8_
zYsn3JO7laVRnrEJ*qtDQ+_-Y(%0Ox@g*ZuP7qr!O1NIhFwvtk=execi@ZCPt&H}@a
z9T8-iGl~{P3{YtII3vei80ErP+f#?|Jbp%wF~Y!mC@#GeZhiCSO;Kl7q#Zj3cMOZA
zUWD3(wxV@XBUmBgZRf0HhUW+E4{4O}*nXr9Sm64sF2#feo9DrgPaaz$oBXb>bo{V}
z7??c{5FYd3W~15cr`<U0#)W85N^&J1Gu@$J;`0T@PPJigz;>_f{w<Z5APq`E4TXG+
zQS_o{tJD1(Kk?J3AQmuC_^sYXi;7>2lF-jr`%$uv0*Bz8Lxzo60UA{37tt&dSAO8T
zhl5rNvh-vKWC&ykWC%PM2t3#WAp9x7=xd4`4#}BUDB6DTNZ?jCg0O-}tv_HJ&Daq;
zWNDT90@P0OGah1_xV&VXn5J*FIxLuCzu(>fW3&M~$c14++M&dc-Q!%)k5kxUp;Bcx
zU<X-@gqjWL!%~tjuQuAow7nYY8V|N3bC68d=fTQco&ZQWE|0Qy<HiaEHBCc=lh|u3
zFgsz7Zd8VuDzJDCrC~&m+IvkC1|-K)(8#0H9imErR;%N_>@$sBJ`^iVRE!i^FB=S*
zu@L0_C_N(*-Pl-zGLo9v#RUq)FND2*#AgDy8>{h#rlE+`77>bGj`V#GPB|$3W9DvW
z9JH-&5KwI)$;qG(uU%vfsm&8sH@5P8!${d@ETD=4BL*dc0{dY3-58b(PR#nMR5fDG
z=PJI>tT4d>2nH|wv{&%_LRv^VgOkUO_u8AQE0;d}ktcud=guJMM-%1`pZUtQn;S6!
za;0LaK1lOhTNso>Uno!Aj@b^F{jzeQXf_gVlt2y+j#)}F1Tq9N1Tq9N1nw6C4>&)D
zAEZej@vqi7F{(o*AXQ!*1%Ajd4~*%cho=ZuZ5G;A2-#_@ANQJA;e{NFe8fFy9Q%Wy
zYsnV%?8Vtj^4%m4X@DyhLS%E;f{l4m8ns!yN2L7Hs0*LK3P3Yvk84@6h2a%TFqH@0
zxhmm^CQ#y#c}hjX-54Z%dq=0Gh=3ED$3VfZgi*J}My6YhX0cph=8(@NEO<ouo4xL^
z9gY4sIxq+=*;ZP_SB8~|R*D=ZPF9J59_RRQivlgAQJ6$Q8aX3q1R!RyVPvH6Z(ngt
zuui2CEdz^I=Lg3*1W6B^*!`T3CUmBPY~Be~mx&i-6ky^(7u5?d4GX!-AQtQuG_&(;
zIpzl?<^h<?>!An)4A>G(2el+Bpb^gY)PBFkMmeY+7h`|WXct%%*;+mS=;Gi1`=0_{
zO7PeJ<;7Rtc&piiZr28087Xp-G8<H3l!}-r(SlT)b71Z}Zz<HR%w-5<2xJIk2xJHx
zH3S}PevGl<yfh{uuW1O;7<8e$NHphaRYrPDTVS;<j=JK<v3KUonXN{n0JR%*;?CA_
z_J<a!^197Ulm~h-7L-}-RK^qr>%%tN{TL-o&zr9ni)p`*#%veY-CCcom!h8ZlDn-2
zrhKC$x78mM>kDves6=SG9EYL~!Qza_N!|%a=&EYD9(kNySiqnde3$(O*VZ+`m@j+P
z5S7(rYjXpI0e0G(sbWFIYc^zb2~NinN><z<ikE5xy0VlSP!Fk>DDsFzsy1rE!%DLx
zE+#$9P}eFI0ulfl*V6e&QhoIoi-ySEiZst3K#F{OI$gT434*8=vWvzJo1(Nfo77%h
zxr`;_)E_VttFB%b!ffHu?X(iF6XbhL5c2+jicKX1#;6le>vIFm5}8gEm{deD-%aKU
z#cOX|{LqEdf9F5^iF(0ncD*luee)0h=u4927i;-KjRk$JF6)>OMN4xAjTm0a+u3IT
z+KrY)fOvjl<&z~XJ7frC2xJIk2xJHx2?WGQ9%zROr$6XJWkdG#iMovN@P-p`IOLod
zaE-dnjaBihLG+OiJ;FERwBHPKT{d5M>hUx1V}zYoufC3C7(EKU-B;&ld#UWlUfXQ8
ziNN!odgAPCG0pWh3u&horXPRmarBaT@a;D*!XEkwJNX93&YZ>U0Aw1A%kZ`b8c!s+
zq#qV*WZ1G|K5gt6d)USEmX?oW0gSa<NHVWp-b6RzSj9VkW|{c|8?OQQ$)$xyP9KMI
zUB9}~?e>uM!jHoWyAv)3VG+SElNm_x?Hj2;ZAFgL0kjyNS4jF0zS-G2WeT><b_;C#
zT6!i*O&X<NF+$r3XNR2rpca7W=g1*DMl_C!1=3)S2#i-&ZU7p;xrLG}cn{FsE?<44
z$IdAWb?^A{Jmk33y`IlEPn=kI_*5PBlpB}W-kTX;v6L^(6pM4^LM_afQ?C#YW{af{
zo`3j@fAdE#oYa1Yue@^o*M9Avw^$+>6cWFXW4e-Ki5@JtSgX}Z6vZn|afJI!W|)Hb
z>@@9TmwA-l>~DrZhCqfuhCqhE5kla8kU3b)*p0<|?gR<oSO_SxGYUDpYnOLFXI}|@
z$4`?&E(!YR8%xGnpPlj3Za!)D+s)OL&5fSlPr|3p!{k!GeZ8LBTCVplJa!V+wbc!-
zT)x?9tSyw=i{<8t8FV*0`J~@yG&du!<#``{s<Ie%y^TwycyoEa@Uf4dhgv`t-+1E^
z1{s1us}dw<PMju#h>%w{wA2ZA_cV|1y1ctfmv_!75{~2FJt-mgim8rf6oHjw+D?r<
z-uUIw+>Se4z)&7%XKI7A9VIRHD89O~k+Oqv!MpIt!mPisT<jI1^%Ex-mlnOv*t@#A
z(P?i5X{Qi37D~y9*+RVpEsXLW#249cuaNd&z-&tGcwvzCi%AFkxr<6OEC1lawOZAo
z)a$SUGdGAl6ICOVm4yvkL?|k6fWf`B{?AIlN@0ZLB&6IJSF>@*=}#tw!IRN~?f8<|
z#!&Cn`an6FQ2fvv#%u*UciT?RL7p{sD~%RfI%IaHRzQtdRxNVD<&|b5QIUN7!xv7?
z<cqB<wZZz?<1@#Xyl&)u>xC;U?1N3$3;Fp<aiLO}E9a|0T<|-C-pb$prO*G+$BHzE
zLBIWf{`3DW>Ti^bK1QVZcv4OqGr2YuxmaMOCAMEY3@T#?LvneH%cX1zP7;yp4;$n-
zJCj+v<2jz0%Mf@k2uwCHoAJ`XCx7F(nZUV017#$>Ch_%U*tjgj<oWm!PVUE)@4fUe
zlWDyd=hNp`{nECRScKk4*xHY+<{iSwA{@5U$Gjcu-p?kB{`T{|$M>K-k>P&;N@{15
z*oOqUkyT9%3S;rRE{!8T?&!j}mlH<*(;eYYac%ZmyE8vC8?SHXn%$)5-E4cchw@>t
z6+|}{s$r6re*H_YrDbo%^ZuLv<lp$j>Dtq$O27EUkFyOPjPGkNv|f8{eYqSw<X`&L
zfBgAh`Hz3(=YQnmi)pjDw)O|l@=@BGt$2U$7ryY*Kk)d4dH>)1=^s6D(p&eu-+y*>
zWuv!PEtk93P8Is`+9pe@Fx$Jb-X!7Er(aN5r><Pn6-j_9Z3CIkkeqb~lam3ny<5j1
zG1I3<j5LQK=^knjJwNE=i|x?wh21FN<No0FO9>{{7H3(rwN|Y4u)pxHzVh4})XMYz
zt)KbS=RQ1tzP9?aKk~t!|M|ZGuxPsczkm4U#o20ou=auZ-v9icKlM-l;itd&V?XGn
z>}(O$!v13qRnIOLpEx=510OjC0gT(;C!T!x+;ZV#=a+iTYv>mEb4xdS{n^EOoM=;$
zw_klDZp3JOFV+@X>sx3i=hGg^B(<!`gLk4$%LdD6^}32AU=2^9YG68eIlf0nDU2O&
zaq(+g=h0hSX}Xd{M**R@h3Eco2*<=k+(%10Eux?8r|mfI1;uZ_dD*r}@gAC6D0<zQ
z+2TeL|NdXT&?<SGp7-zmtq*?TLl1v)(fi2{pZcYr`2vx@x#4~Fg>U3@8x?=`)Li&K
z{Fi_2zyBwn|64!vquth<waVt-{kuPN_N=7Qt83o#U;o;flgB>%fro$iGaq{T!ef8^
z2R>5S`u^F<_2qDl7u9RtdhEUW+Ey{BH@9xqYOGG}_HsSHP+;{QDc?)|QQ*vF2)tVa
z)FbNWz6Y_3CSL`8);kQ^GM%18y>7i)CdBnC*L&?Ay>O{mEtcyqzjCqLAH4F~n{x}x
ztZ^7MG}T7^tao)MmayAt);;rKKCCIR9o@Ohk^^LzFPBF<zFVO7^xQ4EbI%U@r_rRs
zNU-O3+ij-qQMXsG)R=x;zkIXTiq~&8DusHrIP==~Uypn7>#ts1nqQD6mgdj5M1HR?
zRcy#XTtVJ;yu;h~gv@~poZ)|<io|I=?r)N)+w==`6OLGLiQ|CJ*f~#kNyEiWbA${f
z1q_4*-F9!OzR+!4dj7eKU-;ZPa<z1<zWMgeUZ>p&f^WTY?THVZzi`?+d))i4|IsfJ
zIg(?3-+I;iqp!X+xA<_i+@G64!PUbi)<W5zFBewg_-o&KW%=aC{_11ir=D8;_)|ZH
z=#+jIN#56A82siReXU$uq$<x9{jJTLpZU}>w^%Rr{nsvfvyV)Zau>!jdh5a6BEm@r
zRA6|E=VXApoKK$~l?YtZ1}C%^%%B84Ju>;h778cqFX+{d*Sv>L%&&BN{eHUH%>9es
z`->m_D<4^?`+x7>{!zejz;2}8Z~yMAm)BbCpU3_nGXb{9lz5C6vs@C%@ac1p{_;Qg
zG;F)k@aAV&BI(sC-hc4#|9H9TUGI4t|1|#oTi3d|((#4m3r{|YaQN-32tfzca<v=f
zdhK|oj8K^Zp)ZzJG=EfIQiR7O9jkjomgXv+s@Be#hQmr@mDR($KzI8s-sJ{4ZjCD~
zS>qgI!7O!CF$qTZD8YxR872~M3sJ;aO1Fm*i$bB~1-Y$WYpaifwW|G77asf4*T2ay
zk1>$fZ{GOzXL?`!Y_XE_{+%y=>X&}{Q^i1)5b^vo-+1-AZ@rCC!d%L(jokM-=nyB}
zpb$la##0{(r2{TqBkyN_>hqkEfX$wlFY4KU^cP=Ud;67oDR|<-)94XlhvdySDbl!v
z1(+b|MpE`pAjjImWN74Vnp))YoE<X+-W>w847EueXf>>g9!HbR&(EUsgZ|Ij+8TS?
zzx?v6&;R-JdFptMr6Yqcefdwj-S(3ocw%vJ9>w`;l@C=sJL((8(d_@pIL(|WI}e1D
zsB-A$=jTzZLQo={d*J-abI<<YrT`YQQsS9szTEA#o_zAL#icn6OIE8D#6hD+_vx>7
zoUV`G@dit?gZqI2dPfHCbU{7O@V^r<?~~vC;KzH-nrjf#JVnSP9$-14*W1X|k$T17
ze&M@6^0{-!;hy@?(~bChy+P1yw!iw^U!+m=b3gI9r=Jvo<EhJA-dmR&zxf+~{@kCx
zbn2ngtJiLP<kKI<pSw3!w;G*JE~tF_yWh(p#{2QloH^#r&FQJtruWiC@5^6(?xl-w
zVU8Emyr|M1KekYt_W-j%Outsz-s`V*7M5pMup}@|(p{K6_ATS_WBNA`a{Bbp<<Q}<
zf{_!X2s-jZGMf73V&#Qzz4-K#KhWxU7aqHC>Dd?NYs*o;{mhrYlXTKgedc2iolv6C
zFM0Vje%Jr_nP*cUd(?$))ce?{K222oHn$plweNR&*Ke#Ko5$GO?2LG>&&F?T)Kb-X
zr!U*msE@bXjncyW>0?C{%D?;l*CR|@U`=n}cl$l|2ic~e!(otIV$*_Qb(mfLXm^VV
zI;#sCe>DMc7w_Rn4C8Luo%2R~oF{KqI&P!j<GTzw-8Ah*DPuhQcQ>18^mMK_yoYPv
zhb}BH&D1Y9I@rI9`_XUy_U{bZKlJ%eA3wK<r9p2a@m_vC{pvTr`={T0@%mQ()QQ<%
zbMxF2kMhQH-n(?|3RVdHJf8q$#*pG8Z8UO@1Yq-OU?*Yo)YNJvjXFR4xzAJBfdA6V
zuXg)!iHS!(#FUGfFF_fGud92gKkos~?9mK?Z3M<F6E8dW+ie+jDF#I<<(X5b*}wCf
z&wZ0knxdXxs@66)H`|@Yg$qy2G23PZaNdsxZvMO-=eTFE4r9Xq9-pe+58VXs9C;n|
zzis5lENlm7-#dN!WV_Y<#`E7|{W^2*N~O;BrNa5ipO~AQ0Z(u?Q}=Z3`{V%gu91(s
z;C*`keUCQ7|Gq<hw-+8be#}75NKT(f&ENG%h;FZ4sh0e9u(rNZE0k9nC{lWjuJ`cy
zCx82y=Qq3Qi4!NneDKN}m;Sq7`CpQL`}nEle%fDatmF&zZf9oh*vb0Bv0^>{sn7gX
zOp;w`df)!uD>v2~OD7hvhVhr*d+o->i*wa-v0BC4?^>g^$r3#6D-yCpa=lu^s=!Zv
z@n;c^CKd0wzj&Udf+TyIl)EslWH^U~xQshG!knB-eWf#n3U|(tfO~a&N)N9KO{Fi-
z@;QAxL&FbS6IfHrrIG)wZ+_<&fBpw%%HB`?%}+h^+{>j>^~Of?;_BAT>(BnqpM0-W
zsRpGoHb6F;&5f-_x7#_kG?(NCaN-~QD;JotY_+_veD&)~OAiI*=<?NT|L`CG*5=JC
zULW0<GOD&L4&w_cHXBNF%Nwm;tvWZiRQv2_Wta?c=^K}Cu<dlS+Y4%Qq^KXWC$pp6
z_K9yyky%=17y%%ahwP9C5YWlEIvIlaIN0}urvuX!2M^MDyPwYOz1(un6&6QrXy@cu
zQjr{X_wJJ+=2bBAh7vMTXFp$RcLvXYbM5iZEu<Ci^vM&A!A*8uDHijccI&r(_m6(#
z*KV9Yb&N%jTg^_CFE#ppGc6u_^n+U)Z{_mo)1SBiP#Xj9PyXyHonF5X6#t`t@IRq+
z#MH6fY{4FoY-4UEELGTIoe~*z+Yc?ywl|x{Pf7ms;w#?On`@*b2+M<kcHqPGGLmET
zkA2<Wy(NN-TC0M-n_4<VE&Fq{5g1k>)gsqYIO1sOP_fXKv<|Rak2&!9xrHa5c=G0r
z^>)<8ay>e2Pd)Y2!w;VhLciN>wL8rM-$d@avoE4b?JlhDz9($@7ebB==a|*!><~1+
zF#E)X$8O$O6>ke<lG1Q~I0!sO-gdiFKxScG-}JB8OGvtRvj#fsU&fudb?bYH=s^ja
z;eVh?>aLTs_bH`A7Gex$*K<=*!^Up0-RT!=b8OY|CtusBl)O)U)O+ZWbDPcH&9$u#
zEx1@uOS4IFw$Tqz5^l$(tE;_SaVA%)z5Lc?DDtDHBt`qfFTZ$gbt_-0vy;Ny@`*vd
zyxt2huO+WvX<u52Hdyi(aTjza002M$Nkl<Z)ay%UPMv)s&J{~@OV}7#K2?H|!~ZT`
zdh6QCP59ZaY2FSX{&XwKx@&w(Y{T4;%!%y18@K(s-O?s@Biq6VG-Y8}3bpE;=%lc)
zP`tUi_5GKX286zl<JMNYQl39{`a`+OnT<~M+Is%d&B67xq?^>MvnLCcrF><fI|x4c
zna>_u@YZ_R7kmAUt5;rq?RB<BURXYUZLQhQm%90~S6OO$m885F%q)AwS?z9teLYOY
zMg1oquTdy(Z+L(C(#5TA!oJdZw#mq|c`}S$*umoY%dmQj8)o@V@CwXPXhrg_95IZa
z<y|gLPH>Ek+;s_<jN9EE`=G*5EL2gA?)DR`-jzx-7+m}FXTL!JG$_uUeYD%|P`8`S
zrnac5%*`&J>4)_j-TbwVzm`-R`Pq%6w!v<NLHL9J+Gn0TC&~SnzOs6g6{2A|!A^2m
zUu%0;)_RS8u`qL{wshVrJ+u{-o4s_S*_mBfZnS%kJ^9qqyuO)y|JAjK;x3e6#vQC$
zPzgfqT>?FJ8JYe#&R}+uA@J@H7&m85Sdwd9q+j4WOIXCXN(mvz$_iFrAZl!F1D|ND
zuayeH$De+BYjeHL7b<q9^!0&hsQ!zBm|4q_kA7O!w@*LOV#$o{=uSlLCw~SB?0@`w
zcNp#4Q+(Gdh6VP~EES5YH*fI$>O&uVs=2kv9AtBSoefYw_Vh;@8=I|WljX&LPeS$y
zOpDe<u6~}L=(gh>%5mR-xla*i_}?d>r+dwT<HwG6I5IKKF$}R<ftj^jv5M`9%GH&|
zGhcn45x&hw<fVrmIsZgucCi)tH#XZFofO7)ZLQOc!mxHC4QAq8i9PWaPn`O(pZ>wM
zf%m1qX#L@zd>!IjUtC&kG+I#%;f`{(PFija%3U~IFvH%|o9+0@&CS`R6YI_H@zal<
zI3Ygtdw=})#Y@*}^UKquc^qc)gp5P2#I%fNaoq8nAY{{(>@d`x@Dy=mXFDlYh!)!z
zBd!Ne4ofp}lK;)${SuP_<^Z4jkuM-?_VdN-8@=^T(DkZuP>*tzt$wc2Nwzv(u{O`n
zf~|h?SO40_QAd99<@67p`C5H;xmurLgAYF9L*o`soJ~t}Ydyb_6jnRz?pEq~VKYLI
z93%2Sb@KQJ&L|h({@&_aSJ(Vfy<DF|!?xAwh!qPa8+bGhi6S!`n%GHqCSep|CAa~|
zxpN9LI^XMG_n_i*sj&EoL~-sU+(RGD;U-_AWFrGZTFXf?kO<iiF!LO~_Qth8{^|<w
z|JYAHF6zuT3qM3&+>X+>Z!|7m-B@X2+GD1d1MQhyWiIkdxuEp)CqB-C$nU)3{l;(o
z-ptYoW_@u`PO)(8m%3@OpRXl_xwtUbqb4WCT%o#j;v@yf$B@7N<DVrjfBCZa$6x*m
z@gUFcCyCU<k-}l$8bx#y9=ohfd7R8=cAg=yM+AmV_|~V=e5={%V!_7C*Xy%+KbV_a
z_~=JJe*XM<s_Gz#&YwT`;g38e1BHRlhV8{7E$i-1orZNF_xwml;3Bs-JZJ|#H+8#R
z8h*Xbo<#Y%xw(&i^l9LQw;@gj@DG3Z!@`+$3PAve-X9FdrK7HLXg9-I&lpcX9yDO5
zORt~dKOIo_Db)M$5$Yc1OsvQRh?vElsn%O)w&sG@F5US0cfF@S=$(G#(Ti6$E?>O?
zldH^B2i?vF)_99^?4H{j<im1lz0+?s8_V<a-*}<%;`drFz5JIq*88Pud5|J-$ggd-
z>eZP-t%!jFHr`{A8fvf1VRM6+9nKmZb|*_d`taj0AB<K!_l@s1ySc@gdeYk)q&u0@
zJ;dhSMa7yt(Lb@xkz?KtV_NGp!eZ^Mn;S2^;Vqr^mQPk0h*me6mGUebvEb4GJ8<kR
zt;YxXbgSFn>bBWS<=fwFJp0`G<+omY<@GCzODAXM=98@mLwHDl(#=j1);If6xw?$P
zG1eTqTMd?5maEm{%gZyB@_Jjs<F7pTq8HXE!p^{tk~B^zA2E5HBk^LHL%FTQ-R{;q
z9Vc1rE<x`U1&{O8xYTsHgZmmwR?4;!yyIIKfl#i19Or`)`C&UeKVRu5oj-g2#UK2k
z&o=ws{6i04UhOO$Kfb=%*lIVIPMse_9hT0pij<GaJ?bOhOsIpM{<psM+6&(aS+R8K
z+WNx65?iWAaVi>744D9;vAs@RjeC%Hc2M8Au{KjLgyrJmu@m#l-g+Np@dkPqNtdrI
zXdbKln4BaDtRpHG3h`dl>F)$s=9eL`M+CIFxqC#t5VLd+s0<CF6yn>##(a!n`=q(q
zICt*c<*QdqrP32mJV78fG^K&`dfhng<2k`o$9&60YYmubC{6dCA4hMP_^U4QL2+SN
zjN_<SEO$B`2r)=Dwl>b4KYQiMl~TFz#1oH^kqMkJz0NA@`;6!I*mNgUA0)JIPe0gV
z7eB-Qc4XccpS_W}_a%vUe<^mdYL!x_(MXD6qtlpKIML~L|JkqpPAB%(o4sOnHYl*D
zqs*of?Loi9?g&A?)$4Ax+NU2mSA_MZYx!c~|NHIVUYITQI*p~pGwWN;IO)tSFWuZ|
z#%T`WbP-t^hUC~`YyhpMabnN`+psB-Kl_dEeDC`&_uItXURZi`qtQVi{opEKGG4I3
zFn~+`Y64bNV=<zKMgyC!u`l;ZEBYt@?f?GpxeMQU<syvk_{p<yH*Usk)?we{wGkX9
z%knvPTWPg67muI(m%sC`s>Pt)*jPG#w%1QKS2k)h3-B;F8G95iA3t&9`pwWUZ#LGK
zi&aQ)8uy}ZdwqTV^;ceAxpvvlB`<vEyO2e`5!_r~ug@PVW8f*DlQxW!igth;XP5~#
zQmXVHFt?o~4g8{@KKhht%m9=Y&=?uSonEg}VXRqr>5Y|N_&fh=t}<JkS-f<8b<wY&
z3S6!)v^r5^t5q%**k_v<%B5mhtzvqqTweU6FFngbrOQ{Yec}^8^y=$xl*?5rCuEnU
zGcjrt2zprX$U=Wn@}xFX-CDod3F2<E{XhQ7@3&h`$n;jHpD)iM#UAi68r@^|E+J)T
zgBz>|49qM!M+yPgb`S{{8=G6DVz9N<EEP&ratthd`jel0^Ub$fo%Y$YXK&oN!7QDc
zPAy0I7#ZgXreO>_C$f-ZHN$&<q=D1O#k_|cf7m&VQge|1G8HL+bEAhH8+2Fbnpm{3
zwzly};rwR1JwL=b$fI^lFHs0J!#4rJJ|L;mnMBJKs5Ck>tC1TQv0Xj*2gASIXf!hX
z4@hCYZ~R!JHG(%Z%tqgEK?I5=x3gK?1u4i|nDa5>#;mv7i}Hm^vxlw3LZjUssHK+}
z<&iA1k~xkOSTcgTE}InR%e9%A)r}S`&@YC~UT1c00TX7W@?yJ_1R(@BXti2!3bq@;
zq)=pBEM*s9;&Ck&4#?KNebh4xF!Ov$ryTMuyOBYur%%7;-Q-s|Sgs`&)rqdm>1-4o
zKp`DrW3@dwi(yAMbN9zBdU%q11TZaWxg5};l&rj!s;^zWTBy~7T)5e8=&P4j(9u_6
zJ3?|(D0PhS1TTS?wc5QftWpZ4>H_wm0lZLP0Zk9X2q;+uVQFQ(;TOuCL5eENRzJd8
zHJe9=g<=%NGcz;Suiqr*`K43wfE`|<>I}A!#fym_^;lA-G04FOjw9#{FyUrNS5Djo
zX=4OMYlNMXRr|Oc_OnDet>Y0gC>-oH<W^}glms<zQ+ZkloefiE>9n*eSvc+H%;t1f
zpg;~3>}E&Vru{VZN{ew?*xc$b96!VEjLJYCiM&^?)<G4Gz*4}L>QTJ47D9uQVj*1W
z_q@fWvv0h)n$OqKALa=bJjxVwh+Sr{&+h62DyEVhN$!lGIx3W^t7{E3QZN=96f4@h
z8$A!!`Y{#kvuTTzyrW)USyRQI(xH^2p-WH<bE*ao+}T!qQzCf8gRw!qH3~Q`J3i%R
zXJ#*5zEZ2!`@KPluN+ZQVWN)uT&q(qmYZ9xkAC!{h}`%pjdU{%^HDcqeJNi*Ts<@}
z$2EAe9$QyM<-XTHCcf+&k9yKz5N$Lz&prC+=GJBpgYpur&@k+1-{>rD@7n>4T;~JM
z^dg4yrI}i---|1Sn&Dg~wXAhy+TPx3e&VAaLnMz~<Z=OFQ_5#6pfhzc`Xse=>rvIO
zTCYbBbWgCI#K)NNkQ1|?QeMlNH6RXHe|+xIb4XY)Z8XGr^0@<fPO>YN3eq=H$EQ?E
zp<1n8y>xk|HhV|>KmCbMl4S~lkZf&AJ!jr!Ju_ngRbH|T1~>4hL#OX|-__r9DNJH_
zsPLc6*j>)wH-1e0Kq$-LO+&T8)bR%Cw5q8=Gj^_3y|WKAOinr$m|Eir@w6Ov%c(l>
zS;UMyN4Y9kF4dw9(>vvFh?FzuM+CO*j&u2>WIIQ|S<noX7!7p_Itaek(ZA|Jr9g05
ztyG+NScJsP8|9erO0aC-2d7EDsxL{#K|alv8n42ObhAi~7s1AO%dn>bC-n}ln%YxS
z9Kf^BlH@7z)SD0m+dD>GImyVGX<>wm?uF!=nFXPI6`(kp;Y03uURulM^in@1D-Iv5
zw23q;tI}Z3`(7aX%Jj9vLgbIl0Q5S6%xVmtC3udaq#PJ|jA+>_66Qq#vQb`XJj}`-
z<mezsKY!dOT3#jb2HG-Mr&c1-oH61kO%T2ZFu!t%o%+b*uFPDySdvssENG<Mloy-t
z8AT@2JkocSG8IXAp)^&o^fo(?7qQrFm&Y^;_w6ZxLru2Z)KcEdx=8?ZS<{IE<O&=W
zpTapsXB`1?_@1G{3?X<>i!XVG<&Ek<^=^up6a~bS#}FKksmf6>FmlC6N)+3)^=FtD
z*P^YbW@8!U)h18d`OMBT1n!6cTCG%2vz;)2gke_ftvBCB{DFm=Q6n1GG5*lgpe#$1
zf;vDBnBwsubuVF$`p{aoYXj<G=$+I|<Y*D9_4{(vVwfa$8VJxew0L1Zq^A@hcA?E4
z6gR?IMk-M3CXsKw^;R(qF*EOas!>9RpmjPuN``#VL(m7}$PS;j=uy<z)hXJz!CDLT
zPlG%=JiT{>hly|{ZsW=j!(<z0Y+3e(I1kde66~uzjS(vE`L9hRgq%AeI~2sq%G%Y-
zH`L*d>#C}*9C)keS1&)RMFus(BvC2)SdEu-;810T0pxm&3-7sx+p9-o)uF<Fn6usf
zyl?ziRfEJ?&8O7Ij)o@+Yvb`0N9u}IpcLZ<ur#96Kx{RL=_Q1S!JZgJ-Nao)l3ew2
zEDos|6S$t%HxB{~hw*F<VOz>aBj|&eF2Yd0<+F*CQfkjC9z`X#0h<<2VojR^S~XHn
zO_6fz-PPLNqeR0~1_v=<&qzt9&Wq#QG{We*VYXYD?zj)065UOQ=Wy#MON9>(=17d4
z;X6sqY1uId2|y*7#%Qnru-dj-xZwjhK&Kqw1UgS4Uc{ThOau)&Yb7WkfK)%YERIv%
z5Vl(2rdhGi_@`);9r7ix6+-<9r8S&6tB)`+6@|*3gj2MpH()GHsPeOdvo{gPbdE>#
zHnB=gnHZ5R&!YK~*D6y)r1V;`TBg;b@zdmn$4MA~OuUVpDGe?iL=ETXTw-){R+kiy
zz;XDnC5qnenn^H7Lz`Kt*$7}F5E?upSoEBHR6x95X-^8g%rZg<($8%b0qCK6PHK2W
z*+a+EbXSN<p{A=p1lAr7)|f|mQe5_Sr6zM30(VD%4<oEwrE#&R5K}Z-aKAf<d;Jg_
z=~@PYQ<v);wG~ARw2`5tIO&4fJ$jV^dirwKKvk>Ns@zkfbLVr9kMI%_p_XVJ1%wiZ
zK3Jb`UNmYo%&{A%aO`vcPJj!F%=>I=A&v{n9Q5Nt&c5LdJ0$K8v1Z*P&@xNWcP}Gr
zJc#T91IgCh8LSYEXqK+-QR|P%-FwHD1W<y*G>p(u|Ha42foyrw5mLG$?l?MP6sAYP
z4)kC3iYBa~6hIP4!C#sce#JWBah8WCtcRpRu`0wp$dT+Zb_}APolD+9G#qh~J!Sac
zi}HWp_%U@tEs6@RA|0-jSJM(#RoS~vaEhb)O!YJ@oO6$fTR<S-0#c_WYN`$Wt%!(P
z7LjYvhZ3Y|-`9o-kwnV_J|{tV)ZF}BWP4;t!tHy^r&SNFp=yyHkLzb|vDk>5597R4
ziLm!hn|?7AmqXQY{#n}eI{6bXFXKK06A2~4SM~dfdoK>EWv_7W+2x4#F(IrI*Xwu}
z|4ELf94ag2UVg$`!eBwJq*}Bt{pQ0>mls1xNwy~ms+h#UM4Nb*n4bDmIq^DCZ9IXS
zNV!5n4Gc#6cv}wh%3O%95se^I)Vy5$Z+Nv0BdMbjxG;JN^$hOD8DJv%#(;9j&UM3C
zfTduH1JW&NhG@;AUd&Gg&wEtglyW6RfZ08}B_EyOEfkoF>C&NKsps4tAg9n?5F(`t
zW3^5p%^7WV6-=*#JZOR;NgGMf8*$*ExsWogc?osd3UQnmUL<D;wIo|o$b<pTO%<!&
zOCqWH79sL%rQZ&_9EnH4)J0QYacbQN)&rnXf1SAufn5+l7^YFc>JN+uvyvtDj$}$p
zWtXFR+Dy<cXwzy^>SSm#RCxaBlOGQLry#CdrrmF!tN+oZI@f!D<nwo|Jw1t@x2OZM
z7I)w({}5+@^F=OUX{b-|P1TF^(N*cc#i?EEKf(1OZ{##wgT3|PmQ%Vx9Oa_G7RP%z
z($yQ7R_=;TF9J+dp9^TQSp0^wh6C48;d$WDwshWlfD_~J6|>K2rIU<&OCUBWhZ;Vm
zO&5$N>1(P@LU=^?G*q~z;rd<GH&GtN{9cfE&mLy{hsfUb&-=!Y)rcqp8{w^<U~&<a
zt++_*yK9PXxd}_7BG4~#G>)L=L6^ywXtf^EA5mm@1A3NJm@q{>GhpynAV>~Wm@a`m
z#^6j?)|F;xWc81*3c*y&X2qk|TAI}YRe#`Za0pjBEg3EeYRpqR$2&Cl<~^#8_K{u;
zUV?9FS(pfUsAP~gy?|*WCmrxPdb24X00+&<l^Fh7&m@5-Nz7rw0xIZ+Ccu+PQDvN%
zNAk$YTsyXARCWlD=0F<F)CJ2)P;7-_ySwS7a`snW{w-!Tel)!G0C4yS@^b?vVZek3
zrfaGKdP+m2est7aD7bqIeK@`<YmJwv&5&v!6;UNi;@z%`Q5@4c&FhSi@&GZYm^6)&
zPBSGay$B_q<cPCcq%<)VBCk@VvlE3dX*}Un4=vm7h$Kf~3t?iCD1n)Y5^IG>7V#DM
zHZEB_R$^A^iPg0oI!Jqipil=a%=Ope0XbsOH*8XE{;GK57NnC#Ng2^YGY7C)5FR!)
zQY^bVqF`Yk%UC7cm!v~asG+M0WiCTtcLd-HG+UM*Fc=K_)IzMW&|_1dHB_n37<MRX
zUa}?Bk5#=i%a)^?CoCotZK7@MipN~jQp=>@-i~4S)a*G#((E_+!Y!eotyiH9STj&#
z-s!Ae?)18!S0kKR^nj5H%_UH=7@*cG5xnabb|eRR0_cgDkrO&@TDi6miSx$oK;Ff}
zk;f=OmBl@9a6cE%sMLnVm$ZemMBpOppi9MYNZ*s!-h-6y5(VQ3A%F`<E<lVUg)-=n
z7xHjx^?+Weely6}U#d41og5cfj6wxBQ8bo|pi-O@v&O!A;0`VRQ(~h|=Y4S$Bs7Cs
z;7ZgTRm0&iHZV_<Ds_j}UBcl^QIeLT<|{J{QV)pTu#E7Ej#kuH^)3=aO4+WMusv2F
zX#EnxE#Au#CQZf&hJgyGk3?Q3pqQ+zs(jA5mP?<(3*9+SLfS_6r}%3!1{Y3@TXDmT
zMFHtyNCM1-Ebs>F0<3SJBG$<uQnY2c-PBI07nMcBL=;VUYVe^k%hx%1O9h1=XENeD
z(ViVvB%1aMnDAM>UHE7^LUBc+d|c@%ds577=-e<O{%JA7(l8FpThI}qW}si8@sC@Q
zjRv^^D$W*CY1Kk%D+@XXkygb(6$4I9Iz;zH*{Cm)z>`0zXVA$9O8*|*xH#yAIA<JC
zP2to&XbCcn!A|(<GS2H&p}E5=T~S)*M3%`P$Z%IjLqXfJkX~y7M(0IdQPgI3M_>}F
z2eejFN2+v^Yh$-f1;M~&N1nF)xT8Z@&zUEMNLx^HDM3q@UQSBXe5|yU7RitKlB%VU
z%+3furPJhtw9uG^w7}*Jl<(DOIb|}kU=Jz+Zi=h+t6|>wfT#oWzPyjA-C-rR+^Jh&
z(#R)2Nk=%UFQvX=kdStd1~3aAbkCZDZrf8dTE+#1+%G5VlhS_HVyT#QSF907J<l0V
zq`?l;xnFOeI$p=0G~f<##)COC%iD^^kBJDBz-@??ZmI`$^vmZ2Rt>jH`>ewjw{vfX
z_TjiBz|?8EOF)Y67K%deO3aB*7sa+<zzooS4|^IIq~v?CGcbCW7!SxTgIoO~KuJTL
zI?D)xwp@5;{0Zq&Zl$0hMJ1PdtrkW<r6l9-8X8uT#**K2W)Cg?QzCOq<(u-Udk{?R
zht@|Ew1ro6D6CkP*{eYe7ns&*q^Et;Q9>J_Vq#kM(}FUib;PCDNFrwyj8;WUm&gy^
zMoYsqndFhk|L9~*gyy7*2-A23H7A~z=?a%7HBQMM>4&YA@@fT>rzJUMofFi}GB!;f
zlve9BC8*{p{UF8EO1^l7^gMYjWV08NPiE{Gh8VVlX`&~LbO}uKIAwTp!wZpNle9e9
zv^fkETdT^6XDFS8|B()kAXk=vTWnF?7C+%5jxWTZ$HGUPiz`Y1=7)zIG|K*Ru#n<s
zl%O641l}uX6*c1wd{~Png^N8*4Ixu<jtlAuQe#2%LV?JU(N<%u3W)vS3PBE?wUC~b
zXDy93`4}Z)n!h~7W_MQMln{z0@bmiMkq_7d&)BjRy*R8ZHe$txDZXkSDIt&qvNl?i
z2oY-~SIiVYK6(^#u`R{Jo12<N47@z0jwTSffF<(~p3)EI54y-X<Y$@(>g~oD#A%Tu
zE5;#Yv;|jis+ejlV5H%W#F$fo2Tu5H_pR-Sk8`DH70+~z3Q<5GBNt*WCO~RGe9GD=
zv1KkpU>5`&ucAgG801Sn;{q+0Zb+t7hK*g!*c@FAosulsnA8&PMxz*N$I?KoU0D-Q
zrMJ4Mrf1#$E~$9ew-eH>o?4$!2EBgItOzsD#CnnzC?6c>aHeyFQPJ@TXI3;0a{Ueq
zc`TXy0kq+Td>n_P(+z{7Y(|D9JUjg+Y?w&wu6Uk0`&`Nw0E^|l9aL(=!XBLr`7x;;
z;VjKv;f&FZTpF_b=!4(@YLd%xi6cjSzK7fie_(fE$Q*SZ7Zx-*X!o>*B<1MP;7AE@
z*|%*_-zyS_7yl`dxj&W36}Sdz)ps#S*r>X%0t+mHUOn{6VP_EfsQnf->W{+jmI?6(
z3yk!OC?Qph=@5&&X=g&<=t&WVNpi>cANxX=_vmMt&9_A3@_l<E&12X&U^`!4jgXSf
zLn6lZLE1sG1_88zog}%IsW)n2-jE`TvAGVL8*~e@{zVd|b$Xhx(qQJN>IfaNujhk~
z?{$h&t56?B+lZ8%3ErhwjDS)a9kya%qyr%E3)1#drO@-){dy`VHbjQVwIrReDeLWI
zXae~nih&|pdXoZZUI9L=>(CTmZQ6b|cEP}g_=q?gH&hXb5W2#3xi)@ANOG+(yxdCA
zk{~W9Rg+?<r(*O<*e475v0_&KR4y#K6Jk7zc^iq40g%9!a>;HLmWdK%U4}|}EZzw6
z%{YqCXDsI1uv?#<XbCs915&<0!fFwU(iKB{i@9D!sys=xT!z02KW^&9nvoG+VW|~L
zGp~_+d0c6AmkWi_5yxqP`Jo6ne*9FQ;m5p;UeoZGOqc3S06Q|+%La?p9E(5&$lL5E
zZ3|r*&Uyh88fZLlQGC3aguA!sdo*%&g77Tt*_uxv`LWnH?SyBiabzbK4B}j`kV~sU
zTC(|rg!gJ}BvS9xM-h7sS9@UI)4V<l^WY=E(2i5@H4?40+ik3KR?9QWoYha)lw9k8
zL{qWMi8n}5ZuqQWF>DLgy6jmsGV4S32`VEoHWOy9nDbFbc1c1_gm$MErWe!h?vv9i
z)f?b;ywqTp@Dg%t63n>U>Y!&)FV6F6PQhKTGQx`RP7?q_)U_(MH)k+U21}!NrQ1US
zibd3ojSY^-m5G*HisU`I$>MX!DMEFy`cOcRC8-u2oX_;5z;K7I<lsQ9&vVF6qM%-#
zETGH+L{M9f8fE2V6d&q)NBk|ADi@ESk&j&zia;vkTFW{>cNyd{S<TT&qci*+rtk51
zFDmiflHjPM3qd<#pw(EbdB5agF_O1m{!jlsss{r;gEvip=4`|H-6FmHp7jDllEGa%
zP(?ydXuXl%c>6O7XTGk(bdQ)Pu^CqwJ~NJFbUljAJ-)N!c^8QvM!WNm!vRQaMV|I0
z(Go+#7zML7W?k4YYie?k50YxqVl?*97t8%u|Ha=H584^?u2(Kw<BSMf2`0DuS6+rw
zuy=@B=uJ3RI?iJp?eM-uJ`Ot>lDL%}dxhS<GtOJ$ndYv0#KlQu`2?9P_21=YoP@ib
zP;V1<#-`Sa%E$RZ{+Iu=U!?Z(_IE&HnC!{asybO-#*a)U-sM-}4H<WftI1cm7>04~
zgykL6G0GOH1x)wQI91yT7^RI{F3dQQqX_QycN`D7P}P-|BFg=mtQjeRtQSQ2F(_`|
zG;Ff42mJ5$?(E(nKmfLd?hw;n$QW}_j1nQ=EMT8o8#<CdwKIJ`897Nl;07_Ifluw_
za_z*46Ksj>tYtwF5G(Ql8pmcL-Che5v&2gTCf^k+fnYiD8_a`qe4yG4PI%%&JVb&c
zJhTIU*dhE67D7~`L~Q?6%aXzsy2n=Mi48j%nE%lyQPaIrH?8-YyAL&A(_=L_oICQ@
zeYoR;oDC|-K+s%+mKB0>5#hJc71ZF&e>>_*%UEpU0TQ(3<ztK0Y8fVstXzU;+%mQ!
z;!Bq<z468y7~KOXkYsw`U}T>WV8lMd7+V7KAPkQHOo2EzvMVhP(8dwcX7;1f7FU*o
z6!RaIKQkYt+`-UMt6(=I`sHyK<N_azBPn-<4zo8%Wq(7SfS~$<SO;*a@F;*wGti1^
zC;sqhAVN~ka=xwR=7lF8uh**xpJ*4<Rs5G1mw*v4UwP#fCMsYDhD7Ut<(ykwI$#B(
zIeeVMp9zo6MPuqV5mJb=v$MQ+h~Wc_|GTdR`^}F@88ySz>|y;LCRSPq+=3#T8@0;e
z-M#)9HKHBQ-O+P0#4Q0vJ%~ugEw}FY)Lp_%hQBB08X0y1cx$x7yY>fQX^K1$Vg2j9
zIv2y9;<+X8EtiuCy5-g_m*bH5ztb%jW>0s=1-%QT<JWJWK#}d@w%WTLWarr(6LyQ|
zEtf6==Q+MLKAF7B`7QoCUw-GS+aH<qnLNMq^Y83?$7e?(D0|GByL|ZJ?p~!WNOm}a
z2#f~fQP!y~ZlG}0#(C?UZ!L&g9Cd?)EiEm<%wS!N`cMS8GGXAyZ@(R0zI++B44J^+
zkp{wy;`gA!cyh@JOad~;T_^;e9L155m6`%BkHZ^aHsfc<Cn7(p(ohPh-H2Gg<)HBd
zvWCFz$vljo-M16+Xzv8l9W9X;k`OgOX?>GE1u&Ynos{jAl6MD}(JV*rysfQGrV0QK
zIEXVj0yuklGM8`@2?BEt{?3Pp9sV2%%w2Bb$+_cg_~1a866DBBdC-z$iQiExlya9f
zn*iuz-GmIsZXT?|2uxuu(DmDPu9)2QDC9&<0gCOgR41!lEUjeQNYLXJ$U|`534&y*
zF-1U;L36<nz+Fw|1cV932}gJCP6!6(_)``fX-#(5CFT&|PcHZD?gKyGbDFXTG6XUN
zG6XUNG6Z%-fPowmz(9_~i=*i?c<fFOZ-Zk&5#SAs^e0cAL^j4oYmRjh(h&|$oa06C
z3}PbyKG0w14}s%^S2#Y0KVgV=*MuFi0FEDX1Pvf^I3txuTFE5{Kk~>U4?p}cESbrJ
zD+~A*;R(6CygcUCc;XzMfC+n}m?$Io9G={A@q;#Z9r-?FNWQ!5fFurxoO6{Eh`#;p
zZ{rU$2KJ|(dI}N@+AAw7+;TOU*MTr1Vd^dA!X=n<m!}Bef>A~;S}Hc=n6ki!B7kS}
z?7K_w;dm<J$A_aFvs`2dWC&yk961CWdF9BU&d^T&8ThYWy$W^XoMHaVnKP$PpGK&O
zgKyzUNJk4`0|X$v;{=d1{GkQ-a1MLm5qJY9`0xsz<($CoE|*8gk$s1Z0~}bu1pWjq
z15Z4G4m7}r^D)hZH>2@_<5siT1iI_juQPjrqVYI^;cfHt^AA1r5EL072%N)~bAllp
zCGw*K@?CNYAnq>6bHXJ!gD1EFEIyQl3j?aa$PwtE%Yc9E*fI2#2@LG0Uf>AK<2cHc
zB4Ac?>eMNar$tbX+~tJW-hTUSRATOuszZM(<Hv`-h_ei42xJIk2pmoX90`ClARdDa
zIQHNMepmuBV<$5s^LWA!I6@sDzR<G8MU-fzMhtaWU0uC-^QPksL;`Oh0Neuk85|nw
zKvYh03~xhK1E0eQ=sV{M!h{>&2PGWv0x8IVDz^yW{JDhnf-nRQFu8?4tP8-eT)Dyo
z1^|(I65#mp<B)6@?WDAlm_a_{$78_nctVv1NoqbIQwoHkI5<LcK^ta_T%Qy7EY%*U
z;mXWX_?ALQm@$={Qg9PR%7~D7x)^8_89#Q=WycJG41o-R41o-R2MGbk%*N!*D4Zx7
zdJ&v6>f>B~9Cd>?zy?safI5tCK@=bj2s;s&!5fGZJ!YiNP~<TdkOs#dAj*z{jBg#0
z696QqfZ$Fz2NPxs&|HX@BUR8f7%jztLvqh8WcJstT_YGQ?XkxmBP4_jktecYVpv#M
zfUyzb%P+soEXCcWTqp&mD@ZyWw>}~W-zA%_?h#;I3Ls2H#h)n$lMs+}{Fpf&p4?^P
zasK>yJo%Qwj1C`aKHyWgDI<c>7MPImEPyj<p-@1Zc!-d<z@gF8xJ#0b!mUhxd=yGK
z%VUN>hCqhEAw<A&0!JDUq`}YNVUS(qUhrK;dMFp!X6%O(a0}`HOF*>AgOD&JWY9AB
z0b(}z1TsrJVG;y@9CHg|!I7v?>vWI6+lb1&_Yea5&RKG^0x}NTAOL6}32wmU9>O0g
z3_idJn2=-qp;=BaPk4Nst_uZ(2-yKk7<d?TnWu;lobjO?xg5vr;y=nq>Mah0odX#h
zboR5K{p=(xAo<o?Z!sqUYG!~G7D&GN=9|Dvc!;vA#<UA!gCFDRYBB+6BScP|FlNe^
z7DRx_$dBN8#*dF+p=Vjn5Xcb75I777FpwK_(1DR22F#F;<PvfK32-@thq*7s5NHD6
zPe|w&mdLP225&=V1~Y?6!weklaFS!h$rmqPL=+9HK+FkCMh*&Z;B)p-6CndHFaiz^
z76o$xH)O^@iVrx$uApDAUP=UV=ETXIJFbSD5~7C864>1Y1`H_$Amv%Y5IKQ~jThlV
zFq|vC><9u5mN69W5&)@(;15=y>g4o*45|bGVZu{m3Bv~yf>CuT1?oQqM0lQZr9d#g
zicrrff-&q|QYgG){L&+MlJVmsSm;@nGXydOG6W6-0u1m&k(#Yoa=eW(9tJ@6;9CTE
z_0?DTQjcI5j*BPMnIi%-Cl!WIaOwE26E+ixqXonatH2QeuD}se$Sr7u^Tg+fII{0(
zF-IVE@V@riYp^J&FqSJ|T~IFMmmmpOW~RX-z=ctT+p(g6C<{0l<wD4_XU{_2c${g8
zQ$eN(coC0tazqfmdo}?Rd_a;DS62y7J)~}f64H9^qB{Yw)M}1n)gWp+A43StgJZG_
z$UF<e;D7GiIqtGP4;2eeC=QNJ+x`|(j{i|}89zR1C7$IsLm)#SL*S4gz!;AsGmPb9
z9tPP$F9J3I@q!khh55~IeiQBsnS<LwF9-=~xLvoENMIYt*c@+r^wCF=i9$FKFmvQB
zj<+2_-T-25D#DZl`USfJ1n`6q!-jzrsPRDz4|=d?0y{Rw^n)Vc7R3bM0LVGwe4gdR
zaXx_k$VWaxB=~U7BS@5a5f4%v_+)Ne$$<!<aDYQWpu0lRP#k<SVWL7sr8t=T5zKu*
z0cS8L4B;se&fVkqgDUuQ#0NbYV&KFz3hF<N$g%CC41UIsk8<f}dCw5Y5XcaCfDv%K
zfeb(h82&lJ01y?U8UaN^&lSetR4QS*Z~_RNV*rpa*nr~}Jcxrpz%z&n(%{G$o}c){
zCtNW6VSdEyhy}MEVAS@WbWjBr;B|6GFa$aW8iWir75Fip2r^-8M9*h^W)#54Es&>}
z2n^Zfgd_ZzsNi!>4oz7g(Zq-FAnZzt$MJ!p?LD!F@Chm>4)g#SSO62pIfzj|!JN{e
zw&Rc-kB#VoDz{L2arlo}GEKuxV;uf&wnIF`3E-3tG4ML3lpGHsnp0;f<Hu7&l?BWY
z$Pma7$PhS42*4E>`x)9f!7;!yqC1+4Kib3aV>kr*mx#om1q}b(g`#nZn2iU?4Lk$7
z00qRK6G$FknRyB+CLE3k;lwyd1CC%w=8hU~AipllWFN#}S{(5K3J~K2QpMZ@LI$Zq
zcuz3U#=+aT3wn-z@t{KwX686Cclnx+{R#pTC$T}70iQzQS$r}F0#s~{RB7h`4aYfP
zf)$SXOmzi(<`sZUO(p=3P=TrEOjq0`w+O&pBIKMRaM8Z_;)`732*!km62afKDNc^8
z%lPq;E&43u83Gvs83GSB0&WP0dpWw`#(0kK26!;vh~aHe1b71quaILn1Y8Y%fDdFB
zDnJY{GiVuXj0Yih&<=d?N0y0-t>e(d4`~?FKF2m5Z0z=udJqE^at#_F!=qe+@EESw
zUw<9pCBaZ!VWF7&IuE4izk({TgD{W4p<!l@KvNv}a7kI914C4VBy#vMF;F!4xOeR(
zrH3=D6B`2;j#7XS0}f?@12pms!c<p~#Ic+d^bo_-3ILq&6a>|tXSqe)cGDF%jpSKM
z?E@e90KxEaF>?aphE+RWemF6mB3Btdo&uySNQOX$K!!kuz=1>n3I#=QT)`!fK^-ap
zA%h>FXz|HUeiDCHFQF{UE&Soi5Cup8U!OVhtTWaN=XmL*m*94=1Zc!q8G~2A9$*zv
z5F7-Ky9W}C{YsQW090XGPUwga6AvJTx`8TejQImh3v98li<L3%!j+K|vjU6<fs|l4
zKqjyY&pAj^Or)DI#K{R70z8Xi1_eRs>{pf^sCcP<fCEIJjiU%C2+*cZassN{qGFRL
z@T4+7`|PurXCVL|LztrQJ%j+<B``;4?HEe!FyL{gXyGp1k*47$gp-~}?L6bhN3F!O
z{ALJb2xJHx5(MB45CR5xIPN%0cp{sGM&OSi4Vnetav${JTL>?C1icH^?BjR}>gE_W
zGBa2RQ90g53{d28w46XG5R{II#pJC+e-1351f*cgBmr<7=8!Pp!XKe8z>YnEo$xR-
z_%5gt42UQWXdI64NID6RCtR6A;X&dgFnkVCG<TtBjxis4Y<JBd>=p+a;0D@G7)-r&
zKH$tbfIF)!zzf>=04&v>N5=G*sU0{|&mG$6S`aM-%CwMRoWs0n6wFv$^Wj~)rtJ_1
z*mr)M)1<{qnHHvgqRF9iiSKb`_eNd>Qm`ZDgpXmAdXt&g7n7k&x6d7z_=BZZosORH
z1*E@ySN<9;2w=B3JtA%rt7M4nuwMD2N+@Kgej`M-9~t@F_3uP+C|qtF4;S0#<GQ`G
zSnnG1A-Xw4ev~6XNPt{UN7c!kl@2x8e6vG_zz6}F4yu+mD(@H%Xk6BWt-s_>&RSv|
zOPQFV9ZpOP4Dby6FasP67#aryfYCnv^wTh7SOQc44gtM*^wCEVb2`$-gD?z81AKzV
zVP=kAKqiobau=x^L>b-&Z$KbNRB(||g17%#dd1Tds<(B{*5^>ksrK8Q!{`C$<J!f@
zZ(si2{_62-bSupz{Jb2PhoWG2YLl=fC&C0+0CWTw-4zHf4xE7qtU!-*<_`eK2?Q5^
z$S#OENhkPo3t%Z7@Z{|T!$=uE6e2Gt0FxPtfB?w;QL-JT^_Grzh^NwT31;}?{Ej#e
zBY406jM8RxSCq)A%P3W3^Df2YIjv$>x2dRX`2llfvO&2}bvXw&(B?>Sa0E=GzsQMk
z6c)1%N`dMP!r)KsB|?e@oM|$Yms6#nh>4I})N{g9KySZ&d6>cn9{&;2d-=P+{FsUx
z0DmqSqye_Y`9eS~pkDYMsxwjE8`Me#iX3w;N$M3!HD*V9N&F!Q3xlO&vQA2ETYaT<
zrfIit>l=Kg;~ov4YE6Gj-yhH$<kCT!4E%K9kLGrXAHt7$eBkv5=p{v-pX3Yq5T&sw
zb_2;F0BhGQq=HISI-kaQFX2b037(L%ftQaH;PLntD-EtB4s%JNQ0hmnSKT#k3qZ|H
zh#*fjR=xHaR`Zb!$EAYatJ0LSVzo0`fGbwJIuZxqjk;|=$DX}a`tD}8pHLvwWI6*c
z?`T6FFXKQ1%_LOqGnXMiOUe7eAW8~hx!+|k4=lOrqddx%Y!)_}OG(|57VRocp_q@M
zP<(y%gO%0wXTJPr4#r}s(rR~uuo%V3`bOhVp7~Sf50ePh>Ibj9dhz1xZ&E*Tbgr<s
zA+ZPw$BsXQn3OQze&I#P1Z)GhvAIQ@FgG4=Y_;B@(kA^Rs@BU!T=HQl7<g$IgfSiV
zuyn0|m}H<sR=0nJ4-0dhl7<#@LM|8h=njX4u-oqz%i1b}1{(Eyp;x|R)&w&16Yk|D
z@daCsK*6!y>2X3o%klaR?MR#t3In~<cmA7=CKp~V-{Y0U6Ake1_9A?Uv{J3l%`b8Z
zhel;!VR4x~5AoMi3P42GXKaI}5tA;wUaj(NAPDoNu;3+WAzz472E=W#kzRTr+z5X<
zI#=8&(<fHKNmxs+8R9u<wlI^2)x6NZm5C}3)JE1rQpo|ZR-fH!YW*BI1C)?t3RjZB
z?@qVRj}x5p;iOUxLB6-T4wC&|pUDt~NU5RyLTtsrr<s(>($^m(gBskpHbb-oXtp|5
z86I@}4UTtp(4fAbPIBS|d4L-ghXMe#og(wSIBGZbon@w4@q$7OgBhgk-(nLf>jO1#
z686v@fxrZrG#3hi4uG5@T;rHqn5%cY(I6h=a{(TV5Oe_{!-o(VPVqJAH>o>aU}Lip
z<_nduTq{=~%i!Km`{?mA9$3e6Xfd3uA;>A=Tgq=5c_|rZ5+S&fT#zr7m>EYY7EI}7
z7_@>U>NdC5izW31dvZun7*NAdZD%lu!ysR;mKqIq8E`|9Bgd(e+*=6|@JQfKK9Naa
zcc@wnyqLY1v;z|(YCa5N<}Tn44uV^vzTtRhJp-4S%McI(v>uufKWd0MKhNwI%}eSG
zb%=JPGp46HY9?$SC-A2;oZ=`pOf}R$)y;%nf-Y8D!dX0ehxCkz8D@Ou;Zv91z7iJl
z_4?A{(!BdfjaZH_%#Lq=BPoaw<i+`J1kudX86moh19YR!j<33F*AH0IheMk?-=`zv
zfL?REQ+|gIu7=c@sVgltWyAr>X^?z(VZa~UaPgpTNTN{dKmPdn#~wT1k9x@flC2_b
zY{|+SLgi8k)nu1ccT{%{%ft#ZN-;b_x*6hNg(zphkHgY|60zNmJq3cZ3TH*2`I(Q>
zFvjnKLNScHOd$3Ve<@Cce!u7U`W+@kt@c*2xWKGWI{L^l>824Ig~fOfqgl>}0*vxK
z$oLPd_`CkI-~3qJ9etVV^mf;vN&_tp@|A9<mGlR>AQ$%rt!^*P<?H1lwL|?QE%xvq
z1LMaw<%LiokQ6`0<(kahT5Wpa8nZu?x)>rnEJx;({<vQQAs%tS`U{KmNR88e(nVa?
zhaS?<x~S1D)Mpjr;YU)ng<MjZfG?mXr9S#`HenY(=9^zoEDe}I$GuXJD^?4BP@tpU
zMpLlw;*i8Zi{nH~C9sYIQEs(c3@ve7VKg<Ai9teZD!ePm#54Q$M+X09lL|VhG8yr6
zQ6H(jUn*9j9A5y13>A9_u?D$4VZTe!WVa6o0^+^Y5v@c*M9B1>Nxp0*7@xDgZYW)I
zJi|l<Bbl6rnBC4LhlUkX`4s2CldMJ`l=PX-i}bO{M<B;$yUj|ucJ9%$EaT&|KOdR`
zi6;E@jn(Tnu6Da^442*huEa~+!YO)&VljmHK&hcVeEWka-&R1wEZpXQKRm|#*l^~X
zV{ABs4I*z6usf3PE)2MVHWQL4>fg9|4W*8yCA3z`#R5Mh(>@an<o#=FE1hnuP^1fY
z^B|H$Nl`IGjyW%t3dK?(iVH)Wt0hKBsz#?L!Ro<xj3AnynWLU?>CTnk{lp)5sq%=f
znW1boF%$kKHIom0wb~5dpRJN7?M{1ZYm>X6eOU3IvWDzCKUQa?qUS%P!Sx8D4onT_
zYE{0Y4PbtGwkGlmWfb<k)@GFB!|T!D6OxcvK^ZgFv;dA}-9XA@a%QFg_d>WqpB1Rz
zih7-XA<t)&p&p(+I_zYvzoU28H-B-r03uf=^0?D8dFqjvZx}xoej4y>4Ai;xsWY>)
ze1*!<BWX13GxuOp#*jDCP%$ibKP3HjQvxl%f_WIt2`MG!Fua!mh8J65SbZmm4b@hr
z#+;A9e2;28zg%zfH5JiWKFNo1J^*g7$$J0F3_XBUfT)x<!l_u~4wSrS_ZpCe*iQt|
zn_v=3Hwb%0l*tZiY=00_RYJa@iy6EA(zP=kO}^k#WS*`GzVi|KE77mI2yt*Cgeh}m
z*9AP)-)j5lHnOvYg&1~QuW8#n+#r#O*{fHtZf>G<o5%j!sOq{3tYebPZ?(3P+yDZd
z^R=5sr^7yQe7NEZs;li5*N%fr_71Q6a4{Nr09uO?-6RA!5#OSkX9?JsW57rTOj-)Y
zu5WGl8BinCbgH!YlNjXprgeZ|n}ucJP*6SWD1e=@%qKjZ%mmJDaQ5ky?<_9A4~XvV
z7337m-7JL@CR8}`9u|uGihsY~rRYt`SazwIhqT*G>N#5AOzmLm2$e|*3A2hZs8p=o
zsrI`1a7gf<vb6S{ADe)aDh;LMdpFv%*2Cq{ys3y#B&g8(d{^lYeB@5O0eTlqA}I-1
z(GUMo!g5U^)Jy8aidw(i`euV!b-~XyTN@~%R4b)&n2(3EXA7YNoxT1*y_j!=epn33
zd?Z7FiMkRR8PxKUkDS9os^d81crx%ga!Ti$rZI{=BBfq{&hYAH2TeqDvJq7vWo|dO
zin#%!pDN+hj@Zzu2m`+x_qR40xge}mXHXwQ)?cqP|8}K8bV7W@k6n2la^!7TlEi#<
zlc|DN$a#F~$ayIG2wxgIU&sw+Ez6@T(Bx$}lDTSWH?zZ$M}UtHCLz?9F;=65rLa;j
zG2_Rmnqi}U(ugvDthe0bSBR9VH{r%K5iMb80gS;<{QhRN-q;8)y@f~;>cHwHZaM0P
zN&rveiJjPqXtqyl^-9q;ek)gs&>ebUjz~e6R39iu!ki;jup&Nx>C1aAXsowAhUtBC
zXrAz1z~qh6fo8jwW~1Hf2wuebk@z}&M1bhQT&IKF-3oo!UAS-23AKU+W~o0YS4tqP
z4*`P&J0`3S*|)UsOT^SbR}dh;mMN`9Z(s4($4!=l@mZ74r#ZjZ?RFa7W~as5DFruK
zBA?WJB=g0J6od{p{&z3beQXJOv8EmLVs3e?D1wm%jghR5Y;0^Ihbt5+{WMpudIjIx
zXfT7PHMlB%B#s7%mMP#II-Ec>Dsyv1W(s_4WX8=j0#KM9YX?rm@D`ds(zhe9C<wh9
zDqSqFh*Q*50^}{Iu<#T;$0+>a#}fciLm{6irSWNBLz0_+piNjSgDOf8l``M-DoCkv
zaeL3}L363(D+uXPu?Wx%FHnDrdDh^du(hN3cyG=o+!Lem5y0Q`dWknX?;$%*dig>Q
z8$+!mif86#H(Q{sifXkHRNQw`hl#v1mmvTGjL^1ZKz$ct+ENK4#Pe{eN6(!VK5qIu
zIY4ZGf7kt89bAMkjuvxtC(g&o^B@X%I-)>elc?1Q09)XkFa+Qn0q5=SA^>kdkiT_^
z-@;N68qLQgi1qxu_TfY0bEIEkP2lz?@4Jt1HYmsxWC+<I20Bn9Irm6?#E14JO<)V1
zC2H6&uO26Z6CChz<SxOu%V*F)`mZDBIuH@%1f0>Y*v5HbR&yBJ2XPeoK>sD1u69wK
zIOoU+zOcOX_&Mwe?qjdWi`=x06Nf)AaRQ#Chm?T4bDSgKE=&J=<PH)3qb%+6cc1yO
zy<K!zb(&IF)t8jXwvl8P!^Opi^2OY>%bi@IJbTP*vZNJ54BRFREbu5C1%}}=>imV4
z@3G8abqj`-EX>Za*s0UpYP6bKK@n69KSFDvgFP+4unT~x2Ba^ALb*^au*hPw=WTVo
zlG~y4C?IJ9N%3PXHPJ-NiEPxHQkVGUy{)!)`9@sHM+-BBAWS!#&0?u;jLm@D?ZEiQ
z?pwSkELE3|&-Y?)ZHw{QFIK%S2um`mDD0pQ#oO+P;152L$C~!d6;$CnkDg*KQeIuT
zUa(q7xnTEdgo&g?s`wl-mmx4hfM(0jw1cHq0a=A;R;#khW<iWP3|X>yNEu@fj<J6n
zdh#67AU<ZsZQKd`&4>&1vEmqm5j(UIU~&2YJ&tF5Lb`a0pDnl2R@FR0%K4Nx`L0nn
zafcYjQSJAM%aNl*ybHDj>Bn+G)aj2VA^Va9UPMP}uynwgOF-t*?3aNO<*Oog$)l{I
zNNXiWZq$1qU=AU}hI29K_Tbz|qPw}6PS^GNqbCmPUsr#~J~;&e91{kttwr(v;@|6u
z0rPe?1qU0ZMw9|42z{v0qz8u&>Cwt+AK}K~#DB_qvA_J7{$eZ>MXoR9SYr=jFgSE%
zi@CS1cK(n5>o?xK)|fy3$jU}T&4({*Mm3JWahOx;r!{@y18K;otRYQ1jg@m}j{o=<
ze)vNdPRl%*=anmE^zj^o3Yela=j3qO>J!-u)jcc~F!1ud*RTKS*Ps993on=J%Ub_<
zWboAL>l6T)J%BA(sji{bh*g_v4Wm)5xU>26g){&9=YRP8nMJfvCWl%<x|=Ih%ZP=1
zFW(t>-+B4c@BYD`U3~j`d1k2}p>j*{4NK7es7P%Mr4(Rz(cwR0X<*<j&CRS|fAf(O
zGr#l;KlAWIWtzGATr4_{u&{F34e0Ftkwk#1;_{7U7$ky><tYAQWdb5${%}gy#JJ*w
z6>)omiJIsh7!zD~Qnau6+=9;1EszXHT(EM5Y?YgEI2@iP_!ODA-JvFjQ2^8ipck4h
zOJos~e(aQu(!cYez4kskpMEb_6F5uxRX9s_8~Xw5>Agg}%d^ZE_!<DWMaPwwGLcYM
z-5c~HsaUvFQ6h+Xj4ePCEJD|Nx)Qe|Y&ahXUOo*{4UEqX^i^<Fe|Aa7`+b|dLqj=|
zS@7q`Iahoo-e3HCQMptGX9$DpGrR*cPiFSm#K%CYhDLfw9t6#5GS@bG9U}avEdTxG
z$1V|+{H>|U@qHx11T|I^EA{f)7KWhm>z7ygdM%Z8%EEe=BXE?E(YhmBaz{^L<QD--
z(CP<i;q8^JlV?vi5uFTjg;EugXUo3a`aA-of6Ot)6fzI;6*Xq67XSc207*naR4>A=
zZx5!-=g8y#>{l-*e&uEx3r0r>PgdPX3PJN0GQv~FcUj7jdK)p?ZC;p!FTb_&3&$2w
z^vVaA?UV_mDIM`JR_9CA4r1z_N6}C~twCXP0|Tu^R!7Qy8P0VkI#F^c-y=|yl}EA)
zt!P|cE#>lSTV0wq4V-39V;_F&uo<!-QkRS5WP0q~L)r0=BM>vERd<#aD`lqssH+w$
zrHCbZR=+IsWBSGkGMBvp!Zhv95Af*?F)xm}uOw;^V2t2+r3xS5SjvD!d2Eo1yYm%9
z`kQxj=>U@p5MtB<`cb!3F2FpXgX)BCcSb~R_r5<q@E*~zG(~XB8(~YFkXpL1F~0ke
z68N{kq&N-BXdpF}yw=spnT2{QY6*U*1zJz78|pV4h8$JyEVV(756|vJ*l#XjQz$G-
z6XQ-mD}saO903Jz5{BfSBP^LCi^umBf8mBuSaUW9yXrZGBD8Jow885JtRzlg9)}bE
zDeJ>N^JB+Sl$K!wfG(I|&tz%bHB+bvIGMoqbHyZAlBLT%c6n1~^`Hi$MLS-_{({Om
z6Y3nheDG}pW4$>aa~@#F3KfuHO9gBH9N>I>3siW!kB{B4QAGWR7<CA$QLfx)iQdt0
zhKHa4$nh*(d(?P7<8DF2UWbmLkCk=4gu-^6j`ChQRYq^*#4H`+!wCIZoGbQya4yHW
zQj(*CmI8rRP4W#y(T;7l9Ekz|a~A3&7NMd>KxBO_G7;3m$v=On(xcW*Sx8i&!C+qp
zgd+j`A<N`&uy}`DSqsxWPF#c2--G4UF}rkHA`e#L?wXoy4XC>YyKgtuIjFC&7fv3{
z<DjMXy~A?Qa&*-2-&unD%#V2)Bv?Xx+^ruD7#;D4N5r`>W*@Q?`-P#D&UPm8kYABq
zgGdtxZLA8Fz`l-1GVm~`0}ZMuv<v__Vs+Q}gagS9Bob1N9AW{fA?uM;JC;F?nRtxu
z9`e5<er&S@H_Otl=Bnh9OKB#7#8Mh6?kCKRy6pv*VL@1-@NkihTOzMK$d#zZl#z-;
zyOfxAhZXK#B7d;M!hl79bAvEfz-EY0P_wt<80F0BDm9sckTo0o05ZCl{XOysXwrH|
zx5L}@e%o{4J3jF~c@EQNeGEw$U%pStISj9TXAJF&>y9k$s3$w_WEbxifuoE6zVl-v
z$50AaZ6z@tR$#b{jJ}B<4?N^~r~!mX;IyjqXjO6B!~(gSjNb_;Em;C3Gy+m~kCfmI
zJVeT*z)ek+Y*Gi)B;c+<G#T>n7G1K5<*-F&JNkm^zL)a5ZFu+iIWkV}g$K;nx=tj?
z&b}LZ6xuB-#}0a6DJQJA=M~Ox_oIpWr551&43&ci7i1&ZaNre8m550BQiZA_0{){V
zKelhuWP1<`4T;dLEjz%a86@0PMpmP3Cb2u>S%?gQcR+yJ#TN-zz1|_n`{nJrhj%6x
z?^n9A_htxW2uzK@elsUikVO(I5h<gh(Ypy=!VrlU+mRtCg(dTOHq{mmd#5c^XZWB6
zG`e7kVi~JcCWZ-wcDWxu(w|KS0lOu$uc=0a;S!cnUKk)mJSOR$0WCa_UJpGSE(I)8
z2z|TIq)a~%R-Hjxfiyrd*^|DV*xGbX&w4TQn20T)5>`^UcontHZ4?Xd)t&_e#aa&M
zqXM1o4G(rGrU$0&&1^yHK*QE<c@iE&oDD^5_}p>Rm<K!CS?cd40`Ek3-?1lq#~b%D
zRa3`B?Y_m0>isR=2lMh^feA);=$zlFhB!nITn_eB&<8H{M-2a43TuD)v2kJ~qaw$0
z>N>QF-3^Yg2Y%dwxEMu-9K&Ex4Lxv`#8Q9n57@wmOF`g(2W;SNfCD=G=w`tIH4B&=
zbU2^JLBf7-n2yx$oR$<FB~aEni*ec{eK#>4AO<LGD5@iaCuEcwCH^Hqgb$Mm0mOeX
zW7RkS&okLD9*1oq^e@KP?S6s^I6}}=nm}k)sAz;Yr6x|y`?Ty@Ngz(Wh@(mNm6D_2
zfPu=1RcWTX%uNg$gJ4)ctulhp#45d`085s~dx3zO*S&}*i{p?YppGT0A@5(|9!C84
zmmiyQp$&%&kP~%RJZ4mmA`u!*B|?VE!9TRW^wDro2vI1q4&x&Gwk#TOj24TEVZjC_
zM<M0eVjAv&IUA9^ksH#E(WL~YqjTNTg|U<)gQxO4K}MNupp8BJ95FCdVYi&bOgT&*
z9}mWpw-Iu8`3n!`DKvlr_6M<Yp@3`xgCB(_6=&0|wPMx!Jv?-NmpBjCEw(Bo&s1Ti
z9afu1+1qaNu8M9K)6oHEghlrE?g(txjZwJa*fDO}lXYh7ISTxK{&flCIFTxP4cvCu
z%*Djr_cOJ3eD}dj+ij`3y^^S}6@_@f``g=cddE__-=4Rn>d@l91KIuM$6^cW+E6{D
z+ERD5<+1870;Dm%HeN~sC(R0Y3*p6CBud@=2>YPq*iu={gy0&UxlZv>iE@djxC|-K
zo5gUDBkIFC%L2{VpTby@h{|NOT1Ut)wDCPsG_cn`@yDn~D~#S@@39A9ODQ(J{n!sj
zl3j^}_1I8OCix;bAa}&mIZLO@xlV1qKq4(E3IPcGl(3N#=>QUGsVJBg98c<D=_t!b
z362xND1RZNV8k&El4G7o%2A1q-0P4)qhBlOzTO2dj6>8s47Yw>x_|ir+cjgBT~@>g
z9yT1X(XQP3NkPnBxIq)8V;$vCr_SY7?hX*%sPeL9ZA=LU7(Jjw%$SK56ox5j;8NC$
zl_Q)DlZt*47`vkkD_#|xoI+9`co4+cQdSar=kOYx6Nb<nNAZus?E2U0vqHJIStMK$
z6>;g+l2s2=OXP&&op>pLvpu4{)(J_A$Wg^;7t}i0C!)G4NE#Z^C;P-j;*(<1BO@`2
z&SXlYpW+Vnl)a88Tpld@?k$Q2=PIG|QslOFkOyfa!(7-i)^^;hiAj<9wpx@#PS+Au
ze`siyeI|1D?h()=!mh1dsb!OR6&2-c{@LL$AYeu63U*ZBuBE%oso_&C^f}RfuKrU2
zv_RR~q^9xI9Mw|!OdKCDc%;W=J^U!$!}>qEyeEHMN1^6F=>X03`K~{3$H@Q>*7+^D
zBk_015#5Vbr~S?1C`q^AuS>^XIF8eKm(=cWD(_M3qs(qccb6R9AN;qIav#oUOZ)2R
zCB!fjHr0Y1VtWDhHYmzagPO5ui^OjUbT;Sb`%ctnFsp|#GJ3Zo0dnP9zdKkiSJ!Xe
zs4mo+y;eVp=4$f`!R*@?-^^8t51l{#%C(D?Y8dn}InSE6N-^(;{mopz8-{~Ix!6sT
zwNA?~R%Av469j1(3<9hYgfYhdl3oFW>Ahmogz?1r8E58}lo!KNr`am`K{-fso10MX
z+Ofy4Z#Kh%m^hX*;W1>G{6hjDjWBXq1GDvZ_=_nRdlvthBv}xNT*4wU9K=x6iJ!&{
zmM}s-G`e6OKuwX5KE@-VRwR=lr_2&G&ue<VZ8YUq6|Kb5Qd!;=Nf(9~1FXU&T}T_R
z?`=fo;!+webYs6)&c#u?(%&fixhk|SAG2vulneSG=Xv#cHga#Lxf6q+($2$`i(!8j
zEBU2(t>8te-|GNk9xHyuVw6XYD<y<93fM=<_q#qT2)$Xh49vy-X44C68|f@|I?8Dy
zNYh4=56V?6Hu}(KtaL`*TBQuM*ftkNV@H%W{MxsBYt^tdW)7xvY|0b_VA2qpSr5%2
zh>==1n!}7E+T{exZbS-Cr7Zak3y6J&$gLQ$lA-@Nsmj<W>C~VhwfaHZL#&P#!`V)E
zt6c8o{B9C$(jrR1Lu^fuglq&A&6m$^tZY^5#V&d^i%Xj~FQ4^0>}=92R`c~b)4P6;
zb~INk)T4+!*-eYuB-yHLR2Hj0yvtq(5UPi?u++yWf1_F{wl}(!u$uO%)3Vsw>9z7D
z@mm|md91+BWu%jgFx|q;;MDa8h~e>(wP;PP+`Pn;y-AE}f~szU$yI7U)r^vwm?}Hm
zvg3h9U|73V!G{!Km|RsltKoKQ=t*^tdcB4DWB=m+`HgnFTL>$a0`{<bU;2YD-@JMK
z#6!zJ{u5td^Ru<}o2Sp5xP0|$rBWI8hi>jPc^pN#^Y86L_|E(H_lj1;3(3MZOkB%~
zj?!T_b*T$;HoyJ<*?SWxIgazrGb^jID(kMk4|D@)+&4&^yv0MJMBRsF*_Q2*J!8M;
z8PE8bo$GmL=IxGWXWx44XYY=!vFwp;`O;<G5=Ds+4^id~kOT=3SL0|j`mE}@tFkgH
zYkyxxb)f+Q6q;fK-DC%wiKxiP$jE<u8TpSdzW8GH$F=?2EJV69o?Wb%2=zw2W;va0
zogU7$pzIhOt=7s14jh=69_#Jxec*uy=rc2!%)r3Fn1}OK?zWil6j4z1neEY}(jIyj
zWv^9X6}gD*RUYssOBUUP7i63-RHcc(>!aiDM9%-=tK|}Vpn$(wwJXc`|739A;Hwu{
znB?h@TGuo3<UN+SCr@-E-fIoH6OE{dyzJ@V(MO5O$sd2@lRq_+EtX4ls~R@+fWJZS
zE99wGNso=ObJxoL{*jRpzrk)BRdzvg>$P-dB43<w8?}IeI?gE<%67dTi6&v`(1_Nb
z5_-rJW~R6%p#E4pP$jgR+?2mkg#t~S*hhj!D(nwwwPK;*H-aAXmB;o9gyN!9retx!
ztGY-J1jzvB@Lp*3n2JexgvRSeJuU@S+)++(N3ReWSLmXC12B?B``GLvMVkn3CU}V+
zfK*uhT_c{<xmaq{EH-)2g8@Ar@#F12sMjI=3@glt(b<vmIn|&GRV%PlQ9}>*_x2h-
zx0;^}`EAn(TD5w~w)AM&)<UiZbJkl1Gc{cJi6cL0dXGmc0nl+F%-?i6TPl<xnIVI|
zJ?3CJ-LPtvEZYA{*{<4-jBRB5RvA{|isF)>$g!s__QoD4pS+G$`cTl2*#j@XqGIT^
z1#@-0q?{l3V|L_5TZsQ-nuc;(f80#PDTas_)ceJKMF411@xJG^W;R<bmz+|mU{wpQ
zTQqblh|_z2026aW3S!;Y+ih3in*Kt?iN+GKkgq!#O2$HrTfB1EsjHsNm%KVKY2bJQ
z19b-)rIu#=K#j-@ZguazWP{!t#Zno+^s#6-!YHOv)Lo~eJ>}IGq>xyoOUzHGzbBvk
zt3>o#j-o&IUSxTx<$vlUShPs6R7p<A0Fi$#Ye)~X<--z0K)ql}?A1{TkKfx&j=c|7
zu3R}am7bcKb{wqqeaDWSn4ZqW<B4Li%s$NQtXrv6Qi=HNqpPdu9%s)i?tew9JuI4C
z4z-=bhJV7kD`2l!2s@&&<%ez8J)EaJoar&OW5<q7Pfy3=@nW&q!a0#hT(J-@QFU2Y
zKYzW?AOq^}b#ClsXhHuv8Go`<7x9-b%OznJe<kPOKgB-e7~L)7e+f)W?Z-mvj~5yb
zx4*l~vC4*1boFd8?}XwpU8~?(prgaI%k^q`tXi6G?;db{hLxL+h695`1G!wmEmdk5
zj_R?RQ`c(YU@!(_77{ERi*LN+;-kv%KuPf*>~BCC^0JAN1_xEMCs+n>X=8fwLdoxJ
zY&W8Yc_BMvwso^H0yeKcE$9&r!_$y?TumuL78;MAu>3|K5$y#EBRJitPX~Q=xp=;&
zrBUh@%MCNAml|#Y`xMKch@>VR6P^G0;p6>nI=k|~U}tjqnig)v&{w;4Po--6;Z}NG
zH!MA5`J#1S$nUSfm4uzb;xrr`gxrS7HzgeD3<YX_*mb>LuH|8k@h;sL4Aer=@hnP5
zcpxN;8wR9s)U~pX@v=al_1eVC@?7_y7yHzUw`LI4+tbykSDm`;*DEYZWdkFtl((@c
zF{{;D9zX)I_MjN-)S;`G(ByN|@t~V?DwCFLLYHIRkz|yWnN??8^I4LGDF6gbNat+u
zOlAZ)0>H>r23q)_TXXG7A(t7C1tHu%GuSwPb~xG5%d$hU%2oqciV~_#xcp1tB`zP{
zrU-c4?+VYZq)b#C1dlz4#4+Gz#HX{YxVC=7x*z`VN4Z?q2$<|DTrB1X2Uo0Hx0VgV
z$RCM>XR_&JGNI8SFT%I@3Qv5sq&e@)o5zeVyP1&h=6l{;oXsxox^AKIS-*bc6OTWc
z&*lsxWEe)VkYBN?f8DwbO3xgL#Aaq@lF1~al*`KWeyYSp(~I)a^k!Kvuj(8zyZqF3
z`F(Nl;|V$UoRxED{kjcLJob-*zsR{!EY7T0iJULve_a~p&A!Rf=Ev}i{LFN{nva=I
zC{V3hRzTB({#wxIo1Pf%Xlt+3{OgB$Pfbs=S9@Hy|F8e^6S<1^$fKv8dGcVz_C+G?
zY)@CU(f#VWKg=d;vQ$oE{T9>ab?j>~ICew{%A|bQ$8>N5^)P)0FRdF1`rA^;WF&CM
zuFXGle}6Wo{kPBlXfju!lj7welMY3eE==_`&CbXpOYp;1GjeuOsnB%WX`rdK>Q$^O
zQf&sa?z*9u{f%-EU9wR;cRrQsnwieD4Xmq`9U~fAwX*x){Q7ONxb}lbk9_wBW6@B1
zG+fSE7CW;tspxp-A9X!o>1JJHlQ~rB72EGH;Go~*=&5^w7ls|fh#Bh5-mPXxU(wh1
z%OBmfqD%YSS5FmRJ<rygXshY{{eHF;wTXDUwzgnwMZj4B-b%TV5vdI_AU-nq#}f)~
zG5`@uPfh3zyILLXXtP5>Cu)R>PSy0AU47jXQ{z6nluPH(uIYh5!wlso$3Oi)evKWi
zA3QYk;13T@Wy{#wR4e&<o!xh_S(42{J#&3AF7^zUJrG1o>jklWCj#6-qY-Xv>!^n_
zZY_7`Z9Ct2&nhzgk54~-VY1X<tRh><peI)I5@HPn!?}?NF5i18B0whzMfSQ%=1X{0
zQKYrCwQbwB?eO7O*qzx31mp4Ow(VPbdU~o>kp&g9yw!EFFQUIeyRbxq_m=VMEmSYW
zm*wK8LqG<n=Dh%rIeqw~;JkIm;lnRuRA&SNXpVthaIRL#J&UtvgLeg#Rp&{WizHB=
z*(a87CqG*=QPQtUEt}GUye+ug2>zGcgdSEQCTo8L|Ax<#^Bsp@dIjG|h97-hVj2HM
zB1~_f8cUlWqlSzaMtidDeRpl#xUR2m1TZ~-@0yyHKR<qM_|(IXAG8X3EEq|q;*HYz
zl~Ju8)rR{c&-v??3;By`0Tw=4bTU{OW2=jmENp}Aop}wg1n>v~`ePJxXpSKnGFbQM
zt&`16>-CDm=I`d#j<}YLYq?@(DwFL`h9MLzR3fV`HU`qy$R+Z@n0?iyp}be!JNpKj
zZNV<V(APpgixG7!V%~fE?p1vqZCzaz1_!2AC^QOZ&lg9hhxK!#Gt&({ZUlTY(<j&U
z-NqxMgrAvA`{F^%E>ES$`Z_m2v}_-n)v?Ho#fuoDf}t!g^Ly$gS>1Or_%SE8eJ)Jd
zu3I*i$6Dj&jao$0R<7tgdU8a3Lu(qw$AMt1ZllA$*fEiQX`gkm(8UlFOA>|xbDx#A
zZ@A%LAQB1$+Pm($W5cF(oh!Oi`AIF6&_<@UVx@TQ%<#E0qo+=s$A*XLXNCrGQ^PjC
zUMpwPQ{|E$GtGc*qtO=>S=or*SENilB(h$#_0a$UWh5sn;b6&bFg>2v+8L&l3)%DW
z&`qm51f=ZLsNYTT68NzuqPC)&S6ep%yZS6G1}vBBum}+FUfxUUF-s9hyo5ZEjsZ=b
zg}Mgw{-r|kmVJ9C#wO359c3e(_V)HwgM*cF(P3D~B9KZY9*tFNmJtYfI*3c|u#|rA
zK7xY6<Ptd4LwMA4E{2!(4A=6ce1lZ*M#4meqBWdGh*dL|1J3*R?;jtZICt(mT|rxC
z`|82f)lwCm0-cFfwi3|<_TQ*Bo26Xxk5@gfz)O;E*bmkJc()fw)!%H7uYgc!%tEam
zU_tBa-HMv1g+t97H}S{3P~nf9pBrJ&*VdU@#(zPZV?hgF;{3Rw)k;OC^4G2I*f|uy
z;&Mh4a*QK@=wPq^*4};l_kZ?_-=7{Xj7+6hbU5N5fKbT<5O1Hq>bOo#oN$FhLCjoP
z^%Q7WJ|6}EHpa4bADaeyuYtS?SYtTiV*!W6T8*`31Kn+=sb?~IrmA(VmqG%DyLwe$
zJv#+!X9*H&VX>@c?>c$0ufuCAPxg}rHobr#X^vs|qIzKO&P`iIy@+v%Xu0hkUuxZ8
zF*i81`{qCV>=)4YT2|U{bFec&H2l6)G^s^n(S+ZO_^e9VbKp>6RTi^Pd?+!DkQ6bk
z!0npcg+J^s{GNl0I*S4Wwh`2f8jVt#9?O*TcQV}?3pJcdmL3rpU8k!1B4}iNXjQNJ
zsEZ0}njWPRHRdQ21)^b+na;Xxm6%tK1l+Cb*X-Y@^B(2`!jA|0$ra1pwBy2sEr0m?
zf6kKDd^*#e!tPkuYb4AvKBEyb;T%>q&-kQK=Xu34*yFQeIxMa&geyzO%`gJIvFc`d
zC^F2i72UnTKqC@NMD%zZ0~s;U*6eaN5RAw-R@gLl8e$~99F`GyV+bHS7+Qz~iM~@5
z!$kA>{F+s32L}hwk6eH^tY5#rtE;O}%rVJ^nGgobtS7?+S(C1rp!&?Ec;u<oBY4lT
z;0NzsSW@xYGTEc)`FwHB>h*(ztIm&%qb+XYoG<dKV_;?5RWlS8oO#2QI`opGs=0J2
z-Z$%pA4P<`peokv)|x4FXawoQX0y#5cu4Ak234izE$Gxm%vt_i0RIgl=jTT~{5Oi6
zFXO+UO|f8wFKvDdED<B@RHqZBhAW4PqkZq8L$P>I%=F#5t-UJ}TDLZ^c5q-SmkIO)
zOy4L>&T_P%T@HAk|9YK^Sbye*eD%6k47e3T%lqNS;shXMi=PI1d2DDg+o<rzt`PKn
zEdSFn%*_@8v2Y@&_qE5>NW7HIHViAKRmlX)z?4~5fYHtH0?SM8z07kOQqB<;6%p~-
zmObg$>vlO)x6^tZ&7^;FLVM`3mx7%uS%Pr?t?|C3)}PdF*}oxubflP`?&>t@C#nwL
zHP#L`N;5gzn$m(!$Y;XY{Js){4Zo}%4Eo(7>+tZar~3l9bH<3p&?<G!4*2j8hfbM0
zR{f?M3;X+xSkxa}nGm99Hfm0JCKwLu{t$+`2AXWyAO&5&gjw$s3R{ks^jTh(8+;I=
zsp_igCaazqYq_CNIKVW8Q%S{z9M@d!;Rlb!qU~+HiH+MrJ*L((6u9e-n;(DXbin5i
z`7zWMLkGK3^q~jxhY%^tEn<&>u?bHtPcHqlsOpI~CCjt(;a<qIxRO8u8(OuR$yf4S
zDN3`k79uA~dd7O$Qkh+{8OJnGQwx>GX^QvlX;%IQ!?=9=MT@{C9hgTAc?nyuV!-dm
zvyEv+LZMKpQd+xq=<M0EJj>?IoBRP^G#akea3JKO&cpy95I}?By~4uYkntsWyh1wx
z#+w{wHOEq{;Dvi59RO>xWIX6rN~QH{){dMRVLk5V&0GC?AR3K(IAd`UBwN7GCkuGJ
zR=WJC-ugv@ygZ+(CxYh5MiWaFzZ6@<?-FrrW_>gN@g#iiO}y{8&gZXMu{=3<D`oIs
zH*)42ef{Rmo0su_Gx5K^Wi4%f%qxOi2AoD)GJ=9_B5ghP@b~R{SaWhGHm82&pM5G8
zj@@(T_5+W96Bibl+&NUs$$-{@`VH#`T>0uYE-&<23EsgnYZ!2-Q8aKgE1W<tLl?w+
zv=q{cK`k59EW8}!BZAd?Xzkj<taB<`R@EsMz0O7piz8yT^AOD_no?HvvCA&x*n5xV
z(4tD5y}qq{?RRs~f}2<l+lF5Yq!Re$VDi26^1(xArk)H0&Er2x{m#Gr5VjE8w-0^$
z=rJ7RWil5i-7gjoevC8FvIj7f@YU+r9*bAIGHAf-Z~_7)wtexk-*6$wt_ysjMnR}9
zAnX2AqVD3%p<EFI7se|-e1wTVKoj@CZprkcFl|(HF2%1ImR;Cu;EQ@`1dpa70HQxe
zwB!#tk#QEbBvqI`=gtnNpLpc^eqW^0D6U@L`5*rE&!KgH;GO#)dE(`^cu%#Gqaol9
zKC$SQEfk@}4AC?*Y<M3JbjEe$%?IN}<anv!(&UCXqfZt(&L6`S?1N(A2;&e&GPQD1
zt2q>l^i~V$fIlWa{`6X%fsK#mq*7{Jw=_UruH|AtK)N|E5Rp~NKJ4fHwVIQNo555h
zlg^-58yXsd5XTcye2ulYr(n<ZniGpfvbjuKs-s%L1tCRTjlC`UbiN%ejqF7*7Ot;i
zL@RJHw5VBS59e?uot?$mNko&GOt!tfli>&cO$Fy{zO5|@OP*~g4{iA|+j^HX<{OJS
z6{9y(b)zP;l_XhxMX9Y)Axlqq-GizR;}d_VRx2eECLRnk>6s?}jd~)sjQ@3SlsE5&
zOPe2qL&J8=K(LT0_TvI87_8Uaa3oT(az|fzwLKgwV>@AJeZ9StQ&V^%<2{27G>Vp8
zvJ&BxTg!$5+HHH+uV1%v<F;ffra4({<ixo{Pd)$q(eX+?AMQ>$^?bd2exT3TzG=tC
zy|)al)lk??T+j|3Ja^#x5A%k?xha#0ot2#(BU3YIK5xG1=Dm0A4n(v+{oJz`rgEqT
zOT`Lq;9k>eH{yMMO(iZ%U?F2RJJX$rwI$n&dCS)sV&pL~dd4vOgMNMZ_;C<31KQfv
zsamBRjYdOui2fN}F<$mk$#^zfsT8JGv?tdLt>3bB)7o{Zwl=)tX(wLJKlqi$PG6WZ
zJJw_hML4uwn(XWi-h9i>6*ui4TCF8D?f6;y)fW!_=&2Wvo*Pef;GIDW2Jp>L#G*T!
z9^0^G$1nZTN838Jmyc>+`TE1y#&{o56b5r)^b#+7q#OHlwnK+4G`n`M!j5g>{<)(w
z1=E$GNGx0{m+*=lG<*}I=PNZQ84Q#da=Ccvmgxu!Z+W_AmC8Mx9Wxo9v{5Q_%ihgf
z)(_phvkNeo7&-J(=D9<MjvPJV4=?~^7Qk`rT2F7!#<gqq@40!)CiK<X*-7p3ADw>w
ziAS(0Vmv3Ji??cVJw8)*a_{@-&)l?cRXV4A<!eWe9y=3>#+*7!dFCQ~W6@l``;A9H
z7@(@ftZefOqhWlsoj-dX-w6CfAB@!c{XYEBM?>*S5ufP3a4=jh6!+Y`iwq5?j>pJq
zscbMn_PMo6EfU1V8=hF0hT=STxUPSUxx-;o*IDIV##)-1QweN#@d7cgnut?%m!&HK
zhOpUuZq3k8uE26-Hjnb&euda2T^9G5NAPqv+AFUdVK%Q`v#B1QC{Q?f&TD+<PM=j&
za(MVm#0)d%fB5iW;5~ivob;32-8-J16Q7$daS5syU63P_TqesVJe;Yyhci|m;xPCs
z_f_Dun6b2r3ebI=cA)99o$Ah=+wmGGJ7+C$;06dbTCW#hd=Z0h$hr3ve$nVR^mC`r
zdulLkcz8Gx4kO4c)C1MUgFoUC<g=>jl}d@Wz>^Hx5Zuf7WlG_SEv^7vEch>Q^Il&3
zlIO=}5Cc`$#K9QN%d$GxhYqr*vwt9wfDNN^L*q7b=1epZpPHDYi((JAu2d&H1KnF9
zsD0%9@3?1wcSn?cHDr`(wrNBAR&MDZSo`Anzj^e`v{efQ@;yDlpZdUETQ{x@#I&)B
z))Cf{E42+PR|d5Ep8nAjLEXnHJPbB2oH`!w=-#t)+ebdUI~vta=R~(%DdBi7h`&jv
zO4pA-%GOG=pBExXB!7!21pH<!+-}!uQ^gGH;owef9l`cabEbw>s<nD(ppX|c*^|dd
zDy3Q>U0r2@v3$`2p&%bUU!%HZP5-TTY`y=!E#OId-qBJ4?e3k4Ju5!&)S>JjefeqJ
z`mS4*48}V?`O!Oe>|Is$X&Fzsx?`2GYt{Z#s;~I1r`&w5Uh(t-cBLyBGW5o$K6(G@
z0ntT2S9Zh;vZM4L+(XT&p{_=|zu^7jyqapxjKO8sxluD*Ef#Z`w9T{xlHzN$r&3l`
z%goHI+uYlRjj*OoPtBA|*}7SDN;47J8%B$pk&#m;6N!PrzSKuP{N6o#6K&obj12MQ
zy|=aBzW=_5A71g@?>)pPlb0dYo%+--er%vmOYxQW@~rLCdVXqXpd<41fdj1OQG+%t
zJ#?-1;Sat4&U;ry3=LD{Qh8?G+Vv++o(W+-)BI{)_6W;A-*yPR&g+FqD^9i9cp{xn
zvj_~IZkQBvG=hk&_mVcznjIuEN73y1W;;NSw23vE%J2OY*_&l`3mSK9HmTwz!Vsx0
z24HmPEO@{oU$o%!r~_LV!H*dp@a!x%#C-%fxQ)7rzJzk#<N#I>7hY`D?<s7X2&-GM
zG=Yzfy!ya+O-HYrP@U?695v<>#Iyq!`v`}Jvp5fCc1^ga*O|ADvnmDe91nEKqctCA
zu7`D4DzjkM=OaKR<$dsOCOZ;PJ*CUPq>UFF{>U@<b7Y8O$bUk72{^4<V)$PI^FsYt
z^8C2&T691V3);6(sNnsP9WL+&@S(eR*i6OywQqmx*|5o?k6I!YArFSeuD8^eH6r)k
zw(A4;bh9-=*3rIt;GjPgiaWKByz}PO1G*Vr`NWa(g`-C!k=$MTcHX&rC9meuwD!Fp
z9gl>Qt5@~hv_Wg{?B=b;?313JZnZre*uQW0$3MO$f;C1%d+-~tPESrUxWu<DHM~CW
z>`yEvfl}TzgE1qZg~Aq25_sz~rIBPO%FEcQl^cHQ-FsOx2NXESGktBnD_7K~ULg-1
zKfANkYGvJ5-L+%G1NUx$j}4D$FP}JfVeHIcXY##w?OfTe`FEzi^>kO|)cJr@eaHS?
zx9?d+$9^`iJ@V|Kx?}CXdH2R{!;E5k9AfpPC|vkrIpy5+h5zGs|995@f%Ef6wTB*h
zkd3UpCnbmMR(KIxp~ycm2G^y4QmKUGurjH=9qP-lNnf>U)$~;1u6NzBYu`pyh3|g%
zyX_sx3s$kWTii_WnDKB7Psx6_vS;V^yKYUe4c&#bcJPI>IRB4C%?IAOw!^R8wQti)
z&pb6TF;T19+xP6=wp!Zj)CKME;o)?lu;sR$tNXQhGM+CMt%|sOqF5YoZQZu{r{8-U
zqeQ+_fA_b~WM|TK%s=oxBIf1GVHtrI0<y$b`8jUBcd}2U7B!>DFv}H$QaTOx;q=2Y
z@mj-(_DTU7;|-rV*Lq{+T^1X83F_?iI|c$qjZG=>dQYYC?CD?wLQPPC^NA#Ktu+bw
zgLPr6nD%4M<bx;+SN8jAVitATP?ItV^LxKkZdf1+d$!)8OqSF_3oq*hy^>pnb0e<1
z#v_kQ=4<EE^UE0u1wEY6&@<v^3s!KZ%K<B;p}e%T*~e(2Bo&(*nqbVqP(9|{Ja2Lw
z<k3?DzISg9-kNi2Kc{X=$1gW-z9H9${A}eH5`U!8wi&`T&@oYKn@xr3<=r~+e}#6x
z2%h)2bDq&V<0Xn{sR(IAlvH25a!JRQIzM(RRwW$N!zN4lwXN%-8-M%5ylrU3VqRNK
zXSEAs=fg3-RV&4kQ5m@Uw2A4o8H*PR#Sgt_eG;q8n)b*4?;k%iS}fP~t`J_c>YsXG
zPglEk)7JH`o;_Kq74NxY56rAs)&Bb*d?sgy(%DLPSMRQycC(}PbT%i`I&}87_JRBF
z+I!EIR8Tv4LHow{4?TR~34gqsU6X^MP`PTEGJiopdYT^q-W_6#OX-QvwphX{R!ZfX
zVTRzmfH^d{@<08~zr#|6FC9~-;82_?WTt0|8J3FWx^!08Nnns-*Naipw|V1W+^3~W
z+84j@y%&xj=QZq%>DF}m*FJu4cc*s8E!#&X#xfIU-*evs7z|{}+MoW(XU{;~in+rF
zk8fDBVSGF{lPh;(A`V~;Egld4&cFYyL_~`j+JR@wfB&WLV0M8MI?ZR!;l*AJmcoHL
zd%dR)ylkh<P~Ngd3<kQw|LQm2!!Qo(b$*uX8he(XJu`xPm6<Hwk@1wQMNAFndjq|_
z7;e7fwzW})E=BE&U;f9JUOsMHl}OlNOX`2}GdHbh)9%`T$G5-pO_ue&>%QCQv2vF7
zC!hX&9=~z6{p9g8n>KC8jgC#Fr=vKrfDqTUtsB?=!Y}`VA_AZJn}?1ay^ziL%&^(n
z)n2aRykt2nBXCs&)VGn(EKCQ31n;|xgBe__U=$lbdvWC}`PMe|sj2|UMNh)tto~2+
zE7$2rRe1#7233FfG4(MF@!UL1bFQf-ID2*+?}_B;*g*~YN>~`upokQtgvjN+q1@5?
zxOkNjwCPT-;VA8(YI{x3E7_Yg@S5^7h=P`h^HT;>()sZHY(Ux&o-36zTY0%lOHPQi
z@RX;Llk)g7RJa@lIkzyuyQ6nsDnON5)l;`?;m=R6fuyE49)HGc;Lmy?2r+1}$sfog
z^IoN2+k)OhQEdsW2PCox@b~Jp(1+GDFZ7KTsC;^_Dmq4F6(Ob3_$$<LaiC3QBqbix
ze(vW#q8TN}SIibBu>oc~vWkxZl+8f4D<(|({Hc*+uMF$K?t+ubyRk=}d@(P^a@x=u
zjH61ah&h<jP(>d(I(+Qph^<HJM)Lej<-y0FdU^N^8C}_oS*uiP?|=WzD>`vwq<!^E
z4;?=ELPx425lh&%Sg_%&p!EuQy}$QL3nVQ$TOfSqT`bBU*^^ji{6pBaG53#aiX8OU
z>%BlGY-%g|;y?Ydk8awyKBTkxwDfa`QOz#<^jceoxn@-t@7dV6Hhd<Xt44g$!SQ_K
z$roOo&ewU-R<1^yQZfQ|dqVWVr%#NYI+fOZon~a{`02tEPrrJ6BvaAYLBtOurylRP
zbI+F5QM6Dmzgqd;cfOOJo{X9yUTp8d)cY(AEuGb(56ut1*WI8eqwU2D7OvO$P9V1>
znFAp)SJF(smPq)1^SA!D?%uAo8_}{iqWC{{HM`2<x{7JkHm(-Y9T}-T|H9c!DdK9K
zKE3nVXHS%C(iR)nuX8M4GTJ`aBMo=_ly>6mOxX_m!+pbNvrjzr^6^t=&1eJ(1v*w@
ze)J<BiZP@qHva6-zA|<$GkR`f<NA%IVwNr0XGzSW2Y0!G*AsyzwY_*t`RXFrh`9HM
zPO4V1O6+7V`wjWLqij5M@vlB7-b^POj)eGV-+(`CLbxybZ=M1pJt9G-u`n>5UuEjd
z<dn;02EHEK7F$&nE+W;8y$L-(^p`);WlEPKf1*fkmQQ>(=4DXXuHo1$@EUT;edqg!
zp*fvhK-Rts)I<Q;^!A~;Qc+i|j7XZ&pkKU%ho61k*Fe2c376nc6NnbS1^dN@xLC<+
zg6oQpJa3$@cGDLAWS}8v1;CfdpDWkxij`R`_%Bqu&M9|^>yLd7d)8PE`dNHNXfGYN
z{^pCHpGarB6YX33R^4~^U2C^!+c#-<+_mRh-}!pjXY&R@_(G{DOQwg0*5f)BmBIPb
zW8qMH-rrph*0qR!V)SSM78uqNaZK-vD_8W<<B;>hShjy)ea1GToqaQGm#6#64ci|M
z+c>kp8yvgunQU;T&7>)CWO`zzYiM(!E@Dj&%aOS-Ixs@B|Ki<RAf>{b)}3z@UlYar
zRK$!$jf7Tbwo!Q7!Q&@C{n;;`n@nz9zjH%R@MG`Zw{C^DeRE>nQ2)f_5pwVxj1b^(
z!H_Q=H`=43X+He&saz4qLW7nb$yKKet;7-l#w~`76bdU>cTqQ3?#Q|6NO*<i>y9zw
zUm7jfeMU$W!&To5V~?2Mj{1#L6UDxs*q{9A*H4}}5${-e?%cUV=Ku@JmT2JV_^4IN
z79Q>*@o)%R5?PyA_}pi{H9C^nuz59K=J&t<JzF+vD|_^Tf&SrF$JiExhpjtWG#oAF
zva8p&t1-!m<EN?&{v&NJ!xLXUJ#nE{!(pCin(-AKjdsuw;PlFgk!V|6q2`7(Bbpp!
zWrh)`hfH<^ugsLR&ZsunlO*TVSbq5U__4E@EgN?oJ#wsnaLw3-GbT1Rv!C203ixuh
z-$Vp13Ii0m*`-PGupEwJOdv~s0vsK@&+vI=+oh)bzzFFAugD8k*C^$Px$Kqm$k~Mp
zuvk=^l&^TLQYS-p@t8Kho8pDIQ5O*WI@SX$J7fNu6jpC|?;;<7D<Uwzn|Mf4Te@dx
z8D(AsP#R^<EnsZTo>Dc|k3C9by(u-k@)M!0#M&6N8A6Z)%$ai#^RNhhCSMyo&8vuw
z3ay@qNGoynr1YaaE9nbZ|K`m^rd0)qD4P)Pr?drGE>>W%;Scf(e@1bLI{1@96Oh=#
zaZY5h;y<T~3mRSG{FuEDg03NEUW`Vycp`NA>@W+-e2)9VWN~6N^B@1~hfA)u_ty0f
zJ%SfrZzBp6qJc(Db3*L3fCZGTRf}%D9xqz4D(;JQpFi4#Y7RLMgtbW2hy59GQB&K7
z>rYIkYgm9593z<s*X%;rj8t4)3ur838lFh^_H+ymXuEE@dHm6*^O;;h!<ZJwlUQ-8
zcZnl^WT!|KT9Mg=8~d-Xo@(>eYXv6#ONCsvATN7c`?_*D(cQK4<ZyZH)uZFXhkx&P
z|0&hHY4_@XeCin-C1C8Jhc%Pk;B==_Mb|7>VXfjCDO>Nf>_E-vsE307II4aen_<&e
zHo{Sh4J~Y~Tn^NoWO^nTs)fo<doXI?GqU2A9g~)TXV<23g_S++m4^1opZmnv)L#^A
ze<B{A1Cqj1B>=3#dCvJo8AZhwjmNBF4UcQnX=gBnL(<^XbjEhv7his*qqFmyUw?G-
zzr7o{ZoA|5$6x(?M`s%n$QKr0(I{KOLHWw%i>#t6gj10M-(MH!PBW22gQazR<qcDd
zChEZmuDfNdV)+W8w!lofVAZO=KniDYEQ=}T#tP+`u=iEU<gG*^v~7KI)0#aE-(;>>
zk4F=Yx@EE#q<m|a2C9p)so#Q?x}rd^B^w`FR_S4Uj9CGmG4YOq5`jTjvyVfN5aL<t
zM&^sqDAMNrZs-W?4G8GJrT@a?i*1L);b;_R8Wmijv5aXxwGkiUJQ~acn@+1{dCL|V
zC$R65tYdz&g3??nE~d(CX^Z(eH~xn1xPb3F<`vBBgERQRiSc;?M5WT?)jJs~Ba~Bu
zl9O2x@_s_#<_MC?w(!R${&VDZF6PbnY^~>Codv`nKY$G07@T0*hb>Sll|u5_yJu<P
zKYvKy;1vtl5^u0Z^SKNObZb@imMB*>NGRqL@la@Gcc)&*!>c_$F~X}u_g2o9Q;{gU
zKa-!7V!7I|tjV+Iq{q^=WP2M^%wgA#Vu@TT;K8s2eMX%&Rb_&3W{Umr#J(aNiQ#3e
zBkE^wxUQ(rEu?+bT$IkRj_t9=zE6)m{v=mXhx_l}ee+G*!eK^ab}DAFS6akW-hJeL
zG9kvahBpA6yQpG5&%JPYQ3X4d%8cJ{wXtq2APR~~wc*;XUCFq$nH2-w=SN5RK$aYh
z)nR_OV1!)20isAS<hs>dHtm6`_4jqf!#ER}3e?gGTou==_{BqGP<7c%#F<FXfH_~$
z?w*cdqtb4=ey7MDahQ_3c1f$v1ZCGfS|N1!$mv5zNAb3_VukNL?|Tm{oK3{7O3vGM
zgxc}f>>okYQ$8&mIP)%9I!bzTHg;BX1+vT;dE}*Yt5%-X5s73Bu1dKf+^}eNHxnPp
zgjFq`J9`X&ZCss<#_+v1Gd{&|0`+gUFdd78(vxY5U;=ORhE;AQZI{L(279FEd;9RE
zr<E}-%1#xk+3~cr6muGhXn-9F+ag*b=xF6k#K#y<t5u4iit%{G*1q<QZ!;ny{M;w@
zG0b6XV(4s?I*l8?7QFL*_~i@Uq6nyOhLUYwe1w_*Bk>5TR@AP%hOE#Nmkk{2V$M6i
zfT!~t5qJbtXxtfc+}GE)Y15{S8#fLN49v4GL2NUyq~#CV#uiaf5DJ6{ikeqELaQK%
zy`kV?FW-*YFo&YeCaHhr79LT1!}q$hyf^8WXJ^c0BLn?VcX#)O4I9?2Ti4&;f92wv
zPe(`Sb@E<c>gDu)O04;G*Y_dS8=$H@H}18I)LAu?id^5SsY+e_$l#5@c?7$&vvd9W
z^)vxX!>@eBBE|p8m0gI`xvjep<u7cRFwLlL6w~QQRMcju9`G3J#}TjL)@svx_itto
zgal?bnL^YxQ;|+;gJ@Lq&Zdq&Ei)~9Q5f3L_Q6ElN)?X{S;H&qWAEOxKAsdNH-3I9
zss~NI#?mM=tnJx5w7NrI70y*oJUZAI-PfnzHq_o$FXFa`%%z6*)N`l5@$JLB4SoIE
z$3A{nXPc?Jh0OSHdtB!q*#H<ehgVu0Z$a7{kn`CWX<<Q{SLsZ*1UlN<eOkUyz7TJh
zaY8OPk&0Eq?va#nyqG(3$Gx{qSHv@MHCv1uPO&m=#)KbdCo+v{Hp-ei&sh1^J#7Q=
znQ->WKI^4Ed-(nPHW+yD)wE|1p6(pjIy!}eJz0x(+y37E$c0Gh>0t4>q&vKOV`9hZ
zXs11C7A7gv)&hlE{P+IouR$32?s`Z6{@ZSL>$!-D52g}39m-zc)KNS^h?NU}i!8`)
zd9uKqi4F!vE)x^{u|}_s15m$VWwX;veE?axj4M;N?6^=ZPU&`T-~Lt5Uv#?{PNi3v
zogqiZZhpqql09anoU=-LxvaHCwH+IK`pj~?I%*U~2Rp*+R`;MV4*Iksr$-uQA{cGU
zBaW{2iTgGr?P{Wui<L&(oin#@impo4lYzRS8Ia?Uq5bfQgFkre6`<>B*M93?z9$?g
z#sX!xG@Xzsmc`Y-FS7Q1dsPR%<ylyYk)`*i27X8g6GzO3)WmIt5x|uV?xXORh6|}W
z-Z#~s>qR)u{?TI-$NYd54CLW0PPHZW?cKd+*Y0>UhWBn%4^GX-OD6X+{8A{1)pCVD
z(#2*0EiL-#I<FWnkuvf^;IZOwkXfBoKUFr0B9TJmel?qA$Xvw&>RQzzQp^{~K%8r%
z;Ye3UXSrBXmtW%%<^l7=rZXT*(4;fM$hvOVJSXA~gJjv6RsIxuVy-%G0pbo9t#764
z;5KwG-j|J@MY<Y!=NXfGN2=uQipgljQ4#Li-j>|IZ_nO6yAtszaL9f`PK|cs6#Fm%
z0&wBnhChWBk58RBDyXG(c>=F>fKh?NPZDV+a)Kd0FU!nNawg*mP*s<QLncm|yF0tc
zKZ}V%X7<>!XD8w<b>yhHvJ6AyPfl=Dk1faR@(J*OZnmAk?Nz|TNZQjoD@PT)N_jPv
zjLpBbYgXTS%f9W~wpdlR&~kBy5BNmHU#$l$5P`Gze&97#e8s9biAs#g2HM11?LsyM
zGU+6epA_}Va206dOO!;=<|^`JrqgXnHZ4fd=t~R#CbA3v&%7p&7QQ<3zXM}_GaRql
z!k=&vv3AV{B~!QEcH6d{>!LA9KXUj)F>ATL2-+}&x~qF|?A(hTspNsjPJiJ3p{}0D
zKmUzi`~2TLoG;oveI4((eJ|wiLQ!KiUd5_c%b5d@9r?h!x1>_qCx7<i4?g(drgz@H
zZQHhc?_BY{M@GLq{A#u8!rthjW{TQ@N1tB1aqH%Fdfy7|f%o1&^AF#u)C0wGA(@I-
z@To$VExj34kQH#!Iku*X=KtUebA@jvTMh?(L7x-CqqNrp^!D`LymOmX$}{2|7+Ae)
z_g;*%^L6d?$r0?vFe%~j=_Gr4R&{jsjE#;w^usgnymNJ1O#8@(?pyIxB%PjGJGAB9
z?|KJr@36gi_2l@gCnowL9gjaf`QH8AECBo12kw6Q#aGOZ!LF6-H{aZI=ul?*<jZ`f
z=m=>q%ZX}6`yc+<<NxAU?m}2U_`!{@oIEvrW(?C9Sw$n$19VUq1KkfOFRDPV$LCu*
zFTUWSKc>RiS!3MI+8=BrM8%82=E~KBD92ZHuSs@9ckN$~-`KpZogTiBoh~-qfQ%6Q
zEIh3-@Hc~9&pemA_pT%$|JJ|!<*$F^J7ZJRTX*i<fBRm<>)ed?+@Uk0<9Hx3pMHAm
zUAOiD(<eUq!RMcTs#3G>dDr_lZRvRN;OP0&$BHF~R|b&rDE-|7kGrA7JMJ9BS@KW+
z?8pB6FaJ6m4vAkvDm>fKFDg`*D|15-K#W^Kx`$Q|CcT+36@*1742+@qBF=q1=f+#b
z5FuTia#6TqoIqEDHW!LX9EOL|5pzjWm>Nb)O*(;e6;wgwA+ywom4dpCy$2D>;S#cW
z1`pz1gG6$Y$UY`SNFgZAF?mSm5@nRBZZWr(KQ=D!xOAFS14V;WMuNwC1U3&V>HWRr
zH+GoC-|L#ChyboYQNP*3R71#ZW>dU~k4d^Kh56zPRw@7{5rwUS7r20qzJhe(ByyG%
zj{H>ZNFo<fk-28aq>x8yEY2&g7St*4TX}If;U&r=3l55j$|v<|WxSc`+>r;l4F6_L
z=G=!2s^)Xj=0urQK#~DXiJXFtBi1IsqJ$9i2tWje$TLTNa#Xl+<Z^|xY6w#JNg;?d
zR}p!woX>3zGE*b6ap9b*T^3bOv3T&mta=Okb7}Kq?0U=SEJE>gSzD#ykXZYb-}rdk
zQ^*x9jg8;NOWL_n?b(-)7ufmOZ(wCZZ=pxqOPCS|B0qTY;O1RJiI~<u5c}<a|6v3H
zF+pU=_O*weeeRW$nwfO#?y++dcWYZ%0(jHS?Stz-*%8xnRpB(#MY|D7m~Fkp#caT5
zW;4aV`_flG^(onrVej6^6VI(Z_{xb~#pznnRkme`l@~6)HqzN$mv?&h)<5oF@$c5o
zo^9;k%y(ugt@XEQeVy6|Kk`0Saf$Va_tLZF#}6Kzc;V$?*B=KQ5W{(5y6leSYB|UF
z-s8`$-?o|yx9{q?Y1fB&zOtoZI{`|c`|^>|u^fxq^;rDCBhT#F`2OCM_Kv%E?74Ht
zL>ZG@=~1wW!kC4rT5OJ){U1-~eMb2C7hd_&g9G=!dkx-rfAOEZ_YeN)>obr?A;%Q2
zx~hw){<w#{uIEKZ#toaTELBy#dDXJn3_8ZbVQ&~R*r$E!SAVuUf$<62CwZi-rv31l
znIoq!G|XgA{{ZNL|3u!(VoBj^eB;0mS8RCy>RxT-VDO*)iw|RrK{>!do+C%KqsPjT
z*cPnnpLyo=?%jQDLGA9l*4%pM8VoAPK;gQXY6KG%H^?GL{9u@={_|7WhaP!y#rnbC
zUJU4s^;>tJIx{kvEqZRKuPb89H+wrHz?Ys+INx=}1LS@20_lA66$4O5zV1ZGsW?^(
zG!psblb;j_M2m312q}|k{ah}m7AKLDvs?~Y;Sv@2Nl>K`Ay2F1w^C%jID;PeC^Qsl
zQ*lgJRTC(FOrmN#%7>+~sWnG%Q@3jYO9|vu{EV|iTBqJhAivJDAcvd<E06&V;s8Pb
z2U7Ui;=bzC+;o+V6mV;0zIHjMK(69)LaT5h%i*;MNJ}i;z_dfr@wF$~adyCBCQ&CE
z+O(swT}ULXog3F4dGzR!<0p=dOb@JH+uPUAJ|JQIyEVe8zSU>OFH99{U;M_8_U_%f
zb9>4vYsrXK#tFtz?dd~Lz3}p@1v`*Rb_DgvH-GTt`I+j+KmNehAuSTa)L3IL%mars
z++<zI*Cxx(1WvVdt#j3;(<fhj{OLnI14F;`3%g*Ax4mQcnW<uRYG!PFBEtR_UazYZ
z0Q9@46&E4$%sn=2Js2>JU}QS2P4;NLajam4SMi}3FKXixvLVIO&%QW0oiiitz1=II
zZiJC^*$$>aFINBnKmbWZK~%(I-LY4O$N%s%554ogd-m@M;;e$J{2}e75$!wQeC+IW
z5jPec-K%Q#{JF82zx?~h@7Qzm&Ykf<P-}~8la}`KG3~%3PmfI$Hx2dSMZ_P`l3fFu
z@k*so_`;XJxnu8dbarYhIxuhycs71WuaG^thqCSSW*!$Cs#FmLvZBS!+F~}(q%dA5
zT;I8|YG1e671bgMvFc%`*ol&Mdc-<>^e8x=Iz8pOX4Y}UlOX+RyeHASs#esFo*Db=
ze|Ti~o{jhI??<fA^>cQ3Qv3Ta9)9t~V?nc%9uP+^>D;+L|I66E-J7;=3noH>6Ycou
zGmUS3?V()F*EP72pA&g4SJP|$=xC<$?E}X?{>hE>+&lK}JU)sa>KyAg78{<+^?7R}
zpuXJd<E@T7FDAS&0JLb330MUIhQZkuEhCTOZIA{6e2zdO3QPC|%outHT_Zo&@^+A!
zqe_P#!<acmX08>U;xQHS@bj&T_PlT=GfbBNG62F6JOPJv@C0i0lOuUR5^M=kHj?$)
zg(K;zEw}_=$*GFs6mc4Y$lE9I$UH2^Rsc-miZ=tG!Vt*>D-{Pf0OS?mLb53gInGPY
znetVwMj;A+iha9*f9+znwD~b_XRN*V{Pe^h|JB$2=U>)y+3DWS_K@L^n4xrf21bH^
zfQnu3TDdWAH!d7K^Y8w1q3+fS`C`;)i*)oy<Bdn2dHJcAUJmF+TVho*)m11LM<>rU
zblV?_H%xOZTQU4r$Kb}1bmjLy{SSpgiA5WD<~nlZ2pfljvKjAw^2OtiJpB@)HjOn;
zJhgsn=MSHG@ySESr*p+<vST_|3MV^bp@1U>sHh`U(-36^hp@V9Z>sB(lj)lvhGed5
zr_+Tmf91P>`?op0QFUr!^T&J;dve1~%T+o-wixWtBe7RbpZta2{u<7n&z?Qq)z!u7
znucW_J(oHEh3|akud6-X-H}Lm?EGl4l*h;ky}KRhHzTncd(YZ7j(CSpjmEwc2dI%u
zX?&*W3r2Ze+j~|`WJ>?#|9+57;4C+gEf;%wyW^(+zy7zsEmo@WwoWuY0LKCtJUR&R
zlr2VKp-phF3s7(TCSn1i>0RBuRkoMTWxnz4?|<`~RlA(eW+tPNWH`}R!x*rEH#Q_B
z5$hUYD%c<NeeyTIluU(dC99Mzvzj#7*7xW$FHILK|IZ)%)lEBg`2)d|r_WcNMjQv!
zZ3)Y=xr?PY@M!pt&mTT}e(YNh`l1oD>eMeVM8RxGmQIEK*WZ8rFFx;LZeDal!DupD
zuKe9szx&+r(aB7~j3%ZtGrfarE9@t}9F`HdjtI0EgSsaj0s&?K!GdzNwY9OX2)Ya{
z;3wNz!<!3q&7~4Ff=gfwq?3VcEYBxSBFuPnbaVuV;k6oT?>Ge~fW6UgqfRDqYHCVR
zVN%o`uj3Qnm|M*YXY|AjA;1c#feVNL7M8YV%^HqSWN?GJadvuonhlI-g^rGn-rin5
zlt4|>WWB&CrRn6EDdfgH`69Th5;oNYK&Q|E5QLmWQos!<<&(-+m&^z-umhn2kQBv?
zIXf>o=aULWZryPaAhD>cu(bIx2A%1AxyG`mU<}^{;iRwT58{*ub2a)$d52BY%4EZ4
zoIT-U9Vy4J%S?1AS+;$w445fp8)h5!!x1xbKAj)STIHJ6@cBcr2%fjdqX)yGXnZ<5
zEzA&up=9Lc6C=dCSFU9rQQmpFjN)`&b+zGCPp*{5nMT8BPGyQ6J^iI>t*dV!lPd<;
z(gJF%_#5VE;>}>OF=G)^jOWT#hbIUd!B8ZL&9uz~jt}n=fmla-I1<%;CT{<Elt4HH
zl9}nkx~;p<j+`;VK7XXW<_pi{is49GwUDb6YAgB%PL5v4rZX#74utVJT`sQ}T6I3#
z$QP=~fPs^cu2k>2$#fu7E!kt;gX>FJ*H)_F%sv(@x*5+@8z`h>?PV=()dMi<Qq@g$
z^o@;Av$!o3N_ylN2V@xd;g;PSZ!GF}je0h<$h7@*I?F-{zt5kq;Dycg>t?K@FK9$2
zv*ke8^dZlFU$qu+Ds`un6WNZ&s*dgtnb2RS;>5bUPhVhgksTP?eQu&wspJD@GM)+-
z%7vO;j3pEFr?K_~+6H{TpBYO>LT0s?_8CDucGnuAp!k8S74gp+Fw+IQX!~_mdxVlJ
z*7Q}~pb<_L$~N!ROtuuljq9S6)y3V^%e8xh2tW?__A83e;!i{%Rgf^Kz{-^?F&%D|
zj*AIC_HeQb7cOw*+C(Bj2GW%ZL*=9&flt6N;2*5nrzqk|@y3x3oL-%B=+enRPVUH0
zN^9NdEf$$C&h*M$!f1qH21mdFXBaTPKIS6B=mjvzIW#mxTp>&b65+}{Jw3;c9h;n-
z1Y{5<QC$n*a|?O<!~ub4CjbS3pCALauovJ0TVPjcAizWrYzB2>V`Gd%xQgK)t|#Ut
z=d{T9_&Ap<Ij2k-i}>3I^Lah#(&oo#4MWic7Mni9EaY>1>}}q}NGf9Zk$`YCuKR=W
z_8wH8<Z&#=E+YpnvuiBh#0IDC26dlauDI1gEf`2>Mi6IPuvfd`mMfMX2zGXNUpRld
zy}bvK9T^{|M~Ealkf-tV3~Q^9P9u__52dS}oGA#gOSWZZrV_Y1N``&G2=p;%=kUzc
z77sdAS;Z$5>5_2S&r8nTkRS0x!nSxtaoXk<t5q`;F-(>u*F|$+<Kom|<e>W_K{IUN
z{MmI*o*gyu@!asc{_xmzJ`^<Ksbt9Yvp{@wG8>7u_s2U6l?srstnA?7qy1}kx(%zL
z8+OBQCc0=EUT?#)a^+eunlk)iO^EFT=;bSgNHj8;tB^0**>UKl=i56H*+R8%U}z>&
zXljmI#ucJJmQ|07FVMuZW1(jC<YQoj@c-uz1>)XDV>O%`^Xz%c4kg<gtN{<megfoU
z<}&0D8=*kClFKuIs9P*d=X5$>>{+$WG(zVt<T~2BW9=Qea<*EjMH5NgDW9JjO~$b1
z){EteVJ2)n6iW2^{MAS-I+~scnr1ASv?`_a^i&A9a=Jen@1(aavWnmF<#U;MvYnOB
z;fPr&72(Gh>%aKGU9Qnv3<1R$I1+d?I$m`|KIwj83{V6TQG?OL^9JD>3~Gy-jZPRH
zCo~!Yq12XKh_VgEFdZ|l#K;WEp!cR^RA}guDHA2A%1;JpANskuoS4kyd5a;e9tRM3
zG(q{=a0VaH181cqW*|YoJvcZ>TPVda<?um7mkiDX8U-Z5A2!Tby}iAiwo@Y<5-EiI
zO0`c3MCQW&mIVB=hf;E-z~_{bb0SJ683!uC1}nb2oK?&iS%u&ty(9v4YY;duIp@V8
z%$4od-7l-;^3NsCkG+XACLUsqm|e$qCtSrkN0@~l-pneGiOF;%mMB+CaAia<fb%Pv
zs<VRO;PlLNdn#Tn*OIAjURS0y=m=|=PSqRXXe1hsRxM{Ljr*;TPd9SRHTMmOYaz?d
z&E!|EUWMC~idDvnBM|hrr#ecNBI9Aq24d|!=#)d@*myeE-M3;USBS?Vh+`m#=QE{3
zR?1^s@GL&@!;^HS2=zAtxcx$9>#wuvolRmXmPmn&3$fK0SNO{oewB*wQH&*`(eQ=Q
z@xJ~Z(~K|%&gSxbtKzXJwhvVlzqot~n7I-iVxzZz5G`^z7WOr2!FX~sJ%hicVutm8
zVg_Tz+vq2%l?r}{VrIy$v98aLLuphEEEMbP9f-xke8Ylahb5U<zR{U@tHo#8r#Jdo
zbOVknH?r(RfB}UW2!ghQy|*tIiiR828c<lE#)sKpBkqucuOGYAs4ym>6T?dn`d<?t
zKqF^QuU<8fMl4qiB82s-8;OSPdTpjq#sPe1&q@@=K<y9dg>u!5#dDPsU!ZKd+}_bS
zJ<VKlCK@I=!Iuo7rc)Lx6`!A%2%r6E*|;~Zf>2{<4u<{9VHttzgaG|B(nuhpKHD4#
zP$;BPY0>BehKGlt2yg|M0jf$AnGgtgFuV=k08xWOU~$1-n<(43iXI6<0IP<pA@dyJ
z%|v)9P;<hkc|7jO<pgr7pQK!;$G+~jc>_4FUAq>nz*cczr5k2I0s}sC<_sMg>R~bf
zJ5BTAi!TB@4M$@sRJn??uxeme2-C_uH+hJ>-T1L;aFJ}!k{dt(kT~rEunJ`IAhF0W
zXOV5#FutN0V<6$2BIOOpxq7oyizxiLBSF<?`FI%uOPnA3Ts%!CVFp$On`7JviaS8h
z^Ci7qtx-!Ql6?4Mk+3*p6zWUoi;7ZrY9;nic4}3Wc9k+dbWuTT_<Ribnd$J=@otZ;
zo2T~k`E}f6vI{WIAesMQp;db-k<ZRx`-Zc@WIPJJsgw%b7B8K;KLD#{OY^E-hwl{2
zHE40&ahv4$GVB-UPu&$~h`u`7c3)7(Jt_7UOzT%sXyQx}%P6tJkSB$!@jlB2Eu|uu
zSmhGRaSQ;()uPF+A;PhQvU(aIr7A!tt@|;ZV8>M_fSQ=iR7CH~#}JPd>?Xp<rf$ey
zDh(WOqMxp_4o~){)orH+Su9s-m>Doe@jO?^g&u@nM0%E{0n6gV^|D$qSfhJ_596qg
zPd_-7gop-M*G&%?)Nqom3xjr@AQlb*zsq)mvH%r#rL~G(>FSj45PIco@w{#_C@|Q?
zv4LMc894-Sd}*_w&lmRvWH&RLg@1m$%sTA5S1M;>(J&n%t-uj?nA}d)P2A7Xw6d=O
z{6N;<cyGYP;m2i<u>5lwfj1oix(lchvWIZ;g3xcMuQ(~_AfXO?+L@zAXrW)^L}v^?
zgCFyg%%sD>xE3;f^ypD|Ftm(Hl5J*YhIA;}`Sa(w5I#-G&};5U;lxqSk<R5D-}L8N
zYz592XH-1Q%Yz;`gB3VK*A%eeuW*Aa!;Ar$0HkzK9Kj!8X*EVHD51H8Mxmv+gh2~G
z;e~Gpz%LU31_`8q0Av@S5LZA0CD0(Cr{{>oBDVm@De|8>b&4Shab5(*hbvaBpoIC#
zIhRu)rSU3pL=4pNvg$4WR0u3>eheYQc9RddN0i`K=wzE_zVga>y_}?BMIpT{l$Moe
zGT+U<ryPCk*DF`x;Z?lTi=Cyor)5T2RK55YP@5wtFYd}ziraBFj?ffO!66sOGCN@>
zY!3|6hSGV}k*YK)o9AJHd_pz6G8N~Lny6-2G)JkAs1n6i0S&iox&f-Bh_i8?JHW1Y
zta{`NCR~@74Y!uw)6M;&Tr>sthNYAVAaWEB`3zemRhTu`D69a=Qy8(#PWUwAXzuLD
zd9E?N>nTBUNjc(0SQ_;b(1g-SpzOt|LRK(RLzLbkCKBWkW}i;g-zQrMU`@>_z}4%9
zk1-ufI{`#iBFY6KsE!Ar>{ZslaF|h>V5*B~OKZpwQHuA>++E%F@Ip3i#x=SdWYYz?
zFP>3(EoK*zb2o-3o~<!wL^1DI?@zO;Qr>b{Mqr^45IOYN6<mRPK_H|<aB#=Y#*P;~
zVL`>zDYzGO4IV~^&yh>WPhd(=<>B*Bflv_gLnh!HXp3PO@L(u1`MHV`7W%msTlpT$
z!eXxxXS9_dsj#9s6sr7$^@2Qfnbskl6cwilFcM(g0@R#REwb1^xPkQnkk;J+O+kmy
zVnn0}F^LQ$m;<DN6qhAGap0xxI08E&&2e6GKKH(@3M@yihQQM1$2^r#t~rcIF<MC{
z)eG<o7zI6`+S6Hda&Qvevj!G-U`-XKP-L$h&Itn%Qfv$H6#}fdi^5QmS}7X-p$xb<
zA-^e^4R4y8fJkO2QS+bKcLClZ49*K4O`OfsOj`v12o4I+*-Fb3!IxPui_Ny|0f0|v
z{s&`V;=}Sf>$d~}!LW7I{Gx`Ir}l;)oB<9JM03o)l!oQ2Xo6+Qo$NNnf1Zw+N_h<@
znLX1KA%E(t=INa|Pf0>T7XgNIs?emu&049N3q_s{vma)!;vk8j$|MCYM0jijOh))@
zdZY&$5s&zC9hzjAE|}ru4mff54HWVs>v(UIaRO9Week&*%_I#f8Mwp~SWh*~DTaq&
zOe^x4l;X&wD6*HIas+WnT@K3#Tt@`>{PPybi{Q;Q!^|MZo(Y^tGQ5Fw_ynYZ(Bf+3
zCmW1`JnBNu!Y4>1g=}PiRY2@m(u5vVJrHk~BGzP9DQ}<Syl{rA0Uh8d2CU-zRQMC2
zG>{|$&I)9-#V})12%I8a#g~sD09+ga3c-UAbCge>XJ-P!tqdfBA>%rTvKnS^ifp_T
z3TIWqwaYoL03|GkYeZnF^JDRvCP8SyY=GIKa>J3V$1$2}Fh2CWy`mZOc7jp5F|JVh
zHpPI28VbYp2pI%|GVuodT8QM9!nZvU_gDnik;Vmn2cm~sPS~3lJYAe{K`!FFWSc$J
z%&`dnX^|qSuKK~ZrPTv^3U=x2WMtyL?xxOCfIPT7z1K#5A%Y4Efbe=uFCD_o_mI4z
z?SUC%?*c>i`Q4fa7d%f!Ei&llC<X987pjuvC)wnmMIO|UNows?SNObwsLAz#8S4hj
z9s=^j8rBywa=;+P?}Hx$m!x`>mpDiSB@xEZd~Ci?uxCrYV=76++xx*dkexsnfv{Vz
z;d$q0KnI@86Tn8q5Yq$Ds~^(atd%!ToULK=9?SnMBe0+dDCz(+fG8-18F2y>nfvlG
zzzLvbitI804N>3{5{Zzmvhfp30My11JJQJvrGP!-k4&kJvFtp3`ZR0HpmBJ7dwcR@
z@C4Xv!5Jl`LYVyEsRV!s)D7MS$P_{#P9iv?JXY!WLdA#$H9w2_lu5>d3eQq0TnRbC
ztZ+kaiSQHYRi{7?nFMoQ0Rjjs({1o4PEG=6dER-+`7%GAmxMG0eO}ir#>~P4pu^su
z5Hb9?IhY&67x32xSN5T4Yx?W)9+@mYsI1KI;|T#?D{=KF!~$ho6B(#Sv)ESQqK}p-
za{S06)ZE2(S!9JmRq-=XXL7OkCn`Wu3QBfK_x^+ui5<aW#8K4=2-M%~b7;Du8`wSA
zRok&@m?6XB#P^5yC?3<}X~YOXJ(v8r>`z#&K+3-!Cf+~=MJfgw9@?VEgz*aLg>y5Q
zf%Wk%1S+7zoM09`;vS^U6fa#}=C$6U!_<2wS^dUe^8r=;TTdYmsKi4mFH>g1iN#?y
zJtiZC*@0NAEj&Tx8t`;OJ}05`R}W)1jxw1osQp^WuNAyMB2^MOB^jKAuO4S=RNe@7
zWYM_=+jl!P&yYogTm)PGsHzthv4pA$E+5}=2+&LOOnmqW#9;<7GiWY9iE|3lz^px=
zc_;<eu8;$!q)AuVFq1{o#!oWCC&&ij;5~v9!`MiK&q3hGj1@A0{FF>0m%OdQd~pV3
zHLif1fiTD`oH3tJ=m9bjKvvB`KgKPvW+9N1v;3sV-c}&L&cmuP1{@b;)SwPQMLyMZ
z2kEe61{J85iDSe<L=8Fk3HZDbh%V(X%lYeo{F9%0t#b4yFY|hqz(*ZgD?AH|K~N-&
z2sZyhlZ7>?@4d{|%cQc{)ynwS%a2x<5g31%AS;y{v)hu56v-n+a)nSg#c$}my*NWX
zPsfG|PD+&jSU$zFHi)>;K*{DQIK48UbY5EP&}1G`Td)4D6N@0?ReyGP;MGS`Sf&SL
zXv3L|_c$y(6&_2Hg11?5o*C?B30CWAfK=c`c??~X(ga0{mEo^<L<3akl;|@V#t1Pc
z51_jt76K9#BLOb*>~7@-TuU`P?W&Z}g4Z*fzNGSUda(r_p5Ptf{Q~bSVs5!w37c4X
zW9rwyUx?2T+ROSp&1MLPXvv^%Ha|k^4LhVX3PCIESspWU(gXy282cf;GJ2GO35sbv
zVhG~g9HGTznehp!T#P8HDlq{<fxI8`&uqnrE{A0V790US|7gxwm=Co@CLs+FViIw0
z3`>Aakhpg3TF%12P#&WXhN4j|c$lKXFa~~-4Lf5lgh{|Hh;s>Xb(R#!9OdA0fy>DT
zOJ4A2TTC~212`+A6lfVZ13s)53>lv&I|=}2Kms{t>;cH6^SHDf2y+BvQfS%5gu#uh
z3zE$30g$D5NV`G?41pJ^M$|zP#IOZoSOS42jtp~&BGW{`pN8PVdC57qxRH;&)b73H
z<<+BE$Xe6RqQ#G+u>>MWFTn5&(#7}QFBTX00bo@V6hY-zEDfFk1A`xUG=d{^S=5*C
z36E8HG;VfMUw9s*fueoPRAY63K>*yX;o$Yct+N^wk|Bge6nq{H5|I$5LGYGQsi+VT
zl!L7tv#B@w-zHrXzD4lr?Gd;-dr1V%Xc)H>^_pF_Yc?S8vI%FNQ)fW))_BcHlBg(9
z3FOp0dkiAp*g)Xr473m0Oot(D0Os&PEGuxwM-A{?AxO3c8>$_|2vO9m9@CQ{kJQ2|
zK<=}s0+Kchi8mH%O6sL`6bV)MV38Jz1bi{Kj&qA5awPzX)epkLFoUa+Fb_RXEOoss
zX2(K*!2lad{yeN_%a;X}l<c!`90%tGRJ^m;ny?TI-x%KTG2?5;<TUVmS1MZ|uQs!V
zEdRTV!1Y0(#oJ(Guvpjtf=W-0IRp$CmaG((%+!+(jZi#{vv^maJHm(pex|Z<IoTBP
z<;cLBBL)@d;pn%S?&pZQj&#a_+g+arTqJjSV>kmW0P+a52+aZDGzWpkQpAh?1O+p#
z#u5BUhaA&V1eIZtJpK)@2GkajgCFDP7_n4pV`_Ew>{+(CVG|a_7~)HSKqI1@0z**6
zTL`BRUHCC^&dy8D)%$Y8AAhM-VCnN?;h_**zV5C}C2N+U$p&w|ZENuZR<uee<X`}#
zu&4tAOmvgFYx{g`b;0^OQ8L%XwgOs)o{|WF81OSn3p5Orcpjy}0{|tUF%{N`O{lj*
zN<+074eF-Bq_)M*y*aj}#x#qvP|SIB$75{#rr6m4Pn2`QL$1h^X7k<9f8{aJrZjc)
zqFskjxif9SL^$4NH-gz*&8n5z0wl;Lg3vE974T5ebf$&sTCjoIQ;?M5g$IQo0Jr10
z8CCG5g5ddaI)?v{h8t{1dsQ4ALaam6q%UPJYR%N4hK+Q#q(&(eD3%x$<KgKILX-*_
zNWqf02U-*P8w%j;&E+|XdsXtJOP%nv>9b=7lA6;CJ_npT!oFnGn}n$QoQlY+Oi+lS
ziN)DqBL}&)P_FxsQm#bR>tKo^OYIOtS#JvD*ltVkA>evZ(<`ENCdm!M)cuBDupG<Q
z`D`U3?5F3mOO;%$RBwdpnnCM$O<$)RnHE`0V0zEAe0Yl^0B_?7=>in5mFX$Z%7VFO
z7%&zQOfIAMC7nEc?CIOlw+#;u!`qbB8Q#Wtk33KZIGHL3&H*EVPe7pI7VK4_hyu(Q
zwn5=<aadpTc=N>>$_siYPMn}Q2wcujFsBvZXNul|Fs;K_g>dlT!3`TWFceV(588;U
zl!eA?>hg9>Ktvg1AfIad00bbcAVQ>Z+Kpr)mJnzqz+{FHEA|Y|$Ul&B6_I5*UpN7R
za4g{f|ERcbM|aowSQ>g8^}8{>)F>T);O_T)>_a<^u=cP1^P!0gW1XGdaN>Yf3P8nz
zxJ}8ng><%)XD4092u|gSbeFmn&d-#h(G)xKIc~(QSx&*}O-9-5Eu`0`CXcV~@Alh)
zsZ6GK&CuCQCK&6qN~P8D$Up!2_v~D&J^aGx?|$Z?aK~DwgnuqkFwY6Xs=WYBguaLt
zua!c^OEb@&xk3Np4T%AT5eV_(T6N2>yJ1!ym&c>-i4WfX-rL``%Lr+|`X9d^_6K!3
z$6BQkV3)O-)jcu0oSn&K!inxiyniAOi;hCnf~>k@T!DTaB{0f+%^wb#b|vq#%h5<E
zU(EET!uG_8wp90+uf5;~(L4lk736RHn_s_kZ;$rEtNGvg?|;!hbo1%6Z0goqs91rJ
z-w4#I)qDt7)xasMCxH|y4~!UKxExyG-XI7sxvh*0Ji`+HRU;zC2`Gattk-?)U}FRf
zy_idf3}0tkyf|_C-d*c|<ri<mdGUYvy%!5-?XVGLU1Xt{b^T7GGBcE{+70boy*tv;
zRhp?_JlrsVf&W8HkBA}}&_%!+u~4{NnsR;TV)5$K<j9)A?X_amuIZJ+h2D;2&X;i9
z+JE}j_Y8)$L(dle*I#`ut0n8v?yS`a7~xPLSS{ulsj!=Mt+u>B$R!QB{NqhQfbTjS
zi(@MSIe^+e^UO0>ET7DfFt{u#G`5_8Sg32^87e=-7v4Y|N<mS_j~~ZhEZjoT3oa+X
zMPMQ9oCp7CrIP~rP(M{=$UvYdeqOOMODM4gZ!7W>;>@!Hz8aap<k$z~wP>I=25tOL
zzxEXADv?51A4zy^p0`5^@=3VjlOoBaBkG3^9U`KHu!S2bj7ijmFlGvQ19Hyg;E&ip
z_Sj>psO94o5m?&%Soo&jpULL@!7#geu!{nl>vpFa`*#d<nA)ikt&(Fmgb<VTmR0as
z#bn&rwQu{aw{GrBX`lQ1haY_QnAw|bW7p#ROk2PW2TVM|Rx8!I-+*q2bI5W*tCw%N
zeaF7N@7c0G6sT(FMztej6W@L2xuX|yrV-4JpD#~MMb>uh-MpeR5l$D2>=s9*u88$)
z`qj=}<)Z7J?0qWmpyZ-3XI2cO;eb}_O}V$;G}s=}P8GBQ1lMhJg@g5CNjCzWJ#G89
z4c>dlo_JFK>i3@i>Z30QBfXZWJ<<P);!<pm8<<dFNUK$9R-0^40V$kZyR!f1Km5*3
z3Aa|)f9v<Z%vP^<U8m2b70;imzoW;$zBe?`ZRYa$pZ8fdtP%~VxB}7HSOl{fpW}B?
zREypeY~@a#^7Fa__WC!x*xb-o>hI<D8Em;?J1oAl9UIEe7AS?8=~&Qx_no`ZWgQ;Y
za_Mu3bj^ZQTVcZ=hy?C`*WDl9)y<abpMK>0cb`5H2?ViJwy+&0_y7&hc%p$|pnxgH
z^yKKiy=(4$=dJs;c9H}8jpv`Jo;-H`rI#lsXC@noAls}QebHTai@sxZyes4s%OwnF
zbYE0VI9S9-+o9JMTjk}jjKGaY;4-Qza?V)LJ6^4FoF&kUtDMA%P!3^kx=OiPA#b;I
z(3tzk3N&?`o8F4Q&Z$<O<;WXD0Bn`4zW(K8BP`4L>!b9>&Ms|!>@lTap=d=C(S{{^
z>9Ke8s*Qs?w}>VCQ_n)Hs_|rCYGxu9)VFRNx^?&Vefvq!aEZBL&FVv^#`D;>v8;)V
zjd~bTpt+@5)fYsu=QsRL%yd8c!S~&MTOw+D1_uFc^JZ=B=AL!i-}i@qaUe6DtyS&k
z554@(Tkh!$Y5TTrfAEPTdL*%=C#9J>GlB*i+}XB;m4sgpVp$C?iCBF7Ko>JQFTQ}f
zg$aIm6e?!3qo?OX@4tKNN^Kw|b_88*38z}}hgodL5<8f29shcMF^~m7*}wut{`ly~
z_H_e$cHDByZ5wVL)H+Cm-Zq>dw65NeefxC%=qrb7dv5Aag?8`U_TUr8LxH5`$W}wP
zYc<$XjyAL5+!v|m8<YO)a8+}1;1ZdT$Hf6i#d3g`-K1k7>`a_$F1K^bz?Pv9AE*}&
z9^o;=Aw$ggqWX&d!TTP#Yu83?z2~iP^<ba&k1sQ;>DXmMM{8bZIUlwVY;=eT5W^W)
ztNrS)fAr2<0<4jtG)8O*pZ2ah&3E0o?pxmv{@phot}zA*1YUUIg%8}atG5fO-hAfZ
z8FrXNnO&<hGLrLj&ZyR!eEvEhwS0?Z1g?(2To$Vc!DUSDvP;!jNQ2@H>TK(hxqO0C
zt@O(Zc^m!&iWVBzg7aK>--d84+#>@%4Z)FHv|8k~w^OG)F~6wGXysp)^VcFauaqUg
zFkb(8d`=@8j%c<l-N$&<v90afHuF>%AHQ<skkhEzZV4;d{kPxriC?(wp4;OIZ=Zxr
z7Di#kf+4$9YFM_(zND<)K-uZqL0{d_str3AjX2vj4cvckB57(TC$%pfIQE5ao_P9&
z8f;@jm-dl&-yLc=iFo_LgGXixqA`B&9oss@G~f~{^!k-wd}rAv&-ON)VmvAvf{3LV
zI|aIpn{K*^+A^2<{PTyfYVaGVas#{f?)#^oy8D(L8v1x1yHZ2bTyIas*dkgil6k{5
z+=*ddA#|Atf01yYr@P}rAA096e`3RyLAlDp%9d9fPDXw_lr`|!;6HU@c>IC|s(0S9
zHyJjtl*NX?ghtk#dJWrguh&?7@V%x+m%Lv}GDw2U0yGR58xhQjF#-<z8)3tD>+W69
zSq2uv!zU3l(1?aYdv@>o<j>u;Z;NKi!aOZoz@cEZEgoe=fvJ%ArvR}=#jjy!EMpSS
z)I==683g9)+7F*S`Pid}PoBx+Hx8KJ`>vJUT|L+{Gq%B+^X!;Z`QH2Pj)d8mT2}E>
zJyiG*Wzv^Fey?fJ<%Ao9fHJ61_vZB{T*e75tCr#x<Uy(lM7V@R${}a#aw7axiI<h|
z_WU_toB?oN;q8g)8+&R6EY}jbS_#W?el=v?lyvDVmTrJ9RLiBdc%ob^5r-(mBIfoT
zTZ!ij+W5psxsuBk)9qb}y|-`ZYttr5+JVPUnVRsqx>cDTAFErnL?r53b#d^9qSUq`
zAz5<g)H11re&_8wnC!-2>TkdD4`2QMfd_y1qc42*tEbOc*azLa(;s2G<v^%h^FMd!
zH1ps)hqTol$$*;gUb2`vvUm|yBs1)=U_5_WPR%ld#sl}?1FGjowO3y~4z)7^VY{x~
zdh42@er<GG`|kITF%uCF;sy=}$(B_rLUx4^V|eT_W5}FDw0dqPee+G*Zr!039qpB2
zZTO593u{4$IK<bf!3Z%W2pOSlI(z*1N$$95Rj{`sZu)Ul#wy9WuU^9pfXvPHc`cm9
zoMUlWa<{4`g83Sg8%&4UbXA4~^%~aLnqBFPhj;Dl;E~71Y?dMwin(f~lt{+5Zyj3K
zr;SW%KYC&e(rjS&;JT&}DCcJqk$Auf8&1>*8E%9$7jyY|qY<e$A}3E@_}YPq|L>0;
z`RxDv{^$SdiNE>tZ=ODRMs^a@v}AjScs{ml7So(MH3ezjFcf4Z4RqM+4up8{nS7m2
zdCsFPN0$+}0SL4>+3VlCmF+TN=BG%xN19*1gtz(G7Jyg7p_SnpBUdYDIsIw~C_%m2
zS(QE){JA+v|Nq;2vslTJ^uF(8o-I%AcdfmuyQf$7EUg(0NzIU)){KUvD2SjS2{1(1
zfCWjEK>5KiU>oq0pY-4d1F{UrrXj)-37ND_bDfRsnbFK}IK9l$>-6+e)m2^Vt-AN-
z&E@3Dvt;`BMP%m9yHr(X^{wi<l@YjgDl$%-SpE?w;ve7n;tTyap2=|IpAvz?&5x0A
zB~cfdWg3NOq4u-n)QRIKYh|`+y!hhPb~h|m$`}yqMBO*vy8P!auKnx(^>@GdpPo~;
z+$@BY>=pBD-@#x7*1|Y(Prgv2aocyXl+f>1E6yiAip<XX@lV!Xdj0M58{tM|zkT8A
zJ8!?qS!9^eT3xC&J4t=@<aeI`zU^6sf%WK#MbWiA%ue1Yv1^>Mo6d2t$q$zO6{=da
zx|#LvyX!<Kmusj?tZy`-%U}L_@8AC0Kl;-zeMwgu3!?~}C5;IOupmo1{2`ELd(GFs
z*Ztpr>l?rQ+yCLxrT2A2NI8@dZRfn>BAHJ5_M1OB+f6MNGbfKM4&pF}W<?UwNDz1a
zCX&Z{Sh}$+J#b>nqMvzuY)B$UB=lk0ZLZWxN2-F+kKcYb>S21qaa=!6vbW#8^3Ct1
z|KWGO`P_5Q@ocFirr5Arh?KWj2yz2Amla^f{AKh(F+<^JIqxr@f8`JU@Eb3E|6;N5
z*ka|0ci+B#_U&^#PX3EaE0tQEmWbk*4be#|E5I`+Pid8V{cR3@%)n~oYSLQ9-3JIT
z?BFQK*fV9J=+C-K>-aj6UC-_txGT<_{2aqO_T0MS8s6!zw@%I6okU>V1k?0S-2~&u
z_kCpF!0{u-vloHGt$z&LsFeJ6t5HNl!pX&*?jw&pNhKk#ue|&Q9IsTawYoXZ`uXyg
zzj3Tuxb)_mkDpv_N0tpG&a9Q?qmApBO^JQ4m?cfuE@1l&oy)wNbKG1|v{41^gx0IC
zyy2Bs%Jt=1@mO;0+^es=_A6ib1jYi+oIdgP`LpeQ?(F3?^a&u<4;`)MUxjQOVkbHk
ztT!+kW@8}&l#+rUJaqaLd%RFaJ$v?SF(}7gp&O$A>wNXkzu5}c1G~H61`s=HZaZ$Z
zw&44V>rEt~6SP%rp<{kqK6>o!v*#Mu--}x7eEjr-0#CsP*`vmOK-a-woYRTeX<j^g
z;TmF2)+djgI`-0w@4A)`fkx8Yr(js+liYT=-JE&H&HL^LrA1|gbsd%~3bJvpe`0l!
zw-FfMc<WuuDR_k%BH@i@=X)=`^!4Tj0`mo@&0SgsX*7DtLAeBbW-Xx(U2(AV1~V30
zc)1vZ898jQuwgYHIL>NrfEoGn;^GQx9hv~U%5dLOsodYZ{?0q^c+Ow%bgfe-kJ~T(
zB<W+PPmUcCgL(bFD12u_`vCVHf~G+8gutv*yN*A(z#~KlYS_l4SqF}P&J(En)O8Y~
zeQxgwGPv!5<X{4HL)^AGbOh7S4+iu-sZ21KJ@*blK(1e3sB}AxYNhD893>GQSy}EX
z8v$rO<nn&2h27#pv9z+;NDwhEES`jxR1218TkSAxhP`ggG1RcS0#Y<g3Lw1fCUW4e
z!<H#ILC>ldt@X_?Us%1gk-fj(>kWd7m#_I^0@>nZpyJ}<*(+-a#=21h%~^{z4=s&D
zfZNTAh-j7rPL$3bQPlPTNRpg^gvh++TNt+F(4J<i1H(Fg;`GH!*TB&6ix_@}y1|J1
zdG|(x`#qGG+14l24MR?N#E<p09E7jcOWIMA<(zgD<y_V>vRt7J`k0`03#nzddzs~w
z*PBsZ8TmbNWEsU{7q*Uu23E(0=6%O?pqa@0zISRj;c&a25`)2HlZ3UIQ}ErQZ?7)a
zs0kQeytdvO*u5-=_`DOthdodQ7W5)uw=SpLI8LJ#7J^cUDdaqMJ=iRX(A~#~UdD4a
z23UHXUR<iK1m&eGYn#_LudSX~dit4<f;C9Jd+xo9mo7WL$Iku>7cP(~DlQ8PHKsbm
z($ZFHsez<_a@f7^@ihf}kP(0fXg97T-4!?fx!0p3OkRmIPDF=KhU`sY4nsyTi>O|^
zF;Jrb^Zz|UfVLo~agV)Ou9H1{+*sqM_J-Un9TUD60il40*$tXo>j-+2v=j9>N7To5
zU=eAYWo@jtUAyc#3%Oh^j{U`jN7^mORb9S5D@kXuve4?rTrJ!0a`au^345u9+}QQv
z{@_Mq>(s*!6B9mhW37vbIagTiWF`192DYHU$TeXY8||>Vy0X!2qk9m=)}xP}6k`I1
z*<nu<46(2Xd&`m@Htaa){rJ&ECL(H$YB4rM!!WWPZ+)||a_l(dc&pp<%hlFEcJI}e
zj`*cI1lRRTdDkD<PNUuR{1T92iVuUxaX(*MIL1*F1INpI!TMHnh?nhSAbF$JwcNm~
zE=HVH;gmPp!WS1Rvid=xfL(u(wgfr0=)#B?6_am6uq9x4bjW=LKjl(vb9bRIs^9HJ
z7zzbmYQrYThtHfO5@>9MNjI~-^5VcP4V+@Hp9jyr?KV4Xv6BwT@}aV@w7%8OJA>41
z_WgDkck+I&mu=?UW|F()1g)^wZ~{P1qQU0!N&91;K2s`63%>f!V0~-r$gvZf8(WBn
zY0X}2g^_h+6;sS|KntbShGw~rjmz#Xz4>hrxVH$fRMT$E25y}{(TFf+dzxwxI8H=A
zJ`TBcYV&h<1oP0mw1#nhs(=5~blfwMSc%Z;CQdhy>E}0r=Ys~1=*0(3=R<p5WSne?
zlcViK?8#|85G97j&5dRkt<EqN^9WfXOWa-;BMN@G1Vy%*k=5=-g<{|q{6wt04!mN(
zsd>#-3o96<TK&k;6Wu6o^WC5A{4t-aIclurdKD}0_n>%+reNNz8FmnCrdE3S`j+Qe
z0VkgwYKNE?xi3doaF+3CqqmdKdB`kPy)a0Q1hi7Cf*XfGuWxJyr81U@!(PmM1pV4<
zG&zu}R4K1F8WF<zY)~qf!MR+nsuKz^rfj#v9#ZExwY)-5ES706nj?ylLl-bQ2Bq%w
zkml#to8kG3A=y^SY-6L5MVyJz2PDXR`bWrl7?6g|%HPhz^JKdoU5-(YilA5`Clp{_
zFu){;@Z)aE>5jS0R)`oL$uTnZ+R{p`zU-IEokX<2J1j_iSqfqc4+|}9T`ZJK95zx2
zShjEsIO#R(wOVVdF&K2Kb-PsS|D%8SNmefu<q!Y#OLC+n(&b7OZErBVa;1m)!eWtK
z{8;T1-Er0O!YvL1!lsM{0fT@+z#w1{ICumOJwJxHWdk{f32fj6wVE&toSlb2R2;j2
z%0R)bcy^GtAa$HaD(tpUu%H)^4ECS#SE&S9KgMoC(oYbPuC3i@b;7M?x6JqbfwfX!
z3@{ha*y?OGJlm<(7oi%7Xw7D@I^TEM$F|YfBwwc>qU%HLV4vzdDQmc99Sl77iD8&R
z)hdd@G)0+Wz%f>ES#0H^pMo9+Nb>C-^;u!nkiIlqsxM%B0an|KqH9;KHX0l4PLn$e
z^%XxTqwt-}p(l~&06AK&T%he@Y>MGXimNCw7fTRmC>?DoN%qmmaM3&Fh-5Ti^qC-H
zu|nxB2<1e%<NiZmN+rCMMH^SqLZPIPdro}DP)96hdeI3JHZ+z>9(Osdt`{e+S3oI6
z(D4e*Hs=OL8(R(6^#io9v6@1k&oT^sh>U@gaq?a^aIUXywzk@FJK|`=AXlh+m9!PH
zsqyG)`Jeo=Usm~AzyGH%zw*jkdDm-)5g?NY<yu?2${SHDvlPUZ2Kq?sXyt+<`9rY?
zQC-tq3<3rLgMdN6AaHO99Pa&j-~dBF_=bEF1^|oV<VDToGR_ji*jU_`^Cnybpil~q
zmdH07Tg>_ByACh_5Qium_c)=Umyb5D7FX-_YB8u((VFmbPPY>g6A2sk#nQ`Hmx?u1
zE8@=P%9#a}1z-*+82F@|vpm;>)lmTUNGaj<!L`plPYEfnXk-u2zJ@5X3J_`Hfb%_2
z+|IGC@CGdmE@aJ4r&biRYf;!NIuO$wYX-3L+`Mn++ub%A#tREeMb9Y))j<}qovxV<
zP=E=$k(I?&$F-7n4~W~XI4h$L?qLls!x%}=D_3I9)^z%I(uF+3c+t2;TudwbIXR&k
z$u<-{hoxSau^jTIF_Hr^;Gl|(-Y=`Z?XqpRqmW~i)zSU6eV5fk!S$1{#X<~K5d`oQ
z9px5OPvU{V<>hl}vyrp2u-%O786A&R1%~0`u<heVjlSLE(?<yv=6b%sasm~LUOU`s
zr}@@urNn{i|MEA!aC*^-`qsCeyYSrCUaZ!S^jvy)j<t=);)nHKxx!vc-Vl}=ePuXL
z(_&4K)$)U`ZJa^CAYc$M2p9zB2!X@RkCkpG$4A2(Fir(Qvm2c7wkqvcDi=Dff#(m(
zejcq?4|`Q^FNx!oDgro>bYcK`kao&}mq5bO*2+>4ch<ug30l<YriHSfb(0Jo0PIv8
zEhgVSck1-VunutP@}*NpKMqrb4qmx*xnA_+Rx7Zx<EzKul1YN7@O~j@chrj!9kP&s
zX;KTZ@FnLA53H@No|}V@S=DOQ4$&=k@?DlCa-eawSjhD{ximq3jCtL_?iXEQe$Z=&
zwMn<x2)BH!g$U#36bqLFL!iHQB}Mq@dAW*j+ld%jt`zdA)hl`f-|YupKkIT*cUs2!
zw9>wSIHLN&RyDTq!PVKmCo3XE&?1YKI9!Ao--jvzz^96cY+Ho5O49-nGKZ$faoccR
z+eRjy&*e88@tKn-=~zp(%GO4NOxY{SS$ZWG@@kilES0@DYDKkbp<KY^MOyJ(FikB#
z?C>4qGM_loxR1t0K6UC=&C9YQ#oT}X>%a8G3Hhe+-ka7}{`fonw8A^W&Q!;%vk($z
zI}bgyNXdFVkxA!rybM@=8}?x-AP*a75HJWB1PlTOfjL9qP?I^@3f5ugb3WgfIe883
z%l9r^VNS;#J+^4ODJlRqXY|G!{Q;k+JMGR@Kaa3kB%K8}hl#Nm^JH?wLM&Hr@X1d+
z`SjzD)B-<jZbhw5Yh9T1XFv7m<EKiSmtNbt{Q8rpmp=0|e~m0QJJ#BjtA%_%ZZ+L3
zTZMSAv#KFQQu!a=3ltgM3eFS6AaSa7WUpPlN`0dKpj0gAK-YJh?X7ON(+iv3=0?BQ
zhRT9f(%$Se*0Qj5WU-o0LMQF|x$K!IAA91Flc!ed(B&G39rn6u7^Yzp+FV5g+#Wy*
z!;Nb+2P=)Fvjt0b^9lCOjxPv|7cV0&&hrr<N>bbNGb~l3xhr-SCdnJ}<NK7#(OWod
zW1>T-ZH4N)2f}$&MRIvgMt9D?e}%3?<du4bRawb({CvN?aU+d8al6spT;rtgLaq;1
z{a&ZrT+gC*ueHh1=Z=-+tQ#MDdiBYNmQO9!*#f!j`Hvkd{oP;u=%=5yqR9IG^H+ZR
zxBj=D=pA<(n_C!H&Qj5zVEsd>N`WjZ*48+{JK|V%^e%Kb9^y<C+jtEE1_6VBLBJp|
zF9?Vr^^iM6U|xdSWNzSlg?HaQm;d}RYP7Ui3Ew!sDYD9}*H2F$fB4K}kK|g5$16eD
zvDk|D$jPH0fBMnJptv6AFI~E{(OLh<=@b9*H-5V4SbzGRXa4+8{-m{j{rlhh(PLkD
z^7yj%>%aE%|L=1@DCCyD_?e%6;*5_0ffrvqyLtUa&RVTF{$jC$=oE7E&Rd&oyE@c(
zLW%OS<ZzH8qloPIJ~nzUU%t!<5~#@@J9f-^dov8%9Q}sy@zGN!gIu^=SY0VZa8;<-
z>O$q?Pd(NTj=aCtYHwbDWOeb0hadZIe)F02mi65qME}<xd?{)-Dh2Q3Pdyam!!+Jn
zDi!)+BVSqksmITBil;26^1`cc^>Ut@G#8E)ffNRJ?))W=OU4FYyO#vzN;m3aWf}H9
zR6$mpQu8>3v-caJQz{k{g+h0G$m}t?k|1YK(ir32h}F-&cb;z*be?0&_1DjD7O>cc
zWxO<c^6}H?XSZ67hmLuEo`W8(<+A_Re(KSsV(HCy&NbTCf*^hLr>g(MZ+?vPy<dMm
z`MrPl|76Yezxj8c{gr2z*v_|7aIal?`O$}#3WcMs)a?~2z4Fpm{^~XK?$NKP*XvP&
zRt&rJHn3LVSJCUpu>=Bn__2H<Fe=E0`h7NqHwYL63<3s$J3`<d@MD>3Wlx+4`}j|D
z1$Ok~Bvd)RpA$e~RImhb3u<(mt4fP+?2a{vQJzHjg@Il^!js&|M~;_XeyyAJgQ6c4
zmwxf{pZmp6`bDLA441`X#V0;_@{v!T>{!-w&-Z`t_kXv#TC=TQS+p>%g55_ow!P7L
z>9w<8{JE#A#r#iy;<2Yb@fhXf+3Ref%30rg>2)kM_(8l>D=*YJJ5ThU-+Avm>_^q;
zHYJ~^ooLVsH0t~N@L6PgfF@z;*kp--@n8@^q=ybJ91E!<8S*CuRwN8gLo$+*Jd2tH
z^hUOU5rUhxJe<6N9X5pFa?%~S=50=MzOmH=QrGt$edJ92g&#LN8=hZXT3!74zxmfc
z_i1YxZ4>A*s=x*77e4#M$3Oc7k^lG`ov-}aR}k1=x(LS4)GAiJZYOaAm6eZw^rQdc
zpFO@<urA-QPA}8y0f_$h|J{Gq2HUQ+`M-qMHaD_XT(8zpS)qokT{4>3&BeW_N>ld}
zJCC_H-Bm;a3Yge0kwWXmiR^^K5ds_|7!@>_X$D#mJZstWNRdFrpXwqBvVCnNx~91W
zZQvDUXlZ(lx4xqCA-5)-G$UtyU_UPxr9B66I9d&^*B=xwuD7T!=LN1VR@3fQFHPN|
zwOFnEoxlB=Cmyq^C~S8vJCF^0&wlRI&wcI`>##rk+GBt6+}Aq2M$+ks327{-V5uc*
z6ifEs`o+gLv!skwI4k~pzb5<hP(h}EGzK<7o`2=Xz0Ow2@1Hz%L|HYoZZtY*b#o+6
zKeLmJ)B8AJlHyATQ*O%sLtNe8=mN?ku%L%CHoZqW{L!%lL*>`$o2x;<AYc$M2p9zJ
zBLsF%-UnYWI120m){t_LfrV;=Fc{3SVf3KKek3MIVKMTPKlDdA3``!G@=wpNnCVS>
zP1Gei?HkvwuV3q0Fs)Dg%tA8Qa<Z-)Ms=qUH~5Y%O2hDjEz4@;t#w6Fg+PVrqT}+l
z`t<2lK3;=J+S}md5O49+D_64r_Fw<;H-0kcD#HTkHn%M6d*`hG`d|Ig+t(texX{l<
z&F=c&c=ijNOm?+xy?^~GTlA(ZJkiKaxTvsT)O_V9G)o?bJxH*_8Fj!z$(091RhKh_
zIY*-pn7N*SCO5_9G={R14Yp`QmQ$yfI$fCb^yMK})FJGwU0q{%WlOV;CdN)OvAJVU
zY}*stwl%SB+s?$CaAI2%+jequ=6v_N&vW<R+N)8ks=KRJy>Clxo73GyBmuH_UM{pc
z?QNu{Jon4U*g9NB<W48<-U(^x)Nw-v*KfYS&BrqUAI0@)Dll)Yn&UJw2Jv+W5!jcD
zEH60mGl-fQ-aL(kvNd#?SQT>k6Es1k8VLCVmhr8$T|5^*eoy*jI=Nle=(N`c4RbVD
zbG#D&xvm<MK*10u)V2!4^^cu2_M1ndkkT!$1T~`d!&FIRG;#)g7@Vo4!4a<G2jz17
z!01FoY1r-HY&4igpcu>*4=5L%t13(X9_NHDL99eUG}x5X@<~_9MFtcJ0Go*kO0wJh
z^(zgOsh0cYMsD%#RdRUvj`;T1Do@<S4ETC?TlBYcWN>XuO`t3t<P-abb+!)2xaUAm
zYziyA`dtq~g(F%0N>hZ7{3O*3%k2|`?X(|JM^w|D-S~CyP3z(D!LQ5ZIgHp}w~ZGI
z$3WCNH$Q)w?qpS$YGO&qqNA4>L0>X*1QJ^o@nJZ+i}v%1&{y?7cBZej4zoNW0A`zW
zz~%`=4*4T#5$pGeFd=Jqydb(r@ezzj?tA9CtiJYaK&UzqueJWv7brCd6ey&vwRut7
zP<}7sOB(zcpbS`;I#rT5`bkdV@{v<Z&>OKf<g#%OzAw#e%;7aaAO!b5n1dp-2h_+n
zh+5p?i)r<qL+iQ?s;nqKf*A`!^FG1>*CEhJBYT@vxw>uEpU682aJfsj9&gu1_A;+o
zg)Se_>M_3s17-cpFd5y>=tF!W?B-JZ5NaB=8qv)YOAwJNzMIv%d&!Jv!PjFE{?%u!
zn%JmtW{IT424Q%E_LZrX7#SW`w)cs!>&&A^m*>~LYqmB5?80ES`*BrH=1Jgkl$7O<
z`;Z^^dwejgO$)CM`vo4j1+1~u?MafYslu8#IF>qqw;6ixxVZ^iwI5Gew$+uDTAEU$
zdjrii8;=AZ2Ossy)igEu`kG5gIF{oo854u%?znfJuS45}=iKi~Xu%d{gY6i9>N5??
zVVfzKA3|L7q}j{ba1lEeVVp<?NnQo$1FifpNx^5xSaFS_KoRW_m(`#|0{Bs_yXE2-
zE4W2D%I9o!SWo%%q_I;1E_+&XbKfBP+>+n6k!n7U>#5a1UzSaI?`P!bXfPr$=yhLJ
zKV~mzUs45P*|f0Lzme?~>g_pDTDe}nt?)e>80rOSpA@C|_^zdd9q*~?{<s11H8`A+
zS{rK7kGcnVQQE2&*o1#?IZ64Wtv$x`bQsc3?+pUNeJCL;pe!bv%_%9c(P8q<lVT}6
z&2kQa$<M&u%l4zGnUbI4Oy15CA(dU<h|(<@x7IhfxGI*lwPIU5Qo>H`s^nF7MR0+c
z)E?CXh$TgiOU#~DJb{rs!7GH*>2^%Y%s8|MH?DJiov~oZM-mZ8A_aF);{x97w6_xV
z<Kg1hE5Q#wj38BS`thQ?sUYr!mJ@~DhIE_=wDPm}>w_O@R(gn&ZzUI;MGjv%KlW_j
zap1S~3$V4nejTY3?=|`jOs27qm3YudNe<^g)G1rwt{mvJpO&o4AF~<b75xK}>|Zg7
z{Bvox_}1o+De3*Lb`T8vtax8ApoJ7Y8to^A)esdT08l*&XSMc9#BW=85QCS%89;UO
zrX<JHSXT!s)va4GpL)T@#-`tJa&q#jrJ+HLiy}I7ce1#+umBy%c$tt&0Y^(mCrF})
z6bnCOUoh?48As?NnypkKTgU$6<LzM!oiu2NGV$b(=LOG<fP3VPhZ$_>m(L_s@n%Sy
zeRtYy;_vfldph)pZb2OLKtIIH9;iRRX*a*nRo8O%_M~O4IR@QSX6+u8<UrDSP{z*K
z_5e_%U5nVZbw8=_X~p;kzh%hW;`m{Up-%a;fVv*FIcc=9ZM7vhlJPz!YziUqu*`b)
zQ!`tl?GwH8Yz|`ozJj1c&Cefw?rV;vBy+)@C}K#x>hM^F^fl2_j5KfBMC&aL12tT&
z>w2oP{WvrI;9cgv+nxO^iHK>gh1S%9g(ZWxpslWS@mO?&o&72O5~Y&i*LbLzbleN%
zeYz|dyqOy2d)}!rJZedSw`xu$T{iHpTh}<?{z5vNS{OwPj}Mh3Ng3v;58NsYj^TOg
zS5)8sVDs_%a_%3fD#2o_d9MJm)h-kdQXtqcj4(tfW?+{IqY7457&Y~kd9rs@6zTY5
zkGjzZ7ajVe4@tX8<QrH9{#p=0c%U;?$^qBo63puQ`%?fx=W|fb3-|9+{2$i^y3Z4%
zMI?93!03&orIEV<K$?emg+r+r<XeR*CfjHHH!zu_*1+j~QlZP?y64cGLrq76Z)rGR
zwSK6XS&{MpXFLbD4eyA7Z|c#PAvv#+X@$@A4nP+dKY2Q!X9%waIRsLMJPB+(4<);$
ziM<)Gf`2LOwx}aAR+YtSoXWXYmjW@*;VY+PkZ4}5-n{f%eG9?j?0;c_ihwdt7oehs
zQCN|8Sq*MbQ_NHUmK{!UA-g!fsvrMztfv<{`RoyEU<+>Y*JB0W%@LvL+v((6Z|lbe
z{i~qety>a_0&BC4ol9<N+tAZ#)s7V!yr`#-vHg`ApZSj~+qV%^(|*E-lrAU3H%?%<
z9jp#QT)jcg5?A2hjikgRNS2_ynd^!lMsD%XpB^1egn*9g>zC3Dp`0T@y?i1gmAx?D
zZ&}ac;+%Li+R0)w)*>3!&j8A@F;_jpAykVsFJche0Ibc_3;haTU#rM6Je?q%h$-lh
zCZ;PPXuaQSGj=`jxdyS!KF320O|vC(sMREgsiUK#c4nB6xaqe~5-xJwR}iY1(J*NN
zxSNE7oB)48oyX|~R3G<LyggN^7^HA!B^*qDj$L?6KWOFLbq0<#bo6Ks9u=naP*kv$
ze_X>G0yMMsi;2q}>?SB9x^1ma?feN#7CBEE2E-qD)x2-+c=+D83fO`h#`7Ui<>NaT
zbXvars85K0NW`0Dr?h~`Ef~yS-6NmErPF#Zp%bDcBYEJXh(VI*H<|LvDU*t~C(ddP
zziltQ#6V13h2#FpxWB=m)A2sb<o>uFM8{B#`EFrlRDPZH_#E~7H@R-=n@yraF3VV@
zhVFfMQcDip>^pPLGyH>(8{NxRjLt!3j4V>{QSS89pMlZL@Kf}WD3>LxAf38?thvvi
z@*Xaf`>kSeA^T~LIaF>^x4G*%$`)$xIgyNDUiPTBm-OrR+-F$q_X9a)(oOw>i-$`F
zpOI~W#^qR$V=*_aA$%H*Sz~F4n~VxVKD79P>mjN9`1trp+%OJ4_mk7nn%ml_2e=%r
zrVbASZ`7x8$Ux?low4qpuX|JhUdFK%cAI$F1AEMoMKOFnC`NyNY&<_N>SFI&nDezb
zA&_{4O&(d_6$#eOOnOQLdv|#OBYA!e)wxUXEr$IBX1@zJ5_&MzbX^-J>S4Jo(|q1Z
zXhlZC3+7-C8>T?%8bm=8RAUb425pk$gU}R-bAF5tv3&u*RBsS(?~eD=Os2b5QVth8
z@Re?WeskCAwYMJ<ew7D<WGbeE>{rG^7)sf4Jec@(S-?8?!p!%B1+N8zPTSJuuu8m3
z8_{|<(mdMMsYCJ)j9QnzPi=%gQpxM?=LI^pnl-<y-~N2OB{`g_Ezl3B_l&5TM_9>A
z{{-R8G6vne5s9{MnT~GK;2EWRKQ+w}$vf0oeU7m8l4=C&P{B3pD-($y8!<?J2j?w+
z&j0=i<udNBf9tZuobzP}-fybSfJVXxE5C^tOD!k5jZeYO3p!z%MeUUW1@iuQ=Z<Hl
zMdA7+Nb!TFgxQkBg=X{vfDv~Sv2X+z5jA<8R5iB{#|GE|LfQ+Fz~#ZAS+DnSSdK_o
zyUJ;9aCtlu=fuNHs<n_Ap&E7`uXYU49o(+?C^GYOD|ytUe3>DlKgr$iPjPHJ@<4ad
z+3QO|D3Ym#Aun5d-52oluk!k4Jtwoc#8eAk{8qzIfwAw~B{{ksZgJJ(xa^G6m-Uj|
zj=^E`2j8~hnm2qh@TKJT1)$lG%EJ^RQOBS0$2HzL!hS674N9k7<0aX0YfzCvsFBMS
zi|adw!#HG^e}_Sa9+dKRv+*=O9w4U!E?osxDONVH31Q}+sQ!A!*!b~TLs+i1@6}{b
z;2!0L`|C8NTr8U2wqc={F6VCk2nF{~L{%+VB%9maczsP5-p6H9%2_X1h)7K}Z&Pa=
z<lgp3T>{L3v}u4~k*rJSSM$7jkJq`zJJD8U?#9=CB?7cA$F1_aiKkK7x<Fr>=W951
zR7c0~@$eR#!`LlKUMTJ<dI{aH)=#K2L~vkuNSJX*)V=Ut`5rx!nO;9J8m5yr>pMxm
zN#EVOp~TSyX-NH~Q6VLf2n!OFf2&B*d71<AMPUz5qsBO8?cdRHD(+!+^5>;f<!gO+
zfd+b7rQH5{zH^n!=6p|J(}{V>ulzmiE)r6pI#nL!>CGBQd7c6c+_}xHO(DY3E~PJc
z=p8ZUsq1=6QLV17atsVAtQKyd93*gXlF*Sg!9QX>Tyv&95=PyenpLhvRFv<IPuf+}
zF6v<B)Q}#|n^(Z|=i4?iE6kJvljLNCCO1y_hGfm4H6#ylxzR;j69gy#h8>eY&BvmM
zS&OaX@Oj)-g%CU!D*aT6&XMh)qv4R;&QuMIs9$9gNIW4V*gpwLx$pd{!-(m1(ykg!
z!1LURp)+*t!#swb9Ie}Kzuw3ZC71XmX(FnDQV2M3VkqgZKXBx_W?*xtRu%V2@uxt2
z)J|c1vjraSPu|hc3R3-y(WBH6_G~@xyjQ-VU%gp_Zd)i3lb`%kG5N=~Q7bWm6Dl4W
z-mn&dIK*KT`*W&GtSTnRIvP>}PrP_21}QvWeOI!|btmREEY2nng-H#}3DhpvxPRKG
z-|C|4g)W{IjKJeCPYmB(SS+9+4mnQ$x1AX@%TLpv0rLXUFoQ_*zH!~hyw`Ct-Cz>(
zvkyX19|BwOIEHC{mzXTEgEtBLzIAz_g6y<{sQgE{#u_Lbw(OHD5{GBEEVcPk^@xQp
z(~5SwGkFg)QAZzvAS5R&oXpYe{8cX;`P+;|p9vmPo@D5?87VRRj(`WV1*$Qj*Y@$4
zEr_xB-kCtU(y)|h!l98uiV+BlYFyc2SbLEPWupNgbOO{vC#*9n_CbFr+Ko;64LiWr
z-G1+d{?}8?`%^`vw<=u?hfm|rT3jF|(B^@z3IW%O=&N?g5M)3VVu+uUHjcW(2hPTu
zKxW=rA6==#p9fC+U-DJW1?-o|bNUwsASZ#=f+|+xfhXveaQbW=?|+JZe1`OcNgl1s
zYH~O4y}i>i6j5Ivi;e%NV?g7!56WYpA9E4IrVk-*Dd?4jxM+#mgUBy2EQ6B%@VdqD
z3L-!<4>2)QN(w0+2~^7e7QwFD62X8uEqJe?aDnuG+S+(`uW34|Yg+_2NHDRt{G9qI
zSl+N-XCa{-vgZ57?3eX19i!{{<7{N_f&OwJiV`-;ypp0n3%Uw~CF~7smOumCf!m=>
zS$D%sRN)v3|3Kj*W8+za&L*?-vP+5QNKe^s3w-#DpC)?Q#&7aVxHlCFU!VvTbck|9
z$W}Q&w!a`H1R8Lb5TM}QLWrwD?QLfQ#ZChBw%Y}=4TFhNvS^&vS#7M$lLZoZiU{{v
z<m5pQ_7JC_f4pkazi#dktoB?N;aHb6pp=}pNx}J{x+Sq&Bl*fHr)cxq-`;fXvWEV;
z>4oz!;&#}^Oi0EH3bl!~y~&?8B$}GRGa@46$-^s&xbv{AcYKek{}}xBqq=iIEGc;h
z-Owp9j9y!;@GcRZ<L5DG1D*5C)cSsT1)rc0VMz<DK~%i~#c`8HMdXUft?$zhAAr2K
zm72CpockaVLON`p()2*!5K2J%ntyoS89yN(%%FhRm5@Gi?G_L*ItTunR%30HW})vV
z%LA>#4iFwkQBa3_AXgYVjtJKd4y8~J@D<GdbuPQllZPS`6qblF5s`u)1xW*4>necO
z&u~1MQRVZL)Uu%D;T!$u3(SJxwtY>3Vg0Ti=Ah8Uo@7lhnb41;qA}z`Il@#D<$cQ6
zSQ8H>``aYwz{zKP8TC{52$$xkI%2hSY3L|DA3XI^<LfMtjIs>1u^P!$mEwb~wQ|#?
z46QBn(9mUAqvaqIAYI~#Li6%yCc`l8oa!R>`p9*dO37jvEib{U4&}iz8}?#&5)+(`
zX0$D}M^X8x3MrRmQ?*!__vzzwBTj}?1(06%Pboc(>uf2GYk)6!?3r+=XtjK>T0L3}
zs1x9^=lCe3y^8h|1PD8CQ07vw9&l7vF0Jl^34NMyC8Aa)kBW~zUBG}^)K5v$@VsNl
zBL+v^3p6v+Ln~U~7Tp43Q<<g1l;Nch5MCXPg8vG{mqOm%X(5&6EbN9^kB|b+(72j+
zu1aRkqm}>!T~=^~@$6xpv(Fs+ab|??;03LegU$rxq3No%D{B@sByjGwDWuK}){SLm
zk#MD-@(-slv@!aJpv9rAs&C<3h9?^<F0@y*C{}>@?$0YGy+E*F=tXKq#X|Mtwjkxe
z{P2n>;iL5Aig$1k^vB&MHwZ=<)1+>!21XvDQc2guZ^JFSsOwkzI*)8}3{kMhGc3xQ
zn51P}aqUKu%Y|e{TZBb~e{@pnmv-<*-@T<JUgnTBkyG7uw7kvEo>!jtBH+Xb!qwAq
z6Zg>`DW*mRE0Bm{A&?*T71dSZaAu5P9qf0uXRZ~g&6kZYG#nB3k{V+2!-e-@o*|01
zF3pbrn7J-LC~I&}R8iw7STY;M=vB4FI>!=n)q=<FNCn;!PWzS2>VpX9rk{J0(#P?^
ziIM=Lh`;m<<K6Me<bjiKmw`B@5kPE0bxhBt4CA+T7gqo11)nkv`qhi%ge$nIKc(U$
z_%(D=8-Evh*GR(>m2b<-cwwdPlEcS>!5ub6Hr#0_=u|w!v=NdSYikSWw?=GFoRI7D
zTC3|kP&NS7gY_ISTM0+Wgx`V<lMe$zJ8K6@NP{paCv&<Fdyw`FlLfdvoN?mrE*$kE
zZP_|(O9u_WkDW0*+Uh?*CvuXCEy-~#+Hgn#FaTYN6Qb{L9E%^!ihC#JcH1Xn&=TSO
zYgz08q0?&;5t0SH_=J?laNqDiv}duxA<{%w{ifkzdKdOp-gc*TaM}8nYG@v;4@Op2
zNMi6Lps<=8hWoF0AUwrnGq;%Wf#MIn)d`ONU>jvn3cyPG%4Em42Ra>nxLXx2(cLkz
zW0H!Bx25y@6}5tE_D2LwyLEgBLW;$o5Ysxr$pZ#e9(Tukdqo<F#j^5(^0bxSoFz+1
z`LKQS{s1H28MPiNU#VLv2=Wl>jUgh=bmO7^{H2&KSQsT12qf@!K{Ox8d1ON)s&i$G
z2J@f;3Gx*I8_bJLDN7to51os^`65GbpxebOS~7%~T&0bTHg=?yZK-L$5#3A=#d+N@
zVH7IPEOD!VZLIUkx`DHKL1R$1y`LjvE$ZCS9TZk&ix#CbfUX@s$*hSnQ?>ENyneCk
zkJdxDlt<GwT)vZtmo(e(W!&P6HQGjgA*bzDnV|a@PLX7>M$8Yn3)Dbib6<zic%l1a
znWMRYT2cIz{aE4PKs5*4_oD`im`eu*r)4(pG*K%yL=mq6@j<oZJL7iIf&_>=5;S_D
zfR}YRRRvgsTPrJ@#Jc+gqz->8oNc%-)UeOABdqER+NIx5yKP<iN8$rSjAc=K_@x4E
zezF_K2%dH3eur#*zIK2bwN}p@g{oqpR8F2!UAA&DSDG^y&W=Xow)iAlrNALq>Lr;@
zjBoCyr#$kh^#6`xam0iR7yJ-fk0&L<l;Y0MAL{Giq!0=g9ZkFbox|r^kzx6!7!^(<
zx9<uy#4Z?2P|qYi0Xa<!nh05k3=?vi5SM*B7P{F9J>3ztmxR4nB7xc#%L4ow+XTx=
zI7PT*c)joB>u-lVQDrDsQ+7*bB1f-sFfSA$nW__id4g`NGlB0lI%bLPeV?a%uwcES
z6c;B2UlHwW))^uvL55vmw?DU^*0A7%+Ik{`NfaN_@gf8gl0SD<!Kjeq4YVI;FYTWK
z^-(_Hm8qUAR0Q<d8OHQ>Go&Dkf%y#SPh8#vqS%Zk<#12J9l`5`gHDlSUqPTB%uGFt
zI;VqG1=cl(#%>Z*Ig|M94q^u=YZ5&KZ)C9ci#QOkW(tvmy;8S~jx5pC_o%5sk=EYl
zCq-%-Q{-<4a6vMjm_Ovo8^@kLQ(+!OD}w##)Rm(B6b7|*PRG{~MLKzo4#>$c%}Sy0
zkg1du>{fBuyFAoZ>Yyqx;R2S9vkyz)0!A`1lyG}-#8-$6xB9fmUzDDX9Ks#dsPI+|
ze6Y?<NFJLI;d;1b%Mu}wt>Ku!wbYt)PxA-1A&KC8Nvr3M?S$tbiiQf*Wk!l%!6h2C
zOO_G!3o<=ZYR@L@8M`n_(g?nqMYGFT?G`s#8s=b<%$Px4d+Rp&*KPZ$)k#gdTn`B5
zr4kP)+j=e~FpG<a1Z<>2GO86*ft#($s=vhMfKfh=JV!4j2fys?q=G{GmG<>QT8yGM
za<*;%oVJ1AcTs$KKIF*K(Ws&=)*sP#W?hM4IQ2r*tW0;VC<<c<*#v&<7W73EN6(Bu
ztIx;)Kv846BSk!M)teI#pUsgZs(&@rVJVL|{bUFSk&(3&_$E?$k*5vz)S$4F%uUmt
z#|tGC`MoLLGv#}_+5&zOSPU`#Otaz`3LGH?Be-g2uDl!pT=$ncPUKLyxMEZW*uLht
zH7S=_j^$SI1k+5V@uUFbz|3}=YK)(wfuu*&#OgV8jHo99qQ1*`gG>2o=zSP`M|N{*
zfFaQ6<t%DxfBGRRe&vKP3EY-1s;_Q@JsKqb9D9w6pG@XEkdC0rAQJ-6igM@I2}XDI
zMl2K2{8^N$hhNtGAt=#ExGP0AIOoO*OYtEl(sQPX_56uPGHkmf+6l{{ST?umLP5Fs
z!oVZ3x1M_37i9CfYE2q$t?{8)vY<~`_)U5s3vV@*&tQOoT_eI_IghuFP7HwE0EV|5
zYR0?OelEdTfOYYMxT$q@J0LZA_{_uVcaDe89@S4C8L{vIFffL${a%0<m#R}9aw`KO
z&o39$u+?|>a9VYH=??j?j2~p6LX3eBqo>tNptmV$X^x|xmpb?F64COK#-?eak|m+&
zv{oo87O)zVWo<EYHmxNV&OaC)3tXpb3<W50QgGsiI+60e26BdWGUV~O4^*ZiCi?t5
zWNZnFXObajhA{S%-rlk=m9L|LubAD5;4y`PlsI(OM39}ie~PbW?=0ktQu^IddKydB
zLo+OyE-aoj_q}k`y{Qled@tQKHvX7(zS)^vh-My~DlkwO4b%YEIlud><QDlJa0;2a
zds_^IGI(qXmp{UnHTKaEC?1t5Dt$#e-d0#e8NVjhLiB^Gyc`-EJCZOtWl~>)7Mgo;
zk(Nu+YUSezR-uqcJ}h^}q8Qy9tbYqjmq{%^0;upB)AdC#zluS~6ht;-hglrEj)hB1
ztjc#tgM^$%D_v)IBOasWj%NaVu@V2fK`4qF)w&&wO7}JNIFyaS$&!8^At29A(7z#K
zs7-`P7CfQ~>1`myIn*OSRAS@#$5B|P%XHFWT1vXA;1=Vi@#0Mdw`}%|Q~X(#60<9T
z9>njt1c@w*YbCDNK!#<Z)6`8OO+(I6IXtC$me=%8*iEW_cQCSPZzK$9*CE85#TuDR
zcCP@Qxcsa!UWn#O!53RWiaoB#vB)XJ>W~!$4DENMP-ZVgZfD{tCfwi~U+fk=2j4Nc
zYV#}1wtq4wviV#2LdVt6O*_e8zwHTqQ4`FBLF8n-S|b|pFB^Yz^D#E}Iw0P|ZxxdY
z?~l~4ZBzQhgW-2-O}h?%+IX3dw@FnmcmFT8S2wnJ08zh@d89x9zsoO?@s_@;fq5@%
z>Mkl$J!~L6e?&oGH6s?>H-&!t+uK-@9;CUY@&I??AWK9x4k_kj=&hu}CiC&w<6)kv
zO~N_J0Yz!?!YV3Vn+RSZnh+Z4qd-RT%suIp--F+;Ap5DjGzEH>#-ZqDW|*!|<^*%V
z@r43l4wlk*`P}It-%4{L0u1sLuQ3-1d>;En7u(nL%Trfwn>k&C>nKYf0H8nnj>^kX
z;@j6yPQRz6h5a6wj@vsg4TYrudz86d8o4@BN)Lr@>kI@|kip|dfBn9(xvsL{HtZ7E
zIAXw6pdZa$rm3V|bVkbO#<hpGT;-rw9750q+poZ9^h<NDQBs^sDVjH}CyqjW3b5<%
zx5cWckxnTa8}A^Faf10JR5o_SZ0tVq2^}lETU`Y~5xGZr!7BHgzZ5zZd$LnOdVClS
zMB&V;4MgWx$0Zvu8egSk+>pI8^XU0}8az@k7)kr@5=B{s*?|rriAQ6zDgxr3d`@`O
zHf&F~Aj9`pt|S@GVIAbN#r4`~JD&W=dIl!h#UvIXx*v1%14w9_$%sb8gOeob01~1d
zAmbK<Tk2yYG*)uAj!!<39|=Sc4vNeYBu;iNVmfu~SEP8FhJaMB)&<Iw8~`-8od8}`
zA`;gtR1ot|X^3Qy>a&Qt2fQ^s!(7-2!pT8R-Uo25Sg@UrEpAJ`Jk4Vtaq#JRfk@sm
z)amBK5C||PM24{6a)4sn8;&n8RDrX^9ms7tv#RDK@iR!rg~dN&OlUtc`+t?ZruLYy
zBNqhH{jP=@(3py1-y#_X+FR=@M2ZNP(n)Jli3G(7&Z;rRMv0&8=8}8Milkb}<LPIJ
zmU@QA!yC#J^T(dIB>16D!e&e%wt)t0*{)m*c|Nh%WQRzJf6EoN<ARP4k!GkzF9-GS
z^PhYdG=|JsH+Qdts?N09=`(jIgovIpokRUui)wEtW_j}a*+i?1AkNXJ#1U4mh>B3}
zI#QloyF=es9JQ59|0~At1a&{U)?ypZ9Q$C!qPzgK2sNg95}GH6P+0q^$lr0f;zG!S
z;r-t;ha?pk<y}RDd1Q@OOSa>RsKy`5cLx>eSl=o4DW-G%5~uFcpdBTUw1eqTE^Oem
zu_s{r;V;>pllIT(n5;>ZoZ-ZC73y;R1S<inZ)y-VrRU-|al!E(5n}As&vGE^>MP!Y
z5XMK`=3O~N^zt#+xa3OR5NCP_G|gWP<|4+0t67Gf5M>I-(B}yi6Rh*7z_GQ9f=!wQ
zwPYV-I}+v7;h(zlUOrgd3kCdWld+&K+>#FtD)UX9aZ)+rb`+ncD%ujLE*i8Q(xqiG
zg~C9Tc=t@w56a)tW41$xyBiwBZa$f-TKzEqG1UwW@W&t`{CbfRu$4E0)*L*3?3b)>
zgt&-(7{WLqfa(NdOEBjJ(tOG+#JM9!RZqmb@2Z#J+wipjdU1x=$Z>s4yuP?%SzNE$
zXsFYnczf(&_r51YPM9~`#6wT+3iG;qUn;+~nYU#A@uaOmClf$1_&-4U%4fBxLoaiT
zHcsn)DL;}};ZK`#BHnL<^YgrY?^AJUE#ed-2@aT(>~-nfb!`hvaG7;!P>T<6H?dp2
zRX$wl4_M*LydK=Djk119^-$p{B6Env`4%7tJtZ6>Fd7eJGEgP}wj^ZQ5wFbk8a8Tl
zKY~jrKyL9#>IynmQV5^pw3m7A9GmFTx;=HB>{a-*u6~Uel3e0`zlVww)bu01^Jd%F
zA(5s(7YRrJrclEL&`vD=0e^3~WV<Ofj;xiT%&_C(==>r}EkDHO360)Ej6Aj}M`Kqy
z%1YYJF6~Av|LBw)qf}zKhf=4<Jq0f<LFXtq!E&awQs)3CW1%QOSf=*E(1Z7JHpnir
zNz;bl03AvNnGQ}oV&$r7^&qm-sZ5p4(IoE>|5+<LAd3R?uEZIut5s@n7Zb*9v1g9W
zcE)M$80)r2Y+3+6%&1IY_m%f?Dn*{~Ty;e|GR<GfA$yF$2#PPc(QginI06hRv<$n>
zVr$A48!wR&1V1-Czb~Y5X<ET`&MNoGo({Da;S8+{R`dH%0wb*WvAWp#@d7ek`wQAi
z-0nF+mC9ivoH5^^)OZCCMKU-CM%9uD9`{x$39fdz?OF$M_iNP7_Zb!`xbED6&>j}r
zDOGmR-N@FQp|J<y05(8W2@BE!z8g4xWP1;Os06NE5I_G>s3^3m8H>Fex(Db-H?k~*
z3=?#q&gEBiGfgZveBa-B(d@^9>y1!>rUO{q5}+WA5~7?#H0oCt_|g`eqRdCZ;EcQJ
zG_;@f9BFN^cF4M}dfaX8GiX<{v`=BPomu>kbkMUdwMKI%VFi;93vbE$n`eaBs>D;U
zcE+9B4$lvvEPo3+HV1`{DPPTYQymLx*OTNvS_?+^HXa8*f;9JAv?6u{MNH$&nC|3U
zznjB8$Y);#RzR_0kpOY0o22r*I6pIU*f2pBhvLs8a#;<}91@Q#M+C*)+IP8y%-0)D
z=)y)MsL>&hy48}G;;^`d;rDh6^~uM?rl$=aoi6P`b@UbN3L)11dB2NOy^%YF4TQYv
zZjnBEYJPH}Rp2P4n8Ks21%-SCO;AKQ%BI2(QmB+{;WiE~gaa!_2g`DRI0U>Cuz|Yn
zRdUSl2>YZ7ypXcSP;b@k60Z?(4s9IL5n=0xwT+&s(kB%b=8n?GmS1~{EL)|LtKe#Q
zb*Pm+sV}X!*mr7Z;)lDrl68Gd`glrZeb=O+SMd)z?hV^`bY5*{SFRgAil9=8irJM7
zC=a+3Ej`WIHlmpPtnV{XD4x!}`$~vSrxD1~N(Q8+LOjpUu0G)w40!a+viTyQM!UhU
z^MS0?M+Xy~E@@K+xm`UWJHjIQf$l@O)V)0wtTQUPay4yS6EPn@kxVj5t(c?6`Ev@p
zs*6-$XCEF^h6n4zxif7i?~C+h&98UP1kSneH2iQ7KYoT`UcGH0%G}z1ARzg7Xh_J{
z&>Bjx5~}X8P$5FsyB%(5XI~b_y$(j7_?C$*n$4briL4#_S9!ol?oJMmXEHzj^w-Nv
z*68XZs6i|*7U5Dna5xw+dQ!~9Zq#b&lPO${F?bI`R@Ki`ypWFj9$F_p4*f7;<EE%5
zC`l)Vo;?K?D&m|6ZAb_i!CU4$VIfw7UzXWUi5bZm(Cafzvv55r@%ovjNn;Y5-7tIp
z4l1xolHDIEu>{LlF+{p<srg3Nw3a49r0wzCapt@veP|XR0Z?(my^ZA>NpjadM;O!}
z816@^0z6d=kuO*EuU7AMg>RP*3G*lOgLc?|3FWH>_aV!{{fRuR!GSN2`e7}FScYt0
z5U{J9$d7ygqYVM2eQBmGxux+IWvSl{>Zk@sw`*CXxFl%NzK8((g|E-RH>n?{X|6O=
zsm9>GTau(@qQNcv679JH8mqOhct~P5((7E|%j|sMerz!_2m$Z-XfhYVm>Pl#C~u&o
zc#}gvF~@2Al~kowD8I6_@3fBE+t8_SB5VuL!Fa-|C)=M}6u%ELNtX+c@;EuG<#<gP
zsJbE+!8af&-b{J)SpB_5BS3IkR_K)GfC^M{y#VZ7+Uu8YVv`PK(;;R6snu;eBY8$d
zKq%uVRq3%JaVnY4SBXPV!UO{t*~_$%xUS%c^3j{)P-o#*<-YI9M9}%@uQ0bmsddcb
zBz?E$JP!B|!YeUdRk!s;+=?JadV%WEQCH(*3(sG<`7IjgbHL^*1a%XOr=MxTx;iM8
z?o6B1SV=0NCd^d6^5~qs5>_QkErpv$WcoxlBqk&l<zTX<Kr@snT`=1AA%W|ihrp7c
z8CsYYAB-4HN+W3#f=LTDIL%>s#zOYO2**l<7F!mAIO71}(l*g)ARNXZnTFVo2tk8>
z0P-Do1z0)an26HL0J74!&33QcbLY8sCPMgDYC_{^l`Z}ZM*(Q~=QBFuCLroti&@l}
zre~zLjsS2r{5gi1C;QE?rPOR`c4TN!>D+xc!Rj?KrW!~~q-Q>F%@9Cn0G;EW0GK-e
z#4Y|2o56wx_vO_ED_`;gcKb9EmQz!chDh`m$J5w=(3?<*)P}10FZaj#Yy6c7(r2XK
z89fPS>3wKr5~5^CO6J+;?Px5@`u%`YAJ7U~N@fsseqc8k9uvU;fWS1Egm{GHcaMB+
zLVT2&8HPJ46eR<3e~HXrWtoy1E~<ATEign|W{?s|%Iu|b4Uw;}3y^z*I>NkC1J$_F
z5{fBuX9mfFd!%Rz^Lg+~Ui^Nk%n=+9OCh-JUeY=Ea*@DLLAq5yo@rm3FBNR_H4kn+
zjc}M<VXTIK*n7ha`i7`xk_<C*RFB(a`ZdLrky;}eT%L>G<P*jrX{}t`MOuE5FBl^M
z%5EmCfbM-9LISow&#Od>K_FLkxwnLTOG+VPb70S^mDTA0CqvG_nP|n<{HTGjp=x|V
zq3!)I>3Q`H(W4aSr1_d84oVyeiBoj$bw*!VWiAJUJ%1f>?cJ~ON4UoGW_4#V{V(7W
zfs(O^O>ISkR{=g{U&Wc?<_%bd*k)ZIsbKlO58NW92RIpH6?0*FFMTl<wvA8kpsqwz
zFTCW0o|9yiN$K-}T!us2kbO(;@|w`p=6?hx^T5(Ynu6pBunvyeya@5Mn%A`$i|+4-
zgmn943D61;ip1kn{=oghcN}b?UDB2SUprHBH&ww!Ni@{0SZ6U-xcQuIIoiK#55k2*
z1*sxWMb5(;-I_@N!Q7Y6^DbEy36i@Cv+OK|u}=vpF|nAKGT;~{L|1oeqA!ffBkMrw
zoqf!yKuKX>5RZG56DTCy(n~vp(U+)ZjZ{#5;gO`#IBStGVKi)eX8I0FcwAl9IG4DB
z0L)?(Tq%glybB_Hj}`-5m1;pqUWFzknuF~rg+z_eqEUobs?&#UDpp_E!SZbRrkOhR
zSm>0+-YQ1B_-p8s^2AiRPojSh@7{5w7gb_R5|XBw1=t*TxrbXbf!DQ*rCYTO#+=;u
zY7|X0Qm{WT@a{9>5?<>q2@is2#59HQ?e(ve?r;T3R;`%_B??QhxM&wC3XO$ynzIa_
zJP)Hmkt@sVIuEV4Q0)##F9>gg(SVnAS6o3QDx|s22Z8!v`oDv#-Em|IK)MSgwE<I$
z2A4kXH<a-BnVAg5NT}Yk)>P0EQCJ~gIEgNR#BKg%B_b}1HR(RTWVFeHrf5Whjk(db
zIUsD_Tgp>+ix2dL=jxBO&)5{dYP?9)5{bO*DcV$Choamag($x?h+AevWOs7HA?A{a
zX5(lJh+#r?nK6pRSeCF{6fR1x2gt7>%s>8OO4vytf=q8M!gI)Tql9=)Od<+MSx(PM
zu>gDw2{Ic~c$no4<QTd#9iI+G5KN>ZV$#c_JA3*DSU)dC2p&Y{qn`YE1~GIBQY91-
zHe*m<G*d(`Srn{YWLk$o(S2&L74=JH!b4Zhiv036hC+eiuR7)U9UlNXtlPRsBFEM0
z6&OlfM6~0H+N~tr2Qd7Ua3zZ`NfuoF=N7z4)3Ive+O5=u(V4+Er_;au1ogWfB>VN(
z^y_=oZ~RDF5m4^g+jN{M4ZhK}R-i;WwA_8jTT5FWtP$@JB(?+8I^`D}SkNJ9l%r=r
z4^2p^L+wF1*C}nX8rmwdlUzkC$Somz1lyoOPQn1LYQSwtIdljSdAbMS8t<LLgERFZ
zXF^A02aQ)^!i#u7e@#wuaU8LD^4c0u;%@C6E`6+{mu6bNZJ&#v#zmMZ)*(iS?3=&=
zGG*vU#IiqpqN}Uq{=-|Cug8U$h*C!+!L8D1AA2cqYP%Ej{JY=-=*~V3`CCVcD6A-h
z`R!Fw!JaWfjGEwbcwn!Cc*8B4lRf(sWdux5Kk~!?g=YU6eP`|J5kM9f4~&BxSCN|1
z5u!*OtMJ!-TdmQNsnv^VQrHj6dIdCz8xY5lLV%cqe0vNlv&+f%5iR4iD5D09osdw+
zAY4bhl&2;i$!_<GOPnDDvG@67w8|#caDk;0!8<gonJ42@TrMoYU6uXRIFH(_Q^$A|
zEgwZA;du+qgyE*{ak(NsZ`)aj;GLeVUUzv4V|Y^C<bz_chSeA~9Ly8w9PHR~AIkmX
zX5V73PsJ=LxxJG(?)_!GV9BPJG0Li-LpT6ES<U>$WpzqRV%8)BDs=eVn8@+T4>4qH
zAW^Akj9-HNQO`8ylTfsF{?9h0MbA2qY8mDXs<W{X?n12lM8eLVFj;U6c&80~3~;%N
zj5@#lfB|sKo-eIPJHJ#pG%|7ns0jGt5SW)BA@%JzyUl75(F=kx2u7RrySQZNv0$!%
zXI*zJfCB&<4gu;*1{bfaJ}~C!0)0l@O7SAyfVjnElyF{o<|%$jpzjs){_Et`O)DnL
z&^H}>@L_iVvvZ~ZL+@q(E3b$XrmJY!+AFfNTYXn&y>MXJV2Ff7qCy~LeA1zkXhNw_
z$#uG~L255=qc03X4%Y9u91<7$8G+33K4=mjG#r7SkEV`PZ!iE+xe8U+TcP(2x*4O=
zYbj)#v_{z<rgVL<h}1<v#5+25zzBU`QNs&oruW)iE_pPQvYoY`nF+&YAGHfRCBfr$
z%=T!`I63;6ondD<Owtz6gQ2{zDxNd?m}?pR{cG(H`wT|57B8eMR{j~}KPM5EvpD?)
z(Ir~&U@ji(CU3|O<q7#w<(di~i3SB>7;{63LZzbv4^p9r(r^bQ7K;jIJP95v=IOtO
zHysd`%0fB#k_~-UkI?tE!PyjG2y-Ve$j;vmc_33iOGZamQ+7bdM;dFYkXx7&s!`jM
ztDa%(Bs?Z{PV~7M$C2=uUQ6m8i|{1lV)D-x<oojHC=n+!hGfG|r4b8o2SpL9WAO(2
zQ}CK`(lYsd=ns1R%vqCroSajGLVo&lwG%*5CDHc@s{RLa$g2f$ukU<FPrR^QL3d5B
zAK=A?K1=~GUkY(6(gi!-gEk;fvjYAl;tM*wTWRCvj`v!Z-%I~0;>aBv+hW~Ipo#^t
zpzc%mZ?vr<Lr<iASYfIf3>2>FXilMkm_jeWPAxg2(&4~5X@vaba_~vFcdO6WpwLKj
z9i}^OjMt<UoD43;vk$^*kPGTpqz(bwl-#YsW}%_A|6r^@qd>g$6nXR@Mi*4TiHeE?
ztVp^OgdFk6^zMklfVYlFuk&IE*sFKmPy<o9hZRNsD6HpCkIaHLtjS)xpFp<6%1XO@
zh&CsyFjRl&KaUT4agxHF0qXco0K1ivSF0xFC4=io#}{@}_|h!)aYF_dh*Rw+0yOJM
zqITbLrOZvxce52gK2DLB<eS<&8QZm>`?sJEZS%PdpvB|Wcpanbf3VPcun-JdAm;q?
zBcjADV6c$cnt-@EifF2N$4;SJfnPuCknr?Xpod%qgvMAXSNp6Yc>p1e2+R6|E;zC%
z4okm+P`;ROE?Iz0w5N{iR6Ak{skxlhw+Y64NfM$O*O6J!Ib1I`ryob@Bst*E{lkWg
z?i1KIVElk%Ni-01v`4Xq^LhdL{=z!44#)EPc&=}$$odzfhQtjfILc57i51Ml4;zG6
zz5@@fkfCH8aRyG$q2io~V9K73OPXjB#F-kPZ7sA-UJ-<}&A{PDMQ0KsVpF!129{<F
zvzEfQ7T~%b@({8feTuI9yylyC>%Mzjc6Letv8DM*{XIvDR<XKbZvIWicOV7|*?m3b
zoaEJZofFDu@p1<|WRBGAuc6_aLFlz}Y5h{8x-5CBn`tseN>CHi7eig#lC2xYlSV53
zwSh1#ht}R?M_$J3LL(OPWDQ<Cjq1Tx5l|M3k2La!PXv~+mB_^Y`(|YsUh?D1Wjz?s
zIBXr#SV>4?DzC(MY6rTs{qQ+SQlgngMw<7rhX9HAwLSGLB=hzy;G+|=_j1QGs%3cc
zui@j$LTp0xLf&&i{9Xww(%(WbaD>exw}5<|n;h$F)S`p&8!%8!hmPAG<zu(oo#=Ws
ze_%4CuWlDw3Ka#Ymx_ew6v4M%pO+%d*{WNHq0LTzlPJ_3ZvF1xgbDJSB=fsf;2#l;
z!8o1R$e@P=rzbj-S0Z3%REJFv65NL;e8AS=1EyU<*1yySW6-CA$6AZ$E?jhw>w0~-
zn0|5El{*GIksEC5=`N`POU<VNub-bAmU8fV3wI-4XsPZ4>^GEw;CrcrdsAv#avfB9
zhl>yUTjjT{A%tdd9+e}BWbN%5o?Rh&FdAnH-ydU^*iAOhExI1Gw+T?*A26Z%VlEgr
z$+`ZLD_rIh=c@aoFvB@xrknhbw`_0MG1!s{I%=anTUxgtv;F+jW-OuJ;}wxPg_88&
zN9_2BYio9`Fx`NXO%H~6rc?>lFT2jggSKc-(SZ}ql9ye-N}5AuG{@T4H{jo!ZNl24
zKB^q~F5By!yTB*^{Kq2T!I03w3@zA=hLKdxY_v1%*TYyt=6`)-->(+fWFXSp9SkXJ
zlve)>wC#S2=%%}xzQlZZitD}Iv)kbz#5MCAM@ofdogmE_UqAacT)CH4b^D-w`hxi3
z89tTy?Ve{~!N}x)fz|&KN!UFC7_N8v7EOx#|8f{?3JK~d7BUf4tdg-L_11C3kW5^+
zrisWCD30^mHRWff_o1NIO6L~FE$6jx+CPAsazepKBP|MtWP)p*e@8pO_*j0bI~~JY
zNvxxR+C7Ztly*<@Ca62y(R)Gmq)yd2a>U)~-tJ3i#r_NK#)D?y9UAIxtf^*zV{EB;
zjik;bq&9?CF}gGArZrX<|6p3|gg(vwy!;W(@^A;4^s`((9LWGNf=Gh_J)`b=2eEo(
zCP_=ek$wBbo{tTabq>*_rMUx14cfGI?-UsfwFGjep!sSGp@pQOM{;%0t((97yR*x~
zf#v{OIMKPR$F-9j)+?Xz>wNb&E3GtQBS3)8@j&TwPVj~cmyZ?H<aWD|Po>9JbaX_k
zm9*ZyM*zEoe>XMysXV;fD%HL(B_QTM6M=zH6L*|R#T+uGt*W+T8=yMXVmtmW$!oYd
z5OFdXO;D%JXT02LIG1Kznvu|A=+96|p9@>v>9e~Vrw)Iu<I`D}=6WL*T^*cd6TJrs
zpdK^*g?JP?;dqfmY{Vpi*DdTUt`YO+=xjhg2SC`$Jca}VfKnM#S5@szJO8?BuCd!q
z<sy#KR&#l<bnmmx`6O7zEnct4v1HN2jzWGe)`e?2<Y_`;756bPW1xe;O<>lWnnh5d
zDBVQ+#0(;)-@Lp;rD~adap$3u2!HA|abxXlB~NYu`=0hMymCp@7uiZWcP>va(kfg=
z-)qD1&8AtG>R>XHW7{F!s@nEPZk>(8l!x70xEn$RP!Z<0Nvnr@jqT@0BUFf<k&$ps
zyFLc1Mvc8(&d3g=FeO)B@kfRuj+cu7Le}wmN5})K`FW7*F&6PZ>Y2;1ET(lAbnkB#
zRWnw$WH3<VCN6B{^XT1+!=w5;ZgX>ab8QgLGKmAOgXgl3bzcRm8d#VYe7T?c#bydu
z<szaF$nDu=6CBE5InGTpL&|<-rvE#MRAK%GY5?$Js>OGlXnt%pI?I*?J3ydJ<sK)4
zhdQEfsvPaYtuiA6YM!Rf)|vwjWAVu8v_Cq(RHG_?(BbCrrC1;AU*8My1#w6+sb4+{
z+{;y_OrB-G@%~X?uTD2_HM*gY3%+v0z{Q7Og>+jD4yo$KCH&OB@pQj8no_`ChZlqT
zZ5o3HJz@yx6>{PF=6zfDwU2e{)AzcyNI*3IER3!#ENcDo+1Wi(lKo>7k`;w>A{LD&
zlTfNPS8l>3DtNP#3>^vx_#&R_<h6KEvl_O<kTRgY`Ni+TRpTtK^elNeHRE4T?)$uI
zMkhcmpJ#e<X(8Fxiq*2{i`ac3!_-v?`U{kBblJ)2YibCKhzgm$#p2he0gNsIhIBNm
zjXUqK-ZvYg)@nk|MfmQ@zkZrQ)NS{ZfY0SdrG8c{pi+6nIL&x<VQ&=SH{Z@Vyk9p$
zs!~U02s+#|TA0^DeYI9+gIWzW@?chxMK;sljrRc>)Fk}oV3MAOhSQx*W>!(&hE;>a
z!Ao_7frKyk-oimzzd>x=`WY3^f7jpEn8BpUn8HVk2Ge~~+nhC_RAAFjELR8ndaLmS
zCn6%EB)V@mG6Mk=FuUWFZoC9-@7R=8Pv7y-b+)fRyk|frgHqMTrrAc`^mjjLS%ltA
z!APl?_o!L4Z0;4nR;l>b|9hD5#$hLG6^c?MEsnsFU3AP19&vlihHaRu>0qyqf(kOo
zC!V;}4i|^9#s05C^?pDE-b631tttqM!O>P}O*m%lh+*7C{T4Meuj|4aezXbh&Q1!n
zY(I2%^`ITJ_c?I&arJ(K3?CAho(yv~SY9#smxH7w2YKpublW;zuVLt3*$!}2fJ4@^
z3zb4~td#9J$KsT^Voa+Qw_q0@30d{y$n=(i8$Tk+v|_$gCAII_9;Lmy=AmRVHYm*E
zb15Ig0P}$Bi?&?A0B28iqyEb6a`LT-_4u3q@J^3O7a+Z>$%CDsm$Y9sqgEW!Jbj>R
zhmA9%2qpf>t9g;~NhFc3SZ=kS<?kyZAO?fi%3`7705#^finNyJyAZM0x+N8kY!6df
z#3RXr&3<zzd39T4Df$tPrL^XjJv-1!g52mcr}gO+eo(le!4X%XJuNPld}@ctm!MF4
zzU5G<LOx?3Bvh5)gVYF@EV1~2Kjc{dY@c+*-Ah5<UOX;G3R0)|G5zF+PxzPeE@H24
zs&B3dJ6G1)$dX>&LHMS6-2b=+kr;n)M_Gj)WI|b9e5eo=x1bs+Rjm=fvMR^2GM<>(
z?}Ie?Z*l(&1L+GWBn!tc?I^u;EG2jueq#Waoey_`oomXH8$sx;d<C#$b?^@m6xHZo
z<pt-g7u_ej8!b|dK}ia>^l0THrZR1AkX_3B{HNm{VnT52f)wnYTLHzw8nN1Trs(l(
z3Xxfo^XiHw4z>nvv7ZS)pd#WV#y<~HZhC+zp{YbwyfeSD7oO4~(pn~$m|$y|l#Pd6
zXmYrJ4+;S2+HzIRm0R}Tgf6b8fmnCc$_R}s{eMmbls5|X|Fbzmh5Ejyi}@Q)8f`QC
z2xfZCJ^s7y@QCOc0A^>F+FSc`-0cF<BYZW9h>xvCat9tcY0A3<4%cbgcK<TnT!IkR
zx;A>($3o@)B+x*^Yyz63=ri62dztMqW8wVu=~WVjIi(iwek4po)+a_sbR4eRu_C5F
zv}=FfUci$lxNZIph8(~^eGbELf5pTevkqO6!JvmV*!u0M|50Iw@n6rz0ppNLz4Bpk
zWLZ+!9!9n%ew01_sVmzoWvZIY8J3aw>|&nw^4Ej93GFVBW9>8bJs0hKE=Kho{x!T=
zB0JlVisjwGIlcId6IcbkIALj3F4WQg1Z3wp5H`fxUA7~pum*pJBnrJ2ChmHBI`%O+
z7GpZm!~JjI_FopJPjW17=rtuKA2&TlvTuI}`Je9f0I5n4@O;^`!EE#Y1_b{VEMb85
z#XJNJK>R(J{4dMar-M;~1rtn~_hnf8)!P3FP9uD`248v&{@oJ(CFT0dF7)-Yp@W7k
zna02G?EeJ!BHf?SqCTVFfBXEG;>I!jvmwWo%i@*)CwL+LJY#m%rt<%_oc~VJkK><)
zIIP{2tpA_D8=2wNO;=4i_rH_~e|6HG`A<WctQ}?lu803&*7-E=E@(`j4tg5$ssE_@
ze}gyKe;Sg-`J=r0{{*_9+WUinj{0u|^IwfKlt90J_)?JYNU)uhf_(lYL}Z1l1@-*@
E2QgHtKmY&$

literal 0
HcmV?d00001

diff --git a/docs/image/pack_size_1.png b/docs/image/pack_size_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..b07bb5ff4250af31f26763eb906d2de9a8ef79c4
GIT binary patch
literal 143113
zcmeEucT|(v`mSO@EFfkSm8Pg5f}#n%s-UO@P-8*5ND}E%Lraihq*y?t8A?!*U__cU
zsX;-h2@0VTK#&?jXdyrd+>MTN&idVRX6F8P?_Ki;YlY;?$KLOL%kw<%`w@20NPpYr
zy_?spS+ni@Ilaqk)~x@$W)0us_Ko13v@@87HEXsy+3D(DJg=*}@1mOv%FYqFX3e><
znCtu|h}vBQvnP)qf9BhCPN3|-<#p!-o^N<;xn6X?z`^L<kYDeY$sc*=CH(Pz=KaG3
zF{rgUat~}+4>s&XTF^cQ*Cjl@o5`)?7?l^z3^(9c64fw6J$`FGFIB{xo4$B-&Gg3&
zO}|ZAe|nndv2j#(-}_|2in}s#GEv$6{V&!=zh7oePi#E-3-2m=Lz_<ZO8Us;2B^fE
z8r4wP0@PBwin69p)O&~B+BJIjdfYAq>IDv;d?R;n?dej}z}y(`I6m}|sV{32UanWJ
zid%az#jU8xagE}{DXo;paoc<pwROjj3_Yk?b6fqCSOvswVY{3SrmivhWyRq3u2X&j
zRmUYN8kWLgT$d%h?vlCl-1cjk4ku}+^`Pe^Fr6#-_?4pzKGUZpm9sD1YYJWeaq*_0
z%b?Ny%hpG#==}{(P0nQQU*6X*HzT;_f^gk|CoyxcP8fd*URbxdy=1y*|E2Asx?<AQ
z2VvJnVYzxY#X`S2pGv#GS@ryO(UYa;_H8k|(%~3sbm8K{X@}gGj<FX4!sWy#(<j*{
z2?-6Cr^v#aRgd7{Dd(O>Z$7cXCjIoK3tOx9ZZD~EgQQ$hPPzZv_|sFOB+XO8<egOl
ze6HTvPbB3Uo&+CyHEuAC3}SpN*mNdw_fBn-eenAihxj!<4hU8Wwf$<bEA&QfkS1Tp
z(~<Z2lN<IwugkyHxS{`>-p)q;^QZ3o>UPX1^4mw!xa*cvmhVoxtj!5|bNrH$h_US5
z1Vn8F`uyhpC(*9`XVkiDRFRtVZriuD?^}QTu2=#5xWKnHm&`C0Dobf%0_)88ZRj|;
zCh=<g`#q#}2W-3z*PrPV3SECvSy92;`pTNk&zI{TU-R+Ycw=44y>&3R_lVG$Q@_*%
ztqiWqt<vqeTmg9{%&XK8QI>e9BBcCrp4=v6Pqs6;2+3%w;n#vH@*fR*UbWq3{fURS
zBSgQheJ3fgE$dWv(N}mk`i^+-xPY+$^Edt97{3XAV1~W$X_9;)woQFsz|$il_}*c@
z!oWm=_wfm#j{UIA2+_6Z?sbbdzXpEJm7%@8>MHm4!+apuMQ(n=`PFz$rfsz5LovMq
zt}8J(D0q1L1F5#&Vf{T0Ll?_CbPwn(q66Zv+T$s>ysBL{L$Y(*1;#bKeo0@q^^HJ!
zwQ0_+&6;^7gstP5r(6WqO?^8p@kM88ThEO%(#Xt~OpDCqm|GiuC2hDLR82UDm)?oo
zvSqjYyw~>Qsy2iNkV;AmW!svMC1)4reMNO91y9YIkw=GC1n&kQ0s>YdU5aa@*9dx7
z4)jhMqN~@RG!wt;zD=_z?;@#krwE__h1(yZP@CX^hq8khuInr-gw_XM+%8~qYR9>C
z5bN2~?;2CBH$PoF^sCLq&9OmB)>`db3j!^y6;wBkRb0Bmm$+8sJ@Srt_q~!dNa=d5
zKJgLV?Z5r@dAqtz=+phJp$Gbdk4LUK_)FN?9gh_cuB$y;6)J`gY|y_D8B^nHD|h*x
z*vY+*AvHcP#fu(k>J>j$sge0bm-OnG{jZ(R4qTM3JDu|RpoeDsBcrnqK5rPgoF|r{
z$9a}NA}}lEE6zM=^`dNd1?>d3LCpQr)7_#EJ~rjZ!|zi*T*!vL*^&N!?^kU3eNSt8
zyMAJ5v+Kr*t(ZIO2@hu`Yv<+t1<ECQ@3r-(O{UN5mh!J$cqaT%Abh<QU&!&`$A(W&
zAC`_5_S!B8F{;d^o%hy1xPu*5py%Sft2tDyYP9N5)uSqG70O0q>O@YQp46TvV&|7%
zn(T<Xa`K3D#sTJjrO3?3>gOeCw3Y^1H|+zhsin$(hE_-ON*BLg@?s925%K0sfi(BY
zty=$S#cAlYm)n7b9Ys;y`d^6cmzQMEM~l^&Ur<X8c8hn5n7(FVK|f8`*Rr&DsjXjT
zjwLUuu|virN$OYlk0hvvA4xlwI`L8&;ezOGx~(W`y8UR<wUcHS(qAOK)Ov>f{LJCq
z?K_I|dvleJDB3G!-Y#uA=wRybR#E2mF1uJeiKfz*9T{g@-+r-iaLq^AUA?Vzn@$^R
zAv#3R!ds|qmM_Z8xTSOM;!naK8NCv;LI+ZnlcLlkMt9SnP>C}mGoxLjX`@NSdKP9D
z;m+aCqzOy9GF?JTTuZ3zmok#?PT%TT_1P0xQEWJlungq|Evn4wY=iCjv{!m}fwu`I
zTduN4bgRlH==R;wyT@VgoF{7^-`s8g)ZyKSZ0cmWu=6+TUtAPF-j4cWmoo9SxRMt<
z6CAfyM|COug#Ea6qO~U7JHa*GH7ksC{#{s=ZBp&|C}#w=d0(?wa|^NBu%uxGwH76n
z+k!GgDeU&$y6Zs1C93MjkrOJ)5y#^aT$Eg3tlaGUa|KV)IgS10hdw^B!y}72-+p{s
zUDsfr*RnO;K3${L)`6J+**AlIgTYZQP$nsFaT;?ZbnR_-b9~^K?!@Z&GS0`8?z-A(
zkt@>)W4bd{I<34;iffAhCb&~DPVADtf_~({*=jGDU!x+T+DT2LL|6-c(0s(4x%Bx=
z@1ag4!62zVUHIC8CjB?NUA?NVUpH>kY-46(^v51P6jO7t9jH-p7IfAu++64|p~yMT
z>6l<oWNC?Mt(Qp9boOb|()T+y>o~R4Kgc}9emyldT`*0Z?CHZsk?J4bI(CyVvwwa)
zw_#3gZ01W_ePIRX87mn_#TDU7c!Y?z;)izUV`;YnwghCZTw4iWbN1xfirBm1CudGN
zTO<2Y_SNsheGY$ymiN0R=pe+nn(dISIDco-uRy_1x<qaL76Bny0lO_N0)~PXl8gHq
zq~tZ;sK1fvhWYz?yRY9bb;RaOm0`F*NHwW{R?k<T`cQM{nMW5Mno6<{ckVacOIO`1
zF0SU=GU9il^+f;EVw3IlVHP|6bg0VCZrzdIHvZ=RXO^0OixhzEph`DEWTYpI{NsK}
z%#$B=erxeov*^$n7}EU0)jBTiF)fB9e(UT*yxNdW{H6HA8QMwFgeT<($}o;2?Mk;a
zs(r%cBrvy1ne(K9M&sl0k5#_5>rukG!$P2&_K^0Jz_i_O6~%P-z>S828zdwW3S&7>
zB&)0_tR3@Pu5BhD7tk3Ez654+*^v^1-G}8NUJu$|uwS&P39BWz_Y8j;nrL7)8c@38
zvSO1Xd-t^p58pR7geEC^UM{J+bXk7%>9~du<(q_Ev`5I+$VYK4A|oFn7Oxd0Ey*lY
zez^PL{s$q%WujR8YptuA`JVBWc^{hzJMZn}LB*4)2|dtC|4-d}gwStM&Pn$g>JnR^
zU2b!~b|krf7#@!|z$a*M3VkwXu2H!TxUr>A8>F`k$$Kv$=8aoE?)})*pRk3u%h~cb
zg&jxj&Bn{U>HS%UmY!@$%}hn6CdW<2n$P=uYaG<!C{Rp03mP0ch=Wyw{evpk)@Qco
zU-m<OsjE&uoAug)Vq)=jF-r;3mZ&guxT`ew?X~iP6wTsNTvAK%AVWJ#B$I3@Y@Xv1
zG!^_&w8!O}OX*wL+XxCOijtb3<0R}<l&|FP;FM~J>`?A6_~2w*nC7U%-7*vRTKufW
z1&n`(Y)?my+|4%95!UP1^iL&mS5sQ1$1=4B=FEFU-xijCDnc=|dDv9$d)&gZ2tQP?
zTXnZ*_TcuJi<GQZva>2&$NcgGno&BF1Znl8rL`2Q=qcwZE-FcA_q*8+Uz$nTSv?zm
zRKrNEyw5uTaS7p)`?83uHy2iY6k;|(5ObgIB+Vn7(o&3)H6UI?-)^*a5Z#xD!wDR%
z-aCOl&nI5AeeAr}*jh>US@wEINza&j!bx4M*NzkI($AM9@9-Y1YI&TF&UDSRD3x$8
z<DH#;WTsM0AAk3e%wF>9VYXp5vu5c@?h$vgzLmG~-tvDEIucO0g0%{{UY!%NU&u`I
zkajWmDQA8V(h=2|w1v<_h}DAmU1N7M7rta>XVxoxDj)He_33D{SPGew6t(OqTVyvr
z(=qe=H0x0wTRstx`z_;ZS5kn_QcII)8@JD5xul$PNOG&r_+rhH_Z(s@4UI);M`_0e
z<g7H!D~&PAmY0@s0@v%r=YOfN*3Esp#yc0X#&qu*50k=6pN=@ZUwbN=%~->?A|S{3
zxOR%~*+v|>8dI5-kg#UFSFm28CQHDfMRo6*^V(}XF_QL2B9TQUvPG}5jrMDfumZL?
zC^sEhQ!dY9vC_v`D{r@L+~QD;%6Yr->L%gVt!Dk@wnOM&0}j+J&Xy=Gx34?1UZSsh
z+3Ie{<B>8e`BK0?RFNj<QHF+Vj)2#j)~pS&TeBX#S_^*mf*(Lrp7H&BXZ`1A>wdoG
zTU`B+hPaZpX3fbp=k-oq@m@Pl-;!vH^pROc2Y)@1uxIOg(F^-k6b|?CetjPLX~drI
zuxOaWX<^S>Z&882J#v2WeBoBBZDpoS!zaB>XC`+At$ieId+72ezO&ncu3w2heNJ!P
zhkYEgmEJU>F<G~bHtS*B45PEP$U-!~o*6<H=gReD0*+fMC!Iz-F1+vL-8F0Z)^FPW
z7cT<UHoZyIH8T|Ws|$bLtRnBO;O*vLHvjdle|+P($XfBwLoZT~{=C6oEepP7<gT*e
z^S`<NwS4ZvpVvAaGws~*7d!p=_y5_zKPT`%T;(4Z`ip4(Cy4(l9sT1%|G3cqsJQ?4
z6|%sq?sjkfRfq#!cX~gI3iDpsCmfXluc|l2hn8vJO(sW|!{AlJ{9*H^3JA+as$O?@
zo!&UcNxEMaf)7pGwqxq0votEHpml-bYkjMyFVRc7--<s>)l~|Tb4vp@q-qq9yFt67
zR?ob*&T-DZc^hWLR;o1*ChX?{6K)*0mGT-z+jPJ&1yEr$2FlJ0w?|4z&}(VFhX7uY
z7YWSp)EU8Hib{iqFcMvhya_BFUWG4%^VBAF&Pv;4g~4UzKYN^4fK0VmO81{&!mA>g
zaQPCu9BoXuP$M&A0O##&X$tw7;@^mvoKRC*yZ>JSY@JtO0k&H%WNTldqoD+u+L8p%
z=xu7^sN+MebWT8W=3?+gF6?b3Xer#vGkBF3y<yU87k4Q*;v^-QPx2wWilwQIi1Qy@
zgjXHlC$TTAS3Wd3#()Z67mM$>QeSkGE5xeNTl#2;4^4JP4;4Ie%3*Jd4D+5CfU!3<
z(q$l1OP0h|IRMeDi#Ey6@t$$%gVR)euaA5Tm+nc2FX$#0qQ{B|u2|{*!64e=(W5--
zetf7<Pjr~-w=#UFu|#A?Q-+HlL!oszocp!N!jDtwKGPKcv#eW}-QBNR2Kg8CbJXRd
z=*%vUD<Fe42uzHhbbm`BT1s68S*M_V4Jj2bxSt}M<cg5)j}vUv>+Y&oG$?a+;EVHw
z3X}a%C^HT9>Rj{nUPZDH*ZFPuFImj1_{9f}`k8fC?n#o5LZ*uSpu))`s4=(I^V_~J
z`@~vKi6~-EvO1?^z&)dI5ay{<deln1`s~m6$iA{cc|L6*)pv7v!p}wDt(X?R9lWuK
zmkbq_b8gT}oLh!hok(gpOXbYhR5<|`aLK6CTdL&YLmSJELR?3+fptfbV_sh2@Ty+R
zWZ8SI`*H#_5M8oI-&AOo!Hdev;8k3s20hLBx`q!E^{tx}`M49&Ve=Ihc%s)_8-H(1
z*!)G{M6HsxQWD<Rh?)Y-GW@h&%_&qbVb|(DjuZGMFx8CWl3wvFM%fi=D{u00(dF={
zpDVVO74YWkyj*M=9`nar9b1;j4`6IXp)Oc%SU5U=^w}&V30~zV4HI?{q|nZays4)x
zj9%7(PIaHEbI^h0n90HD7j6cwF9#u3`fVIuwW0+}8R+1f>TIYynEf_R)>u>YrRI&P
zr*L{l=P)>{=dZ_h{IK)I4c)A34+hQM$R&wc6Z3=w?$@cZIcnk`i}4DahOdYG4tuTb
z1WKnSo6QOfVp~tRc;~UmBlFAg%fU)!X-m&>(*05TNGaSR@b)df_)r}!4an5Je0->x
zAAZ{U4FoEjr6vcNN(ZqvmjrK6=aHR>nsyFnr2FH|V8U&<gOHqo8K|%y1}e-CqMFRh
zgjY#G@J~JMgX1;HU=wRJjzPX`U6@WOfe9PWpOfxK8&ma`&dMOvo%3P26R6gWE6~WW
z`Cc?yO1jNf3OC&fuS$la>QF@WyYx;<^a4pjqD?$Xb_QX5(4jLX-q5II7KVI9t%M0%
zb!b6y?4=;Ohyl-T>(|vjOELJ)6HLD(xX<*LtN7=Xh7ubaPidPKOO!WEB@Zgh($<7b
z4W5!JF9RN*26mfPE)SVXzK*`Y4|2=HMD4D6y`|faBPm0r&qm*2Oyppti@k+V;oh?2
zh`5zBA=|pPft&cyUVne$vEFMJBp<-5ss(GKvsQqa(A$WiLn1I(1cx7_N{9}DWDrpf
zg-pFZWzTD9^5f#CJ^imp_dk<^3A^k$*q;S5&X&x&=q$zn?1Xo@mG)QG%CaZXoJGo7
z8RAVjh#;j1nvt^G(ug69nXw6QOzaT!SX<YMnl^%Pywz%D0d#|)!dBw&eBIGpA*!BO
zfB_L_q+DHQl4YzahnSmtamr@;20qj!38eYCeJK$`Ul)rBMb)Au_-UV#V-VM`31DL^
zq|{uA2Er&MN>-c^UenNBH?lw;N7i{6H|U9FD??nTZBetnlqVcu1rcVfEUR(=te<5Y
zf2}ghiiN6+I>s;CJ8T3J{=!>0PkZiVgsoK1EbxI>jj4D)lQ#^QJ9(&Wi+>ttQP&bX
z*VNLUh8>EL)bL_R2F&=3;o5l4<)U=fkT~muVLcCrQ6}+s;ip4%KoqZ-1GiYz0KSFY
zhX{rSSV{N)_5eCqT^!cnXqL>K=E^_^ais}i^ISS~a5z;7AL^G`6U~hVD<G&v%5nx-
zbu?~p$6U9+_|g32T(m~k!ZD0asU?*+*}W`oQ_fOh{G`eSo`E}~5X}X+XW<dQ$I$0Z
zVpy6+zdjS?JX%KZ>*?x`EXxS!PJ2@my}x-5<d&h&a341c7b=XAc#7w#T9vp+_eUty
zZ7mFV7A3oqjrApZn5Z3xbvqZtCwsd~_e+u^WyPzD@n7?$!44nHA)u*2bOyANdA3E<
zxeZ>GMo5qqPbkKRmYCv0v63~k{FQMUe2-gZ6p4L3AC{WZ<BAUjNf)6`z=Fc{1)PKN
z7;4NHmSvFpjcoozHoOe3H}f(VwGdnGJh7l8PPSa0Z-|}9&s@pDLWLU{NPC_Pxc$oV
zQ+O5ArG_?`8-p8MhY!U_B*<o@sV^!q%ElkVN1tI3L=t_vO(r>7R{U~!<5m}zP#3=G
z(zi+Y9w_?_96N2^JT>|bl&M9lB5lZ24e)Q9IA5?io<uUqZ)IK$ABxV0<(JE1J){sl
z)o4_FXK1AhFx!3^SU#>;v1RQ{UoK2|Ww{ts({bQ;IldsIG_)XJw2mThQy5d}{^BTT
zvzh3TCCUR>e#&4a&&ShG27y~UZ;ba<HTDID9#cWv^IkO_IMxu|n7$60dZ1-rWrh{j
zLc0IHF0cz(3!#3D{$!7{jsu0p_HMZkFb%7bmS0|KLG<rouP#oj+p3L%o4AdZvUI&l
zT;lU;gXen8I?SkHX9Dq^6WyyOV*!(~#GZuB_Zfl{8{(kqILQZ~`Se$poA=M8xKQ;p
z(HwY{&vjIATk+eduz6z<bR8#+3tK(YQuY)0XkRDr2U8FPrD!G5&HlM7uXi#VT}*QG
z>iF}Pz20)VGXff5tn_YHCRus{qk`38;V?7hUU6KWbINDU4z{dIqhkKxa1JTi*u~F<
z*cx$!7lN^^lhjKD6%22QddD_r$t{;fg{dyi;HUX53(>)a8n1`av>%hI*}po2&o8g2
zb@8&fU$^zPOoXV=E=vE7h2#?}>h5(#AN?5to8*y!EUS1~a!rpG#I@-GNUj<(01kb`
zH+{9`z_pr%EWk<Wo8nD-qvK_A8^*oo#=2m_XtFj!{kUVC?8+q8Ub<hWhhSH*<I7Uz
z@}4$eyN0G(IcTMLV|@6zWFJ@Q{$$I<u=y+nDlKD)!2|_2)e~<@E=nB|xmI1Oi2(S-
z)s*N_VV1OK!n@URMZ}m&v%0Ow4w!lYA61>s0tii013|dbRv#S>M~Lr;7Yv~IZs8SM
zf%4RrhC&56sn0F7lJH+se0oyx(<1XpaEl&IpW)vk(Dv22T3NMviL0>cH5RxFg^mBp
zp;(n2L#Qu#LOr>Lb{qgGFUcTxO?XB)6FcfB8RwbsDGfESs%?ec$_|$86g@@pXYnbe
z?~?6?f~^$FkHO4W7LuUClqgYD!R_+Q_;563IxZhd>7I0Go^l+Nfe9;gsUQej=Ud>+
z!!*qW*BqTJl!Pnztk(jTdaL6BY)~L_*tRa}%dy(%@EipBaOuDy{Pe;6ICNdx`O(%`
z@vdFI_5_b9W{sXlc>Pw4rr*TthLLde1Gc9)Li`)fy%5gyZq-~n&9O=(^<wr8>-Ihl
zd)hJO_*`qaq>MEOs<E3Z(M!{Byc{1&)&$`5qdQdCkbg9-bU%W#sLw@SsP4iMTNOIl
zq>?LA*&_h-v_+Nl(r3*Yc@sl)5d{vHZb&v5YfQ_Cf(lEjfjcN<HR_dep1^rUnF~QL
z2eRs;b>{7nsIK~vLBo)271rw6%`IYK?BSkztHdzxY7&0>w?2RxL&V9;Iv`KCg*8m#
z;uae_D$$CCdsoq`?>o~HN!&YIr?}r!Z}xG%LH(^i(aYU6fk!42BI=`Me({9mYj*}!
zt?WJ6AChJ(rHuugzy=_Zd?sA{SC#^BET*DyuutO@{<6-Anpp+`AaA4sLfxDmC98Ya
z5_tD<vJ#@OEjr#RF?r!6k#tn|xAKD!*Rh-U%K`2Cnz!LpKvJ%D#zU{vX5!WL%wUQG
z)Sy#K#OtC*{PUB;REK8a3-_vu@S((86}|okASYTBY;N<2FZS?FLNefc!e-{+RohBX
zQWJhw#I5<t=dez#F6!&INmO`Ut;zuq&Qdkf&Xls^ISk7AXMi@TDvJPd1>lGUy<AUx
zC@hPzRp_hlSO-`@10WS8gh*NA^@}*-y8<ZE4-wJdsRb~MRg$dutWoVR$%RO%<>9aR
z>6gVgX`8fbM2|%CBw0BLopXRBDb~-$hacaPS9qou4*NFoRRnz@r14bUph={x?vUWy
z_csm8rW6)V5oszSj<0KzO3)Yd>%36jBcF_oZ7&QI3PE|cx$7*W@SQc%mN<7$qUH(a
zi-wKcB0}L?m#r*U+*ojn<vUo*7A?&@MV|Ib(aeS5WREs~f8JFPjne>O&9ndz)|HHu
z>d~yxEiC)meccb=H04^@5Nse}TF$|Ogj(xG+`4mdzCpL^TQ>o)lI6Ce_^*S86dSBX
zKxUMzaf2qTdA!U&p#y*wa}@+5fu$ii1`y(MFvzUMXktNEXDVm49^0}SqZ@OyV8Q_y
zN9q1AOt@B^RVb*=Xcfp*lLD&FGjZ<`c6SO?*ti6WC^tUVH0-Q(`VgO)aRo0Ecz7AN
zo8^1~9fYi3!eN47^xHW;e4V}G$sK*=PGiGEDN+c6;<6CPuiJxYtO2y;6ItVf;<hOz
z?Y4vW0K!joX`~Tl2vM>D7nRxk3ZAu#O|Fq)?c`)x9hD?}*t^j~py;~m61N`Gj+E62
zDF8uGaA?`xOKmU&J`q9}GS-&+z?;-9mv8u~!xxxTKt2~d{k-ufkE_?Ujss|otR0Ex
zB`z~=;yW|<@QrKm;2AnhP?4KDBqgF{>$yR+VAut@Mf|kw8x`c(2ftR-m_NdPd2JEi
zYU(4rDj=ajF+ats7pEuvV$c1rjR8jy8b=<P&M@fkQExAE;?|qK&P1;oRaPhXT{&`3
ze$3ZJ9&peH*U(Z%8ZwCFKAMMOLJ?|4@1=1W6cN0J7X-fnAilT|G5By&QP1hdUXUf1
zrd43V6xR$jMTjePEk0C<*)Z9o<9>(Oy3%w4zfGOcaz^N=VOf?X)`@t`o{-4S@B)n3
z3149#R8Mnyqos;axAp=RdSS8DhXZF`oTtSBvfgND3-(exSp|n(FBg=EA<Y2vk{^Z7
zxVoTz<7H#|CH?7ia%@MR$kPLRwLz39sH-DHv4h3%B+0A3lYkcF<d62L&#*&ekoF;|
zD8KXwYV>|%V~U=+n<df1WA95lrKMsMt2V=szM)O=jY(|<P($V<!wZ<EqVPJ(d!aRR
z0Z1Yd{YoyB=x{Vp4U%v8UH~L!Qx7P`DGV5LQ{xOYpKW82T&#bH?}0)efVLB-%NgCQ
zV=srAx>^~@33y&P)29r+=f(0Nye}bHLklrHY+5V@^|V*7;sHX$(2t4NxwtrrM%t$v
z$Ur^Iiy~a?V?nB<_`H*JJ}9v`hErvQriTTDyxMi)cgZb~B5abc_9*WUjl*P_k4sOI
z)&0!_nw!i*XyCtkQFJb{MDU?91u%C09;pd;Ab_X<2)aHw9XjawdR<rN02I0DaU0Uc
zH{Lu7Pe?k}u=H^$W%)QF_%#DXOn4-(`8=VcPvJA#<8euP%pOagae369Oc$yj*l}gO
zN+Bu^q_en<vRNPvzn-)@2FO5bZL~~N5<ym%SPn&~hij@srZTO7nlukU(QR1Gki^HK
z!jcq_$Qps{pr~u%v_gdYe4ufGP!ymFE2T>s+15Q)M5D&0hXhP`2xva;iXFWgR%t-)
zqD0Fu3_bho<}er?SU0X*i2`Yzds@?gk?UIaRhU55L4?wR-#OwU-D`nPaNPq46inpU
zLTTTx@G3|vs_r5=C|X7}kr1YO^YQe{k0egzb?3%tHb(p)O4K*E1JQiRowE)c{z%&p
zqreCIPCM(Y`1{;W{3U}BMzZfkqQ@Ey6@-{VO|%y!r78IO9-5*t%D>*0s9ao_5caMd
zm>#!$i>NJ}Vz;1?4nrQMzNgW`aZZ^7t7(ln3s)no;}09+c7-z=Mg|;X#hNJ5MOG3o
zNj>&i98Z{VZzl{PG*A%)^~4o4UF0t%$v}?HvGGjRqcW3sHY+CcPhe*721qxuMpj`0
zm(F$^058W3ryqvFZ*+><UdbI@Ov{HNUyXwt?01SNk89i4%v3I%Oa&;2A%i^#`LeWR
z66aVv+b{qd#PQ-_qGD|~BEzykd@0A)H+D>fFqG&fwY*8+m=H$qeq~f<_|LK|w%+5U
znc(#=YCrSTu9MarOVo@CZK-nZm<FgY&q@1d%e(5MnI0R%0FMjR0qWoK*eu*Gz?zt+
z;r{sY(-TRe_}j}LiuXVr&!&%dxJW#~6CUZ8X&{D%7lal)4HB-$`y?jwY^3|M0KGeJ
z3&B&mT8HxNgYWno0bbW!PYc%&Dc42>i**C}DX1ALJj{U(`p!ay6T0@NjN3#tuLCJr
zv<)d$jyp@tbn8rmA#2D0<TT1#v{9ow_plx;+u@)0Mp4bNtqTk@eCPOeZQ!RRJ<%k#
znW^vec|0_lvCphH-wDGT67Xs)I|6ZC*e5IkkdEN(py&k#l&vYK1yJuZaiGZ!F!7-Z
zWeN~iju}*Vg+`4Ik4B?_hCu{To*O0Gl`trRiy*5bI>nEllXdhpeY2XWrVwQfuTm@7
zur7{q4NXQ|@T?xaX3onh0l@fJpf@H63RD}~UMN`y^L*9Vs{w<2@fR#R22Dx(xLjO5
z)C$)kDDk7mmK;;{4Q-WIxQ&ioT}^{+Ua-`GDGb!pf#P}=^SIbLJoRI_w!5Y#ML_=5
zT`v9TfCrg@!&G8EzXR3In<^TMVWE}Zba?ng<U;c)k62XNLQ(95sw$#2e(Z~K+koAi
z%FOxKnBK>-y7z#_7I56_D$!27erVW{Q^}5Zenaa<Q<u7XO}ak5a@Q%{T{m~?ca5*z
z4Rn1Ax<Bvwqk5gTUKF^PtErZz)<MKJPrV_zctR=WRV(w}(*5iBQ5Se!{9!AT;qX28
zUji!g?G0QHcEw~yq%h6@AY>}+)P?H%*NDz!*$nOd%?&ZnkM|~#W`wTWksY9^x9C{_
zRdE56%$KauODx3zS;r3_TAc^0)Nf7#N^C9oh^YY1PEC>;&9CDLt7JTS?Nh5qyu*PZ
z&lLgxp<E=z0r857lP>V~t-lZQlLvz6JTY$oO<iXox0;l%8QTWOl~2uqQs89(O2JU;
zcyFVlvjbE4df#w(4NdTNSqUulzzv_wZu&=VvP%OkqwG}_N%yrg=B#vokBY5S`D8r!
zGUfs?lhbqytori}ys6IXyw?e`1`)fz$Lz<&()gaQ4?kG&8XndeHuQAY(KZ^7AnzM1
zAQ;1D_Jlspdmy!c6hVzWE;iK9!#37}v<atl_u2$l=r|h!T63yycr+4^*H`jgCg6zv
zWS?s&kCICU5}K+o`_Vf*iUYKgZAh7oiYJx_D6<*Gc$D*U)&nedDBxx5!R$dk3MaLL
z@Jq)y$&U41m67RYDO>HaL_G6x8y~XoZmN%NRoHt_!W~X*=!Mn{Iirf{2M!?8y1MY{
zfWt|iL3uBRf!@K`1DNN+Wv*I@#KO{rRd~b2gf3v2CJ%mOk=@3DR|kALslO}MCUtyh
zbDz@zh0B+<gKj%L&<~&s;N5Ji1)+@n%DPCZWBXR8_N|69&zoWSEn9j(gmCBKd!%fk
zh&baoBH@`AMHWBW>D^ZYPjb1C5d}j!+t=i=I8b3u2360ow^lJD#ERtz<jtvBIIn)f
zN&}$Nyi>t#8dH^kX{Ajp?$<PS0EsJI6cq3;tALuj4c{c}0)>?(4V^!&qOFaHHP%`?
zA-PI>d>?;hqtbmGHDG4x3km_o;9g((p`IH0Vo9>b*5EeSfXAKIdL$*Fh1@12lEhC+
z6oJAwed_F+(o>!@TL!XbVZspv;4sD%NRYM%lIRuPX$>*E(oIT|O;E)>KHAxO*<>LV
zB(N$xJ~W_A3o;eyOkDbPA_^a>WmK;>z7h}D%GC|8hwD`A*LL?NZrzZFAm(*pAA5IL
z;){Du$L!+H*wb9CRKN<3%0N?0TNY6x{luYB3_8B(o*&j9h~6up$dI+piEo}al&pzn
zv;fL6SlVvY#!m-A2lv#4kUo-3=trNRhD9G5%hqP({KHgKO0yS>8hr*4@oNd%iP*qW
zsyW-%k^Och1i#He!qi!t>;<Zpb{3HUpDyzv<}tTTe~wXxa+YiK%<llj#Y#gJGF38+
z+{JRo90(!WAK>E_G?vb}sLUsJpNn!Td2J=Au+sHOno}}?8J>ytAeXr2yOw}3|D0SC
zt@COOi8>}uAfqgZt%XK4t7&R(!nSVv92#(E`z&g727FtNg?11u*QnPrl0<10izgWu
zJF%h2g&jP#5<*98Ot+9rXCo+S$rK(Y)X>57!tbfdq?k`GZsNIy+mC4(Yx51Qvlyq0
z5q<fVHYav5J==k})u*kt0T+!1>b+M<cXU{LVN%$<J+Nb+94w!%P$~!XI3OtDP@`P{
zS1GBW#+vv$@#?+N1Zbq)kkrHiC}k-CE=pcqoYj-=uTw$SDU1LuV!W05&@_GN1MYEg
zsSn@hfO1&`LjhP(-JT5v@|1WhRqxG`Es^wSegH5*t5x;RF%@;LAvdCq)>`5Z6u@7d
zK~>A2Ed(lQ3UEUvx_m$4thSaEVoYU5M3S=0>o&TMd)pAOT=89L)RfV~pzN2<QUMLp
zUv?vE;zu<=KP3<5nKV?pHo^&M&!2xZDZo|-!O875@B{Uy`mF(s?YJ_mfE-(T?n0q0
z5NBW9f`PnkHI66ga+Q-66ztLVCHIaTof~d>H83F&w<kZZRYknzm_LHps)L0hhDC}!
z;6OBZg0UTIUEu+Si)f_M7KwSl-Hci2_%i)&Z)fR#milTo^q6z~j<K#MCz9BP)!7|{
zi>)5t$|pwP76Gpm*=ma~ImodlA*&dY_*%x3kl6U=NF8tEAA(=64(O+}w+%nnM>|JX
zc=70_3s_rHdANF3hQ(?&n=tPQTGiJAG@UsmuwD&Gtz!uL1VJm9D4Y@i6lM`Ez!t>a
zE)l7X`5t{Y@x|9>`1d70t)VSmN)?qtOhoiNCS?d>5k#s2J2{tumxlv@^hum9l!fW&
z4hU&~_rmn|d8>*h5ipW8YNd<**>p6y7+7c-7j~N<rsO`razWOuD>lU!&pzs{p*`=N
zd)P2Js}h2)+wKf#(sb4w3}BvsXp*s!y=jlDEfJrd6#@#?;YJS1o~I$ysAt|(Pg?}a
zxA00<0!bICe3amSS=t5w91?x0txz&q*0={SpSFZ|!0ax7AG$2RSiJ6wHx1xD2m0B(
zB#`sc0b%g~!jz)YIlbnatf(aT9@oiNpp(%^p)t#5e}gmcT#R$H9GwOQr*L^6qRlP|
z^v=j=WZh#7NKIr6G?z>rH<@c?p~vPD1P_vZL0cmPf-hd`v6S9O?|*x%8!uuzHVqWF
z=x`Vvo?$hSI>eg{E(5fX@rd5=A(_`Go<Qm<18SFgz(~$&u%H+wt>vKodP}4EP5zy%
z2Rt98v6?J(MaUF41T7UG{?d>;By!KC^cW(RydN@!0L5OcrA{$Jvb+r=5gX=Rbq`;h
zEClpR-5evV5$zDe0ElF~#B0+1B!ya9%6u6FCY&t!vQ;Hj2n3<-b##1Zhn3I>%DzNH
zs3Cg)tE{@{$b3MY^{Tz3ZJcBf!HzgMYEICW5^Jok-P~|SQQR(^d<+q6;*3-_=_!7G
zus?$V9pqM(gI(8D0<4~sX-rSq5tt#<hr^VWap{l9{0XdEz~?4%Gg^+#@tob4ll{!3
zUlftwWhL0fo>dGuKKaQH%C4mXCaPCzCxz$&vHSqg%@t*~yGiaci;yHMbWU8e_zfu|
zixSlpu5k!K*u7j1_pzB@l>&TChy_+-I_NgojrJ$puOYt0l9ns{zRTkM&skgCf3U-f
z-J9X@KT+8FPnWHwpixl~bQo-D@!eH?ZLLU|JWOpNF9|dumJJjTn>c%IYo?zb02qnM
z4sZDOO~C_PA5y9T>z?R~u(jtws5Nk-9xpf4ZA{d4W(+>Dd*xTSMV4CQ@S9q=k#>I4
z9?No;5xB97L@cR`?4H$dhZ~U`Lt>(0m#5w|O!|^N?x0W%sLtSgKsW_%q}=vgZF!+w
zxownM+c23|G|c&wkN-M@XSpgdlJ4M8^Y4}&G+?FVYOBZiM9uLDu%z-rRB#uwxejiG
zTfO64VN3iaRziXdEJ24ibq0SkE<J+cxa_4|I2dO{-O9=;qOSNlp}ZY9Nr|#;u49ZW
zyWUcQ1|E-Hq+`%y%T~U&s5y50t_N%%_P{L|TQy-OaiMrwL@g?ci9@-$HRgk|3Q3N`
zCz|`25Y--hZM)8fd6vax)NU<IaG+QbVn|(1;VbuCz;1f5mr&m6tA6Jqf|Qm)<0S3E
zXEv~?TNNbP+NI;!4U-Gq?%)y)3J#2fqukyW5e%_2o&Hw#LlFR^l_nEW(N;6K`B53<
z*?^VZY#)uTxj61H@EpsyxVu~n&{;1}p|hM{<*@4w263F$TxcangTi4B<u4QsWGmsn
zatnsECsL|WXSpk!Aed)BmZ}b{n1@yX)>7uWgs1snKMCL$-LdXgP4X_d^5xlD{1@&G
z;6y-w9U-SHw)KLn);p6jQAihZ={EQ|E%Howa(&*3agUMcpOd1Zkaku)yvnQqV3N`!
zkg23Bl$0}Q9IQ5Dr{aNDoW_a?i_|^3JQ@RHDl-L^N~h75QrRL34SGv`d4PypJqa5j
z#D~qdr$8yD?DHo?yjH88l5Q4uXc#IS!X%J%&6JHF77fP3bFb}rG1UaGlIeLOt1B=E
z_4I*7f4eUVYA(=rhtaW?oj`1OniNekP$&m;3OIn66L^V0i7H(w2R-fH$gtbXHn<=q
ziOY`5ZoFp(pnvHH`T&g)lxT)}jqbG@IxwZGF2~_QHz(8qhRe;C!_^$iXjxs~a?ss+
z6$N4xz%^{VjG$7;RC8lpw9E@bloX+(R?(o4*A~NTm~;(y+v{9Zj>=@}JE7ef$zLdX
z<G7o|%zO?5>KWQxY>Oo=nEUkX#20&%z@P~$--_|9#v;OsO@L(!%)-i-0R+OdUP9RX
zR2S&Tsw*Nw6nlm%;66>g9GEaa8wxADHjFNQW0F9sA2u6X9i;KrCpr&yf_C(<5s-}%
znt-&RLre-&9WKK6E3%8)#q?M5;65+<&*P_;yHyau7nVcd*d6DZakMrh?6<+yR->*M
z8PtGHiyz_G{`2ly30V;cAl!%rAc-9Y3pl0vVaEYZ2p7yp6afOv*@lODu$lO`z53{}
zs4*N1G$uP2>frR=hReA;S8jLGs5Q=K^SfS8ve`s_C2yi&imf%q9W!V34t3%((K=(Q
zIv(eVJ%x170HUU=M;pA%#bF|-Q46?Baxwmke$I<HQa!pv;Wpu*|JNMgqXTuusIiOR
zD2;3l7U0=S!SID5ZX&#>7r<v8RsJX_!)SGo@&fMBg&O_SvXVh(*Ht9{rz+O5{+goJ
z7nH+owLeCGr+26#L9akzt|40RdK^mXSf?B!1jjW6<{5gr_p&V+uthx?7$ypJ_$}((
z6RtWwbloC;!39%rpePN9oNA^(mG9Vx45g4}Ix=GLPI6t;X$I(wp2`NFZr>Q@?WTfI
zH}ZhyIHpOSAqr|0Bjg}CZF<r+h8l>yPdD<h06SXg1U{9|0gU65(Z;63fuOhlde4iw
zy8(5yvXzgkdKFMIRPteLcgB|__~@&y+NzpxGP3qb;w}cGcwwouJSNQhv<f6BFr~h*
zazjG=b7BH3MW+SCA-z_<-I=ID+nT|pTT0u2F4lxg31DdpiCZ3cVu$>#ukFhg0OL76
z)j$S-xI{YnMmExmEC?A`vY)#gg4H$Wgsi(bN^j_<BKMoB=&ayMVn_is?7SL1$5HGl
z_lFeRsy;;Vamr}>xLAzQ(Im^2nzW&D`skxS!B6Wd_+ePB)Bg#6+K=$%@=iOai$y2x
z*q8l`575rg>sj@B-F~N}`*T+w-}r3N(Jr=Lxo};lm@-Lr^{^cEDtHrTFJ2KgL4kf$
zT*n4a_1{l*^;7v9fiVViHR7s2&-FX4biCYG#)FG*9-%Z+9Nl(q`I)D{^?SgU4TdhO
zUHpBGtJlXCRlG*xa7I6S{ohx6OKwsWz#HKpJ+bz>hetyoGCw!*UlsyXj@a>3_r9M#
zcNEgj_y(`a%>om1N;R=g&7EK~W_%_2ppmoft&@0{swCxg7<^&LA}(yyjw1(a&iy_~
zpM4MPvvB+8vGuWcflzbkPNbOBzwPeU{%jjJ`@IixJ)iuW9ezr1e^mE>f01p?Cnw@)
zpse?E>gwlCeh$X{&u`y4^lV+HiG{O_!QcBv;5`U9=&$llVLw>?f4lD=3;yR1{9~ej
z3&0=F`i~d=d(i&J82!gY|4*7I+x`u;wRVU1(+%ZKje7p1xUhF~K#0D@saq{BdZpP(
zVLrya6KJW`fkS^5_-L&ka%_%cj~psXc<1ssbafIl9SGUAr=$qIV9ux{DXiV@@!xv1
z>tR^M{aTV8oZ4o$185JPKpvtDfHsK-ESL2{>!}n~6X=#z*T7VtVMrDPOvx^dCIgta
z8=lhU=OhD$umEb|?MvxWF(_}Z*DIZC13js7^=5f-oCWNul(vs0(d(|LGb#9QWPWQk
zfswlfiJ3(A8y<nT!odtLdjVg}CF-`;M>K#YVZEMtTO-ZO*yplz|AXEdy>ZQR7Cq6p
z;VO8QBw3@Gc2<+w2N1j<kiyD<kh&lTX0IA0@nQ2-K0q<brHS?C@*}Fqy+D;i9zRps
zpNemabOKtqBo)juyEzaGO8OA^P?3BXjAKLiJL@gl1lDVw`EthON}wDtKU$1HD$(w*
zfmi8M>uLLC!2r0A+YK;P&>LP)WB0g&(c$wMU_P&H4kpZHCXkFZHHctv!5qlUIrfi>
zd%BEj^_Jf25nDOOz{$6+YG1R9;4lI@07X!LDs^nB1YYIdg(rGeVL+o@M3O)fACH$Q
zwNQL&@AqTW_dlkEKj1;&;Gt|cpQP~I{n@H~R^#Yk4!d;#mOpT#*#>EA&!3Am#iQ)1
zbijPjjNvFYZFLZ8XOL`v1mMOPO@~$uHUKP^8E}M|dPNdp^_qg3XhA1QFed=Ue$`>!
zQ*Z3PBv!MM6YcB|_<}DofTVO?>d)8oeQ_q1eCp4RW-&$`cb&K9)@(J7irk|`2ZOO@
zU|PZ}rIp*g6W{4P+4r_mitxFPwun#N842cVYTy>W%V|Kq-*%a?QWPi44kER#)B%Y|
z=k@HEk+e<q`zvNVN(Wv&>+fKj?9$barDa=lhy0U@`YwFvP?#;nd1ZDSzI9`F!+sUC
z5r_(5m0my{tz1orRF%|#1veu8fj9L%_={$vYW$!RPYmiP<l;?NQY}|67|w%Hx<}et
zz<|d}EtnH@aDaJQ{v8x#?XwySrI~5DHx35s15D_m28PRCV6zj#+Wn575eu*Z^A`(C
zpm!NjPYYVY1Mw{e$j0OYV8E1)C$V*k)B)L*(|Q}T$C=Fo4b9%XV~AnQ9(bj0SnDoP
z1uZv2;xX=W#@{#1wWC?huFC6oUGGbhK}$)Tz=wHrW$~dbHcXi1gF($dwVW&kW2p*t
zdT+#D7L;Tp1OKq&TqEKWc%E1I^N2xax%ax9DT?mGC9Fu&OoCZ_$698%87!e>uCmRU
z2I-L~*Xo4+;i~`bMK*0cp@S-)pVG5f%@DY4wz60C*F%Tz_<_!GR~SAt%9uifsP&x(
zya7bM_uLHBa{+nXRw{fiTv%ruJvPlR1=D2i##53dgz;ZTdsbV$-M!%&$gye4Q7s!o
zVnI1)0-p4DotE&&8AXMIS#Tc}1ZI&82G$c*!Qhm;;N`fLTcF!J+Xzy(D~_mrf@zi6
z0JQu%8auuzDY9e9&SG^aq3s&6l~o{(7-p4%ArF<pyopx0QyC*3p44l4NGrN9wv@N}
z$ZAEw;o~#4&&rLRA6&0J@qH9C@Z-)q0om_*=6*ae5&PmU_g<=Zix})jRGF{kY<a*a
z%V1K#fKa|%zjuLM;L!P-bRqQEb7f<7Zbn$co$6fTU}HWw#HA=WLNv$g_!*FCqxWm2
z5zOZxL8JoiC7o4I;vI%oHjmZNMsw0(p3C|{B&v(^sJ;ryp3wq~SGwcBo!7qwf5r3;
zKm&X%jL|d4x0v^WqzijGM#asA!jqCSJoJEWIF<ksM!>lB>lu|S@c2W=1r;`|GDD&n
zM0|At)N?sg115|Cv5uCbdw_f$UzA*|_qU=+Z8cMiWxjAXQ4F*=0MmT5cg*eG;9@*K
zc+LC;ug+wTz>`)yRs|wC>|#IMDZe@eCY<s93Ih~bFiF(e&BJCTht0RDz$l%<jcW3U
z;h4XK8o(s6ao`c?dWNrhLdXqk&NXS9#`jk+UduXQ<`2lr5l66j341Jk&uJhS0fX)E
zB(fWLbZRweCf_IqeFjMl9Yox6Lp_-7k<|i?Bvs(oMCJZ<X`r3|swo7J<J#2(2iN#N
zUd@>u-UAGnCpyjw%skRB;X^%xXoH5ML3DUkZNO^w7IZ>O5x~EKS+oLAVM&nD#8&d+
zAU;%d5%dQ1zlYPHsxyT)NX9y@<~fj!=Q9dz!RR(wO$RbXmRXhOu&?97R0kuVhE7#-
z;SF1pjh&wVtu!pQ{@R-Wn=jwKxvB0MK53n^K3*m%KN(mb6oFz+45%w4W8ggg;Gv9O
z=Ng(<L^tSsepIlPnsWwWO!P2Pfqd}+lHL3FzvR%ZkBFDe0o23+^|@rCLD4~fX-)ti
z+T0UEV#5q5QOyKD@Q_Kp185khNC4P`OasFKJTMBpb;@VK`o6b~93pP@0jyP~Yi8AF
zPzKiuM;H7ni|^K1%i$luP?Ao3kASD?AOef-+Zlgn#-G5*;D>pOnQe{jkk86soH`rN
zmo5cy{Q^vhNlXlD_dMPrXk*1$PaFODzW{C|kiZ1A;H@mCZQaFqtD1*kxLJ&<07ec(
zmcWJuK-X{`uz<t$)#BXZ00ulY)gV*VtU5(AQC=~^R_YiSS|p6Lf|lX&r&72(w@~i}
zZh(s6KQ#R?9G!CS=pUnB|D9sWE37sCQ3xH8fweVyvj8@&0uFufR0p>ZnyTBjFZJs#
zqh1zx5GD@{lSTl?CWD{hQ#KFM{0=~-V!(8Cbr6lGnHkg3PjCMnOJ#53TjsM9e$4#!
z`y2A$=`VhR@n6-<zf-;Jo$I?595;BB9Q<x;12{0<juPuF{_hsH%Yf<LEeErupzJVI
zN-)j*8U5oeQ~Xn$#VaPY+o<^_b-M)&+3?4MeE(Cg_(NA%|H-Ukd*T3=m!$PK^401p
zhmHcNqjdDmxxcA&Ozi?Gt~86B`-dF#-_5+b%!<+t9w!r2l6L+}gZl5Q2N~yh4**J!
zIBd+^zc%%!iRft|$Urm1+<(`I{_Lxt%ue#>O_eWG{|*9PHOkEDj(a?={W!KiyY=^V
zR(P-e(}te5-j2U<T&v%(UfnSzfqMThZ~eo~cDn$j?GtxV!|#7n5dQO-H>~d1r};+Y
zkMIBad4X#NR$ao|DggZ>#{bm%t2YbuS>3UVy4>IVw-eT@E+NWdu*UzcPps~E$Pj(z
zZ*KVEs!RM61b=dH{{+Fm!M6Vx7XKdvLB;DSZS2n=mOme81lQB)pQF;tYAvgZTg93D
z8bNK;Z+cZu85)eG#YOham%Nf(7@qB?GK1MVWOZj6(W#MlAbeE3wW~-}`Y&Sd=^2n@
zZrqjs{eh}oMty-3LG+QcZ<qmT?p6vK59uBFG1XQHk4+Br6G~R<j}4;-Q!7ncdkZib
zk`0`Rkzrq<H-5)oIHHexrdiGmK5o*=D15(Ee0jo%GR(M&aK9cit+$|%cC7=<b4_$3
zEF(t6lylt4G|&1bsLNka@W4kPLlykd538u$cjuCMOL>|j2YVfRWMU-lwAcEC!&z$i
z8&_P^quDRCl#&(r<5r~KUR00GuUm<bUt!k;Y|*59EEiLQ%0gW-mfPld%uwEdYtHNZ
z3XL4KKR1s2fxwvh5%a{&Kk{?-qs^M_mMbsclIcwMjFN&cx4I{@k1?=iSQnSdy~sbm
z*Y@_OAi0SmF6sX$*z&I`&>B`udUN>vO-}3KpgYIf^Jc2T%XT6iA|^E5Y>*ujYnj!B
z*Ygh$+&{17i7yK4lq~L{i_1ubmk*v{CYcFmk*9}k;w-DuoqcWw9Irw-`WO4NT=F)G
ze^wtXa^O8Gy!2OH5aHmy*<-(#6$$0Q+6i01_&ouB{lmVP{5t1Op*<$<zGP#K09NZ<
zD*e3(6Vg?9;2H-rnmH2N=B=lo5t}igl7(O+X3_;akClX;5zBk;<*uC;_I^bpXhq*c
z`L-};j>Q^W>P=q0H%BiEE#Yzsq|FdV>+2~F&Wvbu!ABJhmDveaT)0zC+H+O8h`%6f
z#8pT_y!iWycbWFR?NRCEcpX^Iz`C>E_e{1HG924vEh_s}sD4P6a7JkI{g#XvX8Rrc
z;pz9ZQe}4tW6R9KieSdoq>;0lGxI#(E{>EoroYXUW3<&}begK-yKpdX-J+i$;f-?l
z>23eHEdPW-pUkWWy_3s-0xF4Jf&RyC==9$bvFa>#fM}?W$EM|D<eAD%Q;raaqunn=
z3X+?8pKJO0#aVfJa(Zw#v^}*`%uG-JDS!){`URjE&8(8#9|0<qUZKM&?<g+ZIaL-E
z(2jkkQkov)aLwr3_NQ6*MdmOQLJC%I;)DFSRf0*O;>L$pSld|aA!_k*5byoPM5fsN
z=D%V(LOVaNWj>b}`cZ`fH*8nM6dL$p*!nIb_t0ATg_U|$DH<h)Z^px&Id>H7g8TjF
z^0^$$$fKSvWLcI_@D6eMhaG<&3lGy)>%0#=N$I<Vy-n6Hr(!8pScq`16PC<9iS`ze
zqgtH+2YB7MG&@%**<X34#`5^oH!3l>zqH>**OjUW`*V;J2+T&|{%%^6<ALti9T#iS
zjT+<Ulf>(K<`Ub>axEO*dzE|?wGZy^Cgtk}oC)xDSIH{Mn6V};qlP!?x(d#``cq0`
z{csfy4Lo4czQdu4(*pi#natkn%5xb?t6m+^Gxmiil8<qIWLcgctElU$&3MMs=)&kr
z-g~a9-(~Wr2Vdd68DKL)8~w(2G#a>J=X|?7c*07&M7n@=cD$uGg-q9T%#C`kTG(MQ
zQ1K^_Vbk{ZvtXfJXB8e+{z$h5r~1aIY#z4+JR3V*Oywng(f9XQ`Rp`Zi-tvbyNC3b
z7Ml@fTN8>3eE#AblUL924~hxQOdd6FDo$7Nb?49OLECV>s><or1=k-r7f|Fox}U2m
zB+vV@dU}-I+Xa97kJbARC~am4%2LKzm+J5DSdVTtZ(b<u;SEk^sq;!)tpa9!#Kc3q
zyjE?Sam?owSC%KPkrFX+|3TJ7PnSZy#tGpC`#&!(6;D?!Ocd#{`(fdu!iSBu9eP_N
z%kHH~bl7K%ozWz5=UZP>N4&bmo&*hq1<4dLX`Ro>FLeB)e|Hq+Uxu?E?fp|DeKJ_3
zEyX?5*zX&5j6508&rj!<mMZsHEF37Oju?t-`P}x4YaJQvTHK-Q>dd%aGxf*Q%s+F@
zzn^b*#;RtNVJ=Yp<4J|syA?&Qb}k5G?JjZ&+U2T7zg^Nb%+&ya#Lw~}s$CZo<5*bF
z`iqIV`%K|~mTM~Bt`epjR+ch9O4P2ieZgS~#X_?Ov;!`T-pLO$l)%<MXC|rS_>+#R
zzbrI%><!<o^viGrm7L*nn40Q!Rb1guGv*dqz)w!wckcRrl?jngl3~$VoSxYAEiwx7
zMN+#~W=^uXBU5zhVQYB~uWE8S)if82F2P05_|C;}Ifbl8+7Ie|tz3Fb!Uj{?^8Uhu
z3|84wDwZeqgHd~%Y;Yd$flgV=^t<~7lwn*jvzg)AtTSa6ohfk&>_d$#?bO@>X)5bJ
zBXeT!eRq--vieU`OocxX<r5;}J-?q##nVmxZ)S>|wHL<Un@J{y_Dr7Fxyh7BVggYP
z-%idY$^5o0t0>Q2+3zs*UAcRJi(OJe%Ar36vlX$c=X}Vw^~YuPSNeKyS**ObtU~K7
z$INLlzsUH^etVvjWQO!nk(RQG8_ny`b@eL$0*(ZVg{b}<X8F%?rhEjP#n~HIzEi)z
zsa>jBm>aCjYgOy${Kcv69iM00AvkCOLtitXzm_T9srw+#>Eo78<55=o9;|P0znIf4
zfj>R#dIYfkppQk;|6=UB<C@%-t`$K<K$?Jn7^D{wmEHnKkPgzD6cq@)cLE3~RYd6>
z3{{FqFQJHn^b&gSy|+Lhl<z^k_nvd!@7{C%%TGh{?7e5!teIJ}L;mw^aj{>+J;XM}
zv<GXJbMib6_oZ!T2cWd0lU4T&kUAvoM*#HT+_2f;F7o1du~a6!7*C`qCUjozJ2e1&
zS+UB$W^(?&Iq=urix}x{paqrsY2S-X+dGpB1qfl1!?LZdAC2402g~`v(I!Ws$I2TU
zJ+uaO<CX~@1yaj~ypuN&>D>p9B<<h*DEgoF<ZE&vEh(-}Qu@^}>`<OAA57k8Rcg#0
z(jTeve6i1G;M|UnIhrqtu64ji-kGqe;(B;<$)WyR*PrA$|3<jqFMK<Jcx&yLS<3L!
z7}~X`$qG1hT@`^#jOq(%l4r<HTQ^q++cZ}k2f;bDFU5X%Kf1XzT#s|<{|62Kwx<Wc
z(<W44{oMOoFMY>6T`O1+B$T&zn}R(@)9=2kSvX2gmP2S1M4yHcb_#lQE>szNuy$D<
zZrM%&cTLBl>L#lzggvU5{`$CoK6LHXi|!bTV;TJ`1X5gs6`vjjBQPYchg;T)<pKPq
zS+$m~Qe3l4wUX7s&H=IEo}Kv2DdQtWqPjD?d7poQb>^5s-GB7Gf5f_fFX&6hMGMT7
zvVZ00x`A=g$`T)C+hWkpVK(eJu!H?V(lu*GWWvf29_1720h?!QP@0gvn67YPqH_8=
zWL2GykL$DOB>q1y;4k*nu5)3vFy~~NUpseq`>=qkal&LI*>dZS6C`nD{T#ZbG@Zx3
zDf;T%_JEiUF9`B9j`h9_N3d?$feuSqH+%qBy+ALzVf6YaMNau>^v7~%bIxhif-=+I
z!GinPdE|MVWLo=d*jl>23Qy_sWwnc04o{cbHc$(Tj4Cn?WZrw{%T-X*E<wjSe(}xP
zb75}d20EHvNOs;!=K$-ekkcG-o`+lL`!TaHo!XZP)3;2%Q(zIlx`a)4>GC!4OaJZV
zEpaRvaT;-DW*eMeZ|R8^_qU{A3`)5N)8HSTo`iObB28rAPRY4Bk=_l^;d<RKH<!*(
zusNH%|MP=?ezR}fCDNW36mHnR9^uwDmNH`KlLYC;O=mu<SzG=*k?#T6NUB+GW`;T{
zfb+dMcpw)IiC<V@@0S4GYZ|$HJ@<b<{;zMjeD@W0>E#`sS?1sGPR8e{qL2((-Z?Q*
zxAnuM2p(c5-xc^Emp$^1^pjusc)gygRWZHd>r8>et<)>yqJOhVz_%PoUuI0(2=5l!
zr24r}2g1JfJ&wg)!~<QE3*R1{9i(%+CNBtFF7Q7<Gd%74@O7yXdB2vdK341VBgJ>D
zR^K2sQoet87#FMQnZkMhYH%u5;w|X+fdqeLKNjB|hnBi6w8JHET$6YX-P-IRI!WFX
z5e=t3y;b7A0_xH+v;8ZyU38TCA&${#@uMZ2-{<+i4@umIQXEUy$LH!p?_WkDj*CV3
zS?4`nwe{+Ys?Z7vZX3cvDvJ}YD{83L``>r3MAec8-ETUICzpPjO9&p2KcGeS%X)5q
zF#AV~d#?>xn|%J#mGs|lUnjdfm-2BU!Hi5}jc>A?>tpZx13U{YPEkEm9k|3h==$vG
zY1wP%NV9&-=BDzXbY54?>@&>(;M|h=x_(>LPaF7eHol(;Sm3cLXei*<=C8l>U2rI~
z8H9A-f3``jxZT>VeN5Fhz4(&VFypE#jUA_TdDmt+q-=F-|23U1)cd`U1{IR}5Y^_G
zMatXaQ&n-;pzm6B6WXOT^kc|Jt9A?U2Joe3wtr%#U@~B@Q;~Duez~Qu7t0s<{;@B2
zk2m^R8t>D>CXs@;%f>cm3247tH^&RkE51C@CL8z^y~taa7-m{`S~UaV_8Lhl%xBE)
zb$#P~N~}_=EHI}~qgk8i!b@CRxTIJgFt#=@=E4Lzmz8&o1(2ZL5_moqjUV=Zj|8v8
z>95gU<|IkMd3*Vn<$a08a{W|KC03#pHjxR~xN&Rdi86S|2RdpjI82fmd{c_K&d{g8
zoPuT?H^eNjs(bQf@L;a({gc&wQ9P4E;*Iy+l%m2ZIh{Pw{3$nKvDWa0K}Y&gTqm!y
zvZNpiMI(GLe2)Tf^C}|Ue*z|FT!5ROo!_3l@pHG0#3)$P`K=@Abju^4y$%h;OBAUu
zxS5d8cj@BCW$UN88K1QK#Psxc$FZQngCn5)lR*(Bfs(1V^HBx3^)3=F1|{VkvtH24
z1s}zYIYpXPz9U0+a9#I_lP}e)T{I{Mox`+4_U002uKf=H{g?Hn`QXuAF7OdGyv6wo
zP3Y*qC~nwUzgs5SOBqC%JZKZ(Z_Bc)ckEbVTrxqW_70FHtah!`Pd3V%g!qux@UF~F
zY32&_+-}6PklbLCuSeT#?>rw;?J0e2R$F5}qwMyiRDae)!_*H`VcVdZv?29{=)W2L
zAK;#V2f!`DQs~XYD?i_TMNj+#v$T_Et6`?HNFImnm*<nVhP@CA<KWJ%j#(!Ds)wz+
zVn@YZg^eRqTJmP)_34bUce%NPiWQNcA~zYS845v&w3Owiu0y@Ji)6GJi!C4y_=n@&
z5FIdy;!|@o9ZU)F8<RD^n8+Z$7Cnk{s~cP^vfV4tqsq;g$OR-{ZRA;do2HcM7yQHr
zM@y9d5Tr<CR=BP-3H-}Q=sZ;Gb7^kPc8*@3xaTKzKi)avR^vu`qn(^X!>LEDNq2`S
z5Oh6MNe0?{DDJb(#CP25HvS=Rc9yN}-N+r~j98bwO=HZ`;y03HmsS5@N4^Onxje6y
zNq2q3Dfv0tD&ahog2qcc>SfnWwdk!_W*B^!**W`|w<6HJ#v0Iuzj4-6TqC`cpsjt>
zPE$3(qe6Dr9b=fmt(b3!k;Ykl`hh*@5oOlc`#H=&ScF>VnOd%+OEsUw{%4P=vRm+U
z^S3g{*I57H@Wkn6a2{OI{U#=Li|&_EedW4brC9o?F_q(dWmV|1Aaf#Fa^)$3mxxKF
zlF)%lAH@-ibHzONXRp%Ni<gg2idNa+-y8KKL2(LBK{a4cFw}*xOv)x=G}7(I(3(v!
zbHqZG&P$)T&gQj@q<PsLX=$>)+<9Bd^W672Pfj~IAMKpNd6Ag{%5lyaN{+};^u`6?
z8_%w({IaIMK@~a`AeP1CbQAo*`L$XjU%rti#=#UZQ3?7rQ#EN)?`Ls0<TTv_2=U-~
z9V3t0Nk7a~)wzkOv?-x#Fo6uCn{tvw#x6eU`kWsH$CI|AnC0woRa2JF+!3pSQQ*RQ
zGq<aPo~ZfMuESuf))F-#?S#nK_9RkpFjS%MaGvY7vN|VaUZB(Vh?wP%b{?wnI;#ct
zzU7k+izKOLy_^5QtuJ8(-+v#me5Dcn8{sh}U%#*7=-37ic1@b;Ie%$fAcrpBv$>-r
zX+>`m%b#1eDB8-MQYB0=DdOCX7JSa}d3}L2!G38>n%sW^+W9HOth9PCQzs=@X(AtJ
zaGPDYc%!_1Iai{hY-=*qZZ^%*A#B{IQlQ&_{beol=)-c~HIoC@!`nkkD^5k4d!#8*
zrWWkM)KYnE#4uXSRoad*mD#6W^oQ~elJg&8U?0GDN&cBv!7t!mB?a&8m0xhbP2xLX
z;cq>2+AL@A1y$$WCQU82yIryuuIc@xK+Vigu@7XvJQrAyH8{YklmVv+@#aY;D^}>+
z|B*h6OHRtRV4l>&`{bFJ<r!qwL^3={#FyoKmXU*U3bAET+_5uT60vM_Bzepf?kGB;
zz0Ht961J6BQ{?};x?o>?q*kO)FD}76o1U6pZ0=i3j;!_ja4yD#CxN*XV37G4m#zZ?
z5kbNJ*lJ6c&5@9@X)~S~bNhTZgPc}iu&q$aPFTPQ84iEiACT|Azg&Ll3rS$i3}>@*
z|4n+P@OfknUZ{~yRJ#?S<Za58j_mNd4j9@CsOTeZBgr=vVwpxO4rc1Ccnc!T{Hp}_
z)sz5n;aKD14bpBnLkG9k{c2|62scxfVKeJl8{82rp_FF#FX+~jL_z6$)etibN%rWK
ztk<S?@HlFE&_euFLZb&NPWfKRa~fF@OCNDcjI~QePR`w2E!!R<0hI!k!Vj)O;Fb?J
z*6!bU!u|K`l<t!6ots|?%tV7i)g*s$eemb6AXLfcE_<w@Z`S?H{^4~FqdM&JWP~wP
zLK%J-luJ7^%2Ss$XmuVj8}@cOk4Oz*`dbkAN6pnfeY<*ybt@0qJVencD2c<n)BohF
zzSxqk5&?c||L#PM*fVdF7cCBc)F#@b&x*q3BRx~^EUtaHj^JzHP9L#%8UWW_Rl@xz
z0GB3=6?eHnSZ?_@o-9B2<u`s~;{s8pto=-5yxpXf>YOCY4|%itAUXHqspSbpt?w+^
z546-P`@l@M2TJ-MW;QF+Q{T;AiRY^{^f7P!A{~M!GAwB}nXku8%JtS=Q%deKOUg(J
zu+dw+j9@$OfHH~_jasA8iWk2b_TNzCZhRoHFNN9T{5F=fuQn53Lm!b&fKLleE9kE)
z&)|^j!wyAw6<Da?<_DosO{b)p{WK{nZ*faT#!GewN2gUXKZ#Ix%B#METjnk)EqV(D
zM=;vr%E&tSYh*4k@h{ko=#FCe2h4_A5FAI<?~G}}ulzF~w!I)76&d!lzn={ktDaK6
z)UepzbLGa+poFKYQUtT?ezH{LhEcOT=dpXY6)vr3Rj!n^Mj|;ExrNP*&g8CO$6{xR
zcG(?&83(FIk+HM4IlUhoQkI-_82JwnlT8)Y5Alo<ZhktcLxl3>@`E~<EqHl%3CCRL
zzDEBmApeU5{eRL--&5sFcxIR>$9KX%DY!2-v72no%R~Yse&|%OqL7=o+G5K3BTgiV
zX{q7|AbjUBStUFp3x6p(@pM1Er8m6na{`nvl*3O>vOVYqe1u#qmsyTG(Cp)^2kQL#
zh{YNci}#4G2S@gnZMKwEMPY6o2=~eVKGENqfEbXAjQONkj<fxAFveHnWuuwj{FXOY
zls}{%-OHJ$oUbyLbgDb8=}?!b^{B6FoBV9+03ob?TpLRXJH4`hL^ePVZ`j2?G<*bD
zo~?Obm__6QVmgyc@3cHzJ~5wPBR3q^xBa(6{?7~n6Ah#(2YJ~v{P;iJ#*1@X52IBg
zPqM*Y82g+|EI~aHF0&Vr*q7DPFrJhjag)uvr`ZJI-Z5Sn?6OQLcee9%+bAH1EL3@h
zi9axJ4^=vCb0v%Eo}M}_f>Gf^frYE=x3^cX{F5Z@;L1g&i>v&|hU8~lI5wCzKNRAX
zgOk6p%o({Fz66(&a(K@m1i3vOA!S!TX)$6Q)2_vc)bs*wE9USoOt5LbIS*=*x2e_v
zhUTkM=_L6o&&ADUBg|jK$tNN3T6toH`RsrAfB5eJ{U39>Vh9+|m2<%#w2v=rOB{6F
zDq^5NE9gucoj+#nkNfDOQ@-p&2(*e!J%C_yNRLyoRHCkXZY0Mi(o>Wr<&E`lsXnC-
z%2-&Nur}1J6y1<-*hPQ6iXfh9%KGczYo}`J2Bo{P^jC-WPpz3(?zw7D5nW;U&)5HV
zesjQZu}jj*d^Y4icgaJU4#@G}Eg!6a?pKYlCgQZB*PyFTG_$L=Z_cG}ES}wM@}#db
zv>hlN5n3Z#7(^I$VUUmEntEE*ma-=Ul0s}O?xaXod&QzfstVNw8Rz%GTl~|O#z7HB
z%xY$pCsy`TQ`Wh+?}C?(y>H*T{P!W@*k3$x+{A?)-nnUDk6^a=Xi*e)r~*y_><pYj
z>G9hrtC_YlJfo>^BN3Tv>Y+El^Tol2kNoc;ueDLgMjU_MCV)Qg<W7k{I#^0_TZD@1
zHFGE8M3X!zsC0OQq;n^rZExP#Vp4pEG(_F-bBdTcaFA7DXk5}azAD5ox4+wDFZlNq
zYoGG+0JENZKJTvW?nCKPO?@4ghutKqipuI>hpR}Joi9g`>s<Vi0&@^7-B}UJTX;X$
zPH@RL%mHB&2{|0n)nM_nU9Kb#TvJz6hcw6SPutWE7HdpFw~8Q${k1)Vy8Gzjk@B3`
zw|l`3EBEZ*;cZsbmqB9d6iU*YqNHkF53RFgnaQ^atX$$mY`wSiDYg85LB#*CP;np9
z9I}O7A6~8(CQvPd@;I}|s_gYa#TxpX-LJ}DK3P|C8P#$yWDVVzQ#r&2DmtPp`@cWc
ziG4eji9h>nU^7C*KhBabW0t73H~vxM2KRjI#Pp5Px$xzYgyco3s;K6oP7zIqDaD4N
zw*J7ebis$#8A+e&Ed4MWTbwH%NpUMd5sII|-Y(tduTKvjS)dbd9khNbh=^5)<|m(c
z6wO-w+M-*Z3i`YB@xN9xi;pHxo#a0eOgMgXcG>6MLEWj&$e~+6$rFrkUWfe2&S3=!
z{pEss(U-TlexYT?SHlU8tOnxK_BvZVt8L<XgHxuq*@I`?!>^y&)u-5)_mf+c(d8Cb
zM31Y}z{+mM>OFvwBUv3MO4piqNyF)zvf8H4FkYbqDvuZliko`~$swOSqK;FEpv;kX
z5r;x2gap#C6zhWz?$4X*=|T_jznz4=eNv>9zhD?O6iUP4)1ihPX=cxvP{I1&z4Tw0
zJ4}Q<x^FYubXX-tu_o-c){*~h<wVd-i$h$>JOYx{TK&OHAenzBq6(>Nh2Q|H44ZNv
zLBD(~?TRnTCh@@Xwb)Z*ljPO3?<u+6`2ycEb>ZFZ$|F3J2O8#O!U_e`5T@#n+tN}Z
zFJo*-35S}(1IRkt`j|<=%7;wOlgK`)vJMgQ@0>m*qx-ry&ZGw?uz#IU5FB|UJbmi3
zk+kzGL8+aGLb6%0yXB#(QYZ_aL-$4@_Z@opqY}57otmi({?-TQSmSm5NIeUwtH>u_
zi3k<Dyg7HYF~(px@b9fvjIzhNM(DjE&$9Vm<z3yp!h49y2s6adT1}wg-C6QC6s1Ww
zVc~bFRAo=x_1=kao-7Cl-x#RpJliv#q>MBxz6QR*NsFK1^ejyGUmh()W|sctfLmpv
z4sBnhN5(rwfmAtfz=`L}2>Kj{GU3*K;&IF8Ss~?!g)S5Fs=ecea`v7ZYH%b`paF=j
zt?`ld?cA;=!`~;|$ouh86XY@ulKUSLSR=twdnXdrihQz4mokKOoAJD#k3q(8h7yh2
zoFKBy-^1ol1I<6Q&2Q{}BMPE)&W!SVm@7-B&rIEhQh~gj*!0uLWBt7TtMY^iotYsS
z_rcbe(`rOL#`y=J87rRb<sV}S@7$ei8^d?cq^qzw);)W6Pc`?zN_d2zSDGuX^7XPf
z3PnC;AQAO>&pavgwvn(PfucT;OcmhM@%+YBAyRECLkhx>Wl@BIOOnUIj7n^(P;hZl
zcT)Ew%nQ69?|$?qR~H;`!5c{js$4@f9e(&cZOttCk%GC2e5yCkypm>+Gi7->nX&I4
zO9IkJ@SPCr?)U1){7U@W-xnM;*GwnNqXHEYn2*&)Dp$n5KuSV}qRxZJ@B)|haX{N#
zZ5!<_TGSqNk+E213?b6&XM*a`9JVYkuJyksBXkdO5){d;AJmj)@ojbqLWDQ+j=Ej9
ztF(KT)h2=xYD7}LNGR#?Fm$Bm#YaXbA+lYaXq+CbA~;_$2_O{Ty!-8qaH4(9zSW?!
zM<_u(sGpQll24TU+~RozS%&hh)x6$m%|s2@&c-pa$BV;X`U6w=$2psM1)ucs=tq?x
zT@S5kZRa>8wVk1S!}rfbAf%@s*L#vc%8YU@BWeNm^tw++^0(L!WidmQ6@a?M>M2+M
z-!Cl97H9NXZEjzZ-4maTDINIro#YeqP(-R*%={x`(*R`wL=Jk_GI)38;K=88doZ*y
z7z(%DvZgbn1t`eTNK>uE1qJyECNqn)o{#OS;oPqr;dIB7A9*3uoa)`;C&AZ<(rV$>
zhzYXYDoFTZJ*WsP3VSirX6P`hqFvTH7=X^perj`c*_9*=p7-66#Sx(dXKdArdQE)G
z+|gZ73GVL@pb`-V>W%S>D#CR@-LPNDU(0iUQYKpev|@{@{CS@X?YYSE)Vx7=a1i0H
z;pWCc@;|5n`c6_zcNcw2$QcC7`=|j0ufn9gGSU>Im)&w=&ylkUM~<Jr(&zA$zGY2Z
z;>GkLAc48UAzO3ZqJdS814H*zWU?&8?61jnUk3~v>Sgcp)BkVLVTJCu7aNckuuT~W
z>pASvkV9GP=NF$g2b*PW*?8xjai6AeP0bH-X;-kj-FDy1UEd<dG-lrAQO7xyl%KkO
zQ;BA1M!^iJQoBf<WS1nvTF-BNhXwvdAwqG{)}@roI_p!-4x?D)6c@fvTz7EP3xAAw
z+14Acf1%I6kp2I1wf%SE(X|uf^Fg_dwvznaj26f;oH5Y)@o4;k%)^hqX8DM!<1QVD
z`+@K=Zd>8)D2{-gqo*EP0MD#l3(Na?#^PdR+SwIBn)3U0t&UkK6%q^c!P3I@>a(p5
zFk_nj!tAw+DtqRlE%tCd1-SSD+CuQSK0BE=f>r(whZ4bNKnymCfF>wQO=H`rEp)Kb
zw7r{!JeQ0SDI>}aBlIa@$SDm;<}Eu3#o#)fCT%T|mL=$FrTclL-}%WU|NGj25H8B_
zlx0fy8MtuGF?)4pQ#{4(WmJmV4%u0f<J;<wv+vY`A2kX%s}BXHOA}oru7SSx9zQp2
zITZ`Z#BXbOQaR2f&{vnG;Z!<rTHaplomFDn6Ud#48ki_o@!#eywdSeNnAC1bx8q^z
z;MOnW`ZO!<_>{cR-6m@+PaUJV&XW5TX__y~3o-ZW>v0>avr17-AFK4P9cR|9dYGsb
z2^D}}={O?qE3}>xE}t^ykG83NMTVU3)-q^TqY5-hxA{`!BJvLymhv@#D<j5^$vw;G
z)pwR+H{@kl78sR7ye6Z$HU&`1<PgaYwpkuz9sdDF7dSqzQ-bN^pXl@cJyv)wrL}0d
zRl%^Ct_DC(1dkM=nb4H9$>x$mm4?a{Rw_-qH@?@WRdrgtG{DswD>*olSRH#=sMjEG
zqZ-g&O1m$q@%Va1aFopK$m`aZQWh-TV-B^t{E5Sy1Vc-q=Hsbky-K2c6WU%FX~I&|
zwp<ek&9ati;k@R=6KmM*>wh0=JRYa1XyhCU3EC$JRl@S`hImhs_nP0=cXhawaIAi@
zoVM`LpP(iV$5RjKglmwAMqvCtxi+L@3R_bn%*VGNUK#TEn}c(lhz*Cs>xP+6#ME{h
zv!>e#BcX)?u|weQiwX|yI1y(sr(OOkjIE2@wDVkEJakeqB}G;%uZ1+ZPifvZ1Up*a
z)E)UKMbI`9lYAU@ZS~DBgZaO%^#=yIo-FQPkT!51eVgjIXZCxZ{}?ht8;dQN_x9VX
zSf_eM{6}=UHeSTFre4GY(tgX=X8e~!dvR6XZA*X<HA^sJQK%PjLEgTE)4Vnak{SsP
z`f(k!A1u!e{MifOe~Q6w!R0c<UYf^r$8Yx=EIFk8p3?Pj;pi$R?#R<jQs`Q_%EC4K
zb|Ml59lqe{z}r!=P85eMP3I^Hb2UV?A$6(TMA7cza>BB76A`20+Z54&HKHEFIg`gK
z#|ZI{ce49#PBqPDJL}`}?`_UH7weGm9`o?XMtyc~RcfbI1VE_MY)3Tjh!W+z$ih?-
z8boQMFL_@5fo3$??D5Mikv8u((w~)diy=J3-}6uOYpk%W$NA;3FK{_(kWYkDwDO!W
zOw@u@4LIoFqZ%XiQ-$fRL0qKN(h=Qg`GEAc;%#EKq~%=>5QLHy)Nq=PXIQd?rmLRg
z10~Usu{AX7XTa<xC5njP{W92x;pGsrrTYmh{>!x*V0Zy&q>BL*MaBkEzPC#v{|Y0c
zEb?0X8@@<>3Z?fC?h3a^Gv0>pweVvF>U+Ug5eh40$m3iqQrEE~*eQjIepP{geD`DU
z46FY#9dF)j;0WYyd~WZt#h1!WFj<RQl~613!orO{*>_~boh}nuFS2c8I0JZ)iES&U
z5>6ttT|RrwaxEq;3X`JR=jeW#Y!Z!C%WOj(@*1Z#Q%kwcc}Q8qteeotc5i{rHA0P|
zy&00fcNTE)C6=@|nuYTW8qLy<SP3*NLN=1tvv@{$U5%bf58&R6cBwlOPCv}SFM^@|
z3~!c`*udzmcA1&X?yDHOQEE0wEbKcjQhpE@C&uz(ADw-}wP#1N@nOXJA7R_7uUL+P
zk{7+_{F@&RNaur>RWkP^oEF`Gej5Fo<r+4y0MbVJg`UkjB7<Wt)gYzwVj}obL3mnT
z+8wN5rl)sv74qnHZ+fT<x;`rHRz53LED@PY1wYQ*EiY_S@@UnTE|nBDJPN-k7tc{=
zb>kU|e1J0FoV%&~D&Wyq?03l*WT%0@>~Z0sB6Fpv4s`4p_{``qG9(eb#3-9dp1~Em
zHMQ!}rmOUHEI$mP)?AT(^{=7ALq;4R8Fz$um420w{%w_M$~cfvOQg3{>LGs8^*gdx
zyI;9fUbB&we*qK^Ig3(=-e#p<D}7G{2X*dA^i#0al#VojgV-ncCH7nBL6D{o-{c+Z
zR{U8V-xH-&v8o1n^L0y~bZQBJAoW$9()pr7FGqLD!9kns2NI?0CLodf8w`Wzg_+=>
zw*2r5d_>w?jjZ}ZYephv4!c>rh)+#E5Wz7az>BTxprTCh8nDCJd$3Y5;e)$?K!EeJ
zB03&!gD+}xsv1+EHX3Aga?8;&Ac#?V-}#WKZ8c5wEGXrZ`9N9mD6w^MeIj=%G~+ru
zmC)4fE<8!xmcnZDM>&+M7{n{X5N-{esiu%mmO+#=>&4ro1s-qds6Th7=AgF@(^NTP
z7sXJ-Tx<oC<s8oc9|HN6h1`sMzy>kRbqQI!d82iUtJ*eNUsvX)3up0&cW=q8z+bpX
zFiTI)<WSf48tq7R+5!XFIz&3bLb{~}-wg{V(&Q?_jI%uWK9E~Nd;%WSy}s%86+NBV
ze$a6%<~AGT@Vph5#rqm)M^fxDdOE(lZk1v1Y&;JvaG9C3=omq99MtJX5polrxOhw_
z7Yxbw*+-%ry&6fs!%?pH_DLo}tzCfw6q{IL(DXU00=t>=t}tkVyqw~;8}OixUJy&c
zKo-@YNqg`MlpD4$1o-iFQI<7MWVK&sR&<4gi7C3BU4DlkKu9vc_-SEEVPS-8oFVz{
z6z&VC;)egu5PopfZmDd|d{!dw0Z`MMV9qn9nIhu7B5CAzo8v?Q_Nheec0Ag1)XE|!
zY;6c(T`fEn2g*8^KK6;}ow()7%JSf9G`kmcIi_j6dMS}j;`HfXde><jz7!Suw!9FH
zV?Ouv-K`QY+nu+>NTs8ilfdb8Tr`0Zr%U3*?8)1{!yr8-q;7(W!{z$PTo>heF&LRj
z<R@Sob_kY_QxD9W?)<eLa5d->SFP;U$aLaNCja-j>wZ9}io4LEbS&Yd%h~xN<YA3h
z-8V<TL0~Sg##R42Qe-V>8bHAEuka#flBRd*4{e7v9xQwUg48uo(eXj}<9YvOKQ{1-
zf=uOOI*WIZ{4<R;|1}0CO`jDWP|<0c^b#v5sgjtj$bcvo9kek=f=w!66gJGNhoiDm
z6AP&&xTB(S0Ez7CY}QRNm%}|=TZy5aO<iS^E|sr|Iaih39=Ksgsgz`(7-ibr0wVz;
z_!UVy#SGR|w}VgSNkKox(u%+34^A9s)to+f{!qCy$pr-iC#LoGNS~bsr_=}YwK$(#
zZs9wPt5TM>EPC_j@<30SxHjMF)ipz9k$E}B?(NLNq%&Tn%3KZk_LvSvi!@#}&7HOP
zgr?pjD3L3By4CRcZ=^~ss^OvatOmYc2l9jzPs>YNI9(!V;yWj{`e8F$wu}3ri)ai+
zZ{cN<-Jofrvl?z8rUKF1bEOJ4Rvm`frua)ULtKo~tjBV|lOD}X&jH?9N?|>f2Lj>g
z+R-;NGKkTJ1*oVYiEl*Osy+r_3g%wK@iKRHS76({G2d`ioSu|E|AyNXy<m$D(nx(e
z%}_oykWR*Y9E1vCj;mdQHLe;~fL~yc(kwC{WbL#y-_bONYKviTYr_Fl<R#)KZg{=4
zMUza&p=92^NPd?F!E0Te4iwE5sn}1%-u0nYUc6sHkW}OpSd{GADx7~@cJY=J$}#R$
zIqn-GA#_yhz!3uQ1vgjx;{~N!@Ce)NQcY4xC2-A1HVSs=SQ&s81`YnOt@tFJHQ5Y$
z#gi9i3Je%Y6aFR26+b5%6!DLJ#vXG^sq&GQ(kV4dl(h!Y0wv}(#{?ra8fEp={G!8{
zenRj(Yqx?scrd6vv`bI6REowmn>nP;=7vyR;aZ~LgjM?N5Nh_uF}dwfE(fJyhwZJw
zo<62f&}^nD0Q;NsXX$g{U9ZdgYL{a|00{bbSoqbeIWD}^{VJxu`kvc~$Q2;2Uhug8
zILA}pWzu*J8o)mUHPO@2m$NX_koU$Q2$cM28WJOOm%Zgsg2%olT>G)^x&jD1st68R
zU4u3120}4G8?Go;)*|clJrZFL%YyLXvox8zZ9$Oyx;*Lc4zNZK>--&(v1W#A`<Y+;
z)lJuKb_-M*g_=7zFjN(z+wTL7O45}_j5&v)3{nwh=~V>>KfV#w&(tFP`7<J|j{Q0=
z(CDbEcP+#{?or0$>chZLj)Ma6k6?xDPty-;5)oazj-HWLBxg&x2$R<ch*#gXYPS#q
zHjJ|){YGD2qL0*oy?MM09V582k@M%ah#$_0Dnp*Wou`lOr}T=bbFh4774yELw|;w}
zEr;#=VS{c?wb9yJWp7vSt32n&ZU_&N^N-vDALTpZyzkcE8Od1+U87Y=*OojVOg%aA
z^fWFRo(4ZHVv^!m*Elkq+Zh=4d7YBfiU<AKc1t~BAKQSh5GkC_H_yrZC8T)W|FY!R
zb9HS!0jD9o_306h5_Uy8)58l@MO-)jljt-ou(f6S-0Sx;!GycT@-X7)TOi1%58v>h
zA0G1@8-a?>w(@r+oO<0rki4XDZOn}&hH@Y(yxJz2So5HXdPSjV7n<^1IxKwo4)QYY
zF>q#wDj4k230`ouC(p4&8WekyY|T29!9&C+-t0lckN_v*y;>K#yFm1<s{5E$CZG`!
zHMP-znf&W6fS}d!#LRYK_m>{8<I2shvSehfiU{lO5183c)KLfo0rd^0Hjd6rMxdi*
zIzk$Mw56yslOP52e5x4H1#cL4qtlP^8t6?kC`#?4Emt_10Ez|pe{!5@XpO}e6v52=
zz{hred#vcQ#cz8@p>Gpc3|s@}x#&k-ClX_X*W(4>f>5EPiZ!}hHa%jxGW&HH^VTFS
zH(5pfkCW|d#}sfwSZhqSW)e7pRW`foEe!&EKY!HN@=EuYtdznNTu&iPZ4ZC<^x9hL
z&CW%x*PEJ!Rk56@&~Dt@`l&l}D6?WvrSmtI;m!&%{+bL{FVD%$GT?yI=hjy9%^l{W
zRES7-D1Ec^?KxF~>WyOh8%{AqW|~t?hbBO)%ovcP3;$_RmwX$^%*r)P{jaCIVi+-w
zzM<1Z#yCvkz<4ORVu;0dYFP=0%8oUS{P#%20#G0{9eV`8w>S<OxXJK<Gd`pSL9_sV
zTTepCRwFint1_Yrf*|7uUr2kSS(9N-^~N8<(Y^BkGL9K?iGG*PuSJ9xU$`JYFA$v$
z2h!7+o?t9QDs8r|@#Io0l;m$EpTVhsbUA*PFWF+QP_$G;tz1S)1RrF?UNv%FQ|jEq
zb=Kmg7inCv#tCba_!c$ZWIOCg;-qhkIX@e+*h7&>A7x5ExhJY^M;3nQj5)71f{NX|
zRqMn`xRmOj`)0tw4EON7^cnLIhs_t^eZI}14fxd$=a!;<x8<Z`NSN}<_6q?qk}Yw4
zcdhdsq(!7ceBTMKWxUH%7=J%w&ZV<tg&vlx-+$K-_CH?o7F(t|VMORy-8`wb?OsV7
z0Lsj!#2N?|Q?U-`fmyuWu`liHw@7!n186^jo9Y8AdfeO<-uTp;<qIg-<0fWAaN1&W
zUO%hdCp*YS?rp}eoQ0|sa>4A&K?-AjgOSw_StHj~kLnUTgUM4<oA5Mt9Yd88+qvJw
z^TcloTyCd@Dt}ts8yyNYoPC|IzBBrbGNjUA{r=QQ!y1W&)@uJ)`N@;BZv?f;hF#Xy
zFVO2(<mrbZv)K>&Q~8=5y@oR?Z%%I6$dZD<KIWTm+Mr9@xb&rRpYP43o5X$7s3|*{
zasz(If;6UzmjR9Pjzkwq84{3fIHZy`%cYETGOI)^lH+xt-J9nzd-akFXEhsgne3lx
zEQ^%23C3=YeJ1&tpn9lWs^{9L=ez$~inW}Lwc%qZLz1^n(_7g1<4wIxe;rTjmGPD&
zBC*1a_HME8NbqR*4U9gX(bdw=uX}FzhF>$j4g!4|-m`1G3=X=(c0iKqMj<-;Xu*GZ
zY-*RJfXSolcslcPG7<dQ48!0ZFZE{IWG`c>E;DCfBH}2>_Rc-P0v#OzDz)Zv^b=~V
z!@{+<8U%r84Ukd;lAe>8!fC2*cC9uNSVO|DP<*amA#AZ%8Bs8vWU88@c=d3th=ODg
z=m(jNG+TnM?PxfKnyu39yfv=gm$%_tEeYj{yG<3wOgl2`L!9<~{hF4o-`zn9Ym)d!
zm9=>>Px#GO1W$Y^<B7yrEvCdE&I<@r;><Ol9x{^+y`n!8Qa4{lHN`Q7AqpfUPrZVE
z-{Gpy-Np&S17RgA>U)b?tB+ID_T4glPUBvjTTf~x#bxSZcYBRHN|ncJwiYH1<ShzH
zL}mKuIbC5f%4|FD^x2fC%U^%AqY!8_-^eBWaavpHX&$$h65<_M2el$ea_bU4hmy#g
zb?G*x?mQi>Y!9j7ymj*NK_f_cuQ0G%2zBBr`0?!xJNI^YEn<?n${L?fYiEu8sG>L!
z>VTD2YrTqEZ^m<4-HJKQ>DtuXS(1%NT))~mx%*)}Vg+o!eaKX??UfzmYEtDhasPlZ
zr3rL=ZZT?BNz<wn=HRf?q-_5!Ia%;X*V}6#Kb)j_??~|DdqAOb7-fnY$u!iao(P>{
zMjYxlv7h<9KNS}wpL$3Amq-=N{<W~=txqz!`sjK~YdR%%861qf5HG^TBySl$#=ETm
zo^6lq-j%pnI&XquxSz1@Z+3<SDjKW+?1c*t1d(KT6H6L&^NcxTk7NyCbQ|*wb;gn5
z+EmdS4CVErsxL4TUZu5d()q3~0pY{mhN6!H91L3zJDowA-Fp?U+?lh+JBtl)!)H3k
zQ-h6%QaO}dr7_<0mA1;|R$Aes7y=|k30QrkH8s}1<86o_2YL&~Js0D*+)6SteyTFv
zcQ(f?TbffmZas7&@xzZog58CMS8Y{+y#h3kO)dmiu@GR|DIb$^P(HIo<VPnS%}_Xl
zCibq2nQ5txw>1?y%<do;^Gfc=oJhn|Z@n`g!bNy-^_<a2qg{))Qpw)_R>t%UVWCgc
zTvL24DkKWYy;~LUCxRPzt^ErS?cHxtSzcj8OPsX_#4P2(`;X7xWZttM4<TD3A$-Wv
z+U<y?76Uf$8MG5NsZ_x16jrp?u^UX!m9v&?cj+o!E74FJly2^vTh2AEW@r~H1G-e6
zBl&sE4)n2LSQA<b7PY$C;<C|IPrb8%Eu^yZLqE$@PHEnuox>o$iM#nz^WOQR0T!29
zi9*BX4gIKPO0(WZ$9<l|&i7n_F}pjG(NTuF4Z#k@qt+bz+>y0>TbYUUEH4K(W1aYa
z(6()b6K3!IFld{jdCZciMh@C~%$~upr1F6)D!641eJ%)}t30?;0UHgif*F>==gD2H
z44+h1^jnH_AsvcEq|0raz(7Rw_2POpMR`!m)^+Wdc<h!Lmi}qo7Xwa}@O$$^dI>_X
zWZox9#^7edVJi0UzUEfnEbi?s7W3&T`b}aij1rv?_eh#daI)*ucYuv(h3~!&M;i=u
zN#}d3XEXv=>0dA$lv&;$$yDBHZ(926aP9hIA&^M45914zzUZj#9K&GVsq+g?0FJ!4
zo^B4Vn#%9TjxQNQxg}^-9KR@#Kw?MFDg5W_K+>+=!!pqp1@xo2ciQ+*LN%XK(JSW&
zyb+Vx4u)ojUbOF~aC))1Zodp-z8TK@sfLi9W1o5gn3d?UM;<@tk<w>9Re<8ghT^7|
zfkuQ90LiMDe2MG*v(DR}V+*pPG3kFOwgN9tq)0{?`9qW#v7-ZGoO|!|^fS{yhaCNS
zl{`K*YLZw#t&cw|**R&tSM9Vj9}QVUaWxS+9^t>cNGq3OGsb0w6;4{I2XyyR7(EQn
zGt68_KrA--C-%%NDdtKddgN2}UWz}}=R=@{&o|cgf`7EoFi(Q{0a0V~+ClB%3&ZU^
zP&=N>uC_gW>Y`Eh_X@igiC-#SZ&9@trZ#<gsl2onXtlc@W#d*pVmi_Pl{+U7`q;;$
zLVvdGB-X7S!6X@+U@vV?A9r@vyH>bhAB5w$mE0W+-F0N7O_e%h*~v#_zul+#To1oL
zi(Q^p++Jf)sc;-WI~aVO1ZWTcC2Jv-PHRwqtr?3O&LRo2xItv|MBWc*c5pVN(h+by
z#Q6b{v$Eody$CWWvt;13*RX+dh!{|}vO1?)`<XC8+W`JU8@Kemo@7xz-#dSsWGo!K
zx`~1x?rr%T1>NbKk{2D2wgQFI^^jrdsTbIAL-6G&TJ_(>Za#Q^rfGi?U92BNXkTx+
znDX-K9-kCr6<Ft+YH1~fVkwf7^#sS~67k~c;@WI;B7I<2myQ&VYIjB+%g?$UE#A%5
z?@qa^ISb#b7fj1+_!>6!%ZH7lt&WydyOK`y&&1zgf$1#!K{qS$PdbuAmwoRmhurw%
z4h)N7Crp@XG!<U)Z<FBhYzo&zzs?0|jW@<?2(E@ULG9@^_EFD!>?KgY1_;P<`z@on
zc3jeHYv;2Rlj5?nVih^(smfp9&=-wfTi`r^h8W5vS~}FXVw_j4^}K<f?|O-9b?Av|
zhzxqt!!*O;R&%s@Fe_&2!??pfSf+I+GY>}10tkW<7QB~(Zba9&d%thyR#3n*)b5to
zT)jt~RCFlWn)Y2TTqrn|<>-4+{XFK5rB9?6ks<U!w}@-V!rqqb`>`<&lLR~m_+or!
zCrJs*Hfpvlh3k@aUw8th67-i2iZ6uR%vso^V2&!=oi>$|=tuY?M%A`HG|D95RzY(R
z+NFP=2oPkU!k6&U6(0D?cOR0_^q=29lqelP4jRA}t*&U4&gbP)MTb3{0ZL>)%QWN*
z;9nK74G=F-Jd>99YSgWN&?;>ucDxJ$YUwTMFo<RFSxpb{FU!c&AGj>N5V`yz%8c+~
z@Er*w!|ObWgDs@~^}0vb&fC7(=EM%Qe+jOCd?C5CNiCV>Fb$gJPgx*F#-5|C2Z#i}
z&&^Qvm=;#idDkElJ~;fC&kH>wbcp73P$;F)yXJJ?E}uHPPu(un_3_%Xl11B=c8Zzv
zXRuyUeO+$pB)%hsvsd?WtkMHH&Rfpk@g%a38u@vA{8PUW69}a8e6=N1J-ue7!jLA@
z18Va^^&tWH_x{>F5X~C&Cc=9iODUtR40`|cItw_nZ3c&77hI<gG^aJa=UA(a7jEpm
zK#D<k-47UfaFQPyM_4&rW%&`z^@b&U{x$dBSnHt_X-!t|8yK;Vk&&8753<%Ul|EqQ
zN|f?@MAYYLds<VU`u&;U8b(%qz5~6j;#3-y$RWad1<#DQuLj7*uSK(3kz3dI_U7NY
zvK)T1ChS`>HiTJhzj3Lil$6Im^1odapZm;Ctn|;&N4(78`$X@qCSp1-d+nia$D#{n
z26C-I6dBQi<#j{_qemE75ihGxvv2rMZMW|<faayIBS&J<bROT2g2b9Oe&DL?fV>(<
zxq_mGz4wt?(L`{`8wUVi6<Q(yohWJmHPiY;7_nm5P53A^s0dJD0>A;>#n6I_bS*?(
z@Y>3LTqNfx&=@#gmR<8Vn^XkD%OHj+v;NDhfHj3n6A#B?)n{k#+enmZTJaRHqdO>-
zF@99@L13wuiXnuXLw%ftt5b`#b!RRJv(+nV{)H6!?DJw%NjEk_D6Tc)-8K$fVQlA)
zQu(~t-(J~U^NOtdCQ@%+z2vRs*)TanBM-NK`1OPjcdMg1nU85Z3r~l$n=k(n5N<ui
z>U`m-1jT!7?u{ou)WEsMC>J3@7Lz!A-OV$t`k8kiI`r+qAsJ}#(EUes;y2eFj36?;
zuGh(6WjP$y@E)<LL3J;j){~wk1yA=uCl{w+#OE|YzxoGEfw@){TG^We{*w14O?Sy1
zn{+mxEu3z4)Ya3M8=GZ`h(J$B;Sv@#!U1GmH;Oz)IZ;XIY@3`)f3mM|dGCYOw4)cm
z*aLmA&B`Ki@|NMNG7jYHZ`N_X)yV`de^L+oLP*S>0yV*80_N}YTtZ2PsUP=keSyX_
z#hdS^7`_J{*Y?A{di}sf4nfh=RAKZ?d@-;Y|1~`ru{HWD(1l3I(Wq|0^?3yf*7n>-
zroJVjM>&Ha)*rsv?n_-~@u1!GC$k?sqRTDZBN0tlUInVX-l9{|`E{r1%N^z{6vxw$
zYM?%7z_16~rm?32`X4~O#M3FbJC=dT`$>hivWVYT7D~6le7mrK<<CvZArnAEXXz4a
z%e0?6Y^-|mg>ZFX4^=ZCY_+&1%<s8i5-Rvy!z#!9+z-8WY-<`meGIF$vw3*#7d;<)
zHFzt4`kIsc^c}?ORcvI$n?jW<wqvofCGWBcq@AjgAYchc+x8oJ%x7IoBdipfXAWxH
zBTrTY$;GHz<#*%M8qMi1G%}jg&T9-VVROIm=r_Uqa*j8B=%3=%rMGcIjl*q4{$rN1
zH|v?MY?KPO85MG7_H4brGsN~hgIDoIa?bSHW)zoN{A__o)w-Z}efEd7p<Rry1k*&A
zeW0d2JTbm*zpeEI{60SC9ntFBV*|3~RcEaTg#q&u?}?AAo0&uf(}#r4<((R}d?L@T
zVG#p5yKT%j?`_68ir<Ip$ws`7`Syf+>TLY?JRtato&mK$UNX5rRnZy$A~(cVTJfA%
zW)<|kk@fETI4nw_=gD<{M?xKFfC`YZ6=4^c&5kVldl7p{9?>1Q(}Nm<p-4m>1gJpz
z`Jkt54X%SA1s@*oAFzXp8cWu1gCP7JD2e^NBQP9_KWPded#om!?l@t$!IBdIhAiN1
zn2;3NC)bf;h6lclB{}%z7)_(F2{Sz$Bxq5DHvZ_|@_M={QMR%4zBNjWul*`&wf5Me
zcKGZK7iE+i1#PRuuAXBdZ7A@kxaFQKcKLU*0ilPW=Dla>i3)a@W@{>TW#?@CGb1^e
zM_W<5U-+cL;ozi7V{pF*|Hk#r0E$1RP6Vu%4LsDWZ#Gm!$uS709Nj~|w<*i8r`M@t
zptk99mD?jQb!Zs&WZA>;EmmYtHX9O9&D@W7X(~JMtVw*T=0>WL60ZXz9Ky#W79AQh
z?BwgU)DD|3r*Q3NjFLRs@<ZcGL(@K&`;dpn0YUQr0PAG~KW<l48-T1A5gq7^^on7t
z9ydRDD0hvqT-<5#J^6=kQ><awDpe>6plm3?V}M4_AMQ>8t$dR=L@DM!j;67Bmi+7E
z*<ong4j}!fq6ZrM#z2Elj2tulI`stzVyg>mc&GuibbF_W%l`^p52vp_d@_KLMYYSb
zaYG%_S={vgofwZ~M%8<U<Fnqq{M4O$-?KhFg;f~jncI({G$l(F!4a4xLC^c`!MPjq
z??h|SKAZFW2iBKqZO1N#I8qg+)fg3WndHjYy3XCr&zDRurinjKu~w|{)j^zAl)Njo
zn3Ox?V7x3MT5&oa9-JiIm$ZBKKcr$P^!2`+O29b}l$%^R>Ycg?Q2HfWv5gIX1Xme1
zU(*VmC!{@)W=TbI2FeW`4@b>r(F1Gi+^&){ucYTZ(f3O|%g;z7qNX4r!zE43_ShFP
zCL{qj&@9N_rIcDZbdkVK&$E}epxnb1{=TlYE)`S1%#7Q5Z$v^`{io#UKRp^?1m=sm
zgJJ~BY8v70sTtO&bp2C?9G*rpnv77GppA<0gLhU$MAx&o9$xNvHTberVCU@KgV!+P
zWO}c4uUVb<3liUOk(8N4@RzT{l~dd(oRWEftXr`(w974^v4rkn{}#UQ7reddXvkhC
zeN{WDHD`f_H2`K>N?<)xbGSe1GP@4;g~@a7OCKjM#HSd~tsgV(3@N%uIusvK=93(d
zViY$6v3$MQ-Y=4M-$Nopf5fRv`kGYf)G9+C%k$!XLUwh3vD<`)=3U{E=Uv^Z^5JAO
zxU+A#YfS~Tvi`WWx2|F7cfDG?;p9$+`z?9)j)x@Og@8!NKt<w4a&ig#Sx7<oXD7;f
zqK05Io1(D)(?#)}BsIGk{esi*UjCSct}wcWM{xnk+GLpYks#LG(^9=sN%>+xCfnHF
z;HT3KbNJ$Z>KNpJN8=Bvmf(X2boP`p2xOg#Z*TQh`WVJ~=w?p!)&vNO@^8h*H>6X2
z<Vt}F`n(iuHI8q%4`43c7k;efT;Ps{eoLp4kwgwB*t*>*xpUDXQ<8ZUEqA&<_+8d(
zW%}Glt})|}0>?BGtv1qV{&a=G17hq1eM)(|kSiN0R6SR}Z2*hbrpxGsBrvs+1F{`g
z-R&kfwytl|a%lfMWBG}Gh>w45o`vJ_BwW|s@Kf%Vx77q05Rbo$Ib*t*j7GtpmtO4d
z)taxCo1zo>L3a|*;02To6~f+aw<6)e_cgMSbh2KCZXbIdghNZ}pbG`b`KR$8A)S@!
zLB%Ma_7cbx5e>twVXQaD4<>Y8EROYYM4#^s&byiGO>UOVszO4T+akHMy<EMwb!eJ|
ziul6K5W4ry=>0K&e#lduG##s~+|g-anamGY@yr^)W-Hos(n`f@mBG;KIaH<h>Xbu2
za}J-;sT80Cz1y1pkDNhV8J}nQI?8~Bvb85L!Ja(XNnEX-6d0ec_o#lLeQ}mow9?wt
z*Z8EIEL&e7LT$N4-t<*HJ2_n(Bp{J}=7Sn`Tqo|dE!92CR`_wawvnKVxhML`O;)0;
ze19s^uh2W~_($WGu_J=u)B78nffU!)@qK|3&yg4xZtiV4)g7pnK@UA_cI1MD1=&uG
zB`EMWkjp4lmP(OBlR&U?>&1`mOjV5~v^JU=7s2?ClD!@f4*GC>g+^+-r4O65$IrQk
zl<|E=iP;1G#UR;{NMW6CWxDrMj(Z+&=8WZZUQH(aYj%53d(o3aR6j!3h+Ltjy$|%{
zdh0J{2<lU~-d<el@%0s%Swzw$hG6fqU6P8(3g-nvzV%sKZXG6$R8~jSaN%%6!jw_8
z4zuy6Zuvxju0FTPtFm1XHEH-xWuZjnIVuBJ;GLu;8z<x>UMBH<33V>InLU#!U>*KU
z$f5U287VifN-<4Dq?g2r^XSjPcqnlcpbrY28~fAXudg@TG+HBno=bO`vxGknIPM~b
z`4**7|8NLCc2GcY$%OmK9@)iYC@t*n=T;l`iLZs<T}CMvQ~~_>^LSudGY%x~WT`d`
zuA1M8S4Yi~R%K6LNz3`4#_$E9AQ13H^hcV;GUYlYD&*nS)tYnEv0jyhr+3_?m%;(g
zIi_G&Lf<M_cl{j>uXK~4XloW%%RK)p=@$9Uw`kLT@iZ;$>l#b)_MhV_bot{`XyhVj
z9!rvUH)d%ZbW}AAjhMu0)%6k|8NJTGc3~BNh?o1gfI>mh`k~183QIOKT~e{xWlA1k
zc7Q75VpDuKuQ4K?J?z-&=v&0+U{<^Vev#;IwR)-soZVkmYGjDVII>}ipPH}8*Y$a8
z8f6!9TSyp6u5x{$ddAdHpL}V>@*<T*alcx-(Ekl9<HRfv39KbAS2fwTF;l(Wt~w&K
zN?;aJ^LYZbJBP3-u6k_{sDydt=J4>lseeZaj!^EY-ePm8ynWa2!ZAhJw#^f{RB2Um
z7xooteKK>|2IgbGPRNWMHHpJ8b-7d)q2r8oU3A<M6wfGOgPl&cGuC_PH^BuZ2<yKj
zV{{*IPDuBXIcwwDZ}8w&LF%r&EfB*aNM~EcYUTd8l96P5M*j@cD-&HZ^i2HMB?|wi
zv@UKmg~!;+ULt{;%B8>5Emp&Pam*J&xYIrN?>QI8yszi$lZPs9A|<B;r@O38j|o8D
z+O_&E%Q?o-{Jhy=->Tgd6X;&Cqw4z{7#^aTEsSQCR-!`DseDd;(WU%#r#bu<?-d=o
z&Mbjq$Xs4?&_#TDW6+n*{ZWcbM72`@g=jiy2$CoH#Q((d<rtuPtJ`2T7}Y$D_ujft
z$o#=Jea{$s6cW{=uidd{4|^811aKrQ-*ATaxw~dj^wf`8lw(F3q;2}bqsO}q(Gq{|
zB-8b%VYOOB>KFQVcgfrK3Z#6!VqydYBx@tl)JmEQ>%!ysBH7>ih5pzXnQjvcOjZ^I
zD4f`FJ699hqYkx_paB9GH}Ak^72r|&7V%yNyZiwgH2WSb+yY@=`k?wvw&(yzh~mN-
zE3pi=?@QG4Gi7xno#()O0NHR&H+65LndlQ@!Xd+W7i9i)pC*E-EYvc1wO?*e*N*>B
zSvK8WoCI@jYo`Xu(2iG{B;)UZ3!6xNpF8AM^;`w%DCg}bo=By6?0t4)sHD-Q1N60=
z>YU|lXWtUgkyUzo-9=tQwdQ0FIk#^9yRSK1h!ZiN1xH0UU{kSAg^@LEn60D&s++ac
zZki{Azy_yHT`N`5#$XrgEs8F>P{TJeg`er+rV~X6@ZiC1p~{;6v`7h;IHh7*r}gyQ
zpK|(*&fLoFcM;PP#__?cXaab{EcTJ$g}{|~8Q=FYuaavF@sXx%rllrC@pBWgt!Fg>
z_tI$Lj#O`l#OieJjn9i_a%e^aYW0cg?W<>wLl&-ax=0DJKM^px%Q)2!uTaNKD|}rl
zs3&536!~iUkZHc?rSkkCi(i||UQU(I^QRoEWKwkNy2d3;qf7>xCIN<&Yq~m}W2c!s
z!Orl<#2PDO^I;)o1xJhhd-%cSBT{`^V*XQ5_3$>arA@B?N7h#WRk?NVD;^L;MY{9Q
zEnRXD4j_4?JEZ&29RecV-HLQ~cSy$pk?!v9`X9Xa`+fI*|C!erXK+T|z1Lp*S<myV
z^%B%V{x7kNcmbjmmA0;@+E!#H{F6F-lVPI}xV--xlFsvz5T5FVywEQzZgL7%+M2bt
zFPSP9<sfu|Q-<mpn4Rd%<M5PM5Vx1{OR#6f97n@8pYKBqN+M<Q0CbmxwngxQKl;Wj
zrj{ANhl|Jbb;n@eR@XF4R{qE%8MhJ{?j@>T19RXnG?$oXumjcup-gvz4|OQ;Lku4T
zp%Td4tCj`D<{k+AAJBy%_if5Dt@$m;*}znf5`Mk8jFiVCbe!_qA9XH9@3Qp=#~tB%
z5)@UKOT&}LX|WK;QsGYlLrH8J%|LunTp1d5sFxC(lK_ms#%0N+3QRaK$PvG@W5C-)
z?Gh5EGk9YOVsTi1byiPx-FiRZ=|wN+m#W2kN1mZxRcX;Hol$C`HPQ~MftB|&m$+wn
zdxho67b(Hpw*EmTTuYKsI@J;TfDXRb(={k}g;V^|_+9i6xFw17mS^O~S{|3UB^?y<
zNT3O|zi6bM#}xElJK|+YMj{p%fIx=01($Y(sYJ6;_9s-Bm|T-5DJ}_PuVGO`&G;$i
zEsduPw?D0t1XM$8JUG=gaS`S)X;Xf8$*0si#Vv<~aVdm4bsr8{^Re7SsllgaSe`zM
zGQ0TV+}Zg*Kqrv0mHnH{6s?bV%<}8Z#=mdQdYpjfp+0-@I4?5^>xVtZ$a)>5nhT|V
ztD_|6Ko)KN_?|AGT$zJxEf3BFl13~0pLYaT+*b*~R4JBGE$PjlF^kn_9h=tLCZA-W
zsPeCsf1G-maWOb7JTZH^WP^K*TU{-ch91&TlTXl;IvigrTAidUQP~k&XZ%tw>(UCg
z=I0KQkJn6<sMLtTZF1GR5O0}d!a%cc;dlMhg!@;iavC92UKq8*&+Eh&>fry|`Z>OQ
zBNtBJ6Znjy=_hW&HvErB^jiH}8z(9T*lE};$kB-{VTn^D^g-MtEP)=0A%imQCcuE^
z`6+yAiNdaD!5v|$OvZO#?q+$#>Yk=xOykol`6)pHN@i5=aJrAV-CG>k?jy&3=JwGj
zRtI@;OUnXId6Wc{-MB<{@`p;*0U+k)<ONkI2X^K3|GGx5!EkG=a&sw!2;OJkhWsd@
zpl89`8J`g;?wNZpt*7h;LtY8)$X@6bt#zgwHS}62!W+@+;p)Y{_ENCo=ta&j?FqZ?
zUHP%UW|_pS7S{uL+Eu?M*8u}VAXh!&HLk!ApQ}{Sq3^aayVo7flC(e`DQ0A6S)S+D
z3Yqn^at-b-nv^SzUD+jk-0`0R8BFWV<!+Oq#*z#{12NgqIFMG4c82DC$vXQ~Dawr{
zOFWlK$u@^vm1ZHy@}4@k{t_b9t@~*EtxgDy*<#|Q^2v>PdgU$$3)Wrq0*y(e#wE>F
zbQ)s!KMKj+BB7{R1Fqb~io?<5E`JrcZ79p@u^q$<<c0S*?#%lOrgKDLfx>NQ|0r>B
zkuQv@z+f%@vcYJ-&Zv}%fc|+ec)+?u1v(R()Dmc_^MYxOyWOYn>Gg6D^b4<d*a1)q
z=}h<?07GhVOe{dLo78y<OO(&kLdmGXN&&2t%~u|J02ASSoO1wsZ#x9nv4N7m)i`*7
z_v($55|8dHTkyR6c|~3$;5khI*Y>67zf4#{I>;OaOmG8UOHTE8<?%t$o>Wjjc1OR9
z1V*Huf+4&KMMQVug~8U!nK7qaJ+VTBm*Q>R4^YmGXr9N&&zcKGnN)RC?Um?~v5CPt
z`c4$%<&CR}=Jvo3R#j@_Z9A=8@XCtd(u2U&YM!0lC7bbLEQ)uwMR=&NDK}!lf7zw{
zf$@UUGbxSWtQ!P{C$OY6>u`G6#ti&mmetMznRB!gE02I~sc}RNV|$PJh?jYpgbn++
zd+6tZrMp!MObl-Ol;`CjhMRf($Q?*i1m=8(c2UN(B`)m8`5ae$Mk|ixY^p0ncvdoF
zFgtT~=t2KLdU1!o@BU$SS1a0HLO#w2K{1Hd^l8>&6iP<iV0Iy5q^v4H$FI8NprF?A
zy53tu0~-;YUC98wj1D<wQ2Ob2vJvHn9_1tnwInOGdc)y6kewZLY<)gDdxVR-7n%my
zPhlTU;bmWlo}K02a7tV9teh$;;D!%9cQ1QB&ZhX3IPJ5FMWcfm{@jo2d1f=8C&-`w
z^AN%NI?PsroNxd&iCICLc=)^i#=4lFpIGYZfKEvOLQeouUq6o#%J@^enzVs9hoOks
z_^3HspERLS0t4BIzqiHx_cLYtF}<&y%spXvv*C<<^|ow~MZ>Y2fkcX05&^P^5CEG)
zJ%yD0Y)BxN*7FPNP&IK&=1H$o0aK%tUUo3zd5h5lbT#t4)7Z=qW%^p<7I1F4kIQUB
z9P`g#^mFTX>Q*Ue{ws>h!mSt`-ZsRWrKjG{B!wc579M`)o*21%%3Al&tS$tTXgeOp
zS%ru)x=Fd4Z=aFlfVU9W9y$=psn4!6;#H6?%yrU~8_JI7)QGP7=hf~Tl2vuHT52zs
zUBCxD2Q!2?n0p@?_7tKt0-3LBOU~y@Ko;s+4Cy8FccD}^b3PR^`xl}uw-~5H;JfXj
z=nIcHtkk7)tJNJNK^2dq4zPH^g31E7uSv$u`!kkgXF<t0MK9`f(f~z#1-1)co#e?B
z1&x`(9T^9QK|lAo1#Is6gVp2Fc-RN~8$!AwQI&$K41VD`zIvlxMyndxIUW1q#K!U`
z;zBbn9nIauqjbD^V}W9I3JjDw6T)^1y-?>{@p{t2YedkKzR0lxW|r4hIWR%((-CRG
z^hwznY3p~+@J;-`=k;qiD_C)?MLg@6&1l;lpbHXsmMkPce?vO?0p|AR$<sG@dI0xZ
zN^$$0U;eA`jTJC=pUu3C$+2~o583othD?6qTHa_{m3nn7clG3=(9cY{(FNjjFB&h|
z8?S<`(~YTPAmXzj6Dg!Cgh5*^%BS__&fHAz`xKv?`Lv;d310=N7|&>`nIx86*fmG$
zN&vyBzSMp7+h*Xa^S}{bfk;qFwzB(|ozRT27_6hSj|d24PyYM(`v@7(3A7m>gHYt_
zdrA^M65GH>vWlT26Ja6z81J^sBcImW=cuV*rL8v5-!&*h)^}LGKWsV!uWZ@Yi*+{M
zq;n^#bhccUGA$2rVhh-mOEoqc#NtY--ERf!((^q;8QGmBq%@<}=%LR!j!0$QbEJ&*
znSxE}HCE@_BQG5SZ$r)zirF_UL}JLp-(V7}xQEkG^O%1VA7Jb7VW{D-pTHoUD*T>Z
z97l-Vm~TQVFKrzM94@5Szt=ZVbB<f=S)4r}A~tQkp7?Z2OS#i&lE5D(IN*|2pMQDn
zM}#jsjURsO>FdgI`b5P<4!UJVa?mW#Xgle6Ch)E*1(=P#F?<!UwPN@*95|EuX(%dX
zEN4|Mj5p}5Yl_}?f+DtI4V>vf!?g>>^Y$t)Sg_mB4?1<B!OGznJ$Bqz$@{}6d49$?
zm}~iVXMXILw`@8xtnRg^JGAyWu>I)$m)$KY$3t{23<2SpzkAU~lc9^ATmCQL>StA%
zeVM|Xzdq`J!$@e63>`VxXr|NB-{LNa>GrD0IOv^81l@}{IXhR^G7y}_`OKwSO!@Qa
z<L2Fg7vEk)9ZM}V!7js04zkcxm{N+xdD12;Oq0@^Wa@v>IVJ1`(Uks56|IT4y#N$_
z_|<D6Cx2_2hTL!49kha}B!rLrk_<t3qd;+xcs&`-4YK9<D#NIa8uqE7`sRcaqG?*f
zerwR^E_&-m56PUV(Ru~pxR53shxU0*Z_+H&1g_M25<n6*3eu`d(f%7x@Zarud@Bk1
zh-x!>`luEOHZt=C=mVK`E^-1^a^D7le*Y?C^HPP}Z5+-%9bVA6ub-mSzVUmQs@wY|
zL(Mn}|9*6~?OiH!oRZ%7x_KO%iN+vK#t&={`7WPaRzIf*q6}qNhG&KkJWNVv*ckl|
zu}$+@LPn7amt)lZ;l-la<-pU9{jTSRpM!;|cL7bmy>4^KTdl#={%oRLD1?UscVaok
z-Jtl>*9|9UBw7vk{OjEpIJu&JuH^Al{!}JR0|+!AwBJJggLl*vJpD}!oj&Yywco7t
zeV!o%>9sT9q%ft*9RPpDDrvPplaBqNRE^3*``#u9|C5;Q9T8vhkiHk=#*W-t$#c}v
z&8KbcY&YRQ>4*C(i>(rWVqZi~uNrB%aqB?pgFS~GYPQ{eGbVN~RIE*6ui%y*GHiz<
zzfe0IpNPq>Pzg`$5@vTh&p&xl*P_QaxVV#|Ctfd?jaD?N7;ARr>``3%!5VDOmc@O8
zIt?5z@F$38{#5LmUHip@Javi9AzX42@#$mg`Rn1)1tg_bK+UX%U_RD9!~Xc^r76Id
zUd(p*<B7LdXr|T}@axH1TP5VhV?4g*^S0EBf60)+AE$JrQ0UG#s3045Lb4U(7p;X9
zpGq4B+p70x4|J@l<?q^yicgB*@2erkT`0o>9VLvP0S>D)S)w#K#E}r5#Rz>LvNMh1
z2gL$3G+p~GarWYZH%@eFS7xo5c|e_XTyg^M?`!Ug$@9^5`tPDBUNoH374<jpk9~I<
z4Kvc_`+`$^d5@u;1oqNl#@U^qi=p5ZNBGZ6J-(W^h>)%mfWxa8;1gpXX<Q1}Ag_JH
zBPxs_7&V?4hA6t7MxZd%o|ntYpv+sXsORfhYp_M9UFz6;CR~}8H`V;UM3RN4K|>T^
z>^L5@QQw?iw5e88a~HYYnQzLVX1&fFsZ#oeVT@RYbj1D3a(b~rV~;6M_l@2i)+-e)
zmz)_or^9=$8-t}4dJ1XOcRbO#iSa(}KRy$CNXr~^Ua$>caKOyJ$G3F)P@lM6^>ccn
z^n`E8<9snpmgk+bZm=qHQ33-~{3vE6cQBFDePkOnth0ww6slwDg4oy@k`aydD%U~+
zW@I@^*0m0(j(nrfouw3ox}&^4vq5y;>nyn^YcG+oyMZM!ce?5Y4}(PG%z-%BHvAqk
ziA0}@BhY%zqymp?#m(tJgTgHlA1b!WONu%t`roDpf+ZIuFO=^;1?uoQ8c&_~&Ykgf
zo`3z>0BphSLR)wHG_Qv4t?l&6!uvm{1`cnCJ|Hk<s~0-VW=PMPWz)YPRY;liA8mt^
zZk26zOm8tBgN-K>5EsyG8ox%P?hySd_)@_*@FlJ-SS4V1dnv_^Epmd=Z7GXuoUaQL
zdJHo}V+?jIHgC5tNejcqRn{6J4UyMk4z+QsD}9FWEKVqlknVP2D>tvhu;&mHrWgOp
zjzo$nSUvQiytJ%2jSo>ya7{ia&;`vSjm}R%{iw4pp{Qhhmc@<#j^aIc(x%eg0pXQc
zKyyy#;j3tyPHr}gUn1!L#c7Xk^b&#Ssil#nT;`f+AO$E(p_$Gay(Oe^84>WB+&e;g
z0^hw^(<ZE0mkVZet*}`Q*eJ<q8J|_z_Y>Ku56%x%-<zrax>L3&?bbH+`b>C`9c3MY
z!BM3GL)IX=?|BR@4tN$Rk6yKLEur_AmVg0GC?^KKvnwSo!RtlC#)XW6g5N>=A53Rf
zquEI}TDsE^sEU(V9j>!{ldhayeMpv{V-b#HGLY2MeHzzB)kkAn&u2FtHds<BW@5)J
zS$3f!;H1$w{2oQ+SUf@fI=3QuLQOAw8^fSXJ3MK82m7H69vkeCc;&&B52i$0gDi%1
zEvB{n&7Zb8STq?moR8cy(9bEkLuY8(K!V{-Vq6P1QjEe{VQV|XE$JFtET`S?048N|
zJHN_D(xvVWy6t&=cOtif0C*?B=l?i{JgflqQ@*pW&^~#W3><cc$i&qQlE_@7J|y^<
zx)G!frB1v@w6ypJ!+Gjgt)oOG#f46_C?NfE8K25AVV#Q5Bt`<I`1e7Y%(W9|OYYJK
z$@+@#`Y!HHYy$uXFcT!@=*y0u07)7V7SXd8C;%jE??Af~&3p>iy3wPt1FU$n0VCKa
zAOc7HORr(xeG8AY6NE|5b-&p~LQ;oi7ODM3{4@0Y$u!Mjse(<kZA@pP5C23!WY(-G
zm9XrTSIn+R_kvUGh5;r3j&2gzSg^+%9u9zNsbQ0@dBJ5t-H8|uQ5|)a9a@_lBb$2Y
z-SmJtwcDqO>+$iDeXoVD!D#WRwNb#nV5yxF19BYkbqdnNV1)^DgOk>#OG1rq9#8AQ
z;%wg8UFQOh=Un=2vwcxH0Mw%0l{&QY)}Z)P#lq(&Cjy2`rk*QzS#v(@Zpz~>U7jSS
zNLm)orV`viKRiXJl^+jKFB|Dc(ilPc>F1livo!+s?Vf&ZmUfUh0nakr#did*pIe-`
zHH;_bN9V30aBU5*KG@_a_26q;b9HDIW}xr3YeGnprYbbkNO=~}#|sQaFRZ;>nR^K|
z=<SnTT|Kccpd21iL#(qP%Ku&<PvB+2x2f6@_2vbFm~zEPezEciz_`$rE?LCOi#N}o
ziHAIWyg88Gc{@(vqif$ec6AsTv;4p=IMK?d=F{*38wP6ApI4YS+&x#@Utm5f*YeP}
zHg;YvI%Mz^8_f#j805u3ekqLL=jTE%gNSC)G*5|vGUgM`!;yyIkb24Y;E`Q7OQ(-H
zi;Q9?P%}grz?I<F-(2&BCS&b&MN_;DNs{&o<3BgmH&$q}vR=7Y>Ld?ION9{Bm!J!j
z35#A4Z6u{4v!+GK>{+t?avC|3?Qe|rJiShIKmE|6mRDPL>EILQ;Kjzn@(x&h)F%s+
zj)|To8u#U;;8@#HCq;8;d3tHO?c5!2-+lJI?7bjw95Z8M%1t73w=BNS$c13$N9B&g
zSog+vE!*YP89&C0lrF5@R!$8WJ%V2hD=y4P#U+(31QYSj4OxTz_BWJTR&217k|jOi
zpUb`ewINl^=NEA0_pSwHy=Bm;c78ug0qsagUa7k0EhD2{&M=6b|8mvA%5-0gF;nE9
z_aILzKrv@`ky^uGaC>{a&GG~zg+l^xvxH9sAQK`ann3^Y=d$#b)lGC$<}l50BB(Zb
zc%NbXLg|q_W{FG55-VlzI&Q(6s!)D7@OMG+#u=27##1=iL2SFj&yy`o`iU6`Je6=I
zm=H(sZj=7@(GI|YFVId<1G7)i_40C&@1os_XHFEdQ}*a*$s*5X>-4I5r;t2X?`p>+
z$22DmTBn(V53u_u+SPqyf|;EA8?h#39g&%L_af5Qr=aZ_f`5!q@pVew$3uaz)hynM
z^M_v@Cgu06s6(nI%2g45n&R6BN=*5Z_sGAt9?>=<9%g(k10!nwucCterGr@UTByoG
zCqjXoqk8QsPDX@GYVywob|=)KS;UH#R^h@Q<Is39fbsUpF+Hy}qOnKQvYKnunN(I-
z*~U!&<u?xgG?eqRO<x@r44a|jvp=Qb%J76Rr`<wISLClfsr@=<ZpnYX$UM+n`KDTf
z#W{;^LNLq0#T{rG)8I<vRV@364uda#7+9k}f-5aZ2+|*L#p5lL?alWJ=Fy*qBg0&K
z#cl+_P7K<7wr-tF?YnbUV_eWYd@F%jwN<1ywt6}Lg=bXlI<#^1bAvFLBE3uHJqq$m
zJ_O9BC-7rZVAPh~gOSq4c|F0dnI=SI=6S3+P}q_wSj#dqXtxI8Jc(jB4X9fHO!z+k
z$I?Fm%%gg9)ks*rX%o*5zU@O{o<u`Kh6{Fzsg`!w;t}OJD(SrX3-9oL3TL9tw+geL
z%^gD2thwZFt?yaXtba;b_T@P)bkkVAN)@JyA_d3L1S!-;`{$Lwd|u!r1<nK?Xw>>I
zN^6@H|GIDMk2H%}3i=@No=gk-kn=>!C#}9RGk#%(uc7j`cr<g9jqguA^Viw+^%1Jb
z%jUK?{8)Y}Cq={g7{Z#5A*?xp`Qzy$-g_py$W^9!L&dG3l}wJkaBHGn>aCv+GhDWR
zD=?qRK?VPq3v5SpEvR7v;KZPkqTqZlukG4KOdZDBcDDG`yA7<HL0r;k&4JTfx-n@c
z)3pDQah8La0(K)be0^61lGT({QTH6xmhIzZZj0tB^R$e^4u)E-UT~hpIZ`VJ5&e4^
zNxiwICmWuG9y3r|f1#p$^Fa(jUo$a<Y6bYg5C)5l&ap9Iz1fROZd?qB*SoF4O_`#z
zFPHflNi-?gMZqa#?WkKvQuzC4K;k|I?(FqrS{y`6<J#d~W#D_{dHzwRdjLoD^1IY=
zHRI#w?Yps_|Njdkrkmw4UdflC6g*)Tmxh=<RWS4oy^lpHkxGQERK10tiw@}36p}ku
zy`>!X`4@XuBGJ3-R`HMZJ%e%YnkJunx5<zx<S>*IU6nBU%PZch@)Pn^U9=Q&(`fo|
z-v@Zom*vsN-tfL3E_jgrbb&$l&TY(8U8UymRgBa4aNh$<yokung<o3i|I~sv65m2y
zb$r;hRufUIK!l<WZ+O4}J@{l-Y~14d6zCgb=H|B#{){{fSfE53se}!WUxDXXZUKD0
zW1m~yC?ib6<f9oMnv?V#Y?ZBn4(BD?Ly2vk|Ds>6#lESPFn9s6G`Xd)VQ(oW(S{c(
zo;;mitWU#x^@#Qwp$LQJ<A;qDcg^c^O;U*5PVR=^CD_4K>m6Jjh!Z=|VdfhaR|!l@
z6$TZ7UZ3c19P=LESCg*L+kyy<e9gWS0g$-8+m{7mvwgZ%e{csnV(vn4AQoGp?lybv
z58BwX1vq8eh5AcdMoQ#f@1Fd`7Qpz|FBs)5M2Vm-XAK#pBSomx6*#0s@h-d8dpm;>
zni;-1#skl&-XA^#zx%P|q3vJ2zPJtYG6_3no_By2EV>^4AFT-g%c<<QF20*=l;Y^T
z;DJnCZUNtrCjb-QT^~K3$5=V^M48=FuBDbyRH++upf(9vL8WT}EZ|BFjRFBwP}^Cl
zEkCO4U*dwOk{zd%H*JIgVmtCDYPgE1Yh<h($kM-Dn3WL2tA1Lt5mQg5Tcp8GB|MNW
zxHTTxHc+k8nWKy8nhjXsspTDj1LI#fx)B$_(rRn~P9*lcjD(^;WlBK%6TQ~okYB3M
zBt%on#qlSFt+}+M(OGn4NB8f5vzlhkPxVOuah!B@0Mp|*RTAh9J;X*cRVZc!A>{+T
z;H`2YVk3-8ISyJBIR8P*!hnSMNH*L+TDL;8MKIqfUPH|G*ADT!az<G@q~1=JF;~UA
z-t8R=N9AzJu`?V@%tw9W_MsPHaOr*T;c#}Qw2NzSi{bl$j5P)Yiyr4&uP%Zhg1Y~!
zbD9F0VJ+YWUB*kTk1jl3ODK|6`|F7yZQ)=0+ow2~kBjt=^GFtv0N>T554$H>rp&|G
zr^{%97$kr2lJ|wM?#o#CX1*3IhfU{&a~+2&$Q^8h|A#d^ryj9@ALtsVsWdlYF0Zps
z%}*|LE;`vZYsiHNPvh~P0Upzx$D5Ctz36jEHxZMjfqaB~4ix0dX?@?ZMD3q?Xuigb
zIW=XN(>UoyY?~u00=?Zhg?Dvk@`#;fh@)6NA$f!M1wes5VsO~{n}q~V%KDt>{R_~E
zzyO8}tB%(7oD!HknU6aZG54K<|K$9LGdB#W#(EyDZ>k>x_IDUopZq7pf8yPQVn40*
z+bezlBY%H2&7}`MhG_wNw%b$nVxpaMK^(WgR_A$mE3F&7$LQfR)BMk`Te$88K0RKG
zyyC)kXaR0@B>#*XA-JNw>98HDo$ok(n-2uDU@sEHJZAftcx>NCc;nrS={(Sssm=i-
zJSU!-@BgHN4aq?dqp<)dEvtT#NOs)DJ(C=a9PUg$oeo8*#}9Sqiet{~|MR~HaeJfa
zz06X79Voy(NOjE*E#+5K#ju=^u_b#^QNC$n$9UnDa(x?85Y*5;XJ;Po%xgR@nf6)v
z0+wz8VL5;QPpqN~MX;<4G@F|@blcBrX#DzNmAx&!d@qf}!uL;kOr(gw=O<4IH&e*j
z9HlI>bB+L|_H_o(*6XROq@Mr#^8FSp6r7h`W$t5YPoV8;_o<XLtxe*wf_Bk*OaB-^
zlQjQfUvO0Nv?yw?=}>7JUOJ}JhJCju)?`r=Ai63ntLZVmUDfyKrD(xgKrbGllq!Lt
zz4`DOfevewGB7NKU;CoEb6-d+h+WJjgn_++1BR?nC_5As`*TC%*PJM}z|%QK?P!*M
zXVAZ}j8H44?$e}cG9R@}1<x)3?ymn|uwii#qo+EjVJiqImv0g2CUslacxbWl3jE57
zm#N~&Gv=Q;YFFl)no4|2WW{4(6Frq7DZXHAJO2nbd`*9>?ZX2;<_$`+Qm}w*10zg!
z&Y&aZLRyWY!w)_-e!0T6c5h3j`A*9$)Wy4DJX2feluexZg(T6HwEHK|2;9dLnAY0K
z>k_6<jFb%rpOwi%PRINiku=n`ynYJ<>nJ9?u2w?@M{}<WWT#-#r*T@PyI#h}H&k#U
zcjnTS_7Ot-=fRr>OwazDM>L@r#wsM34q<hD{MjZ1fjqJJ?pbx6z3kEzcH6tWoR*?1
zDPDAzLAn`>f1P37T#U_M{HD**nq{o2tDI=Q>59C{8{;Ww40XEV@+KAaO?phu*T;I+
zk4NJTWbr{R)GQv%VEbUG0d52-voSP)my-2L|1%5VB|BK(GzK5}f$<#s%7W6v;J#Zm
zA?}gc=n@Nr0}nEEZ{gGpc|!__%9Lx98XR~(2pwf65y34h*qb{l9vu)N@%YWXsRvoJ
zp{XvNgn2no!o@3`4AHaz$9LwR{*`NUF-(4hmQNs?Fo#n}S-(~i!sGt4F(}^f+9b|K
zM_x0ID2|N6wINrh$$<^VrI1B}E-`9q|LngBJ1RgZ*ha7D)&9pEyprRb(r*iy`5d^K
zXz*~>B{O+}MC0-|oA?%96ViRaoKwCpot)~g>)LjY?>ieBB^^gu|K!SB?xiWSZP@Vj
zra|h#ct2^Y%>$2t*)|>*j&R_+wMoq?7w(jo@7A)RJm@2n(Q(d5Tzdn21PLU=s%~wg
z{oB6X=)Y565%&jjjY36GxiKu2#qN{BfrMP2h~0_)g`s?tg_377&Fp-;6PS-*#Vx*E
ze}?w>)<pCObgqEV)#$Z^;V;8`Yd2o0aIHG6OA~$J0&P62&E@~_J`%FGQIl*Fxex!0
z=`p<H01GbdSwBj9);c;NuOHXO5mR+N4O=D&c*$V_BW{pt;1-T0AB1^rCLO9-0XFD5
zZ^~!V3pF(MnzFo9F=rdlIULUE7^V9&ehij9(-mj~dOrueCVoo#lt~6D{eY=Pty1Hh
zA+D-^(e>FNyX+&R245ZbT}|jnV?~k2+Ug(DZlB&U+zW&(sI1AX^WO?${VvMz3;!2O
zil+|XK~Q%oK_8)A5dO?n=}E(?4o|MD)DZ<k<G3&G=b;K4*+n4-<^|3*OGRp6{sFsX
zQ^Esri751sscdWuOw(EovgOUhKKqq3MUE9K&#AW1><g&WX^PWaKfMA8nOeOCseqXp
zDf;+^{!`>F3cWFTJO0ZYz@hQc!FkD>2Xh)P;4!xcsJ!w(dd%&2WTlY=_Bqt`(5bPE
zUPMyopSyV!3g|Mn{{0;2k9ltJkwJAb%DKaBau%^J`6&SwM9qfp7$tSYidawtkf6$>
z){xFj(@gD`S)-7{kAm>VmQE$SMZ`Z85RgyPQ~w@q9l$b(PPqPvs&cM8Oqy<$2hSJ+
z_`ppCm~UBZ|C<e-c(=)ZdpyU~&N28DXy_w8N3G8ufrfvIP!bVCH);m#w3jzs8_)fk
zufcvjH4BbHul<EoIcBH@9p2py&gC>NZciu1$6*#ZsQgYDc45@>C~y-mak?Ap_I3G<
zb41-#90(^Z&TqbULrYKw%_#HjV!t<BzVGE~a5!66&p^5|xw_M#u(g_xV&Ks^Xs%o2
zyZ4wcPP2vZ>@U5+-aYF1duivNdmAaFHdM#Ak`Ytu5En0WqgZc#riSO&&&}?e0*?W`
zo7{S^YdcLBpstm?WFl%~T1iCS)Vj)MQ;$coAv5n&w68s7zRK=`klGm;Ts~WN?ws7X
zA~Y;SmWYBiG-#K{R_o7cRDmZgWphrng=|;O6Wa}f-F@4Rp8}ieg^&zGgq~*^r9E*E
zU2rm6oe-X2ZpF-gpYrkbe^nq~KOz4)BLYOZoEG$XB2Ye7pBdgzq=Wn{EkV)p-QRh5
zQwKqRR+?AJmK31$_42OX*sL7N(a4sXE78cPJd-wmKLb^Qni{K`7{(=NP{NqY)bADS
zyzFTl)Q6s{ZGU`KKZI@xa~iP}sOBCY55;97pI-IUIhDduuF7Mw<JRs+1Jv)8m&N(1
z!UE35?dN>@xiOlsJ#?*;wa7`<9NV+Ut0b*xr=nM;+<PN0H`A~2hZaT`=u$QMeAsSU
z;8u1Ccq4_<^`(sf3F%YjwvC9i4)Y9^vlb|&s|pL_HHn>zf~M9sFw63-S9wbXc+V_&
zI@6WJ&*Sf)3cv_>5wEc~Q!EBh*s8obW+ncgWTA_UsY<7fT@+?AzFd`*W$bNwp3iRb
zg@WLFg~10TWhnx2cF5aCqpRQ=kG%Cd?Pk^RSU5+nox(9ECjKY|C-C^kFDtfbm6Q?6
zG@zWL;XtzKGxPy+-*0R#xlhz4^MeN{(?(uOTDlfYa!i5XEJBw~;x>@8c{L3+KjdGr
z|5WO8{JdflbJNx_!Hdq%#3X!w_R=>^=&N8mHbF+@KCnP`$FNdkPiIYw-7U{LLhEhY
z6w;A`dh4BoqtzgGzp4pUeC&b;CTjHVL($~|JFvZwQ#+w*>hAq~yN6lqj%5{pPbxq1
z;0@u-4x;!xF83G&E4<&`W}PujXDa`m;2INBYM+~GIO8&(Rbm?NB6`aZ4E!3{#R;12
z*_&d3PjVVS;JV{KLQNDvd-K(yzywBAH^7u*7?wNjVK{~BJbO?}%5+Jsv!MmQOs;=6
zhq*4CP0Rc!Xo;)qyOBJ=`x*IbbJ{Te@Blj<;uU<gi^C=6XMJOZ&iPkvM`HKJd8lhI
zKU8yMG)u<;joR~T;|v_%Gx85~-N><VJZi5T#q44z2xP2ZkU9F5G1rgoJ==cVW^p(#
zcE_Vc8U;!Y_*;HOEsnQHE7~8<fCIFSahv@wMj4;hoz)7Xi!L_NkY6$)_|;z^o9qn2
zRaT_m3G)!B2cXg>2$(TNigmAt1o@a8jS5L7nbo7=+}K@a(K|>WmAx}!hL9!s%TSSq
z1O9u!$XI!hE9+#+Rh}lnMf*jhZq>Ih9s>_=0PwT#WH(F7SG+b_naRb97d5u90-Chw
z`n(SlGpuRHGoM2|H=oNcJC577li@}>cbliV%)BG}<m+WcmBL*N6C-W@k3MWC&_;N&
zhI(@(u&SVhxh~01w2X>@U)A~7x8?AIk|=7HH!IOC`d*vObzg6oN_5!WNQQq*(g>q@
z26pdE=Tx#iJd5Q!m7RtS2>cS^^lCB<r=lO9P`Op1ll$S3LLz^U<Qi;M{d~LdXwN>t
zRfUEJA+bqeuGj-yUO<7~`{yRVs;>%iBAidc_R{<)ku4yM^&2C!CLTTn`;+IQK)o<B
zBKUG=N><g0%bwLZthRcQbgbSlbo{!-$aDVTe)DFp$p_fdM{Lxv?ZWXlnn&V)>>Vp~
zX#27H{DOh{54%M0cqb7|;QxS^TX@0%61##*FZujp+vFW-*8R#9Me*GSy{<$K3U82F
zupXuWe<vtMa7>D(J197xnPn>rK@)yOcu=##P)8K4&yo-sg(}+a+qO~a7<ogUJJsHP
zy>BDR&~!S#JxTTvCroagp@lN&bYkWtZ(i&h^`5!3^A!4q|BIX>t3%mkM<7^BMmg(!
zhjjV}R0w?rrZ(vB9mSc{3Rv}_FgBt{I#|)oVXwNAuG#po^Vh({d1hpX8Os=HDZ~wz
zw6Cj56u5WH&cxndM8!hu8O(=_y(jpu{o6sT-5YOzi+<5AAKWjfx>yPy@`aCnAB_$I
zc8eqG?A&1_4woHDG=4mlX0KW3KFSV~<4W%4l-1G9Rkh~yURU(m!EO%Qzx<ka;H?tv
zDRRFdmZ5&MGZOA{Z&XB=?leyj)|u@k0%8Z3#q!R>M-;{(6tCD!Z0qz9@_LPFD!3jg
zh6z#Mm#>B!+Zu4=I9fJT9~5;1jQ)_e_Ag2(&AsHx4ai}54r%p@n8Zu=m&Q_oL7OV5
z4-tbWDsVuKI6$Wq{w-3?0|Y53W^&t5y1$dlK3#XKU9cZtMy*|_I#*ezzPPVDZKBDA
zA5}Q)9!UqNOLI0Y`MAANlgQ?QlS3n95nLQ!d9B6j#$AA}gDxu{d(l_WzQK-{RzpD_
zP@!`!k%wY2ByWp9!;c&5DKc$bEN_OmnYQ&{kpHj_Lhmso{IqV{lta4Ax;@N+QIwDs
zUW4!4*M4r;$rW&Sa<B2<k~sI}J_PvEG4s~Z&xgZxQSUn%R#=#dX%wv@{wSq0MR~}Y
zd~rjnx<Uy*mYjxzKl#uYc1uBm)x)=iDDo}?+VJAc9p_SfGatToFu+K`i!<CsG@WR#
z_xt=^(dK+|0^zuFO9!T>e{ae5f%>`yyF=Wr8sW|3`~K(YmHZaUC0B&HuT|euf$4hm
zoD}5`)`c6S-{1j_nVY_BU^~GWL<K2ShAZdpfQnsq#VcJKTlC^F>&Kf-j8(Jrzk|)%
zB1;C|xk>~e>Yh){V%uF=gvEis7^RF=rWuDXgo`$sRcI(f9f=a@2?F0Wug_C`8mHq=
z^2jW}-Ar6dX~6AEpTfhYUc(5*$dSt<@8inp-%9BQxVqJpA2gMkIuu0#g92L${Iw{3
zeg}c<Zi=$2Z&NkSJ=}*0@ZscvvA1&Zy=JfI*(}J-bt%zjLqYm$e`zIuC=S)BY1av}
znMfDE4=``5d*E3ZuqQ7+NxUymPYSXxNTv(?S@Hgz7@Bn|(nPG?m-tbM{>$6zv5Mg(
z!o44JG<*27o<T&+;vb6fw90LB+$2E8z!hyfmT<S6tBEMwgqn1bnRF~$Q;It=!SFlq
zQck(c?XjlEcgj9{VUi~Mk~ry%o2U5p3Gd{OZeweGn%l>_gZn$jr^SD{$qO>OXV&>k
zG1Pgz{Z65>#p#grdz@+PvVL@-pLBg8S?t2e)fw+0EkIv5TIKRSRZU5GutofOW!t`M
zLOgpXL|fwv=ARNvm+XxRZ2aW+*V>#oFXf*X35X{9c)e@P^ZrraMaV2SUvk!oUDuz8
z+0W=f!QpC9@PqnFC{$HUe&Hr1btQCjfDH=Xh;s(gE6l4#pqDk*_$R?*<5_sCSyR|F
z5y<72DkPP!`C*G|Q0mv_Ebo|RT<B$kHZZyOW;A~Rhck#qc%W~TCdV3Nx!k;ueigEC
z7XiO1Ujw|QQ;TmGH1B<gByDc*+k-jn^Y`QU2`}fq-a&T;GeizCV+GI?ruRr}HF(z<
z-zt>78OFjQH41Rf3yBzl4<LqFBqy!dbf)cUu{4_&n;U#$^?=)Pb;o$Jr&R?>Q2tw9
z3c(X0nBtpE=xLdN9AwM`z=3<T3HFTn%aA~c>y3VG;E`|iDV$dx_vqpj=Pa&xUW67K
z(`r2Mrr>^{{f9ukr1pY8`7Tb%RzoVCVOyd`-DR(fyDsBs{Ei4vld7Lc>HH}UYp4?L
zU?MR!!ImIBDErD&UU9RkLqO&DmoN@si~bJz!E4&^p7TVM@b<IjaK+EKY>RK*!e~U@
z?UXEy^}Qf#mC{}7^~mpnA0IxgA!?H1(AdbbU3Vlxh#iwnvD{N!R9Tc58`;p9)at+V
z&z)n3R%x~!4tr)!KI_)kRO>AiCMiiX5`61R7$Eqjwj_IW5a3X&--fMKT)ft|vURe)
zZI)&nH$0QWE3fYB`Rg{cyGo%*?c4{-==ko|k%f->ms$1#eT(9{k^U?c$L^t%;|`^x
zh1+$mkvqXF*t4PMUH2;+{3C}QbIB^;xXFD}|M%gc`&kQ?T7IdNnjl0UW)-+2-;%0U
z`-5?<^dJHspYPDR*;GCQG_EAJaRC1PxH?U0cY{VR#)^TQ5~*)o4OQz(Y0qbLJter0
z4~D5V<W9`*zi|AF!S}k;i=4W|`de6k7Wi3ztJ2_mc~P}Ft|(txbncDXH7$E@nfQ2a
z4Th4?R0P?ZZ45rY4iUY|JDwinBJK=QmUA(C_`H-?y&w3TiTRI_p<r&$VM16|-%J}n
zsXDx5+yo(T2Z9>*+^@Z4J+vx&FFj*^rIWD}!yL}-90428s4Q7g_by(6BB|2pJyKd~
z&!x)(<Av|WvRYCX9<_Y9c=Ud;`&zq3<juR5bxv5H%kb%1R=NU^QI0NEm(7jO|IpK7
zYtiW!+WOqbKFItu-0*vYGVwX)kGR_^%yBoU1X_k{L`IBLd^^6O^@O?ZSS?a+6}gdW
z{9oP`34bC;+E`<*UDTrKImH&Xn!%M&b7iMFtVoHAt*kk~y*me1S%v%g{<6$P6askU
z<2=A+5n>PJr{X$62Zk?Doy)B#`6L&(Zb)$4MdN(1=Faq_n5;N25<Yxkud-EJZJQQ+
zk)p@cwf?SqJ2+Sg8#QGUc+Ek0vd@3;=B!xuurc3R9+XYdUWurk@Vgy-@;B9GOVt66
z)5Hil$%FM>TMB0Op|7)R`)CKMT4`pQnR`a$ftx7MV6V8T<kkDw@38`cZnj5xQGSwE
z*(plW9MZ%avXc{K&s!Im>nW1B61vBIzw!pstiRLj5a!QMFJGH}X6@s_5!ohO2Q&jl
z4(79@U<U54XETHm;a~^ttsMHU8!dgauorBfT3IIFhdg7_X|zBYh6WDuR?RNl$%t=4
zl^=2gO}_A|UG9YZ1#($gkt-9`T_@fYML%HIbF_I(E3#+U{we@>Z-R!Lb-B>jaglyH
zyi;xZ@??)J($ac9pt-)45Oxq+a~VyHE<qHzi2wzY_MD4x$T<V~<egWlrv11{g{KJp
zjlvSb3ztGS>#*ko(yHNqJmv{OOBfRFLL8}lgM_dQQWxNL?kUa*1)Tqea~9S4mc?V}
z9(1P+W4f0f6a<mE8>*fOTFm0+p6PG>XE>(DlHG5JDCRETY{84n#(tM&3KP19j=c$E
zuLG>KZy$3KAe0NNBfvNCX0QEh>}m<jJHvH_^4_0lTeHii^it@9zl{@?Qh<9hpJ^Do
zltoX&?%RGrv5umJ&9v5b#x9C0HnfBKqdLl+O=WM!qpqY}^kmaBbGF`@H?~;1KXv!I
zwdQy%`r*$@F@#I^(sUKhYry5sa)wK~8YU6G0mhx_xd_@xl%`$+e(}W#QSM)LI=qz%
zK1L<N{fCmKhN}cGs@17-nW?S2)6cD7RP2`FaE`H68e%yw)S+cJ_&0pwN94`nLzxIR
zX6}4j1ax2%S#+2+(-)%=A2d?iPAH}_GB3y2Rh<ouYJ-_jtJl!=GIrefpBH7X-&oJ4
z6JLuq$-BvhC$=0txM|B*9PtX8jtDsD+;p@ziFWk^SBhI|=_wj_FbvM#mTTw?+qLe%
z8#~O`b#;OOjw@OGSvvK0IT}@~F8!L53oasuYGcx2Bnqtzv<Y1&ufqAeLNj3WwSs?h
zeZB8>{oUiz6y3c#ZYfXa0PVL+r{wHaQ#<PS(DQZ-<dLYA{4*UEG9mufU41Fjq2^9P
z5V^t*9Q%pK_cRK-0PC&4a%Y&15`uc)ls%hBBwh{Ow1VHL=Ne5%wvCaU8^M$r!^7}o
zCFRH6+=dVTCuo)Mn6*AcnKdl5i~?UUxv6te@uaGz9>{aTAO$5jQ<~6knd)~^E<cu8
zF7wU0jsc`5Y*#49KQ8Z-0z^-|S|Y@$coOP}X@7`--s|e3J1aNosWZWb2@Vwa?cEn3
zs;)7zxZs5>sWVzkI4g(iU1|Xvj3Xsu1?3zZb*{a>-th5~M$N{2bSnsu^J{B1pH5q7
zyS`7&>VN4KhH1*Bg+yAzf@Z~f5d7x$?@R>;wu>ii_gW{84dd|7heCr2q*!M0ye#qw
zW24^%r$338+KyN1zLEROEDk@7zmlKxPz$h7(K6u7rYznGQfIzUXfPiW$i54;UJuLG
zhmsNGFf*rUbzK<G_U>2S3;p0Te4g$^(`3*Xkt?Thhm1Pv7_hPNt^<AOx*nkaaYEq?
znIN(2gkZU13BQZ6(RO3=gjhW6#D@~yUhz1#`~HEuo@Q5M78>?4?&c#eD%N;h9brl6
z0kkJ`Qjz3>_NdHyi#}{O8$`omOLUZ00{=9O`<=J?s1bM4s~})zAlyH`60hCl)z&0%
zfUQwpy-_O$Gp#Uu3Ua2q@7`=r)873{yP+iXW;&O3XfXQTblD#TO2c2^zkg7+MPK+v
z0*$$2L3ye!@cp||6kFw2w?lEaVGQW)d-G)O45%p-%y~8+;MqJv2+JFt@y1Q-661;T
zeH*@-S`fn>7JwrGhA6q{FU`=+2kf)X2h5kwUp|35s=S|yeCx0$h|`2&h8CpaasaZn
zP9WKgxT`YF$8b-^_>0%CAPhJHTr9O-`djVo{TrpE?G?Ho4nMa(rC^oYs8htBoxJW!
z2E!SjjxkX(1330yrQ(`<wS041#|k$*cq~o$Zgr;R&Sgq(4PG3+lFhR<9-<6Fv3Cj_
zKTb~y50}MyUIX~67Wm8C+3HQX*ivKlrj02(Sb+Uc6UZNp1P>2jdz!Px+m4!59~ZYh
zwKLaVDEMNyd~*{%<AzM#g05ZoD*CYKj!q!>RS#;GGk-OQo;03)NHOg_!>|t4rg_JM
z-P@8z@^Dtsn$FiRhQ<bKMXxkj!i#Bd`xwCDGc)=7Oh(^Rn?s7%JCKqN8M#I|c$LF&
z!TRH(u3WezRM5fZhLIB8)6^hR?y`}ARCRLWvXPKJNuyMZBmaXT>ZEV$z0iU-8i8ON
ziTEospFja2)27$Zi$;$^;c$2UcNLw##o{a+Ahjg2yMML4P!fCNC@}plFXf>%%%^Y)
zK|aoK&qe=0X!cBSa-~VPnmgkv@>kQB^)^AM&dfvYso3d2#e6^<0rXzt8PvlP@eB@u
zqr6+~u9aYY#HI=LB3}*N!@q7x6D}`981EwBymbo&c5n1`mh6JK)8q$QyQ#><-u9O^
zm$QM<cevMM(h?KS_1e*{ikn2pFD(!-JMko|z*}}>DY(tmOlG)l&FLJI#C<JjmL<@j
zM$*vH4vP<%@^8F1qh1K0OX;lh;qRA50RbR6OV65h+|bC2mak!JN6(!mwPN{e8zb|3
z8hC{bC?VKCEF5K4J>+^ZGGObN%L=petY`0?e4t3eT9BvzK-jPGyt&E(&9toTwqjZ;
zzx9KFwK)IDp|WJF<*Tgw24VbC1BHt{r1mU#OWDmAeAGNES$x4j^r3;?<zK9%(Cc|I
zuiw%~k^QyLrcUwsy3vcO^igS4wfW>GR4nga!N4_L%|8N?W(!d77xY>(*K;I?)Ug9E
zs}yZLFzAL*XL#6V7o}lWspuTBf_*?Da^8}#mrA1Ah%MLY<Z(aiQO6B`i{E>FKUQmg
z50i$*HPy3`w2K|ha0V+M-eP-vkcELwi3p*EfhGQb+{aJ6@2N%#CkEYtG^75Afv!<m
zK_*cK;lcOTUpeei$PlKVHlL4W)vRKsA-=N9KK=>GK!(1rdL*vlEmd(44>``@#kANV
zp7%!O{0cb*-lZId4fFI|C(gG~$>9U2BonGmZ7D2_hF+(rp4KvtE|1q(-dK9PiL1lh
zcHKKYou7m{IEidqIsiO!aFeOHYz;e+xR}C%=2?epT2EHt*CiC{bsvIF>ls6RCe)~{
zNUDAw`Dxd)J;aHVFPF*7`5CC@0bC3~FH4v9;UdlwBu<}y@wD)7so4=;5I-&2L9$Pu
z0gL{jn!Q)4B6fzt81WM`*s=}Ik3HT=5&fX%wgWSs6kTx(N-NTOvaR6Yq;ydPQym~*
zlo+Sd$N%_jlgYp@6)#0nGPaa{<i1CHgK|UHZ7B-OKf)RPNcpDe9&<Kj8H#f(5D#}j
zs~wT?{TeFYiw@)pO5W;jnneo$VNm^bmH7MoP4)ASL#AvkZ}Ih|B8_{1onYJp@!k<i
zxGwReX}^RQ3ck>Q3SxZae$0X2g%&NFy6kx}9zKSePqYE=L-=|Pl61YwUAA|1)>4Cj
z4*`x7jw0>{Z1(ThmTvtxaHeRQksh_`*IaP#@3Cgm3qh&S#|a#8`FtKFB!cFJ_IGcZ
zTra+r-xk^#W2x^RZmB&Heywi4MQkEX?wNsGIMy2i|43y6;T97k6MZEfVKmKWB~o6B
zLy;4iRB32A%60R~4Pd5kFaKGDqQf0txn`np%~h?w|A&wEZJT)eA6@7%V&vCMV)uC3
z<M}<75^p`;#VrGjPgjC8ByTVFqfa@A3;c)xU#Yzl$Udc9yZ4T>`{3`EvWyE%?2<}*
z#oSZ-?7;1gk85acf&%Y4J^hL*Xq?TIBG2J>_a^PR2!Rg?^2qg7#sqxnK(P4$S^jYi
z_Z&y-(rIE%_hUsUI6AlyeUSF-r+I;~Gq65;R^#^QQCPtf0L1m<Dh9tK89p);&Q$X}
zmJV0Qqo~y`cYc(;Ry?`tyMswm@MjC&uHWwV2TheI(2ClIJwlis_@qsNczRz*gEkZg
z?g2uBEFc?<d^-Qog>JSMd(9_dU)k7n*!&_)s2hUykb<qxyB!i_$4vcLTyAPy|6Wh4
z63$MmE@frvxchNxgOjb4T$i;*$SYi(R`D$bWc)j@B>_Hk$TB-_p3{va(DEPh+f&H5
zK}-6D))U+JbXu#)D9k;<e?`8xw<O$7)2kM>p|w{dWp&L55<+U55^9RsydbBHU>J75
zrOjAccjjIDx`+vthm(PZx4gkBs*|l}f9qw}@o-Nej=7~ZX?YAXX-6b8ypIloVCuH7
zdozN*NeQP(V%>o1pPC%fTYkF$J=I4EVe$P<=uNfL8ct1<${LjDcfXy17=G{y<9kz=
z8Tn&X&;wAnPbTj@z-516QU@DQHsYpdtE6UDFvbtBllFon(zd}_b^YiLN!5r&!(C2X
zb7Ap_6_+R?z&T^Ujio#Bqho@I00HyNIX?eVQvg3^6I%AC^=iuBUKgb&Vp;T~DNm{p
zjctF&9?CX&emH(&L_nf)tcdxR26e1DFJ}`)yt3n6%kq}XZ8Lq6r2;$YNcXRd`O)#3
zODg_@%!Q-3$y-CV>Be<G7-;-?YQ!hJK0nlC_Ldo?ENh0qR(K(}`fA6sRiwRZs=V5V
z7^rahn=*3_U@`fHQ_dgWEEOaYKU3ry9QcJ5SEII3>sN1)=Dbl&{-EYO`rO8w4LwvH
z2y^<zj?*TRnvO&vUS@ge>TIUsb(W0?_4xjfox6(#h)Q0TuX>pGM*mD7lIw8LeM=|h
zRMR~O+dt3P`&ciP-Zr5Xihb{T^MM7^edwX)L}{Y=fdq2~?U6c1l6`U0Lb`87XenPM
z@OsG9_ryh|Gnos0&Z#lIe@#!J_+n$LU}vL{f`B?4lQizj-!<b0l*0$#@>djWJjo^j
zTa_B0voWsZkp7%8R#ueEqYILUcQm2L7T&}O`a&h|7b7qOhRm@8c6ML2LyQM3YxSsx
zHRSAuicRO50tfZS^L6k@>mEMbc~q!YfLz_3Wd81kA$ozpRIlDdekmQ~#RaE>WAeh;
zqU-w<(QqC8S-n5<Bz<M|^~As4`$-5>TAQzLI9!5bKoX7ZQ0OE$#H77!-Z?Py0|kBB
za_EQQBQg66I8}F{=(-Ys6A7+t>VgaJg*O6|aQ_pYGk_$Tr49!^vuR&MFDLlGD~!lg
zIPPw0LZNu6hlgHNR1m9EMJ5Qt>sj-A^rDK1FGiflxUzHfNzi};_2$~_<rbZ#0MM!0
z8E^5Ub_B2Z2s&Tm;@n;tZd1q&y9~$ha*Mcpnog&0rvAxuPB2QzBhHrz_#=$sWN+b~
zRlvKN>>!6XyRs*GTawHNm6r8YSBp+`qQ<p_giWkW^dF=3aY9*wcUg!w{45xCPkcC8
zbWen<djv07WWs`xCr1t9L4&K?Z+$K==;$+-2eqTupTA>d*(&5S^e@Tfqi|6bw}XA3
zL5%M{|8YNe_E?YJUlw0uS<*;b7R$JNnsl5t`qBP&9_sX#TrG8S`65r&A{F-tGRGG0
zCt<ZS8(=NX$(V1x=;&Z9(rrs%OE(_c*PFK1varet{$gt7&J#5wkT~3eR+iPr<J2rL
zSoS2<<!7HlCo8+=1+m8NK`UyP*<2bzdG0q0$Wt}B_|J^;!<O>eA6qBtzDEKP_n`#u
z6^zr_Qh^i!2MNjB7Z)bK9!+uH_rSx9v)>-~(U~>-9u}e~xy8e{C1axIg3-x~5B1;r
z-Yk5)``lza4zyD}WEyioTKv0>s+ElV^DwIKs_^*<E5YE(_InXU2Ct@7nr=iVYRF*P
zU6y~K?cPhfve(7W|3c`kZRCrcXQCDH=v9+7S~Bl%d8va)haRw7AZR~&uSM!-!iQ1k
z7{4y#08H-e$MxGQ65j0jut6Eg;L!#Y1BYzN!YkJkco6d6sK#<<uxE1LcZB14y&+r*
z%{7hZ&WO)%M3;%3uMh9WLH^v@3Oodf*3^mhLy%>rdTNGNZoJw?;qA#$COEM@pHOa(
z#808UM%T~g%v|!gRq*T0Ro!(cA{{Qp&S1-NQ%*;rrf||+#_sVh894y-nR{C8Frm(E
z>_-V^C=A=%-Nn<1TYq6rmZHnf!M(W?{Wiv#r<Jt%(CfBJJcQb$tzBW}hwG^n-=vL?
zZNYh=a^N#e_-JwgdI>6b^ot#CtOc+ZdhduspLBoGD@!5GVnT(WTA`(ROOTC+<Px2k
zfg^r@{C{-41yGe;7dA{OAkrw(-KBJcQqm35sC0MNQ97kl;?N+V(s>Y&?(XhxI2_`?
z5uf*c-v9f)nR^&!1m<|}wbx$jinYJA){+=3KeC`m&f@r)jqsC@?Q-S?_Ls_?b_QqH
zi-mUqx8}dC4d}W#r3$61?yf)__ido40jlAr8rz%?vTsUo4HzM-vPT^`%KO{))t3i9
zs`2k~DgV$o4{cE=(xLl%a)ZXz9hkr|>^$%{ao0kFS{?f@L4~(DCH+YX)T09lw=NXO
z(=VkgvSMtBH~*N<o^m7nK=xeto~R(?D1Z1_Gj_9|4^zU4Maw2>FAs={`(@PVRw8y{
z4~L4BdF<?W1#mPtqYZpW57KlC+ry3^+>3apzg$u;7~6pDy_-E>*E7LuI|uulDfG*2
zXIy%&is6>DWE4WrWvzX*<)0-pLOUkzoxrmLkLw#M*m-IoAJ^Q~6TkaC2bxOB!~1bL
z^l7PpsO)gca+Y?)=F`pLDJJM0QX%)rd;6GW<maQ`2sDo(hCa$kbPcOFoYvR5oNM~F
z%6iqU5ak`W8OU9Zi8SsBTTTYbZ=e%L%j_}F2-kXs1S*Ww)dj!^5=g?P`GO9=rA`^n
zhU(p!kX<$UJ^FqQ1?ML?`itD0#?b8_K?cg$Jltud(sxXkw8C#RoXKuN9@+l|l0Sz5
z*V0W%bw8E)(fReYPuxS#{(yElVbj|gJU7tkWU8Hvwm4-uQfn#omSOTeSRj912F<$j
zA@Rm~)kOPO;fO&Edajv0ABXC1IuvVEfWR;3uGcf)QjR!<T)}0Bnu9)Din*K4!|k?1
zDi(b>-bh9Z@G7hq<9a?G)JM2?*gLFI){Uw=ELB0dJoVCsXh@G)L2JhnbIi4OVO_L@
zzq<lr)dDF9i>1&Kz%<@P9GIJ)dg7u=BcsF%bDXi+9e7yAJBmRA4B;2qNJyfYcc^k{
zu^Yp9Qp4K21wyY~ElvSRK!~tRM+|-^B@GV<k0(@;$!8&VAjz@bfq&xrmyR(_>$Mw`
zldIjkv=*32@yxClRHHKB#=2AimUe+}rN4YqSF6KF1&cmbm}0Lsj~e*lk4J%}6FMyQ
zmY}{<Dw9aSw_!Q0z2=AD(e{abd!ke#dBmP4WZfZ=sh$5h%CuU%iMM!7X>PL%8%3Sx
z*5GDLwz97k>hN13;dH9@!crQU`p77WFJWtQdwY9P@vhx~Rei6btFLPO&^a-6SF(S`
z%(0*<+7VmtO2rmC6)8ipqP~jAtU2e4EbbQ1@dYb~jdJ>u?y9DOU0QvE-D~a>{H+9h
zl01(yr9J~&a7~3+iSE%V9$4d|)V=5Oz&L1%s#f?xF5KKqJGfq#U_l%0MtptuVatCz
zxQy2x{eOm8xFHk@LSD5WJ$bU8vh5zasdq)dau`Wks8v$z<;_2&akc^V{q9BPKjFua
zD8%?4|4)&>=!b3gW9&o8$~qokkT=T{w%E!E3^skdpeBj{Y6L_dRrrx&)vBCRM9R!K
z(=r<on^f3TML*|*xyOJFI$>(+wlg1PUE(|+tnWUi^DWK{3`9;83vfb!#0=|gV&2PF
zI2<v7)g2%=ik~>=(yiF7+p=(LV)F_{_J0bnNC~TAD%>yySsC-`$%FeN?2Vt5ufsY8
zw))X_?&-FeDABA)NzoJKr}AER<3!Ihp$@-AVtl_mriOq1w)RJ$Sq^`I@RC&KrPN`R
zBL8s_Wo@vP=Ykbuhz*-j){i;$-GvLS8sGq}EHErlu;$ipuHvd9>#N@G>#1ofNv+QR
zQZmO`cU3!ULzXx`OH$jH+kb{o@<qC`(97+u>)PstquWAWzx>{@riVy!T&YL>jd}En
zYogA)L7$=vG*{GEoL)nylrlzVz?^OAT_QyG?K=1qI~)0m2Su{UNU`~VQGw5Z-n>TR
zZdk#<#F0pPxFYTG^v$x1t&1&`a>d~ME$feF?<kdesXd{nM9+AE`5i{o1NDCF-Rq5H
zmm*gkgDJ5jNj-$h^8&w!N0#7miEBLs?%G;*O0%#hGanR%55`>B1OU%P`DI<_57qXQ
zvw2xjjTciq%CD1|uo@KIxqm9UN8fawF|Xq$4TdYZtcr7V9vmcpi)uLUj`2M?ux4i0
z=_Lu9)im8NZaVou_WqFnsxH($`XS3aRYUL;7uAU|d%ER?Hg+enltWeK_HPC8W_&aK
z*()83V9B2NbB5p1{e2=bcelBhVpZLcxcOg>(NvpM+nlm-5>lxjjlG2a%A^}a5c={`
z#;47QQ0Jq{0V`$ThUP_*h>tH_+tBAw(dkUWJ~%=-^B+85WTr;!n}f>v;PG#VlBY>$
z<3gDgZz=f=Jg6)}UJR90@PJco&d2q9b*Ztx(l<IM-cOS-T*Pu~DRE&*ux*pk_lrGA
zFBFX!VhG&y&z0~R;e@I-x+yJm%<X=!*VuhWy0D=3s@nt~YprHL!&?CmIss_!7eORi
z5BQYjTjq7LRuLhUzE7nnW<sGq1lmvl&*U&#CA(Mu$AL^L=>ilaT=z4!6knvf=gw;l
zt<uB8b8j-BK9G;<L8s9isf;cq%igUB?AX{}wuBy;L_4^Y6BmCdy8}k?r-1q98wI*H
z6KH)C_NmP!olM*JSqv6o&raGb65F`Jwc9_++zk$+27)YND4q>tgGeJjK4#txU2QdE
z_wMBQaZ8gTZbcQNIryfZq-Phnz6V*$K(FIX>ASYdcm9>%tw&09S*ljG_#+N*z7*Os
z(|in)%ZmHZAMz+B;F|H;gu1Y^DtI6I&9Z)H?7;GF*ptwbt)T8(`X<R0YTw4dE1?l?
z4~L_VC&BH7vfDk+q87qAgu#UWq$29U=g*X+JbZPpUkpp=%lSP&5J&yXzuccoqU0d2
zy7r?9)9JNa4&%P!Yw}b;LmiLk-ljs0;KiX_@{ZYO@~xVgPoC{ZI+4y&jt}u)9^wt~
zDY1CyL5U=0!}DlD2z%5dIHe_pxRuf4(2Wro#M}>W_|A(fdBD=KjXyG(V}fmXfXEQA
zofEp=uWrCzL$z&K2Q12`B4`cR*7HD}aO>)`9AW0xR@d_bu*`h9M&ea?OP$b_&0*lw
z$5;c&=o6MU(mE%)MIrwL*beEK-9lLR#T&gdH`!1{ik!~u!mj}F#hAX*m%?%*-wVPG
zpz_^}XciaH&y~LQEHTmw2`<yJ+;$H7kQKazHH0F66c3D+@uo3URWDlW{OFQHXv~Y>
z+H2Pb@nsY<diMcdH8H~%zfb=ItqTP@0>aP^p-u`_kC&*o5>CW0nPxtd&ztFlx~()!
z$Budi-SqY2PAN{Rz7~!y=xYV4K2E6gL}lB0o8^k)^ZTkIAA2lB#*=5_uMPIaB+@%S
zjixMmx2J%c2ZVOs`T(sw$YB7lj@5jAvo{~i@>$5v<?wS@RWSqkH28p=P4w>{;1xvz
zGig;QMZD`^l0nL4)-$55C1>jn;s*@uS6e)WoPbtn{k^TYf!cg<<k#JZ%?fh>{O4XE
z8gi|u-tTWu81D_G2De5YN;>HT@PK#XtM{+|RA#(L8p%A)sY^6&@`%n3^BlPY@{V3%
zUu6m>jn18DmwX!CC)H%gYK?-X5*FL7iU^eXw$&rOO!}X1Y*7=#)d8h6mejp5kL|Q8
zpZF_yWp;FGu(0tCP=-6z{Izcw4q|=L!=7(1Ix#iir*l(_I6s<2Ha*^;n4{m3|Bq!M
z=3f0?z#Y6OY(kEp{gU_|z5M%RVj@Wv;O;Rb9N=#%4hYkUT>A0Dnnxkbv=%CCGhF}m
zA_6W%M!=WGpdTV$9lB1m93`zfMS1ZYJo&lhhznSKVo^A(tHfqrfL<McVk=(Ov>dTX
zHsC>ZWxL1H6m>1=ySiNB^P6aB#ulL2WZc8b{)nSMVvE=plw3WFw>}NaZmEUAlHJTi
z%Ys^}Ng|mQW|3Zb32XDJq_aJ`{P~v%K6kQN%sjo1|F~+%p(t``85_e!c98o5%I5RU
zAHQ6)A;rrS;X)B>HwGo3>JV0JJC*yI=j`0Wd8=!*uX$1TU-m};G#N9nSNC4V+t|xi
zjGu<inO|x-Ze`UC`#|d7+q(G{yR&?4yXO<c)Dz@u{plk~d;rw*{P9^>p~x}}^Xn*c
ziqR(z`Ug|?Chi}eVS2GBSw@YFgXZ2|>xO`pNwT=``R*%FMSM%14}MMrydXYX<V&O3
z*&i)LY_IWtAT9`N13E-rCnWA-$vvv{#tJuUz&bdXoIevQpMZ8utLXmOBaReHe9vVD
z{5W!Vk^@RJDmnYM4T00XX>aSdFwnz^lK)|ep3RP&dEh!H0mG>7A;y<xD3hhRh8;CO
z3Cd+YNvR(s1SOOlDdOzsko+23d^ptJvI8zB<*_IGYNwK>1N*s!KWwfoSh*OuEAw$-
z4_NQRSD5@Cc0kzTQ&ZCw5gJsu;AtUVQ^p`i<vpX%tTe-H;|jsKH<kWrn1Jg@9!wLK
zrd!#)Ial`|X%R5<x%l~FZWA?N3+Zk}nAz0wfV(8-gF`7pHm4nI#p40+q$-$Xu|JdV
zo->nJREEZ!V7Io=?$TAR3@eAxPO8Z|RjdetME3n|8SQf3;&2rNStK~1>}JMU4wUkj
zCEa0fVRNz<;B*64u9y_jtPk)hPxrx5e&g@|6D3q%SOzl7(WRP00&@=dniM8-T;`jj
z2T#Rp3itoK&HH@*6R^t2SNwW(!Kw$;uA9Hfyr}=OBoA++5&K1+NP*(_Yw~)*>Z?zD
zzqp8YakG#i;uWfp0?qCJSU9O^o3RJYSVVY<;@H)(NvJ+G&?AFbAELrPqyIwE$reZ)
zOhA`TMH#|H6Wu0jLuf0`0YH%vJ%^HND%%m8Qqx3IZ+-%TE}U0y?(rdiq-ni;a`{v7
z6^jY0PsqHDp;Q@D0E+JB(lxf95mPWWIxJA#cELzJtYIp*!4|%(>k10}(|-VE0)5>5
zsD%sHn-BV*MgXJreGq_tNs{NViwCX^@NLgR@t;4ADowxDIKjUqeWCMHu!i;@dPvmo
zk(nO$AOe4^0`wVf5n>wsy`|%y$87<ry>OtXm|8hc_(NrlKuxW79pCB4+om~&su6Bn
zUCh5m$wt(Q&24wv)m^=+4N|nh*;ft&I98uU1%Z^ioZL_rLky%5gtdrGS3s#T4`f$g
zO;4&&)Up0A^SlU3)YwAijo99fARch6Oe6L)IiN%e00PICeV#Yx+ll0T1BsZ_Tz(~z
zH|9VsUOy7(d@U;Zt^X&*>r;EjhQ86b(~!gjN=Sv85+RFl0*fYgE}(YRK^l9*_h%O=
zWwvkPiNy)?^kHuZseYczY8i$=_GB(^$IFHP=!p%#V|7NGPP3EB6-Aw<JQFFxsMg~i
zzY6}LC!5eQA^MO!MA@+c#%Am>_G~sVo(Std#@L5*2{7#X8`bqMO43g9{RZr7G<-M1
z8>mgT)y0qXIu@#VhD$0AZ_@EP?(2hQEb>8xvk%3T7S%J)x#S|r>_YqPw0LlgYg*f)
zY472Q$ymT|<~huZG((01fYn8f*g_g>5e21FL_-Sz%%jIMK{S-p^{02Cs%r%(gKx7$
z%D2h@vycwN)(lWK2pKmX5lOV603;!K2&$mU2_;wYB1S@Bn04Nw?{2~)QtldkU(e|?
zjUQTe!Ioz&DW;xVrmKP9b`Iv3uYvcC{7Ljl0H-vVY9n%@u#vN7p6$1}M<E>kA8iUd
z0>ixkCB$Jgx!jZ$omBU00W%~B0*N+r-P0v|;3diVP%E&}217r=684V^ME^$RKXC<&
znPHzBI)#_#9M|K8p*4Cf%$sDOs*^}n=!sb0+F<n74s|!v4H_BP-&KkCEptJC5?`)N
zEnruK$*b+~HlyEwY%0b3`%7v+5DY!{MxJYrsGP2zySf{guW#0L*1fouRcuqmdGUoT
z6T9tIBw6s~UYp4wZilqr`Wkl#fyWH(KAM@W&YkB0OY2^v^|0^Vs|xv4fYQn{U*Tl<
zGWp7feKInFWCn#U(w?{#Wd^=Y3`a3*0$Nt-j6sy)&5eTf_ATTLmd4>Z-kUCTA|xdR
z+bus2Uyh*oAh?nU4fsH6lM73soz8|>Yr26k%I6w;ZnHihZ#}yJE0XQK%uSHj>szbq
zJY;+G5{s7=G`x}kKKzFL;x#3>zLH*nPtZVI=Top*3BG-uq<F<XD)vwhI+*tcLQSdN
z^hW#?CfSDZ0th2Nioq8tX-Gy^r5-a*4k@-0#n6Qs$~wn&bthKl#^lsrRAO^1dItSu
z=*j`gEC(xo(@*YhyD`;hd+sv?uPdsYJ}*V5N}A5NRemW#tC>>+UOru2n#ic#u@)Sb
zE)WHu6GUygj!a-ThZ$k+s8}<l)B90DGX&?;dJD7`l|DP!Ww`3Y*8BT4FLWL<D5Afp
zlv#@FHNhb5x`}8xVouNI<NsFFPc0BuHdVJz5;jHM3Xh;#I7+9_8;^2Vzql6pYD27q
zhV1Hk6)xH)Xp38U6Tk6H3KC3^PIHyXy-!C~e4NfJ3c*WXS%{aq^r$nT0+i+1OCK09
zvdzPnB3_#e^`3w_Y$pPwSO!E7NHZ=59WcJD9I(!Cry=h^e?dN?bKqO6cs1s!EHrKJ
z2lPr<`Xb|hJz=^;Vs}`%q^OsMw0*R$5_wwec8wfzr_Y;#N-)7Pk&h>m)PY?dQPt24
z|L)Y_v;=w0wV}`n^-eEhh@*O5xu>wPYZh#rcPC?D?Uv!#%fE|pV-tK!`u3LeyfEKX
z@YUx9K!6xO<JMrslx<2PG-7jwQDSsw=X|s*n<z=l{Um5;z4i8a5Ao>>RRWLgblHL0
z<o<K_GP0H11CE{XvbBplYz#rG-6bVE-iA|?6PVleORP<F`0K}$L^6n({dhE#SJqUa
zBQ7}DH=SY2%@w+t+I%5~_vtdaIs3|ZCa>0%<5GN(#?$ThHNWO_HyLvtZakO~)bD7M
zbs`rUq<VAtDQFe1?@KO8*Svb+RJ>!p=?#Sz27p*M*UDy&E1=M%zy=Ja!E6WaVpArr
z45&4019i<ASTFw=RM?Rm@R^zBNR~eKdHrG&6u;9lJoUWs?Soyod;Q{=edFq0Y8Y1d
z&hZK2pMC!V3kggxs5bhn_CVmhu|He|5+3h=E7wIi9J>Z2x@zAhy@J~~L@L5LGS<t!
zCrw8^u9N3gm(6-telr&m>08n_^Rv)$zAyeZ=hb|-x7M~3WMepfYElQP_BW2s#b132
z7hH(VBrNAH_~!7(t+ng@3Ydt~>U{~LLcQB`&w~I#yR7>f3ydxG)&z0rt;olv%G9JV
zQ&yr7$)U<~F;F6J<=XqCHSfoDYX~LR#iu{xiLOZ<F&~lUd;U0@;vzOP79bL!9%LM3
zpH3t0#fB+(*95r_KDpXfNJnMzQoR{T<Q;r#AWN2x`zWur9vyk#{&ION<w8?#*pa{a
zn*cF)5Hn=|ilK7+=#1{J^V8TEv7xbVVRhq|z1_1k51+Rl2tALSssgIF=NIehJE4%W
z#fz{aP0b#d#K5Vm+A&PRK#GUJ2a@EVzF)XdBAc*?-up|)a$L)YtiaSnP+7`p5;Aps
zwQF?|=tyYhopm8r)jmXBz3|?nJo11jm-Yhv&h@&9aLZ)Uasw3^;~gG`Q$*OU&100H
za4!r8U(8GV(elVU)?WU*&1HJ>A7?b@rdAi-rIig!&>^(q@6P)l3O78I3Dg4xL14M@
zKtWWv`(go9E}ki0UH_D9hsT6@ge7zUDpucZmtyP7d829(M_1RckJEjEhOFcRxx;ps
zIF&uSdp3m}OA<>{AbE@mgN;sDLh3^m#l->=P6X~#+aksHF6*pF(bEmf+pp)Tt2Oi0
zyKh)rg}e$~Kx8YiYlmri2yC9pUcJQ?^!@i>N#+5n)0L2q=3+ACR?{O$JH8it5r;Uw
z=m~~12=@7zZ@XrN<P2u2j+{Ql&-+wm_ZW5YI`if1EGD1ru&JicYN^pzO7&@_ei5Sv
z5w8tUN1h`T?&dY)l;c0m)#57ZqSw-D^82WADL+%vi@QQz(Y6{b1C~Pp32jBjQCM#U
zH<-~?YW<@QUnVuo+pGCD@_k+9H_>P`W^)WnorwwuL+Q(|ZA}mCcX90$8*B*+wv0x3
z`8yi3Z6L0XW_h)9_HYTc!%`ENFlb<HHV1DZ-5wPPalZvu0`tWAdHRn@tFo>L@WIH>
zmrc2WGi9UITnmkFY{(nN=5pO6g}BDkO|Nd<=q!HBHCTUM5#_iLtCw+XjyDk5dF4aq
zRLt$0v@l@W6Iuv1ZhPzlxg<(46~=mG6d-^HF=$!K0!$O9$yz^u%#a-;3*C_<q3@du
zlMa}1A9N8+K@f#)+5v=gg*6Rb)=r6IY%h>+@L6QNb35;9`*_|0N|`<F6Hbny;a1CZ
z?N<1L+re!_OUFtDVV1Fv&(*DIQ*v%E(vVGX;$dBN3SVnT(%90Hh73z7Adg!5DM4b?
z%M8q)-`*NZY>~LNgazMTs9+DZCyHMq>r+tHEwxgD75lM|+Qg!BtM8>jpj*1dvtr<a
z`NgR`m)oOPQyD)TI|@Fy7?dwBEA6|B$w>_$9)}zhnCp!s%~#3@SayBwQ(n{(UDV>I
ziv!Y{T&9LRXD%%vh>0_@x|MZo3mUe-j{!k<kW!1j=0(0wRUitnOtq+ym1hQ-Y7I{C
z2F=z-iqC~~f#lYEp$(}FVViw~Av&T94|@8D{@0%wCZ<uMQj_LYOv35itX(m#Ca1Yh
z|I7l|{P{8~H!pwo?0o;If%eP4XL)WSB8**qT#5}uRM`#dNni5|6FWBz<L(o-Z?#^+
zx8b;(uM@lb#BI<GZ*@!51F6$R7_tHDKw$#Nc|#ot198vuhGBalGdnwkwM5g}ryoxd
zY2axUD})<j3P!lgW?OIKEla%3YvJ-VAWi$PFpMnLLcpZ;VTdJi#(HNHmmUGI!BDLf
zsDJXgOOGtqbl6R9+Ij7iO?qEsVz^UT0lId=pjWR*+k`Cd!W9DYxRKYzC$q=-b`Dak
z$24Iwe1?ps1ap<&POfxWh4z%{i(Gcer%7%<H}wmiPa+o+Lm-mFPwNt3s^&JXKm9l8
zT`TKeI|@t@Tn+2&8cVY-zHC_ZH3H{y>Hzn7`*t=}Ic0}a4XR9c6)ty3zY9oZzqqkn
zyBa|+HgPZ<qBt9QL{!Y@TsF*9bpfwCH>%+r46WRDKJEN`?6VXHQ#NTIiz<sgw`H$(
zN1a_sZc#fxOsWa#bl`9&;baTg(8Dv!Gv4Y&yj>$;>y7nsDYe?L|6Y=l&e?r>&T!%L
zqR@?2l;qK2iO4L@QGtcoq6R`=-{GZ#kvTE7sQtCj@r6Tq)0jgy$lUeCtzecm&+o4V
zV@B@zbJ;EosJy4!L(L6)b@J@xTJ<pFo*A#S&ixhB0ZeI6b3TmhU3E0u@3V@&vPmol
zrq1d$^Nv(a;)%Tn#vco~SMP6_=eU0EHWU_|z>1G8RH@MhqKn4XW0w~{_4K8rydt5Z
zypqPk);sgTma>s@S?>3l4-VE4Z@0vf;y>~!00wT^oue@8Uq@w)VM&WhYQ81C;BYj#
zT1wbdU8fb&rRzZs&Hp-qw2&8ERkTo6pQy+@@~xlMU|D|ySC*%XF;)Z32Z;AyqK2<m
zG`-3<1gxHu3=^ah4=waTNpy09E#xX$T)G_G`Z3^VK~65bEA-?_!GK6CYXRNs7SG3Z
zq><zDP4b>i2y4>1dQ<1^);L6vS@p`4O?N$>3hiF5;VlohfgBJ*ddOB}Dl}BTEgNy?
z==QU}Qwg#^&4#U)b0>E<R%FEYPapm=31!ji{maIb&#ugRRq8wfTAvP)x9iiYw)3}1
zqy1R4Qfxa4qXh83x5~QqL(N!_&9IOS?O}o^$}jd68|Sh;AG7E19scIz)uJq;`G#)E
zZJ!z8W8UtVrdaP|=X4hrG5b`qLG8my5Z~bD%f_b5{A*8Vo2{<6IgdToYOZ3o)rDHV
zW|M5@rLMyJe5_I<s@G<2D{u00ChPL-iQ-^upz7J`vJ@fIIeYzT&S2M?smo){36#+b
z(@dVxoD~X!nVN1*MeYQI_nQevYT=I)D%{|nkm|RB1h<n!NlRURYHSznP7-3&99LU4
zh9SDRwO1TscL-WVK$O1LRI*MGYCnbC(h}JfkfvVTJR+uByx`pB54f;ZrdKyM2u0}A
z=v_V1oQaxrpUnuh3xV{~!D3V2+dzD@MY58@PB@HADyc?mHcNE<R2{1IJsHC-+!r`~
zNGj&tHcmHhmTk|72Qu3yi<%F7^<~r%7SFd#!7B&VM9&3VI}(!%G3OU{TB#YIZ&9lD
z5(lPY`ZMu0hbJz7^__XL<d8j)au+%GmWQ#eo&^20+hG!a+}0y}flTVODzGGV#UsD&
zH@sO-o=++)%R~X9H=hVfY+hU0Fd6Ncba8I$mNOyh<=EZGRX6>%wj#wrvOSNCq7o`0
zYq8(iq=?Us?CW^${9=zuy<359sU6xP-Aw4Oj_v0Xs6Sf$DL7HNNwN)lDEjNAp@9Jf
z<yDX`9(H5JYrF$jiN;_m1txKEHVEaiZ*cVQGg4jP-z%_x``k->PRT$SLsf(3A-7iZ
zkRos4si>H--_!;#o4D^T<SSQjU-NW73ovQ~Et?-3RQv8g!_qw5wl1FN29v!P`EGTk
zoxU}6lM&F9_%u#Gn+%9%(-^<67q&KJWQ+NDa`_6RTTwhd-4Vy)6M?dF?mu6omy5$0
zzzyJ-5kE$X1ZYEODAe&c{+HcyrVOu_TJ|*%alLU#&Z+CQ@v?roga=^z?*`$nC!4%`
zDu^4X|Kqe+AlA8PvmfdxEXJwq{flMY<4?W);0d>DdYNP}_gb)O_NmG#*jvFA(X)tV
z&~iiu^^|`s_bT+}yK$?j<6`j5VT%s0Uk<x~s&+tAQXUldWLE=bo)IfroZm|Yg5{Ea
zy}0>(?O8VW>SAsL#SXaeuLN@Jfl;GU&a^Qq36iqOTe`I)S1DZMw3M3FxRO}h!8xt)
zY_YRQH}_IOXll`~-<he^&QI4E932~}A^UjUM`SxkXwP14k<4q%8L_mo=gL-KN)ojo
z+_avhkv<iU7^Pp;cC$MduBv+?q+!*)6LfU2H59IY?2$AxCxwnoJcT*QbmLh>%HueB
z+uKw`bGU7P@@2)RXk(+qpcP)Jv~dX~Y9mK}Xq$3>0>wAE)PNe^yjsNW!VC4X7G$%2
z4s}+791iuGynnWF5wxM<u^z4bHEZQIR#+RanVQ2!JwRZ>+v-vN)P-?K$?4eo>P*kx
zFL*N(v;)Y1`^~#WH4&kekh&2}EasoT)oIH4;N+r`1RPk|2nsQ4ybXFu&PGRc{hlF*
zvetS(J4wiOb|zkQE@B}&`UP2YyB0*$YWpQPLU=QmW|C>dDq7mov|!ME5KC3}<Gm#!
z<lrYq?^g?7S!?6gcRnI7g932uIzZ7&tgX0#{#{wDX%@wF^OzfGbS9ihmgI1IPaEra
z=7E_#KA-7@JB1|MI8w(#gVU#|DX(rVly%vv)d>lPVxz)>ZN|a63`51Q#hqa2luhJS
zV&bP+I}+jp!fc^Cf2!g;6v0f*qe9%Fs3L=ty|nci^uEME2Z__5TU?8{9yStQsP5{G
z4nTi<eQ>`bT*j{vd3R)5W~#W`+q@;>0dR<R>nL8WA|{vwU@}(Up{jsY`~G+W(pVuf
zT~aT2V%;)2Nk<!;PQ4nm_eb3Z5H|0pv1b_*tOC9Ga;?T$`uP)Lf55XeR`@hBJle_C
z+uwKS_*^?={a0o9G0yEH8yp5Tk!Iggq#!BGf&UD)eEbYPjqPbsqGKm!o+{T+WHZR%
zR{Bh=6<+GJErsm-wWz#}Nj9Ue)LKUh-{I%P+4`?Dwyp>Ml6mvXWJATzkO`F)GnK0k
zzVBJ<+QqbUlvRc=E@Zfyw???CNy3|QN@ZN*1W94;aIk*ld#MOnC*qwe8<zr^!_ET1
zXJJrRa?dXqX>UXv_cjo$cn3Qp7S9!{8=sV%!g!TJOke0w0P=la{F!J@M%O{%xAiv^
zzJrd<*h1DBFO?0aKLuU7{@|`3YtVAsMy4M(u4?#%7_M-1HbCO;ztUU<Gky~mHQJv#
zfIGRl(-Ksh;a!Dfwl4}b-@n1}SvD9_Hq^%Z@|(k@#Mk3-lx!umHUG$`OGH;&*!Qt;
za+j<LPHgf4ZE{89)nLX-vuvUa91TzelDb}BI41C|n$!_JE1mcFsIHhfW#uCho`acU
z$rV*ymf*jZ=kbx(U=uy}mku(#nfLXyOLXvECX+aE79jz095a9(oFRf&U;7%T2b+xn
z0oG&}9c(s*2%gM<oZ)MTpNb@eUVhG%X`^HB17A;f0vKkRJVMF3VW$~*A?1RnYU(hg
za>rpcM(BHA2HHo2p*8(N6$Rv?%HxU1Ad(@Bn+vA91=-j}=03?$#j`tID2ec|E=W0T
zE%N=T|2d#Jw(b|Py<n{6nb<ygKGdZYX?p#_^ZVM-BfIa7;1>h6a*KK}{B6ZzcWLw!
zl@&+JSSo_8ywqZ-=bD5|S12Sso%vz9*;Oc9KjxEIuZcT}%Mtl&z7qW+&(da0di;XB
zrWS=&qOwiNZ1pAX#lJN;=+_Ybc|@ol_MaK6tGKgIJFxfpQl=5T}tT?!s%`sOO{
zi}8*n4~J48euhk#%2n?(+6ilYXM2<;g*anm!qFtGl#Fi0^yHa`8|cb=t=nH=ZYiL(
z#{z)Om@(vZu<WN#uPsP5H^14k*$HKrMP{H?5Ph`WPB!nKFBK_rLzO-g<|rw{aCovu
ztZX|pP=bVaeyv}nC?vEKLO|^HOu@FWv!ayvt34Z$-8<`jcE=lYB;q`hP0pKIROU!l
zk}dc43JnY^r?P&}WSD2SY**m+m3=DIGx2t?NJqo2nSxCV5bB;^z{t@xBU?d-pM1v;
z7a|PE?0L-yJxufk`vyLn&0f2!Y?<_>u?Wt}J2AQDO-8Y2>uHzur;}}<nMR$73aV_0
z?vB>Z=FbkN);5>VzVmU-qED2~yYtSXC-Bn{(Tn?*agbanv8zWIqOBwD^#)6tQda^v
zu0XfshyLa-t<w<)5?tMfyGV84Ij6I{zgR1dBk2U0)}~kcSC&{3me(S{n|XCepGaN$
z2kCB|s(hV6V*mgEC32-=+9aG-sx`Lw00e8IL~e##YX1lAr4efQNycG@btLQTxDcIY
z#mR2w)T;h6nhel1aX|TJ{2IR}OjsM;&;0}V)J+umlPV_CNWs<5yaU~4U=a~<1>CVN
zCaLHDaHhb|EfQ1QBUL9)Rh&w0zvM)!F4H{TBdwH`7M;ee#1x@S6v}wV#*yp%9a4yo
ztwcxUrja8<)NR^>jzN}^&E#&v#HG`>w4&)R`&@s$R*Vm=Hi9G7!PhI<gmj(1E(DV+
zly_~x*UjYnIv{_}qr17T%NL;nnJcpSdc7KYe>PI|*moP=vvm%Juje9!Jia*t&Sm5~
z?4AD*9D4myGphy>u`Mv5@YF|^Gdho7nr`yZb~WD|nNLOk9U?^W8aIqgzYC;!dlsn4
zljfD*)A~p~wyBR}W^0OQ<JdV{##uJ0F%&U;G&q2Yt9nDRsjvKpR5S}_D>JhAP@fYK
znO*q4biHm*OCxB3Zr}VSXPxt#t(TR%18pr%567hXA*@$l<YYYp$IL~QGkXf9uBC7B
z{PJsNs32YFQxVK}J@zm9e7UL}*!a+6Tx!P$Kdc_CPthp4yJuVyL~*erdsyzTBOJsI
zTP@DL5OQ20sk|2cUR^L1iHNx;!DPV~LEqb(m)0x_j7boM(<XyFv3;!ka|kVterv$J
zUlXj$szAq}oTfNKm@VEloJUSsqzZm$7fd5Xc{+}Jq!(8r%95zk@}|B5)B0Z~YVg}#
zTi?o0e}js5T4~*!u+hQdQ{U9e%)KT{ZC#_)LIx-+U_!283?Z2emoK`mwvqHvKw#}P
z-;3o}2X=i`J%@28m0yQ6+Z@y_o;)i0?Fk430hAIL9Q(vKq%J&&x5Z$b{WIHlQk61w
zYi7G&=4H2~w;EhSkCuF{PS$_*A!Yx85J2LYpIuT&7`KNgwNrPsYEu@$a1~#e|7SqV
z;w+t#MMuC9$c)&V_Nwb++6~^^jynQYwS89@@t5%rq}gPL7iXW(*Vc%3y9N%`O6mF=
z{SYG<Tfd@-^h1i(0(J1`E29g<BQWAaHzzHN(}-TS%bHuD`WLMprqsWQ+pe#1{hFn<
zw{&|@J@Czw5kIMl99BQPTLYu<i6c1S!g#5|VQHH&0MeLB?(4oW(n}24IkL4nPnz|c
z>wQEVk)`ECcM&rZ;mY5HK{{VKgMds}+mRg~aLxz09^zkLy<o9SZl4%(j%K*UQKVie
z)plrH^Am$%pE039`J;<@G#v%3WPD7iSu|$`^qB0`U6`XXQ-l>Vo=n9Bjo8f36A~jk
zU47k8lq`)JE4FO#PqtsA+K3{kT<v#fdKF*PeFU4Ewe}5=Qk8Ix&DlHcNwzJ&^YiHc
zw!w+N_@X3SsppxKq95}e7m^Pf5I>^0f8s}Q?=>EeHgWPdE@qkP_tY}lG<Q3ta&@@r
zI5;#Bbm7LofW^G}+|0C%U)qYZmp(-J<c0*}#Q3!W@fo7#_IuFdhRVxIkIH46lomY_
z^HOcsMmomI6-_x^^%{(Z`|~Qb`DKp*n$QBYvLZATk!?)2Z`6AEm20(=4ufyU-niaw
z`Ugw7tdm4H%=-?+uX&LCWxuz>m?;!thXO`u?M1=2ezWvjen7j4+<0Lqg0^ntmAETE
zZSh|-(jYQY-L>#Uk(&v!x6#lN0+1_n{X$#K2wO$4?mzo~o<J1%8C(?hn9`pUmRg`~
z#=7*N`K^?%Ncu-l&53+FLFOe5O1Zs$V_B|H5@JIQqp)HbUO3B~MvM1&W;qAP?B&mp
zXVvsuVL!}0{4(Nog@jy7;)+^%=|F14W^JvDepD>xtXRc!O(c2~FU-rTT0n>q8gb^O
zi_ljnCAxZMbWGr*@j>qd)jQ}GB$sAzGoHo)n*YgS53d7KyoIWkJ-JjK^=T)}Zi>9z
z0e~8)?D5jp+w&ZW#OPS^(XOe*IJv!IggdsB%cZKn!y{}wg-6Q#x|CQ%{}Y!iCs_Rc
z2mwZXm-uv5n3F0qi3o}u@3!_O!rn(h3<qDT8XZ4DR;rpI=0wI@__3Vv^=y=IKPl@>
zkj$#3$cS-s3zdf$SSztF&8sxHZwlApD&+f$^l0EYr)kHw_~0Teo;)6;yYTx{aRNIV
z+s4Dm)KVfzTPUJTmo9G}lar_QR_NTba$y;5cw^+v)yS`izx&#UhMXhAT2mSSIo|eL
zt)xMm8_^sO=LNGY+QkQ!<AFPWA&%q#(p=%)-98a4wcn0C5$V{<I*fHD;v<hhj2_LA
z!QfO`*}V+V#kzdu&x{|ezR#M|B~0ixaDw$=kYgK1wS|gIVq!328RZ!1Pd@o2mg#o*
z^byq`ZXfj8Fn>S!$aJKeWfXfD#;Kzx@l3eUD>JU5hz{LX1Z=bLeg+LGJ)S||!3cSw
zHrpi0@=SD#>k5F2fGJ9}GpJc@ITwbh)?n5|{m4Id%NIpR?a=h(!Qu7+*>~T(Vr)@>
zbF-FaX}GB6VZ4PMArgN<h>QQ}<L=S6XB%Ysg5HY%GcEQhDneZ+o>n@bz?UXl<*bg|
z`0_&F@x$n*mqrgj*RZREWJP<}FlLoNaLk&bJN8h0bj%u`VlZ=}*pok>jnug;iCZjA
zE6v_0R;c!!!RA_Pwaqg&p^ca|#CBY_QV#jiH&M?^w9@B`8GSeEqE6(9bHa3^75}(K
zL0Mv=g&}!uhi;~aLb~ZAg0TdHbzbA<$4{c~OFj^VoMBv3q|LB+>#hX4#p9tr<PF{9
zm{)UkVYVofvc>Cn$n|u?_-6LH;ehh@{#S%4o8irB2?n23y_3Y9gZ3#$v%G^>_^&|X
zVYUCR`3EE{KbHjk!`gCjI^XXiT8zs#36lKK$KR2GG@wN_Sm-WHCR<VJtjY_%{C-St
zSk9ww&GbfX5e*OX#RSD2W205bbf3#b|DjQJ=$k@D7|xLgRrPk1|HFAFme_10Agui<
z)-56ZO-4XOgna3Uy_R*>%ekx`^oL7aZhM_@Y6Ql;-rQ)DPb>6nvANLJ9zLxMBQvMr
zSM=uZ)fA`6(vDcWdVmu^O%#LJF570P3KC3cm7N?QPv8RN;HO@|#`!-Vv3R(IMdG3|
z!FF_Dp*E28IU@2}vGz2B1P8nmz7M@eQmm76%Dxq)p9ooo-!QX}+8i6(nj<8!dH2c@
z<8HCABdPm~KIXmAR4hs~g1|D2M0pB4+?<7nYN_W$Zp1kxJdlydt^5>tRlxKMcVL{P
z{I<sLjO?l0{u7YV8&hwXb~<XG&w_8z&fhF}l^}||$2nt~Q?2!!)}Ju`ZzTVFMgFdf
z1=34Ipt4xzX9wp`gk04eshZiu;CyX~oO}e7emSUC4mxk77Uzo4`YyzrZcfefbdNBB
zFv5G%xhRC}?;P0ZQccC5=_|NXyP|7X#_kHOrP%%%nSS)gtEPqUTg}KAbwN6;JvBzO
zLc-cr@6o1JY%VVe&<80O(g1YUz4j>5kLjGIK`3Fp)?kfh-ep*1SkuBGz<wG`s>dLk
z8#R)VIY0>hAkrP~V$WCgkmD6105V-0tIsdtr)Tmq4X>D5fof4R@cRF+>k|rxFnD#j
zR;ob>PK_&wB9d|hehx83RBYw>%u;RG7|@t(kE!Y8Wm6<=y88)NPIt_Sf#^ThC%WjR
z?5=))$tO4Unp0Q4xN-JD9{>j;26ds)`Kz?Tn^{(=X2DfRPUP?lA!pjpvMR_pj()i}
z;)}{<vd?bstB8Bfejc}n)6O^l;RzLFHk~W#BeS8h7{@HTENcdtof4#EFo^>@{GNF`
zMnWK{jRv>wYtL4MxF$lqCTLTa2nOj>;N@w0{tDRs|BDF9dYkD+k3snH&%D_t>Ors@
z1D`}iNmG|I)TrmT`tKn*QBB_5?l(WZn%(586X2qs!zNRL2uD2Cb4@1tlhgkOM<;ec
zCGMM${<WAb?s}#acor>$2X~eq7QITYzK!9Foeis--X}+RRVHVL!v|wW=twwUG-{sQ
zpZvhnNIod-IcC$Ekw29ic>?g^*~N?pO9XUdc04WXiIo3)dEUdX0AIxt;b>Dg^Cx_y
zu&@_c7-U?Qi4@vE<geqH2QI>uDZvXb!PhFQK)=#Y4T^XlW{v6P@9t$<0l<sPSAaHV
zGV<N4UVMi67nv^P_ne8F3AzRAw70>}*xzbH>>m(^UgVCE6oZdNg|!~^Sit{M9ak=-
zYu$@mh}K-UTh{LRA(`r<)Q1lwm&rJ5E^K2=TXE12tBx1fTVI^L8Tr8ZK(OE57Le&&
zAX4ET`v`hX|6VK95nf?WD3~E*NA^_l9kdq)2yEU3U{VQU$7+2oXKMA-ae9NuDy9d<
zWM=<h7c3rj(kB5@<zN`)2fufT(M<agqVx%<gPz(Uki5?iNtjows8G`E8TAJm%7D<N
zf1jEEU33tIEF8R<=~k}qEWig^6b5hRKE@u}H^Ekx&Z*(JTe-u=-V0_v&oE=cA3RrP
z;a9f4w-cQ{0tFr>Y#nZ~jA<%#u{0*@o5RIb_)CS2SLC@Gg{Nq_!-#K}zA&~N25kMk
zJChF~*8MT3)~DEn4Z#EXh4o}AW7Pg$L;@{9z9@$s?O{u@`mh%-PhP2QE<FWLd#QPU
zVDgR1LGB{K^-K6zA?52j4mpzAdR-mu)0@#q{I=?HkWyhTYYbV#QB;f92rP&kE-kFx
za>6=4Gl<|pArr#dS9NX}H!d*{ngQv>ly|P+IL4a8&t=NK;k3mZxboa)>0I}~=|G)e
z<BI2KcL-av`Kdn3h9NgoaM>Q{Akf8#yY?BW2;ET#y8JSw{kw1GyY{MLss!6f6wh6o
z4%yRMA>it>`Gcc+>b@(1R{i=J4ud%pOvlFX{hxX%Ne1~lt7^wzIjH;A)vIQm`gWEr
zbeq1F3o{xdl#t~%_>M1B%1wuDOEAFrXq=ry@Z(KQc}XQ>TjNAIRlL6+Ps_!n>^ybM
z11PW~pZgju8z>8;rQpq$r`$)qojxbdTeT0TO4%F!YrOb>Dpuf|NF4fZqrC!TB5#?+
zFf?)TyG`+~ybK*W%DUHK%g1Q__}JD96rneC({*ks!JGMIVV7@wa&Y$o00%ipmC~?N
zrn&7?uE9`W+P;C)+n%R5(DOtzU%jX!ar_-LaPr`B4@_Vo%ZFh8mAFpJfz6fR9aj6t
zl+;?`sbc=dZ!W)0FCA&*Fq|JGE^sJ0*zX1}pDpZ;h|t~H^dC$z{^n>+vC!Ea)&@b!
zP#Sh6>@79TmD3&^l{AATm^C+nn=CLLCXq&7BGGYfJD6vk(25_I`VAMVQhfo%6S?wO
z{306V=4nraF{3;#1~$NMXZ)75a)vYqaku|bsPm-|#gi8&p5aI_KSUA-9>?re){npS
zo7z8{@<6ZGj^P-fiE^@Wmnp1U5?mKx@11l*cO;SK=ygs})wC^L&%&#Zwo&7)Zug&0
z=LaCw)9*7YO&>Ln*KKb0a+&ns9uvztn$A0t47oX%-F^7^;z6AW$V*3&0NS9yK>z8(
z5K^E6@`vlkzd~B!n?W%V6%b>av!IiCoUX1B671s{9lVJ{!*5U5^iy2?bVmbk;;!x)
z_RSK&%BQAEi1&O8-i#`tFa6_<YN9$!m@2Jz;?8_SGecq?v_r|7!GGhWw_U!LMrJln
zJU*P#+!)hb4vT3dQW)8Nb|D2amBn&PSiK+D-W-BD%9?^D!-?E^f-G+e+McnB-F8xj
zBMj<qeTbQ1l+}dq-+sE9X~j5}L4D>vyXKAwU)+2x<>GYom!hlGkoM?olq7O&jj`8B
ziSNPLkTrACGp5RSi6fe8h8deu?^Nv-MGTm_=T(!Ur9IsUT25ZrUl~0V>lGgmQnXeE
z7X+%dVoD<?JF7U}*sM<%rGuqh9Ap$(+*V(w+ed{K%b4J**M#lll|&(otQ!c#s&ci;
z_-Q|C9MEfFFi)=^b$EgZdi}T4Cmd7(Tw(8xIe%r4{j`av5Bz^@fk0-U(>Xhm_4OAQ
z2Qmk^<_L`q2iw|9lG;@!bom(9Izo*S-2K*mZ=sqJ?CYZ>Uctu>E)$`D1E($bM2CFv
z0*TNJI3twn(-Wy=CgbCsi^3a!w~6%N(JSuSC6L@}=4uUKEp_si_=msbmoU@{r;_<9
z1A6|zo}hWGq=xUN4E-6bY{Y)!o$zLQ>6_Yh{*RY8jBVS4`R|ViJ8o398NRC#P(iZK
z<o8g-0QRi%Chxtji_Ew9$;)iNWxW!Tusv3Fg?-hjW;>13wy_;)3_eKqZyjN}QjrB;
zd*t+wJEnE}q|qhX&m|PhL2#=Wz*OgZ1n)^V#_F%EUAJ#m6FiOe-a~BWC^+Kx^2gKt
zUP)bCSZJ#82a2}4R68BPwq7<49TUR%^sc7Sex5Q7pqv@|h!hMp#Xut~&bGIk3vbDf
z{FOL|Cji_42~Ok6q7{%trhP`@S5rO1z%Y_8XXhF+MjU*K{G;XH>F3|y{{*V_@ebR7
z44@53p97Y`_a?940%R(Y3wFD!C{Y1m@!O1GH<QXolwgBv-8R=?DM~_YhLUHOz)7tT
z02Ed_d9C>eT4i~Y>|gcg7ieI6<>%5JB7@M+Ocw`o=Xq*TT56TaW)+a30Z!f080SB=
z4X^+OW5D6Ol+puz4~;gRAx6&Rb0PFG^J#wgaFM_OZ-u8T#6yqPF|Y52y@d16%gnwe
zp`(;5izX46^J2ZdKnj$AH`W!wr+48z2r`xAjGgp4zTUkY`Y1bztFy?!?6~egq--(o
zi2J$Ar<;fsP1$`Qb}w(A?`O_hWC`RN8AHzTVAGKIWnWXtr}c&@lg}#+J&(TyxF%Yo
zHMAAn6DOR8-#QbS8RtZ=NxZCD-H9_bi#p6}e|+l=PvuLuoupS^*>$HqPn?W?A9p&R
zIH@uK@@X0sF9b^5Xvk>7<D<kR90ts^#JH2GE9LZy0|tEG;AwY7q2sBWH>LWDO{roX
z80O&5F0s+MiI~HZ7nE!)9-R>F)1N|TG`fx;A6PHPDKI~hWi<A<7}8J>Z~C@ebzQF^
z4sm!iJb>MC`VR)_AyL#T$?-MCX6`pMd~aJ?J%4X>xATIC6?1uDyeFXp`pv+#w$$uS
zw~==*{CohIp#mFp^c2qKPywz`ZU4fm^rGO<M;D^xnS8y1<e^yqv6c9OpvO0H3NnHq
z<zGyO;&cgMQ-PSwQ3x869Y~qa=!5<sf&Y%7hwlR!;Y0L3ol$@^KhxVBHgpEt41p&X
zCUi8JlxHnmjl@NcD8V}w^sIl*kwRRvkv-Pd`a7?qdf}_rRBOYhP95-O4ll$MVn8N<
z%yTG3a$kK>O5&$$ywlEUlvO-gs0<l>YQ|rZ!voM|o-+UHI#bW`CgkDb1B#bE$pN>&
zc9mx!<$VwDSYO4li*)yPV{W#jHyLA!x;k>dQxGe6A8k`gZ+&XU^>-v6tUP@${afyw
zVn$dsRf`7RQovM(9AzLUkBHBDL$tDl`-!3I)b(%)@$76m+B?TRbg9yfo0D&om^$2E
z+{!+F?~(~&&F;i6G%eaCIN}nwPfn3&&e|nGM#eaX?bh)rCt%TX9Mvh^zMJJclbzs!
z*r*h??=-BAqZvdQz5~3DcNeEa9+5^fhhi$U*g_lWPi^UaUih3D=>Vyj)k9;be0S%!
zK?_HLW_fYw1ZLxx6LqJ2C~v-zCqfklbi&f$3@7P4)Khu*yL~NE0ZCT7gza4GnZh`r
z+j)7%T^?}h!)uYvQcN=^wvg?+V{U02cbWcF<YJX(jbJ>Iz&c9AsAq5JN{Hw->Bd0<
z^Efd7Y-)EnO~7J;r0<Nmv8Xnkm~Ad$IB^6GEv~IYpz7t|aGQOk^@L;Qth4yW*1(W0
zX+2pLEJ&2Q`A6)z&2%IGOaL3{$TNxJFkZOI-lzp$dkWauE91%gtDpXFPo~PtO&tW0
zDeBOfZxCI)4bOUYaq#?1ymlLY64i>pBGn391^F^u#oI<(OfDeb45MC)pWry8-1X}w
zGO6uQ`s&j!eLB&Iu>YOH^Eip*o<3r^BVYAh;$Dc~)wN;Blkv^JyKFDe^;y2JzWtgR
zaFcS{gH-z9OJu{C8Pbo8Y}!c$IQm_xCtQe7$v08Om^5<_&@*}0s6I+PKp&1Veyr@*
zWS_g8B5J=NE78KMrSr^kIZ@GBQCx{EgRL<HyNxbI(475gr+;yLGv37>YlqH0HU8~F
z9bpl<Ei?JniekRk2Kw<%grs~gYMfK5x_=IRfkPG8y|cZG%62jw6}^)_2Q6%N)6#vI
zyRQUQQ>F_HDmst~lI1M{I_KF#V}jz6f|SxK7O{{jrB<@v@AV1d8t-qx#hqr7J$Vy{
z>Xf9H*LAoNw5?ArJGib>SXTA1_|i)w(VBUHO`P<<XzBm%T7U(;A>AY&+M1Ca2)=Bw
zp$6N_#Le$#kuTG5{^ZD5vq*?V0jA9iCL$5G=;Y0r>Wp?J>@elm{Un9RC!9m_r^?2P
z)jdSq3e%ty7Pnc7@`GH(!#i2PH~=^#4_U(`7LL^rTGkTqvS`KU6lsKhhlFT%8pI)i
z5!ZTF7lj{Ok_hhaAG$H}K_-M+KdzLw8$B3Jnx0uIPe8LcWglBus4t4cDHPCcC-?d0
z^i6m@9d~vF9W^U*=r*#GN5*@?O(97zNkrts%kZ71jGfj<+02=$V(#^d8E=8|WoKbU
zFl-Yusvh~t05u3>+PtcG69w%FNq#tO&52kR&gK8Uj{m+Ue>fA-ot2AU0rB9Pp$wbn
zzoGLsWAw_lmbgOr;$B%iKiweGGI$&JHdMPSB<_;gmU1RL0Y|iIt~rIJ#gZ1|o2|Vn
z<SjT?LxApDQ?+yxTc@bwoXTbd2(sk#(+9~*zO3PK3~H8|BiwYJ$yjMj3D;j(*0k<5
zoOaB^g>yh5GFXOWHK0=T%n9IK1X>Ck<xbEYouc%b%{3>>bPxy9ObAVy+Mu78o?U!O
z%vG&s^R13FjdY+9zPj5F&93Nl&^L*Dl8Yu=LrSu<!7S~~kQn!IEbQl6bVgW|`G9o2
zMAK+O;Kik9Bo)k&FFF18Bz&N$GmL6LEt+X_KwUMi&fr&O$F4npugi@MxrMXOR`#PN
z-^$_G9bewhfM!7zM$pQG@JPRCd<`S?)Czl%0M8R)P;H4Sp)jPl|3J$Fe<1;l>eIt`
zf`8%m_pjbC_<^i|&tp`c-HhxEo_|6KzCNy5-?A2}*T<_?#v>TwqzS#b_nq=OBM3MT
zgbN9*I}pa~#X(hCU+Xz(IhL@k#8I2_l8}#E0JaD8fw&jbJF5yXFQ$_f=RP5Esy_AG
zD=09l{i^|*`~8Ln{!FC>nBkQ;n~g^+{#BMss`c_(M96Bn6S=pKgYCU*;NhVoR{~4I
zWF$DJ7sRU{D};=sdy|r5kcZtuE>=C$h5EWo9L@V}f8M5#nmE>Sx+Y|v^grh$du^}R
z7xj_lPA6i6@M`jjRJC>JomoH}dg^k1fzViqao|MjKKcpa0!kaa`~KGyx2W)?V{WkC
zl@8NwK-rG2sISp5QTRox#sbc!QiH#3XO$MY()aK`E)s7~qY7X%{`2|%{mtjkqDPA_
z+M5Y}>q)lZYj*@M``oM}U=JugXE@K;AD7h^Kdm$d*Hr45ga!j-;^d#EMU`$)RL6?*
zL`w!Yu|JvkQF=UVSc(2SrEzunx=of;VwqODPtCoOJ6oq#7eARY^#Qjew<|)j*z;HV
zFnnS2R9^UrsBGD%KiK(0g6FM9sR2fgw$~3B`;_xF?!d-Q(5|c98AcA#s`(lTW%{u|
zk#<g~{h$6n#@+*}sjPb+RumCYAyiQi4ZWlEj?%mI4kEn<q<2y2&47UP7J5^92SGrj
zhbA=wQUcOLC-B|iI5Y46H#2@~WwAoS;@)%4KF@ykv-dt%nn6wbW%<2Ze3%6F|9#!V
zNFP2YnwV~&{?j@Jd+!u7fQdc>Ar5advWbot$2SHq?|FbT<37)6Ht`w#xf#!qDz{qg
z4?b!{8`P?NBDJpN<?c3Kh1G0Id>Ad6eJ7Y{*Uq_mwbXE~2AD{3e~hLKJb2b>w5B4&
z6j$k|S_wD|K(XP$$~w8hHPi*qR?||h7x8hfALbo98FI9zkV6pB=8C7kN+h){m4{@o
zD5U?F{JMgX#RF$98u^MzIB#*Oc)w}ZqXXAvDoLQsn8BXIVZn(&h2Q5e2~_z#@R1fh
z_^7cRoH@A|676r%Y%L*?#s$XsFw{kV`)S38S>wuKx9xJ3AP#j2kIk4G8=B>GTPNP<
zJU}mJp>OnZ9_|!;0^)&~UyD|E*}6wt??><#(k%sO@}k~q=B4{tA$y17A|^ZX(4T#+
zm=*MQdiP?koJyH1b-nlmt6w|K-22ktiWZ;P+8gTu(JHITZyPpP^vM@l4PTa*;z#WL
z!_r|k_7egIO5&_rFJ9de2WO5%1qb^suz?2e0D&_E8#IWPm(+K(f@M)NhK1)*mu0OF
zM@&`($It2Cmn#m$!e|`#-j6pEnM$R$?L^in_@fI^3|gNSDy?5KGCr9xDq$O;*kj7P
zv=~rlG3vm6U3B6AYN26Wm6)}i{28OiwvgX@qR`M%N|#t+*}#Y^p*QS0rtysr6Ief&
zi1(7rj{9q$V>*e$pJ*+Ur&_|2Qftlm2Ol-&FLzQn#6jv&yksXs*H2T)S9-Y<<;ySH
z9`3#o6~-b5@=CQKN|O(>Jh&m3eJOT4iO1&(qQ|{2VW*IF@H!>T<6QqIn+7K4-lGeg
z>9lW9#q*csBr3LCK!a4Qgdh#cINu^7>{P`(5Kx3f2pTk5l^_mil>K%G_&@0zSGiuZ
zR<-3;YZ>v6#IB;YPnoQ?l-~tWsB}25mQ<Ot2iD3~t%pgpt>OHL6*+HP0!ot@v0})S
zu4dCU?gD}}%_iC3pyO3+%Yj4bAy&UcufJJg7!Qr~xOsIV?_piP?{T;foVI%wc?XNK
zAjI{uJ@dd+&u$lwd~Xd$lRvuTD?aQ_U$k(*uNt-gXZ4Tb0lz6Ie-acEyj*&h0EAA%
zk&uXT2MvY+<#-#H0qv3$c<kU4T5z3ANU;4oMsS^4Sg;&52pHwJ>x5q`p35&^9lw<K
z-Tw+8i^h>{?Mf7{e;B12cJ%W|a^0lvFKA8K5pCk=4`IRV%w%k=!Eh*@*S&b5Z|5?0
zgvGn7ra)H=Ce!fq)l#eU^j)#j2A(%{Mx*R>D^g_Vx^OKgf5G&u=t|RrD9ef_$AFUW
z&yAkRd6(T4%9Z-I3{MKxqGM=jR#4RU@eQ6xRj$l+z5e68z2WbW`}4iHKPKIP)L^3r
z4N|a%4*9O!yb08MKY7vO>!oeM!Vcm!soxmZf3)s-^W?KGP_o(WRygErWYNtRYqoIZ
zCFOhe$s2L;o7`TvU48h+v!03Cj&F`{i+8ekStwsflplS0KGA&?{<?zq-&)!?CsIXH
zd5qRr<*q*0$?av9_*?PL6_EeVaB1M#qDKzr+0@)_vCyl^b=^^q9tHh1dz{8&?o86-
zyi<|>A|pwv`;pSGc0v9(tse@iSAD>ViHHlpO}LFK)IB1r2|TXv|5?>Petm)a6t_>r
z_fEr{q(l&S*Kyt=y;kd0u!UTx^Oc(~=pKEepnEU715cVd3-4~A-(=y1)ji3Qr@(oR
z1SN_OUI|~rTWvF89;)Z0n`+qPN%jAreZ!6V%9G|HuW(F1%SWlF4{_2dA-H{*b`)~B
zs2|Q9{O~_-=IJsa11Gn$|7^=cH|d?u#3BA48E3ZWMO6_|Rl8FMPwuzLH2ztf>DfOz
zIn!%@T(@N8{`U74?!Sj01Xx$k*&U)3D4+I=4z|oD0&Tr?FFLKZ18PV&ut0<1`QXfA
z!{IZuI6hF31N!>i9jTZ$lXY~crd}(@E;>nDO9j4}7|$czEP!5)J&og-gvDb{!2)s@
zT>ebLa1swgIqUJOMhq|ZgV&!tEc~vNx|%haikwOR^)NNyewO{P=25wAjiKzrW)?+#
z8!EYW#fW3O%*MUrQ^CW4&uI_+Z;nU^-X3ye&%(VbCg1t$Z+`ag<KdG7e5JGss<)#k
z0jPhyMGn=u2d?`8D3zjoAQd%&-QEXLV_!Vk!AB=pAe;_GB9PBeNN`C+`0`^~0YiC)
zOa2+s>BfhoD_4>8C0c5B2}+nD9#7*fCbV2E1qzNoKaj8bexyR$Q9V3Npm88SY{pNo
zieXdB(pZ&YW%7<WDB~nM)fBdD0{g!LJs|lnXPaHh)9|?V@BpPuX+c-f=(#(G6HnxF
z=u@I`1AGC^MQr%e>EqtK!>8+(XjJRB_y2c`0iP^U{FDm!bg01-C_oT5X9Z7CkPkxG
z2|y{L8WPUkETF;XfWyoHC<0Azu<pxXLf1Edtauer4xPZ6$E0DwGk`;dmrFPv9()dN
zss!4Hr*^=ZTXLUnV1zNtr0MKBA(irRNSy|uOaK$Ur2jIZ50q`BYx{Kj2XFsCOfgcL
zY93P1P|4Fsaehp&Lly3@Qrc5F#UqtF{1+5OqJ9N306*hUzZ#zHv8ctZ5^{Tu`S41n
z_i9d6fw*_u;N4w=W5pPw(337gzFETL*R*`faf~ZJPs{lWwSMvrMXYRbr>_>i!(_}L
zs1beWrIu6cS|Ay41Du)nwbM$sUn-Jti?SpIoLTi5Zrynbf`ZV4R6wH0JtWvt0#<Vu
zlRtfi)qtXtl!x>M`*hkH80PEeFHv1q_(1DoD|=?^7J#nfCZOcH!ECgGe91N(^WfCn
z@6MRX(4`E7it;9q*X4lWDpUzwN;JN_bD-AsJkcT|qsyiCbN*or-rxO6s@MaK*R!cW
z{T9=djr$d!qbuk&pDtsdwo%fC2NW?vm92aw6VBmptqy=}0IqXT+v1yrRokPMFFr;J
zql~KAHx|kRY<4fr=Q!2Mjs7bF-J^R!KJbjx!n810qTzB5XsgHkj-*6X5fP|2ff;<8
z2#7Uq-X%!k+x+or7o158l<aocfHN7wTFRYU7%uvoT(!LpNtkDAlvT5>6)_L;otdKY
zl}J5~0SX@qS;rHMji0|OdzCk(JnF*qhZjYW`Rh8pq~(h@!R<Tsv~$q^W!=EB+)s+l
zhbSb~F`I(;9WHB@fOscbqQ2EH53+o^wp#0sBI@&Aq+bpRl+O7K6(y4n7Wv0@e$g^c
z3k<7|Z6ixMsaR5?wjvB@<M06CL}EkrWr#s*Y=m1qX5=9D*?b8{-BJk&GA7VRU}cs3
z#k`iCO@j>~Q|1D;RTuk~^coYbyPdyR9-5({h}_iFT?qr{@4Eq!q8-!fUN*Gx%a5}+
zF(QcA83+(TaCE-ZSXx0BsP-8+c}*BUp~u(vChWzDeMHLZ^|xN8u$}94zdY%6U91fC
zSl!H>lhS-6e2Gabel#|%U9&gU_woA9z}m&T;#}d@)5P&bIQ?F@&QD^-OJ-IDd9<X6
zG*hS|cc0$*7oYOY9YczR1S~!j0cn6SD^M=9?-`Cm9;e(>2m#1vIwUxh8idXuNjb?t
zTnA_7HU?YjWd$!s>w*S@DuVMgLzdqbzmkx!0X{D$b9^C2lX<5ic=<LfxFA?!sR-S}
zm3UKpSfzJ$3OE;lNP&c;yxCl-GPa_*c<2X@;#5?d{Z2#LwnAv<6+4Q_>IbKEbsChb
zA9>Dc|8=Z3+l2edT#k%>O;<80hk?HAxbpD%yyKQLzeL1M%E)CoBWbhR$FRR3i{tXu
z@QkFjwMa{dHuvfzg{7;6otmEYoWF&4hYO;gmu&@7a-4g^+a&WW4V4X?7QhGy^+an|
z$6MuNV!sSaxQUcLZOO^95W=j|KfNg80lE<xh7)jafcJ9Kdu<8S^hjn+ujqSl>LIP_
zkAB1#Y;RUw7VyyuA~d)T++9k!<kuS<JarAkJ`G}IQ=a_N`)S{1KO}f&iiPQk7h^~s
z)5FXl&5*eZHCSHPEwx@vKBi2q{~V5|<FlZym6JuNmUm)%yq93H+|?p|1E>ZoRcpF|
zAJ3)V6qm=Pke*0;n2jle2hLf&@6gpFt1Du}f1#I3rOQ`1#|T%N1t%Q@sQTx^+t1Oo
z4BSr-Q-IRm-dG<o8j#q?I}`WYUiiL=U!!GiSCa~PQYNam%`|q$Ui4`xs>^c|MZi~g
zIXAlSw(LClfWD&4(fOG!|0E<bD5TfCzhemSh^Bk*;h2z#esV93g%pJDeW*g=?h5or
zE#m<lBufVDp?YU-fUo!@r1C>~(i*U;C~rApoYaWm@MSe8VE?sLtuTR9<V_{qrdYwA
zK$%wLLkIMY2t5UyDA8V1WbgA7P0u}a18VWQjx}v>$Ll2CnJVsLVxWN#$T0F<w%6jE
z;z*iFqhj}It7Ja)%ewjcdhX9kqw^cTWqnyH5&vjaXz=Q>zuC~CR2J8y<k`eg)asDj
zY(Ho@J;mJ1g)Lqoi+tcQ95`m_9}VG!m@&X%n0v4d3HVsC^j-xo?~{fqHRkIJ8XfAX
zO!Wy%Nua(+fd<DkgM)d50LM>-$zOi71`T?$hoTRGBR}$kEywIZgL$8WQ#}DYO3?sk
zLW%*#?w-Y9xU%!aRKl-H{Vn3CrFSC(os(3*-&bZ(!c23lQ<f;o#63u&ln=*R=o0QL
z%AIb?x_yJa@o_}hOdY*P5m--O<6)8|*L0rCABopn4wrLkWebK06jkTNL*!gbZP@(e
zyfQo2ItCV_|A(wUxz8cu?b`=om8DuU-q|rL-BL})nZgsY;9<ek03}u%WXEn_-+mZ#
z;P_=||9=jt``d20u87qRIL=$#SCh%H41i{#lhV-Oy$zz2qYq70`o#V2nt-YlA0O;)
z)h^Sucjg_!aPxZi$id2U6*jWmV;KolFC^sInS}`BT*62N!`@O=4ig9`nH99qKtWAF
zKG^?~VGn<}Sg|&nS53g(v0Ji0y^D};p@|{8Z`^rfNO_gMM%0R@RljzHNm2b7lSZt3
zA-3o_q6lFDm2PWHk3E}HwQC;$Cyq?N{XqEU%3rk~{uxr~twxz_^N*rN$Lt$JAVPBG
z(|dZWXJ$9zC+t`DtSSRbn#H*_)&QQyo)xlwGXFMrBjjI5;jKgfT6wsP+`^BpgAU}G
z!AFBY@yP4jpuzaKV5QUaTG!`ZD+%DtF1fcmbVFc5&@`tOaeS$-w!|=qaj$t<@<$ZQ
z9zqZhR)(rXj3p%6z+pD%B{`L3syw|Lv#efI*}TlTH4lXyJev_hLr11Io_+vH_Pf)s
z@UwZ`DWawK=$g^`;aOMab2Ww*Tjh}^%Z@Mctw|+q1VF&|OY#Y1@!y$RTIOXAdCvKC
z=@s7`0yn~2I8#GZ(#F6SInaAfQZ{egz?4$SKWDfJbeUiy2C@NZF7}U9(4YV6uQ%x&
z0AnOs#A3giOzcnP1sa5(F^9ULm(B<r?<EqppU&7fZWl0=_}Dcv^rcy@P9QUDG}&5y
z^xg|avwNu|_uCm1O4jOo>dJyM_l=lBR~tv~h3;IkW1fE~;lDhRS)4J?my(o`Gx6Lf
z`I<$3UT;&rQMJUH@6w9VOpK7_2e$GB?+ov^O>Ev@Y0giX0gg{qy2m^8z2{Kt!1#V+
zbyQ&d<0K{azaPkU-4$!$2+Yk-pmC(rTfPK~a*gtE=m^c0z^DyFg*OtS0sT6%9!00v
z(qkdMIQ_yOqjdbA7m60cm1z|#xW8YepaBgotrLM<q@+-Y@ZjZn-A^P+@aJoHMQB3Z
z?rJfH?$A%%E|lofoqWZxw_#7xeh6?#4GB!$$!$H_P#Zf4sQu8tYBvtCE=V%yAJ`@^
z4-_zHh)$7_XEIiG#2=U*j_*mxLsn2(k<MCUGko!Rn3&Gi^XJK+En@TzL5JzZ4#-At
zmj$^9fnR3<jA}Cbw+Xs%x+I}I|9EmXCY%8tSpGD*zVl8u;!-&;%|cXl+lJNG%2#<h
z!lupfMtqldfXb=y3^<Nq4qZtLJPGLyoX_`{^ZlDHY6Vz+oYWOxf@N%fQEfm>J2VI3
zFfoD#%SbW}x!$On84=Lp^{ZvVM0;aMU&P@#bgxyHO1|v|rSLw&<e*}yUbw!O9G-0K
z_iel~@l4SB$T7Dx=~;J8&F9URIZTZ+<m?}&hu1>(j;TnB3*|)<SsFc7M^Vqu3xQi|
zAB&9xWs}=IiR)maf|$m>o?h*QkJ~Eyp;nwg5BXn4`ZSM#VPqcVE~&n-VYtuS+4CY*
z^t5rkdGzF1&d?JM)XE$Nh%+D_DR!n^WCgtKyaUDKJP!3=6!d@HARi-EHoeB3*yH3D
zpk-3GLS3R_CM?*MJ0y79qD5f*rtLTrre`E(`p)jqRc{0kCUk`yD9wKkK6MEQju#7m
zTeTm8HlUH4;2_+ck&0%p0{RDzccOTU9uvp!yV{t>GL%fu(1q@xD`TrZEtiiZO&!7+
zQ_;l(4u#IL2idgIKzm#MS5eJp)$pW1OLxl;UQ{dtIKm3}X}$h$)Fl?~cvz9N3*W?E
z`FT>r(k*}a6Fxz?uBAyqUvK{=(I%9cz~_abhbAZH((2Db4>{jFrI)?8cJ^JC=he!e
ztBjUO|ExR`FPI6n{|cWiuc1B-77=1ye-h8n2Rz@|&-~PPTB~LT<cBw1Jhy#P{L#~w
zL*Z<=vF>di%fOeD2lBqTR`@BQ%XajaKVAA0cV(zKwEM~69eC$3joDHyT||6Z{bVKZ
z%owVTCx}a&>AORMPqhd@y@>#G$RPsR;bjit3w7TJTT*=X@hG&V4=}O_<&Zhev+w;9
z5>)WauYLu3BXV!2jtN3MA;pj07KzG*Jo9sjW$2?40HCCC+S(4(UMZ~+GIt_2Zq1vb
z{v*T}skgg@b+ppT=cc{ySK5uT!83$X5vUfaDdK7MEVOWHJ(ClcqpiW{<=-f1IC@7d
zaXAXLHB0KepI->ln=lBu?`g31J^-sE#;fJhUpwPf3~(UA@@!z<^i<7yoD0%~Wk}TU
zP=T@^RlFah8EXi+5UX7_b!U35j%$C?HScC0SuJ&);`Z!&Q5p{yjvq_jAI2fN{6@BV
zYldCNjTL&^6?)tW48@Zw%K4*q1(wN5tWp>M0f;d3D0b~)&x<&$MTL4T0nlLKjm!f}
zUVVU#d<KHn5V=oeaB%ysXT3(2L}PJ;Cp~ymFicWNeA*t=UqMJdNKy&T+~8L!izVX?
zi9XdwPpu~5J?(C^Hvv;mkddp{cQlyF2MBw~zO9fWY<GQ>&Cs_I!O{oV>c&kK{ra-U
z#7oCE%%YDEB|Y0JZ45mnT!JnEoxe2PB5xU*n$RslDCc-1&%N!0fqX%sLA_c3Bih1{
zx?n7y2OcK^oRCF}MWToSEij>A$a$-jQ__pL^=9Ze*-a1N?3o>P%V)j3q#GZWa54{L
zsfJ9hZ(I;$^FI-c+&)vj>t;dxCpKTV#7b}osB0y(%Lw#f_m0SmNnCZKIu?dI*WEl#
zH8e4rS{d8CzP>2nMr4G4b-j1fsaL%Fio3ygx+C1sX5J*~f{4j(zEs(iZqB$Jskm*S
z%t%wT^jG#no+iWS4{J#<!?#AnYhw<u(|1Rz$|p@GI)aV`^>HrGJvyANPW!qK`^*R}
zrE==&A@zE7qt8VKJU#u@IyvKg64V}ag;6H$OYcOCZ|g{RXWM`4G#_WM9r_W0GFAkl
zxm{wVzch^p7xk5cb$<4MVGs;(u*vMiix1Ts04Xh3>!?;S^WdA75j)s(-9zO^=V%Lf
z^T=FfD!1fwu&<pz!`{B@sv4ez<Jv|E5Z1K@2eSpU+!UJ}0$gjFrHvZ_txZa)#D_C1
z+R&ZaY$N=b;CSC~;`m&<HG5Ed_F-p;{E=Isf7RDWg58hAE3!av0u*o)Y5~E?;mwKo
zA6L5lElvlDF4@FLv$|-w+TOrP9bAw%w^jV(YyqIR7C4ZlpcRCEx_f0Z=OC!7mi!Ld
z{T=1E{$9blAtG*jTUCR&JRXIsw8<Db=#SUaJP^afg@P%f6pIZ<I)XKhGcmu$7{V%q
zt@aseSK`=8e0oS<pl35?81JMe3eqr?9E$DR(1PC^#lF+FLu~=nQ}mhf+06&dcYZ(^
zu`W67anHQ3?Rk*@Bi@GpiBCmygtrAcKb+39_QS=$sVRSg9p;V0^UI;iB?8v_B>uh7
z_F|t?yiC5RVt?6htpJgOg-J9jGXoC%fdR43l+r2uT_ty>dvoVcQnR(2Su+aoJ!mM)
z-WU^x)a>rL*Jhjeodz;g^A1*WN;>+c+j6t`F*_fiW)yhU)Dr+{Q;!y?I-2e3KxIXY
zQhn)(ZsEif&8VFx#ICDiapz2s3T85ojf|Jghi|EPCYVZL4&nPnL$14E#m;KF#e<oE
z-lp#+7Gif+o~m{j5lU%k%uvfaYrdWp$tIlopAr7eZ|7tDcIaXcsrS=F1`#PYf_4wv
z8wyxa#Q@-X=?x#F{69yVe1R+}v4*ZrW0)x~50F?QmG~g)R~P{iv^QDW*9@D3>zRkm
zdE2iFu!8GM6B(9hkE$0k`O>O(H<Uhc!X-$@>Lh{h{UCNr*s}GtqKYK}bp~a;W?^!#
zri#%Q6N|p^t+B7(4C=iv?>x{jd@%K()X3Mb;~)ubCxQqQI%o2RSZ)P{8Ue~wh@l<d
zxk+y2ri@7gIv~6GH{;nPa%gXQEzLjU?Yan25Df|{v~(G~k_k2{%X>TYOz$pJ)2IB~
z>r&4X@hF%pV{e%7zbI0ODx@-Mv$};)ZL_KsMCez%vTz?F%h#37cO4&w0411=PKf3>
z3-4;OWBjRpd1zoEa9}pkXQzXRmnHwGEunZXvV0s@E^X;`l5l)6;3vccKFTE73V1Jr
z9_FWTZ6P?Q0D-2@nk6J4XJ<dA@sc&Qd^XNDINffQgw!TnCcXdQ`xgZ^8f!wH`ww*O
zE}gDPo_UrHrs@`#UYR&8T&j?E?fW2nRL%M=dty(Ur*cI1m@H2t>xQw(FzmJImFLv2
zTVB%8JtGWkx!`nGiU{lyUrY#EZMn{ww(Ap6xvzV?*N`S5yVyC=uh+^ffG~re&f}0G
zu`F=0<vn2gy_GWcx-9KcJ3($bp~sDS*g7!L1DNE|vQQXmjf51*P85a;3d!DQL3$-f
z)-5^3%^lP)X~V4gR4J%_6q`5GTJ>+|$`Hqj^ezZzY_i~<`A=xHAKHk^;q%i{dqoGe
zPFSeAjXo&!@k6rOWI6#AfTyo&n@tY^OIRX*bm8LVz`_=n_2?Rb*uHn1eHt|ILT1CA
zX{IW4)ATN%Rm9z$HK=n?iH4nKvIXA7%RtvI@Z**hmPuLlN}T%b+=Lj!ER5|;x57q?
z;ajLpW+Dr*Yj$N5+lkjg;h@b#<HM7WWw43KX0o&-!#vsUHgE55v8g1k`nT|pRf?-0
zzn>_oaudaymwl()=B~SnA1Qe3qs_-g<%briN~35{dWRoi=9rD|qnn`fF}bV&##D~J
zgzbD7if{|s(sm0pDvq}`64*52%^W-S<~V^5*F*QAan{ej%H1TdWTwR_eb$DDpI}1t
z>&DOT{^yTxHJ7j9XtJyz`Af?Aku}%?W>EVSGw9vVEO14r<dG2`Ydcn|QwnXem>?^5
zTn;I+Z3kGTfoT)mGh=RL$A*p3F*W&vjUuWKuUlaa0iwvH>dpNwT2iZS*RTpmX|{W;
zXfgzP`V5-QH{(|5woq~BHY+^PTz)IhsjpHdYn27A31RHpZ<^cr*dR;BFYoC4zWUT{
zk9BPiPA#9RBCUVK<;Fd|E+q{wJ-V}fcF)lBQ$D$e$Z^^@RS;II=9rMpTCUuNM~)xD
zfa+R65F}LOjV}#jip6Ej`mAFilz@jQo58C!OG5T&v(sEvcg7vfXwW002-(hkP9x8b
z&$m5j%85mHDxujm{iIk|@4dKybv0b*qOTvvc9WCHgP)&tGPP`wMQVi4gMKc&2iA$N
z@pqr4Am6!AaksA`j{biAEUE@GU-nH_SlV05X9Uk^%4N#8=~8&x2{lc>=pC-c)~EIB
zZHyps<kKI=JuCdI=XfTgM-|EnZ!BcF#vWYpTC_Hq7a0-}_iT3j0GAZ$$6g_mW@O3_
zH)ur|;b_eq>%>SNJ+);#Mem2jvt$QYsLa5QsVc5Fww9N0H8^nIP-3n=?8|Eqe^t+r
zFVf?E$cbZ~!#us9_sj}XXS|*faM7nUwb^rMpVEv9GdKKL)n*Y^1{BvmXuUpji;xrD
z-wUL)(xS5qPuoecLJCp;oQm`L_{Sd{4p<tO+!#wtu_$N2l?!&?6a=<ie~hh02k?Bw
za%+#<^bLN6-V*6wgGx?Rjer}MSu@^$(SwzzNY;JQfvI>n_Y@nFB3I>E;bU>Su+E2J
z6@toJB1?kZ4-kuQQ{JoeOKQFtIJT~8;!S-QsmR>mJbO^SP2zOamsWtD9#^guDAP)H
zu6Fh*LY25c!`M!zpAamwYYX!eUxfqP(;v#j@qjXhw86Qto~tAzJ7`oiwrc;?miSpP
zf+`^Pu}j9Cp?)Fp-wo1X4vRQFrDpa{iG!jkw5efuJ23yzIDe<km4(qM6!F#MB^KlC
zhF9pWI<p8CIPxg8#N1A{h=Z#_ooxfLDc^RG-^iTS4tdAdE?C-*|3kNO?VBPybK~Jx
z)M8h8XM7~*ON_ax`wLE07Ibf8aaGo<2H@lhElf0Wv3p66#>mZQ4yxQ|6pjS_fjNKM
zx)=!h>sLPCi`NDs5hVytdciQ%)>rt({ljksJ>cSQ7Pyy*yrX3PNxI2FwlpbH4G*|y
zCJ1zfQU~^+kML&teGV+_QTFl7Jr-{VveVEuBOhIHZ)l;bF~=IGW2md(^v%ZaZOrQE
zq0^Af74?8MUank{fS|{-65c}xdh;AHjmB}5z^y#}bWJ~&Un@4?@ef>!yw-r&M|y@D
zZtCBcZIRFATA4G4c2)Flr4luc1o5U1{q4^G8pGe-J-vX<bY)Xlf92NYrgAvpwT|@<
zGQ9~4x7Y@j24=7-nh#z|9(}wPRxz%3MA5^BMJ@4Exk|K*1X9J*{Jkz3j&Bf*=b<uz
zvXzS_OOL(#Tu#a`?Ct;)mHh&D?otO_FVZ!hR;xF@psYO8X%6log|QeA^P9l!hxF4A
z4qN0b7btwWcpE}FZ<g{YiPtu<yAjX-)>eeDG&-FKmnd|;)h9?=@5aIB)9kB=JE^oG
z&^lFl#}gSn%Cgt7!=rkz^#+pug7|>Ime0iQ4`)c-K4pA+s-!%HSTR?N*?FkPZxK#K
zLIVHDd98G2s>*uo<8c&|PvquG?^ZeeA2xH(LF`^=j&!BrQLawmP-Fh^hngYEy19kO
z3dw2XhQq#1Idj>FYo&&fZC7smkDdH&j(-`DlJ*O%3L?56XIKa$D`->QSNPe3ph#cG
z50XdEQpGJIA@u?fIBN6KIC5~y;8i&R-!yjWRcy-?#{=RFzVc=s^m>^3+EEfex5?Vz
zo^4_L1bnT$Dp}97|FJGqBhalqA_CCNnNrL#m@M`!z2#jV8}lc);ueXO9Zm7*(O1LT
z@oMydXeL_4@I}r_Q_K%)3<&=t==}Vm#SwS&THpLL!EIST-)c#LRmC`JD!)u=_3w&_
zKUm<^ex^{V1^&T8ClRdN+!i9I>Lup%Sg)fiyxhf+UwTtdw#G84r1xvL_w$wStk<zA
zXi3e_Z8e=Z_MBgnf}loxtZt&Hp@TZ2pN297_54V8%IDqu7bAc2zwfwzm`Q?VX0eO9
zM$n5BY0E5dyygm-V^5a$Q@3aGTjoYa)_|iEZkqDdi=u6)<Ag*btfA}SEbXi10WC7b
zUl(+<Hn)4Dc?xQI*j^9H$LtIh<e475z0nYNVmgjC&i?eAwZT&j0F5LwAVW?dmR{P@
zj!dG~n{ktCt8IF?CV0oND>)Fs<$)AMSx&{h?K{Vo|Cq%KSe<NQ7biX>Q4W!h%wj9=
zh33`?wV&=KkRpk8Sm7Fg%SP-UJU`|x8vvF~`*Ml?ih8UpiOxY<%d&PK#XFOIY2{=3
z)@*^3$b)uY!$uv@7}u}}O+<+OK<`$|R7@e}+SIb&QVL7uSIV~24w`g&w@x__v97st
z-Bg{||8t1}1EA0W%#!*}%3JCgGCdeOTbayI$^vXEGz|PgjLQFtn~E;cF1b)ZeW@hA
zMBz=Rvj?YjIn@EN{Y>gh(q_4vA8SZ}D{v@Wwht`{wpO!p7B^6LV6v)T+}IL^qSDrp
zutw9k-)#*7K2i*<N`{$$#GYzj<*Lb-b>X^7MT^vQqg(6q)zucxK0$ZpRM(7`oCtBA
z+iVb7o??NCnK86GTg$A{&}<UFS~wH!IBrGvJIwpWaJe9I?$%BIEB;hw0Mnu>qi74E
zM-Jdny_X>q=$5U+jhFn!1>$pSm(YO0`4iu$*QlKhc$MGq<bnze8Tc+O$!!=V4BIFe
zxbK>~D!R_&w>IGWNzSt0#!NM^X&R`Tcq(HDnwn2%YnmUI(`&jT$Yt4CeA->i;D_T0
zP1`l=pJe}Q<6)eYJ}@Cy4(cJhahtDtmRK)&Z_U@}*Ryb4CgAW7>@r^9jMt+%EX_S2
z^V(hXDLGN>H0RyxxnBA`wp3L^_mt24d=?^=zP<de=$IW-!l0nE=BJhRuAFr}T)rP|
zye%_D+96uxX6eiq8Tj8YZ~;r29WdZQJ<1WNmJ}(H6AySK87(?aa-v~SRmfM<6ri$J
zP&NnBd}if5<#BUpQ-UtcqEWK6x*s4{5ZAb|pmDLM61c>g#Dujc)SY`auX1v@p>~WP
zw8~QoMq^xPG_TePdRWd}s&5S(OFUf{=4sR^H}K;OzmLJ8<lq*{m{J!P9R{%>o;Bps
z<MG5BI(oOxo<1+ULn+Ix6Dt~fCOX(p{owU~52MA38W_a|KaF_4Fcf`A_DTDgDJf&A
z+Htsg5-+lTUl%q>Eq}P<WkI^Ac=QUJ6d4u<aHb|R+<71#vh=(x71v^<aJuH~O~W1G
z$Fm}n&CHp*^5~wW;wUYCyQ*7V>A+Dw94Bv^=uY2t?7S=&jS2Us0u+kk=OGM(Klyk?
zO^5tEo{9Qrn8D8eL291+Ze#Cpc`BfQ?A1WzVdEL5wN-q*zsT{wA_|36`W%OoLW+bx
zgsuEFN$)nD&yiYO7Q4`@PsL_t6M+@LSo8b9c0Ea`nrhTt#i*9ggty2dB3o};CMFoh
z>dZb$Dcrk;&bL1-<i*#F+jZOcFgO{HsN?F92>6;Be{KvM@3AyWTA6VJ%!F^K7^t64
zh)(SVBmCYSZd*Kj^_yw_Op<=V?oHD%P7-T-QZs8A@qUy`p(-h|;EMd#50`|W{6AU1
z-@ciBCP~IC+C~2EHJ@6ZK16hH-<RHpk(hta@dT^7z5~VseOz1mU9Pmw>@-Y8i`dX^
z{V}578%ZAiuaooZ3vA0*sC_YmnNVIoxL)XSXWhtsdH0d0<=O6gmvt<G3*5r;nhPqU
z63p*(<EcRC9bM+`s%7p=GiNS0pa}Z8BOIc!`{3?cAlb3-X6=!XFI8T`%o(xm##sN2
zX8s}om)*U{uTeiXZOJ&Zrc9;{qi@LO^Mw_=>cFCU%Vj1TBn!SIIg0~PhsHbU3;r?p
zH+50-I{TFCBT2~Kd5b7fa<|3ApD0IE(`HauS8)OoHQ1CGj*}_wIxV7S_M`X-g8u;`
z1S@aFZc+FnnkT{8el}3oDs31Nz<0#S8%fVqg)nE$yZNk96=q42x{125wHehDmbA}F
z&Q0aSo#T&zTVfxG{SoS#12;VTdX)1-JF(hVQ|wOs^bdO{W1ee%hnr;Qbm!aKt6b)4
z8kF)DrSIOT^I3H!$H4?yXX>WiVApnZh5|ob8=Oh$30?^uvOV9+zn148d{WDOVA?2O
zIx_LHyO1I+l(k{C<(Q}y+62`NMMG1I-|!++mGxj}Q0+)((w(dK93EbFwGbUyVQBE=
zT(d{6<Zc$Rxw4W4x!I4Jl7**zSMDHkP{a@k|2ACWxtHHzlhfQKY|vAyF7(8mz2hn>
zHZK~hF!76t=6QYlDd*AJKT(Z8KT~+2<#2NwfisjMsarh(M$dT5ySpq#kXkv<$Wc8n
zD1RLbU(|+??CUWIF~>f;?61a7VAwcbV=Z@h9NBAcRacz<%XQ^SJ$0RWGb&{)`wchN
zP!=id*K|PQ!jIhgeB3wiTgslU{(@vcPVtJ__v;qWri#z1-mmR-LYcEt?uT&=>V$TF
z2mlU}Vjm!Hrq~G1_UXzoQx+QT-{C7}c)jJ=)w^YTyd;FnK7&G2bOI%tfWKk{-CB~i
z2vDzk(8yBI1xqXP76Llc>v=tC#;IhrzGR!o<(?}y|KV1jO5nZ6njknkVr~Vy?9uL)
zxJ%8B6P&fWH-K0N3Z#cBbv#}Uu9tl;c;9NQ@jOSgJ6<&Moa6bcF*^j+kB*ovY#eS1
zE9b5$O^{R{rRM$Oi2rUc9E8{mVt2kl&_E1O(5Ls|ENED@Vf+L;Zq6Gqk7JEkDt0%j
z0{GN-+={wIOS^-mI{m_^605rV4c31^T*YO!FgArkLgZt&xKd{(D+8xi&x(W!-8-38
z(K=WDW3=ahJ`Pyxjm>=ol2z4W<|cB4T-BDcjN&1{<StHZL<ayUO0RJbxAaRdtPvR)
z5=FPLp<deI++n=foz`_u$y)LWYV15rbnnDB522Zp=FvKb9zUJ-zhC-R2$lx*mUvoE
zzqcI_Cf5N&9_j@Yl`}tt6|^MPo-q<1HOk-ja6oIp6WAbmEls9s+op6b?Os|xYWD{c
zE=_7|x$|xns@i#~xOVwx)Swuo&aZYw?`cj+sC6T17q{?ahUVmve;ixxz<yPlbScd=
z6a}jS4AZDYwuA}vn<&R0#Cq;z(J{%AN0zpXep^gBuq1Gpb3pl9(Nti~Z`ENHH!&I{
zQF=GI#fmH~7iBCRT`HYXSdjmz4-(wEjb(C@mq#GaD>0OrsD6a}L|?Y&d)H25n4&u1
zu)ADVM+1CLwEn{lQR`y%DtxTjHe0gpd8lOF_lIGqGnV(Ssoc#;85_Q`zXsMvTqk{m
zyI0ps`H;S7|Hk;P?63I`jBEK(H2c1E&inlq<5*mP7W;8kQ73+}#gf&(FU>hj@Qq+(
z4C}2|`>O^Qi}&}Ecy(dT+P1jNS-oX4OACrer8_L}{fA-0ak=}pp1@>W_qVUsE6tuw
zlin}l3GlO*PaFzu6<76{ATZK;*mXX_mM0`N0;$oOUdSILMA4x&D8@-2=>m>Zu@m>v
zng2h|ki#3ZQ(zmeDl@He7KN;EKY13o#*Mf+@iq4@Fgk4*+-MOe={ckwhFW!Eg^OmB
z=C|ufh-3Oi25y%sj|Ci~OogrXbLAk~WXe{xUu9#v4s4(43-n(-hYtc0nb^;=)q7kL
zl`4l8`jv&_8_M1N!1J>I0ptJrtVM?4oFyz=(}7h>k|O)bbYV9XNv$R|!%$;@R#n>b
zLB_=+X79V=(HJ00zuEz`l*npr*rbIHOsm1{jR;M(<kNR}6$OAP#h$&{E!ip|cZ*B2
zxPtM`Pc89!;HZjG*?^jR%rdK!RW)QoCE61!hF^}Gc77N2au8kyrYjS%O=AHSA=PUw
z<c-&R{AgSxtim*3vf#7cmbmkG#X5F?c@1t8FJXM8KapibqMI<qHR0|`PsF=knLk;~
zFm~VY*J0oCsT%Ksg86M_FQZ}Nazb4Xs^4t%z-qCP%~SW>ja;uTJ=ig){PLJ~=&@eo
z!soQodw9SWyrP<}I624qVHoMk)4crw&a8D%XQ`gT^+kC{%RGDMCqGIy#AuKCjiEb(
z9legyqEV+UZ|O7lwClPNcgKPbZ)=e-uPuoM{>G+n1!9m69$z=42V;T23wGn4JasLV
z^t8@e_>Av1Q%h>KC&&u7<7CZ<Vu2s_J^#Mya2W_H8^wU&sDY<1%4ig!lQpfuS=!aA
zph>Q!t9(v~`HHbi6xmc1nIh7-J4hZ{jhzTL(KGqNFpT&vyKv2BevvSW(<RpW_soH#
z=-j?b_m$(dVa<lNjAsNUojR@f4<VJWTY<+Xw_}wvCug4=0;)qx3*Kw23R)ZSkCrp`
zERDX6w`0g>)6{;(ZJPvn;mpt4;YG_UMWgb}bf*f(8WWnmc6~<e{7#1&saO@NR6o)3
zUuP^Z_X4)1ILfQB;3K#fAYRiT9qmEeU|$2F(W{{4kcnt?LIt}QlZQ~3DC*j@-qGx;
zkBLCrq5*@J%rE%k10I4~_;I8?<ANfZEL}L?u89+BI9#}LlWI#SB_ou@77z)32}=L?
z?p6mDN!vmB4P?|_p_?vjAJ|AH+nV0rI}=&q(u+DUyc{5%6R!hPW_d5utrj+XEZ09r
z@)cvoq+jm{mn<r*Y^B^-LldWt&@7ek`6a4TrVypY-$YH<-aX&%Pw|rDQ=r<9#upNQ
z6cEPR6=q%nSh4U6+}~(6fly#>UZKpIILl-Z2KG?HX*=j>QI(8ev^I<!z{$`CS!GCt
zjFF)&bVjfr+-OxmOK}FE0Rm7Y>vyihgT7Z?z0T>cIYwNp_{A^p{r31adDxdffQCcp
zvN=0M#*Vh)x!rEr4V|Qzx~%s%bv4%+I)0`S{&mi7-Nl$UbmN0ADKhRe*vC9W@~9t&
z70#YkDT6SyhKleY2o1YD{Ao{Ka6;g1_&TtDI#R2k>eqMxQTz59Ti0i(tdhCgC})P7
zjx%$6=cl%H9pAtDsM}kSCc-{1?3xm--MM{7j~-gU_kPRpZuMbqX$w-VM`{&T{hDN;
zLDI8fobXzYuQB-?)yq!tZ(L-{n}T!k$-{S>---CyBZC+Zn6%L2PlDeVAn#=U)LDU4
zTfA6AVQ8!zFQgG%i0f193ZPH5*@8K_^nV(V*i-kb>NldI9FP1$$r>b&RLpVFx5Jzf
z&3dr-9nDjxCPoO{Ynu)|p%BK^rSl~Xqc8zl?5-BXlXZ1tS;b!WxtTWf{kPRj6*Ohb
zt#hM!Ph7)l(__?r*w!IESLG{W>|`tthX!aBTh!05%Lq{4UOgD1LW}(-gm=)zd-8%u
z#t0#r9k*G~LmM|2YA)@WET60M*neD*HKTb5+{jucL*fzXOtJ{VXlZ+OIm@%yv*9c!
zE)ZnGZj<9~c!2Q~vgG6)T>KU$9Z3Mdo7zdOqZyN8?UKF`GP}5!y|$$0;1N*RL3Df;
zaGbTPqxa4)&Hj0ozGEuu?`-_lxs9`)+BRlaY!-nb8R<Vr<sE6ujQ}YjyYHb4ey<$}
zNM{;1CYEd_=%Gz^?nbl*;`{9mt&H624?a$KdiC6YNnD+(Is(*jKi(}ISsT%`gVWev
zD1hTW-4Hn`$=Mg`vJAR8zX_-~STKGJR=U-LOKP=Ffw>DeAS=OQK@JO=uey5g4c-r@
zJ6ZJL{K<_lJyESt)mSB&K0d922m^uLm_u(^4T|q4+67j(^qzhxqKl2=1_Z*3y7e0}
z_~^4&lvZJNZ6|>Xdh?Omb-HTml#yA#yCaJ8JA!Qau~l4$_tvrkb`vXR<G#B%K_iMv
zJypu<zwKFKL}?!VK2C8I0s1NbgkSa_Y~o?&LlzI3sn{)#DNLnT4eSH3vslA=1KV=^
zZIQsbV7JJBvW*LXJ}uUcJp3xe4g8(MHny(8PEpV?tFWNM1|av_N<H6BdnI@tY$V^s
zDlZSL;9)x}vKbRyn@FkXT<i9PZ+AZU9iu_gv|O{ibYh2QD3_r+ePi~5-IpTXYx825
zb4EEuCpI6pKZs!<wlQw~Is9~aoFs|#@`cjP%qj76$_HOIJ?dM%Np60gJbr45r;lOV
z(7q&xz#qyU4PUnl06=qm?;??6rOmBolXzEt;_;b%Z}5sIJ@PjQ$ESE6DY2J~=)uTR
ztFLdiN6kN9`$KHj%`!i8Anw~t9CI>3Cb;+(u%)nTU}SpXUo0Zr)_+_$qJ%B_R0RkP
zd>J%J5`FaI$kMxhyWS20nC;qvn27~4+&z!Gq3o%?-C(;JOUApR%w;mweHAifd||_}
z^uz78<uV8bR(SbL=Gf#CKm~aPyI0$60f((}^@oe%)&vd3JLFw-ChD~(qUOZ&cxe*Z
z2iN&&o>FyU#72}OS6)N<bc-8R4$zay_Qz*$$0pD7tY7^)8sE5hA_@I`)fWuY%rE^$
zyBU%L#PEx!b!?8LNM%C#!?4mW7FEOCJ@R}W2;6O>Ql?ruZhL9Jj|EWIaPU~*WPqen
z>eEfwK_tI1hyrX&`(mj!lvq<t?>!r_`{G&}{$7xC#s$`T5q{Gb=W4{o%Qr#q2u5-P
z6zW`OX`4D^&&^C6Q9ZqcU-a5Fq)$s@$}P_4H%Oy|?;koe{~!pV=IfL|TNDiCMGmEv
zQTWX2l*+`iz%v_U)lwuU-ti(GAxPoGyTC>12jc8U%-LP&>Tesa0a{6GgxsnqbHo=m
z7cGkQ$2~vyF2QB{?%P8XHA3ymWI~`ydblrQwzVJ;5S{<kfnvT;lE;YENE@&B<EjmZ
zlVEwrGwo_7GiZtRd*Xh>c%ae*kblHH&YgEecmJ5Q*1mqs>%$6gx!(*R#$i-m3Z?Y>
zNd+r10w#TYV+csy{;})WJS?q_zN`x6p!MwLZ`Lh?c>vMh8n`}mGmE~kTW{-<Fp{+$
zZ8E*`M$+24inu@ZoHj>F-6Kx#^*Ko0mL4s9kn|z%mm+hOg6T=d<4l=47o@Y+>dh37
z`Mm<2-Ury05H4f<|Ge}!6zDvVDO|O$Acmag0yaT3qe>wP*Q{rOcLIFNZU(Hepjdam
zi~`BjL(Nf!?KyR7Yjp4H5-(j8D;q>*;|_c^<`-`Oj!<qX-a-}eL|<cm{3wxG_}S3w
zrE(?UjAA2KgN)qY0IY)mu6l|RzF>0d?*2yAYu-b2vE;<~GiZ*HE!1~MoD^Bi$qLu;
zfK>^><LpSStgN9`Uh*o-L>@7HZgF#bdROl~VH#MW>P`d$swnIFi9XUhVkTMfX!^-l
zWdCTb;x9<J=fLM*K95@v+*SxLXS8x*vin*|`ytlmxJpAst%*?g2V`s+w!w1zAL=A#
z$*(J(Ta8!kunsIu*ciIlS0<y^D0w7h&A2D++4Fb?<|SeWJ(YgO57+6Te(?eqfP%pZ
zK3mhePC&i`kl@JEYd1%Y0Vn(Cyf}Dcm5+S0rLDLzkr`Qpx@|)z5?9HOeC1nJj#a^=
zHA~|8JKaEB=Un5f=A71nxd4&!MsM|NEsGgTsMJz1>VB?(yf&j<q@=E6b3kjkDx(wu
zUzu2;P3rDu^mb<eR)wvUsxL<3!0d=fsc%>Otn`U)QVel0F@MUo_~GhRo~o{U$4+6=
z;P+J@*Cj6kaw{0|g}e{!V+GAVcrsl^tR*EVPHL4|P8?+d)J8=bq*MFIiYjOC4K#0#
zz<2OU3ThWqG6;r^JS@cB&DeelHU31Pye~{9dV{v+&tt7kkY2ilXePV;!^I-jn3<gJ
ztV9Qj-zhp7EP!7VKb*wv*Mv&17;J^Nmft@O*mBQ}6)Q}HSLz%}-=#EIFTFVw0P7f|
zTZjVA(Z$SxzX=H7d<w-EiuJvU-fK`AZ^v(y0OIiPGyK<YVC>@1>+1?DoSsx0hPOjE
zvZPqhJGr$l0(o_Ezg8GaTpiGEByHM6-4=8kZ^s+;f3a9_zje-Q`IP+e%FE}z$kfpY
zh^|8T=TiBFLmDcW>>?)~|BOBV9m<|E;Gy<8OEXRy+7`!MyTH>tfWO*`^FMPpIQN$~
zuvD}rNMyZYI^(%nJ=%KixUTq^@K6i$y=^cK#nfhCuu%J3)dG&@V=wbLJQWLF&)Sjl
zQ7<o&CCH3RnAG8Wu!a5`eWc^h2>n0g&3Bj>Ao|p3+Juo+s6OW-SkfbgPO(~M#3K@I
z-0v7?A{xa>07MGMCmr8ENUXRXSFcSP9my7QeK=G!inTt0A=SM9w6p84TPBB?A9}U#
zz-dLdz1pzP(KS@d*R0jX(e0~UXsU)ZlR;LPM#{OO@t=VAl;J9dgd<wpdgJDLEdWY+
zT*#WSNvnKLk8kn$YYHb8^=_~B4%N$*-q94Xsq;0y<#uf$?GGoS6e$MepY{rRt2Y{2
zM`&haldQjJ39WinYp6KcGG`dfvnv%IE0sn4r4{|tyOw)+>esi?(m&SFQu15#ew%D~
z0Tz;=Dgr(atl|=@^$F~SQmShMaPmJ@J%U>@{^QmTfJnt$qPHDBVS?;(_nIO<LN~Pq
zfUSOu437a`@aX7v4gyH&G@LsVr5rJ>GUkbdK&mMV5icH;kKTQzPb6uvD=~%THd#&O
ze+W%LvGGiFYMO>59DAofl-U3&Tn8^v&qpB*&fYXFX5%X!`ps)mBj=f$VR0O%{QK;_
z)iB5b%M4kE-)&KHzy&)EvJaIgtjN{x?6lqgohQDckKr%$!gY8}J~`R$KhAW_8Q05!
zjKC%Kf!o~2L0Jw4rToG{#4q+{@|DO+YbqWUY?^Shty5<I3<|#u1EELV{o;ry?^4Xh
zX{>XSp2&Qnn-g@0w>DDdsPAPO?6?&f=ITQ~86%5FcdFRH3>D$navCo*KEWS7E3QVz
z){o{|@FPFE%Rjsy2SmKlkppKBDeT77icXG<!_L!-81(;V+iwA37eu;W#sNCa5FxDa
zX4Z0<GpG*CD8UAoS%1n7Iy1)t2QU#>uH+YDg??NHNdb6#y>PQ^vI4gVw|+htM+NHY
zA1V!C3Y;!3>PfO^e6y#{5U@*?bM2FBuC2vx*p+J&86}sr<U37gV^brHJ7vdT=2lJy
z+1ccCo!5n&w(3|&R)HsB0T#yql7IoJgA<u(YOP8+8JgjuazFexSsJOT^FR>oM_Vbw
zUacL9zx9fQG{1TyCBIKkJAo0>m?gc=yU2QC`v4*>l1fc*D-KJfXn_zwmR+K;Uc5Ap
z)ibrx;|?G&aVY6|fKKaT_&H(pY#cn~u_T1ZX6feG@Ll-O){ctBS48T%!z^v`ga~be
zDrb0ep4JG7>@@4NZ0(uy5N?If4Ld7kKEIe$tz3L`^gu4$NB*vc-mOx75UENZtb0^h
zt`Lko%Bem`eD9ay7gi)H4am$A|7~dgATS{F-}4zv0;;TX%)6mfnbg2YN;W_S#wlU1
zm<>=V%C)B~i#N%o<wau3IC0^+Q)sp=o*kG&JfXKSLVXZfRg6z~*kj1De5wE3>^
zkwoySIC*8!8-a>XzR2*${JLK~*?h(WxZxGkYclm^1N4IWxw|S1>gA`YmjY}`72H|Z
zM_VdO_-CzWkD|IeQ&g7kGzzpW#P^}0dx58z#7_85E7yo=#y(`L>iGoi;Gs_J<MLd$
zsz>K4<iC1?TQ{)|`(FgM;qXt3v>k6$yw)ODtWIvo746hfn%S*=h?V;a@cfaZbU0}!
zQTa4?XJ?K6%CI(f{fyA;KUosT+sg%IU%p-&$XQOVc_^g4Y}Obtq7Bh=Jm9|O7vNiJ
z6Xw}Kl0(U$ejMUQGeOwlMpJK~NVhgQN|cttfpAE7DCQoI-+jF`Z`XflB%qhSRvXeh
zEJE*NF{eD;q<!13|6R?SfLQpPJUo$xIL8b}Q=xt$;IUwUMSnp9N!8$QC_DQ(`8uv+
zER*3~05pMbRae6a0PwD}T}wP^69|ayJF4<DAnJdY5S8Pz^z{#Ta3;I8W>fw$CI|Ug
zqg~aeih4Q)x$C-SFeb(J#m$6S*K2oudPY-Wa{VSTOqd)JCTN13?NGGF-)Y1)pM7Iu
z{j)5nLQe6pP)LjW?y`-(3x%fS(1@(|Vee9aRTEvzPgz(octwp_|2K-lIe`pZ+<)Gq
zJaSy1n8u5o%*SM%pmYM=4~*i72mqD&oNXUmKAy|PQ&!+RP}z(3g`JD^HzPa$TB+#E
zeZd`dp_~5w17Bg_=-XvsrUAxO11u5ID7;FV>5z&(N(LFc0IR-hL#etSgx9anH<m76
z54x*ivrm~{JY1!eGz{v~=G_uAdZ!_yo8r7R$7`XXaw?~6lp@PHR9HUYFqLpSX~GFp
zGW#Fi^@<jN-XC|z=4Zg@abZ%V|1ry~hO8Hhs#_{N=*LRMYpP8bJsA3!MOAqwwDZi<
z78l#nBXr*OK9IE7vohIsCp@hUt{-!5EJP8FNs-lbvCC?l_7qzj{*nZNayIpE6pAuT
zgve=MsiybK^)BiCM!V$lQ7nj}n^f1jGB06UE0zOvfYjG-vO#necV2qZa}h;zty46n
zF@1B=P&jKq8eV<PX3i(0*?5-AXuj1+g45eznLUCnxadj5f5W?jH6TR@R?d_n)-)hc
z^X3sSkb>GiLg7Nx7g>o%zw^&_J7S_q|HgI4vE+rQCMh&=dc@w1LJJmwgTZwnR#7m1
z#JTTic>`MZ8CmMB1N*rw_UbIR4Y_1b=L>%)!mB=v<TK*$b&<$G>?y$S?z#2gG8beE
ztc=#TPV|q8A}88(8qtz!ecepuPu}o&1WNHE*G@xg=x#Psxsn)${1J}+vv&fLy0R~-
zN0Vv3QXA~ybz{oI;}yzf{J0Q=zE9(-=SD>M5#UDADjBt8ps3I5WUrUhs;nHtsh-jR
z(MwdX4b$7RPbO2`Tj76&p7(&_=U7Kc+tRNMx@!6^HB&)o%G)Wje$zW4rjKu(bxStK
zMB<~5H&RI=uFaFNi-?wq(yD2z$b4_@P^n_wF$%8(Bg34uq`dUqNqlDUCK6=M2+#N?
z<K<X&Y^I17t^UQ+{^5cz0hN>!EuMFGM^bI8i1DMg>FhkefPH+Sl69v7@>?}JKn?-I
zQYjN<2WTHsr1_kXSsX|=OZ(1OU`*GXY1oy&&dP~;NUEm{56fmncMl<T{-mAv99AyV
z)tAqI(LQR<thUH%wG$QeFyvv2Ikn%O-ug6fHkPs>x6*bHu+CzH#Eb^0i*je?JB~LH
zjV!vb<`(aB|HI1)cBwPcN&)2fkM<&F)NkAp5m6rBhavuBckI-yxViCWo*~t#jp@2|
zDftaUcMTPQ3ypPy(Z|+d>Y^Qln$ZWG=hg7;K%u}rb|F2>0E>W{j*-`?zpPkX`RPll
z`D=!C)b%|1y)elzk(G{Ic&ni3tth<tF-1RlRmwd)$4AoTs=5~P47~C%ZX#Y&JEf!V
z(WR30kI(4k<0MN1w$f_p%lFh*BI~jheslX-#OW8-7&4UhjJ2fV;Pv_f)2b8@LkQ#g
zzI{Zrr01~<X+AXsKA(`i&j4hPIs7p|c9r@1FI4%4BC26>wTg`wi9L~eI4WzTi*i)F
zN;NO6$V=k_B}p7J&!Ab$n;aT<(y^dJYs|Z43g@Ffw8q2ZB#x9AWG6RZ!uCd^L><2|
z88jg4jW&17QZ)|igjQqvI`+M)kM8ad2*_{PTKVMwB}8ecxH%C(PawE9;Qap)cHQw*
z@BhC@gW}!>NoXNk$({`vhwOQ9D4RpFk8x7rW@hh08Cl1%4%wq28Ha;|V`m;S>)>$s
zebl|*TQ~LlFJ{dvvj>-l=U-X9go%eVd+neu;HC)Ni{Rcg#RsNUrm?M@z>YCKiF
z8ZNh}eAg12SS%PB*v<;|+$&4;C{>2zJZpz%8@Od-c^xG#6S$Yw4Q($$S0n)Z{Qn5@
z>0b4O9j5L)$MZfB?vx=1>jnpo>52oy)Mk}pm_^KI_gEe*e7iD3RQeu0fF|Q)U7Ri6
z>Q5;NHBtFI3Ox2VK0Pk)E3P{~k>OT88&UD)VC}=H#v2#scGu-<GYZ4I1bhHMNNNbY
z_2>p_VNP#=a2k~BDzsUuOnOTNXIfOscuxqlQtgF{ouIP)f5@B5+F$F7$uWR6imOs&
z0-AMQcY{`2yDlZq@?cki(K%E2Jiy?D0mEduK=YaP@dVO+02uxmu%z(~=TSrX_Xy<@
z{<#YvlaWTg=s*hdw?h2St1r0tZ&ZFT%A2`E>Rcr^^|td5kt)xnC@tDm@Bwy>-xMmm
z1uvQB;AX|(PZ8F8;Bvh8NS4SZq93_5{j&Cz-&RKg_rEn7@DnNqt8ySc--`|!AYjx0
z$V_zWgi7A`W5(`7Ktw5&gFQ;iaNm{y_Q`UA_}E5IaarG_#6cOQdiQ^aJ$ML4)#hf&
z9&I{RMwW;qweJoZxp{iJOc|+^Rd$3syGq_X;}|D;%Xa<c2v5w8%K~LbjWX%olUkji
z*dpiKIE$$dFAwm|{T814j&kjRS9ozPGdNuZ%wtOBw@IH3j84(55A|Lmj}3#<gMjlv
zl0^vBPCTP;<YGQcf{zB@@ueBut?=w<`9e8<1sHBeDXcZEZitJVWZP)@JH|e|@^`O7
zmwcjz57=W_S-)|SnwyrNbV39dFbv;X`TR*s-cDj{(6KDCret~e6;6RP9#xAmrXhdW
z5F3sU{3r748?=v0_};?Q3(raS9o>|bsFFLk$TQk~)8p>PfT;(U{#R3-FRr^BpW%-G
zx@x7fOCe<)X2(|CjsK&}Z-=Q;N8U57;E7`?$HFmkM`@W#?La`R_sne{S8?$qXB}~A
zv4m(`D3NqN;9HtqI{Ibn>6rT1`ETUL-IxQlkhA*3A}RFWY-{Y#k#(BJG-0ncQ>v2A
z?Mlgh{i~7^t+jVq_O_qCfgO{-Xui(hfyW5e(9=4il`HEJ&rNr$gX{z4{VY)k$V9Y%
z$mNn0Bpi!=Z8|aL0eKF7rS@^M*c~#xzC~-<^LyU&bs|t}z20Ez=MGv;Dm%Y9xeg3<
z$LP~r+YO})Vu}<N<88ySRm_{P^T555|H_qvPi-ov8u=ITAVWslbpnl`Oy)7uS~tJq
z%Y!c;IKI~8@wTj2#d?p9sOD`yDiTbB!z-lio70ER^fa9QEoF8V3k09!ag-(0B78`y
z=OlIGUHx=LO&)AXGY>Xa6Yyk~6?%g1km=X$KSY%xs>Uw7rmW#oYjff%1YnHwCEw5P
z4e@<<R@D)n5Vteot4EqV2jlceEz6?k7)aSC96}rLR`Pb$sO9b1=wTJ`DMDD8ck3dx
zrBwEOjo|;|1uwbOgtS=aoZ<PF)oiB`<O(^Mm_tU&3?U?@;B&oX&<(Dz)#%zn1!fO!
zgq!wLxpSJ9;Vd3qm(WuxB$&<KOjC(M?VsSlsXcn(yR$+}68Jvm+$T7JWs-t}(&yUt
z+VMNq3pTa<t9zF3Hr<T=+GW)vtq#E3ZAVj?UPI`6o{@JyTf9!YOqg?j!qp)CdlB0K
z^r(d5a9zNV13-^$A$x)N0yOT?V!%|Bml;CNVEx73h;4}lS`OcC7Vbk#m4j_qgEkI!
zmB)N`sre%EEj|6uXD$dFrN}-h#u<nTwRt;VTQWVC@CR|YY<tp9>q_^V89i$t<ak<<
zHLWXS7mUAjzj?0+U-ZeU4qw$G8rpFKTrr0_`)`Bf!%urnIFsceU5GT^jMI3VZVH_4
zz=u|VN9m<UH}hf#fTJ%}gKO811v!Jv2+X`hT?x>}%`SP~_e+f;246cU|BAV8gr<uf
zh=7qkAi*^C=c|4M*3LuxgU(aY>MFf58nb_iBhyue9lZ3m-q!CvXewGoE0Yw)UfI2l
zt;vEn8zir)m0si5>tB>eI#c)CLeV#pZEv2hnOFlW=5=@<1pjOga;#m%u?|8B5?n&c
zWl^E)%<+u+foZ^6v4$<;YZu`=u{s|bIvobwlVt59$-#7rB*QAhe=b&G55?wSJIV&_
z9dMEU(UDX3f8mo>89+NiL#IG1=XS5dgo1bDw=!KhuwhvFu~LB@o6>%_-*ZPvZ-6ks
zVZY(eEjh;PaQ2hi&>T<3SoP&8F#!7&*0uvpsEN`KVj#jCtpg|X*TvQU)nfjLIxu#n
zI7fjr*P!-Sj8ZLpJ;_Shy+sgw>m}Y%(ta_}S|tRNb2)Oybz;3!ho9UQ!IsY!vE$y~
zQ)Dz+@pb#v&m(2~(!5f`f!|uLsFLr4&D(489!=ridX;6TPjD@>caL+1t`Fu!XOx7*
zPJBda#9(o}dqZCT){6JI4|;ZFM%s5o0|UBDgC!;GF)Sg%mBc#+iIT~sl?AoXVsQ;?
z)s>o$fLq{7zfQ|vfp!Gp8Sd@G;Ks@El^-et^(txzCn1r3L$z|<FKaJ7$`_B$G4w0b
z_5BAv?q8Wwx8GYHrt|{>C_B~O<ov(NeJq>(b-={`)(sin{9~OWHU=A)`KaN&=^Xr8
z{byxjUBwBeWs36vDI+2_3sHioR6JmI@?;yDsH}EI;jC*VdK<A>o;H`a)?@s;HOFS|
zRPY_6vdQj>Q_*(060(w3e5?0&p}+3qUh8e#tH(>@J6Zj4)`H>;*OkEZ=1#>TT{q+W
z_5h4~tbONndu_q=nVx9otID%SSE~K269-oGU&jJh0&%X&_Y;Lm`WRsk>mzM2w@~d8
z*#*3k@>ZHmnC6@YpRqv(j_yL|kV-wBEZ0K1G3oOR0n2EC-!&6_NB4G36nFS4m)Vgm
zJQEJYqOz#A;S^vjEp$1mZ9R0OYHgw8$Njsnt3lUglTc+j*CQvN)$q;L#>}@i;H_b1
zQ-Yh^QmmoEP1Id`3#6|6XgTIK>i*EuwU%SLD_9G%Vd=^K$!HmjxFDgQR7jM=_P#eY
zFS(MJlnE?S#ns+r$YrPXvsrlcPUjxen7o`{!EX=WsS1Em&DG@AcylpgqW3ZjNOTb)
z+30hy?SCHg)iy~-gM;qM>--_iotTdo5COeh*9e$>6qwP7MLO2pPmuV+g>mUeD2g|1
zv>mJTF6+EjHn_5WSLX}yF-tT4ZQsk-H9s~}xf10HGu)=S_B1N~ypD3vZBqkDp+X9D
zHo-&R{H$)yv*^wzls|aGzkUJy<a_ZdAn8B0HQcX!Zv?U?0*7+Olvo3+omI|L*Edl1
zDw*TpH)v-Q&9TLOz%t1dB(V1G$s@$um*kudY;wNO6AWI@f0mgBWYYtKH!fD4ypdFF
z-Lo>IoDMgRuuh5mum$rwDFhbd%{{i&!{U`OwYqUhJAh0|=i|CFdf<1#|9AornJAT6
zf%)Gho8|+MZBahAy^)aTNqbw#|2otv^?5^IyPP)7tLl+!X$=DoAIhw>P8t&kM0P31
zs|rlJc3po?vZ@Gk%v-x>Ocn7Kk)>L<ULgwLdC*wfkhu!mNqe<K8t(_wn%Vcka`#b3
zy_@q^v?nsSbtSaHtSYYE)@N1uKjvJ@{-FI{M@{}W45)7;$6nO$zvoO(u&`tt333wq
zQMgB`gZJE%tF=oz=NP7^-;Mt`|4>WACkxm`Lhx;01DMJC65urrt~#lck&twwrb1K_
zZV;OcaT|L!)x~K&Hb6<Pz2|iSB8HqneEMk223z{PVx;{BbEcUiqZ`*4nOr^NWZ-P7
z#q39c=7~c2dH+tn=W7A^HaaxMe<#a5sRsO+rFIdLZI{6Jg4{e+BDz|*x>Mj4k4(i~
zFYaF7P>YMPBkH^$W(ntBKF0mJ6@Sj{*(NZ5DwrsP!}Vb&qMkLBxL0Cmx=N(@wYGM3
zcJ7ouGMISsl*$G)bY9zUOo_T{+t1pCr<;AW=eLEuXfSZTfO=Ev??d|6)dc0gW;$=N
zw|>gZ6&|pNxs{?csP1Or2Vp9n^0u62T}$jnJiCQv!>}`(`XthEzxLa42gfEex4{jL
z=-l&!-JecgO$N5#kV!h_uP&9|LTB6F@-^tK{ha^B2E(&TNbl0oN{qSpNGDYXK)O{w
z#_k_1FW3qogYT~PU;@6;y#HhZe}t@@&PDD6m4=aq(QEb^HBazI7zPsze0y{I?1!02
z99UpHwMB-7MB5KxBo*vxkZkU+H1=0aIyX~~2opA2bkzDwmN%<pp_}j2Qe&P`pgGSB
zAsz(E@E1*0UO%0sZ<-+G;4=MslSYu+kZF%tKJ^<<H{Dw;DA<8*KKwz^bze@20t1A%
zo+sFc<~s}NPrGE;G}J7VK+=<&*WXF)n3x)zrtU1F^s`oN8>9!n#ADKQBNhaFs*q)}
z(3Ne5WeRlbDTtaS53(IoB4b!mka_{k2*o}Dcq@g9Ibg#Z^Nzaf)=PtFeh`1K9sIsH
zJFok1FcZKz0~{x^R+<Hvy7@j-rJ`gVOnU&vl3%$}OuXH8%J(vOfMplvYrq3A<rtby
z7RcnvnQouGU7+$BT`3Y7QNqD5+n-5^q8=<AXV_d@=oWWC?X)CDw(%-!>48*Y2ZcD1
zp|@F8KIKGwR<fD;a4ItJcg%=?31En~NwcG)-=7Om_F;oPp382cFcH|(i0-ss)^(p<
znaqem;NEZ)HIz_tXHK9tO33B?m%xO%&1v(Vj8hK$t$m!H+S8y)*ALLqN4dKBVf_)G
zT{z~bxzqgyn-`LzCb+ZY7&5gLd&6$m8%%Rl7Bm=~*D(xwtA6h{<jrBZJs!k#D)R>w
z1IrrPCbzUw=&uu?JCF2D@%;_Wf-4k4;G4X`i-BNo<J8P~efg8rodd4JjHK5we$idG
zYN1QCJ<&?l+%E~DU=Y1A+c`x#3)K(vqvz;^rL&0zsI|bisB=cZc&>2*J>-S9UpV*u
zsT|Ghi!}<}e5bSIj!<_s<k$ju<kc^yZj!D5ZgcM>PpsTO4)aZ}XAu-*oiIx(8;29N
z;rgZq)7qzqx8rq(h!v$-jfzZzPU@EDsJpOx7K!xY102^f5sq-Zt6>Rp@38EZ87eLF
zfaAt8<k9*%8^q~|)|$(PbF(3%u>#UNhypD5YzpGu^PIQst7>sNW4BU_qI3dk%G&vq
ze`l2Tz)3?BLr?v8R`BYXZ~dJ7j`u~n=9AU8Dkbt#=!p_oHFtVY;;I*p(Ld!AqFMJI
zjh~H2JwB5P)0iAzGF&FXZ6+~PYP*B*$|z->z1Bx<BV)`6i-qtOK{!y79Qn$(Z{i1m
zj#MH-rHQ7KkB)!6WGX+D+gnfSH!XrX(>>SWp^N;F@{b<R+jE^W2|Qb|%iN!v`et_@
zu_z7(!6MXhGLd~CbzM6uoBp)W8+qxOUvgCYQ`VEp$a_#@^y8R)|2Dq{V5g;2wQ9O=
zKGhO#&eM6XZRfO5WB5AL)5U2~tIB+jx)<uGZE&itM<DZ@zezQpcEKXF!1AapMVJ{q
zFj{v^MgVUS-j-w+o+%czO8j^!q%G}%J;oiRSAPLm^DNU(2XTM{ds60j8of4qRHt9a
zRfrTn(>-cVpUt7_-mPX9o*xF>XT$P?=0*-3;yP`d#Hgs5wxUI#BaXrrJ5Jm)mB~ZI
z80<E*>3+`V+1?898emjxZK5Bx-#SG%w+L#;m35gzK5(?Gxz+Fg9(;0Z-4~dGqGc7P
zKM&kjW~FTK7od>kAS^%|j02CVYa5tL8vvvU%6r?u%f0_A@2{97__S|M`0~xoq@zj4
zeOmzr&kG?(3Tt^ZF0n@M@9hY5+PXh#>{JasL2Uv<Z?Uyj;T6j|J5_Y==sAr5dVq19
z6YjOxQqgkBd*lH<AwDE#sP8t()2!oJmEvyAfCkTFiE#zt@9ohf^^gYJBkZQQVD|Bz
z<JDY(1X`n9e8DiY3-|e#Dt(x=d#u!*t9JLP%$O{oty`^p!IcjUL2fnJgwR5?ndfJO
zf`Opp?E(7yVvDB3Xg}N2I?o!)3*dv}psF*kynBaVl9a_kV@7hnrZejV_I8)!d<wnt
zBDveSf>syY0L1PpY!J9!#a8r8w|iU#)&WkB9tOt32Hb;gtP6uSd|SO^mi@pfh9==P
z1--!hs`d$b0`vs1wVx3}Pmo~+wjC6BN4q6?W4<UQPsH@7^7l=9=R-DKf>xVjc}+7)
ztNbuqw4keIi6-Gb1r3+x?Ftn(S1aEJS`Qa1h{(r8^b34faseGE_0&tbFOaFiDhy^s
zelh@}Ytb+ZVXjqABR$5}fF~u+w)j6mvz?UONk~ZPdm!J1it#GC<vg&La)Rx^i3^96
z^Jnu{c2GGY*=T1_Oy0-Ryvq*96wfO<KD>3kzUP?fNs2JGqkl9=Y>+ORazyk6R6RxM
zTO2-IaEJYhEPD<6;xk4HJ;d>+#}u7U95a2wtmrDtUl=+rXPmJRW1xXr+u1G4(JC^_
z07H4Jy)YhmPMC7gAZb7+H*bfKr0lv=J99DQiYu9|@&V)`Sq<YhIhfr#5>suwVpqkB
z%oGk{qWUp!{u^6OVvhbHo5)5NQRZk*<+HAX;y8klcCFMUWOmIqt*>@ZcR$getuB26
zmT@W8CHOKMgQw<0jWcAnQ+se5m6|AQqr>`)Edd6N-UWUcyJ%*dp$c8Gxmnhb>umHH
z?c(J$E8?(*r$+Gd1g*xadPEzixI<ZZy;tZ_STJfGPmLpgKpJPvKkvaU{jkqmx5+Ra
zY9z;|;80*?@OrR~>7*?ZHLz1$q`Eb6u*69Ja$HP57SRzy%$sZS%Enxt^<Pe47ExzZ
zy8GnFsB}IiGhlwi)86|6b0Ajr^r8a&IDK%lt{<_>Rqa2C`rnThC6CF)R@A^8On#lr
zz&zRo*MAcAnie4x=B53Gi;QPSwW<?xE31^q=FkwhS#A$bBQd&2q1H2~C-SLfs~~)q
z9;M-wjhWmCD1||Z2J?@r@O1t=m4Kw^YMW=9>We4;j6H(FIt+Y9i*cPX53ixGF8u(k
zyZx&iUjl!D21uJXz04A&sT<`Zw4<hXg%H|Z8^_<2KD<zVjqhHD+u-KpeV<Ey$=vr7
zLsYeh`JU{o-08ay8&jqzaXzBtet|f}fQIl_F+o4pLjP7p7X)ro9ZhX*Khg+IVf9U@
zQZiZrPHOP!er|u1W}wOr0(S_79J2-PQVTN%k3lvVP)MH@7rqm~2M&Af@$Q)3z7aIq
zt$!F6efJao+^)uG_lIWex!pph>Qd5cJju5!w>&DtrHf<n3x4KDDpOE*q!)__iEkc4
z&PHs;uWt2J^DiF5Vg|l=6DVf+0|yn^x;a5k*=-o&;$OG@R#(=&^t|gW)No|Nixhxd
z`P3=Gxo>FQd%8}@$cFY-=I-F5fTKy4z7_?!y$dcgA}gClfZ2*X1?y6wkW2i-i|FoJ
z*W?}@_$#Ds4-CrOeLPO;e6<qlxN*i7HSpjG;2w74#A?klsbwysHK;L^vm@CUYEhPK
ztm@6;`LDeITaolf$LY_~V8JCiMmXa{BW>qN;RpCojQ?94KUd`^6MaGV%u}E0TLU5u
z*u52Yxfx5mQ~C=8S&Md5Eruic3wGH0ib{BU5C7Ud4X6bu@g-h716_7!+^@i*cgEiu
zgBO=ohDWt$uIAp*(;<8`1&+)&y@a1pEkWOKm)gwo?y$ApF82YH=BA(S0oW?=A0<7u
zvsvEW#$a!tG5fE<nGk7sf>)Xu+kJF~nHa{g4)(i+Z$uSx8<s`mFZA`4&8x@@5*T`R
z^*bO6DGH^HmtA)ro^OS^-T=JhJA3*rn%3PoWpcR0nkf4wxsUUU-6~i03bS#>%c7E)
zyzP;B;OyQPV9>QhaPyNM3y04{t%D^Nbx03CUuje;qGawC3b|uecJFxV38F{xT%YCQ
z8_m`)u0-Q-?r+47fYYH3r$>{L+zY3D8F^lCg%3H6B;-YacGvv}O&u?Kk50;K?5=ZP
zZqmzQFLk`d*xdMN0z1sJ`9$(Dv9HqA><A5^Qlz@bT{=AfYRu~ZL3N;J<T?6fbA%`L
zk>a<~!Xo}a3WG%-<;48A2Yg6>$YN&7&-NtB@}OA^B!aQsVv}-mJ{jWsI!%`_Y`@AS
zwR!@D^du!iR`3SY#pRaD#;SZq%i^0wvcL~-_muIjXEtF9jfNM2X<*f*p)Arf=<a_n
zW8BlnWWKBrG99f&3Vo>mSunBMp))av;}um?sgY>*1%0@qDDalfNT$VB-H%VK-R3@$
zhg;9Lgyl$;o%ppPJpKd}vC_~M)aL4Y1UM>lk7u;2JPPSXo{iT9YfRDu&M=9K7(w^h
zq5~#emNR5vQkxro^|rQNil`o)-7govDuc9DRT^sCJB#j?fq<_>VPYjwL3{8<fSKyv
zHiP`Om>6@)$@OP-=l@igSzO`mt5rN>XHha&FJP#^S~;~?{I+y)bI8}Z%I9Oy4#{L#
zbMuKZ{uHs%PaWpwXcU*CG#qJ^xLNx;J11hZ#rE`}KjQQrV~(6+as)PB_Y~k{^cX|q
zMCN|H{%w4<?j41faa^yJFl+sBtKv<<@=#lOoy-~I3=yBo9-S|Fz^y2?1D~}?$>1sk
ziI^Z5%gYCveM9cB&6VlE`FKc`uYR=6HS8KgcMn5ibO4QPr7YvcD=ZN`Iw?QyqK#-#
zpfJWP63{7JKU{n^1iOy(zAes=zM${020q*r`e9U!Mx$nybsY3)=TrVJ{T7>)o}_Uw
zed*1wTX5fb+5h}aG?(H~dm9%&Lkc$qJl->Ui0F#K?o0sz7`RjP^8@1y$fv4%#uKlK
zN<!f^!(Wf=)G^y6k30^9UIrj~?S#KI&ba*5rPmBodT{C-HTP`kfT@7*Jurx-g@##O
zz2e$dew73o{;`-`UB2rXpG)FX^g7n|T&V@|sqs3261-wohetM*epV}C?8f?v+VlQ0
z42`r<{~;?;j4*_T9=q|>a?t~&Yn{b|Eh<t-sqW8ReVg6W*{DyxjzX^NyaYnNYRMYi
zx;vEFWwg0U=_L<xsk`1bcX%LSZgxJzI3s{%6)y>{Ac~`qd-Dw6pPWFQ7s~JfX!eKY
zcA6EUWjsbaKR>y^+bf33(vwi%e9Z<?^w%Vi-Msa@Z<0V{%|};a{UbY4?*L}ApSS@k
zGxn<Nfq6EHN2Gz%?l)j)c=&FoaYkX0LTr>AX}cB{Jrw1hT+tCWIE{AM2~9T6K*<4K
zajGozQ|@X{e^n{5W&(&=<VoD@DHOIDG798vG93IZ%iUSf1{w@5UEq^HKS&hSBXeru
z73U><do+kOkF1QdW!P$C%I6cO>5of2&UV>+WR72CNYnkm{1lUK35Ef}>tB5Ao`uLS
z!zw3`v6CbGnU5wJjF3e<qeEjdTn55wdvaGzR=LQihS~A{gC5XmR;6mCT)rhBDf_y)
zM}N)VPnWr%JV9N<Kmgiha9G99h?H?HbCDZRFleY!%D|*H>1m6tZB_YNnV&8JH@t>F
zA3aoFpSv13f))Y>3clCS!^}hUjSH_L`dcgrCz@_mPCGy=t{EjM#J-QV-oqjY)vabW
zu`;`AK{2h3h-9}YkgJd29Oy&C9n0c>o7ewyhi^Jd<y6bY=xOB&9kBuyOuHx`d{s(4
zhKp-mjVdk}hBClQUbsVzCdSHjImwPFth;`CX^~$0tP%>F=2@*IPwNRy-VI8u20BcM
z0MdF6ytug2xOOPldG{U)nWj_KlG`PP$t{t9LFXO#mNwrP_0st~>B0H@lLtq(X$ExN
zp@WS=&;JeqCYdx{kG8l&uX^(ho(ae!ze8LSOX`(r@AJBf%S+DED%JiJkISzl3(4t5
z?WD|+s`~LmZv?dRGA;-(QEhk$Fr{!U9{Y;vd<D7dru+nFo4UW*XI#i$@Cs3NSzdh3
z*v{_oKK!Va?kbFUA!ix(XwxlQm#~0=MehME5Zq^t$rxN;j4(!F;T7+5S0VN*uw@t3
z^YI)gtjyvU8yPFbY9-*lTZGT2J!qNEL!?CT!`R$iUvdAi=_cw%5#2TEMvDcb`JCCX
z0rq+P-Q}ZoOr0L(T|@o!&}$gY7`(s1<iXO<`9|+wFt0f=?do~sc0zW<&Y!?qU7rl{
z)%b;X9Xxc7qUG_2SPBY3Jrl?(m$wjbcmPFlKh?GN(O-*bE~rwXJ8hit)Ervf_P%}=
z<ip7KkGoIFK8E``)4uQkmfYiT6sjCYZXlH0uB1E<pdo-NS&qLoT`u!7#pmWE?>>+k
zj&|dae!9kuZR-?;FQeXtA5D@tD9xHAVQc)ipyRm!LC2HE-#B~=1#QVd7H>;lbN0$0
z65KThPEj0HzGq_og-fWvDcOZ2@A4^^SL)MAqt{`LjNe<h0P>K|X^>WH2Bd?|-_O;O
zZu}ba3GDo=O!A3L?9g1ifKG3yW|ZZ!bjTETaTbNWjjNd<b$e8zMFy7#4NRaPb<&~n
zSHJ&@|2#18Z*@t3x)*luOtvIWh5S2Q@^=KR8=!w9e!|1EM79sVJ|XpdwWfdQCDq*`
zE0CLdlih<S%iSkDu^n2Zi8WT0t+dtf&%4j+4bL47h)f94{Dppe`}kxJKucvzhgg4b
z<Gmv(Usa2K-+GcT>~Gtpt-xBbU|BqL!Fw^Z$8xV|go2`HqB3H7(yw}3%*KwX@sS5q
z;Ca##FwY`x+CI@=C5kvCkl!q3BPYCnBKYc}s45=lsvpxhHyq<)iDk_vKW~pzniVk$
zcL74XSk;P^dJw%CcsY9n<he67$6x#ux2SsL0q>u`{4Sxr?`lmO5BWZYd?jA$`cA$P
zHGF~YdoS~%N&*~Z;Q2NzYW{t7^QWcf#_K~ybmo^`hbF<hSAhGKYfle<3>dp(S@^5?
z6t_b$-?>X)e*5PfV*)#tr^?T7c?+C~QdfPk!#9-3j0SJNW0sRlQn(HFcY<J7-aUG_
zGMXNvSc|H~UhLmYUYdDT{9-7vS_=F}#$=1_tDg@4H+3X<2YA_@#mA?gf8cYzhE5aG
zF)f;{+~_+n%PQeS%X!wJ<(evyXk$d45*{g|zS&sGqToOOCb4H*5uc*8&QVz$F500#
z_hTR8Uz5oUJ8-asu)_1^)K5<sH%h@B$d)sSby(llD-@0Paqa{!FI+{hm&%!on|su_
zU0^P&u64n1IJbvxSvi)`9<df>Ji+oX?N?9Lsr~o>-r4*50K-oP*`nvGWDx$O<eJ9=
zaek#*&*m@&DV#r0<bpGS0CgCg+8J8uIa@7$E*)aH^12Z9f`wM~beizJUxBU6#J$HN
z-g(3N^JBSC<i7Q@26K_8=4BA8r|Fd|M^y;hSG7HJ`^pkV?I6QGAspBv`L$_hKW3=D
zyCij=T<9ekyY{Qfvu>0A!eiH(lmn=rSFD`wrb=lwXNbaYR7<ZPz&rHV^U!l(o3R=3
z{DU4j*M~$9*)Mh`Coxm<bEy!c-NIKg3ZBK5Z{vRL)GKTNr15H6srV_%2uxAsf8yr@
z<_+l_ww?nfF;p|=qOW`H8t5D3(<OS9OapH#qK34DWoq!Enqt4C|4LI+N`R)yBPLz`
zHI#pT^3^=*Oh4aYDeCFq<yl%6x=ANeB_H(2n&AI_6il9IwHnq_WB2wVTXw$fXsaB<
z1o_^AiHoH!Y5fYOjpWgI<<`EHoaJw$+3!3_JO#ql%FH+MkD68e5r(jpWL=JAKX#ts
zX>T4zAXL9u!(gRt7uy!P)n0svbo8hT`4#h#WPOmr%F-Ze_NVxCfWlHCRl@a5^aUvP
z#QR&7ll=4Ux_WV-pi*as=$6Cx)cs43PbsB`EmVM?-h}N+vFfCt*VE~)yv2p)#{8*p
z>K9?E?J;15f{#D4{Uf~ZxE`mAU;H-$dl_d0)7C7oLW%iW^7+S_HJN5Wk9-Pt^FIT#
z^!%HDkxmj;-s=X9D(6fHaR&bK!AgPe>0}=}^pmm6j%-t}j?>)Ac(LQX{MO)fdP>cT
z=hzeFgxX?mr1EDRLK%ViaEhh~x(C`6L{OWZCBtpI_epDBy(FdRKNSRz{G!BE2B0<c
z)+O3~kauQT-8ZxWR8Cy8oau{0&MHIEAgDFi#<b{I^B5S>+W2bq2pOKt@+lqirjNq9
zd^O{XiS(T1DFr90UwmKHU%(?W+%)}qpU00FGVVDAC{nI|w`htZ!Y!TV$tuKQBR;PO
zUq;O1r?i3g+d14nqO3-=z&tCwMXFv)QKn<V@}oAb@D5W0u~K>AUBbUg*|N7EA2<i0
z{>jxI*YTXu%SWiG*gbFCPM$cK4A&}_O{ne}_eU$nqPvG|gyUuL3(2Z}g<2%l%AMD=
zcL_Z>c;4Bg*tYS+bDVefk6ONBJb+ATWxR&Oe~yp5zGr4yc^+k^fND{CYP}K3m2jnM
zT+L=(Zcv+HPWl;6tA=F-_v?)4F(1z+$g~MN83s2#uIE|n+8BNeGkM@=%h(fQoXTDr
z?d%+{-pBJ^Uo0rnOVFmV)-1fJe>R=BjI1@?lp&JuZ9b>W`=S&Xp~C)xs77+XZqu{b
z8nP>OUO9cZcVkJwgpA7Y>{PF${e@Rppa50&ZP|HP?ju<1V}^!?fAr4<Lq68X@F-`<
zyqVxwX0e69`?>GG4p9|;K@^&Qo)%BlLE4pM{Y&^Ze$C^$NlzbQF7P&vmAKh?5+HSc
zG>>l|0kRIDx~b^G%f5qTe`njHNhf?k#S?|R;XYzj+&nz#!~Nu;sz&2!7hBXP@kjk}
zW*%AVOZRUmu^KFA3&bfcR{MZN|40w>uPnY<`2q1o`E0Y@(_i?e;JsAu4};f4g8%uh
z?*#qO_z>>dnL^%JAKtBtyhRcl7KNy@LD=1PsrRGG_`<btyLUuYE!ss%gRWQ3>m*O1
zD7{?I;)@c5Pe=xac^n*!v0cemhG}}4e}?Mf!gK!kg}^gk9uQ1bc^lBcyHDT~yg9OL
zW{#!Lpu4izMxTL&&y{oPCn?Ul8RmzYErVQh)LHlt3U{GfVT&X!>9{^+YbI#sjoDp&
zlG}1T_CC?MP1*YJ{-lS=xnqFL@6tU5?_=;5MRnR9-iM3n-m!MPQYN9$DcTM%gonOl
zI{QSyN_mb)--_guFM647A;0Ur>ph`tU0X7X!TLz-_7DdU$FE>5(So%4@Q+hQ#nQic
z?6zva&YT?DxPL<10I_t-#1B?%!%nt>c`qC+kr~|Cu~XzDV>8_*XplyggzGNPI#W33
z!&_BPOKai-M0OS*ZYx{E4YrM>&t?oR5d>oJ2Y->!L}w}HEy-ux|BdzE|D>D>^kkGW
zM5fQol=$>bVg<|A&Bt&)?0xaH`dj6duI%zO*(lK{+({L^1i=h%17S@AF8xH>K7HMH
zL4m{z-aVOXFS#T0Pw<B&s~aRabDWv|1^yq&paIBZkjVHx;p+ee|ILwx5?y+gPS0Br
z7=F9V5{cmyzCvPb&yI<n)qDk_xj6Klg%XHcqdmR(@@SJ;SLp6XBRJDxCqm<v1Bmtx
zp=WF5{)Nb$ySwK@@jKOh#in_`w7v^<$E`GF$MzGKZ!w1&@7%C5J9gGwFs83rFSkt@
zCOnVAM7h2VO;6&S4>cmFMbWOZ1dr9zCZ%>QC!Ed*SZ<6|cUW^@eT?T=72}MIahN;N
zeW${r^Gu;&tZ}SSHmxlL`NHkAhG$0%$#K7L1@r;Dl#bG%`N@x*-?zBoYt2(2x!B~D
zS;DFq`OZ>lD-w@s95N(DGt0znDO>M3Ms`b7=3NgRZ^warr*)j?gkTz(7*X3^uUxT&
zletO+Z+Z4{j9p~f&2#<mvLW^;b5hsIm-}OVfgq~5gHU=I!=Zh;<N}FQfDIVs#=e%;
zOhB*2W4j=Sn*u4<8-YLFgQS~ydlO&1k}VLUh(3XCvAmUN$B4h4v=wXz!9HD{^|DSw
z$MYVmK%P8|gl|?v%T=O#>~b)1#ES8pE=x7=DV=x1w)<n+77G9;ePvgFj-|e0R|^cK
zUM48W#MOwkl&w2JKhOKFsu^&ndlm<Dj%ro#!$sY<>TrVug%|!Ow}d*<L4qHNe<7my
zCpJGC;ViJbRkH~9clkxmZ$AV1lM@4;sV}j8t6eClgyQ6D7Qdpc-E?niZSM5`IH2<$
z08C3fuMB*>YWF`ewXW|Ey$JpmZgp59F1Oiu1gc-nd{5pf1FMTYD<5$^Rp)T$Lgkui
zNbkl<H6`H<GrYolx72rf${R-ZKBt6?M&oKnU^JQfo@63gPWMu^kddx4Rd)HitodIk
z@4+FUkrpTa{^xogI2n*s+S(CZvBqG-iz)GQ=$ta8p^IlUS`clG*J(1}2ds>~GoZ`V
z-%T8CdS@wZ+x9F|F~%Zw9wodJ8G0HEaoF;5e3$R!Qo}x5u;X@S)`>3Dh{;Z|<0?A9
zMvMIy($UzH4$v<;{S$xy_y^}dkM;$Y@J2q-Nw0~Ts$6xrQTu}Aa6TCQE@oMoMKJ)n
z>#Z*pS86+!)~68lmEcY&xIt;R?%<_$*>#m%wBlsl5d-jliaqahyUpj_Cg_zx8K5F_
zYw-TA?`wS#z%vGthwcB_R{P$ZOfrvs1Exc-(B=Gu%@}87MV?GT@Pzb(KHX)hi3kLw
zr3OLQzrhgWg+ANH!Ky6rX7iy&@$9w(8N#gHmnlCluDb@&DB+-2s4ND8bmC|3Q*>D_
z=FHic-n;uG9H7}_NUepL2wF1u8Fv8sYH{@?zT#awfUkI{j|*3jNv;*-E7*}28BF2{
zU!(JKsJIZFugu4nxs1Bas>G^#mxk~)BU<%%FEWDzDP0G{PVaIYpTu4!M841ZMYWhj
z<D0$B+*AIB-+14PvwZd01o)lr$Ptk5#W#bk9iq|MtG&sX$p?O`@n-VkQJ{4#maG0J
z3GHo{6<bNCqQ5o1SL2b8n3P?{?tYM)(b@ip56afrtI18>ewg-Bfp}f5Jk|Zhp?nm`
z5b<i`=k}F{vbu?Oq2yX>uS%a=u0&Zpw$vdM^vc-@v(7;S-JW5>jnf-xnuFOiffNC7
zV<YvYVe_P|xrbcU4r4J7FDR6~mk7EoT*D35x~yLG#USSwJbmVsz4UN@?8j;2e;{02
zsI1d5j0hpx`gQd)^h`_o>&_ax0X$|xBIu>n{0F0seo8{Uv!qqS^@!lfPwD&v+uA4k
zJ0NP+MpaHedeM6A^A$lWw}pKaUG=XcKznqC0cd|=fPsvFFT<MtOqT5-w$3|$;{}C;
zD=RbvRoD=#rxRfqMK<N7#+Et2Jn*Jt(Ed5p_DPkB<9QMqCH)?dY50{zG;wNSb1V<j
zwi6P<N=Jzs&fn_1I15lvn*_t2doRBOga3o2Ee8);$R<+K#h3l{pZK=-QD6(W1A$HK
z6d_?w$;CaFdxh3B(L0y!y@7pwp~^Y~%lR$4-8T1<7l=>mmSfo`l+|pQY~o+73e7j&
z6|@yu=*UW58EzO{DhR2)1~(tF9&_Ntu^Zil$<=U`yxup;wWtHhz-hI9rS$vJqJQ*b
z@mkiw@GO@)K1(EXWscWg8Cak#-sxEwV3I!tX>igN*HQooo3)Yo_Lxz{Ry)@lv;hVM
zD`RcT+ZgNDk|U+dnHf2m*fliTBX%uvRp@ycP|3Zt*|!&E;!5+yO}2Pzb9v$WIP%X)
zCM#s~m2bi__)g5n^A^VYKnha}7+ZCQ4S7yrCEN^+BsmBXyr8!oUBdT@=Vtp+OLs_j
zc8RNhp=J7Z<zMng`*3Xg(OO_-7$9`?j-3GW?RINFLy0p7NN?e0ma7g&zK3=HoXEH7
zDA0cEh{b!U_$WFS6u7RlZQhhvdL_|gRgLbCG;;h%WDt>hah%E;$ULI<W2d4Gj3Xg#
zjU)$~cm2rW;LEPqC&-mZrAyW2?%FdTHy_U`C1$@>=w`;t%hBRN#H;(mJ&PCs>c+2C
zy#eoA=e6b3DfLAp_^l#7lLUs13G&NV+QP8;lwmQxMHrzc*1HvJFYQDXE;mi5TL?3?
zgq{|vTo|m-m7lLFQpq3+LH#hnuH7F%w<_o1YsvL#jujPIWxD=BG`pY{qFRX0`TYPm
zK#y(=1MfE5w;DU(%Wx(u{mkY`#4NVCdlbn>$Lu=(a@;C!0y`CivJzEFskGIFmAqg%
zv9(;N70H+8{h<D-IiwRIYGdgQ9b@s>imi~B&ztAs3DxOUwiYo=a4|!g;B%d5vidg6
zm&~8b^>$UM?pq4BW$#r5pIU?Vsf7Udk~l=qz7nX!RW0R_Tv63(9P8Ex?mv~nNom!}
zCb_#bi8_7z`3<$>^yn5^(bnrx!H3weZ95JYKtjC-!FQqnf}f_eWa?gJm@-V;(C9EW
zztkjwcQRX-!S)Fzy_Ab52<p18xJKrH`cy(9-|)=7W+ld1AZ{2}ngI~#nf9kP+%~~0
z3x=d+y!<1&P;W8mSEI@>lLx`{yq%hxJ9d(2W=)=_v!U$htKQZ<PtVcf^2>w^g9dq+
zqQ6#F4HozMjZ?0*QrfU6d%8S$jsd8JoOou1eeFF05Eo~m<H>yx1rU#v?|!Ri9D1~8
ztV$b`H@1^09VN?T4K(P|rCv1J-O$_3gu!=(`{4siuPgg%H`VgkO0}}=vehIuS3omJ
zVH~RD7O2SPB684KfBGnnKGu9l_^HicVQ!XQ%6=vFm(KuWVLafczOSMGA$o*@VLa%z
z&_|s<FO@HBl0Zz6(=7?P+vir<`XWZ}0x$G%m#d|8Bwlx+_msQ_VPx{HB5$!TXUuDt
z3rv`FRflTT+pZz1vo&4)Gi$>92A7O#T+BO+^#tbIU;^_(?-K?nZTK<#oOmSk@Z5ep
z&SVxKFEP@HNB_P_wdIg+^n2_h|DtFEF1jplHxj-Gt>66!V&;SiwZs;_qqM0U`>Ra&
zX~L1N8l65SVE*P~#mg!SawqSYH$8|uJs^nVh7E^I)%bIOj)0?=Qk>u6JzffC>uu!}
zh_770<5#bH2(((LaZ}nbFfhCm-p2)e`}hG^MwEFlba7vu7OP&+&Afob`RRjoN|)<;
z)-6z&Nqq@KZ$q;1PAl_P;3GxM<jEAbuGo$3yNeH+6!=1}r1@lY^2v`q%%URHtkGEl
zgF@bwEHOuhOVWMN8E(BT_Bf+TZxx*ByzMef01DN+yvQFqD95(%0E@s|D!k<e*=cKE
z0RlKyk)pfdGvv8*2H(AH{+r0?vf-3k&b9bX|1>I$Mk1$*XY+wRG9fMd$VXRX8Jje8
z7qQ_i9n)WTx_Jjm*>UM1KHq4WO{&acAevS#yXdtBT^q7op!5uut&B%iO0*}uRGu!~
z#gC|k6W&V3tvk*(H&*WCe60BAE`VF|X~Q}!!`9@nFd0`Lom%4+@x~8KsLm>T-Ht!K
zGs|d7MAz$*jB=}FhWF#wBmd0mI%j_Uh|>@65o@vY-J3pJf2i{LyWyZo#DNq#<L&tM
z*G~)gjCat8E|V=RroLor3!1GeF(C^RV5Z~uSx`PdZ!E%}eTWb^$nc(M<y$@eYT%MX
z#q~;2-v04d4gCnOn^~v-o=l0;-0MPXh9!!89prgj7j<Z(N`S3Y-zj1@`QUz${A-3C
z5%slbpOILbdDZy=5cy0t>uN6)<5zffFr~M}8n>|gIANYNqvF{ae&^g?6?)uGFys7?
zM@g2PrwmMMM|z<)cgUt1!(f?FyRF?ULiD$>s4qd&E70I3h2I+TP}-XB7QMM{jmc<h
zxJ#q5bv+`CDue!NSG_2r_qdbrWkWU&E0KBD^CJJjL_o*(XsSwXV}0v>Pzo>><v;Y!
z#Jn%PfDCLNpO<h!PWFX_eOS5`zdCXv?7jNzqLQ7Ws_o*i9$k*NV1H~dJJ1-({&T*l
zTJW`Gt~5LV=Bk_GKVQ1*LGF1~*813>U#fe$z7phsX1kk4)oJg*<9lvKFv}?3(6BAt
z)$z>*XmKZ(wH$TnwWdd+w9z-qDaExmjyoS|c=;$(d{%yO{Y-!cT}0nTA$e#qx7X3f
zYq6f>?m*tCcacv!viK>YpKMpWq5pz!*!-rrEu@T?lXp2bzbQXB8JB+y82Cj0%w|{t
z6q~Jn!4Dhb-#Jdz6<?ys!8{-tRT%8VqEOhGf&TS5g&V{8+wVKlV?cDw?vjilSQB+R
zt&s7wmoZ9j9j~l2mU~&`7cGwIiy=G;awYTU!X|6(Og|Jpx4q`Cy%=_WHmqxX*runD
z?1O;xpoXjR*S9?BIL2_a7V?1xFJlcq2nRP?O84(^6O;3#*Pr8z);cRxZ~Lqn?c`v$
zLyY|7FwSxluS%qpxOMrp(Ju~T-##B+^krLn-0Q1G7rb`nY4(w(1nCIdB(?i#DX~!e
zba&zj+%tyYXACq4aKgR0@`OL7I8Sw6Cacn?#jncP^Etn}2dOZ1%wh+cc@s)5<3MGK
z`YSo!*`<Pg@kQ%4Jzn$YwzY5h6}4=kPlmA0D&6mr$z)EQw_gvgvX&hfs{G&hs*@@(
zm*%t>U)w9;!zJ^=<B$2#&(F<n2pdRzdFhEzzKa|zULcIz%nLWWv0leC;YX4DGVM6R
z$ouxyr&cH$-HhaNs;<vx^yMw;(PrvdHD68xuakLP7kqMLr6Cw*IKcBQ5>Lqi<8D)a
z-uSAl-7$&%Yy7?6J5TSfUvVXOG6FqwBS2)f+uSzMkNmoWxwOul{rz0_{jR&8QWHXC
zPR|}h%iF_VeM!aF)R33sKUpYQgZed6B7rM77hWZxW4_pZWgRA4?AD>)dY4_>ykVm2
zr997_`&zj>O9v>d6K6mkHGGEg9<uYxUgjM-3c|Jqt_{bb!ardJ6N^25oe}fA^3Feq
z{6EmIEm0OA5V)6v#^KpselznD=MD-D8rPy9J@4p>$#3C#LC=$_qbJL2AcI_I9nxN8
zJsv;<30!ps^-Q*PU3LUTP<aR`ReK8;$F0-xC|A{9;X7{5Z40du(bv;T81qRA6WPM?
zwN>6`3ojX@A1d*Ii=VZ{)$)5>VQ$N7<x#VkZ<Ao@et7W&l(7wBwxp_}^$lFYR`b0l
zG!Q9WGBXNS7x{Jas@bcrZ)hOQ1&}X{vq85j*BH!JB9?c4O7<os>ViX4tKPx`jNj!^
zOT)*U1sG9IhHqjp`9Sk!)8vCp1LNjl#68-NvD-Y9HbmOP4JMzx>4Kwui%vbc&vHKZ
zG`8xMAhLI9sv?;ELHTt8_a;I#LSg+wKv8%Xg$<bX*jt=*ZjGq%ba3qZrYGqUiVRS2
z0$R>`f=GicZ6NurH4~q-P`b44G;1(nGD5pTOJnxHQxfMvY5pWmMc}>bdq=FBI|JC;
zoOfxYhvj^v*k2F+qoMCUifZol{+HQgbgdLS2n-=RptLKkA%xCS259z?b{xQu_d=00
zxVwbv=l9Fj)-5ppg$+-vr5_mBKzDDKAL;y*p_Hc8Y|HNzmSD(u^>C1Z#lh;Ko0}(J
z105p0)l#09+d{mSW?&80oB^|y>b{MlO4u!-;yQdXNGi~kC)HKm)EE7#qiiV}w|=4j
z(OOo5g0^;1cFULl5?^&mK=WPrHLHxnf0hIPHNue$YQqHHFz9pVY`JpDFKc8ehc3y=
zY1uGdp#((<SWSG$>4_HW&4>3vFnjMCp?3%h!clrR=7Xd0hdqTV^IJd0gm_jKs*bs?
zZLMY9eF9a8*Vsb;sn>aT7HqwUuDq8X3k7S`ghjjIp4xD~)$ovonpdPJ=|eMSyi=@O
zDW5S|sT-frct4d<{8cC4<tYIXD*V)B|D@56d+XnY?+XuIspvTdpSN;m&PL=0-E(k3
zdwnP?9H5_T$v_QRgJ7AZQI^DqUx>aT*I;U2M;G+rX3bVtDIbVenwL26xSyV18W#)c
z14-4am3#4I*G~Q=(ps0`aAa$duX3v;hQx!{bkH3bNJMAQYt8EC*qGid{h*P`4~iX!
zmLKVz@kLvtLiLM8bJ-8|{Ko?<p383Wc(7KT_%3*UXF+>)pT#p-uz`w{9`PJfJz#Lj
z_PVW8zg5d+oMF*QIl|XYGJ3E5Ecpk~tVpIfD~_lSuj*TOO`9)Uvh3y$Dpm1lQcbpC
zRJ_}K0Y7GQsHaM0?owq`Y&|-WiNWyxHFQ_$)-0LmiGOvZkI%4^34f9%Yq0ld7IUI0
zQhE#@Sa4+}<Bn%BRsB<F(Abk#5g;l0xs|0!Y}S8ijwD@Q;tFY|O7uH7{$2-r%uJPR
zYI*v^atTLqRs!!_Us?`xp-!whi{6U4uy!w8LuUuSVSa<L*?U;C>0n47f}@jcuGUIv
z+jyiGQML2N=YDxXYry?hypwi<V7^rOgonq&g{C!E8GUkh^9Y`=ztoYY$Xr@83Fw#`
z3mXLU7P>xnOJ4v_u|;=1v7Vg~Y+cinZH+xVFV>zM3%y)mA=p7hm$$idy$&bIhYJHI
z@hHrU4f?T`U0rPB#_5>Xvj6uPl`phQ*Za8s+Rz@?9sk3s?WpVXnC#Yt9()Ne-k+>{
zxJRq)wgqLQ(hOK0N6Ue3u+%G#?|xDy+-Y$CX6XRF%fc{{_t*?sf;LL9Xx=uMb-+tB
z+g#jX;!9ha9Ip~CCk9XM^9~hRWRGIXzHYn7RYr~-r=nmnqahCSo%)mqjZ(_EL|4Nb
zyh78jfvJE!Mjp>Gr&sI0R-5FDEPpriDv!kwoP^<jYHKcoHZdGBa(@AJbb>!n)ag(0
zY=;_8|FvWHWADCBnPCrJXV(6?@WV(KJN+Bf7rgjF!aLhs%s*qTEmFW3VN11To5bA_
z<zweVZ<`;mgOrTue)dIEuGJY_PM!3I+;WD%FW2GDF48@He&oX>jBpj-{CMCMZ^!|#
zg>P94g?*Wx3@@c!h;0A%Ba2chM`QJ4*zH%`YbFERz4{Kt`LE^}__1LVj?Wht+bUj_
ztpw&5$?(rAX!pw6FIAfGQu#Vu1ZEc<eBbiT^V8m?JFs|SkL_*Ei$6iX16&#ihZGZg
zk4HP+G;TtoNWsy}*5K7T{;w$A#P#SG4^vmk#_Spz3M<X%G0E>7(IWN0E~O|X>$$RT
zjpsUj`jM4-{lk;Xz$+CZ>N?CsQGL@j7P74rcKmUU$ZI=Z;;BnkH~rawr|gq_Gl^`i
ziXpYwK49WmQ2Xn;obRLtYUP%t`g3}%eto)97l+asBPZiD8F(dCza8A|em&Ym@U0&#
z)l5Ap#l65xeePb4Ie3v@nMc5q{GLU!>0Bd+Pg3i7%dwhaj4cRhIp4>Z<|w&w+-;&K
z3{4qwzziaMR!X!F^`R$rO-i71TYMHYyM^R^dV|vTu;wmlwqH_j!y`GdKkbNh40}*S
zPF}UW`WGUF=ahNvHJ_UXtYu>fTTa8ZSHMxaX<aOHJuQbbJw}0xYT>e$8^ps;S|7&4
zQs?5Ys}c3QgoE$Sa;!b>d*F#U@nFqo(zkdrI7+<r{xJL%9A5a?V4*ezMmh)LTwLJW
z3JT+mA2_S~K$1rxC;mv^40(*gUamX;!7%*M?v5t!oE!MkSCCdGwg=?Io***$Zy$gr
zNT0O_?vwu~7Wx)ntlK?+IH8^Y{96`o*;7S}kI#S<g_Fhw^qYH8v>By`EIX)=pLqAS
z<Eb^7{wx=zt(>5m^PA)O9$&KlGy_TLU3-Wby+07Ph{SxvvC_sKOuH=+76pm`&~W#w
ztCor;J~}DoNJca8n|u~!a7iVL)`|(34SJ}j2sTSr(g?prE|2xSMic6BA#XABgA?7;
zi51>jT05_YgB|Z;ZIbOXidsKks~?JOeLS@|t0z1En5xrm`-!t-7G5QGZ0QX-XAzRn
zLutosEp@c#_-UQaiZC&`S|2fl-XD!0xLa$`IEVT5rF0)-4<D>3%Ev2DuTpUCYR!45
zA^j<?blcBDo3=<~Cb9C$#pyI_W7;#stqZX%nwLd!)G<v@&BBa!?W7I#>RO&y49N_G
zcxKsRCKvg^N@7dHkmg5Q<kNZ8-i4orAvlK>pCQdqNc#ZfMyTy-Vl98a#+FjqY|<h&
z$zxR(b`@=fq@6ZVFLs6Fre7avULp_STNfP8yM3|PauO&`T?)Ooo19P8w_TE;cR@5U
zRx8h_-R^;RrpjXN!&)(1m};{yYO!oKYRnSGUg_AEQtL;>bep`jUT+2Gn<A12b_Q6>
zmTv2lF}t^=?aB3bcTF6xFs_dkrIm+?gaNc^R0;#_ytu&v>tb=*t1cIiyhh<zMPO*J
z9oa=93>c&>ER>m7@P_M7D@Y0SMfYtky>COoA<|0~M*33igWA15m2D^W1^V!U^BmL^
zQ|gavA);X$og*2?QdOP?)bW0Y93JK$+F-47Vr!lK_K$m;QC<R0h%;`$^6}K&)Xk@h
zPYI)4K8N^dGNWbQuzs|=dtsK9TAq6Mpd0-1h^s&`t9$>3tj!j5JC#|N$kYgK=Zzt^
z5}!Vj$S;SBhK_e$Y`;KFVJVx+xM6+R1&=Lld1^gRKd^K_P3%-Odfpn`S2lKim$XhE
zA}#qC_FK$yOgK*X)EbK+N~0V5me!V3)*LI1`%s@ki&i8b>K;QLNM>f!he)%=pmAYC
zB*#YlJ5u>|eQN}K?#;W<6|x`Zb+<UGDnE=v+_hTC>iiUR3}zGaVOFG)Ub-ove@N$E
zFDi2VwNGuU1_Ewa>jf`$y25(GXM*GN%4XaeH1m3h91YhR%WC-n&expn+v0@5A^|NL
z8h_y@*7RmH-R;z+%uJZsgJtG&-n-a0qU&S+^GzE)>Vl#=hu_&%%{4#0f8-Qgy77ZU
zsjR+t70Znk+4y-@V4D<TSrJW}_@_qs5j>A<QV9i={vl|7{V)0f4Z4a=Z0C-IwSZnh
zUV)72dT&Y!4srIZM8pX;A;n^;qeI&y15>B_csF4rzJv!#gJtVm&KvCSB25;2B2Bm#
z^1y0~1r{5C*)!$a2X}l-&q+r)xr2ZOp^L)&-&OO#bm<8E0A~Hv$lAC5OPmjdmvo#D
z=9vWWin;QO0!eymSF(@vNTyEKml)WSTP*Gu@5quG>hPl<=*AO4al%{jC&Abo3KMj5
zX=vMXWV5U5tRo`u7jQEpZ$hU9yy7t9TnsP=An^83g5bG)ydm8u9uxq$H;!wq#lI%S
z_O7G5YEbi%hEes#i5}Rk2b+2e^=v@cQ|4(uPU6!gulKNt@Z}!qESgwLH85C2Q8zze
z)C*dx!&fy}7$<HlJ5ByIi5&>D*lMu4<b7E(CX}B3bV!;<E2%YpR&v_X6w=ykH(6&j
zW-UKcUz^{n9_m?po<A=owlSk<Y_6&SV{6>lrdmDBi0@q9GP1umnyL@p*z(igvf^0#
z0B}En&!7GiX@Qs<$4s$JH!|lQT1E3M0Jo(B6k1z4d(YmYtp(6c{J-|TJs#@x{aZ?E
z*{rmc6s<Nzn0BU6a;Q}*gRQBY4;{=nG=!WFll84gu_ZK4Q^`oq=d&RqX2w*;X)r{{
zVT{ulj2X{8lxl0gyU*|W<9R*LYx|?uy!f~e*LB_3^}gQM^|=RMkl?Ilx2pAsq_IfV
z(YE*>F6Sh6)oaH<ocj+K1y=sPvswD&W%)*=)}GpnZC0ndFjwNht8RsHtVBw*UMy|B
zEp}l$`*M`M)BeeaT|I7{l_>gPu*fZYM+REtcZw}F26B8}xK<@Ts+Y{wwCs}cEmcjC
zA+%0&#;fX%j}2>DwUSO}V?(0RmP{>~)o)x%+Xw^KYgs?4RNalkq;DPBD8B}7S`jG0
z19w}-u>^>T?GO@8A|^Ol{BqGg)9x3XXTlbdi7JgzEW?7W>r@)g9&$WkVRob?(5{aZ
z-n?!@;L#$g<5S_*xj-dsY@5H??W{wN<hRxB0dbH+qt;I`UCFZ^X}iA@bT#e%z2<aI
znq;%$Q>1LlX^!7kQ$hx$ZfHgR=&Oh`mwoY(JFtG~SuvPQ&(zIIPco94k<E0L@xcp>
zsu<kvV}az&MKApI-5O59v|HQSEmyxj2r{6{lPWDM?eq6Jyv3;(x3rN0Drrru0c%Ra
z2*g(TZo6#^rLN}U{;XbZQ5|yY6qgR!qBkVRnk1IrfgB&IjT^M?F?Pi9Tn6<9%U-N=
zsJ)lf!5oNaOX+p^Zn9yBzC&fSs}@~jV6wpw07A;yB1T4-@IpBg_Hb@8E2A-0qxD|i
z?DNv~N3s2&N`RkT*)#o_ZK#C{&R?5_+q-<|`a#|Vrh%+=eP_XUqOfDz^JE~$2Cw{b
zW=1TC`0)5SvD}EtyeH!34y!tR31Kn{m+Mr(ipIMuA4t0mNUs_uZ<IUh;B&#lGJp1Y
zHRf9T2&ADT<fKBI*io2}L(&Pm)dDexbGdgOnt}$HQXtlFK}V0Kz{~EIr}dtk;sj)r
z><d)ML*ut7wMLqyzM{F5X8K?3;7HG8<#%A_XXHa6VL8}m`}zl13M`q+Mso_4EB!V;
zByzoOqcaQR^6n4Hck9W_iA_UB%eFh@n3?AlOK&wh3yMU$-Xxwcs^5NchjG3vBZB)L
zL*tfbsYjnVFwSn79Ms+#_Qv%@g1+!O?nPjts!eRwT8@g$E$Izt9xs|2f_o8o@K@cT
z6=8{H8~e}t#O<7YT)HI<<5QM*?bxYlzYaEvH6H&&eY`6}ck-@zu#y*6TE0dE_h{7i
zt={={a=L*StIWi`MBbyG<EQ<*(4&MrXLGga8E1FP(E%|eOLFuWr{;HzUtIF(6H58(
z5B+vg&%@L~&r6O6t_O|uF1f;*e@@Vh(eqV?_irli3QI99*&bshP1u_yU|?#sCAUMS
z|HpDd+TLwaTG4mjO~?Hvy?dIOM8+O2V+p|9<~%d4Z|Bv@7)j&8HrPlVtLN5+I+;7o
z%+<KHlwDbPn=1VWt;=jdQ>!y_iv=JI_x{AuJimha+iEJ+_F~L|R(6QX{CbhqvorIl
zIhniM8*V`?hAHGg8|mE~?=01A<g~JjPD5M`v-q_an={&Q9=w4qeJ_NfS8!)zO6ms+
zwwp>_mMeHAx#u;{yzBJBWM|+79;cH3pKt5$K6DxV%Qb4ufoV?F9ICx!mNRGkgZE88
z?#{jz)GW)>E~|8YGPT30H$<w7t#cp=Sa5^9D&(^ZY%6Ae>=q^swBU@E-sZ4A_#GgZ
z4r=~5`r$pO!JT{G4hV&*9_0jFMQUX>FoauXjr#-k;ly98bV;B?YNa^g&YbgZUgln<
zLW9}f1l~&K#Qa3*+$=e93%9U_HCZ!mgE?fHk*RHdN^I+l*mI5Djq?G87Dvmil5uBO
zPt{u=DI(ai7&qUWT*RokmouC!lOpHB+NJ!YFo$13XoYdi0r_MZlo@1qIYD%8f4b{s
z)M*)~>6i@HR9tIGC^<gw&hS3n=vt$Rll>QolkudL2Q?8UKC)$s7SU&2lNH*X{OaF<
zKs2{6@DXKyp#7cgX0WHz<}{hwiyE&~kPKJVrj?GIf~J)T7V7G^PM#0QR0_HEjA7-S
zB=j;>Y2}4%fBKp~?9T75m4a~<$y;hU{_5!^;y+}O?aoH)o#fn0>3!jRI6qa~TyL0h
z92z~#6vbsFuRpZ=93bOXWas|4&!J(2D*D(iG7{h3TjzS7L{G?5Xdh|oAcRVXQ0%F+
z+Kdb)#?-IP6WQg_@LKe-yI!~gS+ZH#!bdT4BhmVgEtSnkYo#2fZskgYZR0{+6a&u)
z08+5(TSX!h!(O~5Mb>xN2IQD~n+l)2DPfQhQX#U+x2FB1zqvMAcgMqX<`2rZ7j7IG
z-iIykva%Y(Beib|&_q3tcgTLmPM}U&KPT+fL_n%PzMx@U1Kwk|t~sROxM4B#2$+-u
zY31ZGiU>dDPpq(oM*8UFOTMyS-6BoIDXlcyXe@H^x=^#DpE6sZg<mvQ%zDV&hkcY?
zw5l)rdkwGs#(SUE3OcxBRCZWQS&yiWdL0VgsIp_Izp5@o*VFBo9!!m0FLkbufz5p`
z`v~1E9aE=_c`NFnJNb5A2-crgfUQ^`bHXX)j+uJLgTc}Fku_d*$BQtwdVcsampafC
ze>G;@;Qr}{2PV4`+lTewgd-{6iCrP=Sx|l9>mVO40a(2QyFx#^9mD$Enz2naIX%Bj
z$SwJFwFCf!)DmLNMVEGLA`f<KGHdLG0AMBrbWYA_V5xz&VW^E~LR4~09arpkNW=O3
z#>nEK`YQAKoyn5B&@HzW$q&t3lWUAR$`$Mk6P>^<A?uvN^(Lp`rjg1GDZvdEb|muy
z8;;fQ+!H3$8GSg9^t(f4<nFLvizrX*^m^rNq=pz-&dg1H&ezF|<io>ZEgaV5mRhlN
zr`UoV=Q6iDlkV$&)V$dHhtNR7yW0=I%Lwr`)q>Zb;d|-$v$Gtmliu56pW1wtO~2Ll
zPrrwSyKUIWR!{H3=GSHu5|q+fXCB22!**$wCrUp(l-)qT3D%?RJde_dc|-Rcwe-Lf
z)`Pwe-l`(Za=Hn=?%~&Kz&O5FT(@+KmUSrKG(r#A(&d&+2yz%y86Ydd5CBjHNN#^6
zn#u^_^udy5*FUte)(4_e<n)O;gfX6lF^Ah_<_ldls!V;H%p}hg`*t3+^En@ot!{K3
zY8{zcuMSO)EmBB>jGJb05=sjzJfX@zf)kqxD1=t`eFc*3>ys_|W19mnM3@he6xAZz
zJRNDxk$zIl;|w*27`^`Zaf%4f{*<>06BJ(A#uqB<j?kor&<7@QcbH{7tS_|&_dW&k
z_TW}s7s5hvWTxxU+@M&044ZSr5ZaVNDx0JY(9Qa{{TEhiG2{ODU*R>fafG+^z<MP_
z42uZ9a_f;(j%C6V^7kU~;00shuzGAG@F5OIcRbw~=D?<|yc}+^KG&zS^vU`sQC1!&
zdi1u>k!+;*`haGs@oQ}nvoB8g(7)5P3pagHllF68#{7hm(6gg{HQCoybWL7_<~AR>
z)F;ofBI<Kq_^=j^v_)p^m)f(Xw>h@yo|%Bsabi=#vx1|B=k%Yn4Bblobyt+BB~c@D
znu->_B(BaW_DvW4ZS;HAwP>MMs92^(^Gvbt1>)ou5gzNZUPr(Ckp%NnZ>;Wc=>{+{
zR37wbg<t5|{+Shg27JD1!3jWGqa&1%ZOE-DzYDdTZdcRH?C(4#S!Q#{`2KOj{*3^F
z#1=i<9ThEFPJKtr(F2X=7T#48t&z0brqUtVJdvJDSQa8|)2Sakodf#em-pnIX*yym
z8F?&(HfU;*2+cfa{7?}c*DiW`)U2dyhb^sr%~K_V;_zoI;yo49=(XT&cKzekEB;Kf
zewN@acf;|2h7h+EYkI#T7rdh}Q9(;oNjhuh(h>2dH@{7%dn>q-;C?N6J}N))&L*e)
zJIds0LFwtgd<F5_Q(mEgU~3R_TjIY7?GvN(;gdQ4<sjj2kNCy<KRtq25B%Cr6@$<G
z+NVOrUwztst)}Ip3ICOdK1~5Wfs#HDaPXb*|E=w;c{slRFP8nq=*vslyAxR5Uv_T(
z)!c%%zTf|&U3{DeegZT#-)~i0!Z=@P;oI}@RkQNj%l?Z!{;L|G&3{$%|Iev;=ywvu
zZ`&iC2)GF@0Fn{Octh8)OpMByRw$DnS4dlDdGuQie_sU)aN-{H0spvo@zGCXr&rm7
zD@DrNQl=h>c}LXLyA16i>|!bOfvY+W)_V{2w*Af5dVf6D+xNnRjDL4UYO0QI32>Di
zO99uK@N7fCEj}YdZkCBSIJlDw6WG$M<R{Bk;8q;VNOS&pX~$nV%H^daqd*g08|#pl
z>Y7;bfX>$`Q#WO%XvOdNSJp*=n^81fz#Vs1<?qkVY4_*3%_rr1MU2Xq{^>Th|K@1`
z%>)FpNF7``CZw6&<ls<~bJFBVSJ{gcs%3My)kJ~gqKM5nTDeW1m7M<0^Hq%e03w8X
z{0^pI08C-*I%b?!e90xwXz&fVkHx>|9JmzF!)c60D`7a5o#bBywA`Wccg}8a4*%A-
zk%8LI9aqk+D)%L{G$&?Q4CFQ8Y=>!imS(zl?jQA5^Zp->cuZ(@-`RsMtq#L)Hcfe=
zS-WVaTT0`7spaok@0>q;edMxwzU{L4{2X4JLn=j&ZdtLa;YNk_l=nnLlu*X5Fr59n
z#jC;?X9nWZ##_?Z-udYbZr|!h|2TBO0-;PjAm_<5rVvkNdp%xMZM*CZhYj0q-~GjF
z{4ITR%Tvlvw{D#p{N-5AgO|3{&6BaOs}&8upR22#fj{mn%#eJwUDm#!J^W{7Yn{U<
z`X4{t{WLK9#dUACm8Z)Sh`vV^ywk_0e1JV`d^Rc4b;yjH^(+Q<cGa36wrgJ#2x!_q
zHb}8KSB1UmvyOhPIjx|1&F#DjLs^))qa?O3iVHsYw^Bq~QhowYuMqt3XMgeG0epp%
zb_{74-?GX*xFVp8H1;Ut7HNXm$0dzTbJ)--R!MQqnm!Aqk(vKK5LjBps6zMmwk5+6
z9`sd?+7C;MMw-#dQoaSaWTGC~PZe6#ckSiwza0fEIpt`DKyH)E&N$(KxZPKb&4yMG
zT)pthko_JeTCmb4hBx-D_x^t}P`JFc0PT{cYMET7o=~XTK17U~gC-&>Dq4Lw0bAVk
z(@mtTH#(Wsza-he5D{PK;8${HU{Q7D{@#YxQELvds^J<_0}XXLzFD|rmdHSQFudBw
zfK1I3^Y+9m$L#nAqBq_qxcXsQ=<s{RH6v#NY7J4cqgOof(sNm1FLlg2DAc(<J0zUV
z&199k?j{f8d@lU+VI87zkxkAA8-5oEh?5_~x52BYoNQ#%(R$VK=wM^aCAl6gLUeH9
z+%q_%($%nDr?lrCJWW;i@2zEf=D}5>+Gg8YUYq>Tnka}Oqh#aD<t4_O{E>xqE1QwB
zyqPv^(5A~|om4wmY1x*)F}_7}uG_d*0G*wIqf2ZKu2AJ*gC^&&LBD!qgLe2}gZya!
zIr2}IRWAgV)wij2Q2c|o{zjwLMr%j1Qr3+W{u3+WKa*L^pP{p*rR|^25VdymwS1!O
zwg-QE2A7tc!ndy`Kfkzb|J@Fkcjk`0w!*gLc5}emMZfU3fAv4v!bl!5bfn^wkj_1-
zK=s4Kt=^+#^R4a&7m~%XL98-(wO<lcoRc5?W0cHMxt^7iK#2n|as8MeA43~ie>Vk*
zERPst=fP)jZ?ZC`VB#_{6dhmIV|cYi*vm5oJG`;kQ6R?zvw$PPjG<6Db6hZ9E!^gu
zf2c*LQHg!h@W8!@koLeuQFizSfRi!aHVdy#0<#zu1Q+T-B*wdw;ng%txpU^l#To8R
z1}okV6+OvCfvZNq$i*89WufAe=wyu6OvX#<0`Cd|JdO6jR^Ey+UV6TTvQzc0A8WMv
za}TAcFc!hU;=Nv;!P}*C$ku~{ZY!Tg_m{b1kBVvK_H^|_^<Cf1k=`&;(Rz)(7nEUF
z3y<z8wjZF$HSWWBp2pivUnag8U_BXxH_m60A19p`THNM?H35_co9O<H=wx=~CMUAu
z!iJ?q9GFax*E{KhUlh3Y20;}6DV1R;%gZ{8084eyYb-^}<;+WaxLGlt#iir)A|YtK
z#Y2`kxr(kDWfxu5o;~IIoQJlHHX>h&><UXCO|i&BG?b<c+<Oy0_`OK=_eLo@mz`6l
zkBIr}OVRzx!>}sCn1;0iLipS^!Y{CSnMmTbl*H9RtO8VYf43W!MlT0Ly39lO4p-!1
z-!e2+qD<y#ZcPS*m6HbbRR2h=)P*-U;Ejh|4>s;wn0!DQa}T3Gkk^GR%@w}$%Gq$c
z=!9m}4W{d!ZP%Kc{PI%Vi~W_csW|B}{xbnhvSa;Nc;g6ot@_Y!ysGhws>o<$v66mH
z7Y|$9X$fXGY_QpAG&odFJFDIYFY#8EXJFSadJttHO<0s9kNI66^L=e@qMdZnFOB>9
ziXNsy8WBjS9=kM!V{gQ`$~Di<9UCpX4h{{Q<uSc%)FEr;nQ7QL;jW304k;uwWMtn{
z-t%dAqvF=>?(FZeFW5e`tN45)-Vc7$=15P12oMRr)Glqx>6ubhdsK8fZI;5ozEzFa
za0YwVvPXEIO@h#(lh^f?_76ba)Loa9xqg-Z!V}V%Us^|3q`E-b;*#z=v^2S%u%p~x
zwCk(#zxgvQW_&mdG0Hx2Vmqu#quAO9*if4)0=Gk2B|?&3PQCh3^S2U~j?DYh-54~9
z%Hc-mD{79-54XegT?<&)Vk@tzp$mS@C2<{E5*NKtyJN=Q;gi1@i5Kj%+8D8*ZLV2a
zWPXQ0zn4r9h_u@3u@7si>;%=HbVVn(FEn4pI~KHd45OfKgg0H2`KTdFU{j{Pyc1Yj
zh6`^KnsPtwMp9o>hsc7q6<t8J^yZS|-|H{P#1W~S?h6Y?yoy%7T0UsKY>CIZe6eJz
zuPJqwL*Rt#rQT>Aj50CpmTbfpV<i8W?v+F68L^wvY4WJ%&t!D_i1ph|r=<5qm=y1F
zimsy${AZ#|s9ww5%JV;qmfpy6tvg~!R8`EZb<+ys0i^yh1O;rcrRj9IFyT5GO5zAH
z61d?fv<%+KE3O?63gKW)7czj?lF3VK)&im3!t(NHRwMQ>ru}q*cZ)o>xZ|vP;tRw0
zIC;G$=70~@lyJpS|B`Hc;V+1mcyDiHHx=LfA?Ek~e8ogdVmwy@nuo&BB7+h~1Kq-g
zDQ+M2mY!i3yrk|OW9+0$_GV!|SLJ$EAi$Wd9c~MmN<GD@kH6=|U$K<CKv8ka_;7$z
z7Z31~$~RxHDCeN;i9R~W!HG0~q-V)Qm+Hp!rK4e?6{R?p>suvAoNX(6tI@Q0M3$WE
zbwL{esvFMtb}KzHNUJT{gtr#c(*f@+7yY3`4}uuhf&rvvGir&zX6MdKU3;n1(f1VI
zIa=xpbt_77KN*7K`r;Ljb~yaH2*olIU~VZ;ZRQ^CGIppr(LERg6OYQZxw4=xCo%q5
z2^&LiTvm*JRKc#rl@tyNE_j{Ck1jZb)lvJ10Orj3%3a7;5+9g`i389X*u}=odHg5P
z8@pLy$^XY%8&P&9Y9JQ1x%ayOT}Ki%Qfnd$UL7@uHMQzb4~16`)9=dX1rA(iVT07!
z(7bZevdnKK4Biphe9gUAv-vY|+^x9IC3SDAXh9C8v&%f>dIo6JIifO!+UM849t12?
z0=46?1K6a~=u2u}I;{u_6*sbqNM>~PUjr6m@0g{Pipjy5*gFj`o+#}C!G*ms!D<XD
zT&D;}7(`=(%4n&jhD$qGbCqR&n)bmTT6{PEGc<6;VAo4(WY~KfS-h%^>^d4uV*Kev
zn0O3reb(DhY*0d2J#~d0>v#)pc<~ei&6-;^u3wbk`*YnUlm^k&p=lHy^Y+LfpRL$n
z&nahYP!}CU<<0<XP(~~+xHUmZoZ@vDS;+h##q*0&Z0uV*l2=N;eDo8ts<^Y(oJP?(
zGy&rIg1sJhUw>D_r5j;m+-<-vNB;oZ9qo9&!qx~#qD<W(X@Ps#zQo4S0)mK)j8|3>
zNSl`}nG(NzcyU1YGHox|K)=>Y>a-;<8t!-TghftvQE7_1Ge2m#Oj~F~-Jgapi7ZO6
zPsSi(;hH(kpN4>nTM^U{3zk>T0MJ{eh-kr#&%>*K(Y2A)WGBL_=PdDC_df=X6UDeX
zHIEP?FCwvk*K4;n5p_!0w@KCla~4qXzL4z;zV^s*UfBS!ggcEOS{2Xv5mMaiCEXWQ
zU=e>!^`R1_?%Z+&SmpRzw;2~+T@^~y@tvKAcTU-0VuK6Kz?mSL&qlqrnT_;8)xViY
z;-rzO42-8fUO5b4nLx8+$XOf0OVKEqX;>A@ei4~zM+s^HV+=P0Al?grp|?^nae``G
z(X584=Yq0sN86%P<65=S)4h40g|PD16ww0X)$!ZZBZ6D~{gHb!Q<cOwVU#4s=dh&i
z@1LQA3-Lgk9rq<Q9f4y!1%kRKN^9YMZ}d_SJoR0vvkIci^W5tA2-Os=OA0X0i$gW=
z>f(H=v=-V`t_PjNKPVgN1U+s;IUgYnE<B%}w|MmOw!6Im&6b-7F27jvmU|xwZUQKc
zt*HW=2^>Fw^@d`y_e+d-r1AaUD2V^2K21Mji%UC-h#r@QnmHL{YF`6waGTn3bOFiY
zN05UOIY8=crp=Avrlp)!S%^2X6B~rlP>0R@r@0tjEoIO!QX2oxA}@p=n>S!T!#xl6
z0{lRRw^7V8D%g_^M~<ycvjjdaNgpN4>o&m_^Da(eE4B0d@y&b~lo}Trk9qd+z@le(
zRaRld!B!^i$6<qKoXuqY?S7T(5g7mmqz1<7%7<59k3z~y7yKeIUbYPzv<qFY6MHZw
zd7!b0$>GDKB`-TT!B2AH!_$_b_L^Kl85~8YW55U&h{@`pfb>SMnpkj2)-r!*&OX|C
z;Y&mAU1&Z2De-{RQPad))e%tuwl1Z>49quDVC#K?10>XH_N*3>s(I+0KR9=)xL{Ap
zWu)g^;5KY=*QSO#Dnk>BOCBy+1hfZozLZhb4w$+0>{F6;w>Jb<>VKXbE*)-SEahye
zh4}Cn0Med)vvtW|7(6VU9l41HUo>iCCf5TZUmC=<kJ+2k4i(qxBHqsv&p5CXEcuI>
zh!U+1UNLRAmV;=;sM1&i$R-h(T@vF~z#?tTWDRBsK#*`$q?4O51W`Z$No92tT6WqF
z(E?HNXbXi~l#E%O_TVGM^XT9ie>2%~kjtl9czg46`OfE)Tj)o=UtD9Wo1oPoO4k2P
zB&|W+fUI-h&ce0Vb>XVL++#Si{3nTVKA7<1FR|G`m!fs{)u*kkc;b6-_*uTrNQp|$
zy#H1jQPgRnjBg+;j=vG(K5Y2QTwv@m96kOjUlgz>MC#m&{`zYf!iF}hoi5?XI_6i-
zBR%<1H!iQ2`CZi}FH!r_6=+fA-<6lMtlJsdlAhB~l2AhxIc{K2W5UoqGM?ueW`Zp7
zicNzili+N$ntPpPM?qdv(?(epaEnpkE_%doBI{3AyHJuwoSSuFHu61x{0b8<%cpuT
zK}2Bp6Y}rXx5ft>4_-ZQ#ctO^gt@lTNYB$``N(N;(^dCAco`2tt*%HKy$FRGnmFy+
z$L3yd?{rW=IFu~8I;`6l(Wb*5jnDV&JBzK<@^nB9M@5&YK=VeLB8$!Y7l#fd?76R}
zZrOP3Gs?*Nc(+zQDjLbm-$!;c9M^aCG>h(^XJ8+7o-=nw12kWfPNBMoc0DkeyMig&
ziD2*H&cB(4cLGm)GQ<g6C(0f!Wni<L#-@fqHYA)%rgCv=agub)C2c?WIbc}x&MBr7
zgdS}&C%TLc^+ei#Uo^N8-vC1Ey<|4m3_yAS@J))#x}qDX+zcOB{h0q30c{Buyh^Ol
zYPdAyS3YmCFjYi)qtV7sZ8EE1EVZz%2a<Hc$20({_^kiNs-7DPO`mf4)=nXiHXtz0
zdk6xKZP9kojs4I)I7|Aa8^mEGw2D!7K@aK4!Ne0q#s1jCm8SO=^MU^<AckSOD1zeP
z2ZZC$4$F%&+6VH*nHqqq3K!bV@s3WRi@Sk8uF(IO_tfWoX^*=J3o?Z&2n<DO;n7S7
z$R(t?3^>63wtAh+C;6gIR`$!%dGY!~uJF?s10<$JqeK~&KPJTtWNE2I@J8MwYhHLH
z(f1-q8>A0LBy%DN3AkUP<jq>_z5vrxHGs`1ahCa%3b?}pCLni-kjv^mbIC=dz4l&{
zEgCA9ZeW~C8IQn37lDr{pMPvh?@;M@?zSH|OB$vZj+_x2@X1fK^LO@J%u`^#mJ|(n
zL@078?Upm^#;?=fgX;X4?#YKPrMAG-RVhF@k!VR?S;2g~)oQ%byt>ZpO=O?ed*x>K
z2hf}I8R!Lh4oJJfjvKL6l(oVc#C_z~QqT}BL&g2SmBk_!w>LOf^h7P1o(Xsn)se^G
z)`}(YZ>{>HXBAlMrn2i8f%0VH0yN5GYeDJSk-A1KULN<}IJGPp82P4r)JCn*=Zkye
zcAMkvHu7$=uMeEnEHlwmGPO*!2)+1WTP6=%BIjbW@|5;yB`l-c`WK8P{Nq?&ku(6+
z&v~R&weVZYyuUA}3Z_5V4(2M#CQr0~NXbZFX0Q!y@pjRit>MWS)!h_fktBeMnGfnx
zOOuuieUblQO8cbBCwX;>r7nC*;kB(}SNU%Sp%uiZ`G@+|cL5v20K~>>)tsxq>Gprd
zsCip*bUvQ&$Q`dfD#au;3%wZWDjGKKCUoIoxM1TJq7Fb+Bc;ttmUnmX3kSNJveqgZ
z0MAaj3w{s@=XOiv0PE?;)0I7$UVTZX?gLl)5mPKv4&VF`aChy<FWHg%;s5OM6@n5<
zfY{Zs(u~{n&G9}c{?lj`??FhHiYtr!l>08b^KX6F`x<bw`cr+4ULW(vuZH|+T}zYP
zHURA1>oy1Mmp}g2*o!Ovs}1aUd#pl0;HR7uI)~4E!l$2r2!Fu;XEyWi20mEB|21>^
zR|EfQ;Q!Dg{cA%1n$Z8dCS;xQdViYDH-fmh)d)(~07pJ^Hwjc;*NxcNb6p1LYB9kx
z=d=*)Vg#rtNi||kwMJc|02ji<f{N9AF1(uEuG8U<mg~s?$^W_`P@p=O5knRYZ3pCl
zpMu1AP%?aXiE%gWrOwD}<amu`(S8X=a}oD>HU^AW(ufTb0mV&r1M<StL!Jr}&Xi=*
zt05aBb4hFvp*>oX&H$uykkxye8!aAZC&3FEK};4*Tvy*lR(gK~D6WZs>L{Rb-ge3W
z@`9ZwX=!EejSg1xKZvl`{}R|#c#5Rj`J2|1#CU&Q*3C2(LLy<}hVA#0QM1<>U`GD#
zPrE`|V}ob3GU};mPCmABJ-_1WMOU9=LEX584S>}N;&i|yoJS8}-_l2$B7^o|0MU3f
z<}D-g*?|#vY><=Xy<oNT2PMXvv82;JW`xrLDWFKV2?8qQ;Cm#iOA2gmgq>Uclu;xH
z5(z3OWmPAlgFT6t<$CI3pf)K7{pJB#rJfHKa&mxv5U7Eyk9}KKK)xiaVu0ey!?k)F
zC_FMyOY}`T8r#S20o-$S78|IB+XL@hFxh1z8}IKTccCg8ZUGqjXG-F(Xp_Qb=$F`S
zvmt1C0PpKPHu~DN?b%>_xw{P%V03(Eda*$z)6hMKX4a@;1^_M7-KL>>NfxA4!L7_|
zBz+lAmFI|#v>5*yJiIyv1rrbJcwlm%E)WkDC*=Ml;XIs&?UNap60>={ZS=uPX*N1%
zo=c2$(o#Z1o?~834nR@V<zbuG__)YDF0CAE+hy~N$kF;Ec&2_9Yx*H+Tt<)ZdmyC=
zznV7{@zqJRK5m^AfYP0UipyY$w|U{1U$<EyUmOLE7tGg@mDh^V*ce{$8*d%ggbjsA
z-Tr=dxBM=oEWSN^qSu1YPjYjxl~R>>7*VSs)L*WL#70FDx}Knms1-R2Wq|d2qlEQ#
zzc~#R&p3DhF-*wWaZ$kZ$6vlQs_@;JwLiO?;=Z)O#ibs?xFTl|F&>t<;F%e7q>FHT
z#GZM;DrawSerEw)9e-BbRaLi}pQNv09#R6W8R~39ejz5`W=yU;U(tg(-sK2ng~{3p
zJ2ZBAo&p#9a*_)2ro94;eyM|kUn&{EWbK=RHr;JRz}SB;lcy;$o&|%7d%U+9nzPu?
zPI##kTe@Guc>(b4!|NJ!MldOWX=XR*#O5B581Dq_3`R_lP#qd+1*S`nOK$gLq+pB7
zFM9xXt)RWh)kqgGyR}GrOnD<nyQ+W{w}gR|>8drp_|Q}<se2v?3tXvmiU%m!cCQ!B
zOO@&{EoBL3j`}4)juE@V@3Todr7jN0z1q+86-~=q0GV=&i5h@~j81p$Q65I!U@!P;
z!8{w!Y9U%E7}?3LlG-Fq&<TQZa>b{2wYeI4n&YKeQ)0-WB(>paDVz@n)Tzj84eP0W
zC*t<BI1KSX5x^=o88xYD1{H^NM1R%iQZf2-w?u47W0QOOpHD)-+Kv2!02lyS&0Hl3
z=PQt$VZ*6=z(87cFUdAI5qlUsB53u1`qj3>#GU-ikXK@p;W`8*ahX_(s0{O2zGwzy
zCGK$x(z7<s9I=`ncL0|xjH&5|S8K(Am52;9fCTdXAdU?a2gOp8LbvKo6f$*^*K-{k
z6jch9a6W2wgUxw<02`!ra2Kr5V3W=2mG1x%(;gMzCc;&9suOi0WcJs;1f^#dg^60d
za(z3PvE7fI4Y$Csfql6T7=~d0d2IL62>h1fKR4+%2h=n(Na|tum*6CQho*Z$VOheE
zq!SB`NoIvZ3$ApxsVX-ta=Tq+fNxZ9#GW{?hx_xFDQ3yB3aMA_cTy5Qm6DbkI99D0
z`8i+!-JqlB+uO!(E$Yv=DVFkdM}tCPsXQ>>5i!tf<LmDO+Hdj!m+dYmk>vy`9e_$+
z3%Jx${b~u&^?-{biZ&af6#5}kP&dD}QKKZNTgfSML;%X8>kmF3F4yDgimj}O!@}Km
zbLWS{Vd7f02>VSq%MRd_+o79-@^?y%#{sYI<Ooc2`YK*Jsq_TH4Y2=RO&zZhSZ{89
zMb#UgUuYeb#cYd|cktllmdcNni;<|k5H=!~84v#KiIO-YgtwYYxClXnsmi-rjiO}h
zr6~0kId2xC%GEA*n~d-MjDz6kz@XJC8n^PRNE?u~VfRXk<@htwl}!Y{6UebT@+@g=
z&f=^qsQL9&O=4;FAq7ZTOFtLn*kG-t++)$rKFy#a>Idjm|E}VnVVkw{4!~y4*Z`t|
z0-81$&B&o?ejSEoQ{uM+HkX~Q6gZp>$|S9W@ah}j0j2?JwLwN5mB;G_r2wg(epsG=
za?vKoI-Ly*Js!3o4zpp{AYCUCH9=#54GRoFpzXQo@$KqR)?6nN@TA_|k3f+@xeUaR
zl>YcD7x2<tt_(?Z^$d6Fj-wlyHL%g&doq|pGRMT6FjiPep+VlS)66Tn(rt3!COaM*
zJY7{XJ7{tJPo8?O0GaZqAMsQ92j9gy+vD9wqd;hJd=0)bJeLfP?KD`k5!r7f7p^f!
z%hx&Nq=#>YsIVV`?fr^}x}}7X0PXvvA#Ua!yc+cE^`M+dXONy;HYmHyUEj}P;3Nxi
z4@?T(J!gd|4b-`b<T`*mObI0o8jcE}rsHgOL+eG+tG*s^o+V_+IwNh7LA8c98yz)x
zM4Z+x1aQ&|IQazvqM^I1p+TqgiYuPFRAyRsvH_iD{A0izW{!O6kii}=LdG(aN!AaE
z1HkY2oX5L!eAJ=hh6jJj8Sb)WCFj8MxL`%x9XazZ%w$l4C!&LGLFx5G4Ci_ROpiS`
z1}bWO<#$J(mY`&tr=eA~CdLR&zoJZD>@4Xyt<$Nf^q0{9v6s|IOp6^BP}uYXF_QD8
z`HBA7*;|t?tJ*b|wXb&`!eWEO(7-3tBfV8HlSA-o$;qz+>qsczc^IKaod+a79^#Kd
zS5CY86sX-Lk!^s#UaWovT}*<qbTqx(<t`BQpn3i%iiNX2@WFrNEf&M7LIK;KQ4UaG
z94I{)(mE>dniP+;5f#fiFPL>qzam|5xH(J|sGR7}&3+LDslUIou@?6rd0mL#tw{z7
zIpn@8_CYc+T%~|g4$xiv-C)n^Gx+j4=6z5cZq7|ySE@cfL+U2maY;|&@_1i*pMba>
zfDmhNzPjP-0~n_`X0!_>>)3{u_61D~27sEUUt={IT1P7JCK7>^D5#T+hM07-nB-bc
zB`DZ{DwYjs|7;eX0_~sSrxEFt^igJ$I+W<tH?&yQ#E*Im?0~v?WfeK>$hlgSi!<>q
zaqjHNLhviz(0TcW1<)T>pR}8pq(JoV^|sVw4c`?bi(<AEj2LOgabi)Zlj!QiD1g>V
z<s}$~buRt}L*k@QWISIL0d*j-qbD*R!Tzk6H6xLNAdbsg`ncfNgW`APwLc`F;$5St
zp}81!k9RlV8udb4QoUB(Y+?wMr6!z4)R{MrF?AZfAylvZRLWA_+^5nPS;#ZO107}n
zTXUFOJT{xAVImH@P_x~Z$gvxh92<KAF-eV>q)Lpwx!{eQ#T`-N=nqZV08x%+{e+-g
zaMXm&=!`WlcRwtXbne0{R=#q-nKH8VtD})(+#TEqs`Rl9REaX+WDPs-o6HJic|RGZ
zpw$i{hRZ<_UUBOKL_s01<0t^PCG_azVSk1{w)paBjnJ*$!AY!Cmr6_d+<kawy6>&Y
zbWq5%Y9~_rpzO8~sEytDprj00vE2V{w1zB#xL+M8>5QwP+Ajz*t=?a;L8|3D5W|L^
zaJ+KHbkW4Inu5zjosrH+?`FvCB)qzI0G3BzDyl8Z+?Xt=#VNSc<Qv~`=PUPR9U)SR
zl#_7QfglX0NuVE(Sb((W>c_=6>Q%?JpoUKQFL(j_BBAX4uzQ?pyh)%zK_-9aW8ddv
zVxX7CEJoR=+XHtmcxJ>G)ZD^B7@$)li~T2M0A}W~axg!a9lazQYK}{8Cro63%D%o4
zu-`?*HK+&LCroodne?<PC!I`9<8eW&zyW|9ou*-Kenzgf`k<co&OwiRb#N9PynE%0
zv<9fbmnp>8%0vav-01~)wj;V|wzjKHx6lW~1S3$yPFUb;#QP+k=Sx-kvpF{H8q2|$
znfRdwZZ9i(`GdPuz)D%zW`TX-lA?<CdK%nKWr`Bc)4urTjH~ITD+cq>!L0>KfdO*B
z=B1qh3>gC`uvsP|xNtZQDn1Y&p1jhB?uwM_;UTaQ(>2ppC2A7mTGIEE={L8=275N>
zLB(yXUyTEI1nInI(!n$Vg+*3N?#`aWwMf}@m-VWD@~Ed8LX@`(n1o%aa@CavII-OC
zgj`QN`2DCm1#SUi|6x!W_*EJ!>BxE=MigEB803!_gah5rUhYDBGB(ICE~Y41x1Gh|
zzP*a<-Us?zEC8a=kNJj8+~GGa*Q5-^E(_q}2Z^6E$0YufI2_Ux#R}1v*&7k;$qNB3
zG%^J{d;X&jv|o@sY;=&_4Z?$Vn-_pFPe4cmP02HJFF<bGK%oZMi*-E!Jv4zJMC;pJ
zc%A*~{M(7g>h$U#eMTaZ)gjIFXmDa*<XaWmv|;qkhG>1G7|`w(q}6p#a%MCtnI2^s
znY@y#0-6O3+9HGV*{alqH{sCCr==!NXtR1+5Wi(XyLQ5rT~`Fagcya&*rc98HiL=x
z4P1Cs8=9}<TY^ijH5=@>z<`{y3SIb5?WOwDaeNq{42d-xDfF3wiWH<1MTA3}_p5t2
zo?qCg3@ON)a6P0R9;{|-fgk0TBVghgIP%A`?s5^rdjDE}PJXWO#DPs;dFd5`3F`~x
zKnDzf#q&?~t0ACqcG@2HjIQrkLTQT)R+~Kw@^o{MMLmy#x@o^H0zDAB4BGis8l6;z
zjpN*Gl%Y<fVY1~L3U#<RhrY_)4s@ZFsfZU0+sHaL$@S1w0s}%Fpe$b`ZT%@takCb9
zv$^DEaDNk2oMNk|<I`pspXzF2;DpNt{oMVa$bZIcHO?DNbP2VV`f{~<y#sKw1-BEJ
zx1~SE%8%*?X~cu80cATtlSj`P<QodoWzK5#G`Ic?a)4b@Iym$a;5)d7yPhx`DlS~m
z)}|}7&|BuLBH_#eySo_La2rEl{qd!n8EN20=mw*%qOT$}m^D*j6%A4uukPvE>?=n*
z|HpwJCg=$4xYi3kaYA6LV}ou2M}T`82ZmmV8hYDwH(1R_6H&lBriF-FY9RUqX-;ea
z5odiywC1;F((XpB0w@-)gH2!LE@rXc*Iu2ju$1db6}u!m3X72Rwa+C#aQ7DKoI43b
z|L`JgPwVTYa_1)#5CmnYm}?<VLjIg^fmOJB1gLL|3oc~-0vg!TC?oy~eTau_T)mFD
zk3O>X?ZaO|k5R8BqQRkqQ}l+{sQ_gScw}*zSwH2pnzFte&-##1^FA%5-ZqUg*)Ow~
zFP~dp7>KvdKek>pSI1lZ<rB+-TKce;4is~T&3di=rFs12e7Dm8iE+`#?&|F?HT(@D
zS_(hC{34&Hl}y<`d7A?KNk!{-o3E+lM;)vE%pR8`**8A?yM=!ssDHKa_vZSqA^mGe
r-|~C^dZcf7-9HQJ|BJX#u^{|gY|S4=2ghgv;NOX(20H0StbhMsJ(b@g

literal 0
HcmV?d00001

diff --git a/docs/image/pack_size_2.png b/docs/image/pack_size_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6769b069127e0b40eee3b19d60ca6390997b8ea8
GIT binary patch
literal 140996
zcmeFZXIRtO7CwrI6$KGHO<{yVMKlqR5@JCaHGmqLbOnL}Ql&Ss0V*JZK!gMp35JeH
zCj=#cQWXfH1rWqY0tqdYH11|b&zbYTXU=%<r~5p2KKPg?`IWubUVE)~z3<vYoj26q
zxPI4qAt9lSXMWSWC?q6OAS5LGb<=9_jc`e!m5|Ve>(@@6Jb&io$=&C@+)>wD9fgE`
zi+W<V#vJ)!8`kpiqemZw*Z#JSpm1^JZ|k0~dSoLawP)Qg2|FZj-6tH_SLQ2GcR%m`
zuO&}VD+>2NxWam{YOAAlXWhNVlt(x7xQ+9M)#Wp^X7p03rr+0rAfb=EnkT<apH~r@
zu3Oc9XVR`d`OS^hW6HbVris<uR7z5cFBl$vu_EEEfHloreQXQg#Ay`;-msMYZE_Vv
zR_OiFNa!NOMwiqgG$iG}`PvF0y}JWmXT$ZvX~%N*-(7K{$|AhziGPx?)4r)sD^gyH
zsFRXboKN>EZ+8_s!aNR3f0VQ_;E3+Y@qJ$(kc6&k9ha_=^jh4s|B7E@TiVN-(M^5F
zgVvD_$<{RUVxU}i9{MED%583wX`aim&J%i&GqQfYOX#OdDvJTr$K})uF5GR86shA}
z7IPmpynoSdABj5LoNRt7Z;xR2@ctPwp|cW=3Xh-6y*g}Ee{XRmXH(^L`JM}#q)tjJ
zwmyh59fKC>U6zjg>~=i!{`#Y5Hc1_;`fazUfpL#(oZ;E?izi%)Ub-fp4U5??GnqZf
zK88(czBomcSbub1C?fs0<b?HyS6#_IapCL+(ymRF@4Y0`jn&id-x*InE``@QE<xN%
zS|{x3U+`FNfAiyedtZ&8n|8d-s4H1}Ds{(JUGv?D`{%!|(XJa2BZ*UPS#OKH^x?LS
za8L5Lx5bmI_B?Mawr^WC{6%kT+nO`SJ#Kj&G>rRFhadM8J#JHW!hJ<yM9!fLYTJ#J
zZ>AtW#5$c>Km0hsbNG~I|NEnkItyN#Hg@k8IdoIH1aWBH7oiK5e%2biOzCwit#+^K
zIVO~9^7QRa{7Qu@{stnahQuR9j;S9}^|vz?TK`<o^vE>8XZ59(>33H`z1|`tXO3@q
ze|u?kWfAG*z{MKLR}%bM?d|Ha4>iQq>lTO<acAN+^YfCq?eEvXAV=1yL_H^Mx*~G;
z;ni5F&nwF0WH;s?FDU<v=y&pv8600{v~J>#{vF00v3C<uF9O=-UPy1$+8vg>Z##OB
zCR`exiuFIl6z|ys&5M;<;ncs9bNO@l$0DW9*Cw9(U%y)j=eqA-V7k2;f1l@&pz}~#
zuY~J~yLbB@ZPNpJ${<aDr_0y#)jcN_;2fv0q!8Wl^w2k?ZI>eoin`a0>-cWTUb!J>
zU3P;-q5XQDH<j28<9Wy3*R7oTazgeKoVRh{(kVs9JaV3OUfL7;Rk!e~?%!^}{(@HA
z>L@C@<G_ONrb9=sU>``<wvbyk3e{DfUR(&2f=`MapS2{8eO(f}8HNlCTZ(h9c&{iV
z=3hH9IBDS2Aacx7=BD>Xo$@#5@wHpG3kRRQ`Ys-|77@O;;5Nf^rA>{vNcj0p>#iK%
z{M$-NyV(<EZ5ej!lUIDbb>;l}#M^3iu<i{d;nsGlN7vG8E_ev1uGs$8(L<*HZe^xq
zl?ZG|=G)0lckX=Lq;(}Sc~4iQ!tlLAaYDari8{Uc(UD(PemG5vltza)>z|E#@;=aE
z|HZq~$96rEd>`;qraW3lui}x$d!;QW@vja#-|Bs)a9*+TMEb{HZs<IXHaz{{<En2L
z-$>``%|9#twr*BFP-fzo?F+(=n$E)^&C=fW$vdPT)U_8LK-_P6ceVhMvpM_iuFoOW
z_kHZB-TJAK9iFS18~i*(un%V@KP(&wURNzUc$YGqIhnn1vTDuJ*=G_D*Tsm~3P&8e
z_sAgm#IK48626<nBn@keI?wp)|FW4KRifwazpW!ulQc%!ONu6ikWg2&rw$h;>B;YW
zEPY1s(tLB0@v(i1xe60|)Z+3UX`PYl>?AjL_IJMPY$ubPXF3}@eY0iEDqqYYa$|E&
zl_+u_+kXh2J~9oN_VrR&+*}^tum1_xeUYbpCPBK<>a1qQJ+G%;vD2p3*3=VJeVC2)
zOI>|}RS1!z$&R4Q;kAs{>`T##*_U}RgZWY&>5d$1zj{Q<Vv`ES^qA$@>=&4quxBA3
zpShG>^*FMyt4M9%5ofi$t5xm4xLCNnKB9DW+qJ}Nvh7tbdvZ^8z5aB?#j_Z7&E%@u
zRca@ljB|<Yj3KvDY+ev7xm9!CGLK`T4PS}bI)%5YW8$@9$97O3x8i2L&5ZSpWsYGg
z^sFtdW87lg@Jt)3I#m`X0~04~A>aeI1~$xU%^nVs3W*8D3LyO39F1A{M(ED^U5YzO
z{LNbm_SX(bZO~W?*|Z~J$2hdiZSupT%R8KtUCQ1Sv`$t_xP7tP;(ny=YW$~b>CDd+
zwfuWC_mVcikMd#;JCEC?+UZdJQ#`Xh^P})*%A!aPm=9;--H;(2yE~*i$hZcB%I0sV
z6)5>4GRgp@x+8GIHig&=tw-y=9oA5fJ(QH<uI3J96%`c!R+8*g*fw0fx9;&Zv}1Yi
z>$=wsjm^$)$Q!bqv$eY%TyVu719RDz81w2S>Uee0>vUIa->z;i*9WfI*I7NE#)T&o
z`%HSRi<Eky6W$XVy|%u`WOQWih;0>1lD?p?svkFUy1`fJR(x!HH@<xg2PLCNt-e`J
z@IIaz+}rDjJ%?$^mM~Rl*U#DE=}R&*Got8FCi49B=?@=DYq~p(yw`9ObJHnZU+Tg<
zGJj~khskE<!=zy%vMD-l0qrpTpo6onQ@r8PiM{NsDf)EDbnE26P)Iz!>7o6>%h;Jc
z3nJX+IZgV^CrVRk&HOV~T4-x%d1xgc8~a*j?~dY-PWv#?u)HPHr5K^p$4=KI-i$dm
zbIi@oaVULv)9%oZG0&V11euDth%-zITndgXcuWSBs0JxkYKn-i6IWh$O_aROK+IZ>
zv%6XTfOd{nj#57~IMCl)WRLv5E2l^XG3z24@WZouf%>fvb+(?0KKsx@j{R%z9*bSn
zqq}5eGy}=sf(~~b9!{<>-_#Uky)_8ls_y31ALoB1*eduGuj5YKI_TzB#db+0MW$hJ
z(w5XW2gcl9Tff#R-+Kz`Xm!@4k=yyG^9f$Y{`5n%=GQAvFFgG<R~M6jeO#?T@N@mv
zt!A&?5D>Fp*3Z6bVgWzWW_0N3BaP49dM#1?Q4x@}JMlX!p}OAo<xlzt5QbmxHOtDT
zlqSwUmLu7=ur@D<ny$w>E;{8l2Vy7E2>U9}?fCV8r0;|77wi`lO$p5u?*Ur<S7!4>
z+qss$r2NFRxWV0B614k91`x~<pNo~G3l|TJC68+dw0x1hmT)6tLtJzcdHc6_u^iKK
z3{PpX_T9~Q_uq*lFXE)1X2DE!ihZ8ezNzcLZoRve50Oc0O&NgH2G{rR6nA=!a>Lwh
zZcHUZ`n=|D^<cc;(Z-*iL#JrZmj>j`n6`3VLg~EvRf?it5BT$t3r6I+U3KllDWd#s
zZZ>yRH>)^Xj#v9rhx7OH9*bt=WjJP}B~2z;Ed+dN8->rSwpjF*G`sZRMoFW?qZ+0n
zdELbqgB(9KHe{d9&$4bYw|>pZSCgcqs?Pk{SC#nMw7Mi+r=ltpL#`NQ=;m+FBicw<
z6}sP^x>qMP;QqzE>b3G!WD6?3B_joXUE+Fqv0AXp^$Y{Y9`)gpch`+dGhN|a(V3(y
znbX>5{epXx2YL$kU#8&qv9hwL^_58`>Evm89&BXJYC!6BX?1-$ilNI7$>6>XT@-9z
z0}<;#y2GbnbkofFmi#WF+ffAE>f(b=!|VyXWS388Cb?8YPyNjij+(6Qu$Kev!c6+s
zhS?YuZ9~oKA^$Ms1*Ch?%W|&XTvUUKq$Lw8?LFO#UqD{ZOgBu^mh}Dl<x*D<&Ral>
z!Op`5J;DQ?Gha~ZdQIE9YKZ{?-?EnM+_}f|8Js5FB&k0AVmabw|4|b8QMOZ_XP$MH
ztT%ywdOF%tqk%eJR!3y>d<P~de(PDYRE&45NB)=UF8=E^_2T=&N|!=xBg`5KBld_}
z%I(#y;3m&6j7s*zw_!xF?bt*ZG{}_QKe6~JuOP2UwZ8gWuyQ~T#hMo}CnsgoL*TI6
zp201H>Su3MCss4VioWE2?!$xy@W|~_6z-6<pt5>?uiOUsIOjdje-25{bP7T0#_J}9
z6)v?esL>e&0Z$OR&a6>pVM~qO$)eXn{zZ~P7Q2LQn3rCt-{<mn#qk6-Lr8dO-F`;h
ziYeh|t3!zmezo~2DMI6eVoj>=^VhkMkM0sWqbua&C+EB`&au2wx%^du;U1lDtT0g*
z_4a*2)dyHCR`xh+>Gj6dqAm@n!q=-!)=G43upF*-`08{kOreo8Td5}KUU^DHcBnyM
zdo$wEH-hbfDo}qMbu>SNGB6O@2R^SAS`l$gNCbRZ0sid*|3H26O!&t)A_dP@{`f5X
z_4glf0pPx4LTB`j8~d*qr;4UJI0h&Qoa_=L4nqrGUHzymq9?f}a(m=)WnsqN$3k&>
zi1YVfNk|s0TqkAFd7gJzidwK@LrU`6iz3EHejRc=DIa&@yr;de|0)q>5wo=lham4f
zCYHEcLO2_&ksTdSsv3+sIKw?gpuy%@_&i-0e^&EBeSO5CO}mfX6j~uHvi5I2)VvWX
zdD_*FQvA8gg~HdZSY<A`W!K+6_>Y$-V{Z!Qj0K=mer7D-8cE^x1&Y5O`MXhnf7&4!
z7)-|uMB^tWx_pbG$QC<;^H=}IBtM2~7YGJ}qdB_$y%m4I1+n(mny1e`{;c6=mj~0<
z)c>=V+;sk?+VSVde`-~Q!pGKu!C;fh*ZtgN5AAs-<Z|o8?H_aho40gGg2BL4y(NEc
zvUVqqt>_ZDyZR@W38w@G6KM1GCw;Q!4)C#c>o@-FG8uuv%vLoU{LLJ{uggCU|Cf>B
zABX=dm;1-z|1#|S6XE|V#wKqv*ZI``?}&ORVwLPQM|s_TBa*p-<P`n1;*}&*MO3hV
z876AM0xmJgL^U!r(UEc*5Q)mc)To7WU&Y~7d_A4~_$cLw7N1}wvs)PbSt>57toljR
z!f~i%VTgicp(R{G^%_bZ+iA38t?Q#2ngu%2QVPA%;)^E~R`?{!ysKGHCpSLI|H6LB
z!t1J%GpMrsxTvF<Pw?W5Nic*=Qyh~bf9z&Wutc{N<#wJQtx5YTzo9V0hNav2K`)iu
z(g|N9IO`yvSPIQJe6n^c6+tSkKu0<qfl5TEK@YR;T?k>8$zlgA8`zV73d70!g_nU#
z`RkK&|IR`+6(US`hqHT{70`h(M3|=JluIsxO2p^Ny@5!$!*wM+TlS#G^4TidD0zWv
zK7!PRX&wqZ9TI#ysGuM)Z~{R}vr36tXs1AtQsY%xl2g@4sKiq76Lg@a_W{>%3iWYR
zne$R}^Cg|p5OgHb78~V%3JUdR3|hHVs6ZsRy8Dr=x>c1Z`Ov}?JhvFYl0&v4r$b=h
z0{yD1yt_iAXGR14Oq7s<kD~-M$GojdO>*jKC|zK1oj;|Ej*QJ~)*JUetvKu*fl~LG
z@OEGPKKF7zn9iTl=hyo;^*6OXfnJ9H3;2A!bBryHSz>QQzkQcYHFJ`;Db<qvq_qbr
z_#Eb`_+nq=E3vQ5PtcJX(#cW&1u1x3piJ>Gw&*>LfyX-!RQ0k-5Tq2pCOv5{8^z&u
z4JdiHzJroe+FS>DL2k-U_J!_&#DfJBjR;bBspCDO`gSL}EpMXWC{m73V13Y&W*<`=
zhGGyf>j%F(YtFeEHnYO`t5F(*y6An7HWLRHu+525DDmA}x8>DXEZwN|kdW5=SB?)q
z`z-LtJ9=1)-hm33q-SFUR6@T5Lhg4V<V7NQFXN*YYU|LEVSz^_r$*-;<n#K0?V6|v
zLNNQcN~6K2%KYfi;qn#)EUgQ-F7?v~z2K#2bf9$@`zwOP=tEZ!ov9vIa5{lsA`zsl
zlMo4(j;iETU1(N0VY4~0#_b$Lf<{fdv{dC{6f)b1AT<CN^rOZ_`G2WHdnx5WA)_%j
z`8kA_>c5Zo)ha7i*S48Lu$sD9@}eIiFM>9x>7(~AY$n0HIcojyL};fH9#*b{^ph^|
zM#1iyv2*xcZNMYtq?_~(^udvBH$G$`V7i~^rxfQVGOOkw5`7vFN$+c}FKUw9K6LI6
zsCxjd70u7mOzUgWtFq5Wkfl6p-=ZUfz`Sf9)VQv7CATDK+pv+$96tx#K)V{(@of0e
zLV6%w+2Z#hJ=^9%u(WJ$NCfNQZz_Y_I}}*<c=GHIM005Ss%>p`nAb*sjK8LTn?EBw
zQ-gV}c_PGWa3Eyi^;Uu-f!%WW=`}e9h%kQsio;uj9OZ4gTJ(4<Z^)>G6U9NEIfiPC
zj*DqZxD{6f9ldb11reZUv0oA?w$u(*Bk+RaaD5O&LL)6LsvAW_kPhY|NZ4ouNe=Tg
z%HKj8Tt`lyJ4i=0$~=n!*JnJ5DswSV9PXhx$qN`!=*U4A$FrX=LM2mlc>3CsQ^eK}
zdJk2a^ms|&p<%E@<&>SOH!CvZl}ARlij_*E!T)|Qv%c<!F=R?692?Wui#O{9dp96R
z!@i2E?=>=#1?O3```J*R%U$9YXfIld3v^|6&$pPlc7&eE)gu9?O4T3|nKH>y3#UQo
zw0s=p@2nv?HIv_}#{-v;-cBGk$W(W>B+Q?LpL9x@yOZTHaq~3pfNrP&>mQ(ag(d}+
zAeI@RYR#6$-Ei12*GvQ{wgNr9wlS1GRz&6)JIL!68{vZaa!FA~E8z*mH#=23(N%oy
zUUcM$YHKGlwhV+H2EUUZZ|O7(8D%U?B1kr7PK}rvPSiq#sp9Zs7w9M>@RZ_<irT)x
zV0mO1-;5PoYMmJ6AF7RX_bJiM?*jK-h+l)E=a&*K;*|Z8>9Dy)UR?-8VvsKGbA0sN
zT&rGAnKm-auq#OSX;)EOYr=IxRoadqPu!N(*pw*$Gw|AZ$CBziCHJX0LtOAcUZb8>
zpes&L(Gy~TlRmv4xxPuP1!2=PITd>Y>PTpClAkM6l=K`nM=f}f6U0HDNxGh<%&Ogr
zj+_Tu+3I%w;Fv=rMvSXGNHYY_;?oiQknsSp!Z9#4$-`6SN6H!-*bnJE9k=w;iXv|G
z>VPx~270)nPR;i^)`y)LGr+5~4#8Fo_z+T9rzw>Ib=uyw!NdW~S^6gwj`#VT1~L!6
z-BG@JrW!$-o&nQum21?K_Pd1RnSX3W6wi$rg_U8HDI*vJX{kpS8ERtz>@}2yC^yuE
zOHRG;#aSh#J;mos;d;H;K5sJksW7tV_3;~&+Sm`}AO{gox{h;OynaA^yvVlbXmMoA
zRjuz)?Ahj{!;|xM)-BGiO+kdAck?7MY722tNu@DC|MWy^lz+VdxY}cJM|ox+6v_@2
zbh&{vQgr}&bV(r>u^2(^fM(d)w>1w1BiX!WZHUAYkBU%Er8}%n+U0_3RDEjHm>^t~
z4!J$Ek+Ik_0g*@>)IhEe(n3darO=U7*$<uPmL@)+r{Cv-tV=9GE8lf)aA=gdKM=2c
z*KDv+FWBpp;&1>4LFoT7bJ1Xtx*t+km=g7hdBenDu#>V8R-H9Drq<TzN;hSU!i!xL
zuMpjE%q4`9<kVnVT$DLK5djOy>Pl$^+pg{t8Zz(y+_Jbod;UX$FvyWvejT=HVJC1~
z9CF^fayVIPyq<7$WRV-d2UCv_i*=11Ac&1;jXKebrO+3@&E+5nT@TtoKr2Jzx!GMx
zeduXjkU-g|rTr~&I-A`OAm@$V)Qfh{=_*Nj(k`GQ;R9;O^>NFAISm{2E%mW7aY1XZ
zq-WDQbX4)iBZ8@C2<xjy`_jy^Ez#4W1z|0nl`~aPhy;b8fSeb1fi1Qp7OQ#zq6vcE
z_9ikD5hN2X$dhGS$n}2A)(x$2$F_yV1@<X)WIikvpTw!}LQe-!nmYL=E+xdg><Qi*
zbd>)@F?!nPwKeXk7O$9-Dwx8e1FJ=STN2)VShmy%*jsYeXl3G6m;$nTN%|?CCe@X-
zelvVA20=RK*Q{sd?}PHdg8@bkmgyj?;jq-5sqW@&m3i;=c*hl>M=7sb+lCU^IVv6@
zEXsB@Vu^2CB6%QvYOIi1of4L&WNTJF5Hg-xHekKPu3}I(a(gi>O68LiK2B|ELGC!Z
z3OqJ?DVP%vh4?HLbRh?lq83t$ptoc1<R_*ued;hE=1SN$CBQu_F!(NYHUrt#)X8PP
zHbi-ptMF;Al-j%qN~l?-tc~T#?-We3k+5Y;=55adKb&6W(vZFx@I@~$om7wWio<<0
zlzg7%ugLWWeVaS=IfYWt438IaU`b7VoMJxQHOsfeL>;BIfHmReLPzC7jX(%(q$7$G
zBk6J_ZB={0GP_~QeubL*n`|8p2CGOS$|giD@NYxd7NKBun6HZVA?Mw%W^?Yg#N~hJ
zylZBpy%)LuV<Kub$cgNwuTf}@+-$ah7#4urV#qrrIh8^KQKqp1?bQ%JW&7T2l{?uB
z%C86lpg1ZXiX@ueRSEO=LV38)RYH&g;mRlYOfx?lP3ADd+TVn#NpH4<N>G?jmGAyi
zidsFWvE*Tl+fr2wVRN#(VgNerFnZtI&*;!XJMT{s^tw37w}#l@YzEKGH9gFm%<7$1
z^?^v3tT;~&3s6NyZ>+WnJ%=h-asdE?Y|AvlrgJK?*5_x9<-0j#6~3ipO*&_{flNxU
z#P#oFTB0k?<0@tbU%~SgS)L~T#eKf$i{?J(as7KH`w`{PuC3^cJG2B3UZ=^XD*Lqu
z8sLI439s`Gj6WTFUeWP(ULHU`MPSHc)^PLWyX*@g)(!sQ-EyvJcwi}QAwkb^G>2<A
zTD4>ek%$+~NA-H!Q|9?@io*s;PV{%#PRzuph4<xP1Ehhdu>MXMQw1HEE!Pc0t~Uxo
zSA;pxG6!C63y%{m8wh#W&t`H41M~be#w+}*ZCQCs))$w8dynps^b~+6EctsY4nN!v
zVYm5^<-X*U?L`WP@NvLowV7tCF0i`yf>E-+Qo-*8TiCAvXvafG#R?xl*>dVtd?QG$
z-1X{=rBs-eS1qngcHmHE54MRt=_+ao|0B{CWQZo53tssHJl0rkUbvkGFi__Wh|9z!
zC}axGjaq1nhe+5Sg-U+10q&HG!i9?T=pStBWZ;<_rr&Nww+}L%hJ9%7^`zTd5^klh
z0Wc<V*|8t8K6Em&Ey$`dZGpca*@^nU$fiaeUA&1HgFj1)+9@~!$*8%^MFbev`l2}!
zCgk+7Ed6AI14r>wxd1%B?*KmjMfJU&F>%LI0}CZz&k$Brv5qy3J;p^SAhCKNuX}=J
zjf~JuP>y{{&kW7&I=_`P>t5q?o%CrtC>`(^{MX(EiV_<X6!TDaV09k|w^UUWebG;I
zACmdyhBs<bB2g(VUOBe$=sw9pDjMyjpDgEi-$eo0r4hOxCj%V)a|Zh2GvAB2g(?ud
z;bP4R+E2jq&h0{9JX&@Hxy|a6(Y5A}hfXpovjN5(2}e(VOFai=WAX(&5~7Ee&P6}R
zqy?&x74*fcbdc+1xyqUL7jQa|H)+LPP(eL$I0iA6Z`i$XV16Q5`R<*T=Am>(^~cr(
z^k82b36neLAA$<r%LrJ!odGssP7}nX#s8|qi>0M$PMkG_J=?>n$|{{)=pu1>CfgJB
z#y=XtjM!6baR3KhI8kL|mLN;!6G3W7PYW#nkn8RP9gVT=b5b<QZII)g$MLS@p_OCL
z2AkrrHpRX+80A4ZW~(vbc0Sl#0<Zl@X0pF2ZXkB93ms{%1)x8tdZrK{CNuY@1mQLe
zlpUN>$$onZfU%Yc=%}|V4MB2>ZPgp^Z0-zNbOAugZQMcLW|581j2(%+Uo`}fmuOyV
z0xMpD)1Nzk{z1)GNe==e&@Y6gOQ~dZTyp8vAr4qGexvl{FmJMZwxWyk`s%~nbaf!3
zh8;@@%3(qujrbwFoc*APNZ{vqOEM;^-T-(zO+W{BjkL8Sz+X9|=z=UCghONcU=VuR
zyW<i%^2r(ieCZ{KfCMjXWT==-*cM)*z}we{N6uzkrqnKe1E_>-cE{`wPRXu%ZQ07M
zV9ofL@6~Yk@TwAPkg$TZq3q8*!3vP4=n}taut8!_c5<R?TNA?a5$FL%Cf4(!%LP>=
z^WhCw9Co2b4INnJY_fFIB3hLbs5nfFcFY?1oX>(Ck-R{?FJKKM;zfP#sVrg)aa(L;
zfD3w5?UuZ-^%pCDZ}3Giw6+V<O$<_%^y~p3S(%2SYcCkzHH}ZoVMfj@o?PByVOZrR
zKj`Ro)=2Et)T(|&?BKwMgm=#wPziFV3Mht{U*6~|UbzH?ma$ct7(92-!vU51E<3P&
z#Bzj{O`)dZl}!8$E1m<;Ky!9+gi21`TSm0RBS_XnC9^F-zcrd@L|0C~PME%lAhfv5
zwI&FY3~|i-YTd9Zd)&cS+{N3#1#I1%=u=rfV^e77oM0Ip7&6v~%)ywVvdSANA0l=0
z?@fGQRjCwpsZQQ&UU>IjuZkCmfOTlNf9M>kwTP8{j2G3;7H<2HzZ7r+cd#4ZRXQ7q
z2>3Gc0Uf#jU~|HWG*DTtw$*3*SZ^hn2AvOK@=O(wY_V!LDga9*qv*F|DV4N7P`+-@
z4>Mb~99WkxU2VmXjAKjzg0#l1B|(3OO){P)OJ2m6zB*+$E1c(N&1(Hp*~(_(uDK~w
zDr;k=Tev7(wOh7qp<o!yskb^f5GR}F#>jf#dD3LlToB+s;5jDz@Yek{sd!q&YYe_2
zo3W7PbZ$wrQcH|d`-C)NYb{RO>UsV3M}f=>T`!gI&U9|8c~u6pjQ@jCZ`KH>QXCo;
zHd~<au}qpJ`t4#?*fS~&BGGmYMW@guF4?+)lH45>Z&rCN36v?}JeQac3HrOGpF~An
zcD;nUa^uC}#qAwNVPQC(tn7Gn^7X4DJ(y;Tuy7sxv2np^l*bgYS{d4HD>L7^9kn{s
z{c2kY0ijG9%dAp|veSD7Bli&`Gf=1LzXtI&E-u31>}Qf8IubdU99702LMR^@b2>SH
zrIyCM21O1Q;sZO|Bx686*bs75vaqQEdYi#-^Tp{r9?3z#2o)<YvR*EmrkFn5Jk-hZ
zgN(L1kE}XCo&y!r?HPAhBcF)e%wniibt0X=)JJoWcca3Qs!M|<t$J0t@p!I=8RMB7
zu7ApnClN+gh2(VuDxwzzm~&t~lT<5zk(?TL0kuaaKY=&|2V8~~eAF;#gcacqqR40|
zLkSl+Yge3+b>6c91~FkxQs|75Kum61_T`|-VHB#n^Q|Cl5ENI>4!+;ga1P>FO*g%C
z`@w8bmi7$6`v9_S)=mjI;0Ma<FQ4IHC(9dq3#Qr;0b4GIy$XyVn$kRtZ7GAgFyx$5
zTj&dppu_+Q4Tfhcb8kQ#$D{ei)RYPKb3?`9dms|JjhCANm&sB)e^Wiwsp3Lf8h)<h
zhH#Pc`4HK%ftDa<!do`;G=XZx(ycE!pXV1rI1~CDql~QNZ%DcRVrf-0NP`~mQSk!@
zKO}s}oP!=7yr(YU4<lgnGp%iU#&CESvTHE#(|ts_O_)6@m#?J_nVRs$*)#+%YWA@`
zLJTQnFBmVm(Uq@Fs%U2a;D=Dl0ubU?i#6X9yxnRM-*J?B@F_BU+>QGZp`5%k>`wQ5
zCHghi6h3to!~hyVw(wM}@(c^DJcA({T|?|T7>O_ryKD)ATx++IOsBBo(L16u(-M_O
zq!z_{*-zDT)B@ndSvG>C)C(w{EMHV@SMeTi6*o{5^cn?Bpy=<OoEuD4j<xCoykUbf
zI`RQPl)9k&*r@Vbr1Cv|3v^^DsEC7U%?QFrW!v`&`q<^@`lU;y1u=%1t^S62ymJBe
z`Ac1<G_g1i?-bNghn~LL^aKDYq-7g?4^q~#r>)b4YXLSEbm<o)+{^pZ1gMmGfR^nx
zMA6yN&M|Z`6xRkYlFo36q<m2O<%Bja08ExYXB(%y${bKIt-<b!!-MdIDD#<C0G!pK
z8ILA&5Tvk6AVL1>>e$G^E!?FP<uxTN&V9=~(glA=pK<QC(J4;ZsqRw{Hq9f9K-d^2
z$LNk#1IM=t6z?P|`Yqd(_y~@}D9=ow@p3*3%p!nYX&*1d?bLn9VQ>J8!xkLq(_W64
zfq~jZp9jNm7K1oLVmYWXOEnzn#U|XHm{9t<4%cW758*xEY-o2H5N2+>G8xvCN*Cho
zZH;VZZ$p)DP=gI>0?RH<^Yz7HX>%PA_QDNII8_^&xGVqLQ>MVUFC|#1MXeOjH=Er-
znoO=xN4Ch!d|fV7*qYX6*%hs+ymDe*fc__1p#PS}lJ9C~^BSP@&U`+gwfcKJvssJi
zUvc%mcBb(f%NE<MHYWUNQ{1ix3-J%260`}lGM_b}gq;SQ!2?j4g|xQn8ONT@5)4nj
zoPkQDjb(1xcI0>T-#!8%n+I$ex`qx6F0rT_1klIP9e}egP@JZ;HR+|6odd)eC=sAf
z(QgT3Z=e!k1OU)DWc0;H(^H7@DI+ClS$_fcb|3m3wJAa08S^;mmF(6<vZ>;Wo+gr_
zILj(Q`L6nYD4T9-1r}I3`u8aH2D3Aj(*+(<<Ku_&;M|+jDNXA6Snd>7-BtpKHng9E
zylfRja*70K%}f^XmSR)X+`N;0>15VPsLu*hKLgyBi%+=sbH`5c2KzvsupMaZj7{k0
zG$S+xrGVm{V1s7^uxP`4qK`Uzxw!|psJV-$5)vod+g8w{$kAs9s1?3F6U_}mpTRJ8
z@;w!hZIgi4TDSrAS*kC|(k=@D#qrT^?FjyFpRzW&lzM|~uHdWa?%rujfma@o^t=uq
zotZ6Y{zBNLD>$Z#tRAd73JtEljFhEnLtW{1OR-6gtGU_OC~fw`u%#T1nGb~AT~CbR
z@LYLVi^nxMQRBfRTm=$@P3KTbWuD&*>~ml#r!jee5nMii3GO6gPAbidOA#wml##Xr
zMmi(!5j%;MX_p^toEY2#VHZ@O@eLYpe9RnHN2FbIl25e3;xpZ^!JQ~U)t)NLWiM?6
zkURZmw=@2r)WcPd+f5}F#Lvb3A-^PB3Om_^Sh;ft`g=34F%9gW&cr&*Spv4xK!)0u
z(3d0oL_MIM{>hD;uBq!Ah{G}*ogCy_FU2Xx{<fp3vqQ;Z#P4Ip)XQc)cO5TtoNT^D
zv?SEgl^1sbPDjbhS^y68#C~K8Y$4DVp!bwaCpvLFq^}1ZxqTM6%#>VPg7)GK*`Pb0
zm0J+XM=M}R+2;#bu+NDq$n~G<A-6$M^NTmP2;#_Dkf{+Yb^+3xv(w5g3+to~MYa@A
zb-=sd>jm@T5CrVW*!KzA9%|4GBE<++`^YpMqrB>uwzkeyo3sxA$c;<eBbraP=D(Xw
zAs|SG6A0@`B=zuDdow5ciRZ<pg=bp!wjeIm-49*eWx|)I&x?ULAIt2!><Kda6dUc8
znTbN#O=(i58i7}bfUFpi4K|q{B%^@tjgHI&F7MA|K-rhay%DHk@r98GYfz2p=Mx<o
zgSF=*^TE0#rQo|j0eo*_US$F#c2}o+Ehh73v2MA6Hf4&)=$3v)5eULdK($&(52for
zmkqMPrRE!=CNGqv3n~T}Ou+4dJxhuiJihPXxKm?Wy3Kycm43s33YIR>RY59753U~%
zcY!zt2f!ItsEO7AF4SdSr`kFsRHa7~sheNVLQLI8;hOs<Xj7uO_T#lc5&M#eSgbDb
zQyex@ZGOn~_LHf^xyT4ts<l-tcDRHX=GiBfQ_1@)la^Wv-?H-<)aZb76$93#y(cOM
zFgYJxoEqh}_(0<`#bO4_0GXi?)6_ZAaebm4@PhtnXl1{BgoDT~h#>Ow^v+bY*TTRH
z1g!5h9~5MUQd9)G`mP|=TCN9IY$cvQ*v6i;*OlV1HZdqio_yB^3#NoEuk&6c{_=<9
zuq~_R^?+~mDMF+Q+b0iO+E)UCO=vL$$r?pxL)gK6%Q_R#?SivY@S^9kG{Yu35z19`
zfb`!|SZaWy9{{@p-rn3fV#1*HqMZ*6hvz%V>-IEu@@p+@uR6)+h3eyYw>b3((p*b(
zK}Mr$4tJSDxw$w2vfI*Y3^e0B9tdFBT`O7;B-6AeV}e9j0EmqM@KWI>*@3?dX+}8F
z%S-vdJsdIMJ8D-ZaQkq52)kW1@ElHOaq1&tY_KM5A-`*qgu+$DCW66TEYJv*HhFFZ
zbEM`HkLpNzc1uA=TXi%5-`m~X*<>81DQG^8JNU7;b4n%<9jI->Sd)vuH-V%P7NTRn
z1muV>j%bz4$t<nBQQ%oQ6%ar+@?td{=u`Z`Cjc9FHO00X+e?HIy>X#$uCW)Vr%^bp
zJI*3qP_?UM5MvLi#dYybo_A$5fO2hnj3a#t?IzO$2xeY2f-t8Y+o+csyj*F38!AFC
zt0J$%hUY{9NqgOqF1VcQ9rIyBidHvqmV}_%Zar<_6OonmXfaU%!k%84JJt4y7V<07
zh`&&buGo}^onA0yUu}uieY7k%0_osjV}dsGi4pZeC{s6-^>CwPcSUK40x{MVpI;l}
z;yqspC~<Af{4)lXZ7ZkxhC%(n!WhZMD-%rKc6`}c-$2}sj%@O6>;x^pmzHY;m$sD<
z_C>PXv0xmiu_B7y9w5qff#tC-OZHIq!g8e<@(Z-X3s&W(57_T=*Kfp=IhGexe*m(M
zRa@+?S3Fnyq1&U3hpa&=$<kh9V6zw@h4#9QJB`C~x>zW>zU_oG`t4^q@#X}5b2aGg
z4kEcJLHk0;Zsee5=F^$X2*<2K4!gz%r_(t35J6qqz6S7#-++Z9`heay=0Mk9W$LZ7
zbO%9V5)kA{P>foG8r?=q9;r$R-7Pt_6u4aCKj_u)0m&$r>(r?5P4EkHJ$4s5QuPKX
z!B6t_j90m&xi~gTQb4e>3O)|ly8W?DokWlPOjmP8Ht<n7K>*GM=XpYL*uAy&_q`(F
zR{}v24G-o7Qeu~rzwrG*O&!Upv<U=^|JF^B<9PYf4&Sc5=s-@Z2I}nR-;|K^u@!8r
zJ;W#6u9&Y;wGS!qBBbC4lgM1}EJ|agg)Q5HAUE*(oZ^c>0wEs%^=Uc9Oa_Ce%y)p&
zZ9$;FT#l-lTsE1RLy**45XGI%gcuG0YgpULFva0Ev1UEvy9s0-S+#KhtP>&(Y$rU;
zQC`g5A*<Yp#a0!2ucrejZUQFO7GS>@(o3nYGIhFkktytRKpRMd9|h8MWo(;XaBdn9
zLovV~Vy%<#ftjJy^msh2(!C84Ai@Idw!?E9T>qzhbW|DciE?aC$Y~T^ILo<3Dd#=d
z8YX)_+_wZ(d{K0>MTb=B6TK=I6I^gww+hk-K7JEum{~JO_X~e8B-~^IgX%+Elg+Zv
z^P@~aO;|vw$&FcR0*bLQWxF^?@tu)bc7j}S2f8-*AWUA;(^VVU24aD`FC;_Y!lM;9
zFT!C;NVi8lL?08fCDGK6srB_GqTGHq0#GTRfJ_}q120|r?yY^Gl<+qK-pUW}wc`6l
zHl(T8e^)fCw(WraaMiW0+roVv>g%!PfJkFIZFj1#w-L~rySqgb9ahu0R5zUTyXnx?
zF{AHmA-9`U!fa8l;}i>YVPS}2(}vd4`9%=odYn-8^`Tx@P<DVpJ=pdz8aDqJL9p^k
zRkcpsDaBnx0h|UjWqW&y+#@_CY;d}Hh_^irgQunRb7+8&Zz5b~HGe5@S|}T_mBH+k
z%5FVbyd=f7z+`ekGOxWSJ7}5zVS{X2J=3hX-*da;>UqX|3i|B?jPV!@bcG9eawRn$
zXa|(jU~~*8r_~gA7S_E#b!R`3zM#Qo4;nRZkZrNGN!;1b^wD=(O&jdQI>YbDw28a7
zZg3lYE(smgi3#Hy*a&VFHh0#hVH5GO^G_liaW;8R?TypS=4CPXdGF~d(~(%dyuE8q
zPt(KbiC_Z&5jDAv_4Rq0yOA<=LyY`+2-_<kyd~AW0w$-ajm$6YW!VOrp<G>C9MNx!
zW9SUd(y~cXcp)!)+mbNgO5O30g(oW^w;N_^z$I+szQvwCsJ%UTF^7{)p>r>)g?0~s
zsT#v-d0F$d`>{zo1$_H%+JrG<uwB<Ldt|Dj08|qoyK?t~HiO<>&_QfIrE)$Mv$JXd
zU+r#|Pi~W0bD(L1O{odRGfhVe+P_3(VeqB%y$>L@w54`Ou$0bRvcNc>=wXEulvh_A
z#N*81wqc)1&65+ZV{oTR*-#!a+n6@Uo~#(0sm&|37SHcH_dWI<@)PM_;bN0S(LDNt
zE!o`72dPlIv{g^%nkTM54iF@fa;)Si^L{`W2teQ(Yl0r0z2LpXr6bA}Iwa7L!G-hu
zj0uWri(bFyS;b*Z!1ZY<ffUQ0MHCx{WIj3vlmMc$Ni3E#oDGp!GoFl>n=qpu{=$@k
zk}r-*_?V&ObwR-wkYJ&KG|HFFDE&4E`dmyjKuzXGc>P4#uK=`u7!iUUaXMak%b9pW
zdJu@}3(pZGRf4MI1%BPE#W|h4OZ?Gk8}O`w8PFv$D1%X^6{j_*On1LuP{_k+Rp;h?
z$gy8E7~l2Pzupl&{adYulRP%`1g?6<$VWek9s?!2RSkpw2>qNwP&!$lo$23fi5(xB
z2YgHEj=&?EeLPyxV~ckU6|Xb^c3EqgzlIMNVyDfsIaWO^(DUHKfI_80X|F%9+Qwv;
z0P3SFgygXE_Cq98tuRq#^DuOz)LdI<Y)-#V8=}0UFBT#Z1aK6^G}g%X(hUO~wz@Fa
z6foSPtfo$v^2I8|V*fReY_rs|@)^q=L^NiKvf)VD19IrB;fDlmI_8lwtBne{nZl(6
zJWJ$43w)fB6L@EkHV^o=DlEt=cX)xg>XB1Ey4m@$@p+)Ae{6%nmt3s2HJH-0TqJ~1
z5F{~BHM+O~NHq`moCzj)kntQ7=og<#07jlLa5Djp3infzOY#&}9txze-O9VfD3wJs
z0f%h!{USn^mb$m1ITaHEKt9LB_c;Rghpx47>m+Cx`1D}|d<~!(0y)N%Rb-5VDohKx
z)Nc;!t*(goV0-i96CLEms#K5w1OTPGv=1Fv8E<LHY3qrXu@;B>-m?}r3eAM8GXwpp
z2Lu0T<qa=aEqIl+Dur9=EdHapcjnyCOCWO+2Q`uFV*nPkD$zm0IrKgZBH+l|1%<7?
zu?Q0APwCLW%hnmg9)Esrs(2;V1~o#_CS!Fzwj`XmcN&0weA@JbvZ;}tY9Q;Vcsp@&
zTM+#B6*BUYQ#?Qpi@kwl*ki@~PT@jlhaZDRumEkz!W^_RbO%RR?jeBa!64c8$+afj
zsy1ESekgod5|G_l-atgrbC>ve=tzn$RAPS$n&a&PTFAC!l|T-^nJh;1$VUKG%PcLe
zR2`6yOrR+Y^-oWD)Ze3Ds3gyS$^<u6zJWkX&{11WVz==_LC?k;L)^dsX!IhDcesJx
z0z^>5fn#<sK&Q^!r?8oU1M=R1kK3qY7;H-jfje><5UPE-plrL(binx?fFAZ^&$zQ+
zahAcz_F2h7JXjcA(>}B++{vt=xZDO%ysSX~k=99|+}2NEzL$?Hgh$0=w%7$o{YX0M
zu{FlTN4+{hj@cq>n_t#s0RqkkY;AFy$DrQ~?uha_*QF*CzEP#-X{%Ptx#iBB+hB-m
z3}!bayMqt3zMVcg-zC^(4Cb0sV=7hoZ1i6ZV9Ouu_`;x9gq?nVpwJz~Z%Y1GgU>O$
za}pke%8XQH?wKD=KcFSgRrCOMVx#$YP3>Q`1FIM!5iuuO=%i3|UQJr#Um{yDkUg{q
zTVIXr`BP_;{W1{DN&VMAQhqG}5)$cL9_Zs@IZ2v1dkTwuO{kdz(#{;e8E<X*QM(AV
zV5AYVA%_o-13CPP@$ZU-`UMco6@@KQYqsqk3iJ~_ddByA>Al=D79O|uz%D7d)34Q}
z{`B&HPvv_Z^zU)qtkK!r?JO(v;N;I<{~MS{bV6VKe^~S{-vDIs^#xx$dT#vC#eZ1o
z|2~r6zj<g37|q|xuRr{Y-+%bYclKfN7tqqKi!o?8@s}g`cjG5m$$tj_r=|Z-6aQoR
zKVZs#aio9T=Fgkx|Icny;IAw5(0<P$ipX+PHg5<tN2Or^fNgJ^ou3N=JYW(4C4`^@
zZF)S=f>~G<7Zoil3naO{fb8}G|IFbM#O`Ljs+Z`Vg^{SK%f<~R*W330O-x+QSh(L?
z=%B?X7<-QG9<Ie8NG#cS{3&LYrevZ0?|oY)tBe|C(xMgy!!S|)FX0<3_YIc<>7^BL
zqe`>TV1aWZ1?WJ$KB&ZC<w2x@Ot5vA9B5I!j5{bKl{zy8OV`b7aHsD0iLrbIC#+gh
zGt?v^n*hORMgWxh`LI`)ah5mP4GlnypMC?_)9vG67(D~%NSC}Oz2FIeVF&YNLEB98
zT|k<9F%p-H=PbBhe>nuS!-qxsPwWShjLRj(;k1K|di^0rI4tXN8<3a=nh^tMLA^YZ
z1=QXDvN!+J8#g7(07;{0#5q?~&(6R>-cUyuw7?3qpgb`XDT|vv-l~#?TsL%J2xys>
zpkXnf^e~F%V?hH<*g$J1pI=eXBrkS#xygY2)gJVyjXB7dFvKO2Le(V;-2p)?<B0^C
z{YTKRmvu9{M^64AL!8Ysap?rhNcov%ECHtibGKTmhA$pC56g1TrvY=(u7PeQXJjn@
zWTq3HtwI-<kUM&Kxy$Q1>x(2*qDcy!RjN}lIg5bV6o+m`U;N1a1R9&CI0}&4cg@Js
zl>p#{`Ch@%qrc5XM6zZfK4IitWKWa6nSaBYzqDKO^<{IZxbkN%4s|3s$-9*uM9K|+
z0wQ66lM`JqR#F{og0smA8_8NJkEOm}?y`LV_=hi-(Sh_+X`qQfehxXmH0TCM<tTS8
zEo7TU4!Z)N;1t36Hg+rxFovMS7|AN3{a|4KGW~!f0DS-|pe(yNwBnlMfNL5}a+KF{
z0+m$zdsp7`r3MW^0ICAfYgtYkvq1;uMQ~1%<^id-G+4RZj6ryV#@F8?9s~%=8|bq%
z?%eM^8qxH!tcifV|Kr8qe`+oP=iPd?l?VcqJ=#Gd^*Plhy{Z@Jp4pT9YoKeJsEW*I
z5f8L<jv$MDuL1^y1G+)x^Q{0OS~?*vagKxMW;#z40Nt6L4wS!zSFzaxuebfgcJc*H
z<xF}mriQo0|AYf7bm5FDv}`~|3jJ1a8CJW#AnU{{=km1vcznIQZ?FYUcJO1@7vY?=
zfp~n9EZdW8DQn=;jUk;b^0E9|2v?*q`2vOV?E=mJbTRXpF;}fb6`P^B_Z!6e^iZ`C
zO8O`d|AHhuSIONyWtp<OSAwkq6XFj&1(a2QHaVe3E6<s4^P@ic6VComT(&~ENM%Uu
zcgGUJ?I{RRfR@#S#Pv*GHc<sFyuLuLBl2G(V4;I_1g!dbuF1RuAh|6Q@eTSRR-lor
z<KmR|E~nq86W$|{9IT{2NI^C9Sp5Fp8rSs;p}Hv*U4I-jRFA#MVx=@tP5<~2vHWBF
zQ|~Zq=1sNvwm6v8F<~+$I}``+ToY=hy6ij^`vG&*{=TyonuMZHRdP-)524bnt<zi8
z`tkQ6EFVevJC7;a53r|i2w1q&>vW3Xoeu>5Cd)kn#jlfgrZRSdNiG4JU6!~6^in=(
zP)8P<f;FQ(t-?4rzKY52#eQ!|`peA;MgU3=aNhL#Wrk9Lgm9BzW2b27^AvnTrgO;W
zX#|YY;oGQpgmaQgtcqKv<?p$%)_@$ek>Zq<p<okuY*{`_2P+ozb1Md>Q*t*qt>*OF
zs<Cef!Vno8R;ca(jlJ=fE}&Y)KY*}1Z9{#QpUPE-N(AYs0$m{05A@Q=b#cs<BoBAV
z(aS^2fr6Ob_5W&uM4%DqxX=)UTNDD2*e${SS~>!1toew@WfvI>yK!Ur<sbr0MX;KI
zShC~~PD_LpLqW*n1Zzlowy%MhxRYGOo4{eM<<8pwcf)u9Bo-QAnMyz?rm!QXe|<{@
zbc!x$eD4I`9xVSA={^}71A@bHL$*@sKxd;~l?Mv}W8VRJL>mU~PPGQ31}BKjJLKfE
z^rb^5I}rrlm#$Z_F4J?-KXJGhWwG4X@xVeZT)w~retSzTXALblhZda+)EEw6R*C>q
ztq-DKX?~Mlf0(V}m9f_dl5HwDgwccqtz|azs!^Z~D6dsdnhY#j2-2))kmB%H(0-c-
zl2-Ks8KL~q85p#7**z}zQH=hV5B`y1la<3y*^ksBaz6&H-IN>>F2{ys2xTWJ`#p1Z
zkoR?WSU&jXIz4FzT&4%Gb$vzzOxkq^^#G{k{vBjb1K0E4`|TpbSSaVn?P^eV+4tah
zXns{NJdI0xFS~hp&xMbzUl8b8AD;Q$PT}QBL)^uFZKa}e{lx?*=w!7Vi+F7I<8J{Z
z*Fb@MLH8-~U5bi}+Cf&ml0AaAablF^B>f-2;J<9=Lr7uS<42K?yH5W7^q2SIEpR%!
zc*wAP<&So+{~aR$W&=1W;U)WjU-skO1w-I+l`<sr=1+uI5P6f8LF$I?y$1UqQTR_k
z9XJXu7iA`B+x=yk^B)#Non3apGZx~1pTU-AAZiFMzok7Ymi!-2`KvG*j@}GTy8mPP
z|7zTSO#j!Z_aCqS>;3gl!1?RY_D{h1t56I=(EpbLj-BWh|1v+bAV;Muj<j?tv{FL+
zhFgqOkmXRgE6&=c3SHt-^ry7;KXR+<>TtCQmKw1yrc4_gKzO#*(|u%rgl+Y{D>;kr
z!^Pps!*S*zvxJt&%!WR#hWU4g?t=PH`6lA0vU|a)^>1Xa>t6J&+G1XOkTolsm&fls
zEHPi|R>tcyi<bK&!>4pyAx=$APj|EuS>ELEj8A#oH}?RX-{oe%{=dfk|6_&1RZb6m
zswYgv8<JoxVNq@BXWyF8(!{D>ZLxZyzd_cUwm1@TVXI9p?Nz-+?5t>M7xcjy7x3q4
zEJRPn|8<b~0XG1iNtB<oV|4lELj}LSMUjklCE0TC0<f#!CvG!yEFi&dwlU1)v&l6>
zHP?>*@<QHRq)bJ=WnkwDWXtuyz0ZWQ|8z3?uLCyVt`4rBeX+GgkiJ67><Oi^GL1^R
z)#1&PQVQWeo{|Y2zi_K?E?p!-_N>A|3uES;V(XFZrd4lp+`0982!rMlh1cC#{+@o{
zE-G%20GL7K^uL6b|24_*mOY2M%sU7~SAUb42hr!WGKR9I7Eh&V=?=f`iZ!gbo(Xww
zA_y{AbYB^fThW&~_tH%Nz;(GYO15FNd6)J}enp0a(KxJVK8H|m)=sT?g1&wXARM76
zvA>Qne_GV^?dCnYs=8L{K}J?yoVq@Jo%U+h3o=<I4_Byd^i8+8yd!S3)f8iUDRkje
z#$;B&<bBG3OrZ(uLm}0kF=lA*gTkBu@7r=r@h8d!00s@h^;5F7=S$>6`xj>q%Y_8>
zT)ftBWPN_gbr^B}om$4wKCw%oq@K}Fw^~B%m!@}7bFXBh0@RwbfX`nVE%y4USuwVo
zcT^U7Tif$zzI2`<)I}>*!H1oCxpMrbh{Rjj^?RZfv?Tg&q!k6vyR}}pw^BA}YUva3
z$X%FXl^-qZ|A&+R$!1_z@7QGCQKp>d9_tr&-Ofv=ypqy=F7LkoX;#_Sw8VGWxs~Vo
z`bz6***0socR=L<pjEh4vh(MLzS+E^O4;pbNN{85ZEu|n?gP&nJTQFGo5~P(j|yzd
z{sQt>YE|w}S=+o@cvQ^%f!opRWp5ud^QQW{CdY8DHefs{l)<i_f$!U1)ugwOF+r9R
z;%cn%fl$HjcQ2S-4Jrp?WrMHy1$ZZY9l8PYJ_$Ua|ET!ft^eie-z{htzAk5s+FhP^
znD?x7OoaubVIxmP*HM*@M6@|Ie%%U6mjM2C%uhXOeS*GtJ$vHw8saa$1drI!+RPM)
z#&R(eJ!7Z(KNs+Om9;Io=IK`h&)TtC@xU|gl<ss`aE0>!<LN|6i4z4+Kd&LYtaQ;S
zRr`Pbh_yPsR@Cy+tyAY+Mm_zcwf#RnsIZhPUC4@5@=en4yC!BxftHGFYs;k;YW!4A
z`C3}@bRZhT*i{?rnR0sMjkW51^G<V}xjS2HX;4SE<)dXkzg}x|uJv!Blt##nKW2K@
zV}oXa6E&*<C-P<lcT4?*SBD22EKqcFwGNw@ylvIqr#5$0Jk71Kf4Q<MZH*xOEgJ}?
zoJ<pTd8(0H@%hc?HRb;{^SNcm(~_IJ_cQK-U9ZzyrV-j6<^QU9x=cGpM(6!0YHqA$
zun9@BvMHtV$eK(BI%Qb~{u)qd_FwJXJuV!3Bdq-rIkK#Xd`)1WF&sf4UL|s8BO_A$
zd|BUI=cq+1|9ofnut*olpoI}s>lgIzh3F4C;k^(Hj2K?$?cz7S@jrvh4+g;XV7R^|
z{CcjNUO&0&l~1VBFms_l%P9B0{~LbI=sX-xG)rm-m6$Ik1}ugh0DsS7q_omqWk!|3
zJ3BIIiXO1)wCFsT|NY3s_jGEWS@X24&?lr6J~r*&GIZcvMcd9G=BEsz=23&T{;eKs
za>|3;#&`bL0ijHWJ`M15xxEr_X7K9}!=Co}N_R^i2jw!3=L~WU)@*$qKpGzX{G<)c
zcj?h;R4zrY{}YVyQan_Ccn!vHVw#(zuhY#tKL2%KhzHo29}@*1h7O0TMW+mNyqrE?
zu6;a^SEwJ<ZOr#J0qbi8gU<c&XK4O>!}5Hf2M%?G7EB$h?Ejn<beS>Ia?yjVJvXe8
z(3t(CWyVO0>mm_*gH?Hzt+hT6*Bj048*~Y!@g)dda@M@A2Th#e5o&D0-TFXr4LQ1D
z@td(fFP!<&e>q%3<uAUB@lQK+sR`xEfO~YqZV&pe3QxGfwjB)J&d`FK44tzD^LW#3
zR)kzz4<DzIes2MDCCKcaELe7HS|Id+c2lqR_?AK+?uSC_fZ3sf%K++K6K9z*pOLu@
zE0>;cxxwDmz?%6QBF|(6mrg}r79CUi6LzT)-sJz18{e9BnshSvNdwh$d}xI#UByl3
zIq!0lI@Re%%m6JAEfX{LtpBj}4`fyFS-3@CEB1&2Gb)T3j6<%^d+KgG+gGW_b7Cn}
z^4NU}vTpgk+`Nsymi1Q5CpeyqeF%KaAW0@DjO0Ch(kYBL9%)@6SG6c77VUr5k65@;
z_Xyh!*3JEV4PKU~^P+&w>@Z8vckTbgt?a@_PhYK_(w8$N+$%q<??Wr#?xGx7%C(-u
z^RMdkVx+jJKL_k?h)5$Qf0)xc8A4S|oX2__p>5U|ZYyodzWiXbdDW?jT+DwLO2RGd
zsy2O7!niTgq((o5VT@8k4Fg^dog`={QA4>4u2L60)Iy!IC;EuKHSPe&gv^))5es#$
zWSh4KUo6VWlp*|@Z&{kQqwkFZ_Wxq*tHYw|y0?d+yQN!T=<WtZ5v02tLAslvJ5*8{
zq#LBWMY_AYyXzYted7IH-=A|`XJ*b>d&Rx(b?<!$e28h|jwTnM8y|jJ(-m7hEOzI`
z9SRe>95<iMup69>gm?B_?c)V?cc+8;xt>ZM<EW_8mhKmi>`ayp30bV36g|$wv&nqw
z9@l$PBP)GMeD=L^h)&0+rW^H`qLrJ<t_O{t)=w09<6UsHuqzmKC;2ZfO`g}SQrA5U
z{u3MEwpbuKr*#<7oJo!KZ5iZj;<NH^_bC<Iq&n`&U-Iymu^=VS3el`j_%B`(3b>C`
zS9dx&^i{PU&8nrm{ymGK`F3CA>&5>$6jc8S$+4H;=rF`XK#jX~h8|X}MDAdm4ptn!
z=4n}|Hr_9nJ(fMcAh^Umr`$q{?t7R-Jei$A7h0r*K$%T~Uy-C+mwvTttp$qDV(IPJ
z($ipwRilUVHC@%>tdQA(#4=N$>LH8vL%m9lQPYXV86oM>;$%w|PS7Lk64@~`cKX<3
zcU5v@tY7YsmdEp9rQ%Lpw4kMot_56A@8grt)euZe;o8If@L&r-N|c=OGa@KlRca4T
zGg(SUOe<urwPuptjv+s8#PIbpJ$BKHiLb9p469AoqG}jRkeE5YH24CgijneT5I@FH
zBN{Is-4AT)`q%Tl6a{LeZF?T2I+zM@EIWMh7*1IhRLX7%a#}ztk=M@=5-&1j_+B`g
zdqad1l3A}$+rr?PX**y9YC0ou$M3HYuf$+`g^vA0;c?(0{VYfEEldailN|U8jDP_}
zDf;Wf39Y<8uozWi>Bj$v#gngd<t0~;M^peyRL7A77)E_@TT?+g^q=pr_*Op<X|O=)
z4%|f-chu#AYCFWd-ugPg3X70a{r6}8`i~1<Ag*DKcY5|pSnJnIVx&Ne&8>;7%Oz{w
zac39kS8v64-d!xbq#2Z{@MG`}ybBc1#NIE;dKtfc!;gf#f+_ep8yq$aUN#bXG**0i
z?W*089^pNcAsXRDi_xj<Ge0aWI|=$x=r^p-g;Wc-XMXm=7u)J4kX7S$+lC<*(L$Z8
z_s+HDHr6dbwTJ4v;>U3{n(RDg73+7M&DAm;^6ULU!R8=dX<hIk>QsegzNDP4uGpM`
zs>JNS&->d(JFqaeh#hBT?9C*pN6V|M97sI~HceM@d5NX(AvV&vzmwnp$F9Eq2nXQi
zkuE6qF9|zHaNzTnLrV)+^t1Po;!yGYYC)@PL-O%$tOqZP6w^bw2+e{#$XEj^zI4sV
z^5dyU|DW{{J$@-qTk|&iV9eX6ze@UErH{q2-IE15{bUd0gEmbF?65?8T+7VkxFQ&8
z>u2{gA&zz^{uG%6=c?QBw_01bFhLz?)-MI=owm_#kwUE~`eho#=kHLKhEID2f)Jl*
zZDQl(75qDxT*od)P{3YWt$T}tV09<ys{>vf{yue~2(E-8i0Be+^;CZoQ+;(TD3mOd
zyzrg(F8#(;OGUd%^rh=S&_kQ`o94#RY}IjMFv*#MgUFfiClT_>@-tTa&E)zeg}f~H
zf2{Mz>vnQswB>UcV3~1Qob!&gca{;CAGPvH+t=%T_`4m^fiK(kNT)+JrT&;BLk&BB
z)k)O?_IzA#`knyo8{Wi+T64)t>~|(#U_cwm8M;sr!JjY|QWgw6BC}+UxB&B$dWy&4
zZ-%vs#Si5w&^ejEs73S=%snLR!G-QC&*TX59~3KN{%<Id1Dk!p$&))42RqiM^h#J6
zvCW?z(3_TR!*efs?5wbD3jieeL8RFOu(MyX<ZZj3<mCF{5k5l?i283*w?z5ZtH+>i
z0cgwhI#BVij6@7Y3{>?mr+0!Cp$adjP4=0ctkrQnna?WpB=e}XTX7QTmA8}9jaL+X
z&-OQ16n2%1YQL@#sOv)f@{Kt&0R~CPA>y>rOo0jD)7`*BsH+6wH=_+e&>^?n%i&(<
z6{pom5YD;AmtrF>P}dv{S&U0Juq!FJ3+TH!foKtPwwV=!;AjK9Gz^~W0T9{=8mEh9
zW(sOVe_3pln{fZ_*ndSSF*4BC5PsS4M22w8Anut!LPL8ipBg_J+uZNPeV&rcH95wJ
z&{wIFm;b~!8cn%$Y(I^Ytql4i-c3jwK(Hs_>>XS=mqUU@ml3VU^hUVmZmj#oDfJy9
zS1U0*;h?v!t{?Q#QqoY)iR7sW4HQo3JTk1XXnyR+ji0iXy`$Fw7JuE#-&qOrA)^Rm
zrc4B85Wt-@f96dRO`6r!x`wy*_*&XmuK%i`P8w#QO8SBtS4R+^JWc4+1rVteu86>b
zd+Iv%q|l^&-ILf*GSL;9ReZ4D6)W()|H@t6{8VPJ0m)Z_s191%kBUyP)!2ZaZq^T#
zfx4@>NU2d1K$Djjh*i?mPao!B)u2%f%pIN~Cw@ywnlL8$r4<iA-$onU-&Nt<z@~jm
zUpX`M_nf^rSf((e{aB9ea^h{r(As(;ixa|?WT8NvYOf@SxswHh;{Icuw<UAxOa<SX
z%LmJ)XXTmB>El3k)d)wrM$Fv@Lp&=1uJC`dF*#5tMijizUTd)(6u7I9lby$xr}Pf8
z0u%pt1x`M!QR-#_WT!ffW8mu2^cbm5>hfkqR-kpP0K#+&V29B8qs6>XJu#AAC^rT3
z1$@bZw_9UY2_G5WB)Rr8sseD`txgN=hU!P1Ea!%Hd07$B+lU9IqEi{ZR4kBtXFo9Y
z<_If)KNx29jrh;){i+<ky3!eFHRnu7D_T#r8X6$ueC71_I?cp(iB}Z6RNjdK2Gs5=
zlLdG2C}0fkh)DgV$}BG@$53q$xq}oKzfM~_5Ot4XIEc*df;HzwgwKS%0c>CT3{g?`
zcMSZ(H!EJ6WKO0FU3A=n;c^>0t6|Ybd8z(!MF89nX#AnVA6nxUL3Wu9GAm;gO)zCI
znM?9tY&LN&ix@F2Ys-Hp6$0%CpKbR5P^Pa>+_;m)<xHMt>4Ff5)qw8_6S_uu(+pL2
zYVk#rG~=tj5ogH8wbj6;<!-typ>E8SOTK8I##h~Ul6oFrYNd_BH2zPWBnOsYAz)C`
zX(6-qt$g(JiHB4HN$F?Tuw7l4x8TX<QoQQ;-*$56^BvAq=d=ZvU2;!?8Cf?_;ap&3
z`@3)U2dObKPwSHB32(Vd{ZY$VX`|KVTV$;Cf(cc!fw1L<b`(Wi0JXMynqsflT7xRV
z(i)bgfX0DyI^+|mw$(Y|$4?A+uv&4I7{cAY{k~_@mJMPCiFI%Fn9I3?N+$6?WB$pm
ze}~eD5n`Pkf9T~zBfKuhuu)yYbJ0z8&d%yPE}0TN$Q}v*OHon}RM`W49-PJaNFb#b
z_w%TX9N&`j?$H;jA1Hq6LE_YP2_oO|_0#(p+|@GWBZbr+UfEaCuEh)4@3wu<hywvI
z^Ag9|FauTBg~S=WXao|VVTvfEI#FoedTM+*jB6nIadhGB*YLu+ql+JuRKxT_U&J#u
zTl&F4?M!?+54*TGBX2PYN$A5We`qV-(=7<HQqn*t&njm2x`_hK+<4pj>s;*RcDryI
z-DKTnVl6+ra)6$dI-Q5o69?(@lYR<+Nn}&3%)e#RcS(j^{%Ayt7YpO2uw4zg3Ruk6
zK~$jia?GHX0%OEV^BpSnKZ#8L(~Z0bj1-f4Q`}9#_VwKI=6q#X?b`#%V1XOm*?_}g
zd!|j?8S86hmm-PvQfgkguY4{C%{d?6M|L^d1xV}{w{z;>lF(Bf%W3cec6Ii$HVL4O
zLl8UZ2ABgm645g+EFksOS0rD$OQ|6G0A_~rN$>7(msj4Vrf)FUq(tX=#){CjSu$Gx
zq~IaMNMy!TwX4Cqu^{k&{O;9?I##}XQP;g|2W6erR9UzJnEFM%u4dphpKGgvNN+tg
zGj^BXsjv~ah5cHy2`Yjo8gFw&5y$IhcI(YTy|$e6Ztv%r*Bw{Sq4mS-DcPX76vthq
zL1|_*u2z@+BD+4(cn*_YS8aBTF+7p5j_>%VO1BPMn7WA35Eie^m39J!jbnDv4kW6O
zJuaKN;#tXlTDrf?D`v*RI9g`0?HMZ$!ygN_^rMNqKK_OF)?EYYdCXKvpY#;X`wDo4
z72s^P8Fh9rb#vST)Pmj4kRT_SJ+g!0s??5DgEbN>vH$hj3_J{5D0f2K8)B&g28Pb<
zS4@L0&v+JV`o{gQ_>Y`_1==Aq0axOmK%?@Fw1l@Y2BgQ?uwMoP%8O5w%zavgiV`Gn
zw~m=rJNI#-@;V<nrn|gRijXY6&)?Jv+af+Wt^!=Tbh__A#PV{;>^l-ahg(!kMgmR)
zJ--Ozu(bLug)E+gd81vA=Cs`KPtoBGX{**64@Ut*jL>O?A`S2bq#~>~;T`)43n%!m
z2#{(blHwdX89gzm@V*j9)z3h!s`Wqg9?s0Cy{q}^6%YwB(AW}#K0!^6=f@2+pC3pc
zKTeCzbkBcS!tsph`e+g<Y+aIr{U0Oz%CPJ2GDs8l=1Nbl*&wvOMhHcEy2CKNcHvN?
zvYS?9xbk{;l52XKtLcJ+5<nKCb&bcKIKLNy%_>XAgm{EvE+-XG$W@<IhZXQO+p$Tn
zH@Bg908oN{Z|r%kx7OO(VTOex7P-!$aDKv8iCMf2b#QQQKi`yF-|nOhu^XDLR{ZK3
zILC4@76X5T-l)2t&m|0uG!<dt0bMACPxNEyW6KQRy+(Ty`HW7z?0<c%a#H`J_nG66
z3lEtdp&o}^{u=Y~#Z7{JIL%VXI=P8yy@tiZI3x-GTiu&eMKfmFYw^vHG8vg2Ca`3t
zeJSMywzFf8#C|m$>--7~zd?9L0=7B~S*694H$s1vss!rR@tHVdS#V+RjiPSo;o?u1
zsvZePhq;At;{X8fd<3R5_<rG=k_^a&`2<bxqXnCfM~S9qCUTCMr2SG6FZq%2<3@C@
z4w$3lzCC&~5Q$v5YrXg*j^XaDl-%ap7`d(2g4O<s7#MMVH5h;9V$gG$=|zv(x@f-d
zGWBu{f_%wz(zAJ9m5X?j+z0CEA8}ZLst;_pW`W`i7*)k17De9WH%BUfO@5ZA)Kp%M
zDdq5=e1nJiM)UnO+Wyl1iDsMY>QVovoL;~It#w%T%d<UhirBTX-w@b&WF`i75H{Di
z;v!+qRgzEXp>-{fO3WTu^5G{eeQ~HFBS6OhN?e9!m5jwMO(MXAfnJoH@b8Jx45o+r
z;SwNvX*QB^vQ7#?2{9)RXk6#5bdf!B6>sa5@HmGK@GAu$&K2Q>tFbP0nSOf$iY&L>
zOIWsgijKTWS@+ts=%%^k(oM&@tV<d`;Xft%;-iaJ9^RUGS1Wah2fn+*9Dsj`5&?@3
z@{V&{=L2M4RNa=X2UPj)X<ol~-b(S0>vuU{0`(00^R2MWQxLuWk6&j@V<N-|+^8Nd
z9I^@(I&#Wxkna=x3Tci~s!P8=Ar0~^*=cJY1$Zhp`A$}J9viFS89qV_k4)D11O^}J
zjjfG4fwqzvhd&#jv{Z$!1fR)sw5#>{bRU52l1g!`QBsSgvXRvzp;s7-PQ(QC^A`)v
zqkN9^`Pxu4qtnV}Y$h1j)6o{FwXqVK^b?YcV3^}FNyGd5+H%A<2pnjKK2rOH=W8}k
z;;f5qn?5}w*|5OO(53sef^Y<9sZq2xgt8=-@(KBd)Tch8?iI8UWjqGu^~iPmK9{Wd
zUhwgXYLz<P&94IpohCnrZ61m!CF!oZJ4jw9_bon=D#}ud{#&P@D8a3H&DeMmYPQu{
z&GBj2N0_89CrH+=tEAy`VM}0k%JPi$9BA9#UcDMSf4$R(IE0&mIh!U<9)NB}g@Uxd
zL1zCPAkMh#WHT$Wtav>?;UDO-?)3qBIme|izCLQTLa<zUQTU3iVjv?!e(CF35i!nL
zIi7u5TJ&}BRm=C>dZGK3*4^W^$Mn&vejC}YoZVYfoq)|B{Gni%wF&Rj77YX21<^%Z
zpO2(x2~|oq9s5c{4(ka_U8m3tC_+2@)stTloap5e{Ye*YNXEyeSD&e$94CtGuAV32
z?lc*Rf82+H1ocFC%U|YyDK8g`OCH+=++V^As(p7-eMkFG1r8BCroaHX;8?$RKC($9
zfX&Wv;LmYn0g5da)lX>tZLsF>Hp9nHg{!<VB9kS)x7S{-bE23br}rAwHPT0Mc`Auh
z0g}QLFEV=RcvKyzT5%{*k3cacli_r!79TgJ<yinPwl+>JJaP+4(yfiRFi7&Da*O(3
zeQOg0roJLEHnOv}ivjR{LgnOMeo+;!(f)OqFpoQPUKRYD5429`X#^(ZBV1FH;v8|x
zNd(!Y5$uT@sI&=!DPrOsdC)T86W@++XPn74itWL0y<HWhc@VkbFr6zCiT#?H2YS4|
z=smLq>%{goQ<ubE4TUgcA9u8})r))4i!j!tRa-|ilCbC}qkm|@*7Sn=_-(r>CVabO
zrN~hDZMUz3DnY9cu4;rG82WqLsS`f*_r&`c)sFN5H(t`BY1&S8XpAoS6W@cF*Pu`T
z$*vIi;HUvaJ<tNN@rsE*>j<|@$nxHBK6XmYXpgqgyh&8*$D&<%g?65NYCb;F8AqIk
znK!T3;$@bptrIB<BM?bz(R5_U|B$u{gMU6faC`Ms6v*}CETFk;EdbxCHHCvjKj4W&
zusv#VEUbkZ)+H2AF+B-ZzNmHshv~!U4+-L_B&($4X=t2IvWttjtCq{$i0>$1P#su~
zJs338|HTu#Bqn{XSfLGWe&$6W#e5IXyu6WT<~HvrcG<gJYYtxF0P*6xRNJ<WCIbOK
z0D)o0R%T2Xb+-;DENxUl{|qkHGg%U5{V5zn+oMF1bns8ssA(1L4BC(Td=GkVWr6nz
zpiLVZ=8n^FZ{88x3Lp0Fe=E<cUug(~+Ed!McD_U~7}5XBh5C_{%_&Urbmkr=iF1AP
z6qUO~>OT-#0^tB_a}24_y8pI_kpq-?iWI8pe2ji8qUOapRE!Hvj&VNt-b>-gXh@t<
z?^Zf%2rSR&f-Up8S23j8|2XbF@ygJ~wj*CnRbhr?BkGNY2x(slZ9(w4jl2SiK6tMe
z$m&>GyOCGf0}xcZsVT;Ks)q~xfY5eBi?>XZ&6n#DXx5UHJ#QlUknRbc_|%;#BJ;js
zi`cP#*t^6|@F9PYXQZKDkmUvb#YoQ5dpB&d4X~Q)<P_pvB1ia8x9^mgt^X4fn&osj
zi3J1nOrD4!`S-D+HIW~k!>DEQ_xu^{V(f((&s?Af_HEObtI*l3cI-rXCs06%OqDDb
zfNPXb5!DJjJ*EvvmUH*$jRPA=pygG=XD24$$M1_@=N;i{wa(cN|Lr?;(g9xSE=OFD
zEi!f9fvq0<%7~U)BUkl)kWP{;{2E=c&L(NT$<b||n(mDpO}SPJ!mvL4b;#?|1XaFk
zPQ^$(v$e-1TL-z(htotk*d5XBgWyy}suo2(iwjE=`_FiFCoS3~M(L}!{Bv0_=^lOj
zI2otAdoFN7zL@}!vh#&TgQ%D8u)(EnCJj>IhUozdQeoX2-aufH;US;w!e{?X++u@<
zVIyU;OGq9~<G)Y=e-Nhgv7kfBn}Id06gNoMAv_1kYDRm{IU_V~>@m5u()XSM78~GC
zdHGoG`z(k><g>f!9ze1){o!NR9xTrQ`RhMGMGO1G#C@z6+7&8xfzpdRm+o#qByna~
z{6XC)_m4EW$>c9yvc24&g7Ir`+Alc$8{xZyd~@!M@?St0i0k7GTQ9!~=yEeT0wZyh
zqqhtAZC(%jv*_r@gKoZu<3r-bwf)ngOuZ2b>S*VSFZtC%R66%#iR5@%Pq-?`ItvpC
zce(4vJAer=u1#!%Q4OCjRUkhbWTk<X(j$S|7G9?;9#;wd^%4oj(`>nrz1*+HFXFxY
zFu<GqE>zLntXxDVtzshCs-8y>gn(Uq@U7xzrR$;cLNXK)PLEsg-8rf&S`dU_>4axz
zm(p#*hkbum@cm7Ii>Pz+xfK1(-c7WH?Uqn+P_(JO@FFEkups_4WhqIp7-d*DV;eR0
zWk43$JpAi}H;l7@fFh-{Ub#-BI%zN3jL$c>vG;Y&zBj7*8?A=+(nWs5ue}pf2^x*(
ziU<=}!8oVk@ku*>F#q2~qN4)PS~Xii1bmH0EnS1W(2j^*!Zk?^ods>TBB&`F^{(Cb
z9J`Apth6@*>PDJhyjs<7&7H~@yQ_MQCJOt5Yv{ap+SP8fc74f0txA}9HR18K^Uu6I
z6MP)jc9in#$Tt>F<%?7>X9#WbtW|oY1NX-PSJN-r{7n#+reE*(+n3=-OD~*0y0^B4
zzFMN*Nwqj*veKh!rJIr8(fIr+$B6F&V73;kR8WeyY+U#{uIm){G^$s^iECNEC8Yfs
zxtwX1s(cNe=9QbE(%@;$bDPuqV5z0`)HGK3ljm2#ME=yJaez<!6nl(TLR#-lub$<<
z@k_{sbX3_*8>&h6rg9~szmUM0Vi}6Tj`26s?wCIIm-SC8pP@7o3XKj@LJP~&`MY*P
zJniBIkn03h&!(waXLEU99@|VZ-k=HbE7r@$ak^X&thT(Af0+(ZH41rVdLe0q(jRmf
z)jjnpbV@_mtK2#y7`nIiXyed=9mV&#I`FC?xN)<=_1{@0MJ^C4o@nAL-X$}Z3$Ldi
zhZr?on)Kx~02yZgLg&idR!6Vd#Q`Z<lKJFLArVZUDNXQQ?7*P}>#smTvR7%EMTG=y
zXrJOP%~IUv@{8h?8Lx*AiS$W3e0ZKniR%sylHga*aNHW;^l7M`k8f|<By~w><5y7(
zQcf`hhg-eg5Mg)S=e#cKg>tMjo{KFSmB9VBZMuNS8ur}0lMeb$h|=n-ffA^;rTnGs
z)UwHGScPrPY;QYpm`8%hp~j+du%FFGuA+$88i&ZSNCWiw{}fU_ASpUGR`TXb!PGV#
zx9p?IPY$IK38u!S1<2c|@sY|b`PeCKS)qblP6)|W2V-3jo0i8QtSuM3+0B!8k|)nL
zD$daXSNLk&F)hhizQT&)H5{zAg|EBvO0U{d8Z(xHEA<?0ll|(W9$+b0a(5$3%C!}r
zC2hbPQDj*-dc=RQ#5P#mATo)-<y!xtX@6wkL^?HDnW2^@@uO|1IETP7&n8F^yZ+9~
zQcbN|QpNgbxtDOXN%GD6&F6$b>}X}#A*C;0`KH2{#+WA|r$J7#?I_2DO5uaA&)MAV
zk8rmE5yo;6K2W`wLFXQ~65)rBw@H?iy+OPSy)Md-lN5T8&Qc(6pXBATp^IvC6^WL;
z1*QGRLPBw(sLKd?Qrk;^(Upf?E|sLOAuXKHp1q>k?_kYn#J;K3hMgod>oVmo_ZQ;5
zz2m~j(g48oE~gYq;P7fnQ>bztXQNSfxzm1(%*O?gFm<`7whkAew&iIMDGRw+GQVUJ
z)(IvA_nk2k^sIy&8a_%Os0?*WthN?Buk84Sdts)wj`1yvax=NOPgJBQg)^HuvDVmt
z8|Dx~{TN+2r;#ctZ|VZZX?bg>>8O8o#}%OOig@)t4MWVZpv7oriOe`xW+Me+q>^79
zg8yaJxNBCArZE7<7G?h95<G&$#G%7+8gNBGuYb*YDRCBjxSYumzZ4B!ZmrAk(Xn6K
z@jndIFFK>)g|FtZqM>XFl9G{W+GhX;lAXmj`=75*U#;B3XisJ1NTM3K40>UZ)!oF0
z-FugAW$PRS=J-fJ$-~~OZxn2~F6&}Xlv_mkQMkSq3I3FP6M^4oY)armd#T59@+8f9
z+C!*ky<HxCV{io1BxlNKG`Gwl52EqzKjH+4*@Fo}45asazYKJvkVgp#ZkjjwpsCRQ
zyxC=P;3DJc<5%T?KSthM@AK(#j@zM~@bo|IywRLZIaWXBa!o*ch!>%%EGJ2xuu&|P
z^lr(aOr^0Vu@dl-9=fT!oWx#`LbeqY)l2fQa;j=$8X9n6sl#3|J~uo{hKvmI&JMc@
z#MUIe1-L0^P)s@}M{BlD`rqWk*wMT|_RVyE#W1}{E5l%96Ydk<S(g6pzqqUsJKqG<
z*0Fpl`@&SR{ljkTyVJ!uQF#|D-s<E(9w#bCDc@~TKM!a+)a)$AK*F*$wPfHnd4R!=
zqJsj(is`9XeU%G|vqocl&x@xk?`+<|u0ye~&{`)K0(+bPr~u#zm%=b-P4B2;Lzy=(
z8l0}w1vJIq>9w7TwFUCmp=!?M2V@HV0LZ8%B)yF)0G{mputFgl^5;d0j7Q`2MbQtR
z?IuJ;)2`f`poFr8T2?)=XD2SlvT>EiFvqeVPBVj*{xXT1vjpQMA7s=-)Dc{T33s1O
zhX0J;OO3uEaBHKDS}rY;UYXG}>#vrvOGM&?Dyruu+jDD;GV9h>p$(J5jn^A_dIv#1
zzL~>yHhD%JqS&2xM8p}v^AoA+*W5Q83qs4SNBlL-mdG{dTIa={4|MzH$Y!!e-t7NL
z_z<&>;37^WSy@!sS!;lZXe1+gFC;eXnBUVum>2S6+N_f*4w>iCflk}1uT<89Riys(
zc}HX(7$jxzH6)A-tL<?+yKi4t;mee&ygndetnF=zKl7v)^<PTRu0g`)ZHZqmLPg%D
zeG}d334L3JC)a!b!`6$QjAb4)@vic?g-cc>r5VtFJRyf?vgY-XZ#Yt~raTU~ly85P
zK4Bs-s>FUzWAMnRoi(yRF^-DlKEulA$Cn^57t4R4ZGz+eVm!R7f5Cmjz10<VNiSQT
zDbH@~)AZ4K2wq1}-t1L2E8TUMd5uWtCW4?Gg6=M62X>iU7i%J_s)Q*p>acFB;bqWZ
zPjll|Bj|8;eVN21kmNX^7@uwS8>rEEt14#lU86ig;u*_>C)$eIU{kyi90>@>74&Zn
z1Yqv0!i+%2f0W2|poL7th2lNNqR;vg&-p-`GuXEE&CUmM@6G<5hk3$0L6PlPK%ZEI
zZB2II%@g0nV`<0SYlXC5=wp5m<;tsdu}Pk#B4RsV$YCk>!skb<@VwN)xs<@Ahr;`o
zY)MJbQz(I4QBx!|5^#GY<0+e9q|T(E9@MR=85=sm40|$iL!^#FgC_r|4)~a~|06n~
zLe8#U0K6TZaL!(QJ!QWC(`^0iil~NCIoDLHZu)kvYdk?hYCu{_lON@CiVjuor?{`p
zGi91B8aC{tnQ`yFzgGf%uUxgON!>F4)P8af3t?!5y1@6WcER_;IEI&_*sHyJygUtf
z>E4kwS{z`Gy2Y>2=F$J;{hd3Qo%<$-(oyG2jDgpyZQd<oaXgF9#1dUL>s!UpJ9&^d
z8Fz@J8ius3V&p^MobZd8!ZD+xwtKa7+SSg2b$K_52@JL-nB(xt>Z9a$r@{j=%2I3c
z)KSb<WWy%?xJV8&<8DdPP@bgaDHfMZCcKGq>1T~74~>RW2b8O^jgO~XYYdk;l9^v0
z^4tnW+_COCPpPyP49peszWTeie&4y|HR#voKL1Iny(i}`Bx;8fVV45xrE+~4)I8`(
zI>Y^SAGu7XeIJEuR0aEi4kfNC?1&%&yJzC|0JxPd%T|J&aeO^hFVBeiyASj&m1(Q}
zcF7~i<-LdU9N!>=ft&`v^SBpR-+$`DhdA*_;Iy3xK3Z_K5cvBWN3t1FJgfo&mxF6^
z89lt$TX_z{zP1A{-#Qrd20Lc}=CJ(NuXSj%@I_@b^j_$D7CfysZhm}S0^>@!WEjVt
zE(#Tos-<3Ce2!4EwcX5o&0)5xvV!%n+YaR|^7L*}jr7yWyZJL~ooK9=C>qtXLy(EL
zYuosXBTZ?)aku<-YSi!=T49!yK(&?XzvlUMr7TTn)zh>X;qvQ3%+HNsCqJiK#Q0T~
ze%c};38!?g0$RVsozNx-r&yX<9FI0eC^3iqho<?Z0O<_|xVODq)?V>E8oHWT_cVRC
z8wx|Y0-qMj<?7Sm>O3)eJ+l4DoOR(KEwCfO_1slqD0T=>_g3==o2CcKy{F^6#>r=<
z>*IJ~!a7!94DNFqFOKyo?Iab-@x4lF!Fr44JX(<}RB_LP)`ie!l{TQ;UUrXZ71ZP>
zH!kwMBf}Y~%x@Ag$saFc+tkiz?C^)pw}<(G=}J3dXz!uYF93rxV<EK8CHf}u&!os;
zR_s!!tP8m_mek!yOKJ!qK+QsLP3@cYcL(t+M|L7itc>{+#X97P|26bZEbZgS=g2uq
z_LCFq`{zR6_17yp)l(Co6opjP?!~cI+)uCubKL-D1G92@R1#B(Za{fQ(S`Z+S8w%?
z9)aq}U0Q{TMo{S({Y1h&36)3@OR6<m+=e@lz3!ADF@U^$O!~Kcn-)<vV4?D`%nWHC
z2q!P|^YKK_n*eP_B_Dk@jt&1zWlb}sy@70LJ0T8QxiW4~O@nt`v6zb;U7u3fgt^m*
zcAio4wO$hnd`36lo@bs6j?WB?Hf2EI2kpDjAzKpkmE_UUO3LLiLIHA)({@*xN$bhE
zWwCzy|BF=k<G|N0=FRlJRzGiLC`>^qt|5-wpCL1y6vUeVY3Ko!@zT!w$t%rYl(+$a
zHoqub@%UAHj7Is52@cBZDrp0xPR*z<y&D)#S;oL$Xyb4fgtK?@Eh}7MWFkJU;97B6
zJp@Say*)+{3b?dvoDIA5G+SyNN4l!`p``kOOJH-$5l6S+Ub|`=TDgKH>s-w{p>dET
zUDPt<!Rkmyj-*RYZc2@{sqAuTyGBESWe|gkwu%A9IjtQ_jbWka7VaF@TlnLt?}l1s
zf_RqozkGAEAAniN^~bV+fxGhVj(Icib=kp7#B%pdlRVkqSGAqejOc&Gzx(XxlT)h9
z)ltutFeHS%jRg$o$2hbfa_*uFX6SNf`B+k*0uZK{`UnasoC2hqTFGB7)Dw9AbR`Yz
z`!b*bUbl)0;?U|Q4TR2V5Pf2X4^&(8c_<aXi23Fme$7Vu3cF0W3*~}FH`^H@h2o>d
z?7b2c+^!yNJ*$tNw`xj%TA~<XOQ$X_QU!xuXvbN8Qpbv-8$?@`^qyk>EijXkRMv3E
zS4J<{%b~?#u*LVZ*_5qmEhSR%ANTxgeAdQBp&zI=k1KmHa#>m0^BgNiuXz>ydYGpn
zdMEt$M~J8y2CwfUmi0&-PWr>>cjVG55&vIA^QxCsL?S1XNfIuKP~77?IQI{q)vC<Z
zhjJuL@UK~1nhZS;L%AU1KNRw*g|#PWq*Ne8wgtNC?6b(^wOZ!_NDj(2o4;tYzWnU1
z3XRilWA@lUOzXEB?68&9TTEDC-lvwpKe<LmHY68CT1%{-n1t=(!-a;o)BD*zMo%Pe
zfJGtknHO>DL2FLP&~H$lo4Ee<UO;};cZX!@+2phI0lw7#KIq6IqASd{#!xc?gto&!
zN|B{oOa>s9H(Xz4PyJp~Q6B1;Ug7a-Vtu&_;n9hAWa3WcP^h*90z^^W-~zc2r}TUN
z(dg<!%zoM~_Yz`Q^};wb*KA2@lua?X6&}{vSM1x6LngmkwR^@{R;N_pWGKY#0&(O?
zN>;&~ve(tX;?RL^TQxl_Trm1O-Qm#aG=e}NOKf`}0T{PXenS846d#OX*YLz}5=jWj
zvG6_y^nT-NT*tPd?Fun>7;1yVoOm_Xzw^#tqrMu1iCwG88QyKA)y}cCL<s$j;fqGm
zCOT{9tM1mAlnwaZx@3rkr0>h>ZfhvAE;G~dzFq!i`VK6L*`39v=TZUkrjY5m>PZTI
zhi423Xp<K*Y;f35R*x(#ZMVaD8s5OOlWPBk8T+}0Znbh>g^eeqgaS*jDTH{|Wx<%H
z-n&W;qg31?^N=C5HOq1Q=FZItLRVt{W?asY$<;2pB1Gj}7rm7CGs!{t9mRjRdlzaj
zX}LBLd#V;01hV7r{(l#Q3XEL5`u#nax`3?0;%MOp*^H)=)c{_Vs!p1Jdw(aEqM;RK
z1h9-)p{$3ikM_){K%>}*(?%9ns_0eT0>MwVJ2T6!RAro?fY0f>BDq=H^cUdj#%j+e
z>Sz#+c;u*RjFaSOUD7Rls@ZFM;o7uj&*|+D`aJO=m?r31iIetr9o}-%A9s5ooQ`Tv
z9-9HnHh3g~jqFm9xb@4G>PpGlf!Do%({X<~RtLn1uO${$&cbgHh=d(prTn!hQ7=~v
z#rPAmZQm}6a3I4<NDNpeGMor&_3RlGQw=+M?jf;J$ux;QV*7MGItoYm@0>fZyz-s>
z^fkQkRhwW+NDj@*M|@g<TfA^F9DeC$>>GJ}oOapu(7U%E^X#oKLySY$ETjDRF_Akp
zF(>NZONcMpgY8ez9+In7h;m%`fJ-?djoOX*Vt?Oj1HkuJyAQ*h%5+_B(_1q}P9pyz
z02@L45`lDBQz|c?^w%s^XpLQw_RnKAV%#K_m^l%`(wKN;hThlm%fldH07N*jWfB9{
zO(83gt8I42b`dfuOA*TcUgqz!08}Bk!yJM;(tW7+i)?qHC!=BY-Drz&C0o1f%+a+i
zZTqqEv`NT(>WR_9N5-dO1`z61fSv-?GTd}v@oL09)EFl$-(P(U)iT9F?f8*k7jzMS
zg@n~>-znM4VsDi5FaHwAzaGxlNGfB<@=J2$If2Bjt#e3v)etOQAObNe6wV8henibT
z;}cYL`|Jk9BwFnlY~jxe>ce{Di&bSk)JSgSD2NbPT8m>8dn<MItsS&67Tryka>Jqg
zB(}_hO^za2z_w5?nB&L_b-c}~`94Zo>`7J?&c=*suw8kyizt;{;lYk}WvUK`rN*#K
zmr-{WKy-=Pt+od9)Bd*9y_5KQO9A@t%<D)8<g%o)lplI;G3JjP6mQVbL+d-oN0tMT
z-9ZG5OViPsa=!<DixF*EiRgUH9@`(Ja`H~)$7^i+D*Vu7Z)8JoH+8<2qf*3!8q_7<
zf|c}PkUK_3)%^>Mo%_I6o(gG3rLxiptjtyk-c5qmb?O@y!h?-8dJ{jljOO-<fuFWy
z9GF!}2AS>T+M(=e8G>=$Y$Boa^~u5oH)F%s{#VE<+KZJxt`#1>p1O}>(7A!I1qoM(
zy@`c<K~8Mi*0u$rTq3&#o)`Jwf<jiQK|elI&{&en;&ck!6~#kEa3a5G_46xI_fD?+
zVjz5XFX(<lA`gwUHu18F6pEjd<3aHmY5y|KPsdg0x+reVYaN24dqctP=9$F$WDjaB
zw1BXS61M5g6$?BMitXK704hYN*}bnynfA`hm{yi`(`WSBpoK<>sqkgY-~O;O31hn}
zcJq#(>O-uAeK0LAL<~1fE($D8vm+_M`yB`)-H01Sk(Iiud;XQLxZXYYtO=^7<RZpo
z4RNi`i9EXYqWC`Xeh8Arb`Mm4%*!$>@+XWV=F$yZJALV#iy{D8GOzVL++^U&Eu_is
zq=t%4pceJ13Yw=r1*-c4)pZ`?3Om=?RSgzUe()!?k;hqB<L=l`yJNIL3DPoP%H<0B
z`B0*|M=zPA*eeMB0snsz{0=_Q$<kw1byXXWSq$_Ryg3l87$W)x3t=-N8mlfg<3aj}
z142*Ir6<*3bc!9gJ0!#Wr6z?o)`<`WM6Y}zPKd?v-0Hp^iey-~p<$tSmd%SblF0Za
zXrLBlTlmB-gdV6u!}0h;P#5rdCbh<)!GY+i0R79e|7}?Jv4gd^+&Nddg5}82EhKy(
zcvloAL{yGOKAg0?<){6T!56};VA@~$!eMgTldAEwQK~)IKTqJx{B|Xsj;wG`;`_&r
z8#~H?%p$KvJ<UudHLi3F2Cup5Tr%#ngHt1!5U9E&x%f}Tk0sUtUY^a?K2JkG2`K!G
zsT({ab+AmG--3Gz-tRo&>Ic%}aKS-L2u2pUq#zWz-gB9~jtADM%ZvTUSuu;uvfq2E
z-~}Kgg`)QuRXQ`&Ui3?1{$60nUH_#&U&%+{EVNgmIcWhOvM~5ew$i6>r;@t?QP`h$
zaU>v*wvCAZW*rWw7M)UiQ!ylfKDkf<W4K{*fV8V^7N^-Oj0~MWDQTlTbPLKcLY!!K
zZh>{P@ppTOrUN8z3iWZ&y1^(bO_vEiI7}8`&C~$)M_o1-gV58Y2Z;{I)I|vXHqFv2
zK;RF$edH96g>QzJw!flC$RC12+<Tn9>He`Y9(B2_4JIm<e)EiFaWFM)BXDSZwTG^H
zzxzHw-6uxG%&UB{opJ<h;Ukdjn7TwYm)=_CAX$K+^D}w4l9DfdnjR2)n=xOx<vMpS
z?8|>&@M1Yy;4{^7#Fp<~LpKkJp{XVAJtaRGJn*{k^_q$IOS3HY|Fu?Il4$B|E;rG(
z<ArA&cLy^;`^nK8h{6WvH%-JVO8yFs#}0}7b2^&YfJ3e#&C*8rcSxNhCh?VQmW$&6
z+uU+cO0UT$Gzsav1S%)Woe3w7J1_nvHM<M^F+<(i!@uoP1*a5!>4iLuE5e2bt9vKr
z;6lNy@Z+jecvCy-AK>O_25dRjSjOu2_EPYr{2(NI!RG&}_(VD<gjDiZg*xz4boo=E
zK4Si*_Sss<?MkBU(}%(shD#RROexFDse1(oaLkmQl)$c?ZZ}(&W%}|SDMQ}M&wV{S
zr`(r^J_b;BCaB8<uzou<k^@JcfcI4sC3?#6Ms9Ez7Hwef^r%Xef4U~Qm7jqK{={N{
z2kBVr$KQ$k&!PomK+@xTHiuIB0MukY%AtR-XND7S^cPpYt|eDhaX)h(dYnm~AF7uB
z*a1oQ-=o1=)BXGdVc0W&Nu$zN@2Jn1RQl~-Walx|eprj-vYMWZ{)OFw*W*ua7PhnH
zeHh95<$3LxCErIit_{`ntCdfllC8R-JTRXI8G|jFhaTi=bxM1;S$R`G@gR@O1DHzy
zO`6_xBAqu#PizlrfY_wXa-8++{Apt}PUIcuqJ)boLaINHlOc_f?`w18zP=DkgQJW3
zzSEI}99RtjZo339e(sVy`G4}}UnDhU)fX-#@I$f9Kqidf&h1Y}+F>=4*Rwg|llmL}
zov2uq{Do6gWoBgs>$EdhD{}h7jn{`HEmIRnU>N*vzrl98=8rx?mYY-l^Gc^SC+{LT
z@cW(jCBw;kTA&_qpHSm~`cx1V5T|HiQ{roS2wWvc>z9b1{?djaq_kuL2<wtb-K*q~
zxH><>x*1w%VyiL$GE3#NcRfnm0puS8g*adnSUC*@9~!m4LjwGtT__M2MvE3LS=w_0
zgvo^+I>?G3uR_Sct<Wj&P#m$}7e>L{B~}g8ClaDS_Zu2P?ah|%eD*&&Pgd3uV}dl*
z?Tu@ZNq|QRSnpMJBI+B=6D0R5u{2LLz`+ztw_tK`O83!3>Ns0%J04QmT%|2X?h1TA
z6kNf?gVtvN-V-ru9)cX*@=etChnOZz>uCVy9_RjB<aYzubm*%*X4&~E4L-%R&(}+y
zq+&HcJg~+UAu7FT<8M*D`^BXJdUgXpbvme9)xJrS)VY-ukMd&5TVv@N^%D*XYxJ-w
zZ|Wy7TaRp3Mz}@2FX$TCw<{KlOGp^1?A=USx~OFL1S})mC&VXYV#xB;6ZW=4&4>E7
zO?%2EMjgWs^f-9lc1Wdt7(R}*<)%5p5sct8bAM_Pq`Q#(ZI|8@QPg=5_lINi&PTar
zQ1AwtT$}6qZS+yPuc12*6qf#Fe!o=eZ;<a>Dh?3RJ3SWEBY0MgRH0Ij0pQ~~WmOp1
zRFUI$`s4%lGVpalBHqn58107mXs!o+g_zKhJ_1wUd<OSm&;W%<>!yIwTy%%(&%>?8
z*HHzlPCp}cTos<)%nYB-_kDCl?e)zpunrxKsu8rSF5Yw{`*c?F2F$CaS8%^(#~H26
ze5gLaLLvXU{|QAO;hw1qo4$4re)Y&7&eRKbuM4dj7v5TW0(%ynj+i`_=X9QpO<r;r
zf3^QL#h+aLrx1T2gNlX^6W?yyfs%HGRUb+M{V9hJh_0~QqpVVLlfPuv<r#7Cu_o?r
zXJf87TZwe_YGy`U>;NEdkZ}oCzNub0Ff#2^EWP*7Kp;g8n^`CC0R4$cCpY7(t`1h5
z52&2e?;8mMZM+7$PNf_wFyLd~R9ks<LAaTLo}0Hqgz%qhs@}|caqaYl+OX;c+$Om6
z=T=JMxE}f?D5ZCA&_D5Q40oQArI|jZx1SOpyT<tR8DX!-I~A*D!PD1kdNTs9_R+;?
zAzTnw0mEs#8+MTqmqR%dieAx4_bC^&p73go&{a3fMb|nJp+9w+{2JtCR?TQ`^X6&q
zeLU&hLF4P2NlLbcl<4s~=H0$%yDwG5e>ubs9L&i*UdzvpAVSaC<IopV--32lrSSWt
zz>~oD(Sj&DpPI8||Au3{vM|mk&6T0h#f;i{dl3#)rP`ip2rjZmr*yu8U(<7;fR6a#
zT)kzDpDgMc@Zm!we~8h<+}CE?SzxQDs~;x;jm#P@9kr-hduBfYICPLRQlJ@!H6#69
zbXVhio#)*vW;sD%5u9foEw9CCcLAFZF1uS1Ed-uy4`T}j`(M|!jt<Ak>g{_o;hF0w
z9t&R)pRAux3I+0t=0f&Y5IgkKd>CVr9%cLA#lL~n&nm8>x|Mr7c(TB=RmP(ggt_0$
zrf(-D#vlu40v$0r76oD^!(;f=xlRKNEIqC#*ZmKX`sv=A_`>;IGESHEx?WQ+IXgl$
z);aiABKaQ^@vwWsS<AqbHn{2D#h`V@y6^fmI+_@PYV=ayRp$7F-lA_zjXxqhiY^%4
zfZYIceNUV01JNB+Y7En}YDzgzC{ru`7SL6S)-sY}@gU_Quh3Nd2D^!xsP%1`;)!FU
zI)5SHF477Sx7E}HBe5EL^tYd>pf*FYJc|cG?o8Hxs;0dzZW5`<%Pp-6suu}gW1j;O
zE<R0&n*1rGR0tE5k(-dt{G&L!`opWkUmzv?7f6}9{I>MF-nH4$l&gDo74jje7C)1(
zFXPh~z0QDni<50!ej<0w#fGBRxCO|x{MG^m?;cfM?;27;8ZWM2*Mig;p5>h{p5wrv
z`VA*UbJJ5NRoeM5)~h(I5mq!vqnLrSEgc!J!`q*<J?k1VHjVG=M@0T0Gk_Nd)kUUu
z*Rf9J@QnLWU!TG~XB(0?INPWj4G}s=zZUe+^kWqk1->O^fa$BN5h_|tfl2|y5^>Av
zF58|srhUodWwNjF9=33=h~MBn@QSK*q*!FS{>C%pf3Xt2OOkEZ<Ws^o%WiFNy00<n
zAiIQC{15fHl+k#WrL&-aU(1UEgaQHfiU4=*(Pp}_tJ>wUtx0tVu#(L~oKc{6U`?wF
zNs|OiZ7LD0IvjFF`(|(ldA2D{UVOG~2;}uK2)#mc><Ng5?Kh8;dXL3B7C5gWIps`R
zQ&2HFnR(O<nlqqZXY`b3%Rr`h#rrT0sdXB_lpjZfuK76RA=lF3y2qZM;WX09@p?UK
z@-Gqv%F`P+!%rdxt$W>4!mE?krY<Yd98~gE*1S+$Lf&+zC*o>`PSClgS)FBWc-Sud
z;HfAJoz7KZv8kFv>&N&(m3vi>(@Q(~Yo5|p3uCP3?L2ymPsYUZ)%jLm+fY8*CB1s=
z_PfVLR;-$$ZIg6doAYK$^dE4RVT8D%_`Qeu^}d!xt;r$t3uO$y=i%g$=?(ow_P8hy
zdMHi5@;qIQJ-pvRMXUt$fI)hXBC7lI*^%iyMbmXjt4T+}Jq?YAiPGcLAcoYKSsxI&
zhJ9;D(Lfk1+<+e~Fj$8?Yyi(Gi|LaRDnO6X!S<Ft&#tze`uYM%DaS4&e41k=aiv;E
z-t;S{D3s;rL}2JhGpwWAnokqjeWu}3;JZ-291T!cM>!J_)*@MRe5T8#c|x)<(2V;9
z>Stewn1Tau7YR;2u^O@a{fZf?NhHe4HYGNj{C6^oKb*_q&}^GqU?_3Xx+(xh<EaDG
z;_4wa(pXQj%h|R;d5K?ak<kRX3L5<85Jz&k5}WO(i$IM2)2nU+&B4uSXTbRAjXV}1
zDmd~mJ#~Hpc<=iV#68a-=6MasR=>ng|0CtjP{YXgx3aA2C?yPWIlefyXC*2Q!XG|<
z#q%h8u`R*&3!nRTQe%{`jH}tUjmBTEM5-dGB|S4~gkRq~*@tufL{gTGmHGa;BKb8}
zW*NPJU6rI)cWZ2&a=HesNG*ldu&`XayyxZ}R&VcD87KSAmnk8p_q8J}B=|Y@$O}ak
z!NE1&YfTI3f`Q^7r_=^MvOsNnjBp*#9rn~uga%j_8d%^8lyhQ<Va1qFCwasdj<D(B
z59O*uAM^`}%I--k5Og(7&b{bFlA&3ZpVAh!0@mL^qy!%x`s?3Amrd^=l1%-SWBo1-
zzs0^UbvUj;7DpwgimelP=?)@YnPrYrU-Nhok=9~Z*1q4NF_zD1iGEC*{mCXH#P35O
zMfm5@tv0)ei(HRrp<UwfYFgbHPkldidAcC@c^pC3o3Hz!%2y;J=vheTxCL69)fIoy
z15BsiJeJCz5uN%E6=R3DgoD;~I=jejo3@PSe(yH+m8D!c6)k9rTfbJ7Muv^&vk!gL
zMZzGVceQq-<Udk`sO^+L=KnIiXLMWdoHa=1#By1hztEp;z`CwRpi0X@rCOf{Ts>z!
zcpBWu1D-|I=d9A2&)ILcuA7LfA!>8C;<?~lp9Rp^H$tU!ADWsy%tS^YI5GjHyWgo$
zogQ-)DY#jG*5Vwd)lT}*^A%&9vI39nPTR|;-(f5~ttUyxRzVT)#y6HUvg(8%BLXGH
zfR`1kw6~(s-j4j0WEIS~whs*QCNrGvQLK_O2>u5X3(nQEfwaeTF>du3Rh!?@M!RJK
zDB(SlzOSp2ss+xwIGZumw{TZEC!6}d(m+9fC6WF#PzP??b9=N?U7N3fZzY4^)hyX0
zO9l(PD;?FeG8>q>O4yLg^}zV}Whq-Vu=k6h^PXgsG;U>)j9$?6@`V92bW{+uwr6Y`
zLGoTw{&YlYX8v}Ie|`U;M`^8XMy0Cdr`O1=Sxo0>J1T{cz1C>*D!?H1p6#$!vh4%I
z$ir1UKOI7ft_apsxc}{_+(A&<V^UUAOGA<8$HoXm+%7QZ(RuiPk}x|k6C+QO9*p4j
zZtcbqXYL2fI1<ZcS&?#!$qQ+4`w3AZ^}~3hri}j#Q%kqAYA64#h&in4{^s^is`7VF
zf|$Kb8FijY6j2`ixhXxia{CVYEwq9srmTs!BX$BVod(5%R%pyj`HBuUQXQubUhHIf
zf`M)VMLQFgzBECXJUf`udHZjuk%|ZQ9nxzvy_xoJ@zJ9zC5h9`%_qxspU~r81r1{H
zFakn0IrM8p#l9n&iXrs|4g_Z;T_Rjb-39-crqY%`gWs^gC(VQ{!e)nGzrFkFfJghj
zGbR<jkFfsd7l&S466FHE56^9-Fk6vI_!h_B4&-7hzUEOMhIIkRYM5B*+ixjsFn9X;
zjXNGW9;8+CtNR#e$<fcHj)D_g(FKyzgDL*d4vuJ0IQpTeb6VHF$Ct;cpn(Yc==NjG
zXgZs>V$rlK_8uj@*RlL>VHzl)5_<VtR(<**&ilT@dhoKf>=5o`-MEpfm+WXMN9ho*
z+_GXZMY?Rt8Tw`ur4h=rdC7X{Hd_epq{k5A<GqJlDd~zKZ58C@a%$%Ld><r#t{3BU
z#;1Mpv`VbFvBwVG#S;Zq0n?YPFDnCVH~`%i&I2~K;BN|QOa<yoGSr`+$;Shq(eJ_>
zIDiDAU__($h=aR_ix7#G%&S$EM1GCMFcC*+<8Oq0snR}!HM?-owrYml%d`*Yw=h!3
zxr@w(I1QF+Ua`pME?nnKzPIrY%2WFHx<jM6Ve8u*RkGD)7#6=Z{MLPva%xykn)c-g
zUFFSJ73z6=oNd?z&r`Tfhvz-9mnKkf`d14KpFnN+!YS|u^_|s(r*U_R+2qo#3`@$q
z7<s}nPs5DEGH-Nr9O|mMw$c9*v<#cY*0HX$KM-HWZTBO?c&CvM1nh5lE;`vjq!14K
z!z<1og>!}eK^+<U5Us#@cB7<6mw4_sowE-H6<;^>)O>`f@4pJ1w&x|%<bHg14Exhi
zQBlda3%_tfO7I&;-yR^@)_Vg+8$@|yFa<h47?_q8TIV0>717%6{Q-fpkZr?ZD?}(I
ztUC@)F(XLaFTJDIcV(q9oE)Jgj1N8=7@c|`dXsQ@At6~!z<3MS6g6s8zd~0d7BcF4
zqvJy_IV`J-zSUbu*6u0V(Wbs(M#H?tyEGo14zE3L*Tr_GY>CGz5AM|&gfyt1$4|~x
zsar;1LGmBXh2?ZkPh79g?Lg|-@Phrd18@F}M{FHoYT5;#dU2vpxQ_k#o}9YBYAyJ<
zA|<p0;tQ#B=8fNYaD=DT;f-Jmy!N65zFHZ9p=`5$Qq@1$B3mA1>+Z1zeDSfaHC<H+
z=mmHSPg;Wpu$(1)CB83?P*?W?m|F&{0mAL-Z)&6WsDQx|EAP({Cq9_1PJ%1Fe|pc-
zB#pa)w)PLVe&HhuOEvWRtu^%V_zniLboy#xjt`+u)xwI8-H~ZUE6NTmXy0M0)a}fJ
zoHj`Yu~~)WuLDQbK?ryIh$uq1eVBJ@>vraRUT}g*(HwLdz-MXa`3{C<H#zQ3SBxnR
zaWr<{1&E4=`~Q*km0?kEUDreB0D{yErGRvIw<wKtcXzimNUL<02uL?b4J|ozH`3h=
z-{^fm@B6&h_Y;_FesK2Ld#%0K+DE|-Uf`r!93RzZsX5Iqsacb<e!tvw^TlT~nl{Q3
z(*^h<90EqicWCA5Uaz7I0q5)GGq7328hQo925*Y@MQ2r3Q}N<79>d)rU{fBZL5Nv)
zNF#!@Zn6~lA+M5#X7x%k5HEZ}SAj{&Ni2HPbk?wt9KU+nAH9AM?B3J)zEp)JB5mzD
zUyL_j^eyqX^`qMyKgkKx3&jHAYP@76)#93R+HRA_kTi!;o7UU6Wex=-Qvac9fAKJE
zk%F6d6rK5qofNjFardk7OxX|U{{p#4QSy(mw_hNqWED8y&6Z*ITJ=^E8to4B*62F9
z!<G&0`WJ0q8^3{>Gp-TCU^9?YjV%C?U-<skdw{^@$^^NNrI3sN?*Y=8(@qB6d1fXM
z3J7$&Q!L^+#oTj)cdb}~{At5q;ch<@aC#!yd#=2s;knuD{8z}^drA}k<=&S0N;-?p
zV_nd|x-V3$w4KBgC~OfhxKXY>J>>N5qxr-B#Hb6AF4VrqJ!F^dK-FiSnZD)h1+lHY
zf=ritvuXeq=6Mfy_@!?$a|d*o><ar9-XXT|-;24U34i00>kHFwW$wUXyp`Q{mDx>2
z{Dq(nZ7H8DArsFTngt4Zc-CKe?75im0G{xgb1@9!;B>nW!?KE<55qc<pwqg?m&NV~
zztr6kcrAjK0L$Jz_QSXH^B89C5^y0F=TJY1A4`BN756SMLmXVowL|loSJP73pdF{X
z1XQfwT_TQ8ACB;vh#7yYYU`m^8Zj0r&|0f+$un=4FRq_GP&S2n(y`%G#KpR|2PYT6
zY+@lnbgp=8$0jgmux1n(B-7e?xrMpCjGe?oFsvIc3n}hrL^>z8(P7q*U_$V$cjQww
zo*hRP=fqeAC@M5fNn!}Z51FA@#5QrAyhXxYB|i!L!3~IV+HJRvMQ}ymz^}C`WQwrp
zKo;qHA1_ZOz4kRWPy(%BzI61J1b)fp)iI?6(-uC6LS!GTRYAf|{?boO^)<%kPSuz$
zJtdp)I~4SVmHaw>-5;a04<z&zg^ZcWHwtXdCijB_P62z4!vzQvLF##qiXyB}Y%hIp
zLkz=ppr-?hWE`=CUi>Woa`M1fO3{^V>Avw4EiIm*&Ik;pjQUg1^Ut@HDBHq;*tUnb
ziXxx?Y3Kjx_DM?{vOLrHzbKh4&>!$|PxRM&^_n0|UV81zgUwZx3Tx)ZyXXt3M<W~1
z8GE5e>0Qs+?`C+3;)xZ7?qdX)i?5H&-tO=h)Gz|04_kchr1pj9vLm<qkrl_`^&l_w
zTDFlm-#<tBvw2wc8hXx*UprVgCk`QDz<+n<GnDyBd|>6oyV1+UybIBCB1lfIGvLyF
z7ZO3dT!5feU7yj{KS4!syx#ePTVVEe7jy1SJjk%(L1WowjS2U3^soD)rycwyF`G0r
zn}JIblOsj+-yWQ_6iikCzSlhgXMsfA!u-|Rp4;<p8EywpygEQy5Oh-);j@6Lp*)%+
zq|0sv<!sp0<`vHH!SvIzYQPe}KFbuAL-EgvASH9|=P-TMh|((C$_t1V-U0Gx_FNg?
z)4S`dz$JjbXTSY$ovvTv(nF|58%ci3(kRD(ZZq5sqVBv2zyJg*_MHb?46tU|jxM}I
zI@k9kpHPjmj2{l>A@vAig!1>fsCX%2!BtuD-X(_{O}0&Es1=de)Gd27BZH6qDzq)*
zv@M+veS;qv6((MRo(-tW4HJh}y5^W^IpkCH0qLef-)TB&BZR;4Fa3}!a|b`Pqolh9
zcVG2X1kelqkwo##*DxFp9yJ($Vb5shh1B|9xQ4DN62$T361mq)r6?U513XGfa#{M0
z^Xa^PYQr(gJSu6JwD|ncSvUBUT}BJ-Bg;SG-qx|dzee%HZ;%CEY&V+b8XR3ohZNya
z<7E-YI+g%?D$e~W;jT*54uVS#al?)a+NxgaHI+aOnR=V+bhw7$q|bqf_A}VvCNhf;
zY0|eG$n6eV2BbYCQOfZD2JPQ2B1|k4(0+cBOgmXZs?tDg8y!+Ro%V7d?EQUCOWjIo
z*=8Z{zra=yUU|>>rnsZPaIQ0GTbXs88wevX>H9@0)RlH{v0eRKzGhqs51SU9-CMu8
zuhVjRIgHAJ{4u@YVKO-TyM<HF6&4XFeM)E#P|i&eqO`G3%P1N`6;j?Ep>;sTRzE~p
znRbgn0<DK<U;v=cn8+w{?G!Iv(CtNaGQ`QBwnQPI+$^_T>#0`E8Sz#ZjHx8v5kHW-
z^XtY1Pw;t<Tll|!XK~|a*M=`(>=cw0ne@nZP(njMZD#TdDnL=YzA<Z+LLL&;%z8p}
z-Qu$+gfr%{j8bGH_d{|m$E$r+<i^WBU_4}LBK>~*USOUstA98Rg_YzeG(0W-KMK*<
z&!Rxvm*8lZH6_DT%4EZzSVLHH)wvcf%%XHck$<{Slbhd0F5Karla>J7r42e9F-UXn
zE(9N@f|meoH{<rhn<FF+4O<)eOZSosK!*guY2Ag2!nv~BS*v3iBc1@A?+ZYr7i0*Y
zLVflC7ojcHwDkzlKAa+rm?5l?2LV5PRh!ZYEU1fbUFG09mS+q4nAnyjii5W5QW=&L
za?>o;D3^dBY5bQRD{m(oQdF2Q9q~!8FHL93<99>S#%!-trP7fxU^^*mKpe??UY3rv
zZq?zqvhF+bgmM5<$5a1dtQD&~CizuPczfm#<+brrZ?FSo++gr<K5XTi@vnpN7$1O0
zU~CZe(EJa1Mp6rLB6Lkz7PBuPB>UGNnoLWy1h~ex=?(}XUcB`t2_(kwr_phQ53;Jx
z0ed0ZoAw1rE8pzS9UCdnk4s4=EcO|^4A59(vx|aQ1+6*67~ic2kY=DPhCQ^{V(~f_
zc<g4XI2x*#e)7RU@f@6soHV;ZEAJz&UWx6&WEvuDmk`-&^QXhIt@4?YC`gI$7#dn6
zD9o4m9|ZfSEs;;91U<RV5{GV7GuN5t`kS>;=fKMu?9lwLZ=XLbzyV%G#7p=&Vp~{{
zFsb2#8=ZeV6MZcMOAa8a;j7rXHZCYsy9MGKxqI(Q=HCZU-BDvB|F-(<qo$!GFruR1
zxy%N9<9RiINkB`Pt4Jh@C^GE#b_>oZQQEMlcVGe$y~6Mt|J?L9c&VIvftA$S4Pn!8
zX^QA<OcO5t^6|J{p(vv=ZWo^Ah{fB+#tGhOYCV^@z8iw9CRt5DNUCED9){YFpHgd$
zxt=eNYzoX0ODS&S`EEQY)VJE8&ClbBWv*ZTQGfO&{4uS7zn#-L?TTF!-7luE4X#F&
zYP>+*Q)?|4!3`vnfo#Lk@1r$yd7SORKtN;2LK$D%<}$wtkFM;p$#AhSR4a}S#+|Y(
zczjQe=K#Glu+9N{0~qqoykYB%>00}z+*LvL@Zpw#Ggdyt9+2NR2VC_>a+g@s3|z_>
zR7QWXg9<MUm4J2#ln&(JP?m`|H~9~*A?0?4j>XSuA@`Xa#4*?fFPpcehAwPM^k0vh
zVLA5Psi1!Tqsf4<HgepzrS5<hLi!z_ZTpzEjFXDQsnIKOF>=&zcueF+W=hE>WYt0)
z83ZIM;+#{1&(KKpyEaJ8SvVtt5!KcMa(4E$P!QGM9{mR5h6=R=`XA%PpJleh1#>-Q
zFK3Y-u>Gh&XKi#f*rg%?A@ZijaCd+_98pKlm!+0RxJ@Kc>&?j8^^vUuU{;<c^0<a`
zq3VbY1&pqC0h8yWZ=T8rcz{k_DoR!;Ok#2I&|3G)zcyUVn}R%gn;0;2qFQPj(58)s
zcEU8s?9C3YX!fFfy1ad|DE=K<uIL2ti1RMn@ggB2fm9&-v`hV4Gyj=xkJh11&QX6S
zs*l(x!N)M_5|ov!`zopY?;p_+jfc$EWr8>CKc_deB;GMdt&L9+8K5k1)YFm_s@MHY
z${&@ny5;Wr+<vV6A`b6|X6j-yItRM17M_wXuQ)?8mK=t*m5|*d>mCUs8Fga|6?xp0
zmT%q(;~{bfFLS^fq2nugAAR_he*Tod0*yuV3Wc@8eXQ~p6xA7xKU)A1g!f+itU649
zsa{<qp`iwaQtcU3u~(5MarEO4W)5__YmVS3DjqQsk05U^ec9RF)k<-scmFlVtMkU2
z-qRWv;}a7e;w}~3_~6ruZ8fv$xojYS?s%9cb!Bp8=(?e~{jt^We2dEIhiI4R`Op0V
z@rwiG?5I5VmuBg68`S=N*V5kvWk?uK-J-p<O0KE?rJYAFip*9zHJWC7S66hO>n#Hc
zur^bm4}-LY&v+CFrBD7bR$&MM5b|o6HcmEfRezZb?q)$8%#Wh<&F-Ya<BE@amC!xz
zCV2yT!wos6v;{P7!+`<pp}?oh2KD*X!MKx8dc>v@GiL9ifo*qSdfStN-A||&RDm8B
z+bvlLzo}cSesIzUoQ(m#1>-$@TeT#Pyc`Y2b#YUwXeYFrydFd{E5gA8vroskW*wqB
zT}FdXT<AmMIHLJx1T?ARhlt2RHX@;*SMZ@w3H(|a7pa<B^m-Z<3{Nfuq5+h1fDy1v
z8?&^ne?AAy)-E$`3El7gNZ0Un5!XoPh;N-}aJr@SlkT7{E)it6=Bri_pz+%SCqbZ<
zFDZzyw^grqhZpZtaE1f(#27Ng3}dh!`t@7rBM5{@Q2XW0)bFNe!LWU!68t^hc;U@U
zWMH7Ibfl<UTymE@MV}@%o)}TzLM;UUBXWabw`T>C4yDy%wbIGl?EB2y=~B8{9sF|h
z?0OeU34~HFuc~RYr`#y4ZBHRrT$22%S7`r@FhnaTlpLn}>v}m760VS6cJ`Xb#a$;9
zT@BFvYrODI5-m!jcw+^1aXou+KB~g46Mu0#v@|mIQ*BlI>Atpvt^K`5vpZ9tbF6>5
zd&$gjm!mM9IsFxf=2@r+T^$>!*0EX5umnZ&(ImVrpS)%V<_yJxr<_!qb0NqT0t}|P
z@a>T;wa^th{dBMe8t-Sw7mbShCHk;w6JUi#t;ra$k^Vz3MG4U+s~i|$LjtoHfAAwX
z7mh$K=?V6V5G(3uxM@h^2<PBO8N<vfxYfjD+Y|Y-?e=;&SYNdx`Pfxe*Era}*rNoZ
z(g*}dQ_lq$tO~)OR7;<THO?b1Mlf|D>%RDn)>!$AvhAaX>g=j4HmQ9u%hFVSJ+h`6
zKDTQf*Ayi@Dc;Spt(iRX>R8B(DZcvS>$0Jm%hW>r>j#HxgOdyj1oYLyGhWn&%ahsj
zO^p0Ns?C#XFEPM@DY@xRVWtIH{)pqVYn__qZ~({QhfaNF3rU(1)`$uW&YNQ6r{Jku
z>iHU}SD*z?b)nx+n$kYwB5$q_DJb5EY8R}}@=1hpX{OcU0J~YU)`UNV8~YMPTTHJO
zRYZF1_lf+%_)URuBDW{-GYmm6`;@t&qgG&;X<_5mj?Ej_1hCSHJxU8lf${8~Y-)HV
zL>wPQHh>5+H^br)<L2%%rI0{i8Z*w}TE@I4vW>z2MR+b@R%v<dR|;RdDU!Bi`I`|y
z=|J1o85%ZP&2DvXU*Xf>Ub5A+GZw}4(jyvipMB-A2o81K$(QI#@$Yo^Zx;Xu?zN*M
z+GP6W54nce+)VPLF@&T~+hv>IxxRn;M-_G@5UA;=Fa{k3)W0nagf8%bLHbPRMEZ3q
z)5_|V#OiFg*-#XYEYCN>Q0a5xJGn57v|&qb^~3282kvs=k~wiWs9#LvTg0&+w+bH!
zdAANLF|+h{MQ#osR?wi|H|_xnG1;k+7$!!wWHb-^%2@wI{lWX3_iAbAfdZ*Jc0D@f
z%QZvabs39zP2W6~=k%6@NDko;arROnL;{EnfklJz@rMqG1ax4>f;eg4nr8PTV4JIj
zv{^5*LD>!wcobq?3<S_PfqgfZDUXwaEWRxS=&-5H0vsihfPmsf<cChEEyT$?sDW!a
zpc9s!n!R5Vn*>+DUYP^7T?7iBIrENg7o1;W1|lz+!XrYO&%mTza8DY>qv0>?Q<J9|
zb<z0~H3|B`i1wGn{*7~Cm+dbMetd`+=1d8^=%H6C<1ynsbZl$<-C-)YdNUKtS@Oag
zlm6#)N&e9LXMkp(2se3+JN;F4X-DaMly|DXHj&XwK3lHJGX_Y+4X)NYS^*>8vP31h
z5yfePC~XC`rG3_w`O~H_NTj`TtkezLAq&q+`3*#1e@r49pUFO{+Lj<JF^lJ`o$aXD
zarcsrWOf<Dl($e7s~DUBCyxU04fQW10250{Z+550sNo3(0kF1~X4(_+^U?YZR+y$n
zqvdGh>q_I#wa4dcslm~kBA9vKXw-wF4=qB3lMR#tD*VqUGs0cif}32@Sh^e4(uNfD
zo5sA~#=qy8&g*0Jvs$mi#Bd8hoh(cAR+hZ3(|2XFr2VG~iy~fq1uQ|Ek(qV<QmcR9
zP>hO_<>a-Ll}^?D$U^Wf6bXxSTlgUJuLa{HC541rLZ!}=R8@!9P4Jr)Gk3Y&-Z@WN
z>^|t_+Z6t;r>D7`SW{^W_=BL+Vm!{hD&U?96+scO*Tlbst@mM>+13B}P@~8r8d~tC
zSTZnc_BJPBMhUjeNkD${-OB7@=@{pblu-R$Nh89F2G@bc9nZXmg7m+DK{$}&5~#!E
zWI+?VnZ<(RGi`%wxf7#BK4`4Ycq;1CnU!eURNukUJYzvJY`1gSCAfot$-hIH0|&cP
zqjvDah6oT7FYcQ(nm2Q+wyc{aP1|EpvUU)rYQYZsVp9YNj(Jy>n}?#8h~=@iB?5$t
znEM8^N*x3Sn6raV!VieRzCx<U)g8%4d&*{^z_xj1Htw5oS4qU5;)XGcRIl+NVB(S?
zoCvb?aT_U|??US_I`PfYVm*P2F~of=X%q}pz1Bgcoa#p8fayw;oCu5`O_whTdiA6A
zNTN_$n)0l?1zvg<VLjvKBt5_OA@FW-E}<Z%nlGM_tr6G4aWDs{a7gdc2Zsqz?|0|e
zM{}tnG2Ax^rAbSWUK6SJ^KWsKo)Kvpd71jsI&>1T>kJohrfD8BlW0cuPaUN`K6~PV
zN5;lpf4aem6k?pK_3j9{MFWF`yRB_rIOeg_(u`0+~9nDK`h>-YBF!XKzWCgTm=
z(jEYEDyvW@AR}D(SG{CD4f`pQU|f$KwP+a&Ge!-A;j)}B8or_1f!sS;Tb-!!|3uYa
zVFOB}Xuuq(hOX?=w(iNo&fLkE_Z(VC@(-7pg(A8R1{EN?zZ7}3;JxETV{*%Jn4!{4
z53h_p_3e#1iS_|oZKTMX31-ABTP(%PooqSqW4x<2=d8kUf)wH%6<h-<+pB@SOa&JC
z#G;1Kg7O}WheF(6vYQ%36+rfq8re!BAOT--@dMvKISe98A!`47-_^$Y2hR*4Proup
z@4Rzx`*Fhk*{-GG3mCErbZfP$e++?u=aB>Ivr5YQYmJU+h|#!n-~ctb`;QqtFnIOc
zR1XKNYy?_b;@AaOEW>ctxSr5_+ZeLJ;SMX<cNf9$H<zTqpY-Gw2YIW;oQ?!y9@wmo
zgnLQtZzGuHkYpr+{>=X+KM(tq$XHjOKVI7U!V`h5F@lJ@+8;sL(T1x*5?kFdyp1?}
zw4^&WGO}reqIZY!x~u@8e4y!<*8sXEyU6rpjb4x8T~w4MBNSl8ibUh2fJ-5v3u4Wx
zwd8CtD98Fj1ePmFy4V@w#mlM6LABwC9!s)pcwSBzHqP?Bn-PYa`09t8vCcP3(Avex
z5K!kEWj#(bI6Q<`ljN#(WIp|AbN^OdwM`ppmh#new5vLP>Wl*I#MGNsmkPkVIX2IT
z!mFx5;LnNoK$Y^$a;&F{)0eI?X+@X=M=0{5@wCJp!aiT#n0r4DC!t6K8!o_DE9?%M
zAntrz7D^cQO`fM4h3HS2s_GrCa0j3a;!JW-m8cBw)~7UwMBUHi|0hJ+5z~IZ@(n_~
zNot2Kr^R8UY5I;xPHsxM=)0N=3M-bxCufB@<&ZJvwI)%5`b<AgR<O!lYUSShWVgDu
znyyNBjm;m4o*Y)IRTZ_XWq<bzx%*Mi{_f>gdlKL?<Re<U_+y!DxJ;Z!ay>qs{2zf8
z(JJC}7ueQ{-+Pi%3Hgx7Fot7d?OW)T54!biA~wMi)Q=IlQ+ZT0g=+W<>IDyc-0%6-
zbhT({05bvi%^CKjGUn_m3(zoF@ht}|7SgQ2qdjFXgMxh4DuW0AytYwrh3iWgvieT>
z+7OMi=#9C&ZqnYhDzK0D*-s_diW3r0PGpEFAEgdAdsKbE@k$(?#0Crh)#?jvx^SZU
z)a@bT-yeFSID~cQ;J>#%OHkPLhVGoA11oMKh_2mIyQOXJH+S`&LL<!q8yAjN4WBV0
zu`6vr4+c8$=cS*@TlTbnj6av-eNqY50w8<}?qC;P#Wpp5jc5`*ezxNBNhdO?xu8`T
zJmhW#tZi#R*69V=+o^NwnFAYMZqYa>$D}Oh)cas^j}48|+ch)a?5=(QKzpi(_K_fU
z-gW!mn|9#7&P2%<(b_ZYcl$N@mrEkC$3>ds8p88`YGC_6=o#`vtyDOZ${x1O3ah_}
z`(tbyF9}#VTf~w0|NR^Z349>Cb{~2`#DURybS>GA$sCTaY9I7?GMM8vKf{Y1AH=W{
zhNyCu&W4yQj_qpS3(UTBXHen&)>uvhzQY`9WulNuLik+Q>W7<{ij>%Yl~#$OOuq5e
z-)G}#ZLwuiQc>g0e5ez6KBFRHe(N9CjSw}USzYoziD|Gq8GgexjJ!CpfaiOgNvakE
z1cWQr!5QALf&p8yV1AoJ1_KiqGHI1oNdYdu^u9BM)q-Wp3TYOHc{+U?TfmKV46seR
zZ=@1-+WCR<t9a)?&UXQPx7ZQra3{8#0xfUdj&9$;EIl^WU+PZ~7v(TL2pBSfY(&TA
zo=8-HsVxq@?!##ZXu@WJ0A2)+%5bf=uVFG_SK`Rjo$dP|Vmf$ykUkq{%U3^l5Q6zC
z-0B&TLadP!JC23Wk{sJB%vE1EbZkNZPnAI>Ezr^6c4{2-QY6*01{8GKD#aA$K2wlY
zShn4KPC&-eNhSK>LXvoB%(0PIwXsrPr`Ra)wG13%f68Z9%H`~|5Lv_uOC1afQJ$e<
z{aCV}hJpK6x&e$ckLGylI(fDAVfkjr!0t+N-2c<0%A!`Pn0a4f;P_REjhn4)u{h!E
zAA`%PDBcN?eky{WIzRb~9dbnRSt7nHI85c@O+rSuER`m2y8R|xz(bblw6Wj6UupSj
zl#0Qtsk2PmDeyX}@Ua-?L~!~gr)yc(TY|V|pumR^^#^yB?#tI4ZXl|I!2oy9Zk|-D
zQ`n*C2}L4<M|1L~kpq}?>;RthE6B~;4ZzGn182LFDU4H8P`M-6_qNXo>?r%44t9&P
zf7Nv)yYqX?%mpxT3a*#9(Kq}(Ln*_O`B!ONAeqYT9%d{@kLig@yoq!`14+VYF$tU;
zlF?&3Q;Uo=#syWMg4v5%deuB~(jgiv>KvVVlEqRorj`oamGBh%tbIkAcak-;;efZh
zU<*^0PG|g_vJMQi7dx_u;xNr_X!(o|@UStZSHcNCxq3ns5v5U0j|$~|Kwpi+XcZ=~
z5!Zj0ZI!?3!IA9s$+-#E(~PBNBEc8l7aPe&B)V-lBxa4J_%_M;Mw(*%EhroS{a?>4
z6&>^KgLj2Rs(@5t>@`X<2nRY)hWcl0vQFaVs?`4>AC(@6d&Ay!)4UBpBa_g_+^-@3
zvqyje{xMXP<l(E{Yq%h@4C1GP=hBv}D(+M1YUk>QLAvliA7oWRo2+(NW7rhDXHA08
zjC451l#|uE2+<Q0A^rl`80uQ2nTfnQ`BLBE&o1o<emHMVbIGmj(73NL1_R1RVD8Ua
z!48chuA!fNgUA8f$#bg=?ME;O)(fkxx6GArt6}MOCz~P-L&yQ`GO%AQ=WEmg0p|p>
ztHVgVG{b-DcD*mxPLT+%<%ePPmk{<-xROoFl+PzPs)jA>x3X$0p=f<4w439z_E!9H
z#sKHt9%I&eaQ+4+F*+Z@;w$tEXE-IAw&Vg!!zp#$MQZr1{I$fNMq`HsJH+V5@)_2D
zFiwvD3si!@-j8IYG}v6Pc<UH5%kvQ-uQ`W6onry?ciZ$bd4&ugtM7H?{OQBroHnh}
zzPD|r&xk=DO4WhceJ)$8+l)(WJG)(Ok9R;ZWKS|=xQX!4<jcyc(+`hrKg>k`&xwpm
z!B5vnXqCE{RTQFbgH-?PkAZDckU!qvzD%Vdv(p)T-Z)YZQO&D<*MQ7DA$^LBedWK|
z)G!C9Xmh0#v@5uv#F6WJ#<=ZFPptX(JoFsW?8s><M`l+=A}CXUPq%u*smXQ-kZ1XZ
zju{xvMS#p|f-4_9!GQHVJHU-gEUaT>;Rps)6T%uv4nD5Mm2igIUx97o;O#j8t8)Sn
zfQM@f;N|H-4$zKUMd*A7b}T6h<-VDp7=ut$erjd@K@Gu6bV#+%aMn|3g+CI!4oCr5
z4MIl*)x+2V?guNB*)$(bd3D2S1u1qv2k7v;vze{OPf8FxaxUlBEEu{$Ya^$e{UWF(
zFq4n_niU5%WQW2;$dI(+dn68)W4@#WW9T)<Q^RD_4aGE7m<_>}+`8_q-}r{-3Tao#
zd0U6O!ddHFXBwt`YRY|mkvZVFMhztL;{89>;V()NMdW`4^T}?x9l+T{6F~)bLn8mn
zm^X@oCb_$@_7VxF7;ovhq0<XK{dS`XJ4bkS*t+g=QUP{25C~-eR6eV0q2UM3Wj-k9
zLo1!W@-MN5gA{h9Pm`TIzVZ?0wp|yJysG@kD*cexqR5&E``#0__q>62iQG4DUVu#y
zAeQFAEbYzGuV9>IfgDDR&q=_J7}Rlq1dUn1$rw7=SBC-%Y>fcXwL*Z{AaMgQuMoGD
zLjr*}5>8)$jhpF}aFAxVweK<9fCLcO_t7!#rk5`ec9{zEpX770c-U>;T`H_?w_t6I
zm^$}pl68+Nu|7Ey#3Zt8SPY;d;V?_MqPR<H#+!P;Z^Eo0dB~Qd3y}@0_tj;HCx*m9
z4m-Jbuf2#JbY!exIp3WIhdIwQ1;8u0lI4f8{0vV~`r5eXh)X*bAyJ1bN9>{^uieC-
zt{ciU@y*?+dBpTokW-EeaQR(1aHo_z@qd>9NE)uba7=%@oO5~`W*0kWIiD&LY{vZX
zk-)gO(&<O0^T2<Qq$tckCi03^T;GaaS!9!4jM200`5P!#&}!dSH-#a~T;?uWWduSo
zN?RW}3~!kK)U0hd3P4#Nuc!)SyAI?We4}PXk0^@dBh~h4reSC?K?Lmojv;hqq}de@
z!VQ}w<E`EBuVQa43E&T2%HOtkoPk>BeZSv}&@BdyUaG~O3w9}O_`|jq7k&+tf^6C+
z26EY@!JhVm*LyVz;0E~qDgVo`L!+&MGZd_W^OoD|Kn{S`0(aJ@@c?hwzz+?a2UU0~
z(tbgra4c+Y$g6>~$(o5L<j#$lmqh9jLmaAzN!h($HEH|2(WzLY-~W`#(NgbVwml^s
z8fmA6dLI<aEb#TTN4?^A8w=$@z{IQRwmpPZK^V;D#hXPtX0QW%WcBa=d_%EMRcHH^
z-<}yV$lmWK%Psm|tJrE1FpNJu?mchKK={^$j2Z4@uEO|Qi(Rq*orzsA%d;h{gD3wl
z<^J=3KBf`9J}=kH_ij^P2-PpJR*-osu+knhTp9U=a*UG4pZ9L}ru_1K-G8po0SmmV
z|B8WW@e0iTFSOtZWOQe#>0`#vLFfXcXlb4hfowyIoW4fX*U>H+pmL^Wwu}R838CjS
zMEsVc6$RUUc{H38DXQ&@JcGFhdE<iyU8F9n$p^T*gN6Ca@{qDUc_EZpGvUc+{2fOL
zn=5|)r=s<q@@Jzq9nSOb^G8slN`Ahl+7~whFWOaem);<r@(M@0wP!C+=e<`yb4EIe
z`9&+2y(hw75ghd1_kD*=gr>a+%VP=L5_BPOJgx?&IuU7W<m7r?S%DY?J2tu@WWX^&
zu#lbSCTZaeUh;LgK}BB)+Yqt?QzQy~AkRdURHk#*<4~T7YKmAFL*_Seq-hmX&rP|K
z^RI_{QkMy?+%4z{UvNUPm!0;uPi&^W{8fWtmC_$RcongOSd=jX{EVTh$@@O#R=YF<
z+p!Ky`FX1`(P+2oHKqxTR@$j>NypitccGR0k6P^8VOyF5r5nQ$VdG;H5uh`rfx}g<
zgtg2+W}!bUD-c04N1=klj5LEGBxX2TaW4lR|G!j5IoU5haljrku?#R2ijHd;^>@|}
zj+S4~)@mYt+Dw>eK;x7sla6bY7dAO-)cgFH{t$JhTv4Q-cD*R$-^Flj+mJ?$7MF=s
z*)PaQz&<pEU?a=V60Q$!(2ujeP*m$4b4mQU!f0suu3O4ho1Yn#z<=-t5A^czL-x1Q
zl(y?Ijpf^8oa3a{x_)NYChoPbj29_pSsPJYS0!N6)`kz{(PQWMz6BX5Wax|*>+Fmk
ztS!#=@2MYCVc_m^yF}LIuZ;Kt0e=6t`jjbPdgs_J|0?^lP3qvr+Ma7@n8spzhVxJ9
zoH1MJ^~UPZ*01;Cr7jDv)-7s2W=7}~?b5QL?#IJGN+nDr6{U4%9is!>lbBm0EoGe_
zNJ?Pryxvrt!?`W!Wb*f->QfXIDIP4=fWT$3I;2~-k7IFArIm}f>h=vF{(Pz75UG&>
z?eQ#bDnA)Fg}oiYIOBB_(5A4Wch#2&u(>23C{IUGDwJ6RBT&KM0$wEpoKslyMfgEG
z$gsvYVX5!_I>E~Q?}0r@<}8<L*u<Pi{{h0w<+^YF83X;JV*tEbg&Dh5;W5Jp*DILZ
zvRt|DUw^kmI@0Z2*>qsr_EsHlM)Pd=%^aRQi~$ccpFOzf%M0@R&n|$d(MtOg&o@S6
z@E48Pb!=0XpC{Cc1D=0xikJYs2~HMV?%O)#HMW-)_Pm51?E`yGD&I=*JE<Mrdg`@@
zA0#lglqa@OtZ}s{EVZ^VAK}6^vhqfAeEfAVVzI=yuXU*FE7b9|v#;y)((JMytw`F0
z{FC^U*Zs13K7ssn+vG?-Iv3sm4b=Hho_rsGS&6*mR;cwi-QQsb@~C~;{dGR|0-{=p
zOq6T#O7t?->T*8Nena;6>)&&wdr|xIUZd#ZU2#T)d3|Iqu@7DqnMlBB)C)Fv*d_$)
zfU$#KDRwf7&%p69%E#^1q+xvdK_{fNruGvoLP$@sbQEaA0GfVo-@E-&ALh^}_oOuc
z>6KYJ1(-T%qV2qJziROg%3n+N=D!`237|dIn6V>Dj@%zK*NX|$c<5;HDO^Y3_Yv5r
zlebJk5;>{EHtOOYFQS*{6AtiaagfapM%L!C=DM$yCzZhsY?H}Rc9_t?EMLTfvQj(Q
z*t@9>gck8?L${KbA|t;!pIXmFVT@9eye!6cGsvWl`MV*H)e&`a->sSwp;U=2sV6<K
zE7<(ME;1ZTAgxmdKQW&2L($DfXG_!5sd+Iyc}BPZO^$Gp_)W5k0t;o1j-bjR4$_^7
zXRFL*UAZi~qgbk!;M!9O9OyDOBa-f3cq{X+E5t+Dp-JJ_16bK%j`2q~bN%t-2W%f)
zOXE#*0cR${XLBfe0K2=y0De*2rF~V~5~B-~n4enkvU-czMwVUkSM<=Ej0nXI^`L|C
zpgFEauI7(aPjjjPp63lg(#pAl%ZqB>B3t3bXjbv3-X3p&zER7c9?NE~x6Bl!9;@2Q
z*f9*q-IU(6;bqnd-)z>A2pMkj$5CQpjj1%c@Fil(!HXfd%yZiOLmqby5kX$@OOiO~
zCL$AVLteFoP#N>w&ZZ2RpKk&FpIB{DEvO~WoH-#UsKNntiMe~7AHD!rabGb?)-n}v
z!b<SHIK`*FQzJ}ZRX1>t`@k~uDgNe>q}h3>EN%PF`wApTYMfLE&Bpr78g8}Qi{j8G
z+yjb?&aRU!{c>X{XhQgmJE7CVL~V0c3O7BXYjV%Q6sDW=3Y#FOgAnjTPkZy#vDo#-
z8@*QB8Rrtq)MQsY;gnczo_?=Qz?U;iK0Wt#5o<9-ELu2(PVd+fW2pxTv;YR)srFMh
z-2!3yThWdC?^6I=%#Q+mgZ`#FKia8SdfJxb*ED@azQPjI-+D_G02MMbtK#Kee!6kQ
z5a-MC8Zf=*qD}s99+nPqbV6zto);!J$j<I4BP(VrCc6~Uw=&)ASr|1elv_;sNSzcn
z&0{t)F->7o&5Jof)fyZy<8rNjDcPpysYNqGO<d8o0`_24J%(3a%yXWmMuOR59kZy$
zJ|iE!Z$r8y|2tBwc@!v`E{PJ+3nV+=B%gFsM2JqDJJ3hQq1qT?-7xUa+2TwGDKm`$
zu#$h-<7cHf)Ktwe*B32KEOZmZ%qaS1+bAijA!PTxq|p&dY%Xou<WK6*e}~gefGs6-
zV(2F`wp&1iQ4Tc)SOz5Wi5*VAS~%~g%TP3vcAgaIYAwdLc@TQvj{mr4Jt<N`gF(*J
z?bN_Lxl|S|gw>jLG0=T%T%Li3Q#KOd62o<Khp!v^h?FA6n<SJ*@;St#q+9zu<_;IC
znXm3eKr~aS7D`RRLG?7XZU9D`z%#1umRtTF@D(8~U~q{ZAq`&s-$kk3q=C<SEUvAH
zzxwqCJglDAfb1_8sR3jWBQL9v+add+Yqz+FR6Y`hLPOgP-}4zX&yCjnx9_e4bex8W
zMUa3(rbpsxA)9}!sJ&FM41d4_Z#imo)(Ayv+=y_U&G^O2xLL=ib|{F-U%xO2pN#nl
ztsCVxIq(C?ofGWIvQHbJ0*;#J6m-^i*`<mcSc&%^>TSc!mws-(?#=biONnM<RkEC%
zcIeq3Xc2Gk*T?s42ecTNYV~01a4P4nz4&1e+j%d6*xL`Hkc`_#!}Kom)Ywz+)VWh5
zu?Y2Cy2Pcg1Pz&28U0?y_Q_~mdQ49L608tpa<P0Z!St8sEK#3S)x)^o9~>yO8mA4k
zYfmB^NhgkCI5QiZCbDZH$s<f;zDEHSMOhN$&nJCI`!_Hlfz+bc*9e2@BlR?Ji{f%3
zv04j5E_6{EM_Tnh17<;*C%%iFqo$??U?2|V2(Fd&HOSSM0(G*IMY9SS_SjD1(>rv~
z=T8;RZJJ}394MliQi#*!uT}7%EPbS`s`R=6k49~1eX|Uf6jTkcpRAq|PJJHq_4=l(
zkNHRQJv}Cg`W&)PEjWMg9p1e@7F_6EE(53ExVOe=Jt|{_3t<+SHY2NQ!M1RUsqrnq
z=9}PQZtQk*eV#&duZ5ZmUw%l)spBLR>+6I?Mv%7vXD$2m=)m4_?6{}!5iWDYsjy5<
z`813yvi#=Ic7iFH1`3*rM7HGlVkQKR1(l?j!57pJCBL!GXyO~bPIGbkE_e0zMXz%O
z8&9F?DTwuLYUg#hewoNbqe_T&;zRM^yt-d{{dc0iL6CgcqC#^pfQ(V%^FCGEx79ky
z(v^PR-H*$B6Hltan3Z=EQ{S5YW{0~MaH&rr^MU8LB+-_?pYQ}c)|?uTIRgwWL_;oE
z3Ny-K$`P9zLJ|$eU6Ggi&5)*HA`YIX5zv0nou&)!Gz|<{tO9UEp&IrP=)lI{C0%0;
zGJ-<i*-#XSA8G1!AdfCN$j=kWUfJ?{3T4u|%v%mA@^ZFenyO-=<S32VC?bb7Mbn4#
zQml~y!&D4>UdqYD&D5Km6%BkReZ@fV)T7xs-iesr$6<|O6x1!J_g!!EUI%A>9@r=o
z-=OjWN$dOK%83d4$*1KeO*sZ+u9Kc8otBzL`B8V#ziBJi6sp;;LF*(hHz5}f#UiPq
zor_`1Hjn*Eh7?zco@K~}+uG{Z+U{4TMcFn=T|HHkSbeUee52qzgU_HR<j*VjMSA_-
zRj^s${-Y}6KiL%PCESM{ce$U+w)p8t_S6g&0|L+1&p(%sMZ$mi7qoE>!tmmMcnb+8
zjJJpoP3~ss!`rJl?D6dt%#v2|5a#8=!yp&xr9fuCAushkCZ{ENak(-N`{H}CuoR^^
zEe85y=0N__%)2z5;1ZfyN=wu!z1G|)B>eE4EE;6<w}p9d?=zl4+h$kA5rs7+O%_u<
zf%en;K3hxd*aqZ99&}m9i}Xrw#Z`$QL|5Sl%^b?@wAk?zwJwjrdh2?h8ixf32{(8>
zQm)V4sAC62(|qna$TFXATQSTjO^x^1G)e4$WqF6ke0{0$-GTn((_PpPK4uL+eWM?R
zFbuMl5*nPFo*Ofr#iw{vS;0Voxy^zZCP}}%l<%=y3Uy<Z^7bY=DN_dZp5;HLOD+mQ
z6V`5Lg*Q({cs2#fvu)w!<a>Q4-`9P6(~&lxfAxM8c=0pa1HLF5KORn9q&bI^^l!~F
zps9>VmZ$+VmY;}4wBH8{0Ee-opG%!_ElMoS^k%W}1Bd<~^S7%$?y_f>%)X-5ZotKQ
ze2`&fm(6L^x5mPYKw>-DKO-1OAjRmAdnZ;XH2fi~n&ISeJ^(u*z3|{Mb>Gx>NYmE<
zo3I}vF1ioJh_oEp=vm&E(PY++{T?l&>@>h_IA5i}lT^PqEd8R3*k866YR5^k;5R6F
zV1-SLPB`3%3FP0HWw|Wyjmuk3wB!m{7&)Kg;2p8EeZRJG16eKwn+l!n%LUCja#>TB
zGn&x+REZpAmCg93sJ6o<^QK5b&Y}!km#_SWJ=+SF9vnJL;DPZuQn|J58tX_UxqOsl
zXGEq2WNL;RR%B4EMtG=6j+w?gte#U>jWjcS@9oapU-0MTfowiPn=89TmSg*JbH9$R
zwo1r*iA50cMZRo3S1UsH=jPB?o!<U@z5<OzDI(Z-zWv+pn_WSCm%G5WX04A&?i6r5
zm95KDUvomWNTTiA?d*HBc1`uvih+I+3w>MghgJN665e5er4^qr%FF6FL~Wr!hcAi{
z%GoLRnxQ(XmUCf=%bRbAlLj$kJQmBTpD?N>Y1w*Rmt(DK6?4RFmG+6f2Y3{#`glQz
zyVvv>O4u`M!MKFp{OBoh-t_%7{z2Zr3r;GI%4`ZQRiMT!R6lZtV-mSWX)!U-oBB#D
z?BjeBN5$HVWmcl64Lu0;PseOi_gOS?GM}d6gL%Wa&)cosCpVgBih3L&b+^RhKv3be
zppmusys7U;;vsXYEH(@imq=l~e7MZ$>8ehhlIw1@Gm|FqJTed&%e_FG{r>4?OF7v4
zr`W^nc)UekmOOQYN#0oHKn@MV0rtjMD0*f?_kk6>i2X3#N8$aMn?<6M5svMW5vcg}
z`3Ui4A;>gfD!cs6@ZZ%Equ8i|#psei1kJSPtZFBn)DLV5C};r!80Kr*(e(uhNA!X8
zS1)|X87d|`k{_K4TuM&w%5UytyYT6Zf23~d#>l{u2rmH9dTe9d8m%&E<%7FlMEiCY
z;KkRMKRe5Rf#~0C1AH0Nrc8OJp;4;g=+Rn}84|#9R2lRr@{q^HWKXVc!=KT9OiIUR
z`3HwdQUaGgIm8g;_;@!9g4hZ?!ilPb#c_jWTfIqxHx+dj$9WwyaR^I_D9iZT;7z~&
zbB+_SwshtN0kHZfVp0w)eC6Pq2w4?^yr6GWtPF@6?rdXP`>iYvUlUt&y)ff{`=|7+
z4qVdf(+I58VITUn-<K35t00P0KNQI|_pEWo8L2l4cTzd2(pHJ_gSM4R+QV4b^o)|y
zeS^{2FZJ&hcq`1+40L%&e=_M4t_N!aw?{YmWX*QQ<m3s$oeVtod-Tq*cN_8ZA3T>)
zT>M}?dQ>k>g>~lur@fIOUDaQo|Ayxk&)?kKNTScpg4zs!e;RUMaa92Z77p|JaCuuB
z&g{XS68@i&QfuI!VD3`OVXLLClr4LCY`lf+WH8|C%WTihmG>ia+t{HQKv4dA8zFi;
zb<Q8MeJ|YH6=b<3&s%H`TT&(^t4r-9mTx`hJ-qQk=I~3!4_obqB$H<Ph4Q+O@#P<y
zZM|&fRajOgg!9pX{*^0cYJ-ASMiVjULJw5AZSve2oVjheCF@+e#PrNt8sF9l0-Q@D
z`p&N!Yne@J$>NTZclff~)6R@h+66p;oneCv=DOs5t3q2dR{8B}jSvM9hI4x5Y$^@Y
zxf!mD&*4-HYSQad8^|4>*K^T<YZTMnn}$Q{)#15f_*?0`icdHjbPX8ze9Q;67>LyR
z(z@>imPJ?arY%8E51R-0xA3taOFnX&IYszodOYws=e*g|Qv!EjGi0jUdjzFKrgP1&
z^kJPp2orSn{yHta&(FR0x=y{^B!I<yas^pT=AZm98i&exZVmuCs@?hh$k@Ox_D*&*
zSyV6A^UnG-37}(@53OhV=DLljO&$^rvrTEWxwYQ4GWz{$*Gk}j^AwWWDE|IwY%#`w
z^H1-5M+k8g5<-Zo+VaV{jx;<}2XU_pAN1)H;-bg$k=6<#97qW5y9g@2=Fx?hxAE<o
zpd=vUb>AQtCPV$eGRV+tL8Hd_Nd&Dj3kq!>78V`!Ru`Dj*&~-ONg(EU4ixbe-^b2^
z0qREo*2koYyC%(Y{jX5s`_*UD-h3_IBWCZPd&N#(dNvybVKoZ!m&E37KB1QT1C#XU
z5mnyhGBH_Y<6}zid{wX7-b!Vv5ltWU6zMd4jZTOdjNNkgWny@f?kI0a?XFQK9IE(P
zr|!X6A1&#ar|11XZ;%{g6hrSdTmp?<W-H;jl^rg8{-viadA|RLlZk?zJiUU-RNNDx
zm(<8~+aFuQ49pJ9XAY1?E_(+B(47ssu8a9IXpGzc&;XpkN2!39KS|@4Ovwl^h(EFf
z{b|E&7<dph*BZJ5Avo_g2%Vpu2maIiab!gueS9=jDs*vA6h*l2D8(y!{^GCD&O~@Q
z<ZI`&o1ED|);YdNxOQD`bY|Q*t}Zg_oih?PYi(H<^$e{EXo62F@!xp(A^HAEy8aN3
zeRS70MPc)WV~EV#Rw%1me_jc?k^wEvZt-~W)zjcgxxT96npeXK4f~Uq`u28&iipee
z9q+}^rk|ec@;4o>JykkilhIWfp%Ou44R5mpe=hg2;eFz7#=k#*v8135Pg|QcU<H2_
zVl107v^J6zR5cn}7tC5JGn&e>;w<`F<niGPJ^BK+>nStQQYY;3v3}*a{*_q0tT@%5
zUV8i+J6tyyW~Y3330$T8j833EgE7c}97k4d-{^n|YWSuQldH(FRbFee2s>nNX(s%w
z^k1<CE26NejjGVd6|^53p||Kizs9cG%z9rn#P}l0=@S<Fv|-dYAJ`Hc_<G$nbZR&M
zxrgE>yH*29VjbRnHeW;3u?;Wy^9t!dee;Wt@JKD)lf(fi#s<djp_%czU-_48A)Qr>
z(~~j*eq4~Vtiq(SH3P?sTAHJww5Y6aX_9L5BXr9jo_Qll1vb9j0`iArK4+r+**Z$J
zaLjW0)4mTsz5jak#(FmG8`Q%6W4$N`?VCICi^U!m=))nVYD%S{Nnex1V>028L}<nX
z?{zSrWk78*S;~AY1!e<dM(tQls8L7pO_QKzM$w0t^v*+$7vq7<^;2(e;3g7Jl0M>3
zkV#1`w}IbyHK0aM7bZcw6Fqi_L$d8oTD*zKUqQ6Afe`C8owm36VoA5E{+)IYS|649
zox@m!m^*zU<IY8|7@c+@d}0g-G7B36isC&)-t|F~p00ZK`0+mKhw9duO%jXI268`g
zXdO14pT9R27|$zvE<3S0?5>=+$^U~(4F2Mhx#LT_L@nkP@+ubN$P<^Qvgl3!=ytIG
zNd#wK3f?@y|B^(p*D#V8jF_dczyxov>#0El$6bvw*@}5d`LLA0voI#ma5!m0%c=cr
z|Mu$jg)38@R<yN_&2u66lDoAhM_8&Y*Pypiy!FeI4EB!N)$=c%k=3P0`;<1%Vy?!>
z(f5wg!b%p$4}t;-NfjKvektvRjK7Y3jSOpH8+G>Kb&HBC@OeN721q-Q5ycISe1)V)
z2O{R$1SnC-%^VWBd23&lHV@FJ#dqU3QUv_%+m7dcJkBeLY6H|9Vdc`XohD5oZBKIH
z_Nf!Dt*FWo#m!4g@?M%AIlV2mQW-&K<lfS$VElCYxCHe$aQSOi;ZYdeK#+e}&5FaU
z#A5d6|99bhEpDT%2$KhWK;EK*DMGK^Ce4_%L=kf1hfJg|b{KR^bZ`xqzk}*zrnra+
zHxNvN7kjxCPaUrEX+XC+LK?TpDt|<|e&GQ+O=Tb8>=nFpHP{1=w`^DC7yTddU2BW+
z0IyFGFA<W+%i@xxIi9_J(p8|-)gKOyeO_^V)UXutQ2TWV&h$B_U+t|FF<;4y!S2N|
z@2qoqWKQ-J(o0PqK8MF=-RpmlRz96jlc1;=yn3;2?SudlM;fpsje4xb=P8-uN@<FE
zRBVH4W7Zrbd8&P<J(h@xd;_adTlIERkFK9cnrtyY#?CZ=4Ba#We+%UgB37;57Kw@@
zG~&68vPNXvk7$LU3nbFURwXvdHhyckYU{I~_yy+lwvM;?O&?t{>N4FoZjz&>nXo?h
zq0-n5<h*B!F*x!T*EUQonk3@<i^caWa2&GfgDbBzpFzXOoeO<MT_gn~ql*8#M)v|0
zrhwC}^6~rA&^0%`Bi;Bv5RC%fNBU0=cDYwKO?Y60YY+c5@a<6XekL=AEaUVys%{6f
z^d8$Es`CGn!h4w<@fPbOdd{5HC};AbgNIw;Prxr-j=2j&?ka)Z+t;ivMZ&tr`?T{`
zD?bQ3kyIZfBEr-UM3}P59A7P=%XxU%w7ZWuidKsc?W8!}__@>Ss(!rvpj+9e@G@hq
zn7C-U#gMd#p@i^W?6HYmfb`O-#a=;P0AsENC;vwIv@di`T;4rCMI#o2%dH4=1D1Ch
zWBo7;-j^Ip1T!4?6W-IVJ!(IDXKsdcbepMsr!@E|wpC!Soij#U@u+Su3XpE4_X%(9
z2h>MxPh@|pBkOznJyyNQ08eH3%gs+n8T>70Y50shX(U@t7RNi|#leuH?bd^!gY7k#
zN8lr}+|oZ&TG2kl`UukRbATa*_H4CxzN1!LM3xJC%b^<8mdKwWZ&O64>J!E@U(jyf
zt`l2nVEx{F++cj3-?91*Q;I_nPggr>K8$rmfPREAJbNenSp&L4c~Wq*46~c)Fr3d~
zMQ8rb=(e&gf&2A&JGGzay4F*OEBBT9^6^PvQ->`9_$_~SPss@pMgxnkQe+s1?dSUI
zz$2O9#a}aEU8};Q3f<2EVZz>Sr-cms$J9#F5gpjMg$zzo50^!b#L?+oS>^l=+p76m
z8vFDy=oC#KT5Mp^i(c(0_=9SLa%g403@%e)=kxK>9&1|WkuVtFEtqe{!g#55bKfzo
zb;SLgU#sNMO}j_9M}O2M=`O^_I4)2<^G3m!G`K~{X<m>J-;tf-4bHK_m-V8&jY_@f
zRJsowY{OrBs%FazfxjyI+|5XnZ+X`DLBN4=VeKOW>TytQ?z&^?kp4xo<bR6)0Rs#e
z*DH?tJU^!CWjHG|`jb|?Ix~HA-St0t8r!djoRYI|{)Nz=Ab>yF6wbBkeofV;M`M7w
zMZ4ljHz_kP?8Llz@vQb-vNfwDyd*l$=ELc&es-CEZ^pid`ufs*>3GZTDUtpw{R&^S
zOL3sB>`-q^W}Q1T_Mb^5^!m@^!)x&DanhM2lO>;iaVn-se|zhFMyc8@&Pb>kXTmof
z_p)AiOIES%Q?*q()XvLPmJeIwOZ}n_xr@-Dlx!=&=OZ$T+)ALup7N<TYz}htskI5y
zfY4i7q37kcN=TJNcY^k0;f{_4V~QCckYEg%@Fdk?GI<uW;blUUGI@<G46J<=xFg!m
z(vfL-#7-pa)`5Nh-v;}1<UoYzH4GfJ__W-4+ra?AzQ*U?zpI;fHW0J$VUil8KLWeO
zUOO}K5n9qBC)iHhK`++@aTyO8*c?6ANAJS7t9;}JUwDW6RL{E4_Ssxw(yM^5CCsNw
zwxc`=HVFM2m^782S$Y%!0oTUuqnAVtiL`jVX_ycRSq>v2g0L#XVEsgI-89Xnxsg%I
zq>SXJX50Y&w))0NYdI3CggCQPu<}@*Hl{|9Ckc74Qea<0)f-!mu$~CfU=(yJL*ZDh
zS^N+95BOJkaI>AyiKkFgT`ihFBQ?fj$<aj5s|beCsiqdt$K=RJQhdtrN(IoNgcHXC
z+BB<up=R##nq><0bNaKp@XuKSgB{$aYX;Ua7g)n|#H3P??_4zl1*Dtmaw2Ht{531`
z@k-5#K+Tv?LfE_RX%Y|;iI@_yOF?WAW^`xxK1TbHjYwzEfv)*u!8z^v=7SXW^s1Xu
ze2>v$6ItTlDElc3%bACG=T9dZZ??^KCt~4}W>{1E7Z0SQ4zI<U{?l&2B7pnZPR@Ad
zx|~Hd`pu(ACM+xN#^8M0%!@#i>E6_P+k&W6Ay&jLWGV6p<P^WVi12~Sm-%3(Nv2-+
zQ71P`s(i@10r!&LXV>Unx)pusm!L_m-pM82|6}W`1ET(#uXjnMr5lv)?iQp&x<gvJ
zyH*60?(UQ>0qGT_8>AbgyFnIs7xeqY?|uFwclR@Q?wK<)XYO^(ln{cm@W}<l`ckgw
zvo<O|6dzyhFp(}zi~L%6|A~;QJJ*j@A}W<@D*JoFZ%VT_cm`6o-Y8P7Gw;5Tg=69U
z7^y+fgwVR_&CAA9!wAU-9Bv7rd`)tfLeOf3z>FO~LH%7q!1ToA{&~B%prI{R0;!Ug
z?bdDDX{`b!EU!3O*K2j?41~L+5rhA0!HxsjXuPvby%j?u&87S|LlPsJ`#C?Uu3>!x
zA~6Hw{e>PWeD)F6Oo`hkb5sZ4K9TGXImVqU!Id5U1|ke1V;_kE40o{2*YBvFgwTMg
z69Ku+P3^$E$IADrPdzzzd{i>}TgGAfR%bf!#7?O^alJ?*<QYjWz%MV#On|`acOc^F
zR^Ipbb$VPya3ah6aiTblHls<~JE4X#H}eEj6hBd%sV|+Yx|&s-X}k@wt<;E19tvA*
zw}MDpgc-WW2&A2l3N4XB=uV~8$X9Vc6j-pFe#^F?D|Ck6_?~IuFYs`+dD&9-x<ygr
zglJPlpc46xZBwI`eC(gd=>6mpar57ik7;-VP<QWZZdC8~P{t<~IY>Lo6@DDOBV7{t
zm#FHT8d_OBQqT<St{H(d<>0hlc%>*pMH=^|a*Cx;Bq*52_JYWEWSuLSjLDw)T9uj^
zj5)@LJFj`#Fl#->48B$BmzxSeRA`x6yJ~7pVEs;ymh!<1CpqQ4kTp7Y(WeQVU3GbU
z`<_UDdnNIMISfmcgu>U^e$k65MncAuYZ@P{-#$#MVpdR(@cHl)>h&RG3L$8Y?y|KO
zDT>}<fS56|Z$0tIFY@Tc?kt%(aUKhr(2Q0bE_793Hv2?u;QKMXi{d8$qe$_}GpuoT
z7Wo*L1W}6kT3eWGVuMKECE;HFUX0AP^H-dv&)zDteiBg2wSm6KZ+K9SaTU@EVC%aJ
z2{SNL$scd@MO0RcT8)ZM)W@PyQ!`Y<kTCpb7=DwNe4W<Ee32t@{$2Oeoj#5Xgk@@<
zF)x(M{jVa-OP4tD8ghfE)+9L{2-F4pT+_th=gS(pk_cCHFFty_>hvnJN>vq@X!o5@
zr*)NwJlC?%Z{k1cwzQ!!8N%4FY>>`=ETPfg4}ah!LJoIy*sk#(n^I|>T<i(cSdkrd
za7<1KzIkaQe|Zf9J=$#xXHzKqTJ!~eKTB`s27*?pNJy@mVw0S|N<F6GvVrCespi$e
z&pgJzW+H&OFwfAl7W6d}6+qB>=|ddWb4wq30x9zU2N$09*KJ4uN&o<j6`NT@)9M_>
zmU^#~5-jOwI4ATa>;3rTfEyE{NHNy(z<3Ct|2zUh|C0!0Cu`Z?kU30bdPl8v;Tf=@
zpYT*pgTbr#jFptxgmJ-SUdXGo;&Gn8UCdMwjqRmP!zT&{GnUPa;2CA%j{R$eZ+ZY8
za;f*z`vi0Pq0rqQV|8<MjpEq#qb?Hl91}bRbgDDZi}0z$1O~mraL87a*!m~^t`Y$8
zn8g&UHgD~F`WWxL6I5G6A7GbgzGTc*!+U0frF8ZNlT1S0*HK>pon8z>R}h;_-egv3
zmc<2+KcD_Fp(KLYIfaAXY80H0YJMPu!g(y&oI=LPZU5P-tf9T@QMK8ii?S3e<X@~F
zKq3;Bw>Bj6$}F!$B50ngRwHaV2?)V%(~MeEk>G4uBOx$=@0b46;g<Ky9{K(leWB?s
zwD_VQc_7cYrx2{Ecy_soSpCxJ3z!LhgJ4Z>ldUCoosD{j1g>LfmHKnu2PJu~39F2r
zuL?p8cs)?n;a`pYISPosbvWMj*fR`a@WWxAP-m*wKL&vA0#Wy864fR*pE1b|AO(=s
zR||9KYtGcd2MgjHQ2KwJzF0PPToSl-P%pG{)ylVCv;rgkjq$!|5`Z;p@|lJ42fXK6
z!@hZ7u;w?aMKxl?zVB2k7o6N0izQ`#tf?ckLe3IfCZQ{;afJ$q_(dCK?Niw=@!Dxl
zj%Le)HJi>ZpJY<6MknCrZifnrM#S#^(J@`ehz3<)+MyjQ&uW&Cn$Ym+=WQjR^=HhX
zo<qE9mufqjF_-%NnPyYE-@3kw0VBRhLSdU4jerW7K_GE*H3a0ncgNGUDrz4cGp|Bp
zCrd~NG(XqnD)Ub76~{(66(NDogp#>XHuKkfU!w+1C_AoETBet#3cMZkoDVvc;3mOg
zlixIFP}@e;po7r#>&mjdm=S?nlL(_>ELw8^9M!*Mhz6Aqs~S2N>BRrNQsPz?kQL>m
zD`ZqRuTD!O9z1vun<M{c3z5S>y@ePb=DSlqY3C@09Fx|yk;^oy{aobmQ_uA0?I$c`
z#k&x$AB_~kh!?WY9UeNmX6zXSrXjCqGRR-4$G@LX9d8W|K*I(RqW*FzgyezY^BISH
z$LxNK9erqUWUVw>9kTQ=JEU(KvBm39P?d5fe_CZ?7Z}q3*HIQA>Ql#nwdxtR=<`aB
zZ57^ff<^e-5no~=a~J}!DVHRoBb1!(B!e|h;JouO!AxlksmCFU^^e7{>)C5^<f6~D
z3V)^qS)-o~Vo2>`ptADQNlv@6p7hij^w<JVoZsYdce=cFWC=6S%=*N?|KR>SDj5m>
zxIm{U6%{-ldva(^32yfKD1D3QqHU#3E|L3vGEC4wKO{6+WUD)2SRbwm=Z+3-O#d*L
z9uwsIYDFF5slP1XTCX57fS1IZpdt*-?x2R;=K?O}r}9GYKK!<ixVj+sEPQM|A1X8X
z!<QdZz(3Yr3hJOxpuVMm#RqCO)=I9xtZ8(X)aN}j@0|P+0q8vM3aju!71B+SLdp~u
zFIE2S4si<-u>R`x#cn=w-1j2@*34go5Fx3<{##Epvo~8CY4=X^+EE$^L33X^GkoJu
z#cxRxDthvo$H_T)di&oDKoF4*K|lk2m_oT*=C7TUzHmWyzB5PqVjRO_^<LK^os<*1
za#bcDqZqvA${&cz(Lf9^+oK-LXa%#12$em616Lu)7Ku$7&wd2l-y2aPv$n;9oyH$=
zajD$j#k^_`aIdDSQiXwVJgBRijbgopO03cRY6SfUbLFb_V6l=6Rx}|d-P1l*(%mEN
zw|XI;iPe7iq!aV(sWC2=(K`re1n4^9RK^1e%(1L7wV*n5Gyo`P?Zcq9-4@mXCs`sy
zb8XL%=&szULW&A%Na$mY3c*}B!5Z3+2IxWcmu{I+e?|lD#8|@D6`uqU|2BPVxF|~6
z^I{YsZSBA`iF2C*C1ze|X-&&*K38NKAkT`9!>Ij<H1@~Hk4j3>>`&%wNYcOQfD5v*
zg4351)^ZcCVD2IwrGO0#vS&{oe69kMMMOc!2bQeU=SF@IL)as?&V}a~<RmS*8=xyP
zfzdT`+~^D%cI}w_#TTN4_(^5rrf=>Kv1L7e7Ouhe)&hXWLR#Q1vvtGLH(8hp`?J?;
z2_*E{1o7yzdJE}oR@_BZF$UmG-*T=dQqQgkE^!>MHUdKc@&?H)0g8U~!0aK>7zV>*
z|No;%I+|)NvK>F#3bJeDt&*jOdJ1j}EgGaCR^71y@U7LUdhVUg9lObq9HoSjKjtS^
zncokT6)hWa#)ZuGTk;6w3m*PU=A&2|h*u|WIx4t-oYh4XY&T?J4%Ks3RXh2_{$F2)
zo(b^C^6AHF6MEMgkOHKoaeJJIbxT7i1z-Jx<)hZnNq>#XMr=-uSE5i{f1o}Nrf09~
z_l%stD*A*GjPP1ezWF0`NMklmGy$Sj5e=AhnoN&P@lHI*e686!)-Kiw_Y@`rut4!2
z#9bQ^o=tx5Zo0N(Fn}%G`q5lgm~mQv;l87Dr~&YgV@cJ=XDRbhKI(8&uaRtKbBjG*
zd%N1vVXfj%|B4E%=h;U5!(mSir%!LL+oGcpL4TmO2h&jNBLm4s3WON&?eoJ2)Q;2m
z0vX2mcfo-_d-<b)vl-9jj6z_pM}4{olf}*Yp*&uKTrqvQnuf9Z)dLA9+?OqD*W#U=
zI2E2puLJ?*vkl6jBG^GP&CwCUs7!3>e+oMPcE-Q%s|p>1C@qp!5&8Q)Gf4lMqxBAW
z-}#HHq-~LqDWxdP6qlO+i|89~Owkg%=T$)u4C`5a-na*up|MgHWFllj_Pw3II9t7+
zlROnkE-6{1KZ@l6q;)+$JfD8Fp_l7YNDVFrcgd-n5TT!O8wlZfG}qBPlLm-)yD3ho
z=1evgpI}5ac?5!Pfo7lObB7XXwVZ{<7~~;0@voJF06k1zaGhl)PBB0QD~jhGzJQFl
zA_Xk)ILUJ)ECav2Tvj5(twiBQ!f5%KM1S6f%X18K>W&=e`JI8i=g66X{_p<jqs{O<
z(ls6H^xG=Zs<o8=>r4M(ob2XDyEOm!@)uWK!bD&C6z<`R!rH63g?+DQS3!vZA~m#)
zpU^)uiLH1slf&t~2>4lDx8EFdZ~HhnGkR>5+$N!~{_Br09Ro-MQw;IIssIERz9|Ku
z`HgFh{n+2~%=8ggqYKapywAf2cdn_c7XS~3y$Icm^+;<cI8EB(;6|RaX9FbG2rym(
zH*xeB5r`lQA1raf@3J;9Jqg2%07=xD6|eZWD;smQb)pQ>H{0_|C`E;DMJJJ<QSKHx
zup}blojBjPm$7*#sQ)>$>YTqe5qeO`3{`{;hYcq$QlNP_#3(9w_$g8!)4a8q8qu8x
zA(vqYCr0qxAJ9IHH9>dH-`MeQ8$pvGl&Z@_;<~?w8#BLv-h0;jlWwcT!Zde_;q%6n
zU>M(YUMw!mQ(Y6RQJ??IF^MDLyNx7-pcwGdyhVm}_~OgceAQ^8=rd%@Dc0TC4-;w<
z^Bs%h4|3h*x4d9`6S8=m<Oj(Onq0+0jqj@tPbH19GquYE!Bk*6LsTGZWWv>i?}{Lw
z{><%Qo!kgk3?cON3-CG}^2kgDgyDus*%p9%#-mWzO%HRVlnjj~LSp3okSso+01{T&
zS~T6gP}5ka8HZQzoOvckSb_hI*g(wCXc@4+c54!gctya|cB<<JU_H4oV~spUt_Q7{
z&xjF`WF$YL|B4g<D(5U?x*WJ?qlk}qiLNCwp8faW_NP$$Z*xgx(>$V((%XFDKehx%
zfeBj`d8`6l`E=FdP8fnkU(g;)*{=8yqH~Fkzk$J7gH}f*ql_PqHmSWMYJ3{`JJ*B7
zI~jn4{h2l!HG`ok>2&!7mkjOL=gB1I#jhjyk$5&C=><m-6R$wFRb6;5Is7#cqNV^Q
zF6qx*_UpEgRg6-Ry6pT7kL9?YjT~KoH(Uz+dn%0TFjtNsRQgnaR8I~-Jq6TvBm#1Z
za}Yk6Y~c;P7JpAOTf8jC(%W$>^w^A3th@C03!Hl<Ld*HpxPcOai`w3>UBphR4_JB9
zgaNI7X!04B0cbgRHXaa!+-)_Yc5fZDhMIJAqde{u2#4Gn)pK47KGqX}Zl7T8$bh5y
z^a}9M6Ewo0+$Ep;nUBa<=4gDa+N0Lmi#DaS9PUCwk)k<S=t0CWXFwPQMk3??kly|O
zzNrN8M}7G({6B+RQro_f<-hWlU!x?*1>~wzI=_DukH5}-9Eu1Mx!$o*#ZBT>{fuK_
zQOE@UY`pXO*{WXDpQ5=wRdkFmwS+8Snq9L&5GV380ev}Wa`D8{p<omsm(fuU)CjO+
z>+Q%5eASaIL#+k7z`d@?ss)sw406PSl^VODYC@BH7|co%s5S`fV8RbFaixdO%Dlp$
zDk_buHmCI}Nv|4f|Evq`vNK63X>B|18ZNZ^_;0C|@N_XCfvbpAn3W{O8QQviUTpek
z9C-}fl+v+Oz|l@l;~2ySAjlp^`l#|Y>>W@kc!iXhT6%Qj^&vV`6+o<}E2De3FN~Ms
zEI-wYEoC{4Oahks2R83eP)Dai(~v7=PCQtl(+8didk51ErR$|UdmB3IX8=uVza)1v
zVXqG+nihH=r~Qk3G~3FX=7=Wem*Gaw61KefGHBu>GB||${x#G3M1}qfB%m_}(0RwB
zS!KD$kPCQW86p}{wq-ZyFXE3=IdPz7Im9ncS^IWJPm#(Sb0+ufT0D8u#!6yxL1@6{
zQ4WTE&q`S2U%|rtryL1*F{2BEbwf*PIr@3B>^&p43C#(iWk#F_<43jt@7hWuC@6nM
zXdvcPEJriAf5(~$FB2fr$nqkPfDmp=!qpN3qTHuhBWC0~9p{ADJz1#Oj2evrYNc4Y
z_}DeV@{Mz>F^ptCV}Yl(Y}GJ~hJGvcR%7jA<?GS{g?%6VOfT7Bvpc=tkx^Y!Vg0Qy
zPbg9&3F-iMa=NE`E~x+NcYQzOdA2&sbT#|f-94IGn7KB5^RcP!h3XpHX9R9mfuq-)
zUuP?x#z^*NOU0%dIy;+PO(P(wm8nq&@1W|dRZDPiFu$fCR8_xu8f$0@u)%!5@MnYG
z^Ahq~Xn%QV&$JvN*5GxvVnPQGLS>eA2kF)n;N<s5ZI3&-)rK?X#b3~63)8jp6B_5)
zb~;lzb-r$U3C({;jTE&5`DmWMfZom`mmy1+^+y`@Kfqjx-0~<*<?mvV!`#Cyy~x_}
zie$K#BZMHcSy<|Pg_d$YA3lGT43wdPvnua-UZz^?db*e3VKiqSpo^AhVDK^1Y>*tE
zajsJQsVEBaeR*u1(LsR%M%*b?5OPzq-c3on5uUEp^~(G_4v-QR^jhsi8s&jVu<^Zy
zlRf|0W%1b(grHHa)JKxM1h-K`uwGx|HLElB>84?zt9z>1b*vI97K&Jn=`R3%eC84c
zCifK_e7pTzE`rJ2BdaoQFLp4fjkzVWpq}9TBNvPjr5=slCC<^AGS2O%)~Hq+Q}l0#
zZ(xfh4c_+`xi8sLQjYN36)s|WiZAg82di6>*WbU~a+}-_OU}ia%BHd&-oWqaP+CnQ
z6cM?Gcf#d%-g!&VoiTq@iOC}l%dy{*-ulWA>>;45aqaTlj;?nUW3OzgcIxnUmgs2^
zF97KVp(Q+65a(EOX<(*ZuEB<cBqwTAhL~KIq%%?p{+!<<T+m=mquAyf;rq)wNTiTu
zgVVYv;Usq}{K$G^BvNJR0(jbF&idNo%)j?6Uw_ZJ?B!xGr*0%NfIin45Mf6%l&bP>
z9okAbox*1g7C+A+ZqGV>M*8_g3@`ACidd@r_gq=GN6|F^iHh*Y3A*ueUx(+8!hqj#
z+XL>3CD*;Hm{r2(n@zcMY#L27|4{o;2hZg6!sgNFl6^dw3#$xflNqj_b~nn85ii!Y
z1>m8Fj{N_A^T--OzLd}z&f90vsehVe;7qusK?W$lj_aAb-MQm*$3)(^4vYgE-Xs^1
z-dRV9?x7o}T~pjNeu5s|kl1MPTNCBaga!J84Ygbs+5R84dK)GhE{3j*uwvnWiEObw
zD`^v_;88Om>!16I@_q&7<(!}1g*w6G<P313An5ZZ!2Op9mD{rb_F5UPcYeBTGHF_I
zaMvdNuQccciA^cHd#YsEwSdq$b3$O`@+V5KDz$dbB>Vs}qn8UH1b$nykg4ZW7y^+Q
zEaug%JcB2M#J1*9X^2%>NO$J510!EEZQbT;e~$yH!$U%VC_G7Bh9L;cV{0z1XoPd$
z%wTH~Qsy_pcIiG>`?0MTpQ8!BIf0(^zoRw6IH3LEz^%r8NZ0ZkyPjT!|3Eo#+{Le^
z`@<cjT?A=a)e%xH@hNH~ticzbno&aX1KP6W@ryWw0bUN`9uit(t+LmFr1u9kmE%>d
zAWoj)%F;FN2$_okV2Ku+&!>nIQ#q7l+Fxs~&v8s@@d*)f#56X~gGgbS{9t-UbRYH0
z)gYu%5I)eFGXfL!6IA*Rs<RiZGpHj}+wem9qEZG_hqcMH*lRY+d!LN#1_3afD2bBZ
zH?V=VF9BWM<D_5vrCWw;a7)-`0rN&2-2(64tO_u75oCV)&OVJ4fWgt^Ks?-O@+GJ;
z&B6Lh5Kh_mshsX%4j{GTtq(@WYtH@VZQRZp!n#o3(AI-r<-r<GW_``PwFs@{9s}J|
z6Einfkmf3`R&-^)ma?p#X@6c~go(=>GOH$gxKUV*rMcKC;g-tf#U_+@CL11yeHhJV
z6aVMFtbg;Y%aq4ihT8=!*T10%HWlpdL}9-MXlL<M)2tVN1w9sRPQCYGWGMrzshx2H
z`><HHFC&2sT$KRteogq;<i2WTz32ewDxRz-<#)5PWn!JM<ws_DNZFZrU-bIG*rI#2
z;NYuUwdkZ(X+oezm&VJPx!eF^T3Y(lk3z_O51!18t(;eFG653OrvmNhZC9CRZAZS#
z_86}Zokwmq*K}WdQreRnTk1tyh`5npf9V`q&+2A}mAjtWuS`!VW9Pn~zA@aOIfw)C
zz6K(>&>w%DHg)$-n>5YZ5_CGn^@(26s;9R2I^wd=g<8+mJE&#tkKyKNlrfK)x;|`w
zR6fD5Zp=448<rFFetl4;ViXhsLxTJjC)|d;AwNmb`bU`C07+v=ycFIt>w)RYuWOT&
zBKboP6(w`Yp`@xWr^u5k0TdjBuAC<7n}V>NX;w?800-G*j-9jh{NsU$AGApXLyP9-
z^06>Z4VIbM*zbm7X*RAa>R-B@emg^u7SnJsNv<tCo8CKbxfCs1&YD_Jy0?%_!iBus
zXwa1XuI?e^by1-hbw7djMp=94`Sh;JNNu8DwmQ-%gzCOu3n^GWhNYg}B$2rh-;|o_
zO6@EC`0J9Qb(-`0ygIqeju@5=+K^4#l!7AUL2yu@nIwJ6_mWNZ6cBwaw((M6<od80
zW02{oE6asRDZN6|TWiI6d7Vp?#zUZ6F5kqSyrT)ldcOEo66#!AoA-$>O}l<H^H--+
z*W>mINP=I6Ny^v(_2S94PCwo-lTJcg<S9o|pS-3P-reQYf^%W+htm!@&_oAMlHU<`
zc5hS_P4mK?SIJh@gYZ!?GGF)B;wK_JJ=GM*gC~_;)`RVhvzmEFXm_fKPqmLH(%S$S
z>}*jQ7|3la7CD;g<97gfFAxCw&+vx%b}bUPMy?dpN^gKjZ?i>^AxZ5bOIQJhj9znG
z-}F8=l*bT=y;?cyyv@zwc25^VgxA?W^hkm=aNwzI5vH==?mNryT;e6fZFgR1-iQcP
z&GlT(Lb;$71tJkQ0v>=f)%tmf`g|)Qor`vwaDuItLaDzzEtoha*56E&8fV;`@LL>Y
zYa#KheW%_K5J+9WQ#Z2EA|MSn3+AJX|J@)6Ka=u0dIG{@-A;o;^XBC9iA;^ZqBVp3
z?289homYXCQtT5GIvG1(3JygP;gwU)vhZ8&V|9_zS1O#Q-#C5v0)W@351DmqR=-z9
zZuua<V4uarf&Qj4MRRx?LJ42bOHUB(cSlr_MWO4h4)J4wSDITMTeS_c5jc@P&xSQ{
zc31BSn`2^{9y_r$3G(|y<QCGxY%bsEmd==hU-(9IK#QLKd4V#XV)Semae9hXrSH^q
zucJyfPUi6rOgVgMzG!J_kVDGE#g#3Hp_pEuDoE>{9<Kt@FyUg@iAEm0gSYwqZpTsS
zFoeDR2;rt=%2_bl;MF;IiYH?_mOj2IU6&<maIXw^T6>0<N!_5&X5cb;yV6_(7UaBZ
z+7B0h)y{$E<}y#{s-^wGcj?2Oc^3Rs$R4-hWtZNTe9rRDYRH*Cn@l%HkWXWCO~}*A
z(>g3XJHKjQj!I*JWBK^^N7zvBfY2U$t>!R-cY_Q$4cOsnMM3K_*)Od^UCm30ESNd#
zoZj8ON@1fMf%kJ8H|!EGcu3!N(cC}V2UmOz81&e%<EY_8^J_D`L3dJSmjkG$Rw~>K
z6MC*srekDmJ6)NeKfYoG(lJ?pew<Y`(~eom+IEhmJ&GXt+1x%4d#!@LT)aVBN!$cV
z8J#QLti@(w!z~E-ja^vD+;xn=4=Hevx?2(y2&0UwBUtskVetNKD{gWNwmnHom};b)
zUGlwf%JTsq&P=?T^_p{yuWvHjv2mHNdFVXm?77z(b!*pUqD-li8%+uV%;HnL=S5_{
z%e=>rn#jv<MQ?AP<d(Q?9TvR^bP?0C^%-d)D^K`PRZsO?K)~elU<=o8>J$R#)Zhtm
zW83#HZd*Eh^wOUBHtx+dYf|CqB0lY>!zrSH%Y@upj&T&H^)M)3h`!1ql@BIZn-9fw
zFo=*GovApOAkKd0K6<%0o^s>06RyqgJY{#4R;R=mb5wLkW|dj|f^6Cteg7yiY4?~i
z$ovZnR%PsVyqSr-*JgE%GEeVWJ;8{}X6+cW(4o{GuVSdubM{UQ10I2Vf5RD!%DDTx
zrXag%7H!QOcyrxS?g0;rAb+P`qB-_uC3MF)&GdH3B)1lT>Z{qNhMThw*mLmOPPKsy
zbMHyrt_J*AZYr5>)&?2{aJvJOP;ow;d~+U-;vo@RH8`Ye{yDMyAUp!OpAR1yY}@J6
zs~OC4W`~`!U1lC2v3OimC~mBHU~eD!x06K~Vi{ol{gDIw_0`R4=*8h(nAg?D*)HAF
zA8H{B9{Sr88GEjq*#^FMZPZj%=J|;Yw_@w}nl|!#BKatWtrJl=V+_{AD_5nu{a<HC
zyu{Ptx6v(Z$;>UXGc*q|`vrhUPs)RqEZ|39N1?B`j*-tA5UckTkbcTTtmXk&<Fy7*
z&r;X<s-srp()!zPcFtG_AXd*=0;du4!XVlB-&i3*OpZo-M=)dGQwoFUwHbsl<Uk?K
zBn`6f<Zm^|{2EU!PVYL7JrZvMVmdO8^B>gVS_s=(A(38mDB(n74%WAMXr6Xej0Nyg
zCmUuQ8@Hh^uA>|KtX5^7%L+EN7Wll+hYvaBzy6Zm%Bc7Ce4MxSr_*9bWAp<}CX*q>
zblgvNjyJZb5A3%bH4_-lc96)FrXLHDp_+xV-<2S*cW>n4B8Lw~2J_VpnbSC&-+Kj@
z?;J&eUQ54>J+zzm+gT^INue5*HP%LAN~}b@$MmO~$KAJGTsJE$t17(UVHPscIG4yD
z(VP}ZiM_oPz?;ediPXg_G=9+CxE1GVv6Ap?3Mlc3C#Hc{E6*#4rmuC$$JQcY%pU(h
z&u3WJ4uCD%jxDaK+Q;4|Q%l+8KsP1&l@w<f^7VD&8rCmQy6)zQx@En!d-m4J2g_CU
z&h)#rW|5YasiU}ZL$WD8-%PrnbE6txChMr!CTN7hA50Uih}TAzTylKgg<K<d#PjNd
zw?;mnUwK0(d+IVq`?&MlPi;y1(sE7&hMFTF59aMcSGo+&?W8FLjk`mkT_Si&bjk6R
zjJit5rbvV@4byzee}-prDE54Slq4A1JVvu8C;SuGp1#aZgXe|S;0Zi+dHn6~H1O7*
zNjnEjrn|0f?MUlwsns&6OJUR=L@iIZ-VMf<gU9B%zsM1mYC-gnpyoxhpgnowN9B#8
zC3v-_AZy$KBI##u3Tb1Rh@Kn|dxgd!60_>yc=8==>MNr@n3R_Tu=Bfa5{PK)KZ|-y
zd}83uE6KDUbpJUEAO?noZ$v_%|7?Oi1UNJ8=E{B+voJyZa1x3>9GCJDGB9A!!hJ=}
z!GTBetl8%7a^d7s=*^ced?#qy9W#sUJI-kUZ7WoZ+e`1(lMxO%mNr5b?Nfytzb~_s
zZz|na{hGWA=x*AIuQljCIv#2Jbh*&!b#iTr$6)IPAvwR^-qnr_yqinZviZdGE^s;}
zL00^I8HY;7zN=(OJl$y*Zfk{E;>uDK>S1Fv)2Qr7nR?F1G>Z;RklE0e@;OJ<!vmbz
zcg_Q5-E?$=Y3GqQJFgj97!qGIc;hbN_T9IQ;&?%?B$!`q-#QF8EtWf5r&?}gy!ut(
z7EVCZ9;I--TS#?!TT!Iz|8VOg1@XW4Z}Ab@UsP>Rvp}j{JuD%rcwltMeGrAbn2n0~
z6O2ZQ;C|mOAGNMp|FF*gv$ihO;Pw##tX<u8NQ&R`Z?99k$7ODL2jh<-FbE<K(mbmV
zHqW}PDK;yX7V(7~PFcmzoqNRJvpk$xzzDSRv_DhvFoRwJV&*2?cYk5quua4xhE!=&
zn56r}j(82=sMDU)Mh2896umJcz4mP3pR)&%4g35!3IpkzSH>%nSDPWFh_pj*1ltL4
zm-nn?`5b_r95zVKXXPO{h>h<Fo`UwFVGL>Ay*{)Pl6`@%UneSl=yl%GXKEodXR*on
zji_I9TzGA7CYcE%VmkgdYwKgr7whB{V`euMUXzg&ryRW`?E12A6aBG+_n?%40>G9-
zQkC2r_>G_>n9lupt|O+n4`JKBn+SaN%Dm4hP|>)j>kkx&(wt67>V52e390Mb?Izmt
zt^|{oqwFjh;X%<KL>p~p`4?rKcOFvD{l|z6vf3%^7)E2;Ui09Kn_6ZIoYmKM_XSFu
zM<dq^ak8MFYogaVd8cH(2$)_kaBgpx^~<W)RpF?`-Zan+EZRR&J)RLK?ty&WA*h<F
z*YCb*Oj2FTCujb3S^Jk)JmUMm_n=FXsF-`kr>&ePw7#c0e%r#&DbaTYN$l#z_V&y}
z-%u7qyrs&@v5)#{)IMu5AqQV@sFmVVc`9>Hcl3>`RV`FN^U@;k>EG9N$!pbC%{KQf
z1N3&Mtwkc8kmZ{X1h+A5*wMqR8R$gYrug%?#y4U}%dazRUBFB@%i+7GJ<a>F%}SX^
z=n9g^>9fB>M724&+9-Qbr2IaTc-l@MHIAaCvi(v?*+FvVz6-=Q$iNg1x?$G$NMLr^
z`JJ*fBZRz9E-(W_(@1ncvr*8K!lLkyi2W8dTq)~zQxG1<zp1I73Wp=eocew@|C-w<
zzq!kY`zu4s?+G!V(RsLWIR^ozp8`BBo&85)$D41d=3Q13?3P!kIr@WE*;qOCzPa?D
z|H7;6YQ68_i0k1S7lMg+x0lQG@<uO(N+j-T<=OuSYf$U*@&j>=NHWNx3ZaG>J%UR;
z0O7x=V!zN`Hj1-<So<;;-|uI00}myxDP78^m;#R!H^%)?35kq5ODpby_;2)FH#B-n
zdqD@w>&)iz{`;Fco)jnabb%d(9Lns8Dg0DprkUT4U3q?4`~Cy__M*r&q><B22rL{G
zKcH7+p4VO|1N|L#zOJ53u2*D@-w2^0*;kY%zNRw4OT4SZ{%C5RxC`lBca9qW!k&o{
zP<=QxS2=JO6)sYLtx6;J(ga_}fExb2c*krSbNo?K_=D{1L%bRCkSuF|)^6m8_nP{J
z5C}=0qjv4r?c4!0iqUl{92R(_$t*=tNYRegn_B1j(k;z_>^~E%ch6Dyppr`dm*@R;
z!X|PsOc^t|Bi@r&_96+cPIpBBXY^}FHlc@SZFh(X<Ma_hFU+1!;7P2tI3$ZAyql`(
z0@Pi$?Pi%K_VzmO(IlsoA5|t0unfwa^hR4Cx+y=pw`e6c{tkWa2mEV!mk0vP0Qd<J
z{NTQfWXpVp@Lysy<@G+AmTX|2#AU3b&jCuB%R9TU$Md}3%y%sI(qPP1bGO+kmhmJy
zJ7xB2u9Df`a8a>OOv=O%xu5ZXREOFJ8x@ceLf5d?<JR{5TJK}T%H;YI-rzOAM@U{d
z)N{T<1zX;Fo(0J?QD#-Ilg!XM$ipE>VBMv}4ECDQBxc=+X39NF9-|M6soQMZwu`e!
z*O#Ii{ngOJ(zrlH&OyV3grnZMF){?h!8-NYlKVe5>_2;VR22rA1JG$_COtkZ8IA{U
zh0Mse7#DB16K%)$(LDF^VxuWFGxvHc3ETctes#h(s2>I~#c95kE?kUdkquZp1?15y
zxTXE>lvn6J?}rR~%n-hL{!XxSfK&U4I@vBGVEb~8lq~uVkMY5-WddO6;QlTN@gCnT
z85U^c`-U7lCHEQccYWW^i?f22XY06;?Q%$0gGm!|=juQF3GV5GU5FXgVx@eW>pag7
z(ta&oaD$KpY=;=>P~@#H4p-~Y2jO%&gd2Ed=fdtn7)CMbyL4ZlDpNrtQjw}u&C??O
z2`m3RvM7iY*9fSA4tsX~#1}%rOQS2gu*bKdOl@?sZKYS&R))K=J-2(kcDHb(FOq~#
zI5;x<ullaJlj9)!H)Ldp`Ffw}PHZz4o?cNj3s1T+9(QEr6%QSA9rz&3Ub1oFqHNB6
z4m56P9GVe7<ua!+Z69zibmxwIKQK9P?|z+?B2-g;=B-x>ugii*>wfgP(&1FM%a2aq
zLmjg_xPNrOJ~T-QB>W1B8{6e^=(%5qzWuzf*QsjvsT};M9%a~{HFzL(1CfSz30@r{
zA=a~!93fzki~rwGn~6)iyMwoK(@cG=GYS}iM?JRU7cKP1-~`i2#w}_PMlimKiQET2
zZmaANxXkJ_y}0yS(YnE+7h7$;#pV*UoHf{k<EuHW?%$R#zxIEKh`{yeVm7$H=r;}i
zenRPf2{4lOY4a^og6`Nq-XH>pkh~TE(Az6A%3e6_s^E|BKRcQd-oNSwS853F5mBTl
zJ4#yT-`3ePxiZBt&t8t^PAUhY2=iaL?+`E+t-}bgA{0t(2`nZ`s?1Q`R$pL(92}_2
z44mp(<dM!<fQH*hLirw-DgqQRZXb^{mpmnz#WHSZHw(f{3p=+%uXoAUHDn#k{Oc*O
zEKzjxn8YHvv|^7?sLbU#rwXNf+8i(cr&}CiLfyh$#_dFK1QGRkIgAq=XaFl%t3hF(
zov(|hMBSW_U>z`EZW+K(WEnu980a#ah-+8<_)@$&pvke|FLa5uj3fSqyL?#ZOnSDC
zzuO^7IxAf6?$;HFsU4AxEFvjkv8O@K&|`<8QD#7xPY7CKd9s)&5C@+WodX^rIgr5o
zlVH~7NGA}TQTzO@S8(aFiL+&-DR~YTqfPib5E--HXWyEUK-W?Q@n{$pfiCNpQ)(i8
z@r2m`6$GR2f3Q)C_U$3di9f*r;_iNwRU3uK)qB(-^$2uRr75U>w_YG%akKdNhs6dg
zJ})AkdC1K+D?DLm{Tp3w8ZtFw$>wTm&-6bl`QJhQEmW+4)U+odsN@HtX+cO%u>+^9
zbAglatODw|Qaf2%`df5-GZ4NpOaZ1xrT`Y{CBQa3Yj$sExyPij7v4&RMM6RN=DN?$
zM{Nzd!RDpo{Q3N31g(Z@65M4#wkoCIuQIiXyW=W(6m4wL`9J)g5X+K<bt%&r8TR-g
z(T^3S7HmOtr!J`GMyT&+{4*hi)_TA_2p86;c5TT+3B{-6v|YRvaq0&VI<DS8<IJCY
z^%x%MfK?MW;N88x^-?SK&9%-#^W~o~%mt(a>wMGpt4RDQyMWKZOc_y=^@o`L_p`fX
z0W83~!UN|iyB`p)o8bvp{7n=N!@^~FQmU;I5qMkWVZ*rySJd)=-T0`Zv61rcve1r=
zM*&I|KYKALp>A|yje6Wuo@ygDHQiPK#Jz7u;GMSmv*}bp7_G!)p8}?*4^fyrFG-DL
zU(-vkZrPgH#>gF!kIWc+&$ztwJl$%rFR@w3+Ed1+FA1)UBT!hl%_TxZYhGxb5&shs
zw^PA<V>=Zyu=cfBZ>i@ItEsI%nRb`{mMUY`*0R5x(FplbU$C2?W%2WO!^UzC^Xwj@
ze=1!h2tCDJDzL0<A3I|oiK6OH+{1&X@4z{<nr&DT6ui9`?Ab?c*RTKWlRaj8lJ>!e
zyfdmU0l`Ken-^npeh!}(<KX)thci0;aRD--0TTA?>d7aW9mfGGcjt_MKHPsMZlYJu
zs{o~Gj2v>$myZ2VkH_5<+&+WR1yCJ^Tu>d_CIQ7RO@1%(a)8tX5sX5XT;psaB>Rdq
z0u)mj%j44TrsyMD4&M;c4K|6vE>vs5q9{rYcRt})26(p$7@@3NWXnUVG(ul0QEd|@
zFD4o<u(a%Tc|9qnNf=M{hMsIh)g$2q|DEvH^Fj^%-S+_m+Z?S6Du3u~q+&LBfU*bo
z8=QTZtPpEg8y9no>iw_!o{alCm!n*oYL-MENFXEQem+Hxkm|#0%h+lqP5alr(Puv_
zW>t_|mS@kGbvxNO6E%(h&{;T;ist9%2>8G5jaT!jz)Ra#&eO7DZ*MiM55tZUhvQDz
zZi_DieKKiB@$FI5B2WZw5{eR%*_?Yc`U1olG=9y3^#6jye+6)8)_cuyW(RZk9TW9!
zgDNaUJRitCZ52?tPzYqMdfbk?yJl!0gl}W-^N;Bl1Ee%9^8tH;sK<+1uw_?-d;yOM
znCJ)a?i&)8H=MKbqK@$n4*t8`Lpj>S1}E;!-+Vn!BhY^kjOmKd$i*hpZ5k)FkV*Z`
zy%%4yf|I_*r*HF{m@r+p-qoim+vQ7I|EAQXzoR2b*peydO0-%9TyI$W2JX8IwAhPU
zK<*5V?Xuu6=Oq^d|Kxs{P}UqdFk%urH(cd(5O{hNg4-;xv=Mu0?C%W<uHu@*oVzm5
z8o)d{>xmo?-lF)g$N#ffw4)P`{Dpc|o5wO>D{~U?z;8&wVN@`oz~Y2m<QXyvh!V+x
z9>~Em1;`8Eq*WXgrxOJUGgR?#UQdw+rEcho*V@GC;B-rMQNRECaNbUO_w(VcUBg)G
zW2Mo1x%f+V%Uj_SznX_$2;b=z%VSC45zDlmnVMJp{@6T@ehE(6^7#=@kTIr`NW9+x
zZ)QIHyRXHt>(=xkis>e`IPQ$`BC6+hLy)m|7v_~aG(Y;C0LE~x@b!knzJ1~LtFEzO
z36p1vGo6KO+qoSqL=6oIH;-tHhFm3k00pF9B8jd4bNOlDv;rm-N|JO?9ZJSrCS<&i
zIXN;Ac>659j)Od`ps{=!>E}m0y*Na)LCcu;EKc?&Hbc-(4Gc}wJ4cYst+h2;rIdj2
z3K!(dy+BwjPXC>Ek^vpZOY_86GD~rP;UC1KXr#wWhb;*f>6s47A-(RIyE6GAI06w`
zo=w<KZJw_0zkg`he%CM!;8AE@{MiLwyk{$g*SXD=oTt})BAj;`<n*E-c>1-0o}reP
z+Bw6^BCgs6;mx?-<eof{U!QXZm!Blo^%1kfCK6f4_V7X5QU4-p|N62?HvnF1k!h}T
z%bVFUAj=E&ZI&W>JFSuu5b0bF#B~A<y^3uYee0$>Vo8EahLQ#mYsW*<Nht4=n~y*X
z&CDp`0|UR}4rx;NIt*_Y954UALnIA|UWBv@_j=@p?DIhSphKOD4nDEWs_5cdRgBU1
z^U&;}z8idnSAN_iv8upfFQpF$q4oQDbS7#Ztnh@B+ZHfpn?m1SAt2?(vaU=mq}hiW
za8#xP_d_)srNj?;z3=t1$L#f+8zCdNXucZ8-4Z4TIYK`2|9Y~&`{{9oy)>W?DYTd7
zaXB4`aIKOLWNs0SYreI!45%&T0uv+<8vRy7WzCg<^>L}EzVdi_2woKs*Cf%{c3f12
z+j<@*^x>zaE*mZG=AO+E+*$Rp_(%IRP^|4)i@Z|x$G(1PXPN%*J=_x~+Ft;GZ6Q)G
z{0U>2=H#r#JW|P2>t+OpQbK5?dfHN3PcyfrD|8&V!En$fjZ~AkK>A*63Vls#eB?^v
zc49pvBV_&QgExzc3G$L+2D}^ld6QHo)0Q^paJ1QcbCw~IJ@sXbQ$kerYiEl}LbI;<
z&K0JAC=r%Z&+Olj^`B7zY696u6Hui&Ea6%Pl$u)yj4(n&XxTVmK9Jh`xW~depvanY
z_I@z0&;|A$N;4iv-?QgE0XMG?NWt9O2Ml!eYp||s>_IQ1n`|(3L{PkVQ)Ad}9RuMz
z1SKu{Ycj`n%;a;EhxeojBgQv|tuyG28|)YTNf6G_K;zPzt>@sfYZAX~_``3jiC+X^
zmUw(ar$3Z^9}`WBy&s-iBBc9<#7;4FDmBhwY-ZluG$0E~gmt^U+adVMFM!_jSol8I
z0EIfdadAb&Ye(P9U0y=sOpfpn-Zkp+<Nx`|_A;t^!3P{Zb>{pcbf}L*Q6>rxngadL
z%H|3xmJx7@NYeK>2)BpOs9@UE5K^vANS=Teo$_+8;^=+NtwPB-Kn~CP7itti>b^(Q
z{q8cT+ok5tM*`|%*`+Y0>k*D7=`B~#<FoRN37B2r(U?8k+xS(s$f^dy+g{dmwXm{}
z=o2!^$%q~Fok-!1q_C`OY+`&q-S1)kvk^sEaPcY6j0;d4BD{+f@JyLn;dP>24^;LL
zyTQ>VAmI(@M#1Bn(U{%q!KsmW7fpq3v23?%3_XJC&5VU9U4?WaXZRnUGx98chRxh2
zx<g&0_=5PxU0kKKt97-98H&7@1LJ}3oDmxgjipA&todg^;{Ox#LLM<syh^6X*SLUM
zfZsY8;fr=&49|W9f(O_-Kbn5A=GOm)8<n;5eE^jl7Z@y-x_>|7rEtkBocmZnQ3p+0
z^Yl8Q^?56~VY#4%-%rn(w|Olx>N`!p^p9)^p$#iiY*M>VC&kD*yzi(d+i?DCf@N%c
z{DkZHg*nESPBcz-@u^VV*Nw0hx?V8^*a11nh&L73lt-mjpX$i>XmFwkn83fa?hdMP
zO?{_@_b>ZB-F~-b!fV6)fKrrmFA$R6dyBpiXK2*Bt}bTS1R~~o=@QW#Og{W3f=pRD
zOSc_iazLkq;{}Ib{0}Ik{hP7RemuMP%b*tUfHJ1H(TepC_X(-{XE~jmVC>Sq%cR$^
zEq8hf_5QY_rdek%9xK_yaF>KBpjUuB>-yPEG`EV=^)WK!Dz_e`CaJC>I<KJ?CrReF
zVrrNO3rOM5$ejGKHX{G2^^Io1AzWh?!E0--cCeBImTg+Q)Ol9|t(Qp9*}<#=Gj0Xs
z*8Y~9<uwmAp*`z&PLLk+B|9sMmHEq@s%h8@JEh(RQ}!n=PP_#Zcwp+Bsb?bN6G~Jo
zbY-lo;B5Ijoe`Cc^x1&;;WgWyBhw|A&D-&2<=qUf{cO4qkjmVJbC@#9`EU2~zj^t2
zJOH`MOO|`J<-#>}->6{BG^^n?EMyI+Q2&tY0vkHfcWFUtJ73W!7)Q2J5a&BA_2^jq
zSYN((f9ZpTrk^iN=i4GUXqsOmqV$Y@6S?0|wmvGomq8TSCNn<>y-MsG4_8i^=1HFY
z2dR6JP*}Twz+_Tp2Uz2jm)1K>Ij>;sw4AF6OhGk4;`04@6H`WMD5eit8fEOa6m-2M
z*E;zeon@zcYu$jSkk~2(CZF<mSHEW9)}Q<yuHfonbfc8iYbztB;p~E9S9##NcYJnT
z#CH7Q!Sb%|^e9*19t(#oo}kGf${*8TsiKsxh4di$B-0LBLHxkP)mN!9SO3H0l$7|X
z`ucxEix9;q*|PW%K_7Ew-6@KK+m^amoNrr)b%18Qb<JIp{IqEn)I!e)SJ3x3Qw)sC
zpceecVKUK3ScJ5MEPQ^FS+rRlf42_@wT>NaRkU(yD=+ar*PQ+382L}Ij>e%qc_W65
zUV*X|1dJJuzW6MSoV=JH#wMMq`4g239+*+zJ&N5a1Jr;f*+p6lAZXUi3?F<G)?ub-
zF3AJC{=H$-z}Q*M@Zz_A4V*P3nXx-nVCg{UQq%{$ixSulZk0r;SQr|pO!`oo)Y)L}
zzagLUVy0c=tOU|hGS;GX)ng6PJaD&om^Q01Jm(2pqJh`e?QH)xq5)`zm>fI#SX<Wr
z7R$=BYvY=tvLrePbJf*9oi%;_dN?3qu=9WIG_SXh!RS-k(N9&H_cat*Z8ES6yt=T1
ziVgc7%1uB`UDVNNwBxK8gm1^h3T_Wr6R62HT(f>pP!mnNg0T~h&6Q^8Kr%$k24&YW
z@g9l70&lktYei1_8hCnJT)Tg%H@Wl{$2j)gI{xF^AX5;g>Mv7feEG29P5`pcdazQq
zUOic+iwvjqZwBKO5C?g6x#7|YX8&hIr$-75X2Ycg2e6I~$Om{_#Sd7=u@vH7*|q8V
z+5J2yx0U(s^?>2G`89fXGi90uSTE(C-v)uD`{pFD*~Tc+uv=Cq$>fSi3b-_a3T(`7
zb8kk}D0}R~Ek>HHYJEmWAxVb#u(#D?LR$g?x`!UB9<!~c<M8_G5l;7)<-G|hp#u5Q
zOr*r<&!NSr^#7GLpSL{j&ZX-SgH?(cgk+CkNu-i-)NdS90NoGN-n}9ZfsceWAPJgM
z|AdL~8FvVwZhjE)m4QW(rI{aj+N!Z|2CVaUK$<Lv%w~bFa|^8q>Znz8u<0-5_^`Fl
z)258w`icfO;x;$Oc0)?y?3-Wdv2E?5Ez!yd$5Q+gLnxnPH+N*iYbQx<Nx-5wcpnTY
zH}30gBV`PGOt!U#&q?DdcUM?uO%kX)fz&xSDD>KiV**H*<^&J*$~(W>7b7wc!ewE8
zcpH)KjYZ`vRz;pOi_lw~>DxRn5fkOUC)`NxiU?D~Yi>@S9?UoU)GN9Ru3QezP>DR0
zT%gPAyHQv1`FQn7A;P@O)D>zr_`Rq!gihi>UeeH%Z=+(3GM_8%Z!qQ8j>+N0Z1KUq
zvSaI?&;HN;g2G-)3^dpAqNv-b3od{a^lL}BCMp6hxdcm^e|^k)^oFJZ57_T(@KUsO
z97LOddEJ%)4+dpG;T_J|yPHps*`5VVsoBo=Zd%XES{(+3Y7{(=8-`Bmqv>hM-CV?V
zZ`jetRHQd+;6T?AUzx{=l9e(Yi`hn`_g|mJr4$TMCB^`#Iy=#ZM$b%n4}VPf)Cq;0
zm2bGTZWL_&z`(oNb>^zRBpsqdRFTXK!AuU!y6mn2M7af|2WLbZq%5tQmp1`@a`)D9
zZPzu>rlsbB-04y3unveRQWl{%{Nrlqjph-oq8pFm&96Ri!1Y&xB9YPAvBq^&H@Trg
zS9T5!Z2SA}9@9cb$i)Y3C;fQ{UdPrnO!GR;VxCvB&^O9lY@;p`T%%bwvvux`HZ<>l
zfcrnGCYXro?YU&@TSNa0rI8812m-1@AGGpNJSh!I7&FCD<b)1=hjGo`50m1EQpg3}
ziNU7Zah^W!bK!@omL8YJ30{L`4ui{c$Gy>HE>8{}mj-%WWNy+`eXE!<-eFc#naW|;
zW_woX_9-97#fW`YsH*b4@62N%>~YtMULncr3HJ9dXw7Kdy-Kw`Y@BY!^R0DJ3if%3
z6&NIa{HvZ;QTW1HZMg!#^LFjy8E6NtS-^nnc8oup04iuI0=yYJW_?9mll4AACj(pW
zCSeti)B{x{{r7=aZ-|{iXoGBys-|u~_GG*sQoPhrhOjNB!|QKeCl`_iO0reKJx;hR
z?fD7oPI_F1<{mxfS0Cb15>5uBEvDwoNVU(iu|v6TGm!Uw!G>hiX*^0*rf>a_q6UVW
zBW$vM-V$jjGjGvV1s0*vDP>V9J~S-)%kC#zmMOS5xrzJ8{@)PvXs{&=VEM+oJp|Ak
zKdQq6J1VPty=3Bk$xlGS4#EPbE-Ygh*C}kZ3)%w=JG?$)3%`#5sTfpMhCOp|H9v(v
z!+Nzslpw>bzWMdb40Xh1>;2-r41_*W>4Oy17^f}DZ!r=+0vn!^vDcR-PlcC9uZ+M-
zwv@Uxj)Qom0~uz6p@<-Qgn@6mWK}BkcDF}Eee>o@y6|oB@UndpK_ksF;of-Zn@jD6
zxqcL#%g9|8<|KlO!Q3IkXo3pD3~Yml_&fgh#cxeRH<tM}He;uD*lQ=>uSOOuH>2{>
zHL>}Olc{(lIAp4Xhbe`b`TNZH=v|W8AEvuB_!ybFTrYg{U!>tcTu;BTH>4sX4@dS{
z4fq)MYiAhS!PSkd2JRi_)Gt0@^Z#~NN-#lH)AQd0XI$*-lh~-oU|H1Tvk*>EX`Yl5
z`YF?fF8@3rvqv+vH-t(9R!tYx;@fT^3ziZz%OZaDHZVe2HJa*jSI1^^Vju*iQ-VVO
z9c7kFCMV~i(qt2~cl5?xZq-JG=nvtcsajm|kd}tFQ~trA^N-+<+bbwq(X%0ELn@7q
z2Xbw*=RX+E(w`Pq3#h$oKRj6ncJpz*Ogd|E$+){2%<4ykxXses<#CwOfvarKa8uZ1
z?`<1cM-KV)Q$CckpB>UARRV)=weMkTG-F62Ur3VTOUTqxZlyCChOoE<86Z~<yB<Y5
z&aRQ*@*3@mXT+gXb~9)S(ym`@u6l~-GH_VdG0D(2lPWWC<>@y87>LJK1s^f)|MYsG
z&tE56QddrLk>c?I4c-rrQzaV5x&~e+N;oHmjI%)ZkWP`X7`WtJOs$)JVNeQ!yp@mh
zCVm197ix#*yW&NYfCQY8YlPW)d(Fu<Arus_t6#~Ot)YDi702u3hF_f|j6=#hxD{HA
zS<hHbOl%8w_UKzg9@Ul41P{1o1$-Y>wNOSC$pEB*2Li8GT3BXv5ck6fZLBkqTYPth
zY8Y`HkQ7BhBek$kix1_<<&GwnUu%BR+v%~&36nd44<p@|Tz=tYJT?zZ5lYzol@AIg
z3RdU`oyg8r95$q8;ud4F8k!?*Scy)2;+cD$t_Eb>o1|3R7Zz@OEzD^%@fW!sfOJ}M
z9o4+ch~Z6p{GLx31LY4ybYTC=_Lw;zroYfM^6dENEYnfQU_{5I#X!rjxuO9d^*B))
z7NXA&UUCV*F`$6g8Q@aMPOt<2b|qYawhNjS3W!)eV<nMlq0JGwLQ7LgrqVIGHLj<e
z7VU)t;_|n$;$?g;ubs+hLq)$j>&dZ?UuFsf*4@N?Pv2)W^0et54VuWb5ycmXn#(8h
zFpdXlI-eeV7yc*z@zNttJTgIHeUX><YF0Q9&qlS^ck<%?%{=2`(F~T4%pvR3f)QWA
z&}kaWwlF@oT%PwTfH$ZW`Tmfz$5i{%P+QQozo0?pJ5I%SM2Z`}M^%Nwyr<s$CGSjp
z@CmQbI2B4yPtm|d__NxBA<`L=OdH>QU!CZ>js+GGmU$EF-K-i>>z^v_E5Ms8ZG3fM
z>O|PD$0hRh|2u8Re|}!1UJ&`hv4=$f%B+z}fo)!C2p;$7gg5!?)TqZ7U4$$B1cXKo
zO+ck-%K(39tEsurD!~2+As^e5?8ps<=uYlFU{dM-G4|F0P4?gWI3XY)Vjv(0C`gY`
zKsuC%oPdI~l+xW@dlXSpKt_kMf#m2RIZ;4BYLwLIn3S~i_nr#s^U?S3_YZD5+3P;9
zbFOop>zvnpQ!f|g7mTD!Fx(SDWuE$yY3X|3v};yBbbz(h=mX_?TShOXCi|#;P~C$z
z&6r>0GZZ~L|20Ol&|9nXc8){U)W+ZXRC9|2)0@|&S@076&3V7Ol3iK91~K>Ot9Own
zh~1n|s^yk85)<{klol~`!H_j|>gz?t1ZE>!$ji;eLVQ|=gKNHb4<z07_D0R5lX;2s
z4hTaJXZ*LfbzC=-$)}`_E2Ljz5LHAyHl^6K!`<0DpJpzIYEz9Fxp9k7Vn*ot!#?M5
z+ZuT({P?5pBcR?}%J$HLv-T3MxF}AQDhVf>hfv(%H?nrMYj;eWe;vx?ywxBRYL#d8
z^#L%c6nxf=Wtt-Fiiibv2!7_qi0u%R{>Wt_o}vB`ADlm>v$`MF%St+goYiJb#s}~C
z?Ov0@={=oECqiD0oBZ?q)>RiN%!T>ecOoA)Bmm%W;RTH$*(vhn#5G1v8SmzUhV|y8
zsZpW3a;~pzLl>JjsRrs0Wz6>;k2@OyU2w;dw@EI3i;P~<e}JBKRbYH$qa87nMwXYU
zNAY~X|190N3b2_VcV0N1Q<*Vs0g#?m8~s+L{2pAjz8O0FkYNd7sKx0qwF*>TXAPt*
zGN}!>?<I$rwtjEz;TS(n{2BS6SI>PfCgmF5Xp8!Iq9n7y@~u}C=gDodj+hSAw_>I#
zPKKzUwB*4XHVuga5j^$@#4Gw1SZ7PxguJ+MQh*-w<R4uZ;2r<SWD1-3u$<Tg>ga2h
zVgU`o)ofL;u})XQCy||a-R9PTA|qnbOk22ZE*x4|%fD9m_Cweiwtkkf+byB6I<YW?
zXKf72Hj-8E5c?0$c22o}^xMuHuHt^GTpy?pxxU7D|Lp9swjcv}$bQG3q=}OceCOpb
zmCjTt+t4MOx0+AOoAjPGd~50;$xoehSasM$amJsQVgi~O(0cBHKlZu-v}eNg170HD
zA!>*#9%RasWf5Bk7RAzyg$W1&r<_zanf>VnQPaynW5AoDP<$k?-@`I+7itwafF<^|
zEGHIuuDa>_O+35IQrKw!kH)Fe#{5-#(@fzip|0iveu9>FXJT3eQb$T0Byxndcq+tW
zefi#feBnf+76+%De7o}8e=G|0_D4O5TH)q*LUS0IMk5?*M$MCAg8!^<ALaZxYQSOK
z$RBQArGF$(hA8O5lh)6~-^|i?ubb_&3RL?DNS^dIfs}Os-7kjeyB{L8tOCi~`|((@
zP=-U*Jp84I;|lbw4ueA&o!^Y^5^!jO^yT~M&8@OjVq15<4}ME-9XoUh<6PN*1hJ=i
z!|KIxroTi<aDK9@JOh3*jq>{1?26;dYT~1aI$Go};XU`Q`!d6~OfrhVcZcb3PW6jX
ze1UJ%jqzDn7EcC9>Jv(oDHBq`=?k5Mh&{~#oo<hm;!D!3AWDfvXkg8ELqBoB<$J}P
zuwuRVil%caMlxpv$PTQLTd`4MK56^U)M}G%dV=rQ7?Bz5>*!lWZ%rQ0N;Mf8zgqMp
z&Z8(ndI5t{yNL6MAAO<URif{t+lU5`9)$gZ1i**+h8fMN>&r9Nfy+Q^C@n+~wF;**
zo&n^M!;cw}0f2heYJI(CU{P0|@Z<JtwxPo+KwbL2(89OXrb!fq-ODTacMF=v67i+X
z0=usWD1InMlz;1Ay~mFPc{>`e8fZPXT0=lJ>5hR><bX((JnPdd9q1`uzNf}62oIn8
z^kpK(YNeydou6%qKNuRYd8I6_2lA3*_5(=r({uZQ82<+mY`fL_8ES1^rOQM)Y*x;@
zLtD3t5^Wv)Qsgnlir*_}nZpY@=N+CV0^?N(I$<fU^3_IGb+6Hi+fv0I;XMtMl|B#C
zS?%+zp2P1CAs<NBM^sbOPIR26DQBAcp?=~KMSvma_{h(A8!rsU+<Mto^S<w*x<6y!
z#*!RR{<XgQK_&hV^TJtujt%G0=cjhkn<WQ0GMX1V){`FyCRhwCX3NL;rLJ4XcTGjq
zo)dhtzkutFrJ+6((TjsnIqvF#(>sOWo`0z+d$vh3c@ej3JDfcR_AXx6xbljCO6O<%
zciNm#O_Lh*5R94iu4{AVtT$Yno(lTW%>9EY=iG1zk;$j~u+vmc>$$y*lGwn(W~B1V
zW;?S^R}2~Or@H-R6QG3HDM8^>0ag;>z7JuPY}(9ax6P}g54Z%Rc#Gal`oS$czLo`m
z2oe`^{dW!Ccxa9(=O}CAPcKb)CjK0j2LQDW^^ZUcjhkZv@V$7!vVTYv-~1D>pJNZO
zHZkk^J$6O7{y~wfk4ESBHj_U@$ekroqLU7JD-L5d%JoiBpB6kG#c(Ohcekr{t-5bS
zdp{<&c}w8q?F82qSKWWF;bfU#g{4Vt<69Dg?)L)AUhc5A2fFR~Z+n{keJS2GIOS$P
z?zu27k}lVM@2ekNTTJnN_xSaDAUZdr<VzZCucHjtY`2fWr1w`aSto{E^Y`pbfRMyT
zKPP*mT@A#Hn7~}?>rlO)>Fm#5qY8<Tx;mVK=w5DjyF3y&Z5ZRbEB-H=$DRoSzZ@PT
zo{A;KLcJP!aKiRNm^SVZK;~s;wz?Km0sIEuLmi{J{IacqD!(07=!dF&QM)^yE(ROh
z#rJG!^|d%-9)BZ4I2`NrAyQYJxo`vZI3dH=1kXQs3@FC;O`*&Z1JKq7D}PCm-^~VD
zIt$>*mIzy_S>rvG3C9VP1&yVt=S{I^yH4MVd>;r!S7@!Sn1|-_M!wVTU1P;wnKb8B
z;_1D+47|NDKBmiaRz+2l2F?NEML_?%*RCHFq_wCo7k^_m=TVTh(+{SC+kkWrvfIg0
zgERnJ*W)G9DYhHEQSoS$y@sWFpcpT==5*SRgI<5|_jFVgnImaEOYEc*Zru?YkSst0
z$`sk2U>k9yTMut0fH8lX+V`GUG}ue85+>H0XiR(}5v1LKo14w2dKKg{f!xJ-$5q=O
zC4yDb+oc(P3TfB9{TFZK27UhjuoBhWoJIG@wwT|+{Tsj-f|g~({$i@NVAFw^ClRgy
z;VU!;#Q1!?)_HY_#Ux%yorytj+1)O~Jqdsik$nD0s1erz-7Q?5=iTlth#dlJ-yXf1
zk|`6?`F*+v2QpxS^Yxsy{G}sNZ8aspFpaP{XOCg`gK_xfkyZ~vugzA;Xvx3(I^aX5
z07|hAUFuT|>cU$VOnfKyLL6t&Ici9XIl|5jMV@3-+@_@}D(g?Q-P@<$YTC$6crR(C
zR;T>>%ygY!=HiVyr%2uO=GxE5S<Y%MM^;UEZlLr1OKQ8B!{x!U!xr{w!bhF$z>~?w
z!OG8s_<i3N>L3xZ%dR}Z9|VaxH9S`Bp8GX0K0oN@N7L-cx{t{oS^SUNdiLa82kd)p
zNvAxSS47`61?jt(-}N#`J3(KZN34OS`D-EnhpdU)4x<Boi9R&@S7Uvp0j{D%qt>6m
z`=sKf-<N}!<)cwA$N@XQQwT`TGJ4skW>)=iv)G^e2|t|w-334p2<CQwvFkudXAJ9_
zN?ize7!G=QLGaDwqR*3y6pk6)rQbSe>C&Wm5jXw`FB)Ev*uKz=aJK{a2#5Qk*e<I3
z2ZS6h0QJB-P4(O_+Fx_+hAe**ABZ5&RycAGKN)E5lL*edc?qPQQP6(em>ox(K>Lp-
z<sVTb9W{Qk5O3bGG{Sh4q=xuwBrvs=O-E6fp+!iQ86&wOy;np0Db*gQ@|lrUX6mX~
z__zQvHsDP>&o3e*3f-H@%Q>%(4oq1BH8sk)d3U`rO3x>)W+)4|o3CMe&7+&+ZEn4@
zgsdjEx@I;&v;SzD=6ndw=YI@@BQ+7^r)SDHxQEP9E#iq)(#N0pg+u*($v_IkX@qOU
zgN4iiy0ID!A(X5ki-10y{)-Kw*$+Y`7(9Bg`H=2IRu!(YmI*lb0raw@$~?_3+-2F<
zlGaq?bo?2HWKw32NX$U@#Ad|^@uz38<cw(Dx^sGRz72`FnO9F0C;iY!AX1#qb-hKW
zr;N1Owy2H5jNaj6M`?ug`YdL53ja9c|07AARs(oQ@N6#@)B1qoOTJFbNKM{-!aYA2
zKE_mVU<IaNlz@n;&(~6M-%VQ+8p!BFbmNh;sFbHg0&!N$SI9O<E7Z`urK{<#ch43A
z6TTc1DN(ENd4go5=33`Dp~U^P)|$SFukVA{NAXWxn;k?dyqUytgKx%7U;f^G6X^{*
z8N{|FrEl@!)T`^YQiIg<>W}|74rKsn-Pbk8;A~4Mhzd{ZGbNFa?D{~t3Dn<wAN%px
zX;s|oU<!}Ii8;XRRr{te{B}j;LYK#Z%=f0D%qWeHLET$Hd0^=bDo>T~uX-ZrpXqf@
zz%L8c2D~jEXx28u9pHLc6s7>wMa5=E8f5w?>FwmJ;IoNSLx#&nt{{~1=J+vz{R1eu
zssQ#iaQW<t81<p}GI(0Weaj(0fiR3e(KY~`aW9I^N^r>N1jU!;4(sB5`qX|rAE3vH
z>(~K9*q=vIJFOuqLPKp3j;(|10*0F7K$O1mS#M7r+2mZSxbGd~*XvNny~Rj&Ur7_o
zl|F26NzLU~`yQUW9VTx0uh*l=Q#$*z9)!y;{-mjW=hGeEhWKpLND~*DRc{NAF$0Zf
zkFAW7a5kSls4JoW{Eu2>Cly6kbxqLwa>--WUGYmstPUc?#_<fM8h3ZQFuT=#qZ-?j
zM9uxur<N04R_tUTOcPHUf2A(9eEeN4r*s&GXLh{!yxRkN@y`Swh1$>PIW}cF7)S8V
zY0Qf6yXUi28TIN54$K<(z4$kCei4+5Pu9)jGwkq)Z8!OwL$Vi&+48G6t@^2?N=tFM
zH&52s!LDE)Q`|KF>e2x`Yb*v)J8U$LC~lrShQsv3lgZZdk8csa*qOjf^BDz3PP#A3
z8Oix|vPK+p{XZ-$kU)XCvnv=9Ipw>*8P8@JQ7gIe8mKr=^AB4_cT@orY#%>rCrE<R
zzmd;ZGUR$JeuPeoA605OKQI)uZMie^Up#+j!L&NEbnt8HoPwc!V+%rHTD$7Tr_>;%
zl8KfTElpZ^>B!}@Blg?LM?`jcEl|(6n!zkY;){>%>234s!LifrkA;|dUG_Zg)$Qfi
zC*cm4{Gna~{?_px0g-~6Amu>}YVLYlTIz(y{5M+)qAdfv<OxqHvTx5dJ(7oB1|t{z
zv*=VdMVWx<Q9p$-FJ!L}2~gK`0CmkSU8UpmepZsEAq_P6%*;|(mG|k2%DD}^g)slJ
zsqyLNenXQiyF?-OU#5=NjUdlr<hPk)d=KZ}-AF?4b8nwKZZ-dLt;N8mx9DB!%~+9-
zk!;v>CA>kbQeIcZv~LXu-0@x|z6I?O2C)>14br-mt{ZQJtZ<aev^4*lnYlIUVdq<C
zz_!(8ohaKq>%5$GRjMI;;Pk18_6Xjmzs)PLHD=f;bW2Ou?aAd5E&XrMVZnh;)az-{
znd@2jPfmB6dVKw08%}i^7PBZlsG#OW;n;LbWz&{Mr3)|X=lbplxUkSoPVHv9pMQm$
z>C%_ilP$4Zm8sd@UQOdws{KqRVAldnt#n>)ucLjqTJ+O115Vy9y-uh_f~;(=o=Qpq
zEn>=hTu7>%|EP=~RUOY+fU`4~qFLr?Pf$1lH$WY5$@@Y#7PBUZs&y@%4BnlyLg4eJ
zX5)WG^Y}oM?0-y@bb6fF3}$1IlRv4+mqh5a<Ohc~*FU;$Xr~(LN}PwmxDzikg+a{g
zTueqTzVXnYrD%MP51h&Bz0bUROuuCyS0Upo{2IoA&f-^9w})%!@_f_GAUs{%Gcu_w
z!z_QA9scR8AIPXonO5(5ob+7*W`0&qsceQB>A%Lu=w;*A(pm+k0r08ZImccggjxm$
z>fH12?Gu20send|zUPL27QVSC1;8L7QXGJMg%*)NNPcbeY_)A&u+q1J`xNRqe`n_X
zWs=toHAw-W01Rtc@!SMwH(RrMs7WA0eChe!Zzr5_JRfz1;5>Wy5^z8P$4q=b9QZ2!
zsR4ErVsjxf3c38Qz)eQFRgGx{QN_DLD9t#lK%Br6ev{DG9wP64|9N{_yLz&{j7H^t
zw)o0hpFDXe^Zh)f;c!AA@iU$AWE}E!v8*~9<0E_Crs>nGNBdViO1IV*!qc3@x{V%<
z6i)?3z3ER0A!{M$BjR#=^TxU|tjVs<*~9F|5pH1JUj-n~+)Mdxpd6?rNfOq$vhufv
z@~Djs7IN+;3bdU5U({2^A0L>S7@dCYb69T~C=}3%mzJ6v#ma}~*<Me%V3;DPPU!1Z
zOZ>Hgi~?<Dr1upk<E<Ou`^gppbZQi9;+l!a?UiN7uRgzgP1^8<?@QVJ?Xu<Q1me(d
zN1>+naW^IT;hlrz$@U6v5`=wvoG7xz$cbZ7;>e!A0<BCFK{9^s!?X-WhLaRun2=V1
z0W~Uw!^=V_gtm2H{!PM#*WGw5EkY<-pj#II3ZJ)#j#EOB1jnYN&*>l>PP#kRWo3BK
z7z@3Lo3JDP#2{@@W1Tu|`5~a>!c0qNc~8zl+mQ|nC^EeKm<Z>fn^1nk=f#dP^I?5+
zoy(%-gA6l)$@T%yw~J2kqo{&qo6Mg6IP3aH;NtiZeyNKb5aM`yp#i&&TCNUNz+&bQ
zZ3DlabA)aiRT)}9`w8v>z9iw8bJ2v%Y)C(;-FG8Hu|Ue$dOqQe$F)vXkn|e97mHwZ
zbM<!huUZl?_(d$wD#a=up6dOI`^92k5aJej!S#G0dledpgU-$?yBjEdepRWR`kz<!
z2iwl+1cJ}FFLY0L^%@c`sARPUGn6o-k<z70r95=M=xYMhsN}$t7?YMQIg?4p$JE7$
z6y*vHM8=)U!QfJos_Def85e8<3IvQ;OUXQ@?fS#Ow5Gzxc}L_F<XNb~03d1y^6n3J
zZ5cHyI3LZ#9J|6VtJ=u`<cUgb?m;*!-Pk(7l%sVBOJU!qz6Z_;qJn|h(a$S%#xebu
z8lHJ^0OV+r04t0q?it&yuQ2qW`|atb?K>&EL`zXNwnY>!Tg|7m7xkk=uh2R10B5df
z)tr8tXt`92>m|@6OikXXps8sO{3c7|LjNZg{qk}y1h=+VJwbk%s$F=9Ec!8EF~Vs8
zC|v}GuKvv=XP0O((DJ7E@^*lZ^OU$Op)Y$M-fU(PsK!K4Vf%put^~OYndKHw#h+j2
zA9P<Qeh_Im+?6a5eNK}P3_K-`Pme7?m9>m4pdmBYbtbn0HLrg2XBrx_BtjGEFeIDq
z6L6wF8z=uSy-5ZlOQaL%_PX(|L@mZDu&l#6&_kD^kar|737ECjXtW6o1=><_MZ{kd
z`tj;0=r|Qm(oL00cH+?k6Fc_6XvqyV!rQNV@Z59Q#HE-=0uS}`@EHO1G>o54^a{|(
zavUZV45kK79p~f@*e4pSPOAXq@)rB{hlk?Dtx^z$soKP0;9S(`3m+b$yGpc<+;<oa
zY_k%6b6=JVw6g%&B6$)2AedHkal=epHgvsq$pelQ6)$}Bw@tw9hvQS3t}Bt-h4d%b
z=qSdtEsKfn92Nn6%mR5rSg@aa(mMK4uw@`&8=Y}LCF^3!BGU_dRF-rGzLbCwZUCu@
ziD?5kKVcczYv?l7FEI?XH3c)UdlK`Hh&|ui1o9*ap^_S{0+IR@{Oy*G@AFCN#_)m4
zr)?$r-qF1P<Pi9J-N0!Ky0J^S_{6YBQlmGBIIBZoGR-%dp6}X^%BKSz56(MevbVTy
zA#BPHQoW?3N7Wp+`TTSQ-40l70tp1JGSaOszcW07Lk`EQ1Vn~Xp{9`T%m`PxBHu&Z
z1BI!i73-RSF6%?DO_i5ZiiS^{j#B#{5J#r&3?N%k`Xy*|emEBhwByXlZZGM;q6qSW
zg;4FTtu^tIRYGs}7k${Ytpd3%ECcBsI`JNcW$t5qF+(T(7lp2u4Bl&tGTMg=2%`4E
zLT_f9sU+G`vhZvDtK`Y>x_LYfZ+Ez<vT(oW>BDQln##u@aXe?FNf2lkCra${1rkpK
zBOg`wVtDXcUH(a%(_2Ok%slty&gzf2QR9StyYDaAr)!+%<3C=KyE>fqw)n-m_G2rX
zf=#c+0o-kiz6V~VlJUfguA+ks&yhjtO!EA}v>R*tpS(O{K0<r_UvwO2=Rbug3E=b_
z#D}WDLu3L!ZsG~bWS*IRla6mlL1(O>%p#y&W^GC;o6&5a8%9iPCulsMK_GOqdrf$2
zz>97ymt1I|-^V7FxS~**uqUq@IN^oH8x0jg6#$)&Zg}Ft$|959X}Ymh8^P6GzhHU)
zG-TVbdxMXV`UlN|sDF#h=+eto0slihzgV)=jkjEyxQRgTrb#=LZ@c364eo6@d(+za
zJjB<s^NrlS``jmFb+rGCfLHq)E$9idjCVRQ0Zf}m^j-$Cbmloc*<gpMp?ABZ+<!B)
zc3_I>A>T2q{(?$JFE8WBUfBC<gVeV&z<QRZN$uAvX=@jpxC_kzqU<%Bg24CR^aH4K
zi2__crm8<P9lzGOicQ=<zZ-8<HxEAq-5hZ^NkH+%0T_I=NFdh13gIS>m(sk}52-DS
zgS2JQ6rOX8Dw~JF9}nnfZ2~>idhrqn3IYt%qY|d_*h)M`2c(ROHUJ;2=TQN4M28*O
z>VdkMvO?E5FW+*b)Y@#;Pia0b7tHl4%vJU4I*xBU1MK@8FGSi;S~Jbzt%dQ!w$OvX
zD0q>+zFCUb<!SXmiSM@$IQEcV(gp72C9XS;bLXXiVkbfklIYrMI3x<J?tED0{LtIZ
zsq&A}@d<EZ4WJ9?RLh3^!sYTLAOV5jcu^%&8_Ihh)-2T_F-RIt<+`87)=ggR0L6W#
zq{%Y_|G=x3t($up0?0JG>CKy4n+oFlhNE=&i1%)m&w%H@bY2h>Nk>PVlDp=ulItQ(
z4xaYbHwcw{p7kY1kWS;_QglM6?}hfwnwfIXmq!_8{Q=~xlkUB-Koj>u6}dOU#nOU_
zT0l2;fuH2oFFMoFp8l4KxPbkAxV}5Kk&Z7VsLw(KiUl|hZC;d`7V-k>6845bM%T4U
z$y!@sQo);q61zSsn=lJCv}qdCVn&meRk4MV+I=A+THG;<%}Di5y#Dr1Q-!#ZteZQZ
zH+BRT`wdz4u~2O6AtkO3V3({Mv<gJKwk~aCuma=fn1eYrFeTftqFL|T`rHTFv}f2C
zTrasH<-;?t5PTly<64T6o{iQEdgI)d2h_Ss;zd6XTdoJ9-YaGJDuBfik)~F2jGhLm
z(f<S8cb>>N=+)$?J*}e?MEL+sX$xz?kpLT>&cY3Mi1Z1H&UpKQs%yY@3iaXD>1RhJ
z1Oc~H1j?*9eG13KF4IVdL(2_;MBC*zn!vi-8B@fSVn(<%jLW1f52SrO2rLxb(!36_
znl>!YG%rJ0Z16w6B06>M#%t50_f@b@2~L>zPMrSi<?N}KSreYEnZJzh7jQDPFnlJ`
zY|7@ufqCyPp+wAC{%|7voQUG5bm*yavnwxI!uYH7O3N{$YltQA&Cy!DJ6nrGBXw)O
zdYW#kBRMX0+VUfPMs6@+vCwrzbIx||#gQ|*$_lYa0n;dIL#rsp5ff4(YjIGlk#L{9
z`+VT?W~@bx<7hI+A$ff(SDFTxC3wPN*XLojg2Ly)G@dlwM-B<1laI!m&)dtb3&6Kt
z#HJ}w5}dpohzI=OS0{kOPcpq(Tbb)U3`}>eYbwcN**#dW>v|xa-P25sf8vY`-k*Lz
z@bSqCbN2MdmwMC<J<EI(SgAhl<;hi|bn9aZiBtJqE4z~yL0P+8?Fb=A-$!++T#uR|
zW`;KXZBI|GXt_hH9J!r$(j|(>2I#~dOR-{8gkH#ji@8q7WXxhL^wuR`j9Lwx1LZyb
z94=yGje#|Lg&f2zrbu?HCvSA$06R(aZ84_NrxuRAtI}bpz5kW2M>*i8$m^=j(|$S}
z*P>5<=C(O3T7PpWw=HP#*+RR9PXa%m*WSHNX<J0{$F&Ed6v;t~d?Bjq?^aUF#&-p1
z%!?dg7WErv|9r{#CmLU&o*hWf6n*u#_HU<LDy8bK_D_)WcFcv(Sy1Ai|5VhiZhzOi
zzK17DPOtJ4^LDk8QM3MCoih7;x$U)#yaKNFr1fIt3!}sP7M#n79*u0g+<Rl1-XpDc
zP;`{*(l&JB>%A>v8e;~llH||1He1!TrF_0m(Wwk-S=&ZmSY<ohUJ|DU3avY~tur>4
z?*`Y~ie+SFum^Kg=d{xG2|a=HSu`$JTa+OZtOU;f*tz&Xf;B&#e>;Q!{i5QQZ~?WI
zzXV>`Emrrr?k%<0ys(?NbnjV@twr!dx+9<X<`}>IV3WbuiVa{|Xwi4-ru6Gkzf=;8
z$|c(L0e1;J%w(m~8F?6mIj0-4f;P0uuG8<=sXza85QKKfLn`Jnhw_kGa5}`8if-Ic
zfv)ypdE^eqc2yE>IUCj790qTLo!n=gz)trUYofOjFk4<8B37!h<M+`~Z5JmVXhhHC
zu}7u0Y|7`G>~RLwZMN2{hL>~h)VFllLFr&TgPKy!yaCe_a^o(;36e!EP1~L`C97+5
zH*qfex&Y<4|3+&>*G2e5`<=_51NQUEy3wIE>*&gNO5M4<14DgDpSBDR^LC?|-lJIu
z1_o9fZ_5Jz(^@l7y2EB8#>DWuqg4CjANU6ByH}t476=f%v$VMZsYW!q)^bJevv-ZE
zH0aX=d?qhICu%ml!A>6;2@up@N4Iof@BFB-&$PhP^|@#p@FD&lA3X?;d39#~ibIsS
zDEOM;pg31+Hc}R&?tkEF!MS76R!Y^RK&c-xsWBqn14MqRRJZ!2$ljW)$r~!pu@(U<
z6A5<}^t`II<Uemz`>>G(X?N9{Zk-9zhI_fgg?+zLLsvMR3gy1XD}?S$1xM0ZkBXM_
zUSeV|R99o0E`GDipPV~Bo<IUWw<T?tCl`d4wl7vR+M@|wFM7|c(ylSB<W!YN!y##6
zomi5*<2rg#x1}QhRHC^26ol6GA_t$F;n@ofaNrCufTzZpaykgC%29?MgyKRuZdL12
z+Gh;7Lnoxf;&wSzQ!j@5LUjt(uZ~`FGSYOMD;O4Wh1Y%GPg`ylre0y>zH(-=4838y
zK8s}%&*u54X~dP*+cVeF(9avgyS3bvb4Q0(@5NyY-?xp)VCMg7G<>Q*%_d{jyn>U!
zg5!%Xn#S!t`k`1lOTpfQrs7J=a>~(2*8Vr!cfW@{%98ZU?6zgIi_!P!kXH?q9#v?E
za<hE%_d7iJ2NDRVtKD!gyRqiB%h#<Uxk}Mu#nBP-D0>4d()nl8D&CFyITE7w!b>O2
zThuX?rksOvUfdBz5~iFHYUNx1B44Dry)2>Hrx67^`7XR>1-*&Onv)*DklR5g^4@N7
z$~X1_&*)Y&8>wAQrr8lKXx3z8T8v^GWgLj<G~CqR43Gp#>SVWtV5-JvweQivnU27h
zRuV#lO`iMbP}yp0CM#p9D~ZuV{*Ta%^ZmO3IzPca+Ez9vRV!@KpI%&Vrp7`(st@mp
zC&=j~2xIHtP-lxe#3Hn@{!$JN8w2$E1i|$z&+`tjIov$x?O9!z8aq@YfKPf8y1IsN
zguXf~9suBpJ^?NwHGmN=2X?eQ%`WObCgh+s87{)RnDTi)b)DL3E0PDU4HnKTq-uI-
zx9r)Iuo<KzeW7N|`$@d7!F?Kj`1S$w=vc7GOZiIR6bDusuI|tm>&nQNb}vR6VdwtU
z@7l=7nHo_}rYw@<E4PER7L)tRf`rV%i_XQC(M-y2pIfyXh^pY@qR%8e`q+Q{+W23f
zND#Ws<Qs>4Z1!%|eRVM#Q_k(cUWXam-TZKiJ{QP*yAg~?#g01TsM^8-X17_<?W)~L
z;C91a@PBLRnZaF$txSN^=8J>Slr``<rfQG-&<9Fw=&$;tCY%fxhUi(qI^IiUz~_)2
zdLfg_K1PZ>>PDE55)7Sg$YfFZmgk;p-(>x6@pX`q)GoL$g+}=C3a63_wQ`X)LQc|3
zjS70{<B~y%mHjlfZFPCz%fT1w*q&S|GW}yLzVnbEya1y=-aFh-;`dO`%Q@15ggN!G
z=0NZfzW<-genpyW=@#_N+(~}f!^$-3V1c*wqJg!Jp)aK5R4csV0|%Kee>s&}t+#8i
z+|F0MP}f-lYAfs1SN!uH{ra$gPo|t06#GGqT;e21?y&9|51Kt-lzD%OGj#pHLl5TQ
z+s16L076Geph^04V6uQ)d)IB700=&q8zZ+9K3cBGWMC6@;19fUcqX^g_FmmPcD?r_
z^T>(<;6x+2Jm@iY?+X@T;WZ;mfk=Y(YB|CoZ9D3`_?X_+fIK7By7PYG<)Z;4{k30n
z>MPOuuluLUvxi_8yM`Qmw%$T8Fu|5%R`OaBvW+yN8rU+bTK47gjnu&81!8taVL?tB
z+x*9A9>;TjOK0bArxzS5UH99(eGV{mPrOVP2(1s*hb`NcP{kw*bu+hB)t@`~cP^M3
zJ6(NYc7#s%*3CWu%kv}}Jgt@;)I+HP-k*a;w<h&GN8Xt$MKMdRz~sc-d(^$%IFj`&
zq7FPC;k-rL+qo|s3_@>t0Kvi=Z$}zr7Q=&<a^P$PgD=^)C9M-%a5gp#VwlX0)T`nc
z->f;gHB6Treto<3zWdhUP;mnmcBQe^=<Z1B{y;$9?wm5VVknm>pmGmWvRCC0a9C9M
z!hAhyO8TGa$@Om+7*>E{Nr1<{bBltDm{UTYaeN8%KjYRo&LoldhFyH2m8d{hNZ5{S
zAK6q#i#&DB@PN!4V0VzH3u7H?Q5(~rzndxS30^>L<P)*2f4-rMu_w{jHD6Cjn|<qz
zGO4#KH|iYBsIigfLf0#h)dwB}?4j#J^E~J(z!~;2t2>BRZX4NjU25!*KF_#|p+ywi
zw2Ig#^C(7)IVTySUA+HXs%5MW3^8Jnecw~l8_5*HgRU7ZR*Vx1?THb_l2n1@#(TpO
zsIim|NX4`PV@`woRrg{L`cXbLHts>_Jw4`?0SnM{koH{+S!9tRaHpiflG;O3GWg`%
zp`gU`LX~f8G6H--`%%08@$&Hc+{)Ly1(Za(F9E5!j%k1UI^hwa9lJ+FEl-p3g%suk
zl~}AasL;zZea#9}+mK2Q{y!e`!-4pvPFQ<s=B`|km^0X0ZllI_2<wHgZ-JijJW`bk
zdZ&)x$8`7%?)jiAbqItuYj-B2nRt;%9nTBa32}TRw^Jvq1FONDgPvNa#yVSYl4aF{
z4Yj!?uh|?*g|2HjkP_)&&ZU}8fzSjNQJGU+R!FO2#Q>K$xt(DHYHZg>=CWDNB>_+D
z`|etk@R~8*GS(U;o%${YrJed+hGDiUbEDaJ=*ZmmaQfk~t}bb*xw?ge@pfOv-LgDi
zsqVd`%@L}~Btd<eh9iH$IZYXt<INl3jqILcQ|o&)<%>&}eKh58cM$gJPW@ZRBfdsS
zUeHRuOY{@;W&I=5{Wt4CNZDoZ`}hN<jM_oyvkshfUn2pRKOk|}t3Url9o!XAT*C{L
z1(EK0ApjMcR}SKVKP;aFr*pn26%0bU29;0^MR~{3h=tx8jOg)GR@+PD05dK2LGp&Q
z`|cNBli21u)xQyR9i+WoZ5`>lJv(7h?|V9nvTeUT+Hp%q{|OgLY^bI`;4;;}Ex~-p
z(8*6sG~-agap<47(Jr~&Vf1LUerZ9bhe1phAIgEE%X@e2zqoL;%T6bNf1l~tNfXwG
z)qS!Q-ENSs>SFf0(b3!?J#bOQD~4JQ8k5o{LU1xH6E(S%eLw)>en9t%=aY64n6kXV
z$n9_rUYoZZfeCJ5mlAciTSJ^wlV8(r(~-#VO=di@qH*3ykRl0Lp&%gmOlk#A!(+S3
z5~mtlsve~N2d{p<eaCZ0dYC48u74MS)P=3JmxWc!`(Z-h#D@p-(Ehrs!yh{0#V-fS
zC>d`-v)0(zUPO>M&{#*<xIXexSP=nLcgHF6%^y_Ny(wRnFn_aEeo!sB9rEPJ-NT$9
zL|w@X{~TIdb?jsb_z+N2{582gAP17`$FxI`Mbv2thp31R-H=5@BXlAYa9B7kldnH~
zGC~ThQ_1gO5`^%hw$cRd0not+plzy*2Nm@tmHI>AMs?_=!u%fUt27PawP?$Y-N**%
z+wSt}9^N0A_H?%Q>Csk6dhSXVX2Mm-H>*xKtopVbND<qaTBg>x%Xxy!`{Jo>anZ}`
zBUU=lgyReO9qxfD_a>3*dZZUSNAs8JMK;|Rht@;tRc=9Y)~Qxe1<yd}DG7MVC+0wu
zC$(pNP<<{3)%CKC_gYVdDpHs_?)ib*bcEV^gY5E4H%)E41738P!=LDQJ9=+s_9fji
zdF?xy(|A=n(&%`3*Le6x)SfNu77e(S++$Me$dh`FalCIWnd<|;Jvf`@q@@$^rb=h;
zxdGZF^vLx8S@G>l_y=|4TmgMdPkGQJtM|5NhED^9$t`1~PfkE~Vv4Mq#rWIM3<#X0
z<YfnK3g+E_tV^AXM}-GfZ2?ExyLDGx*HFIYUntZ(0lC&XpWN55)AA{@)d@`{gxhFz
zoC1|G#amgLzs1245fBHNkzeN%<aS~yb)R5gSvV%-Q|iFL$<wEJ--FM}7=I%Ara)|)
zjohIznwxzAuui%H@6nY7r^OIh!Ia$AFjMSktyEH}Q;RsvN^b#E$`RHyJ=^){=)%Ly
zAlp`l>W?Y&0&dG*FkVD!IO#mTv)mPKWtb-g9R;~4{1<JGTvEgi;JB(uwsSO>R;aNk
z4d}~`Y;y;TuQrgE2M^K{UB-a&jlUE%DV`QVOjfve$D&6Mzi-@A{}BCX;7$nzP&~Qi
z3okwYe1FZZEsnzPbj?wip>(J#uGa`!hpc%#r`V9HH9~fH1VJ#b1C;7_@@fyWU&^N3
zX8McG0#Oy`)A4NLrO>%7q3hX0JnVIrJFKCl7IAQq05j7lOl3WeKB=oQHZLO;jc)Ux
z4IR3;7X^^|u+Ze`SbO&~GU>!8PB$p{8eNq{++aGeF31yVxBa4g)nj*|;+m~&YvfwZ
z-95OZpXEMbFqc|-1C+6vFN}Sh`?QhZF|s6+R=@3tpi{Sck_PP!+H(KE@N8#dLWOa}
zNn(BtU9iA8nxf>;124G7)i{8wc#mB7w^y{I1b};V&sL#yVWTRbSiVZu5=F0+zATuM
zS4^!hLatJ%_@2L?)zoISQe6>-+0Q)#$W(Elm(oQq+RXvaF1!3?a!Tef2GJK>&!sJu
zA^eqmvhkj^>xQ9;_+Y+(_w}J!c>6lGJk7Vz_l8?-myn;B1U`2yuf92$Qr%xO;-4dx
zDvS~$_##o<s%XZn&7{h#ZL9Ze`6GRsN@1+BEWVbn)nNS7{}K8p>MKu{T9f&%*n>S^
zX<}f~Ut!Zd<OpRKyBBhU&xNg0OCAR}r3hoMIdG=50o2m^=GY2|$~eWCnS8758Qln}
z>g&}zKGi%`hbPt&pu9U@HcV7E?3KlYvR9`n7>@japNs%Jx!sIXe=Ljf+ijQpeEZ3z
zSLc5-=r2s&_!0o0n$wgX^QXa1-AT+qtyt+pQLNYxDyKsDi<D3`EnEpi&5<t@h(CAR
zZducLO~;K2DQ~ZomM}r6C)opOM(fwwFn}O*Lilxa5W#xF#<N$eTgXxAq)iwath@Uh
ztQuGM?{U)k?`A_*F5|Qv>4s)6Zu=cs2cU@cK!B;U-Gbqs8@F{WPhP%5PEhk1{6fjc
zLn39PqD4UOOk49pvwOx!WwAwfTBCaKCSqv!?dm`e>xKY5=8DRe@Ks^6Um3=vrm?r6
zXl^K+C{%6FPu~VzM>X#0Hs*KbH%@*Od;l&~YXuVI^?QeX?b>JsHQ4z()X42XkW~BZ
zL-Q}bRV<~W`mn}oC1H7Jj6skSQ0+cJsSU4Lsj)F^r4)3;jyg3qVheb~(&E|`KToc%
zSZ|do=#N0pe|)IC+8Ew%A5C{3eB+&_kNYs?(f-i-LExb?xt!S&k&D<=7<axz`^lr7
z{qHhSU(dY`3eFGX>x!vPGOBB1h97n_GuW0WhRy)x)9#hfQr{p}D$}VV_AVV5`=&mu
zN<DNPhNKeS{|5sTZC76}0r+JGd!e}a8MmQDZqO{!(MZR;vyrdFNWP9iwcO6%*eqV*
zQf+x`M;5DY`2^JOr8I@nc%8=2_<ltS3ZHFgD_U=0-NJr4<8ZJWyy@km?xZ%DG_ZA>
z!pSc=zyWxGvFl$=9Om~;y}f9-1TbsqLPZ-bJ(wR5!dJ?uhGJ9@ybfrF0OKfs>7wKX
z%`lcg86hI~DneZ(!nX;>vEM*e&L?Yclf4r_=RaC%&=G?!)UE~7zZ0ri=zk?@mqSjZ
z+dWCYudg*dQb22ZY~*+m0+)<|+AC4!ya}m&qS-v+m$u}*;S6B0SGA6%Hoa=T9$W?&
zxEARrwVw?9RjQ)3AOPJIQD&NBwuoY^?O@&?6a(vsIVQ*@CL&jx+gzYuV0pdNR^3P*
zD#KWC*CUz5lM+(%+2^T^d7jRghTE^t`iP7`C5*ej=#6aUz}3M5NztH>7}^8zUfPuN
zDYMEownzFegp}hsS5yqNv|YpH-u6B+(<i3AbXA1q7hnB7Z%@hwvKxo5&U%)YC<au%
zRlf~%jV79PELxe{oS_pEn*g}G6GiMq<aK~a1`R)Y-<RsRN+n!_{yZTzU^LveMPnAO
zJ9m0FV_<O7E@|25ocUOAYQwteKu(hETaKSxEZxXB8*tVrV06gs@#0vy-yx$L9L<N#
zCu7w=c9Q=S=}ugz^M%&ykMiT>xFlBqCQi4X9#2ice&*A8=@hr%mD*lE1TeH-GlgE*
ztGE))R>fwNJ(umeq-gJBP-UoUhkeVGH*|4EBfIQsYjlzU{O-=Nou085a`?y&PQiA*
z!_VQ=#4I}Xw)ZvvfvufinA=gf9V205FdmW4<fO}#0YF+Q$QIm5R4dlU>L0nh($n!S
z90?HK?Y~MD(ol1^W5xM{Ql;%=b)AN1<+}yj2<~45qWISY8vC%kb|rFQ!{pSt-O9yg
z%=^^Ga(_>4K)(r>+IFTto+G1L)v`UT>ro#z9q|TVllr<hF1G2#z0?Yj)q6(9$kdj!
zKuv5r`72Rfo;*Sy5e#%1OsN%b39hv5<;lN5jQuIXb)qk>^(NnYVyUtPpnJmKC7O>o
z@_d`g1c_7}@$%`_8o44jA3d!(`KW^_bCMB<y8MX*k;}ef#fQB2tjYK1r0{dKn?{Nh
z#iEz)uPJI-LDj*P5|s_wj20)3Y#KjD0FLoe(zvfX-c{v?o`$8C3Tx_ZOUVB*x#M=A
zF;j*|!q?2Ug(7joElO@Dv<lQP)&^(?2MwCk*g-kytAL6vUPtJ7EmyxR9z^4;C!1va
zdZT)kc;d$Ps`hQAdUc_fKC1JgPO{sMz1|U5DcSWO>U$UZIL*dRBkfep9THuFdKUN$
zg!<zmyN?;)<xak_X!40C|0I?bH}r~Q$3D7`(gP)@t&_qa&1wG^QrN2V*w3G`uknT#
z|Io6JuJPD!&d|<(+d+2E>=H$$aGqGcL5|RM=m%zlX;5qj&M1wkD0`2~;$1NaY7iNi
zQY_p4t^ur&A-@mEoduUFb_J$OESsTB_lr!@rGFJ0WyZ<~z*579SMwYkFfSt(${xCx
zN{%GCX1>G&^G*k^I5QpBA^%wwq=*Z3?y=^<)Dp&)^e}k$erV~Ka_Bk}PE>0HsXL@{
zsYMI{S-tS&LZ1dZ2&G3GM386lrhC3h(Fl-0pTse)3~fpGXl>JVcbl#7=uSKRvUP^s
zg*|Qk&!Ei1wE>Zp=>FO#EPrNgfSyyt_bsKrL!U2HMvj!UGVia*gFUbJDTlh6U|<2r
zK4w3m5=90tWyCj@#!=|A1j%dFP93<__H2PvC!;swAMtfZDl*BDm4#0$tGa@fB{6GK
z94hYQ6NLm<e!|cg6;XkkivjR3P!o->cg48^x`0NAnosrmtsUp#q}X4#{69IA{{~Q5
z<gCt?UvU^$(%u62r4k4&6w1?Jn(EWYysv2;g(*QZrwl!a^7jvy+wrxI8myKBkd0Oj
zQc&8&3Ef=w<Xu$aRx^FeA&o!pcn-?w8avw`&8zRUR?ENif-BdT@fVOrT!{b7Cg-MF
z>AS&Na813T+(3dfm-#f|Utk<X3-sS}TbT4}Kxh>pED!)UUq<TP8gB$tnpqEBm`8s3
zWA?5o6|rPtEYl5Ln30}MRJ)NSu0s%IxDMs26SWUUouK(df7s)updJ}(1fQlYnJkIf
z=^Tx|-TXOvb>-~xGoOr9$C!6ZJ}yjoxxPrKsVD#{GbYE;ERv1-4kl9hLIW|bng6SK
zL#q$OLhq+pH(V@4@|a-X`hZ@he~y={T;mPBmxg?>Y5)-S1l{`cMXOYkIAZw$cp%T|
zul7dUQsi1~&2$FSD^ZN1ISP|^X0uc09CucmT9*69^olQ*z&Tf94+d9Fe(`zLyT(e_
z`K&^?)AsfOoVoC%$`7HT{y%Z%iOl4u_?h<!(*gj+5HESWNDX$X?$B3Vy30-uxKj>i
z;gDo(%4<BEmucsx54+rYlQo?Xn}zFRgm9mL>^Z*0vI=^w>gi3#sdq6uXm!4yLKLL?
zLdC}vCgJz!f!q$~O}}1nlHq0zmtIq2`@N|3cRriD1-C1;Tzc=F!jY~&Z#aJ=Aot{~
zlDK#3!$1ybq`TfNNN)`_R-p>Sp4Jmo!h;^dm_%V}>(7-(;Y4xvy3-by15sZ{7chIS
zJ$7-S)4`QF7l2t&{s%@<zM8q){23e|$+^0!HF1x}NtG`hBqVhiM<&*aZ07{#egc{Z
z1B8`nDd1cm@<v$>3s1nbdtH+o>r1>=02K=1e`jtg-<aE*eN4(X`H^e1dKwg3I*|{(
zqf^$RPNjoGK%)+Tz1~wByW{(QOU@*_Q#ROe&GZraRAQfyH~i(2)hJQz748i!2cM6N
z&XIB^&*&MdcN@4{mi3o2Vck+ZD&PVGCj$|_7dBl#lgE>rV`9Qa<y7NH=qZE1UhgI#
z#Rgs&b+fsC3ORZ1&!z>!KI7^gY2~fy@(U*lbZw(<V637J9z2q(<SAFAGSm#rpoiS0
z@?`dATa4gV8>0~C4Wrk{zETjWkWqnk1WGPx^@bagH3!AhFHssmfyib8{+ZKhsUdsK
zTI0$TPVb(Nk}z8OT)~n~b~FV?V+0pEf3EvKuNr};%jjqxe8N<<FxOa=$t9*DADoL^
zDpX|BwTa?QwTUXIKr{P2sFAB=FH?lGQiO6B0-n<g2t$%FI2Rp%D_YQ{>7&)KKiF+&
z!z6O=5+%k)DNYV)`p8H0ClwIUfHdp6+^6AnHuF~Mq1CR{(+Rm_VD%fO|7T^^`as#k
z6(EJ{tHv!CDe{%+!+3Dmr~*P$w1U+AFfgWNKoBnPV%{GH+BQoPV3B|~ayu#FAa$SA
zAE<M|vwmN2@fla$c91`C`zl}pw7J$ZGNy9RWkt;*t^*-uZzy4<uj73O978|x8#8;N
zVBBoC7Bf?^T2{{P6aBx5ZW#|C_;LhDsT3<B|Dnce01z2mu2-*4<->y>9W{%Rs%>X(
zd-I8TV@72Mb0c&e`<_{HmS?;khv80;eU~)15v%DTP|qBdahZaC8Eb(A!peU|QuwO6
z!41xYOPW%VKigXnf%7_vAHj(eE&ZOh;adi(pCi20Sz|jM9)smiTJZf<`nUAb)*)Bp
z1!J#fmB4S+hc&tD0AXDMrAv|MzGeI`scUJ_YVPYjl@4ngrwv0fcFES>s@0XKCDBU`
zL(YfMEglJ}ypi_yzZHV0FDLIu15LbtHvPuKuGH}@JiWE8Nqcy+T6!rYcsBJu(SI1h
z@!NE7g7Ds|3m$aJV}XF`(*ZK6YGC$T<EF+2Ra0!V9Mrn3vDL6Upm((ZZ7l`9I!}8O
zXv@&(R5-Y%`u1jLWu(E>6*gat2AFN{=6##=V4qRh>UXHXmh&^lrpX=&B{*0j2i4>s
zD(Q@Ny1QMS?jK#+dd<Dgho<j&@l)shy=Vc*ru=sV+}Q3p2PLcSR+s4dY}#G1M+W97
z^bTgV)z*b2F$0-o)>PZ%5)QBV^}v*jygZWePds+-Y>TJn<O+X!RP;H<D$!8t_AiS$
zNeXfU7_s8$r;C6vSfMbV0#s1<^pZFd#NLY-{1r3GfU!+LlZYen&}$(_d|sm_%dsz_
zQY-d0vW?>PfO4rpaXjiGDd<ba+KIeP(;+B{*6G{(-O~;)S@jDp(BBHUR0|N9pQR8a
z=j8qK9V&^{J)R0W&-r63hrE-cC;HSB_}FTq*7F8e7l{%Rdg`pJMPelXN~`qF;b=ix
zqo@E6I<Xfzp`^7O0YZDVs{;cP;(%XkL3R?x5zWk2hR!c-KGwrys(Y+-map|RRA$VU
zSZH89E48H=Uwz5p%?X{CJzW3VIM=4+9|Ss#JywnZ(=zi91g&!O8N8W5mo_U!+GA)u
zf)>Zj!Drps(BdN-i4vaAAO3ZPiGcJ75SQkF+7b2XBxhhzuX9lF`Sz{x1meWf*PI|m
zjT(Jq7G(ov@|e5E@P+2ydDs)vMza#F<gtIA{DdR<d&XARrL96_@%y6h1jfdW<^UW-
z=TN-+PT+<AMeFY1hzL+tt!#W`-fsy8y>smnsQB1N0z$L4I}_7Pz)xOw93&v8jA#c&
zt|#;Lm(u7nlUgW*uvqNs1>{1jLN%Z_hdh-!y1;ZU0`P>F7k=M}c-<AVX{4`h-@#t9
zj)9e$d55}P2z@n=uHw&p@E;U-jZ2hIur8t#JZMPP_qE*40^tyd0utm8^hA2AT-2O7
z#Nw5k_6pua+A&rvv$rmfo*RwF*pBFK)Be&w?mb^2A}Va}XhOOpBw*l&Q)Wk#u(5sE
zDs|w|Pd9V5wV8P$X0x%ecLDwOzlHY?6tGqS;N4{Bp8~*Xvh0z`d#|u@Aatq$F#Z-B
zCFea)7V275qA2NgwQiKz?^>CnPmLU+v}4#liq{#cT?yzRkh4@e7;m^k1gtN?rzb&~
zT!`J;nNWD0aOmvtrFKT=(Pmg}9^WFvt1s5YX<4nZ0T0~`UElh#h@9mOKjuD;dIVtm
zd4P5FFZ=$(pt}u5?n>?x>5viW$cFt-ov`Z+PS)<B5eZ$ddInzw@*OoNqI>6sAg2S6
zt;!UU^3WCpU-)pUFjlxvAJ$=Cs+hJL&AQGGMGpXiit$w<f7YD@Q){}VGy7qgQ2Qyc
zI7^*fBr<c0uX)*J>yC$w5;VVJfDBRmQXczrQ)zi1aQj*ZRTzRBaII_47Cn|!#JUT6
zYF<>Y`puk~?Fu@+wExx^l>{3ZKD*C6w)il1q-v!tN$VtOrODpono4<hsjXx&Ft2vz
zMc{Aak>BO&$-C*oqOabKhcom(oYz<TLY$S=`bvL~RMq(8F_0VIJ5ivNVzWx7hO`cg
zeW5m<unva_L;Z8&dg%XWWY7bV0Spv<s(hzDyq@_%{ZiVJ;iK<0PcBnCsNtFq0K*3-
zPAgDWk@)F&^M4-rRFk*2ucFM089z|9c6W&e8~if42=FRyeom{9MNYUjV|9-ZJm-^M
z%mKTJrUc}8t^f5y$lVJ#c-V_s(;8k6#(Y~s97r8dpjSR86a$ROSc3dg1k#*xoljC^
zYA87sWaQTRj4@njDtGStC;;v^zAi_hO;ckZXxip~6G}xs{%r#SQ5@ez?y(&+pf6a5
z#4zup^l?+1ZY|7yzWH*4%RK52F)(_T*5-nGTr*V;Qu~@kG*wAqI6`Ptg{$rnHIy+g
z?A@^vdq<W4JmM?V6wAK;X=K+=n-=K%Wve<GPIL4B`?po_M-zmzcsUgq@P3&`>cZ9o
z^k9$dsI9tuL)RUoO{1WQLdDtXD6v&VwFRW22b>zK<iNSK_N-dTCHdP>=ZVHh)LA>=
z=0nOgR<jHYOnd=GDhJi&10`^!=HZKLe!p!>8LQ)6--ypOt_OJuIco4(borMW!)r|i
z;Un#22*#nY$pS^aoNa5aC;y}CV)l*S<KO@Pr_2JqD$B_7H$2TKanJI-lu{T_{Q-Kh
zdS+`&6ZWTu`dS47N49muF;{yuzf)4slWJP@kF@2y<?}#GLtRSsXUhJM8Bud53r~#C
zRoFz8C{m7EMETpLu!0yWJ~FESgS=BSqF_(;KI_m@><~2>q6I#8(|C_`eS7IOcg*Rg
z(V;hPF4J4WBT&Xuv%l04jZm3dDid|D_WW<5+%g&VUpjIe$YBw9YA8B}E$t=m?}F;8
zGSYjEO07{$pV|sqLN!|iQ1kN&W0s}w0{8%t$GpjcF-85LmVQ64VRj)pbB-*)jk?ib
zTlD*Q%(w69DM0l)TWPQf<=ceCD#PI<U;%$Ek{HZC-eP`tYYe$~40Px${??Yg%r6Ga
zN!qv$t9kc(>%!3I5ggTGoEO<I^Dhw%#^etW7OW20;#4IkZCa0U#C&nf?6sesc_)dW
zO*aF%JXugVUtG$m+0d8x<!WAt$f>^>AOq;l^l_xSfPsWKR(fIr^P0VStgzOjsjeVz
zuLav&JU5f~n;iX6_D5~r;pS7xj#cK46!`h|$7cQhb$1MKjyAvFGP3Cvy8di~8XH}$
z)V<24(lRtZx>H@_EDq!{<5T3oS!ySn_y9Fm5x?z+(+;q=Wu^k=V2v2z%+TWB3W!E1
zN?q8f&1qWn(6KEBu)uOyWy-7HEb!kYFhCkV4?n4TapH6{bB!Nd&bwY;IpZC2F)9e?
z=S|*BmmJUnNq&W|di0zdsaT%5@KQS0=<agsubvHvA2gqBdo9olDkY|)qmm^%>E4qQ
zlCE+i+2^oba=Sg#u=!FMl?HwQVE^;=M{hN$akw~kxXb`ut<Qyr@~FQM#-in@gAUJ+
zIPWYHScvZTuaE)IcLf6ES8zShKbCc2`gCsQLNLOjzCRtxCpkyr=KS<HcKBzApu<wq
zRuOQ-;+~<i?^BPGk>$G<%J_X}1pnHT(Zr#1&M<n*T_+@|mZw<p&^Jxax?Lj-r(B%Z
z9*ZB1l}J<^aT+>^W90I9?V*9TN;?gXlkfjrd8d$*sW$q{FM2mE4M91TiAFMwG1gP>
z0IIi)NpjV#lceP4{Y5taFH=buBiQJDL*c<m`qcoKM_&gsmEZBye2S}oSs|O2xl3xw
z3e!W_%JEHM<P-nVIl7o%b&4=<NbalF`x?~#hORG9aq|S&bmZHEy=4y}b8=1R)~BU@
z3BuoZq8CT3_0WV{x~0H?YN;wUZiMF!ucOFF`2l2qdLNz&1?Cw__!x?QswcD#1Z@S^
z_^K<dH3P0_!M7{`hRhl(kNxYk0Ie0^?(%w`og1Uo_3Ew%y(7?xcpTXus~C+Le6(6Q
zLDMI>TjU(M0~=5)hm-6_FFb+$GOdFLSz#>{OpW`b{y26pZrM|@NdSwRZq*cf=6}Rj
z|4+s}i<<>_Zy8+s1<^>2?fby&#}3R*>phGr2S(^v9nJ#c7aMSUni!Cz2v6-&t(iEi
z^3GEs&V)djYO|wX3e>9imuYrpZd*U4f6%~X!)NC`u=NDfp|G|6@a_OV!(ZK+1q7hS
zNkYony#O^HG%$bsVBIDOgkFuM#wLFo{zXq!r|k62=2uAJ$cOuh!S3KbX~);AzH4xj
z<=kI8pg_x0L`~qn-_?An3W?O{oCj4v8fpH2gk5<&l>7HDB}LJO%GM>6ZM0ajjTXC*
zeVtL+jeTFnq>>OqmdKWU8T)R=QXym;jAd-u$JiOm*nW??_onW>-`^kgN~J!}dCupY
z_j#Z9Ib+8;=L8OP{}}O~&36^1{TMU0-DKdrP$UcJXmcI42JlTWT_L4v!+-`C=qlu2
zX~)MoZoTiq$60SRQZ$=FrWg#ufYMSwo=5rDn?6j&(%!Okp0g#WO=+fcdA;8hSVo7h
zw%N@TWvut;j=ahZ4*u^(_cvNI#-V1r;;<9jAGrb?yIGqXTfKJ1=!KgJ2=UzehrDN~
zihNTjN<W7$i1O+UtfwkwDyJr^EoA({C6B2kT(e%3Pl0xe?i4N3?KzO#ifNWK;meBs
zVX{!Gog7hSxdoySN7$$TwhI3Z01O7cwyevlVlR-jX;fRfbqol#WRyh>dZh!^R}f>?
z>7~P=ygj@OFE1tj-9D8xWIdsuRi1MV^(|(hoWr6unF1>o?Ig`*iEvqum^NeO?WC=`
z9VV}E@qbkM+564Y?DyT(+U6OxXn>9t$(N--r;t{@1p&nwiOit!X2)Tchzq9_PD@;`
z+$K7i!?FB2(et_MLg6$AdN_ch3R|Hkz)L;M$rl&e4b-UE{QR35S9Ua<b45pw1>3*=
zI~x8KX8&|le*OI-F>>4-GIiZ1#Oo1Ks$%18fa4lZz%7S}6i-~JkonB5fG%~Lkjb`U
z*=h%0Dfeq_=1TsZKoOySw14pVQbf@*GR;J*l-Cak#Z^+Kb-!>yhL2CYpGY%t%2-Xr
z4Ns<4G%f6eh_YyD@mM)j>H{mA4t559yF0ncPuY({<8cJ9ZiOLMv{kv3`~1abLY00W
z6~i)zz~r^>YI6&f1o5GgiQVu=kG-ZFAGAfV7~yLZtqxGT$HCD0fKEB3CQZb!(-lmx
zp!h`>-?@>s^4Psl#=vB$Mura@pa*i%g%!1&cK>N8pi~w+Oekt>oh3)um!_Z=Eoy1$
z@TFLtyN5M~{5^KoL=;>fYr;Y%MSxKfk-bK7I{XblPz*pz@BVYsN)X33Q9Kaa0DbyN
zYHO#52KsyE8FhfRp8cX&(?RLu5weW4xA9M0(4z0?eBwDx1vlE&u+;T)cDZ2^EO1d#
zx{{wn%5TMm_qQ`ZfOBP3-sA!gJi4Y7yv#AT3$#qW9t&7aB>*~lZh$%h4HyG6Yo;N9
zC_Z@}B?m0dQbh#;kvHAId(Abg<VyIIkd)SJnR{5*h-rBtd}0M4zyAJzYK~4CO>Ml)
zGe9q-y2@LK-z!DjUVRlLgBk1Ve7N48*8ZPR&Gr)DXR*hyw(m)0rRTNWhbi|p>Xlya
zQXH>{zX$aBfVdV~-p2YM#yVsdhsc+mbMs&MVLV|*n?d`%fu@zFNamB|kp~}NNoHhc
zNpJh@;xj7KI3y{1I~;Bft=cB;X>WC}%3fFqbpJ&1nV9PM+3ozz{cSDwj}_hdD%Q!#
z=vwvmL;#?&4~mT#Eh`d|qQkcU-NGD{T(EHjMh7UH6C#efhu-U`RTr6bVGp*-7AuU{
zs9&9I`g9O9^CwHHF-BZgOo19K<+xTbEWbr{g1oTAE11Zzg^DrrUoHC6w)?*l&Ijml
z&b5nc9nG;RWwU^M92n&|C1Hd66oE0@Tw?^rk`U9p-^)`KQL?$lk_~VE*nOH|$7mD9
zVL6n_{qfx5rNWgDRKK#xr-f;~Egwe;x|`S4S|-Uo(#zP%70@e4%2@UEyox{dbHL#@
z^E|D=pHO+~<PI_QT%!dVg2q*4eXQtlU}&tnRh1DOE@bXua*gC+6jHTB3kZuN_W7K2
zDeq4mSe^CUT);fA&00O{+=I#M5Lp`A;<xd+#xCvXDpz?;dO$lS(=~Bk4v~?|E;BIl
zFz7EG!{1)_myMKw!8dH<i<fR<yLs%}EW<z>t@;ydi{9gqswmy$zE)OVUD!2_r79Yo
z<h`OHRs#<J!v0}X=caZ42e<%aMMKlJ{V~MJ`+w~MU{;VxYA`7=37_f=<q5D}`n9n)
z;&`;xKULbvZ?!oREgn0=Rc*hOC_Y0uWfh2c>umB%p!@5Pe=<%$-}<~E8vEpPu>Mf;
zsrX`9C%3TvWxBW*;b~FzECawendl-QpZY2tu=;7AbaKfqk?j#y=-3zKzOJfy1P5*Q
z0DY?q>(b&4%~Ln8RXQ#Wa3^yhLMnxMe*ul&H_y0QmV9|&k700nXQ`;3zeC*3<&+s)
zq@nPX!A^0<)&E!Me_Hbt@V2B?y>EN3c`@^xJeb_vLRp3PDN$W*b#!=}y?$17Eiguy
zijo7D_MdIV8U7j0&Hy#;j1kbdA3;<XuXtZS_cu5~ok*4+PY$+r)e_?kz4T}7YK5(%
zl}Mh;bJJ$MnExc?l=ktFA6(kgv8qtk#X3(`NGf6(x66_*)nD1hDh~W82IgH5s0P;9
zF`kLseKD=Ha>Ac7jM?7p#S^L5#ep4~=N9>2{QboIiewH0x8?4YVWVkng3^q_K_sF8
zuKSEa@DKgee`cSW`}9C4eZ5O9Jgfr<EbFIZ4Ev8Op7v$~fEHF5=qFavyA0{=rpAQz
z#Z10Pq2R<9te!Ieg?%{w`bEvwK1a*v`OsNOH9mYe5i}QuTBWz#y)AyW-XjTO{%WNO
zD)(R2K7amx8vru}SEs{s4%siqc-;i%bF+(NdEKa#%S76Mep#TD{EG83!R{E~GjWnt
zt0PR|3jk69f5AKthaZUB7cP3&Xa7;cejGs@soY+brNw#e!TQ7Gp=&fL4gUigE$RY*
z5kj~QlNoN|!BqBW-~HBgKEM&LAK+LA;MO)(1v{J!TFSkrO*@(5Mu4NR4MqV3Op^c;
zi<T=KJIlJj5J_4Y^FE$+Cxe@(ExZOGg4b4}Z$DSt6ZaDR(ks8-H-MJRp;mru^);Dx
zT|ef*a%S&tPvTY*5T^9|Q-=<gH@}uF$5+40->c89xhq^nk$?ia4S?25C|aSdB|e~<
zE~dlhss*fa1CwPH%bM>IEsp_J;5(q4)4RSalp#fG==w_rX#08YRaH!{DlZcuXi!fg
zICY#6;kB_)W7N`?^XeB4M=~Lr%+B8Oh5)s);hI`fMIO_D$OpbOS(1W_jlI$0<%ZwT
z%wJ#D90lrI%P8<R&Cav$;i(RM0U&4rW_hTzU-dP@2r1?68Y5-#HKu3+P&n*p2CR+%
zKhP|C$t$LM{)FwuJz$vcyB_t#xB^Xm_ySm5rwWiIJ{q{Dq4{g$KmDbo!VUH;g^%YV
z^oHlNPiLuGq8*em;V<Kr2Wbjh|J$JF48F7DOe(r08;`7%vT2)~LRlyZ(5`4c4iOxF
z5`xVxIX@M-<~4VY4j=JFb%wX3M<l=zCj(@LCxCA0JAX<^_E5S1oKFhHnD08ctbBO~
zXmsY?pgtf4N#e*;?MV&$IGC$Dr~gFL?GggLOW2j=LC*$B!w)u>f31x<GJ&}DX|*vj
zf_e6fo?VAB5Ke<A7Iv^!5w&Q{C=fSxYsI;NfkBV-ma~-L<?}m$xg?24gN%lI*!;TA
z(V5)#L~U0EHoj&uMW{rN=E&xpILL|Zr?B8Pbf>b5sv7nJ_qF&(dd|Z00m)F@wy9i9
z<nAB4N{&2XL0ft1iRXUmZ+=YxQT+S3x>IJS3>Yw<Wd#PzK}7v&o|L4=9)5tHIDD<u
zK4Q6J3YCJ22a2Q%Za~h|9Awd|YFnMI$ChRvAA4v3Q$QVToCAjn7<Euz_!JhSHF#e6
z1WtZV8*_%}Q(G<i*0>>5<xTOVa_94Rs{-rdebqA4YiR3qvK;>VKgs)FjacvV-H6wn
zff?z15=yQGnCP$d2-}|+HoEO*`6dNLY}&5~GbyipL%6(i>_xatxy+|~PI=;Me^Xs{
zh1Rrf3VCqXR5&(l+TC+pe#c}lv&?=1TsjVQy!C#M;IJoV?SP>Zk9K-#1-;Z%i~=Ne
zzax$BF}tR0AHlwmXsSTPZjU<spuvwbDj>$IScYU>)|Tr_xSNI_-V?D>Yvwc0tkzZd
z@C)g+|K@;52Yt4<S3b)YvU4wyX4u(ML?A*}P^m>}!Y}7L38IECv=o%_ja`j#**5mP
z$_SoiQLgyt^q+^)rZ;0G*gW0bpGq!fZD$7R(lGD7?T4G$sf3rOY^p%3rl<ZmsIdn4
zl4i1$yV>hA(hsQ5!95DD6mW{zRGC`Zi-lGd?Ec{*79O;v)HC%_(r4|5Hj}hU0%eH5
zcW`r*>xz=CE}BMKTL@R(sJK%6q`&w4wA{UN11_hO!I^**rQujEn5d_guE>1A@{*^y
zJVWVNE5G%W6Bi>+5bwMx$T+q095Q@fh*Y62!aWB%3X@AUBKqB8i>A|gWGJjO=jwio
zJu4vD-RHjoD*su)bpu~ml$j_GecU?}tHU)sPHuCJWzX3pgnIlM3y8RZ3J=;J&@rPd
z;>2Mk^1xqfxG!2T^WyCh3YVl`GBVB3t{#xgs8o^xs{_8qO>f^pw!gja-Xm_~&*6b*
zS(S9=fmH*GY{1N<N}T)hT?P#^aI06SlPPAw)P*o%+lq0livG9+GD*D54Fj!AZN0p5
zvi;@xX$|NHMUj~x88}2_RP+Y!<&#JjJ;Vj@S*QDGY&=t2L-75SrJ&s<?;aK=*PSyi
z1H_j-Oa`}Y2Oo<S6PSm4!*vn|xqZ~Ec6`*(R_IO7TM7ktv-1K!<zk{2!+Vwbl<()a
zfWgB)$UxI?J))l?tppa`aSC&h{MI;PiD>(Wykdf046PSyz(*{EMR%~Nf@jrC_r~ok
zDqP`gqg6hw*@KpuFk93k`AhPJ{$pkqMh%+1#$(P9ulFyW<&<6B_%3e{#m}bKbqD+#
zS;OB!K0`Xzf8STQa~+{<5#nWyoKVblp3EICY-0r^)i<k|wE$sZ6B*OeIsDke^h$Xz
z%eiWX(MBzzm;EhvF;>H>c}Tw7!wssZ>DSD2Ja7N0Tm;(BKt)kD@nawrfY+{QvVDcl
zBF~kb*x^tnh2*UsaJ18XRcOEIp4qo#q_x^r4?B#Wrar9zXJR{dKtV=^qqhl%=7pO>
z{fQf|N;&T^wFk&u)#*IEcNcSRX#DWrV~Yl5DbSaoct|IB*WS6y7n2nxVaJ1dA^41}
z)A_|mK2C$^7k6ugVGE)l>sQN91p2Y@!-)69b>F*r>3pj+E@N#g-{>aXf5eB^Rr|BA
zyQVcKf51jzHDDnCSgkgnNUF~lFxb~i0uyXC-&RA+^v+LC_{QN@$18A=HphLmbkw@u
zQy-Wj0{-S}A@3E;ML+TD)8J`%G+jwn(gc27k>u$Z51vW(b(vSHxQj@^mWsTkcf%hR
zPi+N;%2t9^aYOF7P$N~>4$)CUthilP9WYBKp|+h2oL0y*5Tet2F$K#&Y?|N8?gtE+
zzDt(LA9;k5T!Zht>ZXdMaf2`93sn|154Demgjk}5ohwFtnufOpQ4-aSI2||rR&=_(
z#!e(JT>rS$Tk_?$76owsYH~$bo<?u~d#tZ9y_IFWX@t$nFumz8Q`~!6YEb2l*zjq&
zNTVHc-AEs@oO<E&uv6zP4Bo$wy!B!wdh{4ej^w|E{B%12%Du!N@n5Ffm0EyDJgS(!
zs6Tnu^!9VAMOC2iH1=l0uZ80@Uv?%XLT`vu0Sklf1$P`M=WFhtDKkpQ8xJE1S)8yT
zHT0>yqs`u)+qKzB`X+on@3XUh1~FMYgZ9Hf<@h6~UT4$ZDw2_&J6};ni<2tP1Lqc<
zSQ;N4QI5JqPDY{4ZD9hai*jrTa@`o}0NcrZAMZ0}yL*0^kyxEmzFM&@#=V%?oJ`-%
z?5mo!`;iMbzks;Wt=v<c94Ff}#Xwpi<HG5#e6tK}u};{`Gtd{gz=C`EAQ5$y(#JOE
znqb=GyTE+7^#j5)y<Wk}dg0+#Sku<Z6HySgkAqxq>+vyXST2h&7y-XfIwp4{ci~k<
ztoZfWi48L&A8XLUEP_?r<AyD2-mHth6&(74oxX5!yx9AhM4-Kgd1v#G*D$JR+?e>%
ztvF{m#@WYeNPojgZe#h|`cig<_P33XI;Na=)FX!kRT3t);**{1oO7}z38-6QXZWZW
zZqCWhxkksorC+;NsO<f3mmgSq5pg*?;?_DKF#z^|^&pI<D6m}H+*8j3pc~ptf`p{J
zu92)r=AxA20gr$YhJDzPe&R5*xBfYnBVj;TxSR)cH?&yckBnXKYM!@Hp#u}#KVF~v
zOy*>F^_&Anoj#(33TGa0&s(QgY4Ca<P7ulpqQt?&3P`q$tpS9Og6C(9c5rzwAm@V$
zcT-kVfqup$%yUE-Mo1cU)fH^SZKgac)}l-qJ!a#hO4PYZPYEmuai@(-j*#;$d=f(Y
zxR|qH6lKVL-lFep(3~Bj7MnP07Qy1X>%CNl5JT~uB8ME=@ttJ$bAN(Q>rgPaxx%D3
z08FixD;Bq3a=CS1Fn$GRB7I8dYI}vLH0HD~(xifPbv12AIaL%Oz3&P0(c1Re!Fgh1
zm6QqatA_%%5KzCr44K4H(?&>=ZVUXuAzTmy;suLf({;N)DD&z+H+;Ub&*6}B(HcH*
z5z2OLPD{B>GFr><u-Ihc7uqjkz5RG<Xr7+i?3_ZgsU(rvxl)JHDA1&WzlfZ}D>&&m
z--5;Kz*RIiZa`whLz?40JAJS0eB#-3+lZ%Hz2by9PozAk=#6E-oeP>Q+Z}UAx@Zo+
zwV2EY@)=G0Oqp+{arVLnoaE_#b)JX`YjoffOM0Lz-IgY<vu9Uo6}ig7Vn%>IE2u;^
z_$X(Z=-nGQ%^lJD;FGE!-+7o-^t+eLu{fc7#D|cYaNR`gs+qirk84wN@A<e%+NLTL
zldlVX3EXW{w^#a`ODYr1Yu_<X{W@2#0ZU`p8lZNLtd>CW6-iW90e`f9s;sn3<?;mu
zA;Jty_0r^6+I{Em+jjPK2PYO+NUo$?Za?<AoZH{O!-?foK8*cLLb;U!#T*$~IAtE+
zYN^2*!r*c&Epib&COuNG|K~!UN2f<9Y92|B11ePh4`kxzHWn!th86YDyy6?kd!pM*
zdE%o%qTF4=bWXbFzTa_BveIz4?%_tWQXMPaoFU(58SfGkxL}}?XPNvNsD!4H`GMrx
z`oY`z#Q3B_3@Pu_Lik-ShYO}iAxbJ56HMo~MxDf?3d*U8hTM!`e4H|@5)6ELV;V+T
zQzCNB6%S12uU*N})e(6p&<`_L*jo4$vu;4E?4RPo(PLQ<Ik`!7b8DFX2axy9=K(zE
z$!6!1`5!50l?vEU^O+ReE}TEax3ItJ0>J7O*^dmy1h?qRzH9z$tIft28kv4v#qam=
zLN3`UJJ%s4o*kHHY<A74DQ!uL5J6tFP7rF(*GN^s%Kb%TQbY|mNv`{?Xe`mNIqtmE
zS0aeXWx$0PPY+;2s`BNJIT*t7$|t8;V%1IXaNHpyWwHukZpy6XXWZP`lv5AnKAuB<
zTsz8}?}o}`&+RvLd4J+<gb7M}NBKsj@NKPA`Ho@X#y)f0EnZ>eDzX=R?0Hxy3v}b#
zvM&?l*7f^ieXP2mdBkjL;0$fZ+i!crO5^09iWQHycKRk}VmTBE{k?NvL7lDHY$yV(
zQOQZzd^a?ppgs@YVAW$8N<*$Y18TSMzo(<g_4kPXPe{){yI=i?UZV#ZIevWn-3Emg
z<{aG(owo)l^L~NZDgxf_-Rx_@ms2a`3dqtkqR&tGs8}0PO2Fo_%&L0fEs*W#q-BKe
znW>(Oo|?PwlkVh>pZ4xNZ`OJV4Qy(N<5R!qYGuygti~<eq_-znyd$IOgHU5w6B)CH
z#`|z))!Dqt4NRUIEfE&Gr_?56<(7zDnSZlzYOGDlHxyNEJt92MVgq%#HKTBA634vm
zcwWA6H7#aOZZO2LbT3u(l7bB}VEYOS<?TVIM`u3eB|jFsxjoDGFHdY+4?J<~^*`wM
z9RR}J9?H933Of$(u-v+R_t(mBUwjqaDNU5dq;$Ad)s~1TmlQv)P0c&ea%wv}nwogi
zvf)Ecmy$cg*GL)bp7NBNQSY1P^m>WWW0^~>0m&8Wes{A12(wK<)|Gr|b(LA+W3DrR
z`JHD!+Nr1Qj@iZ)J0^g$6lMmp8n&9hLLAWiLvtSQX<u0x>?Pe2YgS6(^u}L74l=cn
zRLxgfkp=9Ed*+RTOfDGJ^}RwDkczXxI(!(hK6DY~_pUa~^^Ndp=$oryPp^f%H42~z
zqE*m@kvYBZwYFr&LY9*3WItQY2_oHk1-YFxbr0L2X0I50do<S+7#5vim9ZSk1Zyhc
z33z)*`>a&R(fm83a;RgMU-gJt@9jBEjXt{&#D8evq{*b$I^xlq+n__+CZj#y?Twb9
z(y2i#!ro}+<Ed|gPH7d61||Pi$JVo}WCR!4F}2+Jepi!@wpuE$#a>2-dtKY?R0lRw
zX)i;ssPY;_5%2%0c!|Q0eZ2!tVWFW1y6C_+)s!6-nOi~4pL-^4ZdxJ3d~xgU?z7gk
zlB)9jp#d--s+qdN%_Bb3xr#-1CDAB|$PnNW*5_ArIrbcnyz~#4WvEIH{1b3hJGYxZ
zXiyHhr)?{8&*E5ZIt8S1o;UA1L@Kg|aZqz0T;5?;$aukGND9W+{$t10&pY!}Qj|QJ
z%^IF-t>t3PrbCOxF)%(v%(FS|G@_vaQ#bKoaWr@3>6Dw5ttk>@*zW#s4>$$h_r3KY
zJr=X1*c&SRLsU<eMPWh71h=L<LHE@kv$Y&^kN4kZvGvbhe(L1+jQlU&_E!gu${}m+
zj?r?!r-fAmtIW=U%Q+rQ8e^bhOUTu`c_9UHo_|1Olf^5kU)~|)V)>*Qpxj~3u%Q$x
z=S6A(l7L=>Y4Kg%jzU@H&oiCRk}o0KY`B_+>c3`gxEIZpARa!#3E00!4xgV&op`0o
zrpDj@FV{)Y;V_wY=k{Uy&Dd-0jG2ou9SQ&ote}gkSI~`g-Lx@tZ}gg;b|O00dcB_p
zt8BY3R*;sSFy#v`h0ZIY>P0e(;gXQG=j|ix=<}##cO8t3DhU56BQo%oM#L}i=ZE=g
zqR0kycr)`djyeUK;2tu>=Scw5SadZ7#JybCJIr55%MKt8=Jrdy7Wvs<o%K`Z0%ZIX
z@AjKMu|8$sHES{<FD;lHxsW#(S)2o@%u%0#yWe@p%M;ZNKHEi7<WAQ(Q3B?Z6>rx#
zR+epZ3>}dyE+r7IX&mq&R<7G{iBB)>K&|}efdbD>SJ>IKh_V4&FvuZ+5DZ9Sh8=$0
z)nKRF@+=516t^7eb%l6o%L#f4f@u(P4}=eNuX);sMebZ)xtN<D?DdDM9GFN2!El4z
z(9KKAO$un^u-%P|xtcYz29R2gf4=z<19EGSX6o*}4k#u#lUqJmV%d*B_sy<i<ui(c
zU8DF1kD}yltP8H4vht|J8Sbfk&=)D$Q5S@9f>YI&Aq;#04^v92`h+5QneYsQg=;f+
zXNCsDH{CSB<yUg0uJu)l_0l2bB8&xaaW5wF^_8tY^WnVIB!tvqqW`4O`a$j8yK45*
zM^BqY*5H;yC3dcX@Dq_JUDG|+8FnNn#__7VT$|TWv)8$s?tN_puXY9Zd1Z7xHQ$^+
zBzh5x%3;~H1L3EygBUWhV9wPDT(j4kUV`zRoH+=l(F>F@uw%|}o!|h$tfxWFdG(<V
z?+1hL5#g_&zDjzNv#F6uWCxomaqO5aB~0FD_(QBJ)7ub?0Pa95Cng7;J8`50U6m2n
zOkx;3hN=B{MBBrKiBD#<KkJjuZPp&^${RTAdnW9f7;AEFVr;>g5}0^jxfs@-kVIaT
z4LYKy!<MH}=w)0bjQtRp%^qNx^5LJ3Abhl<-tMF?5UNg{-5_$Jr73cE2h|{J4AJj;
zrUSM#R|^a=$8t4o@l&O)a>KoN0g{tw{ECGYV&o18zfwKb-S1VL(&yLHE3?z4>|Xp#
zi>irZ2hZXrN0SUe_AI!4R3Pawpml2tpdJ{7{t)-#Ak!27XuO>+gLunp=HCi+jGtF=
zc?7pcC!hO<wEvv>sjd+wTo_+Et;nd9tvp2eDPyj!_3f52N68>|i`(?^;9C5bS*Hm*
z(`W0ZF0nMkbU7Q&n4ZMaaW;>+x8z?#Lz<5xtF~@4&}YA~cJnl!ypt^;YGs5RIHK3w
zG=l9PpX}9;W%**wTXCZ?sAcdUWQimv)#;j<@w7IyvYR)^t2WE&UA!#d1`vT@l;~2Z
z7x6j>uNAi%T2+iw5X?hIcByHuXspG&H?~9~HtvATAC3a%K`W$9u<x|)&2@`35ARh%
zlAdd|uvsghlA#`eao)+GlT`%qb<@V+4Fyzm?uUPoH1z}X*63>){rGH9W?g06daNG7
zS&<PwQ0K_M6q2GegA6&O#g=D$IHxO(wMbIy;)qD!Nk^UbEbE5IJA61qg1s1vJPf{t
z*H$tx!}a$aE9oa1V=NmJAAeEW?w`E3Tm{qF6j;iY^K!^e!6;ACm_sDas&m6Ae!n>0
z)aNfh?e?aL)r%vMIl0LdPPz8c6g6?A(6|Nek$N~F$T5O(*qRK8%8(s`k<_7Uckjq@
z+V-6%S|F=?QgV-hJgN(dTh}^+U5l{|x}^NC3Z%OD^-#XI4@fl19&*ZWt+KTWnf}E+
zmdFa91@hA-PYB8+P*h}to*R^0m{ZOoXYsX2uA98uXSz%LdT#12I**=RCNyz?v?}{J
z?5qXUv+!z95A%{7Wif(~?|HSq?TKegkfm0@=C@_|?z>%dU<UhOxt&vgxXxul$gOwu
z$qS94ALrlxRqg$CXC-Bh18(rD;p+n>A#Z;a>_r+Aqz6Gn<m0l4q(0+9mRIZ$<lJx&
z1XEW~Tq5)wx>v9mTDAE;vdJ}y;`2QGivsCemx4QV)L69M3`zw_av6$u4;=EEh?sn3
z;8Hc*oT@r7VO*PU0MTM@5&1``xXkZBPWQRooIOk;{XVlN*H-<U3G<>w$u}m!Z-zx7
zg%S>|7rA)S1<v;NPkHbAwv501R=-g54yOkLn}YxpJhVG8wDX8xURh$RQY2Q^MJx0(
zVnhE}0i}7YCPlT<KhWmQhffb-6+JTt)EcNzm(a<G9X$wdP%!+n!VF(lG2C>~0x1;f
zwb2Q|^c4t{La_8}F;PCN!!0YJyL~q9fSLgk$s|<ZI{Q}3atl=;=g?$fRo##I0LjHZ
z^d+Hn2|WJ@LqJr#bu_cKzE3_yujmRE7DmjZ#jN{Q3S`;6l9sObABv|oy*Eg*zrrip
zN~V`VQC!6Y0c}midzP0)^@FCZ3X(q5!-o5r#}b&<!PX$h{-FNF-Z{5YBKRVPKQ|9L
z-fTl%Y{>AB)co}jfCFxh{9#XZ{1$nTGzi}rzijcKqla;G21tPx%!4y7b8Gug8?8p}
zB}NRbazRJ#j38kcmdFPpE?5Qk?T4V4{yd&*+<GgpwIe96Tr4Tn7=k1&@flpKajzV<
zMCDEj|1%i^Lb#)JsVf+HYqjHCa+R0R+rD4Gp%RAR*$_xi$K$KA)sZh<1;gWp*J2;n
zkg|$nagY`YO7k>G_@39$k;V#`CfCmWWfc*jQbTR+=x$i-?=D;82wYVGrua)euWM+@
zWIjZ$X0$*s%^<g8YEoJ?km@flSy<Wd%>_VO1gKPq?m9Iy7eV+f)m-fEq^g5_tukya
zRB6paZqGF!a@BT*od{fL6`d9>1>nvDcenCZbx)SlYTXL(sErT{5yiEwWv?ICMe)U<
zc*}E%O}BqgBvJZ@CE7T9D#wO+iA(J%EL~gUYD_~rUq^*SL#m3_!m~m0xk&J($^PA}
z;_9JARl5#qlEt2jKUK!gfW;^O(K$mlhgek7bZJ#?RAew`uW?$!zV|TwQ8VQsBW7a#
zof9y8Nu}7+wgMdg;;wDKt%u<5Eg*KKX4SX=4;~T-WV08QC|#Eo@HhZ;rhvPu#kW&?
z6M}hH5xJ89OY7;a&6*^{44duYO;OQ$dUvv`gMhyTM*~k+Xo}J$j2Vb@FLaBz-^tao
zjAj3~*V%YR`;7ajk0NPH8GaS~USV&dl_KDtU!Ykyh;j;9t(&!uEtKbibKj2Q{gQ&{
zT%WQFM<OIVZ$i(esNu_B472{ed(F_FYH=?|ZTqmD+i_G(O*{q2yffHCjx)$jF1$c)
zZ@UMTyf=9w*G2SGrA-V7WbQReoNqwku*xZrq;=1mu5$5;=;Uy5AS5fjMHWj8rJVCc
z>&mQuU9vzbuc_XACD^yny!GQAwo`j^BgS5~6+5TfQ~!844lSU@Xe3S!R#n!(JJ!+!
zZ&U<^Ow8FR&9_i|SnW%@Cmbo4E8%tdBq`{6zwHf1IOXTT%Hj40&R<KJ#V^<t5LrBp
z?ul@Q?OLzFJ7jui0;xjfi^%r_4gWpz4bS*Qnh4zYvUU0@;2xqgb^(VpO~q~IoujF?
zy}q&UthRZ`(Knf>ZH7N^CzYaSg{W`lHdgjQ+ybqSEL#+Z`)OJ34CsLH`Z}0_0^O|!
zT@ma;cYh@-@N|Vg)fqn#7r(&K`4g5a^Y<%cEW4SK>7A=WH@cqfDYL&QEI;4FBKI<z
zCF|504ST^3w0G^9=Y1r2uuN^r3Qn9)(c#suc96?%B<0;_gJ<{IWl3YT%t9=Dy290y
z;(h;d@9qPsBKo*StDj^Hd30@7JV^&+o({u=7RcfDo=2|ccUf-Rc*#RBGQ$h(OyZO5
zJ7atO3Z(7<!pxMVl8@^|2;pol7ApWeAi8|DZiIcuY^Hlaqyi!CE(bz1WEJo2`T`F5
zuMt%7fj3Q^=m&~?iMN9x>-X`^eWd{b?^YM$z4%16WGP6&6ZbAv3LMe74%XkWd+19H
z8WVbra#l#nUDI@62_a0*!QHwAk?tl}@@ONquV$UJMzhrvbLqU-QCD#In4SH&q4NL*
z^@E2BK6E&H>8jmHG9lTK;%RpAWhw}Aq<f~I&t`k4Jkn8X)M<{sj}3&Mb<5Q}L?~=?
z?>qmpJaXryq0W=|Jt%C&VlG}o5Wm}%RlMWf)tuuJ8@U=AiAo+GoOB`{!kSWnG9V>b
zH^gNCK>YiAts|#o2<FO%p6uFM(RsIOSiu)>On`D#o#9KyEUd$?5hbxPQq?b!)kA^<
z;P>3J6rE$2l#s(F=-62|xXckh-dnhrlh245MXqBXKb(Zk<<)3nE<GYd7@3!QnO1OZ
z(Dc~&37(B)f@-j@_Z%2&`ECgjwWomFO1R9sM<3r#;)P%~3yS$jre4@)GR2Op?$V%*
zc?I*nWFKQWaEP1Sp58n={7GcwDpjz%91bY3Cd1iJMX>LL0pExinD2IPLu>>9p~?q%
z_a0IZ!H?KnpvGEHk5s1~G0xo^b^zfc9dfY*33^mACt?sVS!WC?nOmmcf~ofGKS9IO
z8pqRS$D%N^%q=BUySt;$eby0ci#fJ`gv0?Kjr;9AQKp;4_xW7Zh%;*4aq<pwEJoQ<
zSRRqobF1A`S+7Ud&~nXzj-@$8v?PaT+jq_-PCwk46Wh>j9=X1D?#2Rry<4K{@3lX0
zxw!&EjAn=z-4O$HWSy!bgj$6Pn<T9*n{T<!KwVAeE=S)STD8CiJm*s=@cA`1(A-Ez
zoCqj}Y*7k?ckI(`fU^<{B(c;$A(`cL{|O$2N~RD5U}m1PTXB%CS79W1)ugMON0OH1
zUO@~xxtc3xwd!+Lar|FV`9gI~&oeE;#(pXjQTP+V@e|26#NtV9uef3HHF}%nMR5vS
zolpFf#mm6AHI?qTCX~ksV%Ke8v*5VcAm)Tjn*uEzL{h(f-}zT=YzZ323P&@gC9|}d
z#PY&FPvo0yS?UO=aqLI4%M!zzzCYLvZ#<}GYI7Cwvd5lcSK&4wRE&B|R+-fE*+9Mf
znAG|oT`^V>K+Im%f?)P4>J<7(df!V$yicyH0F<5AV3d95D|oe_NBo;H_$pVH(?Ga_
z!#F$KEiH0&CD^^MeL0>O8pOC(v8<u$FrI@TzNN3t>bMI#AfEk;WictKd9XR4(wMSN
zpxdM=joH!umPzG7K^cE(@o7++v0>K5cKJ((d8w?!59gQ|5^U{QS%44x6YSg2svSyr
zF!xGnQt;x3pw-wSsL$J7)Iy9gu&CR~uWQFuYn6bq8uM@0&{iR2bk~VK)3rESLYE@M
z9=WIArgUH)<$G|5A|reKRDhhBEb?iMI_-r#?{2smQM^OJoy}d&1%MMszsP=UG@Biq
z%~j4O-1uG@CnPED77WMW!1^NziD*9^uECG^wjsY$Szj*zslxPY(ztZ$(;AiQ`sr_|
z={Z*tvl-2^&a_{;_{dUOSo<Wq;9bvQZ~oIMyS5xzY0*U`Mrz6rAI#Cy##f|8F{LQi
zZ~5Q8v!9Oqkw7#voIJj`Dd7;u?(%(Zs=b`dGtLEDXnHie)qQAym#pVXJ1@xGV6=NC
z7k{P4*4E()U#djKv0KzXCsV#Z>i3__Vk8|XYc%>svrSA5QtAY;m8(zZEl@HR20ZjP
z_uL()GCX@DAXsny3n>dmTWSkX2u?eA^V?(o$YWc)CB{1*Rt+WbO)sCXvTW^&iJEeH
zm^4QgI+U;Gj~?mc(c5#hub$)(4Arb)MM~j*Z^1^L2XJiI$FDKJu?#=I{CdkfMM@dc
zlAJOVLXjYmO7j-JB0<4iFcLn8;JvMWET1Qxw)v^1^&O<)9i%*4x)Tr;Ov0EU4h&o`
zo6b~LCEH9t{v#azxLS5|oOeR~CBk&!NZ)()3h+>Pi@&Odpm)-~LH}~W@4xEjmIOYf
zJnsGPW#rHL&RaD>w9qb{vTs8tYXbb*W{r~O%X1xUQ!eL6y^bT=AXAD<Y-a3R&1Z^v
zfLmCyHtri*tA43hMEh99Z2Lk#WCqU0#FHE;h&t1zT|JJk@A8QUy0%6^NCYeH&DYix
zH#_I_E5DN-C>g+33uoR${(~U>i<`N;2syg%)*7O<&+QD}him6LEf48#67Nc=Y<$u9
zHa`5Bs$|JQ*-HS-lJZ<HxVgT!!opI?U_+GDIh)P2v!#r<CTf|Biy8=PD<QJ9g5Pg-
z`EVE3WlI0!y_InQXX*J|SNm6~`S%h2#x9CEHM1yDwKra}>y1a$z;6d5+k!ByzUyX}
zWfAYUoxFxfgE&a%Q>$v-%Diq49>o+844@RJNntY85kLcm*l0Dwmt1{rp$$Is(W7uZ
zLv()(>EN3h;UYVlmqYg1`j_AR<xQr(l}sFHsPDf_rD%-wLI`V~l)|snt!%ngj^u6H
zsM2T~ycT>G!6c-aWs^DOPdDAamN_dsr=A2wJDY1NBrE5ZLmjdavcqAmK?NP<6*uB_
zLw*BJ?Wfbpy#M*|fBSDI)X=Fa`K02`^NkE)aL8`Ur{KD(%R5uSJCfHy@&Q})bUn+F
zWLu@oyhjZ7+?%KTOXtM)M(|dRjp`(D7--x^!Tq|2IK$Uu_lJ=bWxv*el4+FuWSVIb
zZu1PkTIQz>45fjOj+uHP&tZj?5ojR!u22YY%0l}#Rm(0|&3r6|@}}{5mb)M>6cMwE
z_xW?;hxZly=#ia_`tvX0%ENEf)$nYaKzl7j;?jbWsWGMk=G1SDvntBA|IDZNn}gm|
z1*-4Yp4kKE|0SvaX`+Sm>BkuMxQd^?m3Y%yV1T8dkw>4UPJLdo+t3J&^v=%B-f?!n
zE_*NLr4{o%oL#tc!8%g-eQ%d9LtLaXd}Fhv&3C2l%PT!B;z|{?P!<2gd~9d_RTd`a
z{=w-7AVAh1*KaoabaOJb{O5+2e;Uc`_&`X`$z5E(UT6#cnK}u{RDIsV0S^}4%oi4t
z7pim12u^lF&bcn&zq0C+#&{9Pl{=&bA+hkd)?{e!c5djdKB;;%-q_S?E6(=nN%r3?
z(y96MOkV0-(%8rUl<uE4tFHObm3qFYb636hDvjX|dPC)Xj6!9A_$AroYDLIN3G#0%
z3=pg&S8OvV*(tD^Ndv)1jbX9g%(@Xp^evDGiLfCwXO-+E(I76jwq~!NDO5GLDiFg`
z{tEiMuK_{LvPt3+-^qWP;m`2r%>Zf#iUWIm)i$*gZek!!RCR%(Gx1~Et!?*w&Wl%*
zLB90oXiU&~TN95jhqX2Y$ccB9;m*W<^>x6%Gu>02nsqW}NufmtjC<z=i!FJ>oVf;>
z0FIIpUt(n3Pn!&UE{ICwZ~ZFkC;*8wNVhTkcTnSplULm3T`|P&RNN%2NmLe|G&6Yp
zJkU_nQ-j{BGC0a9KQ96MQp+A%{ia`7H=}-#ni$`n$FxVGxBEfZG?#P)VLS;vzunxz
zu%-Th_f2t6s_JXZoSiX`hX8h%-kR^ZZsE#&_R{YLE1^7HW5LbI`ER139|lV(BsXC<
zct?q^HHtO4Sod5vvZaaFl^7Km7A_CvhRE@m(0wg9Q`lsPm7ubq#nfPHjjm{?Sep9^
z#dhVzE-Xfb;iWvjL1*-UAUz4qM0XDiyAb=?b$F}A(u+Tw>GBUuz(yOnX*MTMzJmVd
zT4Vb(iPBZ-|32LzZvwqpkR-8V`-m|=A*n#3FoUu%Kx$BF%hNR_FF${EVqDKh*|?0*
zWw|MtXQ*kxW-(DQzh7-^ebLIzMX23#j_45*A~E%+Pz4sI%JaB(>rpXx$vgAX7?gNz
zHHIhB@#amiSu~P7rKTw{_QXl%af_kgxnB+{>AO!vFCq?=!T&o-TPN4N=jYDqCHJhJ
zZ#AI>SeoXi*gn@kvN$nROW}?+Yt`vJW0D-`ei#1Y{DRebl2Ab6<x^IdG(}VC7-w1J
z#iofC_WVmLdhyh$p@O(+1FS*c<U(AGroA~n)dy%P(SeKkGdM7zj}x0D2-hqEiS^EU
zxv3Mo_V9=9YO3Mt&6zq+vibB957K)zMf(=!WnrNIxs~Ma-1cX9ulY(B75P<(+i!Jg
z{P`ozJO~TVbDc}(N0fC8GvBPe{M2YDaW+bxkI=)cICq>$UU~dphWfC28*O*VnzI*Q
z*YnjilEd<4?E&Xx#zzDU1FTS6!`}QszsZ<opIwV4zOYw^l^Cb1(tE{^dBEamaok0G
zXB;XTQUbqHQ>-p7*HWSf8b$!UBG=QvhhZEDx<X1oBn~W`iv91fBRL{MUQwvKdI%fa
z$F&J{9>|d13+j}@51gZau54Gv$@rT2i@U&W^`HceAlnH7UDxuN+#oA(_$i79x>;MF
z10IU#pu$gaVhOx`+7S1Y1<7)Q%DH(;Twd+2D-Ma81?`=MaebaLF|KNq9&Kti#NI?p
zb(grRWJL=;(`EBU8W~l3ZWVkeO*f`Hq%cJvep#cy|Lw13S<fGmhX^9=t|tyW+3#>e
zk;c1xGuyC(l>yBCP>W}4!y0khp5#|(E?x;c${YWTKi^4!bR;ck>Yj7l7&`4Hjl4h_
zPf;C1KOYRO-P^0%AI}^H9p1HSSWbLA9vTh5QNTRZk;Ys$<DNq&e!OJ-A%k19Gf2Zm
zRly29bm39f#c^HD-dFnXif-_<THetO*S2YBYnw`@6^%Ra0)Xxd3b*&fv4CPd=-_Ml
z?eI)j^Y~k-!tatGgE`ifdnSmBQV>4VCh@$ky<S|03|0iF={}Ws6$yuLs=tWDe##h-
z>9WXS!3_w{_c~MA_ATuM$NPQCcn~Oj<7|Yrisi9JY8w%qpc*G_&ZOWkDJ^T9{VX<8
zo5g*;_Es)Wa38H!I_0gsvxn;5Q50xO6ll}oF*8PT^E*wjhpS%Uxwu_pom{am8sTWJ
z;!_8+smW8xo|9!1)`$NyaR7f1#pr#vJeaXNYjZhc%YDq#EpJ*7d3EbXk<nIjM1RkS
zOeC@cjD00=iuRKaMU2uaiWYvKm&VqOKkFon9BOkEXs@M>zOe#lsx3?QbX2KzK-|oC
zJW)|y=LYkfMNmvL<&L^GmhCyCI2;t@5hHS)8)R`Mz5D&$d`7!w=K&NNgjJ<7A>MkX
z!@VMHYZ*>w+HHoD!D^WV&b=<uuvq@N^UTm62<*`xXkIq|u0Qz+n+|!u<5p>gF{Xb_
z-EM|uf-4`y%;e7Uz72mp;KZ}_*r=tlNMDJM<@SuCa6FcZ%E)*}=VE(7LWJ@iGjC+C
zJ`0|K0ACo`)3g`dAaG_0$P856GX4-mE8+AZt{ChG(pH#jI|j`YJUcSe#}8JgG@nBY
z`SKBL6tt2JYPZ65w2CCEZ+F4qc8}DqaU6`N{22h`<p_@v`X6Ec@KK=c1_&{@-W6RQ
z%8n*l8l$Ck-KB;HhGuW#fl46gGttDE%jRe)Y54lb2#Rt~i#oP<4<4E|u=G0X-}7@!
z-+S>A1~()aj@~KovrY>w?W{nz<*0dJ4to?Dw9SXC(+&7XKGQ3yzSvbU2ca^P!o%zQ
zNA4hT=cp`npvOvK9?<-RbZzAWM@(P>j@_U5<@Q1Ne!$w_9n0JND!=GLD=hl{;0N}1
z6s6f(yoiRtD-*hTxl-loUK#XGn@0XJEF~FdZsH9)j+-#OYH(obn31#-TS~xs!%M8u
z3oIqOM!65l&35h=o2U>7X%OBcy3pZmi;KMlXNn(h#-+_#K30>lE<6;$tE<Ajk`NJ?
z+<oC;voxl{(+D}_==Jb~^W_6YhW7$15P^VFf$D#GzNBH1Bzxh8Dl4yAZst4UJEC*1
zd(6qatyo_i#4@-y{we_6Qt1XO&VMOL(7gQt>+PC-(ppD_z*LL8SLf%+D&0RrV_M+O
z1?NSn9xJXYnG075N;mG&EEx<e>-zu~OnGE+7%9N58b4}lP$E+NB^paxdc^=cEO?_p
zQ%g0l=>UB(tpZeKOr()=tN%{605&clpPVebdwW^E>Ta>wVGbR{rj${nCMDD&sK{yY
zCf#RBOq}c0lPMU>zC_1%l9me#7wqi)xj`pkrmryhIwU<+-<osrIh=CCBhkJSu~=I!
z@#=4)#)b+w*b_b{u=SWRtzg|Z2W~2s+k_pO!AliZTU+<o5}dRuMCRmE+!|+#LJtIb
zfO<P_*wRkXY;`dE{v&L58YJAtV7Yi(#nS<5mxsk~W^Czsbaz?#KqU#npvB$))_0v5
z^wR%Ex`wL?z^GghODHxCATNT6Px=Pb4m{JvOJaf&ur->Z9vpDJT+`<TULHxMQb7E{
znA#Oy9;~p^(Ny4#A};buU2NyFegHp;w7-JX*UgUc&pc3db!q^a)e&y5FZ4f>60jru
zTKh>0BsTQxb6o%t`>u+&Bxo)k@ny-HVt&;h&%*uEp2}n*Fr<hlUE?HsN!6Ch#a@3x
z@RjF2T_?5iB6<!-bKxSY$3CrDiLxT%LDN1XJ&8a#50A(c%FtanjLaF3`I<>D#E#*0
zgWCTQ_O`dfhv*Yn8Gi86MX3UJF12fH<njNybiiT{`GidQdG9p6l@vgBRI0n$ctnq(
z)1&E35iEYLI6>3m9L+I&pexT(ND)M7&A0X{`ohqcW0d%m7MuGla)fl!m~Fd<b;}AP
zyaY<$x$KBPyFh9(hC4S?;Tu0nAHtOhBc;sy*wC3KSbA}(7E+3o%{-y`CYAlB`*fq_
zP};ua2Y~m5(+3(!hR=>3H+*oKC+*+ps-Ij<R2*5iO1Mlu)=Vc`fOI3Bcmi${HCMX&
zdZ#gE%$>V1J)(*UHgiW0am|D6r1qW_jTtS5fRCwD$-8zop#HpHMeBDMHYWyJpN~{_
z+E^~!YF-B&(e<<Hef;D}->&_j2a8U#?nJq~`dfEp-m95@<VAuylH8?+*dS@cW-|Tq
z;o%!J)_rTnbmN)EfreTF;enjk<F?&50-XAse|D#9DghAe4rq{Be+Q7iy7{63IUOPO
z;|E<*)v9$%6EfH(tuQ+GOLAeOPf?sMKR4=(*c;=+NTBW;Ewo8y?&=R?RHQf;Gs9nu
zE*YBF95zzplg1mZMZ7#m@IZo^&Y&)-T@oB#)Y1FW9Ddi2D6y<rbVv)zuJ(p5RyRr=
z3Elkq0W}HNeyd@%*OXVL4?Po&vl=Y%$ZKZRvpClGvrscB0{}oZ*!t&+kKbeWj{wfV
z$`5L#4!^$-Xyyk<wjQJCV&R^ulS88Fu8a6?Q|grGI@}4w@U-d>=EAux7Vx{a{JE(y
z$x$}#2hNs315PMmT6l8qn25%)@~TAi#PLV{BU(0A3Y*>4<GU#(DG_c@EjJy|QoQ*I
z*q}N$fm_e?1aMU!%#N1wd=WT1Z2_b-e{!k1wZHj1Z#+l4z#v}~;eY#)q#?&~M<3nE
zkQ67~*iC3Zl3O#2JIuP##D}d`a6B-B+~$FG&oMp3&AB$uqgg@$UOJ8nHYRD82s#Le
zHKGC>ffI`5>QR2tX>6N$F}dKTk~{;~EEgp{L8|0sXhECr4G&8O9m|>u`=<K~ddS6)
zQWLNc_sRofe(s1`WHud7-ozbVJ!lHMakYrf`|j;BSia}hEUJ<*qgn9pBHo*Ub!3lM
z=0H;Y_ji!N@rIiik{ppLo}r8{RTp30oreWV?}mb|Y9+btefd)GwvSuD5%XBb$8q)|
z)#JJwWwgzJ^1nnZ-RYLq(XyZy(g}p=`jE>!dy%*P+6B-fo7bGm_tkUe<D~k_=n>~f
zgu0Kp9=X((UxqAghQE!V1-Bv#e3l6>>*XgJ<>zaWb(ZvW-2-I0mON3kDcz6!fLP3>
z10*-ijO!mY9Xyn;gENkbKK&ap7s&zbsF=&QPyNrBdWcL-E+IlkpSv*QP<K|QLGSWc
z63WhJ0($lsIgOZsg(&PAch86Q=gK;Y+@&MfPdk^W1w%F5Sy|*$?&1ADF!phYLvmOK
zdV9TWJC^{lNy|tCN14F$M-3&ou2&t;OeCZ}pr@?S*bC;Cr44P;rJI%jZ>D*yX!%BZ
z@e{e#A!)2dwr@R&3mQD%6IrRJLAN14C;9g!xAg&d3~ov7|Hsn~9nQEdH7d``_goJ<
z#XMf{uzY#mh0HioGs$uH<`KVOMJHL#uH!H!5DIiezhuYoW?AuEnOL4vdbQmEj?k7y
zOg~}MvhJp{^(g_3YmQlO_6U5PZLD^sD#?k6y)kUaW>%;B20LrO4A1;>g(tXp0NZJ7
z-qI5`@y-T%VaiATkG~G=_y#yHwR`UX*Z*VmbG-bV_#iqAMwdafNhU&<fdS2XY^&4a
z9dA8$20sqdEwuvVQl^Gk!h9@t1WB<7k7~qN{^v9fG1izH*CStU%~|H1I@fh<YtF!-
zvWqCwlyPQklFbj=;IQlEv(qPtu`$z^S||bTtsuK7Z+f_%B)3jP_YJ+^z!B^vDhoH;
ztlNOkB;WY!CK&eb!}pTlAgX^6oE=PcK#$~>OptE6+j8}1{WLxQWzfw+m+FREVf~G`
zPj!BuITX5fSG6y`5xGod%N~hbC>)7ka<L^WeU=#zVF7VC5jZo6zBhHm!=3Spw`oUn
z20d<_ZgNp*%B|#ej=&v%vSndh?fwaq&8R~dGS<;Dxr5}E!4?qtM7<t~fRKNuV($!2
zm0h9cu1cPqpZhug_3i>e^nzXjcZMLD;%2CQGP2uZb<Ka4DOdMq!P-SalB1X?lP?l{
znm64zhA%L?`=o8G1<dAHMYK1UXc2M_^F<d>hlzZkj%0%eUeam4p|e}>z!Dp)p=#ck
z>f_d3!sAw!mflaoWOsn%7;yCTRAJF`)i4jYfycZ&o|JOZ12>NT=^R#cfLhK(Md^nW
z{MQ`yO9oTZsyQOhY*-+vI`5tA0W0`C?*g%Q-kf>o(U#_fD*!QH^Grs<ms%dTlcE@}
z$qe~302rryRaz|iq?9>OVc1I@Rh%=M!a}dkIJ0`NY;K|Q=gj7Nl1>T6MXaaI2A2Y$
zXNf$8S-7)sE$s~#j=Q6DT>^bMW77CdmaFNTW-(QVi%UAux_R9BV>}}og>yi&qN9FR
z)xlrxhr0}g)!2&YE46xvxpA)>F6M_kUCbX<7i3IMs>6RRZ3`^CvFBAK2#s^1l_*9&
zFplY4^VCd#C8%+fix?uP!S{;r6Knu8P>ts{bQdkq!|;p)YMLk#(0fzfG5t*YBfHxo
z$vmr|S-oAtUfM+cJ0fr6uD-;CJk=bcw01LWey+E*hSoXRw>E}eqbEd5v;ZK2qAbE@
zA&jC7`#SnY$FsxlV3c*nK(98^X9yMD^a(mafPEk1`g+y7blD*TKBaXr%HQ~@yz!eR
zGWC0RNG5CY_FvZGhg|twZ7s~;ylY0z1I}B(_@VMz6)oIiprXMtseUkeI5^UJp)o6i
zypXx_HurNKJGX3OWP;|6f~umtF_@^;V*rLf$Of4nxK)PoREtLjukE;&{iy`2X9qgP
zWl_RE9NF6(_;RbrUB3sOZ~81STSK3&U7q1EG79z1Rnl^}ip__jggP4>v7zoqkMaIo
zUlc89Wi279K5Nk<R;5_1D8I%>Gn9Lx`H7Y_0bh_EAy;bB{DGp>WNPpoMXyM)!50cj
zL{)Bz@23Q!$H&@_9xQe(dv4m9ai5QzFKq}<<d$D(*7}@d$QGTl65vk+7z@`0vZ*9^
z{Db7dI?IX+DF$}R5~q{dX4-rmsFcKU$2A3HGAl034dWEms#^5GoX9I{a2GAMgz(v-
zh01a`c<pHf(b9p+RtKR_a0`1jL$q*-nyMpbUsEXX%nn%m`gc?3rrcAO>DJ`5M)ddJ
za?CswKNU+FFSpq*yS(4=0A^A2s^~%3q<!ieVM8*qy!$X?CLFtUoJ(TaKe))Ar}wQ!
z2WhpaX}7%oIJZoxP-?+`4ft)fUq-j%Xz8RZUrR`FXe`@dWJrh}_DXX>j=pZ9R3_cT
z+gEP9u@Y*3P^St$Og2E=CW|KKasU+lXEMztl}BM3Gu>6sGuF2{rQ7FqWj^W*ROhA3
zSIi~SW>6GbD`>ep-YsP>VdlvT7c2q2a_E%bmjXo9G|nn&DksyAHgDFcyGyv;E{~EM
zb`>-^Qi8b5#IuJkt_!m0hA<5Chiek~_^N{kb{3<PMP<P;UaBjPWghDHw4AM};1h%#
zkT2DI1t#4pZjRlyr~huWSl*f`H&ZZTo_QZA8k$f7&Fh=-gutq-#K&ceXC5dWjahEu
zn@lG!ZA#EIJ6g;~US1G#sOQV>r{K6QTMi~ZrOA6(nsuBof31@3bKEnc!*dG%rjSH{
z@ewytgRut^qix#U{X?8KzdTe>rQ#A;Pg4zsji=x35tkaZweY{=E-tR?ddON?_<WOO
zIC(Z|xUe$63WQLrLZs(i)=Xr;Mp%_z`=jtidbk@u;d`?}JWt^40Kghpl)xcHA<bty
zPDh;~%bk)rx3|8Kwdvv?;76V#h=P6N+FEyr+}SdR!_~yKx=vZP9I~*fNJu|sZkNx0
zty)<ev*A@NEJ)hjo*Oyb53dZ~a5>8~4TiYBarjWpvQ8^R+kJHh7uQe#A(#&v0G{Tu
z{s?r?({%SeZMvbk<$HDP_zLhersah4|7-6(!<x#vu;J0c?u;V#0uCq)(lilKsy?D9
z1W+R&Eijazgep=3gvdB5sK6-AfDjZ7QiJr85EPUWiqt?T0USVj4S|Ft?+%QLW#)Ol
z@85g9`ok+8PtMtU?X~W8ud;V^7c?z+Q`ZzZUX+!U-+H(1^dA*Zm8>M*Zqy>c+9P+L
zy>%52nW%mhe6HOV-e=5TNH`N5hWy%lv^-*TqWo`rJNHPSjL*psO&IGv<Y4{Jj!547
z`DxkVeY;-Rk35E0W-Do!t*6iUr*6%ybSOLQT)nrv+Sp=LBB*2L&lOp}ugW&Gk$l$V
zdN!q1O+M(z7At!_V}13Q)8d&r3Dw8_ksBvZkwsTQcU2t7zMpbo2=8#uf!QY+t(^Wa
zZx_B0<ze`uL7~-#p|oj8ET_xPo?FfezuSH9rqQ#Tt?jRVv=GxCm3GQ>=dgg&^4^jD
zpfUVXm2~b+*PLY`79eM=JIrxRA1gHS)d=3z37hA@&72wdbpyj`Q;$X6t4A=VMnW5q
zyeUPE&>6F-WR<ZMLg~t5IJc#T&zNljN*ej=rA3SfykZ=!?kKir+hWcZY<=-93OQR6
zd}W_Y%H?}0*$PKaI|L3yC$7qxxp|)%1n9|Qp2$w^gwDo*P~DF8$5U6|y}CiQy~xvc
zJW+`D7uCw>2*e8K=)AJHy-Cga+7H8(_sb)ieHvG1C@z<*-W$zKB1lMtp`Dk>TL`Gx
z)qN0M7piRMCb@tI7H5WL4DCv8zWPzxIibHHLw_f?wRGn8RiB==+FMh<wXp9>_(wj@
zIyBkpa^z9tx?QafHsoWka750~Z3oJ#d56F9ua+Kd^Lst9b=@SlwR~dk?K5}A=*uUQ
zsx!)BR~$WtgGUc5RILfWX!2+6Su!49tD9VeINa>$)7+w-!3fbGKH6rX%XL4p<r#D8
zM$k1RM)BhQp#*Q0NDCM@e?(T$(koV1x!f=?RipJ1)(7)$E_+Dbsk2jRgSg*pU}1*w
z33gob=8k4T;=I~qS#omFb|nM#+7M*I)eTSPE<T^pkDBS5;g*cXuYb#CA6XYhXV>X^
zm|YH9Kl8T2r@r+(``LK>Z$-?kF?m4(j+4IaQv%L^hulE}-YS>V5^ipbhzv^yMj^7k
zYGAS2k+%#i9OH*;os+lLnC&!1*&NW=uzG#p*3Id{=BmG@^DC@hp9^!?7hOdQVVR`s
z)LT-5eh6)Qb@&Q31A%sVm*ZoEG_PB3=xRI8tCaqszc2ImkmqjM&5thB6OLD5gdVAN
zTaF8RM8SX2%a-0)R)9A?>)GcovFpV1UvHSIhuhFs+oKt~&psEG^%IfX9)9Vqq?hxF
z#5dj$hao$qV4*}0^VLP%VfIu-Id5_{({$I%WbeA8A(x*WcN0oL7w$ZficCknYzF8j
z64YfVm)u)9yx&-j)qhTZ7k{Qnt68ozybYm9!%#SpWsz+o(rw-ilvp}?XB%}Y$h~u&
z{?9@xkqGp><)O=EN;jN%bXI+of0;_X!DFi4^y59AZ;VeAXi%W6xTZnhvxYj;v2E8l
zs?IArqF%PPK32I{W$j2hmb_&QXZ;2bz~~;fw0-XqG_qa?%jGZI&CzRFJU8%oKd_n3
z{MuNR1=jEov>_GMw%bu-vQO;IjMEC!8-z_;zBwOr=j6?6vMuFuIUTgyH<+&zQ-<BO
zMb9bEI9O|@#gl`$G?o34X_qmtbx$dp#M#?fmmXMm(Kd2bij%(4)1soi^zxIfTNGmd
zXnNUr?dq-p$s>eYzKWO5$w=QYIklsOj(Ki|jb{)d?j4!DfVD0qoXtDjx!ixvw#zH$
zM+c0fueB9ojvoHq5CBTK-Ifa=_?)<2r>O{<yR|*+V`(M6OKTlV1BlN^I-%%#S>UEb
z{dE@ugF5;39KBR~bDw4tQdSs!wQBW9vyztgm6bWN#qQGSfSIFRhBUoo*a4>MzZ=Nq
z#u->iTxy+Z%NF-aPs+7B(MS7D;s@KA?7B@uo7#FXSsF^|?l+^fny*D0+W2vslO<8O
zjMh%hgY3RDHa_pV4?fI2T@S!@rZ1DbGh?JICmNomuCj-Wcapdtctm1_chCGe>8Bjg
zLW!CTUde=B_1b!JH|FJ=OPRfCznUtTdYsb8(7s*lvt_(i{6qNnk;;4`HJ?81k$C%P
zaLiCHTpvRl)>>v`doSt5v0VAi_h*`(%Q#SP8J{&ewx!Fw!X6#am3GbDTN?If+pQ|z
zVN<Jhy>&-68HWUEZlwyBLpG!s$LZQ$cf5DMJb0#cwaqEi(+oGXwf%9GcYoycg}J#s
zFs}R2ty-GZSJwLY!I>${+M-4_+080nwC@jl`?6=54cb=AL=}DQA0N=DA^b&cFQv)Z
zC)l443a2&~X4vOblrtE?sNvcq^qjIvrv)pme`;!T5G!$Ip^3_<@9r~$fx6^}_;t*^
zF%`9uWc`ZwC-QQ&niU+K(=^a&<NdD9!BhR$uJFB0b4RMBmnrJflREdKtJ9bzK4bSf
zZZhX$T2n4BtIA87@6Uc<@YF0acX)D3gz&>N%LscPoiVRjQWq}Xyv=g;U(&qmd8&(<
z_cx%;f9~j?M;Cwl$N!2XpoE8gEY0MXuQtmq2DNRkgU^p`GS1X6YzcdCLOGTcKDXM&
zJn7U1`EdWlM)tu?VRyDZ(w6<z7Nf}I1~KP;^GIo7lWg?t??2Kxap10Euju8$*$88p
zw8=r#z8`4oU5w*o@>QSrzLdlMd@@mO#gNR)`ArpXLa8G|&zP0v1rFH#FGh+GGajT|
zOMQhGaVALp61FD>lTUW_-A|fg>YUU^P;K7j%3V;rOh2q%n@}BZPrbRR*nXXa)RIn3
zOYgPg8rK`9@?D!2)x}I6%(Hi=>|*!EL8uVuk>62!zi`2)zIPDx$PdjKsSJ7KfBG@B
zPgwp>pW-lrdXZfP?jOzZtN(~A0V!kmmLK~){-x&qnMH#iIsN{c??U^oY2WSUJJ@`O
z&hP%_yZ8J~IsP9fQ05~lTkHgFVRVUcZE!pSI3tC|LZ9_aop_)g;e8gIF!G@c^#zsu
z@}K^@BtholR^8N_)qCG=eooc#_67Tqqbc3{e+o}UH@@d3Kok<aD#5;V)U@lj_CkF{
zaKRfSwXaLBmeP)x0w;{-t?;u2PgKYi99_Z4?9^fBSLt|nogg+}(J%$u?{Zqf=Ial8
z%a&F#zQMUIGX(7x5r@Y>R39%i19p@1XMZ9#kK)ToPl;pG5Bxablo1&$I1D1R9UO}w
z=E!;w3-(Knd~-ABHzG?BIna9k<mNFAaMYyqcv~CEL;_&|PTOFMATAgg$6N3ys@?lj
z&yq|yShwgUrIr6S>?eYfMb^Q7tsndkt>8_{AW}O9P73yCf$jU@I$O|ZHC|LCXM1w`
zNl*PVOxle9rQdI<RDhU&Xx6N_iV)hEmsCvSze#p);SPcA|3;<-e8E`&KiY;O0{_3v
zf4^Y<Hpf6>7tBA0^wiCcY;s#}6m0sna0F*<D6*fQtV2zE<a|SRpQ7?lpGE}rIj8eA
zpgw1`<>(&rX|bGr6FbS66gle^=~2swFA!xOi(%-uv}ozIIP>M^Bg(@4f)MYG%)x!c
zI>#K+8%7@$b**qa`Lr|sxttiiOYyv2Kk3u!|M^q<cK>Z2-W?BFv3g^bt1mSSTyafZ
zUA57_s1!9_{V1F^+m`rVkv+R2koJD$e^q+AT=D8lBU^iY!SB8p`1h3?tfNZC$VXoj
z_cmwtM*e*C7h+wi79x~(W?nB=acoHu;vegl#TP{^g;Ek{H1kA-l+Re&wwfsU&VwT#
zxb`|_PiXJs4fi49b2C3RjDBg`OLb-A<y=rl_qcmi9TabKi=(2!!eS4BlLm-6)35tU
zCR%cZtvcG3&GedY?R5Rd5D%UQ0g2?IjT@}+V+#|Te^c>9pH(<bY@U&ctfA_#vAI6k
zGrVG{e>#xj8(Q?a1+pLt0})M#)Dpj8IHI2SzKPBQN?C0is>cLR=1)o>Qr#n#s>we~
zz&~*7^F}SYCPf%nA<&^svoWu9D*fK*?39nbC=;DE`m@47HjTWX@phcvGH@ox#13M!
z$N&C#(1c(2>X8_Dw%oX2TlsK2qJ9Wi3)RP){L}&iPGSjf*GURfWPAOu<|FdS97%Tm
zC+l?BhSLusz{bu!fsu!EW&eL5{AWe+e>RW32Rl|IHE>tv`TcL!xghqg(`z;s7o49Y
zEiDWE%R2;Pf%$+w&<39H)gS-MjZ0L;B_23|j$hH8OLFa>Y5F(Nm_P=~e}%s~|G(Y1
zM49pg3hor9I=18p-EY{zXCiGt#G&YnWlBpf7H{4Nit!JMc95TvIi#nrQKYBtfr4WS
zbfSzR((-=-N7wvk`~hMu;sRNAKq4`68XRO31&>H8ByS==Ral`gz{DWx%~2#1R;J`c
z!Dz1Zm_1}{h)oPLS}5aR%P1CGL2T|H1Lt7X-$<loQ>(ISTPt{DVW@5WTNb1mvrho!
z#8nCdH@o)7$mkS6!NE!gnGd{06wKYChifSn?6KPw-gwl?{uK*f*t6f|G~&XqrbGNP
zh4N&k8*5C5{IoYPawM<jh%p`;%4BQ^WvbIE)TR)IpPhVGthkPpK}$yPH3todN<95;
zuzmFTMIyiZXW|59WJ-lJ$W&=Yn{zSVR6`>Fwcu*4O9!}LE5xU1C_BTP%k?`jl|)mB
zP~@?qQ#d>MTU_@oruE}j6cv>uBk%?}&HX*ZUPOF|7>WwxKI)zFO_8cu@FM=Zei@e%
zyyLYF$pqhK&BbyaWHb#?SYfnTqF07GUvh{dCOL$8qfm8!Q82ROT7vtVYzx`_bH9Uo
zP4SaEDeX2`o^CT%1J=yBXHftRzG&Aj=^6NG>5uye)ijd9RA}fRo){%00g?JcLx}U_
zPVTmxYRiVE<riFuM&zPvfks2zzVK9pb1Y@Y^}!+LPE32GhnIOr7X{M}jy3Ar@SI{t
zHX`%faNw`v8(>PD`??4tEWV<Mm~0`ewW#w4O}ZR_h_HNRVjbRxwzD|u3O>^}7M!Vl
zOKxEnN_t(M_{k(%vMmbZq{RQvOxY21(~xoNAaiaV$)wC3jDSzwgRo|lEt!`bRdms`
z2bsi?@txeM>Au=TF<DbGZ$KBp^q!h3%%`+!aKmU#r=*6aoHKZw-!eM;2Qy;OLd#NW
zd`H_AKUQ0c-Pg|R=Nk>J%Bto+=6LF(gD~~Zqs$~4as3&L3z^nJ<PagmW{x7(Wj>rw
zA^Z48JUuYBg7?$L-e>EDHZD84>(nZ1n=?*km#tK-dt~0b2RyMn{OHgd63hnHh^lg5
z8ILwod|wGcgg!iW`LH@P?nEhVw^;sz$lryR?Z=%y9~ig(*O1vbXY7jT>TCLLH9NfR
z3;O40uyQ`MG;@POKlg~H{`3TU9i&&GOb*hUJoz*gQiLO{XDP_>^A(}<rO9wRD7ljr
z71qg|vfTxB_wDtXGKFL_EK^gbcB-b?E*Dz8ac}x&e}=LI)E6(qDtxV1OoxwUj`r)!
zU8#=*2OJ4`gdd;7xR0~C;)83Zy=e~B>&3W}sW^l)g$rf+@n>;WF8uoXW-c*xY=h&T
zgxR@$$FGC$dF~(Apdkk_=u`01`hW5Un_)!Op9Z)BbM&UZ=hoR6K5B?_(zI+xZ$W#T
zb^eb>u?s_4_^OWZul!ep`P;O~wXa@}zSCE-8C3YacU!q5874<#LCBpO6rauT!&_k^
zN=Y+)y5N~R#`c>+OH9eL_h2IyZ`^yE1!KA(TBkaDdvww!t)A!$ItY66)Yoj>d;EIw
z1tInH{qV87@w`(bR#KKDS6IfZsb}+t2Xq7jFEr_ri-s_Z%wvyoC**K9LwKZCSVs}8
zob9Io*J)!>a*8yv+#elCJT16e==8evMQeHvL2-3lYE-UMWg`bY=i@cEiu%lA&S8j0
zYi8Zf35L)6j$X1Pw)u_ED}e6+ng1=Lw|8NsYOHF_Yf+mHO}(!&cg*J&a6BQ|BRRwY
zVIy~FHW%Ly6=tBc9r?WtP<Qmjk;F_K&nx<Z51BIzGndcCk`BTn2GQIzQt-A_C>e1P
z*FieW+@}VO<IP4l&g9f4iy8+B#<^Nn*2PgtWa3c+B%3dGc)!(Sj#Y;hhaT&Jch_L>
z*e2p&Nd6r%<B@iDgWF?g{(_a(*4Wr{p3@!q=MMDbNLewl8kBt^FFZpF489?SVXP$~
zjUfYB7s($)-r|H)hBl;K`7nRz`^#t_|JU;=PL!MbKh2+dE)I6MV5G@g*fe_61ZNlY
z?y9-Q*El)__NVtbFd@F9Iy_Qmq#KIMILw}kE7w$gp9dT9tT=@$Ftf+cnGs}9srtGT
z<P|RZ%ue9w$*|p=3rg^|Hg%%6jpD*<($kCGXm;XhPTG7wFDuC&+g*@#&KgN&3$qq;
zf0ivGP>9iuv$Zl1X1N9{3}#-W4iWCJwiEA*+T`l0B;sl*)xhV@%`m7N25hXJXMI%M
z<xLA~DCDu%r5=cU;)TAAxcmsx-Y2nRH!eVl427F;SWE+4z$y<&Ho1{rz|UrKAE&<C
ztbteAdT<%4Ox|Qc9Tyhoo<Y|gCj$R&7qC&91~&j!x<N5<goeN;QaMWdtr(Kon8);A
zWT4qJTCVNjWYGP<yg35Qk>7m-;*Neyf39&3OO8%7&2>jG1MC;{%knpWN)%P|k&m9_
zlXg!A+Y=XniIOGZUyAw+Vw$<W<JzW@3oHK1_>J;uDLt$lT+0O}C32`xUc|M$6Bi(r
zTA`VO(k>n3*og;g$it;<n$#REP#=~wll)|HC0%ICYq2K?8m{&Rl0D4?P1WHRSZSz!
zjpqDd0jw0Wp+?hpAOlv4&<*tJ)`66i*>nCj3aD&SZ_RQ}JNiV-f(#an!hd3|Swpqv
zqv=vuDOTxO(a!N&CEHse=#m_mqsH_&%pBq1=(4;ys14#iw5vd%2e&;SwpY4q&x@p2
zDrDJiaEviJ7OTIo_%24vd(Q?9VO8KhMeLt&?0xqNkXN$RnCe4*MOpYC$pZ0P(pHCI
z$KkvZhhvp)u`z+Og9bP@hvljpnAZnZm4A~85tb5-44fT)h6^x=fwiJEO+P}w^F;{Q
zy7g#UXrLEk4#(3RSY<0-xoaD*`%%?<E{Ro*3&_sIb^AFo_$D0Q+j^}2_N`4<?Uv~m
z7i8X2{s;yBht`C7)@bm6(QQ+pHpWl}v9{u(^%adX7i9Pka%CBA44ARWeE%|_0enyA
zipJiT@4^&W_o{lR%kJ-xX8^V1bWM0Pp_?R<gaz2*|C|4qmSm4{f~~mIyq;vw8WdER
zax8jBr&FTb4Hh|ro5iqORZHcP%#85@b$LrDrWN42B_crBj)VxCDp04T3>EkHBoS@I
zJ4w<F*m_&IxYHHwQerA~{a9`)htcI)dFBmQpwM9xoJGjLE9^xzBdAX#!AJ-6ll+EZ
z1ilS58y9dg`T}eI5fc|ciKc=Xp*TDxFOoTv<7QeW9jxvtYq`8R*gR{~A|`^-v`G6?
z3vmHjDHYTFVxRMb(Lt6HQryqenvk=h`DX+!F;xj81&`cjTIgp=m{E7JCCH1l1DRYX
zl!n|@pm{<|xEqFT?0ro84MP52qFXC?<5jRy4U2~Zl&JS_VWo^m;X6htnR1K9V22fV
z&=SD;yUi`c&>fxce!Gnn@5}`ARy;2SZ67<9i3?!(`4Q6irM?7>^U2YPBaR(>I-B>l
z*)=I{=fXN}mmBW|CT#!Gf~x6Dkrvh{QWlw*(}4(!(H{hQ@%vz<TWB`ou{lRXTsv}L
zrBIr!cnyx)_$~(^$_VGA4qxrDcGAe1Vxg2hiI*i7u|?pVHU0H#?8HUq;5Z&7eFQ0P
zczAzL(=LcGk^ZN#SHG5MSR%KmDXf6hLeEXBY3y7z5}ioxtuSCY0iR*01jLktk%5%-
z#+5hLH-FNP11@#G1gr+H1m=hxuC)>8F?=O^$ZojqT4!f!<A^%;JYj~@;tL>sfekQs
zCJXeccvR%5GI-ciDO<KkiVCiqc|)$&R~wnG*z>@Nkdy2Fw^vX6{cjM;I*y)f@{GQ~
zAv?NB_V8k0jtbe6tp0pp7h(f8mGl6mM=^<1U>$$U@+m!7Xr}di{E<&iLTIbhcv?QJ
zRDtg;*+Yrm<XTJ6e8j~<g=-&H%R1p(hsyN`GZcV^rPaFPGpGx+kR<a$IXEqt9ZAtT
zvcKmRLDP4N4eM}vOwaD9av{EamyJP`NYwvFu-;VTE}aelwow=MccKAO`Zg<insYdA
z5aHgb$H`D(J`ebQ5B<X00z3_sykPEy`f;i+pO7BBClEA;DUi%r*FAL=>sgdEfl~Z!
z=8PYcp8l;BTl0D<0dzf)=1N(*L$Wa!Q(-CWIUJ0?hUSI~pc2d1J2%lWInO9lxm-2d
zOBZ$4dM%H@|M+SCD!-hQM5s;W<CG_oNN&7%u_{?ikHL=0LoqPpB)nK`ZDa5G?8)*z
z1w?EjS(Nu)zzVE`%fp@pycSJxnCK3%tGrO!tz{#wtP2K+H&V-399h7x#5ZI{9WLOR
z-_JWmT`UQs{pI1fmk41Wy1+lo?nM2v;2)&MC3F+ZD6_8s_C7cC!-@{G#&Ix5M*!n=
z*6j$M=(+t(3xuXPy_jqWfG9}Gmd(GObk@=0xd+O|zBNJGJ-^YUur_Dflb1tD*sKL+
zrv`Rve3+vDsde*ZfF9;L!@bFEMF^I+meMm?p#ut8SGcQnq8UM;`HUyQ)M}(rPzj|%
z>(^q=<&7hiM-@7v^5fg~gy%X|0`c3R`huNKhDZD05My!SxB@O9N{>gYjkD2UvEME%
zwvY#+E5~R@Bd3F^gxuQ603^VX<5T(SmG=TIyt;KZ{wJI_wnz(%1Kak%_)?z{66(%r
zi_JSkGXF?<emgI>;9%p{&NB%xvREX=4xhUp&Iw9#9J2K$ON7)ka#b;oNbqyDqgY1_
z_VUotk_RTEvKu`S=cGQG_3^H>6?!JoiIn1S3n3+ks%h_jrwUo_6dQreBVB<bXT6B6
zf9@FNZ!m?Fh?G|5-OkNR?e!tUh@Nes#|I}26`#>KmCdjYwhFk$`Xd{VCa-yF1Z+g(
z5^9LFG6aQN5ODh*|CpHT*$JI?9kpXM`k{^sKF@4<FKgh25pr-Wp;x0AK~jYJW@#|O
z2ADJU+8v|!7UGAG_;v@s=uOil{GA=Dr@eE@#*aHA$K1!jd%NXuoqPDjxx?>JrXbFE
zhlIEnPI?Yb<=SYEv0G<8#01(&9mW9HR8Y?abZc?txBqgE*XuC%+OZvv?|iUB-3$Gy
zvBc2s7S^9sD-70<7gIIPo?M4QU!{!;=QyLOgsO3|IDl$ImUS|S0sJ_2w?bHjb~ZVT
z{@@6@I&n|XiCqgibt$R1Mz8olS!=^6h+}#y*#ht6e|F<+n=W{6tnLmTCVRfqOcMDM
z4`IRUzA(2TBfchagkxdK5Th&C{Ax6^u)cd&bms1F80wfAEbiva0f*pCerIHNn!(x@
zb<RAiZ3&%gTlesKHu?4vVym)wHd+_gZ85u&Z&{d0<!9~BEIFwZ)Efw|jRGR<4PP4{
ztke*O7@IPM#<NuHt8eGnc#~i9->Q0%CI??mR+daYe|Mg<$HAEc*3x-#EoD!|HH5J-
zme%lyjN5JO(Z#TZzrN*nfkZuGiYZPN)85I^kogVwD#x<5_QVV@{oId$DYV#3y!%MI
zGQF;-Eoi~9wPsn04MS>^r7*4<sl9BK1~qz5XTEgiwHF^XS0}PFGxySYy|EmzbNq$y
zXhAmDix02%Re}if)$xI|9-#r&D!71nWt`vbL*YbEsBq*psPJu12-Ayms&6zhr*>#G
zq(yWQBRaOVEG&yB-|p4IB}-qRjJ<dJz&AV~!e#D|cPe@xZEeDQjn6_{qIN#6y;dQn
zR<pf*pOt}>h$~(X>1)^dA?mdAHL_G@*(`6qhER$Z(AtmS@{b=ZmkZ5AT|cya`9I$A
z`BMlNfCeX$99HUnetltDU%#(--|C0g71tQO-2M+YeDRfs+RH@J36mK+4*zfC3Ec}=
z(>}8m)*lV(Uq4tLSOF`Js-z;{F0S7fZ~yAMu-~t*xfJTM<(~h4yK|AXCvFB2%>jkz
z6Q2zHt7k6Uxj4InYnDq|>b^2KxblM&`1e_V7yiG<$N#(V|HUM~YyJ|u`wAJq!_L3q
z;dj{i4m%&*cj(^q+I{O^eZ!p-^t;x97!vJ<Q|9&8AjN+k1!+?&lrxU1Id7x{&-Feo
zB{Fe06GvAc$W8`%4HgLJ)zMCpJ+~x<wNk4z=Lc@VN`Hx|s;?c?b;I%ez%93HK&Iem
z=o;H?>)LnX0?1MK)Wed{(?9&M!|AN%iHj{LCFGNV815>ne+;bDxe6&RKL+v@N|Aw&
zC#Rse01+hQWb(ES7i$pLTUjH$G84eC{IWuKAQ*;Ji&|Upv>~K;aCC*H>J>>59eE=_
z*P{C&CD9tkAi`tL5K>VV$Va<YIuNoJxbsIP@b48*f8yz5EH5>z`YCt$0bkxoq+gK-
z31)X(L2a#bwdVXKkiy-TM7F_5_V7vaa34lL$ZKk;#|-Gp4%!T|&!%>423eliKLhh-
zpHz)sW`Xol5QD1O9$(=?xoHwia+d5FGO`obi>}rrVc^Kesi_u8#ntx%XXBxOV<N%=
zy=wca#ycy8QprX(w`h?OCqZ^V7cSm;Br?#exd1j`pn)$sA~I1^jiVPe(%bqW!jvp|
zD2WMb1`BC9repSQH$JR15{Nt@6}-jA4LK1%dqGL$Mc;W`01*+M*rvh12P<`}t<+3G
z+)o@@g@Wn^w$z8c%#Qd>0G9gO#)VSUxNB83zGO7Tf2_L~<>lT8BJB`Ah{!~j9`cob
zpI(U%%?#<))3gR;8A8BP>~=Ya?3Y4x4E&iSw^r5JT(ZX*j8`hPS{Cy|2(ChNes&O6
zN;X1nrQoO39|R%`Ec9mEC>1@q3Tg~jCxYxqK7O4p$-8a`NJ75N1o?|K9tuY#;O6S%
zFTn)msf`|s8SxJ;;8HB0U~+((r+hXMfhR3Aw`?p;wpIoMcB~lY8O4ssHiqjanvOMv
z@&3rD#>HSyS$D+KM>pr$n)Z!`bhRefi6f&krnw!nKpj{0LAX-?sTK^>5tzjav2t0z
z!Yf1hZ%dQ;<U=A8cR-Bop>+);uvBE=ZIpaD>*a=9&VNdHfh^PH))o{RUzE8dT>jg`
zUpDP}Jp?Tw?)2?Sf|W|;+7LW@3?@jwnSxTlZcg02KqCXqk<rz=(bov0-RAws!AUC=
zRs#enPH?uq6{^xIH+J3+$cYUi{q8`cP%QHFna{{W=sp3jHvh?5ko@?l_<|3^FPGc)
zG$mU^CQ5m?3tnCWq7h<f;*h7oh|m4NqbO_1p7SV(@NLnEz@y^`V1;7fsx?r4(~hd~
z=jsRXfnM`bu+l<CD9G4N_QK39*yi=sjedm%9&W(g(<R|$Q`G<#h<!j$U9L*@{CNY~
z)a#0ZaHpcF)1Fz!=Z5k@76f$!Db90O7nzt2g$iqRD2TkUfU}8zsr(``K@Y<5gdR&C
z$^iR;8{cha)K=M@cJe5@gDS*zk7&@Ur}5Vss~ee9_{#PdXkRuQSHM%E*$uK?X|WSo
zu#O?LSY@M;upkBIQ`U;3_h;Dw0cJH<GNaHJC3}*MYzJHCTLD<<C?V|&7qdtoS$@lo
zWyHPhI~3bKz`+OL1eGTffj-?%1(g6BkOfTF1BsoEfnXyLDWYN@2Ii$m$9uB$w;{zV
z`&AhC1HCv5TQ&!%{54P%dOTM(&U3P_9A`r&q?C1UDeZRC!FhSS^d*eWqlcivSgf8P
zE3B@H5BwER97c*Gat=XTRC?BJvl~QXyYIkCal~p(-vsHBHVp3xtaQvgY``Kx2FhJy
z6%8rjQcE*V@R_^7r&Xp-_h5VMjQKrI1dU&a731vsF;h^$WZ!n8hGoCwtM*Fcc*zq8
z{D3~GfZIPr0x3!9Ok7<V_q0+2wy9qVn$1WAhENj=a@<!Ef6^nd>7e+tMK^HvE(Mq`
z+W~1(Q+f0=W3#MZp}GuJPyBOYA!)x<-3S-EPJ!v&&?hO+1JsPD#?43~I)E>*0||J2
z5<3P85yrPg1<r<~;sQYKCa?vhoQ;(B_hb<?%^3T78gZ~vyBr(#kR6oS!@Vp8r$zc{
zC<4v%FiM4yyx#*b@i}5Eu3osm0iTy44c}9Fls}yUE6t4#Yp%l^fz<yS!nBl?W4WfU
z8w%jCDX1SPM<;Tm_3OPG%s{HwK*diIPPD*z83pspMWF6nA0@j=`UEyY>NR&d9_r4&
zi`mlc1HyWZ%i5Y#ak;$NF-AorS664`ZU;;g-D80BGI+zk6KMyE1)__E*$?(+NdnLm
zL~H2$q(Xly6%`Q&246h{DcQ})Q*OZacKIT`xM}vsoPj;m&B4?vL1s@Jfi)!X2Ma+u
z7xvJb=OEd`?YABD;JdpMH2g1P2>hfXP!ah92nb_@wCDe52o|~CnO{E4yLuVUrr81&
z1-Zk3Cg4aJ538qRI{!?3$DR2J#}j$g2Nl%ebd+qw?_eiXo`eR@Qh_UxB33t&$>X^Y
zZs18>drn%~$S#~W*U!tAAj<5LfVgvSDB}wJjGAIyy$Pe-AH^!CrSdv>n#2Mmhu)RE
zN}rouHG?7imIx!+ckZQQo4UG6ldt|fR=8KB`mWYgQ~cVcim`g*_5=`{ziZ4=_|nt-
z;D40=2zGNeun8C7bNE?~h8+MOY6r}bVUVr=C_2%{qBSgFn`3$7%=zrHDjRW8mC!`8
z`VmlLY%o#b8sh?PsNm}CTzg<ZraysHImeQ^Y!#9{nqaq*_Ok-zuN62t0+H>B1|`-=
z)W{VeA956A{=lmfk;w(P0DX)qC=SELW7R`<46Dsi@rA#_rNB=P;idT$cukjue3d5Y
z8s<w%0ZJ~yAa$S`JhPahUAp84_m0ovVAv;Rc2HrW_Z5Oifs?6Z4=%yhgDVoGj`@us
zN+H>g6_?8vsai?GIoKD{!eme=rQ2(G`y#u~i|xU6XS7IN*uw(#o=QEWm&Ogb-X`pg
zk!{s>D@`LASJ752@z^8qT7Cy`6ju|;DES6H+EE*0YtgDdpyY_>fa<}cb)bOL`Y7;d
zfi%;BHw=ncQ~l6v^Tm?-N2qJx1F|=x!LbYJ)zAILEuq*fQ1Tg#fR*~`3W`{gR@E?q
z(@Y4!yCaWqi6igQF#Qn#e7j{peg8$aL`X$rrxGyeStO8iLaN6lo2~Lbo{)8{T}|)Q
zwQNqO?r~BO`KvNCQ#(Wffs@uz$NPB`l<#?O=~j_1Z}dxui3RIv*yap)4F|X=RaZ$k
z{x6RiV8Aj_7o=ii{Im)Dzfzo#^cuR?7ic5|azt4&R;pjdME=s-W<vp|U^!gSN1n0~
zujvdM&{#GwWAsCWD0UuTtX4ntnY%Ckc3M><7vp0JD)KoVj!MvWPMjVx4`X6d$!u+U
zM@E|>C$#bb$5ir@g6H~l9l{l;CV>*cR(aYr=sH$(qo{sNh30TWShtZdPfJPUJmR$x
zqRA*Hx^c?nvYC{cJapdvZH778{rx(NKc!QJX=c$2Jnyi^{=0L>5`E4sB@qBj3_!-t
z{_$dscBx*y{k@nA$c<8Lc;YHjwlsH*FesB2)dA1wnd>n;2<xEsc2w0{?C}}^bp-s$
zp8=IU7fMgdvG#`uUoJ!4HdVmauFvKF1ta8)iQu}g>VTQ5w_H7?1Z`1PquzE0{HKZw
zBLo+c?xd5v(^Jr|6l}Kg50qzgWcd{h0+0oH?m=S*T<x^Q{enaAw#D+v2lNy4r@)F$
z?nD%QxeUveTZ<fZ2ZUB96Iald47?+C2%>}EYrx9FI5I&4#7wV~q?L=6_Q(s&NF`tr
z9nQ6l?iws8t{}yOw*2)3X|BqgRh>=LP=Ry8jfB<nLK8=1R2G^k@*(hu)7V)4MAdud
z4b;pXm|U&{wXwIuz~{{X&U<=F1Jp37Dxe=FE5T-PG8Q4D|Cdi2sA(4ICfZ^8WT3+M
zN3~y4%)RN`{Z;rumWQ~^uh`Vay$22bl^iNHcQ;ftnp+6S5%Uc|s#dxKTwFBo0Gu^x
zs!*xv>!J>C(@9m`F(55vCtlPR7BIOsm*IDUFg=rh$)DlnA=v|+!gZ(C&>seRq2PmT
zCM7Zu$!>_@=b=N&HFp;tfO9A_`hX6f)gl95mKhy5djkr*zqT8$pyNN;T;KtSSc<HO
zYaOV64t2oH8HOW&f$ChKgKi?xiIgHOY`eTI)DG&2z4&@63@|Q=7hq=DyTAqhyUvo6
zD!}`qtpTjB0{snrnNVhb{LGhn7{6k4Ug=I9IprHtNWUxe^Bov=c@+VMwbF$TzC_Fb
zZh3P`5V4q`m)Goy-t+<sXN$<CRRdGjmF#)Xu@yJm0|K1!A0fiqEW#2+Ognu}0O|?F
zJqtd~Cz^!^&WeRpTpTOeK-DCHqPHe)yIUn4Rw{GE<}^o*zaNmk2+*a01kdUsXzu1+
zhhdoQT|&hLxA)M3)g<9G%sW61)&M5E;n**58gTSaL36|KSzLF#`rH)|U8G2s40-1C
zs`cB5AHf3`IQ<$I)6tg381+O#F$p-BTBm_-;H6x!a9=96`I?8<-Hnu+zhXP9?`_`h
zyTWwfXc9R^j5(JHT+23!D%=MHNKHbjpq_r#oYF|It+io`B*|AyOZ5TwHRGgRYDBEm
zH1jhi=rwx30&~n4t6Yd_C8%*FU6bWMd0u^u;Mfeli4V>dhm_b}<iCuy<!sSSL~SeN
zo;0)-N96y~z*5@_Clm9Jz@-XOL8SSJd`zVJz2{*b<N@RbLDksN77ohPL!j$MlQQYX
z3NeKUgUU)GS1X3b4#EXYx*dA|B2`+L@g6zI0|&FY&z~Q$<=1*?6J(oe8}vYN_9gu}
zceJTu{Y)D5hKSTq)I{^jGNn9HOME!~T|QtB0ACtY8CXC`s@-t(v@hVwW1k@6ZmrXH
z{2C2}BJLK;fli=nfc%gpg*6U^2DT(rT&>&AKo_|H*PjAf4E!8GSI$%<a`2A=@GH$+
zBh@2;f@YOVtrf%bUX?PFmC5^vqsT$`J+#ocv;m}e8fud(1B?ZsEAHf2kAI&l4zIO?
z!^JCQAVX0DN#TL_S?YU5CP)O$!;##S0p-Zo_hwp$Ai`Lc$iRnJwD|^+y2e$a)1PU&
zzuMU@J-(!qzOr2Z*Ixd(<(1`}(**w7d|Xw1PLogniptU){39?vztfJ!vDdCSNQs<R
z*J3hSXG#ESmqZN@BM6>_xmBRGrAS)DwPyl1pe82qMXabJU=`9P>`z@KA0rje^mA`{
zb@V}nQH^6))J3Wf7e87(8-T0Yz9H+bhr2oGk&C9pf>;f-uiU9=t&&&1e7BGcyw<iF
zyFq_Sb9JNb7m;+QAd(&%8XNx1o&HS&T7s4n(dMADK$IR9fQri4wJN9lBxqOB^qsu{
zvL7`zY@&%nMtKa(C0D^6qymgXab5m9KM$^k4FSFIkfGf8Ms=knbA_l2z^(-SDos^O
z17xjw|DG6g^p>7@(8rStDyyUDa$#Ke%vopYe#L=zW`nb%TX5_t6?+Z{$nKdWFX_Dk
zG;`*f{}ynjVXWVi8u06EUlGi2U3mwB^>&r4uks%M@O9a3U@j5VaF5t;DD+Y<{dfF*
z&m4I17oqfDzV+^}A3;3@ZtuPNEJo^oyWe^n(39b(9-n6czJBiFmEx-cwaPcU(hp9{
z{2yLW+ycsOJ5>_*d{Yg-dPKUJ!1K!+(b50g{RcgOEZ^wa-~OlZem=(UqW?<bACo-a
xMgPz0{9Wt+#lF4+&JxG^g?;_s4mh)XnLw30;CD#Ue*k~Zp3>7y`;X=C{|77t<W&Fw

literal 0
HcmV?d00001

diff --git a/docs/image/pack_size_4.png b/docs/image/pack_size_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..40deaa502988c2e8c556176af60f6b7cbbe8d596
GIT binary patch
literal 145439
zcmeFZXH-*L*EWob2NeY)Dk@DuQA9-(iqc|1Pz<0(K<O$;q>Di4i5yfw5kwFOEh-YF
zBS<en34$RIsi6e~qy`eH0Yczioa4EldyMxv<@@oC@B49nFvcFqV(+!)nrqH$Uh~=s
zxooJnWz(Kb0s;bCE?&^NDj*=3BOtJLXzMTFleGo!Ed&HMJK3E(clqMEb9*n}b3xlV
zq67pkguE~nx{my^Gs*n<vu9t|Zn&_qT<+?+3madpe`X~pv2Wx4s9j>eJ}y6UxCjfW
zew_9AP{9lIn)ixNZgHNh-+{7hseVL_e|A5MN97t;6i*M+2rCJvJcoLG1->p-zPK=T
z`KZ8D_4=j<KWsk7esKR~RAKK&qDbZagE0pqbNc&VuZjA&%$Z{Ua(X-87_*)MuUSbS
z`LP}<El_(h7`6bl(x%i4e3S6nX17K_=V8x1{Q#YS;nNw457(S6GYiOj;T5wMb9l0G
zP5c`{RZ7g7%gOhOn;Zp{*=MwppT%tPR@Oc@et75!Mc|J58Ochqdkb3?Z+TMb#5a|L
zTf5HqZloNSuB0u6!FVo9gmX(4&U0H$vK&sgoYjF|l=kdgA;hg5UGSbdBcqyg<zZ8>
zVD;in5tl*3$5(9*Q<(j<*z4!B_AT%2SDY3R(1%dvp1+uTcf#oNqlI;gTT7>k_g&d4
zaZXac;Yo<eC@fFsretuR^O?7gH=Vq=RpNBng}uTC*E$>{4D~NBoOQ^1;~1@v4^x!-
zk^W=$bW%L+>LeMm>EvM?BKbmW)TR^bZ>68TqQ9B4XKQKgJ+b6#s>zQZjK`jlNYy+8
zA@87UT<hwU^IS%e_WaR-cjK3)P=T!Kf(_>rcJ0u<z8CTM@{o{5^?(RPl<}+O&fptg
z0yWom#EyK-|FM4GD{B62`uhIwIy>k>7th@N_1-bVi0{>@<F3MItcuRMta%@lar}yk
zxRJvBc;uIG%*9Rp&!b%X&!6h9J&DqszqfTu`(DB0_azGu$2Wc#xMJ>UskZc1a^pIS
zz3V$p3nUoFecYY8PVSbMf#CUXqQQcvRh5;zY_17xdbM2l%*5N{mmBMnAFhMl`-lvl
zKC`_xaAj~^9_3um)k?8<5Pp@0xT<uhny6~^JeeVCPqw>$SuC@uR!9q~EOa#F6=mx!
z!4si(!X^6F6v;?$$v%@)+=uAK+?DDb-)OXP;(^`+)&r4G6Ctm?n`B-~Zc*Qhk3B3-
z=p9~L7?6<Ub(}5Qu@9CNF0lsFy>9VlU%=PAgDttnu8O&z<^y;xit}vecjL8Lwo#g)
zk~#%ESJI=vN5fm6$TE6|^>#Z9U9RXjCkJ1|;A8x?$CGg%C_8Tk<>a+*9M{BdPhYn=
zV`F-a+56j@G(VIkZ641$<Fawx<oC1Ejqs%{JvYwFqq17FEVGC&Zm<6}b^YVOnxy>%
z`5h=>;ax}Ov0IOyyp{Astg61Xev3eLsqVtOj|BXO$e9^)^61ct$bCE#k6(#!DXEng
z5b>%S=>1`UsS!MFE_L5+i)QhM%c)g6#Mk=i-}w}Y-hc==kQ2yqU1wD(Dj0Bi>&9DW
zwp~~!W;1iPh@N7zDR#}!ueUC5iVjq<(Q4mZ5MXJebaKO3<(0c@6V`}-MBSC@epvce
ztV~ero7Bj;tq&f2-Ku^oICfuKuw4J6;}HV;w}<F%d#1d9-4|U-up}XXrl%kAqSnV&
z@#;g#(|ewY)q1~?Dt@Y|Q}Rr$_Tct&sqc>2|JwOd?y@}fZ1UIr?wWB=4RxP<T|aX5
zgJh--_htUb#u-^3sfp9pugiB;ww&;%NxFTG-6ip)y6OE9#N+x;`Z>^yZRsEP^!Zmj
z_OM~L>m>v?yZ*x7?0Hu(DRlbBm-!=p8!M!HA2Ryi{z#ucS0=Qg{}K|qF-*{UZP4*Y
z&kSPE9+HoOV7H2h8CK=BT=df0zil?8K*z;vXLIl=$|&Uk<tfFVg1)6OdE$MHj_mH|
zk{6fXT;CRR?et;!Ou31DDiK-F)Gx}kw6xM%x?4WAG__Lfr(38k*mNn=(${l{%<zo!
z1@gS-x4-yJDNjMCu=nH^wiQQq>ot<vuP!NEjFO~U=$}e?bT95+_>_sICG#v(Ps_^k
zjkaF7g+F=m)NIh0Ose`dp~La&VTa!yOJToJMY<q+o9-w}m~B0pXmZ+IKmB#$8?BfA
zUtc;D-MOngzb8-Su(G{M)}6AZ{SIahxylFc?6ixvlWr<|(~)_;Ew}NOgKIw8&iIbX
z9cIf|E6E|eC9Ji9VfDJ)oL4sICiOh*so^^jYfM0cYGR~%_~<U?^9IuN$n<E}=-bi6
z5*<r(%P{9K=Tx>8Q<W*LC8Z@=zP&uvXNOPCjQY$8e+mCET+%X>AGoMC1K$GM{dtf4
zt^%*?^*M@FJrbMMHbA%TirO^}D{}tv<=M?$_OT8{pK=<0R6v}++iZ7HuD%o5XqU|H
zE2-i?ntl|s8Gdpp?1cTeO@fUk(<|OJ-8DNT^<q&7#WwNF#YktQfAijE$>vs4jX^1G
z1ic0=o7ajqKr8L?*}PLO{7S>g>X8#_s^Q0D;$2i+V4S?1{0jxKnD_MliUZZp?Fgvi
z&fMzU8Y<2HL+j>r`*e*qTL)79SD(z;8!WDBfoiI%u+x}hQrDjLdyY>W)15dSjpJ)4
z<hzVJE%Od`!Y14%)H<!Pr=>Kd9*FD^iIKdbr=%A#pj(4I_-kZ%WP57UC<)d|7_=C%
zm{|IHzV|>UD(O;UT{^@>u1POrmn)WHYHGyLWK3jv>WzhlN}h7D9jH}v7ID@r+*Ihm
zR^}e(cCcsJ*;<lXg3|Gt&fZO0dcMbI94D9h2PY29rcI7b6-+h!==tU!nOYZm``FE-
z>3#EpJlfo;vFS!eU125nC5MP>z!l?4`AOlqQU`YB`?uW23*)m^Ojg1KbWiJ6M&A!R
zJ$>5Q2K6m@Z{1$p*RYqEBfcgg4x%jM9ETj``MW=S3zU52OHT<3ZxmJ7XeZpd(Llsf
zW^pe~_J~G?dd9(Sn4gcAo8Ug#!?(^;48k@B)ui^%==kV0glg_M|5QKJOlI~_=RUJN
z%#(Yhq)z#?j`*HvJJBCoa(!!Eh~*Anc!R3*z3vFFTYeUP=a-rvL~Mj@YmjdeJ1EaK
z^o!Y^@Zrd)bFO8sX7PdZFqDP9F_qWytmQ?j)NS2R!l|KKaaZCFWojozB|Wc@EBAC9
zX;-<eQR5w^DD8Q>Y+^ojfNpd=?wML&yH0&bcSsO)!|v4Gr7&%`+r=-sdk}_0k7&};
z@rBXc=Q0%Qdd{|aVUtZss0B<W%_nJsSbn(l(yl{C#IR4=U(dc~oPwN+ck3DcJjA9=
z&@a_@#bielBYO9?L53e289)=2J+78gu3SAb8auAxUH@I$F3LS<bHvk_R`HQf;fp54
ziAx6;sy^NS^!Sq~@+wI(E=|i=Gv6bw>O*yN(vF8a_)sZgLwpak%I|acZc$7w+Bxwd
zjhfI3?YcMjYe%Bnr{VFqON4k0ZlQP9v`GWc0XMevdA+>w&=IdC<h)U9^`7dc{&->j
zPG_qJO52Xwn~zs`G5fO*EIk)a$x1<`5MzEsTg-cZrw_upO7&))1vG~a(jaB9e^AXt
zFsnWPsxPXMT9dAuon~2o-7<GETSbhKpfr7`t1LR#q@o~Mv!o1{*jh5k(#{soB3nT$
z-n#@&KB|`Jary32maA|FS&xpaPl<;+L7a;7Rs0;BQVdWXs{I9@oQw+JI>LFv(=lmM
zx*GbPejN%u9q$!yGExt7($bipOJj_aTc^gdv<BuZdL(iSD?S&aS=xO66y8VN!m_v!
zRHXalE{~kSt<#t5v)jndClPRqt4~@C(<f5J+B{m`wic@CsD4mhRFT&1zh^sqWjc9B
z%}m%)4Z~9v-@Ne1D@d2TH^n@ixsaNpV&?25Nw=xa)On=S+hjwch8T9}`;E2^lH2ld
zSQ1yO_iljqEB0$fb*Blvt%~fuj4i59&zyVCO<Andj*;ln%a<W<^BSbIK1;`Bxn^0G
zNxPNvb*G-1tJN^ai>k@9OW2+XhUX^E3^UO!{BHL5iZ*_(&}Y%Z_`(%`>mbvb_d)wa
z&1DW~m+)e_^Mhg?k@Q62q^6{3Ets##Z1==MV^&U9ozmxu5kCd*4u<7Y(435fRY&>a
zEd3?i-1qa0dqs2w8=v<*v#%==@4eL8B*EZ)vs^B%;2x0K3?E;tUGkbkj=jbBBef&7
zWAN`+n&wr;Smn!0%ealERH^yxl{V+{as|Bd#01Rt2)JJ_yz=?5!^bseqGnkFYgaZZ
zvZ~iiu6_9nj$Gqel^q{1Fy1Rtr&OE0(V_L^9)XM60v?_+_J<=-#ia_x?{W<HX^wF4
z!VaoUhXpE*a5$Xwan4HamS2P&YS8a<e=*(wY1?exUtv3h`4ulmU7RUZS#Do<UQqg5
z&9e3Vpl2iH)<?=f{c#d?{UX}HK;SUA-XO3h$WA~IT&)3rd%z#4PhPJ5;}gN0m+Stx
zUOV*Lmn3iSf2Rd5>YTaewPu{TCBfF%|KQTf=HxJa+sl-gmq?whPj{WZ68d_(@RiGA
z`${GU8!ziTm3I|ye|Be^$K&(F{Sw$6DiUJV3Tx9p+XyB{=(unCsH3*$(T2y5uWrzO
zeP+%2@8wC;*`v*#BXOj+7?KaVgv{jz__B-Nvg^1eroBnp{_NbomhJLu1vhNnd-}e>
zn!mY-jj5dS&TotT+kroCMq2M)8ArPOPi_syR4xgE;l!A>e>$6;U)MOqoelVhvzZnH
z!*Nz-V*hM9HfE=VGVC_3{~MG1aWk3`7{1_bXYlt{T>X~-yERfdB!%t&%<g-S!@=;O
z;S`m>+o7M+`PcA&X4roX|EDhWugCxC*^T`-)cjd`_&3!2e->&?5HbIiK*lTAdU`@3
z9Ek(B+NnB{tyG<H<`;@8kv)c%tvIPq8RR1fL_)~?O+~Ty9AiQ-k%N{^($uGj`4D2=
zI<1D)h69dH{-QPqgUF0wWTQo)kZcy}la5WEpmyUI9gAkFW7{Qm>&S8<f|7=nzeTr_
z?@tBSfxi&0D>r9mA}EAJgqvAS0TdEGZX4%w03^;MI|C*>;3GKC_ho?vfJU0(F*SS%
z!LUWb&~W9jp8PEjHRPtQJ0<UkAw?O)5U*<*Vy*}cu_w-E@90w8s%c+VkxbJ60D=Ii
z^EMrK?K`iC)Vlr;UTKf2G%_ay`?DLAk1qGOASl`HPy>EjbVc(Qou#&mgy2~v8V#SG
z-61D7*|rV}DV>21bln)4mVmESiQhI~e+dR5`rRbiO)@Q^DDUI?d1pf^<pr($JmmX@
zgV3stHp1z+AtypG*(@={%TOclp+r}J71qgCR>T`YI+f!l9TPHNjU%O+$^r;;3vzvg
zcZOT~WH2$$WB7e?FQ=!aE(-4MeTj66!L^PIIq4Hc!Z}<b<1fqiuUHU6PL601f{n@!
z<>j0APFj&K6E)aD!=4jNQ9AvS!%yr{877<Kr*~>=&6)oMC>y=4?^eDpTvDQkdX{x^
zhY*w}E)6>Fh7D2pdFzOfcT%!r#bqmt2uhtOU8lQ|-jYA0kyhD@g$_t%V_OD1EHq*&
z1GDHlk~k;yoVV|J(n59bIaAv>H~yD+7$mz)OKft|igY7e=eH-S9{7gyL>|7B-N0uz
zl@4h%?Jkw+iO70%d*2^Xq1|My1iyKrp!3bdf9;I-`mtn)^D;vIR)+^6xVa}L#Oubv
zypU;)ebC7P;6LQdS{=W!ECgk^HzA~G-jxu%(n446^0u*+U1rB26h7#N)D*%DG*Z57
zvS@D5S!%RKQ!BJ#R9jDb)+Iw5;tN7>9zL#PshK=1MF<uJuIsXhlkaB-!chA&xVF^b
z`^k>W*!=ygfGcHn4R)3s0)~b2Bn1yn?E8lE@_pBJ@Q-|0`DT6SeHFW#T?hZOs8qq_
z(N74s+MtOS#o=M52F+JS_W_qLeMAU04noK24);dcMZdj-4A0fr4;^eY4@6MdnKXCB
zhZ{F`ug*7y34;_$5(r-#9HT3DZaTujP=igAqatRlqU}@*mUk0^7vCkJpKkkY3U>s%
zIjTD^rGCeGDp*cb`)09&#5wsM=$~jfpn7l4trH0k+<rp5ohIM%gNQvP@AGqb-O;|j
zy(MD5r2o=&?}>Rqrl<%MLOiJ|HkknYoj#6{U0$LgC}gnT-R<<KfCxjHPMN!te7{;J
zA(-a@Juy6ERF2qfXLm!spDF=^nEIa-o4k*)m7QaQKNplNjtk7IRSM!nT6m{MAyGl@
zv=lL9e%MgHU#}1)E6wXeP?R#kal;-&svpaTy>D$Md%z$o)8MIhjph4qs-b0jv}jRG
zgvb4o_c01}NG-`M+zrwZqo)^)yI#cc5~P)Dqx5!2Cxy&Yr3t}1n21n=R@QuDi{$Xe
z7+d|IjMT6jy`#3&TD2JTXVtvw9ZV(&GaSrkH8&?kvB`!i@GxzCikc}}R=S6-qdEEx
zL0QQ&m+!B2mPaRQGw4y=szZ6I@!ZFqt`j*XXm$))13Ji>xI~y*E;=GMS#k*qS>dEZ
z-CQuQ%>0z05E-pQ$ckknnofq40c2KO3>H4jV<IRQhf#6)O2zxdCexFZXJMB(yKSkA
z$@hfdb;rKwl(|L)lYzw=$jZoyETdY=9lt`!z6tz_0cl||n-k8+tkcnaAFr_GE$N~^
z;2&#s$`*ehrf28KhP6=;(k~L)M7S>q3%QLg?(h6M0|>!l&b7+K$k`%s`{nk0LT(`@
zd|Oo@4AOfNtog$gG(7I2za_nUz@r?oP*$!W=K7HZzGPu!=gN;F4U;1kr0T4q2*oct
zV&GrtQ4>i~A?+^y^0(@b!+5N*IBq@^!omW}^lc;r7rI09Tj!V3fg6n8jhgajM<FN&
z^iXk}+eY(u<@;wjwK|UM;ij{e#7P)3a&7MzdGVz<v3S}{Gt2+;0U0f4o(!I^=|N`U
z)Ow$?J*UGwy9}Q2ah}IgJ70}k@)5-=7SSQ|V_i^4I=vxE?-W^0%(cG+vG6`Cp5VZm
zw#J*PA=xz`H{jc~kf2C=5mLk#Cfp-V0nb8~67E?qhaud0k28{3jK+pJw>gX9{1g^b
z)a98QT$79sYH#AkGax@t#?ZFwstIvJ_II>?9_^Q#%D%T3R)=?b*bT=Mr{Am9oa1$^
z6dB!!RtOL61h#y44u<;voQsxqX2M}y*NR(21cjJc7g|UzjZ|34@%16Ox3|M#J<eHi
zL1p{ICc~}kqQa2nwE6T$6NrWL(+Y&9kK->0li0l0rSU8f&!wL$NcsAJ91F4`uRGDs
ziZA{aO_eKDCy<t#uq5|u8rou$k!qN@Fcq0o;D!&ZNZw2u>ndYu=scmnovzaoku|oM
zndrh^Xbm&cgUjdT`#M>zG}fSGlN#yDvoaiTC^=UM!SU9Sc5QUy#o0U<1n*}-`j9j>
zVq)#ta)!`!W?~VM$y{6zGirzu*?rI!-D~AfT$+-xr6zY<MJkX_o>CEWt=fx}R`Av(
zbu@T-eu)~~J>2{u)-hFv(9}3_f`GOjPJ9t^vMAAR%wbfHiSU*oMx^FLf~?~e!YLBy
zxW&<?YeJNDbehgmjUo&}UMWJ12Di|o^wg4)LPip!70C1IN5x!czYs<<pYB+m3q~|I
z-fbe9s(C^O#chaRl!K-paBXF?yv~tQiJ3FsgrS4e_GHB91!bxiOWZm!Wd74DLa=qw
zDWp2P``K<6IHta3cD@xw2v+KOVP`eVDYc~zGxs*PFq+<wvsgH1JB+N=T|3g!+ItLy
zr3SAIdw;2KnLuKm3a`^yx&?1;Sy?^<a*UflY3cQ>7Ga7%d>(x3n9{j#QqHATN7FwX
zL8&?RC6qk&9E3Kwb*w@!kxdOPjL#f05JypieKToZ=?_b^2YgNP-d8e&$R!#4>ZcCa
ztSMY3Sz=DKW4T6ix`N7QKXT~DaVt2(OpbF8gMEwKo4LuumQKW9BDF;mYe+b4{-iGe
zDUK;bZ*$VpnaWPuyj&lPHRNkODFYJBrHWVu{5N+ulCu+gp^Dz1Tr*3vLN9q}@aP@v
zAmH{(Sy*`4`$y2<9XuyiFj|Qe>3ZysRFQb&o3A*As&tQAtCQfTBi}!j2hMtUZIp<(
zuWcNR|Lp~#$@TRMu&t?Kv=ey)t*hW6VPOohuzvX%VM>(z7C=Khc>ICa%T;r{?1t4?
z=MathtYOs}c9N(iD>g}xgF?t|I;0M&r%OW=#KW1YJR5JGM$Yl>Xe5vZT9e+Q^o2e!
zSMkQ!9Nr0pKq0=_iYYk!VPd`YI!m1d0`%$)I8s-Wu=vn#;)jWKVdgLo92dNsJ(}%3
z(LU8{SRX1=4?ETnMXqGq#?5+se<4P%Q<ln}D}jnlE;~RWG1|wF>d!4AcjJ4PW`P6B
zjM~OcFq)+K2#TCxgJbDJQaOTIH(!SJ@~!Gwom5>oO-Hj3jAw>jozAe5e~lGLtMTaD
zi;Vd^EN+#!JA0*{iqPTL@d-!A7{(J`#tfNOYD>zb_k>sek{BU4dG1S8MBgzm3Dj6E
zUYPH#G9BztlkShLj|wx^v!!kbz@VtL*Tqr#HD6P$qgwThD;7r(3+f8YlHA*jf~2tv
zLcO5~o#A<!S55jk6I(RLEB~@J<3#J34DF^oY+0I|K*ek@V%q<$y3Ar6f<g;3?do*9
zRLGqGsD`0L1B;g~H<IsPhe63!xSN3aZ(Uws*v6GFk5&+xCf%RejfsyhDwkf9&H1*L
z?Wy*xw6~m7n8ah&mnvC}b4$7WV|F-gr^JMi`5!kxtn~#LS4$ifM`Vr3d{4ORL0bBV
zRD?0C(AL%(I<3TmXl$9!tTq7aJ@`{#&kK++)PwdNgdtH097_~BILvTH`;k|jasZ=D
z-U9&pblf!uZdK&-e9PaVtIhijQRaSP|AAOx`+r#DRFS&xMp3HjxmeqNeCW2xKqNpb
zIK2GpQjm!sV@ckE9YV)w%3Y~)fn}@*h*VA(RI67J3Q-!z=m#Zt!3H70tIi$j0>-4L
zEIt{4u^U=Cr-CVRFvwf!XocCJr%>iMQ8}dg$?+tGaQk+ZX3f`W+HvfF<e*psvSY^G
zj=f@&xy`UaneX&3IxW_rD-#ST1j%3f*~(gROzx|84>jr_qz6D!ECWGF<kUy$89Z&G
zu~-g4v5f*2F}jl6_XtW}38B-zZdg<jR02{c_Jp6)vyk~ZYnt)|7l)GX4^4*-gy(3^
z(h%O)J_F}0tpLvcV8M!1Q~*W!BF~+efDnSY6VVD%ErwsV6V)(Oxg`;LR0Q=GLhy@B
z#OTQ)b5hxl2UqlON*K5~X8R)L`@6&eq;<vsILC7(IJs{Y>qhv0AL$cE>j%|bp$=XL
z>1?wUDtTS3&w%v|ACa0Z?wIx^)7MsC?-Lfi)aXHEqxA<0L!nx^tvIE6gwuCz^a`pF
zvHOIBB|7<$(qlyEvh|QmyGC;cv1QmeFUU8B)H_V)UB5^<r)SsM5}r+VM#+Xrr)Z{)
z=OT75u#}(>mt83!aW$d%h~1M8TpBIEeo6}1%bJ!C8@v({{zaMPV3!>a8{}1OA(&}s
zTffU@(LJFk3IztOcXjvtq9fT`r#w5$YY@@^k-kC|sqV#oVb|%zX)29IQRS35)F}G<
zau5~p7V`ZO69~5`@-_nW-fbnMIu@sZ%o=D(mZhHpNca0C=pgU=T7R~x-a8!|-o~O{
z53;W{M;kbON?v2hc^tX4k>H$rU9WPHCkB;Z78>xo{Tywn7n+IGD7e6`lm%_v=ko-p
z?BJpJ?rMJ%tFNHuU5Z9D&%DWl4vMCA5=a%%Vc`hx8Qxwb&RJbf%#|Mja*2l%NmIW#
zUI89tsI>`)SvggjgikD$!NL>54L#u&WLetA-wL9DYFQh>jOu#_5`}&S?Pg)k5m2DH
zc);9<4IIg6yUtd<z5Wj13>%rp+v%1pMy`XR_%>0bzR;%JGo;YanMFk2)Z8oqT^v0I
zf7v#OvZm@RF>aFD8np3sPoNNwLyKfTOVa2&SeGKX6Cf1R8sJott1d3GQKs}TQr*e}
zmgj^j!}Xq#zr}P06<GsSS#G{1%mikALyxlXU3m3uxzM8+T-cyE2N`}RVWGsS+|N=;
zY%(nyBRl4rsje*M+J*z=&jfA02kvV_O2seKK_xuO>`j=Z%Ci+dlg0$JvDB$ii3tRf
z{51tX24Z2hB@8m_znFDXzW+=h?W{~|G>20hgo<Ni!;d4?_s&5L{P~MZebNNdIc8)S
zI6&IH^~!bm?jJ7_O0HFiNJrR34<F|19)y{uPkH2?CYt6+eQ_x}me;8C18p}i91-F*
zcL{2coNbY47mZP5PJco)zt6+|NI13f@e6@eWytJtR{9d9r=1iRQZ#ZKU>i(Sr@dbV
zh`jmd9JPL3ES`RBLTU@6dyayfRnSV9ob4(dn!=GzJ&vhRShX5R)wCwyw<&?$MgV8|
z7?3nS{kTma%>q1(4%ZjlX;sP3;xU~TvfRj}cJ<X*a_MMZsMqX1LNTXnOs-8-8`|tR
zG5r=3H@Fa8v6X;Mllp`!jkFuXD7M1CdN+I|kXoymh&&6<fP(>l0fpfMHwxnQ9d_ju
z^ZLr<oDn<g$mf(XZ+<VwHyptR?0CW#OqBT<SZy$nc5VT}Tj%US821W1HFCF$=}G{$
zw^nD#LZg|+lFQZZrRw;VCo0GlS;Q$!r;1px!U0a(TLu+tZoX@b_HivaJ}urHJ|799
zm8lYp>Dcz^wlCr7@>*|Qq4$fmMFa^iZXp!r5I@ai-3A7VH%DrGFVh4;*##4)m(TAy
z57G%6geTj#z~|%ua*{eIO$Y&L4hmsN5fr2hbUV6}+n2C~A7Cj1t$nMNFOgTSjnv)c
zV=aG+BZ~T5$P08PX+9qaMdVe4-B&(&pt*%^QZiabo9}etLQ%H)sUk-_@P%8T0~3WG
zNMSko7&YYb>#{@ze8_S&qGPOO!(|sQJ5uO8K%k3>j_R*Lnc8EZ)t|jAJMbC+=hYMK
zmqoNQWD|xl)LLs6h){2nqsZUjTA&s=gnCoXe#EUZb#91aaMbl_vm+l%-gaJb4d+Ai
zv2zP&{jKV;f<lF`yiBQVeA602)0(jagsUbT+1dIk_p>!LEmdy1+%?v253{ijuw|gg
zkzT${+6{SCu*7FV2d_LxP6Y=wo8kJZB1;Lbg$%>2TzQSd`<~W-@m$+fgl3_rQRC#6
zu?baj9$=I{TM>qi&%yULAE6uXVcl0%fillp6N|%|ebz>9e@7Hzd%DpQp~bz-*M;#s
z<`&0GgE-FuwJ*pLLT&+^>R3-=5xc2Tdh5fl>VDFKBTdM+qeiju=JQkDz-`5}69_9Z
z?wYd?5XIJbV|4O>nFOJzEZ%M|?>epl3-7jQHh9r3>T;RTLO$n6wUGM5&&)TB+|NI{
z-6`;A&n;(*;M@pC6@M=O_SZmZu<2-$^uD<X9Le!5On;REDJzc*nP)Y_Quxb+CUz*u
zW}n>+ZAp3?(!vpOjrKm1EFlz0d;r3sMGYQZ5}H6?iZ@NeZ?p)Xc3X{Z25Ly%*JEf^
z7xVaL`}1mIlQUPSWkfsmEQ=pw2=DKmY(g-H7#(sFP!fz}Vn}<p$i@>j?w$w5COhMy
z5Nm%t$-B{x2=*dr%Wr46bpnx^I9ahPA$m7YtB)2#FZMG~Y*yyYP6PTTnt_R1%si?!
z`5oaMHFyYuzY8d;ARZ=8<dRX*KBTmN2Q`#4`&<~7f_qeR(3~(e+enQPv6!F@d7cb)
zQf`INM2eNgCY6$PEWD5;_sTsfN30X=us$OjfwQpgBPhJy=OOb`zJL=dKLHRlr~~(F
z0#Y&uc%q*MA-DV#u5UJ05t_opB}aYoHxi!pR_VH;Sw<u{O?dHXbAe*8`U{@P2hTd+
z^7YN2>y&+eM4QL5l)i^^(E6o(Sbeg{$_Rj$d`TaaEt<G<k+lfeochTO_2a8v<fEgO
zx6q3p>g;LDBB#YDNV$+-*cF(B!+esJ6^Cw^jj&+!s^)}B)g5=qr5*`vPf)xpiI^Sc
zmP!z(KM!jCLU`)HE-f4GL^KbTc%oNBne(aRNGp}rWQU3xBv;LCe|=XLz)A-|b%{Uj
zVMi(mJ3sAK!0Q`|b56U)fnjiLm$ZqqEyXfc8&JGIXsp2dBG;ev9ab0mi5RbJK!gh@
z*VHP^+B)z$C)3(M(OlBvu6fFL1(AxcJ2VM%sne0{2FT%qs@N5o7uQz$r=mnLvfA4E
zqu0MumAkT2n1-~Ll&M3NtCPI=Vy+yDRF5t{2CyZdhqBZ)#NMMh2#VVxAy_XTHlWwc
z0_2oGfLBAw(3Er3ASmh^akbdWYnAM%-_AZm(!9$0fY9;3_MPkwg-o20FYxgh$VH6a
z0cB)I_eKErg~Ci$3)m0lYOspAd^L=|P?iLAfTdPAG#N-6D(JyZR1|{*kR7&tRwJS~
zeDrH=r~!E)6M^Jc09?NSm?U9ipPnaB1EGv%Cqh$1TBstnYj6LGRg7KOP)8zS`ob5>
z9zc-~sDL2rZ%1;s(Kygt#x7O@IsSwLY_*QuQI5F5dwm7%plWHYK{Z&Mw)Y8wZGubL
zqObMT^oyDkNFQ(W2>s1~J1{p(lPlsAvGV=F!q9rT^A36${tZ-}7QJy=ckfxe<;r*(
zf|8OPMQ^?hg@xPC4R#5EN_prTZF~Z4?nVzFbhL#rRzT7yt7t=nx4>rkd2PKdbuB4J
ziApoG9aD0%`fJUVE|Zn&Rj%4Fkxm>OgYR5%u$$AC9cn-q=keLJyezq(h76+9!Xq3h
z!+-A3<hvLYb!tm8Z7%l)OJ)xNT~Z8Exmw{&a&4%Q=qOqiTY9`XqqhUs;2h>xr@vYP
zb7(6f?@=6|Y{0s&1|uvwV28%^M-A*xH{xw5TX`-fR4Q6PuPAJ^GVi*h(d7;^CB`ZM
zF<R8ALQ9z)C-ZLle=zq~t`Rb#D`zGxB!L#jL0hWawnqds-`&n$FjLX!3xe^PSz{YF
zXl9@JB`PIVPuD*i?1Wn(g2GoPbo%(=8~}gh05Usy+@INrphW6{>hhutH7df-M?(6h
zmzf4qUD7f}A^ftRn|wbvt6nETix%qgHf+Be^ESYDEgNiBybR>~tCK^?fL_uogt_H+
zPR(-cQL@_Zo`9EmV)a}>?ruH^I>+^!ZKAKH$16x(NQ$*nKdtEwX3>~To9cg>#!^C%
zp@6~2jki-@YJuF=_PR(KJ@lbbz96kAH&G$}<jts2>zwXfBlNTyKhluoEz@eMJ@ysh
zT5=3o5!Qwp>wHLr7e<DRBt?YGv+Dtw&V{DjZaoNv_$Eb%cr9Im)w@c&`=J*_=LhnF
zFx2GBCLrdd>ZwZSt2dW9o4?ozT&`fCq6ExtODUnlOl5YqF*AQi+!;8AlNufY7n|$_
z^j@!2iud+07N7xGt#zSrD)tpQOB4iG-Drsv_u#}mXAWztQWJ=Jfrj#*b(0iT;uEmm
z7*a;uo=xKb`R11SL>W||2dE#3wyIghp2av^X15|zJDa3Snm5!ycFyS)bDyZbpQ!-6
z2i}9^XRUEuY%)lu?nw7ATVBMVRbFt;j_BIj>ojw!I7D6p+xk+{YPm4b;J0A+#k`5n
zCbdR)^4W-oX8^c(`Hn~^#`-N#;{iftQ4o2Wy-hud2=5169t`!SNfC-tM27D=M*K)P
z_0|Qj0`(?Radx`uPO~r+-*C|l>xMQry$AQ}#j%6b<{jDObKQlc{69FF%&oiP{?j=u
zcyd||TGf}#r7BMght<*iFC0vYx3emwQ)zd-=r;*{KZKK&SuEW0Vh*jlYvuD8?QXjd
z2tN&deA+w<?t7hNXUZ*nRn_dpBd82#)rXQRq-!$MVFN6>c|4&wzwZavg%(=4@~wpC
z#RIj*ZV|2-5!}Bo!oR8y#exoG1Bxqa08GIpG1ji2kp2`ra?XP=)H<z24i|ydW7D}x
zp0KJuCl<7RZrEIXya18Mf141yTg2r?BEi?eMeBHT8IUT&GwW&mMX8kPAcpN{`t2g0
zD5=5YZ^Lg_1>4t8vO%Be2JW09jOkbM1H*zEl=_s9mEkIAvlWX~AXrama${*T&gi2@
zW`<jh!s?W3ZqTPk&Y;_>60A;KkK8RH<AwukCT7<o)Z&jVjqPpzpkWWd>ui}Yxu*`0
z0X6u008I_;9Ahnh@*qqO@aE%a<29y?GS_Fj@qD;UbZTbYf{Y9(wHWjc2?jb23%g`$
z^#a_Ow~Uxxye&ebIciC2<NZl?+!<qFf(8{rUq&-C2=%CnuuSftD=Wnp<5F>KRayNf
zXDa#Z^6b|<<^q!rcor@XbM=zOBHx*hv*LJ;Ey20a^QbW1l*?vdHaZC972ynC5Fv}~
zX#?*5s~Zevz9k5g*`Pcz=2|~DJxV)j0i#7#;myGcX}rKh{_wa@4?AdD3MyG5Q^6&}
zXny2UnJnuQL%(}aWy9edKgC67Z-t-W?0%)7V2`_p{Od4DWjAq-1BxnOkcr5f%eSsm
z4zK7`gbp<KN&`M5#~$;UVXayl74dFaxj8feY2I@WK{}=~T?OL&p(?Iz+#+0CQ_S@W
z;7fa30ab@oYSuiQQ{dJA1$0@32xg@V(}fAo6y%i3_cxbi`TBxJrUdw>Me%%c56CYJ
z$+(dD7jRH}*n!YEYDEfFRtBvZ(4sYRhYp4SiJfeN#z9bbxPkUcH#w1z)ieHy`~h@u
zCMF0&sLV)k<qN9nsdNAx!wmr$C>4<5JZaDiB1tf-0d$!@zb6FCxWnofH9!t0OQdYJ
z0Df_U%mD2`IbQ+^_daHf*QeqO3f83(D|8o;KMj1Tt9-B)k>@w!x>`RcF&kECJzfcd
zvaS=9Gv?-bUsuaYPD&5}13q4$hiFK$^9M~{D_;*%1|9_HKES(#Iicd_A`d&wKfFl^
zRH{|>UHDoJY(`C`hoI{pHJ$ahgqObv@oI`73>{h?d4r&c>#b7f<Dkp=<1r#L(YL}{
zsrf^x#|pw*kds1~vV8^W`VmHhP6h|$9S4~gcDmaf3UWr9z8tiT^I)*!6xK~xCMtx-
zI-yj}dP@rS_Zz68sN4{LXgzS&I^{ZH5C1r=!;p5|D%;my+RL2r&B4j67NJC`OzbO;
zCu;$}45GOY-W8*SWazd;7+CxIl6H4^7RG?Jfiiny*nBbHinQ5FW!CMwKVK$^ps>z1
z>`_))etlV3x5ttK0eWl37lxvYFfdfQIjGv_Rtdj)f@9ZnUZ1aTiA)1N<OgRf7iW6?
zXDTjuQrXbyMsxi`XotlHbm$7c!=K{iDS`--7f#(AWMoCKTZR@C#4kVjcw>2ri>k5x
zY;C&IG-s*~j@?U7^ye<A`91{FT5r?BWE^eYxX9fDebR4ffiUDT>%|J*XSC|Yj0kK6
z?mDSs*-tMxEqkdZpD@WUTdG7j<v03KC-G7-3U~!A52>+9m(en-x8^~FNjo<>C!Yly
z7|u(nF}60<yB6<lOKxrm<z$B~Ay>}khn++gnhu9MfCH}tLRt4>7`ZH8ZY@nUw`E5S
zs(`<M_clN;`LwbonSkHpWdimjo7`E^K;MlXN-A&{%qSUrxL6kFB$(k-;Ze~izPswC
z)D6d>4c>5WDD7_1at=c4$>c)8@{^U4AkmEaP-_mRTmi<r;3Yik!C%R&+Z+TVhhOW(
zRKQ>rxB_mUqnu4EWId+KNNV!66}@CdJ-at%Yzlm8%<WhLUVh4`mbT2lT|n<DE+Aji
z_Dm04=-+x5wBbAlJac)>rz+TlsW2Q_qie(USspf#LDUd2f3aJpqo}9G|9cT_d^uiY
ztqJqfa2r@~$%>Drhn3!k3^;?qdtOqp?8hHfA(XUEE;KB!!g$}6QvLE}c2G?(>jtcg
z4?Z#QO+D}veo3y8vvG=XcERfZ)F|u{Z|71G6bZobY2H0A-)|lRT2@6s^RZgSHGxC^
zHoHNGViR?>h&wYJ1DFGZ5?~9*tUGa|zal7HBhazw1_WDADj_&rTNVVGFbj_}2UmMN
zN}B;+><?<Q94#oM4i3}~a*w+2qDU2o`*(|jLSXqbf-=A(p#8PTC<ZES5eQ%MH1lS<
z4N0Lk9`{H$eEVhz!EQW2=&^_2fZO<cf^Nrb0VtfFj6St;&A!F28(en`)UFGyAiQWE
zLuRoKaecl#dYH;mV=NrCCU#I*g}L^sHQbCkow5NiNG~D5PIoy~uA~Km8M~=D{0*@i
z;t(6+Wq%OKU~5hf1|r<b>MDR%VEAzTkUE1^;Lb;a?x5DOT4nPN9tc}{g#ck3ZKc1!
z`0YbF$YH(l=xM!U@e9idoCT$sVPiLvguYA3A+bqeHH<873dlXwMCx_A0n<epn4?FL
zbC$t6?&HuyZE=HyV|8tqH~{m<W}pxw&|m7H8e=fjVZ+~yw;H3;5<^b1o*`W07A-Pr
zPMkc7)NRo?h3p~nY-P1)QW3CiI!^Fu*84UFWIf#bf_y)L^Cb$h?`?%ukPs%0b%U&J
zSbPYnn^UqMS!hyWT|arsoWK0tl@J`1RjbpD1-b%iJitd!_0Vw<)a4PN$RS!jx0_ST
zyH-fP2KU7HT(`0S@rucb_MHHw=55f!URpLI<DJ=88n`$Yw?gx-dd&*6m5)!L_5NK(
zfC7db^sHCrI?Za&Cy1kDRW(a;oKb!i8AQ9W;S!~Ha$l_OT96is-!Ctj>ut+|GvIdt
z|Ng@PHaN@%+%2vY@ERI<AM#cbw3-|FKOQNL<GhA6{Ht<-1DC%0!-4$*Q29#jVfc4p
z&;B4ciUbXBfUb3VIW(oJv6>L<kq&C?>|-1hHF+aRIyJ||R#v*{3$6E!_TyCT=Y0)^
z^0&xm&~=3xjEL?YdQ{lvOu(i!9~y)NB8T&QL1V=Pn=s1&1X!R29En>42ihxN(1?2(
z8FF5Oe+XKiEN%)KVCki+^eV#6X3xel(Dhgu0Ca0zFR0fS)puXW+lUyOPSbwvMy7+d
z&r0h6VBkhDvRQzIqi5=2p()As2gR;vI7T<Ujs-BN1c$R~rs`-G#Dp{ky44j{IjOel
zO|)^6ML9qvz_u;ua3-l;lR3Hw8caaZMe681R!(!D_7>kcwn*bumff%P-%!+BuVDpf
z^vRtauS^%Z6MF|cdbp)<M^qfIz~788^=`xpjT_EAYQ`YqS;=E88K~ANz?e%FOjW#;
zzRjfHS^#7wV=hjqg;CDyJ6ie)B&~w+k+R1>&3t+J+XbN`F6;gfJIz62S>ji_hTo09
z5D0KmxQhhrb3b5nJQINAr28#+tHh{4MJAwr;=T&e$^O!6xETQMRAbPx^|~Xt5bxUI
zq5TtOuR<fo)XJ%&{;&Qc5BLpqSNaQ=VV>DZKb(4W4$9^4arm93UmZVw1l+_kT5<5?
z|9rMJ_Cw|4i=4{2|N3Ze&b9S`#0~LkJOtqtLN6ItdH<2|HXKo112kY0kJV*<u4;9y
z1=oN?{OF5m^*>f>vMIv~{C{T(A-FZ!aZtBBK_Ea%--c@<e~Wts`|g~1lThC4cm*O^
z4Ry<>bKp5`=Jnd&R+Ms|$IKL*xwlp$09+-0lRc&LU(ea8v#OFadsg1Dadq_n=&^oM
zF8}u@J0mKm&buEoeE4%q{t!U@**^#$ez~sg*Lz*N|JnVYTm%cRc2`Lh{%7O=z5QS8
z+P}v6*EqkQ%YTp6UwFd*>&CIEzNNl<_^gedn3oI4P>Jb4d}<jPa{Ux_DSWODNKjBf
z4QgzPk(Dk`2Ldk>`TiRdKrO8sQlC_z0*YgsXC31)PgOKuCUz|G@oqpnU;%ITnkZCn
zPN0vO$Rqw@Z9l&Xkkb844t+DMHel;@!L=L~K-kms2)DB<uEz4W!ha*B-3Nizp{oYK
zVkVu|R+5@6^`fU7^vmle5P7(QCP)zYrVD&y&t6mla8|OZA+%6NSH7PT37YQ9Yy^dw
zY**%=>3#m7SfhD-NKx-0kIYklAI5F^z^h+oUT(e?l(j5Sr4MTX!Qb#5dqXcb4IAb*
zLhxHn&;og?2(8B^fi{uk$vD{xH9*>NMG3(gx=6~gWQG1ufS$f^wLY3{>C+Abp70WF
zH;%{jjkxr6Z5(1$$b254jE=$=DiIWzC=hCrfkLHP$u<s4_Cv_`56A#rf&M=*;=|Qf
zJ6=^!xLn!Ty^|2Eau6lEJX{LP3;+J1SSqK-M_<0b#hJ8^R|&}7$wW|*UWx(Ic)Io>
zAf9_l2<C%F`wc!vI5BBtSglS*j|NbgtrEdEK?59aYG(}8_I^>MS!3W06+GKLZ(}^5
z5KLyhPR6QoBHjqdM$6;v#`O5iw)&O%xTA;uP%-}BS^7Vp6d2kpw3T<KZ#-fUpfn4E
z2(8)@-b1ic-|m!2eM_e+n=hnb(Q{#@`Gb*WgkZJAXF#?WV-4Ehc2Pja&?XA?NF^{L
z3Zy%!K-TGpu1PPc<|ZZCB}6|Z1;Z^PcXPaqtv$!$5TkqBs6bjX@~IhCpS$fJZ2PO#
zZNKBnGvB@z?|Bo5PVU#Nt~a}$wsI{9H0mpXzRE%QeCRJP)g7k`3q2K^KgfiUf*pib
z<rQsQm`7>{r^*a)I1ZPHD`!a^`dZoGje^zodoGa6j>iH%tJ?@oO_m14%8UOP<j*gJ
z59Hi>g|&qX?o6(Pfb(b0u$3JPz9PfH4B<cLCtvaZ(6dyS)3-z1Iu3|5?>BCroCn7y
zJ7qQoJ$99XT-{s}fSuzH3Be!lJboutCSnC-i7yP%OA&$fJJq6=CwgH96V0MD<vO<)
zNx0J_JCETzkLQQLp_>Nen^nl)BdX)8uL8;+b>uy{`YlQY2(mN3jGuh#wR&!2@DX-l
zpx}yCsL{}0^}Gx^;K0HGH9S-wXAmQz1ru`}0=`DihZ=O51OIO}2i>45EMSJ*UX00Y
z#T4**_9m}uLx|V??MbNozWTCe*?X-cX9GIap-$OIe)t*r{`GnoS!Wr8!7GoDa@OU(
zPUxi(XGz<*h)i^W7tM+gTon!^b$5(Op}`(Fpxhd$29a+%957ldX7aZ{6BRCPi+0um
z!3bT3D~Rh*w`moz$}4wnnzd$Q<jckXM{NJ6SX1#w3l@EoTMT16Ei`z24I>ByY+2hY
zV=J5F=|~#gYV6;$Ornj;ooj{U+3RI4maHDI-V*udP{&zuAY1kubRYyX+@Z6^5bM<t
zaqunRDof>H+FE;|@o=q^fEJYsq<NL^`Fp7RKW_*eUlWiGqJW-EQb>{gGdt`|5EDV+
zWPMrHNdk?)VB>utiyuo~lJ21D9Fai{rOUG6`Z`iKSk1sZtsoREBrCiwl^1&ts=|sy
zbGf{QW&<z@o~1cqa`9nW<ACR@Hxogtz9YZyxZ8fcUWWWD$N1m&&?a|n1<|OD(e_&@
zQaQKv2jBlcuQ9Y2fG=Il=~Dg<x|N-c+0O>#aw00vgNMS1(RS)qM<om(_yT#c$vG9m
zQ>&ADt~Wpl%cRnJChdpNV{*TAC8yr-T|g0n`wpGhQ+QY}%cu5^eJB6Uv)>drhk&h|
zBToE-2({A+ApVDHg?ekJcg|k`>G6^t$W8A4@~^ekf1Hy6fxf$*<~wc9|J67-wS#5}
z9-Eb-7fveKK3NqHt)H?;VF?i?O~UO_pINQ{pn#D-3JQ`p51ojWCL3D9OtV)zYIBX-
z-`9Y%NLwHT$6%q;y^TaH43b$6gM@osBaLIC>VY77|1JOO0phce#v-#*xh9Dwa40ec
zY@Bd>9PgO_A`lXx>3V4DWH^cRPmXfV<+bWIN25OQx5frl777u92XKmFuG~A1+d<5;
zSjr{@WAb6s?LR_E;f^TTENNR=++9og{xU1H?tsA&EA`r_uxogt-5$+S(EfB7WrVL%
z-j~`ivK8vJtFBcDc(mkUbX@)lc)=kr<EI+vuk3UEPQVV~@^d`7(>BflD?V4{`wJRd
zKHz5^fLx4{s_tUe?U||-Co2urr!<1_=nUjKYRuS>`NZD3C=u&S;5S_abXUv_P<PLP
zCURhVpeN`EG;So2{Eza@fX;$c2n5{408sQN!&2lJZ=v-#mk^*}4y#wLF)iel)}Y3o
z|3ASC`;Vtrl0i-*YO29{#-p4;*l=@LMgc4B2J=e-GwSpNW}BTyP-X!}__`7m;?)lr
zl2YA}`5^`jQUyZz%f{`lL#1G!f%}BppSP9m!P(2do*qL`b|j~2>fZ(GezCcF9X}q>
z91?rt><YN02hp-3&Y0idTKg+|f+$saR@h6h@9dMV-sfAdO9M)|V6ih-yD9sa3v0oV
z2QP>WDO%msm+N2#Lh0F7pr87A2O-!JfSTsj=4wnGcsVH@v^A1(#s|e*moeZ)mMwKr
zx_|r@0BA8pd^kpyZFUE8V0Zpx9H7t2<zQUghd+yu{}tL1W;Pv<AdZi!E^=xCSJA#8
zm?dKyrzgz@hzm42f`!6zCmLA3AOHbQ5_XnRkx47)wT}Hge+JCogEybn%m=>rGqnat
zSyytWrB?QwI{RC6JZ`<Fw|7{P^!tky?URCj@ZQO+FZ;u5<u)8lb?6er-V4wIfJ2=)
zGuomj-nQ_Di{bs(^!~)M0hv46TaZl!*mx_WpX~A9tTo`kK5*cl7tsu#|9yf5Y(~IE
zAY4+}Xxz2>D%d}OF{{J#_aDoP{D<SWfpc$gwpI1N>7hS=()HEhM?>ny_y4o;u@8Vo
zCh~H=!QZE-)x|}x4!_h7)wuZ2#>=nTa{o1_$bVoNV9I|@|0izzuj&6Bhl3#Suh;+i
z`Lg*p;QVQt_&4DE#j*M~*8iJke@jRIBAmZCCIA0MI6EUNwI+OJ7a?$GW~VfEXEWiH
z_+KG64PxcvM9wp*6F-5h&G7bJbB`c_!SN-9n;%UCv%28Y-ImeWoF%WjMue5S_S)r=
zaCUT-i7Wr2M4L699j?Khk#jyddN9z}41)6IclQjqbf%MEOHBSMmegap8$!PWf91Af
zLOGT=m^c!(=|Au9{jUj3Z1s1wZhcTi{En=Tm;`X#SCTzE;r&A85(7!j*q0@CEdhr$
z3{gR8qhZ%-dP{G&iqeltz1>7*3Ne{aOcc7WE%zRqZmz?hlKOCSXN+MzD+6Yqcqz~<
z|B6CIV{w7MDZF38)oEp@$%cCyPbw_?pqYlh%UhEeZ2dT}M6nbiY6pK;o<TGC(;TvD
zDMt;!vlS$!9R0~(JhAwuSbk<W%Y>p3lcWWHe4;nK>e?})a_cL5ZmI}!IJzVutmF8T
zo)*7Pc@paI(eE2>$UOE)Bo@duj9RxW*s1k@m6+q$24#r56Wc6rv=x?wkBna(`<qD=
z18i>YJpAJKsa9@-6y3gSsw6YWjj6%sdb*Jl+_CQt*L69VVV!r^q_vLwd(m=ACxfro
z$8AZ`=o*|ag|NabWeQ``Jgy7{1|LjXXzzK98-13)I2s|9o7B_ea(Cq_S(7@~b@Qhh
z>wiBMHZbs8Yn_|-8~pxtK+4e{5%U(jHY<upP4`yrhr*s)jCC^Y)@>o}$=qjcQzn$E
zHs{!w!CgnWSci$LW84uVhRYq(E>Y<wb1RVnA-=>f+!@*BSF_o!<~P_xE+=r@#-r<7
zSX++qCO3q&23I4ov;<=PgrlQfayLBPlXcWZPqKXITMyNW0x1vl5j|8_T3{L$ruNr;
zkI;>vjMIJn>rY3_IrD9-P1MZOQ*L>AOlfm&VDn6}pjt-K@=7F~S?WPPLx~JCXPPxC
zN~Pr{Un`{zU`jf!9M&*<s($T7m~qF)v1JF7&99Lr844>-n)z?Ve@TX9j@?USv}Q`u
z5BE1F231d8^)Zm#A$qtj`;L^h*SEd>dF8X2l6_;y7+h+64WD21%GaoH`%r8aoJ(rZ
z;s@u6EF!wiI+yFciLBnvFfT*R;@SIF(<7g1W@LIgQd;8w_wD#ulm>rYQ&;*&fqML$
zO}m6H_Wt4%VR%PLii|5agiE{g{^sOOq*PkZ{Ogr&X8s5Bj#50PgJg!)AAGsQD9#pI
zA3bY+O6%S1{kdEdig0OAsI<cTuRYDBib4=;XX}pgycT@!3f{bMR<7b+k8z?!l0(Vu
zyBH;n$Ch^&$xo#%y*{!fhK#HVUR8BWGc+^f!;fhe<+}{y%7anfbEuBLqMU3L!0zNn
zWfi#n)YPb6`X<vUAwB9j7}7h@>mifvHL*p{L)xM(dz{`hIi_t@x;5QC=w0CLUN+w+
zCHWerx5&C<Qe}c=?kP%57A{>W^ff2o*=5-uTwL_JdyX-4seW0biZvE0wIUYQ#WQI|
zdCEQ0FOA=q$xN0lf6RP=!s$p}FZtEm{pPE`VkwWT3Gh1{_8)5g<%Xr4>4A@0fx)7Q
zJu*ulv^2gbRSXPkaFtkfwMI<d?5(%)(#@T>pM~rAS#lN<g$m`C#~z6NQp%OH^7s^y
zDfD77s3xs#3hOvG6WH_A)hJuD<j0KgFNGqY9aq+DuJ)mB!>vT<uh$DVFSqh>;IQBM
ztFxK23;YgE{;upaw?Fih7rtey6YZH-t-QNNJt;XV6(K^|)*aR+W7b+;R+>dJPljB5
zMcG-7N)CEuhmuZQzcEDa)_xN&cZ2W1mB`d6LA}X9s)j~+ESxa)WRFo2kpAv{nEvOD
z$-;2A(MJnnB6tM;yi)&ysn$`G=uF8b|4-kRQ5Bak-Unqo{(7pgO<>1iRw^|=gTEs(
z0MEu}dgv+j*S*EN-Ng{-s^#~XkaEd=_QtG_<*4!wvx%kIy4yF@WX^kC@FY89pB<hq
z!YlUwc8+5fjfRqiFY(X&TDMh{gNC67$KJ_MC{5ACvbiE%+_m})gHk^@TS{DjWex@g
zTaRg`aPQ|XbuS~$6zdj^C;GdTcKqd!SpvO`-d47t_?La#erK2>Aw~HfzP=}e=POfI
z+RIa5JnbYe+HGIOl9c4j+?Q?Z)Zg2_Z4J~CD%~E`%Pz~A{szVOx+24mmD!%F`JhF&
zY(0AWhSQvTRT^<K+wQ#I_jX|(eBy<kY|l(q(JS!FG(E)=@jQq%$GVH0{?`c#YYQZl
z6Sd%{R(~cau^lJf?w%QL!%?2r^mOl@`|O#2tMypmh;G?(j`(Kj?qjU*x&z1^?H)fW
z=W-3TOMZyoMiEx@IlJHAyR5s7Wz|8Ape(=QXGh4P{0C|2>YfF%k7_oTUTLGNGMS~N
z;^o0l26Bs%mD8rnmHfWj#9aM9jJ;)4T-y>Y90&vt4oPq)XmAVe4#Az^?ykWtK=9zu
zxVtwR2o|Jqcelo^k*{-3&i&p!Ki+*~?9roP^zPoPR;^hzXVu!<|MH;7)fqAt^9X_B
z@y%i4p~}wx)trkeU5J{CDytoT>sSHD<fqfOqidCRz2+$uBugvVj~MzsbGbnXPtQl0
zNIq(P^UvAozA*9Cw2^5M^t5SWB^yJeYBdY6t|j)i!>1jMRgH~8*7V1VL*pHdR>vft
ztk-QTQ1vR*kt)}I=G!%sMntX!rsj|behWT2d=zv}I80j7#exlH=-j>Q%Fr6+VOwq6
z&bUZ7Z}$57CU?`AOH>{@<H#6D0HhP{6pIKuRs&a29z9hrM$<QVO53$xBEvsLCHM~8
z=~$krg8<=vof~8-c@Z6IN~A+nAcO76(iu!!jilCT3{*Iw7ytWsjSPcW{TkUV1LvR5
zt{GxjBols?#%6r)B{uba4l^~Tnj#ZlcjTLSnd&e>wJ$jaUiD*2hg6Z4e(MMC#$Q{P
zop=gp+x;dnOY{LcNyQBUMp~a|E(-O};=|eeis)}WbmUv?wm<&AU65D83pmYoX@Xr?
z7>pNx9o)rmPo~+pu1i6TQEp06;U_XVA98%*imZ$DMD`<Tg?mGiNx#}f#QTzcGfEJ_
ztY*T)7Qw7RPS!7DW03r%?dM8sG_z}#o#O*n;UHE~If~0Jd@f5M@P4+$Yf{a6D(D5`
zm;ZIRP>z!iEh`Auk1R9#6-B(WjPhA^u?g{EfW<KEBM)-=bEPi=U_{qmK1NtN60%s%
zTmTe)63@VwDdG;MS(Nj9qAZ>}R&q8(%n;!rk3;v(1NNY9U)~H?&8Eg%c-?8y^<kj7
zTC(OelZWDtc<FH>1JseNYFXapzWHZn{x8${B?$M3so5*GQqOG~8Q|pAd}M&{Y(=bk
zqxI#02|B~AvdG8(aff>nm_C%(ynsHf*f*V9hR*Q)?GvCM1Ep^cjP*dXy?5k{t?#H1
zBD!c3ayM~0nXY{jsBc|=HS3!?38>L53eot|jg>o9ES;#GRa#DDvI}AUZv&!!g4IK+
z)V58qE066eQLjpDSHc-IwjP7Sy4H=>{*U>(g>qInkg&w28x<Pg0p97^2PGE|O-pCr
z`_LCnw=${f+a8*JTbyo_z{C+t*fU8z3YREjATlo=o1<%2=b{BP?Q|%Pc<)VfBouy1
zqmaQKtY!N=M(if8#*Br?tmjerhB+P;w0UQp>F>+8pi)0NqXR&4A-oIM{_xKw{Nr?g
zyWao#`o;#XC$NlY2j+rHR8=|_fp`N^qhd-G4@+C?wa)3=ZZ#Yd{0!nmT{?i3(l`>-
z=DY!c<bS?)uMEygJP4H+kg~lvMY#y4O!ir|jkL5-k5p~@pn$nb`8cp7kW*OAl?9~~
zHD$~aU$tRZr?9D2@!4ZSEOG1^IAS3y@;tnJ&ZN5Yyjg*OoK4kPzlsxQAJ5|-!ZTaw
zC@_r50yBEkm{ex99rD>f6H;HgohTpr1U_B;?fO`Zptt96DO0(x=R0H#k+&mo24NYr
zWzoo6v#XNZ>3!hJk#+KbUp}Z5roSlqf3Za|oDp)R*kqtvH?E|{p4VlB*%m-F1tVHj
z!(5lG<nY=8B=G{40f;TM_5w@O2#9UVbfMrf=>ryfPqqrd?%V`oKQ4O365a&cfW^#Z
z!D910d{=X61AHKv!%U03D+L@>9#HB8A80rKCMz~<p`z(0t^mm=jJ+{9_0)z^VXViB
z96e3;6v;dhirU;~ik|mv>0461)HKZnYe3|Jht&=Ol;&^-6eHwj{|xQ3w5xrp9hW4!
z@WM7WwMa6V+4roZ`(GCI+^D7z0j)B4<*rVRQyV6@Yv<NmsxshwVsYQ}E|!xI1mN%(
zI-cG>>nHxHMX{*pOP&WmU3se7fW?uVy4w`|SFofn{>Sk$tm_F+7i~7hRF^-3rZN5f
zIm-MmX_|024hIjSy*QySt~p4Xw?Xk;B+>4GLo?4<(%UCFFoP!63dMF!V^3fak8?Zr
zvS*k(Ao*M0LeyT_5MP|uL8iwY6<BYPeufY1pXm;8=kx-bbi!|CvWiMvo;rasVKiyH
zkJP6YHy4h#2EGUhQem)nE9IB;YXD0R)hsZEP_phzIa;YV;q`}?omFWxzpDXfW(e?%
z2J7VLQVal!vrpb12C17Ld16wQ76B2l5ThrF*#S$2Hm@C%%@jgHVbS+Y>J*7Lp0L>l
zqnNWepyhg**@eB6$}HuL10s~_A+oN>!XT{wwWtD~#0>pWDkbKIXMLkdgw2{Siu4%1
z2ixK6wmgXyfG=^!2qB}^0a>UoV9P4p0brkDNC*1f&z6-4zqSEyF%MU#EO)@O`ikd%
zdBh6EE~M>D!DSv59Cxm%V>-$Q!mm6f#?&te`I>^2e!@?FG&FQMD$5u0b+1Pjh|O=@
z-p?zW4}^La4@IL;&NH^hbi1s4ngyR)vwGWee}WCRs7UepWEj$Of46cP1&TBPo2O7e
zkeZi2C1uZ9EFon#!bI3*zlUvr8HP(6ZPqcgwLT5#_0}bBKyMXkX^?_Ud)otvXK>ZN
zaQa&=u}-EPs(5Q_Sux|eTKP*N1IJ)F3^DwON!t0JRw3t4B5NPn^zUa`$PzX`@9*i`
z$MLz8QF<kCw(Te!<lP7%{cni9M1vhsWpbkh*j_8%zm~lu)@wT|#qBMKWoQ;mDxT)t
zAD;-+bKwH!uUWVr{~}?a=3RL3TUf2t@-Ss)H(&?W8Lubp;4~^iA=%RY2o*LcLx|s#
zCQWdb4`hD=HKfcHj$+A$!sRpc51t{lAwG~nvez!cC|T^(vrB+GAicpI5Qf~2jjII~
zOUS>Gf}GBb;d_{W?x$Ia2o3(aHy8%2>yq_CZ=zmcU*!WO4z5J>z*eT?dC+Jxwo5LA
zHF<gKu3wPD2LuRpp(GusL8M$R!xhq)!~#uor}8&NMM}tc)qYApeT>36OnH#q1uBLO
zsp2cy%EJ6~^b40lo2Cipmb7j37AxJxVL}&Q+n~1f?AEbaRZ?+Dqw<Q3)C(eQb?;+q
zjJen@m1%GiNKP_W=P`KhZ;4e#<MJwYh%QYKGf@6i_#WUqEzK(J9P(K!daWJ>lJ2QJ
z<CLCx*T&8q`o{E(PCssAAQ**Nc|VXg2X-oR6RTVi`_0C!Z^Z+6N;B0YJZo6no`7%P
zaqn^Pie^HYTp0WRA@R9q2q1FFhD7b`>Tej-J@xPgD~6~iw;jPF>uG8_;Nq?iQT<XF
z*E{-pK#U!Jc$PJX3HOR6;PjJk3lvJ18wVgAC{;aDDL5Xngv=PM0G(a$SUmlZ8%UoI
zxcAHo7$)nv-vbMd^YO68_cRxtU4{Nf>xE;>jm5fJb013&A4vY9Jxn&jW`KJl{DSyB
zy8YEmbI-0QGVIHME;qFj9!{K!^qL3{<<9Td(RV7r_;`Bc3N+9VMCn{B?<7{Vzex+i
zh%TV82_%1KEZOX_K_zC|ilq4@aMnXV0?%L1J?t+d1wc6}A)i0Jk-u}lof9({{!Tt|
z`~CAezWoWPQGvNgU0n5paZ81p7#Gf?vYOMom<L?z-KOcV+Ic6+3f7qX0;7hF0`%cA
z)soOXDL$FLVQ4lrDTpMO`!N+5HtQ-s^9r;U-V--*ze@|U{-4MbAeW<M*F-`9AUept
z3(W@m(*X~R#y@xClIyy{9K6uDP*Hh<_@!$F%OeLGl@<xwC9ktepi#@U<VNal#g4z}
zr?*L!(psrBkn;N}@q2JHE-6?{L;i-igwdy$Pg!MvVw+hDtV>)gb;o7q6}x3@t5*ob
zrV3-2IOs54p}0F$w~wax5y&U~fPq*(VVxA~gUjCSa8$A!>MFOg&f1MhwE4R3EQ8oa
z7K`D6mBEA6ycI{b-ech^u_U}h*RN65^}+WuwtH-~cxL)AM6=!kU@+PqR>}YxOMaX`
z+Hj%pB~6z%8SLrPLQj7;ico~W!4p~ggEdSggQ_*d*s371*a`;~1D1w97u%B|k5;8u
zdS%ZLtybD2ca3cW4OGmlN{;eBLK`&p`F;`N;kS7%xNS+XtFX{|+a`*k+h^wd!J4#-
z_xq*eiJzijBouQWDy|hwpkP7O5qy({4xXL`N?r@{k9kwBPS5POtWZ?+z?QjZKn<&X
z$RCexpUKo7=3Fz0damH|zzvRbJB(_VJYTO^d8cvfBPE<8h;4C=a%l-D(nu_RMc_@T
zqLFPDf1^a=J+@LJ?L5dy8s1?=gTma^>U?*_D~lzs$0ZE$3rl&G&txOIZZ=&W;m7-9
zPA_y1BJS^yl+gihjKY&-IbM)XX(o%ORj&Cs(rXhM>8C&n%-`DDZXws|CH|#i3REqO
z){A}jzx)aYW6u`0qEcbn3V;kT8O>ue%JzlRi+hYFh=}3r%)V~20ib&4-T~IHnS?1V
z6h0#)LD>!huAst-0TRP^aOU3N^h(5N!pe;=JI7XLid5+pG0#dkG^F`x===z|*;6Os
zfrd9lI;cS`$tYxyJoABN73--`6AF}`5)a;7H<O$H#6!F^fMpIx1As7(YdMmQ`NXxn
z93#;X0d*tNoLVZKHK!9{v&|*Enck0!ma`FZk}h_49_c3Gupuz=f%L7_==1_0MsEw}
zr}~cx3=s;S=E=KHa>+_x;RIaBr@x?Q7&zc=mhaQCsUxLxUX+X4*Npo3{}7%80%R_N
zy?RgMgBW^CQ}KPD;4+mzKG2cl8AeoDSB9<8z57H$E4}y9R|sDyXBAV55)4^&;{k*E
ziQWSxQ~K#5sJf6*a(f`h(1DI(Mt|&uy3cY&3~YHCGB~c?+gmq|?PgTHT*|23*)78~
z2di0dSKR%(4S`-{1o3vnh2$_YlmBwk+zrDlB;?U&;aM)8l2?*szh9_7N-nPV4v^8L
zmQ3p9e$SauShAR0m|z-3Ih4uyseJ@efopABc6@zeF}I3@_{k5Er%qxB6tmY3Jm#Fn
zcd7Po?mvcv?!j^?9AOaA#Zf&}uFL<E7CvUb@Ee0XTZd!*;g~{_u(!ToDLBtRp0wu=
z2k;vcS3<y5Zq?C0zrlK~x)xT?;-%ZbYQOw;UaYDWi%46lEF~Kyi@II%28F5(TL`Xv
z*^?9j79mLp#WS|i<b1_fmIe%(*#k-r@ntdd^m*rZwCU{sb_#++sG|^#VOR1fZ6cG9
zyhps0gk3DrhDa%q{F>kA#nun$cPSMZQk{3OTgwk?&IKrA=+)e_U$LMc9v)Js#d7=i
z0@eZgj>7nE!auniM4c%G6KF|+2e30QDn~~IMAS>>Ug~z*pcD`Q@Frzo@Y{&2{=+i=
zyveD;w5VldQn+f+&lUVvy39}ZO^D#gIxs||wH5%xUdK&^OBxtkPVX%_kXue*Gt6G0
z{DSzm7!bgOXo3U=MOzJe-&9O^#@Y#rdBi4~n?j-0x(qtV<>k`yQ({Yg%1NkOEN(_-
zw756s+#VB+25mFAM<mrsoLgK(yz^P;R$%&t84*Kq12^VeH1Im<<p=L~vD1<RX%g8Q
zK=G>N&a!ku(CWEHoA!E2JcPSGQTOS{*cD*Zo!qP%yX}^1idwMx1x*6BrBE$yry%g9
z{h*UV+0N7uY0>AhG7I*z$YM{uR-&=}{T;D=iYheqL+;I2oz_8DnO_|L78)+iVQ=}t
zV^zfWuVKBZtF`R$8I$*5&N7NO3z-%DbZ(K7xf1oML*I4y=ENZBp*!P3vj@X3m|bAV
z3vXc-!oS=mKv+7bZ1rN~U<DxPI9JBI%#$VO_>>=&f|W@+R&XI#9DM|&l7Ya&h;<#H
zRKecAfIWumi7N{YVF)xWx*#5OZ)ivWO6t7wjGbG53k^n@WKmmfZ4J(4H&TaRaiCyh
zA4~6KW)mRvOr``*3&uyot?<y}9GZLVpoOr@G}as<c#V`Khp4CRT`yV7g;J(vvMj$|
z7xtIcFQ0(7?-KfgL>bP#3$K|#ooRD(>FN0iuk6)JlYdP<>9{*0u&i;;>}0g!T0pID
zA06*OSnVCaTm$%-fJ1Rcd&=TLtYC3xWw#VgQ8oUfa>VfGe|^Pcm=KZel)VKr&`Ywz
zQ<mltPxXzaXd+eT9@DN{^^@=m*z%_{-rSU+uU+e8QK(*$1vgS1Wfh7UP#_IMf&wWN
zQl&a7XZZX<VIM!OOwZ8ZLKh8h7LD$t&L(m;I&{6zz>XEq+6G<vTW|~*(h!Y7MG`71
zmsai-Do5Tq#vE9&z&%|yeQGU$aw6FVWw($a7p#g{mQ->Z?s2lGuU3iCK80<O%&e7r
znaHbu>o{H^U0y0XX;h%G=~Z#uq6<23avQgA%NpZkucd>RQbHcBb^ORr#g8qFVIkV_
zd$?yYr^o*`fp4}@i{JQ4I&NFG=w<M{-(WW-iv$UkQIn|rD)|c-^;IwaaRLlP%mLzK
ze%MVUXnX}*9Y&CY<J7~xUXN;rd=nq4gBn2qIK6yf>$VcH;y3K=VUHIscTPMAP06jH
zDLEPZZ=R+$uU)E9;|;x?!5w+(;U>eg%kKp2B`hfhnL|EN!3Gt`(dnyvIF5fSuHUz_
z`uECc*ONvz|5nzzU??zRG_M5VhqsvI+5P(rdYJ?kt&_klg~S;REdX&3vY9HBC#o_W
zkn9HI<Xq6EA^v}d+<=#gIfJ$;I`GO3o-Dgs1P~vsd)f2I^SP_y{d73eu7mU{6fcee
zBp>6#8^mDOKIn*Cpj4Wu>0po#^cb>2(Xgv{^?PWe-&Q<F=4w`c6o~mVLPKZ&!hPG>
zDBGC`o~3KY<$n!cXpIpPVB09h1U3c%^1KOC-WEw?`RdoaC`-8C<5z-(B^{6JN<@tb
zF5pd;I9D8_bwrEtACvng_%+VSCg5M%qL3m|a22z@5`kPERcr1RliJ`ubKMC@Eu+J9
z6@3|f@8m3YB8*$Gn^Nh`xB$6}q%{j;C@RPvnYem7qq2rTKJRc2`JdqNN1h4@lFnJ;
zr#xYM_}KGG{t&pAl~1ee$ROCHn0H1k2-Wg@GoV?9?*qKZ%`2J!`XqM%MeHf@XNP&H
zbp#xs)`6xP77_#Dbo4;U1OE?D1SMHP{s!gSt9$&qGA;(W?j4`=+9ij6=)`J*#Zc_7
zh?%HbO<LUnH9g+>AA!^VtQ7><GD69Y_!@6sG6;8)Nw~zgQV_}UTn5xjyvZG6cm}E8
z1Gab`I-~7cM9;8-V~WL+52zLX?a`#wO>o+#^mm77*7|j5z>A{{D%JOQPukm$rP>#|
z=<U-koM(~gZdNq%_~hMK%k>XdT#1v<DG<iE+=veT{ZC1wmO^KG{KCq3ct(sF4`0qj
z%6E@nfd=0Hi-xVOVIt0XY0=#B2=J?2TBslSne1D(Xpo<)*$DXpUyV6IZc&0y&=g>i
zwlep23_NzJ`b8w_SxkKFx1m<p?DwR3mvoY8VVZk`SV!{fOo3h4D>l)6={^*iw#sDP
zNFAf00I{P5e(`}oJS!1QnMaw$0w!YXRImNp#=T9RV|3d2D`R#giQ(?C0<4t#7u*2&
zJB=g3(bG*)y_V$jKTE}V5WfhyBj#5>&~>4WY4a*j56^jjESM>Ih22KX@h>PO#h@NH
z(S~~uP;;uV`zE$Jc;8-JE!L@XmQ%j|kSxv@uH&+(pyQsQFhxj4Z=6i{b3ZWE!1;#<
zScO)S(r<@0!M0Kg&!^*@dBLR)fo$fIQ+N(wI;_?};Wbm)6rE=Hub)E*5CZ5F&AtJs
zE88DF#H$lA1k^@KGLgM290RGShRK*f(4Y$F{fKNx%ry<H;}-vdQIcen4gFo~`DjmR
z5fGb*3W(hh{MF)70xag*uoA(_+K&Bb)!_~>V7??i+>1NMIUi-)nDL~!Pdisg#9;-*
zHhIy|xvo$^2ESqQKL)zvSwI2&UQ?%?qf&EMk_Yh;9#%0GYuYd(&7_WuXgLoQqJ?7~
z?sN>gujA6r(*eN23FWlG6@K@rD=I)V81+nlhpU`anIOEpoChHK@AS3HkQ5sB;E(_a
zcQw-O)UB20ul4N0pT1vxzmD5)QVFqc0D?k7C^Rx#PQ{Wac|>M-V{H2NW0y59l@wyR
zjBQ|x^ioxe22AWOm%my%g>NEdq&lXVT<TZgR-`e2>{BYLr*$8{di~&Zn_%)%O#Bx{
z9<jm1Bpy~_WAzPdjp6kPXeU33oP15ywck|x*c5i92peG7b~SdAH%t0~8_8!2SodPs
za~Dw+%HbV*_>`>%EVC=(fY|MJ?H4cize$d=NqDU8PWpI#S$Q{GMAQ2Ukq{L&a$7xS
z@7~n?09oL9W`BFC6u@^Ly)>52E`3SGuUxRV=k@!dE;G7rBJorT!ucumw)KT9(<lTE
z*l5MCA3i>sjy249=QkBegX2Cn{wL$R7l3)lx_O*M<UjI<(~c-I)r@i2p83>HdB0mr
zu4Nux8s*vZs%V+R&j;X>Uw)Asn=%->H+*7~A|n<yzs@^){?o74A-Jr1oQ?r_;Bj01
z;^=<cAYPOK^q0y&q(g=dMOu8swVQTkAoAYJKa+Mu5P)@GcS}J+-iMp~c-*&V+_z7e
z;)DWXIjNk!F+J@?<`YBX039n$2qIEm#@Yc=puU_<<?#J<l*?*ia%|>!?3nUgVM+(x
z=*K2f5yY8+8%m&XVf*g+t(+Kur6%i1F(+yO`BN-LUVptx88<OM0;up??T;jfw1d)#
zvu>~_t#Js5eN;8AvHrl&1z_tn#4r@c{+DKlst*gjIjL2$nEP%yd1vC<4$!tx+&x=z
zBLhw{)MDrtlxl?+G772`X#<oeS$m&wbX|`?Jm-|D`F8<meI;a}#fIljdW<~revnVW
z>(|2Al!7cr?gzq$hdQQZvb#PWmOmW$^8d9=Aqu3@Ra)2J7bKF7zy%Ia|FDC~jzI<i
zmVRn3Z!0m?W0a8Y^8u6BKUDuQ{l!>@tKW_d10Vg3SW0<#XtV0uHJsnWeMXYXTF*QR
z-$1!j{4A|}-Lc4|-TXw<CeA524ft1O%8(RuP9Z6`;H5tW*nK*3S9s($YQFC!DpHlr
zEQLI#G}D@<MUr=+EQ?rF(CV~S<lZ|jwW5oYyzmf>crLHO(fCc>c=EKi`(ZQ0qMAzY
z9+C1@e101%c92S~=3NBsUj>SAX`#mNc!Ld0mxw%OY^wiRL>SZiTvyidA!=4-KAqU>
zYG1Npe>!fJF*ue0m%hvik-dPK5%Q#BdHr`HEg)|gwy>>D1X?f!hf@=^mHnm2_Et)w
zLB*dA?12ya<c2&wn-9<nitq)}Z1aBn5+054-{3pWjKtn*_G1F3RpW8-o(kT4Ob<X0
z2SU*z5X;MBVr<~nx<fkBFv~Dk!3D)zuBxP_?HzhWN@0tovM%2qMhDqqZ)+w|hWmZ<
zO??Z{3NP45B5{{QQP?fOBZ|%2!&Eko$of{*`f9oc_Z&LCL2*J+m5JM-7GZ6QPxDx|
ztgA!MGOs~y_jNiwo>0l^?{q`R29|Fc`F{87+|~YxbAh>|D}YBX+OcLax6mI-Ma%r#
z_4gS6`Y^OOa2jk(7Pd3~1cfJ?OH#43T&AqU-#8ZdwNer)Ja;|Od<Z!gk;VMUdw#)V
zzaSwxm6I-K9#?C09QL2e=U;WRH{oz}YSAb%04p8lpu*?qQ}~8%c5MI{myw#exwYwh
zbpoaO9no`p6>T9r^2;<hgt5$I`h=EgBH2#))C_&g(FFZ39AbrYFN%Dp72IixDNmNr
z%?<3r-T{Jtc}ViMHhm_WA&Der<IruFshmcO=W?NM(DqxNeg{F_6;$OqB)JI6m$=DY
zIcg?`y6gu-#HmGhi#*EvaO7TQc$CU5uHi~u;Y*~)yuA}ufEnEwWJ?>6sjBHR7A2N+
zk91K@@Zm?l&>uK6?8LD6M3RaQf9A9UEms;ElQ=a$#RF?Uy761x(*y)R&D&nS`;*b%
zBLt|=%ji2?spiaaIJ{#8W%*IJSX{wn8i~SVc=?2)a6)KdKu2jD%1J%}kvJeF=gRNo
z_Z9{>;}t%><qBI*=4!8T9W#>4=%(q&`KGoxzLT0~H`()c;`EygQ9XCTLr3$EqVU@V
z0)#xsdysDei&ezv)g7#~JlOK)y}Bn@Ov8rZL=#hd>lG3+O7p}4YZ2mhU!~UAM}O8>
zMZ8Y4odMQ2Q{AhK()ryA*LO^WR2mmod+YJKf!^%fXFbmzMbVC65YeDFJ>Hq!!?|b3
z)-&`@q)aM_yLhiZmdaprNXH=lw49qzQ_%{c^lEakJoC*U&h^*1HhGQGq?<I~LA?~Y
zig5p<XGBYE{)Dqm>~7XO{2Z1U>Xkx*q~m@D<|*DQnvx7MzWHl@PZ_iG?sS|>Lyx<)
z-y*KrCC*tM4MuxpVwQ4Wv*fkNrwyuM8`X=m{PAd1JlO4PGRJIc<b}2J8Ani_Las4#
zzXGMqG>rmZpWv4-Xtc18cWI77?HF%R|2n6ui}DoUG0c8GK15%(WFA7a4k|n_Az1s>
z!oEVCe*I}ViyS6EKq6;)x<q?~Eo+%ENpPFsxZ&igM?=sICc%&1yPcd@Opl^-7s4>$
zvHsbhiF+gU2nsCQEp%EK{6b@;tqQzEcuKV|WTBhm-sK?v7rlG+A|NN*Z!J7aovJcm
z(mOKJg{Wvd)ra4@SYB>h<vI3P?|z)Ksh~HW;U1q~^a;E_71(&Vbf*3tj%f^AGM(}*
zmW48aly}NZIV1PcZhb)a<{R|+)q7{P8fxtuQd3kFY22$xrVQ`MVUzc)*=QEo`)83Z
z7RmL`JX|#(*G2b}myU@R;eM2r82TTsVh7z*fD94TiGCSPPB0*k7>A1oR{dHfxTy4|
zcz;ElN%LS0U94J(cGvqbg2$`%3S2Hn2t@&B6JKR5a4vWIcK{eSN30KbYEtpYou;en
zaiG%}@dy`e8mV;Z9`$ZcPkz4Y3M^G?oWiz^>&&XW696gS<N5Bg11NoihdKGVN!?)n
zuB-?6$>pf2r_=Rel5bBYXw(Yvm}DYWbDyi7;1=jwFQlzK2Z)n?Hbg_n`{#rXaYMCj
z?b4JQ2z$xH?~?x~C+y5l5ZcRt4k=Qfypxyr;`6b$q1x@P*UMCKS;=LR!^F)=#UZ8Q
z(?)<FQ#V^54Qw}TeDbxjm(u`F(fEdp%w7r*1c~(mt{$S5?{^*9*rVupZhLs*>=Iwc
zA3y7^KT*K{pKSmE{Ft;XVC4h~?d0eDG7hnHa~o^{C!DIfT`7mw6!jv4{Oo~8)^@6N
z6WRi5f7FiT)7kd0(SZraWLkGOonD5R4DpD-5J7ds4%R~UIEI%Q;34hz#qAbrNH?W+
z;|y=ob;KGXDXt>5k&(~wO3`gy&q;=Z6Jxhr2Ybf$afDs^CN9M2YQFrUj(D*<@3Qv{
zb8eu-g$!*Hz}=_80XbY~Du0HNyp6XL-n&;g6=L4;?+v791|AenM_#2KYUcFCIV3YZ
zAa{0L+P3DVLjZ4~h#o+u#JuWscsIU%u6*}p6D<Dt@D<SF(<$1>V0CTaB8D=6Qq^iH
zeLlV)<_w`;NN$-9A@}+|GajAY__BU(q30k;jHKB>b%{HTVl#Dd39C;;A&HDtUWecT
znxg%MR4+4Mpkz)5MbdoP`zhqR2tKwGeVz6>HT69ZfS6g4kEk}k{En236p<1&T-gxg
z54w%_BL)Y*TPorj*f)FI=8PYKL_QySpm(HR3=<F}nj>xFNJ0y^83`;Wh`g0ZpR;SS
zi(XFAy9C24>h|NmQDy7&dIehF39}RCQ5K?#pd|3_4AR5S3h>{kwb^{;);E~D+r73_
zDfglLQg!(LL21wM=G`vBe@%53CL;65jrmP}X+wEq$x?ngcL6ScoX)W?i9nusP{
zSzyCdSDRqgYVyzBUQFrEsns{>q?$P#b@JbniWVn~37KK`!Z}NCGP6k?-9dDEd76_+
zH!KYp>4rRbDek6sb&JX5+1pGX_S33#ju0cAbK7n9p0A(Nu3p#ZIcyoU>nBO+r^!~$
z3fL-hlv=SW=EsW=^Ci%xP8zcpkxLSnd6Wq7_e49nj8{nenh4-9bbW{&6j5+*nKq5Z
zb$E6iDL;s9@s50ET=$0z()~)PBYvWh>|l{>wdwcbyAt;enNvjQ{|pUryi^!2BNP7q
z8vEv#lc9p(1JEFP;#n_GUBpVOX_FovsCW51(;N=>)eHt2X3?mCUhV5yJpl72gm<5F
zHLD{g)}CajV+_MKufV0~nRwsOXO(txmFs@|Vd5zm;z;=zh&R~bZ&3W>!?!<cDWIK#
zR7WmqYgPKuqWfjtf;$m`Z#Xqq?!%I$uH6Rhiui1vi6O@6*!TaRYC=-GHq9#Rlxom<
z?e3MSg)gEJStj~W0t1Ay$(5>B;|S`rTW&j1b{N+PE-UGZ;_Y)>XG^aZ;F*7IK28T}
zaZN$Om7k6MO(Ql;(&SQ>`DMUO>G7E~pPKy-2bEoEikj&4ou6Pp8t}~0Mh|kM&EsG1
z6w(0yjpO&I0npZ%!1$E31oeAFWyM|mwpYWLax&ZsNzmL*nv#y$=HBS!Eu9)>X4Sb(
zEv476pJ6(qa)ez}jvDJG|MZ@RCA|(2FUfI1gPc7T3CN1V5l(x>eKYcAu;}?0Ghloo
z6sn7K-L(Q=b!w7VSq?ncug=v)tN(D#AWLgFfz7OpP)O*vB*x@Ot*8%iugEX5i8zHV
z;}V6X$I~65TSuajx6+xHWS}*Y@KNh6Tu|v8nUz$#zB9d6&%XQW2=QPunW@$~?qS#t
z(K4E-xzMVn3Km#!(BG~e_!=S;86uN`9uAj2a7UcJ-ABQp+~CvKZkXEXsmi?jH2Xci
zTw~{4nP?lX{WNe9!ywyr*6n+uxbX>@mvD3%TSJBcc<`+!HiL7<ox&Ncz=^}Okj|qc
z-yEm444(PfooOilCk!I8(FAP0Fo}UcD4`4&s$2ft#P^gjNq3R64radccoO9SCgv=2
zv~0At3z}p|aqKr?m^Qz^U>-GH&wuLac^gR!igq*ZHIJdWB^VD4PI7_A=i<$(rV)MA
zZ0zw@bgSm=sJf&kxuMXgzD&`m8J~U}w;GIwdrq5s9Ub?N>-i}drg;72-@d)kOvDGj
zSsXM}HF9Kq)?22$*hgKI=8uGywuGveUriN%c=JbY<d%YaXIgj4=GLjzIFN|avHo3y
zIf$v1&Dm_MTC{ojFJhK=s0<5>5F`}ssn$Uo(Kxr5JhDr$LrAAFH5Rp;z@1v0@Tob|
zb|U(!X0hP6AZOq~n7fm*D$oWF&9Qg+7^{D_>wZq&J%(@O=jUI4g3Q*7m3g9|BwZ7>
zY;QHA4URkcUGq7g`%fq29t7^xd*#L;Z{_IoZR@3Qjl`5Cpr?FSzTN0J+8XCr_TspP
zFQDhkPS}=<{;5f-jVCOuJZDLa1socOqxXVJO>fv@(!6b9pMy!obphHBSwcI_8ZA!C
z=e2<6lbuT;tw?Fsr`Cr9%grq}d6!U`wLXl9cSo=EHbgKhc9P|G5GYl%ZU!4%#u*Gf
z2HiTcWUwTE@yG=4x)c5Q=UxDCI+HQmhg5hrgT|`f!Hmbl&LVdnnV;*Lfqf^eNr55w
zC(>TQ7Yx0c*K4d}NfdaiqTaSJL@93tlPFf^kPb&T+S#t~FE6*t7i-@h;7Gb$l>4XA
z^*!Kmgf_=aTv3rA0<4`^hafE3Tzw5nBkxvff;hf6Z`A9At6TCw8ZFCH63kOK89-X+
zVk!CDfEa~1nEB5%Cc=i_un{A@4&jSjcaFz<o8{}ZKQi44L@s_jY=2Amc`RvAJ_f!X
zn!g;a3&=5i9Xv;jYflo~-9ec**a4@bQeB1k0Kdq_mDIQ$l<PK~x07ah&^*tM(yu{i
z03PSE7rfSU>YO6Jo;%ZASkd(cY^z(3|G?OXZ-|KzV8ssV?5d{ny4-?xKGe!VzsgHs
z*)7=8F*m(&N)fU9@#b~-D^)z$a+oV8qYr;aLxl7$l_|~ykF&2~;?XfD@}aU#Frt_K
z!{%iS3fN*9eBZdT^1Q!bVT4mx%U0siqbY7AyE8#J3}{_kDVSA!{q_qC))7AJ&=s+V
zC8s>=Y}q&WwRRz9vkVdz#;g(eN^0|8aBV+4HRWyP(&;X`Pqg*Q=>1sMN_~#B`v!XC
zwIv$+J}V~ZW&jf9D~VmnvrkzIuwC`@9q0SALiS!m)gJe-i7wk8XR$ts6txGg)1=%J
z)2!Bql%UZby%=K|*TtU+ACQtQc0^)u3c{UwbD36Lyl<EYLTNq%ONLajNdzg!YYBzR
z*7#?)JocTN^y(cJlHgef>)!!>$ms=a{sh!7v=0(wC%`nZ2(fE}8T1XhvBgaq$_Yo%
zUWHTbecf1pSS#$CQsRh>O(7n()@`{7l2v+3ZXJQI@8h>m#rBA0tTN5sH*3z4tgvLb
zsxgwbH%s;Gq!ypTTF=2baJPBxmQ-H%;HULycQ!)Bu(r0(P;rH0y3ILA$Q!$NbGJEF
z;nVR`T)N-~oS;XVKj9scWx^f~GzV2EFn+JRE`W5UY`tA(Lh`<K)YhR&2X8z=rSEM=
z6>rQ2Wl^;qG?%c|rs0g_qP)mQyLa7ev19Nsh7tVkI%=ogn{5x><x^X$6LwUuEc8m1
z5QTJd#PWj1Dn{3osm5TGN`|e)N~Fm9vb_2?K@oT|yV-=F-Gv|wr}ug18IxGoL)=a!
zqwXM*epwxc&~gt{u=tDmUn0OZJZOM;2WMH>{m++lp!JF;{>r%3tUKc;B0IHI_KZc*
zFSsLCq)9?wIKJu2e}(^xx2)x0H#>GOJEf2=r~><(7|LlOPIvR8V}wtonG8P2FQGLC
z!=l#gC$osg!%q08x#*G@ieB8Z&Te;2$qD~~=Lfm|A6!O)?47@!kpxeR8wer|bS_d~
z5piPoI`AKqx{=`t(#veT{Vv(^I{LLyqpA%DW8_}goqNfU!aeTqf=cZ-)?5DFkVGxB
za3q(BnEQzJ09!%N+;RCo?l$)l>TY0|(HE6o%iP;ld@0+4(j1{2nrY85)5|2M-+#sN
zW~KZ4KH<n<i($hlvVU+Hw<)SXM*1mZQewBE4=h{(UBy>1|0;uL<_6SUf&kN6Vv;Jj
z?aG9F=UCuY0|rv8$b>?yf}u^;emtjA#kB0UG<XrGU_vTQ3n2yD_zZ20TFX>?;>3s>
z*1K-xY4aI{C1$-jW(4U5n(%}JEsFxCEFR<cpbw78@k{<rNaVuK3-Z=ywOpe0F`Hm^
zEdaUKG6!Pk)&`r_*Dxk^)2Nfy?_5UT^T<4DMk!j`)|x|Fr<h$yz9ErMcu47&qR|5C
zWEB&kC3dOrrV%l})fh@md(3Z)g(04^&93+2!*MgetR6jL3^vZUA9m@<dpaHR4@F0s
zQdYmMr_6m+k3OytxbC~yYYbOHQ!QLkA<Iqk)AJ6PmUxt)U9Micf+s9d0Ann^eo^d^
zsUZ4EbB?kwQNMP{f2A|W{Efbh5}}a<AMe^sxq$K_j-(Kd;806)|9m;!<!A+{2yO0`
z2g#aSq!nnnP}P!cxnYuOpSPF`mDv9h0`Ff2piht7_p{xCDcjw;Q6AjwBnhl&UePeF
zQ+f8`Knhahv=OVx8D4GR!D+rcQ|tTw_jz8KUDRiBHke{uJI{maP75Gdx~@ql&!4;J
zhpvT`?@U=;G7f$hzqq^L7{;P!#TJjUFNu<ib@{cgg=pB=dfE}k#<_cattT=peO|^g
zeKPp8zOct`@lG#qiwgD1_S_ua2TJ1^y=w$o^_nSH&_;eWN7@4;vKfI%_FXp(1dUvY
z)h~vB`hB~aM%9lDIqb9<RbL*j3JF`@{q(W=*qIrL7w!+e@s@wlx$7NWZg6rM@YH*Y
zsO;~iSgNgHP~nq8vg7Vg`@^Ia)d5n)AgEaA=moQVXri!b14WNl2#Y;EAu?Vvf;-$3
zShz|29luaqOHE8^(9drgIvqJZl@C<$)ZgOKo^^j3kw#HG3w+@V7;W(J1;jLMvP-(f
z124VY6?UXNW8~e+<^du-3~hrBLzuQJ)3%sFTxYJIf*AGAPRQ+O8qMvNV>crwrJ1(z
z@HFS0>xxW*;oew<Zd0MU#_-I5VCuAI4TF3hq7U341p5>x<$9;5g2ve=YJBA2hboNa
zPIB+%E9f(997ZfSbo<}xV>vGja?aO&kc~kv2wPw{->&whP_kdmjh>L?q{oWA$aUK{
zPv&pH&u%;Qec746{B{20XIXvQO3*{Q$a|*ga~Onh-&XdVIojD_&d<|_yf`gZ-2@LU
z*RroV2Jh6nWf5mcZiQ(U^Uj@q-XncB7`T!}dYv$#PkRfA>_m9Dx$my^x~ZHWpz$|&
zb~(>@`%9E7^4=5Fj)d|v=2Iu#<70gX7YlQ1KkYxF0b;;kI$*d=pZkS*<(iY^WDqt5
zRM@b&E0cC#k*Z%&J@g`C5{D~Cj0k9fhneZ}(PN?eTN2i%cAz<n4~n$7Le&o(7ti*|
zVbsFo7fHM{g$1&QT(Tt!0>n3V2siFF#+d88n?WQrcQB&J%=@{W2KRNk0I~yA=aUuG
zYTutnlIKnLDWUdjrRGCf_uSVo8@iEl+ZWNfo@Ms>qv#*E-n*2>EO#_n5j$5|_^)0n
zI%^zgzxAB~h1haE_;X$G@lI+G=qss;H60#Oc^EFv8jxM0|5;+K)L&LFD&Jo9<LUf*
z?e~351otANd=utl!bPZ!(t}VNKrLZAkcq)jA#k%(d4FPp&82iJ;w7}yvf)EnL`jen
zWL{A!p@!0-@a;KSQ3krfCoAhp^DRnZFSXZU^5blkM=Gc9fGTm-8$NH~Yov0M6C37g
z-oaov^^-Te0mFko%RGiHnRUu48aQT9>W3C6ER+1`<`(ir11kXTDQEG%PR@>&ES)EX
zHO=d6=x*_sdp#`1S{77=W^%iSji_9Gy1$IW+4jxCH=|cLr|EQVGr;NaT7o9R$jpPE
zZV$}w7i;+FkKbHGqdj!!JMR73FEqBDy>lM!U!+{IZ_H#03$I*6zs4IeBCvIRXvy<W
zr@{3<DIf6!#;+RGi`(hfKJNun=#yA=Xx;N?Sv1VvU|$<(T<=S*7hDbPf4*TJ#Gbnh
z62CF4A)R9G+q1_Zq{W7Ij{FFud}kc1`oM!lflvH1a0JgYe^6^E?!}0zbHw#kSl9u&
zD%bT@^R!j8uLPE_g<ToY@)Zf=&mc>Z+{5Y$vH9!9HLNGUn3Nx?p^LM1#{U0MiE223
znW%lP&|(pOOb-j)%a%`9jx&;e{6kB;`!{6~4(-Gv7NjY^!?A(hSG2#w@fYl3&=sde
zM2D7^;Sl_4vy*(N>ZPUsC%+_Ox(J1a{sS>0y;v*^ZVb2{u1mpugY9RR+D^wN0cWDs
zfhg1Ux8cTHinac!!|1eA-0m{vKbPM(9b-04k%$MZ5hS~9Z}liJN%e#wGoLv&-*xw7
zCy_i5G^~%^a3tU(J>S`BPciU9d=9q_G=6wsY`$NloIAC+SBymIdb-fgE&8L@U_?QC
z+l=G(4Yt=2HJ-7?(s(5=b!WU5Ej$L9u`ki9+UkWqld_YoWf-P^*tTNR%GZjK$_q{9
zmTtE{o_*O++6(M4ufnm7t0+B+(i*D_XE2?ajOmNY<utC(?s7ERe~(KRkTj~-@97->
zYm)Fz8rk#fMxJbp^vIo`b*rsqoy1k#lTbB=s49r*$EK_Cvd3^9mPsqcz-Ds0(dPgn
zW^ugc#xLN6;@DpZ54_N)W^XiGYZ+Q7>ayGONV*IwYVit-HyqTidDm%u>8lvs9MF_b
z=6cl<0)DC5vK#Hn&|J&;e8$?DxSPJwAQy@)is+nE4V6L^d3W&uqE}qYHcvj(dzf}O
zH)T~?iEBe2xo-q^W&l=&8qv%C?P!V~DB>?}R(06zo(3;Ov(3HZukNsyJbsACa3zM8
zJUw?c>Q*W}PZ35eGw3wV(Tf>tdX!92^o^M_Wz^%xAs7>)_x&-m11VxK`2{cFW{Qix
zYU?mH2Ai^H_v6wdX>q3ryK0tv{Nkn;znqyvK87J$%<S7dxW5D~ZXq~gOy(N6C=1J{
z%~o~EY!jR@@#qBL<yt+sS+0)@L$zy_FeIQ0hH6}H+bE3uzU$HFB(~N=f+J~U-bL+5
zg?b3nGw(L^7W2H&vQvMA?fpoc_URL3QNZyT!(A3Y<YF+NZ6ED>`6XufYxWSA#fa;$
z{lHr461%wmkx`T2I0t;~xcbFi_Mn$2XX##DW*bzyHSyfxkgt3C>F>{``{KGu1NHGA
zhHe3Eii<UyAdwkb;+!-Je-a<lW^UF_s1s&)dY3VZ66U<1#}`*C9!Q<W1G%8q!z7|W
zPdJP5T2A|=zF-7pr&Gp0PkL6agF{+=(j7#7i%r)+zB9M*ca6a*bF90sicBC>*P}%x
z@hOO?lGO*su=!cBE{4bRl#R{kILfI&mfeMx8VZEIRGf7Ra*g!ES`^LjtNLZ8*XB4?
zTb(1I&f7<dbl>vz@0?QwHN7##IS<-H^$$-DmYjx0oFv@1$C&+AdukV0v2CAp+UdfI
zB|0)LKB9R4wcibV=kO!DE%)H4AVG^Xk*uIi>C~}AVM-bhyN@0alEhHGIrbR#aCbD+
z6IUfURK?V8M#h|3)JbYayCU1KoUv7(a;(yw<e|Abw!4F<L3P90>5HrT7x&Q!fm1tF
zpxaXYI!SZkp<-E1Ty^U`Ow1VD<Rg+oip5@}Hz#!G3#R)d_2$P1*Xx4|d}lGYi<iy`
zo1c6-!$72e8-MR5oLe(wE1MBSa~TV0Q_15?w2&MGJ-By)?9L84nYZt+JDpz*fjuw7
zl_%Z?1dk#s|EC-^&JSCbTrus3gs9#74EKC!h|eIH^a&NEYVOA6NA(Z8fk2^H#Nb)t
z=dP!ZyOQEh5?9weI^YQ3ViRAnnzeh1sRzr>9fIYL*lHvLsl^n1{>{(zgr8Q>aDSkV
zeZg_ZUmE1Hxm%yGv(86wwH6<i+1SQB$vAS=u4tuhlQ~1vzMyia0?#gThMbE<n!w<-
z3rCaRA6Z9ozIo0$N@pt?BGo8rF(JSJ!o2)1m3It-7FS0;io8{kpW=`KG>o$17R^f8
z%u3cheL2Ry@?$5?C@~6O?3u;8R<2I0C~8v=V6)oYP=xRSX()7Dj8cE$BhKeA3Z0QF
zXyS6fwVNnsD|p7O!1%nlkOh{&GJGq)#3&G`cJd2pDJx}a65rgxMAuVdpJzzXLZ&)9
zW|BnlRB-s&&^KNPz^6<2Is7%3U`9*VIUL!=g_d8S(l|=k0fnBO+Er!D#oLXljY3l~
z=ObIMLCP$tM8}9BEJ7r^_4FZsg=D;s3LJF2=N?JL+nUxu7oVSW`-i&|s0SL@#tR{;
z6^%b<PPs*7m*Vi1rqiy0#n+x{jAd~Cqf=fwzm7|f>h2WTs?6Hxi;Nh!PIm<DXY5^8
zVHx0Lw#Jq*d>0-f(%@}m7EqiQ&&>sX_A4VLl5#iXDiYcUH%fmOFOa2DtY*E5G&RRK
z#-ZPNs{3MKFkf5jX;eII_uK$gS`2n=PfjxUDlcTe;ej?Up!+lrlUH(hj52w&iOk&>
zO*_rBO9Uth>5IaSIw?^qe+98Ez-#d2B5IU+7eQ3N?~Lm5MywV=1AraArem6g#N>KG
zt}#pBxA!s1O>KkAbwOnA#SkcvpZU_&$@{AIhzOIsjax-0vQ6={JD+@bl5z<fA^ZXE
zjN2%?G$11Q-8%ADU7}r%bT7>`VVer!Di1!S?shBfd@pNH!30#0z@&40sAb3mI8<ik
zMkIxuS@Xzbk@-6(e3V;r-Bhh3)vn&CGQ%$-VkBc*@$8-^Q6E#+GQ$|@IzoiK(<f7^
zTH&<U?#QjM%YqnpyjY`pPe;es@&eKQ=NRx~s=W(w?sL(lhL7=w(fo8WCf!Bv_haEr
zWr>}VKD%ER)lZ`r+}72~iNAg@c&$Ie<tBMh1>!R1;KkUE6dZ}!<4bb?1YbmQu{5Vn
z<O`@WmBX~u{kWX5|H;P1*MPAbHKXHxD97MW%oz9m-DE6KwqkQvxnPuj7fjZjv{^ud
zypl`ZaKk@<7gp-A0*>6?T~L)~SB_DW?lrpwtFCjs{s!Hp9C2{;3X-;s$aij8Gf{o|
zgCAcrb1d+io3OazSXRCye#~0}EN#Y1k+RR)2T_EIsBsiqw4lj6qs|VQoSuq&+)93@
ziT)bz)B^=t;?DRzsk5m0+=2nI^i#VKkM1dYK<AcVfz=&%&C$;#`VpndhYSIIss+u8
zYe2@f9$`YvfkmE==zX{%IEr<eI<eI0f#(B<s=PqOvYycPFnvpA31jJd8+#29Q}xCa
zGp^?G@XeWXZ_x@p$^{Yib}LHVopZnJrd*?s!54}VG!Adq;^lW-LnDn-XIJg|8CU71
zzhr?)H$SJA(_GS!B)dkPQhq%-@fd*Hp1(a0GQMOIFLY~_yg~jESQdGWsKIv?Bn~xQ
zIcHnt@d}GiS@oJ#^V38={oRvvVVkPK&uvG-Am-_NJcOB73t(@Lh+iYu{o=D@rnv=P
z1}RJtBc~s(mXACMZhft2+Dho$+8*_=W4@{XReD=%unY3jyVy18tkM(Sd&2b(ZRvqG
zRSNxtE91##lI+h<W#3|`<JqjI!<eE{{L#X*YOE1bFW{q_hRa)treJT~kNu{FU$Qcr
z*$kVA{8`q27L5Kk%5mqw(e1y7d|&Hyeh5DEm&m45$WOb};l`?Iv%d0ed2ol_opi}x
z7J~4{enr*E{M`=1Mo0SxA)$TtOO;@a>By9DTIeqK$(p-)6vbNUG;{sPw@%?lzqfiE
zk-%~s;NApMEr>KB;GLr7s4vqV?Dh#<V>sUDJKu_|Ib8TUtF&YbXHmv`JO|lMpHM+E
zsPCno3N5-({VBZ0WT{LRP*VIXS(ue+pV9@5m%Im~yw9Hy7EYSR26+Vr1(DTtI=4h5
z30lKbTE~C)2&rjBrr=>jjG{E%x30rhS9OZ)k^IrSTz)fgJ+^~rJo|Zco4Rc(_(clc
zis4l;`HWQ^z6GK}=x(NMhGv-r!gV?ZN}jI3GU_IITU^4_pXD=UA|;t^blD`_i?PrK
z266sI?sOonD;-FQ*E;PNB2wkAKA#6wUw}a0^=<WBO*YnCAsDMOi7-G4syK92)F8x>
z(In}6&`L@<)%7XD8PH%2=P-bPwQZ(U;OCW$SQ8W|QVgSVI5O2!<_=U7D$7$2f61wo
zXKynbiSt}KyjB~OB|qRec-vp%`+j$xJciwrj;P9M&YnR!x{zgQ^|R5ZYxqgvxYZ{M
zq;4@LL0zJ*&(+nvDB^U}o0s267nq)EkDhlLQN|A11ndFC>#yerday1k3Y?RjD4CKc
zGELi7z|hpeM{H^A*u4~&=Vo-{;-q42C?Egz_J#RA^R`#F8lJDR{%HRpW4Bs=-0P`T
zC?$#}F$znjWxTsKdGgsG0%qR05c01!)t)(H%~X@B9Bw!5o`XaA2Db<I+`M%r7@8Ji
z<)t)BR>w0MQOY$fTGr4yI%kATH~70n=<y62M;90KZuB9_2M5x#8N%#K`evOo-bH<>
zzZD<DaZCp@g{N|vC*N=BQL=a7;21nCtBar@Wr=zF;h2VxnElX}+U&n-An!Dx&E4-b
zt7s&kE$2`qcld&&8g7M;BlfI!t)pCdD=~Vhe$MrRZ-LotO!WsPh}$z?dK~U$z@rT5
z$IT7nc5Vj<atB$yn?X<R<{;|`d^fY5VK-I-N-}#`R}zs9(SuK>CUYdC<asSu%0qtd
zs$tbwdFGoogCAR3-Gd)1fAs2JsQfs#0Y)}u5R;buMa?Qv9uzd*WWi|t^2GOL?ZlCz
zYig6rvQ^0|gXkhLy?`vq2EP8&{oSRH-37kWr&wg>14^fh21Y}Eu{Dljw08hhIU?Ex
zFKeQH4#2G(M;7ePsjfV;`w~*U<2A&?qf-Sr=TgD(u-XpF;L%N`arIfJ`YgPb4F=mv
zPHZ&;Rc^tzpHCh_@Xftp2IeW3J0$Dp7yWX4NvLvkN~X_Uvv1*-2WEXaYAH|8i7O!z
z`(;6^TL{`eMoe8AOu9oC&9qTLu`*94^q%320{wR}oKzIq1sXn&W=xLzyq^g4Me0KJ
zr<vcz>=n+;`OBV!>FZRRF_DbT;Hg)Atp^|KoE-<|<{o5h?~a)V<Ry~jszvg3XPKb;
zfpt_vR`wqzg8bt<C-v9n8+&!#y*a1kGF*sJ99}EW4uDM^f<Ln8#;WE*;CmFmmMX<I
zBiuIgO$TX_b*tfHcJpO%{err#8h4Lc*Z?lk&0)oZ(6evVEO-Fz;lAD8U!8|0%0OdM
zb7_3=M)=m1S7x>K^hP-5g?QK^g&Iwj2;@dK$k_~=q-oTay^4UjegOkZdwKy)yvkd+
zV6miWb#n0Ov78p@0)BIqaAdH9Rd6_2Zy@u#%?0XO)&7W6Lq=ZW=itOU&$Unm>es;(
z{~ud#85P&kb&WQSyGw9)cXtU6!QCZ5aCdhL?h-<9cW*qwoghJjB)B`=COPLk_Z#E=
z31h&n+O^iKIp<nc+l%-Jhvnr#{8?ZyJ}IJn-v#w@{bC;-^%w)L7Ut$4Vfydko<pjj
zsW><J%SmjhdL@|?-_LkEThP;eVJ}SV3a&zOq6IWiS!Mc%Dj#?lPqaZik-zRN+-(<S
z@XLCVs)#A@O}e*Ij@$i=-cOTqPp_=5-~5=2@&Ae%tgD||x!UiUe1$$3?2v=(RKg<Y
zLp15g-DU$srA{6Cb>^EBXsuTt9CID>Lw@q@OW@Jk8>vFr4@9V0)oK=^7)(0P1|xI+
zmH|n~u6A<~=dFN42^N@k@WF*YoJfcgY$g#09~|39r{B9s)?{)A%|8WMeIVLfRAx~d
zFYkbjY7`J7ro#jiwJ*60Pf5XETh_T>fBEyouWqI*ReVU;$21gAq<9$f4o_t;keWIw
z7cqom`%RDIYt8r4H`Dx8(+Ev6YL}I3$gPRzH>^OA=8tAqqs%;>o0Ug@J696s<<2@x
z`d|b|+sL5|g7g<(Zljwrj<L4YL*lIiwsG`5%hRjOxHT3xrO`@y=6s%C0~_8u>G3{V
zg6Wg_usVdEI_D2PF<(cgWdvENHeo#qi*0b6K2zlyvW12vp3&zPi8vV+D}ulBmcoLX
zI;+BFhmXkg^S%2gGh+8(UNvWlKVr#m)y6v4>)U-X?F&n7qLBxQ*GbW|fkzgFipJ-&
z2V*?NEI$a5jY{Xqnp50mCLwJ7G8U#z@)eoZcJ)wl{VVid4H=A(otrM*&Vp#9>4@V+
z4$dAHQ~03%k3Ue*Uv#{MdQgA==okw40Pb(bYAc`UWLlqlyu-O_uFhkCi3&^a9&v9;
zWwvCNHH{UPLrUs3%jl9O?c}sdq>5FH4oDwR$tLqYy5vQNyF`F^_EC3mCxn>q`kALj
zIISj9cW45Zznpr%`l+T3@V-nK`aPTGk=h54I<QY)Z6@`e`w$bK7%P*3@DEP_u_S*i
zCgU17eUjE>dcvA3!LA%#jmzrApGw89)%oQ#D(v7(K=#ptu3@RLciD7kdChz7^rj6M
zZefxt)VMB8N!Wb1uT}0f?^cog0jA8K<qZz6{ko`qdoO|(8tiGe2pYU2zH>J}bj7R5
zdVTW^;ns-L^FTOBvp=5;b;hxN5ANOnWKK557R~@Qncke^K5JtTg3O;MYp{d^Um?91
zMsDV|`ZThWY+g>@o;1-Ud{hyR2>QSHtxyvZi&L{|*GU#K3!lSET8>e}2GjIcfn&zi
z9bTAo!<DEMCJfi?4SGfN+{#c{$Hw)lkIi3ikYzHal~+<nWU}Z+0y<+~;+qAcTO5w=
zer=h5Qj#+%j1VlkTz$_N$eO$(<(udF;?X2hd7O>z?jvk^xRec*b*Xt*xcyagONMWe
z$yhAzl^(_&i+R$G7VW1uoM;cD`dc*fE})IG+!CdpKkMvd;|Q`uDyd^%OZl(wsm$kG
z<s@LHEV8FwFKuxle!6u@JU-XV?@pv3Ed6+lHvDwD<9o@QNj<CaRup?AVk8V#rD=Y+
z!j*4ev1L!kT1vvV3tWmGXii@&_D_Gd6Z=7n&pAzVPi#fWB3nkn*v?aN0JrWVuzPwi
zgkDW;ptE)Ztwl7<^WA^lBMWwH{m!q>;Mr|~g!%Yw5$?rN-=w)q3eJe~DS^Pl&!wDY
z%mLZexYNCKNf3<PXCnx7>y94EvZ|bm{Hm8mzuODuq8rpE+>cLBPvw>!{q`QdBDzoh
z3rDGSA~dqcr(+Z+WFucz(pWNwo-WtrlZ;8HD1e<<CEm(TpTz<KvJz@zqUrND9%iGw
z=RQdwu^`(xOgw*J4%HS4j>$zK4!V9D<+jCi7H(dM-Ss*Drp%{of2obJ@wE?E+M~y3
z>3Cryi6H9TPbbcWp*4rvr@UcmHeMSrF(9nrkcR(@-U(3LQ3>WTr0s0^600uD(-?cF
zM=lbsH@{CR7^iP2)?5!o&iz6}Lzp#A&L6BQjC37!9u$PTdskA!NFq-aVq`DUFGA3r
z+6DDInzUZZ95nMwKSWcQdXZ_u8^N806pzbacfG*VOPHTI&HSY6Ron9k(yYEjP3%K&
z#P|uvCpREG`E<>vGd%x7tNr0=29}74iA4$qYI<wqieK#xRy`+uZVw4J(C?T^EaGq7
zOO!Mk6w|H<Xw+y{gyLI{OM_2?hB0l7S(J2`_I@glJao?jA4oH^^#C$`nZG?~SuA$(
z{|4f+FM97P$&$)yHE~kU#a&u9?0()zn7ps;XrU9yqJw4{&kp}(I3i}bPGi2*W;0B4
zGNYtYSqrx+YY<)3ldgovA<CwE5&E$}Ctq+<c|Rk^Z~EP*!gD*erA@J8!;k?KA1TsK
zN+F}0@zX^=u|MMelC1CJBF5F%XbHlellhfCnNE{z%p82Vn_oRp$z>4<C<a<q`TdJg
z5?7Kg)f)$rPWAlr=Y=0Sqi^W!dNANb_FuGFChn2fQ8yRerti6Y0-}|nwa#NMYK%=p
z!RKqk!7!>uHe%dm_@cck+oQ?&SgQ<8+-85W4r_3Po>x1JI>s?vS5K)Cn=6thaw42`
z|1E3wno3YYyrLRqA?6DEPctK&^KSL0b$`N#0g1V?3YzmhtJr6fn+j)p8c6yAvlf|g
zQX0}M1CAb%@}G$6pC9+^cVe;m2-#-rY5hN-uJ_aN4->YehH3xCU5a5p_1Gc}z`bhQ
z3~2w^Nbk-v`c|^S=4m7t7ss`0OIiyO!H9*3%AXHhf;MQP#;*hmk>ocSOyGFTs9}(X
z{u__^@Q*sezigPMLv(B<yW)j!-yjaRQ!+mnw(28ox`S~RC8z*3Y67{XqbH_=%4nlp
z=7QvuC+pSk8mUf5B#+lHJBX6b2HPiz<LJ5SC`1hHYc)O1)^#1RxT5xCD6&So>G9eQ
zF}P>ego>z2t1m@ciAOH+_y=KXZKE^_+XTl)9FMd6=E;*Kwkb~hgW=>RnF#$n^paJ`
z;1Vk{%U-bs)U<|sk|Gz8Evi2;ZXoOnmniB#ky*e1yf{B;?tq%^FpaU>Z7f30&MWu3
z+ut%N??v0$i+7`7LTvnLoejE6REe2I=bRgu-l(IVC4Y;K-J#h2t!_%r0n5<(N^WB1
zu?4Ju47-q={f(Gdw()XlbsUSA)35}^h{-T{dnUi=S|E~q4mU(*TGA-1GED79MH`TJ
z$)DUlfLY90Y^~q3l5H&dxnIHiDK(`S_xuV9v3fviNQN8l!*FCC39e5ts2gpg*l?_W
zLdIK1@Zeeq4(WP_(0^UhDUvUigb-B&Gd*t<H9jIE>RfQNFXkGK2aWNNVUzfHQfuB2
zpeI%_7BVCwem`0%atiHP8M%><P(4swtTkrYoBH^vBj+aJpd^^#w{gg|w*mOriR@+2
z!(b(e#o0-LU);jP0l*4%>Ge8;oPhOrgRM*3d!8oPh%vbLt~tatyziofbcr190L=ji
z^exNlnR-Mlaa7=X+jrt_8ktg-$<8c_hS=Apq08;CAOI$uvTIdh#au}Ay3Fo3odm{h
zXSH7=H9veL8MPU~hDw;WE=!LehAN2q0&A{I1mzkKl!s=GzGGjqaR6>2_FHR4tlIU4
z*t895@_u3A%iW4E#Js)PM=R0{$X-7w+ojAL-TO5Hb9CaRWej621TMf|h7H#KUG8g4
z2&t?=BG;pz0?#5oRzO@Ur{^=Pbc!4Z3>Tvi8I&fVhM9soOqEtsSZP)BvSsZ8^?X*M
zreYU+Y16bW=AlvCLRMi_)xtvPa(NSdNY*TEuZ@CWU!uGz3aM!8(hCPm2YfvvnR#2^
zMJI{-b#A7I<=RQdVaLhC^R8D0w)UZpD!Az}VC7=Z@S7<V$W?eS)J#qqil1Dicp<u_
zt}p%k*C!cyeiDp<adM4I@<5ZV<vD^Gh^@h5{(fT#NLOmT<g%aGtuLNp&?oW=J1||u
zPU<nAa;}(v(qE6Q%!d)cewU2ceP>UE5yW$N>i*-KC&gsObi)K4j3;MFj=|SQp14)k
zw;PU?G>j%fCW9Go1x=r?ycr^x0X{K;SMpGi0CZBI<i;t~H3E{C)u}h-jYqlurwwXI
z68UX!Ehcx_lxAWy44Ou`hN<K<I90&##Ybu^H7pyx;3`!+xK<}3EJIW_g@@L8T@g*B
z4Zn*wbMiE<1=YiBJw|75I5)hj%B+YPMRa3~H-{_$O8GuX@M?hdBlkn1P>af!|S
z@jdhw@!ioE)`O^xqsB$dj%U#^9&(z2H_!fY^Hx6ZaEmtNn&GNGjg=+#)#3D0RQWa0
zQ33rzK<THkUa@A@{1-I?xWtF|8SY{O;b+AF60zV6O}JXEy1ris#zD7y@;G>MiEYUK
zvmdWif!(kb`)4vDzxxjSuf%rmvE!{O%-%W9U$Y`z9$Z=KAZx%`RNPx^DS83iEFY4p
zhLl7JN2!gYgUbg?CI(ZKO-h_Pgmv=hRgLMS`>Y{`D+>YHJk|$v&l87jOA$ALe9Mok
z6gBRGx<H;`Mx5okW+)^jiH+N?Z+e1&)b~hK*3fb1T{&F!7np%7+yvG_S@|#}EaH>4
z_tkS!w*kLQ3;APV%N6e=H@?En#eA5dl`$?}N1q5yIn;Q6`&1&#^ui0df-nR1Asybr
z<d#o*vZQR`;yqGM*_Z8;VZ01_tBv-n0dK<VfY01hlY25F6+MeI5SjrZ!JPi&qOU8x
zuL;b)a{o~ZaXhB#+jDthGu7#6;-Jo=$7_VT3E1SHn~JjyP=XX1=4r4__mpGqeS@cA
z=eJe9JfRj9_N(FcZ6Nxi%sEbu4!SAEceL5NBUqbpnC4kK-0LCnkIEUv3tAgtZys1p
zLZr3&??KV!S|KufMm96il{5fNT&a?L+sBo0ZNVSf2|A15w%wEfr<$E9Xycl~67V&^
zuVEMnN0<g;I5emgA$UnVC-L=k{{9=>N{5MGhfJXmDH){67A~kBA^08{_Y94Esvpip
z{=s+{nU<xc3{<j>MACYiHka1HN&*RPU{AO*f4HiZmKU<}@u)cZ;Ac0vv__!|gtW)f
zlPYoy)hwSVh^Sa)G8WS3Vtj9YTl%Y%N%xa+9%Rl(*CZq@fK>w>c@6~Sw{0?{SypsY
zz(t3RuvoAEb=qT%=p-b&$b$&ot#RfD3%yg1fb_UtghhN`V`C*IWhLnZ&HKkKd8<uG
zEjR9*lEfnF1SkVbd4g*}h$W<PeB%i3z$Xk6Jgr|h@(ar>1I@cnt1=^L@a!5krkk#h
zt9z7Kd)h(shTOF8Q|vm>K^mia;-Q9+WrFp&&M!=~B`j#C@X9bFA%5+K4tH~R)Ek>M
z&_O+dpei%T%HN3jU)P8vC{pUWfCZ8-<eNjBSauEd$UZa=3E!Cd6`>FihGL`!6N-xh
z<2d|*_A>5v=%-<&)y58yT8@h&;}zHp(NiHkjNmH-$_^fWV`_56EPFN41`6p%e#di>
zKD&g&GA)DN>ZUx@+LT<V_E6SCVFh0KRxopOQ$EUlwc;fktaN?fsZ<`3>w7S>(F@mm
z@zvOlMK>o;go&gk{kyub#PUHYmXOBM!2%p+S;!(u?qtYty-1@Z_)ZJQ6zf><`lCUR
zPuW7{P1k&b1a9o)xYaM*qMAuB(Scjm5{s?2l(}D=N8S`bbry<`jL9FKU<j2&?Y&b+
zY46-|yRb1762vs34Mo2nn)C8Bqs$7(&2_sSFO{}}_jLHV_*;2=c=ar!xN@%&vYz%t
zC_R_&7Jj`I$}0Hm3su}K{|YU!5uBCkB7l*OU-o&(%t{kAiz}LFd5z|KV_q%BXXGo4
z!83I88Uh2r^*vbu1bC=_k_zrE>B<#jg*o75l(tzo?`yCE>4c<hIxr)P^IIrEV1v25
z^<B;Tp*}|;$=1PXxOTzvwK*3CLLEd028PC&b1vUN;{cE4_5569p?HmzCPyy0Sf(c8
zs@uL8L{&$snV*9&R+bgpbB^=J)G4x1_e2l`-p7HG$}Eo)NqXm}FBmWfyt|=D3PPwN
zH*AlLE-1s3LX;VW2q?>#TkDm&z)n?F>))5XhanEErx_P7PbxDBo`kzf#{C~q-k|TR
zLG62YDHFvD3uNe?MJ<<BmDF~;+Z`WR$Au8bipVtCp^#ZV>4=!kA9&CtV_disK57t<
zhCE0VW%6a_-gaVX$*aOc(D*69=dk5fuQTsZn!T|3<p?MIv@|wD{?wybxaTyf4bLmc
zpl_z`ha+CqFudKOhj6%$<;0{kYQw45&=;;F?j3WKw7s$R4Vyq!@|g9o6-XKQV%O;X
zFu1_UB5Z`pPVowjwss)R=l!EEP(3BV7Ms43L;HKDH{{t_oMvoaEX(WjDl7K}YMc&>
zBcOs^g;5vKk4TOZ!AC*~Y4EB30?Q>UL_%MM3Z&!qn2ZLWO#02IcbptAVyl|BMA$`V
z;a!=OeNQoTR)OfJKTc^=mDL|{SUe93%-8l735kQ{BgPO15f{klyAFTv03-%Ib{JR3
z!Y&T`xw^@~4fDjB6}Gj*kdG3ULKY@(`*DKOXh)ie8&N#SZi}PV615RewU-C_r0#ej
z+4`yaYKijwI_$Ed5&<G}>&B|9>ENOnIHZVuerLXhKRBjDNrChAnm(MMtEGffc7<&A
zEC|*QwlTPqmQD!Jy5|8l6p-JGZ~95&QQu02633*rP^SEr)@NbsP)|lMq!qSnH(HuH
z?;VMw5Uyz5Y5#!laAwUFmY`NR^L#9gEgVvvEtZMXLdMEnk6N4hR8AGT;Xiv$+ACaJ
zZa8(dD`Cz#Ajw-8l9ro*iUK}m(*E7}tsRGK{-=&L<`1$^@fJW>ZK4~zz_uuPMd;xN
z{<G9CT@U;;lxA?Ml{W`rnZC9BKg&?RMv9V<iY^}bZIE6Tk*R|S?y+H1Y>4H3PVYdF
z3G5{)B?^#uCH1o;*T!AM7^G0dsw7O)x1pk+5)CV-9oVv~c`CT#HClVe&u%D;OWC8!
z;4KQvsoWOWMCNu6RM5-2cHFx(o>-x84}0}$H2h2L=Hub9_g&`SuQ#uJ9sMb+&$3rz
z@ys#%T>YZ(p>I9Hpb4&bL^&r`oWcLsc`Q4Yp;&pRib(+bDpFFA-iKX{@^=Lr%B!g^
z@+fOGJ9<1=4hZ@pc}5#;hw!n71r8W3_?5$=O&$iAt<dzpCZT@sxNDLVDr+){GiYb-
z!BzddHg{1(yB^zv*T}&V0bdSS2(sC9?IcMEX7G;}hbY9Sd2_wKBE5I9CJYo>5rMDu
zPUE?f{>#0dLIYVo4+>!is6WKL6j1OWVK9&R!$_b~*A}6QZLUM^fV-c^>+#DS3pxh#
z2T$G@0vcY%9<6XJUD<a%vGS9A@z+lAdaX((jTStZ;qD<H)(FkQJdvNnhC9@m-Go+q
zk*l~^74+}fK=L}<X;0cj_)fftn4T37#L7KyFhaYg(W~_DRlJmYiV#70)Jjk2|G*lr
zU=Z+3#BdVeul2E{0vvzaJC2xr5X7?W9|vUv`*SD#FcrE3{q|F_3$#zR;(%&uBX;d!
zslTZ^@(jxHsn$sIJpx;x9zy&rC`iTHogQbCsAC+%%WTf9-7rXsB#!cE6K|*XN~^@u
z&WVP?ho&C66c`$ne38TMk$2_K2ESJGoYbR=o{8x;%6`kIKNHK4GXqn>k<7BVC!~}_
zJp-DzdKeG^z>^VWbx2sx+I@#lGd;J)6R5jeP;X!oC!G3X`7;z9B1LO3cvtVbv|wd&
zZ*+i0<t0}|yzT7I&dV{jkf<DDelC63Es#j*JQ;ef8{Uw`1N{e%2vE<}W_W#*q#{r6
z77(TI?iEgS>p1YlKN<p~dPsOeje^DfGd|iI2veWRyYe3!h2{+?@kO(7MLEl$HK!Iv
z$c@VS7B+jF3h+WO=R3|b+<KWUW@qoHx+-K}%L1?V+iVybz>r42&Kt}?<7%AyojN9+
z_$je!ip>t)kmlQLTJA&_kijSS^CTy@R*9Aj3ODZYo8R|c(@Iddi5+dbnG#Y|tN6QF
zibCD91f`ySELU4Qb9s5<b=j0e|3b8xdd^auW0osg?77lu0_YC4q9>N`fAd!5W5*<d
z6Vgxj#*^;vTm2gmdHJ8W#@wEeTana!;V8vi8>v#SsDMtt$y-;oQFUHM1B|p$VHsxE
za<)bK-q-4Nu|tzsrwyOiI0YtbZCfeMdP?+h()x6!n=wA%!z6xV38p2bc%^ZFaSTKT
zJh-&9gw{@UrmeXWpfNX;hq28i+3%X{pS8wAu!m&TMVjB#_JYYl;&xN;QB$}cI^r2G
zmnyXg)6we`XQGhTNR~t`Txx39SwW*Aejvch8Wlot{L#?Yvc`Di%~9=BO9*W|rZ@Rb
zM0>JDsQIT|!a=ie^J25%==D?*Nit!f0ih83q*Q{Yb^yKux;`79c)0Pim;hyqO|DOE
zYlB1ij*oMkB=Z(YwH4yjZ0dLQQxRW4V9zfEQVg|_0B#}in8c{xpWhLPvwqkdw8*G;
zq4Aa2hJ<=Uvz^{z6KkSeInUok1(Yn9g_=lVx^eh2nylCg)YfSHIB&MRkT=P&0ZZv$
za~g#<!Dfzu1kqxz7qjq(==$Y@9R0dF;Ul8*qST$3)_Q_brA$I!T1N*)saS;JSm7|P
zWRXtP`k)`*Z5y3vkH0syup~M5$n;(6EOj)kx>f!r;xc`~SR=P8veL2WX;jmb$uegy
zsL#&hheNF8BDTr1eR@MhVz-5K9EUPN99HVd7g|We^^uAABRAA8k-YzIA3VJ2WT%0!
ze^onuNI>f2)4NN~_vE>efZ@86%mcRygv(|F;JC`D4X+_d6usfM+UPSl)M|lX%BTZ;
zq2l))#N-mlUs{QOSGIU+BQ{5LcsEGc5Bk%aq%LsJGccT=WzC<3UJC}gjMw9+*~;rB
zQFvWRAw+4){<7K#ud{>sjNJ@ZZMkvvQLo6#*eJv$muIX+jt+gF0qQHHrV$?oC(1v9
z1rsW(i{~;Rr#>{DO<=n?%O~_@%0JVU>-iRv*B}qd1mP<TToC(Jl5cOpX*A2Izn#{e
z2Y-ACju9u&IAdgv$7wHEix;;u*CE*ES`5gY65AzaMDCyWuq}wCS@!Px^hB6N6@qL$
zGxTjw4&8-+3~sE2pznfX9B{(RQ>StYbD=)l0yzA}nqImM>9TbiDTi1_i3sCA=2sxb
zrBj9+Ik0~2hu~n&B9;lbsd>)sBTy%Ve9K881-DRi(I!%4yo(i+ziXfu)N^D0ekfj1
zX3e=&_APPr3NrWJc$Z@)rnris-4tZ?8wU(FDscW3Jis4X?Jp@CzwHYx#_NVB!K;VF
z(;w+sSJPBbDa^<hI)JsTWzr<gjfi^Ipo@TW6@cLH@etTDYY(AT`^FH9EcF!AnXhev
zv_fPQeN(cWH7POU=@hVufyFCX3O+fy)~xwrR3oD-C=GFl>2>I}+Vga(d3~V6s~X6a
z6%PI6@S?v;wHh-;f%f@lK3!GBlYApG6&{n)qC1e*xDD~bX9Rx=asee=oUxT+5TOp=
zLh8JG4|pSt<iBzEO$Q+HUE$Vn!^W>flJkO^P%*g_V?lxq;cCwR7hF*&w)E1KoW8?W
zs!b2bcb$LXCUxarNvMSIU~><3@q&KOgua}eKKX;seBT!(fsB*7r-eS%Y~}AiKjVk#
zq78``W7~y|A^rdW_$7J!n27jK3Ic7TC=Q($gubw~iNSWi#4!<myc@z~N>0n4(1WkK
zRd<OOEljneYUKZv_F!qjpMNvHJ!4Qr4x#DsvDp2u>?|<CxCMkba&CM*$-QH&`1#y{
z)&*gu%MmE@a}pOmBb>dZSI^0Om^TRW9@3K};lka`p|6V1bCV)_G}{olb9U1be5q?E
z$W<JX8RpX7wOkHfKUeOaFNpt|&zJlxJnhv{kP32Gwc7(9rQrr8OJNG#Y$I1$ZU(DB
z@RA%MMX~~TPUT4vX7jk)0f+Cm=1Bf>R<s}x)Ix)DCUR;M?T**1VQFO{ZHF_*qY(NH
z2WAaQlf>SI1Y_`BwUV7`zB7O#;N#skEb4l_UMLN!ZU{%?osPxcXT3?yRdn6W5NH!C
z#}L<$>{s&%yeXo4$_IQs-~$!$F}P~L7PYcHWs_$?=u>uA!<3=Wze7jgmj=4}pGdbt
zmj2@ZaJe_)O{<&esYbk(kaSh)9wY+DH}(Yvv>ulEcAg-^ThiJu8jX91c#w=@PHsP~
z+R%;}tn@yuiaz|pEFE+*^i*9fobn?U-Ao&((k^;GFo*68uh4bp-bK~ZTV{2U>$dh8
z-%I{!$o)5@t%k_xO7#a4e1^m8kh!xMWk!kK)HdzU+t(((p?JjwliETnBhhdC*M1tj
zcbV0Nl7SV9ZsHh0-yNE(tk8op|Mevop({nBNQC=kmiOaA=fHCwqDt{1Rc8Y*q2DAA
z@p8uT!LCKPPy<V(>d^IlnC2|>@Jo?4n__NEk$Z(Tw%PsRX$Utfe?Ba-MUORYqfu-U
zjO;%ySwRF`WoTTxAzyCPb$gC(Y#~!EDb`!hdE$ICrkY+~I=MF>rP~jN+!L$tu0cB;
zXe9JCu^rs-R>zSHgEz&wvKZxk@m#cxBK}Y?nV81p#12SInId`x5u6taTPKe;5V7;~
zr{CTM#w5Fc5wryu%Dz$KMLg)r)3dh01}Sj;KsQC!2+4)5IN}u3`QIMPrew^tPI+)7
ztl8L8WQA?|RY-4Og{5S51EZ9)qVc#uc6}g*EK&7Z$QKQ#!V+%cR16rSuA0GbsrIY0
z2p0^q_kf^_+cJKA$WCOE5msoGjEhV=P-E2--43!7s9?{9SN8kCec>(1vAtf@-|g!u
z5JWn03lB7=d_=-|yC-bbA)<({vlQNyT&K@C<H3N$L=!OR!nw9?4mPwT*151axi)XF
zD?DzqfzTY}ESq<kfTodw0QWMF@7;}<KIo>V+uPM*VrU(d%zi-c%JlUd0T(uD&M7B;
zh?N_T5WYT^7AgsUs(i0rk`XQ=I*0drD#RDfqbLhm2c7aB;3GNCYgc8s`4NkCyTn&$
z@pc5O4d(tR2rmWP&mhj1!s9Jv3#M=N;yG|{zBvLzHqObnjcQh$JH^|cY%E6iU?r%Y
zDKJGucsJcI$PZID*X<s+*RU|z1Axps3uNfL=(+j3PNABtxdP%0JJ`BWc<3>U^0122
z*hgYGm%AAhqKinwt7OGQ<bQ}yP&JzM?l=J7*kHtRNAMJG>MN2<Jsd6s0=RpPY-7`{
zUK8>#(5PfJ3inX4vc7YR@8!oun>TS0?!b=-Tr`QDc{?2Y#XSdw+Y^Kxs31|G{zSF9
zlEVF*%OSKkKp^5IIA9k-Vdg<?O>IA^gHQrlHn68(pnVAHdu08;9vWJZSymtm*8TmV
zvQ8ILXNB4wC8j+vT}{>zUcFv}gX`HI(R}z@Me=|%65ZDxh;?<t%~T}y>~rwn@5@73
zGF&<iLXd-XNVPkRCdEN3>_tA-i{^ZLE<%db9~03E%JdFt1Lnl2oI!9g58LyU>s+lH
z{lxZYM!8f*a>KhzW2O?-`k|{6Rdauj^@pkDo_;doDfCauXPam(&!)1ABE)nGt=P=i
zk4T+)8A?+<2@eDl8G`YCllloq5nlm)6%!t8q_)zvn9!zSb-$|zV@_}CC)FxRkRaa>
zV_ByDei`+u1qvyi&7Y>7R$w>AQg<0?um*PRLO;tOpSHEK-E?H$-kI=Gp&r^$bt32{
z$Sfx6g=fP(1|V%qBuB&zsH1T8pIs1@Z|$Jz-@wBhK_yyO?rk4pg3q6vJnKCy$7$|w
z@&Lnf?@Jo&pYS#$n;6}^$cP>MQ3Fb^u0BeG*|kH2b0nxvf%H;)jc=LfFIOQ2=2EFp
zMq7kHl#U4$2#o|(Xk!i<05>>$8@H$5J}egIm_>ka&sCe@z&Du`P+nZ@0Z$g4&cSwn
z65!L_q7Lj06Oc!gYM3%e_0iJFhcIyS!pgrLt3d29v&GiE!XUM_*994*VZ7n71n7~^
zNP5tNKBZ=vk>CPhn}$aXT$#`-wL>TVE8j?gkbxlTx^YhWPU13NkkC*-M3Duhhf`qk
z9WPOIS+1b?f*;gx^Sawuh`G_3Riep<o9&B0f~pB^-)nua$wmgIAt#upUc}uKIW?ag
zTYDzGq5IYDRvxhP#X=+#bhacu&>ZGgDD7<Gum64MJzdcDZPRa<s2d@RZ+%Qsw4NVN
zENToldN{BJMb5AviLiR(jXW_L$WEA(weViNf27XBS=iNp*3D`|95nF66MIHAO{{i)
z_vpL~Iq?wG$WX;)7T+$)#)N)5w{7i_aP>a(xNssb&d_y_ABby{?}$}AOJKByBH#IT
z&WX$#&NOu+A9%D2;XX|6U_GdrFJSaDD%a9!%THSvh8*V^p(Q39i!D_m9r@%Wh6yNC
z^eg=q5ksi>tmWOBXkq#=r{#i-X#)@;{^teer%ze~_RgWoti1i+(lHs;7UTck1%Mrr
zUT`Wq{CS0Q03ja7tlU(+(&=1^@jB*Ux!R16#BR~4LpyATTAKHuzStkGTHvZKw^tj5
z+EsktSz-Ep$%T!P3Tb5XlvmUHx|>|Z#+JEs3&-4T1c#Kk4DSpcs4ot=MX-FkLd5kN
zLfPx@dMpIiN)NA1Y^;B?R_=1wm4!fe*QJF9^mUVgX(9;2(W_IM_iqI!5Dv*4_)=oH
zHBT6jzl;5=-Ca~4@5YVmE%0$bK%zdEKE(|Q?fdXc?@R1Mot}{1>E$v=fHOn+=JV!Q
zyy`A)tqN4N3>9RyDAUTA$QD3vFn6it=SEni^{!&mvqEevI*l@nC%@09y*1as#z%AF
zsFf#medbTa0Z+zzDoFRnn8nxs4up^oV(^&(^=)84ab@ZPrmdd5fqxdUIRJ9lH(LfQ
zjKE4Z?dx>aiR$F<60V8Zy_dHS4SuY%L#7*JVH^wJVP$(#da_o&Bx=3_k`WZt+Z+PO
z8}&!oqhVKylnm)#J&4^=UVJfDqn?#K8QYRw?!WIGWcefu4xsiQ*WBD2cQ)Z~1!Yrf
zSM*)#E2rMmthdhv2;>iE*O?aXnLPf$9PA!aYYgZT<s;T`_|4bM(w(@Hsii`tH}e5O
ztwG`g*M#>-v%Ui#f(|gOGDInh#aS#?&~~ZqOw%#7uv&UkE;Nf)X*bs{v>ZBDG*!E3
zUHVyIb2Aeo%vE?(V<p}<FK4K}16M>noS0#rPEfYFYTkyZWZtK?Gr9(BY9UmQ$w#r6
zLlOppC7)!4H`pqFc$7Tgpw1a*!=idfYa8)@-c(PW`Nmy}kA8|?aM6}90D3C{J7hm&
z_^<A;&KE6u?=BwDl^s_I`({CQNF*fS1Cj*lxmaKJ`(>;dn;se!ruAf7N)Yiy`heAG
zA8Hi<hxeD3Avy0h&4lz~`xZ_!=?^+~qcTwj$pY*kxL12`+M?~g`~IV(3CZJ|y!J*7
z=o3Bain$?0050PrFi#6`*Jghuq)?;ILHe!$(KQ!fuTS$osUL3uJaFcg-}9Di>PfK5
z{6@ZtQ+Hjhxx60k+@Vx3Q#mq^y>_&MZamoM+PmGpCJZ(b!3H6DflA;YU>9V;%LQZX
z->ddTzmR5|Dz37LCqiyvi|q;ZT%;5Kz~WaXKg`!Z!S$3LND%gUdes}hJw#9JPSag)
zC`TTUbs2=MQA51UvL4!(ySwSB6HGR=aXg4>>clZui@!<?tv<t0;4G-2lc2HUtP7=~
zr}}oD)E)OID~bT`Yg6Iwl(R{CGEo1wqTe^d^=t~|AKp_*ZwUi(+KkVNWoa{1*X5p2
z8VXmWL8TVo^Q@n${C^(J;`EZC6@58#X+k!*X=q)E!@O61;zwP%4ky37nyLKFGq<^C
zA@E4QlWq$b`eo-CS`-+c50%_RG36p0fPLETpIK<ddFQ+vBX!%RR$OaVR+BnWVjoGH
z(C5FkK_I)%*gu)@xLeOKb9T`Wt~5^HxtCJ^(F9Q1mv2b_5)BANW5^dme9m@OrLvto
z_qPK~Q6|icB3^w^=+Sgpu7<yMGDwZB+?O&njY<dMZ?@wg60R$)?cG9pTaw8bHHYIG
zc_?u{uk{cbM2<M(4fqzPU<Ad(BW-H!pV;O&SqP7<2l4>Rm%<y}5FcAM&k@fOI0KoM
z*rZCn03Sc}tyoxXxW|_uzWh#+_=g^Yt9Ou4v8%;dl%RB}oliOsCn{j4v4FKaL@wnv
zzFPVb9h$2|iQuWN50+;OvB@twV77ZIgw3)%hi#E=c-b7xX$cXbq<hgEZ3AsBi+|HX
zoR*~U3<SBm+gbF<v>Texio4U>f-aF<lHg~L)IT%6?z^HQWN0WwSG+-a&r)ffqkxeT
z;z$qifkK(WUCo=CENu&~K^Fj(qu%ae(^{?%n%b_P15IPI_H}PmRdI&Nse`V(7^{(l
zMkgUVMi~2j{UZalainA`FEbG|2Oh%2K)La?<Oes{pkBdw{Q*L<)bcpUxt~Jt35~NK
zCoz}ZUGiH$K#DSsX{TATrjT}~K6wM}9c(Zv_BO)HbK?ZQBJ5GHOK<P-85(JNN+n?%
z8n7-R)T}bjNXo9NxYn{^|JkkiNjHuQVx#|q&kHEc`ULe{gwAX$TxD)e>uu?8$b$i>
zV`V+58)2EXWQOh;u=OXBV6e$F=zcnD`%aA9!kp|$Mq(4*|8|$5I0eo%3DB<xSPrku
z89fFOAQV!QdF@lbd@%5=D0sv2Sd*uLB!8MV{{S8KC?CY!Hg9NcS@oMBkvQt0a$3FR
zr|rGJ&0xgZzi6{B1GSJjK6d7dZ9FFXYghsg63=``i(iy#1O-1xPP@1fepSCwDZT;d
z<;z->1IrOq@eUpwt4}GQmN8CDGC;f$yIjORNZ?Gex2F(cwB_tad0B{ekX`}CpY>ZO
zM`Kb>pQZG6-di`iE^6x%;ePO8@J9C*G^L6D)`WMTTCCK%Z^MA~uoMbS0+~~gmvz-*
z!QHr#de2YDj8FpwU*rAKO7sg(g_owA2@qaKvO8G~-|2=~^X|c_sD$ByL~D&%#*?ZX
z7pi)_nQ|oOT^{>*rg82|g%$u~e*4$tmftdzds`dAL&^Hhhp1_O@YoHg(3a<s-wXUp
z?4B}3Lu^dlar*C*l)XZFg=DRbx>+HW&^NVF75Qo|XL{iA1b?rdc^03@C`(?MS{*NF
zL7vPwre+4H@ejGD<E{L7-CQLLIH1HO31hT8i{R>$j{A_C8^XSc<p1(Cd>D!C!<*00
z4;QZ_i4@vFmHM6;c98r9?m%1!VofGhlcT~!V5AP>cv?{>gB~~3cq{xZ7c(-n;UTZv
zHDjP7pRTXor(tfQUTM5a(SOPc;AFbw10EoB0xN%BM{uVq2cKTY(6$i3fh9mDw3kwU
zn?IQoD}mLjOFXL0RP2Qu%WL|%K>1>x+MoN%57Gv%43J2Mfui)vWt+M=)Y)oAJb6I|
zRBhfOkhjIH%LDj<-)smG2k&3g7A@#m&N#9iqk0G6R{8xRQ(RfXJCz|oZt-Sl&;Wdg
zg+yELsk1;G7vT_=Zi1ePWg~Hdc#W8EC85K`INmU3`qfu1;!z0&Ab}|ee{JVHt5@A;
zUCmhm9S`{C5&wAgW~_yn_9hE==DGFytH<<SoqMft&1tas!}mRB_tfTm4jhl*;r9sw
zHDg}IuZlKObP7HZz_9<eOziIp?xRAjndOgo!>c6g(+Ktj3`nrUaPU|vt8Rz7Y{$M*
z^Jf|3ggeAXhy;sNZKL|%y`b%c+4qC#hH=c1;}(Yqn61RAXkcj<3B<bG441g`SVBz(
zT!Stq{Cv}0X&@1*VGiEN-l7OuQZcRBsfbdo+PQS*E`_y_j5tY*Jw(4p<usAzAVjl-
zMnE*FP%AdH-wt4@=cyg4J7Mpf2KUq`gkit2<{3_j{l_^66CW;Mhg6uIrl+p*8r9jy
z?)<jxA2UjXyvezgeIMF-MS=p%e~1^L$PM6_1juYc#T7prv?|ad_QOW7IqtGM?2A2Q
zM}z-ckXZ%@id#PvE-=%MKPCkbpyeQee5>c?%mgms`CvYiU%Ld}9U>6L(1lT7T1oM}
zh7AjlVvxx$sCQk)mu5W;QYEG^OQe5YmQ0YIVi!Z^ZSnAUe@a}YfPb)u!S_Zf(J*la
zA*D}KatUc>s@7qBctk^+6<?e*TGCp)W8!#<%|YnADBZ&~!(G>r%VAtr5Ql0k7Cloz
zXrs8%?0E)Tvcr>wELc=Zq1b0*G0L85a({48NHh>iTL_TZPWpGV>y|O5S}8q9<zSv(
z+yBCnL*B=ihL`AS)+HqV#84Am6W#(fkux2Te`i(U%a*vZMl5-ryj9YPz$!k8evzss
z(V~@b9x2KShh(wNCh`*i`+X=Fe8>*BSXu6AY9o$bI1McBHwA5uytPIs2+U(h4|Ei=
z6|rBipmjfP***%#jU%LAcbtVS{fTynKQVTLsXfXqOLIvYUA)pj=i7;pN_(q=LuYWL
zQP5fN$9&-q${v^s$>ZSiCQMZo^!D2DRu_i?>qeOk{Ey`n(1tD8;MWG(nRCuO*W8eO
z^+iml@KEAC%U3xRRy_miPj%0C<g&<pZtQ;^7eKL*v}u_Lg;#Q-5rd3is-9eOW_FNC
zZ|^0(WO6Odx3j7z4|&mD6beRv5Nj@-Ltp!NXZXB+oXflUgUlCiVu^R<+Z@zeesfUI
zRecqfo06JbW(#+twd6_}aH$!(Y7-1yXs<K#4g-J-l+Xfha)0;rE~x^gEo3&7Q-2h1
z&<rnv@5!Ud%(5luo1lK<MB&ZTN6|lyTsUD$g5^^?@CF1DoFse^IvY=UQ{`~8%F~<q
zkxDjWYck3?SV6SCcYGQ%ZR~VlVjvNxGkDvoai#L((d+@c<Itz2F=Dy_0vy&i_9xA2
zr6#Yl{?ipyKmk067B;zJ!dCFP`tn7IbF&*`!4P=_ks1O?h*xdTBnBo8ME#Oxw&c2r
zEVOU*_BDzLSD@5K2RaWnO>;Oy^0*{p>-)a<sQ-+?Kkhjg-x;3?ANCeJp(-6}4Ax*s
z5L4FmZI1-~O~0vixK_8=NR5e7sQ?0jxRw`8JhDB|<XJNAFmv8j6@F(vow7<YPV+KL
z$caUToA;j{x_t}avEgxkJI$YP1vjx2K`mq}YX}YFU3UgG(RAX&oQ=ip(44VlsgF!K
zRAp+F$wAaY`sX^+An0Xz{piHcc{97<TPQhHx^i*S|AwQ`E>aMsg@OVoBvzn$Wa2=8
zWtfT3hCiEE52I%DoNNkh^k6J>P-a+0^51I$VU(tDtx`a64?1ARE$=ousO?a`sa|0z
zTHQcDRs3GcBamkILB?~Z!!VM~YUCHS=hHj+*K{`mYL&hr$A}OqIdn#6n&QagF}Eua
z^5))CbP9xEOs?sLs7x@dH}_ELgy%#V`EP<O@Q&$@vH~Bm!n8V#jv<h<z+eL*4bEDy
zzcqXOhdjXW2c;fTXw(4JGV(a31oCP+IS3&aKPL&xigHUdz<h^Cr7_<|y7um}9h|TQ
zCb)(^pY^rHoB<~kW=b~P%_R$H;OFX0tL41whik4ihGW(FjK+(&{+n=t4}9n7)CKl1
zJkSu8?t_oK0`YrsED$R;&5utX=&Hb%&G|RH+sOu1NmqQ<u`;gHla^4}u=$gM07}Oo
zCCv~hP!(piX&pYPona{iHr_x_%t9fLX&_-ku$D;^^?W6NIY}!OJ=hBQx1#ws9&0d2
zAvBGI*g2X1js=cBNY}7N@VlwW@*eF7{OTnVuhe$}YxAdMko|>BH&%pUmGD>X7n=>g
zNGn{Q8JU6uQ?J6-9OyrsKmd+28^3vcBScH%Lv(USQN%>k64b+$w7pP|mwDLnDz|#f
zf13g?Kk$GR%TWk%b!ZMg86}JuNk2|5Sh66xYJw3IMG(Y@^Th>02rm+42NQza7(R!N
zCYlx5F=0=aPa8tgA##ct2IA-_6xFK>;29`HL(k=HosL{Cgn9%)Pyqbzwf-~LdqxNh
z<EteJ3CO=PFUS}}yMl)Ui60JG9{)|d&rQQe1`JRYAU>VKE%KV-ORTVxaSt8qiw5A~
zrwHP+r5OZNEiKz%jwT{&@OptgIt0BcH~PLMP*J5<g#E|rH*`)Cx~u|RA}6BWw`KHy
z#LX*@kT1yFc7ubxE%@}E)3ugBjSs$-AB5TX$-7$vmjFu`+yZ3&IiP`=h@Hh1SrjRI
z#ESN>SrrJ^NX7cv<btu$t<x}I43Su1nz|w?$}qA7MAq(RJyEt3`O=Nn1i|xvOjYo8
zs&^8Skd%LxI>;DJdyxA#KXz;Jufd;p^sKqvdhdpXBa|sY%+H?34)1^o9136$kq06c
zr^V~oApFWTlC;pk*ie0@3retF=}_ax%J9cVVw<egu1)3t5i(bjAn~=WsFt;Y63or@
zNk?>4?w=d4yR)&}LKoC)@s!$;1kal_R~!{)SAq&0bx_o-hA13lghKk`*p{^e4`|+{
zt_Pqi5i@}*N!()vHSZqacVz4@Z6WtTdY}IA>3{kt{~~$zE0RCJsl5CR;HfI8a*L{K
z>%dQqOrMG}cR4o9iJ9BoXy-i9;5o|>)Z&M2VU5(CwaCMrXOZi>7kVIhP?I;&0cMxM
z@He=Sp^>~eHIE0XDCqmDz`j8V^@Qv1CRW79TMLk+Z~j?8F!K39Ez~QPcZcjw(&sSt
z3DA~&0+%ApD=gg!y&b~~rb2oxl&a8!f~1WzqLVeXeO}IW1eJg}iGo=XSs*NT<|RZ$
zq>Xq4EK~Tv556rnGeHMxuX5u5NkP8^@Wk%d*b34A+MS|k@Rb(Xxv)fWJDvpc_dJ5s
z&%AGfmpErv^A+>QHWkaGAP?11+m62tBqlktJUBaVe2kwG$#t_O1ChD_6;gr@RN>M0
zWr6l@dc!mG4(qnN@q4MZU5DLDQ~t}nef%KHwb(T+h&A~;1@5n)561gqa6CbyWt6;7
zzX!SBV}%o^tR#<fJ%4CM5%P?`WGNwhO)1bv47{<TR(YWL<bBYaSc3fKN6$Ayz?OH^
z^imvY>l#U;l`@rzrrGZY{(t1}?{tH@mBDc8_kuSK_t)CrpoOb6Cn@vBZVh@veC~^#
zp|s9$m&B5suvv>zny>`BzeJyi{Bd3B$sQSGoHoLY1^x(+F#0|bkc>Q!#<s-a*hbdJ
zf|Pi^QvVIDxn!XYv`0AU3TFBB$VRE!zv({=7nCrdIZX>Fw}{)$Y&o(=-kqKZx%*~W
zi7@DG{zD!hfm)6tfjaVer^E%&60OPh#b;Q&3Jf=_wQ!QPtdKAg$mO=$jIbGXw^&5X
zkQ}8-)jPzM0S<jeKm99v^87~dSzk-s|1Y4PyaMXF5I5%^ssljhMF6JOhx<^1BHqIc
zLI)9KnJA%Sd9$+ScfWf_x320yAIN^@{4E9*hFGIf4#TQ|)fVreFpsDeA578>sWvb+
zZAxp&*tW^aI&Z??mB@dlAtqI*q`e*14WxZ?y{2fRbqqy?=8p$&um27tSkyRoY3#So
zM)*XefNZEOAl(symGr<82u{a_BCt!bYd1LC;e|FQ2#!U_EYV0uL^Obo^OtxN0JvI(
z?MDeK{mubDrRqgx8SzsRWM^5lWG;$+{$Dct??b?JW?u+M3HmK`J;(7krd=t5xbDp~
z8~|DDy5jzgM>d^=_Ts!CJfXSoXrbnQ$)a6r_RZ{p@BV=uww#2ocu#~N>!*vku$a2M
ze+s(ytU+Rjo=-*aU96%<Ce4d-4efmsrPrByhg$-25Y3rB&J8MwO%lSO<F|m*CzAW$
zs{e31@W@>AK<z**V>gv2Mh@A~Y!rGGRTa2f|H(NMxW298XT|muy@(CTcy0eQzKDl(
zHI&zJlkP)d-IG`X7B6DVQ-HVr`dQI)K|b%99n!czA8mWFl-a<a;uc9E%OnhyN9hFN
z{0|*O|KXIck=+%5WDWKA-Ao}vX;RzGxzRiJbBs(~q$24nQFY0K`fa$R(L!^#3L86I
z09i-gRWBK<f(qf=AxYC)&8<G^W~FFE6sD;GLf~?R-$zk6f=Xp@8@qF7H+R{dv^L%1
zo0dL;E8j^3WdT%DKZ-Jp19{CZ1mnxjq%pxjgI(FWjKW$CjST0_3Msn(AZCg<Xz~gd
zpCqb(?e+;+5_-E-;I$v21(`DxZ)M_NoPur#OtnlM;R(`<+bZ!=N_Q1P&p=B>0|Ose
z<g0K<Qo6HS-R)58Zr!MM>H$$#4`OsqXnodyrghR3<UmxK61a6C{9l3L{~8$nLvxUS
zM{-FTgrNFCkZ^-Sl^kDcBi<?_`K|zE1kyp43QP%8gkXAlx78wuYvrV2j-6WX&-%P#
zAYi^px_yGzPmqfgHKN||>@&qfP>Iw0_3H=Ej3ak^fCnGQF|+I?_6g~4w{3~YFI}uw
zql8zBAFy+pzuLJ6(R|7Wzpf1Nw@{GnHMMRmgxV?eI=w$D2mTrg>m{ixI2Wm2plS=J
z`XwVFxX|FQ4u!S)$^{M3TPL+MCcN}Fod1%hOROM7(oEznK#Njq^q`HP@)+XLsbC$!
zh$6>=59N-OVKLx%?Ay)*6=?4e(LN0HKXvubHg-w)x-%2n{*L;K!`^`)`n$UFSxvMJ
zu@<IhB4P*WC^Tr+ca(1rAi^sh&}pq+(pcC7!FAo`Y!_ex(?m|s;S4&!>5<@65&MsZ
z_ZJV;L&Za9{W#uKmQ#sSAUZ<TH>fgyB}5zI&5P248YX*qyrWs_8Jxr9_yZ~8Ba(4S
zR?RUtfQx<BWz_W2Zg8k>;G3hiZTRvn<Sf9w!U{3Z5K#q5Wc1MlUbJY5=BBjYUI{x=
z^o7!xwHC4Eg@s@THpBm0-i$n!XqZ2dAr9;5*r?gY(Jg4rEaCl6Wc&h6{}GWi?w_i3
zhw6~ovNAqB)o>E1^&qY<=d$6iL;$lg>9xCxi?=Ss;fG4+x1RZsLf9Qnx9R}4oj>d1
z9S6#E{scD*xbdt`4Z&6_*qmOaLKVpw_s+H-IKoL0>}TKwo!p9earJ}0>NC^0_wnqM
z=^gUFi2pYOMWOiRNgErm%ZUHRADD`Xg0Y^yX4GwI&C@ESaF4+#%BBWp=L#n1Pp9}<
zP_sIAf)zBe7?joHz!HGsXPh9zgdXavXc@)`V&zUU`0Vthfv|d_zftVS(xE7@9R2$D
zJW#^9528+J5<!t*>=_xbV$lW$h_ce8B4p9R?^Y8Th^PJwhQinf>)PBv3;-#z^kzj5
zSJbp`r<h69Gg;4@S=leDL>siV>;bMqQBQL*EOXZ%pI7dj@o()7?g_(@Qc*{IZ~(E3
zJ?sgl0><JxrLX(yr3A41V#fapWilO*YNzq3SzMw{`FLm&Y^v3x9xq@qYB31i@zI`Y
zip=+bF4Rhihh!o{xZ*Q;+{*EUIq;HFgi@O*k;8ILb4<tb(!pX7`k2}yYC;fiAj{k@
z3gGZgaTUFE-!ufsa2x9|qeRaw@N18QXM3+FKJ~L*)bK@uweE~3owmyo?xUZkzbUk6
ztZ)aT8=GjQFu^<Q@b5cjnL<5CoD3Qk^*R7Mjt-0Dv-O8WY+l?X$ecXaPe}c9-82Wn
z!2LTp+wBh`Gd=mV*G+RZwltT<HkIf9@C;E0%#6O_G8OpKfUOe2H9#ln8V(ou2t|s=
zS<VNE0UkP)+b?lm?1C6bq7c4;dyauz)ry}`4$Mm%q&~yE-LuMm8@6YQL%+O2T#@v@
z8uH)Ra14cd8|W4aDt+}v!N0mx1a@coX3sXm3(TUjFO%%K`q#tn9i^+<PhwinJZL~m
zY=%E8PV@nN3uv;rIh!wxDMMu}2K0ffx@*mkhQ75Cc|oN5mye!hGOt28MNh^N%d{7+
z6;0r}cmi+cOk;!O%SnZ=S_9I?;HMjHMT}jG&HNz|Tl|fyOaf1$7GNc%pGu(3C{R0=
zck~Q^-an}_d-hVL-^EWh2{=>@82vUVbM)F>(TAY2tc~m~xzq|5_U8VVMS^fpFj5B7
zZcCX&g(UbZzNMIYb0w?ZhQ2bLbVOtty8hG%Bo;BXQ^JkGuxBT%n&UD0CrT0tj=uI@
zyZC`7yAh|DkZ7psk*+}m9KMysY)4iLj1`3l!=13tBYB&*e#VAXUm5E64tCGKXsNcA
zarZ}f`rEV5kCs>WUT}`Gdo1~JQi2-synmQ;aN1(c{61<A)e)QDGZdS*@@?pKyVlvY
ze0<WVh*1-67@h|gq_>@<M#Bu0vf$Nnuu#5$Kjtc1#eSCmq0c=wXp(tw67hlI5n@)%
zjxjqiqL0z#x?Ww-=q^+zL`5Ya*!o9b-IYJnO66}ociH9^tCBHOMg>U8QMhfJ7HJ<n
zRYNU!^{mu3rG2sR^?{GiRReuUVDqH(Q6Ah+2u2m~hioM?#g+#Fmc2ZrasYiOBN=o~
z5XoM*eK{AXgD$@oMjzGOP2RDcJWV7asS>|3bTO(j9zq{>k6~r4Z3~N-xlS}_z(L_3
zLiZ<Z{(x2ctFdPgD4qT1r%2Itids1b^DBH3!vBw}s}75@d)lyc_X0{I(jna`-QC^Y
zwX}#xmw<G4cb7<aOD_%5()}&I@9(QW_qy1JXV00r=brn_%<;CoZ6Mx|_Kj?$?l5-U
z%Ga%DEVv?}&i){k4|^*r81G~VC%8ZF!*#@DZ;6Yl(R?OyE#)V>!n~IY6%zG%DmMcJ
zsZW7t`8VQ>XqLxcOMsS7MGO(ckIj9)UH$x^K|(D`puwMhfQgk`5xC9T3^+tj(lhvc
zfN%4m4TE&{Z&TL1iNss_*X99Nbm4bt#YF@wkOBxMb9=fen-{9?j!c(GtSr|+OuQ-_
zSdg!Y*c^-8xv_i}?wdK0uwS3H%IZbw(|#x7-O<v>#TBIRk^FRiJNQ%6iGPZJul;a3
zg8AbflNU1Z^oRfGu#3LYgeg}~1@eSxZcNqystg|w^7L)m8FI8I*W-r`X`+C<HA58#
z9=K|mMqX9QrsK`P;&3Q}UVOP`!#txm=2_Q0WZJPvUNNcy%hruZ^yQ)=i5@D(A?og^
z0va36Wjk3_x3JVy>}B`$OK_7Ws_E|*=!=I#WSv**ap4{s6dbvzD4qFtL+XVp;eL%6
z1PYnFZ1}#5?0M-T5EyeMLH08pb4whaN&xaH%b{PBMHdR3p3cEH>Yl3H`t^96Rd9w7
z`@St4YnM63hY}l8-$Zy5YZ`HOMxTR#w%b%wVwa1=WzTBOu@C#(xX2YU=6e1UBV*P>
z)ul7bWi~JUm7Xp|r-~f?E5rl|DjMNJ3Z&tiSACQFHm!uV``O<Fn$x}{&S7W&JW`QI
zMgjaXP&Q6Q6Gmt7guSWy)-@tVCH%fkNzxb;PuMF)_`m92{>3av-9Vr_Y<2O!(!!-a
zgvu)|$XfcgcPKAS25&I#9OJ1PVBq}f%}f|!7jC%EHhU@y;v*flNb;*11u1+_6%>Pi
zRcMUJa0qg6AWDaV`v<&!sAq(}TIiX^>sA_E_hCW9cFZHRD@u;0$S(vdR=8#<Z3$;z
zI@>u;sqvx%sZ0mnzs!5|B!orO_+q9(QA^2<A+cbo);^1L@7)zIwL8wg!CmQTbHN)p
zwZHK1)8Vg_zZbTc3tsbE3=w?`Y$b0;gp;uF>KVk9H2Prst@g(vmhYvDq}R{#K;Xh0
zO)%|%F9U`p<>!h$RxEz(=(XQt=q+>gh!ua}ki%YkvbHz&W{(C}c@dWP_7?}<40L%2
zHZ>Mgx{7`Do_#|kRDXnk+_vlIFAru!ua4P7sp(X{7`=*sir!i%SPPuP88!bS6YS`<
zj<~2v9nohwio~c@O@K7FKekKBUl~mtL{X@NTXNOFdATG2^5;Yryj%S(vh(>GyXJd5
zF=5bqb>m^U=XL8xp`Y?)KS6b^P~bqCQZ2)iM^d1gyz>&n8FFbXZ>opEF$&);(^tnI
z+e&OiF9C0I7`#v0#Ym=jm&H5iLWpIduT$_UyEY{rjNl&eJXz#OG(P|?Ot<SVFT!>C
zEi$N;Cs7ivO5V_u?7T#r`ix=i8ntOCnTM!pD7h<XCE6}{7TW25D*097v0eWM)uMf!
zlpFVZa6ofKgHHEkI@12s@XAjJz<jn}xQ6y3<5N|@HfG$RQz9M~0s#`MRjjw_6ZW58
zZe{Y6&p1Mg=$+Dj&Gu~4i`PSxd`LQ%4685~8S*A@y&H`|0jAxU^D8g<*@|2gws#=1
z#+bl&_*%tH-El+DZ!|52fb?F)ASbyh)4e*H6|3SmqbsQYmQMYnbN|hWF=GQqwi*IC
zivN=#3zNRDe(Thq*>6e47BDbe@FJ{FuQx3S()r!e_mjTe7`A5k*f7ck^i{Y4lvTAe
zFV_F0jbdR^1kcJp^T-#YQgpTr2en1U_YxPP#k30E{r(!>F=M-BYT|Use{Y84yC+^(
z0c~@eWuYGZ@Awmp$_FT#ppnip8&QbrXIk{pE8%@rCmL%_Am!fvx;;d`pB5Po7ez4W
zD;kk!dT$49mEg!o#{-aGvd%@QCxgc|0zr57!FMX(Hl53NPixh=lVs{D_cIu9!Gkfo
zc+R*aUoF<umY;%A@thfqODIQ&tA8-y+7!g-SS&~)w~<7DVw9ZhljJ^h2>K-aWh?1)
zXzm9RIX4^WfFHLAuG{nq>&nM$BRZYT8mBkNAK)VhfBQZ@ZnmZumV+5aKfC(IQW>hg
zutFKisGoCRpI!vQh1N*1S@)Jy<yta4*R?b$c0UDAfzr80B}5n;fOEOw#>L!E<z6}_
zRwB2GCqfrzUbZnd22Qupdeg+DE<+n3T)~&O+aGbAqVP~gNHQbV`Dg^*mZGme2bLHs
z;f$QS(l?zcIv15~``9$pzr-9yluwrS`<8O{nTiCi%W7e(q5Q2m{HUnr^yAOa5T}5$
zqG%6ep$+y8sF*br)?L0ZUK0#T)S+DKk5l_Icf|N8#BKDPd?Pr2bN1SnS`DUQoPGBW
zWA*bV2Iu)rSAlug`Ni1Mvj-6p%P(p3fKp(LWOTL+?;Y)OdG_#;RxREkM=7!@=iAmX
zo1fc}uGLQnD!IeEg<Cg4(8ReI?w+}X6=G=Jxjzo?jO2pt^#!sJe2DxC1WapP{6A+6
z2=9f&*^Jj3MnZi}zz-kMHYDVT3VtO7n7#mx+RUv3oM&{$vNteir}6FtoCFgQJMyCj
z7$oCAw7)~a#b&xShW&IaOm(2(c=m6w<d7R`O@_)?Z%bxwQ25Vb4mINBT{|_+O@^RU
z;GAT1pMfcU;8;Hs=WMY4q<9yX5c1I{5`^#i=mlY;OkP`4+SP!+>VZ)p)}U#bdIi+2
z>hRmm!1MLHAsc>_zCRCJ{#N@zT|!xz!`5boq9r|hp?~mGxGeGp;G@ZPm%O2NeY*QE
zDV-I!h<{)b?tO0jUE-`F^f6lq$7)-CculyA+#ZSE^j6qc4DE*IMIhDCD<m2VY6cAi
z|MSv{8H&Ddc^3_)T9rwm-7z^Jpfladc`2NYsfNV)L^{BVXE(ljI<(M7wW>RB9+{__
z`qGR_!zHb9#31DpMxnA)leyUqdJc2biPy=G33Jp3(26$qq?U8Eg03ALiDl|R`33pq
zn;nD3w};8$19!c6G}%EvRvM~E{3li+GKXZgu%2NZ3mcNrSDDZ<4hQ_B7mU-r>`xu!
z1|MkJO#+5}uqOrxbeyE`wiO~({}%mk&1f=pRieV7>;kCk_t9_NhxJqDVJ7zo)2jJn
zLuxi`)M$~*64S4AS2`y`O;GD8^+#{2bs+gc?w+DCT-BUQhaTnGi&AHD-4Y3cMx(JK
zb+j0MTc{FYlkp%&bga!?KF~PP21_F&o7$PaoL5?ov*%%T=Yn0`eEuL20J)BfgMP5~
zM=cuOVU19XwOr-t&liMC)d!{b-IwzF&hPk~=O4|RyDo$mk;v9#WL4wdfRQ=GK6><a
z5+C2Szx^^xFVAUYHGT{*is<AyZ{>LwJ42FEj2($LVE8#95HaUG+C}nD&Ks=`X!9au
zM+p=v<r;;CsphoBDHHmXK^z~HSzY~%@n=Hw+Pkmiw;D<1IhuA8X7+_olTm@he=4Le
zX`msEAT%cirwsKAk}TcuXJIucAd<lvl<t5AeJDYiT9mc#uH_d>o14b~C+jczP@Rc3
z_<^X#_)ccJ*P>tl9@9IOV^`-O9Ol<~0zSVOnCy!f!@*<j8A|Z1DngM~pTnb7i#-(@
z<=`3A(EaKf>@rMtIgCXNoI1j<1CG0pKmp(1YUaI4#JpsfrDC7)jGR4Ei9R26sm%Y;
zIcs_u7B%hSD(W2q`wHD_^5mv#xihr|8cT89)_WyD(XhS9_-=^2yjSvyH5Q9L!3s#Z
z3SIty`&O^Wzx?S?Sz!UuORb}1Z{J%Y0P0j)(uzuxRSpg`EfR!^%&%t;OVXl$SYl4U
z7jz}%TSeXrI<q!O+`*e5mXBC%t9Wg8)(<8rcXL@uS;Eqxm9&CW#m{0}09io#2Bryv
zoeW~msBpz<(b#BMKDYJwKi*vUK9!12XdR`*zsQORLV8t~?n3S6(P~L9JbSaY6Ks`>
zKRvf2>v3;<S?53Wuw&MX*$YE$wR1O#xc|nN1Ba7@n=MIoF1;i<m{YfgJh?Sl>w5mk
zPgH5iwdGi%M9?om#%y%G+z%>kk73mC%H+Z*=5rE4?+(qyJZ$1Pv_3|8Y`5d^{GO%^
z#SR1lg&0`aY2}m#${HIUPC>zI1)bebr9c8YCEcJoPz&z}=<_{Ijuh{YJZwp?d@`I&
zq5CIP5uRgkfrgg*57w&c-(@6aqhTP{-mStf?-wVib>c6O83!gF*50?yJ>>ZxqG<CG
zOyBk)3k-_9(Tl9e4dvDdrYzPtOs;tTb^aaB(R}&pW_ljHM-fZimq#+G%!xUSFcGnC
zCNLK@wFM^}yp-3){=IhfyrAGYDvNH!XwEF~6eyO;hXZz$#qQ(BHIdin3=7@zi2mRx
zBRz?v_JgiM9&C{}S^~+{_+ttZ&gfXEU*$FMFDfvKaR9wupNSI}SHC@86abu3zHr7B
znZd1Dn)(h1TkS`Goo2x>(u4U!{X46mXrJ|?esO3RW;Egd6WWZRaL6#UR(;PjZQG#u
zzd;Z=fcFa*kod53d?2a;rM0nZf{A-Z<yXPOeO}UEClZj)R!bbRfOc|SGp${@y*a^V
zYDf9}X`$Ks$Cst>y{X}XS2Mi9{za7)xK4*^|L*w^WGw^k{382dqewA3P|R=H7ymE;
zLPaLs0n4K#^JY-BwMnVS8|%@`?1!c!4)NziMsFYV@N31cc;9|Hn*^?Ta;gz`uEO7p
zre{Tcf@yI!n4<D;K(~mDYuZ^46)h*=h{`I%u$W2f#k8_WtTyBU&<`ZweSyd-72RTX
zgZeSc@VebBQqim3nY~+Pg-n^sUun~6i)?Xy2gB~o!iZh?w@jAoY<@z^y>%;DqP1(4
z)g_k+UM%o%L4_`qF)sR(wB=R9KT$H^nP&zULMwu-zKe4yQBzgmub(#=J{HJM<XO(E
zn;v@TfLCFHdikBBhP5sx$C9B#?Fo4ERv^~AD&6DuMF0J_5XM(u;luG~!B=$-Y1Y{E
z;Qjd7N)F~bb3NGM%JRx5iX3~<Hig-^D+7&@pi&~Nrhuk7o-2P1JM<HW6qc%Y;JA+Y
z1MGs~S?F8`o3d8k0#)aeO&PNN+7~HQa>5`{20zW;!Oa0!7V(1OO(nDVnhlcP`FOSV
z-^AO+jT+z=Dkh!PGr4Aok=;L8K=SB$EkWw;eDB*A>WS}s_Bm)F-mU3ho|Z6!5_qor
zh2b<s@#+Qk)x-p$tsDH;%ljBvzLDc1hT0C<Y}<4hmg2jWmY~z$@vnu9DAV_|NGc^d
ziUshJ3!l3ZIOxD?(A2Xs-Z389cX7SRi9GH*ZVcmdbr|Z|Vv=R^aC2JWN{jo#40p@P
zpVZ|9K%aW(2s`~`!=Bd&XNi=S<spU)+MSUI__DIRB*n9Q5mDysAKi+`#APpQo$<;^
zSe%f+RYYor(CxJ0a>RuJv{R2z1M3MFrI83T!u>M|w`2_M+P<=qhr$gsU1hJ;;p#5+
z#904`5kZ||f#Z9Ads>%2rKAG!%!YURnV^Iw8d1iHls(Q*&B{&pYkt$ULCHndp>%s$
z;HQ81arB8RKSlrSOI}^}0F-49CUZa2K1Fjk6V$0y8&tW0CB84z76DOxUC|<((NNX#
zCpKmXeFo0Fg@F85;V#NEjE?vPaPdoOLX~av%b6lvp(uyKsiA)c!I7aH&#IhJ7i1a;
z{25#T`$Nzkj3=<Hupl!O11LGNeA#gej(Dx+0%J8(%AY?0zzMg<ty8y6KIZp4yx4Wr
zwOYSuFcLkr19$9mr8Ul2LvBv`9@9y;PSt#sGTJS#bFTePS1^@HLS9~Qz+4f@g<HXn
zoDxgxJ!XuszMtN|{c%WaEI`>&vG$Q%w1H+A(w=UFl0<R$ZF^=CCJIkSp4kCQXKYdc
zlWv^e%KDqnuVmB^RGI-BET=@?%+E^Q1-kRP{3tv>D&%mTdYI1P<}oz4%yNZ;@PsVA
zmssVl7GvqA5-3O+Hc>BmE<%o0s4^A2*()wGdnsV)g`8Qtr69~4C4SQNo17Z;(i0n5
z;8yU5i&jud4mIO*5#X&(^^n@1a1{oj$QJYFFfB7NG6RT}p}gNmb#M4N8Nr|ydfscd
zVo>vwUNh<wn^&YzzLOqB7kG~6Slw>u;z~$oHPbNw!A;`~uepybxmsd|hH)KmVF*_}
z!QlDC0WDNio#1FM`D<!!&2K$)ijL<A)$S*9`M5#kOX#MXUsC@>U3jvXmZt7$*Ye$G
zKZI>~^RAn6mP5u3Jl4&Zn1~Sj>MT|I>^CuY8+J!4=Qv0A1Of^sHWG*PqNOaCJDl;*
zu(D8d#W%lK;lTm?WnfFH{h$@Kp=m@rp}@Y9`dQh`>X!koh-E^JO-8^x)Laa=Pxktx
zJ*HtX@t}J|JMG1iN2L(@7ZThF2LBJ0r<v5zx7{=Fe{~ZF3Z;?5igzf_!wtn)P%lh6
z2o3#RY>;uzXZW-X`p;}qu^uM+kuHQ}DA++k{{9%)5;kA{IZ1gPjF5fdUA1|7J}w=I
z_inpW=~G=N#Du|mHu&kvduUi*9n~n^p3}zwmZ)VoM*?2iSookZH6E01*I_280Iz8#
zC|jc5XCjzpk0kxGA66;)M^O%K|F!~Q1l@@%rXYvq*lj15OFzUsP@Au_PoaWEpWHd6
z?vhm%ZZL~2b+tdh&}=l}jY=aOh4t+jn||{8b)iNMdS{FvCgeWRU93*Q0)_PVC>7od
zQmm!U_yql~SZjw9N+^Xd*s;gN7uAj*8Egfo)tPB?XK{g5X_x3(G?o_ji5!=eo#8Bz
zL`K<*!|~Bf@@vDR^kYV(C@fA1Mk49qDLTkM?$r2<2UXuPL<5edJ|5DjBoV{aiRmsm
z(&IlNkvIANOTlPrczWKLnl+pT%^AxKi~Zroj@3inG2i>id^)+7?#aN%8y9I#WLYEi
z%KpQF-js=6%}*`98;R!oVMv1G&zMbBaan(&ynOj=1)(pUf^*<d?7z0Y&hc5S;8UKU
zM_p!2su^HMPM(!rWG+W*&tU$zP4ivTi#nQGV|3D*&gtod?);2byR2ewd;v`Lg<O3{
z=;C>|mSp+b;n!-|mTR+f$VeQ0zg#1rjBS?}K<@`Hl$PftnaPk)Z74ipnS-rb>o1Z$
z#m?8my|=?}B)9=3r1fmPt1rIqt+`OfTm-ik;^iN-$BJtbHOUpGi#CrEk5CunFXl=5
zg7Ct&xl)nI=+&BWnr7wyCOvvR!8Gdf(Zd_O^|vks0={SJ0Ko|@*Se%(moGTIcLs~2
zT{;7m=HKM;v5>*Mex2baoS_T^u<-YH0tBcf|9#rjArC>_4nJ5>V*s`hT-Xg-iigNT
zdjJGuAD52+PKTmx*_b2RpJui3Q_sBzTaS}HL=?4cj{r*F9q|Lr9Fm#X<^GY5LEpR4
zJCiw#vs`g9S@2f6E@JT<sx!$l4Zio`RC8-c*d$+8N%HhifiGrNa{V`2Kfnoyfl)7y
zDW4vkBF?r&yq;5xNa%E1e}Uc;IATZrJwyHo-bb1pg19$^<Dh=-NOeo<Ht&vg(c&p?
zW~C=mAkLf<clRgi?mzD84hA9o!J;@`!wp`^%nL$Mb2I!UvQw0q&1kjBcgZ&oL~Hq}
z+{k6O@E1&l4}HsLUwV*mE%bZ%v}x*18d?jl_itF`h?~q{T_T}T3AG|itdR%_3KTbn
zu$QTW2BySLz5q+xGyObjiozk}c#m5<uKlzH?7od77^5V_lo<giqDd&9_|{|l;>36y
zf<Ky7-6gPF6&BFU?qrYeiNuBcPNF)?t9J8@SI02FvjDIlw1jOaHNpDK>NAxM4u9c0
z?!&$$>o1#qUX)yGRg?nKDsUEU?0)!j3Cv044c5+4|4um8UrOni+_;|x%lVzLRs?1x
z>k3~RQdZY;zrxJZ9kV5f<+-;w;V5v_f4U_MtYX%=wx|EOS$E|d+z_qum1ELzKk4ia
zk4x?Gb?l;{y7TkX`q(%&HV~TY`&X38cMi=Bs0}&|jl#ORZ5I2bbHII@^+r~=yM@Na
zz}5PBR=1iZw0a%;Io8>5jS6V|pKxs8FLu&eb$fo933}+eyTXC*-ktGxO?1>{^0;<-
zcJ1*@Oil+g;dV7sB-~56ir?Xfd3ttLnVo3cd=A9K1R%f%J&1;s)OE|Vop|W_?S8mo
zW}KvjdvuxFlds7ui1*8s7O^u#B09=CeHd13#0Q89BO*eRPrgMQ-1Bzy!UZ{?^9VGR
z!T2bi(py5aQAQ+V9x+P#Wm-VXl2|~a5Jf0B(P}uR6pV_!ey!REfKg(p*OBLcxd)Q;
zn5Gb?Zt|j~aA|iTzKU%<A*=3R$?oC`A*xzAR5e@-f$2eX?aODb-AW`h?>jH$BS9im
zMf?f-$G$QMVH|~;B4+x*CKKw#cx^4I5~Pg}a_Roz)HPP`xO(GK(~6jYn}z9@;}TNX
z<k(TXow>|5uv$(k&G<evG|FG(j<k3kQ=rdp((?N7iX*cUtEv87Gf0QA;Lfq}#TkQJ
zW~{d)Ai18$<LD?bSJ-L4F*NB^GBGK#V1h55O*$cOSUBBy0&QyAjT>aZ{;NKc++HF|
z?GhURV}0<miPt4^np*Q4p@-Jv-00guWh0ztfx}Mv8{hbYf~PNEMuvrI0}T|7r$<K^
zqw2DV&c1B2UMVjHc+jM?NI~0<jZwLd(6Ewp7^GI^csFQ6E)3_|lvG<V<7|`9+&x{3
zz&LDFc!*_RLbJ)py&-f(aNxT1H#<#pzR`0DoOLg2Tz`vpl>4x{Qb1fW4Y@$WC&>rJ
zer7#cBSX0_?(v>+-?8MA<D<;B+EF0onU$BqT-j}Pshh!_lOp~nIl=kSR9HP?x(LW!
zC8r}cwpSofmxJTneU|EJ{(|SQS+eCi%suY2+NGQIdR5c6L%RZR<@3Eu!?B7~1Fp)o
zg{ecY&rfPb1LI6F`3plZj@7o8W#lgHI**%gQ{EONlMq?;FE>qYQM`x!YkoeG!Vjvx
z{@ICKpKg|k{9;se-6ML?mmBNnn_%x(t?BhLVAev;X;}DDsVM*X#y|E)28)gi;6dn9
zzU~$=$k!IA&SP5Y!|nZ@+hayRP6qo~7z5J^K4bj4J2#bDU;nGeFA%BuR-0>Y*!zi(
z=J*RTm2HS>DSUIJXgt8?mr9TmKK0ib&0EN+tsX(EEhlRw#50JoyxN#u9|vyy%oV@$
zCN6G?cQIYo;f7foWQ<JBR!Rp=Gc{uUM&_b~a(9^9PORJiSmJHHae;4sV%G8SMBJq#
zp(z*oVOL8L9c>raGj;r^Y29`yNvn)wV$sqnGi`9{x8XO3Ym5S*d;y|FA?SC1gmqGq
z!vW5ZZKKAbBuj}+r=`ltOwDIbUT6NnOv^Qk!MM4YX)5t!3Dx6al_OW!+DBp4eJGu9
z48x*D4Ax68AET%?=jWI^Z#>Q=_a-XR`RD?;^YawVnG4DRjEmlxReRpLmuVBfxGda3
zELjXh7+Sy8b*0A+KFBAoF>8K$SA)Z&Q&p}TN=b&c7MBD#^co3$X=}?0!FC%>1Q!A%
zlvz@%Jb9b*Zx0re&y$Tp^4WLpt#4)d>hV%gb~{|(@YTzEBu~8W^w>WV1K8W``>7s*
z$&L0i1fXInKx-UUyhkP=cp8~yyR9GXNpYE(mI;*}3-qAOV#6fA0}0SmZmF9<M^;l4
zo#(_O+#^5gqTq5oWzxM@bL#Yy{qk9TNw#!#?;L5N60BsM+jCx%YDmvx#$MwXe?it{
zH;XTg@3hmqYR~%{n%_Cr&Ze@~T+0>>KN&ox6o8(Wpc6~Q@2<n5pmk;OQZG?1MTswe
zv5cr2)<xbU5Qgd6avz`=t;MnCcnbD9rSqAcS9##OA}kb+Gi{y7o%oE-E0YMZivO4&
z#Co}*kS3c9YdSxITe<-!mkj7oz1TRfe?W8#OLA4A`MpZa-Re4JAgzdIIs#_A)>JHW
z*e*G>lB#-Dpe!?=e4%80v@1-(>^&im61drR|6}}f{zJO-Z>lTEGu)_yK5Qr{1|*6x
z5@X%|bnvlFnaNdw_nPj?MfiPPp|7cMpqD1uOMKE<3)bCHewV(PJHj^LSTi;!e*>sW
zA(em9K;mPq;Ifss=*p}t8L+)Kmy|7TS${C}22;JHs3%teG+L?>^{xzcTIE&+4w;`Q
z2I$+I$1*3FB++U@F=NQDS8>)9Eg^k+2p0}S7~gKQnjSO912JYuU9zg}qcv?N29g=?
z-v>e4%Pu6|zJ1RfZm042f?Ds7oiBs;M)fA8V;<t@?o!fWmkd_)vF2mbpomp~cy5?w
z<Dp)TPXw>w)B|*<#`!9on@Od8*k*nOXDXkpL)UkrsUH)U7(`dKW01A$uzf-SOZikC
zoVGbK!O+d^4G*7!5z$^!K@)F>q79BEhu=#!i*QXJj<*L`@N|&GK<;!|7fkD#@crQ*
z-A_GGhRI{lu&y+W02hS+Sk@XBU0>?uYNe49y5eZvDXn}Jc8^*Hkd%74u3y}~P1p)Q
zj*Uxsn&D7yww=p5@a`OMJwr*WJPxX7blC!4d<{^V*?ZH;dKKQ+J+iTM$M!4>S%!n3
zl2y`~k|3njhK&U(p_ceYO+!N89R`o)O2~u}@3u;tMWUpY4MML^%K5D>0=zlOr6I3@
zSJWs5NpD-R4lam-|9%B3tK|+}6(_aiNyww6vMJ?Q<^RK-8DKN8A&X=+DhTdzvNq6l
z+`?SJ0!6F)z?{Lz4hiKudz`o?)Lu>Cgq*p#12e~~od&^#2;U>O=5M7axz~<k;L=iU
zKYaV7#f4>6%h5VK6(;Zg1lGPd_*iH?FW->DH{t;U1XgC_l<kf%*6n|Gmf5-0%UD!=
z<q7*dfIVh9*E?SCfRVz^TzDNDu8KH;c%KtpSp|aVxbQ-Yr>FnS0w5N*Q1eSr_*&>T
zY^I;gW1;%1+;%YTK>gu~cHz&1ywxn4V4b1n0{e2s)XnLeC&ddcnDWMj(PldFXrhhZ
zeWU^;zp`Ig&pn?4;XB5w$On=I8tl~fBlcl^KddPW9>8=%4oOnDVz)-WpOp;@Va150
zpJ&bCLdy}`_n#bqr(Xi8VhbJ)p)cDBN-F;teh8cvvb)<6*v_`xRU4%bPwMPF=sYEU
zhH@<>RlE@VF|OzI#2p<u7x1S$B0eS$9)sbTuLIq%_i8CVcuqsDS<?C;`Ge~){%-Bo
zfTpUZuTc0X&y~otKQ>FtPWkrvG@~`b;Kn#ZYq&wO=@IjYW6>$I*wc<j>FyxUWvlwZ
zT0F@S?DMjRu@rWwsw63~3mT7JNx5)V@nEm<vlJa^Nte2yC15U%g5$P7zrs6gR3o(a
z>UYyga=l-2nhk0cyF*kLX#iJTo(U2kU8hX|%X!-6eF++KD;}Ir*Md%Ydiy38NZ<=I
z^RpZp&$p0uE5Q|@ZWXT%vbW>SEO;KwkxZn__VkoHnNhL+`^JO0Hsc=*tEXmRxy&<k
z>i!A0r50qf{wDd9=<(~4!&dF~-?;qWFxTB9Og1-dfzaD?{lzr@&`r8~`F(9*Unbtf
z(UlDFrDhy3TvQvl&aJV+OnVc+rro{3KwpwPivkUa!uRMwG7bG4gy0eLu3X#w_&oui
zYrN9IbG+SN(MFjQYQ_f%u=<7mLj!HSTFTRUd1%#v5kp7_E$@cQ=ZF5ILm<=vP8sp#
zH#*3(AbAa^dXX;REbOjMAMd!$SRs$XvJAD&qbF1>2SGt3Y<SG-_&z*pm-$xUn7LzC
zYno&_E?^p+M*4|Y-%*!xU&##P>H7{9?90w6$96LoiT?Fu<lshG{qveaOw!bcV=30j
z$SPh(Y(}cRwNDp#Jj45{PNfAWOe2<-c2szxVDNjTy<o$GK+To%9>Yv=tQ|Y~gZ|~#
zOle-1Uzl>0_hcR9Ktf_Bwfd~sJQs!n6Hz04Si!Dad|dS?BIrFkVVr_vdx<HZ+^_L_
ztvJ4!_Up*AhjX2*vB6a@7B-;pq+fQf+~Y@!I&AFiRgZbwPASo!zt5fqWa2et7HVg|
zWZTSn$k<*AU1of9aPu6$ohjX>!rH>(9lyF5D8aEk;C`exR{y|^B@;cOwwAF|x^Xhe
z@ffgxchRZTgFpW5p6%$QQO03?AOE|MRrKJ)@*P*)mX^;UsX{puUj*@br#^dIfnV9T
zf1$|`sXeD`(k`&f@{wqcT`MAwYdMzZcY``<V@R~W;XQU*8S8R^#R6=+;JRl=Vm<Qf
zZvVBLj<7)A%$q{VyobH`xu>>Rfg?GiOjC1SgABdlTtuk2eT<q$MD3>4;c!U;&0zp=
z)D)WZ`)K&`@r~A{q*i{jfy-au>XZRHUU%8F$!0#$qC|&3cVi8dnMvqR)oKeBlPa}2
zwc9KYO*$>s$1q_IlvNrrnIZ;NzK@N@^zzs_Cs(A3c@8wN`K{;TquGR7bIL*5D4Zi$
z_lFUmDkd}9SJm`6tWC|OD?bU;prbzRaZr$<v6)b}rYRata?cmhB}47Nbw^bqK5*Ky
zrJgi~0=EzsoG4gizFCKR$E?S8&$mqbDcVyVW&g=3|0%tz|JW^8o2V+DgrWo=Ft{KS
z^8uGo!Ow9?5eVI~5&Jui@`hLrB)#2xl{{iObyyE8lmeM4=F&dxI6xKMk3Uf67wU@{
zc~-iXox0DRKvd-tZrx*ul3p4u!Bilfv?qChEgmg$JKw&@mXN$H@7N{s<o)c(T!<LC
z>#<|k%E=>doVJMVp~Jf1Ycpr=8r0zM{Af7I9aT)HA0Yncnonnmshr>|n?>dWB4d<o
zr{-PV`Q`n#o9HEP>QGglRalooyTaQaIv2cGK8pdi=gHP2Rc(vUe9=VBdh*4beAi0{
zqZSM)r>aA%D9$JEv_UK>G4+r<jturXu^Ul{KpUOg_WGAFirc#-;%j4FTb#54k=H!$
zUu%HuC6N;BPw28x((zFIy3LGEM^+RqogRP$4bZ8|GLBN+%=zWevMzAw3*4w-H{kxt
zvhOMDyxfa#2<i19_iuu1y5{PwjD|LMO4q0-o~NA(8I^{Pr)wQyQy<@JiutZrcj<Wa
zGATG+at(y4+8hi&PPVE|bRMfW0s*4gd`Ru9$t{3{_RyE4a;K|sBBnWy_09MPckPHz
z!PJseLSz2+I)^Iqc^=Ltw`dn-@h#@{+p&HkLbhy{asKP28lKBtbUJ_`U+3qdlWeM_
z$4hlp$CzXf8}>Pmt<=$D9NQ&@k42O=L^eWA`?g(B(btvn#1-ghhCk$BP-iW_T+}mk
zqz)>-`7BJ0xV;qj?1aZ-GWJZ%W3sbMnN+&WTVt%7{QP@+s3^X7Fm5gB<tq_|<V<`N
z*AzEQ@1nz}90gu;mBxy?ccR$|uvP~m?8V#T)XXY$3o|cloQ|6QB7E?J16H2jA~K5-
z{AWd@wd`fRPWLC7>ekjehvT-U;U8%q=li43p3!bva~8r{C11>0^bt24wH}FGnq`<%
zmKPD#X|n_bLJhdY9X3Bl>|a#;Tw@tik2S5iPkmzY6U2AgF7s+)UpL|VI2Hfnx9oWI
zE@HD`miMZ_Huu4oF+%4p1!A1qw98<eQ=v@3tn(Ia(!6!TZL8zcoO2J$gRKg6`tud{
zl#Z<ego$a#^Z#Kr`m*5FYdckN{D2<k4HY}AL8?cE?UH!cHdem)|Fxe>VhHb<>y4TF
zgIDfVlq_!otsS>{?*pvcblD&SvWV1e-uqSux=xt&U5x<fnPe6IsuK%{D`liy4XKrr
zl@2PtRNg*(qZkd^nUyR}lFNECo$AB-;zku6+GBHW=B-P5`ZGOY6AT%)7s!Lo6nb-i
z620po7t}7Q@0$~mPujyOIOXX@=iwCEpXMEVe)C_y+LhKXT59qz<dMB{eX`euEup*j
zCqi};0tHw*GM0W|*Av5%TEVcX%#e|VZxrA6!CN?Rl!MmzqR8NLit0ji)il}~L{w>G
zU*K(Q_Mq%xI9><2g<dd;NcOx3iIf^4`on!;?sE8MK_q-j*gbJHQH57#8>s^5RX@Rp
zb(zKF8;zw*dO2x++aG5dAy{~O>*M*Rc<e*%WMtM25Y$BwMLOE%%r6N{Z2%9<(ELf~
z>B}he{LQRQUX1}uqJNccg0S<BPn8(|kEn{Ri(e<Bv*e72q*fak<YFenY@Al#cYhWd
zuRDl!XnqmZGGh@3lh9_<FL^>XK+PYgqav%D@_x?Abb8*|FKHrc=9I#yy0!jKZ52Q=
z<SqWAu?JDhhY1oaiMHmK6Bn2x>uUztXx(RO^^1`p@?Cv4=DY>dVGBIV8UOflX-DU@
zN~rnIS@Ms9ehytnUD|Av;<8j_3qR=JyOxdFgq<_h3F^;Gq+=?SScEoeAk?^Z<p0A=
zCIw)lr$X<D-rU~~UGmMSyze<-k7?FPo1~fhZ|d@wWrLx}yVs-oK(WH|9Q13(VJ3Y$
z!0Nz&wUO|{bs*tOPXBPd=0f%6J%9DjzX8lIMmW#G<;dx3f>x2+qnqM_h_IkTpY<4Z
zSy8#7KsP&0D;fLOr-37Ed0Cg1P>ZL-haoYHtK^Av8S0O&bH;9SJ}yGmSP<;F!Q$>J
zR&#z=zP<@<cW4B<QMA(!KU0bHgli2(UsX#iic!h~h<{VbdOIe2)A2jtYs!G-NqLW>
zR%>+UOXruxTh5H?)pPCa8tytz7y0{-o;iX|Qks)0pB!>q2jlWW-|`*2<8_wfL`ppo
zSbcr>U(B$zqU7*|F6AoQTH<WKvVmZXZ|KO5SB|72aTkvp9^Z12KafLOia5W^=kUY^
zpFS?Q<>eA9;a(7D@qCkK%=fZU3+T@!M%Riiz4?XA(obB(6F)1br?_5asNwO9Uj>ue
z>rr==I=a!gxHFTiKdX_Yz}9wLJfsoQY4wgruf>xIguw$s^GL?+u*Bx!PYWuYROV{C
zP(J=;7sE!I<#^}QOa48)%<53&cg6K5>un?R0Ku0~TaQzM4tW7py>mpKq>uwyA%y{P
z+4RF0FPSUc8*&-02dWX^L~hVxv7ryW+TJ%?UcnFlgqMA7K*IZU;(H$OVn|s{OR)Zn
zWj|Z#IDquW$C;R=(A-IBw?K4Z>Hm|o3unD2TX*)k-JmBN3HQ90GV0qA&x9-2?}b>y
zhcLY4nHGGQ2>)N@Mg_LsCGZ-0N$Ib2IlBt;w#j>G!P@2VT-PsREd46508X$VFpOaD
z4=vuI4q(jxZCugdhmC$(Atq^(HnPYbXEzF~uW6q#lmcGg_MQZ~LUHOYNU_}_1sII#
zV?NHi@bP%!PSZDU!s_HY{8GhOjK%xMPDQlRL;ZvsY&3L^+f>SeGAzqYi1&h^Y+S`k
zyK_KAZbA2I;)!fHQ5DO4rrC|PW^E1R^;SQU5j#gy%pAnb2>$uTZQzb$L+C*bkR7e-
zT!UduI9QPWRg%N#bCgc$8x07*f=d3Aa_pm{njO=l^=41X@Axw*L-vF<uG&q2nWAJ4
z%DYE$(Du&dorNIJ#>I9qcmJB+^MJS8kLC9Dq!}k||7=<}AHkDMhd9Nqx6uu_ISNnJ
zkA)OQ&9uy2e@K?PAbjc4JKc;ir>6edz)0Y^`gXysuyb0TIZa0lT~CY0TO5JkgF#?_
z0^PCTDeP~Q|13O#&AmzztS5^V6%o34gzmArok~RH<mrXm&>4)bLIwMOQWA8izYzgr
z^QSW;A{cglg7bBuL;3zjdnFTx@XTNg@S6f!sFneo&Zg!3VjMRW54lHq;orERiTLO@
zpV#PtpCAyRoHD{C+&RX99c6}}eO+svkx~?JVs#*?mRMOdQFNX3XK}`L|MP&9Uc)}U
z?{5DzMQf_c-_0ZnqTmuf58E^aq7QH;@;^Pe+rBH`CRtr}`TFqnrM{@(=j>2|;xYKM
z?eyBkF_!F{JyFnB9%B}vqAYz&TEsJV7)8XsEl1WQQ~h2_6JfYst~KpkzDT~oKamd|
z1dvT))VyLM?Bz*_zVUwF;^(M~{OJ7q<I?Tdv$4|CnHQ-8Dwu?RTl7!{0XMWK$9(Fj
zXG|VF;+=r0KMyi4s29n%H#7#wEX>=>`4v9doyVTNMnXe=fxF6^js(XRFfd9Qg3r0D
zr(Fge8c(b|->9^O{Cp3F3TiKnut^~-7J{IV*~yYMxOitL!*u{j9fewm&dx%FWZPD;
zz$l&ZpEv!_wp=Rzvm+}-3=s-$QRY*mrg6|s+`wwy>P>0=2X~-8d9haaVGx|JH4sIs
zDHHGJWfw!+?Gv4f9y8CYGYdOXpRsGCNT7Y5=2A--OSji>PJ(MNELUiPOBqwCi-B#t
zqdxx(SNa4~(pDu$V)xa5;uA*PmT0F2C{h}dM?zj0!V?PeVo=l6|M(^|7*rmv_&)}>
zAAdeJevU}6m=0Ug<6}oP#G2sZ!v*3`A!+N#SYP^dE&+{A`Lb8gwr+o(U~<5b=AZBo
z5Pg|;{`!wyLUa$*AopUVF%qwyuF|)BzsNc+!HXNWu|>T5lj7>zTKNj|+h2FOKAf~#
zJ8u_1Z#%gSYqy(iELa+NVp#EPC@MZ+`<-w!C#+Zpd7RYxN@7X1TIoFS%l*|%cB?uM
zIt|6+XpbX@9iy`+Y*q8&*PHix@i)_@L6@11)Vlu=Y2gLz?|}*K{!rn&sF6&!)<X9l
zy7)W>PpU##zvUh@X#b|#|Je$}piX(RB1grp(=mTct?qaqt?mWOn!vu8Y8qVM4-pgw
z9l>zEJ++%#a?Ffcn1mThZ~6fiere)y^{DhG|89mMxl_}9)#{0Lqq*CRo2{i~v>?+q
zT(p@=el7dY)~~`56`H<~`IZkC+Cpa6Ia@(9ab{ocM#2RP6CT{#8)hNN;A9%ow|!&d
zHUNC$WnI)a2c72C-OpbA?yfW^q43YDS8l~c*C<YOfAm+@R9ox>1|jogrhmK$T?B+H
zuq<-{I}{Ft16q08(k-xD)S?U2<o>0E!uXh4n^|+0NQ1!K2M^Byb<zVD_9tdlHL1!U
ztBy}AVW9MaL5&RyKc8$V#nS3rs~&}is~5^+wG-1t(Ho$9XYN<6-C3k9{3*E5CH3dr
z5GG6{jFt17`E}X^-NB9Wj$<tlrlqN<xNe#Cy6o`3OhXmc#kVm>ORM>3*2XPhSjY%#
z|Nc8_1O*rY3`s}moi1A}YEe;NBk=!HhhGB?y4J(N#@j-zZutwXj4W8j%a4F0!4W{T
zIs-BlY01Tw)I_HU*mwevO5whm9;3?OVy0Z~wQ`A_tmvho*HzS#6OJb&<k{p0qXp@&
zfh#{_$fm+}YP5j1v;rz&BAFl{25~9br!k)0uHf5=We2-JMj-5C5b9?0S;dz6QB;pV
zqI1n4q8V3s`<d-39pJp7yqd!^Op!PyA2<tp|B7VjL;za<y{m58yqIcTnkNhc;hJIo
z&IjsgmCv&Z=vsn8bu9xnf9%8fc$MA>egp55ESDWg{R>i@09<I5xrfF)jTFfMhdHG8
zI-qKrx(Ytq!Si8!cb2S^6-&kFj2-yqAZUW1vNQ9gbF#b$rh4C=LFUJpN~quGsRkXY
zpUeGi$_1=>Gn#LLB{5Pa6W9{{{&BtTO2l@Uq9Ee3XWl3By44vOGJE{%{H8@nqbIxS
zhR`#izGD#5?)Ln|oEw7lKcdCobIDgQA$<o~di~36KW}&xulLn~o^mG8_}0K!53hsV
zRrfnDeGX>AB3q<%wz#<l(?3>S=>q^kbFK5+7!dGmKCHaiU(pp`-U9lD!HCN7so4^m
zv^8E_map8%eKG*3MH7wMYzvLTty#yr&#dkj-o1XD*T@;$yIKo$IT=d7S9V3eS|lOh
zyO&}l)k*QaZY-SVKTX0}unD;N*~OHn%+F(_m(HW(?6o&~qfxW9Rr4D*W0C1qt|j7z
z?Z2a}UIbvI;42>;_$_wa`FZb5FYrqaB448o$oA3k{V-Fd`}g2X6f8^;+bR6+sb%5$
zm&zBtWITPEkFprMwl)R)e5?`f7(s5^%y_%|y6n|ckKVhdjWBPI4ENS~@o9dx?0`J#
z9oGfeVwL}udmIr_XW<0R125*d&)^aRn0-(ih*914z>>2bvglgegx<vUvj2xiL6A`v
zmiv3W7CvTisYP7xkSLJFk1m1;UPen;+N>K-Rl8esAeYL3!o3I943WGV0royP(OMRU
zzOJ*(Rs#+glG_=H$7gUAgW7Dz8bMTtib9|_N+B6H7>Key1R%LAuguJdTu_CzaTmy1
z8tB?(W^*U^u1eO3ZhCTaX{$dXhn^#N!>BKE62N=-z_D$#p{sYnkz%@ifR%vJ7DchM
z7_+bVd(@)@Ld3z{lq^#CA7wF<6!?%^=<aS$7VrwkshVUX{`J?OPC28nqCrijJ4A)=
z{KGq1t-?|mPP7;`BowhKJc61quX-+B0kV*Rn=BlUQHty8vK@xJsK>toy^kDNg^vUd
zevodwb}}Wc3fV0vCJ#^OhiqeyWut7={|1G>s|Ha+B1kF@Eo%dX#!R8JSQx-x2WeD<
z&FZ(aO`%l_YXgnF8CrjsQlQ*snnNE@fv07wMUJxHDX9<T((}=4XDH$`n@mlAi|gBG
znq&aGK$z=HOL$+o8xlbPu<YVNe&S*RMeCgV*mXU%4=`V02@OL@DVg4;-A#&L9k@PW
z3LT?f9XJ&ZXBUCm>;Wu$+E+QDhb1BN?Bd0+W3GrGs#%(7)53^Uuu9KUK=f{fXo5FM
z-Jlohr?-kZ>reE?bA#K;63z7Zz9nDRB<&0an}(iVbM_#ZnC|7V5*y8`2ZSQQ<3n~v
zXxul%qd}gJE+78^QYR-IFRL?`E_%r=S2{(OyIruw?AvHTMbaDzI33U=Zi&glTZ?zD
z|A1&vG$JZj-ymzZcHTfL|D1Bnd@J8EW##<)|JgU_SLOg&Qj}MLDT1WHqz|xWsnuPz
zTN9W}36oiB25l`_(PRtMzr!LtbDRJ?uQLFhmOcVm{wM*@rb-xGq?{RC^pF_75`$r<
zC(6jf;f|QbR2jfCnzb~w@@8)4R%YGy7h0DH=J|47vXl{r6WkFE4^NF*3sBXpjKt^1
zO<k6K$E04Q5k#Hr^p%Qx{WL%xy)m-_TkFKo!N8p$2<xo$#F1jQPQ(TA3p_;7DR{z>
z3@U3wbL!I{tYH^l^i;Xkn}xm{e$y>_7o%dgCU`oH0rhl6%RTr0CIR>g(0YKPGE=)u
znD8G^S-JHc_KX#CjL=?M0%gDb{#WFO)S?&+NU+;Zs<heW7I-X^iu1Y*EZ%!{l;I2>
zHzm~mw+%wcO<RWgbEf=E;9w4cw^3mB&RS8``G1k?lnMf$0++wi^R>G18Bv-Yu_6>c
zFVqIM0U?J=@|Ms8k>=303lLl40iMN`Fc|$*W&mHZ0?(qf%e|x<C8~d-a{@dd$m;+{
zFL?iUuf{~Ow7Gy|0Sb6F>Y}j%y64u=TU%Jk%Y<Y92#mR|anaCI818^P?{RR`ARu)j
zd?fyG6_x~pxfvd&J&11Ej%_dX&=r+WPv1(R;V6)*F$UI-EmeS#=PN`ufbLMxahpl+
zuVWn;?Ex3`I&4!RM72(v|KoTF7(sRk=G?V&KXaX#0`NoqULM1h4n3wsQvdUlFUbM2
zk9Do%)QY!1k@D0lVc<iaiFs+L9cf=V<v;wZo4~Jl9KAaXMY4@3d2|c?rFzLBe!1ZP
z>r@sH4&0DMn(=G|eLBkk{<6pbzO&Ts9+m-~)p;|3^%Jza0|x_N7|o&A8yLV9INIG4
z#1Zg7J{rV)O5w35T6)243^@zv)5-GYPQR1I?%aaU6cM^Jadi$(4B_P*iEzF)bSSqa
zrqIlA8twWjCjps>i=aXa^ZBMKS^x_7wU@@q{iq5gWz{=<@1!}D=h3E^xut87m48v<
z7kg5r%Gr`t?-t!H;TPJH!98D^=ylS%Z7)$w5A+QZbgcdQ0F|+#_-YE=UWn;2D(Xh@
zF*lIBT0RfX!1e89dHjQP1SWSjVs)Aq*?@&)^ftWKDJP+a+Zyn<FUnjIrJpE-qy-5|
zf^&K%nn7v&$F7%1f)nrlx3@xSe-hp+y<<pTafHaZt(i+R>vah}GXH_$|G2XstP5!U
zo0$Y34z~=%;NEDY&j-WtZ)<mVcvjO~F_1B1yJJEY&DS0t#VQhYFbehd+;08^OU+Be
zqDyvdC>KX2&b+Z#{D<a!UEt|cB%B;u`RMbZt(LRj%V6LqDd(pX7qM_{=c<J=ZD(zl
zj)ZQy1`Q7F@OllPXdYz6_^wiDUKQqP9st_!hO=;JPH<zqx2yGA#U`EBhB}e!_CFWa
zLp?z!s6Gl-3pC=dUTOmfy(i}!fuWWotZ<>xk5&&Nu3TDv0+nn?X_5BcMKd#C;sE?o
zh0}uP=w#P*`2|j|=w^%piv$E-I1-a9uJzXR>c*O2>r)Fp`$yCi0OKE;b5EPhy?ODh
zq}LDb$i=UoWG~!hlw};ElzEx-9pn-A8$sgcPF?f&{*CrNbe6QOnA9B|4)&y#5pm{$
z#ul+7(~$otYQ&7#{)Y3{`z414ecL<t^|f03>4u0m+_o#fVh5s<PXC9q26d1-mGZu6
zf#){)P5zz_s%lhwFO^1KcV>HcU#Pt8GC-eE{f%DWGw&*0$$z8CKk&Qy3csF0f?;*C
zd|L9e3_28rj8B4ECI~1&ngEpM2gmZ;;`Ik(pwoAgzANdLf)TBhih}~cdZ~7INdW`d
zM+sR=33$8pen6q3IeEBhUtFo2*?DtXskXECFDDJX6T8X|nE81d4a=q^|Az>uCub8|
zfnz^v>h_F17wm6aIFEUAP6-{Z!P_^H!r3o{?BcshLTOjQ+QN0V)mM)(>U<-A^(mGW
z-`YPH$4mJ%uR-HiO80V6VhKPtn%Y&<cw`*S{;p3olieNp_N+dvvI}NfmY|VCmI4-p
zALN0qG107^Ba#rzT@dvpb1zsk6kdjFf((HkAv@9`eLOJNXR;$^Seg8y(%eNBmmHEK
zi=l8hGCRm~!P3>Q`F;b@o!Xfz+v_Dfcd}DfhFip*3>*hJcutPRo6eU5vQ#bwO5HvW
zak7tvZ;%hD@I)dmexnnj*d{Pi%RbdkW%>o4(MLCdpb=&eRVIb~tr;#gqsky;EXc{8
zDig$~|L3$NgABaCTn1P>9%m(TzT3Nf+qg7(O!B>XcGesG6tQM=A94!_8O{9vl?5P-
zp!7AiL>Qm<%iXzB?}FfnO0<Rse7h{6Ul{})iKVhx835>X4FNH~1g(^4rXfn%k)V%|
zV?Aj3xIRe_bsl=AQ>G@m>P-XjJ_eU+oo>MROnI|FhAcC!R}U^wNqr2^$r21FH_c#F
zZ8F7AS<nRk2Sv-drLG0Kue{mQkCC3~#*Jw#r|4!`^SWtl{=p98(dV}+Cum()YKmNR
z_h>-j%^QOP|Iv$(QHzp;cR_kr&3=QEha-?Gnw7poTl#V4E;sgST;^tcVzd)VcNgYq
z`By@cBK$DX;JZ$A``NW~#61sH$?}cdyaJTXtj4P7m5@@`jR*|>EzT^wjbhDmcv0kc
zOvtFSq)c~YtwYW-{s}C5PEd4ti;aeDD`>u-X4>NT@;>%>(k4BrVTZ?=Z|6bE!`8aO
z0cl(K%e<8P$cRQx+@4^FmDadY>ApLUz|N33s#p@*8JZf8$vv5>;=$RUAM@<ErSQMG
zR+Lhve<*?9GP*;Y6oDlO;p8?;P_Kc+$N*u$+l#=+un2VTJ3H}!&U-msdY!e`;g-tE
z2*H_GaF#+sogHJ^IS~C6|LFAY!!O<<g=1fl-Ro)I0etq+8PEH~Z<#yMe>i4ejJWQ6
zg??p0yFC6&zsNB6%d+;0tmlKy5Zz8c9QmH)``oz2Mm-l>LZ7Dp0DQSNox&TSVlX;7
zX8>D9<Mn#eRR_v4nL%TKfYns2Mq41DIKR#ypin|Wwi8lh!rT0lQuaicViFbTG~%n_
zYqlk$J%q06Ji)d1+!`o8Q3nK`sleIZY~`<8b@8R)#iV%rymvI}y9$ng`1UgJuWuKo
z;@yyFjlKqbeb~(EGWVRqo1&00sgThYG=pEHXh3cqG|%PYtY2$bgRE(G4de(&7geck
z$y6~@X3^->v8Eb%LQ@&P6JEm;ggKJQlSx`aYBsq`Hdz|Pi)`6uUdHqb5WI~xR>E0U
zrXM;FZ~cT^Wg&Tik1*M>Q2}RYI~4zkMvF-OouLffs1w{Mtv6X@?jJ_BXH#^~Za<EZ
zRV5UDAwPKnBP$LAkM`+bmZqT}p9KZ2tc6yK4=y~7(Nk$d<gH<4d`cQdt(9rT@ZlF3
zc@-0SHO_5~$qJsM_l_`SUYd`b*+ij*GHMWgDeT4{5mspE)>A=hQTdn9bwAK^E@@wE
z2&4|)+9ZtU)k|~O3I@G@4||Utbmb8-QP&L$KmBgWYME(Xi^@kpYAIP<!BnUJPx0>S
z&IbJ#Syvqp<@U5;DQTq}rI8e*Q;_a%SS3Ze8&(CB4gu-zlx|j#?(UH8?)+Z7*SNp$
zZ`j@UoHO&xJkQLWw}Cc{wA3RIv5!u}L`9Np?Q2cp{MpKt2^-~o9pYCO10>1<@Khjm
zd?7prxImI(X2U^p*Kp(A<r-YZh0R>X6a%USe6FuP$3xZ|ObX1#uyClF4l66zj46U_
z3soTqy1WpvR9AF{hKdX%BFGhgV9|1RwOYQ2heL8=TR^va95D=@o@<6?;{4_7-mdMt
za?^Xx&Gbq0d%%@t5F@iQXO%5F!>*nxG@u)R2EMeDV=wosTO!f2#gC{$3IZRWN_UFL
ze6GK+vRiSbQ-*GAma|gKW(*Tb@yRUUo-tkF2F!g`<M=$LmsFz`y}5>{ixgx#-g7lZ
z^%&?^Ymqe(`bqW{C<kr>;xqay!e30%p!4hS_FJ(*D>EySNXw?CQ3}4nU>c1s$5Wu{
z8qO7?;4%?DI`j-s{@OqpcB`xRO8rV4sX(##3!=2dg(w;y&Vh1c%@6301i`s%Ipgw|
z3x=DnBg3|WoF@pw!W(Ub@gP3o@Yr=PH-~v+XE+NXxgu5WRP)bWU%!tBsn?Kke`O+V
z@QFTT$5G0Qn2E)4(hPiiz@NTTwk6=bH@UjebtC*H=Pi@}*f@Gr`ek9#X0B&lzKZ2d
z`e9|_C5Ot@*zswj<5jQmv}YikKE~F-@*8)ZY`)7RU(fQL(G%wBfxQqO;&pt6Ok-mH
zY)L-%{sqttMj?SU>T>~tAE4=EK&&3Xe=xI(Z&;TVE1x<Z)}L@totCajrj|_H$7Z}?
zlGu0LSY8I;i}flS<-hpy1v!X=R{K`}N>a{v0TuPFXoI=Zvv8)Ou@%LeF@%fNmzc7u
z+11z87e4)+Yge#$54&OiY5st|5Ed5f$9#w6nk|ZhRFDXE5IapLHv&38^sF?YZ_>4J
zCd~;^tL>klo_|+W=YOBjv5vBIr3uyHV6gILD>3^NQ>In^y^RRZrbUj?78)Q$`lTj*
zt#r|lAq4IG<Ez#afhYz<hG8OTzz%C`K=*Xj*Y*Bg$*+{JXVRBuF3r_voN`{b8h4&x
zfG{UbRU6>U%`4CGl6kVdqUFD<oLpo*UuNifna#j{CDmJm3}_D5LTERSjAP_Ij^maa
zTjX3XqHvd8R{X#d@0PuWDDJUVlES8%$h<7~fRAa#JS^4WkbJ~MCWIB$nY+3E&G50Z
z*}P;gSN!vRM~?v<<-1fm2QBMOdf&u<I%mv~?HaJq8#xNIk2gl1&Vt_{cYHeu5M<wg
zi{8ym(C1~z=VlO&Oy8Li^$bChr+@l_nIrifR_liCrDUS$4o|?PR^B>w<!FRxI8xFB
zmUp3hN>}yy<K-`RzemM|CP$OKI2K4lEUbK(%2cJ$X+>J_)bGkd6hqywsW1$o_D${A
zjrGQsx|}8o3GU^$bEQqGpSwuEc&a46VaOb=9&FR<dQ(eZYuh(;V^NEF>~5o6VRh;O
zY11BGHmlu4A2p)~-dIf|tr4XdyE!jhVVPyuy&P@E=Me5&I(FkYXu}rtaQzWT29IU@
z+MSwhG<x@QLb}2OSCed%&Oe<%dh%eHnu8->%(ZFdOGG&a?**k1zn<Ca!sSOJ+IG-9
zeuRt90l#$ie@XwG<l}5E`8<}sTx3RGTg7hyp$mb-TF{d+E!&$IC}@t|7lBc;BVf9?
zHZE$zC2H3;;5F>9U`radcA2KbYZC;9S(zcYU}3k7Z&$ai0ijkyMtI5VuN>#h*Lz5l
zyLs1K@Jg%%P|usy7y~q4Y1NVM)j+$pxzxWdUJeG_7yley7CXOHujORdBh@Vb&|_av
zeba+nP|a8~_hMn!I5cl3X*F%#WPPl!FIjc-A~fn<nCS(4=0wuJ4VB0mCtHL<iC^k1
zy%UzZ%pnx+sWqx%H)ae!E0)kXYyx9U<Vm9Qq-U?f;2H&PF#Sls>c^J#aUd8?Y%1G5
zIDp>e+nPthHY%OGXbsVn$b0of=SND})IiF4*})t5;grt7k0gZ?uqkN;qdzu-zo}L7
zq;NEcnaiMO`ULJ`@5beK7Pi5$oNEk9dg=?my~Mxv*PVUysh+SVyk+blQhZWZCPXW5
z^jRQDd6>-3Qy|myV+$d1!^zaM*pdylblL`=BovP0bebdbXwwJeRN7R$IWGS&yKM53
z^F2vmPSDWU;@q5b%=^)~zU=ABx9;_R!z&N95K~5M+z>;8$;Qo^l6G~{NX$9JaBP;t
zI4ld-#;JMCH)v#x8#wI-P?mFASB|$TJ>fxA6XtTo-sBu{9a4;`$*QQ3IUpM6oKGd7
zXGt0N(PeY<4e&i_%U=@FPjPCbul=`7crUQeiUVq!AP&L8+{jYiSk;YS%5%$&y=+{o
zOiLRF(f~q3d%0RQ<$u8JAEN@Q5qu9m!l~m6U~FU}ZBYawHMkKmFjw_^z9C{5hRt^%
z$@kprIfU;`yVMF-ROwBVqzgBruaS(MmXe#N+P471##01_{`fptP(3rL`$)5#N}7qm
z30ixCfwxcHohFGK9wE`-#mr#UABe`S0<@zX#ECs3Xe;$OR!(S&?*=)o`8ehq4ql{p
zEZWXz-E+>X7--q8@vs>rW@Kvv+txZxk(cdzD{7zcw|MQ05?c5<MXn2;zzt0L29Ao0
zfF7nY3w^q341!!CzRT{&oY@q9-qgfNwSO`IcQJ4daYiZ5zMhU1H^rD^<lQ>wuw=Ms
zaHcHZG}s)(hv#XSdlb*OKiYMMq#w5rU(S3mR^&d<M8s=JPSK7xy4>;YX}u9o#PCpD
zRAthTX>WI&5N$k3O^u>E&Npe5sb2N$&I6gMzI4cGg##gv$4+_w3pXEmrK?Wu4PA1H
zK50I4(!~Ia=*c(lPbVvv^xx=@%f*otY#&u74VSLrX6KvF+4R7%ueZ^Avwf8d**!Zi
z5?=9DO}SuJ>_~mC{cdm%2(0?zB~weEj=!R#cqB;;CWN*;>KL@l%?|JhQ;;$W*_|fq
z=ZK`Eh`00<JUTk1&E>nn6APbM4VzAB!<b&Ky2L|O?VrQa1}TytKK1hW4wMtA_8&&O
zyCIgvt~bukRv)IT7Nwyktac97><maM{j2K;S3Tl}7kerLXR1cFcVxzb+3O;Hw3mJA
z)J}?-ze8gO4PfCmo0s~x4D>Vo1}q9J^r&M>;Ywh^=}H7_Os0NCJwA0Nsm;41*=jz|
zSH>^-m2GPrQQf3r<k%7v>aM=}EFnWH_9U@TiX}XD_ijWP&t^pxkH<_*y6Rop(~|Y?
z8)$g@YrX)|Z1$J>wBQ0lB1G&_Wn)*s)?IlR!gWXK;-$l`RX>GL%U;<~1(c7r%;z+`
za*-@znjC&t`7X2dL&TYf+u&E4aac-4i!#RVr(w{az3OoBCzQFZq}=+M{=`{=su!P7
z9*+loe?_Cm@RP!)Y$0dF1XWkAv_w<B4o()}+$Ud^s;nqiFMHosg*0!@^=9EwQ}d`w
zP4T2jonBc>r}s5`W71Iaf_#%2Y%*hfdLG@)P-(neFp`(Xk#GubO!`NSU|wL7%MwFU
zwQh~mh>fRl&}2<LG3N-4wF?7t(yhP!O_SIZWHt4h13$)j>q6a(x^!{D#izoo<IRzN
zMN%JwJogcGY5X5jDo6_^_LzE?upVxU=tWZlXDu~V>qKJnm`FnFRY-RAD8T%|h^e(j
zNjfzHoXxA&H&5r4mgl9&%Y6}Ta1D;%)8Cl8G(Cm)WI;0>TBP<VrIf9-)ZtZBdJoVD
zn`OzIj__%;O05w24;x=;Lc1jLL`6cx(+W#*q`GN2W!^tr$Ayn()G6B3JyZp{?vjfm
z|5p3f1woNBla<MGWXY7sM|*kq9nP`8a)w`!1N28C!G7`?6&u-tV<2N>V0+A>tUE6l
zgQ=dCNr`P-Szkj#Tp%mMq82kr!QmMTWq?Gv!qd{Ln|PYOHR%N?*9?Fx2%#lD27t&K
z<I4!{(Zpkdtxkv{{Imt?1)21jQ19Is`XO9fFr&KI(Fe+__PvG`s{bUWL4sehj^qe#
zT|y`C1X+FQ4cMZt@&-;C&yDeff79_fJ}QtD$!x&CX&Q5K7)h$}9btvw&4ah-_43!4
zZYT4MXl}Nj-F|rM^KDfxYLERNaUa0Y(fctOaC;g{&Dm=(j8*I@rp0jihwn4(O8H^0
z&}+l>`1jM}ntUiXJ#_Ux-eOd99~V9N)ccr=0h7N-o04bNkX(Ya_iUqdf%tiYP`@QC
z2<B401%n}cqtbyTLO0p)3n~-tx;;IUn#828oQBKQvxVc1!!hc|^hH173$o1Knx&r7
zk}_H5&}CXH0%HQ5and$%A;RL54W%5B^jHzFV+-D?A>!w2pYI^Uq?N2`CF#}Ku}U)j
z@fPR_jq94rt^pb5ov-4D(z8kN>_A^^C%%m<##+A?kOv<FJ@-xcuAgL*YpzC}63y+P
z-1e#qYsG*bAO}B0Yd-$jvb^_+y~)PeJo}C<WZrtCVZ!t@g6MaU0oqJSj3q8D6dqY+
z)5VDLN%QGe%&&>e6CuAv%mRb2#Q(QMfAbP}J86I;97NTqj+~qHk=dC*rvJkfY*UVc
zFW(nl<CWA&zCFMrT^kB{cjxv6!NU|0^~(|Jy^xZI9U5A)_zVNMDqYZ)Bd{{Q6U2ky
zv`z43uG9xFiL&V_V=fq0L)IA9yFLn`{;OZqE@?I|+d}q{CR8Z^w@Sw94y&4$-^6_5
zz>I9B;2hq=T}4LXgN2L?91}sIodZx)*aIZuhJhI?h+m1-Sll$5d$veOS3?Me+QO!o
z!H>v8USk!SvgDDUs#9&euv_k=!Xm{nCiLNKt?G|Lm0R4(+@6N2YCNZ+Z%oF87&^qw
zs{`AR(1|LZxg<fAZH#u}r%qLBSA~A0vAfQ=sbiqCLbv7-rP@@B$TzEr^Dnqs2}>W&
z?v2jA^2+HwTn1Nw(GF$f)1%U`(Zw%xW$^8|?fBA8`?SM}Q<O*Un79Ym2E{+WQ^7eP
z-FL&~AzANu#$eas7BMqD`D|-LHNQojmyPgj2c!GGI=8HuB~e3ynxNz=+i~=HYKbu$
zhj3UabYdo?VD6i{M3m7fK13;+1O|O4(Tf2(T$orm_SQ1eT7zEf$$-oQ{^gi0x<gzc
zZ^zY)^BeA%&8rRm<SvWozYGBhSvJnWX?<<|tcw)3e#Fz=-DT{)3oAxLe!mxwdj#{7
zmqEoIt?_Tu`N_@)BJU+s+Be;eAE1I*fSipNCVysI9n_07bRy!>I^YN}qvrz_cqMEl
zJ~42p-11+|I*D7G^`~KgB21s*;HPmwnxpSeqaLGxZKqRTjqr1CFq|xrK^{R2D0>mw
ze}f&9$15kwva{|$!UX71g4#qM-MDK%IYtOa=AjVdmJBj|aeI52;cdCv_Wp)CaE?7h
zVS+sObEiej&$U5sDfY<{uu(v)<vEI`^b2b$yUIEihsqwPri@AzaU{@ufND+cmhCEY
z`COXDGs3-Ru3FI~C1TpgylLtPKMyBlP!-t8&h?v-%pqIeZsZyRhOd5LpVP)L#1l_^
zGY?w9zQXC0wsn(K3-3)k?a<Zf`Xn)tJ7PR}jQmOT8T`fbO+Fj#&smfng~!D@`d#lS
zOe=WJ3e6n{-1?ffi3ONm=*?zOkk!R1LAp?Lh1+#T-0Dpgsq*<T6IMeHYETGrP@6H@
z5z#9F+vw`lE2d@b!sO!Njmuiv^WL!>Wz>3CB7jYQMh2=s1~<qgVKNC29LnijF<(-L
zKK@e%64rP`$A0YIJaE@vMyG5AqaIV_$j5idrNO0hx$_}ep`rU=uiH94bn?YWaW>0B
zs=)t}8&Y^Xr)8?)O!zp8bkt=52G{iyNYvylOFxKd8!BD-Pzxw*KY(~X2}8wW1t^Jw
zLEFq|<%Xm#px70;N$XfslsxE`wS=i;Dy$%RmeGLg3oE!nhvhG*w5DTz;;^JzpaMTO
zIse0CbF#qv<`jdZxKu5bg-9v{y;|0D#K~N8gX*5u0$rjs)sde1DL%5Ve3*`*C!77R
zP@{hG{Em{=_sPeOss@FXDU)yp7%{mbs5o?XrYIAJ`J*RAzLnZR*&+=z{`qQmsN3rm
zOQF}_25Cpg>X2hThoDV{kmPrTR=Q>}Mc@XzC-8}^<#gD}R`7%<v9Mv!p2gKMW>6sI
z7#_b`Aop)i;qYYX6BBwm-v3thx~5^KkQ4oLRt%@^vy+v9br`itq91HKW{#yQ*AJew
zW8-c%wKx#Zlj25f1j36?k5NBR5--DymDuk}I4SzHsnb^B#ULy3y?dLULnLMGL4j)V
zTe~N%Q>EtfvTxkGRca7JC=Pvr*~=_-F|MCP>}R7+ipE11gKpQX<TTWX0}YD}uy{YS
zt|bwZP5*YgWnnh;j;&kyp$L**ru4t*;=k=ZKx5o(s9pgvvTRbv)DfwNBOZXlxNrKA
z!kB>F1qg~9bl}Y>%(cKt&Ppay<aRS~PXb7S_O)E4_wc8Q4D<{^3Pn?WI~Fr=<XPa|
zH(DM~)kIq^Mh~0j_RjRj-54J!CX5}zQ0@^zn1xa?FQl}F=v;lU{rEA0hjX9ht4JjC
zENf&^o<rvND=Z?Ajpcc8QL2{#xw9)lMrU}wWv2kuME9$P*Aijg>+R_s>VYbSAsW*%
zTNDzEXj$l<&26NrYYP4BGu|Q>oq|G&@bn7F)~aNM>j4%BoM%}Fwbc3=<;i3X-A6o-
z(0m+;Co!i9@?Srq#98shr-J6cV`zfji+Cs73BS_v?%gn{=&e$WtGpaoIOi!bal(zu
zji4+tzHrFT5CPvkOvyx2OHB9dPl?ae;bVyBY#ZX^=R3~}AwivXj7j@lMjChz--k?V
zcAKDIkW_js6?_-f0KRArJ3YOsxnrBgwYjv=-keeT{t73T5&h5k{ezt8@TG1J)}`$K
z04g{50skX?_{7&OTcgB-1?-p%0F&HD4sp&I#z{`Wn`Q^SJ=ls%j)1d9O&qgmq_Lm&
zaL!<=d8giHhrg)z<d>@2kO)<B%IH6Z-UV}Jbws3#9~6;t=qnf3(ZG3FcJZr;Zfu#|
zT)(mF#lQgC(bP^p_sQOjS)w>3dA0>Zu4Z*ks|$yr80`!WeIffTBAkvlFI)Kr=HZ&O
zp9_@qk7c*CzVb1*uV+(zLIaN>0y5wD7-krW=~1DVP2~9#0+Oiq_k&D58x-c2YJPlQ
z2VS|1`InM=$^}vImuZGUKQ6lv9Q|fA6dvUbshd?Fe}R;X^|#*p5F{Y{I2-BIR&iTI
z0`c1Y(c3%n!|;HLyR|3mK%Tk)rM8K28yR)uZ6tCk>ngWjOC&f%=XvnTN5&WEoZ0vS
z#XBj{+BTDtX!5u%4dmTzsc5+wiPgYWYBPNz^KT#0dOEy#%7lqlzy~!ee!kZ?2Aw-j
z?4;d+!v-h8q$I=I8t8U4&39$;W$$tMuCo{w)#`bkGr&Y{tgeqqb?ZxxXP7PYAh-M0
zL&FP!UeC`-9|oD*4Ncwc->l(vqC(yWL33jLKUkpiR}^5OKRH^^$ge2sgj=%JHOVON
z*#!z&wZ2f)06_^*Nl^jszH;Rb)BKwFqg#1m37HOa^OgA2_%DCVZ0iA<SZa5gR+L7)
zz%dszxSE)hM_Yhrg}QrQ;f~`?Z0-?sx`#Wa7&w-np8QKXcl#hLAG*Erc%l;wbeN=R
zB7KAHegf!RHUn5LTE&6#IRZTAnFA%LC^32LqL3_%SNMq;Z;&9-13AR}iI1DD5$DSb
z3NOz|d->SlcHXe}H)7xfXX0e5{Ok<>0ELKv!?LJ*8JXmAgov1)ou}{P*Tr^8$;fWk
zA8Q3}^80=p^?k|!gCE3ffARblVI{j3B#l%DdE@^I>E0Whq`;rEGr2e%31xRfn9H`z
ztJFDc8&c!5fTd4h-cJqnhCfQrMB<yc%f)S<;747~*G0no<3}l&4_)f$RW1-+9(3z9
zWTJw0bDpC=wO>3koizpIrJm4duB&oQXKZ}`ua$}7zUf!&F&|$~926w}RDD&w$%!$A
zw}fqadK^ueaUs%g6d1JmS)wSSwW;##8ABv;wrU^uCl_-4-2GL!^Ido$?P^Z`#4fU`
zZ&R+2PRd-<v%88t(^G{S#$-JhlGnXy45q(&s1}5ugss`9eWWrNjY%CCj_QrP-K{jQ
zeRmO(If&ci?v_9+um04(t;THdO?1q6QlPK4$SjPp(Mwy!Z%^)X*ZtUQBGpeG>c`uk
z8K17hr^69QQsPl3O^*Ueq5y+0D8l?T28YU%oU~^S>aJ^k!1vM`rLF^&m|xa#Ob^mn
zOf>PK7TVfqdOnJn6EA8t&MWw#FtehT8m8&kTLWmKd^#7}T&-pI%Xt3zY>Ljk0#yYL
zMr!CU^C9nf4I9d047-)DDeRO{eK+3J1{!;bH@?Rl<FsC@TIMvaGxf7E&s{#Hwc_Pq
z!7ZP&Z{Aw3TDzgHHqV@Rc=5o&r;v~;UYED=x^fFb0i0ii6s~8%iw{|wD2ZAx7sVns
z>?svUtt5iCa{NbrYOENHAg#vH7n^0(BWMHAhbrr?xy?cfu8P)YvF%g+t5-N`3OT%5
zYe;t0huL)=m?TPqj7IMZ60G9bUjs|oW`I!cuwpk9o!7}Ao%$7tM8Xm=#N*a9Kb9~?
zkZ=7Utng6thyFouQlo*w<Z?XU#H;&E4Ca1{G0AdkVM<x**B$~rA}@eh0r6;9K?~Zv
zTiA1sTZkoT&-K^RHMRGMw@b+fC!dqD6-M|3@A!cJ%KvU7*x-TDPIp_WB^!hRXp}>x
zE%mt)2&B*vL*(Doik;4ePT5~IM==H@V7=+DiuorV*HP}LdG>NCXD7yZOu5EGGH*P(
zU`!_r<&yk0X^{)gs-_IIQg|%5bc#D?=%z8Bzqf+-o5ZAB0_I!A?<2oW$-`<AYzHA!
zkF3oIR$<sK<qWZ>-flzGe%)O7a$N7>#Qa9_A$LAP??_+f=-)TN6wxX!9HHG=$t9ng
zH~D9EE}t$!V&2%UWDVTA1Mqe}#vhqTo`-_+@lLWGjvl~8r5kkbbPl8csXF{r5RN%P
z`1!+32tn-=o0I~crL<E_)ORov>T}Cg7kKd#azwwk6ZSsxK?NT@L@MxXj;T2iX$M)^
z6;wCm0JO2U)Mp8Y!Aha)#)T%3q)J=U_A))Tz$G7Fy~Kwn`u9wP8vN5^>F4LmgKbgd
zzPe&Eyibf%fqJS81BADLjfF6!Z9e((D#++*%gjoY-`rkV$D{lK>VNvvK0v>TEP%Z8
zx}2dWOM)8#&-M1f@_nWigum_raMWd1tbv8Sw;|xoJ%htj(G;>1YvvVWoys4OZ8B=R
zkuxYy1$D<NkHO7keXXBp=Bb`bSNy^3><1a=r7#E{mDkDw1E88=dl3nYvG{E3;cXYk
zz#IQr>6_O7ZiTS(!;a067np6&wxDSYyG=a-$Ru9ys4l7R5U%HC@t9K6G;NzQW?))D
z0TitpYfOPf+aDR{E~BUKh1q*+gA`FHDv4ugGP{<u2_e?-*63-^$%2`@^p1wvQd+C3
zS1~9I29@6rO9UeXjEa59n5!@R{MRD@_aX(AwupwvCv}?FsUIlYycS8JeZc=#A0rqe
zhb>;NtZT-wDf2>Une$>cqEq~L8TZ%`<R6>bfdWNZl^eK`I~?SH{w12i0cv>a+Af3r
zzf_{=XNU7k`x;YN;<2*D7v4uYcs3t@_SdLlFywL+^~0i`UuytW6QF=Byp>q?{pTZI
z$%6xnN0~5Olp3;#)mB|{eQ^!~Hf2T$VzSw%Ggo`zD?dowh(^z4x2T{db8q!p^2PYo
z<>G3{r4hxflsQ|4)&D*jU?3kEqU2TGJJuPgwKJAS6EnliB$C2_3|ux9ppZi|Nyr|`
zQIkYr_+=Y|(vZrM_T;xK3(r5<w{niAFM<B-l@@fJYtr=yoh4eo;{gWz>2QBuq4KAz
zMel_$NlEQHir@nyaO->YKs1+llhzZxL_WQ*i%Ri8qvv74uc`3{>Jl6VOVJcbxgrEo
zrk#xK2a%Ta%8_IX%3juIb`LGG<8>nu$7(K)ykCSDM_BlfkK46TLho;a*o$<NYWQn$
zr$VqHK2eqtYhP=P5*c5736jtT1|Ke+yIQCjRP`juj!8V<^>YDjVm~W||M;vYNGk<Z
zjKfB)yRKOn$td0e?mb%m(>anjTvmm<VT-L40YkXlpa1%2afy$^Mn4G-Wn#l8X}1ON
zMJIg^=uYg-gl^U+rfD6EQk1&XW&fUPEvW)L)he1y_c<cgh54C7{_*T)F>DT3ah-;^
z$*IC*!QytH&yU-s)4a5QTAzQ_{)Me6ZRmh^+EFJ@Ox-uo4`)>Q^0I^@xJ{=!FZ8St
zmQ6h;N-?>N9!O&k&G&u-k*5!s2z<sI^K;zkALCwt(z0r!b4?<;?7g#>YtkdxO^Y+m
z3Ym)Wi?Mh=FDCKke^SrH$3PQlqR&VyGdg&uPT=-2s!L`Bf=5qDMEfkBrZDfJTCFLW
z<~=R|l=m?h%(FOJTEpV&CLLKY5}InvAEBMuNN+L50g28*_m1Pf3o#;Vy=_L=i$h<o
zp)6$kB})R}cTc(+e1${0P>zEi`RAu{@=^7ht7&c-(Kj0j-VcfcDUjY;L_4?rv26#c
z@a09^-z~2YxL~OJwriFFj5rguKVzbY0~d3nj^&APVUaUa=j{JZBX39eaD#WS6xN6_
zSzZI9>iXaA&nsA!@fI!~0q3RkoSPJQa0ndUdK01w)D3?QH$juIa>}i%X;4+Kn6lCt
zI<F=@`X60|*h`#~RAi@<^WTi;>)aG?*u-zETjcig?^^J|z@hr2-oAIr>-$%M`Imbh
zJAu-(cTDI-#4t1)tH^zOii;nai3cJG6q9LHHt%e{3%5T$5L7s;NRPbus}TF*@s3ZT
zegrE}a@I~OxtZ9)ANyj0d95x0uB@>B{HHtk=Vq=`!P-JMsd8o=iPJT0i#ft!8Q>U-
z_BLm{fstz^m2g-?phOY#b1V|9fUG)!Z`mL-xA{2@D@vw!lBOF<b;hTuv)S9t$*qmj
zI>U8dRBD7GI%*u)Mf*qsxWz^FpXt~?mPu`sL2hN|aJa2LU14_o)1yxPpu-QcUckj)
zt1A>~5^_RMhw079*Fx=To~CO({SCH|_aC8OY;x8<7VZ%V%;9`lhPzw4rnMlgU}@hR
zF#FXJ^y`;^{+Exy**U&6X4z!sWK6Mn$UwHBqN0V93QVCj71y*2pjO*rFp%w7VrxQy
z$(w&C21~NxpVAb)e6KK<CmT&B>nF<ewwM3LUfZsul~aRjHhz+$KSbB%ToYVU_fqcJ
zp8@xuhTSLvE*GFO+FG70N~)Hp08=l8M>pJ;rmpn#SlJpExg#YEyAdeeku^Dgfu0Qv
zR<-V%*?ExvMJi{vDH*ui$ZqTQZhICvAweur=vJXWF_wy5N)iSl<@-@rglz$WpG)`n
zffd$wIL*woYO=bw5o#Jvvm8;QdXA%VrA6)DqXsBZDqQ|46gjycQRrABZ3v!)NjJyZ
z({xhBUKz~O4nLPYV~;e9=sCFXGKk#F&P&QpUYmU)>pZT4>tkJ1J-ovy{MO95{^0ri
zCk~tx7z8?DSQJGckS_u^OnyoCi*a=ktm@PE5jC?KZc+=_M^umosnsdW(?L;vtZJ8N
zBV8iKPcuq_Ziz-#@uByhB`JXFPv_Mw$}4=yma}HZm2EZZ=IcCijGt|}ghhXz@3-My
z{iI1ihkY(?Gn6GrC-r62geKd+zlxIreT<t^(ZKSen%+W*^i`NXiGn*6*<n+@W9Yqi
z&Q;mlot5pmoPHGJ+87Lrw11x#Jc0Z<jvF%?POx%2fOQBIyF!Q>)xTdqZN-OflYii*
zz)1Ml%5b<J!eT&qA7;K^KU(pPfW9%6Ty?rm7K?!LL{dRmbfgyW_1XG9V^Kl+sFrZ5
zUSOR!M#A5EB~jT7GIm}=G&G2%7M@ZfBLX`G5yjJdf0D|yJNwN34xd;6)8TODAWIXr
zJ$A52@_jfiYho>>U<kXxsHdS=C0kR0>RJ!~o)<X7Ezv8o*V+{Q*Jk4oSb&z~Dj1Z7
zeV?MnhIyaX5ua`{BU!XCV|%w4DK3=2k5z+LZn~H&+PmlTlb$_Q-3)^5<LK&&*g(5K
zte>kJP+5@m3}E;Q%{(%AAK)`mZ8!&Ko(y*jjMVMqHcb3;tpDr-<OW3IGurQsEiQ_s
zd227VaLi6+uW)D-25+VLn9%!MGBYxXkQAF2SdRudK|l(t+NzQ&j1)R*-<BU(ZloJv
z^T}w}_5367yU<h=ev9!*&X~qC*Q2E#4Vnhe>dNPr$y>P6Sv1}zgKZTTD6!`pii0se
z$X=%jO@)2`S#~)Hqq7Irv+qMDmpy&+iz^Du1nj#+x0#CC<0IFbW_FQlO$0GPujS0)
ztFONQEE8CV?%TP|r=r*OR<6NJY!_sDMGSrVm>!h&{^lV&JE<zI^B(^>Z#p_CtR{jM
zM$;i9HS)-v6xR8%u2c%mr{kq6>hL+iAMjEO`A5`{It^(G!c%IvCW8A-!q7-b-JqXd
z9S&t1RS?gWw<fn@O3a$vR)?y)Pa|>(+-&Bbi0SQpWT%RLn@Jmg>}8Sq+jriprBr&o
zqgUTBZTi-h`9RN&Pls+GUf6%jq10%AT>Va~A`ch<9WlLRi=6Aj`+b06xH*cMFAz07
z9i~=th9;-34>!bgbS1XW4DE}jh-K~#?x$<sKOPG5XPi7Ey*uO}AFBOiRB8wd<7arU
zUFszy|9mP)_IsmiP?HiCfdM6QLID+oOEwaIu80~ELG|t4!w0NE@y+Vg<Wv^%<2byA
zlk8od*TT(krW@J@=7#zHKS&J@<}K~>wHFs7g3Zs~B_3jWh0-Zg>Z&DNY;t?{mT#4@
z8`O{`R!jYU96ulEOLjq=c$X)R2kouNWM2oZ)JX-^yD;B_kmyUag^Z^_1N_#-8;2Ks
zp<`;ih7b+WjiH+zJl-bqx4%~<tN^jUq0-6osfy)QD}oqsjQZovm*JOKjP6~}P3)m>
z%Bq<_KD`p53!#4h+ZF)prvQ#u;$rUwAr*vBn@xjkWq4s&32=%IY_kCc3N1IqMplN0
z#YO7!jU99c2ZjhbDJ`t|=B;IJk#LJc?@^-zW50p6yCn$LinTAs`Fvihc<S|*UOSEb
zax7s;D#c@9z---xgCPY)Z@6tEUayX2q?s4xnCW2**>NHBMmFjVdL6inFiW;I6Dj<e
z@6?qS;^{9i1y`f$nzGnMA-cHq@X{tn?y}a&W_RT}^lZyo(VV-rpfpW`?#zpj(>WA(
z@>JW_E)O34Icn%W4joZa%<P@UO-T&=Db?g%``#X95psmEO2mz)16e2thad=qIqk?v
z=o4jk=x9>P#7-aFQsNe<mDW@PT4zhFO@~7Bl{vm8p7XPcH?Zp(`#*We68rB4LSE4-
z5-(zX;JqnL9@80o6wfzk%!8_4i>;&2JU}$1S3e2st-Jrha+Jl|ePfLHM5~ei)bP*7
zh~nl19T^zo?1z)?J!oK(BFJh?;bq*%%)jG$&b#pB3cs9?_4oP0Ao$h5Ac0SY4ScNU
z=#c9-aikW+Y3a!sWyr232&f>KJ<&l7BC^r&WA2@X-d_B0D2fXc$mJ8<+tP4v*DMbk
zFrbH#N<>%aXA|kf`(TF&OLA?SdGrk;>|@7qPsTo%C?)D7s}1?zo(NtC*cxR$BdkWc
zM6C!rOgWRe23y#wYt^h%Tf_?p+@tdbZ{3P&mlmBC!Fk-nvyNL&zUk50h>6e_zbzX>
zV}WDyxz45!SHDXIbD3l?&1gX#+J5V3)hi8{+wr>@j`mWu_IY=XpQ!QgT3cj&;7Tqx
z5RwJuuqKjT>V7h>q_R1Ujeu4qP?hjVo(@Nz+I>;e$8x!|=;UVIMw41VlE-q<!-7gl
zm&U+-{REupAt;pM3nxPPhKp%UZ_^BRkEYJD&(aNnB+ratVBMQmXN^n-)u}Q(xf)c0
zjd>qbJvX7sKj)GPy!S)s8;#7PEZqh1Ix$=NW*idrHRKu3rIyOpWwj>F)R@GrZqfRK
zZ$>Z(e}HA33$nE{ZK(#+XH${){n&G&4qwP<`7Bz2>0I6{;7Y(ad7QL%?`GmJ_XC?f
zuz^t*Y6ux8k^-L?MnHYF+4|@r;r9<FBcPumsUcPJG4ObNQVXWQa;YRi!7s2}2hpU!
zdSUQh>r{}?-bi?T9x1GWVQTxGrAQbnZ3^V;TR|yseMAtF5wM^Z^%Q-fPn^kK7mg>9
z-!hSwNYAW-UFmptS|ZuU=M11SuUu)y6rf1?cUHy_C-r;3|LJ#-R}_kG+x=fBdVM9{
zHW~dfG(9N-`)J}JC*Otp;mh9PMghs*Zpt$LzN##rK-t8+5*F(vyx(rOjtL2W^`Vbo
z;i!9#T>R}q+~$VW2c!VGui>$J!;6z*(*vnK-g!D{acCtycXK;|(|Sdy55T|o9~K2o
z2@>(dO|N~sR9+>*$ACVT80I87kQyREOa;l9T<a>wE+V2wb|{x=W4Vo#+)B(ZmIa+3
z3rU0SxDBv!WZA|;e<oAUWxcTq1$lddglKM$Kk~9qMsJcuwb<C#>A5ZS9?aXGpW6fP
zx2C(fB()wBt`Qnd;G}Iz{FBCks~^Zd#}@heNrR6DBlO_w_9J3P63r!dRl7)#0Q_zx
zW~^sc-O!0yQ2d&MoX8tS1t@`zYKnk2=3m$+irvvy?waWuN%#o!S!;4$mqq=zIhLx@
z<ApM}_??ljsS*wyHg@>dLG8s~C16inlq(Pyu)m~Av>kG@PvO4#kNbawF@^Gquoxjm
z*dhy45}H=##lU+tDr0?lrh-L~`0`Tf`Cjz4`@*QJ3N_?H9qataFhaPr6QZ#JwMd1s
z;<;c<KOsls7qUm<l@olK%kr$0>PLRCa;9LDXZ6T7>ekGZ%hz5nbh4+c`%kx6$MoM6
ztw8e`kz{^FSii5rmS4bbdhcw$nJ~ZL4!ZL?cHCg9?0o^(6nmCq9c-*!Kby4IrKioX
zMgJ?*Lc_7Y6ex_e!k%{}Y^en!G8YN<M@=^-?;UWro4RkGVG`eRJ?B}dx_vh926J`d
zY4jWR0ZSx!07`(}qH98lF_(I*<0OpbG9c3iQF(zi0c4kjgq_^>YB5H-&5Ba1UAfX&
z+)`5S?ty2)2Z2cStVhGq7C*6RI+o8|Ef=m!5eubPH>|0BpkQ)0y4Glw0lRqb-Syj_
zDJ2WDfM%+E3!&hD&mK9!B80sarGs&z>80*?5A)Sfrn<gx`cAoUe9wc2KFZiljdozg
z$B(Kt1oCvyO2#X5m}5ef0jLZ@(y!YtSH$b4a&Yc^_dC>*h(YXh%$MzlLl;PM0{Y#P
zJTn;rSGY{)bDea+<1+%hw=@a(?nI{-MKFx!ntroiJtT{VOXvMtGAo~2Y2?`@D*eYp
z*zU9NHWtliRFLtvotiYRK?(s13V`dNLfmesrabCLvYKOb)@bvUNvD@h3B=zf@OwnU
zIn^Q2+>A-UyjSx@1P9yi<uy7<50Tyd{mn*AA9)*@nbGkVgYudC>doz#6N7EfqmVd#
zne?qBc{!LVX!imMaUMeIvV}+@GLbaCQ#0x{c8Yax>iWV4c!blBBsC%ZNlT!Ek6a2p
zqv%hdB6p7{*$q9@Z)x3}8Mn4-h5Y4sg56C87Ih4pDkd_f3@<+M*nVoh@$9v^vEcL#
z82tx|fFlJ9qvD{imo0Vv#3$h<)bcIvVIKrCov%;IH;!A{`6?}PrL4&}nWk@)?aL5r
zJz}=%pWK-<G#e$5Z87xL>|XyJlj!#CAXAqISudOFcQbW}8Dl6sa@%dZuQR;+oaq*F
zYx*}Z5{tv1*}Cmd+;R$yM#cXpnSoF!rsqcx@(B?awb7w^<lOG$9NVsr*HEWc!7l3z
z5bzo{^>cX6@1Q}Sp24@ltUKs2FD=6T2@)w#AG`&a0@$0w{pI_3ciaRuG<pAqT{zH6
z#dkM_*UF@VVaWE4iV!ned+dX$8%`D(=q_5m`g|EE_gSB=>nz83CTAJsw^^aNL$vvx
z;KV!l$+?y7>~HK?5bA@R>-^=|j?}u0letXq&?%Ob3IiOZY1LJJ!#gaUjg|}3(?!>t
zweMSwj*Mz82dNWT$!0NI?-)mKuCizy<+jgmaL*>OIrbCgtoRH&f2Cvhe)$F9KOC$E
z_(`|`<U=TF!hkD_hA<7P?AQQ%7_LLxh~9DYt)2ZITkO!Dc8Yq)Qo&fCw)$BI621%a
zeAV$che?5YL9~Uw;p97@$HL~fWLA|KyFA#gDCG5O{W#y}C0jLd{I>CXf+>UUJq`E`
zE5i8PVx;)XXG<)F*I`AcP=lT7$Euo-%lHkAv--<j<hgR~kUY#!oMqFIyfTSpWoAtN
zjxLOV?+|GgBlY_M+X#JxZ<gcER(^4uoqN<`52Kzv<(r6MW_MOlZSPpyuh}zWdlE8H
zv_wJ2*+Bl+BiP`7F{4eH$Y8M0EfKRtEO1EvF;t53)+0ZF{g@eOzq!y5=xZe`e(gTX
zYCs`i;P*SOps7Q!)NsHM%=19i%n%t2<{HkRQ|Lr&x|+4Yl(EM1^RX{{dS!54OXQ=4
zw%&hR4)o#rj&lvf){h_cCn&GuLkg|3H;$phZuy&ZHs2Rl4!n_xxZ7pzG7+Zy-9|BP
z*ejmHB-R@5&ea_jSH29cSei#tB_?H7y4WfBL%48UYM`G8umc1Xl~ocvz8xQ`gC%;4
z26BB&>iV5qy@Gzw+ng2ni(1QI>hovIM-ityI5<TT;172B?V)Iv5c?oIYFN$cTWj44
zgt?NX9>uK7##L+MU57f-l0N>e>8ss7vfmEv1Cr1La=Z{$R!ji06u~JEM(P|Zdoz=J
z3AtON2*4khj$)6kM2j*Rm>o*{+gV_G`{SZ#zj(PC?ik3qhZY25NvnPe{+q|7Xh0~M
zynXIEZ@F>KhU`Trwt00~NGG0CA>3{!pYD)N(8(yZm6{q^KXfI|%0n40i<<uZG~jh)
z)k&1ZK`8A_-H0TTZsWFYS!+8n5m7)Sb-cU2?R1(G0-T3yOK6W3b+Yv?r{355)auvA
zNr>-}m~+=d{r*fO^zIX`xoEP4T06F`V<n+N!`lzRt5!3qJ{T4GUpL0Chv}Wvx+Mw`
z%x9>pDVD3s?rrd2?wjHanYsWH4*9*ZCm7H-&iBq72Xfx>AACJjyu*QQVBE#e+7|2(
z3-iY+2%JD`mytbInDvG$Jg?reXe1|Uop$G2bJe|W;v5f0>)ABM=lq@50Rj{3{o{gi
z_^IxF7eY1ggO`+Kza#itW$rP1{!yfo`Ovh#Zs4vBDeoMiiFfJZseaZq(Gp8>lO07E
zGOpYAFwq5s-`@wJWMLhYf50|uu5B>{ux-h=#y5)C9n3rzS8Ad8c*$Ol_*X}&HFDOb
zsK2vA0%nk@KlZE@YERq&4Le1K!3pn4L5Th9AMva5)H2wA#4H$ukRpKbkm~K>CtJFQ
z!3(nLV1411!pbR#2r@XPwOwDie0;Gud;p6;#~Oq%_Bt|}5aCqb(zf)Set*v3L5Lk@
zj*SN$CjsZ+azicOcfRK9N<XIlH3O$`CRAL5h}dH0GmX5z!5!G0FHm}~dO&;d<mw5O
z8K^mSszhUgL}?c4EK(cC^lZ2~V#}fjps|i71t^dH$A->Pbm4ZyT^6o+vQtN%aO@_Y
ze6${g+Sw;h;s32Y0O1fo|MaEn#WD-z2<Wj^1QZisHP0m5V6oh(47R;GFnOQ+1X~zD
zvnv$TODk%0-*|z98Z~F#QGU)mZVcIZrL`LUY(qoMk|oxu6(>bA_;B>;-_Qhu;D#SY
zSP~%TmMj;n10^gO^7Pa079ifV43uli3(W7g#~l*XolDM8Dfy!c)yqCMWDD!JUYpdi
zGGbInn!XCg;Of0}oWQ;kFuHrk`JWK>2#=1Zxqa=2`-2&Pg>axp)U<KwJ|-ML^s1w>
z0k{V{m^yaLIVDtHF#>5S{su6#RK@|;n@;X-mHOsf-(ogqfgHc#q4#w=H>-0n<+Y~l
z#7&9T5eKdIGP_7^euUj~Td(VOZa*RXMU@s75ZY=6^CV#@)nl&CqaK}3;oS_yYMnjb
zKGK7o*mR1M$0RWoucJV3dx-7QWBFj`z|k|yHh+iC#<DeQ0t#czt|D54^w22q4?CqB
zP)TWyA-rg~Og?t+$<g#Oxx71)L!6i`(QvNO93(y9RJ}+<3EZ}8;v4fE?88W3I_*5I
z);G=BnM*|YZO<vt7brF`V=vy3TAj@oNB5AfaNtXVnwk{YH2Nj(eH#$Nly)%mo;J1@
zxxaAy4xrH97hdPS+)4-U^u0+_IxFj=Zj*U_K7B1Z565)d9|?i@AZn+X<^y_9l)X;v
zU->t%i*xWaeThD=;R*^T;k790Of4L2=|z&!pd#Ujh98k<3du39Eb4P`6+7iH1d8_F
z`I>$x)#Tz;y$sqK?2!BGd-x#$7slaPh}_k)`V49WTaWRf4?JTp3SB&SZDj3pK6E%;
z!Z79~<t?ef2hJ$kCp>!1CHM=a0O;eEg=K!>^2G<K?rMt4&b{G|GNQ&(@Lo_JFaVw8
zon;~^4AWj<ad^y`?H#<`FF`o<b+ob+gEjVDA&k9{?rk1Lh|?1LtrF1LxPU5-mXrY<
zg@AYzj~1?kKvV{tJ-=j(ypM!epjOzRJJov2ulB=j14{GPGTwpK2v(#);4y>O43^)R
zrB;sEol}Nk3{dY(o!Y$;FIV6yhR%%sAWsSu0?_R-2;GTca#>$k4`~Uz<Z9)`y%H<z
zjXbion~9`?p;=|YChaJ_^sr#gD%!-#trP8o0E_KsubU7qiBLAr(mZqlof^5np1m4W
z0YJv!hQ#DUQ>gjbJ!mI@7<xAcuzXJ2xLJyOoBTA+Z7VzAirZdlVEqRSQjvAD^bJ=U
zhXYp=vb*FipOI;OohzpYe`;*s{km~8A@bMH*Z@hFqm91*>{T;iL`}=gA!;W#rA$PP
z+s}Qs$pIyPJd1G$pA?*n^Jb7|_F;y?YF_PUT-YC*g|LGK97Jo|HQh67oaFybqyXbo
zNEvG*ERnr@#@QDr^RV_F@z`Lo{dEu)lF%{ZjW)i09&U0WG;NeleZ}$H=D7jA$zR4&
zZRV&awxZ~J8xsA&(WrO%)$3^%Jx?=R`b5pYcL;j_=up+0;e#b{ESJD&_*<JMDw_j`
z&T2+yEa=gTQtzZl2_PTwi-GbU6##cg&sdyYGADcQq>E(s5r9htmAXepL<K6R?@$|h
zJ=AjhVqNvGovTHX{n)P+#}`iigd;;*#Vy(|ucL0S`)e)NKkn$FP|upNRumv_4dN+q
z*Nw>a+q0Zdv%N$e?ybqIH>Irl%~XM%&`O!WAtYEB#x7P)=VvA3$djbxXk=eyo+i<`
zTjq*&ik}Kb{~fD+K%V!~4Y@GVlt*?<kii0NxPC}M82s)@L{0kLm`occStN{n0u^CQ
zvoytx%bR}uhz`<ZK$d{~pxiIfF`f3fJdrOH9E~;yaMwqAk846l`=DTM*=KeaB6Ya;
zRy&AYqp#YJ-5~RzB<XjGa2yYn)ce7XNU$VclNRPZ!8yBYB-~f#Ru>7;4En~jk9rpN
z=Gn&hzMrpY92<HN8f66W%OUsDKQOPjpa?I%<w9A6V3p&IDc%bAxu1!>)J~*--Nu$o
z8_C1?B(#MY-QtDH`mun!ZT#;aR!m_5@`g>Bb(rbtcT@urFXvZa$?z<tjy>y>`ln%~
zfP{pV1c7KX>1Di3(_yad#*2$I!<=@frjZ$v;<9e%nm%G618q*^JyLUqjj{@GmGff%
zLlS{V9|rx-<H;zNQ#FC%{UIsAUtkdwZ0@|S41FJ&q*%y%v%&mqZ5k!!Z^d62a{K5D
z!Q$RL-ud?~LN6nJBNJDW=*+{=hGDL&qKUokNUQduiJ8|+***VJ$^rc!^)pV9$<8;F
zgluYvHDfscM8j3n-GOxJU1&F@vJVJO0~|A(zk+@kdCD_iwB%ZQj=iYhY&5T_z>yuN
zoNGUeE?QuAHhmopv;C8rQRpWG2;qNECVSS`kbgP#>Fdy`*#gaPZvi+4P?kGDeT@FR
zCy)!{{5pHejEjNL=S19_#%KelZ2{}t*zCa-|G=|+6<oI)d8H-Mub@6?XXAIYqPI(v
zzytkbc93t3`)=-Tbb<K$WR9uiuY9J}3!3%gy~SQ_qUKSKd6Nnq)Zy;a9~H5(efMHB
zu>23}!NCI(k7i~hJw7EV5II)_QUjm!J*3-AfY49rJ7hid%4+pF_n*2iNU11D?mFZ-
zsXa|Etq?7qdF8!m9XdX~!i!gdPvD}4r>|t+RcM=w?F?xwaa-3DD!vEMe^OX92Y}7Z
zb-)yqPdr2dx3)eYd@q~Qz2i-O*e&a6wy|<jXViC5<l$XMY82BkocV9w>jM%MS(v?L
z^7;Vc-2KQU`2Dc)nYVJ`le?i^gBx1flPZn7w!IVE1w7^tXfv<>%DsU#VSVt`3(?#u
zt~W|(9+d`pBS-k(Iso#hAiDyVOFN?!STv0Pc*#xyU>W<!_bg*HGue4Ip);!|qYCph
zTE1a=Ct!TsY?sV_?aOgKVPGs@(cGhH$2KF2za?QHGbDt30$ARhiQv?8&$6ut600??
zBb2m?dr6<g%u5_yBR7sO`c-yi;9lQ6Pe|{E`&-a%mrik~$!}^$V`A5#cYha7jWWC_
z!8I3;2CbbhmP0hkJTVmbZ7c-4_h|X~4v|ss3Gz&dFdK8t0yX9hRViFa<K=t=v_D1$
zwDoQ|l|05BFpT$-(C@j)a3yXBw?LlPSKp_go=H>axIex(AT%liH;}8dy4%2T=O#U(
z$#?51Trv0qD{8)v=&vA}PH7V%gwcACyJ&NMZmDo<-&j)rC=t(WbO(hOP{iO=7VuA>
z)b(d1PsigW?Q}CK=5{%XL;gylY=RKFJfzr!Z~ZRMz^eQ)n~E#!%y${JCQj7JLf!&p
z-crqA1Dn#9PySBkRzSM%_E*UK8yx*(;N#vK-Tt_pc^sSH<XizT5-R%>PXY7yRQn6&
zo!kqLKh5Z?h5b~sUo6m{)bFthtEiMk5gxdSuZ*B1Dnawrla^&(^Ngg)R84r2O}z51
zblLDb&L@6*XFlZ)Pxm<o>p$f`=(9(3Dz>)_NC}!;V_kZ$`Zp~rRo3LG>pstlS4>v8
zFGRT@oc|E(GffegGI8?jgW>0Yn`mCPVS47|-k5Lmb@uyqrorW-{{&|$_z?DVy>LXq
zonn-95-__E8XwcZ905LEkTIt=Uh4_|^L;Vrx~a^?mvBD>oIb<S<N_`Z_h&=U++n@^
zCi}e+yedVo&QB>i8!>NBfSUSv<aV96fq--*OqN+^qgz=bpuV7hPA;X&FI}i@Sxr(2
zL47-M8PbL-;h_fB8|UDUV(E45vxJMn>f7cPwH$Yg0eYi{N^c=V&d5a{Z}cxyUwB#n
zk*5K(W`a!dTmcf8(+Ouep36|MTj17yCk&%r8ma?$)aPM!J5WYMa_HSuW@ESR|ML9g
z!!l}3XSe)uYn~~?-_a8>07|%vQ=<kVZ!@-KW?axI+kI&diJUfxi-0}9+m87{L)<<-
zZV>boy<RUd^st)zuf=OX&Mo=Vg%EBHL#SA9xjS$9?S2^IQp3usMnFS%qv6G=v|Px?
z0BaYON%8u|JH0wooZBZ~DQdUzg47ga#CDaT;dPJY@NjWcTVuN0n4|5dc*RtS3}?f?
zQ$HoV9P}l*ukzkQ!A@lnu$n;-`hi?NSj_9Z+$TWs&nY@a*SS&7q)~bc>h_m=z*6sX
z%BrmpxwI#YPohM5nOoM*cJ>y{$DSP-rHpw`X1=O|JDpOWjcO$ROB@Qr1;EzJ>rFeI
z)Iuy%!~uRUbEobFmpI#5hL9#j?AyIbA<Mc<8MeOjh?lxbZyTssMP*h)?&$(_0Cyr?
z+d%^+NwNR@R5Pnp^*gL4FW}K*duaLBvlwPa(?pg0$=;-5m5iz$$OPC$C3`6Um2<c&
zK@}z<>y9w$XK0%3r;HcdZ$OxSP1Wk8)*pWsPxm;|QfG`_pXltH8|)rVYl;3zGg(Uz
zf*J{;5Ua~?IVxw)Jd2tPZmP{4ZRuNT_cpU&3NTr6Fow^(w*UCQUjR(ZV}6lhBYLXv
zYkFf&<|G>A&!du{H8ZIcicSj_;9aCJ2$y)8*z?VmtyYP_iR!!jdxRC}*g+nt92n^{
zVMLZ&b-FPKmA=M?R<ZcSyx)`Y7ZpNtIPqC(`1>R^3&bzXN7KXOe}xzT{@ZoIX|ql6
zPug$<qtH<~(`WIm40pcbpZmIZlz(v{3q(!s)`_myj;T3!^?&N9__w-^#s;A<Vtj7n
zS&gH1Fw=jC9aT`iU--<q)`XT_ua)?Np=ZC#oMlxa;zt=AL}sN4760El5t=dN$w}fh
z*d?`y4e(5SX^_h2WiYpW5+#uP48U)mM6O$Ie9L8^HKgnFsF)T=@!Eon?f0YwzdbUD
z3Svppu@hn)>wU22_}|kaAON*rJ)<i<Xv?gjhGo7>h+Eyo&?A~KJ~Jou=?DJGk}}w)
zBV1x~>BhkIA)}T0mfvRM+jG*fKWgqb+I?|P6FPRN#%8gCl}jSvrHb1_#p!EUz6Pi=
zy;clxRn;$F)oQzS<30D4PyQ>13TXgpN}Ji^3sx-19{V>DHx2_W+&!LW(!k4NJ6pIP
zIs+jZu|W9dm>Z_MeOO2T9H*Z>vatS%yh%HX?-g3!6bz-at|e*}OQWsWSu8$*4+pQV
zrni!S;$p}V_Do8xND||pz@`qG<l9t;<(tDn0&qNGi|%pBH=xmh%!PyI9r<e<7~U?m
zvk(5gf~XF?+scPmuKnC{$;&H_%Jv~#t(E_SHg!;BlOQHLDE^6sh9xZiE^rNbb^h*)
z>9la7vtIX^iZ-vU-lIwZm}Im)<#a|?Dbo2Z;i3^Zj7Y!C^M<KxAM(QgBkZfgqTIT-
zkq`+%0YO@kkX8^FX+?x_2<a~A2I(A70cip076zod)1XU+?v9}whM4&t^c>%F&i6a-
z`_Ei(@dwYd_u6aS`@ZkB<}`F>VIjn5pPNsG$Tcx_t@HZtl1+H$insHaJoj^l57;Sg
znx&~k?zaQX{TP9ibvp=?dNXE`;PYmli+?XBg}SH!QQclMC8?!Xpx-lT6wM-ZW3P#T
z{Z?_GUxE|PRoxb2*%(tma1ycX<n#WZ$048kAFy=KS81&+k%SgkzsH@3mJR(?p!JIS
zCKpk2DUDBbk;8$-xJ33F9MxXUO@LQdne)Tpj&}gQD7C{4u_%xF&k4N$fDz|~XGz2i
zaZ#HAmjY<%l@`pm>TKWocsoKLqsci>HW=4NcXz#5$|*Q*i|Nk7a_S^A=XMn8xt>Wb
z*~_dKGU@SthHAe#MJe#5{*HG#q)Bq})e|U1Yc^5X!v)dZhpB)Irb;dfTTV4nTq@F@
ztmP_rmYfAon38e#m8Wqd;|@L(@)aLFa6~1fX8|vV8b#x02URu|s(HefmG|4JXP?kl
zlJZpSsXBDMVLB_)ps3C2KjL(l7QENkU(OOR{@BmGIPCZOywtygFH;Wip$Jjl8J)(=
z_FkW8<@ne~J97%RPTuOSYu@^yJd$50gHlrCQ$7j7oF?mqFI~0A&p%n12dp!{6NbL}
zHKW-3!b8dW5aFGQ9CWtx7-&J|+kWo1Mbi4}XLr5BmPgOcM)k*Ep~LIgpQ><3jq;=h
z9)9DbQ-&&7Q=J?gq$Vt@XZh)sj8}bc&0b3(*u+11RQUD&^`!SD_xDPEgP~>xip#Xm
zASMgOOs$AS;OZ<f`5Gk+Q~2yL`Uq`BVsi~gBF;>;mRYWxe-j4Jt_L&vZci~&t``_p
z3IIB*{(ZbbZe3F?o#R`Gb9R7?y}8mnPTyEZ^IbAZ2LUzk8Q$4A2;H5jN{O(SY^=rv
zan;*i;5xZju5xMo1r-yz`f8Hm>LXkeb{niqJ)OgOryqF%WxeFJv3BJ6f!X4+0x57<
z?v}|j$y4BqAj+9mPWMJ{Y}Z%OxT;9Mclyu5xn83=BJt}=l_-6dUSpxosZc#^h0l{Q
z1ntqxb!jQ(bsWcXL*L`Q+I0qlme1_V?iP*CY`L0##&5<sB~;3K^_#!*&0u~B3{(h|
z|9EZkb%s$=KO5exgMc*M37{|=;xYRAfia?@8H@SxYLvuzCK8!|j7o;B=~N#s*@o`5
zoFDuudJ(kNZKMT|)3~z17Q~z6qb9amn_1bZPEM{9kzKjIiY}iXvt}Out>ib8U6y<v
z4ECi1ez~c7C`OF0q%`9D+C65wI%Md~x=&llUF3!EW+2z@O3v%V;m*Gn{yFBIc@n=l
zMXg(a$6#pt^}T{_E0yW>Zq>fNNnbgvbB8ICDY7oT)c~vD)(@ew?&#|*dfHA}qf1NA
z7x`W8U=_K)*$921Q?i#A47GAj1W{U7<9b-pw8Ml5rdQz(P@6yVx82Dl;6L%lb$WM3
z#pqC1uK-!DBtB9AlwF6c7t2T9KY-saNvhlDnBnmtoU$w9)I!SVYpmS5b8TY`#4-=~
z?(?|yye<56v&l}(UOaJ3@YpYWIo11l^rMeD_#sN$yCaCd)7*qaz9sG-BZ*mZH?Af3
zBUkFV%X7JKte%hxdU*?3hJ`eXoX-7t<uj4!onefjs+sh-Y&cqd*2p7?RrRmM^krIF
zGxcPAmJF6&PSdm+yFPpnIIZlXd^w(gko!=qs`T01$XCU(2}}=bOP(M1z52fNlg!Lu
zyFvGnV^yO%yR`|~TLU_GFDAv5(ay@PDBH1J3%zCdnX%SZ(lN$S3o5niym_0j*|~}R
zd%F5>vu0S@r7ZAd@@YV!G{eZA>xXV@`i%^JY0N`_iEgmGbIG6kM>gNHX)sqps5$l$
z8bbyWab+Q~HYIzlc8RFne0meOyhli*)mR9vcpxoBYrx+eh-9~EFiEYC72mOD*L|_^
z=P39BD#e6elfUmavzrWdsR5Ef%jcIlyjM?F%ey3^<<?vN5;#}z>$PWFnDEtlkA{dY
zVU)dGe(W)8xzkL2n~vqiG2$nX!ajSmUhL;iv{9?$T(lLma)DG4$kSma3ejvV8}U^b
zQI>bPC;UsJh4t+69!leQy0#g2biZQaYf1lceWJvYq$7xm=wxthJAs*4c>YbGrS)&K
z2nh2sqOf0j2rXK2eds<siFJ!?HhjXPHZzi;Dve2>$Qf9D!^eo`{RiwFch0d9g;6=c
zIKb@7M?K7f#Wm22)OO0H!pACho|Quz(wd%@$|_aOBtR=9)1QXF-+t6K?vm4_`VSP7
zgjsP2@Hb`xdCmf$BI^f9hni)q@3+5Rhcs|_!N*3;Z+4|9Y(d@*q6a=6zxs$Nr?`;c
z-QDvCZ}2mFHGjEAvq$1*Qy0yIk2XI=G2uf2?KW5_&Qz4&OUo(|Xo2%OD)=x{m8w&I
z-M%1r&{gUJbwy3BRv`W+j(fvYYAT`j*>QBMGPS<2g>uAmi;aUG-K*Xg6LE=?y`$T0
zCj~RquUtq2Y{9?P?Ac<#sJ&i{N4$lR8g~y*nfc0w*+=ZGdPS0I=Ss#MF*%az9x&#N
zOQes^9lRud!2FsL^*wmk1>BCC7Rjx;e<?k^$0>FBZgnkDC!nk8Lr4b(K1fSm8AkM$
zU(lM@xvgxbFs7*1$?4QML}6PSy>+r}Rzhc;EAe-Bfso609E7DB2o_rbZeR<9>Il(*
z@lsYR71}E&^Ys%wh3b`@LC4<`&ogduhtuCRpQT^c&UxN)q*fE<3H}?+^D<2gyA(p^
z?bLgy8FOElFUc%O=QN_Msmj~3JGt=j(V3w4yz$i$Ra$835aNt>GlpmcF*^?ro@+Bu
z6E1RcNZs9RNc#Tss?V|W<ic5t@_3z{Ie^qF35!54-l2h{s`hbFIgHt>h->6lHZhh>
z^3ow-FP-T@T`{Qh(#GtEZMXe4<(`t?N7NB`9Aq~mUXYwz0RFaoL}c^n9lXg6Mh9cP
zzGN__`p#qUtX_eUo=3^$<yHn0pbjUpd<x@rs6^4-^41`Q*zczi`|O^rOI5@%37hKv
zTJs&w?c(`fNK>5+sf7L+z#*PPInprF>Kr{2@LOdEBI>PL+HiPPOYPaY4$28o={1<J
zie;<@fWUnW^0j8>aw75JwyGZ6;t_Y~(IssP0*4yqd>P&L^hjV+$(j1Q<@*~6Xgd5!
z1QtqyTF2U#X~-pG46Dh^{Te4#PKm`Z)zl{+02l1rUn3XYEPP>N<>f~i@aaSCM1<u7
zX_J^!dY6D1Yajnx@~8aCD905pp`p(0F(|!Bb%f}VgMHk`(4{M_(zH$V51LJT&Ne>Z
zFs;@kH?4--e+syjcc4q%q_;}v>PM&0<eJ>w<0rpRmkVx#;rG;)cRME1jt*rA3H(W8
zuS%wUj^*d4ed5ddg2dGh#<)}Ez_bAOrI21WX<u{02a4>@bbMs<o*v}&tcpB~jS_%4
z^25IIPPo^kIk3o44MK>WN0}SGwNPFe0=`UvHp)BQa45epZv;!SR#6S{k2DD!dha)G
z_86+Z5Z5e^z_1P4MV#8=H4N;8P0^Vp-@d-amaWp}%7o;sve=8loRqES-%$e>WGaR3
zi1xn-Ie-f$L)<y=D{)=d%Uj_>F1;LLideUDIgFm3l3L?yeeB*j4bN+$NO)B>6W`{P
zC%X;J)J$q<+p=$ND5Kj2$YBjR9WvA6y<N|5byJ_!V?KklyR(kB%UW6*ZrR<M&X@YW
z=O9$bmm%^%&^tG!L^_nyj4-)m#xz@KWO3d{;Nrukb+gu*@1Hw<yA_k^+sQcYcUZVe
zq<zO1zsMS;wRc7~Cix{Fuv_nUXp3D+F8+<p8D&oaofSS$lVJsxwxm5doEQw>ASMJz
zx2$auUbH7fJCxSr>G?5nEO(7!n={hjuORKQZ|BP`Ui{l5$^}8MYI+_q@jgly64VZO
zi<L4}(&MB)OU6srGWI;i3%}go8JE<Q<hDcN1k5L!!g_bFMOjmuf2pF9fnV2r#z-pD
z>D7PuLm;DuMK&iMj@5Q!Mh)Usf@1V^&JFMJLX2^>4~0X$(p8*T_3y@<PSKr2&D_Sd
zz4Bb=6Pv6Uq2tpzb(<}vvqyHO16T%nb6nufKt4l-MPH}}l<)Zjs8}Y}AGD&<Bz2Bz
z?l{KPI~)mEb3G_B$eAwi6+4q{oN5cA>z^J?H^LZoX+0M%RJoErl<292A)3ogeDC^8
zE&`g<(dIwX{T=zV`$hh@u%4z}i?k2w+oCK2?O2G{a|q+%)2R4T{4f*Z+`)8aH11;o
z<eKNx*OG6_?e22+r3E{e4OxY)Onu%;@oLLI7@*Id7TLwx_YL7BSo)hi2iPBfaqA1$
zfe9IPYAc|WODwz&KT&b~)YVDBNN}jw&;^6`M~XxUY=C$_vd+F;BhjKf<Na+g0*L7c
zAFwZL)a3i!rBh<W8iqD#fktn+-*mF#<jIDTee(dvinE}8uz6{6$wrZSAYT>}-9~_H
z6TXN0<JGuU1)jEBD5r70+nYuvxt+SES!+u#afcr9b8+0C5khGlp7ogFXqW69+W%0S
z38;4I&Zr&2NR3CKJ(VW6<BNNo4;rXF{)iTT7=U)Jcf}@C0Gb-%RbM_Y4iVSZuS(^F
z^=wM^cVegs)eiO?CM|4+AXeHQx)lMH)VkBmS}+681w8xZJSvZp#-~ev^O@+I{k&J%
zVioh}au$9g?y(djQ9DG>ndUymfaZHVtw0=@f;_Smndsu)#(QT535drL-uSC<36KJx
z#N)&F1>I&zP5kL7<#+B8a0{pidKU@{4A-egkgN}u)#_YHO*qz>A@d2?(EDMcC`d2v
z<czj8-E!>V_X_CDxh0q}k5SB|HlaWsr*HAn$ZjxW^t)+jpXFf7=kk7oUKN{6tmmVk
zd-mL>q^3Wdc;0TAl0*A8l*Nm=tBEGMe)lgCN?Zz4*%TK<+?3D~2!opU40t)kXiK<5
z+oKP4Q9&EoF2YR@NQ7_YXKz+b12&g^(KeY<7@`NO5_YR^O_pdBab91y4wv5t9XFni
z0c_$s5|vkfcf65fRsdP9zU2-de{E0h6yU&1*P2z$-}B(y$<sK@YJrt~_Fs^&-#Csp
z-*(P;svXoq!NR-w(~tUFym`WCSCZ?n#LrbDVqaMpwuIN!mevl_6EB-uuM^&Z;Z#lN
zluXFlz{c`?HV8`*9$U7^GU#|0iMO7mriHUcnxP4ck+^zaJJGc|gA0$r&!Zp(*#|Am
zqPOqZ`-*-8f><p3J7vrK<Bqcuz_<N+3`UDyX-!?7D$xrF-hW?nfcn$fPss9f#nkFV
zq%k+a^_kB+y0y!n>@jVMoE;B1<1zy6Kh0@mABu!9cUI%BXH{KMR~lsGyN#=FeO(}L
zTQ;0mZ>+a9tZq*){k2zV!ii^+yN2ul#oug_K%iRtYR;UIyrZVDx@y||0xh%?ymLeI
z(awU#@mhmu!_2!SzRLFRLOXSZt*h#)|FpLPoE?FT;_g)}3S6>+rs*!9Pa`93SDrU$
zD~xm(F7BTgvwN=?VZ<aKN>uS4*(%pbs7-EB%1;z@$h-tDVbsnzX?*36;}vpE$%e=@
z%=D+7$|5OE`d$N4_2CAeU8VeYbuuLAPQmG0-5BRV-mKL<T{&(UuO}UZ>SC{tYk@hg
z2d}UF>6-FS67Tp>_sP)l$%-X-b^^laz_Lru89&kJ?9XVoMb@G11r-z21s@$NxCsOH
zW3y}6w`QuVK@uNr<+hp;$fU8HAo*nbH?abdZ<#bV+{@&y)NO2=79>!l(S0KSa~43K
z;Ouj4pmnV+CvOC5rjcJIdH?l45cW7G%h|$98txJ^IEtG?kXne-b1R`L4KLZ=Bh|)9
z=j!JjW9BGJy2^06VyJ4t?pl+Ui<+706k&RuC-|X-0<2GcR?kKIZGTCApdPOASBT)w
z-QV0CkeB^^#E-7Nl@G`}kS3m-qqjSw+4+WO;_-NT8ln<zC}CFB%%bL{#@Fh-{{RvW
zZw@pGiSEl2Ynn6J;A!4UXs9JX?9L%${c=aoYnieqN)K$X3SlzQx8wWQw!4Kwlc1)a
z$4nzgVAB1nDlPrNd*eLs7310Q^$kR=3|98|NrT^Zf8ZY^FVKAeG$maeJRM={sBSHf
zkP$NjI06A+^{trVr&BqB_}n1{S_eC1+|1Rydr4p~us-1JS~5ouTh0#IItd`R^WU!+
z+Zm2Dp6};fi?Tnx-Yk85)nN|hQpnXr@mJZ|zgFBCub$L2yrPB5^{SR0|2KytqC?q3
zEl(*7xo_b63KBX@7pHV>t&0SoB!gp_AU=*ABjWuq(@WW$=fh^%{*0*+B8v1vU5l*h
zZ*6n8i7>YZiv(_6bsg%A@(c`z;M%BLIJdn%+e^O7MHfdQ>j0p9s*|dO#Eo8cRqnz)
z$&yI}=Xg24L07$s;d>l6VxJu>306#`nV-CL70MOfc~4L%vZx1jMzj<O6xabeZ}g4<
z6GS}b^CzF*4WQq^K9C!9OKNAWw<4SKSir5sL46;wMMdOEPLg9~o_}mpSOPB5`XGG3
z5Fw$(P2pWgr41q8YSBFV9O{CN3=j~0sXwc7q}Y~dnXMznX~S9_+iUm1+VnrIYWE-1
zV_U4qUjs-LPX5uJ+;7<xL>H%_&XCs^dYX215it32$2#@{9I4&Clm76lq7^6|i;q~k
z2;KCh)(5{C#!H6DKcg9OYNYeYw@9vjTBYrEXOfaaE}W^9ckhUAXAS<i=!qmo;cMHY
zd-L`IFE0I=fg}4Rl(7M9QQj(vPd%X|J`?9`4I{AMl#l*>0>Ip(;O$h#t8bj@Jf+!j
z5<Jb&0pcBY>~LkVAZZRTg<y&uMv6ls#~L&))g*A5$oDf7j-BkW#HU&u1M}K4Pt7$K
zF!E<Cyg|!C@du=N<))kouPI(qW}S|V{{#E#0u;ga&kDL_M$|k`F-}IdW2cUb^lEql
zbleaN8>IAamUuq5Riee9QmI$cy;5+O=si!#r__5~Uv3L}X`dQR_o&8*jtvy|x%eHh
zAV(wFJ)?0hNiM3t0}=5n`sIa#lmzFHYO0J^$Q0v&gKMN}DGzqPS@~gw32+E^eKZ$0
z3Ccadt)EC+cTIu9I4hbDjFnVsUWi@X%G*-%xc<|DJ5cv$6DYtJyACWUcxzBFU#WKi
zve*h+hdNhmh>|P7p(LipVmGccIxMv_`arlNu}ykEU~`Q`iibuDcT%)uX8>Z3<SgD<
zc?_$kR)S{oWcoC3D_@v1kA-tF$E{K+zE!dD6Tc5o&!M$0cU@VVGmyh(h|zMM_N~hr
z{F_?Dxb#d7-~3egptZFnQH5syWA~PFbzAz7jfA!$<JPIZ>(>f|`Wnj*sxGNS{#E|i
z?ngK^_LzqA;o)Mk9jRwk3nq$1uO4#~HQ|ewu=3l7sVVyfeNZT|x_|r{*G)yjk2`20
zT8T}DIY#B~a9b%{ZNrwh<fAnJDniaYShqEI?r2(RpJv#{hE~~Sw(6-h$*Q!fTK8UG
z`e(229OnmGyF<GAmM)BZUh)(x$fIJjVZacGA>>uIUD_V(5YraT_UbIjkM|)@rAc_y
zi2Zw&ZX$}h7X<m{o}Cqx`dU4`ix)~qJY~H8UJc7El^Sh$_a_5nD==Tk+J88f{DahL
zeo21m3q4;We?sD#5z`%weyB$RwDmdg>Q2=FlJKDp(DYNp=tIvP+Z_!9V+>3);!V~*
zBe|W;xNvEi^}WKq1GHoANy_<xT;}h@BiX)lCp;={nJQnai{Bz(eSl$^`Pg;|=NRkW
zdHh<^V;rtHmEW5I^oePv4a;3)VkY^1{N3+WhBx4JSUm1~>gArNRg~gFd-Wv4`)LP8
zo|xRYUZL%%9(umfX*WW?QriXjYQR@`)s9xK(QGu(#Ic38l!?rd$0rx;Z{G6Z_{v>W
zcTkgjojOAdH}S<@&sVKjc{f@%<$8mz#=FYZU%k{J|B)U6b3Yk5Xhgmg#zsW5H-bwE
zI?(HiJ)OF$u2lx^x_&3ypH$YKLyUKG%Gt<@TS2nK>Ev1clV_&N+L3l0w<<d^_2OK3
zom)yxg$JsKItjm2nb9BJp2c#dUc7qqkIeHcMPPV1Sw<V3x3owvcZz)I?s><zEfFbh
zO0r8d$>^i!-ch~yBoaV-k@ixpY2@u(BFN%c;R8qqQM*TU<i;5Yu$VO4)*I2>seVEN
z^7_QM!_0{XDAk}vv$sa|Q4^3>0UlkF-7Y-v>~&2N*Mb;qw#bm{5*)+%>b47?n9rI_
z@Q@TPVeyMEj@1&+hkcdGCMH+BGkm<ZI;hd-C73I)VqsETUE1s-rx9+MaXw&>00vp!
z@0&^U4mpSpfKHf8qSI=H^D-rZsLFf?<}soE#I?k?^|qe@+s=Z+=B6N#XQ}-%8}>b3
zjSGd%)Q+_;*P&pO%5l{ZuWV<fNubRbMD29koPDCd(~{_%RM18EKuDW}LJ8bE!wlUc
z{*M9<Y|v~PPV1V&3ufgLfirJD%`1@Di|K8r=9a8p)wmN8KX#9?v+z+T^%iBPT+X3p
zNr&EGhojbQ778&MO>NOHw0YYuXRj{Szabhq_)`nzhnfE3#*%mfXiZl~^G-ZJgz)OR
zb*L9cc3T$_M|!0?UX2V%XV}5z#?uDr?ENTOg9E%pUo3c;o<(TZd<&3IsP$s81tZ(H
zQwwTZrCYbzVx(JhjH0?Lvto&XVG9Z2U`(h6yG9xsQ~DEgV71W-Psb!iF{TG$Zf--R
zN*Bd*d9GDmQjx~ze)HNH+UIL{9Xh*yf;8)b6M=HXR@bX!PN(w^q9_ZdK@w_h-0-iN
zltqMp_hde($0>!lCe<6%vnuQ!FZrLGz2@*}%^3YaSZv|2sp?%6Ls?+dJhWDu?Wpx$
z;OaY7N95=otO7=hJ(HMmJ*W>fMV*&ik`B~p8Nyj9?l2|F*=CzU)pY|h^hwaLhGk>F
zbgJ@xO2wC(!44+hsrlyj&l^&1K5r#qBN9^jTMq(oQw-pIxw!Cv<GQw=!1~SCk0_v?
z=LU?Jt-40+vYu$ZWmcx`rpmO3?Z?<-tFL!;G~dq%Acf9tXidrU&~~36ipR4(e2csq
zw{cjK`OB15uMBLAZmvBU-S)M}U6(T8WE8}710(pdD}c>u4`h+Xw5?O-344&rObOw{
zOUh(C6D9Hhga!XQZi{_~yuhdnv@T=3M<bwNv@-E#zi6fYHu3_Kc#`{){PWcbjck^W
z0WTkIBh4f}U{CMbbes?%Rl#XIc(Z#D#s%?0st%C8RSjcth60In0(UsC$%XQzcXR{E
z@z(V0Vb$I8w*Bq6F^!!}`AfT8&<rUjbq-IXe(V*68s7Z9<7yDEFI3R#J=U80^H%a9
zvNmW@tDofoE(LQzqbdDw@#V6kC@ouj^BgZ-z3qy6S&F2`>KHzURS@LOiFy+QYITc;
z9Q=7}?@rK0bI|}r@&@G5PC{T2k1<zoJGHK#BK?dlH#J4jF@0Zax||;aV>IbPD!y6j
z%-mvqg37TtXM?<xpHW(jH$t3~vO1b@G19xlD}0OY$C8%kapl^<R9oQYPP|dtkko;3
z7Mo2>97B%rUr3_MA3+wFw;H>R=F7Y$b>^gML58Cni5qgXYM}TrW8+yB{*<8!tL(5D
z7DTkOJcn#`9E8NgUL8O;07(Auv)ABU5!LkyJewHIIFRZ&Xa?3F*J4^|&IL@wS5!^e
zI}R?!c*?QY)WFMyh(o}*E><V`Qz&@N6hFf*R5lLv(%~6FiQdCI*AFy07Z>Z$K5OFm
zfQ_1Zf$72#i>>O`adNB%Sw1zTznH-6m~76A0Dot-*!~cr)Nj#YwS6Ml<q4R$>}oGZ
zU>w%qL^i+Rj9(WUr7N&smj%94{k&=pW-DDf5;57|2>oWVepNYVT*5=RtYB-nmTC7;
zdLFs8$JAn$L7}Tizist}y_<E7>AI)%U&@BBF1E#mt5$xUP_?cgoskl9k_U@ereRC@
zBLegz*TZKBiHgqZ#>`MtT?_Gj){NpA1C5GZ>gB$gL!X~AZzXXqNa&HFrJpiE8_(UI
zDzZT178RsXljwjK_~=NnsDSgoXvsg<wd4kfrEpe(Wd8aAO88?N>51fkggqtloEM3h
z7aR2|nOVP}>^LjrJ!CEX{wD`;n*@{Fdsu+6(Bzu-Gv};yNuZtCmir_2@@+tEFY<2=
z_PiGR<HIvx1f=dAybzU0TwT=`;`626)s)@MmdF~|RbDWLUJ7@-*bZAAdSwNINg8cb
z$q%^J)Oo6sy~;@=?ZroABgZsn8-Sf8uJ1gG114AW^~81$x7xDTnnrpEOE0hCPGs=K
zAEp<?Mv1Bc9n0LCZM8Z5(+k^#_U5-LV9Q8MtRfy@qKr2lsJBs#5}Mx^(|6gQ^#^Wy
zFpAAQe!;6xyXF?2{8+OifIhbS@l3SPY#DE1Wd@<$!nyof3Hga*b!X^7vt!cwV(^=O
zL8AJg!{5T)pH&P5@+JNptQ~!oIgW}f{@klhTkZ9|Q-Lh>_sy_Ppr!ckk?9}*)c=i-
z=hY5S5axE$T|8}|(Yd|%JW9k*iV85G_rze!r7+?p*=I5CxB&UaSdNFQW4IyHz!@3?
zIK{35npZevxp0$DiP2#q;B3|981Ps{%4qXl|Ev;ObD9#)vx~B>d3)x?%j8hd1!bee
zU35%^RxT0i@(i;?6yS*30b_ZDHv8*B7+<D5D;x>8`Pgku93vIlIOb?R`*XTTyV&}t
z1O|3n;a+D_ufQ%_v0*-)N|g$AeXWyz|7~t*vBU(sjz|}-U@YXht5-evFWSiET9A=^
z4#g@4{1-8?z!!tKKfUkVTxY8NS;`FHa<q%`3}ev^Tcqi&SkQZrL)NrjnM&AmZ!u0e
zlPt`CgybbCPt%^ZMxJ;rmp1Xd&VZP>)i+2k(VrsYg?^a((9e$;$p6Q2{Z{ekJu!Au
zkrU6TYXc+tte(;Cm<>FeQpf0vi@t+*?p9z_cz%`9_Y<Sfz8(u@tXZ8h$;k>bviUP(
z_@3k*;#784{t1bhEQ5=BH1^I6Fn_@@+DKbT2&~`v2s}%xNdi*&ncP?B_#$h@M`WYB
zt&a~^oq(NaD8<T00S(+`9~pN(#C9Kc0n?9Sx@};wL-r*^NX#V_(-)XySAxU&q_egK
zUTO`e9mf~x)~w#rL`Bdk=$P<g?Jzj6rmn&0%+x}i&y0~Q>Wf_zq3_Xjp#<BN+j?>3
zXWukxnb4H#e|5FdU|Tpwce(1zOx;(@oF}Yzm=96E>T0dcQ{3GK!x}Qr8f=*nT4{?f
zBx9`SK(+zb(?3J?=a0UI_v~WwoS2BJyX%KlZ_k<rqkX(^<dzC?h+m}!v1g7}Gp1kF
z4Gk>(PO;j~_Uphj-?;%Gdc$*u=vKcLBl8l13SdRQdV=+VPB!mNCc_%QUDFZH<ZGG&
z^h3Bd85P!}1Uo<&5b&Fa(l~1kMyQUah1Xzxc@pHN9eAFe9T-yMEx}Xu(bzfML>klM
zSlYyLfr4TvvSzc9&N%kz#@Kevop@@WiV85#NFEO!;JwrK(hF}LmR@!j8Mz-nLB|n3
z{O+)W(A0dz!{)GA+r4;Td$&gy<)s-@W2`9bp>l%X`mSq5y9VoiB`~UpZ0oD$?~@ON
zDot^7mGgN+!9xI4s*!=3*L=oCBmQ%r`<WLXlAc(esj5->1gsQD<YA?_OcZo`Mu!4g
z9{H6MlVu`y2nlNBH2gEwttJ6Q*pszC2ov8gjLl5fa98t&0L+|o2k14ykF?LqxC++a
zCy2y4Saj38VE$38$%D2Vqs%JBI$mbJ(k;W_eK+EnjsDQ&Sfmex!KFU=$z>jNOp5Hb
zIn>Fz-rPyF(@AGwnqlNj0J3>wWOF7?e+W!%n!N9p=0Tj~9o~(fE(=TTt1*cTwtaY2
z+iHZ}F7c}1flbwZ1~OaCD(^ul=IvmOYQd)Xze<W^K;sKZi?3P$c3bk_A_Q2Vd%iq+
zBuCuflimvmNrKFm#9DzqtMX)`6YmpPG<Eaum%R^k<c00#9m#wMfe^cQOp?57s#hWz
zO#lWs(8XfgNs=_{$wpyYL>rxYTT#+rQok|U($2GF^c9K)W7-Z!)*NkQY$!r0`XaHN
z9|03D9>|l#r86yHBrukFEOow?rgK$(!Xu{n)~K*?Wj|U?8W=Zrpr8_JROqrU+A(?5
z@BQ&m^vBASO@f5^MtwmcYZA@gG23BF`T$nlx}Ke7KF3sbi%ZL<AuZ*=-1x1Rlv|TK
z(A{kX&qVCW4^UN)A<Gp#5boy@%ZpTNt7D|sAIA(!ZB`%M#BW#h&h+`OigCLU^Bj%b
z@!E7-`gzfO;D~dAK2ARyCkkfg-DQjBU@Dp%Jvkv(x6QN8U%%<X?G2bWs}+OU`KgsB
zTS9m}7WgldT=UDDxAW0rVtNpwXtlX^9bCZq@p$5mRAUDiQ%{p4KCwSne<fBq#oOPA
zM&BvNh{F^Brh}2<3X$AnK#$V9X1H;rG&%l97^c=6qa+P!aF*VyBl>2j-hDhz&q^bk
zJHuGOjXOA|TdbE3Si46!9p@;tn1>7Iikiv{8ZDxAWMrcLRQ;)(IAeR{vyUxlHDBKN
zKuEN-*7TOL6bXh?8OK%Y&O1Y!QVUeA-$N=2#2gTxR4u`F5@tQ2@9zYIG4AF+Hi*Eu
z8{Z*%hW6ZATyH@xcgaM?cXTg0O<zBrcGx=R>MIgFc$k{R`$Z!N)p`|TU+<9(18uGA
zIz+)h9#7xGKy+&!(+!Jl%+aZ@*RD&!IJr2cR`N0m%lF*giYVp<lpW4xsd)Eg>723F
zue~<R*iDTbFWo<<Ua)&bhWc?s9ODZ1zkK-Kc`(u>IJKPFX6yQpmLjZ*YdRk7cbEd2
zNj>%vVs=Csct)i3WGa{!2xg?1+J|6ZUc2Md{DUM7!}3E@cxpk_M+#k)6;3kch7QGU
zBcM^9A$g1A?RoK}P5ZTvCUI2T9A41;u6qz9lGubCQQFQR+nHD{Vmqhj+5kIuh<2~+
zRGeNjW*x^njG1?}oDN!O+_IjsV(l%YH)IXN^(a!_86G)9joQlW*OoocaT24iHCvEz
zapXw$7^>+H?Dv)f$>_+#&@~n<@ZxgVBS`v<a#JC}8l#p8lNY?D1e1RDeF|YWFBiZ+
zSG=#u-XTqF#DC3WR~vW@;9Y3InCHS`k&V{Vw=S1xR{G}b%2~voEWN=tWyb_H{gPb!
zxDtYUWh%~!c%nC}cX27t+)A&81uvMwLZ`f?VH`a2-1czcUO%wBz%Z!hzI}t^Xj(Xy
z8a+H4xpRepW=cPToU2%xeUI7N`zym#<2b{G`K$DuH6LGOv+cD<?TXXmM!?&<v4`nd
zRFbtdY+CQe?3IPN#C$?yo3e%-Z&a#335RW4Lx4zVTJZ{sNUs=)cJ4sw1>>q&?^!D>
znF>h_7(>X2qKBzOqOA|+JYI))yW5HRsTqvVMyt4huOndY9-wglpBwitAIc=4p48hz
zwGZyLv=W@hRl_yl=to6|obk6_iQH73`Rqw|@7Hn>;#|QmzKYa3`F=>AQ(mr}B!Eyi
zrCbb{bCys|(;!E5^K!r!4LJ=Eo!{ZdZQkFtI{Q-RI3&^KtTFQ^mEDRo$CWM<R1o{&
zMb?;NewXPWlzFcrP3+OBdsBA3r2D~52x0*4=D^vq`*O;IXN<<_nE|-NNPajvT5Rc=
z7*}r4+eES6k$0Of&1F&K1)XeSJ+3ajZ$|ug1)*<*SPOnX=g+<U|31XQvoCr=TVZf;
zaCoStc21S&8F6sqwlfplW`x8Exo7YG>yQ8)7G%JjttvZ96Fl^r2Scw<s^mQ&VhuF0
zl#ae|xL4h=W+p|IdS+2ksr06sI{=fps>oJCJGJhs2LUg8R#-~bW;vpf8Bh;f-L9!p
zBS#3abGEFu#A*aBz^9QBOx+v&-xSxP0q_X}6_y_#4l+0`6@b>~+ZAbuhZ+vOGoZ+B
zLy^?|4J20&(0CdBP6I~du^yxu*$_XtF=AEAdA?GpXlZAQNI;junAnr0#4b<ezijTE
zmn5HiEarEpZ(bDTP0z6yDLP3weUdh|Uhmv&<UR1_tWl|3C^ZoL<K0PqzWYdVmsX}}
zY6%U>I*+4#j?2#2igkM4*D0oW_>cke`JhiJBo@C+wDLQ{#mWoFLsUAkW+rJeY!y8<
z%CL0!mgUw(lUI$Td#hFuN~iQ4?9o0szGDJM%ZXGGur{y;HjT`1&u(A|8XMpYUR)()
z7qEtHQ*v`eHYZFMQ9!_<*hkqA#Opsijio{jHAJ+f$h(A5YeI-^!oBBF6MhvV7b&hI
zZ62Zp%8C4pQnC@utM!gy-1OPF|C;E4Kdj2mDnmV07hpIeO<%%$*iw@VG*c_y&Gj+Q
z1iG}s3CyDe2Rc`H-~3$2W_^5Rr#{AkA-*^P+Q4J?ZmmpPSj_aygA3x|gA*V}qw3ok
zbc9DpOyIA{bMA#N3TR~7W|AZDZ2OBQObe@|p-JkQC2rm#-MtS;@85#!IHpV1u5dwi
zNkXraBL*pBiXY8Lkeho<2kmnOa-`p}tdLCjn0=dj^TN_1V*Xg`>749zW<tH+)9e8b
zc$B<GrbrzpOddk7V}*yFuj^JkIG$whNpeD21v{bzffHq~!~w5<NN#Qz_t+wXyA=YV
znT{7gkeRZ!28`=!jn9!mwi8kDR+bDEQ$JGZk3D<2rw-CH??#3xNsyqz+6-(HBgHgM
zd|~kElaA<cT~!oFYF%ewngSmYX5X}pzQg{<;_ACzn*;*cR73#S$&I_eqOSP|#q0|5
z2Rh94>90aVr0kQrjmy@Gy!Rw+%0)2o=6YX3QvzQAg*%(u7bm0WnS8Fi9@8K0XWjcz
zB_#*F2cOKWKwcqv`+L!1ZP$QkexME{W_k`px;_16#_>>((#RA*$1ru@i4s860f$d9
zUB)(CB>}N|55fVbuql3r8D9@>@*E|}TI(K|7`jcrhACtmf!ZbscKjC0sR)_wOht_v
za=_2TjPO$Hrb+N~_CBW0l&w^DGw3w5lb~=$wA@peh6L0z!}UxfL}S!H_Bj~y>KqQg
z;5FANYbU|K-DDG?xvz>s+_!~OsHtc2Cd1C_(}x|P_wSa$aK%b?W~RbEpCg!y2yxU(
z4v2NqG1#q$T_zGmvaUyzcX;KmCB*iCRNV`6KeH90l<#{6qe|8j5BW)lA0igZA9a#n
zMy`G}-VPdrsLD2EMe=9OPSf^DU0Gs2_;tD;OyZm%MJJn{q@Z7714aaMDH?BdIl|Ju
zjDS>y;~K#YlLoz#67x@aNANnezTj5>&H57>0YR&1|57fB%14*~-~-Nyn`rxN0MfZ4
z0Ev>C+-$PXU^)+eI{oxXtDeTGuB!@WUbLiUi~s_cofGa~4<>dGb_^@~DB9v&jW2U8
zxL)=eF*?VX9D(L2BjZgI+2n*HdKr>8IOec$$6omPB=T!!uGBM;9k~TR8q>%eNLrdT
z)uiSQJ}6@6fTOhxBIw#V;9<foUhZ7Uw?zdx#xELUfj)uM$YG}@Fb|pTXRz1{HP~6o
zB65#~iaxdO^bH4@E>qh{$3$lv;>o|`m@C*ss`|C5BqNyyZ?3qg@OK2v&`;S}mI@KD
z6L5O-TSEPUZ7fF(9Lx|90^8w2>C?IE$3`CCR#4)$fvwpF3<SwW4knjX9N322H!iE(
zyNW%@ZEfKE%-jI-Hr5n-kD22U2Snk6gRKc3j(W+Gcu^-(LMKfwW`{edSJ4W0ttQ+B
zsEV|o&q5+4smvktB6b<!-#)z^F1@(89_;XHM8{`o-+^dCOfX6ZDPt*gF27Ph#9P3M
z2Nl&tfv#`y2tZBT#6<o}CielR6eKOnhgj&urQOjblYIf>6A<}O@f_YXQ!O}6ds<aE
zx%F*Q{nkWOaB8q11ET1o$;3AehnVE}(ARK}GJ))cue%R-Gs^|>g)V~uc-PY>X|?o)
zUzmu*^<}1mh=mt{((Aib(ZNulW6CNzb%)?Ou&~+9g+{i+(O~&rcP`8D&eTi`V-?Y*
zO^qBS_0aR;s!j8l$+gVh%%ROG`8gFGXRAj`FU;!`X;Ud^kJ^^5{94nW3;WNHS?L?2
zRqtlClW9Zu;1vA2iKE+3pr#Yl86|SLTH5Q;{$aY~v=DE~3TAk~2vkE>?lk;>r9Z`P
zlFF~?Iep+SOw)W9@1>8Ev?nfVZbV0)RTLm2Ia=U0E$!^v6`c$^b?pq+dojg~>eHF~
z_qy2i-YNhatnQ0j!D<tnSy&F)TRKiyW^UsvdBjRZ;is7YZ93r-J!id}!BU%<H20?a
zMC$`Ol@^IPH+C`BKTy#BS!y3B;yh<@vR8L_re@+$R+4jotQpDu0<n!lQi-vt78MLt
z!SGkSJci^;MdA<ACDCbFJ2?{zxciiqU)4*SX4Vhf4di~V59kw)`Ww3sO(<d@55b|O
z(h~_B@NPAY%o&d&$y&QYNkRVb!AO?w%ran?q&eYA(~9-q1NNAcS2-lbJb=u3f_Z$#
z($Mo#Ehm}Rx^|82I?*e1%1P|lDdYLJ%wD%JU^`tK%Pmc4sc-=wZuK0h(Nx(E9tsB=
z{tAizd%yH|aKf%Pefb=g8P2#p_|EQ(eSYNZsQ6<1R4Ym+9SuI#9={n7hF?(raqe}&
zV6R;-apIP&1*y4$v-u;gXt`jy_7hmG+0WQ+xVBUFi6RX}Cpx8(nR)?d;7l@&;CBx`
zI2=^8@~DHVOlQKN))6R|SBOoekb6GO`qw7)Djw=Fl*1c(`yB<y@QV`;G7gp<XOwhx
zQRSqc_N#Qt#d8CxV`%vbJd^1D>uY?z)-9b~JFb4}c#3P@5UaY5s%FA#f?Dg+ZiZ!M
zS(#4Ws*E1bxZY>9se2mi#vZv@a*$PHDwV;DKtq;ch7xFn{%37GN}qlfzJnXHrV_*-
zz6d5EIN%L=oN#*X%6Bj<?qcbMJyq1BY(YpItEI1`Ceo0=Pi;x@;Evr1wr)_>nH;ys
zNV!U~0@7jdJ70;cwa0VE*P5}b#T8*z9^)ICw7WW%-aK`ChL2V`chB?K!vBCI{{rv`
z`FwM$U%ZC5>~=dkGJ5T15$E0&UVLM0yJN_c!$n}M310pDqn6xp339X0yk5-G9$wL-
zCzGIva6mQ9CkHx0%O9HWlR(I?Vo_bfNT0YX>krTYv-eH$#CCv6AAtA>i=$k>uaWuO
zl(pB87`RELG+%hnbgPA)K`RFaqDnPcLK}0qD)>5y)hvh6Mq+k~v6jcy^xV)O1=KtL
zoEbu#^!~B_jr{+$H`UidI-YBQW2Q5L`M9jO$Br~j{nXL(D4}WEFsP`9#5h%Dw5HT}
z1Y7jhphW5ZmKEWIOhSF}lcACWt=$br$5Gv+o{&~h&KG$*72Tf+&iCFkAIf6?RSF%>
zVC96}ps#s%PbGKNuWd-pqPp9xZN#w7YI}3Ux*^1;J1!3yavnEUAnS@A?MFkqoVKeA
z_dFi_0)zjy@@l>dgCCSx;HjrGU;tRZ9`INI&lykFE_lkWZd-cXwN^Au;I_ZL9hfOL
ze&ANc8H3_NR?Z+Y^-kJ`V%{~~NOcDP@t`-^#IForyLjcsiG2g)iAl^cbGxeH>t<h=
zj^Z)?0+vX5S3&5O92rWrb)OyTWQ?53&8RgRyqzPOyuwAW@<-wNXMX)vE`1}o>|(ND
z74Zl0>qX9oa<wKXt&0<NK~|yG5t`;M7mjP|G=A%o4XB$VRE7Hjkj5O^xswDxJxAHV
z1I{VBO)xBWeiIl`^E2H(c#3_$Sx`oA@wsh;X!&;r*`CBlqP7O+rS-Ju!z-H)8fdB{
zXRqCI6?A{N+5YQ5Mc)h25fDrrKPq2*w`y029tWn=2yk;l8x{MW31nMNYvu?h-jJ7p
zG+6z-r~lvi2g5gl`=xPHk3BN-O>YcYy}<-up=cDU&q=K%iotxV(&1CTF4fZ7E3L@l
zqD@%!WA5tH;jeu0xAb+i4qM7ut<b|4Qm=3{j~T<WtRn_<3cur5|H?++AhUME7=+w8
z-tOZj8efSw+{h7NTSRtn(*_T1DZriRKfu|N|JJR>HTjTCWvi&cq@}~$nhHPh)6289
zBbV0Zql&YFju_?f=*c=%FUKT1P|oR42WHpZA?IP$QY-=gt)4gE<gk1Gh_664leaXd
zbQAb`yifMJni*$9k>IKV&2_D(Xr&H<Ues4_+k&zjUheW*+ap5@f|Z#nQLAqOREaEE
zLnV&8vb<&eT<5xv6O%RWj+U5v0Rt#5Q=o4;&=fFy2g(g?8dTJEJsj5{XYEXM+>;4b
zj$LV{-vgOG-s#)L1H#SK_w1teTrY%OXO2cd<Xqr%r($`r^1wqh|6z(Fztf~5iY9|^
z(_g8HHMB#{#WY1vRF3$+T{wNjtC~S#f_bmuOYRM=?+S~Z3`r0r6@4a~qGU!)>37(p
zx8hJQP5d-C1<JF61}!JZx8DmTAJASXdwd;3SL}sZ$ulRf%R$yY6@N5vmlVEhMSYH~
zwj?R>6Pz&s7u4%}6!v|<t!F?$iX4HP$7zt;Dcl0OEPe+#FO<HBmanqTq`X^6u0Z3N
z_Vu&c=v|4SAFO&II27*(yE!mH?uBl(Go@c{tgS!Cqnb26+?SYR0g%qWgx=*JW@oYG
z_F0}+$O+1_<nl+5GsR{tFiwf0zf=$Tea)9`l0NcBg|740g@FhU;6SovnsX7Vb-Bo}
z2!@3hBBL$w+aRBC#-6bxM?Y4h4m1B56ai{?^505Q%Uo&tqByxLqNZ`bp6%JFg#HPh
zRX4xX5qG<XNkyj4ncKO)*;CDEu+r4S#@5Z1cUQK?5nF-V?-{k;?6f@irqZXn+4juv
zn_f>`W=#i^`Ja5(zrG9rl&yH!^6=>!A9-5~3Py`MTnrUXgP{WZYoA3A^c3k&NzFHE
z`%98Xv@CV&2V5lKELl*IDY}68(o!LYL{^CX3Bk#<^RMF*AmM|V|3o#_%@<Z}9?bzn
z-tAsYVpDlZYx5-m&{-`4F$X9O;Hpdk?xJ(gsPn|aTBCU7f`{H#sz~AI|N4wS|8Ocb
z`_bryY7`hs7DGlu57uk2uphk5<+UkmyT3q+AcJ~pNBIMo12`HA%wC4bH7j{WFCJ8u
z^Sgy)8-O$l`jxTTF9U|>B&jB5K1;8mUy-IP00Z-z{OsHb0vPzkE`0hpL08kg_$wvW
zz*LM#swf4dZvHTD%gayevi=H1s)2m({<qM;@V#>DcJg40>dQlup4S=o5CbC@Pb_$i
zF1BPHGD=F6jrR>}eki0D=%8F*kzVz5yii+MmT~VENvt@-XBh~+tSvvbFIg8M1iRi2
zRtb{~6D0tQ0J=2Dj~r3h&0u{<4kUQ}+t-9^j2s5(RJ?yUyowz}b}nG?!O1S}H$Em(
zJJ$ShSQ>x);+Q(I(S0{Q9qK5oQ}i09TlwFz4+ds{R+_TiVV%3736472166nNT{(d)
zG(0!5Q~p?Sazm)$K&=bSG?fCLId-wM1!et2J%srfIO<sITF!^u+2rN`F?ky`_afMF
zPB_@{z%lp{n=L%9{?KcEUsXF*1PBL^?@{?Z1x@b@($b8mHIDP+$H|MiI>-~B!|Ef?
z=Y^YFZ4mG3oIM??bFJm3h1|}CkpB~3|E9MHulvG-*i)0;oHXgdyznR;Yi;D1AUtf)
z8(Q(IqhOr2_e}eMwu#s3n6W@XZ7KZW&k}C;0a#&L#>p?<8~umm=9_|$Z8h_*#KFj_
zMq44H^t(RIb7UY7?>OYz1L5YdRk?yd*pb!Qby?m{`ZQ+Rrq}aV&0jrg>)Bsg_~S1A
z`+5HHw1l$08J_}sy*to>$nD*zq=PVHidMtO>D0sC)&QNnSd`U(TG_JQDKq*_wArt^
zeF{#iOfCFoe0qQ9Y@ZVz&&%m`{^LoKb5a@2cU&aEh5%E)9&eFV6=8H8Pe`8Gr87ez
zL%G16z_3&70c*U#e(5cP0=v-H8Ku7|%>Nq+0MM;c+=;xim;vGC?jXn|yhh2Jopt1q
z!#(X<Hi5|0DLZ<uBkFiy*2ib9V#;`d+`FaWz@{UzUuxyatV7c$BNgEN6C*Z%xx91P
z+2CH}1TzFXDx|z^B&2_a;d__krDoJwe#HR1)@tVMb1ZaoK?kpm633<-{j?*xB6^zw
z_?n=Z6o32d%sq|&wa*m3BP>N>o5zaKSncFN7K4s2V>4ivcbidLHRbnIH7pOUbDpBb
zPPQ%7ZhL_|*sO^s#ALWeT(fsZPQGCVn6IY&!sy^;6PC>DhQ4scxALyR2RYvvtoLIK
z&R+=&Vt!Mf{pi#tRT2M4l#*0M^6fE$>{g7UiYC%~;XGcpjJp_WAo}p}n2Kol&HtVN
z=2-2Ls3O;%ohF`wa^+!=k(Bl40oJzGPWz|TOi2g3%SDnsa_(H?38OL^zlw7suz*7X
z+1_I7Pi!N+p%IdT=DCuBx%~`$k2E{0NcWjTscj?Hayj97Sz2F}BOsjcVc`qZsP+xu
z&VI+Pw;Tb3;hxzYrKiH9%WGqvji(rO*<MH2Ddh02mYyO9*h1mq{{mY7ppq1^Ey@x`
zho2&SnkFYgR7Zxm+nJ@Q8eEn0IX0Bd<e9ju!(>j&WYK*B0S;Rs^m1C?9{#M5v(doy
zbSK*D)XOoiO4_#Z2SY>g7#v!a(~+p0n2j#%ioPM9R@xlb;#6|pxGOCPs1ue}#Htf1
zlPM@rwBY>IzVo!A_sDpr_mqO#3(^Z51hvlXKC7o1KLhJupSd}LVW;&V-%OcVRzB;*
zo!g85Ouk78&f8Bua<fuoB1MDKTy6DSd*7v;H=@w4Xe7?=ym5m>fD6AV+<9mkzy4RX
zn!@b%;cG{OPtHSQ_P*u~n5bupX;|Tj74NAEPqvY3eGSnlVLI1$jCob>h*ApI$^_(z
zkjrv`#Zv6Kz-Q5(iPnNfr;<TTGnF!m*^bgGfTOsdD;sUAwmmJpzZYsCoEK<W+PU&8
z>HJ%cxbvVCNA5*fpk>klVtqPwC_v3?p|Y%0!F+rB{O#DWnJs9Oo4oLS!b0&g1NDeN
zCVUT<1wsI*y#}y1)#CPPx)V^!6s9w!=qsgDHB2IOFB#zJdL%hw9bnn8s5#*+5t0^t
zi3fhvoGmM(5*WU|6rX#)qCY-|hdAbzea(Y^EX49!AwPacCrgGhJ`Tj8-ePOgQc=F8
z24&+gPSDM*I{TlgBuvJ)AFMvdL;xW*A^3#H#i0U12aU$C<#&Cxts_yflCNEXhpC?q
zFzFrk=$?X)^arU_gp+?_srNUpsbH0VA#?543KD8GEQEo$HXF7uAqeSOfikkT1>=au
zs&R(URDC=pac?-oX1h{SA06hk5Mf{DW8mUnzUL7-vN2R7|MW>*YCdb>p4<OOB{XB0
zv8_e2QrU2pqF^sC2H7xbWm+}<v2`sSzzNmlX>0<a^uAmBc>Gq_@hN|HG3+vPNF@Me
zE&xd_#$NhS>^p*&QxaVZh?Ldk0;=TZz3Uu`TGN@hybuHjP-7oNVhaL_gbRapxUDTJ
zb-B}aIIZgu+88aQh?MysDCfopIdGN{U;M#XJTz`d3hvjXOB#+pF+HBpdR9hGVbZF3
z+Sk|tp`Kdy%*vk@Z3A7h<BC5c!FJEll-0&v&xAD$;vv#kx2_8@FZc8)FC)7kc7xb=
zU#=OA@1_R%)Y`4#AsdbQm6*NbTu9hal)$_A-T9GIg8yAHZn!e}+S@oPFJ*?;x@3`(
zAi`rB_~_`t8QSW#4^&!?v<HTFgg6j=D+TWp3=wBrTSm63yN*`@0Ox!2-p>n2G%xTK
z*ooeT(URk9z#?h@ik}!j;#0%}>evU#+D(s`PteGp$T1MJr^8yn+g6NsK6<)BwB{hU
zN|rzhT#=jcYfJtMn6bNt8CyMuSbcU{cX}*!I48(blt?r3ESh_CdT@@cSMTB3@OM>b
zq}DNgN0`ib#71E4*Pr?9w)xNNgaDPEaRl?CY%MfdQqZYV^7OeyL>4IckwXm3wcVwD
zgyCWg5E6PR0lWhUHYU$v=Ho804GOaWHMYn~Kiu2vd32)IIg`y+&}-2ueZAinW*gk~
zKNxG^R`C-1cOIBY9r8fH*L#65ZP43o91CZ%4m^lpLX!yu?9&fsRg0`$I3)Q336hZF
zrwE()+AS)^>kEU*lB`9iRvaF?{O(JrS27F)j1=a8o7xmh6I&OPt>{%YGbC^RK@Fbc
zIA@B5s<lPW7$ymMY+3T=v<lQ_crNYV{;$;f&j($br?Q3I^uALoOt+mWmW<>QxEs~V
zVwjODeQTo9RcaEE3ClwPd_m<*$nZ0dS2;=sy7@Af#ZG?|@DVx1HP+@C1|O_Okt4nZ
zbHc4mS?Si2OC(RLiZn8nl3=%sx)Kk1aboPIc_0sae+4crDr|YkAglg`^(#*W=ZENv
zqVE#_Rq6C;#=uKTP1=(3vxM4g($}6!C-gt!%L`?n$!Fh;tUpK%tupoadQ`M>0Vx-;
zs>+;L=>6jfFX2pFQ>ohH;BwD+Np}e&R_JG!HgZH@C^-Td8dLn-CQtITSC$-MX3B~d
z1~~c$o;aEr*`=Yn7T}clr6LY!cy7Ll{iY85`>H2YLHU2(kTEdd-nEkvW-SQIl+X9p
z<L>}BiJ(7O<$n#5w~r<ZW3RY=U0!N4=b(QKy^+X^Sn0UObxxIZd7mMq26o@SN9{0?
zSP&3c`rW)8Lhb>fsDU{#Af#)L__B-WMivniX(!9RXUV#^KIKOJKd=kS%Yxiz5eH}X
zWT8Eo>nlwxDT}aH$5)!P025kxDZYbKHPF^gypgEKsB~w)s0lbxHfZD0RpNX1lC`?y
z^{~!3BIPJ%+V*q?SiAvYNZc5+t$}}@At!u-Tr=~LZG&DoFAxYP_!~cH1x$LPQ=y*{
zGu$lM{|B=PI86Xl4jF)t%bcDW(@U=#A|@-cnVi7e@&_5qXM-eQ`r2IxIEolT^L%Dk
z$Ph7M4kV?hEij$z#MhHcJy(g9%$%vR{O%=mxovfMZluq*x)~ar(wWHQLMx<M<g4EC
z+4dDn>%HJ?kr1Bcgx~sq?Y(DIliBwMswfsv83aLWI52`pm53+^Sf~;}L3#-&B=mp~
zS}5{U1O!o}8LEXMH9+WHX-bRIYY>o52)#q@2{@J+3;z#yt$Wv+`QU^%$@`wO_da_+
z``P=PglI)~KzuSF$oghprr~8!1$zPdHnk1?MWz^A*6B~|MftqzkI?a|#SbEvt}dc)
zENZFGbD8{4ls_~e_)){%pMse6H%hMUfBMw7_D!e8k_{RCuIyx^d~Jbb-y3U-D-Y>n
zR1S2=?*(=#;DqQaO=*%Rr<p2baQcfEu4zyDy(hm`h22m&RcK#}Sjo~@s_qw-?kIII
z*^#W>04w8tPWB(eB#8ygNhh0xPyo+G^6tjD?wFh>yD#+cf~FH)iu<avIM$}WJ1EbG
zr@r+<eF*u({@80Z+LO6Bgy!Yv<p+IiyZL#Z3$P>Wi(`WYAk8{HGx=#T3l}Kl%F1@T
z1<{OlB6@tVI?Ame%%#ckx`vYg3wC#Z>2K1a;{wstgy3-OX@A?;u;8Y0gZoL2<O8I)
z(SxB`36L39;|@sT3H~IfztCE0#<N~;&eEuB#$?3Urb#AfF;ta=+m0nTsmeUjMu%e+
zqD{tgayYVxpT;fDoKM+XsZuYk8rNfEt$J=9Tt9;XN<#3-l~D&0NxO;`o8rrBaH`xS
zSOC`qYMIkleP-6Af_n*>jsa_fXXP~>2&6Oi&7*y6-+2TJ#Pq^_W)|HtuH8|UP4g_7
zIMC|9nsh4cXr#nGw>ii*CV9Gj-K1bnnGI%@0%m^j>g4v=WiD%lQ!W+c=Vn$01E=Oo
zZPgF1y-V2nB3O1LJ^d7`$WlCtcc?1l6c4ZN!v8{b5(CEd$g;8K-3sRU1?Xaq6z^oW
zsTWHbBKxQ{6A!Bf$X3Rmj?c1a9~xVXGkh*#?$(99UsiV*kPcOn0Nv5^T->TA;gTB@
zLv~GTR@Rw$2|O$UaQABx{!&hZmwjv-C`}rbtm&XzE2!>VrDpW#+FPJ^9aH~fo%&o6
z<)Hd}|K&v=^yNn!2KV#Lagmf#9_mJjFAbzeVPY`(Y2UQ)9z??rr~6d|2MJWSbRx1Z
zCz50Ajh!8WB(&^24ztQBEi>!N49$-_wnWN?txTK99LfsQlkF&N#-jg}AOwxQ%M+YM
zkESMiE?<ZffP`2eJT2yR_e&WdV9(X;)z|O~{qm>I@<csW^BBMOG)U`Gb70uq0n^<3
z(IQ7YsrdR=pWjIo;jJ_Cs)$p?0J9g^1HjQp9DjVqj6R6gKCVLm9e3K0lu~ek%=)#V
zCc}<)i<6yi_b-JET(=33Y7dg>@qFR|oNFm)&b5=7=`-wRAQ!B>tITgv6qT@5H}ho;
z3;O-Kmi^;kP?J;G`H>q(v7r-}V=UGzC#%;?%&PvXmlq~_B@39U%oL37suUmPt4aSK
zFLxv5pH3$}CoZCua^?T5>|(bf724`@xSxb(KrHF((b1=Q9uB=xv{?waFJVy6lsi$W
z^@I#bck@d*bf9Cd@vtA>6fzvTsJ^iK`QW_;W7TDWi@+HE1gHxb!+}SR@{8q)R>4zi
z%J|7TWsy@^DgYd$WC*|0tdfCEFq!1x0@o*XMP1*s_?BuG*M%sR(zVFjAlN!hraS*t
zNA@sx<NN@!pI_jvJ0<iSRv8?zO+68h>Nu+4!A{9~L5&_33x(SDHU>orvZhgot5Tl`
z;F{BUy${^<z+}ECxL6Hurdb6#v=kE}l}C4Zio}<T79O~^*Ul{Wxlrbyi)@&ohqhbZ
zr$m5zeQe_^;_8v*xUQjddqB|nm`V#WXC>IB#B4_9#JaFEr<6W)+aNP|`jhp73EB#;
zrZca0M<zp{4mrrG(#R`EKmqZO1Fc~fn8d<mEhc@;&cgK89yd+C(tJsxk*6C$eeke8
zm@ToMmH%WtuN(Ne<L>QA98ssu^89>=^F)J=RkS_xz9$o$`_ht$9$EI9VF4kLkFw&~
zXD(nZybqXK_+`gK%%kQzdSR>`3k6nsLvGgnb2$)Nh(%k0cKp8FOv#?}Y{T4%MXb$b
zPnvgTep2}REC6Yxg{&J(`b%de#CLNWp1AcQK|m2G_BAtTy1Y}9IMuvdo=Bmrwoujk
zjm34z(^Y7_T;&yeT4XukK^uKaScC<#_w<il(>H5%kNfGAI_<5^XCyR*dhRzg7S@s?
zK9PCW{O@<AjF&2+&WE{}RcFCb6^^fi@w7S4Do>a#J*ZNYFNGXM64Y}t9bcbC4wB@w
zUbuwRueIYt%&n3QK2FBMjELSdJ+?(rfB4~j<8s@1C`dv7Jtbc`V`_O17}JeECZ69G
zr=ivuN65GERN*xfSo#MQ9}Gq_!&sMa6P$b9Hj?#USe<|&b%JV8D`_id<=%vSO6N8A
z)@40E$z6+Rts2vZ$;*o_Hn)+s;}+);ygd_%qe$y~-KXBw*?HIMRh#upcR?&X2+@<>
z8DhTtGd8_x{<cOdv9{)Hs)(MHo~OVMY~uv5W=oshCk3q_%~RcmKG}hf$mejf_yI0f
z{@I0sAkgM|r^Je}weM`s&X=h(`cqHP2B77#scq<Rg&tGvo`?8Z=9-%B_j@bnKVg1V
z)uLG3GZNKxXT>uj;o|HOp^Z}Yd)pK~)bZ3f^t&F&%olk75*1@m#KXTD;u<=>`xYHf
z=hMs+^}WWI&b*eZG<ukzSh@v(n<ZTk^ISPsYiaqOA{J|lijKIL$Dat#J);B@e{&xA
zL@2eLp}jGR@fd+frsi0f4lXpXpoDHz*DIRAF>=5R+fT`gjW<fHn4j(!=1@B~D_1@o
zow_O=xY%X=^bh!Ige9^ym<HSUQY={PaHeOGMB7YLPMf^?Xu3l|z`)3u@eK-Z44}v-
zT*qv8f6ji2JGmG|oj!rRELSx1@r{tM+FE0ovEKk-w11Px13>te`IUF#t@3(wXSt=0
zL)T}#`jQDcqSu~E^%@XCXN7}{Qms@o8sw^92DQ~q<ci6e8B%Iu=WMsaOGIo-S7-ot
z5|xxS(;!9HBd8M5{|8@*4E;s@66(2Vy4>Yb-Sz+rT$YZCYrg42lSB6E8cx@6Arbbv
zi|2mXoI~-GR-b6XV@ymeI969P=p<+3XXqfsqxL82yGB{9lNAqi5FgS>C#TK49Ydtg
zrRpBxuL%J^`Qmg_?Xc>~!EL*aPf$8f3#sbd_Xvsj%t#~7KUTWU2fCEwsTS++rc8Ax
zSq<U~j5Rf!-Zw2?XVh1{e{=h<d=^0H>>B!)&u}C@h>`Fi#OyXeYtPVj@6Y0fcL!@+
zQcFb2TMQ&c5<N2HBI0M%_X0G@K5XA7EOo0?jszY2)M2W6f`%fwA>BJCcA<rP0SR^S
znH%*JMHJj%`1Md-MHEl6^RV1xxVfH|WB9L0(sfghD>>0-)-`lvsir*<=fs(ds$2|>
z9@H$iMyMcOK$qY)!NlEzx^8{Nc|-#T`8=_#ZnJvuNPB?XojyMvMr~JFT+_^%HKRYB
zl-$izY^!j_^7XyyH}!YhcnZb;?o<2$F8k{IKqqL~m@$k&aj7vw;_V1z*uo62R3T|V
z9T{W*=Vsn<D#KG<{6IEmy-diON@w_0v)-L-+Vxh*#!x%eg*{!njto=rTylsebKaSk
z+2VeQxz&H|%#!sHOEb5*29Cc5`O&vr4tkmCTI^qMy8Go&E`B^t4VYLB?^UO%{SRqd
zo*syJkq2FRv$q>1${z1Hsn~r0@S7=`=$Cw~b!K`&MBDn1SanXjqr%Go1P8-;!bV2k
zOtK8$0A)q|+gNFoLB~hVoKyUVsAK}SfW3OxVZ!`ApwgpyeB4*$&9*Vwajoe9$Dk;~
zo6hv|bZP?zzp)_D|8+@U%`Rbt&7`f9F6X{Rs7qJke(vy*<e=bF&h=s;cK4>;Qdh<e
zu_59uD-jL|HeIph9oe;VZUqzaaw<$T*|s)W+(*dvdCl3;^|%l6b*`{Or(W-O?0EjC
zz&fFMzWW3c)M&2g=w}y8Xdjk4>MGM%kh@}FmcDy38hbBh=Sv$~OXc5Z)S!$8qY7Bb
zNjep@CRnJ<z;5y(U0#UqNQ;qyc%tOauE>_GXo$0G?VIJ7Oq?!8P4itTH}X;@tBcOl
z%xjY8+p?fb8Oy=lf10S;g>)R2e^h~&CO$9jvd_Dc>z4D^UZCbzM`uKDUjb_XbU3v3
zklvfBFZOfBGS|2?j0i+Fl?fI7mWia0@2eb-@89O+EL0sdf#wWpn&}$KgH0OUVa(DN
z8Vfg?Bk!{$h3Z`uedDN0o43zoN8(XS`{n~F_K(mvEV6XhULhNbg6~nKDrD+CCuf}C
z%xmZ2LFEvmYv4mpnG#$HsQScu1~zUx6ABzQ2)|!)?GFGOPCC<G*ap}@bN5&5L@)E<
z*HPG97pH#ph@pX#Q$dw1zoCDC1@F+(4+V@`S-sHs1BZexbL|BNKl?l}{{xB^8q2=8
z|0Q|$#R+mhb8>jsK=6ywSqV3L^k$Em!gDD1hfCR6hub5%NPk*P5qnP230W!+fm6D9
zb#g9@dD_*)f;4Bhhn9^}P0~Vt)AjkejG=oqG?|K%pfD4E<`gZpE&%m=UQud}Rn_H`
zU}~aXAYLeZ(N<El#k@FHC%QtRW+YCw&vtGWm+#p(g&X?)Z1zQ!Y?nBj@F1pT_zV)F
z?2$Z@$I8i{V;Xjt278%&YZ3>xK3I4~)wS>EZXDrz?_^jD!SJ5WL3a_7sz)Dp1P9ex
zBXS}wv`d@D!&e`hC&HIKZ{T9vxt}QNiL|vWytp=+5sluF+%R{*JYvYt&+N^js&$Y#
zhrl7KodBLUzkW5zLNTK)+nr!e6K@{cVCk5liMY_2Eqpcczra18L-8N<uY9gx01qI|
zS^eO;UO?m#a^A@k`{bFI_%tsys;Cz}wKH^dR};5UJNbTuuTiU2_torrId@+!if1UW
z);ZI<3yzql?Z{!*nQg9EZoH`uL-yaSZ}oVIyXIshe|Ski9n?p7T5d@v3+P4p+4DFa
zJZ`*vU}4s+d0zA?CnbWHyr#{u15Fn3=VqrXqEcu=yi23ZN~4ZZxVp3awW58;t{nLt
zrr1Koe)-v%(#A6=e(Q6*VM(DHPCj)_L-mKH4WttnJEk$oXLI6ig`FO~?!l7j{8ZCy
z73Xi^GUaSq>}lci=8dNT`jv%J{eF*n0kLNfO>GSFO>cF=nY*%`ubW6G_c0%h8F?>u
ztNdLZe2XZ1#GxiTq^>U8(Ws+AHxMt5KBv@!pTjO-K>;3g?8%S5$PWNWE(_gIS6xj~
z4{OAVj#PmQ&I9e5>K>CPSLEKX{<=i99{=yr9AQwpn#exP77>5FtHU_(GVOA6`=EWr
z9J)#2!}N7eI%=&p+;#0%ai%Py!AhD?SI0U7LUVMr@oEd&vPTUGF-~q4$^6S8E^T=S
z;FBJi-%o-BE7a1abCjtTiSLOBJd;<Lc7}WJv{Q_MNm-7_cqE#8PrbA+SRw2KiJznZ
z+<vv&9PfGv>o*;zmDi2It{e`ot);~#)F&9e5);G!hM)rT*dykyrA|!VGJ3<Uz|=f3
zb?+lb$CbQs&xiTKqPW8A>aA`Uc0-KMb2l!j2I8YLvDZc?PZU=r>6>|WL;$!t_(#W_
zxq5SutVC4drDVJ0+4}2Qx(C;y>xFUXm1f|tG*B;aHzOcw_bw-&t9g})<VdFU5w@S?
z@qX%C6(YGO?)`m#FrCt&>fY-Bm^tjS^*Hc=xN}6&e*TGQUR;b$UNI*=<QQ~L_%iD+
zvr~#r?vy;=1@)>hRJDHUcuG_i)pHqVYYOkEhiYk=?W~{va25q$rpb6?ZNiZ(6KBuw
zk^>e;%Q4TON1&&GCVpR5rldvP_-b_!LA$4g|8Svs;)xz5wpAzZirLC?A?uUx^WxE9
z{4W?F+~eoZi=IvKj%Mh6soEEFosZ|v)B9W7A5$E#tn=Or+kpcHCts{FuMPHih;$L|
zcQFSRYbGCRIXWG5Eq4r_B)06iToEuey$J_K>0$0m2fSCkwP}rce5tUs!Ek9vSNd#%
zfSXz%KkA_N>&E*FCf&iZbd^d*M3W)p7++(1K!&8Q!HT}N`XO;5k$g7ps9ce%Oin9Z
zOSg)SAgzbiXyRYYk@haPl!-C%Ek_wz?)3#cxfRf!8ZqKGYj;6td|cwRC`Z(vzW?Lh
zYy5JH*+s`(P>nj`O(k0IB2OosXI(5$XIi&nto7JC!CCe8Uo?|-zjtO6>$Hhcy=Eu}
zclvp5B4I#@Rqnafsq{DT3J_T8`Si;MYWD8U__N86V7B!TpX|jC&)qyb4mWX#d{R5r
z*q)Ruhv2b@EL4r?NtjP^I<wct{6MXm*_71l=1(W~%AuHOaPVV^1^Q+%_uvEO{T|xS
z?{Jgwev?_zw}uN#`I6~?BbeRltwQ0mi+0(bNKQGi8>H0pe7QDFMM^r_YwzSf91-TH
zT`h9<{bg_Q2FwFL?{-Z~<UJS9(WkASZdCR|6cy%F+h|Q39vO9!Zr@%?FtiJ2HtvTY
z`XMGBlb+1?F&R&;y+V}+H^K`>wB85J43eJ6Q)}bRwCh>Mr&pFzKi$m17s?+~)o#5%
zU@o2fUT|5aDOBL0SAb%e{mmPP<=t>1bsRAUcjg{YatAiLyFla@yQ%Fs<~i&u42Cat
zryJw@`=b)q%YL%A(i^Z6IOnIEnF;+VD|4-=8<q8G2`#TuT>>N$HAV!2?3zdq^U_nx
zo<|1rS}e%U`67XZN%6;Kxj3!*j7-$iTC3EvRLFa%jtr`u+Jffl>JibEfH`l6`B5d&
zuk=ruf^0O0N;F+Gu$Fe-wg)d<O%l6qlTsCO%=Y>1bDM(i7eBsbJQ(HcoxnD%6A_y?
zo}4%u3rfd5QHrLs&o6KXMe62DfgaZ3Gd1D;rF*pA`7F+vkYBWyPBv8MFUvz<M~z<x
zA9$cD^Mdlrfl1<*ttpQEn%pyPi`nGsuVfWT+OC7WR%&@<2xo;&x=(=1t+qA%bLL;C
zTI=t+U!?+t(Srg(fB{}q_G}P3k&+ewS*}E8ICH(uYOKn*{JOb@S5AhJ2Djpq!cN-}
zjeb{JpwE3o!sgb=k2Hf~Sb<VCwxck&K-0@^gkBfzv*0JTOA=&DDUR&Du)FRD5jQ2q
z;A6g8Etxvk)-WP&<_&>dlw9mg?c3k&konBs>L1@uspU~?D^~VXYsXmRs0ViA`^h{<
zxfso3rj&im6!KpPFwL11Qi!&EdISZtVo#>3%>QMdQxymG9caXN<RG23eqS|QUlAXz
zXaZ@nMtai{M8fF=YsPiOY9d1_DchUwhnGYSy{<rwa!WKvFT9v(0r`md!{29-l-nKz
zKbx$-x0L^(j5O$UJ@Fos$&swkk3VxN@&yCdUpg+XEY9e+y;`4Jw@M}`%-VvRXTeqw
z`mVcsPlh*52k-sh*_ZZNWE)HSXZn8zqY%!_(tMOovp`y?t<4#H$o%i>K<l9iARw7%
zcApApC>hW$(Asug<WJEypx@3;m6m-E5xrcS8iBt4_NQCT^mTGi0eAeGDxlX#!)apI
z+L|K@pv+$n9#91GU=gs7E)Q6MP`hROp{|bX)jHMslWvzw8<AIe#LjJbtF!-SbN}Y%
zpc+`R|K61upB^AK{e@jeVyHg^`P<&e&=0dP`aP$&j%}?ZcL{83rk@m=e)8MM@mYYt
zo%}&*>-i%=hrxX-Qw~2)7VKHeunv$CaSD|R?cu2pf6Ml;`lX&WsCe-U%46fyFZ38A
z{Pl=D2b7j)FNX2d^J1jhA8SUZT&VW|^%56D|B6razitQqGVOC?6dXB+y|WXr6z-LU
zQcE8}L!35zc-5vMTx#hJjnB;7TjfO?dj!+4!Zc41e$?VqIyKX<r?BR!C>JE$pWu=#
z<z#->NMevbNNTTRK#_ew%t?U`Jm=8mFAw>p3U^2WZf2(kChWMC6;-~WKfx(CreI=3
z!Cz`=;-=3`3N@3-4MMa6fsJ>C*T+@{zMDDeeY2yy2k3<46Kgp@N@w&?2isK&D)Vpo
zG`8ydhfZ<mE~)w0qT$1pJNuM3r-qWPG(c}#|7R^ib(d>mIEHEX!`|TDIvCjS$5bBt
zqBvt-9_ddw*TH6CmM+`ElO{`-lo?pZmu1$X@t|)SY?zT)xDVPvERY$v8{(r3pxw<!
zo2UL>fgE{N*T>e?kV~8lwww>wPV-0^w88i@WfR%ut;Io2aw8-9VhTU}vqXP47@WSG
z4eR!Omp^7{^OW!8_vQ(r#a=~3f{i<sSX`h}3=C}9t#y2`R!_(S{BCp$OyA${c5o50
zvRavtRn4|KTCDwGvU~Zcykn2*u&+vl)Y9lE*sRP2+7=#XGp_!<6ZY350cHOaApP4I
z%Ef2$OeQl2eLhWbNZIhZ6u$?}Jawe1uo+))Ka2ExRLP!TH~A=B>p|a@u6$9Ptc4?Y
z%3M(*Pq}qWVYeUe>l4Uw&F-Sl4b7Vo#h&0T@_+0LJ~Mv_WS$9zyCD~nvyPPPM29am
z%m%-}-N>0SMQ|;#&N0OcA*I~nCq}2^rJSrDN;x_{J>XRvZswoj#Z%8!w;%9h%YyQY
zA07Q;3W^a*g*^3%_|$tbj4hg%ds;>0@woHSo3vd$;CkGJ2lnmSlULG5U|a1A;VmZ)
z>KOZvOfjySLFBD<+VgvpnM~{nR}1+&V$+B46`Ik-t52lz__7Md3S9EFLqSVXPJ!Gl
z@cBaG(I$PaI;WI(8$I5e6bZG)-4nGge;nNI&Ri(vG}_M;4+Q(f1*T!Z<@Ojg%G<vh
z6GPR_s2SZ-mo6)gte>CaAhucfGx4XNjEbv*Cp-Y1&h}_<K7%+sef`u9JXMmeLU565
z9vPo_2B%K$_U7Kmgasz3R&)+K+#267(~^s{6VuMmJaWZ1a+tgPCHVQ*!h?Pr1E0w*
zv08O>wKN<?KJ>R2qe+z}*jF7s>Q03R#p=b&pqFNc{MN~Lpi8mkocsq^d&ei<n3vZs
zHtFxCRs);Sec0qr>$P*Q-pMh1F4f*_=r5jL##Z}?NBJdA_Ne34sC^y=w}2_Z<|~9X
z9^ka+wI1A>`)#n;06DKI$A6?o=h9JaJ3ir+eeX@8Ba_L?Y)0WFpE5R=87FyAks}pb
zrpx9}-NWDtq(rJTog#IQgq+fq$gN>>vGR{OS+<m$EgN!P{$!bHDq~c4VFNfD0)@?e
zryMT!{53cRVNUOzK`N<wbz?+BbneQRP{Ud)I^1%p8#|*xfC0i=a8|#-wIMy110M&`
zu$XgTgBtjf`*Ut^7{~MFu03S|s&r*ixf~$8CY2CDe@f{nuT+mJ*ky><0ehGn3u8!l
zK~OrR#X9LtPQ9(isXGaP20G40?FW6y;9QR7ylfOM=3Kt3Z_GLP#?5~l3=i^vO2Z)@
zlkvM3C3ZP;XUdZDEB1rm0npZ{;z!m$gr&(AGixvP+;=IEybB7K3c)WTKJt`LMs_io
zC>=8~kGxq>PYC9aKkZZi+95aZT1xh8nDV*p?#ew)`PeMr+ABXPEJbPa?9_8;;_<N^
z03B>oX-vZ^-Z3OP9W9!)K~6X@9?{zMqbV#fF}61xmjkn9ABo@OsuM+m&dmSAltFim
zeCYCVt3RleK%+%dCO8%WSuf`p;Og{9DwIKU-^z|zU=rUSo>4rG2+@8pE#0pm!k^B#
zItcdj%&>sNPr8GF_^+a2O1#SIga<f2Kc4c_n|Jru?g(krckXn*`^Un>exD3sWH~hf
z{Cv@=+6yQfxc^dK7oYqG&=yiYxowoEKGTQi`9*G}(|V_BmRk4w%tV31Iu<;|jr|pE
zLFBs1G|bKl?4euF#9ur_gAyR-L3_|lL$ES-rkR>4eh{i%&azn@ZR5VyBo+WwZz&<%
zPXv`|_gyP&IN8BE+>y7O%XZ@tiM;jHpmn#F{qN%<tdlEt_}Gr@k=LM__|pfRoKx-?
zA{l(`ab`O>Ne3LlF*jGm79r8W)>T}md??5-?52Ok!!+5$g)s-CE|jx%eLe!BZGGz$
zCBdbo9NsXEfB20!!Ja8_CcsQFdn9Nsr!1XqU?nZPKd}DnmaZs)GYGDbL|NW)-rSMo
z-q_#1>qvBo6Y~69y7``4P3eplfjsq(NEx!M+UEY!g9;R1GhSG&%&2!1?JKq9aMRy4
zr(HRBBEh!ubh@mBwwXVHK|8v>-UE-!0{tjbj@*#}ZxWxJ)c!qI1>-xX1Hc|C=Tj~}
zRq4fNpd)WL67-%i4cj98744X2wYh7{W-r41?LPS{+5}IX*t<-u(Al*yFLsSP#qKuS
z-4+W(C@!PK3mo>d=(Tvt@pH=I$_xB2OeJ-LA1g@(VpvRF9oaYX+Kg^Q#TneiZ<o7>
zW;DzCZvTFHh;`b`hl{&ybT>8dA*6c<_HuJ~%Yf5n0@L5Qi5C;_wUTay1xGgA+-APJ
z2b|%mu`Rwj^LM4^uTffG5@dTqFN*c__WjnoeSxDpz%ip62C*p|M!`k`FF-zf=mU(O
z|2KUl-vX&^l%0$hM{ZkeO3>@WJNN<M@`pp%kdV)&zlmA7yMu$u%#NdS&u(7SzQ&+5
zDLvCtUA?~BWK(h|xd<9!g&ybW9NxsBK7T|ygF^2vjrR9%E=E(nrhl9wYgyft^*v@k
zgvQ@WdEK(uwSY?LS7)bgZ^qCzrYk_%TeUZuuVs^{kK$wKd4Wp3)|?#A{~vS;PJK<?
z0XCPjZ*DGN1||RBSw0`hNRnO&4_CFQd+|2OT&`{Vyu5l2-reWzZ*AZAqM|kD^hm<!
zAIOiU#=hRUPu~t55*|%~aGVZ#xI>p!_}zfuLk<q<z0Z8+qdPjTTJDl&yl1#?d(f_t
zJDqG-iH@|@bc(gl^F3`$m%`Uv$b*y4v#L$%s>LIfB3%VG`?gYT-}T4wi+8qcybzGv
zO05ylC?x!k>t8<b{R;>$SQfEJtPlJ5*T1{x*SGYaP|>7v#G6@u|In`=_<I-|SKwN}
z5}@6B%rf6!|8~INe7EE;YDRqp<(AH$JpCUxsmTELj43MQ^ZPvaZ*KL2EADbMCrd)3
zQ*w8IBd(u+<*5k}B*X=+dt`Hx3}6HcUryJC;j1owe)Ho34M33arMbLwo0B957tqs_
zyaA41-1F;O`msQewKGJN_2wiARR((c3stM|4+;NRP$f`WAjsj4(cYiv@gKr}8V&R$
zHxe55kLy1clskqA2(lr&rJVl9ANYql{xkTWY2iPE|GFanVc7o}{LkFZe;)qFP7nC5
z|2+K99Qc18{%3LaKS=Y_ApU=bH2a<oC)xl+1C1%^Uxk**gb32B>teheTCFi&m32mJ
zexpX0HvwbEAOP4c`x4xD|6#m6;jLtNE-%C6fUpr;k!m(Pac*>a076eI3Zd^-jVp|x
z|Cyis8c<52w>wk!L>t!1-zj-Otw*_AG2t<nhq^@S35LnuMQB;E1eZ#zOUGr7UQC=}
zRnD=0AebLSsb}0^jJk570NwMt>w?K5i@8+PqC}#+b0$<UUfo?7@0^bT7%z3=+dyKs
zjU`7fyF5lPWfL&_DFrmk(S}8VQ<tcUmn)!)Wr=%HJ`1%(4EQ~j4101Ac+uyCNnfi)
z)&v@fZD#%y)UMUKvVE5?<3brVCf#X#65~Z0H>z5&LS8XsQ|t=mGQ*2WFj#9xY;KH!
zlrg>w4cX#uM9sHi8*4uIiWie!phLRO39nMiZsP5oa*g#J-2_17I0108a{9oDi~7+e
z3jW?X{R7apF;`Ecs;S$B>`s_DHP{HEy-)gQ80s5U^-*xC<@IU$<Ecp{dsm@ooI)WK
z?cDhG6;)DfiCLp-#U>JqzWwL)br>+7z>{(xSWq);CldTM9Yy~VB+fl_jTIA*_jj9r
zTN4cz=Kto&KXfjw!c=ba@ij%~=gu5$;?!4Y=(?S<9WMECLNC_kw6F}kpSa{2Te$ih
zjIndq?{0SdqJAfp_*S#d7X$1U^)=lG0uslyv%L<FFikeO!rQuvaj(_c8!SDgV)J8r
zvqjBh*nVKNUzY@!otZ)jr+#v#Xs`3GBzE^(BWipL+${)-)JK<q)aizd{?Ags>E7p8
z7g#Hi5y(|tJu5!r4{aNH^%6o~m=|QdDLdT`?DT4L!S-D|+!K9qh$?tg-bseZHegD0
z^Og^xW%+frftgAUw_|xWWm=Tfx&F-nC&jdEfYYE8IjeFOJ)i>I(mVugI{~gmIooU0
z9#ORv5*5i+?es%Ee&-+E<!H@c6?JjA9G?bxq<&zDHnSl8v;?r*LQDGP7T%lFU}d@G
zono-AuEhv;RO4$T3&xY6EG?Vxdr(R;9_>-9L~&E%8G+soA6PJ6i9j3Uo)$HdD=i^#
zHoaVShP3C3Xf*p~#=0A%fq_YmyZ^|kqyo!`jc*vi<~ITyG0BK+UjoCSO;}wNxJNr;
zW*_uB<N4XcOEPH3go(@_S2nTIDECOk0+kcF9V<`#&Ro}@s6RsSWUIWkTYoly?*Y&l
z#Y-<=C0K86I8U9x$5q^Db-%yMPx0pRRiLn<C-yqO_ra7q#_D~%6JJcl{B^4Odv~p@
zDs%KgKmbYMhS1N8@-d|4hIl)S>2UOZ6iFMg(>7!y6bE|e*m5ykZ}N7S0oQj93exXv
z_AayxnTOD`>Q+hWpo}<r4JwVQW{<Q~kIfp4B#unw&8<O+aul77YJL=x$4srbbb{|j
z=hloE!HMTPW|sq-d^+w@hSeOf?su)Y7$)uIF<#muk{2>Uychg)p=A|%9BN&8u4|~-
zwdaX_NryW3tt%~MyQMF*Ou4Iy)JHyqnX;c}4Xu{!EDm8wYOQW=n#Ly1)9D^X@aH0s
z*;l`@d;KP=5K`5RCWb*lRa7OCrn;DL{nZUsl3q>HN~&A_)A)owVMMS*T}7U1QzbKm
zTcVDr8k^wSQv+2c!m|4x%@-^}Rb5`zRXu8HLM5;Gny=4qdpi{~A3R}155<iyS0}yA
z$YvE1OvG#Ar?1<0o}*a!7_B=U5BP*@m|%ad*vR~x0xB^<q6?$B$YMXcGCHyfeqv5D
zH<g!SB>G~s1&Grss+G+ai-U{L!3TxMYLawPMh9W)B-T8Ux~iC#%HT9{z6RC(UmU~1
zM<oxzNRQ>6Z&)iQc@7H3+!=1uh7ig69gJsr)sHuI2gN6_3ma8U?n}{D6`B^NzUGOn
z$;e(Fx(Ro@s~(QwS{8PDw&u=@NpPBUj!jrwmBY;0DRqZxuAS3`wM-!lY1b|6JyT9D
zkl*ew2yu{LN}CE4IqFDQ!<@cMQS`Fn$4B<?1k>WYph3H7R>P=VAr;kWFoP@mY6FS;
zUTLek+&m3g68WM}-{8#q6a(0Gqp6!8-tMM^TvK#W?n)Sm*M_JzT__HltsQL>g;e|4
zeUXx$or~r3EoLqRd-X3=RB8wn`Ib!VihI_g$M_>VxU|v|Mi-NCi0`q}LU7H{s@}OM
zu1d2tE9ciYV+*?v<O`}cIoGS%xZ0r1klns34JpuN2CQNP{zj7pDm1xzG||qmYEh_C
zTdO9Jcpt5++cIIzyUy9UH|<*K#9-!ZQXC1%8C%TMqHvLeiHj!byp%~muRdFJK`qRG
z+<7c4KEcf1s9mPmTs<~npU63t$a$sE>Wplpbe_{^TR!8ex9$^4x^}O`C1a*^2-mVd
zP}jp+fu=|u;atl?QTNV|Xy{6i^PI@<o2;dCQ|2c%NLx~()+I)92_>f6L){D-Ry<n<
zxOb_GR3a_$fw1jiu;cdXnEDi~pz5qHio1hc<{WD-oG|Yx3Nb1kaOQ6dG9k_SAmCMV
zwT`;E+zo^x78CIojtg{MwVXCW9YR=Ln7WM)cEjBXQIT|O0tb6vxn0k%r%KX9*EvOE
z2jbpkQQj6XanBwD@7hg!H?V5f@VL!Hh2jHO!~U0qRKwJN(K4glM(7pL(q9NW9w`vQ
zQs{ykNEU_kI}%pJ3Kn-(*V@&ZspE@N2okoa@Cy8BB=uZ=-%dRFtAQW4vZqotWE*$U
z=Xjx>?Uh@J7`Mddr=YF)RVv0E!j4&^BvdNMa4rBKyQ&r*QDW&!Cww@5PhGa9QKEf#
z1Xr~JiGkTLH0HK$cUN`bWDns!2)=njD$S8-SGd(0s>-3IMb$5910M%7=cGh=JM@@f
z(pTNE+GV%#<>{5WXxv=*?1kdd)lA8Cb65LF1~(p@NQy(SxjOKvRLIPZFL)}`+dH#0
zf1IC|7X<N`3yoU+yu!JjY=n3HjFQ)$@T#R%Hk|Wv2tZ)E)Ap-9Lb_s~HGTgmZ?~%c
z35i7euG0M!!&-O$?x>UR84q<$HaXGf1I*QJXQ;MsYSIT{v>bbJ1VA_z@^TDR)6(ft
zX!RNSmoPdXXxSAd0R8!LPpSlm5j>*m&dAe0YJWljM#oS$W6hfw{d5WVz{wPB;t|b9
z>qf=HKk>o><7GB-*5Ozri2hA;HB-xv+HW>T0y}>>o?%THOe-@L*;Xp{$fCprc9mf4
zBWT$oaM~VuQaYV@$s_I6dtBYt=1BcAGlCyXzgVx<vmNWxC^@!05(r&wrAY=tA2o`o
za%jjuAxK|5KL>?~IHVx!B%P5!rD1yCn*#{ZZRIDw$|8+M=L3F^EB#6Y6tk4}2z*OD
zLeRBq7L)F5)y1|iNu?rW^k~{?m8_-uaj0eymmqz1XE;~iyeN-KaLDMxrERkzOPF+f
zb)fNAe`|bU)dO1QI0$D89URvtpJA}R7|}n-Lb$oU$V=_p1u2gQo*XF!zjki59D4pP
z$bq~TA%gTWW58D0-(kEUBTQ-iuQ6T>K=A)d5^T^#e^Kgqg0c6yj5`ZU<n=;X7!@t*
zBs=x8y=0DnFE4RnyrM3!Gfdtoh0uq<LcLqDmh}8+qga=ulqfgdLbsuja~9Xx?U~Mf
ztfO7}Y!Q_4_^N+wcjf4fS9&M8V%AlqzOBdv5$jTO*bY&(5c^Wl$T6m{hbk!y{7Dd=
zff(fuTZ|Zm#`MEv4(JONzdsAHMd{W~7Obx`_k0Fp@p$VFJfbSNcy<7~_?K@w995|4
zLRs?TgC))DOP<AjfOX{z*&yqD6)o(s_rjYd%+&JyZj?{XTpm=D+$Im9=ZpZ?8v#ZL
zLu260UMZ2ECm?n0Z!5F9%+D}6K5w+cqn@r-@=SNmgd4urHxg|9eCpjQ5@mvHnVrVs
z=IM^E&fm*=<u6LRusuoCZ0Kv<<Y2(X>4<415s*DE%0rB%TGm>iW%ILwMg_w{<R{sh
zMf2_$xJf_O(;T?)Vr#Ql9y-=IO&ikNq=KXSF?P$+z*~4e5{@4|CDXWp4=mh~(j2{u
zaYk&Gd8ZjBKP3Z&MBp>#wTW*hY_M%^?NPYW7T~ZtHP=1k`<N2uWy~CX*qg)vFw=a#
z+C4OhBTUw_5(u_6D+GK(E(C}q6DU!fy&*C?aj<*BIuyb(zwDXf*AQN;zP_qc_q%8F
z7*=uW33;g3a;i#L^FbC2t6PQ$+qP^u^-fOm(hV>Q1r#@=*h-tDSBaT2J>X3aixZ+<
z-Nr}GJ$%_B2I?&BIG^k5_gjZ`=o40&5)6|=0g$^A6MBJMDy!K?F<v30w-le(a~NVY
zGXVYpc0tKHnmsLR0{ctRnDovWOTU_`j#-ejE?+@o1PM*3Aoh{<^w~YD1Y5wtFe%Jz
zxC5q;0Z^I@!l=E2M0}ehju6Ao&vCwSUAGNI_q!ln_LZ@$k}R+RGH8+nnPon69q^88
zCjedWQXX^Zxf&3C{@+26@zUR+;bVd;_`?7A!BWOq(y#L(7~nuzbwK(u7!EBv&;n<R
z%oJjn%<RH=4MzBTpR@l{#joS5ov_7ca{};;Rz5BLB~j21qGHvqlDsgm6LQzhpaU;y
z{z|(HX@Qq4utUQa-#t3dqms33+Eq0LT8#=hRI<d!N`MO$&mAd3areQ9>$xaaI<{+{
z57b0gSN;5p4PNFX6*I5V4tyW%UEn0=+BJ?|GOrOEA@i|Gv8#)H?AScUN6;w0SgR^<
z7PkZf2LHUqy5Y}7l%)BaWXFl~LQbZR8?}#m%0$ll)=U_eomhpkkVY0;nkeR>vVC^L
zRT_gF8(VZeJH0C^Jiyz566(VArsoGeYy{~+3_D}SPH?u1<+1-JzMuhjpmeulBVw$(
zOASm(6u%*x>q;Dio~)Jag>K>P9qX=@ykIYmOFwUucnk12tGvPpHp_Sh>uV9|t~W?|
zU6^#c#^M#RSW9~0M&4dZQtnzkn=*=M*WSyD9UV1|@?R&tH1JfE44^wf+uvb17vnXM
znUuRsfQ5&78|$$afwf5&RTb!hpkPG3@#J;cjfuJsFz|$8|H{j?q~D1IxVyj)h@!(S
zIf|<fL<<o#`Td9PZ4aV1ByCKqV{H`c%c`6Gno}PvZ|56WuB0XZ+?tOJDIo0)(e*bW
zHMQAfy(|VHR6K^)^e<{>s^F+?-z<ke-!tlok)$xn4o%L3E3hI7+|rxGfg6Ac42_vu
zdJbadu_iO|^u_5AqbjvWPR_AB!i9iD0Kt-5Oc7V<DgMMHn9WZ_gyBP?HVbJfugZTz
zJr^aXn{q%t&yd#MVo*v!xQ>2pPgw}3HligFZGMm9?}5_oTau5Eduf3)&efY<o}RD+
zAv*8_@aqx4uMd&7N-1yn#SH3k+-a2xWZrqm!p<A0s)gHy9qcOYTT$HA3Rv8rxs2N&
zY02ek-w;anS^c*vPAicZ2dLZ~AZ<~Co_bRncp-0Zq;{d}?Or4W2By|xZigd~wZ5H1
zQEpckbnIA5_~4+OQ4y}qUc@aO(m+^yqKhG!H;;5G8zqdSKE$Mx<EJsr1GB0qM_nRB
zU>)9XpVRy3hZQ`D&j(2kk3@Hp7$V;_D!m|;mogZTx2&|!K^?`r6D?I|=Y25!My@x-
zRpyHO7=+uGzknbUpF@AH`zfi9bawEn>6}IM+@cRAz3UZW{1nJ~j)ZYdUqG0}7OH<i
z09nx>0+dBC&wP_Z=b+u=ENcT{A46f=vq^7nG&%ZUnu_&cM(xCTJ9v8~x@!(NGIIV?
z6J{@=!7f6dpMOUJRJChT?s&gL*r=K+xqw?`t|$$NEg%~};fIyoeBghjg()d!3pv3X
z$XtxRwF6*YN&xH*h~ikLgvpYe?V;Uqblp$6ilfewx6bwIV&*;&G9W6d%T38+_VBLu
z4q^5$QU8#{XB@3W1tS1pFIuGe{6YxHrW>S@#-~+W23Z%*+BjQ-B*E2|Pi4+*O>woT
z9Ew9Z5|{q=27k?t^E}%!T6gH1urIyq4Ec7}V!1^pHcF=^gO1?*rjb3%u)X*qfjk6N
z70U@R!6ID^EW~pYUGC-k6&N96S9cDrf@yabpI1@cc{JRFmDG=bkBbaO2Im!}h(+8z
zy%CYL+zWY2O#L8cl=O9=&WXZ`xMv=ps1${$qDXm}ZZHd1q$LgRz8e;~)}suPJQ-b;
zmcOC<^^^^Q^CC?F8l<rH&M5t5N7qlWP}PW>6Og6cVx3E<mtBA7=W&zMPU^sA+s~%1
z<LPx+qyLD?Ed$c9=7C(G%R=X`wD@4ej^!OQ7q=c+oW>-W7AYGvP7aKysYbO0v51jg
z34sW4qiMp;_?Wofbkp5h^<!sQQ5Q@JJpk95Ei8Q2r0I4i6gO$7U898%q%(~OELRM#
zPKM~yQD*%#yri2Kq^<H@?c|bIi-mLUl^|E=90W5~i3o~P|1!NG`Hm-)HQoHOL<%I4
zvrhvA`{ySH&Wo?KJL5__?BQzsU&)8rdR$@U{eh|Rg{`y2>r>kZOVGO|!fe1b0?dqU
z@jh1sxE6hWNUJ`SC=WOo5DChz7edbyF8SvUCXbHh4I<2UtgFTfPraUPyW-tAYSvd!
zAA_+gtjL&)AuT?{``q&omTv1WIhQX<@##9=gkOK=80lsS5+nHtui8Rr*(w?{sq6S<
zaZPr)4~NTbZMG!{Bfccm>$m@A%<BordN`r1fJ1+@BKL;hRr$g`{Q5=74N6l@RmVW|
zH|zHoU(uiieJ{;#bgkB@o?i}S9Z*AKKq(-;z(O%}{m<{l`hzRLngJ=x+etn6`&(1)
z_22M+W(faD|Cf^a&v?Gz@SnZ-Z|VL_GXELRuO#!I=lTDk=dmi$I=A}=L07<qDvF3;
ztAo7_oOb)*%+U+V)4te@`CvfD6$8!+>p3GxFL4&+hh~NL3V_;3iGh|?)`5D@HO^l5
z!w~x7Trmb~Nq*c?;!>eUjXnYJVLdh&uechc%VDV{M1Mf0)eYS(bd}*4Nw!(;u`>qb
z6J#3WRlS7qvgbE^(d25#$1wRe7iu>9V5j8+Xc?IR(#p9S(3UYjBS@UpeYo-(G-j9q
zXttYj-&&)aO~eP_Dd8Sa^n9EQN3R&FeKgEpuZt~GEXqW>@Eh71;0GuA)@v^L$sx3M
z5&za&AAM`?V?Z>sU<3ydcA&weIE1UOdUm}<M56mKmv%P*^ktqB7#2n*kpHe~5xb6n
zR+bBb4AzrUn7OuN*Rs9BEP*T%718<q(teH?IeO(=Ktn|sAUP}$@Z7>=SaJ!kzxP5W
zxGHXcoufBm(6AjRM6I=qe4EiC%pL|<D~m3+HN!s{CRZSWwQBvd=d~g9NC{4r3iS>&
zr6olo3dqByt`1t(ya+|RsIym!onx?81*%8_1eJYwfaap64goJ2zLEniV@|0HT!r0+
zbM#6fjM%cw-v-W>7Ut$j@ibdFBpZEA%z-SPqP9le5gN5`o&8svMsrTdt2c?Dkm9{i
z{m6*zSaWsKq)%?c@ap2_SY3zNx4alH)`)7!Beyl+BwFmat!&pmYS$4^6J!Ns2Vj_S
zOAHQgD5c6QP&CLtjhk|;BU^Ed+%D!>5Y4Hpnnz}>;HCv@>&J@t)E#CYKp=N7zsCvE
z*K)QN&eTAy9ZAdUJS<n=_Z9h6aZvtE#xIX2{0qx2>*(L;X|e_g8^#NtTUC!F@-_yc
zxX~o_>`Ry0&SMJ64=My-5{ejeQ1%0nQLkwg@sc$y7PHe`wzoKX3u!U0IWN~D)Wq@Q
zz_PZHYLlFcO*(GpzDXpf0baPe1JXYNm=ZREm0GN?n~}ZrRyFz$-;_4*lq@l|B3)ob
ziJbq!wL@ATnX`@BVL}UqVW42ET?0k-sP;LQjze*b)+<*aG}s4T1Tp$^1FcX@4<26~
z*dH{vb`$V?5A*hldna3RccF>MY@MDatOY)U7TdfQbpFbBiqw#&K=%4BE53hPxH}7z
zF%8yrbT8)ib53(W?#;MhymnG7Ioc^(ila9PaFdl0;ohybh8(@`mx19r*>d#OSb*ec
z4no8^kw9%)juXK~n5=`gjYnYtr&enh0=z=L+`OeXv@AfWE^vDBDlo-mT8tOj;VMV3
zS1F{tUq-2JPYUu+$Wvp?+)i4+ezXwZO0IN)yPc0SWIML-3|e*xb=dhtH#>wL+wIT7
zxU0$Ap9q%tdBr|-k23s|Ww)lau0#WY+^|~nF~AR03tWf$Fr=|#Q7aCyN#r&QaG|qs
zP@G!@E=InKnNq!HRNFPbHZ}%r8^{<0y+avuq02(^pBF^pB9Vykg0=hq)1WTuBPp$6
zgja2pexL89r(aI>k_LM$4G?KPfbx@>6XcGx!wMI~MtPrG1O)^6(X&o@#YM*%J`@zK
z;sxoE%MCU9k-1RGVoSiAR%Hz4=UNhH6?&9rK-1QdX;~GAIrkVSW+g7mh;6O66UxFm
z?gFyM4+U$cKz<2FF+Iub;%Mtwop}WItz^N#EXM2bXfuHXVFsVCmcqPt$rFT}{G$V7
zg6w7t-n)$Kh%`Y;T)3@bSmk(j)GaNFh4E7pARe-oM^J$6?a7DKwoev|FSB4?tM?V2
zhMX)|n!~p&0zDz{2x->&Y9QG}afx8AjtD0wj@~g3C`(i@!K$Z;8C$o`)RNa;Ld%r!
zh}iywaL(_flSJ_xna!lf)88(E4OkFg!VM;PUt)5-^T%h<Fps++PfU;=FTvixSFq#(
z0fi?*koQ8RF2+lkSR;8vCmU3}n5?zB*ix_z)+4}cyeY5IUv-Y{EFcXVRV~5uS;MOX
ziF2x0t^!a{hL(xi!`Z|$pRzEfjMXrD4qR}XPJ))TjGlS<5{6CDE>l1qX1obVW@jUH
z;EeM@@H!vGHGL*3dKOI!h;>y`SuQl!O`%8EW~DMbkZZIXr5LLzy4Erf(0goo0g7>p
zfk`dYfgqPX!gSs3eaFWY*3Rm{#n_nnMOTg&pcyZABH=t4=8BRAtGFlVJG-gi2FYzf
zB>Z1uNkY5FPK=;{&p%(}tN@<~x+{;FTatSi4thHls$-z%s}k)&r-y?YVBzfxp^_>B
zvrj+(MAw0W?pl9n&9ghk+VBZg7vSI6LBWHr^(j{eYx-%GX5o&bM=&nkXPYdnV0G^#
zFW87Pq#1#D3L>m~Ll~srx!Cmzd}6l&Xg(~`A%$ci*bb2^NJDynH6N>yyx?LMTQip)
zRlSGCb0lz5ZhoIHkj;F@Q!W{?l_5}2daoCo6m^4?3wp7}NUQqrYai!`N;9X-#@i3C
zjl5QYh3Af*ZAfio)NATKF0zpDx(ATEhX||g&_>vWP?%W4_mRcj8gLPZ;$m~Zp-(?k
z5xp*haGJ8>wkO^>>Y#*7n{C9FG6Je-(N3&l{3`HnjchM<b5Jh5UdUNv#1?rjgiCw1
zNja8B$&55Jh+x}CGLnXs%h>>-9tQnV3lneu^n~vik3-yV$qIoj)<ap4MW&>5&`5@D
zA>al;X}BqQRrOveA7})lXC=#W+$85oT|n?b3TE43YcuP7^$Hm}O%|;^utKD!IFI0e
zLW?c#le@rl9=+83_jK-Wk318?!sz6Diea)HRu%ZLC`*)KvY<gQfGqh^N$)XNC1pJC
zK|xH+iBBbq?A*ncAY$a)?$m8_6TXNOKPyPgs#l5@>FL3EJp^T($R7WJMDqPgeyyrK
zk5Y>J2N+6^kxMbRS4rpiDfMMgWY{5rMRCL0T51~1g)Oi<n|XUX=>X<%R^;ey%?ko9
zP#Dfe$PWCM3;Y-fW|7C~H23coEaTl5EX+813(6_(noX~WoxyraCw6Skn^us%0~Wy5
z*Einf04&I<u>z{8XF__~CulUPZ3*H`48V&G&7y*W)!BgY>d!u$S-$$H&R47B7?S2w
zs8XG2*1}2<2L5i;MiI~Gc^#}c1S_bPD~gJ#1ji-YzXw%Z`(gkT19L$kG#l3qO0w6o
zl^yJsaFDx7bt8yaVnW3C4VXznfXg%^`kiC_habOF5=gsJp@{*t7cOSppMvxVQq3Om
zt-~F#t*z#hqs8It3lJhw)S>_?b46HE9VGy-ms?nyt1~L&jG}WT0>rzu?GS3Nf!b^g
z)(wE5@18#vLTPfymVqzYSxDek?95O;2+-pU`fQdHcahVgyp4+<U>6gL;Ki9vy44Xa
zDsE>Q)=b~Siw6>~z~<_qkVDo)6ey-e`Fk(L8J>{fG}`fx9?5H<H|_xKKQ01z;l6MC
zvBql2l{!t#?UIhsvnRdIftKce*Vv@uxWc|{!z!MTSx6#zqy(>v;hH{@2B3s4sw#&N
zNgaCmnHbjMkzQ7y$s=+{E|f*$xFFHaOrqf!>#^JyQ7p$IBKz6V01MTuUW*O4pH{tB
zt4?>;EOLbv!c9K)7G7-P;z9(Xlreu7n5dyswWo*;CG)R7U(ly>vPA1)u*JVn+V%<#
zs}Yc6NPsGLMhhrzRpQ?!6<tiIv27s)GeC^S5?ncM<_N@?TelE`0N@uT1apn9mC_ki
zr4zbn&-e}U8;!em=qy*@00bFG+yxXAI0@DVXuM$7!<Yn17Z;J(F&{=fTgI3%^_E?r
z?_@b)zOw<+KSWx#;^^f?0vFN{>CaV-BbbT%lazqVJfe|(y9vA41hSf7K_i`wV$c^F
zGfDDeog2HxySc25y2*DKK|sx2m;^#tOLbtpL@My)$wz^k%`RNblX4~vKr#J;dj8%H
zWaU_2;lhCc|43_qj}D`JjEmk72VtQmS#x>H?op1HK*WoIqOmwT5COEZ(_K5<-bw1N
z*$zrr61WBFE0J}9L>T!nDDVaNfo^!Pw!9Ze*mrwq0+|Gjiu;{F9WO9>t*0JZCIwh+
z{#l5Ti+Z8TfZ$yv!$j@HkqY@yrVos%VlLt~cznX7^Blk~tfBzbEFHz5#mPAc2M4gw
z4i^V-;oK4}>G9}Gu`AAjyffcBwGv}@QS!r^GiuFWMDz8<)5j$S0K}Dy)$E}$8&TUg
zmjKGed>U1OlTCA>N+pXCc~T(EUwU2+>^&|#-22?zC5%@sCqRO8kD+Dd%tluYs`JG`
zBG#z`&qUl>b}JHLm?T9$<|4=(1=|PO%mnEb3~F%Ma5h&*G(4kQUJt`HicHonV^FM<
zJOa8nULJeG1rrn2*`fh&wIMA4H`En^1<`Qo6G$S(1XY^tn?zp#s=rqSLVPK{s)@Pu
zr&`3^QsGg@K~OlgsjHQAp6ab3)SX>ibu|!&Sb+**pRR7U7bMH*U%LTvL@7m+xLZfR
zgGB3$k*s?UR^ROG5g|en>+Dh~$W~#_s95va+B~UnJJ8<{hN@a>ngjVR<m9WozaS@T
zFx8S{uxu}Woe1xov3e+1qtjF~=z?Jdk=cTefxRQObYkl5(fQlut~(8@>R$R~|0zg3
zYr5RS8@4Lkn4F|Lyw;t4o?$Yv1G26AieUM2`>0^dd!jEu4sC;mFT|I{BHU~^6buxm
z-<bj|ne;Hn)jpPSNl3yzN&`G%q7~Za+K95~@b@-$<-0Ft2a40xt56Iwv+#N)s}P}v
z+=&1!TT@Q$!<h67ori@;u7x!d%|?znR_RGBNaVuG{SbAhjDbN10|Ws63YheHG!W%V
z&*gu+1hL(ew&}Ty`h))x+ZOj(HY+m_s<J_!YqDZlF_6lUYluQ)Uy=wci<iN=`7o%s
z7lV0X@J3fJb(PBlM0=r`QgiWcZLJ<#T|5KxZ*dO#B(w_6#r7@z^~!7+Bru>`_98!n
zb(9Ewb_)yRlEvG=(KgL&uK{ks>d@g4ArJp~vA-B5RZu>!kjt74C|>UK9n9e%Q||-B
zhfdcH7)mocs7C+rchThO33t)eupG@io|cW8?01VqA4RoO8a{F0whg9<Xf|5NY)sR!
zM%00m23ax@({eJE^B~0?eFaLSD{lb+dn<w#t)3ZQXIdyN0E!}~A%!h{HQ#$p&}9g6
zUg_%fmcM`HmIq&6Tvd3=OecQcgo^>iqR&Jn?!*VF*rs5dfSrZVODs2Fyf|x(cECQI
zkHWmaBpN;!{8myYHIQrP+D!?S;Eh`P-w;;c1+=qhb<cM<eQovmS}97Uqt=_h50#wG
zi?^h2X|`aEY^9-dq4<vH^9Z(MkPsH<rkfE56w~hu=NEvcsyFX2Le4J~AO3y!bw+V6
z9#o55sz@Kd^P?Nnf@^>LlM~K%3=Pu6Kf@q6ms)gQ-az6XtimDQIjhZ3bTzwQT}EZg
zlB|uc+3E##wClV#baC-$CK(i8rIUpezvC=%e5ucu?80VgQ7%7t)^~dQ{Jn$OC9Zvu
z@0~aKaSeV-)?FR(BVOr8xA4^k&El<c3bNk~^GBat_a#pB*Y0HMjc1Jh^)ID=)ydbl
z^vA$xDilpGe`l{h`sDiKn|**DW^c#oexKNVb?N{3<}GHhwZ=}F;X7{aJI#J8(NsN(
z&4!2)MZUM?pMBwAJ{V0}K<MM2ec-#%0Cxehtin!{mW}@JZ=d{CAYY|i(h69khY^kU
zekSYhpPIxB{QGFXV!q<$L|MTHMl%-8u}M7@Qm_%feLHG%q7<+Lqq)T?^yk0b=8vw7
zN&zEt%56vq_<mA9QLK9!Sn-EM<9BaPPyd<xPo?^w$^Xb4*H@hWe<uGk&jYFx|L=PF
c67tTyJ)P`E565!0fdAyA6eUwG>;Lcn0XQ5Y;{X5v

literal 0
HcmV?d00001

diff --git a/docs/image/pack_size_8.png b/docs/image/pack_size_8.png
new file mode 100644
index 0000000000000000000000000000000000000000..804187fd98fa76f3a2a059fea5241baae0382e6e
GIT binary patch
literal 137856
zcmeFZXIPWj7B-AxFCa2vp(&0s8j6a5)Yy<w6jU@AsUk^4Ac#nZM8;9j2&f1Mp{R%q
zCbUEdH9-NvK!SzP0}4VQlmH<FNP9Okdd~UId&Zgf-}hsFa9tr!+j~80-|JrYT06H-
z+FGw(v1x^zoZRZ8KOa6VC%33TPHy4AnkC?yg~f$Va&oJDJP#c@dGyes%_pz=B0aqk
za&kZ4PH<Ry&i<`Ry5oa;_uegB_Ve<}t*7OGUjAh9J(oqQTbBR$aD(zMcPe#umINx*
z-pRZ3Q*i=vK_LX^&ciKUhj4DLy+z00yPhYa3vH>Td`@GsEY&1vpgT<NowzFD=gE`0
za+9@-Km0oG_U6&cYfFZ9Y<`uYP<359NjtuvxA*CShp(i($+0E-e~_F-EpCO@$q0kv
zi}z^A)$fZonBC)IS;LU~q#C@|bAjC9o84EBMIMgi>?cBQE;v|mK9ZaeoU{<NbK?C1
z{If+yHAxFjW?n7*;4Qaz?0`k)y`<G4do2%*>>R+=$Xzl$pjM@Pb#@KJJ&4|v@vN$U
zP3M8I<u$uBsv5<y1|nZ^@*%MkdU_2k&uf44!NYrwY6Nx2l2c^5vmuiQG>r;Q-TV-}
zsCMpxf^Wa=ozreRYuLSwkIq@;ZIN#7h42;Rjw#W%K1i7UZMWT<TeI?WYsx1}x13s|
zdPr?M6L%XnWI#TAK`r_-`at%b75k2^QQcqh^X8Q&&a`{qw>@@p_MjK}nRntbY%D~5
zoG>ofpN?-lJwa1iv2Q2lc;?TK9<JEE*qw0j)Uj1Ho7R-qUscXLW0ZO4*O5mDRCCM^
zDACr{EMMpsT<|~>()i%kw%<l<CJ{G=Ym1jzrEXYfd2aLZJ0}O0n$`9x)F`(8;;a%4
ze|y7xVf&-OS4HECw>+U2U2Iz1`{nSurlm&@T>0gyp6&fFwK*exD-XDo9Q0jK7)9K5
z%5c5ij_Y{)w{fVWD|#P1^y{@U>8js{FrT@)W_8=<MZ2!66(8TV{EOTv#~^28akkoW
zd8f^b+xN?*o=thRF-Lx@d+>=xR-Y837wtFNs~_xkMsCFuX~R8ONWc=feCADggR8IX
zqxlDZsJ|iWmnYX8>ONhi{F{=b+HAd%#$97YquLo-tKwyv=ed*0xgYA6TI|`oRQL9i
znl<i=cHg}er}}w8iKfQt`~wB0pO1H;uBi8nEVo-e`m6P?!@nxL9=-iE<b&o@wbiDZ
zv5$7HPwwF?q(r8s2k#nFY~Ny#7pJ-a)g?c7;dA6WvUW4+tRIB*dL~lj3z->1|29&e
z=kd_|uG-;Zk>B%MH*Rs(;IvwMIMy4z22N7j4{e3cp|D92mLr*%mo+LEq6)}u%SX%u
ze;~-OA}%M?oiDt&!u(}<`l^w<1HQ}UC%zoicn=k??uJ`!N93{cobxggE-wBhXYrjI
zb?HAQZ(oO4xpITfOyHVb``puU%GC@OW3^mu`H|U~P*v!-!htDA+R%VZ;X2kHi<RB?
zEvw%yrx0A-*E4<sRkvurqxyCK)#jxyPv%swTfZ>;*rnI;$YsYPw-wwN_LFz1Qd|^y
za?Nu018aYlS9Y5^Skjc`w&Kx(fnVHDu1LIL=w{KjsyNcwO@H4qZq=zP3sV=Ye}%ZB
z-gUD)Te)J9#V7T_Lu-Eh_1zj%_vlAkTB5i1-r99v?#CZ)A6a{E?~n3tkJLn~B}X<|
zAG@DWAL;=)eN%1!rhCfuA<xuHW6Tei-7~J&{^3y0Z+e%1>3F>L<aYYO%y&OtGf#=J
zJ%W3;c<}T~wcNwP$3=t7r?f)VNB6rvt=v%6ygQ;%&Hv4#4XU`>4~06%?=W5;E7(I^
zOL(>Ea|HEHfE&BbIyL&E-;%LaL01-~-{p_LozV$jPSxnS+1i^uPMA4Vu~c^KvC`e;
zv5Q<6M(w(F@5G~nKW%@g6u3q~*|wV8d^Fhl$F+jn#fN=^RX#?W)C|>ZtBI+Js6o1$
zP3$gAI;^$vf!a~&vvX^c&g|d0J$LKq7Q_2__e_s!HaD{xo4cA{H-BK&T;?~^n*#~z
z4&_g$kLSh_t%|pc9$b7IKDl>t&t%}$t+Q)O<GZZiKW{rN-f{Gy8r|ubN!G2aDOckr
zVb0F%gKTRH7w2b|)|E~Xv^f(&6jw9H^vu$oc+=RO*?L)H&y4JS?R!34+N*kgjcyuj
zzvD5&)3j$6k0aha_A0q_W$(-;vf<9XmkskSRebo->%13ful6Ms&qPm+4;9babFErP
z@7=xpijbaXFBx89H*;Cfz2cf<S<F_Kr<IPPifMoK2eC1>zbUw)BAG^M@uqP@8`uw+
z&-sJ=q0XV~p|rBY&W_Hp=vZ{lm<!v8tzn^Vp;-AtWlre2(7GwpsofE(5wVzb=^n|A
zIpZnlYJ-h$Hf`Tf9DI&Z0IBX)U1hv%&zcPnH;fpRpvT|dyRhN%Bd?Oz1<Y}(68ekV
z559YAFU7z2%pCh%RxP>3zm>EKx=$Rt`|^les+&1G81F~$%fFp-wB&Y;N7~z?@o4*q
zkDEWLePlhaJ5k;^h+KfwBD0Vukop@!SE+1`JH^~rJGk4}C~j90-q+CAfJZJU`nmWK
zs<5e-x~=wsXELI+gH%hZqc>iD$y!CYOfYNl@OobKE;LsF9~K%F8|4_S^x=A^cW!FC
z>W%Xz`0(1_k1QPB-g&menXKJmFzP>Q+~FFyU)@~&SA}&7NouF8^{wys9jOb{{v|#x
zzAfj&&~pP;a=+7{)2R5JRnN8#M7m8{13?M4^@BBWgI{2cgM(eGdFyCikTv)2T{RP5
zkG^_iv;x|kvV!6@wpX}I*ghs0%ePRoSfqhBM~8f{unyCk@}3a)_K$88yqMrl7EdzA
zyFW$5=QP~CsCOZqzh!2TsBzka%YWb6K&cWw=4D`*m{Lr+Bt4F#zHLKMMDs=LN^G7C
zCX1Cjvj0d`;`P}5{QYP*#HY;74Vy9VVjrV)!e9zsio<6MybAWtTp155)(_iWZn9|Q
za>X6XJy)`ppHOhtoZH-}rDH}kC2DsWgog(EFWRED)7`4(MC|gYx}4sr!=cv9yXNbx
zVvgNCuPOMcW6SwX?0uWm)lEWKgJHW{cK1FiJGZ9cw)46$DANdiwd;Pcd$?1$mH6YY
z_m>;2Wp4kVti65AHazKv)R#I#Xp%F@ymXtD0mA9nS-PnCUUNc@`o$x6lT8NPQ%<G)
zlxvyxF#Q2_Yh{r4V4LAZv$~L2h(^%GiqV;zz9zd}Dff&&w;g8O?z$bdXW7P_jpYWG
z{ufIVy1I|s4%}+g(7;m?g%31qTp7HzGb>>$(h;+$+{V!K(TvKS<u)6B(oqh?wLKL)
zZ8cFc!TWb}-VBU2jyBmaI+OAfGw%0nZc*agu{*IRZEwKo@|shpb%q{|n1wLDXm~!n
z7PabrOcHDT;On?KSZSJAd$#)Z_1AY^E83rau9ot`;;ea5KuYz?+K=h$ZmyH;QO{uF
zyZ2Owzv<ejh$11;X*V0`sjNMnSEqkzPxF7x8A-88#+wN#A$fclQ{;u=ir*~WzH&e(
zSZqII$Ew{_`=J-VQlf%(`Bi_d?q$ajYB0Muf1CKh%B;LBL{>)9c%sux$d{&ms8FAA
zzN5I&tNnR@O@D8{F>Fy@ThZw-#Cv)j;Yj`qXT~{a(p<iwa%-wS|EJE1L=uczoM~QG
zfk|VP^$%O-ug{~oC^;4S-k7*mtJ>}R#kYdA<B~lC8PCYVLw%HdN{bA`y?nAxAli+3
zi(mWLQL??EqLuul7wSjMjs=Cc@91tXgj{IN*~xqHg8imE>1-xzlACAIH|^A|N}^ET
zlp=>MB@tPoSD0Dp`lWjmy7p}dDCl3qKgq~%p`rI3hdQ0cHQN$KbCg>GnzLCH<HJTT
z_s$t=SoU7^;GE)TuB)4h)itv<p?(U++MlxbB|j?_9iF~jr>pEZmagVM*^x72?~|Qr
zn_;FLIPeAD(*E3E%85-ETJ&6r40$s4w6(Sa*3?o>3y}tvFbKKR4}@8B4VFo&oz_K~
zw6($gHLQCCRGwd+bA^U~rR2zDjH7WKd!(e6CJ+a9kG2M_;7zg9{Nt|Vf1$QWNK4-+
z?!;1L5w1}Vb%jw|6dg6US(b?&31|A1+vA(kR;GVQPqZ)yg9*AuXW!=)<TdEOp$>-c
z2x)J17Dr8Ms=BmS&Iy_xLmk83OkJZUQpd35FS(yP)3708)(6#A(I;nVIaRn#a}{)C
zu3j8GZO_d{Mc7-$TP9%(Wglh?xx<xGu@tl1fv!IDLzUYhGD$9&tSonaliam)lv8hZ
zdc9h3;GtkxZlP>BWVm+0#KOl*Ftoa$>U=z2Zlp({LBBqKxfg5SCb^@Qasfe_mv`Pr
zl$P%({jI=ui}@fAyVA?(!%jJ>4v)tpjPPWn)k{`-)gcQ>OU^D+YFXvjOZ6B){es;}
zpPMQ-l(xxREz<Z@Cw09Zb#JiJRi^^vANvsJjv`N-klP79FOypk<teuad|CkhHi18o
zpFCdp?VCjfkLAC8UO4dk!RI01y8UuT4<9%aykLZ_Sm6=iJR=jnei8je?+*6R#)Y!X
z{n7)A5zF@KVi$Pc&f4=OgE+GEhpdD*<F!;tpQ`&bW3Br#L)xuKD_7l9OI~5KNI(9x
zvwrgrkLor1d6OIrl=nGZ1Ron2VV9Q&wV9c71n*4znaKwJ>Ha)6xvMFi?2epPK`&ai
zX7m2*atjvz#fO5QB7IbfiW8Rqz0>!**;dIBO;#xX#cjVm`dsSu1?O;llF?rd<c})`
z7OkpE$=r3|Z;TvV5Lvr?!}X$l`d-R^Yp(wc{yR?bkLCZ=MgQ^nKL>|@+~yy*0kPm8
zod45+`2QZH-1@s7F8_Dnb}X!7r|I-cs<n{k;WZ85FE34}cxE&Ytuj%2K`Cj9d=-0M
zWkLc|nRo^vMP~<*qfrXD;LG~B#lFH;sqV}hjm;3Jj<;IU%ml0O>@+L$;&?0btRyRQ
zO4?1;NUP<k?6+EZml3#7?v?(fL8NHbs7~+uN}b;JG#z)*0Tp}y@}dN*)Rwngw0Z}f
zD?~WE-vPwp<CZH}bw$(ewAlQNo73*48ociI1})nmlz<lrT?enxV-t^iKk1y6==Ant
zR3=ISRs4Q7!{wBp;BsR7KG#MJ?bC2cO)C0EJREUVcaqyXTPAN+DE@oy!^$PO305PR
zAXw{6LmxRhW5^>wI$#@-9d9M3zlYbPIC$WCxT-;PEey6v$Gy|Vs4VR$uV>IK*o`rJ
z!5BAFnMsb08@sRTJ^l-s91S(wjSF_ycbo}n33gjqF-h0yT`591GJCDN6RgAnJ$MbS
zvpum>Dk;h$Y{j*4mct{~qec>~DjsLBW}k!Txl4Cpyrssq3Nu|-li{8?Pm}J-oTLQo
z;3ShcHvHBBp`@<~<M@s>`|{GP!pF;W+<8Kz5=t}GXRL)n<=B0l`gCm6dJe+p>oluc
zv<et4lSlq@-i|l+^EFLfCsVwyePYF<nOZI({wnWHwnJo}u?Us-tD-MW_kS5CMYpQm
zvm%b77^j88q$*$DqZoyX&>2Vaq<)toT`Q1kLf}pE^Xs)-WDf9}(HlfPV=K3di;8J_
zxP70v-r|Tb1PnVNj81?@+|;CQ#RZS5!)u_^q_VCben+))8Q1&Rf}G<$=5^&Cg~_%b
zHB2<ppWA(;{wp5rT{tYx?nEYr{ri<EIz<kASZ0jfi96@#qvH2kM2@aDZ4V8Py{m(H
z*Pw;Dh)|iBC?`cXfd`hjQFMB>=iqGSp{=-?C<1BE`%Nl#XjFiOJzo3D#Ab;6S=)MU
zF=o;PUgJ6jy!ML^ytL*m@U9%TPH$R%gBG<YOl9K9M{>05Wkf*O^JU&Yy9q~0(TZJ(
zK9?9Gy>d<sF`O+RM^muIxDt^G+(fx0jC}P-)dgWhp7rl*_Oa5Hiq4;pb1M7TLt>|F
zpL{h%yH4%P*ne@Q(^VS_!qxvZ<u&K5Ke;wFaPLZjm`1zCU*wslh6Zcnf}4)36o%4>
zc~iIXq`6@9<u&jT17epVDO%mJz3cpTBl}NJVMb;5DPzeAkq)CoJ=gb5bU8wxN@2rt
zxHEO&9nsRr%5BHu;vrQ3U59FVvOuAM3&{(?sJwT2k*&Ae>UT=~0=$sRyJPHQgR&>=
zi|k^q_kBL{+TvRXs%rVE;Hp=+eYf%--xLI_tQfi;f8MybcuZLGBuFUrk5eORcwdTn
zt0Q{18MjaLPN%mv45_3)f>Ofz0TAJNk)l;aaFg91PQhykb_{OSy`=%9bK}$bmBds}
zPb(F__8550I!z{B?jZ-IR31o*wg7<SoBqhkybNFa3uWMR7)n6Cy5zU5;0*_mqKAE(
zw9;oj>-27`Ln_fa-qNeG0#)!4qGMofx@IH#?;atLqpiCiS&3(l!(6sUL08}L*N4QR
ztoJ#EoP%jJrQd%6&u?u1<Rdx&(>oiyUuA-oMv6{g>l{JHv?^6QMr2sKJYq${dxkvv
z8;#c^w&L5rY-FE>*Tjq32a$Pbgc8qkGj1mCLjUoTod~5kE=s8>7_KsbBaotRl)>*-
zMm*0*$d9^k(ZO-*7v!f7X)e)Ha)3>=L_Y^;%fB>5oejr~Q;B-M)O2p@+v5>?NmrNG
zR+6KaswX5E1g!S$ckO6YXik=)shte1Sm`-f5|#T}rxz8)n&oGAo7K~W6HnkZ>Fbe7
zzN#t{6aeJP>k<8;xq6*joS+7S3JFgZ&*I4jhWnD57+UEq4Eh~1C2KB(<zza!dZ~v%
zG};u~NZ<H;iBK9Yo$7|y)K)vHpPcQgBt>uat>-qq9f+{wN^fT)`&lk~b&%=tR^j|9
zo!-*wH#B_K95r2br*)b%$)A3%)0;~1P->kbM{Bdw6WD_u2)FiE?c&_KSh2B$=q()7
z>D>X|{zVjNqRA_<f(9VJ{34915C@*lGli2|1|_YbaFdUp3nn>Te(;)``VHKSi*}ML
zDievy<mg#w(h*8HFZF1IWuJ>I(o!}lRh`E4dp0;+piJ*rkC_@5$t1>5J9VCEHzWVe
zKrnq`EOE9wb-Isz^Wm5$s$Y`|0-3l}3pdkIPMTEIw!D{MV?xu%?c;V4#YK5|-Isz|
zeHsCu&^O_tGJy?hr1vDjP)b4JTOf5iG?`lIYzEzL_BPxCeQq!T&&%we7LMGU_Oeeh
z)1W#qC&qiFW;M?-l_45mSBu-0$aMx}%1zkI)URpsyJZ#JcR{s+(Si4q<<P#ypqj07
zLYc@CZ3WZYSmY04n1pn7;DHdKL;(>+Y>B-f)akWrVjO9umB%N@3c^mqmUxfckIAFF
zOO2u#;%1WaQHW1x^i?BTj+;Sfmn`)l@{KO|m^@=NGj8mjm6&e{-`MuCK@07LP%7_9
zW1(dI6Y$b6Fl_=~|1t{kxI+nEqu6sF;Kh2BlBUU?`Jhf{!plnc+H{9wU}_@xVgKg@
z<m^DV?}*e;KEp-&tGB;vk)<rN9HG?2Lj_0ym`C@7I;r^aRmo-HPjXtZU@=rzlS$Jf
z`zZq<`kiJF`R2Talu|ra_VOZW&*r{*BAQnQP^lF~-ZMT>2;Ug|au0-7)xHgqls?iE
zHb59iX6WC;4qL$uD61ZtTj2IROJmL2Sg_)VtFDC|gt?Ah%m!=B6BU3l5Zp(@sIhi$
zQ%aSVI;ADp^fEy7?rI*9tEsdF&@JrH>7@iU(Dy$B{2)vlp;W0%3Vl-miF-RxY_w-W
zxD*-i`15I5MOkTlg6s8&odjP_4v(Rw7vTlt#Nk8ROyIL2f``OaKW;KAn}*~!YEcVZ
zRVIjd@}4W4Ti(|si#d%L4QM&=s9tAyL<`m%pO6uBgUIxKo;W8e3rTbAfY;c$K1^`k
zKX%X1rmn{rnoXlw!&=QQf~d4uLS)KD!9rJbNlI{iNi~LGrD~}x!J89?@@t`!Ejqm+
zW(Xye%cB^};n%h*6aE}Te^Yij>Am0$BSbU{ugRy7-WP2b%-*GvOoms4UmPbxI$WKC
zn3E}(W5b%rIZ>C47b5jb=wpXLmFvn?Ez^T4%KduEKVm96f7*+h4xck4Bhj;>wviq?
z?07eksc9_O%A?bv%k~+nR{j{yuc*~<2+xnVs^H?_vrBD_QH&N9Dz<qkX^!gB)^H?)
z(j`kD4Q`tlFLPzZMzykbo%lmlIUZkS^zo&^^v$nsUy!;uuXHnnR?psn+c!|9)B6B~
zQ^Tc5L{+UVoc&SJIWZwpId|S{DF*(Hn!y4(o*ezL`(A?UCtrMmY&<MnWx^gj9j28c
zD~Om(fTuyR+_rV?gH;6NW@ffxqn2fWo63Y2c%2Ygh(C<^o4-9ENh4?YEAiDX>8wbH
zILj>%`FAixrhI<Ns7zc4mnhCL#5Bw)*b-YYN@aqdjX=oRLLn&gWXa8NgHO33-HBA}
zX$WoxT8CQMV=V88>6`JMVw=l;A&OUy*us*GQ7$Iji*WVeB0d6MGf&LckAi9Z=zz#{
zHO7iMh^vB!Jpx3GW?Mi{ch`A1j!EBF5<?O~dn&r~NAU@fuvTRM-77)hx$gj<?tyM5
z$XgF6R&)rR;U?Ag#)Qs{=4NW8&eM%@%@t!IH;5s7L$Tcn?So={bduFx*;TA`6GM2`
zVEhGKN`keRZ>scN;^N~Us4_ufldmF6aAeby&rSF}&#*2GY#pPDzz(XXSG|xYU%gfJ
z5u8BellSbpHoYD}QF~}r@wp#9Yq_Jk?5bO9BGnLOIs@}iO4~|$AL$?>Hoj4;NNE-Z
zG%BnS3Dn7^<1b9)Kk+mlv7kYlI1&(8!5b^evM>$^>ls5S<wNxe^7$GxK$?ufGIcez
zz+GCh3@V9%N3ft6fM4kfBvY(=hF!!_`$gOJJ_QS1t^(hbe((prSyu~uv;1@ZgE`^H
zmfi50D7z*t>f|k*-bxN2{Q(SUy~!k<D87*mD}G0~>)O34I&?U1-VKaL(pfYiMhbk(
zxenF8st`h+Q&h$D2azU)t!SgUvIYsSJu~cOiOD=ME;{37rs1*^-ns1$P4KzZW3}FL
zrEe0S_lboxX(rs{&<TA<PFm4c9CV|~L{A+;DZ|_ZH<(wig?W<3vbiT=guvOh{>Xbm
zwd90Z_62{TnYGHq9Sm%V8vA~N4c{5~?}na7tWiUIhJLJFshQV6qRo#Q53ST_Sl9p$
z>W#kH6GL2erENDP>2li^Leg)S^-c)ieYR>GNH^N!Hv!nw$TOmN>8t@MdSHDYRwmGK
z4|J-k=i@grw!M=UCL}<Vx>K#3Di5jnaYUr6IXM>u)DVopW@PZfJe-yS)Bx8;!5ez{
z%t;O{h1X!JRl1q2;Dc-BX2LLU4nX5W2%s9}R~Iw{=wApCd%{c4yzwGOtJ^Z^8!zO&
zz2gOebPg-x(r#H*V9&yq{HC6?E+fc8NyfiUOwIAWXXV{Wr&ql?3vXxMucp_il<5{!
z_6(tvIw|DIh=354_u_+74;x66E-t&h$4*#q9urrEn;EKF*tc}uu`z!i?y@vBDPcie
zHN9@Pd4NNy$6!G~cg@Ip>1+>!E*_A_JNLJwk@nmxNQ-eP8;fTXiDKr3c`9piqnSW%
zkMaIA*6IN09h`$4CI(;e*e#Y_l^K^K17=T4DqiGY>B%Y0=VzFcf0&5A5Rf)d*V<D|
zicV6*m56(Zsk6iSfUzrpPzMe0gF@t`fsF(DxpA#jeF*K57vRS}cJ<sz<Yf+?JQ)yv
zsHIW8GNV~2K$*c!F81z#M`*Hr$kAIv;b}eMQvMSF&NAn;gazqlsDMgQ7}sLnt_;92
z{Jl9ky%`EkS{j{9Zn-$Qo}R242CuQ9Jxplwn?8LXrIuo4&VNl5o0vSwC>N-wXkC#l
z-`5>bQT`;~60C|3a8&z4O<o>3dpOOTd0k*<jT}~^Fr<~U@4&5|L(>zma=VC96AXP8
zZnCw%KtQcof=d7RR!c7!1#_^Nn*wX)MIel0Y;S!SrYEtir}yA(K`L(c6K;k~AYa|l
zkA_J#Ov84}^h<>!NEtq?t~xe~|6wRL3eQ_6{6lf5dbw<$e9O!52h%tH!1Hs{3p-tt
zti;ER9fN}~J~ContJA`v$pbJ>#+A#krfUN0FeGBu?VPh;ck`VCFy`H<M512B#S<`1
zV|xa6zmG=#%xPe;CJ#o}gpg;x4|mDdO6EXIWks6yrOy_*z?QhEg9WrN{l3*+`L)an
zEr@)6S$j$DT_*MeSQJ5LLC!Y>x42<v@b0qI0B_JM=Fc0v23k+&i{P`hqLFQzI8a|3
zD784rs$xtPewa^%@sg}kr4M!71I$z-1a`ax)@;6sjw9;RS?6m#Iqmrb2bcRSD@JlW
zwVlZoAGI@-eUPMCd-Q)ulW7xp?i35s)$RE>a<qj$@C*I4_=L!tVW|G&7P&;djHzZD
zVH1~-*A9WmuL%P_5st-Rvxy47g}pfdcjl3Q=lV<&Q!kAK-i<mB!Nfs(TJu|#^Mvq>
z<AWSlNzkMXEj5?FvSK&H5<-ezU%8V2i4f3<mKw$ZO%Rf40eRym6Pc72_m+D-4H-so
zNl37<)X1<(#Uf!#=2m&NY?^n@0E5lNAxjxeX|9(LjIQ@0pD_5ws$v_XGO;w>|0tBe
z>t*>i(W?lyFiulOXs9+s{tLQ6OFHv{==6HLk(g(Z)09npx?^Gd`YRQ~!9g|{mPxlW
zZQ!LkFAuBVu<>W%TNO*RmtJAF_|P3>Scb(8FAZI(bme`Pqrv782Qv_p(Jy9~n(AY0
zVXbA4@vJ(AU$_CPRAy1$u2@$M8cKNTiX0_B!{kJn!YN_>ANl|0U84&j*%3=eHNSD@
z(q-mD(?ln&C-J)Cyo~p6kF<J8<wxTI#VQPh@$fbt5ye1b%1lczC~<n2AZlOW<P}>_
z-?qjDU)$w_3@{R2LAy)!pKJ210h~GL;~jlQYLvs;O=Sy1y5$Nx&$kbLl!+(3uVKhN
zW*NybdOd9>ogjs7T?G-25?x-5LO{})G3?QQ*Fzi8|KNT;2%5J+VyR8l!<9?}2-~}{
zxRB@G*IJ&H23Ye_4<$wxMgur_>V-iG(9N((F=!#J9A5fk-43{Ocj^MqD^BxVvTW#L
z```y(A?YfzsC|JBW0-AOf26gi*@UsNScA>r!eYyRf}pxPwnF$B20F29U0g7~mAnVb
zDkv36L9&TQ)3u}lal~*7wTB6HxkPi{?pB{<@-vD&$ff((Bip1Yt{$m0N0{D|wzpcA
zADg(2A)mEL(b{=$=~=c>3D{TI(<*+=1y`R}w&zFXcc&(Bc)N5|%BI}-#~Ax-)6yRp
z`U8I2Lt#h&d~Omdq<>_-(N#~A-L)X^k>lPa?!#z7h7Ggnn<D5}K+vO70MpJYs?_NP
z_@WmwqYt;(dXPNen@^`_EsLTc$7VzW{;2>mw*fOsMt19@KzlSkFyGLckZ-#yiRPi~
z0_3zL5R1~s%?vQ%5vpM$$*j7SCt_CZ$6FY%pFuNjc4zl5N<95|Hsn*VaZ{;1njfpG
z#zS{GWB@*!rtwo<c^`ErrUIXkCArWC!{cR*FqqytbDiGKjCy*naug$%QmE_-(&bhR
zHwv&Gb~_E)vQ0HTboxHr#0BeI%oLd#l1iT8Sw6=bddTXEtxeY=>|>*NXW=ho3Wb*b
z{BQb<<3YFn#I|Uke+e|+)GAwvcOT`WIEp>0nUb);fSj~O6bOZM4z0q40nV>^jTB9@
z*Z~>l-A@qMZiH<gU^An2Qm_{~o%VQkpZ8a!6$TjzcWLm)i00ijvd7MpmwxvA9uA87
zFu|rafP8iP3v{txXUC%=SRgr?vl=3l5EYzEIIBVK))5P@39_r#VscUa{6UnGs@di(
zDkW_8Tdv%b{Ipb-cG~z`qSK`Jy?J>c&l`mptH{T_THapITh4(-ysC33B9(OvLlYBh
z99@!Fvv@NBNXxG7Xt=Oj7%_f78xb(fNY3l7*h%1X>=-U@f5ZqIb$ZpS>H3G6K;V^3
zc~5<a;8Vz1n#QPIot&pSz5Hq|X88_?e8H`RItQeHB{bUu{+Vvnvc&*q-9Mkq(sXe%
zXFwvj-0rQGMu;<PnjD`nsyn_iU)w1qL6E)&=(saRIT;Rp-L;u>?Il^6d`>=e-%bLm
zOw~o{szqm~=LwaG7pkQ9qeirQ(5LtW!E`-0O99J!sPlBzWU8b7P0w~oL<CTdBA^5z
zo!fWwxyDC^k+dn}W-+C-A%75o_%zxNXP@U=W*QUq&V6YH4rWEQASkyzT^Ji=9QSb|
zua6x-`(@s(hR672AI(;i0yy5mbMLc}0cl<!bFRasl_Bzp=3aq^Titu?_@Vr!XCWdM
z0GZyiAUU#Mk6?7{HpDgsnma*13Qic-CG)Xk`M&I<jsE85%Mtls3(m)HfO7T6@f{sY
z|Ijntb}kxV%Dt>gwogg(X#o`8knfmA4EbTgOtYcIq_4?rD^4@S0cI=L{8nq)(hzq^
zFQHkCYv{8~wG#Ig>?wGhz#28SC&bcItou)2=P|i`St+3(5b%xXrW@cj`V~gFl3sau
zO<_5Z-~c<1^Fn3vp0_fzq+%e~3?v~C`{_N1{u*ed_f$yVfLb#{tJ?&?X_0NcR*H=k
zh8l16aw*(<dejF(+VgIpjEEkq2Z_V=Ob|c&p2Gsj@3VZ%Od#?l==vi`z9XsHili9l
zu!dZr>kS?Ed5e!p!;t+G6$U4p4adv`@uNhAs>_K+sMs*HM?6jP<rQ50I*Vojk<X!r
z)S@*Hi#v<p5w7}N55a>6o$0CTGK`Uk{p|$s#L*I>B`+%qfpBZgnU6J>5&94tMGYf#
zQR!^_T!^9Y7yfp%ss|s5{`6UFbJDUfV_gQOI8qz^<>|mGw5B&oX=!>q%jS-ed|@CI
z<Rp5923UT)lq!ACg4bA8)0;jXBF*wHi~G!$z-RdbChaqVH@c;$eDAIg?`L~l(_)yu
z1C41psOhFAr1S*H<X#R5<E7z7lC;q6k&N}1q)z7tk4G{fhQnrnzu{e#V}tO#yMi7&
z(Zraz63tXI#Y#)+jHk$?DXZ#UzbHii#Omrd7Cs|lK+SxODdf4}i`jW`-c!@#Xqat%
zm<udPpF%^m22ni$D5)#za{wrLx(d?&kZG6cpdu3*W*K3;uOv;f^~&T;>(zm3UC{i>
zn=VB~H;c(u#{T`M;O$>NSF3ad8hpAay~D_2r`Y(14?ZL230{N3LAH8wg3XVO9s#pJ
zG}-u4&rA=GR}Ip90HXJq;WnaqN~K4$AxL1EAOzh265aHmBsW*5mt#lg(ncix_Va}x
z)rju**rWu5*{QM0Uq3G&C9c|6W&}w(84j``M+X#QbH{N|GZ{F(rxhgO4X7r$X?>6_
z=c^gbRoomfr>5z?Y<aGawF9|!c{#Z(?S?CmJ+Bmj*td(Aw+s1##h}YQ0I(!ivlTX9
z{V*DMJKTuIT$p1WP?=~$!<LHoqVCSVy9rnS=_PZ!k=NDoK$wx=mx9n8_p}ci1&IzA
z<HJfm)mY?uUn|lL*iL@0>)Vx@1Z3s09q<|~NT9+?k`f}%qQK&?+=2_9B#_HSE92@1
zqzka&bJvMZD-h*GBIRR2aJwEnqB}7qWvYqb#}m=fuqB%U$$nm}Sw@y4-lVC_!b`)a
zRHO~$PuY7=h44RD`2*JxXpBvSq!<@^;D?^pux=p9^;|&g9~s&Bthe#PN~!s#YorY(
z%C5=e?#^N!lm4{bK9XIy9TFF`vACN<H1i*~ZZGp6Gh>Uz9cT_wT#;g7P#e?%hf|;<
zRJ-#{nL=RX+Ys_3r^5x_m<IPf4Gn7Lw=|xQs~Q`%&zp$~z}V)Y5^}>@9#>R~dtib7
z^W4s!`XgX<uA`BG6=tp%-ZALVpSUP~QmiF)<T+4b)Dz~>uA?YE!6rB(HNhakB_$!Z
z(Z#()yo#w(5#eP8b8xJ51w_Ati$cuqk%R@Z$qf8e^NI`5x@zH^_Xswst>Agq*Lr2-
ziS=#Bj~U$LuZ}8z)=l6R)pF-R&DF}D^H2%_=n~s#rtPnJlV9+pvhL|tAXQSx(Kr#j
zbgBaazG+H*1gI8})iR)hTo@=26v3TCgUkqV_#9A*95_{@?n>9P{P315_Os6zK`O1M
z8bWA~sXDl}!nBw$*v6ms0&$%w$H%oI0;Hv8K=wQhCGhb%l1_My5emiViiTG1fh462
z13B}cJ!9JfJF~q6FJG*m%EW6I7)L*ls|o05UpaYh1V!5oaln}GgYb<fp<<B#Trq>V
zS`&bbwz0stT$8cCw9G?dj}5-0;#WTmxL$e#Iob}a?bHF+SSU4twctEnG^2FGdn!PK
z;tKLI`7J3;1U<N{8492j5%!Ywel|XZY-0?OC+O943DTxkkS<_+o49>Qu2mF@kr_?v
zHX$S#`CzCOzooGR#m&t?;E4(AsoYBPvl$j6AGetJ0z;q};JU<gQ)0Z8+EgA9EeXp9
zwF9W`{Iri^U7=q04D&VoMmK_DTmX<`x|%($J^B)3nXUoe^P|dCF+FtF;NC3V0U?;(
zYEOn*XUu^Lg`^xuGF_9WSIRr%@5xZ?zHb$Nfu_@&LMBBY#aap@f;3{3+X4$NLS$<2
zJYnDTur!?)+&VGlPF<qTL(k02%sZUSi?h|*1A#X9<SAW2j-#!E*ObOFxW~dWK+XW%
z?xKunNtj)_M%Kodf1wW6u-LfnN35dhFdLYrz7Rm7Mf)BAiLJcB;2D>s7zuC9u*2S@
zZ&Ksz!X<7#)F-;%$XfAlRh>Y%O)*%?=$R2Y!c7j?LkKoKQv#yP++|Mv5nn8JKTPie
zi+Lo;)f^<%ze)EIP)yI3ivD}VkTX2R4IcfcZ}GrTpynX=q=71JPyjg^JBkpTj$~}I
zGuw^}{;=6=04<i7fjUNVl1l=M6Tq8~p4ax;q+m~p<}fXdF(sFQolqE%bftr0mUbAZ
za=a^uMpkvSrxBg}MnRG6BFZBmOy6QBfp#U|#t6;jgA%#1bTgsJQyo;IdG_S1Z}Wl9
zP-$B{E)hqb^`&hmbowqiDgFRzKiD83z576Mp9>1qHU!eONoC=j2GX7@&j7!RPoJxe
z@(`S*g<pl~DU}Ann2%Z*K-zdQyZF;`8DRZe(<$~as;wi)TaO#MD*AQ-k};M)ieQ`v
zlBKp{6L%VI@)IFWXl6dY;BGE~6#-XD<Q{j8mxi!LgklkH8lau*x{kZFs$yV0!nbS>
zq@f2?>$re-JjwNz!YJZCXcrV04CHjSv<JvyHm8`<)Zc8z(txZe-~)pgDA2j(zgN=r
z73N)3@h~X4+K`cBuZ4C)h=58*A;5@WeL6ki<`TF`rmGq5+{d@{ZC;v5y5`qkv`_A~
zMuMe*-2C>w1G0ZDIoCU_KbK&m<V)KD!P)|8us>=fgH;zl7tV!zl~^iUyC=rxXiM<W
zIyGKi59&$jyed8-L-hdBp>BQ9GzsZ>*uYJCBFIC~NChdkRYew_C0K7J;8zmw%2Fg@
z=lJ3t3tfnIy;^$Ni)2S0-XKWb&k4rdAfXadchBhp_BEQp5+sRw$iwO0?K=Q83a*)l
z^Pnd?&&$keBA~i)e<T=@IVz9*w*H|+E?ZOLx~S|vqtP70@t%@eJ0J}V^h4oB2K>fR
zApCcCHPQ7(_IP`^DwzEU;;tT#jznUf$1@s7BxBy!Dl08hS+jQLCivvpSB35M4GgE3
zHsIPXb{>Kd2FB7i0Q_?|(5NgNRIrnieSuL%d&+=BAg!}ya6|do(*9<^1M}M<cv@hH
z`S1)X3T0rx;JmAlov8G+%yvYP-gmwk>%q5pWQxc+JVHqKDHI};H{9CMJ~(XgS-HKj
zQAz=ql=nOWH7Z947{`a4_Q7dw3r=1&IXU@;;@cxVyf_y;qfuM~Z=d1evNaPKjo~Pn
zx7bBIR0{6m6dG(K4Z)XvX{bYH=C_73Wg%8ZWdgG}2A#7{Bxxn~bY<+0*9{l3gHQs2
zYv|9#<NOrE?E*eGDheftFcJZ!q}$b_vGL_nFdC*8X|6&u3}a+KM#_loxc8zAbzlp9
z$22g$HWRG>Wf-NiLKe}J&S>nyppD8{M<ZNit)<{nG1NLmIQu#XmDx}ip^ibzysu%$
z_8F_PZTY4QVE{kGh>sTyG-VA;YzaeT=6`J5$Y~9?sF=xVpMgT(NGhB!i5dhPG}(1b
z^AX4~K)@9x@PdP!W3uq+#eHm5JI0Z?pcE(t+=Qnt%4Z&l>rCk@iBn6D5z2BaNpqpl
z2$&H@(v%8}wKu<~B4a>W7f0vz4CDJAAVc`VVLJ_c3^ol2cizCYB)pvr4lg)Z;>z!f
zrKl_B_cS<t%`k?Qt}G$uXt<y_U;VBB$MI(kq-cr{-pc$MLd9>k7G5I+4KDM!rh{g@
zRq9I{m5cg@!u9Jxh2l8K=tIer0oM-9P^wOExweNAH1MR#doxSi%-}`R09<f<3+Eci
zl$Ly1&q6Si(Wo`uvxAT{?Zw=1wHg8E6R+MbTS<z>h68CLtEGuMIdD1@k_YhAXAT}=
zj^$>h5mR4r9})E|cx5`hL<|T&`Tp1Ralx`Qcny_D*Gj?eu4EdF8MKQV`hohU)ja{~
z0r?6|bdKqWrCq7;)yTFyl@6Lui^p(?6v^Dgnd$Z?Up-oF?4cBAvYmjEdNKz;r20J%
zQJt5jn9+uy$)h~p(1u5wm&i<Rbfv>37%#omCVVix4-}+8^t5zu|C4f*6302-YA?!G
zm15oJRF_LD<zp+7tVXWc!YIYVUgXf*b!A_=a|SqcKJ6YW3uG%k;^c+_Y=mB*Ok2W#
zF*a0b?+XR6*TMkWY=;>s+IK98Wm7g*Yz<>h1)fkTL_zViqPbvd!yQA=pOIsq2CB||
zJUSPaH?tpx8T;L+^9b1o6ony?N3G{H$DMDvL>q`xp>-bp!~Q~|)6VHWxP|gzveXDv
zh|U0gY(@fVzNIz8^F1i_`Ut_@^$bpXI94qPqvH4FCY;?Z285$D=*5I6lTC`YCN^rJ
zX{L~*u}Tw2zNn3uo--nhZK7xS=r<1d7FnW;p}NqZ;q+v!lgtf6^RfJ~INaq^Fy=`{
zHGZyg?^uaP2WHH+b7HInDuZ6bz~Va5<q!xewQCEZ&WGVQzHuo^3EJB52D>hI5EU?c
z&^3;c?9pUmPQdzlOQr8pkCx1aNe)pOv=mBAAy}TJ1*EQ%O{9*AbLJK4*;t^*NPbmU
zY?2+L2`64koLposDK=lLjG0w;kUDL9$5JD-zM`i$V_{uxy_PhTGoOUaYsE<*%yS>O
z!1PY+3OjmJ+e&3(+_ree!dtJl+3CS&CWxpH4-^k|IuE9Gfo_@ow}6Jk+DjB`p#!mj
z-iVbxh+#P9EKn}K%)w_DiwmN&1t6|Ij<0R*&Wj_ug!+}kR0?g3ap%}f`ih4BmODD2
z%?)9G(HIweO&=n!+}Zvdo5#Biiuj=J<1{HHt{NQnKty(iE>7ff&idbGW{2-2=u28b
zH66-s(#rEf2{<hm&ET`4gC3`-CvPR(9x6G(6_w4-wujR!3X^KbUzhgBRqF<N=@oT~
zP3C^Xs1$l>ce?8vP@W8YcnS(|0**C5l%Ze4NiV3s4O%5|VBu*O99U3S9#R9S{|(n4
z9VcGrh)T`+>&kn45rPsQ;k<6fZrCon*&>!uO3z4~**d++)mlcvlE>|t0|C$bdF>1g
z+f^7pM-4Ks@XQ#0W|?s!Ea+P%y5=X@r+PUZ*CpQwzsTC{iR;g7D34B9ySZT7Vl)?|
zdlfry=S08|2AWO9Um)!*?u(29<yzc}J)P%aYQwrK6=s*+6(T*A!3Up=-y~bU1;ow!
z-Ru_^w>N8te(S6F<Mn%=`&4;@=(3_eSLC6JiT##t03A6I2YQ(LCy$0?d>zrewyaZB
zMbE-4=+yrAeXm!^pZx@s>zY^9TGN*hif2}fz80`S8=eoq?CeqHBItkI!}|pgZ}2-g
zA>U@o_@S-84Yb0n2Pw;{&c>Kf(*+A16RKi!b=>PNII<kW_S2FR*hI0oatE%o({?@5
zU-85|F|9(b*w@WIzTZx}VAUcuo*sycjz9i;U~XCqAxNaJa)_b+!9OPZjm>_OB>!Vj
zzaNU+v9=9)XSz1;uTK0Xvp)qi`NYFc+`o#R-@S0xiTTq%W_!B+*#N%Y_v>gM{RRxB
z+&}I9-{buMp7yQadOUt!6r1wwSpP@J`l~r_-{~gTa=5f|!~eeBe^2G(kKpvZ5h?%b
zNBFm;|JRcuZ?D*J-S8Az|H%J3`hPvLYA@I#Vt69sdBlIX?02qo`^WTu>J|T({yXdI
zAFu!Ob^HGeI8}7dj*X&&ZeGg4uAl_POn}$$H1Ss6#@?TU!mK@%T!&CA^P%V%YtS^H
zO(D(8d_hNm)CO3TClvPxG=%&DT5Y-lizh_N7tuYNJd`qqP^{q{<sM3+Vsf<ed3ed+
zfY#zozjp}c4ycE0UG#kaCkd#hF|0iTqNqVuDuqC(;Mbi<>uJ(*3Eyvo&b`T@joj1e
z?L#5^T4kr9xS5--Z?&iao*+wLRXf(Sqd+%X^9-u1{Mp9(fadq0(?cAh;wKz8lD5;{
zz10fMMk*<8GQvdIe8<VY9kS8=wDwmUU2l=%Pm%s2&}&v{(kXU!HK;ejnyvKtOb6Pl
zJv0e)R&^@+DnWyOhAlxd6s~vp1*rJ-jjK$2$Y5%vEV#I`vb!^usnreYT%c`yzy;*Q
zp!+`}J%g1S@f)bkPrf5EjlT-o|Bc%J2@)kVuxqc3_N8*{%(5;G_+f*`gWHxOl!6p|
z@ig4}^d#2kUik-9EU;Oj5S5zD8g0y&(*zAWJ>`11whO_ynJMUed!qqv=2H&5hUWT+
zWwWr`v?9SqM<a>R6B-d-3z`<m&?c_9+7w>e+<OyVBeG>^QO9R>dWB_hXITc5uEhlU
zAa%!IN2sc2i@_GdmnE(E6_Ep{Bf=~}@hcuAtH*5}FKf51@(u86iU941(XiqPp&||l
z{x|BiY-Mh+B--3rxQYB(vv~!CeICB?N`N<zjs}5&T~$5in;og~B<@RhFrB`O<miOz
zl%iqDL>)4q%7Oev-xgGW#=dQJ_)pt?3$kDB8^3;+oY&-li{FYx($5t-z4#<#K$HT{
zLrHU=A%VZ%ECA)9loYN@Xu>el;Sm+3_Oh0yBTa3BxoYxFij)or^S#fxc`{)(Lf~g0
zXxH}u!ppQ2UZV^IBq;W>m4{N4`U6%LO5zXemJuEP*vavo@ys9EwQ+%b!S=q9v(L|b
zssiQYn?Y!hV1oA6`>^66Gl}4UN?~9yq0@7<uM%$vq2z0+g$w=&I#<I$FAZfWP~}T<
z^o5`l)dZ?rd`=S_NQ&_piL9(7;@f%Q6e7EC7^js^aRB<^UO|NgxeVr?Q|G&yWZLt!
z3*pa!1+dT43nAKqgDz1Z(5nS`t4ut;NY<sH)=!h)k4l^EP)Za9-^_+gV;xMGNT*j(
zriI%_ijT>|vQNO%Xp(W2LUw^E_DPO4k;y&Z+VzN4M>(wG*O?1XlhvOE27Y(Of2ekL
z*x8_7E4`aOpf7mzRmuF$m<p}|I;NYJt7}(SbwkUstTP1%N~6Z8@K8T9+{^@M+<rB#
z;x|1GTWYEva9E|9f2Mz5Hgg<C5#3M)b*5}<lwjhGEkpBqKm(|lPAe{fM|2$@Z3Z5+
zvL{aUI&0PV`QVi{<ToV-w{;nW_MQJ1=lQQ8KDt{)?risP@iL*^S#Yjl_9MSL8}^BQ
z02buS;u2RCn1W8U{4`cS<BY+Y!EI4UCF3s8j{gQA)T>dzl!2r#EEs54Qf${F#2E*2
zFMtxvbVnx`sdRX+jmFycH$56E?Snp}n^n_%75-{4;Em;g1$h4Qg--2)<qKt3kSf(H
zh5cEgu?-$deM~u4#bJZVDMi~xEsP+Vs8BQ4yBQY@2dz7WFcgE_khR(u6j5W+%7)$$
z^D3*kpih5EILkz6%iaXWf!^eiIm~-UKf<xQY};p<)5<sf-M8IrR_v88!TZel=ia|!
z_dEMbjJpnPN@_y&r>&h|f{`)n#yO%l$KY|l8_&CPq13!56pGIZMs@1c9`T8ni`3ov
zE_K55Nu&ecb+mufTiP{W14)z305=JjjKeg2N16q#$biwaonY;eH5E0xx#D5q{uS?<
z1@~jRVXO5?`%HV({suywUI}(JH@_~n`-E9@PQ86_lv&Rmod6pXsH33mxcI_+9i){F
zM0br=P)tX1C#oI!KlO@?5%axEt*b%X@tex-!IXjTVfI(tDJBAL@7VIOWtn$k)rtF7
z=6#;<noLcw8S0dA=gbN0#d|S%vSt{J!~5|2qv9o?(^V=py8)o7r=A<y)Crx}ot-uC
zU=15h2c)3FCo6!|5w3x^dU=cTcb70oUygMQuW4HbAUTxMoo*FAFE<_1Y|t9pR6Ksm
zcHa)dutjh%F4zUYuh0{vWEXN7#=(AiHs6IMc;ce6+n|C~gn0@Y0T|80icKljO6&7L
z^fjvIif8R9pbGb%Oe&kwc1dK_&2QuS&g%XBP`nC&;SaqoaaBi_sez4ecU*zkB&GvU
z%dH0Lyj#nB++^;95E^>s2=JP?Vo;?4*h-ry22JB4P;im47~C_@(I>wB7JwMZ%>Z%Q
z51R@keazfFAjc;crx?yx@Xar9h{>uG^dyv$gCfaPmRPT)px&*8F569L!1BlEw*`GC
zU-^3I(M#|i<5h&Ai;NC_Dgp6_V+keL?JEU}Sj9zHA`L8}FE4=3(G99nt{<3Mu@<}M
zJ4h<$pYN3q<h_9so!-JYP%7|l)#+Ua8mB0Vz5>v)V6vOglmRW;LyCs(v&6L_gK;I-
zf+0z3yDV^Rzx#?tqE&b=k%%5Oj-zXVwj^%2{a?hB`Q>&LxVWLGU%v4Anzrpgwl10F
zw}xK<G))Sc$%`{d(NU{>f5)Nlp_O0{#4XAmH_vRH-cg`^^l<=D(lnBt<@Sw&!e19)
z172oeBgD{ao(tZ}X?H!B-!Z>02--sDPWsvB14YM;l?`|z;ELr@lxQ10h8DWq5O=P~
z6l{e6rh;1RR;2{+|4wvPKDXN-4BurFx^3YiqqCow^;&wG9!d_Y!LG}fB-492M3NQS
z3+!aO4CvqIf@tj`(tDdt=rNp&K|NiL2%0U4EIN1CV(%W~KVSd=Q+R0xOV_d-c<xun
zjLdbd%HTN4P+!#lfTr3NGyI4`_m)4j5yuPwToh`P$Bv#;E~^`3<RTGT<Bvgj0$VdB
zrz<4`$DQKmtVa6I1C!9sh!Ev}<8A0}E1(jd`Q-R?{tf*8-Q%902R{RmBMlx^`)k0T
zzrbb(Nb*l}JO1YHK-lYo)6ZZl0%QLxmi_OQ8=3V3Sg$(*eDHto^ql$S{Q~M@X8qSI
z3p}yvG>{yM9+nmVOMm(<epH>AcW#zue%imvM*n-*Urh%#{eggZ%(-*>cW?V`YHly)
z2`o3RmH9VP7=U#J59jGfR-Vt_`;m-Yta&j>Q0Miv9`RT2_uMhS4iite7XOX)16=XZ
zWpMiQxxOO*zxK=dN9$w(V6(~b>im1BA1?x@+e;=oO#V8~&KorGH1OlB=Av(ValaY)
zPyFH^xBqi@_{Z(Pv&#Q*`|qsZf6)HVVdwwvXm77`B|;W;BkXywY-H1)g9Avglt9eA
z_>g$$8?^4y8mEP!gy^9oP9`(FZg%DUTIBbbdGP*a8?Fnk&paCa@#}Nk8kRe?l~;tJ
z^xYXJ`l*@|?aXbXMQmM4gyXl8<Nvy0K`M}|&f8`r3XgpqONr*<G0&ux3e#I{&F13M
z9K!@L;emDeg0XWPzwb*VRa5h^UXoF0@vT7QtrUqJipUBJGwbC}2jrHww)Y10-5RrC
zM>jaOzT}~E$yMLR|IHHr8TKQWUaz7rt+MQ}`WC<<8EZ<40>Gx7H52#Mp~@6N!>#lE
z*x}N&9$E?3THBox?Bq>L??--*4Lmxs09BRp#*StA?X2R*Rn~JELp?3HnG3@$(&MF?
zL87h@SKr*xq^z{TH*Q_$2foJ_9_h`8@nQZ;<FD&FQg{266Hc1Kms>3{?~_a^6qW7~
z0ZQU_#ST8|Rmr<H#?`H$8h$VJ`tRQB>s4!)iG7AsU+GGs%fo_%Pbm}n<E1UFp+VP-
zF%e$BbK^g8xO2>T%EPg13j0=1)QPOB9ewpF4^~6tNf<3@qqVduPLMb>Y;<#P`lF#E
z*oh|Ls&5wmUw3Ud3W!4glxIl(w+V0i;n5J|@G7lP2h1MyPz?5oc29|a7y{CJ!K-*U
z|0f~e*SHAn*>p{dzCW*oY>3UN`wj#ydTs;mKQw1Py84@Q$1k>Im;dH(O}qQ5W4JuG
zygU;R_iMR>eyN4=OiBu*@T^AsN9>m>>c!Qu^#+|jobObnG>3n04j;DwR(&lzcKVmE
zcvJ9pA%)U9P@3O+C|e++1itYdu*x2LkGZ1nF6{2>gfv_h3DS25k1uJF^>+6R=IRc!
z4NJ1uvG;ubUh&Os#XLcMt;o{*=4>ik$KNwiA__$dZOR>Zbv$e;S#bz@X^kY>bwqsF
zR}`Sad+%!nUrSh`IGjJ2OLhyKOdRu(Bx7n)N-F$0Zr4gpzH7S!z;>Q9Ka9$Mvz_uH
zYHMjun4Ny_NG5FqB|+pzxuJ-8_2~&>c59z?=&*)4TDkwhucOiHyT)HiZ|Pah%tMD+
zSzj^Q@_)VkZ|}Tcw_uF-E>V~s{B`#oFeGm6zDhw3Gq-<ZP%)32%Uc^6x9`j6T=R40
z9^<cV6{<D`byA$}jBjE4*=&_Y@O+EMyse4q4_k-sx;Cx--s@^OHScmWIU!Bo9uJUU
zZys}t!$c{L+5#E?>)4oyF8TAFidq_7t|k?WK9>m6Tfz={A+(=vEHK1Yb}sdDG{}w4
zf2lv5H_CM7?*9JVfOkwv#TDOv%cbhg%4_tHyd_GNm9nBRuORWqz|P@`(LG;wPRF5&
zHM`<&`i4k}uo|j^d8OnNbB3k=Q<BzRIj@SiZQprY4fgZ1nk@}!{WdGBm8q?w%V9od
zy`&aw5$|#gc23Em;u$4^qLZP1&JY8B#-%-Ie2RH%iCe_%b>G!Z*8{@Be0&F}c7o_X
z51iMlNb5kvX~$ga-uKNd`WCeeTp<M3EVVPE_<MPY{avoOZ`^;jC5$_=ZE;<GsdQVp
zke*DYKdq~YW&4z64tLFtO<I~N$yy{x9&>u|s{i)@OvidY6YBdvjD2-nRBP8Z5`rLI
z1~9<Tjf8X#UD6#YNO!l0l1g`t4$|FSigedVcXz{3-*Aqc=l!1doa6hSKZL#Sd)+Ip
z>so8?owXu(mpO?j>@+qV8z%X1q2Ocx7Qs#T8Gjff|1#yf{n~4#qe|NX(+?7T6Q==L
zu4F@xZ@%`&$>Gink~U5KyI=gTR3w3>H+wU0*F4%TbaJk7>vEnyf-hRdyAPyXHQKqC
zeH?8sPSOT_?OZQ4#!`;9TT=albZe_lO<{}BviJXHLkAqV4SBs}XMlG$#77qTDOlyG
z%eWABlKNc^r*F13*L8P5XE5B&#CG<?#s6IH`PDv4fs||IHfU>V*1UMzgWJ({cgNs}
zvT>lPyvn6Gjj-om>+{Fk`YZ{EtGcVtKOg!1Sv)1YR_co?cg@0m+bsL0o0_~?U)5dv
z^E0*;Rd&jq2<|`j$4q^&`Obm<Tg9}(Og-(t6^Wk=@Y_r<2;H~+D;+K)D^4F9)>SzH
z-(V%Kw;4Ir)s>SCk5}T`8*PnxNb|>SRocRRJ>r0G(I!5{b$PC@^b)l|camdy=F`7g
z@m8MT^|6@YxU-@lB6!ocp`w!5etjcv!u2*m^`)@}2RZ{amfVYU7RM$o&%<{Go>e{u
zZPh|wqfVYkR=M_nI(141?AP=`VV2zuPyaQT>EOZaa5++M`YXdU^C7yPbfKW~!#hcH
zs;RorL}A~98e$>$z0W?a9Q*|{T`iXxa;I*yjA<n;9540lj-WcOPVqa()`|bNv;S5`
z65ckC7qGh6?n+&rQapFrUB41O*+CZST5NoJ=8LV5M)BbF)5)Jt&Rg5aU-I9dlzH@u
zzl3xWZPj6j^P$2dj{-iqxeBd9*T9PPLM45~kzI7-%Z_!P9Nfltvm_s1@!vO;Ho`JA
zYo3ayLawd2xHip@kj#@pV`?otN<KF;1)Hw8r@1#`i`DgCyV4xL#U)f064_K;Ac|%+
z@4#1*$SyrU6I~yLRH+40^GSbVAR0m2q(HcbCW3&7^A-V_{O|u6k%@@bEIW+hZzFp5
zdw%#2IJ47PSc1tCFY>L4S_*8zuBjc)pw;mbJ}dvp)WW1Pg)dl)knT;|j%;Z`*?P*t
z&V9A_@3JUIa*nl-es@O%KPMs)1Pwkz(j&~jR(1s3<9KZyO=-bm%e5~wL0GN)to0b6
zyc_9W8h&5Aek0RFbKHv5Yru5HhQOMpnvCmTw)AVG(o*-3jWQP1MZte<T|^jhvQ=m>
zC)lT{RHtHME&!mvZ;+T3bl_b0uyf8uT-`CfQFhr*Sd0Ngr4kN%bIdZ?vU%FY7m_pL
zk?;49HoUW}2p>8*0wZY=It=+QBfxOkCVj*e0_b+y8ys>lX@A<Ye9II5TsT<6U2!d6
zK+m17(1EM3Zb%8CsV44UQz)$f!{(Tk{|{FDLM(zXHjum+^p`2-Vvw)txV42PW&=WW
zpe*lP_j8=9f|R4^tj;sY;@9J5dR1{(vgZs0b?XPq*XKM&{@pchw?&?cjPo{DQ4`7g
zKil_4+ZsTX&jP}V2Jlz1*V9E$a}U1BImJ=cO6tjK^HonL7ACS3sg1v*QyF)C`_rXY
zl}QJ(<PKiHUO4+}t#{7$m!a+sX-x^?p4Qd+XRN<`BSqAY25PYww&rJzohK<eqBpZa
zODZn|_y4a!D{H#FbS%Syglhs%hd|M*F&p_ld>0>Q=q`f{rGI}jY13ODIUE(8y}SLR
zM8ptIUPwlv3fW7)>El2;8h`q0uYFXfM(WxZA02UVr&y*fY@S8yRbuUomN`D}8uQuk
zuL5EIQ+(a|1_E+`B79@wrk{ZS3AB3;1eS8G%4j^y*EdHB-#xFXvCHj<1XWE~g2Eoy
z#20c6PZ2Uq;+l+SWAn<%bT*gil(akL##rS4b3EVOPeca|?u*PUNPPGHGI3CvYf-R^
z=uwk;f5(cC2un2aPdez$hdPDT(_xnBJW2aAEoq$CqU!ghR4AFIY98V5M_2ehf)Vi^
z{B;)p;4wzVaL>0iHgHYhx-*11Rk11=LFyEEw^xro91Y?oA;6CEa+igO<uS8Z+*-E@
zfGEA@<AgLAD5V^b;d-d{o|mtB*f)QiY4Rb|RjK#Ul!KXNnggqH9u*9>wOlERy<K9@
z`I~Hp;CF-z&n+-Lj3eY<YxcvX8OWbBPU%1QOA1s6#`i84w6UHl!oA1meFkg1mlnYz
zNJ**4!z^NZXmGb$C3RPcc3AaR>WOMY=srwy6l;9m1Bx|$Kry-Y%B$Ko7rZH{aO98D
zl$l;%ufC-yAL$nHB(_NE-S4u)BW=X*>t%J3&dYX{caCXXC~_*TD3_x`W)G@c5+#jp
z$vQMO*?oQ<U`DA>tV}q?qH5{j6nEgK360e+ELGUk7LZG}dPS8#N0XJYVm?7PNig_s
zVV6;ONZt8+a_w{`nT3Cj`gTrU$^GA+SONz8#1agRQpoO3Y@K!w;~aI;D##$;U7;#(
z?z+KNqoy+w1i*nFti5R24v_0rpiK#}i#FpDkVps6_M|qJ2~j3lr-UDpo(Azjg5&xO
zUV49?QtsVkM#(%@lS6ApUlK*RyZqgz|I39QRl*C<lgfA^+WWs8RiySoJ)HsZcGQ}t
zBRH;?rP?)~rA9=u2FBXE9RrxM`Z=nX&a(gdrE*$d$rqu%!WlAUdzo@&YfaKAo8sZ_
z1I+2d;1$+0Tg{@NG~MFP@^L?a$#;h=zSic+bn4%%=56yH4m_>yRfq9l-1*v&mQfae
zHh+z;oTf$+P_K<~p|%J&1wH;F5ry2*aujyxpq`|4tC9xKE0qJUBG$ThrTTm3!g<_X
z8l-x{14%^VEA2y&8whQWviOnG5A1bGxtNDXPu6^*wl_ka=*#15-)2*}LWu`|1?pWg
z`mZo&n?k{P%QaQR)`xxP+#I*g{S|x9-DvVVEQs7<tRQ%KhBPEQh*IL{c?BQ}C25U`
zfem;rF;Uk`mE4OZYOc^rG`te@>Y1dmzYa>#END_ckOuZ@^i<!U0ye$n+@)mg262)(
zKYz9n#NxcFYj{5cGgG4?^s`QW;BQ^r1`&8()-dnFY%g?o+FwK80;;-jy`fH@+$t<P
zMK=gqQj_a;G@^@w3uB0r=;DEbv56|F)IsP|bdObXo_vCIwqYuN${pQisc1Y+!WuWf
zlzt@~)#R)YXExsj2>l*rW+>4R_8470Qr0JjDOjlXRX~p_XjZ3S>yfzA@PZ7BuBwra
zZHfhUzQpfb?`^Xk{+v>5#Q!k&ebu>_hvn9lAXa&Ah4St>!G0zt4qiR)<i;=u5@AtU
zwD;DUbHhia!3?^{=>p$XaVyv7V7FPBflS+K@aE@zsUx%fCxhR%7kw5)4X|YKw|I!>
zyKI>+Ike4q5khBPdn0I+8mbbY2Jzb9L~!>`XY!2%hvOIXYKZ@1Mf~Lj#t8_B#m~0h
zeX0A)wnfdMJI>REK|YUR{zvz;MCo}yOq))T#<2S9yu4BnAbc5y<>J=3qHci?>clOK
zQpLNL6^!0%@?C#6NgnLNG;YhMzvm+sOQBE?<|kH4*R#1{4ZRT~Iym^@QU6IxR)+e3
zgiu*P3>u{zS0yha)vaqhh1+26I*Apj@`><n75^2_|MA)H^5lFyeW+Rg!7>4QM)ADT
z?4spP78wHHOzJ6FE;c}4x~nL}kCvSQ?CCZ7F_iW138>gQ^^73_+0tR$6ryUj;n`K)
z+?yJmueAs9cie{;g3=ggtl*jNDP~<QBKV=^NpxQ(C2kzZw_%Hge*#Ydk9sLwDz|CB
zq<Tilj4p`Aa=w&l5MYj9vA;3XsU)$6RLqibs6TRC_~Y4cvw<CMGn&@-W!VLRJ12SM
z;72t6#tUIUnDs2fk^t2C_VWyBu$qfP&S{1-01qYLHsIWF$8_nPdB2J+A*I?d)padr
zkHD6a07_S>77iqC)GnQVA9;=&wU=q6@R$oP<n*(7_he&(vNNI;qm|eQ=kKfLO)7a5
z{rsIvh>#B>?~NN~>a!ma|CJF~Kf57QNge6esE`o~{74lb=et;sWvGdjG>7D^QQ|J+
ztuG<lK=O<$`D(0ZW@_y^&2URWr>xfrEI1(Fj2*S}Bs%Du3!EZVMb#KKZ5Fpf<_z0T
zmq`wZS9Ws0M~ud$w_bX_@n!eUKT>f03<o(X$8iM>4|C1%Tl6E!x{R?jXLn{KYFc_<
zI3oHn_8>nc1T8Y4D`*Y9g$Ea-dL_9|=@th>8egB5Wmr4lK69#ND1Fa`oTAB3&-QIO
zHvtxj@((jnq*GYGhbC^f(8SF^GllT(w6jEw8kXpnNd{TtG&|qmd%X>fPca3?t#LR7
zb`!U3zu*wL_Z=8kvZzpT75+ZfFx66gY6s(xVabdV_VkK^V*=+7v{}EvnbBGH9{w(r
z+09SG+Ys>m?B9Zb4l_93G&x!pU%g{yk7WG<#ndac@qaKbB<XELws&}~BY9uD%u9jp
zK}~alfYB3JmWm2<({G}B2XiJBwe3s#@IW^j7mroVf@T<tD#zmjm=tg`AWaDQn2eU{
z7ZJzzQGdIRM;7pPU{>&iq2ATpwg>eVu0OHlj&yM2St>5R$YRyufeaby+X>7dWj&8@
zS4_UP5Yjm>s<+}i=*^&k9hOeg&Ej4)z>vcoe(b0j^}$~*-6<KwB<Hu@yD?oUj>lD_
z@zm9?_r-tU=UoK-Cqw+<A`xE@O&2(FA7+SE{biIK#PIQo%pb_$s}bJ#H6{s88F+MH
z+5{%G;GUh-*OW%2+73gN!+;CQZFu+WF1J2@6^mwtqe{qzqZ%mFg+c=`+<Kh6<TCn2
zW%Kt~h`bD5S7XPa*bR3jZjKXui{qGp1JrNpsa_sDhcvyNStmfK7oOBY7&H;I5Rq3d
z7rlxWz5K4OVrEb6J>~y}ktWi=Mab`|w^{oOBa4&oz4uJhVN!NLPM--R7+vKPCY%}7
z2+H2n4_%I=VbhL<(-}pN@3L6U8(C!PcM<iE8TWtRAxrslXT?M{5O#I&WVFlVMXhC<
z(}&2ScTSQsFqzC&NlfKxx_p_;v>RQUrZp$f@kw^bsWP}0W$YjCakmKLcejB)&+sMu
z?k5j(nr(aepcX+%-MR|(;Xq<2aZoX^Rz}HTJU9Hl78fPTfm0>sexl(<U)9y8pvo2{
z2_DK1c0+f6{|~n``kMeI$BAaAlshtB#~2yvKdFk>%W@=99tnJDF7~e39ua;rsmo!y
z&Qsq!7S-x_YzT?|!S(LQ5(4rwEjT%Q?6p7Oe24n(YtHeONK|S=kYX_z+Dvw1Ui0P$
zK$~U_fDQ_~I!xl9<%~)hxU{c~>Yq*GA{N8$iT#yd{$WEMY`4Mx6Ghzh&d+(g5yMCG
z2bpQyZXCu4BEOHO>!&@;nRCg0JdTLJjQltjq<!d4DHd%}0^Uf)-kW^A;Uu3i_WHME
zQ{>G9IQc6pB=*4g72nO~I6omX4x|BVm?d&9K_f`-8N`#^u2OB1`l4?|8^0<HkTvK$
zhrUS9C9+3_Ab~qUXOv7U6#dQ||71Zw`CG!3sYic=e^-HqY4#HICwDtEp)B<uOF0MK
zUGlLehsQ9t+!7whYlxZF%tgZiTASMY7NP9_s0x2?v_wjH%*fZMtkK>jZ-16z&9O)@
z;?8+SQ4%+9{Y70XXKed+1i9TaL{+t%&J9z4O)<p^Ih(omZig#Q(C<#=t%wes2{9J1
zV)NZ4!{&7Yw{L<<%wcpk3nQ8c^5*Am#O$DGwcwzM;c@&ePagD`+Wn!Mi_W6H3JOvB
z9DASWkCxY2v?n2o(f=1<^!RpD4{Khm(fqY3+5svZlPTOQAD2&-%g^YhxnA>0HllxP
zf#IXSr~fMBfH0ea7ir?3Y`dq5dtWpCxAcy@_co=i5Ti}||3d9Xdni8Qta%y=0x<RD
zpwN-4@PbFc%h0dMQ|b)5e>jz*q!D^-$aN;Ao~?y?9Z#Ylg!yiLe}6bQZ$<cS9WAx+
zJ`C|MM<ZP0WET%u>l`A+Q^medx^)t@=~M^|$u6cO96{ftU}N*^%vF=px0~!`tjfbu
zh`Bs{JcajrJ#U>3ZZxNQhBf@VMP*6Mg<F)Fmw`}2vGTM4QXPr?i~9Cg7u#Hv?MH(M
zdE|EJh{Qo<PVcK2xlw4gR9quSg14RPR9(Jm{5FIhJ@bRV^=pbQnD0om#PH$xOO9(1
zwC1iexgh*s<?CTXa}i27`H#S9(~aQvn68AQpOsv-HQ?*Mk4o${@8}FyyCQlLhQ*{f
zKt^5pD#JHw?-eUdJ8C{B|C2HQJSgKzkt~}C=YTaF^@%_xWoB=mmaHd1mE%Pk-aGdo
z_b;9v^FV=^^6+$p<S52?7oJ+b(!6#@?i-n-<ALzjm*VBaq$8<%;6n501^O9+!Ljo?
zlPvjd6X_2@$b+diPXZ18IQ9GDMx}UHiW1c?-3u&;{q3o?kba{bIAN?k`oT1)ybn18
z7EMq$ZZ|5+AJKSw3<hT5ZMzC5og0xUCr{QGU&N7BNLIXA22_)o8{Lxp1bm%4?W-BF
zDU!8LLjf-4SuE6hWgiqDs-AnlUJl=S*&Vt6Dtuk7p0ao7b>&TD(M-jTL%elxTiz>D
z?)S)k!<1~}=#k6sT(5Kn6(?Jc3u>n$9@+#xxwFJSBq|)LfkKz$)OTxdFN0xzOCpTt
z#HzxjpUeWUvaAht<wnfFtll1k%6dPHP2~m*g7{3*%~JsIdcO#dBj<Y5xVy`)iGLp6
z1NjH61xv855yel;6$@G{CYjAoffK&#HGFXCz>1!*$%byNh^oGhTKkbv6zW3gbyT<H
zlsc*8y%(45*W8TK_v|%D3n$gKf9Pd$t$x<J{AHST<)nI@VnO@wwh=)Pw8=e>s>qYC
zF_}d$sC%$m>HQ_d^!*?>`GU5IoX?C*`&u<kwpe^b9gvbRwb5$QAt5nV6)d^ANu*@x
zws2xLNR5Mnd>2AEJdw?S5yndO{UFUU_f|&ysoI=cYIccvVWOK6(v=fK?Mqw+JCMo7
zt1i5jTjig&^{{r_0r^G^3p+YTs^;sZ*JEdpW*3IF8a^I!0LbWo=}Xy}blnpy6-$7W
zo>E0&Dl~qW`dgLw?;t5N%(+GzpR0N{gq8r+U_Hyy$55UiJr0sNsWeU7u9q#h3uGXu
zYTeZm<&>1Yxc&jcRW(BYk+bXi)jAj<KhougedC{T<)@0{mhXgFVK!6jW%RVF%Vi!Z
ztx^3L1KKg^tvvf$3Inn^O~~MBROcpD5cZSk@pDk#Q;Uz-Bp9-;y$JzuXk=G>T@J5P
z88qFvf)z*9qo(lL-YjnfBHC!TR;0fS_g^6PPb2}9B&T_@UOX?tC|nx566K?LOoggb
zkz18(0gz`?8=L{vq-4@|-4lE-k=D5yS<trT+V4KggjEgET_O#m(=@Ey>vBe#KzVWu
zgy%XRyB8>bCe=slh?_ldc@M{$b=H*6I#=>Ggu~fP7C>-~B)w5Jf(8anyx9zshV&2W
zyOdKTiAI}LrVqzmgT2PjDKzo@jA)M8Oh?1neaKdx9>+519^iKiOXEDE^vjb;V3KQX
z63#7}1zRtBq9{p9OUWkT1G_%CxJT(y-bL$waME2E+-{#O1;sdPJw_n*vlh=qC}Xfr
zO!9eKY?jZr^doh-E{$`MyyWhkY-$Fa=k(fbV5PM!US5V0L(dJ*_P7RkRE_E)ci*K#
z5`3+i1Kz>>TH@Z4(e|_{Etl9{4G=!@A%RcST408fGCpfNqg$7-RJ%&PP^Ndpa8n@J
zs1Qz`iDYu`>H*S;PU`nfR7U5dg#Fj|uRKmLYBRv8GbMW-sf<D2p5;rQ4CAQd@L(dQ
zn-m`3Q_oOs&kI_r`T4_UxU;)6=JIO%#oEJgYN_V~v77|JcIxnOynoGreqV@ri~vO4
z%)VhOdV^0bQGG768CiraF;I2Z6HTg^4^<z`q&ymD=uXLEv1{L6J<C|kE#6Z^`L^qX
z`jhMqCC$^mU_7Bh6f>a6;xpbm?|t@sO+dk^u3B-`p{$>|PHyEXmK6i~W+P&9lpnP)
z-F1af*S6+Q;UuCKrTg@P>D(dQ3TjzT)aAeKaB3RYMF4Bn6$p86`LL<VA5jQfZ@%BH
ziW}nd<TLueZ|5Vi8KLcvWzt&2_3VI><vS%>W8h)Br5hEayZ5%I4KV3*lg35C8;IN%
zHv9{l@6A%KzMf;qjY!R@?<p#CnB8V}PT4KAKX1b3h0L%$DcQKOSWcrE7s4dvR6=G8
ztjzLLQ#Hn7^|9|0d{s^vLL@=5o(7xg_o+QP&6UBqk}q`W)oUC+((2V2wt9DZ@TWBL
z$$^)f3Jty^wOgoteM(1HV5pNf;m=TVXT@Kj<G+MSzmG&gby(vz!7h2bEHEM^-bb43
z<KyiFtZ)&GCGJN5>;=W(`Hs(ixHIT{h0osK7?Cuym^HF58_sC@=!CJb{=BR-tKZeg
z+^aY<h4nY%ZE)`7Iqnpj=Wh|5*$tQfaHgB~UT3~dUe?0mC~4US(EDhdA>`kb<aMAL
zM=lgIw=W|MGnO;6tHcirIK(SX(|r{Ky<77?@HkM#rW7S7MJ`k!PGlk_$*yiE8Df`k
zY9`URKK)In#cv*HCvw?y9ifz^-jHDlcqmk~e^T_uyCzOp3Fq}O58)IRm#GFE8kmj&
zq<(SPud&UG9tCFXpj1Nk4Mw94G5g9;G)T?~URM<Djz>+k)`iP#!tsMAuY(Ra!v#kB
z-{i$LO8dxu+f9^OR-R#zw4c@#_Sjuuugjc`EGlHJ`)0xqam<kpblxo}dFaOhjId4*
zZf?Ep1)B-$|1`2-CwwU^^zTZDag<1?N3T^W$a!OMA<z`{v_8Tn&A$B&*}AM@Ue+d@
z|Hxp&sZvI}vk2?2gk!2Umu*vGFLzWIU;(}5v7Y;wd{DcBdE8vki@cFz{N4|gxj{q)
zXR3k$c^GjYODR$(SZXpYPCUjLf*KdEHA&}PNLgrLNnLhu1yWC(wcks;%CYXVQ~Vrm
zo!SfLG4Z(`bon%Dc@1HGE!$KJ;ww&(-!KPrR-EV7g$gv8Bzh4E4lmj__%$<J_~pRM
zedA(ISXYT>mhn2_T<f=_|EQj2?bp?eq*2m_;4oZbWMKlN_m!;nCqY6n&?CHVVr6Xv
z=2uvw6<&oacNE?&wA_XUS@Q9(G^{fUHCybmtqIgi2U<jvzB5+ZZ2xhXIew7xlTbi~
z2zsgr?~jnPmkxlEjJy#Z;UNzx8->tiTq716B(9U(bpNI|MedJsk_R5ZuD-E;rFo@$
zdLP{32HPApOcW;|a(@$p1y@~;@ZcQJLm&I-7w=&Eir5*Mk){W$w{d9+Y%JUI=!*5R
zF~|Bo)UthFhr`b_I}8nCOVnronm}qVj}qPSag0TrO@@oq5jLZ*WFX<7K`xZNw~VS#
z&sNu;d*Rr%KOvL-KwVfQTNcBw`{+Daq(v`sTa2Yb&ROD<&qJPxFW3ovLK~`8&A-7%
zLlWd-lhYdAs#rsdTumFMT`iRRerNfP!IFv3ob_v+$gUn0!-M?3{D**FXaSyvsd?VY
z;EhfK8T#NXUo$mbto}g5^OGAb(C~Q1*5O=_Vd&*Feh^nnm}8`|31FG1V%qr;gq8WM
z(PFN&x8kWv4raSLK~gbQ(oSLd_`3-5h%v;M-K?F|gI@`$X)NB=)wYb}y5Q=C|8DJ4
z<X<Nhl7U^L;ntqn2^Y)RltO|PzS;{Ujx%Ir#Uec=r@`<sgwBVzDngsH80#+;!JqN7
z8wsP-k=Cfr^s9HS-rXrVG3h)*21^|gpNHexx)A3LV=fo#c`ZIt)&5M)qNF*q6IG?A
zsXCkcw&~2m3a6LeMi_X=&qU!optdOo4f-@|f@2;uAocHvnr`xPOn^I<&UoXvMF_D-
z`h|%Z=Aiq=A@ym|q6>@WN(=MrTMAkdk45?`6Q$HP%Pqj$74EWS)E5^iLrf~2thiKz
zsbNTo)g-xUpAVS=<s%LDWw-*c`AV+Eo~k1WdpNWf>W+*IKYImj_IwwI!{ZJok0Zu4
zbKBe+){dkhbdvT8+|{HO{}Yz{Z?MASh^X+U!J`w;WUxClZMfx%&c(Iw;svR4S|U|9
z%$&F$b>wR_+%1OIEk3jU@(T()H>bfT!}<Q$F0Z|!dRAb!A}CSIwxXtjLu)}(_)Rc8
zV|K0R%WEU8wfE+AD}*f(=kiU$a_}R`BV-V%VI;(npx4HYM}9#Ks;bVRpZFmK7FAmj
z4VEPj|GsMd`e}0_K~T+`(XYWL)xKa@e>cA1;QocGXUujVYwaEdq->jZ+i>F659DwC
zV8vOy=$9X4QSD{^WadX1YzwjO3jc~9^JaaWK?!vfYZ_+t)Y<1NxG2O^!jC+u(gxDC
zv$!`8v3r%<f^9)o5;{pCN)AMMsD#N7L{o$_GApg+!|)9r_y3t<`8)fHz|gkR0kyuw
zpzV$3%ZAg86Gkd=hD+--hZnv+(^sMHU81qfrPdbwf49^Z=3x9@2Df4Q55%ENg+wqt
z!a;b0xt50ETbQ`&n%s`~E6#Wh4fzt@u&@ks3|c-wf^#MP$V+D{t~3$&T4@+xUhckp
zF@cktaq?OAH=Rj%$_-13U)GTs5oyZvIOv)aC$7`(7UB-4gBnoKpgf09=iJWih_jN^
zV*YsuJdgdnGt*uLm~BItAq8Me)6R94_{j`QbbO^AnqZ-#oyzmLwMuuWav`u*S5dVl
zB!Zlc37IBOWq1}%OLt4Ra{A!8*~4(ttbiZ%i&TGSyuT33V_F<lsZrb(RjjTXEf{M?
z%XIi450?+$AkXQ8b5nSKBzhhWu)6jzf_8p&2=7zwCFthqK2(My;j0**2G!s>+#&+y
z#M*>X<P6`9wA}6fWoynZT>TZ6h(Y-^S6DYB(45?kh0r`rExIpfNhQN2+xvl%NXKT?
zfNtsc%-`yXS%yYM*M3?I!F<_;mm&aFtSHP;DW1Jt^=W+v+mtL40_Ef%+mQiT@CXW6
zb^f9Ki#6aON?C<|d<vOmEzn@*ezd&vV^}>6t56EAUa}p*HQFO#yfJvAi=A+Oz>t^(
z_kUqjHawJxDaD3qT<Y#fv$u``f0~R6B`l$0!`y&J-dF1=xt|7_8E1WvNw}Lwy)m-d
ziD7LaIQEbg9vPTLo55wj)I}27V%f%QT`H-uZWmF#6UAF9bp4|l-Vu*4@Pxj4jEKV%
zjTjxvrhVmPbmfqB_WbGJc9l|U-!9r$QqKy1wYJEk%+*w?B@8f&ztGI6!fl{mjq6L3
zQ<DQ59_2KE_?Mqs?ik7vK&>I_a9;d}0~moGIMqsy&jp<%q2^tGx)Hjcp+~=zAwa2-
z>=xdjj$H8#Hcx{?YD4?}@J<wcn?3DBlSy}R+(X={B-qhOP39>$rOmIfDhmlhztCld
z4>*ZDc_7a%W;{NJWj$JF5=dpC^AkQPWfs#BNg?{T^G~0pMB3@z@HTgi3_aE3&iPc?
zW|#APJt+s&>!P|2Qz1(fCu0x18j(qgJAAD<@%_U*xJn^s<jtwZPHp_!+pF_Z6JI$e
z?p}9KXQ+$Y`#;a*p{ifQj%bA&jS{cfla(>S7>_6lVnXd%nkTy}v0eo|Hw^-xH>}`^
z;BgwF&j1G1U114G6d^3ADu~K{d5ufFTI$+0%*)-%XsfI$5w@crY671=BSl76XhhS7
zgy!}u<0ej|YkAA#Lv1-H^#W!8PAh3PLUVeN%99-wHh=*Ok2<B}(+ga|qZdTJ<*6X%
z1NEv%Pp1xnS3a5UUm5ll_N%x+MVrF<KE&Z#%Uki{zV7^kLMCsn+gZ;%YFG2pYHqCk
zG-=(roli<onwGAratl_Rg@o?TlLv!y!?b4?Q#swurf?g$$KbxRI3h*dmjmqjg6GIy
zCG|x6mKte8pPK%u!hhEn*>@cw8BEWx1chP=jh_}(OD;Wv2&AT>wW8Yu!{Nc4JcfK4
ze5jyy&cKSTB3H2b$^X_x{q8*|uk*GDC(l(44}n7|my6M)CWBVhfx)bJp9Cq{iS@>c
zHA`$9>sT<RHzX!2w=8c-)&}58S~`t4Wc0uZwfciznbm@KfHTr4FIi4kxL!suR;^(d
zKpR3T7TCy{jFjIB?zuE#ALKq99qKQjo?!KTc6w3e_%_zteR6^bl3=MJ#XDe8tg3el
zd2zRKZyK?~sPEndP;+4T=3u5yYU0V(qLJ^fV#+~3kh5`2(GE{?04A-jqR@9Lh1F^b
zOHFmJmjzcAxw8q!uwDJu?1V*A0KcMz>OnoZ0seM~88phjH5w=LYfuQC@aP;WMiI0I
zj<FE@9^xeA%Q_RC9tNZITl(C@MUAn#=IT>vB*)19&3N1qo|z;6u}k|sK8sIR{&SHe
zOO8K$=dcNMN5;CKJ}sZPZk#d>K=ZZnwV^KFFSe390HC2$bM`7>>8AYaeg8{8V6O|{
zIY?sV%8A$(UuB$fQl%}6=z`BWatk=~RTC4HGni;{i)T;Xq0^K&{f)(fhr#Y84AfxA
znr!VT{;DRorqR@U%62#2+Zzho*tsI$iaqqPmUo#^jOo9Srs@pWbP!1~oIB7ez1+HP
zjgl5<{2mz8S;X}7o8eD2&4T1_S5Go%6HA@FNS!MWfoq4ySj;oY9ucig$&+qN%=_8i
z2};IH>K&e{$uEC(hu~H6WbFpBo4<q$%qeO75*}If?U|hEDqgB-BipJaU{RIxC9eHr
z_uRE3a`>pAB@Rx>uZbs-TC{w<V#()>JD;%2p5-)noaW1P&9Xg~L!r$7vz;m}?DuE$
z@M{!m(deaQ5v8QhMvL)8iE-1q(`D}>ufTHl+|y~Ba<u(tzry7|qYNLd{4_SVl4)>_
zB-Fc5_3S^7BBni4+|^q0djAk=ucn~6fRLoBARzJb-0#B8g8x(;Mz*^YaW%UEKDch5
z7+)~>v8(#Snp5rw`5<M(Uh&i=71Bq#;KQxOnC1WqJl;sks;%#uMw*s7HPDlCzkGd0
zn>a=TCee%|(_O%RgFksr80Xt|K}%KJ-72>5C1Z8nNH{cCO!@LN0QV;MgGnW0vqhJM
zL1i?kW+f*|turK)6rjGBXfG&i;Uf^2bi5LET<@eM*s$$7^F@=74xm$<ACN6F%Vrxu
z(G|6tdsq$nC3V7CZs1QT&lJG=q3eRok0S8e2kU+z9IyL}t+W=x4e#i56>*67U8I4!
zaKa9}fGuwQ0pyGl(CS%SCa|Q+O*UMhFW)Iy#5Ia8!#upV;*{)O-wt4yQ$(aN97W|G
znvCE!Nb*}r0R;2P`yT+_=B#Nv8I=;$_%GRxN8Rtyj|%LLY(h_~EXvx}R8!!N+G)V{
zeTC+IU6D>8M<R6k4a?I}N3{%mpsu>0U~G2p)|XTgyXb0+C8YP&VLQv<XoV!>HHvGo
z2=q~@l|$n9p!(X^N_&cYPU7bjHcjC`x9jL+(x~vBn{gt@D_%Kod50VuO~N>&QoX_+
zjG12ZJQ>AAH;_P48X?IGDqmKO0xCZECsvvUnt?BHl?qN3>lJnTFJ|r4K8~6m9xq4c
z#ho|jD!W7-`KgoGWpp$7vRQe0vss<bDm{DbLm7qP9lzS4w8<*%;xLL~@NmBg)&5eN
zoPAhhnlJS=Y{Zw)R|;RDIO)1#SO4X&9@bpX11F{9Nn`E@04=YPdM2ne%hgE3CEw@K
z9lOakSJz#?Ih&Ol#@r%<A4Qf&)0;w1rXF$u;S)^X`iQeI{3r<B2F`lKk!%kW*LI0v
z@-fLBw6n3$)P9g}vC!$(X$T%T*d|H{KNbzV9;3>CE??dS>$yidW_r^$Yzle$wI8$_
zB?N|@?-k;iXZJC>3&h|&3PTjygGlTrGOt~1bz!g$YDcY6dRK<p1;cT-aawbX8@-8r
z6?|w6hTiPq)uLlja*4TEiikf3a@|~Kh?o1TjQ+xxw2T8QO*0um8hKBk8$o2u2I9R*
zNQ{|d;|b)o?G3uiG7u#+^8+S|R2#n?Z1m*iJR2Y&H0$9Odijv0WtxHFhz+*JuF`$b
zc{9a#VeEr`napZ((#G(`3OD0rMtH&p2FfZJDwLbMM<(p>%)L?0=IePLG+XAuT96{i
zIlTo{vY%5KX?3zZ@b3}|8-Iz-ngf<D2m8q1^a6~@h8V?!!QB9@A^im)wcZ4FttXYT
zR#?=z9+r!Ry=?+>{qT2-3U66jNjrus(%GWj7^xMql|uzLg<YYI+?|7mL=;aG0s7xn
z33Bs8wj6eX`U*Q(7B2Q6(gYYR)6^d{dlnHGIb>qiL2qN}%LEo;o+zzCVJGDgfVwCd
zA@@}iP?++Eur-p5RzrHoG|ZEuX|&xYJn5K_wG`-C<P$RaiJfB^RUUv@@=GhVdxFB}
z*0pr?4Hc_tY3#tBr@8AKf$^H3>r`S=kjoB3)9hL%vwkUh?pc1B{!SH~>%hKk6csMI
z(0mnRsTQ=5j1!@oNtGNkU$mmI^FZ-2@}hxt*9e`GSo#+9gBIWS3vT+SQ6r_Tu<j+U
z2{*2CG_lKn{akR?7I=_&(`2~9(X=nSG<VXOL|o8ce`V@T<KZ3J-#IZ-N*5VKW;pMt
zgrH^7y$F4H>`v<I#3X9u_F#+MnJ+U)$nX+eo2JBUv!Fj~vE`hEVd|~&$sPm;x5_M$
z@T%E^|I>7X{!LJ0ty7|PgaGwB9)?ARV}aWlz;|m!3$8U@;+)Z$2`M9Q+Q;gRy@e{J
zp|W7lxc!H};JM!sar5lqQIq5v1@e&oT#$97{ddC-PF9srau0_Mj3wc4KCAGF>J{31
z<|`o%k-#7!#$CJf5F|4urBoKAC)nRd?I#N-&+7Tsi>>S$t&BNXP~rpRT&)_G`Y-S{
zpGIxg#p7>h50V<JR|yN$`PuGKNfPDCTN%!uKD})Qt=G4ZXFU3_BDG69k&7Ou#~frZ
zDDj3baLbB;1!5i|oG>!T`vHkZL&$lAQwysjck=m-GK*~#>ZT_1sxNA8L+yFvy{*r6
zGN9GrH9bItvuS7Gxe!PE!S{gE*9X03&-bpJay`6*CIK$$JJ0LA{V%<90OPrTXssk?
zJ|0!__d(0umgu}YA4|8-7sPf<t38p6*p_RQcdWZvSlLT8I}Xqc7q&O+Brnk?1qN3=
znuKkXSQsElbkjGDyAdACvjnt49g4_$_Wj{<xcoYwUY@G#?{fImCrK%8V$Jat;@*v_
zWHEexRT^Doo!JW`>;=9}iVp*H%32ZYwu3kN{HypieLUugh1yh^>aaIF_3Y}56>4t;
zl}_-N=Wc7c9Z3l>8WpmJE;v|d&j);-EEPQB?^t?I%TRlcW3Y*7zs6KJy4#zkwq(i_
zvcW8bU!cF}s8m<S;I(3#U9eYU>b+|^nbI!+QK#I09*K3<Gy7SF3WD7@Yl^kRaFDnL
zjTOQl9le+uwi(42I(%xg?zo$N#DqQSeoTVfPtN9r;`7-@LF)V{&B85M$7I7jE-u^^
z_xnva)1{*_-JcMmL%>g`O7nJXwrpN%rFijqLan;Pnu#nr@rR0=xkbi2;aBKPgU&n8
z;(wUzzJLC<L8ADVZqgxe?^3Y8-ZFqp9FCVs*aFpi@B~-J5{Sf)CKE@y;tS*^dUB9o
z==1MB>sr`u`YKF?9)ta2zQ|X*>*q+OYmjval3Xj_alK;90ZOCN<Qvx!tHq0S5-s(4
zq>NX!3guJa#;kD}0m<+FmlC?_<X6%%MI8(0&wJ({V`~ug)rIvJX@f@0#QozAR$Tk6
z%LOPM>uauMWYlOX`+e(M`tt`U!ySb0Cw-&rU-C^xD(NAmE?eD~?uXf&E3YCYqMLnO
zXb8fBemVEo@DLSd#Pie8S4A7D&FI$^N?V~v*0irvm9LQ<G2|NzpE_zRojhxVJxZdm
ziS^m|fyf_o3LU!I^{kt=bYTo59zKlA&AGf75JY)td&0gVNc|k#4g9c%vhx(;${hD6
znrMBXMJ6k9sGZjbLLj`E&P-Ttnw`F<9(>%#>N?sE3bkHYQF!IA)4r?>Fyo4rQ!l%e
zxbf)W%1b|W{6U^!Jm;L2kWJf9+Z7KjOs-5aE3X|>biL$*>Scd;a@GQ0%XxD@GBfJ!
zK#L#U7ffVCWStIE$OjB~y6ZdjT6sQq52%L8iI6vTF7Gs+0k4$12Fo0S;qm7^;{3D@
zN~?{H@L#`4_ukP42VpD<Kjiod&y3OiLV{3J2;d3$99x=l|5JRs7flk4w_+uY$1wal
zhw|XysC#<PBHhdmKpW;@FYb}mSOFGUw1-lDlG99A7$g0hUYaBHjrZ0jFzv_rdv)1$
z!-z;8$wXUKuK&p^qfxk%tP92Qyd7Kmk6-rs%|>OR&BF<3Qa`}!5yZCV40%?7rZ$8H
zPg$WIZw7%Zy&3;c(kjx1KDX=MX29n}f+$W?tiMZ~8kendImjsoF<rHZu}<3M*Jw3k
zL&{)zj&BundQ;(ix`Eho`q-oplU}xtAX-Wuc)FQgsLZMjFj8l(Ln?UF*nFzVKSj}S
z^QC;##VC-EPAA3{fT~gRZo{&RcR-dvTiAInPyNHbrF&jwS8*4iUvie(($9Pl!nE@y
zi|hE<tY+=R%A-bBEBIKp=uzH6mUY=8wxmKzx-A{sr%`Q)%pO{=N&525qD%U5(m3Op
zkW&_)&S1wqMI?pb$V`WGq!;!Io#m9*2xEoV@K1{gdy(R+qG35nHB3k>C!9;@2BB|x
zYEG%hsjbbJ+7N?yRXXI<irP$#*j29^0T9`Xh-Fq6_}7QL*jOJ~yt{&ZM7@qHO0R8G
zTs-+xe~6$#tYt*VRd32jp}<P^iOr%|5!v3!3D;fwlvp$df1kZo;;ddL;k2I$3Ay@U
zXQ=%&AziGN;VeFn1PKmlnP7M4TLz!)7r5ZQr)l#984DZs*f3}wG)7@Do|D8Z7OnV6
zMt4s|6jBo}xmS$OYHCkR)9UV}d<iP+TLh*{{<qEaXjEAn7t#GN`J)LUuu^@x-+iO2
zZA5VJwR>>?WoVN0rGZOce=8svmf}&rG~$yHJc9q|I1(KkOlC0e*zXe9X_|(|KDnBP
zTPIWi_iM0;&}sSZ71QJ5*oSknW48IniNbS)LbOmk%>N`nvdk=#viaG~arLqIhT-ZE
zG5k2tP0f~LZcuk5HwprUs1fo5fn7MkeYa*vd?m}p-~-eH410UOeU>Yr8Bjef+<K$-
zX<lW`Q5}?Z6LbIlofO;r0L4!H^gYN7!4H=<;_{nhhFc#e<AS~4%)s`_(&MOIppCY^
zjf0!8C6N`bbr>D5gvS6GE8UYUpkAjTvOP-Du5d0p4U_qr$|<UU?+RCqwqtWxekSqR
z*=@3K&i&E&vXWpQ^_>8s_}2B8%`tjF=lhjFUZb2?2lz+QybPSrB~){R$NE{mu5-A(
z&)Z-(S@!ZF9i@owJw{T9>|%>tip*Wnlh@XxK`h|xkNI5mVjMo$V9a|R!AWhcGN~R&
zsq?ICh-@Av9ves+c#z6AUFlZ3#tAAj&hzZr?@(U}UND5cWg66)-5YrDTzL6@XC%|X
zXvJoxE>?Nzu47g2;k}p8ji;QuW0Gp`p;@|J2~E4xfGOMbqMpX05aDV~T}Hz?W_6D4
zkGc*lz61Iejk<5n90OmR5`^V2|C$88uS!l+n<;6dQM9g5NLw6&l>daA=<sIp<QUc)
zwabV>W}a)p7Vh=tlyGPsf_5X=Ri+svb1c-nk<==!p*O_Yau1pt!?Jwfl=bo4)2Eg~
zvg0}(xFleog!4$vPb$|-5wkui-HWEyIiMc|PT4%{1i;3*gift!@x?V!Pz|Vc1K>H&
zlhKRD!wBwxy_K|HD+qa@l^kUJ#5MO#N2NJ+2Vrp1qN4+S^<~6CVpEF~xZewjJTl0q
zX3CW8F;k>RB_Q{3gO@T*)adL%IWO6~ByNGuIpu5*notwJWsznLo}Xrm7<|+y2ZL(1
zn7ej$yKJJ#K3NQE_qf8e+i^5!_f)N_-av2XERM(f%{VE`EZ!A#t9_j={;pNwt9RgN
z{}bmbIwc*QuxjXX@xR!{T!P4ybB9Ye;KksZtB@50GWFL6OA}apR9JKnO2@#2UXwrc
zonuo5YAJFZMguPEso<`m)sIbvdu?5*mFan^6X8Mp18P?2@;`B>YiyEEP6=mhi*R|{
zeC=YHFTLCAwlNvPp}wj&DVNgqio5KoaM_lB6WdIf@*7l4^2}v60`Oc2XH(3xGDN!Y
z(bo}*(?cq=;AV#NcROUAAx%HKfK-IElF|+ep<($8{`Sc`<w1?9Ua-FI>Ia+olV{ht
zo%9R|{Q#p{E6o>uH#T}2SC6`{25{Bqf}SX7eKZiYA|iT?98aKv617RMv%Q%tTSGM~
zB#RQz4UvX=gcKeg_tH=vJa<G=2o>L~mZ$lyWOXW5x@F+yiJSY-91o;9VrX`WX0n^e
zC3C*%vG^^I$(J=A-6;)o-$Sv~)0d@$V9zeLrUQgI*c9AU{6X)!&O;s$31S_q&zvMg
z$yCSSt`4G{;JL7JjVFge&V@G_EY7_vd_)B#Z+qrLfvX3*iRO0Br~4tQ^1N!LOHqM5
z10go>+do#KpqtTk*?g5I7SYDU;#99X-tf9R<L0rE>~$lxoY#EGQ^i1fCipa6Q_`_X
zui-<eVaX};J`h48OlVxk$L7mw2))`crM`)uOcnQCxla0Z@H`UtrUe~hr)(_sm5d0(
zjh69%TI<j_o7{a>y4_sg&6%Codgqm|_ql_q&Pp;i&1r-ronIbn2?>+YJI)+mo(rIA
zX#?z1S-aM2`Gh~%3w`Qq#TN&2sN7CnyN?g7JK<Jq!xQu=2At^>9VM<QZyx5H9xNZj
z!^YRUreCm|*2Bd+v0t!Y0zbIv!LrxR<HTIoR%?gQ&5ZuejJeW?*>6M+$IF^9`SLFl
zranme+m0J9;?4#+Qf*Jr9s>MklPyLq&L#DGl?h#QR$~o&#V;@*x<rQQ4ut0;tIrAX
zx!m9*O)x?pBC`x=ZdQ#6P|(r}X%<cMX|6$>7jW>$8<*>Es0N?l^2F16j0%k#3wJCx
z!Is!ZGe$wfy=p8DRHtRd+O6k5h0V`WEjQA!F%QG5d8#e~7r^E>q8;r$YJ~GH=M3#|
zK=VHzHzQ}GL?-dY6g?nDPgS*L0GxfhrxAwab{XSzDXLyJGxFm5G5aug+3`IOPXuh%
zy$(T=iv1BCi1N<HfZBEQ29tY#sZ#z)_wtE8-{4ZYfK!L!ViTxjBEPD@>%CN-+j~ZX
znTCUi5E%neF9X_0nxFQ4O@IV-7o_DUSUow(`QAo~Y7UR$R?s`)pDdSnHv68*w?y1m
zHf;X*M(<qCzIETbB}8XcYGpe$3JCP}o^;r3LUU-kN?<4kY06?D4-q81D%9q(_pX~^
zcGbC_d$Fy-Pija!0ePOV6PZu6=TT<4a)O$}xPMXhT4ujYy**A5CL?W<JQcjP(U+Sh
z07eS6a_S`TNk3kBhsYsua9q@9AlP@%1U}gNCg2NjVYt~l&KjSlCT^#0GN2i2^(I%N
ztl1eN=9=!m5;PjUs?S@i?am9=b^Z1MuLVfP=qp&4iB(VE=ciHc#Z9<T{I&2i@W(R0
z{OwddePiK~@;XvS+6a7XQ~YQ#<I-nj|2X<mLadjtVMsSDtJO*1^}Dm2gJnJEyhf?R
zE2i_=eNSZN25*b`^won79UF<YRSK*Qm1EcdnXC-1vJpo6tt7s?(Ms@6pmcOQM5=rs
z<B-KUc;yf;{2f4jVf-x_*0?0`VjPzTV+B9NVj_2fnYDi^chdcelU>>55Y|70s^7Ro
zDE+{X#IdBd2ahnhZVeb!1Sm+kcmd!(J_t?>-)lOD<9e@N-UxLANql$yKQtepM)STQ
z{kowm8JCpRD`V+{qGUe(bSt|{*B4_o{(AkY_qsG|u77r@A6Nr5mm_aDvl$cNR?}?U
zc25$(+|LW#tE%A;5L4GW2nwQk1#GmXNuk5<masLPRfi9+<N#Rl?rBi_bcv+#vEiD<
zzGhX9kKa3aXZZOkOSB9P`1AYb0r=3j<yyY|*OyeqW?%XRqDI8PsJpFUQT>;JWu2>O
zS83{Nj{OJ0X`cv%vMMLGy>=cn^9{_w8K!b@tF0Sejp-Hq|CkSIvPO42;OasktllH+
zqbbfKA;dt5=#Pw$z!eR1U!<sLft?S=uLylPoaZZOp!czHXQv;+pXuD=_FH;74t%5F
z`gs(P$8_TjpXRNwVL;XC{CUt9*LuG@+<L4cMQh70@W`hzy;}cTe~@^GI?1R>5xaxK
z6{+Og)ZFC_QsA;O@#4YY0vAL=RdzzonXhdVFmVFev(*#M$sDx#(Mw|b|0!$_@%^wo
z2X<VAnl0d?1P*gyxr!f+M~%lc8`P2>nVY(&-WYZ5DM3-yy4O#3Ojs0SlGX?-SmIA@
z4Zk?o>lYGW#vfb+m%MVBZr^k>rm&ArSa)NswS1q@d_|9?I`0ywmLBo|(x3)*YiN`j
zQ_iB9Sg)0g(P{8%uvuPSB4+HMO>`p}_}&VGkJ{p@kIZBrJREc`V^P)CJ2;0Ix&_lP
za4OsKy4BMK4c{yw7IcC^SA`6I%AjklGMVzng=>m6xx?`10beEXms?lScz}Fd*U6-u
z;;R#8W%ki{Ak$kbFzyD=lRj&qISZv9iiN`5h*6n^n=R~oPF^}7Y_g_>uW`ug0b^8!
z@%5nn4r!Yq0b#Lk94whxdtn-VQ&}vj?N>Qrli7=e7-sCkF4W&8czf3k_|kvy3Q)Y%
z!fmKKv50@oGbgJMHJ;Q&UcO+3A2|}M=E{rwgA}IEr@41!TH9g#n2f@59auYvw;hG*
z?E03xL9n96<vW0P*q2y1v6{7UW^BUBW>4ES3&s{toIJUdk9U1q=)<wNUwtDulRehI
z)J%G6;OWThyJu0D*P!em{&WzxJHvP9Y!x|Y+6movs`pKu1r(0t6h?Qdr`qmm#Ldpe
z?{cK-sj%NTQ76;o))DzgZR#-!H>9xGDHqE4P~_$jg)dIEAM43DfeLfZkY-uU%RCuf
z_phRiWfiY#+OGnyR|P?0KBmXm)!wy=D2ZFoWzj^8me8r1vM1+}APHZhck5xz*^1X)
zJ1;#i9sKuI&$()Mj&ti$`Rk)5gWFz+Q^)%VU-B}r4#U4^`m}dG!waDELoC(u96?Bb
zT%Clmj;PLnxMqj1#YFD(P?Qdm`-0$bGf&U)JSDHIE^B8IDiutB>2{;t`IuZ@Q)Hy1
znf{uQWF8hVkHFBP<ODi{QUouz!i?XlyRMr(4wOBn8a?FPmmkhw>@pZ_Cn6P&4yOKN
zuz5&?oM2bce1O>L9^!XNa$dx-{6wMCl(B0aaB$lLu;&;pm>TJ8AL}u1P25^wx7W!m
z@&Q~Z8YnyNGli6@fIW2z^tL&$30b^Xaza+%wcLBrv7aDt(!zUZd<WFOleo<>7$OTk
z$1q$L!f&YWGrdtLDQqtK!dHZU>{`6(FNyc`wqpp?{p_GzAt-hD=?i3<uL(QEOro`w
zboO%+yZ=n77gE<U+6|@{JTN~?k+%YZNz^YDY)*5h$!os$s&i2^4zf&LJ`br*4Pp&$
z*bXr2kE1=8H&I{R!)$wV5m{Tqc!pWd45y1SJFnT2EPb9b29belT2h#=q+|J~%x)jo
zj{8XHlN%Mz>PfsE8Ejimdu{l@BK-JnO!@W+HEC482=lUnJ|*VL-iDkSHQi%NOuep&
zj3SK-dMPZH@pix-IA<giBcyE~Rl83y<=F4>Wn)@jGzY<8OMnJ%2*0}KG*cNBYQj$%
zV#3SoFV7??+6i|#*CUSn=;9L{vt@7;SQ)u`#PC5SmV=i|RH{R~@)ySj-G!BXBskap
z6@1=rPYE>w-vAPB30<AMU{h0<Go}l85w|6~tlnPWIgdEKVmgDAlCh>^Nu$`=sVOm3
zzi)R~$8BnXkqh->qPIW*``|>9&hs{h=Gbk!ml9cmA65eOrixXS<3f2^B;r14U%xFg
z<*k`H7x_FDW%eo&Y4+m{WiR!C)o{-yE1SebI8SV0&a?}S02tkR$kd)z>8O#@o$Ld5
zzw_R={2$A2z{`gO5~Z=-(F&-?k*&K>4g2Pm{We`wvf^}PRINdZ$nI{^*i?qr?yMxp
zDDLCcPr_WUYiCS?`_x}&f0Fyka_`zV!9eqJcjC%w#lHzn!@P`Rq3XHg=Qy{j2I}g^
z7Pk$LhlwIuI~8@D;n~?5tZokXnhHm3iL8bEi%e(sgC}2j8}p$C)HpKsCHEHvE}?YG
zo?X7Oct;p|b$~ovq4%}Fbob2cXA2u)wFXQ;UdaYWiIFep;#(nHA$O%4=qwpZ_1*#o
zdXc$x(SlV~lMind(Z%H@*p#a9fa$peTmZ1Kgo~&(r}j}?S)27y7g7)W=E?VJC$g?S
zed$$ooow_eiiUp;a-ut1(SA<-0b)@U4V_dW0;&$ZFp6YMWe_F}t{GR^8}kLE#DM3f
zWuYH6kI~C@o@MJ~Hq;**G`5(L<=%j08%tOj&~M&f42o(_&|AC+AGzt{^u1}vou()x
zy7rGGnwEUYVp#N3D&5U{xBT@rMZ6`?=UOjPUy%3_h5e_=_U@O&c=o-F|3Bi4L45Xx
zF9UmvcWdLkT4-Q3Vh(!j5}2!g$!BP-e>A~e2<0cXdOju6778D*l@WLqr4;6Saz?w$
z^ux)Iapj7nba9=*XB9^U^AMXJKI^reHzly&deJxb=@@nA5V%}le3><Mi5M~_U?6{#
z!e_cp3a@vcL&_?^&k9sQbln5=SwR?+)l^BGn_jLU`{vPll}Ykyum`*>Ek(--+!uIn
zm1kYDwK?NZ5$urc5sj2f`AT<9L1?#lVz^j*zm1YC#5=D0|M7H{L2Yhbw@@hVZpEd+
z-QA@)!QI`R;;zNrLMiU<F2&ug0g4od;_{{Uz4!Z(napI8nUm+7z4l&f?R~a>V77o0
zdx(YIHt`rrF8wF16}be2>mlW%SK%j4y{tKjK#h_%38fT&?`JSlXug<GH--AB+GrUG
z#~OxU*A2Dy#C~EiBUAsMsBi7#{?SN(wALxSLd*K=$|WCrKcU9fKffwDy-VFS=fhQA
z#Nna#N$WT+T56e&_k>kfA#pF}^M9jxI66MtZ^7fLoB4An^A&79u$%J0HmQ~t8<|1g
zo=lVdvavk3)?mlZTm#q~o%e9*{p=oreC=|+K$ac$c5zHAznpc=kZ{6zuBu-X<$#_Z
z>0^NE4aKCZ5rK%MQpIa@<;6>2rJ?H{7~;=ER@<sA#pxVsDwbZFB;Pk4lEOE}&v+O1
zQM+65;cnJ-Kh7sz24;abpiA0E&Y!$$Kmp_Yk?XCeYPkdNOiAO1$MnR@Bv-d1xVoH?
z{%7DjUSNLi8nW%*qm;uHRJ%`r>UF+4$D}iKmssTBS{iOxb7{B^x>xSZ^Nz4gL*_|v
zCQF@pb)SW$m&0`}Yfm%%)!B|@dGVWklAV82j(B30j|t^XBPGiGO~U{Zo{i99o^7=t
zZ^%BwM2Ah6<yuqZ!{rg?OqZQXBSF9f5AR{4VZ#k_Rpfq<kZtK*Tgjh8ivLnU0gh?(
zOMZ?i%2T?})r-R<vmu;l<sW`n);!lX>D|hur{psi#UFNuktU_fb(71HFTMn~Y05K*
zLD&$ttPhZ}FNFK9AUW4hdD;3+Nz!?$L_{`Tyw#=}xfPfP`jU)MNrzi!S5(o&o{1e^
zN+!o&dF#K*x2wqYak~tW?m4yz9QyNKOWQuJ6vpDX3k-6D_Ywb?cF%fM^LY;AwS6h<
z*4X^9|CMH(-Hag{@cc&ohW=P4RLNW0;L>rtQ}5|h<daWm9=XJ@VP4}Tk-zM@?kB3x
zegEaJtou)kv&b64&BsiFbG7+1%(w~EvW$~GcE@G7LH|DLQ;m)<pKfPA9;DyRIxz7M
z3MFdcK8g9>zrQ#862XVf$C7QhtosT*^s+Y)VUa{CDtVEhbZh^U_RhVEFLeZx|M~fM
zeN;<l`ou2bc+@a9rim^%{nVp_QKh@pYA29>iTJ7`mXn~YRvS$Fxey5$umAd?CE&bL
zaG(DEeY>oBkD0x<rjbnSHXKZ`-*KsfmZV$Es-*Dt`1-n_2K~K&A^NUZ%aznLFt=lm
zY2=b(QTzAsYC+e<Q|vJ@4)|DC(@>l@>%HE_A3jZ&cPR{CElGMZZ|)FG9_b#|{U<FN
zw5H9I{Up_OnHGo3vm}W}O`6M>AN)W1GA9W3q9g^`y{_878e6GBTGnt4(;1xOd8aSo
zK);@OvHXsv@UZS5W9v0LT#lqy+I^N5tWVBfg?%-l4K<w4da?zB>`t=TL{&tn5p*Q=
zGFnY4XiD8}Q~TC{9Z>d7_w6hTOpj}tV)psXO>FmiGtHO_?adY@`n~0{jezJenAH|4
zDl?_Cr)gPKJQr8;uVFC1p8VTfEqK{oU+?zvc-h|`x<e(STxX=CAd-#E#Y>rzSrLT~
z%L{33RP4Fu5Z6*H&lzRIfq@uDQmi5oMNy*nK=Adas6KN_B4)TxHI0`-rxCsRtJ8MK
z;;&m2o$%ZIM^?PU*HH|rcx4UdF@Z5+moOUE=q0}#z>_kL%#4*oFjOkD(Fp>Mfw};j
zMdh%kws&u_;fK39as4`DHrgBgm)S;PzWMskk#aFf&EGGzi<<N$h}bP7BP}G}a<mza
zaWL0pB@N=D_a&Ie*}OZc+CvQA-xV$F-yKr!KWtGbYVmIc(v9>8o)0hHZ!ULS(bK#m
zHI!(u5%Y`m$rO&*ellIj1#G`=u_?cAG#m{>g^CCiFL*+?!dxE$26^khWavcO(0dyn
zxn^jhvc(#*D)Q@LN+RIM(le3^WvX5=U<gT;urlY_>^l3T;H#m6Y6L_T#2!B`2l{!P
z$w=O=W^|;_n^761P+XQ>E(0cnos7@PoxTZ7`=ZsoZYGiN^Et!wz&yf3ckh>DE7jKi
zM(|i6sK`CvQ>-UsDl_AxDpIWq2lERe4DtImx;88-A1%OC6nZ%tYL=RN?x7#{zzbNt
zemWU6dguhq_xIOO>PR5*_ua)!$Pc|U;|Jr2IJxYR_hu<14?;t|&2&aoRIr{OxA4w7
zX!BiHQP|EN6+Pe4cfmq028SW>Kgg+F(q4MPGml+zaJ{vfr&)0_rtdyvC|Dv)U6_nZ
z#R*CkOm!J*U=WBc&whq4(!N?-@I@GxMoW2RB0>ni3T`H~4~9iH$BX5f?>b&Sv}~we
z(&%T3olsx4?opqPsH0djI!-!cSw1uqzrlJasFPqwZVKb1v}2ZJ-~Rb0mX4ZT7M5d{
zp*=83;z^b14BZLx-w;Fp^Gtflof^rMEroy;&UGk$xdyfZAa#0YM9QC&RhKtk;ME%&
z+<c*ie>Ym1hU#zE@9uxC>@P%>G!`n=plJ-lm-f_v=JRowkc(Y^;K{P8ouQFRvo!8}
zLmi6wftmId7g!go&}sRNn;~*c(l*93EK8kd&_q+ZUX*r{&j4Mpd}P+1jzw$X`ZF1F
zj#_6JP*0pg^1Ptm+e#BwV|O9m#_Ds-E@L@rZ=R&?q8gU*Pie<-9}iaePnkt{`5nZ#
z^%5Qr8djEpMic|+ao<F-i=(FKa>&5Sl9(|EP!<Sgi(}oE>f_vIpTy8B8#gn8Q2>(E
zhtgeKk68O%!l~Y&Tp%{NZ@@OBpk_y;O=cAV;P6jh`?(`Os<Vg$Z_}*{Kaf3kh4i~~
z5(aYHP^z_V3WHd)47i~`u@cXr`z0{aj1*ZFH5sYFUX_DS3ali`u^XV1CzM|rW^u!J
zdJkpR0z2A;&}9~Z5L&w;J%9Z4X*!HTcUXR+qn3_BQL!B%2!ldj>J!K?W0gSzQ!wWN
zvj#&#iCJ%`lA0^w!ir7-Sn_2!8#`$zkD>eE5FIsqueh8p9!1_XA-N8jJTAE|QVExU
zFYV&3iOK*DA9an2iCt>#X?JxlV4P3bx#ptkgtJ}+^F^Zzto_&?J+*c1*sNJsbjC7<
zi2yUpa+%7z&2_I|B>CX!fDi!{s;nQpT_pDhoKzCwOSEDnI2k40OTL4C$Oru@wMFvm
z_)IRPUYMtQxrZp=sGwHxv#t5jQP7FM&cbRa5~KEO@%+Bg!h2APbZosu*@8Or*~^wU
z3?QCI{uV(!#blQzY;{}wTgAfCwRZXElBI1~Hr6uMBnGI(bUozn;?CXEGHY?4{ZjM}
z(GSJp2X6|yTdV)>QG58<D=LZJH5Kco>kB(>;J|o-8BqMFC}UB%vw0XmV<7<Rx;C%u
z|E=g02Poclx_;^E^Sk}|_V4c`z;~96DPWDnx38|Kj5xu1lNb(7NX{#OXb3AJqO2C3
z3DS?z53k_^GeQOT)R}AN33E3$rE1G(&u7-PoJ#l4EjvSQa96*oaR=mAXdj;TF3Duq
zDfyI8Amm)`I$knppPJ4Bmd?HbpVSNQq9h}~%@?1@h2fd!PB7M4H*`yH#~pYm8;z*z
z-^;GAyfMBEY?7H|0!&HI#aH79H;*_k@vQtC_k4^E?|Tey;OaDt!G+(#M<})o@&Ni}
z$Jmite6o_VpiI9k8J1J|w$^{1h7&w7BfK3}YP}>zSDA1UO7J1(HLo0x4%3!?iTDnk
zXn~6T_c+9B)$gO7R^s|8m(U-oQDnBANJVDuuc_-!jZ8Xbrq73h93WJ|NHxr*O#P8s
z+Q1(!rKMzEcp&?4HCh>v`~6F_bk5wD>vMQ}VzYv?uOtBfC7BP<vO2>zx>U0r?6hUS
zab`KXQiXGo<oS-(?4vYh%Y!@&iQ0zfFc_?%r$*)<+nDV|N0;!hr>F?&l#1ASw@eo@
zE%=UWxX_LgU?LTsyTXb`YuGK$Xd_Lvs~N0%@Wct+xW%G`4SzksMSL8tn{`dd6~Ue!
zF^lIUlPXKJaiig?9>wrMZjdNTTx)-hNtThuFN&xnu^43{W~#kY<k<eY%8`5MNH9KD
z@zp?Il?WMeZ(W#v<LR45jPN%a6r^r%wB$`57r2gL-)%XzCrlF$<y5P_PYD2f8(s!#
z^PZyL1Y09=T{?*k0a2!2NU9R)=6U3puv?xB9Lc?vNNqfbR%>_u(hIV86zBt>*e+>H
zS-X4D#boX9SV=FX{p8N9)$ROEzQ||sR4)?SlP}!X4_Y`CBDE$}Fcl0o)Ov9iA?6(3
z6gnZzN(~3`8?@TBNZOck{%nrP4r~&nQGYhJN>2Cda3ni{!7{B@jj^IU+Pm%eB!7}B
zr;HEYM+8+`Req^f|J^bUTnrnipA_JE652yEE4j3$BekDfCBrhPZc+6EJ6GMybk&HC
zs$bHG_Fv^G?nEIwlQ3hoqF}YXJjAg*eTQ2shU1AWqdOQRpUFqJLeG~is)YTXXc8+r
z{TVqX5&o&P@sLnJROIA4bQ7{MuW$QF%{9xkg1dRst>;LftM88>{``XB{oD&;F7VZF
zyf5ZZ@Rxpa#{{<=sQhYa_=^OrPuOeR77hJscI8-q5kH>-9llvs__XJDvv`3L#l^Q~
zxs+^E&V?a++bXc=WM%l=I_czSo!m_)<bz!E)WcJgic|TBksh{n!0h`mfJa(vtz4sd
z@}lxX>KedfX<rMaxQ4D#&W*Q)P|h)+aY~EYWVh19WB?1&C{l+wF3@jtSh>|+tsHYU
zN<I^SZpE_FLmSzheF>*RF1j;HKXSHs@tQCz9-E8*=>AKyX_Ek>sAxzixop%GjDBbf
z&ru&|(e<uRu4iB?NejIWrXmdfNVwtDv?^1xLSL|!C7DcX=u!(mkh%gG1GLK;HQs^_
z!ndDsLi^!x`Q2OJRJ!DKaSry#t8O{2zAywqvgbjaFW~V&n2fN{VR741CRGJ{2;(;C
zS?x7u%Xo!6uVF4$qLsfVuj#Xh=Sqm~xUshqdTvdFx_}t%fAH)^cERkEpxeJlDxaU&
z5vPCXxFag%ww#mgRZhb<6}vDi<LTms?!<(GigxzWwTN4%s}(Gs>Ya+n%Md@>?>%j4
zL<#f6@>`hwv3HRb0AD%i%Z$4Pmu^H{6OoB==oq5N{4vhEWLiB-fMC;49B$$hd`!AS
z2{x>}JkDe#|90|wSSVR5Zc5wGs5COwZx&yvBV#c`e!QNclyF$jM_@5Lk`W_Et78xJ
zcfuRZ#oo%ql;Ln<5|OT4gJt8xM6T5z4bwqA>Um8sX@Wzj+oD8Um&aN^U>M#qlE)iF
ziRD#kFBJx{Ff3%{3*<1$!rK{$?opRwb<T2LA}f<>12vRC>A3CLB%}@~aL_&)9H-il
zjp5C9cwq)zG<)GJ0yglzt0We@kACaA*iY8z3`+MIatAp)ZH>wriWUTG5b;lG=`{yy
z7O9>8s^W}`l*W8}9hHNvi;jo{Cg0cb0X^QHGnJk@>@Q!wt4h;EQy<V;KL^j$;B8D)
z7AN-sAgC2FHJ143cc&jvg|q!B+Z+L-Qqeaw(xaE^cUj-j-Rk!Gx;44~ov#kz#0TqT
z{_+KSkn+mKVov%@k5AWDO59=h(t9=lqtAytEP&89r9)<D<sgUr26WJPc;U@bt89+L
zykI|Gzl3`*k2T-@;FZg(<jr!FcmXZ?1!I0cIAHbMwen~9;?-t{6KQ=$63<2t0!0z{
zo`EP&na3IGOx`;<kIK%kGmnnd0gLjYN4|?jbhY<*)`9npKR_(Rt2jU7VbpGdKW5Wg
z1B-7PCEp5l8ZMJN)zn~BVP0>?PihR)Czl5X6*h?SkGDUa7BR{J4>XYUA<YxNLV}dA
zVhN@}Bg4o+UWuZDg>_Pg5&yyXLqjxNW(nQSHzl$e4d^AxF~DrPg)WNzoUd(_$p~r#
zq1;`<@hAY-X}-z?T-gtx!3@<nfzdTtWJc*vDUrYu?0ach1I*%GS7HiwZ`hRjJr#9)
z(>M{)wc<>9>eH!^z=EK=kE6b3spQ{OTp0eI2`)e9uDv|@=gTcEd_WqXylxC9?G>v)
zctpDEC)A*Upm(^)E2y(_l{S=GbuHB3<J_^93qPv`ez|7*v$j0yQDn}Ke`x0$A!qi4
zVTxc_dL_u-Q;(FXEcJZclV{fF;X0O|0Rst%spCNU>~nn<-^)fS{(02-TmpL+0*XCt
zysyqPUT;;o+$u>%f<>qGFr8ljwL70xzixuuKt1M2|H#TxSOWWA)NmdovhMtoU&k}R
z9D9Wzf|Fh)rppM!!_IDiF%+jYE_y<dnpuZuV(4|qd4?25v?aMWY|yXQ8;-Z?4vPtG
z7p2mHP3^bg#TX5UMklH|LJ61FoB<6d?r{&~d)#N1IX<KJ4={jSCb(i_KZkG)Eoco6
z6m2%>Ybg}K5VG|CVhU9_5ktx)kD!9-`@$$yaRPYYcUe>#(C3&6vlJ1&)EL|-SP|{%
z({nXXd7D=e+!->53BnPu7E#c9<L~&=9(sBG$*t6?ZrP1rm{a8%@M7J&G?g1Gv0G=P
zy3e{DY^l`~m1@~cCwUsvzbTDP=$Q>9f687Dn@`>|3p<aXww1>|Vq7h&P(UJlT0N02
zrZ%Y>Un|dYziScVbtjIIY--?OV06~P<|cFdappL(xY7mR>%f&ru2XHO)`aZOz7MoG
zPhjxKseen?zJH4q_h~0Ag%0=8-_nrcD170M`Dok$v-L!OL0>%3!J3z_#46Lx?wKa@
zlPt83Zk_;inE2|)Tv9T*4KoY878w?73E%xy68rrjOgLakXOJ&9pZaA%9F1s*rYR`M
zs901)s#U&6M>_)rgp)e?vbQE))6z0cJkhP?wM(!9k>f72eL9?%w=g0^l2<>q(4&Kf
z#J%7KA3;m2OR6OPA=klbXs<3mgP9p|-jt$Tp0A=@1wm<fJR_7=by0*zegdD>ms~bP
z5@Rn4itULt$PE{ih&u9K)9c~zf>)RTdsCCSHH<*Ceae|>@~f=621M2^EIs|$M<3r1
zy93`7xm#(N_?rXofM-RY`sw(<#pFW|<vVFe+RK8cOYNhg8^-*oT0xE^Mv}8;tZI{`
zk?}m;fT;FkzN+6ewKf{>kP8fqpzRp^C|*zjQ%Q63#a8Tmn8XwZOYwJA@XQ{<S+->r
zW}Gq5xU63dRAHe*?Ni-rb9nWtZ<Tt;;lP=4M*5<nP5t<2SK(j2nQ4lHR5@ou_j0}l
zSnC)BcnJB5-Gm8Ltavv=&TwbI!us-l%Wts=5ksjRwYE%dSI~nw+J#(bB8`oZ;l}aW
zK8-6Yjr7wMPFpw0EIfSc!T_v8auqXp@nn0o=BFevKg{)V)ma;T!hT3YczkcGl!Brl
z&U7)A&?=;>Tc(5}w$P1T>yO@(epQ=6H4Z5|dfkmujKg@JL|{3`ev3pkXH;$yw|ViG
zscqJSgNzJ#{EX@Zz0PsoQ^R-3RvBXp8Tba5@TZuGo=GFe_{Oxc0b2ef#bg4R!{zun
z|9@y7xuHp8;?}woelhD$ReUxZuRN`}s@P8-g_7yi$_MPaKk4C}Tf?jgPm@4Zlwh2%
z|5<KolPOmZ1AANGWf;mY<415clt+)5M{M2Ep{C0s;75#CYM03CZaev(Q1COSQbkZ2
zkc0PGP(au*6!HZ+%sN&qkc-UERA<L9O92AV8e!GL*|5JvuY)mTe7a~@t!Ibh4QbVh
zi%fYUiE2}@EC;yK%_fk!XM-cL<yb?z7~2;meVe?+dng3QQ5r~x?tt{m7kB4;PL|LD
zWFTdxytqlylG@s`bq1}YWN(^d`J`ksnmIJ@!s}lBL)9q44<@<fkNf>25;*;6h)Yp4
z-iI0=b=edoTtP&*Inzn_0?6=MQkW42`TX<bwZbrY1<P6_ls;y^<iCF~rpi?D%i>mS
zU!AxZhM4yrj-;QjA$?7bxf#OeL{QILN1QmM6Bji?*u%A=E{9FE5>{v09d5Y(BU&K2
zq{3$z={Spum5CGA3!K-a3wTW^`Nr}Afype4#Hr(eBk=`JTg$h1xJ!GSc4=n;FP?%z
z0`T{^MIOQ6cI3NB#Rh>3E50RKv-cI*+rD3-)tszb2?iOw#ePyvgwQl=&xq+MVapuE
zk}}Vd;#3u37*Rs`_Gy7u>QAVV>Mqmm<=11c`RQ5Nn#|Dsl|E!MPd&Jht7Nsw=vO;6
zAFVUt@cVwL*4viYi44fQ`@DjtTSE+05m6YRuS@-SJ_L-)yMaMtQSzz2G#%CoqYvb-
z!qjK3yw&rq{`Cdp5CAn^bNZG7F~8$f6h6_cVJYCpwaDEZ@l|+<men2SqktFmD#%nB
z>;Go3d%9D6l<NTaAZnm^ad<*)bW|2hg@kR~qH^~+_`+RDgxC}){&1QAscP3+euGhi
zx<f@8Y*aE9ir3}eD(PqI*cv1`DYp#5RS9Fj5;3@6a_6uOc(f{yCJ}60!00PTL6gTH
zUAYHP=?pS1PfUHy-7Z-O`Eymz23*4x4T49$Qftj(qgQ<hv@%Bk)>&W^6}7L(kn(J9
z8$S@F%lF{*!a!|*=Vp%CsDtt7%ut4-NPD<k>lnG+P`r_04s^_dFbwD~fL-SLXyYJ8
zw5K_DuK+B0Ff3EV8KG~oqUfuY;EF^99t5TT5`ewrGtwAVj@sb5Vo4BC>UA^0jo`f;
zrduG!Cf0Mff_NeJ`*eOpI7J`WAerfF1M6oiD~Z6brttAR;}XfWq4=#kgYNkj!`WJK
zAp=x#J~3-lwdm&26G&r7;|6+A(1HxUm87#_icQ*ba+t{zdx#89M_A+2*^)QuBAnXb
zMcM}(n!H`l{_Y{rW;YGx{D&Yj7?Z^nu3#JpmonjiZ-nT$fbD3_GkVUfO20K<Jcn7I
z79(xpL&P@*VA+OS4xDGlBU_-&W^lk`pw0TxT1q#=_u)1@=66AUTzO_<uRx@?n5J!O
z2E)r&&RSH&W_h_984HyUdGCVfy+KrCvzZKeP-uuU;5|t7)d1fpNPA|IpoXL4({Mw1
z{)JDqXgf1`<n-;R?b(c^#2H;QKJCEwz3#{o+Xd;~orS;qNBH(m-X3ck9y;q2u+f>*
z@2fw*InegMT<R10Fu5n9#sGu*iGQ;_nKlvrCeTj}jL#!#!0c%`N!j>Bc+JkIO=a3u
z9JYMCCP%b1T_NuCwJ((-FWaCU<!<nI@#Nin$oEebKijHr0ng9mC+$s!-b?N3FBBc3
z;RhWlMZ{1cCG;b2a5z8kV23@Q3o`=--(;11nGONR-Q(g~3!oH%*bvk?ch^P|yq}&%
z=*Zssny4{Rzf`@c`EX_~*`U05oZ%t!U!=#cJGpC5(Ci^eCU<~5&M+X78L0|Wu%I%W
zn{60NtFj%EO(K*5qvFjWFJquMZoJMY;A3PSracER3W%dk?IQ_+z891VCxTrBMXO}?
z5}hH`B|{x*zFgfuuKFvR@B&p;;HSwt!tyU8Iu%|s>=lJK?@1og<{a|ualW_4EnYPE
z#aLY*Xi+}~HAV-8mi*jbFS$=^bjY;FVS*>Ei;m(~T1%$^=b^tiL}`GMGZrsg5>+9`
zkD}>(js!XaE2!s4)0>g+;t(vRKTdhKCtb^s%)shgnNYOyk`kjAGVToTMiO*tk6Ysr
zNehiu!*~A(GRu)|9cO}9(6PmX5eUY~Q+qE#-v=--o4F(=AXbc_>ZjE6OWyijob&La
zn$ep6yJ4a!k(n<(F3<M_CKjz};)PTJd+?&qk1j^<)yNg~2D%hSPqA=rd)i;_95;~>
zM^DEuoZFdvBCMXBUj?l>zq<2Zw#bbcn!i+T#XEePNx!fHqg&HR0qfj{Xg*zO=`#Fq
z-Hn$ta7Ol=%&FkO`}_txYF(_S9z#Y6up@%&Y6M9vSZlec@|(xN>mTs2t4h#=Y+a`R
zFgWJ`M${O<`l$srEGQe4Cl!-pOtJQ}X;zE9+*#A;!0Y9C!{N@)ajyk?Bn<GL8sLf2
zM$<HHJm2{k=KkkBAYagd4CrnAlf|dt!=lDN?LXX_QqTB>KhX!l;lpe6)s129>_jW9
z-R(0Fme}zqd{apwqkSO`!bOyulelX*Sa6WK5xdp>ZV<`@@OQ|meQ2AX=7>Tp%mLV~
zY{PpL(z{4YLqm5{x2%t!-ErwTOwpHg<rBSf4%^O%a&obmZ+%n4_*fJXZFP1K-d}Y3
z(5)|6c@FoH|9;!$qa5<j+Jn+KGJy=ky64AzFVyI}XgS<qd`6*(fF77))LD%z!vGP<
zqeGYkE6y*`G;$V}DvGRPyc)S7Vw)DC%8fc1ju`H9@yr6CG;5?;#jimt=DVQO)<rXq
zNPEOIsz0Q^7{@+9=`3Tb3r7TFn3cX>CDjB)a?0qDSEpkjw(S*t2;#dX4@g+Q+<af$
zGahAU&4fwzS2f^?#fr#Wb?uIFHtGC0&YrRP^#bEfB<QeI2z)MYtL^Z43N9wmVX1U6
z^l5n^dh^<OW*oD4_ZrJm3$%#$OHaiiN|XJF@20Q^Yr&U5YFUq)(y38!x`UwGUo)4|
z$bDY(7nZDrzGbS_yhVkC5cw|}<J7~LHel3umx^54VYlY&MYB|^%zC+-17R&{og;*`
z>(E2_hDyxMmO%inh~TLmef7pqDWl}NXapBiocE*_sa66F(Xb9mW#e-qz$nOAYGcsy
zyfS}L<{}e!LrE}%qY*@De;?6nEMgS0N}fG?cGvJ`20*FTn)z=QKvS-=qbk(^KU<d{
zTLQOM>s-2RixW<Kg8UH)4B(#q8DS$rci}~{XA`sSb3ia4C_yP(E6_P?LeKyB1_)X#
z!<?pe56&2L?sop3b0<|7rda`iT4p$ziX}~1jf#2cz>U%nzlzv~3+_%C4?ywTwMr5C
zaNw!I_XO=lmX0`pIA@0ac-_<I_=6)<@n5y1z>%U>wFZb1ez<khQ?ZFyJz!Radi?%w
z(K9O7LWePycywqg*VQ5lTahD?w8(^t_sAHA?@8eNr^8Lhv91@=D(nUK;z%U~GUu_0
zI?#1GZ?INNYn{|*DeV0%_wS!)1wMq;=sPvXs;!jLsA<5C_>$UVWaEo9g!z)M4|8RI
znX_~l6VY@0v_e<gU<fozHPzc(3^QA(O@OpO<nr}nR!teWeEjuZjfWera1d2+G>0te
zV-mBot=7wSjIeqfF`OWkinYp|juAdcl`h)YfF3O^AGUvkBRZ|tu73_kZiXekm5i{D
zWlF-YR8Ea6Zy2sLkcL*806Jm5$E&J7t+yQc)%8dRGWHqW^KL-JgYqVMoD(zcjCwXe
z1EBv&q=3__oXn27d#L;Eqc6LXqy;E;KBx<k#D4m&z#+v$uz#j=$S@ObV`Vscx_T4o
zs{<eMXlbJrItS*WY+lR~wm-0(S_ehlGRxI`TAS?L=1$b<Cuu=IzAW#$!y!L<pE+Ge
z-9?(t^c7l9nrD-{r+<KG0Rrw_3mwS*hU`Uj0G#T>c~Gp&zx{-tI5x-ncBrjRKgie8
zs{BQC=XuH@XR(SYmFVU}+f*pL&PQW;pUceikW!85<<v?^f91fhS`m+ED$p5@@m=Gt
z)UA8<`+2sa-XZ;Obqb3|V03!-5?+{gf}o;P@2$y;KOM)Aq%gaiZJRC8lI74DbiU<~
zde{sLybQc*%qHXywedsJZrL`*Nvj9y427x|YMh|&0?`UhJfr^$&ZtmOS*hfYDyyp9
z@>o@hPqEx(7iwyg??^-*!B@I;DEl9>Bs$kZ{~<Z@Bnsqu<g$X4nf%$GN>;rvhGSgE
zpD%Dty~Z{g7i)KZ1Y|z=#(J78%CP?PjVEccQC&|ChaJz<7djH8pCI+Fw)`kK(mU2?
z+RPrE;9N_a9Q>NQ?crA!{wHWH$8>Vu*^{yD!(6}>%vBA0Pr0f}{EwggOa;7ul~l<7
z4;qLOdGBa)d-nwO0G-Dw@uv0MT&4+kriMt``#$MVcbs|mJ?AX;m(Ezp4cbb-fXkPy
z@L+crdRr-fUtS<8`)tnPva5r|Z5w)V(dg}f>@?ZT>Vfxk<5qypW~oQ-7|g7ExjF=K
zT6GZR^CLB<s<=cIF{rBwSdAE!Vo`vpD`C^3ouYQ48B@Ox93*|(t<ZJx)^2?2s7bE_
zD}x*lR^X%dZ_{34f0srKdM>gl2UL6M=n&1?Fl;eDV0(O6t7K^s7(*8vHeBLF9BdGY
zDqDtA;<*l>6)EVaz>8Gt1A-Qu5{E&s@aoWMrFMM9^4SSIF=CI(c@#yhPx@qk`Pg2J
z1QGHs%46r$p8SVzYm=0QX9Po762$e6?3$nW&#v~ZzH~oTYc7|OYEBIaApBB?5CmV;
z`A9!_iAV%xB#zQr<R-z&1HmWzX@NPF(^Lsoi%JND-I7JqH~ly*lY`gU$8$EfA=50>
zh&|pY!}Nd2z<KXyy&tbZfd`M6`kUl$QfRgymj|=i-TNNvy3k?8Sn(HUQ$GDHZ*%$}
z*^NLSkQV4z7U}LU2S^RC{fza3(K5(@j;v8(UUkP>9d2G_$A9L4y?)@(7Iqt{RG@(2
zH!De|h?=;}Quvcq1Z{s{D9_{!lBbuT^XTLsd^wgs9Gd!@s|CH}G!we?{pvtJ`F`Bd
zm9-Da)3i4LG$L9Ru*D9JqZ)_*s>&asr?a?=zidBTcOICbBp{D@#?;Kbhn?kMj~XF&
zgOKc>VQdbCq0g%t4c;0LJrs6ew38vTlD9(Wl6$eRj41NAdAWLW=z`+aXs<3@nD)sq
z-fexxtp2`8@zPLSEAYc0zbaj_dO$H0xmN;g)n0UAId1G6s|Z|8jFFz;UBT<^_+u!6
zV-sUW@bh@V)g46Eu9G`%fA}9Qhl&PmJBho*5Q&>AXk5;j$Na!wnn}{ioas8O!Z7wF
z+hs@uYqfsVMrZmoJeU+thZ`T}dIx~Gzdl|GhDbKF?Gri1m}Y-z045mJTkkFX)7wQK
zKk&Wn)qDFW$+t>|*)*?Kol15R`%vB)cR$|i<hc$PquI~u7G12nBE`O}J(D<UnAvnn
znd`sA&?V@9vqfcptw`mYAC1JQnyG2f5xp@RmIKlodz1Buk|49r(Zc)5{wk$f5SBxx
zk>7)qbpI3YpQLw$UXa(TP!&RZjHh53qWX$y@@+7cU#$YV9Wm~r4T;br_G9Gpw-U@f
z1a=Xj=|vmau;dh(YNrPpf_ZH(1-@AaV=?^9aN~miLu{>DA<VMm`opK)_4YK)Y2etc
zDmO5mY+&WHE#w-}NG-Qd9jcOadr411&?|H?+|1>B%zd)VvmOslv-#0Rq8jR>x!L^N
zFJ`X4F#Q=LFYVI0sUzc>>@o<A`2zeMQ|+3U!~7qZ&~~jF4=~8$<I+azw~oVXN62S5
zVd8HOyq9Nw2Uw2W^f28`ew^L`K3Qz@9WN>vHx&`5|N3{in^C<UP4bOapa^-xBoEL!
z))MX+j$idX3|7T@q^56ihj@gDBbjo|(%eY6Rmi>hRKhRs!HQxMjGDjx1Z0{<sfu;H
zO@w}>V4--$g|69wqHiek#~|mFS0Tn;YRAO8NkSfhD~TdU8-GDVvTnG3JvL!^t>lN0
zx@%k9?)1E@NS@q$84pV)c&&w%KFtQei}+&TO$7o>CF+%a(!cTi1|1sG`M99wlUlBZ
zAMmU5n=K3tm_Dm&F)$J{8Q0#4=8N@`_ZlXENsUa-8Cz31oF!PHfR#_Gw{)@c2f&~&
zN!k3<qNnohW$$<=5G$8a5%UrmNyw%EF#{E`ohm{E4_&K>`-&##O(~d~R`TX}?yi+2
zVIgg)f^4LZeC%5#hsr0>PQ)|00QQcmq{RrWRneSZ=O+mmA(Jzh9b{acM&Cqb2GV2o
zhzJj;v#q0<#o7sg0u=Z*y)Ei@0s-<%@}LoVLm($1lDi!@T6D8ZC*y{a6NPa2NWlh*
z>lCd#{Keb(oon>C%fEntir%VizyOC2Qc8E2`%L(A<8x<b1fHMQkvu3IBTlm@l*F>n
zzHqS!xFe>t*&kUxnO$6?8;n5Dyvo|wZbMu{J9!5`QYAr;3a$%>B^L7Po8b){-#Mp4
z*Sa++aRRYkNzb24wQfDDdrL4t;#r`Q@2btm(bfZ?*5SgeJP4LcCsJM_&&F_U=m`Mb
zZQfdSBeLU(pT1Pr#B0kW>InPFC)z2;k})#$g>_<s<2F0&UKkT=Xs>B&agFe?Qlf%`
zH?l8@K3CCFwkCVy$&K_Mj-nueVu!)#&nz>IYKsu0=6rVIYw@|)Czd*{k@x&(;nMuj
z!j=iDsiEV`CrYI%5k-k9slD%G=>i^?HOlLm=##XuA((lC3jGhi6}eRQK(JklrJd*@
z*K|rnEbAH*03WP`h}>jY;8T)NjdNnQixws9oPnvS#PH?*>^&todqs)s0M{V!d^`)>
z99pTxE0$pK^MLdOHOB@R)8!$-z0WYWPg7r!^)~r}OLM)ZH%$f!tpAG(f*~SdTZ{~S
zdmnWQ%L!DDz)EGa#I04c*5YOy(UbZtYAA!=1a_@aKsceAFH!pKQtKou+loD!Kr+{s
zL07RB=CHUAnaK({WRKOI!6*`k-+VE6Rt~)HnzwwLG=AtfoVTPec68=N=pFpS>;Ead
z8wg<Lw>*{QQ^d%fu5Y8J_&yd;CXMdEonZ02%H-q@q%wz1TZP@lPyHZI%<t~^THEG?
zP_(<~#2l+OfJ;{WmFcS?c1gHA2%~S8U7E<2)ghc$$`mS(bO2m-T`%W9%5Zt|b4aUK
zx;63qD`V&NLBd>!c?!*=%ds~DLP@TJBDQm`@MfypzkBjMSuINA7A~x22*KU8TuK#F
zpDW7cZBmx-SBL{Du5aEv6Y8>eu)!j<#HC`v7TojJBW_!UQPo`(Tl>?EFKmPeJ>`KF
zAf=iR)8ovdgZd5uQmM1YO4WXJY~JSF(QV%D4FK{5u>P`D7nglmKxHbj&vWGY(&c9W
zfP~4qRy{Z6DQb5lXvWHDk58?0?EFI>aSxhZ;yTMqpDju@&cgWD`g84%_y=!sMoV&6
z*Clm%V`;N*s-F?j(II?fiUHf6kX9?E;Z&D5-!E4U)cisxt^p~c%~c&e-oJV}KLnPy
zmOI{8P!^&}_wp%hgB4A2s!LNeK>O1WUgXrhqEJyIS%U(?b<kMKSB77i3X*5NP*+F<
zXqj-PoX7uL$Lw5?DNCFme+Un&&7^PDhmWB$G`xXj4aCwPt=H&`RUa$nhiS(Pvgnf6
zU_-CDA(7l33{?nO`L>i0xiQT0Y1n_)z_t$T;@T9gH?@Dm0KM-Q=B0pXuOXkl<%^CK
z2!&HzgjI>)-+fD=P>Eup`CEHKM-^gv^Dku%L`oAH^!S;yP0#Wj;ZIY#wkwmmEHqN&
zGjd|!%>NumPVf-c!tn>^0$rqk{wqffM58%h67}BUV#;6-8!CGKd__T6U?eL&l5v(1
z7$Dlkc&u_xswq(9K?Ircru|BtTp2|Za)m=wmom<y1%nEaHEU0atFO>1#8k#3^8h-Z
zY;kZ409lz$8y>Zobnr3^e6v)1X+tug>tO8(NCm}#^_jSz2bZ#zO7feoeutl23*^=%
zF5HQg=ElkF#4W|Sg|NPHq27KT{*e4Coe|9?)kVIoiEwz($jxs|!D8?b>A2J$X{Ca!
z;72MSkS)Q1Ec{52GHI1~(|xrFY2|Z7C94D}LhFZ3dSUhYUCMx<x8ogNnyB3_b+hCz
z%yDxoxS_3o%@hB)uLxkG7iqU#f8sTytE7fgU%^-BjO?MAxI?GOi4Lw+MhcB6VyP*2
zcYM=sqI=qSQm$FPeNds$je=ZQEzBABA+5dMs4k``03Z|E%KW$T6QZro<kY8?1JO@#
z<XRbN%fc|U4g4^cDv;_Cca+y^PwyG+rhMs@sKmHhx;o$20Y8L+RXsBJx8brKximWG
z&>vd42++T~(X*DP$}~lYa5_Gz$~UgDdS+8A%uZF3K)(t9!z1SP)Y4(3<Mhl;ZU0I$
zY#_c403$kgHR4W#i%HJ^F=wKZLQ6kBJ^mDLc5FU@*XnxiYW%99aIvRF4VVLp=dLs^
z0X#ZcD<m2$ARAj*8znWy?Is2m7<Kj)NSW$#^dF(D27hA0DL@zb>)W<8PEKibA4MRd
z<LNhnVH1wo{(j~d>=3`O{rBOJy;T5*0e|JAUBY;IJYw$po<+kR;(8RFh@uKv*tf0m
z#^o>mX4En!Q>^FXNLj1fV-GBxJ-sTt5!L|vs?(hAF6N$xoy)Ol<ZBAO$Fd{(1Cu;?
z8mbqqpq9I>@)Y~rQpqlXmLkM#3lsKZu+vlnGA`M5DUKp6`Trj!1#}!D2Wrp*)HRxM
z6z+5y*s~SO9d$x(0V>DS_Cl5Q-A{rm39RL)Yt-h*9v1E%JB00_D4|cQrP*3ujNM_m
z+t>jlQ&dKMfs&cx<a43Rer_eiWmxU6{k2;6!tHu}#YVV#h#v<vbE}hZASS;bxf8q2
zTiz7%L#`7YQEmjqU(rQvv=H7}s2&v{nRhd4<oPS~vh(xw5M<6wf0A9(C`KWl&H&dT
zjrdLJr@5xmyWGrtxsKW|wcw8G7NiX*sT-+J3<l&Um4nl9SZPk)4{K$^Oc@7P?4Low
zw$8Y=qhJ4jspc>r)tnX?qF(F4=K2+s-2Oh!A8OPkTC$mska64ezU6%OE3x}Yrd6g?
zCQ8Xc7D!-Ms-t4T&{saiQA<`No~Gu)j-5xv5wU+1R7x9%rVRCEbg>Pq10jgeK+Mqv
zy>5aT?{(Q^;^|Oob!HYxB3rT&<7RSU9Zx%f|KM6)f!iHK3RD{r+B=78ak<XXh9WLM
zT*M)~cI=Y;cFbXBzpWcBU^(TTu}vh$J;!+H85@dZd$ayG`pkHu^QdkwKcV)2si1`@
zuH)Mgj+e*3R)OMi`AHCa($GGz&U8u*lt`-+W{#*7$p)doe((aL(yD5sN!($9b!yXC
z$neKi6ld0@BoOz)zn~n`OVOR9Ne|MT-=0T-UzaD2!3n+DHuH~Lg)Mpi`ME`Sdk8`;
zhrV+JPUcv3J%+v3sJ4Cpfr~3{8?F(VKH%wjF%Oe-EI-c_NQQR+Dc$B;&{?#?51h2g
z3^bY9(d!3ISLJcB7O&l@ZP9|iO7fLc{L(p@*<Fp968VC42+ZQ%WQs&f5{HpYu~)a&
z;lRDI+!_Fwd9el@{LWW9IC8UKbfUf#f!pt&cKc7MspW>QIYP@I(G*gDP%_nzw%5Rc
z(S?P>&cAre)o$Bpw;@m<0})?TR^#1=+Jh%!tzN-!Zq#Tk89Oabn>oaJrBT&;mPZo8
zvtbdjy@`B<Po${w*~r&MN41?%EJ+f0J7>Shl9=%w!Pt86s<<)I0=gfWFF#1BgsQ-U
zb5rAvd#s^;otEE^iOe$ov@9;qIbvrq_(<MUbx?8f{Jz+)Kp5*Y=CzNxWIIHNi~+%w
zABa43hb18)8ar#F>sL+f%ffH=-v@nlefVEi=ZSuSa3Z`sZk7toT8=qLN78nBUTjeI
zK~Bp-9ZQlJEp<Ka)I;?1O|?75s7@5H`QtX{AFlF?t!;73n~6k6ZRe><J=U3CaM{$?
zx=PD6mnM&oP!sRVxj7=WZpKeYCu0?;5HA8F+mUvOUaPc)W$fLxJ?oDB_+P5bkQVE7
zQ04ojvhkd9ac-W$w5Oef5Vrx$wb_@?(EnJ2BCoGP+mdS)?F4Z*+TS_<Wi6j{9Q6bK
zvMya1e>db^x0ENbST9DcnAXi$qEhEol$PecHf)ehn+89*S?VE%InH-;!R_F}>Mkva
zdJpZehze8L|84JG#R`I!Da2MMR*_*<pNFI}^&iGUP}{MdvoaBDGxa5-Y)wgxd!j9n
znzduzt6Wz3>C<67G4y7T2$85IyCFkBVI;s^Wf)19LQ_D*@`G?ol2z<uB4n+YbF&mW
z(oE+rkND8`+n%ZZi`Z=PqdLDZ>Io;ppWQ3?x0mTY((hu{?y4Ae?zwEhg-|P`A%Uz`
zJ`sfatdF0r2H>WCs76?!S!7r{2rg)PR%I6n#}ylpZ^ix>09S~hwpFK+=J3j-+_#D|
zltkKSMzt}gvdgA85FRhuf**?ffu~#&%nbhW3rL7q0jB*B=1ePVINm-LP{c=v=K+#G
zL=U2L7xOF&9pZ*v4k@PBAnCtN5-dqI`orj+i68l_&Kc-F1eR*3+*_JkqCNR_!VEMC
zO5#4?{%pSu=nMMEjQf114U)(rQ%d{!0GEA53cy-|{!uT7UJo;CQ#Zv?7E(p6{_5(g
zh+UU;IDR6K7P>cUR~w6J-a+{vikrf#H|R_Uzuw_(Ii5WIxeb<^1r6|LYGI-gMM~h|
z<pFjR&vk1bGwk<1Gq7C>c7C5KzOVMpBxV9p&rSsc=7ZJfR~<fLC;q;e&ThM9NVck^
z-#nVoF%ZxabKAq!n<^x$)$OL%1pBV^o*O*ws%|h>{@|EGtFcDIkPoozBWWWu`T5kL
z9Gd>y@e>&^O39f7=NGR|A>(Cv=}8z)E%NZOo4x+D{%34?_4jANKT%>wPNYWg3C+Uk
z9Cvqv$$PoOTO0yojYAfdv(M8_We%#Gnf1SCx$Pwn^07((!>t^0!xNXC|9<&}bhsZs
zeB<IP{l)dXrkODKCqzXMAEXyPSf!}&kEP=s0?fKWyd@u%fKgL?n}Av2MvW6Y7NBVF
z6^Hs33Ge^<5=8DOl<A4Q&^2D2e>OC(15s>4Nq)3GC$3>?GZFcQ*CH8182PTCUM`+b
z?Mf{eH9$!R)}VO$3>F+D_A9u@lG|`DGK0w|y{-Kl6&U<awq*?3Ig-4GCiQvWTL(ii
zeZ5bG@n=#0-1iqB)m8$y%KlI)lLg<?9E~4YKmMj}4VXZp&+89W&&nac8jJdHNq_SP
zF_a;XLKS89DAUP2>|%cGv--Mdu465l#{0QxZtb~vU5JV9pX9o?`>#KHPN&gl6hZ)e
z{bS>iO|4Pq*d|kl0egg99QkW1tE_4~eb+iADp7*!!V`aAo-cx$0F>&->G;*_;}OI}
zJd^%mzSNGCl+J(aWJyA}pyJc&v|FKg0ZLHIxR6Dw8b!3@(HNycROAp}uF4NXlj%<#
zWC-BJ&7UPh<E+$n1d*CHV!wXG^k1WK?wb)-KoX&8RYxj?w^{yu+v3fg28NmA`ip;*
zH!{lL9A(z<5f%{9NM2164@1_KX-f7@+`BY+kADiPFX%x*G>LF9v-T%|yopGEbx6l@
zdPp3xo1U!ipy!Y-%Q<KHt^wP6vh^)7$xYCCJ-6!v56<{blw>R=vt5G`Y|Iiv;7v63
zqL&mVBa?PtIRb9W)l!z!;JQ2uT9J_W&0wcyr+4t(BA<o+$EoaO+Dh|Ei0e|4)n#rR
z9{Fp@SlJwAgQF(W?Cl5dyN!GXRN5RkQ8>FyrAF;7HK%1{yF(V0Ocfqx?fjx?Zl1aX
zBt6(GDHQqd-`zt@U<F&iKZ;q5`h@fz^L5wx9L2+X+MB&-gL!yCMmI1ZD7&#7_S)*-
z+VKZZ=-+S2m4|eoQ)}<3F9!FmdS@yHCwHDvC5$Ylqc4QSNvn?7Qg;3;p9;<?(C0ln
zJj`w@>mi2YH8zN0Q9uS*Cw{{VQJb#?pu{0P3GD%m#tj<6zx2?;zdA|(4}~l-yk?A~
zn}<<r=k!6vA>aMvVYV5D<Lwaos}S(TAO~qC89t?=WX}-!Ox{4b(N2CZ?Kvuh^;tU8
z(-`p`7BmB^i*x$?HhS-ky@*K(#pe>F**k1L%O4MfNIRg?i!d~iQMLq2VJ$?lv@O-z
z%e7IwWe}m<aS7NTM9cnBybkfjubcBn-c%1+^$wZQWy7QfG*W131*LZhCRY7uX~TRy
z^7mQo$~vsN|0!O*v(X}PWy_)=u%9l=>jxKm(z`|7Cv=VGa|vjJX&6sZo@2pksnoZI
zno}AS4b`+@iNMEr<gW+<y-^MY*Mdj_J<CeX4q8^rLjjfngAB8KjBVj3&|76L0z-h~
z>#OhDlti$&wSz{{Tuz&na6jla4>~p-`C7vOW-){PMnfK}8=^HYZy^eTQ-f^SJH|*F
zBsop_R3X>Akq5^^nMySxyt4Av?p_b--YRFsnFj3$)2H3o+vf9TV+C50PQ)<j>hPqx
zdV{vqdeN4Pm$>i89~!3A(hhl2m1dk{bN)TCBEr%;JRvAK;fFN-i3DVkuOicKJpOzu
z<2Q^6iU8rXAwYuB(8)Esd^2WMyetU-#EX#^4xLZ_vrybO#o`0ogx#Fd!XFxfpN2;`
z`Y{*Gl5cZOglVCGiyNsG1HbY3?eWLtYKdPh6Z<v=G<<P!>2=XL5YOlP32O_&HR57N
zQ_S70xJOOb{V~)?t@>in125>ai}Mk)#SRS4P*G^hrP2Z2;8P^W*rZ=bpC68|fdF-7
zSaZ5uJ|m@#g$7RQc&DO8;%NZwkmnz6<$R!>vz#fr7L${lsy3KI7jB?3tr^Z&;)Hi@
zOXYXTW&CEuzicJ3Is728`Lgc{Gq-852E{h!k{l#aaLvQSv8j;I<<>=UzXR)46t)K3
z)It3DeKrsiAZM5ZC${JR_A(?_?t<@RhIq!H#zW*|w7aVKYIYx8mld(1vaA@0EI=5L
zD({90S_5qOEC0hTd=u4Ez-d25yfxr;O*U46k-&{n4<#CQs3*^Xk)XJqW3l>c%tvqd
zQ6Sd-rxeqZPBlbfP5n`0V^PLT*KK0?oq*ySQlzSia(&lv$0)xC#!i0+Mw%Uu3)(|t
z>==+GAgv~}IBd-B4?RjcZ@odRL|6AczemjeNVzCRZW#@T@qd2$okP)ru#oRHQ3l^k
zw3$V<^mPVp(SgOwbSodo<R1@X?{VQAN5jI1VEdKM$fD=AGW(w(x0LXju*Gmoyj9XI
z{ehm2YMk~;2C+Q2-d;CmW96E#sb}@NMh(HM8V{lBiosTJErAE}wQYqBmEJ_S@X;sC
zvSXaH09dHH=kcB4+gzk-A$A>1AN{{zEwU1O!c#Mye*_z&bkRr8Duit0G#J-BaQ5uK
zi}@5a@D7lAZQ|&!NGW~aCP-=#vunn%CN9HeNB-<%#gB?KI`tZU<HxLz^j{^c53g5R
zQtKsoqa#(e+HY(PuzgR|nm_g>Jz!*N=$UfNvN%4u4xW7-9*DJAw<aWPBH6fV=~sO8
z6=uDQ0U@kg=HNmil=i3&0L0io!0}3}^T0h}!-tBHb(dvC18wM2CtL#d3?NPug|AMT
z6|CKo0^yPbVjB?q2+P-ehpAs=CYR^I>VF+#`JG{raFR%l)kuHH53%YNKw-iIfh{U_
z-{aQW7l^8*2q!3icqJFGFsWS982A(_>^yy6Kj$sL?7X2PusnIOA76K4V`Uf)hr?VI
zW6Ube>iJ<HzKVEVsjY%&($0{3_g!#V+2F##*U1S}{D@lotRZb%w2?ICGOtrvuYI(^
zX(&AM%dzUypx%F~4>2?e*$%uGN#I<vEN>>TZlI|bpfI*&g2VJarqV<VsEhK4Y#0kf
z-=+cbQHt`Xt-{=V@vSJl5*HxP>aPl)+f4lT(72&NqkI^b3_jfIoiEq<8OFgZL-N@x
z;cw6G?A{_Y76vgKVE`m{3Wkcr`AByb*HOm3alNExc=d*g`#afqnX05HCpKaZ(%&ZY
zzQyTsr<sBz!XWytGy*hXGbIPjlv{J2jkpl+oog#|JUjn^9U*Ek&eQMy2$#sL3B@*h
zZ8HU(Ql_d8ck8heE+Ib|V<o9(t4Oc|2K+_|@!_@hAZzU1P5N6mP@+RkgObU7Sa$R4
zhfmCZSTZpI!RPP59ksYfOzQ7pW~sa@qi5lG%9t(a^%yB1%iAPSMBW^xTk`?D9^Vd(
zwG}A~KR1^n+Njhb15w9~Vh?o$^{E~+BPg*CTY|z|UYqL%+afG_%xrj6KFj7>_D!6t
zmOZ9a%IE3{rD8ry;CT>0$@ki*Csyz_d3DAn8+5;XZ)Zpa!}>%Q5-vJx{5YPta;-zh
zD~5|Un|03IQg9!v*pgUp+W)6!hLA(-7P%8G^dLq_4YPdTKjmRyjA@1m<+{T(cO3i?
zbe)*5ktLT<05Yc;0~jgJ@-HTLL$se_)l}ZzMr*FrW!;pH5x<&6ZdDIn6JN?)x!nS^
zON@<-jx5`Mpnvh~!*L-)M1#Yjkpu5I$*+4|us6hU67g@(wkH<ooqv!-xbAj~SHzC!
zNY-}@C=g1X(pcFM-PV6H4SA5f%x>i8{nZ7Lo&1r6SZ#@#cVB*xmMTdwO%--rj$B%X
z4I~|wFNrHaC2|9OkqfQmv9K?$V8Q8HAmIOT_0<7U{ms{kGy+QtNOyyTbS?r?(jC%`
zbjQ-&-7MYRT_P>rjj)t-*Smh>d7j_<|L(mDckY>UX3oqfA?mwlp~OzyJ6kD0%ejD`
zsuWDGZ`edrK&8o20evm2`~dy1v(5LGw^s{y^8Q`ph<Se9pPK<Y@n4<h>#Pzjn6yyy
zr}t#0i-pDU`La*`8{Il0q=gJLZ4T(fpsLShtV9nfDqzmC3Il|KS_0j5d2h2svv+l2
zb^$cNbv-lA_Qz-+s9t0hK%H>;SMEbwah_^y=~FK5EKA3h7cLpA1ntV`8w+$|lS`P5
z&{XLT19Cqr(d?_qVK~uz+)hn-m%Cd}&3E=w4u(2tU=0qBhePPfG(|lm8UMg_8VcsT
zu#WYixX!(}g|qn1kEtlgLS3+h%Xb$jS*`E^jAu>VV&W+fOq{872-wTGSR=lw&Q7-{
zKdem2PFMFRuEzt%jbYEwoJKcYCA>P`ev<ri^ZH=NEBjz5+zEF{<V08-k6~V1pNnhc
z>6hd<R+t&B8IxUge`qEb7tdX|3gj7W#ZG6Io3ul$Q8#H=M`7#@GXrrZ)nvyz7d=b|
zz<oqc#wmdITNBddioWnSp@?`edIjSB1|>|4I@^MdJCC~RR>L5DI29;<v1#}E*SyxD
zqR*cqJ55>`6uTZQ+@V$o=Zw?r!0&ythegOI$JoyN1H-46{<7wO%B0=onc6DXRl`3_
zi}u4I9}*l4lYhKxdH=5-^oBKXO+Y396aG?eROtE9!mT4K^4?HP?rkoPzmQFKR<K^2
zF&GW%lEa)IhR$>w?S57r3?&|Cp5YuFN8oVbu6)Wj%KMJ!+o=a#c%25BJC|O3CC(8k
z&td21F7s$NNKhm4F*kA6s>5uC3_L3#e&~j^HFs6@M11{2#){(0t>6cmYHs*w4*TNJ
zT@U+*W04QCSnub>-X;|a^RH80>^*sY;>IXzWBEzr3@6KUoP{o(4U<cyj`XN=*r2-M
zwbU$ok`MIf&bH6aT<5jzp)zD!H)ftoX0Vc+r5J#md(lt+AMn)?hE^_GU#;b4q=~A2
z<-OJ@0}5%IB^H|%sDOLSRllAc7?kNhN|)s(dCvPGWdTqL3=+*YGgU)va8<>A{V(`-
zh6m#%<EHZl^6qU#Z&zh%YrB@p{QNc`Zb<v7VZ>1(SokB^&tm-pEsTnfF3jx$eXXP=
z>DEkuK&j>N%Gd+GUoJa-IZQOu*chtARNSz<z*m*dj*LxM%6d5rZ>h^?0(TYrBhyT6
z(t-m`YNRSiYHmu#?jlnbc&onAc$~ih{|XPDw!#^{VG4RQEeAC)9sMk@!zLVd{)gYM
zlWnx+djKtTC_<3&lGEJ%;gE6q!2=)`8qh%MtKmJSG?Sy-Gkf+#%mODQ!oHe(sC9Pp
zmVawUlPh!a(6eF7YvO}}n`WZ<*W8gyo%yw@?7UUU7q+gfII+c5@0tGe#MQ(XwI}2<
z@nbj?z3GPc0o!3FyI$xBW;JXiBY}R9C-K7wDgx7%z2(<{WuI%GABcQ%r*_tQ?kSRa
zw4g}17JpgChf%SRleoZLH*-AP^kIU%U^Rg0rvjq7x0PtKQv6sbv43;_KMja)+<=gQ
zppu+VBhE9u=o08?wX)0dL1#>(SQ=YTKvWcsVgrR9tXhK2xpri8rtHRKO^pe*xWe_H
z>oN^6WLg=+TRf9iZWyRBM9p7-i%+3Kx_T9JmeJ}1f>^T1uw}vwH5SM1W9?F|{jBg-
z6%en}vGXZ%Z+_^UnX8vSbD&B=$iThMr&@lk44R`)6S&jh9yf#V5pO%ZE#6{H-ghil
zi6}D4G@}CM7U{-iHNc@G#w~c<dLU!C<#B%Xw)fwh@2@_J6zoyT;1JYoh**a#tu9^e
z-1$bqD6x3wO&O4ewd|d+>(ZUOw$wlGCpS0j=TVxe&SJIkvLq3F@87<0utfSmJ(E2Y
z%bHhH_xnVx=)uDC0JF{6($9e7xT@l0j!8L9Vpk5|KoJ%5;TlqGsh~;O)OGsr0sj2y
z&{GrtG%P^lWB`C2qnzOz-!%MoxkcIF7wIknk*^=bt%lq&a4EC2c#W=AX!X)bIw(_v
z!zv6&O5uSb-Iz7GM*mcj@A+Y#(e7YYEpp8V6a(NZ_0I`s1W`CKDk(`%PfbWmcTMQp
zVjMGfd8NLBd^hNWVw2|^PyK(yG9dAelZ%LI7stWkZqaAwSP*%46ohN(JakkFenI^q
zq$G=N$KSq^Tlm+&{K13RKKu8^F1y{Rm_RM4QU^zE&edvYs=iN@W}HdQ3B?8CI1swF
zQrA`@LW|w%=ZG}#2!D=oXPul2D1`l_pq4t|atp7BMQs#}pyIvnN{e&nA6WEcfKurx
zi9pY<`_l!wIPH01Q#*kz$%W(+E(wQIf0a)DX?eYc*}kK-#-s2Nz-30}nLNQ0Fm`4D
zt6ZqE^9cc@bbok{%%A`YYlSYpTd%XLeLe(#OCYAqL*=3_Jub!1Jt6qUS%D`@z>K=r
zBU%DG$<XH4z)G$eq2Mq44zpwL%P|ix#NWsGF?jiTm2_fHYHGZzLGu+KB<vo7MCVx9
zF6_qqTAcr0J#URjYqTP;)LoTE_y?v_tsDh2Mz-lTD*n~f?)+E<gBim|ynU|qLW=Sx
zJ6)EJ$Hq%i+wqehU?K6Cznn}VilZ%c%-TllMa;O#dEV-eMce)ua`r?2uyyES1uED-
z{z2bP>ns)UO@s<K+@S(<Y(q6v&PcVbgiU#R^~y<aLR<E)FKqk(ZWu@6S^U;X|7~{e
zwR9;(EeD{gR(IfF%pgi#w<l?)M>9OiDAw#4P$gfXLLPHq$4Iti($)w?LY^}s*z8?k
zUi&Q%e~~I(>61jesy$q({J^k({vHM)TDWxQ+(4{eM~K%}3M0h#uSwl60t&4qIcR)n
z6ZsVf_z50J<9U@x#?xJR-jo%_;Z-Hl5jP&ClEjd#wL?djznx$1L?Cny=V58^dVvsK
zDTYm|2D28x9lX^01RJ0HXJLgl#^PsgUaqDwaU^st#3gwq3Eepji?~K$noqHT@tK>;
z0C95UgZ5F}`({UAi<Lur1iSUNbpQgxEJ39q_N4zV3HbZQ-5*!_@@MWWu@&O_&v9Wr
z>Jbp|sKPzIh-2D2ay`Ab;nF2imm2O`Wc}t=htmux_{yLUjIR!Oy+nx_Kh~gbk)xuZ
z+9$Nk;Nra|8aWnuN#Z?=b=QNQ@Oy*c-4;_U=bceUfTA}T#)pD8@|(h-L+1{u4y6#V
z?bZ0eJsXi+GbL^HWR<+WXjb8=*V$JgJ`ssBzl!IbbkfTwzI%#4V14(YJwy5*%j926
zZ!QoiQ(d9x(KM;T@x^dkbvIo(`gUsI1<;1`UE%$831#3AFOu4N8pPSFs~Bh_qgjt8
zIl-BONI)3?^q}AfgngLr+tNDnv~#PeN%~fmW1cloMQqbJUq(VZ{$(#`7UT$@nRjEc
zMjYO;735d%S=H=Q{P-q7`qcExa8PGR(w514|7I$tTn1X<8O(|Rrs!s>192_hrv55!
zr}EL;*CAgMNHG=G{_P7ux8`8ND0r2yKee#@!hw!DF;yc^DA!f=!l?wQfZxi8{27#>
zK2B6&EO%eZu3EWT{cGU${UYK_>L(Mul@|eYTrJlfO98bE@V3*AXYw`xf(YWhsL_m!
zs5tk&){?udX*XZPF$NMsDl<rQMU@Lw$DY-62+u6MB{+}LlS=IiCz_MZLRv&TWq+sd
z8K8k|#Jc;d-yun>fB1NFpueM8v~PWYzcOLhD!z=wuIH(yWO)g{wnNkU^Ut~xJj5eF
zn@l*_J<dZ}@h*_z%3{{{HkKMXptBelfN&=y6G?q9^~>KP`SlsBbM!T!>e^w_a2E|{
zG6E9ZGCyPly4ZJ5Ef-dfweBx_%JASYSsgWfXDNb<VqTaB*OUp@eDZ2^`?A&)bfC|6
zkJm9s2s@eKb~B8;_Njepvmv9#tk@my?nTFw;AKt}tnFRy=?NF22H($stGgzTRQw8k
z7lX(U^H;EB`L^Y#z#{3-a21m|8B4mdxaV!W-x6Qzievat8FkOshxO>eA2@}c(hHge
z`MlxIhLgt*I&ogu(b=r9&ElZvr3GiF9+xiytn-kk!F?0ahBdv+`B^eD1I{rd-rt{<
z=y$>o%&}TzYPr3R7Z=p=$CQ8v-oxURJ-_}bd;OK)NnuTXOw+Cbp6ti1vPvtA%X&T_
zf2AzhODXzAd=`cvo3DtFE%?D2u;}b&LmfT1-?Zc1^A^CGq2kR@_teUE0h8Dtiy@LV
z;ifdl1_<K*RKY7smi!+7j{zi>a3KEnRn%Fsl&OD&6AzEyCnWVoaE(Pq@29uW4p)FD
zpelkwH}*e0<2*#*)ui`To6VCuNCn)*2rW9{d`&cqpdoeifSJCA`b|d{RcnL^pfd{Y
zQ7Ugapxm*kL4~o|{qw(kWrLNhTG1!x09AXWzGI{5<a<oJGR{CKb*Z6x1k}F^Lc=hN
z<6Gc@p9gK(URG0UO0Q?R7*gTZLarB^%E{^RCK3efKH@^uRm6YQqLo3oV--a7sbp-z
z>yRhjoLL0iQLh&AG9M@s!K_9Kr*%r|6tc1v!edibp;ZaQe%vpGRY#*+*S#i3pU^81
zGD2nKUoH6aul7QJy{@us{dcQNv{s~D^rKJ5J0PRTW!^BtBD44<(sI9u&Oy_8=vuz@
zi*5Q1jdj(<&}Cw|+*XG+$U4lGQi}79qi|2H-tWDESD!A^_w@`*U~tp>?JyHCXb^<;
zIS3i^6n4)Wl;{+I%#YAaGW^@~CX_e!vyBSQ3^>G_nmADf6UCH0`OxA`v@_J&hm0K0
zf*s8^66+n#ukg!Xl0&!$M%7!r#$V1?8n<z=TP2)sa$T%L)}qZ-oXC;0!$p)3WKpCc
z7Y&}3|MhX#%MI?ElKbWbR;)fP3pZywAF7GiGjGHjXUMND4C1Wy#e}A@!5LbLWI<yd
z!<z5R{o?7B>h)$AZQ`77M4~#lw6^cbrApf+wZrWHeR6+MG&k53Xv3?3s&~D7>fK~Z
zOA&-yRtGnG!$)-B9dRG)8Y9%-O37%B<D;N~qK%I2lUzjclts1fOUMVd4IR0Bm9Ixy
ztI_@nLl2$HIl~<^tja|!^ij?cSiKb6D7}FChJk9A=Pla5Ro}nDe})`d6<WQVErR+?
z)R4-6H!RTkFA;hD&ps6@GGCf78kjd!duFZ!AjZ~-kS<*4lo{tBJ+7jJW+3|hqG~9y
zs%j0zPj#w)dFIbP#O|vcvyE6n9vAoYUku`L>O1z#dk|sL9|w+1%s7>At}bA<Uum*l
zw1wCB7IY>#$Z}<^pKo?&V{RVgTjw6=D}?sxb{pD~5zu#XJngcFL}B@8KdZA%N<IlZ
zV+AfLA4b*qSIj56AihV2c^%(v$ETF`=y@=z5o^gEY@16X&Gs7(lq(fD+!2d9;+I>@
z_s6PvPYNSHqGE)^B9!^Wx;zkSv8SrkK{`CV(o&DOX&f$?{zMZ{Ho$B6(zTZ?(9OVu
z$Wrg8hNYcI&j>Vof2m-unzK&}HdfBJq*@zvNIL!eTFw4_lI!*Kp0G$a;f+`4tpjWp
z-4pXo`qVgrHf3S4tFem_MK(p>kO<i~oXkEf{O>#o-@)90QVzsP4IGo%y8_Z}vb4)h
z9H;r(C5&Hf*4l+qkVTXg??SC+a_x_!<rs*2)$G9xoZHc`OMA|S+HeI_xU=Us11w?X
z42i2p@iyh1Wh%$2wihbz31Ry)0(+y>_FrIp)C0X0bu`Cj#^*cZUpV)29h=eO((mA>
z?=BwWZ1>7zQKzB<gOwJ}u$!_UPuY3p77I8%UTZM5DsasNCWmhKU3_Aq_`DUruZ*8{
zqWHgrlm`}5y}X8yq?5w_PMG6j?0jVO72H<*r`AuRd{SIe{G`=cT&B>D2AE}xM_yy3
zR7g70SP^#eMYbywUG$y>IZ#f-HLuf`$__j|7D^sfojXan|7r@Ivhxtwxr2?<mEAkR
z=qcvJ+P*M>&ph(a{)oS7c&H%ZYT2lIltY$zB{9`d<OUV+o|<~#iKTqVTzMmbH|J~l
zkU}F(s<c?sM99{Q>50rn8RPiVOU4mOOf>~qrW@`s4v1aDjDG;nj9Q~<Zf!huP%+mC
zS{itGS=SBnc<5@nUeoPnUK;pN?>xDd^s`mpdJI?*2A0xR0KG*V&I(}i7=QU_YtKza
z6AXMC6R?*$;bG#nH~Jv3Cosa-b(0E162hDy*d6EtFu2*nr0#}Id4<Kz0U~Hn3xu0Y
z7QG202t>JTux@P=y2ntPiRA;MtuoA`5y0SfvU2bfq^Rv1_n*Gad7=MC)8Y2@ncc~?
z>&JJuD+crzGpyC0N98=tfP5C_N7w+SzI;bDa2tnLslc0w!SAs5aor<x!Di7)1?ZEW
z@+GQG67KSBhga4kHfhl}T{f7gE`z_XXm}<)e=P%9_M$T$Lgi*3d6tmiX;Qn{sDDyw
z!AgO%izWJXQAIwTg-fs1ru<B4Q+m?D_!XveDTC7)t!(+L)6TJ*7#D26<^@y{r?6*n
zLI|UC(~Y};DLSN<n_sJ&ygJzaFlOMB4vxxSqvG}Ztwea`JQ-%UqBX;e?cg46KD*j<
ztWAIczuNN1yMK}*gd+WwlFZ~#3x=iD1z6{c5O9ayCJ?f{ScJ^dsEhb$|E#$k$k*XT
z6l_#9{Au4dhELDOiH!P^V_&D?EUVyT1Z#lR12;=ZEL-|URB~NyZu#WG<O;ONSPjed
zH1$MF(xrDbf|7l*>+$eyFtKEuPQaF$zgJ7=cAZn;rgoikshWa*{0>ykc8a)6R)5&K
z=QYxGP}_h9F%E^53bq~}9bu`wAddQ|s#+C#Y*OFJ<EbT(q{)>HP8aIE=)WGNlJP}i
zeGPrag8ZO0?=GL)EQC4Y^k3r}@S&WSSypU7B?x7D{zmTS2<1?;L4&CA$eLKYgtuK2
zcZBj=i#HCoi5kTj`Gbo<!YZ6(_CFU&={^(n?V5;AR%?CY*6^L3&^x&rM?kDCM-NfE
zebqC!?%5q!+iieuFr-NGm<c1;Kj(mir4IxB9q-e+_4-Tohf<C2<XfM(8jte_XkdME
zx`fWcdq@rzaJ4_YL*fX(S*4!+y=WmweUx@QK76c&_97|9EJf)PZ5G1YWYv1*SB8K8
zSw<OkQt2v?;0Y`^G-xHWfrtqy6Y^a?dUGF)D{T=bf%iN)5#myFobU;&5R0kuK|S>)
zGi)U9gHecJI%i+L79#tocxn1->t(M7^iPU{1u@ywU5E}3LOSoF$+LD!O@%^VGHF8G
zRRoImOjow!%v}j4!CUNm&t@0_sBTEqL{E+lKNP(jYLS6K)UBh9EWhmX1wJ)k7)>Xc
z1$yan(1L@L5=WlkrF&-^kv(y*%#xXGNla_n{XWdp;Kf{VZ-`872n~q2M_<8|xj5B5
zLlB&BLxovy62+%?;vp<5T~*%vFL^f}%>qUt{P!YoN$0bRTzbSdER#=LUfF%|iNz8x
zeEVN-7$Q&1A0)?Ln0uzuzyqA?M5{u&^pj``eZOf4#IxycMHf6=rGu*w!RR|3C@<T&
zt{!(2(QYWuaJ{2n9ZNb_`fYF8-A6cbyGUl96x=pV)zVpQ>px@PK7Bc*iZ^bZyB1t0
zU3a$4xr4u$;Bl!8TV*lYkiyM2j9+CLevbK-tUf+R0kq;{{`%+nLgVZk2U7oow@{kl
z#$FiJ>B)6dy~HMb0@Y#3;=mjpqtIX>__6F#Dm64E^6#jOvW0o{^7ia_MjD*oL~GB>
zr1T`cuRRArCPuuv1M36?0Rk$!Z|2Ig_TEhLxCsIZV83n<u|%i*`Swpe#DrkohUf8H
zS5<(9JFD&+W)fxC-c~6N3-H+egZO$+!0aY+)zq(&MbmOh0!v%11A)K{Fmds-A=#v>
zOVjNQ0ZV^txAxXh&8fwg4xgxGfBdh=tFl3~Mp%1P9RQD+yb=&s?&GLi1EFe**;BZR
zEkc$Ym@2~nMrI=f<6Ay-h3M0@in};gN6CxMQ~JP*^jX8t$N?bUWO{*+^hw7l{fF^f
zaH<7Ch#<yu&cRn4y2!}GenI?CPx8S+)Q_0<E@LmLrR?hTa+Q593|B2)q%QkhQkN|3
zNsFsNkdM+?iVxxwy=9vXV&Z;%aRK4GKffILAn4?Mz(J{k&@SVxs3e&0`_eR^KU|8P
zPgzTfCs$662o(k8#=mXS69~U~sA<Hl)8Th8g3s-!{^k-KJZIstcRI1@>e4dx^W>ZN
z4P`s>e!x+Gs=9%?HV*UqLxGQvuKWIQKgu*L8bLXqrMwC#!dzR!To9LMr@+|>ee8ux
zMb}}j6(PF04KwB2^YTo5;xr^+!RIW5`I*wLDT=L)+VK{$>&JuWJl{aI%#v?ue?Mku
zf`LSJbGoO=VXe;ACqwTeJ8gFAA#;TO{L&A#jsFW$Dc$-MF&qnM5Lce8kZy}0=z@IA
zbuqzew!eXA6Vr9#7*K(BpWI$i*;ZhNS&LPl-Q4~oV_(r}ux=qLB~L_JzpO*=DcV;8
zz0U5di*%x+z^M_t#hJP*@;ho|`>K>fno<);*?ZV2n|1+nk)pX$rL=e%eP1le4|${S
z{BPPim31qtIcO3mhxcm`RK9csJ}Uz+KY4SxP1qjN3so~=V)Z#kn~7auVD<(ttTqQA
zv4Z+Ye;T+f9i#2#d44YEcy536cs2wWaD-duq8AFA<rJ^JXUaQq$`=xKtXUC!n8X2|
zBxPGwegRruXwJG~b@ko6Pl9m-t`j};E#-xt&lqJghVHhGh{JEiGxD#K{%)ZAA@`o&
zEi{?*j?$g=g|2BGz{&HY4IA0LIviypjxG(t^>Nw=Y+Wds7jyKLrc%%V_Pb)^cNW{$
z&U^B7o6VMpVR0{(tmC*}rX$$(ERAXqnaGm@IDB$Sh;PI2>NQblc~8~-@S6b%fl2zC
z=BA5>-{x3X2-gBg6S|kc)$_p9FUDLfD=~8q&+M0jxO<h91bH@NPfO(M(K$mf4}4b0
zT6lyEwt5_0FtfG52dbwft2&!=8r|RqNNc-J7tNB5H_r3ER`cuEir?GF69s>B>*;AU
zk8^13?BxQ<e;e$DDbb2MVJ|T!cKm2ud~znaUN(G*RoZnM`Ry`Xt2X7txjJu0_jd;Q
zE0ghJop#B`;Y>C|?Fz5YaQz8|021YkED6P-S+t&uT%?_`%w&*_MQF_Zm33G_+rGlR
zcHbhNwg5Nr=<2mU?tWgxJezotwmJr(1I6A@5b!J^fGxv4yRUk?-G4`G{dJcV;S<CJ
zEPlPvTWg0Ad&du@b;gBAi?_$kCf@+Xhs{cz;|qcE85l1qA7m?e(Xq5?LVciz`M|g$
ze-SEkEhdm|C{F)XG38MhaQb;*J=+o*+2pK#VJoJOuaW3cq2OXfv-D)aL~TZ6dZKY%
z>PHi`s7^47(P6fTrOL<z#nTQ-QZ3oCe$1<LH+Rh*!@S$dTW6d}_bYiQTavHudt%-L
zX#jL58{KqWXSGnmV<g`pM2mdry6@!y(hS><>mZVeL#4oUw9Pb=+08G%ApF88$h1cC
zk|P{hu&lF@q2?gA@?6FA8rGZ+kC2N~h)*doyDqmv(RE98esiPF>^_%R{vk1T8{8yI
zS=P@771H((t4I>$M5sC$U&YcOgj63>$l&kMIhWC50lNDf_@bk8LWtNk8<pt)?gB`S
zZgUJrVh%)N#^NhvqfdYApLanPk``0Y6Ag4mOhb*wpX%yFkZD|U56!TMzZ(4Fii0m-
zw^BHVf|yWmcVq4<`NjAe39hG0({?TK*89?rI?T4BaC{d0gnMIn=Ovb4<BhMW7(fLx
zwmZCrSQU3A`OyTGzh<co8FwXky=J$KN*Z^=Gf`CTQ<s27EwQOQHb8Efa9Gd<aZ|Cv
z+kSxJmfs9u3tcr_Kksz#W?!i3VE6d=`f>b6pBrt+I4eykrEIWu+rIhJM>tT*U+mNw
z>yCe<-p#Qyvf^Lf{8xze15VHBMI4<5?gB<YcE?pCx(bnD#j_jhVWG<r4F3&85uh)e
zx&&mvbMju;Q8pGGAXo4p>}Vf_4pbUH1wjVC2SZ489RxRmmlL6h&k&jcFS|GJ^@GuY
z(Mk8de7tWcS_>8s9%E2M>xJb0#zbKjB*zud)9C3(qxi6{iGp*hbc^kIhp`P(K<e$1
zg|~te3w6YnMk$Zf-bxgbxyMcaiNO|~|M-Wy&YEsx970<bW>Xmdbm6e94QhfFVba21
zy5-v>Wj6s0p5FD|iLT4<Ex<fWaY`~9+~gZe0=>Y1cNp@_k@X@KY5*Hr3Lk?_sd2BG
z1EpJ`=EX-fh~7!}i^RilkP519HwKsWDk9nz#)^1}+xDDrbpcC>DnK(+v<va&wO>U1
zic7zbs$q$?OD<jaMHJfrjvjN!Jc>exw`_S<)JsDQX%!bxh`wD39q)7{BhgZ^MoTjA
z(ZyKXvdnt=+2eAES4Q0oT$S(woCjZJ&j`r~Y*7dE4dYC%8yB2Qdw6y{nvSy=-}3VO
z5;(XrnzomBp^Gw;>N)0J2FNBWDThJKSt^B8VdJd8F9mW<yW{yX2ou%?%?VVY$PO%y
z=0Rp>;N?IcB8OSZ&$!s@jxcMnpwO7m^&_*{4Nuj`aJx*2C!W1rjZ^gkAMR}-yJy~~
z$gSnyAR3;FRI0ilES0vZgmpiXl#}I`54^!<=??|Bv7cHx9iI18VY7~hN%$z<HK37f
zdCJEO@xrkOPz*#GH)#teP<U&fT4v0P&5An*@jT`c(}j<kGTCqgXvX>6K7V}TP9yrh
z*MsY@UQ2R&;)S@OiP6D0e?Yw|`w&IGa%&J5D_4D7<?@&mTv;;hqjQ#AdTU)fgELZ<
z8txt{PfgTCH2*iY{RNFV3?TdjOC`8~)5MZz61j+3qZ2#pVjsB<<1{N7PQ3qq(mzIc
zij=?5&|UH{10A5d3VFk<jeyIS-!y;gi!wAIxPhPv2>Ja`g5J!|Qw{Mp-MjNeSu+>h
z5S?WZ+~{@C2X=7@cC)L3(3_{#?*OaWO8;<0GnlG$)Jk!TW5!YkZ=~|dBwiNrSgJHt
z=T@0m9ZMb|b;SC8uX;R$x<yZ+%c>Mo<hG>tLc1D?<c%pK-p=U3GTT5r=0RYs+^jiy
zBT+IG=u^89ujsB8F&Xc~Sf<!leN(Kii%Sy$td$G$mV=lb8to#P1|ioi@Pv&=ph7|;
z?HP2$BfP|Os>quLPu&ED6NLf=SJ0OQu6=54JM;j>FUWy94&HUB2CD&{=~YFf+4IUv
zljD0OJXI|MT?_LF$B)mAs~58B_01-TY?As~fstlp&%C0!pkp*E0<E@0C70e{N$cch
z@gRDo)2HiJ{LWiZ*~_^Itbt!BAvUzMm08{-lNzcaMq+F2u#DfyMayOE#%oK&xowZj
zF3(rl#>~>STTcfGNiTCqm^*|4t0k3Kskpt2A7%6DdRv3#54?WdP-c`)SaN(Nzqi)G
zpOr?eNqX$#<aV9GU(4?eG3Ra+Q#UbALq!6!DC>zo-=QE%&+wc-#uyUP!k+h2W-^jN
z(mjgCoq>5yNpgT5h>lB@QNI{AKWQorC%?>o?K#asNbQj^+$KL_OMS8f5ufG2#~qNJ
zi^Fq}la@6@CrvuUs|BOa3iBjB8a<@|mE%yD*~Xx_RezuAc9l0MY&KH^C?&ahd2`}3
zzt2wj`&mG>nKCxNBe-@M<wUey!N=52>f^QrHsOFf{~2=!`6DYvM{TaEu3=1w;og#x
zQ!NtxGKs&@2?@c9W*z((VFZKjA(jqfK2|N3>DE&YOS~G(f#T2~3HL9^Uit~Q%GKNw
zO0tjgB!u0vS?D2m99Bj%xhXunup}pBNYUvq@J#&Y<8SF<r2G%e?t6MXpV<Zbk*juT
zNkR(cE?;L=ltPt=LkT)ikMWw!{go7GGir^-t6MQ(D%k9)Y#ag1z<l}IMcC#H1X;_z
z?}jO-rTZI_-6b~6FVd_Mh^(!T1o4!!D<4UAl!cC`WxbAwwLjz_-tHSCO}shKJ<Cp!
zv~ga>^=1`##)xom3F-<AO~ZBr3oiEC4syzDxKhUy$~Tyrxdu6yGL5j%Fb+PomUhE_
zQ)4LhrD1$Tdp<srx*TJjw0L6XOcy2<p{umC9A^jR0dxAqMr<z18I^#X;QGg#+m|_g
zox7z6f3}3_xu_afvV;KE;q<q}%Ot;lCs3ym?EocG4??^g`M7>zu|iMQlZB1;x<2h3
z%aP)KQ5}g*Ovb}hVZv8%2n~qgHtk*c8U)djw&f73(BpMUkrM7~;`+X#siO$`x_l?y
zc?uI?SnFgbU34;G(z0rE*Ku=tL35Gr?z*Pnz#-={jA5t)20a_z?U}thFI#PE)c`zM
z^sQfdqsR+|yC^c8pOyD;hMtlgP(1uHmc8y;fkYix>J)yzFa)&*NKy^HN$NIxvzd6n
z;kj4?Tcl~7U3ZE|m5s@e>?(3H!mgVVeN$kewQw+@HiI#~&QW7$)BFCU<?=1~a{>{C
zCH6^Z^}z#;ZC@%s+TJGcA@TI%$hVB^_ncRLojzHW0)^@9#+5-iL!-r6RF8*Wk}ht=
zbhI|z@dY@17T>PiD!RF=be>-gotExxJEJ7_q9>B_s>#mLVH(4y4_uQ~$O)N~*eyu8
zbQu=%Hf_P6h@`zE`}E6sJxDtBUm^;Xtb{Oi+7F5W(ZQ;R@SyeVx;}!JW3+`f=Q=Bq
z->WM({<q3JYv(&hulY0n=B}@)S&I=*8wLg4c_^e`Iutj>>ZjXw?}RVg3IqXOVh_Ug
z1}*47@j9=N#_0Utk74LFy=9V5JiQ3uIF>sA-9ToarDw=4&%oaSA58tzinXICM0cV3
zJ4ef{bw^RajM&WJdb}k(3`oa@=Lzl2sx%AJBLa2#*5Wr0r^Tz3?)zfthW^9sk<@N|
zN>)W3p_^T}v*;tk*11Z1fcL27=z4jmuA^-@HAa`fJS&Jogp`rc$v^OC7_ps?GVt&h
zQEs{e>8MjUo;My017ej`@acghzgPB^1`FnF%)?pRr${a}AHpoZq>NneQ{2gDH5sHA
z7o-fcF_KI^CH<j}G>0ohiOZMagkeRZUIr<WCB+?ByITJ@^d7PUS1o5QgEv+Xm%cZk
zx+QXh<xH9-fmKG`Q>u`U-~6Y*sl0t#4|B#pV#2IRswidin`0>M6~|_ol>krScM-&;
z8m>6e^S%stOqFm|rv6^ZbCN3|Cf;9921Lpm^nSE><MXEsdn9?8s$AZ@4^XRZAiJhL
z9Mcm2dgL=I6GW?lff~=g(;oW#<hI?f^{o>PV;Y(HIScPLI#YnSS3D%<vlz=I|7X{8
zza~fUCp+=0qX$w~+@`S(l=U|JgZJzz*X=%fP4AczTUUGgQV7#BIcXYiSUH(&y6wDc
zjr^bPdKd1c?qSmdp6X-@5y_sbadA-_iTRFIC#-HgbRIkEW)shs#5`>VppWlg^n>Q?
zZ>%6JGgvBh@MBB!Yao&KJkL#v%}u~;OT_elRrUsXH~FEUy8wkWIu|_d@g)P5An-y*
zX`S{AWad_aKDpGcO#;4iErHp8V}S<`I<WBS&dd_&o?;RSY5MqY8PMV(%0@AzxY+Vb
z^GCYxwYca=%#YauY!Xz<IlRpje1&9qZzW-_4p6lUo-%K{G|s~5-@{C;YVh*kc^=s?
zt{Tktv*6_f_7>LEK&k9c3eekhi9HS1_mlA^*+HRm@C}`pnW>YjqNmoA*B&dbUm6Dz
z1m!Lv5xMtr^|p-_bJ><d^pkLFEa0|pVMv}vFtG(g7X5km(SF6Mg1X9SVlxNVzm8g<
z0c|BHI5$Vujo2e0T}%BLK*tMbXfYkS^uoMY99Ju!1e`duV#^DHM1FC0<xN0f>JxKN
z`kD29>}pRQ{8{<glfJUK*T@2#dcdE4P&{Fk-zi0l(&O*u{(U{{?6Fk!WC`=QbId?W
z9Pw-8<Qo)kZW8I}DWU0?a8(jXs)YdGCq`cRqTKo%co<iy%dU}Zl`9x&cgFqzxM3{|
zvYdHI$Av*Nu;dg_Z>VD&)$X=aH$fIDY~h{<#MQQbPeq%41YJTC^05UK#;r-p))9cM
zu@g*T%VmA~Ys}tqA3=IULAah}OoA3AY1c7CD1dYEMjWQHt3D(x>}xX_^AM_<)47=A
zDMor>GZKdf2LEB{P~jJ0Gf6y*MY+t@Ye(R_CEAa;%G(b$3RZwaaSm#X4`hb|Bda+=
zwi}}(o#Em)v=$%I_|r;&Y%?ZE8Nv3ItsQSgRUh(<Z>%DTdd!_&)<$lhg)F_T*DgNZ
z6-d@GF!y`!$??OjKh+$2Xu)hIKD71i<>MLLp+_*oN2|GCzSm|^+G9@Xtj|+)Ht+x6
z)cqbFRCZOr;WNr`F@LaYbM0gEtW`LKIfj#r;zPu{yxCfJ2#u+~WzQc-eUl_S<xAuy
zM0nda9=sqDi6_{fB=K1eAPBUdyrtX};~-(-l(_|HwT<0vBs-syHD*cw{hs7x3b(k9
zaU$n;+!hkHQF<0i&!(K_1DvuThlO-_sH89z-UbU_Ln+SufA((ao2M<!Y1AM9S{!`Z
zZ|7D5zs|H#CI^I4azR-Kw5X18f8RSXKH#?Z+@|cpj*(8|!o)myck9x^VjI;zc;0`N
zl0eNXpZR%+<ayr}o%$O9x}ibb##tR%KA|+2BS5s(s_!!*i1e-IZlDN&hKbKCm^(G*
zZArjMk5Xg@tLMH!zhh@7;yMZvy?DMRI7ZT0mm3gwEJ}CVQZqEQ7w?3eOEjz$cms=r
zDTSNmt{pBt2lOzCmsNb1YzC%$Ba&h>C52gI#wg|YB~R|e$vW`IvKN)LSrhL|W_h7D
zBAY6QJY|11>kz3RgiH&KO*iK(vX^k>ix@}2HyElG|1_v+NhCl0tt^M;I8IaE3%VK7
zlf*{>34Yn%dB|trtB-Ggm6_M32N8+mGtUOWPBJ;-8<Tcg>?>y$))<vA#6MtUTYQ)H
zQP?J68!N~a%om*@Lg4tyhYYK^!rajZgi{8{wV?K*lmoW!2v>Kh2oHJB7eTK|?f0;i
zX|0}JQlghD7OK!(yJtYylf!cRy3|TzQoMW4uFAF9#aW>jW+}7)HDy;3!yq)Rm2PUa
zbUti3^6C7?+vo)AWTt!Qx!F<)6kW9L-bpdCv*wsECcKH`A^iv^h0sc$aPD^#e_!BB
z_19}G4(Ypzsv_K~3=pC2W%@;_$@i$^rL-Feh@5dqiC<=S>nc^#IgFTk677LFYPPfs
z&4;qca01W3;|F*I*+eBzo92tC6=(8)C6mlXHPX($R089-KB;Tk%n)6!`QJY)GTr-n
z7lYn<Pqijn`;(`M!I5_;3DEq9AAmse-Fjt4hi_nH$AitF=S%iDc>=({bd*YV#LA6h
z(FL@~?*Nw8J8ua8u59&FRIAM_SCf^h*N(%vP!3?|<NT<TEu|oLX;yl*kX-Qs_<8GH
z4>~P<T*rkc<~^!Babj;ri>BEA^(g)V(<2#2XfAPRvt)+>y<7)1>x`}>9DG;wF_ML-
zAoa#9g2{9<<aa2!_F`=y1YBEj@m#(!(_c>4OmsZa*Uyep4bI@6<%5Z(8up_yVZ-Q}
zFCTV;?+va-#cNuP6l81+%6jmtrf0#yy6roPKxSEyWM^M3atK|Uc{nzc*_aRY>d71d
ze9~ZvSYonJNk8o^>EL2i5c2Fg|K-Hvv20kg9;BeBH)GaIE2Pg&1M?aVyy3Hg`9T&w
zm5N&p*1#5R+4Bbzhe7Buz&R~dKb~^3$=)_b4DNS1Gd@`KH-PU_0Vjaz{Bz)T5sm#b
zX0P28!IS34^*U&a=)$KN=HuIv0sE4H_%^5-!MM>kq|>4?(+7xi^f!s89sj^GbMi3C
z^!NRwGG&3(6IZEN#ssTkZ|HUf-!8FQI6%no<k9qX^LhVWv~REtOU~2+Z~EbnknupT
zWVw2=nd%3-&Q|FWC<^e%OvI>}pMdPn{iYeV_@jFjy}0B(N$fhjR9^Hl9IR`5JbgX&
z7BxGTvjo#2$xh>*9<L`maFP{1zw;A$fdyA(6_8w0uy&Y*elJ~K)!`nu__*#>gT0Mi
z5uSd=D{fV6Ub(`FAo_VJ>87nIggRJ0X5EpffqbfTuX$2=S~ED>xy9z1z|+R1A;gzU
zOIvP!<6<Q`@@3Phpx^C3nrVXqgY_1UF{CnS4<W1CTkz4YTm{Qgez!|h*VjpGZh%^6
zyERW%nXM}`^q-~;6h+^Kz{bIE<skH?h=H#7rvS_?aCockHp5K+AS^o4qgcE7Q@_c#
zJ%x`?a@Bbmc+I;r5lW$EgIwH*R`cjn6Lz!@tAvfU=}4PD6r$p?E1Ti4j_{c$vREhz
z-a{LWr?H5Vw}q2H3hmgZ2(l$nq1+b&yyb0<2E()Xkr3|>+ZDa?C_=eD731|F8qw8w
z2xa3(X8cRoc419XfPpS93+bA8!LKfs8&M3;@pb<^yoL6~jheyWkiEW5ch#Yz0s!-V
z*}zqJ$XcOhvc?+$<!9uUd5mLi#l33_N3}p%eTH6I#x2hu`JV6TZYGW{0D7=&A~D}!
zkB8$*sBtDv1JlYJskbPTH()mBpm*?ZjY4F;v@Qh#tXbE;yab$RA+F1idLEjsP_9*Z
zq5FmF+)rF@4xY6L&zDUQ&tRBF(Z610{u2CcvV+P_P$AH5QncAjZ?T$wCs6g!eobTL
zj8_mlqCBIK+<9TjuRV8P#M6j`7xsL}!e|X*#oxeJaCFI!SOHOVf*YSEgPMozKxEDN
z1ybBT9+pTS`j9xrzwL!SAa@j1UR{PWx|Xt^v<+S;Y)Uq2_IK2OnzoqU?WqH;%FN>c
z1QZ*Fw4<8`>Pw7d>pP9Yt(^b_lSm&h&h(qGc|<ZIwjYIeSX!YIdHQNB&pICw1XQ}Z
zZtV~^2*%v6q&C+z<Hz<xJS<N=#eL2qb#6R_&F}=*w;U8Ak_?kw{y+7D2a7z)Lp_V}
zWTKI}%5wHnU)vKD8~x5^C-ylg!^fzDuEQ`GhF7ZqBFQxPU$QDp4OQ2m$AhzlzmXCO
ztqE1GvRt2!DyucDbdILd9E?g)kfDVFN};#PFzV-$LNxxNcaQ6MPVdD(Vg;sBt7Qg<
z_Q&2+=dzN$PdN(Vfj01R<PnX`ORDUFe3vICj7)<8_2mkDHqp+RR?GB+qzl~PT%#UF
zjFVa>hCS9{5E;(cIIDW1q9}Qo3g@O0tV~!nFvs4FQ%bek$^L-`kqo&V3xKL^vSItX
z$g6x56d|JTn8xSO{+>1qhg6~R(F>~p(*e#TbmI|&GhYIsHkZVT9^)c4oXh9+Ml;-`
z3yS-IQO*+bcdKk2{=6!CDN65M4;c1&`x|~E)%Bt{w=T%ti>+3XHNV@<?^JaC+K(vk
z*~z?rQIO&%y~PSM=t4qy7_R|OvjY>c;(BPfHQD5r5bJtut*&%?e+r{5s$SgFMWPiw
zFGd=3=(b&+ZTqu4cE~_LT)fxYp7DNyY$1gQ&(evu9NlcjO`hRG+(NRTl8s5B6jCa`
z!9Gt!ky`Gz0{NlSkV6Q?tgLc2L(fgZ>SorM=+VIr;hc{kN1D!n8L4PO2(3s_sKP8T
zlviud5^Q|$W;YJ=I8eL<QN$r|B&PYGV$yMeWgY(uX@`x3<r)%!vE?r2*dbN$A>)QZ
znZi1CZ6Crp)c6qEc<^BIqE9smycZ;s?okBQDzz~WRoh@jF0n8@T=sUUI4Q5K7GTsa
zUVm=IJt>nrNn7}&-TR-K4l78IG=@4Z^M%PvgIm6Uz)$iK&O&%)mbk)QVP;v_SN#7g
z0ZGCzcJqp|Xj$q58OqVmvW<c-$QqyN4&Tg$pov;}<@dwk4%v<VeR%<l1$xV3S&R;L
z&v+}lnhLRk(+Tnr=3I8`u&9c?IFNYGqP3D1EdMdYBl&3%IYP&9s+{clT^toZT|^(z
zor!mk3-TA1&62@^#_d1On|t3^)GP7z*{)0p*{Z<hd=N{ydK*9*Lm()0X7&KpAwJ+$
zIQ0QW%~<meEtYF#pZ`u3?(^!g3ov<N3{O}8RO;jET>+LNEUZY7JJ&3=)@13T4N}s*
zg=6~Ir{IE7v$~byr#O>zrx=&?B&!v`4&H-j*@~sJPyo^7?l(r%-W;VS*J$ZFl-(Un
zR2)3xG2d@~R(`TEz-5(88!TXW^5gjL##2w@^=YP%hb2rE@Hx3}`I1b!KIvQa3_7up
zI*Shx6Q=uQA9~20@xLdV67FE2iwT<9{v^KIPL7@ljWXl-<5PJW|1^?5R*_~F?ajQL
zw0(u;VR7h9LS&L|m7&&!xzdp8u`c`UaeH`4gj2CM6HH0ExvXM9Cs6aLJ<2uP^2P`*
zcQ#JtmS{#_XI#_g8EE~fT3MgtB~}rMeAP(wOCmk>C?9@SiQ-JaN+@?1j7Z`|HaZPG
zOHLSKY1}Vvr(S#h0;wNYqSPs<#BGAjZ{Y5Q4`?Yxsb?b;fJ2VtI?;P52B87#k9(i`
z-?m7`aZF;98Rhl#D0jS$)hbtE(vlw=M~P(yJ7Q=FEqr!PSuGud(Ktp+F1E{LeAszp
z*(43$UmXsdHjHi*sbKLkrB*-{pnkr@o&451w+dK35*{MHDEIu_V^~<K_RvU7D;OjB
zVbLAdd#_JSb8*nCg!jt;(tnaM)Fh(-!+zyPKGs8V5T-Ii>PP5aJk27Z$s)*F5oU(E
z$;-J#7f`T+QTIPOC|P)_o9Mp#-Q5wspyzFkw?{7eg6&s;peI5*!rx5MQ95U(bM5zC
zDIyj>b>k3zojlibG&A)FCqK%@*PltP(w(rrKX*QPuXf=Y_Ntxab)eQDVAY0R#OZd!
zHr{>;Wd`lR@W>doEI9vp&xF^pd})v!GC@Ts?!}P#yHhjKs|JtX(Sq{u4ZFuZ0p@v3
z6W%kq+PW1&Bo(_e3glm^(v99tTsfR^3rdA%jr+)Q;ZuID>G9UHC|LuyUy3)40LC=j
zgO>L<dSr@5m@s?k8iuTeJBLK|Rd5hXaN&FDpL&kQXweVnGLra;FT~AWSc+H2g(+!S
z03jh!wdnZb{De^e@y&@D{ZKZC@s*N59bS=jjIWDN1JR3KxYED{|Kt!zErK}gG9F$A
zU5CC$20<>CENMzDQV3ZJCK(}OA@6x=Z?b8N%psggYM?naWyvZwxP{1^WS`X9fAEJt
z^}NEGZWTEWDJ5F|gAREQnt~Xrx7twpaRsuUnZrvhi6i^)>iwdzPcSR3mQI0SN$AvA
z>IXbFts7X)9p|=LS9w*ObE;M5`g52vufkHuFY+!&OR?+ceWHhP<SpZrSsu?r^?Q~v
zxhyY&@h6#NHRorT40*@KA&Dxw14@Rs{3(;CEfmRl-9(pBQC_+(IX{RcQNh5<G~f^n
zLcn}JpO#YVko#9tf6KXN=u0CBsXW%F^$46(tP3R`()?IYj<={bds-^pDd8AZGUT!Q
z+suoyq3?-n?{#}mP%DZxsP92KDj`oipNnArfM$eb@n@`jc#?#o#$_m69W}WrCaxpm
z6S>MCV)k#XeIo?(UAXu$7RRI;RZ*)E|FrboNr(ylLqhaEp6r{jA_x)(Wn<buk{fCc
z0=)*3n!k!n3?BXmsW&LFTnleGmy3B@*=kmCW-^$RbgX!i-!j51kFu<ExDpXMJ`i1=
zeG<|f*YWnirS6>B4<C=14;g{d0B&+vQD)h~yuD46x%0)8>Z(Maw6nvC8u`S%44v3F
z6ZqF&XXauF6oYNbILlGe_etGa&Uu%ja|nmtH0!a?4)%wq4So=d8lOd8IC*ifZi@k!
zt+dk~YdBcBX=fI;YQ1jf_)U0luqX0TtBR|#;NsV9KUcPf!Pu#2>|hx`+5Y~AFRf;k
z{d@O_Y|CEo0tR&ayyE@NtHoppF+lVX8Z(luqH_k7Jn^>Ai1p}S`z(XQU?naa3IgpB
zx9SpOq{tU~+{wiC1+V=|Ld+?#xk#EG#*r`wr^S*wMRggzWu<q`JsDGVX7ll{I2u>0
zJn@?%!QzR&pg;A3RcXHX0r`J0k-xW9By5-lWKF^Uny7={yL9i598av$fo!8*cy`V@
zo>R_wXXUcFF%&|%qdI!1LGf6xTWZP0ZWi7r#)$%ypGt*eBblnNb6bazO_Pac8u(+v
z3(>HNszO5~TF)*%DT4b@Uqh?%Wg%0(*&oE91u|_3Zy1|W!Nj<{S%2oMFfE2O$HA{9
zkWGO!hpDYCQAFoiZ<W_%s($7i9w$+AO`-5w>#yWf5AYmL#LrwGtnY8kI5~$R8<%>v
z)7_RkCL>S7cmw=3&XFCCy?UaC9ZU`XF_Hc87*0}ui%h|F74!;S%QeAxI4Z-f#E^|L
zD_YB2`XUe)loqgH@;@p6?JF3mH~JB83_?DzuH5PD_SKM_OZ5gD3ZW6d{)2!yT>#hl
zP8KcJ6m$!hMJ{~4Qls?z$9_nlhG{c4hSmht8~<(_!z;bB`%Sr9<7gl?Wg9;4?|eP_
zJ>grjDwU7QF4Ml@P-(+$0sCo4^f(!JX<ro*%mzxM)pVku)}~0JO22R(rPQ=k!!8fN
zBn|#eIyzrV#Qb`;^HMK<b%zVtjH{p5UCR(=xq|bAv};J2+06q!K-$96ER(WzppnU{
zIT+EN{D+V}xvQ0r>+4uhaA}?y67N8P7do*E1n3cm%uj{0!{oOytj!>3yg@b)X}1l0
ze614BREYDyRI2s+S{;#MM{afX#II67h6yCx3-B87R(vbMuUwEGL*2NB8Kb_0*Z%!r
z*$C-+G3Q0Y%NPkW$|Sf*g3lb<e7m>PExx+-n@Zonkk-qqa+$mbC0R^D_n>YDD$P1q
zAi*jRXRAd*=WnfPb=6=ezMVfU$s!YPe(P+g%y(W6`!V_9d{3_*7PBkbNYL|r?_rGZ
z1&wTGnpe<`k@y|eG%`$X;;CpEhf5fb4tcH^^X`-Nm+|Xd#GNtgn}92CL0I+GBMsxF
zWb*grFy*+57LPAw^QEOK>;^3E$5tMe@ku6xp6d&QJJvALQ?2U&!Er;YXbqJ|8ULHx
zMoppMB^j5j2X2GMwWjFisNH=cyCPbptT%RqNm2ENvq+qo^xQ-L!wvIfHW(RzaTL7f
z&t}QRA!`oH;tb~)mYJxFiB#^Qm2;7PkUD4hg0}xFLCB#JM9{q<2nA}L#t^(zC|W`N
zJF{NmgY4aBxk96UliL-oJ@s3J^(fNw&ISSMP#IxlQ>rU))rV!)^qz%IGeAgL&bh+L
zeUf(Db@ILazqs#Io{mFB3J#_M>Y!@py)g`E5a+dspP>ABzk?}6rQSg0h%_j4@BC$B
z!pbF>o>7eP$NMEU1uGISjVaP}nRi5mi`X~^BT?v0ER$QB9NeWh_3R2?Rs1zg*J3+8
zz3$ouLMqa0oY|6S>(ooHa{aTcf+pZbh}wMvu39K;LwVcmXI4{uKho^Sn|gemz5PXl
zL(W1C3B`1?0}E>0y84~l#Su5?M`TOhR0%D??HbN!MYt)aJWH(V9nnwghj_DZ)3xbg
zG|V)HRT}t?KTuEf{j#g?GX8B$E(REj+fz!jB^CMf0K2>ikgO|0IkEd7J@G|N1u$Fk
zL6b+Iv54rhtyFJL1Y*MMoc#r~N3|J9Fu3xb?xoW_r@=lFb)2ScoHVGYGs^XuH$&=F
zpVO9TZK4PB$MK%VIH;z{r*q6V=~7?I?b~(UPMl05F&i^LJyBK*&~K3jWITsFGP)8>
zsS`~3_zt9nF>S^5Q2{i<PHRe&i+cvH6*Zc4KQ88IoA#@s-^R_4!S4xpa*hXQBlPQ7
z=MP~b`I*H%BrrMx36Bp@eXM;+)tyf!Ez0f}AE^d_SgZTzGVN6^T6`~6gg?uAx!|<$
z{P#L~Ezi9F7r}&m$i6uZab!}5c=Ns})J+UT9Sj-cjKA1+Ols3l;{@!`v)$?uEXVy%
zf_(l=TGEAPRUKs+e(@Ts3u-XTdn`D6?Q+gTVsO}&^cG7Huv+Q~(H!}wcw-iS>;iZ)
zTug7AgylDK!755Ft+OsQ)_oHxOvoAHE>*wi+4Wm*Qv@u_5gNo(BY|<2Vh?^&?dej{
z-<mAo>5OMg!h*Uc45jqr+BJMsyD2a#2%ZtC)v-d(HBCQBBrshwtpz+%Ki$+8C1c96
zCQlYj9#CYJDm`ecV?JpL7F=&HW{tbaa*<~jZEV7xCAyc&u>+xYzHhNJ(L(uc>E8;`
zExD|E35e}SshdRf4KzC{*6RVjDK4|=w802vtu9Gl{g$oD%Paj5h!AVimf=XSQ*LtV
zaOmm_PT;8iA7O7D76rGp56jREQYtM<cQ;6g$k5$LgOqf`sI<~CbV-+V!_eK`NOyPs
z2ID+F=Um_W{RdoJGkdSS*1hgkd+P`cuj5A}yL62NROI1$h*X4(yzO}3`uwX*uQ=M}
za6q_e#{qksxAXN<nYnH<vrs)Hj!5YCuDH);kbj{Afj_&u3>BZ2spIGra2+VS*|~ix
zdd0RIrKZm8mH575G7mpKWO?$lOS*Cz*_G?m={OUMb4d$vcW3D-18B$|)kE^}E;bX`
z<e<UA?$mr?ZfM=K{95X~@MJ$8@xA96{+-ZB;+f{qRyy=Sa;;tEf%q)=`Np*$H%3^l
zqI$}8R(QVT#fhN48G)wdI%q6S1UjZ4cKRr-oe(@A>YPt2ATZzt*G<Pq`Y{stp|D2S
z1=EpdEDNa&-?>t=HBs83-{$JgV{S#{Fbi_B{I_U#USiQCU-(U0VHX^8?JVP{OYGl$
z^GxX|>U45o3l6gBw>MF=JM_qrU(UjV=U>U08Po1|>#XgW<s@A&K_MaJOO)s{@2D%N
zY%%zJD>GmB1B*M?qx2lubf--(i4h-D-_9iI2C7v{9lo6Zr*jfoP254owb%0UwCPf`
zZT##ft8LxF$y+FJd?s~&f4{j}H1)bQrs;>hd0E`=rk)4i{fG>TmJ~P71<X9iuI@xg
z{Z&-2yiqGTpdCS4SZSXw8t->thoT{As{$F%KNh~vz!WH7bA8~IkVDZFk-~K2PsoDz
zh~sS}PMI<><N?Tb&2Yld?D~lvKyx-^MWBZah_%n;W-WEo6r91NJTUMP5zncl=fmO5
zhX+vZ2o{cxBkSNzBWUgFL17R3PPmz4<y)$F&vK%N2d7qr>te=r>Y<E_eIRJ1BDx}W
zBG*5XOLBOJhkgb*4|wV7GagQGR>$g9U`3e*N%teKNqezyg_=>NM>~ug&CekU>`YAw
zDi07Sn{j?O#&;!6(7Gm#y6fxe))mj)?|N;u!LSLRd5U`8_uLv4w#o0cdEP)XSt}C#
zV<*Gw=hIB#1Sao3@@dOdWT~ggZ}o<c9J<6AQ;lp+yX8K14L@{R3ofRLynOkna*{J3
z1IPEiz(S~d0@u=>!PE(2D3Oa>+^#bH=ABq$6SU7+uW^bAvCdkB&BFJy-#gx(%qQ&3
zreCj;@|*LaEpqn~%3jfK*w9QnFLdk|6;KnH^Biw?td-6_v;Evhh80wmxvpK^_73aK
zsJdfS^L3ZUYg~hS%M+<9Xv%gppmD&fuAfQVIipTy*&_?ui9#l*zxL*G!lTGNsXK~5
zOQ6hV>PC3+$&nNV2@8(WGuI|JSHkx)Hcc+d-0J1Yx)CuHnE}QnyY3BN^A4I{C*T+%
zCH4thG>^+7eG4#L*A1cJ!2v#A;xe{Q!N=jN>Qwj~ljmMH{;5M}>O20@J0e*@PM?CH
z#P_oDG&SjiWsL*3=Uq0Zk}ed1PVzLjp2`v)S=sU;Hg%bbuVPl11t^@>6mWB2gc;T<
z*+&1T>l~^Mp~8V}MIu4l@T|HSyf?%ocSIay3u7b$AVg(c9hE*nexU{TpN626MjSnu
zacox~E2cA;Lb0!y#j;0fr=SgyP0=zzF+q3d>nmbGy`@D4Qkx%~NTdzWCr}=~ig^-=
z-%>Z`>thKHA}?#{ogr<dW0t5i=64D4Da-+FoWe+>gA{uqD9zA!En?fwb@_Y7ReS3l
z>4%uvM0Jdo%9m^(Qb9N%F*oHVA`tdw@_N+md<S%<4Tt#MGt01jy(Nc6w}Sko8MP9*
z1cqbFvb*%mc_l*SpyyX@;Zd&1knlIhl>sV*C)0ONUMvgkC><Cs`IF$Q?6r{$bVsqx
z%ZU+Gf6)slVWUA?P{v}9J>MRu|KvUDDOa<JW?uP*A5~qn^pnVh&e%kVo09E3(t*c<
z(cm`==f`FKEu92LD=w)W7tOO7S%maW5?qr3cnA3gBX31~5?`l=R=jGs9ioDN)z7a(
zNp`7S(}u2yH5!%AA|1ZbRFgI98EMT@Yfsp{JQY05J7ydZt+(XC^h_Z9JWVEC%x=>*
zCe>f?HC50HX~g`ta%83z*U&qXlY7&{pwY!e9wSFJ$Mbs9sSj_UFCNmQF)`JT8$C~2
zlX0R6dy6A!<Ngg9Z9+BZ`KphtEWQv*n6Iu#>D8Ugz?LbrkYDDzWs)I<C0n+@b$)67
zRDpahXYL23cI~2hh_j3aw2MYitK0&<{T(I}zfCj=^z9S-h)V$^saY|uoN?4_N=9%Q
zt_W*vy37!0@dXw$!YKikkJj|j$){wQFaiznK9V=D6%hluhlG$<`hBv*U|>{;p)3}S
z1xrsoP1rl5#*7&fIM2n0oZMSx<S32hMC@52t#D^|Z9!qks-L`Pk={$B`lUo*iF9}g
z%_4ec7ysTZCB}E*{zwDVo*Tnw!}gyr`zJ9}Rd*hA6!T*Ae8u%<)<zlCE1Q!SDW+1n
z%sg%J>NDhC&#pw0b;$L1Pw+!={5D2d;huOJLZqH^NUq`>6}1b?crb7b+OYp2I?!<;
zgbM9$`*=n^eFw7(JEV>3LFVf86;mvz98+vzSWs-nF7QFGZR)VIC!|@weDb8ZCmwnc
zW;2Oa!RVO7Kl5!mMeb1iiqYmuQAK|UUs8#J6wk}nQ!-Dt=z@k{0js;Fls(ZvW=@3t
z^_<2n2hg+xnyzBDL0Z`k+Q5}O`Qkj8(AZS8ryO&x&(*R(Nm}Uk9Vw2~$#GFDJ)W|j
zHdo$FSq+OJ!prU&4NlRaXok{Hn}d_Iqc}CQ+B~NDu|w&(a>KH}OkC{%La`XhauV5j
zY<U~jMfB(6xC0ma(1MQ#Z&MAsUuAfNOR`eTP3~iRr1*78x`-*4@M!k>e9wixt;654
zHnj`Lt8amZ7ouwLAX~ksJ;R1?^vsUM+49)rZFBg86hi$Z%|kx*`gwpEv#1vaF4x#2
z^!-&-^2gERHZa=;I$aTEMWaqxzZ^MN6drlZJompV-%u@^5uuJkO~pM@&CQ`4gKK82
z)?LaN*6h?x@wMm^YA<J#qCy}jY^-pvAb!4BY|$hafmO+&O7sCh*+jV{*<Pi&J5lnz
z><I(e{ptMJGt;>p4kqDjqx*`Jp<*#;uv}B|K7D@0@;W@pZPu=x7pZWzh0Rl$0I4xB
zuQClbpszzf?TxPzpuZHzQEpM=lkzr^Hk00(KuP+{g$DZE_whoo{IWR`snP{r;1R5G
z7#E_gAUQ_&2V*(xPUVQt_p?{K;jBkVxV6)HQVHg-dFB(Ni9S2ivZpLy7gb>lsNM;e
zdvQ<p(<e}eGxQTA=40?O@y7Ys2TF~3N;c1E1+)&g90}RpRj2RC$@}ssuFdVnd?#k2
zj3mx|Iq^@*61X_|*q>g&<L={#QXr)UhZj1B1KT$V%WL%4RoDT(_KE}I|Ed27-&6by
zzNDXZAp+IhNBm5)sALX6*-KAJ{a>S!rkx68u=-$DL5Mz^ZWo!6`g`7mVyaD3FsC7s
z9D(4~<Q0C-YdOC>_~_Adfqatr1KclHjN6`V@mBZ}m)<4ng}n9m;!{4rbuXQ5RO_(V
zRgAJwH)iy+e2@L4ix%K$Ec*-eB+b#v=6m0O!BF5hjnCnX-2>6M4S~F21HM2?D_^7k
zu&lrYxk9YxW^+vVdiHgSsf9u&U5&s~!r=LYbmj=wxgkUDD=ywNsxifrBr;dVlaQD3
zFZs<jZ!=~%hN#38$1qk(EpVbqTZT>iE15)f*w=T(Tq?|l`HT%dR%S|bCA<=QV(|S1
zS=y&w1rkf1>=g>5$8B=)92iNZqh>~=@rWQl9%&Ey2_|&sT<#YAkJHBflEJ1!%~VvB
zWQ`fQnKjb#wUBOEF)53A^O^L{<=;I=@oLn!@N@FEV9C%oC&e#m-M4Ae-jA|3cd$!(
z1WT8T-N#?&=So~I6Y313f6UIPP`2lK7orVtE8emkdf~5=a9?3Hyldp=3Z$1t3chVW
z$l%n*_UP97He3vQgF(?Tu8SnbKZTA%iaEdxS3GBz%If9mP`rE9l#uM_@0%M!N8|q(
znE7<6hJ7ococJT%H72d78&csBY*W;{(#{^zvuGzP5+-77E%Y&hfm<>1&u(WUXVK!G
zT@akPu8niDh|X%*mx!oE7`S#z&d-^<zD5M#?`y`8i~67Y-#VMTM&21Y#UG~A&^!WJ
z42fk8JM)Dyjn~tU#~J%9=QU?F2+y|l<EdR~JJ9S<#+&uy%p2Uuu?c9&p;;aI$X)Kv
zvF}iKSE%|H<pt{w{4=8lon^CzE5yF=q9=zuNBVBDv!N3=<Y{o|_%X|JRN}K{F+(+U
zEbMopbb|>F;fN#gu9^8sNBlUV*5KR^QG3(It8&bG0v(@lcmF+7rYJE%y|k4XK}e}e
zF=mzCc`&A0_)8@48x)Z`ovMVN1j8AbU;D1cBAmNn3Jkz0?r<^tF~vESS{G{XbejE+
z4*@Q2*0_G?m5q3#y^!ysd2&9rZg~j-EH;#uflL!FHQMggG2X%L#R@f@=?hss<nS9`
zhEik(l@iI97ixS^!Ilr6k3YYfD{h%Hj$x>^M&KVBtj1ybj&%SgE|b>pdR{JS-QvmM
z+Hqj%B8G1DepQ7@aQIqljtHghm5VG_1pc7Nk;oT#Xd{HEF+L}VLH{D5$-X2Q%6P5p
zlw$n{su^fSk?pI7jMox5v6H;TC=+vSh@@n4@4%EbDQiotpt>?F##higUm_ejn<64z
zRmO3KNP|cf1+C)eUV{!$Huvah_=YYiz&O#Yupd$r_J&)B&a1+6eteKZkV5T7dmbmb
zzRCV&{gqz^iu~Qp4VSbNLCR2>DrY9UCKYv>i(L>vRyAuVfKyUWHM00r7m%)8d{N?S
z!d^cu5V8Pc2^|`>=iS$wBT=vPG4;FNokhguHyN^28oso7{*ew^zd<?W+a<-nL%C(2
zmbPlC3idUzpHIEEe?w(R)qm0)B2~iJ1I^COsolDuH{nH3@=L;BcP@sJf`yr&CZLgM
zV0%CqMMmfOspi<`ArGO%lk|~L%yr1g%Y1mzW;uw=HP1cWl9QNX1e(4w%@q*O66==E
z3gqr$Ia0YD1KcpBbWg_|2+Y1lpSsSHAf%cmH=n!}6YcdGi#bbc+(>%BxZi``U`X-H
zM*}L)C6`M?SJl?xufsABb6TED;S{#srz41;xOhA@zM1^hr}2=MzXOux2w_^0ece@F
z7qxFq2ec5MKzrO>bhG+#1jXDeQNL_(Y{OETf5ZGkFcA1aiDeSyKge{M917K+zx&Pu
zt->qVMFTmgl_uFaD-1+Zq@gt==QZLj@^|b+rQnWe20y*$`O}8m-AR79ht3>vg1XLp
zaLA&)mp-IU0{a~gZ6h1^aGA*T9P9p`;cUTy(W2hs?>A4Ci<jqRi}C{5XT2A7vQTsx
ziZ2~8X67bwD-6u6)OI^*)W(3uwIg%^I%DsgP$<K-U23?Y&6vXR>bBhqNCn1zv<h$t
zyri0~Uu1VsZLJ4!$`(XYlujb2trv5;(MFITOQ9A{n$57En&RwG$M~zf#1psmhmjrf
z1XXqrhbVk1hHHv!f4sePrDx{jRKgg_fUmnOl{9)+wICn=(OtHs+H@$E6aLoSY-o&C
zanTnyELYTj3IPH1LSXo-BNl%b;p#f`%I*o_l{nYBM}f0`TXGD9EwQFo%kl^Lh$Sas
z@dC!R&0PuYK8u~*VLQhLWTxtDLoNinX^h!Tvd!p%?5QO5wG$tm!LJ#;y{Ts_yp)4(
z08g4`Ll~Ixz&&`iD5>#UV+ng$ZQy48e#JtWyK>=#^b;>B&pf1tpxQ4Mb;LP1^wIeF
zPYZEsMu}<TU%cLs?_QX6E^Sa=ydD)4J{QEDGXRI97UrHMp+|_7e2M<W{C*dCV0|EC
zsF`M3<0;Ac7eQu?0Gor(dljXeXXr!HB@iRE*(drDMC8W*B<`<BL6~-Z>Gxk7NWX5B
z(~_pacJuZ4nKo3?(EES}-S{M~gd?A***8ukU=HzT{ZzsZRFDZgzb*}uGq6hY#~2NU
zlq+|m*xZ}9!o7}>$fnN5#h&o_RK3aQL$0^0NAX-(V}lfgcTOsP2f5^T;O98s)btHE
zsrHXaK6ok|#M1*Iadev+1YSag(3y0-MQm<KVdfW$$Q=Q*IZy5oY3pr=6YlJYkfFt+
zWfZd-0qul<fdcHU8h9?~+ie6+s%i^j>{B=HOQcxIjMC$zhdd564E?v5NJWvCIx5BF
zwAqZb`~&(TJZ_OdyLzEQO2o$JD9v=$ij`d#*FJAW<KTwI@YW+5L$7ZGH7)UvKnL9^
zDakWt2qbwghP(`Ecz`$_^(%_{{>`ld@0Lg|{WAh^32eoj81KJzmkkynBlQ}R)rFJ4
zvVe_**eJf?QU}s+e8`{@flQ7y*A$GSmUH=Aug+qSk6#<hUp9({!jO+I!8Yd=@;P)%
zc*-vBmM`X3)<KT_f;@eW?Gxy*?<IttSJj-ykO`=+{U5=p*;9iCYp3G1f+%t)q#xfK
zA|%xBj1R*MsLp-4n+=pnDyqrZ&!L{tVhF_L7uN6kF|r?A(Lb)*{a&1Xfab551dSaK
zGpNov27%pQLJRUbV!>B0sY%c&|N1;L4)q^w@f0J=A^mD3jF}PZez8$;1*L2npS=03
zeJ~HN=l0#bQO}^1GAx?rcT3~q1%+J0635dU>@jXf0=FB_%_dyJ$Q~$;2BUx=qUcFx
z@vda|5_I2jLuZ(+=MsXm=7*25SZ8C)FGq?;L#xYI^6UZ>%+lfvKi(=0xY?mXSB$_4
zGQ*h>co#LoOREu>ahx6Jd4P?PU0~4FI2Y>pMeYFnPGYjyMT`n8`Er>I@+PZTAFVs0
z*OYBs*gy4kHFVrdqI~_7nydZs%Sm!`>Mcw#Z>ahGUD%ulYLjBJR1A4peK4bq&J%~k
z8eun>@$G(ioPAlaYMHLaVq0!yj^v^j$c)1<dxEphG5Y|p%$HZB7#l||O9Vbpt>61b
z%6U`8-;%M2wD?r|n`s)=CZ75Cro&C#fQgXTU^sn3dN`5iU9PDp(IgSNqICfcQVR`K
zXB&W!SoGpmZ7Gq5kl`~SB?_gE`o_J<LbjkP=y>MK2vuP4crL}7KQa+HfWpl5fCumJ
zO_k&xRzYG?9-dTZc-uJ)&)D;T{;Uz|r<Wl8qS0608GNtl^d&qBx!Yb=PB<wN&Io3T
z>9a`e2+7PA?upb%Gio0#M`4-AA-}|t2EJ$fGoA8~wDTqp&3~MDzu@J=r<x&|CNxx9
z(hxr+6n+2rvQbENusU*DTHvp??T(~JsNb(PIJjA$z+h5TbVwezZ)u)zkXl&~kYpza
zF?gdlbSv{1I*{<osi8Rn2xmEk#H9&OImgTmLO6?YM@qssmIB!<b6?jlpmA1Eh@?ze
zagb8e6C1|FULPT&iZ;mIot!M@M86UkN`D!|vId7#+TK}hx37lL^szpa(b5TC&viDI
z@F8`Rjij}hnT;!PvOajcB?3e4JE!5Y#fp@|P6Jd{ERf0}QhuvMrD&>zd?cvSY5k^<
z$Ov0CjpV~O_XHfDbn=xO{VpI|*`uV~6XYJ8_k&^Wj^A@qdL3w$-e_6~?$NP1<G-@U
zCkfZWiYMQw9p6=aCwZczl5r@KPPI*IpMEA)GFHR6QacTgxA?i4Z{t}+&S5nT*-5yz
zCi;zg_#2x$hgk7-_@w%e6eY0tRToG#RG~Tj<phhT#AyRKq@)-j&gQ4S>ne^56?U62
zO;M3!s>Uu&W9ab75<*?(M0KO^3}FTafDT__D@o|sQGBkWTwV~dXq5VX96g-cGeG|2
zmZ$2{iONXskD&?_sivUGAL9IYp1snMU8a6TwTpKA6-N7S6@I{Mv!Af(%qmw2+wYX7
zI9=P){e=Z5CIGfMr9|@pl}JIvqU<X2Jkb==r^jwqwFix)x@=fGsA;89%y+!ae&zKQ
zuYA!x8w2Y5-Np;$FNpnQLM!ihTMOSL@ostWfv%R$7Tiu<q$$6g-K)!6l%l~M;-`P7
z*89i5PzO9cK?ij;`<d!$0w^@r-65?qF;}KNA<?cCvwo(esSy86B+|6i|ESLDNt`8a
zlYH)8Fo%;>En?_9gu(<PCx)t>bS$JGgzf#_>6|-NL4W6%JG(nl6=5=ZjllT5jq3GI
z1s#ghtY)tD$L}^%Gq}|EV=2w7N;X}*teT)IzxdMTSt4}=T=cU(LXiOfeNC2^=Si$Y
z)`H4#655|7km!+?z6h&nj$u<AaRQB^KeG?@y<`LWQq%xfbCf_R0p9Lj<rcAKBGDVJ
zF4%Rp*m$uadG_K;<P-92M_z1rg+o}0R5Bx7{zAq46W`wI8vs6qL!KPML$8MsyB0GV
z+lsyL_@QwW$BSHyfmAs91zz<>#Jg{Gsk~TPUBz2)BOo#Kx9sk6u%W`=aZ|}$?eC!J
zG=0nz+b&mgXp@JwS7pqq#>h98Zd!#4<FCbbCAI)^bunSWkws?I$8$C;&BSZ{b-#Pe
zf5tIa<RI+FBn2#msQADSGysezx%0Tvb6<`_=#rwYiH9)CH8av0e900-uAj{@hKdMc
z0e9mTk|I}@v;W)$P}Dd7e#YIy(aojQqPXc4XNR!$&NZ--*8(u}*aF**7@5?B^o>+9
zZ946o{{?N_TTdd3|M1yID#Ln69-?~DM=<Z%V=f-alGZ}+ZYY>UOlyp}^bS}db3a+v
zvV{K0s~B_WPEdXkHhK{{1k&?s>g`EBT=ShLp~Q*B20EeEMTlJrv`SXc_z7t(Yj#SI
z7G^@QsYEj-hYF6r1dO<TZ-F2ml$|rn1E0uekx2PEjBgSy%2&tRSfm?eKUP2Tw(gQD
zh?LT?iqMQVt~(e^0ks3@9X1Ge!D1c5cEKA(mrQhCAf}|3hE%5aJYHk1Ig!Qzf~9X1
zYRI~&CGp5xtE#dOxfjt;nZnj>7!yUho)rs<Upz7XDB-CG_yiVStbU9A)kfUM4(;>&
zne4IWwwHuybth`!>q$aovZ}`hJKTs^fcWNi=w({upwdXJF8oF}yT=9hug!BT@e&(j
zE5O<{((eN5<1~&ApF~L``(NkPD~y?KHd!I?j3wn>Ugr`rYqkDW4(Em|3>(pk^Jjtt
z(yQOPV)MTfzY@Ow)JvZA{nGg?toqrV=~$4G^@m&WCcBvQvuy?d%OA}EErX%=YPj!s
zxNbaMf!)i$|A;wM*pm;Pn2q0umG{6(J0;~jEjfjG&rfjaogoPbI<vioj+MFm%2Gmu
zhzYY#=3!uey1Cuy8lK6{aDtJOP1-g`Xsd-FF37l2+7ergu(cBUzU_-E;d`5QujZC9
zm><fR^%?OaYl%FGrfKzA2Mw!Y%R3V%LUct)`!DLYQZWmSp||r~ykM42SX0otF-X>z
z!Iuo&XzaG)7<3pWI929t5piBsuzZ8{u#HdNkT=IPHR_M0hlO-b^vH+QoTxlT^vqt-
zYZD<AAvCcZk)wG6XgF!q(pJ&zN`NYdArnmElNQSpq}y@zY4g75Lwx!XmLCo!B8sTC
zMTt#r1gd@i6Z=Dp4^*I$Rk+v}hm9tO$oS#6p6~a25^K<xwF(w8jOVQVx?ul6n~f6k
z_eXgl9~MeG5z`d2F0giQB!WC3alJgyqHj7hJ=+Yu0D>?PQ_S)end=C20F-VSc1U#{
zdVPdT*u&800CaBIN>1?Su7HbG<OgLQ(wu=8etZJwXi*=pI>jVzqP0c{0h)8STV6*2
z5A)QN1?vLg&{kqs41P8R$<up}pbf**`Z!Votc)&2v%uY7Soa9WAB($Z1&%~?m;+9k
zx;GkH09$f8IuhY)#rELZ#^(aQTHI`}5|e%~STqp;dE$}+$LAR<w68dy4jx^G2GaqW
z9&h?Nld8taNMWCevNnZRAxM36bbhhNE0)?pbMHZ<{x<8q!1{m2U>-6j7}ASz@FQjC
z8*>xUI<dJo4pE1W-(<dvne^j7M?P%xG3mmUKIw|vWJEul4WtwEebL?XF@@v<|FzU5
zt)lhPm@N<VbRQ4|v>YTneCSXmacAlmhAEa20+rY;J*oe^4pe@;&|9VEP(6fOd|q)g
ze_w(4$HDT+5pjK}K7o<KzVkZUgOkZ!nkBs}GFk9`DsO{|n-;*ww#Oom^0oaX!W(^j
z9D~#Vmp+6Q<LmW4p~!Q)h1nNe$q>U3SJ4lB`YL3FEG)RL%IsTcj2{eWl(cN#Av^)s
zFJ!r95+^3Lc%!w@H?Ci@kF6^uj;y=$eL%R{%xz=A@hkoWz)gYp<CvoFGRmaanXlqB
zaLj8v7%QwMT=S0*Z3nvdTkTCsgDkIfX3e}QRfU%Cil|5vvQRC<aE+BO)q{k137!-Q
zg*J$J3TC($(+<Thv+ipiJ)IPVL6J+b1_7c7mL5wdD-z@!Jp!`%<%m4h2z;I#ZixU-
zEe!n+J#J)RsbANQC8TI=0p{~8otL2`v*qwYHCe$A5dSw;shzdpSuQscl=p*ME9H4B
z?^cHBWS%&2FCjX|9?UTiqR4qe*Z&CmVIXaM=NJJ>+^36}dHW?U!)r_t3rHCS=Gk&I
zOuNG@jF`i|D0rwjXLfjVGY0mvg&%h?hud$9(G6H<Ugk2NT-}a5C)7v+@A!TR-(8M!
zC+`QGa=f=(-qse7CDFte(Mo-u@73k-CT~Y9!rJZX^2@KUWsZYGc|FpHu~tt~pbu~e
z>43xU4MEo0)?e$_i>j}<Ln)#pdV+JYWugzzEn!*xv<oyIXpVjmc_+a|mPdCvt@?C%
zO<R`e-mMDHC2IDLP(KZYSH{j&P<nWDu_nN5PCvfBcQo6RXT5_h9Fw73;vzkRd0X#h
z=EPdENf{LPeNmW)(NnGUK?cmBfGi&h(XeNoY)4wZrP4$I9=>fFRnsmsAV0)kSQ)8U
z=dGCFGPz4SE?d}`F{-2&|CD+V;ym@VXp3-Ss`qh#vd4Y9mp=DwE9O?pO4Co-LJtZm
z=1EJ?4#ALTzF<(da~XZh{5BU8j+2S!!tk($i@i^Xk0MX%x0EdgF~`iqWd#NQ3~veJ
z0?fl8K>hM-ujKcRtcpDI*4>u-I#!Cm&g``xMDC(89-Bd%cA<+YitlE=8C=5N-3*2l
z2vXYIcY`5z<dg?*MS+Xn(4SLjDG&NLSwQDo;G-r-%AX5kHiu`G!ZfeGedglqauUd9
zOp+d?mEc0t=5?}fRLU1$w1oM?m5z;VT6HXQe~l#e=<*mv1qLyW33zwqdU?50^+K-j
z@%evL=s{y+ydy$7BYQWpvo1t7Yx1__4UNxsIV>E`6+Vk*Jldvt3I7geGPmNoY}=u@
zL|tBcOA&!vgmkI)en@^jc!}gMef&Xr&;XzHxvIH|B5xv{VMXV5PLafp0i#MO5w9Cp
zj)u*yjp-TlPb=^Sq)S5Y>iKkIML?d3o4XAb*d^0N&<%djlka$wORJ<;1wfa&cLkO~
zPT0_J#((pNv-S=facoZb^^<08K5Wyr<12N<I^+A2ALq(luz9`Ws7y%+ms5_^ls+pT
zc2XY<<kulW&(jv(P*NM0mIk6R?*yabv0KxNs72Zy8_RUDn)J@*yfp)H#^UJxcs@8#
zgh7gu@r@BmZ@5H>-)}qnWwZ%Tg1`Hys0be{hh>r7yWZsPjtV#p&JU)p&>>W{3F*aU
z+&?xYpvu<!E!@`;!t<szQ}r%In3rJ48E~&p^%>Ysh)5ssHI@i$cSQf7t_Y%wR;Nr0
z`k-L-#EtiBd6K6%zy^d8MGDMzj<2l~lZHi!3UqCjW@9ANhGtFk@+oJfi0-(Px*n5W
z3g>&PoPTeU`a2z9<mR1~iPUys?0KDa42~tH2UC3mT@g#D0ZG34vWDf+Q&~B`^v)OO
zrGM`YjAB2q==yGsrb3Bop=H5X2saLW@Yyu+H>3Y-1sYKzY|PzlhJzt93-~x{OZtGs
zSfUhysZS&PIQoF(1RO=mFR{;X)b<VbngU#?AJFO5<IX#u-S)9O3oR)@n*IdO1g0~|
z`q_nKh|27;`=hvC^K)UlqYdOL_oH={&c`9FhwP1Akvpw_X%q~~4lt!#xvGj?>MZC*
zeTO5b{q#j!goGBeWv<iyqNO$G3{XQyXp%tNug_yHQW9oZci9MmK4NA-;1?MD)Bp-m
zJ>S^Oz*UtJDcSReyd8zHd7&LmS0~%}UkA+G46o+RA#8o{Ws}7yClmO{SO&NVPS*zr
z*|C76$=4p$Z^+1$4tGSBN3XX=Hh-ru08G?^AvCd%FyvXBY@a*IL#_MCb3h}P)77rZ
z9pBco!A)Bx+9K8g5m=MOC%U5&UVGTH$FD(UR>gfJ&gu+>w*l^s;kc`77e-Wxh+mpE
zU*~<3pOyWOx;|*`PB5gbrzZM2kk`0;TO*J;$IE&@L<)1yVl9v2guFqx3-Lp$QdaML
z!?UF1zw}fW8TaxdRwJ3sULcNAfOJD(yy|>W+@}*lbue=U+Amv$P-8)xFdo%+k#ezs
z#zNZQ`bSwnRID)Wj?jIl<gkEBja%W&wR@{5ZGo4s4g8VWKZ831C|lviKZ7AfTENBJ
zOBPT|Rx8}e7qGxBq9NeobO{U*IB$o;kYic>{PSBxmGnd>S9pnRmb;a$pwH!Me_t|;
z>=gLoWT2e%ERBopGt)NSJzuR@VNqYB#{E%HH9_1jsKS}e+HO#b`S%GzK|zxMY>c7k
zLwk8~HC>+CM|HkmkR&q~Z~-$jcc+`{Kvm?WKZjj0xL1(3=tA?4<@?4#F1HK`?*#-G
zRtRZIx&ywV7foVJU59EL%xKAQIsMV_Z^BLfb$K4niW1r2BWp!E{WDx)k0MUW15*zm
z*~qTj^fxKSB7bE2vJSX7wlMY@6EAg2?;>{&%7Za@7ElTB?8`FSgLFle$##cJ8sHC(
z+vAlZBJMHz!@Vx{DcWm4#D2^Ru^Ok2`0!XIyf_F@@GkwU)Tq@N7w6r(a-wgmAL963
z+dQq-)<Wpv1AEC)srq)ux}^UB*9Q;^feUVbp}gVU>;exnT)|+jYJEdS$A@ze6wfsy
zgGpqXYmxM2M)R<2dSm8O??!{xo!}bC#MruoxFivmD!WBAY;(^+GQH|=effrD7&Gh{
z!#7RPB|R4)F}P*BzxUcpcP(g)3fzK@dS>TQ^$RLqM?W%wOiS&Xps*Mr3*#snSL6mE
z&(=JrWVTZQ>RC{RGo_hn_THfu$z%s3(|?C56loBRGEyutecfApV@#;4R)V)0!JIaR
z=ilWr&k>UTB5N%;5GiFHK#P{p%^gNlY!K0KwxtpKilu-Lzx-U@S=qG14_UUeSHDV4
z+Z;!>1>@hsz=R7_0tNV}N-!jSLzO-XI2%{^JR=$;i@2y^_uhEz)Y`(vGbpN=u2$0F
zVr4tW)<7$Xh)Y}=?UK=j$n7uR@H!JBel{v+9=^XDwz@4H4=AD2ei32_UGY!0c7bYx
zkpIdPb6|u8WHAE*Ltq6nab<kKke&xmO`C};mtoNT*q7E=Ruf8x{IbD6@&Cygpy;xI
zyx-w8J{Ql`Po^PpB$L=E5Z5N0w;7L@*-=q1uU4~&SvbP$38tsO<s$Nrqpswb<{yzv
z6)*p5eNZG7APCJslF#<`dq0uNu#!|XUx->j;>;u?_QC4hYQMTesBj4g->7w~I%DYV
z+irQZipJBIpEy9rPZ_xx`K8f)uR0Uh<xGY7n}I%X5lA);_%Q~1G1fhJ)R4-qdZ}2Q
z5Lnw~RMj=_XZ-Y>#|gXpdSgJ~;~z=2$U?pK^@Tu>1aMJ_SaBM-w;Z(8^FV7R{Q&m`
zcY(XlCRrNArRL0JBB%99O`g>0|8SGvv#Y62Ck2(6pYa*?W`1{WmsRbK@Q$jYJx{R}
zV(>VkO=)h#F{zpL-21n`M$k#AseQD4^^C`<Q{X;bxvA`Ziek@%3VrChk*(X*{Fw6;
z`?L(A)`-A}vi5G=AmvPKXT@B+2ggVZ$$k*5-VP6SgyR<mj$*8!SO1_Ya}E4_<)Tt+
z>~|8)dKk^?l#7MckV|}2_vVC*9(LybsJzI112U%vG>5uo-(v3@I`oHF0ZJvl@57VG
z$>wpQ&vM-(kJUu<T(-`Zy2qZFOn?`<<Qz}Q4+3ws)zUy`2XP*kzOyhnaw#Z{u)wlp
z3_GJut-s{{z*(IE5vOnw*4}Tme_UM&oBQovrFAQs8<=8laIyMmkf;^zIrJfbD{s~_
zUun?Pl@t-^?5@xwULSJG23JZkaTplQ5*&DHld$@z3K|(9{$AlP$a_vkrwkTBB&y0s
zaTz`f=A7GGB0#co!2E1HR~s|)%O`@u$f5lnVTXc8j1MY~rU_ajql3^bPLEZlN=|=s
zc56wk5xyGWpqkW=VGq2HfYaEuGyep`xtuxXm5|7eGRf4dbrV)C{El0Vcq_c4_%AB)
zpbn^jLjxLkpr$LBE}WBQYp!N+;<3-vp={n)L3~q>jI~DT^s<SG*>@FEGij^)y3YdS
z<zh*f5Df?h_!VQit4M*h3d-GQ#i1{a_@Dgc#Bnzi$&3u=P~f$y?vym2TPn=zT!(&z
zSF<b>ZF=!_$@8~sJHVxWk`&XIm-)c|Kw-}hG%KICzC_g3p7ZYybyL0UIl>YmyXd__
z1y%zTa<`dmt0}5YOaM%=?+bO<5#OY@l*OzB)7xJ&#a-Cx@iTJ9$n>B1C3-MNxFP(v
zj=t546F~DpQB%KW*0|!{+?sYUJ%&iE%Shy$RDO0$CU{hobfiq%Z{rZZpuc+V6*C@s
zd)vm+^9|AsqgvQV*4pSqoYsEr&LHvB<qs$RrE5U|Hs-!IKtADC{d_7YbIUjpLjckG
zP+HIu#K$G1uYqP$k@Nb4ELZX$O)d9!&+U6q(}&}1fN0!*jM=YR97o5TG&YM^N;}Q+
zd5j8F(U$t~;_LEV1AIkXZ8I^<WUz#^HmQ&5R61og-IidZ8p^DNJaxrXy~SVd6DmUw
za^^ni4?aiDIYuRe$=&rIIksHp7_avSPcz$KCPwF1G0GvL<%BnCW*G-4d~|isaJucg
zA0^MY%TvcJ7s>QI{zR+jY;^p0SOxXwIMnh!<aZ>l{VO=L{db9%?Rop6r7;uCtoONJ
zO#V3Kk5JnAy1BN~?O|`sU*LGj7pxB<1FG}+6_PVkgRxV49;i6-M#uCA?-Dk9fY3PW
zoN6$@7_aeNSyxSavUVbW4G~97qAQ8qI~fO)7nlaWLH(D3Q#1PmmbI$iUal~u8KUNU
zsyIZvw}o(y5gjSt6L5QM9yV}45=wBvqs>VCk(Hwl|1T>>P+D|%2>-f(*k5QfbiFdb
z;5y|1moneoTR6=6+qqD@f<h+65A79!hy6Tx4dBYV*1j?Bxw;}y_LFQlpfgFbULGb8
z&xJr<h8V~$0E*%WNA*$54D3$n?q&DDk+j)N;mvsgdm}%B9I9_;AJPlIR}MYr9XF6!
zj01!Cg-X$ip3wZ73&iS9><OVa)8cK7)-+4SXwR(1Z@%|SF?R4khgSLBiV!{m(t;rS
zBdOUZDpSTWvje~RHpJ#Jloi^zRyr3vsG)B$q{kF3QD@v!w?x6ZvWv^Mea*ll3^p@X
z;5vo;67RL6Y@W+F!r@H5Rn3*Ga@To{E?D3=eZ*VDEk#HSx}kr|`ujdy=G5G^o4jY|
zu^GA+|1nUTb{8{KJBOCRCSNs|%=<5#hK3G!Rh>zJ3K2KqdGBkDA+DTtX-i+!{?Jz&
z{D*~lo;&aUTxb}?VT*I(#l%cW_Sma+9p`F>F`6kMs)YAFmR3UNJ8=-Ht+=S5=tpKF
zIec+;Av9=+>_pCF%=2D@Ue7Dz|2Vi`Py-@=VG|h-<r4`{)$Z)O2sn|i$?vYb+M1ne
zzp~y@tO0e+L4agyGKPPl?zM=XId|te;^G@FnnH)LCtOq4kl62!T_%5RszvO<cW!)j
zWoU&H<e{Quc6kd-$>*mG?QYeyp6~--Z>(LEJbeE=nN5+)Y`N=VQ(Ts1yoabnQC4z-
zV+Gv^WKwQTu~3zY>l$zJBj4@<`u92hwe7|Te)o?xX?v(atLs5D3gUwQCN>YY%Oq#I
zvP_FeRPPc_8_7wCH&*gF1xAqX_&H7^<mj+3YnmXzeF(6c$?$LalfSeXYF3oj4`<r!
z=LxyM-&|O#ye^X_>G-27at}v9*2u@?2Znr^VgYSy0~cela3+fd){=CUsPuo@zz!d1
zH?=Jb_Miv7a%PwEk~g^BP+1RDDqT*eqPLSNY%=+7DLj}-Gv;b@tP4>UjH8lJfs!qk
zwc*s|{4ZmpW_E|m9eY<H2Rp(=S`-9((6XtTsxV#J%dVOa6TA9WwBqTn<NIYHPXYm1
z*_D22kATckoov;rlNH(8ug(@SNz3|ZE0g~mc+0C4J#MX)H#O}q87U8JJbIJ(+}6!l
zK&bPCIN)skZnT%NJ5MX~1~8OZoE+FYe#UstAYC7?&uX$Y)wb_9l&aU6vLmf+H5b`X
z8JSl#u9Ro>cM2hNTa}K*h||(7krVv`0=yY0TS5InUd6YgWlW5{yX*goZsqhgP_^`A
zi(lHty1a40*Fe8L4^n;ekDVVFgCZr{1|HeG<A#6vOzq1|@m#$NaQ1537g4knM$l$z
zGg!)e{mSN-t)W?hKBjuOQ0#B*sS}s3H^}fZP}&UCv$&ldKYj3qKWGoWdG%MiEn2gY
zpfJen>8mU_vASK=0&~9biG4l4tl4Loji(>XO_JZn8`QH>KU!a9?e9!iC#l!3yMqje
zFu<eSf<`dL>v^js|5@XM&q0NusSC8zV)>1HKM#XBAHIa{-HlayqLm-uA^)Z_+&Qnb
zTxUsM0IEGOT|4%93ie&Hpwj&9xv>ZqAAkOIgCGzVkVH^_!p7oO@iXNJvBBH0NyNao
zXiL!4ia;DLHOx2*b>l)DAjFKlMj^FV@Q#}46R4#=2)}CU_K|9xvfm08L@7cUgL4`T
zf_R>Nx~Q6Oi?Dt2i_iZ9pnqYhoE{T$<V6}c0seex+B9DL3PE-8nIL=Zis#wy|ME?H
z$bejG^IUa;1(>Fv>YrKsk?Hr@g&<-C=S<g#?Sn3-!`M@T&$jTm7w#?Mh!Kh5uFq~4
zJ_LnXBUE>cwR}Ji{|wF+EMG)rGosPdHVt~*Zo~C|uIoE||2Y*!X{)1=PBL4)paECT
z1y4(h8tF8`_0omwZ_KKshVr>eq;nNaK0D*yq;#+A3Gj!pZ17LKY>{1jS^qQ<;i4s>
z_9Qse{gy3SB~Kro$bd2rkjx(};DO3+P~f64%NhH=NQb>~pXGzK51-l5<~q#kk8d`v
zN6ddj@b}q5&-}Mw&H^AXPqL2KZKhW!q{Bq?XE*lk5|HV_{D^-kK`R_IwbE(bX(A0=
zL?gzj@=^vaCOl<<Nk(35!?Fuy3TBdke%2xb(v8SmZX93`YN|~uNqYH=6Evt=YYXY!
z_751YmD0jKIRQARyXxPvmVx2*h_~EL*XJ6*RVL^eh(NwVAv9^w|NB7y=a_v<AmV$Y
za%mAo-#yzk%3SmeoW~shVFY?_kiI{9?>+Au?+#3PfzxFvE(u&@BrRQUyPxQQGd2Wp
zn?f<08mDVr%C$Z;AnEwm>4WgAMuc)T%YiVnRaRMwkK)bNQ)at+fi<mJy~4U}NP@}c
zn7d0AL+(yzmC<iN@m}S3s{|4IPHDB~n=u_QKCWpzo&V!1{}Jv_k?T<-5n*Et!J|0W
z@{JN~%+kFbc&iDwNa>bre)bP5c=8r{$;z`7r*x=~(O@0&s?B4Xi4M3Mn7DS}!-{zD
zLwbww6_>_|j^3KLT<lXT$D~4g%*xbrCR<Ri9z*qI?y>XGee;bl4Bu$PSV&Tg`-Z3I
z+ER!o`jfoOJUieK_y4U2zG)C~J10S$@$Os+_aHz;4zZx8U~$$EGR@8rM9TDgqDq5l
z;l#`Z&%SQUC(|Lw{YInT?(n_9wcmGX7~MGAuV!Ts)xriwc7zm;vf2g}IGA4^_I0^%
zQ+C5OOx-1pJ#djK$N*z8U|}hTG8b%ps7LbpVP8~$Mi#neM|6uEGZeE3zV9|Vue@Oe
zOBZjjph~fmB>bsn>P{Wz>s@eE<qhNI<dIzcQsn+4#LrTvQJnwnG5@32zqMUdeNoBD
zkUAWdU(Ye4djv{}kP=CXB;K3$;PT&&34B`ys*$$Vy62o!RAb<wrs|-?Tl&H4u*Zwi
zXS)v78}k|72f4EG8Z=IQsk#*@_!4ops;=O?`jGAQ_fg9Zm#|pxp?Hmln_rf<nz|y-
zknHMgw!KQed%DTj?C9d^<H6MZpCs&01lsUIjDT8DDdpOu#K^3NpqcqIT*(4bwcBN*
zQeF&dsfX@tThPEk1`2p1516#=hIYP&Vu2ZQ!HaIe8f0=~=#qJF&b}T@XNwpsYDClg
z>CpO0Qz~O&4K$*@#{1#lobBI=h=vKRPSLa5G|fgROFoJoy+bdC<T%l5dse<zf$XXZ
zZ9o|sH0P12sG*)aXlVSc8(*#swt&9`UWwyqwx^|QezHHjgQV3`Rmj_!m@1CKA~oiU
z-T}-o>SiNC^7g0TDMY-Jr!bCYoc%(O-6z`Nk}y~nuEf7#<6KPNfRBQ@cEAu{tmA^@
z7zqWafp0Jz;BsD_JyLg+%e*+PcLT8mb=TeCA+w$G$#2<SS7%v+IPb&VnjQ`K?GIEC
zt}q2=DmtCWJ4a=`)voS!cJRM!U$K^jJ+k1Ty%KzR{_v3xamK%Q{RjUOAyc^;c<)on
z8YaJ8Rb8zO<_@Dcm)iz~Rp;sdh!Wp0Sk#%1@|a8Rtsdft1!+F}z@*D5`zxdCrVgcl
zZcnSuH}C8CDGyHb47!0)g2wyuwGD|3kgGdP13)EX5OO{;i_<MX(Sv7A5d!g6wpUcI
z$d5&288`e0vp%Y6Fbn686nGx0(&y<{M%dyt?wPKV$*ZDC_{_R9Lvk~+E225?({9Z8
zy)I21t%BdAB&`$Czv?{yvpwHZ;n+ys3|t$>nmnxc)(cV@g*G&pL{kTxx_XZU?BW=s
z32ryI3#1sBO@35U+a39Abl3HwX%C-fPHI(iv_7r!Otq4izLVCT{V6t(JhdD%%Dzj{
z{Lxpz%w8-U{k4YCh*K#0SSaVsbU)bBKik>gS0~g5VCDxS+Z(5?XWO<KKFIe8H>hU1
zyl|}RG7<=9VLRaVG1fe&Y{%*7;7IR;0ZUA_N8>nG#Y+>VNaKX<bo&~7Nb7cRceXqt
z^wLLlbxq7Ju5Zss>c36*@56i&3m}+`unYiLsb9*@<geNd`t}!Qz`D&@RWO@-apAkL
zdM+)|Mp=VC?Raty^i<@D7|$q8@l(y{RAdM+@5NL&Tcy7GQhmgH7~o!mJ#*nKYN1^x
z+JP5v$9>J%%A|50?3PEOS^%SFMlw`@0^SQg7BBe~kb{O^LY*^=XQQ<`3C_i38u&ID
z+IIjgEp!{+FJBYNgOT;(f~OWt&mK}~8@ee&>^@}(;}eb9ypdl*Lb$}tW9j*`R~q@~
zFh+caSsfV<a;sl4R6tam77}V56`VQ78?H-Ag#bJ2f_jX|d_q)i{|es!TP76ZdLMJd
zl3*Ay*Qae?(!I?O<p6!em63cmMu?h?p{BOEWGf3Fblkixu4tRUAL8Zc6G3S&(060D
zB1;!Cxu@^0mYgb8o^LT)HQo^=GM>HeP}?nVQFB>2MLFfgPU!={B=Y`1EBonJ+><Uw
zb_di)OMz6bw?~p@Uw05eaHtY!nQ$P650_T%vKU#e04a%D=N^vAQ8a~Q8>sn4N6N#T
zdy?SIy_$Ar25wLo+4sKuBPX*(y!-38`><Hlh_qH>&h6dGF5MTW>$?+5!_<3O|6f-7
zV9Q@DK*u|h)EJP<AJ76s!FcCeq7VJKpWG+VRN>#E6y@9RtactBh1DQ5a64rdhEC9)
z`Mg@|?hYWXc*&+X9%6hiZXGdS7U5DV($Bp2OVP1ue;%npo)S=RxrYTrya}ydMo}6M
z1nD$#8_Rm_A4<+JY=WT^<diEW=T~3|5X#1H+<!VNS$L9?_ZE3lf>M2fIn<+Fx9Rm+
z$^z-*bn3Qp6NPt<^mQoUC_x6*49FKnWg$e2ru&@h`;~Q(Zrq&b$$Y_of|tLQ`!CRX
z;tu-M=lrINKv4ahM7c0R_g;4rYntKvY>`4A<sV<CX9(?tdCydz?BkD|(;hrElEfxJ
zd2s2%0$R53O}d+3j;ILwO&Ru)4I5(*4|YyD`@N%ZsU{1p6JDH#jNc}j(^YUURv+x<
zHOkYyAKi|2;WISOn-BY4ZJsGAyg2VM%r)c_B*JnHS^9UG@_ztU^wnbhmRKyjl<`WO
zo3ruj!f)?u$X9P2=@xn_n5r?%eg${TB@b1)&i#EqAaGFv)c`P})cdLdse$Mrw9&?L
zp6+?J_Nz|aBao6-Gn$2FQzf8Au;LgpF8~Gp&N=Io`*Oo3+!O{2Rwg2FFAIwnkL5g2
zomYi4L;n@yzXJOI5icl}>3~NImzIdxSUy&EYIfVwdPD9*K<jk7k*j9t0BV9o{OpI{
z85#;v@8np_!E5JQXj_D*SuYIlCP~8De62pfT9M^^QH(zl3VG!K%xaE`zMkEIz4#Q@
zl3w?WB?T86OtYujfcR0S7FqW1#36At`;>V~vBymD9(2t8$p-T#5&xZS{p;+1>1HaG
zTux7-yU549QJ+FB#(g&qFGz(T<MGr%$rci%DCVxkV#1qcJtaTn!>ssEcxcgtx+mC^
zI}RT(>vESJ3*yE%H=8(>U(oEl4RBB`Dz{{`b5Dj>;V{l+V4YjXoZF?gdN#Mv*M~7_
zVKsHIo`|ax<zw*7Tm}}JN;LKq6bNi`-u@pM{sl%LY%-g-&VDrE*I<yja=tfS>9(j;
zXMJ%z*KCOKlvf&GGj3Pa@)uipk1ss};lGq#kNX!z&oFQRE(+nd?%aRdb7BE;$hPi0
z6%y@$>yIPB;Z|yc<7K6J4`=O<d^6q#H|`5|7Px!=3b?3;cYHeME&^SHUJQJNv%2jd
zU^`d$U{?bpFGeC%!s%Odmsri)aZXi5j|3R0tGXK=a|7sYtsxqVFCF^2#q(;P26wMg
zKctSRrLalz8+b<wV?1NlkL~>aPtpW+Z!L0*C?16jb`XQ(PxM?!7Pjac&`ivx`q8G;
zBhR`OCaZXWCsyIfGQXUal>OV^(AlCz7PS}r&Xcrd?MM}hAXEDR$hd}ps>2E}gooGo
zzH%fO42fCmfb%S)rnE7nTtU2EW&za!&%6YK9~2=J?`L4banLet)KHXTo25dUj|w=L
zgO9l`9u6KiJukY7S-s<`Vp8=mMi{gr%+swUSEU#rV4@Yk1RFLeo=3zAvryKyXzg+T
zOnZMXf+IO$anrdKBQ7<GANWI(o?HQLTm(-RD0Ikw5hpz-_={r?r#9{+fT+YG^05W<
zQic1(TOsox#9_wUOr{!>q7!Y@kVA2NaFOvgO5$7(v@__5<in?g&-dN!zbv{ZCt?W&
zG-^LqG%o9gJ18jXm_2RI+bPE-qx{Z{15v`maej^iNhddUWdE5tsAZHhnV3|G;cqZ*
zQqqK8D+7J=bh_MEEu{i`A@;D3r^`o`Rcf+ily;Hqysi4)&qtNw)(gK*=uZIqXDfeO
zlJM)rH|Lzndfrp0vX~iyPgnOYLpD`VXW`9^_<!MqZ{^Q?z-?`%AQbkK3JuOI_V-HN
zTO&C><{6w3-Fk;imG+rDn7|cIrzdQ>XBTgo&&m*oCoPJD?fsj57oMAtyXslPQjH+0
z#t8Sy<qwBdQtJS`yF~7h|G$9H8$cnuNMT&VR1S&Uf)#C$YG~ZfoHdXCcCB3c(%$z^
z^9v}I0qUk+d3wEN9Xd|CM}g*3oX#KUmjjfO1s93|A5v@}Z34A)oxxp{NnY#PXys13
zwApCmaWWxjf6p?Sdpg5(Gv#p_;Mu6I$-R-|(VS|wVrHWzO6eIUsa?$WpKtohu>bv|
z-U`Y}JU=XKuO_(^8g0r$VAfN=@Ux%+33+zERN2MN(=PHmdB<G$GgYuZA(_>-<1%F3
zAA-z}|BExy=>t^z0+20`SU?|vkFvJFw~0Qt55jeY+nRkgv*udiCX|=t_5=kl)GyL!
zZ;tP%Qg^Jeyz!(&rFKtk^v?G_*kr%%jckI$0Xn3~Jn7aerd#Kw*VpjFe`U!3H-YfF
z2tw}Z-WfB~Xx}zfXhDS*@*M1lK5M$C*yl+FJM{}dMHTeeW%NsA7yOy}zam$?*9XKy
zr96TGjbT=X!4STz4mga?7lYxFdUL*Dh&}<1GiB>eb;f`RfLp9>XKLWLnmBK4m21PR
z9bYjL=CUYSrf%-jQxr9h31)G2dRR9lvfa&3dU)v$)DIl~Fc@K6K>H?_CS_M!{qO4A
z-&*{@ia_%8Hm|s!zO=o^r1zMvpjrGewQZr<34htQgXdX$qADA~)b6%udG{!y$gX2|
zSD>$EEMrxC{DQ#Uug+5$+DJ$$<VRv{@apL=wV$VlsuY*o&S`;DHnigd+B#x9;E$Y-
z$pTua0t;Ar5kz*w*_>AD1MVl<clN1&ky`A@Ovj5d*zs|}{pH7>W0B?x*mI0nm!raU
zPu6A7#d%A0&$ZK{HVHp?)h&5h7DdFOj0g&3mGYdQ1>Sz!qLBa?j6L15czMg~g!X?2
zG}O%9@I<W1mG1MpxZT7D8C&n&)5~VIoHj0aq&cVU-G~Cj&10vGz4v|TcDSb)<y^@3
z7Dk}CMVaw~P`qePDAP#hB=o3Fu>a~Y-*&diqn%7(nQ$Mt9lCq6Qx=BM*1+6)#wNuy
ziua6Lk#sxgcX@J<K~6ZpgMwMQEfQrNj|F5s)*iX5%mR`i2}ZUU5F6l-f&LPaYXFGp
z{fb<t+p+Vs<L8MxDyJ}Za?!i55tuGa#4Rc2VxnE*XOFHj{y+A<GaSyfYda(&NFpML
z2og4;hY6ym2s=#JqSp}&(TQFs(hx18PLz>?sDsgaqD7B7+8{*ly^P_zM)vc(`+eUh
zvA^T^zJL3N;}Ds0-|JfCT<1F1Owv`$cZ^mo;Viu~3wn>R%E@{CFj(%A8rAbVNqG@z
z|BYn*Us&o~;*EDzJ|01h;X}L*z5>x3<m(8^ft~Z^m&+o!U+rY-(G*DPeEymc9MAvY
z1pBxSiQ6YYQv{z=XXO6ydR2y5k7Gc~)dr15sOBqGp7U^@KK@;43im*Go=seBYH0OE
z@|~tKg9}>^=7m|&n-o&BFOS+6OEcsoW>ihq^Zt-N<O3$Ezuu1E1LTWi&xJe%l4*rJ
z&FcE8h2WM7bQtrC2Xo6_wSykFWC1fZi0N*Qw|oiR^g#{C-rp=UMnskrC@ui%ITVRj
zT&Iatz#UhArnXj^5%l1H$LN1c;Jyaut{bq@jZO+uKO6J1Uew?7x9u(_-%m1&v@EES
z?0TD3MdJPi!_Byvhl@exSO$9-np0*>RKDWQvwTy2bULf^%as6K?pVqW&AL+*{M5Xm
z(m?}Otk?MEHMNn;PZmy8MYoW{!)}cn5o?`j@Ht4K?Zl7IzPuCCM6G=HlH^$TnR2Kh
zJs@V{$Vw_HRAHFb!spGwTNSn`br)@^?ma@l>Q7F3OE#PM<deObv)ruOZg9=5#BWWg
zxbC0gbU$|p#814DTa~ireucWK@_t*9%c-*5jCQA;gk=jZR{7ueH!b&!3hq`uYuGK8
zS~bVAUhdqgFw63&{)qE<Bh0NsU{_m$8@Frv&Dz@+r)cX9i&Xb$rJ+}YI|#K`Y3b;%
z-((KC^#gCq2j0k7{$cK_E9bOlVN1*xdPifu`X@*uu+du)q(dRMoqY4&a~O(Nrj=;!
zDTkedU+?L@b`-wj7H1SL?;!gF5&SRO;a}$knq4T}Y{cu&NpukJJ4bS#tMoPr4F#*Y
zuau++NC}^7u<|w@?Bfw|co=PHS*Pep_>3;%;d&11%=$WJiobzDwS>=q=+&&eoszUK
zo3h(#d#dV`N7B%P$JQh|9S#R-I-?IiAY%iQH#G4ot@Qjp$WnsBc0LDf4!NbI>AOki
zgCWBWJ72#!rYM#<9QQyy`g${6+E&Yph~Q!C^V997EGITSjQl)uciLB9FK!&c2V66+
zP~3R%E3yo@X{n%nUhH?ui}vk#ozQ?j!Wc%uw9*K%+fi-38#nnBGCZ%8v^wj|s%r9~
z+;+iBjbmQ0&ik^u#0c-q?X=HFF@ut~1M=#Pb9rAC)G*YFXJ6I1cXj*t7P>@b+d+}C
zU?${Rju9_+2j;ugKFO9zw*jDq_Ctn?G9{ExWS6&5ki2fKKB$o|Vh_Zd%vK;vZ$Qi5
z-YGRz$-6wkwBs4dnmB*@2hRVU{-a)Q4`7`3Xq&s{I-WK>mA|F(D%B$&I?d#xwmvM<
zAg1mc&QN9*x%<4j9DL#K*ngY2{m-&PgxQH?0-sL8CQQC{0{Qro$GIKSqf{5)QZJL`
zZaygqS7n=Lm~8E!>)3m^YHpL1r<9E7)6yByE2;@`sHzX@Nn*LqeUXMTDH9|_j`;=o
zRQ{EmOS8P+ZVS*`dq3*;)JGm)iN4$5n|F8l%gTB(;E;#x<1N;z!uQ071rDaORln2v
z@7tK2d|VYE!MU_rE99x{_9CPYLs&)%!Of5sfS96Q!vye5+(1Q6zCS`H|D;MZzq+2=
z2$hAKf{?viDMNpxP*eR->Uj&sE?-xXxwnvgi=;(BJ<I)FsONp{C|$C-P^BnCRjh*S
zzcfSt^6md=4_}hAS3tR;*Y);Z^@EV9>rFH^tCIWy;hOQXSiP6<g8Zr$i@`U@JuJ$=
z^Z2*6O+934lNse_sBfNZ(N3F>-VsqP+f(tB;>-QH{BPAZe46y_3XXA)+&4F;Y!Nt_
z(t9=@;)PoLA`oAsTs|fFeTU~m5I_8Jz{fmy7Zi*|kjSw;IhYTK!C!;#a;mF=%2<#X
zlP>lM`I@LVe_uY3OJDs;x^Bzd|0JW>AXYw(w7~U>XJSL)$|-*fI)Nn5&5%T!fdFMe
z2yA*dYY!f0C3d;41X33-efo-MIPQVKgEOvwirjy0gmc%cf-&yLuOC^t{5<crNJxjB
zBN_cx1z!F9WYQU{y8HPW{q<4Lzh;HeGG|qPA_=szt#_IBz9q}lPY1g1?@e%o%k=H(
z3lVM<rwTersb`%zeS9{q_Lyttd6>$du({A%;N_>xyB&8$<NKvC0hoPO0M@+#?NWRE
z0Op~ozIZ(j`gBe^S1LIpUXmkO{TM5qTC!yvD{aw0{jm>xH(viaCz)B(%`5)CnsC(>
z^t%lN_M{Zm{iB8Ai5k+wOLGd2i#+2iUKP!_Qp8qF{Ch|IAs7q<L!y#V9IuvxMKxKp
z_h3+@%SN*+vg62ys3g8~XUB%p>NV1X@G~>}r-OxObO^b#O99ocrR=4`&C-U3^$jmI
z_vpKSn>UZ%KYyyj$95ys<|(<4DhK}PtMUY!_4{rdguaJ5zbROam}`g?yY0GmFIH<#
z501iL2=l4{1y$8Y2I8!KU*(eIQOylFR*FWA=Lth?1TqJxd{2WGYg$~$Hx?vFj_|(s
zeMA<#_Y?`c^ES!H;q2BLm}&UCIu=m3?k_G4qfXD4v>Zk63}XfEatya!kCP^l>O9K+
z$}&Z*+Awyo|MJsxPYj8#9Iwc^i#n?6TTU`iWmgpJRi9@!7+>d*N&0^GmJ}M6AFmr_
zs83MBU73&HQ^oPvAlfG+)Z;GCJt!ZlW6bbL+6SsJeD|Au(cq5OzNMp;eND~!`2gQs
zT-xmSyA_xhtAA7?$vb_y?M(`Jn$=!}74#_?ONW(pY#wj{qczG89XKHmisPHD7>Hix
z3b=30x(F8?=t$U3OaoRujAEp0v*wstOAv+rN0Y|a4|vR04q^lJ0TTFk3tyL+j$yaA
zPCXe?)iijgMrqGm@jd<ItH~sJEt8sBv%zzFzik$B?>=e_&1x(!mjVM`(}{HMg_KSN
zKW2x1YI)~K<ewB?&^cdnpg1TMSFNAxn>bJo6Z@GL6-N*W)Pzm{e~AEk4M_I9h7E*5
z0LS^F<c3RX<syurfayLmI}1~1?$}_M?1}&QlG#2gVd7<^SAwmn#cd1L(*{apGD72h
z%?yhJ^bLkjy;&w(rQ2&~MK&(#31HQ}Q;|xYf3N;nwoL)cgc2*UbsZkIVKF96nT@F`
zWT=T?J>x9odurB%RMy}kZIO{CRA*L{Hn+~2^RX-#Q+OFGLybNUn2=63`*TA|pT*$!
zeB8-^n|-Bm`;wKDDX{GkqN1YuyhHjv6}qFOR@8Gugpk5m+27Et6**J3)@%U;!Fw>u
z!(dL;0tHkqsRgsLz4cw(BQDQeFw>zndEar+aMkExm;=miH)-gdFAUpwQcICCCkicQ
z)3IRjP2-ys_p3)$4v>_h&zB&sEz(Vqck$&GsE!qB@7;&f=BnKD1Gu*pH=4qIMmPG&
z@0AS+qAPl8l1^dM81kCiYrz@F``-9;rKtXab6~hSbcBZ%lY#?Wi?cv76z+Bhnh2p?
zdCEa641X6&an1DXdDLC;S0HNyc@Re!1ugLNUa<2bE@}MNyw`(*yko0_XF};8ZU#SC
zb#+|i7L1s>kKDGD#ot_~=*F==sy~c7BI@30uUy+kK1oTpzuY)}PC>vsmiuZ|{;J&_
zwIjMuz20yP9Hj%oyx-ovj#N45Ag=oio~xmt2vNY+98<_deW!#YCQ6<-nclA4=GikD
zg!^QW?@2A#mfk*!(0-IMe0hKwBSB)}so-+A){ajvNdBn4b!%4=%hqGhXo1L2eMuKQ
zS?g^;>}*j2(%8S;RS@*BvR+UNt-85BLssJrL_7Bn2yx_%=}<}@dBRY(YT@Fs4|OVc
z-bRru3q5()c@1g9)9RJ))o`k*)GkAO^LYB&9V&q+Bzi?iGnDGTX4wC;YW^3Qa5bA0
zVqeA^rN$z<c|&W&cVCyFP})r*Sc0`q2@5d@qDVgL&PlJ$9Hp~-hccstGIJ&(kW7ot
zIY|O!7TezIM>P>){4qN~HeyT?>6Minc=C_g=;=(7FZBBBGAac(W?#soxO+Vw)?HvW
zUw9Ln5I{)|UDNmy9tDo}B!YzA=+ug5k$6o6TQBs8Yeon+nPy?_F%|@y**=5wS6|TA
z)UASQieypKWu_%I_4`s6iB93=+f6K}%`sY?7WlL(R0ii8FLsFRw2**6m_)~De)hRa
zjg?c!u+MC~&l!&6DIpt^kM=D&WpZyX7`n=SIIP(LB0z872FoW6V)q}9mqhL19dIyc
zc-fq{h)R*B1#g;?x8hEy-dpvzZOumA8b;&Pj?Wb-k)vn*Is|Ei5oY^ttC}v0EQM;-
zHw`CNK5wq4Xe1pU9IQ;`BsYz|1oiMu;X$p=KLUTSy8S-Sajm!j8X`HMwb=1G=<eh=
z5ZoFKQvsiNHT=}Br|y_f$rZDez)(gYp7POza%a3@#H3WsoTr0;EoWlW_JvZ)v^1JU
ze@JUPQhln88L33=(koxyM-tn8Piap*vR(VNh`UKCDxrMKBoLmar4fak*$DRbQvNS5
z_z%r4C@MfsomB9pJ~E-gSoRquwG@+~Dl^ITu2AajTv<BLV5w<}fmNp^6dhQLk@YP-
zJ83RsE^(LHm+jQ<Z_eTUX1;aq1GjVwJQW-tYFLo%X3TIhP!+N)sl;j?{n8PX{P|Xo
z6?Z2$u_vlpq5*~Uj3+uIJ%pk;mWkF49n!#<`J#8Ars-|f(wVE-Idzp4W|<c2xaf@1
z0kLabz122xY;*$Qc2kT)DzIAC&+E!&d%2lu+oi>u_s^uI)i(t_)%f;#=@a^HrSD9s
z9RH^F{}37f2Q1Z#2Xh3xpBD$(WMj|jR1I%j;%SAq*WTV=_}6CvkkRk(8YJb(#(HA<
zCVUY+GSmIn&aw~ecfAWvyIF@2C=;x?l`*${mU^%y<TRF1mZrwcdNrQXt;OzC{G$-9
zoZ6a=_4x%t-+WFQv_9#qpejTkNv^2=U`hlVbFn3{YU+phZ=V6wNwEFZc>ft9l)}2{
z*wXLb(2<wopyOX)R;Y_$%x(?jRy(5{$ju9f-=aLB-kh-fV+dC6-QcX9_@F|)$4@`>
zzTTW#E}q%hZ-z8}KvsV<Z#J5sq`WXNlu(ibHsWOL;8FrCn+;?u_bAjpyu5|FT@?O{
zLH(ZwE9HL`|7pgmNINv_{bDvyDjxx0vTy+Wl_Y&L00S}eW)d?>cS?}ll+(KSIJ5}j
zj3g;YV>VR$IHZ3iX(*eSD%I4hz<wrw97m?vRA5G#RnTl9Y(f3OJjnnB6b(}^w{bS;
z36j+aoBXQDZajp{Si$5q&nx1R?-629>}%O4gix3ch}IliSQmod>x~rht;Er*C0{6$
z<y`s>7?FO|rDQ&2B*jRLnz7=89x#5h<MXEoYWQ%HI23=coY)GgX%@b~d10Pa*7hY&
z1a6yCg*w{q)R*xMCUgu_1ZE}hy(P6xWfQaEN6w<BhR1kX#HTm8r#FjD{@mI558m@%
zIrb=r{Q)+<uBjsbmfTXUX<bx&TP?C}d*hPISz>X7Jl886yO~!bls>K|&2Hmz_DP?^
zr1CB2&6ZhP^Lf)b0VkVBr~Ju`sO-!8-#y;!j<-}a)ibMGaWKoc70x<(Qtu>Xki2hM
zyIbe&=>#eo+Usid$M#IXET+QKgj5h6OtJgG0jjg=YzWnPz4SFEH&2t(wDB__%NBcM
zs*=yBDK`_VaV8L#TiL`aRS+)7YWPqnF0k546L+T|RJO~!R<?G~iX8$@0oxYFVfbzJ
zUWLlPHL67V^Jlr^<cBIb*z=-@VMIm|G*m<w5Kz+<U5(Eunm#z)HtCh?PogpZTH>(I
zgPnVfZm#SRYuo^$`o%wt{xDLf1p7djcM4qDII{(6B9Q1?(2@3?nEOWS_nR+!ESK?t
zyojQ-*5a2C5Z~$wnrv?6nXqd9R#1ARX}*(+C6NKau`5q_^rG1dF3BWedK@fuT2p6g
zi5)R%Su1fQYS<5Hqrg2G*&kbZm76m%W9)cbahyXX+i#YQ9b&Q#_pmyWfJOH!PqR+_
zjy#w6*x~J9(#7oI3?ZJ@euxi2JKEwNcEhJ6y+r`Ng*_#FG`fM$;2~bOWJ#Rxa^}^l
zmr0#*_mwdjx4u&E)T1)zhBB2RGv>y}=PrPwd<e?m{8*u@i9fdz^-hem(wRL%a!eI3
z^7o1_1juDIJe<^Re=X)!q^1ccD^Vy6*#C?kFw%d;xN<?_9;|8hUK+}1C4ZOgxeyZe
z;YF!jpw47M6uxg}AybzzMdg(DaQj}7=_;LbN6NUC((vZ_g`{7d;Q<p^Wkdx%MeeRs
zMm>y0X+~bKcic~Y6W2xUBzRN%gQ+fAUn{+Q@S;E*_+qo?YKk(Ofn;iLQ})*#$|=**
z=rv92?%s^}#ANcw)jcBaJ^DKCqRq>sv3jLF<3dK{RWr?}!QcZGLPB(eTY|`ahTT$z
zF2_^zx%U?~ZZOO@|3UNk_WT|v>@L{AVgCcnP@nE5@AcP4B#~N~Y&1XY9)_2VeF1&}
z{M*93?wB%GD9QL*_>aj&)x$GxUz`P{uS;S`om%;^`l2F|s~XC~_JhkW_k+uo_dI!?
zYcX}S6ubWMB(*6->eNU3lxc{ms5+5w{Fzf97zDHl?{0(%p2&GTtr!>jsJKd{A7g;D
zj^^WkB<!l8vgS1b7nUsFO%-k%+Es8{H8WFM&q1RM6iV*`W>ofsR*8eK%W_d=>O4u&
z`y7Ehx4y75vlHcsl6icp*B48^PCZuYI;mj>f5Pagctz-ji@d<qD_2Q>Xgo)y(@iGm
z4aY&QPiqb8gM@eEf#{Q&XW|l!kGR-<K8kcb7$B8WL4)z3Tfx$U^vGr(281tDR!+4n
z3rpDS@C0kWJuEI(<nrmOtf=kO>5ggI<G&Wj{%AK}B;E)Bv>s1ce?2y)JR#lwEq7s6
z>JD?k@BOIOin})91)uC77Qt~3^}4b~yJ(aw_vj0h773|b2I<EuMuzNoq^^EaXH(f>
zJa@j8NRxbbwex~QjW#l3q#-&6A5BQfKgK-uQuPuCPAzxB`j1$3KQd9Q^{IHC%_zm$
zKuBdG^Y~CQRnS^}{nZeyIrM#O-SBmTVKLQ-koImEd83zz@``upH^X1e9RGCnkxYuX
z5$eL{^x7X}#(0_SgNsG3F^(j3@88b^OPryCo^^S8BVeK7>-on`pT!HH{f_FvyitVP
z90eceC2Sshg`1C=r;iuNrCd7?Hl~1AdgsoS$M08Mni!IlZG!?(YUt}noSA*!8++uX
z$WlI9)D*#g5(zugWiL=+c`Lqj_~-y`%#mdX^lSKmP?ff3ukI$_*k4Wa|8&TzDhl;y
z_v3}{|M8qU@s_e@n)v;~<0Jjth5eiyy!~B^UoE(p%ioFTrR9<O2<@9;zr)QkidFix
zi`I2EEB<)k%cJ$}K8feWOkJ~)j;hrije#%=7>1b#GcP=_pkZ2IUS=j~g4o>Hcw>e-
zePjA8a_TcF!;YGDz{Sly`wNPurF`e`5!R3^XAYLR_a%#P^~VG%uK}T?6ScDAqJfZa
zVQP%0JsVMLc#?_Ot|4K+e}8hydI63-pL#<KNf0P>JwGQDUvgVu6<+>f*<55ru4#+{
ze98i)<L8&1nhk=t_I~%y_+M+e|C#bILQ|GCG0z9`*$2HjM;Xbd`qA#QD7jQu(Q%tx
zS?(FjQqH&-7=dc|(wNM6+p0rqwd<^jlN5}rgI>nRu-8a3{+uEh$#mFV$;ib$SKkrA
zrkd`*&YH(++KM<x8bpdNyJajSWU(n*fBPO5JdPa7%6x^RnEPn!V9~eBzVP{UA7b_e
zV#kb1|5>LOUyTP!hNI?l#@yLpqaa0MKl629=$tVr6Dm;r1{dyWXC50TnDCuHMLKX>
zDS2h50p9k)MbMS6DRzU7IJ3@Qi>dJN+pB2Vh|2CF;de8u-g&=MWcVD;&0W?%?4^&e
zG-D<FsVDihr<{Cq1OD-k)FWO;S!>U`cUP8{d^P?sVX;1(jU(-2%>FQYFDzcjN|kWm
zIy_;xS^L52#icjXJRdC4uPvCKIPpbWaGO@<l~bic&1w@B`&ik1d@hWJ`v!oSGP8MI
z$%BIW(?<sf-70OPT%tTupl3<q(8>iI9jIan-uW}3U|(F@W8<;j`~H*mep!%~({7=u
zsyS72rg#^%D}<nUVmmV_Q#$Ag1d6-$PhtqH6Fv;I+N*K^xk7;aIAdDK_qGN!kv)86
z_}M8i^el;~>OVt^u1D~Lj{?jvuBeDr0W((`z9oAg(f__DVdhLQn)6QgYn80}&dUvb
zBhxc(ak6wOJJ<tr)X-V1t7!YtHVc!z-hxdKV8LZnDA6xl&Rxs`;?-Y|b$>)$uPbRz
zVoL9k`cm#I5lveTMXEMxRh8M7^YlR0jx}>`aeMya8HuNbYvYGARK*<*d+D!s+4Q<y
zP%c&LGpDViA0Tv`P`GctYV^=YCP|r}2AX>=G3&D2z9f2F=cQ`a7;Dy0-m&d1=LYtY
zJ4b_83!WGK_CepCkL7k4_cd;QGVNESe0{aO#W#3x%#y5iL%wIVr%g!BE4h-vQVzvP
z-GDLa<WvlyQdiQr@j}`v;FEOE(Ows(tug_h8M7kXx4(V;3GlfJQdx*tIwx=Bm*D$C
z3yzCpgjf}-9O3QoJ?S*)<OcOR*tcMvUOd0;Yp0n}HlH?_-hzc=cv_Dp_oDcABhML)
z$2z2SjE9W<*sPh-8@b7fNm=~+LX`vJ?fs2{>}gKtN~;awP7TZFLs`BBcjhfGX{|S;
zH)C?PEIwx_erFtcRXi5P`uORtkLYmIF)^RAfC2K@ECPAco>U)fLCGuOxNZA>L|=*N
zyUfVVG-moQtzVxT1PNgjzuG=lqDRryxg%fItK*HCWw8<IVFXWJUF+DuP47_e!TPqa
z#fKT2{HNVMOVRcx%?obCifE2EBw7jJjX$mtg3S;l6vwqj3YCl-p(4m5dxTga*YYJE
z&wMg}3cluNyR1Gs&}~^o&c_P3q2Lo0g)j;^06fXRyMYVJf4KU{d}a7?#XB&641^}v
z&u0jiqD?l(E6fO{Zcu^5c67`>RvXToor|9BV`YtX*}hJ1xDK!O{&gf1;l}wi5VXkx
zpHq`-j&My?Ys6Vh`$3}8lP~c;>6TqgI>*aZ$NxI%RQb94E#XhNEvR*Qo~vJ0`yD>E
zaTz4F`$ll%%iX>eisO>oQVz-Iq_~d=r`}GE<b?1>`Ua}NsBr|Sl6wTX*_czrm#o#@
zyCvHFOwnLJ-NLR!S9f~YaColsrps?NJ@A$*_EmyuK#eSHX3;He=9kAX236ykce<GJ
z(mT$G3l71u<A)S8ghn~Y_i$gDHBT3LHkg!X6u4%9e95ok0_nhXXv(~Y&k;rTE;N>j
zFK~Kb`NJAeqS6DUcs^F3jZ8Dy;Jdr^IFy@An{#3zA##ixY*by!zHI%7S-D5Tuz<s9
z!{Jly>Pz9uGZ#hNUkhe?pFE#dYWhTZTQwwlEx*Z1?NJy>Q-^T1#^ROJ;h!G}l1A?A
zHkrK`Djz&pRz3{Yg%hPH{_E8Hsi*2BMsAkZ<?9&9iq(xj&tgnBzNyjoee{g1e9e9L
zsjUx^#-Ad>Cf|BW4xD#C%j$Wa$d6zNS6gqSk_bRib4n4-LHYKP76rl%%JYN*@gps7
zHIm?n`$TD)Q<cgfOg}tNiGV|LPzersB?~~RN^$KKUr75%Mb=*0SSpw`R7bki6kb90
z@o0cHPpeJ6X|lfKU4h;W%imqN8q<J^jl?`dwvIuZ`+BVVSw*QcfnU!PKyFP(yeDB+
zeI1aXw#Qo&w13aP7~lRP=7s?v!pC@Nh)XI#roIxTqRJN-t4;Uj*6Wv}x%IB#DQ_h_
zneP^VCdRZjU!>X~$Z3hoO$kw~_zE^-XD55^RyM6}=P1U5r~iq7e-4pPdiqMC7y_@@
z3lG^+mJf!^<5$0Wd%qvoq>TD5A?Vwa^eBEY*8QbmPOg1eIXT$g;CI}cc0Hfyt>br>
z`?oqGC`nDSwcG&JNfn{F&$Pn7L?erfR@A%j63Vw*!tO+;i<2F@+nx5k>=g4xrq}5>
zT~3Hi!?HqY1Ay1&Dp{Hw7p~u9ws?8uTq;&6pjnT{G|Vxm&dtE)+R$!A(PX$X_)LvT
zpqpsf7J@5^12Rz_kco}DU;|<<j=21wM#(Qs5(iun!9vvey+Gg4_sH&US{DrWi&8*;
z%(2t<__{uG@kuOEq5H+T!WdD$QWE*dSC_^6oKdi|IaJ2;-R`?@A?=-qYbpLLGWDb;
z&d8_^POpwia`+?u@Y{?DD^=3r*d0ak48xVd{kEa<$1NIBo1ZHkC(iu-Sx5c6)RQL^
z^fc+O?_|=^arhVXYcg7wN_WwJBq*XctQZSmn^`bC-Pie~e7}?VmejhpYO99|%%14x
z@Q{<-Ni|)1M)~^K<yX&4E1U(-M>m_gaS1JcU-R5!K(0&8>ZA<g4R!2Z$L3!kr`PzV
zKGsf|Qqf(G$th-9o{aYX=xzt`$rbU*g)FSR^Fv*7=|{F*nTTW}46bkpp$9u@Ci!Ms
z=N3P7!j_85Nhz}nv}`2{@$;*~<45&|Vgs}>qO>_%cagbImh+MRlYA|57j-#|Z%6tM
z*tW^>*_k&gd~t@XH<Tq=j!WKSLV66IB_Z_;sXAI)=YlSa(-YXzTsHDtbB^zIH#SjL
z&9Vzg!0=`kW7hXJV6JSU44Vau?ww1>mbnGBxl)yFyzy#Wk{!>aM&|ZX3N`7m6K5{n
zczlH9KmIazi*&VgHSJa!%s-`;I?#Vu+rw=3<I1i~*A{f$G$S%4QYCLDYG0<ry6Log
z%WwuATwKFF&OFQOk)v|DSFXl;@PnBXbh~egFf*5b?{Pw2N_Si9;Q)8y5i7}W1+JJB
zQDOM=u@<C1M!88YW5J9~=4)#gN{!iEzMD;8Zwy+)nlG$o1D7TtEyYwtm!_v{h1%NM
z*(!K$DYBF<z3Rz$wDonIN*5AaJ?okucB*BdzmT40RsyB(-2HB4PC`$%`e`7nY${G<
zJbZ67arD(h&*XWIpWZIf5>;7O?0;Xa+Gu7Q4PJiTTPgJk;~_iHsC9BrP2XT}ICyKV
zSj<`U8hb$ouH(YGUGlh@tgV8}Y1@{$i07JAEIfI-B`b{a?r_9|9E7JlLbJ4T#Ic0z
zoYB3)C4YjTttneYkudFYeKfprPXy`DR2dhb4a@i7Qn5B=Q`UXWJ<Q=@$T>QGCpp19
z;E1oZ)=2+`m~L$U__{&Cy18DwTRW{@cR1R``H8Lj-kq9FrbPp%=`?@C6ztIby*;ts
ziMp`(_e1C5ecer(m6kV)3s#p9PifaFesRWUE@j7_xX41K2hpRcq!SD!G_`5zuMH}1
z-%8v{G1Ga~Ex<a5nc=OiQ?40&j)~ni3YpYj$(tw}eaodcKIxz(RY4eghkE4Q_of@|
z)5lFJyQe!B@m})b#jY=MyJ#h0kuOnSx>V{Ut_afvUO;*zaF8C`VDaT|Pv?oRv~R|m
zn5=%AwWWsbLt<4dTN>8mAD%+voaGm|W$0M#6b6U4cap>qh&6ZSq22YQN&`B>7F~&S
zh)06RFCF(xuK^j$!1W@J&UUhRjIj@7G`c@Uew!~tyGuIp3zzQvK->Vbotq)g;J(p>
zfV-wLkC|q`l<czNw@5om(qk3+gFXG~!WokLL$>Ji8@+Z7GPObS5q4)}(u7i&ORWbw
zbcT!3v<~KD-swZ0%Tt|FiQbJJYE4|m^H`m;T#>6H|1q5YeF}5@qza<cAxK>AQkn+6
z;XFS>ITI{}Gy}I4IN3#UrnmH%rn4=N=(+dkZE*%>Y@A*e3Ww|-j6``mYk6g_7LLQE
zdJVA#Y8ATn*)7(s2Eu5*GKmaF+gY*I2CWPKIhOzZ=4X1J1Am<VAlN!%%Q?!=V9Le<
z%l9Xw(+i<32lo3i`;|+s`*GYb@}sooW3iT_S$<Ac(KFO_zI-;Rg;ru!;yre8PJ}7Y
z&Shn@JU>1ZRbbpcg%SVgz3~L56ZhQIkKRvAp$+rmCn@FMmoZ04igA4&-W%Kcz`&Hv
zXD<(0V}olBVd%Zc7z{lB=E!QWUO!V4m3v}oPP^4fSo<d{YHM?m`A+(a-7z>G2AB2?
z!YEpxVs6mi&{4^J6?Gc*eQ<OMuCRBz=H@&+5axZixI##({=}i?=spI*bBlL{fx-L*
z6*{5#H(FJ7{?U)MNPlsS0>W?9cBTd1l6oLcW-0V>WvQ6AItf_#xMOyI_2lO6cs+xo
z<hYv&mQQC#b+r?RSd`DO!wnYu@PZ{Vq}F`)R{reJeR{)|D--%cg|buOhuUQnR0z&p
zvYpTC$elz%5H_rO$bcBVBttup5z4hj5?XhZ(R9n>owimsiZ9@Lh1c`FZwaB*vaIfi
z!8D4}^9;v)TM6S0yNRCLX`6AH0!~f)ja>St3btF!v!uq~tk7lP!e64iH%4^5f9<B=
zmaVA1>!s&WP&(N)bdm3NI|@^xEoqdFomeB2=`1exSm~aC>#<1?+ILo`>b|w;w5$ZU
zXWlNwyzCwCH^3oy_OfV~jW!p0;`O-8-n7Sf69zlkPxJbeuj`pFJW5%7_N(GU^pq-X
z(uBt%^qnEGd0P?C8ZEnq2y}m1?%0N%if3}wi!XBe0}a;uA}oVn<fM|qkRGyyE)W_m
zS7DmrGHKf7uJ;g{Hhy6m1u@|qoQ;|(n@oFy++at8+*+EPR0?=X`WLx5HI9ufmvv%N
z7j0^iLd-^c%pkd((zsybGoV_`1+@n+<3{!7{YB<O<F?vn8zEO3_Zx>dj}P3OhhTR{
z>BiGF^%R^~2F~lJtNg5v{>$lIs7c7QY4;?gT`iKmsI(+$m$A|2Y^@e%oL<7G3kOc@
znsd=Q%vd1jW#>Di;cPNKcg);m2~Sd11;5CZ&Bi1BTQrywhFQ1A%{bqcOR3R!sZXPs
z3EQ>GoTJgh5dlT(QO%kYu!~4-?R@|AtS?<;MtXh5g)AlP3nBb>Y7p`J@vTqwieEGG
zn&rKtSuW#EFl@H5gMU_L*|~{FODtZYLVMIsFjCtlEm<nhjx0PF$8L|*rAJk`I6LQ-
zcBRw03ZtrNe-0N8PRy2bgzg#b-V;Kh8RzKpJDj7Ccxa7MYonZgCon#)<(Yiq*`kVA
zq)!kP@vg=?payQvpI<|3nBE}$%~7vY+a{0BLu>lX;WNvsN`<bF@l8Uyu=6+YCFFr_
z-MLVmn05OH*muv7{`ne#ThF5bX23NBNi;4<`Z(j0jc*I>?4l~>rt0QgbgIsg=eZ^g
z*DTr~Dz>Lr@)mh*W0s$zw)W1Ky}ms#ndRlE<1g`Eq)-wqQlyDGG%yWGOFJrYi-wN|
z+zS?M&Dj=oV^dvMmg$fy8dfRr#?LQJglQbpH7g!vT8<%H(lr-{I~MruacNNRQv}kF
ztXZvcLHO;ryu#}~J4)%LNdHPV^JLiz%QI%lUhChSk^Y-}Z4u9(pqk`%AMmQR?SFX*
ztw~c(W?|LeS!4k3R>n&!RB1_r+Ih%1ipm0}L%4H}QeojUTFhB!-5OR{9OEE<VR(P-
zUG9-|@f``4<(W_Ioty>9*r8sWw%qQey^Rgm$V$6lOT37(_C~W6+i8E44SldGCg_ke
zJ>rW~x<S(OtkMuxd70J$w~kDuu1K|{VM-I-o1FOa(DP;0x<)#BLMO$P?X;z7-Tt7g
zaP_=&G<=53u;rs}&46H|TyvIi8E!zJvFo`ifpM{fuXC{*wJFCr>b=<3HCDOS2;DZ#
zWK#>>j85qN5<#ZL0N;5}6Q6d+D=9X$-W8jq$287M-ZZ;qh+pW5npo0ko^H04=N*k0
z?VT8`TiS^idE|<R^)CvxS*bagPkQcO^`eLLLnJX90oTjcnXvI%^c@zzMhus37^$3%
zDd#e1Ua;p!i$iFa*J_c|l4I3O$mxMfX%%@RkU(tr6q98p?gm1f_frgZ=0uPlg0&sc
z8r5K^v`o=8cH!~x+$ebClT0ND4X?}_ZmLHtGW+6T0b6a#$rlkt+1Ztq*!b}2O?gBD
zu34^ZI~ZAVX&EKhCa15S-#)N7W0hq))YYeL2pc?8+ta%Kp;pDV#YSNc8-tBe^;wuJ
zNL?jtFt>QYXXqd@Zl9{nSF4tE_-7Iv+WV{;xpwkdvC<{dWkZKoym%rY-AdmqW20Rz
z<Dr!)s-AZth0U9>iE3ub5CHZbYHyVr%;TYLPd}UEX2Chi0pge<7+Ovk>Bxt*cD?5k
zZ0I7SMkyP{rJAo(W!QqJCiHxf^GXd*om`Jr77!*3O|_AQ1-Laj0FaAriy*Van#<c2
zrlb4rTXVBc(k=I4DmwbUdTd$aQPi%hV_l=qTFY#U&$MUw;2J7{uO_>h>$v47VWwk(
ziM0b~*efyRzY>`?uO%p32s6U=HE_)@9eSeVR9C9{KP83_8SHC3#iiZZbp$#0g&C)@
zDm!Ev^QerO!`^@kQdSsUc8xZt!m@hyy#a@hT_v<;-%E>gw9G6y0Rb~*TY#Zb>!}T2
zrQ+52*J01EUtbKdO%|vsu`)<b$cJ^aZKU20Rmpey`WY%c8<LMKsVwC;U@goSb!J<b
z9>h*ZqMq99_f*sdv5nS{smx4-*EIHk+q_#jh^;8t@}hCDTdvjEU*2fva(K8J=Qxxn
zY-~2?*j^FezM66Qm(2=DS*4iAq#5yR-KD9q0bAjL5E|58AynrNN4sRNchJah;RCxy
zoW`qB(3+#0=8ase8kq_EX=cfMRcZG__4><?WmR5=c(oF^pz*q<Y((hOE?85VX6gK(
zdwbICorjNlkA_IPCjG7xq#7^#A&ilQRYW-LNIGn!dw3xsAL%1F`l?#k88Mjji8KS>
zV^)XrsBdFhH$^*yI*pTwIZ@k0VFqYcCC4Xkcs6qQi7r?~+i_YOVoNf2gKW3UGL*NA
z!~1=fwW19)_bNmn?{>48Y-5>1?Jhc6*JQ?T<DNDSqu$|@7J3rAM9Y$Bh;XHs?wC(M
zzCX+URIBen%AT7iU6}O5yFQ-f0><){R;?m<tL&zga2`B*;Ft>*zuk5YSw5ehKaWj6
zVl~j>lg7UQdQHQnIA{%ZMf7&6%eH^8xdy+nOQb?ZN8c$j+eNb(S{PXTbLb{!_bBSh
zj@*RhulM)G78nvD4;uDmNdTN%`Gnz@VUeFQ*(WrJrpZN^z8I*p1vy)4sLWe<ea9hs
z0~V8NZs&DLZPPV6DY$lGqghkuW8M@#lOQp!!!VA}v#56s;alKuq-)8{h#UEuQ6yc_
zE<lBT$;!9A5L@tQWC3vW0Mfpw3#2a_E%Cz^76`YUITX`AJ;1bq+aXPm>gQVEJD1hy
z>*5juiq;TtHkJ(Cn&*?|Y!$_#v<~;UXmd(pxxqJu(L3NXk^byVND5e7^}M|=qL__#
z+=`UdLu2HHEvIo8ZORVUSM&`+xRNl9os5_ogRyr-Y=E0j*tDGnA}h3c%f>(qB8Bi_
zR|^%?jY-85xGK8Y$M`e#VpAtuffexKhBKaavkqD!cGhs;{$yjv_dMAbdOy9*$ri$V
zc~XqBlPPD^N$d`iCpv{|Ai@f*eQx=^i#w?vx0__*>fCGs*5A!?`E_ek<6@~Q8GQUg
zFwp^AF8lg?bJ>q5*sSOo9^(38nyc<4o2w16l57Y6jIkxz+?=K_Kpf!Rs|4UMopxos
z%x}G(Mu^_^MeCzdL$v|spM|x^?e=j191<UqIvMW*5X{jg2#pvh-Mji>sZ{wx?(WG$
ztsM!Mzal<&sXy0xs)ZkAgIvGhpvFbp?U=sOph2#gnSe#y*OOkEQgT%=E8_oX=%F3T
zS_~XuF-sT6>N>r!2%qVr+5ia@2cMpq2#!BhJB9Jy<EUp?l^EY~$#XIL*8Q|LK7O%~
zeZj`-+mrn(jjN39^tFt&yy){e9aj%M``ZWEoyyZnOpSGITnXY_S?SclO^nsRcqnd>
z;k!N{Rp>jK+vIi;h}YbxKh%;E>?{_~X`e@aE8rG(mdv$u-uIgYVV#y#T7vwh6KY{P
zv;2t*gR7LGbwEUV45=%*X0~);j)$tD`$VF9To$Z0Ugo}9S+TWJGg%W)=$EO88DJ^*
zxo_2TO}3}}om8lyRa1Wp>b%U1OiNuq7c*0i-K<5Xxzh_Q)1e8@Gnbg)cDto3zMMqW
zs4<w&EYI3~=_2fPRDk-ja2Ld<nMOLwz6&+!xOrBa0qLK@8Xa(b?kltgRUkYDYr=wC
z*sd29Uf*h0FrC@sl0=rsc6V@T!Dn>*n8N}ds%NsumV^O+O>zj`Y_U!DG9JVV6EfMA
z4d0~VOLscVW5vWo^Vo=l&&$@OJ{HV(|Ah>fnlW9!Yvv*^uqu`xw5%UWe+|&aZWlVD
zORU#a8ETv8Jy*IPvRWO|-x#&g|M6Jl8+uWV3G{kT7kB5-jB&0l3=l$oHHuseT(VXP
zvc%3}^Xyb*i5=l6(qD41F#?@fCahNRD7Uc-yEX|xmOB8Q^n7$kk6q_UaD&cRbGGCi
z8IMb#b2JP~E!;2eB)Y)b94Pi(CJ;iqe`O_~+NJ`(cVav@szyEELvvpvZv9JEd7fo=
zaP9kYkEO08&;e;tFMMy(#v1nR3<vwSWa3AMc5hMV_0{@L3YKw<KHgYm5`1_vqwCjn
z8VFF&0C0d5h_|5zuT!aaKJh|mTwNjSTg$;v+`daUTh=^qt`>lizH5LxsQU7-w6rQc
z7P-S?TP8!hoa_jpnN9DS#Xr|!FnkrT_4OsRe2?F*RZglPEVUkygA8!n9Rqgi6Ks|9
zDg{_^F%Vj#qrr=Ml_#o(M$C#MONL=@XpsKH#aBC7gSRF@KBs$=36&m#?K}j{*75++
zAkhL%awlcl(-U&6_`M4W)x915Mf1CoFI#Gtid!B=fv}v+M-3sd7Yt={!fda-_t)0*
znu|dcKLBU#Waw1e6s^$wuz8`Gd(!GqNMd*H;K)5zLGjrI0C#%_T0^VSEa#OSA8`F$
zSb%ofZ)&LTO)yZ-%c+EE+O7-GI%svzxYyGkc*41&t#{BGjB-O4b_qKV-F{he2l<HL
zC`>~Rx~bjDXtSAZK|Wy`24L)Zyb^|;cA3krNv;{Be0~jHt0xH&0oV76pfzvt=4>)K
zY@NeOHF|Ara&hX!2v5R_fNpE?HMD%gRj@T;ct5m3PU>3>w1zPSRE>|i+lqC5Fo&%W
zCE@iA5E$L_u`KHiT3q@LP!nY_V^vQdPmsDX{W}xu^IjuEFIo#r3t5E#5SPHTqP*|b
ze6E{zFf(y0KPcnB>LRnnD6Uk!S<)(1&G%OtLUN-ns^zcTg@99irjaC*Dk~$X)`o8_
zgwP;dKp%Z)kRR!>H{B#m!*xw~y(kH%W&~(fne}4mL6w^Tb-kq@D!2ZkG569$LRuu*
zeLuiARYnGEZ8bvG+wOSgalJ1uKyp*1?@7}-sI#k8Jjt@85gUFTP=xU2H2(A%puw^$
zYCbT-*W4;@FAG#m+4jatpfzYw2n|8&H=lh^q<=RKT8?>VnO{!}C{99Fh0{8jm<!7~
z$m7fN#qse7I;+vNR=?H@-Yn!!h>e#?=JGmug&#fbJ=dfT%Mp*OhAzFLE^l{vQiT=+
z@A%}kt6@~Y^_;L&l_|OwXw9%zz}EXf$T)9v36Zc0bn64n$71<!YIb%OZD(Rwk^Zvf
zztJuifT}$_WyV&~-6AKI4{BLT<Gq6PbySQqeg;~@jDzYt?rw=tNwfqo5U?q8dK?AC
z^We#w{Gd#<tUzmmK^q^M)>Y5w<vc3NYR_e3ujEzgmd(8UbfdpDYQ-_T#irRpUVLwE
zAqU4-#H*LG<NUeO+H$CUPnmsms5L)O265AUX`x5ONHwnA4UYfjoSE4sq<9!*vOy0h
z6Ul8}gYv#i8V69D+qqVoTA*_KtK>N7_OW;J$zE*tBije`Fy<7^0nnP9U?@S{Kx;cj
zp7Y(lxIz82?DDMn%VsBL_zIrjgz;L@Sp7Cu;1{}|F`Lj4Kfq1ZC*ddwd6U!G|7P@k
zFtuw_wd+{k(4?P=$?#|cRVZGrMh3^_8e(+{Eq3eBYn3NKUmCnuIYZz&0~9LvDjzF*
zqR;dW3ZPuO1z`DZ)mnkU<C7mZb~l!5%UVC?cgj{Ic_JLWd5sO0Q)y)#`liPCE#q>w
zBFAYPuN_9s-ADwrLqiWb+p+Vd3;i;7#%RB%K`zcimvi)S+MSDDfF%gR2-D1`-OCr7
z1)MS?DVA<{%629Kecw7`ajn1F-2R4ZB^%1k*?2M_SiE*==WcLDAY#!ggC>WAMZ0J(
z;@kVy1p(bhOZ?-Dx0f+3@8VKueigL`ok~8)_5H?v0tCQhEq7pq;orI{kpAyO0B;v0
zDL}iN<&rG37K{Uokk4h><!L}%4W-=~^ajn=dtPChNlpGy=Sk3pYNc)x=O6K0_Rjda
zi5JFe%c#-`i`qf6nv98Nvj`Ne9b88nO_x)!oil)Ti{OIs^C?#v_sxKr59SR<Vmw9u
zu(LX`t8KIq8qGI%#@EgX0{_;XE==R9VdI8{m2%K7M}e|~*9>smgn^6E$Q)XD20-C9
z9!zhXY&eZgK!c2JjZmr6eG{-%YnD7T1SWFfv_+NqIa85-_|XzQCZ@odqVDt)<ZUA~
zXcn)BLn9Fu^TXKP`?HRjE1O}ho1>C~EHYg|b#t?m2`o9;tM6Ky7$Rl^bD<e==)YrS
zE~9RGNz>_kbi+vo!}1bjz!oM@m_`s5l}f#BHEGuPB-l!r29r*oV+rRRZNkDq%5p^d
z3wrU>I`q;2o^)}m35qL}dtCLMEXxAd(W}kkV&baiX{Ayikx_iL(cG|BEwG*r6WD-)
z?fZk|1tjt^Z|&42vkALof`BJ9qazn(a8hUyRzA$o8@9v%o|+v-tPke{wkp12eHU6o
z4tP(W!pMN46~J=oGzqWo?seEzaL_tfhE0d|pxPdS=F0PSGE3!j)W5E?QHz?MiRpc)
zCy}3>a8^L07DSk0hb^54Xf~ZIF2Z&Q?FxEb((j!&rn-Cn#kl?)fF%J>WnQ!SidY|c
zk=8Bg-Q*5%VbNOYB>cDndlX;m$x!MLK7Qf#sgitTZuUDxX<Ois(+t|3WrclKfE@Wa
zFsL3RGn6_%7U(|TKAYm%@}^EBlUla%=7tpu*1PT~&8lvF9;(YbB0%%-6Q#^{>DsKy
z#5FGsg;<-usK>Jd(7jW>vr2?Xtocqx?2xhza`4|ppn4{&j6mXJC7IsPl)DJNlk|$?
z5)jXgF$iJple%tZ3nwhmOS+gUIoRP9TKqjr(rx#@N50=IztVbr2V@rf;hDwAl0>EI
zP5;9TMy67v$4qnDo#xE@q0|B`fz9@MXOO7_8DCBo@`tsbI1}@sDvj5U4@{~;-DXzC
z0*ilGqJM)cRx7f8xTC+<&4?Gx64xp7y`y><@Rx2bQfP6Uxyk;aI0@9Qa&!H<ELS>6
zn&`TFi+_;(<EjNddxb$}y_YL*wK5m;#q7p!nF+-8y#xSgKxMJjN?Pje>eA$IGP>}a
zX6|MA%garg`i3moI(wUSE0w+l+3T-s&EvzLLQKq5eI6cm1&^yDh{FetFbeQUHz!{!
ziNZX@3WeKK460p{b0Xp?RUlyiBJtEJr4nI>Uu{uwWfv7$l-S1;`KrD*1Gq_*rd6wo
zY;SqTowj<KpL^nu3#m!~UuUh|xz;_XpYP8riEJk?PLE|Y+)o8LQc~juc2l85V|LT)
z?4@kTF&RWNwx5ZqJ1kaDa&L72bdLShXbT&gy0wr@3w`HhztGf>u(rwzmgMxcjj$`j
z?h=?{gr|?|!~DaV$4fUxQ~`A$P1zcvoj;e?T(CI(8Gs8H2Ccpg=H4eqd;=(U?Y~L4
z@Z<cbu^*`=(U26Tcz<)(P@C44WqGG27*~&Io~GD-I!#Ump}VPaH}pMq=|@lb**QN=
zFc!=`bS!wL_YDZl%+&MjVES|V%ysu=wNl4Y8vM1!TNx{k9X9v$y9s{??R><Gp>jq)
zbh1ylu((dNTgm^8iQ}2`{6Ke**CLw94ako5#%56)3I1@_;V;mptoeu|zCvU!J?Z3P
zg;-~-DEoqi_fD>gflh+8SX52y3l$I5v7lOWn2?n1vjf)YR|)1k46x-}nxPVossMlQ
z%1@}K=-z!?#lm?N)5k07%1+PM=QK*KM^pZsis7QJ9G`{ETL>3AKfDr}mC3)SvI>hG
zcXwjd&vyVbL`OY^47a%)8oU@H*|CwM+g;>+%;2^-C^*0N-`?<JW(#*2#&m(-0gnkW
zSbcg2WpSs>dHkM|kE3c0+R(La9x>bs(HY)TT}L+2J_Ov<1e>wVChcrinfKZ<eE}?Y
zi*7P<HjuG}G4t<Gk;jsW-E4jsBQajoeUmavOIURngwk<Vb`l(;Ihjl)tO6)0XT8l}
zt%_tXvJqShbv<5koHkXLiHS)!v?Lv6IW8UF=AtFa#$ugOyyrJgn{|Akxz~K&2OFU>
zGgFbcKKdx(Z&da*FhTY4xa|ojeL)88ue}$ou&3Fr7cyWDi(G6A*5!yg*t{OSsmQR?
zI66WlSr@xwichl@uPjdLl`89V3C*zaTFa$og?OyyF3r4V5Dbaw>n$vYS^W*Le#Cbf
zurI-Oa87VQ1w2<v9hQ>(t`(Nd&^)ECd&A*cE;Q=d;)=pg4u@*vcIE@yq=iO@Mjaj2
z&kx8No6EMlCvLBAXi3m)y4o7xyE_racMv%=yk-$96}nyS7L#<rdx<KCoIJQ|g%g1G
zrg^z>^Kq3SzrFe<BE!dAa5Q^De6nnMY&`vh<Z3K=g}LQx;%GB=ylgRV-@1-98qK$B
zw)$DaP9~G@gXQ30Q3YgugTQ#uxx^BNmL_*6+Vou&xr~)({aa0#s%L|L6$3wz;(#EC
zIQ+czT|mq#8p)fum@9{=#7JNWN}2v#PQ&H9N1NDB+g6Om!HToJ#$k>NuieLKKPFpl
z7TWSxtfas%YfgsR%2s;8zqwx6R~h@v#3t%g5^tC1S15q`*uC`tGOAEJNuOC_<mQbq
z1%1VJ{7av<KPkXpFz4CqL<Pj}_wZ~t9^c@;ipfVb3(8VY?^@!;BxvfLhRP?7V(;)-
zmoLRZPKU;{zbUrw=`p0cp6F^f**_q=f1_r;rO<GXNlSi^CR$`qfqnE-9rdUu&L7Aj
z7<e0f(7j&MPg71>>$^<Sg<4%Cl=~C<17o>Pp+ygEAx_L4A;P62?E)ZfmmZR0OjI{V
zTx!0}i{&E<Y1$texD2i2UYdQp!L6jiJl3FHu8b}p-(+8y&MG17@0!%u%oko<?OzGd
zvMON}U%nk3EVJ@?K}k@;vt?@iZ-64%a*%vcLtj~lS>qn5BU7jibGQHX+;z|ReILV`
z_W43V@6GzvyBaEjQ9$pcmTx%rX(9R1P?T}?_9nvEZ6jl2kN0#Dhw;tXyq>_4zvUy6
z%0?R@1Ci21kW{sCVy@HU(PWWk&DfZYgGKREd_R^RCCIkGH{{ymsS?zCrVzCstQA^0
zN!G$(xYFU+sKqQg;W8OqiA4T7ArfER<%An)k4s`7F@<+>XCqdT@#e0LT68Y^#-Ct<
z{%+9CviJKVOa9`x+gKKdnkn|Shb{WoXV?7iv@jI5T81W=pE@k>UdkQ=^>hBZk|*%3
zst~#>X4lN~*iQ3Bxgg%sEI*ql?b#))_*ajvD^!lB(_k);JJIjT-R#+|YY0T?wW4t1
zdDqJr50&s-x=I3C&A21Ew?PaV`JHew7c(maB9B@|oK}>i{#Wmts~qtC1#O2lStqs)
zyLjz5{RrN_0sM@K+7tQt8^DykIiwIKe8HyGor`DD0ocV}g8`;PZ~kiXW34kG6!61u
z;74>G2XT1n<<w#G=Re=q2nu4u<QDSq5^#{2d~bq>KV{oF)GvM%^z&+<YRCMMd^lJ?
zctN8-z{3;$We&B;k2|d+(tex~hYq3tY^Tl>AAaTCp^B0ps1iUIP;~w=`wwgU^+f(L
z`(Lf`7yJCb_x8>j^7emy7QkQM^q(X6t2O?=9>HJV=pVEH)f)ep{U_G=3#0o_dHqR4
z^G|vGSugcZlKDwk{l71-e|@8W%>Gwv{FB>%VvT=t`{AwMKe_#%-2M+y@G~FyhoAXL
zYW;_w`AKU1hZg%;6LHhG@99w@8QI%M`m;5)w||3tbRiB%<K-8th+B9bVFB0opFnF$
zhmZli`uTeq-`ZfdW^DaHwd|O7vJG<hz~M}c0Fg=-(7}V9Pix<wEM>XkeO)+5=P*EE
z2Nrn$g1G*IG5$*lQuTo-KUeH`IC19uS#2B;Ro6;F)@QVUehq6F9pKgm^kgFcYtaeF
zQ4Srg0wp(@b|=3z&<_Nib8CYmcLKB^Y4pn*K>dPAKWVpD0Db}ce(O7~22rW_sKc(5
zI0zt4g-f2#qmKB7k)p~u)zAV_W^5sr2Z~5WpfO(0g_gV2DnMvr)4Im~)+=%lU(s|v
z`iL(#Sr*|@a)QgsYbXw7IsLsB>Cux66i@vv|JN@SM!E^MaQ4=b9{r!F4#e3{F9312
zD_G4Klm8p{DX9wx>k#^USs)-eSCkUVnBej|J48yx(o~&9v$D#n%cbtud*TO0LQ)Y-
zYx=_o#JPjMZ%Y`X?J66@X8<gG*_Z(Nzl;GpS9}aIt9(H4Cz<o#4$~`63M&)9IEIu<
zGI!HvyQH9_967T6PViZpz7RC?Hm$RpB`UX9nULQc*bjBNBfiZ1KtCa=+qDN|Dfbii
zzo#8dr*}~4W&^7p$%DJ~v<o2z>Xil&y2@~j%WsEyadtWd7#3x1&^gE)+hFr$PuGMk
zQ3L(b9uU7fL#i<>onKCd9xh)$c4F=!aiVu{UFY>irr3qo*Hl~%&QB$)u?urrl8v_g
zqb^ZibeP31T{QsnnPjCN9|~}bBSf{g1=au+u%#%b2IM)?XORBX=r3JXh=sKS(bS<Y
z_}`)lDA_nimqVMnI$M1lI7bI?P-!2q{M4GJTLUCe9t6F%hAwZg`umgo_V0`Lou-dz
z_bj@%={rZ<seFG7XyxRtA^2%?k}PR3KzT724xuUZhR{0f??B573lqQ%tO7~#-%2b_
zV4>@c?E^d)F;@XI(p37`a*oCVVblZ_y&0R&ei=~xW<qO%c&b<b#(er%GI6odsi7Zu
zt89K1kipipfS(7!(3)^2q`!Civ%fKG>a_r;;&P$<=-(S-WtUPl58?!?w*TJ4Zy1>a
zHFUquN{G_o=@A51l`V1RUbug@<j7$z{sZ;7L=o!=$70dpM@u~8xeemo;sxJQ{X3F>
z`OE+P<zMwItR!XlFGy;{aAxa1(CVv+(dO)IKx;lJn6v$#_P#W#$*T*u#aeCA(tNZI
zRl!095phUR6qMA8Qft&I0y0J|gn|r3WJ(ArwJH>aS|(qaY^jkM1R*jcK}C=;B7^{e
zFcqVT5Fms|LXx`!Eso!}?%%s^UCXty0wnL*XHRGE{X8ee2j-NW-1H`M{=@Cw3Vu;L
z6<2MNinA*CraAyd3X9>trR~WIZXRqW8-8Zy$FkZXd4ePFgy_H}UlC**gt~Ak)Y32D
ziz4U_efp_5vWgrI@$~uw?~2B>p!C!UmJxS@uclgz9m(&F0J5M2K53~15-HD7<ISU<
zPw8pjEW8}$;~U3N9T4y@>|5oLe8*?z(Mbta^vh9Cj^-)F&(A+-qllD%w)79tx5@8l
z-kK%v{WCoLN}GRa9EruveR;69ut|fK-#kHtDSkR{H|59`nfUjwuDgox_JkVjv!!+f
zBHd6I@DbZIbJ~Suf;(Y)(EF)7k=|>GZ51|UceT~ZhX_;-_n7<i-~mB-4OSmB<JPMR
zt4fS9bY6EhTY4Ys>wcae`&p-rsCa~*!5EnF{Kq{OPKy!{S5e<?w1iA*OXq53ITlFo
z3?XBzkRwv&KE?E=tig*Lz16cV3!gP<=sF-zB|7#cSt7m|9>%472Zqu8r-%|Kn>ul%
zb~&CXy0V;9>t6oN3q`cp+sAXIa!B<zk$&uR038Z6sP-lei&STQn1!C%8d)O(TtV=Y
z&k<RO+%}IR6fyDdlze}J5W-^{V{BGT<M|$fVL4?N84%+I((|xT(xeg6AG*;g5kwNl
z$j@RuSTyr>o7NXHlAcOpEVg38db#ZyxjTm~h2fUdJHMM5;0j8s>fOkiAYyXtdTDai
zYSLs8u7H#4SU^)v<97N5v59n6e&a<(%)VZgfkb@o&lrqj`)4zs{7^{tbSFF)waO`n
z$lt&(0(@kGsE`L!$SU#p5R$dWFi@4;#?p#2Z*D6h@fX28%zRFd?VP=P->3p!8wvj5
zr0Pp4tpSq}Ol*Z^n;lynIQ?7WlVR81#<p6a>i*akW6V9Sf&)L}P#)Q-rhmy^%o4?R
zmzrayt8*ZmqVuTh6rB>jZYf$aC5h!YAX)sy8(`PpkNumBpma~48H=^3moJl6_-htU
zw#jqlQ>9^M$c!^-UGq>S7WAc`)s^1{vGWCuqb<x*^H$+-1)j?2WoABD1mA}fD9y6c
zbqxYj80oXaM=+S89yJy<z-VUqmokeSc#_tbrZ%o5{pan1V=s&$UdXf5Az92qGiM`2
zp^1h<b&smjDp0_vKt(JKAD^XK-JzQISC=c*S4c!${SLAPQ_S|pHt}8)1waqzE~+p;
z*VN@afwY;`CYJL7vd}m>t^8VrYC214>7k(QDXyx3M;tabfA;y{JAgDJOB9`<jO|h{
z`_`+=HW>|Wz(~jys_Lh<pjNsH<`(s=(@(hzNJ#ks*+0v5d^9^|x6EYIbA!lqV{`^z
z%Bl1Vr83f`D$7b0cD<A@!Om^=!Cn@>eLex^+xhk{H2yuE1J?kQ77D}TpV>yQO(f{}
zG+5_bPyxQABip=lCeA9;!LLxW*@FcZhH7M1YENT4->HPyj>V^`foUvP0F4WT=GpK^
z3z7%`n`gtwbl0c4P4RsEb)EEyK(=-1Z1zlDayti5nTk^|Qmv}+LfP~z6`JUj-MZ1^
z(ZkZ6lCWW}5(evyM|mKD?Um#1NTf|Ppx3>g{;oQ_4oH1YCQvMy+qF06d_!l?*>ctz
zlbMrAM2m*JiPa|b*h>&hDtrkct}2ma0q2E=?6Ou^e|S+$mpw3<c#$bQJEx)eD&vA{
z-qM@pUMR}L`iolQ$O?GE@Q5}WK+H_<R|69_raHsOb4$7=0M+6F)K!^Wz*6J<0C7G+
zLcIz(uLa1tU~C`Wp6oW8k{tJnz%y5CkI}Oer1xylC`!m-f?(LXsiSpd#>GAQ$HjF|
z;t|<{joUCbQVzt842T&ZCbKgb6l9v?ES8q4Z2H<>gY#W;i(js>Wi}X<>Ww~fM~L@t
z03I}~1i)tU{Zea#fsLxot-2;6VJ(2*+a9zHDz5MA`P*l-O2uV?kp4LZzl3xFW}Rt!
zGaKcu(H?}zVUZ6SoIIF7oX!sf<oH|A<gX*pFUd3o7Uc+0xX;C&Y&Exfrk96azB5Fe
z-r!#89bEOZbv7c8A8OKw>48|?>2HK;?^D(MamPxe=m%K^B3l##D5<Lew)p63Le#cF
zpx-I8jXri7XK-@NA7}}3c&++L26KK)|5M4#c2;F*Rt{C-9JU2|8=TzF)=L}v3s>C$
z>>krKWk_W0`g*Hw;(mjZ1h!r{|8C>xO#|{jp0F_Ua{4D%6mhE>Udl6`9gDC4CX6J;
zHtn0gdr5NM?Je7Z32P*^!{+Y(sG&{apY*<F;Rry8X^b1mYE#(#84~U%U>%-+Oo4qk
zDczJM6zKjv@fTtmp^W=%<8wgwiCMgatn{A`7j(i+>Jsq#44>Wwgc%V`ESykXNLOMT
zCu7HD8~{X#hj9wNXb=)pz6+$e_P&6kIqhsPP+kD1Kp!_5>C%-ph2Ie@7AM$Na&7kX
zyQ!qgA96%yIFPV>gI%NDLK(bk3;sbW?q}9ntv^LL&sV#Vo8UDkL?7JyV{iD8D(%AX
zBlplR(&SZNJf!faqd|6IibNb;jGa>8s$0PtDw%C6Y^YoTE<;8i!%6`=&7VP#FNkf$
zV>Y4(X$pNz_tqc)QTB~dh2;W$N0>E(RQsNW)?R}Fdwg8fZ4nPwt%k(;`4Z04xb2tk
z$XzM&9)+bvh|36HQII?1Ct-R&xB}_dpVTZkXpBbX0K3jPTOd`Lq=Eeu@366aTI7Y&
z0$lWBKH<F70+N<Kf0;*721T8@$oQgJE~AV3qro7;>YVU?Th%@`<Sj#SCcs|vQI;c}
z&%py|^SFhY*aK8y#SY#X#Z>5#+NTz>?sD~<cg0I|Y=tfwcp=zOfFpHUlPAPT@|D*T
zqC!p?s}DCO04cZT^nwlyZcszYJy>c;IWUR=zSB_`8RM3NH&RaTrRt=&fj7K0>kZG(
z=LNx823r95BLj1Q!O@c3&h!054`$4wmnT}tof$if`J{t(c{y+JcYp?3-r8ZqxSDx+
zd74xPCn6f}w=5362SnotCnC$a%*|`p-==q~%B%tzXaD2}ghIf(qn$Xd8t&1&<z^dd
zI9d>KSZCehc^zM^s9$h1>Y{1g7n+x9ylOlix@+$__x<hE%`pZC|2)t(RDZE&n^$9|
zr?W$if6YGpMRUfAGauinrN58)E#czU4>f+t(cH6U2X}#K#^tNBk5+DNdpf}5y12No
zC0tcWE_pI^SZopO5V#S~sk3tS2guKFfPTC|bJNSG8)=XIRDwiuOMz&}8GzB0tv1aT
z0x9z{Ieb|Q<<Q#Sh!)!4{#b(f^t%^!nYi|cr(p&I`<Q7@6`EY|PY!^1VgO((aWQgE
zD}3U2q^Kv)*0>>hxQDo#>P>zUffPAY0@(Mm?P~mn-Rj4`qC4rgdSBQ}y>-~detU+_
zjMlH!aN=X=(m~a4MuUearws;FjQ~R3ob1U^x{;{Qg73HEI!B_#NKrVRoGuA>9l1I4
zch7c4BwY3^wMmbA`spzmx=EtY_psd9iv7&r-O#4STB0<|W@%-ZA40XML)*I=XXgw2
z7%~TTeUNVVZ&4vfjcsa#t_mr?(*%jcd$|irP$B@3CR!{0`NPX?T%_t<ixs9Xo%N~)
z-m^4ULvZZ@?Fmj<2S6EOrIW$H9t?7S6^3SBo?yfk-PYH>nek2aF57z9E_yP}qrPjG
zYFzpRct&n}p=i@<Z6=*Eg98j(yR%hc8I1hL7cZ(0&sC<HVHI&v&-bu@skUxrS+y0o
zH_ek;hK3vb5x(<8puxZjd_z20S|W82@rQVV83pBOXEN{n5&lKXD%<F7A)$mZv13cU
zDUF#IXEc}*u3FK_zkpKgy!Q3XZ@&{ktpcc3{qtY4C5(4-5{w4V5`od9wF}{yIkO0Q
z`DI&MyHB}^iuGj{s!Z$+(><5|{L!y!JYY7=sU7|CrSA#u=3wxNM(6A$7(R2fv+<E;
zfgo!ox4v!vkR$+hX-B3##a&FxuZzzMAIt&;kjA)N&Pvi-LP{gXvdBg@8~3a%f!ap4
z<8oa|U;;)sn?Xd%T5iWWBT>f$r<nbpYt_XMuB7j1W!hFVCM|`E%Fx)EV|x`(`}y_k
z1Tj9`fx3QUTMmUX(mP_-g`my<QQh8ozx+9?SKoiQ<4h>^VsE%LRjQSl9Y1!C7`%si
zYhJcpqLBB1=V3h&tUOFPn6-*N`N5f&nfaEOjHdb^zS64fz=~i><w%gdT^^TCbRx7|
zSaG}Gr3Xio-?l|gr?qkj?~eqDe!=$sY9wciE4`c^^O{D6!+r>$mMIdrXsK`elV?u|
z<vqKTJnEGj)9OsO7UX-ybJ!^nvsmXsp++#a{#s=(-Tym6|KRW`QE`ScrKqsbWJcCk
z=kv-xx8bGh9xd^es+!%GBu+^&NRg_i_jXISUz>=MA)DTmErxBHsl7z#jWU{$@o;C=
zPWjz3T&M51ek8NjVI4Z!CPsX>1bOV5K~$|^^Di7uy<z0}7em9pcmB&vZ#u{N+Dedd
zO=m@P-}y_h{-4SkAK@u&O{QFuna5Q=srEcBK(5-E<hdl0E%qJpmp))sz^{$?X&CXH
zY~dM6T(r9=JqXh?Qru3cpDd}xHoMnPD(GPBs!)@eb52ALs)MbOOO}w^<x?ZmrkzRZ
zYSj5P->NLh9pktu#Vw^)VUk~iDQnU`89pf7)>xkEQu26G@*tC^+AEnlag?8>sQiz*
zwh)o}LHBKw)?uE)?c6wH`r!rl@*Y-`d%09OuUEDV+mZM*D7~dQULj!G;7yaVXd4RD
zCNV33uLV8!P~stPnRrp$T=a3KYRFp=%`L<;(Mg_ZmCEy3hPlnAl!&nBzrdzv+EL3G
z%5x>k$N7(!k6_1@O~_;2YtjNon*6yf2;$mVOp{nq*NMn=13dPFVW&3TSvKS1j%8Qn
zFFDTUcXD_h^mI;eyRrK1i)X7AZ1pJbSzw<xCHGR00$i@JRV1=PD@mYQZSmyl-hW&X
zYqkC0j2B)#-*QXR(;!j!nd4%`^aFi#&q!1b=4`KY>AKAPk{3R2m@iR6Ic7w3;zjka
z>fn4OA%HriiC5`*2M12+K1N&Y4BT>zGT64CfbQ{Ce0^>g|CBvw`bj6ksQ-ezGPez_
z!;`8DkCkF}tWK_O=rIa=?AqXDY^<zY#_~hd8@F)%o@=%x<DtjDV<(>exq{X+v%rlC
zk18GY0~hk&Q+mRu<mWqzb(gW39Gzlx>#*_;j>@Ie>j(H-z-kXez+s0j+32%heK@+Z
zHzJP<%ZfV)L<9GP(lI4RR`-!^XNp64$D9H<lut0|wMDY=5#_rQ=8O2W5cg9R3r-1_
zrf}7VvaMEf>aA9`)=z9*c0_&x3PtYlAKZR(@V9P3%uBNJAWBWltZNIxAIFm*wlL&a
z?b)~PX=hUKJo9wmUVt>1k+7T>`eHjqM}F3yaleeU-!jwjThgOC4zD?BYpgvsYDWmM
zx6J-$Rp)~j7>K`y*?BPo*)pRE|Fb5h4pZN)F6>>_c;}Ouw0cIw|Ff{V=`!rI!FVZR
z`hWNFgR7wmxN_vD?f>&r=N$gOA3rCGH(&fKoHu1M_s+k<nG?mEFa8zIn=+Yu=U?H>
ziQ>%{{|e_#nasWO|C?}rsM?0RyYkIU!QfC|GE|Pw6(eX>QWKD`o`OT)Ns=fXCN2UD
zZ6=90NL4H_2-<RlzQP=dJOl^s4RolI4z5Ght_NbD`{!}h{dtX#3Iz0Ggy#>PE$uE~
zzoTJ(A3hOL0nIHSH#8*iYj8)M_>+p^&!Mww?14sc&Y)50SDUOdcIE+TRaUBJuIohA
zF!aNOmypN4taPA_Do^mt*JVGD6QI`{h(^@{TNBIE<7C}%7nBnkys2gK0KmpSUDqU$
ze4XQky1{Z294>e%!~1ykCKHW@a_KG4vf~_p55Jc6Y+4!7R1B<M_#~s+%9C&|AIrln
z$7f1q5+qVZ^rd<$Mn6R&S(G)c#_PS@>t+7KJwO6xm<lAxVW+6ux%1R?hb#<AEBjHm
z@9&p0bm8Iq7bB4v0<flar8s==Crnybs1F6Sosdkj+i}&kG`FkPIhwa7@rj=K1=caZ
zYkL4{Wqsbjjw(b;?1m)gPGL~GolxA^KTQ69tM*uh5Zy$lc{f|-rq-EqV!i(Auy)sD
zaO<opkNX%5eCLFYPQOnBk#9LvXB7s$@J^MjhliePsgd?gq60NoBq{+;cKZ@#N#cv6
zi~4AeUpl(@m|V6hi2RDYz&au}FI{#tB(Sj_x|AD$=P5s7xj4FvH36jz&n2#@x6HN2
zF2Wekue&i7t|5WxHiDRM`-X(D;ww#7Ddk&>Cq77d1XPE~Y$Wo632VJ_+tP4LZQC;Z
zlIc~>?qhO}Q~2|raP(Uu#)hLIlCEM5{i}xukVx5zXRhQR>g}+{up@r>RHH%SD-X6Q
z6;fBKm{H;NQdeX~nX6o9p4S_+^nO_*kMdV%{#P#cB}LM3Tj8hiK6TGz`kytRB$L#t
zR4%3hdo~HkH?L%MJ&FyM9|ERRq-lbI@-irXKIkoSgZm<IgfaPnu0+jGT^0OGNTiw1
zgZLb$mU^p{3L~2uzs|J7KiXQo)Rh1g>(P)OkPcN4Uw^Q7A`WX-`jV+rnRjaQfx6l}
z3F*F@(<QFuVkAeZ{h29;L*S!=toxoMAg6zhb^X-{YGLHlKv-7{C6}l?ujn4qG|ee<
zRFJm<-E=o+jETmm8_jcC9O3^T0C*k@zUTagRm%&%bP17624IH1ip2v<Sp`fq#d?Dd
z?_|qE-SoElG+OO&KVUErsSoNK3N%lD84~oGUDKm(m&AEC3ov3Z;C#@iXM<3slQNQx
zgu>zcXKwXDq;eW4G|Ms0NL+Ru>QB0A1F{50pywx$f$@u55T2xoKwRK>jqV3ELU9a;
zkaCq}RXGa1fz7dvRx6K|@bsq+)>~Bud`GokeRR$HrCf6;k90|M#7IO%PsPCFgVZt=
z#{$Yss{+yPis>YbbVLU~b()&1x+j6UUI%?CeYH&tv40IRzV>d>;|P*2l;TRa4>sZ)
z(%M|f%&xypq3kNN``4(Ddo{S_oY|`0oJLjX^`U;Q_=BMP=AP}*T(qu|cvpa{=0oW-
z#@l@fxx<AwdLBGZQ!((u+xv|6EBlLZRtni=Ehs%sFwykETNpLZv@<&0Q%4k&y@kU$
zhiw6;;Qv5Bu)p)Z7&$$fRKT(d8=uB47m3KemfDAzj53g3@g+o=r_TUFP-m%G0oaeY
z84ja4?cJhZ@_=)#=jzj*8$vW)<0cdjCIe46Fjca}@JgXiv!*r{yQ<0$Qtf}Wzh1`R
zjkG6Y+!Y^yYIXH9)0CAsQA>!Ze&WM|k*TTjAAqA(h9iuI`aN?a#>y&9ZAwXF4oIY!
zfN>8yLR6fy$KJNozL^VcHH8@xY2WB(FqGZ+>Ma8`E>&j7FUeP-Vf5=&b*uXM!^$8s
zFr}?1h;h6eb%p!ea+iL#3k<#0mE(#>1+gsXbdkZmT5zEyP_kXy{|C07FSx$t;Q9_H
zL@oulj!$ivXlJ?AH<8X5+q84I3ZjKgDc6Z!J}USb97!yYHHP_YXf<~3OfmXMvO{&!
z#$dqER?u>%fMahmFqtj+@a9%v*6vIM4_pd2zBIbk`wRRY5k=nAi?P|kCoOD7@Gc?t
z>8gE(9`&+A-gRxKE~p$SHr;-fTWn&O9oDGuFfhF6>^(>E6Ya^W3`gX2<-VunoVch=
z=2-TI%51)hYDW0Pm969Ysw2G(e(AjM=^IQy5m`$!H4AwDzSKJZ!Nmwqe*quUBq4eI
zbr<RlOCMu~*86o|i3;IRVW)!ga8@pxB-sem+c!4GE0j~`aRonqPo_#U8FmH(e}cP3
zHDKK3yA$Ezi4|w{i=|wYt*|G_1g`pFx#`(~1PfB?CQ^8*Mo?3V#nAm0CbQHvOuvp_
zqcRnzn4dz>rSzr%*J&4Sd)F!D+E(M0*+*)mXGB#G;^oo;s5v@lIbAjqQpysHgwms;
zJs$4$E!cp4RQtXlmKiZ&i&P}R5=ZZ=J`Yg&7@WugiKMOWBDEc-^xKjbr<ZTjW}med
zw`9Bq+FwS62F(dC^Cfdm>z@~2I-d!remN}wLpuu<e+tkf4pgM#miOAwnH6B)j7l$5
zLC>%1vhxivuS0N!ksUsT6+g6ZI92i+j929~_T_9VOZCiG`5U&b6zOjN>vJSc?tmyu
znua(`b12wb54*J1#gBOo)32pQbrg6tS)(Q^oO{8oP3=e2mz9j&4Wd$nV|b;k9cNUT
zt)ftss#MvB%CRE2Nub^1^<?79<J;7mJ@{npYZd@5nXJL`_8^|q!8ce{4lPECT70+|
zlt<d-PI2$|6A2CIp=n9Rq|Sek`!vH+*hxQIlRuoUUp(X}YB3}%Nt4t+Fy)?PUKTvp
zMxz>>kjMTqYk{>&66=@BnutBvds=bn#34l<u1p#00%0OhlmVF*>lC|QUoIGI;~wY|
zMnFklH8AV6?CkQKoA3-{ZU2@q>c!cT`9ER@8h#XXYAlG8Z*s1Fzi@{KBo#kF+h>=p
zp3?0*gOdj=Y=Y!+=mD;121%Zfh7&!gD<zpjNKs`6uceky-)=GTKz$uX{MbI>s^!Qz
zt}?IjE_ZR)K)*5iZ2Q()p+=c3xmaMr2y~&kj?3;Aqwk-1n6VHW=1=twu5o}4fG7Rz
zM_}CoH6tk(89(65R{%Yp1N)o)PKO&F#NHHk9j?nZ`PEU-i^-r@{uTO<cfS-5XV!S-
zhX|Taz15D}-;`Z(he0gA0IGEH*U}vtg}eb4+;8#xZWvW(6;B8TH57vFOj8Hlp6)tR
zn&qto&zM8?Uy~`56Jf#Lkh!dfsY-uH65N@!I>|;2XQ3lN=nt|nEFKLSEWa=2B9Zg5
z+rvKT4~DE*b~sFuArmNSB;`x4a$tbSRClqX$R2Axv84dG?onnvV0_>Yk7CzEPi;4{
ze)w8M+VkbRb~j&t6Vz;3M3uf1Flc($$7}O!OI&wVF9$Mw3y6ckq}sgZ_zbuaW4)p!
z!rldv*zMDi&`$=tmIQ71cB#g2m`}jO)hS%SGzXd<;a|F49d15I-Oj_F_AzY5ns0Bg
zE=AM*u8>Ys8!N&;IiYxRhuBmcZc+T)dRM#rfI#T9)m<;G(A*~fq_1Tw$x85+K)PS8
zOrMx=XG-jII;MWH=s@3n4CDr>b@BIj-<Cx7Iwd&}BaUu=q#)%SYK*@rg1Jk@IV1OI
zxinu04b#craSl>ZT8dEvQEUCT7OvaUNgW^8z4OlEb^EM#9S!aC2^g+uZipWiSfecS
zp&JEO0^K485!C9lX<|zf`YcCU(76$Py{BxaO`_Cp#)EZkltBwpp$qDXNlP+nkP{1(
zdl_2LonLvZ<TRUK3Bl;;TWsY1!fvVGd(oY|t#4ZTd*|1Ujx-^W-#H<tpG5O9D8Jq-
zMuQby$ytuo-`Ts@A67mofm1`XM}0_74s-d_P~g1{w=0R1p?p<iY=n5KAo|k~VAO_~
z$1;_6-P#NdWm{t*N9AddJ$^&3^cU{}we_{7daIgg?s@`0a{)Bu6AwZM8m@i#bZrx%
zK3r4sk!zZf6L}E4XFJ3bEww@%rO!0yifr?Cl#dCr_q5ITW~J6WQ|5ZHS`Wd6AkCgg
zWUz1JGv(L?^3U3iNF<GJXcMD^n3pp32sbsg*6GPicWrJPBkpKO6)W!d<-P3&fx-*#
zeRK&|@D6kd?2p-$oG!wW_8SZoO~DDut3dgFiWFM@eGr#Te1T_~bim-`5Tu$pF)%<m
zvKfhNpb@?~(O|d%es(?8LE!DJq6;x7)nq(w%u?_GSAD9v&Z_O)<{OX`ReJ6w6!zx0
zrW*-Gz(8zt3XQs83s)=Ey47d!Wt#6apn`Z7BON%0U<k93aqYm%JM<^c0h`V1hoIMb
zsL7Ni>t5<cJa^3YsxUb4uR&m~FHM1Y;rb!OWoK7fuPD|7z4Jy8HAN)L-jnN;{`%45
z%f1LTo87N5fp03qy!kTSJ9d8CzOQt}K9SJi+{>YFwkee*flzM(Rq00P>Rz8rYj2+T
zFgw3-;==&=4RzpAz-l?psYd4<S&qq3xBDQV!H<@8$X3ac7L44EHlDOtZ4;z(hT#rf
z>s?%_pM!{*>*YRtQmRU@g;R;&gNZP$x=%#OzRmGp(E`EH#HDV3Tvaqb#<Tw1kekJH
z@Cju@mm1rZW^^uG!s_W|x$DI^AVyrR@_fk(3`)Y<!>pYI5J3ktPfbe9o=l&u5e_n+
zgd)VO$~1pF>Uyd4{Z+wCI|z_P-S}+|?Tcs}EK@mjKP^wWJq^6sHgGnb@>Y<hk`Gzd
z>tQDKgY`mQYf$N<*{0Sjt&9jy)NKqJW#0t+ovv#J>}NbFF1ltkU%UkbliUo)&Tw0U
zfnRda*RPF?PvJzUSX;dqKX<)E8oI(VU4{ykWPIWG#I2sZs@-f?Pa9kcWVdDj1^c}_
z$Ew@*y8RQ{dYgQY0JfgM+AVQQ96Lu-UpMG$xD?MeI@4E9tt(Hw-#B^w?^ElooOV*1
zJ9|mMd)!wV9aW4ken_Wi(>A&!o_Y(#e4%N?JpESE&(0Rbx5z$pp+?{dZb1pAB0Oam
zslE5i8E0ci^FQhex37hhmtIV6QI0jzt>9skqdNRy;_EMMOdM$^;Wh5lL_KJ^7&(@{
z8J*Ql|BvTq8=i3G$shkYg*)p0qr<}I)j@<v!>-*I%JbkjaSsL!(mI<bbXvB@b?!D?
z)qf%w#{9r0w~uL=BSpE^uLR3k%!-JaqrUQD&glPsaeU*4m4n@I${|`ovYy@Ln$n?n
zh<pqV*=5lDPiuoODLLkTdBO6*CPohoI~5sVYP7e!+!%*>=Nw3vCf70J=7v^Zh_{0_
zUnk(C!GPHzZ~+8cLNDWSs*8(aZM8bt3+_QYeM6y*8W7fPEf9Vt4c!nJu5}4~)T&!5
zZ99+m^R~PyHhFnf(a268Jo8)5t80AIV*h?8;)LcivuE?nXS@HeXy@yqm=O*Y0(ASW
zZqK)rk=`P`AFCd!NvTrX-hJh;@TQpme&>pspt|++4Hv_=e)+5CE!eBwOX1*u=E#Cq
z$^rlNPmepQkyH(-L8<>$9IyQCA{SWe!+ooMeM2kWwg50@co>u5uD~~wfEWAyLc*KQ
zz*=`327dYaIr8n__OIaI*!_Q0_+P>QHx>F<<KJefGe-Wet9jvyUh~}lkGmQr_CD`x
W?_;cIpT7hD_U$=nmH*Z8pZ*UvmXHhp

literal 0
HcmV?d00001

diff --git a/docs/image/softmax.png b/docs/image/softmax.png
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc3c52367bacc4d4ccd50c27f24408d723b18e
GIT binary patch
literal 21003
zcmaf5XH-*L)24*pYovxAI#Pttdl!g^bU`{Oy-4r9_g(^miV7$lq;~`aM5U>;(0eC9
zAYXjnd+&Si{qwDL!a6IQb@uGpvuB=}{Y>Jay6Pl^^n@4~7$lk+D*6~0*kb7W7JOXv
z-+P;HG%+wR^_`TJp_<CdEKpB3dnac*3=Hmcn<r27HTk)Rttg*78UD=2L+I(N9~YOT
z|0Hy<yQljzOYf6mmW(`e^QAlFOMMuCyYw_cZG+wTFNAg?#(J}ch#(L36Enk4@mNyy
z1i5*wo;)GOsLLMJCRA`{u_x!jk|G@5$Ef|vx+}@@oHD8dvx<vjm~L1O%NESRTKhG9
z7TTr|)=0f+VjG)PVHmb+?fzW+2}U%g%BdIrtXq;bn8Ny!=*}H}IpMo^FWCDtUJ2iI
zR?kbfgqmdKmppi}gioAqfHTo<E6aq#j?K=F>-l-#_DNbxH2Fi!w{SPRC$rBvS{o{f
zw{ZkmYOD}Fkk~g_t@&#lg`imST!0?sBCzL#ZIPqS_LREl(X^xe)_1zDrk~ECp9Ox-
z@K^Aw$*79It`@7RX;yZ~m@E1|8Yup2LE_}w(asya5&2sr3WPyi?%9t+nYV{lGQ`eU
z(_TjhgBN{|kAWHMgaJU`VWL0u=nn=4Rw>q>-(ZWCV*h!M(enGBOB;H_7#Io|nktG0
zewcg3II~OxXM@`KnqeF*L+FoLEk4QgiXcx#3$>P088V&AF`O-&mIo(7+2gfi1TGPP
zhdLfZ5i3f;eGe7fC%b%iY@w~qsFUx&zvQBDE?rc#i*Ipke18FUbDs-~jphHlpk`R2
z-<u>G|NiV`6u^EkF9gSh_$*8=Oiejg38btkPEGerff%O2Tj5PXj`@$Pk~ZAc9m###
z701Qua*x|yInB}MJdOrhfFBq3kINtv!`FyQx#@0vG>k2@zLe_5eat%thT#)q`)^61
zqF7#KFC|K6*|_ds)2@w&iARyfIt!oBtUOZ``N!yJVJdmT6DJESTY{3|z%rCVd+gu_
z=YX%eWd1)!M+@^Ut9#dR&nu27`wcmet6pir&+Rkq<k95k$&)n8*}ISbu7&@o?jHC9
zo9XJxs<b0CVR}O$ACJXsvMB#WO2WTp7N_tX|1jjv!cWT)TD9=ktY%|Hi7g>g71a0E
z691`#2FdX3No=`~bF`vEP5!Ch{;5jzWHc0?m}Ol8R;<^xHzm^U15_0LacvhUNT2RQ
zhM(IlD9e0)>Et8J?W6mTCsax=3A_i$FI7a%#*$Rq9Zty1*s1?HjH5SJurS6gX|jNQ
zjapHvZ%+I_g^W{pOtnNcAs5w>w_IM78pi8XyzKlr0)fA2Ee;l$3By&;R&EE><Ta{u
z0d26$7AD93EoiAK{%)#A2)2i?D9JpvmW}j_$#m#z%4lEZ+o7~QWKcL_4GJ7~7FQvS
z;8HLg?<Gqc#4&7vnlLyuD_ANU=CIy~28U?;Ydt3W0lM9@&ccWEE2FO%cSg7`T*~&<
zRJVy0n1R|L#n%d{%F_!}abI&HEu!<rJ%I|LQ7PdIu~jk$d%`DV0)FK2>dIhcDpo#}
z_{ymAzsx9HoU42ELr(Z~Y)hCiVF3;AP)vE9&MizSs3&XOFCI8*_Dhv&+hb;}TB$Xe
z$j*?5(d^ga@+>onES5*tvp(K->hTc~b4scDv~$wl!>wnctS;89ur(C!r+=;B)B9l;
zMb0=L8S-?;nE{}UUhBJJL|Fs35xwghbp0DTCanx3vP;_w4=WbZlv-aurdxnp?_C`>
z*CeGr@9uG^86S#9+=<%N9=lCZIgu4KRB-y&=K0QgSK=o#nNFm${=TyAW>><hY>b#-
z#MJCE_iC^y61!rNtQdEX;4GGw$5A$FL$T8m2KOW2{<N=*8|(!fbY@Vgk|l~YvFBJ#
zNwl2&Fz#158;LU;dZJ_aZ<X;<Es>7i9TvLdtUu0_f?YPiLhqKc@vtK$rls<O=LOpW
z1+Tp{%h!=y24?dr)>RLOc!|yHUocVk-5*TR@aNDBkD^XwF2v&A8+X<lL8FZ>&JkX-
ze;M<Ati%^`zQ)M@;-i_eJ=T4Fjye6|!^G0Q3_`YG|KWq@<`M9hWxA8ECJUjnu@yE8
zEtbx#5xcOnhyallXa1G-%*0GNK_S{-zp@=<EqIN_{1!=p52goc1_=4|{xy~iR$=*7
zY?@BD0AsXRSABS&3VhAVP-v@!R#49GeNwix_=*c_Me=jIsw%QB#oGgRpRXLBqm_`!
z3@5(230mgfCjNp336Gzu`I@yHS4rf>|65@rRI|i_eT&d#4y1itZt2U$OXwum&Cl@o
z?`w9ZhU@QD8Ba1A6U|ND^`*7(F(~-EP#K)ww&d~Iy)O^@PG|D=?(ut%HGkCDPoC|{
zxPN*#1vRXj2MdcR>&$b9^pf-SkY^dGQcYLH4$Z_s&vY;DhGuO(+O<B6=g}#=QG7tY
zB9Q~?tojho*Y+ipXS7~xCT@=L$3HQTa|?gn$vOzLvB=<}`lr|6j>HJnUSl8Tx}f#+
zjm|_=O_R}#4dI#a&3upnxjwmBb=BkfL4s2y(|60sd)PzJiOi{y36<Wsw@Fo5tZiiQ
zPi3>Pho^HNfEQ@xS<69u<AD#xhU{{T9aw#BkT;^QUv4gPa@)XHTxo@3`Rr5Kzm9ap
zLx95eKiGTQH~1CH{x$dKSkmPQIv;}(%u{Xlz7jWEwhI$dl0n~I5c+Bd$zK{Y>Yqyy
z8W*Q?5}t_2%6;e_fo+bQ@VHI@kIy}}5)%ID{cKsfaYlcnp<g!~_^584+#nA(ANHeO
zs?zH6>tp#Y)uM|w>UxXe1`lCITpG~@^;MwQxwq*4)T}O<kpI^;`vGp=e;QT-`4Y)x
zNX9-qIN{rg2~wvZe9|TD{e<?L<K4qjQ=MIk2J0IgFVxs{jm%DU?u%UCHYs+BMY4c~
zv~$W{=6}0Z7I64Abb{+lj`3mtB61jRdyBDY?kKamBNB42fyH3rIjfYz4Rgke^3if)
zUr*!Mi@`$zyO&9+?>j+t&-pogJ+r>^VAf38{oQMv>4eFfN)l*&Ty-)iyB^zEEzX4s
zV9;E9=<JU$65cSp^(~}gLehwluGDDK@1!c(b{`#0eQX#Un4BWt_+;CE?eOmiZbyy~
z*q%>OEIE|^2E#bgua7KSK+mKg0>jNa<L}|-n?(2;<;S7dg-jdAEwjs?_mJ6vw}zjo
z<Lvz2%sKpf@(O^bx-9MSi_e{E-udf&E5-jt7hbV*cc-dwJLP{i(I6JCN~s1r56Oty
z)}jp`1efCIz`J7T!~c9l;e={4=Waikaw?nu-w?@1IZWd0hn>CL>cjSb&x#E?1K=w6
z4Pos=#QOZQK}X)FDhjzp-|-s?DE`_k;Uf6=<Guy+WxV2`aFoO{oox82U4!`-4iIC(
zlzT4O@Kh$I6-d9F>NlcNYK8Bp`1P((wiNH5!c6u8tfj@g9_0;P9bSl>@`^b`2agv2
zDgJ-$Re^|VmeGdnpzM;`Nz^Frlj-4!W?8ZjxnV%9L7D4F@*hu&afOq7*~286_1TX&
zz^`5i3pUDCRRC-!Vsz;ZDlWgUFDCT%;zZ<a2Z4++VV<miihzOjxbzw8E%pXoUka3c
zE@6}ZdVHw5gz(Y!eO~k5LCrJ`13iI=^;Hqc!-~<N@q;b9tA{>+g0pa}@MsD->^b>^
zLWZ=ls?C!VS7KilIWz1O<;S#W%tnS){DnmB?OZ32PB&sbrq0yJ0D^bisXMl<vW6B<
zrJE`bgr}`2!!AG-ZV2wS><_`I3vD1{U3jBd$`~DToZEK6eVQvDaKUD4(+cv?)%wp`
zrNwp|R-TgBF7gl`giq~b_$tg{mtL@(^t^t#2l!7UW)TfvdjRL%WFPJ{xYLPUOmG%u
z%QpEGu-+GTW4-=}apDIHxuK%`Ui}yWo^QTm2TMqez=2%9<eRkLR;$2_75QMxY`F8`
zom*_JdCc`2iFb<aMZ3YnGi_8uiT}l>!zy8IcTo|3k2_HM1aM2%(1&OwQ>_%1>nKY&
z=U_#+63pOu_^hqdi}-f_xh@w%g^l4ycHG1t@{xsiK;yx>yVB9iZIk|!Ws?VG4@-ZP
zk+E2&c<ghyk^C_WOu3n0{?#ty+ekP`sekFc(jAsj6r+dk@7WLnAfK8q<WLVTo_$dC
zRSLaH*%4!$SR3sA6q%g5Od+WE;P)yGYn1|d-Djc~@7=a>@{Hq5*g?yKn_rBggI0I|
zYq!9nGz&_#PbhaknzTvX54ZoBNpSvhbPx{P<QU+5U<rABD?}nSOBCU8xXTTcvInwT
z!&f=kcT8fG{_}RtC#2t`F{JQ5(yZuS{2~Vmy;j0g<Wrod!3CI{P)stkHND>|aZEAm
zS!n9-Nsf;4qS)UPB|37Ts3F%h%D@*R!tCq<MM}})o?1rnDOu8{@X@&sMO~~K>|DD&
zK-~IHTIupPh56N4>uyR`GN`~clV7r0Sb+Z*w8B4fPU6T0;?;@P6U#^0hdZ|WL)vMl
zM><71R4zklqlo1CN^)DP%sQq0$-~h;I@7--YWHURabhFy%H@zQf{qc-M7ojG&Fj}3
zWOHck53T(-`Ki>#`sq&qhBmj9xZe5Hd_2?Y>0drtx?($n3)1M;%)SqpFX-;s4C|Po
zD|z&iw3?{_{-f$q5QRjOdc>{s$}u&~|J=n#_U;{%KFzOIHM#H9kE@a5HRcPOBowH=
z6RC%?6qV$dx`*=7yB)VUD?h8NVm_0+1}RS)R`;P^xP<2aFjTZ0gtuUG{59=ig)oV-
z(YJbGGv7|O?8-dSZNzaYbp3J~qLdK6Q8TzsCS;Q7Klt^xGBpY^R7!IIDJnMgt+*5F
zNC98-Puw80d4?5NPPfvtL?Xr<m!M6&gLJs8RN3Q%D0u~QmHc0E!|%wiRp1x>os)Dj
z|Jf6Ncq3T~blsr0zNZ~M^l2bGc_dbOM!`X8M>+ehpndPTKcm^I!3)_-@^CQ#LfO?)
z@qA$w8{^~4H^#AcEs&4+b+yI_wkf|iW*HusHg<HKcs@G3Re#Gh9r>_;<yP`eR>)}j
z=j-1zuG?c{q$NZs(BHF9en6Qc>#L>o!=Iz5lcM1tnF;kwqjWwe00gaNVwHQ9q|hGc
zzY9km-o$k%BDDnOPdb@9Y!pEOcy@+F8=so*#VJH#fF2_yZ4cCAo~e!8Q=$~gCp!4t
zitp485J)CbW}OJnEgAVAm@jq3cV8L&MJ{h;RW$Q;rq}OsL;^b7S@ZQhPH<zo$`ja<
z?4_1|53a_Bo#*X|4^hfjL3>F1CCjZ`^2L`_y1zlgN<6HcGOCal)<=r>wi!X6on5Zn
z6X_e@+pDmsEqQQUUBF>B(e&JVVyD^Cy3LBkUZ>ZT-;QMb`2SjT^6-x=-8;)SDg!Gr
zd}c(_6v@VgwA;7&rfEhDngB$WaG;QBi?RMPttLvRJ4#yo=dbMl#bAX(suoJw51NJ1
z-vjtvoIjMa@5wXqRf<tmQiLC}(gaZ&zheOXa$Y2(-Cx{>iiP~8$t#va%8yo5)I<lB
z4`%GurEhCnrf;|IJ+=HCr=Ss*DiMNLcfr<I-=`!HfPCJ))<e5iYV}7LRN%te6UjDQ
zt^)W81j965n#x)w=_MC%PTdaw$1G_9eywv@ipm!_L)8HY=-tEb9zS_BzW#;#6ckE>
z-HXmXu=DZ*1c;9$bY3aI%RWJ?lWn$0&mwl4nfwO5D8CstVbQPCZB0IHEZqW$C)q5>
zpD1w1KAU<``9Z9I<_afn2I(Lyx7`F3lkZ>LULyG(H1+)%)S~=-CwVANNgIZn!hzyX
z97iJfM#Rl>MOGKSJmw17+vzDp>(aAEh#1wWRR0jx>pi^dP7#$?Zc0JHauLV#9~D(Q
z#mgfaOqo2LWtJzg13)X+d8wAV&w|J<B8R=$3~D^o?HM*IuveZbT(pU){tkBkxv;SO
zX~l}}xZoXuguqh5*VOSgr4LKZ%AV?Qw?3JrnP=Q4_)Hu<h(mJiPI~H`XA*9+-@A+9
zaXdm0*SvNUo|wrrVGw)HNvWnp7EPCKBgk=CAyTrZ8^Cla6+QU4RX!#@+NNpZ)5BVv
z3Eklhs^ZxiI-q8G8GG3SXhtnexwL1p06I*h`4z|(w7Un5Bbu@vyb8MUjm(LFY>_4t
zG2A|Z%d_YXzjLx}<2f~pqk16tt^LTWXUGmkb2aH;z`6jELjg!4dLMt(OM6bxKd}Me
zw*SF)MDD0MJc9zI97K;tyGC?|*WCePELimRT+5Z=9_>5A@g0G2VEHjQ^{Dq`i$hMd
zLBMiG`O)&yaA@lwj*jE%OB@{y)uBJoYFZ`+*x|KuQMbE3e4v46(m_#b)ScoB@NET2
zOnq(ILFB+3Q1`>$tkNs8SVS9twwNGkRkBvp&m#>RhjJ+AJW()pp(C00m-TRh;zhzO
znM|gpi+fWLf8wu_TenQqqJl~2f`p=#>=n^hzp;z;L1?2h8H5>cV4ieHf_W`~T1o`e
zOT=2YT%Ob%H6nb~Mnrh6D7d^4L529t;$}{5L2!A6+zleH5JssD3q5x?p6Z?1ozJ+(
zYxab*1<p2;DwJdy6C|jcI9D^=5a2+=m&ck4ww3b?*@C@)`B$x{6bsP8;!jc+x<o9=
z17B^FSVdi4^c=}e@}*_N!VnSA)+PYsPrQ6fBLA3Nhr--w2u~gpJw`DgI`fl6x|)2p
zNC{4Jhyh!Wl-<zy8~3*93lv8~m)w+#k_^R>4vVfmS!`8J^q44I386cDj0iYH%uKS8
zFG|a3%V8WJ(D6+~M>I=%EP;1VeHj{XzEQ30EYewnc0EN;&S<7Wa2`#ygtgGsiSw^w
zMN8WfDT*qNDi|r}$dh^qGW!`WOx^KZ{!TGs^R9G?A?*^*7DeP567tDka8qUYn9j4d
zPFBUH?I+0xWGp$zSVR5pjhD><CZ89fsQ$b)x5F8~jFjCkEoue;_$yoRhfc(yQWkuN
zwtgVpktp?o0w!<B?J(c<VB#V&x4Us-OZ)p@{uqsUE<+iPQMT$!koKZi&>*!*%-T%W
zcSG@Jl7rEGxoMa6X%6a=thyq)&V8E1;LPlTDDQ~(XQeu<C41s7DJ^GTVkKHd45xb=
z(}px1=;5}0*u&c~%S6K`dm`iYr)IQo>cx-OW0c^w2>&=a$g%~6NFiN%Gt1(1BE(*b
zF4e<7lN^M>Ne6_0lon=QLrPvl!>Kf{nt}vf^K)$ZSlElgx}wBVUQ<q!>Gn?G0*#ef
zJ5e^zg(<61<&S?(-8oNr#Gl^8aV*(tADrS3p$-&Cm$^BXUStM_1j#=pG`ad57pNO#
znbEvvHmvGs3h({2ZPN01e#9;ZZY^|vV|5c3y-}vyQn?)HEn~aptJa<&PCP!=96kD|
z4)c=ozws~JkiA><P*3?hHG5f+j)i}F+cRUJe$FJB3hu?ah&L#C^b1$Zrl>#m45Vs0
zRUw)@p>G$UHg5|$=Y3SGbjfH5Sy%VGT7NMa#6*8|6gJxIxx<>Ah$r32d{jXoUS~gh
z8E3*oG{*&0e@}Cau&_3T%&AQ2?OIgifry%~HQ($K?7ScpN)&|=&%OUuW<5=jAH)<S
z3))B_a1)=fUX`MF0h(T<;k8FHUWwCP5UGZ&OXHFHt*}(t3^gqgzeTPdqS)keZZzMF
zfDfhng{rC-oqG8mGz0rHxy;i%9=M(fDzr~Ekues868YCD{Z&3p%j~A8bU+!<$>F}t
zSE*~YVJWFTKl>`EU<@17>`zX+=s{4&<sMetJHIsHKCRa_#giOyp>Rswd!N&1G|l@M
zy8Mir@qoaDiDu-7-Cfu+q$$$dw?bg?9468r4OvM{q%;%r`0~LFH&mJZc6=BrZ|JSV
zajefGuQo^)N0U<FAxu5p2nVyRX>%6CkuT9z`?>G6O>`loKC$hMeLZGrLJl4lgF~vO
z^zf9}@Ad8rVl<=tUhb3oVvI0CkFxG!fllT?g?5g+<`t&$eePl-ub3nfUWp4f^)->+
ze{6>Hm2xvG!btJG!p}KpY7dDsoK5#XYNu%t7(goI@hQq2ucC5@O*8Yd<KeV}u%MD_
zM9_7TS8pS3*7p`af?(O~@0u~keQbM2yOOAM!yh5>xV^Q<evynM6RA>*w8&x@G}N3`
zewqthYvTNnI_0vqLAeg1Jh!Nh{FU(~R;Uv~Dwlw0p8{%8Y*h3QuJ(1ks@zGeO2Q5d
zt~sOQT(L<z_|fRMdzP@bX?)2~^kKF)x2^ko1F8Fd*O1_VN3AM#*F{rAOG`fC93*&n
z;8!p7o-=vTRnr~$4tkQtmWf-6an_3fxm&E8jd<Mu*7X$#u#m6!yCx-_w4~{}xGSH9
zOe+VNG9*M8PQ+fn_94$Q6YR0<Dm8yeYLi&I!MD~P1=3BB6&>t~uorqDl+yvG{5*4&
zj@ABi__*&-an-K)11Zmn8^`g~KKYAHZ)u9qzL2Agw=#p;xqkL?rxG|$<_Npm%m<Lu
zK6tVygq3J#!XeIg6zVH8GB|H3)id-g{rIOqBKo!F_fZ1Oi|em7arpAa4;Y`FQ1t(B
zIQ&dt1KPYh9OUlnKvQWti1W75Zkgo!)2is3<6zey2H~K#XiI65$G@>ah*IgS{6%(%
z{?&CTM*1p@V}Psn=Tzu1m=jJQqb)C%;RBu;PpEjUxdIThFG`aoALSNx8PK??O>cP0
zK@tZE#&YuKO6%f9tc)8ceVtEl$~(nuAUVB9CSrUMRac}z8V<P}woX(f485bNo$1X%
zcq5~Kn5q#dkzv8l^V~B`+Rq&D>RyTWhucH`+lnlj)@QXE6}x>J@bL#@&sErfTTN{}
zO{5VmI~jv5h;0h$Rb8XT&k(PkW?7X%r(~igLGpu}vZIRboXI=P2A>eE1l2})rSLj}
z!{AQV23|36)|m>y+lKg~el{WC8cOMp<s`&vesSM#J%OrHtci}$W2Kt_GuT}uy=ng^
z_h{(0)eo6*uQzmWKYwtFiw+SgOgC%Qn5r27@y@8ccrBS`lgn~h%M3GSmyC3%e64iq
z6{uAEkm#~gVDLhV+miQk1P~gS`!+9aLe+b=nTT?PR@(wI-@GWglZz!;TEawNT1b$)
zE-zT^(qz<=fEw(v4DlN)j<X)TP_}e@z`X--7ym}^W3{4=wuzR0E4!Sv<eNX4bysWN
z9M;<=4@?@*0^>ckHOoJMTtRlX*BxU-YBo{wXI5Qjt+{RP?)vLMr6P=%jbEX6opXmS
zUiqX!Q-V`;H0pKPsD7fjFM*JR<nXQUO*&r`s`k#j#CC;7A$QB<CJbZul9f$MH027H
z@==qEfQm*3do}o+|2}{8E$PH&?7;g}MA@Q#BE~dHvO|<j;yAoFf-|MHuSf2u`3v*B
z4!1Ik;g>(DaIK_o{T4I|%Y(zE65l7^1{rUPc1bkVSfP3k-c+@P-rId!@Mny;;&LbZ
z2-Nl4^}{+Q4fktm%4oV%6ii?uEywRGJSp3A@GP5Ap$F(|9keJQr$;kS_;~R7eZS}V
z4#V~I`N5bTD~uL`Dwdj5CtvgP`w&+Qd?4n`KHxiIt;6QWYN4ZL?R(-^A;QT?b@Y0K
ze)6=ILgnQi+$e#5ZHKy9eCj4~0puMgf7)cHAA598Cn*it&&*x18-_+jKgGa`T}n;q
zg?Q>_qL;yq&gL$EBryu*6x`zB%eFjD6kmw^!9Xz_)`N+qIYWz?Q3yD|KZ$CFG(F)x
z``&^^Ku(1n6`@zJg%kWGZ&^n;pzduGNs?Ii%xb+xz#EQpf6}=3;0gNJX)=PnUKPfT
z;Ob2w`*Q1eo~exIFPSwS%m5`R@kB49l#s$vwprh;-%|jOQ*FH)03#=&MQ5_@x)n_O
z2FFv4PfeT^KSz6Z%#m(R4v=@n4*QsG8dl~E79U<dj;#OK@`4R21PCZ)pPANn_(p~<
zxa4qs@T&dd<+(LM`6{_9Cg56efL6Qqxiouv=Dx9XpAC)3D-V78lN)C>d8Y(<kQ3jI
zoubn`!RS~tprn>;$Un&VgVIp#D7GZee!*8%bS3ay_F$i~?a-j#a(brlzT~S9mNIgL
zBQ_JI`kM!xGAW+lOt`1Sua83|gY4ea=~w(Zse@E1n(SAF7;S=mt1@&q4cho*Bv%(f
zG8B~39?-pz;Vyh?_HHWP+u*k?ew(tT7Yb$_c!u2PdKnJ2s7Jd$_*viMO-)4XN7R{H
z4_v!sjH_6rwtKpK2jKaSe*m`>l#B`U)A1a7b5u`h-YnfcFG#8KR>ld%W(yf&aWUnP
z)uFrwnJ)Z#iL>N5FiDqk**Rv-P<A**K;x>dA(SH{B$As*-6E7>m|Ss7psUVVVZ@tH
zv^Qx_KsCRa2cxs#@+a&sZn-GMgf0}T83>H_W-5B+uIXg|kzSR4B8Wmk`j$P3V%xa7
z2t@T6$yD;&%3#VeqZT86PU>R|sw>L1Nl8wPrjv(LO|t+$=fT@`9yu2M_}m1$jx3&)
zb+G2oR^w&x8~JOKg@LcVhmC&yi#O>FVwz@U<lxt3N3!_?;+ot@(y0@EeY{eODog)X
zPH!RMeAwuuoN8~Tm34HgO|H!NgJtpK7Whl|TxBzAUOj~t)>EzJ7J-N~qAvNaXNf75
zIuzVWVX$<5us~vqe$MYAZkFm&xmcs*<;u%DnWd<;NlPd=`KT&n4E4x~H<!EKAi}N5
z_euOmllluS$9G1C>XIW~5lkXg{nmG<vp=)D)Vk>6p3I(CQlA$OF|2yTNc}v*zd0gV
z6poZHY28?pD+4iG0@j50En;q4N-Mv+4veOsAB>2mCHb^GAsGtRQv+oSju)q@KkjO&
zgpXU8{jl}!t}mFMX41d@BP*Na2ecH{-}9EEE~LH;?UozY*7>MBkmPV3%mk9JG#<$E
z57j!Hs4sbWl~Wf_H!z;k@F<ahfG(raVF=*?y|?QPA^CAd=v2Sq%rE|BVkb3oYBpFX
z*M8T`nbOr}z4u;fN0`=nUvIiU<67lc0kqk=tB_inyc?r)D$J#7%09AqvB45Zvaiza
zW#+@>c%eNbbbj!e#WMD<BMut2ZUV81K`(1*=DK)A{#8;vVKe6Z+JO6T&Sz`$eeUEZ
z@*mBI(yvbGn(m3{*Gy>^s?XcW<39RW7SZ*ZeJ|b1;v`cWo_nF+8vKReWVugA^m$4y
z&()RO_=U5f897ZmSt2O&`-Ac0Hgux(TsPF@txsITXDW6VtHiP|9%)Bi#7CxkY1#B{
z;2EJEOEZgG!?#_Z(<X{%)gJ76ez*4{5ySmmRjDMxLS`@#cVN!RTY)@{UgzBq=6ALa
z_`ebv<y5K_Ustwv-iUC?GIbn4(rCjwMB9UguiRLH!QY!6ehF)IV7(k*&NXwj^n0V5
zv_&Av^cCbgTD3@f%lWzskG)e)g&zFf#95TO37u8hRGc)9Kb3wFaba_GRxdSTa=W5&
z9n@vNmQjBR>i{)OXuo^vGFsXr!D<sqxpF3^xw`-2?6XoQ%r2X=yLdIQ>`j`7YgF1|
z<^lTH{lBHGa4bv<HG6cbr8ZedesHgarU3t?@Gbk2iEc?*C{_X9vElZ|Q^_DJ;D(8T
z?#QUHV1wVefLO}Wi@UXBaJfp5nL?7lSGR)g8@6a>b<<t@-4C0T8nZ+Jgdb(bk{Ii&
z?|i*xxBJRbov2i+xi&189VwEy0%?M~V^8@Lm^l`WfpCMdi^**TsTqGPXU4fJ=6=Uj
z;|)q~P-okEoc<jDl+@dhLtbx6h%H-<U0{$?-#dWva-Zy|!{_WY>ok<Bv=Zg2arN`{
zMPBvlHg^b~Ez#y8>+OZJ;H%t)JE1c|-<V+KA+@~iVnSk`qfe2{^LVD%D>Y<Yk%$5h
zWD)gH(xN4HX(u^cECkOhb`vu$SDTCO2mJaOkE2suTl&K?&B@1;+;InI0m7ap?=7S<
z%RtDIBH&8k1(bWMQ6=X;$-ikT1}a>Z%)`ohg>Emqtf~o*H;;^{b(Y5-JI%>1*iWxk
zi|2U0eC!!P=jBA{RE{^U+{Buu$<;JYv?S}r?j3pYmzjY?J}_OA5Wli^x70mfT@!i!
z=l1;|@SIFy!S|RK5eK;gHktlc3-j3r;{b<1R@bv&zZkwu&XM@Ci#|jj0ci~Nh-a^e
zmr_3eCv93}^d2aGfM6tr8?me+@$PXXq1820ozy3jl1!uhs@h=Ps^-Y!JHXcUjY=6(
zt#2kwm#pRG+TRH3Tb~3<uEd61npk2S4<20Zl)9CDaFJfWKGfny-XBkBh%Eb*mYp0O
zbnu?&d*YdzI^>nHB?GIAQb(A9{O^>$lo{W>cj6NzkZ}~iRDeCy72`K8mncikBO{ox
zp2>BE4cwU9@^r>C*r)L5R@Gh5bn5?=R6*m=9}?#tyEOnJeMbqO8=S51HAt97=8w!i
z19}OwWDH+?0;ivfY1R(l`YQTK8W_R@#Tzs=?$B}3ruYg*eI@Ut$y&bVurDgIYuvA#
zl-A$s)S%A{G0^~}9{x~jFJc+YyWO8y$|BluAzHKi44p_A<gy}S_j*B7sXsh0=0fhv
z+X7u+{)n!AfXgxjeqPQS%&#Xvk~|Wr3rqSB!r!s%e>gnpW8xN+B8_si?NJrevkt~P
zx{u<42F49u^p%raE;3KLmAX#eC-2hRh{RQF7-02WBNa-9!P-h4SUnf;=K(K{Izpwt
zfkv+6aUSuyGA=%y<RwSV+)FsR2VW5$-Dr9+_$uW0iME0X0P^M1GQxFuIUm)c6p92J
zjv@5&9CZ2ujDYX5*begJ^qOiPC4BTxIE48_Vubw}?W(+ND;^70K+<{aX38<H$3nBT
zvw4${{)BdY9=G>Ye;HGh9FYbeArQvPluqxs5K=s4%wQ2q>h1o8XGNz2S4}<<UDlrh
zxezNGLSiAzuaukvRq{+AB_yoiH%1CFUY%s$NVV!h+1VM17TJq4cvYr+edQn}RZSq?
zsNkiF=7K?-w!KRCj1DKd<m&j3$Rc;*pv8V#kcQxBNa8$e`7ND{XYJtC?uLEM0z<U-
ziO#-5YXg`(wXtskL(jliHU@dSsyh`JeX2)68jUD99aNpXm-b$I3J5g^sIX{-ExtGR
z?gDs__TcO?k?ZvzhKehfuq<tCJ%|HSSj2EC<SIEOqfTIvgUYDrk{IR)NNy|L3PIun
z@-7erwoUd)rPP%G$t-(q8F74{x0xvD2;>x4nROINeGY%4pPjG@B0o|`fq`!xx=y#H
ziwkkjI3--osLikC>hqjXHN4ZJ5nOePzZ+ESSLFKw0_j=2iaxIc3~?b$#d|)v*0jTr
zCPyi8vw-PTe+IrZk=-y)WGp=R?zn0Ygg|BLG%&2n*?GvxD$AduPK35(e1yV`+-VU_
zpHjNgc*fWViSt>Eky18EZo{5#ibmeP$vN_YVbuzxgqm@`7f4AcGB@r*<<C5b4{&G`
zud2q;`gSx&2Hf^yTXCT#9kaZ1Tt6`*@d@I8?4!0aODl>TH)DOnHif6yDF~P0VMCua
zXmE9_Y9m@T1Cge6KVYl71+~Sj@<*irib{5@BEWzgH;gs7mhooQ1aNRLnI(5ly8Q)j
zf_T05`-3g*;fu&4>9YK?r?YJC^i4zXMZ@yi?hW?zCOwjosMysfi5hj=473^$Iqa=v
zh`;8yA_8==BsC5J&p(}2>B<t?H;4*mD1^o*?ivxUEvn=-Y^DgPoNFJ_X|J1XYnP(Y
zEi=`-u!iO6<5elQiZPOC<m5($h6xvEal=xLj;Zt=oa&BQHoj!o+=IA8$VuNJ$ccBI
z`zy9j%i*z$Y@CfQHKMp#_8X0s4MX{)!ElhUoMl0R<jCRus>Z%FhsR&%N4On*B=C7X
zj-L9<wPS7VMbs@Xn-KW4=9x(GGcvv>f7A99s`eXk$a5WfF_K36>F2|`zNDdknihe7
z=J}HLqb{qyl4(8jDqj+YA|5;4(&@(d=|_4=hQyBkHmlwEl1m98xXim&rxfNBP}jwP
z=tn1c4E_x;$-2YKq?;vxJDcs9z-rUBIl3rK1!-yuYSC?^_YOHoZumUYn0KT-c6GU7
zjT71KU@rR(<qhQ;T}H%p#nwUG?G6$VQsZman{qX~igN66Lvo-EJi7#@`IUa-%fkYV
z0bPx23~Nc<Ze{pJ6cN+G(8@t<w8vO9t9-2CV624d$WTU(^2%!*P_5j+Hpfd@!$X^!
z;cHXyrNdR#<elMp4MKT!x_w<|BA@VfkBlygejMm<TLGA7K{>Fq3%CTSo(tSlHgG)P
zJ(49C<d_>Yj`M$+loSNJgCZpa>IlnsC45^W0HezQ3ISN7rH%-?)uZLD#~eAkVMRl%
z2QcY*nYmgv4yy1N)(>|Awik!ZmLL=TT|(QJkL19ZCg78h43}Vf0(mDF0_ukPeM7Vr
zj-^j)j(e)VC~?d55ZNsHYMJx2=rqdZK))Z;(neRqIF?F2r0k_|r%+Xio10(3NV*6S
z>*woQzbQ73l@)c(8CB#SvYc`ebeNIbOlY^LFihZ{AuLp6tE)KAuSsKm^(F2{p7pY7
zO_eHMn^djp7O$ZEseinr$Fh^^h*vtA1gd~}eg6o?IMI@86RK*r=qAWJ)rp#F3N)D<
z<Y0tZPDZgzT=hQI+8_Ld>+$7W(AHm$G4T|=JB)oHXD-5>&thV3A6AC?Kpi@iK3?45
zOy!V{a58z0p|>KKlZJGPEbiMHXXfvN`cuHU!k0tQ;Y(F#Jo%291K)xJq9Dt7_&`16
zfeA$=7;B<j^xSHB+Ro4nNW-PP9tjc)!KkMP6Sfl%B8^)f5~}Z%Cl${%1rWav<CV2a
z@BxeG>xx<>U5*rp189(WbSUMB%<A;<PYk2&*dCPG(|&_e&ap#<zYA<Ome#i+6j|DV
ze&+q4Mc_iq2PSNjwyx8Bf+Kcw|98!9N+w)h6oJCFc9LZa3ruEGsB}O0Icf)uwqrgj
zlzE6ao}qaY4WUp!8^)9RVNa@30U{4eH<AiY6HD86LX*qj3_^R2#j#7#t{vM-%eR{$
z<MP18A?13UY%Kw?YD+5!PDl=M(4%8_AcW|LIlnSCoahV>&nJdvs>4l0!{+gWO@`r_
z;yAc2lpel1JhO)N{FNBZBQ=1^6Ws7o@)H>QAgmz0P)G#-JEH3oQdgx2C5mQGOaf?C
z4{jK)DHz>w8u>(cZy%bHtVHz4krQ`CSUU`)7mYxcgD{WfF3GZ~GEGV+XJgG+EK58D
zUC(2lo=td}SCF9;Oa+oMiMmQx5$ZT3zI{iJ(mdE;)|qpqSEqeeh-c_+{7iJ?^yhA<
zZZZNM1>UpeaZaal*R`&8BM#jlJaVR8{E-uyGm0kc1#%-}doTQ34p%npsD_`5&62`9
zSm%S&8b7B|*>)~E?Xg;5*<Z#A3-?&4e9X2%@HDtW&&e5osGN%S8}Y7QUtsKb8d;|+
zmEfGPcuLt4S>{O3i;NPtIucoyqxinQegt-?dlD*nZmFy}ctDk-&3E;f?~_UNlse!h
zKkLf@7WX1p-+_7x3Av=Jzj$1MNX>xs9Qn}6Bh)!F$k3%dUx~}hOmrE>mc9ER;?T>_
zR2(R(<*Sn9vlp1h+I8R%i&iK#Ag7mmAp#j217g#yONsPbKS?DQqHAR&9IaypJ8LVi
z%?G)=<E&oyBK$Tf<1*h7l?g+QV?es=U(~CUGO?Q&&h6Z*rNW-$Dq7+NFFhNI|KT$U
z7Ic$Y(N;3qV>7@XLbw}(Ymg)qp}Ss^O+viaGb_rBFrs2SkD0!!i3_7z9O=4H+dU6z
z!J@fw_@w3}9kivA(^rsmtR)#GZX5h$_~P(V@yZ6WkHI;ohGv(NbndgW=!&5TI*3ge
z9+EGTi<ps92wL;QmTfncHUXQaabx_j%*928Y%vj06!Af2+DTyX;>|+@<NnC?ZInl6
z&FJexBiHnJ_oW|lZRW9+6PehSfROs2LBzs&uywmc)}WzU*@yiPf0`p^$U^S~MXE&M
z1GZpWN$Rsj)ca*Wg{y5|9;{^LCx+fWLfk)D<Ad|~lN~}Og?|ygZEzuu6;hlwtUV;j
zlBRguA3^9UGKZ0sa#n4|CA0?Qdap?kY=weIuI1Coe;TxU3Xhmh&3qp$N!4M49V+v#
zlhmN*NOl)^uX94?>o^mpAheDiNO!FOXFU*2>WT;wl(HTS)zwq7Vv=S~Zf~`ri4?mE
zWv&aTJtSDYAl^Po%kB4|7<l~3SoT#q2;9l1kloa=JlQ;YL2+%6a^&SEhsy4u0D2e-
zp$~dQ%UVScqh?u&zdqbNI+5}^yHZ8IGTF^f9E!MlZ>E-T7a8Og^nGDX>x%#lMR*L`
z2fP4=5Q(6~27V(o_vTZUVc5?W>LD3AKC19nEZq|RV|nB8Yx>ZMhyWJbE894S{KasK
zsQoC2W)-?b*2vTUX{i@#6t}0xzeyK$p<EQz7|oQVrbj}5OO}#LGWFDB7AM)AvL3gA
zUQD`wCO*?rPzJ2%pG<XCUglF4k7it9LoEtPXi_6*XPQeXCikG6Ne(ZbheN)sVTC))
z!6Is+>+X+0k>F`&o*#<Hq|=B^SR4InkmxDPDM^+slF<dLnxqzn96IIo272T4hgvwf
zkQ3IBoke_9M8*!()`5H#kVCn)ztU^(H2}T=3vjIE{V2oR9tbIXzZ(*A$<#Zu=nPh=
zX^3QGUA&l4W<<Sb?PPYUmM7dNJc~hm^guFRKbj1L3hcUz_5L!b-yk<xARWO^Rd^gS
z*nTA1bMcgi{p*{_v6`iUI5Ll+ffqvT^{IT_2fM1JO;;t)A3rbv(8qM!ZElS#y1@pI
zMifn_mV1X=@J+j*OW0A@N=CpGn%~_G-NabA?1P$>GoM5~Y0%_Gd;;@00LuT9C!N9O
zJGzwC?n9Xn%QD4ZG>EkqY}GDF5uJKTWSS=VnbjZm7xkwf38RA(2>-s47?QvEO8NPh
zJ9@@C9%lz?MPQo$F!jhe$#vBbYod2e_2bc~iQ@z9DoWGK!H(3vp|m<eIFQzI*{_q(
zJr74tjv&ZuhNrqXDx%Xe>EgbLzh8x*rI#BcoEggb{6)LvfNH4;{^_7X?*vP#gn+QZ
ze$7>;8Zu-}lhT5T<u^qpoEy{PdSWyHnLDlyS-1UtE|_*dEZ6R8;`72M)j}z75%OfJ
zRm-@=k>i=g(~N0mWkuB0{re{Ai@^uDK9(upI!d82xdNlVk7l&Oh(b_wwnZl-7O#qI
zpRw66+MbmtwhIV`>TScO(8b32I0fm3A4Ds3sd+A8D%UF?4negMT4*s46qpM|t}wco
zqSND(X|2J>JPQYKNp~fE0%=CwcFVzB&8@se-*(vWOc#gUFM)`F)1y$uUp0`XU*+|Y
zGEC@{E=Z5%x#V>o0J4`JBDN~u{CcIuM#-NOO{r<X$1Jg9B-DvSkmXxaN+|wJKN=Jw
zM(M#kER=pDcW&KM?7AW4+mm{iBVL`{K!dxtX1tKzHmt=1IAo6#Tzl={*91vES|+`{
zx8}Q{q^P<-Xp-L;@yD=FD6y`KjE5!F9Xrwwxoz$E&3XANZP1FHW<T^Lp6jFz@-q%&
zYj@__+SP>W{06}+AE;L4E%lnnvjy+$P)mf!-8_pXOooANo$>k+_N`fa95X`i7QuFl
z2mlwF>@L9;2-jpa8(x5mQUft5@)W~h5-7<!1ed5*5NM7`5n<9?haw!=hN3G21Ni*m
zN1mQmmV|QcJ++(6JoT?ff0N5gA(;DjWJoV3W`z=soC?e>V%CO&t#H(vo-Y;dF{br|
zA_@-gdzdmrM}(Mz@n|NzE_VR_Qs~YYL<z<Q<*ipg<N&s8_qM^GwWI=n6k#)Nlk+=x
zodV`1Ztzb?4mT=uyq$2}5EAgll?97|9#uI-53~wi01y+tWO34>_0(D=@$MX_{>v?N
z*m6>5dah3n*lyK*&0SA{6nN!qPPH8SrBPiT=}m$+zwpi+DxQb}l{!0LD;8CLNeO2z
zVEzhI19!Lfmq8_lEe_N`4-0O+<hnhK`fyc>JTEd`T=A2`cJ;^cD7rvWl*fbq#PSZy
z<^#D8(3x#%JVDe<qt}#Ooe=IqJF-sO;Wp<tbDBjO^Y&}w#GO8>`ZmW(dI>!Eud?2b
z4S2rngoH<tcLtYpz{^s>n_Ws;Ue9*Oaf?Fzxx8u-oM~ffT+w`52H;0IX>njkbm{*l
z=17#G4k<k~?!INoI$mrsuHI4^ghRM<==bRNN%Ld`K)f%{#*#3Xt=N*WA(+*2J!9+5
z6+3#P#bSWSyC2hE-w#gqtGD2-EoT4^1xgw?Nn?>FbN2D)7W%0A3o2E~g*qf&T-(;J
zK@m8^LB_+6B?Jxk0;wOEP?5*U?tQg5L6cUDmd|#uVJ};5fOfrj<a<TXgB0G=2ZE}l
zHTV^>HbzFpHHr>?$=lS!P2=*4LHOS6{m+c?l$w?56~$b!-;*4{8h`5q6t4Qfg;>m}
zObXsUw368CAqqILArDF$ZjbJ1k4kgUyJUK(3HJp8b!AyBum-^_cC~g7{HQiv;uZrz
zivCG^WD&FxE#;rJ$2K2Mn*(Tv+)E<u1%#^zIFI@AUD{}o{N7P5P>1%Z!C%9wrEa{N
zetL+N4tve#f+DvV!L-y%d9Y~fREG`Utv*Hn-`lzYBRu%7T4>BF1tSdi7sgTI=rOZ;
zk){1_cz;ZQBoy?U>Xx%?_kL@l+wQJ^R?5HtB&dzY%dlgcbO-dJ^3Um-4oQ+6_r;kX
z0UH)cObO=cr2s1{VTre?tUJ%9(-Kiy^u8`O!Ffri`Ys;kwI3PF(RwIVCD={&xPI2t
zClues4$B=GM!Pp&b(Z#`li870S5ZCltV>P8_rWOD!ZS6JL<JvwbG+tdlxFA{Rm9oT
zc04<u&D2Q&^7k!=!9uiM^mcqsZoMY63pW~YQa2vBFNtq+-<1rr@!Ps%v{Nj|#b}=N
zjn)lE*vY<d|H)MLTL39QN{ktb%s+Qq?4juUp_U`}*xcMmbd6;W|IqWx3@d7m1b!_s
zT-sXbO@68rf8-uyjvOO+)O*#fIh-_9Xy-m&yq}jED~f<V_=}QANPdS9bgq{#P2W`&
zHqqo>8_O^>6L&C4TMR8^gR*1oykH77X$p=JSX~Xhq(6J9|JnV%ud;ZIaf?wj{SVP8
zQlY-jAD^t)9oL4q8kB|nMWT;W(86jLge79!+LUC~>B`dVB#*m`^xmG7<dd-E<BLg$
zm)&{dEaKu6ba&911mDW&)L*N2mig_ZpI|CtgVWZ;Mb_Z%f?N>WJN9g}gZnQRl^fP=
z>Mq<=dff4Sy06{QmCakIe<1ljz%;yj?9M878cZYhC{&yqe_^OMHaiNwe6&Jy=W8Lw
zV*9FW>X9M_TEvo}{Oy0VA}a7vY^L;tU<`15%V~1K#+H;5X|l3Qd9BwEHpvdCGtsYq
z21W$pZUy;;p_kxK=HY)TuK3{s=?;mx#t@pzj)ZUTFRGR|OF=U&BGT;+OR1gy^7=C(
z$OYyj&8{EW%MTwh6k_;7FVo_^N`I7Um)(>KLWLO;?svir`3yDu|2lVJn{)<1oQ^YZ
z@7~@${vokh+B({!y@AY|=CR>70(5R(9El)nw|UbJU;~y40n)T-jnv_;clOHDI>^u^
zuPx2QY}T4_=D6RH_m8X84*&2qSxF?apC{?$xLWO=AR)4YcVmqaAE@r8$XeL7e!B7Q
zG3aRYdB`3S&%!BSx*PuzUbP%J1`16G+F$V7iN$_Z{}WIt`ajAv`t+6U1Hale-6~jD
z?pEL!SNv{z_LqwTLijbg)oVtZWrQqM;z?bHXY$2To;;{C@zi2X%IGGB<t{Mb5_i=S
zRRNJtbx>eXidAmPTb=+Pvda26$*}(id;Gz>dP=#(KI?yJlHuWNjk9@Ef#p*&4u-i{
zZnF2iOFYe$K%F>ChML)nJ8H%e0hwdZ-8f|Mk3z~PSmk70Wem^<lwVjXg&G9DF(k~1
zp_xU^Mx|iZtK55_@Wi_nkAHpebDMrg4qPEpG(XG|v3UAi@r{yTm+T5M-1I4PrKlMF
zN}%pvWGEIa5>1w^DX{dzAZ_K8?Q31Ho^H`u#?k6UFz0P>19W__m@|mu%TB6j&8IJe
zxs{^VslPTW3Gff^ty$+Cpc#_Bv3@+25`tmPVSbkzzguTRE$v(Rz1hFruvEHrrl<xy
zN_zLZn)6>?co@DxAqH6gozgEV`jwXtx+48rwae)yo}+M9?4;mY-XUzqt;#3&`c(l!
zL*<aI)faKO$T4D{FbLfqp$FK3cGHAA*#j7xsd1IisAzerSxchpzEzkD{8W&r03v=(
z)qjEQyR#pzR2T)=R&yjBhM6N6Qc}aF&}_oI2Kb4ZDVP`hXG;qU)|u9m$KJaP!{f6t
z%fMl?S_|p-A@K^4luJr&mI<9gNg`-&T&oQ5sH*w(p(5RBJ+Zg6ZRee1N{zqGM@%N%
z>P|RpHR=e(25ZCp?WX4`j0to+atGy7vs+}6ay1INnx9HgH*)-M;P0H>qi_69T6Mgw
zm!rS$MYN+)8w9JJdXFj&rfTs5!BA-q*Cenobb=l5jrdev{cj6F^#C8sMKvWze*heZ
z0=7R~v0>?GgyqIE;mtYh%l3aYFUY|iz#SL5ImmvMG4hwz;}qB_W(kWM;|=(c_?PWH
z#^@Flm9W%<^&9Yxa**|`J``af!jbI~Gz)+^+QT|(<1YAr?2wV_x_8d0ErRN(e<SBV
ztaqK&oixUp>aFpxI^#*aIhu5Dh=mNM72H=>$Sj-P2y@a?j4aa%Vs2^iqo`}DQEQ*N
zs76~~S9HtYoJ>Im#)M34m%mjeKarmENcuBqweAyF3<z@p?p)xnSP=WKkRQXZ<zu`B
zBaZ1|e&?(zuth}^=1)m0Z}@j^M2UL%gnYiHIP<pi{-su^8z4tD7Qw(DGPMpb@+NuP
ztou%$4b2huHI7lNjV1==2eH^v1<ENzw(+Pd-->z)rARB8xP%&f7<yHIqw`Q|>Bd=I
z2`O7B)`UrTIdN!_uC<pwc$yno_fxX&h@JOuDi~z>AoOtytYC$7S)EwCqx?Wfl_kCV
z#qctzHsUsf$?1pkK*)(5&+4`6q!nb`8eICJ@tS@`COr+E=(FA`@a-Izl!&eWu<qzo
zd|3ivCJ|26Gs2&*{D6T$&hh)V02nZt9zpEqW&hsrMxWAswkZr2)m%wEr71kT_cQGK
zGTKu~sy)jMxV%7cS4<RU^p|iu_|)IOifs#M2Rig*?RfyB*Q#{{ho@%Qxov$G2zi~8
z;cudm_2lCujgck%6}1b%XfXiVX#ZSZM@Foo*~9ydI#JfkJ;|bsX9fU!@>ybGR3ZQQ
zLMPppiR&?pY^{3VfoJD-;JZKWQA6TuSMKYucZJU}v%OKoLg~_NO9!%1w$@FAY~q43
z2d4jR0Q&C$j$&Bt!U%?8J=dTj`KzH%srLc&(F&p7-_V_^2OH>4)wA_m!AhGmI?p5M
z7^(UzA-5y4v=Di3`u@kT+etQ@>exdWM|#_}ITUB<?^QC9jR{l?Msv;k;SVNz(<E*M
zvn>yAzB}*={%@}(t3A0RwF_3UDw3dZD8?73$3AP8#OIH<I=sm0y}0XZL^~?1B8d15
z1ab~I<Jp?ddG_&3mxBR&3o$kpN`rDm1V<uGdDV&iR+9=Exj9%7e_3&<2f^VdT{5Ks
zE-RqK{^7S1TDH3)%fVxsq0G{F*}GmICKPz1?5T-de1QZj+R8y%bTH?0E?fVXy+qI^
zrmSgZ#hyGpIlz;<-=wSAv;SsZYFv}p#T#^+v;D(U5Xp>qg$~MDI;Q%Iw`Be(jTy$`
zb*27m6L1e65Y%ooIKs@pGqf1IjMX>NuVis`d9CrwTC*K6ZYf23GTIQ!BFBhsulUP>
z=;%D*OA>Ta>i4#QzpwAmxuJEO?eXY27N-tG9Ut|ZxZv|1-nibn3avbOWK=WT!Oiqw
zpUr4GNR*Mt1;g&%^ifF6Qr^Y-A%WY^M{8`KjxVqE+y4F=431B+HmtqHF%zQoSmZ|^
zz9^v2!8MWqoU7?s9~iwl)11>@P!}Ozk>y^P@q(Z!t9=uFW#rdsgDp$5G%HGp%Og+=
z_)G83_p;dP$h|_I8sZ^JdLhZRKC-_xU!qHh=(gIDSR@Bek{2Z>Rd@iKJ4lMEQ_o-K
zl?Hf2eIZ~HlN@*qFPj|tu3<+0)+i9bR4=D@e{*%NhIljv%%R4b+$JH5MrwcgM=>1C
zlJCi~=hIxeMF=$Lw7@^S&2nQ642@aVSbcDzT+S*u>u;%JJPwjiL?)L8G#%dNYg0-E
za!$E8FCTp7HyyqvX4SL*jhg?2CQkUfiRrT(+{Bm9;35>2K&&oR_1^!h;>zQpUcWG7
z&z^mqv4p}1Q;9HxsHPBY%2p^#?pU&AY=fpU4IyI-W4rdU#K_v%BgPVuEQ79n?J~0c
zzV7cP_rLjTKJWLO^PKmb^PcC~aE!BCG`<`RuvtHtW(>;<j_8(Z{gf<CigG+hN?Y@m
z<BD&Q$h+LHW9}qyh@~{dH#XtyXSebM(Q7hJi$o-HoljF{QzB3_eD2~s{gO#*a+_};
z|0l|lCR!z)8<x@|u$2q~rA9AF7E%p1Iu&v~>zLMwNxfRoA+U>rs;|<dT07^}k~zG9
z{ua1h$Vs#XZGS6muaYAHm0~)s1#bjax+MP0EmB$v*-~$0l9<~QJw(}s_ws6R%9OMn
z)lX@%s%SGDq@S2!UO8?){A@^sn&n-kr;aFThSVELZf)c>aXaA_|5}`~873hY5{vL9
z)x#a5E{$80T$sb&eZ=V7+%Fd{=#f6ER4&78Hk-XHkm&GM`6PZr)zClnryD#bGT<~W
za%l=7(tMWt;~(s<Y@6*}YNX$lKd?N50_CUskMnhQ<}X1XEPmt68JDlUo4jGskuc~o
zcYXS)6tGL}-a2|_hVfnaO7`yZsmT?PYq!X|<G-;YP4Q;bphq*T#x!8vJCLP`ZKN$V
zeyaU8v^A2jA}?j(4r-U#5@{`+;5IA|Um1apayRQf{AX$FAfEyZGnxH&Ql8+87hKlc
z&1j0N<My=Ja(|K8a)%M~K8YcK5F}f!NZxumM5gh-Sw4j(Dr@#!axF(?eQ+)~z%3^t
zxX!4!d~QbQ>6+%2kL7K3!-3}<CuUYEUZ!7p{fNu3LKb$WPmHQ9_O~N?epM9FlD8uK
z-P&;iIlYjkyx&+J#Cn-(NB||!_lZB@^rCrTBA6HV+9afBM}4%HTjhA^Mr(d+3Xhwe
z^_>NZIC!RABlCJsKUQ65X36}*QskxRS1P~bM<J$OGpFlW;*}dBDc0FthIx|49h(ir
zk|nm~?r#HdAw9IgNukPicv%c&FXe7^p&%^D=qEXxB?y^&wArfK1cHC5MesbFK6pYj
zA@%;!WU;&@#O*9OjMvG=bgSuS!)|7s7pn_kA&&w6w7yTJ3}0=WWdXrO@xuq;bryW9
zNdNOe9|f2`ha<$;t>I6<L4y}kOXtUSjuze<c3+-&ko;iFA&afA7xScMN>2u@Yy#EB
zr5~e-serZW3d=k#_;G3n{Hy(?6U*nuG&YDzu0@fg=8XoOWwR4qV_M>K9_f!%;5EvQ
zni*fw*Ghx5H)CthvHvK_X<?D}wUMU+wbg;!q)p`+cY_0IZK4x*1}=^|4{|PE@Z+*`
z8T1XUxQ=2Mr4AqY$-Mf;DGsTyIlamN*)`4C9P91zJ`4y5$V)HS#N~*5KuiDrU%ZfX
z?2s!I<a*=4B3fPe*pk9CWBb%3uXG@GTsJM#$arlx=`+OIcwJ|5?i=v*2eAkX4#diV
z&R;w@)phx|8n;PDR?qB=VkUHN)=d3$NejwEW(5Y_m+TwlqGtI;=naqi3BMPl#!&5D
zO~%w7g+vfarTkNu!fA1o7nFPoz!9SKP`ZfAYWgF0nvoqvp@$&+&xtpL&#`Q(Q&gz&
z8|<(=2^@ZuQ-WYeCzOVNup)u4w0&kFB%1;n%<vD;Zbm)vtu#}#s9B_ZaiOsDF227U
zN*+-3-iCg3SMB3Ilx_gk9+8j)3OL1eo$u@j{98rm&x9<kH6oB57C)I4o1R8@ES0k9
z(Aa;R%SX+V6edw6;<G*RK6HJTZY;5j`YV5jhrQuMt=0)V=BG->M-a@3p%9(Kgj8M&
zX3&Cq?Dd1;0Wc;d)L(Ub`^uC2jdf%i7^AtAO@k@pgY2)rJ3O)|&^m(MFvLCA5XO|P
zrCFp^5E~%IVM0&H=lkH9JDkr4d1O84W6K!$^UXh%9e3+Q=0S7s%Xm?@WPZrZc;&`G
z^TZC^0sLrUefZ=u)b=4ZL`weqi-(rNXbyKJ8-%C61?~*qUlNCs&=8*ecrKU*g6=UA
z9t8R__(f|!<xsq-GTxfmnSB`*E0Gx6a1|J{|0?7F*0$_heocW9{V1<986(Y2$k1J;
zL2@vG*WumT)ep1klNi(phHvCTr<F*19o@?EY~q|%?JF%2r;Fa=Y6X@xVJ&iArybr4
z!j2<OxfCz!cNa|bkq+*+A$)zZ<4?Jqh&l4DMjy|X7s6B2U=vaGsvc<(GCg8Y$ZcB(
zrf3k9qz{9d?ZULzCv>^T-H?PIT-0i?OdH;S8GNFCAhU4k^BCs2-PDfJp&E1_!AQ^o
zC0`3tCsq$;pSK{?Tp7)kQkZS|A%fTX^Nmi8eMLRX&W;3C1`Sr842W2UE3}am2AMSZ
zs5PqPRiTdU@!pQNzi0Ks58-*8k=!O1<YNb4sJhxMSgF{s<*<$Uo+z$7FHTFVbC25m
zeq}OAuY}4--Vi?PQLOO0AOma~Qo=ePJQUc|MWc7{d-(O^Ys&VZsfL{7mV8Cl!jvu_
zx*P0+jziAnG_^Y+dZ{Qn&R_fgfXVU@w#PJf_*Y~0*z>zhU!b8O-oB%e;K-)R7;=;G
z4Syzq4kM+>srxE$BRD&vM1x-0k!l%xApZb}PXS#_kfVq6r2I^V`!KIFY%YW=l{q{@
zUAG+R!Qsr?x1uBbmrA4gUCqe4Cq?0cVfMOmX-uCY{iZMcsz>(M=r}-V%&bDI$;VN$
zd&a`af?ypRW_PLamZK3UoSKEJ{595q!QPMWWtQ9=^)fqx02QW;c@{vjV!K}NKar)z
zwtU<wFnyN^9|WZR+t>bD>>i?ArPxJ*^@}yYS{>&tZ)kA~RNL(CUZ!OvTSYa4+kQ+5
z>oQ00z~StP;9F((N?j~(k0G2W2N*cUHjh8L#get&qzXn@ayK)9z%Cu_H8lQ$CgQO)
z2lhm!yEi5rY)9=2!&VtEnq5>{C}hD-97TU6<i3|*9elfM@k@HX%snk+>YaN$e?(4%
zN_pg9M~Ml~1C!}iSo9pEqauO@VU=%lqkr9maJcOi@ijJE;tm(vHRVDH(FtlJOpCfL
z)s{fK^!55#aVSnR0P}c*rNz57oJ}Z<L7D!pjZlR`i??E9dp9i#lTVCaz#2f&vuz~S
zeoS_X^-Jct2tgR<IwrU3-w!jzGHM{|xMq1~#C$jsKl`F63}=}Kbj==Ill>G;f~>8V
z>nb!>jurd6SU;P*b2PQDzl&>D);`r<Ix1=ZxZSzwzm7}C>IQY^>~lP`bCcRM0cBkZ
zXme)39H~~x{zA^VX}bGtEFQc42`m~EU-|_UQsp48w+8>)wmKPF4$sHd=5llD0k&uM
zBlcbSH%a7}gW5A}Ts}LE=6x$XM@T9*VZh+EBkhdmV{j(7*P6FntKG2=-K54-Su5cu
z?pwlPz0%U&OXf<2!hBPwP69zHN^3G7_!nVmT&&Bh*+}9h(Rnz-j71;kefNafCakQW
z`E|jvWc<rx$(IdwrHeW&p`Fvkrbg51)6zMwaNMVFol>(Mk%vh|7iG%H0?I6WwK<}{
zT+|mum)iIa$@$ggM>ewov#B$pVJoKJ`)_oziPG1?C?7g<Tal&u`AR;LXd@GckInti
z$9kb@S!oogA(`iCgz;iv0MlUabo>K%gZ~KJ0M2n@6USoqJgIzb@8fcY*)}VxGz<1n
zPYHPmDvQylN_W^oON-;gJXB>}q(MVOr*t86{R2&x9DmRmBG!mxor0sme8eMGTP{`v
zcZSlm)GWsyvL;yJTXnoCjpG%<Vqm+SV~r9*y#r4@Tf#SoKa6k3RcWASCH1ZGEgB62
zQvETfc<F1W59dB$dNg(~j9>z2i{;`zd8jrZI|I3DShSCXbF;ytfpIXlevcu4qm-nc
Sth%p1NO$qPu|biZE#`kiX))OV

literal 0
HcmV?d00001

diff --git a/docs/image/vs_oneflow.png b/docs/image/vs_oneflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..495bdd4268809c4adf8a88139f01bf5b833aa382
GIT binary patch
literal 393445
zcmeFZd0bNazCLb)yNt@pjB>2p$<6_5YtBl8WkgbGYAR^MHY0OH#hI+k($ta^Q_yyT
zW+s*+f>Y#BnwmI}bK-zF;eZpUe3$2U&%O8e``)|HIe-7oANpD@MArHY&--~l&$E1b
z8ELs=o5D6JDXAUs^A^@pQqo8%sZHm1{Q{gh-;+&}lG^TpIdkSR{LGnymwml%WA3>E
zFFZ}O-*V07>%KIngy+w{Z`^ubrd-W>(|MUcrJp-*Ry-v0>&yLe52DMpH0S}lYock<
zzZECm-tbW~=B6M<THY1iR`a+%_4$38xPB2?Q98%__DVuB!j1HWNPS<aN<2S%SzBth
zMw<O-`g(10q2Dk3qX$2w@2t8HAwmcrxZF1zUVd5=%ufAsa__3Gn=}VjC&?L|mi}Eu
z>WkqM-R0k%&8r)wekum;!EBJSi0t#d5MdF)JNaHSa>MB|$B2T&K;lL>jhU7Wsg%w7
z)x-^#Gki<g_oVcuPMKvqC+-N+Ge0w-F%nZPb;tOWQk9(V@-EGrxccUFO4abL?o%N$
z)yGt-zOBURioI4|omoM7F6^?Sxu0x1ZSgx?1=l5cl_Jqz4w^lss{i3iB>Tzc8qtlN
zUc<;}>+2fTz1(lf*Ur)otsUfQ&h3=Cu)AI@A#vgD@k_Ojmp6%aF=tB;UD>60MoGOX
z=BXWDx4`0t(vtztQ(4j54B@*JPnMlO2(r4`dG7`C!sX@D?gf;4Nf+>On#$8T)AJ|O
zQomWxRPNqps1bfK<9zbVZO5f==A6EAVSBa0F6I|sxs0p&8PSg>l20jSo1WTTDPJwK
z(I@ajf~w}XgvUqTP9SDoACA`)Z#_%eFK>SB;Kk_6BU?;r26tA=avq@fJ#qN@&~#&G
z^600cY3W0M))(DsmgWvw$Tx3+pYnd-3q`&ds>z=40iANDpZ41DG3x!XD<{C0j^0nT
z`5NyA-^NXN>BBv1)bqvA)pW^s*Nz_tHy^vNRDAK6%#hR-Cmh;fB}++W6Y8LJ=SeA&
zZOW$u*_+gE23l=C`&0JG=9Bt*I)T@(N^Sddt>L*{kpC|Zn=&Fd>H2=Mc`|ou@0W*?
z;Y|hAXZozG<lgRH{cHl(S9xY2t6#HJ$&tNViMe)JE|>jfi`nmbTeP44S-tD#=Ht)q
z#48SLpsT9v$UF6+bl_r-o40cRgv=$G$w!ut#vkouO+I}S#8!Qyw8QuyK3N0&s-L&9
zB!ZL{cx+0x^N=nrUU7q4&nD50fr#$~khXkVAI*H$QiRw`b7{)+?Zg+_A1_UxDOnVY
zeKH?Ee9YSwbC}c5vpnEFa=D`Oj2cYjh9`!ZPh^A_R`0tJ^`YR0%!Fyc-keR_-^=9G
zIexsg&9so2wta$j%1dU`%+P6-7TC&;K8LgFuC#U<nwFk;OZq{!bo9fzv|nGT%e#U=
z`?Zz=b{#XknHD4Wxv{-*hg1#o-11VeB5Zo+DWOv(e?+qLKHdh8m%Q*|d{LL$8Tfgy
zf7;5eZu3be<@?wjrlp0Kvp>s&H-=ug!y???dNJb2hlk@no1Cj;H%DCFC3ExCp7WdJ
zt_x4oo2l2gC2tscaP#uEq=zT2oBh~c9D%;BW4Lv!>WcS9(gyG+S8wH>NM@E?*=Dn!
z%A;p?J$m$gm+{Rf$%i_gsBs@3dm;7f-lymGJlFem)7Nv=Pn2Foe6zgpBJoS`ADY&Y
zN+%Vb%Y6x=D3`{XS}>j)e1Ysell>NY_d(Zdwae=Dr!&6)>Svk~i#!+eU3%2IP$}19
z@paLtjPP)<^5jXEH|6`Q+Kz{PQ^M9J?^leeVSm)R7~ROa@ZtCOdvZQ041`rg`(N+<
zVM%(@>hsIgcAWR-v}beEUzfB(WhzwqBRSlx>71oAWm_Z{UhjS;6SvuAW7M(7&#jVA
z|EB(OcfhWla>&mGZSX+LU-!&EEw=Cq+}HZVsG47Wq&l`btors%lbPcmi57<sBq+hx
zDA)E7ub$LU&sCc|bm9f=xiMU|t*!lATTdISjon^-cdo6zEg(nPp800sVs8BVv&HJ-
zgj-)jXZ2=(pAGO;Ti#Pj=&@|c{9(Ov6#i1F9(BQp`q($cH-6R*jqW|&YiZ_;rkGon
zqrxghM)Of)s@cX@w`imq$7y6isZ$hv8!wxF_8mP%$6eZFyOT~Aa^8?BX0OA(zjmkJ
z@zz^XC^(^^clQMCP8s`GcSrYpJ;<GXm?Vq}yNuGAd$uFL<)*t&(QS<FofCI@+s4{6
z-Q(Nh+M77eZ_1s-WeZs4gt%Db+np|M5l#AJf^j^5e{Vul=G^EUznh=MCo?S2PUtw#
zIM3`U=U)9@6*FZs+48;R*}?L`bwXp|@i4`(xbU>K-&Y@s41};9x(8|%)b|$$UTgfI
z`MFPVyTR7qckO?<e?ph;IsNtdjs16%-RZ0kP16;-J%_IE_0p@kLukQdObswTuRfl8
zOxzALT!}k=cj7wfx@m7<s!xtj-qUP2{b}_d<gajor%hPv!B(Z#_RKmf=C{$?8*U#i
zXuoZBTW5dp_I+ycSDFlKMvoil#~&l6dY$mn6%>3ZI$xaZ_OY2;aik^z^UAffE59bc
zuKwHI!uIVscXLcS{&3GM`W~D+?=ZfoU#y?45Aqnhm)5QD!}ngyy&MlgXUoLKN%d~q
zE_4B;OLr1GY0%{oa8lV+`O!}KokXQ8mO7R%2G7+6Kpqg{2|u#g{7l{USHq}L)a1(d
zv;9Z9T+<NbhMe7YYHZ8*`+Wkc?d>mdOgWP@oaNZFXG%t1e++&x@Z9NXTC%OgeM)cf
z*kb3@{8XNqlG$dJR8!9&wwYxJRCsS@g*!ZXWd7aE*lh7^({$g@FhX|2vs=&`X>*5`
zHjBS47>&)fa2iUg7GDd}!<)iO!<nmT@%hR}_7{b<-NJ+LG>M%gPU_srb5%+A<4(?<
z^t|r+Gvi>x!SL^Kuidml>~^}#j@y24|Dd<zJsnc46Qa&E+6<DBJt~6%waZxTM5~Gp
zemksX^4|D8q(?V2I1sz}&|!_6XREE^WTNV_xk8Iz%cf_h@@Hc&Jabf?|E=qgqe8Et
zg0iwvaQkS;@s8u%WX82!4NuYXA+RQWPv4#wfj2`@p=VcGAH9&#-P5GbmV>BIAw!9K
zNrhT`&wO;gY3Y%(x~`}Tw)Nt+=WU7E%D2uvdu23oGv!LkZ@K2=muU$VYUQ|lqd!jE
zGN}uS(^SFTDw|x&9&EmJEakbuzz>VYr#(-je&2c^`v6nd9DA!Yv8V4Ma^&$h6_wPI
zq{Rf)YL`aAo+Xgowlvpex7=^RX_M*Y8cf9g-?ZccVt%}tf5S1_ZIp`b<JFE#eVc4X
zG<Fm7lG0!FAMDu8i@s#_J6X@)npu6tT8p1NVG`6hq=I?r7q$IGEU_It%8D1+m6BH=
z%b!{IS<x(68|zG^ly_#frbYfKp9^bR)8r%NSASPdZ%Xa^{c~t-&jDGt{M(-7$Zz$e
z_TRgG7anwyu`J$13gT6&$zn+mZO*Pq>>fV0QY)<v8qo?|v01v*UZYUM=B9#H_jx)$
z(%GYZ*J+|6u$P;6WF-MarBPj}>BQ+I)Kbt;^Dt~tr_r&i_?vrY=5X~eci6ygGwnx_
zb%<+AeO=DEym#ovYv_DY-U&GlNoVf2?y{tOyNcorQ$|@hxt%dQZk`9GRXXoRee`-b
z^SDN_&uhr5EdS^oo5tIOMrta|W4A|X(TPxZ52}@Gr#`os<#DMb>mE!Dnj^kbK4)?P
z7utEWuk)kk4NkU(;N834S|-spqkVRaW;VEh>Ql@wsi-ZzJ#M}lMiqYwUtR-m`F&@P
z;eP)Q!@K4#H|BLzdKzAYp{!%tkU5jtavlC{S?whT7W##Hq7y3ST;D%<SLQP0>x6OI
zCP<@-pMiLrD>hyQlv1(9!qYl!Ij5;KCG2ch_L7Z9Rt7TNL@r=t$f2V%6T8NXOItMS
z_l^kqbLtJJrpvCm<8x)uS^&K<CwC!Xkt%91Cn|PZ7O7V52^_9&f1cw;^P!>3RIuf%
z=VoJ_4C;C(=rxt|D*=6z9Nad6u$PRD_s$!t=vd9)QY)*0FOh_~MA_GUj5;Liq<X}h
zAx>Ug8kXxMG?PJT?6f2^-4MI^p2_7F+6P*LPHn|#=+U4~4tgbOK~>SYvs^Ua{2JyI
zQY-YUNUE5^7YyYNbd&KxEA4DWj`$~fjajjHM0GoCLiA-NaKUCQ%Pq{toM2AGf0VG7
zPK=G0udS?w%h=Z|FYT?mex@K_DzHFK%27ee?^?-~S`GJ48&18PAD7xFk<lEl*)X&5
z^)KO-b-2%Ysi{&E{W}|UzU0Zcw;L)*!Of-oajJJUUbvPrkCwjufIMV6D!_x>_1PLy
z6<Pv;AZJ1#$=~q{$i43N$NXPxx9;xP?!>M5W5n$NUaejvWS&_2vFYq)m7jHMF88CJ
zkCwY=l>z;Sq3bpHZ7VA&4d8gI)P^XG)Mnsl1MsB)d`U@de7*7SXQYv@H~sy1)A{um
z12U>!N=coRf?J%r8n|JiSC(|u^-sx+&gr$7jR`A+{SR;6eetEy8uZ8c3l|u7uQ+11
zKg~Y7!8IHGY~YP6dfQQL&9a^F4sE%k7ZA@5@!uJ2gKJqmIduH!l~2<y(JN^JpY))W
z#)(4E1e7`bsySzb<kmzon(OP`;9?qrEYY?y@Hr#3Ve_t&|I)YD2O@e?4&2)MFTYVL
z;+>QnGH0*W!TVAhxBg4t^dMD_T1T$#|JN_~?Y3hBg%>_;{1-p);Qf=>jTa3)kdep#
z)$#fFZT_RN{`YMDgDv;(Mf=wP@$bF)4-os`Nc#`!%fBJ$-w?FEhyH(qw8FR0kN*tq
zszUxceSHeq>zuUqQr!7O6-VgUC-M5Md<1g_GjEGGw+qfJ#H_%k5*b97-@_M6t(#ZY
zvQ)hv7rG43hwSV-sEi66&ws%=pLg^bl#F^2pd-{O*MfNdx9vEQu(7vTuQ%W;t~vGG
zKfp0yhJIi+ckIR69_3MM5)mVzQ>*NL?4nj_Nbf>r`LPzig@&EPxTf@SQQ0kuK_X;G
zok&qBHN%GNE5S=9HvAVyji4O_C?sv#mb)Jp9FcPYDQZxFgn7OrEPT7WJPA{StkNbN
z`zl)YiHA^2I~qyug|-TKJ0zoYf)dhz>`FMIOvJGj=V<c+%N?GJ_9S6WKEgR_9>;c?
zMUJIoxsU&{VtpWh?j1Et$D4!IllNuil?J917Eb3zxa1W|Qu`p1l*A~j1Y{$zHXQ$<
z&1p5JbZ21iLbx0LD1BNGCk<KsbBML71(^&W5}eTi>=#ZXp+FkEX7*jq8_Ts*`#|SX
zy6ce)w+ccqnM+ymhM3v;^1n)){n!x>7}Al=#)PVo2ZRc;{0L}>q^H57Y>>{<uQapw
zW0M^Qc<&}yt4a{F1T6c{9I~+M4Z%76h_bWq-I?NyK9`s+#WDA$LyY(p4?uGkz^xLn
z?5g*AI<7iKp4u|gVZ8~v;w73J)Jbca$|`4Glcg)rzHH0dQsGV6ya4Nd5@IkmiHlEG
zg0W6?y&Imnl4Z!x45G~}AR3UOsvsmo;>-yY_jOu19&4sP{WCUwhNb4i9ix0jGG^V8
zU8G(Gn0S*kB$4LBM&v7aHLLx%b@V?p5NO2DW#8mGFNvO7aj7exYS`&>l}d9lH|r{+
z%;w7Dr{tPm8=vF!u4TY3h<RIxQ7=u_42$RG2Okj{=8!4=_dyaBgTA8hyz*K@{$M%D
z&$*>&ZP{cz$>57L@gZsu7qoB^tUi(JhhYi=a5qlht)coP?Hawf@zO9Vd8wq6swS@W
zBdSxfEQ`I4BA>y$@(EK4lf~ibgJOpF=?|ZfW6r+wV*co&J$^H@<2-eg5|hOAKAlgc
zxjqa}CHZ~52r19~=@{NeV@UUNm}B-89({|R9tDaE52Q9aseO1mL5UBm_Gr{i?O&ip
zyRUsLRzlnOuub~M*%qzk1*yV58&vX>F1>+*OI{F<FP`k~OTK)AMMvXekh%#!hse}x
z5zbp0@ktK|aYh}~a<siKI|S*!zC`}28bpxKT)#2!=0v(e{-Aowxtgp7#U%6MO1(VO
zfF`lvrG8>ZqKZb1rHNpb;Y}K6`Bb>a5h{E?BIK+VCwP=w9AivRAS5i10&+_*>7p){
z%mjngMW{(u<)C(ewdtlCUA`Pg(M?ri-&~(2Y6<tv3Yp^lgzy;Zi!n*sb@H|I^;}f)
z`K54sSc~wx6?8RJz6r-R2{z=r66uBgo#SNT+HR!Ct&>{kFm^4%aKr4k6SVL*n3cKF
z_Yp`1Gft%3bMiQ)$PB%Nz59d^ZWEv^RtY?@IY2&Q`bGVY4V$~1eqxL;V}C%xmffLN
zuY3A9F2z5n<<&CDFmW1UP+NQQ1va)s)Qmv}2bL3pZyR6+E&<d13ddj0$u#Ca+Kr5s
zvx6Bl!eeHk9;!q;d(@I_cV4NX2HDx0UBofWwzLZlt=EH~9sSssJpBgtXt+XL?caRK
znshk*m+&|0xN-4i__ZZx2T|AnV@stRGL-G$d1-WD2G6`K@);ogIanJhSGbqQ^VE2h
zq4|!TTc9))6E_GOF~kJHh$@C_cOa`AQ^<V7g?J9M2BDlDluqb~_Zz5>ax>)r+(Pm@
zWj9%X>zn7zAw}+;k&8yg2sdHCK!0or{aBx$*r}!F#MJJSuZ<$4rNQb8{bLl*agyF(
zH?$G*He|}e$jsT74GE!p4|8}V;rLOcqwf<!pJM<<XJ*f&f$2|HAQX}Jm075a=!STo
z{Tk`=t5GX8ENoZ{Lbmh6m_k=E?+C~PgOu0|U}oHCgz#7LB_rnBro)tv(EImoX!2%g
z-!<J5VYjUs(JX7ug1az!ih3<t1NY-xOX&N;Gp4UW$`A7#@e=D&QTL$ljW;|TI(Vh<
z`1<7caG#SvS+CQ_xNomEd9qbCps{t-oZF>CZQ4w@3u$Z&l2XgFN*|-F8G$XSnr@|q
zO?&>bYdy)?6G)qWnWhmb7rQr0N78eay!b=nh++&i-CV+b;O-FJFzjmz(BzT)<}MK9
zrYO=#QJ*Pi)}nRko7ymi4JPevex{&Wd9#!-rA;?v%=JeJI-Uzx?hyL~S2*;)eSD$w
zPjC#nD?AX8IMq`!tqQ0_FpAC^+XtCf8DpO_@bbfyGp@d9J?=12!KIiR+4n|APVQZ5
zS0D!L!EboeV$n(;cCUw$jt75;$zZ3SGv=$FCDuCJ-SbnN;uSf$>pF`c@N?$sX2(7B
z8y`XV=e^_Xj$0sPN=PhX(1|I0Px5P4MFlUkJVZP8_faL9Cj8Ecq=kh)KSFa29D^@?
zf`|N3QrIrbqZg_W!^5It+^x+`PGk$m0FW<2^bWCHe!JwuZ&AdLv1SZ{sFv6M!Rp?&
zQF9jdo3s6kG%Zs$ZuM;wzFLx{$!k{lz8qH&n2x9`vX=!TtGgPgGr4F5YhU&^EGjsF
z<h=~Tq7>p099xv3+QcqMn29Dzp_I?=1Pgrzo;{2~8-Dyrh32|^Xo1JuqI%rN1-yE=
zn;({aC5G(x#RqAODJ5k3WBJ>k#ySsV5Xe`PC&o%dcLDej@IbV1X0CE+-cQPssIKyJ
zq})6JQ=3cONP3?6Ah|eA+L|r*l|wUXitW>sLl;fDThg?c!ZKOyfE!$o)&nl?CGq33
ziRfVGmGEn}K0Y}Mm3Ri1BD>R@AJ~zIqMvqaiInST!tuq&!~03)EpSZ@q*Ivq>{mFi
z$%p;L0U8FY2*^kD6KBK{!C8)o7(<McoN9r~ap>MRb9O_^`<|RYM)zka*3;(mTG>gJ
znv7sUysY;?XX*PfPyLrDe_MAXZoZqUo#N8P<Wd&jW7(-9hBr6ojZLq1aC^8*&rzc1
zj94vP;wcF8yOj5R#o7rUS7X0y+X**FLU;G-(CAAV0~P%7vXH(=m%wX$%HT*T*n`93
z<}^S(3SIG#nr5Q#e4W+_k?B*ICFdncxMoB48+Ar1=r={`7V57q7iW&LkN2Z}*bseV
zpwyXQeo8@N6}Xfn^Aaa<*s5BoFlc{eo+N9O?Q_DBK^+=P$A|Tu2;MhHM!)V(%5-vr
zU5=BDp#3)8i2;CBfAE!gd^6J<9-B`<stmAALIaBkq7jrr9$FMu1An-MF1!=;0z|BO
zOP>aJ)c{0IzF2jNf*V28U!HBE@;K12kE0Y&R;h5*#tF4#Rr6l2?+*XVAtXZ#g_JMu
zg{)prLo|1ZsmS0lIC$+Kd5F?*n^B0)pekN~u1Nck5Kg@XUGTidQoVnK5WIa$&NDB+
zX0WQGx6G%?<DC0S<L7HE?SEUDt&&U%@QWtz{VWD*;{7IO<$@sPw-XV0W((WE>MzL?
z?4@F;$5-aG@f<q&py?O5INpct<;EynG<-4Vw7Omsk03?e!$?P5Iiefbi|r>NtM4lb
z&JMowPF`9@{9A*h!nf%;{j`=Z5oYM*{mgVwol^j2Ty4>ptrmt>@GA7{*$AfnFl>fP
z0*Vk$_!1@vGMY`-Qz2F);@Cn5+S;>sKFo8Ag3ck9Mg3E#6+MKb1X=ZrCU9T<GI+pS
zaZ-?M4!yX4iqEzKA-%0h$t0Da8~m;^fH9TV!$b0L1O265z5~_gg6zfbi(UG!db4ko
zFbXA~^SlQxmdl}lOAxXnks|&L=*3?f-t`A#KfwuZMtsYMsMP@N*s{<Egzz;>FmmXs
z?|gGKES-nMvLEOIuf?9@E=n%N`|x=a*(8u$pFUGK;dK$mt^}hLT+(m@d-#JW!%f3`
zA(I8H$mLCPWWT_#R9<_ybG4<pZz?Fx6;h-7s<y6Vv7$)eHK7F2CUJ`wZ9hM1$ZU=p
z-@+`cTv$_z$=eHo_LcY5v`kjQxS?Q@I%BfmrROv6oFRWK5!n@1NSNwxrZ$(*!$SHE
zM5YHmvLa#lUrLkWxhD`Ly`oA&({d?6KNhDAljQWGlVLVK7;#6qOP!!ZE_{l$x$g)v
zy7I$p!vJj!3qiKUH+O%ij+ChZ7+=Fh&H^ltubkhOwvjYYXy)I!2-P{}@eYK)4D}t*
z;abUAb#J@v(Ah4SBtuH4Q_l|q<RUGBaGOXH&iEh|gvO9YjZ)ES^^18$Gzx_O)a;p?
zen%mFIl2!@i-bMVrx&h0LZ#eMQ0>ej#<*+uS03h#m4v+Ij@2DF_1ENsw6R&9Mg58x
z=ugzBOci$981s&=<z$EuuJ9Rq$=!!LgYVgPwpaeBa;AlcI$1Ij(Mz~5Wr&cca+Xev
z!kO7W0lM@O<Q7btVpW&Rk%Z;$Xv4X|BhX1no+jDPqlr4xMb&xcjd_D9@kbhCpAzb2
z>Bo)-FhG*H;3putn<?r-Gy2y^AaGLZ9a)or$n#x2j%57FL=5`yVVTTh$EWonVL@pz
z@x^Ovi$)N%?+bu%4!+tT5jzt5>Q>yz!XyMLn5;VQ$F`Lw*7lqCQ0D{H>jG;lgkL}i
zJd`PVn(4zlau&=P6L`HEVF!kj$LUWq`H876(Q;N>7X0{PpZOvrV@e;ae!a<`EvH8I
z>o7qj@9)riQ4}CB_}YF+FIFysM;fM>FirU5tw7;vQztGgEXhN^ZMinc4xD7Sc>-jF
zA2njkCqfMQ9df88bJ!<<S_&kABQ?VF<4NH<VA?g&PNI6sl*wo5C}1c7?jjavGK6#c
z>ncxtgIk)9QWhJ1ZXCw79>5XhNqZqnwY;MF)w3}qA*>ZWE(nSB;ok{x`pG&F17QIB
zxl4@IszCPV6BbPRi+Wcpj(}ZU#RAQ<U~`Eq4ukhH?eJf*X!YCNs8a*nKX=kz)!TYS
z`e6yM!%FcAW2{&AWqsM6I3qp?xMDEJm>(lc*K5(A8gK^|Shq;*3^bmmrr+Q*&5c=U
zSaufD|3;;i+q+n~Zdgkr+~*Buu@f2^*XVaquUC|3!3#bY-{fzov}2<Qu?jKNB?#d7
z+E+>@>KB_cWoVLsmjsPY?G@?YnbfCbLakA@y&b*!OaMY07Wxn|9=7O@DKA14$syB!
z{A3rX=B*PCN8?ua9%!FIinju!uy&Ge`PZwsXJh<`z3r6FBN_k1MmJbVCRWuQYq@xS
z;8Iqf3Cz^G<@d<Nj~Sjmr>{?rFY1&YOKE-=7MRM}qCCt>JP2kKruv_Slxu0m0(E2z
z1SMV1nqXx_z8||?mxMN)IDmA-RuICAdf@5b-g(kOK+ub$uK+M#N<(Ta;4r2YeaR09
zJL`@3y$<ZLQly4Ij@>B@ZklimxWT%y7O7i**EWgkak3>+rc@q$F2+IR)Dk&w*Js{Z
z7b)X6NNx*xOYjW;0uP&oPTs>C@x?)4^&xF<_GAXxZ+R=YwvOQqA4ve&zy6kx<}gMv
z@ujVGA8`&gDYGajacKs?_`EK<>sRFU)%i$hZ2Z_TTNLL!X4D;txSBpnvHydbp#X)1
zL3kA0AMhMI`gGXr4v5v43(aCd4K!D_6d>H>QC48<qGg!?0*Qq(kA0dwKvehZE&oJa
zt)Uhfftyz7oi$hZpViQYXQsTtwS$V7uAu_o0daXe$ZgjYpCwZQ?3)aSfs0WNn93rA
zrUvw)9(kBjmxr@9)t!-*jJ}&|znkuihP@eOCH{d$8s8!y<jCnjL8)xIu;{a8=3Bw?
z#NHhCk09v9YAA!=Ufj<~dr?F+vvQoE#DbZH^LQwh?bHB`#kKLSh1QgoqpqbU%SV++
zDq<HF?=(fa?B72?!9)EptYNAR+OX%NLpQ$1qcAE^z2BW^#J~Uk!`y|WI8CN^=5Pgc
z<>r~zMgP!cO)ua1e$_kz$nAmTF(_RPgj!lA53u?(W}z3aX*2019sPCbg!mz{uu+#R
ztdv8g1WE#1BGW660@&6XWyIeBY{VvO&fXun?kf70u~HlNU9<7|`o`W196mP+Mmm;#
zf$J7HErGlPK$1h=F(ApGE!|We-Z!71l0(<4aDetM{Ai`lcR<SvOAt<yISjGG-34Nv
zAB7tC(n-vzTrK#g;QNv->t$vr&$PHhshyB$X<+Jm)xG#$zlRExF5-+5u=9i6CcahP
z_LW4)Hd=UDz|cf=j<VG_lSdXZu0UfY89&*<D2)cHZbYqaVVG}bjMIq<SSFWEy+4>#
z(&*!pyDCUO%Jj}&PDKZoV?6-|fOlF7u6=qAY(B!C8+<huZtp#?XIdUIpFY42h|z@R
z)=_H!5Fx~&xonZwl=qF(*E*>)M~*<u$~)jQ+XK}QP>+%fqIqasPWm0k5R_#-{Btc`
zINIxua*Rjl(tNB<34GUw=)gh@*P(~DP~lV)`P7NW5RU5NN&#rFdkK_4x8G1>luP$~
zFl8<58i)y>)s0HN6A%Zow`eRq*PO6+8E6#whG2C&X92sprK}xTlG=<~adz0R;dtoS
zFy#P;aqLR)YXY~L-NmcXZC>*$Bp{h>kvt-KZ}Ycn;<93A$%Pg;%=9z0yiTTSaNBxk
zNgf73H{GM$B^}2WM;Y>)LFlgT$V;U{qa+o+62j-SXhV=a{$j`wS#KzqsM3#r2Hasv
zao`Qi^(JZyFB*FBVjeK_xd?DQFc7&uuyn-%mX3zf8266IbS1D4keL|8C*x^hOG;)N
znZ;E*<^YYd^!;?It2}t^O{)7^!t9mt9YGMZaciW_$9ODoi@$)8UtN3#bp9~1haI2)
zY6YFjonKTmpcjTU_W}K!NcJ^k*b(1-&1b%Q@mr~ACQfD9qtjoQGaLLD65I66h$X=h
z9hF-LBFMHsho!-Nz3mR|CClD@w)WlkZg{BLS2Dw5XyK-D69s+n5ZEyH&Y;+S7#$os
z_ZzWekUh2xeLw$=owSNh_AaT7oWVHw%spOAJy$wCouHKqE&bq(z5V5}$&aQj+*fx?
zNPKqFTJ9)27=f@%L>soETdB-+)k#XC1HTiJHbwE8$6A-B5;AX75icd8!{9VrSdw~f
zz>U*Ci~&}C8xFKb#|{<ow-=RM3>XNq>L=Fq4zLo9vKW3!Cc*sCGWXRhWu6}hp3FY@
zV$TF?&b{K&&L)$ieO?-L@7q-*!c+qQ3~j^E9($4>#}EvK;~(417gI}eXm;SKqs*~)
z3(RluJgw68j-vetxao4LeDS+iyTn(U+F@e%9Mo!HXyxiisi@A$Bp+83DJXdFosP36
zU0-B&1C;gFCwR#>-%nnJAi0aF=38#3@hSdl=;Zq;?OhpnRs&#KhWv&ShOkU-gfV?w
zuYH{^#jPocLja`dkIpem@o<TkOLP*=y<U(981fdLAwS|zsK@-%DWq}M5IfM#DR};O
zaNN5{-57wYL{4V@!Eblv%rdi^nAt|30~(s|_3oPio71#PkLBGd1leD6{>d_@)dO2>
zoTJV-<~&#0!mz)U)1(2Y=1ro+0s5{V+adzbH4{fr<LOIrWjX!kYmZysmVvVJ1;gxs
z_}e9geO**bPHCZko*CLPUNAV^sZF{7V@BG{*rU2=RBGVZTGwRo*MAT>8P(VrLR`q6
zu-Rtmv{eF(4Wr9Lz%^GCC4IgE{w!Imv3N6mhuQZ|PS;(XR4emL+bXoP>tW7bbg-Ri
z1~(i`KjvJ2K~Yb;{N}?PMNh*I-KJm_m85oW<_T%Aif78M0|sTT#WC(qcuCdClKJND
z4mZ=^?C~M4osl!A?N)y|p=K71>~i}^P-#j~NfBHvf3)!QGq742_3(9eR9(hLiV#2-
z|LYFyfW^5@=q2C1B)#95a9kelMw%y3$+H~TxjbWAuM81$>~5CBTT$`sle;B(7k&BL
zQ##9feHqpBRB1?<TM{8nX^0imzEWoAQo$D+uAw}Px@=W*o#<Xq5TC{9DVFHld*MkF
z?RFJjUXn2%9}Oj83P-!9Y6ce<Q-=VrgRFNH;TV`i0O$f7pE^KMY=DyJITMt?%&#9}
zCK)xl%W1H0k?7tODklziBfRe#SpB{|YLGZK+Wb+r5y$71%)IdERh#*i3>tQ<=sgMv
zTRIGGYAdad)SahcZ<vp4A^UlLEh?*2F0YC3b;5TeXmdR~HI4Y%isZsO5s13Q5k>Sk
zq&UkhzfFE~uwUfEq5Xhb7`8dMp}jvX5m914jB>;SU3K`GLx@gQ_uS=&9;?3)7qBOr
zsPlB;yNM)ql6ihhjipnCiwgilCxXjG+TFNE{mGs#E(>u|8%@$rPQLncS*drqqP9#8
zkf^IRhW5>ct-Dwg3c#U2!Nslb2=GJj#?T5uu*+OrXkFOs7T%U5v)x(Z`zix#%RM7A
z&BG?MTZ;unq9hkx!St~gcP#tkpmgt%@AJ@$cgDvlF5htMptB{bmdoCs;FU+2A1Y5d
z7>{7U&{xjBH_AR~BM(|j-v8p{vFbg)J<m#Ret|qdX-*$04#&?NnAtP$Q(i8}5oFvx
z%!O_9)O0q3WbJSRBq3t6-;J=FNqxy!Yr!T|%4iXt%R2CjCiGsvU&!1r^!@fG_J=nh
zxegjY1^HT#@&YbvGv^46|8-!hx(qQ+cI9VByRWvGCSWn*z}qat7Cm#{mQYBe@(^pZ
z#v!;9=qO<&X8>Eo9Rx+)We%|p9Iq5mRNW$Fa*e<XHrVkKU#XvBiOj-2XG9du@5bRt
zBfxM02rHn68!c@j`Gt1EW1+{LHH`W0ia=%4Tj>^*dULr6?yJoG4t^1UdL*NZN|sEM
z2%b80?~V1*t@`~_nueH_t;D7>1sHxrX^_R6y~>$wxQbt6F7d842!xT^;nscw7u)by
zHjHS9S!@A3m1l&t@}G`Oj&fx$A{we>Y2;p~`dwe;Q!XGWr}T}D;bsX(k->t)5b|Z;
z`R?QF6Vkzcjh{!}?*36EtzQrKGG|5h8#)Onm%_in!TZn+=N4@NRUnFoQV^tLF=o(S
z>wp2->9UxN`9*v-eOl3Lu~fK}`j`pu0g*sJ2`O+gEDCEK54}Pg?$!p>RnPua^2-L#
zf9QEMX@Aey+cavs)qV)ndZgdb!ZF7rq$m6q&wK=|+M5&edD~~r&MzQ|guBJWJ18$2
zM66K$Nl^q^B1tZe|1CUgsJB@4hih0*Ow;!tg*l7&e7Y<8C&NY#L70WaIvpIldu@~w
zc&@te>>OeyVQywm>lRzoTz7HZ=qG9WN}hO_wYlcKsLtI|LmFZhl8A`!aK34V+-3e}
zJR!WTH8RLL3}vW0WJpZIuY85RH$WQ)UF(CpRCd7|&M}3n`_hAk)WP9R?#M@bRr^~O
z-J_AlnWK*8DYgyhuBk6sXh-~E0cHGlsovU%Cd%;DoCR3@u%{1yX3V<v)jfM8w*88S
zc@3~lU#5<+x?VX1znx%lDgpb^+S_MbB!*y-!Tnl*vO-D2#sYq!t)%R0KbD<?fTG@+
z3n<6ONy7M}NRiMHtlsD8GcSG$OCR6qJ1~<eLxkb0;DWF3@XiRec3oIW;eegFiq2X5
z2pL+|?1VOa0cgGz6<MXzt7+{bGET_*;om^o3llivDU#p@vpML!R=tnq0po@(!i+4&
zbc}iiSs1N?6bV{M!rHQ&F#YjK?Ol+zOU_fQoD-`zzkX6fF9ie5oa!cBl3q{}NX`Ze
zB{?v;UE<$5@gbmzjy|2LH2}3HrMc9B^w2J<OctjRH(+op-!Z)RKmn@Db8%uH@@P5^
zvyxm!*lE&3ohdCqIRb(j`fFtW+j>t-PRR&R?hvX9<6Orj*?1;N7!|!Ph+k@mlzcC3
zp<4DE*HirgBkQjkRYp?PZYs_z$R+uHyRE0#&e=dXFOGUDzkEhBO5UXT3f>nmswa4C
zpD@oSHUR_NW3an9Uf5_T>ah4UG0ptvb_h8Iiz&+>Iumhx+jQq~iu(m<tbt=RF(1a~
zK}k=s$spp!^kD_FoXI+I5}Ix?gCZ_S8V6B~KSF{j%T%w60rT-YH0h!lxh+h+wHSd3
zhHndvmDL6Wq)ZaR?blG+C`HrDAH%!?*aL!l2&xZlI3rrl2Anp|Ua<7^!GxXdq(1eV
zd4W&ovn*Pp*Ei%Mk5uSOM#OtHt#3zXCqEyn;M>`R@XOxo>|FaSc&#`50m0LmPcaYT
zj5-V?@ErIvw*Y7OXE3oAJdb5B)~~bcfgo^`UwIYKYF2g_^3(FyJvk33{n*UhRbp{p
zNF)fdrU8(!E~?@2#c#vNU~d9wCxYx(a~Puhs=cc`bJAFZosjp%91bMyswiK~drKEq
zaM>=~px@4*r`XMP8_uAU-&TA%-l7*g`Br=>7Ry$9h!T~qbnOE-nVS5d3dD4DXkVPM
zr*nyibh39>aoWU=h@J9=<v-bozAz$)Shk<X^4fWuS~qCX22y8E$G{92!lg&?#{4S4
z8=290kX}|rol)%@rRadsfOP@WW_oK#bQpj|=od}6+e}d?33T}c-!hN&(ObFAHbE^l
zwgYSE(w9id<wj3-?GC8fS~lRq6g>LNkiqw&&Cd1*Y>drx%CF{;%KKf(k{|t)_&9Q4
z2+yQxgESa=B<u~M4h(rz#`8ilC_|Kfc{IJvFG+YO35YWgYOMm;Zh1gK8f-z1nRiFZ
z*vXc9->!Tw3|21pv#yEMJ-*=22Nb_XI819@I_7R>!RG*-eK8QMnz&64SQc-oAbY0p
z?xL(8yY8{!(X>+qjI&6jiw^K-8l!W}-|f~U`Q53Z&d^^HL2fKlPNc4izSWOj(?0mB
zzR^*8zZr)cH?d8TGB@iOdd^HBsc_eFoCDsAqFXFW5vMqbsMQ6J6ZxTU3FZbYxDV@Q
zkEF|cYPjge@WctIY2VNu$S)iueF|ca7Y}UZeB<O{TIJ=zO?slUVCeCRD(b4KPc&lO
z2V+qKKST5Oxhi69h4k75U`&zQ*8mfyVtTc01Aj)X2^U-G!cJ<`L7TR$OYxXgt}bAZ
z0@wYp9x1vE^sHMRGrZ4s01{jc31oj+yY$$AE+SgaZ*Z~sYnCx)Z5}!)m?WT*50Y`1
zGAb3YSB-&ehMkr_-@yjz&HgA4M*e2!om&|n($#5R#}!8}+SHcn9WsuqlvqG=HB63%
zyA&^k+kC1ir;3+Wzqh1A_*)qrY*gv=+=*`LEtpp(Ax@*zJ9DxOlpZB!7NSF)doBe(
zqSAi*o}sCtSNBY6qt4cgsbdr;uK+gCXtO605N;yVA#X4gBiRBmOY2SGcF#gr&iZ7o
zBbBNKP&eHQNJ24_<hP~=4u3eB3SFsfPeVGQeE>I?z2HLT19GOdtP!8hhpt3*IUpGZ
zOki;y7-c<bKSLJsD(Tazj7yBdl1o;;eAz<bR>WU)kpO{1!(T^1X!6w1VeB?ZC?$Qw
zoEGoLGWee7`wa_-UoKHR<k!j&?|}P>$iQTcz37y!!JiN)di6pJQgqIaAqdC@$z52)
z#rC&U=#u=}ji47#X47lCacye7KoI8rPs*a|0IM?9d4K{hF~*3Tk-^Kqg4fEiv>aGH
zVIST5aqClH!T3F$0~n{<knjy*-+TW7>l!P!C(q<YJryP?XKt4so0&2N_W@#<a;p>6
zXP}>`ioId#d{@^aLA$(gf>N^G+uBIe<7&w5)^$Po^J5b}$ARCnri-GpM=<N)%P9Ft
zN%i!{<Xj@6ldG-)pqEReIrG`CPRU*yWW73PK=(dZ8wn+!3T(OxQNA3&zq9itAr96R
zDbs@>3%fg!!M&wO1=A+DtkN*0ve6p@o8e6gcqW+M?O^BtknU-ShTe1e%H{#pSa1(w
zrr!t8O6Rky2Ox1kt<6CPhPQeM0G5kN<FT$kM7*h+RO|(kQ^y91hFI0}<LH#{Vc{-d
z^>YSjw%HVq{R%l2@&E*>712O)uL~%7w<B|P{wBkf)JMwfvs$zo%-|VgYQ4b*o~x@D
zjQGS1^y*w@{}_vhXboH4CkY;En&@dA7`F*9X$I6d4h{%H<dX)tcNmg-KvDFXWM69X
zWmmdV-2|h}1ixR&AsGL!k#db>9_!;mBm2)(RI<)h?|H$1t`o3zOdBHO|2zPlaMJ<w
z=H<r(t`4hB5b>f(>w}Y<xnbOn2r~M*L8-_oW2Q?RM+D)0N_*kpi@t1?bv@bpA&B@p
z{WJAxX5Pd&1r!5ai41)Kk_^1G6>DI^E*bIhfcp3B0jP3-(93f1=9B_n!16>=+}uwR
zLhMoKcl%SM&8(a5U8+3~wG{gSU;wfBJiuNEnGYDy$+STyOPREu=rS4n2aDY5=My$_
zzVBx8muve?Tvvw4ywQ+-3#!;EKASmfzoW8N?ql>jzj4zj&h&YtrZHv!59Gb@n(G;!
z&(y3FfNi7WZ@LVSsO{*Vlv^55v&t6hlCw}VMmHLoTT)o))Je?^V1A~~cTod>DLyMU
z;!oZ}8Dn{X<+p0l+II%MwD@BRN_y?~3PjuvtvvRkXu+RhiwX<{YO;P;cu6#(yM}tM
zyH4g2R?IvW!P5sreayd8bzRnc9wX%30S|;7WeJGPGasSkr+O8h8RC+7|M~cx<k7LZ
z!7nq?U=Dlr?JB^AFxd;^z3c^#`K8E`QtzWeD?kV*aal6yj~RY^!I@u-_+wG1<YSHj
zvGIq)RmrBc)XB8+`ba#_WfVv*J+w5=*6WRwyFAHb&v9P?G=JUSx3mL%Pq7?XuPhHi
ztak1OIPWi-=ukUDUEUtZ>e5C?dFBLra7O%bL0nGF<8%Y+{GY;5;zCJaIf!^!hpDIH
z;^zAm-XGo(`PAJ?E(<V9bh;{TA5Z%eH@yd9Wz+!gR~lz;UP;M{eC6xl4{M%TJ+Wex
z+uc&Uqb=OruZ{PkSb4GoHZFSvkO|x;b?>fn=eQglTFHFa3ifqCWo%rQCo=^m>m=sg
z%2?%ur56Lqm3W>_D-hZ6bs?JT+)kTNNZ)Ok*14joix$ltn}SRkRP0<tBmQP{bByoB
z?c1<<EwKtv%wKZl!Qq41T$b14VN?qJfIf!M4I8E~8vqO1M3ZzH#-nV)LhpS)R}uwi
z=!Ut57$8}f2bTQYW3ZmwiXfl<S>j6Ut5=-;J4yI}-8vmqi49c_0t1WmBm!ucOMb0h
zU$7@Rdw<TAyNCBOL|44NM#ebuJ|B4z<F>f`eaQRqr}}35m3x}%8pZA6er92_sKNko
zOW5nm=~r3neK#At!!w60s9LRl>ORBnkR3v$T-&1~Q-N<ekrr6Gsh?$6?pJ0ZN(UHR
z$R}WS?-|RvhE@WC3VCgz<PFF^`d%<BY6!iwro}+wm$X%`n-YeJ&RkZdBEB04GVMdf
zT=JryDibcOTgROxJAK%2S4eI%^VonM08h5EP(xeDZ;z?D3^O|)cFG%&n-Ub*Fe^Yg
zJ!tEFcdlC(7=vaT-+})A0+z)H$$gIKQKD@A9LL`?k@uVs_gm?)Ph~$@RmgyDpzIot
zHMBaXLzMkj%64uvIelLy%j7hfWAUFX{Z5gT<a*0p4gK{~j6$jOMYO1)?vol&y>j>c
zoU3TRdhZE9$NY|NFz;u!5#4zvKYEf@l1P3|YxzAwZ@k-7^cJ_cJCc&+3(Cg_{Oc}c
z?Hjqc(s$#=;cHV>^`{2rOAp;YDVDhh*!O0NXhXtyBNUaere3m|*WD$P)h)TOo?Z<o
zJQ;#dn;2cJ8&~_Dwy_c2$TV|_4_m*8zs8=e2fyoYO!R*7BU<mh-xWUAl`ou)Pebp5
z#)h8cza8)QzSUj!XffZdl#|?d32|l)+yiq((1oFEZsp<2X6h-$<YOVHU%N&C2^B_V
z7C~KU&@o)aX+3gTv@tZTE&4;;$#Lo6!f>17{~EFXy~BAuw@s+2>J*MZie87mi)^4o
zX*BfBc0bUOUv2G)XToL24BV%*;EKLEUMTsd%c6@@Ge_g|hzuT1wOl;RZm7?zS2Z+7
z1(zKgomTtYt#mnX{LKr_5}PepI=#OGQE2Zf_y7Yk#@}Z)s%jhZhqN%4oBUUNrlWyV
z^U_PEe38qUzE5#-KmsbRfS0)Yf4|bdPk+~ypZ9y7a}_TPJj+seV^GbJkvnhO(nfd0
z;+d=i7uV;GCkL1)*6HtyVHGNXN$8VdGv9%Gl|#!12FWDiZ}Ya<qD*?T!uk_vhjz&R
z4|&!9aSkW8Z44P9`3;(uXqMgTQ~Q4=+DYd_cLyxe?VtT4|Mhzl7l7y3dJ`+U%IyC#
z<@sMx>Hj>suDBUcK6l(PmHy9De;=Q}ZfAG>NL+uXW>`^B-v4-y|8;r)*HM-JR^gAw
zyy?CF+||Ce-58^ZWaI)d?Gck~4xlA<1Gzv*jxF<#4CuiRv>{56e98Qlo?I@42vn5t
zqlWx>2lo93tw>x9Ob7p)5&v~G;T*9CxfsA)Q4GF}WatYh;>U*B;#j0YTR&VCYRCtS
zj<hNuDY|R)4KS9IZ(Iy_D*h8A649jP{Se4$+@QTuLu+K65izF&&2wJuI4~ByF$aNU
z{E_JX5a<07HNVjG6)Eca9Wwa}+aAT7V8u3&g%5MceiM7Z>ZIHLoh3!B%1hcuAYnpN
zh*>8GsIeM3qHr%qAXg{_Y1A>!#<$oS+*%L2kN&=q>HkV?_~bva#DQpS!`9G`@rWqq
z;x=&8AB?YXIRJ9MBp|v!7GgnV0cC`9@BL#1QE@nSRT>a0^esl{!^Zg(91hC{pjO3A
zCSqY%pdOE9A@YP;=}nff;pu>r7db~3CSb|J=>vcQB*js&tThL&CL(V4Xv=Y(&pH3s
zg#S-aRz*8F9oh%SYcr?gQjv{;=e!_Ry<C=I3EKMS18u;w>*G3}I%bQ+1JfyIX~H)^
zkPCsf&{HtQdg`l5lNY<nZ7x3bv-k~R?RX7UIh210vID3ieSWngc%ctFKG>Mw1cX-a
z0r`?_X>c)bJ-zR@8_1k?ovpa6r2Y@V71-kFQ$LRy@t0em$}|5F;B-=(9!nt5j62(5
z1!XPm@wyLAI&E9Y7#&+&dRLaC3Y$Joti`DgQnZbLROWg(k`WC&56i5#j~@J|Ky=6E
zG)-fEw-JPFTn&E+WRt#^qYRJpL%~hwffGO|aArfOUp|PKkOm|g#dlR9Vb`A!%w4<z
zX<gx|8`2D0iV(lD_lZqQtj+s-Z1Lv3)&mZ$NfHVBj4Lq>9FMoHiA3B5hUU`-sK+7?
z5Cvqt=V|uxq%lvtjxolu1};b)q~KGBDd9Rk>>Dr~JA`4BU9=NOV;VOFDpfWE>GB#N
zsQ4+%n4cR1SmX~7Zg;M3>0i(`!Ze3N%a_Wfi5*UkDVh*65Err34qzXuGvZT*p`pqQ
zBmPW+J0@>FUP2$D*dYD+;|pD1kqRbtK(d;)wj6tIjXA;Q99MD6m~^ijY{svIr#br;
zIko3W3MMG3d&zn}v4lQD){)BH?kJIw7k@5eA<=kj+(yjM&Q&Gj=AGo3rN`15*lI?Q
z-sxwE?szRD%wid0Q0vh8be{biEweYe9w+yod;2<P&yq(cb#dh}pvnL3*S1b5tve4&
zE|z`Lz$*Ek0Pq5BnHj_N;}g<Lp@I|_gScSM-az_K%5j`AW>}pQ9Agi}AUJtG>{JBQ
zO!B3iBs}jztX0%jE~$cZWCLQ0->KsNBW(wCfAIcPAfgqoRtK!pt~6-e1t4$7ex9%%
zd+I79KvBb#a*he6#tIn>D7YNeA#MQ$AkgcUGeDZ_Xh0j=4Ll#xPzI7S=P{4H@P8iT
zRI;`PMBzovXK8CeJ~z1~A|U$kC$cLr(S3D1{Q7Icn){0!Jh^)g8d{m_%bziFGs67Q
zLY?V=Y6C00Fa{mmu^wRpeO~_rMWBZN0Q~d71?)^ql0bF>Q2QoVNqY1N1(*(UVpVvI
z6UbaTUqVHW1r;EO_h11}LH&?bbdRIx8xYpTm4o^#?dj92q4|jJ6OJjS{gF>IfOJ@9
z1<<EDkBp@k9T8Blfc|Uj2IOlMVl67uRR$^cNvJMHMx{AFvBcvj^4Chf5piLrNgnAK
zKqII)a@m{t<FeRTm=`<kZ=qx3EGaj`Ns<dVZrguO<^Nu|BJ_7Zk*iVP?mZ9W$2QkE
zT>=s|K=vttkL=2cBB+>uhV#%vtVeXN1drj5#{(hb@Lu%lpOQL1$m+`9GH5{XlKP?q
zh*#y93{$c+*EP$)%$BaFnKLv0uY4I{;R_Scwe~_%R0;u?%>j@n44&ZzzP(%phFERO
zY0TgM7l6<RYPHmZdrW|uHNZj5zHBR5L;i{!sZg??xG+b4fpi1}>9Q7U(yqm;F2se9
zW+10Fz8T!~vx`G5M*%pD#X@6C=B?*1zF-{smF#~%2cJl=V~S=BXCrs7Lv?Wz+zsku
zy}SXk8o(gjRtAbAaN7@o7W_jn$vyNjr(d2d>=ZDT#s;>(1CjJ3j*#Bv`vj;-b)+E6
z1_%_Uq;(o4Wd>l^$IPPNB@IYS8_j-{Lm4lO;y#@t)(%=*hJ|s*Jp^H-JLS#WfLL6?
zDd2-+Vd)%|k>oom6kwtvXd2%$a#RfKfQZaL!4kAVT3UvfuQgB)*m_UAe`I|*4h~T`
zS5fprF%abVLy1qxd+*O~djslDL>mug*|{F=`vt&-w_Mhp<C^9@W8y17glfzc-mK-%
zRyKkR7qF+oZ38gn0elW1f6cstnkhg&tcgw&Yc1z}G371R=@HP2-xbFx6|Uf7b0f_v
zS3c(~WR-sxxSwG!y7y5rAV&M}za$WxQ*Z-pfy0f%L$*i;jzM^QyT<KAg=8rLnS%1w
z>7vO95x(sHEF*pr5RW^M3O%wGQAP;RWP;5qZ~L%sg5#L+IYHBBbK>n#$@iI^kz01w
z2e<6g(TWp{v&33-MwGwTr%2rhWB`9=#tdz^wg!#Oc}_hp4R+QC0$gtJ9G-D@TQA%V
zj0#>mf<-MEoATHP{_qO?)_&=#dBFth08A5l@vZU*do4i7?jjLreG}}K&_D_yQx<n4
zG|<lfVROB7o;gzFa0#rAOdqFM7O)m=KKb`>vV!k-(|19-@UF2S!Qk(lKsS3WEZfJG
z6KJBc9w7ya7_iwiT#o)X-298p=CO-@2HML#pa^sPn+xC+4Mj7|Jvo3XHW=U$hQCt-
zUv;AwF4oAe2S&KDP#Sz@&#=#s0nU)0?h3a2yt$VRB6{M3a|w**fC202BgA6&LOsTi
zvtCQ#r^gR9q2%S+dUvwk;hawdXT6nLQb^0mR<5qAJ<Qo8e=!)`gmUsLitot7-=hN(
zyBUvSE58oLX2opy%(H^nsdS|ww(}c_Vtj!0!tho(^DJ#FhS^2UCjk#@0FGB`xNl}o
zFC%<4Ihr8y0MN2~gkDHPB~6vqQ)gyE=zx-hi2)~_8z%WJ$<r2EluIh!Ot8msPXC`&
z5=q#I1q98|rlaDqm!ApUtK+SsQcgx|mpkDC#uvDx{5#w!F5t#Z-_}>p+;Ij$aRmLB
z=wQE~K6qtwLYgK+Xt*x1o^+ERZE=u&Hu%@iTKq@96EO*1ngk>g?9w8+5C`blNlAUj
zTHuw&t#EurXQUgNaKtv>vCAseESwAE9PD6Vb(%l6T`uY_kk+oO_!3DhH%AccO!%vS
z!RME>S0^(3UT36kAQFhz*t_xRCZsm^cc4M-o}DV=taE9CYo293%?X4!Kt;qS5T=^8
zxkYO+kA1|Jg4~Xo)xa0=oX8!E&D%gaJ;g&PlzS+Cv=xdnuK20o1y&70f2t|+Q-!Fj
zXmFMSI!;3V4`p8-kaV{8Upb>wIC-aKs4ShP1<OQdG;>L9rm2QZY_}3zrqayZK@=Bq
z8fwbaiquq)8Zk2yEH@UnFtyw$T*wU+w~!D`5rdH5p*wZ&ecyNH_WQ3azsvca^PK1T
zEYDfPsv|Cpacg*+@+)9m<byQt<j4S*3j%vdV>Hsi%G^nKCO{DjHIZfHS^zZ5H#`Y4
zIN4s56#3kyD&Qu>-1Sijdx(o~%W%@(!13=+!mn=YTNPc`5GcRY`-&^rRxtA-*}FOq
ze5-L>2E_lA#YbJwH)Npec}xD?e@Bp<;hr-X7*-MR?d0)5(2qzu$}blG)V=T|NIr<9
zc$=pPBSS3zHxP~^nWT*^VFS(e2KJKYsr0&(#A)vP48rke9n|necKWWrUL_xQt=sE9
z%eHGIdYs72tpa5Xqk`}_7pT$*&Z_eyuj{EMA@ba%mjFJ3NrfyezvvLq2lgajD}-IN
z-{d8ZKlT`L;gRR9=PV&H`nO%s*7KPg`tbBa3fueD1@tmz=>ED==5g!=y8&#Y9qjbf
z-dgop%%<p;r+gE@sx}|==tCDkRcYvgZ64%%6-qBS>P-~`vh-RL{B@q>#&pt5)Vok<
z>oQM_!ZYp|JQi&f=Ii`wGj!u;?qPfbAk6%=jXeG6j{=eV>f?t(eG3D)VXLb@YH!lB
z%O8n>z;KKb5mq>?)-<q-`nV6$sLxwHd$&Ewg?7oqp`HGIxdvrn`8ie2JyB7>ip%8<
zTBXl~&uoNl<d2|Mgk&C@^9BeJO3Iy5v8=AMsib;bgk);-_@t8&tEkiF@z&&s6kTd>
z3mmGEWsENxE_}jPv%`2%JMy!>n>bJHf|SM%k(U?7x8l~3>K`1Fp13|!I@b_gkAmb^
zd<EkF^#kU#IHanVsCayQ)lm%4a)5ov4rp9)N6FLAmjHJD7h$d<EsYIDyLu8Y2OQ@c
z087T;PBeQuX8Y8DF&L?@mQtoK{z{r@wBJNc^|H*iw@I?8p2pM>8!}tlyWSlQun9H=
zZnZt(0B$YQ&aDIQn*j<xLuiGqy`&-lsX}%kIH{De;126@TQ|EtkTp0HG7EYtI`m2z
zIG<Kx9;5O8_~_+dAq8$rmlPP;9E-XyUbTJJ@*>rxQIq)PSra_1ykO-yzf!^abd#4+
zELUj{r~QC8HOQ%zvxq*|Wg6Suks`m28+h_Gk!&+I4_0U@p=EghX<9mPd;T<50U6xt
zf8NdWk&d0jEKcZiXz&eP_6Zw0NSRF*=q-ZFqB|<#4?>1BrpM(Y{CK<(_F&2%hc1cO
z>0}JI1zs7;N+HeR0F&B4ea%-DfmFO{x1*#UY`uGe!}$f_Al2#MYnP#Koxu|UNbjm0
z;AVWKc9gg_k!|Sx+zpaq?z{R&1umH_(eF0_zZ(T>f-0)+i^oh;8V+fn&Lr*2ctzW(
z&njU=)jwsV>&!(CL#NVrn^yNCLO{v2=D3B(bTbrbXWZ&3wHBCohWXx>M4kchsKyDk
z`pc!Kn%=|lewl#*=K1PjP&hTe2j~y8noCQL%-*^+#*Z{grGRoXcXGF@`v}<vUqxzN
zH9*rAFek9G+wX%)z`Kk(dVGOL8d?*z+Ws3kEkr)Bih!&N0&^puIeQK`G3O<;n(O3v
z>e7Ly{{?_~AMsxXC;U*0?ryn|MV=n~LubYAY(%)Ml=F}7<Y5Ei_E=EtUgmme>%eX>
zy*a)9($_$*b*L4=Nr~h~?1G`A3tW4WR=wuqJuNLv`}tmKI3~%`o>~^`EAfv*$?i?Z
z3OiB+$q!@_nkNKobt*-dF?y9Krjn!&a<h<v+{z&uV|<kRTW>C0rZFd>?Y`5M70hXN
zMPeYoEs%n7dp{sXjR=^g7Xe!w!4D`#0Uh69fJ}<6tVehPL#;PwnYaHyl!y9^2G7U`
zvSd&nByC8q4@?*THn8<1N|vLLGA!6ObIk`~n%n*SsGMBSE65YJk8`bKXP!CccNSDu
zIVUAEawlu^*s8)R5>7E#x^6OV-1%{b^Qw*YAk|sE+(T=pbC&P$2MA$+zxqn(_D`9<
zC3kAnd*S5#iP-eWcU|^yl*DnAABDQIbhh3tV&LjU4-kYGsA_A`z4(T{2G?f|gIAVz
z+NR(RR=dF~w1A0c`L`l~AwwtaJYetP*c~a=0K-o8C0;0>2WE9cZr%_rU^z^4=2~LP
zUA5c$%9$OSL>i|vRfW}*2)@ET-NZ$m0^j$U1;<9%31qTlo4L^R857u!c>M3W)RcCG
z<RuaS=|o~5dAjyj@S^~?7YNdbV?f&n)*d9$^=#%wO#q<P?Yj_K01eEF5|IwLWLFpe
zfW(pX#pbN(m!u11%!_c)Ss*OnkH89>cLs|?1AfJJ@0@a7w;8!AoaovU>-!ri*c|XK
zuwd;UL$rK76#m`sQ$SnxMb8UwDrSA5C+Mn(2>UiFw$lTSJ-B`6<jAiO|IKiF$;B;_
zKJa*_yrERe__i%!e6^pwL<!!_f1wW3I;R8O&Ut^8Z18bk{OheV!FJh8CpoQ2pM!ZO
zZ#e28xsV0o>ocUo+#2L6d^Ty$swda=M_zm<qTy8OKtk{!-Rndo-_wtyqO@msl|&}`
z)}m$i+6fm)XJIoPA*{G{d2Sem|M>QpVn{2{T-A<*!`8%?lB%{g6J5iA?`?^+810az
z24ZTpbT1TcLAd8CGT1XdOcVP-TgeXHL{}OQ%lCf+EStrZp7D3nU6)=SoSVaeh78`}
zVK#2RC5g0_)P5Rp)A;@<DMVh8N4(mF#7}j=$qA1Fs>4!X7NG-%c^ja6+BBP@p))as
zu7fVOfZxaIk{y6p(6=UHuaV-n)#b`OPh45LRY@7(cTNE{>aQ{{q{A;{h{y#n;IbVa
z^7NVg#lSs03UkOGJqO)bVjCAvj=$?F%G04R%tXpE*!~9Vf%v#=SH@Sc&A4^}tTJ4h
z<^@dbguZc_lbU7o-7Lui@Pv^$Bz>TDOEDdy?V(2SP4;jm>|=pt>p%<>V`$6(68Mye
zfqfOrKYYI}y6?BedttKPr+mN2Wh>HfeC|U~cvU<>aDd;1Ev1y4B?9S|62t$!6uB8K
zJyF#jQ+@@qRMW|tz9@e3$cA6$8e%FPQ`%$xC@=-s2^58LRp!h@=xVr%n|RqE=Lkv$
zA_$qOR%E>|qstQ0E2;y5_?qAkyktQ%QaqMK5-Q@-vvF<KI>WOACYJ<%HO~KSXf|#F
z05%&}QzNTLehQo~P=PK9REl0{w!OI@7oVf3G)JnqJ|5K_1UPQs>J2b&Q~ZBMmrzMU
z=lWdhnR+D)Pj}@>o!U20&C^*n*S7|})mp}}`)TeN93Qkiu%q9#zAnUFbajEfP_$iQ
zmN5Fp!-2H1?(&d}Q$2CvBM)X#buOITH4XgyC$G5rzamFwkv763!a@cjZa<bju}FSk
z2OMS&0a1W=bP+29$iJ_m;$DBGNyRyms`u!LXZiFDSaT-M&4>#E_Ov3=zdOnQX!6p1
zDQGF}bC}j;X8dy`on55X;?h^#2wmv&6Z{9{db9K|etLLY-+oj3FHJcUzwh!iGkAQ~
zvwMfJ`O(>kgZI+&N`5xY{@Og+ZrQTZ!!`W9RDJuemIWPvU(C`EKe&XuVby}Q=^LKS
z5k$Vcxf>aV97JTTm-AF~hkZnF#E4TCVrl%_BvR`Z;HBYEFs+B7jDdH?=oK%!t@R>$
zxjI|EmCa`k?3m!4gW=3Y|HcZ~LTAGyVm6KjclOf-iXA?u0O{%Sy>O4$du%QpU~_Q`
zAFcuav?3cwgpGL0q=MZ(hZ3;4r1)&+?2(=j62#j7YxtJXp0x`U9uZpH7%guzOqS)a
z=^1*(D-VYtMG*25#F~bT^qtq=hh$IffOYV+`v<uDl8F#&JKGy$D^1H)moL4VuaL(5
zf#DZtkY+-b`7H=qX3!ffZMtVr1HVaPFM|4gD|?GAix?|Yyq;vHAB1(tMaFw!ZqHD?
zA-B`Ucq!Bk@aEM$W^)y<GnheyvNejibJFFVa%nA*RPTp$*b<@9vRIr@Uq4CnWn<ld
zocer4sKY7ID!AdtB~`QFodG0%v5pRYv9@MZ^D2rOovPc$sLFSSI?#RHq4PZD^|w`T
z?t_sUS~R`2A2U=H?zQqy?ByI|$LLJ<`N2<O4*nzl?k=0BhukxxZYucyZqHce>C+rZ
zO<4L#qyR`S4Iv47AoX@M4do{!U98KExc=uxJ+TU6)jz#8UM~v&Z!_d?k|du!3s12~
zk9Iqb9gjBo^H)|ZTtHbjuxnz~-~8Txy!pTWpT7I~`FR)on=<}?hcDr)wKeD4zM8as
z^X<Q`_Gy1gc)Zl7M$+FE?)Pc0{I5s%`RR+RE5Bm=3QhQgatO4ZSr)|#<@fRteV0k`
zIoB~h<2F9!oV_p$GfGLw812x(ysvOQ*X$+cNq@)@mq~X|1*^Shf-*?i`+7U6A?HSD
zP6uH!w4G#425IAA<hl{t>To`o+KcazF~Fs$Bi3~3YS|B92z$=gul<zV%MhB-5zf!W
zF{3(?!V=vt&d*B{t7K?ZI)pskF^tR<GKt_vWy;q(<+F7D=b|s+UZS~lwpD-00aSIN
ztDrTEZ{*JmvTH)Iq8tw)Ip`4@!HhC3Y_IFPDCJuKxHT&R5O95PL%Sdwl=pJMKQYRh
zK=bEsyPchQz@G3akNHhe>4K$cjsYV}1ar=72Dvq&0&<C91(YhQOC)3J9bj>s&@ji3
zV!0g_MbEU50}Qdr$7TJmkM@84$;S+`Z_OypxRW{OEvYvrPnYr$Q0GC~mwPx?ZIUBM
z)sCDwYP!>(lhS7hVpS`7N=S(&e$vCA-?8&{F~)ay+iw0I7xA~Png)k}(*t7szr@V?
z<TutJwEWkr)tD`NdCNf(VvMMMF+dx#q+s6NK{^nsNcVfcG-*GXL7M9af*z#?=@3dM
zB_AK96<Y!nKD9?Bm23tiQZJ(TYexyinu2-GLCs!K)ge`=5!$G0-h7T`J&rHm2ihgL
z<M^g-VFLj-cI(Z=%8=7{1JS)Y=YD(|lUdcrzwrc7%!}l!TO;{5ML0eKc<a=*mN;2!
zXS}8y0;2$dwNNETI=o&0qhKkd@qK9Zluby!L1*WjLgQ{OsK7}Fy&LP44ClG)y`|%#
zije^>f&eBp0U7opM70gIVnytI(ufEB?G~A-{GKvTv%Kg%ZSJ#v;nkH0%Zmcg!|A_{
z)4T<mrMEQku*%T$kcNH1>(PBbLyQ3f5bAq=^fXe%X-8C_V9z-T3S<hSd=k8pk1%$R
z%v$|$2huehI&O^6$~VH|&ZLs&{L)DELr9DA@pj^TN(33xgMvn8_1rtHrM86n_McuS
zNi(&Vc&CK%jTZ^Rk5SD{enzjj9aTp1LIF}K9d7s=WtbaqWp`Y5NEYtZ%IA3hOMy;J
zS=P`_kE9ck_YBU*k2N7rVCjKfZmz}%SJ3$(E^xhOd><@sZi`?py6ULC<PenO)=HqG
z<Uv9=Bzw>cCc|^2eZ7&sLBre~t;6u~nV5V?N>9Zcp4Smx*r5GEJXU!^E^i`F54=U5
zC>91i5(S0hXKZz^C30*<K!w9xUWC}zp6(yEr_!a6*Sty>HxjOIsWHr3_qK1iR7d0e
zw-J86yPsG9+DS!}k*eY=N%7&yBs#kZ!J&?FYn~>e$X{Ww;%;(<X@TiJGN#KPwfc-h
z^c~WiT!B<g<|5e&Jgjo`dnmP^io^Pn&y9#xBe|{)s1;}M7ZW`RN_|+I@-wLZ7C?u%
zY=2y*Nu!7J^E}whs%lqI<eElTCuYZZkJAX*0B%?$5Eai5s_}jTY7?TX3#od6hw>tt
zS3vb^j>74T^s$xJyy}j+gu4oLWD5z2@BaoCH;RDzJx33Z75}V_x(LxbXAKnyTor~z
zgt;RgjMQXyOC9x+sgs0exq663aSwa3&BWd&E!zyeBGgd?uB3LuPL9qArIcwU2!$jm
z3tn~6(&RO5B;y#aUA>g?+xtFX2`|?FW6t}te{b3M*W0sI_h(}XH0}3Ih?uR576q)M
zbpPC~bc8pr@BUdejE8R2uWTZ&{MtCEQb+|b!IwZlnz9n!Ok+1AV(!ATai%OQD_bl-
zq;G^4gN)>-`>~7WK%+Yf$7<c#S*~&ef+_qG2pC6a9xg@SI<AO<9yJl&6FzVWS#QzZ
zhR~K3kVfM8pI^#m95l5Q%$3~^2f$1!yxA;<L>ilaQ9J!RM2}!x>f%0?_IWNTxvOn!
zNKy)i4&aS^65A*=ET+;X8Y~vUA`eDTWi@D~QRrAp%=n2B#&2mtMWHW48d8&N@~=;)
zUWQk=M3AR7iR5Wk28gP|w6Mz%z3Yq!ev=eEYfm1ag%pPR`UH$|r>1*bx1euDmxc4~
z8(4VhbC$MeF_f?-DeE<j07Qps8r(~60Lo(`Nr+<rM&$*PEjbLUlsQA=vvJSCSGe}!
zD}Z^OBk3C{{q|Gl^}5j3?}AAQcY{tPTTn35m*L~n8X%YK73G3#_v_wq@c}BcftYaj
za-uXV-N6-FbugBg=qY8a*CN0y#{2kH-UVw^6INazD66iXy#S9VzD;)-5Z647+~bU5
zT0NgW2P>?2hEUYBW<gn2&uu{H<i~63N$3^(IR$~B+2T2<fmx?NIVzLn9FtEPXLk`3
zl9F_5x;o}&qSQt8Rr#|WZOnkF+-$~O?elM;+nfMY8K1^3s`4L_?eyR<MJz{7XTsgE
z6pu9IUev8kGn)k!ic-S6{Sq;j9DfI*=LY8cHp?|>EURy#l1NpvG?H^{abk>-O1+eN
z<)5Q=z|6{TS(TsH_y08#fI-di>X)$~hECQ8W<GyY_5`7{y^+oABj(W0v<n6*d!&qI
za_2S${is~=<OqX}IV_>I_K*)7)u7CvAuW3EgtMPR{kU6Dy^KD7)}r!f`{+)3FsVkc
zo*utUu2l}go$2X9GTUlm+Rua0r&8lDw!7mvQL!EaG}s|noJ>S1dG`}ksV=^ZY+O~Y
z+oXzFq@z4+7(@G%au_)5<2bpg<?PNv_hs;AjHg{d2Ma!qiy9FRR;Aq)iD`R`6{X7l
z5CRL#JlQEIHKhAG+<81uQogkOnU|9u2CV6b-&X%qA^Ku7$TDB3TBq2bkf81^t)Vq4
zs6KMAg=!_1tXXEYtLyNIkPtKUG3wT{eRl#{ZuKG<R5{n9e%p<-zOnT!XK4+0ITOs$
znR-jsgpavFV`oE%EY2zCtY-Nu5l4U!J*8mq<7G1s_%t;I)4iyiI2fZ}pH1NuiELm_
zRW-y6H^5*<Zr@4Q1ci-pXIM#i+ygkyw<aJ**x^|@aY{_<GZ)}zO)dwbMZqo3{;VRb
zS572HR-{N=1S=%;*5$>W9On-HaiF&-)zxuK)79AY3-0{}@zq9Mf<1HbP57sz)vr=^
zI4$)f%qX1A_d>r;4y47oh%4?KIQB{eP+ekNmOi}d7Hf`@^U#CWoXi9F9gM8z5*wWK
zkg#mIcqY|5d~(vqVok+!gm!izOfv&4mzWDAcn0Xkm4@uzhSei%jTuF%&W_QRj@V0b
zm!elkIe=YdjVHTPdg~UaQ(@{oOgnj$*(uYYr_vgU^D{0(_BaXeDJRmuhnmbp>=tfK
zl{&>1Ky5eD<+bq{%qpc+!>T?;SC%~rtu!i+_63_`Pi(?!ojCOyt8YJZ*z=^If7f3}
z@P7*~HAfd7W=XYcAz;fQgjXe|s5#Rk&86d-2-ckFelx<-TP6+*F+;_YQDX7n>0C&I
zBDOBN_t9`}$7K?-ssl0FjNucG1~)ZC$v)raAaNs-;MEO?HSUfQknT?BXV5j#`e&Qw
z=aS~cCQu&R8OMi>+DVAqI+^0&4Nj;=$;KC!2!4W9Rv!>4t<9l+XDlc-7TAMva~I5E
zNv4DCF`mcR!b@@RE!l&rUbmBr1Z)wzv6LR-RcwZ93G_6TlN<wjPeIE;C9hj0Ro{Ya
z=ppHi6MEYCwK9((J?W^63pG!_28GkpctSMxzvT<M(K&aZKg*v}v?-vZF0SH{$rt!h
zXZX0S1_?D);_^+-%w9A`Z69lrhaUttRYkfI2nm6Gcoc}Lwc$`xhHmmVqfnw?iAy-F
z50^~}_I^ckIwTmsXJu7<px2AHTIJY;MiE9&!TOG|nejzw&Nu8OPnuA%X``*7{9FNx
zS>?4V%OPOQgzjaI#@#24C9uMp($bv?pnsx$8*$|kQyd?62!>5VI&8;Iu8x3mstC?$
zjfkmM?Br~uD`%9Bm0m~%>!xCaV~w8bw%AS|!aVJO2Ukzdhx$dzR?ZripQ|eQ4(ofj
z8;ev87{V+#bqL)WdUrqSz2yw-@&Iq>7KV9Qeo9>Zp<fMUO{9)NnVl<LpkTJoIzaVR
zYh<*`c`U1Se&c17H{4xt6YW$!QdKK~#}3Dgi6a$;S2Y~abfy5+>l~3l8e93j)IxDJ
zvyBn1I3bw=slSH=>{I<BtvPwfva0&j-ro3E=LD+lV7u^U0~`E|eM?>36c&lUA0DRh
zv51?Cg2}$;)%mj~gJFFnkb`-#Lo*6Y5$>WPS_);1<{tqq$E5<6dtJb|3?59acut&O
zL`;<OoSW(zT!5e6g;j%4L`$8<Mm0lNI(DQ7Up=B0oJMAbgIuo~B>VB#SVdK%qanM7
zX_mG(f-?uiVR05M*~4Nl*GMe?K^^i0iRo4bbN!J|G)J$PC!CVfgo8H+(HgVY<rjq1
z*{lCllwI98U!R<O7ndct^565<^E*g8!dRrmltEhk6H3fpYa}-8yys{TGP9VAh@sIX
zr#vSo*0D%_@UY1mPmq@vbQ)oOzY(XxQ6jmc#L_a7Ur(Y;XE%{CEsLPMHcq>B@zAKB
zeSjd4T=L5ul5;rlmUXou_OyUrMNr55?6(ozaG0jAg|2+<Sy>b~z&$Tmhu$9B!9r%n
z-A|Mn1O`+;K~Px@b^a1z1A>ZACv~R6gIzX-@$(OXSuJowI;ha_cdH8{eDCsss31oV
zwVJs-{wgGU8{2B#If?i!5<$TL%N$)h5)IE*dW?cX&%jOsgR+<eLVD}mZ;3Gi087s<
zhWc#;Z(W{*<BLrCY0nB!+v$wljNCv4=X5s2MX7d>%mO3ld|(Z;&Vu76Usq#ONkWks
zU35V19=rRWHS2DY4ytP7v;=+v`@RCZfRY***X7M=LhN}E?(0{lfE!x%b=1wLD4J8Y
z#XSe&V3hRj*wz+314lVd(djlGqyMWg^fpBAxEzf9SRUKz!f+50KNJoMx>Sg_q^ba7
zeKz|Q*Mm*fWsp5X@7xmi9EQhY)eMf+?T{M~?*XCpwX*DhM<&0YFpYoRhg8kvfktNh
z4$s;+p7d2zcb!7vQQNRJ%d*lkf^Xo*VFt~CsB#cY@2-b~@zC-R>A|CR5?MMcDs9&`
z&_N_4Q7X+X7{~CJgFu^9Oeq;aK@B!DvPZ^pXi5_q_iA1r62E6eJTsE(4hqac3faaN
zEWRn{NKjuKjd|xzm2uB^T|jO3EcWhw)5y2KOmfC0lOkPsb(LNfPl?p@Z0hLAbkN+)
z93{P48ds?;6+st=^Sw@zOCm3mzNUx!o)6~D>&I{G=6Q|Ls@c3cX`t7Ap36(Q`_$B%
z6*`>5=M)5kQTFbhX&UPIR}E+Akzkq*_tH2Xgqr@-_-gh2XR{TiE9L7L3&5|vay`#e
zSN--<hr}4V0~5xjQFZ;1bwwKzyw?L6lbBo6`UDYTMvk17W%MH4Zz3JcuzVq_?R?+Z
z%I#7?<!+<{^T3LES<Lw{?5_xac!RAr?{cDEf>Cqd4DTw(A<TD&UyhX(NUla+?B=)T
zx6O!_*jqFL7I%a@Pxb^cXr4E}+6m2^ji|;A&^Bd|A~%!E-rByD^K89W7`)Hf4IyA<
zFHS7KocK}SyPGjj`}mjXDrH4DK&8<@OPWGgW_cunWYhz^&kMD1s_`q`h)le7ekAht
zTJs<Ac0UBu3F-Y5_s75O>bjs(v;jOxFoO6=``v%$3Y7~C?PR$=A6u!vL?)ctS+%yW
zH6m1}di@_a`oI2a^_dQ>{!FwC{vM?VIM&6yn#3-G#Nh2jMI?RIr(s=DTfr+-WiIad
zki{j;eQ~pIj822{W3I@T^XtzBJ=!ScCA|I=iHB~^80kJvit=*7pC2b-?Uo`s{`<nc
z0T;Zsc->luJ>s70dT{~>0YB+2&)0M=vg*eK*1a_S@Iq&n978+ZbU9X^ir%1y%+ddd
z^Zjj{w_2jxY?lakdFlN*z7@6$PM5RI1F&y?R<v#L`Y}B%)%D8kt@jnrhkO0CrNxH{
zpm7Yd`U1-3!}a_YNE@~Ag2Ua{9_;lZFR{F^5}EV)CqCWxx~KcVvFJW`h5mwqf=g2%
z<%fAu8T+!mw*)nNKrMrOW29d_`1>Z4A0PUnjzYLJqptlB0BZIxGh<JU?>j-+e$e6z
zr7T$bCU=;0oKyJV?;HO8>xwD5EAnFMQvWQUYyR=ov2;s%_Rq90CHKX3(r)j=9*V<k
zn#Ql}QzkKZ)7sqtpspP$6#w`EPrdiWgU;HH3jrdJwj&ni>C58e4JMyBS@uIg3c8<g
ziRt`L^!pzx92V@t2`PwzwmpwN*D&wH!3Dq8yu6P7FyH_FODC*_gED9BoPJpJpHE<g
z`K)s|u>iItVCT%iM`!nn1f@S%er?&jP@O*DERP|~d>pI4zr)4*x{(}nti}EroNKNv
zjmpQH#T50bn#S&D?@d3a@m>9COu`q2D|M?bqx@IMpEDz2t@(b;XkSLj#+Z^5*FrQv
zj(eO{@=`Turwj6*@Q5omg8;Q=v-~r`hez0X)F<fZ=@cmc;_KW~9XAWBo9Q=JBFnv!
z)_uzLcKDbr1nb4D(ijE#bG;ZM5<2iqRY3O~KeAQnF2V$<W!VE5#EW=G&riDKJkXj8
ze4RUUi^Jmn{5lie3zzjp<2Du|Ok2J_EVDeRd{f!Akl21bxuwBb#P)Ww9=rICIP})T
z-d%eAlb6v&-KVNo*BtqCO4s~j*|Dsn9fJdG+ab&41taBNK6&;2OP3I}PG-@rOw)q6
zMvXkMkKxsJ#OVm!Vr<W+ZZ5TZbZcR>QoQ%WS}=gTUkk_8Fv_*+Qo9F^=PVtjTKRfN
zr7LVyrj*}L%XRGz@%u2Q|7FenZ-&#$I^hpTc`I#eS5C*yfvUcsAMsIPr&g7ZlgB-g
z$wr=OovH5M^_T1-F8`zzXti1w$5snB(7*a~o<-iP=`Oi%V#H&vCFCF5TCK=;-0GZl
z?dQS!o1V41*`~P7m@65!$6u7phCkYZ&a^2UTKkFnpP}wnhQjsmkGFbIGiJq*TsW;B
zCGTH-V|9GcOE1g_CF}LJjXma+!w5>(F<ET9bkg`r$1_*!+8v#k(~o{34W3Y*`tlQR
zS)#jTiO`6y|L5Czx&CYTeyz|D_x)^Sp8Rupk&|ryo^O-fX55t2QliYM0E}X;D>uIP
z6X*57w_simyg`OhK17{Y3j*7Ab#b>h@94H(8Zfmqa^qmZLZS_wd~jb&W_f+XNs{Ku
zvq|5kgC5KE$z=~t>!|wwD{J^4KeNK<k_e2*nErwCjx4Gf8ygtQ3ji|?4B|eH?fV(`
zND~iGBYW1_YRddPS<uT*qg{3$@@alvv4#H8F!C~C!Q0(I4`^HKayow9zCGm5`gb8s
zK|h8cdsFu4G`8u=!+;f`s>bdIr%xVxk(<|+ch{!u@R*Qq^5Abc@C3n${ypJ*cTf&g
zU7z-38CMi|-msHY?}a&`>?GGA<dTL>fPvZq9k+&6uEX(zS%;|Qf-!|6+BYZmlJpSF
zB4|DOnq$Cdds6snulVZEg8^G|O00@ICQm+T4RsFD(X7k~dy;*?kHa><1W%bmT|$iB
zv#h(Dn}&$rSbd|5mH*g{{4glB$9;lMoK6|#U&LT%_~cFxe#TYC#$|kF_+E#t@9NyV
z%fEY;-6b{8FJ6(J)U&hu7Tu!L*)*kl#l<rE;vRvA`4#VtH<Tjt?IVA8{>{!k{o3V7
zHl9wpJ3gi#QX|<G0YlsMA*9I>Sy9cY(#pQrhY%ynV44Pp<@YutLR?YuaSMo}N>PfM
zXKuT6_C6{~Yl;hNljrLA*kyv^-)agLqJS~Xg*@OD#Ix<mbPmDzt&>FpBJmD}A9F}B
zvby$+;(yS%SG~SSzf!aKAFq1n^By1iIO%qt@WCoV6+Yc_OJFxIeciU$uxAR1-6&H^
z`)-&S1)O^n;t+jQ-knCIzsYac{+OW?VEm8C`SGt;y%L&Sp?;5mFm^{<40?r66zCpW
z(TwmHf-*`_K*kI$f%fTGp-raPgHE??#vk+dFPr^gga#U9T@)`5cU$*^52&|WP~vV;
z-bu2J2B=ZC*6&JyT88siO3BkQ0F?pMpyny`<<L6cq3UB^A7fBa#*V53#vn<YJEPcd
zQg(Ad>vvHb$aedAv1_tf%uzo#WAEXCm#3=3!(Ub%O?*W3e90&{k#^CA7x3$1M)FH;
z<k5fCUjOU;YMw5Cc@1DUk~{FK)^+kozDxEn4dKsXq77_su#Jh@iUA#{!X>0L1qkm<
z2~D}RduTShuGkuwE8*{ySLDo2%`xku1=78+4!tA>0DME59lO4K9T8^x+1Z5rK{BRx
zl?Ownjvri(RGpA;4J&xeiI(gCqOq%Zy41?*-RHLkFtgQ{Di-{i9K76}P#Tcxlzqvi
zCcPjreWW~_W=**!3$L)D@4Iq=8}Atdu_mXH3TF0TCHdk1ZOaC_n0HoTe252Y*1?j~
z3HXxSL}`yjZaDw#HV#v8;SgmSGXP+yfJbD^zF}JV2D-3OvjejI7Srma3}9y*R&v5!
zJz4Wx{iFZJoQorR2@gm|zlPIwg2vtSnp@Y^{HW6(rcq4+<N@5Y(PT@lvm{NKrP~m)
zF9k*CIgl*mQ)dHf4f7;((JyljKFrmyr!i~{uYP7<G}E1{Rb~I>T&x*cFjs#+eCkN8
z_(1mo%q+Vt`N2AZj2;y<OpEYiTYWbInx<tPhc}}<>?NEwO3Cdo(#)*^8qQ>xcHEq7
zV-cnyI4L|hgo0WGgo@+;!Qf1EOalIY?}+%Vc6yPHqpPCI0S7J$SRY47CXG8jBML|k
zl4}e{3CY7wGW$dkH&-<SSpQA(&`E4(LtN}15bv_1>>0h1`#@G5a@e#_sR?;X6jX!r
z13<Tb(q+<2z%t&Hf=gsPsp2CH4kK0KMSvd3>P0Ap09g$}XGYQ>-fO*?Gybd}knGA@
zBDIN+u=IMxRY#SQ;)l&(=+Ht^BoC#Yif3T>md2%`kic>vshV+d*QVJ?E(3f?Qwva$
z$_7VC%Hq&4_5R~3e|rfv&ryN}uv2%v+kJzLw%&4U7DUb~ga<pxoM9T??y$Hi{*hEt
zeCT~ras>}T4F(FYR@AJ0bPb}~wh1w4OiZX0g$`)Eq}=l&&<mW|hPX6>#M=U(${&e8
zQ<YeqOr^+Z=-`Nfh_Urh{a@hW{QS#@dOS7uF+KV43K6hx_5n<s2PP0hO_`3Ba0$-a
zMTk6aDR*CeK4dPVZE<y_<ODQ(hPivpxf|ia5fVj~t4gaMY`C0gZRKJH)5MZV!u<}C
z2@-kQaVa`h)J=_^vJk<^(}WN#e;pOWN0xJD)h8kgAq`&7&rFfd4IFSPno%+^VV8Kl
zuT>^YZ^AfvTVLIv<=pK=mz1!k5!VYEb#F4BU`sf%eO2&+lNWnFkEr}OWg{W8mK$}!
z=Ge?N_oKnlJ6`=ZT_pa!-SJ#c(AAlHGka@Ii?%vv{A2FhQ#qZ>3JRj=Ns{hgWK(>V
zIKAu5Ppe4sRQD@ce|Z4X?#de1uF4Ll+>-U$-kZFurtzyeU+*!U9k=@Cw04?T%o}F(
zDHuHTtaL&FM>V|Jt)e(lreD?36{>7bH$vO_bEXS8>!SN-fbmGf6drZnp}7#fc6el=
zx+AbtI7CweXte6UiZ7ypBG?I>lCsFdbosxD59(y>!t2?UEyRS=Ni_MIQLe{NA|P9+
zei9Upa)ELVLH$<5)er@of<yx<HWe_FAQa9yu^`$tzU)M6wAb91Q2pQI8;J7*h=~oQ
zo|&7W`eb^jZ03rT0EY?%K%m4<N$ip48r)?vy({BA<s5iu;U0k=Yi{Bktk1rSs2R&Y
zCf)RP%;HMzDQLVGXet-+b{8xz7f^k_Nbr%uq45Z)og}ZRZ=-Qp#N_iXurE@EyQ$~u
zSx6Nr5Pi#cCJF@F;vUnt-NDh7y&GK&pwA>QfurQr?m~f`cQ9!XA1Qeq3NoegKqn+(
z9Xf(S%Jv-p)nkl1cY2+ZZ69$;G{D_%HApFmDhqUyyMV5zB8SJsgqeJZ-u@8mK(#3{
zqOhT3U6}_sT+0+Vz~=0r`ep7|zIU(WM2hz9GAKRb5k$`h__G+Mp)#uSp*6}bB*$uA
z<U`Wj=7>U`^_f7tFvZakB`vP7kMND_xOe&ze%`CRs@JschtUln<npsnv)C6utNH-r
z{_Z$kGu5sa<AQzIGxU12Q6U`Z8=^UmWBK%Qj<6*NI!35DKnU)1obwvv=GZc-g(0OV
zDdUpM&!|cdsm4d^T%IGOc6!uj4wyZ+Uu~Hg`0?n$vOPXQkM5mVfh>B6LRZ*SEA~+&
zznx2;Fi?l>y0N;Q9bF)}>NaDc6!A@O#<aE{aZ0u58Y>R}8w{rhHt{MNZfMinL7X20
zV_HqUQox&I@#zrj*1mpPIuhf1w_VAx@-d4j6zu0&sd#Ou-nl!#I64l_TsZ@S`h8OY
zggy@&>oihEX0_i#9fk|r3)TO&1s8T(5)b-Mb&cbpej^n^Vg|$#%l92+tG4*==3PLq
zIDZV#MT}hhZOP{mDCv$ik*Ao(i+fHi-umTFnYUP0`^zwsU1JI1rd74XmD`Q5e5xBb
zOJxYD+9ei;T_~N-aHLEpxuUglSoVnc2v!I(lUu`J!vhQqdi9y^$r4g)MGtY3=<XnC
zNOzDlUO?S?F})qCf3Gl{Z(7xa0Owga$5c=cI{>h`ucl89@X_rpuEmF-@8Ds+{ilXA
zm8;k(qC-rp+R^P$MnwFucyJ{5Z6aW$CZq&#zDx16;qM{Bob0!6Sk|as{6WAoj^wHY
z?wYAc{$W7oC6MlV4~v`sNcS2QpXXJAKC0f?gS6;Trqwky0@mXEm^gnRk2U*R8}$cd
z@a-X}pxpCBWI2TDNkVOx?-X#XuBkqQ`puGu#K)`3o+Gxchto}8<<j}Yj6E^>cm4f2
z6IF${$*yE@E}6jNKV&#0sDmUEMwzbp0q(F(8$Xfa4>W=q1w0cu;ReSl;*AGjJKBSE
z5+NvEi-5%qd;|4+>0C$5AX$b_GVeUHd0EYq-BbGCiwa+(x9%VD&muln3cJ&5#@-CP
ze*AsK$UYX=8)YNfc=OV-wu<lNhyD2#SgEO;Jnn&9e5ueP`{#F-#H<i>$D{B7xZ87^
zHK|)ehQIskg!IC?BI#Bn`>lXd;*m3pYG@)d9w5P)O}qJn5c1)7m890dWhv_FDReB#
zXhPL<9R7f3tPl#89?M2KNPfq{V{@m01oE>uVck@RE1_qA>v0-b3%<Sl3~u-obZDFJ
zn_r+zbAE1SSY$z-u8N`HtCy%if3(@XLux?JoX+>ZTiuoCke~hCL!rC^^g_vip)1QY
z8W9TcNQDuo73UoVV3H@?^hOWU7_Hba_1bdwgo+;OyCYQ0y&6YMR0m1<Yl%|ri%}kq
z-<XKrK5jW4!Dpnw=>(gI0W7*-tct*42O5TUKn3Q3w6^bIYpyY8J6<OShYpNJz+iFP
zBPM5!H?k+9j^;3ZW71W2qFcQIFNI@_P^!HsKilru=}V-!k{25-8KZ({XdbgD{$P%9
z%~5wi_dIVejbE-N48dQ=hpk%^#UcsuY^jr5578y_6?ccsv2P<MZa-2~%_C;)_GdCR
z46U$qj@%iu`T)~v0;rnEyM50C8%OhsxO|eI&Ha?K1nnL*XCi5w*mJ7-Fj8A^;8q6k
z_0A2hg2tmUc9L?it?Y&OZEV=a4QhGyp%;J!9O$=RRpbSBGoRJc3(&U?*kT8!Cg+tu
zTkMob@?eF=0p?`}@8)zH*S8WL3-73wZ{gUlm!&g{aBmy0mMT)7d4PuQOJ%E}LPjEL
zMU6`vk;iEy@-$Bn>F80ex=4sn1ERV7DATIQ20IXq5B5)kzq@b+{1?z4wi5fQ*Jwr_
z0qEeB4u7DY?f3#nNOzcpFa7{qn!K-*xz+odO<PP(^v>OYt-rJ(#Jnd&)NOIHG{nI^
zkZ}0cfDQUaiKzAKzee=*clMv8h8F6%Wl^PH*h`ooB&yo#bP^hPb!T11zUojK#52op
z_JNWiciarC93%lYypSWRYf$ia<^o<f&l`M{w&prTi|eJ+aKoD?2DyfL?rf{JS1t5t
z`E1nSL%5+&PaEiw{H#t!z$b%E+Z8HMvf2mVboCPFpH2tUbgiQQ2wii0EaR+JFbHp6
zroNhJy|m0F+^77V+xQ8C{`l8F!M1Mvx%mXO*q@wXoBf)G_iJ-uadmpoyV;vcOy+rn
zR7)K^T*UFWKNI3A68$+^Gg#$*pg=Y0pG`!-7u#%qFZqcd7Z2hCgrC#n*Hfl<10g}_
z^uy*-k@RU}U52HWr$GABR|%9d%H_bm5VYikU>hT&s25R9<%|>-rf4UkpaRBmIoD<0
zJ)N*1e@;<)$^%x)Aw-rX>P<?gI;_7Y%p8REVT&Ppv&J1Ur9y(z3@RW@uhS{S!rDe$
zov4YXCx*m#@%25_T=)a|%9rn+cK~tGcq<)yJD_(kH5XssyOSQ?;_0n!&?SPmt6dmQ
zYdx&KSa4E*fJ|QU0h?WMV?ki4ndhxdh7M>_VHI7m1lPFnFsFCr+y+z{%^=q|M#$!s
zJy@Al^?hxCooyF?68)^L?x45(@;OT*ZTfFoh0}Y1(ueE)C*Z4pp#N37Lc4IW3!j*<
zdz-!FP7m6yDtl4s=;06~<*<m+h`MzL9>zZjl9F7L(6E|z4T#_!_5OAWAzKOLFfrAe
zHU1<t%0>3``$6ux+;fVNo#2}FpzGY^<!B`Sj~9MftBjJ}QBqYQ`jFA@mFpw;8<J?J
zfkOlc+GD1GDps`t>bI=&S>4JrgsJaf8mT7*6Pi<4nOUXO<~1V-;2=ab6*Qc*-r*(I
z$w7COvx_1HyT<?<PN;ZER4A6+Xfd#L^1|MLn9GA{yNHmKtaxW?3wqX%-AI(_VG8_D
z^g817F#NSqnM?OV{n{&9h}!W^>;1EI5HYhdATHG0M9coZaz}}qk~8u4g;@pK!$I<H
zx9>(?wsJJD0A&YM2!EKYSixfs=b}egR(Hlc`_hz?-FW@}oZ<FaZ;|%Ffm_E}*GukU
zCa1<;973&lt~ml-{4(#rv8drr3R87Jt-c^|2vCA4$Pk7{jg0M?tTK7oBVfoicO4{c
zeQ*=e5j)AVc7(SY2eWW_QKy+Z2dcSmx|nqeS=yK~Oo4(b@B7mWN8W3#8$Hp|P8g^W
z5ByWJCTKMufB2R^4f=ZCDSI)}<CVC_5(<B}eKG%iT*=oyZVdE&8nHOL=GJ4!t^OY~
zvx{l(5x&O{G+UD_jV$hN@QvYTU{&$wgB_PIwbuTScWCJi^o<Dvc{q0R;4I{}s_Z*&
zj|UK=&b+2t(?4>*>sT*ChYbC-mOkN!B~d#iD#Du)O~DlU)<R3HuU{jdSZ&}=nN9~#
z(&S!fRl>PUl1=%6V`tcX>+_Oh&{D$~g;24Bt4QJC3#aRQi4D8J#&Rt!lQH#c2-9kw
zO;{Z7L3wF<DKJJD9f9|5Tcqcfg7Nhk+XQd5Qt>atV-;@~21C5NDodfyOSFANlAq})
zx88A;qzD)e6vqKra=mfunu=$&br}$Iyikijr0Ez?3>H>`9^ipaW4I8k?~Z|R0Xxo9
zu-c&qF|{g4di>&CSPTswyH%}+WanMCA%I#3X-sTLM8`T=gGRRf(?7$|5f4f6v852b
z^DZ)O%7HO1L(4+qH2{jVju?nl++<F+XIZS7xCbc^cAqH@^2AS&dJ#}iw$|H45$LO*
z-&0&`8ZMc725+v3Wd!yWGPaWT3ItwZ>fag>OY1ok(GJgu^Cxm4>eET$`EIVHaU4Iu
z9M!hl<-sb+>t3SfF3aw`rZ@sUE5F4ewVt7smDlA%va+1z1x;vqTd7kO<4gCAaKo3t
z%rA^tPshe_wyMd)+|IRcAW=mGY7Y=`IEHi@h3M!bxiY2TPZ|)Az^9BOv~`_h?_RvV
zlQD4qY?Fc8bKhrQ?c$YTn$+ib-IO*B{~4=fqxZcDqbr`l=o3fbU-mSdIp}ahI~>>q
z@&8NwxhC-2qPyW5!kG7Z__+2izYa*T(|EmOG~>Ihq8ah-ZrS%xza`B8q&orYu*M@T
zQMaDIks?)*IaU)}x?<~Vge&LAw0gS88@U}R`ttZzG_rg6E2uuo_KIF{)itJ_b-+~W
zxu?M6UWmkx{0Oa2jO!+@{K9yvX~*2;lV*hX+4X%f7a`Wx!hY_&szX4;-F;6Jt>O$n
zYzF6rVtjv0>P6_(<kI9YU!UF$bb0Yi6&2_ZrBkuv7=dz9HuGu=8W5%;uLN)H?L<9f
zQz^W4wVLYAncdQG8XljjLJI8ir6_PlumpZpnKdX5ldFG+P^|?(P147|#U&P+o@)=@
zW^r!4rZoyyX;n)cjRsGc0A*V}S1g0AacDxwpOS5y!wz<vIlE1b9_fAb_MqI5<7Pb(
zT;&maSnh3PQ&_$ph^~UPaZWqs9*3aLc_a4uCN*A0+bQ@;QnCeFI)!XTOI0b1aNn9Y
zK+acL9Vk4=ns`fMhPDqT;f)BJA_ja-^=?Y_%4U?cDyAy_q<Cx(I876vIz3X0N^0VT
z>Gv?jn4nhkbjb?qM_!8=A2^SHoLPP>Ys-Bu)u8m@2@<{aM7jK?+r3EEp;hUyXFmul
zh;b)Ux>Z|GlH%GlwJgDJE;Y_&OvNmA_>knr`gfmS72cH;W*z@4J;7Dy1X|83v)XyL
zD<V=T^wk$Ft6%MNXuA{Bmly%npBtjfrLh&?DZG2-zn8+D<2PXk%rgMqqTzsd{YJb6
zI%iVRQzxr*b+VqeAP?dNXKOdXorN&mfLsCNSk(kwCe1m5!}a19TdA&)G3sA{M%)ZG
zY>s;Q9OHN<6QXCa6lg$`FjGLbGq($=YK(&V;kN+DIg4FH;05Jzi5ZCM0q%}+fDIG@
z1mk?$ak?-@Dpxvn*3HlJBwVyGJwW(;D#In5pNInqYw%TwTrPndnw={djyM+}`@s|w
z)&%hm=QEtd;;UD}QY<Vx^uCZLL>nBYV?~U<Q6LTPbSevKVYU-b`fu$nBN^u82-8^;
z($jR34&oL-p!zq2BjO!H<>YZOgdAT6(W3z`2Sq}YCq?*HBa*=R^l+1H`&~s((X)ze
z0<gg9{3#f50(^T<T+)f`nHfH@qhae<U)5S{rBM%ji?fK>;=lDQ$GdW1vausYtnrCE
z{Z^+Y{Uel51*@vrCHewH?g8wXIT?rK7SGV3PVIFo3zV7Q48?6@eU-8wHPW|vsiZNU
zR<i`!>dl(9FY&7+eVqn3{5^=~9FjQ7bjBT$pC7r47961m41a9`Z$!U*ZgZ_9TA;!%
z?vzIm;){*nDfjHk8A)##^o2jSZ)Q-NJ}MGa%Z{O)Z#wNS69vUapZU&WVNY>#Mb4RO
z(M<5ltk)hFtCeSCr1!y>DD&+4NE!SOtR2iM^c&nz!wBE=O4iZdhri7B$r+HQChqy<
zJ9zWf$T)AblNDA71Ip~*gQ2FDW}q3h_!LO)`WHd<GmkQ2H(<ar<nxw7^%ptG#ARvo
zQBK~PSRnTt#|`Si7yo|6)7HBic(DoqolpzZ$&raYu_8Cq4!tZ}^yqLk$O_Nrje`>g
z=c+j~ddB|FGI7@!BeSX`^zp+{hd~oYRmtseeo8%yDOdZkW}^brkHU}+Q}dw)Uq|*z
zX*#J{+2UA7AVj@CHvfqLVBR6+C5PdL6B-;$qXNe-YxWB8?Ox$wcU$oN-7`Y@5mx7@
zaem<9X7ntost24!%t~R;MuEQQAoWjnZ0bm%5pKN}d(^of8|Ox~w%2L&&q@+D!Lx(-
z0Cvb6(OrK!BnGJDH5x4Elm`G$T2a#IOaZExUsqbnYj3Yp=v7W%Z~i>J3AJ4v0jhSs
z6ovA(Omd^xV2wN&U+T5FV+Z2K-=Z#imQ_r@Jw(AYX21=<i33GJV?Jpt<7$@6o)LJY
z@9n^`ve;lop^M81McOU*2<dXdDRQE;J^;grbXC-hMfSu9jNZIYX(Vw@Tdtacy(;-$
zISKk9f@Cc`WjvJ>FstVZVAIeBKTq2jtF`tI?7n@y?Pa;M0r@VE=I&<XaUimQv+8xh
zz`GA<>;Hn9a?t$=E0HR#pF{rKeP#FQ%b<B#9sHb&01?7`jL6$a7kXy)nt=hF3%CpM
zenzhPUhiyWRTaZv?_4JoO<ws>ILF_lb1C@q!Y9L=&Ifk>0<8+$5|(w&J?%Fd1Z0<^
zhtR$8jg!DnAV26dF@C6l6LnaVI^#_SIWAsOv5sOhX#*@r^s1K#(-RV!@hXJTMkIbC
zZlIv~5KMywtxYC(K}C5g^Cq<;>2_wYqK4%Tw$9I$Y-ZaoDzH-@8>WqZFVL!0`>Kz#
ztZHAc2Un|l2~KNl)P0-|y*t4CgeYjmDmtxrPq#o#Q)Y9sSK<hApjZ}D#57C);0Z4c
zgE}mJD`j{w1kH9}rikOb^$=}&l#+-sahP7I&D&@w@2Ju3cjZrjY?8mqJ^(okyjLS*
zg_H3|-n%H^C{V%a^F}n)fQG&Mq^{7E@PxQ>GpGxVI{Bk1I6iy|$gqv&k&{zq!O^EB
z+|zJ+oC;X%ULOWqK1>jU`5+*s3s=QckH6y1d&#X+_=Cq91V%eP71hs`n>&pbD(_L1
zBmjjD7oaOkH5%#npaFSpou=xeidZnHtGA1H55$3v`9Mbe<v&Yqi4^-NE=s3IiK_HQ
zS^Np~oFmqpaJsJ9ZlqmsEbvOl5ov<sFy|EKqRWaT^BdyZ53uSTjIK>Rg_$dm>_ZP-
zOQnBw7UR;<(H^tVcuQDQlfUP-;*Wdee^y+8_D}fXt6QAsh)X;Szqt;H7w7>FBRkRC
zm(aICXJDATF_TUb#*`#V39(6#QSDY(9KftAHG84{v{+DpdOt=}2P94C*+t6Uj=)+h
z2(GadK;L+d*avi1fj%7jq58YwAk%IG9zd<92WyT{7qLBjDmTRg>F_2PS{^qX0Mqyb
z&~aNHQY8SV?mo8xg8tEPSe$Aelpb;c5)2aM?m<9bC8Af~^EC3QoHT*00X|;g9K6H(
zfVPi>mL5nK4sg|Ypk-#)DzsH5&rwFV)ZFVJMOFe}cE$MHv#+P*p}zVZlgP};m9WYf
zIz+FcHDXc~KV=K`v$H^<Tu!9TDNccd+A1O5#|gQ{6&~8E_EHyZSVmyqT(Z>#h2tBm
zsW9HYyDc$fn-^OdYnTmrJdnir@%B|JGF+-2BN{+!%w`sF5?^D%o1fsxB(LgTP}t#;
z3Q`v9bs5(+#926%J|}vM3_wx$HXce4PJIayG;j<nQdQipz|JJ}c=6`F%ZNd34^e{7
zeC-@^2^|xrMqCcmkgKE{VQVySSgsT`yFFu^=HgI8T)8*K@2ZJQJ%WJC30!j~X73oc
znJ?hXz7uWd)lJQf#lkA5=D{>W-$IpgIlqe<8Yago9VE3l=ET(Zz#BF_NSQ7-U|RV^
zrY1HhO8*2b^B96>GXooRG7m+3Sm~cJsc89vf~HHw22C>tS1pY;=}NQC>$qhyrSU6^
zqzq)ijK55$3@Z^e{~hM@Q$Lr3TJf?3Xm2S{`~2*c!Gq7>i+>|?nu+sMa{~zrwWH2Z
z{WJhY$14oOm90QS-U@mnk1VAutJ>qd%x`gQPGUnLk78pP5f)c@n2@Xfr}h{%yIH*f
zd4kzUOmKD5%Nbj#(pl2x#+*H<7DC)n5Y?CfIBIcKcsOdo8PBUo96O)0Gjhk$-n*yZ
zhTEO$5&m1i0p`H1`1%=rxJbA>ge(KC64`d=6;3Q^U*YLH41tHyXF>L%=a1UBXb%uc
z<FPrw&zDK^Qi5JazuW-Z#z|?@%acGv9f1o!sF~XKQsC{P-O;x#rhT-KKp$&}>)U2@
z`*omJ);Bl9MJPP@g3$v1^A8L*cS@ydm7w9F3r5jxcfL4Uzlqr};!06+nDlC|hNr~&
zS4Clb=R8(aHoksH?H)NGnA(a|y}FE432~@f$7iK*6D0);cMdMcO#BEX1WV_qrdWnf
zj#frG#Fdfg+W{AFG^v~7)dG);i!XIL(01_wa0IJQ!-F3X&}CT%%IC;c;?=MYs;8p4
zOL4WW0NhHbGPQ~==mF_Is#jPsoeyA-3KB#w6Uv-Oy5%$)_u3Gu|G-(st(i6vxUU%!
zcV%kCX&z~<2w<0tK&>|OmV0R9Pf$l+ZUFfBB7thEQ@{(7ZF^qL>kxQS2}L5R>Mhe$
zxL{46Q~7}Ad*;U;4&N*~x21D^!J&WvRjzny&46Y2{q&9_Ph<+dak0oEUfw-%Wc3k%
zE;0m*=QNGs@o!9tB?qSxqpdxuA?0s8-rz2|n2l#p562f_r|x-94f5;KHOcLD4SzZ>
z4Cgz`_tdMO7U+VYY6<A-Z@IQiC$Fy6^~xA{TNT!k<9d7U%4oT8Kpd#O1lW$pfLVS6
zOx1b7|M|=JQ2pQhka3Gc7sZ^LJW9w71#UM>!1)FOu8`6kP!r+~1rQU)ccdMV+CPr3
z>}MSsuo((5{vPVJ2c+hGTtrAd@R_pOey>zi&1EYmyP7vO-&HqWWs`=v=WBWc+C^s>
z<=cVae54c}yZl{TaQjrr=lA+RVZc9>nrLkgW06#=R@}2Kw_gKqzUI(H^sdH%+ClZ9
zPsz4#zlJyO?>#f&4C_ISFZR^9vaGDniMji_FOwEGQ%d%Xm9&(1T!v^TZXdX{ko{ik
zK2W_hFF4EH)M(#Oi5c#o#vc2Vm^PW(c3z~fR<a()6ZBXAwsia-3Ew#m9gkY_0nzm_
zSX6P(bXdf|>u0ux#EHM_8t-^AK4I|tyPBaE-|97S##ygDRhG`(8qQ8XyS^9lFL;;!
zS6=gHaPdh%<U*85w%<ukZme{h=@pB!FGH4EEDKKL7&bQIYtF@q)a!}8MXV0JlhOo#
z1OMQcsJTe*Gfh(Y%Jb!YFq1p5*}Qh*%6JgsH#WNVeG&IiZ}!m`wRlswuMeIVCqHyE
z{6b^_Rr>`iv}x6s-BVwtE|qV9tvSR>sJ_226=EGNpzCRuF~|=`mCsb8cdJ6)Df=hh
zUBDI)2nF&Os{i?Ts?JZQ#2;UAbOEJy>@4fphjHjX@)<i3jvXE_C*06B5{j6bWX^K=
zeW52&c6^WUK#3@%s0`4F<}eQ#Pni1uD7y-<Cfl~X!6X!<q)S?*C8fJV6bT0m0R^O$
zk{lu}EuD%8N{j}HZOQ;qY3T+9>CQ3uA71tSzW=T7`;KD=2W->byYDN`>%7kEdE{>j
zG>AD00!rs%T!7<$5KE)+c+c_p&#CKawMi}vKA4xS`=Qt=!vU(U@c`w{Up0lYish|?
z#INchRR+Jo167Yx4c;AWsZT>pJ5U4{ib>lKI@X$vXg1t!fb||0kN1B%_@o|!_*j_D
zoAdg1?LVM{--y~E-80<x^bQXvr56tIvkbe|>lToFKgJTuO7Ny9ue$!;+X{WFP_M|S
zwGgryR?ka{;No*xsL6-{?iSeiB_3JoGfqR`<Ghgi&P+mlxK_AtA$_rBahhMo;8dE;
zSsa+PJ0J%g>b&x6cmI41j6)6~Zm(WxW2v19?(uma*3-K$a41aCy>;a!+~b{62lb!#
zCnf}LGBUr8uP0U3bG`|ZeX1a&BR^%5h&XtD>5vC+Oyw`WJK}l!KDUoHGHI~dLQ`Ez
zoE6(x1wH6I8lT9lTHU%05ZTK>Cr-cd)uCyxC)&Plz})>=%tR1vT{(#eCi#Y~6rdPg
zMf1$P{#*SdfvTn1%uA@VXYl5Q%nPU637Aa9n5Jo-@o>B@-~niI+V4y=gKgh_*JG`m
z`RMFneCOUxT<)c<-G-IqMp=38qjM|I#o-U;x03*wV`*a%z-zUSIx+9>F!rTH-U``v
z^zirWIPW(R6v&nNOLzLWi~8*ag4=+m7oY#*!Oyo@h7lL*vAm|S>HX9q82Rny?VOst
zq05sC$wHO($EC}4Ws}z%qWh5x$A|~!<_99*Dh{UKR7M}Py04ca8*b+y^k2@uK^Uug
zXP16mcwnD;+0z<X@BNyoasJ!qzpZTk{RJVvQxU7*1aIkY+XS3_I>h1<3O1G|Q8M;*
zT0lz4CFj!0vlXL|1~#uHKCgDqj+Z~e&cLT7z4r>;q9ld<Il4LLv84MYwxL6?3lhc>
zM!oe`_J=m=e#t47unS!+VLwvAAFA*_{fWMGIx>1GVC&Jsk0k=*N!8z|hPy<%FmL(H
z>w)>@B+o)?GJCHkX?eM2i7zgI&h=wE;z9h<n}S}h_M+vo=!|T^IqDOm9}52{sD6c<
zdA3|iZK>^lC+dAFcUn&(_cD!v`@5amNg1v8^E#-P(F&~f$xNKh2=$%0?wOqh#*sB~
zn;PKqPF32UKIR{w%~#)SwjCO<HN}l5`nhQ1d-`U-De@`xd&N*U|4rxLH~d%p_|tPj
zey8GFze(v*rXT)qz|);1$e;81WwmLNY=Aj71?4Nnp}n;`8a9$#-i}=OT8z8pk#r}t
zZ#M(mbk}Lad#6(Cc6?w#<IXaY44T6)n7!mH@!t&~1Q}ceu&2Q~_ZH<3sX5(cJi%v^
zf~%iLQR!~&8BxKAr1zjsV_f9nuC2Ed@Wq=KK7FhF))&=WozGbHeC*J{?=4YV*lco&
zL=r5E!(*PY`uQSOMz8>-k<)>lh?V_kulVH*z_oLL3ud9(8kK$oqtGVwq0OTMM{nw)
zsJ1s@2*;BLp86AsWCdxuOMr|v8o~XnH`OZEu}jFZ=3s!CAib1qv?0>A@2>sUz;3LB
z-K`BP0a@d*RCA~nW2Hy?$a)>Bc4v62CGuwTM0=>8^2lV~5ZQ0L{hxlQy#Q4I>N5==
z9e!K|+VTl;F&`=?#njv2me+)Tz)}=r#%c<SYlNuY&4kw_8DSmiLLv_JffL6M8r~Fq
z%vZ&(>?E%)-fd#q^(}1_=GC(IM^<<rFZ(zw`Ky=TME3EQEZy=f!aiE~c0{}I@PMYK
z@^_i^pQ^`yU#wd6>B2go9p`>bTUzcDJAQy^WFy{xeszF1F5iUP(buuHX?DBUH)kv=
zK8(m@vT$jD8dvj~_d4ed`oitFaO^lE4{Q2+DuCjl-D9Hc+o*Ft;=Ak}wSKo-bcNDx
zpRnF5pB6Ts05Q=2zmj^R?Z^Q~w)0MT6#dhYZw3JqWC?3rH@s+&y|`xsv>WvgZoKtM
z9d`FS9++7l=xBEC7;>L{uvuj0kvhG7i!ifP<Aeh@C`!`J&r-ya-gm4ryOTWAt=gM?
zxU(5(*qi#`Mzz;Oz{$?A;K%ud-O=G+YU@Aw)&Fxi4j2%AIZkBfsDDf>2`xj!X&e&z
zUsqi@&^(5TjKYpreY^tvSZWHG7d77Fgiz7_0SjLnaZ%pD1qS`@-KB#U?c9d7;*9R@
z&Fwu26HJ!>yq3-iTrnfMWLI-fUOU|o*ELi7sZ4C?gY?-2kCvrAi6>zz&e_x3!^Pu9
z13&Wj|69_Z7yRe#6A7Z-$veW{qBAjVDlUDoGt*|%U`*AJZ~lg3v(~m`G;Aro)5c5;
zNhA_;ziV-)tri*S^J*|93h;#2RpREUvNWs4&uUX91Cr=<y|lc#xGmV5*beOCLvYd+
z<tOsK^LPd7w<d9u6A8K+AMQ#7DJb1byl6~n@Rj=m=EaK};QWcR9V+X+#v^gbMpZYs
z&s5i3>Q1RK>BvwtFnA4biKPBaZ2a1Q=min+FqH-;XNIqa`0yz9M0V9Cd470!oU@_5
zGGo$DzCH8WiHL3ILFZ~^d*#VINodPZ<*-(HfKtk=Y*43Cl#wLA(F<$(IPib_ko-z=
zX?i=Z;ahqc$p`r9;~63-#zw-kt>%X9denaTt%F&ntjvYUMgDBV)+7PcxMeiSIqnek
zwu8J$7P$zRWN!N;wZB6q*BtkXo3G&rvX@dj40cB6sa)pgYVTW;Qc^MkPR1S7dj9tQ
zxBD1eh|+yD>~u%%1@W(5{I4HE?xdHd-M>#jJ9G$a6iy!mSwSnS#_y<zyJNE2&%>mS
zKDyetr<NRDJL`g3FtGP4{6car=0T$Wv11Ju)~YX>%A?&q?g3mQKk@^m8&$4c9p^!x
zu7d6Z70JtdPALP1cML_P2(HXd_pDb{d_u~OP>VtvJ6y<WJYO3d{QSrtAN(JFPIU|C
zKIv1Qw{Up%^ZQ*#61*3$x@PHc<b4#kvQ^V_ESi}##4+8RUe`Kb&2Y5SoaS23+DAv*
z*w<8xU+5u{Fl#m`{l?QHV8-KM1={?|)jyNfSml3NgynaJ>+ga4?$oBI{XA3>27JU}
z!7QxBOz|PKtonuf5cPDK_s~5{QN3npLht_dBKlCxRXdl;vsY#!osEP#GS<GYKiy|4
z1uacH6`5@`I@l2_ozVS0-I!c>*)%7xp8bC~y^HwDLvTvHu=S4qFF&_0cbT;Ayt_Aa
z_(Cj~Kqu?U_o?WO1^p3@$>7`a?CmZRhV0Y<b%^RmMuf93q%yo*M#I<ZzN$cb+%tU}
zd}EhNJOUcIU&nHP4eOzFa$O2$VYtNkUmFu(c0x9%_Vi)mw(c)``Z$S>nKC`mfmCra
zCcFym-7u=VVi;CaVgKM&V3A-n*-rZOw1M*jvc#&S>&@*C;XZrFDURo2B)r!th5}e?
z8I0$-TMu|@Lv`QPUy?5k|GDiyJnBFEM0bk;CiC3aMNH(U*Ix3)hvM>rmhL<YJCMLs
zmg2v8-_tYgMn)QDXJ=wbT!3_c(7%ytx6E#-v_H*xHYl=;yL91|`9hlPpy6K4d;2;z
zGuFMauowT^8TFl>(VTyO@vk!~CaqG1CUj7gF$y;H*LsN<mVUWLBgtgjGJm^4*@KZr
zN2|Wn^)@|OY-q+);-0X?x6ip|tN=e+m6z_HmNcd{4a_=@@^#i-Ms^TNbw?T*GxeAM
zPaAVx{B&ddyu>7aZOl#lj)1#WG9OvZoJ7UaJlU>ieF>r3%pg7ZAf&UY{s{U`^mSy>
zDtIwZdn}(>L042*=IZjvf^jZG|A0$1<5FAK@~gnXpqLH?helz?$cBg_t^didKAieh
zTvK&=>d)^11j~D(ht#qp2%5gJr!T_@!ZZrmS$4yybvNGG#Bjj})cS%4&)~}+Fc`0K
zV-5!Gi8CI1$YQp-4F5NWE<62T%ch46wZ9zN3cn-XOK@B0__ioS#3!8f{__zVHCHq5
zse9&$UsPpF_#B7}rVsbg(dxzt|8+J1@`mI-!5gw;a#6fr86-%#MM1lwv&2IIS|&R*
za9wKYqqp}R)vi0!obqo)i_`IKaz~6Lv*<=!WRyM&G}$>+v6*%)LJaH;#r}i+35k-(
zfbov5a4VRDr|^a8>#miaCt5$J4xi+Pczeo$TP;-JRP*Gc>JaanI`AH0Ma1a4bQl6(
z8pG%(!ii4DLm#Drnw1}{f2-nzn0zPdrXZghKLdX)tcY?Vlg5*oB@(*KiB@HM7dJY>
zj^|gRFt>LE2a!??pV&{*QcUH;m;D2q*{u-jY+!C~j71nb-qUS}@;&H`=B|%S3)R%)
zCnRb1_t~#@^WbYqt|W4c{l1ak$?!WA^Pn4HG4EFo8MAdf`c|STAW4Pp<JH&4gM=)3
zGebkD^_o`^t<-vhE#qy5y#FCODYu|cI3aaTY~XuO+3{0c51Ud!gP+;ar4Kb|GtMdQ
zI(oshm)A$|Bz-QFtDwU{2(k~^b6g?#vZw@b;fxpo@{HBq$(e)@ITlK7N&dKg*5cmt
z<Wq-)n&Od46EoV$uD351O-}X(cJQEU(}{$N%1qze2jNzljyfk-!VjD{hWBHT*(Shq
z#Q`wwC>rvqdD}$)@A=<%h*GH0-x-P4Sz|yhr0sIsuzLk#l}|yFPz|^U3mfqN=6)t9
zn*M+WF%~jS9rri=%fFX}yPv9|thx;BpVN@Od)j-ypUDy*NFw^-TY@9$jymZKo3oMs
zAvD5Cw#U)YF+*C(=OVb$1-Ozey{~{f8-hS7vPyi@J4Se)&2%f4iih?1DAt-G8dO@9
zG*^=Hg4%%7_GG)<MsZ&!>~T=ME+n-n{p(uueX9smEv2QOSUZUmmI*8{t_o-6=8P%=
zQisq!srx}P>rrA4$<OGgK=AR4!dQ%_i%Z=8tvj<T;;kQ!F}nqJMLT`(>4R<&xT>--
zvnb#vt(MstLJZZK6Gz@E_*U`S#Z-c;+8QXaCu}1LPVZ<&jaBn-seHu=|Js<p{>}=l
zBrjt-ExhsC^->Rrz)0aR(9S~*n#sy1HiY508zCT8z$7}fiCwoG@f!0KQnsX{9&%W1
zqz-YE1?p&xzJ3)*y8Hmxf@qfv{pCGj+PLGUWS_(!Btxo!HeA{>0hxUlRx3d<H5xN?
zwh2Lwm7RHwF_LOZ0<H0P?*;VVAh5~xBee_Z+ia3x_#HLS<2Q`2jW5=Tx(R1lCNWY+
z*GOpHe^0y`&RNobY{^)-vD1TjeUN7tkv?{KRQ8`Sfh^trBL#R#Z9?%hX;e5@x>Q2{
ze3BCR)SCv)fyD)Sh)G6Vu^hiH0a$Mj;_a`{TeypJnp%JlMON2RO-<5Ja;5Lq+U|H+
zk{U``@ut&&<c`=9qNiVr5S4MUnKo$w@g{gsVP}fDUQOVa!wH#?i01Vm_M0t(S6j`N
z81Z~j%Fs0WLcPUi;%wf&(g@h{J;#zeH_PN|;0|b#)#~=&Q1D+L5?X<NooIAi!9y+=
z7fQ|K1+Qag>0a6%dQ{1S{#HrcEu0L3rKvmKmC}djbg-jELP+U+#laJPXTft$?C5qd
zaO*$>@vB(R6U8%NBS&t9*(?kTb07hjfjwd8WKe>_(n`mo?`|z2N|$JT77v|wj%fYs
z0cMVha%x|ic)dM?$kkb)e&qq;%BZpx*`uP1!!mt(a^j&wd2g88_0gNc;0cq9vJHE<
z21Uo?QDUK08@$E~<WnE;Oy#|}F)WY#&w*cg<N4WBwU797aie>EHH}b+{6t$y-zR=b
zo4#-HA$6$7LxcGo=o*Lc@wQPS*OAMRR8%-9Oq~-o8O8(_r}^MgTO02NZa?ukT!Vvn
zSudKd1wJ4IPxuIckFCzO&jjZ`3&5R3aSZ!oS4bhLK7{4!<&$cBiax^SDQ&6SCww_k
zM6RDCtzhyf4)h!CaDRC-EjSf|4NW*tF?9sr_Lpg32CuqdEWG9@r^>^J22Dl4F>ez=
zE~psL2@m`566Nkg>@6L*;G|x~;6Z%{WFN(h5tuo*Jm0^mFMp4seE-^+ZnrDo>=W+t
z9G)|qT`=Z2$B?-KMfkUBNa_@kYwrPMzfTxL`PSNs6kCTAVnd%aq=32t5{43)*La4z
zn(Qdb_2y~6SMQ0I?{>McnQnEB>H6p|>%&q?GgT|L5KVC)9t>xZpaAK1*E@WVz{vUh
z+0Coh%`KXH?0%0Op)Z2&6+cZ@4@rBwI%gae>Z;}i?q@dq5&xWs6!)6#BE(+;X=1$g
zddWY9hUJ_fBS%;HSa{Y3Vq;h!f)P6i^wAp_JR~2|3$Z_vl?$8_fIi3{NF6!^mtfxO
zo>?!LC0cKA&}_Fi31kExH-O5%0y(x2<mwpIyVa}S(uE2Eojl^d3=}=0ld6D-hP_76
zG9u2Ha*Btd{7K1^m$rAYJREpO?UYt9!4l4>StOo`7b|#x8pI+MHpG$3%ANe3HcnuE
zr+kQlQacx_=7`MG@Z8LV0s*>{!xRU#!ZYk-AJ$5d2uLwe8jB@%HBzeB+SC$J;G1@*
zYIjKL7bGmlj;O%t)f4!#las~bip_7M5ZLi8&4pq67Xt-zl$IMdRx7>vCVgBNr1kw5
z@qmRSM);ctM1Wx0L_&S>ovr4cuvC3GOd?(&2*;Q0Dpvla?-3`evJ!C-4dZ98dB6M?
zh=i3}U3_aF`q$WNtky1AS?jKiI$bm|K<}ndYHKtF4QV0Rz!P&Hj=MJGmGI00&$S~F
zyW2-3L`H3x-7}w0%pu+<o(WuEjoTnx&_n+d+H*v%lS)<{G&Gm&PmYE$l!E6>9xqa1
zkk0x%$r@>K34J~#CNz{&91kP7Y-%ZUJ1M4ox57OVFMO{?`8$)tE#1ysUb%Z)rC5$<
zT1*#&%Ro?CVyRzN<%nHGr#FO;*W#gbI=Ioa^KnPOq*~d2hJ$9xF#<GPDeFoLW&xbU
zU@uX2dQ9mM9J}oy=Cr>NH)POx)aNO!papX|(O2j-%EtyBU-Qmg)nE8*cr%9J%_q0(
z42i#N3*9Z$J-tF&j5hl4bfAuw_QKqSyyx!DK(TODcJ*mW=&grwBwsi^rXd#6s{tVX
zn)eI!nI^y(kkLQl<J?S=qg5@dh|s+75{S1{IrRxUo(u~)bl9C5JdmUAnExd)g6jo~
zA|C}0+wkXJYPFLsn}q&?2(z^;b;pp8Pe7=r5nPTPJOgW&5@hcmFNqMXlbxwJHj=W6
zP}9~(w)6Qyf;Z;M44$|S+7IEW>7BfSceJ~NG1`_%22DTc#h!IvAnJWmd2kNMB^Fw}
zm)1A>A$6-tRy(vbLw%MyDtrdJG<bfs@DjAQSot9yx`&6H5yoF0f-lHv%6>nK$G3iE
z@4YPz)s*#MBGT(*<1WVNK~t28)rvpH$TL&DnAO-)KMa<sTcOeyS&d=oH~(<G3eoTy
zLYGB875@Y@h>SA98!3V8e^JF8Y7$+iqF5eYWCq7*e8^tJ3FzpAdx#(uGy{#pAce>n
zll^);t`!1@mH}u)iS+|9@Ua^=*kWFzW@g?NrM-;BKE$}+qMC|;gE(a0S-9vEVvuVn
zE#Q;edBd9mWB`aWOr<Sa?ICIm#RlG1b-df#70-9)$b$l0$_GzasXuFlfTkqy9oc%3
zUo@w$#E0xFf~R1toVz>qS)iN=CCB^@AP+Pp^(M*7ZwpvOyvpIIVc(9%Q0lgkV=Wbw
ze1O|v&Mtf`EmFes3lJ#B?t3Kcc=?ExLx$9VBg@~GGZTv%T1!fy{A9e^=(6Ez#Tyd}
z*>B?5m6D6Gkrza|T`s3~7vw<`o_%Qy?xnVrhbmVb-dyECKi0)i;*_lrtZe15L<^PA
zYxfhfj0N(48tdErFOrA%(8&CMOT1__-OxAKcJy=E3br!L%~4vH{&)M~C*_3?tdHuP
zXIfS~`~V2h;3F?w;~<5tL?2&pDp@#NaNfW`BkZ4Rj$kyp5J5Gy>IJ~SAjhfs$Dm0f
zqV-Zd^tmo_sL<M^roGa@2~?>f;NG|vJTOhYSWckKHb>xzcMwPN#rh~v1}j?)<+pnY
zJ1NP>`HzP<ilD^TrPDwsVqrEB%J$g3$0m=Rl;B=`T)VzcJJsbTJ%oYK#G~nBB!TTr
zIx0n`f+kVVY*tD$_26BNDiw!~y~fV$$#EXn%Bfjt6N)|3Q+G>NAo~MK2`#ax6wq2z
zgo)KX1ud$eN|D{tXWRsPJH<uG<l3T5$u_pQ=ERroT%RUYF>_)$HY*4^)#Xn%SK1dT
zsL@;$;8zfcHw>J+ao!4+a|=Uxmm?u7u#L{D?OMch(@#x&JH4<8tu)_9Nfk~3`$-t&
z$y~euf`^SrKmeq3C|sVTa?WNFb&i$0rq?y7DQ9jnhC^YL8pCocP`o_J5XOzJi?hM8
z5aW%}Q)*vtj5T?IJHzSw5=%HouKo3?W^!SDS+orfxDmJ&Z&9{(v7A46#c|V=9DB&5
zUZK)XD5v>Z84RUpP#AD=TcFp6lZ`9!x$Ozyy^PqM3`{neaS58K+=|U{!x)V*t8b33
zAftx}$(5`SIX5-zvF)P~T-(m<`5RSphrq9Z-!FUc;@)m=j)!*eUK|*qR`3Ty+r3H3
zu{CfT;$~Q??($AI74|JTxwZ;23&dZ7L7wm8P9|!M0BT38!G$q_hZv*m9gKx>7n^;R
z-!4Ax;kv?j&Zh92`0W;jyD)~=(Z7NyoMbr)sQ2rsWiw3}MLFAAWks)l&t)hiZVmBZ
zDGa|d`|+jwrIPjmn@@h&SetpRntXmkj$c6ZFWJuPodj}+#Jya?e70o^XP?iTDOISU
zEL1CItK4VV@x}tKwHryP2*y&Du0y;5I~vL0VwXLAb(>MWmt!h;5HN*k4I`MDma!|Y
zAPZw8=^rr^`NAu4W?<M2OAjQ$(bdK%n++5))8xf#4pV<bzMfzpE!9-PV4`El7wg)@
zBa7ao`zH%$@EWnyczl9!ffj-4NXo=s$e1XFi?fOrVkAvHboX31&f>+xyM3toh-Wo|
zrPs!(pcLgI@A#wT{iHQxt61&uQ*!it0%fl?Q%rr?(hz^6?!q?g2Ged}0%yO?v9a}^
zaRUMlxP387J1GV<=*VvBZp&MqB+PM|4ASg%TW;VT-6hm?*xDPk&{*~%qnawpxoB!-
zGm(bU1wwl#*YL-83xl}v(`aR#FsL2ie~7CEN@2`MCa%5+u(zCD$vH~AqwyVx_r9|g
z@$s3{beNN+RB=ca81Akq+M$Ht_dq%$QSmm<cTy;)4&qGwZRb{8oH<MzctaC=Rr-os
zSheN}F-BjJNf*?~jyDuen3&eF57(*hL$H>)KCGQRm;V)707jt0WWjh}>E_@#aW#@H
z^_@buxuFN)I2;jKlv$z>Ro=c&Zs)<+bHRf?X)Rp3wBpcPx$#)jmor)#FR}i~t!Mv8
z$y{>5F9?0PW-VjSU9R$?foMGg8<7R8N|>yQ11$vjsyLphhl9F8Fh*MDYL08f>tD>Q
zVg-=TEV~ATlb5flXe;qOx4YWDsGqG_F*~8fO01D$log?Ae&6Dz*#QV_MB}Yn=n7O4
zT1$I1+`8dS0_?+ygD_7{U(o|)?QpKb37rZ^mHU~IJsEX4E3^7?&&ZiwPtm5@lK>UC
zpu9%KZlF71d03&jdd1<)joBv5eRXPuws;KXz7a(c?0W8gRl@AbJXKM^OArH<Rhud-
zXUPgr)YobwV5B$cLSK>)_VuV@Uk9VLRkN;0xG(*|5<VUuzvZOy9+mXME4-%6tdKtS
zNG~(BibL7i_BTF!Su;e}AF;m6hJ2^KIBb+T<%N-_gc6IenKm@`3KK6>BsdtbA9uo^
zW#a;27xQ-(mX^OZXxcX-#1_uJ&RM-UY}!Bt)IpR}qw;JD08Vt+tF<N~r>-fMLp%#$
zT{Y3DFp`fZo}vXxPej!>iyI8)Y2?raaW?cWug1XY%W#{vrlfo0%J50*?<G6vHp0XC
zsi1oK1;X0|6RD-$IRtO8l_EP_|8jHD?wC270z%I)hhVWILS&_U#)^LZ8EUtdj$KXJ
z!Ks2ufyesFBPlsS=`RH`Et~I?iXXmm4jrTvKeNQHEg3?o*U*!9<rn+*S9Wp<iciI~
zfL)Iq#TJE^xYJTX(|ReevWgn;E_KL$D?en&?JBse3{D+c)1VMnN|!3Ph%D^SR(U@_
zt3e~n&h43b;R7DN;T4woqcuux5UIu*@BGS28FvVVMRiqkS7Vc3B!cUeg-Qjr8ca48
zw1=Yr&+*u}Akrs~0nlTdqg*^5e8zQ#8sMwWmqX<AH?gMF6j5FiCsp6{QF^_3O_M0m
z;Ah!;wtJLQgSsz8kq9v!GhI00*<}a8;Jrp4{X@Cqh$xMLfRzqT=~V}NQjLY0`JF2j
zlPYTY8#C#PsUQ#SaM`O!(E4T*1l@?|?MzcX+qn{+5y8BC_l2X+@x*Er%!}o=7JRK0
zW8@RlKCyfDi@YjaTRh&mb8i=GM)o;6qFOp_h>#$gt)^l@3GXF~;F<PGqY%uDsyx|I
zG$bVI;Hb|x`m{K9)z`|1CDpL^x9Es}7_R#1E2E5VA!}hyFyZ3y*8?N}M~v6s+drCQ
zjn?~#gbDJ4%oVNol^m7zrTo+-a%32jnJDxsB;>BlQfz$4AozBH?6u-xosY!3)%i}l
zN9&FfA}$f)bvud@;9LL1EdZUj>Y%LbZU1mayo74`)_A@i(P+Op3%oJ6tK}mO=Pb8L
zF)aDa6qW<*crST<I<><2?H=)f15D7AxACoT?#^NAnDbIt-KAPiPFaRJ+~Y7}*P%)b
zrNB9x2;&pFH~=ut#8;_8_Vav+*FTD32Ilug!E)xg`^87I!W3X<0ZU#r3oUr7bojR*
z3mv}cDun=f|7aP2D2*I`qOZh(#xN4QVizEEkC+#l24(juMW#hp=FhQmHs$0DvilqQ
zSeWhL94?v@S0b96A7F0h+yo`csVsZn3_^Ma%fjSUBcevxd*_{ZWqka%+B7^TrX?t(
zS9K(rA5yo^K%KdVHx!a$%oX{9PN*RJAM!b|a;VZEkc$V9#J*sCWfsmNC(lMS#mFka
zMc>Kl%4BWm;$=>2EL>{EysSF59wl}?<i5eM=FqVuGyPZ!)9LdZo!tkE4rktVQ=MqS
zJ$gH@lohfIzc850;z9x(w_8Ux0t-I;tzg<{beE9?`k?NvV?5CS6ZWISm=rXZ+cSFN
z^`Q3lB>88e2hC@nr3Ed%6qq?%5OU(`AyP9?c-f4VcE&MJ_n(TS!#)L*>EZ@Z>6PU!
zXR_-sMOBkDrd)@W=DS8M7)oEOUSC2%J|GQAh$b%Y_s0!!qyVG?&a{?NTP0JSkD@%-
zW<xf=Gl|HU@1pPh9#4w)nAoQgvYJj;%hmG`+~_E_a!-LZW~V5RVI%?G7%CEkkPiWE
zaQEilv2{Ca$58U)TO}!i$NcfZEI1}GQv5M!%?(58sBE<&UHj$<2#VB(ml#uNmz8iO
zyGq5rEZGZ?Um!`~da>e9Hl7nNXLV4)LsYh+i1)OR+cQvJ1D<!*eNx@4x(_I!Y?eA8
z-q=b`sIagmQzHJaYXLa)_+R$lKIOzN=xdDzU^T5^5?L!+&}FEulg9E`CkErcbay@b
zo3#~eEoNITmKv%tAW#=+14zwvqcrT7zFid0#seY&11m-DH}M2j9!^f_^Q9b~ex~5Y
z<?t+t&><=?M?$rrzzTrfgP=rlrE!N5ssMq1_Cn5G;@L?yv@kK)nIxh_v_5%gZ+4z&
zU7{$0>&PpvH{h`7UL*%vo)faaglCF($g_44txQ~=q}E4v*8Tj&@Cbj)X^0gopJWnf
zwvBws{EQUiUYRy3HgJVF?$}TA%EbW`&+yWhP9wZxR}7`qx!ggge!N}>5ksDrRpSEs
zT1!z54AxfFsu3?+R(c<u|I+oYYeJXc`1s=G>oT&;DQ|;5US-V*<@uE>gosE*n<@4D
zkI@dn_NzmZ@}(M&=%>yWQ2Exu_ziAWAR9xAd;Xe<{Eeg}p}_AiY+hiYV-DMY%@s~s
zgA4y|<AwXY*+k!N){EWm#e4y<cEzsC_lv_DA~B#9AjcgyNl<8u?wk@>{AA%=cqLq>
zRs5X8!uyuCWK4mK?Te_&r#8g;reCY`cjG{lBdYM69Np!f;k~%QniZdC+yWjY)(1I^
zn#tYIxb@6;s1kC9Bz^hYeY9D*{QZ?aThB{Y(#WP{Z$P|XB-=Pm3dh>C`9ig*z6wCq
z;9>w?XXocamqZb-hvNB;O1s|BsMzX-%%~E#FK%fqaP&`Z0jYY6$tsox74j@wj0Hn!
zElt&X=pa9l*t^7hMQk!d+p$J5Xw1GKA~!0Q)3nkwO4D;}@J17krDy_7b(-v9vWX__
z)*7d^6{?FTzvm%)LVpfj;<ESP4Lm<Vc2j?gg#ux4c4LCcd+b@fr{^>$Om}fc7o!Sz
z!;S%Fbdgq{8|}r^zDW3H`*}EMeq5oVs2KGOGzbumhgG6{oam?JoF&zmohafs<f={l
zsy3`K6+~;U#uv1G2JeWEa~Gqlt_8EDDP?a8^nZyEBr{X2XH^~?X}0dIo+JBKAbYt)
zu=HTq@rK!G#|G$xN#^=qK}Oo^`{Z;$i|=BcCc}%L<pv22zEgXRWfW-**93X-+MN7(
zYRy|_cU``aki`-+99gC%wVEb=QK#|&jHRd57018%&pmLVAre063{aqa252E~qVG{L
z0a&gk(&XrD0+**bQ(Tpp&|3x1%w(i|1`vaq<%{9z0|i)7gH3!GJ9-tX!BD*g07bXk
z^*$_`vGD%9aDJh;ksxXOUM`kmakxVbx#a;#LW`hO%5Q!g9Z|yPEzsVn#cF?&PjE^g
z@Y+&Ze&*sVIpf@r3|gB6_=xv-=sqymB7%6`TLUi01KFPyj9-?!<)5|fpRaJi$^QKk
z&ng#ML|b#<`@<aW<f(7OWpeEPlLfL(ZB)?V^6Zz@=#(o&>$q^69sA3=Pmqfic#f#Y
zFAXlHt5?jj!4`G4kM^%x@otEdPZd$n?++L79vzSSS)l}VG(5+*1N<Fn+l{JpP{;E0
ziN&~y<WhUiVbg`Zy@Gr3&K7}E{y3<T<K27u6<fCQ%5ZDhg&S+OjOb#hrqM~uFvNQ(
zz8ERSecH8~3SL80ViTMJjH87}Z1o+5NyiR-%ub0b_GpX(q4qe7SPOe;ySUJI$l{i-
zOg?&IgD^JzN|x_n!EL%S0-*aIz?4dhGi!!ch}MJg(C@LD4AnWxyy0AEZqA|{=`fqN
zt{h!BDqVkhUe6X~AGKYd)0p+F)GwokccOrIOx#SX;=8dRH`<z_{IC;&(#;-P0vf8N
zm71OwF286Nmo3A_Mj*viHLixK{i;cWM-)Vi)OcSUT_(|v@oalqu0Itb<`&)MQUJuE
zLLIfk=H-A}hkzpEGBY%6lzFX2pQWY+n#gZpZ}uO0yWgBFr_WmPq5(#8`4$A=s=)zU
zpmvr!fmjbn>V*VHXZ2u^i)@n34(dp%{CH9ier}#}D!Mk#QT3yTF_q;^G^cdnN;HOr
zo;k5NpoRn8BBEgw*o6;~d_)QElnpYm`pRI%n;{-wj3adB!e*zY?QRR}!_#8ps%Z8{
zZsQ#xcua9Xu_T|a318qz<U2;`LiV>WSj{?c%sH`@tJ|X#;0QDA3YBrupf%~(M1Or7
zZB~Pm%5xgxrB&{@V9e^$U5Mm2#oj&t^lr0hqV-sj2w5MjmKAYxRxA>^sA$Do7;_Ol
zoWq7T1v-<i=9;|*;I+50LuU(sNJf^_8!K=T9SbEc+opLibuVX315QOlo?GTRFWKNi
zo5<JGuqlVfH>3hDXB*yg3%JC+dsv-rgB$auEguY~3g==6*gC&JfFgovS0trPKEMYf
zkr6hy<;xzWd<P+qAapw(c#gLf;bi7&@V+(*Y=DUp{Ous6_J>uogNHU$;NunNih{i+
zfu~wT>n&-V(z3c}<e^nUKbhfW^UM1pv5Cu9nAL%|%O`>wnA&IN314n|98EV6xh|$*
zkTISVzD;SWa8#;N1?C$?KgKU0Y-lY{8fM(fYYtRVZ(I_p28>=cCD`DQPQ>5|nGg`_
zkg0uSuUd4Eiqj#h^a+-Sj1v0X7hJA_rGV_GT*gq|t4&;PoY4A7txU|l_I^Zv3aiJ-
zqSb!Et!jW-k-`QVt{N>BoDq3ER)%-iIk&YROT0^9H8DUb7P2(cSh#riH*EHMpXFb_
zSXz>bHgq^Yw2W<yaVB<o8q|>ZeV0tIQl)h2=9awpd2hKo!TQ0Oa0>S!y$m42mTVsg
zm8&3EFqBgA>dV;thu>;<&uJtRy<NX+MVu7#mGNkhCxR>4LaD-&zm`H9?h7B%%95b8
z9C-w}B%^ukGbJB;arn`~*&s3%&LVy71&;9(Hz{zdF_rH#+<m;_Aw53f*CHF2&4bp~
zN_o)o@|wL975@1=!)`4%Ii&>z@?)KkH`(zF+c>qu>kqq#wQ54chnD&mAa&dHc$wa3
zz@g>}T6&{5-7x(S$v}d3N|Q;-MA;6lvrQcc$vZ_agK@=|kjIya!8ELja4$0@c)C8N
zrLruzH_5v1Q><);iIp=oxEFy6VhQxY9gomrHL$v<(%#vLU21Jjmy%uCY>7@6d7X;E
zMA19m2bU7K++Xp^Pk69_k2%>6W-lXSll0@AN4&TnPY5TOurD<O)SrB?4e@Z-ga**R
zr3ARXOIE!1v(=a5S09FR4H3S~SaLUNdPn8!j+Kt!su`q$qVq-K#m(KW>8j}ZO3hN>
z9j*64yt6p$zwBmh6`E*Q98Mt5Pf$=yH5mi8!KDkSYXc(ckhKfKWjIYX<;4>dt0PiQ
zhm3&9xstjJWXz=dHE{1>wfo%__m&Z6p2XtGZT$1{-!Gyw*b`c!v;(OqrUZ+Kgf{8%
zUgG4uE|)*)*E?ZtOgDMI?4Gb=v@95P^xPfjJqw!YRj3?RnICS6ugpIMx&?dOH|njN
zT=wL51}EhS`!-(T$hL8shytMSEByX_?4xfgL)I_;Pr-{|q)c|0W;C#ShPl0{^SGH=
z-(7v=*wV@2NVk2ysl<Ju_6(F)OWjRHNaA=)&C*+_<;)!*SOVeDdcAoaTP~Zx{pgB?
zRIPjQ?5x+@b1U)>^2Rb|D<AjnBL|Lp<Cm8uLO7+3G=K96OFO#mC0uu1Y<kG)0=Dld
zAoiN*Z<w9FNKg<Ij5|cq7g@1_CP^n@8`-}6cLe+n;wL8)*EXMgeUT{mcGw8Lb>h=E
zDE_VXrV2r5LeL5YIL0^5Mg$wXfI+e{fzy@VL!7LL1%bX_cetfK`?I9o``WDZnQ}K8
zxY)&6^ntDo`}aL3fY45g8T$B|7~Bi#dh;ZAxwa;JNQTlXqMo>1MNHq(Lik<W`0i)a
zmRxBRfW-@?={dF6nJd^2CCk4IE@g8hqsMgkz+0NVZF+~#heP2u2p$nfQw9MY5jHj)
zz%DP(B&FIQ2>X<KPbTFT&Sozxeb*z0+SkFk&;*z(RXgoyHF7Au#6?r~^)IQ*Cm-vi
z@vK;NE?Gvj1IS(d+|XLJ@?+d29=T{ML<80~K~8S@ga=r5Zs7Uz+qeZUw#72oY2t&4
zzM=Zbsz^!_SnTMxp(6RgLKcy2PJt>%27VuRcVqj9BrTy1-CqZW`vY>07?F){wkv)L
zCAGF|ryoC}(Z4dQWjvyHdZI<XfGnbno{T^~8!9!>v)UoKB!!KAa8$bf{x{Bp5KBvg
zm2%Gd0r$(i2)2DtUdS)Ee>=Y-aMJ|YfOLV|CjiMSbP=52UBY^ebA3AYrddz$Jz7*3
zSbS7$p10;TDRbz%3tlAHtNFU(HW>c#s?avcwiqCW?JyHb2vY7sXAy&s4{uw=(xI}4
zBAY3$ZTs1^{$MI`4!IS94L4nW=Pe0)6)Iu$0<m!Op2c-l?d`E_zUaV!i-g^eN`O+g
zWc`4)G`fJGZ$CnlD}7Fz0TTa1wTAB$A<nrJd{p=M0uO)&vxpv;O0!&}((My2j%3kG
zeySZ?mGXnW3VP>csjUR>l2L@0rGs2z+o!TK)P4wnugeiUbG<FMr^FHe^Sw>t-A`NX
zR5P<`=;hA^$gtXxS&-Hcn&K2v9GL=C$!K4zDo<0;%ZeJ<k<%4wF1{d^eOlSiQiV|;
zvP=9cw(L|39RwDhBCj%S<2Ru_xdf@fpw6R2y(SS=jEK&O<1fe@gvigl0yH$AMi9Bq
zqo}4bZW0UiCO%+z-OB!3y(W};su>lUl#Mq3%{`~$#JXn&!VoG7j5S<PS{sIu=DC-{
zEas@5N!DYbB9^)vy%b(UlIhB87T@o1!2u4TldjI7a30XhNzu^`T+3xIRk?NDr^m<^
z7F!EqF6DAw;I{Ayz5lP2_J>rL>xI8Ev;cd$?61M6k0ihS{&nCP`6`vZ=2;FuClJXN
z2hX5Px37E)f9pZ$wLV(f!9^2-&{shKo#rB!WP|n5&jol2OCownW=NsC4Uw+S%^zES
z=^P+$<BuB6Rs30M0+$4`+ZC+)CU=s_yeiRC!L>`^SoL0h9_Yll*hpA-7EwVBszrLL
z>rij(Udn%-C@YDGbxkyLGi(d)gcm8@A~M9f_0+9SgzqK8{iwScoQMY7f}K4XeksiV
zSkWrHrPGi~80e<dPOLt5Zd2)_en-24zqX=S33ZBiwO~YVtbTNNo^h7U!b3?v)>2FV
z``92Rj9+$HW(wkEuzvjVQlT<E-3x;I_;_OvCE9lje6OnTl-U0}*|yyP=E~e{xZ!c>
zXYGyek~>_oEz6bpierfGShBHgG%^t>=1kf<ay%*l=!MFTw`!)5?>fTy87_)fkphKw
zX{YY;bU=-YZV(`o{m?NW`qu|F1!Az%_rwTdk}dDE)}jK^xH3bR60yd^Aeouno1282
zul6~t^&AK~NlvUD*{obKXljzW<OBee(#+ipu9^>k8&oFa=Z(EwFD$Fd`U_*ImjE=e
zt4+T*=0El+426o@y3}zCs`0L^_=lIgdu220Xn8IfB453A=<9Ias6LzKe4(oHRgQ(V
zqBp2ur(EvWWIWqK;mNMbbpor;gOq*`5P$qd0bf(vQ(cG8|Jr`(ELDa@`0;@iv|V%}
zq*|g<u?t$wIN#ZLo^CpJ>s%|yg&46RVQ$Bp&CQBCXp^v=M+!GK?ocI%`=}m$Z_M|?
zS645XetPLQx!oUnVcim^0?Q1Z>YoD3pbDWfUb7X8r;aT*YNaP+Yu1!UXk=Mr#?b25
zOCAEZ2<-agFdSr5bz8n+Jpp#t1vKweol5ih!?`yB-3*0p9sx?3@<#)O?Im4gyg!`x
z4<88p=^D;x7h5yy)k&!{`>p5kdu+09PnS{Jh1i#~ul=r3E9wdzU&$3GnLz;_cBjXN
z_L0NdoA>c!Ud|1VOC(%$$sFa`t_AdHZWWU^8@vD2kbvddDWkaN_qzHgd|)yJ#LM}C
z6KgnfsTt3z;YW-(|7o(!ESK2aksS3gk)AJM$@}*s3i$U-0#$*%U24lYXV|=Z$KZNc
z@PvW~f;SWB5X>sd>+1d?F#P^!ZVstCXM_LE&nkVbU+2!$B)lGF7Cj?t!C9!W?B8nX
z=O<u%H5wM?O>%ufeU>vPa3aR8dFYi;RBZUtb)am&^6wt=!%3Gj0KZwykjVPg-ym#J
zsDv)0V2`v}X|gVSHp^t4Kl{394sUc^a6=_wvb6R?DUeCNKR4iKsH(6h3n1_<1TcJr
z+cDqmnds8j?`E)RLz#hRRR7v}0AP~`=$+28ZiK{RV=e^}B=wZJzBd6(E-e&QK~_W7
z+_O*A6~&u3J+@b|vdvg7{FxzIXVI_eQ_QX3#ks!^zYLw4)wqO<l=g1{TY=zwVNi^v
zaW@mVVWR^2&{rbmJTUE$5#;vnV6};cg`PDh^GlCzL+6_gZ=wBiuIw5@1i@(Vk>{iM
zQ;oNr;C&Ye@3bhalRVr|jpInemRA!}qp!vDWg1#Gx;Q$+|Lz`)!T8^r#s4qPq1o=C
zIUM!j1FqJaLrn({5v|^t)~tnyXbajCT*&V6O~2Hn&Cz*PPmC&WCc&)1{O0W<i=XTM
z*JkA+fOI)xeDnS<4w0}&;bAvjY4{4ouBPu8OVW886F~I-;8hcN>3o!$sh`>Q>FsGH
zY{Jx=wKJeE>>rl(jX0eRpgc~gURoNUSk+4RCyPL2G56OKNXV`_Anr?;kX|s7GJht2
zm~0me&0#hAJE8g%xp4PfEx{+pQf503=yLE@3slvNKDO?o(Nz!iw`80&(O1UGX4HJ$
zWQKE)2cM(hm#5RbvY0_(EmUJqYy_pQ-~3aLE65p0u~kt1H$U5>L7P{*myHj<$pz|E
zzpI)Wx)^n;rlpf+N_=vh^$_Sj_7-i3^xUkmy8m$Vao_~4x^`>^2WzR0kDgzayL0mX
z?UCU70PY3)Y{h#ABKJ?V_ih1Z6o2~MpU6YGS%Fu)KFs{stZ2OMyE`y#5(uCf-b_}%
zyPYMw$nS*B4XsBhD`5vc8`Fvm{eM0kOV}wyfORa$`~sSA0#7Mz$(3oJWY@)*Xl=%)
zo&_zoXLducaEB!NYRHV!9o07QRH@vUWT?H3Z7dm<Q2R97cu8nX|D5gDFlu@$f;Z+(
zIdt>-Z*K8s&V4fYo3l6%#s~1j*c80RH3PeHbVO`yEz?=<Ir1;N`4u8Qjsim-pRmVv
z?mt3(SY1$!t9DsCxFo>3;`wt!wU8nu54(u$9$`&oLxx&v@Ju9xj@4pn!geFF@M>w*
z>`V!D#$!@#IR884cKkbTqSDSlexa5FD%$=RY(Pn0p7h-JGqoO&USvyM>*f9_yC{#q
zOB@;GW}G5fG}~_G-z*_f<0c&(#wH`h(?0*@;zv#wf4FPs`g8I1L0PLIEXLx@F_+(c
zY1s{RydIo>X_CfRQg&n}@5tW!(~Bd|(HCvOVfVoN&i?yaDaXH+Z%}^#;oo|e4WlrA
zaRqdL_Vt}$!RVrUb_|8(z{)WP4;^~W792dd@Dq{#0aOM(cRY3E<>2jiufCR#Zt>iv
zdP>$6@t0!uXDLAVOdiB_|5Aodi4mmZv`}NHSBqkV)$g{~hX%iF`w58?k4v-XN=c2y
zE?jaNrKoOkq6PoR4lS})_zjQMP%o+uveWNO8%*1js*saKg|5B6b*GTAuoW1y*+ibO
zP1kuNotzyi?t;7Hms?lsB;Wkq5Ma>r`?;jh*#c!>7!iNZT%mjNZomyg2i|O>7V88F
zY;WCPV2+Qcqd}jxkamW@HXb|}be>F<P|lC7GsWOk3d4P{F{7a>O44j)`CG`j&eRd(
zbw)5GfVhkq_5Org{Jdy=Hl-fW=kOJaS=PEjdLLi&uGs=#&^0>R7?;mEns@U1F8=-C
z6uSFr_o767SmM#OTU^#|BRWy)Npu6s4M!}0aY~YOK#=qiDqc$eiD$Z_g2vyCxaxwc
z8i2ArdEd1076Hf62MArL=5&vNTh$)WxpiyT9B{O8H*-xW(Mhj$@+5<t$kGncm#lZa
zZ8TCSaUr^Q>X0)8ZxRpfl*k@BMguKfSq$}`v(At7L&rdH-%qt-{8cZFb5y`Xx8uw=
z2DI5!b8XarwyI0|rvSVQ`@-Ny0%n>4jN~Iw0tvpOlo@yDjtr$PByvQ>Lht0?l&<YH
zl4B0IWzVys_^?c5h7mKG=v151%KMmsq;lU?;+hQi<>;chZlUY5i{lu{z*2rd+KvqK
zK2H1FSql<weCyzk>s}LT%kl0qx1=_;o)WzhsMzCJ;aF~i<h%9%zdZJ1Mzr?SuJ>>~
z-+zRd^9J}pfJ??6wU2Tv0|QS78%VV`GtGIz#6sL@D-*&9gzepxP{KCOcW*tH-(07@
zw?kO;g`ox;12UPd;gNQfLFrgTWH7Dg+1))#U=Hk(O%c8A_pp(7f>{|VPybt$Cx;Tz
zmLwf)IdJ=X>`)sNNv9Zw-&+51{q5a@hp*4K63(mq<v;j1s^Ia6Oh4~u#TcagNg>Q7
z8j5d8Yq1%ics#~GEt(d8Va6?>2&t}y?ORWVcvI^)z*NzzQ{P;5mDlD6U$ABN_*VKm
zmy0;-jwo3=ZQrkJ9-e55%yPT4=A|;c@#&Mrj&yNAb8*(YQqXr3OWsRE8xG<uUQ+%t
z`~Lfp_!GhQlH8DqWTx5JO|yk+_OgE3s+D|jsjC!QO(Oe~@vIRBCyfD-DRM7b<KGFW
zKa3`zr^JOm8Uw&z!NV}fIOSPqK6@!iXpYr}rK9TzcT`)x(_!8#VN)hK@?mW7ZYleu
zaVAsNbXNrqm|3|h1sDHQA`HTb0@VfXLSf&pdRU%xc~kJPwR?h`lyBq9ZVObb^*{=0
zP{m-4^M)~v$IceAmukQ_jvuf44|I%7Elrm|i>IWz%dep(DU$bn9If13UWb;i>^<;h
zn^>+LxrPcKcyW1Ahci0Jcn-<1ssrbrUVT|cxj@O8T)ZE~zdYtUDlwGv;n>V3un{*+
zLwmfsfiOpQY>>{gezsug-RqLJ*hLH-Hy?XTjCW?T#FCz`ZK^@i7tWn6kQH^(6t92f
zHh>--k8omjU-gw#XZzDv{F&?DSz9`q(aSMjXkF^oeQ$qDW+2Biu?1gSM)&Lr!B;`r
zqvkhV;{eS>mJ_)BHe;ovdEHiBeh%)#6h`#|{Pv|((jT*e@<)YlY+_K|UxMk6W2ZnF
za0$hM(7wmk!AJYYYN2Nm$845ts-*el9CQL)lHL)?Uj^S!;27gfIYrK1V{7kei+hwG
z8mkMcqqk~ZB5U`RUf9=3SE0cp0kCq}p=r1vtWor!+I)*~->GbDcQi4%P^|264GXDZ
zD94L<w(Hc<+hP0yP9xLMndF=B8awTZ_7VHPqeXxiwd~L8cph$0di9aLDG;KB<z#q1
z#_k#0*jC+H4&ADpP~-MMPTc*=${^>oGRTq7;{RDu<L>7Uqo#bdB&(@a2}P$xvp$<h
zNc$8p8-|3bTbF7(RQi{kd%wW!$z+tvUWT)9Mzs*2t`YRAbT!Gy);?;v9L`%jS?~P>
zp$ku>8MHPdYV=tA>Usn6lnI(;m>0Hk1*+aZ99d8q$XZ<K@3)q>AxJe9Z>hVlWy#y+
zQ}Q{2rN+WoDms14XRGk<<QM><YDX%r8jBH6p;H79$HO-mBCn@SC_7e5vVze3jew%*
zos!Bd=e{lpX8VvjE2oxwmMT{Tz7O)K?u^3w<A2e>zr2t{4~W~$nR~i<$-h|xofW}I
z9`m~*aGA6PEz-|TBGKAv-o&!b#kFtNc?M^8ivmd8(45)4t?>{=`}tGdpuz3&`E<gv
zvkvbkGK|N~;tkc%FRz7hag-bst5PK%(s_GX2WMEdv1a*HNDF}Fr03nev_NArTTaI$
z(#Yen+I@-P<C~H#SwvIH?He1SE^8vN{p5XJ3fEQu0AafEBsfGL)G}v4^NgYb8m3K)
zHQ<5gPbB<Cz=AEKGsYa}?0lnxPD#F2#A}}pMeU&r#jzGQ^YQ=IYwXY*3$6f44vp>X
z%0UO+9<Ca8d60pRMSJ%KD2`rMaPr<yvc4a57M5f<EctJW?q6HB9Sc~^Zmjywud*C_
zH}{&!f>9aaB!!pRt&x5qowGb$l}VuMjL}3|v-r0}*ssY+Uam<3y`N2>daI%^Z^Wph
z9i!Wuhg=)1TWOCW-ydI&?g4HS*6?7X_$w#j(WgfF$W9Wt!^)be=S16M2}qkM?o5DG
zO!@51L=J--P-0$>Bp>Mhi;so9yN^Z-3%OtKqI0HI0*l2Cmm2h=EnKj9QfdeB&#8Te
z{hE!m#m*qBrXsveL4JbX0=E|cNf5P@QDC8qn5Mtzb3)>h=}=emF=hOaKSS*w&k%$J
z)mSoS6Ys0!(x5ixdfALslIXHQ{p}{1_f`meMk$<#(%&y%;Yg^j-r2MLD@XgQEd!{l
zF6@zj+LONE)5xNr-rBbIfObUnzMsEj52b!l-iATeB(_K{wfO8H=fQl(cREH^77))1
z0XoLfvk*UY(<c`cDt=*OXv%fzYh}-Zvk?f8^p}^i)g;*jzbPw7A{$YD!pt_sV6iFa
z3%v$hEn^J0a?H8f0kJcZHnsPGVRy_ypG!n~!j9rU)Kl;7ppC>m$aoH{OCsC2U7V}D
z&FBZ!4pt6%D?~=%uLK6^)4n*BKE(Am9R3*;X#k~^sMzyg1EowVr{oE2b&JrFH!_q^
zWV-5#+*m9roOsvu@X}{iTMHY9I%&u9tpMRb{`ykU>tn|<IxRh=b)HkPym^rzmZ_cC
z2WXF*UGEvepVOs;-<&Jy{#3K`dVFJ*$fRm2ZT~H#?%F}VZ!5ka4?<1a=bG205T}s7
zvI1&7=nGP3-1YRbxnCyx6IA}9l)_Sj&QCHpdIB^sH)<k5{$SD4&s*;q>)P6f^C$|&
z$HjvbZdnKv&#$)5uTt#PK5_#|4EOXU7FB;0`j#+~b&A;~>s+y62A4GGh_dSS;5+dp
zbqhQ^Hxazitl!i9&4=CHj;uDZn2B)Y8;<*X`FYvj)vcbiz);?g8CfsP+?ier-X0wC
z_LkuzkPDTV@@8yyaSD0T;psQek7_<g-0j}e`EnR~(bo5PS9gPk%unv{(fTv|dptK?
z4e`vTUbWp^_V&I(ZG7juQ$O8OTUgCiCo5je2;(8M0d&mMK)g#F#XofW&QANPP0B4}
zO!JFrQxmSg3m8`=x)~gGSV$e?SlqCe^Ml8!J#tqsTfStBhe#x~627@-TO~Y1Zn_4J
zX>n`_5DMUHdq<XL9fRxPKFumvyD4JwDPv*q;#m-Dc)OhENLZf{;S+r$j$Pl#C!nrQ
z9`sODxU8`A`Eq|1xI_ztj+tt^p1Js7N2*Nu<-5L%=!X<w3ld@>pd0@b51pZm_w*9T
z>2#iZPZi>Q`J%tnH?inYKsn~3ga=IPG<B+Aj+?#_hkmZXBrflGM)K@p$A8U$3!V}$
zf(-o;KeNDc1~B1AlA?YvKcIFqkpRiI$QLB|*~Pdte%y_5#CmG_t;73jgFDX@r`KMf
zCTjLm_Y1oz$?>0rU-ujZwAi8p;Qu4*D}dVUwys-9ffNl=+=8`ui+h3KTA;WVTHK1e
zTPYUYy|}wu@gl{YP~6=me|q10zi+<#&zVV>A(NRW&vVw^d+oLM@uAGvT_t)Z0&LwT
z6%r{r0bZB23z<bJSJyF+xA*+PUeEzpeljf5*-tWj5QaDNh8|!dxtIuc{Sp5X&h)|j
zyeP6i$Vmx_#jKm(qo>5x5}1nQP86KWK7aN975-Trm~*Jho~l-GgC0qEC9=f>1a4!i
z5%!lzv-~GLm=i#zkO`^5jte2pl_H!g8V!|gnTxAsV3@lXTR$LZ;N-XC5;#C!4`N_y
z645n=sLX2mxNKkAkTmWWn<6?a*|snrT%@J?XiX&8>vwj*g4J-K-3T+%0e^_&X)BJ&
znDcerPpk?S-&`q>p%KI?JIeRKTwjHE{G5rnlUjYv7|HVs@pFWzw^`bC+&llmFmZkK
z&&DC3ag83SRS2Aah;UNqVKQsK|Eou9mb}wsb#LqX{DS=s1?eG#rc+0JqF;Epab9Q`
z?W8RTz`G9m1T1(l<O(B6!5{-5$D^*LH0S*%fWI&BOSj24SP6(&u+z4)8W#_Ehb=_W
zPF5=COa4J_d`}M5LsJuP3euUp_TlvBgx&nKTFX#YH}bK}4+3?@Hs{gguF`Q~zScrk
z62vUOqXfH#=gh^feL}~66*u{=^v}F0t*-?>VI-IRl`jbWD&Sgx&0r_bBSs1Q2Q>cq
z19kd+%CjKr!HTv=u@w&&%V!4|nGTscpC|*-XIt*Kwf)tqds!l|_N1Nc|86j%DU^jC
z>)*CCS<7`lgW@6KUnW6aA9a>zD#<zi<iMWuE)m<BnWd!bnVEdD#A!bGXIs8@LFVB{
z`eh(}uv#Y<CiM1D4Jpk9`8lgcpl0`e86c@C#iT~3CjA+qSJE?cG4mO*`~V|E)At8Y
zW#Ig<AEu@Botqq?bdAfR2LTS5w4Hk&05?0#&@xXC_N^RCEPZ5})KH;hd#=INkZC}Y
z6B9xL5o@)wC@keG=K_L-00;m4rIj(fkoEL&gP%;e#QzNyuSt>Z1s1&42#wZ;UT_!x
z+gy+NBdGt?G-hxQmTgU#_2)6Il0RJK?0*35l|CZ|wueb%6vX!V$$w*afORTPq+K2N
z5z||Ah6Pw6S9uf<O6T>q2w}xVXP-w(tllUR-{QBX)E*Y#NzhIh><xC>enF$m+mn88
zcRP8sso;<0Lx^uFQQYeWlGPr2Z|JT$>MeQ6*ZtQ`dN-cySIJVnn(44PvGY4rDQzx^
zxeBh)(6YRSqQN`a6B*7UjC5B8D6HnXUcn`la$^2hLOh=8_p0N!P@E={t&jSAW{x$}
zLcnf>{K;`>hyu$?3g4b|ZyN5j2{&rfC^AI^gMEmuGu1TvFOm7s_<J3QmBa*|G<Rom
z6b_8k0ZmyX#Kc~n<(VVCFOwlKj1Vu&?agzW3<X~o{GG4fruH@vW(qtH$W@e#xxh5S
zGpvJ$RNDI8#LH{*E0QNWy_T1@`xE~s;Lk)H!{HYQDHJ*2M$50Z?&{rmjURCxpwXCN
zx5;|e>>eSLs;8+(I%oR%wCuuI6|ZPZ*Xe=oD5U!N?-sXs&y<EWEl}e24^M&6tLr5M
z->R5&;DkHB7_r|~E?Fuxt8KGa##6wE!jN0v)IWD@6rN9dra*WopW?M^RuUj<V1DY*
z^Tu0+=X8PtMQTJ|q${f7R+ndsx%KzeF@FPf%AW(!dHVTCs)WbHngEjF=@MbUYKXIU
zPwrX;7#6m(Y0@^WBld2UiG-6>12J}F8@qvS0zua~|2Gk|W0AFd_}&<Bps0=-{z<24
zN`%zQoj7W?nR1S8w1qasS*R>;Dd8YgLlbNv)heL9E!$oE4|g;0cHD2qHN1DBlI5wc
z?P$w7RafV$XB;8<&0X#v)iI%ET<S=d<F*U0>p59n8+SXacPF#c1b^eXp%3PRAXUBC
zMH%h>AtJEgDv<vbwo;?nO)t}0US41zl`2T9XC}}GJS*@(Y89q3SG#|DcQVchy}X)H
zmGRCoUpR9@oN~;-A)}2h=tJT6=t%!}t^qp^3L|+Hfdcw7$|sz`@{1|rpSU@A(CL4T
zu5iRu!(J{j|MKtW7BO!s#ml%>ko7y;u1#E>ne>;R%MvJ3%gFV=x>8>IloqfEzstaR
z7Q#mIYWn9b!jPKRH)B~=d;9qI$2w8!DgyG7cKnVd#%}v?T(_?1`vaF+t#xV_59@CZ
zZ!j|&B%?UXPc6w$vQKX8+yP{Vl5{jeL(U~K8oUuLlwb{Bz0AG|uWN;m|99d^!07wb
z)xE^t`m-;UZ@i}@MShWQiVz_Y7)6xSwAGU#%SbgYik1ET0`JIRO0+>s2ux5WIlwT`
zHT3Hg{?;Es+3!)Tyim}g-J|eSH7*Pbqp|xfe3=#d_cw<jsKHlO%nyPA`=N`_i6)I^
zZ8D*0VH}(TXQ`Ws8^2;cx)eME3gmj!>-v#54`m&}aemHoDv@2VR22yo5V*<*d!ZXi
zo!?BShT&@9X#S^gD91lz95(rtta^WH{aop1wd$O!0iJBBqOsfJJy*n%UM+}4%m_v$
z0BBYvKZ|g|^>BP32AG9P&%g~<A--X6cOTvVzZnj4${n%KGF!vukcgk}w!v4lYeN8`
zOw!zA7p+kd)K{-9rOxnbJ>`9QUP+G~dBJUPQsK@>-x2Oowif{2>i08a>A)(fPpe|S
zZ!(@4EI_KA4Z;utLw)G<vE}YYGT_GQ|BP@*FmuDKx_2dgt|$=n?3@sE|ELN%yZ0IZ
zd8p|5Jve)i5+hvv21?J-f?23LHwb=u+j6E|v|Sq`lzc8^GQNrA&m{>5{vdz6Q^J9X
zn1b$orauBS)#iLT-&Wn+Rs8&UG}cBNu75`Rd_{u_p(S751D?$F0+McNtK#+1=VUuU
zRTZ(H5$*tl1vG>3+0$Xm&m$L(=c5SacUUigmzut_Iq4ak0tDlclfLq5J+eR|bEd4C
zEjkaQ`^1e4`{PUHB4=QF+=37<q|5l$kUf98bVhpk^9ISv8Gu#}PRMEvos;dOUh9?s
zpn3`*<~|H(s6K;dwsH8?6gQs1-If(<WNX4mWwT)venh#F7!sd;ZGGo``!+c`n`q=G
zGsn>Pd3t_)<170d#k5-iJo0ceZsR|=u|!h&p(Ke}zDzG=>->E=IWb2j_V3}|4XcOf
z$Covv2VZ{kuPSvc?9T`tUoP)0souJPh{RUq?b@YZi<HKHGoeVz#06JdJp=KL<Hg_!
zLm<L>;?>7xo?`-uOPiqs@D&+IMn{t2CZhS{tk1y%s!dO?M4U38r_!_VSM2%Hpd;PU
zwlR~!rJXYwVd>>fH0kIsHQClQ$lSh~x~@6b@11)3;aB`<-D%(Y0r@}6_64RDHe1kV
z>Uli2b=hk(Uttv`<^0=wcg4z7wcJ;3=1Y=wg9(&}_lKH3W>QUW4ED=Z-wk1NBF7C7
znn1}2AsJXs%@n&j1#LL`GvNJ0?z&}ZEt!ij_tn!V{w0GUkH@tm9J;f&KD8RJcqCs%
zF=M{G0u}j(Od_+OtV=j?<*9dwR`f^cRnJ6;W;sAr(Q+Y}|Gf@ENX)Por@m25d{W;x
z&|;-)m0(E?CI^zw_AGw3bai8;A79q=s_F)$5pVR<c(%yZjRv70x9v{fsbeCmF3}08
zgtr8a|7ZY<fOg6Xpd19{BzSfN@07{?fE1qC?tFJludJj*_(|~`)~H-q>AwPkOZoJ4
znSSJ=<os*dQ8>6ot=CMog!Iwfjq91`%SsL;7ox6uy7u+3%TB#NwuH&JDX`6e;oKg6
zch`E2!aI*K2l`RRr6SDo&R_%X<9MH+-Ulq^b$}>q9iZfgM5;^*MM#3asH{}OkvsUT
z1rYP%8WgiiA1W{X+B%VyK;&K<3Av6*+Psa^^xhg`a%v4QLc*Z#fIyxMLkB>Sd1cjh
zG1Op6gZGALtFqN!n*>DlJ5oC<nMF6EOmUe(k5r&r4+&?p*&Z`}JXaz;vC0l{=^588
zCqVFz#As$XhMXO_Kr$?-#36LxV+R#G&ZCf@XbQqf^tI|#%+O9-X=r-rED_jstt|!V
zqm-{c<E%G9_!{zWTS&Rb=<~}|#mm`7!J5Q27@}=c1eSZBs>U5p`2Zb>Zvh>{$DkNW
zKBOLq(#v=kw05CX!}cGM2g)b(x&3;QhIZ~iQB!K_x089<TSr6db+ndxURWT$Gb-rj
zwUbp|G`TlwB6<viEj{>H_BDAg616}rlUmULh$w9fk)M13e(eziSC>b2(p1nF9^-8U
z4FVolm9nn<AbD5sR2GieOmSkAS~EMx_3?<Z5Id$mLt1uGrt4G7DHwdVL*#!VkJNK@
z3fVB&j-9v~?gM>y!0_|?f(N?2rptRkOa>7hw?)D*v;>ZZ2glhUj=Old+W$cIW_c(k
zn5yHXg{?e>#Mk?_BTTxu|Brh~U}$M`MCei~)fvwu+LoLt^lVWE)1$-l#eDX4c%b8u
zR|6>Ko)9;N`Z{{_Pj$@2i^z+{Wc3FDg=oeaxOrQdm@J&KJxf}<g!5qkd@1A3TlSiH
z>mZ7xaDft)yq}B<=ijTc!{V9THQy4Cbd5p2etJ-7uR*6l3`R7~G9b|;B~S6gZF`Em
zVCvpku7I$J3T6X1B&o|Ln$6(2O#ci^U<e!ARa_wo?O4rKc}onI>UkZB9%&SAgMEk=
zp06i&);TohMG&QV8ocxixyMe89}62XgHB^5eD@`>Aqpc&H<r3MPW?PfxxFYi6>~2=
zi;JCDljw8>P7&0|2(Tv+MiTm8YmSmL+GWI>9A)9pOrZcqxusr!RwxkL{>E67)@{0&
zhct%%yt>)_mEdAyPxp7h^2A3wr)IX|R$JNt!`7TdtK5L+g%))Y(pyfcTj;^NPa94p
zM(y}l*qh{+(Kehik9n;%!slZFs9=plZUz4kHo9gf)=SNT&pdhEYN@}muJJ7LJdPDG
zLP9ufq@)-L89`PYV=ZjCg2xNl9wzv-l-hX|i?%!wYi<(Qg5n!Dwd}$F0mF0>v)PCz
zxA66R+nIg)C&q-Sams!JzC+!(l&nmq@>;@4-f0qHml5FTxCI*)tqs40p~!gW1RIEm
z1dAFxn;{E{<biB}Bw+Kk37xQgW+bH|23(>X3W#Chx^#U{QOIwDbm{7cC@$QXj@H1F
zyXa6sqp*SH8sHtdSatrEEs@f@fRft&L?J)w2_z_S3I%^gsHX#Bd%Ss`fs72gHDUp|
z9ibpaW#FN)1|-HD2~FiMDq$*^j?uAmri)If#B}J$K(^B4qJ3<{OoK?o4PSe(fNmwk
zzP!G+r`EhZ;PpSnq_X>KuEF>qg)%X3T_M#joEhqQB|C9`1h)bDG0ZNam3JJ|+l|`;
zMMYi&bk+7B%hM!ho7-Uep$={WI$Ej#6hh=(FYO~kbx(~D7_}cdKv;lxu`XT2Fz_KG
zlx!pPBIUa?U>R|0<&S#>q2(1%W>J@6yp6a?w4;D8P@cAA3KzRznv)YZO3?h*SAJ`{
zRJI7fX?YfBvsLvRa6Mh$L)AWnyBjxkzS&Hi$Dt#fC!N3=nIHUP_xSKY4%p{~3X+K;
zJ#Gqwse3;k#1NJ;kT~HIRx*U8_}**C8D^P<4zTou4Wulh36^C@K%PYLVlG_N1=~5>
z?x_9u<3B()s+I6!w1Q8(lRGONfuLI%azClPO#sUDHyE4-XV<CA{rj_i8U;u@S9qwP
zGx8v}iv<CQxXIt-)|~k=$!*uff13tfX6tnJis6K}IBdW&1I8<qX0jbXIYtGJWus*$
zQj~5$>M@mr@ikEno-r&dK(=P^h|Vni*8}Lm&isDc8}YQ34HZ8pRxJ(E{}r1nb~qw)
zT{wMxZeRcBQT+SvA&b1|bi!t2?6z#PAhxT1AN0Z9kpeGj>lv|y-0<~u8s^E^{Kg52
z`QbukM!=NbJqNE`-%xf2L5?!=$95#uY%pa!&GeddCik{7$3o|Mk$#G+u$Q|{`bw6v
zvSdbVbLH<>7>ZuWvL0_+hN9#J)9Y_p+N;9tYg3v{a@vP5V?lax{iTI||Gm7#kV)F#
zk<+_xIC(U5Lo_uj_&y|DyG?VKZPP#=$1T{az)a-LvEKD>TKmSyO5sB^IalQcBds<n
z6<V!S#Of)j$ogdGI@81F?5revj-fGHK|}$*MMmFH$uFTMP!5)`6i#sq5IT%RK5xGK
zm{4H<l~ml^x86pjgzPPHuJJ^YHw(r&{GG~bbn#`5fm76Ux229;dnO^ra(g|QNjkJ;
z#lnI)M$edc81M=DOs3xdYdwq1B!%V&XTMZ+OjD{=HMdLawbzmlqTInb^MrwpKD*7A
zR+SO;w5`cBkgqX}sI}ro(AAC@rlJpo;v#@pImeHh2PbAxL>e$=Aw#bk)EO-cios?u
z%&##;V|@ju1l_8L%^dj<Ml%cD@`KMP6LB^c14r4N4Ba9b*m11EnkskHNJ?>}GBXAw
z_fceBD)VI!lTIS=S!yyZ2ea%xJRgT>jvxH>O3Z)%z8gB;kS=9t*BqKx){I6_E<k|s
zbCd>L)c><~D=`ubHt>nhYufCg)KxI7bfWIUh4Da3pNb`s0%|1vPb8u7lpYLz)W_3+
z;N)*{BuJ(#&MXNeDA%>q`RHgE3KdjZB|dr($jY`%LdO)&!K@W@(||J&X8JC{<n|Np
zhNn0LOM_M?#jBSE&@m>9B(5c1ebq39ljkwWUR$%Rzh@f&24^Nq+GRRZc0I~<)$BHG
z;aE9+5!>Fn=ZCVcWssP?B0~)y1I75r&Kcnc!DCTpaW|r5cS<Xr@cm4c6=V}X!A0bh
zAHM;JJqSR2TSyc03iAW2vp6vozZeA6n25YZT0=W0ckrR{@OwB-e#c~OpG;(@??*hM
zcl5BPu1f$#ZUrq^6?Z~?v!_GkyP<@OlyhPKtvbuuBf#}bh8*PS7{wnRP6oQQRJakN
zo6q+8a6znO_+B6lpWrCksyzet0$IV%Q!wp7IU9$3QkuQpT}3-%OP4PS<~D%gzpmoG
zqU9xk!>0zCx&o$)#8OV<5G9P#fw2t~A?-pDFjqpf$op@w7Xfa7RdFUi<h(bKjoC{8
z)0mn6A^rvj0tboEB2!JFwT`R6`#Zsgg7!==sa$*7wrsy|D?7ATp#O>OI9%f#m{!+p
zA)jrPoqYLam6UCjjVM`~S3CX~pr5$<E@rf{l5ZHPB0@TaE#p0io#nfVA~MaP@Azt%
zr@j(&{rkWRhe%8*YG#J97R=(6M5g8Y+sCEXUNBOO+}=``3k(7}3#$KGhM5z-!!7EV
z;u3A6%XlzqnGVB_m|W8s{`2?NUE{TcC`+GaiafS|KjTV*f880cZ=s(?C%>*m&%7?9
zxZj*@ZmAu(s+9L}(EVh_$TqfT&d_GSMp)eaoJgsIeZj}|$H04^()*d|BqC_l5jyu<
zPN)8Rla}I1d~TVehYtk7CD)Eqi%nV47U-DHAPnT6wW-#6Goi`<K%XrKCjKaTnRkG8
z)~$ap{iRxYD^{^dpzvfI+o~#^^-CpM7EF1<<U`So?V~7nrf$rWhUG}nr6Nj&3Aa?j
z);pZPmz#x-`Ea-`U6gHZKRqSBle`^Uda<hw&7Yk6$ODRTL*!(~inOmWf#!8kcEZqs
zLBiP5M)S9W6Xpk6N)z}QB|Z_I<5SVh9lQzdc8^coNZ17xxUM8^>G-pw6Nd#LqDo8#
z-lHYS>NB6_Xv8ODnL8FWMcGy>0Vtm}pufcT0wq=26Q@)UuMNOpN-a4^0$rT#>>`~=
zW!}`YYuhn!@^;^!!&e8t@*c_j38!CXD8NrGRCHT7y)39sk~G<U<tznxqw<QpL`o+i
z8>+EDLHiY{YI`4RQcyCEbV5ggv8fW>ygH#0_(b|ZIw6yUx13Vqop=x4C|Ij4gyrvb
zw9nHjgn8~zLGxR}AIqpWEfp{Tv*<i{G4R%8yHnhWwzM2XbwRHrdmiq9G7b8LuC1c0
zo$~ZY4Z3M-N$2{U3UT$i%#UzwzQE8pbc2rXCc!u6K3v;)F}Aug5Q^6HPG1q~vX3Of
z>|rLinRm4FUE`cg3w-Lltlm~H2EVC9HLzLuKlmG+*UZM2u1Gw}Ant{FE=_{_FQF@b
z72W3RNiVOA{qtkf;}Z7BENo8YX;iO1*4!H!e(wLw0{D0J^|uBaGBa#xElo@nTU(lt
zMZSpHGy6DzoxgdnJ?cEoXsy1XB&Xg~$4^YIl7HsSF+*tqeTW>JV0>GEj@i0MX@#xk
z%D$eMlIT?=dTiMwSx0c2>R4k_1s3<JI}W*M2R}X(vM+q%MDfvEO1AwtHq7x_;F(;E
zJg^Y&zqd}725036Gv=5!5$@jQom0R*#6dFY2dyk0CcxeeTU<`nmg3_H+4}5|o*a8s
zj!t**3mKYNPv`UB6!js<ti8BaXWMxlFIOpllr*kf9#`uW<W`li-zqQ2q*~Q0t+~6k
zxXZMgvV;D_KjriTN{+Ur_9pIZxeJzUfWzZH4w{>`eAw5-^)fTM0s3u)Jd4-mwIQS8
zo3y15Fq9DcOSEY;5ds?D{#Ls)vWq;2;tA>{g(6>(79a^qaWVh*mxIvpV#Uwto2Q4h
zhWBOFAK=9E>}k;JFTHPH{R@vyW@ded_rDy_3A)^)l9!Xpr>}+Mxc%fF6%*~G>;V_q
zJswh%^lJ!6N2X}0HKN8=q()5G7*c_w?yR3vG%nFk26O(T%M6P8XwzniuK9IdxF7{T
zAu9n7{{SoJZ%u!Y<>z8vEiTFP8;>5Bh-6t3lF=5D^<nR2-T0HV0%W4{H;!weDrDn@
zg5m4BKob<za8r=3#u(Man_HtijxfS*{)7~7dyK&|r8MbkN0+J&b(0WnSp?mH`OM&z
z3Zp8)n~~M&v?(g*NNTVHE=1kK-;G!t^3>?t`Tmjw%A%L)p7pqc4%ukv$01^*KiD&C
zu7JxSKJe%bo%VsV04S)o|L*zvKZ^cK)Ig|6lA-yTw*SLakcoXyj^la=+B-_wXo5&8
zy^DL#W!!dGDdiP8sJk|qwal1+0w#)xP*i9ue~`%`JR2J7IU2+4W81rO3@k@IqHkPf
zBzh_0ir+Ql9SpnPmEj(`UJ{2yYRcO^NO#UUv4EO5pJxc7uZ~Sd01nYR!l&5T_pA2j
zVE1Tb#T4Sr<vmN(k)f?j)aVBINMl1lFq7b5q=sAp9+4wFFzZhjFw{|6KZf(8a9U;h
zNfYK912z`Eu4vBqpduq%7x-#OBm>xBB(~J6hURA39DFt@f)mqRBdvbS#0B|D3*Af%
z&tF}iA(c^Qy3{4zf+9A4>6)U7_FXm2iOF%O^tuFXiH4$qRz$jS>@n4ea%!c+{EGDD
zrJIeCm&h(UlZSYSeX3W1Uvc|B|JDCitvFoK7Fi?K>X_2CJN*6mSl{Tj8LjD94NDF|
z?qn}{?&wz!%%$h@QVA$<%!u$U^1NcLx#ejxcgL-CqE~iO0<lA-3ZG{$gr}J@n9=E`
zRCTMdPe`3*8GN24a@|N?*}8?*Yg%}6b>b2o8H)GFRNZjggip|5V$^>8F%kZ{NJj9X
zwKeN`N7%%VvB5iKTM5NjaT7=LQ~lvxwvdvl_J^(1ZYK9$B-igy!k%-ijBrC6g|YGv
z|1duP`|A7w8B4eY;|TqiXUOfdc@v7S=i6U*&t7U2*`;sx?Hd>TdFrIxz&<U+q5J*5
z#in-b1(E*3^p?g;JQJH1y|8^%QHn(GV|A+*lJBw+7GWi1!C|thq>O^^#8%KJa_ryu
zyur}Ad?f<7<?{Zv9O+2}El!Ad%?5RD(^H@R+U?Nf+23_C6){c7Bo9j#^%m@2el=NM
zpWN-86RTb$DinCAd^YPCXnUb$RD9`2=2EC4PMH0ru-YyV%xAEPZTG9fQCv+b1<$E4
zaU;Cqpio1aX0>Yh&xxAmt1Mf(_RNrNMmUyNwWKJ$tZsrUUR7SEtULeooW3vhY3anu
zf}@u!VDsa1l!EqJ!L5H-xz^8|hKu5{N*MC-vtfQkcOq%nXU)?#qjVnT?q^HovJq3q
z>B&>qu3AQ8VTZf|f+**;Ul`^tChOmv1jRTH(&#BdoxTu^74QvG%ew>rVf!2{VDFXq
zW0xhr?T5$#qZk5bg8;JzDWRUjI5Ct4a(_u2$VMLk@@<2g8mc2bZfjI@pQ8;W!L4tH
zR8Isji!RWayzs(jqS!t#z#;NwoM<y`(<7guMtlP^P9QLeCq>#Yi9g+hLb4%E0&;{H
zQuAr>NRRyehsO{yP}6}t!oh>LaT6XoaLECT?D50rTA|jmM0@vg)kiyyl|18z3zw&p
z;2Y~p9v569{F#`Ev*i)Kqj%u124dH3g7lCL+YUuL12|6CaR+Se@(fX3hLi8McL;qn
zPNLUL@dAR}Xhls#k^Q>NktXVBNvGrF&@dDkaAQczslZCQN_HX-AN@09=x^X1F`?f}
zX_9O?0gtDQKWUh05ZuOVLHca@GUj_3IS6Z}1O!*Wl|lncX@K!ZRY{;U+yQM%h7ej?
zxxMyU$z9(^nF<_$f=i|(1#<1a1U#Ntj8ZYv<i5g-Ip3Edj7Q1{1++St0Qb_R4bAQ`
z{7ffFfk&AvpyU{;&-v<j_=M!gJs%;^h@k4S7B<(9Aff{l6>Xm+aQ^M)SF8*f<%LNf
z*jMsP74Lr-<w?1{(B%VT1Y6bLhZlnPKN;=G%((Av<L}eG=}LZB_o7fm=2@fP*X{;(
zxeYGjPr+=sx1)8~!9mY^Cx6oO$nyP6*7p?&=^#6MBVX_bCs|$kyEqX<-|@0uG~9D0
zb0hK+jM^mXO*PR*XYKku6&rj`s{gx0VWNcXyH$b04xdz#xU#;v18V{8kc|b5m*d?@
zpU_+0!<?w?CHK1D1Q?1RYz<;8T#`h@?18^>N9={)GINcc^JvqO_B<Y4`(4rxRG|c+
zs6NLsiu>;}muKpI%i@oE3)i&qjtLQo#^sKKgpr|?4eOSZ-pb1-MX!BN-la;y!BrZT
zdND@)?uL}YnM$qScC}je!0uLC8A0u><S*@PfiHv8=dPU_K9&_q<gME<g|)7Q3^hIP
z$ug}I#F&3wEg`h7ja+ZMVEryTz*PeYJ0)8se5&MrDQg$pb9j$@ocXnWfh#NX(_o2u
z+q_}-P-|&<-*wpj9y?9cJ)?+j`cb%LqEbT>%37dSjn~K{8D5c_`6cEo?WOpwdh6jq
zqAN-fR!E8x-6DB0!>w4jL+m(RCP^lDn3E^b@ANl|#CRVDOtqb+Z2D^rPXyww?Q4gy
zuL}^zJ{Hw&@i@drydW_vnAmC<8Xk98;SAy$_|^2Z2m2q+?7jv?zGXi6wBe3lU=_w|
zw%_g5<JM~FpkonBt~zMSL&>MQR?OCWu0Tw!Y9mz`vdAIE6X9OvT4ROo#&kvsPx|u&
za9ge?f#5%JJ`D|=u+yq#9q2k0Qxa90SQ5R7{|6xAEDUw1{yv5?Owf3WHDGzJXGpfW
z_SvOC*Z>Oy?uLF06{=@!XKWTVIMx;nE|MA>j5K82+=gtF1IRk2Mk$a5gpgj{DKsp?
zq*L}<?2ZB;CMV|rltf^%(ToehpkVB)sJs!JnCn#;$c%SHXc8jZr;>kd^HEsO7;QD3
z28?cs<NBpFsMGgY5Gi0X@a*xTcH@OOgf&IduDuFka`6#jvQh*Hws(-eZgcj|d=W5N
zn`vwCzS-8GTr!AZQ}$Z_$(Gp<b+!(0ZH*0Df~Si^%7x-XAM&X^Y}1ylZv(cWzhG0g
z44yI7Jq`!!meMoB$-yRaMH(ybE8&Eo7!d@~f6S9;aya@#+hxB+|2JlT<ny0!z#IA)
zy9oqmQb?a59pfX4PYiDcW(Ic0bVW#HOCYRQMtny)=mZ*QqT<;)B<mcR@6LGD+2T;f
z96y5~4kvLxOQ*P>8|`IZGo-+HohZ8;qWzzHSh(^(eeu7QQIc)YF_tlZG9?9KB<#zW
zyD2$DVgdRoCC2yi14p-XFhp_3sG)AxqtJ4i7I$IhqkO%XKT8uAC=!3G3jU~YFUO2U
z)m5ca@7@a?apaG-5zC<-Ag)RuOPIS9M8Aj*41X6AiXlCB>7|1r0vZ<E$`L-iPO5@z
zU?@QP7_0Xm3s|%RLzmWpy5Sz(Kn(ELX8OiwlP1^KsCsnO1y@YV`WU$=E8kHpF6qrN
zzzw4wU2Uay;)MzKgaZ5*<Xxkg!z0Q<j8w-e@~U^~!_ExxtaVT6T2X_zOlNQSxXJSE
z)>uOdk(ZAy8XZy@E*F$TN+X)+XH8sKjf)r5W$DhwTFt+U%)Mn8O8wn=inyt(Tlxta
z%{x0mTBO~Q;r>-Yr1Tn&ri&7(Ldjh6E0n?qhIYk?gA<ljpQFxy_E`Z~sue5JgGfJi
z{1kbMhrRQQmRTS7e@luiQ<6eV?7J5)<pu|_%O8)Bn49u5SK0we0(bk_gtJtF3uzAh
zO|F$fbe?qZ2ol|bB<iDQQ&G5Fl_Hi|e5Er?Rme_9>K--Q0hcIspfS7@y-7EP;0(oC
zB?H@AkBa(7w%&<P&{@y7*0E2qXW_8h=Wx(*te3_aqJa4HaM(Ope&sp}6Tpzb>-msQ
z97(@x&usg9Ki2)Q*{$5<lbw&m?u~p7lF$c2AzTTAboTKWelHvfM=Pk5C(T{I*g<<;
zF7kA)_sjT&&n;ui)YYQPcssX_d5`FPU%@Y}74`eP@#R*w#Y_Cwib@@^f*!3@T_dvj
zG~w}5khPC25tUShW!&x$SW}Gm9O7fesokHx^Tb%bG@A7r;v>Q-0&TT1xr(@iKEz8%
z=oq0A1>QGx=9W?x-GVo8oH(!YM>tamE#zE^>tMUJnPjD-=lPp&8U96c7-r7;1_Rr=
z-G-+Jy&`8(_k$?L&f7<<xr8W%Fy!+^7BqHay!SNIT)JQTaF^$<t!a~Ip+5$S&4qNS
z)Tz{2#H`K-RgZMpN$~7@V%PKhj_Bcv9+<bNCe7mYUc%4vihI;S=36~oY8Ct1?mtF&
z&C%7E4?buKD6a#Yqb4|tj3(mF`e?Pq*2glpN#HRLda?|jtvZnVQU9R>1c$mo<6ZhN
z+HGm7Jy~~u#n<oB%28SYw<!B@$Qaade>GV6y9+;$;Ypwbc6pj2=>C3M|KT-vm4&N9
zJRk-OfoPL#^kN610B=tVeK4c)E09t$G%Nw`5#~*RT<ITQbUpTc2B6d<BQgv&YV$)q
zY;3*=6O@9M74cb?3A!Yab0W~Zr-<L09zoZ&$8$eXBe7X8$9RvaCR(O>M@-Od=Q&`T
zPA*SJ1`|!Sr?m(9jrG-}1TDvR=CUdeK4ebOMLAqqei=-K4m*mjM{A=ucc71u4IehD
z`3H9y)O*do`Cu4)QxaTV925Gm1X{{mQFtf5k&F1hqI`ygAyqonStlowK*)2zr6k=z
zNFTIR&$idw2>1lIp-s*%x<>)=Su%IcyrPS<;;xI0=D_10UgX@BPl%Y(msJ+z$hK;$
zh2_7WP<wSX4c!d!0H0MvhCZl@eKGVtWdW=v(zvdeggSn!2Dl4F!kTMgRqhbPHQ9-o
zULv}0@Y~(@H)2lxr*LJ7aUZIAdovxTF1Md}{0G#_a)-JTUTum2TRFuFe)ylAV|0yx
zFTY{PgqQrdpV!!K@@Y@Mc)X@z&33*1<SsJ%Oi}A!g~C+=iuXq7z_w|O6+WCd7KS1d
z^OrT@I%##}2HlQP;tIZ3%F|_v>EH&8Y1@EZPoT=}-rCjUc(rgrkXC(MUcNw}!&*Ox
zVZ$iQuqb#@=F3U5_y}vRhkHje8RXY@k~sxT!S#L5HgV(BYxYMxC)EH0o+eMzE{qyf
z#<(aZ15+2-6zkMlttHrS&6b2?rIkdR`TIL0k5V#o-Va(vjQB5|KOFTiA6hn#55#i|
zdp%_v)pjA%Xb=>unY;a(=fZqlWRI%3^m^?%AKOZOk-E7AfBZ<2iVYB4Q9R-uzW$OD
zb*GD;W%=Gn#4Ae#ZJi{C;V$-Vnq-USD;s9=S;_6?wwE2Lb~*B5Nvf{C__T;-_<vR+
ziR<sjnWg-l-JL%IrV2gCa??-c&LPDpKGCn{otL=-mF`G4J3!?xqp;l^1@wRZb_w*j
z3s*b!9?qc}TGT;SSEC@&%(MB*!yA0GKx${OY-nj3_`y}Vyl;v;GHf{P%GR43{d(kh
zA~T-Nj1%o-2@mJUXg-H<FjENQ)yVN?q_({Le8NJ45m;%^*hSdJu8lGy<BXG@alz|;
z$a$&<-XrmjeAdWf2UDD_2idvB`>S61K-ivU)a{3&m$?^afgU;w3N-qkdf{dI+i*kN
zfcNmFrS8H$YEE3J8p%7qN2RA#0kM%%eVq6flzzPB1FZg^#sPvvB-yL#4TDz6aTgR{
zZ7XAB1?Di48C7VGF-?i(HIt>LDG4yW_m+drwR&1@R2}(h^^dSpk8t?dJ;mBwnA>^k
z6e!YCr_kMM*%DsWC@u?NU*-*#B?EihJzN{dvwhlr3k6s9mSHupb=_FEEC)Ze#J-$e
zCG$VjD0%w@#e0?DOEhuC_>g}?R_A1qa1U(<n_IT{n7|?3g&D=jZ}we74!vCIF8sM;
zH9Go#id=UoWPLSKkC%tpBUFxVt9$kjS0@tIF<0c~f!Wh0yLo~#4fiwA8w`hwQ(tbD
zwr;<)oo#1Rn!7Aoez38l<v?qRNh_;FX6tqGqsYH#N@F0(A4`dnte?O((ix6wPyDjF
zcSCl$a+25exkd1^zm0P!;Gxa$!i8XB%ZTcW&Sh-jwar}qck#I~jUpOy@P`##lB81X
zUZy>;$nQ)}PPHYr&y*AmsS*(37-jvIBBMPoLcFP#*P@C;v8Br#8L;L{JLo2Sb2@O$
zM<5|^6p`Yvr~-n;DCxDd!k%k%g216<ErFrT!Xcp#R)}UFl#aJu3rsgr4)oUWp=!)b
z5i?wMMtxJXk{m<A?eqm746pa^#4{x5a(Hk7-4G{>!m2*ROx&)TTZRs>;o8kHF>`r+
z#_YnK3ELzspY<9ObP~o39WX|g<{qkHqn37AKYeRi<zQ9vw?88PuBUrR4E&XxqUS{P
z*AYp3dKR$W%}1=@x&6C{!^)gYy4P$pC4rrt*Vuq_970eUr6h<5pA^bP@qBEVCP@vb
z)s-4S;GNjJ-1-UaK$#TnNyLU093oGB^VESLk5uv+FWXZoEP?tr84{I9>-gZaLnYAn
zmdd!hxC&4noAl07His!LQQC?W<jE9cmpMVy34mq8Ie6y5^jFQ{PU=MIm%WQ1d=tYT
zZFt|XC`7RheqfACDBxoX!Ff2q>^kj<sjcN~&Gs|7E%NTO^!Ln+{<xDr$AG|8uSl4y
zGapEcCm>XpFC+9Jm3m&XgbPHx3b9L5=xn`oqvcpSK59KlKMx(ibfVo~B}VFT6o)7#
zNmQ>+M$`c)tW0r#3EZU%*%t`q_eb5cf(P1!saaToA=>^2;DFx+NV_Z1++3xsyN$fL
zc7nSl=en0_)u`>1x6eY))F<Ef>R5;Tc|uT%*d$5EVFQHl$ktnsj}roQqb#Mrn5DG)
zUGs^beEDNeySSo64!WIlqa9`=EWw0q&`>foQoSZh!IFh+tWU~(8Q}LM@-vNxoJgKZ
zg$)E`#4XAq2=*(c#ZTWQFfTGtGJZR$P@eRykn1ed2L{N8bFRU2HB-dmcSGQdv*qMD
zAzHdSirs;)x{>qvFI2%HBb0woP?_l*CZ#!+jQx{>)3i4N9O6g=&XnYZ`pCl0RJRa>
zG^APJcCvAHNt@+N=(<ra2QqWRQa$24%gE&OdwMbWSx?{6m(`u>2hM~@_~R3rN1>vF
z@y!>mXr?793WrIwkH*8-S7~~guAGbp#>WX$X5^#XH|cWm7yEzU*(0`JF&$1NYXy%e
zUV6Bc+VB6TrF>tGa)uRUML_X0*!I`skhXl`<T(EBkx7|*vptYrJL6r$>w=aN^5Ioh
z#BaL&{eJt!gOJ`+Sq1F*1xiQjX|J5UL77guZ6ETvd)=S<#1oHG8=H>d?XG9t-dsE$
z3lvk47B$v$PbfYO@H&6LoX3a`YcikfGWH@Ix4uU{W==5M4PAsByUl}(m2IMmIYuh$
zf@AyulG7}UORioSyb2y8B^0G0gxoDe=SR;qUa@NOiq91qvw21yj^j;U9{jZEX#B;j
zxu7rbaKkZ(Oc-xA+bEmg5p;3jZOLOK$j<3>7(0XwpnzEg^*p8u^k!uAL$cvHZNGXt
zXEVVCIz-SrrN)jAI|i4r7BqB7HQ?oM`+qL?cyRGCjU_{fjX$h9GVAG1WJUi*F{o1-
z=>gRH=(qdhP$-NjnXb9~j>tAw+*XUvrT%T^G3qJrxDWg)q43FUufIfW+jcE$0>dem
z>%x-hmv3*rKuY4Pn%^hz!rwxsa>K@o|HQ-wl5uhhNu5WOy7vs*eDK3g!jQ~U16^=6
zm*635MzFGwbvrwDw)f1F(Bu)HhzILl(Dq}ZLurRvd*p-l)(a;I7P0G`WPwPExrAg}
zmiLAFUsuJSoX@K}ENsrYgL9>X(@>YLczA8|*T<S<i33kBV*{4>I8Oa*$e32!$;};<
zla7>!HTdU*4&G{Q&a~7&Lq@tpT{g3CqwqI(Ubgj46OjJsM%36Cgb8<MSbUsR8V#Y)
zcZw<wiJ|;gPi>n-^IeePi#~I}?B);SqU+j_crn8h4LN0RP}k)V3uHr3VqyVoj9cwU
za=*#xA7Abmf;z=1f~hJ#sN@-Ck^!1LW0{mm8R{NprPku-f;HzQoHWWhzt&*#8<-_Q
z+Y;!HE%m%pgXVjX$YCD03&xw2P0$^7@I)e%7YA~9Kz&_-%)F*bmu7!MiUz;XC8+Ou
zIv8xp(xtMM68g|W1NJ!;f_f?KLN|pjX<th~Nk+Yn6#ehVgJ??}Z#95h6-a*1Q>cZo
zCo_aeq`y$5Z27QBLA=eSQ&NM9s4wzepufgB%Zym5%`J|D634)rU7x52`%a6naZ3S5
zzzhN2iSx?e(WN3EfM+fU2Jfy96a(ir32|>DdxXhDol;oaGPdNCWe~}!v#E6PEKa!z
zLTB4XKw1-H(w2`B*P%)~tYo|x82SqVLK+OvT26Gi_QcY{;kj9hFzLOnn#PGJd{C1@
zOzB0<7~azMT{-5Pnv^!Y`%yLj_?O2tTA3mqi1P$;OY>;1>ZWOj^&Ch+w~Oa!>ZR4R
zOtx2u{xh&rr#-}E(>MW-2t@$8IhG<*?P&4rF{%bQ$`s^1NwVwAmLX-Q=OrW~HyrYa
z;YV3SOj~ljx(zD*`5EmD9UgV*XXq{be03ckTDQ+SGrQd+bouDmkXhvkpLjq0#I?k`
zQ0VulioEzXsG3|nlOq-84-Wwl-YvUQ9U>e-0PtA=CA8G}E_iT+2h`e>7#(jQoQNQC
zaa{q;0fv7D@nVWnOfDzqI!5UlMGI+?-pc@ND<6V8vp&Ywa<hKl#%T@Xh%gjG-zw(R
zd`xVDn6%StJ#y9;Tr(}(AnJT~apcr|i8{c;3WM#tR8WQmDYI?qX<uQFY1oB^@nMW>
z^vU6g{<%$1*ihJXHosx_@t!VD9^Ts<wQi+vGQwQK1W%k3!Ch-#(V+e1L`K2XkLUAp
zn{*-Q>jDdU9`8kCS`#eQblL`79HRE=&_w9^n{zSxm=OFxdR+XGV5(xPS1qxhw`7o?
zkuT$-0y82tC$p6dUPH09i1N^zX_-*WaM}OZ79v{x;j1R1%`jF}E3OdHbG(#UJj$*M
zy)fEtA)2O#M_)SZf-ckdrU+zS%-&LtOFSkK{F^#Gxte&dr)iOKo|FZ3A}mX9*uSSZ
zpiKOg$$X~RWcH`GWks;wHQdaw&ESWAC0OZ(6#M2F<sdFT;Irnnz_r?q%(UQ(r~>cc
zUt+G&!ZP$IyG?UgQN@ehDDzowjc~Cl#R-I_VWM@hism)0pnlksz><Znx+m3BijPys
zsz*ufcgu9Z81-7wsnR3a2>zB2<)M{6_e74)4}@aR^svp~u7~o(KaH<PvG?c9^G1i{
zoFVSTN2JxYuCkFS7gyNSemCWbZr(C?5!Y<i;<J&c#mA#?ogWknUw`m_*9`;`>XA#n
zAW?G;<eOHCfHVtPU4KkK`4M$*8olx-qpGfM!g!Yw>)N7t0PPZ#TyB`Pj&&9SyOpY<
zGN8<nki2uiTv?3gZyI^XYs8(fHmX^O3ox?3DRYxG`lGLh^Hha`7sH<1w9@5UXo#k@
zgpYDFVe-vEZZh6In(yW8^tAi;dyKwh)yJUq?qkWd^Fx{+V#Vpw`Nba?XsqJh-5SH&
zY^O=PGdVBl#5V$2hm6(#h1t)>4BQluFGsGL-jC&69vSto-_qA@D7F(pdCJ0rw)xo5
zaoivBKTQl|pgXiPzNI=el`5(^OJZZExvagNYcEDa=A{S`iRcibt6%5ar97;ue`C07
zex7Ed!kIL3&qE@XU&PKw^KK-~<etW}flph5rgi!0xHhpMJ3qm;dRx>c@B5jI``8g7
zH!+=sq0ek*#BkU^SuiZ9SQ|+xmlPQ3aUEj?ymxazGiU|RuTTaTOnM4GuDF1h>sq5>
zPx~lH*p7sFOBy3!cKl3;;d?iN!8>9v0))KqGXRUT9NZY$0Jwge0BeK+F$@YoWP}NL
zF#<<4CsusHoj6y#{<p&)0sZORQ0rs7S%lnh@-s_wq!%|%i~uEqWfeV+n!&lCM2j^3
z7D`qWHeid~HMD?G=-py~+*%{41k<GHUyQ}16H8=VsUXi#3W-7J0TWs+{V|SVcc`P(
z`6J~Bp>B^6HFMc%A72zgY`hNKD#Pe%owf&@qs@X`zqMr=3Lpd3X`A9g^HAD)b#8Z-
z0a#pY)UUaKz0-Gs=_EKSA^~wWN2nUn6Au|%4_jj0+q$%|s~O_ejz`+yv!+1ij)qUB
zmf*9VL}~L1QxcUFpK3Z@A40)o8*+(icgkXFt>y;f-Q)EcC0sJXmvZes^${=^zBI#?
z$=F{ol8WcZOUJVyURZO0N$^S$!&pY-b=@YQKLiNq52``Cy9BTmM}l__(*S(iufS&n
z3el9fIJ-JBiIwnX8gL@Rds|ez7`9BslTQr{$=N3%_+9qyh{o+C_KIX>&NwFEei;H2
zb$izpt`!n%m5HK`2q<^NZ@6mH%Ypi|jme?9=fwW!3yEmz8(zVN`2#Ni{qsG*@@+PT
zd1rT$)Js03`$Qe-FUn8$)VxYwv7I>HRDKy)3^#fRdFjQAu%r6f{|n*0|ABBh<lTvL
zyeJy*YDaLZoA{&}`Yggg@bondHWJF0H#N>_?XpP*ey1M8sj(qaL8rwZVeKz7PETv|
z1e9^G_wyEh!^Np!na2u;f5wt=IQ3)!#cU%Do=whWEL1ppy!}qHgP<))_Z)=;TJfWx
zLK%w-5nuESC1>AtQ&>l$yE;r?BwDC-8LI~A_%A0+_GEvlCA{6ED_98I*@hJbe2!*W
z)vK|N<&GOmy()G44a3yBOw);U3}2jAb}E}MZg5+YRw(bGEsTq=F)1b68hVAW>KFN~
z8`|;9CmwI)50XxQiEDV>>TobL%cS$C()Beu=Bsng&z5bw{@HgJB`n^@@5V7y@|WvM
z4hOGz=1~ukfBa8-6N1?~!Ubawf^%z#E_s=CIuqF(m9y$sVK=|{_6YO4snk3E+_Y5U
z(|@%rW0Zz|=##pyeBEL4Y4WYB@JZ$#IbNSqk1X9<5@BSg*VS=Pew{&Qz>&h8d5fko
zc=|C)H=O+~XSz;v%t-MXi6XC|jLhl`viRg53hv^|#aSkK-9pq0%5fj2Po(3ksjp^P
zM^GN;o9!t17k?zC*#7E6cIlWH8>S2X0VPVgvJ+>IQ<GBcJ;&tgR_qn={z#NPCaSrM
zg|<D`|I@!{B2(ATd&K#AFtw7rPCH!)gp#-QhHuCizJyGn`Ymkj=!~j*LC+SBy{>8e
zqyevgb;*pO5ObthnN`VWNKX&l(0vRO#EdW<+{VOE5W-OKaw&Uj|0BwU!CXFc@6DHm
zw6n|93t0mJ^!1tZp4bFA>&RKS;H%jYi!2I(I?t9r(5aJpezL8nJ9-jsN;(7Jc;db-
z_V3=hRd4f$$SDuq+s9y3{SH^)kV2}}<>r1b^l$sEAqwT}$6(o8*OgDRa|Qxws~c{^
z1pW2*hHzunQIB$JLBI2t39Z***}MAc3an~H+2KaKw%DiJ>w@gck-*jDvGX0jr=B_N
zDgx(cJ#DlVO_x7dLIq7Srz;t<vfo6|4h{JgOEN|bFyH!;mKtSkMqN!yue(QQR!^5w
z)_e0ClfN5}z{vGX`DXMN{*^dS>QI3RVz#vGJFRIZa>fmr;+0d^VStP+b-eaasUh1n
z5kk<-I&Y@Qr3&`fUEBqP$jQQRvz+#X%oS&FAs<Emni#rSP_@5>N5o7aapJE@&~@rg
z=eK4>Gl-F^g6*~12sM1OBSCw}g3g3AIMLKgD@WMhc;Xclf}E3z7TJ+t*z!WWeppy3
z57LLIpeI$s2ELo3sdu)7!WzA?-k8rM+mC;SY>2M|?~<h<qm_9z+A?`9kNcKmVZDo)
z8)#&N<d91Lty{pg0b%T~lk&N7$2ole+bUWA`=jpTii&=NobjCR+hlKs4th<ho+ip2
z&}CBg?oZMqJK4|MBk6W0S$<GZsfza^()dYu)FcI+IqT8AEmb}Ra}9|=KjW)dX0}I>
zUtgOe(-{F{mflM274E`=dp}V8!pEp@eA@jwsqUDO0`6AvTwgu$4K6$)X3_6{tpX0q
zT(u}*GO7lgBhY4g@<k#j#<Cyxf&_QSiLKXt_vq$heQu7w37rS3__i&yl>M=TdHVy8
zkfMPXb=P@(w>{HkTDp7MiaU7lPQk$N#wjB-%}QEbpX|Px8VpyKInmuB_TTRl>tOpu
z<*Ha6+<BCrHOodRQN8l*GBMuqC#xDDGM|;!_2`@qIxRa^U@lTnM(awwW}$NDv47-}
zx>wOK=CB(&`E-AGFVjHSgJArdJVmdLmqT--NalFBPR1O<4BdF45%xb2CY&T$yB_v%
zlg)i~9Np+Q&`rYrA8CpwC?~l9+Y^M^rqlDUvhT5O9)N<V3Umt@{DFyI{2h@5lidAJ
z6=}3Y(6{hfvkPSs49F~T{LzD-1MuF5!Q7O=Cl4voGZspR8uh(I;Uqgkm`jBj+-$C{
zS!r$B9Tbo+vm)i*nLU`HHvJ}cyUtjb2JZ~|;2+}2L6XT6XFRFm0Ta$lXB-_<t@sd%
z;WDg>)FTfeP7zv+2~CThb{aBEd)?EDo7wS$>=z{mBpZa`DXMr_C!VoKaduD^)`;De
z8DH{ihR3|F)MbkjD{)O`gCxE|3QC9~XYPfas&2C?KibfuW1IWWi7cbCCghG;S&o4S
z%5YT8BedHC#c2<oDlMU$r@~6LvU`#1#TWTr#{NIm-2~r%J$9lTGEQ?gBnrG^umt~)
z;%4O&iEDSPRlyMX@p<vcN%I~&D0Z71jzodK;AGn7Z9h}qGFNM;)6Rvrjd!gDe=)6h
zsM+4YHNI6f7vBZdy|Os>ipuX@8uPJ7Z@*@-SeB^p!Nd#Q8hEpQjKe3v`n0#Y?!HET
zR55KTcKe~i?)=hg$F@C_<011nP)g=fWxZN%c=ov5B#=U6!)!?@3Rqwx=*3S_yN}-Q
zZNqrH9cABdB!GQ)yvIaP-t&lk|7fW0(iVongIZ6~NW>IWxO=605so@?`;9fFtfr)(
ztX`LIqrApD;K}7<UlNTJxm(nLM>ndl%gmV398q7>sRQg(%}3HJ;qgsXmd9^voA7mX
zODB{c=@Wai_qRQ7V_u5SvBWmpwu@uNi;ED{caU*Vu2%RAo$%KUmaC7D%=#I{E^_kh
z5-vsyC#ID}%f5)ZKf>pGq>nL@48b;YAKkG<q4H_y-KL{7e75iXDlNDRszs6DY)_tc
zCDQE!$qYHf{*Wa7p;6zZLVFvphrYp1Tbl`;X<C^JxJrYea9BXrS5mxk5>!|EdA5q4
z;Fny&qF#x*rR&!CNnrTNh^$i7wXL(ja@}gV0h-49sKfGkC#OtZmXO<yjBoc&T-{0^
zmoHxp|FzH|C|!1PHdy_i7iS_&WVRo@{+u=nS$$-(@UX}JII2NHR^X(lVGjDuSLBGM
zpm^PLi+9T?v;R`B_x#OJi7xIQBUz%R{^y`u^LyITID`S<Wj_lswW)htYTCXj&Jp$Q
z?&;%mc$~#Mgbp#kdC>VS?{LX8-PNG~N7p+?N7{bdq8-~E+a25P*tTtTT(OgmIvv}#
zZQHhOt8(*w`?vSG=iYPve8yWfM!nCdrMc!@D^Ioo)gy~v5~12%51raeeTh=w@7$7H
zCf%%oehNy+jz=Ny`w8XI-n(rA_R);uzw{3|gnVgULMMTf@x)KA-(g<!Xzqtg;E+YX
zyNSLe&6zABWK8z3&~A#e<iT^~)7tPTTolMyr&uH%^wxMN9xG_;Pn-b-f8Zc96QYOt
z8QgI|s|mlPYUQLP;!pt}wIZ9fvn`*&S2r%gB1$o>#zEV>7UAs|<O@qP49Lr*Hk2B$
zJ}4e*WkuznN*x`!U(yo)?zlP%P;p^054}A{)7mj+j(@S-p3f^`@F}-JRNl`Y?^yz8
z&{<8$?my&?+xGV9uOt{ay150vWlEy&z%_#-&eQg|WKh(u)vBVjyZ`Av!ULZ|L1UIL
zZ|m@{&p{=jq3z+d+u?NiOX7|~`=x$4IlI%*_Z8@<{Ik46FeA7?up+fCG2gwW19r6p
zURe*9$*&AH(Z;*teZc$jePp%E6Ln;EV_=gn)}%oTwI`exxt*y9ItgptW0%AB#!hxt
z<)vB-nm#$2G?_?8raEZLNNLsX%cY$pexYLBm)6-EhN}uaPFD?Q4`N{I?@jizXLNQp
zvYr-YbQevdi$R9Xj>Wg$#ulk|HD4Whh(RW0IR~^q9`!hvR`tVE<_rl*cy=n6%PVnI
zrDV5Sm5R0x?r5A(&3XaV9qfd+)KsTmE$kKX-IaOs6Rb0B1U!zqzPXb#xW-ajU7J&n
zCOuo{@SYNQjso=63ydk;Mzgxrjo*L^yEK3)$SHZ3kxcP2WE=8`)JnNr+UT{PNiV2o
z2eoi7)eaw`{rK6?LLUeDnhUPEY1%MNdtK$tVUF(%f8gzePp(JP=%_L6e`xpH7bX5W
z-6DTNg))KaAG&m3yq93#E(cQQnW&;sA;bM7u^gbtTS~a^nL?IP+|+XogH^YyJ7dS3
ze)B%d9b--*VX^~57ei=6YiAn#vjKJ@RwTW?nFx?bA9yZ6<XNZtaiU5P691SrzQoG>
z+a#@Ao&pwec8{`YGw|`XV9fLzHP=i2x}wSp!e7<S&o)Rev<uUhUQ7o0-cn4(Pw<DY
z@j5};CK(<Zzu#9rmfAnDLg69SLJ1@~wACBz{tnYvy%darrIzGX@oSkPsGm$L@aUS9
zXg3~`?<3vj@q275Vd;gombwg?J+wyP?+k4@%~L8$8`AL&yEgRFW<`*9?n{1hR(^&I
z^Bim>l0!J+XI`&;B$@<ZKxWtWZ@(f`A@T*edRQV&%qgg2p1PrRD^RH9$`#-jZt`JV
zsHSmQQou7!RGe~f0cbZ08$<HZlDw$BYCriCLCnepQ1`Ix1j?r}VlW*SEWE1+#k?78
zHIB%N4F1^g7YY>YST1Z43F!2|<Udr}rN>KnKL>WgLAHO~TX<IYfrkyj8N!B$?YOux
z4zB#x-xi*hcHvZ+nWKT1``GuhQwhyMG~$c%%gYG{Dc7~i6pfGt8gguPZ!c9aS55Xl
zdKIWxzHiRMoaUSG2=^+?uWc1j!zaxR?RH8mj%Mhd!y`-(%rx64p)j*KD-xTD{=<Rx
z{c?tx5o@{;rOKS2%@l2u21Z-<zam)m9W1UR9M=cVKq`ZW4*W>1=z4X(w+rrwS@S>P
zR&KHo4w-X|r(0S4Se1_vgsxeZSUO4q>_fe~NJx+}7qKiVUb;z`D=uNPWz=}n-{$IK
zYte=jwDIeEOUNODA?WQZiU<1;Jb(gezi3vC)rN1cbFjh9^f<-h!_fMLzs94#M4ZR5
zkh2AO;xH>hpg__-#1q{}M0N0`wdixBJq|@awq)79$x6IS@TYA~?xwahl+*z=qxO=W
z6>wqKy1I{Q%@H6b{Pkm>?h2_eu;7dG+?BQ64j^A4P1~&E_zRpIgz+~75TCD8G2~B?
zZ2%>I5>ez#H8xc_N0xE;CAff{d?+Hof@iiXcRT_qxB;~3lyh6MI27jcHeFR@d|d^G
zN3MVq-le52`6G8@lddmF(uZx8bNLIBlmT-(LY9FaC+^^Qrb}*J4Y+KyCD-gCwU$V{
zq-XybJF8U&HUckqh9jY7uR!8nM@Bq$MbB=8HDXn<DN%c$xmjg1_C<leK4iPDwP#GD
z1u|G`7pNNMde~&>7sUBQjZ61t@IxMxF3gnvL3gD-o$Ei@`4ZIaMKP8r3L|eSe8vNM
zjB=fNFx;WyP%Rs6XBT6uq+ny2e(inZ=nj)<%$tx{y#695AK`FLMnd=14+lV0oi5}!
zJ5BX$*KfM*--U2qrQ%a6Xq{W#(d27?P;^<?DFH*!C?{&Ae7LcZPA_J9Hltq~e?BZp
zw&}fm&EYHg>B}aTR)^1<VwC!{NDm;`R+FTo@M;Vuy<oyXci!5Zc4G?qA7HT>$!Z(x
zGwF~Q35=FwejY2ftq*fi4;rlXqSSD3(9?nN5HgeV7zfA1)_&a;Y?8FnN9IG65&nMc
zV8HbH-B9k_-JuQXQX3DIZ))BA(&elAdI`_5L`t|V2FAb$JTt>bEWn$0N(7FhwEA{m
zbuW~>v7KL^*BI{52JK*Z!os7zu7FIb^laZ#xfE@i&s5-D2{=#27W*cC9QONi!7njb
z$$s`ZbajpL5fF%;3Ez`_$_dA?AT56iFITS8TTQsE*-8d;YqHo}I7iO)DMt-mtwyp!
z?f86xvyA!PpvKN^>_FigZc}Y`I?w$I>Y66hzs1DtGf=7Mu$y>LQj1PhHcv0yj#Z6%
z8|2JTEn{5ZCp7qho0zP>*IJep2|tF9V5fCKL(TLpLA?;BvhTJL;U#=rmfuBuH%h>%
zuva|~7XGO>zJ|3M`(Wyk(IP&<^}`D*wN(GuqppSz2?-4Z{Y``V!hpqu8o|$MoDm6$
z__4kpZ9Ur|HzUYdUz+|N2@DTj{s5G3uA3#cb4CZsyK#+TslIRUu5hAp$4%kL2iq<a
zdP)gcw?jBxnT3xJ1bujYnK>n$Caj8~akE&mov&YWjk?knpo{npFVojIQlS%30#|kC
z$#(1r<v*mgF5I2X!%KmL>yDv#m4wuWPkmlABgL_vHTa=mJ)tX&4C(kXurk@5m!k>d
zy-(+L0l@m*0(?ZyNEp)iDs;Q`rv1*`K|rUt<^jQbh4zas{9L}@;U!|L>YR%VOYAXl
z9`|U?{$jd;MLRcObq)Fg%gIE_M0AMH6~DzSyKWb7W+fY*nxq#Q4%=-SZ{Lg}z=}`M
zI{V+t?Yc^JRG_+;#aRvk?Wd!9?-Xjf155Vd^|Bw09qB-4-hVJ$;1G1d>)}mD39Lk3
zHT_gy@o}3IUYKUFI*hl+l>T2t{LN7u3kQ+?vlx_gxiV-q<_2(2JJiil>ae%}60ab$
zRYS2NBgj4Z{oSM3?8K2s%D=Kjuz}65UsC)Xf0#5zW$uuQ<5PISZcAnfi$;<snRsbg
zzP#-`Cib^sbM7s{mTT}US8e~N;T!I+-*vaDl7`9kc=`Zj<2|OO47`WAB+H0*^-3&T
z7JmsH-DZT=2Bx-aQ;@uC`jmK#7?w<}pI*1d5{$i3L@$I=%eC)-bX+;sY23(u86QvY
z;tyU44z7f|WCO<x0*kmz4%y8^Tu_Q2dwolv=v(Tx8Gem1n51@Fwprtm(8TH;9IVn;
zwo}CVGs@6Wh@7DbPaVf<?vc(pqg-tPqfd9Vrl(dt7VFSMYWv$cQOL4_#}dGywZ|6X
z029vtv=bKnm^m3ZX&-D5+swxq43f=pgPO3joVz+--IAYuH`n<u7sX<Ofxf-I6UZ==
zpT{7&jGPa}Sf*L?-84Hp9=_w`!nj>DU^Y(`jR$zNy68S`%T11aKKbdpS{}NF2F#(X
zm0E6psZMv__p|OGlKUn(4VcvSq21VM+kP*RTSsry;&qidOb-NhLH0qtDZ0c5P=X75
z{DmoA!Z|&^Ld3u(?t9?*<C_g^eoyGTnqpX(0^o289Av`J63@Zz(1Rxwe(nNcyOaft
z&qdXpsA@<jz(W!7=!4*U1K^RwOKuR`jtHS+e#8#k?wNAnU@^WH4bBK_3%i?YTqx0^
zVDLM>4f(`;Koc&+O*hW)<y=~HyW|p7rYUsIBDmP%K!Err_4cLN{DlbnIg1%TU+E6?
z7nk9~0+h~@KCtR^J;W(qdgFAwNU>n;u)lrnZ!*kyRee6;)hVKOZzK%4wafnrnJV;q
zsfm$w$i=t-+bV>s&{`&HVm5)+ypsUO0qc*HcAjRH5+SlXe#V_@4s|vWdT5v=nkRjb
z-n&I8=F7da69{{o+=i8X$WBF2`^287M8SlOV$|N>U!TS_zmOr{JYqhkmyt3SsR?&m
z^|W+$d(;TTo4I&TfyFmGY6$o=@IOvsSzjslXRL2Ncc+}Mgd@@`*grhye{Ge6%Lj%K
zs8DYpyPaP=Z=8vF5CG^Coh2`PepP#wH#7b_+vHUQ%%fqWT;*P{eH1u<-6CG+?Ts;s
z*{W*)xST(@rSlF^{Vj(&2*YuHO-Kgy>Uo{mGbs;ts?)t4dT9F{tWW9Ti&-fZm8lcc
z=}fP3^()D~3zA`2YRxL{qj6XINxvmfh5WXxU^G54@z93ttU!)bV7m%~L8c5vm6YHo
z@Bbo;b+2iFHS<7foJ=FfDzmP9kZ*4dmPWP_{P#Z@5~mmtSgY~<>s%dU^$iFqRC13w
z>x504uaXKjavRaK<oj)8)Ds7nH`1gbttpOHw>_gizv_|Z!dE_2-o3V{38!gk3FrCl
zuWSzX)|sPWhkkH%mYtKX2eZ`^*gX;HZFvbtnwo)%=i5^@6XbhFF;Ag-*$2@ELl`f!
zz4bQOwpwtB2PL@@e)V^<QK$-yGkxW+2HetzwO;Ny6+*I!6+{R^0)41$rS`hjDqUi9
z1>wHlG>Q2XVR<5LcfWWs2P@IACoe#}xotLgY3=1GoYu+uTmcCTB;l#;W>csstqRfj
z0erqRcF9y0v!U$3r40v7T)xb6;|h!-b6tWB=YOdanhGIZA6h-~JYDu^Kw1^>Uo~>r
zml2`~=%=)9V!y?l>$Tk6S^jV^;vBhsJqSmx8sl+pVDRb0(VJ(O-JTqk!V$bI-Jf9Z
z@<LrZn%KES&`L{BI%y{WO8c(klY|Dxyxsv}Ld>Pk1?zbfR3~+YebS2`rBNUwp&yhi
z<0gP7E$)Vc<uFsxc{!)XcGsxwKk_&xCTP|%i%%UFx_-7;1I_MSEXywYh!07VWV*Op
z?5EQ-I@;bKgSl}sDf?1>FDV=M&DU_%6OwT`F%B+F9h^JiQI0cb3VZmLdno)-fR;5X
z$vC_;T0XrEfh;_;ZR08tGcmSk9P0U2#8-Z=3YoEzU_})GVwO-M`ueRwL?HSmY_zOh
zOhc@3HE#n3>r~AtpBvKUwscl?=M!t|C-uA7^T&0aFg)nit|LMu;0&&<VbfM+e5<ED
z_~7mL_sbzjW1;aq%3aSohS56R$O^9ZLiIzD#52M&Le2Nhb&)5#paCUAZ7&vPsF3U=
zo*>2seKj~R#U-26!<_D)#%;tGfVQ<xMU#@2G2knz7Xu)jd-YsQTvXQe!Cim8ooppL
zG1VL$-rDq~+_cUR>kx8+(3u{-e}ZR%YPbrqZ~huzYvCEzB@B~GMLoC@Uao0nX+wC<
zfiPD+xJLPgrylheAQaJv^&eI95-XzyHEmW|l%wjes|?0Sg<UNNAOB+f=xs?ceV_W}
zc7js}{<@2uV=ga$upNzb@drpgGKVo-PLki=t9wpEcW_R5WH>+gI~kN{E>PI1ls?*~
zXTEJ&>CH9E4J81C#XIP&Of{pdf_+Aa@b@WM1t(L4+~6B6?6a01`=ROp8v}BS`E6P?
zyjI&4IsVkr27Zfz`O0E+BpD-E|IC4LhqKsLAx73KdT#1Q>=1P|JjV;zyGHFrr;_}S
zy5>V3O!aN+&tBL+z)%Q{QfX+#PXIlFfBYh0u0*1M7uLwE_Sb^wugGjjI%0){E%v|C
z^XfMoi}{Pc;&-ntw%3hBw=-#rA?nMz$FGmJZFrRn<N5381TuAx{G{S%z)mGYqYX95
zU<?m~)x_yY2hzgzI2i4dKgNeon2TkwE!Sr7*l-&I1{`b-rP#%4Tv*)F4IW-Xw;jPD
zn#Z&TpdLC;8Fq2)0>?^N<*Dpk2d=arGzpI}4dFro^6<E`^zam_lsUXAD*}_jLyTKS
zKv<4`Mnq_I%M3l}v_1=3;RqN!f!XLxslJPc#|MZ4e56I`59Kc%-6_0FqOb=tzHev&
z;5XNQ1n(RO>4rXDBMdiW9V1oxD&`IZFi^)ioLPeHKj=ZcNxZCBuub*nyUp5Y+mRhI
z41L)zhvqDrn;i~}o8?wiOW;33RLA{xG&pAno0%*xT2J#-4=#w}=l2>i(Cm62DlIW^
z;#Jm&s<d!SDrNP^tf5t{hMjRFGIw#AGJj9&H|Z;{8V3`Wuk!hr1FS4Wy3#IkQvyT6
z(`_clWj!EiV1l4BQ@W6s<S7KGJ}YM*Ehl?&i>_bIlDTY8<D<9iwd$2Ma-0eWs`Y{a
zDCB)E!8twF4U;EP)A~j}TpX6WaZ`G)7Kk3IEHEJ<)K`ct;L^2auAa$*z}-#@9lpn@
zhzd?j+&P{tq=Jwb;rluO3WbVD_>>!4;Wt9Zxt(z33<C;XS^45o8I2j$fhMy}433`k
zZO1HAocqD<EmEJ*I;p3IF71<EUaY^TIX+nTb3tXteTu$+G!6Bg8P6N4hNazluf?=n
z&YJT(B>CIk9Jo~dQ!6gNu{hHX?r^L`IZ5Lp%+u_PMqLdRGxqFwx^X42)Gp=I*|@EJ
zhp+#5x_Uok#S@tCb>mxP$_xeXQkM__be1W696gA?FzwvkF=UWltH1Jpg3Pbstv)n&
z7qkqh@XXH{rA9f!UV2`r<wLc>dY}U=MLtNmrH#q{F=Gx^1vKh;=HR=g3MII~9Pt=T
zK8z=^DV`^gKu7(9>viIN1imzmw|0I5626k%cMJ#KYmWlnNvB3#LS{nSz#m>pDqfe<
zIbj<w>z7)>0)d$1DsiV*ZO1*XI$V*ifx@I?=s+x2QaZahIe6s+nLl`lk5<Qzsp4=^
z{L8X?8hC8i><iuBzLb#78SSzO$ZkCUrJMLdois?A>&gCu$Uo{Mhy>1kJherT`r=v>
zJ;-pgqVN_=f?ugZ%{mR(bf+!5W44J{ZEIJWg{}*K)!H@0v9ud=%OPxJWx6bWCMSFy
z`YVY~Hng2g9p(=kOWK~(mGNvKnM&-HO!AiQ2GD6AArQJYss}cgII%Z}!iNmkL`6~v
zOQh3wSr~M;n<cxG(JTKw3&1w#3!ECKbzz{Q74_W2(F}c;Z&B_w$q!utw&I-p@Du~n
zP@1f`PljeH+hD-wwO<kIa8|4R8a`>FlLS^e{c$c)e`b!TcqChJl%MJ##g}+}kH4@c
zn<pda#UmqCRe3SIc2n0iIszG3V6w6EyDN~IN6Xt}2)5(9J!0taG12ueK%maj0B7mA
zSe+prA1sn{dNo#l%o62^-m3dK#eaZ4gDT7HWw2R0eGcn(ca+CfyY>tAb&|DtHOgB`
z&xAL6c$lQqHlLfb6CEv|y}XCKYO?n9>@6m-eP^Dxzg3yiZ~i4#sY4!{wCwU$v>^6&
zsZaKCj^25n0-jqS1nQ0r-mVSQi6-bzm9cP_8iR#t79y1X%U_Om4;&i#)({m{LSmPR
zSb&VMr!eSN=$cdTcv<dv!Z?+R;t~PZdY^iaWTdvW{;jL-vG%dD@;5H~c#3%HugUR@
zx6Kt#bGJ=TGafp9K~@obo;I-C4d<Ow7XL%vla=)YBs6a-e(3Tx3<$d6o7U^#9b>`%
zxfNmM%ilO}s<p@bbCdeqcXLI{bf<Ci*Oo-1GnBvR1CO+srrMp*pv~LGmX=?jpiI&3
zq5xzcVPp4g?R=-^#e1d&wHMCWkRYbz$KV*KM8&ffYA_oH8RVWMa;7H-M-`M>xtL`I
z2t=)?vwvr$8MDwd-CUBPr!cg}J7Krb<WSVZttoOqizN9)P+-EfS|kIZi<_`3knfbZ
z+%R4l0t;P*mPdm=ug(Ml^^R76=g2S?C7HJ9{>-@cb0GU*xugi<UDXe5>3+s0D2t-3
zs3<$J;5Ajs66eT-)oxMb_w=ft8SO=8SM~DG1^PRl3A{1-i~+?+ggIh29k!jMT+!C@
z#QZu4eK1MML;YYBUVQiJo$lbPEbU=zP<yiF`_kV}!^Kvz-PU$q8_4(*f1qI^7o5f3
zHostPIp+fP5W`BjCCOlN5p1Y8!Z}S&OXNhg?;@-;RRvik_JT!?kM{ZoBnTFo;<*i;
z-Zr2B6Efo+1e`W3ckJgBsq8Dk+2siH)Mq&lovUqQLeNR(#ViFKTl+VA@R!17fzLm;
z?}GKmHjCUx296Bi?j>812<>+tD*K7Iku7*Y5xgRNgtzaCr0VI#yb-pRhCRb$-ef@I
zn|&SM-tTY7=1`s(vLu)@BFQ7Kev^6-%FpA!(DRf%mH3S(%Wn2QC5uugGubN_g^PEe
z0r}<2m0WNhbFsR(2Bht0!XdsNMx1hHQo5eKTsEgz1Zx#V^?Ln1w&Jlrv(nI`&-8tJ
z=gUno=1LIm{`xfz(463Spok!$e*Za&pg@bn(I0$0z_WzDhn9|aXsuWeS<D(xGZt?5
zb~6*MZ-=TQPOYk}1<9oC?XjCM6CSH=7!N7dJ$EEY6?V~!-D_DUt`>l%-O;YmkYUyF
z8CZm^4{}!iOY(ZWUZYXGurhBbrHm13uK8DnsEdp`KgZ}>>#(%v*yV7b1$7)U4_7m^
zNEpO}7hH$GFk7<HjUzj1h)i+zf$>u6pBlJfgm}8C?&mXF&l18Q1^H(gL6T^cE~z3R
zyge1ux;T;KVFP}vUCxL51Rqo4Fl=V&Xt6f_l>9a<ZiByMfa6UII#nc;zM5@a(XsyB
z-RHxx=eBnQ7#=2xRR|9yB5I2?huwcKrPC1H8G0Y@-v4cQ2${zb+6xW^`WB$d5SGb-
zx$(6+ZAR>7OTah8%bmX*1<l_Mwim~{aUZRpA$bgt71IxMKX;R7j$)Q{6#tkWjhViz
z8pq5E4?c6_5oQaQ+$9Lkl*=+;zXB5&b{K&vW+niio9tDWZvoG1J$hlfpuDr9d_OwY
zEkY!(!{|nI`n2>y`P4avDls?y4Ot4bwT>$;k8R_JXOb4^zD~&HtQ?y@(4H&IIhkq!
zGw&Nq__c40uTbesXNs%z#*-#D?cDuLDbCCH%rWaB50#&eJahVsIA5#nguM#`9JtHY
z-`!E8^^sYrl8*#$Eg?Jg)0`6_+t6GTAkh!i0Yw_%<j{oxcDODJGPTQ7$~@2?=aCLV
zoOTNW$j-eYhyvD>8Ys|Q-9=ri=t$Fxacij5LcUT9ZMPNe9jV{h)gx$HK!=9f$7nM`
z?vw(T6)f3grBl4pI!5U>2}`)<(i&?D_%j4SmR*dF>ZHx{ba|?b)$TSh#;Zh`iH^(Z
z6$3)Ui=02fk#jW>$}kjde!}jVy78lmH{4KlNito0hb}$j^iUc`&5MbnbczH^8J#0L
zhX6kQaZV``r#b6+WO=oSxs~_@^(oN_<xP!H6w9mxINsILs~mM`Ha#7X&gwg&q4kW}
z+6eX(@4i?UTb@Wdtr_B+;bKi72$JmH_WUz=6m83Kg?X+!bci#7*vN<pvsqk*Wt7pa
zj9^6hWjKOoD2(|(aYO)}v3%CZYezb0SxtO6Cj1b&O{bF#-A*LgJSB6S_3&Lb+bx|*
zjQity<ep~Bnb=AE_0Z)exF&({gmH6~?p28JtQ{tTK@s<QzcH9qjc{!b7-yep8t>ww
zO=FWv6?gyf9sfUT<pf7W1qBAqS6VNAJ>~?5-gmAe24pwYBg}Y7moyzNBR8DRPC&lv
zVSWu2M*Ondn0zhI_J%>oTn4pO<PuqDozAaMlDQO5(){AZ<N<$#@P|Z$_0-W|^J$}1
zPIPD;D963pP)fz2idoovB^8Y`W;SW%RRVbKKd=nDjhYVHLD(#N_u3J^LP)+FAR?);
zVlrXKsaW_wdGx@AJu>CSxHRxv;ml;rHKpE=Zl_?V1S|fn_B%%diE4)E`H9%35ctgi
zw2&``1M%dKouI(!IN&7+`b_qG*V(J0G07xF`GpJqAwK2>Vi(JDdTnL##8rB1>VT40
z?4<Tzr?rhMk{#bsqQ9si$f132c8B2FkN+C>Gh3t6oRtzF|E?)HPIEMkZ#4_f3rr|<
z8`yE9j(A}?m2`#^nS<iDmi_#mf!~^7k>l=9N9{l4BBkhGo;WpDgc^x-K`4wlh2@Mn
zV|q*IG{v>lT9aDyE^d#en|&qEFWfhm`<jET4i5U`ful?GJn3R0!GfS)yd^Rx>)sEz
z#YHq3KDDp2WtV#*gH^92D1wG>7N{{4S{wUfxEs;={jMB*Au~plAV@B@xC8er@9!~<
zu(Prm)B|&+y__P+8uhm3-<J}|gFsCQ?{cAOaF@%(*kq(yqesTql^?ERVTJQJLIN_N
z@L<eicpM{k6`3^lqaYx{hy|X)A|tJYAFCQMPaDhb(LiNSewhIGVR?ljs&ve=9M7If
z>Bc+t2YAF?QbRbWy3mA8(fTbry&VS2bSQg&i&0c6Nk4@V22~QmkGa5j($q`;qe~M(
zXJmW?=$2RUccty`K3@xVveG1^n>$I2|B{<!5&QVP@=b`m(%q)~O=B@lwxFJM@8{;X
zW03z|1RZH#GX`c`E}F4i&avyHnzt0DT5T=)d*Tg%7)Z%|7n<f1<p?CExz}|jnQ`Ve
zex;g1^%g@Kdz+C6WBV-@Edp2DS8jd=f9{4Rq0JbhoHW+Zd+uM9RrI^lVu~ToMZLc!
z;7X`aX!|XNHcwureDMGu6w%)dMS*ysyF&b~;TEOeFqbORNiX0C8)<*#NR6<S+a$|6
zWBFt;IriQ%&EXdrk%H|h89iaeg0_3$?d6SPE~R|@CV)Zq3l4zaOQyWddVF2sbeB-4
zl;<L!8(U4_`QO9kf1+o%|4$-^Y@A;g|C~b!{0T8v1@A)|K{By1mcLCQ4no?94%>uX
zsUxfi-tWrb=DsRVHU49a(MRma2zVx7Go$G2$b24E2qFUe>>}lRd%je-)!&d4i-9V3
z!mNan_@#Hh@umh4{k?^@;Tm<5SZJ4{mJ6j>F@AgCmyP_khk;Wi!5%32RgG2o$KW~c
z+39vetQ(cRfgCyBQaZjoNqd3^9`30$#5Nf(MC|-g2Ys^szUn7sEwC<}V28Z<sSoUH
zY=C=C)S?!BQ;ibD6d!jIKs2I58IV+zNwy-aLo4KtY&~Gya&s6}tK{qd%Z7_eU|Qo_
zE+Q+%rcIVK3EVkbZOrMIP%`p>jSE^uLfzB++H_F~N-wH)jSB%evFm>YRYW$>1S9|S
zA-bFkG+>Xjw4n6&@VN8ExuVp%g`CSMp3R+T!61bPLhp?Zfq}bwIKaWFCd4t@+J;jL
z4KR`baLP~iR#q3{?TgXrTk|=V&JPjvgWwMcf+$TvW~pXEErGbeLKczTp9Pb<jRgrl
zsXZ>M*57skT67^Nn)arB>(dYwmEe6=dY$==K^8O>D5lTA)!^t|e61Nj!mJ|1WcxSG
z(n*NeZ#AO{L^8MVnMHIxZ#1@3=4mMdjC8##mAUBM5u9Ovx|2A8@Ib9U;3*WdcX^qF
zbN3S5iJC@ct(FYN)OQa2@4m#^L!SG7f|6?!XoEhNA4ZgLXHTz5mZ&D^lY4-a3caR}
zH9D9$FXrRA6lD9jBPA&nP1rleMImo{9HTjZxtZoq1wV7FRb`Rox1~&S&%$8&*YAqr
z55nF#gI-Dk)K}VJn_*%rob*=9!>+Zk+nR%$@eF(J_|GkeUd#3@EC|m5DpX0s5OaBt
zHB?=4C{&!E^ry^^ags#Mse0U@Ls~6fw4hX_0=UrWwf7~jHZboe_>ZeW8Z<h-n?26d
zWTV+}BK+sG*oQ-qu2^m`>~?wJKgOK54S!Z+n{C5jcCpi0<woJ3lcx63?zAx=Sp4i#
z;bX{L6sZbN<Nthd$4XXcnpWxmeOY~|T^!#?nkhVHF=8TZlC^YAbv`&cv@B5JPqtU*
z7@2=|*<RG`o&>xB>O`tz;NhDviA|WS-4T?lZ`uVa%rl;!IJ1X2s{CZ35E&yafmE|P
z{IxBMUm_?jg`|V0qH4poTD3%Og|Yhwwgk|h9XIBTH&_HlqPxr~RKTK50G-sgSGroU
zWoi!Ec2{hEabMZ~FrI1_7RI$ppzypXC9y>F*3!$#%u??iZF-CX$~0y`O7^|C2sKvV
zxOR=yfe!n|4G$jZKXH1pDc-=4noo@^D~hPAA9U0qfd&@Y)G>#*30<BWHHa!Joaj2b
z_qM8Pst9_oNg?|x)81L|SML?vo0;@|(FHWaCaUG*=>HbN*}p=#_qV0auN2(j8o60w
zS#Z8B31Mb0-8ULR<SYtd=2oiK-@;3y8r>iTLGhys);8>rl*$JDx8VJlwoO>P(OG{A
zJMxF@Dl%P6zTONwvMewHAyz4uFtQGWz#{qU`x2)L8S4A)^sdn7aG=sJYSarN;&)Z!
zT8A4yOR+}9402$D&{8bgFVYXSqrX}2^FUw#j%jXcK|uo4inX08DCZvT<!S4|UzSAX
z)IfIBM{0S70_e?eeJKocGLJ@a5yC*61X)Er%D*QhrNMq*)an&1uTAj^7-t=lqLoN_
zWGx?FtJX^l!@SAvPOJxn-ZrkO%NF1fg;=!trFVn5U-nDo6`Sz*A0hIqcN7^l&|m$V
z>LBuiHq^^uI^|xRqx0u+0@|rNH73=EaTQEW>embKKr#H~R{~%4AahFq2LYO4+J6@g
zqsXi1t24*1Ox?y3%ky3tmdAP<Kf?&TPpajs8#>z0PYKbHC-SrOya$|fLv3GZNo1SS
z1Z2ecFm{QTIfz4oyl3*`y$#M6XVJ~D^+eXq6^YcS$KzB3tU|?5ow;!%U&L}xer_T8
zTl|#yz`ILV;n!G?;v9fCa*G~l7k@%oa#)LDUJyq$Q~%s$pdJsL-@EP6;hB2<Nh=Qj
zGwo}v%6b5}O5$)P<ZeD^TFihWpcr?+UXfBbi$qQ}F*}Zab?jNp^_W|eA&g)a&VuWl
zAf@v!HP;N^fOgaaS&P)-$z>!jxqqO+?=$RhJ_O_MME$iO7K8*_{cK@tJ82IY!HdQ=
zm_RT{UKkN&6y*|x6_J@>w>?f@F0pj(fwo;ah=339ZF<6@t}@0M6;^RpSp`dqj<Dh~
zdXuHqvu8TWRIRC$fpJtf%-ffBR7{!|IoUTl?01tM)8)BJ9RIVd{I(N<x*d>(^eOrw
z&f^?kdOJ0u)LzPK$MkEiZ!;>SoAM)!lqp_&8xm3ep~{h=fX)>AUBxzl<^T=bOxr8c
z(nc%l0jkJcdGRJq!*4yfb{(;Lt@a#!VQ&V@A#MezW2j-LaSuH)*#v3*aHNYE1@-Uf
zMmmF1pcd<tSn<pXx3@+Oi+%PDRD&-e?sRKE_8x5MK$sNlFb$_-4i)>^-lcE4ECg-?
z^ruy7a*g&&e$t6W;xj(4k92F>soM8c$|W#LM^>=l?xTh65T_9Q#IuCrF;A9+Ac8+L
zeOG5M960ADyiEF)(q4k)x(oLR)&EthyJs{tJWT=}go*M`?=hT9)x4-K2OsQ^P4B5H
z8T7~wV&n#0md5rc;F$F80etwbLW)#ssn?o8S$8}0AkUcCOIV@jRisQlstGz#dta44
zxzZyj@0E+d$C($htSJtmL3_FGOKd>5RHL3&Z%I;j9TgljLj{f7|F<&#H`Va(LkAJ(
z<kP8&JWCSJMu!F;hRhZC9D#gC#>B`)H|)9E{jep%VPA%H*OILBKYmDI7eTr%3ui};
zI%F3aGgcPBVD~dfG8b0yyfeWKYjtVzFSf5l5IMqF_9Lw914W|Pupw%-|BW#7T2mtI
ztyeST<`F7rjjU0?QG*I2ubiuru)eQLs(UzNaaiG&YEKEO;r>0%VWVc7GILsE+Hh>4
zVnqX)HEqS$E0J@hn_6L0%fCx;1_>@_ghE`O2L_ID_Xk%A@{>2RP$-T~?8sl)zk{H_
zdX<f6<#blF9?EF{P^%W*L{hslGDZ1Tyoc%p3i7h&)oZVT(jZ-f&El-so7WFIm|ByJ
z+=~DUeLjZ}H*0h+dQT)cO(DG!S898nBFBI44GPv@tVK8h&fV(!REn>ZG_PeoB))Bj
zP-0!J5$*L^Lu;7mnC|KDPoiR66EJ~mZ@u!YmZy^2#-=vE52xEb1Cd0<8di7_RApl;
z6EB{$aj%yaB0!41z)Pu)7sH(Coz#stts058;#6FW&d0ZS!m_ssJ`yYRUAFoj4bZuy
zdv&uTj<W$Yr)^28qmAX8pkiHXWvcj;O&^-mz-=yalO_ORZqa08=J>Ax5t!ZlbF!%c
zM;sT3G4^Y$asl&*KU;(PsPazk{QFO}JYPAr6~T~4uz)_qyp?4BWa}BL`>n~|{Xm#!
zHSV}QI5gmplUvfTvk#QnU|5-7J?%hE_P&VAX~L^Ng3|tAO>@3oD#V?+_LHL}K1)x8
z6_;U<^=VhG{&2H|8!cB5<ItWqKQz=Y4vXkB87W;w-;o7OQPreYF)Tdx5w0IfgLwc;
zG24O99dpAr)ostkj8y;XZV}3)aQNp(CE0QV-o;5}YujU^fD+r<3-#+yBT6jb$GY+m
z8lM!U-Ef*;JJ4sh1VY4jsc-P9>(8;bV^MR)o<~jq@32KyK^)c;z0il<EsjUtHxP6N
zZi0##ydk(g_#1wys2^+v@I1%YLl2)+k^1D~Ks$`V7Jw*FJA3PS-?``E{%>{B9A7-{
zzWJ%L<rN`%*Op>!Xl}v?37hBsTAVgJ|J=JvXGhF^wxT*7M+3H}Sh^gP%qvpFCu{HK
zkgLaFM1^6vy(VN93j~RmWjTAQy}{apICrYOixfnO-1b}CW;t-e(KnH1fUwR{K>fgq
zU~r5W84a~LGdH%BwNqe3>jvG(OaEZ!s;pGb)_-#(Hds(%9_NjzY{$A+WG=fhX-(vv
zD~Rk-ZKTiLxZpMK#Pz+x@M*l~$yO`JH)6`OV|Cmtmta+yTy{Rj-tp=$a+%^8+p1~V
zf$kn%PJ@~)PE*rFCYO6BJ0Il=ru6HG_Ko+~?jl*FmZD&3!f<HT)Iaw$xg>R(?2K0|
zSQda$k&_BHod3b`{x?{UlNaUsVs)4MJ2!9J5#Rv%*_HIVw$cB%9TYeBsI_7k(p|&w
z%ROjIbrjeVwOR!rh*(D<+%z!PeK~^%G2Ae_SauEhAMEwCDDovsa@GFFExs#$!9&&k
zOGk#p0E1Kn5#bbs8&^H@(9`zz=&MnDP*0NUgz{Ooar*VHA6ZL54W%S4`Qb=%f-bBX
zOq-rR+>>KH8@Fe-#J~f*ige7X8NrZP?4^;(G=ep>s*%h`tEqCk7b8i8WBSi&iFhJd
z$ifQBGmO3>{Lzb)a(@luxi%&HsQcOb`(|MqajOO*tGDPs2^VBTRM(he*AebZ9a?)u
zoSP?^w86#1%-~vOL+VrmLgKgCZk+GXaJOI|h2JKL95kj1Yi9NEY2?9uR!;Hwq6e&W
zH~&Rh&ILe;m173nb0+{ed_m383C9R?9cvAlhtX3=_Iwk*ChZrUXQ{&<`~iJG?1ahO
zj(1@@{TFd_Ic{2q8N_!zNRo$j1)WQQbMamKc+EaCsvi{2Rq$wHOH1n=pBEH&k+{st
z<>q~MXl=UI`tB>0je#CY1&y0E2Pun9S-C8srW>Np4PMrR`u06{iAl99af(}hN)@>m
ztfky`O<KMq@4sH#&}zupwL3OZ7qtG??d!&D1|B&J;LwS~@l<KpFaAQF?FV#XB%E2q
z>Ba2rK;W<V-F_sPEJ#1~xYT75!;yuhd1d1m;hK`6XvItE+MmDgVb}ch842T`+OX+P
zvQ_*$`pzw65E?rR;%*vd0Y7GtHoVEOn0E5?MK+@EMf%-atIAdynIb}{giaWCb&~Z`
z>fsC18;w{Z(vOLWpyG|K*m*M1O3JTL>+k*3iA}NQS#Uxqk8(smT-&4mfxAPoDyexZ
zeCS}y`?v{qG&}6HLn%Br?^q=WQ+^=9nGMWQ-Yl86e~Z0@ea(`GQt5PlH;&)GNPYF!
z!lAY{|CsT~>Y9&Y$QjjVw|&-Z?MF}J@{c1+X?X8&%udG<GoH&-iek#qdGeA#Mmdch
zn0DO5gVLp_j^)gbyPY<;8E(<ROPT6OJu2W7=XrJCqPIfi;~I6}1No6&tf+G$a4qU-
z&uJ#NT9+VosoL#14VA#`<tKX<3LUe&sXU=feVnaN_P$|m&HZwzubkBii&Y-&(Vc?8
zE;jSy^YUn>DQ&lRFoK#dHb?~r$oik^r#a>urqw`kB^EHkX5@tk=6FK8zk(MBGj&1^
z=8-;?h)tU<z<v^B52k!EjBu(){-~=7bnZo~tvjE{>vYR!@1N#nXwx!+X2EVh{h>_f
zGR4;hu1L%v>cH=e9vwWMxhTtu#2#v>+|Mz<Zn*ldpl-RrDsIbMYsBb%`-zcLrtB9J
zm@`o5vQZa1!s~{aE!XYCD%Ay$PugL?)c5~SsNxrh!3G-b0MO7USmFxp1`=0~EDP4p
zFZ%@(n1T5yn6qy~-8f`_By!h+5HQpuTnnNh&g|Wru%AB}cG_(#WGpQ~Im3i@6F|4a
zA(`UFPEAkI4BWKrCggg(5<s@aX`VYf$|0UFOG28-brwLghSA+aPFBbz4kx8zLD6Nn
zn!v4mGfiW{VgT8c%or6a-w)3fmQF~UW>D02DIozkyn?nw4&XX7mCPdoG@bwQLpr7n
z7@)~?t|`OpW|t&VaCD-dp$yvvFUb)MP)zyhy^7Wfwa1`pR)?t&{s?SMG(59<!O04S
zXwU$uWHzKwbaU)@Q7s%`TXX(iqxPTT;}PG#;t=949yE4}mL&pK%MJ?efh0jQCc5i;
z*HmankGE8l&Q+Tkif7knA#{<pMT^ewv%4TpYtSqk7@?eYBGT4KvyEpl=qXIsNSp!(
z7qS!z0%3)EIUv<!`g@1?c1axFhXGSqPDO&BL`xg~`J8hSZ(Cr$+&|RHN=aOojj;xf
z`o@IB3)qLahoc>j)I#nXSG10cGt;O?dGc2kewG&6m}9t)kmr3hWI2<UQ+$p!OhJI&
zf)k7$@BR=y79S!3w#yM!Mylo1n?%K2J=({zVzRpFHB+siW=uBMHOy*jtEUznv$jPW
zF<`Vw$7ke*PksJ`yP-&Q?!-q_lR_BnLjwW9_v&(d?c_U3LzH`D*^&1AS@qqWj7K?1
zt9r%v#Nn5Nbmm5|7Y)gIAG2`mrl9C0(&UHzWHTj?a@DiDs9d!rsc~FUy%#}7FRO=W
zs8MU^^kz3XAS0o6{g}|NVS<aKlBR{SRA>U)m2{n}Ju*pm#*C;dNoR;QSTmmtirL_%
zkDNsjUJQG+2n&!tIAfK8Qr56IGfsS4Mm<*E+QuAZr-AP;>JYh%ZEt-%5!(7QQ((+D
zrpUGyh98*P+bc2VH?(uFuTZ141oQXQqW8(Xj+0DF4VjE-7Jum_TK$gSPG{q#XX)&y
zxTrjxPvJU#xG}Bj3OZG#T909GF7V>k<TRbQXisx<iEpmm5l0T?IQzE-HC4(ft7S!q
zzXU=u95SW2uQ{Kq^iJYDe(>fx?%q(zN7<GRUNWOAB|hufCZ6fPUGpNhxw?i`fEX15
zJ)rxotg2!#6m1pMQ`m5}FkdKuc%y-ZQr~5xPqn3X281&M@8nr^myzNd9v!oyJOYau
zkCE*E@Wdk2h-rr3R^7ELo(E+v+jUH=0kz+laXC^;A*T0!YseSr{q1#H_}pL4V4Iy#
zxHLT&?V<t?z)G=(xZElomYG1*SJw@9yddX4!K7-wtYP4FU-3+2$&y3Hx}wlUNQt@;
zBFd+37wl^aTY{bL91J6~_dl}cYW?#=@}6Bli068I9QC^q<rXYB$=%DqEzOLq_S8Lk
zdq<c+!`_2nM&#|>J#-vA&p$np|86G!zmgCK6XI>?BJstCUmY(0RoVF4WBL2uuDp7O
zZ1o+y-QT8DaOZiA3q%(ewMu>>!l}c=f0e@&=Q=a7QGU?1R)&0`706v_kr3!L)-=6|
za-&mmGL*)A0nW@thK3XdwF4u=R}92xbU_7j>`Dr#A6D(`SwE2FDMCb$9n04hSwo`O
zuj|c~;y?VmDgx+k?;+>Q*vk&Git-v)*7h{>J47~P9cjtPQwerwC3g7xV<93W78I@W
z38-ouhzV8PNbMNc78=*X*l~)=VsACJ*M%~Q7?5XvCH?yMZQg!&BR=wbiW}27v1xa#
z^m`W-xbl=mEXNf2|EYqk$*CnjyUOIh^w|msRYs<D$@u0EC4wUjEtT&|f&KzUes$LN
zoP{+*`3mg!q*#7bc3XYK_1LI*FlL<E8zDKtc0Zjj8mp`FEBg8Ru{_e;dEtSy7`OLA
zR$8L9<56{8N_aXj$_xnSD+<#~=)$_L+tsLW1Je3D**?jV;7l?cs9zdltia+Dyw6kA
zzx{p>zA}f~@lc_EGmu;%2%#hUpxt5mlsQlD(WA!83I^7hWD@PtkXOSehWe155IEO>
z+#~?46D@O<9t(XuFwSKqLMTZ1B|xVFn6qy@P{CFd?n&b^Xq)TrqV3v4v)QFPU`w-!
z0e~jTxfnS1+A8O<3Gn2tXY?5);SUwU{-P1Q4P~vV^RL{4ZcR0+bV0;7^SwsP53o?~
zkAplU6{7%1wgAxJ7u{i1#yqlQg*RAv1MwZ#%D0}YgSnNFc&Wq1PRej94@4jUx92=}
zc^5(OypzmlOt&a(qAPSsO(hkc>rinn40e3mE{aKYSy<^^l7ljFh`3BLb#CA@XYWZ*
zo7YLjMPm|BXwJJ0lJcnd_IAgdH+1gT-#H?*vFmFsjb7wBCGF=$?CaL-ULTxu7tW*P
zs<htNN6B9YUX$QDnq2LToov}Z13GBUm~wq1vTTvjR&_GO!j<(nAr@X!i0_~xEI$5#
z2g_)Y;2fEwqg0q@)lFqXF}d|+3@GwcskpJ<Ue2%x-6z!*wr5%nDF;YR>n+kHsn4)C
zsv`5zicMnM#4vcBc^g>i4IO5+lOQ~yEc$a>T)CcF1bRqayl&oPGHlX!vn)vH42f)P
z==7c+<%x7!T!><Oul~M^sy`-GxT&Q@sY=5ZfSAJ0kfqG!gC<3fMj98WI}`2Isu_Hb
z+G#1IUMUTDF8rWQaS`@adRDxRG0r)1H>KDdPrx$*ClNW+vVHc((tqXyWkIxusYYb>
zMIB5y8DWTZjd;uOf*5+<FEEU0ivomd`JEvlEvnG8udV3I=AM&HCWlOgpUPoYJ6J=x
z>Rj1F>3)C(bn`OB0BOQERmC1eBM_L2KS(-HXkJPvWYHcDGn@m6{%@Wmh<IC|_w}sx
z?7uS=e|vC$t-0brJh&C32E9cYB-G+696tyqaaLQ%e0%@WOy2^1Ogc1KG?CKtnBWYP
zxV$RS^jxS~9<x9!BJNXVHore_M=xk}=j4*i{_m)8op*IIo*hY6RpZPJQ3)j-zgOdL
zQoe7ZJ)o(YJDKIUEdc7Gra)%MYl#kRSHz-jrMoCbl_1_}_jH>Q7mZ*hwXazJb$Zzi
z&?;jIMpThw^=9^(BpFE#gfD{Q*MFrNHC7=sD4`(z*>-JTS_8}_R(u8Jk?HkJyM8nm
zzIWH#YL6%yKR>~v_CD3v3oWA=wLXADBZVW)>j$@&<LSPz&%pAX9biJtIC~Fr6-9=c
ztN6Imi@%q)*SxoK6<-kxAB!K72ssD5oFE9y5t&(jo&e|eqP%<ccekuAD%&o6Rd{Du
zl}JTR>uk^-WVobs6Pzl@p&JxmYnj@QWX9E(H${{MLZMtE?QOiveC&=m;z=u6uH7y9
z^QW-G3hyC5<>?V!{lW^fbyLdma#i?MqJ~3@RBTGU^-y^8OS7cTB&(ZRMYj;WDY^A^
zLwwF@X6hrYzGSbjX~{(xd2bcusgEfcLat@YTNC_xhda+jWq4n=>m5n<Gp!&{0%auq
zcs93Z9;qhD=KjJ&6-PTOO|=&K1_&EAMq2qT6GhWZ&t!_vCVc&&R8K_e!UMvjds~#i
zgP`pj6I0f`?7(~C&f64g+g)G6t(D6&iYzHXgn2Oxhb~Zxcjmh}YND9lr^5!i2w^yt
zFYwGc_o@FAtx29pD>qYB9vfM8K-S48`t38%dG|1u!@qf0{L|wYPX(QIp{Q5(e6GtR
zllc1K#yT8%p^H9DvK*bT_z4U8;4}ng={YQO0W&@;6xcanhwXuO$)kM$+afchVnv~y
zBYiKn5Ge##YK2^2?Vulqo<)W*OJzC=zOyGnS(-L)4?cr^%kZFVX1ty}$>7Fz-#|86
z?_t^e8I?_QBs(pFIm@D~mufgP2mE>)D2amT!Z?yqHi$#k{>3@lj~0=!3HZDAFb$8P
zWvTRx{EmmH`ap(WmtqwCYXKQMzL|MSFA_)p4qum<qwlMjN|31nD_dv|?PzZJG589(
zmiylwvTk!eL^k?##ubLI^T#57<T)zLr~YM5kJ+R)kFaQw_GW<wH!XPm8N!!(S}m?T
z^L3Hn@~!}A5#>|KN3QU*@{it0j@bj`kY6ilAK(KQ{RSRxh?!@Ld&guvz}c1U+-mQ*
zJ<!xe*t(}7vISiojj)@-);c)n=F=YRrJcf!E_)pRi}H|$y-krh)&$y*>8ITc4bXoH
z75~TNfn-~Nx$OdQ@jos=IHFssF4)#sBya(vv$Okc8GrnTQ~K>9&qQpLU>J@Gmt2mJ
zE!q`^=T~>~Y}7Y71_E~@#>ckWppWHVqQBnLaH{Rle+C`{UzYvJ&Kld6rC?TjOG}o-
zCH4r1R%66uA(%sP><_FVNW-Mfw*0uiUZ_l-MXw!U#Oh7nGIIE+4dxDUV+x=_*M0($
zX?b3`<M%>q`@bA23v7SD19~y1$l5&o4Q&{&o(Ollfx+w-E3D9`pWZrTTYW+;1%1v@
z->9KntlrDZe1;eFzzAQvEj0W=VF_g_1B~41I664aCt^|MI7k6aBz<{rUO}^JHe4j_
zWfur%*p=y>%J$?UEmN_Zz-@)EvfP_$taFppKNt2ECTJ#s@*$`X95-d~(-@U&>qX6#
zn=WEYhy7^Tt({4VhE$X)N)D1FN}cA`jl8W2W=n>JL~?oVf_D4|j)Khvxz*^vzA<>#
zzwiZZ_>rNqm7y7O?D_hdYh2E@)?iAn$2end<W)4Bw`_vkU2R%ltu2=Hd+AwbIjQbx
zdF_#FOUfTgKnl+srmTZYxDET2ELLA)UQ=hdvj645r0aXrfx(!&cV6|iICR0q%grzN
zrv8M?R;0k6lm^kk?t}%&7k_xjXY{?Yu7gu8Y<mz_E!%OWe}J(?qGYJg$Gke^ZWS9Z
zK=*IRq*@f6_>EYcmhwotucj~RIx}rfoUXf=q1{W*tLc`uUuexbqx3D$FrFHPL}PCv
zWU7juyj}!`sf{d6&`@-`)*k0MK~%yK4lzJ7Rl;?}YX)S;P>%LfobPFK){|$Agcfdb
zpk^WIKK;5ts1SH8I=v#4GbKt*)#rBsyIhlAc5-{S1{rMA@eCg6eP+A38vdrIdp|v0
zy;5P%VU$Am8hs*wwwD<)oWm(sqLtUR{}22*b>b!#uV58l=mrrseXHlvL=1@5eV<>3
zG0fe6JtE(+V|)rgQs`9RU6DNSzpk7j&~sqpt-S2?X=v9;h;1@S)7g~wJky?aMyHhF
z^6+m`h47{pV4c1b(*cwI(eV?mpYpfwxOXXXX3oQ1Q-_j>bKX0N)5b{RAAtG^UpI?n
ze%T}Z17f^{Qe#QZZmwn0fjF*iX=c~SNy1FK`skE0mHLin%<91n%OZVkFek6bZL5YU
z+RHbK!90qUw^KnT585#I_A&zVi~KC|I|^#Nd=de@@N>GG0RIz%!N_ZV2|jOfeR{u4
zwp!Q=+ap#eD1V|gszgWL&xWtf4iO&t@Tse2{B9_qEYO*|KWBdP!w8U5{XdlbWmH_<
zwlxX|cPJpZ1`7_sT|(jR?(XiA;Dx&fD}uXg0YQShOK^85_~qGW@AIDb`_BDw|E{&#
zYE`YR+2`ne^ggC6$C%0hk*vTN4mLHYBwh5`Jd(P|LvufzQJFK;XRl#3C+^9MeBzaK
zI=o5(@*VcFao1*iOE;+7qITvd(RmU9xzF&2fsQ~YLyI*Jh3)^q!~ab|!nvVed)s}h
zJpY@T@CX2C^TmCHno+)@seRsln$1g+=MN*>AwYDZFgTa#Bte{=v3Nl4g^nBeYGQ^0
z1hDTA)I1fLfljRITJEPbQyuqr3HN^uh56%cVna2+#L@XFPH5Gl!F$In4vKu=)Ezx$
z(`Y|`K&*v&od*^_$Tt*-eWb7N%hC_2rD4=d_%EQq3Pk+m6y^O}4(}qQ=%~cQrlVAm
zdN3q|++ZkbHbtp8GXKO1Z3}cj%ZJmy5{}>Wa}TKAAjimc>7zN#1uMcs(%K}T^I7bc
z`3T0O_f>|eDYbFT!6M3&S<c$m*1AKsx=I_kkRBI>`--|PT6Dw_dt8+kA)aZK<P-mD
z0ixf<sx|Uyc=6zKmpK9gl(gT7Wj~DM7a%`Tj<Z56i^&zX{#f%P3eVY<zOKMFSmGWV
z#x<WZYjP_#0o5p9WASqq`)zA%XH)E`Pmo9Zz@{=ayZnEa*o84jRdyGf5g5q{!4sr2
z_ofHC^35bs4tm|^D{g$BWmNyn8Eq`a-Y+y86Mr6dc=UTe3zE%9BpIh2REM+0&btuq
zf^iu|CG-2i?3n$w)vB0CU%V^bLNIBq%yEM1X!!G_J2dIY?&01O;Q)K^5-VH}iYzw)
zsXZxZYr-pH)dX}#mt%>`4nXuK3Gy8I6EzJiqiNgyy&bY^nr-Uh?${TwX=)bFBjpl7
zZ1#w}YG~1fO*i^Pt|(_nqu~rjRxUnHY1Wq|^x_-qV&k=G)DnhVL|9ZgmJu*Z$}AGC
zBDgoKZKf`N9Vue&=ULYr+AZ$%=X;-1`g|vOKKzMZ&2jC{;L{Y#&6VzGHt=2Wywv3d
zC0`BW1+dlc763tI!twWbf<Y5d(rmbU?taW~lN;BzHuDRsB-F{BM$E$7&w5~tgX;6=
zhnUorI4_%*=C-|w(0(9BZNfIf8`SC6xN_mv@jkLbv2`>%=f$b$`}eAz_{+!sbPN=`
zw@^!T`4H>3W_n6g`t>wD`iVrqZz8Q$XEtdlnNg{eDBLT`p8=^ZuaIpj+=92-6sUN#
z)E0?-yK8w+k6C;$ic)6-zJIX4kZXg!W0GE+IgC!N*t!jvL06<D*1W~xHYKJ$6uKhB
zrm;DW6<B*!N~5}TaQPwfO)W@T{ZcZ{ZpolnX0UT|V`q{a;GIDA?AOKp%pX#cb>x3S
zwWc1O4M7N8T4_)xeD2I=!$B-4OX5{EdX9v;id!AWfUglYBJK??<1x*oM(_FTt>O0k
zy+#ImV9T4Ct)Et2p_2ho%6EbKaac{It)aBYoIKEpK5M>J<$qHf-o216fD1BSeRsis
zs|`@$-Q3|Kg#N$Mjx1uU;UhNx{>q-Ms_e|Fbt|I;1jd)m=uV!))9;cQZO+SLm6i*5
z3N`c;Cv`-li+VfG<P;-;-)H(!C@cR^Q?yq~y)Y}$Qyb)Q&q8^omq#<3XIE$v;N8)O
z1p13t2mQjvTFjhUsTj)y&ZT(bt<4>IA*~NWZvPEWof0Cj=JBSoCbvJlzixF-j4a;q
z8rp7SEp6Eea?SVzy(2J-NqkxO%%D_JL`xzI*PZjG*&AT-)BJ8FQ&dwzsn4MhW9Iro
z@`RDAZ+&&6XdhsS-j^7p){($X8~r)(k_s?`vQ?@F5H&AcUg~SH`TUgGCIDb{7h2&>
zD52z6Oi}Dlo#iuBa{`F&P8I3sf>jjbWqZM%8$6XM1}0R;IudS?tq#*tv)eW%RyWk-
zIm742`8`<qE^?nhltV>iUE)DJ;LV}v%~pGnWQmSYK_VKXU|L(;4S^2WoDbqULIT3W
zWFzI4fX1}XgQ_jmqjwt+DjSlchq@2y8rWmg<?^+kGWT@BT_SoD!8C(|s$q&pKl!`Q
zxSx4k8~Wy}!GeWf6t7{bq%_D~z;n5B1n6X-cEl<n>~HDFWu~{Zxy)&q>Ka{7ezFe=
zP{SlIngMi21XxUifLlsa!A&@|tAk{I(7VS_haI5(0I%byQ{snTt}2&0dNhBBz9qyu
zv4$|tj)5a&AmfJmM)#EQ{au((#;~-S_<NJHu#MWraP8Pq8<!FXxdncj5(!v4AFu@s
zEYX>^d?IA}c6WeQLetl-FHDFC-88H^AVQW<u(W$)Y8r8rtH#)oU={r!|6>11_F~v)
zbWM?*8nH4p&x&Kdz)A`W<+koVf7gWIsupXhE}$bP=S?p5wuSF+SpMfe9r0_-iHiOD
z&$78J(3_)XM~pG&ByOlA+*~$zS<9dHrhSPE!>D>hlPGF4xY@vOrqweF3LKJ57Mz5n
z(GR$F_uJw&B=?_$mEJBA$sn;$49C}83PcTy@`zFm<cmiJp_|4R;%(h4i<Z!TCBmLa
zj|;0t2i(nGcI^kB%n6CU|E<9emY>C``h09Nj^ce^73jD?Hr^A(qh>#k{N7Yw5{4yI
zZ+%!bDh3#glI28C)S@4TS^Opvh@XbTw}hGr^*20Suhmsuu~Y%F#9o5mOMQ+#!FIv?
zCRLc*)1;p59~i7VJN#kY5>5KtTR&{w3#2A5Kj-$r^sId&I4*;cKiIbAFsutrm3`tr
z{rnu@5!c-z{%+xQ?2Mh^`+sp<igkb6$aG=i{m<=<mGEwO<sKPW-1Jg4yYXA0F48M6
z*Tm*!>sTk)qgxAJ*)1wL5O>+%q{~{WdAMEOn(l8DQ43^OSmMNi@MGsCb(y?>{}BKV
zEl`(BWK^^cOvp>pKa5zJh%(kGX;^X%eim<WhEeGYkWD-)^0hj*ING&<loCKZ<^N?m
z=(I&lK3ZsmyyrPgAVnGb0lw}P0+=DuZbFf<c$c(=hsG{u*t?jr4UY5Q6P8QpwY+~$
z_S;K*AXQje3z=CfQq0ngs0J(zxAGHt-ydXLKBmg})O}*FEFO<}6eYL=(rctslSemD
zm2zk;#K9N?-IUNkqLy3vrA%`+5TUQ%WsC?cfS4d9FcA?(P$V_z83{JwgL~c=zEx66
zxFD2rb57`-vIA)!)c!fY6fI@IJof`j;PSOxL%rI^12Q&n@u-JEC8$%Wux>REc~2TK
zT+T+c8N~%0Zk?|;4q8jM1KNU;EJM=|%t2M=-Y%?Lh?hy7pN!dhf7*gwi*Hl>0lq_5
z+Pdzo<SJV*_420_tLiR4xc#fjV$2y*6Z?B*6WxmjgIqpjtGYV0L0*4NiQ=xw7`)1g
zIfux^KCZ>UQ7*4NbUrtfS}HW;CN<fgqCX$#!Gr0)@w{Cw0r4U&X~xne(lg%@mL{})
z*Me_%)?S8;1mzuPg+>3AIHoQ_8yNDOVQt&5C+@j<4?Mp?l1ke7Bw{tz<})gq#{(Y+
zPR8iYU!o68UY>{EB)&~-d9HhqZJ`_<N?MM>LuDR=<Jd((2-$Xd(M<~mKgfSO`C-Hd
z$QZqw=gM-LF2=Ac!uQmRICFrMw>pswTAV0I;Iux;G94_zvAr4@`38UPO}?g1i>OBG
zJW@-Cp$>n>sW}UWalapA1%4XeuTrVDF29I2r-^+R1J;D{pwP;8AKQ&H_W7%0n0bDv
z{-@m)eF7B{M=3N)-OZ1MM~+%%Hsx=Doz404D+;6WH%ea(?tD-2r%(|}7VKPBz+Iot
z-6G`O-CPgbNfYw$@l3I(xXq66cgE38%vnv)CDZ%t1P-JkLCKRH{)lcF9Tvi$_oery
zf8;+L!EWMFy%VJJC8^xw396~5=PIag!;RNtn!hK89qjEb_y6}Q`YG!Bw{JvFx9=9~
z|0=I9>?QI-giOhZ7=QfYj@BQHOmWF6<lohuVE=P5WBqQ%j&p<knx|c1dExYVq_2Z_
z&9Xj7bA~qwl=J2Ml2D-`SbYev<){+|ilvc#KiKC1isnb6OrBb7D9gJrA<*;G5>u8~
zN&kHS7oeC6sA2C5Mt*mNB;Gm=Ll_p2BzbNn2CV;yE=6@K@Wu(H&i5_}Jbb}rgiwjI
z&$EQQ+jX@c!wn>85sU$zMu0gp#We)rcb+fln%@W2=x|5i*c6VDbs|JswQ`$1EN6Md
z0-c7z4=?iB!*3kh%mNe8jD?!RV@%Dd4s~P)H9?@m=sGnh3@3P9Ey_DAfQY$BZvmaK
zer}L=h$3~Z&zbojxmHxR+-^Vdw9F5`)gen|Z9xFoILjiyX%l<!>|6a8vWm0uUoAh0
z-64J6hvQpzVIE`56CP5b)PtLW>c-{PRil)IeR%b$c%1AQGD;*rvoX48;;@T*12mjV
zOOk=PV{}ux_*1ioxyRID2XUPkDYuJvd@O9oaLNRJMswDD8+9bt%{t;YU0dw6fKwLw
zwXg48Y-6m(+=qO;(B1>uK2!H^36xHPB6+;&Iryb9E~&%n5Pan~Ywf%+35McKo$SKR
zFZB(ps5_|#9m=0&0L$9(Uy(zoe-IJ?6vw`~H`4U*=FjSeUCT=9Wf07<Oic(~sqtQ6
z<**g>OUfO=J_ui>9oVxtf9;jt);aS?HRHbN-$x*o6}umGTzWOqb;FTN+e<lnQf?bj
zc4@Hl_s?D2-E3_ZlFkXT!N+)(>Epffjkfh;mJ5;*A!a}#m$3ZO;~ALmc5|X-&tdpW
zSjAP1V|`vtX;U|qCKF}ZCi_Fa9z=qlU<$d(V1aTyaW<XLVc_z~2PbqwtVh#cHYIb?
z)DpoX5#Bk{{Gx;SnUp-v1ZzK=w!QSa5^^*0(HL7Yopm|@^Hr#rIdkjJy@Wcors8jZ
ziR7u`-z($RI^iNP_6z3>0*-ThLQHt>s6$2u*zN?N5n9DP$Y=__Bh=cjo#XfRRD#$B
zk%$rX3i)y2eIP=n0bA|+jxbilIrmS5M_HeLKEGkx_+mBoZt&f)`ricfzmq)O%|!2p
zz84QeLr;%?->u0-{CSjOgLLmeH5fjL=-4<zyooqlisd66uGkX-d>)z|f}JZzghjC%
zwGqrvkK)4)A`m#apx+7BI&ch-7VaE_(Pq<fW7S@57f)&z(?ImPGDVxEg`6&zNylfn
za4d%wvoEa%vE$A!BJ_KVG*b8r_bQty&C~YTe`GwlI_qi268Lc?Pnkc^LlT7juk!kW
z6S@(sRiO-=uRrV({c(YHoL^xor2IJ`zq6-X=5c7EqoQEf+JYyBW6IC2ERqT9THUU6
zq&Y<7UnZ_T8ExLSD|-No+!H@rn>MsW*q~jh3(?Jm6TKuIcx1N=WqtK4Y+SgfZd^Fe
zI9diBxkkoG<<j82xtJz~f7pSV`&zFs50OuHB1~g3Rk=Z%BDw(ju<cGvDA3vj@px2l
zY^c}+GP%hf!Qq9aj8mB>N8GG8x_OE-#dS?)d~GflbpuCRe6P-QrMq)|WE%W{2ogma
zYaruwp=7ZvB+OyG<?xZ(><4tz)H*8wSJYNB+_S=%j&=dwSOfE?oi?g*J_dDjYFUI5
zH;(IRe|Ion*6xLJiz7(YP<lwE@amZC_-|tJq{C1fwKbyHI>j*msJ|jL17~77vn!Bm
z)69;jV8K~5G4s7R0;G`+ZS{G*8h&;Q0(;@J12+rg>A33uO!*KqE<9j5>0lUyroB%@
zv&pjI07k_a-!6|tmS7N3DhBGCl{<UA8&L$2@S^VLd`H{YjLVngX*)t7JWn`&&P0-L
zUN_QyXJHXh)h)#P?D4txPnVe`B60?|Z&wk>ySl1t?U<w$;@nfK&sR#>=i~u74KK%i
z@IqW(ddG2uyezY-!Q#iqr`6{0pt6VQ_7o7pB~Y39WFOL~GTNmQ6Lg?_*H#4-&}EwD
z#F~Km)1CF0gKOPfP@zGTVC;vWJ4{hX4n`?B{y62+m_q2@(9u{5n?NU#Pm1{b%!f`<
zCAF2UtNVahbA9Z_8xo4tXdJg_p#S^c{1S|zgVV>}J2ISt?KrEBI$^3^s`<%5X=TSQ
zV!uLXqG)N+Zc<T)_%fz(=yKHW-IUZ3v3Bw~qlKA9HwV@9GkRbzRA8k=I6A#o>O2mk
z)s}Cv(Q}LBRw%(zHk?*RfT3(zU(;&kc0uDxY3!`B`1QZCt^cDj{hw4F0v^=cFZ;E|
zv;XXVft1*RE;^JN?0^qyDW&OvZ|G>N)z~%h>ypGHv6}B1jEfm1KpWA<49D(}_}tYy
zcz${b+K18XT-t0^YDJwga(xd)8%=VHC{Zp!PMXIfPN_|L_cHr?I61ZnkfAg=rQGY~
zHTB)+>PoOYaD}g^yYWYda-A*>M|bNuZLb<Dr$r7zC%&p94LC)c|0D8hW2vI<SD9B<
zuw1IO$X<AHNLcMj{SF!3o*01=8bva^h~7uuFF*VICE-h_lz#j*XFK}|%s#|YA9T*$
zJO!%W&$L9gqo_Qva(&$vqRm<}X;JVm<lf{}jcbARP|ykgfaqj*N?LSb*9KOcCxt`c
zSz1Ef?IjQ}9DW)wfEoz<&ny5T0Ww|rnRAI%k1tI;czE)yHPKcaMC_+K3!9q?%|t7)
z17xcMSmqQt!Ap;F{tlSRnqNc>95*sX-L~|N&?nZnWC;veRDzgdwV@DQPXChgPIXh3
z@yWkfApfqUm@-%KdbY{LuZ{hEYZ0{;y>E;3h9F4Jm~W14NT&WR8+nCbv2<WJc+^Mc
z&v=u<n+{1@Nz$JZyhqn>9>RpwA}2op$xn5hHu2aP&UQ;e{^e^`AkrcI!Bomu{a;U4
z>=c=wG^|ZGtku;nZDOD?9^&8U<4{jQP=xVhc?59x=gNIHjia}Ocu(EiKDAN1b&&oN
z1qNIGc0&`d7<9a9%(t?%krTxDWGk3FZw208BGw*Y4FE!9IF%Zd*UY2LD!aWl&|0A7
z)l%C9gwVI~@C<UW=UBKDS^RqN#>gsWEjmF=M!?*IC+kS^N3HdNSDG*52d=jSTQ4cE
zS9W(W5^h}W#TQV~+t+`4AL1rx2>Ywp7Mzds+0={Q!|r|6*y7-x)Vr-EX?9D%Zf4M2
zF(ny8K+=Vl2sX(Wvm-4BaKEOM7h`WghveB0d{)+8Kt@+lO~^X-<aIimW>a`If1b?A
zNEKmm)%rW&HFrx#lcGyr6C|+%`H7%@#7TT_**GLn_&u0;eumXSkt^$_YasYQF{MK)
z>E#ukNq~7~UoH6Kb1Zh}6mNtmxwJbTp3JCeOB}+LN%G*9FA>8``aliS6(?!++kEOl
zd@Zci2?)$h9#*XRLf6dFlRTJ~?4Lh9_(W|WRfw~dYSKXqPCd+`*?y|hDma(W8{nR1
zZ}nQcQknW;>A&HCHP|^g9j+fUp&B#{-o{7%@g8=igzbzca(fTz0k-wsp>L!K73ogx
z7c|fWepuLVPwAA_1t0z_@cy?a<NtTF4GfO@{<gt&uJhm4`tF2?Zj0N}A0v>bP6KyL
zh{SiE4-+&QV~gI+8yw;6(6?oM9X}zyT0J)ET@+xJC3K<5mFT@%^$zsMUB7HQqcMC^
zvpMoVP_n9Pb#nG7z8u$0i`;04T#ULVzm{&co6Ukss1vpJX%N38%m;Ub+1H>rH)Q_d
z#UeE-39ID9v>$O!JS+0uX!DXLVc8^VO`HE$yl@>3_G?+}5!ICD9zg-m8GD<I`ndJh
zXolir5te)M_5(F9Ir-&yTe(e7RVan<fsr+hT!%5tWNeyRUH|w@DISk}@;TtxyEcNJ
zP5m6sUj_UClanY~N1DY|8@@Yxc*m=5rtzdBBojF}<&Prj#I}%`C4&^ur5IoAG`lTy
z^`);Jj#R4u=v{r{5&bz7z*dI12<Hxkk@i^Z1~Z%_?>Kf<@VKZ(xiUoWBDzXNO-A<t
zN}}qctg#wGP&XeYGdIR|2cbON%G9WLN)NkwE(Q1rIlTdRG#n9-@a|7<Qz^4Uus0j=
zQS8MRhnuVQ5xdcFu3i1^<#H)`W4QPnGiDk^-88L1G^Pp4oncVNguxtZ)~nh(_@;54
zqb!Nzk&<n6&k#^S7|Rot=#BNxLS1TE+YGP;N7mTY!w#yq#FbV+Ie_yc`b7v!+OSS?
z^|o$_^5*NWpoXYA2?VvTu#e2K^&(YMsXR}t-{wyGFoiVBQ>ZhWfYC4!nqeF#9s?z9
zX=)|`IJ3a>A?nTdQ;J$8Ez(=141S9^ZAtIb2!Fq$O~(VievhIGb1iD#sa)#n`|B5!
z%bsTx^yliiirry?`cdS~yqg3+?JZH*??|q%y3X*Uo@(rC3Ar%s3-88fFC#*^g7LB_
zDn3I&?HfPLDR;ZqA%+jDDbJ0~8_DSb4!9EroLL8QmkUd(2^cjWQ=y+6W!OfglN~$a
zaw=p;ef0KW^~F=L7Lo$LRx?d`6_%>v-#@Aw54^DX#g$n~^??+_fbp1!+x%9g(9Vk=
zvj$KF<geY24td|FZZe}S$ARKD=dc786sL&BP@{9*%DBx(#}S6b3#c-SHI=g?wAy)s
zMEYE9MLbN3w|V!d(OjauY(G^MR9UX1-?r%gWW2n`4iIja1Xy!s1bk!J)cGg<mzt+t
zZc9yaH(la62)8Q4vM}_`O5kmd-kHCiKZ`M?i&msU^Kb=oVc7kV4BzqA1O>9$`%sIl
zlZYZP%KX(qSMj}T-P2i;Xtp8By}lOCJ}WgfukiySwlPUwlUSK{JN=|38IdFKukw12
zQ{95!ep{Kyi^n}=hL-q$-_+6h{_(I*e)2&0cex!%i4+)Lq1t%hL}5s&Y4WR2maq?C
zj<=lc*AA!wMv0@M0NaKuGPeC4TxV&#(eLL6Yzz`|FKr^2etU&Bg`|iviIte9|2=eH
z-JNL;6`b)pyl-CpoK#xg&^PmXG+*oSi!tR<;o5-{h*sBuisJb!>wFIja0@ZSV|RQ4
z(MY?q3%XI^ku8jj`6NZM{CoHQ00m=cft540gQ_Giia2R_iM49jK_!a=su!uS8i-R7
z>xf=f%8`&PPnd51nxgj=1^=e~>Cpa=12>TEoi3h3iPPLp@%rvyxYZfhy{p((O0#u@
zhky7CD)M$B7PpA&L!7<F-KRot*wMG}Cb&q%v2R7v$|QIVS~iGWvnJWGIt8A3B+3pf
z--Ujti8aCjOwHkP9A$d4GKrmm6>%96zkI$$`-F*z&%>4bli!9Ve{cUS%2ba4sr%_w
z_@u9XwKLc6h_YJF+pN|6=Xr(fKazJFz;Cny2_%%DGgSY$SaHA0pQ1%%TLz)0bTP_V
zyCbV?9Y-)<w{FZ$a7+ZfvzJRUtu5%CVfh?YgcB%h)!PV7;3*8`iL{ge5LzJ)+&4O#
zq6%{GTaBVRXD^C47|2Dwo<~SWCCX~9s~!vum6%HSB)Ewrq^ss4`#n3Kfn$pOL{KC2
zuW4=1th85TN+JEA2*<KKaiPYrL{khQ+7yS6tQti?${rp6w+gIAfe=4gEHk=}$n7?T
zagO)sQGbPpNz`4{ZH5*#<K}x@ocR<*rFJ^!-bj#l>b8N^m0IWnrea>L$5U2rHJN4{
zJvj8H@4%YJK(<b<<D*vqJ;?3^ie0?H?B#T>ID&b4yt@6Wz&fOs6u>S%Z|v4aM@*`K
zgXmhuaGtOZ9N?iXqP?4;e<`H$iCzPwfYzr_%+WBgf5}TtE?_B%T1OPah|`UUJ@9mf
zk!}G^ZzR6BLMs=Qq-JHc#RdH7sRAb2?aXb1t{!||g8E*(J=@QxR5WZwex`BK{Pb9A
zdsBwRlE-@#suMyU9HoILJa{b=iaY2!I^Wxa1`l`#{JIoc<$lnIc?~&cn|NL)_{Ku{
zKI~@I>4kV?PS}2T2y_XJq)Q|?%AJcPsvKkR-N=SiQg0;kFdxZ_6vXz}!1LUJ-bGF@
z{b}9{;B$-}kR9TY6fuRyegPp^4+*aib|PQ+DYo%}T<KGJVfJTxu*%e6rj?QjZc#)m
zIEU}uh;VM1aO>1|dtyadS^;yM_y1}84i%x+m5HJXc;9uw0TzDio)>0I<AB}4e3$gt
zh4w_sk5FRtA;;R)vR3f8$sIeg8o$E#HW1uZD_zsm)cw4(9h5#fM4*w;$=X?M#0+EQ
zqNkuLE%AwIDM|)xYhFtJu_=>BgMkv)yw0S#x#Ya7?V~zyYsh7Msx!=w_&)ekIjLhM
zXrI0ehr9^%0piVqC_pdDEWRd-Rh#z*LIAmTItu;q9^~_T!la$6zW%n&Q9dhE?^bvJ
zP!y4G*JlSt<(2qI>)768vm6n~p|;Fqon(fEh*t}p38C-3F4n-!;$+k~zN$q$0US$I
zU5m@It5-kxvlTqRWJ8N)*yFBM(qt-h$Rg&*ExB(M5pD7+lF5FSS!5SDwLke3Q>Idt
zYP@UER|aY|ELKwVHJD~m1^(m^f`<Y@|0qe~&DDo*M~%H)8NE(%1AWC0C(o0S1zaVH
z&nDeU@+&G1V)pWaxJE7gK&I5vG)h5Kk3%P$?~~K?MI<lv@E=0s5}+_~ldC>lJ#?fU
zhCQG6UGl|YzHq&H<rITDxhvC-Vq5<7eMhufc9f>+LX`qcJDgaD(aI7pqdNAJrw(gR
zW64^kx0WIAGAzMs$6ZO_z^Wrha>)qpxu#mxN+e&vOPDp6Rb!ItOzHX*qho6~3L1ng
zKJx009lwX;7_<P7h_$EY0R2+|xeQM?>YLP>Wh!TwHbLsFV%MjFHSw#DsgZvNBBR`5
zj&*QZ-9ZevKf4!5_p+xaj^VJrU}2?ayWDt=6SvA^Ln;<78f!|8-zAcSMW!JE81|;%
z&nRO_a5KouYeI-+`zNxHCbFTqMKV~n8|f?Gp7zi?n}XLm;yy70dXM1}{W|NPHezlZ
z_Pgh{;L9ypA0xJ@T8nxwQ$FR)iek3DC;BC|a&AR(zQB}p%~t=Z%A9R%QO9ez3i#S2
zeAss{g)0Cge*ZSHmenn-{=;_L@m_8MoxQipjuVj<1ntn0_wkH|%4WyLr63<f^CjXQ
z=1seb=CN3+V@SS=yCj!;fr?(7N&+n+5#LiRCa?weBP=FV2B8PO5qvtNxV>v%kbUge
zp|vGEKK@O|M0NXksZY5=_Pw3Eg5Ygxn}b9-M}@|sJ9rwa#r*!&eN!dCpwGcND1fi@
zqdAHY@UebNx4ui`hr;ne|A2bTWB2B(<-!pe(+y(L-dHbBm(GF!l^Dg$M(csgY@W0i
zMpOUZD%+xJu2Ge&kx>z&GyC2a?}37|FafflF1doJKQt2KWH!@ej^zx+u=6eaz3s3=
z{fk%2lSa8G#v=8L3N%J>CsR;iH*_EzGzu&(A|pVHyhg+*@V_n)qoG_hABy~6cW_Zo
zzKq;t(Li|T)1f%ru%=jb8BxrAAmqD$>qO+#Gq@}FYk~GSrS6J|Z21)6ZwJW!Dbn&}
zArbBA2hO3qy}O{<8#B4k71_MZX%q*7%kcXGYD^prV^I!2tAw)%*6a?)w)vUEz|*q$
z;f4dFda>CBY60Y|&a*4)WP^``*yMdT3j{7hc7WhHu9uIt?G|7sHq%u2O*NVY6|srl
zQu!DRlYf5jUs#>)Af^2c?PURmH!G12EXOG<fb6t%D*rEZMwr?^2KH48hYje0LR6G|
zTOu9o#gpm0ha;s441f3!lY*0RQ9|_h{7;P87d~PfHNwNnvM-_?emw^2hpsZuIFc{K
zd@T3~&$#qe^ZAQUr7S2*=Oy2P<ha_h#E)O^@9`;fghh}orv6D42M{v1f1)N}C`L#s
z&96L;^+hWu=<e)&6inF^vhG}7hD=E1&vVXJE=^4TZ)Zz;ZqjbJx6S@;425ypBo)2~
zequM`&XM?G7s3o#g860ErdB_LB`eq(nlV+Pid7i>)huZ227^jW3&7zNgsi1GVT9no
zq_Q0=gg74;YI7;MO-Amo!3emVLm}V-n%gR6nwmzc+FSA4rV>wJSK`Eojp)sX7wr>G
z;5ofz26<$G<wdRr1=~y*YJys^H|$PzJ40zV*kAJVft={Y^-2;lAi({^`p)bCXGM=l
z8R+|Ye~L@enwpKN><VrQjAPp;B<8m8x8I?3=dKJpt9EU}ge&%zk~tH0rFb~ik%c>(
z#lLH{$?KSXa6v_3U-n>`cuuVv`*$vd5RH-epT?CF^uo`AYfL!Das@EIbk$lgQxW`t
za*9G5l~ldmu5`-}#UuJ#oDol;wS2>%alDi8v$r77QNQBv@0<j#jmSu+hyuW)noB{U
z{m2B9n}(Z=kTuEjJ<J#v-y2=Rdz5x%@q7aZ&$e`#+;cB^YnR(9gI>ydf01YIo`)-T
zYCJ=fiD8u-PY1*WNE_s4g<Ma=+H%<J$iQNeLJ;b*(y~Hp+rU8U4zRWa-0JL`o%K63
zX_~Hka2(so5SgFS$yO*w*|P89(msbwGsTl<9-+C6yZK?%DAyO4ZLgq=fR5B>w<6qV
z*KF;DUFRr%R9k>r3sFx$a!wy=Bc9@4e=Gh{>lV_Y%j)bIpH(F)!0Erb8yQgoXVF02
zv2M{qie*e52p0>3;`CG`U0iMTEs3Sv{PCt&zSPzC=w9wdc8DEvX+Ca<k@|nF*#Eh>
z^K6;omm2#K3K9O#THo%4mM`-zQ=8TuE1Kpys*nz=5)Framiu845$h|9j=1C1{9WFQ
zBtp6Q&ui=I3V}RzDgUj|X(9Ws56kjs;I6xwYxh`01~<_313lg)TylY!)S8KBRyw(k
z`H21OCvXZ?1YB@Q-0&oD+X`PC;L#YZ0?|*P&{1ten87S|uy;q<yrCcj!l)#*Q%JBj
zQh*WCkCl7|%`-X=K(-%sXU`oaLnuke(%P0v*2JsFO(%F`8i+`|4knYh`ICdsLId<-
zhvEq)@_JSK-B1_oRT4>IxAUIndEHKwV~25uX30_CQEda;7>cC0rT1l7Xgdd}RiJse
zn>2Gy@XTCJ+H!t_WBAh=_Z}nRkS(FQoN7Ir=6As};UzQV&u<AE>qZ@I^sVw18AHna
z4K}whd$CJZQqHAugPR#?V3IYL^Wxk)in_G-Xqc_&Mh<pXby@SDZkucuNmN<FEj4Lq
z?Pg*2$f|}<pN)EM-!7#J-s81VNH)m6{Tg5J3MhkDj=T827>`T`PAi-=HJUJ=QH-bF
z6R{cYu{QU2WBFM={Kev@(UxP;*4^l)jLY{?UJj94b>Myp)GS`;*?W(74HHr{Vjl`8
z%6^l-cP!wlQ+XV5dr<+~sP{RDJf=h^2#=3WIxfGs>=W~r$j<C%K2jyvxSw1O5kLNX
zO8>q$i#`+LMKj;eNm7>=TShtH-at~(DwH<KlN4NeJK0KMR-5;o%dF$j(#IBX3pDH>
zu&!N&CaVS(x)6LZW>MuH6zFSC1@ks;!2%s}iNDw~P30C><LzSQ=vt;mRU(txh!-`#
zAzeNVxoQIJ=ic;Qf6LM2nCmC`lTArqskrjaPjIsD1vuhjmq(f<6u>0@;PXxKQW18T
zQovQj5srP*Pb%%IJ^QXPfQM{-!62rsullAmQ16~$kq!N85fgH@wh)6HVp+e&iF}e9
za~J}Dspa78x29K3!x!K<_(dDcKbwWFTqYNzfY_l48sjSCS=|ZMy(Jx5K<e{)W0Li*
z)5;0nWw-teaUuU@$fK<`T*yN<xK(D<i%-1d9N^PNeRo0J3V&{F^FzoLy$ZuOCzAtj
zpBJax0R0>WJ?979xa|X?Ajs=HKuEB$fn6^&0Oy>{S1#65_?vQPWtcXgrN5ts=z(24
zb<}CFc!(r!fT@VkE||tgM$a(d_N67jdI^!n$7dq<XEx?pk^8-0%?~T4m)j-bZ*d3k
z?H>%JzFa8gD43FfdVSJNCWdgT^iWl-gflvGEbj>0k6-zI7Id6%b<NbR+cmGCE(>#f
zjh)RNU)w}QQg-Chy`$}t7FLPe&#%(dUEv|q!9$$*=>ixh^P<9e+qziA^!B0s>SCPW
zTT|Ebz<?3kUFXn9S^<yCGtQ(`O{fq3R1-mOHfkig5nBmpN(`VkR4dm=kThycurX^4
z4fk2Ww{D5)`n=0B)|DKwzQpzvL|GG<2l}6I^FL|{SloknxE_*e<mCSYPTikZq33*B
zb&saccO<lFed0nbRoVN{6Ao{KlD9{o<uA-s%LEl^e41q{eTm&dQIRlM!Cwu~AwNS%
zKm>^ge9oOl69%MHAB|P2!1bsD_n3m_@I7u<i9V9E{9uxc(~p{m4Y7L9Rse-4^T(_C
z@wI|-;uGUZedg(7cXgSfgF@!7o4jD)ss|^?kgvjJax+80#35H#h=|mBHvG{b9Nj#t
zh7*x;cHmAj+?6k4_|&CbDvhx3AMA9}(-HY@#4P%K@|#keF@gw(`-trHxpbn{vf`?h
z!_+~ZnxLMs&n;zYq8PPB%db1RP6k?8CNAb9@ob7R?Bms^N&b1PMjV%LzWGL;yA?;+
z&718LK*Gz(hm{DQ<vl1CXC+TP=SM%|6iWq_!x{d}M9Q!TOz4nLx9ZnStgv5aXK>N`
zT~)(@T9_~5=$hNbfmVD^u&NYv>N*0^0Ul?pf_+&2s<8#i$Us#zFx`B+47hG60XjiG
zH6y-z_O@Xe_tU73+IA!-qA-0vhWV4Aehvx|fbw*RUKdk-Ax$}O=jAcl1S21>^$Q=-
z@#mnb3{hZlL_TxS?L?`oT`c7z_u~&*R$%k+#3J-h!_tZ5)iZu^v}sJ3b`T>8SS&4|
zNXfzqOmu3N7I#%uyC0!<h$-)mcuA6iO2YOD4qQUK1J46?l%jjmqL~kemGY-c%->oI
zxQo9ns=i4g8iUs<-gr<}%(`c(-tsCb=6>5GlnXrLx?kD>(T|1oAJ=xOz4-iPseR=a
z+_g-rAlDJSP7&=a9=$EpoS=DZ<ZT{kS@~l6;bqHHnBn$%U*+=IpjoAO7uEARq`Cm&
zRq<0)91RKJJw`ue6!AJB$WV^9Mzn)a1ZA8Qu}N3}Q>T5Bc%tl_<2yG`?onNegXB%#
zy25<Y^rXIu#pHZGA|ZN%2mx}29i*LAI|y&RS!jZM;GheZ3QS<9%y_Jrj@=VT97X?f
z5YS6DDV+Z01ASv&0N@;<%=No&PA!hAd1~|&R6j*SaCa>tEx^`weLur5JI~<1pgkl$
zBpnX1{=}v?2st*itNk|pOuSafKrMhd@~!=@IMu4&E6G68t7CX&OZnb*8&h5|4t#ow
zfT*upmewOysq7el6n(p%e*JaZV!LH#ac~d|EPRmFw5k#N6zDIVj0C3Owz7!hJWa_I
zf~1Fe!>->DhNo|3TJ~8b;@pXc-b?5x3ik)}QG<%MOou=I{JMQ%V4Z2XHJHD-g)k}1
zItM|@iUY_C7nP9H)pUOzgFdz@-`>~OqmqL*wmmeiGb;!ywkC2`Uf{dESUA3MaK5;2
z7nouw=04L5{QqMY|GCgT)hHsuBiMF_-h}%f_VH&7nkKM{8f*jUVRAiWXr8`J5{vgM
zJ1mH@|A%Vu)P9fUKf{lW%sy*9JUs5`(TKb*&KJ`YpNcp<*0&jGwc8=v*~nK@js#L2
zV8Dl{xdvKbNDR>_=`wIdr0>KY2rqqW(FuvElTNg&bl;grh=f+s6wwkv5J{C`YF_(o
zHj0(f3L(e*SwvQeFGqKvkV2lboOE(8LS@yXo*9SU<%Z?y-qi6i$EMHcA@}<IY)jNB
zL3cj;MeTFe6}!h9a_WCb)b;@Mg&zDB@|41pFSlk8xrwk+x!B=xZYhd7CL;aIFOzol
z&DV;@A*eymSztIF<0Mr1@AO)>7mZCQ^{#@MVo!IOf<HFgrceBXE#Xdv)Hn;y5f`d7
zNDDu+U~|OO=sZ~LO!vj5!q0lWcOC}=?5T>QBDdw3n#uh#LpDo)>dI<yy(`XGn7{gl
zDXAIMSIzJNZ^M!^aEQ4d-?KFo03^%%#$e!)hKr>|nVjs+DZ%!QRY)#Q?~W0gVDNk-
z&?^KE#()a@(>m_)lmtwzv`}$b?JJ?5F&B?Omp!(vo7NTGKq$2Uec2}c2&R9(9^*CB
zl)XtL{d=BjrIzg-HI!<RC?WOJvba2*ITLwFYJEN1f*8PQU~{{z9Y)`J1<QStYXot*
zYkWYWkf<4j(1=B3d_GK~DFyuwh?#qv+LslgtzUJ;IR#i*upk#unO?`ul>5l_D>`|J
z#=le$%YBgz<jZAImpAV_{*2Q_4>uKgbXM9*?cZr8Q^;t${QY>#{N-HNf#uWkLF(!?
zAr`y*o%Yf_Ag1Ve-cUG6&|k>?SC2MNm(^Uz?Ta#t=SugAvrw3xmvz>FSa?%-nUUku
z+!sSlo~k|E_<$tc)5rJ;VWIQ1lDR77cU>+Q=Lrq_i{=P?MFGkAUr3$crNADap2QJO
zf8Wfhn)IK;*EY@B#Sn6zQD&^h!~@Mc@3P-MrP_2yOgSiRGW~#c?TBGk^H1v<{31CK
z+iS{0@q+4~ZlcC^ZHB1;a@IwUqR=6#fV%2Q9v1nW5zo`KR4VXYtzo?3SKjolkgwvU
z4|VdR^Qpllr%f$z(C1McwHM{72P8gwcl#pgLI%5H_g5YJD?YJ}^<r}-1NtjxA4He!
zJ3d<EGY)@i+F%6dTWrN7ISERr7j**eh~+iM_#T94o-Gtdm6T5Mq9=2;-*^x1KEnJl
zn))S2q9nUmvKVtkI1BH}azixc@l-st_Cb8%=QJK5RlBLV&$onBVjU`hmlqph9Pih}
z#J@y??*k6C;@pQfsRx!<)WY0tGB$G#wX%<e`<ZJiPXgQ32(tgH?C=jTIJq&#d%JOA
z+x=@L{Z~!lrie0oe0u~lL`qWrC%_HVlq!+7`PW!`UuG4tSM^rh?&8AEzHr--M78cX
zjfC%qUeao{$}w4^^DoulPHK+`+24<c#lxRyg68UP5Qnh>{MriFn!-_J!4r%1YUn+o
z??W-DWfEj`?YBWMQQjzPZX#-h*QK%oMNZtURD|X}Llk!BOv}uk6n0GKS%1L8Y+t}2
zpW*pZNk_pQmsxVV^#~gG7)nA$_CvS|%a!;=dphwguCk>W@g|azyNM74B2Ak5@^r0^
z5i=e`h;mPF>CHm$zoeH~VH7H};8M?b#yBvir`8q7MaM1p5@0rLP&L5;knfj&w1p;>
z-~<)(K>{?3cV?^~QSIIjEtQsCF7eFPtDcwp?JC33;?oaXVV7oFGD0oWzM}xwK0NTJ
z0QZzq*V2YW5eX9Rfd-cl@~G6sTrGaTB$Bjh4%w%&ibWDo)Ilk-^MJ6jLiZ^}4$hYz
zF1%BhN>{0c?hZ9b%`kT_3dAcrbNMpeXA1Zw`NFsPcR&f~Rl81KlV^w9I6lbhXon(B
z3zQx=*ewl5zurT+_oMb=87c0RE6t@(QZ05glBO39;>gWQbEi|Am_DgdTBc6?3F!59
z`n(HbBnlcIvdN3GLItRb>GcHys2?LxM+Oi|gC_I3&#gc-{aJ^K;vcyXr;bdv_F(z6
zUDKVA91?0JrHEX~-~At$xa8m{JA$u^b8O1FM|HIc>P>o*IPh!AkXWK`<)0+^LjD>!
z<eu(s)Vj}SQkQo|xdpC&eCDIgV6&7tg9iAH#qB*ea0_%tmLmi}-ET7LE6s}$t05X9
z#G@Q4JiamcCcl7Nm9bLF`-?b!-T!KdMgchOK0JQa!-Je45=J~>n#JC)i+_`6vuLr9
zz4e&^*>~j6x4=iy^{WTV@c?O`(Zy`sAg1jT%~5V>JuK<h>N(xfgeKnyo!n;43mNc~
zWpBXR7DtVe7pd8wyvRzNMjnJGZ;Ma^x6j}sK$}iAJ2l<ErFFneM?#lAyqMO+hOp9?
zh|fG{f``ydoXVYi@-d!LoW>-37p&9+sCb`cy>`@+c6Vb-WM0NP@MV}9(yH)Qir7Vc
z;~LDL*OJvaNbjG@%^;>vW|71R!53Z$Gz)r;SR6loyLZkn96oeuv!;~*2N{?pq?pZ|
ztF3VLBD@TS(@;->-}&!%r>ghDk_hYE)M6cihNvqWk=6040^@%fWsSl7wam8C+3NU<
za%md5ZNaKdZreMb#^%1=Jxxooj_Q9Xkxb=NUW@<Ui5^8f7tkp}S<_uS1U<CB1z)f4
z)E*<Z;=8W67iAO55yi05Sn0}B3s;tVR^9r?DZGWqxayd*f!nYdtLxuV<YXwW7xud2
z8t}o<79M`jyIX{B?4S+J5f`;%LKl9@rnlSo<`lKTw%XX}!z2g%*Z%Ggf@2#QZ1}TP
z#N{ux%_g~6p`H<efh3WEfmw-x=Ws!yR(<TC@l;fSHv#f3eA!u6R_rkIizj}V7h}>3
zeF^ph(9k+laF-=3>vt_SR`zJK$<GQVk%0uC1^(VaR9G(pitxj`Rwn4-r)Yj1_<3tb
z20kU20dCn*Nk5Et-!l*7DMlyrg?$qyJ;DELW`UoZ{s+~AszxN75g^kb72W{zKNQ#t
zr26iv)KX8|-6tD$A`;9I^KbkOs3bCvY%jbgPnH{pCN<9?1e-Gr{A@*KwO#HeF|VJ5
z2B|ps@|F*rd|{H<48t%u9MkoWT*C3Fl&#n}9#h5&X$&{~Y6v3v^Y|H9CfUlGpN)^i
z=4hw)JJn*7cw-NUc~#7ZZhXQsuTqzo8BiY)_A0@R6pbIhluzE)0N2wj+h^Qu9;afp
zEeHhQXogDq!r7Nrm~nt#mK`<66%4Z^YV{@J9drG&uNte<%n=r`@i5UQ4Hwvx$;cSt
z-qC#Wg=qtmoCz_(q0<+C&i>=dPHN)KZ7is>?oFvPZr!Z2NR*?0G9lK_O3`pDL4Jwm
zRq{Cv?#tQ<SNrl}oPkSZM-?h82_MWv2xa?euxwNWy=uf&^GfTDZ17Y03d>!nUrgtt
zkE<#)S;HxvMfAmX+>OdrKrZ9G+>lfi$xF@esyy@9hzyidRfHu<?IZ#BIv#gi#EkT^
z2L>8~m4L;K;oUb8CW)t=_G^{fdaJyb1!*~}BlFJ@vp07pLU}T+3xF4oN8i|p%9JVt
zj=?*Z{B4uz4@-ED+?i9U{A?7=cD-dUe%W(Q4#m;Za{WviRx}Rw3n?@-d<HT(oB5w%
z`IfB`PBR6gEwuI4ksH-NN+B?P=qZii0}HQk%j@fUt3gV*$c-}i(|#=fepuH?4c?Ay
zXt7^(0qa@~&5h&;@f7+p56G>?8!M(-6*JtKJ$5=I1Eo!@ByF>ADttQy`z`W2Dt$06
zA6|0)lz_=pr4~cgq^_R!mck;Ui&2azj!fBzEmVRd39*0^?S*yYc?Cd+Y2^w}wT<-*
zo;SeKf<>8yiFJ|j);&4il$XOHG2Yj7h50tg!%q>%8ncTpK?gwJ(1(g&t`gB73pkyZ
z9+lRb<LgbgGDp@6b5-JaX|SrD%Pop09u*hX^_9b(AcwZS$fQcAc+tSl0&3~F6=!|Q
zGU^ixR;S-?e#cQe6+@n>SWHP6mO^IyIn^R)={`Mp5e}Rq0gr1#gT<np_6+UvE$Jpf
zhkT{R1MvFh%|c=g8L7u4#E$obojH#ONoI}}_r+5Zh!kuYY;Po}s5J!1;WzO^fCt=5
zE-_*)PlLAR@yj3MK1_$q6L}=&p8Qd)dv~q7ivhk^bs0q;WVB84+N_yR=zNcyn;?3k
zK1P9x&2aNgUR?TlsVzDwHc?+w&U;c1^teU7kw4#<{>izw#roH${4OP4FkYDUlKUJu
zz$1Q#Lmh>^5Zb;Nc3`+y%@lPTK!#t|lqImYZX)5X`3^;~ag9p4IAEolW*T?*T39_)
zOxbS@Kb6gZpT1;HSMcIbrZz_2W{Dnrz}9aiY^Z5{!ijpbusU|pe04zsk0E7B%=_@_
z1wVD}@dfl-c#z!Z&gOT*ih?IysRFB0SHzcRotN`C0${Ets_=U{u!_MlPuPuOTS-=W
zY+;)e-a?0Pn7mDjP31PWIei6>4V2Rs6L@!S0zX9-&KBDYK{_J(=idN?I!;8-{*P|s
z<flXKa0IN}DfL`vyFXX#!e9^==I#T;DEphUq!_SB@d_~laQOROOKkYS*4dz?XL`Wb
zIa0wP6M8|n>oDReH9_T_OtH_ICP_JZ7QeHI2Y>@6DS@on&uFT2+_YY=P*8Mx)7B(D
z&$KJDc+PiE8!%ssQdK*hs!N*=yb5XNk~95AO}2vUU~h+MjJudbLJ}a^ddZ!9m`9XZ
zP6A?tYU<m^;-z2Dc4DR8HQnp&|Ei+es^Lu3?C*|K>DKI@fHRDIqUf3MUxQWmG!8tu
z!Pe8$-c?I8*6;I>v=}slo@d?PLmyR$i*w@(T~npNnVnRkQ-z_B1jVfP9?#O|^tn6m
zuKq>-@rFx<nC^~P*OcpHbHXxKO8%8VkEKscToK1mi!34-t|67=a_WU4bx6I;A8cKO
z*j^JTnJWFc+&I>%jkG|LVm*N-lcpH;4MOMMn0nxBDvs8)q71eMS}X%@K2KX^pMpBg
zh01-U0A_QDS=3Jc<uxYl@YB2~(Y0dwKmeMGKdow00$$}jbSRXt$z**&b92I)!mN>R
z4#Y^0UuJ-n*8wY$5nGPCq=e$p@aadX2au)JQ%$fZ7E9h$J{1vj%FMlObPBlMl&fyn
zTz@JxoCwrhah0zUK?o7(x1bcv?i25!V&oe}wGfC0Mp-vHd^~Kx5DX}&i2pK1s$#$~
zSGEnD`Nh;XU4EmT9QDk;y6=u5d1KC?pA)!`#l32w`~*}j@lvT&thYq=!f)K4>3>(8
zc&DTHK8g|^_*gF~I6B8b);2mVp9TXWZn6YbPu;3F+C?+gjM*4-(uy+jeh3Y07Wx_G
z*2T$!yQX7MiWLUr2z+D?wj8(oINu-_pQ&keyRVxrnw7<HY>MHPm!H#~B1|W^FG46^
zZd_NJi^jTPDB>O~j-Eou*4L(~+Kwa1UPyKL-d!i!-%%q_VfZt#Z*FT$LtNx7Fa^#g
zy-haQ(~ZRpN@XpuZW}4h6#Gmqkn4+)^Gj5-YJ~A|%Y<kx<BoiHSF{|#f+k<)3(A5P
zVeAW*&T^Q=51JMqv>fA!#;R)#9EVT*sYyO(S}0+-FDw0U^a+`(7itC0*dN(L)bjQ7
z-fowVX|M%pCEv<+6x?nyk+v0cB`;`&+IPYciTVQ|6Uy(+96qx^46FLsqnok|4i*Zs
zq#Pm3c&z`3APBk}1|#HcFPLG99AiRJqHc$RsNgJI_6wmm@?fhIi!j6KR2z5!;ed$1
zd*a@UnczZG(?F`giy{{Z`l*}MvG<3=**Cs83&ETo>k{Bks&|t=9(jeGKin;z$k)s%
zmcd<le7VRrWvBF~WhW52Y}TadZ61DY$H!k>>+b$;`L9?00}HX=P>9`z$lmy`trvs;
z#6rEB4Mf#8-_PjM*}A?sWW@;M5oLX@0kZxLCZml14~~>@me0EM_VuPmhpL>`>h7q-
zNnb>3E$iM$Ud3w*vbNIU{>2+~whV;zw922C_h@F+-&9|3YQ%PI)xS!tJ@Be^&R_yL
zOyX_Gb*ljga|oA*Cx=?7K5f0LzPohnWcg`ojiQnnk(t5#sM|Em*r^AKk7RWUz3j<t
zU-u}1;Z0~re0KjxQb0_!pU}VdZ*j|-ID1&QYJa$W#jeC`M|*@iL|w>z{5v!dAZ=8p
z8RZefpKt*|B}sg`iCl3OY^PO^y)MUd$>4KF!w&V0Dx$daQx#qPD?f-o4reehJ2lhd
zKZ_xxm;jLar(OPM9QsS+f*Kuw;@0$W0Z~+hwOw{vlU*0Jmf9=ee&(&LNKDwQPOnL>
z`dfwS)v2`)@lk79g?DMp@N8C0o!uwUM;|gqD3g%0Qe;fhs=m^-SzfQm!T|xrT6u@K
z7oPT@5=T!FGH}P6;*cSSbc6a%35R<zCG-Twc7J)S7EL}O<MrL-tJ~atVJbqLJ*hK;
znagN|nC>u6sn1aQn1%2h54aVXbOddxX^69Jo<?;h90m_3-47v%E*swP=z>w|fnKp_
z;oMFpr3BO2j2?%<J@{6DKe9wL<XB6V@9@z12LTP`plAN?j~CFYw#USniGAQfAL9#8
z9^A<>%TpY;u)zdi+_Y>W(g9La^)-saY_9K16_OQ8grR%~5=jmcfgy4GtfC|2;|!d<
zpv6{4q9MuC7_U@gB`3N)WQvP`lvZ8rmLitzOCx0Ojt!Jgvk7RbSSWi;{R=<lGnrQ5
zDUv&r-(@&E)MB9+tvqmQfF*)Jr7Hm(d|=1?e`xy3u(rBr>C(^uCAbB5C?1^RF2&v5
z-61$B6m6k+DaD<Z;_d`@E$&cUi|d#7-uwN{kMrz(PM$q$&CHsuRLRH^JZ2TGba?#J
zSzA6flcQ)_yS_{7oIOoa&2OPi>I&6xNhtk+w{2+Fur>a`0Djf_A-fbBb6Q|KSzr=i
zo~UQ|0fCsL9$%cr{r2lKSUD`XwNU-j=H1!*{j$Q9dlEUiO@)&S!{2J^k$)JY&0s93
z(c+6a`sq?>iqjZhwl)!#WKKCPf7Q39Grx)P`sVnba-jhaeve9<9!A8o%&U)5so_qL
zE6m+|eRHU6<Ze%9;o1r;&Z_0_6mLmC>`=~(m~q6vEm6n9Jp~%jJ7l3$Rmxs_G-=c3
z=Y2X{XvZ<I_cV!n{Z3G=HCz}zLx15Bj+OxlN^s44ygCf2k{<32A$$>9&Gz{o5;F&+
z?xGv}u>Pqc6701#!5>v{XKtZGF8t?UJk~LHP5m4-^Yt;ooK!q#N&LL(Pql1Idg>px
zfh#anmLm2G;mFw7My}sG{lnQ*5|+JeiMMe%?H&slyh=yT2Dq6Y$&<lJuS<Oc9<<))
z6&}n51xF(y1{=5Q$nru8#bSxrI*4|gg+GN`vex;h{Zb3eTxmHI(+%BoCR#baz+0wp
zDRC7Sm{IY!;hN%@SZlf;Q;`1TA4Pl+=Uyn*mC@hR7;!3dc*(aySDBA#t3~R;g~5~l
zeT{?9;4PwE`d?p)pfhZHW+DEw#qB~ryNwwz$sPp#oZW|*fh<9BS?6{ewh8?TWdn@A
z6oPIuzl~5eFT*A73<-<3SQfoLd5w087w!_5`b&&#=&wt_sTSSv;2eiU{M?OVF&q7i
zX@ABnLdAtsV5d=<VCfgpY#6bXFkM|3q_T5E-wcWwW*1DJzh&LK3xP~5bFJt4p#Y9=
zYrx=FH&lQueG#wLWpVER7JA|c8#ytoQ}rGFBM216TpsU?v>T@K_k9C6mQ@vW$?v8C
zimI8ysqUBQfFuktU9`&?(PiWuoD=_v)l`+8=gu&H&EG?;*Ih`^Nf$O2Wif6&+|06v
zd73aSDUuf$SutVM6y`b63^}=Ty}jRo%29*5`meJeg&%w=L`RHDR0tN-gcP!=n)9l(
z_AaakHvE+EjDFMK1K4!x<|)Rw;1U|whk~QzBk0GZzsEMh0LBGSGli&edDtg-@?=>9
zOCrU_4DGL4c9-;Jb6@HU&_Tt-%e*r^oN#Z_QbHOp#b1H6Ze;4ods+!w$@<e^ZM$Ut
zB74J!6TL$@z~i17(|<3)#SF&KCU?ZOSLqc$>5BZ@6=l<2;XY<3%G{zy4O?~(D5A_g
z$&*LR+!pMsFqqu?oBtra9Jo(iq7}Bt%AZQ9B{Y$P+ZZxiqJ2PqWU4A?IvAEH)DndR
z7G#FFO$w{T3tofkr(IZ_JCYgH&3&p#E&{(JQ>g6t>>O9wJ9cbr%wSZqF_p1>iSO%G
z`eJa3y}Rh58)2OV=0n^6s3lM1M2|LBypb*~P}SRsxA{b=(bOBcL<5yAxWpxigprVF
zmVY6z$q|2$$}WhMXMjW+87?W9YAi9$BEQZykN<qS4@9EDd;BF!5XH_AmrA6*6cBq`
zV5iY4FofZQgKqL!=u0(ul$Msb`A0iFy07dt;vvk5=74zod%B_Ov&#yx0$B+Zy5F+j
z(diZ^PnLL0t0KE)!h&@^K13PS`Mv)$7|tL01~v0~CbG{E6qWU~Q(cj7TtA<SA6-hC
zJB1Uy_-&he!&o~a0I%rz&!M(fHOdGHInnZ1tb*vcu=}m900**UJG;Qlt-_=9FtKgr
zGV7n%wA-{OH<K5ZNb<yj$d#HE)D-&lQmr2rHTX7?IZ>GIXnz14-tuD_wC~WrizsEG
zI#y(=E|1+@V;z=TH?5kH5~%aSs92)fsW{z~8^r!~Q#7shkxz18V07)krz@kGEI26}
zP62K^)_PEz*|U+dDF6H-bA@c^AWs`T9@oP>M+qt6#-oqww>1^HaH@=n=abzM25rr_
zt>QH;baFxe6@eSJ0;0t%0ouSeFGUQOBrn2#&DQd@#7379hbr79ePvhWke`TLv^Iun
z8>8ms;79hmF@0B~Fr@coZrAqjIEq|}9n@Uq>0jCgH|WCiv_gFc<cvK#=jPE?yVLd>
zJS*4R)%5al4*Jp#Z?;->B(DHf>sEfzBR7|r#~j1m(@fQWh6VLq++Mya*GJq8o5w|C
z9<Aj%&yJ^1|FfGn!tS~<JJ*K!<PK;A28!)p#I$>!`hGZjEMyoxZo4j!tjOwcz!2Vz
zksEb&d;XE%uFS0!(A3N>vR``RdtK9%W;7iBz^2J>86u1{InxqJt5;6+<Ej8jU8>!q
zTT!=by85;zKp}T!T?H48LphL8C*D*J;rw#Az;i#o5yqFmSS=&pY1w=vMLb3O<D!n@
ze`=5aNOZ5M*RH6*tPm`&$B~;6Yd4{kjs9Z85AD|@f!`%AYq~(KwiveSZR`A?k<KD;
zFjH6*?4il>_PlE<p{~<$2h=eVJ7!V9_?>}Q&Hy;+93gQ@i4K-_<ghk-wS-0dnAM$w
z;NHma(8JjP6v`sl<+n5dWG$>)6DT|2bTR^VW7jLdppC&t^55BoQ!V}%hnex+UkKY-
zvo-e9V=&O@L%>HY=`1pvje+)t7>Ubp^V^N}7Gmj_J4>J}4@=*ME5|$P|2^PWR$hN`
zh+QdBQ&j#qr0Hnrf^LjR5@gY$Tu`K4S?@Qe>Q8yIuAt;vLxH2-%88||i}-sR3?3qk
z0ix)d&kIO<!q>v~3r>F~P2zMG<y^dxGv9I1o9A_DD%I2Y8<*z-GYy`B*t8kQ0k?X`
zB-ko5`GY@A%N&X=`p~fz2!Qp;X|ZgAyoW;*?8ecTf-BOT{hC-d%qO8IF0Zi!|7vZK
z+Zu3*)cNQaWyd*S*V82d0ucwdPexaBuj&@j|9p@B<=5)iR1JnFB|lWLNEy_$GM)lo
zRA~JUtlXJT)`INH?yAEQhbLCfMN^9IQS$ytRFcNu^H10;f(5aI%yMO0f38kaY$Vfl
ztHgd*AK{w3RNx)=>(RNvv2YZ6)2-ULq79;tWeQBlwu+K`ZGJ=0uX+YP_Wg+*l`<n0
zH%VUMB+36BSi*0Qow)@49q#pwOU)(Re?c@h$L{<vv`4wz=rrfulEasrEao*#B6;6i
za=I4pHaj)DI1(rqBDOJAD>?}HR_6Bo<COeq8Cg<(RM9B0rL11whL)9o49`#|BFy{?
zJJQqKG0AiZ^lgLpZ=6HroEmx_LMT=_QSecA<&#!X*NhPlp{TK6M5+az&j8IC`gVz-
zc-gd3J)OTPf_aL(&UTy${E;;EE@JWyvp6#;DJuhglOC;#@(}CD*kz3akn80C_t%I$
z+ajti`!fBFyqDb~iGJ~9F50~Tdv^g{^w4mRVDIiY*AsO;s$}fz++OPL4OT8xnWZ^&
zjb>>s;|L6UP@@K8I_oe6n_M(&7;KTjFp`d(vug&<u32N-rI2jP`AV7NPaSeggORD;
zW7IE3XsV}l{DO>0#x>A4cLeUAxR<0Aj1s<4Lg)V}RkN)#lw}#s@b_@RkX$!**U*H}
z+>gAwAS>LW=3NM@)b*Z0R`aLrp#&Qqhs0Bc14TEJxyIg$6h4PPhO_DANgMiFB84x=
zZQT2WL2wCwcVkBf){7;A+7Xx;^P2lhkH6N*l#s0yjyp=b6!_N%0SYM4o%J?&NgXXV
zWJpX>Ojb8-Wp+{t<#53%y#fc8KZZQbX3=o7dV?VR@?Qho?4x3++G}v97G_Mho)Tmi
z0y4OTH1sCibt507Bzv98&1f7ze3GqB)pb*`H-B(k9lxqD%hrue|ND~=@4&YC-k9(3
zU$dMY0hPs1NkL6+>XvLItb|iaW-I7M5}Wp!_5pIX;Ev)@k#Df+1~Knt!ywfUT-4&s
z@!%ehN2%qO&0-euT6M#FC(Jrir?zqfgZH6!3F%M0lw-)$*};mc_1SC_dj)Sqb4FKh
z@U6I=t#?ihaL_Co`Xp*5*0bRph_=z#9w|f3eI<o-FR%Iy@U>tvo8+dRX!Zd)K+{Ln
zX8pZE57m7y2pA<1hZG;ZMSS+_1Z@?-z!fz{(NFbVeUOTT7Cd9FpmO|Nj14$0Wd;fx
zh;?;_E_)K+s#Ji3@7B`c>%`9BV^^0tb8L}Qq_tUFZpg&u2jI0FSCoq1M+vV(GT<Wo
z<7DvNv3(CcV3cy1>W(mddEBbKf`IEx)QMOWi6uP?lxb5bg(};>!`;*Qtji67Sc1ar
zlFZhhM#4-@62;DTABd&HIH|lU0p@jy8;4Jlhz>l68Xhk;Q^;vaEG-84d|5cs@7sMR
zA57iT>1R(&Ww6@S=^hvbyB-k$P97tc8&?G09}XJjin@l%SIjLWTXYHR1>*MP{L%nM
zwH*@g5e?aRwO&%EZX7ZKArQ?l>rl6YE>$rd#le3sNW*9GO9uhqGd;0v7_0(|Gs=eD
z0vl~q;G*tXoHOc#!)E<aAKuBbpauLLNlGl8e`*%`N;piytI*7=RrE_L43?b=dl>Tq
zPKx0JcJ)O7+zz5$ot<a;pgdQmh!E2}cya@8*j~pj=i@`*DMIgfQHE6F>~26BQbixW
zHh~IE_h66;;H$h#OOUXtaKSi2QVPTxJ;eoN-FSk9slLN*{;&{(qjUzSnzgud<|XcG
z%+3C0e*y&(wIGV_S`NtLZr1l<@!W<0yM{};s_Tp!?!YUdY#X!Cdb8W}^0dU8!lgd$
z$C;K28?*IxoBw`lzJCv`jZ-~*di$p(?G0Cpt~bJqSLy-EI~}oE2iky3AH)Xmpj7EY
zR3!QXoni8jiFG^>j}mg29E3Z{$B%0901Tcq848U=8y4@MFa+_i@^iZZvc{<V&vZe{
zjUhwpOx@sjkpE*g0wr0o(|C&(p~$b*dDDdg^X6m;1o^zb{FIRrTDpc7p<H|GDzvlx
z2m>zVne1uFNdE^|7Cza5a_f!FqnFa}Lv9&}`Ab_W<@g7l1x}V^MzWka>wXClW!6t{
z?_qkP#wW_R8d$MFYZF>90w@-(Sgev1R&LkqQ=G*w#peud-v&bQp1dOzXAkW@Ply=9
zdLx??h0}kEpW)W+ZC`uxH#J6CJ{?z+Tcq_Sd-Hs~e;8b`VS?jFsbkKn=ZJri+Ji7{
zdvy2{7pe8fOt_zT(D^p@%d~aBm-yn}vf#k<Kih1Ey&*Z)Y0=*jr3=}mK-&=X_Ac@<
zL6q)?oEApJ!WE^CJs1OWg1_K2h|*nkFsV4E3iD*(GWPPwDiC$XJ^n1=7r*K;BB^hY
zjqC$9Id73i!{AIj8isHIbFJfc_VQLlb}Z>k^F{omYEp6zLyTzHHJ)~~NUZpFC|2p}
zyBoV<o4W|3%?WceU<>s;RF!k${NYSu^Nck|R?UW@foi>k+5evbw#l~Lo$0RvNxoRP
z4|0z!yvH`Npp;HKU1Z}*fy%^6P1eC<@mnJ-a=J}E6{=B3jd0UNpo?u#`u|*p&59+5
z<P2f2so~ae?iUvu3c4;v+fid1yQ=chV;&p6#u3W&K-tW%9;`B6!*z@n`_Yjr3c&zg
z;jn2YST@m4nG&Ug6uqiQSUh>KF84VM7df5w#p9~pcqpZ!=3*x8@xY~bh%ULxxJy&~
zRb6V8q=n#FR{la>_ypH^7K#F;Hw#aRf$3;NXq%>4I&rzddnM|CS;&`D%?rTbE)J?Q
z`-i~HKJRWb`<&O0|33?$Yhy?Nn7Gl6ukoLk`lQ_M^eyNJSNaW?Tm;$q74!L;HJ3bZ
zKzN4Dy80B(lSx~VBb%?lTvVgd$`M~%q{T<OtN`t*-J$VeH0HTnz;N3s3B91n89l@y
zA#^7`DT{sl6UIdg8W&NFl#yu$>`MGQ+e8L?(J)<Hm`AN2ZdQN|cR7=Cor-CVcKerI
z>2&>fZ}A18##&k=Q1%qZ5r;o%?)|5}*hoG>2KHaQRcF{_s|zHMX~gB|3mgegn}CZn
z?d(N3&6OkTXOJCNpHI$ehzlokqEQ*#Y)as*s_0zOXHibKf#~YrZd)<b!~HEYS%Wdf
zOp_bz&N*7?cb@)b_l$3(;insk@11eqPMZJu!?JmM7h6W6Tdrp6Zs)D?t?~00DHECH
z=Jz%l41^d^8NUqI*S3h8y1JdN@)UE~A&|1yx#;?*&np|auY38}axyB4>S}w*Z$%3=
zc~yPFylgyM^ALruSJj)m>mdJ83#v%I-WOYQBG+H&G4TWH_cYNyMLK_B?C(1AW%%rY
z6dlT-ZHIleXAjOWyU~@;GgHhI7T6-Yu#!sx;H0St_LsA--#OlEKOWz3M?FLNX|*|s
zH9JC)RH5i#RFO{gX=hL#5dq*>7cT$z0`7|+wrBbQLDbInoK5fRW7YoC2Vy~Ou-km7
z0h;13nG%<-AKlg3G1)imkwe8BKEj^Z%x=?*MFw6?dI90_9pc<)PyWJywca8~)Jaex
zlH@W%e8TQO`&Vd45=UJOPq+=Fr_D^I?7(!zJ*+fnj)fup1b~D6J?v%B;B$(7ln(Py
zAzEJF#FDGyeGmvj+!JatMc_E~$wKkZjwV>Z6v*jj^86W=ZiNwM_shEW9VPzbwhs`f
zbYel4?Dj1-zkRs#;R*zN>q)T}Pl4HX)-XgI{QQdvlxO}#9Q;^ia(|)?0@0C-h&w0t
zA5&XQ_+z2ETCG385wy1H2Vgs?e&5DvZv}nWUjGgE<Kcn$;Lj+D3|>Gk$_Q(1sDzg>
zFPw4P)&W>3f6@HXumBp7zeEK%#6#sLO@$JWHVaX(wbrcGTQfUfB}_pys_RIrB)K4!
z-hzPeBL-l5Ho_U8$qjzj2eoTxmD0iP$&sO!bwC`9fOEDv8OJYlNS6T}7VAwR==pVo
z<I4l$jiIouzpy=)Bm`<__Q`i-Zoi5>T^SW9c{Gf27dO%Gr1JM>0*s#%fl(5t+38TA
zbf-TRdMgxxf2NK?G%8ZTr8i5iXtEkCyhjJS>yv__r2&LJUUKMGKK&np5)Ut{tvlfZ
zX<)E>btY2UB%9`Hh&Ja0cKdfEEI?ux<vN%=4Mr6X3cuIuqH6xYh0Qq(&<;2g$rQa{
z0Y;U$LMm5ciK+5bEU`s+(qK5x6ly96J>frZwSWp+<=U?lM;=wSh7zkuKWWmH=26Pu
zC{<y&np<|_5lXk8Ks?~c#?dX7v%K{8huy{F-A`aenk|8qT~(<%znFEAx5#)^G9qyh
zhU?*!nmcIl<7$VwPs>l{4K%p1**|(zVj2tiX>}qcyPu6)M+QMzV=}i<e(G~nztk}V
zUG_ui8qo(f?$}K7Ny6bHuM(o&ezE7Weu6Q1<qb=Nea!zHRsLwR3q4^ulN1UHTb-Bv
z%|FVgZ*G8E+>G(blH8ipJ`WzOnT1P8iCLnPOm4)8<fvtiTy-F)j?ta6Ag?d)nk=-<
zx6gF((-^U3Q5?dk!M4093*_HyUcJ>1ry5D+_$?U1u4VmK`LE|HO7ZG$sghORu8*5{
z)8&~UChQE**`jX{I*CA9<(TFp+0Z0oDURnPwdCQ<cfRXrk<9jT&dp5rD`D;0+$pl1
zGoG+jiho2+ouRjRYsK0-_H|5(KkQKGzp{TXHnDwuR<uxhuU__52;V^Fo5uoeR6&`S
zf>QY^Jxk>Z<F<DK6Mbb8%e@BYYQ8HTqEmjk;@iJ%n%k>yyo-pg3K&`2r8e0@iQaGX
zX|lx1lktiZmlm2rwct1{zHxe|tUZ?awESwT@E=~0F_QV*33p&He{;-(#;g<<9an-0
zljI5VkF^jrmYIjwnyV?g6BKf+sY_`41!guq(q!@5l;x3zwdhu+)9B`M2f%Xt==4VF
zFeJvyT>E4tD??`zwt%;;_GxBKv>HcmjleNPrJOW|om-(47Bp{hfDdqUs+FnNTl;jR
zoM(^u?vHFmM}M@L8JJ{Dv|Kesra1D?9jLn3MgnwyNqkEtBP9^Q7yNpO!aPp1QqHwk
zym2@L?KeI9&`*z`(&cN4C&|bYZ$+)Yi^ToiCnI+;(WUNMEB7#xuPd-5-)AnROkMT&
z3{J3oFRE(W^tCG>4D(#1Qr}7uiXUcD9|=OIDdgb1n?J$3=T&!{Uh2@$iWRsN7p_wh
zzUrwDr^NfL5oV3-uREz<Nq16AV^OsWtF3>P75NCl`8#}}_*UBFAvDm8WR=nCsx=d$
zmu}4MGRke&5#{VykF|dDwPU4ePv?3@n9f!nt(~zXsYgNk-Ap(cbx1%WN31htV47XQ
zF~1A(bXI+KmOYRs<%gnI9@wn<oUA{fu$Mel&9`@#d(1Jq?yom;V?--d_?TW!s0rSK
zpOw^j$_VP`84fls-Xd9x-XWO`EkWNn<Y}R2Lyc>DLvk+tyS`C%N$%JcB>XwrH0@cn
zRJUS7pgS{_dh??`(MHQaRqD6sgT0iJPq{6EQUY(xeLjHZls|6j_7A&dY3>4%$q%1S
z*q8btyuq^5236WT#MfpWIvzr19lxYVQ#I&@ZQce4r#3#*Bx&*n|DS(Kz=f^yFCQ=}
z;tdG;`bTGIzgVw$i#TL`q4o#NXQB`U{Xh<Xg;OU6kVc4#qhBu}_x+q8F;Ru>`$rH^
zSc?QW9@zQH{LlwDgn%E-Q8D2k5|^}C!G5Hy0O@=UgjOlyE)ceth3FWjMb<@k!DpAc
zV^juD5CDaU9K=o0u-|SCU|bDU=w3LkA*TAiipbr?4GQ80tX%4HV~5m50^Sam7QXBR
z1G^mg0Tkel;a9Gm05bK@7CFK1LABGrZEmG!R^!}r=4spAc4>f<bHO8f&~sB$M6>t%
z4X8Fd+q$!63j`e6(D`*W{IG9@SP_P^ty!5%d~Vb~mv;a_I|zW;01DeX`3a`V5gDMm
zzvcy9M~!v9a_P+hO!ad`-IMmg_TVOaw%~U*5s`3qj{W`XDoyaa-=C`duA4xC#Tk79
zN26p_udhW0Iv-|yiND$VvI3rTQ#S%0AG?x3Mr*#DX1|qyH)~vH?QyWg<<n_>KV691
z%1g`!o^n!_b;zY4U~QZXAgwuhr0{Qd2Lxqki`~HMlVb+9OZm%}kN(02+?10N2jA=d
z<&H|s;+(|7z<wG8Oun=Zb>10_Qr+LQfdtC_t*1?|YE1pxC91H<dGa9w#Mzu!huRqo
zbauXi+f~$#6*G#oq$&a@LzHaS1w=>AhBLknzCx=1{GZT~*l+$93IsNDScg_DZ{q0L
za<98<ajiqgx!fo@Y;MP6*H#BYQC2dD!Cz0Qn+O6=!(Vit+n>Ww;qjC=tYLOO^kKFK
z=IhVY#C9hp@X#QB?#I9t>(ZauVRDDJUPJ-FtG#b;+!Rnec-YRig@Hwl3;k60xi;8A
zu7rRdO)O*6{MQ!PD|z7HZ<nF>PeR1f|1bbqdc~aU=Mr3BIsdQOB6B62IP(q_6gzA;
z6r@G&?$8U^Z)R-GR6~i^>W(~Mv9xu6L=PgQ3DgNZeBKktbDYWd4Bjn830diW{d>XE
zK52EwMB-DPOXTgal^5ij0B>r8DZIQ{Cx!JRiXrcZZ0oa4dUOmtnz%ZAJFl;fKU}Yc
z;S{D6jE)Y-zCUu7P!AOpe2@d>m#pw;#I%;8gpfUyQ~UOni<@)VH_y7AzDJiFsk1sI
ze3fM=!80;_(`|1DaP}(waCoD$-xgJod{u@t*vltpRa{2??;A+hOof+XLo00~(2fmf
zS`|Uiso3X_N`f1{9aQ6!|EgU(g8N4t<@(cr=nw*s9js*DrGX@?8T$K{UbGVb=kTR%
z744h9tU)IH%ejr8P&5y5l=s!Uq$3T~H51$OL}r`3s1fbtiO2ii!a1Q;&5M1WponYK
zVi#r7n<|60m{jz62_s3}_ebSU8E%G!CiNk_M|ajuMuO!2_H>(w0Rrg2rKhP5KTF_k
z!=H95aj3fCRT1BXbT4}g-gy&v6Vm+U@^{SQPrkt~Mvb}jWcc5^93$|}B{ug<g!4*+
zLDW%27Uzg*z(fXpg$kkNU5Qx{x!Mm-mb?edM?QN}bSVKFa?dn&?=Oi=k{{$HzY{?D
zQV#Vs^BxN5v`f1gjKm6FTDK_=RurJ3pU#eKpM17!t3{jBBqE2W^H<yk$=UQ7K3<J|
zs5U<T?ZM{|{mj0W4~)cmZe&lN0FpY+v>G!_G_n7RX2?_ITH1Jk5!=w{VGVz!mS{z&
zWhjK5`@7^7EWPPma?o(uV;mCRO{~OBmx>sBqmaBQ#MrI4e)l!p-{M|0HW(D66w^c&
z`$>j!H1b<~iqYh9mWU=6zdmM`){5I^W$3#53{=1TrlKw6xyD3DpIpz?<ZSK>5%sxi
z8~xT3<BOL^Zb0~*pj<e<;LuJlJLLxDteA8;)#7@63qyEj3Tt|)Wa{2q_oB@$zwclN
znjx0LyEQ}5^OQanYYp2Ho36=U(fG4Bp3|#LuA9&D0XoO+SWRSsSIZkavJLN1k<QTT
zRH(HMv-ECW1CIt-P8!M1%}G_lar%r0tOZ8wa^KDj<nA&0$oRL0TXF{M+;V@-qQ+Q>
zsm!MtEw^q64C|Qt;QET<VlaMNyJVv)5A&CRiGkH>9R@Bku7T8J)||m=rK458?48B8
z0Z9w2v_pg6@IPg8UTsDle}9Ds%_yR08_Ih{H`?}%-}QMUBWu<zF~dsnSEesfzMS6n
zjjpP7k1xQYB?MgZk9a-GkgANYrk@g4_4OT}%br=+IgLcs9@o}-&V3z=+T*$3sb1>O
z+KNVr+@mkdZM6!ThRCgQ5+c7&p!KNjND({Wh|d<x61ji;SIw{3cDu3Mz9+-D!k0$@
zh}(|b{oN4N;AOe_LCo~9&_b8%if%8{-gUv}a2esr4+*LiI{gw|+*w1x*3(w8wL}>;
zn8`STJUL_i4K&a4HBLY0zZo#-M^DnIihqR`nNHGw+O$}<-lF4*VNRn0#eXUlSVvVd
zWXdu{Vg5Hj+*mE%1qbC7SD2l7e%ctOv%r4Rfk0p30oDq2InQv<J3ebk{>d){b^JKT
z;)<g9)+H{yCR0nvKu)bv1;Uml1w$z~Y&!9*2>|!y+c0dvcozc@sDpucj5P_I5tF_q
zC<{zAPXe6WQ6z?D+oxanb=4uzY|Wg`vkvF&vhtfr-wK2pv;PHTdRm_ZmiBeAxIN!b
z4d#qQlKe9Yf(8bLZ#XhW+jso5!G1cyiO3Efk_6gteS^Q^)P!h6cLv9evC&6&k&p7$
zB*S)xS3qKO@vG(q{?$hL;Q5Lr;`V2@H=rNvQpCZ+OjtbHoR8_|9pWGme#Us`9U{$O
z@Rhm_=i}o)bYeX*$({)SJ!S62H7amYIf)qh68s4Ub7RX=ez7D37-s(6==>XhB{{wX
zZhW?ZU^ZvV*FxyXDxjUAZM(#0bO&7>!IXDGfUF1+uVB}9(~-`bzlPx3IKc$iO_MVy
zkUtI7v5eHS4m$y1<7Zp1i`c(~XHPxd+k*n-W<l8{_wt`N(r#uI5e%?kH0*X8qgG}I
zrKj^wf8;DWv0`~BlwA1iMpxEfe+MjW+L{f!nRx>O$vaUmzH9^m{rH&@+P@_yYo|)^
z<bF#?45Z3Crv|p4DgsZk!-!E5u~<Q6HtQ|6^jqRyL*4PaoZ>uf39tv9i>BIR`%n}E
zuCJs1IieJh&Ze)hExwEmyoeC4^e*le2(*(8AieDex@6(!bfZwQ3Pm*<47H>g?DR3%
zW*83L)$i{Nx{{RTs(T1Gej*b#=r4ZI<0(;)W5qs+(#Y}LW`%f@gzbghfS;qv@@06O
zT~!%A0A*8K$2z}wvv1{)LId6)@BeRuw^u~iIb*b`-77uK+we<vUS0px#OP|-W83p|
z&pLBpZt?=qHm3gyS%MkvUgCRr83JxKJ9IvXr*0ki)_7wsY;5nIk^_n^H9KZ^A5{As
zsOjfLdXde%Vb5Y*WC<3Vxz;2uS(9=3yhW4~_aW243+){T?_reCTZ-JYl0_<fj#82@
z2}?k7R7`1x*>`xTNmKoQnO9W?CnP6niT&K>x#$lJJz9Rdp)h(??11LrKOCm#m=te3
z)la~L)Z<(B8r)^|!{szhgy9Hh83#I@SHAg2@lS^9!M1K<US}X4Ov<7e&x>>q2I)t5
zvco$sA{<Wgq!JF1+`sPw=Q&$Oe<xn1uxR0}SS>e*<duG?xKU6T)MkvGB<aCVky+r@
zyeS(z!V5TGcAVI836ob|V|->esS`N4_kJ?4ckt(8m?ms8NjigREnBtQKm(fC$3If_
zA<q<OkeX|0ff;`I=xhR2$&pkkZa8uqaGrJAcK^H=PBX}2dC<<6N%J*WDCUd)GNJk*
zm%<|AZw|Q`f9%>2VV=Do7ccMUQU`+?>^T(P(l%z^HgBbw6UdVBp<ajx&BG$nA5&LX
zrXq(zxkvf@#ux{!a~|x3vjq|A>37tEig3%cz8Sq5*})Cej{Uwt^*-9~Ls5Bgu?*X<
zbETXJl@s^ouHR^~LC$&}r)SK`r8P-)XMWQ~!-$5f*!w%VqwhYmVN;<_DKJ~ywZ7Fq
zkw@LWt?d+_Te6Y&swBy+!7fVDU3sfrArYmKQY4m4gNCg<&D4OiHcSY97Vx9bMAM-3
zeHWy12{94Uv$aG#9&s5;u?h{J=2?n=Si9P``Kc;Vw^eCehWXb+OKy7K5)+9xi?rXH
zQU3tm^PvCAYuY-^`0|hsn7oWVPOJLD$LHLNn^m@Q@W3YzRa1XDDy)GJ{1RFi5!cIQ
z_(lqv)IXsSTE<ynznp!^i()4bYg%7L{!Kfyl)|5)L8r8lzG3Jty#J%9CRfZT-g66w
zqRfa&w#8}bh**GCQ!#qzn}S_lM>IjRNnsJ?{<nhNv5t8|(;vYU!qD$c<IliTSL+Mc
z1M8XmpVk@vh8)_^5FNbT%;@8egOmIC(=_<>_dtPv)QVtCK_QPh;k?M%o`>;P&=ptg
zHp|3m2`f>b@UJ`J+xt}eY0gg-`gI2@{uHPS1^Jo~wE-v7%44*^2JDW<l_$IL7xfjr
z?jDWjRUD#coxW0gZkKd`s@TEu)vlt?*XsukF(I_IQ+pM}v7B#A*FK5htOjgR;@^|+
zgvBs*$KuQ%G2QvP``Lo-Tct&xw)wMEbAmSf_p)B^_v<l~of15oy+g38@d5srb{$;A
zd>)Q}0#7<R9H-*V;>AT*U$-0w6Esj6AjUc5ZbR>5S1a7Ih6(CTKBCe&@Gj<&rzWjX
zWtIy4qId`VZ=Q}!Z~f<*8JRRlQFFH-ass`WE3hzS2NWXG<aKHU1Xiz%oQaSp{`;z>
zh_=y0-q{&tNF3||_lW4oGV&kiN_4c7TkCo?sm1wt`$tg$>$g~iWhZ=}0Vo_JI%-#b
zT+u&njZGiztjcP3yT8_yj+9qweS3b)UN=v_UwN=Y$sLGQN?S&}uc+W(_H-8s${Rm2
z>$L2Myu6F^8sgxgO$p7Q+f>&5JX3C9krTx838teIYdihoib<>8;XQ)fFD_L2X+2fx
z<w021qx>PVg)VGQaf=undixe$aYOD6g8uRXiM6+|t_Z^>pP#mkhC2f$5m0=scw&2d
zNN(&<3Chz(NJ~a*&`iYA{zpiHs9{qz=O=Cy|Ka7lZHq&+MA7Mk-zGE=l9mD_v1%Xw
zpTZOnVvGHBKnltRr?c-mz!5~NAmF%>aJ*~q-}62fFk#}eS7AZCs!2!&I+*HD5t?ss
z8tf*PFfX{er@Is+u=>CW04!zOtUt|4A$Y*YC4mUamqHFM!A?zwCRY>PzO#S2%ZM&3
zWKg{IO#D=&=j_U5(53)aIq1vZLY6lGiY7{{S{0G&Lj+_TU2J(CDT8O$u@RGzsD2+=
zBh6tf^jna(0z>*0+Mn}VC(S(n4<BBmJ0+b)S-*Zmy6Ew)*H!m)z#5k}^9bto3FtAy
znzQ-#baV!Fv45J7oYAAf@7PCLpgNjQl(p{n9N{g1IlYJ6YH4LghD!XKx<gcFGP)0)
zb)C*h#NpnnTKzavNR5?z=fCj991`#QQeLJnM{3&mn$jFCzS1DUkbHuf&?}q+=_IB}
znDYlC<ip(oq`kK?{R^#a{$iVk!-h5)zaVE{3*KN%m21X7`Ri>r;li-g8Q?p_t$StP
zO0>PiB^j3JvXH2|2@L9`@U)vi(f<^WE0<W0K(RmazW!&n-5Piua8z@xio090lJ7GL
zzpLnuaLd|7rM?SRFEqswcB3&Tl>z)guk%Wy6+YOM2h?U{sjD($GuS72Cr!`Mn8p=y
zN9MJkSiEWPE0&d{kqi_Y<1IE*LoR|8`|cm}UAN2r_8NX88lHg7_#PX7TWsHZl%kO^
zJ*}uAA(x&RPFO-QEzXm|Kgl)`X=^*DSw}}RP#dHr2743B=-fn^ZRCJ^=5|o|S3Xg`
zub$GyeYQyUqnW0G?IAv1++oRBSB<V6>GAf_qm%9}wlnuTR!*7aSsYSAs?GtCytjub
z4mAr_Zox}xYZJTc_M;{u^?Uj*L|SF^sf$EELeAaKoN|61WuP4u?@OhHD6wC=HN*Yh
ze*B<8KwF(nZq^Fz-Su{ih7BdS3Cpbc7{Yp<Eg0K`h^apr{`5bz#^=piSyJi^{VMaR
zERya!d;f&IR5IDSL|GB%OJJSx)`Nh4)f8{ekK;0pK5WU`s0J4OdcT7i&u#e>`e?*U
zDsjTdiy5tbNBvc`<PEz|F{$X7<1ry|U1P{_iPC`7n!XZLp3|yBGW&NxPAiU2&XBYp
zb+D!<0*+`2#ra#h$<vA^-X~i|`^O05NH*$~`(S*;ln47$qs4!>sU7e*<M|~Q>xAAS
zsHdYw6=??xk1GAvbEg{8<h_B5pJ&I!@NiO@e8}eCkAC(|Mw6NG1;2kXi|RITyq0)y
zH<E@hI=k;5MqIQ@-{w2NFO8TRf=^(Pd;P4b`{T`q821-`z{#=cr`?W$^b~G90Ou!9
zO38}QF~W5@AA@-S`&vh#$moUlv+g}()<o$=*2?Y6`~&W7S*+uW=jrIEtYkN4wbknq
zC+1q_6QwdcL;Kl3g$6;G<vJ}f<;EtnT*A)fr}ncqf#u}}5gm(vL=AEm4G64C4EXUY
z8*{{g3*A|jR;nj20WK#TVvYQ1%i`U*!MhI%Hh)Lb??z9T*SI>;xoxGxI;AV$|FIHk
zE!L(akZ7)9%J83mt>D#s_xl!!+0?9v@?q-}>ec76C5p~MR)$hl{r2A*Id9Ev)$=7g
zdaxfIO!zmxxA?*SUhCG2B)4A!+a9)8{;gL<yFhIYtmof<HX(>55Kh*hoFlrT(`;KP
z^bySabWp5VEJZPsX44N3dN!mr6**0z*Q2u7Z9QY`6{-zH%Gke1g%iapCj2I3RO*no
zu%)OOnKf|nedGTaUcCR#6Z3Y|13xp2t{vIv=myoQgu0etj7m~$>AOekLNxnqnDxg1
z`7jIJvIs+^A4|pCYdn*de?RVSJN$^W`#$w<g69+3qjlr+$l$?@oSB`2>zW_Ry?59i
z-o4cTO7cW#-vfTFhW0nvaMwjF>$Xq1F=cA*A`ZfX2J-cNxSxaGNP;T$3AgMI^fK7y
z&DO!0I|Sm*KTCC-eV5#Sgp@tlEQ=7K<Ca?a?^}HSH&_uvs-;UQQX)3dIhhMfqNn}m
zNgjX;h>)%!mOg#Xgk30PJw_-36NpYUK+qgKkbqqxNVp-D3tJG_*Hz3lfz|w7q8y*a
zrk3HHcT_yY85O=8FBH8|ycBCp4-Sw=vnZe+f8|ENsbQaJR&XISbPlp8W&sjj^dX@z
zrEa(?KgWLpeSTRRKf{<1#;PS^x6SGBMJRUW?q0RL$HOn#2u4gOdt*5JK>%Q=t!jJw
zxUF|*1Z<2hH?RZ{L)HyPUxmn$4fbkGwTO<sde{*tS%!R7X%^+T2MP(8Q*|T_{43b=
z>mI2!IK6+lU<LZf3dC-tJ*$LEc!inYKEm0q(MH98I*}doog#OI(xwPZxFQ;{-jmq;
zm5Z#<hC=8ONkqR)NS9;$_@b>-tF;+5VuW?=mH!*gzz=+9q-UN$VRMoQjx=_@5y<t~
z_CeI-2i}j9-<`fZj(?DKtmfSmv5ge&wX^mW*z}WsGXK0mRKX82N=t;1lVn?(^+)^z
zw=ZMGfcM(_2X~8O{cg;!@<FBtZ$_y+xWS@g#=jic5%Ob-dCrR+7%bNjw=w)h8zi=3
z;?zwBJRRv2xSW|i?FzekfT9~<w#kMOdPxKH$#F2P;>j>reL-F`cpG$so#D=8a$|nT
zY#2P)c2pDMw)U^q%mmfvt{yaPwXWPmwx5$hLr3wz?_Mb;R`X%|{#X4Oe*OiMI_5dZ
zR#7N~Q)=aQxOa+>I4ZscbXUp9wWHc%$!i)R^<6_6s96qcQqkGQfQk}wAg<J%dL+RL
zSzcy_QK7w#v|=6JBkbn4kut`*?oUBt&!}1{<vijd$fHE<ACD%i%SGrN1^e8Yw+XA1
z2UYinA#Tz)$gF6oYii9)T45a#J~1ouScQ@cQj;4}AH4>;Lx=DO)JY=b%RaQO$R1XY
zzFGl~S7_l3%&|ZPQk+qNzr+ENRY0FE&^ZQrm~@*GTlyRL8)iafd7>fJ=7SH*D9y!=
z-JUhD`iucG5YKSPw>wQvaoMYC6Ehk9OxI!oK|3vlf4wztk9kj<3v(|cN4*Pew!NIl
zi$odOy}xqPEnsij<x`v>b0RUe8MbaM`p0QcKOAHhwqE#0k=krb{&_SWE0;u?TU=XE
zv^GyWya0ie_Z{5HJ*urZ5P}9huBx=L7$T2RVYK&(t|4la8}~i;Qlec(gQRXV`D;IG
z=UvV2q-(x!xx9`)#>v95pLF+}Z+`b)8S!a+;XgZC4`%78C)6SF__~N0EEkQTWr`Qr
zQS0os`0MAW3p!k@&hUs?xttu>CD@F-W6ZH&cz;({uW<2nlc>kSf()``zx(!w_7~yR
zZ<0qLj)T*Bl8?D4&pgkG1InMeO22TwQGLY9(#r9t3d&+n2qWh}B`&zP=W85w%E`j5
zm31j%ka99}IK=*qV`xO%EifqE7*CbAp0Abrl50@UCk8<Gt!8t*%b=C~_^nX}$=eeH
zYS@}>-9JwTP_-k>!sLE$8yxHi7B3&G*G|Ov>(NV?yf*t2h1Vbt=KvhQbxR#Qh5q)P
z6owfJawhG0dvV}$u5`uHPTc9~_)ksiENiUDAk?*XwsDfKgiuUr07i2}8Adyc(MsN;
zK6wR`;jEaJKI~MDds|n(gi&*-N~R3l-5;yClZNdAS-R|dABCz1K9V?ty6B=otnyH(
z+~)R0khwl*l_)ZmK|kKmYPv4zt(UlGe>BPIjX4b|9OoAKmU~w|d&kqo<lXkSynQrB
zPHK;6o}fnTM?~#6@9BJcX}mp_%Q_Wdr>07atnPo&d=!S9wEto@61<0g%SfvZar`lc
z@)LrhSFM94#2mdKdg2eH+T}ZyI=U{)&nTDqTHb31%Oi(3bh?LWwrs4ul4JosN9ZM|
ze&_XN-oor^)=e*y&lT9Xdr}1xk+rKl4iu`ldc0J=zsUdQF=?P=t5^8UljE+jmDzi2
zNnU^E-K@8C-(ceHcqA}_nv-g)E1C5xFm~q)*oliJSHOIYnj9({<B#REcA>dzDwChF
zx8>JEHbrgv+TSV4yeo+AOUK<jL;@Bu7<Ri!;;ZYyHj>WsjVYI?=O#%|#x1TV>pHSv
z*L$4k{a-A`?;!%m_iz+BR30}iyPRxmtKhQ13l|c=)@@?fS_MdW9AQ`}j)9S@Qkbm=
z_>qM{W7%byNBb^5FO)EDa6%TsvUEGS>70xP&l>g4N{Qt&1Rg)L$Iv(Y`^!kYCx#~b
zko8Y+@s~s*tbBkRzLwZ}KnDbAA`}=3S>NpbPRSHcV!azjFu90>|6LkJ*j{Q4@z)GW
zlrK$j5|<Nopw@nJ+8~ji1c1BfwY8+c4V;Ly18AcS9c|rm$Eda0qdht;Z%D}dd{=Yq
z(b)Va0~1Ezq%H#K{L<5i8=vtgOk5wh!uDk0b{*|fY?E`GT&^-N=^)eg&ESaaF`oK3
zaOuo_tHvI0>CB^*N~(=npg(mO(HQrcF<%;NK-qob0u5Av$Fm*`#3koRz8do7Q3^hJ
zw8R0dF@En7DuM_au=E+HK`J56Lit}VFrO47Kb?O^YH|m$g7x}BZKvgk!D6#){;l7^
z`}TEKRAT!<>nG-Fk7n#a=RTf)P=SPJg0f}MDg>BLy8}&z#a)W{q0KRZ(QE&U1YU`P
zA005Ofz_B7=O5wTYJep&z_>1AIQ!szM8{5>?g~37`dCs@#mwwyHa&=g$dI!a4eObM
zdp$9L0X~jp0Rz6qKqlEcApw`kE=}T6+HrXq*zp5J-(4Gk$WN8*Q!>S_?ocW(Gy@}w
zmiqz7A}RdsnfBx`BY%t@r?7cuvs<~tZM;+r5uX@eDR)0b;y}H2Rqe~cH_|Vaq`kY1
z-{SUOBb(}I><EEf8b!G*I`<f6*6=mwv6YL?<92_J72+`w?Z=P!598e`YB!b0`4_sJ
zTM<rkJ~KO}`NSH;ps`ZLB^;vM-J6(eMtbq??1^-+h^Q7zX;WXRdCjZO6U<i;j(TX{
zA5YEvl~E2^fnWqGP3%(HnPC-2Os8WAh;!v2a6$=y;Zw6*GM}`AzR3YTJCLxHDPKoB
zv2-q%sPU5DG;k>-<CLCL%(`7w@moKst=LafWB+?#o!N!_yKy2I>Fx|FO%1C|vQG;z
z?4X9CFX|Tbx=*Vo-!CBDitZP4G|W|EV~H<$O|4{SE5b*sNFq|}JMjfuiFVj)JX1qf
zN2~wL=vdI`i#pq;eLR(i+i`!Iw{)ogm|NOBq?8$VDv2`w19e(m8ox1X0CY%}^wTe|
zLuH2MazK$kJu~m^NEEYI`eo#hb>V`&Iu&vfI2t1L7L679RXUC19bW|vS;fWWwaDdH
zBjqo&>aUuFWvczrJlN80_6BM9zWgbVpvF+r6|HW>Vpa0Nk;VTV5?QO4|A*=epMgOX
zB$}oyelHSlRkAKmC4Y^2?%-|gCwBC?#kXi-iNtIX#af>~Vm|d{RPWe?n<E)Je8^Rv
zDT!gN_s`Hj0#f-_znct8ppXiZt^&eoW3R2mN$px32+wkCp6L-oAd&-b=icGHX@ev<
zv7qi_Cxq^LIxI9lka4PPQyK4F7!A~G)2x##lL+*7)s%~iO?NX^^aN4)Mll>ZQZ3a(
zJINL41<@b<G#fH*@IfkU?}Jrgw+b>>z1}^#ml=(se%*h0gygS?qL610JM0XgaWI40
z;&>=TiorTU1+@VSZ%@r((M;S{-D=@N%Zu%fMt>vs<nmp~ghV~>tOiSn%~15SzK!$x
z0Z(z&Ssao%X06J4Pq+fdzbO}O8no_uhmh>q+LFPi*UqjyMS&@I4;P$Gp%AA;$wT`L
z``5nd@Q-pRVLZ(0>IpPHou4avdJ1|E=O24kVP{O=Kj^acAY!I_f3;-812lpsjekSf
zEe%1Q`>*VY+JXFFEt>t}2hCZXyU9i@%EkPG&JXI(E95`2$0SP5`eRm9@AX`o8|_J4
zH=5RqR${kO=9S70*wD#uB=VSPBuSnqXRH4ycr5H1$09kRUBw@^g?f0)(<_p<@{qn7
zcYXg{b-RCFE2OJT(R(9NE9^hl<=DUBUqGT;VQIIV@fx^^`A__SR=Z*W?%ZZA`g<$Y
z_jNyaj$~#r9C5qrY!x%<#g={JXOhce?|Mw?phmSXOR`H=dA!!GugdtZ{4nezJ6LcZ
zsZbFG=6WcP+7>unw@MolcxRpp8xLNxushg!d@uq|Zf$|rS1C{kRtML2u>ent>7dqK
z0)Ra{RBBiZ67`IW64Jn3lH$({OpNw#`^*eK(=a9)gGR&7n{J39ji2HCt@ll4w^<5F
zhOa~JU-jMWBFqHOIDnN%O0KA2u$)Kf<vvX>l~O6B5_`h|=))8614(E?5!^Zn+Um4#
zvgq6jAO>v3e}}o31c0S$7`V=sv&QURpyeI5-Rm<ycWreciyYZ1-Al|O#?QH%V&%kV
z>&*$U)15Tpc?FxdVSBJ>kcruVfL8g@t-FZoxAfFy7vis?LYO3%^uVd5t$|t+hqmBE
z5RoslY~L()v_Q#)e}w7p-5>-W0E=a&!NI|^Pl)PWyUJ1*V%8Y~2-_?BN&Fln+LkQg
z6=!WG_v%}54b0y*X}9YJZLrhRi2{I$j|)7>+SniH2Ma&L<(*{`Yzi(a)_SSFH)bY$
z&t}>c=<oce08w+WCM)mtWdSDg#pwh$%ge2$4PNn}%X0Dawkbn2?goGSIc$$z4MRD+
zQ4p=53bL}JIvlY3J2^LGV}TuH&J4hlMR@0PyE&tPZ-JBi#atQ~%5At9?lkvb%-2Qq
zyQ%JdN6!jALMFs>VB&xkU4UP8uhbUW!A!O0<YaJv>+|B_gP^g37}iFZKwdt(Ahu!k
zK*yBngt;llYFU8i`^n<bw##*6Xcn_VHrGShyMd)NAc=hobiw?iQbjT2Pl%b-cbTW|
zpcK2opUGEox<=!3q$5NSwW>btp9x9p>6%yA<})|+-|7BQFD?-@R!SFS;EhyoCh7H0
zW2YCcqeZR{67063135Xv2%3kYfw6}sDF{QB;f~7EQHk+^p9aQNWMJJ0Ukc&ov}6p*
zC82V8&M58^zTXc*5<v7v9=Cb59>VXOgrhjO8J{}nVh&bkN+w*=>Gg`bT=LoCFycwP
z08`XxSU&P7_;%}VQ{p4MYGGXh8G$OIA0r4R7aR;ZS2gn{cuJv)^byh;@A@$&F}{mf
zv*Y;*zxrztQ}JN@IwHrq@x;6BCr5vbv(pL*Wy7rbX7oOw(4Y+wXfbK>z|U*T>oDy+
z2k!E0#_m6Lt-v*PZ+tO|vjp7dXW=Wo<U|JY>tuzy6$g+9gj%amljgFb0ZjZp-8CqH
zOMIl9(5BLHL80UUgeCcPo0iW{i6s&)^%-Qp#h43d5|S5WJ!a{(Z6am*$R6G2F*la2
zN4oOu{Sty?x;+yHW`7nHarFGf6H&XVNOj>)a&ISUl6+tQ()2fy8IU6KAN4{CUTh%d
zb#9Cd4PQ+cY5ooFlU7=V5qV)h7f<0%TG3!qNa%FXtZh1VC_Ar)6bhE{DGU&sQ4`H)
z#3*sRCz*RuHfXH!Ic-(aFkHr5a)OwM4||ahaqv5CodzU(-#TXVFcFd*M>Lo0l-sLL
zyvKcF)E74o1EraZU}Ka#IKFyzVI!^j;mX)()Mcp9Vea6yEG}cUbt$$Xt5*LB>09(g
zom5LDr;RIGd`B*?O<NJZzr_0%%;o7Bs--oWj}KXgSS4X3^2`0?tOQZMYJlQc`<tK_
zgS_2kHnFF%&)4XYTDR!~xJsX+U0rWaG|~KKgPZWt7OJV<=_U_sKuxtWc1~9Csac_F
zJ3jQc>bG3|XT8DGYD@7nZw>47U%<1imIDSWp<Y~U*r!o~Urtqoq&MS=OFru@!!uQA
zM-WNsiFV)Kb9-T~=2QOEJ$Xa{VZAV&1gwjhe^r+?csj}hh&gfa8pYL?^mnb-)^1&8
zO7gGXJL8j5kLhM-Nhi~L$Yi;oc@o?&&2Wvq2;Dz^Mz61ixm|g;aJqkIa-I%j8DIbB
z^}wFMR--k5n<-=I9TyhqHCuh<S2*vSK`=>s;#X(*Puel*Gxa5EL=1aGPmRBNVD6;#
zJ!|)7rj+cCMOV0=ZXXuqT%wytmmfFYXw&sq-ovltP{vt-=eGHL?>1@!rJqV|n};Rp
zo5OJv8J@S<oESPmTGD4%?mtE)1U`VO^dmapZddYxPNfFUy%t0~<F!1T<CmpJvCknt
zP87o>-0<yQtqo!pGr<`i^J&k#tnC7BYvBzmKY0O9M5A^cnJj5sflYXh#KHazR%XLH
zC}7c-&nU#;>b7W@dpsCE?)rsEcmsbjhWN7%JFoG`*))I`;3yOUncy%O7VnVYqO#o*
zw_3Y|a7I11BZ7yzp}>fmdTx#3IB<VxZlM{KX^({J${>~h4j*t5?gz?4X9KLMCL-M-
z%OE0A@I(eXUl$vK!4(76o$Hqj5sWMUkE^c?ieuZ_4onyb3~qxv1Rvbpg9Z06I6;Fu
zgy0e^!GpWI2AAL#+}+(B-Z|&q`~CQSc6CkHR9El4_Ihlce;NM_TE!^a<f=93S~R{-
znI!->q>OcZ3t|HLt}=pK001f{LI=$5#8n3oz$WJ@$9Vf+kX2$!+1R=pX@`<+RIKii
zTLeL)9hPGe)??p5Jm{w)n3i^SgCNCs6AOr!plAdYq>T`}T4V&Dak3B$<e|UTPJ;AL
z6DEAUwA9(W`Pja%7jS$>)rwT#kB0f7dSHA0utT{2%Zvm_q#?uT?lW7O{}q?RIB-a`
zi4w^AWPEoqP-WHGA^wO6J{$gZ7US8i08E!~picbOc?S!3PwOFXmSJD}T>H)c@f`LV
zZfDt{XJjV1Gm|>MvJzkxSD4MUmDd|^MMP!+U2ijjiWL}tWPCTwj{N;)&k`mXS2ADo
zeGjV{v{9PpwV0>EFNr1{Q4Ht4ywZ@3X9IlrRWGG!PWH3(Do~_yuh{Y51A<7`-uFWg
z)$Idfs`!jy`dQGIA0W9O+bx$z#kXtG8OF}frvbjgRGCx?fOV?URvASn3qtAQtfJY@
z-Y-;NnS=Br5T`_*qSi}xbI(DgZ!!itD90iQeB0V0I@7nna0rC|oH{ZetuuAY>d5rc
zc?k@C<GKQyn=038ut@W4_C50UzJ@G~BO(a5L}O>9`pB_?h^I0{?q>^(WXO+4XMvmD
z+8N$>=(R`exHUXaZ~~r<tjaIO^!1VxA_v#Y?B1e#R!v0Lh(EJ{>NSNb<Egq8uc7F2
z;FRKr$H_!nO}21ZP~{=B`31bY{zOTS+qrK9QA+k4+mxOZhNs?R=f|!f1h5RBN4lI6
zevW!QMRTXGG_EM!Y`Bvddf=J9*gRpnxcXc$HCXN^0!M<)8S<amPWRcd(zv-ey{<|r
zTc`MiT{0FnaAfg{rNZ1+sDERInrsj%k&u*`$SgG0di}ByYc^*dhYUkD79H#lM9Tvz
zC^-806u29_7IZ&bDyx$zD}qy$tlN!@2IEKYYv9BUyYA$Wk7tIKuv*6IlU|*otxY3n
zm#HyN=z4{KdQ%n_i+vVVV%8j7O;dy7*}a$MA|81vSi2@`S%bNQB*(u<owL(3`7;s1
zPJ4{O7>BG$tm{6>jO+B%&Y9lgoCgnsFOTnM$m^37ANy~sU5h=3g8we=Or`K2Iy^%h
z{rUrOC&o;6$BPIV*DdfGnZlU6%x~f<L_xv%jv2xEjk;cI-hLX*28NX+C=5HC^Wq^v
zN%@-ST7~4OK3zNE7eRZk$#~tG%B2})cmlPFA5eKBytN8d@ArAB^rMypsSH?v+P1Mo
zW`B3-cGXjs`Dr2NriMIR{oZp=1}CcTB-Bu)gE`}X`%Ge28vGf^oD(z~&b`4jVz2ZP
z2&CJ_T_6lh`77p>?E8oFhhzw+#+LW?uaXZ(Z2jmZF-XUqx76Ij3uVQz+O!ShH{2IO
zP2?cXFT9p(Z$0u_fT*!oKRVqMWZDOz;RnrA3EOI`X3vmMXC9ZSn-<36=8!Z>+G6#l
zC`ZRq3@ZHxRhPd_@svKkGESrMbWQpJFUegbrK<8ZaQZd0p3`mi2Mhhm%ns0B>;*%9
zp3KpUf%rirpp%|qM6xDq(vB+u&oD6&313MvEAKIt?}gIMIqKrOvY*2|=l}lkxh9J{
z9I1OABaD2vXYA1QvO*n+KLK?q7PEZ(jm;@;P#P`KJbJ^C^H#7f>@<FOPl7<LhSXzm
zfA5rMGVo%oi97tW>adU)QX=z&UtC7rQ>$N!;L4x&OM@is>;~qaM*fGhY2=RNiBZDY
z*XO4uOxktlr(4e740<YN^tQ}{R;QWt&XsJJpl-4!%n%Hff+h}Ae3C0jscY&ojl2Ju
z;l5(AhGe_C-}G;$38I^BcfaONT9D%6XRo*IBCGA2reTrDD;Lh}xUznNmbE0sHN>l0
zG4QWn3JB{odQg?HQAcKd^vS#$8~7~^Ev(5IWcF)t(`O&bkL?W|wt^5Yber3AYwG@b
zTqhSr-W*<n)OMHq&$g8#*`BW0ht5#<TYinL_N^h*gQ8OjeW}5ax5d!A`#G(*XOnk8
zqw&HJSn;`)Cur;Zkx}DTssjs(D>Q-Gj$mRgi!q}k=HC{`W6DrvCHbfJL4r219~Qh^
z$9&SB-G8c)gAFqp;;%@8;e==x_T`TR>WA4N8F52rW@GIzT10^G60}vWGf1x65aEA2
z026BmpnzMPkaIRUHQApW)8VQtc-D7oNiOTv+t2ub69^U@P`dLsRCNmW#!z2+qnH^T
zBl|vCp_RUHIDx4=(CZ7>84qw+iw}v?mry&6eC3Vz{~6kCQV^I<vG>g%B;wTt{|{_6
zGs|B2=G7tC@L;xp1ER!C03#QUw0JmFoyIgl86wggPtMD*KSPGw4Wl7iGYRh6?(qP?
zQ%10RyPR~hYUAq0*`?4}mf=d***!`yB-GcQK)Aic*iaReeR*PfCo}I2^T0;wfHF>d
zt7!r(P1(9PyS_ZbYTT!Ayl#Dwg5x<D<`v4)7VLkT9S^g3(Ev8p62yK!{qk_;%6UHQ
z_G%mc9W~)fbhC9c6)D8WeD3ydx$vlpj2*V6TJHUq<*0QqhdV?Xe<Mu-&ri&Cb|4<S
zS}o6Nj=`)m+<}bP*%8sl4&^1JGki&Z1iNQ407?5{Hp$Kri@S?c-C#qO(Erwo#m}PL
zDnGWF7kH9UIi6~WputO=k{D@OV;Q{nGG{$hC(CE4s4ZN*5>oBGh__IlSf#`y-f{8x
z>?aN5(Qg*=b04o~^el#ZdLo9?<1WeTf9CcnrN3(u2_GVBbdN@lalFsoKo5Gw)*3c`
zsW_%LBn#Pnz&0QJJAgemxrJ?vHy9hdtMOKbh^x75hm7|#n6jyvW`E4Ltc;r~GqZ1s
z;k*?gX&ftc`<}49AcBt+EP^Kiry}QE$Lu5o0v0Pop;3^Gw{fj`)ZUc}&!0(d&|M*g
zFI<@+<4=9y7DQH22%<QhI5ZDfYLQvkBx9lQ;wOoCQRW}y1~}*3I2~$8LBAZ7P=%5h
zqjDZDCm_z0^s+lJu8`@JhYGVu<w*9&9J+5_Tz$b*-5=@|GG7;J8hH6MQtK}K8!x&T
z_Ccpjnz|8kPc52$T!D6|&n7pwO6jU%Nf?894-VTb%X^%J^gz|%TjclOfr8JHY*`DZ
z3!LI!m5=@O7ZDNX!nqSWZ>waagC}SWXjZ27@b|M$0NucnylT?bETd#P<8Qh}{@g=~
z#azkU)q|QvY5ca_sJ|-;c5dC20U7!j0zWRM#QUQ^&bw+LMRL|I#vJp7?5Q+$`{Iq=
zmsJnoS{mLhf2y-9?3IlAo_zG^(}vFoF;dA{PcEa@t7ss_2mCG9J>vA`=d0J*`CSi0
z7HSgzB~vf%=h%rPq`oF-6Sc(9x-<oWjg^e$k3V$~jMf}8u9?!6xQg+ffs%$K%um;M
zt_#zsCJUl}u<=XHGlD`#U4%ZKIQrCgRK7vYG;Z4UA}Rd}k7`;9cbC{DK;P<aA7A1~
zJwZ2w7}~bqyXUdL#ovk!ON<%sFv|7+ag7SR{CefmP`lF|4RK($K=&m)-7T>^fgK^U
zymJhkXJJb7v|^D>;n}d`QLa449^r!$F0!Eu9gIqC+1(q8BqPn(LyGyY#@RvZ*&ht|
zxa#z0iQG5Xr@8CVo36M_6DF0TRW{!WI+n-}oAvHlEJ!ay5H{ala3p#kJf6Lw+1?6s
zrxvp8TM{JR$KO8YdHurm2Nf1qFqUa>CTrb|h633O+jq)0dD4&4cKDWGi-Z0AiA74p
zv_PGPgQo*B&WuYTc4cv^tX0M)EXg`xmFq``cd!i9Gib)2w#zk~X`MF13TCxCI9g@u
z@LED6!M}ZLQDZmM8Ng2K#Y|K>Vdt=tV6Q1Y>RV=t_u#U%HZVr6Fiw78M#*yi3GvkF
zKs`007B&i*A_41{aWwWk7D|PN;h=a@)ThdbBD~~JBGhU67MBf;=u?k)z~$!NOs_&g
zlhOIRkD*D#$vFa#mLbYML@s$&315h_KgDwe_o2V;xW0?zIdFsxGW4{}2j3m5qz~XV
zqOPsGMvgjYgDEQ^l39ifbj5;Qn#GNFcl;u%MY{HjLu)vsN)5NYJ~$u$K|s<9{PQf>
zLVZ>LbX>~R*<7EH7%{VjmK%<UzPNx^D-W?c&vMV(!n&2cs$LmQpme138*tN`URf6J
zJ?@u5!xkdlXO1c-#=#jk#tln7XR4CcC}7Bu-UvC9ywOoC>kPc4NHOe?%^A18YbXe-
z8XVu_ErEh4O2&72EzHNKEM|A!tT$ypz;hFOs=<A|ydB4&Y*tOSwIT@GFzY)YYljhZ
zMUp-A1Qwl&eqw&tuC|3Ky^{`4A0`+uOSHodVW|pTM_Im+0D@YG11xKdO-Vhu4EDg<
zA14#n9l}n~*dVL<VBysWkKpmi9rntzUH;(2h|V9WVBd$2y>_XC9p7>Tjk0H>7+Imv
z0()4B*9wFk-Hkq2i%1+3(Z`k^knS!|uxIi952>R8&b8pviYG?^i4Wu{z@vTk+68nK
zfKAHWdiW|ma<(l%_^86y&APq3y85fba`XPj_$(_b;Mu+JY$%c0WKCd<?Pl#a`x}%Y
zQq#Mq--BRDR-NE6`zg??+xGX<f=#z`%PZ!{sZIC$<DiDsp05^Tu%caCZURMA?mN3T
z!2NBF6;RkG80`0RwSD+3L(5l5{z!%hEZkts9k9ok2!Wht68H+Y-MsM*>GgNmXy<r(
zqF@9=zH-5e;<avY#>*KKaKg>re696I;p~sID6dT>VEHcfSVOwL<DN$&b8E`_?#lxf
z(Dmda=nQ*y)fI-vv7qs`@xxRG)~qxC;bdyv(#2lZ^ZZM@OZddB1yNgW;kxB94O}q+
z*gg(-PZs#>PtHclKu|Vy_wFtR0NcHQJ#<G02xK7!uy*l!-;?A0!`o;2pJe-c>KM%u
z{rYahuE?P}Ry?@f|AcUsM2^__!p_wmzbT$Tr>~S{KknAS<g%MG5WHu9Ah2n-6k1le
z#bftyW4y9pmG29Fy1%G*5kV4*>3b7?zFPluoO|6++Woa3SaqlULRx_#PlT`BJM;4@
z!}(PK(k!5JH`>j|om6(Uad0<KT(9JTX-mWOke|3rI{*NY6TN2KAp+!n99N2K_<eJX
z4KNImrDhMv>?Y(@i>&9vdxrl#$VG0`@r*U;%Z0hfRD(lX8E&mXwsl%M%#j+~YWjy3
z28Dfd>9pkQf*SW5Wh+r^LMUveZTob8q1#Se4yh8w+ddouQH5XJ;eqJ{+n)ZRi;tcF
zIv9;(Pyt8b+B*NgEC5u1_t01$^0P(2xLrJo#CdmkQbfq(6TbiY!ud8>mizbg#b5Oa
zVxB-kROPkuFVDt>hW5CNx(BQBjWcDAWMzNeM7ehf7FSKT!dMs=_t6xa*%?WDkU0=s
zkv($a7LKZzY8g_fizH2Wz|0Z@V<NTE1(c%LJ_o|Ph66o$-T9{_<7+zvVTZkR1KQsx
z_)zjOrA#$-BKXhRY>VU#x}e+%{`0_&(*oCGH;3oI_d^b`@I0R;Iiu=6{wT)69Gcbn
zZt%^&%DgyX6SqxGstprKE(TdH@SQFV;|N+rkCotOgKteTkS@Fd_@K>_=xwSvRON1S
zny8GZ;t|(L2q|#r!udQ0D_hSX;UMR1j%C#dre(Sv)~s6Y0)#KxxskW3ayG>(8*THO
z?cXjmvW(#c4HCFZqfC#>28C58bGhyU0+AhP<B@(8>f-Ewf(N$t&SAatkcx1!si%4P
z38sw_^E5sJ_dX9mri-}<ffcplQ}Gm3Vd5<ESNiZmB{tJzcgb^qH7}lDlyBQgw9cj&
zG(Wa}?F{RfX^;&trL})QcKJx{@@*u3ui*fWWuwtu6xOb`0cj(!gtW0*n<~ZaT#Nb)
z+3Hj^2?S{e-TDnu3g4P9BOn2gVI7t=Ig$Mm{J<tsL3blcAn%H^fwW4C0dVl{QS&;f
z2bef*eoJ~<XI{)UUB2yQ4!Fp%%M=2_09K9>%XpJ}fvTmmx94^ShbfqzQ<f>A?L>yP
z@=|3ceTW5p?IkTwqsUcM;WjX)s$w<c`b2fA_{>e@g=rb<5wGiQXUA1Cf9`5Rd)_J-
zoKR?6Ys*q!>GH}*b`uE8Vk$yjMtZ3+Pd=up$}77!ADF~D{oHyH#Dhm(t5;CI<5=we
zm%JOLMB4ZM3d})SpV!xOK3t^p%Y707cpZwp|1H6oW7A*iqD$z&fmutVtsj%Ze@G!M
z(}`<Bj#FO>_ja})<1RUl4jrHoe;7)dwL4`ObSQoD{A^f4sGtCC8DgsD>TefMJ9f|d
zpfw3`O5pG7{P?$xaS#_aAAQ8O6<tQEMPP!kDLF+!-mgL_yS<p95cBg>0{m+-&O`RG
zD#dnKr4l$;e)u#AlVx$^B@y#GW6mOGc}TIXS=eOUT{=!{SYN?LE|w9-TNu<bPEY9W
zjL>~M&SS|1Se=gtwY2ZBJux3yC|Yz>F0((;f@pygd~_U@@fVc9HYYNpHGyYo;EJ^x
z!Q6n2AOQID7V_lzu-e_hJIxFlE?DaBxT`hCz_kLZB%49u<Ol%4PcSSD%oh!Zs3ZY~
z1d@z&T)L3}g6$MRukMK={d0cZ3KjW>BmtHuL!heyEMP0T7UW~Q3G|a3BowAoV>LB`
z8Vw+VYMlrG>E#U&3^cPC<!G$#g50u1AwXM*q1jP8MQVf8bA%fBWxZvN<dWbFnuH|e
zo4<b8({XA-F!xS|trB}3Hm;c$W`s@Q&hX#dVuM6+eiz#!xS}OH?Xm#xjV<+vmWNa6
zlnza<;H_R(nCFt7;5Trn-X}Yzzc2uzq&CjvpIwl<l|x9OP`VagZ(pt6VOXRCT@Coo
z2Ubg(gLRHSMC=a8!ro9iOoh-G4JRWSSM^%<5ND1dRFfdK<?--KAl2XZ6$gkHF4FO}
zV5NfNOWN}2S+v(q8t5vufun;pyc_?^UL9X{#yjAV2dNQ;Z97c<kY^3%Jv9J%D}T2j
ze^m)KRD8h$3NDKDzIn~(_4WsuAOwp!CmMBhKidS`fAx_!1cI#aIoL~`MS294J74X<
zk`2B7yT_tH(1o%ElysSW?M{?ERG1m;9u;kRH+G(A&Jn8Q8|c4lMgnMGd<!IY1(|(h
zG`c5z<MWc<x#|=-W7Dk7UOB$T1{7wlFj{N9Wd@d~nt{AeSxwhGNd`Oaj%-2g=?W%l
z_E4x;-zI2v?TOuZO~{oF(5^-b@FW~Ot01z6lk)02Jh)x!eDGNi7$bi@`zv~PW~DSv
zs0>>JAl9IFl$~dj+7)O}gq~Fb;a4?>t`2cx*lDG#1nN%jjuYOkj94WK_(#a#%!4|W
z>f$&etYj6yg44>Wf4%9aT!r45TiUkauF#5MV3J|bvZmz_U3db@#Xk^8e;l}$>w1)J
z+f46Ip&4i6wjZ}PC;PTcT_UITN2s8;P)wtE>W3A^2s_acPp{@BeT_`pAJI333Tf}1
zxP%4we)Irwjy*It*x~9b{7Lz>0&zga%MJ{BxrwZTMKf0tzLrXnlt41+@&Rr+P||>+
zYieG1R9CYoNx{S?f;gSK80GxReWTPwXv#Ql;SQh88IC)z;*3hni6<)Okim@)YB%R)
zRuUVPoh5^f&Ql``(3jt8W}5xmA<xj2D>507&%7~-_Dkz;COu14d86!fD`L>~u$diJ
zSK%HV;?LYck6-+V$~q`);}zyjtJ<J9x?py48=7T>5j5;q*Lt6>v*((L@(^!+hMxud
zHFQ;Qn?=sREx<t%@M8I$Hf7QWskx75Y8-43Gi9ya!l@@5G!VWq7QoaqA{p@>Ht%=q
zuw3p4=k#Rr_VOTXj&e3ovL5`NAzbBC>68d-fY?{BRo`};S$`dC^Q;xx-x4f%XYVQ9
z3T+7y;@%6U;~1`5Y&^ec1%A&TWD*zJw=5{d_)UU$1Dot*JbBkdTIWLz%+-*^xa&5E
zWvrD{+xj9934zNJrk}@NfMNBH%D+)fk7505y}@L}-c4MUF_L4oL5^kNl00=f(jjqt
zz?^c=y(UApS9+Xrwj`fhb4sQIscR+iSBN}RUMS$PSG@!`>Ms^iXf)G@5npkZ%4tHq
zm4WT*QEHH4tX=d&DkXvcQr-hos(si`VdWbP!6=BR3^4+X4~(&bhB8pMk;*Xd*`CeG
z%AYwi%pBCti%Dnt=jZ{xaf&wO_n>L*Yp5`$=_L#Q?&ecTrRhI&yL-nklgIj!uAf8n
zcID9=<>Af#_6^T2)icjIC$hcjsX@ch%doYF;fM}5(Q`1fU6beP-FC4-S_EHqd;hE%
zYL`f;U!zr^hB=QkH4+XMMMRjekg6i*5_&rL_b&;T7NJy8wBY=3Dgy~4d{v6DQzw^S
zBY0<pjKU?zm{A4mIMbFS>sn%-enp8v6>c~zPIS%0X>AGdb^$h$H>z58^J{ldOjn$^
zdEQU{goVH&P*m|f!}g11YZS%OQSmr_z<StXVd~q7_9d$c=(?Po5BVYP1d3f)A<b&e
zp2>lf9`ib;oQS=v^$||NPRVk=-%_?={>E%bVf|TQFwF$lU)I}N_cSZ=Tb@?gY4H=j
z+xoJgCGX$rvto8xcZr)XOG6JrVfSAi{0^j4nCCc7;p>R*A)M2ts|rPK8*ZCvRu`ZN
zo{^vw)K%rWMkV#7@u_-C%1Od#rY5V_I(L-O*&Of5kOy7Xcww8|={u<y`xLVrc~<<b
zf2!CU<qTG~O56t)Ac)`I1j8eJHKhOo&E$&m-`xQMLDI}rt-x0|wt&5CSU2%&m%?sB
zFxcLb|49&Nw|MvILeh`0<7~)-6n>XJc<k1ACE-Mbtdsm8t+(T@R~HDfaAWiCrW*sS
zmbt-H4c2VkB_rtFQR@W~@?aS7uw%RIeX0QbgTe{uQ*?K1***k)$Wrb7y1p+7bamDH
zLr&;K0@%b;1Jn2F5&-Ospf9G>fa_m9Yv=gVCE6cyTQB9g=ROpzy&)LrBeMrXaELp;
zb|=6bva8Ge9ZEbt1VzF5z5g_)U?U8VSwpRc_SQ+iG04J?h1Vz3XAiC*Jhm3~-H`pL
z$KQ$Im&@GzSU5mCJn&&UY&UH8-r+_H1hyfResHCIjasLP<lZ7-+dIbsf^-axp`9<X
zz(1_m=|5K6L)M8nqnV%`IXf_eCrX9cnt%difmi993+2N!-G)2sr;e{CjcuUSm>CZ6
zQroxnS*L{|kayx!jF>a=gG--|h=?y9+Vd+i?ER&)3R|4c-G{>|-!?#=T7vYzke&um
zMn-AhO}f4BfKT5~qYjJ*rC`=Wg3yA?#zUdL(?<q?04Ghlb-IP^n2GH*Z?IuSL&y3U
z8}LsMZugjSl#d}>XdAXsM`s5+5TuQL&C;H-e)t*=dvX#<=b5$qOh(2e)?+cW19D#i
z0uTB{tjl|x>Q$sq&ahWP*M+5v`xb6|{gnOEo=LBCkEU0r+oRoD$OT)tFv4l}&g@{q
ziDvpl$T;RNSX}jKGAsy?5m-TNfsv;7=j{4&C>aEjgnbx>>tO9B^}(}yR%V#k4tL<}
zUX1C#0tzVr(Q!uaDK2Q!GFgY6>bmq3HN_u;h<NiWGne?CG7~CpO115UO{PRYHs!3P
z$%{i}Z?ouKN3ZRcr?87f)Yab0N1J;puiuxA%dDk!Wb*>l!tbTZ3<S8YpAz<LFG{f}
z8M#v;-V(tF)8|$qL<(qZ^MOj|jM!0(L=AHMb!W*<3LpHIIUEyveN2e-6>>?fWj?1b
zZ@C}#2v%Xvo8VvNeZs;f<y?u9!xe#!>LsjeUCIrdO&<3iC^R)0cE9;>;&_^v|KU_q
zUh0LDJJWy>nWiH5CH5H|)DugN`WD9$c6`lD(Ll8-)gn5w=>(cBP7QIe^9FWb8L<%|
zl{@oAX}usj(Ev9*_s$QT#mc70XBd4?J0*%O8W&RfVwntVl<8#(ABsz;X-}3nPcY98
zdx1^m-yZ=w4i_Nx#|Z1;YeMV`yDSee<Lxpp`S!MG$9vD81QCexhCazn!PapU$nOH-
zjuV002djTsxoE*Q5xdo&ZfY)!Cw(<W)KSzho<{A;Jcl=gBI)m^)fQRZGO-j{^1HFH
z53<G3B=1N)O(nhvMOLcl8C6LUCd7KT#Qaso=Gy=Exc;v9#q%H~Vt(^Pj>baSBKnf&
zz`lUFPAizIH@Ir(x~x$t)wXqx@@6oaPW-3hZepc>*N=<u+~vE(0+I>CJ#PwLkbNWy
z5|nqQhN4I8ZI)tYTI=}v@Y^#8a#=%2tDOz`wX%GQS19D#Ax=w5A)$Ys+t3eaP)x!1
zyW_m6RO2FX-R}?3mfo0=g@CHzQHd*o!j+2(@jZ7lkqsm4X}SiMg8LYE57`{obU1=&
zLxQOhkj#@Jf57pon{rvLpXTqa)T`oP3%lpiG!8Be;V}|MX~T#|83X#CiX*FrXe%n=
zbT$?T_?A2eWtqo!^&iSc77u(F6uE_X4KMZXI^Lmp9V^@Y=~DOHD|iif`69DFd9$MT
zYQYZ@9Ns@b9un|luCMFj`DjzoW;`nqOaj_n#=x9j^0U`uo2-Tew&N4g2*#c!b2FzK
zQqbLhAH8?ro0w7^<!qX9)F-{J()Ux@>O$5NUO1<n&3a|>{GjAU@LAZZt5Oxq2Q-%Y
z-2u(XXRu0zs9Ms{DI*^~FbEdV7n!lDQMIg2XUw}|eFM;_Sbz87DBGOa27h|byxXYE
zsI~jD+^k8VAzcPn<}K=1`-)Hseu+ptKD^lsLaXW+G)!L}u@GjL>C0#I0~E6i1LSVk
zbI+L)CrFg_>(tqBouIE%ijWh!4`+(TRYKP@^Ui*V=laWE#*&p<2zjL;(}NslTh6}v
zGeYh^eJZDQ$D-%s-95+<yPcr(tHqb0j&G;~UptGS9cu_Ea6ks8J9QA{6)Kp7)_w?T
zFMkZ|CFfsZgNEjSr}ov2z>+0QfK!9+4)Kvde~TW-US$DoZ-)mM2y#IM1Y1S<U;YIh
z)K7W7gsnRKFz%pGAa-Z%>D*)n@gEFK&N>VcK%nJj9dDf2HrI+QN}zXRKVrZ6pW=1%
zx*wlK)v9nj?RFv!>^(gejIuvnmQw(aTsFSKl|sK_ev~Hcc)@)GN<`@9dCM{p3Al~a
zV+Ly{VIcmQKm}~hI^3me?~BS|1PpQo;A_?k_9@|jRtTcR(v%FINskUz3np=~)aU34
zkw8iUQs2`I2U=kgj3E$^6zU)@v-Sa~xuFXLej^5u6!3ORtjK`eukb+Gh%nOV<kbuc
zZ;Ju|XgYKyn57cjtwjaEBmc*$(lGgT5`BCbo^il5ez_iUeU;vqPxFHg(|}HddDN*0
z)^Dv$`o$>|x^riqO4{qEeWo`&G4<*0UKLmEa+@KU^zn5I>kP>79qY{O`Z)=(eL`ac
z1mJXIYcn3BTU4)OtB~~@T8%p1Cs-yO6;P(Y^UD&t_qMX$^JrVb1go|%SupKZ3duxp
zYQ&Y}C~H&_t$!JRxlms^B^w+<P2&5%X;O&%MiYhg{rJ@7jICm$=_w1vo?1rP@sP|s
zEyd`FuqcOBN0AOE3i)eIZxW#M&(9ryh|!ng?CR^{gowCObvyTIhbRgTJLrO*+?xj~
z_4S3*?PltNCu15MJccgVrc13kX~mb(8k~WDr|JtVelA6%^so@9Bz>H=HzBeI7f-M`
zU#PiDAINVdQs-d^!n%I6F})_$7cHW(#Tev8o1cSfImfLi<7ku66fPk5)#;SN5DMpv
zwieKz6GwW$8<R(ybDHHl#vGqjB?)1-Xj*H6-OnU2E_lxcXB^HdQ%rsC8c0eq{Dx09
zn?3o@*V!IZp=zQfX-{2>snlaZg+^m`S)OQ_IkZkd0g2AH-TWo!>{u5(hDu>cK^8`o
zjQj%q(z|(723oJ|^mOyuI|n{P5tRw~Q@`Fa&yM&P1)^Cu1<tS+cvAKwLrI}7Q-+QN
zQ*63Uj__ZU?Q3yPYLjPoA4HOdZas2VoOZF4*%PG)$t2p1V(H|mf$Z173Ck)M>{e<I
zC`a;zWPR`stZj4T66{Y>#n%{*K%sXg{bxl=eqOWKacS!5gqqGMEqsS0kERO;0+DUf
z-pkBmA^{H433<m~d~T;cJX%bP>QO5f+E=zu-Hyt}qr>KOorY0dAS(66R8-Q1ip|N@
zYTB?V(w-DeoFwcsUgJfkLjxmEBv^b<y>?rQPbObKhknAmZE>7EXuV{YyG~g3xTCnD
zW3VZDiL`4EZ1{mJC=*Wn4qeJqUvRh@vo|<<`8&h<EC+2UDc)GJK{$Rn*C4pdl+X^S
zdot!XkhXbY=4eNk%LSqri&%fG5$noH_UN0j`jj*YZS;|03Nv5Ko$%DzEv6CwQGm>+
zkeyYXF`~*0%%8_YseOBuM(spR{rlklMNeQqF6=IS-dyYG0b`X$E$?}c3l_+)(631V
zP8#&M8dn7;rCcDcBow6}4}HQX2{_H9ya$|L)%7PDVSbiisWa_($2tDY8B^5x`Bva^
zEfQ9Cd<gFXj4*soU4ixB_5ir(?r?{zeJb9~|McB31A#tjpg==&X8q{jj_VYET(lf;
z(%pCL`u7mf>iCqvin5&<R2+ZO^cey6d3Ez>UxO5d?iXJpFu^IF4@h1AJf=AVxHd2}
z#jCJ2iB{z!esIqf5rZOMO8%ppSow<VJ!-yM&*v1nm+pwJ^36I_4Vb`y)}>`BRyAA9
zc82vxf|YDPZRaiM?qnP_pGWC{(j}!I+dpeR>%m9arB9_ka98q2PB+pI`dnj5CMZ&_
zwTlR9HwU(pAly|#`M4evi7So+f@`GQYH8t~HKCCxf`p)i(_BZfcsc(5cEGQbJpV>9
zuu_pydG0IL9J>?~$Lf0t%N3_p)GCXg=v&cRRGeh{uiBySl&k(O3jwm2@zE8w9mx(2
z6De!0k4rHOMGm)rtj#Wd_dH=zm>V3qIhEyUE^}NzfSJaa%|{@Znw!FLNAqQ>?R&_q
z(g5Zrf4a^!Mg8~f&{!o2qj-C1t524g8m$p(LB{!fcA=rnW1rt`iF3<rBYVq?Pe1VI
zI`w{WI#T!`e-_!KEHTM6D}Vi|BYnS|Q;p^T-#_cKuGxu<^c3pPzV|qR>lU))3fVIV
zqz@8oCctu$Ivrs{AVq_{#AHku1`*m;KgG%FAm+mQ&7<rlziYv#q4cR2X{q=*)o8ph
z(#bEa5q12JFYc|hUV>dply~z9QiXn8`QaiVuEd2f8CNOO;?S=hngt{Y7Gs`w;F$yO
z3H@XSjQHo{H>rclJ3R`mi`kPzDH}o(e8G<e9-f7l@)ek=ia)L=2db*-C8axwD?)|}
z{Xgl;#^%C!RpMAe-qnOT3VxqSLq<JTZXKpt2x_rPZX${!?B;jEP*uaB^hV135XKHK
z_T&_eG$^3eGby*6YoG}+opj@&IFypGEO@XmueMgSh3bIwHh_qvsMeUkA`%BbbnHW@
zwGGYuA1{7eDVE)YZvsY^48X!kMctwq&{)ovx=%V2#+;5qdae_biW?WF?f7(@&<qPt
zJJ+q8dwC66QB&04zYH1__f0KHnnl2P&C+gyRCOAz9N~cPu^tp%HD}5?CQ(ocHH~8i
z1Vf(o)a&J@qxqMR8tII#^=P_iJw+T14(W_fK%Ysk*7P2a*VH?l6U%ICQyuTl>sTwx
z4+aWcc6!H}&U=W5Vl|g1{=U;S7b2OfX(#K5Z8MCm|GaJU-##Y~H15L14cn8X97q6|
zndcvY9E?BWnHG&V!gzi>1W_i#1mvXz%U8JplmD%>nf?Lg{xIERW{F@8;D#C&cAjAd
z9U|=QNPy1)Ywvr)l6Y43LD`F3-4$L*P-qvg(%BSOpnv=z2Z7|O3dm6Sf8vVYpEFlV
ze^}q;QhQshWO%yjnF`~6ieQH78_w$G?YR9^%MD4!lDT?!8-M=PGWs((jKf|4v5Ttb
z{$P@QQdK5OUt#7iKmO{VetU@jwewZ|Qa+u|6RB}#zvFlaDRs3Z|Nht2X~NLKP#Otm
z+V55K34c|i3#=5$0^W_CkKo`++9#nxlI0kE``N|06M;EC6ydO;iwlt$`QMDOu<`O6
zK1<GJ@+@2@z#r@oeud3r7dNKF4o(y4eXH@KXok_8xcP@2yMV1J8c1PUT*I=Gi<&t{
zKjbDfW}q1?fYY&inHUQyvKgES&86$SwU@+ARjV(+x0;H-%D=gkU@y)=3of|$z&70l
zC>C6a47<NS?iCy>cplN%U@WAwxgb&g1)S<7#88YAQkg#FI%(kjxhp5QJaax!Ht<*K
z!SpKRV%Xt8gm%5zp>;TUAu+t!w^}4_B>z+JTae;1A!e`SvC?DyNZ++?1^3>7)ZGbZ
zckk^Wo4?(|!2pd-(uZvPEsng(QMPBZh^+mey-&nZMWnKBc<XH{&!AD|of1q-3nFUh
zlp>#1QaOH6(|mS2ktNI>KrE;=6>hpix?efmkT;ILZfUe_Q5wrky6HW-Qk4Zc8;J9|
z{HC7iPo)9KhymyuMj+te?-mm$d*Y#1&vT(rmWXg4Dq8u%ssx7Rn^ObEEu_Rx*mW>z
z+kOo?FJf>K%k0|YfRv4Ffo4eAs0+u%Ymsn_3j<es&M#ljNJ2UYX}SH&3OdVRW-ZD{
z*b^Mk@}gQrWD~7+0Kb=|PCp?Wqv2KUL~>;J5|<;n@s;i4t*VjCayliu-j3x;^^0Gz
zuY*wiR^Dsdk${SO|8#G*T&~FTy=z!;yf%cXn9EPb?Wdr`mWOLt8TJu$6QOPhql{-!
zT}_5@Q9?L7758@YVNRStTHE+>kx7wr2JOdF8ixugtD|}Zg+qC-xP=Gd_PcCW`64gN
zrx(v^deJua-4xQmv>lt|L!-3`jkKKm=ElfTtowvLEQjVnEy)ZwZ2IU>VR^a|K~w-5
z$7oQ@>X6{WH6z&?!dLtB+ky{B{eJ`w>&X>9_N$NTRa9n1kbb%eVCO}f@T+}|a2B<-
z&J=55eWl1U<YD9SdN)z9-PHUl>zxtm(z;^%TQ+rx$?HfBdw0!C6jb@1^j|%~e@%Nc
zgzsT`ERiSJI2rACfT(`r9K8(BFOL7Y3o~nCaK*S_MB@@86W+bRW+A$f1nhl_06%37
ze-$f&ibBME5CZmQIS5D_o*@dLBiB3OfIZ$$s04~MjHmn875G9}m|%U}2i)C<<cV-y
zd(W1@tCUu8747ADHoLclzlwDDnK??|kI{B=<nGx6mFRSHNm(AZ9o+{fQ^yvasqBN6
zHj}^oU|TC=@!K1$aksA5qZ@HYM&#OR-n?uQtFtkS;`qr)rO|XKS@%AujXO-21N6|W
zq?8S-82De%z5{9ZFn`*z!O<iRd4{~BQG;pQ>#%6moMmBCio8Y9ME5>)dzP)2HI=xO
zeO*r(O>BxnBMP5`7s?N$BW?KYS3gl~!y(F}WCAGuZU-oriRp?b`p7dWnzOR8v;OPA
zCFt%_TvC6SJ6YmOT$F;PpkQ+GHUnZ`1s++Uey&OV{Q97E-lr4qp>)z5*EguuWk;@5
zI{Dha7&b^*mYqt}de`5nwdw)eELEpts8G}dL&l|?d%7fw+9J!vJWJAY(p&I=m7dAE
z>?rqgIMtZO5{<sjP4^90H(Qf3H<NPWV%3ve<J1k4B~cZdsOC5N`aIPGM^>9K%gz^V
zX)acr(B{sYV!5O*TtS@WQnHz2KgD&*D~?z@>gjbHqye@GsR*=FF6ieTBk2NO;|~2E
zU)C86is!H`T32ks{`5F55r&L!k2hmX$R?U_Q-~<cpPw!hbLjip^OtM=Rxf+gUh>}9
zYkrawG+dOTS)Wxw%%#qSv*}w)HlBXUdh>;UoLV>NV!v?UD1lL*vVl_rnKbk^WzCKq
z(#HnjtXzC=H%Fm51hELUJ%2yzYS)3v`Ni}zd8zpM?nn~^fv_~}#eURLmxey*sH@I6
zfSlhDg-3Ez;DgLrsT?K>Ot&2M0d)&ZA-y*2W)kS}nTW9P2h3}6xlCYi!8bX%ZS9Tn
zSJJ6vK>9A+EBfGl{HKPYq7@9q@oty;KW--3`o*RK#3T0dk*SLn`()&$FzB{NPLgpd
zqnPwG=v2MVcL3<!*K5=M*%)o*^59pMVe-Jf(V5C1%`GhjaE=K@Ld=7Gn=l&&Sp@1y
zs>^?IenKB&8u5IOX>r4EL2-C_|9K3Jf^fTj)rza_&~_>vQDbUI8ec7xx^RU%C5hTo
zy8W|t$L|lySogmQZlA)9{;XVOX%`T2O*oy{YDAQ#HUHt%le`;SXGJwzWWr>B>oms3
zfXKdwKOaiIK)EtLgEf+4&(s3MUxPRm3z*?dY~Ms%oj4<5EtqU%y^cMo^e9*lMjy}P
zMY>Z{^G=n~mnXkczcj>9=zfTv*bF}%(ui)El@@_eBxAK?pQgFsGtnty&1!F-wK)Yf
z7sz5BzJL&)H(H2}ypR*Aq?32TLe9QgV;$1A7zUjxE{0SinwJN^CuzSM{=hvh5^dLU
zNf{B`gf0JZWh1)#pEFl4VaFsHIe%U_461Wu%i$uB{C7J55+=!lV!r1P{ZjdY{Qq_t
z-p}NRU#x~F#uJX?K{luyA?qP4O%c4E7#i%*P*^@;{7iT!kc2s1S?K<z%M>tkvla04
zU%Cz-RegTCTK{s;1*BJ@BvhhExzp9pnp+q_dBpjbIXI`_FV8-$d?3UXo1ub7;2I4P
zfWEvL3?CMTKFgGX0Ag0aJAdQnmXl<S6VF>AtwX<(6#H_t;;0v|eg=}KlES(0ky6IT
z`{(=~P~sno%&3q(K5th#!Y1jL#}h{PGmieXjX%soc!7^Ak^8W*K&@#2L(#wUpwe_*
zA}Oj-oMUFAIrsP?!>kB3EajzH%mhX2JV9p+8-qoIc8%DdaD1eEbc4@k_U|jl-E&pe
zIOWST2`y5rlFXd((l(WdX;-k&Y)n+)l4Gum_Kvnvh(`IQvt)J1ubEmnk#yL0jh`4j
zL?Ob0h~8W)g=y<H_Ip;oNl3tSP8E5366^$D-VN_lEvQ&v+AxIFq|YU}A(Z2^lqtUk
zCq12vivQSa%uSmTV}k=jsdnBqt~l3cjO<E037(j+vKg}Tq9-!f;GaF8*ZZ18KNdva
zx3pgAx(7ScG^?wLrscuOq|jZY8q~SMW0yH5`Y6|AZJ{Zc*Yr&g{`RDxRJd(;Yx{zK
zN0XTOn)Pr{{n$vx{*~p=X(*$Fxi#9YqROn+^=gQ0z#Jvde&ZY6;dDhJ$$)^Y>T=9F
zhEbj%q*pk=7#kjtKAy|UhHrv3J%ze2fJ}oz>imWplwM?w*s%^PqmizhbJsTm=Qg2O
zHQ&dhWiYj%si1}u9xlaz^dilx7NiBoWRD|Tn2HBEy;99q#DKCmgoHlE{`?8fb{xVf
zHJ6)0Z=O`c)A>d^nB-RLQd0Ib4L=w5@$SQvNMt6!HR=~D&9G<$P^(6OrSpr9$G#RG
z44&>}$;pDSp}dD7^Kdhw<;>rSe&rvI%rtW`*o76Rm|ZHqhjulWz3li~U4l3UcfjC<
z0gawhzp<JkE&CkHI5tJjRTkVjymX3v=kVJSA%98joCC9A%rXu=ns)rzh_b1k&#fB0
zla8sIl~IJI3gpJsdrWVsD=goUC>A8kqA+A1Bi##GW757KGmXI5fXxSN%J9u*$t#2p
zc!mGAK#S{x{9J{5B`Aa<Wvl9X&EceD*sw=06~=s`QJnp2@jVgE#G*Mu`Wkq4f{OmY
zyNN-K*|^_Nsg>EP(=cl$0vO>v()#CS-<_*Rio!!!5u*wInPPc?OjW-*nqM)R2f~Ys
zIy?O{%WxadUdNgK!7Y0~mv&&46%)G(Lr}JMriFVwW1OhG#4OO)G=FN(YIbHtLY*Q}
zN%z~KJGlJ7Nk2a?8p;=PD0&7zVCLAi9h7vrY@eTPXfq>H>8-TUVoEto9y&O&IOgw7
zN|clycn%fliP9jz0l_v9;vmBQUbXm)5~p-y;vi%sCBFZ#Bmdh-V6GlD&X{TXy0`JB
zrT$*|9NrRt+gVMaYi6W?%N2)jGHTst-FvfO$%2f5vmvkGB5Y~^fZoA$>6hsLjH;Rd
zHWj1U#6QZ)M|=}&X<y39H5|~12REH<?^2%Z&m&UVFe=E(9<TXr5Try3yqE>knuqp_
z%Z08eidrY^*ibmpnAiC*0DlJ5=?@J=BEqP(So@G|%<}t+i38zCV8GD?5<z^+^a^6R
zQb&yJN$tp}J!2%4Y1KjqS1^qM{7W@xA@GTwvkP?Di{=3$RhqW^Zf+TOq!UM+Jcre;
zql2f1s96u*sp@pDDZ%C1hHzrR!Z+_6#g6*NMHOZzq(`tPOL{BLh^@F?JSFCZ7g@xC
z!RAwj-gwW~g@kLuAsDkLLDb@u*sm2EuX-=<-Xd$Y+sOTSEtzTj-gnmy4X9@5o}Jdb
z1}(P}9>dn+u!r1FsaE|Axg^o)j30@*rWY9<tG=p|_`lUoHV-|hp3RJ`XcUW8TZ)Xn
zSV6xOt6n-Ze3XSAn(#>ZPMqIL|C~Gf*+4snD6*QiMr(;|DZ+)pacy4Hbxq+$!2RC#
z&*imGprTd7J(PiVSVJu$tDP#F+gIevVlE_IS7K<s<^k$4BIO>HENHMHN67lu7pYr*
ze)c@jM?2WzDe3df$RI0Xz-JW%z)+IyxQ%6HIK#FQ!I+yZfdi<uqJRa`*e;nc)DW}T
z9w0|6ejit{ntnswiGl(BB1BaO$LE<;hWL9zGu)pxy<&ksV<xV8Kr`ab>5s<GddKHb
zOSUJ}A#aYQ)bUA4>n*%V(@kUJie1_4t&b>U`RuVs-(a?81weY=UpdcE(%kvXqNUQ<
zBD(vHz-iCcP7A4tk9_?b9;WA8qF~s8J4Xkg)7)@aGM8EW@k(+|QeY%XXvKm}ui`Mm
zqZ9T6X(vCZY$ESf!~#>N&=$K(_DGeaas<zUe&$`9jtLR<!;kEbPOlBBM~)+uZ9&a8
zluSKXza=@uuYQvtp1YNc?lsbpqHl`Hync`ECMFV1KZRC?cHPs4h!^M7Is18JQKwmB
z@9Id`F0`5+zxN(XH5?uGjNb#a&Y1FLJ2t5Bga<B>c6wi;wb_IQ5<@AYESq$83ArYQ
zb_}{V5mCzrdO&sZQpF*ghnw){?_kC)%nV>MQ-Cp{pY6x={o@C1eJ$55n!f8UylYnk
zAEc)57l!8ux`5Aucsyi+;r&MuHHuH}LzdAF!3ug9RVuh#$sfq>VYB4Zd{?Wo9!z-r
z?BPfIFnoJKvVkY)V(lti{M-XHAcgZR?1_NS&cI$k?Eg^0{~{CrNUHzaVGzS$!-AB|
z1O={-Okl}G*f=#R4uT}haO??~(MB@e=)fb+|6LbvCr2b~jHWv<VDE<+DBC{(EIF%S
zh9Lx_Pf|(##~yU3<oMI-(A4YQOldL}W74VxgwK`oK_Y9KlB{^$f0|vqKcxk;6gErM
zHJk^2XwsoUUpYr#h;@#j>jbt=Kbd@f$Z0ccNO7X=bjl}?=OYC4Q0j&JjX>lJ(FRbw
zxV)X<v--6bmf`$Uq$WL_Q$^!k30u7JAIky~I#Kl*bHR8;7p~R@!PC(^uYb|V2L=7T
zmv40agr-6ItT#AF8#3R3DMx5^a~RXjxF;4GkT+@&&bocQ6A$(g%#GKYTbeTsyxoUW
zJ~}$0cXF~{E6uwY8sBKB9Ck@gSXtxR9RB^DHN$dSkH1%UTR;73*4WTc*V>BE43q6u
zx>n5u%~;o|5)`Xf>55{CB`9c1U{%KimhS#~<v`pB+ua(1r$8ANXg6(3zD+ykyJX%S
zzHF6vjoiH;CmOKdsO<ATo1c@{mJR0JdDq#U^Hm0~NumW|G3Sb>?vrt++KY<FXyw}H
zpDkqP6fbYe-b?y{Lx;a%<=R?52tFV^@?Ht2QL5^E(~0JYQ?E~EZjCTP`QmATiWD%#
zU=`Rvau(xS#S`N5)er0{F%rK+xDwW?Mk4bSV1z&!TC7sxgds8vcP{_^exQDmCmdpb
zhvvk(Vb0&-*Sdm596hKg8(_&m7&SXy_A0!tH4q{8KKA}10pZO14{en_ME_Kqkacaj
zINOAZB=gN6-8peugeP{7r0Lt^uE`={8z#&w;3uV4vu~qV$xC{xS}9)#RT5qQYKIe0
zrLvWilTip<o-Jr1?BzQqM&>I>)^7hX#`t=#?X?=~8*7Y_5z|`ytD(4F-B@r2gfB<Q
zwz|#xc9i;UBQdMYyI=MTV^6dWrZUw;QaO~KY4Hch359NaX#O<>{^}!BlI!0^sd&tJ
zE@y5NcD(;6#9)8VCdP%V)WIUE+_9-V<xg>$Azq1=2L`z^DOt43cewTHiCWSP?iqco
z)1L)DaK@(z&)ycsbByuov8xn@RnOhkA%H&x6fPLP>-t>e8NkhvjXu#!t7s$xAej=L
z>y?kN+rGKq6}$dbxGQ-$Mbxaaa5_nzLV1Ig(NUrh2{S)X9O6vu_FhwVONQmMzq?<K
zuQGPM$)hxHHEgr`q*>)`NTN7CTiR@=GNvbQ(jDh$*!d8ZlE-=!;(OcD*Q!RDi&jO_
zOyp9Bsc?Otx}&*>pN3!N=}hkmzm8&wKwC<c;tG(k*ueOL#QT;Q9t|$!t3(II+P{}d
z0Qjgco`j-8@eE@o+!)U#wf=keF5J-0<_TbS!R)-~)pjp8hN5sE(8fj03NS%62+USy
zsr9$qdP@|_l41s!XdSX4_f%Pg3N_EI_5AzPUwlZMl|G&oSEOSUv02qTn2H=GbA?Jr
zE!MnzicwJGZ&C4O#$Ge0e_ZYrerOh(vJymo2$`?K4a>0}r-=7|oHfGL#7Al2h0DMl
zm{}Rf6osY`Sj2Irvzpc;lPh6J2ZQm7odXEz*Y|NykhU2*i&r_;-pQtwwEm8bTxpQS
z=rAqfp9)KJLsgnAKuX%wi*D#wJ?AB~Gp0H6B}4LJFD10_yzQ~C+<yA?o=tfrV6N}k
z6knHUgtTU|V-opQpp~f#qda}uPgH-@*UVRJ273~H-VJBs(<QU-7$Jt-wer?0OF+5^
z11XDtA8l)bP|bM4s$@mEjNkC5PmyyGmf;LJaU;72M4JAPewoXv3rTZUN#R(g;F9;}
zAFDUe7&!Zco`X%fq>r|8G1n+mLb{mMY11`Tx@sQq`tT+^i5f_~Qtnpv@M0$(sdwbG
zH>W}V@CEPBmY53gp`;N5p7eN)Kc>Dsd@cJCdlR8n+G21{9vYp7$*QN_#1imK`l-yy
z<@Yy-kRYZA_RWh2DgsFI8GCesM<{~8-kVV(5`STxXwH$)-*J_tyTAF_kVBn?idsLx
zsx+YVTS%?Yf%czHKZCDsrA|#~{u~JY3dkwsRL!-}p9e?$mvpG;g_YJSM!4J-gL})o
z#P9{dZ;%SSjp;Nv!;M>iWBXJTQNPoht+T<)yV1=H!_6x)-I2JF<o-ZM(H1_+K_9v2
z>JU%d71jo|!yxDnu~0}3^rjP%_!`DHJu8UIXkyvw;jj12kpGrGyUCkYnTyTm3A21u
zFniMpdvd;1hh*ob=~_UbT<^QKNX^-A8g4YCxG56?6P7T2SEtfB-+H#)?dyM+3ntG&
zDrhI8`Q#6g`)t`pP*njSHAjH_`_s$f48V_FS<@g<K#fUvgrZH&J(^ZdN!xeAcvz&#
zbkqbBLt+2NgV&uf(OOOq&c4Cu=l^fX{P!}`9^>p*k)u3bO^Q6trh}+=37AdB)Zo9D
z)?iD%U|<87h}MEFpLGvz4Ad}WK{1<p|GoA+)H|{AeqeUSd!mn3*#EA>FTvy=J5g85
zS`pP4;Mox#1|Gbf;BM;=Z57cJF|a4>Qq|>ZjK}h9i#W)+``#I%?o_^@AZdT94$?{)
zD(*Oq;ZVqjgZpsD>A-58D6)pZTNTsqaFdI4x{|lDl+D36fw$^wCiE-teKwPWMJ7_H
zv{m!PiQ>gO^n#J6N?yyB<6rq1m{1DyFk8i9ZRSMs%uvUh-D!hcdb!=bp@eUl7PJEu
zS>B<u*6r1je>28k6V{|yE<^8|#ysCY|M0b<P+_hIE`MzlgXMOLt_~Kfn_(_YV1}!^
zEq~I&`|MquLP{~XmDtbdt~Enfp&U2Qnd*^fA3uGhv0vtva!ZN9R#vLJ=;0<Sln)WH
z{C)pml{;c|BSVIiHwFEGmp%MW{M1X{O=O!xjvxDa^Kue@^YIfh+5hqNmQitaUAky+
zg1fuByG!8~+#$FJ*Wd&w2=4Cg?iMKA-GaLXcRS?$y1&zXZuk9HV~^VV*P3h2N7l3C
zvJE~>Zn{LZOcqIyGH#Q||1?daK5CGe+Py>_Ykx9VtwIv-bbnK>1L-oS=0JypstSBh
z`NX%(Frp^eq-lSdK{k6Mv|njI`$N5R$!tX>tkw`gIbMsQ`}bT;4J4~}Ly2_z3)fFB
zF@-EV>z6#n#r}^<zz=8*zhuS9v!{~DmM8eg=<#xmK!26&9)JEJlDq_!yn}+{09$Y<
zwt&0U5O#)5sXJvyls94Ea99}-iRnoagxY9lwz1~*+FR{Iv$#)sE5<5AfuEQDKuLy}
z=&kG*!-g}B#;#;e)}^3lHG^vSVMf3>q^|MCVxd;j0$?ioi{$RAYYO8A_0s4+BtO>P
zAZV8aqWv&TWR5pp)+Z_~$2c<qaHsuLPou&s%F^oeAg#K0d62u4=oV9M`1VJI&kC2`
zV%CgI<Hg^r@iK8^V8+LdV)$Wd>DKEpFT0}GnSyd&B5X23NT%0ocTCuRPJT?&cl^Fe
z7a^~~S77fbJ*)_bOe?1*xJqG)RGJyQp0Ajge8Ru)|8+#N>%O1yoJ?`&e`+uU;66Zq
z@{BvurQ-V1&tuSeerO+UW=jSi7&W&da?{I)4}7z$MFe1vHGmQ6=54v$jm@9^^PwlK
z0Lfazv-ov8jne#Mnww=1#q7FUXXP+!u(w$IU77l&&-aOwGmWR}E-d{@0}D6$@){vF
z=2~J>+-XrG!FQ-V^)LL<<Y737MHyiXhO1CTwFvOxnJxmUgzBE)x>RX!%JX6RhUG5;
znRy4~*SPH>xXeo<8NF~nSPja<cM_S_&S8JtveIFZJfvp6q-*5_8p_n>gW7e4{+P&&
z2;E0b6AuQe*Wr{02f(P!_V$5i$&A}aiV<)jp5koJGLoQ-P(=Fn#2>X30$TUYeaZ#a
zDpyXNn_6Jc(8$5qlmfrLw3&9^BEZ9CH9B7rL6Vg%t4X6^jDx+xazuK!cUa)U7S;7u
z+=6jq?vJ0-PIvxLdH^`E3lV4jDmD?K7*#$dM@5l+G`)V0QQVoGyN8RZ9ypqmT}mbJ
z<So0aB^ecMtEQ0di@eo4qiifnK%(7@C)_6mUB*$WI3S_z9jYx%yK=90McBuj@J=h<
zA-qE*j8|w7ab9W^$6y)Z?oXe<4*gn3Gpgh}jwQ-3(<EFv0)CsPh8;MA#l1`-$vD3I
zG3!$y=K4cbmsf5{W?NbY&CZj_4L0Kl7U95(PaQ>Hg)<ycb3Naua5sjAeW_LIPq7L}
zkpSF+9b6{CF2wQKST<>R;99I>(^Io)3<R&U{mHv>&_>Dd$=W$UUi~l`Hij}r7dh3p
zpg7D?pk#|yb&{hLh+Sech9Z8W?h8MP0f66Y<oJQ^<KG!wOePeKeL+$`iHtkKXgp<=
ze0dsvc+TmI{?Pi`<rHI}>Sq{nz*2VLD&0H`Gv1$2bo%p)sCWLiQ9JBx58u>ZaGrG}
zPaQ&kJnsmE^f;X6Mh$kVtx?#N8dP(tFq~H~&yS3f2&&-_U^C5kldrnJW}nmPM(c%R
z!`?Ej(0b{tmad*JO#T$kU1Mo9Z@|lQJgxK8wDj2At7hJS4m@If*}{!BM~y_SKT=uB
ztVClAeXO!RpN7#A>O%EA$sG8PVeJ2DG*8%I_qq!2+kPv$_T&jzj`+Z?`rptV1goYH
zzK1AW8F$}Wp#OQL=Sb=!g$b|;WAF201g<k3{%4WdxLi*1pdhk`?p+m?ghiOGTIxtW
z^9XI^+gCr6H5<UK8q<jBdDA>`BVpm|G4G$XtJSs;CWequJE5QhT6_LtTs>HfTKGKp
zV1rJ@u!b_yh(pT8pHXzo<-S7}gUxA3{+~Ep6WyPMQW<2MMVO<|<2x8L2gs(#kWR$$
zbb+Il=J55ro#d!%y-_S4SbKZN+9udz_a*0vfY143UMV+dC|5KV^_|S}3GO@2ipG66
z1kg4?{QeYWF-pf9csJ=Ik6Xb*=cO#!H<cotbh^7aZ)zQHoUNa-WN-U#-U_vF4i-<+
z$pGXx)N`5eK*?)A)OQ10Y=Z8pZr)IuY#}*A6)j7%(gCKWtZ$dO#oCtCB<v3fbmt2d
z#k1XV_OHu%VHZ@6z%5_K`=d}utg3#byLW3A<XV^4ZR;^%MB?J1^AMKoI-;nVn}<Sx
zX#1WI<h2AwIHh!PnDZ^1H*-+2O4rcRV}eB$I!QvB8Yh+&EIDT6^R1x@Hj$#PNDjkE
zK`8pd?O6R0(bL{msKcv1`5!nYElIUOSu-o<?J2V8&2ZytU8w?lD&R?55;)<!J(&Qc
z-~{~JJ<<S7UK0lcAF&I475C-#!7$aN7P7(MWH;BVP{RimYQQ$J8|Hs8)8-x8lN#A$
zcSXRi+%s!*i4t4YIr!B6$S==IO=l!!mU+|Uj2GIo>@N~-Zfi^pVYfSQYMzO6Q&(mO
zf7icno@E8>0)4~4jk6SLmoNC7x9k<(1OTy8X~MMl<PKDyUAM-+NswrpmdJ;kRChRB
zO2GY)!FsZ#4~6-D9rt9@US+0&*Y(+m-h*qjAT7nX1+NS|@No9jnjK(L%;i{H1KQ?D
zV(|HK(&(vdgH`U{(2+8k<WYMJXlLQ2#I<26^?kpsrMs(-ys3*Sd_J~J_aHl{O{`rg
zD%UOh*6-XMzXJ;ZFcPAA_$Im5)A55#Kq<i8_@>$nltv-wW&In!{I`-pfP^I2av|}C
zJ9~17uqFA(3AZf1;L$6$`xH<-1kK<e1(g*Svg%)Hy}v3_khiTIHtO~dBC_}dKK@k0
zLs9cZlr$y|4MuG+F7dX8H7x|oLLjx|yqv&;uhVCnzII{rNbSt1_U0=;pLy9H|A5!m
zpNE}pP^1P7weC=`#tcD*`M|5CFMqHvC7A@ZP;myYT||~Dk*`yt<BJon#g%(S@{ulM
z?l?}IB}B}JulbLZVvxDVr=&iI)lIf@TjC+qdua^jWr+pPD3cXGA_9$FpIlc;+E@{g
zhznWZOYyGTz<!<UBggU2b@Gxt)t+tRkH%EXj>n(0Q$0bcg$P+>>MvzfoLAQUnBY(y
zn(Fyn8hf(jjCOlWXq;Zn$jz#l2<0utDqx&$1G+`c+uAlcRh47i7{|8!e3cKHPIoj^
zemCw4d6KMAs+-J9Bk+Xi*4}E(FdrSo1qM6irR9lI02KOoP#uXMakhQYJopNi$J#GZ
zV2i7nE*G*((@H`$BRvOX4}Ak3bJw><;Y2ww=Dkc%CGDg`EZhXWP%;&`nu%njhILu(
zXZl@UJ6)4v1Y|Z$M0gS(udo6<OWBurrTaE2q-TIIEs(BRM5NO)Q}Cxgf(CJ8%KAaq
zLN&Y<y$l9FD^+NJno*fk?;pZOI|y)JTou&mv)+E;!aC4n@F|mGY>Obos?|xG6H3%0
zlp7>&ys~}^kC=oq5}W_&wkFSVA?Lx>F91xg63B>}?UV2yOm?#RD3bsYICzI{swaZ%
z+=(rcU17|a0HhEi9RgG%iJPy}_}2V4w-pkN;YbMjN#dIfA|&eULen0|4Kf(%YwO3e
zMKb#`fj3s=>+0TVC|^0KPpk#Ho5FH$DNpvB3ym+KB1&^e(gvAlk1`GwAHjqG#?%<m
z*cRc-ZN1$L5sAzk9Ugzmw4T(WH(47tXC!KMWFPlG&H_!Hr%0R=5yoS7=m9>r-D?Ma
z6M^ZDd4ZkAxF&rynr*j3++PtZQad+YCx@+xRZ^_8-Zv34L(cgBp!|Pz2@yaKs^asb
zs;By@Kez9}>Zd%*G(^A-yd4Rj#m5g}4K5Jhb5U);VbJ2Te<i2^QEMq0t|_9lCgAWr
zs}4*TO#;duhq(a^{%IPee3Fp&FP9MTE2_t+N$N~vJ)Z|qDv;mvHuaOF^GvasN(zgK
zX|7hItVHW<|DTcp&lXE+*`t)4zdj-{)i0PRbj)1xOmu$?f(XFhW#IiG@jN0jA{&$Q
zjZ*I6t=>aU^Aggv+YC~^h)?r7H?dk@0%Z;NXrEX0WfQ+HC+dS&1JTpSvi-QVk>IC=
z66<#V5^@YM$MU#0C1GWN8ohxLGYS9LF^CHZyqW<sntaXD?eg9H0$A2`*>1E5x-WxS
z92AfaQLm>p1nc`In|>gDLhDp3N*bgvqF!;qT0h<~9A*vrc^Vw2<@=({hF>`-DZ(<J
zox`6?K*z)_fdHw&9+_Dzf`dWsAr$S!P(T3+Qiub6xfTj~b0(Hk3F{1{c2EtUWR{Qg
z&X%9crsXx#ws*50<F{KvhTJDVo7w2?u$-6LE6O{hQGk5Z4>P2=-dZL}sbjrT)nZYa
zRo!b&3jE4&eTWw@gNI=W2-5c!pb3KBK@*o)xB^W-Y+iGAq*&r`j(DY(WarfPf5}QA
z&Yq>98%GU#9MrwiZ3?WR53MG26kbucN}$@VR1#V<9`U4&PPCu<?s{cja8^8F{Aed-
z<SdWDT%VPac35LO+6Z{`>KX;jTuS+7F;S0beXF#MK`n?bGvQ<S`(Lmv7${t#7_%tY
zlqNlFe#i+$NXasn%z;b(g*@kZ)3Zp?L@QH;n&)7Tvy@2v|I7l2cVo#Y4xtD{5AD$o
z7AZ3>dwr$yy-?WxwDI#Gqv_z@qf?F_a)ta04X=K_F|V7=llw)+0p;u?PrT`cSmK4%
zwt)tKBqOU{19XE9n)j-TC{((&ENhV&TLi!#J}Pyz_#g5;eY*^&R%%@Mle{spSVGM+
z!H(HA?KQ-Vq1nxG|J`jca)+PDwY!LYwT`~m3Jt%XI-MOKf}8KZ&|v>=BMwv~o5JYO
ztT=-;t4p8qjs`!-t$kYjnkn4GC7_#aGng(i;|E1X`ayhI{HXzhzf%rd&K919&KqTF
z1cT334m<KI{dh%h8D-R6>K~@<YYn-BTH=NAAHi@q<w-n_X=z`uagKGjV%>N<IBfaf
z7&e7ExXaqzCP@${hp=2NLKGKyKdw%in>_9r^PJ`yzckzB?%N%41mOmFkC1j#jiB*X
z^m{~`;-47_m5{T6t3)^J8(N9v%(N0zUZRX*5H`@ggkN}LZO?m^{}J&^vQWPg1`bld
ze!ucx|9LP+kwh-uL$9y2r+{!Bpig_kr8_xmb>;rBiSNp)`?(!UdE%{O1q~1XVckp_
zg|WYUa68^NwFw8_qsYIjg>IiW4o8N;wTqtdg+&+?7Bz)u^1MX}7H`D%lX3cn>(+|P
z<8z_G&UbD@s39+ss}p%npSAYZ&oIhL#cZVCr_2B%dR09<I`gP+;Dg>~?R)Mh+~I=9
z-zj;zwJSZ!OP>3#PSiM8zOD~Dz9^2ht8uEoW5cG%<medWDkI<5`ipso$f8&bN$&}^
z{7%PrsRSND%nNOqc;v>J9V3YA%jPqwhDwYv$VILR2ni?ssm#YxEkaJ$vNKxy>Mh-z
z0+*RUI2tWnm!)W!G_5BNzR2n-f5sHi99{@v49prcKJsbH2vuVs^y66B4gbW(csaLl
z{+H4uNh_2r(UaleRuM62;3ivL)GV`Bypd|vJyTzI`+&VX=%}zXC@Tp*M8mAD&S+9N
zMwM(&WQ42CvMzb+AH?(uc}%rMigw6heZv;|d(VBaZvXjyh4#vNp2eUPh~K-t_b2da
zvkD^35*G3Oap?IJ3GE>KT5Iz7c#JR8Ng+$D)9LBbD^h9opuH=uxgVW3x-z1XOMhL(
z^(0Sgg^*AxN1fQ1AKNP*^=tlE0=3E0!QP@%SKzsgAM1V9OEcax_z9~)wTYMe?*EL?
zUxWT4bmTubrTn%%N%_iQuG?-#A2&KPGzJ_Nd2)en+&>uK!$wWp9^^1(Ql0Jz=sa$&
z4{obD*^cDt8416p-=Yh#*30+8AF;N9-|3&3Du@t_Shxb&jSYyW-&N?%*(8|+Ro3HT
z?2)F<TNI--S1Vbzly%!~(LWQkcUVJLbU@)n50nsj2v3Gv7#olR7ENQM4eT^B`4o{k
zWM+`gAW9Yk-FLO=7}7jZo+6#Fo7h9wL{QP^(BV4x(UnQrw3Zy42m<oWgS_N7aSq<;
zbvLKK_=pg$MV!z(8o=jaHK8OU!a;r+*{PhbiT6bpeKLj0;_;l3N2yT@NeiE(O+7gA
zZ0RcKJs<AW>Vdz=bt+P<5M+p^ZAVd`AmS`<z4cfS8>gF|D27OYe06Z8f9ALWi?yml
z8=q*&ph#3mW;r)%9I<}WQB<?P&_*^5yJNP+`w^?<KNq@(2PIazLi-kvYB|n7Cm4L8
zHimp9bty<!5Is!<TEi5U+ktWxVJQTElK_<(*BR*89++@tS1QAQXa3dNev8D>-n?|a
zTa7qP{v_UI6Ml+>f3U%F1YPh517Dq`)R6eDAsna(Q!8-{ROHL1k2>PRWyjG>9A<;;
zYPJ?WnF#dD=S3hMs=d;-{=I)$Yg$d%=>M?XmDv=YK4*~iJL!-FDU7bs5wVK8BbG1*
zC;^N*0*GhxDW2XPVep0z-aKmsTdUW)yiD_K;-g%)GW{iIv>^AxWG=R(Ijq#Vy!)3k
zqV|?4`;D_nz0h^>HEW~n^s2-+<t095B(B={Uz~9H?(TW+bfF>`0qB5n+hIVikgNRW
zazZ6$tyjvX%Wb{199x&x-KJpNgFZqhGsmKU=b2N-;kSv&lxb-W(aA(&lpuGl;_?H1
zPaeb?Z=qq3&PJKv^R?j_^-6(;;A=gmhhrS|a1I&uf0H);y{Au2!ah8=zSdVgHdUU!
z*A9m)UA((~P)^<2AG`S440Il+(uxR&Ult<kb5p+GTO1-l)A8eaI+IJN54uVsyO}hp
z+qM0_tZYm$52k$c&&Z$aG&GZ4Sl~VznzZ=tr*0|jFB|$lKf;_5Xwy(1Y6+Krj6PjL
zNkoTI?T^0djKG0X(>tA9cY6rFWPO{qb(QgU=wjRmSa}U{8iii=f{CxLW8We1m$=p|
zHgThZ8F?EwptAhu<oDMi{bel@m)D=z)gh1z%&TjN0J3}N;fE=nv)-<$w;3FB6^wxg
z#zLuhnwtX2M+|yCa5OAvU8yCZGh+|nC@UXY!KbRl(!3aECQ1G)y+-iKaoV~m@8EnN
zcD)S}mPr8HXnX<<8ABfd^GEE<7&%%NqeNqwPKxD168o69^rTe#uonYy^@`Kv3r|nT
z4Aj8(VQ~V|0AgTOsUGU>=F(BpmQP4RIL%2aJ)qI695P^z0Ek9{Kel?hAp*25&_7)b
zPN}Yl$hyqkNJ43;IOv2QBDKBBIDE;M;3|@N!AF2p<x0qIQLG7YSUY|k;$ynhyk=~z
z03IqC4~O^}D5#ZjPrkyyC@-ot5w&7ib(zf+r2;0e{9ezQAtt-ig|RAWPg|JM)2TPN
z)CO)oWDm1}IDMjDeMf03-dT2RYLWCr*=tFBw=H(`Mq}SxO*M>t6pl&c)(Hfe6C!sg
zD|FS&Zo@nWwi|033=2kO<noX1$C7<I^n%sKnBHh)dG`Lj>TA^5$;DlgJ4BQcFAA#S
zQE8O1;w4EqZOT2QHa-2hOC;kh_nkf%E;4YEj4Q>@#T`qh<`&ySYbnrdA5XB}%#12R
zgNsW34*OnH#KSwF<@;GpP`7iQQmIAu+{VDYM$(9i*}c@F{B4J#`-bn7b{a4Yw!IIa
zJ)YIS-U+j%rWDwgIzN&Reb=$~wy(7P%!la7w?p>aYxN;%tndA=OcFGh6weuc#)h3)
zb1j%IujJmE?b8eyzVXpRemnLuO12z_a|mMC<0Y?i3u5i>G;4g|aXx9%dDOpv!&mXm
z8iu5HC?FVgx(j(!vMj@cg}~?Su9ncGD-XMk;>u7}IUcBA2S=ZP2&kAwv8P?<QcAkE
z<j_f_Xt&rHNYcdi%3+NJCPwTC_sOggM?YOZ+F}MFY=@EptFM({cP!9_WtVCi<Q;x^
zZydx+b)iSWlCA|4;**fbaJ?4{7dgywUny-nhepRZU8$~_X8ZxlD>GsQM#>Pa<Bi}}
zTcUUMjZ5j~m7GxSmyjHhfo75B$xr8GDCGcN!GoV}l7wc|8>KAO+`ivV7_n%3S~a??
zLvT=5Z@xL=R?U_=fj_rFZee5(GJ=E9KG;=KM7}aOvNiqEnaN{Lw6hYlE{jI@Qu#K3
zfq_3Z)||X&>|GQ7VVV~mwprXE<*WXkg3I0Kx6-uL3s1qmF1?#pygdDV-s^}zUom=%
z<6{w#YdK7;V4l#mNlxoD3!A$ZIRz+=zgPh^r$+~vIFhDb!LI>!?1sNu(jumX89<NH
ziBA&VT_h;7&z<|GlwaA7IzFT+!PqxM$M7_gKjzbq)WzgNs)Z`&d6|KS>UZ&jniHpQ
zG|eg0>_mf)-_<lKD`vv>oD_m8%xYi6G8x%C8P?{N^J#S*wLldgC8A5_WiLZtzG(*D
zcmylKm%<H(H{vu+Gy%_9KGLo}#`lP~pPdW|ia8YXOz)Cu#xcHVv9&G{?=7V;vKvej
z$ON6m{-a#M2goOmojL!~hx!2Scn3C(N)I@c(tY6?X4xA=Q@Rgskq5Y#8a|KSiBqJ>
zTPTMRkSY{?@e%Ig<HsZ$p$C1yZ?TMHtAFLS&Hxd3&Ki?&OX1a&*Q?n(cm8)9UvUqL
zG;z(7bxR|MMdg0Cbp60XQpb603v9I4D8Gu`qQs!@@|m|n40NVp0l}6=3Ut#m#cbtH
zvFGvPV=N`nsj_9etnD1Bx|z*fNpAld#s5?E{=c1}8<wZnig>4cb}44*(9&Go*jIOg
z##~&N4lCM(3p8UOC7y~Hw!&s2ZrnBM_^-JJDuAt1>d&8o`dfy@h~0-@D)ES~OtgG*
zruo=%@|VqWqNZ9Rlgu6;{?2oaWI~V3bG6$vOE`Qnd~?rq>Qp_>MKq43)Y|@<d+E#~
zc1R`*D86W;fUh!Tr5x)??T0b8!EdBq2HHwQW<`=e3Y%Z8W~IC3jTUAl35oMMb*-k&
zUey0o3TrF;q+JY+yk&S`e<D3*#zY<_&Syi(cUd0RSP^Yy;k@#hL1R)xk$XgPwO8af
zre$87Wcpzty*5Ut1%v65*zZkeMI_LA!SE`Cau04!)6FZUW@3<#kEvYvRF4zRcWhZN
z^L+#@5*e6N!z|KC+MK)QbXTGs>5>j3mU1V3Ja^(m4iT%~NoDveFf+M8g*5$#@7|%!
zZREhH_2xnHsGT+aQ}x_@=tg(!A!eu3)jJdUN}u7wVK(J7*Ck-agdUislHGcOeiD|I
zY&iNTtGwL;&X#W8w=O4EHqc=JUP(%%VW3f*wK1}piZM?xySlrk%OX7ITRL_A5q1Af
ztec!Jg4Y@Cj3pZ>M^D78q!j8RB42V?_wung)p_nOavn*2scXo(p4kI{EPmqX*c)u(
zR7ULHANFHowsNq|dI_yH8+Xe9Y;ZV%ABVL5ltlq(6VTN(qG5{>pK7eT=LKPl&ApA;
zsWWFIPCTBUcIajI`&_$vUF8oB<;*&oo~^oZuCg8*vI$Y|d_zmA9hfMGV3W+iXV9`7
z)lWsTzx0^>C1E&znWoAsxQs^7)_w7SaLOrBoMX>3yl%GQKF$2_tC&K93#cL_-a%FA
zv3rYdOOumPh<g~==lKj=06IU`+YJ>s%%OUx^49mJZ@z6ua)A6BdDq;5p9TWy7U}D@
zRt^iC3DKNx`o$W=9M6Vz-7p=owYW!3*l;ihB^}=h*$Ta=2M#DNY)7cmQ<_8xa^Z;N
zitCL^TlxO~3gY3p`LVwG*4#wg_V-)K5~HZ?MG{6GsdC*=NEM|B7X{pl1u4&e)B<(@
z)=6%k$GtF(<aO<e?uH?ruu_~1M2+F42LZbRl`$~t0>TNR88Y)z^<-3Y9f(K8DuPBg
zl1sB^Vt6@HBm}mMA_%H5UFzCA5{l|bHOX>^QoSN-`5<=QxUz&BV1L<J(LegYu`dMx
zL>%q@qLUw-L%OMME`S=PUYIpV^<EONWKg^<LxOBKcaAx~M<rB2o<TM@0voZV`2H5~
zg95b;1BL5WIrv&d?c$*I_NG<!g}NRz^P%PilVmiZ%6+guFr>AEz2akDfp$qSv_r~i
zQipgoB{TCb_1>siHsc<zo>pPrP%lA&mON{}x_&^NQzSAvY5ZCj$9qt$B?R?4UD|y4
zCOQ^3UN+p<<l`2<GWru!d9I!o_{|^=z)R(h)bwIbypx?!6X$!y&U_M({f_V|H-YU2
zY7#mx8`M2a#;)8BBT|CkQ&m{J>yP+ET*q6R1Z$7Euw$9Bc-(B3_g3KS0{Q{DAG<QC
zk-Kj0qgACHxq?tC4fA?fMwor`Q!PObP5ZPLd`y@(Y9s71iK*!5yt4jC>>3lPb1dHm
za$q!wYH>=N-Uth=@?|%acv?nDJJ7R(;)n&g(!cmx!RTA{dmo@odh1eS-O7Wfv%52N
zHrLv=8cp$X`xE_Hfv~8MShN}mNjU=7_}8p2U(CqY^e-KWmD0kl7!=sh=^L|?%aH`r
zvseNwR8di=(Ja1Tutw%BWQamIiQ~q=Vg#fJ$)fAGd@UH;S=ayFauGvk0`TK}b}ha7
zN1jOiPkF*>SMvN1;9LzUU|r->mb`W>Ra<<g$E1x15%bJ@>(Q#SSN=YGew4rL7Hynm
zN5}RZFDA}x81}Eb!^Umi-E6BFYa~}$9)-FevkJA!#PL#E-+QgnMmVD!@cIV-MOFS+
z+tWt}2zZnHhYm%*kSIJ+e5jd&S`YY_BV!W<C{B{O3Fe@!XWnj_MYD@;){fx5L=O2@
z)-)n1;R0qBc1zObDf0zk)>%W8{<VS{gjW5e_m&#aLut$4y&m{QJAu;83T32;BGqK<
zyDg(Y0a=_SqF*pKPM&bOLLRCkK%zxy)(YD;Kz^3bs^0><?c*RznpO#VdzD({3oLW;
z8L;T0U0pLpYsBlx))NjOd!6wH0rW6?O5V2^KvRvbvj#1p=`>pxs7HEb5^q?)NFhK1
zywcyFvE*O{L4Ms+Xi)ge0g6@eC8Z{)IdF1vumUuJBwyy!y6Z{dAYE=FhRY;TG1a)4
zpmHbmNbE7C;e{#bL5^`C(Hjf^d1{Oy$2w8+N;oGRx!dYx+0G7Py5xfY)g>AlXBplj
z<}vFv5F{Y#S2<YZarx6!R@*kLy15+cH`*^1g|zTyJb|O8%Bj(#Gvdx$*~#Y*JEP;W
zs>IodBO4e6guu`B<_>+|-=s|_7S-V`WAQRqF*T42RN4$((N8*_iHBq4cHZbhSOpp5
zDN+I-cakoMtK@3`cvebFQQW8o-_nl8&fZzbxB3f^b)aYnk7k(|ddbHoRx9@rIhq!2
zWBeq3lh7@rMg@UU?L3-2!MR;!j%**;HUZ3-Q;U4+>ah{gMlyS<W0PmM^jXio{FWN1
z{Y$)86o2U13lC3$wt%@lT2?e6BFd5L<rIrGOcVv)cH9L5kSY;<W3>9v1Z?rnmNush
z=oIEeSxdbP0wJs7)>=nTOl`cYb$7*VA%4C5>a&oR)ZUYz7Rp0H>IAi1$Ay{zpupzb
zKv<(b2(z#12VtTs@NWYEK&q%8{FtROuyw0vcijCZ4}ls$z#5=5*CJG}eI>?;@_nh>
zC#|Gf-|IC`i`ebDkv?0OP%fKRiqr?#SwHoi159W{ewtNKAfpZrRj#=F_i=~YQ_8^q
zxZc4+Mf^xMVT)e_+H7Y3)BT4VK<@HG7km(9y&$?2bS&0u4~~`vJ_ac$N^z{OcmS)G
zlFwHGjJjfL)b4YpRVso*@!qBs1I1wlUF1uSlz*5cd=|(Zm?l--$~%B}<r+E4MQrm#
zh-4$Z7|OIU)4?4@ZYu1y2I??-T6Ul5NmC2jXk6f|J(|!DKBI7G=U<E>(FID4p-cqT
zR4uv@2qtPc0+REmjB0fRdPFhyT+7^C?`=K@Tr}rHA5b=x7&bxcM$c;8CO-BApK~HJ
z0esZTH}pcD5QZ<t{w$uL3!ygb7%O-;grmvz$6%5qjaZmsmGO>nr;#pWy}vTR`lMx;
zPC$Xtuj(;YS<Kme9?B(y-;=5fR^GO%BPf+}72CB|fq4)j-bd8j2eW&IF=p!XW@DTS
z^cq>tH|K1I{JoQ9L~D?uq@z^PYTVU!CL{NSfaq2a$DV6xCXxW4>vq-oZEn7;Ea>V&
z#Kv6i+)KPo(thA2=v~?9^1KuPh!-fREMC*Uu^XF>DLZJF?XU?LRFAF2$;OZTsKp+i
z5Cx6<v0pFw&bvONvG$VX{1~txD)Qr8Oh{oYk+$kWHT$h}o@wSAb%}asAQ2aFiw)Ai
zJ-_QoNIL2}U}A|C^YP?iCa=1B@WqT2HPFXz6xLa2T!eapy?OolGIi~I+~sk4@f-Tx
z6KCl|hQj*ZKt0w%Pv`Ar8lMIoGVLb0uA$8<DzJ~?pwVbD@M2g+yDXwv=+WgT?LlF0
zyTBg^D#O7W89pgPDwCtCsFx4Q6lAPngf+k`?h>PZ{H6URrT;R&mJGSz{NcA~jQMz7
zK!8?d*1Ry_-Dc^%Dx(=St6{H+(u?S6Qde!q{CD-k4(EH=iIm`XfZx1D_0GhHF7)66
zqmLbL=6`y<dN79IjI^Cl3_c{2FtrV=*>EOe@DjHRZOdvb(TKG?_`umgxL_!X{I7%s
zsYpi<9*ReQqCOtMFS>{Jh)8_}@0K*6HtuAI_2WEY>b-9IwlIEnWr7uSO=Bv1%e23C
z|4AT2afa0=sI*|j?as<pi=sJG$o%MEkoW%@Wd8#z<OriU{Fr?c>C{ipE$N9*Zu-UV
zE4eyVwHelj-GI2hw{q-buo|QFm+k%ATYiQ2fHkUi^mTqWw5z+R0Y6EllWDkT+0*Z}
zS|QJw$5$phm$*=R479BFJa>#6HpdQn*nyZylHLcAP7wvTe1|5>Cy6U$qd?k!1VAb`
z;*cSe=xcNQIM-a*DcFb!=@l<C%a>6HFpv5;3E1A%!(ocGHB-_^UU+{CW7`w3?63)y
zQhqJESs1j@F{AWC$SeIjD%ChZu9(^xb*M8C<{<2hHtEhOsVijAgzA^uSfH65@v8Ap
zenS*mCkV>IsV%~+b<!tdmD>tBkisit*7En7^SE6$M&VW0@_`$xmwiRh1{VevQaxZ`
z<&-8izw5;*2oVQ4wK@s42gTbNlHF*%(5)e`ei$^s?>xT^vWG+)?&&Mk)6Ir1KnT|~
z`Ic=?ClGOWOa2~{d;CRq?zEhdZe!mBxm<dzXJ0Q`JZz#mrWSB52~F!^HQA1=<N6CK
z7UV=SGHn`l!V8~f3%W#$HJax^zfO)C;pUqfTx~Aw>rnj0PLfu)cFa$iRPz_<q;s*}
zjbe-#A_USZ%oz3^z(mLI_pw~7mf>#5w1UzU?G*`y3y`hz1`qEjukHP*jT;XN^{>%Z
zBT6nJNjLBK-aThtYYdsYOOqy)8cwP+%KQB)+mR3a{HZt6uT+egL#8gWqdTGxot*FY
z44FyVL>aR$<9Qc?Y~mFAJUS>AIZF8c6e^6}pP0s0ICRG}N4<i~^QLp^H&W6p-RpQ!
zr^nJL1de4<ny{%BK2#PsmN9N#`Gc^O^<vdAORKPPD{DPRhK-XC2Fk8j0HFRGO+i;1
zC}qmuAJ&&kmides8|edt>A=yuO}+K*lzGYSef|A{yap+NKi|gL{V6xWl-|B|{X53^
z={V=)jyDfo@^$o<*OYa{ifPB$2UFz01{+b}k{S}2UB$O3|ED03Zg%L@d4U;}-Vmfs
z#<AagTHvqB3ki)1?kfl|h5fw<%dA%^d&-k_LSuUIyZYX`?<3D|kko;Sbjlh}As=)O
zyCh>4gzEvAK!|0Q{Om1^b&%-}mgT2=sI^|ZfqusBx5J$InHX~LrzMV*mL?|Qp{2lB
z@|o&`kg;UlXW;5?3Xd4(Sr_-OKH5FO<s#SG|Jmem!woL;x(2rR<_Niuxjf~}@#d3?
zNYl)K=VcH;zx9v60Zj|;N|KdG<FTarmBc8Q-+;`v0Jk#MW}A~~4Hx{`rPb0Lpjv1s
zPyoDANLz;&^G~K2-FneMs=TXnctirrJEBV7lg=|P%he$g-%XKMR+m0tz{yobZ4PG>
z68j~dR%T&Tuu~WoswRRaNY)@U5Vl{3n-KUw#;#4l&SwTT9Tf-;|0U004@0tiwveiJ
zMMIf!{ZlQKbpBBi;I^H&@`BtGAQ4t{lhXvR+jwe;ils7#LI|Bp7giT}n-Z?I$|He!
zapRe^lt5Nir;KPRNQj!SZkO{DqFV;jL=+(?ROrsPA-ljy%f-eLoh6lKmE!&gZC7(d
zcqZsh;NU_0eVbAw)%Z}Qo${5iK_n<<D2NKj-UZDH1y%yD9^NK9ul-Ax-g{ChEKu%u
z%32o)PInGfT^R_);|x-(P@HStD@eCG^=7C*p4SYwSz+^B{k0k0M+Z84j4i9TuIsiG
zT+^^^%yMpz<49z8TXM_0ubss3R&@C~u4qfjRQJ&T8?Bmt?iwEbjOBg2d;l%Eekqti
zc_V4IjDFb*$v92ut1mx%EA-C@_SJ?kM-j6r-jr%uwVW;o_M2Xn7ql~I*$Lq#@%>+k
zNhOXqnw*&t^j+OF&0D{ufK0P;ZoaOl1EX7y-0LOCv#axXw?BGr!#M3N>B#WN%`No8
zZz}hvC2SnlgMd0Qwwk~*%+D&tIsp18S(i8&R-8~0iVJM&(!3B&YtuBIU@5l0>Ka*E
zj&LdVamjSxp&!w!-^ja;d@7%9N4ZQ~D<AR>C6S19gWSyWlS<el1KwQMe3;knhcnEh
zus#f$Hd1&<$e>D$&x=L|uyg-{Z*c_>A^Evw|4B~zwAc=O5&s&-b;Ij-b*bqaev4sj
zZ{#Zlg>6}L0#k^T-K~GywT%a>h`x!?V8x&}2o2LI`)YM5OnegqC8b{ys26q6uMTA%
zL8a4P10X)k={h^1SLZw>Dt7l^y&>jQbs;@v?e@G`h7JAjR0d&woNQ_7WNy50>gf1b
z_<t7KV<WMBEUoiaK04BRkVv^@4s`G!|4BgnGWM5aK;^G{pGQ=m^c*nyZr1CAjW2=c
zdoPh4<K<z0P?Rn#rZ!mj(b{Z3-FoF^wI4z>8-*gTcrD@|9(?O^+pB-*`UPZe^ZImK
zU#$Ou`nu?KXyVY1HowOk?lCfcT_CZt8i^<|x*>)w4#phJ=FXp7uwq->Jx*Ew-{#VH
zvd#02ef^Vv%Qrl{kl-JNESio{^YF_+NBhoM5NQs1o4T&?m#~LYk~B-UTiplIbvY4D
zRW~`iY{f#>vC)w|e#j_tmP{w@rW;9<`#6&_F%+KxJ^Bz9lqfABIkxq?i=koiw8Kf`
z_-{X!i611_KRs)ps6?ygb&R@6?c|eW*T5w~s|B5c&`?q13e#gJ#X1xTn(JSGC2bX9
z$e^6wz>lC=y$zg?l8#JaA40%y;m-XovHqG&UY(ym4@h<5Dz{jGIKY_ZVV31EOQb=b
zMm=n&M_mW_1~5i0Gi9-%6CKRKGsFks6g2&`T0w&dEu+t&4r0fCB-J3KeA2ks2n$7g
z8W%U6eD?qY(}d>M%{#KQ1``(~n>FpnzZF}dj-%DiBm4-4#WiV`&(15-9*{z&)Q+0V
zchDa(yPmUy3&u4MIyIt%Sn3GtV<&&QPigbn>)^k5v_?^I1TP3bo!s9wc^le%+gt*H
z?benGXLBJ#ig~&GQr)1!Ba<A8Wt*;Iq$a5!@6SB!t4L}xHi(Bc8E(zQi9`MN84>14
zIpo@(5T$#dW6+a=pFE9fn#Fey9EDjT*Py_t8k%bwLoh;?x3lw<F@snzvF2MMBR3l1
z7XK09^6X5feYdudz<Vyf@$E!`5_7k+dV=LhygEboS;^8|iVj6oh2lYK-G@W%8>W<+
zr-HGs-yX*an*m*iF7g#Bm7(WJyc(B&Z;r*Fpv0i=lX^HXX)z}wEya!aMR#Xnmv-mI
zAr>S!|5^kD@~ICyWEyoJIDHm~n~C<$)>@0QT#=Z3F+aRQr6ESiFE84D+M(rR=)GTl
zUcBbKpRBtdf9}uJ6;b4#FRGs$t?yb_*=!5`4%;|m@D|eJqe`f@s$j4F=lIA5a{ovs
z@N?d<vr|buA+CW|w<a;*fC$6B<P4ZDFTIK_@Zr0HW(O|~uMNvY$ce?qm*a8WMma<R
zpRWL5vF8=fS^RkddF6G7+ppEylCW|KLbK|0Dj8H5kGvkMdll;Q6&<}Crc`Y1fltzQ
zNHmiCu}dwfPF!p*cSkn-zTsTftb6qQ^l@UPDgJwNI!N+zm`ZtrB+8W7N)w4lQ6WpQ
z!I}5p1@zxedQAz#Qrs%lTiBNi&^?HnXCl5v_wksEzu))W{)-WJ!4BYCw$;Ial_--`
zGv2~?VVR8$on`e#%N}djJ<+x;CM(0_Q6D+mmP)fsP>i)t_Sl)GfmNeG*`LFCv|8eK
zgmr-dqpJHs$ydZE6OLwsr_msJps%~!mO6-SJ0hQ7!3}5*D|dlDH8j=LMxS9WwVN}I
z<eYFfM_#Ab_55rjs)1XMWdt#)b_$mi;?jg`thj$xmP0f#ASd^=av$GE8M(lvSDesV
zkO>t^DZ{w=A=?&(%M4z9Z;7WO?bLM`ZlrMNGNLS>7Bf;?sx$hy{XY6p%|F+$E{=^W
z2$sU_OZlf?(Xt9(a(xJ=l>-@s*A)rv;{2G!ggLR|aHm<^f?BnWPwYE21oB05SZ^z;
zexL<E7T7~o)jv_Wo>1kPvZh4_zaC)`Q^rynAZj+(DUk~qXQ@QzO?$PfYbkLUprIU@
z!)BF-*01J+mP%kWcFY7HuC_!~=W|xO<D089x>2$`Z_+(=ajN=?F7nYir_Ww4LyEDr
zqT7#}3!U@eYJ==Gm61yX(l1(1hB?fjksvWQ{gSsWtUCOIdgxR@uw&6RwrFO28e<dn
z3*#xm9g?=aFR()lk3uEs4+c-TBf)Bwzjbixm3WLV3i!~kWux+<^9*GY2CD<6nY=WF
zgRUs|`#Dd1YE__4M{u3PKl?UU9T1xiC7Z64nuPdxClonrAvt`i0?>P{^9W4T8R6{#
zCP(ii!^F3VuRiWv9ccRI1Cfo5<v@Rx6&&j9li%e@r!z%(Z;M>5A;k#{IEj}i1FKzv
zjCfThO%2wf-{$&!Ejh0h`IuufKfRaK1E(}xr?7hO_><|NP++C%bco!B4L?_k?oeTl
z>M~o#ICTn2J|ts;g{f@bzN!A|C#l!WC3}QaJ!RGx(r4{pAleh6`8SxWs?b>^7oW7k
z-j4aD=M}l9Ma!1y7qcyhjbA>&oJ>_Pq!=w@WEss_WbrO2*VKtsBRURYKC#=@ZU<C0
zgkMEXJ{S6_2<ruV9Z78TQ4Vp%_7i%xSaX_>Q!NoE(wndAzy)?ZmOLP*z}$LzF>TLM
zl6Fnn&ZkV?ggn*r5eZv_RjUcUvotenA#(O<qMHr(pH3G_kCK`%9UVuMH!q@Iz!TRW
z+*$NLI+c+<B`wjM?X^pCZm}yRjE|0tp_(R3M1?V?9l~>nE3yMT`4kG+X#Q>(sNl45
zm!F{t&4tXf&mCVAWBY+h((KIhFbhj3KJh*7K3Az2@nwoM`n!KwNu@0EV(0IEM755)
zW((IoN1dS;8UE+Q%-xX(BpruEMPV&s5G1?{dMd8V1bGVb)00!#ucz}F$*kv>Ln@60
zFt57wUA6n=4`g-8_ATetr2yh2VNPVnJaA9Uoi-Ru&y@Z8k1(-gndCk>$%tN`@VoJ~
zXDDxGtaCv3#$Fli6Tz0U$0v)vzS8yKe8LQg9zg?e21EHMA#O2t@*+EyM3;iKYYRO5
zkPRBeLy578|Fz+hd|eF5xFH4CN&=9y#Vt>TBDNR(gS2~a14U|CJG8$3(hOFUQ{K+O
z^Yn+QjR6OT`koJWLKl*5<B_+fot;nio2jNZKkSKhTWyW5x%y7BP_pq@Polx;7R&&q
z-oBcjI2#NJl7`lzwx=)C8y@yFVX{UTJd{eK{mK$?TG~-`UVg0G`5|=A`|y>ICNLbj
zs|Gfj#bx%PVe-bn-3RX)fUXcfD46*cgj64nRJJh?m0}$Xg^yRp205+1FCc3~LZDK}
z3fw4QGy5!B$!Gf_Jt&u2ib^VTgYv39Z-a8Vq@FMVluXzAVo<UgS0Qyg4SiNNjsfxn
zn{=ZCq5lkp5=hg?9@@w#5uPVNd8eS4eC8X}8$kJKEA$=A8q+s(0M$9$3k;r(vm@f2
zawR$jft24sB*QH6#8H2@o3*@Rh9m)`+dretF*w$V3$MSe=l1JL%Qu^lw)@6L&LdN)
z8NdA^b1Z(*NHqRcyh{h;Bv|PwKiekj0q0pwG~vP0>8oFz@qyCEbw6ZH=~MxYOxh)J
zu}qufS|ZBXa_r@aQjxcoUW0YISUPz}_|DwUD5aFoGmvA=t8?e&Z!MqTyQ~X_uxtNB
zZr0D)QuMLqoqEEcPQMGjz>t-ucFv(yjqJ{zIoOvM^%HEJAi_?TiB+oCm_(lT)N$Dv
zM@;_IM(!B4?r5_?71GQcnJ`-FP4Ss=Kk>>lZu(Qz2)-Ayn3dN2xuY|?l8O#Pb5XB7
z&{D`x`W5cFzPq~opeFsmdsimU?gkVxIJNg@MB3SRAVTuw$OAmPq+Pe{{*0@BUe4Fi
zJJ<(8P{Jjv>vdnrc<c8hU$-IVPds&-XMLbBbRd{qa_WggX0ZK88QE^X41D>N8T0{l
zfCFCf$*Ad1T9Z&H9n%NhkZzvOS6h6B*+aXIBW}YKiTTWlBpKm4zvGS3P{ij<LmH@}
z=o!?3`3w~z4Qk_Ty|Kj9rF1mj&>)2P!WXi#-RXCqm+Jo0FZfeetZy*4YXljC9mX<Y
zVQ$8b?rmsl2NN`LEry5q7QYb}LJSemO>8|QySa==a>9|P(IB>I(%BAFf=^m~8;;kt
zNqE9mGROnbD2(yAh{r<-U!4}#YUEIphoU!q6TS(Jby-)sekr*6VXSZdV;Ag^YlJhM
zF{g#x-2UPKd*wuLJ}oGHL)BYIXxN>yH+msqN1$}JJGUC%gNEBZ?b*>M0<C4P-Tcp2
zfAW~pIL|NsA|vJ(R&j@e^xR=~2-zp$Skr+#21rlGORNemhIf)(P$T^FP_|{it}AfN
z>v-AzZf0QMzOm*roLSuuu&LD!Yw@8+U!HwWV3#v7@zg(WKnS|w<s6;e{#hcO0t1iO
z-!*)No7TO-d4={8KN*K7nB?S0EOPz2v7v1OvTak#!G`vu1i<WVWUqM;N4ehnzFTUz
z=3$xL4>jgzeTSS3_zw#VT{fF*nTVi^(?)Y1<?4I<pkYP&*D3=a#;4{)BA;E&5KC@9
z1EJz6qKU~jl8l8bhz8=<lA;}=Z|vhVTp_F2-thg93i`4`1T*~b`Q~zbY|vW?N?c)S
z<x7I2$=`a<32is1@5F@Vl?r4SrYHVI@clnp(g(NxPO^EPZLVMb?-A?+!$b`k8)1ue
z6d=4o>DG{RXud0S{2-2lX=_N6oTU+!k6GR0LX1}2|FmkMXi{yq#F3jxrO)6;a|Zss
zbERvly;F7_bd8PfV=?}1EuzI#g0UP8U|-HwukV?BQc7itbgeJQI2V0~|A@SN4>fJw
zc6Uj?i1HfQ^<I8nLG88n!f;-k9+nkrnDyd0)*adp6TZJmee0!8g*@te>Yz7^>cIxj
zp4a3kki-18?Rofq4_ew}84Jy>m6ICRlDTnjGklv$N(7NPx@O2b@exvIcqOh$O1UJ;
z&qYm%%V(0c$w!(!|F|_xl>h@7%+Tm0P)@wbbX4JC!ZNyn4S8r&@57i05%<@U(2tnT
zgackn_j{B|6{R}C5_5~N{QdK<b{m1JyF8u;1PnLFr7!956~!$wH?E)aXeZ6|a@2i{
z%gw=%3&Jf+TG?bvCKugjl0`tDXB-5bFi#a#LWZvA<QvlatJs_$Wi)2e`&**D5#Z&v
zC0|hkF2m)HMqF>Ic56;<L&>q4!3`|e$N${@5-n3B)e@FuPK8#yalXqaLDyQ^85G=q
zOM0CrU<aH}i7u3tOcvOQUbA`*zC@2v)6J(JyakzvM#FVk40`gv`z#P+qni1f^T~qR
z)TJVmjAfBeu(brKpUA(fbJnDE6-&vmt?E^D$OS6|aRevjEPN#DPX-xlHTl#Ukv+SQ
zsiGk7J*FL+)$4g<-v(LoSyf(qD^^ggFlae^?5CmDxOk7XYkkE&_=vCi$b;Z{o&GL-
zueSIqt=_5`!4rBIbQ84qF$mlFwK1#!$N1-n&%Ym&$sOKx?{hDtP8By;y8HGw6K~~g
zI10dE5Ek++H4mdE4zR21(y@Nq+bd&WZJ%`3alvKATpx#igw3q(xV20Bgg;{5i?ryN
zxR4aK#JsiDpzlRo{)KdniTsa4#Bd)b;VlZSvE#^W3P|JjB?kPLVilXAL8O*I!(O(H
z1JTd$M9QL*5C{&Z0Z=N5-i*FF@>q{6&_{=%yOnUd;U`_(n5j={4RaLZbabwFjDo&P
z>z_Q7IR$f*@g2ES=PTqf!cU9^a?6TyJCvlZU+lP(E}j{l0~)dzy7TP;t@%$dobF#c
zBF%lqkF{}{6-m;()N#k{_xKO&Gb{|3bViA$GSm3>K1t^EASB|(9iEIb=i-7dUP@C<
zikwmomg!2+CkPunAlECP3);)>g6)`{T@ivqpa)>|KY$e57KS31`}kNLK8&gSkRm*h
zdTW1Lr)e^;dwboWAx>+~zN=#4cQ<|V`N*}tCt{?e!?yp~?6R0?_x)FkI!EV%2lF5N
zeH9<xtk9L?v$l{dZF#jtrq`Z8t<*K|vk{BS-wT>EO??A+kIQ*Vl@wn7lU;Y-4evDX
zOCjk=Ti@$jwl%S_P+u3k3ogV@E(L~>kAha(-7g%<+}P}W{G(@7)$)8Ze7if)mJh;K
zLDV>1_a_Xong7@e|GmZh7nGNi{gFnX+FrHHghIo~<$*6)U`z7U)sTDga4-UW{*>{r
zBc=#gkYd|8zb8As>spPtXmctHb@z-he7M<Y(z-{p%ZR@UgjX#)@cNdlkrD5ggRBlS
zK^hyDFaXs%h&kf8+ow#ib2mDVe4I+4Zz=MW83`J}KI1lvgIzr00I)l(X>NFs)JM9L
zT=dfydaU(3R}}W~^EtnVFT^%^CfmCwg8G!T^YsRkHS_%4_E&z|FQyd2GsDd5nC62P
z8@}P25-0TK-3?DJw`B*nZ!!$aS4t-)J0<fl?u1jdUe=(l>$~l11Lk*|K0k)O-nN$&
ztP`VCHM6uKPTt?`N!xLFby?zgu0VIdoD8<;LPZaene#Y`Bu8d6x9C!k3LWFG1k*}B
zGSq9VKnPJj`FTEyP271L>*=AxB1o-}+N_7(xuLqOh9C19JFCB4w}6%!LP*ikbz_j&
zL5E};xoS|`bcE&y%#FGTO6nUCOCD_oUBhUU@Z*|ai5ql(k|UMarj3h;pbdNTxEcmh
z8Afy6wYew1V6R7|42#I?PrC}zIw25uk9Z-p%J|5;*=-nNE${RB=9A#%3QY*9MnDO<
z#&%S|WG<JFs!OB2hEygGVpO#|RaS8?h+m2>2XX-C@7~+dRE*3%XGG6Jf2)?Vhs*!Y
z7QJ3sk`BXog;Tu<=xzqglI4k;&#qOE*F$dZz~?t>=N*AZ=tghndse9MPxb{`VhS?b
zKB{OJibYt4u1xAs0C!#pttEYx`ioVxgy0@&6`~DSW@l&yXB3QnId0!@V`&?k&=Kbw
z-*)u-`t{i_f<8jjz2pS%`ZsKks&0o@w`LCtL~C}<Mhv9h@|uUJ6mLlUk`~!fqQ_=C
zG_<3|ldIcc7Q-}dA4mOKN6?m-tU3`Njod^B5Yg^+o%`<p7+8ck$!WYk76ZD`S05b?
zSNBzQ`RAnKY}xj{NZvR;hE57Eim6fT7TJ&2UfX624DQjw04l3AlOC{VNQv0XKW)vS
zHFr=PZ_UXq!#o;goZWT6hIwgdrYz+|Lzy|S178`W5Q6lR$-K!vyMe{_GedqAf-AEP
zPJvOV%5^srrq`?-x{q_DQlfBVwmU!Iu3ubI#k>B(VEC@gVy1g@m&jao*hWY!m(Gjv
zBf`aPHo~t@fS3;ZdV{ehtC_k~PHl0olVPQ)U@|6KSvV`;>8OvrD-FsfAQNsXEBS|(
zw!)=xM38UF_2v1}0aT@?%y0W!=O-y>kjk}iFB)l7S)%T@Y~)TI?srl>i{3J)@wZfi
z#yf%TfOoTHh!Si_wO6<mcQzi>q3P1GtM~K&!`4@Y#SwN*h5&;NZUY2&cefw`g1fuB
zySux)ySoPngFC?;LU4D7<@;rCcJKPG-+rF%KIc@`slvi$eIXebbh>`B-{ntbqKxH*
z<nORA8!95_H*URlgPNk`V+>BX$5^Dp1MPqElu`+~s<JyMTs5%8mO~V{7ZjPI4Vz~*
zuAh6sUMM5~iXN##>BO9lJh_lePVBsQ4wg}er3LuzhuyDad_JwG9m7}ojC#uK1u3?V
z*j^M-Px^dY6Mj`2S=1HT?2l${F-BOFH9Tn$Y7t01+$#ihM`BtOGCN;+q-}S=5Xj|q
z+gjMQ{ngnboH&pi5cZZU6(r}^IkRcg`^9o(hs$U`7RT6af_3in33i32`y`rgF93SW
z%8j%vnPQ=4M;Gp)83YMUhsdaOLjV6v5YQPa5#R|ko%+sS_qm92&Vm=NyJii^x2dwC
z&kwPJlH}E63yGCngf@P0OqI9t-I=tlfYz$a?Z0%KkT3xh4&GIV(mEz%h|eWIMcbTW
zP7GsK65j65mj7sPHT?`+p50PpI1V|pOBq0}Ih?>qrKD(5uX8Mym!6-!1@Pw5(C9Yd
z3C`bVhS!^aH;JiCSEYIV0`&74B;RaFQz*h=g+ZwB&g*f)Z-v+WBl;2Ji|S>Uq1Jku
zQb*!f&@RCcPeee~D=oZk!;FfKy4<hANBl8<9~N>D_C>zQH|L!R9pUj$xF20pZqs?b
zod`bD?)=k6?dvU$?Lu7bd&-#|a>-<nURK#0v0?BxYA^MljOjsr6nfZ?+6xzqo)~$0
zZYEK$d*Xzd;%riM-)=NQYOa@+*=3yDoF^E<O45yG*ecz_CDVxJF&C?hvNthN>ZF4M
zsMKCRR=Ut0D<iA?bnl^<;CKT~N%H+A!f9#7^mXbB{auz)AsQ@OzSgupFQgV|wk*J_
z*DRI*&k^7i=rmh^@aC8yL?mDP>o3RxzTZFee0YrRhCW<3ei3TEc=a2d$mV8sJNIxp
z*yMFO&%m#h_`REJC0?_!dHO3s6vu9bjVViYM2!rq{T8wUxqJMrbh~@fTbDtW;`SeO
z-0n+WR^U#53Sv7pfd1FKLMXN-x^*{+3?dreW(O(0*LRJwHkkq8aH-jMW-cVmG57Pb
zLU||=8Vopz7l)h|BEsU3<>U3cSxQ3#D)8HgE;Ekyu984Nea475+yFpY2rT`LR0W^X
z6Q}SjxlDKPidLn78nlDYCj=IOeLP!NVf*Gtb%Dl)f1hmF8L?{&o_t-jNlZb>l9Ceb
zAks8zQz(HJ*{Os3%xk%&*5!IjUe+(3S04HDr>y*(1MVo4il|aK<uk3@N3XXk!gU$b
zmb)LTGHAykKZ{>aH~f`sHc^xM)t8JS()&KT+vUg<GOpCnQhx}=9iv%X(M-rI0?~vv
zK~Trnuw4hURq?tVSC%mSM<MW)R0c1GHjJl=gGM)4>W3Vq`DgxdH?$R8O$0+LC(N5<
z9$$8wE&o__ygqN9ggq?SoYMxUW$&Ko2CkFg^X2+y`^#l!QGxyI%y_^b1{w#5m@RQT
z=ag9UkX3VTXTzpiWiC4n3ogQG`~|X~^8@#Aua#lEYlh$Q2{g={Vt4(BgwO*EC`Jt+
zQUFOp$j#vz4<&SSqGe-7GKpX=DIJ-_fOqS_4KHb)$9dg5EFXCruVWdXWHrB$FRo3r
z^}d(8RI*n%b;`?<$$O>S$VVv9{MnYpP*dqQ{pEqJW0N$|9<WJ5)1_!I$>v+fBaoD<
zt&KiBkCm%Mj>V!I*-%LZFAk|SjpVyr|ANJ>Oz4rz`gg+&#XDSSbcxCPr2^ehsPZEx
zkpdh<rCU5fLoio&^~g4)G*~^ryx%aW&d2R}(qJD^q{OyDf;`D49M~WXc&kJh4A}J;
ze6Q+bxuwEFD5q#{xFf?C!{;2xCxM=p5tDfbC>?5A{cS~I)*FhD#+$%|3HipK1wWlf
zBZq4UilzyIkcE^PeO&$Xls?EQ_QCSPXJhNJufP6%4dL6wI!QaM?O|F@H9Ay5;9nsv
z&snF%jPjOai!cAHoZaVzI0QDr<XfsxZG~PH@ELRL&~y^4=3s-iBFq#tMDN};(6!~t
zHA`?x8ZF38rBVVIb3d9`*1Dq}DYCnbAsmXswMLyH`bu@`zUwGP8{%>m<4UM%aA;i-
z`+$y6m%l*6bL+_Lc}A^s0pB)K%lT6g`xI3&tEYHDACR?yG(fNNO%qp3ZRE-p%UvX~
z$p#*z5<qJ>BODk<V5F<|VDEtV@1c+9s+Ca4%FVScS#Lp``6{T}Z&WUH3hebHH_6*%
zYT#Sj!58RBUoN;fn4@Vw^kpJ!X8uMON2qzzyYo5e=Wh?BPN5&E8!-yLQA0?Qwo`%v
zqORRC*$KCN`**kcUJ(dmRofpyuZ$6z%c+Tnmi#%Ja0@oV3Hu0~hfnuPG05kKMck@v
z?_6tVjka6T15-qQZJtc)mla~8C?iN(rl4>#et#HzpotD^Y_Rk>_XXBZsI!y?qyr3b
z!!3yZpNsAKrSJU)Tm6LYkaoDg`xePUQ9T;-Gy$#gpBcA=;|Aa7j?_3=!>2wn1F%1c
zGIiyuMWrE`5eL}p%h@k$DmN{=ey!TL)(*{LQjo!jwHAtEcaThsaQv&FZ8W~Ie@C1a
zzd)!yF!r?)7?^k^TzI-h=2E7d8*>#-KZ;Gzp(hFKgh7;@tc;)y!UAsN!&cDz4Iu$#
zRz`e95U(AmHU0IYV6suP5vGfC9@hiXcoA6TOeg^=*LaM^NC>$<jNBHU|Jow-4-cfP
zfX(w$eC)@rzLoMD?wCp>JGf^eCiyz+EaN1!k+Qb;L4}D*V&Nc4uNz9vLvl3%?fNT%
z31Ek~)Ud(guUJqJPbbh%hGMhPCL*8!N7_5n&|b4-j1G@1WsF2Cyr@(`YYpk{Q8_!t
zNuMooajfH*6vhA=ulrOCtCGA`wSe!B&6z{6OVL|tZf(rS3=<cAJMXd_m{wDGO&j9c
z2^LVclOZwfCO0<xt(KNm;Oaa;Un0GVUtUW7(Pl5_b6P3cgU)2}Jj^LUh}iGQeelzd
zm?}*=scut6{B53i{$-1P9a+$ITpUV+=27r=I)vm9Q8&Au{g*!SxXnC@2Xxd(DGtk7
z&D|H$QHt?zMNFIJSp({bBAj=zzD_JabMuIIv|R;Wj}(QX0?M^QUF4A2Ow@9Mh@B_n
zAQO?PC&vtGH_;H&Hh%|zOTILw9ZDn+C+-y~x!`a7ITZ`}jaG-Un|!U2OnJ>}ky!~z
z=5Fc;3mkG=sk|~VFtG*>nXH_9%B%T%a$znS4K~E^+mU-39SQAb#bN}zey^?XaTf`4
zXs_uCVmAsl*?AkTUz11G(`_RVB&}!tUhwI(;HYz>y3n=Jim)7C0$m$){}qS*R_`T-
zmQ(a2K<-o#T7;_0?gL>oMEFrBDZYU7mQSPbY6ph{NiTN77$=Mw6k6__$OHXqj!`#v
zMjh+Kxym`C8;?+~-#*<dYRSkr7Fdvaw9Qa4rDptqx<Wa->dGi^uWM~mkv8+U<i&U)
z%iO3F+mT!QL8g$TNgPVvl(;7N>NDR$<AMfp7E7(TDt=R{G<I2c@poMP?<zfTQ00W%
zdfmRnt)7r`;gMxhpWqLCtE70_pVX^mhyE23r>zk4ZJVd^Dv$9H760o6P;L{n$-u46
z*s4^5;5o9ePT>BivTpZBy73Oqy)`+|6)!U1ro*DM1iI^sTj~Pp3KVjgsT8}`k9`io
zvVI#>c0gV1kT?OOLUR(nC|NJDY__e?2BIJ6n3J4^s^-zJfdLaD#zFox^_?iJgJ7_W
z>+jZ;#6sp@@<${;m;{=Ew7>izS0cRy7ddFRR4E!|;uZBq6y>M0l$!U(JOi8D5C`PE
zztc&mpw0`5++B)~iI@GIn%ylh&(#>@UOnji9<MacoxM<23(!L;48F-?@P|H#`}|~T
zGmSI&S-BJlq5VrLxuTshknSUD?x@xX!pq!?=V6F9<3#OUa~cog7jNB{Y3#wj(9bf&
zejxiI*wu{@F4R|_jzo~p1)F{CXkyOnF+**NWkG|B{Se7b%nv`hYUPyAu;5PWF8gVU
zPQhK$C%FEq4U(%{vUeuS@ny!a0{@Z%?P_7VytjtmUO1Q$^k3f)?5fZ3Sqw4e`iN(~
ze=8|h4!D|fo)_2jmM7&HyIW~NI=zfo07bWXwrJq=y|kSbcrJ}0#TmlD{yq4R?9j1Y
zINWL;v~`8uP;G)U=-43P9M|U=84mL_g{ckW$|_ETxxRCOi3$2<;k`<NGIZ!w5TIe$
zec|TusADqPLEiqsF3Pn0tp(>{+T*!ShhSvbGuTsJ#<E<Vb>GdQ%phiv_fvOiI^i`e
z6*k<H;IWT50#p3aO+z^3;^jLo1e#4c3b#1g=@<0BRejd<+Y6*jY!|%wQE+YX=q!?-
zNv3AJ^ivcHk@PmpJ`x+>=3#VaU7hE@f9YJ_?3a3mC_V)^=?{p3hwzi;J+5`=z*Bu=
zoKuNFq1!ukfPgmPn14~o4$!r|kEnWq=`yzxXP(GM?O;hC{5}i~PY=oKssYF?`^;kz
z1LuPV*=DE^C**;JErGM;BrmY!)PNzp(QW)g5XcMz@+KlyND!2H2HJG@N;cBh8D#Qh
zD#PJTNCzNA0??2o=^G?7s{@WgQ7z0fbAJ;(4}t)*4uP}wcgS%@C^7Ruxp$mNqUSqz
z0G4VROE`rkVAhTeoD?_sKUE5>0*QAg+4Kk1WeMV7*d0rj*QW1@qTgTX+ge+gXf1pi
zSMw3P!X6FTB6gF3xn^+4;j5!cw8Pu`gvRmj0?~(&n@O(dH5|=D>#$Z0&j3VX_?E2p
zL^W4715bTH3DpWi4BCaNQ&BuNHWjO;qNS$Ul;vtVceVwq0mQEW<xl_gZ+zBr$jiv?
z!d#GkkXP6t`x4?#GBLj`FfvxH-NOsdBI8(DUlL@0XZNo@p*o!CsfR%Oqj{PwZ?QDg
z6Q>?|kS?=st2pNY8Rj#QI0}`Um^G_v8`qq>(<|cErjFKb!QHlXYrY;N`)iZ_0gzW+
zx2kGLy;#qJvbxn_i#&0Pxe_Q$-yPlLo+d5bBkfrM2QIrc8MOPUdSTGtlQK4*oflsw
z?>XS6-OsUH@Y0qp$-{|6FN-C{G0#6dOWhGBpog^IZgs-dUws05DH7{Ug^MBvNd~+W
z@DeIk^lcGs;!#(cjmMCFqf-;9y=c~=l{C(y`dAatb=K)H_1aweL4J7Ku6c)TohUs`
z7n%VsgS9RK*ASl*WKjh$f7vIV9l8r)*`UN|*jp6DVske!?UaXQR0ZzUm;4fXx=8n?
z7$vtVTy%!hiES;O<2FV4b2YEnlK5FV$K0J!pz1iPtUCe%D%5Mo6|exMtBj}#Pz>eI
zf|zJPId-phjRTtWY5wG&LTC-j&Ep4wp%pBf07NB~ZaCc1F)pMZEd`cZ9>EdlO!cmh
zEo4})&i-BiAMz#jTDRP{)#RXFnKKL(EFium#NU|wqDj=yY04;$ab&huV-Bl5&EL6p
z_t6*S_3^U5OmkoaR>+c$Q5I*uy`|8JbkD2c<>oRv&KC7CL4W~19*)ZA)3u=m6*{V~
zGVqTIu%Z8y5EAmtYV#a6j6|1CxE<fkguu=lLc88S`YzdDVxP5BNVLDxx^xs;I<gy(
z2I_Q*-dtQoW>7CmaRh55f5d4Q(rzdZ?uxMWr|$kfVb^V~-@w2lIma3&K1~wH@IRsX
z>6!p;_={`Da;U7J6jcp(0w=4~1_ZGtnTNLTJt?f*!G@aTNcMsRo_BHgV+=B~tDoM~
zI%R6nMxRVWYEUlqk2$z*^@T4Kf>DYsJ&l=W|M`vCRgv8isdhdE9zrdxxyp-{&3+ke
zbLfSmI*-U(E@_l560Be^UH|*o9yya#gr;Y<?k7-MQVMj;t27ztb+gMbK|@p!oD-q6
zh-Pg~iArit6J9zHYYM7_0!pw0CGmlhGA%;X!h@kDT`Cz@TB3A%GWo|Jzs=<t0W};8
zKi_-BQ+_sQL~dBl_A|lEg=K;*X%t5*L;9Yuph}^Jc6+S7!9I-?=tAbSUcI!K<nT`<
zgC)3(y&Xn-wpKGUA$0|v;rtqIZ0y{f3=!pgzda+vA`J30NvsA{(Z7X#b5yB`&NQdA
zwAsZU>uC461yQeX$@ZnR>wN7+H^n}uHH0h-0h^M<6dlZ^dmfjs^s`OsVt~5*5ocZ^
zXe$R*&bHceaWgr`DLR$E5hHFr9Ku}&iw|eA0;iOEdY<F9E?u-W)n8@#WoF<wP@f_`
z**mlQJ}z>{!@zIBl?VP1NeWZsU2OAhY1SKNIr&|YN@RYI&YQt{M05piH-sBnBT>pc
zX>EPPTRpsIc3-crh1K$&e<e07?^Fj6w6dRDi@J5rY3u#C4IXoo`ZlDZ*RX7qfGXC7
z;6o#X^2XX5_tHV!IYU>p#6F8-Imr}-l4k|Qzngl(NVessbo<?nzyI?;ZBNA>Zy0hI
z37J|I)7%paxeI1{Ry4?7NQuoO)&FkExe%Q70Oyu|<{nEHLwQ3D`!rgZDS8Qs5mpvy
z*#q0CbH{a}W}*x)*0HI-6a#LT*7-L}L52F_2C^b*R^h=j_dTt$C^z>G+F`!IUxRuY
zT3TltU2Md2;cS=dT7<J<7`UOg@pVS8r21o3S~GMb)G4rP%*|wZXvJA$e!QGHAP>3>
zH#kZw+5+TdgoPFRav8u+z_8#?4WDnpj8VJ$A*a?U{4AtxK50ydXR8QpkwanRsW@jC
zX(YQJjU@lyil7Mj!2#^X<^%V|{-DR1G_!xMG~M<gOA9~_nlhYMoMatXmYVaO{(k)+
zB!7?!Ec{QAQVncS<0Ek_pJptCU>3{4eUzaHIy^H2<&MMz2Cxab@x~EG4YJ9n0Vv|k
z^m97}ix?sR?gqPvZ)~uZE{s8kKga!zkVeh(YcGWEZ@mGtRntsNgo0rZL32bsFq9}(
zhSHE>@{kT;m+B27LXajxC^4NuI~B!F*Z`m~7E|~+-mt%czrMTUtRX#A5F#u{*FDP=
z<qkGc?KwjYq|1X9_1_a2k{jZwgVo@%CvHbfWa()(5+wNw<;>Cose5?ien0ETFqi6Q
zf-g9<lz3IL>r7SV_CUIdpg)SFf-QlBd54~9=M4_a*<5aoU&#Q)u=Q{<-ij@PreVDQ
z@@EYOGU+Y2x=27|?#?i7gy2h@8`I-1^sM7S+bArj$;*b8wm^{fil}8g);}921fT9c
zU1~|(<j?|tFR3@^gN^)x@5!AslVcOp1=Bf}{e9yG_9)l?Fn_$}H#wb+Wal9V1SG`D
zg<mW_9?u<>hc#mL&xj3oxnpRNo4R=Eq0%QZV)2i^<%bF*aMxwhn(8@X_b9v0E&eJs
zs3)b()fbsVNhuF>;q^rEIXr(!B7g<UyoJ4f;0dv*y@tF)6^y#-yY`aMxvxPS<Yda+
zCV1z+@WV!(l|JFkPG?FAoODgO(c=BfAuqAAJd$`ir<fxs(1$hL71hSSvsof9#CX!W
zCD5WVXT(fG-;%uGS;?9m3D@(b6&Mdv`uS@)wIdXP{MZt?4Dn}agl^$9-Mu-Fv^F_$
z4HB}*ocS3NZsxIh2Z3MLM#*QMz0O^Tp~!IWQQHK=kJ}y{(5p^hXldYmmZWwEQhu{j
zmPrYj;dG8k=%LJu9(Z?DX<tVL_#SkP_zU56^qD_8Uo@`EFT?O))Fz2Nv~-^9xRMu+
z1GZz?#gP_Ri}2*<VYU&yTlFE;ZNWFI`0}_exZ@ewq%`$bf{d_p-y`cthlf==r>7A~
z1lk;PwBJhp2w_HRjdi|RFhe6y9ZVa1qDK9zfV0=^Wq;)j`t!%?+9c`$pkg4gjo|uL
zu(|tr)=F&!<wz%iy(Cd#=XyAfN>E6vuUqTFKoS%%)S|@|7LNp!+^NKNyN#wz3kS*W
z6#c?J1oi>S;8hl@`XaL`zYbiwA!;Z!SzQ(V&^koAa0`aeK;{+IvE;8%R5@=a4B_hy
zeI6eFD>Ce<RlpezKO!P_HVsARg%zW8*-qDJg4RE0)A$T;kocNYH*M`bTl0Pw*X5mV
z-ya6~iP|Ay9SvOLJBLOziHcYYrqFm)?cLD3g<)KI+*f@?)i{^zvkR{b&ie`o-K<@m
zc_DNyrN$CvRIhq4mp;sJo3Q<PtHwQf)-1{xx#HROo6D>R!+WD2vSIfote|55#KVtz
zDy5SR9Qs_+8dIqfg<vlDhgYY7cA|U`IZCw;#VHEGhy>@E=r5^D?{kqTzhQ<pO}s;V
z`B#EPj*7z+^3nU&qL|#R`Ro?QtO~YSDSVT)N_QOo-Ue>Wb39h;JB*01$r94bZ)Vt7
zlNg9;vM7c^TLtNiOh#tEQqkGi*wxsyPkGZ|JxNq9o2!O_?ss1~zE9#N7R}O`vpS&X
zQW{Tx&7H9&uKV^m3*EP0h7sR8Y<9Y}=c@NDS?T!{gmg=lvIW&gBo|IO6n%>o8Ajq3
z5oOdqHm%D|j5)w~VfQgF8Xt7*(Ze;baf_M^JydO*nv@qEM&2sCNr)AD7tY&$>02X>
zMNcb3msuD-cy)#l&{uF^C?MV1I#}a$DoTmHGQu_2MeN&Fc6FJJYtUj;MJU@AVR#Dl
za^N^;Jtn=%TX2tCl3Gv`h_*BBT1kz$=J+1ae|X21tRz{r9vK>J_!O1{xYW=|`_4b@
z6{ZtZ#8%$K!(OtbqPx%YMlE1@_vaC1y*hDuONhSv)QOMNAnFB&UXJ3JW?CsIness-
z%7ivTPU*_~P;<iz>Rc_RgexN--3$d^3VTJ$5>v7e7rCs(y`|{Vo~Em7HCvtUe}OE2
z_RDw^lo+4TfQPUrVDvQQCsy?<P3U%x8wlza80+g^;0Y}Q-wgr_3)EP{g#jhGqI9lY
zR@>}@eO`O|Lp$q#)Yp3Aa3$7;{Obx_q7MtC^-=Z&#{4%Fzk+#pb=+9PYiW)2eFm7i
zU)AaUb5q|(-0+v3yuUjxp;B~^xv{Ox-L3pD--58Ia0mo7C(`=DQ%|nY>oLHFZEvYN
zS88-d`u$9>|5ayD=DvT{i!1<5$rdz}Zn;I)CRtHi?F>(gC9u`9m7Bp$#koTy<)+py
z!yOrcZPq-6O1?JU{Ocrh92Z@5R_%rJ+0IYw8Je4#+ds30%VSgLY}V$_HE$<mpvW;r
z%{b@+^Jdwz#f>3U88xdrDp(cuUWq^H?NsoGlK|eo$No8KI2t^gkDnj1gl|X)b#V|3
z$FCvZP$pcJ85U;jjJ@Q%r`i~b!B8DaWSqdQ*)lEQ=qa8K62UL5^8qOL26x|O{HON3
zhziBHknU;}^4TbS`&c=QKG8J(c(=1<638>mh`HUxA9eR!o#vN=TzRNtrGAMb&P}?@
z`&sbPz=)SAZ>lfICHq5gZHB}=qEhpC19}X0sCtu6XaJ0ul8`@i-kwOUiw<(R@1R?V
zxX2Cn6I=IkE^hR@TUWJpAH-XaiX6xNagB8UKZ03;j-EzV;=F^C0c$e`EIMkllWaZf
zZO2=#rR1I(Iy{%d=1W^HvSmhI*H}zJs^e}u1`g|CMqAK%#aoR+DqQr6c)%)yq_?>D
zwxBTKyXkI9W}()te`wSt^+O_ML#ms#;ja{)HW9To){KDwSDxQuV<jEAe}hLwZ6-_p
zT1I%k@o%6hChau6{D%dB)#G5YZ@wI0G^63Le$lSZcMRTZ`~!`F?^ubU!jj*xEe>5N
z+4o)XbZBJ3p4B_}CQuuB;RhwIJ5tGwfp_qUzbuFMkdwvjveoA-OSiqU4_6r$X}ggO
zAwqgJae8t?L@}-s5ssl}t)@kQyv2Gk%gMsh+mKCfxaXc0HB|MmV2c5d^G}R6j}AR#
z`nOVrz*?!$p`}l&EV`)ps4It10BT6(ME;IZnE`@Z3Z>~9m2k^49F0yM33IJHr`r+@
z=-(KdOLP_a%)lWo`>;1vAfPU-fJc(~XpQBP7swHp^EA&@C^e8|CYupb(|b7bz{|Kt
z-_M!XWr4!C`bYUCb||>&Q94DkQ2bV_Q~ft-^W;)C1Il2(TnuG;82UpJLnS?V{_7W)
zz+Ge^;kP}G&VkluQS1sOW%&M>>x_lWVS1I-n&m#D#%_b%NyvCtrb##0BKgwEd}Eng
z(j_m|?*2ECnCWvj;wd>Fj!O*)xmX^JCln5e&Sy^@o3?gI=pdBz3{{-u${kMi_b<>a
zLn*KvGtww3k<(?9w~|T>4SVgYvu-f`=E_1^COU+FysI|xKF%>fz$o!0IZ*gXg6ga!
zW=(484ehCFgN6F7<E6bNZ~Ebr%;d_+4x8j}1|&ckjijlMwL-1O`byE}%!B&j&o|bO
zR1%Qlc+L;4ldino^TAvJrMozUp>6Ksl%7#(-H;Qt7FQi<9~eHqo4>}Te2Ui|>(&&E
zeK$?dB2(3A`s$S_9>!m5X?+SoboN_A3yZ>p15POI{ojC^3k%OZ#505?Wo~^pqEH=u
z>EHF<?N3vv-F7_&FW~pY(1{mLfxYXM*#j;XdGnmnKYANH?jdZ$5UF~_Kt-PH@+O=S
zd+@zQ-U@G6&$NYML&&28cN7A6FP;MDO(eaCKluc++blyot<ave0DOF%hbjypzv3ga
zs{nsKkJY+bW5_FfqouU|FAFtT2r{>h4cX~{OcBUY7AaHSTP@78U1_Ip1r9E$kt(jx
ztEfd6VSvBZg_3OUx~^rIc>>KxHbY@mLB&>{9s4u^H&k3Xd0STIzaHx@cDv<i9(U@z
zWIT?}kB74uY)1)K$)$YX-wG51f0nR?9N^Dxx~Q*)H(|^p<_8GpJ(Vr-AvXx%Gre{v
z6ua&(K77Mq<+2HmH6r;B192%>5<oNPFaGC)rCVt_1Kjs41rcOEW)GN!)Bsj9A{wB)
z(f0Mhj9KC223Q{P09*y|M=p;M<6`82TvQQeDEvH0s0^4=oNa*m3#{+AU0t9_pJw&9
zk1E{5)P5pQfxd8pp8CiD{IO7``r6b%f02?FV7J<oS)aia6=0vA1?QtmGbYDP8uBCS
zvm|Wson4$)V{?bPy|xOG@QAbUz5kBQp#$mGyc41NSbP?T|J&_zB62k-o&8}V!|c*O
z#?FDyIq9m@n*mphJIwB7@|tyQ0_-g7c{SAab1WXspTHb{3a`><2)tw$6O8ztzBV-a
z1}UM(@1Za%(cXzvHRRWfO$>Qbyf3A1oG*_uv87mTNWh3F?WFzO+%ZSzaL=x-kB{Ve
zo%<D?JGZ(<(4_yEG&hbob@jN@z^%@oZr0x+Q6;GWcqd?;T)(Iy?G44W>nwiwv|h7u
z%b4NP?R-k{!Z#4fEf!`^jhJ>U)1rgTinx|MY?7=gBBx1SyJgb9UnZG+&Y{$6ylgk6
z=%@;92v}&I*fKPDl6J5Z+uvZiOibGxBN&@*fqZTo?@L8>9*X2hMC64*K6ge3M$K<$
zZ@QyOp-_`R-(6vu;KY_Q^4aRdHOns$kd=pW96mJ({&<ri@2&1^yXFC<uF?W@lq8%x
zJfGa=uDrGiv23*8*=LDZ4f>4ChnerSeFFNwcJv6o6+q|iSZQtS7U(8oLum?ZRKoDx
zcYKoQGYN-ZpJpxyaKdg_dX+piceU<9e1_h@C^G1z@6MCf?%o(5GLN7)OJtjqUw1Uq
z`K^h$ojz!{i<^Axe|OeLp11I?N<Ec!pYtOk{4`>=)EIufRSCr>IoDkTNIZpd`OaEB
z#{u&5PQEZGx`(Db#UjoZCf0Hg$rQJ?Lf+yOvdxLyhU%_%=7n2fw6i-Fyn^}Z+EM^K
z1FE~@5yJ_Tr@a_nKp*IRKQ9zO8wvKv%tEM5_<|H1!{&4I0mQW>`}gK}QNcyZEM>?b
zzNMJNZ;bM-E!Hz>#^}MwuR$^nwX5I*xKF1Dgu=XW1WY`5U79JBiq#vg4&g~>yRIQn
z=ARy8guJcgwBdkhM?LW;^X8vJSyLpMw%}KjN>Nujeejh|<(OF7v^>3YHkheunw_2~
zm7eF&z1U|4@}mcQdXyYlAM$h20hV->(mU0>fnQiExa*&-77q142kUNWr*b_CO@afY
z<n4pW0WEED_TA}<+bD~oxS)V~yh=uqJ)NDnOEkFYvFA3(-1aX(r=^yQ9iC>I{3$3w
zy_!Nt$RK>B5G&lLnBrI-rH0&Pf3}5wqhP#a&4SbE0q)T03+dT8M?XXlDsmb4mF55e
z&b%Z2$Ja6=OPt#^+FSNOa$IrB<tmI&D;{BETg!;6>WS{h*Nb7JZcGQy)PDCGxx@-s
zRud_lrFNPuGR)5RMr*IE{GA4sTlvw@Ll%cp*`|TFpRFkEvL=Cl-%(`+a9~@jT!w6&
zl1Y!bpUi!d0}Qav2s+IS+MCmAG1l}+PD(@?NZL#fkLAA_RG#MZ9%@FvMXP4$6vCQZ
z5yw)~M^}?gk@5C04xf-}hlWqs(NMD0+Rlc!11gG~WdwQPLnp5qV_&JNB~J#hM`#t%
zPfaoo8P9uv)WyY+Xh-4K#eUlmu@~&3e=9*+*Mk9>5E0pLI6eAM9<e&$?xL22r(VzG
zM9Y<|07D!WbZI|Wz73~1`0PG)kVRYifJz?0R4gj3iotOB6UCLjn+y#XtKvHKy};d^
zFEn>;Kk_tg@%R}p_I|csWQ;zit^0}MoGHK>Va$I|QLpL|tzHJ<^2g`BP}cSsO4cCn
zX@9Yk%x(0}q}vG2)2kly8~&X%93OHWVXbWzqEb8I?zq!%NBYMk)&Ic8lD}p+r(AC`
z?KK;(=#vyE0sI#xynaxTbULxS{GQlH$%}rK8zqOf_R0Hd#bW-F-$HHd#pOcEwY&Vr
zNrv+4nx$i^h9Ka9MXe&s#XEX4SbfUZTt6SL?O5P-DuRNY*hLwP*_`hx=ho--g7a4m
z?8jSd@FeoNrdH`@dt_95bj`{~+3$38Db%w1z%H}K=hD5DnQN%N?Ra}a$TR8su>S@z
z4aa-9R+eVA9YnBJASFJ5u*bvhp4`7U`!{e?@O=En^^CwW?ghlX{zrA&1D8~?@wPch
zqHBO6=78;hGi8q7mukOBytoZ7(mRD>ju`v%LH`G>vPASI0l5=;B=1=4ZH@vb@-#w+
z<FJy_F_K5VvBvEDq(2_=H-Z6QJ{G8B8=~awgG4D|#i%&3INAWOi9)yI(t!PyDHFB_
z1*5xRwi!0W0S&;uP+%WShZ=J?57xcWB@aD#ci8a1${={*dq48SJkm7l3rpTkOO2gf
zXqv6{L|6%?o>2(#k~j+!1PU4GFBBZ0EH(gUgeX(qK$`7;SR+wfbqkIo`pmW&`jU+%
zQsiT|^y1{@KRU{r$|~?8KOE+VHSkFHkT4H)8lpFwxNzg{R$>H?+gfv~MLT2>U}^+v
zN%?v@j_JDtZy>kBlv3y$5%7W3&Ka9@{Z7AbDKtvGrbxz|tR|<)hWzkn;HJj6SU(8|
zOKIbBe;u}&-LekR3{(AjDp_n!1H)h1${CJukiyTB|75QI2{+UMY!jrIn03r0e-NkR
z&_MUBUv`Z=OeO)&*m$oO?+|}3!JA}9bR{7cGSEN1eYs>mw}bU5EwPL-mU8>`-lY$o
z=H_N3S}!t_&3uF(R~d`s!FqhN)Y{KH@)q!$e#f~@x17%NjvD$}4<bqy;od<P2x1a#
zU7b@3Q8zu{&MteMxO_@EA^(B3H>$Fy`j426jas?bHN6;(^{;(59Zw}w3@aUC&8XWq
z!+_pPa<Eq7UL>K`df1slVOw!I6nH*5nfdpr3zun9MHa8$dK_gD|1}-%w){HuK^n=)
z?ByVgtsbuI(n)4~3DgH94z3!$#p(xIi->hvu#NGDb4zYiBfOv6ZXJXrxA$hJMhKI8
zmSZZ9+|}Ls_$~9R7ol0KI1hi7-9^%6mvb>IC9v-a-^|Gv3246A+V*-KdN5mOGKjvn
z^g}RQ@BY{k$rq=W-(r1+Scp4d3!XrU^GK3RtzxoPayyY7-?|%;G&)gQd~MWFXrTmw
zgUy1Lk#&g&*1{ct28-;pS|Vk?=cN^UfZn<<)+w#J>@TuQ{+{{0roQ-F8NbLM?PD4=
z#9GdjV(-BF0(V&)mI&fn5z!z>n)<=1IQu5rX5R>uZyfWh|KJhBvf#Xf2RlU{*?-7)
zM=A;Y`{d&(P<L<&VaP>jHa^#Ww}_OMIxs6-3=t1BkEeu`Dnhb&-zp4d@+L8Z;N90H
zoLVToA;o5XMq<zKVn2)xyHhN-X|pt-iaU%(X@TaYIxxYi(0kCeJFxH`f4j9G)GAVk
zJID^uF@)${B5y1{b-S`@bPtIQu5I7D9><)U(_YmN$CFPZ9=A+cABzU$mv9V5>|N@%
zg<_o>-cB>V#>^6ibHlo2@i!`>O`=7KW{_x>t7SUITz!9WA^ci#pIWyzq*(5E+bDh_
z8ulJZu80&@yvU-PkM&+fffqTCJ>~WRx?eudA3)G<t$SaQA~iGAuCI6~6VTNg^|=uy
zD17diJw}?pvAfB-5sM>w7JY3w?V-L5r#H=rYH~NgUc!v@-Cc~3jPY($P9<Ld&MS@i
zxbtuKoq<Up!Wok>f~SjP;J~BU)`sh}dR4uLU~SbIz3lP@%@g9$CEjrJi6TuAWeO!P
z1I;v8LD92h+ZQvfM~e~%G?KZQ<cJZBfp`@}D2r*BwW2tg+;Zbu>B7H=5m-d^sG42K
zBTuP9YGB#_21~P!)uB<C-?xSF8lq5WUdHo%8YXHn>#THa=Nav=1QuLT!`(kf%NLhU
zcGQHC!^SscdF%C3YXttScr_b-FeIFfxLKH17{6l_Nx-%JY#fcSuYKnf@|9$YVqT3Q
zwM`FKqAy7`HZ=Dxgl+Ze=6nr-(3i5YlBR>huVD|lZj-}L<<b}!H`*J6v1TV5qEjiz
zQm!Y>`7piz5E$5vSU|ZLo3NnTZbvbh;x3!&N64~%<#Z2w#Tjy<ZX6yP>g!HciD*T;
zVcPN4Ol^;2b?+<jLh+vF9p<eH^`bGuSHdDHJsWAG%lLJo!Hm|TCa7~p%lr;+Oo3C}
zjUd?{&k@v7w=Bt<(UdElL(_G}FtgaZa&k@Pcg=!DdLvT*?C9W^MN>#nhp0*eMDhcS
zX+_n(l}*%?wRrO>7=5pt(-GtGy0;6<?UXC8oI9&}axXlL4?2L$kcdEU5B&8nkXb8d
z(?XEh396Qr24#&|HiXK*EK729`j^%A3C4pskLd@&gb3zPzer&KQX}#nSE1E>mfAbs
zPT28#gJ1S%Jo?1R)$9x3RsvO$?XXviK-sRxh@jT1ZC`uc3c3fSTX8-(bX7slr+eV0
z)+?TZ7&vR%6@Cp3WEgCtO^y2$U9JAwdWV~+OK+P0!l$?6u@GwPC$S^e_T8_xzA3`1
z@@#j-ogp5M#{tkQE^xDXGi;Qr_%BxAXeg6)=*=qUNp#A^-A_qtZ&5>Zcs1%DdA3eb
z-hW9IilO&dc(c)r#aDcY+d23sQ)4aiuW(@{481=_l-t%QnOIAEyY|yQCo@-^kv=c$
z&m85QSzsYHJ|yAs0n4WgeLU0FR$f3({WD9r7&g=P7a=BGKY@apjjc=OOCeQa$eSic
zR;IYkxPX@kL-)H6&##cN2y8_=yY$S!!VDq(vZ5<sA#@Oki*U5Bb8Z}1m_@-<e}lM)
z3J4E9v6CP%{$75SUSgGFR=$pS1ayyVjXnfu%Cd&b0!frRcP@tq`kt^^ne67W0$y76
zFZp=EMQMnTT846l`Uhmg{*>Uap_C8xrhN%Q1mN8ZyJ|P2pu-O!sVDt3de9+0VUS?H
znLhuHEWme|@-rR&-eeo{W)$d+QUaPid7EK*3_=+CT_pY6+}rONkGVc*!yDj=p*ca+
z=>QRQ*ptVCr2B%@T)Gq?bnD3gd4XsI5={yUK7iA*NeB66Pi6*eQO51;>9+d+=1pQQ
zoGVJF_nJ3=sWt#|^{Mv}J=>s&PSXIMe+odxs)?~O`JqF(ZhM6UY;`3=gwbU}@J}0V
z^Xsr~GyPx<Cr|?x)|s+)Us;Zy0jsU1^!2lG2mFcmX#n4ZU;#f$X1I8bK#<du+`wu(
zb-=8V66g>a0Y}}f1<0oxW6*~>Wek{=81;Y03_{|@=g$ZL=VC%Z%bKFld&LI4{6Lw_
zYLR8)L!O8V7i#7U3CJm+4?27l0v1}$;cQ-D{o40&+GGP3`t37sZ|_q8HuW|Q+#lH4
zGXffof|lD7c@&Akt67xHd;wjh%(xuu$T7Ta`z+npy@dUj&qZlXcGVphPGQFnD-YF%
z6&G0mv>rRe_nx-4KoE@VjiBMBC^sZBK_Kd{P5#&b=y2TWQsPlE!-3Ba^*^23r)EZ;
zO6?>C>1@{`eIenDj;Z8_2|NI!aw&h4?g3u`wba5@668FoYbAgmE@nKx5ub{%BjQK`
zEPqbTGVBHg&kgk)n7RxH-I|H?jOsC`=FOdyO6r&krm%bLlA1`07V-*29kOdy`oeRA
zKJC~gdQu;Un0?u8;6Wkj$9t|zGk4r&u<QsA*g2)E!RfF~QJUkYN9kYw@D5ZK=!)rk
zeh2jBaT>S0kbRM}NjB(007=e-k(L$%_CggCC<!CUPgt?EeQ*g*D_HM<%vNDvdq2(6
zlg{Lp0`v5{{!rthubq?ol=XW=OoCN$jf0aaa5X#N0AJWx?Gy?4I*(0`ZpeDoa~XYi
z;yLudRA>m=&j`%mun6`Jr{zU&WfCgJ_C9MAL6Gt;szsak#+!msAb3-ZQA3$-SF~Fc
zBzF0o$8*NMknvW~&fQU8-IRY6Bg<C<pc~J-HYX~GgW-G^&Z)Ji`Q%`~1L51cpSHPi
z3oe%ZaC|+;c-$De9Xk&C3z2iGNJr6U`u2Mx&ex?_rrrS;0r!~9EAihr-8dY)?VvZO
z9bEGdZ<AiSfeSj;XHybBSzQ(bs^_bvzv<N{-)3=F3BJq4<s<(S)}rx1I73iIz-Aj@
zA>(UEJvQDz9wh@66I7rue}W>g$VxOUUQiTU7P(q~$50p?r&QFn8UM&Z1QX%BZjJa6
zPZyW9pVPpZc(yZ;{Ql-IR7f!xHdUNTQ_EA#73EohKyD96Kzcv$1B9~GN;`Fl0Lpp)
zG3<#iI+R99%6I$2fhB|aRF#*a>j@+|b=GII84tTX2MSGd>fTWvAt^XIM{ZQ^%{l&M
z+dP=x3aS`ap$N0~n>Ey*i?#b<=&z&6ZPwQQ^=;WJu+!UrWn!sF5qI7VdCc2*1U-9v
zFX@V9kOquyvPe<DyYDxb^^L&omPRPGyjU1VSjM89*>n%Si9Re+e?<U$GN+K>%WBC$
zw|t%ZlU6&S-=Fm?-v~_Oc|0;P{bl%TvUr-F(5;Pzym!z+yT;ewSR^^NigT*}QrwOG
zClY$>GbqKJi6EWq1ig7n`QPH$0@bf<(;i;PU3jcNc9Nb1fC9^bG|uw3+EmXj+Mry8
zhLqPA<f6<o$ES`h^Jq28XQjF#!rOuQe<oj-Li$|{x=)N^HjdTx@FrX}eyD#@t)bo|
zCIh9C1e+-;Epdfe{S?mr3Hac=ynsrhM@OWH9cFOUqB{rAjTzI*pf-<HH(YJ!ebV>s
zf9&sHZqZ;2pn{aT20f&$o2lOd`A=*gjYT*fS2>rLumO~f_$_G?H(L<$A)mTEYGtC6
zS~7*!HhdW-xx`wnQtI^<*Q#nZs)8A(ll&rOUc9<MoQ02|*%7~P=;v}&K7wk3R&%Or
z{CAA^LiubPP9gQ^plh1DGK*Kf1(U}s`2NZ%U6hhkke45Oys+8aG?Xvnt|OK?pI&9%
zwHYz6jD?-^ft)XnqSM-;aQ70*Ef+4$lIyIDq($S}=St?~AmG)z%YE6e6lT2Oen=^0
zExAp*?=qQyzTK5hfwtlh6}+p3jeaT^Q+GTgnQ1B4pyFtC`KQlA885z5#<XL=DRzxa
z4FeiXw8aP?YBM2@zNZ_M*6=&-D|0Xm9z#6ZZ}p=0!EsrHPLZuoqY6$-%pone{=*x@
zNKJQjwaaK7rxx83&#~HVIowBXdaM!Tq7v|`0%4RZ7TqkP?LP7b)*7<p@~&}hCzUiu
zYP3au2iwe8cA=d3o<(8VVb0(#8c^cO8v(gIx%h;?awDtieyC7PkxI*1#FiOz=ANfm
z@``vJ{?fp{vU4ec&<H;l*SyTDVl(iN@;2KO^Mdz<Pik_tY_jO#Cq4t542gfMx6#)8
zoXI8k7ANFG9Lp+qvpL5;-r;Z%<J2~CyKEJg4_7S$bP$|FJKOtq8zi9QnGB5Z@KDuD
zGtc-s{?<<H;<#;945qr>(T}$Qvm{8Pg?Mfh6A3c1U{B)}CY{9|sKx`|kzY8M_zusV
zn4lVHy~l^q?wZ<ao)@4%-`aQg;7FvoQeX4t`V1Rnu&F>m6UJil=NGqU%8}N_7UIs8
zyFuO$H}S_)$UY-$a>%n~EGV6<Q6kM}FEQhnR)J5Y{FxBSq;DQX*cTxb7i5w~ZS$(f
zh9pqB*&{!xBEtZ6&RAdAh|=dug+KAX8epLtX3(KI7&hg)ANmtg?lWa&n${k@ytWWS
zX*K~s-gpS<_nguJW^)Lcw=EQzEK{tEF{;SE^caqtZF_66boYY^P4qWy*ucAa6*x@u
z<0kt2eoR4!1!|zr<g-zKp-cn!y~4pxRLC0}=xB3A0zYoko>Wq9FFeTH8eW#}S9v<Y
ztew%2#}~ZRI1|xSBpk;LmcRhy`7S0RY{x-d#|CvJ=N)~(vlKSqc`pw~R1OEQ4@X}l
z)~;&5r4IP!Y-2P(#?@y&y@vBCZJ!L+)#x!sdAnl({1bx~LY6V=GYB#gW&F_DYbF+}
z@wMk%k6n*(sX6#WHDFLzI#&kDRcgh^riT50Tlk}D#EPQa-eLe|7j1xC%1OYHHKM~T
zZNTw|e}F}cGGO+24JYBQ$MsW%8cE3PF3tx4%HM#}{f;KX>98A@83+dbi~JdY!Dh_*
z%bPJmx5FCIBiG|iIOb-bq8qkx&>_rI^x^Q^J`dpdQ_5Lig(HegBlgcBGe+sgYvP)j
zB+fC~(ElPZ90eA~fAb0%u!(JT3u0)`1&N-Nv23>!!=5U0#0xop*o)on1_Qk=9{Gz@
z&)AnV!f)O`4+nFDlWyh{hB_8Q5Gt?@8<RymnkNNNRG1PZYYPJ<EP)xJNXam9I39e?
z<=+Gn%&>O73E5E|e5`LDIwo^ITOJ%b-LcC*F)=O}KxuDSfZW7q^Ts#a_tiC?ucHVG
zEcM@*4$55LP9a2hGAorn$1DKzmkRHCX#rjX(bnvK^_UK(PHl(NyHU?V6B$-E>uar|
zC_xQ+i((GOEI)WU4fi2{6pNyYbVzh8gzJV9KZ_te9Gh*G2zv|#yP#Te5VlgZLAS9J
zOOWn4SF4JVEndk5FZ28xVV^Rht~UJ7nqcklXGRp~p)b<fhjbpE;Tp83c}XU+!%?<u
znY<<`Ih<ebDmicYn}}769&s25#gC5g;XMLQzB*L_r(<%5gzI=zZ&ZDl)`JO>_KxOi
zslpiu$KY(JJEiz@1r0$!gPO@aKDaljbGlU?ezJvdc~viC8I0i=fw{kfV4buZl+{_q
zD!4f5aNV9U+X=1Gl`jvZZ`lzHl;-{*a|8AL+V+>sp4mNNZuQ4>)HuRxK2M^jowx>h
zi3$dp5JzO6!#2!L;qwm%pVBdv<B{tVlaR|U312Tdr#pxj!>0j%zrTZTo>#N&(^!4G
zkzQ6Kz03sJ#<umZhOV_-3T^W!^Tn5ffvxQ%DkTQ#6>1705hJB!F2($jD;&9hO^n~&
zCaL2G$Ku2$nDP{<S0CDa)-ZUbt!E@vYTdj23n~f#)SH6|$kzq*N8*1x8cPNcy19o-
zxIuj@Mb2OQ_wT)b8YG)0w#a!Bn9C_{D3h$Bvau}~^v6h;o)AXd$P<JHrIE{6iw58(
zs9ulQ=8(5)V;ocn!Rs-By_4=-at2jTDGr4c2N71KAXdb_#VCh7P7e9?4gidg3n;&W
z+}3}%Z3Vn*P}dz_f6f3Mzj!ZaThb*|`PWxmKjQ@6?yNVo>W7=8{#n4^7-7_Xx(D%h
zsIKr7nkh1-8Sexj+M&MkiWS9eC@PdTTvl=(N}|hHems%aYKtJUZTON%(akEeFP$kc
zv6U$9h<6J8SuoqeL`Z={=?R=_To?2sW<Xold&wXm`AOJbKwXLS2m9D!3%xz>_jgBo
z%%MF3)5B3g58U2f%R1ge+=4gQWheQ-!PI*;(`$V2FZlggu+n=rHec>7LL6ZvKK)vS
z`&o`zsx{+ZnVA`71IrZnL7FB-LkNkjR39VMS+Tp2+`977=77jS0vHkXB+Dh@>OTTX
z5lq>yH2QGfR`52hVzne9bKsJhEtdo;QLnGwc_`NAowGG<hYy98$|e6^EW)&SH-oQ!
zP>w+TfRWyFjapWg<F~e7{Y`LbaIadwXkAq4dCWW3s&cm}7_HanOS`vPIGA@@!@)x0
zjqcNau`7~2yCfSVZpJRw9e7#Ki4o10M2Jxatz+<!4kq}m$#|$eopFYxqz0y@8G5m4
z9$F*?jWzlXzg5mpx+;<JZO(}F6tu_E%<GEG#T#!JOgdR@WV913)J~kihnDR}6eeib
zY13`RQFy~XSxa;ZZ};fyVfId&#8o3_<wb9Yw05cGxw8MU6y7w+7+GS(+z4?^%hUib
zi~I<A<GB$*c;f+OW<uK82lDci#ogrI#LXC;T4`8=R)=+n6nZb=@`ONQ!(-BtpE}MF
z(S-7TmW1`B1S(Bf?><Tv$PFojZL))L&S9~!K`v%QwlcuqJ5&ade8HvegGc3{(9)NM
zYz&wnrEiqDDUmA2TKM#wLva?{L{{JXzDS__BDc$QeEW`M=W9ULUvfQpumy^A&{!Q8
zU{b$K`^dan6!nIcjnjK;V|f?wYH_7<YgaggtPNVI?Qy>$w5U%CQnF0m{Ll;{kbMzj
zBxK~NJZJEH$K|#=6&C<OMd1{c7Ui<&RJ$&a68<Vxm;KL18k-8Fl$}Z6Og%Xyz)R3j
z-`hJDV!DiwS^vvBALR7z@?}=rYtOqd02qovdR<P8TvYd9>i#51ZK|)7c!Fb3g);3g
zbj1%WB*bRrdzw+bk*hOxf0ef`G0`V*;sCR<@i%_f(z9HnKGp-LPdtAUOmxg1iCPIw
zz~9B%jW7Hw``L(CWUgTsCn2;{P;c13Y>XH<zZbZyWzlKaC%?#Y-}rz#rt=N~Nx<jD
z67HKc-uEW`Ihd1zbUXGVH`t?uZpZkehr*81n7dU6C-wCOKKCI$cR3%Bjom{_dt`ux
zc~b@`D=k3x!=A7J<o8aNdRu2&NPy)a@O0aa2l65+6t!{#VNDYtK-8V%aM^z!x`R*2
zr%D16ZToCCE;u3s=*Ol_`p~-!gVT7t|Cr$dHodmY+|M#&o}yN!KKD^f>r8{49Nx;9
z+~4b0i3GGs{yjOKxBwa5;`QB+F4ae7*ygtYE=Jqnib0@v%rh1EhR$;uBh>72<((KX
zn^$YL&H8eJ!}L#goT&3n99S)<H*^{Me%b+CzLEM5`LV_o7T{1t53YKsIsh>hzoSAD
z2&}PmYwBwMCi2u}`VU>6q6P3PSFuU26=V8LfMDq^%1Q0>CnX*J4^1*+w4crwYG%G_
z0v&XC)B^lWxK7zQ;(qnLJaqvUs$r5UcVYV$;9O?Eeg*G7<oxc7&7WqTd2az0!gR7Q
zZSf=rc${?M=yrg~7uO#;MDAX`F+4}N>~5PAoo_3nSCus29@2nvZ%c^`q|`od2*b-^
z#sRKcz$T+hhyOhO|6NS^0cmAXM$h5?TZz{qEM~LP=x)2ZK;8U;zD{{b4BM*p8%Pt>
z+aEPlk9jVbF^lZ<9&a?+v3vQqKvMxG3N8Qw@?RZuLp&d|E@fN<9X5)9P~;)%*0&pT
zfpc_EUg_aL0N%aR&-8fM5t<;T=rR4u*-_LQHfR^zekd|51H{&9EZ>}i8~8RAdTp&_
zyb8GnN%z;w<s*(LSv4kDoZ}_|KKC%CG8Nj11B6-G14nFG<mtOV7KY((^#6yduW)Pn
zf!-CT17QeAZFC4I5~I7Mk(Tc6?oR1$B?Y8QK<OBaZlt@rMmOAjf4}G6=ehsDw$HZb
zob#SH&V>k*bP;dd%I<e*hyKWVZsPqXtS#-*wR}4KpSq>pK_CPcL5H(O(}!<siA@e2
zwz#WG6U(XyJ5-X<XuP-rQ~{L|XU5NSNh7HmzbELnK?LJ*dQ{7wy!g}ngGtSN1)PGr
zzOBW*4J^{QUi}&;+Y{DYx)RBz=x;sqSKA$~AFZ#!!wta2Hk<E#9#~`CW>$hT`0h+<
zX30F_J-?j_{Viv!*#Dgjt^m9JHqm4V<^H?2*i*!(OxvCN@Fs6w)Z|ZM?cNt%U?aLV
z&FvkZj%CJz<t^_>ZH{6De7eGo7C*llVNyEKaUB)#MgvMj!TCM<>*4L0M-~IL!<4Cx
zkptwhWSJaUL+V->=^Xw5`yg&mrWjnIF^TW4$2g|%`Ve_DuuC^L;I{9-spw-^F_gk=
z+F@MMLCZgoq<|BV8a82_ImrkU>T&slq_OA-?dJkU6nJo%#55>JvV6)U=CcVJI<XU^
zaUpLFF=>RCi>fbU`Iic%^PVQZ&^4wH0no-IL|<xC&--!W8-B_v-Ebkrx2hTP9ehwf
z>oQN6we>W-65b=O9#NyuquY}GtBuWAZT*Uvj8hDuNUzcUc<30}>kCdTO?|12T2iY+
z)M?jnKqK`K%dsPg8ZmGv|MgoF!CV@FVA`sxS51TC86yg_@6YOuOVca*kozctda_z<
zpsLwcCjPEHQhB$oI&m>ixNkls6qd{1x_mShp+Qm2T;a!5yLVZh9AZ`^E~E=CqUV_B
zD0y9xG=4BgA60;LTfjT_YvT!Dgy|>1c<h;FxJkUWV^*uWFjcXM$7*k!)SBfjRI9S?
z`!KB+O~j<UEB5#~amEIhM@<cayKK#ITAbN6*}BMsBwWI;8E)s$_t$UBpF(*+@*}SH
zV#I;kgU{mEq|*mc!YIC1o3fYZiY5`@9Yre`owDqwz#Y*h`Li&oU${2wd<l$bihJLe
zZhWJ<&#&Cb3|tC6&LlW_#BySg5C_R8s>gN*1>ep5dc(jx$R<P8Ef8D&1f@5L&lGVl
zYWJtfR#<u?Is(6Wm!cB#fvv-BxD62eQE;dx$yH@9ID4)BwOg<$U_yOp3GT4$wSG8w
zK6}2i17GeOFST%fs$cX&n!QxR^b(vEKXkZcIao;`ct3eKp?r3a@~VS*kJulI_z>g=
zg`j5jADLL=w@w-@=xwcg^@KT63o>?j9H8s^hMtXEM{Xf4)HhZX-DZQ8Z;x<irQM87
z_y#TWb-CUE5qYx7rF{=zy28NVlEChD{Kp{U(L(zrJ~>Vu=ju6j*Ns8MFLQr^tF#T`
zRC~8Kx<@}Q8NDYC699(sNIR>07z*Z<f^xG?EqUuWs+5bko9@c)`hUvVnTh3ht*skw
zdnpQR6{N&3yZ>hqnywFE%rV23yH7^}xWHle-%LETgN674*8c7+bc0&5twMXTx}SJ5
zaAZ7zdU3zlS}GEnxq;gic!2XPQSf)>y-~z~zh)oeU+yfmAdvsPI_qk22?pR?8;-<E
zX&?`WRFKbX1sJL_WMrUcDF_&)A%)xsdx5D{yajS8fJ@nfuvljfGF(v_yQ2XLB0__2
z4dd`IIw<tn^*<<WB6Zuz2^^Ojg@)g$u!Q?1_rg4-fy?JAOQEm?4ld*wdni=D1dJK>
zwHESSKXU*Pd)oUNl;_L<-(XfnvI*!AGJlmXKoGnJoGrN&8!6)Q!}KS);0R)%!Ydo6
z)n71YBw!<LJUW=LtpWMW)cl1}V2xhy9F(}$QMNqqE)Wve7Kv&%U@dkbw2Z55EF4KC
z6o*~U#s{Orb>=5hQBa8zaeh2!BgcjEFmMZi2)-d1^~jnfaxugJoYuAO$$^N1#hmd0
z<YbrT9)BM$9f1k|K=x7Rg0qlvtgap+Ab%c{fyWkow!tdAF#YLpYm)rl)I3z(KGXy@
zwgL@*stG_rWzPre=@M0G6M6OeDF-0iskRLSft}GGPPX0F3G<Rv02UU+8ez9yv%||;
zzpwy~w7U^Vg;%_Y{hSYQd;IftR?NhUJd9viY+>CzHvg}kqJI!E^W{N=RjWNku+Sv(
z(|Z>|p;)@-d@lQ=6khoOC|E6tYdo?~C8pQ{SMT6uI}TAC{|jrwGW3j(c%YbFq#CCh
zdxci3pSxjQmpGuR3ruRTM?7Zd2Dybzo9SwY4;8%ZC=&(5uYvLZx6nb=+(k>M{GM)|
zf%1i#tBQG)GfAD9T5n$^XOoWe))3*r<_-}q#4K($0mn1h<c;VH1ZuI(_a2X1T~}f-
z4bBq5Zk63|pyc~AC7s+f^2n5!l7FV)_Geb_y2nqn=rR@=8Mp)G)I;tIuE!0$tX3L)
zP5`&c;IM$)U-cTYzb3)4%T22;&dve#R0<9@76dV@;-v<n6?%bJO>{`IZPLktMpXtZ
zH4a=U%=o<62UWV1NqdV?zwO~28#%P1|8^fAK*I0)quPmnJ|zINNj_uwX5NSC$3H|~
zhHMdmPUKm|A#<oKB<OC}O~4nQwmA6@HO+--+)p>Oe5Lf}*~M5V!Iadb*cHYgD}syb
zUCV8~#q0b;+8xpDH`_#~Uy(V39==lx4fb*D<bqEh=h?)B1P&U<XN~l77!;<Ita&ye
zh!@DKi$0bW{*D{AW5&iMQHT0GusnYSq46uWgaS7c6`uC(Cvrs;$d46`*k6KF;<^6C
z9%p-f%H1lZk3jAJjDO{^4}I^#LTknG6ZO|Zz)$;M#wzK$`BWGp+G?y-UZ=lcW1<R^
z0wnKg@%r7gOJ(ILGumTvXw<7fxeGSLS#KCW%vE>HKBFU7+{#c#6xwgQwJ1?D?=aos
z1TNVtt7?aJ6=E_%{yf~DWQ(K=?2Lrzh-%-^40wDA6|lQ<w`6L-Z;}^ls-6r|*2(<V
ztHPsL2pP?1AT>d^W$`cJffrgW`h)tLK*?WJLkvYHW|sKpjSUdS4z3h73@OD<5C^e|
zt=`J6JTi{a!*S>bWz+dM7j|^dc}!2Y$AUJEdRBgvEb1gi0wSF#1EE;8CxcVMIl^7d
zz=)5n_nS*6xTX=ccju2~q?ePugTbzNi_AJLBBOyt6V(&PU{as#y%o&sDwQ9Utd1=`
z$|>}<{C124Rm+#_%2jy$_;O@C9bdRO>jv-}b^m_*QbgCUOVGaZ%cK!t$Ua|)TOuer
zB2ztV=cr*}5d8HUWl+anXFh7_oX7vo0uYS=prws_UFgWN4-7jwyIn*kuCD!|-h7+P
zsT5e8Xd|U<`jWS%PA!>!9j2geIbu$IE}BVSD_WkCt6~DK;;rDd!#>u(1N+uQ4#hdX
z#lq}57-=fPBzm>8I8N!6@j3hCdY1n_pu>0ZFd+oZeZtyp5zj+;Od%Q^@r0p}16LEP
zGP9R|X)1RR><Hj{A?tB#JxVcGq}?zbZ)I3ji)D6teh9Oq4m)HdfH(%S1h#oB|3jv|
zthE})eW$<F2>)S))spb~Dh_w7SuRlg@mTJjFya&@puxUr#R+|+@(&fwVp0SvqG2VQ
z=&X3?aGh4d{}|&D43v!Sp7$E~;1kL=zxJ~D=*Q{xveU(PC#atA6{g&t^y0Ijhn2j*
zm~p<3{*vAwobgY0UK-v>!8ZI$@>o7W=ju5NT&6C!{{g-gNzASv?EH;ucVCZ8C65tn
zCM>vm;rDe;XWF>gkT1RL4h|f@CtEoDyzC|pbgYHiY+~V$N=MiWrW1qYf8I5DWn0wG
zmU3F#DReZE3OFqMnqxm`wo%`GC5}|UJ>>QrFY;c#FB6K6a6js1vG8Y~&#yDLfj{zz
zl@wzB{hO=85dEKZZ+jBfM+|ZBz!tlp4gm#>G)1rV`!=SpAvJ+vv~2teg;<-=_=6y{
z2=Q;ThHqeR)woz%4r@X{%%qXNrjT4bL6x?oj<>?4nD1IMeT`f300+!8AP;Wc!GNN1
z%V=Ly>!OE8umeM4KZTbkJv`~=@)DHSjBIRN;Ii?-6LvtT?lm<0GnsNTAzR+eyEB#h
zG<ygFuldKCCvCLe+3GfbIV1ttKs{=T{hIWBX`iu{5zN633OG08$d>cx2*mkFKEgB1
zVj!hv{DN@c*#ez9`R=&WpBTumTxYWSU<Jw|e*;Abo`6?$@6)|D&rzW*${6>oHvlc<
zNC4rsvmZo&Fyn;M*7}F1{(GiYp;R0h9L#+f;vZ(W1igey2}v)4UtjQwB-p|Ldwej&
zNggL!aR7rw8^dGIqWX_5l9K13PEYP_H+=#BlBU8+bh66i;#*<ePZ3FA*O(RP8I=TC
zKK)D<<?oIayqj+c;toaYOu~qJEMkUl==tIU4l;w*PN|JJHr5r?SXvh2dlyU-QQFXg
z+Ps3+`eS6w3?9{4`TaZ?Hd*R>^P>@qCxS~z^k??$+vbhmqcsL1lClWpHCCO@4`jcg
zk`{gXOAX1U++`RHSR2;>lGz}K5F9cw0ep%6yG2$+2pkHm^Lvv>=QH6Dnil*2AfX(f
zY>DXa^y(LJCCfocvAPlFM3owtSfTC4{3)O?I++JA)7=u%YHl678O6V@xz~BIbGk?q
zIT^3RSdF`fJf857fGdwPU@#a^oipG+J%A;ig#KtmNUOXq_d_c<o}DZcL$nc%QBR((
zbL^X&;)*7Rk1j41Vk58xw3Zs-_gK|Jv9hik5gL0?zzi8CV_k?amWy90_J7QBFsBE%
zjGJi%t?bk|9!&MPdjvMqSC#&WrMFQwq@T64>)YiEn@sSJ*AQHgsh>Jbu)up1{T)__
zby-V*d%B&~(w-F?tnoZHAZO#?G1QRMyZRcQT^_pnkaYOwN~iZWf<6TP_#T`-4+=R~
z2|hYX#6;B_e)c7GRSh?7Bv&w3z{8ka&}?~;X3g}uksLkC^qAHEx6SI<vMaO&e+k$B
zNt+Y&;V=~cphKa0FU|C44$|o(=pcvn^zN>{)=wzF0@L?nn#dbFFPte(7pmYlQe$_j
z9NF?bG^aCSR$IzhqpMMJj^$rIbF2GS+%};VDQt2TF0Mv-@5@LB38WTS<e;ZnwOvTG
z20w5gleBiFQ0g#oT5$dhB1;#uHTi_ge7Z$<StCtACU7*)ngCTpBTx9;wZ<%Q<IDFw
zmbO8a_t8Y(H{3w^c+lnpe(!dCBqm~X?6AX=c9<@wXPWEJyB>Btkt4aLXjF1-IN#N*
z=eI8tHzDa+|HYA!^%C#ig(D3tD2$0)oM*o?Tmnmi7QMX!J50X>@4lCbz*`v9w3^<N
z>CO%i<SM8$1WYK3T#1oj-A3)Hq|nd(X4CDl+@yc-MV%CVn~+8X7VAIyV~eg-mw1t{
zX%TLf-9y6cw_Yi>)Ot~Hj84*NzG0(-8yiv~JWQN;;vI5;VYU%WI^n)A-7xycolbFt
zTV+;Ei>JKrci>04t{j*?CVnY<y%G!{(A_-!Hk<V+r<FOds<h!rh}j!0{#Cqqgseta
z2zs%qPP~X?SzUapT7_09LH)FT;B;NxU(;f(zw_i{?G<%Qpo*tIb^)%Erp3*9lxXxd
zCNDqmNe9G#V>o#hh;w&dYvoTA?(`bHP8u5e!RCA-aI)O!ZZ?t`C_P_!yTz(|`prRM
zzObz?j$odq7>9V+>lczkRA<$nFeY(f_w4R9b1pYk*>l^4Mz)j+68N-NxC~x+g@ix~
zm#31uW>UU2f3oHURp=7@ojJW_+$w)Z?2;w&G3-;qX~DS-VM_V>ZFQzs>v;J;oa2$I
zKj1)<C2`WhTXcP&uSTrFn<7PlE<9ZbXPr-K4ROwh)}6@0RviNW^r;Jx<Jv|W3(e$v
z*0!(kLa(c<PgG907K$w`y>I&6YI(ITm@jCIX`{QY%lUMDy}?eekFmp`sjrR|P(59%
zpFW-?$`MX{akzgt++29V3@wTBA*Ze*R*k|!9j$@-q+}HN{C%+^dQ5b^WAqf;hex_-
zm**`XPTHjW(>L$=A1GN=<7t$vK~G!vPb<IHpZn~xSq?7gP0*nr{*1_y?Fz}ayyio7
zFfx|mUmY_#>}P?xeQ@{}w~wnQpPP%&MpcQ&PHSohM1~<W{Y-NX_d0Y3@mChsnoUO*
zV|H^SGsMi(K9@KGTqI}5KJHloDi`kOo59dASZ;YBD2@5HaWU{Tm?K6<3Ln4Obk$th
zpSL98ApqavCkMMC<DglJqj;pmiJY%$&5*;bEZE3(IRjm*M3($8$K@0Z299~hwXzX}
zzIGbfOhfuNCDZ=ZYwdx(zsgl;Tc@EXC*OSP)CbWJA5#1wkWI7sYAZ{dI4dMGjnTra
zdRtiY(#QV$uPN!bcd3o6gp_Tj3{R+d78GN`#8$BLo9RW-O!jyb(flIFcRG(Bf&buC
z7R812CNNPw_6vTyHbyXlpAlj)8VPRem<*~0MbqgI{{*7%K#3AJZBw4a6+%bJq48@)
z8IivFP`8Bj@mSr7W)sKBbJ@n18@`{6&bWDGp+x-_l!I*HS--%DWFnMtqCpBTNtW;@
zzPP~C<`U3Ke(_G=)jt{}uDoOO!OCUi&!X+lQ3mA+wT~dWNBxsU*gfCDcY1pw&DmhK
z<RZbnb2+I{SPqGZO?AUXT@=Ki9Ulds82FFnddF)RcHet+Oo@|tF9rz;bmSRE)T_1M
z`T|Rj-=QqI@C5H3q@p}gO6+I@d*-suZ$dlmuMB^0U;vL51`*81<!KPaVN*cIc{J!j
zX96sSO9k^WySTnJAqKo0zwcxNU0B=jr(O~P_K!y`bV3`CFag=j)Cf3pcp&L7Dhm2K
zP<!j04-&+4#j#M#3cM6^Q`UgCsI33i#|LcWM<!^D4Dv{`v?P;zg|<HamNXK|AtmH%
zLWUx!8;0hz!lB`6(!lZO)=t<XU(nj%BWp(B=sw!*gBZ(+`{EN6o@Uhk*O^`LoR-bU
zFHhsvM#AS99UhR64u^SMa)&afN$IzB9aneMf)R?u9mk%iR8!L@jgOc29vEtGzr77M
zuFC={&sXmM+!nB0J^l!iJ!~6-MeRBR$9=y-zJJf|rf6~l0kU!4146Na0txdCe&#xZ
zs790a!o)%4(Jav(q7$F+0ihz1A}Ftk(AcppR)0<i|1F(x<u0SYEzm8YDk`FWHhwCA
zF;9&fLRG^*vv}T9H_oRvGjr6HGwN3f4J!C_MNK-Iz1oCA2cB{N-A?vms*7dxsE?Ti
zR#+D@s@2louZse=XYpFd6{9R{UV6op;TgSGd#fBXEG|&}C->)aOCigtt+c&}fG>dy
zYwI9F5L=&ZB^KXK;trHW*UmJCf@k0GT(ml#C}W<6DWd~TV*2j4A?v~~{?=8pk&_NZ
zjz@O(U*iRUghBNx$7z}<n?iv&HNquvXsVZDv71{=q_#rBa$DtA4nv{l*!bFq>7p_f
zzja0GaiQCPvh{p+!`k0F7Jgge#;QGGC|cA<9&vy_{_!IJzN%Ms3LUgKWQt`Ac4%aC
z`Y~kp6Ia)Kk@BGdwmQlAo0zJM1|+X5TGQW$x-Q72dWL?S=9I%6hLY$}jL5lp;+@D`
zjpc;CQ2FE%6U!A|R{fW?Jbb%RL5$<G$EyO+H(T-Sz&-zADs@v9gWPE={<4Rb`fz@D
zckN{a)lx@*@^%Sc)IECP`-FRKW#M}zK=yKbu0lR2_^PikBK}NZ_X<bA27ROmmif_h
z%PU=W9MuT@2;|!`2s<V0tDmC!mSKV_tJX8o>(w|<3B|Uk=HTwcHG~ZT`4@EmVo>{t
z`<LH5J<689d(bhYN_(bzkNjKkK&xfl#mwh$ZPGg)_dmZg#p3oP%Lp5<u&U|W`_3sy
z6GmD@B2?vlty>#lChKEjv`J!Ls5CpE`E0PQ(c&&bf7brvncgO*jH5-^wAmC;t$~gy
z6Zx=A9W@I5fewh9-<Lu2%uXE?Pv1AnFF4CzV-i7zEFIvrsZW+Zr0mv9%+tP_D=a#q
zF$m8?phFGVMKD8$@ACV6cRWXi+4wGi#2;;n6E^JQKkk1Bj<9Y{*{8uF-ln5m)YhWm
zOy|XAWZbasca|*K%<>`5IdI;;psgLCVEjlJp8tMHb0;Rw7&SUr2cUVyw|?D<c-&|k
zcv?K2HX|M&_22Enu1hQ|&b5@4XaD&5t0)E{8(K8-nA*`v>uk9u#=k;oRW$c=+&eb!
zpB&J#WmRU)TVrP+|21&EyM6odL59D#(y1f$__W)4>MWvV>z?3#=Fs}H#;mEI#qq{j
zY0`n^`j#Qg%KCL%@R9ei^dLsNUK-Zh9k)Cqx4l(~P2ArG+`Kh8#_(*6?l{M1`1uE4
z-7{D7@GCe~h{~qWGx03;M!1#KUg)e}+%L42aS(w<43CVPGQVAYDknvBL3d3VI2lu6
z{x0gwTyo7X*XT|I@v_m!M-<r@RuN;uO@my%9F^T|MayE|*k4qQLUgzDbZ|0=e*rN6
z!~&cCy}0E-0g!5qZRHcF{9xi+PR{D!TP3kR)iO^^ct+jTx99n+UkueVv_HA_&J?Zo
zF`mrSzA@>XTjM}m9PxYYd_~v6)~rcSC26y1J*^K;+|69Ap0Z8tn_Ba;kxHd>_IA4?
zvBdR?xR?H)LNj(n#67^8?w_gQ$rTtlEyaphvV$yQ%{O+RlSYbMj9MbQ5~jhb2bN@j
z6i7fW8i*BNO?Jg^IU4`ljR(GAlv{2_VM~QzoBy!wS$Z+zyxtChzQB(z6gc>M@yAOF
zIPOjYFnk4NC+!O`-2V`IV))1CRq*cZIpisIiBZ6g>b4#fyt|zXl4>PG$>NShLDew^
z?@IDY<Y|S=-~n#OS5}gumpT_VxVUtt(r}m`gXchCJUf%FofF2LZy6A=D)AQz^|cVN
zJ1H!HEDxC8i%e(J!;8Mj#~(2cAV1G+p#Xabz5)Qt64&fo<?ka}7*|F=Aoq^MC~aQ{
zAm7y^UhKUK^;h1BHaE!hqyWTv`1MlU?V<p;!~DBpw(lVp>SSgHB=*#RnKE$ksKlx|
z3GU#1CW_fJnvv5+61g>SyuuiwA&Oi5Elfx6)#>YF8eoBb4~$#NrR>I|L%_L(ar2sC
zYsV|wVZoGjEZfZxhP}*6QACBn=;~@>Ffj0INXAL(mnKAsnF94;8W=2F;RNs2W3hQ;
z#|4ruQ2qR-LIvrIZH!xLjShv=Fs!Kb&<{zLzdP&;#$nE)2A@8NWgZ)Y`!2~=#e(i5
zR{pmq9n!hJ4x1Y#lA0gw#du3NB`j%V5Y-=U`f=I0m(Ayr!5x&RP0O|_ZNUv7@4E%L
zG)#aIDk-BBL~N;R#N|C+?4*>Co1YmYh&bG?mzb}R{m9|Br9r}?R6(kMvn@`B+<L8b
zRjle~ni2bhO!Z-94s)ztAwwT^M}9#m?<Sk2ubj^|ZFKTI6ex8rWI0u#1ovAbP##*-
z(wx{B-|d0RkF=>#Z@#!R)CIIUs?D~<zO`+Vb=>0-<BH*l^5e%P)y7qpj!BNHvUHa(
zl==5Dc9-ZBzoOajx<&Wv7QMNNx{D6$p`z>GIS%vZo+NRu!HqXi_P@P{PP<$Cs`rUB
zTIPhx39~f$DS4XEomPBO^M4=n{}J6jQsJJAjk?_iJvobfBE6ufq!ovasn7{_;Fi?(
zEr~v*HFA_L=e5kA3kSJ>tGei&5X=mJL-W_*LNdQ$gEwwD9tSR1iSqOwb=GA~x;G3%
zw|X9>@v>59>fq*~b<4&@+opAmv1i3CCDMYowXkjuS;8>rk1Cy2YmDKiwXS^0<V8$*
z48aw%s@bw4ykcNQe7rG{yukC#ex@Z@QVALT{90z)&WIyP=NP+_5ZUg;Lv~Bo<{Q~x
z>mM->y3Eh1WX*2c9u@sW9(*3)Z8mig#knQFX#1P({fn8Qn&@O6!x2=&Yksbj;x+y2
zfx^+iX}F><rH8f7+VJ`Kh$q>vIne`m*=Ps0#O}&P+oj+|me%<)hQUxJ$q1FctO|v(
zJt=P$y9rmqy@GW(QMF-F*w=cQ)WgIjrJY7&cj;b<`Z6ohI<e6vcb$S|h@IpvB;D(5
zu2M@3Q(j=&ZcX9`RTH<YL&=PINWhLwwe<Adl9*PqhAr1wz1rvfy(SC(H96(hWwEj=
z1Zn&)-e&#<6OKSwM<Ghu+YQAlkk{W9uIMDT5++@a=l6yVJW1{Pe7nSjLP35a1itF>
z63?CI#VZf#L}{|)zONLdTwR19=T$B~bO$3F=PnyHLzf`4`Z3@2{SF(c1KC{BP>kl^
zK^@%Q$6Vi6U9QLKL(hZ0?L<C(yPv;b4%sAov$97WNA~R?IgM~xE&Ti+ZdVeur{T1;
zQN#8L5jcH7K+RFn>}_FXX|0TpRTL_GRpsV(%Bn3>U|IpSS{M)(3}kg-{&9jJIe0Ha
zaH{?aYuC*ejV#47W3F`L93cNQaC!_jvln{${5A4AawI*6Bkz{uZM6{N<Lx7V_O~~#
z3lC1VG;^9)&W{{-ws*6V;*jNftFMI1L2Xh(rJx&pvsS$@2Y+0U{E>3PO>w_dk>r`!
zyA-RW*$<D4ZiUX8X=#rx*ciw$@#hhT;vRTdXoR){u@#3j{GS}ji+*M9W&7j3>JIv&
ztt2|}ww~?x`3N$xa@5WkhOj|SndbE@zi~%!K~<a=%w5N109?;k3|ydg9qn$~_I`?{
z9kx(^yiw=@QwQR<x|+9}BGF-&!I`Dfn=5$t7|p04L*D!okh2`p1CvKrXM1p}B7{C%
zwDqKZ0QMA!f-Z150NDa3rr5(^_-fWkdcCU8;mO|A#uA}7@zbMCPn+J+a+-d62V&4$
zJN<c+2uKn?#j+0f&w?WcJ*y8=*?exX7~z9U&uik;kS9l5eE2oVeTP<Pj*tq=D!7aF
zH*|{<*n^*KZm>AVVsOIsV=b*2fiB_0hrCil_Jt*S$up)P6$ikFXU>Yyb$tuy)f^0<
zyR8TJopd6F9-Kguz6v84jwmH{+xF}gy`B+WQ308ofamDgWSP3bw9W_SzbCmnx!2V)
zz#i*7GUL{C0r<rAd-z6A*jzf!XuBIR-2VFd>h=l95*~{aeY2tRL4schr>NirKI^sr
znmUy@XPE(6mo?;Vns>MAr@ig`9|e~^QP<WA0+ZnfPH$Vwc$A>w=t9N@-Q!}w@!(LL
zlibPX&U7@-lAGVw?!a;8D)UuQT{eVKG=-kv>VMT2i3SYDVV||^xh!owZ8;;Vde+Nz
zHq2A)Vyk=6H%ofX7b3MSkpoTH-td{xBj>#JK*-#D@lC$&W^6Ds8RB+ytt!N&RS+KI
z%-^Af%QD10UvQvG+m`4>oZ4R--ZrMaYZBs9dp|xMQ!&FFpxYi`bE(3h_}wY;c3k-I
z(11^oD=ND6m&{|Rhm--8h8mu&Cv#ikZ#Z*s?20kcS|0aCqhoJ1$>1GuS5u{Tu%4r6
z+32U}XRJImM=+#Mt;Ol0=m!@AWohm}{iispyFfqF*uRMPm2oq4pY8PD;9f`49kUIi
zdAGtH_e}5nV^o?%hshr@gG7FFi|dBQe&px=F+rTbx2So`yWskJOkEoNR7}30xIpVj
z-*xKc&DhsjFPA&D^|$NQlO1L}LPDIh7ma{0(VW4_Pf>cHHKoaC#+q9MySRE^S=1Vj
z-nCjusDL~p2j6>p+gkBMD-XPFf*wPf3f-S&@%{xw5;hdG%JhWzoD9ERqxH(mlspog
z-NH6dlqZ=xENDqLc)!+A@Jb$Dpv{GOh2uqZqWkkndnQoudvJ`UdF|o_o=>38@NDnr
zmk`TavS%Ia<BTX)Eud!_R}?21ej_`I|BD09<M|`3$}%DT%)N|8v+#cKFn9fZN-i}1
zL#(?c(c`<Yuy=dYd0MK{DWSW7Q!X|+Y+rHm;U@>yO3CMyA@l9JX{EPqUIb%8y@ITq
zPI@1|9SBPMo&<ghTNR*UtsDi<ot?*kGn*`UCJDc^+Q#lvbfoaLbA_fS+3+R~P7!@Q
z2<QAR*A(><;5d83r6@3^P}b|-hc3{*8mtsQkrOFSC|OIm`nUsiA&{OXYJ29tQ>pAq
zBLvz_^_KptDe59kxDH4oC*I~pzK2F-4ioNPW!dIQ04W~ljl7bYjU+d^yIJ8zg;bo|
z$H??w!%k%tVGnMaM~KSse3uL>Q370sWB_%WR9*Cg0-{RZp8XRCD~(*2J_*hF^|zPW
zy{50DMTG%Sk`!N3V&zz;b{6QX+@eTY4!32_3eY)G@}7ij6D2Wl({A^si~~$1{T}0+
zt(f%Hhy5*tuDzX8oRtTE$}>ZjM(EWF!H!=huK(EY60nZh@4p2e4)d<+21ZgH6CdYu
zd8-BMJwA*1x%rfM$V-{W43U02nA$0`9+xkQqNHvQi&v?UJMD@g<Z*!~$(pBFk5>_=
zh)JVA#%f-ME?~tCNSLON5S%d1bZo|~zex;fbjEY){@^$W{>C|V>WicJGf-3$K$%r|
z<BbgI%n%rVp*CTq{jcw}YLrngZtKzFjD-XBEy|}cSepe!?^fAz>OEgvzd2}p&~E*c
zkVumXo=~ffGGL9b$-HM+OtkB!k!uC6y|`R#@04IOMd9*-8s8{O(;C`lYGpnHipcKU
zLHyjY^E@>7_`O}MKl*%Yw0ydmar@dMo=N3;D2o?&G{?%GH@uGTl($)esry>^8_}(w
z6oG(_5b?16`+&7W`QWc#^#6g#BoA6rSK^S!U{(%5)JhX^x>fzFvE8*;=>y5xh&EaD
zj$R(*J5qd5I0{|Asp?DPYm8hg`t%;amoIC4^H6(iY_Mp_4>);*!bk1HKSmRMj6RSz
zV|OUA!-Ffh&wws+yjTrZ>(T>Hjr#*KV_qZ(Oaynq?sLD1T>KRQxxl|bPEGGY2Cy+@
zXgo93&mZ+{;md6)fwxbxKz;#a$p3ySs890a9Q3!t$qML@!I3R^w-E%aYats&{L&`7
z0(S;T9!f$OnRAN2)xSRHIAI$T^2rLk;zK$KubqJ7<2dh2X|h+$-@|w8e+*DGt+D}o
zn%RTX-;V~2u?-D6N8cDdtM?%=sW)h@Vv{H#ANT}c?mNB)Y@ql;hO3;*jD*|kSs$DP
zMhtI&2}j$k4-b!|&5A6D7XBgF%M&C!L<fR%y}ty7$2DiJ<!r51gN7!=gfDs=fZJbL
z!SM(0P;?ig*3zsiX4^RcqbqQbe${q8C=aI<%&lek$}!A$jul2mBqHjUWy1a*S3-ss
z8ZST_^vy{xbf;nDi?^Y+t!B14M9d$GR98x_5`SMod;qkk85;OVvlIt@GZU^t@;}UO
zM8U;hIfHi-(TKWmC0cfFaeHC>?tdPIw4hYym`tzQX5Ok!vm(f<K|U*z!g9*}CK5?T
z$$dW?=ULL3^vfaf_qwm%V0g^Tshu>g44M3SJP}2Hh8c(nNjeeilhDs#5x|cwin|Fs
zpqhlNp@HeEhsstH7DwJVkw5b+2G~kH0oE5JK>pn~Kw5_|mT;y-kdKGB@c-Lt%?P)?
z;d{5sU1Dk%5kyDPEUHc!O`SpekI>+S$R$hf52;F8Qo1R#=;w6&ZN|X2#l;+|F+L94
z+a{5Vn&5a&f$oxbMIHX$BH26RWjf?HOfXb#+r%;iS$M&_?|@6@&9{zO7Gc++l2f;a
zt?~w|I$@W5Tj0JAJ-4fm(9AouH&4s&_=%MwV5gIdL6Wmhu~35vfp^DOC+#(ZLnF-a
z-%MQHQ=h*588K?fkS{z2Za{723yd1I3&&Vel0u(}yqvw3cbrj0;tHQ?ny|g`hP>Jy
z-?_MytWVooN8BN^|CpP3jVySt#pdpc(nR1FV3lT3hB0P7*}mq#wP}&aabeed@2dZ>
z#3;{`H`*sF@1IM{;@ZHQxhz0HZ<#zTF`=eL&9Y6+srvm#Tc-kEBL^AH(>Y8M5%?}Y
zvE19<RQ}`Pw4+lg>-5h3ZfF(8o4j+2pR)C03j173ZjnVs_3|^H?LxFZjR_Sj7d9_X
zNV<enIvI*yq~75bDzvEHqJItkJu-!{9gNo~)zIrd+mxxSG6};=>~z8H=hPAtgHuyz
z*dWhkYv*-eOwQ768yd*lC?3!nV31B!mA6aWn^opG;o5!A-&47C{RNok!gR?e`C$TO
zu>>}QFyTG|<b|1xMe#cLr!Ah8n2Ltoz-7AK+`6qx_WD<9(Su%rJ26j<n+e2*Xi~(e
zHVrQUG8V4-4r0ES-RsJm<fSV1)~*#@0ToXH%YQY#7k~K$@Alcx&h=mUhBk377JR|M
z=Fg4m7}B0_V@+A1xaC$RO37~>qtw<%K<d_y;Gc+17G)r|5&Wg+dim~drlg6QNXw^v
z=*fuzNYWj(FFp*k+FBT3%oTx?hbeje^KtK~uYA8Ry0P=r12Znev%@=^9^Cq~M755n
zN0o7t##4q)I#JFeou)M%drExg)X&y(q#+#q@(1->ZVpU&Uu6s5+ZZoN7HG*xm2FCV
zHP4%)*E<AHY0_Y{nBT+%Ln#}xvMCRAEj!ZOC4ZS1*J!}j&urdHD(1h7NkR8O%>cGk
z99kF8`bB3Lhh++RU#aLBa2jlTHmog-*KbyB8@(hQ)#Qq*CA?XPdo^0ZstwOCZA^VK
zM6{_Yd|D2247+Vu4bhCrMAIrrkD?AO-0F@o>8<(9Bv-E;;75{`@ffVPn8TOmhaH;y
z#r;U9{9-Lhexg75S2cyO%XlO3q~zh!C@H+Xt=p-nXd-{<EaDl`^%xqm%@N^;ktMV8
z=kyGP4#~0<2ZHt)@7B8v9js)C8UyJ6#0|piB?CJwsrTP#RQw7w-aS|ciC<8kPG^x*
zEf%Ntjbhd}FKlv#-4gf)@tlgb<f!tc6-2yEwP&spLL|329o84EHP8E9&Zfn*=3vqy
z&GA=_KF8zcIBExs>S?2JwZVNBFRfLt`Eck6hmWS}FQ*oy<C)Lv%@L@lnfuFW`%=k2
z+rnYi0b&eMl<^26kbKw?6Y;ND9ZZm-u1RI5e4N#*ELV$Ck^C@^u|x<`uE&q_UJxnB
zyZlQGjwdEW$s#ek*&@&cf;P{=g3sDt!5%C5gujdYV9Fh^h7lT2yH6%c8azBOIM){k
zu(FWG5>Cc!WKhI4EM-%y!3@!eze5kU*y8~W;SL$y^v1*(^1lk->YT7-3Eq9c0}`}k
z_fVueZ~(}NUjrT~kw6Dae~{wN{nak5czt>sV@4ojId!W2F0)psK9iD7f-!b$P6wiZ
ziDXpR%jA`x1e@`~YXIL0rrAwsNbW6&KPLs0jcW8KQ$7)t>@5%k1g!6cqQqsV2R^*I
zpqXa$iD>zvZ)hO0#SH|7H$x2+%fWtY@?9??&>o_FW5iFsRHgTtv$m+5i;iNSnRg*9
zpPE6xjAM~#X+?v1820(N8pPel6NYrAeS&bO?W*nXEOrXtouLuQJMDsjAbEXb1Idc)
z=ujK0`UDhj`F_N01^%Z2^P9h}o-rt*6NaFioDWw<Ow@wN=-2eSEVX*`)jD<1AM#Kf
z{hVIU;>@*zv22OU33XOjJ<iR`3cTjeEC%g(Mn46bQ&4(eB12B?4hDt!ios7J6Yl{V
z=To3RwKy<nYrq=uG1sMDLv29d=}ru&-7g;ul~o5tw?~`aTu;D}vXf#`BsWvBw1iu)
zo;2gv!WoNl48ZZ2b2OA3e!<sc;F4?1ZYy;FQR`JD84MTPvAxXxLfkWp)sCh6;+0E-
zCB>dY^H2De*T3+<69FXZHGT)cJaGYzW3WNQ5>7t-vd`&)p(dNW3M~v9^u^YZ{253s
z4bTr))@BO7i{hB??^4e$P^h89>O#$x8vn>tt)sow!s}g<&c@?-MBQ`2wj(FfyK=as
zcG*06NCo(96vdEPhrvaA*Z4_2*cOD_?;6~B_c|s}C&QA@$Y7J#y@`x&V@qqmFt~5w
z=1wlFVvkI}NvCi`MkMUCF_CQ2HVx%Reb(2LVvpPDZ5v1ZNY%HWiYzxbqsAOR5;)(t
zg>FcI<WO%VgLC&x7nViShdJMmM_er=wW;s#)O*R&%;BMos$KD2+}lhtedeLW^-uQt
z%i>ORq4rUq@LBQnRuX+jY({f)zw5uo<6`RFca3yU%ht#(=NUWyRIhJzo4^J`e<)Ce
zI%H?=I4Z+hsJp!uZxF7Uo24|zbfmOoeHX<8Q&J9nkqTmeY40+8J8Wxb@A?_o_+Akk
zO>U6=qTRCq&(n(T(|<BerEQvj%XP$Hm!^v;mr=fj!1+$?$KgQ*#MX1dZF@N7_n>vK
zUwP8kS*E=gU!nQR;y|OveZ=!n8CRxc%>0CgYvoqwt41m~<NveD1pBg3B?d*4k^~El
zLtL7vd>@~GDS{BW>bah-o*!wmWgt%1n=A}1T=p#3cjNr>F3Xg5`F3znfQdhGp}De!
zQRs{;{<3q2oWA<fOzBsvm&V)H(}Q4^Q;PQct9n}aMp5f+Yz|d<L1}X74vW%S4f${*
zICmI7OJ}f(=}-!==`=W{meSy3pH#uyck>`$7cBb@jQtm28X6)}E<KfnInHM{63=6b
zBRX+E(4ip4C)Rzm07W_b(O|wrzCZhQ-%2T4%7f4kYbhd;6a+j(8|;;1TUeCX{fLuN
z#_o)_Y1D5$qd!ePCj$r8M7jKpNjaM+>jL_`x!s%9$Ue$bR$<^0)K%E}6Fa8_Z`Fv1
zAD|ynvT5+Lu%jXK77+0&0@kPm-eC%o@HRMxT*vi(=5vg>Uy<GwQmorE!+ObBV$%Ou
z&fwa(;aDu&<ggm&n0@1yMWi(rbS^4TV!hTlnW^isC3W#-l95gS`{2>kYZt(NSnkKa
zY>dSx*o36Nnj=TN8$!F!oAx>;d;_g`R)#+Fk&o$F1*}-V3{|@lxK4)OD>JH-ADXI)
zR8LO`hYqIsim%@xO`TeWxI3skfj}))K)MbZz8lWt@S6dB*zV;gtxJsx#W})lPY!LE
z@H8Xjdait1U8DRkzlx7x*D2(Vdg{SxI0Y$Y&}JT6aZ16cbBSYC73Ij^r!(kW;U~De
zCTp8mTTAAYY4_n<YAlbsoDg9c6aE=kHxp1Fa&Z#I2F)&Uv%5Sd_2ghNK&W>LKOt-9
zCAld!K3XUprld5^jV1Hdqyvzkp+AdEQJ{^`D>QMXIV12j$~a?P>qDdxh<PiFE6)7J
zax}o&=M7+^GpPDYn+7KKlJZCZii$pRS7il;D{ygbLzW+nz;Na*PjD+=&f0k^5>`QW
zxnj&bbY}S;V*^6~TcA8iY6`6n;P0sakgjW*;N8(sk&EGG$kWNDt>&_hLJ*#4_S)iQ
zEyMweOiRO9y88s=N8<(dxZ>sC=BiH=#I6yCdp$IZg7S*+;2U?j=4b+joNzAo%lDHW
zp=rD?KZ}5xD#KBN+mPn~R`BzWDxdRMK<!VFI3OY45pZbEm604Iao{f~nprATz4Wp(
z+Z5C;I2v#!j$Hoff$nWk#F9t>7;{J|osQ6AA=s~}WR{Tz$y^7Ump=ptiD-x+^(a-3
z(FwEh|I%!-X0za6Fqc=%+RW{ZHmF_T%-OBSP7{u3v_>lSAAbh~dh!h;&fSr@EXX2n
zcWWA&v*HW^gm#3ko%hBXBEQcUcjmYdk1DZ)%l9oC%h2G!EK4#2kH!eNN;kzodpeT1
zZw&}^bnJd1bjP2{3mjj<S#{?RLt-lW>Hp<E*j=+ko^lHUe~BKrU_I78&a%#Kyxmx(
z8-3YMf@`>i`n(4;2O}jKFv{}4?yFC=J+O&i3y0J4UwkN!j)Wc4WinUPc%mkGtHHgG
zGPViLK3H5NOr9!<a%8VWF{UK3>M;*nV<!*L??YBm7KrG@TdP%!55(_OYz(dIN|XNN
zG#fk2+OW>v8H#8yewTGIuA)4<lm0$Nus6AU>%<syq)0yZEVWE!*xhx@ZAaxLLq2h^
zO9u^L`!Gi{dR8Dfq)#K18II+uggU{Wvoc@pKJ_gBFkwTYpHjK(<HKg<Zi3>aE}P}9
z5NFnRk}p-mr{*`DYg|5(*+vJLq31&8%isV$+hvLQ?}g{%s+gxy<LbLPiZsp{u>_Xy
zvmYj=|I)i#+z4YvbdwA_I~O2-{++VGP|VcNV!$jI_x$QW$%sK-7gGUgEMIia9_c$^
z;hE<!uuy8yD&$^g`p54=+Px16a!5bcgJ5r6t(Z*oPX>|6K{wU@(6^AYx(n@MGF-Oz
z<6-OeSIkuFO4B2vQt4t~5ogO3AIbJNYgCsOglxro%7#ipc~9(l>fOxebUFbjfYSH}
z?^lz&L=9b;rL1_g*m>91HWf?ab?DMpVy@58;~ppPn`fFHI8d`ahnX?_zmj}fy4p(`
zo|4zhBiVPgZwn5r(V6#)=n-!4?c|@n3bX%9tupBJHFQ4PO+Aa}?v+hNp}OAD&lxcl
znt>`Nc)-H>Sr0^rBvXo><@r;+oK3xjx+KSPWnE6Zl`<*qrfuN>cEkaX>ZK|&>XiN!
zR&i=+i~v`x3<G)2*V=M)Tz=_wc#yfG0z*NA3GqLK?2-I51b9>^xZ=?K!!EV>nz1e8
zrgg0EG|^NtCFUd-V+cDf_)MA!uhIS^iYHmAJ~nwNeT0|zZ2V=2dPGd&^v!>zoKJ?P
zOvBEnNzeY@CprF=jrRv5`x-|DNTcp}fVATtI!1`DkT-2-a2D&~?LKd&d1{-}zpRJP
zGe~Z$kke^;{I#m(e)5mBb)F%D_V271q(yG)+E?mCE9$&owgw~yNGZ#!8ksb-zFnI@
z=E+4I|A?5(*`*Uon3}qeinI?BjNWK;QYSN}==VsYMvD<4?i94AR)(;)t9y@rOdNk$
z4Z*^~Y(Jgx7;U_snxIwM3s{*ZlByRFXWkWhuk5-Xbdx`|@1&&GdF@}a>KGI7A8m->
z_?=C#pfK?0^2PYz^+0oKs+AU5w%isWYxSbpS2eYBG7#NG`>^rNNR>)PccH}QrzzU%
zjE<5$m3dFQb+6z|VL%{oULJtOYzz}!bO5@^elj!&8>RxfQFz4!hM0^q9-7M<VPD3B
zo?Bv(p3cT17M~|cEEdJ**C1{^qCuEE^#pj<)8qqSrJIW_yoIr$m1uf-5O$q~tiPE>
zrZB``X86viSBwbzPt;(G**74tgkOJv`9usz|H=EBCH+cvC(VN2t-=@^(<f?ekiQ`{
zFiVaUzH^-l;?5-MgS~|Uz<y7t#1OIe0prz&eel=<99c-2&9NU(PQY!smFBR!n`Th9
zzwyl_C)c*E(sPe#^o??T>j><`3lqSHXc@8a+l8=uXQQ4VUv7Gliv{r%ZWkl4<_hF5
zHe-c8-B<$&ehLl2ZXV=7&ksz#*(hP@$XXefJd2UWvO1QV8t55g8C}<Whe1Q__$Z1`
zE%>Z5j1X|`k<rN<4&SQNyE{7tU3|^U_G(S9LOw@U!eGEgJr$6jUT7$QZaNoq*j1Mh
z`1B+Ss`_)%Ke2Pjb2Y^Z%s%l0{mDs#KHTdX&nD4z?K2x=>t1~b-c3ua-f7Zin_1|5
znmJ*8NQW;WADug?149_%_tYJPvm)Pq%MRe1%j%~{L=N(F!4R?Mq+Zy~p(ThRTHeq=
z@7NLO$2EwEW0PTB)x6UO<vef7f!dePw^@E0TapFuZfHZ=e4LGqtITiyO`Su;R(!Jp
z?@d_&p%*_udF`7lzoEv&aON46@`@Gb`-5xCecs>)(Juh$u<1RrJM;T`2&e&X)EH)5
zVz}Ck-4WQ{mJ%y+5nK)Oc)t%12;&~)M(hw~iu?>jQg~HB`mLgM+=vKuZRP`_AF%Qy
zQrRmo=Z8M$R*L%eJ^Z_%?S?K%fqeCB!Pa<zNV8&_6phM8p1-9-ZOph|n3J@ZUbr+^
z%V`u?>Dz6T|MpyZr^{d84hamL>m#@t8MU`$u&G;ciizO;AHsS4-^aRVSS{VEFH2(>
zUQD>Ee6gx(?dqwL@17uM4BDS})+xRUEymh<5a9C4-7A0q-w@~DslP2gU%r?OL`^9$
z61TWy{!?XFo}Y{<?Y^~9A8imyrdYv|y&_8Blhw>JSWab%Xp*xxXT#}H)|H?AMmqkk
zUNbdgcJb!VPs7#``1xwW#53a$zQr%Mx%DaB{JhHP7f5!G$0@M%T^E_0ZVF|oG6!w6
z%h%-m8ka=syHtMBTz%<{N%3S;gszliGv4bSN{OiMq`PO5@nDzj#EuyWkl?*sfZk3C
z-|=^Lf=?sE`0ml4Z@%5h@78#N4PaHO)6v!s$=PoNcp4!=EL2Ekg>84hP#Q8`s4}J>
zl;~X%h?h}5_Hx{PNRxiTV0HA@LOe{kjc?A>;!3tn>S;Ll5$}t-ze&W|;g6~;0kqHw
zqhb=^(^KL5H(0v_+6;5r=>F=5<d3#Z^}BH==V}-7gc#rQX3OLI2o2{lcV?<ApSuWt
z6kTkj64Kj|{9cRPCa)E1z1(dfksS(GC@#M3F4joN5NLh&8XZz1H97w@G<BrgYT4#f
zZqk=3uPXrG@zS1*b0$n@h>4(dZjlmg&}En;r)0pdn%^<MKdY{0cT7GqgN{?5nU;2n
zMf3EUY1tqdT2-m9pXrsmZ#XHu{<Ut^eQC3&7ZdpP<%o08kb$P;{UC4_p<$b3%tkCd
z^Fz*vV-r^?@%YdyU91sBXbko=C7cc}X6~WnGvHYeonZM{RV%Tb*Q$`x8%NHIf*WYn
zc|linldY6wQn~eJJHX9~;%iZ{N!0cBkbJW7J8M%G0y50>5Ff&9>`=e8Jr(|<cZ1}o
zJf&(X#InK}Y2Etl1bXw9H0nA*eyk~Y3?zS}+-x3$n$jBmFKv#GCa{}!4s+h0G4lV(
z*Ac##1~6%!JIBj=N<_>04AE<?AvDR6yoNBK*@8lJJACn<E|@Xe3_aPMkzo>a%3I&V
zfFJStK<B>ZSk)p;cWS@X+fxTY*|gF#Xg`zT#6@uPAUiSV%Hqcx{-3}RsXJ=ek^^qz
z*0C6jd47Vg5G}AdoPj9G_29xn3#{*QR`f9-PzKUr#1=o6(=x&Q!Lcd!C0PdrYdy;<
z&u?TAP%ncZ<!&rKZG_GCOu@UAf}lM4kD%z4N@SED@;b_z@+5TNeC&E*0X&P|6u&mL
z9VQ*SRwRl6_T%O3Azq==^dQL^fwiQW-2lIGt@Qom9#98?v$H#3Z#AO8v$?h;0KR6O
zp72l8uw6Gcz`3X|qz&a@E`6MWfY5UP&Iq$<oqkqQ;_ws|0vuno1La{zvW-2r1O~p|
z)&+l$^GaSL=iea#f}E2eV#|*+(>o_QA_qN2m+!2x=O)Ls=vP#H<+gH+z#6~FmJ*kf
zuK9(%LL<pmFj{wv`@BJM2gD9NUcWDB+~Q~$4Q}{PcrJKidPgXLATaW3rYrd{!c-ux
zAA`ww)zdFr#AOQ&=qBS8fil{u0DW*`R%Lr|$<t^z1wFT2g6Pwjjc%@+i;4#Ak*3*$
zZs7{1AhD^h<$CE>?eEm^l5pnwmtlxUx6qtDfwTzzeXpa7WO5cL7TC6)e{?Rpckc+<
z<XV?shML1cd0+A2qj52hpOL=S^dVSF4;8Qnkax+{h-7ps{(``Tl9zW;f}r*~jmKLa
zK4YPGNTKml_!{wmx6|_dWSMy%QiX8%G5Y4dqU3&hLf^W5r{(-|-~M2wmroCyY@P@$
zB;UnCzbDuEDh>jyBe{740OTz|e%r)h6Se5GKyyDqG#Y8ZlV<Frtl?zo9RY^04G_pe
z5)IcAr*pb5j%~IUCx2L;u9*8jv`4K7D6=T5fQpqZik=BZ;0xH5Es;d3cUjRhe#jZf
zeyy)@|59+Z-S*jY9G6aH`E9j%6mqw+Zj%T;`(u%VDzxLL)`jt^QzZvi|9-O*WATK=
z!6Ea==w8IHsy|dWtMk6Y&&)168U;;Na>3TewsulRpc4FurkQ&u+BOnjCfrc*tF?g~
z@W(y6QR9IOF#eHNy2xVQy7Xt!;rkYHx4zq2bJ1r<6J_0|pxlak&%N4>>XcSTr|b@4
ziQ}E)0tvs~2^sJCuen7z8M$^rF=e(nkclJ21Lf9J>cViPV!o&n^7Mh#9-aPMlU7sZ
zLMykuR)TV+s4Ze(d&d3FYL46&9XUP@s|WiU>TIR%Ya63RIOER8>yQuAU=5;XWgnW2
zs^djH0aGk-mk=Z^?sj@hR`h@9dgtg!+wM#FNhh78lZrbwJGPxp#kQSvY`bH1Y}>YN
z+qP|g)zAC>W_>fW=FeKCRd?O{I{RSnGm@~(=mSP}Tkp%;t9Mi??{IRfifPJ=tK%W_
zcG(_JsikT#O7xtwdXREi9j9N!V{}9k5WnCMBkim@P2<+8*gJ@z7UM89!`UoLjkwe^
ztl47Nr(37+w^<?}r6Y~+xNV*|M#R}2p2CGSxy<<-LiLjithY&44u0m>^dT$*QoyFU
zm+51{sRRkb47V!CoAagFddZp;8Rm&*@%P4Yu~V#rUH*1_Z@pd6fM*shuGhyplHa?n
z2HzlmY7)m`zBX{MXx22?>{(B}9C+t}FV|av<9p~;tH$~Xi>c#|hI=pD*jlBpxydO8
zQfHDR5ou2cYkCYf?2my;?jL|dE#N1o<%@ROWd2D96h2_r73Hgboz;tGVK3AfIy_!z
zP~|)};=s&MvuC{Q74y%ZxV!rHyR%fssL#Is*##eZC-Y4a3G(et7Rz*p$YR&GVtk9L
zrnHf@re7X$57MCZ@}I_N<d@{*GUOunMOFy4A|0p$3{#}3db{$QQU>Dn^#|_j+Vkk{
zQ7`^*l4LSlb2^Ob{Z>5hnE=7A9_5&b1bMW3YK|l~>aZGXnnyn0OH9&n?PB?h0wP@s
zs5RaUcDP>k)wNBu>l?Gp7BhjlWPJa&Wywes)CDtqJ#i;ZXI<O=zCrfnU5)VFg24=0
zE6iK!-7Csvk24y;Sp<&<`klrx%oFLHXoX)2ew$DlUoFrZsZcyH(~44=Dq+MKu0el(
z@`P(3C)0{`QX7t^K2E;adwbbM#&G#EwYBipolLMUk*1t9%DNb`p~=}_D4X#y#kH8E
z!-2~-haL0L9i2g4Gk@pb87X2p@D1N)k-e&IWB;j(kQUqe8dIsf(FispV??b=UUE$r
z4sQhKDO{o1{S+!Wy^N{O<nj0Jo*Q&Wk76uW@rTFh&(9z4XXjkE3>Ew7k!gHJ)0ph|
zbzN&0334G$UtnAJ>>3rj`N!_;2TdQ6GK}5j_<bP1z||E%w$Ws~f&>ueA0qgW+)rwd
zFRL+NehS^c1^(!F5r9PdS5hM;46Yh5?iyK(<dA5;I3Vc^4P;Aa)q4rl^)QpX*#H&+
zWlmTwWZ00ncZ!7=P4K;2*NTt_`8eQ<`Irt%^1b&W-h=m{KDI;9^g0+|1FtN85iU1(
zj%&z}fTU7TMCypL?JNC}{e`=|JPvT?ag5RN!#ErBsIs?=dssfq)Gindp_m8L#cNYv
z7q1K*{I-4i(4$&U*K35HeiKPxwLX@nBTAXJDjDMnK!t&UfC^x5m;#)ckYUy>)@jjK
zT@ifw@G|PUHGz0bA6Ys*dH40jP%Oht`V4JQ_yG&aUB?Mo&sl%(UQzCysjMO@yX`g2
zKjOt9ZW9Fwr<o73|CNB`)(AJ!-S;y>e0QM+2Okv!G(GSo`o!xPV7Np>LvllXgY1Ex
z>e3G|gP*}uLi95U2lX*P&1q|NhTsE`a{|?`!7RuVOv0R2YtTk9RR))?_`hQje^WYj
z9q`Ol)SM+fet|P96Lm8-G>f^V7&U6#y0dTob!}$P5FY=?_Zy+!G7y+@$_^Ih92(^4
zVMExPP_$d+1{OSDLm?{IYtilDqi=|S79Y+#*3EnDAY-Q}kEpNX)ldn2y^o##2_2!t
z1L028k5>D@*$?R;M%zzlKvx{tQFY0F#1)-wTDKv|m2>9?!1+;}j`wFmE^k7fo}K0?
zO*;k2`|4|UrQ)=WB%lSi5C1`pD14>arMZ3Iq2XQA8_rMRCFe3G?O-v+IH;{3@0lpD
zz-6-`u0*K3JYh+GsC+Rq`-;M$4%aZXf^<I3a8f}v`Ba***M8boF)Zxh<jL-8DV7h8
zDr0s`Y<oKauAOs6j#2x<W;abV-6%p>BvrYk(&BC2lhZPm+|nPP>YEG!<KT7I(XiE(
zt_Oa|*Id4q5R-aO?-4F?I?<L}R2EsNk?(}z<?@E!;<Z5Q>B6}`0Y!19Gr-`g{y1*!
zd~6FaqSp}SkXLcNCVApr`fQ@d%I#U$oWKGamx+YgQX{&;t=HKO&X;@-@?K%ooE$x7
zkh|~Z1{fSJcfv-CqXu&&rhq6ZPiEUcowYc12XiJ`B&l-rh|9T<yb6;u3RoHD_qd%I
z4y1rRkYnaz959qy*Z>L3?GJB|h=}V-&Foi57gB)KV-9Zk+*0VL_*cjcQg==b&&QFV
zTvR?R@82*h^j7OMOX}j0h1iGJuhh%=O9$9y1B*+XY1HeE;*DZO5`vxVl?n3h^W#?w
zV=fa<-m!Ha>dC7_Ir_BPZJFcs(ckdJD0$g3W)%Pd)#MFvITg9BPJ`}h<=$@ZD=y9K
z5z?2d@Rz%@eM20A2RWN(5xLBm(eiJdBh=11?2;bE#_PLN5~>TQFT-iITmd*_Xw+zu
zEu;MybKrssv!iFUy~Vl3vEst)18J2th`d$@$LH`U8wEcXVd?dYDE#KQo#UkC$VsxC
zu@#!44bwM@RHT4`7Hfo-F&8;E4tZH2ZwMb>vDlNCb-@K^+n+fgODYO#1>Bh)m|AgA
z=OkkEju-2G*@+|@SIhSNI&e~^Z;H;Z?PNT|v~oR$ig=~+JaXW?AupqXkm0s0JY3@;
zeUknT3)#sG2N10Nb{XEDkEPD|bdKY4h3w6;=pVdtTD`;Dy$PN^$6#e`4mZM*?*sTD
zse>Djg-xeLqqX<k)jB8CVffU3GCUXt1*)-#_(R%hcAQu;To_x;I-5S6J+9VwlK1AD
zKOG5jp-Bugp4W`|IM4i#7r@<XaED%j(PrfJ>;fzEeT0hf;!r+iOo~*`?u*_J*Fa1?
z-G4{|{SU@_+5fm+b~sZ{|9mZ?N8WsYaqGDP?90;8;QUNVgvkSk`8(IxWu6sgqzfhr
z(V7<7`?`D{W2MKeLLY1gmm3i9D6qzqbl($L0_Y%r+;wHfai$~A&?_@`S&(6PsHqXG
zQeC?jfxCk5mRP1Me2X*G<vw==`Mg*eoFxk4&p=&Y)OzT5(eq%jYQ_o4^t0xv^X%Y$
z8lLI7GA>pSncVN<SfDx)K_2Ym<lawKq#ImKWXZKRd=NS<V0g;7J%*zTZ)C80X+%GI
z-S!0T*3JBTSO(t05e3CT#=sZX^8ArrV8W@l$5h?sNEcw%MFa<*K=Snw0{(-yIKWs{
zW5Es%r^gG9zxNCP1Uu<=5%K;M4Z@(89S21>rUWn<Ed|VuY|~a~B&ivIg;Xm@ZQ<Du
z4Uj^})u3&P;|GT-hjdC3YLI;{d<@WLzeM@g>vBIPIJpskT4$v<@)N^Xu{G45lF_5N
z@C-Hi>#2H&QJGF}hCzLa_M%x18%zS1MuzG`&y}r1cU9jr&3N&wQ6H$-VG^>w11S|M
zDOo}SCMc9z_A}+k{psx@(jjBmi3j(bB|s~FCG=*sUqQYW4ERMHZQb}mNE<R;f)BAH
z%BDFmRwOl|A!yPnc>Z9nRUNwRhjPfHBe;7eapuzCd8h0s<7<uU_Qruq;sxj022vpX
z;p%GP+Is2pKKHP`7G)Yyi_yU7!GfkmGZU!K#I(^U(<*bMCR2o|mw;pB4cEIKB~j=^
zUloy0uOwB}bt|i9oZ4E@e}LC<46^ziWnJRz*LRZ}&^a8Ci!T^KvaE`)fw7a4qhqqj
zqQ1+=8Ha6WSd$qXn5Y|6Xz|gjn?Mw!WJRCMl&mJ)%QTSwehd#AG?riDch~3^m`y?E
zeN>fer%oa9a8LpWhu0ZEkuqL$NFe2u_<SqT2t-09x1Zm2`LII7*Se?ZNnP7XRDOe^
zx|#nC_TzHqg3WPA;Vd#xv(Tk<Gq<0ihT@o@lLF4e%6Qcn6%8@?vw5pUc5bRJ`eC@#
z*zeV0Du+t9s4$%(0q?OZ;BvTg@6*-J@N{6ML?qQ4&w8a5<|As;<GCS*TkiXmGYdd_
zPdajJoCluF>~h?h{%yXn_o@9@KEGx}Nzmf9j8egJcVEFWM$}GKEFu8@T*$+VNIJ&c
zxEA2<g7@4t1VwLq#_4=~hAh_mTin@&{1EK{;(hhik%AjD*{UJ;VZZcT{FG$H{W<S-
zQI(asJXihn(7ifJI>*rG8^a^+^*ciwGVkUci?-Dc{bB=~*bpAEuqZ3{xpec}g(<^+
zGcBbWOQ&l428r^y0I!&%lZ6Y~zmYvC5^|+fF?J{TMRFt_7l|xK#%R&U&+cu_j75i6
zWvdY3HK|vnPPcakVO&x~@!}$x8wPg_*QvWNQJBT^UMwzsy&Rx%Ff13aHYtwG0p4oB
z&5Sf3a=+h?QY9Pkf$!-J*?})v{h>-0!yEk6vT!zqfnSyXKm{wquAl!71fnsvG;gV_
zKF-BLOm1Q5HS1(&R%XmMs(n;2F?&5toVLTfkJ`BwHWlI6$xW`3&lJQ7^XxIebcT)E
zB$Iu}I{s=c>Uy3pa&!t0pG08yfs8PO1IQcPMKVt-UGQxt`&}OR9GrxkfpNpH+-7tj
zzA5ZQx@q&lY}PXyzojPbcpA{gB@VGpB^4*u!=Fd2$zD86S<9fUTkwJPg1IYHhRnX-
zCRKje9&9FDJ&|!cQDox(S6qYmS23LB<GIM+4B7$s;t3fosfVbQh54%OEel%E@s~3F
zEvM_kU8RZhL&<_?98Q^k_dAOH3CVpnX4F-COb)4OAVPi}qHJNH8zAnFJetlJTpi|f
zyhHkr1tAeyAx3aGf$cNFkH6o-4_=<{Bi}e%wXIsOUS74tC}xq?Uj4_!$JM#3^>Uj$
z5bqyk`TSMQR#WI{@4ayEPb4y;s{GnA^l$wdY1b80)0bi{oIaQX%5|yLaV*fOyc`(v
z!P*DZD5<UJrdpNDx)l)(MFH+fqHk-Ka(?AHpTlIC@~CJyNzB;Db(3}LJjS2?yNh8o
zD2uJaC9z^;$7mQ^&C|*<0v+Nt6hzt=XFmmu;ld8x)BX-S{2j*MhJ3LY*df9zs&KEn
zaB>l~S-gK6R?JZ_U2A7kr=GHT&V3~~Gl>*C4%xf~-kz^qN_Y7JUT2K$@q_e2d9^di
zQ*H5JoC}ED<UB64eV;BRw3T9SLl!1h#LY#X_cN0lM90$v?xH?qdA~UE1W-`Fy)CH@
zIWlWSXBN}qJu9Ti5S2uSg=#w|xy)u#QlW_9^1dks(TII-_?eMP6r;Iw`Vz<5%$Ux;
zIf`V3k#5}n9SI=4jy(f9hvc6xSg+VCp@tm5dR5}{e!^&QwR%y)9cOlR)wTC2uUY+;
z7D=L2_)5uhR-8MmVPi3&tyutH4i}H)sqpT3;82HPJUuyANkZ1u_w__S;HyGO>xoYk
z9B2gr_ROdI-r4K6>C7K2dbbm4uanz3{taah<5C@*+f;2}?|t4G*8rO#RJ7+@SL(?G
z)ei(KFH#n(*M@UvJprE=iD0-(1m<!j*oq}=!Dt#Q$3!J~NKCifwJi8GB^SIuv@zaZ
zS9Fhko1fLj3lS~UoG0IMtV6sFQ;$FFzf+;7YD4N|XlPO|%7lq6SNI9N777Ngb|%QP
z?nj=*hJ#t&r|))#Kkmu84wom6_4GAyf?Op9O!$ry?a8uvo~oa0+@`S}Y#4_!h~9F~
zvRED@Kt$`dDK+BJa%p}FK5tjl-vRm@Xv{X_<&3;h*wV>CAzfrDjF#UTwRh>a%{i*S
zv*FNafw7=DQY%tyxU6aFfuAJhry@ufLayel>`EzT)rIhvT*xkT=~9;;Q={l~>n|$-
zPWBHkqcQ<rG*G5;aJd4R-c9`{*AI@!?>4W$UkG2EQFo4WhQF?TsL(FWcp|7!fyQL~
zE^X}}hAwy8W6!-RjCbB+Dkkd^{WhaNf<CMPXzWb}P#I^)izuwCVIx1bp6<5(oHBa)
zE%uSBa|}k4bwgHC9T5jyB0c|k^)v8%OCK2jtw+8tkvme|C33u$ov4e`|Iu)Cx4d=Q
z`pesZ2q4<8W2_Sk>nHy>X^PUlSz&zeVM{*{wr}3-O}7)!cA3b>g!C?UyWHn$WI^8G
zwxi$sK40VO7_+}I;t5#Px*S<k$LSmw)dy<o>IEY;uz4i{6$9RF5}N;0U5+mI!PBqe
zo!{bGUDskaBAaeJ@2vU64TMsB4mx&*GelVSEXKcC4b?Rfm(FrhFgC|B%`{gkn7|DP
zm?u(juxT^8x8ds1h5Ev|T@Xv3fxXND{+`(XVtNhn^xyH`f6p<{2~+4}(NEyE;{`e0
z>BgW7wd3UGV$n|gmE}Luc2M&@y38kz_<0B6z`RqHF}SS?o=nt{eft^2it-|SLf8ho
z5_}0EdB67na`1e&gZv=CGpr!|zWim3pf`gpb|GJfsFH7gQ&3zzCj|<*6>C4>NlQ$b
zq|Cp8Wc2+qbRs<FirFtF7}nu#l%DO+{V3ucjPAv5mW(wlpoIzgq7ldCf#U%|y$r)d
z^8<z#LNya{qmgp77QF?(WIev_*9kgj!1M}h6i4(s_LTWL3JC(eF`5N7ggz0^;O3KL
z*13JYvE0gad)uKbMiK6J;eUj}$=vk)lc;)O0Vb#^ard2ud2&n9;P`g?bU$W$Nx#vh
zb$u<Yan14vSF&pHLE)rAY}h5vw_ZfgjCR?dEb23^`CiZ4%&uGb&SaI3&C^NYyL}7|
zM`pv29Nr3-C7o1jmCfdNJ(zL~NO0KSwCQxn4t`domE568+$n%O<U$N4lf+n+`p`vH
zj6+AGJA1#wdm!nRtE$yiAvME1?wxJw5z(hl*nu|L%?XVLNJ8z<a3Zjsm?~)1ki3MN
z>49Vtj+ryUDkXXt;ylICt`UJ>M8Xv2kE0~))jF*{!=SnRJ{L>GOPG^>vnJEO7=c!K
zM}gWNfqHpogz|_^Hp#R=q!<!s;fTCX@Z*%8YLid2W^Z(!gh(fexlCoclm&ji7Q<B|
z4tmwXT**Z-sZrx0oaEiD?ag;3Ug+zvx%NABzYrUhhgEqWH%XOQsazh1CBuQ42`5=X
zfJr9v7N&YasS>#I&Z8woxg0(py-s#qfI>P04b8gSO)=+bJ2fUpBoa+A%U;aE501Lo
zob@iJ5>nDt;Rg~d%ZjL@!D5B+V>MuAOR@?N3jmV6u0xjxAj~=1vHV_um=4@zQQ2x1
zZ(RNosa9)>sV?KiC&#PKgrrkOw_g+yezi-AeVEroTSHazv%y~(@bfb+0`749=TSs@
zGL%;Z&w3goTT6c-MDG{)^Xe60ji?R&$Yyi^_AliyN~v$fu^qq&f4r6E%H!^D`6-#(
zH0)0qn3OAOE-&-=N#Z_Vb4q~oO;2u|qwr*EJyS{raz)W^OUQcl;}&RZUKivm@a=u_
z!HfmJ)_XSH)wVS;|5b`ZV6o#c(+9V-Ohg<F7fbBH1W=_Qnu1}|TJB$2%wxZmvwGSL
zsx0}rvsgBNd52JEzlu*bVLLu8$r5;(3isD+FJ?hPNIUTIS2DBeCcbik0xvLSLDoZw
zD~rJMZtHT+evf$K*#IBT`H^lSH?(by4wL*)66=R|4|k(C99SkZEanoKv#IovvmrDU
zmeL|uZw@;3IGeT65rACQ9oe#fME!w7Hinx;86)$YAfI^r<wHc}QH1d`WUouhjHVWH
zb+*qbPOL*+Gm;9TPmR~iig*_E-N!2XwRe=K*7sI@J9B)4#1}?#ojV~nE>Gz&=WX|C
zN_D+N!fBttDcML&|G$(7=Q~!Bz4@LzCP)0ufV4@_Xz~ooIkc$AhsbAog%_huPwP_$
zE8BEEA+@T*tn}8`Fw;2!2Y&B@^mFP~{s)Yg7L1&f&M2!z1BmOt9P6PBjQSR`Dyt(+
z0B$ZnYB~x)^zXMoVzK*S5VX2`-_keL3BDk^N$F?d((3CP7em7zOTb9;{@ZL&1g8)=
za>TbwOqT>$@DO1_w<GuqUWogm!2jpal7^d`B5LIq*@>aAQZ*{Eql6+XH-$LBQkPuC
z<&GBnjzUA*qTz$8U#ku_apNmhr+J=$D(3$y8V1t^F=-}pLIp{=^sQ9M=)n`S+)Ude
z9%-^5N%igsVp*X!+q_{Ed7Q!GYNS8#YD<5Vg=}VzX$OIHh~;UtA7ow=c0v5gAz9Uo
zL~K12F>FDy9KJGI5n5DvB<U%8$=End-Pob)@;#aUW)qe4Wt{<-*o<-{&)2%ofL>Du
z1?HZ7tFFzd7ZOmXOoL3%JnpH&OOF$nTU%<;EJ>b0Z}8DKf0647be41x+z5}@qg!jf
zV-htEVa6Cdlb1Y8YqA(bDfohCcBVt7(~TB4{Y^#&8P97@S@O5?@M7V+0jtcIItT7C
zGndI*zn|XklfapIg(YHRghd^<eE9V!IO$lDX{hWGVY;0#%>0(M>Y$7&Xky8ND41`d
zxg;TAN)iq{<7{D3sTEv-*jC10g}>YY+PF+WM)D9pMG#wCaZI2za$t{T5)NB+ayP%f
z?~osDUbu9;6u1cC<Vsslq6nccyBB;KVcd{@T;bHkar<pO&>xSisj2E0QD-SH@P|GC
zDz~H2&S8&j9&My=5oUCcmfwghiIyX2;dK^ZVOdPNzqV!NHqS`_;%B<I6dO~MNmDgS
z&iA<^3$Fl@oj33*!NK>);WUw^LlPLv<Gsbf=%-Y6jv|=brB7=G9K~{e3Zg3mdFCd5
zs)cW9hOJu4>>is(kMnIH?*U+ZN#83Q(*D%*Ma=P4l!f%H@d;4geKJVLeAIG*Z5=ak
zH66YnU5XKSSu|-_|Akl_E<?&_Ph!O22jKLMtX-a1Lk>(T3|&G#J@hss_=Ya>-^>OC
zofoX3>0?f*+)v+tFSv-Dl;|4!w2S$I(P&21JS8?$TB;_{GI5_mPW5SY8L1cy?D@XQ
ztwZ=kaxN2b=71Fim9mYWNd~;mbUSV(V+-A<4tqprcq&oLb&OV#Ovk>*Y&$PZAc;Q9
z2_Xuj7?z$wUM$UHZtW4w3$bCWmddO<41jvQ)ZjDp1FQMUpFUEePZssP@hwxxg)aGC
zk=tjJYqPKw)nAjdo{6C+oYb*=Ia^Pa61|v>AaZkhEM)+8Ghx3o5^->R|AS$JsJ1wI
zm)N*-2L9c)+naHz%Skm`p8m;BOQ^&2U17-Pxu!CBK?7OGyI$)!6w0siM5-Mativ%(
z1R(#5T<{7X&c+3hh#hvyu74Unn#O&F+&wRIT_n5;-AsEc@!6rZ^Y+WRQttEhXJkjm
z<`O1SGu^Yd$aF2aHT7kb9Uo6lmF_rb_YhMW%D-vwYl#n_n;?ZHwz{(Z>pu9u{s|zV
zc0QJDijS2v1Pu-D5Y&G}y<5}#Gu`%--O2wE`TpfaSw7-wUv;ylyhw2&;dw@!nk!=2
zsxKHc5S<v_&%d>N{fZpiLDZ4OK`TJE<kusT8hfZ8c3C7~q(6MH?_aE(C|D<Ltsuc3
z^E;u)jz{$=S#a-~RqOe<J=~J5b`0NE(`6;4p}0DPCZ3z>D`ceF4U8if0)Me#@=l6(
zojuIgbRO#Jx7OPY<;kF6-UNhc@Pr~+Uj0wKVAkHqVfpwb(2)Dwa#dL^@UsxjWMk&E
z;;s2a<{9H3XC*6{z=Cu1yy6FiZByF9bVHfC5R>6KIICepBCR`+>H&Sxl(PRo*k5mp
z>XlP6Yo>X2klD(h*`1=+9<~-23(qXNwqKH_$s?!pKIgC-4JDbDF}m6P*pD~9G*(Eo
zx)2rvZS`rk+{v*)8o3<Ns8wceFie<zDOAho#}Mq&_+r=MUi=MKuukbF=F_zTkoYcr
z@U1ziaje5=;u0wWEppryay$OlukEmQ8$J=91lCVzpFN)`ek`Nq2V|<!{$;kzR`efJ
zGCFIg@^2eZDbh@f%lNPlfR@><=HgqQ!}Xw+?kr=7ciX-8?Sedn_cs`gT{AfDzR3!k
zRmZpct|a1_jfaNi62Cu_*0ew3BYY`j^pCNsrz=F9(pSkE2>i2)(~b3(h;&UTyG8kz
zPjJ|99@Ru#8)7(pWX)?>gntXnc#h|rV@#zg;K)pEqYNPSa4!mxQkk*NpN9X67o*N7
zYc1kVXXhLG_BXS+n(oqFkLL{QTdZAjnDr}oeXgGvr**jcUCfrOv~q|~U0qcXd}qBC
zqSU7UPP08ao;hbkdZbd!1*O9Xr$L#D5LS?jpCS$ySm55~uju;rl#Sxjy+0Y2iHm>y
zV;TV>bm$jW3Rw}4U}oU}NOsp!8nliQ7$v2I8Gav@yKjDWIJ*J#whntseG)+Og3>VS
z{Q0cfqkL~xy-jm&9Kr{a!|43lce^?GQ|A4tPs2{ngfbDWy-;o>VDzEbn=BTRRA&q_
z8@JG*-!?{S@(jh+tN6x$#-8*t*w*d1Q6ihp-s`wtVJcHTci@u@dr@NX&)e--aJQus
z@80gvLaCbvx@9BXYUlU9{RXVwTmKA6T*X1L>-mLLT*dI)QdXT2rR!yocIhv=i@;pi
z>m_bcGWq+q>gahtm0pZYc64*F<T%`3SDri~Yx~PDYDSQF$}6zAUm)r-f3<^l*Ifa9
z>HkwK`A_NPuLS?Uxs4Abd={iU{8qJ};D1mILM>8}kC9Wy6Te>0(1P4K?c%0JCjF6%
zL|q?kT_$y}zi&yK>uZx11TyfobrcV){x?r<u=X=~XJg&G(kv@jWU3J9i>E5Wl<o{a
zju!cAGOT1=dCoR3gky`O=g>YXx7QA*zG-_!4NsU}Qjt?rMMO~;bSqp!QR|2jZuEvm
z2~uV-2MnGvP_COwD8zi2`4CmdosTT%?9=#CVyl#L;$bs2HJ$XE-X<dwqAWv5j($C(
zZSl%N%v^ng4yO9)%u3+hX1_9(;NycipQyQpr6s0Cd3`xtvE^6L3q__$V0=L?i9>ib
zF+9zm#?n^MdmFG2(1G-q&+AOa8;|K6Op`p1YxWb|hv^K=@j?$_hRsR&!(q(R#4Oe|
ze5iW-(hf-x&p?!MzM74JF1e8ym5s_M;G}KK30*(be2=HjLq6ho-fHIC%+Ei>Gw)~a
zdXQQ0l(w3wtwq|G%D_&|r<e~VIHI**L^GC*B|wq|=}@t!lS1@}xmji8bsE0B%N{=m
z{2#ca)N)FtnLDp*jGZZ#;{cjgsl_DA+mH(xFLf`rup^DQ@{Z~uX-rndKl|$+-(JLR
zxlV}jvYd`m$4gRvTznB=$Jnl0WqMOiT}uOHt#Uk;1@m1@q4@_=L!cF2ZE>3==X0Y>
zt`YodC{!qb886yhWhY_I^{T3XmEO90TxHV|lIidM$GBJn7m9;HT?&kq2NjGtOi%G)
z6%)iMBbBinSl|n8fW|>fGS38WcnYs7p8h+<CXY{%e;{`1GI!e^l7_YH&uO{lk2mZe
z){vXm4fe+*=wg6}sdR4_VV*7>5o^AJG+wBKyHgpdce{!t!Q%bIJCCAl$AyWsgP&5^
z+9W1D&qxX^1zHgAUmX_L=!jzsb8sp|$g!?`hYD(xb5n1J5n<0So|**_1F%0)M(EzN
z4{{g!j*!-nb*m7BP_z3j{Abs1gPl_OLW5?huk(FQ&;wr_O#_Qy_xiyk*jqIHbZ&iy
zpMTXv^lw1bx?FmDGk-Y0aW3JYNFu}%^c1fUZ4=1S^UX1c7o>a(5bzW(S~tb{0`KGF
z$6tUG!a*1egKpLJjYCmVE;NPcif>H+21=^;GtQLdb<rJ(G5OtFBCC`_^7r#psB{z=
zzf^(tuD7CP0bA9la<y1TU<*>QLS@gpFxPp>k|NAkK17PTe>i$N|0xrp<K-c-ZmMJY
zIkvso_e?K5q3STNS=q+>q8_oqB%GOB>HjM0iXN`v`}!jnZGV5~BYytik~RCDphkxl
zPeYdGJ%+b^4w;J%jg4Zfd-GuOg&Rkn8;6-l&H4U~zON*TXXq$RRkN_~#9moD+Aq#1
z*gVFDZ!jp?)+S;`Fu#9r5G6Jy+HH~+Ff&m;ac@B<Q%9b(0^G`NkJAfpixi?*-4ubj
zvFr${3#~Vz%J*b2ECo5Uzh`D-{kps2KvDy88pX;@dp)4FV;1y~QL>?3vZwqWCQE0B
z!1?kH=Ht{J+Zl9=LXv3HaY8{urWTQyv7oUSYH0ymbNb9z_?BCnX3RZPqPEB@I}21K
z`@A#l4}nj<-WCl=Unkr`w1NKnIq+aXzXiMg`|t(eQ^Z@ktaSE2Px{b9*Cn&GDF$Kb
z3u)@55v2%cPPTfQ7qu5c0t;X5Sf*Jlrq`fu+|=DOb%xn66@$QP;g=}|M^}Rjat}0q
z&a1}j;mSnSg_yl#MQSd;;6+P`gq+{FqJgvyXh24#S6Q!4YD2!E(6jTzT{Nl1gd~^d
z=5gQySBz27pIJf7=|4Tw4iiMcK$iYn*4(M!7?lM@Mf?H>8<@lSn`}Dne;OOda_^u|
zIHqrMm`_L*UuZTJ-n0SacmyfJOLE_z$zjer2u&e98TlPDo7X%vP~cF${lXZ@tAa=<
zqwh#`%72qal|v_F`RS;rww^_0t2umTy!1%l^O&(*%9`ZYrc@V#f+3o)nDPwhGqbmz
znU&GYN<NJ9paKUEH*=ZqOY?h+tO^ko{i;C<Stcd7AXC+8_5i-Al-ct=W@obw`raT^
zqE-PXI|gBjTiX#?nwVu9w}g3~F=qZa64j?-sMq}XlI_h16NxCsaJ&;uA@jscYdoS}
zS*4<rIErXuE0%d$^ZqU}<C82n?+n)ca-<Iyf%|s(HVdEbm*pGcj})3t`(bBv#C7Tq
z&Q@qs=H5Arn3wP;SL#TmhWEzgR(9y3Zl?{(PY&m=6SF3KpR^yD(&#y+i|Dn}ipA=)
z@vu@qYQJ;D1)R8-UWXBqeGAAjObdopXcA<uB?~#FTj_NkhuuX~gv+r%=6<^@7&3Q_
z2t9MIp-~uj!3%#NNfuZ$j_|s~Es3!l4L`d~EVlCvws1xR-B7w)1oXZ?j$C}u34S4<
z`8k+$a+U}{`+KJ2F&wUGw45j1XmTg}wjHNp0#^E2-Jo7A6-V#snn{&;vv^}8U!j`m
zPZ`^S<tr1=_Vhl--}>W3qMeZFr!%S)W}F+jd(dua9^!pvhv;bAzI{%L-La?#@<m|)
z6FS6&$Y-I<)}Uq`=D6ZKg+4fXJ6t%trk9CsA`qwRi(t;Jb|wqB6%UZ4M!9oL_lWdp
zv|h#Hnw2(byFKw=Ct^Med~B%rNWCGr_50K;8U);Zne?83InroFp+06X@73aeI8%)z
zG&_?+0+?cm7iY3Qa$ypL8}KVhGiI7e(}ZJwEr_v)8#=pzuEEwG;ne+(0F=_<#ZLAs
zTyT-m$8=K%n;?4(xco3%)nlKJMI{^Mz5B7#0Kt#{4Tk><l~a6(HvY38*E~(i|6<h#
zERsL0#E1o_j~is<UMO;5g<2`Osj_Ctm+1MD);x3obALQYzh#%|@f;IF3S7K~GXB)d
z4|j+ug38`7h=+%f{f)$<P$&21ti7ksE6*OF<Ku$|{Dt&{(+FV*yE)E(l`E`(^sCb(
z_C~MkP)pmKv6>CvU8g*DDast3H^`^VApZRZ|E~QDa7gjBwnQmO-LNh~Ey9P!tM-Ez
zvL#rAfe&GKQDUAnWjaW9atJg;o7~+_4R}PZ&jXp)-FIkCB9E6zvDxS7jH(<F=5?01
z3h_042QR)|1<uxFY^*<Tk0KjUzFkf>{oDxK-Ewn{7(peNii+$(l0=pS$jAaW50-aQ
z|IkNoL)dqJ2_?TzrYZ5_okl?8HTG(YI+D8Xpb&UQoMs>c8)0{??oeCyS*on{-g%@~
z4bJ=Gp)+kZlLKqJhEuMamDPf`=^X3I>um~C(4@#W{Bl&kp%{5r4l|d->5|H^vhuQF
zjni+!*Lz7An<o=^VQnsen{$geFGpoAbh7t^xKzH7)F>N}uu0YWdAjbXmHSC|nMgn|
zaBG-h(Q<EPu1iEe^anZjt6W1ANVp|<2h#0zd0&6J!@B3ZZPr>EPI2ugPTyAq=YXRG
z_$*S<^%RG!RZ>t2k%kOCY^dirmSDOjeRhl9la834oc$wLlALQDb5+xam71|LL;E;F
z!)W&daa&Q6`wx$un6WMBB$rl#U=%XWn-x9R85OA+%d;{^pN5JS8mC++zr0T{F&Kwo
z(z%*1fi^9F&Tp$<lDM_*_cV@GSG#+~LZo<JsA0jXKb<IEsCKj<;<{wR7RX9hmr^vE
z5E~f=>sHvKamD#W31J-cSlx2V+*#K3vS?*T%16{wf=_JFa<1n4=TViD?UML5vh!Ca
zjwT~uzHin@EOVUy{iW1CL)Aa7Lo)I}t>>KD5#@{*tum0_B8@~0aWS;5Iof%F#a@`I
zy-<fW`aNRa7rM|a=8tBJn#Hb$jW-s3`CJLnD@Pb8KutYDZ+!)C_YvAa2rj4<@iWER
zlKWoX`VKh_%_8r3=-$c_EDUPMxoeC(bN!!T?O#0c>kw{17kUPil`j4_+QAFxhN+Qi
zCt$EMfMFsACAE!NVG+syGJvsjJA3th!wSxeYht^G%o$y3JRBkg#x5!CV8uxkFb_+b
zJR8i%Rh_r_qns<fE^DKyOIXxr_L~9uXN{|t|41j*g;2JKid0yoE>)5lY?*di%nEKF
zE>hX!Q+NGNqt<ETV6tl#O+Zs3`Y5-6Se&4^CVD2FQVQ#M!aj=>z82$-QMtTlx>wC0
zrI#HnUBS{m_k6}$v6B+gxEytiU$S^Owi9xYfg?&jKxvmX=nK1TDT|(^%{hux(H`{*
z-nB2m=$?p+es@>TQHuMMkW`(dBd((iR8f**P|3;=orzhqQfHr@RM3tJW2kD4VZGHy
z9maN(Xk%dA8Fw%(#qF^w+^p#Poj*qRh7Cz?Fh3R}LyqAK-nzuu1_2eN;b8iqz(KHc
zW4tVQa3v&Nz#{$BMCu_khEW0)WWMMzqiTXnG^j-EXlDUkHv-wAgThmMsd?ROfe3II
z?M%582Cck(>{zAmV)}~YT*=i?qDArduml&*5^7KgB3U^pWx5w79>yd~4m=WF*f=#X
z@xK`V$G#L}LS#46vZ*=4Q(f1)=Z9y><Lh}PDd%3tT{)56X%4U?Ig5BTy&v@7Vz@vX
zR?88^i(D)Y1QhG1R3jbaI=+=zb^hu|LC5X)lu`+SuTE?*(LAj6PR^77Qs-aNwVh}*
zUFLPh({#O)5v4A)F?Ei<V#wp==Uh}jR;EV_><T$;A0BM+23V!u63l$BuR7^340bxQ
zco{%m+bpSla%A6pea4theJwLES*}KvOJ|Y>$Y(0^Y5h+AF%}<oQT9`*JGcyqYF{q-
z!~6`k#+YVj^vahYQ=^`Jn0K;Cg{Kz!{wQ*|vINC@6Sta2gdRA`v?r;T*(*uYoX1&X
zpX(O-{^fDD2`at`okwpcQBaEUB<!Gf?5a6?9@2%Y-V``DuOZ2SP|p<(0*lQJB0KVS
zT|msz)6CSvALhd=Oj(4)SuZCvgnE%V(M@Um2?^ur`+c@Up*uUs{~rPTUy)}Ek$o7N
zz2{gAa_xNLU%aGMK-Z<#D;5wvc0|Z18-J5jn!v|>e5?&=(u}(Eqr3NnXh*%n<1rv=
zJe`RT9t#1B6rfH(M%YDf4!lZmlC#t<g|@C;xi1Jbra**J@@)X;kW*y5)yZWbC>q-t
zXWIU~o&HY@vDoj$8N6_cIY+q%8Tu#;G5P@Qf#v~W_2lxNKbJsCr@0phBr4=)QXGU&
zuNw{#t+U^|vaDW*{?|}7gxK>XA*;64D*I^dBZw5&fi4ioJk2u6?qMMpr7r5!L!>Jd
zC;b(@fGkI(Kh~u!{A1vV&nNQlsB`vho1HRIkV-MUg}=<YgRRE6RCM@Ki9*BMR-+~F
zR^OD)^zh}fHK{@kfscrOnUSmtah8waFRhpuONyH#8n$we|Cp*6rbqemd=EB>dk;e0
z=&EJQ|AlZ1mB2HaS7V6e`gl2|twjU6KliVoxdk5;Jy>8N{!#vTlu9%4=RRdRloa%r
zAO`14vVW?XI02FGJ8KBmTP&qQBt0!Xm|tis3vv?3j$a!4vM^9k2JqB(j+hyK`D3_#
zuPEmZ2&Cihv*b|l8p`^8Ez!vv>RT8r(Z!s?85k_nLRgp(WLo!UB1G%YDQ|3E1Y{*W
z3w!+EJ0z5dJ^>Qby_0!{MBW$X9M~GnI1OOZP1?4fFg<B#46bjUr0I1HNRe#6Scrv(
ze)R>@X5X8n-~#XJJ&S-z(>XTo;8-T8s6mJMP&FhLW=&Yw;jL}-EnbKHkt>_lVmMJQ
z28Beq?;CR{;wW+^iMW_lDT?^*{yQ%$#n*3XOPuxo+wf~!Udx(7&P2F*R#LCm;tm4+
zN$lH5B1~jPZonBK9h$nnnO8LKTVe!s^|S*d8PvtA!ymsOk>=)p_+CnQEI&|v$!^fH
zq*Jv<L`obn;g~_8xsIDSvqcq4DbloNjxLkF1YOEB0L02Ye1iaRLjg8}KYS9_U{Gxi
z>fZ6Co5N|Kr2E(0dhR7)cQGFZS*o_*ckVK`W13sg_?R|V0{4GUj{b{{Ek9v`7eA}t
ztDT_B_&19JSU{-XIkijaZwbxalmujZp3Cy0eSIZ&#mj8|XBW#Y4Re&6@4WogM5(~}
zs1Y!YwJF`az<b>%8{K$O{SwTsphe1)Ul)xMslq8qOvRRPG)?3bHnp-M)dfsMwq$m6
z6aEKD+Xyo35Lg*D_Zmp*$4>dE+5Bn@ymt#)VZcT(F$IJN@Wh-Md$S?;RR@EIbc59h
z+#KXxjtkJN73QestwiW#(ptB?K;9YDe%$9GJm$2B-6VWvqamVHPTjwqLAHA=a)z5p
zS-)AwiWD931<LO$rS6m?{e%MdQ%*o?1ow|N+}SZ?0&DU%x*$EU;(o#_3>>MBPLuil
z5qoA4+dI`3<I)#J?8biT*qjGFC=_L~Vvp`!?i%)y0pG)5u9zT70osG82LHu6YG`G)
z(|Ds0GE8p(Ih27P#-Ob&jdzkk)<hiUlZJ-ync{ua0_R6gM;0RryjBjE_-Z2YPPA*=
z+eSaDof)3HW)FF;P?;1T0Y{j;YYmB8=*YY>?n<!Z@~dIJ60FDNLPeBao-}+|m->Us
zv#}rnIv>MxrG%w_?u6)UrY^#S_Dv^-C|fnPGHyet!pM>|7%IQ_Vzsz|w@PI-t{uWn
zs=c{lIb?KJRjdu0YF~-T5mXoks-HvfpLy1LH#+<|goJMHetCeOP9pD7;_uCHomiBh
zE~wBh;kFj}TtBHJaRjqz!vvz(rb2eYMIJ4s!KCds8;B8A9`(rn&r9tLJzC@^K7B8w
za=(06dfb1+f{Nla@)o1NQ$n9d>Eun^6aB>0tWT0{o&8Jq!SQ{{>Y|C_QOXH&8v>3m
z)LCPU3XqJZCTiyMZ4k@L)Cg~g-IMu#T^>pX9qSIIDxHKduIW^*ysZmO*^hAAY;Zie
z;TKf(cntFzRYDae=zIxMjn3zdcrlnWbR@iK>Qptnq=OIc!;>Zs$+sV$KeGLkQD$tq
zm5t&$6XW+!RN;Zz8r?vsDPXgaH*xjDM^(LQt+kr7$a=|wnX0eSMzcQ!^Pq%;{3(QR
zW*}9%$CRgCPn*XfQ5)W0p_bv#MllF6KCwZ+(YKk`S9Q>u4hd&^VxC-Z6~hpjG7cme
z6iBs)(E4XO=sy~~JbH6$ZSi#%4lAezZVgzeDU?v85;@M?(o{qUxVt(2r2SvDihtqA
z&nHOGl372<GWcIk2Z`~8>;Cji$xr(F3Y<1|o}S5#G&uejb@(?3u1eqIZ<6Y2t)~0H
z$Q1c^^gNkK_edT;sAL};c`&4w++sI>UX*ijDlt|R$D(&tmh+sEwgEZy=)8dJ`nXXy
z9?$4-6RdG$Kta_nW7#r){8rEX#2xDHsmdHQf}eme<8J^rssRoB8nc4MqA0?gaM#AI
z{5V0$AVcK-dQw*>Z#Ne`^$o%T-eHfQp0QrCLL{Tw*q?WWh|fk7CzcuoLK{j_7iZlH
zyCGibys8~?;GHu{H*g--7_{fe)AhF0$09I=S5%`RytOynj#MxBy}1s6D|WS*@*fNn
z{CjiP9^?<7Vfl2)I^(@v+IFp`+u7wVoX&n;tm#MPcj*>VVAF;floR~V4jf~7=6Pg7
zgc5(?c1^z^oVeDB@$&-nbOucTh$G2y;JK3kJ*Sf%w->>SbtA%!xe2GmxXHn;9Bn<k
zA^!F|h8MG<Rk7K;S`w@g{<Qn&`pu<cp;gz`iw)0olW7_l1>wrF-B-QL->RN|E#p?D
zP+wl}gjWcHGf*GVJl{_<Y@4)nD7@pYZ+QrWN5w2^Zte;X#D6=L3GAuNw<P#<RXUqF
zgl$l7^iy&`&mWf;C<zD%<n%FGnC5tPkkvrItBa2vM2nZT=3}@8mLa&M()JzdiXkkh
ztQFrGKMxu6E7F7`F~IaaEDQD#NHNC;9+T+Jem4m5Lg9D82ed|JYl}(aeEIR+C%iy>
z0jp9RvtnmBj{5BR9w-3_pURC>Da|^&STRtf#7E)>@;zYMV-vgSRPLU(j)Pm3YAMx}
ze_2O{W(@wk&Ojzr=m%oX9o*NowEYdKoCHx6!ZnHx4*l|it33`be%4K#8oRvK0&_D-
z=G5EO2DYbmk^dx_fBPNfQA1Ii3Mx-B6!lFIAR5ymHj9ZqQ#%SmUZA*!;w>tLcS8<h
zjtzrBczgjWxDs&bvLY|dzev>D>Wpkmcobl+!*m2M&3D7AsZ}ZYE>Y`@zKRe=pr;72
zIblW;hc6y`#L!H723Lp9K^Xq?n5{KK<aBecMj)R`wm@#<QR9L_OJtMS`8UA%phW%y
zUAT;$+U8c;JAZ*BEyX^X-J;3_gf{{n&RX8k=52hUiI^FI@y=MW|IIW18wLNTV)z&0
zAEEpX-j+$`DnGuw{?Q6PpsRx=YBg;|JYTj%=a#jq8m-Q)Z(gG%D57u0ap$SX`!{Yv
z3;b@3CH-AL>vr$NOSF4Ua9t2&ke*>t0E<Z8Jo#jZu={PuDki=T^Zv8tK<Q(PJLmkl
z-Xo}-4Jy;26-k?y(Kxh4KA;gdb|DI2lfx<}SZYtaSe0Sc=Q_e`lP+03hD`8|wWrw9
z<q_R6qxQ#{Of?l;1B#VXl4Udd>a*)J{=PE_CLM#facfpEN62l%bp@V8JH~+u5?Ll?
zh}EjlX-$WL#e2>KNH*R~VnEapMwpBpM#Z3uCEH=>7FComF^26UvKSsVn3A0y9=qY}
z^at*qhIm_nz>Y?G0R>P{N0&y(+w5-wS_V0hd8$O&b`Mp@nHh<4sJHnzGWyK(r1Npy
zrvG3MnI^7b!Z(vqW3Pr=1@5IL8Wy5W&3I;%c}RM`(ijy-haK%#3N<_Q%sz(>2SPkV
zL8@L%cZ8yAv2-*5+iy(KzPJWF06kQO2%*PrgioFvNPk~&8TrIBm}I}Se--)5luW_D
z>Asmr{3(g&qp(^rp{z?#dUKo-Ghj%)BT>oJ;}XkX0ZE@X%Vd$DsHa2VJ&5Pqx%M@l
zPGK<Z{wDn+?iJQQ*wkI;-&b&9AhhzpQtfhnET^;F@_ya<!UC{<;+j^w37ltx&8a;O
zG`14URgi87bDJeVs*bz7BTA^yZi#)!;(Mh`k*wv&;~L1&kx8ig;Vi6JzD-jSuu5S`
zHC%8hOImtj0ns?<_!Bgk#%5|vi6|=+6*UrFBdvde!5~W7bJ0gQg$>a{^pdbRr>nj{
z#2Ao2l#TO%@&@uO9@bB4csnz}JnwbP#<K|A`m`eXr1li>r6)ojClDs6bAe@rEcl(3
z+aO+Ptyc;8PLL)&<frArYqkHE90o$?g=BkFb;0uWUz15AEBUA{j<x!jY<q<Uc!B2^
zq_ZF_TGj=P#I&A^w4zFx+B3?^q&2Co`@=@7+S7Uhv+M4{Fv923L%0#87vnv_^nLrT
z3F-6~ncTgf#4mJu*YH_yjE}P!Up2Qq9Mi$nIfeV|^9IS~h`LG4UiJ6LIfQ+#Mx19J
z{7z@UV752}YMDJ^`y6XCgb#5noL50bLH<lo5wVSdUrOzMRM2@h_i3-&rD9Py#A0ba
z+o|yX0Z5;K-^6JTvzPXf)v!bo`zv3g$HdR=cCc3koNRYMzZz66;{FmIoY)xQBTJAf
zgcAIfCtq%0h<$1B%3d*$l-bsUgm;kQoU@B&`X2um+%LI7$rBdyWxiVeDr7LPrrUJ7
z!9q_C;9|UtF+RvNhni<Pb!Of?;$~s6{$%U^7p@uf5vy`4rn(YSeHcV|@Sn1187Mq_
zV+9~J?0r}f13$vY>#_CE1zo<f<}yM_*UXUkkS9J&ARDR|9e6!M9FwR_Oxbi~sYrEY
z-kVCravO%f*mZwXn9rbBN>_|G0Ng?GoMph}G*|Q4KBo@<K8;X8@`Vu*NhN}LR+%V2
zTA%VHDS`QCIz<TbVP6ptz8}ZqB|EJ+=#&D>2f1JA%}e+$_bz^?kOCM#57%P%G0oOQ
zWp^oEqSE^NaH7U-C0^<d#xh&Ql7(V}E`l7r4{s>=O@NsQ7lKWk6ubLKn3bK*s2bO$
zg06_urISn+7xAg%ZuhOj=m}4W_cgVgf6zZN&&<-3+*ppXNRA+F#;Qayn@e5JaE#SQ
zv>PirXNxTbAqOIlr~ik5OOV~yPqz=14)~pXtE8DKL?4G|Zd>=R4{hP1=o;hVkIc$w
z4d7Q|^<l%2M%dtyWqTbjR!F)pSFFpSxB5g9>AY|1Igs^_+dLTb;z-7toS2)ZN;e|q
ze3{`NVjeSB$AvXAEXBXVgY)%&i1vs+OE<4>^?%XfG2b#Ij&TZI=MBiSwM{kK^<Vz>
zSR`JDO;J=FrSHKE{jyxmsQmjk)<0nJ?$L6NG2^rJvM<4bwGe2=Lm`4&h!71)>e`bw
zUK$I8oOgs|bg^yh((3xNRexOY&_VfWP9l~MQ+W*?YhDNw$wM!%nO%}6p=q$#iIGxa
zje9H5e^&F3e@o%}3JRjfRSd9b<KJ%JEP~J2S7KYEh|?xl4S3SEui2`lA|Yd>c-zeR
z)O?J*T94k<+*HT8u)YuqO`vLv*RMk!BtBVnoFzMKcrXMX2+x2ft>0V4Du*AnUnn<9
zQp8)kzg@NXwj(o12csWys|RC}BM!XXt{)g|gt8r5FoJK#xgkpXv9MO#)grBaYua`8
zpEU?vMs<L`gYj{y6|~mXj;{LT1g&>{h`aBmth`W~pIH7c(fmK+ptJx)j!ZXeCYqFX
z$r-XZbGu7|Rxqzyy%u8gpXGkx&}xZGBj#F(%h6`!e@p#aqR?lv);Ei&ZJ^y^SI2~x
zN89>1kZl!cxh^GFe+q&mDr+vo)H~9V-b)UQ-c!Fwl$9VCaH<N~?Ue{m>++iMPNSo1
zT3mGf)<WN&B>qw~wC`qa8awZX`Y0Y^W5gE9$zs4t1are2|4$GQbI_AJ&Dce&*6r(e
zejxG`cCB)Jz+?3a#C>(in;jR+4n0GJ+PE0<0;Za6y-U%S%5suCXRa@$ieJ@G?g5FF
zU>@IkH+D-$CB0PW?lc+v=5Ab!A`2+CwLoSoLVe2zy!0sYU~cOCxmEz&As$F|&$~i6
zAs=$z7gu8y2A)(09<`njpLsF6%H6=8prZm)86QK&y9lcQF0@>>p3C4c$9N&oCu*+(
zmV*8=JfWmaIhE8UlVv@QK{CFx#|%nbkIUJV3Jv?ei^xA{(6?VjfltfU&MDF-FBS<U
zF>sJ$=&$6VKB63a1e}V_C@J<UMqD)ra><&5Asu4q!+>xf*^5Q5y;c%&XK7nrpUIK-
zIgz?#KIh)qm?B^8sgz}(H<=kJt%*W@^-8A_+|VPr7OQ?G_H5>)_QNv@p<a&i>`8)n
z<`t)0AI^}1B^;?;Br&y3%qA{`PU7SuDBII%zC0zzB6z>xd^4z3&<F<$SDzKLrGi|2
zQT)=-03Y046UMI%Z+KRBVm}u^7*a5`7H=q=NcU|&IY-)Ru(Vfb4A8Mesg5xZ<Hm{w
zcwdvxgy4yNcV7~q@!IW1*IB2a)7UiIdFqn48^`Gr+!kuW0=ad5iAvE}{3DdSo>RM@
z2~oC`lQ?IB(O_IuAUHLhI_T%+-W^akx9FfA#<Q^#1ue9Dj|9kiK6fg68Z!H5imiH*
zlG$%C3B!Kpgh$EjB+7kX`MAD;Tr>6Et|^H?7irLr6;>s8mIFlh3Ms#+A=61RAWC2N
zEIrVttMXYlrDfZ^V_(fbmD=)@A-3{jQiP8r#hDUI6WN(|35Rl9PY)h0&3dQdkhF*T
zH#$)Tb;%KcXx5%h$m<Ws|BtV)42pBxwhh*}ySuv+Tp9=-9D=*My9IX(ZjHM`aEIXT
zPH=aZ$KL0@Kd;VxHGgzfS9gECs@7W5#vG&coddcSLvr_`FUe|Vpyim!Yi*pa#ey(~
zyZsAi|4#>gX;uEho8imKLiMp5gAtBtQybXg&xrk>R8qKYd0CtXu}xVBpS&o^oeFcI
zhxWf$Y4n9jQ3P^5+5LHM9Mg1K`8b?D&!evI0Moi>^FNNC2l5#ooUl&hY>`B8WV|7-
z8_8N^Ke(w*LXK?~Gt8211t@de2#Im!Pvt3Hr9S#~S`Ti6%}>jc9K8u%28K65TazbE
zF9z)QWXa8Lr78}N<}SB`B%L#ZD<(=M+}X}Nh=M*08UX1v;;Cvi!5(I9S;s=NG>i9j
zGw;6E2E3p3Dd%+H-Y+QhA$mcX%~a|((X<v{(><qiu2b^E^rIiCtaQg}Z^^ID)Gg=A
z2ej->H_1fdIngYI0g+P*FEN$oL6THPl9v>HNtRTE?J5sOT<FapokzwioWiPW>t7XO
zAl+kdxcYENPBDw38PZs4iX_@tH6x*EcqeF~&rJYnRgiJ>z?v8d6Wh9AjU9F>19sfq
z?PRy7BM3xTfGG*v9#GNtvtQBr;eoQ<P~J&(CB2{*$#PjzTmsLIQX7-QSY@oU(5ay8
z%pcijYnz}<tAj2eabl4OyMDp+DvFn<=8d>X5_XP(>zQ5$=NwIrD4m|3)uRqBdAF8S
z92r25xyYa5bBrplhZ^bRr=~v%??t@sB%^+TlyP2Q1Edty`Bdik>^jeKJk*@~SXKNQ
zuM^*SoYh)JlDO@idDxSWd1tUOvH3<T@c5F0Ec#30orbarFFp?HS)C^V`7aRrc-j$i
z=7?;b!b)c95|%jMb{d*%K7cW#@;tLgM3f}NMOxhwnUH=>i&&$P5jNFdxZiO3PkLM@
z{lhR`P||%ZUT~>HeVH>Ngac>2YGbc&g;LrYMLNbgenN8)hW}=<HTT&cg<m9@(lnWw
zzhJDBRErkwy$vSG>(P;7<Hq=!UmOuZrBSRtEurr^++>+$jA%9(Nse1<4&M{cFb2^p
z*CQ?${u6-~!N?{IdzXk9-+0v0g?KS{)&yNRWvE1bLva_e+>`aCCm+eZNvz{kZ1{$K
zz(`8ptJRh)wgpLoGTDMQBEjp!`C^Ct7wX%r#t**MHp5-<b$M+!x^chiD^Pj+!q^4Z
zgC~8Ag7HzwQ1NT8UdS74`?u6t9l7$7IWKP#?Gs)jJz=qa=j|$|wkroMq<~K0T%8Mj
z{C)AN<w?oUS@iBc{r?FW{~w3<S)L*QljJ!o*1PmnXvKdJa$73oyJ-6NQ^J1Y`d!*a
z(zpPCGqfEuqRpP}FPhLKbUDa7lRDT&VkskX*7IawSA2DP=yzY8bK90q@Iq&IaB+2?
z;Tv>kHa52co&GL=u-{|iM*S@-^>B9UlI5<-qb!2pJ2w95<zjA1$lwMEP0FstC8c1p
z7UO`MlJCvmqe-R4GFG&GsNqfLm5gCs1O_CpBB9rfPP!b<d&RR=y-{X|yNZ14Z?TV-
zx{By+=$Yc&m|FQ@QFWGEUI9|Kv?CnN{xxBCitb|h1gqdVwy5hF#TROs;rk9N60U?j
z#o&MfW96+f>m4arm<f-wqGY2l>8hiQgXBeU6b+i^Mtf5k&L}EAKrNT(WiDa}m(ku2
z-IT&8)qHl5-(w}^RbFxC1-A0GE$Em~14FfXQVk04;(qX@X!S>&V$R{&bk%cV7#4#^
zH<*A<+K8vT3;L%T0)_mvGrP(Xb&{1rJM}jA!;OuRjeL3ozp{31yWqDUpF(8>yMtra
zt=iC--Nn89UD~SwDp+%OHwB{ema89)64d-|au@h*lASljoG0PBpobYQDHOJO<FpCu
z-7OkI%96=~%*7I3#GRI9P_q#;CA^z5C4@mMPcFna2PS>vIHE!zEb$F4iqt{iH@Ycg
z!Sq;RL8A#6X>q|{PcBIX*#|f@&uXA)x}3Mm^{W5N1<;ECMHi1&Q?59drW<lumk)*E
zG^HSS+gWOc_@^5CMV3<uMJWIUrnE&>cV+cksh4w*q%bA@j8@zH(NF(w#X?ln-$m*3
zb$<wR#+3S}<lDke_ZWtvzU+{!o-}-!-&2cVX}m0v6`EL8Wbn8@Ebm*&8u<*%c+OBR
zSzoO_;TdB@J<9Q&8f#DB@(Lsiv&%{OHb3v3zrvQHhYIVFY#Ciy!iW_p$1@6sew0c)
z<zQR{FB0)tp)s2Xs6h(3hW~a!$_`4+v!ocgQ}ZVKWmwo@OI%J(gQDYbVA-oO(?2pU
zaR|Mg+;Y?VKM!_*A^2>!aWv=xdG?e!ZcVE#&HjJcnec7*@kIC0xaRCBIN7{sn|}BD
zlF8QX<X!*gDsX<o%#|wr7@rb)DeW!}ZI2F1{<U@Iz^SCMgA5b0Pg16hd~tp%E$gUL
z5%zxQCH6r}3_)<S>ONa%>s3%0$xc<KNgpmK^a4(a10a*Fz$qa2$Z>!3Z5gUaAX&pD
zsbgt)cXNpzfdq5K5?bsJPA9kgF@X10Awr?a#`#%er;sW*`Q;6P5K1GBmhVha-8pMF
zUmb>xI4!T^xtFmf(IB_Z=lwOT=y|aB6AVR{E?#RcB`?nc&N?`0*pMa4rz$79{%yDj
znnM3blE)VqYMn@uU5D2=j?$6WJj|WPT(vZoGj!F;T`Zzz36k_iUxs|$c+XjmYdgY1
zAYgm7`~V-yMMzBL)&2o~ihnDMG`+JX;H|?uy_K%Pg#!_j@{6VQr@_7xtCVh$2oLwL
zBiS|e58LV4Jqodg==u7$D03NC#(DBh<@e+^Dcij!La_mToAAj}Iw<*4?}#+IX<M%6
zJl2bq(iyjn=D$PT0SkK96V{+@$8>)9J^6-)G^$yOaMvp~wO+-x@ZLa=^MMvB^isWn
zD;`*%?|3E|mItYZ*%$q|07%GAxr-zcI;P5$rp><x05zy|R$;3{r4^};Uqe`(qoaz)
z7xNdR0wovTT+)asJPL#dbEjt*-@(LgsWwgSLGK6sSSvkO>f7b>E^u%J?AbIGY0WN=
z^M<cM_8@k+#QyIE*y*f}kYZlgQ9FH#j#FzmsEjpvR|bz7@f)pvRVrqFgVXxi=3#Mx
z3DV9`y3;uH4nq3VM%`9Ym1RM1I<=o-2VteAIc596i$XVvZ-qokEe%qf77DapsoM7(
z!uoA1%y?6iA*WmmSVdjo!_y;z7&!Ew9l>_OaZb%t%gkdK7?L_#8+7}GZNhx_k<;3G
z{lD%apAn10kAM30w?LA}XtdHb^2|E>6c_hAX;nIqR*Gz~)J&WFLs(fc^^dAWpJ?Is
zR-j)*&=*Tx4!^3NJKFN`O&g?@a&wb$xJj!eb`Ru0`}A*%vv1I8UTwGvW;Me!kiDgB
z25{U4=dsZ&JiQh;_L(E3_8^q+XRXoO;wGI#W`e=eE{_p!)=Ak2LMZZxM8*GGSrRNp
zc<#%^Chcm+=>Ae++f!_14SW&R^)K6wRmXn|nQu@5Y8{~$Xe2eYS>9ga=H=kKiGomu
z_{0p#6w}`k<QjT}66UT<RjI?@)%hpi6;R%|{4l+i4p8L=w!?~oWQ+#327r3QZ9hNf
z$5pgN^^`<Fw8wlFy{q%SP@q(64l5_}i$;l;JK5Bs8J6wXQ!GP^q{*XABY*0IPq_sT
z;%wyJv~1e_uy49P)qXOsv?dm6559G`2W+-oeQ0ujHQP@Y+A5I&0p^&L`IPRyYy&Lf
z&OzCvg5kH=@cUun`UJjw<XYbBi2Xy!W=Es&A}G802@(U!z?RheWK~A;bVV3&(wHvK
zWwK+L@L$*+)M^YA&!*?f`x}OH8Eu)J%0Ej7N<}WvQOi#n^XLP2390&IB|Cx+hDvbI
zf5_z~84IJm<(=BR6U}tAD{>O<4(*!E)-AfFu<v+uB)A+X%q^@wv%X2H+~cRUSRLbl
zhWe?ipJ^^VWcR(d)8P)~7TRXh!_~?pm(au-<|<V2F{TA*Cugk^4zGewmZ^3(C#$eu
za0%l=)^Hy(iDTmJyUAZA&@&v5G)GzO$3EqD^|_&y)q5N14NpHzsu%3=K`&OiN_HeZ
z(kbOOm*qQhfN_L)&w0AjZk3cxpHA$4OYw&OW{EB{;Y@8jiu8~qgle3ABaJAMR^IM@
z_oR>aOqp?6@mo$0u#CKLCgs2L=DX0iD01;g9JTRU!&?s{2}>x>=XhBYgY=-MS8rdW
zzK?OkoCq|Q+c*HiYcsBQ4(R6P)WCHaET&sRallG-S6+TZ#?*D>lRfb(7PZe$;n216
zraym@UrausC}jVEqVV{j@+cSlp$MTJIgpGrCyWpyY^+feYnWCU{!#Fm)c8dBM{qfS
zqsoq}UQd+$@v_`6kBlVh^$A|0r<XPEBG=rJYn9}Q-d!9U9gx5sKEC~|wg(Ffb6#4`
zcda;cR9cv|jhligtShd>96$+BMigSipt^5^)kZ_#WUHmP{r7r(K!os|@Yz!7Ml~~B
z8;VcrDimskZX^7(>auIkTL*O*j@hYW*~3EP=0DyD2Ih3pWDqTmEIeMy5%|Tff0X;?
z6YOnYvV$bUqfNCL|B)2-Gv!*4xyR$;TY@I1gh>U(^HD-<-6w6n&KOl$<ZZn4T~>$u
zMFO)lzddIED!bT2{Uk@EBDk{Ptg%A~Y@_7NCx_0n$z${A!_DFqi|$zS5_F&*=o!Zc
z(>;>`wghXoSA^$RvNbY9t(4&w^pT17SQ<2FhjU-1>!nVlK#o;DDr8gFQ}!bOWjdY+
zBtB)wErebN{?pj$W9HQm^y5C$9nO=Q7y+EewlOw{C&KQ8px{$DVMb3>xeC*5c<AcI
z(wLSq|ChM!YNcVh-D(iYJEH`JJVC5LVWujqvYZOd;)qFE@!jJMEyF~=C1|y5LRI-I
z*{kI(^i%ds*nOh|2wF8wmpSZ*CiKO3pv84v!)b!8o!CHUa?ASD-}Mz?h3PJYX}GsW
zVs8^tU*?MQ&J`BOX)4{Dhhukqk9#fVw=VqfjR_sw_aMj0QC^iH13bk_A;E`1sLOoC
z99+ILmKYJ`&@eBOcfnUCC!s{wp*ixGi}53hL%|mN#IiWx16F&9TE0erhjInxdjFEw
zL5Fa4-^QMLpH@xx6^)6tevQ;FFb7FCC6RL)c~)@Dj4UgkI+8<#W2~Q^Z_8&Tk}M`z
z$q5ZeF!SwGU>Bq*QOElxEdB+OLe+}w!In)Xo=dbQu~R(vv-@R3b}b0nwPz(|u3Ojq
zCn=xB1`|4`@rgQI6DPlA5koChIp<0at(9arL_i8K5+<4GT%vQxDGWI%tYyK347Z5(
z4ED{Xih83_b+F9L4tw@t2r(2l&*M@Zwe-IK+Ar9o9=wuOgdX79IbSi|9Zu(*=3H|7
z6}(Qa@yh|$Rful_`h2Zk?I(Lk*}~@d#6<5H#BIToUXa?Q+I&I$y^{qu-yHy}CEk|{
z&uIYI#P<($@V`La5o94j(5B7LcGd5NXMr$=i%WZd&aCeXRxiYDuw|vn&t@uPuON}1
z`HvQeEv!1Ifn%RKuzYR*>0AZ*!~O9mS*hM+qXf9h<DH;*WJpB7A^^X$%SLnHu&})I
zcfmR2`;!q(0jsf84prSzBA<;*uoVd;jiZf0AGBi$%@aLIeU@xGf{QoC+ph}g@=z-H
z&8Xn?Pwe7hJZ5eLA@<F|oEbL|Kw=Op)uS0h#YudDRHaH+^<}$K@6+I)`q?_lcjTpS
zzmR&DucQ&*)P)8v{k6M*^jAI&Co6%+vIQ!mRtQSU8Ojhl`k~fRcF)74URro#9u`sK
zYf=rdYO^<OH8z*E79|XY?sG%O<tI~pdCp&T@Q?p;5Tk$xxd%$`hneSpNHR(M7$y4l
z5!I1>gnC3K)x6*EmU_4kwR-qw5=QZ(&m0t@hdjNUF@9r0f-<A|j2;XmH%B36^5P+s
zY@#PqVagiDJ@zm>@=_rx>O8wS9`}{Hm26V81A4-@MR`CK*o7evZFAinkEQ7LUrqdQ
zj_kfjAG+2MA7ZGE2wck{-9cvcwr?j^#0l>n3QyP*T73s^$KN-Z8XKHaQWP;fTZYFe
z#05p(>fI&Um$Q|fiAXf0W$Sc-cP(;F-&)=Xq}J(Zr}Cd*fh?ZS+r2X@HhKzDhA;gp
zJ)t+pSxJ7aRcLWwNG3e-Tv*9o#pt;=US=jY?{7T+DIegO?Q<K<-1biGcBx7?AH-{b
zZ=sOOA`2>&AtPh4wrvkH0=7lstEBb?-5*iSTU!Iqde~SE87Ek#P?j{lS6bN=?}uJ2
z8%tw}X}V|3X7fPu8k>ApmtUtmQMXm(lcr9#pu^Gs1>V`SfNTEFk>3NBn}@FX)R6wx
zQTozL+hm8KEY~^+I4?iDOa9DSa!9)KQ*-``tGgn$X&IQp-xse{hb}9^`?WnT%dTCA
z+1w&!FFww@^TNMNGOomWF(-ZMWr(emaHT!aiOO(Ko7PY3hrr`W^oflZX940lT%sG?
z1qR@L`t)4g-S%3?{?^L+!T~s0U8+vY@}L?w?mhnmIUh#su2$$_`l)wkRNpVdbb-xk
zsPN&ru#eo`>zAbeZ0ypwq`|1%$@)4_=&q=~p^sJ3|3FGz3(te_Cd=PT`+%1$1C<vI
z@rhHWg`NKwnE2n0qBUe7K}x{-@cUewnh`d?#-Rv>zy?>7=giBG6%ky^3m;bX-ha<9
zfpG4yH5WcYm`0uMA{Oy?;w5ubU#vb6XS(}QcLbI-MZwT2K7GV>4Qkh$yUR`W1)B0-
zAuH%ulI4#CAY#T=wT~~5jfzfyoA^9i;?yEl^=s^D-`nN6?H&gOi#@_Yw7`(E+CVD}
z%90d<Bh&(VH1Uj;%xein;{(Vv8{7IQUd-hKa|XhS7)!+=+ndk`=sE_l2E|iok#Ru+
z<dHgk-YyvjvRNvV1nw$HAWRcw<*O-3gMGRn1HA=WeesH9E9LL5b9AY|^4=|!DS5V_
z*0a8}c{IC=i!Y*YHNFe^1qyKTfcmgx*S724gm6tlm{Ge$jUf@(JPJQ<v%a6d$@WOe
zD&xOV%4Ls{SlOznFRZMf&n0?-X$T^sXAN#E%k$h-e{sDM`16p&<dZIK7{odK%}4G2
ztI;K&zLlQ$i&<)`ZkMTUurFCx64Qz1qxp3p2iDxfL<6VZ4O~SLbWD+cu>!;Dig7_R
zq2wkIIC|`k$t<j`#~~kU`0XG@JR>~8onYVx_xfGrTjwKfo%+<na8dTIiwy~8mRS^g
zrF6iT3aT3p`39y+B^9wdxXlSy3sCd^nc|Cn^Fb{VppRAha|rJ?|8#)TXq*Nt&|)6Q
zDG_LIjb_{)*fY#Z8fS^7l8JydpQpM6Q!J-*|Gj<|fd!Jz1ik*C=Xl3-fkBK9TZ^#p
z4g7$eOJ(_n5V&+xlGrJdONhGFya%!sG1<0{K`S+FKhN-K`|%IBAt0bzGigML;yltG
zbI<IzLV|mCsCSCWq}{K3Z@ta}iFpT*vowfQM<Rt6@);%oM?g%w@GP>4Bw^=Cu5C%}
zX5ux-)fD^&BF)`Ep4kiv9(HKg2EgC0E5o1FQ_QmFHbZr%>tKFGrDn?|Dv)Ygr<u1O
zP|XU4j)G&CySR%^z>~H&(Des;J08!`GJZ(8`03{bY4;Cfh`PJihZend9QU-<#~^T^
zWO~V@Dhtn7e(=`52CFsXQMZ0+`MCl;PQUV+cq4AL_Fpj|hVMqPZd-E3;YE^hNR8Fz
z_^&5R7g(Fw(by+@;_$ZJPP4!}#qet6yMBWgnB+`Ir?!nAu3VpNiqP|^$)Q8qmD!t5
zqT2t4-vA5k78WP*%yq5kx<BE!#Rlmy){a7Njwp^X5qtA#!~f&um9YcfTB9~<#t*h%
zY9<g4j37ujN|4qqh5IpJ8AO5~8dp+Gf_)2T$9NOP7TK|*iqo8gIVe%&?(&+DIf}y@
z^TFgyvGr=_kba{;KrsOq>hPZOl7ZMGPR<F9mfk#ql5tc|1YSG>b;=Fd39K>zXixvi
zD``r<z>n|%b4$EP0npKvntRlq)vk04_}Qm$jE%nv^&3Kfy9ZR4oR;AwuUZsXnCp}8
zNGl5j3U_J*MxvMs8Fgtm90VPk1hD>mf?uyjvm+G>ln*SJZQr`ka$ap~GStZ0uwaQR
z(o^*du9LH7t4>)4%aOZnCH$`Z;#7`_z{bmM$_wClYL9}31Wer;yP)2F7)|Ua#&1?V
z)B`@5ES8fzziqcaGz8eXKDk^!^VQ@n%&fwn{Z?cZ2z8Y&*g%zUPUY>)Aq_0^$Vzd<
zmolBIBNe}Y_`%odDR?X-9lkI2k<gxbIJ(omY!+nCtAorw3>W;BM)sA4{wo}J&S07p
zdce|_IRBY6zFdZdI?3sbd@b}5y=B`5qrZ%2Wkn)};y*1}yQ#5jSZf3b)+Xaj0;mi3
z&u-TtX|I-Fce(#sbs5LR-%kz;93_M=Wj2qU(BWFU$O_#ajn!@3vi*_nF&zw$HX0%4
zl}c_BJi+2<@Qr+*+JE}_r3sBICy8(m@J1d|D`;|UMV3Sw9T$EbB0rT|#68bO8vEa~
zKZi98LH-NA7=r(_XFjnC$dDixj2m2#)2_Wyc(L6KQ&V$j?06N1))Ep@Z)vuOSi}5h
zvVKbnj<A>vfc)xP`3L>?Wd3I18mg(Ajknl2-5PRJZS;QQ=WFTksb~`4kMLq=t|yiW
zxwS(%L7qc@7@UGNNx=|je;K|_%LVf>o`PSc#1l6#U9=*-cJjZ!CF$5hYOr&z?Vu_|
zUbIqwgOm`^VYYUC$~w1zo=;t`$oR85ranyV$Qa+uWW{`r?tpB&n3b^H=zw4mm%47P
zTlu7gn{8Kfcj^(niK(OTkl%{KJFD%8FrQsQc4iv7q5m>Ov-!T^1xJ>7P0L%dCs4`s
z55)MtoGJ*+fWv^5+V_=e4pVH<XPyPL-TNe(V9BjM6TOu(zy{HdG0j=QZYjb%>N+cQ
z>V$1@xspEPG{bM0o4N)B{ZD9+1Ow?_Z^;&>Lb?#y$Qch`X+Yi&nJOgB?(xe7eVa&O
zr{_14JwC0z>J^qnxKUVVZm-<OGwCoSIID)@Mr%&Rig@ymV5}#X&<$!a{COq_UJ3tH
zXt*^@l1vG!E*`u<95UFF2MQvdlRFb;3`L-n{_^V$E=xujlcb5c7-W?Ig0g-22L@yP
zW_b^H?du<LAFA|$SDZI9v0s#wTHvGK&k7M}k0VXR;1Ux8Jyi~nk5^TqYnAQQXCn&5
zV?)o2e+lK>S@XgaKi3~4!SX%8=E!B;B0~a`VoXVP(=whOM#3%km`4N~;pGx-+A)>8
z+@u=JXI@ylAHTNL7t|%O+;av<lrohQ;=q^OKLP2-tWIO$SqcS+>8My`DAs9o`y6=U
ztC@X0SYybiQ#Ey>Z;%utxcvv?etfRp3v<mcOZWKdYiR1~lHC@%5pOX<d!kv11wnkm
zjk+T4ZFJ<S3jVduOiws;oL;hk&SVnL`##dR{NROM2h;Nc`r#r9=dWmGpjj&YoTR=&
z0=jB?9)wc$4{?0*XZDWWENtmnEluH`(;Evur1W$)s}&O{8udHpH=*h3{N%LL!DQ|O
ze{(uvjy)R6n5NzEE19_I-s*6{C>V)+#l6rycR{?TEYB|6BrddLq$aI^KQedLT>)9K
ziTUw4wfc}HwXSfI3Dj-l&NyIsNjB*B;sFOosL=tyTHbORVkmbjP4-SZ!_I-u;IQ5D
zDwgI+s)!kKj37Nf%+Fb}ur*%%?FLQ28E;)qJpuwGmlEf@L6FaMo=-YR**U>qcgvC@
zN}%EAECD)rna=7*5K>uo^K7>IDP`>S%tW%lWYW%Sud>YGUtU;HN2At1ZD6DT7#Qt}
zq11SSpUS{fWJw;MdUx!zV4s-G-WkX$K9lxv<=sz|Rxylo#}>BDB}gyAht=L_nGUOU
z6mUh{RD!_X`G@&RlIL`$+K6L@{96=W;gI>O-p<_~aIXVr6c6Y|+D>z_r!~OT39)#_
zYT7_#)q2*kfkue-@N*9Q`Thmr0oJsf1@EEQuzDqEB;+dgMgh+{(-^h{phNEPQI>58
z1%75&U5ss-j|yT>u(r!U&n<d-xgsczA=33M?wG{X)40?MKH!8ywTjQ?GPKVbVVJ%V
zc&`IgJ!8JBul@e}ReRw^2n?=O%=ruTH!#C_;>O4R+yvu*KZX0?iR{oo)#;IR%;OYh
zvkyw_`BY(-FZbDjCxdG`Yvho#Df&^W>`&B5l)1jzK7@CV8sPYb`iDlahZ4G@S6)Vq
znr&9Sr?QV8WAX&do{&ZhV`n4b|2;F!WP$<aFcAvOKR?BiwV}Lu*6IkG>$`$&t1J~h
zbR3aozMo>d_5^_|ungE_>x8w2)pS%ZC;yA-Qir@hY20B9f-Y*6Yi5EM&s*s)7jY5+
zmPx6IQIgf}%CY<vT}Lt>c|`KW_q`MU9G!dzy9<aWWrzt$Ld&9;xN*XnJN7>cT4yN$
zUy(0t9A#kc4$@mqXF1a|DjN@$37b&E<)8ngYD=VQQTph->9Ljmjn+0!G(BehGY+y2
zf^MTB{10*Y>0a!xkhynt5o7%T8noB9C|OSOMRYm+&5Sk}h_AkB)*-Nbm*{%gxyTL^
zF#DIx5Ud+3RqpnqG7xpa%d~!0;tqH0)jvcflpXsja-kEr%0||_s9{3t7%l;1yi~`s
z>B}~*T5phGLimN$`HZmpbZ-Nn2xO!jy531rVA04(`g)ps{NQ6WVYf^H*SfBpb5WIE
zEAi7dsNvQo1coun5tXm+eZs0{@CtF~rWVFd$`;t-`P|puwmEPSw8`W*R7)%%FTY6A
zosPRB8-7wbhSq}gEKEE6<+^iM4S}k+>s1}zuYHQ>pmBQs>pf4U0Q7j_NBWEJWi%Wx
zjt!pZ-IaHma-YaGrwPJuaX15EI#Dy@5i1EgJ~@#(tY)@lyYgXATj-$P31!>qzR2SO
z<ej<Xyw`S05=xZuw?8<@3>FAT*2CP?UhZ2$uFHQSk&YwbDlypQek3gSuMihZM^04c
zKxIWKv3>^o^n+9jbTTaSqT_vE*!0*hmX$b0kZzMn=O1ajTPbiXio&~6Z)xg>teDnU
zO-@2B=-%D_{#;~1MY{0e#o1PgI}IIoljH4yv991j+2%NX5J*cH`0g6$c3cK8@}=MP
zy|Wk*BbXcmm{#z~SsW`KV4sMgv_|_McMk(@R=>>)wNh*PSzmIW9#Wsgb8g13s`dC3
zYb*`i;0AKfs_^Eoo_;okeIkTGzt#LqP|q_CQ)qLTVAe5B`~NgQ=(xPPog%${ATBsP
z8SeV<iy+VEy+d=fvm08p8$U-uxeInLhI3*<S3q;6m#kR>nE=&8(E_=M>)lQ;^9U~N
zn?=nq1*Y(Z1+GPVT*((Vm4YwY1<60KdatJ1AlI-nZ6<=Rzs0L}Bn;KFff-d3ryp@X
zK@PnYYU;j5dN)M=8<so$dJ1h*zE9Ig(};Lsb28l?_)7Bqn$zTilV(v&n7^`0kBB1s
zyx*thwRbIa>u<_mb5G<=-!#M>3=|jZ>h8Ca(gnWwroUg;kSuc8RU@<7wPM?s!3YGp
zp1!%LUcH!ft8&k||3<@9EDqE6RFUE^qD<@l8Z)WH4JC}hc@R+!r>OEQC#zcn2iq~G
zuTKB6_Sa5S&FI5hAzBJ)rH_Kqo%K4W7fpY&Fa`dyxB9i<ckExD6f@eYK8(XkX=q^R
zE!pnOkAg}=2)zM;-bxSOQguzNJUxK{I|(MkYEgPngHUDh{NDcplgTAV5t!mz(tCL*
zXo4qVz-jy0biJ<B=6s?@Qx9+Znyk}$C%FQy0}fAC_-h#onI#Z7Tjdz^hv|PJz6;c)
zl3ylojKSvY9ht96MLzcH3A1D<M<UVs<OtJD4wydR^YVHevXrD3*n)eB-+^%{$<x)&
zD?pwUO-l6fV0U5sLT|E&;G0K%=d&>4`;eY5;pT-pH)5+(c@S&W;TtK0J8=B=ovD<0
z$a%g3oT;9wHpyI|L9HVNcnw6=P;!*ytH#ASZ$zMwn(t7N48yiFLkOfNGoxq5axV)@
zf%E$;Ck^vekH44#Zk14@{>JG<80lY)sWLB5LZ1z4Be_gq+x&nj7kSw>{ZZ+V*yth+
zqk4Dtx3n2ld9kY_=5+V;%6$-P<!y{^%NSY+z2oKxtj&6|wcEmmyicbYrUtuRxq=!r
zU}~uk*Nv&EE0OPxm;+|ZIw`$BTu{&~Aw5h$NV64I>?RDd@swG!i@6TsD<yN4s1TVH
zv{l!*lC<7;p;WR+ks4;JEwEQXyh!!|C<Y~31mp0}%v;3UB=XlQk}dn3Wj)#vlk(w<
z*so}3Y%{bB7EC%H=e;}bpqfb`=Ibq#LJ;kbzS^8`8EXvc^e8+}@Yf2<uELh<?S>cl
zh-l@;)}G)ico4kT;$OHz<noc3d%oc3FbOi)Z*Yjse*=UHCaB$1d4{{8$)=3e5B=Rw
znul7#?fYU#r4$s~eE>7Vu=&Sl#te4;bB2$lAtNg@j7`#9EzBeJ+twAqvEl9akknGP
zLPd_?=}=GJ@zN+^^`Cn>bNYCy<_e&nu}=sS%zIMSF#lv0;h!~RwGl=%mz7w7kJ~t!
zG&-u0Tt`LZ{88S2LkxQunjMWpI_cTJP}4J~jXMGb0(*N&4|_~8C)%zd=`^QNkgQ}s
z?rt4DLc7DoyQ_!F>!GS%EC5}Oc|P*Dt_$_Uc`xR(d$e|<dh~vUf?7Xc*xx+ftZXiV
zlpZyRjyDU#|ABq7AvVM;IMX6aXPDP-=wGgNkLlk#evpurV&^raHM#fCs5NSx{TcMZ
zm84K^NW>9TOLYsK^2o{Mms?Wp&}P&Npwum*U(H5mJo7Hs%xAUD{Wl2-^QXNSd$><P
zKlcP3yc->@wa0b~#aE5KtXQvgeLwI~PxJ^OGXAY+GPy65I#Z8v$r4@Im@mR_f;`G+
zdFqtD3eiPhJ9#shTFO7yRn2K0TWa)3G*AERP;lmrjN7n^ChnH&_ibE0qb@x{$DEtr
zapgL`JH`5gM-b`}Mc3?1wj-)JVS_g~p@U=S=8h}2{W^_D`W61%YQQ6yxND_T7%q@D
z!oc!LJQ15fNtZYQ-V~&HFh(!s*!f)E^U#wM--Y$tuJFK$NE3?MhXiC<12jY1a)I(H
z^vQFzfNPC@GfTCqyArq|aG2Ego+MLT4J(T;5IarA9DHZu4pZ$d!hb5K*(~CkZ;yEc
zXyliSidy)P#8}z#fGpJCe7OcQO9NZPUe@<^l>Ym?F-|wkBe)1|x6f&cP8<Cu6A+oH
zX>{1mFp8SwzF`Rp;&8+9dUgB$@E_E^Ck#J*m+DocGf2sEw`nu-&)J23U#HM35ekFF
z(YW7wKxy)T2UTS7b90#MGAQi4<O%vk36-d(j%z&${kWOrLfUoH8f0PpGZU{UdkW2`
ziuG5?TtC;J3taxK1v%^0_9>q#&iPkSes{U==<{ZrVMcUv12x~fveN<YKOdQ1QHII(
zh-n2Xar9BaOo{D<28rc#ALsxC%L>03euyTC<V=f(J`+pTXT2?XL%)>HRu^{F@XztZ
z#F|;~U9IUdFzF>gwnc$xVs0U&ASzLDhH@+S-wRXs_<<njwVj_<VN|1(xF02-{8i|a
zjPc`AAHZBIl6(m!rZ(6vE3_=-AwoMVzEA3nfh!_vYDygz#P}b(6s}+s@T`^zttiPx
zaQt_5J^s<Uiub~Z7^wZlAu5G*vU?@xJ7p*k8C1z~Tfdq9mfvXsx*$CMrpD&BD>S@z
zwD5{j8q6UkU7gBicDZ8txis}VtSCz`O3lF%#258*H~^v&mj$1Q`uyYFW!Hi>^@JD9
z&Q3I8Jb=G6Idt+jQzMw&VkoUmhpjMIhIZzqNTW(8X!Eq3X$1!CH~uo037M>7=QS0#
zw?)NhzIu;kvQu^n3F_$nGMlxSK~v1X19J8)jQG&O-IGTgZ`lAKebB9QXn)0inRKnM
zH;3ReJj!?d7Drr}o)ZvYBB%xHGB+TfZqyyuFRI7%UhR>#?I%S_hBlL}^xgNXj<tVt
z_>3ax)9op^KuYh~Du>IzHa#-d>W1EjBqJw_QA~4>1;4Jb+)trVtKWn)$vg;79=IM&
z<XCErzf#_tnTE4|jJ?rL$OurzFjYJoV}^l0d%AQTS-5!oyLnM?16<*@a<a_>E+}pE
z{MUvuWSwvxVwre?PF>E2o^U-4abBPc@yriikl}{jTQ|RyW41+Ovnp?D{ycGPqIav0
zSo@KKmznaz5;i7B8F}j+=|N~C3PV7qAzZoxdgN?J{kA?LTWvQ@;TemZ6AU+%c+fXc
zMG#lqH{@F%+vHmPm$N;w{_h^A$<Qi6RU%IkcNp|2?nwQ*3e2*SK0UbDF)Hk2a#V%+
zn<PZ(c3@qHTW?n%W*4%btZ!99zPe}$Ujits>(axwVnoYS*G7P)GV|b=S|>_?ho*OU
z3!6r@4}|%$w?(YSMZfaOUwM`~TA%!H+{X5O!B~bm-$keb1M4UZ<Vc2JJ|^~bY~7h-
zAt~Yx#~u>L#?o4x4zk@geI6z<y{Oh*-F2JXc=;EP%tkM8Gpf||yLlLh&s-DL!@Y)}
z0m+UqO;+`B?p1vW_s6KGkRi3oODtH~lFeSuHuZ!Tt=pF^oRV4koiQ&>1e#<M3rmA0
zClj`dtLO3-K|e(7XmRjn^LM)F0}qyKPivbCQD$`{v5|%jE+#mw^<~+_jOlO^MjlQ_
zO;!<Q^@z}M4%KP`&NtwWvSZ)weC#Zin*VZc{TG|+<cyo{Y<pNfTdzdN1#NVsHVI8c
zk)UA0h8#{8<Jg(6+^0WyUE)2Sns4dcxZSBw*X3Eb_v6~GM_bv3g5|Xr?hNizfumu5
z6bd=R%Xx~;5|u@sC?bj}uWJ#M^({R3tqJ4Lov?M}S!l<FW1@7(pWKzgW$T`iEDnE<
zB+X1MRlY1)T#GUQ1>-F{?bMo)86EmM0_Q!`e7gz;_Lb@8%G}1axIVxBTZGxk;Wfg0
zm!&p=^eYvrB*xHB;e6p8<q#%Yn&fXd!bjOvlRI;RcqZ!wj4VIbNe*V6=o;5#R#E_L
zvdrTT_&T?m`P4DCKAXon*0Qo_9p%^8ZMxAX(E<z~sTQf<lZs>BStq)r*-f{zkgsPV
zQ&v8-uyEf^(gU1-iV$r!jWQal@;<W(>ditjG^+nr+xB+s%BPpU5?GiwOeV!UK*|Mg
zc?l`nwym2xP7xAmGx(F7#-)sw#r=*#*97FAfe|3^*nVdu5-mh@o*XM5nZ-{BNu%@@
zQgdf4&)QY?Jgl$tZ#PKayWhlhf~_u=7aprt9X)2?T3?sG58P4pgj6@~;QC<iSbCK6
z1U(!O(RRVUd^Cg<864o9j$FIBuuN+@Dv<hh`6&w$4Jn`b(#gow3cZoM;k*RrmA@D7
zh7U423K*ifG-zG{Pj8+DgRo1+&aSPi?1-6KN{t^7<@#Xzr_fREr>sk6nU|r9^Z#(Q
z0#>b)-{_)Xz2EUByn$8YwMT7r3)ZzROg=5lM@x7bqVuL5a()P6z`S#&*_4-rj`9FN
zy&8l1(W)_WageKGwy%ud&gZVX^jH+88c>l%$>Mzwp4U@aVX@nAtcAO4ItHxW6jkoS
zw*&6W0KLfwt;-jG(%gy0l%Q3woOD`I)bWd}2`7jYF3Q>)0075x!mB{($}AJlkqs2H
zk#jmfN6V)TE{^C@ge(K42lU10xvk4rP!9VCP}^>!*pK#BtE=#{ODFdx^N#7kUp@7P
z7m|3}@XUCfv97l_8NX>vge8;~h>hH+xJlo46ZJCi(+Tx&@ZTdpTp9PONQDS7;v-Rn
zkQg1TtH^P^G>wphjq}zq!5?#<XJAR!%p;^-0&;S^o@wA9k*OP{31q5tyS64wyjsYm
zb0&~t$8Oi+5vbn&RQMIy(?6T;9olp0Af$-bO7@<keXF#K<&@yIIOFZSR|v`IO+F!3
zSmEoAJG(rpU`~gXc?4DvN{p3r$|gIq>h?`-epyxTyOhbq#BoN+ob1nohike_WE^6e
z*J0=WV+GE|6v&WK%KEbnc8&p`lYWtOL|cW_fsp3qa~i&~w0HOTMgRY5-3MY2nmMt3
z)_#(jG&l^k?dd)J?!t4EpFD8xMXU(ZKI}L+qBK-xi%M{&B3~fBfHBVB@99GPV@I5q
zO&O|BGyq@Jx)NTWj@^XVG5Wu{b^Mo4SK{Tru9SSCOFHSw?h>jRZoX=LV@r@IMaRZ!
zK}C<}E+9L4SFp)f=obQlh?7g-JQOOh`&hncQ2@f;0qA5%vlvW`x^>3z{5RvqdDf|&
zm1$sm2=n!pTh`t%&-Ti^*msb=<TW33wH4l_Kq4)`ht`u_W2@cig-p;0`r+POG;f9&
z72#KKfba9~QtF}n>xVr8hG%h(p>>`cN%SyzHKBqW62IS1cv-RT%|YGN)~s>N5{4e%
zK#jN9o?FRtYL-!1?-YQEHsw-&@=x-pl{A3kt<<^ba5nX3>hmdvY`-SA)3)Jf+%UCE
zZt%Tz<<2!|>OcrfZCWdAGuFjLXGkAZMb<YE(sx;p)2YZg{W!d?n%9kX%hqR1CK1>3
z!STUc&c5fTDf@l6(zAXL80lHN+jBiPq7le^d089aYggUx3x8U;Z#y?R*HWwP&5UP#
zR`MgIRfkuq<HW}_XBkMoe?NJl4CGKltnJ-;LEqfxGMA-^5K6UE7e(IbsIA`Ow;kuq
zTlP76Ue=-(scwjS4}c4U{|LBemqq5wWx4!GH%O9W6H}XyxSPcM%p;Aj|7Zc;r|=Z$
zo{MufEFS#9gFbKLGtL!*iA6$??Rt|Gh_m-ePS?gWuV&{>R`^?7gr~M887uh>y%A!v
ztUY<G<j-%S11OXEU0whCfra#O%<z3UGzWSR1H19k6%u-b5*}xAbuqMpfB}bsxh=Q(
zI^AaWI{|U^nyDqWI3kzQ-BK*&$pIoCbe9avchzFafitRkC(WwiX>Y=HPzImhV#E*C
z$m`Czd1qVhj(fsoEj*2#vRhB=1}dq7Cf=WL&JU)$rJNnccILT?)vUxxP>gcL*K=6x
zF3jWT55J-7N(*)Wxrnwnr|JOmO%TR<a<{R8595xoVwK09s+-33a;_PxreV67FY(Vy
zXvs4qv(-{93Z94!g#g2FOx)J#y?fhm2hyjoJ?tfWKfOMB>H^4?Mt%67_=TlU8=CF`
ziqWuxxcY~Z2S+8YRrFY5aqB;myFM_(wQ|r_1@c@SJB)L@xhVLj;D)bqNhXw78>cUC
zGC1ooE{aD?xM!awnsc26V~Q08xNlsO{_YL;@?66N_kec6Wm?oppgl+s_kfh)-Z+hx
zQV*9<Elkemz<Xm{^mRGn0IIp{b;+%kWgi72ugEmmAZ)O8ClmI*Y*B)csa%Jk_w`=-
zOYb=y9g}myLHgiZ)B&oB`DYwvmh=InMf$7XFMG~hO9Kh#?Rr5%e;zpWrJlOUtpK{@
zXWWQy=ujZ$A@`XJJ~yw<obBH%zimaCcyfmCI8N%d6z%#illa)U;90F){UDQR>k0Va
zp6Xt??sFJp(DE@R#NNKm`!Nsl1i#l@o&|Pn)=nu~QVaAGgx>SD1X0EgnJx#7!(>ct
znxCrgDDNymSA%_8z+@Ugmo(&XGoADpSNPUtydjHC4JPD}a}=e+1P9bSiW6lPYY-NL
zGfAR`FwL6Y1CNoOht>Da+lj?@|F!zN4Q_?AIqoYFAAnv0FXY~9$rK0V#n|`v4q;jy
zoJFV%cg1Sk<MX!+a6!Haq<G}G&tt)W0hBjEl9`LNKvFujv0|aqAxsEprNHI4`#Q{^
zlX&B>XB@yH=|By&Z-^kfLADX-<cefS-vqbM+Z1V*IBZe4oMQ`7UaPREseCRRhrgS8
z$6i5?Xjor3&!3c6n$5kQlcZwL=s(NzDGp$?C9F(d%IV&;emyf?I$daK1P2s5e~Hp&
zsi|l#9mgU<Fd{CEn8{_uepPsOOJGW%u92n0JT11)U-=fv=mlJ3_@&A<3oFOO6@94M
zB9ZkAc{1eOe0`WhnOO>hDNn6fH|r_oIu}L#V=k_|;FE3;I=HnQ$MAdAC3m7gW~;>o
zGV-<ZRo6gGI99qkBl2a}SA*@*qw)u?l(ANDh?@e7Vpl!Pt~74K9Ifa#eHF@;<$}dO
z3Mx}bj$TBq<w`ywGB}xBvtw9S5<O(2=x7?!b{v3CO8(3+Q#wz@mnr}5w7>fHQk#y-
zP*nD^`*mCUvnFj5%?8~v@!L+nt8QUiVGpjF#8nq?S5SA1{`4!pD?Q=8TpNR`<xejr
zBsv@O;C)ne&AzYNG-?e>5D6wU+o@Q=38jv2h6+V|h;C0mdp||vOpAIDk7Eos&K@W%
zZW6hUv*Bc}ke~L5^C%>vf2f_7?t!_iV<5o%x#KLa&1n$nhcg5maAJ-VhIrnoX#QXY
zH0rdc-%Tl*H=IZd0t*>$jfX$BdF#XlIB&Sa_*%sFvo#2CKxMUu*hjL<!?n<}*kf-~
zG981EIl-BMN8e*8<^GlgldvP;`@7Ca$};+d3DjhQn9~O~Wwty0<sifs2)wIYYkfCJ
zTq6f=&})RQL<UYm^8#&DqMMs*Vv7dCpIJEOGeWr;za`4wB4?{(W&2`3Q%gi#IeCaW
zd?B33_?_jaX;@c(W}8QiFqB<x-`NfKp;(oIaZG{W1n-bz8(TD+II<$v6nb3Y2<gdI
zFMavI{I!BzOO|ky^BwZt`%%_BSDy0_KrkGUhJQ;H`m;Z$TMws5ge)AOj@ivYit+rr
zl5;9|@p#W$K=o(2gk^Um&^Au)kBMbViaX*z)=&^&oh5~vZeBoS{6hME&~Bn(Lg7;W
zb(VfFRGZl^_k@~7vD9#UgE|gb7s4hFta07Hs{Ey>LvHIeKhVI3ddH)GB?5OgW7%M*
z)35$bW_g@uFajfAozB;QVLL4`Vh^Puabk<Zl>JJ!H^lSptlM4xyCvjxo`?k!;UN$c
ze`>EpyMJ{q!xd?yS;NX#6%eG++<y5fiKdQ8Khf{6GUkjsJjv%yHJcXU`cncOKP_nU
z2@ua2K&?^_d`pd8kJ(Yk`s3_#9Xaib{q@J2JO2u|SieIUs_dm7^V^B_L=ju)Ah5lx
z=h|uKCip=Nd6=|{`&UJfeaBl?C&AwOp6Wg_xMxF0=^x6)m}_^WqIs;z)qSa_6HzXn
zclD#q-%A5lJ8{hzhD@QY!XGU=GRi}LpV&q*hx2O*@ssY^d!8s>EE_<)rEBqFUw>ZR
zXRclC7J~<@G7b8L3x#&Eq_Rx=?#M1L5=QyDG%x;sVH70hoWu>dQRP>5RW~D>xnpxz
z#2D5A%nt9-@OO>wlrgJ_#kz*!vSNF9)d*}{Iyw4zr2^rx4pPJjpc=*e;>U{H0sokw
z8=m_J3(#$S6a{hEXk}vS7Ihp&N3T~u^BSWleC2}DR5H~CJ?K;pjDsXLfHe10z{MXM
zCkBnLteE2d?yAta8$Nx$I~EQX)U`PZRINfoxMm`uTN)}I??+kZ4-I!ET4H1h?XeK1
z_-H8YN@fKjR~TGksiWxI<6T;4+4-&7H6q-ub_3t=?b#uMX8pf)7MJ2G<A72WOz(4(
z3r<jV(1h1*4Yu?r*?@WpV?+&`6I^LJJtzW>%t24x*f=<ufQI^s=kw1USPQ6I`R%j$
zN@PiLYs?#iL;Rl=-gf~!kNR%o(LxaRleP90EPedyfal$7R#&Fzu*&W0j*>SE)rWiM
zoX~ywbrNOR49j0{gJ^#U1z*{!(+oUjjozEtW@s?J*LFZ0>R7t#Sx-ss8i{thE<z}-
z&AK|_qJ?9LRtAEhFG{)Jeybv?U%&B+?STycD&gw_xGg<))@S)yYfz;nckamz2^&ot
zXd`lJs2xW@5E4!~jpwp?yP2FnH3j)2Q#%|nNM?}OkBZ89E#8!eN`D93ZpNTZ_1vWe
z<_hlSIxeLqh+z3G{Z$+1Fg&s-#x2|xNVX9~qlGX9ZTO;%r{tkQ37A<&tH(JDCAIxq
zbmf`7mCAi>AvPD(DS>@lGdFam3c)(9>Fjiv)rY&nfs+{<a_H@n^pZ~TEdJSRQ<ob>
zKXNO!;gGT1LUr-Z(#R)icyxNSqJ=q~yn49e=hnP}+*a!lHw=uVgUe}1Lc?1ttCb;n
z1HaQrZG<^U<FnaHiPxjsuW<iSpi-PV{+s^>38Z}Z%*cI5!8?(8BJuYnJY1Pi?CR%U
zqVjn~pKGgwVjp_xYG^|0aK;k)%jhij@oLBwa7W*ZU`L|NAqVCbTZZiyJp#kXqU=|q
zSN!qTYs|jBN;gz$1l8tgK1yaReKOT{<twsVm@M^E7N5GdCBagdY?)%0fd}RxABWc^
zh{qSdnHoFk>@9i~h|Yi!v>dZU|H9*WQap6^a2{;IMkX<jLn-GBZ|aW$Z?11XT0-3D
ztQ}foZ)<-G(HW_5@=^$12Fh5;pLnkegHv+#aO=K9zFEyQ>rgt4OC5`<lq6PT2sOck
z7f>=9z-a5{vDPxEr*fCud%Na-Rm#)lgB8%J8lD$1<IuZEgz)02oDG<}e3%TK3;}GU
zaGC}U+~p-$YZnnIcTMYvV~=O=^My0?3x4inYi&+TZ=x#84asWA7rfauB6b%Amt0ZU
zkFT#!|0&AUgyPWzhYr4n9H#Y3-{tlC%%4ik9)boJ#;Z78ejVyv#S-HC`Y7JbcjO!9
zT;nZq<9*}Laak~%2A;C^VFCUMX4rT<ogs_qXZVp;stIZNPHMbXm032Tb(OT7%RUF}
zUPX`|v#dQx5=kQ?Ajs6a`0<$o5X5O^O!J5WH{QE8ti{XZnQHdJtd71u8$qoUHd2nt
zd>1=+2V*WFV!x6?#J;@Id^?S$+V<M3Gynd*?xw4fX_SBm@y_`3-GK>{w#3L5>va><
zn8~Pg_fNFPgff<{_D?L8kO{}O*_rJ5v(pLIMQQ%n7Kzoc1C9y|rT60WxuzXUvF#rN
zf7#sHMvLl)X3GbN83Yw(Ba!;=d!$kSEcRZ+iugv>8zE*tkT6SCkSIoLKa(_fErlIS
zlvQ^S;^2L%$3;vkyJf8yy{NekB^FC>j+s>fO~Yh|CtC^wbIR)n;_KyAS=zB;hllHf
zu=%K8q0`6bAa+8@Lr6>c4IEK2{xaqafyy9P6Gt1e;*h~d@?;tXf<9~|N2vdE+k6%#
z5&`Em2-9<3X(i}ihuCmJiTZ%qgeO?6z>6q@EjfLXAWSuGHkSSuDzV$uavabsCVJzE
z@|iM3EFB=E$SL)HS#q;{vI@{z@fo;Qo6HcK9Zoly;)$tz4JZAfK8a%UJD#-lktpO)
zzOTDB%5bLnMCPAI^Tq(`c;(f76gvLB*FXP9F(4*co#Ex4g+Z0%{dS5tr$@MG&9!Gi
zk@4XW$6=uEf?!xJKFiH9?IXd}v=hE*(H)kaGBb&J43-_dR+e@RIZ&c)xx>ytzksXE
zjc(R$OSMHU%hH~lDKuQ3%%_8p5){KlpMB|Wu_2~FRp8^6bpFj5OAlms1q@m}$1OmY
zJ$#gGhrjbIet^Bi5V5M#jD8J$=DaDR!V}gj_=`#%iumjyFr>e=`8$v@GEpWTn}hG3
zO`)AY(1pj;|Ge33NS@wAY&IefUWTRn!cb^372EpZ3qBj6L1)w9D?G~-W7`V<`KB)1
zlM)jj;n{pTWy9m&If+Frt!ouT1bIvZdF9BCf*^}6cvF8GMIiL}DQV<YOJ|pPmJV0j
zu!wBXFfoya9)ph6ffk6JmC{inaR)1rjLHxYqIL}az#a<Av4Y2t+&SK^!&TkdQmLMN
zxoCuu#nKA7H#AbdX008o`~7Cj2~H1vcJ($17<w{0u1+%9>|>1_K#4t$aHEI5<nMTO
zdoaEEOY|3S$uZ8{s4{kX&g8(usB0gF4#qz!Z4Ug~?qzM@JKAn#oxNMnRw-&7t#Q@(
z)#1x~EZ;CcXnl&lU^>5@Rwklc6n>fXQiU)x>bj@!Fvvvw8XFxuZNApi7qxto^)nk-
zZ^4rlY+AQ>k7MdczARj>>%(&5qV2atehmKy8c;e55nH-<K}H;sdfZwgoagrp^Z+|P
zNJTI2EX|Xg7vC3)(GU7)dt%o9KTLfEP+M!)H4vnfP=b{J#aal(gS)hZQi^+vyB90&
z6bMkP#a)6!krsD%cXxMp`*ZJoznOnBlgVU~b0*K^th4r7Yww5bZ<LVBAvX84XoKcs
zr1V!>7sd3SxaVx?n#v(pNm;qy1C@gpx;}DE1yfMUsea4WnN;L1ZCBm7;Ms0iS=w5M
zqL&mnkn|{}*)ZX9DOG6ZkJOrjm31R)<BlZ&1TzTE;u4Yn2&0Dv?s~o)qz^wO{dNz4
z-V$n!QN$HZkQyoH@$81=&*4X#6s3I(9l)ImfZyK#M&L4j<@5s8MbRqLQ*Yl<UW~d5
zW5}{6Wya!B`0~g_#GH#;dB(Oj?mG4^W@T<sD~8^0mhn36nxt`CEpQ|^kgk(t9A{HL
z#)Wb`DMvl*tH!sf9XW#}jpCadbj)0y3TwYU=3P`*xG<mY>%qSZvb=s?!CGT1RQ5Ap
zoSc)7Ue_wWX9e{QzNU%0|8A0VAdF+sU59M7rL(^ialqG&5&b(|V&M=^57{nR{bh1!
zx@zBc&P_(xb9k5^ha(gx#Sr~msH1??h4qEn^;>a3B=E5cEHFZoDNh82PHcrfwxAAU
zseL31Jwt23czS3`;R1<NIelBI<Rp&xHohv@zNNkzpxgbO-e6*V2W8`bONit=^0@xe
zrc3U8gco+xIKUCAF3##F(jnLeQSHyrwOo`n93G$T(LcaL@0U>6*e6mXTjH7#`IVdw
zj<&=+lY25!+H;kfv(T$C3i7h&_46FSHk(Vioov92<bu-1wcH=Sgyh?j%yi}hN79%9
zcmq_N@l*V<zPV$9{~T%`L8H6=k3^!+^>aku?Ur<BTRE17j4x>B>BPRv<3scwUYjLy
zt9n^(h2<X;?_#%UA0`Bc8eZWgzqT>c(c!F%CXFyI{_Y!FW8dS9KGJh2c^(CsurMmh
zXNTFudc9s+_1)rHfoap~OU_ug@|<AP2_^B&<>HwUo)FA(7q?I&+UCV(dZ<U<=W(+Y
zN-EGFRx)cW@m6pEbKx8Eh7lBR`uPIfauj)<Beo6>PEFc1*JpPt0GS7r<Az>NCP3*=
zL1mU7^xx~HXum(w7mUQsdDmK&?_PS12R8di!t(<)<Ohag4P#+-4goyJX(Z2EdVKPv
z@#MSS6UjG0eHZ!rv+qrKlA_}Re=388n3u+?llTMV+B!c!w^5qx%P$(=A#Pj1h`$C&
z=jWqpSje*Nd?e%ZK2+S*4&2KAzgYnFUv}K<v)nR2GX}@*Fs`WFx|vI>zy}DR#}0Ab
zf~)&0hTWxW+OxANM_M=Uqqh=x2Rs8{((Ql7`J&7yDXe()%LZYH+19d<gq%i^0Ngj4
zP%h#>=6@uts9kqk`wD|<gPC@&zTF5N<+z)3O)G)2Tnlh16$b3`Z~z*fduA>|FL?*z
zmHqerl5w8E@0&Aw#@oJ1hh@i_?)c?U3SxILb*amW3*MXt-r7XX{3N6l#n`QrH#JMF
z?V=41O#p_i5lUEBfA`uT{mccT{B}-z^DLZmX%Tvo&v-8HVY#ZZj<b(@D0R*`6*j7#
zncqB2G<Q_=(@<WKWZ=`%teTtWrn2)&ZT#og9G(N7nglL<sykOblya7ixAwZD?A)dP
z1p{tzjYt2#Vep|r?YF|tT)&Di1^M~i|L9N^H?)^H_%RlDsk;+IABea0l`DyMMAM=u
z|HHoDl$iUl7`<aPWpKD&@a4UG=USuA7R!80i*bX7#-#-}kYd6no6!(2drlp0R<3&a
zCWOR(LHFS4<8SmgHY_5UaA7ME{>!IcyQN~@7<HD`aE<4|TRi8MyWMC)&eerK9-UcJ
zL(F~>1u1`TQzqN(=<u~CN|mqeEvZW)Ijx>?Y8%kHGOnG~cXyAh1KIJY74uK6c33(3
zgM&R{i4JkvyBfT4Wk&*1-jncmE>OD&?Ox}veK=AnSI$xV&34y*5;E*faVP3x{Q@X!
zdZdy)0|b5w#H9@15_F&0Uh02t(8IoNUuxw<$Yv!6_~!<;C2wXnt3|<fMt1z~!|aO8
z<qH&S+87mF>)r=MvW(c^^&O>1FhZ&<K5ueQ9Nd2ehC~MD`?fd#`e7IpP9XDncdc>~
z#pD5H#G9DoSWo1K`Ch~AWdr7AF(wByAR-49wp8yEeEG)=-OttGGj?f;53;P+0Z`}y
zzEC`Y1SavV?14hVMS7RsOH~AGe*c;YVtr(0L`m%*-Z}W}7`qL=5t!5ATS<D38D>$T
ziR)TaOtrXt+|0;Er42`Q*$LNHdmjuHkpVRrF-`Ol0EUe}ziLQ8-8N7+gQ3QvXIPIr
zlK^HFL%t^c2+=5O{cGBQdng*Sb9GUm`9_|wzTD$aj~bXd4w}^JWbU`v7_C|4mU_q}
zJ+72r;g_8fLS2<Y6gsz@zxk50CVHFuVGiq2U5t9?!@WE<(*(lqiA?;wl<z6_TM0e0
z&=aBa80oRUnc7I8bGAzEaOt-U9lghc<D2SELvjg=2BV%2x7Ud=Q9(MyPo_EQgkf1p
z+qpe4m(*ztwKE>R!DNc7;Zes9?{RV(RFY;D!cyo{zgOeoIt6CEbeF;!YqlLrgEJuf
zq}>x~l!F$@8`W6w^RZ6yJQnW15Ij6HoySuO(Tv?bknVPj`XnQMfP3!#&5>Wo5)%7K
z-n^NG<qOJ6Tzle|qg5df_4S?N>KE23@+v}nvBIbA7sZWi87nU8t}UUO^o9nCuL|V;
zGPutezFX#()O)RvBBm=@$L|j=;@*yFTkLQwm#a5%@vtlW^7+PD%VyQyvgPF~$x3jl
z_ar0Fw9psr(B}z%sVF!oiRnmJt5X7o*DT>ZUCtuUidpKWdngcc1XiUZnjz0+jQS>w
z<#=Y^xU>2{T3mSCL3bT<IM=yle_W~mrFZ1v7kF5l8kjfOW$wj&q<!?*9wsvzwj<_+
zS$_(frnY-afQdJNeza0fnq8ie_pW4w%4}b3%%SxNX5T&BFzzexQ=rvx_J8_sL$<F@
zWmzvVjBXWWLYd$pL?nlMfa@{Z(K?{@2l05+!MevWnHdRLce!7?0(P77nYN=Yaq3uq
z!>J%$HJ-X<j(Y7lD^InmVVqdKjyw~Ey6@Jj`2?F<3beWEybMU*Qtg<gaj#r;$<T;y
zyxS$ly~laB@mk;tfAe{U0bzOOwKuOLa|c&y$+^8fI8N#Xxop7g6ng#JEjd+FgpQ2S
z4znZ$@3Z-y^e64}sq-+jz>HPZu%O-TiPk2f*ME{|xM&ZV<w%wio_W@-?N7bpMS6=E
zUH`kim&*a{D7wUL_jW4_H3V|1McFBT;U~|EW=tYCn~mYPeqh!M^5XvGYbwwM64q?h
z>!_n%#t4lfb?K!V1j+t32!cP)x#&8L)(?6lFg*V5%?-c;;IUtOf?GhuOc0AFEM?Mu
zfLYGQ!#`5FH>TdkZwBB<D<|oz2%}}bb)>4kn-ARNMIW=jW0`n{`^+&kIGl6-@P`LH
zZ=7QXw=~%#s1XFRUa-F>Hn6=XCq=Yo07vXwK$C)ch~uBglM1BJkgbCeY3l5X4Z7xV
zvv>T2Jy%@Z4q$NyKmE}vdr@|nd^<QfF1F=i^#Ul;RR>0_!8u{ZoZxwNMu=zK&=<_J
zWYnF#18b=aJq>S>heUAy9gAL&TVo4IyW%HJ?FE5}v-_W?-N3D<Ck@E${9<hY4P$Lu
z%eOskATVo-1NNTNy=m#_E!@nF#_!CO-H&d$i~U+etAF|C+6}%<PDuK@oIZZ?%Cyip
z$V3x!Dg_(3*+J}g_B)nF;28P)`=Uq@M{gmutRuO=r%B)ln<>YLW1}gz`1u~6UrM!)
zd`;DP5*44{yBh{nBGK5E`&mz5NquV;9uqpDtC?2{xSxEJ1J=MI?y#N!9Fe#Is<Dm(
z87n#NC!jFzx28dhjfpKDGlVkdb-5-`vvTJ2j!E&hSsQs%#I9&+N4eEBl}=nSgFfQG
zhHT*Yd8iDnyj;b_(Q@OhM`T3b)?6G??^1UbxoaS0Bt>XKI9aGlT;p59whbT!y2y8w
zB{`0x8NuL}=*w3bku+|@P?$7@xL_+AgcJ6FQ6|1m_@6Or&Rsg;X`s$eq&e@idP|`E
zhm_q9iz@!1bY<vMiX?`_X97*){MV?y()TnLWIBmD=38BfpPb~<_!PSd3(yi4YK)%N
zmuFNS$I$P1R1#+cW2{41TJ1_olNR)d(%xXr$zfviuun_mMd9pQjRzIt23RJ|zlA5Y
zYgHcpeFk9OM-^{P?@<#J!qRI!zQTH{bVkH}Y_ALANsza`zjgwmZ0knc(Zw!<zsvs?
z<c%g0aXMM~f=Ma1?Z?a5MN3@y`p(xQfB)v-fYnOkLQTC#J^L0|UOmE^9`{0i!OG7~
zga>QOkSb%ns%F|~x_{h4XS9V2YJE7eTTcqBYH@qmQdSadD!qSG_7;9eUZ>Vr5`?*z
z>hM#lXbREpWS^qQ5W(vB6c>TYcvn}zWJ}4rjfb<u`f=gbOr0}U7@w-h!t7x`T`THi
z5y_{&d1jL%c~h#|Di<Lq@ugWF-+XYT=S1MB`2sM*?>`$0irWrrhi6Xj2&O31C6kDX
z*ZzjEh`oB%25lGF|Lxn3Zdok+MLQ^RfUEF$GQ|?TM&Rm)*Y8)Dxwl-7#>6lIY2!T;
z#=|#?jeWmn2-0jwgX3Gb8lH(1IF_F!N@!lFt?TD)uVMn|1x%__@hFr5G4{e=9ga(d
zN=CSQ#|^jjze&7k4>&onv%qCD$HC<ybK#2#j>S>F`kC)u?OR4&m2dn%&$E=5>)4oS
zUMyg%3meMEozK#!sby@54w>$EtHa-rD5&dK&cCQ{x771CqZYqV0BGsCSUlz3V5o53
zx-&lDe7CD}?RS%U^%CPr^O_`c{=1OO$IWhh&Q5I%fbuu{#}3S^q2i>2>@l194vLHj
zf@*Iy`5#S#9S`L*%seq4F38P8X7-Qg^%j4Ith&4yZ^J&zrx2x04XKSkJNL#B(hd4B
z1I1?J>c!aD7<007K?WpvdcQ3&tg*ox1qzj%Ys5VRV>pQnd`17K06U2Tq|ZMo+5}T(
zY;7PX36fVv6^Q3G?a^Yh1*gfREIjV1G?*OoX$+-PS)y$t-owpKWMwR#)RsCxYQBc0
zXycr~ktM*g7&WzHoo)X0-yLK%WMiPT7Z5qy1-{P{?L3YU1c-NAf)Vt@06&&Ce@%&#
z_Z^@V(Fot%GdBohZWmxXs*4mMkG8zMMuaIXCOXtSmZg_yHPvgR#Q{w()U&O3l_@{D
zMpjjcaz?Ljq!L@M?U2r_^`L&xp-a5)yLnMk1af!2uW_v0=$!Aw6|RREA=*VBA1T?d
zwWgr)lDmDd4gCT{eDH3-sc=|TEPK9YAY5Z9r5Q9p(Vecd{e#;fBYxUpgdaE<=L;O{
zOXEc6B?OK@`)C<?gRNN%gC5J^=1Rv5QGvRB&C3q2qZ<D_?`eh}cL8^`DaD-o4T1_@
zMS0y_YS<rq0qOwB+tcG#wwy@V%V|+gI)FeW8~q@!H>A@|pEt*-7LI7e9nKsew>k%A
zH@F670VW_UknjZ(^%p-tIJ&#`jC_>dUpG9JAn!J7R~&gxHe#P1oaIgQTu4y>6YAjP
zsxD4w7MDz~3Cr`=vz=BXDA(<h_OBuHm}!KhT!v^GRyGk#&kW40L#4iMP<MzOqA4-@
zW2TGNC$*d{Z^(S&zJ;QMKT;YV*I<A=3!_`^S6v_vUacg|UM_@VSr6@3&}gAEusD|~
zs^3jJeS9gG=c~qLd>!?+MD<UYHC$a(lXIb@`2*dwHcu2S{uN4Vm5&tj>S^-3zg(r<
z$HhLl?~j|-$YbHax9Jn5sM458>qjmq{ZLD(m*4ZpwImsTBSnyN`;mnD`S6Ys`B4q_
z04ag7?A7e4?Ze@=M{U4a+-aIi?Eehja<XX?wY`Y1vj#E-j_eC`l;<l6e+#9Hsx-bT
z71HRgP9|paR0#1Tsq`=V&Jfq&Ggf%$O0U+2i%iB3mvWz%m{7X3J=R-Px%03@dctC9
zc4^ME;LoI!g1=^3mRgAk3=J@w=LpgC{-T!=77fRaXpj7Cd2a9kskAJ=aGT>t?i#GF
zBw<NXQpaAwC2m@$1<vx)D=3(HvpDv1g+umb#Qpxp@PyY$%G+t`0;Y|P93CAT3$2NX
z&j$XFUv(<-H*}Cinl0A_w>ZT}MrD;S`jy~LYv!it?huc*`QAnakF?1r#NB~{;`awn
zqi;C&lok({J>1{+<6e3d{t#Z)VtMpAI%(wLl06&>uU|ZzA*NKXB2PSgdmOB7PAy4g
zI3$6g6%pvH8>7K#%@d=N)a=Lnpq(9iWGMw%i2<Pk+RSTmzO+qALG8+5mgXS^W9#Br
zQ_TeG*;94rSnPZ{>9rx}8;x0CxscvvA|CW-3Tty)zKB9L(}n$i>n~s8dt7rrOezfX
zm0Hr--r@4ZMF!AHpSXOsUM7^BXwj88H0cJ#yxv#DkyJhM^5)RG%J)PEQVz_h8k^-d
zFF<edW4ias=4rbPE6KeQG)Id2+6jq)QeUw>6?00O-m#^}Io_VW(2Rr%9qe{uhiz?I
zdG<4V$<5-!tTcdY?`H^@%=uj0h((jN8i8(VrnN`r<MvQ4?jX<qN;f2VjU)T)feR3C
z6T*bc=d#D=Hb(evjct*F-!BZoq`h>VUbZp4W6uwK=>~o{Q-Cny;ntYwbb=ZpdzzQ`
zP2k&1<Oz9R)#LZVF*3Ax%14@*@DFG4-NWTiAk5UgL&N3xH4*rB6))$pnF~K*C0ax(
z7AI89L59|bftV~gMpj=5SGGi}rCE8y46~@lhhkup2^0P|c7`UW*OIh*3wpfl%I6o;
z^a+e;GRIu=_#RwoSvVfC#qs&ekx9;l-6J1lSG98uC?tr)HP`ssqUd0ttRYhMme$eJ
zgEl0$;5Z>qt;HYTf+IZzg^JB}gFLU$G1)B0Ay&da+`nKl3F`FiH&jRHBOYjTE0NKT
zL<3AV4(&lb>{CVH$j%R`#e%i<wxh!8SQ@u&<S$u@V~07548hW-;N+w(a>Yi+XmNU%
z_b$35z*o%=@yN4dvFaoC-nl$RFLi-Z9B>E^c)7q$((J#4___Hzn}^R0AdC-_55Rpl
zIYYft7c#%IQ!YpWNovW|1oT&ZMgH*F7KSN1luLHIY={C_EO`SILM6|sD5`AK*`DF%
z(o!b`YfbGX<fW{Yfx*oJ|CH6khMQUT5DqR}(JJ>?P8hKtDR`XCRLPs|(S;oV(CqLp
zlC;}Ky@X@Me-yd5gTED`()lt7S`dl%)t&%DMs3QC?>_#I_{|3Lc(Q2}qIzU|nnjSg
z*h2isk>T5i^ADKC6?9{_l&1LNT-?uG&a8L<;^WlK#?^){*k(Bw2e*m;ki9ff+}~AH
z33znjnk_OIy!So&4Si!m_zLmISut9V2)L>8OF8-kt<IaaRnPGZ3Fx#+igv^%<G$ex
z&!1VLl~QG)>o!5HPoEDgjGumiZ8O3!hpxC@da?e+C3xvg=55}prk+u=@S(1Fy=`-M
zn|Th~=EKIW@&{c6Au(vH+Pe1ca({?mbm50ULO*^2PHFI>YuWpYiJmRIHo$KN{5SSc
zF`8lzEed!x#rWM8*TwIKGqwC01C~O$6pWfrEO%T4%<oJErDUMNg!$v8Sd+Pi-i876
z0T+`$a$iBbmPOImutYpQaM?b#XZE{32LOnN66X?Y4pl<6rAp*mlG3fiT!-)}Uc2S{
zN$lZy1r*s)S=47}Jk}?KJE@IgmW_fMbc8-ra)oW*ozRuV>4f>JDAHFg#xyPrK%zKE
z2p@jE=%;cK6gr$0$-!#Gi^~W8q8qi@k`@TU+;@_%=HKNDQx3N<r{unB6)W<1caIHd
znECw7#A@PEw3718H`VZEx(o})6zPK@0ZmH_nj>*-BWsI4aSkBd`rHE5uy>}Wt3#`7
z46m4<J2v_cXN=Pk-&!S^BjafkeU8KT)0<9_`R)bpw5*euE;#-}uZCP-&ggVh_;$*)
zKtW~x0W0G@>~|Ebu*0N)xcbOZd-;I9A{80X*jLBhK45ybUK6s;_6m}Iu;vihIeK^+
zu!F~SCY>aO1tb@GwZ`}vq|VazP{a7kRJH>@2$<A?wR&uNToss+Bm}>f*ts~QOAHKF
z_-pq=T{7%z1=Y-kOXEDF`e-0L4ez!Gm67r2m;s-Vy={$cYc7qkUeElwgKqoG>G6qf
zq>KJE?x^Iie<h-SWg=OC>lGAMHmC=cCj?P+d^CVQGJp|fXr*Wq<UlVC@4jXqR_fV9
zytDYXy7u(q=S_s9-Hy%Sd=CCD;l@yJWMEQf83PlZJ|Q!%6Mwq|_OmN>!@NQTkoG%=
znl96*aOi(UlwdYDPeS9VzBD&SIUDqBb#-!lK6Rl%lBh_6_TdUcQiLR%2|{BEXcFXA
zpZt#?)|A-WOuff}iCKNcp?EM1J@5?!jsTFaZ}c~S;WmtZC+029Z_@M;ZtjV*+I|&M
zXJABPQoLD-sCIXj+h48U;8*ITgcD6d5iRCQ?J#>KR4zGom>;_Mc?C-U(!$X4eZNsq
zv4Twb!@~{mln{XUE}vwPA*|g7YIp(yBt@<`OR-3YJZ4jlo({y@mYQUZ61C)MHv{Q5
zl>tGw4^h7KBR<=A?nh_9R|OdRk0<weKZAj;IC3V-QjT<fu2wy;gG)M0_@78$`duX5
zyxD?|L>}r=4e1GKa~=mVg84YU>J5v%))M&P>sS2?=)Z!&rRW7<=M%*tojzJ;hYe{B
z9J?TM-c@J><lKY-P9h{SrpF$qU_at;LilAw<G?bz*+1j^UM7<KlZacxN!Z*gbwC!|
zorJW0dt20yipOT;ian{Y=Wk@JZzcd!&sa@t7oK~^sSTpZs##Zp6)Rv3Xv;l(X)Rkk
z0W@wWA22Z=vw_8d;VWC>m}g;}q+rApCm)`?NzkKGhn|QINBJ8q15!#heOqnefEmSk
zYl5#2g4HD=X-=66OTX02sWVk8kWg6cdlM3GRz?2_(tBk6+ysQe+u4?2wuccLd@8c>
zZ;JD$9H8PnSj%`U<}X(7;wDN;<9#;6;9!ZAt`~w;>7YdcYF)MC_&ilswZavHH+iSE
z(7s4uKrh(h*Bb-0RPX{yN;ZS1ZI)ECg49sO)!}<Q{>$kR!(!<Rq~b-94>!1buP$U=
zD%+IgK=<%90{=#~vaj5=K#&sMZ5Zt6+aAV2Q5*sDfZt8@3RZ!P6Os+T*+@-k&>k`1
zeX-xBMHpU8Y$krJpHg0M%ol0PwsCsXz_Zaoz_Y$!#XEC;;UU#E+Wc!5#nr!E?AO6>
ztU<Y@E6IVq(;<kbt9?&MvC@G0F_G?BbYkckgo_JzZz6QZsLWPJK8?fW{9Toyn6e4p
zP@@QQ*y^VRW5q4bKU|Un8D(%GRPl+On__oVOw8W6ZGOU_!!n!>fTCmC?lM{T2z`MK
z<17DuDCs~l9<H_Ojr(_R(bxj@xt2edS%VPu+@qi?+e95+iVWJpE8=U9CAJbZa!@Xg
zc(*&BY;F?IFZ#nPvi@0QVq@%xM*PCT&kCvN|CgvOt>UtcKhU>z|Kzv2V3hjlD>0k;
zZZ*VBLv<EU;@?17ITLQs0WwCvt#u;qdjb}4p2lPU6Q7Nj=*FD7blRU?cK8ZYUTX91
zaOj&pih202EsfVoP%epw24W6vU&iu%8pj}h2en%@-U19auiMpd-Pe`glqnrMn{^yW
zZnZdEzUKY=@!7xerzW!MbH<!t{m*2ouVjHE3~TXuquz`OZ>e$)<6D-6XZ-2M<^tK@
zI-=@VMDuNe5r)Mkh}BvNnP@y^8x2T5zUP`7QcwR8yxx=lh6o^@#{v{eW;I<_ba)x1
zCD_XwuD5K?r9XW+Y$&7h=+_4ZCu%?<tKvUx(`hkWAour142$o!kSClM`KMYunKA%|
zys0>08CxyqnuiOKy5^7x6ke`&oLAHZ<DP$s69TGFUSXaYVGrdjrS)b_%E!Js0{>o4
zh1%lurKR=w*F2Dj8iRkA$964Wzl1`Y!#S&)9Qz;lmKJBvjJr5lbmc#M#yktjj5j-C
zlQTtlSSmZl`nFZj$D0)vZgR%!691zx8GnXv-!eiN7nFvY#S0ByyPCz5erg>$bGL&;
z=R_ctC_V2XPMEW92Mqper_!Sj3L}xx(;I#O6Q*31#UhE(giNj4XkXjW(hnqKtuSAe
zFE)NH{!rR6oOvQ|Z4#84*3sN7O8q;AOjHnwx=0(r+`x>C_#pX{7VnT|CrqM{&1AXs
z1IN!06t^>0NP)BKizo{-vWe+b0cEIdx+8s*R_)Un5(cI>(}o_^Gmcs)&8RK$Do>L3
z0l4d!G_xa`;Vt)0Vzf42Ey&qnI3(;NY8dU;|M^77FgM4(Kt&GdLgmb{uFz{dt#5vC
zg9#R<qaPd_QDn!&=*+sNl@pxbZ|<UOu|n;t6-9nMo_O~`sYI=g-c_B4m0V_5q;Ots
z&K)>H(6D%%RP<(FpGUJL)31-(BFAgGmFFQRF~kjbbuIBKsgRx%ZrW<gqTWloTWMPe
z`2!&lzhpX`x;=c2=sM@MxzW(A-=X7~ul7=na+B(LJ!7gTKfagwTw*Lc-F?>X8(Swp
z9=^koSs2QLj7$)9F(E>>ZA^tQZbCN08>?2n-32`w>(o(7Pb<c$YO_+4S;>i>F-NCm
zdE1U$+{AbLqFVB{e<c<e(|5}=i6F&NIcIB2`gXZCLzO|LZ&p5cKB>WMtfxz6LlJA8
zsd&0yI^0E}1E3E(%$5hkd1WkP$1(?A4UltF?eT`Jv@8_8JmM2N4qm)fz*n_eJ}L`9
z&AYF<C0wP?ZIYIc!QA*>;8#84ePeh#D-LgKI~-SDh$v}8`I#WK#MX87nZiWg-sM{b
zN7Q77$5+>)-+%ID?=wCAco;wiHRBk{P8uHI0<)=X%1A%DYgi2a7UYR5uZ2h9q=8W^
zzUt^!n@~K}yEOpdYdST9Qt(zKJiq%mF233O$~R@e@^Q+XMJO2}l-BDxPS=#wkmnmQ
z{-J6{FSz=1Ly=_itJ+>UL{w}0Cq?y<8QQp84^4CG;>o=U-^Qw#iB&H;e&o1ehrN&`
z!U<!GMmg3C*Z;^><ydoEQ7M2Z_673;Uc*?wwNAp{QIFR_gN}YXVt#kJQ7kvk?N}wg
zm%n{qx&N*0HZb@p*9rardKI?hEIc^&Szlt!tG`h)q)Jrxu>4H+7FXMMtL432EnnB_
z)zU__R+%z!+R6VMI%Jv0OU9?hQFJu;s|^e$5JnboH|Tx`Q9?Hqw4?~=Vjj3(@EK<6
zH61`cpBQM+1!75T)~kOqfIIoSpq9OnUQyaPdVy4qhV^BlZ5SLg^b*Gj+Ou#oOna|_
zrEuS@n0sb@j5#W%<qAsq4My~!+NSG~BI1a|&*z0O*&<~PUjxdA0S7ZjNMTwunnr+6
z&)I#<{y~yh{Jf7}M&+sejugQVmGCh&Swn45ERjU)q!BoB#Yozni%x438@c2H0i58V
zmiI%u=~L3u%rL_ZUiqCi5WScGkwM?jjpQTq>fn9amBf=o8z@Y-ykBS5RnTk`C-I%6
zl~{<dz$YjS2krR#10j$niEMcFQa`{}>g1YE<}KaqNZQBpzH!?=y~lGq2#1f(dl{lg
z-1`^RDZYMGZLo{c>=#@zg!08NIQLl#8YG~KA~Fwd41}8qzWTuz$sMrM1=#>frIHZN
zkx5NlzZ<U}(8G=t+-v~ZqjZ(&fw_tPE(l5dR<3N%8e~-V%m+X<3`h`+D_}5Hl3wBk
zD*EaN@#qY~F2Ys*Syxq#q!te&G3koA#`+>}tgDGF!p7mzOR_Wf=_Io5*`%<W%F9&D
zaEcaC!z~8@Ps<p1=!X=;vh*|-m(&b`8aaJw3LJoo_b<?Z^|WL5f{k#bG1|PP4#2g}
z{2b9X)c(~-I|}PY;mw>%&OmFJs%{t^R>6C7qNe5K#FC$EZ^9&7qHV{w8zZUi>w;go
zWPW5=H2uyuF?TWT-S9bAml^03qGjPS@Ovb8^#gzSXgoEUHC|*2umW5e?{BI+<1t~y
z?s1w><Kpuxz*_7~70p~!5fQ--Tf2C5^1Y4R@icW>`>fOmwIO>+%b(Hypj`gg^_=J8
zO{Gk$5=wEG_D!3gte<y^(a=i6iI0xnAenF{>~vkl#yM*ptDc=p{1KbI6_!ZVF8EX?
zXgVh_rQI8lEB<7;y<)<WH<?tUoXQMu$tROoTrMvX4fC9O6r1^i`x7RSt-bGSXeY{P
zUUsLavHQ5Pqw+9Au#hF6Nlv5~Gmj|znI>?Cabmf?#H4AsWVtB-lUf=7;upaDI#I##
zH#lNXIkso`OWAQluBA5rY{-n(FW$r@1zmZjkwYSW$$1V`=~$V<m_2PJ!|uUi7v=j3
z0{9OBWn)Wb(##01O#HiHc5a{Gug)Xy!N~<v(y7*)pvsvJ<<z#DneEA!EK}9Rd+nn$
z1LI$%@!k0bS|0uM2<@VRgp((CA!up)3jeefIfO~Xg`6<{ls>rq>Ee6HO2bQgJcz)%
z`v}0bDB3*oEaPPWt+_$%hA-~&IK6oPvt4AlY(s^7+`b@A8Kl44wN$u;<f$qKeEmDZ
z{TyLBfwlUkJ#)nsy{heBGjRVsAO#6=8eTfgpa9Owl4#|CtyXCJ0?(`kG{gL5K1Lt(
zP@=1iYw|x^^xxK)%Y5I(SaZ*%$h|r#V5dA?+gdKLvx`>zoD<o=VkaP8?)p~TB~uh7
zIO`b9E+`}hmtLTsnAQLIxhXpM6W%cJ6hh5tmZ`Em1Kl^BX#)bqFvk9F)h`^ViT3C*
z*muFCBd36ddhr7AV4+ZJvJQ~27n$FVK->PI&e)W93ly3s6!5NrAJw6K1303+Mj9Hz
z2k{h)`SH$m^$gEa#1oHj(@9iE#9wORP!stH?@4I>hC3twy#^*z28JGGMzZhPj||}X
zPpFa5?;kCZyWgc2=2~5W>rzWmO*rtax1`|TpK>nX;p3edafTXAQ6il2d5;C=$jkKW
zuDF3xTlii~D__AU7Wr5sb9mmC5PVCQ6zmqq`B4U#?)(Y7PPpP@4&<mae@kxNRA+bU
zxK)Dz9BE7R1rQDFTNjoF>a>Hx3Jnrx7WKJi^&e$8^LV3MZ~#d760M1A`4+iY+!Ycm
zMa4+xcbauJztEyk${FW%_EI_)2a`#k4^rWV`YMbly}lyj$OW^ajm^c9*DBL3<DIAG
z!;8bhq1>?XwWP0xxJDIrca1l~<IPtP4%ntM5C|zCMd*uz@5QuQK`GG4mF%CSF10w#
zNmMbw5nvn0b2&bba|+$p%t!BajlAo>ofov0dgO~ochm0+j@3h(vPx$i*!O2nK~vgW
zxYCR;Coj9~cu~S&c4U-W`Z>KthTJqOr<BAUmuAV-b}kEfWamcZ=(u)?^PCpm040&-
zI?0dp=XO4W_Vw+ase=mn7vfS48ml_Q$3fe%>h+4?Pf6{sd061&S6XyiIsz5J^#MlC
z?9I3YrEHE-JElQZwRjg;*TS}Lh$KDRSH*UUJUUt*UkDHIzLbj(#ZsIJ{;OD$8E|Ny
zC8&*z-La3!Z1K)jkvw4C{w7HyDr((|-ndoJ9lu&B%kYi*a0wJjQ5!Y6c*hxdt~Nk_
zdjK9`et^VSBIr#TEoY+4^_~f2C*Tqw<T%Eh2w-t)6oo{MRQ9{?Zl!q?-x9JTz%9e&
zQjJafat}z)LMeAN0%Js;DNRwd?aE{AxX5IU*6Jq_Wxi_)B@=ulzn5#~$F|kCc!54z
z)zcZL5?^M;bvTn3zG1ouTAHb!hT@sp*w9=@;o94h3AS(#P*a5VcHw!Zr;EzmI{7K-
zeo5fIgk*1fxZ`VUf}%2Pyj+6KAcT~!ywfvW<_wF4WMB8`8ybXyBgwl#I0sc#118yk
z#vd(=yZrQ>{JTCC;f3^>pPKh8jRXHnHh5<H#&^VY5rqAfXyIFKJ8u7@l70A4lSu3o
zLnEL}T<1$hxCT)+%~3le_=m4d`h+UVG^54|!o$`5J2*Fo38Tq)5;1O#$M+x&E#?JI
zNV~52-$#0bnO=yY#!^zn_uySBGt$ts*#l%(xRjZh^OFMRXcAi991AdZR1K)u1BAal
z-~M6%uHSwFiJwoQ^t(U9JfMB=+(8{>#v45ahV)lx3vIZ<^Y-R|Kvux(w!%3KLRX0F
zp_HFNkQ|d~kVs=HjnWi5Yzl+cCPpN!1dK38JqEk7i<j?vVLGK7l3wDqdE6d{8w9bk
z8AI7JNq<*6V~KN}xnQnkMkT~vRjr-LfRo_|^S8J;8wX<(&*DDGt1nKTUfCy@v8_aI
zEYX>etQggoI3WAFW26&4KDzhm1+*iZNSc(8#AEWXsh1<q^ApNNozJ^Q#`KDT;qWv1
z9$Py<1C0fs{_o_BU8Z=mx<h5sbwbSCo2dA_dJ{6jjX6fgV**Ufdv$ggiB3Ok>8z&%
zI1<MU=z?JbE*cinhW(BJku8q<r0};~ws^Smf%9F*KQd%2*iN~l7@G{<0I=LtWh`7R
z=>qxSTVbSD6<m5<n@L1|RtED%lJ64R4<B_Pykl?Ujm#~)QuLHIdLk*dJjwsX1^Sz6
zk48f?>Ow<@v$Lr_aSVX^wGGFyziRJtU@N<=;@9}M)>7hssFyqMn@BJ3JIc#r*slKN
zkklHvqFKbhpH0>C6v&t+<mycu*sID#F-os%RH%SFIZOzPQb)6>fAtp+63x7{SxoRf
z1WUolyzkz0W7d5+4%^h=#=|>*7XA|BMDFS3um*kC8F@_>2Rf!$yWX#_1ga{pts%3B
zG~%n}&eUo-M~eWt-Q(+zSZ#CyKA0rgu~lk+JB9R)-aW0={;6FF9xwXa;Uk4bi8++E
zsUky((8`(Ic@`;Gz;>YyOF760fv?ujzFs7K>`ah#J2aE=K#vXmbM`@G13AMk#iofa
zG5XFZa8k^BVdiqo1y%AQGm}`cV>;&CjdLn)xRh|sf@7f~gm}c%NqtOH&*$qogVQQU
z=$la1Xo*;9CpI9*jdl8)+IMM;#mcZMmHv?`()xV1%9(*~`oKFJl3CFhOnB~VO_1lz
zD`xp`Jry2}4VKh#!YRl(voJj3$E<`w>lmC~)1|Pz7{N9#^|BWG@WUT2C0N=vm52pd
zi`W`tW*xC~3_5RBSc*ehK;Ws5`VRC~hf`2nvsz+8zm9quwx9VT)0Zwr)kN6ax}$oq
zXWSZ(#)XeK-=57}6PIhd!lb5Cgeyitf431WqKi;@<Ntj35-wZBUdi2O=Ik0X2G8^x
z2?plI97Kanf||`PUF?UHx>m1H7oz**%PPhH;6KZPdMI|-j$jScI?Bl&+SiR)P^H-L
z)uKon*EPk~Bp*&wMQ8#w_@5D8kLjcgAbYd_y-VF7d@cMX2bUMHda2g)vR;W3x<4Y*
z=c%R9f+V}e#)x}Jj~=DF0AY`OeUvZFDhv}IWCvkf`lWPTPdpOI(Eoyg8##kRNNo{d
zyUqTy1!tKF%x4e@+^rb(*}iIT_I*bu?l5kPNu^Hg_dN^h@)jU2-IA_=KKxq~j3BW=
z#*LQ}eAiI>nw!-)^3t9qz*{`q+kmg=wUF=d#QC=Dz#zXH?(CZIWsHI<?^w8+B&yp_
z=}Mq5@=YihYrx?6#><apmulF9d_W2-!UPd{F$NAzYZ`!25@wPO<hEuHSyG5|b+hL?
zK9&D$agEB0(=Z&Ot%+=@#BOUJ_OTxW<y?G)!z0P9jSvU7DgDcC)jwLU%vcFyj*g<o
zfP>C`<?~5&+%hWiG6w$$C0GoQic9zQbWzm&jUIwrFygB-yP~;G8_bnG;oV~-iiSC<
zRosxChYq{q!}_>M3&_(`K%B_!V}k{z-=2clsapJy+fNz+WCq9iBWCIcbz}?)G8jiy
ztQT|`!I^)s&(cHP*Gv<Fw9!Ez(sW%<QStL+eWY~cp-4S~nn9X86v*k4DiG_t@J>YB
zK}`&C%8EF4zo+U#XDY*N!h68*yQCo0&aE8hyiWN81R4PHIu!T=%>R%XlR_$VQVjnA
z8RTFRuAZM<H9DG4FBYL8=66GZGGeDIHy(mhVjl#LmbMz*+tB@Ze<CAGODXNkaBp-!
z@*wK00&^%Z1Jkh1CRD#Pdao>$mrHgc`roEcWXmV6b@p|4PKl~b9sm4Mwm(S)=b3WW
zuVtJ*%xMCxDSOA4GOBNG8s)xnTCTq$1zXhCO60pQ!7j!ZKn*(u8zkI|D%o0bG%F)9
znA6M>=HkMh)Vh+c|AulMXD!CRr0ufm#x~YxM{3+rvc$UhD6kE=zi7hQd2i!5vd#1%
zQ^iyP4x4WgZ)8nz>tsEzT%2CbO%zIZI&QmIkRNU4bC14Hlp;9%P-&#*O~#YDJy~Nq
z$`TadpOHTshB>OjG8qsrKwIk-uOj*?crU3hvZicI1@6*WkHJ+IZ<gw@B)l~`bw-D5
z&6(lqPoX(U|5_)zpRIfcCUGpkK?D2rPPJPlX*`<Kg<WfFm_tD<-iBR6?Ox+f;$x)?
zhbFQsqk%ui?D_!bJ^ByF0yV0u63+>pX(OetJiU_KyXu1;$E$U#M-1(>&Mj-r1n{m!
za&2A4gRbU9|9<osY$L1b7#G;aPNVep>Y>pzRm$lU6L2JS4RYpfKX_TA(@Ndi?;JtB
z2#c8QxktSoX2j*%<=*sT*eNuzaN(=i>GKT!FLkSFnI6@vK#F!mCHE@pHvin+y>b<B
zhyR1z1s4?8n9BB#2GJaumlLwL3zf6M;}s7XwKVeck{y!Q-T$|<=pUU_W4IZ#n_Gq+
zo0D@U>pXs7cEaZwd0?f<*ZuI>3Ib(ursk6I+_ZoccsnEqpW1xJT;tz@{<<$zCcU>t
zdZ6^Vi-bU+uNv~2@iRgQ)u0IMvqB<8Kexaz<rRAd^FQ=}gcZ6#4IUhR*=44jf>U1;
zad6`jd-)zhjAe|5iOA$+-6ExIra#yI2-d)fi)=&|CC`lAMKj6j<Qi7M2zC2}yu%&B
zgttv~`NLv|k<a1VZDGJpNfPAsHfS6a{$hGz61$L_eZ(mP)vg+0sWR%psHOC<Pfv>A
zl+)K^V^5J<$iZTVhRTAo4vMM+4G|B*d?7S$Us=a)U3xhn7W6VK@k6kK<GF1PW-F|d
zH-umZ7<$%$qS-TnDmFq@ri4D&&18-n(ys<A4Z$=QJki?N;7Ctdd$-D1#$ty7DG*DV
zL3*c<@kL6-e-s8c4Ne$NiN4-(qkl_pVtff>^iE~#6$>>Zo^YT}IXrm%8x{u)?;o*P
zbr%h}lh%Z0ycru(8V#~e?SY*_V^~NLG#D~(16FQ7!zt>lc>BXXCCV@G#I}4djDtd{
zwR=IrOr%0pUmzC(-)!i{PH*sabW>}R%^UG#vlqC>RQ?J-)cy+#rqT&U=A1V?|7Bb_
zB{wA~!~moe=@E?~!(w<X%OAcn`jIc*0D+-A?A0qeoy~+R9FSg7t`M5r0;77y|GLeb
zbOE>qVGr#&OT}e7^jOt-^}kk$NUG&xJoR*2a5ZXe5v-BbO-tVIcoPMDQL)O@L<Q+Q
zc^`4*i{992f3>xX+gSdhNYp1s#+$=v;(UQ#)geOTtsURE@b5SfGq$l81mecGZY(js
zUtSH=yOTT|G2^lF-D5MKn0)9~wvsM#Og^<<rDg0mX@^ZB_BoaB8dQt8n2vok3KzL;
zGE(u<Sa5;(!V`Mf@@<)Sg)1kq5iC5I()?jG0^X}5m-WA3qLFVlDxPDmzb5xfVzFte
z<9;#OU={?qUHSd%@U9vUM_7QqijYbiQXF)cdRkDAH5C%)klk^Rf=3|%U~D^uNfrwz
zdKv29%1srJboiYqgfHPg){B14i#wW$;|6jZI5@sz_ewht46xxP4Z@xq;s`@f0wxxo
zSN4(-f6A=JU;8Q4Tq!rHq)oeMn&0V9-yPOD+^*#ES=l@9+8P&gcp`Bp=0z8sdUS*N
za*xJCgRKQNkon?ZLzm$Uo+q8BQ4;7i8sN9NS!m5sEswuGI`Ms;Yue41_i!S~x)}br
z;JJFIyd1i)pyf0x2=3-A%U$|0<R^R)E8n)Iub$|vI25RDi9#vYVSm4vg+p2!N0~Jm
z<5u%8f;2adHGkyqzWC;mmZ;B3obA6W<yb3ch^AaTTB!Z$56M6MTwIyHLY}{8C-|mb
zo`t*PTxsH#k5~;4a4k#xr*{1Fe5VK?rssq`Y4<K)efp}qE4)AY{I_2fS^^UzX|(Pj
z46bd6K{yE^1A{i&#xt{lI<_%$!s+l+=nGuBrpIrXYXlLT`V5uiZLk}t;{gn5z7%yE
zF6;e`T!|Tthin*N_^kPm!M7V|WK={<XbCqolNd*VodJ>@BWf#=m!fPY{{&D>kZ$H~
zCwoTpc5xCIab^K_VFXgUX+vRDmT<F1Brh=sBffbL!qSvCg=O}HF_e^^;p&tyBE!0$
zn_)55_2It0afU%^Ovr4@^yo~uuW6SKQl`9h4W)5YL0<1N0r26dUeeHS+JpYtFpJlO
z8=_|n?||B0TAB%M*!^6eb(BAh1X6l=D|PDKk06`++lKaj2tZEFNK>EGEg=O+{Mw5{
zIu<KecXf0!oKtE3jb4Ue7ZWF$a?c%d;7Cij`D^EFlrt4)j$ikcN|;Q6pCOS`mhf$c
zHqc4yRn$q1|H`pt!uW&#21^I8tkLq!w^xJ{oo&L@x83Z&>Wh@B!i=C#mq5aUjTVyl
zZ8zc=D|zjv$l$C(O2Z&liatFF)dARsCX4ua916lcO%HMED~*h?kU=8l!9%bcWK1up
zQMYs1nSR9j^PBmj3C;wSp(_1`b-r_3O%Ha$m_^~(7LRlTYWZYu)$o=JR-f%ZQ3+;s
zu}c5*ZDYg7;;PtiUHrx_FI@MXY?gChfC4CGaWKl}2Ci}cYoQ&rpza#4cC2psMB>(G
z;Qlk@(~?qJP!%okC9*S&2f4<Mb}2?A-O@lIV1YvI(VCm0u?~^TPet7FnViF>Nv`VR
z=Xi&IHNuze8x};hf<-=>Sgw!xyVbxN5Z41U+1w>FTrt##R`SZ~;c~i*$Su7*K0U~`
zQ2Q?`;CvgKI^19-_1suL-=L_VZFAsu4aYT)pm2K636NQ_n~1O3mBQcjxc@TNd~<2$
zm@4b7;wmt=j@#9|@{z%tD|Fj(^&{_x^v0H}nXDyio0wPcJkM9m_OVWs&$C`L-zcZh
zOxp_cx0|jF%+A1?5=7DtAH6Rd(Hjd^Z=dSdjSZ#WPfrC?KYxq;V^ss95mQTduB%yh
zZ8&_`7m;P5ZPBMtG@*u}ac5~+0j!v0wTP^o)P~op%zp$C6Ze;v%upM7A3Y4YuAdw?
z%!8=?#RhM@^bS+L9nRM*UD+RbervkwSMeyd;oYr*SrL_7sbDI#J3R0QH;g@gUk^BG
z045fUhFuF=jU4$BpF9pua*;7T>74{{o(%uBS@D5&kH<~Dq;%QqU&T~qXjtP3F+Mi6
zufhg3yullLqX`4>AUE86*)$%Van>l2_V5yyuTi)Bch1>YXN{oUOB0_1kG_;p2ls%s
zcso_=R@v7HUy|CTkLkmvj4%8f7gn3IucI19NCt+<Kzj#*Xg{~otsFm7B8u!L+~O`4
zU4)GCP&pQ<K;+|spTpiOeWQ=%whgSa#m1I>s?N(JiV!bKyfDBPICdZAVD`F$*cowL
zbRO+Y7r&#qxW{#Nr2?4nsT!77r3v?=a5e)SNC2M3k;hCv-^L_sGg{qChWK-Nznn9b
zKI1X*A>H=zCF1|^JVe>_M~nU2$H$P0pD$MX`dywdY})pCCj0jOJq8reFE*sNoWYIW
zj}j+|(yb|p!dI8<!`UJ%A0(&WWJHOoS4|6x(5D5Vh;WC2ioEyUp;db0qf&dnKwU1E
z@HhL=d%z1qfzOKvSvOr!qJ>A8zb_XVB0w<-qlfKDr_!S&b0nic4PT=M{7QNjKjT18
zL-urYhaJe;A6kCSE!m6?VIzE9fzq_}3B_drA8|qd5v@6jAKl~7`#H_7w<kU7a?S|)
zzOIi?y)djfzkG@S<?s%lhH*UX*)`<@+sbf&)h~u!-TGwEfx6MoCCRhv8Xa+zZcpFg
z8;;jzo#pt?eCm)pfnHoXWAmj>^ISNt;Gp-XCw@TTJ9>%M%#Vk!(0rl&K1&@X?jgj7
zR1TL$V@WeMwLTf;ScM7IbX*&@-I&RH1ww%g(xY_S?#kWIo8TkfANhOZqim`ZqHxfz
z4Bxp{K$dST-_Xp;?HHm9QOVGzgY588@2*(Tn&rJ=K_(_5S1-_-{U-d)1n**BftCkr
z&^>f+s8Q41a8Z|cEuZmyvxA^MQN|(<c^~BSeZR6l&C`kC_(p+UUw03yVR#6H;uVWr
zUrKzvC1?n!NK{GsmrP65eH>^(+6;_}abrqo3uloIx;1_N{@(RIyXzUs*dS#2Dlgb)
zD=8n{=O_#U3+OZVLOZ>A>uZuX=dKgHpD#v$@|+3+vtZ{uaK;{SV*89-5<@Q3<%79r
zBiDnh12>p7WE4IA?h`oZK2ckQ3T9~^P_8%3&>E{1C4;P;)3e`rQtTHGZHaaDJf?P_
zm=n*`{_^$KKzkt+z@qUu0zyzQFu#U9{5jFCIBb2nz{bW7K>p!C`qb1;H5wsXG&6`q
zq`YZ4z&f(>Ko<J44a4|D@HEq`V|>O@X!3}g2FCLeneUf--D(TEm{vt@b1p~D0eJ*l
zEFVze&w7o3f-S%@S|Vv@&ulztw!$j;1l?8K;yAh*Y^IOYJL>e!JDp6ub&XQ$dE97A
zS`~ON9}30=i%a^n-5}2?Z<`r%{0?1sZ9`o!^MpLHz0&Hfs$7i+mFg9-Ms_tS)QP63
z3W2bk(lj;U-qdjK5`1!Xp5bI4w92|u(afV}Umpf&fAZ?<#Y48-_!ZlNPO8$fA;P7k
zz=W_cc{<k6BT2CUnRJl10;?;=VVa*!vp}wn;)$KsSQ+?ur;$+^Eh{hojFsi&P&3o*
zi%{gjLBv@QgotevuoF0NQT8$KW_(=trG7Ndgn#K`U*tM+)S*=3?6=|hKWJFnu+03w
zIwRL6r8muuIw1+-3nk|yar&Entb;}~R(<En(`g!4-{K;=M$BRdZ^aodC98U46BZu?
zKFcSAR3EH%O<yX07{41(|AA;7()$gnkbM@CV6fsjZ=ThJd;I%ORJL_Qay@KgI$T4q
zq_-Va+<C5c=lUv`R6SRRhgdYNvj6wRs;<OU^r~2brp}!Ue&w4*22WNJ(W**dQC)k9
zbV<g+Mg-?jNA<#{^RB>pU5kj}k+Vw^D%Gs|AK<&FMZdd?zub*vWqq!$CtebF84^nH
zBa)6I?vuw5=uut!L&B6y$78}%>=SO^;Pcow?iQ6y^HFDZ1N#G^Wcge6Y=G|EqKWex
z?#XfCg%?abQrW9lEv9#d;bTtsS{*<nnl&pLEWhWT3?FvqG_1N3be6uPRAxh6#20j|
z5crg`zQ6(K0s|f<xcD^Po!uf_$YaURlYSm<H!jv;Tq3{z53WK1+C$9)Z1}dXD#Cr%
zURL3wyiQp99!H3H-yK*9Qcy6*UVON@c#ig5T2wNqv*JG12d#Pw`?`J&34fLzi9X2f
z2enR7BJk*7RUt;tJamw#_A9$P`p`M-%`y<-g%%!VMY{~la74d<ScG`|Fj~G_IY9T(
z{Y?$i2#LiG2Wel$;3M|_NdL-DI?|SD-g=klbC>b~rFumZJ*dn|LTqKn0j*hG4Ic&K
z8RNrAq$L+LXQkUQO#Tdovx^XcHu;TPU-$ETt8X})S}E~Vv%XsUq39F{CH4g!iq<_p
znn=k#1U20rgi0OB&-3>#^z7N&epy-+SfjUO?bELZXGhY$kR#dmTUYj_Cn+|rHbz!Y
zujig_=-!`u5a0v7$0I<*4j(u~MPlWfNq2!Xqn|uT@Ax6n_^WHp0jS<&(qc4w(&)(r
zUP3gC-Cj?~HMv`qZ)RAy!-r;W=3?_@-&vtO-^a*Ec_>$vwL?#?eIP)4K8J;W?Cg~E
zlPA)S|1~VE++ng0-N$<#h7zxbjn5YQwQFVWQ_!U3AlpcyJgfHaXMBKDT>m_q896ku
zU^>{Ip{vmF4NI+Ymvk>k0ql)MLq>j6chOr-&L6>69gYIxFOlG|l;oJ4OnV<xN$%RR
z5)49Jc900VHH@=XLHBu*rCmNdS4EeZ9-chJ(xq8;u_k}gt@ic#K4a%^X8Gq{LfAd7
zm=OK{(e>6*QTSW;@Gx|zba!`3gNQUpcZW36J%EVP(w$04cQdp!NY_vzB^^Wa8~omT
z*L~jSdH<Tlf2=k8oU_l~`*VhX(Nz8u+8&+TmjkD-*>!?=07W94O};tG-SfDU`PupI
zH^Fp=RC^OlwUV6vg7BT{nU{Z@DJ47D1QAB71tM_7D<cT;Pk==zfHfA{yt5!08q)b$
zB?nfNDU4F%Rf(oS81x1?;SJZtTNLvFJrYG!IaI;qhFX%DAwrJRtI$ghAfKPiBx3db
z>qGp_<&l$*%1tGHWL>KjBEI`YHP=P4q>R|MtT)#wqWX03HgVmLi_jU&?dBMGiqV+0
zEA00%ZJvUt$_CWa8)OB)ji!*BGS^?*cYPF&QdMXZ($q7OGVBMOb|40i({wMwr2Y1S
z%>&r!=~)Mm=l(5&DO*3)D;qY)9=LQpCD2TUL{)z0+cm>`|90G0qlBOSeG5Q^w|Djp
z5!tN!T$n}EBP_Moa9FLij;%6fCX|oEsGZFoR1I|brIE2>>N#n8F9wf?9QmhVG1oX$
zhNa1H?g-k;R=fSifBMg|8>lEZ!+AVT>%H$N_el!Ft2X~u`d9Pwd!#j#^SMG)dqf0a
zN8i@Afeon~k=7oslj`{kzc%ta==UYvv8RnScN_-AZj3my@*IO`lKj&Ek8EMt(Hek$
zW88QG<n)ioOALfRI;UTH4nCT2AC$d6mcW<{7YQC2JOGhbGVWeC=dI*t^@Rks^-U6}
zH=DdV$YBg1kqAq#+aCS4_Pyx5xPD$STBo7_^?-_Rq7*L%-o#hIi=cAQ0P-_#V{P&J
zb(4|k<q+_q>{a@QohklipYA5gZ*UDHGWzAI68BL(X8=NO!#?Dc$~uPMm2y78>qdKM
z-k_K(_d9E^%Xb1k4?h#5jUwlfA98AaH@s?flg!rP5Ss1;W;Y5ZE;2K<Kf*uwsd>GS
zYgIRAZt7KO9bCNf-Gmle04b|XQ3O2M(KNU9JTnlkH6W@}^Znio`Sf2yfyJF8`We3B
zog3Noz{Z>X>*n5yy`9dt_fh1IYuWppyb;R=_gBa-*9Kjt`0Qy(tmt|m3nmPa<~K|)
z$<_Jg&W^9WhX<b9tdyFi8BcjF)yvWP+$QZ~Fz?*J*a!}M^_ic@SQe{g2Xa?XdCoER
z;~U>7$PcME0#S^Z)k6^5pvW%t&?1ry!T~;>g#eTL35)h)L8ho7;)wwaa%_x~n+Kn3
zf*aykZ=`(!xb?3@C{wUQDdcc`G6q1fs)AH^DRthu9IiS}8H&_|HVU%S7nU$OP5UNh
z;ME|kfz;7Tr7r8p0<IVExZTTW0Iv{oK7Ry=Lx7Q>L30%5836=8XF2fJPXu9Sr?u0j
zz}V#%aNL<kkm%Mz5NPXN2A~5%8=!Ix0s()&2BLB6;<U!)C`y0IQ5!%9g3V-iJA+<b
z)vMqa?1Z@`OkM^{e!<B3a&8BY!#(Xv%nW)V=I-{3Z%<ht5n(i%4Ml_IL{CNdezf^A
z6f}|9o40d<FvIsD1XZ3M4r>hoCr<{So94~3uVW<j`mwA$h_eYssKkEes-@!hV0xj#
zvkM>Mk&3SU@RJA6Y^UI?9sw1==+V?>!`@0@3}7rkezdb3g&B;x1<RO{*&wBbSjLBV
zZ-_k9!|l2}v8Dd*rvl5<j$p{c^@VSW7U8VAd{#MgLUf%dfBJ-Y*oV!m+yIXNb{QaS
zzO&uRG+u|#?Zs?c1lE#19x(ULzjjLLKMsg+NTRb@*J5q#A;@o|`nr!0&T=#MqA~Y1
z+}CrK_}x!w%c2dO))IGF*obk}6#-8AeIy)-e^^9_(Cm*%Jzd=DJL{A($0p1n<OI}1
z^8q8BdmXbV{twZ<UELEZ*$(H?-bUX3q0Hys$aV-@R@+O|g}*k6W>OHQMXCs#b|~Nk
zuB2fE6ppIacyB>RZ8iJViKvK57b~d<x{}gFR3H3T@xA5k3$23%0#9#B0d#*<_OG(+
z#JMQQFvzR0*-a8&oYeN8Z??|=fj`%{)&Yq)CqZo!_V0Nc5Q+gAnU+;O$EOU!(LP`L
zy08)FA{Y!O%*>hsrGhcJs4M)O4#@;N)rPzbf`VwIOq&(Q2M^5q41TfRh4w~g9Jzdx
zl<7A6#G7S8aXOGHG0VghNMoWqn%cLj<VHQ4!#wfx)QxNF(l--u?S6GvJ<xYoJic6R
zl6Y+q9@KQdQF<CA89g>JwP1RK=sk$m9(2GiKsrxHIP_;>{mN#)4<om8a2L3wU~U&m
zR&Dkv5BV^)8(Q;Vm5$D5{Pog~B%$q6m13OSJ~vUTl3-%P6Bn_<nDtf6QH?D}MS^Tp
z@ur((t7GJ~eNe#K>xmN27r^z*t4n?8Xe88s_Eo3+5UuV6#s*Y%_H)@Ncb8_a=BXh=
zw*uREMfY)b$J}EpEhM;EU~vDYbFtj@Y3>^8@L0P$_tS?kVU(qxh=AH<lhk{y-A8i%
zFI{r2Ct0vM=t)-&e)%L&C*u$OO?Lji?^RfO`7bj*oDtau%gcb^lEIPXOI4(@bp=Cq
z9=gszbQ43o3F$In=`3MFD#xg*PgMhLEgwmQ8A;a9?|5!%+=~54;Im(3UGW-RyTUBC
zgpe*`V5jgk9gy*5TQ-O!;>AX@n3V)GY|S-;LEqU5ycTn}X-orei2%n3U=0;w2%-jL
zF`)ty5xYfm{<SR&z^hnfLhLn~?Z7f7q7|8sysPFN1Hr)c7l(g>-EfDcHE<nlC&=iE
zx1{_+MPABGzX=996D1uqHh9H4R4i~Tk2igUOpXaKrO`*r=a(WaHm(LCs&w4Bl*{M8
z`@JN0Cqf(Lls<g>N$HQb`kOGPvQOXntYduMO42Y-uR5HkKFs<@DUUkpo=60l92uqC
zwr|DOBlFcnV#XqTTh2<}82$FUUxb;x;nKz!_h@xfk~c)EJ2Dl~xI-`QQGDO&h9aa#
z)3>Y*KMdb7IBjzLHGI+xks(1=PuGrEjhIB9YL~#1fJ!>>z|8w)!?fClcV-NUa2ZBc
zT$w}b1bSLLraef~s`Fh8xQmP|o&#&MT%&&JH32&ZOCUPs>tnW_J>9y~&bP|*rVW5X
zqgG0WEy1lwXB+C-?BdS1(by^GhXq3F%|s49tIW4hWwENg^qaM5{>vM--xI_$C{CJV
zQ41UrEjcCfI%<9AmXfEEzdO;ya!WBpje>b?o*F_pXTLI+4>>&DY8L)fPLb;Zm0kyD
zHT1N;ZFQd84;RRlccB_}*Z6S2%#3MDM7C1re#9l0Ur@odz)+K;9RB3;Bzf4RpteQ!
z<LDmo=^PO+!-TmrG9X0x{kuaK+{zebxg6jQn5c%)X!A7U7w3Yx0(Fa`MAtgL%nSn?
zITB_5=r@T~AxS<WzS$)jMlxuw9kN3Wfl`yVcoqvGG%i$6qmkXHfF8F>R^70T1OCo4
z{;Fm?iD~n6V`mpU;KJ_MV3J8x$rW+6r0dT&&sh3-Huoj^&2;fHTN#fgOP1q!?;T$}
z>NwYKKz)Be$rt&K+WQt)_uTs`xqvjNy~4HdidVE6$b+XZkhe&v5>e&bLf%p=`4goD
zhX%GsA(3T8cksDINBj8mYz0r*ntlCKe5;pLrfDTzgyiF*`mHqO^uf<=Nc0N6{PANB
zkjap|sy>XmF1z`f16DY@ffyIM(;yR+@0!fQPCF-L`7#wOJ$J7(zp<VMEmR?uvCpfn
z=j)W_VJVI#xy-d<*R1Lovv?7c3;nd+JFm%!c`hTkf2s-2X8qbKZw?3Gtgi|-mz0EC
zr&R%4>X?(~B2UQwU9kXslTH}x-@+@Hc&ooAfm|}!=>gG_=>T=^M3A)v+`6Me9&#YU
zvj$~`)EXOtpv~dc>gr$Ap)<XTBBE$nq)s3uAOdQdcfw;E=BdSbI(#9vE;BPeWx2gL
zHG5>rOEb-?@rGu9#b4rhHJJV27mJ0<fK-x)6lYdmqPAhX6I&^_^y0qjI%UpmS2}(r
zSO2=Ol1?#4r_qM*5{4U7`6qXO9sZ>_vxYvcTbrSTHRVeT+3s;+;XRuyzaEqCU3I!(
zvyQ^Har}fB=S&oI#nqo3-vV)|k`O$z_2}Frf%K7ag+NUsF#_~O8;xysAA4OX*!8W%
zb%L9%Z`agFNK1RPOBQ9sRVTctL)^6i7@YiVD(Lf7vA41Xvkla0ZKEUm{rSQTz)^(u
z=vp3l@ld!@)ydpiVNU`KLVOJHvAc;kHVX(!<mCN5)yLZ{<~zTGTU%zTn3GHIxmsts
zRTcPB->Y_X@TUu2y(zq*5MMk!Q%^DdYObwzTt?iEoMw7UjairKuFhg8^qhIU9=1&G
zwd!l%wR8|Bj@7@=B^~9rw<%qY<Wanv?L0`Ff324ro-j_ZN@%t`{1CaMAJ*q%1l>M<
zp}t9Q{%zL!4)GPCwfk#$&Usm$l@@4geaV<M$o3wd6peyt^oLY;S=-wcAN;dIq+tz0
zAx)14agXywIBva<h$VurKiOZQYSi4=1g!qj|HFf3cRs9GO-uVK8MWo}wXa#QO?1z}
zfCOAAnFWGLlfs(}rzsa?^o^d!lUB7N4&rL!<u7jXRi|lJB&e2EkhY>ylvbEuY$nN_
zc#9VC&3y{5D)SmR(&}1t@#e{nu0HO!ZRrqidx><!h}wPXVc7=VFQTe0^WM?=e$5l%
z#LIVAkvJ)|$To`iSu`pa*ulY9$J-F6-KSR~Mk<qf`(Ny7bAo|>n!oR6ocoq|=LHX%
z{zxLAMO-58f5$cD;)HMg>BuYt^thv^-<AF{3NCs;skDABi#%IyZcW=v4IpAMI-9kg
ztHGx#kp;Uo17mfB7vSSs-N;#_@Z$H8y}73BVli*~lt#f4NC~6rB1S)>Bnr*S2Tu7~
zrYbY*CRCt>FSR~}YMrM9VIEKL&dBZ8KCB?OEkqwy=?VwYLZ)0C+FhEC%aWTZt1sWH
zQ8qH3QW=CQhiFB0vHj5!mbE}6`>Rg;Tet<g^-HzV0`BE-0mnBe{GAGbe`@aeUsfVM
z@m644d6UYAGvrG8&S%UXsH)wO0os-Us<!Y&16;S2foK}G_y7qG)Qk0G@Ly67mgdg8
zl2G4g@|IO8f_hmVvfOH+pA;?FL3o8-CD`&Z=K2S*-v+KyBXdh*hvI866OaG-aJuRQ
zr@aFi%>HPFlc9AFIk1WvW|j>pw)dc82MdcEP0PIp9Tf&PQ#z<@ddm96<W(Bf^9apF
z9kAB)#qI3yH@^uuFZH^o)Ru0LpG5_Yo#l8g$v?>D|5`om{#>gjEh%;!$1U{hAb=nq
zrAsm4)GfjgKUAk?B)^)Vv1|te&2r|dx|*NbtNV^QdJ%duwRx7GrU?2)dY{jB+dM_$
zkNDkT*+CxB8a_w@^NgYy^8wa9UKosy7JbYdvGNG>V9_<o!m92S17g4a4&^xHZQNaS
z42UnZ<@_NJhegIWnMh2G^)Sngcw!lte?ybPHQZoE0jB%L)>r5wQ9{JgNO@q%p^ldX
zcYo&t)R9JgX_{FF>FC5*KlrWeo&B2Ty9^CmisNb@=ma&l`#ecxK1d;6>YS=B2r_<2
z5MHDyN{~_3LBewPi7m=yB@}wOSzaymf`47C@wzHeQ2px?L0GrX5q!v#MLDAo3Mv*K
zMK|<t%kot0JEhtaYS0&hQU3;=J_M>h`%n^oZ+7Q3<jQ)<l7aIEVbt+&cK`s<;}go$
zd?k+dN4|oOZ$6Q}kEtA=yhIKMt%1iseOL?%`G&?9jee(C7L~E$y&W}2(f2kK99@rw
zm}k$5F?g6@Ue@Vxe3cOy@8H@dr}v3fhaIqBF<%PBluEuVw8GtFuc5aCBJus?_%XY)
zby-D~#Td+Z+UzkGI(6+E)U(1g_tHqA#<n?><l;fDo5lFpvRtA4ol=T=(GvQKF5*kX
zvu%q|zbShhzK>4o8}OWK$C-6;A+M4V3ppx(N`^lz{GCd>`54hcBwirkbwYN@pb|)u
zE5Nj}W<j4YhZfaw6r9g9xt5i`hv-O<HSBrFuVFhplj#old3t!L#NI*uK@G`DG{i}}
zqU`4pcw798LYJ`zsHqonP|<3j#fgr*ck+bxG=a_)xY@<}ki@}Ja5sg^jdm<V*R!C8
zaEkU%#ev;MK&JTWARQ%8pr9S1`5G(N^PqMEgFUG87P(`?=r(2lH9~h4Fz7G7j=P!`
zV6dZ*sV283fmkL4;MQl1WcZ!$bYMonM}Vs@%vVtl27<NgP?)v25kRE!|EgUvk_E=5
zMW+;A^M;I+@Kf`#C6|Or$puWlUh<sci<ft%>>YoyURZ9FFYXFl`>guNQQX{umbZuN
zp+R$zFGWO{_H$WP!R2OCmw`%U+WaJV5}jt0dbL%8&NPVQSx|j~48|YIqP*kFs^PLh
z^k_;mw1KuPXo0`5+Ag^c`x>+{r`MFaN=d0Oe&jQ^T*Qi6#o2lKBcQ2_LPiTz>|-7T
zB;rv3Xe!v>H1-mMqDY8n(8u4?_hm3UvlNxCjL-I8KkTv<rLXP4Zng59Md=;C6Z{Ye
zE-gMpK7y29gOOPOSiFd#q_+7K_0Cx8^KpY+pjD}KVDpJ_xr>J3C?js^b^^(+IYxap
z7nk~0eLVpB+tm5U1kKgv($0IJ9UkiGKE%0D4){G03|XzgSf{!B$cHy!*r?4<?&BW2
zON;b&g<lucen;(TzLqJv#*D@=jZCzDqY>!dEt{G*sg+~RjK806kkBhM|Dw@0!J0Rv
zhLE3z2rtcT_D0V1bEuqvAy?mll4&!m?c>TgbZzHt9=1u-&(Ux0Q9n;ZS{^$61LXAV
z%ElXN{KfqaaB1vA$!KdauVfkYx`h26KCjlPnIXNEpNDp&zn^a%x>7WP)}r=@y-Fz{
zvf|`jt2WPnN1s?U&zN1NrqlBKts^$y9wB{`yCm_@%Cf!WoXe1^fUgm<`A5wTX@4qO
zG|;Tw_Mt8IUj&NBgq;6+BR*4bQtw9_ox0E#e(jJvOdK`lb6dHBIxEQdNjE*jWL%0W
z4MCILCSEtt$n@Tr$U%CZdbeL@^bOiJ?LKCtc&N!6h*Ro8DhF5xn)jI8OO!NQwUOiL
z^Thc`movELUs0`QN4nY^#t=|LY@m1|XZ+xO$vqHmmH4jDYw=$M(GMBEG!&rzR{2xP
zG)2C*D_DN$nY8{RoWnj`uQBM!gYZ{+zL-(Np#XMZeqQ!D3V=-)2Qa6N19)`dfm{Cx
z1fu!$eAGPsfVp0S2NSb@<8agMBLKm2z6c;id0{+xuoEw=zG{ZSf;HXc$9d<T_Mox8
z^Kr^4$16KAuA9a(Z}saP>z&A&-{f1{hbevu?-SJDcJPh(2Rm##i|FUAYuqMJSbWqZ
zc78bgAhfT`pdyW%&}Q$wej_lhoS-BfoY68FXgugE&Q_+)a<9F4W7wYlU|(oUjG?7F
z+GhU}HMA)6yrUOu)l#OmKVi}%xfPSj2;q=YL&F_s47K_*9@W=0W#5-)iL7dGh1l~R
za}Fr07)?|gynpA#BYn(>B4~`7dPk@PK-DYyfqyYi;EasFvf@$h<ba*J)L_$~D%p;L
zSmM{vx?GT_ZC%P9L2~)WD&275h;d`sz_Oo@co3$Cp<`H;Xk~^_aPaOdK6wJxCL~P>
z#^HaAzQIqw_Wo_159FpnJPFt%edr+vkwrdQyObgNC^;s$Sc)&>+JN%`)9ar7s+q>F
z(;?7p-9>k`I_4QI_kPpUmj@9%UTDs1+9?8b+C?bGb7hW&@7`-jUKwt)5%6Xdki_M`
zU93{m<92#FFFWD5NZvTzn|{wpwBX`qc`HfTe=dafWNRozV_R?`%D#vd72>_Bs=q3Y
zp4d;F$s&p({7U!Lb%rBjT!)muP-wcUoI*>}Q#14vBX!6p+X0Jv(KkzxGG;%xd;hpi
zAW>1n;7uL@y9IFKk-Yfs`<UAs6e>JES7wanSJ`%_;hfp%(;vLy+y1aE*=PA8Q)*zF
zx1bV!%H2F}w)c7c$VZk3YnEhzkY%d!;#uu<MaQPMNyA-RkN*sg|K4i;N(Ik5&MrIr
z*LPN;NF>B`Fd_ae0*EOJRak093ZPe?2EuOaAruFvROg?7@wm6%5o<{PV_a4nI)2H_
z7Q&_xSI7D(Y+bjUXG+BU%Q16U#4AdJaW%Faj+l<G&n{ZhfUy4Zq(+7YP`|VUOC#9n
zNGE2XS8L2IyAgZSM#l$z4pm4slkSy3e>TGoV!TT4dusz{-Oz=@&my^gTfP=vHpN#%
zu0_(T&^2Gdx;e>0w(*rOPDc$-oah;?J(1kZ{3&`AFOKE$W?F0bxGmTG4o(w(n}ni~
zQ6VjpdHOO*hJd&Qf6PeCnINUCYvLPAlKTahO<}bbrf#Z5n=nJxUe!y-WBhS?z8-V;
z_#)Nh^C&lIrd_R-xYUN`ilg*W=A>k{jTA=$iM@8j_c^-!W;t`#2RUqVl0#p<d6WC6
zh2;DIle|xoST+AW-Jd$tHVwzAOm%#Aay+IB9JX!uH)0_3@I(cR*FDk^{(@sy=Nd6Z
z3jpnr8pWJJV#Ol{0VYJhQcF-bM(mEy6>4_0gIh@X;v=hik9F5~>~2d#j!v!?A#?i7
zH)>OdzE^YFn=kT8DT1xgt{*qaZl%SluBPhBT>68qgw}Fu5W2hrw~1yb`m?)^e4!xm
zJHAPaP{v(=gn<rbgMIJ>Q!0}ZK~_+0nbQ2*6_QG`kKT!6@2&UTTRd_bpb|r(XV(`9
z9?1)N?QxiVt2@!)JE3@q#JrBJh2DQOZd+<vv4^vj&+pp@Ae7Wwd+;MVWwV5!@ZujY
zV9anpN>V^&+6nHFbFwr1w7&jF_WQd4Qg4Ir5?NtTUB;3yR*CWNNQ1`IrGVr4etI5j
zmKZ8yC!4r_H&WkQNP2{rV7c!1Ro0bb=gSNq1s?=Af&W48|I<;Q9mzW$aOm{I58)KT
zrh;ml@9@`kG=Kn4SOz1>5eQxpK^Rpg5<I3m&qw&Nb7=ECGA?A0Lj8(Yojb6_U_YYX
zD}bhY`Lb=rcQBjJyIiboX?e{!=b1M&Kp|P*@tF!;8SP~87>c+xVsRDp1`HCf<AN3@
zLBWB(PUhvQ2+5@+4>oUKtFcsSpFS>RfR;HND8PhnE_(rDrV(1|NJ~SG8FqK2a9xPR
z8_p#;5j~WnC}qriuXPO*P3VHRTv-Jhj28@!=jedbL7R@3^DWoZvMR~wM1|Eeu>?1S
zzBmeuveK#PrdEsY2;$sB$<WQ9>cC|4_Dt*&`v*peW`(4O^;~A=%eLrRf_o#2ihd_u
z!;c$b<a%^_(#&LhuI#+=zZ0x7y6>a{KXwpeuMb*ERs<9GoQIQ`MR;$5OeR*SA!*pI
z54B9Q*k^#85bO;Rr5%xY8urE3k0A&G>M<Vl;9Oys*GP@;5(zqXCv}5)9&Gn~g2LEF
zV|J0#sqq+6<bNOyYbDdRlNDnW$BZvy15JCSf+NL70w{Z1U&1>e`yVA&V&^H&7=Q>#
zJfv~eq*GMgOOec&6+zRRtHwH8LR<YF5nQ~H(OKp`fiZlG7NjHGTZBoye1hZtE4u4}
zb&$LcY(895$(I>iCce<<e?&`iQuv*vFYil`cFRvLyfsJq?W1is_VQ213@J=8u4#HP
zlP)X{g?VUpxMNi|eS(n2FcLWiwvM~l{lB%NjcEKvx-1L;NpHGuj-Wy5n9>xQGeh1N
zwZGjOMX>ll5$<N9vy~&HRsnDKp6qL#iH6a{|Cr>@xckpsANw;fM@LfPg#faE87$7Y
zaArSW0l_DZK&+E-8i1G;3gAU7;Lh=y8#c`e;K1yRW{h2EKrm`(HCGnV^F}Jfmj|Y$
zo}Ei8!}9qtNw?yPdeU7$ak7DT46f@Zb|-A~u8%<|w)T0vsBN%l%3ya}H-?RHU^IU3
zhc&rE4Kr2;3{-qm*4r(<soRBbTGulRPa>#@UwFr`s%sz*a}><+%>16{7iu-89F4V(
zgL4n}U*zBzI_FOUj(^8?!2B@(QEI-vS6Rlof^C>r#|a;3$u2FU+Lu-01EZOx%#$Q$
zS8;o`W>_rZ&ts46)o3vQ?f25A?ya3vRC)Bx{5*JwoF;@Gmsgfi*ap95(eZeg6|jtU
z+g?hvj&l-mS(3wGD8}ttOW~+i3-I0ve!cn`>RYFyoN}Y20Wyv^!*}n9tIjW%GYbvi
zmUAhUtc1UwO0X^GMAj>I==0m_pzz0v4dHjE2?(TA)o|&`eB#cLPqeOlj23Se#A3?e
z%__1#CT!=;@^k6Vp&J6XRle2st`5^o!3tGgl$O$M`Jkq51F_;8`^|ObT(93wqj=Ij
zwalK!Hs*I2ID`69DJK2k-&}H#6{fA{AfcVkD#C+%rY$cMe+N;aBHix2b4}R~7#Z?P
zqX_9b;2rDVKO+Iq8GmUNpfLN?{yIQqGRm=q8&fG)!HcCe+slcTUUAf?tdiH%?SfV#
z<G^xrOXejC+{N*S!5KQB>8Q@71$K(#e*pBqdRreatflF__WN<`6omrF&4$71QU(aV
z`;I!LdPOA#Xl@6A_zXB0&3=PmK>E#r0E!^O2iULyv6N9|>M2Ytp0ydXh>9jmAab>5
z$8y7HW*lkyeue-VgSL8;8Hq!f+xyjSJxS%uh<w4h<-h}))pBM{2X%p#3W3wRZ_!0b
zEc{s;1S9z7A1;&#Y+4Y|Nw4$^^;w_Hc(!uwLT|kgqMjvV{BWbvqX6%*pyfjcYMXpT
z3C5=np8l`&AqPT(!cfC8yt=ZkPl}11w;q8hkcla|sEYS(`eoei;U*5jhQC%z)EF|Q
ziRTA@@3z@kW}SZz8W4Sa;~&5|8-uez<(}~<aD^OR@;>1b;t;Cn6%)%FWRx5>9U~h_
zs=wLBMj977J({Xic+lSWY>!^6gkCv-CYD+D-I3O9qilO9ZU9?yDWpsrphm$a#lS2$
zSBJytGx<8eBLdr>!e-+{%=Ps6Lv@lj{Do!-`52?UE1&henyk$wMhUh%<`tLiGP7V|
z!qpLQ+kuy2JLXCK5bcPSpI;l~i$!s1K=>irnvAJ_z+Ht!;EiGk)Yzi!dtS4Ng|sG%
zAnS3M%8b5q(&<%Q#9t4_GCM@}MUHbdN9>0Z$5AO4E9$XqW=7V48olDS=o$^Adq!Dn
zPNX-wJ1t`6RGeKERk;yc4z~w>oC8oH+l?Rik2yFG<PuoIc1rbou_k(?^yt5c?7Di2
z1JSoKyGeZeIH}OowztkEh_M;J%CbttI{a%L`ClJ-&}Up(Uerknq=JCcatNR`PHMoY
zHUelU)HRxt+&dHY|Hh)hlU>@h$W0R%!c8ryVjC#qeSHJ+cFNW-UP86|o?#?5qtXX=
zR_K(pSa0MHwJ7{9w8g3}6@H`hmI3@qXo+FbF5u0dx62eZg0oe>AFNgDCA?TvwucVv
z@*oTc{L#=_q|-{~I)1aK)PnwP6Ak9CTKKC;ul)cg*6!I?v7%aPQuDjKj!n{CTt`da
zwWRl)0vseVOYR68QB`){7DW^Az3LyNs36UKg9lH3rS@z1?&NkSHEi7owoxL)_(~wC
zG8Chi?S&|2h)-STFm@jf7)8iz5;pWZ%1AypB3|yLuWcGw>r8t@2fjX#%tT2{8_V>R
z%wIoW3a?XtpiYB3v6`|!cM-#e#NT{v_La!H_o!nzFcbB<R7OH7<CmLv<N7}87$37V
zGIF`v4;Cz$Pjd4qc5scM4XO(IvW|xVR`APHjq|3e$v{YFk$;2tIt%9|yb_l~LfYPp
zpKn~(EeP^%Zp(LS1n+`c{I$PtOU_@IY(<2Gy~vTs)khvfg7K)q32#u`FKS|S=8L}L
zCn@jJtnh7h^^=N40Nuq?S!PYThj&QEXKOr+TG`39*_6o8`9$r^>V`0~;ny1ZbL)Ys
z6%CvE)3Y7xSN{tK|Kj`WdNv@$V)2XVHriaAF{33e$Eop7O<0=B84J!#OAv1T*JoHl
zk`NaVi3yjtgj=wDj`z1y)nI51sZ{eczVEtZOr$~LeBJ#HB6yRtG=W<uJcQ+$sI0uL
zp$C(+;;Vm-%tfxgul>pr^Dt}+H?X%cd|WVvdFFc#>h@eU_-RC35y#FT<##(tI-KU)
zcoXooHk&uk!MdS4dU_usBH4Y;ud1PgFO_duezOhgiJndC?tySyQl4QT6@z0jUq)Z%
zEn2ejqp6IE-VQ32Jj6S9UkdjkQz_`>7OBokrpIVj>~;P|AR(G{XY~kGZd@?+jl5Uj
z3Ci*M%B%;M^PBH85-)S&F+*M8bxTp{DI&ets#8i-YV?lHA6OlrBL$$9s{xJJ)%bJg
z0MKyGh}^BUn~SWe+tEmPs!i28`9A~zNiJiX<Nos>GUv5-FB&ytdYVZ+*AGQDcX_$h
zi&*`-dq&EP3mq9HgVb;w<v)jbI}N6*Flkv0dqe;cQqAzSpx(UwIU8s;KF3W8U5Aei
zSPz1`z&X()pHl?S<OTKhK>pA6d1R~TJ|{MyXWlcpObp{Y{CdETM?$nc)7m!0-`2*S
zn~DWB?Y;6B%dgcwzkvJ&%k0Jqo;AHB{^dAZGLW{4S+2})*ID@*l^RrUj+^Ozcb~Ei
zS1*+d4qYC7It+gfN&Ls+`p+^5+Ym#-%wPyXme%q9IGzMcoe|;#%DnIaDI5%ddv$!k
zxL6b@G_M*ipgIF^auALHni!`sD{6N`h?dL*xQ+w>(Hwhr$LY2LVF6buHS0VDCn3XZ
z?_<1vYEj}<2cvKLZKI;juwSWAhX!3JBUohvQ5M8)4j~vhhgaK{b%rWG0Zct8d#S<<
zBxpi*W|DSgJ(`7rLqc^*DNSwfnt5k~NIQ85i5g59w!GYIWQFNtWi$g#e;+mgD#~8#
zI>kf*b1zCiwAq!_7i2BlSpMez;i1ncais1b(>nA=9dP;nTc<BBnhC?1=A5%b!iT1J
zeg!Qo$TyQDUS*C#m9{!)lM*pNGw*^MQw5oIx09HTp6L^}eoq5NGXklP=`BDxpUGdW
zqnZ=}rpzfYLm5K+L%mcG9=l9Eo2j!HE+}47)+BBxU%IS{>=#x*{m6*W?tC3wrRmkz
zTWLyg-4)%XSR+0!wd@sm+5HuiH=Y)c!DKJ5N*PaqHE1oQCmxLyL^_QycIsTY$A|>+
zOfG8;3sf1nt10wtQEV(8Mn|-fh?t2E;@c7>ed`&;>dxcdY+Pab^Yn46=yg+ujaX-1
ziE&f5>ergT;jFIdSt(K8xxs(3?H52KEFGWQw$g2jhJjNgZ94CC6Db@jTE0nqVlYwc
zUXXSCL-3Un;b2H!7b$mDcB}|N0HFm^kS6<6XxAOEY>s@02urqzWLLtW$+*&VAjqk)
zi`N?+RIyL<;6MeY)I)V?npUa&%R~MfQ2#<%$dM&={+(C9|4VYD1Q6haMFG%k3DbRV
zf>8mYb9jKq&NRSD1&s0h&_D>lAru4&na}}NzM%q^Dd5O#EMdTejDULDX$g^2<dT;b
z;m}nq0QBetsx}D5Y$-K@qZfR#ry>4?7hm92%A4-y;H#CYlA0wWM8rlmp=}hTt3ODD
zT{#S4IkB|e*KPD<F%BTx*{87{Pkd&(Rnq64lqNlEiKaej2OISir7>J=hikv6IMBpb
zMHyVSqTozY7%EBq$BeQ_A-_uAnMg;x<uX{M47np><~No}*Us$2+F4b9mA5yDy3DB#
zaT|<e(vNL4jc3m54`^#D01+bWUOD>(yp&cSnC|q7T&Cu>{J5BB;_;mHg6*b7!?>(O
zad!|zn1V29QS^}Y`ReDQPawBMa`R+n{Ks0tDbP{>H<z9SM-bJ##P=P`y;lwIA(=Hh
z%slbhU+mx3L&v|rDzx7RA9pbLV?~4z;wI5w8}k*n%I1$*eX-_U-T1S|-6Rl{TEHwD
z;JnLc*uHF!=kh7xy~|V0Q8!W2pVuMs8#3WEFWm<8LVu+3WQ}LCYTOz}C+epE)rjOU
z{@~{nsm*M0P+rT#e-e&rzdkDoSSx>VL2LJ8!$tV-3kxJ9JV(=6wfqx;$K0FRUZN=1
z3wn*dW8q!WSK<SQ+`+oRv`rXN;GaDk`rz5m#tW}v{JX`r0fHMwq~5o#jF>P13qM$Q
z6Mdz~Pt}7;`?u-YDZ}+9w`28zReersT8J9A8BBfLjB<US$d<gTA-(wN;Q!{+#=TN9
z^ZdOP<Qzz$Qn2_NE_ZJ!9iW_*E-%l6Y{Tv{7CHjk<pzQ9qK+Mo{KBwwwxvUZcbx8l
z)7b;oePU!pt3p3y**mp(_VE2S#f5vtLWSxV{>xTVb;bxczh+t*DX>$b0{}6iaTJ;a
zM#NTfCrWoItfsOGeUo%5jd|Nqz25NiA#k5c6vyBKx0o>NBNxDGj|;_mYgy!1MkCy;
z-+65FSOoU|ADDo;wQ5jHwk=_(((`5N_Iye7NbfQaG-u2VGujy=4d3|e%M9z{1oKN|
z5?4C%Ivg6Oz2wYu@JNr1+aiT+k=*f)c%HSu>)K&Y(?0l4zQCdf)SR`!J$7PStWhap
z{6@s)S!n&O8ztCf#o-Dr1{=dVta8Awv2X+|m0a}pgEtkcO4z&ikf1Aa5+n%_tG>ft
zh}|Tj+`R<SXMWgq%8$De3B!d>ELRx1)E)j)n>w~#7b*c8u#*T!KF7_Lv-p$A7GY;c
zfuY0i3zga*{+R!Dh?{s^D~VBM86c*wKNO?A@+WuHua%!Iv`k+rX)3v*ZH|5BoCNSJ
z@1qF_Z9_xQSCvVjXVo@si_U3rUB`WOXSqFeC1Lx7b*s(|fMjO)`^<v$5W>%Ggm8b}
zSD`3v=@n@3n{}jyzxwW+oaXkT>9Mf)luV!P+PjFzu%iSLf?<&H&OFLr5jZn6KO-Q_
zxy&r_HcHX0ok(zZP=#wV3_JOmef`hz7eY4*Kv#|9?lzviXOstzYJ3Eo^ot;X^21O8
zw!d%z&Y5ZepCP;xWUqx2%<bxps4R1M9}6BhvuJ9-vk`q^fi$-jf8-q2BR^|)aB{z4
zP1W3~9w#Hll}Z0`|Fg;XQHc=Etc9zY!CX4@rEqNAvoH_HLZSO$YVlhFCW6BAHy0m1
z9Doo$3($mHM>7m>u&KV<kgEML%V!x-PfE~co(i{ML6`*T{J<ND<N#C~M85U(TLgcH
zd%b(C`@~G~@UuAJrNmL51Nkma<23aepJXqunZ31dx;I;pxZarFr*wlcpG%4@3h#EG
zd*_noNgA$d`RwA;5^~ks)f`lu3O&;*-F>Rf9b(sp>aq^uN=qa^TzE^Pk41O=%TZij
z=fB}bhjNkX*wlYyKT~JeO<quX(F<@ep>|qoEnH;EgXXF*I^S8Q_pdENzMl&qyL~f%
zFA5T*TB9<BS%{Nv<m=gmkTrj$D{?IO?|)RaSEb>L*(ngxYr{a!UJOebHnjy9H@%UA
zO-(Lj?1guEj1O}(sJKfGFr5nVQWfR`L76pdOQ+mB{)01K9NZk$60mO82O)H|af9#k
zN?9+L8=G~uC<ZB?wQWD7+vU31VuJC$LN&oeS;*nBj?3=Tt=+v<G8+-ZEhdd|i!S0b
z&pPg9{D2$Zsi7XI3mdwAu<0^ZOq2wCslMLPk;;?@O>4DYsJy8&WG~IOnfh>8cS(xG
z{$?1|CaV{*#?-Mg+I_59cq8=yL@W}XPF$=kFyddnJgd46?ADOfY%&ex4^935fbAdX
zVSR)n%%a3FTqVByL<7irnGE1&g^{R3av<0g18(gXreVD$Q?@doxh5XeGyM$#<Rps1
ztcOVkW|srJL8EIGe-`7VjxEi;Lueb(#HfCNoR6hd69=ZOqezkhIV`?k{I<pWz49_6
zbQ)A%mlPsk6<XVp?R=!e5w9Pn89c-&umVu_C^Sxxlg?krgLGdLKA3ztGy{pM7}0i#
z62c=6Yf_}}H1^*QuZHg&5E|+?S~^50>L4avo4^xeg|j(VNQHb<ZAvf8Zk8>#f9tUF
zgi3BVE1J|FqRpkGwI|3#6mCISoRJ%%;Ui{D>vOqej{W_((0$dj4$eM2o2r(=Llx0l
zmIT#DcggUiH7eAirM2)HE5m|_&$XR<UDGTOZ{5PT;HDm_*?YkH_l^==(N6T7?WXUc
z)SMRaqzd|JfAN`iffa?6`IO`+rUdBQNsW=s?mcF0Z`TO?A?P97f_mgzI>9!DfwgdN
zsgS7r_&MZW(U_xSIhVawx#;|gibE|j`#0KI2NXgN!Re=1N%e7AtP@Y$m?@u!1S(I4
z&Fo^`Lkw&G+36wOVE#q<Ys6Stg4?`~wZcn*in14_*U07C2Uf@HAL3wM?K=!B>CSsG
zuVF5vvU04TW9|Lm(!rCUDmf<@c+j5A7}{f&oszqz(J|3vR&!UvuG^}nDz&M`rRAA8
zG@#lKu&~(+Wo~@Yh2W@V^QMa49r`1ln@`{Yl{XPuOx``_gXHiP!hC)2<=5(j4(Fp>
zSFj*2B!~LkMD5hc6x*?x=-qx158h<_K|0qx{HFx|cb^-7*3QRIXAU*0Qt$yfu*WrR
zYA9r2-vq$PIm~aIeSyHf#$@&Yl;P)uL-}XG_VK9~H<gW@&feLV(bUtchxLr|j*5V+
z%VW6V|G1Wq61=1B?elAMq2SYKjV+QgjSB4^5=qOSxedp@6T$juDCt!Nk>EHU3{l`Z
z?_JnrVYsGNR+$kVr?8zp;es~zlxFm0E>;N+v_Q43h9(GuONsejrI8CbnI3L%@h@ZE
zx{wxs#3NhXs%u|Xde$ew58pLcDQKcKB{Jzdw&HZvYRMS}gadGBy?Wk={V=|gWYqc{
zST9Un0cGzy{>)4y`C|aiQM-8F?$`(Rs;T31$PeGHe;B*7emt_!M&MprIEk|da<DFL
zti4rXDGvaffP?4)aQ1t&_?Oc);EbYfwzBpMGiO^yNC>O;w>QT+A}~M+$@kP@y2}Wd
z^@<vYIS`!szo|u~?S@l!g|~j0B0iRIxp($Hq~q;phDY<$4)kFPIn_;co@$dyt<y)#
z21Yy9{sF9`HW)V{bBdiU+zR4*mX}?Bk-UM=J=F)5ij)o<w?m<dkk@Tv0LKK+G83P{
znr9@o=D_8uXp{oYs#_I}3gH6QBk0L>BOBtK=Eg|UTC^nd-tC_$8d^L>0w--cRGf#R
zVy;ZodhUw5>)?=jtk3F{+-#^t=T|LUpEm9D*`-VaBZuU=QU!Je_Tg7d1;pHqY!;v|
zkG48;Y+McAu_PX$-E=iDb8EwFu+fw2EMq!Rw+mLd1P50!;@6Ac?Tm<Dr+<H0XUvrG
z8|UAV$^WANethud(tq-EgLIJK_`VS06r?LORWiKdONEr60+>)`u*Hdw2%pEN9a3oZ
zw4)HjxSViE2_C)U0mj1?6rwsF<X5)1y+gj0{`CHyOo3rlmCP*s;(1BjNadf^3HODW
zPz^-^B6I%w8SQ;RRkV=huM&b(3ZrTEPias?@{@~Ne*c8DXqg%5O{F7)gBPABmvGt@
zYrGdWj|MkV$al~^0wKM;$(ikiw!zJyEYk~5y{q>J#l>6_k`&=RjqIXHkv*E{Whf#p
zZ)FN*XE!`a!i$dvM1-rIu7vR>tDie?Qpn#IkbonDdy?yS65#pw;{@{?mo&XaWf(7F
z&J7G)99H}=U2SjPx<g-r+D=P1#hX_d8<d<0O$^;0x_PBYR5g>KI@(GwEL>jI6+4jl
z=pKF8$GVnqd(HTjEa0@CVF6*b;qEAME}$&#TbD}!k1YhFJ;zVKE<iR(kL$Qbmr6Zt
zqdx3z3nBa4;I8FybMxR|rP)IWKGC>Lbp{}dcK3sys0*6chjER!?s5hX<@qZ{Pd;GK
zFXnqS%h8lPMvt0Kc0W?s5OVwhQ*9~YS~^}~5G1v7eOkP3b8EhdZIcFGBo_C2H0Z7?
zZh2En(V69S{4@%;j-+H6G;2>iB&_OX6L!`tUi&c5opZQg0WiHtq0%QviA+9YBqu6q
zV#{-7jlF9%(3?${{UY(EubDn;ZT~%a>aJ~ZrsM{cy+1m_MdaS(Gd&ML=|9oE|AJCl
zNUtSKnur)&!(F~*I!6)Sw&Q`rANUXh*<~QFS$*I8td&F&FPM{YH<3%VmWLVa1g>X#
zN-l@Wh;n5*Mznq&81esg5Y`9@bkvYyHRyL{x2!+~vSJ<xrTYoP>J~N%@hXM(pUS#6
z0rjajhoAeWm0b6^#Z|Lt3N>h^>{##@ZQPD)Z{#)}T2nF$AAjJfJ|I4*`|$Jx1fRyS
zRnO7uk)DVdp{LtuezRM@6!)}cY5XmP>>2P~BGc|}yI^ugb;^6JA5J^F04ejj8DcPP
zs_LD@B4*)vG{!(ffOM=g;#tD@mdx~{QPlS_Fq{i>s!&ql$&A1!OwMPCHv}>YQVY3M
z0HMzab8vur_Sx1c1HgO^M^>63Xw#6uXIhd2Y%A+@LoHuQFgEWKhjaJ&q3ji=Ej?PF
zXX45>R=vPo2QjgoCQbjy=SywGm`Vi?9B?J4wOR6NX_=^<TkSt&jB-=)IiwbFJkH;C
zHd>r3-Q~y=GbdU(=<f)7ZTBRto87eC1nE%G`Y**B<~?x13v*XU0gM{VLdOSwj{J3<
z(bk>XjI|W2m@92z8$8{Rl&R<oyMXfC;n`_I?;tga<UD+gShddz7Q=5r=11Re+brQe
zqJ8Vy1N@v(ut*rV>N7EC;cRe7GEYV*3EQyeGo17>ZXE%8wU;kRI6d)D4L+?59?|W*
z=)aWD49WZMO@<Kb-s(LTaNm9f&OFO(NTZh0=_V6?oI_2NhyrSg_8EQZlD{qEOHN0R
zXH~;n8}uW`ei3HGWU%pnH7}SLW&s$y{C9f7j}HD|l~bRI`@G08zAak_o{DRqjTE-_
zB$}1v>>f&N{`HXSq0X#E6v`MIQvBDR1&BXV;4dgKTzHOOaL9(_r+#ZuJ8;?bs%e~k
zoP!j*+H=E1YI4GZm`ZH-HLM3SxTATj6j`3f>ah^4z3R!C)h`h;y7L8PDxwu=u$T?<
zpKX>rHUOTMX>!bRO<+QZmU#D1c+3D;BKu^&#HC3uhiEA>o%@&J<oIA07fx|BzIN<~
zVr$o4Ej&u0$g$?}=V0q8##iAk<r+XPk>>i72ndm{lDkU}!!@Xtd&hi_4Q`w7bXm?m
z^m?AS4_Rd*wQ`DU6plIT3EMxAQK~dZ?MZW5Yn2&*|HY0IyH6_PimjEh=*4X^)rg)0
z3u;E5ta+8w90x1xpn`F~R20f^9-JOu@#~|WpChWc`i#@QzW%C#cv-*3`-5)(fG)Zb
zPan`eGnNb<IMwN~<V>GFR+gdeE7rj+uWdbY1{=kuuWxuV6E3vczfeIM>8;QE{>2bO
z&?3w_q<D+$eDTp8`P~^)sTRrc4(UI$C*+FcQ@a_DFS`kJ2^)1@LgO*Nviib5==ebq
zIsXq#n+l=plY;$b>p1T<ZT`S$vC@|JX5vsr5%=-&_y_k>C<NuY@)m9Y6E7FvRHr$c
z)LeehWo|D6VOjUG*9&QcD?=n<pd*3i8+XuoJeGxyGAR3s=k5m{9=Fag#Y<R?#$v(K
zOsxx|x4Q^cMLC@It(AGF!ERc~E2URpTLt|{uq7i%i{x85p4KqiXZ%cMa4#X1LHhaf
z|JE_&Q=az&{h%#_V3<JmI);(_;e8k#td!Yc>qH(AcQvI~%S-h5Z~KQww8!v}{kK+1
zqDLCM#?b~S+)~90a0S^Q=Q7&vcQ*mY*%is2?|H9W+9}Oh@MUmxGMHWrWP+SXKEf8w
zG->xiGy+<*{2~jI8sqn+K1+E3<VU`z78g1Dtq}}21?FI8l|>K10JN>P;fz<|CZUIj
z$}t>lC(Z)>)(4C3aCoOYtM4G@f4)d;PhX3C=5rs<CQ=*Zycw>Yw8yKRZN}#wACYNQ
z_~T0N_zpj(jIjY0)N~hz4SgJVc<l_bPuoqGizk@qC)103vuXntNc%0aeM5CD1(puX
zj@lQreWvy!t82Az5TWJOyP_?p1+ob+_+BfWcJqKvK)Z2@zvTyMs__^YS)IXw!^GA|
z9YH(bAWZS6R#_g)>pQ8k&roPH=^h_yMp2pAsuAwlk#S;xEJ3Q@9rhf4PnNca<Z@ol
z2Si|=&PwLZ;@VV{w2SIPL=TbduPwRrQ#h}f>(b)_ed6L=Jah_cW+(jEo^E}qL;iyX
zsNiS6cuLLPe;di8QiB))<5^a<439Y#-D?iO&GIsYV>Nbg-OtY|jN@Z3p63%L#`Oo&
zI|{Tnv)Y@aG3g8{Ddf@6V3I4KkUJYO8L0f1X`EJ?ROa>}|y8Qs`f!S55r$3vUN
ziRV1j`A_7|=<8Jkxk5X2!17q3b4}l!)Uw&WQvmtA2N!&WbE?s{Yc7@*n9^tMGpp?H
zO<^gRtJ)q6*D3W5=8j~;THxw@7BzOexcq;G8DQVUzy9*zcz$VVAc4KGL@5t0%@)Fo
z4O+zb`*>wH219GV!sh!|mWWV6b^HmbFnDRrsQJ&YaO+PWhx%n#`!=s{kIGO>86!S3
z)0AZX62$GA&qI?gK+t14m{HJEeO@&Cm37Zl3|r|XB;F=&x+*SY{UAjnG^<-N12mQ*
zM-sk}JFrZBYxbwdEf|j2qH1;xw+13>3QN8CkO{fW9sJ2vXWp=8=1wl&0z1@FqMLfb
zLK;*fQj#L8*~JzPg$FRr(B1yaJiU02aR~gP3E|rqi|?}LQEYYd6W!6pMf%#LRd#;|
z(F4hkxpY-x5LFcVCH9n?>}j#l)LYQTq$#HEoqS`L5lvaL!&-j>1$TUS-0s^3Zz%?e
z6sJFb)zz21VAdA=vI_5CEFxj-`}A=lN;T6)As0tp-%Ir_1aIlm-*nK4iwS-WB|Y9*
zd4HKE9a8$x(gr*}AYBcPOu;cPdLW9FY3wvh?GtveC27ws6Yx$oP5B6IIWj+exJlA@
zSK;5gd4*h^S4u#Z?5+N+tetd*{Oe89`Xn(iS|EQ;4h>geG>LipvX660V&<gjVaBVv
zi&)F&s1vye(!D)3<YxU?@SLJqdJ=tar&LUqgi%wQw9-5E+h<gZ0@{#+=hqek=N8bl
z{W4{1K^rmcV0UbXi(!Dr>s{hytSWCzc<CCY;Jo%_TAICPAVHc;?7bXM3a{>(5E<g=
zwMrpL)W4_V|CBajIpH4*+wLGuP)&Nc^;q8z@KjwlY;eT}Mk=2Mb`*eML_U->7Hu5v
zb7SIbi`|z%aN$Ql^&GBQ?t=v-TyZ__hrrqPXFPU%3Sq3FXhefGHqQHk7syw6<WGCo
z%?cKua)?b=odh;Ui^`n5eNWp~^R9MJJ#zX33`>Dh!s;)u5JSA&50O2TYZ&pBd&bhz
zm2w-`-;b)3^1=?-0v*3VuYYqDV4$dkW2y~0zjFD>)Hc(05>Qr&J!Tl9IW$cFrJY*h
z$jrfY*h`m#8$UI`6rvwxXo$n#?^eE{Wo<z{SKOj-6hU)=#+}rXSmBOwfvY+5j!;rQ
zm35l;)SWP<F`uWELn3}sRQ*jf?)hD}SoXhf0Z5#a5#&UgSCUBX5H`(<^9hg=>7zv=
z)s!N2*{T=0^mXNS`ei)E$X2ONvb@CWXMlZ~9(~|F=da>pfL}|vyz+2jju65)6p$^i
z<cFP<ewP+i-6&7I1E=;lhjx-538k!Zd290$?%G8hy`}cYoZvR&E05R^?;WiS`_DC`
zRV#;jKK2`4;)e*(Jl0VHY)g8ZNL|`s)~l}BmY**;JHxJl@hk#9Pr%fY<9(lCN0sje
zTP8#vfe*#Eh!7&gmARjBt`rsg_fOrAwqdNFpe`I<FWNsXH-gLLxKh^P9R<pL#43w-
z-N~(o6*Y>}N9c7H=#wKE(5x^0Vh(`3`6wgAO8olvqekU3*AttP|Kj)|vKH%a1A);g
zfRP;+w`D}IvmGG^=aQEDYjq?HF#eUSCJkJrW^s>#hR|QVi_0pnOJpSb_@s0EPtr1G
zhtyMJkT_S)r`L6=LG1{6ijwbgiKF;j!O3T3*W8sNJs1eToa_9)KvmzaBe)+rwo3}7
z-+FBS|EVK|y#QO`LmP-10Ro81KGfSchaR96@CuA<Y~IUKn1Tl|F;WCHtKb8)7VJRD
z0M|P)Ab1}Z$bq7vR!>mgvcWB5TAB(xKZlUwGBYc7<c`70>#9g>_<08sit23q#t1on
zrdVk_T9~*nokt>Qq_Tph`(65#vwKBJVj?2gN4)2j0S$k~uy(<Bkf{arW+yAUl}r?X
zWO<KugZ$tS?NUEpc~b%Pl0BT?88Zf@%WX6BAZ=kUlkdjA&M)=BW(t<u`0hlgU74+e
zs6gB+qv>YIb7e@Mj)CZiJOofhv`wT(lkI<JS#0a$Vq}{-lKR3dCpNJko820TvVapj
zu{sjkXwrf=3bvVxM9zX;|AFZeQt}w|in%IDfSui%vp-nC)qDFIiK;MwSHlRvT3BW3
z@c*&(mO*iL!P;(s009PfcM{x#TbMv_2=4Cg4j~iV-Q6L$ySonVPH+nhE@$%Y{Z*Yh
z->K(siXyDly}GZKyR|nlVY|K*uIn!8^e802woLD{#t-Rr3L;e>lQ=mHBs7Dc6A4Ee
zWaW611lbqs-Wh*4UX2+ZS2>afCFolCJLGZ{yP3d&sh_x<x}YgI(;ar&pxxh4$op>!
z<Y9L;|IL(df7BtZG<d38FaUOV<sIIB|3P#$n-_jwg#ROJ?v5U3dKA@ep5ub<UyFs7
z_tUa-1c$+w)<ls-<D0%+JLnFhg3<m<UxU|(E2H0+!eLEGHnRfEoh9hpUdh-0<2rB1
zpyP}(`%Iu+9<>lh%8T8~S<}lu{6BOr%QTBRcKJ_`S|y(019wGv#gzgndQMI*87g<j
z`Tjb>RpXNVdUEWL??S4wJnS$N$LP*SFX7h+zdO-OI(#gTZ%9r)`LC4r|K)K}pP?_V
z&`S1?p;vO<FWuxD&}$}PPxnrT28oQgLHP_3w)<x_Oaye>N?s7EK_Yd19i=*yIY1|H
zy1klY7LY`hOgl~w1I9nc=V`eJqUPh9ErHSC%c<ZW4VTa8wASs!euBI-l~yM=xOvHu
zQG`FYK}<ls8k|^l>N^QHji+yQcPtKixRAHEr@FRN_yE(E53s#CpD1Wz@c!&iDHfUG
z#aL<)iED1M+`^BJN_PR4o5pxh4iE0q69;3Gq59@b6-<Z&Pk{r9qDp2kEyJen%Yhx$
zZP{-pzHU=w_ZdQ`uw%8=oKr*R<DOmSe=@FLeS{Cj`3`(H^56I?0w3k5WMQm}y926c
z1dFqN{hkx%_Z&Igd+a#<8m-?>q#V?8myRB~=M`L5sw%&+TkYQ}+o_YW#y^Gp=t947
zv)BRg$kds#UNH;EvdU|-`0k|3n}Nq1yo=NW<|(||>Wxs_{F0Ubm>Q0=hg7Q8d6u;0
ztzIR4wEv4=@059q*}(w?5Z^JN2tQO)gipp{ZJ&$WW{MPbU8c8~CR?E|Un{DwZtzz%
z4i7E1Xhp5fq+^jJ3trDvfJS~Z^e6%n8CA5dk~E4HI?vxhl{|PPgX7o4qV8^QC<vnG
zGVj{O2#{`P$*%F}8B{GtX1bdn_}hauz9kl&wD$NlN)e*P7bIBmF<o2jhZg+K%z%DS
zzIE<>(8DZA-p0AWP4v)}QD~c0%^)wUHYx)tUjyX4m&j7RkfCu0p2jQhnTHV+Qh^=j
z*e)f#Tox@bPh;5@v&N;!?8bxIbj&=IE5v43$I9r&<@a^7{R7&tQQ^yD1k26CgDwQ`
z5QyiIO+<rek(b-&wE|lAzwgkIIibx~fTG8C_;&c#T6o!{$y(sc-27j!Nc<pG{ds%e
zTCvf??48(U-i!2xbM85N`PKj;lEHktw)G&ND!qL*<n|cg8&)(!dr75#IUZ5ogi+^%
z@J;&K8-Y-=MY;~qoyAykR`TpSQ*f-B^_f{4=2+n$r53Kht3sXP1?gj_(w@^;d(G3&
z(fHm&?jnWPK>y&A52=h{F;9EYlluSrrumq+NaY0%4!%s`aDoRjSyL?@6lIqCF*>XT
zu^arr9N=13PZST7jC;IKj<`<ITf|k~v?5Ci(yP)=rt8nmZsR*Jb~nAv?fEnim0W!#
zjdbexjSF7FO8@lIMvyv1-VZ9(5HE3CLLuhMt&!7IMYCQlrL4I1<eORzw~w=$Xqbp=
zq_eN)N(EC)`^Nz?2+>qTIh`e)W*iN{T>wO7-QP9qGhq9$xTgS6sV`5|Zi~dy$yX<#
zDn2fh44O^bkNj)XjmvdTS&>Z@;Pk;Z>&qM%J8XNDLsJIcD!i1sxh6BLj5L(Bt&J<U
zn5FtXM11SD=win<uZ~5EyAkbf?VJg`VP`&$DJ;~Cm3RcIDcK`h%T&ED^*xC#k^IUh
z$?;6nlKbU?sRaXY-?ynV=(G<mxy$#^l&s=cqcLAj#e!f)PzBNl-JDO~ArH;cCyz^S
zG&8)Yv-{I3wY-qu#TGi?6fT;-`3g@C6&YFykqBYOf16Oiz4kcFa3ue_q3!3poxP=)
zX1{SETnrxFYZeUDIVjO#I%suXR|&-roRht#AW`x@W@UNh9t>|eKgA{I`gQHRaDO@$
zLvHxxm-0L<bw=9Yhn%N5vw&#KDBP*mlOoFGsj!o<tsF5%$Rc}ZC8xlg69`!@wjyeJ
zO>$$c$hM>ZKB_UaE5AwT9R99lBy!bz7S^U}$uv7V3_n<wu;P@oU9>=-L$XtA6V&vh
zNpzJt2Frnb9;jZfbj+2!W@lX1t45SpytZ0DLAQ8KWDjE>P-z^%6dSF6dfD62nq<bi
z)|3l7uUlXJMvz=E7UEViaNI=DWi#J3hca!Kf1lg#N(jcgbAGf$_Ii+X7fEto-T!md
z@@zCpY&Bs*wt0%TKi$M#C<n4rJe(3LIPc@N%wD5w8g6V9em$8FM|NkE)CPW@7p^_J
zdgFT%y2SgxKi;eV?)8y*26=(SuAZ}Oy-!J@ug9-i^hh-vPNkEL{`-F>F`dX%t(2UO
z9QomUWf+5FJ{P?$<0S6WpP>fh(|7Jkp<@El-_6|x+{R_4?L(3I)G#u06>9k6$N^n6
zc&u_|q`Y5b-=<~PL>sS@U*GYBoy=#I;^yx_lvyNQo+!Z-PYv|Wf36~})|wlyyn-2|
z+I+)E4a@hece)(&Q%Q~xNH5^+Z`*`nW82m=&jl<GueO-yzjS!Wuv&)0*WO6Y!Qr;G
z)tK&@I)x)&XL!t<LYHxVCWl;4e7c(`nxd7tD7I%PQjJ_O<NY~6yn}|n=pFj+xK8j0
zK35H+`U<vWDHrAe=}tQSh@v?BnicfTv3V=hjW5@oc!kv$rb3xJ_S(6AYK;RaJP0h@
zI)B{>QJXI%I#{`DF5QjeK~i2~^@V#%zcErY8zZ4qi^xVEQfg)Q-E;x6;HsBj>^!H}
zc9e9D|ITdBpbqrrnfutHuuN|JgbbO8CwpL?Ja~Ljw$q4sy^&H58~oTbHTR2^tq|CG
zM9<Oiamy*b?@+jHK!|Evqb!RvTdzCTKp!7v^x^uqs(@_uW=}W|3HuGLkDaPp;!~An
zfOy$G&0t5v%<^W*;G+QX31Vc_iinaqLrtevo!Fb2vA`jrBXeWERt26&zZ=h?(D4NA
zh;<cooJ61X+k$r_^smwluIsj}oxYg^UZip<I_<R;+!X(HEdy?Q<e$iP$d>KzZ@_NA
z>W;wjwML&5FJ0X*c&?L>9hfRLhG<jRT60UT{Z7>0H(Mi@4d>VWxC>d4nK7^`H4_F4
z+&0<eZsF@Nybq%O=u?*52GPoFPhW0KCgNk&yx{)#Q~&?I%s$wS?RsY)w3-UuJ3%_E
zO^$R*uX*S$;)d_c>_GjD9*jg@1m$;#p%!^2S|<Z19nd#Lo(3hM{Op(%wEM#*CyA{4
zREdv2lK8i`Nu%|r@LNG{Zc}CDP>JPNw}i+&QZ)|Ck)b*S)uRO*+A4^gCgsovo8r@|
za$HF!ErEy9=e<W&v)~`sbgH2ZyXv}0I#pe5evb!^eVkL7^5(~TUbYL}HC&N?W$#Jk
zlzTvKb;=FXL{WvO-5LgxZT87KgUvU}(hUZMZrG3*>_vSp+LkuyCmrx#Q%B#%PX#p?
z2VuF7$El9KpOz0A(<B{*tDFV)XIT;+*Z%ojW;_CxumZ&q?o7spNt*xer;FtKVUmFB
zwBe-XTTxQh`DU-N-CsgR*`>-%S4ZENs*|KmTef%oL0cgbsikjou-flD{WT4)W3xZe
zO9GsCYeX~~{d1S>b(R-i#<0GID~LVCpBCf2eb}we`V+p;twga3$gPvr_O|TM!@~`W
zCmq!O``t5Nv|r_k^yLl`Abygtid-Ws<gsCqzNV0LE?{0FztE(r%nZHg+PtD+^E+MO
z68X;jS5r3m0)Ewi=aj-CN9)WbGcsjfmcxU@UXPNM->M>dVQ0$oM&;H|3x%$cOHq&9
zy_R(p4$m%&vc|!4>WO-UK&--K;<h@P)eU>IiTV<}bkeE=)ay4Kr9f{jU)WV$<IUQx
zG}4GJ@u8*hB4!q9s$?D87v9RRyB@hsqKKLXX~N>j0VgbxsTnRlg^Aa9rM}Z|fg4W5
zTGK<?9{*XtdW~on_8%8H6%&>w<=}=lJ>x-=HS2+^5kL%x&X7dPe70>CD744tvT?Hq
znO|Km@?inHd3SMA5o9Y?$XEKeTeD@&ShJzYNDKE`#+kPNJFx%z%W>AED0a4u+Dw~<
zedCL#<mfU9ksc$+bEGE7Q_z{h=&;8EA;!aaIADq1s3D2hPqPUQe8Rq!cQK9$K|rC4
zdy4yKlcClDq?M!dAJz`*+e}Lb{qoknuUI*)3&T%*pZfu_OzD5#uuB<0ogHYITMwLx
z8apu!^g}(78$-|*5b4}l_sTwkb8IOqR;B@C)Qm853&MN#ojywLLX`6y{urEY%HIvI
z`Qz{$iQrT%if3~k_=ebMnF!}qX^||FR~D!xPy526&y}+8D1R66lsG(b5FU2>nndmR
z;NrqOE#9uKmxF{|{_1r5Y+l16sXY7C@$P$tUiKSNCo81P$G6lFG2x4LfnpW?KKRn0
ztdYM#Supt80kpik5RKlTq-VR>nl0abA=Nn)$^47D=8(>-<=chKKUd(00^kgBu%grt
z>oRrhNY`cfENd4hLbm8FHHLTC*2z9|^hc#|$$Hn-EsR06;vvA2=NMDnTZ)+ZvTT)7
zhWKP9DJ_C<aSDf<K}F_qS2xLgDpC0|rz4C^v-O96A5y8<{>|hQtWqjT$K~HxU~#(j
zO%s#)q*6=VHJUX4s#_fi(i_R+2Q|V!u8k(6qx8+^Y6_v7JG?54>T-Te*+X!?2bugR
zxZJN%6!YQhPN<Kz7^-&IgGedI>o3kk{BtICv<uY85Y79%Jw<(f`>!!5WJM;W@pBH5
zU!Dq~ZdTr&XADQcrZSl>@GQ<-Zsg~!wv@BN=SX&T5<bf|?=SCLeDX5x%DXedAa0zX
zeMu%^M0Q18vykEa60!{Q0<R|RPOGby=E(EAYVFJwyCg;xrxdXtP1><NZ`AVKH%Xej
z)YV-=BBU-KN5~wTI^q6>AD=aHc=n})$#p(UrcnMoR4C<9l|nf?y^ORAr*54_D+)-n
z$Po-h{y%Xr^mMhm77a6K-G<%Y>wT-4I{uhUyGpBl<I>^KIi(7BN{_dTwVrZt9Ismq
zco2naIWbot3-Xgq8vi0hL->5)kGD&d$><CZwtI>2%L|0em)W{;?g@;G?{rE%w;yQb
zldO~_$P?}jdCxRwGkRvUNj4Xb2*lfZw{phM>+Sv*i93ulU6QZ^S*;N`KOkV}R;1=P
zX{JLp(SMqT=y3wT$IAe{l+00#OzupvO6cx=E)c#uw|GU7af5$_od$kswFEn`>H~rw
zDO7SR!YD+72!T`hTaeFRJYE_UL8z8#<WIhWBC&m)fh`Ye!n}A?&~Aj0$DeOO@+tM>
z?BJ_gl_3mW!xU^Yo|h;Ol=^V^??***EFBN>E9<a~P>CR04)OX8_TbJmm6(t}ulV9F
z<z{Fd<eR-vSFm;8&^>lbTBA($z4pHRf%C7F9b82g=4N)Wu6@T)Pqw+KzfzQ-EO{1>
zv~Fk2A%7Z0RYPw5?q99C5vK1R4oouZv%F+xMYv|kZ59MjuRpK;<-zYR;(1P*7a`1c
zc7r3efzL=g^{9~d%j|6}li&C3SSKL)`Wq3|_8%d?#ogy0UB%2ego$XzH5&<-pTlok
zjan^Xk)upHBHa@zyvD9%*c9=1&(4-uu}tb{oU-}*VQFt73H3FChIQ>SHQ^-V#<`|_
z9Jl4*Jdiig%NT)0cs$dpg?hzTFLA8k<a_5v%heeb8@|@AG%j+?`F;X!bNP01R?GTs
znWWI)j!`CgruxT3c%PEzgQ$UihP_7K#tfZPEvCy<i#vhkf$EmRcIHZ9-=lo224LUJ
z!yr~-^;TfCwy}Ojf4l8vE1Z7p{*kh(h?9;2TfAo1UZ=ab?r@fla@B^=riZ`1NIOhY
zToO0jUmYQitWvuJxuj{NQe7g?GeoCRwx7`SK46tTlo|<kFCNG~e)!ANPZ7t-?pOEF
ziUARLx)Nq~H$EL%)uwAAa1$l5nv(DvE*;sf?G>VAWt2QA+Hbs#=kPT9VdVs5wsiu!
z4!g!?QiNcz>i^NQ;+|h7y{Z2|BD>zZO)zx`QeY=PQ7p7M_jnaVERQzOiO+v*3!mN;
zZjpP&PP_}XJ!ij>Da$JJ8q#WFnbzd6M-s@%`n}PGvFp6{hs+vWwfy_F7AG6-tg%Xp
z|Hh=YGzYNCtl&s4ov*LGcMl_<_}cU`$>nmeRb95bjyFwkr*C6FyD5eFzaeZbe3Xli
z{zqaRtu5h$HS@hOtOIpj+A>ui-~8e`*JsgFM(CmpZ1w+%chwu;4Qg8k<1emrT&MIy
z@|hIZ;RoTYeh^zw56y_8y<ip>LC)B^-0!T`@KvnW%34M`Zs!hWp4hS&YLmxG>00#6
zXK6?``|v@x$H_;G*gM*p`^f!UM(u|dAJe?)1|WJn|6{SgnH|y%L(bb1DT%<K9)YX!
z)Z5M0P0OIeKZ$_KS$&h0&a56Gg1k0z!;lTyl)YQh4*wDD8E!7zG|I<Go1jB<<J5(_
z7hiNvph)Hr1}eAg$(c`NrELJC3_+ebtI5g~o>12i#)}hy_|2PC(aWQVn$U#aD8|dV
zM9^XTKESWIFU7b))vsd%HggtMGt;^5SuGHc;3HHyjNCk=Q!tz|&4Ub1vVbD6A^+K>
zhXU(ZCCFl=i{Qe7NOL$DiX^h&r7uO>XSPv28`RY(pgFuw<?nRz#jLMzH;2vlhccHx
zc$h(m!^epuBjw)8Y9;JH4`nq)(_8$0=3A~%)W#0LnY4s|x88XvbWyuF9ScDyU5>X)
z-SCSlvTT}4VEnD#{^!-3XyHW5jY}@nuIK{<c};T%kK|6uLsD5Iw@+Dt&+#QNK2BZR
zG8=o$u_rd6Ru*08i*cD1<0)h}&Z;Lsi`t1`@+5^r{dt!5e-cX=UXN8D)smQSA0>-j
zywWq>3oBYMo+@Qi|GZwDK|p6Ed#~lsG~QmzNckBrPiwp-tGjMX=4EjG>Ks!dEt_A>
z*V^fZh1IKah2^Z)>_Lz>yVGOfRIivq2tD?DCDyA%zPnu?@3`hsKm3ESY4DB5@@&G{
z*B9E`0|!#MbT=I3nKs+%p<Mp7&)miJWlF0664rUM2Ath^28jJbMPkTOVbUr+r^f^C
zqnlC=zMv0u^^9Nd1_654r>#p0{aXFBNH4R*cKEZEQws)Vj-gudL5izbT*3HW&+`qg
zXn%<mR>p{Y)m$NMto?A#37b8k{fEcgBaI@?_3<yfZYA;kmqu3JGpLPb47Yu@s?i3L
zWfMeKyMMR%w_~7jIsX&{e@Wgm?e!VQCfOtVfwVJt$<Cd6k3`HZug|Y1(*3WXw5-_P
zB321|rG0hieEWa?`;!L6kQtI+&Ee?KRCO;+l>sC#GjAO0MVor2`CH||e<_RZG%ObD
zCuALchEm}}4?(tAmg*i~5=VY+z8q<s6-Mrgs;Vry%Gl?;F$&LPXfFceP58~Y4%w6t
zV;0T`Bnn>+v}U_-yjL;dCIl$~Em5}=E_y&Q5)>q9;5Dyv0P8Nys3d3GoeA^8p;unF
zD?{wpya2&?OlxSU&6OLiI2S}K=<p&a^rq9rhWRp*itgNE<wOD@dvyR{=JeYn&(`-3
zRLp9&%YfGa8T-h0<X=v4*a@OcYY&y0G(`i6;<&Q`eVtM)P;r~sKB%bn86aTw0loD9
zLRkTn4ODMZZuf64?|Py?eF!>?@|3Orh}sW~Rd~Vx@#w>2&tKC*qY7wTfoj+EI{}E_
z%%DsO>&^9w9tQ5RWpp0se9*x2aIqq$?l(=ujKq6^e1mD}(?={qXj^O#g%p|!ow@7I
zx5Llp>AA2<m?-PY6f^h36LfqP!GGRr*;lb<aBU)YdO!N8Nsn9QT-$26=D!2u5kNvc
zbkzLHMw3Lt^1Nqt{qnV)yunQJ0?#{^o|rc+&3?DXUHafyxNN{nZwUtPkn&?|xWU_*
z3;8-^;2!yINj=QtIHSI#QI+$`v&Bafqe`O1s^q1>U}%Q)Hg)-4FATW@;fzC>yQ9*-
z5rD>1)I?+t&K~9VRW)Po`Ui5m@i2D7`m2wCjO%!J(o$=5^2uRXQd}us;GE_a84jlD
z#r<?a+z;H@cbu~<+O}3C1z7P)PcFlF9`k1jg@~4}f5IdrXYJ@@Oh!N|f?_Hk!ShZm
zeqHpl>$hq>@*~(Li&0QD_ma^LUoIx_UXG?@+g_bAJ6W#ojwY{#lhzMJVuFh?iW@%)
z%RUjY&)U71@d&LlY<iQNKZVac*5X(ZC_YAtOvYnr;cM8549kg;?s1u7MX!#XF^r(q
z*dwCVJdL~E^}knPv+Piy7U}J0h0V;Z>DW<m$MuWhJ2zqXB;i-3PqpP6(nq0RM5@4P
zl(AQKqh)A@(<iWY@LmtPnnnTIG4H()iYC`jN8|sao+Lsj5sM$gI}iN|n2;wKt(6OF
z$NLxW6v0|;WIJ^(`t!t(HklmE>*yg(H(393V{RKnrCsvvkd<Rctx-8v7|s{mnfiI<
z5>saguUr$;{8PFNNaCoy%c+8&g4_}a8F!#dG-!_eG@92nE0c*zLA1o9KIi{EH&8aE
zSVaJ&A)Nqv3)}ID-=rS_l)8>&3yKY4JQOj1XnO;|=0)rtED$PjE`JX3W;4?r%OIni
zMF6axP;dW<=SZxkL89?iin{<d#Pz5J0G|5D(d>UEhd_Sfg}~`$i4ZVQY0f4-4Yhz*
zZvvl<pP`S?dmjMOGyp=?6zy@HtJ4sViT<$PZ)hT5rZllPSGlu*z7<0aiyW1=y5fIq
ziJdvd)#zMpak-qyD5N)}Xaih-;&bb#&;gG&51XEJ)`*L2Ptn4lK7uZbJ@YwH##M%D
z|8j>)P})p#*siu4_&17SzRgMfcl=`GhH5L=!@;MO)WV5|MCvFJ$-n0v54~#JItxC8
zLpiqD=tZ^p#|blQow82R?Panx1i`m!`$JOOVc^Ed%U+t{4gINPBmK-l$)1<qEIZ!t
zy!aX$C<OZt4!4Q5Po||hPv@=Dm$|Lvb!V1J`=ntUQ)9}_2)s?Lh@Yc*Yqe&!&Xml@
zvIBEP-*ijOYFPg{t9k>N-0(HF)t`A%Dl>ZyX*zw@w@sqtsyTo6U*q7$1Tcwfw8%eQ
z%h)z`m)iyR9tp8WV!p6>y5~LqQZ7)b%RfSDvX`tCM}<WzrfLZ$Sys@sAfgWRb44aW
z1M2&Be(}vLyf&w85-^I~;_i-}L31Gg)F007m8Zpvy_zc-TBXgKwn8U4ORWRlC7WOj
zhBCWne?(7o$Id5GKi&{g1`!@~)^0-~INfwLDrJjoao<x<SoGn3kH-;Y8=g!?{{TOy
zwRf&HVj{I8V4ML^Arpfl2=!J!gNj8=M5<J+vZ*T-P$9SKm8vo}_75;c6~R~5=Jz}F
z4WZQfkPOSNCaa7vfDqdr<=Vu|tzK%OIXjdu4rx=2WfgdR8V8yR(+ltZyowsVSnqYy
z@W9v>o<XZy68iDf(qLA%w)CZ8lBEp44YJZQqMd4sxvlc?q+{?un6L$G>aP1qx#Ek6
zeRg(-$Kk-$09F5EfOaOBF`MOSRYe$F=79{UJT-`ZDzb=Ivw_)x7g(|-5KnJSgF=Iy
zj&<Is?I#M^0)_44pelP|Xoh3h#2MS^L4?f=B@5l)deJ0P3^3P^!xifki^zckcHll#
zJFxQ~@^#qJqB!VZZ@@O5S`Y|2{D?x3SBTCIZ2S2hv5127wX+fxEB3?8_T8?w7xV_W
z=%JiKNkrt2-HejzHe9m$6$Et=k<N}pI!SJ(o`kfo%eA@g1tKR`Fz3GqQIy3vsXZkV
zkI(Fy;_Sbb6?H}6cF{8K5d?hr-B|27u@fnxTF5ZU$zWI`HU|w*)>2%(Rho9_I{M*k
zy{0lyvc8(YK&5q&#+r%L@BRBrVPcsUtz<p(Hfm9l5!R(@uk1X_z>d3)lt)1_fy9V3
zJFe|}OPV>sY0a;_^t=}{SYBH#_iid_2mMarrHgY9G^#q}(zQVM{<+=*Ci#3~rcQh_
z(wEI&ip2Y_M#;W@43aY|RAfQT>B0lLn5EE1v|IZf9)VdJi9Ei6p~o7{WYP-OERDKj
zM>2^;031~YSqXBI&#zg$h#3T>ippYy$Pl<$X?9N<A|`6=wW}!$0f}4AuaO*yxqriT
zhhd?JV-s&gAmhv{Rf!<l;MmI*esDJg!0_`bz%oaRtGvmycFE!WY-SX>B4x4p)qqvZ
zyd7f_I^{^Wp)$lBKUbFwRo#k=gu=x=cwn%gw--F7oR`kP)$FGvex&uAP91H2(c(0D
z$ucY0Cwqk{nPs4sWq#n=5S>R<r=1LPsPg0(dX~6uX+iH;3nd57$>zk=E$7PG5M!V7
zk&yNkin94Xv`Z{)1A-jztJH|G<Xxn>U|(nZ0P*^63jPt-WFY<oAGd>|exptP(##j~
zx0KU<I?zee5T+ppgtf?!Yq(Jn9R9D=*ACA=47<Md#icfYI<|9^(tyxb{jOZ!n6Kj4
zRp=Z&kpArU$ago_9lhjhoJG#_@B7-p@RP=AT*H=GI!)F)-cD9+TAm3jL&mSh?_X?Y
zm7`J+J8ZJAv{9`8=3$S(5=0lzJ2s2|<m$c?{7N72Oi<{6d_E;_=FomLf$`!(2O4&b
zqI_I@A7r5#gHf{kCFoFSmoP825CGQbsrNPSEnR<0F$!5oWKS8nhCx`&Jz#I%FHR|W
ztC}#rB$4E-sJh_p!w`aUx;dR7ivd#rBAH=`gitqzuOXQ@mlRvY-KJI~!U)C@4)&S(
z(z$aWps2P{wic8$3jnvl^)6H#Bx8eKs!Z58k2QiU@(mowkU}0kyV*R%tYO4m%8Wz&
zR7QaizcRu+{A8#&pM#9?5Ib=5v#b;A9hUfw1yo*mh69Mp!+B3~VG^<-kVTo+5kq-g
zn6l@WT@4U<GhAT}gz4qzQdsB0)>qJCbM*XF+5jMlyD<4;dGi!WL#q4;#}4#Q>E03^
zW<rS70QIxZaPiZb*-d<&*t*sn1+GL_8H5PqVT^Ffxbp@b9$5mw&A0$C4;%)P2K#;1
zZ<7#V-gq(5!gqOZ-Z4D~x)WPoicmGiD_+pScp9abBxeY3IaEm%sy=vepP5e3zV*xo
z+MP26kgzXu0%zZghtdm93JXS3KE4gI0}r%=4*l_=R0q0l!!(H!s}4#-nM|l8_w2%?
zbk>!W0lLFEPvNkid|AYA9)$pt_%I{on=~n(V6PebxBR?>LHKn{;#_|?f)14fWp7$&
z?|gFpC<IYHLcRxCK=277^2tr}uG6%JbIwkGzzJ9y(!2bA*@2}iyQ~<HJ&AhSc03qF
zlAU{9pqIVzjo?B|in{fz%oomvyBlRCwZ|Lg+xXmEZe!2K&TAlV7P8?a`*h(e#4h|_
zfp-{hX^X+6usc#lla_(xb}nx4O=IPc)7Q+P<`4eY)DQw77m;l>lN4|!UthYMsAFI!
zsGzjaUB$;2!gA)sHtn~n5+2Lx*zS^k=Fa|3GEJgm7gD}&mlzIXr8UEJibBPS%42Rr
zD|`4b4{3@L_f7v!KOD`={q=KMt@Yd){Y?>ih8O2)HhICdiXXgN#Qo&1m{2n9mx?KY
zaCooLP?y+#_>#cBqv##&hM`Wu2hC0lHVDdz5_;2!YU9_e7$?h+*XAJ5ua(^F-n)j4
z1n_85XO&mxy(bb$ziBL=yF7-NEGidLrdLgqD1O}!jIT~9$Zji*^(VA_GLWD|X3O^9
zwEv1^TN4joX@&=^Ph}rI6g!sJvuNhKDW&#0`blVA!3B-G)Sk%0Bk)X`1WZVtD=#*R
zVr(Z2VIv{MejYGbEl_z<{T`ALHDEI@*@get=2-k<z=<qV>@Bpt=UkOGiB!=&#hm6q
z;HYK$UaxiC33m`2GEBIJKAhY6*O$(Lt&p(rw0gNOqC0XzKK{K({_b$0uO-z%QB~$J
z@z81)MvG0vr|7s|{kGzS5s(ZcSxz_j?ysi<6SsYM;r`Y9TnU!%6zm*lakC8b4TEW$
z)ZTw5>yiR5ydz(n8+TZ^Q_;NB8-cTXm+H&V`6wdqCtexcNs*af7prf8I0iH9`!79$
zCP->{!^PW1@6|Sfg_%SeQ`oGYuwVSFe5{X63rjWLOMdX|(hYH(7M0C;q6IxkuuL@!
zxqc<OGqmQ`OK>AjtV;=%<bNvK^6Bu7XVaS#-z!CEirz20%}0Zc==%=q`K2rw_ofvz
zW4K4Rn^?OiWkzsO43vGsySc*Cn@MC={Av%?Sc_WRX|md`-JQaMcPS$3AiIM(B!bR|
z4>8PJA|xxj0ve4Z-}#n>P-;0dVsEydxpM&hQM<hxqM^P3i9#ok;|fO}SHv3z4zWQy
z)uTi4`wND+$}<K?@Lg045A@$E1Hg)Xzbf*40*yi{wFWU%p{2JhGPE3bGBXE3R0^*h
zDg<Sq$Owk-SGK1HYh!k1LLh<eh=IR1o6_!)JDUSnCRA=1g$s0Yg#@TPfO;4^|CpYu
zS4;#n%b?H|+C&KqxZv6Ebfs!E{vZOMN&$jAOt{o<<Vk91=6PPZ&kols%d+jRhr^J4
zLI8MIoL~+`ye5beFFg%Uzh~=-<5$n!Ur7MC3II0*9OF{mV10Up@}o_GlouRH4H92$
zIK)4xvUAH%%%JOAHH0Zfs3@`-3*3-V-{p1A;Bb-QSnzvsmDVVvLLj-uFmQ}i^*H>A
z)3>Jf^65*{`{}VeHsAq5f0G&E#_-S-ht;AAE#pA<HrGV|YV3!cHavD-^iu<jSapsP
zdpL`;_Xsa<@jaLMKPVYmLPPJKo=ynz%0;@n!Fn7HqPn9LqO%%w8!R_VmF)qPUTz;i
z&r}FeXV5axh45tmY8QJ(j~kldR*q&L#k+NBI?oR4D&uMUcO(W-=0(m(*O`@1XZa{d
zroevHkWoZ1tSkEa%i+66h(24hoqesb*+5fh0C~4K0K5qoA}o|j`Pkz^=P8tHe6eV6
zdVBJMV3am%42|C2d}76N$|O_GT#YDw0`ec+r&X@6<9fdIz6O(?)6%_q-^UsBYF#~B
zWFPP!Ux!|#CX+-%-60uIE2GV|!U2b)<g9zNxx))74WML*{do(-cf%#h7Q8Xp^q)QW
zF;&B$CoFH*cdaloFg0p#R|VG3{|WQG&g~w+Ew+jOzN|sk&@2+Uhk_R@65fR-?SXNw
zc5Gn!)2v^;Rp|n0!-xFO<FO4~2Q*pRGnIp6MJ`qqt+S}_c+fvZUFgekEv<7L1d9v>
z5;shtE;~dvn868AIEeqaE#blcHRGXYAkmhN@(|=uc@q36>?)74m(Dq_i4}#~>L8YF
z%^RooaIcFFT-B<&Q>5)w@P;sG8M3=FM)nonh{f-1Dg}_4+Tj7c*HHe{@opBIcC3%_
zEcF!}mzMF^g_O?k1k>M~&>eabjnV~&SEo&Nz_;eXBcx>o4OMw^1pghqq8MKYO{*Qo
zY8zTiS?Bdn9A)U|9{{mw=Y*RS+!ewxPsg{m*ufqQm1@$Vl=;dK$%4X<RaU)7avd_#
zl|LTSkRLcjbcIBm#;bTiK>=c0wuqUAzVcq2mba&R-8+E{JF@5wH{RI`cR`f1nEfq>
z<KDu?gOD4`AgQQlJ{<OEqHF7ge2U|7GR*cR+NL%NsuH{>p?6}DyVN6QlCx;b8-g0w
z{C${eP0f;Y#)$}+X5NzJ@+U-1x()@iO%vIMQ+8G#e8(mJm;F5A9ojkRL{U7}rsPb0
z5%wREY5v~l;mAM*-&p<l^znBQ53|79!LZE)x&-2q-~Y=a@{U9-9%!xk#}yhCOY{r8
z6g(_O+%KHwpE(iltwJu=gq$(16nv%q)$+1NW;8$Ri6j(|l1#jh3OCAkLgQPL-u#6X
zRd~)kY(25+WQ=sKe1xEiE)<9Sx;og_+u&4Z=9~G+*>w8GSv^dFN&!hQ`%6~p@qk^2
zn)vaz*%?{pbCEVZFPcVjDfqV+k$-5)*ermX2bwsY_Fyol3}a*#LK*UV&~OngAW8(E
z6PQgp){W`<4o;G^57>q&LGaQCRYca;gnBe+#CV_>)Dtcajcz<XgZ#YNz$(}7kp@)3
z<<&z?Oi+MqD!XUFZvm}Eb?^be%P=7%hXi?V>b)2pZup>(EORywv^RuVuV1YWFS+?g
zr!4>yxhmr?%-{nacA#?+TZ{sdMWtd10q-aaRMj02Lb6{;g)r41!MW7wgAf2;%oa;+
zr9Z^wzIq;LFEokad;YoTY~A0F7IeKL4mcMV9LMmrqyzcg3mWIY`mv>klE{f8PMj=2
zF$AhHn={Ng_8k{+3{~9aB$$A=WW8sr)qnxy2k2WoLX`M<YEGC36O`*F^N|=~eF9){
z&ZD!LObhy!qyT*vNQN<lnj$G%Z8$)ieLe_L^>SzphLZR`(PHlvCPYUU>T+)n5AX>P
zFonsDFEdVz5Tcx;_(J+DM>sGxI}iR6`n@h3Vr1tNTFD#kjh!vBPZ+hzvVon<=v}x_
zyX=$4jjGpW2=2GJNO>l+N4hb62(kHmvU~(U(T?13h>P#m?!W*N99$?_!4$8DP$I;t
zL1pV-l{pAlmj^F2|J(+SX(w?KC_s!u>>hE%xy0EuvJJO1TH`2N?GZsCb0-A6Dkun;
zYNX<4S-l_HXyYg!F9-phRqpKIW4b?3*ty6VE65ax$!RWUz=q}<8$kZRAN1{jJ%pEm
zFmEEM$FVgebP}<vC5kfbQWF8K4`y2{L~j8=_0ip!>ft_^Ft4Q$Fh$V4bx%R~+n)h6
zJd&PLqH9hcxVf0`nsrh#Rb00;YcKhv@lM1un?E1E7h`$5(N~m%H%55!5=1rl@qh5w
z0y74GMpo(V=sHWuMM)rx5LLz~KlUN>=8ix`a}^&dTxGi<AD#VIii>G~*D~48>1K(=
znXt_ST&Le#%L(p?a9-^nMFd`K+W!daGS@l6fJHU0y=?;XJ6~_Z$Z>iDV!}$QLKj7J
zkcz&jrcaYo5x02%$-__3mR`5WX?uS%<rW!RqE)tQnmG)M6>HDE`N_MAU*GT;`4Lel
z0_AmF<zMANF21jHbZ#(N`LjO2;odhDb4P}V=k`piRY))L*tW&bRM_r*Eb=jzzDK2|
zC$bQ)%qzA@G`U7pnv$-e|Ej&9iHL!1AZ%UC{?d_$nopPE9^o4%?$%WC0@Jj)bmu5G
za5d?aTa`u6JGqpDJH}mIh#nE*!`}?Ar`sZ)7?u-7iC-lkaZ@L^DTBM{C6!$BgF178
zpm>y_N!6eVb!a>KBX0&(qnVE15>_&`ypu<_INy0e-+93aw`{4-th%9K5#mWH!Y=2E
zg+C9oT`*CH*hUu?dhHDb#1OS(a>g^`zh=*ALp%Zm#3C{x^y&eRv8ZDAgx?rdMxNO=
z4yYPv%B{^;yf%0hSXBb!zc@Z>n={IW9zGsXiT@WcLb)d=3QpR{@%P^y@`-7>v)jYD
z>ZLnhuO>?IPUiV_&@Y_OXUT&ozm(lqzozuRNWC<!=Ma%fZQ^Jooy@1VN<}?dDt$bW
z4sk#ySfN1D3ew?WslV+S?Ks!d{&gZ*GrX1>)K}iFHqK^I2+j`CS`0_7GZU-N&>U?P
z+W&@9+MC^<t?;i=VrU#z=-em-Pk+>*-;_=|J|uuT5t_^4h~E5RKuE3=F%Pi=gsi-H
z5n?kdG=nG~mt_&0NT)M`VVJ6_0VF_cg0*D*NeV1eLTm_rl3$XMgWzvtwo2k&j2BOS
zY(Bg|h=??R#0Kh0M#2G6(Q?XPM7SaY`i_w4Z05gAN(#Xs)>s0R2-w+wd+r_qz)hby
za=G~4jsy|Vw6jgw;P2OERo>2|5zx#N0e{<4zlm7U!%1#<0!TRFvEAQCX_08ry8!I0
z%QeRcc+)mUffb!?KPi!^{esfC!+QKC%)ZcrQH2P6<bv2^GgRLB>aYWgk?GRXd|x^M
zo!QF(5>s64hOVZV9;YB<3zc0qnTne`=CLi|a+8qP1384KS~=(tzzM<cr?4M?B6;tO
z5YR?Y2zfuPz)|OZRu>!%oOa}KnYKYDiAcW%fEQtJS3`<6N8(orhM(ZM)R$JH#Y_}O
z`iqU;g3T0P7TyuL35aw3#g=vE2@R~m;Q(%WWGq){#qZuJGZFxaI$}a^L>~CC-?D<Z
zCqDy7G-%-@2jA(qZ;VhF-Pi)afiTdn#%D$lVSgaUxF3W@LJDg${vFUPYYgLMmj$6K
z97p^{YGlhV+w|5tTS>>1karr7opAI8&-%jT@@IgFgO6b1ausyU;u|gKjL~%GQPF9Y
z3Sk#~LO_z-dp(&n@QyPM8KJ>e^m=hmJrF=5bHD*qSntK~_2&TXW*R%J;RE#8H2Sxk
zON>M0TVeu?-Rb)RWC?y3)9cz3zCU*YM5W`i=QdvNNj?cKDrGAi)Rb(--BydLdAsa9
zS@@$gbIZ<MxXu#(tCaRbE2Zu5D^>xhoD437z4y9mBgn~XR@zd^zh;XVjLFEiB+~aD
zC1v8(xV^&n5ijQLW+U#kX6P#F02`+&-e&}RaPugA_y$`k31y~VHDBQfN>-}m(&`A{
zq_{IALJC%bX`2M6s~WAx75?OP@UhJ(Em3dWsCUS18GO@s%AGqHg66x!q32YI?G3o+
z52gxN2Nw<?ak|B^ITz*Pk}+-p(Yu<1P+8l0o!y0#wkx5bxn?uC<Nu8Q2R<4d?ua47
zGS)wIm-+iH4H}*$Bg?*uoi-?|>;ilZh`Vr_y47liRHOvAG``zf+2}-nt7VYE$UF)&
z$qk40Bv@-wRFusNAUxU3)Hd>VUJ!_+QNg!NEUfe0l`3mcKwX!Pbq&3Tmg;mbXtKTn
zw~d74`v6@_%E`ljaCMx?0b#PKgM^HURZu&!SIs1OJMc@u7YQpi%VfM1r@B4dCW#%f
zeERu)mi!r{=H?L<g9TJxheIOWa)oZvo3=e>2SIJo7=*b+IZSn9k-vv3^tbLY!%I!l
zy0WZE7`7@R32%Wu>1>aaBMz623(v;9=h|sl4-0wSV3oy@h(XC}jg!w~a`eEPIQQ8%
ztF_JViS^!HuP3I^)BELW(w1d~v(n&hh2ae>G>}~D8WzzyZK~wHRKNZQKes6G?#KPv
zpvk~Eo#C(PaN$YdihnZniPvcLr4+9}tOJ12huEiEe^hHRDewgyX0Wowwicy?CiTsf
zCm|(=2s?82O+}}C%hE)qZhlq31$`(cCIk}cj+-n*B@s^=Z*{BnV+dVhU>|keY9d4x
z$wdXm7A1a<*X_(F<V{!~1Y&-lDlv(B`anG&gcgx(Z+zM7C+gI8yYQ^S>4>;Cr85qU
zCFWeTCVm+&*+xb{OO#Wd@^>YR*A(a!1ha>vEbYC+U+{3k#AYFF^Qkh};^6~{^^YQ^
zT6oKok74L`hKcQs?#mYVU$F-ra$*zYW&RAkA=Pa)!NRU+@DK!r^g|hWE@*es4;ugR
z{xPB}yGt<3+poS^vZ2%Y@_q?Mwth|Ph%-#8l6#UJD05+m;dx7jfT;i%w;Gss_km6)
z>@U$6a7?Q39GdR3TK!9Y-~Q6jX;qdPU&8({rt<tw|Lkjf+=D^8f3emNjJh}U5B%4-
z;7lxjvj_EG&oG7+9<NnbvNT<{L}ntAv)^0|N{^a7_Pyx72V&amCj<86q9&>~Mt~5z
zVdI^sBBjW|?*#1bbEL9Gw@a+#XV+^=Z$42SNH2IK@F;sEfx^DYX9fX|Q2JlK$VQ+D
zUoSq?iSjRYW-$)PY}K0kBumPXt9CKlEY?yzdQKR%4sHIqAN#f%=KOX2nJ%|qh2Eu>
zEK{kN8V_?hA<JFPT6$oy8M=vuGiZA-c!Oa<3xGi?S8+&Mm8@o&mmcaBN<Z86{9%L5
z`C!_UU+>}@?b8h&q?)$Oec#B`)5-qf7T|46Nj~MQ!x{@6xZy8`Zyl>US|_qcX!X&I
zK_603vMn#mx2<q)ySYVlfXiNB*5SmopV{uUQYo-&NMMyrO)sJ_3iRFZF~eioDv`fR
z*qRvO;^$Cboj*rxj-6g_PycyBx?H0?^4UHbTEB9EMFGryg*H$I{q85v-sJdBu+1fY
zijU1St^QepH;`FQM%69VN_#gamf5m{slJ;bc(|yYt;JLzi?1%49>)VifK%oQgq_{m
z2u}n|^wy?Zr3gpvH%NxM6=P!<zu4c?mnc2rYl*|wrTrVwF)-!^w|uYa_mFV|h|=73
zRG%ZUXK0hCq^7~|<B!V$`d|&;BHBmDsQ<|#cV$}|RmOK^Wa^w(AI%yKTa3fuOzJ@`
zs$B0DvSEdUezZoKt@~}g2K!HIQ#ti=+CZw>gI0IKsOed`J<;>{e^scD)j!1dOv6KD
zZizTaJ-3$lNMjYwSZa)we4b`%$y%GA6=~OH^_R(;)UMAQJX?2%OVUpzCRpce8tj{@
zH~2N_+?c74&$5dA#q>+3*e?d%ioRB^20dxi6nt+bejMusL%0Ynh~E$g`rev`I<RDO
zwX9wx{8@Tl1k?-f(9o&6m6CP{6<Iz;sa*W``>eqpNYk+>#6=Jeb%PHngAV5<3Fpwn
zb3bzn%5kJ9Rnb?e0I24Wf-EXbwmvn)g_0D7nX{#=-17&eDYx>>wtw4JnXN)Mq<UC#
z{Ljwfe=b)gQP8n>KnOteEpR*YbVzh8A}Ugc7y^>eJPS6YQ6uQkasWf9CL#cI>_eB<
zDjXsDwA}yz>q~F|&EJz8FVyS3-Gxy;j)<g|yds{PV!$||VM8(aNis*w?T4HjQ3U1q
zWVV=KN*YdLGbVMtLD6Qx9}C0?V}zSi?p&Y){-}V?%<lx2S0)p{%t<KyLcXm5>>VIX
zXUr1v;#HYZumUV5&m<67XA(Jq{{K9TH#^9*CnpmHx_Wp9kqLDD&<B`olZ)zA$b#m1
zP}=UNMYmFPO>W8t(qsqiO1=}Wga?3szXSfYdGvYJ9m(;Xy`ypgM0nY>xXc&>U^ya9
zv?}{%$bu|id3&P+_0UC4_TS|}l0u`@t&2Du4m+d8B;nu`Eo3cwe#wkH5Y7X#%o(BW
zTqT70DGv0b;zJt?ew!c(HsGc#r_z&F-xfUG2*&ywPmsmXfUL>|aUh_w3WPHfFTH)x
zd}l{jhAUC&>YVy5ONBvyw7<cg5Mfub&w#`W4trjvR$XPcd*&A{^av41N(As^{x_1i
z?;WkF&ecBv*NIMoPgd@HXz-d1us}R{0CAT^svwJ2qo@M&%?0S=l2fi>09s{!kUP&q
znzuuD<JbON;)TEYgq&$iWL9Hr^_kniAVY|I%q)6H<vq$%IH!8fE9HPSbhAMTR;Sp*
z7fh|IKqbv~h*sNb6wf8IQ~KE3Pn76_gd)9gg|;Aj7x~n?kr#w<DG0i1*5?MR;vQ7>
zy2+O~#9l0%A#WsV^*>ATM=yo{YI)>%-$F@N<!X2TzUOr0wQ~V;E(l|6-bz1ZXxKT5
z>-B8oKvvV(zLp;wPfuo8nlU_+4OqKqqeMcxulI=^8e01|g1uVQh82kaLfdX+v_!kN
zT}lQj75c~rM^Z~o+)9_aTRrcuhc;!e`;(rUi#`;`=W?)fZ=_UxpXX<}i-Y^u$6P8s
z5K#M)JQ&tlG%VKZ59=@NYKih5tr(WhfjOOW%oQKxA(*9+zwf5E0<Glt0e#iAWx-$5
zPvcq~N@5_jPF*JfTxZz(8kGYTkHKcOpt2l=@~Gv{U3mZ8dlIul6j9ETlwXdHB!o!W
zby7p_M5{z=_t;kGcwOnX)))6d9hZb%$v@_ZvXxq7V|LRtbgh8h^dLgI|DDYWap(&!
z<G3|4y2m=7)9)UXPAY9PP3Zp>dWP5<6f$@v=UB>%=?ZC6yu4Xu`sHr}*VI`^=2Nxe
zo82S~PQGB_O=K8<j8XT#vm>G|Z;<ItHnX>)0u!}nr(KV)Vp*aCFB^L3({!bDg|oGm
zhQ0I|$}yyVYSwvQ5RG5?R`;i4jKK1F#^^eCk<B{Dqa(#z{lH=`IQ3XAu-nQa`K9h9
z|3jn9XWQfgLZxEdKeR74n0i}k#_tt;)6IiYXH^RB{%c-xvoT+>3mt#7!|Pl8D!X-P
zt9)1`#sPe*AjsQfG6=biG7ULAe}}^{Bi^rg{1xHlvz%+TQur6i-d3K}Zvvv68lPce
zcRnR@G&$db03@RDpxr1z2jm0l^$2AeahF2l7{qR{3klrNT&XOMq07_&0Nh7zpp{bl
z^nu5s#W2LKHAKw(4Uki3XAp9pF?kt|Ap3Vz8VAmI_JDi=P>hD2jLyBu`!lrZ@#Z@M
zTA)Bm^Q!}GzP(Wh8A^yBVi)lQup=l|Rz!ao_;(yj#4iEE&zoJ`MW!RuE9?L40(eKl
zO>P{b*Bl(G@HKl`!JG9i2~cFFKY}5IEn5FOBcRXnU6M?S#Tovj$(94unURlI0KhWt
z9L;)9bfBtOlgp>BbTNZMQoyY;`{5J<kkmXu9=i9I7rI=USl@@Rs-pe3;U*zqN9TRk
zWr@3$XnW6Dzv2ciAPc{1xWBxC?#MiK!RqVdc~nnx(T|?1dU7~KkFlIA<Paj0ghyOX
zAUUfkM?(7{tVNKSqCJ$Wt*Qd3(l;h3a(<eHRHEectNc8f!a{N#yxLfw<f>w+@wE<%
z{29<!X}r_mjq>j)0c1>y62Jb4XK;)8Uvhm!;ruY~QVn0r@8Rp`YFpoZfGd6@HnJ6o
zHb(L5kt^s>bL=vK0`OlfzQv4DCwl0xVn3><xovc8*7G@;@pi&quK+Wziikas`pV=)
z2X`de4aq(u9D;_AR=Hry8eUbQS8U7kRpm?Dp^KDDYX!T)F|>LzBcnsx8AO~T$*3B>
z2$@Ryi<$sO&cstS#U)>psz5#e+jHW(KJO%SYC_%xpKQ+?K6o~7|6Y&8?x%5Wn$7)}
zimupuQo2GS)w~PMAK$nd*DB+r8laOUTm6mQrA18w^V<@%g`dg;uLlKW7zPA(uewJT
zLd&pMan5SpeM`IdO7|);w&&hs%Ey<xSfUCPUBV@}?iYGB$$cIm08io3mUk3|Yvu%C
zY^%^*a8Z@F^&o;jHr=G5)%+M!5&~4F?<x91GWNI^S|e|sa4LyQ<W+Mz6>J-Spk|dc
zw0M0iwU||jmMfiIoxny$_tU9)AtF<mmp1nYnP&3`!?R$3EPwQa9*)ds9Maq6yvL4J
zNas-gblT`~16K$1%G5?60&Ql*XY+zAuP19w7Wt9(wj+7#Nzuw(Y>ewxp(k>YEymTS
z{{;yLHKI|+`H%c=fVLGMt!~V*ZLzLGjZtu5<sDm;hG5jB*hX^EaQQ3{bjAc>-UIEa
z&+(8<Q9v*~Y5$&^{U)p0vXB;1QIM<og<nXI|M&H;Rz3|?fYdk*w~!h$)-Xcipz9-9
zgGBa1y7~JR0|dFqIZfywn&<mF8O9@1uMMYPU2n4@1D>BpU>ZFmSt@4hFs{P%+@{ja
zO8@J3IyDbq7&`rEQJBJ)c&9i9j4Cp`wr4ioa`0fp=1aSMCIcZEeP;LQIrxkaWll`U
zo3%cO;kySzcZ#~Gy3-vJAcYAthKk)Fpaosg#gRhf%Z*(@>oWeBYe3~`R;tGitPP<g
zdZ7+Fl!Qrb(AH;B0f3*S0I*FuL%_3FzR1v|==yp!b|6myK_{Bg<<I^zfYEzdbB8QT
zrx<Hn-_&gLC58JA-nD0Ze71^;m%}3-0ffasbWX4rU8Hf|{(^L}G@V{~g>vbTc4PG3
z4881Ja)nvmaLtsxDb>r@FiI~dwRvq04?Vs9X}op8lN`FD>x7CeG702HW7;S9BdRGQ
zb8<<v!~aQG6<Xbbt`t96hR$EKWCV1QLs9rDL7wDiXys75r{8;0P;bDROB#*4*G@wO
zN`Iel!;q4r1ktH)Hq-|Ube3`0KcDis2>Pl)iBT556&=f)4lOq69s1p!FCW#Ce$+W4
z3bgVi6c#!bPZR7VJbqd%#+EvB|Cgnwe?K*DeAlSNzUo>>(pPxf?diF{R&+A7LEz)$
z#`7ARhjD#G=<P`?EkNEt&a(un_9*Cp{{54!kUlmI6=uQ0$rI6k<SZIq+Q#_T>}#2{
z8m~<gm3ZIoYS;Y!XzN4FRFIz_IAsWN2~1Z)Ugd*B3n`1pa@Pp3^rW>vLKg5s1Jj?|
zdZ}T$#@E*u1)}UGoN=F<knps%aGGcNf55k5uHt5?;1uYOk1GCu2z%?OsKal4R7D4s
z5$P`J?rwDGk`n3e?v9ae=>{1Zq`O-h99kNwp`{z{N6+`1d+$2Gb?<+(fb|(>-u>?V
z>}NlF@ACJy096ymqJYWseVeeuW6Ky@#>cl+>syGhWVYF#rttlfj2F<J<GbclSX(!i
z(K<1+`(-f`$_maENt()(KJLzf<`6CgFf4-bv|79U%Y}v39%|zsbI-3jRC-(l3l4F7
zvv<$VkDqj&scpmq9Bid1PpV1FD?wU@qkLq1n0)=BO&a!rrzd@CZFj_Ln5vCEB)3Q;
z3M+Ow8MRhaFSX)?(zrNjyq9RLR)VKaNxo@3jj8g}0gQbvUpVm-)6z$Nk56tItE@qj
z3C|K)($lZs<2F1Y&zNtZ=>#@4e=tS|UW+Dw{0Bp-a0XmHY$OH~zvgKYk^9j&df>F#
zmTS|AV(B^VjW__^ugXPRI}t2>BX48j(&)_F6pnDHowFpFtV~`&_#%6e<IAOo?zKR-
z0w(!L$(ZYTbyCM1GO@?Js}${U?iMR&tUvi_46Ouc*7S&eI}VvzrQau9U;D-woDxI#
zQaiVYeb1ZU73a#Iz!KR)@MWD#v#wz{Jg(u7Y?Pg1P)S`KD7uh}rBv;s-)LKZ786@;
z<bYFC62G6a@zbSk-T1xdhh}*M)&Vlh`&`4WsqE5cdqI!f6;`}xjWy>|{j*(H5?22M
zaTHM#yMNz&DaV*a$uNWDmx*Qp+Np%CIr8a#r3A|GXMXG!8?SV<;$(Ibpt<zqN%9}=
ziFR$mIh9hcl09mPzEJloBoM^+z>&8*=r_$PyIglI<3l!uZ{*l8w$E2nV1%V4`+jYV
zxcN!ug%9yQQk6)1o(;CVnzZ|g5?NoBVT7CXBmO2d!?Hl5;xxnyDqXbSm%03m?xB`z
zrG7wJqhQj`)*4AAfn8(oo$FQn)t+%Ngd<e7D0PO1`Q=>$dx@JX7beqDEV#f}7f_g<
z9_g=!^2S3c4Aa8SMyZ1I8oMLfoKj8crWH+Ko6t0|ysdZO=(p*3us3mHLL@IVHp=q!
z&;f1Is}E=8$KY6IliyK_q{I3dlz0g7Crl3{AI_$jfa3gXJcws4xL`jCyvvhs-9yH{
z699%E0*?^^hSQvQQe(~tpWP*#;>o@6WpP3r#A^nsu<AV$1>mne45DFguwHqKbgj1A
zh$^!<T<@r)NVR(j2RzX!#Ydg$`dCWmOa>-CRdA7O{dLcfGQh^JyZ(prI!<gM^=WL6
zjYQtaQkumre;m=Td4Vss3k^k8TU6m)N1L`F4mwqKxLkWqt_?!edy6K;oC(vE6~Y2S
zySHNha0U?$@VtM9s5X$BDsPi{YYFP;Xu;1gOCypWb;E;2gs^>H2=||}B&`GY;=bVA
z+oPInP5Dfl+9xU`L{_C+B<08|tf`WeZHW&%A^^?yJdF*w#yvFo<AD!~mB9WmDs2vO
zB_3V%l~(E96%GQ!L!PCT(%^KjoOTd@z#@4>eg(Q0pyo#f)E2xkK>FJH#F8s<#6+`9
zLK=0HE83#%-gETZa@o_cH{JJwn%hmRjwkR3?}6u(hlT1y**F9I!aTRVE4#Dm?7Vi^
z9A!sG?PTv%-M-Fe&0?<qjjx?y=FqtO7MuJ&UhR64HBNh88}@EIj?PRI{-5NisA6+t
z{OZ)Y-7g@@npSS<uRy3Ds;#>zYT_-Aqo$okC9gq2QejxT4O$p3wlSSSW-7*$dA>Ep
z$-w>#u_9u|0`kipz!@hAhk5EJH>a~Xwkj+)ygpqNBaVyL;<i*ySoeIjNUK6II$QXh
zJ+(Tj`Tj8^x(MiCPf>XuCB4Vr=J!VL;&1l>*%o$FPDK1=4J4U+Szn?2H&^8oN#4=f
z!GgJ_qIuN?V;}X|R?3E7#K>&jDGv8?u>w*rDt6;=R$msTA3tHBGiiFpHb-UA);jsQ
z({%V5A%q`$mRW9@o;Po(Z}?e=s+wus9&vGjZsL40Hj%&xy$8b~!*t{Mi7hhn+ZOf2
zIf{ce;)Mu)mEzTJ#j*(BF7*B^^jSkeEt{DVQkFOMvwx6qv*=S!2`+J#Uif^rt6-YZ
z(|L}p)}kiZoldoPuG<roo4)Iks%^mc<)&1or+tpZZIsiRlstv`{TOPIfSfmBcPg9Z
zT<oVG=emwS`-Xv2+7`0;Q8uS4D*r&ie%y39=;j53o^mM`$@d;{&ujsy00o$*vUQx+
z3zFLWp|isebIaHq%Sr;AaJx~<Yg3wExE=U^!y2;YH~%r#Tq0eFE9RLOBas6nIClp+
zz2H-v+$t&^bvwRg^n2a`p$l~Fzt8$=dG={`93yeU=ukf3w4j{JTRVzMlHg;$x0uL5
zDEC5kM7ALj8gE*GVV%(zZy~v?`Ya?|Z?Vy|vFL(rco5DsrS`*f$LwfXl#LdSUfQ74
zs3oQgWT)tk*6iEBtJ<B`0v?W(nFbbTIiGzLpcJ&viOEz^4gPVe&Ki4{dzvysq1*lr
zFH*0bUFkLjDVZWzVhLa_eOmzP2Sz>$%?Ayx_GJKVn~}N$WM57&wmO?{S%RYFZ-*Q2
z*3acO{$TNc8oeQm-=E|4V8A@bQQ-1PH_-1dIEb_)YONh&gN$!-C@smFvcpeuk}iN|
zn43hojn65bvG1DK9fA?|#4A6fsOL+5Lfvx!A+_?3jp6F*El)6FBI2(mQU7<@%_?@d
zFj@$z)j{GVKsmMB3l(Yu+1Af4zDQ?63r*&G4GxE;&j2DJrot~srNuCBTt3np4wQx9
zE{{Nl4R&BVa*dF);dgSJYTlQ`;5$Al%xSBNiJ>*QDXorvpyp*s9Y>S+pxN77XLG+~
z3-G@8Ds0ET>{M=v9I>3W<1YaP)djOoF5cXxm)z9t1t2S0z+nSw*qbuJ<`6sTgl;w;
zf`IX+okonm|BXHY&=lkeLd~;dxBr*~es-z>g4VG|Q@ACw;<?EbY_@9<kcg19K&8uZ
zEj=y%lF{l!m`pT@@B4QteQNG#|0#bgmPz$^a~S`QyxHmZIKcX=)?}AtAtg&uuzcR3
z@z=&eOrSb2&1pQrC+T`-yOg$9bi5h!8|C$7BiOeeJJJUymN?<~&DnU+)?DR)k_0Pk
zkY~L@Qu>vql}^BDh<D3^M4Qr)YLy~Za^GH1oS;+mJ?(CTTM4(_1PkQP6ZpO=(;4&M
zPOyMgS|`lt6-uc>XO#0+L|~=pTdu0J_YWDE%K1L!=GWEFH{XRitv=(1cR{<rle457
z5+8;Bwo@)v#@VH0?|k?St=MI<fA(24l$_4_B$`*tGE;iSOFs97XV{`X(N~IU?N2Bd
z^Q9MI{Kz2$Y888JZ~*)$Bn~cB&Z11#Pq<DqMq~d7ZLbfyjjr<4{b|CeFMWcPOJDbk
z^2ljOAcLB)5Yy7}K!1=W<K6GpT`SEw(ICZEUi(Vb%iLvj8p-OX_fy4g8Q7tbiY)Vw
zDN%&ywaP2CPaGS&L|*U-I4_4F{UkYUO=o-B<fY_YPUP64BpiBSp<{&VdIxirvEbdc
z>A9F1aQTgjEGY!6S9$(pcpuGcR;Ha~BovX|syqDl<*s}xQ`Qh_iB)e*l?e70T{C--
z$#L3s){lot=Iaa!@&(>LA(@aQ<F>_h-ru_mzhlgsBt<H)>fI_B3M#syT}**yb5V!T
z`&n{oFSJ+5yS+KQ4+iqWF8(is562REs<KYE!PQ8p)$MLuM|Emn5N2m_KGP>h<qFfp
zc@I^}iAqsreHTo?(hLPkV2tZs*O4Bh-bMW~eOS`GOL;HV<@$eh%FRAoI>+NsIF7(O
zayiNhIR1(#*fpsZD#>>uwAWqQ!YL7WF}7FVy+7mayJX^O=#kEvw%Kw<;b=jRM}wGO
zPDrU%W!@b9Jy1*<r*bTKV9cTO7!rh3&gkf)X2?<TSARA){%j3v;*l%0>qwbP;hf~t
zF9;aEErW-!(+|B_8Ag-Gl>3}+E+tw^l;kUt+I6MLEg|}5(}09uM)#fqG|Pmq^7SZn
zC=lohEl8ECCE~XqWi<HHD#v=e(?>1p>;$wH{dh?|p_thV8Ma{xigwe>?uC?fzswaO
z7=xTtI<NBv7yVh6qory-{S=HCatuZ+KY^(~4_MDP4b6%L!xh0F&SFW~;M-UV8p|_M
zUlL;9_cuL$9zrYgj18k-I*|vV>PL5-3ij0sz4f}OG(t0|C&iQr^Gz@e^{q&eB1w8L
ziO`V6&xo<rf6ufWk}?Px?k_nj7Npj^m30HsK?Qw#ozlPJ$p<Mg{J_r$D4cuUvbmQt
zq(~`{`wMr(iBT@Ef#EDzHF|BX%)0!UVf;%@Y?w@XyqQ30PT49t&yk^r;|Hrj>BVqJ
z0zmuaAZ-FoeuF*A*Z`4E{NXI5O0FpCM5YfiT$X6dY!KT0GAs4mj|vYK@*LAbq#>>A
zY8X(in<x6RhIOGR#<0OPeOofZq9tPss+>=8sN-yXLf_qta0b&xW)RJE3Q}~PTVLIG
z+$vE|{T8f=h9Z3jSpCQyx!SuM01~O2ad?Y-uUe^;Xk2DtG_)s0C{RyPZ!ZCopF4h_
z<Q4%4)F;ECtvi>WckzhEpZ(#}9e^BYG7um9%LvO33TH|60PSKscf+3bK~P**I>}QV
zvyS_$zwIhSO)t39Ue6mHQ;$d1N}(*8J54uZAkk7{-i=>fMh{`S{LEmh$tLDQjxMZr
z2>%^vDQQ!skleH?Mpp3@scEEE;eh+7yVJgL4)(bDZ4>&-7wNImHyTZa`sIGZj^qau
zTA>r7x9k>F&A-eVQaU*2tNIj9&scAz7tsbBmom+(f^oB?fX>8a2IilQIRRm=TXqpg
z8g-sN4_1BNM&-cJjCVsbLG@C7dv$7Y+rfJ~<}0Gu7cTItC-)k-cEoa<8qw#2S*BBh
zcUjYPyN<wY6sqNCG;uG2Q^|Qo3}deFg!tOcY;B9@(1s=fCYw{Ev-4iwaFJC|Zv%gE
z72PY$F2kfCQ#Id`Q!#8hpqb*Rq<G9ABi}yPZ{Er>GZ5P}_C*z^{e(0$Q({-R`J-5w
z7D-GWfoIn%o2L**WdX1&oGUa9c2!=W9lhK1oG1KMQ#eo(kJl768`K_t3v91bv~czI
zMjA6eRxRb;6B+G85%(;*ef)Kcdet0Fe}9@a6u(H>G;40$Po)rgfTgUze%$?@>{UG!
z?Q?j4!V6ao>m%hPkv^j#W~vCA&#uLL#~5UlDE03|$)3hYzk~87eZX+D+E}L2XluKC
z981iq$h|B#=KI9>%c(t^(`_pwRp7Mk(l*^`ytQx5Y3ABCZ=iOV7B0O&?x6QOx=2q;
zkj(9N{}Gm%XRaJl->tcJL_}G<*Gn?T)k9@GPNZ<zXg9lh1IHP*VsN-O?{8F3>=yeF
z6XzuVM~HL#R7g%nyFsSb?lPU!O1opoXzMLi;E_<6s{-~If1_@@{xjb+Tz^uJNZ%L<
zEj7v4A-L+*XzLS3XL{q3GDagSw2KYDaySpZXReD&wMc$Zcll_&db3sJ+XyBkh<Gf2
zzw7@?O7DA-t0m(P-wk00KkRlEOZNMv2%E{-4tsD)^j=OwOX`hG)RCQdi+M66eK@)q
zQ{%$XUN^G@_a)ZHW^qBgFyZeNHM8d4<>Zn-Vmsy>JLbzTUlPU8S9K?j?#mnc0QA{$
z$hO{g`3z)b;S9G5>Drlcww%kirn?v^+B|f)Q1HVUy-N^r#bAV4C&wy2Z;xm+L$wPk
zy7&@fOIaF4f-pk?x%@nJdgpaTX>C#N#ZPJ&1u#ai?zQi`(qEjT2FNOquUSje{2Y2@
z)Dgvvw)pRks_mG&&m-wzhQxC&cjMoG%L*c8K}qngmTf_BMJ%M)FAVshoSLJZM`s2S
zje+;`o8UPOiMgxihG<K!NJz3wB7L1MYJ~wy)1>-nqnaaR^CIcc?So79Zr2G}d(zvN
zNc|CB_&G+)A4fYW#D@LX_XI(@jlV|8G#^<55M5zlURs?#vU#Vk&~<qMip3SJhX98|
zS-}u4y~cCl{!SEu81~Q!)V3AAZqeK5ry*h^9Ie`D@=0Kr9zW>Q%<`D1NVW;u{myfw
z9ZFEpiI8F7go(vgEz+Lb!$7CtH>1F<?dC*X*aNU{likYa&<zaX-IwEFYG(0>!@Kfk
z3_wJLi=EY%;(gw|6Tx%8yAS|fs0?&YEXm?SEJBvcy_(jag2*$N4Ma<yIdgdpo{&<5
z;01(u?$CQW5PbXyc<bN5i?cUi$nzuTk9#*Qw7yhGES<!r-?~Ecm;F3wD2Q~exdIUP
zPrFZ0>dpxU$!=~)Buq>F^6^qPXRScwq;3-IHh6<C4{TWs1Fuoq5+8i;Hp;+6xctf9
zetH5(E5jI|XM!3M2>Yc1{6<v1F0vBc@t4GHZs_Zan?G$q(Qj4od6`?&psq{-hz%*w
zd<(~uB@fa7gi##S(bsnE>qS)_#Z$j-)=**WC#_K?Xg+5+Ao6-m2?VWoi~cIF4;6)=
zg2;&<7)IDW+}W6pAIi%g0>u@iVe1yd-eM1ZIF*c)lU#Ta2gQBWK2D~2t^gXdA~gh5
zAdE;)%emR?GctQSok|tiy6-dgW`19CgbF*6B8{VG0ro-sWVSEcY_+-Z>(eZ1wm)qp
zMriqZUlA65K0aAq#lqQHKX%TMBA{2DyX8k^{|x#M_^h|{sS3j`u~L{V=e~Q|dK`5a
z;d!~E=5}*lZ+a#@bNuUfTWnRllRsr?aM<LTI+}FXhlXKpEddG)hYxB6t_KzM5(`xO
zQI^6hth%&p(w1Uc@~$D*m)K&t+*BNdF~edd&e)20$wO`_a&0V4ocIHY6dSZqWGhaS
z`^a>T{*-FE_&eOY_C@ad?}Jm9%@Aiu23;-35{-iqF=Le$$&yU`Tyc9n&f6#AWsA1C
z2dzIi?CDs^oWokWC1N)G%N&q7NzPVjvut2O&0Cwr`0YiaRFJbd#b4Bu`>^xmh%I%)
zip7|&0*VZiNH1(@|K(_g6gr$)BYsmu`ko<N?E)~Rcz^&c44Jg>JzGxtskgIy#m-ys
zM)C2Xf-|KlhH2!`1V#He?GGgL%`+eBi1r#<ahnpH8yq#djw4>f&;1hz^0g}8%h&b<
zdDF^c%2!b2?Xxjk%|m>-nuvdwns<x|7c58KR!H3A-Z;NHTu_yyRO8j3$GQG8!$f*C
zkDwHyV@1~=-mI2^Fk+;iBXc`XE$|{=phhff<q1iRYz>Ivskf&<+|<`R7v#BQx7K-O
zdN$-1K5R1XCRnN6J^LKNy9#WPKqTLZzEOsF`&<g1sS7bZLQl^Ud{;oOoI>A8L_L4N
zIc3v-h5xp!q-EH?4Y06%RX&TUDiHyHq_t2&-Usbx->TcdzI$Ky8IlFxviK*oNGM>s
z>sC8ZCQ5}Tgbxv@qa5&9l>Var)8<CZQULjcFzTDUNa3YJzb-l9&2)qdXjPh&V7z9z
znkVPNx_C@!R?v=emf|C?!wXNy9{WYdz?o#O7BcF9Zn1@(^>n<`WIz;Zg?syA<q@nL
zg^IcFu4P0xw%6oIJH0N5PjvuOm`ZxFExweQYq5w^^u>_Zm6jLa3Mf^MQIB;wUrVZ}
zS;I6hH~&7ViS-Xl3~MEpCovLgX0{<(K1C1JkujQlQ!o@)4V7b+)e;}>b@c-W1i$0q
zc;Djuys41HG!Pg-g3R!p;w_ouDxSn=mV?#vO2fd-owBpIL%odd&WPzI@ZvU+ey2ly
zG&;XTKE;{Oi4z!BVQrA7g`)Bu`FY^=XFiG%w<Q^tgS#UuJZ~an{f^9NYtn170QNjN
zwpDje%m4)OID4pRdouJkPXHf(dJHS_`egv(f&t&$3?~4Ao#ya^(I~g@*kQ3ZWwgk(
z>vi*?U%dgE4N(u7=V&^<JM<4y<j>>sQect_`sq3!xD{{a`hIWkA)5i6FBm!yT!RPM
z@kRAaLiZJ^_X|aTVE+A<^}S@9d<edMN=+mJ;B*Zkdn~nE{5}3gTRp_G6NL+klKyf6
z^aaC{7B`m;r<jq>bYn*=CdLw#U?$y0Y!<UXlbscV&v8C`)P|kV<5@7Q%FZ;f-Xf&)
zL;(Lv$S?V62tu7>_2VPpMp0JAU+&#Er=oA=))yR_2CjtEelVF|6Fe|a+uF&BVGaG1
z%>J_k1#MaosJj)es~dmN!5U%>Ex#3{P%ZFFRzi1~?B7WQ6k+)>S~~;3c0woL%(8a7
zPO`AxG4_!)>$1J~nJtT<2kBOh7<A?|ow_C2y7dd-x%;{WKJ7nK&=uhiLa(1m?D#lY
zbxzVnb~Ag)y880*orHO!aCX5$)AbW17Rv9&#=a|_H_HXrEkC!LoMR>rF3-2)m$Ma-
zxsj?=3&HyaSId{ZPnM04?b}~Hf4BVx$*z(XL^Syv38ecsC_f#4Ld|>T3VkIvZ|mUq
z9_0Y_ep^1ZXPypOiG#Vdqrv(MpN4#`_i*V_q5Zl*PjvO9O~HK-vqeLpOA|yyoa%n|
z_s9&Pb~;h>Yh3NaSDKZ9h^QYnS3h^<1gfv6gJ_x2vo%iz7Qe;H{v?a<D^C_5wPo>9
z)HhT<iWiwntE-OG(Ehf|=XvhEcoj;{zt_xT^+L~mhs{3nVZrCaNqWKCg!`jp_I{o(
zhs-{+{SE<4%UjDf9Ov6wah&I4(sGfdqrXY%;o-ZVVRcqsY?)}eL`n7E2oYLZ_Un`<
za#gUWH0ewf`xobN=v((mX8!nBKf~Xc?hpyuf48lqNj7yn$#<#1)Ju_d4q}*54+XsU
zj{YjLWqR)^ljrvnGUB?_?~5vIOL906rzJ`@)SXT0kk$9o`e6??W0=j_ov}^4Cd$n5
zK?^c8Lk{7HGSlF!4DoD@o@7>}<l$qI*wdv7D(cn|)bqqQso6eR#5TG?d&AFoRJ<ol
zcYOvs^`ArozPow-MZH(D$xcn=pf*ZBvs3(4w#(0mlqlInzn{M6emKQ^lzxMk4(PKU
zKaFvDhx)jvMK`35Un65CsuJbY$Y)k7vWPwq5sMh4s!gEgG;zLP`f}Uvs$=Fh9)R@S
zhBZ-RtV4s4uM};z)Hb2Bw>W(j=hJb7H|B;8Du}xuRl-AES~-`mXcbRJa_R#x;Z1V%
z!zvG5`YQeWbo1Mj%xtYJ5-Qt;FRv6iy4JR3_^PEk<g)8Tmex|VXtM3?^g_cvOd2Yb
z)SP9eTlCKdW*LgmvY#hg4vGpaHxRTA?1et`*szUTL1ccZigVH5Os2MZ*Wx`rzdft7
z1??a2`@^><{PjEuFREcD4privN)Ur$(fFo?L8pijR{1k@w_KF=hG0s2iHpHW)$Pzv
zwsY3m;`6QFbWWW%%Q4j1(4E}dwb_Ps@~h5@Ni1Z)UjD}a1J-T_6LH1Nb@Vuuh}G*H
zF{64&6}!d<cw-VzRP^|GV5$}6T7B?Q)C#gKX|zNq>hI0Gh>9C;`G8H@-|7w$L~1i5
zYwJBdTVd<IS;^`qYwlqWRgeM^DZD^JWebN6IE9OO>vbXO@+i0ql11&*zz}C=Fz+rc
zNXMSf5D((TgoL^WcxF3(V!_F1e_Zc=+{P}~zD`~+5^7km80RP(EA!|#*-vAU9j8u$
z1%QhSZN?0P-WrU4gHn9KtJP!ub2<_E$7|@=js~!Sl4z>d9U`jH*;-;SFDnn5eMe=G
zLv)mOu&KrJg)|8Mu3wbWY>*5it&5?MwP?1Q{0G_11q9^V5yQT!uQSx?lr*+-?G)U_
z@`D{MhgWokn8r)!D~k3S7Vsn{Ja=Y#JV<_;m?|T_^YR24p8GM`6A?F~Rbg&Xys-0D
zx3A@M*Iwm6!Oy=V1Qm1u*?UlQEO)!Tg<kgd&CO|(z1&rjt_-NPXGeDM!ef6=qo3@i
z`!xto@s<M(niCC0-hIkWXlrM*8kk_b3TlsqA}>M%FIpJz+$rQ)yB#|=fztwOgM@{)
zl^QfU9h^6rJwZEqaNh<Wn=N+|^uB6c^n2I7&r2^`j><f0ZZ0GiLzT2Q8s%<LbSB6z
zcI>p|(B=u|wNrn2@?|PlXrl_<iyfwgpa`oq{Mp`2b-muY(XbP})<4EM|D{^c-asX-
zL(0ge!IFhRbRBFRUM>IqIc9`7QI~0n*2F@l`>kv+bka5cs*Hl4@ss+)9(=^$jnb^m
z1`j*UoZ=wMMHg<qJTWfLtzF)BubC|SWj6DmlDO-PwcsB~<1jCWWg7h4j9SMhc}O<z
zJ)eB^ly*I4I77Zyi`?HaL0(E?+-`e2)~wM^x}CV7n(CR~LP^(17I0Hn#)?dtWUmRx
z&v!?t_1X$l6-18-`&60q-I`uz$Z(13Irwpp>;JSBe`6?R7TqpIMN8T@ENhp)_Z4S=
zC_TQ9xu$@{0K}ZU?!!SX(O!DhwIzJtGY$BgcDlNwv}yErN=<-Y!zTkfFH27CiO|)B
zQN(dK2Q-rG(T8duQDc3mNb8NYG^aQeX`T^?S7%F)aXmHv?j=u0#J!1qK>`hXORtJK
z0u1HJw<skN$RdgNzMrFyX%5+5Z9>m1u*uQLwtI(Bl-5d*DD*`c8-@nd_1dltq#BJ|
zT_zE4&O7K0vS`<b=k7L`)8Ls=Wd($Bt0Vi4tFNM!x|8$k#o)Txti5VK-aRKk4P+lr
z49(KA!A73ukJxr}Oups68bc#a4m-COtmla!wT86N>%HN@|CFrVB`UPcQ;OO@tmepZ
z*cnPcn^j-8<gMHADN@~tk=k`h&LzJ<IQb}jd)ij|tiLW~5zcr!tbxtpAbvJjlHO;X
zXSAtLKVm4E38>_Zqr{Ah`m`2LaXJ{1v81Pq1sMiw9MIiDzcwTI-MEG-IQ!~Zh{VU{
z^H<Km?w}h|q^|2miO(L$V;v6C)m4Hu@d4-ZXZPeVf=lFiunUt=F=>F0dRWAJo&kG*
z=xj-geH&{^`J2Ov=K1SZASL<f1sa=PtG#5pB>>Q!sGrfQZx7kcwju{w*Qpa2Li8!}
zVP;{tP(+9?LswYtD%KNlNl2N{D^b$U{P{<fXtA>rEx>D_d<j6t3(fm9+8O1+MZr-9
zC<STX0D0A9hUi>X<XRt)%Qs@-nO7fFFOgTLTd>PVn<rR(o22$MmehG)Ti%_nl20)`
zHIY09$CAZNGlhnUYLzvVM;)#(t8bRq-<euXwnp)eiA#3_bK)zhWfmeFCD|?5(9P>h
zy?DN5jUJk(MdN9Z(Ue}|&v`U`$EW>iVrt0)=G+N4X$fy(bB`_2<`|Oh!n}yz*IG-#
zpgJg!uw{}oCTNqqOOB{M+024MNBvo;>Zc@Px{B$Q=yK0>C%Zads+(ibr$xn26{We<
zJ}6IHv)}d^paY1D<Wehn++HQYjZ^+~ZE_#{JUYbWmy0$*p_pN7#EjSI6N7K3FR}wM
z9hG=48Losw1H3yh9UkE>wV|w*AtOJ|vzy<y+Yl3B@8DeKVv)KPMEF&wMi=bdSRfAy
z@^mpKxT6nhn{_-5dCSY+d|T2@wYqNjwei_(6t0Y;+$)`s<2R18^DOJ0E|slHK5l#h
z1JMo@Ynd4xxfab4an@)Cs=5KSqH<1;&qvYDUtdnXhD4*OGqgRwY%vV<3G#n-I~=5g
z6U&ycJ&)l<amA!MJ2#%&u{wK)Mn}}OxiH!!kXEoTKDoY>IgocmF}Z1`oLM`H!cYA3
zJXN^^u81#{LNv;9=p!|v#KA?rs0#iRVC`&1$|7JKEd$FQ)S2jNuP_eb-?UV%&9(6C
zsL)cUrcTQumY=+sEbhh0S#nF(H7d-}U5wjYkyf`zlupKiJk}Lw)Z@dD3X*np)ETlb
z0lQ{40mYRo9q_=T3)qrNgUds7u{3?5Rgm_XcBqPHSsTi>l`k9rr;%A2j$W=t1w58V
z-N#dcBlFj$|AHo9%Km1+oUJK55S%8<c$G7KnCymd6yf63{^dV;xI3tt9jG$ZOgJ8+
zbkrw7(S{}Q4Ezel^z{dG^TC2I-%?KbdD8_TY{H@WPo7gu6&_q3n36rO(+~9g??`bm
zX`yBJ^D#fgw&ruU7I%qeh<6~7RtC%b45p6FkH%E)?xMHhT01E|M7KRgkR@skH+#*7
zv^81<{n#(LUnmA^vCRl2wclimR1x)yai!Y+Og%WOe=e&&pDhFV5nJbqJV8tM&BAa8
z$J{2GB=2v!xY;e*o`T0XN}TM$-$CXyWdkd@qa0hXzjB_YSMke{!Eg-suIPfa(8bQK
zyAe|{{r11octt)Ln!9CyZL&EmgoG6TG)Gpu0kyDvh@XSC|3dk~Gmrr>vv!tX!#;YL
zzYp(QQctwPVTv>Fsa}F9LoTV>oDUg!hTgcrU3J%iy}U4Toq}Ity|Qdw1x~`m2=K-{
z+o9SQVpb4-)wj!B;1?XqYY$TLIhX6g#-WXBD{mzu12ev_gq);|;8K-e9u!#hYC{QG
zo`uNVD&z&64@adAqmuCaY^W4ryk=ghX0}gp&ixjH#T#Bpp_@fWp&R8{RMEjf8co4D
zysQ<zrwVkV+sD?iw7&1N&g#8~k>uNQ*pp4O<c_OBMLbGxI;iaBTvSFe&fcOM?38KS
z+9|I{dgP>w>y?NK@x|G?Kqv=(P)(dBj`>oqR^CLrz@usr0f$fzcLVK)P)z3eF{>M7
zqyUqw0ku%XPC2H!EDU}6MZuG6S`Xe&(#TYByIK`w?F6->VJ-OyO8D|y!-X?jd>y|<
zSeGf<R7D?3Jf?W9PJ(GY36q5Ab;GdP5FP)}Yv@}DVV&6Hxrf%-X&j=_CPAKhe%b18
z#v>a7%@ee78$uj!H0D?ak4?Qz<k?Y&ExX^oJ6Fn}kxXj{btBN`Y3i~K9j?8k`e)XE
zc~O8y8WD{(B>VQuk1e!aZ)g=)3}0yW66b&ZfEV{K4r;1&Mz_ua@=O81z7H0EbSg(P
z;6r$pJ6|4bjhdm|+rJ4wuwa8>9Z6u`l;>ZN{jBf}g#OFAy+Noc{9WjjCo>ci#od7+
zwpcFqjKgSAbR3e-Q(bG$*l;;He_a#y?>FDSLF<RJ$HQ!WL&~b0?C+X?Q18T`<Q`yb
zdE&P|Wsy2{A5&LOdEkYiUB}Yzr}e?hHqzh<I0nC!B7Zh?E=QP=>38-rDSoGs`kpos
zm1`D1_d!68)NzInUH2vKiQ;(B2Y)(yBk{8bf&<jcqm-WtCHyUt5(^$tX8L4oWX55P
zYH*=)#^Q-v`v3CU_G+{!#+-ySvIJ`J(KC9MX}6o)HIl3R24ejf%{u++2!MH@Ztj=o
ziKp^724`R*Nn6(bO8TUz{zDmI_(gWxn|~F*^Ik@7-4mXup*DYNK7lTQG6lX=TJ!@u
zWTJDU!yib(ZZ7gYj+tzO=Zfd_-YddMKO&HqV265NUeU8@`MOXY3QY65AS?6cO)RM`
zj;Bw*&0(ojPkjvWEMh7Ayz7R6>0y}i;ql-(8|l4l(cs0=^YZL9gb>N550!^uh8HnG
zOsfJ9-LEg({-g81NQ>HcgX}Kfb9|J0mC6z(+<M~aQb$c-d1BHYzC0W(-=dRG;i*cL
zaQEdPB2jNZ$TpjTe~myVe!!ZEyJh|A6qPT&Ji-31ASL`2zg?4BOoP!!uPzuv6f1mm
zssv}@+5N1cV#_w;r!Dr;DGZxKF2II5sX9elil!G_&P0FPrew7XYGyOYF22MryCeYo
z>S{OJ*HZkoJ4eqfk36HVxPO0yVYo_dxcacj&YS~%^Cu^P*(|Vs%_{6);0I8(AYhFa
zBKIS%d=hg_JR>H7*Jm?tcsM5bV1<knBX0rD8Ho@-l$9+|?CeNSA8i`+@&L$?J2_bG
z@#pL;i^mKtYGM33WvsFJ%Jga(d|5{HEA@O31&!>Z6EGZ$r&~^<y&`{1(1N7eer}4Y
zq$)jAt%N}6!|JGOgWnj7@YlYNz2Qc|6n_?1xAqusba4Qx&~lSnbwYZnmcGW(bFlE4
z05)8f{k**wSVu>>zi~Y|195_XznIry*YwNbWE*nnc~10nifgwf<qyqY45*<wODyZg
zJ#RKzsJ`wlYV*5x*l;@nfJUKECC<;$6L$r!SwzO{0?XgD)B?ows7e}W^tY)5koxG~
zo#cd<ri3=hr8ldUm8hZBh3Cno$=&BGSN|fKreV}wWe~_wEJi3X$;nf#g@&tsN#SA1
zf*#^Y{raS|sh=}n%r@-8pCpqUV80yvA%pHy<*B1Io4-{0Z*E*(!`)(jB2Cmcllw(U
zy4}R+5{X^CAF_xDd1>RqceD{K&7r9MbwoPf&2mLvI5yvy<=IQskz!pf?*<Z>49H?3
zWoA>5XYRWFaq$eSo$onRcNgQxPn%}LG}wr^B}U3M&SCl(>fY=Son^tNY}=1Qr$l@Y
z@3u+PGxyLWa-o4mHq)(aTbc;%KDK{XCas1CI}T!7W^7<rhWGw7uhF&A%?(V^57qsf
zRUis3q+4Gu<)K}5TBDD5l|$!MwRkRc|C55vZ(7tKjVHq>`%$MkoD*h}y#Un7LWT6=
zi&(8MF@>C+V`OqQjj!H6cei?CENUU)ey5`|)F{~WC-(w?4<K%&<uzx2Z{Jxd2~)Ge
zae!;w)WSCqZc~#2!_*f$JD)p{LB=&D=7$RZn!Bj!s(<kt@{H(NmvR}&7-JE`vD)_{
zMuW6$#F#pKafS7{2YTNNh%w)Bp`sq{zG^D~Va76-tCX^92^pl1@;(jGm}d4MCsAOg
zK4(dF^|0Mt)Z(PJ6tBXwVn5#oI*(&gqh7~mn<U;jc{rB^wk!MTqu)@RYKMi^pQa6`
zL!U9q^VSi*PCL`BEB#Iwfb#u4gwES(?Il@3i=T`*Pga8bVQA+>$XUkq-Rn8NIB7zg
zBd#o{@2LMf@!%q7*|2t4Ix=TsiHz!yuTWgXd{YV)C^7U|x+nn^jeM?+tI|AFE6X+T
zYOs}9H3!!3U{dn=EheNQ0rLzG(`FlZQVqUTILPf|hR#X14dinIW^)s>PYy8WMifLz
zs)u4uz_irO@^Un`RGpk_cCdm%s3uTANuTFQ|8|QvDtIHTYPXKyvZlN<Qtxa27io)j
zJ-bf|D`wRQAEgdceHejEzzg^aOPWo2wLOs@n!3YAi%dy|vkYb#oyW|fGh}>HT0P2J
zF^-c?@(V)vR`8(ubJhDexxo?59$c)V=R+%^DspuDH!{mbg)-0|b{d?zk#M*dA66;m
zn<RwDIZm5AY$bC&w0!NK<MIcPH=lA#$z)5C#3g?}D(0kh?Y~g_7ZCne<orvj{4*SV
zeKr69-tKI3>9BT>+0;j}W-sqrUiPX))^c+Yzr!=QK8^mz)B%3z51y1rkw|s%GM6RH
z!JhZP7G~JNP|4)}gm*Hs%P_=8WI6XR-Evb9LYZ$-fOa1dM_m77MB@YJ^gh;#E}==#
zOa8-6O9Q3acF8FDU#?-V1m_t#)bVYi281qvO1(HvIQKch_!FuTEUsw?Cj2`C&LMsD
z%w-~V%#V}~rK|g<S6fjvu*-?^k~5XkKP~oGI*sN66~|!kb5I~-Y(9N+;h5Sl;48gM
zLj_5wVx<H|o32y-R^xKXjVb@t!6kwzkl)idA=~A8U{fC2()iA^C8fB+`6mHyuyh_k
zy>54k#t|&N!?(ffk6Xs9hXDItCiI`6%To!!HV4S>*?lY`2V-Ek-~qhNPQjIzp6c|?
zx}^e!B-+7Jh1G*ZN&q&T6v0n{Mf9;c9~{l-l$!G1Etl4?n5^h#7|zPxTx66Pw!E9S
zvQjN8kn)gHn-BO&@>xsvoOvSh!F5>8IluQ5GnQhVd;U<|ioYUY*}$}!0X|npC^?8u
z-#p!xZaHNke`e+q7(M*&Qv>%K6-ip<h@SGMRN&LZ&irZ0kU9le^8L6Qg}8dz_+M}F
z?}Yzf2i_@a(kYN}F{x<#!y1S}q&E6!nU0Z_cLZH8Pm!N>#mOb1Cp7?7gdAkx<XH{f
zkN^4+cBByTHXF3ishv3T;!3Qx;+km&m}G4SwSiW0_nK<tjJkMwo4I_M@6_EgnIp`j
zeV;9eYI6jV7~vgk?#Ra5oT)r&TS84=KTVjW_0>R}jmsT{g(<DjEX{*k#jl*IZ1?MR
zlLX`q?+}q@c777Tx<)(<9a4NSE#8S>HUeQ>yjG&Ve(KrEyL}-?uDPh>@_E(EjL^Dm
zbn4RO75RDi+h)}MV5{TXm^hr8@dxvF(KI)W*8$=GQrd@6(PCzYC2>c`c}ZTOWJUW%
zdev9xHXcKqX|i<PI|?fKeB`(5CLtqA_(F%XHjJ2<Z@M!5zB29qU~Ya<(Ubvr1qrv0
z5Dqk8+)1-}+w9+qlZ=Yw@p)l=njlu8_@jL9y?yYi_#k%}Zg*bpBRamO9;+x`<9u`;
zhIFm@%(P|NkNe-&6Kr`p`T4wb3vbB%%a>u<$&e7$Osk(c(++UaR?hnhL$8DOnxPL7
z(Phaj1Xhwgwgvb(;Wp|vE1Q$(Co2qk1qDVtD%(jPW5bqL?1E~LpPz*@uKpzy@H)Y@
z5tE)F(IleL#3iQAfUWQRbprqEq5~{PHvS*f!uzvHr3gI8hiC6R3o^twfg$7w$zW)H
z;fnzC8EXmN6rR?2v9l@@aI1kYb^roGma_yNe0l0S{P(@w!9=XYu}q-iPjV9IZ!kxz
zyG){UmwphDY)wiN7{9t1G;B<ZybDTl)uAW_!<DWxO-j}E!=yZarRD6eQdhYV^*0Y0
zSk~r?LohcTuFYoE2)8hA_#L(xH`Jm>4t_9yhnvXO8U6Y*mj6oK4W5}eI=kMdv&V`a
z_)=wkN#k|L4n86nnldtZ{d2oCY%`xaA}9Anar*VGdXB4!(xpf2erTR5ZMo8La*dI7
zcN7NP%LbTVo6OyHJeBb=Chgy~Ig<nh?f@&T9NCBzw$J;sE-QqiI+Z(K@HZ<0?H5gi
zKF*^L`4;ig$run3?W)nk9aF(|oHyJ7KhI}r(Rnj2DwM94!B`+=fC_3Bd$@7>=AF`>
zodv}m_9r7u4i&Ws(ypa$uKL6eBboQdo&7_<Ertz*m-}=m;mBtKi_pX3Gz01gL_4=?
zLos(z$ETB8uQ7DuhNx<w<F~MWaEaEg&qY>QKQ{tQH?HCljIb42N|0_gkF#2TrkG_O
z+oUNcF7lfmXQRaxVXx6<;qO~2RIX55uKABI0AU~<RDZIYg$s>2^qid~!aV`+_&=Wh
z|KSIq&_RDU-h;yI^p}C=o3r@#;8%e~{AiG8U^LeU4q3dMr7k3(@c11N0CZ%c&#$!m
zi=DA|y&V28IQj<<fTQA>AB-ne%ZiiW_aep^Q=0Os)$KYpNYWXGy_ipNHKU6j6E0`a
z>LTWM({4RpHSifGL@y63@Rx0i3TYgDzdL#ZcfCH8sAJV!$}Q+SD15~H9QU)1-ZgX-
z4YMJrCLsUKxqx%<46Zw9EDZWgzy!|8%P1)wKX2q(Olz}}+`~4L_!9C5Tb*Kn;-b)y
zU(xJWh0M?oM+j~Gc=Pyb$gK6{{1N?Mn2_GLjOziqK}W(oV;x3+=7iI0gBrc3v)TIv
z<THUW9f6}#(PHT%Jlgp(=w%^$=CB+~;>*I0DKCp`D*oUBgD85Zj2$WC&-61V%2j+o
z`Cen6-RcHoT$CEg9?uDR+c>FE_IG(_l%kpU?%z5G&R;HIs{;FQe3MMAnSS`I1E47m
z)69!;P7BI>s_vYgYW{-&){>;Cln=04g~M9T5#%=w@``hgc`e*(N^d4;o4jV6)eFKv
zErN+wu<yFfCu{BwbUC56+8jv|zYzDrMGROZdS3thGAj}=yx5g70=W8wC1!~Kc&Gn;
z`8zv}|2qBC@q@N-v05jV07MU<d~sUs<dO``^vg9|jk<g^wcmIzQ8bPMR0~n?Dzye*
zp8waCZv+!1)K!ahuBpy8(T}3P(&DMpgl$i$Sn-Tgdw#yRKHyIf$@l$SkGe@Qb7#Md
zxw&udG)2Vb+_h1=MCrR1x{DZRAZbaZ9go)hdi6vJN+2ER>ztS%z0=s$n`(nb!MJa|
za6^!%Lp7_zwx<aw{|P7x2g>KOPVHH_1lf>6RPNWY;TNwQf?B;Oh%N<)Phd=KKRD1v
zMc*f<7Q-zqy94LN!|dnt75O+L(XKLHFLIu)UF#%hR5O`oKdf28KvjvtpKy3>kvmEm
z3)%-b?BVzW`Hce(ka*+Rfo6z}+1oW2Ci+eBBioJxU`nd0xLXWG2KSD+kKZg&txGV4
zm5{1N@!NUC^71E6=_oabNxjsYpf=p`&IkjtL3D*0ZQR0zNQ!Pcfsub_!b|~z^keJ^
zN$@91_8nQG?M!hs-v4I*y)BN)WwDzXZw{&71D%o+w)ju$(F{*PWY1=uD!wkYng29i
zKnhte($Ul5?@kc)$R#e745KZLP=C;**?@%HSCGFlmfD=3te-)%X(*JF$MKPPq4$ph
zAqC0BY5pW6GP4PaJp8oy{{!~e(MVI4+V4Dl^7YZmbO*_r)!5lRuCxGcmP%J=+zdd2
z%mbnY#aXshduqJYbDyxk`R5-zUe*eQ>D-0fQE87UjC+MLmbs6ksD<fot#!jg2X?QZ
z>(68>;K$?Y74M%bwrqATzkpaRS>cY~KPRVA$#?ykT~XVlauX<K%zC;sRAKOpktyh8
z-bI`nxNetWo@tsgsxsKNJ>7CX&6L3IvxKp8VH>5yn_B`@$CLt#&c+rQxs5QG<sWkD
z9WT{?(8jZ06q-t2n1<jVwHy!5T=HJYwYrF+R*d3slX4Zzv`iZ57dDpLnn(E5mC}1Q
z<Du@IVYT_8QsY3%7MG)ZoMizT58(xpNltE5Zk0l#N?cy!3{U&!{Tw?jFq)gMt{%?0
z0GPmZ9+8Frq0@NFQZ$W}3NsbEmsTr%j>jF!Vq%G^$u98ieB%M<70G9~Hj-f5l%sCL
z!ElGv&oQ{!&h7Zc?6vKGeQqNdZ-6T|6RY+4wZ77)ER#-NKPnf%Fv`_5&TyLt3W|tr
z>L)E)<PCcYR&PRUJHn-<eUIxCqf0`-x(O<Z!+xEoVV<{U=ZA0HnJr0jdHO29APH3n
zBe!t#TV#LiU4Q)}tSo}u?)`wjQXFElm>eE@rcFhv&QTVOkhlEm)xG8kVmTq-abur(
z^C>9Fe5P5+GBEsbIiURgKF@z51W-UZD2cwXZ?N|@k59d~%1<HAaqy9CvP~J7y2bcE
zVF>^X%z@oQCVelixzGa4JB-d2n?e9>TD=DtmILj~amETjEDhs9MA##<7Eb?Ca-E$9
z--fs%a0W4H%c5xr6c$8!d4Ees37!8b-p%TVe~an3!6X0ru<Yx(*57bpO={wx`cQa1
zL?j2<@hq`=UH1Bnh)p^8x?IakJTrY04XQx2d_zV($I-i`jtgUbX9~A639j$|l3?v9
z`GR`mOzbpb8ou>PrDUb)rDr;Y2YGLV*=ir_*RSt&i{W3sombJYYgy)eh@+WMF6cWx
zYUq&4$|T#;f{9t8md$sgvb>tr3Cu{a<?hFd7TL0BGe)9zJ%4xUPYKWvzJ2~*sj3Ae
zyIyN;(#v=3I(F>Ju`M$FT4T2z{9$c6Yw2FBj#$u^$@^E}Y?&|qqWa&n03eA%OrZ~2
z4?(UnFuR>H+m>#h$T<mMLk5!J+k+ZNdatkB9J-KAOjtaI?8rPle_3esoTuf~zWxL#
zS(scoFpJ`VoN!6lLkcsW!qP=4d$7IEJUGMJp4iaQc?_sbtl{&DfRtfMM2>Km_41!l
zxKLfw+hvK_vwxHoh9TM2J&ks%T0T72QmsF*>=pTt3*<18`yf-It_L6Ua@<+;QPpS@
z<2ujXOnSAk`0?rNt;-GD<&Qpyyl2a8c<+4&?+bMW)kZ1QbUL6hgCAzvGV6<bU*r@M
z(U6eiXPJK65CqL-wE1v37OF76^`g;MsekbYpS_@(AN<jdYxh_GbUFh(dd{T1yn0Gt
zvNPzHR7Yj<R+WO#;i0Z<?Z?>`mU6PRCPC_ky{t<YFM3-)l-sVY%J#SGWjuZuN9A4H
zR6GkEdcn6GE>ic4Fk)@&>4>a_J*$WX`P9@oc@fw0vI(x}Km3kSr3ZUsR~JG6y^PW=
zsyvfM%->v09lK85^#2hhO@Sz>yD@&Gm?Y-h{~kmnj)D(SPhbl*)$Sv^2{s7KypJ$K
zYkS}V!z3Az-pU|}L3%I&iE6nS8WU<DLMH<M?n&Z5vJY<n0Lpc+W%NXZgo+)y!PJ3J
z^<_jGX)H(m)n$si^w6itlCic2i({{ItA0#QMRnm{STnLkVX`Rjm_U+}`Q%_g73P{*
z?Ga9^o{3TRESfPeMX_dbo;4`fLVVSH7lU&kaC5tPnvvc-%D+x}=ToKzgX!zV&xq5r
z=cl05Vt^!vo*IxbzaAxv3T^USym>`l6O^FvsX=4_Gk5GTSjR7(h@ij>rWddS0@g!8
z_)0#H>8{AL2>9vA+ZNvJ2J1hY_r^aTbY?7@k(aMH0iW?9y{G6dj#`%A45wG(qtWzh
zvieO=J%Em_0v$?Dpp#w0F>_ck@WkDzou-tZKo{CVolu&3Z1sAdC<{w8Nk7*E!&Rzs
zPa3J`n|B)di|w;jTtAYaLptb7zutx+`b`9;bS2N0u*YAz4t#DhthobF(SEqdP$3mu
zlD^Cx<L2$?ZKem&4vnGTHxVls%TLVmMYMbGU$#w(AM-{uIOZd#AhY30oUMc{1EAJ!
zj;}$`JacDcaU5|obRvb#z?1*Z+}<KJS>|0c1l<~QKy)o?A{SgB{5CiW>C+l#+n)n*
zS4ZfAZZR5?+iB(uPGfenWHzR4Z=+1y#T2;0D44Cm=xOo?&v@CJDRyTY5AD8=BVvpm
zUAEXRsRhlf*#)NBaK;Hu61~Isexcy-O~q}vD#BHsMZ<1bE>uve^o}J@UH3cs_9sOk
zX^v&(1$egy`FVSF0|BPQ^L&{2YC;yvb>yM6vMabmInC0<+->ICR3`B+^tLK*sFZcz
z{YG;7Iy8EADK#nhdlF1!ux>8Ulz=#(@4qu<fNzMim7pcOQ&0G@=o^+;;k+oEYSMZD
z`*eD)gn8gxS_x>r8a*eeH~LPuPH9C>uUEYi-aq8WDf7s^?RqGvz-JDZKhyQg&E#K&
zoddBED6No!Ci>HhNVtlL1RQ+-!#@5Wprxo8neb0!){nOx{cNk|o;*mHMSJ+}Pw=F8
zONbZnQ&@FIM0-hlL^mcH*#7vlfBpGjQiBg^SRC>1ktNG{xD6INQ|bA?6AS+yI_jX%
zA1oD4Y6P+Ta#}80T1Wk{SO-To@hR?=HqF=;F<-(kT|bND_-Qw}>K3@XAITD#FabdL
z#jd8;v?GkYZSp~pQ70{RtA+5q8#Z#1B@@@Idr-$KOujtWlcFCM_dId1k0D2mEweP%
z^BAlx3VJQ~E3H@!{)T&}%`!Q3M$cMaZX?w&D74V9z6}>W>131AXFM*;D=19mbd_B$
zucYumgj_t&!0Gt`z>a8xK{7g3?e8H+*FSG24=aC5sY@>v<Q_1)js9tRLH`b>!E2H(
z7auXBK1e+aKKHEAQlsbfAM=7d;$M?8E>pWv2-P-@O<P%O>r1C4$i9O3CNn*OcX_yk
zQ+NaW+xUNc48~#}RclH@knr@#;ow^WpwC?iOj+DT90!;0E&eDshw-0+M75#l2@0`T
z-lPV6-Ik-jj{;R8yjgcYj9VbLD{il3;&+pgS)eknb$CPKYHQKuAgQ%U$Pl-Bav$_q
z4hfym-A6T3H|W;RYwq3>m596V(+%Fdyd<hkwZNh}RTfVX{cY`gifH2rav-lZ-Nm}u
zyqJ9KF3=UWSZ?~V;3MG+zPEOwIbR2bEA(qwpNCNv!-+l*#VTfM2i(sL(kw3E!boo{
zZvGHOHgJ5Qjp>*PL^ngHQ}Y=;xgwn;YD(yTJgQVI;cvp+6p}8dgzxfx^kTlpyG?JT
zRkXk53&$;^_N*D>=n(giTjj`gLJPosdfZeGbbnk9kKK49Vh+p93sdPiMa(29ZQWkl
zT+}Tx4=qCQ+mg+ah?5JSm%lC6<(&HuZ^kT)bZ}C;3|)6hAnPv9+<?`#F9JC2&;L%9
zLHtNfNIp-H>K>2036W5>pn<vWrG{uadPAa+>@qOqx5hY`=eVJSc{bro_z?*phNs5Q
zdB6uGSLs@Kq5kebC8y4gEp!=%m(#XFFbC#bS90J8{!?p`@%UNdB?!*hg=osu0pugd
z!({X7gQ6Q>ltFL?%77wORzz!lA|B+eCJ632yc)6(=GBx1!K32DRMDP<%c@=wLU`*S
z>#RWYU+rjQ8PP4_AH^1n%Y$uUM6BnJHV!IsALX_0VoqM=>asETd+y!8-G9a2ez?UW
z;xT%BuUe#65Mp2mB42oao?7;yS$E7H!qd{sx6*Z7R=Uv$7dI(IdzvV-`P~VjID0c{
z8*|M}tpLmo_;U8*5m0ldMP8gP;$2!k{0_w}@&T0bVt7)BLDQ1;0+Pb0P}OC%G?$?H
zz90}{oE)e#Q@2iId5TOB@yO}qvpQJTxBOpxy=7RO-LftEp$YEpjRkjT975yn?gV#t
zf;S!@xVyVM1PBn^g9mqazn!(#K4;&1_PIa%>299powKS&jT%+OGTDEr*}M`4_njhD
zX}-f1eMGV=Krj}`r1-6H^Qr%KR6VSX1>%4ZU_zM{aS8LMGs04*M;%EzyJKU=Q9b6X
zRl+Ep&UIC)?zVkd)527>OYd~EkfU=ytL7qT;?2A2e3@}hAXMkL2_#>uD&MsdYmC3^
z!wIHLB2_I>%8rhgrBJkS+H;e7PV019P@|@rW{p?$lInuUEvR7jAsL{YBc5|yHxmT)
z8s&0)W03f*3s;?UswC(KBUxP8#J__i@kMpm^b4AC^Oko;b>3vwZtk*u`KiW%?upQ&
z>A}juch4EvTXn?K+gp7pKIc!855E6A+Z6TR{Nql<;Z=_RiXYrl;Ygz4;4<E*(%tjh
zm^;66o)o92-wz%T@N18-5IVp@zlUP6YCx}=u$}b&**UY#k6A^~pi6YSd$6gLzW)_y
zO)AkXPnOu6kVESR9CKee(EjD_p(A8-={05Lojf|u{PaXz$X&FI2S|E2Mwi#~kHu_(
zGpQ5{nD_Lsz?ep7o9MD9nZ^{iE~W7fi_K6?K9CXA@WZmo;HMzvdU+((K(qfG;EE%t
zFBf9Nm^CNm^X<F6`w-lbYEeg=s<zd+*!%Sf(4}PHnv+-mC9w#J`LC>n&bM+j3N~)E
zsuZK?s&dU$V(NW-HIjVh(8Lc#o?rHq=Pw%b@BJqmhD&C3V+6iVc*SWV0le)wSBbHS
zJ!CxdnmgHz^nb>fzf`7U-(T($FLUOMNRH*p`wgyy<?|$j#DFxOh3t}1z)ut5v4a<?
zEJB_F9?&Ugr#bDMs*L_X?LHHLa`gJhz1{gK?RIBnLj#4i;k(hU@%M$xSMXTD{#~*0
zOA5<<E{E*!(>&c1y)b7Y?5DHT0TuY3q&;DQAC{nImJe>|Ai|jyG<LANorPJQp(rvv
z1nvd8MNe&V$;LqGyBgUs&GR!p>P%y7x_;6(bSH_$3OCy=NJ^*ut|NYYazOD@@1+cQ
zG|82tQxBc<;j^XI)`vjBVY=Jv`W(hH&L<PgOr<Y_!NdTHNa(cj=|*vCd8j9j)Vm15
z&Yq75fQ)2`PaSmjJX}`<L~@13)2o*K5H-pABBXhh$9<Z~GsYs<0v>V9jBEUd!i<Tx
zez9NTa)-O-(D`sLgH5kg^<Q}i*tZ<MgFTR%>9k>s0~HIUxlfI`2w9q?Yed%mO&DcJ
zo$jOqI~Gf0s)M*rir5RHAA0%@RaZQawB`LrOF9d<vdP&sE2B<LPfe^)<pGp;adzj^
z!yOJ087V%grVG%JJZJ^He82DLhmQv_9HS>9-nH%^`EOvBAz!V%Md{YF$#PbhG!g~6
zHWS)~LC6YFhRq_Z#Uk!vkS~)SrH=l$NOE<i=6L{_&Ec@ekuur@)m(vGmhNY9_{Avi
zAd&(je}%5oI&B9?^n-!{f8R$n&-ixU7}@ceZM{1;a`Im!XW-XBUrqCQ#!R$%j++R^
z%*G|Gn^dErKEFHvk=XA;r!)d32*heUW@S@i%Ksu1^lCB?OrXwGujIn}&1dF15F0nj
zV-b#P$$UNw*C&(k`m}r(Jdtk#(0|qq*imblA&VdpCAuUt8M*zXJd0d6OSNGr=Xf3U
z+t?Y><kM2H%^iT@AcFh`qL3F!5EDNlj6va55BO-60W8bb!vy9Zh4*R_WjQKhi#e#&
z7ZF=2nyAjLCyp<92WDG!8bwuPWx!tZ_jMZ!?T1ID)CA{|V+9K_1(P%*8`KtGa`Nee
zIq?>y1F9ax*QN)oBGXX}<C<nL9*G+J85Cd4+0JG`1wioRwI1~;`4|h~#MD6!VTnQ9
z&F9fmoz#e+gN(W=Z{6gdJT<_4MPPo%9JVH6M|{2#)qe!qzVOTHOs6~czfVxln6U8*
z9AD&Q*dNvLdV0D){Ldyi#e=+aC*ROL{|WvM2)}-TOx5@_g(;DG-hLZT9W56=yVQWM
zU&V3lLJhN<h3dsMj~pxT)93?hSW4i8NSo=+>nEY^=s>Bq3jEDC1MUn_*JE3F<wtk;
z=|n|{VOrulek?U7cRuHn(CRtXDEgatL3$N0v5<j$aGza=Z16q*F&O(AeBuqStj-nu
zNogYxT-sxX{+eXwx;EN7%2VM!jzy@hOT_9HGi;}OG<X=AYFRtz5~YbT-r%6+rpX3a
zOP3V7I?&2bJo^3aWVYS-y|tc4u`bncdUWH>ORo2OgxAqLeN~JC(V$B{@ooAE>GI{E
zT!(;%*t}<ob~f`;%!dS){(Zr;)vf22{5d7wh(_G9U)niQZ!|Zz4QI~7Hm$vgh5Mhz
z-Q(U+X~5*#lo30_0KZd}>~933JZ-%b@7KsT1Tc=h?wXMYdl_H8c!~4gAj<MLq(!V`
zT3-8?L_~?2B=p|~cNx6WdM}o?%%*6<Vy&8E`CAZd@zDTksi-GYOoG~f*{8*9w}|Xe
z#B3DdmpLedJwXuS=4&8?ql7cdRZD4QV4Tj;IL8ZE|Grc^t8S8W!swAzrZ~aci=lwk
z&S5i&owM}2tyyLfXtube_~)GXOYInzSL8tw5$!M=A&Z>49OgEoa)eZ?Z=pb7#5;+}
z3{h4YkiqSDa@J!*0(S}~UDCK{(^4}&^H3CTZH~DN6@OQ6m$<pj2SNy0f;Zlq_OXQ7
zN7adtipKRo&%s}EN%y+wFCkPkAi|7-MvUcRnVKE(tPfgfoE#m);9lu^TJaUm6i=}t
zNv`dGhFwkBiIR5pKwRAC)VRFmZEGECnWI3AO=ma!PM#u0?W(WDH_j-!TwSq?`{Bb7
zM{McPZ9yx|f!)qj?#S6&S?c)y^9RA@u|bz;0i<xP(N_WqO*>KXC9fN~lC`EmKAhRM
z^c=M4LQV20H7A+;@yjTVs;AakOFY!fBZ}yfwT0h3j7Bg6>`9{D3hyR=>{`VFHe0=h
zWG*UyX7DQsLwfZZ`6>_s?ff*lQ!?1HlnRiSOH{l+908KfMMzopjx0QhQR9{DSeYBF
z*pSuuxo|0CzR|Re`gI|{q#N(i%MUONey8YCa<0M`?}yVki4Ptp=k)fp>#_z`I04J0
z_Z}$@PoRExjq8i6PRu;ZLtx3DEw(9U>sxn{u6LR=Q`)2t-VtI6e+5YYt-|wP1;ADi
z!EhI7Xx<?_$-ukQ=ED#YWRV4{umbB3UvHZPqe7SIL)O0GIWiYycc7`+-xe4Xqfr`(
z-}@Q%=IF0SlF_xDB~>9tW;*IX_-W0aiI2CC?uz{L8sm##Z;4MiI*=x)Ajk(z1YrW}
z8DTKEXHln(lZ=iV81NxGSH6jWN#Edt4`@a{GMs8pkbBPyz#ALaqK3~;RvH;uX+rv8
zKO8MUDOVg#KPrPAyqCW^WE9@XZ(7Ri47*WsO{47DJK;t4bQ<40-<tw9M;Z3Zcds7<
z-{$To8}nY<?6VS%=i@X#EHE4x{?K8|XrSlDmF+GgV8D9NqbP6Un=5H4my01d|Ck6{
zMWg_BwMyM9ODl5b=g&mZpk`-0w?c|nA2X#-_2igzyJ4|O3dzjh2#=rxqr5!_WCseF
zkxSx4m;^_NvzgqL%E+B8CGHMQI=?8iH7_xtZNXtTITQrz2f@lKb6yPWC{cLpOKDc@
z0^gcTU(aX>w`KUh@+X{fyJ!)|d)n~6+uK+&RKT?nM<B?n+ty^EtnYlfMHI>zj=xHR
z_lEoUFa1H^*QvqgxqK6Z;!D#sl-4Ts<bd$?HP%ETW`a#bxj;nM7g+&^G*%BVp{-~`
zdf`v${FD_ZbL=ZG?&_@|Ed#Y@Q6!7kQ5-frj+Z*xPh`7?-%pf|e5WU}zMMgzmtX%^
z+eH-kSk3%-jS9w&ul=goKW}TKeou>MdGA2b?_VWmH8NFcG_EEF*Po-u0{x_};eJuo
ztLv}6UZHIk8vD0*5}f2xL#`HLQ}>_PC3snX)luKU_r>N=570$exu^Yf>n@*BWa;yp
zB9NTJx|<&K6|m)KLNZzm9*_$pVwh3{X>}o;UPM{f<V0;3jjM{chyP-cICUO03V)pj
z2H&UfQoEzeFPXbx?>dTbp<TBQC|{%hOms!DCp$AFXE*N)W?pbfcJ84~<;m^wtW)Ux
zQO8H)l1n?Qn_^V$V*q#@`EVwC+Ef}lT<uy#MX#hfFAP8n*BPALP6vgXT7|)sD=eG)
zD!go6Mml#|mfEv!gb4P0(k74Z)-oognm(<UQ*;H{m`u(n6zcEBU86>s;<99p(rqD%
zagOb+2>kmY;l%~IJ1w6~^}XTP4WCFR4m%~%^Nq@K{8i2Uk2xeW3jD+!LI<bB+@xe+
zD0yIQ;-$#LxQj$16{~Xka6a=7nErgmUHvHM3(I&iSdSzswoW~I<nJFaWA}b$5QLOM
zLDU((E!}PDff2zX7p(&5`7BHpe?+~%jx7kcWQw}x7>C5>ZfK1Uy8;E}k%6^uUoo0J
zCNl^iT6(J}x**Y;v|Z%<dq#8gB@uA82XcN|P&{Olzzj?=ZgsIEaT7%ARq!4Q`(`mL
zq-mt<m2Ya(5slaHzo6ESCEJyR>!Eb+0&+9I&ZMQXHhs3GtDVI~)}O>u=T)!zt*B|`
zXy<%DpKsEnFJzL-<w;!DuG6<Uu~v$E6)Y8BHmb8FMZ-kAvTAVM^{e-XSwnC?;U%nM
zEWUY7@)DzOnnmt8zEo<xn`(uZ$(7mCC1Z!x0**D_FD#8#I_(>ns!xUrGY-=aU2S>p
z?Srt?f)+v*tfI#{>PdN)edld!TQB()R!Ei0Uz&ca^zU`71~5{!e;(KcMQwi@##*c#
z7^t?(#TXviC8r&ZvdHy;4<Tlj#j$NR(ebe>J{8hEz1#MNH57um@REVEDn9(8&8~-2
z9x^bcnYnVYooqXXsjth&*ZP(znK<St&*^3wLA*C%E%v2@5GZ>NF{hM&V8FK4Xpskj
z19o`?E&q7OP=DY5mmv4A7anG#e)0}IwpA@OY=&AK#l_(H($Xm<M7Hh!d!7D5Peh^q
z#3-FI@*w?DRFLFlKVWkqE&qD`8%b;uuRU-LEa6;ADXyn<sffuyGEDD{IMh*KP4w!D
zMNHt9;p2~U7H$^N>Mckta0;8A^SNj3quVGZXYPb^h~T+G*Fz*&>geZ3e}7#W2Od%S
zjac(U_O3?PYC^TpXROXho;qm6HN_eixgn%ru$hg0%F7S2u^sx=mJz!>!$<Gds6rC!
zj8)VoXKG@NL;9zX6O$W2J<pUDzl<>)RoBodmR6o5PbAIG?95&QAmXLJScMd{Nn}MI
z(Kz!j6`{ECKSP2YrWEc>4d>m`KbdO4FozFhwod{7Gl+C7V?*zmgXf(3ndtuexNL*%
zujI(gA{WAaObSkZw1cb#_;}6Yeq2Y3Z_FTSq#}o{2hpT?Ekd@;+*cvt=efVK5S)PL
zr{5d*6q5t5L}rRW=Q;x)ua$KjRFLJgU@>)G&vhhH$x3yf<*SUsXD{0)G%^m9PfDOy
z>c3msa!Ce)n~$Yci@3o#Qd+z^--2LVG)J&nk`!3ZbqM$FOLcshXbeL5+0O6T85JD{
z%StVgpHd}fcd7vT%Pz}Dv8@}!y_a%rx7sIgOT-6QCiR{ycjto~j%@g!SXFl#-KKr)
zReZ)2aH_~k2|@Hp4U9D34kaLzyFi`bm;vQv04E$D9a`?2)I1~U0IW~fW1^=Aipp)F
z*CHY?J45isE-=D_uPSHpcN}Hb)|~7In0!;KsX`m<XOh^Aa)q;cDYvfQFi5})L~P2>
zs^LP^5d9jIr18>DtaEt=iDa=t8m@~B9O^)En5wmEQyA;?97bQ9s|t_Jj{Xo5)bAra
z=y2&)0dmZi2_7+X3->bV#-(C7kffO%dK)6@%voda!-e0rltxMiatj=wkE-&=_xv$4
zQzfVB5%J57@5x69K?e;s23FPKYa4AgYshets*8iu*v$|zT?!IJQ4S&CZ!BkJwIy{S
zIqF}t@T7o22$w3+HPFJ{QOmq~?AD3%@W9>VTV35*&@iTkc8hsktyGYlWt=yt2P|!Z
z(`EZl<)Ih#%SAR+Wg5f7QgIF(@;Vm$5gzr}0FK|Nr-g~-eYR+0hYx3sU?o*KlyA+}
z-f!CMqrIE@?j>TeYBn&}ddTyS*#$}(!q;lPH0nEWvB@HPkN;HR5tUr@HyecFgYJNz
z5981H4>$M}8+haAtiqkvPe1DKm^s<xDI~`0ojH<!FaCeIz|w5JF2Voq+@9#=9_m&3
z85k0b5s2<L&I-<68!-BS0Y6>YhMd0$;qT0Vd-AwSpyAm`*3Y}6792l(6?Iv120vEK
zpcZuyQ$`E4FR_2_xLFvbrp$^`n2ir8flN(#lI4X#ZQG4L@P-k05m^}V?(>~jwg{Q6
z)ZZ{%f-nvmgK~6>y=KgMU%e@`u9h!na!S^Ue{KBeWB67)fh|oO%kQ)k9_iGc2V^{N
zpErs<6bOx=%+g!PwBKhe@bw8!?*SvB>x#K72d7ds^%iPt95hwH=q56`e+0O<E*fni
zVxY^HA)mYmPxi`A{7ZI0;**tPw-0t~xAj0K4E-JHYt%hJReZfBCCUk|E`Zecv0`Jr
z<3MJVkUZMN^T$=Imn}p*-nHD&eIFc78OK|Ty)WFcvAPjI^ST;+qP4|Ab3J&0DTKOH
zu?a6pki;ShYxNn$5bt}`Ci5pvQQi|^OhpLtNt@6yzp4s^w?7<|U8Yzlw#@*!FjMoI
zZ~%mtxI?cwi<OVR$)}rp$dEhemX=e>?+0^Q>CAZXg*yFU&O*EO^-Tua+l!YYRJkIa
zIBB4(1<$!AJN?+j+>mNkj5=PGDrx4(Q2^~fQddD#?U6wa9KKu+a|m3^UnTW(B`9Ku
zb~K%ZL<^K?*R^AZl>^rZd_pG??hcA#WD5l&&Ky1E%gtTn0@2pYVr|~mjS<v>@yzt{
z#th@Y`ymbBT;wHtvc>bEhYu?uf?ZIrE^W?IT*T^w<7bk#=X}Xp<E5SvcsmEF5p?<a
zO$ccYmp%q_n?`(SNEyyMGZ}M0E}1^%zCk-Zm2mLH9Bp>sXeOk-<rq#S<p`|L_R~Av
zfkTyX{*gSINhRzHNd$5=kuMTX8@SGDPCvO3#WMKzFziuo6=#&<<_geU+1BNwN|$6U
z`KJMIWVRLmOpc11=Jv{~lP!1#DrRx7Xk7S7bycKhEoITvjr!WL5vPu`x+M+VE27~3
zFkilH-ofP(jh+Ve=+rHq;(@s8;XdkGxt}(ZJjU4ErS*p$pr*?ux|ZJSLJUS^gX8OP
zv3>Y}{{u_ZAJ1G`8-Hd|?v_tcfh<@5C*A%Rg33lhojZER3s2n4><}?hwsX)DQ>woE
zjFR)=ldI-1I|UZH2zAZj|282=EAW0LK?eQvkxCWq0~WlWEqd_5Mt2{%4uoxotc$K4
z(14+2BvXb213^0!ylA<Ke#lgf%<NL*5cgo{&7uzcedhisg1pyFYe;;s)TN7%^iDV!
zSSEBLJp8zLaAcBI@CivW(Ye+c$ZRWBiYx1?s9u)cgy|hB9MR+Qo&J&Gks~|;enn?#
z=DT*8(y9*I(iN;&^m4r%Xih1Lv@X&zdfnVVh7%olk%@t!i^H&AG~lqx9)Ki6y~NRO
z&#f~c+GWr{soWsg%ovpyx#W98YdjjAPIRD-%K(`Z<|&OY5~9jAZlsAAP}6USXH}2m
z&hsur%t)mYSWqwS9U-%d9IwjABFa0U(1(A`X4Di<OO-xgJiGbzsr1NE$`5-e%3YH(
zCs^<ykbz>x1t%ll{dci#1MqAr*0y=5ts15Sv+QT=qd@-(^@GcwtT<X<jUBbE8ZqMW
zzOYa@D)VcQG66TIp4KAe=_(qrL|GSbti<-$<y*l`CE^{HE$q8(Etyksu4z;c4p-I}
z-N}OopBajFj??tv3Fuk%%Y<E~3GEPYsVioOEgsWe;g7JgB0DMabd3-1!N_kBMq+`7
zU-~p8t@!}zLux;wm4F+m3Pb8CwBf0}6q-4p$m>AfX6!N>H!i&#;SJQo-*KYdVV3!p
zs=iBC0^aO}vwu#^$etn`{?R|j3>t+QGW4w;H}7fR*y{ZJ<xa4&Rc@DzXM#;r7o^1_
zHb74DE%!1)d8meR7?RmRZY3W41l^CaW0esL2qg_+B6wMqKGkr0Yn0@P(Ld1tgeXM>
z2&P(l>WHoGtn=V%K3#r)5Q%mq%lIC%D>a*+*z5(BjTX62yn>>^)9Rq}Nq!Yqs&JpK
z4X|etcAy?5bL!c6+3-t$$JX4G$n<%hB6Rqvof2T(J(1Ts(Rls0fekXXH&=h*=#;O0
z4N@49-`}59tH`f{Od0y0QwSV*XO^Tu<Qpf()S-jFyjf77Hv{xdH<aOaf>Ew71EtHS
z!;kqbkl6!n4FnfR2d*Oi<lm;^T@N8O9xb?Dmt7Iy4{FMW*Z81#ZBlj2UB`5HOZi29
z{c)-NT1ahNB2{>?sv5QAc0SsrYR$u48AH!!*)+*+kq%1eBH|+9XI<oXsX=xdk^<kw
z=Ij*KNPb|BO3ZSN4f3?PkCc6K+MT|vN%?qVzT-Ea+>;V&;>{TPj%7!o-D(#sy<7}Q
z(6{bChH!wwiQeF!1jWpE48G^k;T;W(gjP3K6-wPHcfTlc5oPa&wV$dpoJOMfv67U#
zUUR&_rPK4d9EM-6=9nTaOT0BsO~X|iJ0{)f{k8}^%Lu-uK5(UivEb@N9am%9XqFt4
z_k9!6BZ=&3mQ6_0pUG*W6P;U0z8D@U+y%c95L3WDZ4N6mOTmqdh(BaiP*mFsv4pPE
z<QN8hUmj}!0#Kaoyi?$FQkTc}E@E+?Kq_<eZ^Awf&=!(T**-lM?x%zld0BK3YerO1
z@V33*p)cX<3T<q+ZETZlF}Fh|6EHCo2lSFmXxR|zs0<T~h^l1}+@mQZBBg6lQcc*R
zm>3aYrEFD$LvX1Ir-jmQeX_F;4>vP;$1Dv?pWe;OkE)K2j`nT+o;qI_ns@P+ah>W^
zNjkVlyv9)F0?FG4x37PmCAF}sAn6Hju<M6$dQ#b@c%u#0shp)BMS18@$`_kB&YTs=
z_Jk?zpY`o;${yKyL=Fk~D!A>}>iF(#7Y_abHV%7FYSRB=6Nf_$hJN;0_r)(>sL+o_
z>3FsFwZEmCdFIlN!Y_3^xg=`W-^?2}pC-luBZA;!`Qc)3@1M7ipqv0O`k``M0Lntq
zw}Omd8<dWIA}ptZ27mh1=r<zYq@Lmo|7UB$KTh*V+v)=8s%>AB80B;=8<swG$R}Ed
zy$Ua9qsnFtFLl#@KdKZ=&WBbG6f5cd)FLU!fx7rb?tW2hVVS91#`wKBbJ3k|w(jc9
zKbt0NHa4_WM2(yt{a?T4@0b4b@<22&fppo_1@fLF@+T!cDeh^};E3TGe`ye@UkG?i
zHeuLL8q@1ZH0Y0guypfl&IAoW7bY+G1D#ybXYZUR1Uk8xK(o7-S9Oi}%C43o|6u8b
zj=DgzThppW@fJ^s2cj<?Pu@F&GwDjZL>8OSf9BZdg?rg#W!%U#D?ReM#;QK{aJfdW
zY<sM!{btUqDcD{mCopP}J1y?&pF29{q+H&OW-JMo@n6RHxg<-wqZ_pt{#)W6yftHI
zSMOtD+@rzLjdr4|7W&w={Nv6t0+Co6PG?O@@Fz}=Xv9+P2J+y_Mg>!u=?1rcHDpAQ
zXhYod3~~ITYk=t`pJC0@aFZ^Euo$GweXt0PH|IZW>Rkd|C7}l=C!a@!Ou{(xM`jhj
z)9`Gb+8q1HM?Zp;4N3Um2-7X!yiohbW!P;3EwGDs=fvoDhT8;kNEdbQCgvS(29lq#
z1Iku~(U8UWs;~N;r3rn*arK<s-`k2k(q_~368jWQ;mTl9r(<ffa|uuT=i$bu|9viz
z^h(vb^XA<wxY9AzRGEJwIVJuc%)j4^{RX|_<^^2bgg0M#6RiBr3I9ZMTiJI@v9c2o
z_4XnC7xAiG%&@i#y<HiT(DL;*p&G}sF3GB@-=%*YHRNvqJrUJHaho)7$ZABB!s_xo
zeQYBjH}k*Ug})c|@5^o3!20cD$Jn=mX~@x^U^wsW^J~OY3daG1VO0N=++%=#N`Kv!
zA_oitB8z=w&sSepyg)l~7LW<s3s~_M&=_rMiLMol8%%}wR=VV&ug?q#OV=bu9_Lir
zJ`S=gN01xxZj>+|r$#sLXLxV8Z*bWc6|L5Tq^M&X<LqG-qZUf@QtP+;WAm(0REzL;
zmc?XnutwZSOB*+X%JEs|YZi{9K>bx|CH-povi|EZ?Vkg6b6M-0Nu1{M1oJ$x8GKFt
zQ_<)|5cj-xPv%*NXD%MEzEImqJ6-Luis5>Z;PH~dkA$&U41JIElKI5doWTvMm)#;E
z^Gx4vd9(bg>6G@)pIR}9#Kv&kBS?1p#;CfV?ap@X_t<5cmdvTy<+5n^5AFFnhvXkI
z*l5zOcg1Kh1_KVVmLCd_bV5;sJl{X+;`oDaPG#G#Uw7<aIvhe=_yTfQGdg?B+~-QB
z*VkkHtpZffK~#;BtDwW)1Wa8MwvY*F%qne)p?!7cs%Q3D54PlQi#X+cQw9`b%Pw4I
z3?z>X*vGc#wHBw19*w3QW<H&h&611PK<G|IcbddbF%J!X8yXI3>O_wfkvI45h8_i(
zYI&UtUMT%>^RbqB&Q^JNw-wU7#(8e4c$t;4^?`Mn=aa&m^W_>Q&Ngq`#p6Pk_Oa~;
zyD#|VxscU6{U!AVU&5HMc}FzUvDk91+H`ji4p!7Q=$R}PXPciibhpr#^qU@Lw81CX
zGH#<Ujp}B#Hi$i<?`&rRF8!Hzmx`R)K8)g!>-3>o<n8(OEt`kzK@QasHp+uLiU{z-
z)SrKS15st@`IUFm*R4&WVxny4Iycvp^FKFGHE?_H_3gpev%a9d>dHUrIA(9@Vt)4}
zyOCi|VJv-i#&NOq9cj_#XSmDQ5NA)x4GO*8YTOI572>$UPUtGu52@l9+?L`M+uXH0
zc}m>!lPnHETirFLpbpykO!jSb5ybud5y9!bf2h^oY`69N<)I&6>7u8%R+=6P?!SV1
zjyTjZW)NT)<s)*^*GXg!(~ADT*vjX?14r*vvY*!Ce|k%Xjn}<a*PLoZ2t^xn0A0eX
zaW*Qy1Bz$?WV5ibc+x?X5(x)Dal{q|^CdtbE6V4}F;V@o0{Hl`T54%+i1%b7l3MUl
z!WlUl(y^*qHg-3jNm<FN_J=|w3~0?w$yu0dk_2`sX;!2+zbtIsW^MAkA&eqD65)0|
z${F|QeNx@s2n&uCl$W5^MAhASYzuu|N9=g9)H5Rfa<Aa6gx9Q@DveZ=4&6aNzuR3a
zxKXe9p=Y?)T34*U@MU4(@)s++S}2)~@RtPljF-6gd}zsdXm$5aelDZW@BtQEEG(x@
zO&kVgVbi~UbQWu^k-^DDs2O!+mZ|i^04QaXNG6=*w(8g4fVC4l7ZwQ~bMDU?g!~B#
zq->pr5q7~OsNeFkPrK(^EO$sy#qyU)B>71s+!T^Re%wvGStfYBq@;Yd5Tu$6sbpr)
zYhYq{sC<Z3@svEh&~!y@!4wJ5Cn)tPQ~TvO=Ts2#^PLgy;5ecDkds+##Qoq*M}7;q
zdI_UhVdbG^r#f!`q511}6;Wj8fI2Uqe{pP^V})BA9e#KjZa}vwOQj88b~nK0g7;~8
zfN9Td`D#W-Ae)vM*UL}@d+kx5P(Q<7Z(_Js^p$8&J5-m}{$z$Y>fiBFJy-oMgvU*X
zVeGOSM*=x)ZTq^Qs#Z|vfB8^HwNFM)Q5krL+mnbcjKqp-D%a|ifMWIYJ})PWA2GNn
zK!^1L&E>XMg}KShn)h4w#&%4@B7UFS_9?}b3U*58i0J12_;;E0+T%{_LPG^a>ng0a
zRMZl}jSm77FMgsemVbRTsz@MiSAWd{ey^LbR2h+xV&5^@#<t%G`3^-lYKJDhPQm|<
z@Bih~OM5SiL`E*%fR~nkf9<QXQSG-Mb5vBa1qt8hH`XQUy%meb1F`Q2%cRW7ilj6)
z8Z!R;aQke2Ue?%_ZPs;lpV&l9R8+;$Y$@?XoO|Cc*nqgddg7vk8)bWeQ%vys^U=Bl
zu>+^cRyoRIBZdq<vMBIslUH+0%*HZ7N4FA>Sfd}lK^M=V1{p8as@&6OIf(3NEh9{p
zzXt7{gRTj8FUep8ujPk5#_R-cv!txrA4{d&Uond%k6Rp8wKW-<@OTb!Op;nq)Z7+_
zQ(C0uIL^9dVPOM(b%M}k^qMfrDN4mpY#t;S;_@Ba$Na?d)5NMQI0sWKcu%FuP_Y&6
zwON)J#w*AuolX^M@JAeiq#sl_6<7f?m%zN6>N0Z$w^D75&Qb3b!>pmn4nP@QrAJR{
z%tyugp-MM#KJmX%_##^)SN$~JEw2@pn&jMO&}Sg|e?cl@K}2$sGTspy9=)=(&b$gK
z>)Q_fHO6>&FW#|5alX^wN5r!DY0~}IYnSQ)Z2MNPBhp25bY*3`i|Af_)xX&50>@IB
zf0bU>cXfufH=C{D|0g=D{`EM%Le-b0>y(JLSODjRG|<j~F=%Hc`Nkk*9&(7nV)0`J
z96@Hc9|T8^1d~X8zngU_5R3jHODG&OD3H=nl<ivEAPNL`cBU8VM(H^cKpc*YRmKWK
z^ab}tJ<}7DuJ6LJUJ)m6-&HT~Mwq`0+Vn5%^PFbR)ReU-Qq(T%+kqr9*{aY-bpAmD
z5>~X7#AOdB&a>l%R?QB9i9@L(|FO;nWdBc75WO>V6GYG}KlM&Jbymme!m}n*7)QWC
zCQEr7zlYVFfjW(i1B}G}<U#pr87dW>Cv$&DGvh4#JFmqK-KyE4LjC~nUytVi`_H~<
z44u#$QS*p;ZX4)UotEjm*bsef%PBate4g2LmQ)^MqrE-`P{7PE9(5-IN74$3o~V9>
zOD0p<|BazCp?BE@d|^9SG$~hyO0aVz6K$1x2HXm0R=>Tt7<L*eG6ff@Xj;Zc(L_7b
zmTz?h*avG9P1&264Q|m7CYScU(M{zyugtqdIp`9?r_b75-u=e>|3d12o7_+d^p~N1
z<H3SR#7-V;sg(SUH4zZE9SuNO=QGH@5jMUTGt$QS7&@UzlL`SfJ>Be%MS&L3a{)42
zat2(PkZXBE0Cj+lwTLZ@oZ_1cn{*YR=bkNLes(4_9h~V8kD=Wa30`gb3iee=Z1gaG
zqm-<@aZ0Ga+nA-QAY&+MvJXGG(#jV)LB7RpaDT{%?&-XdD7}i&DSY2SoDJw6p7Qcr
zL!hWp<(X&lu7fOYm|+#GW;vNM%+Yl*P8xRC%u8p+KCEyqFRQNIP?)6(M!91kq7}^K
zR+K{mNSn=ymR6WVu{Pb?HN-^5J7(PI?rAm{U!UH+yeF}*`y)(Bf>1e&9%JfAae&L%
zz1QC{*em8*;Y#*E?$0SZZ6!#wOGXAREJU5LyRdz)FpzXAA39tF>I-ClMGms9ncgPq
zk@;c<&jnvqrdMD#RCul6RnXY!l~=NT;AGjwFjYLFZh_DT(}+_%1qC8gQpgc>mf%Pc
zE4TpyWzNvYRD<I)2<n)m+bQF;|M%fzz?JWC-gCteW#7iuTC)P)pY_X|yB)8<uGZZh
zA9+Zh1CNkzBn#(q5j$=zAHkabjTm0PdJcKKPZXo<XP3;6%}<a!Du;h9kNa0(@#=;h
z9WkG3PijR%T=JbL6ia$d3yJY9vTafOM*RTVRLOHJ_pD!zWz4^T!%#5v<VTFhEb5?&
zd(aL?a;_tIMn@dve~jQC=LXs-S^}9}s0D3Ulwj?Ifio=-G~^mV-g1&h58j!-eG2>O
z#X#ArpdA!>kOY3v#wZEK<5wEzxk4IA(Af=a&nN@nDgrj1P<AJ`Xh^WntA`dq$%T<t
z9zpC;Hi#ZiMrHQoFNaa-YCRlCcJ~?vu2eSL7^+}}J(o-;7fGk|Bx8s5hYN(1VM$S_
ze1u7fz{BdX#5~~yXjJr53~>4VRyAgkSr9nT0=<ZzAkOXQ&x`SRgbb)RqUihJ4GlP;
z2EW|e0-VoH5=gdD0^~pv5Wp{N3Lci{!OHZkqz2?X0WrKTpaAR=81ecD08K^6Mvo(D
z?)4mbT5Un5-4O80;bO$Fra5}N4=z@GKs{(@l2N(^^U*;Ow7345{W(cBLdt(&p>k+i
zUuWCBTpx;d!oS3}X>h+i47H@e@%6mknj<?lOR$ipMRb8TKI<-xAnw}fa?L^MRER*x
zCubOb$ySqT`}$)9nv%hC`O(@}USK@3-E4Y_tJTeJu{D6$*=);?3{C$;JTm6t=|#2q
zk$a;OiJxLi4`{_oTz1M@!g6eRANrb4P4CD_uVghe&R>LrT5>hBv!6|uxoMzlV-VQZ
z(?PDM_nVmMuOAXs`a`8(^;5_9`>tbm9SWW`Zp&fw;8O1~-uX8a)k_#}-KH)}!~pzr
zG6nrWvgI*_XT;{?1c88qbs__vcZWbJGOwPO(=q0%I{d$P+XBZjY3)qjH$b$Wy+;Wi
z1e`Ar%lq%RL++E2tZmWYV~sGw3t7x@fPLBXc0_8M)x1x#2QT!BHPHoct7Oeh0Ec#A
zj^M@{L(R6U?MBgl2X6SGQ1>rzrq7aNYeFPrQcOL9m%lgpTsarB6ENxM<tI#OOInw;
zY4809O@gor82kfPhvg&p&|D|67O&CQUw?d1)QAt!^A?a3(LP`xd?_cpE?`Q-UA;*e
z<M)hNt%3x1gpC!w3j3?n9{y>Auyd>3JCH|O-!^9Xk5AN+iijpFjNOW1c296|#~1dA
z{sL67OPG#=O>$xBQMdq}{+A-e>lPg2PgeyraMM*nX)Ru#r|a^TPE&P&9mLtfug902
zn3W>Ce9Q92J&@%8Z}u=01>NHc+PUM87^Zt2$#Vg|Mo>Qpx?}d;4#`U6ah+q&q@8u7
z<$AjWMV~4D`Dh6qe#SXGF;D`o4r#Im-lrOeQ!kz0yg#xY0zR*AGZ;#JLt9q_b#ugl
zGk2s?PElQ)c>E8SWBmNDjT24<E_L-_!e>Lq{Bw(P#v#sqRSrZ$jeuH$@PY4Xy2Y=6
zcB&=R?3(CR9{TU6x_gR3T_T2OQ@*|87Y$##^|pYA?*%a)YiXX@-jth1ws1f@v>uq1
z!<Vp-owh%extD-hrM(myxjX1_WZ`jtv0pH>%Rc(*k}JlDyJtA1<3Ck2mha?!SsuaQ
zVc{`a?e~pEklBPl!qTh?A>FM*kF=vGoiv~#fhu2tqP5nC27_QB4*cTFCM_9QaKH#~
z?xsl-?70ya15Kn!)h`h3W3Dc7C4i&JBrimT(!iWl2(fXL;9-ybc-v<Pqr+|M(ZNWp
zu%@6Nx6PQmU|Vq|Kc!B`h!g!LSX6apv6uZ7DV#LL)`9*+|BKoy9hh}RGlz0tNAKgU
zPyc9jt)aCQ5|NtV&7>mEX66?%r`E6ZNzFOq4yP)LET1~uj^6a}d<^P7|NJ#?7om*j
zk}$qPrBMJa8T0Qf1Up9|*l(>f=>9nMFN75<5h-FuNy7KvnS)}_AAOW=ADV-@MKP>d
zumtga0iC8<*%+@gIy{#k`zw7vy%X2{P5Lj>^ihNY0o9L>qR2d?7eAH^c0l6#e^Fz^
z^N8OMx~nBjspA}VQngim%d7lb5P;|Fb~z~yf7&&omaKE!4mLJZHGM}tntkVDT};=l
zyd|sLuee+KqgL%SD;7zJU-Wd*Z)2Iyb*x-s?4vNOZftJWvSS;25De3!iJErlZh3tb
zB}s*NP*BPC)ZbYS7Pnd+lXUJLfwhC8@`o~_sUBOY^$d*a0j((4_dQ&OIziiPZg0$E
z9eq{Y@8BoYdz;%bV&cG8q@UHfrkm#1A^qR=GVGc^xLJ1*GS1iaRU^KaCO(eLZ`VIw
z`79u%;@ST*$KrEvZg?-88Z#s&Tvc6V-Wy!>#!$+$%=Z}oS$JuCozwXs(czkja1y3V
zodqS;!E<TsIyEI=#?Kon``kY5bs2L^cHziJL&W0d`PBJlwkG7AAK~>!;$B1Td2>%<
zO<t`3oiJ1N>~fcVV!T+6%8*uzyzo97ix)5BRBFnz8~eN9CY%Kxb`CrsBW!j#WBYb#
zi~T<jzIGCE^(bQagdaiZJ{eOlM<55RcgX~}3Z?x>Y~Vl%z*<PK^>8?&avXRj#*ikF
zX&|k0rNViLvJOfToupZkgI|Yedh|svp5GJ_nuOhOFdnz=KTZluEs)A82L|VfY>dN3
zRaH|Dy68*%dGH~^AW)I0CpdY4$7pkB!FXhmtiStL2#@uHM7qPecl@7}U_>eLpy8Tq
z0KhOu`~r?j8lZcO0Qg5hzLwKF;SW0B_dlq-vLI5=5HPabC*~78<Uq>9epnABx*#N5
z2xDf9(-%6r1~Qv60$km@K<sjYa&Smdw>$(a&=&nCWB?uXOD<vfCkm*L`<p8RMpmI*
z+-0QMDJ18Y^c=OF@)WRZusYfU-j%Ab80^?-R;QOhCd6pkd?~cFp66dfZFEpYoAmn!
zk2|W!TG(x!%dyt#!bhy2KO&z+)mXW1T`RqRF%&NulI@2foy;|V_USGfsX;?K*qL~&
z#hOfc`L4QaUMmI*LvSXPf30gONbhL-N$K!s#;osU+u3{#ix)x7_P;HxMr6FRFK;h?
z>lvZhVBSh1mtc!UO$$*CC4qVI9g!PUsOo2-AIMAi*BtK?psVG19tV`kZB^EYTTSG6
zB=C<-C0gy53+VnA?L3qVy=xuv1RlC2<oe-zqF;UAH$5jDb>7kOCG`E;Dy@Ib;6D95
zJFp;k?Tf(>aB#+QZR1A#`Pn9XnW9>YC0>$DzhHr3^{airzJ?Bvs+?xgw*BV(q0_!O
zW2^vmk!9TFO7o#l*zKPI-yNSs)*MvHC0AJx9~aSdPHf;k^Dr^urV+kS%(gVionz(|
zzx~ciA@$mW(N6o|nLGG_fIBtk86@dP*&T55opy~Y70s>FkK1jv%)qDRb@PEJg%*Rc
z-!w9|grWbv?`6}nRER5f5lS=s+){7<)DbkGWS}HGHFr35ynZ44)3alY!}w`OPLgyH
zMz$6&5s$7VoTP+7`2?luDHdm>^$ZE=n+I@xX7etb#tFn_qmLHswrH#CR}g04-yl$y
z8>d~=9ple`TGRiY9{5;+^Eo`8ri1y#7SrLxK}{E$?<UhXBW%K$-{hYbe)jikTOqU@
z#6`LO);*6W#pcb&g-xl|Vxk`s^Vhj+H&Ud+z_X_m&btCbuJ@DH9Mo{l9yxR4b$RnF
z?Rwa2G<kD|WgnR$|F43k3W{AOKL3HtHyVMihbR0w6iXUd+2nwpB^eN|JgwGceHeHw
z<1^^3B!>KIn=pa5lpBKau4xl(S6^n3#X$Z#G9Y`OuRHgVw=n8PU&tLhJCe)_{BC>!
z^)H_RSCdEx00jkdu;b%7)$#i;I?#?Jq<{c1L1JV$g)#uZqbYvt5zQuI_*E4=97zap
z9wiJCQIkcO@Tj2#yXwfJ{zTr!|BJDv^F@}X+2et6f4RI~oC6kEL(V;n%&!xafEqYJ
zJq>#NEY1kAc-@G95huV}p9w>o>?5%PjA)3mI7mS~M0PyEPw^FkJ)b`ZQIq3h#_tRE
zWt}4|`Aaa=pFi_1H(NF`wMs!I56dT`mb7-R*y&)E`)>a!l$I9!d=PG3q9y-UcQK(x
z=Fcn(&D_{7G4D9azoNykNGnIo4IMjlWWq)8YCTT0N4?Xpuh8P@IxGum(B*QZr_5VW
z)U9dwiOJ(?9urg$t8zUtJ{DRu)vbob{F}j`mL-{f^?>yYx{g=`B-1g4JeOcZwdFcd
zUgk87;$N48h_Z>H#ToT5R#y%z8?cg3$+cn*HU+^e{|HNcIjzHnz(ryKxbIb{Tn0O3
zRu03nYkH1V4Z2w2s&%K&m1^6&?Oz@?SsLHTQ-l+7bsbX1hmJLymp2GH{q>uK7Iz_s
z_V;mmYWx!~Kkgkzd1K;5nv>(){44Xoay-=8S260pS#u(^dXQ8JFc-ILt?GwczHj*W
zi)koK{?f@oZ=1xKglT$8i2tTpW=={t#@RiKsgQtP^~IqT{VIaQjtrMUToa(;RAVWO
zuO_D{H#^6Gv}N8wwH3-B`%1QJIOPRl&nxOU$-bg3K<~<L^)xlvwkXs~>eq92xc%^d
z%>sBqLk%;I3hK{x@c3<$;8t%&w0HcgKR2S488_@#xOIF+-e&)S=3dzEYh-s*$1lBr
zNyTnKXY+>TtpHQj>g5Ac(7G*81#KSE1bg8&pHX{}d{0AR*z3q_(QI8vS4RN-9mDai
zKk|2uBx_)`;rHjwH|InF(xb?)M3!>GMV;6*8)ueLN#v`uVeOTgXd-BHn<kNGqU~MX
zTEMA5ZIz*wE=LqUJ17!9o5ZAWHU}Pm=GgwM1Llm_p?*{sooi3}H3aT73r`(ZhZbpJ
z1Ie`w?jF%>;E!q%=6u+p;CIxD7ShNaEJ?K`W)|Zc+z*n|OQ}899-n@?0sNj4%+{aS
zGZm9pu*SnlCYdXKd<tF6TBl$!?V!0%_MFQYquFNL;GWIphUM%{rlO_sTK7=%$=FqE
zBUFof4X3yYoLZ?e>G==zIgk!4!4d7vU6ERN)<udKR#Ig5sY}2GupbLS57pFxcG@7u
z7)FL(Np2;@{R^K0TuLBi9#<4x(TgaKIMitj-FM!XZg$G6=t!DD?mSclDm;^b{xEL^
z``SWdM)Xrf?a#+gN?QofVCr6|TSDle16Tn=7QjdF?nmfk0%uqnLJxw!7nc+67wWQ<
zQ|{j@SgM+<2OcJfG<(cTKadiy1=&gZm)<2xxC$yJiZ!KB3ZG^8m#l<t6CHZ|*riU3
zlYQHN)!k;Wqhxl}m`e${8ayC_KZ<3E?6z5Ko~q0+TC#q#xKhM(YwWFK5>~IMpwWwA
zsoj#5FS95N_U=IBWv1TJMZWeZ)<3dSwXY4JZs=%ctv_rOt^K6;)h{Ikj42HY)mS4Z
zD(^V7lyS7}{K}^ze@k)-DcAnxuyC<2PHQS;3^`0+%p^p-<TX4v&b#)w-XjAkTXH__
zKil(x{zYW<BF?oQIVKFT%3j=CZaQvGGj-dGhWpVFOO2BU#F`tNPUZxXz13nL%si5u
z>eo}v93po5?+Wm?L<!>lrFFLrAp}=@Q}KgSvJ@GYwb%cPfHKvA(ludVvIpK<Wbl&m
zSTErmu|N~Aovfcx2H^2(N$~lhEVkTjJzv=zz)12x`$b(lmNh5hUh)wTD{p>!*nbk>
z-H2H2KIJv6sJoi{13&xK+kY7l(++nB=XN|_1|>V0j;SAfEVuuF*}~p)En3;qr|wE5
zbie6H=h_OYZkth>NLlu8IA8^;>4&@~g;6{~4-qXkFX6@W@B=?Ci1OGAqmX!hL0O@-
za+F-pklXx~{qv!DSz+J}pTZ?L2vQWlTF?BQ0k!;W7O%h2t2pS2Y(FXMD!h1xFQBLY
z_VMSveDnGy@Jq0lBf?SM$SsYEd0pC$?HA_)RLTQ?SM4oy>o7;(=VTvScoMebTbI!~
zx8FP;7BpH$yHEx1WU4x2$M2VQ8qnjAI$bck_=Xy?7CEwV-@9o9Cc1@PnN`Kc#akzI
z@}JTPQ-F=jw0IYn>(whYEdbo=v)2PA?{LGIH>PgEAsODI=idSrqvPa44r;2ux$)uS
zgh^9|t}TRBn7^k%MX?2Nj2Q$i5)r#5_QDpK*Jcc0xG8w90gFp}F_g){YblH47-^>W
z{OTstxsIcOXWjjqwZ*cI0bcBV&p#oUF)NKKD)pg48f5k*0R(UO86?CS4<1{U0ZnKW
z-g79XxW8M)f;)8-(I0k(U{O8rFkm&<sEFPt_b^t0CeoA7KGPVLC#mFD$88WYap+^L
zB-V@llPSRYRRG|e1`v$fP>s`&2ieA&4W-%NhX;R5F2AF4>6KZT70?2-{*?005u$({
z>PZBv<;bI_N<k*J(U@(&{>A{U>xqDV!b6A8S5Y0`lgQTQ?_nk^-M6^j!2+%{W3kD0
znJKiqI3zDRZ(yM!>;ay{4zvI~IVwN{9wzJpTj4`%oEM;;I?uDm&#p|P@ZwR;UV78g
zE)siLRkncEQK~zuaN*=OLpwHB<>}?Uns#aXZmR_>HLedUTeqR`hy3+p-%0`Xhj)Ap
zqM&n_dDzJuby#69L-`Bu)?{$!ISt@y2||RMM&9@f9d0-N#_BvDBi9<4M;DkR{0JTN
znhJh-BL-Xzg_)hpc5Rs63kFz33=1;>maca~ya87MD<7Lt5Z2I~(JN`0AnV^}V6FIb
zz@0DWusO5{gC30<@r<9ZT|R>h$^cp;i&*guLJ}Y$VK7+F7=iDO{WXOW@a}+s;dSU8
zbgnE+yR@IJEya3p5<)7eNMh5|{uLssy%Gva;>R92c`Ul|I}DSivgz_d#NeGx7|S-~
zo689fKfW9huGSK6$Nt3MUW{dd1#yUf`HV7X8;F*i0UXWFFTnCUaxh|JwUKKy@gs!z
z3KH^=QUXFv%!4OGD+QVCG~<@Tk(q7NpGI`g6soih0P44XIX_9n&nP+GznZW38#mS1
zpt9J^`K94xrtX!fv@loZpj8ZRsJ)b}zN|V+m<xJLr~@~pnH>QP$%Oqxo7q;D>iHeV
zQAneklw3gOP^6OT-=C%-oNhCAY5g{CwhmIb>iu%`8L<M<Meq;<u&|qbdh=OR?k_x}
zP!id!&tWHlnf$ccN7yu8J-DyU;1Ae`>%kmB5H_sf1Y~X9cE}1c$RhD1;x>u(>|A;s
z^E}hm>8DA6C~g1E`@b;!^8r;Gk1w{N{lZiBGZ`1*IT`SLaKy*hr}5vV;(wtPrCiXx
z8<$RDz(ek9U8rehIohbQA~A;FmJ$IeQ8<~9Thzw*v6NuZ6e(|}?7VPPx0bSqU<V2+
zZ1sS~IL>4G(f$Uc`)}jMKv3DVs6;P=Y#-@^oipYyoaFQ*jcaYr8r@IEJFxL_>6o`u
zhz1@%GL2a3Ui(LeodkdR+LAhRFzQ7%&rPH8D2$qx@{F3_-$d^KPpTh(=cXVGW!*R=
z#2gtDOC~PilV&H|W=dqLbrBzttkn**39XrSv#-7g?dk7@RgBzkwiXJgxquqk`li_;
zGM6gr3`1X!TXy25fw5B^MRSRw7c4Hw{`v;}PE=*Dz!xF$2~NkP#t75mc25F(%SFom
zAsmwE@Q3So%CyY>00IApxNRb8Ra}$q;*kq)I%3x~EY$-rqU4%5P^e?R=Fc_#Ne1PS
z(EYyIH|KI<=G4=TXXf>ELBoRspEFn2-jh*IXPTJpM8bD3>`m|~_vU3^=dCI5%=|go
z&lO>hta$~Gk?#-23SKy$fd6T1@knvk`-dT|H4mrox29pg4pTb7W%IGaAManA$Zyzh
zCJ-bm3a4Wh?q@J-EHG@x5&H*p=dxJD-kZ67(Scwf7Z9`!R4H4+>>T058Z&VT^6gFs
zSGK2NPuS0%!pgSL{#ic_2M-6(8o~Vt`Efc^v}M6U4tV$CpgT#?Av~i7hj_4vX8$8O
z456=CkOTXZDGxQYyT{Pu6@_Kcm5S2LPCstQ?^Hb_T*1Z<F;fG01QkGU!s6|=i=!qP
zG>*BtqK=)f(#@AQD<6sFEfU5csu>_!8t8)82M43=)*d>%^#gb~oOq;}L4!{ml1Q{9
zm~^o}Gkm^iwWk4;M3Za#=;*M;n!Tz#zd>f~N^dTd6)fb8kptqQDGpM?OytErx!bnC
z&<E|0;s@>TGJ$qJ#De863JY9PPR`A-DofkaT&;k;TWLI)z|0A%C8M5wgb6Zk^h~s9
za^QHH>>sU@kIQ~OX0xuNJ~*Nxd79GJIBmWZBS(_5hd<-6fWxuWsp?OyYs?@6c@cl6
zK(QkTx7cF|NH}uM9lG|DuLYbyCv;Ww4e8T?7IcCHM0Zl|G2<0asnW#!)2LOx%`CpY
z^Q0I>Jn|#7ucawXz3<9?Bxb<^;R=ggbas8~jh~4R8u#X`oC56xOp-eeL;h)6Pry}T
z8f<@S2<gzuIjRg;Ab~QpAesmE?wisSh1)-`*XJ1VS&Q*O_E+XX!x{IO0`>?1!%Vr_
znL$?-UtwWHSRVFux%TLQtZx%HQ(d5)%_7(hmjDI6j40e0B7vFv`NnNBt00~DZN=wP
zT$Km^=b^~LF2rlGS6<In(wrZZQOO0$f%OGWDozRzkMozr`-(*s!t&c#oO!0>^sXI*
zF<aGtctXF(NoL4tQxRM)D*Waq50wJT)eZhl-Ef{A2EWq8#(>sLmgYPU>zn81=W7^K
z#C3x^h?Qnstx}^Me>=*8a^W!MIYP4#@nUE#(Uhee#{%;Hf7p8KxTgE>ef$=Y5Ca&9
zAUV2Kx=}`_be9Oy-7q91M4Hi|gfs|9#|UYV?(SyvfWi2^xbOG<{(L^a$M-+jV{GH>
z#C6WOuIKa9BZ~N`2H=`FU~)%BqB{Nr6^p12|I=zn^S(cCl+*0{EOhueG5-cpzRT)B
zfoSsiqiJn^UkJY|=Zh%`6TQDv7q+zjn?kwAr?t*gTFI>fF<$da7l9vA$gpdIq}VGK
z_J($;2uSy_uHJoQYkgXm=ZVJWWAmvMy!2NSc{UWCi2tf<t`6s>P37!OxKzhCEYEaP
zo;#LTDbYt_$fMC7nh(I)jN~RO4m&0)7o!*dyKSq)pxA`xy8TpWb4+;XG;m*ZVDN3^
zi(g}9jv4bgGqk3>yCqjO_Xe7MW!>TzM5Z-oKkb@mR7gNWGA3m$wPQe+I*Rfa7cLOO
zLhDhIlobi#*V!7rQ(-nPFgtLmkZ(`gzVe%<GBPtIMZQX=mQHmBm=oBUi-^W68kkdd
z^3<9Ow%I8*lhCnVEY+^{I66?b#Jp<TZdIcH-W!q9i=b_9U!U^xd~Q<u^CBbbYN}jK
z@O&a8ByvWmLY0)sqD^%NyO3)*dn6NVBuVk4=}IT0>;4zvkI^BD{#`L_^nUnJ*}uWB
zSvH<_sa|@#h-{A-(lUk%_&Ag0|AbRt59Y+(N}^kbM<4U@Lxto@O*yi!HMa1i)oW}q
zsG<gzkKVX!`Sc0b<8*EA+&JykOb&9o?57CQmI4P*#;<us^Mez?q6?^UvHqFFk8Ah*
znq<iBljKXid-I0iZILm`I)YierW-vcvSQ7}eb?>0Is(rrBpUAX%Ape2$bVUwEKm?=
z5tOSlV2_>Qd(#+ln*MB4pClM)*j=o$W|O|#UxZ*3*-<-i?<wQs9=;6-^$QLa&cy+z
zP7uT>YT?IlW-2a)3Z}IXbj2vdakp*xWhlP(-swK;Nya2#p&;;_Amq0hV0d05){~F|
zUnK$`^Wjc=lzC%FXG_a&d=%UP9w`yPB=`e@Kwsd1Z^j8?ND$PaTb?YfAw#S=n>a4p
zk5oe{r=4-h70c)?i@K>w=)ulE@Tc<VT-;<56VRs^U?=yt)h<_gpO>!Ik}*$;z&Pam
zpUook88Jr$_TF$_uhT--p2B6l#IC7Is6*GhI9q{D5oQd91>x6KkRbLos)x>*Takhr
zbo*QDsIiC!e4m`avyC7oRZQs+wPOoeQ~m)w6Xy@fqnJP3W1yJjNSBnSwGLUd&&|!>
zZhCB^(GXxsh~^=<+{OVP_e%wIr^MkJK0U!Dw-h3`8ujNuKYJ6TAa8tz7~|Y`&pF<&
zAxNW~mw_~k_#f*1)dPm2$LBH25cc@p8dBc^!Tg+0A?E@jc{qm1McU--?IbPS1+fgU
z1LO&2<c~s;*bT8U0tmwmRu6fnG8bemIH+yAzlWgF-ax+MoO40a0l18ewDR#H{>;$z
zD5fy_DM;AJoiL`%o;LJXa&*L~8gs<8=Rg-=PWT&qf-6l7>Crcsd|3P=qaRTV8B%T~
zqmVUw3w+^7_h_3&ygnRqV{2J+W%=#Zm)3->j~V1~uVF%ze0s9@j_RN}@h#kit|HlU
z&m;RuLdej~>(+-PxZ8%15uBfX8cMHGzfob`b0wIO=#-@!2@ru0OnkA5Zua0KGsVte
zYx~&+IlEV<=c&@Y#vEX-^bdXlGN^%7la96HJ?)zMgr3h_#ZT8+EF3HbV^!x@99Bf_
zffMnQS>U=ZvJKaWHm-a`l{I6qbv?HW5kO^0TnV?V(yuz)@!NlCH8&l0124vZ)wZd>
zJNLvp(Z*s5bj8_8!VUK}mFOnx9Vgw0{UAHvG@IMg`IF(=Ot;zqTA-j59ge}7gw%K+
z(*?BA8F#pHzTlp;+0i7npJzeq*dLDvM$ysFHh4O$fc)+dO`M;sMWN({`6N$%Lf+S4
zd5Om6sC7Aj1DcfEZkfXJAuYp^3W}qi+7Eg*arg&lrPbf(r-!=1htXDo&~Ci;b*J;c
zb0NPS@kh0W_IgKt;l~Lil;&a7ZI(>G0OrxZeA@-Jhf{<G%Z|`TNJYZ~IcLa6^~QU}
z(<|4c0_0Q!oQW;(R1o5OnHg$fgyX8pw-Krli?|=gPBj}0!{@XnchJU^okxUn4Wt_#
zq~87Fw8v2g58(IrjjBMtfh)01zOEXdzH7f0&@&gfrNQtcXXe_k%OB0j<#lw3C;rJ-
zr?~8>h}|}De)PbB=`_M5{H?^(uozkFWfUa&#fIzSQn)LBLPDYc4jZr&R;9R=h(#d8
zB65ezZ7rXha=vtVBo*le6ENUoTSF!*9!!E9$R9@LJHMOMW22xtypfg|&BAqjrQs(`
z8Jc9tnVG?#gmsN(r?>hfIFOBhIGOQQm;bq^QdL>Bx}9`N(M#jHukl4cZ+S<HtKTaR
zRP=~01EcFK-dV4khV`wjS&DSW=8aCJ_0_QMYM!ZuM<<DeW(ylGeNfXwxb-o@?WWSY
zYa!mb8l#5I6z=v*;eDii)~E~W#GG}<CCB&CpauU(eU~}iAlD7~W~L}5!zL?(DS8>x
zy_L-daVV%4Tu)&o_Hk&i^#iGn6VaE?fA1#p#Tzhg5Y!iQ*GvRH^)YM)444ER!0N{X
z0~}sP=iz;+rMeY;c1uY=%^zn+3tmi@HLk%;_awtPVo_eAu1{R5p9f*aFzw3n8tPuz
zMH=_)m<J4&zry6pQeaG8RhhYsd%I-38`>?MK-jqG#&+q0;089hAL!-_Q}uCUUgQzP
zc$d=az+Wgafc@%%&CX485&g6d-B+*Q{O%Hxoe*+WKrJf;bnm;z7T<iOhpfe&;E;0>
zGrM5^sh+A)-FZGYTVgIE2b}iD)l+?y`XT3K>^h5tgMOv##BDtqQ#p<-cDjSEdptPw
zs|EnL=tIXABoD9I=pEvD=w{KSZ!ph<yM~Ippb1{j5u76a61fcE+e5M*@}~$NnDL?(
zy^GSyH7gEzEJ<veOny2=l!M9AW6Uyma!wyAOsO=9=D8>V>E&N<=!S##&b&8QF=K*3
z=Qqaatdu3;ZNY#~phSF#s)6#McP+zoEiEu3*9P%3EhsO2^ds&`aJlVwX+z-BIjc8S
zel}b6-F_I*U6@EPHOe;>$>B$jIXGw)D<CK8BI5r9_j)^;NSXCML7n+M|J8#g%*8-K
z@_mG7n9MOl?w6-(Llq;S7`!6cH_mzAkK6)Y;B&X4Hnm5l=X9P|#w~d^!(3AE9xm!b
zmIfWzTTz-<b-&P}pH@{dQ!g<|Q$ERb2I5fG)X_OVM>q6fB0~I^oZk@egE_(cnRW~w
zl!kAYa2jDt!NA%sGxGF-GX6JK{Khgvxsi!%MzMbyZUHZ%2(Il24Em;RBGiBS$(Cic
zZ|tEDRDbZ*pMAf8P6n|o=2x6M7V3^C6=x(=Z*_1%zk-TQEkD)QvezAM1=oWNCW<-^
zT#9@&2RZl8Qq@mZd{{Zm3P%U%5fv0*u{yrT9MQo8J3T73zE%Z8@AoYZCFy`s&{4j4
zuIjnjpiGN^+VOrHt<A69yk=5|&0Ow$k^29TN(1vg0kh2yU!rPsZ6`R3+Rj}HhK^tA
z8<ZFS6sg+Nc+(&_CJ1d&*nf3L4Tuzbpi2x+D_rRUj$!w5VOpTn`!6x|&`^^)PY|pb
z{A$O~Q;J(~a50kV2`U-yYibuxG~?x<atrdK)^>2}iX5yKeMMyBS0JM3i$6P9nx3Qd
zxc0l3>E{xMj+$Bi_caTF`*z8vsXyDON0L2f$9Gq5!mGEQU@K{{Z)T~%(tenGT^kpQ
z<s8&K?fSqljZkVm>VEEhrojJNj*81%lma6};ITIq3VJ!AdCEuSgq$#&Z66-X%zd@A
zELZwl%TkcziB=+Ze_!T%cm&lHDm8pSB*`5n#xwUaG_vsib@En-Of`BgY>Q-p1YMtc
zna3(gUb?-BV2t>tYbTYWto_G1U`{oaJnCxOclx9_vq6R3D@cqxYI0IQSk&Z`2*pj5
zOYinW4T>#sK^BEpv#Y|etITKSQPNaMD9W9SV0*yO)K3t*%i<5OAK<Fc0ke?Df|Sbf
zK=XR%9`h`Rxn<EDktXx!z2G22FP<Jj5hLjeFWq`q8Umfu53!Ph@zK$4tliH3p=s_S
zJt;_zJBCx!lB_Q1a!(Ne6hNxDRwZjaqdLb=!!D))XEsNzA^+@@kPyU-mG5^Ufpjt0
ztg!TPW?4I_>QrTCXuXUb;7+B|g-3hsD5?5<<yN)bX}JiCfULoSR%R*o5)j&|Tc1(|
zR|a0@IFvQdM=9u*f%y@;+Foeo&6qayWda^>$nNha*fqayjwI<yDC&G_z@fhd1DAsH
z8f1Ah;|3YPncA}W{wKJxRh+(|)UR*pz{k~K9n`Su)$!+?O=MiSFtB#F9KF2tx+iPh
zG>G&9R~&Bwu!2>YfInr${nQFJeIiev5jyt^W>(^zf(w2RWG}U@{;^m+IpCs2DOKyE
z>kVd1ly_RF4}z7>`72xXdU#~ZunSfX7&~teg|Y(B5j!)loTK;EmELS-OL?^ndLn|(
zc}!WF+y%gSyY_@JT?kr-LuDCn;thC;%xlzUDUAWSV&xC_1i=M9F%9{OABvNNF^N3>
z2K3!{HF3p@uCSb;VeZ{w!>c54Xt(c+9($4Fv%Ai*n<|r*(<H4G%A`_FHTzrfKQXd|
z&vKYBzrStlmNjEn6J#bWdYOqtJXR|>U;16-@&FtUrsA@XrYaH?d?3YL!c&&jJX@^)
ztGJIOA(AqE8efdwXX3ln%<XxS)ciAtm3{d!)`z@5q-QHjZ;P&HR9+CuS1ELmh>dqW
ze%^|g%xLQ?HbCYQNA@A2neQ8o@w-yH=9lqT_ngR5<mZi(hX&5?b_$e4#=pPFg;OQ9
zZ4omLbna?Q)DNAt1Sr@x;J6v-o%AZ8RU8<ep~CJqa{NRO)3%i#<$1@@ADh->x^wnV
z<r}69mv2fxv{2dZn%-szdztO`*>OpGCza<o5)+&I%v@yV2J7rEn{_X<S{B={o_Fxd
zhw0O#Z+K?}xc-g(T(HE$2O+u68Bg$h=Va1<O|SV~ll*5WdDm`7JzSFFbdK$K`suY{
z^IS#<r!|Ca<P6&J?OY~nkXb)!q<t&|gMuG59hWDA+I)3<1O~24ljS&NWQsu)5xWty
znasI0-1mgeaKWdKe9)PU9F?42F$GuZ8mm9e3d(S^Bli4MO`V@jHjGgVyvUB<Id3Es
zYeYK=?xf``#oa!<ot2jbc$$d<SW!}YFMZ$yYkl?of83oJ#&Rz|vbGV7KYrOrQ>%bQ
zZX37T;WN@jtHb=T%YXChL5F$yKQNC;WyweJVZY>K3q@)Ny86V>#zW_T1J9W2PV0cd
z)kvQ_no#k)&~7V5d}h}NZzypR@H5~1bUCVm2&&B*kLbV{q?c5<frzeTj4obK+xBA}
z$xS3w<!(ab6L^Yz|2g{sm~A7+V+DWzc?B^8(14e6MQ#t3cYMnmSJ58wp%I*4TBShY
z_7!X$nZZQ~re>fP0oJ6Lh(ZNAaZ&Q8`WZPOoRY7S$0@!6<=A5S-#4Q0_qBVAHh*xI
zPDT4tEQY!&@>>S-+NX~&CRlt38m&qu`zj(R`H7JG1A@9;8;spc+}E3C7+ougID3JF
zg3j<@>bJYR;N!l^u<p%L>U`FK{kIqorFqk>aW|GgUTOU(Vgj9%Pm|+=xV))}L#yYO
zAuh5<)tu2|EF%J${imh5VHr!_9XFVhC_(aXM#QnXKG^W(MTXFzZJCKO?QJ^gdWh42
z_x1Ub`)83thuvEg8S}3^MCJPf+Ryt<RXGHza6oEMLPkNLlO5I_1$_hIHF{G41@4{<
zO``h%fT&Z~T9i`knb#1ZaLHzDzhUt&TU=%v-+2N^y(=bbm!Wh8aB^GhxHpTUt^!1p
z{&WmAR73SoZ<7{i)VhjT3PE2w{Y0v`{lxydddDih8%6sw|Lok4HlU=LYgaNtywq%2
zGy?=*JhO#9qik#huUw<a-aD8yl|W#@#iP$EA@e1bUeJQFw0#DirGTavVmIoURlz?E
zFFxPWsclcL6E+Z!3B`djB>?fFw@Z4+IF7kLUjOpg2dYM_2kC0`qn4f)R&nRKMJa7q
z$Z$u5&8}`yO8`lxta6azWP*0&LvlZ0p?`!-ujg^(S2WzYl&+m@&ft$^e3C~4=b5<M
z@&$fx*@h+w4V*}Yq%j+OqAFHQwU!l~o{=`H@n|UKs#Vb&#jm#3-&E5GWglDUyscAN
zmm)(C3BJ}Z*w(K91I$4)&r`nLQjie!%{yFnWBy7ZEa*lM?cN#VRl#56itSjR3xwF%
z!;vx9@s%*Ue18pCketr}(1QJsVB{Y2x1-Uj_>}V(usC75)v8<NFQ0F*<a0}hz<z~)
zo+6|NsW^Ok8oyj8lkU#z$vsq^bt<=gCXcsi`0iebeg3|Ap7d`n_j)n}{wocwDW|o+
zN#1|Y>6ScHjFT9f#A2_}CePmF2q`)=^@A3YWFhDY{1`tyaeStA8-NJH@GUy=fEaA~
zyTzpd)o7R=88lE9Q|wwxpWhHvyo?V!-V{LDVPLCBdPSzyl#P-V;3j~d(&@$e%wl46
zjTO*^zpUSV2zI(%$0S&yn4WBUhPVx)1@jro4I!2>Jr@Dm_4#%X-%9Mt>KJLjd1RN^
z{^b${UCF+8^9GS>?L0{$e#|CcT-*6gRhlO)fhQA<H^uduLTp5e)a#q<H*}SoiQET_
z!!Oa}zL2#SxDKb#(9rAi#jtK{!s8mz>$C6vr%*T}2c6FC;}cm;gzdH>&F6v>fHO;C
zt2UKk&s+&v3zEi7_^SLV8%YQTbAimx=mNZ?$wV{1RYXNrY@ef2HsJ{2>iEz(TU@Y+
zUPW2<mq!LlkkSRY`e{*|7}A7qi_@5VkBWf{KfD<Isr+pxk927|)Y2I~4x)r{Y6xZB
zo_(=jc;81Rd3eJf-F6U2%>0X_@@taPP)c)4xT~<j`Pg@~itiK3(yq;yJ;d(L(Zoyw
z-K!nT|CYqRh_w;bK_yQZE6k%7bPV{lQ`Z!f!W#u@Re7+k?F`gPtn2wbs_Qz3LeJAT
zw&|l>1%?NxEJHKv4^sp#HDgNb;)*LS!1hkn<3!bn3LyjfnGWuLs|;9GVi8wAeU#bR
zwQ8K_=yhVqc=b%MT{F{Ch=+65D(==x_vANlpv^<vdr&y%;hVw<{q}gn^|pfJKalpm
z*0<h-I+j2gJW$A%qgr?rGGO+(;bF%tYkI|`DS0`4y_2H}oVS6`Mim@ko~|VN6t4kt
zm+~FHvJCyWlW-mNBnB0}v~u`A>+}3O=-;21uaBQx`sZ~X*|qqLTMA{!e4@vk@%}re
z*bIuAS3^%t{$v^TRm!d1GuoZ7nseCDHLw<!xZ8OBLn7r5CY}PxAf8hDF(cxHd&m_>
zW6(~x{_V>Hz|@Q4kUc<~)f(3h=aPY-?)_r|&+}AZSO!%X8D1JHl+mR6cqh)8C6~1h
z;mAbsw~lOa#@L7ztd7?Z-|Y>bNk9NNByjQ!>?Hpdipbavd_<7!M@m5I&fZh)?LpSF
zgRQG;8`|wE03NZy)pTftg|ek6L9VbyP8N=-23^ulJ<H*jNDc-O#I4GiHVIBC-lB3v
zlRht?*Upfm8qfKuMMhftbG=)8L?)?e9;K;Vu|Qi!bEiza-c+uie<CrW%jrZ9*~z5V
zx%EE*J}-B;*_kci-D2vwYDV;-4%e=#7lN5wk!CT`PsfEvaGJ&DBp_EJkcy_<$UJ51
zyK)xD)>8w{LMX-ha_k{?Z$WPXxqxbIra>^#Ot{YZ*~x3aG)>9llFw)CK=nLf@5n%6
z?>Zkkh=^Wx5$|>d1FD)%?p3zGC2lR=(*E<%5@`F12m7=2bbXIG`%AgFc3kV#SVFc1
z+z>Sb+p)Ar=slyUeP=J`y^`i~I|LWSIfqUi)eq?<NK>Y0%|=^F@%P6ZIgPXhV?5n`
z&)n3)X5%W(e~nr31yfk&${SKDjZ&!^JhB|F+GdEl+~I3qnSSY&_QU<;bM@QKB;^4}
z!yj48dfWbNjIKJ2P=_5QTz(m}>xtLr_U&I<4&Rx7+$|`FSPz?}gF}$)M4lhcXODO0
z+HRV@@f9Y1?GDM265V<@>-?iF{nPnFz8Pi71t#n3c{Mx~&y3(nHt`mTR4<Hdzf}E-
z$SwSisJ!%C^&2ssehFR&qdwtP;kznXRKqDtv+w^Fb1w}sA9|zdgyh1%9^V*<g;Q~N
ziBM^h>Kkz>77EC`nR%h>6aMUSdTui3$>r+F<ji9!voC3cR`K1R&CQ+%*Vjg>s55o`
zTMDox?|nvUUQ-9~ynjX>fAr*pT0m<h&LE||an*3Y^bes=q|V3*Ls<#64&0dx5Dw<z
zCg>ZAzea`F(K<lVWS2sxq$<w;#I_<dq#Pn5U37Clu=MQEvJn)JN&Mu#(hcZdOZk?m
zEWSKH`V<&6Gy=>9KXB`yi=OOqH6t+V{#bxBqY*HN1PH;q_JEe<lm@Kj6+k@3^X4{E
zeFsNVvpzM{1|Ac7tp!Yft`uU=7mz+{l8~Hpjtayq$t>8w^b&6nKZfEdC`R-HW)#8#
z_fHw&mk8C5VxA9s_GULhF^hg-W(1&0%K5~R+`$(Zt#UDdgzjeeb-<>gG`x)xWl=DK
zIqL>#+jFU<V9nvOt;wmJdH+2rTQ`?M#11+oh$(;QgUs2SZ?2CpyBT8x(1{+Iy^IM$
zE67+B_w)W`f-#OATSK~8NjrJZj@6SlH+Z-e3m_Z5P~%)Ki3;>(?);ZfAM<+zva~tS
z4>YYPmA7ckP?c*cOjej6*r|x2HpcFL>+aDDKotYDI1Mv0%lz1VuK}e!+!1()@HO`7
zE&>dW5(tpV4wkV{O;`&4%!O;m#-ADbh!g@5%@~?)`bgtTAx0G{b9MPrj^Hbm%(-Y)
zUh<N2CnzSIFsDnMFY$Crv6q~AuZA!BjxIbwsBN#*ZGTXl*X5Zniapxh8*+8jDnJ>w
z{alS&ieIFY@+bfNoiAQ!QCsl!3Tj;QPs+gTD|Aq`FMk(u+_>*7?l4MBWEkymQYD>y
z*VQH#t@fR$H*DJ1RiW5snJ=mXI1hzm*13N^OsH~M?QW}iZtnagG-eLOT@G(;h_U@i
zgq6HZiB85zZZ%aizk1TUbGgLn_slGZhJY+I(mAiW&!4xR2Xxs{&CbmjIg1$K;r&}|
zp5LNl=fOWpOtscw`C>eL^h-@1JMoWBpFhXW<?2X-G~LQZc0lhg5i(YFwGci1?)XBM
zjL-1bg-AC+`CQ|^!+qJQ><LVcreCyS9&@I<XRo91c<<=A(M1slG=X?J=IOfN=aWs#
ze#P&QFUA0IOpt8_p#ZXB;M<|0{C8hwbw}Ik>6N9&&%}`32B^ocm^y$y&iH}uhbRMU
zVBSS|l|TIdJ9uHKo7Ewh(iF=xCiRDrF{80yHs`MW)%OLp!py44gDQvOr0W3B=};fN
zjA2#`Xf5}dS3}%?CZOs|)XXX@_W{sOPPF{#r0gWJsZrIcU#tBM;VS;4RrBZ!Y$Xac
zs`?DGn?Xxj9p7BW+6-5qkYuJwdqsL&egR(y&x=>=>v8w&5E8C^os~7ejUuLV_UP4J
z(6Jhatc7^rNmcM6So#VJ(>2!YuKGJx?=PvWWjBW61(gP_9xxwF@q6Cyk&;i0MgO2}
z<d_m2EbeMZ)WiySPNG4#i;796D?(T^yGEnh9tOakyA$eT8{Bb&gF@5dewJm2r{&64
z8|;WT4gqr2;&_e%=sG7bapk1f*7do^JZBY(&)@0BpF>gXBSSrn9=ZA27-V)n)+!L;
zl3L|rbd^m1+17#;2xQd+V9!3)Ajc#aF-CA2LqNC7yG`(S7y`0_pA9nA<492%2Vea-
z;9XA3DrHS}bAhG}sS`Vn1b6^qujndCSVcaDbyvK=Azv_}2g|m+H?U!CTp+AC2Sy*4
z*XFU)RHPCRbbDMsG+%D=IoPSAMy5LPZs1_g0ZtZS*Z2r_>NSqlJ-V?XYmN9s5m~1r
zkIaQ<N<n@={agG`dBDf4inUn}n0+$qDkD#?uVogEDC;9##6n6Z{1aTNekEq&<qT!m
zUyhr_+VB5>hKgZ7om?Fqh>;w9^-UZ&loX3Hhri=4;F`X-JQ>|eB|n&YceY`xcb&8t
z{jdgP<l-FOu{H!}cvQ_;?%i^ep=*|>Q@>ImZMh|ShNQxw+WEn}bq0O0C*cY+Fj`c^
z9yKS`sZ!)E%}45Z*QFhB+V$-i$c3#cyw|2huGMRwd|5Y-habOw?pv^LeN6VAhs7a{
zRmBt~>qq{+PH;G4&*=@b$R6)#p&l9JyioAW%j%!$feaD9;$yBnE67!b?utC)vjfw>
zX-nRf6Gzr(&$HV<mdD3cyi{41dN@~#-aZhcic>`HP~U1s4BDR6HdzrI2=gu&iM|ln
z(c<u*7b6uFQ8Y{Rl>&0G!H$l%$X-Lc(`BVB*n3XyCW5&8O#T4zY$>3svG|!yJL<FF
zJk$?&#}0NSJ?1|9;mW$(ea5K45-)m}agJ!uzUDH$$^T}L#W_4AyRamV%4A`{PR)bd
zvuKt$;2FPpb(?@2Rjhs7GZ*{H2rK6neyb1z+gElTLUlAhdP?z-)_cki8oEtxp{^HC
zcD(yf&dXlG=XZ2`MK&7xgh4-)Hyp+GMF%}Y^5%H@=i@eZh<lEbEM2?DSC4xfhZHH}
zSd&rGH_{f2UNe1()HM|KZD@j-VFV~HR^O<Oebl0C?2DSiP}P}R9c(N5A+G686Gn-Z
zyLQN?M!7oJf&Z*SH(TVt3W|6B*|PrMo1+R;!F~lCV;~B0qKz~HC0;PVKzkb#M>+b!
zRp?ZS{y4op_gMHCQM5V6!dM}@pMXXCzuK>Sb>j|TCvmI5P%COWljx<)*r17%z+X!H
z5gut4X)sD8>mB}8wtL5NJEI44(%Z!wb8IYQqab_D!j7gE8efBXSz181pT5rbhfc@W
z)-T|%I9JTcbfWE%(g}>Yj};Nd9ocF!6QXCY?}D(f95dztC+)PRWzeu*XH0)af?bk0
z1j|5CwWxca4-}(pOB<>VuocPwaAv51UjWliIbgl(1i)fDn(0+_FYImK9Pu9cg7#gl
z1pFA7yzIpG3nNB{!-trgBEhGk%F^|(@gZ*Ha=^>63xJ)XXn=ssjU1{>z}M$&jLa#i
zz4c)LSTcpDmvXWDH`79)-H+pN%HGEUgt;4a9Iy^LAV)I>FPJb!4M>e6%UhSytT^DK
zNML8gl_L+AP*<S6MxWUPwBSUV3CUBHiOtD7P}I?8eYm=YFT@t#HYeB>(obt{@1m3$
zV(b?D1W&I>r16slqHRz;thg>ua>7Lvu)x3$D)|P%2r3R*k~O!N;>W~-T;$S%^)H}L
zC=<EUJ!zTelxG&ypeG6e-5FYO#Wm|UjzgXX)X6t0Ff{sg08En#JZ*?x#f@z;`oO-Q
zn>rEKU%zY!KF+m-&_xudoqs%q#A;z1QMseja!M}9y<jc(CSv+U-wXEaZ~_ccv<q+R
zrczg}Wt0nik`wG9x+&SLh+5yY44QSEj4eL%9u4h|3vO9Bx}GJ&SwIj%4w{0@X3y0B
zLrmP>=6{bG&IR-qJn8(YKq3N;Ls-x-j+p)K_Fzud!u_fQ?^|FevlO4@z!#Ru+UJa4
z)v&h+40D_elS^#^+oe2sir?OvSGf+)4iedE|3gbm&qf!QfT*@U*66?uyo!wWRVD*9
zW{Uy;IKQAM#57uycUk>aDxc7@Jf^=gYN)f2ReiDee8IDB7CVifm@Yj|^*L+X^E2kf
zVJpGoB*_O0?;ony9gQlt@7Bx0KTH@Ic^rBVSRW7v=K9a|IC(MIMEwn)TZr?=a{VV!
zN(Dl#8LuVHs%#z>Z9a=c1<Y<4Ie-(jKX7o^9Q;ye;!n$(V6qSH0C~1c#m!VVR+94R
zy6~ItJcD~%k)UWeste3(?xU<@_K(5m<AgKLL{-Htx1ZppvSIAW5rcjj>q^|+xlu3r
zE(_B*hg}qGxl=b+FXjy49n??v3G^R`%nU`oYclpOzpz%8O8E9o$m+@ds9<lui^X*J
z3Ww{grJu!{_u=a=ligFii|^FP1rpwvQ_m_D%k*2jij>e1FLntF@<Z-BJ!i6rT8%Xr
zhZ1tc!vxnJ3V<6j?WW6ae6Vtp7TO>FD#WUKGDy-0Rd;{=8z*nRfYA1TTGe`DrdI+T
z&Z~W93eWuu4Ea9>J*osh9Rt+Aa%(YjWR;X>8=pE9BZw6Hx|(tqG{J+D_D*`m-68GT
zLWGwvx5lqLwMv`4Z%JqV%O(I+Ip12+%VOEyE_Mq7fb<M=(w`DMO?V!@fnS`qDeH!r
z{dvh5;S-P>pGX)p)_PF98`d2}3NagiU|{#^$@2aQt0Ckk1K1k8b8@e{8t)~9Dx5>i
zX3W4&i$xH#BMHoy`(60_)akMiL+`jHK~21#b7b;tH(JR22kngsHCP@HDiuf|p7P><
z1=K!R!~~>eVcmY%;Mcyyt-@M_F^e_~-Ccw|Bk6A|;%iQwc3O~udB9&+<jRlT?y$t}
zbFllKm;~SiQ+Qe-Y|&rwg+(){O&ftdjZhrWW<-ScdnFl*`QjvOWJp3(Ll&-Y3_QhJ
zA49lsbrs)2YuYt$Q}AP?2Z4he0HA~3)dU|`=0es^>4W^A&QJ^;8<7NZ(tsBYlwU4Y
z1O=>KAc|zn|FBbojbyv`LJHUH=)vxZ1j5agxEYffOE+so0lNc8P)yxPXt%NAxu52w
zHZaZ74LCWEl*yDopy$toe+f-%{6wva(8A5I;ES_?llX;)&XvPjM44!I@egfjLa)=o
zz>X<!$+00aRR82b!0LkR(w1P%w;QF{$D_c)>oO6Av&RLhc+$jn<_=jyP+Z<lH-Ii!
zqD~C22X%f?<I<Ifb$lZbYU9QDs`&GL@Nc($!N*IyVR6+|!8wYl3=zw`wrR5qeGYN6
zKQhs3hl(?^8<~D0IhL#ZVKJ5bN@O9eN;r`bpBsNBNz(oXm&?9=+wkX7jCR{0DAG{V
z#QJi;hySGcq#u=myrIl-cKPGHL9dGxF^a>T%6x%fWgJVm^nwXBMVf>q2f&Se=&w>s
zd*Swsm%X@`h$Gn6gI(550U4t(N}}dt#F0)jgI^T<RKzIk;3Bx}8dWi_1)Y0#*$a|W
zIDv5_ynAOfeKOm8#X5ZH^-7^Ax9nOn&<_dw*hh8MJLsKx1iM{q+-4pAMbUhGJ89W`
zyy)w<YV9PZ)$r>;*=L>8#ivgz<7JKo!`#!OJZbbv_OQ`0j4JP4bHymPQQWOPq0zV6
zk(>Fw%~toFqq-4KuxKr2Ti~vP12_Jszc8gr#jA?Gll5>hbn_0!ChiP<_bfz?Ut?D(
zBvPd0=i3x>tU*R0mN{CdO3B*tfCBXBz5DU`Z~OPsJT)Tg7*$_t&yX+N?RO#eRxc(P
z_bg$1Pv31M>~#s;lOQVmEI6I1ZB-?GHkwLkU0|D%v1P^q9~vdK3XqM+#>SwPrB)k!
zS;-nOSSzWOAJ}RFq5jMN9+CFXrj|c^twKKX<K4b2{LojX4HgGVTBGCAlZJZ~UU-o%
z3MkI#U$s`P_Ql=0m%Ws?4-%cP*4gLEUUZm<vMO<FR0Wgxu$1gsg`7t$CmWpb(`Zcg
zOkVhSVyWNU9=(Vb<|@8<1b;OH%)-R9w=I%<=b<L)w;RqdsBUG5RXjF(reH&d=gpT~
z#-k(!{c1hR6789q3biXva7Tee8j>9~L29R%D!8jRUZ`X+^My=IsmgHXcvX|OR;-F=
zys+qYMDtgE926clQ5~+kz$y3X6D?^adNX@EO`qyzaD%|Iy3)EM)1CFH|GJ0s$iQEL
zBsX;9srY~mGiJ;}vpz=xM4sg#rk|a6$4D{>pJNx5N0|_dk_@_~9G&RKS9bfBa@~FI
zn&f-^Y7oXf9Pz&}@I6Q{DgAo1nu)x!%dVTd7BA%DlP=lM?->ca+0``iWT=a4c52H%
z<GJ{BThOnSr2*fxcG5QuuGR88uzo%4(&^hw=xQK~4u>pq9`e)~m<G3GvKqzRgysIG
zpN9*NYBgsWk7;H6Tu&-X<DcDPg(HUJF5;8?5oPc#)?u;u%akd-ipN4mal79|ni(ch
z9P}LX*DyxkGb6Ke7^234p=;2$S4)mrvR6w<xT#MoOUz?bD{DpU7{=&u`KgRK)nJe5
zBCRZ~mZwi^Vel;wC=<<w<OvlHxhK1+KVcPtFZQ;)Z|OU=p!4cWti$)=o2*962g<(m
zf<g<EV}OH#H#TXT3F*<d^LwlCnbD}{MxGmOpMggrxpozSC=fgM>zrI0lmydo#e~+Z
z=9H!3TlhY1Mh+D<V|Ceck02f8_Yh)6$2^y)oVs=`rNh1>*rN3ArEhG~<&k%h&w7yJ
zs<kO+f$d_~tH!R$F~_xf7T=;FEhRHw!0PHZY(|&9a~a8T$Q!X{22hH{`b967)dKJ-
zM|?}l09wA!{wCC`-p;-Gc=u}Y7zY2zyD(`6wgsE~JdZBxxnV=)mZdpddm4ppXdA^H
z{q_th^i@1(gu>~(ISjnSFXc{K2QO2;?(?s*GOp+WUyLtB*(r*>Zhbv5{s{K*DA7kT
zQjsncd!RP;LZtZ;1pMRBGJ5j~rGD*Mmm=T|7@C(373Q?iLW;o4APuhM*I9=xXq%5o
z?s-F>fKVcb*v(Fq5>dC(qMNLY{eMn*>S~&yJu=7<#-Osaq-$Iwfp=)GW(H~c40{kt
z$E{xr-PUGqq-IgvG_sm`_+o3b^)~_IxnCt!=I@*D6muBgJ(Nl3Lc+2G0Uv*P{QxQ>
z7ALkks{he_n}kJe5SiaMF)WC_U*PkFznAlySb`pya{f<Z=g`SndvLz$bq&n#_u`sm
z`5(%w#Ib$GhOURVLboQoQl$=0L(Q%quNNyjbmwuMb~vH$>sw5vG@1#QkJ(j^-s|$i
znOZ$_&AkpuZ*pBmJzy*xp(-r1k^j0Q_iHGG+fOaruqtyc=<BywMfV|i!V#Wb+<S|@
zZ|}B3TDtLTjSb^C<+I)&%L|F#A`3Q!<p!%eqCkp#e(;sd%S~twY<^;LO@}iEaPV?W
ze}AGWR^>3?FK}k~)`QC$RyCw>IrpprK15mFCmQW2X(gO1-DQ)yW}n|{aDbdVd=c_v
zAWefgJC=gMMjCebf|Q|9>gdF_Tgv6!YtgMNF08w_`3CJdh17!^{5E5PicXXq{8~y#
z87sG_b5%TkZofGytyk?J<n-l{j;IcC?7PK>88N>iXH)mT)mzBWzY#3HMC<E!vehJ5
zR1ci&rm5~sVtX#%`tjxkar&Dj7TDXZ-5P-h)(TSHha~oqB8<mw^n1Kt3^v{?=hS<Y
z`JDF-yN4r?UQKC&gd%@ehJ+Rsl&@F?B}Ll)*+cbiuTf(GfAL0JRu4vE*jec^nH#Uk
z8IzFQ5xoW<XO?MCB9ohr4=m1l>R#|$V|<(?B#PPt-vI}|Hb6Z47-pg3(>l4fR5o}D
zvskKbCQOe35>mk-3}hHzCR~|z47)~=H5)q1_D?0%=D0N+!=kz_dozugjVxQ_tAL6c
zdItNLbp&H{+m=04pu>1_LGkqPi4d&Jhq6;kNrt=GDJ$g!HYkshYp>Q9zRl=5ia&-i
zy~;jdU$cJQ>fSAma||o=puAzROF!qiyybp%-)@C)Ud!7Mw@d)rXB~6nnUTP~%Kpis
z`M!uKAyywnv}iKD&-!FT=j4mU+p2@_QmB+`-<N7vf`?*aC4e7z^2L=2k2+|r53YYM
zoAY<lz}W28Qqq40M87{?RT-ZsmtEW;44yQkA@FMYLe51`OYzTdIX+!4=mP7nGT}h?
zK}7lN=Q2okS{a`gW5MhqP@hw8E^;ymys!fQB2>a|&;IoOu0ph6h)pcgweQ=GO#8wL
z=-^JP>T^eY>AhXOadT+oXV&peys(YgkCRlLA!m#=&<nMK+0QQgoX6Bg;3-j^2HVO^
z6HD&f;S6$Q+S~_aQm|#k{w)qYij$+K_K{`Aw7gi;Dol3HdS+Mf@@Xs^sv(tc;uw#u
zK5n8mJi9>hfXlCIbyhOsvg~mW`nM`%y(d<6!7yRVPd{}_63;(|eW+mPRG`bRdn?M`
zD~DkmTs$KG+U<xc>sYK0S`ob}s9@5wkHdV?Vj3ho6u8DcE174V2+7R9rM$rvqpB%N
zvYRZxMpjj5nBs{&jjGh~B9DTr*73LJL{4jt!(>=06_mjR>m`pFza9%Z24wjyG~sr{
zt_4j(z=hVT8l8+eyx}a(TuL&zER*A3SfC<9u>9V^$yeiGtJ+L4$Po0$C~HCnW*>fT
zn10u09#=nqN12#eJ$V-juiz2Sx$NSPBFA8qhP&|EYGcOM=KjFPj@_)y*s~^(R9N_)
z67}PrjM_)6ViGMK@^i@Ffx@fxifN9*aq$J?-rR#zg;pO(%iIsY{`j!I_j}V|V3%ng
z!_uk3n<;UzTg`diXaN%c{T$V-Ha;^+H0UG0e&u{amnQZ5xp&dpWRusco6L$-oDc&E
zpnN|!#Q<XR1|L{*J|Xm<{~?uGE*v-n!q&)LNv=KCRS&N_ZjCTVFL+Hiyh*9YVpntB
z%KYJwb)D5dgaH5fBaDk1(dU#8$nRrva5_}s<|QuTW#NBnQNj1<y^uR4xoy{K{(V8>
zyG|McE0b7$<nZ-x?C~<`K~~`9l7r3P(H<b)5SE;@YHzor>nU|as)w$D-JUT^+typ@
z52jzH9k-T5U#c=?yo!J>Lpd6-kd?4ohW<GbJpR^f)?pXP$=cG?+DOzIfzr>9+|Uix
zj0QQC7<*23+q6%^%ZSNeVzZX9<=iQ|z-92n%}2Ow79Ir#=}xmZ*ar!>XKEws+?UJy
zBK}o3kHkqZrM`*(UfV8j^E=_?<mO%T;}E!4C5TJFW$Ue*c*M7J>gel2Wh5B#!;&%*
zfIoAX{@f%4mI?6u<<8yY##>+Iw5%IFHN=*p+I!@2E%Ei*rC~A|Vir`9oYmrpK_9d8
zMYeg|K)!l+VsR^6p6~set4(vIzjARKkvM)IavBEQ!$C*}LOhZE_j4%+@=aZkIDzI~
z#w_05_g_#Sx<BVCKUbVwB}QmDfeVgoQ~DPbdVUZ7*3mgw-LI=H)$_}~$n?A%G5Gf^
zfNj?a=sd}|D{F(D!K5uRVy7xi+a>HX)X=j#ygdGMw37Sk=i4SdaU`i-*zCc(KYWg}
zzLu9yGNY~dUulf@OtRfV&v&l{H@|R=U!_0F9<fs{+{|3xq!N+z`&qo}nwwD+GK_1u
zS@0WDDa7INe)9H78;87t2`5MC?lZoBG3M9j{`7_7QHehEv;yzGuDUe{$}y4;pTLM$
zl4#H~TG)QmVXD1#BhN;y!qCe5)~C<9Zr?7L*r{EYrYnHn*}c2Nsdq5*E+IJx_<OCg
z4wD$3!xn!;T*<k>&tWy{24_wU;*b9G49R;?t2h34@Ydg5Z7Il3g#>fhY<_Ep<x)oB
zR@&Lc0E=hRbs&eK%a(?W{WVk95q)h3tLbJ6y5^`!z-6<up#dcP*rG$9t!qGM)H3X$
zO})TceZ8w6Zk~y&YC6k01h224R3B_QQmr!=%<O&w+4QcLPvvLGI11;r6Ebr*Ld^c`
zCLBdx1VuXj>OYG#CdI&)!7-a5W!*R&)XVmYaB~c>TpY=YOsb(LR6^AoQ(*8(82d9j
zbOeKx!azzA1<0RT<DpGoKuYY4X+MHPl%pO_b8lxND`lrjCqfr2x_m_nGD_W%Dn?K0
z6u5K<a)-@!tSAN#llcp}>dimpW~UfY9gzh0+D!K6*YDwqy|+qdHyXe#7~|NvbR8rP
zr+yJ5EO)F>;wr+<M-!7&S0z-Qc<oho(UGk0Y2&_JXJ~6GFdd9JXg#Ye7PaLr3}ni|
z)z!rGYD-rqu|~7i*nCOKc%yX~o30JKb5}p$e36*^fG_RSe7XafLzW2{{g3*LZ#d6`
z?Ed}Mf~&;UgY>%(v9GJ6IgxUwlw6<H-i0+wpvFyPe|cLY74pgJbJP*F;)*F-)>-`*
zaG{9?`eV&@Eiys^(&~S^jK4b;f9L}zSQ$i_*(fYQ4uHJ|yR6^;;~Q8IYC&r6R;9n)
zWvTu_;n^#-4n5^(=~?qX4t`8SY}E<V`;?ub8-esi#~)16K=&Zwwj0Ze8w0NMAjVv~
zUGIv0)f)Yr&L`V(rGX8bQ#QUI?{mtaYeIlj<Y-W`v+3$8D&OF<(xLC_ju2W43Nr8E
zO05SSON-KkB%)ZiP04&iqf+}ql@zuHv3&2o^DR<yx01}dB#;s)y=g|%7eYZaXQDN9
zzv-$R2k5;TTQFwR9j=-r?L#dlau59MKSV5SW+GLja>TXG3$|zA?@}d4b*LkLq-Yqn
z!F!oc*%@u@RyBicv@g!WadU{4H3-exWA%)@;H5hUG)`d;C_P;}&V-6@7Y@GTsnu+`
zhIyds<UvXeSnea94O&-|zT-O<sh5o%IRrO8zq!(8R>PAzU5|3+f+!f%W-{zxgi_dl
zcZGmOX;rADbCdIFuWB0=|G=&#83?o-`#5n~m6I#y&e>slgR@u?r<?!<CNEUf_F1^8
z9Vo2Wx9dH{f5SBY90nTN@y!kY;FDZNqE&v^Y~J^8=+@$>E_xEk)yDky*VbZ8i1YME
zU~Bo^Z_mRpNL*k=js0O%YB~(2vG>31Y14X~1u(^xF=cwpEYDCWcY@@%g*hkL$Ds_r
zncc{ucJ-9<)y|7)d*ilUe!wAW1f!y!+zd)(mLFy925|EZ4%S$zrsK{9D7&~!dOD%!
zAVN!)!3)S&3}>+@2_Q)XJa|3uh^GsyOK&ZP0$U5KivV~KOUd||(S_5GwM|aaa!ar)
z+(MJj^49;5c~mCPll&p37bYG2GjL~O!5`JRl%E6fbA<=Vrn0AU-0KIenqUSa4&Jnu
zZw$5Gl83`pGqhK`!r82h6FUr)VyfYp%cl_8!)bT=wE87}@P9L;zJK&Sdg*p`g;!`2
ze?ae*=K8d8)cx<eI$f)1#pXBIIet`;i&d-79{k6=>=?;yT3I8u%?z^*X=RfYj9)+B
zCA@#~<ntbk<rtr1e)qfh3w+x$kp$wnkST-&?`o%i#p~fpV-dT*!g{6kdV0~FIV8cZ
z(Trgv<6cC)<Zqr~rg@qLUf&wI(ylR`S@o{nYKe~X!(0P{`adhYET47RY8PX^>SXZ>
znl~drPcY=_(^)i8_ML?<bFsg^;lrdJVjtpZFTk7l$V2QIa<Lihhb9u$e(3fV6XQM&
zK932-IK;WHb5>wB5P6XxNPk1m@s9ds8s_UW1yq5)DDCal0Gx79ZKZl39Rw*E8RkR%
zmg+7%8*G0yU96O}1l%t@=&%#88v3{A{GH%KpCm_vuRv8Q{WZb_=uUmUl-X|;d0X-P
ztf9@h=U*hC(K|}|kIAB-|E8jUZFHARUJhX_`!CR(WB+Y7E#{$<;KPsJkxIKuwvj21
z#)nwhn4%s&03u_a)!o4SM>~1V7(1fM!1U;4dOStr>jATE3Blo6W$Plv64`U<f#NwK
zin5~6uTHbym6J(`2lKz_LaGo5Jr`~B_~U*D^FqCGzF=1z@oz@{+^>ceoFD}0iUP?6
z5*HW)xe<dXdD6-5I51BnC62i9(t~G77+N-g$Of$m9KSF}BAiEz{?P+O4<eZ)6ceeM
z6xv)<`K1QeoL6$KpJ@m~bK-05h%K%!cam%k(!C~VzNEa`ty`4Ti2Qrf^{<zm5_jVR
z`4p1*Dm-7CK+!POw&Lo^P8E8$gi$|&;i}n3Pw@X1+kfkBixB;%piy@@GUR>0RNtan
znCHbK0zL-V{yO_E+G<!Nt@UHnQ=VJbY-mdaqT`}|l^ic<5LjGx!1L4Mm1%{%G-8uf
zp~J^BRUv}PZ5}tTCdtOvL(NKf7z7%>YLqW_Twxk;b?=JXO*5wU33#I-n=O0b|2BhE
zBd0(g#QpFF|C6{M25EhFf9Wpg$oyns<W$WE2XcZtLt%cg^-r}AK-SbWOCcKiPfQLI
zLdA^_@p4c>YVBp43c<`OH{@jzvro1y1Vxyd-I6COZ)^l|8cegMDM(uL@ZE$fnc;TQ
zf5U~p-C&X>`a>26OQ@)Pcn5|ocPYoHS>aMs<Il5v;nJ`ScO`PJ)>ZufdHTP$1zhs}
zoss-we?#wlOg~#Z!X+!92Dhj-e0wZnS3h`Q>aELn?ziaJb5oOMHlME+ys)y+^n!NI
zbF(7)4}I4P>UZsmt#Vj$?OM3aFlqDQ1hXQyR8E*PkZ&D_3qCf%AzR6hA2X0Df=<Rn
zCI*ei*9wD_Jw!Mc*Uq3{9y$66DptZ7ufnqMFz{=$#D8QQt?e1&F8K0&9F`t?Gh9Qm
z3!w4={G{#9WYS0ruQ|eD((CJ~nzXoq`s(;CI-Vq}QNAy(IFB6fQ6y|9FsThzWcKE^
zjL-}Yvw8~OZx)ddl3e;}0tzKyOzT}5#cg1FmCME-iSysIQyVK@+Pj3D;6HLXA*>QV
zAsJSzSI}zJyD^@pdB+@TIMS6r^B_tB^Pd#<uW$X6w&wDIuG3x}fA+2)sa5jF()z$+
zaM;;JG-#=k`V}vk^YaR(`jOS}pDQ!J#a!|l^+j9MIUxm?MSdI#4-zbxqq-WFQ7c3{
z_`<L<JwQ#Jd4*x)neFm#?HCJ2*5>#c5oT!H@;Aa^&x#<BeVb`~Dno$OcPikm(!_(4
zX*QFV2-uH91!@`o{(oLNU=X0mBq4+Y210N8J=r|q-Wm~8MR$qEm<bPZhUr?iJo#Br
znNyd_hoR&u8np?IJRwP+u|2H)PFXkdD32y_`^T5>&;<VfrX8%n7EFQOD^ew~2Dfs(
z7Hn=SUv){_!x#|__#)-<QG;THsB^MotGD?7(>nTwe*tY3@(mGSx)wzcPLGx6TWkb6
zsXHg~*vuyd_s9mzrw6zWQbS4ZoH2o)!ra$|<Q*@2g}kT!qLWOf3q_>I!Z4e!3^IaY
z@_m+W=4Q|^L^iwbz)oy5D?*KtZ(ltddTs+wnfcMzbMU>dzdxz|ffoRun=ypj3zFw+
zDkr^)L@Sn!4a=Uez5NS=&%oH9-Jyfw!fqj!$0_q+BrS&jGIg}2MYvem5?2h(ms97r
zuZFXf4tKkGz178RuSj1i)blT^r{zBqO@4XwD39a+xQ(WN0dJR?K$fytwG4?~bA^h*
zloU~J`~~HR<G|vr=ZfM{HT6u<pA^C5*N-UvSG@iAM*!<ZJ9y2zOyS?zCjxh^)_EhI
z+RAZ&H5CTOw1jZO4(Tg?tB(H4;2hA4FL<GUEXqK(sdQBPE%w8+Zo&bs@LslLb~R}V
zH^y9L{-zGI-S*Twe|kQ?b%Fv}Lo7{8>${GR!kmvkNE6lA9;C?tC?xIB%K73V+bW<o
zvFi?jPI5*osm*bBhz$zPe}D?Oic!=zug$4lJIhwBu>B1+?3%^P*?*oW9^UMEuf)&8
zdG4$b94M~+0nbFBwN=5j$SQC!UqfreFX+NG+s@Sj0{9czcFJ<R5fTgb^9B8nnQu&2
zH2(%p%Nbt3L!I3(`m0r2ym9?5uA#}#>M|;F+x_9GwX|1Wo^>;W`viJrWG=lgUH)4!
z|3L`<Kxyk|;%(bJaVGcgiX$Jx($`!grmXaXhRiEF`r)T@d<c}g?CGPM6Dq%OkYXmW
zA95&)eb`sI)0MtvRWZ2N#ixg@%6l}Ugh<<IZlq$pu3#A}!=Jg-!(4T^-#LzOy*9rv
zv0zokeQU}}>5+=X7qSz6yHRxpzi3Jr*C}D&Y%1uSG5l(}G>_E^uOBsW{N+iXi#)KI
zXo<ktdik0DT`<0$c>)$9Uk9GscL%$u9ZnvYZ<VW?U0iy|kC`2xd{|Uv)Kj47iW>EV
zcJ96C5X1~l6b}ZurPDnO<AzyW3Gh*jvg=)hYh3+A!*4oJ(`HPr9r@Fgo0C0n=oQic
zERnSp@?YG<$MIAAE7xUXYSo9D6oFu)L|F8b?XjEc^$s(}uG@lUg@g4&$LahO$PWMM
z|I?mD#DG1=<xI%$TF)1NXdA8x9J;)i=r?t>?3IP(yJsKSJC>d?90GsWkg>)0t9+wF
z418r~J7=(DvVVMWbpv+w$zq)4aP0%JHqA)v1f6iDDuHx1RLaoJ;jdhE-3DBiOzDrT
zGN#oB`wdtG?rC)DXK!aLyCm*5XgG-&D7ihQ*DA*ZaP}whA1Rf#=*}gI@sX%rIQ*Z0
z1M->)NYzAwyUZagWp<E6m=O63Xf%CU%7dR_w5zvBN>-m3eN8eED}Q-GE;PnmlMeqm
zX1TMz%j+?x6877+A>Y;KS7BVriE31e#dV@iu0dEWgK}=?@Et1nr$5KiU3$XubCqXU
znle?IOZ{*mwZ<jsiR|g`;Q1>>N+VlN6|S29?w<a)5tnrDVz|Wg`BUJvmINOds`@o-
z_?4_-DuGbq7CkWc5zCo_?d!jW@10Pw_IX}FkVkU`DWEs=K_aA09OdaId5J^=tI#`K
zRe%yv`2Ps|>bR=XE?htmL8ME%ySqU-v~)KT64KppP^6KNPU!~eP63gUmhSFuxclJD
zH#5%Md;dEg{O!Hp^{(}-XFY4Z`;3*pYsgKoOf{n8Ox*|yvhebjHw-(M0P^NE?@&xZ
z=ODrPG>;Qi2Tt5gwrYrLLm7VY`c}xS1`rsJMMwq(4TQ~HCH$mC8<bpr7R}8aVV~Co
zK00Ddz>`1$c;f*~XHF|t(!y;s<=d}%SOgNVFdb4PvU1ohR+ma8&K&)9eL+><YWmf=
z51}O3x|Rbhf`X`AbYO|SzJ*08do-mY69xU1H*qz<9-WN#QDj#Jo;kF@VPET+;gTsh
z>eP}MS(~G>k?Mp};C)?00NwdR1NjpH0MGrQ4rB%|u`*<c)u<4XKY5H=yY2Fq#$KPL
zRo%R3#x=_H!;oL4w$D9w|Bz7aJ?fo7;36@3<o!k~BnUThaq!eC?1UY6hOA;`(G!t@
zwP0wyFnW=Lbz-)^aCa}u-CLr(WgndyuQp)G%i{}h7o_(h^^Ib$Ty7kgaMtIX5(rb3
z2Zvq8b>2X|1OnF=fuStf*5aK@jm<V~%{@Xmy*IWRgfF$0p`y;lS@+vkteSGWq(mZ3
zj<SbrKHajVZ7CV0o<T9O@XR~;nm3tJ+jkf+OT$`anV0-59ODnF0A8dyNM}T^!Ogf%
zW%#7b?BI7=9!5VDTih_46uyycrdZra2*AVN?DEIQ+9-fllv2FsfkY%Udf>N{u}%Gk
zvlKD0uJS->>uzG>qyy{>1beGp4%tt|dq0%L*itq4=;RyZI)W^?3=23FV*{r?oCcYd
zj#tVEFI(uO=}vwNf_lx~?0jXRr>^LvhuUcwoxfwl|2#@@+f;{9;T-=ZKn|@lM4C0l
zh$pc<;ltKeGxW9OR=#kqWg!v;0NZZ64CBu?h^6JN=VZ>-o3*#^DOJeZ;-441b;?Q<
z00WLBLH*2HuW|kS=+f!6%aVCZ?yR)LPW1ML6O~ioamhUCyLxu2L}vaD^LU;`$0w;1
z>EhjOFaGciTV0rsfs<O28Cdi=bv;b3I8`NLl}EMx+Ah=FjEX7NV^(~w0%ZS7oqzKv
z5+s-h4|6qzeG_~=CQ4j;c1<_Duc%BUJIh2d43a#CKg;O2%ZtiW9`Dg@Or;6Xt=d_L
z*(B>Fun(FW<)V^Hc-()>q$lAiY|f!RTakh>(I<7D{E9GbBPjULx{hRNS)q<<C<Gf|
zUC&`Om$>S_p_|Yta=+Ev68vg4^OcCYph|dj-YMS>g07FKQFBoJ35FW2FlKeUn~~^j
z^6~1_;lzeez>BmIq^48yh~8di<ZK}=-$rX}GxCoRCX4q1@3=>}`7S1w)unlyG!29C
zZ`Q%&@BaXJgOw+qU;8=PVTX>wlr7s#<gSaX*LdaJx<<qrHf4!?=BhWH_5M}}{LMyQ
z05(mj!vg#n*xD%Htb5Ii&M;8s?utiFQS(uX;|4-a_?q4Edzz$SdB4d{!P?p#j+?fo
ze5E(=hus&ZCqBFx9ga$lwjLop;{lv-vGI9aO{VeE>=m<0ZP!D&?^i+N=}}bF4FUG=
z9ukuE2dF_uEswecx$VxU_})qb>==bl3lwKU$y<d;p}_B3VYCfGWDOasyky3oP$cKn
zr<=MyeXmlF_cm}itjtFP*Ry1I?fr#flTWinWq`>+tqbFlWEaDHLy%&U@RKpA{u#%5
zDyJRI`5~W0dbGg&>U&)Gzj#FhnSrEz_B>9fdn~0^y;oE!iQmVS(O9bKzC-F6>(d;v
zdXgzw`2U7+5>ixIhV%z50-Qw8q9uo>OgTh6l_pdtism(aTgsWXDiciP(o)UumZGwe
zufRD-q9A;Jco=S0PG6G38<Z#3qNCZ=JE-&1Tyzj)bp+nHjfjX@6ck5QQqQxWE1Q<J
zZ@0AR8J91A>}~x;bL|NVW%!Wv-9+x!JaIr%+1l7TQ9@1}!os0*T~pCCN@AMznTSo6
zH0Yww$ipv;73$K>SSRL&aEyK|+|0c%`|P0H-IncAb$i2F{#<$Cn8v|)OWpat7V52?
z<3m$k_BRZ-Ot;=_p_D+e2?fWqIC@HvjHne?zkEwC-}C5gZA%0a<I4B}`xhy~0HP;I
z5@1J{kQ?1z_O%g&(-|r`hU_=0tNNx?!Q_M^pnmt3YqoPSS?>v)RH}Mx&8iqv$(XF^
z;|r&<pSc(2VXj0r7K7_>v*uctr{+4{mW3-$9aiV{+tNs5Db<W<>#>$I&KGt}aw%@Q
z+oL_t29fqWdk;%ao9&1*#B^(-wi>4P@H@kH^>+5TAmrvMBpu}s0EK?i*Z@QNB9Wff
ztuUGV@)j0@-A*X0+$r~>aJ%tGZt{%#3r@Rvj4MueWsKrn$}=^+!6kL;3x#|XY_<P6
z#{Y;C`7094eOmIJM!R}QM9evXOTVWarCrcCGZkPqT)aDNUR|7-2)2M56i7$j?&>rC
z#9Ul_bnkE0c%iU}Ie7}^b)x7{Z^@e0f%ie3-)h!wZumqK_QpnUz)FKL=R_d2t!I3D
zq9$vPNmst6WiW{Sx#454_Fp_>#uMydjqjDTzs7Ovv7xUo*VIC@y8nEzh5F5RpMI}`
zh-D@^fZUL3IRk?V@GYRLM4fJQ*tut5o<U5^8c^EImDjB)ejuLA)T`yIQSwXsJ<&5C
z;U2eKzCqdbLmE_EDVUU{<hw9Zi>T7+?g^x2_KDXiJ;L{yolPeMK6W7?jXH)~EU~Tm
z5WuaagKlhYp`$|bYn8>?SSdaEN;ouORX1f5H22d{JQbtOMYGm9l%P0}2t#HEM)VUc
zITzWxZgMs@xPxmiW1&5sxBtlL1}g~fCFW?CLm?(dLIwwmtvi~~Txd`w)idt$aT3Lv
zzg>^DZO^4c|9A*LMSAB86jumL`PzC@7#N)8udDV$WTuu(8B$2CS3$$+ppmcR=7!~N
z3RL4ifTC^xgGj2IuXn&c&koyao*u$xkkMxPR1~|<cXq1YxS6L+56Fxw363D^tza3x
zslw=NRBiL{mKhIhqI;-RDn!19jk(Zyr${ZYc=R0zj+EiA<^+};i)L&dc*Wbj`30EZ
z*OaKAfr;%5VeF@!0x~KD#MDOs<b5w*w|Yl<MXkDarsd<GEqBB|p7|eqhUEx0!j|ud
z$RU{(;8~KeL?tN_C|yBwHC8zh{va{;B6_;j1aaM0{v3+nBKBRbQ-caXvlocN=y$A6
z>{w>&dv%HP(QYX}B$(AhsT=+jfQ2-drcAf`o-@4`N8Swk-FEb5eZ1_PL>VIuU_4|_
z6Z-Blm!P><$4<YqW7=>3I{EI!<;Z259SFkVaWo8`>U=?%L!nNVrbvE#O~S?o%eHHI
zys7G0LT5a3dAgF$>{oBPqx4$3bUBj~4X?A<rzvGwJCe%Yi+-e2Lju#z=wv4B-CdEb
z@Bi`%i5LmpOD5yMI}qci<Jy!8F8%f3Pp@(Kk#WX?j-xixI*GRL2h6!yF&uC@AcXXH
zUSk0J1SHYQyM+1RCeq;ZB+Vl)Pvd>+C>^&qL&rIKXoNbahsAHXE>X%ZpG~{v2c#|o
zz~vIMrxDM#%h6Blxc%~qUdnM`7bl5v2sbHbpk$wDWZvqP%J$W5vIt%G7A+%)uzd1P
z{7_Q#jw}11FK3TbE_9Ww-r5de`i<K#Myv2M`?MK+mpXUVVgmYK!pEG1(gcds3Uquf
z6q`SNp&RsyX{hLX=_K(7aKZxjAK-2z_o+MhK1#T!?;uT}(IcEzDA~nJ%9vGeDlSVH
z1fZurIHGqa5t<`<Ex%<U?H#V}kC?ls%6ts$3~4pt-yB6%D8nc5te2^>MryU=$vC@@
z{61c*`HS&!>rOw{`!PEm;0VgrKC+W`X*w%(6;8Ys3nZI)Z|ow&M$sP5kG?H<7_v=}
z!R8}cz0GCku&L!@bH2U&^-1ObcE&&wF)N>?Wti-WUH4^Y;zl8s$sVInGze(dMZ0^v
zEiQ2Z-QzLiT)W+*eMcA6Kj=Hl;0IUjyB0xItKW8t#S`Mmo<uUq2!JPlg=5;7y_!fA
ze!~6b`yIM0dw=Md15YGFLHE|-%?q!2`@Uos?+@9;($qp$focurNBDre`mK`M`2??6
zO9h@fqbt9p*$roZkcrL3{-u^qRX(?CLo=l@Ia}c;Q@t^av8Ke#oI&_~FYZ=8SdxQq
z1kfxfks{E3rd)M@R&-Q^zh-J{v_UvG6-(ZMO$VDXi@*98p6ax;GP0@97zT=mAaN&}
zVp`i4k19mX=trB?$Oh-VOdT0O>relp@(IZZ0ft<fBn+|dHdtQsNO)fBZbG-nW^0GG
zVVe0{2^!`YoMF%*;Yz(V26rOPO$%tKX~~cO*}d}3AkcoE_F8slG(!8!NI(qeO3)Mq
z2{GaX8OofzpJtp_x|ywqfW+rd<=l?rV-^|r6EO|9;TuNILj5a>z`^EWW%0~Me_i5F
zlLOKdDiJa@*a5NuECjH4tMrDZUIz&)KZg5VY*510dqze-2QlR#vudGw$*T_MG7Q;G
zuDmIdwZ_5V-1F3%s3(=<dDi1OTM&KNEThP-SX8v2hf$qSZi5LCObt+R->TJp6nO4Q
zf%7#rro}BePgd33)UTTq=>F>j_%1qfiK-;?D-I4moAWL$Lc5EK{=L9`+#r$@dZ7Lz
z=uU^xxifqtwknfW8<zMKKWX5b1Hpa!c-#totl7wSU^vJ+P3ig+ZO^Aa72T9lG1GkA
zg$d!0p3mQ_8!#yYpf!`;EG+&2_~0TbstaNHtbIn_42HYd<22=PZHqYgfXzH!a(-ER
zk;35TX(U&nLUecW4TB99gsaad3{^TDKHCR`CsvL%G7WDWxa*<NJj{-ed;prZ2DVR(
zw<evNsqn;(PppzP@KkPI<<3XGP&@(roSUqy6}#3DqcNZcs6HOVzyDSL4@hNDhG48X
zdeOTgn1fTO%{jCZY#C!x>iLOtG%LyNXGl!<j+E$0m~3gtZQ@2L_AV+je+F}pZbs<x
zd<?vfKLuS)?ijDwD2TA&-Ss~@|9`9>w-H8?hp_SEvv8o_k4p+8yYy^z-XJUpdwM3y
zeOQrIP2TkrO-G+Xg_T9)$k)&`&D|_siIj}A+K`4katjycV&E;ixdlHRKVZBfe{*D1
zbHJw<&zB9%io8z9MyxsD!OILe&1NkE()&AQgCbqk!^PdUZfCvb+6{lPyK|8gq4htu
zzCq**tu3JDIm1_{<~))=IBp6~ivLyN{bQ@zs3LZURYq->@->&Noj@46*kMbjZ<)4x
z?6WaGiw}L>JyVF``y6!Vq+EBb|Kqy%ncA&5`fZA*X57;|vL*o<!tVJWrpAn=6G>_o
z<e2{;48KBPP8OOXxjE^o)&}xNh98nGiY@4Em;x2%oky;=H(6^SWn`B?x#tu4%i_~q
z+L?j&uip015>M=E^A+11t6p1JKax&dNRVZAXn{`pj1L*-g|q*fazrHisf0R4E+x*L
zL?F{%YWf&dwZ&W0pY@>k0Jx!omzg7(p%Y1KZ?)x}IN_2PtCT^0)2RDdX?f$f1_{eg
zxUO%GDQ^PD)ft9r=rZ7IaDvPm^(qC`!(wl$oUzbJB_?#2JS4BrxeH%Ds~$qV6;P4t
zrX0GT{St6?bcIvhOrie;l%{`~aS=v-mB^s%!FyLEU`)q*mzZZ~8n{tumVw>*w1B@Y
zCQjk^bO{&<mjtj7GAe~LM5tPDhhxf9r}t?hcikpv7-ht1JIp+)+~#OI?5S_t*<{|c
zR}~dTFsBd|D5O=ylVv+ET(~6zVjbxv^m8Rr*-y+qxo4su;fVDKZ%-zPEh&SyxrV+k
z1Fi}NV`!8Ww+{HQx%zmR0J6$?`fm&ew*bcN6X^)$5WIO#f;FI%Y86<oM_+zN%#S_1
zGDXD6hfCbJ+ijoC2+Xk|7UlKBP;&DgiO9xhdNxWmML0C~LsPvl$Q#doEuQ2`d<cKb
zmvTy@Qh84eg#U7Smr3tF*h{skZFl6%Hy;($f=%5=+Y?kgYQT)!j2;?(Tt)NbzmTL5
zHBhe;7wKWcz)2)O3g|~k+v4gcOlwlF<hBUkTXn@MqM(}uO<UlM<#b^Ps-9JDu{7pL
zQeU~(MCT7RHIq(ap!3#EexNb7r(vT7M(%@$d9>Dem_~H&+Cj`(0G!Yutkj=x2fz+J
zzqUHHxo;;<{wD5lCm(J4gBEXVD{V<;s7If{^Nt+kc0y7-+`|(K*m4_(rW&r~C+hBu
z0<E*Z()n?05t>KLy+L)nPz0VhAj174#z^TSNXeq2rSpG=E}!GoX4`$QX~sNiiER%r
zq30yjQIuH68iMwco8YzHOwQv5b@$@;!|mmsK4ND#FWjz7iP|`zY8IZ`eO<u5{)`#(
zP*`8hnLCn^6hjSkKKYVc481t20`z$!d=bzteoOOEU>P`;j?HW$2%Cf?UXEb(ucbN%
zvnJdAg?kGu)ZSYs-XA)>f@y7hUM0t*RHs1;>PCwhxm{iv3#@bnp;D*A#c=Q@5Pg3Z
zoB=eXa1mihQ});@<tc89tB7^*J-=3+1&Gai`7Lu<F9{F<J6LN6hI?bE-fsRQ6}gC)
zd|vNs&;pnD<obU4?hr&g{WC8D9}V&ljq7Q|IEJj;8~a^AEfNk{gSR(twWy>qhlEE<
z6kMKX$d}a4_C<s2mpM57uR{<WI?c$WOP@|ZDMfMI)FD=9?O@Ngw%>{;zXJ0yga?D0
zad!TyDgV3`ivWb`2oOmm!vPr>7O0XPM+Q=DhR$6scpY=~^v3nG8umYxWX6+O#O$&$
zDR+ml5mtvc9IHMhY~ol{@psM+N9l279Y+(J)$oXugdEO5#dU|dJbQf|xbW2p2g_Vv
zPVkP)X_#O+Asl}b>z4kP4r{v$6Fq81R|YcLrAcYwbGlXo>b^^at1=-O2}cWphdJWE
z`H6UeNHs7|CZ>`>9c&kR@S0a0i#MZ%L#BT>%IvuMeA9$rUZXY{+u>%}zh<iL!^U0v
zOF`9N!x)-mEomHL6S}FTMX0JDHYCmh1>9#n+CKc-8y0C%2~%_ZEBqj2<N}Ur#0bkr
zw=GC79HU8Su-G-*%ruah(PS>xqvilV(_L!O!*S3pe)O1^PObrYeqS<6Jcvs9orkWI
z7?<O0aM_bXZ?_N1apUqZukd8;Q7#f>O#;bu@fRKg2HJu_i_Yyv%wSz>+7gm8D8-o$
z@g5$O8Y?h8PFsV$AaJ3c?HHkk6j|I%3d!2(>l5g=QE7QDXB7=K#$i9{gVuhyCeF$e
z=>EP4+k6qph&5?Ast|FzJm1<FkjoDW-qeYP;JtcPZ=KyKwIQmYf{n(v9L-i`IG*(N
z(~nNqjR{L@O7=!mK53;+JH9VqPz3oKOQH*|1n*RizWzNM0HGU71KnrKmqBEm%mCOa
z9(H(@(Xf8``HvO6V?`Mb-J9L6Qful&%9iK%(A0S#m6C7lm>LB;=3{ON7sf+oy-xPW
zsH^MZyo;YG1~C?*fRiE|ygdRG7;nuWbcljQtu%_bxetMJ(#u0{J>jGF&WkB#W_DCk
z0R0Q#fQbS6>b4HxAk3cXL!I}}pjqgGJresMQv<3AL0D;txGsw)%{9*u&h~dtC7)98
z2xyi?3)3Q2Rg?j*28ac0(scVs4-nvhJAOAKWR*{Vyb~T6@SGX}$RCv8vc9Ovqwf+r
zlmqd-RLMs(`K6^Yd(;$EftB88_-IBu#WTd%nuduo;ecd80u&I#vySV`FXF6ly5j5&
z%@r0C<Tm-~Xo7*R@`4sDc6?CvPgol}+wJErW*CYA!JEj00?7aB64dg-jC7hM?FI^6
z|1_-e-6vBPw*)<&pQxZmd2Skb*Q>g<%;ZWy_OGW0FG48h;?2g=3;@TkX>A9%g5$~S
zMHtE6D9f|TIu%g8;27Yt;khtlBZ|VXOq4F6jvZbxK3A|de8vboeKK5EqUuScRh#Kg
z5|`DMS}?}Kin+GYALL2XAmjUIgjh3A^H?Hk<?fnQr+ThE;gx}@4gutUWvMwr@Jnv-
zb+ZiN`R1PtWuWKTXoPirpDndre(*`0Jatn^TL6@se*tqO1+aj~6SHL<z$spn+Nz}5
zr%+v}P)$C)k_`_q(dkTGw{!pWR9F&|jrS~yTUz#KI31}pJBEMd!!zKX!Dv>$-z&Rm
zLl|`B%j7$CT;@}o=xN~|m?RF~PZ;HabjaW0(9G#A1g9a*?Qz=a#OD8Prt431JQ#R_
zIjO^`h3@s{m`t(H>O<f`a&FuCZPSnGYQGLO^@M1u|F%Em@fT0bmUMvopphz=41DXQ
zOqRobcIK3l^&|+v<aWTafSOP9+w!aT;*<eEeTIq)4P$g#fJI#LOlZjc{P>2rlr(sf
z%A@V^!_-A>c`HmWLN1l-%-}ddrSKqifm45p+<#4E%smGu@X3cRHe5M+Mlxc9Hl1GM
zD`IM-&e^pRpn;g{jjE?$k^igaWq>3C1|sofSil3KU?CCv+HcU6jkEs9smE0#+-374
z)dYsjL#gq_56GstiZDhq9=l%QG+Re9UAyvvr?^!soQ&iJ6Za<G#>x*wvP5q|j=O*E
zbCA7Q&bO-k>8dH@AuWj$-Qy2m``Yt`en*!BYlJ>FH|J2YtBP*y&CA$Q%qqU3cCrhz
z)Pi{%l;aEispWd{Ul#ezWmrA|IzQhhVnG6a;PEqH@$x1+ITcfC4UEW6N6DGqOg$R{
zO;(!CGlhrP(iT%A4)MgTMJ;!5Da)?ep(W7URP}f>0CfYoB+yA!!C2j%AJDDfx@dg6
zE3X#Iv{W00e}4@RN#M@lSA?7i!5g9`H9ZokNON-~0xy}zF;w&2?OwtD$AHY7CYYB_
ze;qMHo{{0RVP{Y-7ejkVnvsTlEd8zNoe<yG-O^432kA3*<bPo~)ITf5NJ<4tpt*(N
zIe=jmJvDfk7n)yfk!F_I!YX_0`doFTc9&L>Rj<WV^;Zl>!5z&!I(zqlhGE3f{Y?R=
zfj?VLj?4lRGPm>^(xsUoDI4zlL=QJKTF+_`{A!j(4=?UeDdWD(<Y@ZW|M*8(wl*-Z
z$cL-meaG@a*Uww$_a=@Vk?Y7wd$%Pea66vZw1gU<uSVT(J~YVV_-8}wFP`@y9cbnS
zEWG<5`T;mOn$xQ2q2(OE;E=|Er&}l8C2Hs=>%mc%su3J)XI&>*^ciSEh?k4}45RWf
zuVr=I@4DuOml+Kv7CyYad5F^17`XNu)rbi4tDn&!Clbu;6}Fd&(I;0_8Qia`yhilg
zT5zHNnHc~PwMc|D48tuEL!RE+<rq_6%0lMb`?_6iNa;e~<;i}}Xs3XB+sEUggO*Zb
zRiC2$?|Q9O0TY-KQ1BeJ?E}<>K+HTgQ<Yp~q0^pGgI;cmWm4W3S%PzKXT>e$5L%!M
zj&@!q>_D;s6j<Q6%nS6GpQDajT3=gyH0NWGG}Z&&FMNjyF%M?^0_v>8IK6Q_xVU$%
ze&xvz!UQh&z!?AO=8xlAVfa&zD2J$P7*&ohevmnlVxa>~019+@nHnM`IW(`uNt{*s
zaI=azMqsjx@GL$LTM|V1ua)4ULke+p6bxi^AdY)r7z_F@JwJ^U=s6{-1Vx14-^*S~
z9kqYw-r^~ds5^KEB8@g8Z~7o%o?JIFm`5ebX6OM|1`I)mx`L5tS6+#I$_}Yv@Dz&A
zRJ+v>GJ5wg<f+bl0AH!$-FI~CM4KNWHCv)Ix|tciA?EP4k>cM)CKR}h)HaF8=9qAZ
zcx<j@T`Hpt5y8hlBt1$}M6fn9b_;9Bg=-flK$Z>dlz6uABxeDnqPQx@CkKkLD%VB-
zcLUi#;R(s2qk;eu&l3o(Zi7)>3f+|E7&E@Y=al1067{2*b&lmIUi*s0`k6siQ5tA-
zI@d{VKFm;qLjeA|ST2FcT$00(T;n`6xD{l01;d#G8UnmZCqyK2z@;5^wn_Jg{k?yL
z?~&u}_DPdxhV#(o0kjbmdi`%*7bF<4zqKF!cA(IgTdu_3-57NFYa)9xw9_B;e3~{q
zXjL1&eC^kl?HM?y^a2;etA%IJ^ycMMx3Z`TxQj=-ZvP`ax+npo1N&KAd%(D$Z>!T2
zmC2FvmIPUhhQwE4)jE{p7h$h>=dy(pb=IHD_6$1WIT@Ym)~gF+L!Rse?OUWvod>33
zB7r*FYo#=j(k|oetF`sZN6*~b?m&M?1P!g37N2h;4uoCMlDe$s?d7<(r<5{?^KY+s
zCn1xDe`d%(LLAo;CZrF<E_dvor^YZLQ>)wL9DAtGldQ)xwZ>DwX3vVosggT*d^WG^
z)=C&Dgm#);0vat1RPV%3+<<dXNnJRD$N*yhvD73oAeAPy2d4mi5L8?qaOPU&4olm0
z5(D1sHa-tidv;~2DNnW?Q$J<5gmH<kM*B7sUAf08wS)nXRDDH0`XXqjrKeI8Gid9g
z6)Bl|AAPOnB&e~&nD{jcTEf6G@MhC4YUA@%-BJw`d&dC^_b#XK()#srz3LST?!RO#
z1KX#eIGLai<nAH}tYfHx)_WQL!5bm;dv-1dSBV#v)I367*SO!&9Cu9wHA@_8^W>7_
zK=bogT&lJWU(9l6iUYaoQcm#DtTIJ2ln!`w*9VT>XCl{v|2U5Qw|K8*0vM<s4n7y8
z9s$Ou_--0=67v$bK(8Lp6A#*I>CMGJ&_Ap@I+V`7=g?eWdGn~3qKrbq@L}aOA|>7;
z-Xa&zqaPB0bW$;`;33TrT$T2{R_-qysG-$Dw@5fI2+)u5ny8(w*G>rwvOk%YN0ug;
zs{FeMBVply?b>`5_d0N|#^1VN38Yd#!h{LZFQ}E6S>!=G?U__MsL#XAlz3aer&LiQ
z#igBbueJG=RrDl=u(W{N_4YW#IdGmQqj-<eA>z7shSnkWT3a`_Y3BOJKhBB#LT&>v
z4dyRV(iV|`-BdPrRYoHzKGb2lW+eezwKGMtTCimt7-vP`3c+Y~r%t-E$}nuyWx7Gn
zFVUWXxF>LY6ck!r=Z7)h=h1&ns3>^r=RF)%uULF__}%n-Q28dZRx``P%=W?-oOm4v
z?|0POlKPnOCuGqr)1DO+PS?9{k@>adPYiZ-u6-#JrHw{06ZnvSFHK2UDq*|oUqyfB
zL5o4Wv=Ou>tcl$qXhE6zzWcU7JY{TVN_i=Cr3TDB7V_A-w2C9PGRax&<fuN{F__ys
zp?*5=NkbV)qH-j&1Le@JAxzln>GTOK|2W!e2vVH&zXfFRTBtWXgp2pkwpT#tFJR@d
z<(dQ_2nnT5jy9D<*~#%ZnKzAh&}n1tE`n5&@~IM5cp$_EwN?x4c|3g*h)Cr1BCd4K
zDrjoP&TI4+gRFH2UyoYe3W;)<pPD7AB-ZQZ1}6=ig6^d}W|Y|;OaxrvZ?1`}2V<m~
z^F0D|Zb!C-DRZK3pWmR1al3yZxQ4uUe-K<}@qs+C#<1x7vTGJo)=R&VI>*}hK-!F^
z&lM$VfDTnRPq=p4qIVwU>Oav^q?cnadZKK<R!hQ1o+KOcZ=n?TH6&fVM&Xge0Wg3M
z2GFr~ING}TH94k^o)JuB>Rh~#W8mb)O!!1F2%PF!_3v@|wPrOi61!-BH7L_8wR!i@
zyV|*uuQ@bN?oLCfV;K+|fhJ+#w}>#wjE|i%Uo%bGu05<e?9{7cgr8(7yXzmY0DZ)N
zvIv8C#Fgn+k!|Z#ryQJ`XkKc}YEL<age_y&m?Taluxb+OqBaJqqIW>yrU@K4SdO!+
zW+v|}S9a5M-M;aNo(PM!ur&}4&G*DP@t(&`V^5O?4J5^fI{-f%_S>D1<RPl(U^kX2
z98i6}2FC}<l!LlA8Jy}DFczQ7(d(~2A9!i_Qga^weQi)DPlFD{L#G(E)*I}3e$X{v
z{A8|q?x(D<E(p`1zd#E9R~6w7&OX9!`ij=CEBvBw7PoP>W~&$YMJQXH$VH!jVJ}NN
z9Ggkjc1UQ5E*N(ZTY6Tyf{;3+M3;0=353tZ^rhx`Dl4a+q%z|WC{8TX_10ZRHLs29
z=&W?lteP-(qao&Wl+N|B+SKZq?fr3iJE8Jd{W!~V#+i@*f;v%cAf1U`A9f4@g-M4(
zx>g!62%eX+&K<TriqVH{wH8W-Ct6HNeH7Sk-zek5!OKJeG-$iy<T6;Dn_rzl$|v8}
zQ^ZiN{m6YM5TuW=ae7Vlr6u0%OFtv%^u*6gj>3HiNFa|$58w~QmNKba<*}}k^E6~n
z=+{T(eo?XCJbV&{w0DF+BV&>+tqV6pV#ku6LtmuOuuivc8~<tCh#{HQ@e1(PjiJ6U
zH$$Y-0BOkp4rU!A-sW;eT~v_p6GF9^;C<6>xu$7Uuby2c4~ijqkbe<b2HqG&5+n2P
z;<9gR1S-HRBV6NcBY>qyqVre`*cO-{JU{Nssn&7Oc`ws5Xe+&}1IP>O39_pm)lS7O
z{hty;A97r)6JmT>*g?+aQd_&Kf7^B5$e`et{=v!(*Hp3RzgXrL+?yafiyR8AYPb5v
zZ8l*adDYm4+>nc>^juWavt#?j<iBwi06+E8Ta70bqc*%1^Kub0S|VKHr(JJ*tImg&
z#7;J%k567naF|*(53IVzkErtoohJ4MIfd={QC+<rdd>HJ+NNMes`OBgOX!f^T^9AW
zVX_ceDnma6V-2$={&|eu?@B)s8F+()4;wNl41`>nNQ_{LB+n_WHmymynPX|`aZt`B
z{)nfFI2N^wE`uWL93#%JQ)wawt+4i|w?~isHbPz92f2wUf*Zo4s@J0%Y(~ZkKkif4
z8#vp{PzIHsqHD!M`k<(_(BO}W{#)dU%!(Q=eY<*6qmeicgk*Ic?1f1un!Ry!mGCQr
z)fl(o$?_;k#so5dhrI^`1aG6^S`C{dlGVpqf0k-|=b`E{ojxVTf4)D&xyNw6ZyTTk
zS3`Zw|0;C)f+$sf?66%mb|kyMd=fP!=S@knt(If^=!KQb1%Ad5{YV3qUc6>x_r%%K
za}CbH)~&!dn*y7rhABA`g#;ZzWporJtKM$P|Kjhpj{ynJ{h0<XB7jc|If!O#-uDoK
zTjrgA-_D*K8+V-`+X<U=W>(4ybl{~gqb7iZZLGgeE{YiV!jjRcuRPCSJxaBr=Z4>}
zR09A}nFulp3n_4e=mMxTGUkH^KVLJmkC>ZZYA?aZfS#4a;B{8bqg4Mb4gB#@C#=;>
z+QHm!qbU(Xq0UP(%y|=N{noj0Dcd*LH792Wtrg=I9$C7OnSC2_ihFE<aTiSF5iA!}
z;kR+EJ)eUpb#gA&@?~}Au7<igL>E-qxl?V!mq8q~_gv*X!69Sd2#4FGuM-YM<hRZg
zfZymq)I2+UZ3N+?kEm^@#zNw$EbOe<<_4|V^Ex<e?=^;W>_K+z`N<16+`Sn77{D)F
zCkIONF^n)m2LDlO&0*ZmUY`d(nY)Fn%e}+ui95rlo?-o?IGchS-+*(kT6_LpW|($5
z+j^d2$|`yZW$tUz8~p#6p#!@pktlzt&rz$DkQ;-G-KcQ?qLt3e>bTr=B&T<OOT9j=
z=_ztN5$@m?MWckHMEZhTXX?nXd<3<rsY-2vKVv<Z&KPU|v`||g!^TdVdAjxp=3j|n
zCIWKUCJ+T5BH;jZn9ieq``MRtK{>9llMgE(W$prw8xw0;WHBmA35iwL8qe4EgbzM!
z9V<fB&jD}@8h&*Kv(0rqg5q#y@&eVt9#b7*J4nes!5G0fOxuelSZ)W_t73KK7M{10
zY$^NjZBk3y{Fn9zG1=et-<2A+-0kXp9%;Syt85sllXZ(o?N*nYk5%;jY%-PJyXbC(
zBv&cH2^=ZQLGMRHhXcb8YPT67;{a~Uc6=iy@!PXct0BFYzo%L7dtika=ztmw%&YXm
zNc2CSeC>x~SZ-Fi8jK;^`M!LPJ?RLW%{VB7Lyw8Q@dG5AED)l9t7&PvjqKH#YS6pQ
zxN(zn4N(j+i{1b6V$qs}<s~eUt-}WN3l7MORu-ylHW^kEbk9(4Wj9PMmdP^ivjq3}
zqw-W8f2#f0A-Z=jNc#P)G|>7LFZt9_A;LI^P+15<excfyN#e#+;AOp(;Ts+FYX#3|
znDS{*m)&&8K7YcwW~|^^3u3(${{aZd|1{hLIZ)GVD|T=}jA`xHiUm|K;_A&{nm+b~
z9N|O(-zQ<8FB@;5(?0+CE&#kJb0&R?Ysdisk%c!;L!EnzRgw?(9(1>+n)|dSe(2Jx
z`u+F!O;baDB@`5`mJ&a^9y15JhG7JVVwjp^gNAdYMz{&;`;z4NIs5vixhn*gJ4Q_B
z-tg_>rn<~^v5z1JIDUz)eqo_B9TaUVi81|!ket908{hFK8qDss^Zfk1G}}+#NRi%{
zz4L?Cy7iWOVV_G$?>+Sv`WGXvaVmUPBf?69wY?J#2yisEw=kOD6cV5>+V|*=-|j1J
zI&!#Pr7*(pxNt+?*Po^B5KE)ph<VK1zdAF`go2|HfkssDg~3e!^)aj57gIa%&Hy3U
z5gfVo)Hz6#=GMl)Au6r)xEr7N?A(0e<G44=%(ib>!@ezj*8Reim@^n&Eg)EdN)Z3|
zi@gekdX$T?{4Eo5gIa4a_JlbGqM3vXZh0IrmCUox6Hu!KroFE<iA6QPA>-8aDjna5
z7BYul1_O`N`U&CURrDs<_2TE%pk?EgX43A+wX;>+DfHv3t!cYwa5OC5;PdJ1D`>cx
z>kC7t!Kt_V^DFf#$^<{>u9VN`&!O%o?`S;xVRF6j2jbuE4>JC?Qs9~LX%yZZzbtt~
z)cR7D>9%w_=IRaoeM{)2cj}4dp9cBUHY}_#NXc@c<R4!G_5w~$Wqi+hw71nTC}HZV
z`zRq$n5Zyv*D*Ho$M`g(3|*>F)*R6h9AFPHElxyh{(We)i03z@N&SYK6R$1DUWEL<
znf$Y@xSiGQy7z!-prNT>Si|4o3s`c~8^|2O?-ENMR5GNN_Ezg&*IsMm?`+4k5_hF4
z-djIC$ePY=dG4zxoabKJ#9xD(+Y0reJsqarvSsr~#9;%O@v#{TVGb!&x0ejhYj90F
zoC|2Y<?~?vqjK+3;oU8$n^!3cLjqSLwtbBxz4{%R75Y8i1)W4FB87W)#mKm92;3Cg
zlm9l=zswN{qlw|hLKO1gIbF=QGn`v5sIN7HLVot1>jOWLI4Bt*6O*(<bBVT6S{WL#
zZe0+ow@zQhUl(`}F=;$D_?7mK43ffolsqSFv!@;hxAZzvAy?GbD3l^EG>rCI+TEEl
zpx~@^7Q7okv$-qek8Wkp_L^Fm8s<l}7SiZYmJ)DTP>B&AmEX-;mA#Qfk<K_IK78Cv
zPw!JswQ#Am(7WZeCDX4&O2vS1KJBv4dEA8npK>4)u+2%RSbC)s5O$?EPwQp65p8?(
z&7QnzPlhS(lzjSB%Ia#xnQp2?x%Y0!Ii{(^q3}etW7;L(d!DlQnhWLUoE7@Qtbo~7
zGVF`U)(Fcu%kGu=-J_9tzD*PBuSA#H^vlwlnb^hp<W8S6PpgigqxiGuNtVrSoiFiE
z6+^~tBGORLDI{KeUljG0>iHTD?LX}xyNI2xv6^ql>}8jl*dXJcw7ym#6Z^45-A&S(
zdbzIr)=+&12QI&IA5o{ya!~vq@%pEG{w<0l=3q28y{;QQAaOmyY}?LR@6BybxOlX!
zna9`u79BG%U;AabeLDUem5h|hh)K_cg)72+HW2?TO)!*qaUG^HuySLmKVx8W2^Yt9
zncdRoQ^#_H!IllJ%u&05Z-9evzKtc2sv}>+wcw@Iv}#r82s11bZtJtoKWSyQ7`(de
zfELqqcYABETR@9!Iema}dN1u0r@yd=)Jo-2dR-2KT~F3>a5S4^CDpL!w5vwvm5A;1
zQojh(eWrQ?L%PdD>&6bjJauNIKz{cV$wuOfdFp8?<sK5#LY3g+Pl{6Z37-&S^s1xN
z*UidCD7uHX%o>KOU+96RU!1A;5FOEjZ0`L~xq6QhBUhJrR2dbY#j%KZY=|`^t6cri
zqdfmOzG$$8S2raXrz2Ykt443qD1V21PU@yl*)ff|)5i+OZ=&iIq>8X9ny)PF&g@kg
z!iW%qtg#R>b7R&fxvJ~d`TyMmmJnE6o)j^Z=o$bn`WV1+Ck1)!YRVfw5S%WnOeNup
z@?a+YST^QerD1ZLQ61cwtf|4u_dgE2ua=aW0k@}N@q%%?Qk&TW^RB72SP=fOnjf|C
z+>P2s$dePGAttp-)vNA>#=4NbX>KwW)K~I_iKb$~L%ls$LO#A%I~v;ILDL&=)!%hf
zy$ySh$Xw}o`8282`)bH{3xJXFdp*AZ;-i?wMR$W#<}wre#+wMMqm-NT(uxiO-aZ1(
z;cZKhk!I=o&ye0@iP(?Uam#F0;gQcIv|5OVZ=reo)jXzT3^<T{Mm<j=kGpp}4K{-q
z8Mllne`I)ghIH5(9L1j~D%eNg%reEikkH1w34NL7?g)MT1<`T*8SpZ|kB_hGcPd-5
zpcT`KYVl65DKzNk9Gu_OHCjj=UMXdVo_zjFtJyRY`*`)kyMmZ<gOhF6N336Ae*S+^
z=}jPzZ`fB}9J0}hfG4R*Vr2W%W#-g<n{MvD>*658=R<Auo+w}@A1KimNc-iI_F_sI
zkd0s_H54&Cq{0Y87`yYH*|ZqMzS-AFo;>HP#j3Zq+w4kW%<@Z;-0&n4%;UWoa|~+b
z_-u~NOh1nN!T==W0K=c4d+7V#nAt_wYY!almg$duy8saCgQ*s{Es<1E!{u@leo2({
z%0HfXjOa^4wogOMw?SHM6N-B)>S;55ezp%qEtSP}jwHeppT2tLFeE8+5-98zY^e`P
z4EIX>X!v~bt$OHCMqF<q-1B{7>=%4Zu1EEKTS#fN!h@^Uy_I!ICUeuVCJud_c6-$f
zIAIRPUg1T9UIM*q)}Ia~Nw}$v3zA8WDGcdEueipxr;0F*t#%Qt@Uu_^KdU|>RP|~W
zjit6g=Tm$~`*t=7{?*6ieW-!WciUO>_5*|jg4c3xiZEDSDJxxBFr_o|vSSWo?OF!j
z4w0$%c{-(=ETVe$ZB)}=vj<()UxK6cTkShylx0Q<RLP9o*2M4175^d^{{dD2ZoT><
z!oWJ5bI~Y32uYZgIIaUbr-B&=<?bV={=K{fI-;)Mp@>2L_l0T*iCdR^(-P35mx~=o
zilc6?O)gO4_PZz{*(pa~<k@@Zrv$9`hU@Fx(2sIDxl#zPO-%fw3fQD*R`bM?;tDpR
ziygKzzXyrN<Ip)n)x1A%2w1(~wfGV?Zkn&eKj;hDPAKq5^D|SVvW^yXXS#gjNjfY>
zdg_H%TIWE&0s>6fbTfx=`Xax^0Z#vE-}WZn`UTca(Qw+z;cFcXIHISHa)H&%C*-{*
zUf?3;gF3!CU5@Z-<&JK~(GIVYfup*J%kOXMbtb(F=-j0ha1`NgodwY)2@x`?CFgS}
z^DI0Z-%37K8G4i9=FF4Fho@oXn6lNl<6|jP!Pktup>!*R2qo{*YaZEJ9EUhL=AV}r
zO|r#=knw%6^H}$D5;G1ru~<&KIP}=!Awk^k8ZJWl)_WOHQ$V=TYm(Q-MVYI(noKfL
zy%JsJ&c-{GQ*>X}-xJvndH3WM5>Q7Xb;TO+KsBgzkD)*1df<e(nXQ9fC+kaH=NlV?
zoviu~TiM!&xo{)Io`zqV_&?V8u(7SKaJW2#rMNY@5d3-p<FlE{%7|$8WcS-a9PtUp
zo!Q0t+YE!TCMHc%1=HvGtNy$it9K(72pV;W*(QAuNF`B)V~V$-GX)=?C3VkGQ|;5S
zM@&TBXQ=dz$c&a_Z+0B5Vc};&P2%~qL?gOwSsx1#!!;eX6Kq0~(wC<YYQwC#fL~Qw
z&_WBHs1s)q{L$+^e~L}p^vV*B!P%jMECD{Ygh^3`IA#v(y7Ps*ZblrQK@j7ajImD9
z(Wnke^;?8cSO)wbi}(x*rHQX1yuzobu#iwy9PD-UCyhnlmX{Z7|75_=(rkD;;b)tB
z^YW`{JKgOj)J)K&bIc!{(bfd|aa74pdp5)=-_%2`pC{bGy50Uppt`79TQ`3u*J43j
z9swussqlgD*%1DDz%BysRf0mE8h<{EiNh@VbDMJN_Q&}&Y6kn9fpfJJtSPJ6=yBI1
z7KtW1>{iW@sesF-iv%Mg6|DWp${NnU#hDgvp)8=_uC-cd%Wu)ROyBF*M3+p#ur6(=
zphC=0+ijMo!^JiTYnYc78Dl!H5s%BxvQPP_R|ahEKK91xy8?A=q0*1}I4LRvCrzJq
zp=UqqJM|t}GdG1csH6cin@8Xfz`G}!y&Yu?s<clj=}nx+dai0iw~gzH_Rrtd3x5lV
z#4ZE}@um~Nxu?doZfGGyeD|qA$9<+~;FtgyKQwgqz!l8Y4kPS@1!{8^10R>j=qel?
z7hAn-l6tIS^#tc#!RgyXt3XQn!VhX;n$y-L^iI9nzNa5&Uke}e*rsV0?FH>kNHwW&
zzqLu?hB%Hv9}E&jSVAy9kO(p36=9O+O-GlPIwAl1qppE?XstE$W4Y3#aDK@5wX9&|
zijJE*Aq0*0?<j>?aXyYMm+M5v1$*wrp1#wMUI67Gx#`2P=>|1R_$gg#Slt(@>={7{
zCKhGbs}nnYVqZOi3W=(dJ!766Um+hdf@lLTeV(dQ*OZg1P=4B83=5Kz-bQX!#$L?A
zDC!J1m(wl}j2@DLI(?b_g1#nuv;-xzFJE7cjU5PJO3H&@Fh@G3Ar)QGlqF^c(8sQQ
zU`V0woqQ7~@^%BkN{j9`33-I1#(BIl=>yabXF!-Izevco3uAe+k7@O$K1Oleu9ojW
zy^tf_q``+OF-C`eSuMKs5?w5zChQAerM+9Z<`%Qdm?xS84{<gNiz_KzLp^W}u)bk2
z-(!z@0rNTPf0S<#dqBODbFB`wp9Ywe3ruwT=;k9TT+!JzvGjrCkF<P2OJ+T}94}X!
zLuHXZEHVe^h;<v)dkf|WLhQO{S4+;GVvvm)fAW(Q$qfOwu41t=+Q5QmH5@)a)Ksrh
zIA-;Et%95NI~8~Fn(a>uifpxb@E=~*Y~IjOHPeoM&0@1i&9|%*6N^!t*lE_M8^ikP
zvGY!iE@vxdi%IY>cNL8mNF7;)tyOdBd!G>NQ`GzlHV&Q*Q@<<li$TizVCL4qzwYb6
z&p*mbc<U*c_4xeTW2QHT{43XzPJNKKW+2uJUGaL<E}2;x^@;Un+No%AQ@!}H^0U_z
zDKxyl^E9QK;zElItFbV#Pw`TSV~(qoYChI{A(~%pN<Vh{iqfTGSlVx#X=E+_kSJ_g
z8RB<tFcXRUrgMac!H{QDAVFWa87O`9_v#w(!cEfFR&#d&(>Y2wnuvB71f+Opc*i{A
zf-Oe$<fC<~(kSk)yJRWLhtBk0uREgqhwx|XUN^gYu<AoRIQ|r7!o9-~s(;Hb2qy1z
ztC$4qz){y#Q>OIH+Er>zH9x(6_o6Mx^t;{D{-p|$FJ8vWxxv{k+oRnnDn-%3`F0Sb
zglvl^<Waj_DtqRyif!8n4@HW-I=s8T<L`LL)vFsR<u~9lz3%e9BZ~>i;9J1xuURxY
zx?Ra!Q#%ig*0)vQ{f1&Bj9ioTD;VBPc?v7*xTQQKVQt^zYe!+|RKRKe7J2Edqfd-w
z`^{akAFNTE*b2oYg17;F1pX4lg3i%oyo{v1U|kZMp8s9-*hRnGkNKAcX(Rlj6xMth
zl>OK=m3+?^!7`SQcP`f)^y8rD^qiP4X#%x!Gf6DsJc?IT@l{*53pQQPq6oV#$_JC}
zsBxge7V-BGCf5b--W=-}K_lMY77A-1eBC8qQ||TkY?zwOG<n2`byoBTaI?_ES}U5Y
zy?J(ne_l4Z16IzFO;bY`Vf3E$k)VqcYPpT~`K;PJ82=uB#`9F$8l}ms6sFmK;^4e|
z(uHom)#V(Tc&c_rfWWu2UW2h(B;mVyww(LQc6p@T$?eBB6$rro!zO_BAbau0w=7~1
zeF2R_UKIFl!2;&dVx#w+hJA6vFqfu<o~tB`%&^Ll{vG76zA($^-|UZd{G$=GE@*@(
zJ*4+DiC#;h2q8Kac6=l(QzUUJ^_IF~Qmt8BEfztUD-fc<T1OXJ{LqZIdoaiN8NTM3
zz>~^lxQpB@7iP<&av5hD8_hBjzI};y>f)2djy()%nDT<r0rqm}Uy@EAB3R72q=gph
z9(}+2p4R2Wt!nBf7^Zm}Wy447=}tJIqQ}a$jJ_w!iTUILU9rcp?`A%9b3lhH)6^Sc
znDMX^{%C$T6)F#r@VLT?m)6Cwin87`R*}s-`Y}=qoQOHGTAP%4cyP)xnWv<@<BGOb
z?O0Rxa11}hG!|BIW{^Bzy@O=Af7<JBrUO3esQQ9O3WseUBo`@cTLgZC?4?Mn_TC}<
zuB-3sy=s86Bt7n5Xl1v0Tl?wZVkM*tY@RumZ16S9a{R**4eyz`?$?Z3BP=A6qDcy-
zYtCXGi3HpyTm{9E<lk%d$W#1`e)34wYfzlO(Pdjb?6AT#8^*#F-RBZv!SD9XjF%4G
zaKd>4{J12g%CX{Y{#cKl>yO?qmXFyg|Mag+xHYNP3I?dInRm>+DV}okn_S<RqXeyx
zFHN)W?>=<g!5`g2>sHS@UT(LH=?SO1Q<}L?RpztX$B&qGB*^wG!x7+bPDnAUDo0xp
z6P$f3M9<3`EbJe4L$@vd<XSM?+TYyA7&pcp0G#ZE>Q&9xv!+P5Kvdi_f%}&-_ovtW
zvZ6UXs!V8QCDuaN1CCLFf!BkmtEforbKVAI+T7{cGqOf6XMA_@zxsSoytab)%%|tE
zrxf6cB(kvHdzr#GHfKbHw{<+JTk4rFM%A`=&sX@gb{+ULKML)gQg^WC8@o3#KBaUz
zPL))9se9jh8ucj3JumGk@UE@5!PyU`7t1u9l2vn><-RpSuye1J_rg4fFdI6kSTHYb
z7NK&3Xkv-#dS_YK`*XjYzbZONSNBb9X&gyGoJaw|+M2=|T}P*ZMNneqlJ!mZIuXE#
zuAXE~d!`A=$GS)ZX`%sOVUG8R3%8K<{pCoTWB?Zbk@_O+VFX>3t>Ox|xN}vkwpZig
ztZWd|=wgFKEyOM+;!4-hA%SO3&Z0b5ws8jE&g{wNyVZ=Jg-(}Rddk*<QrBER2j6{P
z%l0xgHJZ9j`E>J2-%CXZk+;aD<qa4qrS$2^#m8<zNXTc^x4uf)r|$hSkFt3O)oA?j
zIxbuIowLihl*!HyEGFE|q1B|K6aEGj<yTkOn$5gd{sD`Kxss7;=xLeKS%zEvr%JS{
z-gbK(=u*1W6Z)&x?H0dqJ?q+YuoMdNoE5474qJMaH!}YZV18}&KIS*~^$7BZZ{z8}
zpMPwI+eoMO8&0p9_A4Ea!nc33ce-?E`V0v+@F;qGrg}jG2TONprAtq?>c#xH&jOgM
zg<9a;2_d6?oy$|*dnCQv|3VwS)N5C?pxfmf1fAK^!F?B^(s(Rivln$M-SPNbjQ)kn
zdC!67v#*{d{m~DM6=25IBGm&2lZY3->aLRR=8yR%^>4cMJ$bJR9GucTE&M!Na1nC2
zcoDM1jS%vv-W8NL_JoG+9ykwOfJ{OJ>*Sl|RZr`y>2L0zlvHrmI0md)EW|`bU#;7R
zs@o2OZt;Ag^X*&XI<8M_BwJcBUpL2L{Y%pQWsyJqp$`{eho*g*Oa1^otdJ$qSKSnz
zCXLyD6~}802HWZh?q^TPUd0%R#D*A!d1jS2_CM4tzA)?x29@aM1Zg5wkC(*V>prF)
zjEBENp`ibKm4|MTbi2MQToAJlzY*lpBtbd)VnlP+oBu~(OV|CU=1HI2z5N!bPx<;^
zS|Xu>Z@X`DQ(zTTV+Y3#9+Y`72O13|+4`Q)>F>Gaha2@AG;J@+-8uEBQ(fAeeDt}F
z@zme^>h_f2RJM5zTH*v4=jzemOvo&s94eO0i_`zKU9zn{%ho(sU2|4h7y{j5#=Cu9
z`bg-tH9PS{tLbivb5o?ebD1WSy!)?;@2}YS(=2}=g_1f+EUtOJM^Gc%UmgW^#E#!3
z?uiYgOZfiC8bMe=9zR{`zN5Y+R%F2U?X!OU=hxAL)h3@`(xu04>U&rkvH4!fEH2v%
z)FP$vo^{yDKrS^x#WKBqZT_xl2m?Q;5O1N<oZWOqLqhyr-lAXOIa>(<at={N|5iM#
z)b-N4&+@$=j4TiP9XxfaZ%;unckR{-uM1D$GrTe@>Ng-b$1)AeGe1~O{rwu^!z>J~
zmnr%Pzb$51)E7%`WSRGv>49ea!UQE166~WBg1TySPsB6_B?PRGmN!{(h&g8sXT%gi
zQ&|=kX?%}E)p(kwtWKt%ZY8S-35E!F`^1DLwlf`PsmP_6HluPCKMwALzh&+@{A@(^
z<`=R7UKT25Kyl%Qv1a($lDR45Dj?=18!n!hX!bKobn9<WHiGUi2sNJyWS>vkXfCvV
zaJWi`RV*FXs&?AO*1TMIg_i&3JV?lI>5U47v3IKP+-k2{Kp$q=`u`RoTPvR$zq-ac
zfcZPg5CPY~8469TnWv9>A?rtZ^QLOtuZZXC;CY)Y-yb4Hpuwpe&&}NwFkkgOcd-Yn
zWL11rzf*jKT4pk9Qw+^_BeO-V=UgF*8~mrcw8B0yu8r@pZAIi9oGI|V#}p#Cdl!1Y
zFod#j?(*&Yi)%B}wDT@TlcrN!Bjxpdx6^!w5kL6eTwlqDw4rKJZx*|xNx0_!>UF=!
z>Jdas70@P1_&auUGeo?8sz4)=kn1yh8!P)*fm^6b4<>(Y<m2tuBXp6mNiI_)=utf<
z9l{u>E!xv6qkBc^f@c4)s_l<bB%C+-UxsV0h!@NsAzeIeW`zhpkcVI)h1pk^nSDfb
zVRM>>=yOC<wm8JEb7}-Y)yd@6R<5K<2PINj!)f3gdD{r%b+gf3sch@WU4-|zl$a{J
zj^gQ-!~))5@$nbC`wMEQv;d|OguQzB_kCdrf=O^Mp+y#{n>Y56A5h+GNxhhE5}EOF
z>^5yi{;Km~kU$j5>4&3Q%{+4D@@2{$gRR=F+m5_94o`-+>1?Sumf~)vHn)`6sQB_p
z+W%qfEx@AOy0BpwK%_lLBcT!k5`xktA>9K=4l3Q<H6kD&Eg{`8l+rn%NOz2MNq2+9
z;P;H@9FLy!eeeHYmw}4#?7i2z*S+qw)_%M)6vh@v2^hSaw_^|g*5BIiMxj((bkVY%
z_EpfHS%YZd_a5$fI3jGj>3B?3Jz?Y2p!_=o0f${NUAug?WpTV)4H`(4-0`AvmrU2d
zHulNia7rcn{{T_NmnbcdX<=NL;a7HR`*eO)^xSEm>n|axNh4z}wao%Mg>MerFP{NM
zy3Y_X@F(r5<LngePM>%_p=8QuJ(V_!Xn5fgcdHrA>e44tfev<w<nA8|Lk!!C$gGG0
zLB1Z(9<H)m=8w#K++^#YS|!#2={hZm&C^T|g*3|*roT{tK~4G?T`eBG2e%>A*_t9Y
z8HYLX7+v3i$g(=`52`<yZc_}xK%oW}C}lSv?Opy4D<A*_gnabWd{lz-&&l26B0y1e
z3_%qX_4%8&yQ0INJ#Z~zvzahc9-sHmdyl8meU0X7b<<EE8$ZZ(hBZ!!x!z=|T-D)5
zXN@NHu9EBx#B{h<@f(DJXE-*u!E`(<M3gO31}GC;L+?MWSYVm`CW23ShJ}OSJwU@d
zpfkU#=LfY@kRIWfva7!gr~5u5)9FgdtJhy)uJ-{#!th#`6J4{gk@X{(6idVI*yM^#
z$JgneQ%|lL7ZCDlp8;Q#Pq7mN^}w=n=>5AaXHVVL{so9k`hZd5FfXJ3f`?4%{&r6l
zbMkH}u%<HIAIy$CZ`Bp`q$rzAXvFjY&jckbXNNTOW!jXD$w`BGa<ILOFK}$@Y(q*e
zLorqRHfT<Z0dfYX-mOr7fSAk74lSg#jOa-~MSoc_;F;D>dpjIEw(G~L^yXO_RK6Ra
zxpqWYHXFcI5Dd;)ruStM6Kxz_v^v8vYPOSBwYRXe3eQ|~Z+Yexs@5cjK}Qv74&Qk`
z@9)Z1HLAw7ho_gQ#Ge9+!~Y+u_t%joW`ew1pIg#RtFG}HXn_i@9$o66_wT{Hy5!0>
zzDPqyeP`ANL?7*skvLI`@(^{lvOlWPX2d9L*wOKN5S6rQr{UIV@q)F-Y%@emEK(K_
zvYXLXFV_98@hsp*TmTo&fG+xUwi2!qEkAT%*n3CbPuSI9Rw@$B^@n~_rHJx{O`6lY
zNbd_fZEgVT(Il2NuwaO??0G*epKEtbJ%1y5PnZP=U!s9@gq$dRY9Y98@7s(nyaO=W
zzs(P|6r*c6#(f!9%elYHU9tDMbe~lY3i}5y>jy^7GN{y351%eniY{b@7I+=^@WP_%
z$B@NZ?VjJ$e7I@LePF9a6mHZd1(V^e?nvpvN~&YWw|G~(dj#t|ipp}O_tUerZR?Aq
zJR3GXAR2OrYJ!D)DlfIBIehLftqYX@Rs8Q&h7Lv6L#Vz!<@OFEv|KU-g23%qd&z73
zO+^Ix%jDz{(&(DxSQ^Q}0fq23x|+Ly>Aszu9;Qhuk~wLHmW~k>%#zKM^KYK+%AI)n
zIX6B!aJw1R0LU!1Xj=E##=68=Sz}1m=Ypl--RO>Du7xo+|GQWX>>zGF%cZ@$#V;n0
z#%A36rtGB@#ULQ4htj@RYpWZ=EgbGJj0<%V;aB%5P!<JE5LV%f3hkegR8a2sMk-`r
z_f)gHb<uC^2%X9k;Q((zs5ui7jv{4!FAhbS^kcxJS-z<b&&Nzkatgl1RL!P`H9UQA
z{JP#Kq8mv!?!j@Z7?_CKK*-LGHFlOw5P0{`6&5WE#skUPr6skK0G4SjU_F#qXdY*%
zCoR3xa)xB((pUV~S;ymW*$ej2-m?q-84P=QQ4l1QwidN}kSYp$bzVPbmg{zL#5OeE
ztaY$KmNAxAj0;J;^y<T9)HK1)7T>V(ij7li-M&)B`6d;ozH2X@1}W9<bh&L@c%Kb*
zkX$AxG<=i9Tu%t)F?TZqNGb3m*AIi>#Q)Z6%Zby1Kc;%JbB#@DT?7WiBmuS(XzM~E
zPT=;AfVQ?Rp)R>qlXvAgZZ8+GDLh-yeMc0myio&E5{k@P%HJ?m97@RnDB<{i{*B&0
ztr|!MzXi2f`+aIX<uli{)<6@Sv2sT+1r%(ow)$9sZ&%Hw%uVmbE!Z$0&f%xpI347u
z!<s2;Y~tx*dLJARx;R;rq`tY)ADV#aw_J4?JKi!n5Vu+|Q8*#4NNd@R)nD)WaBV?+
z1Dza8eE{%jyGnVks-<>}LN`86dp86#Mu8x1M(N(+Wyx@bnh_J~KZdA;kD*RueiDcd
zWj3Gx-A@9*H;_NHYAlKTbWW(w{T9Caru(fl9VN%fJUee==wTi+TO;B%f;p$m7Xel$
zc4+7J^6B<b-8rbbE|Q_<{@c+$uRFLovnOjXm@z$~R5$fjvZikPV^9&<>xq=8ALnCl
z18nRV?^iM&?TnAki9z&_M++oi*q0}FT_Zt+!G}A2FDTp^ULd_`kP@HE#v7*12<#g;
zPixK`=z@e)jdTHqtb!+`g2EA_VJ!Y;6~PB8e}UrPRE(YxAYDo!9W7yh{|yj7YLn3B
z-q$AKCqP)aa)k67qUm9xB*1l!^vCm4(9y9mbdjCDcnm~K)V78Qq{(@p%Yz05ov^pK
zRog2`_o6&EOXH|xp`*ruqXL$>^$Hjho^YKtrd29;FmCTdl(B4w?0Ygx7&Tj{mQl~s
zvHC5z$@#s{9Rl8OjkeJ(!#28o;IpFLYmep<S9YCkMoRy7<YS;4VcuJ8KTr4$DrE)V
zn7>TJRe_HEYduB>SmjLLNrW-}Wqn<woqK_Ko#<G#*lN&`AyJPnuP!%(z7WZe6>Z*>
z*^&(e$KvO~#$3uF6_A&O$d${Lpt8;&zHwrp<K#qkkW|Lu+v0F=_J>vi@vtje%Vv-L
z`3ab{R~=^9$}6GdG1=UDZ$FuW;o)GRKaM9%(QUcOYgXBQ7<!@s$gTctfe;%g9@R?P
zZWH+Q({TaB75KqzD`H0U*|_b|By?L{z=e5F#>%(gJKdSD>(mEFuVfG~l1$^;guc@*
z;ierGz)x}3^FMO|V3-A{dxV7x{jNar$kW64S;<qvl0ST6@@kf7cevyhrCw^2rM|Az
zx0;G6)pz?M2+&vrwq~!Z1&+3E&W=-I$wq?dV8DeDrDdgj^3}6Oo2TR(R0HyZkG4LZ
zGQOV7JNXQ!qtPIsdkQMl+`XPtFLY#?jGx04P&nZVg*+_Z-lM%t_<>xe<t^qW!<yf?
zw7SD@;fHUiMxntj0w<RJktGBt$K6(Q3z;_I3sK&4=ZnO4S5my3GEj~=9P_4WKUoYq
z?w-C@^hC1*$#;KgYyUd!zsMO-6ukbK_^!=;YA5QS-~xQAVilTLj18>K1Aa>Knl+*O
zTGREF0yq$S3G2pu92<|reZ<oR7L_^N2kt6$cHh^u>;bMT%wU26k!M%&ni}4;rBhkT
z?Vg0qGgArWwtyc_G>38`E<DU1?njRf7P=+wQ?gWi)<yv;*o7ZIB~a{@GeLX?c?E+(
zW2SY;DD0gPg{>yXOY78y3exd=Tr2?QG6%+C|7gV7Mln$2AOFhUfF}<DIvg}U^~V0h
z7(n|hLwTj4zz$z3NV2}QDp>(kxbxn(wVVIebiH50(S{MiSHvMsea@%sLqV2R8rc4r
zZfgIXf%-=d?~gX>;!o~<(m&iO5y%s_ybkc`@YR;0L>-V_Al}YvY<S7(g#p!g!@|Q8
zImVGqn={_2WvlssqBt*e84{_ciBbabTy~spAOzk70+z1?g~tulMzb)toQW*c;&8tH
z8Kr(aa7)g0AbhbD3*Y<44O^se_6Wj4VwN1#<xv%UwvQU8OoZkc3cE6?2_JlY=HLn#
zh1Dv@-pB30yW7YzR6zwM-w-Ac-&^<go43S@mTGS+nCJikSQH4@L5=y_d&H)VEklD#
zn>2g)gbQAde4I8S3y|5naW#v8egOHt%|1g$q_Dx@U+++-)1VD1)OtsRnIXHJr3z%V
zZ)E%rk?xhX>?kmKcemxtXGNIJkmaApMWqB#>OYL;kDeUWa{lx8Ejf6=GUS+7aMabr
ziRvG-ql3Sjz6KS}ST=`(3Jom%<YwUJ!s@q=1u+T*Owq}Fu<mI?W|48vGX_Np=;9ZF
zFe8Ws;Bvm`fQTMey=L<+Z_D91g`s`R&bG`<<QTfl7S^rk<*})9U2d=znwFh@!LH%8
z;!oZU>V&QhEZS`N8O|YE!)B;#yd3CDNdV5W88hHVA*J^Y!22uKo*iiYn$7@@&=?Gu
z(=N1C^A~b7!U@Ezyzi8Ixaz7#3{jIeXC1MZ4;WsDHqn~55uFNGJYk)vtKdKWJYFU=
zSB&p=DvO2yg!ASEv;+H!2Bk<JKU+Cy$PECK{|FVnhSknwKU@=}LAJM3FqQ4^H%m3G
z!UnCj5vgaN57H~`$E|aikN?W;bZ7|#ZO3U-=+qNXA={SlH0$DYRIZcA_^_h?p-+;C
z{VWGiFXIK=Vq`__@X0TE3UC|`$&Kb8_BErqU;JDLJp!N*(lD{xh7ft20w|Xe)D&5H
z<%-5JzZ>E60*}jFXy`U{*orqWX*^8)gKool^l^*uDfNPzasGXSydK1()0Jx^wYPzo
z#P_))mBn5X6d;<3E%JUW)zQT<XGZOYCdP%l^5)nQIpsB_PXxB`OpV4DtN5RzhCui6
z9BROFH;&bCazv>#WlLnWWSjcrbHR9@mC_QXS*Pyb1oq}{3fJ-)ZPm?}<rjBGeJD!=
zAolM9Rm&UFd1^AAlcD`j8sbK>RoQ*$t9^WNv2d(`o#GinhTbSzKqGk6y7H>A3_Ul~
zv!hJzMga-+o^)MV;y{&$E>Pe%0-+3yo{2~ribHIU5Gh5Vi+wp>U^ZmUDixyg^~UEn
z7zr9NeW`pv8_W{6x>;`1aw(Nr8nr7)S~@7*jHdPsS28eRmb})HowQyi;bP&~A>Hij
zHUEI<W@{EEAX90hz0Qf^r7f5^_P*)#va~_JyT5oSo^kHC<NbqwnV~9haXGidA1hQi
zOY=3Ybh30r>F*Kf=FUP1BAHOa!|o5O>1~=*KUhUv3oAS@wOmudy>z1in{hlFPU0(s
z<}@RSbYNz4?*06;m{)`u%2=yo?XU8JQB+{EhlX})CM&{f-d{AINZzlLH>j16zqaDR
ztx487&T1Dc79-ErZ3%^h=Jx`yarUNL+u;PO_Tr#2^SI`?1GjKY`Rosu+|uRPBqwMB
zA9|y1#_nn-<6a8uKtAZbEa=HHvV<0?5*x*}Sb9(p!k#Q*m3H<eNXkZ$4jt&HQab+G
z-ST%(`km1OT^A%qfg^67_EP`+1JH<892NbX=v4ww5oZ0g)ygU@f4*Ds*`aO*c?U1u
z*{5;42V)$i8Y<|AM+vmwOFEzuhyGsfTMbSMxQ&tkf(h~SXSBQ_<6jjVEMR=SLB;rn
z_3>!5_(mr1(_$`&=LxH=b}M6!N)@l}Z5Lmq=ycA<2;)G_y8g3!+@xJ_)qxr{)haGZ
zuo6dt1FGbLYdr>dv>yR|X&y=@lLScPz#w)<59&|9#9l6Cz-~Km+WUU-&pa)Zj!?ok
z5T6GUT_D!NZ4ab7&yNE?B=FNaOW5>BA9<eKA2GwAGV|PaH#*c|MZwC?i#=*rE7_|9
zE>xrF1o)uE6;{g>fE$EUW`Zv<)m^ehi(@{HF5njG0rVbg!zLCs$1oTCRq~nmmw~tO
zbmcDau$P5_c@nG@{d0_Q8evdIR7rMtsun`*pUXi@=2ch#!;=hr^WvIlUNmp`q0gkS
z<tz}BqD)Z$vjshL{bd0Fz*Ld~0AoWZVwdvMpnLIASr<>*%`!(<90qXYZoFRoJbe4=
z3#jje?$oCUlfu!*rHLp4X?W<6uXhtmb=!`?ml=Jjj?^@c_Tp-qy9r?7`D2GGGwSnA
znAduvXx^(sr<`SNMhl4D@<Yy_j6`o`@7I~v?W%^$pSzS_CLSSZx|UJpL602o9*Zup
z0PWNJ&{63rhA3US8U-yG=;-ri{>}sX7mtWl;~b`!&iGAz7rn&}({JyhOs^#llfM}v
z7ufUbfdOaND+-*M*&NR@%};s=)nP>CEr`XXzFt)*y3=e5kNO!{%)-q*qi<K|5UF)`
z&sb=b``gyI9gJ&Q@4KoY82y;<Xtx;K@zby>WjkD~CS?W)#%8C7%QI?5tXNMl@>lhP
zUuiPF_O!4^K=4a>Gg{M~#3ij7tj0NdhSmr4j0PE|Gr;$hs<h0V^q<9HY9ueZI*=$<
z?cdN;Y?_Ggf*1uRnb{(`aEt0c(VI}OUcB{r*vN(A8hi{urX2LGmghH11oFzeKwkOA
z1Kg<klln!a0gBHZk#wP5!78ltPb^--sMr;b>?sj%@c3}jLG1rn+~qkwsp<<sQ*QV*
zk_jT)OGccu{#Np1yY8JI^!?x^J3D})`O7a>l{DycF@vJSCAv=e#zGE<fdjZEG|e1P
zxMoj4$Zs{NBpF3-r3fk_2HO-+5>}s_qa&tXTG+L*s*rl&WRNMKxlr0CbDpv-dB#G)
zSm4QF)ph^-#s!KxqKNQABslEQ8x_s+o&ag{DF5O8pP}tf9LduIB)fHAkg(W4>u&MF
z1vtp*(t<p0&MWaULExGk*47E2)WnJ|cnlx!d2u~0!qB_d<eOrlw{Eeh`uDC5C^zzK
z&xHLkg{I}|yP=Zt0aw7?Dv&sbSNFIsuykD_tz5dqVG*Fhd;@A_>iJq5HReb5ppu<e
z3BE;d#$?;}V1{GW1RAsDE_nl5EslDj-895fGeVq{vQ0{H$$BPd=3G^st4Vi+_7ghJ
ze7`DrNTE)nkRITNq-mfs=udz85dmA4@Bn^z*8Xdn2FPB(qNrOIOGy%$ptgsAx{)1u
z|7tVlJ1IszA`^zBqq}G`PE3JKlS9=HCytx13NTf-&H56S+^_4fvTaP5<^sZs&e>C7
zmM^2l>v|=s>JS43r{d?Jy3Xb3HLE7?C3<=*go)M>@~`W)cJ;M@wN(vN4PIeAYJr85
zZn&QLdMD^)?7rP>Xzd+yGcN>}I;v4{?X;iyP8(07meBGBNIOOE(%$5Z`PYI7@Qi2n
z{&4?Ml>O}lTNpu3(=?t{Sn*#Y_vcJJ@;!2|usG?|w!XWLKy1-Hz4z=y9J&a1Jq4D|
zH#~DCHavtiYZlGa(K#-iHi=A{u}<8=zL#8E8P`0q6`=jAz;DEa*IQNRTMF9w>|KmI
zEVLeM*NN|TuxoqPfJa!ql(NGWew$KONlRXYDoMWyo3QYS(^GVrYDmk_+C?$do(Q6L
z!L>S%#%!72jIT1dqQjrAiq^sKJn@y$i?|kUJ>vK_hS&98k834ab7$AdUJ0?kSC5;T
zTJkKH%<Dek^QIL3JskWJ8KNXW#r8=OL;(q?`I)~$p?w{o#Od39VkAlY0DAnDo66ix
zY)Fh-E#s(5<a-0=8jsm@7uJhfw!oxBuqZnro$uw%Hu1m-bKfxxO86j2-Gi+GC3RhZ
zAIgi-8r}lp62=dIlrsU(eT~$^$=Y~AgQZ4<(nIKxf|LqRkGx$Hn&ux^uuqwcTjLdv
zYwp;^wQLy}zARAo#L1bx-IMIyWQj|xOE7pq4pb_L{_Hq63c}l4J^QK5{Q{ru_W+3b
zUD9efLRss&!>SGo7b6G|3A#gc5k!4Fk3n1uc4L=zIG^yxzrgbHOMS3(Ja<;UJ<gc6
z^E^5wZqcvO5_w+QL{}rL1bATKnZ7$f%3HQ>ntz$8o`BHGijZKbkD+z&@)a#oQ9Q)q
ztM&N0D>h&@F3c7;s3{uXdAjSRJ@+-MWgVoww8FOT65SXy>cjq!wA>(~r&8eBpLLt4
zub@j0(KfmpoRR;sQ%sbu_&7=!{}(h_LM^4sVOfnODikDHeUrxVS@^eWSHT#%TTWGT
zHgQ<!h!CLg--flaljJ4Ffli8y>!{g^i#({pyS(GFjjWaanNXrcq(QT5HtDgU1yS7v
zwBG_ni%#^``BK_8^_Hd_4V<o*i>k#AhEOT$h9O!$F9|d53$Svm>bwWerhx+|FKNMu
zH2@ZSa^2ksWj%|;^8<&AU%$5h=2AdYK{09o0L77i#&2;?)GChL`x_N-_2n|Cv294~
zE#5;Skvi;yR?6e&%%zpfLJ1~o*2%HhZx^%^e9qJ|z&0)9*-GAdT}MNB)nJl&@AOcW
zz%dj*n|FjNBUT03-$US%%^XF3&wy-YvmXXXTcvpzt1yl^J6L?9P(l6Pex2_#p7)q-
zrXknJHfwc_O^U-1&#~`2Iw7;rND6K(Df~6ZGsfJKzFIaA?)5YH{^=fx7C^13zmmCq
z6Qu|I=|~n#KvW+7!A|TM(M8s-JsNyPb((|C@yf2`qFvOe^K=TH#<soJZ&K&#km@PV
znqKc$w>d|K;bh;?Gy->`j>v-m-0(;Y!w?<#nn03f{xCJ~0lxMVpBdTR{&OVvEX3<e
zGe@)d@Usg=nHi}Ri|3JlXsWYE%frIP$cm&RWj=y&awZpyEJ;+AB2te^`6*RXZ<Zh5
zqOADw_$}|z{L)snL4QB+pQi&ox&g2!ku;v&f55T#DT+P4prheT0VY=lcl*Cq#&+lL
zgiL;CiW>#gUPPI&P|yg_2Wp7<btT#$U7x6z2Qu884OJ8wBMC<NJrmI3g3LKMm`8n+
zJOlcLjGMIX?ZT03QztBAsh7TKxuHJQ_w9gLxX1D76B7{?)c%K$Qu)i4$Q7enRQ0uT
zp0YhU%dC_J&W|yX6&;Za2yG#8hH`c>FPgyZeCu_!UFbjND1Ug4X$i2h^s31CAMM7!
zF!^~J;GEzA0l&0)LcuqIOs()l%3Xv+Rs>r8o>_R+|4Gw}i|^zZ*Sx2y<EP4eQc(p?
zZBRkm`Q5(sX{@*P1Fr^p9LgoI$O=%SicDI-;y!yGn4=b_u4z<!h|Ew~drXuvLemvm
zNRa2lAGN^{1>?L19yoV>*!gV>C^QbTiV<au@J){5z)ZEwzV))=qybY_H-KI#YHd`*
zdwXk46n-8dwx)|5cY&_u$}9DWh#zGo#hVnHm!cy(VPgvaK=Yp#_iMKh%7y<1^p;-Y
z#G)@S8GO}(9UWxGQMjks*N_=;g>TQcN350%L9m-dhQ2D2fJmqv&$npdt#=;3XDwG^
z4OtR>Z<PvWWzz%_(+PHMHlK<*Z@)jgjezfMbb!h2(radEbUXCDxN^lH!D-CQ=Z&>W
z<*PT<T<4&u44@aO!1ITk@cKN~G$#_66mzh|v!AS7=TU#`LN7{kjfAO@;0ylxN5Ofi
zMnu}%?DX-m(AZ@EbX)$Dey8n#cI6Bsvj~>!#8N)@cM?Vw=)SrZf^5I!=BZ=!_`K8O
z@S0s(slV4k#k2R|&sOPQj{5Hpp@lbqgso6@Md0Ut0QDL@e^TyJRBt)`Hdg&M^~oc4
z(t)ipz3-J|Yk2w9G-CX|@h?@YF0RVt&@}zN_&AV9MG6_bePNRP6oC7in_bx`50ce{
zf|GZH6+aPBo_RdWd1=X=XbDux@{P|z?iEmlTQLG}&Oq?&ySlZ@y?0@d4Yjazsx|@U
zro<5mx|%>T5KX@U%-khxn(HUw=V>KMyuGlfmraWYzAzi`h37_a)A{~&#zwfn&_=3+
zbsB2s_OBB*x<nO5FHNTdKqT%7{#4g7v{>H-nW<6d@Kt@VE5$h9#&X#?akCK$dX`q-
zHa&~+cAY@T*C!aPRYg=E9Jf^f^uBc1quDx215493UXVaRc7?K0-oUB)V@(-S@BmPK
zp_s3=!|5nFs+LYwZC$*nd@j;=^-XL3&j@W(@}8rE#5T6Ophw<i*%K%0S|M<Iv1c7C
zseEw^ymu*BeLdH<xGu-ly7JWz8&1SDJ--yqOI~{Na}|Fb7uq%DGwHwW7Fb}^Fe(Sg
zr6b~u0P$#`lsbaScYcIo_~JK#4xef4qm4zE<1sb#M;+`B!v^HBz~D0EOYinw{fd4c
z#RzH*9Z|gin>Rpm({a764`l$o_E6{K-3F9kr+qyqDH`YsTxi({>0>{C^;UIvR@lhX
zGVkiP*`{K8iT?pG_fXSv8Lm$0QuD?DS`@W>sPya}k$517RA7Q3Yd(_~#2gr|NQ(<C
zLPczGD!iPj%e3~$q#EVlTr;N*h;u)ZV$`h7pQ|h60HT9B5@-MX9w0-%1Ro5G=lit5
zjTrqm+-%&{Su<SEFkH%-_^&Q{93AxP(e8brEBu}V7Edj5`BngOw2y^6Z})3-o(MSf
z`MV`@2Uup=0qG3{=_B#Gf<D<ag>PArYE!Q~b$q1^xMXfG`Y6HSWag+&=ol=P;ZQI~
z=*E+m1!~PjEpeQ&8LXksfk5bhNwKZRl4>m{t;$Iw%IP@)Rt43$%O%)MwDF3HyG7kC
zSr_K-1jksdJgOhz4GLSl#@u6qJWB;8pD=e6_~dh!P3Ot<f|65v$+MecaQn)?r$m*k
zb~^s2V=d7T$Yk@!(#`(a6q4GYmNja!?Btp3Y_aUSCn)P+Qpu2^!!7Gs;QVH=(nTPS
zZ8&+L!H$Olt%&c9PDdsCaX@e$t@09MMSpYW;(fVtRO{b&0h}=k80N7av{_zuB-?la
zRY4n-x2Hckw!2Y#wZ%P~tHyQQ7>5S4l)t7H`3es{3%Nx%!}=JZr2e3)6yg$jI<N5_
zIbE;M<5q#$z0sqei`g5mkYQaHJ)2Osd>iP6u3vVf0dmB?<Adk%Rp|JEkFiqTmz=+%
zC7ka+JUGn6bi>0KzZXRe!Z7V5e41hSXo6n*A=ND#`&3rk(6rbV^X&Lp=R~LSk6k`W
z=Wlz3XdYo?N`7Jfx`(rFSJ`H1^qlbNIQQ6nE^d{rxU<_2h!v~I1Gajs<(_Upty80m
z$MOHWtt7y<{#}{)%aaoCp~G*BsKo#`zTUIX2KihW^Oxp;SS>GOY0X<BYp#sK8lz#?
zB))5R_#Gt<Os6oCg7J<nLg}Jr!m{NH+PF_n1A~`@AH*$JR29p{wx5YA(LS_`1#rp>
zfDDZzMlSUS$M};olx`m+HvaJWI9jwIUnpa8R_LBA)Fi{&G<KY+5udNi_Cus3=F(##
zB^-m`$1}3!1}$V*=As|iRQHU@zWY*muAh%Z7<jm&M0D#*`1$+z{rV?2k7Zy<M?(~Y
zyVx#%EmPY}d}Ew%heDeUFGFWqY~a%+QOCm<dQF%+`f85IDWN?qEGBWI>o{kAR7PS{
zibr^EoCPm7u&^5|S6+FV6nP&60F6^mv~t{giu?E8X{zf~v6jZ`zT?a?+K_D<T_$uF
z=d3#Qq8~7=6UO+@*4Co*va*{I4ZpBI#c;H_s5C0Gv_KFEMRSX|dHE{M;MUd9r2(35
z@2LV@1B9Q+JbOI<Et0QZCXZeGnIU*#BlB~86L3Q}P8RFgb-AkHSrZdBM!LGq+;kH_
zO1STp0leoAz+mk^01b~c(ZX<=9a?US!65ckmzj+c=A>38q?AopYMglMf-tVwyf(A-
zAuR@#9^mm<*HEI`vq|0Q)pxb%tcth084CoO>i~W5?hDQrB=643ec>$lq_?TalW-}?
zdwko5BWoU^w4J+;)fpeF`}p#!);B%4HA{=-p*OQmb=H<+S^J7|cc={R14geZPzDti
zt-jOKCc;+J{uV9z2Gjmg)H1hqJkZEF%fdx0B}RgkLAuw7sQ__q35#PM9dPaD9JSdT
zvRO!N=*=5~Y@Mh0W*Pc^<XlPw2_^5mIsGNML|LklB`Uh<+WpmY`~xYCQb694o+(Dt
zfR>lYi;j~7vt_5dT?lER#i;abUb!eaP^-4|YwTB;Z(*JubOB4yG-X<AE|8@uX-Sks
zvSyMcVQ;nKb|T_d$OPsTiLxBwbUwpdsC=W>7BJq5%`>K3&Yu*p+(%53fvyRcd>ef7
zfFfpam<Y>eB#a5@(3Np2i;i50PmlB*_1`UQudJE2dB^L<qiU$J*1debLQcLBDie$W
zQ=@-Gxrg5sNy7K|yw!^8IC*o4Jlr|b%0ad+p8fHNH69=tWw^@TaIvd{{9(MMkQ!5z
zx{Eh+S<qomUn9Ou5BXBjrwq+VQi{^^G9<!QNH2S92pS^St-r)Nt4}W`vw}uoWf(ib
z*+P28raW;xcWDTLUhH+B#|O<dxCRv6Ci)wPHihW@1=Z1^z1$FSR&=|jj0jtFWm{X#
zeDG124-S6Ue*07mWY@8IiuQZu*~rTNPT%&=%<HenG(Ur4$a=3{{7n1Bp>K)jsfSsP
z`Q0A?IWF+(>~Y{NeR6H|RUM*cIdnK#CzG+73wUD=EpFQ}CZU>LHxJaxS9XAovbazY
zIPR$jWdo~InRPO1HfXj=1D8->E`MHfff40IiI~6{#QF;c5DHMKifm8Wx65WILMRC1
z3!{C*G5c7di?9h=qJ!u|)6SDAO6P#Z#?hhfVjG|h&8Fc>e1>1;;$#I?<lHoF(Hck)
zPs7Cc0}CVD-;ku1rUa9X^!Z#Vh28>`d#Okr`!g_@a?-3z9kt45lnv?JI0WvM@H8+h
z-*uw7w)S?pb<vQo(bu>*cr4O(ZW9~K>Hj!=5W|pl<NUVBJ!DigWUYMYAah|^IX-y7
zq0;v~Z}c=r9r6L?82)a;G@OH~$8-IKdI)cQq(t&&*wX5HR)?cOW@x*B=tfxao9ts#
zcaNb?$B9|eE=Qg8Y2|7aTRuUKJ%RecCt{Ii1-<?uR(Ip%gY`d2L3b<QG-fX4&shxZ
zhwa2xAaCUSZ*F&?t%j%7qHdiKP~mQA)B|O!sXo-FrMGc@oUU<zL%<<KqTnaCqj}op
z$~WnWG&(pNQ?M7o^`#46A;4;6{GRSI$`DY<UoOY`KwClDROkE7$>6NX!pHEI<^!U$
zIj>Xsh<_V`(KKKP5y@|a{+53Lmr=|`ft|cf8gP$DqCC{l1vA(NjN=k{4o|6_xpdkJ
zK6vu&M<@BaTDGoZg(6lvpdmr8>1y^lPS!aqFz7cav}MU@_kIfKj2U|{1AyJ%!)L(6
z84#YSR^R!?-+M}dbxkD^b;SggB2B5cyYn0tq?sA!J)Wo1CiC(Y@h!!;cqj$8u7B>-
z*JmO6V>Z-1h|rFlQ+0<?1-qUPpJYvBc$nXIOGjSYn8jV4`9sG~Hk~gx4h)Wq6!p_k
zXg>CKcPaw+mjS)@<w)n6;&}Wx(j0E<M)3qaderPODV99r(|VnkIhw5>hiCf&s@<lX
zA=^jkV)=|3%iqGrqNDt+K6NtFkK$uLS+0sT5qJ4?TbDLmW^Am}-Pmj5x%!kj+S%0#
z<AUg!C(>W@9O>AuS$e1VK>WysMw6pSH+ko*N#v+|ETfnFE@@osM{wP^N#<vcxaHGV
z(pNspWd9jGYasyS9#2W{>0er^jmoj|E&#m;E=G$Q-fNixJz?eO%FqB>iP_^+XKXJ-
z811GPm)XS(?vkeHLkxBU?CgA6IiIr@ajY&9i)$9ac47{f8xYjHHnad+_t)15(R-Wn
z<l{#Lu`b|T7%&d^eq^&!fL<3W{94D6a!AIS`K`AU|I*!01cUyBe*JQ}l4H<<;pY!+
zDJc0YDHAF$g0{PscTzBKH-MXWxhdRw5IVQ{2&e07dS}Yr7-XsZN$*03NnL4eKZvLG
zIeJ{<n`Q_q$d-=>&{l}02~+*}naC~K@Rx<bUu3ylXa-*^&zXOG0zD<6K%O_P?n^(d
zpmp!$;8A<;TxD3GU9?%&*?oo-nW+0mr}$y2EyX~&-}#HM{Ot9=$u>!1Dk?{E71f>v
zCb+p=fk7IZ`eLxS=0LIt2W}3WIIS$sU+SCmwjn!dlee}~N5u=V*{3zVu7}lqeiEX@
zBYx()G8?%a!M!p|mjb?prQpOpKu-Q7=OK=QNfs8Mn#xG0Flh+R-%#-V>;>e3Kp9U6
zNZ=zdHIuCqEQwQ-d-Yvn8z}A@kO4XD{Q>(2vA26Pavufn3_j1IhheId?wL}QCMI2W
zy6DUfvO@K7rg^p87{okNNM$vayOjD}-e_eRG&y9#z21TEKVqf4-z3Ayy&J2sQmdGJ
z7eam$0rZ8vWr+N9;P7WwMPCI(RHg0J)}nvzj<P^?3=Cf>_@e=4gTUOn5hOggI6I1M
zUCwO@%%<9HSVBU=?<aH{-~dG<ekgQQxa5c(f*$%QnD(L*cmvS!Gw-Kzj0`~8J-2zb
z{p`wfxt0#tXy`jepryoAgwt?)i_F|Xq+Z(g1N!z(ijtOZ)6?5pM3K~4%MWy#os63<
zy(hGQdo6OBwf-Vuk0Hw$;>utON9W92WvEnAm<c(nudx=CK;C%(N`DcvDOnoQ-nDIl
zOes(=zQ$d3%Pwa?{opX4lBQ$UFpZ^$M<(8^>qH0m44=wr69$Y=Lk?L;9;N$I9n{=7
zLgq{4jn-xB*AH+os#Ros9DzdJ-hr#mY??mj3!T{=&)6v!>vd{cyV(5?(c7CjWgAX3
zS`nOa?!jLZ-~Skm7f8WOQ@c5%-D_-%p|w-GSF@|EbswSDHhj?<V_z1h!u@`N1Jd(S
z&NgL^nKCi>mwX15qetBVfHbc&VBX{xB-Pge%B=bmfeI+t)mJd;bURZ{FMhRQhyZ$K
z4b`}LFI?g3pd+AjgOOAb$OJ0EC+A6+=+iKpqGf{Jx~F=VolkIb&QBL5m0A5JKqL_h
z3_y<S|5@orjtfFDoB7)ZWW$CT9fXOr*eOml8!7P+dv1F!;v2w_|I}kOyd_{}eKQ(v
z^EsCYXtcq2;3ua#+BN=I=HpOe$l23MLz3WtoZ$=e`wFp}(fW}jb}4fjm+1d#22hLw
za@(7jU9G<?PB8*#aZ0?UO&bRwlX$b8y;-mQjYqIIlewFF2T4FX;TLUV;T$hMw>fc>
z(Zp_8&si-d(N<HWs5#?n7@$+p#&v6+r|o4V@C^dfRq=;!c>7PYiGLw&7iR}MfW;k=
z0^Nh~fi$;&rP~SOTAni;yMU1Lw?^_)43eeS8{osExzW+FM@iX%v`-&wsy0jTmMZZ5
zaFtHfDU<iL;{;m|=q)=%!0${kZmIUnQ@kt);Zj8~PgNz8@4uKRqv6~++aFlEE0!Ia
zul}s+UgMkvW0TiYpQoO{D_m&su0Qzv_J6mz{m=$j7=d9%c@aXxUnKm$D43DwpHn}D
zvY}P(QXJBrr?1hkxmcM;!wtTFMy`#;JRaR1umRO0li9IzV%DEdWo%2y5DmVhvp(P9
zixq*h36Z}7@o7i!+|b%Kw1LuUkgRkb2-kcn9{zCy&cFBMH>m<>H$`!9E#I_%YUjYo
zei!p{1Fd!#%$<I(NU-(LF1PL}w+^SFD}Uf_0D66+<HdC&(C9&WPabAC3r?n-70X+r
zB(m_>Jc92zAM|M-$|Yy~@ybY9P8DNi@(<p&QKOPKL}wCrC(4%<CJ}XluKDQ$r>E3D
z%Sel=;L3`NDIcFXB8!!a)C+;4iK^qF%Nyco=$IRl87!6rlf*tZEssvVeIL;P1g9hW
zBkOXJf<~O<i)iHlI>=!Fn>6A2w~<4aw%gS~MvzLqx!04#)68z0yn3w{U&bpLMHbJ`
z^s4qAHNlRR!j8fV_BDr+??e}t->ah^A~nBG?~exhfBuqtUDaL}mk0YX@H9Uj>91dm
zp5s<Bn8~af-^uT}$dct$%YQzScC?R3mSh+2<IR~<IPNRxQnB3_<jbO!tFJKd%V5L*
zKtpI~KJ`IzDR+(fKswn}5LrN$(vGK%$ySN>Xh>pIi(M8WG0OKuH-l<GWdJ9r)=5k9
zU*GokKT?#VVdYEaJ(vFN(NLd50c)w;yQ7m5su&ygsjYtK-Fh$?a=|hhxquP26IpfR
zQN5mTqQz}X?_DJ~x3D7<U_7s8EmYDPdoh2if?A5u6cXb%YGe4u`8<V|^P=iFY@miO
zlcR12Mi{-`#rBm=ow}K;iGjr8deR74>hP}{^ZOH`4l#&X{Nwsw<5TM2cM$lOCwHPA
z<|)<B;9De&eMS_X40MZ9`Q9h>phtFhbHbH#1iZ~#+X?PyWHC4Jx>??~bSyZWCpK_+
zT_TgFUw>xk(`tE{V_j{-jXdU!Zhz01#;t^>5`b63@dc?8$uV^R)6f5y`;kpDwfpPV
z!RVLdKl(>S3I6jCzdro@GKeGvJ{>^&AA@*%PqoqENY-BNyc@hE>nit6UgXMTf^Dfh
z`Iw;KpkhAl^Xw~6m4oc#I~Sk&cP=ENg|`e`j41*OeAXAfsYb$|Y0`$!4Sv!{?Z)uD
zHRhXVCrUt_)jlPi*PW%u`BaTv)~19k=b9h^CqW5Ul|Ul(K-jd_WzhdH%<GD*=(x(v
zi0zf<f1q~xmt_<XL4&My`3+Q#AJ*y4-dcZHCs=I(spXr<9uZmp-o-b~R4xQJ>mwR_
zed{`%JYUJM#OOnnbK4<;;EB*}&!e~km9^?N=LB!BO+MC(;nS?p%FcJI=_P?>JQ6;V
z8yxIXN{~OQJ0?{O|1iuPi8qJef9)(y^`9O4#jUtLV&3CVHR5~Q^B*_5gZD76B5NW<
zwy05hHj^ASV#2!}XP~{S&s5G=&-uM;Zi8AeHW<SzU>$lCP0&25KGeE^w;mznJkKz-
zuywXJadu~>>+oxyj_0NU<V$K$91UO3^tU&M2Iw0I?cz}jSC22HJ;n40Tn)W)tdE~^
z$~l_v{x5$&W)S~qAu_|>Px{-IqINifZSZX$&WdR%+9Qwr(q~wOym6hrHwB2BNg8jM
zqg$TXpNpTDzZ@-To@Q|5n*LJ9I6}&!lf*f+etjz1+U)WrS@jO#Zam$ZsB4&V`p7$3
zKOb<8fNj{SEdn%pM5<k5^?2iGBU#OiApu(}N|4bokzL-|;D7IHKd`SfW_+(k{=Khg
zPz}z(Tj^=3(ILI@MN3Nc7tdjKOa=^4SxBYbvSBLib>1>>u3?K~g2Iatnd(Vub+2;f
zm_D039`+1E-eEgNueN1JNq46+0}F;!mSlAO>-C&Htsh(3Pli@Mx}&jpXxC7nkn<od
zNhwU4>J!WUTdmY<>^cf-c|#<H5!hesTo=Qu)55XY;|h;w4&t5V%{R5%le6A&D|F;P
zSv4cysALpo6BPR6f3FURTKO2m?|(OL)JZUfiNE0Hc^4Pb`&K)wY2MjX{~Z}1DhD4*
zPOi(*CiG>(Z$I+9xBaGy>k)t;c~ds^0$#ESr)|M6(_&d<mE6iUbc^kb<CKbaMv!u_
zX1Ep4Bc&drNQ-w5YYiQpx4vXMX$@L#_nfoET1|Uz`HGz<lV!|4RG2%01(t@8)+;M;
z)SiDOOwDU1Ge}u;cYN%w2D3BjBg0&^gnSArxc$Xqg1s+RSZg(dStO&w7C(=fr&8sA
zAUSl1JY(MZC|m&aA14~E6W=!-TwgFbfPszc*G5ZOydudU8xOaXJY6PliaH#xsF**t
z{FcEcTlER79mSe#hcwY=f4enw+@Z4>e%fUyQieP*s&q&q9TmMtJ(tH!xz3p!YJG!U
z-=;UOG^rG;TBG97uE{=tG{?~XSXwJF35=&Pg7+y*gI$qe6&vqP^sKvGE%TtoDCv%T
zWfboYq1*bwY>v=WZcpUrxc%8$w(-J6nHbykVF!cn-3FU82Pdf^+cGtJ`6JeD<0V)h
zJd(=R8>A!euJ#I)>EXQ^Ir%bX?X2MuzGL>}AapPE>D*e95?(&u|AJ-ZKygV!oBZcy
zTNwZNBvgZaFKqPd>G{Us*NfiUD3?~mhvIy1j6DBj*rX})G4|l;7Oo(hoh*Q>J1TS&
z=RIIN^$Lt3QTCji)D&^a$;!qvLw5XrGB?B0K$vb+()W6@_I}wZ3^pJL>u6I5nc7*3
z)b+fu!rag``XsiydA?(R0MLY%PAqqL<a7l4Z>S6PB%UY8KX$I+*ZviWe*klV3N+_|
z%`i_o6M0vzgNvnhj3M2udxi3%=z4?1Qik>RH^jwJU)n<Hw_L@!r9!7*4X^Cg%x_W0
zBdz<l4Vnhd1^brmUFsqlmSs8uBrTc&k7^CEapQM~KkLvEF>GW6UV%AFuj|oT$dPn|
zoybd@M(4SmwU4Zz4b}RQnWx1BIp1Xf{Gd|Kw9L*RJFsA(=%u!q?rSQ@A6NApY3~fz
zan+L-M=E%nQ~p=HG4y|_VfXRYe=L-qix=dO{V?cU?by-R@!3kAV`go9NJIl9&KLad
z)?q{VtC-s8L8R}N-TshrhEt{#=O%M`?|JaZYK)mn*T*&O^%t4&kVl@}9*P$t3uVI{
zWp0a`bb7GB_v<^F24OA%A1tB8R)c&Ayi(z6FN(3M93a}eZoz)sO&`-rcZ8%|nZ@c>
zWa`gs&qBsW*S*2Up(zz#8*SO~l{sp6zE<5~AM)0@vF^?<rW^ImSHV4g6NnGiX%Pw;
zr?(L-|8KK{;@ksR3ua@XKJou$$+ytiQiO$uB)S#z>lf~OH0c{mJie!E&!UsNbS!&5
z?80db5V=m$EG7g9arA&)5%}#<q}4M^yJ^y$(a3VUn2}*Ep8!jz;rM>UeqL?WiwN5>
zk}51G+mZd+M=r=(_T-zEVwQ0NAsi}|bQ6v<%K1uyE<LRyu2m){L1V2Y!AIfmv0q~A
z@xhgiA)_o}L@rdWu5)g=otg9RO7m@_{-}|hIMJNq-Sm}6HH0;E9VIl~el!I6Qe)~9
ztyeNfp=6N`8=E6esaxM7Ev2tQ;DYr&ojLaR#Yt$c76lfqj=NX+S1}4{tYGVkQ$NgS
z&m8=PUi^z~P>g{HJgUMsE0&i2`6Q;27EblB=1c8hvN<zCl-n9TzrNR)PI2ncsNWX7
zhvj2d#3tUigI7ZDqH%tBEgHFwL2%lkdxLsM$@Tk3e*UQqYCRx0r5<WoJhQa_QeVNt
zE*r-))<2guGGClCT&7%P2eyfpZx%=asM3+(57=cPfC}E2_kdvYg=k0tVd4pW%b^dZ
zx%JE6Xw+yXnjL>hYjQiw|BVRX-ycj#%glXur{uuYBe*ngq%)6rNe(VM_c=-A!}*ZP
zx0j>HXJ)NFV{6nJ*?{{^v%~~96-C_tK=7o_r2OI7;;rb^gHJSqN-i6OXPsw6!rCGy
zM;qX|9N4zjnR8WHY*3|z#$nS*`zX(sbi_6;IP4_?VP-eV_qlu8+Az!RHNlZm1yp&t
zGDow<3fGhz**zLR<<6_89Om5oOhH)s;^BYr<>z@skY>yuJtbKA4VnIeOr}!tW2KCd
zr8_#s+Jz%6dHMOf(Q^ok^rK>t_v%YVdrMg|Wj6%g3MY%izP_dG*6EteB=q?_IA)b9
zO=q!vOyuS|uwEa9)A0y-Tj^(8nf4v9?d0P|seuYd!VoYnczGOzc%iH^cj}o!wl+Q>
zMKxkUou^Vnl2Of31wv#Qxwy($=~b1H!b<%$Yb@l=q$^%*4iM_z9V0RjOQlJTU%M)w
z_-OIWWcu9JhdR>qNrkgMNRHuuc&L#+o~pfIsT~n>@NdWaC+$>?7tdDg$;-bpQlTx|
zK<iw;_HdN)!i4<`B>HMNA*0@6Z6%j;vc}qO9#{1yo45#=T@iT9ZMj?{1Su?Upu}T^
zNxOE^1bKI&|67|&eMN0q(P;y>tuACRmUe9jG@T(sIGwp#v}-V<CeUCN;T4k!_t3=C
zG`J|ZqTMK4)mK@v5$rP#*u*M-V(r1eH*rqj@Ir5diCtL9)Hbn76Ho1!ps2J3_#b9v
z!@j~i>A6^?t`Jk}O1^q!qWsO3@QF%$Pt{aiF$7%cxWBK{`ZDp@xiRx;^LNi`Us#8n
zX>NtfBt_qKvnTbB$8S9PEJ(9Sb}9#fVfB4Xvv$iGadsOf-2l4=&%7g*rMrG&1;O*v
zDES_@Vy$dnAHm1y^tBFZ=r)*A{YoNv3FNadYpi07wLaBq6QW$Py<e?}w^@;YGx>z^
zBGyOHxAMI(zpl{^yfl=LjxVspK@YY$kv`H{1`*~9{J#YZE*4Cx1IuI*?H=hnKbiJu
zDhdz8w*QL<(M1$_;^WCjLAryKO*cfyb)Kyr4gnreu`0nAx5|n<syourOB3-!R;*5W
z#_iwz5fuVmLoul;2P^kE4Hnji-zk*XJ85w@Xk2s^yYM1FPx%m6lR95i%mVY*sT~YH
zTAFV5jSeF=Tqo_W>z1=*lD^)Fwx!wat6wZ_$@99yMzO8bu(-oEbFkUaP>Ck6ZHPeF
zF6utD95LS=77;|eoXI^SP9?0ENH$ivXaLgCeB}{=ZLKI+V4-#|0yLXv3AJ>p5=W30
z=OK<=nz93nbesl>H;<(d64UP=SdPL;kQuivWFrkqBdYbsDn88wsjBM%R=$hRi<Nw5
z+l*lN#%p~_xZwG*)R;ojFw;;xue4HO-=HCrD!cfX)mL7kw_s#%Xk*Mi<rA|p_z2F%
z*MAzesxEN|%q=R;aIQ?h35;WA`q?JEoY<VEjw(T`?>mqCgjm_DOrF}>WG^9+s(F>)
zaVUWljIEmx7TH}fvK8hOtS;-Rn!BOWmDldLp_j+Fn|XEC%^En>be`{M22)nkE(cz2
zQp;9tVK+^)Gra%2DahGUn0<5;Fgc#P$V1wJUegfeR?4CK+na}vVLk@g)n%KEJTHdC
zuv^*82QZM>X!SaF<titprQ-ZE3A7`TF4OT-qwI6!Pn*joca=_=y<B`wy~A9nlyLHO
z#gd2>hb&$QKT3`VpYxXbGz%ATL`GJm+KOo=@O`fLDIU{nnhXDUWE^y25v$>!zRJAn
zvkLna=8O(-05mw;5zv0~HyVt_q4_;i)2<1Nl}5#a`z?pQpPjAwL$D6^jHYAjL8J0e
zOD*=N1ZhOB#=88uf#3#Wb}8JGPTOQ%sAhn&$?0+F{DY~Rt}b=^xmtS#n=adi`F8pQ
z0KKAo49y<A!5;|PA*da*e5^NFIZc|eQ7M4%%?d2k9UPD1ZGba`Zx*Xot&#Us*&}RL
zPi}7DK5i8p+Af(Nz=c$8ahAV)O6`XAA)4%SX*s;sSMn)KBI4`k?t^UP#>q1mp_%ll
zNZSqF56xK+#V{lZ@~d~mI-fHcjo5ZLD^^r+7L7uU+4(9De+&)0jWfLfxS#$_ZpG0R
zUohh3XfYrkrTI=ZZWfEXD0_Nl1xBoItyPYvS)HdS%s6pQ7cMrbX*X6(kw(pK;L0tQ
zo*0~aZz|O{xUy?^6mV{BM;|2l7|ax5rEoV{1*PSa6}Z;%@niGEP@cOaFb4|~1$z4K
z!>gMNbh(mt^lG#RYK{(t<)T+|*wcg{#`4t}+tZ(w^acz{Gz6GM_=qj!$T3&*;9@|&
zr<<{<bR=<@WnX*mTU@Ogm1NTn&X;c<zAtA~Ik5j>_P|FJ;{Gp3(2Xdb{7n5Q=s=f&
zZQp8%Qm_Cz#x)5N-j&fyMBe3chQgiwHfA%l8`1a7KW?O{Zq^hVA$)Q!hWOmBAtJqr
zP8k~RvcD}q)-d24Z}1(!O%3vjozb0N#?`&KxvOJ_6kWHNYL;@7{gIy~z6BA;y_kHc
zgr`=))3AbFdc$SWBbp5y7ieW5Rsko2b$^mFwzOAnlpU+!dpOW%+^Ywv<eV&8b)xsU
z@6G?mdI%jJU5HDPFs-O}L%7&%lZVCf1>}_G$~lY=gO5pr$1ApsEUQ{&+u82zc1ksD
zI6X#dj!SR5fp3sWqeijh2xGH>uXomTQ*-wEbOBZ`&&};kYL{)Lij#Jo&CO(wNAU0E
z%eGCF`}j3!@DF1p`BPTS?iFSDqr<{oFWBGo69xxsM2Ald#9l}k2y7`uVBRT{=Zx)n
zZom;7AXx|*I1i4Mp9{CTjr_x)%)R2|ElRtQI-(=VIUxeBf&N9Y0f2i^0XT$x7cqY7
zpNs?Q3nHOE#>fmqWm~G~X{R@j+8uSSr^t%qse86sKDJ!KRf`q*u-g#%>dW>82r0YX
zn}<Cuqi`B1^3LaTrPqrT>N4*o=k!}q$jnl+p0MNN@KW0g4!#-tmORG{8@~qS3gi%c
zdB=V8WNkEj^Mw0T<ohvp6`LaXA45$a9pcEog2l=k1*{EbzgPr{kCZ0P=B=i08;&4K
z?Huu4UEw6=VqYRbkbKb}IfBYI%ZHlto96g>Us<#U6SU^|Syruu-Rj5=uvVp3*@yW8
zpOkz|`TV`k32WK&UDHHbXA_RVc<Q!c84}#w4@%XVdsPf^+v)4{+0V>ajX!r<c(bXO
zp_b>;H^|pMnXEBTQD$~gZ>Y6Cl?(>Lc_EuV0r`{Fz)6+<2y)ZXD4xWa?)j2)WFd3k
zF!OUZdW6Jqj=Jjk`;)4uqi&hXj3d=O#W4*&e0YPW!QxtSUTk4eUZ5wb97G$kfh#>_
zkZJx<&<?knPmgaCX;Vm+W|_I)r)6$^dNG~sLUsSm0AP;v+W9-8eC<3Y@o7x|N@a|k
zu>fhv0CE!XHy+a414UE%R0jHF6z_Gp({cz`wk63AX^|p(eeoIB;ylKuoNwe)y*Z?o
zdDNnodFJN2`%vV`6EBcnsfNK$dl9l}(#_5}ypmpq)y~_h+qB1G(hgTyL9b0IySi7D
z^|G>ny(9vg*Tw~iow_=l$u#4xtygQyl?Uwt>^BsP#Z~dtMiL~Fce;z|k7aAwrV2-9
zv+U?ZIBc7?at~)bll2!Y>%HPg%U2xi#*2Cwhme-f5^{XxAjOMc#tMzkhJ{BhV#+EU
z<k%KHTttu*7rEyjj$~>^PG?I5pI_h{RKpeLYV*B-w=R-vM#TOg<wMJri@3LtosMjk
zVfr5t+-$c}i7nB!+!fKav~*@;QZwmvOB-;hvx1)H=hv>wd^8J|Kzeo`5p=#&mY(x3
z?ihSI`d;BeCT6FYZP?;O{Zb-JV}i7iMEjhx^uu~T@FEeRfCOuii~7p4hIv`nZ)B8J
zd2|+>6_qz?SS<8J`3A)z9rRYGa|M;AwE8twd`6xdFSe!2H1r2#iqM@fU#K_1jUn&(
zicZF+oaa)S3~aq;W?f1-<KXZIljmA%rBg&T88fr`0^6L&19HL@R$9qvD*R6Cy}6~F
zrE~+mHj@<V$7>jQ57MG_KBhLn#V-~VPut~b9N8m@u8Dj|oy#08&9C+IaO4uASaZ+8
z`dqJnv7X5`<M~dHQ`Fsez;SH1@}gwQh1Pn^*Tqj}ICVd)Yi*8?(xQlLuZF>_SkNl&
zN+24GrgX1fx8<JNi@}^RM6Bp6vh&d$>}*Jhmvx~b#FJlilxH}g++YIK=U}xuO}%wx
z`OI=DEts!iHJh{`=#I_DDo2Biz>Z9^uL(Ymw}^hj`9yI*ZdcAo?#lfMJvV_Cp_~wR
z?K&m^buzU4SA9xw?xJKDDH4i<L(yTrRhE_qfdz7Q1sH%>weIfDAex12tWSF-6C?lK
zd;zA%;gB2k6z_#3+nc@-%S>x^mJ|f2rIYltuGsMMWX*B$`!A%Ud0(rAbj|gw(^nA3
zu!|8T{T|j1mU@|d3zudDwDG&=|7q_#!<x*taA#0PM|2RyAOhk<u|NU}A|gmEiG`+O
zfzU;eP6VWcl1#=yz=)infFMOcL<B+ysR1?8M5;z=6e1BRF|<%p?hYQEf#Z12Jon$d
z{?I3ceEIfXd#!i9>s>1WZK)Y<bLGR>kr-vQR;M4&x`_Uv)_wGXREEJL$4Q(?^}}^J
z8%goi#9SqPFVQfvJX)YYQ(5uk4pOY(Fnqm1r(#gDyV)LNTAN6#FIiYipQ&NK_dP5m
z+@*Ij&+$qJL3^k^cb(fgu`VD<;YKu$oPM01ZlHv5fUP*9;Ce0gazJ<Shs69R)#G`a
z+cfa?YHv>ix@Di22K8V2erh<qIm}9M3-W&BFPF{5_K9}|ns0le{PZRK!$Wo3#)Cx2
zFJ;VV!(q*KA3Y0;Ll%iHZ}IMaHBn`5uk=D?Qg^j4);+9|;2ds*goXWCz}K%?KW23-
z+Ut<)d}_8nv(&dYe1(#hkV`#_|DkEl-A8ZluS=ilZxR=Owm%(`e5C2=iDl<*&tBJ-
zD|LJXcW)aWJeHQni*<GG-W{MzKiQcUR66(UVD&{dX&h*ddE;s%#}=xyO55+x2HDiX
z)ZC%8=)udn1v|~}H7GQ5G}F$##%<AdIURU2)8ErAVEf+nCAf3gl+39MO8Vq-?4EE%
zOu|c?X<&b*8AXL$N!p|HAYyb?SzYhFz6(Lm1JQ+5np&$_;RP7cCZ+DNJ1*GlN<6KC
zK+}#cd)Lr&rnn0e*d_QmP6j9kr^Fm(?rA!_;!T3wFKK$vwQcOaezsOZug+nyYclsz
z^X(k%Dj*h3fZSibo9cP(#*oa#t__7&lCYsW;r&vh&Fs_d_n(PrE$OlP7nh?H(Ph|T
z*s*GXY_u~Jz;XOjjYPeppE=EkgTGP!MGh`|swpg!wy9K&^BJo;mVnDniUWEx^j6vZ
z#`5bJLa()Pky?&OyFI8kx|A?yB1Z^HXV#6-$qo4<#h8YIf9(qRHBh<bQ0;t;dYDoa
z^>JcKZNT}hLqN+BCE1t1w@$R$MgDF?ix#`7cL*MLJ<ayC*UTTzr`{Q~1)d65TKQ_s
z$Z;h|H+xWNJbIP?!vt1f>sr}no^z7>BY4r=^ko;$DVDvR6UWEk9p7!F-El%cZ-4XL
z9XUtD<+wR5EwslZrbj7n6aVH!)9|XaMFIuk<{Af`^sJ8c3nKrEX7SnM2oXzT^E9iw
z^~Df&hI@>|H7zWE9XU_AV!T4E=YfC9doNSp4G0-GJW@<D%X(M3e%9SwmEq&!?+4l*
zirzbC;)mYS8rRt2TS{hjI&u8p<H8jRG$NgdJ*>5p#YnozL)xqDYPJ_2sKxLVO--N2
z%{mI@fgyfcU3Q&9>73(1JH@H1ual{6TRkW>fvv@_fL=yF&ne!n-&nNm+=I?FjZKg<
zuP$ed%Pct7->T|B-Qnkfa|!#BX1~<DUoZ_2RzuMZzYi2Bw1@kic=0k86@Sfeb37ZK
z`2uoU^L{}~Hnn?NGDHbu=}MZ(JJ578E!V)lKBHfufBd?p(iiq6G~ivb=!T)$VNL9U
zXFt2tXJdZN)Pz&QCPRCu<~JAmwZ)&L19W)B;q?#ywc`4I&p)mF-L{KrhyVBy_)z0y
z;B4G}P&iy9)Tj|_!%q`yi?SdjJqQ}vyV?{95rrhpw{iXwHed6`FV9^+3F;Gx$GOrX
z%WtXti-@wFSwXC|M_iB?^m2VOj>X$b96x7<G&TRpGEw0_*7N0EzH&DeDTe@Tv~UDq
zCr(r+_c3&gZj25{8dU0nL*+wZ&}=gsHjUjBiegyM%(#0!|IwQtUJIho!7wxE2ayZg
za}@GJxQ+HOKGH1&2IXK1LbyFn#M%l(VF=gJg*}9Kjz`%Zc4GH`YH)pJtzW%&oC&bZ
z+PNG_kg5q)^d2`hX1D7l14f$?9NMT{xNVjR9uP%{R{V}GcxBU9dOCk^o+@D?+n+10
zCvm`9b~%?}Nkou7f;D3a)NoFs^J-%4Dm^%Euv7~U4Yv5F#;E}iH8HJx9r|0<-ZhT|
zKN1m1v?8+rd`=0?@;uvF&f9>&e%@CT^Nn8WpF#Qg(cVhXMy~NW|H&FUxgoNzG3;%Z
z{W7SWWq0n#m^jh=_T;zUP+Qm~_%F8p=mCZ=tVRz4KKqPRHTvAh|IoOI-pKTU%}dtB
zK}55bxYq1lm?^#uy9ZF9y;ivfv`M;GOk7JXGFaEp+@+t}UUWDn)jLk&pP)S<T5LC^
zTu;cf*CRwc!K!A4r5c*ed@iP2;Llm7%gK+BtSGIjok!0LzUn@owtfjpNW>JxIjo9W
z6qT|{RWgR*FReF%tw)0HbZeXWcv_KK{xsj(Q;TZUWlBBtkIrYk{%;y_zifT#0oLnD
z5G98WgJVv0YiK?(M15{l#_u4cdIuC7otA1#fj{T}?<wfb8L^L7+fKvl`Ku4Sxg`dR
z#jwkGI43r*dZ3b0-ACK!S&7<msWDvbYyE@24S-7l;NZhqkGG2~zrL9uuAD!;w)3^4
z1a@!!G@I|a-$r6)b|2Ad_L=klU0{`K<b}kQHTRu(Z-L1V(T+!Rr-A31vtp0=L{n7+
zuT_bxf4FWN!qxIl!}1LvaDO1=N=+ZKkqG4%qV6@RleW^35Gz}WnU{NrwFDjn-z#tC
zCNVSA1;fQ@{v*#@3tBRMHu6!z_?4W-IZ4cXG=Pv?V^XJ(W?e97akq=a4AMoy@lv`)
z;y?50e&IU@NF9+laQDX2ghm)t+zvugTSBZw5m2_J#Ba1G|AT1)Em+vvUH<m<%Fht-
zmDhZgO5KKugx!31sKoG}SY`eu1d>pnR@Epv^uO??zpiU*7=W*v1Lr)Gz7ecnb=Rs|
z;dD@pjM<_9{x8r5-s|QOkU)-mW|W-#ryI4l6;_hZ!?M0=zi-%?aC~#Kwj=qLn6SW{
zzj$kjuw_(xB?e{NaE@3zNQ6O&;J{@L5o=dUNqcL-ASBj(2&pk0wZ%sQ8|WZ`HE@uK
z8Gt@<vy)5Po4bf(ly@ap8M#nF5)h&yT^G(T2{`K@fnH;J6`x?YU#elBVF%s#s>%xp
zF!KSFb)pG!^aLD=0RNVmBb9%FK`C;k$W9UB$})p?CI(GYce%7jxSxk2{#0m0=>Q&w
z*bWg0FgUe>khpawm#cT0bsz*$lU(>*>D9Uy9~PY60iGD+&pL_l^Nx?x5WNJ7W&s2H
zZX2ySE%Ns|ZNbH0yZ5^}UM6GQWNK7(c-w$!0=@0LG`?oA0gmN^Z?hUJuPdz19^#P{
zn#4<1@=8tC3U>!5%xS^ESBR(!WarMfNY9>X@KcWRitDRynB||94c{T;HAaN8eK0I;
zR-1@V5TEemYtzQKdKvI5@?Ob0_=ZYybt+moYqPhqsu;YkvbnH2aXjC=Vf<_cyoSzX
zA8DAKDIOMD!Atw}vjs;@sN1sX=Acc<xzC<st-5@;5st;NZS+R0PIAvVu!C-~&`#-a
zq~Z$LGNt8>MPP&1u@HQ@H2EDiSCG8g)9_BJcZIx<$xoGsg-3ZMT%~w=9L9$VB1fXM
z1;=db#%b`}b~5KhYLpE|Ux;U8qEpN$ZOY}`)G1B%?%&(;5{Tf+VV2T6yyzw5($%-O
ziwFujBbZ_287B6W!vr}sN;@3vCng_;tGEa@zfCTRaCYm5@#4&(4JJ`Sz6gj8uH}3>
zPoPQ_1e^4kCQt&?6`F`lzufZMWVEaFw27U7YxZiqco<x9dm+QDVyp%%&!gL(yymkr
zldAC7;Cz+aM2hcpw^KvaYH_z*23GKUb3+$=35JtY+ZS%krE)6dqB=(sUfY0?MuUgc
znIQ5@veorlUU62gx$TuL6IW{^n86m;^2gc`VesChN92MheY6Y0`HJm&Z^Wa`oD%h@
z<BktR(>{6hTdP-`$UfhB6TMv_iSU^cl+45hMQP*D6H~pj!@&|}CZr*Pz&BY*souQM
zS)sE!*+~nh39oMbs1j|;9WSjeBnXnKrr^N7F1b_;Xtem>O^KQu$8<E&^T^JdjGryY
z$%MMCn{JN$USj#h0g}ZeF_qg_T<euJy^4F8Xz9oXeK{`bsIeP6ORJcHB|77HcsVrJ
zZ%HGs--$>k@1+M9nQ+eu&%Q4a2F_lQj30vN<(?ZUn11kxFZv@@#?raida2Oc<p>xb
z6RxMrU5!nv3f~lDGHuW`JLcx<-*?+hT?0sV@XdGzoR75(5UIZUOT!xxiaeS-h5GRW
zsk^TW8OV?FvapO0COH6HJjAYnuq3;Wfq~}>sR$-fkZr_0Ot%sLa?Xm{F@JWng(!N{
z;%@Gk{9W#!8v99>NQnvZzF~gIQgY}Y#%5P>Ud&|>3hv#qi=;F)qP&2Tj$-6E<G5EE
zV3mmG?lLGbA=??3l}*l_#(|aJn7-)`M>WmGOPO$e1tmY_hIHq&_eGP-+L8^GKc!>9
zYszn<kvf-Cs4O0$bgSxX&*vESSjiWkSE1b#Qks<(Z}8w<C+R#xVl5^Jg3xI;e<vRi
zdZyR+A)*OkE8*zDHv5ES?-b@K^^vVRFGNu>UE8CoyF3B!tmqSnuhXmN&lG1eF&(tC
zzN%lq`4d8yRh{JD@ye5N=Y2-E0~X6AMZ+V*FG+Ru#n$3O;D;%H`0qh;XHf^=Q$dL!
zfEG^Uc5ynJjV69dD7uDX^nq%jB6uIX((0A_wavYP-%{U)mKXK(&0T4%m|8)!%Gh2T
zsGUFUJ^=10ocVjZ4nmS64~{#7^6P}L=@DGPmwj)Jw%1#_I?~Bj`-dM?yN)9dp1D0N
zLh*atx32ecJe5;Mg?iXKNgOv{+i?qxyeJCcVasMcx!qMzbyp$km6A7{{7HD1^DD{&
z)2Wz=6nQh`#yS|T{I=F62#F%dOI`nx$+IEW27!%4^Tsh~5CGW?MAd}^Qr}HDm(AfZ
zKYWf0LW;-<TL@2#NGX(+i_$3Xg|_zf)mRM|P4idMkoQZb+0_FBMQE;RlQ%kmlG#7J
z_7e}|wwb{z5wE)s;7J0Pi_$L6QNqs66RBydP?RliAMqGloBuc(25Pz|bg&p|-i_1C
zZ%qcK?%jA==+TcvDr{Qt2R>e>Vk)-hgdjE0$bmhe*me&?F#zT3Y$^^hcTcBE-JRWk
zc0h=I!$Tify7I!PB3H@xxU?K<p+ZPHG^C{$3B`z~nIUnkt;E_6PZ%^DQxvkhbn946
zVov+Uip)fO15Ox9g~`_!l8c*Wcd;H;I>zB&s-KUCE=pkM02y3nF)r^Vh<O=hn$}|h
zoRg(bWQlu`@zsNs%0zl;LBDO8I{y*1ySxgxO5!hPE}3$v4R8FdR|&V~AVR%!)0k^U
zqz`24II}SY<9b&$>`zct=W+khY+hxyWX%N-|I1ez-<=NDMOB~5<ptwt&^(?aQGn$c
zflc|g^hG1eJD_<Cz%rg4{lqSWkPv7=<((H^d;QFt)+*>&Bg<q$ex<pK#15TZ@9^3A
zE6X|Af)tdkqiwoH_P6#9>Ken*xu%!@o-?c30`XPO4=JDYBi3HH1K-F{aRZbd0s?)P
zaXB{q{fYRCJNynOzHrH(o)n)n3BYIGI2~7gCU<2;qokgz)-AMC%_n45x`c#5<sY>`
z^Dx;cMrSI7gkOukT1M-foI~J=>V}h^xtaZvOuE1~Ws;Zi#T%MzqdK#$9cpt13@WM-
z0^j0gz!*$}K?RRGlQECH4er&blft`g*>fWTZCF&?2WCJnonIr9#l#5uC2Qofh7_lo
z=J`{46>}{SpW=AR!8E|5`p#iuWs6@r$~M87?;vk_USh_`hTTs!7cl#fSfN)0Z5F!Y
zXIdwWuxo(Q1}J=Sm7Dr+TAoA4W}RWPQtk1O@Q?B!$ZBke@+^c#T4Uc)4y$mQf09{R
z2E&b@T_JRpk-gQjVF<Tf#a6Jxm>z&sJTxd0=gVZ^^>|zciXrHAZV5k=U@tM#?HcuV
z)b-8eNV>_hdBYf+{xvJ=i^B@d;&(Mc#|mFUc*ovV8I|=0Ly5H%-ew?>^tXeN4!}U?
zX_2zZbD>tUQ=8x#gJN)WY$P-R$O&a3aUD=TSq7sRX3lWviE?WREC)ChS(#Wnhsg`c
z)PwQO>pxSbe}nB`EV2^-#>~Di7TOwG)dRAGhUJA84Jascexf1yPN^t{$~n`Yj-s2L
z*h3?i8#%%Iiamp=***Ij`{qEV^i>;lB+vA-o`baVy4w&a2j_ByRS|(CcKJRe&`?l9
zWicBmjuJBn@ZLkAsote&4SmJ_Ic<Hkxl>N;hR-aqezwx+-z?Fb5M-0?iopk?1W5<2
z(=B@O3?2<V8-7ju7W!#LG`;t^)vG3o-F|!nztTwQ`EsFcb`$y<?+2TYcLlg-p%~n%
zz8YdlpFm{`unSW($|k+mo(4R79&eRZ>J~{)zx}-X$)Ban&Ro&T2ot2v^fCzP3OQ8Y
z%fPmg4)99~ebp^@Qw|7{;o=b4OA;}oebM5(IX0j$9ueHX4^S|!T7-u5F55b0RGezE
zZ+b$gBUlQC9$H)xR-JF2iiF!kNR~8isg<R@#E$#8x=?0i?hB}qz6z!|gzJ$)#hAVF
z%q{4@HNk5Tn}_$6C+_+lHfNuzpR-sS({fRIwsK%uROh6R9YC&Ai4;4F=xAFZ4O^d3
zwbl%dyE8)`lrdaFl$UoPZ_g2oG$$we{>ucpG4OF;aqI(uRqF82Sn?uK@21&eyjTj^
zzQTYmI8RjB9nXxd6{k2$<B}W?7H!Xw+>e--G6)0L!f*HwpCVcf6&jlfbBE+n^WAi#
z3FKA3{Zt5@ZoR_~YX71bu#uThDfI1%=PO`PrgFhbe5b%y%^uCgK?=~b*$PIVcbsu9
zLTDvxUlC@>Kyk<AY@tr3gzFup^g16V%Evzt(!K`0f1AbH`wkV8kD;Ihgcz`07xD>M
zrFz-G+}cS&#;rgL%UcT-1ZxlB)Mg0_@g*g!J^98M1X#yWIN!im%wlZ`Q%6|jHPrrS
zDP@XOM!T@*X3~I63CI#+8-&!YPI)DBn8=C-6-|h|IY3OP4A2_I?IDO>JozYWwxxXt
zh8vX0E}Yg8DzjC`M0+8o$fEsr()<F!QCHkB-HeMH{gBh1VDTwsPH`elPaP44O482(
z9|Iwj(Lj!wu=v9T+%0L!%AHQ7-WgY34#Nff)oTF=Oew)|clMXTp+2ee<f_;NS;<)f
zjzWC|VALFwo|izevx=6oo1cIFf`7LeB9W9gOrI~9NC<^~oB{<&Huv^Wn$iWmgltw0
zIGuPir!IPS(KNtb-cTfPHi1vDLQ@(z6Qvhr-o*N2=o-y9Py>$rbP$gtS~LI=&V-8|
z71mlKMno&hrAOJIIG7*4Uav&Ztxm@q1}YY<#O?W18@N;L0<Cu3sF_D}@&WcZCvFcB
zk5e&ms2evhgBAMoo;&F!Xp{~Vn~j=|%&wiBF`M#wS+6RTN#ji3%z%0zJ3#ds-m*!j
zF@i~F<uI^`L^@5)3<f~IxvyA>k9g{{4~FB}kiX=}#hJi5PwWhRyLyN8Biv_e+~t2N
zyNFl&RAddkXWM?!%W5NZM=-OUirY5|GH0T+5r~|2Wfi{7yE=GOCr@g#uukY@8k;!@
zk}K;9weeINy@V5q=hG|5;qYQf|9Kj4y`t)vm?2S3WvImDm3uwFtG)AN_Nvmy6B%%<
z&db8iL@i=sKJidHAHk6l<Ya9BVZ-}r;jUkKtzy&*#oV2@t297m-8&5aa4P>gHcHz^
z1^H6IQf=m3A@I*1bm!*b`S2m3nbsu;OD+Hh|Nf6qrw9}$+lp~>4e&lfgwx+kXv!*T
z=Tnga%B78t0(($@c3jg4#BIuB%t+FH(n)LvJGn%(d&8TuG6K}NAD8_kg2@vroYg8S
zLXUAN<bw7}!oAn>^HeW$sR)hhsp;)HZGRfXac2K5UZCA{p9=xp>daDH@A<lMZ<>?f
z{2BK6pr#m55Bz~<z$sa2;r@yFCh6sG1uOgLL3jbqRS>2C5qSEu16T;SPRzT847duN
z7V7t<4%$cB)$n<1^P41u!kSN-o4-SxnUVX;u1{~ibPVVw`f#W}kWhrMvlL?Sc8eLP
za}38jK*m3LQ9@HKXL=}B86x~7*iP)y`gyG)kCLsQkS$3DzG+wpT+<8i_82J>r1C0Q
z+Ni^)Ly0fds!jkfmG`q`)67wjZGV&^Q3)tUW|ha7;-5>M(W>6KaILeTHZRZ_U27i-
zjPr$q{4bBL#~kRtEf=%o-+b;LT;3i87>}3xoxR7u_{5)f`sO3+VWEhp<$vSLT<ib3
z<nQ-faPD8R@^{<*w*dI=L*ISqQx5wcE8k<~Q;z(O3Ewf{|2s^Gtr=~Wm0rXl2sfps
zgP%h4l=4H&H<%)OB}{E!kAyyl=IzOCcjK3lP_{Urn&?;_^|pnOveO`>GN7S)mxaW&
zU6h!a&?VM(SE+I{X6BwI&w-A1bTEWe90W-U%R2y?j~gtU#HZTYp&lok(k(2nm#gQ6
zlq=?hXvd<Fl`$sw?(@7IBuZ2Z+F={Vs=#CbH~h6xSqJDQ)QSQnY{ns6I}VK;)rSE^
z?gk9%DFRCK58XhutM!bbVkVslsCx=IAufh+99B?*Ux!9QmPYaQ2LGPeUh)8a-Gn%K
zvqdkto?DejcarFc0=i5^ohtYDHeugY-58B5Lb$S}I7b`cP<v5O9``b*h(N0xf<y1@
zgBRNO49C{mgOV*;+>7z*gefwi=e$ITdq&XjZAr7Md@7cJISQ3xQ{=|Mf{*~$tW<9d
zy~A2!2fw?Wd`db-!ldBM?z|ALHW-tt%ZjSsSHtJ`f<Xh7;HV4HD>1l08|mr2Qlb#}
z-Z0pcE>611oPB!g6c@KG&!IZKp*ega6f_Y8_(&?o=T!i)R$Mr!(7u8Qd7KTD*V(3u
zJq0H(b<%@${tof0(j~y0Qv2RKsfBEwuy5V^%;t&y2Ds^J?>nh{<uRqu)S~8zdT(N_
z84z8cgEqEg^laISI=^QW{7n=iaSgGyE86rbd6#Dtit&UzGeTZDuiwQZa=<ufkqA>-
z`HJh=GVYDZs7`k19@Udc!txD@DDQ%L9JiL3iKd~EFT`L_tuh$Y>KJUTLGaLjd=toL
zr(K^dY|YcOo?sWv16+StWgT>1u%jr1+o9X=Aff)jXh4QDe{~qy@nDx}LoGt2n?)+g
zm3%*#cb&U(pk!;_hdm9ooP449+!czV@0LYht)!5jLp^B4@WXU+?<?5{&6iJs<(&#f
z(cf$^;l4zXXZSZ`f_^`(@_3Qt_h%7xKw2csPWF!ri|d=6mP>z9+Ral@f=^mGnEJ9t
z^Cb6^h{{DL9X_CLYe^GC#o#z`D>R19OL`HwxiCc5!+>5e)OoUaqhN?vw}M!?0~<+A
zWCam{P~&6bBxb;iU8KDIwf5UNCA1Q6|C^PIP+oU65aXU?z@S};Mbk?JjUartT>xP<
zQu4$`q6A(PV(t}&vh9_IkkI*XsGcQ80$uqySw1};?bI~d1vJ+z8qkMk)rnTDbXN$a
z+=P|%ypJX`QUZLvWNHHukr=H?tQF{jg4weIWjiWM^uac$Ln<ntCTnfD$2)Apu9+En
z4ILYAhmQ4vFk;3z1LYIQ&)HyVu=sVdC`nn>dQM;Xvi@~ZKN@GFPX@?Ia^Cd=m8!6B
z=#fc6?*(=P-Y=XgFQ8}UOutL>8x3R?O;_~J>?hXZq!2o<yXBl=xGWhuFr{C-(h38C
zQzX@jnhFeu;NL(ox~ux4zcF-F<typ1F?<_UbCG*(y7X}AHIa(s=82dzICK>`lpEfJ
zM)Ei1#XU^<r7}?tNa?Y~Xyl%<Js-UhWi+6|QUZXIo=CB6;r#(2Jy$2OQi0Ah;*;e)
zLySSu#Lb;Fw5gj>Nh22NR2G+G49uGQqOWGIDG<y}O123A^6_^UO<0*R-WovgJ>m!e
zEt3>9a;o%PS<jVt&@G`mw#*g0Ocu?}fENx4Jg!jX)5>aq?tfsjxJ%*m$effLB<R_a
z9(ufrDR(N`2mRB&&bn|4b04wxVHe!g!IU>!s+Jp4zAbNMKpazzh`Yt@flzu^HNc?G
zYl!mM(Itcx239Y@3{|x$u~wRpYDvB}F8o4*MK1N}N6@4!e|tF(3@Uu-qRt0J)zWpo
zu>6n-mECY<PLqz^4oY*u31Q!-&dAJR7L0!&H@>!!z<K~!Rm*@w)j(hKZES5AIrMT_
zNH-X$9cAUbAFDMoFBH^{+F*AD8aS|r!d{x6a`3~Jp%~8K5*aO+>8A3|KDle-+zD3*
zX}>b{eN|hU#o7*LEF^(k5K>-1g^mqFh_x&fgp$~2kd99T-72)F@gER8o_MIgZ<><(
zv@aSTn3Zbc_aTjXN0WZ3kjfG$4bAJdG$Z1%Y-i{ws?v|oToy$r+Cb^2$~ewZS|*;)
zKu9+VAu9}OSaOW#U$u*nV;F*OV;T&-Y_jNc<|lnbP)X@?wzC<?YZ|drlz(A*?Ibva
zsVUNwU7;2jEEYdaM0`jCU8Ip>^h8Sc5~9j#UP>)=EK3WHD`R?_6j*x%p%8vLIpBlW
zrpOm=0Qc=d19r@&LfS@pZcmds?*Y3ILh?aDNK?(wv6k9o!^Fbr@t*vyWXx{{KpD;{
zIRWe=$q%A)L5C1wY=-PD0pCV5RfBhEIdmKZQ>SO!(>@k`&=rDj*BpoAW<d8_MLw&9
z1J9qXz|Z+Z;P1WI{XFKV$pv<bM!?g)xk*)H$yolRqTUD^L3xsFMN3ch))ap9=}(AU
z@TmO?(3B=GMejYj(60!;ifK1HvV-oQHeWj<H3FgBnHwz(3GfJ_hKEmehl7#jQ=Q;9
z)4Ia$#AsL&iqV)v&7MqY2TRyjJiG)#(j?fu$uL=C>J-g-0W0k19liJ>@4=j^o*=c5
zccR+`H203yAY#EtqHVmtkAsBcWrKSkOVo1PAA&hW<S0ngjvfY#3gJWvTW2iZ3xbcL
zj^Fp?OX}$qJcj1M^CeRj%A*Sc<AFHnh7I-WxJND8`V-091F^NB2?#})I7^`4$I?6K
z-5|yc!F+UzI;YJaj$?s^HLU@YvqOi8R)SqLq=yv365}aleqNfJDFw!AS51bcL|eK^
zIEujv=LQi*0k;AL>=*5=_{(FHM2*LmJTnNMXJr8Z&}LLA6h*PLSo?E!UWnw}j#nAQ
zgJy?`kXAPUx72NLzM{7m;5_)0RIUbNR_**kjsFur{uJ?qx5Ac3fLhUCp9EW8UnA0H
z53p-S4v>IDMEr$;SQWtg%*RDB4ZWcCmvS+xrL3xAKZGJs>jJExy+o7<8lX$&;7}FN
zb^`md53&3BO48&fE6iQ>7qJ*=2gecLQ>)F(r3+2^bF-q7YbGihV7+g<==mYTp6q^~
zZaX$lCK`+w^bYjqwnOtOpFmZ6P>c(?0D+tPvOjF&m?F>hetX<}XM*r^rA<D;o{NTj
z*B4cF(QVRswKFzHsYhD{?8YdYGmIc9Ff;$C4%*7>i%xu4f(4ncc)taN4%jFW^le98
zRT*hns%&0mBhB-^+n_qpFT0uuF&qUTtZZ(}=J7qB@b>v7HMHVYCfo4Fz}LJDY$?uA
zpREwmHb683!HTbi$Eg%$Bq{~Y#PHt%6iZjeq1lt=7YkAJ2Sc$|d@BJ^nxC|Pb8OsN
zdj&Y7jjR>SKNiH;rkx)|@a}g|jI$#U{G%J~$?`#RX58uY*s=#bAle-+wrFIP3dc7#
zb~v$}8{_RI9NodRzMR$F?E%{S*3Oorr@>Ho256gM0LI|LorTTWwRhlGvea?Wi4rCg
zVF?sRHgo3XBq$KJKoGbQmPx^CAc+IA=i>$#R96>_|6$g=?GQ%>4sEiBP`)*h`|Kt_
zg#JOdOUjEvC-rAZ43ygklF@D&_brCk1WJ+9M0rF<iLi&I(xs~B`fGw!|2t1JU{hm3
zM_+WspavLV?hX}QQKNObA(U2_TAU4Li2x9Un6D9zCTW<sVL>n}%`@h8K{XV=)>iyr
z74Jyy^s6@DXSuQdbX6i^)&~c6!HtIFxX*@w6SpvhpL7r?=1phxfNZr4led!cLk}x-
zVV7`0syuSy&1=fd^%W^O_6vay%%7)My#tcJir`h>i9DJ*i6bDR7^%<uqQfsM0|<Id
zg;1gq+(#%zSvoab$`cIXkOZ_|-qY7-quhVs7v}6-y)H!bdaUK?<121&a)O>}(F!|q
zef0y~m16yuFG!}`e+s_2>J>uif_0L+ovBy<p|%Yg$zt}wr8{>&Xb*B<<Mgz3>2=hq
zH6JT$dgD|l@2hcSI=lMEd64~_1kc{_z$T5J0?do<lClVGS{UCr0zWk#Irx%qo?VrO
zusn{-H`1IVFXI?TuzXSw6pmQr<&gw?et-R)V&mB%L$A#xoCB!>O}}3w88oFxMp3FR
z_$z@EoW!B`c}j}Bv*B_XTf)g_A}E>XX2+RB4xC4-*nT&S2>TOfBQEqDyT7W;2WH%9
zuO4{#V^5h<U9gXF1ZFt9fJPr)LEmR`yeM2Ta5C6fv5v+uo@L2NY&!7U4?l`5UA}t3
z0|ILG-f;mEa$aX((eIT*s+dtqb#pb5xD_>Jq~Q*;K}sUWVfNxnjL+thOqrWUp5BaP
zT+xeUOky5v(aO%&@4Z)&5+NH{(5f-Ud;eC~W9X48E0d3B`S<xMVCTrXtiZ`4T@N}z
zSJm}ECHQ&lU0`H>k4~QXaZ7*KRD|Hl;Yh{@xk$!CC8av8D8)J_jTdeQhLyOil%XQy
zS<h_=nOihe&PK?%ktdi<w3f3e^$b>9u%bO;@<#Z%tzl^vFLV$r4TQGQ@n7WAw}i`P
zJvSS9SH^mouHQTDm!s-vK6*q@t`e9P!SzUO#j`@bVh*8!%#Z!JDox6Wp8xY-O?4M-
zJ(4YHHHWPbe;uLg@!s05b@g<tE}nDZ#+e9~?86CqlmA4eyIt#n*)rWcXY=FAgGa`g
z{gDju8pXg1A6$LTM#u{?Bf53SwgV<Zxl`lMWtiHzldZb=@l2*c#hjABfA`RAA<ckA
zZw=!Q`>`nJ?G<J|RMtP`jOns??RC2J*Jbu45doY_3Oy(Ec;%ULZCCrO_8T3KdKOuk
z^fAb73*P^Jgt@1y0`|SNLJ8g~HQSFoIox9vQ~vAZL=)9C+`g0)XOcDHrQx^Rd1HQM
z;w!=H_{hODev`k>sX;{c{@AlVLl5$%KJdpopEdHI56Cvg8FR>8VTk_y{kmh+Wr{pr
z6F7SN6AIW$J$$dke|ppWGmln_m|1G?SAP0r|II}v_D!#%7#-hem9CGY`b9Ic`gJ`V
zqoaHK+e$=7yUdPn40sUH{U|?zJH^(eH~U;p1!1L7f;)4Zy#DM9UL4w(yFc>y753mP
z?Yd%}S4w?m*&aUcbVej&Q2C}t)(yovrLryJvJbXcbyyA3AC%6f)jvxtcJR@cQM2;G
zD$LkwY+Y5+^Gv1Fg_+nSZ>&^zr`Kd~D3hBUQSM_tNbx~_EkMAQ_8fk5-RFkO5l!{Q
z!2=-}-&385eBZCDr@(PTT6fmY6}j7O_93%69iK4EY9c3l)SAwA9f`Cxi^eLNcXI+*
zClzLx2*o<M##!Z2^_cyCV?z^3;Df}xwF3tiuW4_SNK?YYEj`3t#<RgJ*|W16_onP2
z<L?Jw>Uyx!6zi5fi)2i_tG9W9-JCzmb5fWYa#FzB6y}jukQEk1_~+e>h-@(IT6J~-
zE(q6UeoJH<2z=9r6MEAF)w*Lb?{o#c<c$`Z54Ru$jtW@jqX_d>yHoZRd~6S;I6_v(
z2rURCZ~2?M+}UyW?PWFfm(WGOhZtGc7(_BGiqAyIX(@?mzKr%VN*RglJ^$*7Wpq7N
z!%fauxe}Y`{7j=VH6qS8Pj&8tWB<soQrxeL-stmo#>HH)?1|fyKeQGE@M1$Q-tJXc
zj#n))f6sqF<lh{T@OpwbIHr-Sihl))|8}3B;J^;eUC!={q4@iKzG?dJcKhyc!gTW8
s55D`scR%=)?7rup{}=J#qkKK%v8~uWS@0(>fB4}z$`DCDWc~a90U22kQUCw|

literal 0
HcmV?d00001

diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index d1219f912..5d7934174 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -35,8 +35,8 @@
 # i.e. for each K, we only need to compile one of the implementation, not all.
 #
 # For each K, whether to use wrapReduce or blockReduce was done by experiment
-# Please refer to this post: https://fb.quip.com/HCfIAbpWB0qi
-# and this experiment log: https://docs.google.com/spreadsheets/d/1bl3GCLQ67p27kXOSVJikEob38fojqaZIS--mPdQxeo0/edit#gid=931264442
+# Please refer to this post: https://github.com/facebookincubator/AITemplate/wiki/How-to-write-a-fast-Softmax-CUDA-kernel%3F
+# and this experiment log [fb internal only]: https://github.com/facebookincubator/AITemplate/wiki/How-to-write-a-fast-Softmax-CUDA-kernel%3F
 FUNC_TEMPLATE = jinja2.Template(
     """
 {{custom_libs}}

From 3a6d48fbce9e036b2eea1716f36dcfa3c1ef6b57 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Tue, 7 Mar 2023 23:32:28 -0800
Subject: [PATCH 225/638] o2 benchmark (#372)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/372

Reviewed By: jiaqizhai, khabinov

Differential Revision: D43726142

fbshipit-source-id: 90add0c73e9b7725a4a0969fd3ba14ae81d3e481
---
 fx2ait/fx2ait/converters/ait_converters.py           | 3 ++-
 fx2ait/fx2ait/test/converters/test_ait_layer_norm.py | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index dcfdcc706..7863b5b5b 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -916,6 +916,7 @@ def acc_ops_layer_norm(
         raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
     weight = kwargs["weight"]
     bias = kwargs["bias"]
+    eps = kwargs["eps"]
     normalized_shape = []
     if all(isinstance(i, int) for i in shape):
         for i in shape:
@@ -924,7 +925,7 @@ def acc_ops_layer_norm(
         normalized_shape = shape
     else:
         raise ValueError(f"Unexpected normalized shape value in {name}: {shape}")
-    return layernorm()(input_val, weight, bias, normalized_shape)
+    return layernorm()(input_val, weight, bias, normalized_shape, eps)
 
 
 @ait_converter(acc_ops.flatten)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
index 4829634d2..2a109bc19 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_layer_norm.py
@@ -22,17 +22,18 @@
 class TestLayernormConverter(AITTestCase):
     @parameterized.expand(
         [
-            param("1d_normalized_shape", [10], [2, 10]),
+            param("1d_normalized_shape", [10], [2, 10], 1e-5),
+            param("1d_normalized_shape", [100], [20, 100], 1e-6),
             # Enable test case once layernorm support expand
             # param("2d_normalized_shape", [5, 10], [5, 10]),
         ]
     )
-    def test_layer_norm(self, name, normalized_shape, input_shape):
+    def test_layer_norm(self, name, normalized_shape, input_shape, eps):
         class TestModule(torch.nn.Module):
             def __init__(self, normalized_shape):
                 super().__init__()
                 # TODO remove hard code eps once layernorm api expose eps setting
-                self.mod = nn.LayerNorm(normalized_shape, eps=1e-5)
+                self.mod = nn.LayerNorm(normalized_shape, eps=eps)
 
             def forward(self, x: torch.Tensor) -> torch.Tensor:
                 return self.mod(x)

From 97f4c38e752b5d2036cbc895753e3a0e92599274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serhat=20Varolg=C3=BCnes?= <svarolgunes@meta.com>
Date: Wed, 8 Mar 2023 01:22:28 -0800
Subject: [PATCH 226/638] Add fast_tanh(half2) fallback for CUDA_ARCH < 75
 (#374)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/374

Fallback for input type half2 version of fast_tanh in CUDA_ARCH < 75 case is implemented.

Reviewed By: aakhundov

Differential Revision: D43871666

fbshipit-source-id: 5e9bed21996eb9cd5e71fdb3851e7ab9d20826cb
---
 python/aitemplate/backend/cuda/elementwise/custom_math.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 60140aeb2..64d59f009 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -97,7 +97,8 @@ __device__ half2 fast_tanh(half2 x) {
   return x;
 
 #else
-  NOT_IMPLEMENTED();
+  return half2(
+      {cutlass::fast_tanh(float(x.x)), cutlass::fast_tanh(float(x.y))});
 #endif
 }
 

From 20e5250186a4f5ef1139047987ea772fb260827b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 8 Mar 2023 02:35:13 -0800
Subject: [PATCH 227/638] Make JaggedIntVar hashable (#370)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/370

Turns out, `JaggedIntVar` wasn't hashable. This created problems for some passes (e.g., [here](https://github.com/facebookincubator/AITemplate/blob/75f54510d8e02114e013200a66ea9a5d433e5f81/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py#L44-L48)).

This diff adds a `__hash__` function to `JaggedIntVar`. And because it should pretend to be a regular `IntVar` by default, the new `__hash__` function has the structure of the `IntVar.__hash__`.

Reviewed By: ipiszy

Differential Revision: D43857198

fbshipit-source-id: dc569e02731ae07aa522ad06d45d4b2f8893d336
---
 python/aitemplate/compiler/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 1a10d866a..1e32a243f 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -387,6 +387,9 @@ def __eq__(self, another: JaggedIntVar) -> bool:
             and self.jagged_dims() == another.jagged_dims()
         )
 
+    def __hash__(self) -> int:
+        return hash((self._attrs["name"], tuple(self._attrs["values"])))
+
     def total_length(self) -> IntVar:
         """The total_length dimension the JaggedIntVar is based on."""
         return self._total_length

From 62c87e2580c0dc427f6136c4eb589863768742c1 Mon Sep 17 00:00:00 2001
From: Mengchi Zhang <mengchi@meta.com>
Date: Wed, 8 Mar 2023 08:39:22 -0800
Subject: [PATCH 228/638] Add jagged_to_dense op (#380)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/380

In this diff, `jagged_to_dense` front-end and back-end op is added with the vectorized I/O. We reuse many of the utilities in `testing/jagged_utils.py` similar to `backend/common/elementwise_common.py`  in D43482363. A unit test and benchmark are included.

## Implementation Details
Since the output is dense, we adopt the calculations based on dense shape and apply padding when current element is outside of jagged shape from input.

Reviewed By: aakhundov

Differential Revision: D43562375

fbshipit-source-id: 930ad6793a9c6260497847330abd0a83e5861ac9
---
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../backend/cuda/tensor/jagged_to_dense.py    | 390 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/jagged_to_dense.py    |  82 ++++
 tests/unittest/ops/test_jagged_elementwise.py |   4 +-
 tests/unittest/ops/test_jagged_to_dense.py    | 318 ++++++++++++++
 6 files changed, 794 insertions(+), 3 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/jagged_to_dense.py
 create mode 100644 tests/unittest/ops/test_jagged_to_dense.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index 6372aff54..9a400798f 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -23,6 +23,7 @@
     dynamic_slice,
     expand,
     gather,
+    jagged_to_dense,
     masked_select,
     permute,
     permute021,
@@ -43,6 +44,7 @@
     "dynamic_slice",
     "expand",
     "gather",
+    "jagged_to_dense",
     "masked_select",
     "permute",
     "permute021",
diff --git a/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py b/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
new file mode 100644
index 000000000..c20e475a3
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
@@ -0,0 +1,390 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define jagged_to_dense codegen and CUDA kernel
+"""
+from typing import Any, Dict, List, Optional
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.compiler.base import IntImm, IntVar, Tensor
+from aitemplate.utils import shape_utils
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define FUSED_ELE_THREAD_SIZE 256
+
+const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
+    """
+)
+
+KERNEL_TEMPLATE = jinja2.Template(
+    """
+__global__ void {{func_name}}({{read_t}}* y, const {{read_t}}* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements) {
+  // first compute the dense_idx from the blockIdx and threadIdx
+  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
+  if (dense_idx_elem >= n_elements) {
+    return;
+  }
+
+  // then compute the jagged_idx from the dense_idx_elem
+  {{index_type}} jagged_idx;
+  {
+    // dense_coord is along consecutive dense dimensions
+    // jagged_coord is along the total_length of the jagged Tensor
+    {{index_type}} dense_coord = dense_idx_elem / ({{strides[0]}});
+    {{index_type}} running_idx = dense_idx_elem % ({{strides[0]}});
+    {{offsets_type}} jagged_coord = 0, prev_offset, next_offset;
+
+{% for i in range(num_offsets) %}
+    prev_offset = offsets.data[{{i}}][jagged_coord + dense_coord];
+    next_offset = offsets.data[{{i}}][jagged_coord + dense_coord + 1];
+    dense_coord = running_idx / ({{strides[i+1]}});
+    running_idx = running_idx % ({{strides[i+1]}});
+    if (dense_coord >= next_offset - prev_offset) {
+        // this element of the dense volume is
+        // out of bounds of the jagged Tensor
+        {{read_t}} padded_vector;
+        {{data_t}}* cursor = reinterpret_cast<{{data_t}}*>(&padded_vector);
+
+        #pragma unroll
+        for (int i = 0; i < N_ELEMENTS_PER_THREAD; i++) {
+            cursor[i] = {{data_t}}({{padding_value}});
+        }
+
+        y[dense_idx] = padded_vector;
+        return;
+    }
+    jagged_coord = prev_offset;
+
+{% endfor %}
+    jagged_coord += dense_coord;
+    jagged_idx = (jagged_coord * ({{strides[num_offsets]}}) + running_idx) / N_ELEMENTS_PER_THREAD;
+  }
+  y[dense_idx] = x[jagged_idx];
+}
+    """
+)
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{head}}
+
+#include "jagged.h"
+
+namespace {
+
+{{constant}}
+
+{{kernel_function}}
+
+}  // namespace
+
+void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims_decl}} {{offsets_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
+    if (n_elements == 0) {
+      return;
+    }
+    int block_size = static_cast<int>(std::ceil(static_cast<double>(n_elements) / N_ELEMENTS_PER_THREAD / FUSED_ELE_THREAD_SIZE));
+    {{func_name}}<<<block_size, FUSED_ELE_THREAD_SIZE, 0, stream>>>(
+        reinterpret_cast<{{read_t}}*>(y),
+        reinterpret_cast<const {{read_t}}*>(x),
+        {{dynamic_dims_call}}
+        {{offsets_call}}
+        n_elements
+    );
+}
+    """
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}{{index_type}} {{func_name}}_n_elements = {{calculate_n}};
+    {{indent}}invoke_{{func_name}}({{y}}, {{x}}, {{dynamic_dims}} {{offsets}} {{func_name}}_n_elements, {{stream}});
+{{indent}}}
+    """
+)
+
+
+def _get_output_volume_strides(
+    output_volume: List[IntVar],
+) -> List[str]:
+    """
+    Generate the stride expressions for each of the dimensions
+    of the y volume. A stride expression here means the
+    product of all dimensions following the given dimension.
+    The order of the stride expressions in the returned list
+    is the same as of the dimensions of the y volume.
+    """
+    strides = []
+    for dim in reversed(output_volume[1:]):
+        str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
+        if strides:
+            strides.append(f"{strides[-1]} * {str_dim}")
+        else:
+            strides.append(str_dim)
+    strides.reverse()
+    return strides
+
+
+def _get_dynamic_dims(y: Tensor) -> List[IntVar]:
+    res = {}
+
+    for dim in y.shape():
+        if not isinstance(dim, IntImm):
+            res[dim._attrs["name"]] = dim
+    return list(res.values())
+
+
+def _gen_dynamic_dim_str(
+    index_type: str, dynamic_dims: List[IntVar], has_type: bool
+) -> str:
+    type_str = index_type + " " if has_type else ""
+    res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims])
+    if res:
+        res += ", "
+    return res
+
+
+def _gen_offsets_str(
+    x: Tensor,
+    has_type: bool,
+    const_ref: bool,
+    name: Optional[str] = None,
+) -> str:
+    jagged_int_var = x._attrs["shape"][0]
+    offsets_var_name = jagged_int_var.offsets_var_name()
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+
+    ref_prefix = "const " if const_ref else ""
+    ref_suffix = "&" if const_ref else ""
+    arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
+    arg_name = name if name is not None else offsets_var_name
+    offsets = f"{arg_type}{arg_name}, "
+
+    return offsets
+
+
+def _gen_int_var_product_str(
+    int_vars: List[IntVar],
+) -> str:
+    res = []
+    for int_var in int_vars:
+        if isinstance(int_var, IntImm):
+            res.append(str(int_var._attrs["values"][0]))
+        elif isinstance(int_var, IntVar):
+            res.append(int_var._attrs["name"])
+        else:
+            raise RuntimeError(
+                "A dim must be an IntVar! Current type: {}".format(type(int_var))
+            )
+    return " * ".join(res) if res else "1"
+
+
+def _detect_read_type(inner_size: int, dtype: str) -> str:
+    if dtype in ("bfloat16", "half"):
+        if inner_size % 8 == 0:
+            return "uint4"
+        elif inner_size % 4 == 0:
+            return "uint2"
+        elif inner_size % 2 == 0:
+            return "uint"
+    elif dtype == "float":
+        if inner_size % 4 == 0:
+            return "uint4"
+        elif inner_size % 2 == 0:
+            return "uint2"
+
+    return dtype
+
+
+def _gen_kernel_function(
+    func_attrs: Dict[str, Any],
+    index_type: str,
+    data_type: str,
+    read_type: str,
+) -> str:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    padding_value = func_attrs["padding_value"]
+    jagged_int_var = x.shape()[0]
+    num_offsets = len(jagged_int_var.jagged_dims())
+    backend_spec = CUDASpec()
+
+    dynamic_dims = _get_dynamic_dims(y)
+
+    kernel_func = KERNEL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        index_type=index_type,
+        num_offsets=num_offsets,
+        strides=_get_output_volume_strides(
+            y.shape(),
+        ),
+        offsets_type=jagged_int_var.offsets_type(),
+        data_t=data_type,
+        read_t=read_type,
+        padding_value=padding_value,
+        dynamic_dims=_gen_dynamic_dim_str(
+            backend_spec.index_type,
+            dynamic_dims,
+            has_type=True,
+        ),
+        offsets=_gen_offsets_str(
+            x,
+            has_type=True,
+            # the offsets are passed
+            # by value to the kernel
+            const_ref=False,
+            name="offsets",
+        ),
+    )
+    return kernel_func
+
+
+@registry.reg("cuda.jagged_to_dense.gen_function")
+def jagged_to_dense_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """Generates jagged_to_dense function definition."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+
+    dtype = x.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+    read_inner_size = shape_utils.get_num_rightmost_static_elements(x.shape())
+    read_type = _detect_read_type(read_inner_size, data_type)
+
+    kernel_function = _gen_kernel_function(
+        func_attrs,
+        backend_spec.index_type,
+        data_type,
+        read_type,
+    )
+
+    constant = CONSTANT_TEMPLATE.render(
+        read_t=read_type,
+        data_t=data_type,
+    )
+
+    dynamic_dims = _get_dynamic_dims(y)
+
+    function = FUNC_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        head=backend_spec.header_src_template.render(),
+        constant=constant,
+        kernel_function=kernel_function,
+        func_name=func_attrs["name"],
+        dynamic_dims_decl=_gen_dynamic_dim_str(
+            backend_spec.index_type,
+            dynamic_dims,
+            has_type=True,
+        ),
+        dynamic_dims_call=_gen_dynamic_dim_str(
+            backend_spec.index_type,
+            dynamic_dims,
+            has_type=False,
+        ),
+        offsets_decl=_gen_offsets_str(
+            x,
+            has_type=True,
+            # the offsets are passed
+            # by const reference to the function
+            const_ref=True,
+            name="offsets",
+        ),
+        offsets_call=_gen_offsets_str(
+            x,
+            has_type=False,
+            const_ref=False,
+            name="offsets",
+        ),
+        read_t=read_type,
+    )
+    return function
+
+
+@registry.reg("cuda.jagged_to_dense.func_decl")
+def jagged_to_dense_gen_function_decl(func_attrs) -> str:
+    """Generate jagged_to_dense function declaration."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    dynamic_dims = _get_dynamic_dims(y)
+
+    return FUNC_DECL_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        func_name=func_name,
+        dynamic_dims=_gen_dynamic_dim_str(
+            backend_spec.index_type,
+            dynamic_dims,
+            has_type=True,
+        ),
+        offsets=_gen_offsets_str(
+            x,
+            has_type=True,
+            const_ref=True,
+            name="offsets",
+        ),
+    )
+
+
+@registry.reg("cuda.jagged_to_dense.func_call")
+def jagged_to_dense_gen_function_call(
+    func_attrs,
+    indent: str,
+) -> str:
+    """Generate jagged_to_dense function call."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+    dynamic_dims = _get_dynamic_dims(y)
+
+    return FUNC_CALL_TEMPLATE.render(
+        stream=backend_spec.stream,
+        func_name=func_attrs["name"],
+        index_type=backend_spec.index_type,
+        calculate_n=_gen_int_var_product_str(
+            y.shape(),
+        ),
+        y=y._attrs["name"],
+        x=x._attrs["name"],
+        dynamic_dims=_gen_dynamic_dim_str(
+            backend_spec.index_type,
+            dynamic_dims,
+            has_type=False,
+        ),
+        offsets=_gen_offsets_str(
+            x,
+            has_type=False,
+            const_ref=False,
+        ),
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 7ba9bf237..5c324ff33 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -24,6 +24,7 @@
 from .dynamic_slice import dynamic_slice
 from .expand import expand
 from .gather import gather
+from .jagged_to_dense import jagged_to_dense
 from .masked_select import masked_select
 from .permute import permute
 from .permute021 import permute021
diff --git a/python/aitemplate/compiler/ops/tensor/jagged_to_dense.py b/python/aitemplate/compiler/ops/tensor/jagged_to_dense.py
new file mode 100644
index 000000000..aeb713f8d
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/jagged_to_dense.py
@@ -0,0 +1,82 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Define jagged_to_dense op
+"""
+import logging
+from typing import List
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class jagged_to_dense(Operator):
+    """
+    Returns a tensor containing the dense format of the input jagged tensor.
+    Args:
+        x (Tensor): input jagged tensor
+        padding_value (float): the padding value for elements out of jagged shape.
+    Returns:
+        y: a tensor containing the dense format of input jagged tensor.
+    """
+
+    def __init__(
+        self,
+        padding_value: float = 0,
+    ):
+        super().__init__()
+        self._attrs["op"] = "jagged_to_dense"
+        self._attrs["padding_value"] = padding_value
+
+    def _infer_shape(self, x: Tensor) -> List[IntVar]:
+        jagged_int_var = x.shape()[0]
+        inner_shape = x.shape()[1:]
+        return jagged_int_var.get_max_dense_shape() + inner_shape
+
+    def _get_op_attributes(self):
+        return {
+            "padding_value": self._attrs["padding_value"],
+        }
+
+    def _args_for_pseudo_code(self):
+        return [f"padding_value={self._attrs['padding_value']}"]
+
+    def __call__(
+        self,
+        x: Tensor,
+    ) -> Tensor:
+        if not x.is_jagged():
+            raise RuntimeError(
+                "Input tensor x is expected to be jagged, but actually dense for jagged_to_dense."
+            )
+
+        self._attrs["inputs"] = [x]
+        self._set_depth()
+        output_shape = self._infer_shape(x)
+        y = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
+
+        self._attrs["outputs"] = [y]
+        return y
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_jagged_elementwise.py b/tests/unittest/ops/test_jagged_elementwise.py
index 42e50eeda..60232d8ac 100644
--- a/tests/unittest/ops/test_jagged_elementwise.py
+++ b/tests/unittest/ops/test_jagged_elementwise.py
@@ -572,8 +572,6 @@ def _benchmark_jagged_dense_elementwise_add(
             )
 
     def _test_benchmark_jagged_dense_elementise_add(self):
-        # ESUHM use case: "jagged + dense + dense = jagged",
-        # with dtype=float16; https://fburl.com/code/1e9z83fb
         self._benchmark_jagged_dense_elementwise_add(
             B=1024,
             N=260,
@@ -582,7 +580,7 @@ def _test_benchmark_jagged_dense_elementise_add(self):
             dtype="float16",
             offsets_dtype="int32",
             use_jagged_space_indexing=False,
-            test_suffix="esuhm",
+            test_suffix="benchmark",
         )
 
 
diff --git a/tests/unittest/ops/test_jagged_to_dense.py b/tests/unittest/ops/test_jagged_to_dense.py
new file mode 100644
index 000000000..bebd50094
--- /dev/null
+++ b/tests/unittest/ops/test_jagged_to_dense.py
@@ -0,0 +1,318 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for jagged Convert Operator.
+"""
+
+import json
+import random
+import tempfile
+import unittest
+from typing import List
+
+import aitemplate.testing.jagged_utils as jagged_utils_ref
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import JaggedDim
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+class JaggedToDenseTestCase(unittest.TestCase):
+    def _test_jagged_to_dense(
+        self,
+        jagged_max_shape: List[int],
+        offsets_list: List[List[int]],
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        test_suffix: str = "",
+        padding_value: float = 0.0,
+    ):
+        batch_size = jagged_max_shape[0]
+        batch_dim = IntVar(values=[1, batch_size * 2], name="batch_size")
+        jagged_dims_max_values = jagged_max_shape[1 : 1 + len(offsets_list)]
+        jagged_dims = [
+            JaggedDim(min_value=0, max_value=max_value)
+            for max_value in jagged_dims_max_values
+        ]
+
+        total_length = offsets_list[-1][-1]
+        total_length_dim = IntVar(values=[1, total_length * 2], name="total_length")
+
+        jagged_inner_shape = jagged_max_shape[1 + len(offsets_list) :]
+        jagged_inner_dims = [IntImm(dim) for dim in jagged_inner_shape]
+        jagged_input_shape = [total_length] + jagged_inner_shape
+
+        offsets_dims = [
+            IntVar(values=[2, len(offsets) * 2]) for offsets in offsets_list
+        ]
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                *jagged_inner_dims,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name=f"offsets{i}",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+            for i, offsets_dim in enumerate(offsets_dims)
+        ]
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=jagged_dims,
+        )(SOURCE, OFFSETS_LIST)
+
+        RESULT = ops.jagged_to_dense(padding_value=padding_value)(JAGGED)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        assert not SOURCE.is_jagged()
+        assert JAGGED.is_jagged()
+        assert not RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            f"test_jagged_to_dense_{test_suffix}",
+        )
+
+        torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = {
+            f"offsets{i}": torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+            for i, offsets in enumerate(offsets_list)
+        }
+        source_pt = get_random_torch_tensor(jagged_input_shape, dtype)
+        result_pt = jagged_utils_ref.jagged_to_dense(
+            jagged=source_pt,
+            offsets_list=list(offsets_pt.values()),
+            dense_shape=jagged_max_shape,
+            padding_value=padding_value,
+        )
+        result = torch.empty_like(result_pt)
+
+        inputs = {"source": source_pt, **offsets_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt)
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [4, 3, 8], "float16", 0.0),
+            param(2, "int32", [4, 3, 4], "float16", 1e2),
+            param(3, "int32", [4, 3, 2], "float16", 0.0),
+            param(4, "int32", [4, 3, 1], "float16", 1e2),
+            param(5, "int32", [4, 3, 8], "bfloat16", 0.0),
+            param(6, "int32", [4, 3, 4], "bfloat16", 1e2),
+            param(7, "int32", [4, 3, 2], "bfloat16", 0.0),
+            param(8, "int32", [4, 3, 1], "bfloat16", 1e2),
+            param(9, "int64", [4, 3, 4], "float32", 0.0),
+            param(10, "int64", [4, 3, 2], "float32", 1e5),
+            param(11, "int64", [4, 3, 1], "float32", 1e5),
+        ]
+    )
+    def test_jagged_to_dense_single_offsets(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dtype,
+        padding_value,
+    ):
+        self._test_jagged_to_dense(
+            jagged_max_shape=jagged_max_shape,
+            offsets_list=[[0, 1, 4, 6, 7]],
+            dtype=dtype,
+            offsets_dtype=offsets_dtype,
+            test_suffix=f"single_offsets_{dtype}_{i}",
+            padding_value=padding_value,
+        )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [3, 4, 5, 150, 3, 8], "float16", 0.0),
+            param(2, "int32", [3, 4, 5, 150, 1, 4], "float16", 1e2),
+            param(3, "int32", [3, 4, 5, 150, 3, 2], "float16", 0.0),
+            param(4, "int32", [3, 4, 5, 150, 1, 1], "float16", 1e2),
+            param(5, "int32", [3, 4, 5, 150, 1, 8], "bfloat16", 0.0),
+            param(6, "int32", [3, 4, 5, 150, 3, 4], "bfloat16", 1e2),
+            param(7, "int32", [3, 4, 5, 150, 1, 2], "bfloat16", 0.0),
+            param(8, "int32", [3, 4, 5, 150, 3, 1], "bfloat16", 1e2),
+            param(9, "int64", [3, 4, 5, 150, 1, 4], "float32", 0.0),
+            param(10, "int64", [3, 4, 5, 150, 3, 2], "float32", 1e5),
+            param(11, "int64", [3, 4, 5, 150, 3, 1], "float32", 1e5),
+        ]
+    )
+    def test_jagged_to_dense_multiple_offsets(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dtype,
+        padding_value,
+    ):
+        self._test_jagged_to_dense(
+            jagged_max_shape=jagged_max_shape,
+            offsets_list=[
+                [0, 1, 3, 5],
+                [0, 2, 4, 7, 9, 10],
+                [0, 6, 8, 19, 23, 45, 67, 98, 123, 256, 321],
+            ],
+            dtype=dtype,
+            offsets_dtype=offsets_dtype,
+            test_suffix=f"multiple_offsets_{dtype}_{i}",
+            padding_value=padding_value,
+        )
+
+    def _benchmark_jagged_to_dense(
+        self,
+        B: int,
+        N: int,
+        D: int,
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        test_suffix: str = "",
+        num_iters: int = 1000,
+    ):
+        batch_dim = IntVar(values=[1, B], name="batch_size")
+        jagged_dim = JaggedDim(min_value=0, max_value=N)
+        total_length_dim = IntVar(values=[1, B * N], name="total_length")
+        embedding_dim = IntImm(value=D, name="embedding_dim")
+        offsets_dim = IntVar(values=[2, B + 1], name="offsets_dim")
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                embedding_dim,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[jagged_dim],
+        )(SOURCE, OFFSETS_LIST)
+
+        RESULT = ops.jagged_to_dense()(JAGGED)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            f"benchmark_jagged_to_dense_{test_suffix}",
+        )
+
+        random.seed(0)
+        load_factors = [i / 20 for i in range(1, 21)]
+        offset_tensors = [
+            jagged_utils_ref.generate_offsets(
+                batch_size=B,
+                max_seq_len=N,
+                load_factor=load_factor,
+                offsets_dtype=offsets_dtype,
+            )
+            for load_factor in load_factors
+        ]
+
+        results = []
+        for load_factor, offsets_pt in zip(load_factors, offset_tensors):
+            total_length = offsets_pt[-1].item()
+            source_pt = get_random_torch_tensor([total_length, D], dtype)
+            inputs = {"source": source_pt, "offsets": offsets_pt}
+            outputs = [
+                torch.zeros(
+                    (B, N, D), dtype=string_to_torch_dtype(dtype), device="cuda"
+                )
+            ]
+
+            with tempfile.NamedTemporaryFile("r") as f:
+                model.profile_with_tensors(
+                    inputs=inputs,
+                    outputs=outputs,
+                    num_iters=num_iters,
+                    filename=f.name,
+                )
+                profiling_data = json.loads(f.read())
+                jagged_to_dense_records = [
+                    profiling_data[func_name]
+                    for func_name in profiling_data
+                    if func_name.startswith("jagged_to_dense")
+                ]
+                assert len(jagged_to_dense_records) == 1
+                runtime_ms = jagged_to_dense_records[0]["ms_per_iter"]
+
+            jagged_item = total_length * D  # total items to read: the jagged volume
+            dense_item = B * N * D  # total items to write: the dense volume
+            size = 2 if dtype == "float16" else 4  # size of individual data value
+            bandwidth = (
+                (jagged_item + dense_item) * size / (runtime_ms * 1e-3 * 1e9)
+            )  # GB/s
+            results.append([load_factor, runtime_ms, bandwidth])
+
+        print()
+        print(f"{B=}, {N=}, {D=}, {dtype=}:")
+        print()
+
+        for load_factor, runtime_ms, bandwidth in results:
+            print(
+                f"load factor: {int(load_factor * 100)}%, "
+                f"runtime: {round(runtime_ms, 6)} ms, "
+                f"bandwidth: {round(bandwidth, 3)} GB/s"
+            )
+
+    def _test_benchmark_jagged_to_dense(self):
+        self._benchmark_jagged_to_dense(
+            B=1024,
+            N=260,
+            D=256,
+            dtype="float16",
+            offsets_dtype="int32",
+            test_suffix="benchmark",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From b5733b1abe1130059805f8ae6640453197dba8a5 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 8 Mar 2023 12:00:06 -0800
Subject: [PATCH 229/638] Expand op cleanup (#368)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/368

Cleaned up code for expand op:

* Added more documentation comments & type hints
* Improved variable & function naming
* Simplified code ( eliminated potentially unneccessarily specialized kernels )

Reviewed By: chenyang78

Differential Revision: D43844913

fbshipit-source-id: 3734e1b47d108398d5e1513e301a193e54839dc9
---
 python/aitemplate/backend/backend_spec.py     |  26 ++
 .../aitemplate/backend/cuda/tensor/expand.py  | 222 +++++-------------
 .../cuda/tensor/expand_static_shape.py        | 144 ++++++------
 .../aitemplate/backend/cuda/tensor/repeat.cuh | 118 ++++++----
 .../aitemplate/compiler/ops/tensor/expand.py  |  10 +-
 tests/unittest/ops/test_expand.py             |  22 +-
 6 files changed, 245 insertions(+), 297 deletions(-)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index fb9394b8b..0f9d6637b 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -61,6 +61,32 @@ class GPUBackendSpec(BackendSpec):
         }
     )
 
+    # find the size in bytes of a given backend type
+    sizeof_types: Dict[str, int] = field(
+        default_factory=lambda: {
+            "uint8_t": 1,
+            "half": 2,
+            "bfloat16": 2,
+            "float32": 4,
+            "int64_t": 8,
+            "int32_t": 4,
+            "float": 4,
+        }
+    )
+
+    # find a backend type for a given size in bytes
+    # useful to find types 2 or 4 times larger than a given dtype
+    # for vectorization purposes.
+    type_for_size: Dict[int, str] = field(
+        default_factory=lambda: {
+            1: "uint8_t",
+            2: "half",
+            4: "float",
+            8: "int64_t",
+            16: "int4",
+        }
+    )
+
     backend_datatype_convertors: Dict[str, Dict[str, str]] = field(
         default_factory=lambda: {
             "half": {"float": "__half2float"},
diff --git a/python/aitemplate/backend/cuda/tensor/expand.py b/python/aitemplate/backend/cuda/tensor/expand.py
index be535af9b..73d044aa4 100644
--- a/python/aitemplate/backend/cuda/tensor/expand.py
+++ b/python/aitemplate/backend/cuda/tensor/expand.py
@@ -13,6 +13,9 @@
 #  limitations under the License.
 #
 
+"""
+expand op general CUDA implementation with complete dynamic shape support
+"""
 
 from typing import Any, Dict
 
@@ -24,21 +27,19 @@
 from aitemplate.backend.cuda.tensor import expand_static_shape  # noqa: F401
 
 
-def _to_cuda_dtype(dtype):
-    dtype = CUDASpec().dtype_to_backend_dtype.get(dtype, None)
-    return dtype
-
-
 @registry.reg("cuda.expand.func_decl")
-def gen_function_decl(func_attrs):
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
     if func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]:
         func = registry.get("cuda.expand.static.func_decl")
         return func(func_attrs)
     x = func_attrs["inputs"][0]
     func_name = func_attrs["name"]
-    index_type = _to_cuda_dtype(func_attrs.get("index_type", "int64"))
+    cuda_spec: CUDASpec = CUDASpec()
+    index_type = cuda_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
     dt = x.dtype()
-    dtype = _to_cuda_dtype(dt)
+    dtype = cuda_spec.dtype_to_backend_dtype.get(dt, None)
     assert (
         dtype is not None
     ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
@@ -93,85 +94,13 @@ def gen_function_decl(func_attrs):
 
 // integer ceil division
 #define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
-
-/**
- * Sequential write expand kernel for single block case.
- *
- * This kernel is optimized for small inputs, where we can load
- * the entire  input into shared memory more or less at once
- */
-__global__ void {{func_name}}_sequential_write_single_block_kernel(
-  // Implementation for small inputs where the entire src can be read into shared memory,
-  // and we have just one thread block
-  const {{dtype}}* src,
-  const {{index_type}} src_numel,
-  {{dtype}}* dst,
-  const {{index_type}} dst_numel
-  {% for i in range(output_rank) %}
-        ,const {{index_type}} output_strides_{{i}}
-        ,const {{index_type}} read_strides_{{i}}
-  {% endfor %}
-  ) {
-    // determine our range of elements to read
-    const {{index_type}} write_idx = threadIdx.x;
-    extern __shared__ {{dtype}} src_shared[]; // dynamic shared memory
-    if (write_idx<src_numel) {
-        src_shared[write_idx] = src[write_idx];
-    }
-    __syncthreads();
-    {{index_type}} read_idx = 0;
-    {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
-    {% for i in range(output_rank) %}
-        read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
-        remaining_idx %= output_strides_{{i}};
-    {% endfor %}
-    if (write_idx<dst_numel) {
-        dst[write_idx] = src_shared[read_idx];
-    }
-}
-
-/**
- * Sequential write expand kernel with batched read/writes on trailing
- * dimensions.
- *
- * This kernel is optimized for the case that trailing dimensions
- * are kept between input and output, in which case we can do block-wise
- * reads and writes.
- */
-__global__ void {{func_name}}_sequential_write_batch_kernel(
-
-  const {{dtype}}* src,
-  {{dtype}}* dst,
-  const {{index_type}} dst_numel,
-  const {{index_type}} batch_size
-  {% for i in range(output_rank) %}
-        ,const {{index_type}} output_strides_{{i}}
-        ,const {{index_type}} read_strides_{{i}}
-  {% endfor %}
-  ) {
-    // determine our range of elements to read
-    const {{index_type}} write_idx = (blockDim.x * blockIdx.x + blockDim.y * blockIdx.y + blockDim.z * blockIdx.z + threadIdx.x) * batch_size;
-    {{index_type}} read_idx = 0;
-    {{index_type}} i = write_idx; // Used to calculate remainder
-    {% for i in range(output_rank) %}
-        read_idx += (i / output_strides_{{i}}) * read_strides_{{i}};
-        i %= output_strides_{{i}};
-    {% endfor %}
-    if (write_idx+batch_size-1<dst_numel) {
-        dst[write_idx] = src[read_idx];
-        for (i = 1; i < batch_size; i++) {
-            dst[write_idx+i] = src[read_idx+i];
-        }
-    }
-}
+#define INT_MIN(a,b) ((a) < (b)? (a) : (b))
 
 /**
  * Sequential write expand kernel.
- * This kernel deals with the general case. It relies heavily on L2 cache
- * for scattered read optimization and does sequential writes.
- * This was benchmarked against an alternative implementation that tried
- * to minimize overall memory accesses, doing sequential reads and scattered
- * writes. But this implementation is faster.
+ * This kernel deals with the general case ( strided copy ).
+ * It relies heavily on L2 cache for scattered read optimization and
+ * writes sequentially.
  */
 __global__ void {{func_name}}_sequential_write_kernel(
 
@@ -184,15 +113,16 @@ def gen_function_decl(func_attrs):
   {% endfor %}
   ) {
     // determine our range of elements to read
-    const {{index_type}} write_idx = blockDim.x * blockIdx.x + blockDim.y * blockIdx.y + blockDim.z * blockIdx.z + threadIdx.x;
-    {{index_type}} read_idx = 0;
-    {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
-    {% for i in range(output_rank) %}
-        read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
-        remaining_idx %= output_strides_{{i}};
-    {% endfor %}
-    if (write_idx<dst_numel) {
-        dst[write_idx] = src[read_idx];
+    {{index_type}} write_idx = threadIdx.x + blockDim.x * blockIdx.x;
+    const {{index_type}} grid_stride = gridDim.x*blockDim.x;
+    for (;write_idx<dst_numel;write_idx += grid_stride) {
+      {{index_type}} read_idx = 0;
+      {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
+      {% for i in range(output_rank) %}
+          read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
+          remaining_idx %= output_strides_{{i}};
+      {% endfor %}
+      dst[write_idx] = src[read_idx];
     }
 }
 
@@ -215,6 +145,9 @@ def gen_function_decl(func_attrs):
   for (i = 0; i < input_rank; ++i) {
     input_numel *= input_dims[i];
   }
+  if (input_numel==0) {
+    return;
+  }
   {{index_type}} input_dim_pos = 0;
 
   // Calculate number of output dimensions
@@ -222,7 +155,9 @@ def gen_function_decl(func_attrs):
   for (i = 0; i < output_rank; ++i) {
     output_numel *= output_dims[i];
   }
-
+  if (output_numel==0) {
+    return;
+  }
   // Determine stride for each input dimension
   {{index_type}} input_strides[input_rank];
   input_strides[input_rank-1] = 1;
@@ -264,97 +199,46 @@ def gen_function_decl(func_attrs):
       tail_dim *= output_dims[i];
   }
 
-  {{index_type}} batch_size = 1; // sequential batch len
-
-  if (output_numel>MAX_THREADS_PER_BLOCK) {
-    // If the input/output is so small that we can read it all into shared mem,
-    // sequential batching makes no sense
-    batch_size = 7; // Determined experimentally via benchmark.
-                    // Should be reevaluated after algorithmic changes.
-    for (;batch_size>1;--batch_size) {
-      if ((tail_dim % batch_size)==0) {
-          break;
-      }
-    }
-  }
-  assert ((output_numel % batch_size)==0);
-
-  // determine CUDA kernel grid layout
-  {{index_type}} output_batches = output_numel / batch_size;
-
-  {{index_type}} block_size = INT_CEIL_DIV(output_batches, MAX_THREADS_PER_BLOCK);
-  {{index_type}} thread_size_x = min(MAX_THREADS_PER_BLOCK, output_batches);
+  // determine CUDA kernel grid layout. Tuning numbers determined experimentally
+  {{index_type}} thread_size_x = INT_MIN(output_numel, MAX_THREADS_PER_BLOCK); // more threads per block maximize L1 cache utilization
+  {{index_type}} block_size_x = INT_MIN(INT_CEIL_DIV(output_numel, thread_size_x), 4096l ); //
 
-  {{index_type}} block_size_x = block_size;
-  {{index_type}} block_size_y = 1;
-  {{index_type}} block_size_z = 1;
-
-  // for very large dimensions, we need to split into x,y,z grid blocks
-  if (block_size_x>MAX_X_BLOCKS) {
-      block_size_y = INT_CEIL_DIV(block_size_x, MAX_X_BLOCKS);
-      block_size_x = MAX_X_BLOCKS;
-      if (block_size_y > MAX_BLOCKS) {
-        block_size_z = INT_CEIL_DIV(block_size_y, MAX_BLOCKS);
-        block_size_y = MAX_BLOCKS;
-      }
-  }
-  dim3 dimGrid(block_size_x, block_size_y, block_size_z);
+  // for very large dimensions, we rely on grid-stride loop and save the block launch overhead
+  dim3 dimGrid(block_size_x, 1, 1);
   dim3 dimBlock(thread_size_x, 1, 1);
-  // Select the right kernel to call and call it
-  if (batch_size==1) {
-    if (block_size_x>1) {
-      {{func_name}}_sequential_write_kernel<<<dimGrid,dimBlock,0,stream>>>(
-          static_cast<const {{dtype}}*>(src),
-          static_cast<{{dtype}}*>(dst),
-          output_numel
-          {% for i in range(output_rank) %}
-            ,output_strides[{{i}}]
-            ,read_strides[{{i}}]
-          {% endfor %}
-      );
-    } else {
-      {{func_name}}_sequential_write_single_block_kernel<<<dimGrid,dimBlock,sizeof({{dtype}})*input_numel,stream>>>(
-          static_cast<const {{dtype}}*>(src),
-          input_numel,
-          static_cast<{{dtype}}*>(dst),
-          output_numel
-          {% for i in range(output_rank) %}
-            ,output_strides[{{i}}]
-            ,read_strides[{{i}}]
-          {% endfor %}
-      );
-    }
-  } else {  // batch_size>1, asserting (thread_size_x % batch_size)==0
-      {{func_name}}_sequential_write_batch_kernel<<<dimGrid,dimBlock,0,stream>>>(
-          static_cast<const {{dtype}}*>(src),
-          static_cast<{{dtype}}*>(dst),
-          output_numel,
-          batch_size
-          {% for i in range(output_rank) %}
-            ,output_strides[{{i}}]
-            ,read_strides[{{i}}]
-          {% endfor %}
-      );
-  }
+  {{func_name}}_sequential_write_kernel<<<dimGrid,dimBlock,0,stream>>>(
+      static_cast<const {{dtype}}*>(src),
+      static_cast<{{dtype}}*>(dst),
+      output_numel
+      {% for i in range(output_rank) %}
+        ,output_strides[{{i}}]
+        ,read_strides[{{i}}]
+      {% endfor %}
+  );
 }
 """
 )
 
 
-def create_template_args(func_attrs: Dict[str, Any], indent="  "):
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
     dst = y._attrs["name"]
     src = x._attrs["name"]
     func_name = func_attrs["name"]
-    dtype = _to_cuda_dtype(x.dtype())
+    cuda_spec: CUDASpec = CUDASpec()
+    dtype = cuda_spec.dtype_to_backend_dtype.get(x.dtype(), None)
     assert (
         dtype is not None
     ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
 
     xshape = x._attrs["shape"]
     yshape = y._attrs["shape"]
-    index_type = _to_cuda_dtype(func_attrs.get("index_type", "int64"))
+    index_type = cuda_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
     assert index_type is not None
 
     input_dims = ",".join(
@@ -382,7 +266,7 @@ def create_template_args(func_attrs: Dict[str, Any], indent="  "):
 
 
 @registry.reg("cuda.expand.gen_function")
-def gen_function(func_attrs):
+def gen_function(func_attrs: Dict[str, Any]) -> str:
     if not (
         func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
     ):
@@ -393,7 +277,7 @@ def gen_function(func_attrs):
 
 
 @registry.reg("cuda.expand.func_call")
-def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
     if not (
         func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
     ):
diff --git a/python/aitemplate/backend/cuda/tensor/expand_static_shape.py b/python/aitemplate/backend/cuda/tensor/expand_static_shape.py
index 06683070d..b927100da 100644
--- a/python/aitemplate/backend/cuda/tensor/expand_static_shape.py
+++ b/python/aitemplate/backend/cuda/tensor/expand_static_shape.py
@@ -13,6 +13,13 @@
 #  limitations under the License.
 #
 
+"""
+Specialized and optimized CUDA kernel declarations for the `expand` operator
+dealing with the most common case that the input and target shapes are known at compile time,
+with the possible exception of leading dimensions.
+
+"""
+
 import math
 import os
 from itertools import accumulate
@@ -27,16 +34,9 @@
 from aitemplate.backend.target import Target
 from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
 
-"""
-Specialized and optimized CUDA kernel declarations for the `expand` operator
-dealing with the most common case that the input and target shapes are known at compile time,
-with the possible exception of leading dimensions.
-
-"""
-
 
 @registry.reg("cuda.expand.static.func_decl")
-def gen_function_decl(func_attrs):
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
     return FUNC_DECL_TEMPLATE.render(create_template_args(func_attrs))
 
 
@@ -68,7 +68,12 @@ def gen_function_decl(func_attrs):
 
 #define MAX_THREADS_PER_BLOCK 1024l
 // integer ceil division
-#define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
+#define INT_CEIL_DIV(a, b) (((a) + (b)-1) / (b))
+
+// Maximum amount of shared memory that the repeat copy kernel(s) should use.
+// (used within repeat.cuh, included below )
+// Note: 44kb is sufficient in this case to fully utilize the GPU parallelism
+#define SHM_MAX 1024 * 44
 
 {{custom_libs}}
 
@@ -99,7 +104,7 @@ def gen_function_decl(func_attrs):
  *
  * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
  */
-__forceinline__ __device__ void {{func_name}}_tail_copy(
+__forceinline__ __device__ void tail_copy(
         const {{dtype}} * const src, // base src tensor memory pointer
         const {{index_type}} read_offset, // base offset into src, via {{dtype}}-typed indexing
         {{dtype}} * const dst,  // base destination tensor memory pointer
@@ -113,32 +118,22 @@ def gen_function_decl(func_attrs):
     }
 }
 
-
 /**
- * Implement the "middle" part of the kernel, where we have to deal with non-contiguous reads/writes.
- *
+ * Implement the "middle" part of the kernel, dealing with strided reads/writes.
  * Also utilizes grid-stride loop for efficiency and flexibility
- * see  * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ * see
+ * * https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ * * https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#coalesced-access-to-global-memory
+ * * and https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#strided-accesses
+ * for a more detailed explanation of the reasons for the choice of this specific form.
+ *
+ * Performance notes:
+ *
+ * It is critical to calculate the block_thread_index passed to tail_copy(..) based on
+ * the x-dimension of the launch grid, in order to benefit from Warp memory access coalescing.
+ *
  */
-__global__ void {{func_name}}_mid_kernel(
-
-  const {{dtype}}* const src, // source tensor
-  {{dtype}}* const dst // destination tensor
-  ) {
-    // determine our range of elements to read
-    const {{index_type}} write_offset = (blockDim.x * blockIdx.x + threadIdx.x) * {{tail_size}}l;
-    const {{index_type}} read_offset = {{func_name}}_get_read_offset(write_offset);
-    const {{index_type}} grid_size_x = gridDim.x*blockDim.x;
-    const {{index_type}} grid_size_y = gridDim.y*blockDim.y;
-    const {{index_type}} thread_idx_y = blockDim.y * blockIdx.y + threadIdx.y;
-    for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=grid_size_x) {
-        {{func_name}}_tail_copy(src, read_offset, dst, write_offset, thread_idx_y, grid_size_y, {{tail_size}}l);
-    }
-
-}
-
-
-__global__ void {{func_name}}_mid_kernel2(
+__global__ void expand_strided_copy(
 
   const {{dtype}}* const src, // source tensor
   {{dtype}}* const dst // destination tensor
@@ -151,7 +146,7 @@ def gen_function_decl(func_attrs):
     const {{index_type}} step_size_y = grid_size_y * {{tail_size}}l;
     const {{index_type}} thread_idx_x = blockDim.x * blockIdx.x + threadIdx.x;
     for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=step_size_y) {
-        {{func_name}}_tail_copy(src, read_offset, dst, write_offset, thread_idx_x, grid_size_x, {{tail_size}}l);
+        tail_copy(src, read_offset, dst, write_offset, thread_idx_x, grid_size_x, {{tail_size}}l);
     }
 
 }
@@ -165,18 +160,22 @@ def gen_function_decl(func_attrs):
   const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
   cudaStream_t stream)
 {
+  if ((({{mid_size*tail_size}})==0) || (head_size==0)) {
+    return;
+  }
   {% if mid_dim_count>0 %}
   // we have middle dimensions which involve non-contiguous reads
   // so we need to invoke the middle kernel
-  dim3 dimGrid({{mid_grid_blocks_x}}, {{mid_grid_blocks_y}});
-  dim3 dimBlock({{mid_grid_threads_x}}, {{mid_grid_threads_y}});
-  {{func_name}}_mid_kernel2<<<dimGrid,dimBlock,0,stream>>>(src, dst);
+  dim3 dimGrid({{grid_blocks_x}}, {{grid_blocks_y}});
+  dim3 dimBlock({{grid_threads_x}}, {{grid_threads_y}});
+  expand_strided_copy<<<dimGrid,dimBlock,0,stream>>>(src, dst);
   if (head_size>1l) {
      // now repeat copy what we already built once, multiple times into the rest of the output tensor
      cuda_repeat_head(dst, {{mid_size*tail_size}}l*sizeof({{dtype}}),head_size-1, stream);
   }
   {% else %}
-    // we have no middle dimensions, so all we need to do is repeatedly copy the source multiple times
+    // we have no middle dimensions, so strided copy is unneccessary.
+    // All we need to do is repeatedly copy the source multiple times
     // repeat the entire thing a dynamic number of times ( e.g. head_size times )
     cuda_repeat_src(src, dst, {{mid_size*tail_size}}l*sizeof({{dtype}}), head_size, stream);
   {% endif %}
@@ -184,42 +183,31 @@ def gen_function_decl(func_attrs):
 """
 )
 
-_dtype_sizes = {
-    "half": 2,
-    "bfloat16": 2,
-    "float32": 4,
-    "int64_t": 8,
-    "int32_t": 4,
-    "float": 4,
-}
-
-_size_dtypes = {
-    2: "half",
-    4: "float",
-    8: "int64_t",
-    16: "int4",
-}
-
 
-def _ceil(num):
+def _ceil(num: float) -> int:
     return int(math.ceil(num))
 
 
-def create_template_args(func_attrs: Dict[str, Any], indent="  "):
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
     dst = y._attrs["name"]
     src = x._attrs["name"]
     func_name = func_attrs["name"]
+    # Efficient vectorized & buffered repeat copy implementation,
+    # even for odd shapes
     custom_libs = Target.current().get_custom_libs(
         os.path.dirname(__file__), "repeat.cuh"
     )
-    dtype = CUDASpec().dtype_to_backend_dtype[x.dtype()]
+    cuda_spec = CUDASpec()
+    dtype = cuda_spec.dtype_to_backend_dtype[x.dtype()]
     assert (
         dtype is not None
     ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
-    dtype2 = _size_dtypes.get(_dtype_sizes[dtype] * 2, None)
-    dtype4 = _size_dtypes.get(_dtype_sizes[dtype] * 4, None)
+    dtype2 = cuda_spec.type_for_size.get(cuda_spec.sizeof_types[dtype] * 2, None)
+    dtype4 = cuda_spec.type_for_size.get(cuda_spec.sizeof_types[dtype] * 4, None)
     xshape = x._attrs["shape"]
     yshape = y._attrs["shape"]
     dim_types: List[ExpandDimensionType] = func_attrs["dim_types"]
@@ -288,11 +276,15 @@ def create_template_args(func_attrs: Dict[str, Any], indent="  "):
     ]  # this does not include the number of elements obtained from head repetitions
     # since we have excluded head dimensions above
     input_numel = input_strides[0]
-
-    mid_size = output_numel // tail_size
+    if tail_size > 0:
+        mid_size = output_numel // tail_size
+    else:
+        mid_size = 0
     mid_dim_count = len(yshape) - tail_dim_count - head_dim_count
-
-    mid_expansion_rate = mid_size * tail_size // input_numel
+    if input_numel > 0:
+        mid_expansion_rate = mid_size * tail_size // input_numel
+    else:
+        mid_expansion_rate = 1
 
     # remove the first dimension, which is the total number of elements
     # and prepend the head_dims with stride 0
@@ -330,13 +322,13 @@ def create_template_args(func_attrs: Dict[str, Any], indent="  "):
         output_strides = [s // 2 for s in output_strides]
         read_strides = [s // 2 for s in read_strides]
 
-    mid_grid_blocks_x = 1
-    mid_grid_threads_x = min(tail_size, 32)
-    mid_max_y_threads = 1024 // mid_grid_threads_x  # guaranteed to be >= 1
-    mid_grid_threads_y = min(
-        mid_max_y_threads, mid_size
+    grid_blocks_x = 1
+    grid_threads_x = max(1, min(tail_size, 32))
+    max_y_threads = 1024 // grid_threads_x  # guaranteed to be >= 1
+    grid_threads_y = max(
+        1, min(max_y_threads, mid_size)
     )  # so that  mid_grid_threads_x*max_x_threads <= 1024
-    mid_grid_blocks_y = _ceil(mid_size / mid_grid_threads_y)
+    grid_blocks_y = _ceil(mid_size / grid_threads_y)
 
     if dtype == "bfloat16":
         # bfloat16 is not available in model-generated.h as a type,
@@ -363,21 +355,21 @@ def create_template_args(func_attrs: Dict[str, Any], indent="  "):
         "dtype": dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float )
         "indent": indent,  # indentation for the function call template,
         "index_type": index_type,
-        "mid_grid_blocks_y": mid_grid_blocks_y,
-        "mid_grid_blocks_x": mid_grid_blocks_x,
-        "mid_grid_threads_y": mid_grid_threads_y,
-        "mid_grid_threads_x": mid_grid_threads_x,
-        "custom_libs": custom_libs,
+        "grid_blocks_y": grid_blocks_y,  # number of y grid blocks in the strided copy kernel
+        "grid_blocks_x": grid_blocks_x,  # number of x grid blocks in the strided copy kernel
+        "grid_threads_y": grid_threads_y,  # number of y threads per grid block in the strided copy kernel
+        "grid_threads_x": grid_threads_x,  # number of x threads per grid block in the strided copy kernel
+        "custom_libs": custom_libs,  # custom library path, e.g. path to repeat.cuh
     }
 
 
 @registry.reg("cuda.expand.static.gen_function")
-def gen_function(func_attrs):
+def gen_function(func_attrs: Dict[str, Any]) -> str:
     return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
 
 
 @registry.reg("cuda.expand.static.func_call")
-def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
     return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
 
 
diff --git a/python/aitemplate/backend/cuda/tensor/repeat.cuh b/python/aitemplate/backend/cuda/tensor/repeat.cuh
index cffb6602c..ce0a6fee0 100644
--- a/python/aitemplate/backend/cuda/tensor/repeat.cuh
+++ b/python/aitemplate/backend/cuda/tensor/repeat.cuh
@@ -1,18 +1,20 @@
-/*
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
+/**
+
+  Copyright (c) Meta Platforms, Inc. and affiliates.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+-
 
 Functions for repeating parts of a CUDA source tensor onto itself
 or into a target tensor.
@@ -21,15 +23,22 @@ Used by expand_static_shape.py ( expand operator )
 
 */
 
-#define INT_CEIL_DIV(a, b) (((a) + (b)-1) / (b))
-#define SHM_MAX 1024 * 44
-
+/**
+ * CUDA Kernel to copy elements repeatedly from a source memory
+ * region to a target memory region.
+ */
 __global__ void repeat_head_kernel(
-    const int64_t* const src,
-    int64_t* data,
-    size_t head_mem_num_elements,
-    size_t num_repeat_copies) {
-  extern __shared__ int64_t shared[];
+    const int64_t* const src, ///< source memory region. Must be 8-byte aligned
+    int64_t*
+        data, /**< target memory region. Must be 8-byte aligned and have space
+                   for head_mem_num_elements*num_repeat_copies int64_t elements.
+               */
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies) ///< How many times to repeat it all into data
+{
+  extern __shared__ int64_t
+      shared[]; // preallocated to blockDim.x elements, typically 32
   const size_t stride_y = blockDim.y * gridDim.y;
   const size_t stride_x = blockDim.x * gridDim.x;
 
@@ -39,10 +48,10 @@ __global__ void repeat_head_kernel(
        ri += stride_x) {
     // read only with one thread per y dim
     if (threadIdx.y == 0) {
-      // in y direction: thread 0 reads, all threads write
-      // repeatedly direct async copy from global to shared memory, see
+      // the following is functionally equivalent to
+      // shared[threadIdx.x] = src[ri]
+      // for reference, see
       // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#optimizing-cuda-applications
-      // and
       // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memcpy-async-primitiv
       __pipeline_memcpy_async(&shared[threadIdx.x], &src[ri], sizeof(int64_t));
       __pipeline_commit();
@@ -52,17 +61,29 @@ __global__ void repeat_head_kernel(
     // inner grid-stride loop, write with all threads out of shared memory
     size_t wi = threadIdx.y + blockDim.y * blockIdx.y;
     for (; wi < num_repeat_copies; wi += stride_y) {
+      // Note that this ensures coalesced writes, due to consecutive write
+      // accesses of threads in a Warp
       data[ri + head_mem_num_elements * wi] = shared[threadIdx.x];
     }
   }
 }
 
+/**
+ * Copy an 8-byte aligned memory region, which has a byte size that is a
+ * multiple of 8 into an 8-byte aligned target memory region efficiently. Calls
+ * into repeat_head_kernel ( see above )
+ *
+ **/
 __host__ cudaError_t cuda_repeat_head_vectorized(
-    const int64_t* const src,
-    int64_t* data,
-    size_t head_mem_num_elements,
-    size_t num_repeat_copies,
-    cudaStream_t stream) {
+    const int64_t* const src, ///< Source memory region. Must be 8-byte aligned
+    int64_t*
+        data, /**< target memory region. Must be 8-byte aligned and have space
+              for head_mem_num_elements*num_repeat_copies int64_t elements. */
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies, ///< How many times to repeat it all into data
+    cudaStream_t stream ///< CUDA stream
+) {
   size_t threads_x = 32;
   size_t threads_y = 1024 / threads_x;
   size_t blocks_x = INT_CEIL_DIV(head_mem_num_elements, threads_x);
@@ -88,11 +109,20 @@ __host__ cudaError_t cuda_repeat_head_vectorized(
   return cudaPeekAtLastError();
 }
 
+/**
+ * Repeatedly copy the beginning (head) section of a memory region an additonal
+ * num_repeat_copies times nto the memory region directly following that head,
+ * such that the end result will have this head data
+ * repeated 1+num_repeat_copies
+ */
 __host__ cudaError_t cuda_repeat_head(
-    void* data,
-    const size_t head_mem_bytes,
-    size_t num_repeat_copies,
-    cudaStream_t stream) {
+    void* data, ///< pointer to CUDA memory of size (at least)
+                ///< head_mem_bytes*(num_repeat_copies+1)
+    const size_t head_mem_bytes, ///< How many bytes to repeat
+    size_t num_repeat_copies, ///< How many times to repeat it (in addition to
+                              ///< the existing head data)
+    cudaStream_t stream ///< CUDA Stream to use
+) {
   cudaError_t res = cudaSuccess;
   if (num_repeat_copies == 0)
     return res;
@@ -141,12 +171,20 @@ __host__ cudaError_t cuda_repeat_head(
   return res;
 }
 
+/**
+ * Repeatedly copy a source memory region into a target memory region
+ * such that the end result will have the source data
+ * repeated num_repeat_copies
+ */
 __host__ cudaError_t cuda_repeat_src(
-    const void* const src,
-    void* data,
-    const size_t head_mem_bytes,
-    size_t num_repeat_copies,
-    cudaStream_t stream) {
+    const void* const src, ///< Source memory region (readonly)
+    void* data, ///< Destination memory region (read/write, size of at least
+                ///< num_repeat_copies*head_mem_bytes)
+    const size_t head_mem_bytes, ///< Size of source region to copy
+    size_t num_repeat_copies, ///< How many times to copy the data from source
+                              ///< into data
+    cudaStream_t stream ///< CUDA stream to use
+) {
   cudaError_t res = cudaSuccess;
   if (num_repeat_copies == 0) {
     return res;
diff --git a/python/aitemplate/compiler/ops/tensor/expand.py b/python/aitemplate/compiler/ops/tensor/expand.py
index bb77919c4..00cf0621a 100644
--- a/python/aitemplate/compiler/ops/tensor/expand.py
+++ b/python/aitemplate/compiler/ops/tensor/expand.py
@@ -92,14 +92,16 @@ def _infer_shape(self, tensor: Tensor, target_shape: List[IntVar]) -> List[IntVa
         input_shape = tensor._attrs["shape"]
         assert len(input_shape) > 0, "Input tensor must have a shape of length > 0"
         for i, dim in enumerate(input_shape):
-            if dim.lower_bound() <= 0:
+            if dim.lower_bound() < 0:
                 raise ValueError(
-                    f"Dimension {i} of expand input tensor shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes zero or negative values."
+                    f"Dimension {i} of expand input tensor shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes negative values."
                 )
         for i, dim in enumerate(target_shape):
-            if dim.lower_bound() <= 0 and dim.lower_bound() != -1:
+            if dim.lower_bound() < 0 and not (
+                dim.lower_bound() == -1 and dim.upper_bound() == -1
+            ):
                 raise ValueError(
-                    f"Dimension {i} of expand target shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes zero or negative values."
+                    f"Dimension {i} of expand target shape has range [{dim.lower_bound()}:{dim.upper_bound()}], which includes negative values."
                 )
 
         if len(target_shape) < len(input_shape):
diff --git a/tests/unittest/ops/test_expand.py b/tests/unittest/ops/test_expand.py
index ca2aa8c6c..7a6aee2c5 100644
--- a/tests/unittest/ops/test_expand.py
+++ b/tests/unittest/ops/test_expand.py
@@ -259,14 +259,14 @@ def test_no_op_expands_removed_size_op_fp32(self):
             #     True,
             #     "int64",
             # ),
-            # param(
-            #     "benchmark_var_1",
-            #     "float32",
-            #     [100, 1, 9, 4],
-            #     [20, 20, 100, 100, 9, -1],
-            #     False,
-            #     "int64",
-            # ),
+            param(
+                "benchmark_var_1",
+                "float32",
+                [100, 1, 9, 4],
+                [20, 20, 100, 100, 9, -1],
+                False,
+                "int64",
+            ),
             # param(
             #     "benchmark_var_2",
             #     "int64",
@@ -289,6 +289,9 @@ def test_no_op_expands_removed_size_op_fp32(self):
             param("edge_case_shapes_2", "float32", [1], [-1]),
             param("edge_case_shapes_3", "float32", [3], [-1]),
             param("edge_case_shapes_4", "float32", [1], [1]),
+            param("edge_case_shapes_5", "float32", [1, 1], [1, 0]),
+            param("edge_case_shapes_6", "float32", [2, 0], [-1, -1]),
+            param("edge_case_shapes_7", "float32", [2, 0], [2, 0]),
             param(
                 "edge_case_shapes_var_1",
                 "float32",
@@ -299,6 +302,9 @@ def test_no_op_expands_removed_size_op_fp32(self):
             param("edge_case_shapes_var_2", "float32", [1], [-1], False),
             param("edge_case_shapes_var_3", "float32", [3], [-1], False),
             param("edge_case_shapes_var_4", "float32", [1], [1], False),
+            param("edge_case_shapes_var_5", "float32", [1, 1], [1, 0], False),
+            param("edge_case_shapes_var_6", "float32", [2, 0], [-1, -1], False),
+            param("edge_case_shapes_var_6", "float32", [2, 0], [2, 0], False),
         ]
     )
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")

From c0c1af00257e0160a5b1f276d1f64283711f6cc4 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Thu, 9 Mar 2023 08:01:28 -0800
Subject: [PATCH 230/638] Refactor: change relative to absolute imports (#354)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/354

Applied a one-off refactoring script, to change all relative imports within AITemplate to absolute imports. Then ran arc lint to make sure formatting is correct.

### Why?

IDEs like VSCode or PyCharm have problems resolving the paths to packages imported via relative imports, as they don't know the basepath. Now we can navigate to all imported symbols using CMD+click on the symbol.

Here is the script. It is intended for one-off usage, so I did not bother with code style or reusability.

```
import os
import re
from pathlib import Path

def process_relative_imports(path, basepath, basepackage):
    path = os.path.abspath(str(path))
    basepath = os.path.abspath(str(basepath))
    if not path.startswith(basepath):
        return
    relpath = path[len(basepath) :].strip("/")
    pparts = relpath.split("/")

    def dot_replacer(match):
        dots = match.group(2)
        pkg = basepackage + ".".join(pparts[: -len(dots)])
        pkg = pkg.strip(".")
        replacement = match.group(1) + pkg + "." + match.group(3)
        return replacement.replace("..", ".")

    with open(path, "rt", encoding="utf8") as f:
        contents = f.read()
        rcontents = re.sub(
            r"(^from )(\.+)([^\.].*import .*$)",
            dot_replacer,
            contents,
            flags=re.MULTILINE,
        )
    with open(path, "wt", encoding="utf8") as f:
        f.write(rcontents)
    print(f"Wrote {path}")

if __name__ == "__main__":
    allpyfiles = [str(path) for path in Path(".").rglob("*.py")]
    for p in allpyfiles:
        print(p)
        if p.endswith("extra_cutlass_generator.py"):
            continue
        process_relative_imports(p, ".", "aitemplate.")

```

Reviewed By: ipiszy, chenyang78, tenpercent

Differential Revision: D43715713

fbshipit-source-id: 1c2eaaaadc2f1edf8f4e378bc2781c5f851e80ba
---
 python/aitemplate/__init__.py                 |   4 +-
 python/aitemplate/backend/__init__.py         |   2 +-
 python/aitemplate/backend/backend_spec.py     |   5 +-
 python/aitemplate/backend/builder.py          |   7 +-
 python/aitemplate/backend/codegen.py          |  10 +-
 .../backend/common/concatenate_common.py      |   4 +-
 .../backend/common/elementwise_common.py      |  10 +-
 .../tensor/slice_reshape_scatter_common.py    |   2 +-
 .../backend/common/tensor_accessor_codegen.py |   6 +-
 .../common/vision_ops/efficient_nms_common.py |   2 +-
 .../backend/common/vision_ops/nms_common.py   |   2 +-
 python/aitemplate/backend/cuda/__init__.py    |  40 ++---
 .../backend/cuda/attention/__init__.py        |   2 +-
 .../backend/cuda/attention/flash_attention.py |   2 +-
 .../cuda/attention/mem_eff_attention.py       |   4 +-
 .../backend/cuda/common/__init__.py           |   2 +-
 .../backend/cuda/common/dummy_op.py           |   2 +-
 .../backend/cuda/conv2d/__init__.py           |   2 +-
 .../aitemplate/backend/cuda/conv2d/common.py  |   6 +-
 .../conv2d/common_conv2d_bias_activation.py   |   2 +-
 .../common_conv2d_bias_add_activation.py      |   2 +-
 .../cuda/conv2d/common_conv2d_few_channels.py |   4 +-
 .../cuda/conv2d/common_transposed_conv2d.py   |   2 +-
 .../aitemplate/backend/cuda/conv2d/conv2d.py  |   4 +-
 .../backend/cuda/conv2d/conv2d_bias.py        |   4 +-
 .../backend/cuda/conv2d/conv2d_bias_add.py    |   7 +-
 .../cuda/conv2d/conv2d_bias_add_hardswish.py  |   7 +-
 .../cuda/conv2d/conv2d_bias_add_relu.py       |   7 +-
 .../cuda/conv2d/conv2d_bias_few_channels.py   |   4 +-
 .../cuda/conv2d/conv2d_bias_hardswish.py      |   4 +-
 .../conv2d_bias_hardswish_few_channels.py     |   4 +-
 .../backend/cuda/conv2d/conv2d_bias_relu.py   |   4 +-
 .../conv2d/conv2d_bias_relu_few_channels.py   |   4 +-
 .../cuda/conv2d/conv2d_bias_sigmoid.py        |   4 +-
 .../backend/cuda/conv2d/conv2d_depthwise.py   |   8 +-
 .../cuda/conv2d/conv2d_depthwise_bias.py      |   4 +-
 .../backend/cuda/conv2d/transposed_conv2d.py  |   4 +-
 .../cuda/conv2d/transposed_conv2d_bias.py     |   4 +-
 .../backend/cuda/conv3d/__init__.py           |   7 +-
 .../aitemplate/backend/cuda/conv3d/common.py  |  11 +-
 .../backend/cuda/conv3d/common_bias.py        |  11 +-
 .../aitemplate/backend/cuda/conv3d/conv3d.py  |   6 +-
 .../backend/cuda/conv3d/conv3d_bias.py        |   6 +-
 .../backend/cuda/conv3d/depthwise_conv3d.py   |   6 +-
 .../cuda/conv3d/depthwise_conv3d_bias.py      |   6 +-
 .../backend/cuda/elementwise/__init__.py      |   2 +-
 .../cuda/elementwise/fused_elementwise.py     |   8 +-
 .../cuda/elementwise/int_elementwise.py       |   7 +-
 .../backend/cuda/embedding/__init__.py        |   2 +-
 .../backend/cuda/embedding/bert_embeddings.py |   4 +-
 .../cuda/gemm_epilogue_vistor/__init__.py     |   2 +-
 .../bmm_common_softmax.py                     |   6 +-
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |  11 +-
 .../gemm_epilogue_vistor/common_dual_gemm.py  |  10 +-
 .../gemm_epilogue_vistor/common_softmax.py    |   6 +-
 .../gemm_epilogue_vistor/dual_bmm_rrr_div.py  |  10 +-
 .../dual_gemm_rcr_fast_gelu.py                |  10 +-
 .../dual_gemm_rcr_silu.py                     |  10 +-
 .../gemm_rcr_bias_softmax.py                  |   9 +-
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  |   8 +-
 .../backend/cuda/gemm_special/__init__.py     |   6 +-
 .../backend/cuda/gemm_special/bmm_rcr_n1.py   |  12 +-
 .../cuda/gemm_special/bmm_rrr_k1_tanh.py      |   8 +-
 .../cuda/gemm_special/gemm_rrr_small_nk.py    |  10 +-
 .../backend/cuda/gemm_universal/__init__.py   |   2 +-
 .../backend/cuda/gemm_universal/bmm_common.py |   6 +-
 .../cuda/gemm_universal/bmm_permute_common.py |  12 +-
 .../cuda/gemm_universal/bmm_rcr_permute.py    |  11 +-
 .../cuda/gemm_universal/bmm_rrr_permute.py    |  11 +-
 .../gemm_universal/bmm_softmax_bmm_permute.py |   2 +-
 .../backend/cuda/gemm_universal/bmm_xxx.py    |   4 +-
 .../cuda/gemm_universal/bmm_xxx_add.py        |   8 +-
 .../backend/cuda/gemm_universal/common.py     |  10 +-
 .../gemm_universal/common_bias_activation.py  |   6 +-
 .../gemm_universal/common_bias_broadcast.py   |   8 +-
 .../cuda/gemm_universal/common_permute.py     |   6 +-
 .../backend/cuda/gemm_universal/gemm_rcr.py   |   8 +-
 .../cuda/gemm_universal/gemm_rcr_bias.py      |   6 +-
 .../gemm_rcr_bias_elementwise.py              |   6 +-
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |   4 +-
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py |   4 +-
 .../gemm_universal/gemm_rcr_bias_hardswish.py |   4 +-
 .../gemm_universal/gemm_rcr_bias_permute.py   |  13 +-
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py |   4 +-
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |   4 +-
 .../gemm_universal/gemm_rcr_bias_swish.py     |   4 +-
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py |   4 +-
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py |  10 +-
 .../cuda/gemm_universal/gemm_rcr_permute.py   |   6 +-
 .../gemm_universal/gemm_rcr_permute_elup1.py  |   4 +-
 .../backend/cuda/gemm_universal/gemm_rrr.py   |   6 +-
 .../cuda/gemm_universal/gemm_rrr_permute.py   |   6 +-
 .../cuda/gemm_universal/group_common.py       |   6 +-
 .../cuda/gemm_universal/group_common_bias.py  |   2 +-
 .../cuda/gemm_universal/group_gemm_rcr.py     |   6 +-
 .../gemm_universal/group_gemm_rcr_bias.py     |   8 +-
 .../group_gemm_rcr_bias_relu.py               |   8 +-
 .../group_gemm_rcr_bias_sigmoid.py            |   8 +-
 .../cuda/gemm_universal/perm021fc_ccr.py      |   4 +-
 .../cuda/gemm_universal/perm021fc_ccr_bias.py |   9 +-
 .../perm021fc_ccr_bias_permute.py             |   4 +-
 .../cuda/gemm_universal/perm021fc_crc.py      |   4 +-
 .../cuda/gemm_universal/perm021fc_crc_bias.py |   9 +-
 .../cuda/gemm_universal/perm102_bmm_rcr.py    |   6 +-
 .../gemm_universal/perm102_bmm_rcr_bias.py    |  15 +-
 .../cuda/gemm_universal/perm102_bmm_rrr.py    |  10 +-
 .../gemm_universal/perm102_bmm_rrr_bias.py    |  15 +-
 .../backend/cuda/groupnorm/__init__.py        |   2 +-
 .../backend/cuda/groupnorm/groupnorm.py       |   4 +-
 .../cuda/groupnorm/groupnorm_common.py        |   4 +-
 .../backend/cuda/groupnorm/groupnorm_swish.py |   4 +-
 .../cuda/layernorm_sigmoid_mul/__init__.py    |   2 +-
 .../batch_layernorm_sigmoid_mul.py            |  10 +-
 .../group_layernorm_sigmoid_mul.py            |  10 +-
 .../layernorm_sigmoid_mul.py                  |  10 +-
 .../aitemplate/backend/cuda/lib_template.py   |   2 +-
 .../backend/cuda/padding/__init__.py          |   2 +-
 .../backend/cuda/padding/ndhwc3to8.py         |   4 +-
 .../backend/cuda/padding/nhwc3to4.py          |   4 +-
 .../backend/cuda/padding/nhwc3to8.py          |   4 +-
 .../backend/cuda/padding/pad_last_dim.py      |   4 +-
 .../backend/cuda/pool2d/__init__.py           |   2 +-
 .../backend/cuda/pool2d/avg_pool2d.py         |   6 +-
 .../backend/cuda/pool2d/max_pool2d.py         |   6 +-
 .../backend/cuda/reduce/__init__.py           |   9 +-
 .../backend/cuda/reduce/reduce_3d.py          |   6 +-
 .../backend/cuda/reduce/reduce_common.py      |   5 +-
 .../backend/cuda/reduce/reduce_mean.py        |   4 +-
 .../backend/cuda/reduce/reduce_small_axis.py  |   2 +-
 .../backend/cuda/reduce/reduce_sum.py         |   4 +-
 python/aitemplate/backend/cuda/reduce/var.py  |   6 +-
 .../backend/cuda/reduce/vector_norm.py        |   4 +-
 .../backend/cuda/softmax/__init__.py          |   2 +-
 .../backend/cuda/softmax/softmax.py           |   8 +-
 python/aitemplate/backend/cuda/target_def.py  |  16 +-
 .../backend/cuda/tensor/__init__.py           |   2 +-
 .../aitemplate/backend/cuda/tensor/argmax.py  |   6 +-
 .../backend/cuda/tensor/batch_gather.py       |   6 +-
 .../backend/cuda/tensor/concatenate.py        |   8 +-
 .../backend/cuda/tensor/concatenate_fast.py   |   8 +-
 .../backend/cuda/tensor/concatenate_tanh.py   |   6 +-
 .../backend/cuda/tensor/dynamic_slice.py      |   6 +-
 .../aitemplate/backend/cuda/tensor/gather.py  |   4 +-
 .../backend/cuda/tensor/masked_select.py      |   6 +-
 .../aitemplate/backend/cuda/tensor/permute.py |   6 +-
 .../backend/cuda/tensor/permute021.py         |   6 +-
 .../backend/cuda/tensor/permute0213.py        |   6 +-
 .../backend/cuda/tensor/permute102.py         |   6 +-
 .../backend/cuda/tensor/permute210.py         |   6 +-
 .../cuda/tensor/slice_reshape_scatter.py      |   6 +-
 .../backend/cuda/tensor/slice_scatter.py      |   6 +-
 .../aitemplate/backend/cuda/tensor/split.py   |   6 +-
 python/aitemplate/backend/cuda/tensor/topk.py |   6 +-
 .../backend/cuda/upsample/__init__.py         |   2 +-
 .../backend/cuda/upsample/upsampling2d.py     |   6 +-
 .../backend/cuda/upsample/upsampling2d_add.py |   6 +-
 python/aitemplate/backend/cuda/utils.py       |   4 +-
 .../backend/cuda/view_ops/__init__.py         |   2 +-
 .../backend/cuda/view_ops/make_jagged.py      |   2 +-
 .../backend/cuda/view_ops/view_ops.py         |   2 +-
 .../backend/cuda/vision_ops/__init__.py       |   4 +-
 .../backend/cuda/vision_ops/nms/__init__.py   |   6 +-
 .../cuda/vision_ops/nms/batched_nms.py        |   4 +-
 .../cuda/vision_ops/nms/efficient_nms.py      |   6 +-
 .../backend/cuda/vision_ops/nms/nms.py        |   6 +-
 .../cuda/vision_ops/roi_ops/__init__.py       |   2 +-
 .../roi_ops/multi_level_roi_align.py          |   6 +-
 .../cuda/vision_ops/roi_ops/roi_align.py      |   6 +-
 python/aitemplate/backend/profiler_runner.py  |   6 +-
 python/aitemplate/backend/rocm/__init__.py    |  24 +--
 .../backend/rocm/common/__init__.py           |   2 +-
 .../backend/rocm/common/dummy_op.py           |   2 +-
 .../backend/rocm/conv2d/__init__.py           |   2 +-
 .../aitemplate/backend/rocm/conv2d/common.py  |   2 +-
 .../aitemplate/backend/rocm/conv2d/conv2d.py  |   4 +-
 .../backend/rocm/conv2d/conv2d_bias.py        |   4 +-
 .../rocm/conv2d/conv2d_bias_add_relu.py       |   4 +-
 .../backend/rocm/conv2d/conv2d_bias_relu.py   |   4 +-
 .../rocm/conv2d/conv2d_bias_sigmoid.py        |   4 +-
 .../backend/rocm/conv2d/transposed_conv2d.py  |   4 +-
 .../conv2d/transposed_conv2d_bias_relu.py     |   4 +-
 .../backend/rocm/elementwise/__init__.py      |   2 +-
 .../rocm/elementwise/fused_elementwise.py     |   8 +-
 .../aitemplate/backend/rocm/gemm/__init__.py  |   2 +-
 .../aitemplate/backend/rocm/gemm/bmm_ccr.py   |   6 +-
 .../backend/rocm/gemm/bmm_common.py           |   2 +-
 .../aitemplate/backend/rocm/gemm/bmm_crr.py   |   6 +-
 .../aitemplate/backend/rocm/gemm/bmm_rcr.py   |   6 +-
 .../backend/rocm/gemm/bmm_rcr_permute.py      |   6 +-
 .../aitemplate/backend/rocm/gemm/bmm_rrr.py   |   6 +-
 .../backend/rocm/gemm/bmm_rrr_permute.py      |   6 +-
 .../backend/rocm/gemm/bmm_softmax_bmm.py      |   6 +-
 .../rocm/gemm/bmm_softmax_bmm_permute.py      |   8 +-
 python/aitemplate/backend/rocm/gemm/common.py |   4 +-
 .../backend/rocm/gemm/gemm_epilogue.py        |   2 +-
 .../aitemplate/backend/rocm/gemm/gemm_rcr.py  |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias.py        |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias_add.py    |   6 +-
 .../rocm/gemm/gemm_rcr_bias_add_add.py        |   6 +-
 .../rocm/gemm/gemm_rcr_bias_add_add_relu.py   |   6 +-
 .../rocm/gemm/gemm_rcr_bias_add_relu.py       |   6 +-
 .../rocm/gemm/gemm_rcr_bias_fast_gelu.py      |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias_mul.py    |   6 +-
 .../rocm/gemm/gemm_rcr_bias_mul_add.py        |   6 +-
 .../rocm/gemm/gemm_rcr_bias_mul_tanh.py       |   6 +-
 .../rocm/gemm/gemm_rcr_bias_permute.py        |   6 +-
 .../rocm/gemm/gemm_rcr_bias_permute_m2n3.py   |   6 +-
 .../rocm/gemm/gemm_rcr_bias_permute_m3n2.py   |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias_relu.py   |   6 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid.py        |   6 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid_mul.py    |   6 +-
 .../gemm/gemm_rcr_bias_sigmoid_mul_tanh.py    |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias_swish.py  |   6 +-
 .../backend/rocm/gemm/gemm_rcr_bias_tanh.py   |   6 +-
 .../rocm/gemm/gemm_rcr_permute_m2n3.py        |   6 +-
 .../aitemplate/backend/rocm/gemm/gemm_rrr.py  |   6 +-
 .../rocm/gemm/gemm_rrr_bias_permute.py        |   6 +-
 .../aitemplate/backend/rocm/lib_template.py   |   2 +-
 .../backend/rocm/normalization/__init__.py    |   2 +-
 .../backend/rocm/normalization/groupnorm.py   |   8 +-
 .../rocm/normalization/groupnorm_swish.py     |   4 +-
 .../backend/rocm/normalization/layernorm.py   |   8 +-
 .../backend/rocm/normalization/norm_common.py |   2 +-
 .../backend/rocm/normalization/softmax.py     |   6 +-
 .../backend/rocm/pool2d/__init__.py           |   2 +-
 .../backend/rocm/pool2d/avg_pool2d.py         |   4 +-
 .../backend/rocm/pool2d/max_pool2d.py         |   4 +-
 python/aitemplate/backend/rocm/target_def.py  |  11 +-
 .../backend/rocm/tensor/__init__.py           |   2 +-
 .../aitemplate/backend/rocm/tensor/argmax.py  |   6 +-
 .../backend/rocm/tensor/batch_gather.py       |   6 +-
 .../backend/rocm/tensor/concatenate.py        |   6 +-
 .../backend/rocm/tensor/concatenate_tanh.py   |   4 +-
 .../backend/rocm/tensor/dynamic_slice.py      |   6 +-
 .../backend/rocm/tensor/permute021.py         |   6 +-
 .../backend/rocm/tensor/permute0213.py        |   6 +-
 .../backend/rocm/tensor/permute102.py         |   6 +-
 .../backend/rocm/tensor/permute210.py         |   6 +-
 .../rocm/tensor/slice_reshape_scatter.py      |   6 +-
 .../backend/rocm/tensor/slice_scatter.py      |   6 +-
 .../aitemplate/backend/rocm/tensor/split.py   |   6 +-
 python/aitemplate/backend/rocm/tensor/topk.py |   6 +-
 .../backend/rocm/upsample/__init__.py         |   2 +-
 .../backend/rocm/upsample/upsampling2d.py     |   6 +-
 .../backend/rocm/upsample/upsampling2d_add.py |   6 +-
 python/aitemplate/backend/rocm/utils.py       |   2 +-
 .../backend/rocm/view_ops/__init__.py         |   2 +-
 .../backend/rocm/view_ops/view_ops.py         |   2 +-
 .../backend/rocm/vision_ops/__init__.py       |   7 +-
 .../backend/rocm/vision_ops/efficient_nms.py  |   6 +-
 .../aitemplate/backend/rocm/vision_ops/nms.py |   6 +-
 .../rocm/vision_ops/roi_ops/__init__.py       |   2 +-
 .../roi_ops/multi_level_roi_align.py          |   6 +-
 .../rocm/vision_ops/roi_ops/roi_align.py      |   6 +-
 python/aitemplate/backend/target.py           |   4 +-
 python/aitemplate/compiler/__init__.py        |   6 +-
 python/aitemplate/compiler/base.py            |   6 +-
 python/aitemplate/compiler/compiler.py        |  14 +-
 python/aitemplate/compiler/ops/__init__.py    |  32 ++--
 .../compiler/ops/attention/__init__.py        |   4 +-
 .../compiler/ops/attention/flash_attention.py |   8 +-
 .../ops/attention/mem_eff_attention.py        |   8 +-
 .../compiler/ops/common/__init__.py           |  14 +-
 .../compiler/ops/common/elementwise.py        |  11 +-
 .../compiler/ops/common/fused_elementwise.py  |  10 +-
 .../compiler/ops/common/int_elementwise.py    |  12 +-
 .../compiler/ops/common/python_ops.py         |   5 +-
 .../compiler/ops/common/view_ops.py           |   2 +-
 .../aitemplate/compiler/ops/conv/__init__.py  |  48 +++---
 .../ops/conv/common_conv2d_bias_activation.py |   4 +-
 .../conv/common_conv2d_bias_add_activation.py |   4 +-
 python/aitemplate/compiler/ops/conv/conv2d.py |  20 ++-
 .../compiler/ops/conv/conv2d_bias.py          |   4 +-
 .../compiler/ops/conv/conv2d_bias_add.py      |   4 +-
 .../ops/conv/conv2d_bias_add_hardswish.py     |   4 +-
 .../compiler/ops/conv/conv2d_bias_add_relu.py |   4 +-
 .../ops/conv/conv2d_bias_few_channels.py      |   4 +-
 .../ops/conv/conv2d_bias_hardswish.py         |   4 +-
 .../conv2d_bias_hardswish_few_channels.py     |   4 +-
 .../compiler/ops/conv/conv2d_bias_relu.py     |   4 +-
 .../ops/conv/conv2d_bias_relu_few_channels.py |   4 +-
 .../compiler/ops/conv/conv2d_bias_sigmoid.py  |   4 +-
 .../compiler/ops/conv/conv2d_depthwise.py     |   4 +-
 .../ops/conv/conv2d_depthwise_bias.py         |   4 +-
 python/aitemplate/compiler/ops/conv/conv3d.py |  20 ++-
 .../compiler/ops/conv/conv3d_bias.py          |   5 +-
 .../compiler/ops/conv/conv_common.py          |   4 +-
 .../compiler/ops/conv/depthwise_conv3d.py     |   8 +-
 .../conv/special_conv2d_bias_activation.py    |   6 +-
 .../compiler/ops/conv/transposed_conv2d.py    |   7 +-
 .../ops/conv/transposed_conv2d_bias.py        |   6 +-
 .../ops/conv/transposed_conv2d_bias_relu.py   |   2 +-
 .../compiler/ops/embedding/__init__.py        |   2 +-
 .../compiler/ops/embedding/bert_embeddings.py |   8 +-
 .../ops/gemm_epilogue_vistor/__init__.py      |  22 ++-
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |   8 +-
 .../gemm_epilogue_vistor/dual_bmm_rrr_div.py  |   6 +-
 .../dual_gemm_rcr_fast_gelu.py                |   6 +-
 .../dual_gemm_rcr_silu.py                     |   6 +-
 .../gemm_rcr_bias_softmax.py                  |   8 +-
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  |   6 +-
 .../compiler/ops/gemm_special/__init__.py     |   6 +-
 .../compiler/ops/gemm_special/bmm_rcr_n1.py   |   6 +-
 .../ops/gemm_special/bmm_rrr_k1_tanh.py       |   4 +-
 .../ops/gemm_special/gemm_rrr_small_nk.py     |   4 +-
 .../compiler/ops/gemm_universal/__init__.py   | 142 ++++++++++++------
 .../compiler/ops/gemm_universal/bmm.py        |   8 +-
 .../compiler/ops/gemm_universal/bmm_ccr.py    |   6 +-
 .../ops/gemm_universal/bmm_ccr_add.py         |   7 +-
 .../compiler/ops/gemm_universal/bmm_crr.py    |   6 +-
 .../ops/gemm_universal/bmm_crr_add.py         |   7 +-
 .../compiler/ops/gemm_universal/bmm_rcr.py    |   6 +-
 .../ops/gemm_universal/bmm_rcr_permute.py     |   8 +-
 .../compiler/ops/gemm_universal/bmm_rrr.py    |   6 +-
 .../ops/gemm_universal/bmm_rrr_add.py         |   7 +-
 .../ops/gemm_universal/bmm_rrr_permute.py     |   8 +-
 .../ops/gemm_universal/bmm_softmax_bmm.py     |   8 +-
 .../gemm_universal/bmm_softmax_bmm_permute.py |  10 +-
 .../ops/gemm_universal/gemm_common.py         |  26 +++-
 .../compiler/ops/gemm_universal/gemm_rcr.py   |   4 +-
 .../ops/gemm_universal/gemm_rcr_bias.py       |   6 +-
 .../ops/gemm_universal/gemm_rcr_bias_add.py   |   4 +-
 .../gemm_universal/gemm_rcr_bias_add_add.py   |   4 +-
 .../gemm_rcr_bias_add_add_relu.py             |   4 +-
 .../gemm_universal/gemm_rcr_bias_add_relu.py  |   4 +-
 .../gemm_universal/gemm_rcr_bias_broadcast.py |   6 +-
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |   2 +-
 .../ops/gemm_universal/gemm_rcr_bias_gelu.py  |   2 +-
 .../gemm_universal/gemm_rcr_bias_hardswish.py |   2 +-
 .../ops/gemm_universal/gemm_rcr_bias_mul.py   |   4 +-
 .../gemm_universal/gemm_rcr_bias_mul_add.py   |   4 +-
 .../gemm_universal/gemm_rcr_bias_mul_tanh.py  |   4 +-
 .../gemm_universal/gemm_rcr_bias_permute.py   |  10 +-
 .../ops/gemm_universal/gemm_rcr_bias_relu.py  |   2 +-
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |   2 +-
 .../gemm_rcr_bias_sigmoid_mul.py              |   4 +-
 .../gemm_rcr_bias_sigmoid_mul_tanh.py         |   4 +-
 .../ops/gemm_universal/gemm_rcr_bias_swish.py |   2 +-
 .../ops/gemm_universal/gemm_rcr_bias_tanh.py  |   2 +-
 .../ops/gemm_universal/gemm_rcr_fast_gelu.py  |   2 +-
 .../ops/gemm_universal/gemm_rcr_permute.py    |  10 +-
 .../gemm_universal/gemm_rcr_permute_elup1.py  |   2 +-
 .../compiler/ops/gemm_universal/gemm_rrr.py   |   4 +-
 .../ops/gemm_universal/gemm_rrr_bias.py       |   6 +-
 .../gemm_universal/gemm_rrr_bias_permute.py   |  10 +-
 .../ops/gemm_universal/gemm_rrr_permute.py    |   8 +-
 .../ops/gemm_universal/group_gemm_rcr.py      |  16 +-
 .../ops/gemm_universal/group_gemm_rcr_bias.py |  13 +-
 .../group_gemm_rcr_bias_relu.py               |   2 +-
 .../group_gemm_rcr_bias_sigmoid.py            |   2 +-
 .../ops/gemm_universal/perm021fc_ccr.py       |  10 +-
 .../ops/gemm_universal/perm021fc_ccr_bias.py  |   5 +-
 .../perm021fc_ccr_bias_permute.py             |   8 +-
 .../ops/gemm_universal/perm021fc_crc.py       |   6 +-
 .../ops/gemm_universal/perm021fc_crc_bias.py  |   5 +-
 .../ops/gemm_universal/perm102_bmm_rcr.py     |   6 +-
 .../gemm_universal/perm102_bmm_rcr_bias.py    |   6 +-
 .../ops/gemm_universal/perm102_bmm_rrr.py     |   6 +-
 .../gemm_universal/perm102_bmm_rrr_bias.py    |   6 +-
 .../compiler/ops/groupnorm/__init__.py        |   4 +-
 .../compiler/ops/groupnorm/groupnorm.py       |  19 ++-
 .../compiler/ops/groupnorm/groupnorm_swish.py |   2 +-
 .../compiler/ops/layernorm/__init__.py        |  16 +-
 .../layernorm/batch_layernorm_sigmoid_mul.py  |   4 +-
 .../layernorm/group_layernorm_sigmoid_mul.py  |   4 +-
 .../compiler/ops/layernorm/layernorm.py       |  21 ++-
 .../ops/layernorm/layernorm_sigmoid_mul.py    |   9 +-
 .../compiler/ops/padding/__init__.py          |   8 +-
 .../compiler/ops/padding/ndhwc3to8.py         |   8 +-
 .../compiler/ops/padding/nhwc3to4.py          |   2 +-
 .../compiler/ops/padding/nhwc3to8.py          |   2 +-
 .../compiler/ops/padding/nhwc_pad_common.py   |   8 +-
 .../compiler/ops/padding/pad_last_dim.py      |   6 +-
 .../aitemplate/compiler/ops/pool/__init__.py  |   4 +-
 .../compiler/ops/pool/avg_pool2d.py           |   2 +-
 .../compiler/ops/pool/max_pool2d.py           |   2 +-
 python/aitemplate/compiler/ops/pool/pool2d.py |   8 +-
 .../compiler/ops/reduce/__init__.py           |   8 +-
 .../compiler/ops/reduce/reduce_common.py      |  14 +-
 .../compiler/ops/reduce/reduce_mean.py        |   2 +-
 .../compiler/ops/reduce/reduce_sum.py         |   2 +-
 python/aitemplate/compiler/ops/reduce/var.py  |   2 +-
 .../compiler/ops/reduce/vector_norm.py        |   2 +-
 .../compiler/ops/softmax/__init__.py          |   2 +-
 .../compiler/ops/softmax/softmax.py           |  20 ++-
 .../compiler/ops/tensor/__init__.py           |  42 +++---
 .../aitemplate/compiler/ops/tensor/argmax.py  |   8 +-
 .../compiler/ops/tensor/batch_gather.py       |   8 +-
 .../aitemplate/compiler/ops/tensor/chunk.py   |   4 +-
 .../compiler/ops/tensor/concatenate.py        |  12 +-
 .../compiler/ops/tensor/concatenate_tanh.py   |   2 +-
 .../compiler/ops/tensor/dynamic_slice.py      |   8 +-
 .../aitemplate/compiler/ops/tensor/gather.py  |   6 +-
 .../aitemplate/compiler/ops/tensor/permute.py |  16 +-
 .../compiler/ops/tensor/permute021.py         |   6 +-
 .../compiler/ops/tensor/permute0213.py        |   6 +-
 .../compiler/ops/tensor/permute102.py         |   6 +-
 .../compiler/ops/tensor/permute210.py         |   6 +-
 python/aitemplate/compiler/ops/tensor/size.py |   2 +-
 .../ops/tensor/slice_reshape_scatter.py       |  10 +-
 .../compiler/ops/tensor/slice_scatter.py      |   7 +-
 .../aitemplate/compiler/ops/tensor/split.py   |  10 +-
 python/aitemplate/compiler/ops/tensor/topk.py |   6 +-
 .../compiler/ops/tensor/transpose.py          |   4 +-
 .../compiler/ops/upsample/__init__.py         |   4 +-
 .../compiler/ops/upsample/upsampling2d.py     |   2 +-
 .../compiler/ops/upsample/upsampling2d_add.py |   4 +-
 .../ops/upsample/upsampling_common.py         |   8 +-
 .../compiler/ops/vision_ops/__init__.py       |   4 +-
 .../compiler/ops/vision_ops/nms/__init__.py   |   6 +-
 .../ops/vision_ops/nms/batched_nms.py         |  13 +-
 .../ops/vision_ops/nms/efficient_nms.py       |   8 +-
 .../compiler/ops/vision_ops/nms/nms.py        |   8 +-
 .../ops/vision_ops/roi_ops/__init__.py        |   6 +-
 .../roi_ops/multi_level_roi_align.py          |   4 +-
 .../ops/vision_ops/roi_ops/roi_align.py       |   2 +-
 .../ops/vision_ops/roi_ops/roi_ops.py         |   8 +-
 python/aitemplate/compiler/tensor_accessor.py |   4 +-
 .../aitemplate/compiler/transform/__init__.py |  51 ++++---
 .../compiler/transform/apply_padding.py       |  12 +-
 .../transform/fuse_conv_elementwise.py        |   6 +-
 .../compiler/transform/fuse_conv_patterns.py  |   6 +-
 .../compiler/transform/fuse_group_ops.py      |  17 ++-
 .../compiler/transform/fuse_mm_elementwise.py |  15 +-
 .../transform/fuse_mm_elementwise_patterns.py |   6 +-
 .../transform/fuse_mm_reshape_permute.py      |  11 +-
 .../aitemplate/compiler/transform/fuse_ops.py |  18 +--
 .../compiler/transform/fuse_parallel_gemms.py |  19 +--
 .../transform/fuse_permute_bmm_and_gemm.py    |  19 +--
 .../compiler/transform/fuse_split.py          |   7 +-
 .../compiler/transform/fuse_utils.py          |   6 +-
 .../compiler/transform/mark_param_tensor.py   |   2 +-
 .../compiler/transform/memory_planning.py     |   2 +-
 .../compiler/transform/name_graph.py          |   2 +-
 .../compiler/transform/optimize_graph.py      |  43 +++---
 .../aitemplate/compiler/transform/profile.py  |   6 +-
 .../compiler/transform/profile_dynamic_dim.py |   4 +-
 .../compiler/transform/refine_graph.py        |   4 +-
 .../compiler/transform/remove_unused_ops.py   |   2 +-
 .../transform/split_large_concat_ops.py       |   9 +-
 .../split_large_slice_scatter_ops.py          |   9 +-
 .../transform/split_large_split_ops.py        |   9 +-
 .../aitemplate/compiler/transform/toposort.py |   2 +-
 .../transform/transform_memory_ops.py         |   7 +-
 .../transform/transform_odd_alignment.py      |  26 ++--
 .../transform/transform_special_ops.py        |  16 +-
 .../transform/transform_strided_ops.py        |  24 ++-
 .../transform/transform_strided_slice.py      |   9 +-
 .../compiler/transform/transform_utils.py     |  11 +-
 python/aitemplate/frontend/__init__.py        |  11 +-
 python/aitemplate/frontend/nn/__init__.py     |  43 +++---
 python/aitemplate/frontend/nn/attention.py    |  17 +--
 python/aitemplate/frontend/nn/container.py    |   6 +-
 .../aitemplate/frontend/nn/conv2d/__init__.py |  36 +++--
 .../nn/conv2d/common_conv2d_bias_act.py       |   6 +-
 .../nn/conv2d/common_conv2d_bias_add_act.py   |   6 +-
 .../aitemplate/frontend/nn/conv2d/conv2d.py   |   6 +-
 .../frontend/nn/conv2d/conv2d_bias.py         |   2 +-
 .../nn/conv2d/conv2d_bias_add_hardswish.py    |   2 +-
 .../nn/conv2d/conv2d_bias_add_relu.py         |   2 +-
 .../nn/conv2d/conv2d_bias_few_channels.py     |   2 +-
 .../nn/conv2d/conv2d_bias_hardswish.py        |   2 +-
 .../conv2d_bias_hardswish_few_channels.py     |   2 +-
 .../frontend/nn/conv2d/conv2d_bias_relu.py    |   2 +-
 .../conv2d/conv2d_bias_relu_few_channels.py   |   2 +-
 .../frontend/nn/conv2d/conv2d_bias_sigmoid.py |   2 +-
 .../frontend/nn/conv2d/conv2d_depthwise.py    |   4 +-
 .../nn/conv2d/conv2d_depthwise_bias.py        |   2 +-
 .../nn/conv2d/special_conv2d_bias_act.py      |   6 +-
 .../nn/conv2d/transposed_conv2d_bias.py       |   4 +-
 .../nn/conv2d/transposed_conv2d_bias_act.py   |   6 +-
 .../nn/conv2d/transposed_conv2d_bias_relu.py  |   4 +-
 python/aitemplate/frontend/nn/conv3d.py       |   6 +-
 python/aitemplate/frontend/nn/dropout.py      |   2 +-
 python/aitemplate/frontend/nn/dual_gemm.py    |   8 +-
 python/aitemplate/frontend/nn/embedding.py    |  13 +-
 python/aitemplate/frontend/nn/fpn_proposal.py |   6 +-
 python/aitemplate/frontend/nn/group_norm.py   |   6 +-
 python/aitemplate/frontend/nn/identity.py     |   2 +-
 python/aitemplate/frontend/nn/layer_norm.py   |   6 +-
 python/aitemplate/frontend/nn/linear.py       |   7 +-
 python/aitemplate/frontend/nn/module.py       |   4 +-
 .../frontend/nn/multiscale_attention.py       |  16 +-
 python/aitemplate/frontend/nn/padding.py      |   4 +-
 python/aitemplate/frontend/nn/parameter.py    |   2 +-
 python/aitemplate/frontend/nn/pool2d.py       |   4 +-
 python/aitemplate/frontend/nn/proposal.py     |   8 +-
 python/aitemplate/frontend/nn/roi_ops.py      |   4 +-
 python/aitemplate/frontend/nn/upsample.py     |   4 +-
 .../frontend/nn/vanilla_attention.py          |  12 +-
 python/aitemplate/frontend/nn/view_ops.py     |   4 +-
 python/aitemplate/frontend/parameter.py       |   2 +-
 python/aitemplate/testing/__init__.py         |   4 +-
 python/aitemplate/testing/detect_target.py    |   2 +-
 python/aitemplate/utils/__init__.py           |   2 +-
 python/aitemplate/utils/mk_ck_lib/__init__.py |   8 +-
 .../utils/mk_ck_lib/conv2d_operation.py       |   2 +-
 .../utils/mk_ck_lib/gemm_operation.py         |   2 +-
 .../aitemplate/utils/mk_ck_lib/generator.py   |   2 +-
 .../utils/mk_ck_lib/groupnorm_operation.py    |   2 +-
 .../utils/mk_ck_lib/layernorm_operation.py    |   2 +-
 python/aitemplate/utils/mk_ck_lib/manifest.py |   2 +-
 .../utils/mk_ck_lib/softmax_operation.py      |   2 +-
 .../utils/mk_cutlass_lib/mk_cutlass_lib.py    |   7 +-
 .../utils/visualization/__init__.py           |   2 +-
 505 files changed, 1906 insertions(+), 1565 deletions(-)

diff --git a/python/aitemplate/__init__.py b/python/aitemplate/__init__.py
index 99f5e468c..60a116a71 100644
--- a/python/aitemplate/__init__.py
+++ b/python/aitemplate/__init__.py
@@ -14,8 +14,8 @@
 #
 import sys
 
-from . import backend, compiler, frontend, testing, utils
-from ._libinfo import __version__  # noqa
+from aitemplate import backend, compiler, frontend, testing, utils
+from aitemplate._libinfo import __version__  # noqa
 
 if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
     PY3STATEMENT = "The minimal Python requirement is Python 3.7"
diff --git a/python/aitemplate/backend/__init__.py b/python/aitemplate/backend/__init__.py
index 8e7aaca0d..df7240114 100644
--- a/python/aitemplate/backend/__init__.py
+++ b/python/aitemplate/backend/__init__.py
@@ -15,7 +15,7 @@
 """
 Backend for AITemplate.
 """
-from . import (  # noqa
+from aitemplate.backend import (  # noqa
     backend_spec,
     builder,
     codegen,
diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 0f9d6637b..94abf6a2c 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -22,8 +22,9 @@
 
 import jinja2
 
-from ..compiler.ops.common.epilogue import FuncEnum
-from .target import Target
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 
 
 class BackendSpec:
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index bef7346ef..af91a2f25 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -32,13 +32,14 @@
 
 import jinja2
 
+from aitemplate.backend.target import Target
+from aitemplate.backend.task_runner import BaseRunner, Task
+
 from aitemplate.utils import environ
 
 from aitemplate.utils.debug_settings import AITDebugSettings
 
-from ..utils.misc import is_debug
-from .target import Target
-from .task_runner import BaseRunner, Task
+from aitemplate.utils.misc import is_debug
 
 # pylint: disable=W0221,C0103
 
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 17249e62d..92fc24835 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -29,18 +29,18 @@
 
 import jinja2
 
+from aitemplate.backend import registry
+
 from aitemplate.backend.main_templates import MODEL_CONTAINER_TEMPLATE, MODEL_TEMPLATE
-from aitemplate.compiler.base import Operator
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
 from aitemplate.compiler.dtype import dtype_to_enumerator, get_dtype_size
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 from aitemplate.compiler.transform.memory_planning import Workspace
 from aitemplate.utils.debug_settings import AITDebugSettings
 
-from ..compiler.base import IntImm, IntVar, IntVarTensor, Tensor
-from . import registry
-from .target import Target
-
 # pylint: disable=C0103,W0613,C0301
 
 
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index acaa1b899..d6f3013ce 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -17,9 +17,9 @@
 """
 import jinja2
 
-from ...compiler.ops.tensor import concatenate
+from aitemplate.backend.common import tensor_accessor_codegen
 
-from . import tensor_accessor_codegen
+from aitemplate.compiler.ops.tensor import concatenate
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 1655d40f8..6d3c8e806 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -22,12 +22,12 @@
 
 import jinja2
 from aitemplate.backend.backend_spec import BackendSpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.target import Target
 
-from ...compiler.base import IntImm, IntVar, JaggedIntVar, Operator, Tensor
-from ...compiler.tensor_accessor import TensorAccessor
-from ...utils import alignment as alignment_utils, shape_utils
-from ..target import Target
-from . import tensor_accessor_codegen
+from aitemplate.compiler.base import IntImm, IntVar, JaggedIntVar, Operator, Tensor
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.utils import alignment as alignment_utils, shape_utils
 
 CONSTANT_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
index 0162eab33..49d574548 100644
--- a/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py
@@ -19,7 +19,7 @@
 
 import jinja2
 
-from . import slice_common
+from aitemplate.backend.common.tensor import slice_common
 
 OUTPUT_DIM_DEF_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/common/tensor_accessor_codegen.py b/python/aitemplate/backend/common/tensor_accessor_codegen.py
index 7e3336f76..bb1014fdf 100644
--- a/python/aitemplate/backend/common/tensor_accessor_codegen.py
+++ b/python/aitemplate/backend/common/tensor_accessor_codegen.py
@@ -20,10 +20,10 @@
 from typing import List
 
 import jinja2
+from aitemplate.backend.target import Target
 
-from ...compiler.tensor_accessor import TensorAccessor
-from ...utils import alignment
-from ..target import Target
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.utils import alignment
 
 # Template used to transform a Python TensorAccessor object
 # to a C++ TensorAccessor struct.
diff --git a/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
index fd0ca6c50..499be93b3 100644
--- a/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
+++ b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py
@@ -21,7 +21,7 @@
 
 import jinja2
 
-from .efficient_nms_kernel import kernel
+from aitemplate.backend.common.vision_ops.efficient_nms_kernel import kernel
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/common/vision_ops/nms_common.py b/python/aitemplate/backend/common/vision_ops/nms_common.py
index 53e2b6f31..cf02e380a 100644
--- a/python/aitemplate/backend/common/vision_ops/nms_common.py
+++ b/python/aitemplate/backend/common/vision_ops/nms_common.py
@@ -21,7 +21,7 @@
 
 import jinja2
 
-from .nms_kernel import KERNEL_TEMPLATE
+from aitemplate.backend.common.vision_ops.nms_kernel import KERNEL_TEMPLATE
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py
index f2ff7c11f..e2124c3a5 100644
--- a/python/aitemplate/backend/cuda/__init__.py
+++ b/python/aitemplate/backend/cuda/__init__.py
@@ -16,23 +16,23 @@
 """
 CUDA backend codegen functions.
 """
-from . import cuda_common, lib_template, target_def, utils
-from .common import *
-from .conv2d import *
-from .conv3d import *
-from .elementwise import *
-from .embedding import *
-from .gemm_special import *
-from .gemm_universal import *
-from .gemm_epilogue_vistor import *
-from .layernorm_sigmoid_mul import *
-from .padding import *
-from .pool2d import *
-from .reduce import *
-from .softmax import *
-from .tensor import *
-from .upsample import *
-from .view_ops import *
-from .vision_ops import *
-from .attention import *
-from .groupnorm import *
+from aitemplate.backend.cuda import cuda_common, lib_template, target_def, utils
+from aitemplate.backend.cuda.common import *
+from aitemplate.backend.cuda.conv2d import *
+from aitemplate.backend.cuda.conv3d import *
+from aitemplate.backend.cuda.elementwise import *
+from aitemplate.backend.cuda.embedding import *
+from aitemplate.backend.cuda.gemm_special import *
+from aitemplate.backend.cuda.gemm_universal import *
+from aitemplate.backend.cuda.gemm_epilogue_vistor import *
+from aitemplate.backend.cuda.layernorm_sigmoid_mul import *
+from aitemplate.backend.cuda.padding import *
+from aitemplate.backend.cuda.pool2d import *
+from aitemplate.backend.cuda.reduce import *
+from aitemplate.backend.cuda.softmax import *
+from aitemplate.backend.cuda.tensor import *
+from aitemplate.backend.cuda.upsample import *
+from aitemplate.backend.cuda.view_ops import *
+from aitemplate.backend.cuda.vision_ops import *
+from aitemplate.backend.cuda.attention import *
+from aitemplate.backend.cuda.groupnorm import *
diff --git a/python/aitemplate/backend/cuda/attention/__init__.py b/python/aitemplate/backend/cuda/attention/__init__.py
index 9636980b4..c57effeee 100644
--- a/python/aitemplate/backend/cuda/attention/__init__.py
+++ b/python/aitemplate/backend/cuda/attention/__init__.py
@@ -15,6 +15,6 @@
 """
 cuda flash_attention module init
 """
-from . import flash_attention, mem_eff_attention
+from aitemplate.backend.cuda.attention import flash_attention, mem_eff_attention
 
 __all__ = ["flash_attention", "mem_eff_attention"]
diff --git a/python/aitemplate/backend/cuda/attention/flash_attention.py b/python/aitemplate/backend/cuda/attention/flash_attention.py
index 55d781ceb..b53eb419e 100644
--- a/python/aitemplate/backend/cuda/attention/flash_attention.py
+++ b/python/aitemplate/backend/cuda/attention/flash_attention.py
@@ -19,7 +19,7 @@
 
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 9a3e39d47..d306ad0e9 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -19,8 +19,8 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/common/__init__.py b/python/aitemplate/backend/cuda/common/__init__.py
index 2115b6952..4971d840f 100644
--- a/python/aitemplate/backend/cuda/common/__init__.py
+++ b/python/aitemplate/backend/cuda/common/__init__.py
@@ -16,4 +16,4 @@
 """
 CUDA Common module init
 """
-from .dummy_op import *
+from aitemplate.backend.cuda.common.dummy_op import *
diff --git a/python/aitemplate/backend/cuda/common/dummy_op.py b/python/aitemplate/backend/cuda/common/dummy_op.py
index da293ee4e..8a81b6087 100644
--- a/python/aitemplate/backend/cuda/common/dummy_op.py
+++ b/python/aitemplate/backend/cuda/common/dummy_op.py
@@ -18,7 +18,7 @@
 
 from typing import Any, Dict
 
-from ... import registry
+from aitemplate.backend import registry
 
 
 @registry.reg("cuda.size.gen_function")
diff --git a/python/aitemplate/backend/cuda/conv2d/__init__.py b/python/aitemplate/backend/cuda/conv2d/__init__.py
index 09703e7b8..e18c91cdf 100644
--- a/python/aitemplate/backend/cuda/conv2d/__init__.py
+++ b/python/aitemplate/backend/cuda/conv2d/__init__.py
@@ -16,7 +16,7 @@
 """
 cuda conv2d module init
 """
-from . import (
+from aitemplate.backend.cuda.conv2d import (
     conv2d,
     conv2d_bias,
     conv2d_bias_add,
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 992229faa..01e076d03 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -24,10 +24,10 @@
 import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal.common import add_profiler, build_profiler
+from aitemplate.backend.target import Target
 
-from ....utils import alignment
-from ...target import Target
-from ..gemm_universal.common import add_profiler, build_profiler
+from aitemplate.utils import alignment
 
 
 KERNEL_KEY_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
index e1dbf6f1d..7025a45c6 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
@@ -16,7 +16,7 @@
 common functions for conv_bias_activation subgraph
 """
 
-from . import common
+from aitemplate.backend.cuda.conv2d import common
 
 # pylint: disable=C0103,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
index b3e78c300..75655dcaf 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
@@ -16,7 +16,7 @@
 common functions for conv2d bias act residual add
 """
 
-from . import common
+from aitemplate.backend.cuda.conv2d import common
 
 # pylint: disable=C0301,C0103
 
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
index d110a21f0..d6059eeeb 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py
@@ -16,8 +16,8 @@
 common functions for conv2d op with few channels(< 8)
 """
 
-from ....utils import alignment
-from . import common
+from aitemplate.backend.cuda.conv2d import common
+from aitemplate.utils import alignment
 
 
 def extract_config(func_attrs, dtype="float16"):
diff --git a/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
index eb5be30ed..666b66f2d 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_transposed_conv2d.py
@@ -18,7 +18,7 @@
 
 import re
 
-from . import common
+from aitemplate.backend.cuda.conv2d import common
 
 
 def _conv_transpose_instance(op_def):
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d.py b/python/aitemplate/backend/cuda/conv2d/conv2d.py
index 68de39fd5..d4c54c2e0 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d.py
@@ -15,8 +15,8 @@
 """
 Codegen for conv2d.
 """
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
index 66c57f966..3c010c4dc 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
@@ -15,8 +15,8 @@
 """
 conv2d bias codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_activation as cba
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_conv2d_bias_activation as cba
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
index bd952ab68..c6db62b6e 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
@@ -15,8 +15,11 @@
 """
 conv2d bias add codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_add_activation as cbaa
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
+    common,
+    common_conv2d_bias_add_activation as cbaa,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
index 4e7526699..968b605e0 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
@@ -15,8 +15,11 @@
 """
 conv2d bias add hardswish codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_add_activation as cbaa
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
+    common,
+    common_conv2d_bias_add_activation as cbaa,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
index e7f009871..1a7fe093a 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
@@ -15,8 +15,11 @@
 """
 conv2d bias add relu codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_add_activation as cbaa
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
+    common,
+    common_conv2d_bias_add_activation as cbaa,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
index 40330318b..33fdbb989 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
@@ -16,8 +16,8 @@
 specialize conv2d op with few channels(< 8)
 """
 
-from ... import registry
-from . import (
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
     common,
     common_conv2d_bias_activation as cba,
     common_conv2d_few_channels as cfc,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
index 13743d294..f883312c6 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
@@ -15,8 +15,8 @@
 """
 conv2d bias hardswish codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_activation as cba
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_conv2d_bias_activation as cba
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
index 7594887c9..36a2dc35f 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -16,8 +16,8 @@
 specialize conv2d op with few channels(< 8)
 """
 
-from ... import registry
-from . import (
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
     common,
     common_conv2d_bias_activation as cba,
     common_conv2d_few_channels as cfc,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
index cd44eef51..a55895970 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
@@ -15,8 +15,8 @@
 """
 conv2d bias relu codegen
 """
-from ... import registry
-from . import common, common_conv2d_bias_activation as cba
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_conv2d_bias_activation as cba
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
index 927e9da83..5659c8d0a 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
@@ -16,8 +16,8 @@
 specialize conv2d op with few channels(< 8)
 """
 
-from ... import registry
-from . import (
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import (
     common,
     common_conv2d_bias_activation as cba,
     common_conv2d_few_channels as cfc,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
index a9fe1801f..3977c6355 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
@@ -16,8 +16,8 @@
 conv2d bias sigmoid codegen
 """
 
-from ... import registry
-from . import common, common_conv2d_bias_activation as cba
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_conv2d_bias_activation as cba
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
index fee377f95..1b5de0758 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
@@ -17,11 +17,11 @@
 """
 from collections import OrderedDict
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from ...target import Target
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv2d import common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
index a42edfb33..82f7ffbc7 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
@@ -16,8 +16,8 @@
 Codegen for conv2d_depthwise.
 """
 
-from ... import registry
-from . import common, conv2d_depthwise as cdw
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, conv2d_depthwise as cdw
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
index 00f639983..f03edc5fe 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
@@ -15,8 +15,8 @@
 """
 transposed conv2d op codegen
 """
-from ... import registry
-from . import common, common_transposed_conv2d as ctc
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_transposed_conv2d as ctc
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
index fb10b92ae..30503992b 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
@@ -15,8 +15,8 @@
 """
 transposed conv2d + bias + (relu) codegen
 """
-from ... import registry
-from . import common, common_transposed_conv2d as ctc
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.conv2d import common, common_transposed_conv2d as ctc
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv3d/__init__.py b/python/aitemplate/backend/cuda/conv3d/__init__.py
index 6187336e2..dadb06e64 100644
--- a/python/aitemplate/backend/cuda/conv3d/__init__.py
+++ b/python/aitemplate/backend/cuda/conv3d/__init__.py
@@ -15,6 +15,11 @@
 """
 CUDA conv3d module init
 """
-from . import conv3d, conv3d_bias, depthwise_conv3d, depthwise_conv3d_bias
+from aitemplate.backend.cuda.conv3d import (
+    conv3d,
+    conv3d_bias,
+    depthwise_conv3d,
+    depthwise_conv3d_bias,
+)
 
 __all__ = ["conv3d", "conv3d_bias", "depthwise_conv3d", "depthwise_conv3d_bias"]
diff --git a/python/aitemplate/backend/cuda/conv3d/common.py b/python/aitemplate/backend/cuda/conv3d/common.py
index b059d1770..7bdc77158 100644
--- a/python/aitemplate/backend/cuda/conv3d/common.py
+++ b/python/aitemplate/backend/cuda/conv3d/common.py
@@ -22,10 +22,15 @@
 import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv2d.common import (
+    extract_config as conv2d_extract_config,
+)
+from aitemplate.backend.cuda.gemm_universal.common import (  # noqa: F401
+    add_profiler,
+    build_profiler,
+)
 
-from ....utils import alignment
-from ..conv2d.common import extract_config as conv2d_extract_config
-from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
+from aitemplate.utils import alignment
 
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/conv3d/common_bias.py b/python/aitemplate/backend/cuda/conv3d/common_bias.py
index 929d46fd2..be2a6aab9 100644
--- a/python/aitemplate/backend/cuda/conv3d/common_bias.py
+++ b/python/aitemplate/backend/cuda/conv3d/common_bias.py
@@ -22,10 +22,15 @@
 import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv2d.common import (
+    extract_config as conv2d_extract_config,
+)
+from aitemplate.backend.cuda.gemm_universal.common import (  # noqa: F401
+    add_profiler,
+    build_profiler,
+)
 
-from ....utils import alignment
-from ..conv2d.common import extract_config as conv2d_extract_config
-from ..gemm_universal.common import add_profiler, build_profiler  # noqa: F401
+from aitemplate.utils import alignment
 
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d.py b/python/aitemplate/backend/cuda/conv3d/conv3d.py
index 2b2ce4620..1fd781957 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d.py
@@ -18,10 +18,10 @@
 """
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv3d import common
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
index 57c455354..fa2f248eb 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
@@ -18,10 +18,10 @@
 """
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv3d import common
 
 # pylint: disable=C0103,C0415,W0613,C0301
 
diff --git a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
index 92a63f325..399c88d79 100644
--- a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d.py
@@ -17,10 +17,10 @@
 """
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv3d import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py
index 1f6e682bc..70f46eff8 100644
--- a/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py
+++ b/python/aitemplate/backend/cuda/conv3d/depthwise_conv3d_bias.py
@@ -17,10 +17,10 @@
 """
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import common_bias
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.conv3d import common_bias
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/elementwise/__init__.py b/python/aitemplate/backend/cuda/elementwise/__init__.py
index 18bff2803..545d10a86 100644
--- a/python/aitemplate/backend/cuda/elementwise/__init__.py
+++ b/python/aitemplate/backend/cuda/elementwise/__init__.py
@@ -15,6 +15,6 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import fused_elementwise, int_elementwise
+from aitemplate.backend.cuda.elementwise import fused_elementwise, int_elementwise
 
 __all__ = ["fused_elementwise", "int_elementwise"]
diff --git a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
index fabf7d4f9..afe305ff2 100644
--- a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
@@ -19,10 +19,10 @@
 import os
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import elementwise_common
-from ...target import Target
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import elementwise_common
+from aitemplate.backend.target import Target
 
 HEAD_TEMPLATE = """
 #include <cuda_fp16.h>
diff --git a/python/aitemplate/backend/cuda/elementwise/int_elementwise.py b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
index 8bd6fc5a8..8ed55cfd4 100644
--- a/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/int_elementwise.py
@@ -18,10 +18,11 @@
 
 import jinja2
 
-from ....compiler.base import IntVarTensor
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CPUBackendSpec
+from aitemplate.backend.backend_spec import CPUBackendSpec
+
+from aitemplate.compiler.base import IntVarTensor
 
 
 INT_VAR_FUNC_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/embedding/__init__.py b/python/aitemplate/backend/cuda/embedding/__init__.py
index 3e3aab46b..bcc34df37 100644
--- a/python/aitemplate/backend/cuda/embedding/__init__.py
+++ b/python/aitemplate/backend/cuda/embedding/__init__.py
@@ -13,4 +13,4 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from .bert_embeddings import *
+from aitemplate.backend.cuda.embedding.bert_embeddings import *
diff --git a/python/aitemplate/backend/cuda/embedding/bert_embeddings.py b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
index e62826889..9ebae2334 100644
--- a/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py
@@ -20,8 +20,8 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
index 28ff30353..c721702b9 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 
-from . import (
+from aitemplate.backend.cuda.gemm_epilogue_vistor import (
     bmm_rcr_softmax,
     dual_bmm_rrr_div,
     dual_gemm_rcr_fast_gelu,
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
index af5753b3a..b770f9556 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
@@ -17,10 +17,10 @@
 """
 import jinja2
 
-from ...common import gemm_common
-from ..gemm_universal import common
+from aitemplate.backend.common import gemm_common
 
-from . import common_softmax
+from aitemplate.backend.cuda.gemm_epilogue_vistor import common_softmax
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 60f0587f4..2965eb1c2 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -20,10 +20,13 @@
 """
 import jinja2
 
-from ... import registry
-from ..gemm_universal import common
-from ..gemm_universal.layout import RCR
-from . import bmm_common_softmax as bmm_common, common_softmax
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_epilogue_vistor import (
+    bmm_common_softmax as bmm_common,
+    common_softmax,
+)
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index ef20701d2..60b69b285 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -25,12 +25,12 @@
 
 import jinja2
 
-from ....utils import alignment
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.target import Target
 
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ...target import Target
-from ..gemm_universal import common
+from aitemplate.utils import alignment
 
 
 # pylint: disable=C0301,C0415,R1705
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index bb6140318..56e580df8 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -21,9 +21,9 @@
 
 import jinja2
 
-from ...common import gemm_common
-from ...target import Target
-from ..gemm_universal import common
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0301,C0415,R1705
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
index 905d8e72b..6bf8e2071 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -19,11 +19,11 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ..gemm_universal import common, common_bias
-from ..gemm_universal.layout import RRR
-from . import common_dual_gemm
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_epilogue_vistor import common_dual_gemm
+from aitemplate.backend.cuda.gemm_universal import common, common_bias
+from aitemplate.backend.cuda.gemm_universal.layout import RRR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index 753ed9347..769978a15 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -19,11 +19,11 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ..gemm_universal import common, common_bias
-from ..gemm_universal.layout import RCR
-from . import common_dual_gemm
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_epilogue_vistor import common_dual_gemm
+from aitemplate.backend.cuda.gemm_universal import common, common_bias
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index 1c4528560..0e9c26d0a 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -19,11 +19,11 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ..gemm_universal import common, common_bias
-from ..gemm_universal.layout import RCR
-from . import common_dual_gemm
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_epilogue_vistor import common_dual_gemm
+from aitemplate.backend.cuda.gemm_universal import common, common_bias
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index 0bc378b04..9ad034e81 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -20,9 +20,12 @@
 """
 import jinja2
 
-from ... import registry
-from ..gemm_universal import common
-from . import common_softmax, gemm_rcr_softmax
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_epilogue_vistor import (
+    common_softmax,
+    gemm_rcr_softmax,
+)
+from aitemplate.backend.cuda.gemm_universal import common
 
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
index 0964cd303..b417e2c94 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -20,10 +20,10 @@
 """
 import jinja2
 
-from ... import registry
-from ..gemm_universal import common
-from ..gemm_universal.layout import RCR
-from . import common_softmax
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_epilogue_vistor import common_softmax
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_special/__init__.py b/python/aitemplate/backend/cuda/gemm_special/__init__.py
index 93043be2c..7f582d9ab 100644
--- a/python/aitemplate/backend/cuda/gemm_special/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_special/__init__.py
@@ -15,7 +15,11 @@
 """
 special gemm ops
 """
-from . import bmm_rcr_n1, bmm_rrr_k1_tanh, gemm_rrr_small_nk
+from aitemplate.backend.cuda.gemm_special import (
+    bmm_rcr_n1,
+    bmm_rrr_k1_tanh,
+    gemm_rrr_small_nk,
+)
 
 
 __all__ = ["bmm_rcr_n1", "bmm_rrr_k1_tanh", "gemm_rrr_small_nk"]
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
index b407b431e..7cc5f1b58 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py
@@ -29,13 +29,13 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common, tensor_accessor_codegen
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.target import Target
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import gemm_common, tensor_accessor_codegen
-from ...target import Target
-from ..gemm_universal import common
+from aitemplate.compiler.base import IntImm
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
index 7f40abc66..797028e69 100644
--- a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py
@@ -22,10 +22,10 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ..gemm_universal import common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
index b30034e56..b4ec05077 100644
--- a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py
@@ -27,11 +27,11 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ...target import Target
-from ..gemm_universal import common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
index cebb32746..3cf6eecc4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from . import (
+from aitemplate.backend.cuda.gemm_universal import (
     bmm_rcr_permute,
     bmm_rrr_permute,
     bmm_xxx,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 4f388af7a..b88c8c395 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -19,9 +19,9 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
index 222522396..568da302d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
@@ -15,11 +15,15 @@
 """
 Common functions and templates for bmm_permute-family ops
 """
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ..gemm_universal import common, common_bias
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
 
-from . import bmm_common, common_permute
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    common,
+    common_bias,
+    common_permute,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
index 8b66f8542..083f47f86 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
@@ -18,9 +18,14 @@
 A[RowMajor], B[ColMajor], bias[RowMajor]
 """
 
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, bmm_permute_common, common, common_permute
+from aitemplate.backend import registry
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    bmm_permute_common,
+    common,
+    common_permute,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
index 670936784..379f0faf9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
@@ -18,9 +18,14 @@
 A[RowMajor], B[RowMajor], bias / C[RowMajor]
 """
 
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, bmm_permute_common, common, common_permute
+from aitemplate.backend import registry
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    bmm_permute_common,
+    common,
+    common_permute,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py
index 742d601a0..8fad72cc9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 
-from ... import registry
+from aitemplate.backend import registry
 
 
 @registry.reg("cuda.bmm_softmax_bmm_permute.func_decl")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
index 7e9497433..c10837f42 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
@@ -13,8 +13,8 @@
 #  limitations under the License.
 #
 
-from ... import registry
-from . import bmm_common, common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 
 """
 Codegen for 8 bmm_xxx ops, which compute A @ B + bias. The ops differ in
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
index aae526481..3f2aaedbd 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
@@ -24,10 +24,10 @@
 """
 
 
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, common
-from .bmm_xxx import _get_problem_args, get_config
+from aitemplate.backend import registry
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
+from aitemplate.backend.cuda.gemm_universal.bmm_xxx import _get_problem_args, get_config
 
 
 def get_gen_function(a_layout, b_layout, c_layout):
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index d6ef0ecc9..a0c54deb8 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -25,13 +25,13 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
-from ....utils import alignment
+from aitemplate.backend.backend_spec import CUDASpec
 
-from ...backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common, tensor_accessor_codegen
+from aitemplate.backend.target import Target
 
-from ...common import gemm_common, tensor_accessor_codegen
-from ...target import Target
+from aitemplate.compiler.base import IntImm
+from aitemplate.utils import alignment
 
 # pylint: disable=C0301,C0415,R1705
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
index bd7e437e4..06ed9ef3c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
@@ -17,9 +17,9 @@
 Common codegen functions for gemm_bias_activation.
 """
 
-from ...backend_spec import CUDASpec
-from . import common, common_bias, gemm_rcr
-from .layout import RCR
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common, common_bias, gemm_rcr
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 2e5f9de6d..4c19576fa 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -22,11 +22,11 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ...target import Target
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
 
-from . import common, gemm_rcr
+from aitemplate.backend.cuda.gemm_universal import common, gemm_rcr
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
index 378911608..7625580fa 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
@@ -22,9 +22,9 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...common import gemm_common
-from ..gemm_universal import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0301,C0415,R1705
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index 7ab432ebd..ba598b965 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -19,11 +19,11 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from . import common
-from .layout import RCR
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index 11642f13d..7f12c6961 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -19,10 +19,10 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from . import common, common_bias, gemm_rcr
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common, common_bias, gemm_rcr
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
index 0dd38d9d7..436d3101e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
@@ -16,9 +16,9 @@
 GEMM Specialization for
 C = UnaryOp2(BinaryOp2(BinaryOp1(UnaryOp1(GeMM(A, B) + bias), D1), D2)),
 """
-from ... import registry
-from . import common, common_bias_broadcast
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_broadcast
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index d6bcb1b16..f0757d202 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -18,8 +18,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index 9c0deed3e..b7dcfb475 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -18,8 +18,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index 084baa41e..dd8ab1177 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -18,8 +18,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
index 6abdcc977..36cfe36ab 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py
@@ -16,10 +16,15 @@
 GEMM with bias and permute epilogue fusion
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ..gemm_universal import common
-from . import common_bias, common_permute, gemm_rcr_bias, gemm_rcr_permute
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import (
+    common,
+    common_bias,
+    common_permute,
+    gemm_rcr_bias,
+    gemm_rcr_permute,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index fb7c0e17d..83643889b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -19,8 +19,8 @@
 
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index fd49dad6b..8c2adb852 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -19,8 +19,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index 3899a79bb..358616679 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -19,8 +19,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 18c889a13..1828195de 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -19,8 +19,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, common_bias_activation
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, common_bias_activation
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
index d63c87c49..f324370ec 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -18,10 +18,14 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from . import common, common_bias_activation, common_no_bias
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import (
+    common,
+    common_bias_activation,
+    common_no_bias,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
index 8931b62f7..8814248de 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import common, common_permute
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
index e9741f320..f90882e31 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute_elup1.py
@@ -19,8 +19,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import gemm_rcr_permute
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import gemm_rcr_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 306280e20..37e2bd064 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -19,10 +19,10 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
index f34e8315d..e8bd44869 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
@@ -19,10 +19,10 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from . import common, common_permute
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common, common_permute
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 41cb8f444..f04abba51 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -21,9 +21,9 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...common import tensor_accessor_codegen
-from . import common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.cuda.gemm_universal import common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
index b270f99d4..9e57686b4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from . import group_common
+from aitemplate.backend.cuda.gemm_universal import group_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
index 6011a6bad..03acac5df 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py
@@ -17,9 +17,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, group_common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import common, group_common
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
index 0f395982d..631306f4e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
@@ -15,8 +15,12 @@
 """
 Codegen functions for group_gemm_rcr_bias.
 """
-from ... import registry
-from . import common, group_common_bias, group_gemm_rcr
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import (
+    common,
+    group_common_bias,
+    group_gemm_rcr,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
index b295fc1e4..eb5eaa8bb 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -15,8 +15,12 @@
 """
 Codegen functions for group_gemm_rcr_bias_relu.
 """
-from ... import registry
-from . import common, group_common_bias, group_gemm_rcr
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import (
+    common,
+    group_common_bias,
+    group_gemm_rcr,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index 4f05d1108..29f0e76bc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -15,8 +15,12 @@
 """
 Codegen functions for group_gemm_rcr_bias_sigmoid.
 """
-from ... import registry
-from . import common, group_common_bias, group_gemm_rcr
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import (
+    common,
+    group_common_bias,
+    group_gemm_rcr,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
index 20e688383..dd20aed8f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
@@ -16,8 +16,8 @@
 Codegen functions for perm021fc_ccr, which computes
 [b, m, n] = bmm([b, k, m], [1, n, k]).
 """
-from ... import registry
-from . import bmm_common, common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
index 69712f30f..d3946f532 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py
@@ -16,8 +16,13 @@
 Codegen functions for perm021fc_ccr_bias, which computes
 [b, m, n] = bmm([b, k, m], [1, n, k]) + bias[n].
 """
-from ... import registry
-from . import bmm_common, common, common_bias, perm021fc_ccr
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    common,
+    common_bias,
+    perm021fc_ccr,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
index 77a59f21a..1641a2b95 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -16,9 +16,9 @@
 Common functions and templates for perm021_ccr_bias_permute, which computes
 (A.permute(0, 2, 1)[col] @ B[col] + Bias).permute(0, 2, 1)
 """
-from ... import registry
+from aitemplate.backend import registry
 
-from . import (
+from aitemplate.backend.cuda.gemm_universal import (
     bmm_common,
     bmm_permute_common,
     common,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
index 98daa99b9..cbe218e54 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
@@ -16,8 +16,8 @@
 Codegen functions for perm021fc_crc, which computes
 [b, n, m](col) = bmm([1, k, n](col), [b, k, m](row)).
 """
-from ... import registry
-from . import bmm_common, common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
index 3e6497c76..75abc8b6d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
@@ -16,8 +16,13 @@
 Codegen functions for perm021fc_crc_bias, which computes
 [b, n, m](col) = bmm([1, k, n](col), [b, k, m](row)) + bias[n].
 """
-from ... import registry
-from . import bmm_common, common, common_bias, perm021fc_crc
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    common,
+    common_bias,
+    perm021fc_crc,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
index 36b9ceda1..cafdf96ac 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
@@ -16,9 +16,9 @@
 Codegen functions for perm102_bmm_rcr, which computes
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col))
 """
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import bmm_common, common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
index 92afe0ca5..99fd2e644 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
@@ -16,10 +16,17 @@
 Codegen functions for perm102_bmm_rcr_bias, which computes
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col)) + bias[n].
 """
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import bmm_common, common, common_bias, perm102_bmm_rcr
-from .perm102_bmm_rcr import get_output_addr_calculator
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    common,
+    common_bias,
+    perm102_bmm_rcr,
+)
+from aitemplate.backend.cuda.gemm_universal.perm102_bmm_rcr import (
+    get_output_addr_calculator,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
index 2b3d78fd6..ff2103afe 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
@@ -16,10 +16,12 @@
 Codegen functions for perm102_bmm_rrr, which computes
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row))
 """
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import bmm_common, common
-from .perm102_bmm_rcr import get_output_addr_calculator
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import bmm_common, common
+from aitemplate.backend.cuda.gemm_universal.perm102_bmm_rcr import (
+    get_output_addr_calculator,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
index e065d70c1..718867ade 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
@@ -16,10 +16,17 @@
 Codegen functions for perm102_bmm_rrr_bias, which computes
 C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row)) + bias[n]
 """
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import bmm_common, common, common_bias, perm102_bmm_rrr
-from .perm102_bmm_rcr import get_output_addr_calculator
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import (
+    bmm_common,
+    common,
+    common_bias,
+    perm102_bmm_rrr,
+)
+from aitemplate.backend.cuda.gemm_universal.perm102_bmm_rcr import (
+    get_output_addr_calculator,
+)
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
diff --git a/python/aitemplate/backend/cuda/groupnorm/__init__.py b/python/aitemplate/backend/cuda/groupnorm/__init__.py
index ee950628c..f98ae7ce9 100644
--- a/python/aitemplate/backend/cuda/groupnorm/__init__.py
+++ b/python/aitemplate/backend/cuda/groupnorm/__init__.py
@@ -12,6 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from . import groupnorm, groupnorm_swish
+from aitemplate.backend.cuda.groupnorm import groupnorm, groupnorm_swish
 
 __all__ = ["groupnorm", "groupnorm_swish"]
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm.py
index e26d8cd62..6bb632055 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm.py
@@ -14,9 +14,9 @@
 #
 from typing import Any, Dict
 
-from ... import registry
+from aitemplate.backend import registry
 
-from .groupnorm_common import (
+from aitemplate.backend.cuda.groupnorm.groupnorm_common import (
     groupnorm_gen_func_call,
     groupnorm_gen_func_decl,
     groupnorm_gen_function,
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 06db17cc3..39a44ebd3 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -21,8 +21,8 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...target import Target
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
 
 FUNC_SIGNATURE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_swish.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_swish.py
index 0f2b00dac..3106ce62d 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_swish.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_swish.py
@@ -14,9 +14,9 @@
 #
 from typing import Any, Dict
 
-from ... import registry
+from aitemplate.backend import registry
 
-from .groupnorm_common import (
+from aitemplate.backend.cuda.groupnorm.groupnorm_common import (
     groupnorm_gen_func_call,
     groupnorm_gen_func_decl,
     groupnorm_gen_function,
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/__init__.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/__init__.py
index 4525406e0..c8fd30caf 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/__init__.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/__init__.py
@@ -15,7 +15,7 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import (
+from aitemplate.backend.cuda.layernorm_sigmoid_mul import (
     batch_layernorm_sigmoid_mul,
     group_layernorm_sigmoid_mul,
     layernorm_sigmoid_mul,
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
index 62419c299..f0be34b94 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import tensor_accessor_codegen
-from ...target import Target
-from . import layernorm_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.cuda.layernorm_sigmoid_mul import layernorm_common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
index 7937338b1..2bf72cb3b 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import tensor_accessor_codegen
-from ...target import Target
-from . import layernorm_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.cuda.layernorm_sigmoid_mul import layernorm_common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
index 9a1452822..ec791eb0f 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import tensor_accessor_codegen
-from ...target import Target
-from . import layernorm_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.cuda.layernorm_sigmoid_mul import layernorm_common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/lib_template.py b/python/aitemplate/backend/cuda/lib_template.py
index 67d6d76b9..1c42108b1 100644
--- a/python/aitemplate/backend/cuda/lib_template.py
+++ b/python/aitemplate/backend/cuda/lib_template.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from .. import registry
+from aitemplate.backend import registry
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/padding/__init__.py b/python/aitemplate/backend/cuda/padding/__init__.py
index 37bb6eedb..807b81bc4 100644
--- a/python/aitemplate/backend/cuda/padding/__init__.py
+++ b/python/aitemplate/backend/cuda/padding/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA padding init
 """
-from . import ndhwc3to8, nhwc3to4, nhwc3to8, pad_last_dim
+from aitemplate.backend.cuda.padding import ndhwc3to8, nhwc3to4, nhwc3to8, pad_last_dim
 
 __all__ = ["ndhwc3to8", "nhwc3to8", "pad_last_dim", "nhwc3to4"]
diff --git a/python/aitemplate/backend/cuda/padding/ndhwc3to8.py b/python/aitemplate/backend/cuda/padding/ndhwc3to8.py
index 6aaca1218..bb03c0b16 100644
--- a/python/aitemplate/backend/cuda/padding/ndhwc3to8.py
+++ b/python/aitemplate/backend/cuda/padding/ndhwc3to8.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/padding/nhwc3to4.py b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
index 2b56539ad..c07f8bc33 100644
--- a/python/aitemplate/backend/cuda/padding/nhwc3to4.py
+++ b/python/aitemplate/backend/cuda/padding/nhwc3to4.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/padding/nhwc3to8.py b/python/aitemplate/backend/cuda/padding/nhwc3to8.py
index 5f66c9be7..0f4e4eb52 100644
--- a/python/aitemplate/backend/cuda/padding/nhwc3to8.py
+++ b/python/aitemplate/backend/cuda/padding/nhwc3to8.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/padding/pad_last_dim.py b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
index a8ac6ec68..bc9ebe4e6 100644
--- a/python/aitemplate/backend/cuda/padding/pad_last_dim.py
+++ b/python/aitemplate/backend/cuda/padding/pad_last_dim.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/pool2d/__init__.py b/python/aitemplate/backend/cuda/pool2d/__init__.py
index 2d21ced04..437cf7395 100644
--- a/python/aitemplate/backend/cuda/pool2d/__init__.py
+++ b/python/aitemplate/backend/cuda/pool2d/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA pool2d module init
 """
-from . import avg_pool2d, max_pool2d
+from aitemplate.backend.cuda.pool2d import avg_pool2d, max_pool2d
 
 __all__ = ["avg_pool2d", "max_pool2d"]
diff --git a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
index d82df77bf..745fc62cb 100644
--- a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
@@ -18,10 +18,10 @@
 
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import pool2d
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.pool2d import pool2d
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
index ca4a0c20c..ad6d8d761 100644
--- a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
@@ -18,10 +18,10 @@
 
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from . import pool2d
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.pool2d import pool2d
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/reduce/__init__.py b/python/aitemplate/backend/cuda/reduce/__init__.py
index 0535d8a33..feb5cde4c 100644
--- a/python/aitemplate/backend/cuda/reduce/__init__.py
+++ b/python/aitemplate/backend/cuda/reduce/__init__.py
@@ -15,7 +15,14 @@
 """
 CUDA reduce module init
 """
-from . import reduce_3d, reduce_common, reduce_mean, reduce_sum, var, vector_norm
+from aitemplate.backend.cuda.reduce import (
+    reduce_3d,
+    reduce_common,
+    reduce_mean,
+    reduce_sum,
+    var,
+    vector_norm,
+)
 
 __all__ = [
     "reduce_3d",
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_3d.py b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
index 08557c484..c8728b9b1 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_3d.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
@@ -24,10 +24,10 @@
 
 import jinja2
 
-from ...backend_spec import CUDASpec
-from ...common import tensor_accessor_codegen
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
 
-from . import reduce_small_axis
+from aitemplate.backend.cuda.reduce import reduce_small_axis
 
 
 DEFAULT_PROLOGUE_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_common.py b/python/aitemplate/backend/cuda/reduce/reduce_common.py
index ff8d65c12..c2416e1e9 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_common.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_common.py
@@ -17,8 +17,9 @@
 """
 import jinja2
 
-from ....compiler.base import IntImm, IntVar
-from ...backend_spec import CUDASpec
+from aitemplate.backend.backend_spec import CUDASpec
+
+from aitemplate.compiler.base import IntImm, IntVar
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_mean.py b/python/aitemplate/backend/cuda/reduce/reduce_mean.py
index 521e18a23..56fcf9fad 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_mean.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_mean.py
@@ -18,8 +18,8 @@
 
 import jinja2
 
-from ... import registry
-from . import reduce_3d
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.reduce import reduce_3d
 
 
 EPILOGUE_SCALAR_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
index 72d54661b..14c7c8584 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
@@ -21,7 +21,7 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.compiler.base import IntImm
 
 
 EXEC_COND_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_sum.py b/python/aitemplate/backend/cuda/reduce/reduce_sum.py
index a30c91bfb..fa3406ba7 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_sum.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_sum.py
@@ -24,8 +24,8 @@
 epilogue so it is more general than reduce_common.
 """
 
-from ... import registry
-from . import reduce_3d, reduce_common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.reduce import reduce_3d, reduce_common
 
 
 def _is_last_reduction_dim(func_attrs):
diff --git a/python/aitemplate/backend/cuda/reduce/var.py b/python/aitemplate/backend/cuda/reduce/var.py
index 25ed40eb7..754b07cf8 100644
--- a/python/aitemplate/backend/cuda/reduce/var.py
+++ b/python/aitemplate/backend/cuda/reduce/var.py
@@ -21,9 +21,9 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import reduce_3d
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.reduce import reduce_3d
 
 
 EXTRA_CODE_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/reduce/vector_norm.py b/python/aitemplate/backend/cuda/reduce/vector_norm.py
index c212a66a1..2159ed7f2 100644
--- a/python/aitemplate/backend/cuda/reduce/vector_norm.py
+++ b/python/aitemplate/backend/cuda/reduce/vector_norm.py
@@ -18,8 +18,8 @@
 
 import jinja2
 
-from ... import registry
-from . import reduce_3d
+from aitemplate.backend import registry
+from aitemplate.backend.cuda.reduce import reduce_3d
 
 
 L2_NORM_PROLOGUE_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/softmax/__init__.py b/python/aitemplate/backend/cuda/softmax/__init__.py
index 3b3b17330..615fd1954 100644
--- a/python/aitemplate/backend/cuda/softmax/__init__.py
+++ b/python/aitemplate/backend/cuda/softmax/__init__.py
@@ -15,6 +15,6 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import softmax
+from aitemplate.backend.cuda.softmax import softmax
 
 __all__ = ["softmax"]
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index 5d7934174..e8fcd0dab 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...target import Target
+from aitemplate.compiler.base import IntImm
 
 # pylint: disable=C0301, C0116
 
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index dcadb16a3..040c81bc8 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -27,15 +27,19 @@
 from pathlib import Path
 from typing import List
 
-from aitemplate.backend.profiler_cache import ProfileCacheDB
+from aitemplate.backend import registry
 
-from aitemplate.backend.target import TargetType
+from aitemplate.backend.profiler_cache import ProfileCacheDB
 
-from ...utils import environ
-from ...utils.misc import is_debug
+from aitemplate.backend.target import (
+    AIT_STATIC_FILES_PATH,
+    CUTLASS_PATH,
+    Target,
+    TargetType,
+)
 
-from .. import registry
-from ..target import AIT_STATIC_FILES_PATH, CUTLASS_PATH, Target
+from aitemplate.utils import environ
+from aitemplate.utils.misc import is_debug
 
 # pylint: disable=C0415,W0707,W0611,W0702,W1401
 
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index 9a400798f..ea7cd5d26 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -15,7 +15,7 @@
 """
 CUDA tensor ops module init
 """
-from . import (
+from aitemplate.backend.cuda.tensor import (
     argmax,
     batch_gather,
     concatenate,
diff --git a/python/aitemplate/backend/cuda/tensor/argmax.py b/python/aitemplate/backend/cuda/tensor/argmax.py
index 9f82e584d..0c3784d0f 100644
--- a/python/aitemplate/backend/cuda/tensor/argmax.py
+++ b/python/aitemplate/backend/cuda/tensor/argmax.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import argmax_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import argmax_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/tensor/batch_gather.py b/python/aitemplate/backend/cuda/tensor/batch_gather.py
index 721bbb84b..56bc43bbb 100644
--- a/python/aitemplate/backend/cuda/tensor/batch_gather.py
+++ b/python/aitemplate/backend/cuda/tensor/batch_gather.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import batch_gather_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import batch_gather_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate.py b/python/aitemplate/backend/cuda/tensor/concatenate.py
index a0ef2a035..fdc7adb3c 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate.py
+++ b/python/aitemplate/backend/cuda/tensor/concatenate.py
@@ -16,10 +16,10 @@
 CUDA concatenate function
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import concatenate_common
-from . import concatenate_fast
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import concatenate_common
+from aitemplate.backend.cuda.tensor import concatenate_fast
 
 
 def _is_valid_fast_cat(func_attrs):
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_fast.py b/python/aitemplate/backend/cuda/tensor/concatenate_fast.py
index ee74d4509..fb58b6bac 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate_fast.py
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_fast.py
@@ -17,12 +17,12 @@
 
 import jinja2
 
-from ....compiler.ops.tensor import concatenate
+from aitemplate.backend.backend_spec import CUDASpec
 
-from ...backend_spec import CUDASpec
+from aitemplate.backend.common import tensor_accessor_codegen
+from aitemplate.backend.target import Target
 
-from ...common import tensor_accessor_codegen
-from ...target import Target
+from aitemplate.compiler.ops.tensor import concatenate
 
 
 KERNEL_SRC_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py b/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
index c03d6d250..833bc9c22 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_tanh.py
@@ -17,9 +17,9 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from . import concatenate
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.tensor import concatenate
 
 
 TANH_DEF = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/tensor/dynamic_slice.py b/python/aitemplate/backend/cuda/tensor/dynamic_slice.py
index cee387517..a948e4efd 100644
--- a/python/aitemplate/backend/cuda/tensor/dynamic_slice.py
+++ b/python/aitemplate/backend/cuda/tensor/dynamic_slice.py
@@ -16,9 +16,9 @@
 Dynamic slice CUDA implementation.
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import slice_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import slice_common
 
 
 @registry.reg("cuda.dynamic_slice.func_decl")
diff --git a/python/aitemplate/backend/cuda/tensor/gather.py b/python/aitemplate/backend/cuda/tensor/gather.py
index f8ecf17a9..0841dcb18 100644
--- a/python/aitemplate/backend/cuda/tensor/gather.py
+++ b/python/aitemplate/backend/cuda/tensor/gather.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from .. import cuda_common
+from aitemplate.backend import registry
+from aitemplate.backend.cuda import cuda_common
 
 CAST_TO_CONST_INDEX_PTR_TEMPLATE = jinja2.Template(
     "reinterpret_cast<const {{index_type}}*>({{name}})"
diff --git a/python/aitemplate/backend/cuda/tensor/masked_select.py b/python/aitemplate/backend/cuda/tensor/masked_select.py
index dce3ca88d..534f5746c 100644
--- a/python/aitemplate/backend/cuda/tensor/masked_select.py
+++ b/python/aitemplate/backend/cuda/tensor/masked_select.py
@@ -17,10 +17,10 @@
 """
 import jinja2
 
-from ... import registry
+from aitemplate.backend import registry
 
-from ...backend_spec import CUDASpec
-from .. import cuda_common
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda import cuda_common
 
 
 header_files = """
diff --git a/python/aitemplate/backend/cuda/tensor/permute.py b/python/aitemplate/backend/cuda/tensor/permute.py
index e23aef5ff..c437a257f 100644
--- a/python/aitemplate/backend/cuda/tensor/permute.py
+++ b/python/aitemplate/backend/cuda/tensor/permute.py
@@ -20,10 +20,10 @@
 
 import jinja2
 
-from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend import registry
 
-from ... import registry
-from ...target import Target
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
 
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/cuda/tensor/permute021.py b/python/aitemplate/backend/cuda/tensor/permute021.py
index 95015cf14..d53f6e902 100644
--- a/python/aitemplate/backend/cuda/tensor/permute021.py
+++ b/python/aitemplate/backend/cuda/tensor/permute021.py
@@ -16,9 +16,9 @@
 permute021 for cuda
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import permute021_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import permute021_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute0213.py b/python/aitemplate/backend/cuda/tensor/permute0213.py
index 2d84d3299..29143d156 100644
--- a/python/aitemplate/backend/cuda/tensor/permute0213.py
+++ b/python/aitemplate/backend/cuda/tensor/permute0213.py
@@ -16,9 +16,9 @@
 permute0213 for cuda
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import permute0213_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import permute0213_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute102.py b/python/aitemplate/backend/cuda/tensor/permute102.py
index d28b36aa8..a1457521b 100644
--- a/python/aitemplate/backend/cuda/tensor/permute102.py
+++ b/python/aitemplate/backend/cuda/tensor/permute102.py
@@ -16,9 +16,9 @@
 permute102 for cuda
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import permute102_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import permute102_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/tensor/permute210.py b/python/aitemplate/backend/cuda/tensor/permute210.py
index d029277e2..4084bbe6b 100644
--- a/python/aitemplate/backend/cuda/tensor/permute210.py
+++ b/python/aitemplate/backend/cuda/tensor/permute210.py
@@ -16,9 +16,9 @@
 permute210 for cuda
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import permute210_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import permute210_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/cuda/tensor/slice_reshape_scatter.py b/python/aitemplate/backend/cuda/tensor/slice_reshape_scatter.py
index 15d54efc5..c1552ddb9 100644
--- a/python/aitemplate/backend/cuda/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/backend/cuda/tensor/slice_reshape_scatter.py
@@ -17,9 +17,9 @@
 """
 import jinja2
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import slice_reshape_scatter_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import slice_reshape_scatter_common
 
 OUTPUT_DIM_DEF_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/cuda/tensor/slice_scatter.py b/python/aitemplate/backend/cuda/tensor/slice_scatter.py
index f42524c01..254193f59 100644
--- a/python/aitemplate/backend/cuda/tensor/slice_scatter.py
+++ b/python/aitemplate/backend/cuda/tensor/slice_scatter.py
@@ -16,9 +16,9 @@
 Slice scatter CUDA implementation.
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import slice_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import slice_common
 
 
 @registry.reg("cuda.slice_scatter.func_decl")
diff --git a/python/aitemplate/backend/cuda/tensor/split.py b/python/aitemplate/backend/cuda/tensor/split.py
index b0bf6c531..257571158 100644
--- a/python/aitemplate/backend/cuda/tensor/split.py
+++ b/python/aitemplate/backend/cuda/tensor/split.py
@@ -16,9 +16,9 @@
 CUDA concatenate function
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import split_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import split_common
 
 
 @registry.reg("cuda.split.func_decl")
diff --git a/python/aitemplate/backend/cuda/tensor/topk.py b/python/aitemplate/backend/cuda/tensor/topk.py
index 36916d4e2..f6046c7c6 100644
--- a/python/aitemplate/backend/cuda/tensor/topk.py
+++ b/python/aitemplate/backend/cuda/tensor/topk.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common.tensor import topk_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import topk_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/upsample/__init__.py b/python/aitemplate/backend/cuda/upsample/__init__.py
index 98b87b6d8..f7fa8ce45 100644
--- a/python/aitemplate/backend/cuda/upsample/__init__.py
+++ b/python/aitemplate/backend/cuda/upsample/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA upsampling module init
 """
-from . import upsampling2d, upsampling2d_add
+from aitemplate.backend.cuda.upsample import upsampling2d, upsampling2d_add
 
 __all__ = ["upsampling2d", "upsampling2d_add"]
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d.py b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
index 795f857f2..31a7ec55e 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
@@ -16,9 +16,9 @@
 Codegen functions for upsampling2d.
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import upsampling2d_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import upsampling2d_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
index 8015ed78d..76fdcdc8a 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
@@ -16,9 +16,9 @@
 Codegen functions for upsampling2d_add.
 """
 
-from ... import registry
-from ...backend_spec import CUDASpec
-from ...common import upsampling2d_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common import upsampling2d_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/utils.py b/python/aitemplate/backend/cuda/utils.py
index 14f8fa3c1..751a3b7cf 100644
--- a/python/aitemplate/backend/cuda/utils.py
+++ b/python/aitemplate/backend/cuda/utils.py
@@ -17,9 +17,9 @@
 """
 import logging
 
-from aitemplate.utils.mk_cutlass_lib.mk_cutlass_lib import mk_cutlass_lib
+from aitemplate.backend import registry
 
-from .. import registry
+from aitemplate.utils.mk_cutlass_lib.mk_cutlass_lib import mk_cutlass_lib
 
 # pylint: disable=C0103,C0415,W0707
 
diff --git a/python/aitemplate/backend/cuda/view_ops/__init__.py b/python/aitemplate/backend/cuda/view_ops/__init__.py
index 1b7b20efc..b2be80a1a 100644
--- a/python/aitemplate/backend/cuda/view_ops/__init__.py
+++ b/python/aitemplate/backend/cuda/view_ops/__init__.py
@@ -15,7 +15,7 @@
 """
 CUDA view_ops module init
 """
-from . import make_jagged, view_ops
+from aitemplate.backend.cuda.view_ops import make_jagged, view_ops
 
 __all__ = [
     "view_ops",
diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index 16a09d23f..0fc06ec2f 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -30,7 +30,7 @@
 """
 import jinja2
 
-from ....backend import registry
+from aitemplate.backend import registry
 
 
 SRC_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/cuda/view_ops/view_ops.py b/python/aitemplate/backend/cuda/view_ops/view_ops.py
index 792f7b1de..502b66bea 100644
--- a/python/aitemplate/backend/cuda/view_ops/view_ops.py
+++ b/python/aitemplate/backend/cuda/view_ops/view_ops.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from ....backend import registry
+from aitemplate.backend import registry
 
 SRC_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/cuda/vision_ops/__init__.py b/python/aitemplate/backend/cuda/vision_ops/__init__.py
index 5cdda2760..82940f11a 100644
--- a/python/aitemplate/backend/cuda/vision_ops/__init__.py
+++ b/python/aitemplate/backend/cuda/vision_ops/__init__.py
@@ -17,5 +17,5 @@
 """
 # flake8: noqa
 
-from .nms import *
-from .roi_ops import *
+from aitemplate.backend.cuda.vision_ops.nms import *
+from aitemplate.backend.cuda.vision_ops.roi_ops import *
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/__init__.py b/python/aitemplate/backend/cuda/vision_ops/nms/__init__.py
index 280f1ada0..4f47cf2d8 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/__init__.py
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/__init__.py
@@ -15,4 +15,8 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import batched_nms, efficient_nms, nms  # noqa
+from aitemplate.backend.cuda.vision_ops.nms import (  # noqa
+    batched_nms,
+    efficient_nms,
+    nms,
+)
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
index f5efb3df0..3c83d0003 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py
@@ -21,8 +21,8 @@
 
 import jinja2
 
-from .... import registry
-from ....backend_spec import CUDASpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/efficient_nms.py b/python/aitemplate/backend/cuda/vision_ops/nms/efficient_nms.py
index c7ecca653..89ab885d9 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/efficient_nms.py
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/efficient_nms.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from .... import registry
-from ....backend_spec import CUDASpec
-from ....common.vision_ops import efficient_nms_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.vision_ops import efficient_nms_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/vision_ops/nms/nms.py b/python/aitemplate/backend/cuda/vision_ops/nms/nms.py
index ac4780747..a4d7f6839 100644
--- a/python/aitemplate/backend/cuda/vision_ops/nms/nms.py
+++ b/python/aitemplate/backend/cuda/vision_ops/nms/nms.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from .... import registry
-from ....backend_spec import CUDASpec
-from ....common.vision_ops import nms_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.vision_ops import nms_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/__init__.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/__init__.py
index bbdaf07a4..5959e1a3b 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/__init__.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/__init__.py
@@ -15,6 +15,6 @@
 """
 CUDA roi_align module init
 """
-from . import multi_level_roi_align, roi_align
+from aitemplate.backend.cuda.vision_ops.roi_ops import multi_level_roi_align, roi_align
 
 __all__ = ["roi_align", "multi_level_roi_align"]
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
index 89e608d5d..64c604f96 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
@@ -17,9 +17,9 @@
 """
 import jinja2
 
-from .... import registry
-from ....backend_spec import CUDASpec
-from ....common.vision_ops import multi_level_roi_align_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.vision_ops import multi_level_roi_align_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
index 1597e848c..3726fdbf0 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
@@ -18,9 +18,9 @@
 
 import jinja2
 
-from .... import registry
-from ....backend_spec import CUDASpec
-from ....common.vision_ops import roi_align_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.vision_ops import roi_align_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 612c514a1..204e09e1b 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -28,10 +28,10 @@
 from queue import Queue
 from typing import Callable, List, Tuple
 
-from aitemplate.testing import detect_target
+from aitemplate.backend.target import Target
+from aitemplate.backend.task_runner import BaseRunner, Task
 
-from .target import Target
-from .task_runner import BaseRunner, Task
+from aitemplate.testing import detect_target
 
 # pylint: disable=W0221
 
diff --git a/python/aitemplate/backend/rocm/__init__.py b/python/aitemplate/backend/rocm/__init__.py
index 28df18128..fe26a8c3d 100644
--- a/python/aitemplate/backend/rocm/__init__.py
+++ b/python/aitemplate/backend/rocm/__init__.py
@@ -16,15 +16,15 @@
 """
 Rocm backend init.
 """
-from . import lib_template, target_def, utils
-from .common import *
-from .conv2d import *
-from .gemm import *
-from .pool2d import *
-from .view_ops import *
-from .elementwise import *
-from .tensor import *
-from .normalization import softmax
-from .upsample import *
-from .vision_ops import *
-from .normalization import groupnorm, groupnorm_swish, layernorm
+from aitemplate.backend.rocm import lib_template, target_def, utils
+from aitemplate.backend.rocm.common import *
+from aitemplate.backend.rocm.conv2d import *
+from aitemplate.backend.rocm.gemm import *
+from aitemplate.backend.rocm.pool2d import *
+from aitemplate.backend.rocm.view_ops import *
+from aitemplate.backend.rocm.elementwise import *
+from aitemplate.backend.rocm.tensor import *
+from aitemplate.backend.rocm.normalization import softmax
+from aitemplate.backend.rocm.upsample import *
+from aitemplate.backend.rocm.vision_ops import *
+from aitemplate.backend.rocm.normalization import groupnorm, groupnorm_swish, layernorm
diff --git a/python/aitemplate/backend/rocm/common/__init__.py b/python/aitemplate/backend/rocm/common/__init__.py
index 50ab82434..3e6e5152f 100644
--- a/python/aitemplate/backend/rocm/common/__init__.py
+++ b/python/aitemplate/backend/rocm/common/__init__.py
@@ -16,4 +16,4 @@
 """
 ROCM Common module init
 """
-from .dummy_op import *
+from aitemplate.backend.rocm.common.dummy_op import *
diff --git a/python/aitemplate/backend/rocm/common/dummy_op.py b/python/aitemplate/backend/rocm/common/dummy_op.py
index e4342ff43..5cbea271a 100644
--- a/python/aitemplate/backend/rocm/common/dummy_op.py
+++ b/python/aitemplate/backend/rocm/common/dummy_op.py
@@ -18,7 +18,7 @@
 
 from typing import Any, Dict
 
-from ... import registry
+from aitemplate.backend import registry
 
 
 @registry.reg("rocm.size.gen_function")
diff --git a/python/aitemplate/backend/rocm/conv2d/__init__.py b/python/aitemplate/backend/rocm/conv2d/__init__.py
index 8a330f108..989ea243f 100644
--- a/python/aitemplate/backend/rocm/conv2d/__init__.py
+++ b/python/aitemplate/backend/rocm/conv2d/__init__.py
@@ -15,7 +15,7 @@
 """
 ROCM conv2d init.
 """
-from . import (
+from aitemplate.backend.rocm.conv2d import (
     conv2d,
     conv2d_bias,
     conv2d_bias_add_relu,
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index c2d91d5d2..0dc1a98bb 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -22,7 +22,7 @@
 
 import jinja2
 
-from ...target import Target
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0103,C0415,W0611,C0301
 
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d.py b/python/aitemplate/backend/rocm/conv2d/conv2d.py
index c8191c19a..e724d8e54 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d.py
@@ -15,8 +15,8 @@
 """
 ROCM codegen functions for conv2d.
 """
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
index ccbc265dd..b9956922b 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
@@ -15,8 +15,8 @@
 """
 ROCM codegen functions for Conv2dBias: conv2d(w, x) + b
 """
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
index 5ae33fd39..fc424e43a 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
index ddbcaecd3..0a48bf6e3 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
@@ -15,8 +15,8 @@
 """
 ROCM codegen functions for conv2d_bias_relu.
 """
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index 2ca81637f..7458226e9 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
index f07a8f17a..b4dfc7c7e 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
index 0be5a94e6..5053e58aa 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.conv2d import common
 
 # pylint: disable=C0103,C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/elementwise/__init__.py b/python/aitemplate/backend/rocm/elementwise/__init__.py
index 0bf6e473f..4594bf9ec 100644
--- a/python/aitemplate/backend/rocm/elementwise/__init__.py
+++ b/python/aitemplate/backend/rocm/elementwise/__init__.py
@@ -15,6 +15,6 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import fused_elementwise
+from aitemplate.backend.rocm.elementwise import fused_elementwise
 
 __all__ = ["fused_elementwise"]
diff --git a/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py b/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
index 177d84cd1..b6441bf5c 100644
--- a/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
@@ -19,10 +19,10 @@
 import os
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common import elementwise_common
-from ...target import Target
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common import elementwise_common
+from aitemplate.backend.target import Target
 
 
 HEAD_TEMPLATE = """
diff --git a/python/aitemplate/backend/rocm/gemm/__init__.py b/python/aitemplate/backend/rocm/gemm/__init__.py
index ce3fefe28..38c659280 100644
--- a/python/aitemplate/backend/rocm/gemm/__init__.py
+++ b/python/aitemplate/backend/rocm/gemm/__init__.py
@@ -15,7 +15,7 @@
 """
 Rocm gemm init.
 """
-from . import (  # noqa: F401
+from aitemplate.backend.rocm.gemm import (  # noqa: F401
     bmm_ccr,
     bmm_crr,
     bmm_rcr,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
index b8c24f4af..e2a97fa0d 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, common
-from .layout import CCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import CCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_common.py b/python/aitemplate/backend/rocm/gemm/bmm_common.py
index 497eaf26b..67fdff617 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_common.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_common.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from . import common
+from aitemplate.backend.rocm.gemm import common
 
 EXTRA_SHAPE_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_crr.py b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
index 2369ffd45..02d176a77 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_crr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, common
-from .layout import CRR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import CRR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
index 3fe8e9529..8396335c1 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
index 9909b3e65..54c73b438 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, bmm_permute_common, common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, bmm_permute_common, common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
index 8e8646385..2d05afe05 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, common
-from .layout import RRR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import RRR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
index bedbc90ba..6d4fc73fd 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, bmm_permute_common, common
-from .layout import RRR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, bmm_permute_common, common
+from aitemplate.backend.rocm.gemm.layout import RRR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
index 3881b4879..ca6b9976a 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
@@ -19,9 +19,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import bmm_common, common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 EXTRA_CODE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
index cf9fdd752..040a3f455 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
@@ -24,10 +24,10 @@
 """
 import jinja2
 
-from ... import registry
-from ...common import gemm_common
-from . import bmm_common, common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.rocm.gemm import bmm_common, common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 INPUT_ADDR_CALCULATOR = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 4b3c1f351..6528b04a6 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -22,8 +22,8 @@
 
 import jinja2
 
-from ...common import gemm_common
-from ...target import Target
+from aitemplate.backend.common import gemm_common
+from aitemplate.backend.target import Target
 
 # pylint: disable=C0103,C0415,W0611,C0301
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_epilogue.py b/python/aitemplate/backend/rocm/gemm/gemm_epilogue.py
index 962b0a7d7..52edac942 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_epilogue.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_epilogue.py
@@ -17,7 +17,7 @@
 """
 from typing import Dict, List, NamedTuple
 
-from ....compiler.ops.common.epilogue import EpilogueOp
+from aitemplate.compiler.ops.common.epilogue import EpilogueOp
 
 
 class GeMMEpilogueSpec(NamedTuple):
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
index eaf160305..530196408 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear(bias=false)`
 When used for `linear`, need to set A->Data, B->Weight
 """
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
index 3eb456567..d092ae3c1 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
index c6b1e43c4..c567a649e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
index 83f2422e4..58527f1b0 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
index 4f3d7a3b9..8c5d20de8 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
index f40127ce0..18e179eca 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
index 376065ba8..ed4b039df 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear + swish`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
index 0741eb9d2..914c36c1e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
index 0591e573d..f013f3758 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 EXTRA_CODE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
index 5c925b21c..a34fe1952 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
index ffacf0417..ac5bbc6cc 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
-from ... import registry
-from . import common, permute_common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common, permute_common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 @registry.reg("rocm.gemm_rcr_bias_permute.config")
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
index a0b96d106..4d8ba2a14 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py
@@ -21,9 +21,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, permute_common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common, permute_common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
index 596dee60c..07df32276 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py
@@ -21,9 +21,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, permute_common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common, permute_common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
index e49bcd7ec..9725d980d 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear + relu`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
index 83531c77e..f9028a005 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
index 100805e2e..147e3ec4f 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
index dc73256ef..24a427528 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 EXTRA_CODE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
index cb298e6b1..e3a19c86d 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
@@ -19,9 +19,9 @@
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
index 5eaeb0686..acc6b1ca3 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
@@ -20,9 +20,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
index d9350fd20..6661fd1c3 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py
@@ -21,9 +21,9 @@
 """
 import jinja2
 
-from ... import registry
-from . import common, permute_common
-from .layout import RCR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common, permute_common
+from aitemplate.backend.rocm.gemm.layout import RCR
 
 
 ARGS_PARSER_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
index 414428906..dd6beb088 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
@@ -18,9 +18,9 @@
 This is used for `torch.mm`
 When used for `mm`, need to set A->Data, B->Weight
 """
-from ... import registry
-from . import common
-from .layout import RRR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common
+from aitemplate.backend.rocm.gemm.layout import RRR
 
 # pylint: disable=C0415,W0613
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
index 005f51bd3..ab34001c8 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py
@@ -18,9 +18,9 @@
 This is used for `torch.nn.functional.linear`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
-from ... import registry
-from . import common, permute_common
-from .layout import RRR
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.gemm import common, permute_common
+from aitemplate.backend.rocm.gemm.layout import RRR
 
 
 @registry.reg("rocm.gemm_rrr_bias_permute.config")
diff --git a/python/aitemplate/backend/rocm/lib_template.py b/python/aitemplate/backend/rocm/lib_template.py
index 9dfbf11e2..4e01c6bef 100644
--- a/python/aitemplate/backend/rocm/lib_template.py
+++ b/python/aitemplate/backend/rocm/lib_template.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from .. import registry
+from aitemplate.backend import registry
 
 # pylint: disable=W0613
 
diff --git a/python/aitemplate/backend/rocm/normalization/__init__.py b/python/aitemplate/backend/rocm/normalization/__init__.py
index fb90889b3..4585e7cee 100644
--- a/python/aitemplate/backend/rocm/normalization/__init__.py
+++ b/python/aitemplate/backend/rocm/normalization/__init__.py
@@ -15,4 +15,4 @@
 """
 Common modules for backends
 """
-from . import norm_common, softmax  # noqa
+from aitemplate.backend.rocm.normalization import norm_common, softmax  # noqa
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index b2dfffb64..978e3bef8 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.normalization import norm_common
+from aitemplate.backend.target import Target
 
-from ... import registry
-from ...target import Target
-from . import norm_common
+from aitemplate.compiler.base import IntImm
 
 EXTRA_HEADERS = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py b/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
index 01872be32..f9ad7dbec 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm_swish.py
@@ -14,9 +14,9 @@
 #
 from typing import Any, Dict
 
-from ... import registry
+from aitemplate.backend import registry
 
-from .groupnorm import (
+from aitemplate.backend.rocm.normalization.groupnorm import (
     groupnorm_extract_config,
     groupnorm_gen_func_call,
     groupnorm_gen_func_decl,
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index 0ca7e6052..b559a621e 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -21,11 +21,11 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.normalization import norm_common
+from aitemplate.backend.target import Target
 
-from ... import registry
-from ...target import Target
-from . import norm_common
+from aitemplate.compiler.base import IntImm
 
 EXTRA_HEADERS = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index c6a0cca17..a21bdb96c 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -23,7 +23,7 @@
 
 import jinja2
 
-from ...target import Target
+from aitemplate.backend.target import Target
 
 FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")
 
diff --git a/python/aitemplate/backend/rocm/normalization/softmax.py b/python/aitemplate/backend/rocm/normalization/softmax.py
index 819f24e0e..11a0aa85c 100644
--- a/python/aitemplate/backend/rocm/normalization/softmax.py
+++ b/python/aitemplate/backend/rocm/normalization/softmax.py
@@ -20,10 +20,10 @@
 
 import jinja2
 
-from ....compiler.base import IntImm
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.normalization import norm_common
 
-from ... import registry
-from . import norm_common
+from aitemplate.compiler.base import IntImm
 
 EXTRA_HEADERS = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/pool2d/__init__.py b/python/aitemplate/backend/rocm/pool2d/__init__.py
index 4a1ee6bf3..072cfd047 100644
--- a/python/aitemplate/backend/rocm/pool2d/__init__.py
+++ b/python/aitemplate/backend/rocm/pool2d/__init__.py
@@ -15,6 +15,6 @@
 """
 ROCM pool2d init
 """
-from . import avg_pool2d, max_pool2d
+from aitemplate.backend.rocm.pool2d import avg_pool2d, max_pool2d
 
 __all__ = ["avg_pool2d", "max_pool2d"]
diff --git a/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py b/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
index 4ec7db8b0..4196eb37f 100644
--- a/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
@@ -15,8 +15,8 @@
 """
 ROCM avg_pool2d funcs
 """
-from ... import registry
-from . import pool2d
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.pool2d import pool2d
 
 
 @registry.reg("rocm.avg_pool2d.gen_function")
diff --git a/python/aitemplate/backend/rocm/pool2d/max_pool2d.py b/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
index 38a108ddf..25199c946 100644
--- a/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
@@ -15,8 +15,8 @@
 """
 ROCM max_pool2d funcs
 """
-from ... import registry
-from . import pool2d
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.pool2d import pool2d
 
 
 @registry.reg("rocm.max_pool2d.gen_function")
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index 884eceebf..7055b843e 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -25,12 +25,15 @@
 import sys
 from typing import List
 
-from aitemplate.backend.target import AIT_STATIC_FILES_PATH
+from aitemplate.backend import registry
 
-from ...utils import environ
+from aitemplate.backend.target import (
+    AIT_STATIC_FILES_PATH,
+    COMPOSABLE_KERNEL_PATH,
+    Target,
+)
 
-from .. import registry
-from ..target import COMPOSABLE_KERNEL_PATH, Target
+from aitemplate.utils import environ
 
 # pylint: disable=W0613
 
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 70203d4c8..90181b170 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -15,7 +15,7 @@
 """
 ROCM tensor ops module init
 """
-from . import (  # noqa
+from aitemplate.backend.rocm.tensor import (  # noqa
     argmax,
     batch_gather,
     concatenate,
diff --git a/python/aitemplate/backend/rocm/tensor/argmax.py b/python/aitemplate/backend/rocm/tensor/argmax.py
index 15049bed5..78e54fb29 100644
--- a/python/aitemplate/backend/rocm/tensor/argmax.py
+++ b/python/aitemplate/backend/rocm/tensor/argmax.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import argmax_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import argmax_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/rocm/tensor/batch_gather.py b/python/aitemplate/backend/rocm/tensor/batch_gather.py
index 8deff3144..8ac8a78a7 100644
--- a/python/aitemplate/backend/rocm/tensor/batch_gather.py
+++ b/python/aitemplate/backend/rocm/tensor/batch_gather.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import batch_gather_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import batch_gather_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/rocm/tensor/concatenate.py b/python/aitemplate/backend/rocm/tensor/concatenate.py
index ac56c8dde..730037eec 100644
--- a/python/aitemplate/backend/rocm/tensor/concatenate.py
+++ b/python/aitemplate/backend/rocm/tensor/concatenate.py
@@ -16,9 +16,9 @@
 ROCM concatenate function
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common import concatenate_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common import concatenate_common
 
 
 @registry.reg("rocm.concatenate.func_decl")
diff --git a/python/aitemplate/backend/rocm/tensor/concatenate_tanh.py b/python/aitemplate/backend/rocm/tensor/concatenate_tanh.py
index 3b2c8f93e..4806ca919 100644
--- a/python/aitemplate/backend/rocm/tensor/concatenate_tanh.py
+++ b/python/aitemplate/backend/rocm/tensor/concatenate_tanh.py
@@ -17,8 +17,8 @@
 """
 import jinja2
 
-from ... import registry
-from . import concatenate
+from aitemplate.backend import registry
+from aitemplate.backend.rocm.tensor import concatenate
 
 TANH_DEF = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/tensor/dynamic_slice.py b/python/aitemplate/backend/rocm/tensor/dynamic_slice.py
index 4f39785d3..1df53d7dd 100644
--- a/python/aitemplate/backend/rocm/tensor/dynamic_slice.py
+++ b/python/aitemplate/backend/rocm/tensor/dynamic_slice.py
@@ -16,9 +16,9 @@
 Dynamic slice ROCM implementation.
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import slice_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import slice_common
 
 
 @registry.reg("rocm.dynamic_slice.func_decl")
diff --git a/python/aitemplate/backend/rocm/tensor/permute021.py b/python/aitemplate/backend/rocm/tensor/permute021.py
index df066ca78..afcba7883 100644
--- a/python/aitemplate/backend/rocm/tensor/permute021.py
+++ b/python/aitemplate/backend/rocm/tensor/permute021.py
@@ -16,9 +16,9 @@
 permute021 for rocm
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import permute021_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import permute021_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/rocm/tensor/permute0213.py b/python/aitemplate/backend/rocm/tensor/permute0213.py
index 2fdde245d..efeb759fb 100644
--- a/python/aitemplate/backend/rocm/tensor/permute0213.py
+++ b/python/aitemplate/backend/rocm/tensor/permute0213.py
@@ -16,9 +16,9 @@
 permute0213 for rocm
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import permute0213_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import permute0213_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/rocm/tensor/permute102.py b/python/aitemplate/backend/rocm/tensor/permute102.py
index f5304897f..7ab68b47f 100644
--- a/python/aitemplate/backend/rocm/tensor/permute102.py
+++ b/python/aitemplate/backend/rocm/tensor/permute102.py
@@ -16,9 +16,9 @@
 permute102 for rocm
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import permute102_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import permute102_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/rocm/tensor/permute210.py b/python/aitemplate/backend/rocm/tensor/permute210.py
index 31fdf6d91..b09bae490 100644
--- a/python/aitemplate/backend/rocm/tensor/permute210.py
+++ b/python/aitemplate/backend/rocm/tensor/permute210.py
@@ -16,9 +16,9 @@
 permute210 for rocm
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import permute210_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import permute210_common
 
 # pylint: disable=C0301,W0613,W0612
 
diff --git a/python/aitemplate/backend/rocm/tensor/slice_reshape_scatter.py b/python/aitemplate/backend/rocm/tensor/slice_reshape_scatter.py
index 5405ae749..4dd31dc3f 100644
--- a/python/aitemplate/backend/rocm/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/backend/rocm/tensor/slice_reshape_scatter.py
@@ -18,9 +18,9 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import slice_reshape_scatter_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import slice_reshape_scatter_common
 
 TANH_DEF = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/tensor/slice_scatter.py b/python/aitemplate/backend/rocm/tensor/slice_scatter.py
index 4641c0905..cf59b2ad5 100644
--- a/python/aitemplate/backend/rocm/tensor/slice_scatter.py
+++ b/python/aitemplate/backend/rocm/tensor/slice_scatter.py
@@ -16,9 +16,9 @@
 Slice scatter ROCM implementation.
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import slice_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import slice_common
 
 
 @registry.reg("rocm.slice_scatter.func_decl")
diff --git a/python/aitemplate/backend/rocm/tensor/split.py b/python/aitemplate/backend/rocm/tensor/split.py
index 1e545a2b2..dde07ea10 100644
--- a/python/aitemplate/backend/rocm/tensor/split.py
+++ b/python/aitemplate/backend/rocm/tensor/split.py
@@ -16,9 +16,9 @@
 ROCM split function
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common import split_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common import split_common
 
 
 @registry.reg("rocm.split.func_decl")
diff --git a/python/aitemplate/backend/rocm/tensor/topk.py b/python/aitemplate/backend/rocm/tensor/topk.py
index 038a4b361..590c5ef7e 100644
--- a/python/aitemplate/backend/rocm/tensor/topk.py
+++ b/python/aitemplate/backend/rocm/tensor/topk.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.tensor import topk_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import topk_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/rocm/upsample/__init__.py b/python/aitemplate/backend/rocm/upsample/__init__.py
index 500b24f81..3d822c1b0 100644
--- a/python/aitemplate/backend/rocm/upsample/__init__.py
+++ b/python/aitemplate/backend/rocm/upsample/__init__.py
@@ -15,6 +15,6 @@
 """
 ROCM upsampling module init
 """
-from . import upsampling2d, upsampling2d_add
+from aitemplate.backend.rocm.upsample import upsampling2d, upsampling2d_add
 
 __all__ = ["upsampling2d", "upsampling2d_add"]
diff --git a/python/aitemplate/backend/rocm/upsample/upsampling2d.py b/python/aitemplate/backend/rocm/upsample/upsampling2d.py
index 08dd19267..e4c592ac0 100644
--- a/python/aitemplate/backend/rocm/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/rocm/upsample/upsampling2d.py
@@ -16,9 +16,9 @@
 ROCM codegen functions for unsampling2d_add.
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common import upsampling2d_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common import upsampling2d_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py b/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
index 1f534af7a..798c2317d 100644
--- a/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
@@ -16,9 +16,9 @@
 ROCM codegen functions for unsampling2d_add.
 """
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common import upsampling2d_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common import upsampling2d_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/rocm/utils.py b/python/aitemplate/backend/rocm/utils.py
index 40c45fb6e..e1d3107b3 100644
--- a/python/aitemplate/backend/rocm/utils.py
+++ b/python/aitemplate/backend/rocm/utils.py
@@ -21,7 +21,7 @@
 import shutil
 import tempfile
 
-from .. import registry
+from aitemplate.backend import registry
 
 # from . import extra_conv_emit, extra_cutlass_generator, extra_enum
 
diff --git a/python/aitemplate/backend/rocm/view_ops/__init__.py b/python/aitemplate/backend/rocm/view_ops/__init__.py
index 7fdffeffd..505398dde 100644
--- a/python/aitemplate/backend/rocm/view_ops/__init__.py
+++ b/python/aitemplate/backend/rocm/view_ops/__init__.py
@@ -15,6 +15,6 @@
 """
 ROCM view_ops module init
 """
-from . import view_ops
+from aitemplate.backend.rocm.view_ops import view_ops
 
 __all__ = ["view_ops"]
diff --git a/python/aitemplate/backend/rocm/view_ops/view_ops.py b/python/aitemplate/backend/rocm/view_ops/view_ops.py
index f11f20857..f41668fea 100644
--- a/python/aitemplate/backend/rocm/view_ops/view_ops.py
+++ b/python/aitemplate/backend/rocm/view_ops/view_ops.py
@@ -17,7 +17,7 @@
 """
 import jinja2
 
-from ....backend import registry
+from aitemplate.backend import registry
 
 SRC_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/backend/rocm/vision_ops/__init__.py b/python/aitemplate/backend/rocm/vision_ops/__init__.py
index fc8b18622..f46596197 100644
--- a/python/aitemplate/backend/rocm/vision_ops/__init__.py
+++ b/python/aitemplate/backend/rocm/vision_ops/__init__.py
@@ -15,5 +15,8 @@
 """
 (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 """
-from . import efficient_nms, nms  # noqa
-from .roi_ops import multi_level_roi_align, roi_align  # noqa  # noqa
+from aitemplate.backend.rocm.vision_ops import efficient_nms, nms  # noqa
+from aitemplate.backend.rocm.vision_ops.roi_ops import (  # noqa  # noqa
+    multi_level_roi_align,
+    roi_align,
+)
diff --git a/python/aitemplate/backend/rocm/vision_ops/efficient_nms.py b/python/aitemplate/backend/rocm/vision_ops/efficient_nms.py
index 9d0c947bd..4a1fdb947 100644
--- a/python/aitemplate/backend/rocm/vision_ops/efficient_nms.py
+++ b/python/aitemplate/backend/rocm/vision_ops/efficient_nms.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.vision_ops import efficient_nms_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.vision_ops import efficient_nms_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/rocm/vision_ops/nms.py b/python/aitemplate/backend/rocm/vision_ops/nms.py
index 694f4d205..5f3c108e8 100644
--- a/python/aitemplate/backend/rocm/vision_ops/nms.py
+++ b/python/aitemplate/backend/rocm/vision_ops/nms.py
@@ -18,9 +18,9 @@
 
 from typing import Any, Dict
 
-from ... import registry
-from ...backend_spec import ROCMSpec
-from ...common.vision_ops import nms_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.vision_ops import nms_common
 
 # pylint: disable=C0301
 
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/__init__.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/__init__.py
index 6082dbff7..8e7fc3709 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/__init__.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/__init__.py
@@ -15,6 +15,6 @@
 """
 ROCM roi_align module init
 """
-from . import multi_level_roi_align, roi_align
+from aitemplate.backend.rocm.vision_ops.roi_ops import multi_level_roi_align, roi_align
 
 __all__ = ["roi_align", "multi_level_roi_align"]
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
index d875ae9c2..94bb11d52 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
@@ -18,9 +18,9 @@
 
 import jinja2
 
-from .... import registry
-from ....backend_spec import ROCMSpec
-from ....common.vision_ops import multi_level_roi_align_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.vision_ops import multi_level_roi_align_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
index 81ec65afd..6a48f6e15 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
@@ -18,9 +18,9 @@
 
 import jinja2
 
-from .... import registry
-from ....backend_spec import ROCMSpec
-from ....common.vision_ops import roi_align_common
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.vision_ops import roi_align_common
 
 # pylint: disable=C0103,C0415,W0613,C0301,W0612
 
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 27559c7d5..d94826ddc 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -23,8 +23,8 @@
 from enum import IntEnum
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from . import registry
-from .profiler_cache import ProfileCacheDB
+from aitemplate.backend import registry
+from aitemplate.backend.profiler_cache import ProfileCacheDB
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/__init__.py b/python/aitemplate/compiler/__init__.py
index 315577d39..c3752028a 100644
--- a/python/aitemplate/compiler/__init__.py
+++ b/python/aitemplate/compiler/__init__.py
@@ -12,9 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from . import base, dtype, ops, tensor_accessor, transform
-from .compiler import compile_model
-from .model import AIT_DEFAULT_NUM_RUNTIMES, AITData, Model
+from aitemplate.compiler import base, dtype, ops, tensor_accessor, transform
+from aitemplate.compiler.compiler import compile_model
+from aitemplate.compiler.model import AIT_DEFAULT_NUM_RUNTIMES, AITData, Model
 
 __all__ = [
     "base",
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 1e32a243f..1d2606b6b 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -27,12 +27,12 @@
 import numpy as np
 
 from aitemplate.compiler.dtype import get_dtype_size, normalize_dtype
+from aitemplate.compiler.op_registry import OP_REGISTRY
 
 from aitemplate.compiler.stable_set import StableSet
-from aitemplate.utils.torch_utils import torch_dtype_to_string
 
-from ..utils.tensor_utils import wrap_dim
-from .op_registry import OP_REGISTRY
+from aitemplate.utils.tensor_utils import wrap_dim
+from aitemplate.utils.torch_utils import torch_dtype_to_string
 
 # pylint: disable=C0206,W0613,C0201,W0102,W0231,W0233
 
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 874254ba9..3a87e0112 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -21,16 +21,20 @@
 from typing import Dict, List, Optional, Union
 
 from aitemplate import backend, compiler
-from aitemplate.compiler.model import AITemplateAllocatorKind
+
+from aitemplate.compiler.base import DynamicProfileStrategy, Tensor
+
+from aitemplate.compiler.model import (
+    AIT_DEFAULT_NUM_RUNTIMES,
+    AITemplateAllocatorKind,
+    Model,
+    TorchTensor,
+)
 from aitemplate.compiler.transform.profile import elapsed_dt_sec
 from aitemplate.utils import graph_utils
 from aitemplate.utils.debug_settings import AITDebugSettings
 from aitemplate.utils.serialization.serdes_code import dump_program
 
-from .base import DynamicProfileStrategy, Tensor
-
-from .model import AIT_DEFAULT_NUM_RUNTIMES, Model, TorchTensor
-
 # pylint: disable=W0102
 
 
diff --git a/python/aitemplate/compiler/ops/__init__.py b/python/aitemplate/compiler/ops/__init__.py
index 0d78fae7f..99c3584dd 100644
--- a/python/aitemplate/compiler/ops/__init__.py
+++ b/python/aitemplate/compiler/ops/__init__.py
@@ -16,19 +16,19 @@
 """
 AIT operators.
 """
-from .common import *
-from .conv import *
-from .embedding import *
-from .gemm_special import *
-from .gemm_universal import *
-from .gemm_epilogue_vistor import *
-from .layernorm import *
-from .padding import *
-from .pool import *
-from .reduce import *
-from .softmax import *
-from .tensor import *
-from .upsample import *
-from .vision_ops import *
-from .attention import *
-from .groupnorm import *
+from aitemplate.compiler.ops.common import *
+from aitemplate.compiler.ops.conv import *
+from aitemplate.compiler.ops.embedding import *
+from aitemplate.compiler.ops.gemm_special import *
+from aitemplate.compiler.ops.gemm_universal import *
+from aitemplate.compiler.ops.gemm_epilogue_vistor import *
+from aitemplate.compiler.ops.layernorm import *
+from aitemplate.compiler.ops.padding import *
+from aitemplate.compiler.ops.pool import *
+from aitemplate.compiler.ops.reduce import *
+from aitemplate.compiler.ops.softmax import *
+from aitemplate.compiler.ops.tensor import *
+from aitemplate.compiler.ops.upsample import *
+from aitemplate.compiler.ops.vision_ops import *
+from aitemplate.compiler.ops.attention import *
+from aitemplate.compiler.ops.groupnorm import *
diff --git a/python/aitemplate/compiler/ops/attention/__init__.py b/python/aitemplate/compiler/ops/attention/__init__.py
index 4f18558f7..ff60a7246 100644
--- a/python/aitemplate/compiler/ops/attention/__init__.py
+++ b/python/aitemplate/compiler/ops/attention/__init__.py
@@ -15,8 +15,8 @@
 """
 flash attention module init
 """
-from .flash_attention import flash_attention
-from .mem_eff_attention import mem_eff_attention
+from aitemplate.compiler.ops.attention.flash_attention import flash_attention
+from aitemplate.compiler.ops.attention.mem_eff_attention import mem_eff_attention
 
 
 __all__ = ["flash_attention", "mem_eff_attention"]
diff --git a/python/aitemplate/compiler/ops/attention/flash_attention.py b/python/aitemplate/compiler/ops/attention/flash_attention.py
index d0b8db9ed..3b0658867 100644
--- a/python/aitemplate/compiler/ops/attention/flash_attention.py
+++ b/python/aitemplate/compiler/ops/attention/flash_attention.py
@@ -21,10 +21,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index 6703984a7..8ac4f5fe8 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -22,10 +22,10 @@
 import jinja2
 import numpy as np
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/common/__init__.py b/python/aitemplate/compiler/ops/common/__init__.py
index 1247cc790..4e00e86d3 100644
--- a/python/aitemplate/compiler/ops/common/__init__.py
+++ b/python/aitemplate/compiler/ops/common/__init__.py
@@ -16,10 +16,10 @@
 """
 Common ops.
 """
-from .elementwise import *
-from .int_elementwise import *
-from .epilogue import *
-from .fused_elementwise import *
-from .math import *
-from .python_ops import *
-from .view_ops import *
+from aitemplate.compiler.ops.common.elementwise import *
+from aitemplate.compiler.ops.common.int_elementwise import *
+from aitemplate.compiler.ops.common.epilogue import *
+from aitemplate.compiler.ops.common.fused_elementwise import *
+from aitemplate.compiler.ops.common.math import *
+from aitemplate.compiler.ops.common.python_ops import *
+from aitemplate.compiler.ops.common.view_ops import *
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index dfea0367c..8c16ecf77 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -18,11 +18,12 @@
 import functools
 from typing import Any, List
 
-from ....utils import shape_utils
-from ...base import IntVar, IntVarTensor, Operator, Tensor
-from ...dtype import normalize_dtype
-from ...op_registry import OP_REGISTRY
-from .epilogue import FuncEnum
+from aitemplate.compiler.base import IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
+from aitemplate.compiler.op_registry import OP_REGISTRY
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
 
diff --git a/python/aitemplate/compiler/ops/common/fused_elementwise.py b/python/aitemplate/compiler/ops/common/fused_elementwise.py
index 67bdf5abc..3716c6f96 100644
--- a/python/aitemplate/compiler/ops/common/fused_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/fused_elementwise.py
@@ -17,11 +17,11 @@
 """
 from typing import List, Set
 
-from .... import backend
-from ....backend import registry
-from ...base import Operator
-from ...tensor_accessor import TensorAccessor
-from .elementwise import elementwise
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator
+from aitemplate.compiler.ops.common.elementwise import elementwise
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0301,C0103,W0223
 
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
index 1e8a4b8e0..81292fea0 100644
--- a/python/aitemplate/compiler/ops/common/int_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -18,13 +18,13 @@
 import functools
 from functools import reduce
 
-from .... import backend
-from ....backend import registry
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVarTensor, Operator, Tensor
+from aitemplate.compiler.op_registry import OP_REGISTRY
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 
-from ....utils import shape_utils
-from ...base import IntVarTensor, Operator, Tensor
-from ...op_registry import OP_REGISTRY
-from .epilogue import FuncEnum
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
 
diff --git a/python/aitemplate/compiler/ops/common/python_ops.py b/python/aitemplate/compiler/ops/common/python_ops.py
index b53d76050..98d8ba13c 100644
--- a/python/aitemplate/compiler/ops/common/python_ops.py
+++ b/python/aitemplate/compiler/ops/common/python_ops.py
@@ -17,8 +17,9 @@
 """
 from typing import Any, List, Tuple, Union
 
-from ....utils.tensor_utils import wrap_dim
-from ...base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+
+from aitemplate.utils.tensor_utils import wrap_dim
 
 # pylint: disable=C0103,W0221,R1732,W0613
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index b47fa6ff5..118c7a7c6 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -36,7 +36,7 @@
 )
 from aitemplate.utils.shape_utils import convert_shape_to_IntVar
 
-from ....utils.tensor_utils import wrap_dim
+from aitemplate.utils.tensor_utils import wrap_dim
 
 
 # SHAPE_ASSIGNMENT_TEMPLATE is folded in here
diff --git a/python/aitemplate/compiler/ops/conv/__init__.py b/python/aitemplate/compiler/ops/conv/__init__.py
index 3744274a8..2a49f29c9 100644
--- a/python/aitemplate/compiler/ops/conv/__init__.py
+++ b/python/aitemplate/compiler/ops/conv/__init__.py
@@ -16,22 +16,32 @@
 """
 Conv2d family operators.
 """
-from .conv2d import conv2d
-from .conv2d_bias import conv2d_bias
-from .conv2d_bias_add import conv2d_bias_add
-from .conv2d_bias_add_hardswish import conv2d_bias_add_hardswish
-from .conv2d_bias_add_relu import conv2d_bias_add_relu
-from .conv2d_bias_few_channels import conv2d_bias_few_channels
-from .conv2d_bias_hardswish import conv2d_bias_hardswish
-from .conv2d_bias_hardswish_few_channels import conv2d_bias_hardswish_few_channels
-from .conv2d_bias_relu import conv2d_bias_relu
-from .conv2d_bias_relu_few_channels import conv2d_bias_relu_few_channels
-from .conv2d_bias_sigmoid import conv2d_bias_sigmoid
-from .conv2d_depthwise import conv2d_depthwise
-from .conv2d_depthwise_bias import conv2d_depthwise_bias
-from .conv3d import conv3d
-from .conv3d_bias import conv3d_bias
-from .depthwise_conv3d import depthwise_conv3d
-from .transposed_conv2d import transposed_conv2d
-from .transposed_conv2d_bias import transposed_conv2d_bias
-from .transposed_conv2d_bias_relu import transposed_conv2d_bias_relu
+from aitemplate.compiler.ops.conv.conv2d import conv2d
+from aitemplate.compiler.ops.conv.conv2d_bias import conv2d_bias
+from aitemplate.compiler.ops.conv.conv2d_bias_add import conv2d_bias_add
+from aitemplate.compiler.ops.conv.conv2d_bias_add_hardswish import (
+    conv2d_bias_add_hardswish,
+)
+from aitemplate.compiler.ops.conv.conv2d_bias_add_relu import conv2d_bias_add_relu
+from aitemplate.compiler.ops.conv.conv2d_bias_few_channels import (
+    conv2d_bias_few_channels,
+)
+from aitemplate.compiler.ops.conv.conv2d_bias_hardswish import conv2d_bias_hardswish
+from aitemplate.compiler.ops.conv.conv2d_bias_hardswish_few_channels import (
+    conv2d_bias_hardswish_few_channels,
+)
+from aitemplate.compiler.ops.conv.conv2d_bias_relu import conv2d_bias_relu
+from aitemplate.compiler.ops.conv.conv2d_bias_relu_few_channels import (
+    conv2d_bias_relu_few_channels,
+)
+from aitemplate.compiler.ops.conv.conv2d_bias_sigmoid import conv2d_bias_sigmoid
+from aitemplate.compiler.ops.conv.conv2d_depthwise import conv2d_depthwise
+from aitemplate.compiler.ops.conv.conv2d_depthwise_bias import conv2d_depthwise_bias
+from aitemplate.compiler.ops.conv.conv3d import conv3d
+from aitemplate.compiler.ops.conv.conv3d_bias import conv3d_bias
+from aitemplate.compiler.ops.conv.depthwise_conv3d import depthwise_conv3d
+from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
+from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
+from aitemplate.compiler.ops.conv.transposed_conv2d_bias_relu import (
+    transposed_conv2d_bias_relu,
+)
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index f018ef0d8..fae05dad9 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -17,8 +17,8 @@
 """
 from typing import Tuple
 
-from ...base import Tensor
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
 
 # pylint: disable=C0103
 class conv2d_bias_activation(conv2d):
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
index 240ad61c6..1395f50ff 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py
@@ -16,8 +16,8 @@
 Fused conv2d_bias_add_activation op.
 """
 
-from ...base import Tensor
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 5cfd441f3..98e93f36d 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -26,17 +26,23 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
-from ....utils import alignment, environ, shape_utils
-from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
-from .cache_entry import ConvQueryEntry, ConvRecordEntry
-from .conv_common import (
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.conv.cache_entry import ConvQueryEntry, ConvRecordEntry
+from aitemplate.compiler.ops.conv.conv_common import (
     filter_op_instances,
     generate_profiler_sources,
     get_profiler_filename,
 )
+from aitemplate.utils import alignment, environ, shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
 
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias.py b/python/aitemplate/compiler/ops/conv/conv2d_bias.py
index 9628df362..416066f29 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias.py
@@ -15,7 +15,9 @@
 """
 Conv2d with bias.
 """
-from .common_conv2d_bias_activation import conv2d_bias_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_activation import (
+    conv2d_bias_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
index 39f7c2a95..9a1dffafc 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add.py
@@ -15,7 +15,9 @@
 """
 fused conv2d_bias_add op
 """
-from .common_conv2d_bias_add_activation import conv2d_bias_add_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_add_activation import (
+    conv2d_bias_add_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
index b8f224a7a..36a59445c 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py
@@ -15,7 +15,9 @@
 """
 fused conv2d_bias_add_hardswish op, for residual block
 """
-from .common_conv2d_bias_add_activation import conv2d_bias_add_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_add_activation import (
+    conv2d_bias_add_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
index c118716a6..150e10554 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py
@@ -15,7 +15,9 @@
 """
 fused conv2d_bias_relu_add op, for residual block
 """
-from .common_conv2d_bias_add_activation import conv2d_bias_add_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_add_activation import (
+    conv2d_bias_add_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
index 1328d5f53..71a17819f 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_few_channels op.
 """
-from .special_conv2d_bias_activation import special_conv2d_bias_activation
+from aitemplate.compiler.ops.conv.special_conv2d_bias_activation import (
+    special_conv2d_bias_activation,
+)
 
 # pylint: disable=C0103
 class conv2d_bias_few_channels(special_conv2d_bias_activation):
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
index b36039cb3..e6039ade5 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_hardswish op.
 """
-from .common_conv2d_bias_activation import conv2d_bias_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_activation import (
+    conv2d_bias_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
index 104bf7ef1..bbe2a879c 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_hardswish_few_channels op.
 """
-from .special_conv2d_bias_activation import special_conv2d_bias_activation
+from aitemplate.compiler.ops.conv.special_conv2d_bias_activation import (
+    special_conv2d_bias_activation,
+)
 
 # pylint: disable=C0103
 class conv2d_bias_hardswish_few_channels(special_conv2d_bias_activation):
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
index b8fdf7d75..ab9fdcb94 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_relu op.
 """
-from .common_conv2d_bias_activation import conv2d_bias_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_activation import (
+    conv2d_bias_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
index b4d5f9594..84b9bea70 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_relu_few_channels op.
 """
-from .special_conv2d_bias_activation import special_conv2d_bias_activation
+from aitemplate.compiler.ops.conv.special_conv2d_bias_activation import (
+    special_conv2d_bias_activation,
+)
 
 # pylint: disable=C0103
 class conv2d_bias_relu_few_channels(special_conv2d_bias_activation):
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
index 521fd642d..55e009d91 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py
@@ -15,7 +15,9 @@
 """
 Fused conv2d_bias_sigmoid op.
 """
-from .common_conv2d_bias_activation import conv2d_bias_activation
+from aitemplate.compiler.ops.conv.common_conv2d_bias_activation import (
+    conv2d_bias_activation,
+)
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
index 35bc350e8..d191dfb95 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
@@ -17,8 +17,8 @@
 """
 from typing import List, Tuple
 
-from ...base import Tensor
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
 
 # pylint: disable=C0103
 class conv2d_depthwise(conv2d):
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
index 73ddaa04c..c6d026b84 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
@@ -17,8 +17,8 @@
 """
 from typing import List, Tuple
 
-from ...base import Tensor
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
 
 # pylint: disable=C0103
 class conv2d_depthwise_bias(conv2d):
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 13be89072..6c5301673 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -27,17 +27,23 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
-from ....utils import alignment, environ, shape_utils
-from ...base import DynamicProfileStrategy, IntImm, IntVar, Operator, Tensor
-from .cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
-from .conv_common import (
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.conv.cache_entry import Conv3dQueryEntry, Conv3dRecordEntry
+from aitemplate.compiler.ops.conv.conv_common import (
     filter_op_instances,
     generate_profiler_sources,
     get_profiler_filename,
 )
+from aitemplate.utils import alignment, environ, shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
 
diff --git a/python/aitemplate/compiler/ops/conv/conv3d_bias.py b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
index 57623840a..7c5a41362 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d_bias.py
@@ -18,8 +18,9 @@
 """
 from typing import List
 
-from ...base import Tensor
-from .conv3d import conv3d
+from aitemplate.compiler.base import Tensor
+
+from aitemplate.compiler.ops.conv.conv3d import conv3d
 
 
 class conv3d_bias(conv3d):
diff --git a/python/aitemplate/compiler/ops/conv/conv_common.py b/python/aitemplate/compiler/ops/conv/conv_common.py
index 647d45408..a984d5c37 100644
--- a/python/aitemplate/compiler/ops/conv/conv_common.py
+++ b/python/aitemplate/compiler/ops/conv/conv_common.py
@@ -16,8 +16,8 @@
 import logging
 from hashlib import sha1
 
-from .... import backend
-from ....backend import registry
+from aitemplate import backend
+from aitemplate.backend import registry
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
index 3fff25a99..fe9b5a3b3 100644
--- a/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/depthwise_conv3d.py
@@ -22,10 +22,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import IntImm, IntVar, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0102,W1202,C0301,R1716
 
diff --git a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
index dc1f557d5..4be6e637a 100644
--- a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
@@ -15,9 +15,9 @@
 """
 Fused special_conv2d_bias_activation op.
 """
-from ...base import Tensor
-from ..padding import nhwc3to4, nhwc3to8
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
+from aitemplate.compiler.ops.padding import nhwc3to4, nhwc3to8
 
 # pylint: disable=C0103
 class special_conv2d_bias_activation(conv2d):
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
index 40a293238..533b7e88a 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
@@ -21,9 +21,10 @@
 
 import jinja2
 
-from ....utils import shape_utils
-from ...base import Tensor
-from .conv2d import conv2d
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.conv.conv2d import conv2d
+
+from aitemplate.utils import shape_utils
 
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
index 13d44f128..c3423eb7c 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
@@ -18,12 +18,12 @@
 
 from typing import Tuple
 
+from aitemplate.compiler.base import Tensor
+
 from aitemplate.compiler.ops.conv.common_conv2d_bias_activation import (
     conv2d_bias_activation,
 )
-
-from ...base import Tensor
-from .transposed_conv2d import transposed_conv2d
+from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 
 # pylint: disable=C0103
 class transposed_conv2d_bias(transposed_conv2d):
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
index b66d8162d..2f0a57de6 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
@@ -15,7 +15,7 @@
 """
 Fused transposed_conv2d_bias_relu op.
 """
-from .transposed_conv2d_bias import transposed_conv2d_bias
+from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
 
 # pylint: disable=C0103
 class transposed_conv2d_bias_relu(transposed_conv2d_bias):
diff --git a/python/aitemplate/compiler/ops/embedding/__init__.py b/python/aitemplate/compiler/ops/embedding/__init__.py
index 8e8178f4b..1ff35e6e9 100644
--- a/python/aitemplate/compiler/ops/embedding/__init__.py
+++ b/python/aitemplate/compiler/ops/embedding/__init__.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from .bert_embeddings import bert_embeddings
+from aitemplate.compiler.ops.embedding.bert_embeddings import bert_embeddings
 
 __all__ = [
     "bert_embeddings",
diff --git a/python/aitemplate/compiler/ops/embedding/bert_embeddings.py b/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
index 8fc501529..8d8f7f42c 100644
--- a/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
+++ b/python/aitemplate/compiler/ops/embedding/bert_embeddings.py
@@ -15,10 +15,10 @@
 """
 Operator definition for bert_embeddings.
 """
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import IntImm, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 
 class bert_embeddings(Operator):
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
index 265d4aebf..f48eb5a98 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py
@@ -12,12 +12,22 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from .bmm_rcr_softmax import bmm_rcr_softmax
-from .dual_bmm_rrr_div import dual_bmm_rrr_div
-from .dual_gemm_rcr_fast_gelu import dual_gemm_rcr_fast_gelu
-from .dual_gemm_rcr_silu import dual_gemm_rcr_silu
-from .gemm_rcr_bias_softmax import gemm_rcr_bias_softmax
-from .gemm_rcr_softmax import gemm_rcr_softmax
+from aitemplate.compiler.ops.gemm_epilogue_vistor.bmm_rcr_softmax import bmm_rcr_softmax
+from aitemplate.compiler.ops.gemm_epilogue_vistor.dual_bmm_rrr_div import (
+    dual_bmm_rrr_div,
+)
+from aitemplate.compiler.ops.gemm_epilogue_vistor.dual_gemm_rcr_fast_gelu import (
+    dual_gemm_rcr_fast_gelu,
+)
+from aitemplate.compiler.ops.gemm_epilogue_vistor.dual_gemm_rcr_silu import (
+    dual_gemm_rcr_silu,
+)
+from aitemplate.compiler.ops.gemm_epilogue_vistor.gemm_rcr_bias_softmax import (
+    gemm_rcr_bias_softmax,
+)
+from aitemplate.compiler.ops.gemm_epilogue_vistor.gemm_rcr_softmax import (
+    gemm_rcr_softmax,
+)
 
 
 __all__ = [
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 7eba552dd..b30aba29f 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -19,10 +19,10 @@
 When use for `linear`, need set A->Data, B->Weight
 """
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal import gemm_common as common
-from ..gemm_universal.bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
index c22b15c83..7e2a907ad 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -15,9 +15,9 @@
 """
 Batch GEMM specialization: BMM_RRR(A, B0) / BMM_RRR(A, B1)
 """
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal.bmm_rrr import bmm_rrr
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index 11deca6ee..62f8db0eb 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -15,9 +15,9 @@
 """
 GEMM Specialization: FAST_GELU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
 """
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index 04bc02b38..e847b1acc 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -15,9 +15,9 @@
 """
 GEMM Specialization: SILU(GEMM_RCR(A, B)) * GEMM_RCR(A, B1)
 """
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index 358dd891b..f09339f98 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -15,9 +15,11 @@
 """
 Operator definition for gemm_rcr_bias_softmax.
 """
-from ...base import _create_host_zero_tensor, Tensor
-from ...tensor_accessor import TensorAccessor
-from .gemm_rcr_softmax import gemm_rcr_softmax
+from aitemplate.compiler.base import _create_host_zero_tensor, Tensor
+from aitemplate.compiler.ops.gemm_epilogue_vistor.gemm_rcr_softmax import (
+    gemm_rcr_softmax,
+)
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120,W0223
 
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
index 163238824..630762e42 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -19,9 +19,9 @@
 When use for `linear`, need set A->Data, B->Weight
 """
 
-from ...base import _create_host_zero_tensor, IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.base import _create_host_zero_tensor, IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_special/__init__.py b/python/aitemplate/compiler/ops/gemm_special/__init__.py
index 019225be5..71b3a2922 100644
--- a/python/aitemplate/compiler/ops/gemm_special/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_special/__init__.py
@@ -15,9 +15,9 @@
 """
 special gemm ops
 """
-from .bmm_rcr_n1 import bmm_rcr_n1
-from .bmm_rrr_k1_tanh import bmm_rrr_k1_tanh
-from .gemm_rrr_small_nk import gemm_rrr_small_nk
+from aitemplate.compiler.ops.gemm_special.bmm_rcr_n1 import bmm_rcr_n1
+from aitemplate.compiler.ops.gemm_special.bmm_rrr_k1_tanh import bmm_rrr_k1_tanh
+from aitemplate.compiler.ops.gemm_special.gemm_rrr_small_nk import gemm_rrr_small_nk
 
 
 __all__ = ["bmm_rcr_n1", "bmm_rrr_k1_tanh", "gemm_rrr_small_nk"]
diff --git a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
index 752cef8a0..7b32095b0 100644
--- a/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
+++ b/python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py
@@ -27,9 +27,9 @@
 This kernel computes C = alpha * A @ B
 """
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..gemm_universal import bmm_rcr
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import bmm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
index e71c2933c..951727081 100644
--- a/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py
@@ -17,8 +17,8 @@
 """
 from typing import List
 
-from ...base import IntVar, Tensor
-from ..gemm_universal import bmm_rrr
+from aitemplate.compiler.base import IntVar, Tensor
+from aitemplate.compiler.ops.gemm_universal import bmm_rrr
 
 # pylint: disable=C0103,W0221,C0200
 
diff --git a/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
index 46ff36709..bd96f9b8e 100644
--- a/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
+++ b/python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py
@@ -25,8 +25,8 @@
 C: [M, N]
 """
 
-from ...base import IntImm, Tensor
-from ..gemm_universal import gemm_common as common
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/__init__.py b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
index 7ce092d7d..9e3ca05ab 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
@@ -13,53 +13,95 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from .bmm_ccr import bmm_ccr
-from .bmm_ccr_add import bmm_ccr_add
-from .bmm_crr import bmm_crr
-from .bmm_crr_add import bmm_crr_add
-from .bmm_rcr import bmm_rcr
-from .bmm_rcr_permute import bmm_rcr_permute
-from .bmm_rrr import bmm_rrr
-from .bmm_rrr_add import bmm_rrr_add
-from .bmm_rrr_permute import bmm_rrr_permute
-from .bmm_softmax_bmm import bmm_softmax_bmm
-from .bmm_softmax_bmm_permute import bmm_softmax_bmm_permute
-from .gemm_rcr import gemm_rcr
-from .gemm_rcr_bias import gemm_rcr_bias
-from .gemm_rcr_bias_add import gemm_rcr_bias_add
-from .gemm_rcr_bias_add_add import gemm_rcr_bias_add_add
-from .gemm_rcr_bias_add_add_relu import gemm_rcr_bias_add_add_relu
-from .gemm_rcr_bias_add_relu import gemm_rcr_bias_add_relu
-from .gemm_rcr_bias_fast_gelu import gemm_rcr_bias_fast_gelu
-from .gemm_rcr_bias_gelu import gemm_rcr_bias_gelu
-from .gemm_rcr_bias_hardswish import gemm_rcr_bias_hardswish
-from .gemm_rcr_bias_mul import gemm_rcr_bias_mul
-from .gemm_rcr_bias_mul_add import gemm_rcr_bias_mul_add
-from .gemm_rcr_bias_mul_tanh import gemm_rcr_bias_mul_tanh
-from .gemm_rcr_bias_permute import gemm_rcr_bias_permute
-from .gemm_rcr_bias_relu import gemm_rcr_bias_relu
-from .gemm_rcr_bias_sigmoid import gemm_rcr_bias_sigmoid
-from .gemm_rcr_bias_sigmoid_mul import gemm_rcr_bias_sigmoid_mul
-from .gemm_rcr_bias_sigmoid_mul_tanh import gemm_rcr_bias_sigmoid_mul_tanh
-from .gemm_rcr_bias_swish import gemm_rcr_bias_swish
-from .gemm_rcr_bias_tanh import gemm_rcr_bias_tanh
-from .gemm_rcr_fast_gelu import gemm_rcr_fast_gelu
-from .gemm_rcr_permute import gemm_rcr_permute
-from .gemm_rcr_permute_elup1 import gemm_rcr_permute_elup1
-from .gemm_rrr import gemm_rrr
-from .gemm_rrr_bias import gemm_rrr_bias
-from .gemm_rrr_bias_permute import gemm_rrr_bias_permute
-from .gemm_rrr_permute import gemm_rrr_permute
-from .group_gemm_rcr import group_gemm_rcr
-from .group_gemm_rcr_bias import group_gemm_rcr_bias
-from .group_gemm_rcr_bias_relu import group_gemm_rcr_bias_relu
-from .group_gemm_rcr_bias_sigmoid import group_gemm_rcr_bias_sigmoid
-from .perm021fc_ccr import perm021fc_ccr
-from .perm021fc_ccr_bias import perm021fc_ccr_bias
-from .perm021fc_ccr_bias_permute import perm021fc_ccr_bias_permute
-from .perm021fc_crc import perm021fc_crc
-from .perm021fc_crc_bias import perm021fc_crc_bias
-from .perm102_bmm_rcr import perm102_bmm_rcr
-from .perm102_bmm_rcr_bias import perm102_bmm_rcr_bias
-from .perm102_bmm_rrr import perm102_bmm_rrr
-from .perm102_bmm_rrr_bias import perm102_bmm_rrr_bias
+from aitemplate.compiler.ops.gemm_universal.bmm_ccr import bmm_ccr
+from aitemplate.compiler.ops.gemm_universal.bmm_ccr_add import bmm_ccr_add
+from aitemplate.compiler.ops.gemm_universal.bmm_crr import bmm_crr
+from aitemplate.compiler.ops.gemm_universal.bmm_crr_add import bmm_crr_add
+from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
+from aitemplate.compiler.ops.gemm_universal.bmm_rcr_permute import bmm_rcr_permute
+from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
+from aitemplate.compiler.ops.gemm_universal.bmm_rrr_add import bmm_rrr_add
+from aitemplate.compiler.ops.gemm_universal.bmm_rrr_permute import bmm_rrr_permute
+from aitemplate.compiler.ops.gemm_universal.bmm_softmax_bmm import bmm_softmax_bmm
+from aitemplate.compiler.ops.gemm_universal.bmm_softmax_bmm_permute import (
+    bmm_softmax_bmm_permute,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_add import gemm_rcr_bias_add
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_add_add import (
+    gemm_rcr_bias_add_add,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_add_add_relu import (
+    gemm_rcr_bias_add_add_relu,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_add_relu import (
+    gemm_rcr_bias_add_relu,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_fast_gelu import (
+    gemm_rcr_bias_fast_gelu,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_gelu import gemm_rcr_bias_gelu
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_hardswish import (
+    gemm_rcr_bias_hardswish,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_mul import gemm_rcr_bias_mul
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_mul_add import (
+    gemm_rcr_bias_mul_add,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_mul_tanh import (
+    gemm_rcr_bias_mul_tanh,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_permute import (
+    gemm_rcr_bias_permute,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_relu import gemm_rcr_bias_relu
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_sigmoid import (
+    gemm_rcr_bias_sigmoid,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_sigmoid_mul import (
+    gemm_rcr_bias_sigmoid_mul,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_sigmoid_mul_tanh import (
+    gemm_rcr_bias_sigmoid_mul_tanh,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_swish import (
+    gemm_rcr_bias_swish,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_tanh import gemm_rcr_bias_tanh
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_fast_gelu import gemm_rcr_fast_gelu
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_permute import gemm_rcr_permute
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_permute_elup1 import (
+    gemm_rcr_permute_elup1,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rrr import gemm_rrr
+from aitemplate.compiler.ops.gemm_universal.gemm_rrr_bias import gemm_rrr_bias
+from aitemplate.compiler.ops.gemm_universal.gemm_rrr_bias_permute import (
+    gemm_rrr_bias_permute,
+)
+from aitemplate.compiler.ops.gemm_universal.gemm_rrr_permute import gemm_rrr_permute
+from aitemplate.compiler.ops.gemm_universal.group_gemm_rcr import group_gemm_rcr
+from aitemplate.compiler.ops.gemm_universal.group_gemm_rcr_bias import (
+    group_gemm_rcr_bias,
+)
+from aitemplate.compiler.ops.gemm_universal.group_gemm_rcr_bias_relu import (
+    group_gemm_rcr_bias_relu,
+)
+from aitemplate.compiler.ops.gemm_universal.group_gemm_rcr_bias_sigmoid import (
+    group_gemm_rcr_bias_sigmoid,
+)
+from aitemplate.compiler.ops.gemm_universal.perm021fc_ccr import perm021fc_ccr
+from aitemplate.compiler.ops.gemm_universal.perm021fc_ccr_bias import perm021fc_ccr_bias
+from aitemplate.compiler.ops.gemm_universal.perm021fc_ccr_bias_permute import (
+    perm021fc_ccr_bias_permute,
+)
+from aitemplate.compiler.ops.gemm_universal.perm021fc_crc import perm021fc_crc
+from aitemplate.compiler.ops.gemm_universal.perm021fc_crc_bias import perm021fc_crc_bias
+from aitemplate.compiler.ops.gemm_universal.perm102_bmm_rcr import perm102_bmm_rcr
+from aitemplate.compiler.ops.gemm_universal.perm102_bmm_rcr_bias import (
+    perm102_bmm_rcr_bias,
+)
+from aitemplate.compiler.ops.gemm_universal.perm102_bmm_rrr import perm102_bmm_rrr
+from aitemplate.compiler.ops.gemm_universal.perm102_bmm_rrr_bias import (
+    perm102_bmm_rrr_bias,
+)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
index 0903664da..67fd3cc38 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
@@ -18,11 +18,9 @@
 
 # pylint: disable=C0103,W0223
 
-from aitemplate.compiler.base import Tensor
-
-from ...base import IntImm
-from ...dtype import is_same_dtype
-from .gemm_common import gemm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.dtype import is_same_dtype
+from aitemplate.compiler.ops.gemm_universal.gemm_common import gemm
 
 
 def is_valid_inputs(output_shapes, c_shapes):
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
index 57f206312..250ae2c28 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor].
 """
 
-from ...base import Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
index cf37ac68b..4f5ac9bd2 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
@@ -16,12 +16,11 @@
 Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor] with Add.
 """
 
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import bmm_ccr
+from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import Tensor
-from . import bmm_ccr
-from .bmm import is_valid_inputs
-
 # pylint: disable=C0103, W0223
 
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
index dc3a9ee12..219da71fa 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor].
 """
 
-from ...base import Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
index c5697c2b0..93d69300e 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
@@ -16,12 +16,11 @@
 Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor] with Add.
 """
 
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import bmm_crr
+from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import Tensor
-from . import bmm_crr
-from .bmm import is_valid_inputs
-
 # pylint: disable=C0103, W0223
 
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
index 5565eda9c..bc3d64e42 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization for A[RowMajor], B[ColMajor], C[RowMajor].
 """
 
-from ...base import Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
index 51e3a480b..4a987a498 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
@@ -18,10 +18,10 @@
 
 from typing import Tuple
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
-from . import bmm_rcr
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.compiler.ops.gemm_universal import bmm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
index ae788c72f..37b65a8b4 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor]
 """
 
-from ...base import Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
index a6b5dde8f..d8062d14d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
@@ -16,12 +16,11 @@
 Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor] with Add.
 """
 
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import bmm_rrr
+from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import Tensor
-from . import bmm_rrr
-from .bmm import is_valid_inputs
-
 # pylint: disable=C0103, W0223
 
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
index c920dfb4f..187db9f69 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
@@ -18,10 +18,10 @@
 
 from typing import Tuple
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
-from . import bmm_rrr
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.compiler.ops.gemm_universal import bmm_rrr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
index 6872691ee..113807ba3 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py
@@ -16,10 +16,10 @@
 BMM_RCR + Softmax + BMM_RRR Specialization
 """
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
index c9f00d27d..752676166 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
@@ -17,11 +17,11 @@
 """
 from typing import Tuple
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 329c3cda0..4744c069c 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -29,15 +29,25 @@
 
 import jinja2
 
-from aitemplate.backend.profiler_runner import ProfileResult
+from aitemplate import backend
+from aitemplate.backend import registry
 
-from .... import backend
-from ....backend import registry
-from ....utils import alignment, environ
-from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
-from ...dtype import is_same_dtype
-from ...tensor_accessor import TensorAccessor
-from .cache_entry import GemmQueryEntry, GemmRecordEntry
+from aitemplate.backend.profiler_runner import ProfileResult
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    ExecItem,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.dtype import is_same_dtype
+from aitemplate.compiler.ops.gemm_universal.cache_entry import (
+    GemmQueryEntry,
+    GemmRecordEntry,
+)
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.utils import alignment, environ
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
index c84915fe5..04052766b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py
@@ -16,8 +16,8 @@
 GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor]
 """
 
-from ...base import IntImm, Tensor
-from . import gemm_common as common
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
index 85a777278..1b327b93c 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py
@@ -15,9 +15,9 @@
 """
 GEMM Specialization: GEMM_RCR(A, B) + Bias
 """
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from . import gemm_rcr
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add.py
index 9f7b92d05..faea82cd5 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add.py
@@ -16,7 +16,9 @@
 GEMM Specialization: GEMM_RCR(A, B) + Bias + D0
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add.py
index 35026663d..eceba570f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add.py
@@ -16,7 +16,9 @@
 GEMM Specialization: GEMM_RCR(A, B) + Bias + D0 + D1
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add_relu.py
index 2b6eb5312..1838cb33d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add_relu.py
@@ -16,7 +16,9 @@
 GEMM Specialization: RELU(GEMM_RCR(A, B) + Bias + D0 + D1)
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_relu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_relu.py
index 824114a01..c084051d0 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_relu.py
@@ -16,7 +16,9 @@
 GEMM Specialization: RELU(GEMM_RCR(A, B) + Bias + D0)
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
index 2faaf8234..4f1388834 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
@@ -17,9 +17,9 @@
 BinaryOp2(BinaryOp1(UnaryOp(TensorOp(X) + bias), residual1), residual2)
 """
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from . import gemm_rcr_bias
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
index 7c72ac636..743ade763 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: FastGELU(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
index b8c7a33ce..34157307f 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: GELU(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
index 4ee004262..b658c243d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: HardSwish(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul.py
index 486821c65..0e4906a8e 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul.py
@@ -16,7 +16,9 @@
 GEMM Specialization: (GEMM_RCR(A, B) + Bias) * D0
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_add.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_add.py
index 0039992d6..b04378d9d 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_add.py
@@ -16,7 +16,9 @@
 GEMM Specialization: (GEMM_RCR(A, B) + Bias) * D0 + D1
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_tanh.py
index 72f1f6ea3..cbe25f593 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_tanh.py
@@ -16,7 +16,9 @@
 GEMM Specialization: TANH((GEMM_RCR(A, B) + Bias) * D0)
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
index dcb865d6b..f95a89d61 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py
@@ -18,12 +18,12 @@
 
 from typing import Tuple
 
-from aitemplate.testing import detect_target
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
-from . import gemm_rcr_bias
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
index 99318ff49..6c6307d76 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: ReLU(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
index b65c6f0a6..f4f868328 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -15,7 +15,7 @@
 """
 Sigmoid(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
index bbf2f133a..a2c35956b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul.py
@@ -16,7 +16,9 @@
 GEMM Specialization: Sigmoid(GEMM_RCR(A, B) + Bias) * D0
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
index b26d6fe4d..a9eebbbda 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py
@@ -16,7 +16,9 @@
 GEMM Specialization: Tanh(Sigmoid(GEMM_RCR(A, B) + Bias) * D0)
 """
 
-from .gemm_rcr_bias_broadcast import gemm_rcr_bias_broadcast
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_broadcast import (
+    gemm_rcr_bias_broadcast,
+)
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
index ffb285ef8..c4138269c 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: SiLU(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
index 53b35e879..bf3d7ef4a 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: Tanh(GEMM_RCR(A, B) + Bias)
 """
-from . import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
index 1ffed29a4..264d9df5b 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_fast_gelu.py
@@ -15,7 +15,7 @@
 """
 GEMM Specialization: FastGELU(GEMM_RCR(A, B))
 """
-from . import gemm_rcr
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr
 
 # pylint: disable=C0103,W0223,W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
index 6a21c6e8a..882809860 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py
@@ -21,13 +21,13 @@
 
 from typing import Tuple
 
-from aitemplate.testing import detect_target
+from aitemplate.compiler.base import IntImm, IntVar, Tensor
+from aitemplate.compiler.ops.common import reshape
 
-from ...base import IntImm, IntVar, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from . import gemm_rcr
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
index f99b54bcd..062e1c242 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute_elup1.py
@@ -16,7 +16,7 @@
 A specialization of gemm_rcr_permute applying ELU + 1 as epilogue.
 """
 
-from . import gemm_rcr_permute
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_permute
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
index 7a9bd7062..aca9b03a3 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py
@@ -19,8 +19,8 @@
 When use for `linear`, need set A->Data, B->Weight
 """
 
-from ...base import IntImm, Tensor
-from . import gemm_common as common
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
index 03f6242a6..0eff459b8 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py
@@ -15,9 +15,9 @@
 """
 gemm rrr with bias
 """
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from . import gemm_rrr
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_rrr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
index 8774c7f97..57207a5b6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py
@@ -18,13 +18,13 @@
 
 from typing import Tuple
 
-from aitemplate.testing import detect_target
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
+from aitemplate.compiler.ops.gemm_universal import gemm_rrr_bias
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from . import gemm_rrr_bias
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
index 498b90ad9..bf2604224 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py
@@ -21,11 +21,11 @@
 
 from typing import Tuple
 
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
 
-from . import gemm_rrr
+from aitemplate.compiler.ops.gemm_universal import gemm_rrr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
index 4d5e94d19..28f5cc6d7 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
@@ -22,15 +22,15 @@
 
 import jinja2
 
-from aitemplate.compiler.stable_set import StableSet
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import ExecItem, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
+from aitemplate.compiler.ops.tensor import concatenate
 
-from ....backend import registry
-from ....backend.target import Target
-from ...base import ExecItem, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..tensor import concatenate
-from . import gemm_common as common
-from .gemm_rcr import gemm_rcr
+from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
index ac348062f..2cc5ced97 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py
@@ -20,12 +20,15 @@
 
 import jinja2
 
-from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.base import ExecItem, Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias import gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal.group_gemm_rcr import (
+    group_gemm_rcr,
+    SHAPE_EVAL_TEMPLATE,
+)
 
-from ...base import ExecItem, Tensor
-from ...tensor_accessor import TensorAccessor
-from .gemm_rcr_bias import gemm_rcr_bias
-from .group_gemm_rcr import group_gemm_rcr, SHAPE_EVAL_TEMPLATE
+from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
index 3094eb71f..e1b62eb81 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -15,7 +15,7 @@
 """Grouped GEMM Specialization: ReLU(GEMM_RCR(A, B) + Bias)
 """
 
-from . import group_gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import group_gemm_rcr_bias
 
 # pylint: disable=C0103,W0223
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index 8601144a3..5098e3285 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -15,7 +15,7 @@
 """Grouped GEMM Specialization: Sigmoid(GEMM_RCR(A, B) + Bias)
 """
 
-from . import group_gemm_rcr_bias
+from aitemplate.compiler.ops.gemm_universal import group_gemm_rcr_bias
 
 # pylint: disable=C0103,W0223
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
index 5189f8d17..521510975 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py
@@ -16,11 +16,11 @@
 GEMM Specialization: A.permute(0, 2, 1)[col] @ B[col]
 """
 
-from ....utils import alignment
-from ...base import _create_host_zero_tensor, IntImm, Tensor
-from ..tensor import concatenate
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import _create_host_zero_tensor, IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
+from aitemplate.compiler.ops.tensor import concatenate
+from aitemplate.utils import alignment
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
index 3d19f77fd..7ded98ca3 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py
@@ -16,11 +16,10 @@
 GEMM Specialization: (A.permute(0, 2, 1)[col] @ B[col] + Bias)
 """
 
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import perm021fc_ccr
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import Tensor
-from . import perm021fc_ccr
-
 # pylint: disable=C0103, W0223, W0221
 
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
index 5016174cb..f9e05c116 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -15,10 +15,10 @@
 """
 GEMM Specialization: (A.permute(0, 2, 1)[col] @ B[col] + Bias).permute(0, 2, 1)
 """
-from ...base import Tensor
-from ...tensor_accessor import TensorAccessor
-from ..common import reshape
-from .perm021fc_ccr_bias import perm021fc_ccr_bias
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.compiler.ops.gemm_universal.perm021fc_ccr_bias import perm021fc_ccr_bias
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
index 806e3d0eb..026435330 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py
@@ -17,9 +17,9 @@
 Note: This op's output is a ColMajor
 """
 
-from ...base import IntImm, Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
index 749be1900..7443bfa84 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py
@@ -16,11 +16,10 @@
 GEMM Specialization: (A.permute(0, 2, 1)[col] @ B[row] + Bias)
 """
 
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import perm021fc_crc
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
-from ...base import IntImm, Tensor
-from . import perm021fc_crc
-
 # pylint: disable=C0103, W0223, W0221
 
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
index 3a7d8dc9b..54cd4c2ee 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization: C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col))
 """
 
-from ...base import IntImm, Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
index fb1969552..90486b63a 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization: C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col))
 """
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from . import perm102_bmm_rcr
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import perm102_bmm_rcr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
index d22913d65..a20e138a6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization: C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row))
 """
 
-from ...base import IntImm, Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 # pylint: disable=C0103, W0223, W0221, W0613
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
index c8e64ff45..c85d1bf10 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py
@@ -16,9 +16,9 @@
 Batch GEMM specialization: C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row)) + bias[b, n]
 """
 
-from ...base import IntImm, Tensor
-from ...tensor_accessor import TensorAccessor
-from . import perm102_bmm_rrr
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.gemm_universal import perm102_bmm_rrr
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103, W0223, W0221
 
diff --git a/python/aitemplate/compiler/ops/groupnorm/__init__.py b/python/aitemplate/compiler/ops/groupnorm/__init__.py
index e51549e67..cb5fd3174 100644
--- a/python/aitemplate/compiler/ops/groupnorm/__init__.py
+++ b/python/aitemplate/compiler/ops/groupnorm/__init__.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 
-from .groupnorm import group_norm
-from .groupnorm_swish import group_norm_swish
+from aitemplate.compiler.ops.groupnorm.groupnorm import group_norm
+from aitemplate.compiler.ops.groupnorm.groupnorm_swish import group_norm_swish
 
 __all__ = ["group_norm", "group_norm_swish"]
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 119c34f76..fd31bb5d7 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -25,13 +25,20 @@
 
 import jinja2
 
-from aitemplate.testing import detect_target
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    ExecItem,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
-from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
-from ..softmax.cache_entry import NormQueryEntry, NormRecordEntry
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm_swish.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm_swish.py
index 9aebe87c7..89e6a8bab 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm_swish.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm_swish.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 
-from .groupnorm import group_norm
+from aitemplate.compiler.ops.groupnorm.groupnorm import group_norm
 
 
 class group_norm_swish(group_norm):
diff --git a/python/aitemplate/compiler/ops/layernorm/__init__.py b/python/aitemplate/compiler/ops/layernorm/__init__.py
index 361b3a05e..ebeb4b39d 100644
--- a/python/aitemplate/compiler/ops/layernorm/__init__.py
+++ b/python/aitemplate/compiler/ops/layernorm/__init__.py
@@ -12,11 +12,17 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from .batch_layernorm_sigmoid_mul import batch_layernorm_sigmoid_mul
-from .group_layernorm import group_layernorm
-from .group_layernorm_sigmoid_mul import group_layernorm_sigmoid_mul
-from .layernorm import layernorm
-from .layernorm_sigmoid_mul import layernorm_sigmoid_mul
+from aitemplate.compiler.ops.layernorm.batch_layernorm_sigmoid_mul import (
+    batch_layernorm_sigmoid_mul,
+)
+from aitemplate.compiler.ops.layernorm.group_layernorm import group_layernorm
+from aitemplate.compiler.ops.layernorm.group_layernorm_sigmoid_mul import (
+    group_layernorm_sigmoid_mul,
+)
+from aitemplate.compiler.ops.layernorm.layernorm import layernorm
+from aitemplate.compiler.ops.layernorm.layernorm_sigmoid_mul import (
+    layernorm_sigmoid_mul,
+)
 
 
 __all__ = [
diff --git a/python/aitemplate/compiler/ops/layernorm/batch_layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/batch_layernorm_sigmoid_mul.py
index 5ff2dd079..647f25485 100644
--- a/python/aitemplate/compiler/ops/layernorm/batch_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/batch_layernorm_sigmoid_mul.py
@@ -19,8 +19,8 @@
 """
 from typing import List
 
-from ...base import IntImm
-from .layernorm import layernorm
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.layernorm.layernorm import layernorm
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
index ed13b6760..1fb0b85e4 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py
@@ -17,8 +17,8 @@
 """
 from typing import List
 
-from ...base import IntImm
-from .group_layernorm import group_layernorm
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.layernorm.group_layernorm import group_layernorm
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm.py b/python/aitemplate/compiler/ops/layernorm/layernorm.py
index 5ab572d74..8cf35808d 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm.py
@@ -25,16 +25,23 @@
 
 import jinja2
 
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    ExecItem,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+
 from aitemplate.testing import detect_target
 from aitemplate.utils import shape_utils
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
-from ...base import DynamicProfileStrategy, ExecItem, IntImm, IntVar, Operator, Tensor
-from ...tensor_accessor import TensorAccessor
-from ..softmax.cache_entry import NormQueryEntry, NormRecordEntry
-
 # pylint: disable=C0103,W0221,W0102,W0223
 
 
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
index e34a2a019..691cfb41a 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
@@ -15,12 +15,11 @@
 """
 Operator definition for layernorm_sigmoid_mul.
 """
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator
 from aitemplate.compiler.stable_set import StableSet
-
-from .... import backend
-from ....backend import registry
-from ...base import Operator
-from ...tensor_accessor import TensorAccessor
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/padding/__init__.py b/python/aitemplate/compiler/ops/padding/__init__.py
index 10518e995..6448b85a2 100644
--- a/python/aitemplate/compiler/ops/padding/__init__.py
+++ b/python/aitemplate/compiler/ops/padding/__init__.py
@@ -15,10 +15,10 @@
 """
 Padding ops module init.
 """
-from .ndhwc3to8 import ndhwc3to8
-from .nhwc3to4 import nhwc3to4
-from .nhwc3to8 import nhwc3to8
-from .pad_last_dim import pad_last_dim
+from aitemplate.compiler.ops.padding.ndhwc3to8 import ndhwc3to8
+from aitemplate.compiler.ops.padding.nhwc3to4 import nhwc3to4
+from aitemplate.compiler.ops.padding.nhwc3to8 import nhwc3to8
+from aitemplate.compiler.ops.padding.pad_last_dim import pad_last_dim
 
 
 __all__ = ["ndhwc3to8", "nhwc3to8", "nhwc3to4", "pad_last_dim"]
diff --git a/python/aitemplate/compiler/ops/padding/ndhwc3to8.py b/python/aitemplate/compiler/ops/padding/ndhwc3to8.py
index 908f3858f..738d249f8 100644
--- a/python/aitemplate/compiler/ops/padding/ndhwc3to8.py
+++ b/python/aitemplate/compiler/ops/padding/ndhwc3to8.py
@@ -20,10 +20,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/padding/nhwc3to4.py b/python/aitemplate/compiler/ops/padding/nhwc3to4.py
index 7cce9a94e..03c748568 100644
--- a/python/aitemplate/compiler/ops/padding/nhwc3to4.py
+++ b/python/aitemplate/compiler/ops/padding/nhwc3to4.py
@@ -18,7 +18,7 @@
 
 import jinja2
 
-from .nhwc_pad_common import nhwc_pad_common
+from aitemplate.compiler.ops.padding.nhwc_pad_common import nhwc_pad_common
 
 
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
diff --git a/python/aitemplate/compiler/ops/padding/nhwc3to8.py b/python/aitemplate/compiler/ops/padding/nhwc3to8.py
index 7d4581c8e..aeb502ded 100644
--- a/python/aitemplate/compiler/ops/padding/nhwc3to8.py
+++ b/python/aitemplate/compiler/ops/padding/nhwc3to8.py
@@ -18,7 +18,7 @@
 
 import jinja2
 
-from .nhwc_pad_common import nhwc_pad_common
+from aitemplate.compiler.ops.padding.nhwc_pad_common import nhwc_pad_common
 
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
     """
diff --git a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
index c1bbe897f..96c5eb0be 100644
--- a/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
+++ b/python/aitemplate/compiler/ops/padding/nhwc_pad_common.py
@@ -20,10 +20,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/padding/pad_last_dim.py b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
index 7a826b2f1..6def61e73 100644
--- a/python/aitemplate/compiler/ops/padding/pad_last_dim.py
+++ b/python/aitemplate/compiler/ops/padding/pad_last_dim.py
@@ -19,9 +19,9 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ...base import IntImm, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/pool/__init__.py b/python/aitemplate/compiler/ops/pool/__init__.py
index e0e5003fe..7cd9df61a 100644
--- a/python/aitemplate/compiler/ops/pool/__init__.py
+++ b/python/aitemplate/compiler/ops/pool/__init__.py
@@ -15,8 +15,8 @@
 """
 Pool module init.
 """
-from .avg_pool2d import avg_pool2d
-from .max_pool2d import max_pool2d
+from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
+from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 
 
 __all__ = ["avg_pool2d", "max_pool2d"]
diff --git a/python/aitemplate/compiler/ops/pool/avg_pool2d.py b/python/aitemplate/compiler/ops/pool/avg_pool2d.py
index c113c1d5e..094968e72 100644
--- a/python/aitemplate/compiler/ops/pool/avg_pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/avg_pool2d.py
@@ -15,7 +15,7 @@
 """
 Avg_pool2d op.
 """
-from .pool2d import pool2d_base
+from aitemplate.compiler.ops.pool.pool2d import pool2d_base
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/pool/max_pool2d.py b/python/aitemplate/compiler/ops/pool/max_pool2d.py
index f92303f1b..f95144463 100644
--- a/python/aitemplate/compiler/ops/pool/max_pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/max_pool2d.py
@@ -15,7 +15,7 @@
 """
 Max_pool2d op.
 """
-from .pool2d import pool2d_base
+from aitemplate.compiler.ops.pool.pool2d import pool2d_base
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/pool/pool2d.py b/python/aitemplate/compiler/ops/pool/pool2d.py
index f523f576a..37bbb9151 100644
--- a/python/aitemplate/compiler/ops/pool/pool2d.py
+++ b/python/aitemplate/compiler/ops/pool/pool2d.py
@@ -23,10 +23,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0613
 logging.basicConfig(level=logging.INFO)
diff --git a/python/aitemplate/compiler/ops/reduce/__init__.py b/python/aitemplate/compiler/ops/reduce/__init__.py
index 037151e20..1fdff06c8 100644
--- a/python/aitemplate/compiler/ops/reduce/__init__.py
+++ b/python/aitemplate/compiler/ops/reduce/__init__.py
@@ -15,10 +15,10 @@
 """
 Reduce module init.
 """
-from .reduce_mean import reduce_mean
-from .reduce_sum import reduce_sum
-from .var import var
-from .vector_norm import vector_norm
+from aitemplate.compiler.ops.reduce.reduce_mean import reduce_mean
+from aitemplate.compiler.ops.reduce.reduce_sum import reduce_sum
+from aitemplate.compiler.ops.reduce.var import var
+from aitemplate.compiler.ops.reduce.vector_norm import vector_norm
 
 
 __all__ = ["reduce_mean", "reduce_sum", "var", "vector_norm"]
diff --git a/python/aitemplate/compiler/ops/reduce/reduce_common.py b/python/aitemplate/compiler/ops/reduce/reduce_common.py
index 33fa194e3..1dafa717f 100644
--- a/python/aitemplate/compiler/ops/reduce/reduce_common.py
+++ b/python/aitemplate/compiler/ops/reduce/reduce_common.py
@@ -20,13 +20,13 @@
 
 from typing import List
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ....utils.tensor_utils import wrap_dim
-from ...base import IntImm, IntVar, Operator, Tensor
-from ...dtype import get_dtype_size
-from ...tensor_accessor import TensorAccessor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.dtype import get_dtype_size
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.utils import shape_utils
+from aitemplate.utils.tensor_utils import wrap_dim
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/reduce/reduce_mean.py b/python/aitemplate/compiler/ops/reduce/reduce_mean.py
index 44fbc810d..37952f488 100644
--- a/python/aitemplate/compiler/ops/reduce/reduce_mean.py
+++ b/python/aitemplate/compiler/ops/reduce/reduce_mean.py
@@ -15,7 +15,7 @@
 """
 Reduce_mean op implementation.
 """
-from .reduce_common import reduce_base
+from aitemplate.compiler.ops.reduce.reduce_common import reduce_base
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/reduce/reduce_sum.py b/python/aitemplate/compiler/ops/reduce/reduce_sum.py
index 3f08d8b14..08d538e96 100644
--- a/python/aitemplate/compiler/ops/reduce/reduce_sum.py
+++ b/python/aitemplate/compiler/ops/reduce/reduce_sum.py
@@ -15,7 +15,7 @@
 """
 reduce_sum op
 """
-from .reduce_common import reduce_base
+from aitemplate.compiler.ops.reduce.reduce_common import reduce_base
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/reduce/var.py b/python/aitemplate/compiler/ops/reduce/var.py
index 136783ee1..91117404d 100644
--- a/python/aitemplate/compiler/ops/reduce/var.py
+++ b/python/aitemplate/compiler/ops/reduce/var.py
@@ -15,7 +15,7 @@
 """
 var op implementation
 """
-from .reduce_common import reduce_base
+from aitemplate.compiler.ops.reduce.reduce_common import reduce_base
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/reduce/vector_norm.py b/python/aitemplate/compiler/ops/reduce/vector_norm.py
index c4d445195..38ea5c367 100644
--- a/python/aitemplate/compiler/ops/reduce/vector_norm.py
+++ b/python/aitemplate/compiler/ops/reduce/vector_norm.py
@@ -16,7 +16,7 @@
 vector_norm op implementation that simulates pytorch's linalg.vector_norm.
 Currently, we only support L2 norm.
 """
-from .reduce_common import reduce_base
+from aitemplate.compiler.ops.reduce.reduce_common import reduce_base
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/softmax/__init__.py b/python/aitemplate/compiler/ops/softmax/__init__.py
index d1e1d89d0..61a8ad8ba 100644
--- a/python/aitemplate/compiler/ops/softmax/__init__.py
+++ b/python/aitemplate/compiler/ops/softmax/__init__.py
@@ -15,7 +15,7 @@
 """
 softmax module init
 """
-from .softmax import softmax
+from aitemplate.compiler.ops.softmax.softmax import softmax
 
 
 __all__ = ["softmax"]
diff --git a/python/aitemplate/compiler/ops/softmax/softmax.py b/python/aitemplate/compiler/ops/softmax/softmax.py
index 42de07d67..aa08bddd4 100644
--- a/python/aitemplate/compiler/ops/softmax/softmax.py
+++ b/python/aitemplate/compiler/ops/softmax/softmax.py
@@ -25,15 +25,21 @@
 
 import jinja2
 
-from aitemplate.testing import detect_target
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    ExecItem,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
-from .... import backend
-from ....backend import registry
-from ....backend.target import Target
+from aitemplate.testing import detect_target
 
-from ....utils.tensor_utils import wrap_dim
-from ...base import DynamicProfileStrategy, ExecItem, IntVar, Operator, Tensor
-from .cache_entry import NormQueryEntry, NormRecordEntry
+from aitemplate.utils.tensor_utils import wrap_dim
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 5c324ff33..d826578ef 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -16,24 +16,24 @@
 """
 reduce module init
 """
-from .argmax import argmax
-from .batch_gather import batch_gather
-from .chunk import chunk
-from .concatenate import concatenate
-from .concatenate_tanh import concatenate_tanh
-from .dynamic_slice import dynamic_slice
-from .expand import expand
-from .gather import gather
-from .jagged_to_dense import jagged_to_dense
-from .masked_select import masked_select
-from .permute import permute
-from .permute021 import permute021
-from .permute0213 import permute0213
-from .permute102 import permute102
-from .permute210 import permute210
-from .size import size
-from .slice_reshape_scatter import slice_reshape_scatter
-from .slice_scatter import slice_scatter
-from .split import split
-from .topk import topk
-from .transpose import transpose
+from aitemplate.compiler.ops.tensor.argmax import argmax
+from aitemplate.compiler.ops.tensor.batch_gather import batch_gather
+from aitemplate.compiler.ops.tensor.chunk import chunk
+from aitemplate.compiler.ops.tensor.concatenate import concatenate
+from aitemplate.compiler.ops.tensor.concatenate_tanh import concatenate_tanh
+from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
+from aitemplate.compiler.ops.tensor.expand import expand
+from aitemplate.compiler.ops.tensor.gather import gather
+from aitemplate.compiler.ops.tensor.jagged_to_dense import jagged_to_dense
+from aitemplate.compiler.ops.tensor.masked_select import masked_select
+from aitemplate.compiler.ops.tensor.permute import permute
+from aitemplate.compiler.ops.tensor.permute021 import permute021
+from aitemplate.compiler.ops.tensor.permute0213 import permute0213
+from aitemplate.compiler.ops.tensor.permute102 import permute102
+from aitemplate.compiler.ops.tensor.permute210 import permute210
+from aitemplate.compiler.ops.tensor.size import size
+from aitemplate.compiler.ops.tensor.slice_reshape_scatter import slice_reshape_scatter
+from aitemplate.compiler.ops.tensor.slice_scatter import slice_scatter
+from aitemplate.compiler.ops.tensor.split import split
+from aitemplate.compiler.ops.tensor.topk import topk
+from aitemplate.compiler.ops.tensor.transpose import transpose
diff --git a/python/aitemplate/compiler/ops/tensor/argmax.py b/python/aitemplate/compiler/ops/tensor/argmax.py
index 2cf26c11a..2bf5922c5 100644
--- a/python/aitemplate/compiler/ops/tensor/argmax.py
+++ b/python/aitemplate/compiler/ops/tensor/argmax.py
@@ -26,10 +26,10 @@
 import jinja2
 import numpy as np
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index 03664f494..1182e32b5 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -21,10 +21,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import IntVar, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/tensor/chunk.py b/python/aitemplate/compiler/ops/tensor/chunk.py
index 78e5d7f21..786a62177 100644
--- a/python/aitemplate/compiler/ops/tensor/chunk.py
+++ b/python/aitemplate/compiler/ops/tensor/chunk.py
@@ -19,8 +19,8 @@
 
 from typing import List
 
-from ...base import Tensor
-from .split import split
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.tensor.split import split
 
 
 class chunk(split):
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index db0922cbe..51c792d93 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -17,12 +17,12 @@
 """
 from typing import List, Sequence, Union
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ....utils.tensor_utils import wrap_dim
-from ...base import IntVar, Operator, Tensor
-from ...tensor_accessor import TensorAccessor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.utils import shape_utils
+from aitemplate.utils.tensor_utils import wrap_dim
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate_tanh.py b/python/aitemplate/compiler/ops/tensor/concatenate_tanh.py
index 08cdacaa7..7ecaf757f 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate_tanh.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate_tanh.py
@@ -15,7 +15,7 @@
 """
 Concatenate_tanh
 """
-from . import concatenate
+from aitemplate.compiler.ops.tensor import concatenate
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
index 774c4418a..b7d966ee2 100644
--- a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
+++ b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
@@ -18,10 +18,10 @@
 import itertools
 from typing import List, Optional, Union
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import IntVar, IntVarTensor, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/gather.py b/python/aitemplate/compiler/ops/tensor/gather.py
index 6a551892c..4a2b4d131 100644
--- a/python/aitemplate/compiler/ops/tensor/gather.py
+++ b/python/aitemplate/compiler/ops/tensor/gather.py
@@ -15,9 +15,9 @@
 """
 Operator definition for gather.
 """
-from .... import backend
-from ....backend import registry
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/tensor/permute.py b/python/aitemplate/compiler/ops/tensor/permute.py
index 3e7eadfce..c4a9049d6 100644
--- a/python/aitemplate/compiler/ops/tensor/permute.py
+++ b/python/aitemplate/compiler/ops/tensor/permute.py
@@ -17,14 +17,14 @@
 """
 from typing import List, Sequence
 
-from .... import backend
-from ....backend import registry
-from ....utils.tensor_utils import wrap_dim
-from ...base import IntImm, IntVar, Operator, Tensor
-from .permute021 import permute021
-from .permute0213 import permute0213
-from .permute102 import permute102
-from .permute210 import permute210
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.ops.tensor.permute021 import permute021
+from aitemplate.compiler.ops.tensor.permute0213 import permute0213
+from aitemplate.compiler.ops.tensor.permute102 import permute102
+from aitemplate.compiler.ops.tensor.permute210 import permute210
+from aitemplate.utils.tensor_utils import wrap_dim
 
 
 class permute(Operator):
diff --git a/python/aitemplate/compiler/ops/tensor/permute021.py b/python/aitemplate/compiler/ops/tensor/permute021.py
index e1d20f48f..241823985 100644
--- a/python/aitemplate/compiler/ops/tensor/permute021.py
+++ b/python/aitemplate/compiler/ops/tensor/permute021.py
@@ -17,9 +17,9 @@
 """
 from typing import List
 
-from .... import backend
-from ....backend import registry
-from ...base import IntVar, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/permute0213.py b/python/aitemplate/compiler/ops/tensor/permute0213.py
index 42aab5709..b6b33c10d 100644
--- a/python/aitemplate/compiler/ops/tensor/permute0213.py
+++ b/python/aitemplate/compiler/ops/tensor/permute0213.py
@@ -18,10 +18,10 @@
 """
 from typing import List
 
-from aitemplate.backend import registry
+from aitemplate import backend
 
-from .... import backend
-from ...base import IntVar, Operator, Tensor
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/permute102.py b/python/aitemplate/compiler/ops/tensor/permute102.py
index c678210b8..3c9674186 100644
--- a/python/aitemplate/compiler/ops/tensor/permute102.py
+++ b/python/aitemplate/compiler/ops/tensor/permute102.py
@@ -18,10 +18,10 @@
 """
 from typing import List
 
-from aitemplate.backend import registry
+from aitemplate import backend
 
-from .... import backend
-from ...base import IntVar, Operator, Tensor
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/permute210.py b/python/aitemplate/compiler/ops/tensor/permute210.py
index 3cba6d811..70abe3baf 100644
--- a/python/aitemplate/compiler/ops/tensor/permute210.py
+++ b/python/aitemplate/compiler/ops/tensor/permute210.py
@@ -18,10 +18,10 @@
 """
 from typing import List
 
-from aitemplate.backend import registry
+from aitemplate import backend
 
-from .... import backend
-from ...base import Operator, Tensor
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/size.py b/python/aitemplate/compiler/ops/tensor/size.py
index e7cd1adf3..9607db1c5 100644
--- a/python/aitemplate/compiler/ops/tensor/size.py
+++ b/python/aitemplate/compiler/ops/tensor/size.py
@@ -21,7 +21,7 @@
 
 from aitemplate.backend import registry
 
-from ...base import IntImm, IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
 
 # pylint: disable=C0103,W0221,R1732,W0613
 
diff --git a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
index 1aad49a71..8e54ea182 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
@@ -17,12 +17,12 @@
 """
 from typing import Optional
 
-from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator
+from aitemplate.compiler.stable_set import StableSet
 
-from .... import backend
-from ....backend import registry
-from ...base import IntImm, IntVar, Operator
-from ...stable_set import StableSet
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,C0415,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/slice_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
index 02e3d4666..05ca0b1d5 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
@@ -16,12 +16,11 @@
 Slice_scatter.
 """
 
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator
 from aitemplate.compiler.stable_set import StableSet
 
-from .... import backend
-from ....backend import registry
-from ...base import Operator
-
 # pylint: disable=C0103,W0221
 
 
diff --git a/python/aitemplate/compiler/ops/tensor/split.py b/python/aitemplate/compiler/ops/tensor/split.py
index 7b60d36f4..d175099cb 100644
--- a/python/aitemplate/compiler/ops/tensor/split.py
+++ b/python/aitemplate/compiler/ops/tensor/split.py
@@ -18,11 +18,11 @@
 import itertools
 from typing import List, Sequence, Union
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ....utils.tensor_utils import wrap_dim
-from ...base import IntImm, IntVar, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.utils import shape_utils
+from aitemplate.utils.tensor_utils import wrap_dim
 
 # pylint: disable=C0103,W0221
 
diff --git a/python/aitemplate/compiler/ops/tensor/topk.py b/python/aitemplate/compiler/ops/tensor/topk.py
index 871f03d5d..6a3cdfbe7 100644
--- a/python/aitemplate/compiler/ops/tensor/topk.py
+++ b/python/aitemplate/compiler/ops/tensor/topk.py
@@ -26,9 +26,9 @@
 import jinja2
 import numpy as np
 
-from .... import backend
-from ....backend import registry
-from ...base import IntImm, IntVar, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/tensor/transpose.py b/python/aitemplate/compiler/ops/tensor/transpose.py
index 2154105a4..5ad8d298d 100644
--- a/python/aitemplate/compiler/ops/tensor/transpose.py
+++ b/python/aitemplate/compiler/ops/tensor/transpose.py
@@ -16,8 +16,8 @@
 transpose op
 """
 
-from ...base import Tensor
-from .permute import permute
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.tensor.permute import permute
 
 
 class transpose(permute):
diff --git a/python/aitemplate/compiler/ops/upsample/__init__.py b/python/aitemplate/compiler/ops/upsample/__init__.py
index 54712bf9b..6af4b174d 100644
--- a/python/aitemplate/compiler/ops/upsample/__init__.py
+++ b/python/aitemplate/compiler/ops/upsample/__init__.py
@@ -15,8 +15,8 @@
 """
 Upsampling module init.
 """
-from .upsampling2d import upsampling2d
-from .upsampling2d_add import upsampling2d_add
+from aitemplate.compiler.ops.upsample.upsampling2d import upsampling2d
+from aitemplate.compiler.ops.upsample.upsampling2d_add import upsampling2d_add
 
 
 __all__ = ["upsampling2d", "upsampling2d_add"]
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling2d.py b/python/aitemplate/compiler/ops/upsample/upsampling2d.py
index 747cf0291..e53d4aed0 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling2d.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling2d.py
@@ -15,7 +15,7 @@
 """
 Upsampling2d op.
 """
-from .upsampling_common import upsampling2d_base
+from aitemplate.compiler.ops.upsample.upsampling_common import upsampling2d_base
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py b/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
index b203c0050..e63c2c560 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling2d_add.py
@@ -17,8 +17,8 @@
 """
 from typing import List
 
-from ...base import Tensor
-from .upsampling_common import upsampling2d_base
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.upsample.upsampling_common import upsampling2d_base
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index aff1b36a8..59b94e0b0 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -23,10 +23,10 @@
 
 import jinja2
 
-from .... import backend
-from ....backend import registry
-from ....utils import shape_utils
-from ...base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0613
 logging.basicConfig(level=logging.INFO)
diff --git a/python/aitemplate/compiler/ops/vision_ops/__init__.py b/python/aitemplate/compiler/ops/vision_ops/__init__.py
index 29b76d99a..9f32e81ed 100644
--- a/python/aitemplate/compiler/ops/vision_ops/__init__.py
+++ b/python/aitemplate/compiler/ops/vision_ops/__init__.py
@@ -15,5 +15,5 @@
 """
 Vision ops module init.
 """
-from .nms import *  # noqa
-from .roi_ops import *  # noqa
+from aitemplate.compiler.ops.vision_ops.nms import *  # noqa
+from aitemplate.compiler.ops.vision_ops.roi_ops import *  # noqa
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/__init__.py b/python/aitemplate/compiler/ops/vision_ops/nms/__init__.py
index c26c09e00..eea6045e6 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/__init__.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/__init__.py
@@ -15,9 +15,9 @@
 """
 Nms family ops.
 """
-from .batched_nms import batched_nms
-from .efficient_nms import efficient_nms
-from .nms import nms
+from aitemplate.compiler.ops.vision_ops.nms.batched_nms import batched_nms
+from aitemplate.compiler.ops.vision_ops.nms.efficient_nms import efficient_nms
+from aitemplate.compiler.ops.vision_ops.nms.nms import nms
 
 
 __all__ = ["batched_nms", "nms", "efficient_nms"]
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
index c44c8d30f..686beea1d 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py
@@ -20,10 +20,15 @@
 
 import jinja2
 
-from ..... import backend
-from .....backend import registry
-from .....utils import shape_utils
-from ....base import _create_host_zero_tensor, IntImm, Operator, Tensor  # noqa
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import (  # noqa
+    _create_host_zero_tensor,
+    IntImm,
+    Operator,
+    Tensor,
+)
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
index 5b124d081..d39872b7c 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py
@@ -25,10 +25,10 @@
 
 import jinja2
 
-from ..... import backend
-from .....backend import registry
-from .....utils import shape_utils
-from ....base import IntImm, Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
index bc1769e4c..beee90245 100644
--- a/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
+++ b/python/aitemplate/compiler/ops/vision_ops/nms/nms.py
@@ -25,10 +25,10 @@
 
 import jinja2
 
-from ..... import backend
-from .....backend import registry
-from .....utils import shape_utils
-from ....base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/__init__.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/__init__.py
index 0d8619521..19edd785a 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/__init__.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/__init__.py
@@ -15,7 +15,9 @@
 """
 Roi-align module init.
 """
-from .multi_level_roi_align import multi_level_roi_align
-from .roi_align import roi_align
+from aitemplate.compiler.ops.vision_ops.roi_ops.multi_level_roi_align import (
+    multi_level_roi_align,
+)
+from aitemplate.compiler.ops.vision_ops.roi_ops.roi_align import roi_align
 
 __all__ = ["roi_align", "multi_level_roi_align"]
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
index bd3fc7093..f5e2701b6 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py
@@ -18,8 +18,8 @@
 
 from typing import List
 
-from ....base import Tensor
-from .roi_ops import roi_ops_base
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.vision_ops.roi_ops.roi_ops import roi_ops_base
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_align.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_align.py
index dbc6b13da..cdcb0fa80 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_align.py
@@ -15,7 +15,7 @@
 """
 Roi_align.
 """
-from .roi_ops import roi_ops_base
+from aitemplate.compiler.ops.vision_ops.roi_ops.roi_ops import roi_ops_base
 
 
 # pylint: disable=C0103
diff --git a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
index 076ee9235..39b2ab046 100644
--- a/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
+++ b/python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py
@@ -23,10 +23,10 @@
 
 import jinja2
 
-from ..... import backend
-from .....backend import registry
-from .....utils import shape_utils
-from ....base import Operator, Tensor
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,R1732,W0613
 logging.basicConfig(level=logging.INFO)
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index 518cd8df7..d248d526b 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -24,9 +24,7 @@
 from pprint import pformat
 from typing import Any, List, Optional
 
-from aitemplate.compiler.base import IntVar
-
-from .base import IntImm, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, Tensor
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index ca9bf77e4..d618161b0 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -13,29 +13,36 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from .bind_constants import bind_constants
-from .constant_folding import constant_folding
-from .fuse_conv_elementwise import fuse_conv_elementwise
-from .fuse_group_ops import (
+from aitemplate.compiler.transform.bind_constants import bind_constants
+from aitemplate.compiler.transform.constant_folding import constant_folding
+from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
+from aitemplate.compiler.transform.fuse_group_ops import (
     fuse_group_gemm_ops,
     fuse_group_layernorm_ops,
     fuse_group_ops,
 )
-from .fuse_mm_elementwise import fuse_mm_elementwise
-from .fuse_ops import fuse_ops
-from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
-from .mark_param_tensor import mark_param_tensor, mark_special_views
-from .memory_planning import memory_planning
-from .name_graph import name_graph
-from .optimize_graph import optimize_graph
-from .profile import profile
-from .refine_graph import refine_graph
-from .remove_no_ops import remove_no_ops
-from .remove_unused_ops import remove_unused_ops
-from .split_large_concat_ops import split_large_concat_ops
-from .split_large_split_ops import split_large_split_ops
-from .toposort import toposort
-from .transform_memory_ops import transform_memory_ops
-from .transform_odd_alignment import transform_odd_alignment
-from .transform_special_ops import transform_special_ops
-from .transform_strided_ops import transform_strided_ops
+from aitemplate.compiler.transform.fuse_mm_elementwise import fuse_mm_elementwise
+from aitemplate.compiler.transform.fuse_ops import fuse_ops
+from aitemplate.compiler.transform.fuse_permute_bmm_and_gemm import (
+    fuse_permute_bmm_and_gemm,
+)
+from aitemplate.compiler.transform.mark_param_tensor import (
+    mark_param_tensor,
+    mark_special_views,
+)
+from aitemplate.compiler.transform.memory_planning import memory_planning
+from aitemplate.compiler.transform.name_graph import name_graph
+from aitemplate.compiler.transform.optimize_graph import optimize_graph
+from aitemplate.compiler.transform.profile import profile
+from aitemplate.compiler.transform.refine_graph import refine_graph
+from aitemplate.compiler.transform.remove_no_ops import remove_no_ops
+from aitemplate.compiler.transform.remove_unused_ops import remove_unused_ops
+from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
+from aitemplate.compiler.transform.split_large_split_ops import split_large_split_ops
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_memory_ops import transform_memory_ops
+from aitemplate.compiler.transform.transform_odd_alignment import (
+    transform_odd_alignment,
+)
+from aitemplate.compiler.transform.transform_special_ops import transform_special_ops
+from aitemplate.compiler.transform.transform_strided_ops import transform_strided_ops
diff --git a/python/aitemplate/compiler/transform/apply_padding.py b/python/aitemplate/compiler/transform/apply_padding.py
index 423e0980c..67a7343bc 100644
--- a/python/aitemplate/compiler/transform/apply_padding.py
+++ b/python/aitemplate/compiler/transform/apply_padding.py
@@ -18,13 +18,13 @@
 import logging
 from typing import Callable, Dict, List
 
-from aitemplate.compiler.base import _create_host_zero_tensor
+from aitemplate.compiler import ops
 
-from ...utils import alignment
-from .. import ops
-from ..base import IntImm, Operator, Tensor
-from ..ops.gemm_universal.gemm_common import DimInfo, gemm, Source
-from . import transform_utils
+from aitemplate.compiler.base import _create_host_zero_tensor, IntImm, Operator, Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_common import DimInfo, gemm, Source
+from aitemplate.compiler.transform import transform_utils
+
+from aitemplate.utils import alignment
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/fuse_conv_elementwise.py b/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
index 2db2a8c9e..9700cc9e8 100644
--- a/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
+++ b/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
@@ -17,14 +17,14 @@
 """
 from typing import List
 
-from ..base import Tensor
+from aitemplate.compiler.base import Tensor
 
-from .fuse_conv_patterns import (
+from aitemplate.compiler.transform.fuse_conv_patterns import (
     get_conv2d_bias_elementwise_patterns,
     get_conv2d_bias_pattern,
     get_cuda_only_conv2d_bias_elementwise_patterns,
 )
-from .fuse_utils import transform_simple_fusion_patterns
+from aitemplate.compiler.transform.fuse_utils import transform_simple_fusion_patterns
 
 # pylint: disable=C0103,C0415,W0612
 
diff --git a/python/aitemplate/compiler/transform/fuse_conv_patterns.py b/python/aitemplate/compiler/transform/fuse_conv_patterns.py
index 9dfef17dd..9256784b4 100644
--- a/python/aitemplate/compiler/transform/fuse_conv_patterns.py
+++ b/python/aitemplate/compiler/transform/fuse_conv_patterns.py
@@ -12,9 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from ..ops.common import elementwise
-from ..ops.common.epilogue import FuncEnum
-from ..ops.conv import (
+from aitemplate.compiler.ops.common import elementwise
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.ops.conv import (
     conv2d,
     conv2d_bias,
     conv2d_bias_add,
diff --git a/python/aitemplate/compiler/transform/fuse_group_ops.py b/python/aitemplate/compiler/transform/fuse_group_ops.py
index 6f8b2ad0f..48ff13cc3 100644
--- a/python/aitemplate/compiler/transform/fuse_group_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_group_ops.py
@@ -20,14 +20,15 @@
 import os
 from typing import Callable, List, OrderedDict, Set
 
-from ...utils import graph_utils
-from ...utils.shape_utils import all_static_dimensions
-from .. import ops
-from ..base import Operator, Tensor
-from ..ops.gemm_universal.gemm_common import default_align_ab
-from . import transform_utils
-from .fuse_split import _can_fuse_split_op
-from .toposort import toposort
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_common import default_align_ab
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.fuse_split import _can_fuse_split_op
+from aitemplate.compiler.transform.toposort import toposort
+
+from aitemplate.utils import graph_utils
+from aitemplate.utils.shape_utils import all_static_dimensions
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/fuse_mm_elementwise.py b/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
index 89610d3f9..cbe30d300 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_elementwise.py
@@ -17,17 +17,20 @@
 """
 from typing import List
 
-from ..base import Tensor
-from ..ops.common.epilogue import FuncEnum
-from ..ops.gemm_universal import gemm_rcr_bias_swish
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.ops.gemm_universal import gemm_rcr_bias_swish
 
-from .fuse_mm_elementwise_patterns import get_gemm_rcr_bias_patterns, get_patterns
-from .fuse_utils import (
+from aitemplate.compiler.transform.fuse_mm_elementwise_patterns import (
+    get_gemm_rcr_bias_patterns,
+    get_patterns,
+)
+from aitemplate.compiler.transform.fuse_utils import (
     extract_only_one_op,
     is_elementwise_type,
     transform_simple_fusion_patterns,
 )
-from .transform_utils import (
+from aitemplate.compiler.transform.transform_utils import (
     copy_tensor_attributes,
     remove_dst_op_from_tensor,
     remove_single_tensor_op_from_sorted_graph,
diff --git a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
index 9e53f4711..08924078d 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py
@@ -12,9 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from ..ops.common import elementwise
-from ..ops.common.epilogue import FuncEnum
-from ..ops.gemm_universal import (
+from aitemplate.compiler.ops.common import elementwise
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.ops.gemm_universal import (
     bmm_ccr,
     bmm_ccr_add,
     bmm_crr,
diff --git a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
index 9bb606c26..d8e9370c5 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
@@ -17,11 +17,12 @@
 """
 from typing import List, Sequence
 
-from ...utils import graph_utils
-from ..base import IntImm, Operator, Tensor
-from ..ops import gemm_rcr_permute
-from . import transform_utils
-from .toposort import toposort
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.compiler.ops import gemm_rcr_permute
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+
+from aitemplate.utils import graph_utils
 
 
 def _check_reshape(op: Operator) -> bool:
diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index ba9c9d8bd..3dd39e3a3 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -20,18 +20,16 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Set
 
-from aitemplate.compiler.base import Operator
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.ops.common import elementwise, fused_elementwise
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.ops.groupnorm.groupnorm import group_norm
+from aitemplate.compiler.ops.groupnorm.groupnorm_swish import group_norm_swish
+from aitemplate.compiler.ops.layernorm import layernorm_sigmoid_mul
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.fuse_utils import transform_simple_fusion_patterns
 from aitemplate.compiler.transform.toposort import toposort
 
-from ..base import Tensor
-from ..ops.common import elementwise, fused_elementwise
-from ..ops.common.epilogue import FuncEnum
-from ..ops.groupnorm.groupnorm import group_norm
-from ..ops.groupnorm.groupnorm_swish import group_norm_swish
-from ..ops.layernorm import layernorm_sigmoid_mul
-from . import transform_utils
-from .fuse_utils import transform_simple_fusion_patterns
-
 # pylint: disable=C0103,W0612
 
 
diff --git a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
index 668372d2c..9d2525e18 100644
--- a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
+++ b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
@@ -18,15 +18,16 @@
 
 from typing import Callable, List, Tuple
 
-from ...utils import graph_utils
-from ...utils.shape_utils import is_static_dimension
-from .. import ops
-from ..base import Operator, Tensor
-from ..ops.gemm_universal.gemm_common import default_align_ab
-from ..tensor_accessor import TensorAccessor
-from . import transform_utils
-from .toposort import toposort
-from .transform_strided_ops import _is_supported_op
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.ops.gemm_universal.gemm_common import default_align_ab
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_strided_ops import _is_supported_op
+
+from aitemplate.utils import graph_utils
+from aitemplate.utils.shape_utils import is_static_dimension
 
 
 def _is_same_shape(gemm_op1: Operator, gemm_op2: Operator) -> bool:
diff --git a/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
index 47cc5f6b6..109a10100 100644
--- a/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
+++ b/python/aitemplate/compiler/transform/fuse_permute_bmm_and_gemm.py
@@ -17,12 +17,9 @@
 """
 from typing import Callable, List, Optional, Set, Tuple, Type, Union
 
-from aitemplate.compiler.ops.tensor.permute import permute
-
-from ...utils import alignment
-from .. import ops
-from ..base import IntImm, Operator, Tensor
-from ..ops.gemm_universal import (
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.compiler.ops.gemm_universal import (
     bmm_ccr,
     bmm_crr,
     bmm_rcr,
@@ -32,9 +29,11 @@
     gemm_rrr,
     gemm_rrr_bias,
 )
-from ..ops.tensor import permute021
-from .fuse_utils import extract_only_one_op
-from .transform_utils import (
+from aitemplate.compiler.ops.tensor import permute021
+
+from aitemplate.compiler.ops.tensor.permute import permute
+from aitemplate.compiler.transform.fuse_utils import extract_only_one_op
+from aitemplate.compiler.transform.transform_utils import (
     copy_src_op_attributes,
     copy_tensor_attributes,
     remove_dst_op_from_tensor,
@@ -43,6 +42,8 @@
     sanitize_sorted_graph,
 )
 
+from aitemplate.utils import alignment
+
 # pylint: disable=C0103,W0612
 
 
diff --git a/python/aitemplate/compiler/transform/fuse_split.py b/python/aitemplate/compiler/transform/fuse_split.py
index 8074b389b..e81c0eed9 100644
--- a/python/aitemplate/compiler/transform/fuse_split.py
+++ b/python/aitemplate/compiler/transform/fuse_split.py
@@ -18,11 +18,12 @@
 import logging
 from typing import List
 
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+
 from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.transform import transform_strided_ops_utils, transform_utils
 
-from ...utils import alignment, graph_utils
-from ..base import IntImm, IntVar, Operator, Tensor
-from . import transform_strided_ops_utils, transform_utils
+from aitemplate.utils import alignment, graph_utils
 
 # pylint: disable=W0612
 
diff --git a/python/aitemplate/compiler/transform/fuse_utils.py b/python/aitemplate/compiler/transform/fuse_utils.py
index 13553480d..39f4c2e73 100644
--- a/python/aitemplate/compiler/transform/fuse_utils.py
+++ b/python/aitemplate/compiler/transform/fuse_utils.py
@@ -14,9 +14,9 @@
 #
 from typing import Any, List, Optional, Set
 
-from ..base import Operator, Tensor
-from .toposort import toposort
-from .transform_utils import (
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_utils import (
     copy_tensor_attributes,
     remove_dst_op_from_tensor,
     replace_tensor,
diff --git a/python/aitemplate/compiler/transform/mark_param_tensor.py b/python/aitemplate/compiler/transform/mark_param_tensor.py
index 104739908..677c1df93 100644
--- a/python/aitemplate/compiler/transform/mark_param_tensor.py
+++ b/python/aitemplate/compiler/transform/mark_param_tensor.py
@@ -17,7 +17,7 @@
 """
 from typing import List
 
-from ..base import Tensor
+from aitemplate.compiler.base import Tensor
 
 # pylint: disable=C0103,W0613
 
diff --git a/python/aitemplate/compiler/transform/memory_planning.py b/python/aitemplate/compiler/transform/memory_planning.py
index 2abe94a8b..22ff201a9 100644
--- a/python/aitemplate/compiler/transform/memory_planning.py
+++ b/python/aitemplate/compiler/transform/memory_planning.py
@@ -20,7 +20,7 @@
 from dataclasses import dataclass
 from typing import List
 
-from ..base import Operator, Tensor
+from aitemplate.compiler.base import Operator, Tensor
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 512a3af0f..4f95a9281 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,7 @@
 import re
 from typing import List
 
-from ..base import IntImm, IntVarTensor, JaggedIntVar, Tensor
+from aitemplate.compiler.base import IntImm, IntVarTensor, JaggedIntVar, Tensor
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 72071c20b..ae68548d4 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -18,23 +18,32 @@
 
 from typing import List
 
-from ...utils import graph_utils
-from ..base import Tensor
-from .apply_padding import apply_padding
-from .fuse_conv_elementwise import fuse_conv_elementwise
-from .fuse_group_ops import fuse_group_ops
-from .fuse_mm_elementwise import fuse_mm_elementwise
-from .fuse_mm_reshape_permute import fuse_mm_reshape_permute
-from .fuse_ops import fuse_ops
-from .fuse_parallel_gemms import fuse_parallel_gemms
-from .fuse_permute_bmm_and_gemm import fuse_permute_bmm_and_gemm
-from .split_large_concat_ops import split_large_concat_ops
-from .split_large_slice_scatter_ops import split_large_slice_scatter_ops
-from .split_large_split_ops import split_large_split_ops
-from .transform_memory_ops import transform_memory_ops
-from .transform_odd_alignment import transform_odd_alignment
-from .transform_special_ops import transform_special_ops
-from .transform_strided_ops import transform_strided_ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.transform.apply_padding import apply_padding
+from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
+from aitemplate.compiler.transform.fuse_group_ops import fuse_group_ops
+from aitemplate.compiler.transform.fuse_mm_elementwise import fuse_mm_elementwise
+from aitemplate.compiler.transform.fuse_mm_reshape_permute import (
+    fuse_mm_reshape_permute,
+)
+from aitemplate.compiler.transform.fuse_ops import fuse_ops
+from aitemplate.compiler.transform.fuse_parallel_gemms import fuse_parallel_gemms
+from aitemplate.compiler.transform.fuse_permute_bmm_and_gemm import (
+    fuse_permute_bmm_and_gemm,
+)
+from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
+from aitemplate.compiler.transform.split_large_slice_scatter_ops import (
+    split_large_slice_scatter_ops,
+)
+from aitemplate.compiler.transform.split_large_split_ops import split_large_split_ops
+from aitemplate.compiler.transform.transform_memory_ops import transform_memory_ops
+from aitemplate.compiler.transform.transform_odd_alignment import (
+    transform_odd_alignment,
+)
+from aitemplate.compiler.transform.transform_special_ops import transform_special_ops
+from aitemplate.compiler.transform.transform_strided_ops import transform_strided_ops
+
+from aitemplate.utils import graph_utils
 
 
 def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index a87c5c190..c020004b7 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -21,16 +21,16 @@
 from datetime import datetime
 from typing import List, OrderedDict
 
+from aitemplate.backend import builder, codegen
+
 from aitemplate.backend.profiler_runner import ProfilerRunner
+from aitemplate.compiler.base import DynamicProfileStrategy, Tensor
 
 from aitemplate.compiler.ops.gemm_universal.gemm_common import (
     gemm,
     GemmProfilerPostprocessingDelegate,
 )
 
-from ...backend import builder, codegen
-from ..base import DynamicProfileStrategy, Tensor
-
 # pylint: disable=C0103,W0613,W0102
 
 
diff --git a/python/aitemplate/compiler/transform/profile_dynamic_dim.py b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
index e49367d47..493d2f54a 100644
--- a/python/aitemplate/compiler/transform/profile_dynamic_dim.py
+++ b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
@@ -19,8 +19,8 @@
 from copy import deepcopy
 from typing import List, OrderedDict
 
-from ...backend import builder, codegen
-from ..base import Tensor
+from aitemplate.backend import builder, codegen
+from aitemplate.compiler.base import Tensor
 
 # pylint: disable=C0103,W0613,W0102
 
diff --git a/python/aitemplate/compiler/transform/refine_graph.py b/python/aitemplate/compiler/transform/refine_graph.py
index 2d1aa552b..c270ee94d 100644
--- a/python/aitemplate/compiler/transform/refine_graph.py
+++ b/python/aitemplate/compiler/transform/refine_graph.py
@@ -18,9 +18,9 @@
 import logging
 from typing import List
 
-from ...utils.graph_utils import get_sorted_ops
+from aitemplate.compiler.base import Operator, Tensor
 
-from ..base import Operator, Tensor
+from aitemplate.utils.graph_utils import get_sorted_ops
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/transform/remove_unused_ops.py b/python/aitemplate/compiler/transform/remove_unused_ops.py
index f3ccee282..26675a0b7 100644
--- a/python/aitemplate/compiler/transform/remove_unused_ops.py
+++ b/python/aitemplate/compiler/transform/remove_unused_ops.py
@@ -18,7 +18,7 @@
 from collections import deque
 from typing import List
 
-from ..base import Tensor
+from aitemplate.compiler.base import Tensor
 
 
 def remove_unused_ops(sorted_graph: List[Tensor]) -> None:
diff --git a/python/aitemplate/compiler/transform/split_large_concat_ops.py b/python/aitemplate/compiler/transform/split_large_concat_ops.py
index 06b4522af..c4c4b55c8 100644
--- a/python/aitemplate/compiler/transform/split_large_concat_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_concat_ops.py
@@ -22,12 +22,13 @@
 
 from typing import List
 
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+
 from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.transform import transform_utils
 
-from ...utils import graph_utils
-from .. import ops
-from ..base import Operator, Tensor
-from . import transform_utils
+from aitemplate.utils import graph_utils
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
index a25810375..911f57656 100644
--- a/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
@@ -21,12 +21,13 @@
 
 from typing import List
 
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
+from aitemplate.compiler.transform import transform_utils
 
-from ...utils import graph_utils, shape_utils
-from .. import ops
-from ..base import Operator, Tensor
-from . import transform_utils
+from aitemplate.utils import graph_utils, shape_utils
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/split_large_split_ops.py b/python/aitemplate/compiler/transform/split_large_split_ops.py
index 321afea63..8b0323cb8 100644
--- a/python/aitemplate/compiler/transform/split_large_split_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_split_ops.py
@@ -20,11 +20,12 @@
 
 from typing import List
 
-from ...utils import graph_utils
-from .. import ops
-from ..base import Operator, Tensor
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
 
-from . import toposort, transform_utils
+from aitemplate.compiler.transform import toposort, transform_utils
+
+from aitemplate.utils import graph_utils
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/compiler/transform/toposort.py b/python/aitemplate/compiler/transform/toposort.py
index c5e3c2fbe..c71b44937 100644
--- a/python/aitemplate/compiler/transform/toposort.py
+++ b/python/aitemplate/compiler/transform/toposort.py
@@ -17,7 +17,7 @@
 """
 from typing import List, Union
 
-from ..base import Tensor
+from aitemplate.compiler.base import Tensor
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index 312328bef..6cebfec2f 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -18,11 +18,12 @@
 import copy
 from typing import List
 
+from aitemplate.compiler.base import Operator, Tensor
+
 from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.compiler.transform import transform_utils
 
-from ...utils import graph_utils, shape_utils
-from ..base import Operator, Tensor
-from . import transform_utils
+from aitemplate.utils import graph_utils, shape_utils
 
 
 def _eliminate_cat(sorted_graph: List[Tensor]) -> List[Tensor]:
diff --git a/python/aitemplate/compiler/transform/transform_odd_alignment.py b/python/aitemplate/compiler/transform/transform_odd_alignment.py
index c572b5e76..77f4de59a 100644
--- a/python/aitemplate/compiler/transform/transform_odd_alignment.py
+++ b/python/aitemplate/compiler/transform/transform_odd_alignment.py
@@ -18,17 +18,21 @@
 from math import inf
 from typing import Dict, List, Tuple
 
-from ..base import IntImm, IntVar, Operator, Tensor
-from ..ops.common.view_ops import unsqueeze
-from ..ops.gemm_universal import bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr
-from ..ops.tensor import permute021
-
-from .apply_padding import get_padding_length
-from .fuse_utils import extract_only_one_op
-from .toposort import toposort
-from .transform_strided_ops import _is_supported_op as _is_supported_strided_op
-from .transform_strided_slice import _is_supported_op as _is_supported_strided_slice
-from .transform_utils import (
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.ops.common.view_ops import unsqueeze
+from aitemplate.compiler.ops.gemm_universal import bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr
+from aitemplate.compiler.ops.tensor import permute021
+
+from aitemplate.compiler.transform.apply_padding import get_padding_length
+from aitemplate.compiler.transform.fuse_utils import extract_only_one_op
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_strided_ops import (
+    _is_supported_op as _is_supported_strided_op,
+)
+from aitemplate.compiler.transform.transform_strided_slice import (
+    _is_supported_op as _is_supported_strided_slice,
+)
+from aitemplate.compiler.transform.transform_utils import (
     can_be_constant_folded,
     copy_src_op_attributes,
     copy_tensor_attributes,
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index 53ee141fc..55edb037e 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -18,14 +18,12 @@
 """
 from typing import Callable, List, Tuple, Type, Union
 
-from aitemplate.utils.shape_utils import is_singleton_dimension
-
-from .. import ops
-from ..base import Operator, Tensor
-from ..ops.gemm_special.gemm_rrr_small_nk import gemm_rrr_small_nk
-from ..ops.gemm_universal.bmm_rcr import bmm_rcr
-from ..ops.gemm_universal.gemm_rrr import gemm_rrr
-from .transform_utils import (
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.ops.gemm_special.gemm_rrr_small_nk import gemm_rrr_small_nk
+from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
+from aitemplate.compiler.ops.gemm_universal.gemm_rrr import gemm_rrr
+from aitemplate.compiler.transform.transform_utils import (
     copy_src_op_attributes,
     copy_tensor_attributes,
     remove_dst_op_from_tensor,
@@ -33,6 +31,8 @@
     sanitize_sorted_graph,
 )
 
+from aitemplate.utils.shape_utils import is_singleton_dimension
+
 # pylint: disable=C0103,C0415,W0612
 
 
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 3be187323..5174ba389 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -19,16 +19,24 @@
 
 from typing import List
 
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.compiler.ops.tensor.slice_reshape_scatter import slice_reshape_scatter
+from aitemplate.compiler.ops.tensor.slice_scatter import slice_scatter
+from aitemplate.compiler.transform import transform_strided_ops_utils, transform_utils
+from aitemplate.compiler.transform.fuse_split import (
+    _fuse_split_and_group_gemm,
+    _fuse_split_and_strided_op,
+)
+from aitemplate.compiler.transform.transform_strided_op_and_view_op import (
+    _fuse_strided_op_and_view_op,
+)
+from aitemplate.compiler.transform.transform_strided_slice import (
+    _fuse_slice_and_strided_op,
+)
+
 from aitemplate.testing import detect_target
 
-from ...utils import graph_utils, shape_utils
-from ..base import IntImm, Operator, Tensor
-from ..ops.tensor.slice_reshape_scatter import slice_reshape_scatter
-from ..ops.tensor.slice_scatter import slice_scatter
-from . import transform_strided_ops_utils, transform_utils
-from .fuse_split import _fuse_split_and_group_gemm, _fuse_split_and_strided_op
-from .transform_strided_op_and_view_op import _fuse_strided_op_and_view_op
-from .transform_strided_slice import _fuse_slice_and_strided_op
+from aitemplate.utils import graph_utils, shape_utils
 
 # pylint: disable=W0612
 
diff --git a/python/aitemplate/compiler/transform/transform_strided_slice.py b/python/aitemplate/compiler/transform/transform_strided_slice.py
index 167a8e444..cb4996bfe 100644
--- a/python/aitemplate/compiler/transform/transform_strided_slice.py
+++ b/python/aitemplate/compiler/transform/transform_strided_slice.py
@@ -19,10 +19,11 @@
 
 from typing import List
 
-from ...utils import graph_utils
-from ..base import IntImm, IntVar, Operator, Tensor
-from ..ops.tensor.dynamic_slice import dynamic_slice, MAX_INT32
-from . import transform_strided_ops_utils, transform_utils
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice, MAX_INT32
+from aitemplate.compiler.transform import transform_strided_ops_utils, transform_utils
+
+from aitemplate.utils import graph_utils
 
 
 def _is_supported_gemm(gemm_op: Operator, slice_op: Operator) -> bool:
diff --git a/python/aitemplate/compiler/transform/transform_utils.py b/python/aitemplate/compiler/transform/transform_utils.py
index 9a2b66fd4..705587db1 100644
--- a/python/aitemplate/compiler/transform/transform_utils.py
+++ b/python/aitemplate/compiler/transform/transform_utils.py
@@ -20,13 +20,14 @@
 from collections import deque
 from typing import Dict, List, Union
 
+from aitemplate.compiler.base import Operator, Tensor
+
 from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.transform.mark_param_tensor import mark_param_tensor
+from aitemplate.compiler.transform.name_graph import name_graph
+from aitemplate.compiler.transform.remove_unused_ops import remove_unused_ops
 
-from ...utils import graph_utils
-from ..base import Operator, Tensor
-from .mark_param_tensor import mark_param_tensor
-from .name_graph import name_graph
-from .remove_unused_ops import remove_unused_ops
+from aitemplate.utils import graph_utils
 
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/python/aitemplate/frontend/__init__.py b/python/aitemplate/frontend/__init__.py
index dd3562e11..a9d32e278 100644
--- a/python/aitemplate/frontend/__init__.py
+++ b/python/aitemplate/frontend/__init__.py
@@ -12,8 +12,13 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from ..compiler.base import DynamicProfileStrategy, IntImm, IntVar, Tensor  # noqa: F401
-from . import nn
-from .nn.parameter import Parameter
+from aitemplate.compiler.base import (  # noqa: F401
+    DynamicProfileStrategy,
+    IntImm,
+    IntVar,
+    Tensor,
+)
+from aitemplate.frontend import nn
+from aitemplate.frontend.nn.parameter import Parameter
 
 __all__ = ["nn", "Parameter"]
diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index 5ee80550b..de3e91329 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -13,33 +13,32 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from .container import ModuleDict, ModuleList, Sequential
-from .embedding import BertEmbeddings, Embedding
-from .module import Module
-from .conv2d import *
-from .conv3d import *
-from .linear import *
-from .padding import *
-from .pool2d import *
-from .fpn_proposal import FPNProposal
-from .proposal import Proposal
-from .roi_ops import *
-from .upsample import *
-from .view_ops import *
-from .attention import (
+from aitemplate.frontend.nn.container import ModuleDict, ModuleList, Sequential
+from aitemplate.frontend.nn.embedding import BertEmbeddings, Embedding
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.conv2d import *
+from aitemplate.frontend.nn.conv3d import *
+from aitemplate.frontend.nn.linear import *
+from aitemplate.frontend.nn.padding import *
+from aitemplate.frontend.nn.pool2d import *
+from aitemplate.frontend.nn.fpn_proposal import FPNProposal
+from aitemplate.frontend.nn.proposal import Proposal
+from aitemplate.frontend.nn.roi_ops import *
+from aitemplate.frontend.nn.upsample import *
+from aitemplate.frontend.nn.view_ops import *
+from aitemplate.frontend.nn.attention import (
     CrossAttention,
     FlashAttention,
     MultiheadAttention,
-    ScaledDotProductAttention,
 )
-from .identity import Identity
-from .multiscale_attention import MultiScaleBlock
-from .vanilla_attention import (
+from aitemplate.frontend.nn.identity import Identity
+from aitemplate.frontend.nn.multiscale_attention import MultiScaleBlock
+from aitemplate.frontend.nn.vanilla_attention import (
     vanilla_attention,
     VanillaCrossAttention,
     VanillaMultiheadAttention,
 )
-from .dropout import *
-from .layer_norm import *
-from .group_norm import *
-from .dual_gemm import T5DenseGatedGeluDense
+from aitemplate.frontend.nn.dropout import *
+from aitemplate.frontend.nn.layer_norm import *
+from aitemplate.frontend.nn.group_norm import *
+from aitemplate.frontend.nn.dual_gemm import T5DenseGatedGeluDense
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 9bf9f9726..091f7d81a 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -15,17 +15,16 @@
 """
 Frontend for attention module
 """
+from aitemplate.compiler import ops
+from aitemplate.compiler.ops import flash_attention
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.dropout import Dropout
+from aitemplate.frontend.nn.linear import Linear
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 from aitemplate.testing import detect_target
 
-from ...compiler import ops
-from ...compiler.ops import flash_attention
-from ...compiler.ops.common.epilogue import FuncEnum
-from .. import Tensor
-from .dropout import Dropout
-from .linear import Linear
-from .module import Module
-from .parameter import Parameter
-
 
 class FlashAttention(Module):
     r"""FlashAttention provides an implementation for fused
diff --git a/python/aitemplate/frontend/nn/container.py b/python/aitemplate/frontend/nn/container.py
index da1f0381e..83769e80d 100644
--- a/python/aitemplate/frontend/nn/container.py
+++ b/python/aitemplate/frontend/nn/container.py
@@ -29,10 +29,10 @@
     Union,
 )
 
-from ...compiler.base import Tensor
+from aitemplate.compiler.base import Tensor
 
-from .module import Module, typename
-from .parameter import Parameter
+from aitemplate.frontend.nn.module import Module, typename
+from aitemplate.frontend.nn.parameter import Parameter
 
 __all__ = ["Sequential", "ModuleList", "ModuleDict", "ParameterList", "ParameterDict"]
 
diff --git a/python/aitemplate/frontend/nn/conv2d/__init__.py b/python/aitemplate/frontend/nn/conv2d/__init__.py
index 79375c8f1..50d8b0dd2 100644
--- a/python/aitemplate/frontend/nn/conv2d/__init__.py
+++ b/python/aitemplate/frontend/nn/conv2d/__init__.py
@@ -16,17 +16,25 @@
 """
 modules for conv2d
 """
-from .conv2d import Conv2d
-from .conv2d_bias import Conv2dBias
-from .conv2d_bias_add_hardswish import Conv2dBiasAddHardswish
-from .conv2d_bias_add_relu import Conv2dBiasAddRelu
-from .conv2d_bias_few_channels import Conv2dBiasFewChannels
-from .conv2d_bias_hardswish import Conv2dBiasHardswish
-from .conv2d_bias_hardswish_few_channels import Conv2dBiasHardswishFewChannels
-from .conv2d_bias_relu import Conv2dBiasRelu
-from .conv2d_bias_relu_few_channels import Conv2dBiasReluFewChannels
-from .conv2d_bias_sigmoid import Conv2dBiasSigmoid
-from .conv2d_depthwise import Conv2dDepthwise
-from .conv2d_depthwise_bias import Conv2dDepthwiseBias
-from .transposed_conv2d_bias import ConvTranspose2dBias
-from .transposed_conv2d_bias_relu import ConvTranspose2dBiasRelu
+from aitemplate.frontend.nn.conv2d.conv2d import Conv2d
+from aitemplate.frontend.nn.conv2d.conv2d_bias import Conv2dBias
+from aitemplate.frontend.nn.conv2d.conv2d_bias_add_hardswish import (
+    Conv2dBiasAddHardswish,
+)
+from aitemplate.frontend.nn.conv2d.conv2d_bias_add_relu import Conv2dBiasAddRelu
+from aitemplate.frontend.nn.conv2d.conv2d_bias_few_channels import Conv2dBiasFewChannels
+from aitemplate.frontend.nn.conv2d.conv2d_bias_hardswish import Conv2dBiasHardswish
+from aitemplate.frontend.nn.conv2d.conv2d_bias_hardswish_few_channels import (
+    Conv2dBiasHardswishFewChannels,
+)
+from aitemplate.frontend.nn.conv2d.conv2d_bias_relu import Conv2dBiasRelu
+from aitemplate.frontend.nn.conv2d.conv2d_bias_relu_few_channels import (
+    Conv2dBiasReluFewChannels,
+)
+from aitemplate.frontend.nn.conv2d.conv2d_bias_sigmoid import Conv2dBiasSigmoid
+from aitemplate.frontend.nn.conv2d.conv2d_depthwise import Conv2dDepthwise
+from aitemplate.frontend.nn.conv2d.conv2d_depthwise_bias import Conv2dDepthwiseBias
+from aitemplate.frontend.nn.conv2d.transposed_conv2d_bias import ConvTranspose2dBias
+from aitemplate.frontend.nn.conv2d.transposed_conv2d_bias_relu import (
+    ConvTranspose2dBiasRelu,
+)
diff --git a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
index 96e5efe1b..1a57137ad 100644
--- a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py
@@ -15,9 +15,9 @@
 """
 common module for conv_bias_act subgraph
 """
-from ....compiler import ops
-from ..module import Module
-from ..parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_add_act.py b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_add_act.py
index a08a4abf5..687a3e676 100644
--- a/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_add_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_add_act.py
@@ -15,9 +15,9 @@
 """
 common module for conv2d bias act residual add
 """
-from ....compiler import ops
-from ..module import Module
-from ..parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d.py b/python/aitemplate/frontend/nn/conv2d/conv2d.py
index fa1f1d0da..1b78611cf 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d.py
@@ -15,9 +15,9 @@
 """
 conv2d Module.
 """
-from ....compiler.ops import conv2d
-from ..module import Module
-from ..parameter import Parameter
+from aitemplate.compiler.ops import conv2d
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
index b3b99fae6..68c9aefdf 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
@@ -15,7 +15,7 @@
 """
 conv2d bias module
 """
-from .common_conv2d_bias_act import Conv2dBiasAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_act import Conv2dBiasAct
 
 
 class Conv2dBias(Conv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
index 343780b53..046f9b589 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py
@@ -15,7 +15,7 @@
 """
 conv2d + bias + residual + hardswish
 """
-from .common_conv2d_bias_add_act import Conv2dBiasAddAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_add_act import Conv2dBiasAddAct
 
 
 class Conv2dBiasAddHardswish(Conv2dBiasAddAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
index f12c7a3ec..99a779ab1 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py
@@ -15,7 +15,7 @@
 """
 General template module for conv2e + bias + residual + relu
 """
-from .common_conv2d_bias_add_act import Conv2dBiasAddAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_add_act import Conv2dBiasAddAct
 
 
 class Conv2dBiasAddRelu(Conv2dBiasAddAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
index f7494d54f..36cb07963 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py
@@ -15,7 +15,7 @@
 """
 conv2d bias for few channels
 """
-from .special_conv2d_bias_act import SpecialConv2dBiasAct
+from aitemplate.frontend.nn.conv2d.special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasFewChannels(SpecialConv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
index 89ccdd94f..55662e4f6 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py
@@ -15,7 +15,7 @@
 """
 conv bias hardswish module
 """
-from .common_conv2d_bias_act import Conv2dBiasAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_act import Conv2dBiasAct
 
 
 class Conv2dBiasHardswish(Conv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
index c6b6e4d0d..8cf6c3033 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -15,7 +15,7 @@
 """
 conv2d bias hardswish module for few channels
 """
-from .special_conv2d_bias_act import SpecialConv2dBiasAct
+from aitemplate.frontend.nn.conv2d.special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasHardswishFewChannels(SpecialConv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
index 197ce60ce..25e02abb9 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py
@@ -15,7 +15,7 @@
 """
 conv2d bias relu module
 """
-from .common_conv2d_bias_act import Conv2dBiasAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_act import Conv2dBiasAct
 
 
 class Conv2dBiasRelu(Conv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
index 214ae2726..56a2eb8fb 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py
@@ -15,7 +15,7 @@
 """
 conv2d bias relu for few channels
 """
-from .special_conv2d_bias_act import SpecialConv2dBiasAct
+from aitemplate.frontend.nn.conv2d.special_conv2d_bias_act import SpecialConv2dBiasAct
 
 
 class Conv2dBiasReluFewChannels(SpecialConv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
index 51c6eb839..65077f4c4 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py
@@ -15,7 +15,7 @@
 """
 conv2d bias sigmoid module
 """
-from .common_conv2d_bias_act import Conv2dBiasAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_act import Conv2dBiasAct
 
 
 class Conv2dBiasSigmoid(Conv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
index 93b95927c..6968c22e6 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise.py
@@ -15,8 +15,8 @@
 """
 conv2d depthwise module
 """
-from ....compiler.ops import conv2d_depthwise
-from .conv2d import Conv2d
+from aitemplate.compiler.ops import conv2d_depthwise
+from aitemplate.frontend.nn.conv2d.conv2d import Conv2d
 
 
 class Conv2dDepthwise(Conv2d):
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
index 6632db113..129b491d4 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_depthwise_bias.py
@@ -15,7 +15,7 @@
 """
 conv2d depthwise bias module
 """
-from .common_conv2d_bias_act import Conv2dBiasAct
+from aitemplate.frontend.nn.conv2d.common_conv2d_bias_act import Conv2dBiasAct
 
 
 class Conv2dDepthwiseBias(Conv2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/special_conv2d_bias_act.py b/python/aitemplate/frontend/nn/conv2d/special_conv2d_bias_act.py
index 63d9751dc..d713908f9 100644
--- a/python/aitemplate/frontend/nn/conv2d/special_conv2d_bias_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/special_conv2d_bias_act.py
@@ -15,9 +15,9 @@
 """
 common module for conv_bias_act subgraph
 """
-from ....compiler import ops
-from ..module import Module
-from ..parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
index 8fc7e6c45..a298478c1 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py
@@ -12,7 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from .transposed_conv2d_bias_act import ConvTranspose2dBiasAct
+from aitemplate.frontend.nn.conv2d.transposed_conv2d_bias_act import (
+    ConvTranspose2dBiasAct,
+)
 
 
 class ConvTranspose2dBias(ConvTranspose2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
index 2dc54ab4e..628932729 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py
@@ -15,9 +15,9 @@
 """
 common module for ConvTranspose2d_bias_act subgraph
 """
-from ....compiler import ops
-from ..module import Module
-from ..parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
index a2d89c848..079ed7b57 100644
--- a/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py
@@ -15,7 +15,9 @@
 """
 conv2d bias relu module
 """
-from .transposed_conv2d_bias_act import ConvTranspose2dBiasAct
+from aitemplate.frontend.nn.conv2d.transposed_conv2d_bias_act import (
+    ConvTranspose2dBiasAct,
+)
 
 
 class ConvTranspose2dBiasRelu(ConvTranspose2dBiasAct):
diff --git a/python/aitemplate/frontend/nn/conv3d.py b/python/aitemplate/frontend/nn/conv3d.py
index 69b6f8e97..214f75018 100644
--- a/python/aitemplate/frontend/nn/conv3d.py
+++ b/python/aitemplate/frontend/nn/conv3d.py
@@ -15,9 +15,9 @@
 """
 conv3d Module.
 """
-from ...compiler.ops import conv3d, conv3d_bias, depthwise_conv3d
-from .module import Module
-from .parameter import Parameter
+from aitemplate.compiler.ops import conv3d, conv3d_bias, depthwise_conv3d
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/dropout.py b/python/aitemplate/frontend/nn/dropout.py
index 353ae44d7..91874de30 100644
--- a/python/aitemplate/frontend/nn/dropout.py
+++ b/python/aitemplate/frontend/nn/dropout.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """Dropout/DropPath placeholder"""
-from .module import Module
+from aitemplate.frontend.nn.module import Module
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/dual_gemm.py b/python/aitemplate/frontend/nn/dual_gemm.py
index 109aa7450..17c84e5f7 100644
--- a/python/aitemplate/frontend/nn/dual_gemm.py
+++ b/python/aitemplate/frontend/nn/dual_gemm.py
@@ -15,10 +15,10 @@
 """
 Frontend for attention module
 """
-from ...compiler import ops
-from .linear import Linear
-from .module import Module
-from .parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.linear import Linear
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/embedding.py b/python/aitemplate/frontend/nn/embedding.py
index e5ee72c69..f5144eca1 100644
--- a/python/aitemplate/frontend/nn/embedding.py
+++ b/python/aitemplate/frontend/nn/embedding.py
@@ -12,15 +12,14 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+from aitemplate.compiler import ops
+from aitemplate.compiler.public import FuncEnum
+from aitemplate.frontend.nn.dropout import Dropout
+from aitemplate.frontend.nn.layer_norm import LayerNorm
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 from aitemplate.testing import detect_target
 
-from ...compiler import ops
-from ...compiler.public import FuncEnum
-from .dropout import Dropout
-from .layer_norm import LayerNorm
-from .module import Module
-from .parameter import Parameter
-
 
 class Embedding(Module):
     r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
diff --git a/python/aitemplate/frontend/nn/fpn_proposal.py b/python/aitemplate/frontend/nn/fpn_proposal.py
index 8645a82a2..3f4f12e8f 100644
--- a/python/aitemplate/frontend/nn/fpn_proposal.py
+++ b/python/aitemplate/frontend/nn/fpn_proposal.py
@@ -17,9 +17,9 @@
 """
 import numpy as np
 
-from ...compiler import ops
-from ...compiler.base import Tensor
-from .proposal import generate_shifted_anchors, Proposal
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.frontend.nn.proposal import generate_shifted_anchors, Proposal
 
 
 def generate_fpn_anchors(im_h, im_w, feat_strides, scales, ratios, batch_size, dtype):
diff --git a/python/aitemplate/frontend/nn/group_norm.py b/python/aitemplate/frontend/nn/group_norm.py
index af8ea7a53..4d93a3d06 100644
--- a/python/aitemplate/frontend/nn/group_norm.py
+++ b/python/aitemplate/frontend/nn/group_norm.py
@@ -15,9 +15,9 @@
 """
 GroupNorm module
 """
-from ...compiler import ops
-from .module import Module
-from .parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/identity.py b/python/aitemplate/frontend/nn/identity.py
index ac51ae53d..272d8c320 100644
--- a/python/aitemplate/frontend/nn/identity.py
+++ b/python/aitemplate/frontend/nn/identity.py
@@ -15,7 +15,7 @@
 """
 Identity module.
 """
-from .module import Module
+from aitemplate.frontend.nn.module import Module
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/layer_norm.py b/python/aitemplate/frontend/nn/layer_norm.py
index 8b6f9988e..90331baae 100644
--- a/python/aitemplate/frontend/nn/layer_norm.py
+++ b/python/aitemplate/frontend/nn/layer_norm.py
@@ -15,9 +15,9 @@
 """
 LayerNorm module.
 """
-from ...compiler import ops
-from .module import Module
-from .parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/linear.py b/python/aitemplate/frontend/nn/linear.py
index fa0bac88b..61740b281 100644
--- a/python/aitemplate/frontend/nn/linear.py
+++ b/python/aitemplate/frontend/nn/linear.py
@@ -15,12 +15,11 @@
 """
 Linear module.
 """
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 from aitemplate.testing import detect_target
 
-from ...compiler import ops
-from .module import Module
-from .parameter import Parameter
-
 
 class Linear(Module):
     r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
diff --git a/python/aitemplate/frontend/nn/module.py b/python/aitemplate/frontend/nn/module.py
index ae02926a0..c51a49db9 100644
--- a/python/aitemplate/frontend/nn/module.py
+++ b/python/aitemplate/frontend/nn/module.py
@@ -15,8 +15,8 @@
 from collections import namedtuple, OrderedDict
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
 
-from ...compiler.base import Tensor
-from .parameter import Parameter
+from aitemplate.compiler.base import Tensor
+from aitemplate.frontend.nn.parameter import Parameter
 
 
 class _IncompatibleKeys(
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 5b982b906..ebc2b2d87 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -23,14 +23,14 @@
 
 import numpy
 
-from ...compiler import ops
-from ...compiler.ops.common.epilogue import FuncEnum
-from .. import Tensor
-from .conv3d import Conv3d
-from .dropout import Dropout, DropPath
-from .identity import Identity
-from .linear import Linear
-from .module import Module
+from aitemplate.compiler import ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.conv3d import Conv3d
+from aitemplate.frontend.nn.dropout import Dropout, DropPath
+from aitemplate.frontend.nn.identity import Identity
+from aitemplate.frontend.nn.linear import Linear
+from aitemplate.frontend.nn.module import Module
 
 _LOGGER = logging.getLogger(__name__)
 
diff --git a/python/aitemplate/frontend/nn/padding.py b/python/aitemplate/frontend/nn/padding.py
index c5294f5ad..dfdca6fa9 100644
--- a/python/aitemplate/frontend/nn/padding.py
+++ b/python/aitemplate/frontend/nn/padding.py
@@ -15,8 +15,8 @@
 """
 Padding related modules.
 """
-from ...compiler.ops import ndhwc3to8, nhwc3to8
-from .module import Module
+from aitemplate.compiler.ops import ndhwc3to8, nhwc3to8
+from aitemplate.frontend.nn.module import Module
 
 
 class Nhwc3to8(Module):
diff --git a/python/aitemplate/frontend/nn/parameter.py b/python/aitemplate/frontend/nn/parameter.py
index 8caea006c..5c5e1af9d 100644
--- a/python/aitemplate/frontend/nn/parameter.py
+++ b/python/aitemplate/frontend/nn/parameter.py
@@ -15,7 +15,7 @@
 """
 Parameter definition.
 """
-from ...compiler.base import Tensor
+from aitemplate.compiler.base import Tensor
 
 
 class Parameter(object):
diff --git a/python/aitemplate/frontend/nn/pool2d.py b/python/aitemplate/frontend/nn/pool2d.py
index 212847d77..a1eb439c2 100644
--- a/python/aitemplate/frontend/nn/pool2d.py
+++ b/python/aitemplate/frontend/nn/pool2d.py
@@ -15,8 +15,8 @@
 """
 pool2d-family modules.
 """
-from ...compiler.ops import avg_pool2d, max_pool2d
-from .module import Module
+from aitemplate.compiler.ops import avg_pool2d, max_pool2d
+from aitemplate.frontend.nn.module import Module
 
 
 class MaxPool2d(Module):
diff --git a/python/aitemplate/frontend/nn/proposal.py b/python/aitemplate/frontend/nn/proposal.py
index 59a590b6e..18b53f313 100644
--- a/python/aitemplate/frontend/nn/proposal.py
+++ b/python/aitemplate/frontend/nn/proposal.py
@@ -19,10 +19,10 @@
 
 import numpy as np
 
-from ...compiler import ops
-from ...compiler.base import Tensor
-from ...compiler.ops.common.epilogue import FuncEnum
-from .module import Module
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend.nn.module import Module
 
 
 def _mkanchors(widths, heights, x_ctr, y_ctr):
diff --git a/python/aitemplate/frontend/nn/roi_ops.py b/python/aitemplate/frontend/nn/roi_ops.py
index 12e1f7621..a3d17bbf2 100644
--- a/python/aitemplate/frontend/nn/roi_ops.py
+++ b/python/aitemplate/frontend/nn/roi_ops.py
@@ -15,8 +15,8 @@
 """
 RoiAlign-family modules.
 """
-from ...compiler.ops import multi_level_roi_align, roi_align
-from .module import Module
+from aitemplate.compiler.ops import multi_level_roi_align, roi_align
+from aitemplate.frontend.nn.module import Module
 
 
 class RoiAlign(Module):
diff --git a/python/aitemplate/frontend/nn/upsample.py b/python/aitemplate/frontend/nn/upsample.py
index aa6a90edd..619d97236 100644
--- a/python/aitemplate/frontend/nn/upsample.py
+++ b/python/aitemplate/frontend/nn/upsample.py
@@ -15,8 +15,8 @@
 """
 Unsampling2d module.
 """
-from ...compiler.ops import upsampling2d, upsampling2d_add
-from .module import Module
+from aitemplate.compiler.ops import upsampling2d, upsampling2d_add
+from aitemplate.frontend.nn.module import Module
 
 
 class Upsampling2d(Module):
diff --git a/python/aitemplate/frontend/nn/vanilla_attention.py b/python/aitemplate/frontend/nn/vanilla_attention.py
index b7ca8c309..7fe7f0377 100644
--- a/python/aitemplate/frontend/nn/vanilla_attention.py
+++ b/python/aitemplate/frontend/nn/vanilla_attention.py
@@ -17,12 +17,12 @@
 """
 from functools import partial
 
-from ...compiler import ops
-from .. import Tensor
-from .dropout import Dropout
-from .linear import Linear
-from .module import Module
-from .parameter import Parameter
+from aitemplate.compiler import ops
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.dropout import Dropout
+from aitemplate.frontend.nn.linear import Linear
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
 
 # pylint: disable=C0103
 
diff --git a/python/aitemplate/frontend/nn/view_ops.py b/python/aitemplate/frontend/nn/view_ops.py
index f4afc902e..1406ed9bf 100644
--- a/python/aitemplate/frontend/nn/view_ops.py
+++ b/python/aitemplate/frontend/nn/view_ops.py
@@ -15,8 +15,8 @@
 """
 View-related modules.
 """
-from ...compiler.ops import flatten, reshape
-from .module import Module
+from aitemplate.compiler.ops import flatten, reshape
+from aitemplate.frontend.nn.module import Module
 
 
 class Reshape(Module):
diff --git a/python/aitemplate/frontend/parameter.py b/python/aitemplate/frontend/parameter.py
index ebb060bd7..5c5e1af9d 100644
--- a/python/aitemplate/frontend/parameter.py
+++ b/python/aitemplate/frontend/parameter.py
@@ -15,7 +15,7 @@
 """
 Parameter definition.
 """
-from ..compiler.base import Tensor
+from aitemplate.compiler.base import Tensor
 
 
 class Parameter(object):
diff --git a/python/aitemplate/testing/__init__.py b/python/aitemplate/testing/__init__.py
index 7aeed2679..5f2eca031 100644
--- a/python/aitemplate/testing/__init__.py
+++ b/python/aitemplate/testing/__init__.py
@@ -15,8 +15,8 @@
 """
 testing module
 """
-from . import benchmark_ait, benchmark_pt
-from .detect_target import detect_target
+from aitemplate.testing import benchmark_ait, benchmark_pt
+from aitemplate.testing.detect_target import detect_target
 
 __all__ = [
     "benchmark_pt",
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 1031d6386..1ec630016 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -19,7 +19,7 @@
 import os
 from subprocess import PIPE, Popen
 
-from ..backend.target import CUDA, ROCM
+from aitemplate.backend.target import CUDA, ROCM
 
 # pylint: disable=W0702, W0612,R1732
 
diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index f527e0a88..001e62070 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -15,7 +15,7 @@
 
 # flake8: noqa
 
-from . import (
+from aitemplate.utils import (
     alignment,
     environ,
     graph_utils,
diff --git a/python/aitemplate/utils/mk_ck_lib/__init__.py b/python/aitemplate/utils/mk_ck_lib/__init__.py
index 0988106cc..ecadc7f17 100644
--- a/python/aitemplate/utils/mk_ck_lib/__init__.py
+++ b/python/aitemplate/utils/mk_ck_lib/__init__.py
@@ -15,4 +15,10 @@
 
 # flake8: noqa
 
-from . import conv2d_operation, gemm_operation, generator, library, manifest
+from aitemplate.utils.mk_ck_lib import (
+    conv2d_operation,
+    gemm_operation,
+    generator,
+    library,
+    manifest,
+)
diff --git a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
index 739b69579..931651b99 100644
--- a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
@@ -22,7 +22,7 @@
 
 # import library
 
-from . import library
+from aitemplate.utils.mk_ck_lib import library
 
 
 class Conv2DSpecialization(enum.Enum):
diff --git a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
index 3a6968aa5..28b44f308 100644
--- a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
@@ -20,7 +20,7 @@
 
 import jinja2
 
-from . import library
+from aitemplate.utils.mk_ck_lib import library
 
 # import library
 
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index bdc086ae8..91b44ea7d 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -15,7 +15,7 @@
 
 import copy
 
-from . import (
+from aitemplate.utils.mk_ck_lib import (
     conv2d_operation as conv,
     gemm_operation as gemm,
     groupnorm_operation as groupnorm,
diff --git a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
index 98b98191a..e61fa7ef9 100644
--- a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
@@ -19,7 +19,7 @@
 
 # import library
 
-from . import library
+from aitemplate.utils.mk_ck_lib import library
 
 
 @dataclass
diff --git a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
index c9d93d55a..6e28da94f 100644
--- a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
@@ -19,7 +19,7 @@
 
 # import library
 
-from . import library
+from aitemplate.utils.mk_ck_lib import library
 
 
 @dataclass
diff --git a/python/aitemplate/utils/mk_ck_lib/manifest.py b/python/aitemplate/utils/mk_ck_lib/manifest.py
index b6f5c6c0d..077ee9103 100644
--- a/python/aitemplate/utils/mk_ck_lib/manifest.py
+++ b/python/aitemplate/utils/mk_ck_lib/manifest.py
@@ -22,7 +22,7 @@
 import os.path
 import re
 
-from .library import OperationKind, OperationKindNames
+from aitemplate.utils.mk_ck_lib.library import OperationKind, OperationKindNames
 
 
 class Manifest:
diff --git a/python/aitemplate/utils/mk_ck_lib/softmax_operation.py b/python/aitemplate/utils/mk_ck_lib/softmax_operation.py
index f280236d1..9684137bf 100644
--- a/python/aitemplate/utils/mk_ck_lib/softmax_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/softmax_operation.py
@@ -19,7 +19,7 @@
 
 # import library
 
-from . import library
+from aitemplate.utils.mk_ck_lib import library
 
 
 @dataclass
diff --git a/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py b/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
index 373610a7a..8231bc7b6 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py
@@ -18,7 +18,12 @@
 import shutil
 import tempfile
 
-from . import extra_conv_emit, extra_cutlass_generator, extra_enum, extra_gemm_emit
+from aitemplate.utils.mk_cutlass_lib import (
+    extra_conv_emit,
+    extra_cutlass_generator,
+    extra_enum,
+    extra_gemm_emit,
+)
 
 
 def mk_cutlass_lib(template_path, dst_prefix=None):
diff --git a/python/aitemplate/utils/visualization/__init__.py b/python/aitemplate/utils/visualization/__init__.py
index e6a2db339..7514939ef 100644
--- a/python/aitemplate/utils/visualization/__init__.py
+++ b/python/aitemplate/utils/visualization/__init__.py
@@ -13,6 +13,6 @@
 #  limitations under the License.
 #
 
-from .plot import plot_graph
+from aitemplate.utils.visualization.plot import plot_graph
 
 __all__ = ["plot_graph"]

From 160efd678e587129f79938aa442ad5a08e58b193 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Thu, 9 Mar 2023 10:02:55 -0800
Subject: [PATCH 231/638] Move some A100 tests to V100 - first batch (#382)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/382

ATT

Reviewed By: alexanderguzhva

Differential Revision: D43920370

fbshipit-source-id: b387815948bff5b8791069c37683df8f3ff7273b
---
 tests/unittest/backend/test_gen_standalone.py |  9 +-
 .../compiler/test_constant_folding.py         | 38 +++++++-
 .../compiler/test_fuse_mm_elementwise.py      | 92 ++++++++++++-------
 .../compiler/test_fuse_mm_reshape_permute.py  | 15 +--
 .../compiler/test_fuse_permute_bmm.py         | 88 +++++++-----------
 .../compiler/test_fuse_permute_gemm.py        | 33 ++++++-
 ...st_fused_elementwise_complex_dependency.py | 44 ++++++---
 .../test_fused_elementwise_out_of_order.py    | 11 ++-
 8 files changed, 206 insertions(+), 124 deletions(-)

diff --git a/tests/unittest/backend/test_gen_standalone.py b/tests/unittest/backend/test_gen_standalone.py
index 746213b2c..a322efa75 100644
--- a/tests/unittest/backend/test_gen_standalone.py
+++ b/tests/unittest/backend/test_gen_standalone.py
@@ -25,6 +25,7 @@
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -164,13 +165,11 @@ def test_gen_standalone_f16(self):
         self._test_gen_standalone("gen_standalone_f16", "float16")
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gen_standalone_f32(self):
+    def test_gen_standalone_f32_sm80(self):
         self._test_gen_standalone("gen_standalone_f32", "float32")
 
 
+filter_test_cases_by_test_env(StridedOpCatPatternTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index 2c99d417b..c4354a48e 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -25,8 +25,10 @@
 
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 
 from parameterized import parameterized
@@ -81,7 +83,14 @@ def test_simple_constant_fold(self, dtype):
         # and add one constant, so the total size should be 3.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=3)
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_pad_constant_weight(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -128,7 +137,14 @@ def test_pad_constant_weight(self, dtype):
             expected_num_nodes=expected_num_nodes,
         )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_fold_long_chain(self, dtype):
         target = detect_target()
         if dtype == "float" and (target.name == "rocm" or int(target._arch) < 80):
@@ -171,7 +187,14 @@ def test_fold_long_chain(self, dtype):
         # The entire graph is turned into a constant.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_constant_folding_through_views(self, dtype):
         target = detect_target()
         if dtype == "float" and target.name == "rocm":
@@ -203,7 +226,14 @@ def test_constant_folding_through_views(self, dtype):
         # The entire graph is eliminated.
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_late_binding(self, dtype):
         target = detect_target()
         if dtype == "float" and (target.name == "rocm" or int(target._arch) < 80):
diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index 4db38e955..002c910bd 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -21,8 +21,11 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 
@@ -261,7 +264,14 @@ def _test_gemm_rcr_bias_add_add_relu(
             module.run_with_tensors(inputs, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_gemm_rcr_bias_add_fail(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -306,7 +316,14 @@ def test_gemm_rcr_bias_add_fail(self, dtype):
         module.run_with_tensors([X_pt, W_pt, B_pt, B1_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_gemm_rcr_bias_chained(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -366,7 +383,14 @@ def test_gemm_rcr_bias_chained(self, dtype):
         module.run_with_tensors([X_pt, W_pt, B_pt, D_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_gemm_rcr_bias_fail(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -737,11 +761,7 @@ def test_gemm_rcr_bias_mul_tanh(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_bias_add_float(self):
+    def test_gemm_rcr_bias_add_float_sm80(self):
         self._test_gemm_rcr_bias(
             [8], 16, 8, True, "gemm_rcr_bias_basic_decomposed_float", dtype="float"
         )
@@ -781,6 +801,9 @@ def test_gemm_rcr_bias_add_float(self):
         )
 
 
+filter_test_cases_by_test_env(FuseGemmRcrBiasCase)
+
+
 class FuseGemmRcrBiasActivationCase(unittest.TestCase):
     def _build_gemm_rcr_bias(self, M, N, K, decomposed, dtype):
         X_shape = [M, K]
@@ -1121,11 +1144,7 @@ def test_gemm_rcr_bias_gelu(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_bias_float(self):
+    def test_gemm_rcr_bias_float_sm80(self):
         self._test_gemm_rcr_bias_activation(
             [8],
             16,
@@ -1184,6 +1203,9 @@ def test_gemm_rcr_bias_float(self):
         )
 
 
+filter_test_cases_by_test_env(FuseGemmRcrBiasActivationCase)
+
+
 class FuseGemmRcrBiasSwishCase(unittest.TestCase):
     def _test_gemm_rcr_bias_swish(
         self, Ms, N, K, testname, dtype="float16", use_add=False
@@ -1261,11 +1283,7 @@ def test_gemm_rcr_add_swish(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_swish_float(self):
+    def test_gemm_rcr_swish_float_sm80(self):
         self._test_gemm_rcr_bias_swish(
             [8],
             16,
@@ -1291,6 +1309,9 @@ def test_gemm_rcr_swish_float(self):
         )
 
 
+filter_test_cases_by_test_env(FuseGemmRcrBiasSwishCase)
+
+
 class FuseBmmCcrAddCase(unittest.TestCase):
     def _test_bmm_ccr_add(
         self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
@@ -1429,11 +1450,7 @@ def test_bmm_ccr_add_negative(self):
         self._test_bmm_ccr_add_negative("bmm_ccr_add_negative_input", "other_input")
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_ccr_add_float(self):
+    def test_bmm_ccr_add_float_sm80(self):
         self._test_bmm_ccr_add(
             [8, 32], 32, 16, 8, "bmm_ccr_add_dynamic_float", dtype="float"
         )
@@ -1453,7 +1470,14 @@ def test_bmm_ccr_add_float(self):
             "bmm_ccr_add_negative_output", "is_output", dtype="float"
         )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_bmm_ccr_add_double_shared_input(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -1529,6 +1553,9 @@ def test_bmm_ccr_add_double_shared_input(self, dtype):
         self.assertTrue(torch.allclose(Y1_pt, y1, atol=1e-1, rtol=1e-1))
 
 
+filter_test_cases_by_test_env(FuseBmmCcrAddCase)
+
+
 class FuseBmmCrrAddCase(unittest.TestCase):
     def _test_bmm_crr_add(
         self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
@@ -1597,11 +1624,7 @@ def test_bmm_crr_add(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_crr_add_float(self):
+    def test_bmm_crr_add_float_sm80(self):
         self._test_bmm_crr_add(
             [8, 32], 32, 16, 8, "bmm_crr_add_dynamic_float", dtype="float"
         )
@@ -1613,6 +1636,9 @@ def test_bmm_crr_add_float(self):
         )
 
 
+filter_test_cases_by_test_env(FuseBmmCrrAddCase)
+
+
 class FuseBmmRrrAddCase(unittest.TestCase):
     def _test_bmm_rrr_add(
         self, Bs, M, N, K, testname, dtype="float16", do_not_fuse=False
@@ -1733,11 +1759,7 @@ def test_bmm_rrr_bias_add(self):
         self._test_bmm_rrr_bias_add([8], 32, 16, 8, [1, 16], "bmm_rrr_bias_add_03")
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_rrr_add_float(self):
+    def test_bmm_rrr_add_float_sm80(self):
         self._test_bmm_rrr_add(
             [8, 32], 32, 16, 8, "bmm_rrr_add_dynamic_float", dtype="float"
         )
@@ -1752,6 +1774,8 @@ def test_bmm_rrr_add_float(self):
         )
 
 
+filter_test_cases_by_test_env(FuseBmmRrrAddCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_fuse_mm_reshape_permute.py b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
index 05b974f8b..5e65c2581 100644
--- a/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
+++ b/tests/unittest/compiler/test_fuse_mm_reshape_permute.py
@@ -20,6 +20,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
     has_op,
@@ -29,6 +30,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMReshapePermuteTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_rcr_0213(
         self,
         ms,
@@ -104,7 +109,7 @@ def torch_f(x, w, b, has_bias, shape):
             # )
             # print(f"pt: {t} ms/iter")
 
-    def test_rcr_0213(self):
+    def test_rcr_0213_sm80(self):
         self._test_rcr_0213(
             [54],
             256,
@@ -125,11 +130,7 @@ def test_rcr_0213(self):
             should_fuse=False,
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rcr_0213_float(self):
+    def test_rcr_0213_float_sm80(self):
         self._test_rcr_0213(
             [29, 29 * 8],
             256,
@@ -143,5 +144,7 @@ def test_rcr_0213_float(self):
         )
 
 
+filter_test_cases_by_test_env(GEMMReshapePermuteTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_fuse_permute_bmm.py b/tests/unittest/compiler/test_fuse_permute_bmm.py
index c12a0dd28..687772871 100644
--- a/tests/unittest/compiler/test_fuse_permute_bmm.py
+++ b/tests/unittest/compiler/test_fuse_permute_bmm.py
@@ -21,8 +21,11 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 
@@ -119,11 +122,7 @@ def test_misalign_b_bmm(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_misalign_bmm_float(self):
+    def test_misalign_bmm_float_sm80(self):
         self._test_missing_alignment_bmm(
             [2, 4, 7],
             [2, 7, 8],
@@ -311,11 +310,7 @@ def test_ccr_to_rrr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_ccr_to_rrr_float(self):
+    def test_ccr_to_rrr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -355,11 +350,7 @@ def test_ccr_to_crr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_ccr_to_crr_float(self):
+    def test_ccr_to_crr_float_sm80(self):
         B = [1, 3]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -399,11 +390,7 @@ def test_ccr_to_rcr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_ccr_to_rcr_float(self):
+    def test_ccr_to_rcr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -443,11 +430,7 @@ def test_crr_to_ccr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_crr_to_ccr_float(self):
+    def test_crr_to_ccr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -487,11 +470,7 @@ def test_crr_to_rrr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_crr_to_rrr_float(self):
+    def test_crr_to_rrr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -531,11 +510,7 @@ def test_rcr_to_ccr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rcr_to_ccr_float(self):
+    def test_rcr_to_ccr_float_sm80(self):
         B = [1, 3]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -575,11 +550,7 @@ def test_rcr_to_rrr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rcr_to_rrr_float(self):
+    def test_rcr_to_rrr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -619,11 +590,7 @@ def test_rrr_to_crr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rrr_to_crr_float(self):
+    def test_rrr_to_crr_float_sm80(self):
         B = [1]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -658,11 +625,7 @@ def test_rrr_to_rcr(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rrr_to_rcr_float(self):
+    def test_rrr_to_rcr_float_sm80(self):
         B = [1, 3]
         batch_dim = shape_utils.gen_int_var_min_max(B)
         self._test_permute_bmm(
@@ -795,15 +758,18 @@ def test_gemm_broadcast_rrr_to_crr(self):
         self._test_gemm_broadcast_rrr_to_crr(False)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_broadcast_float(self):
+    def test_gemm_broadcast_float_sm80(self):
         self._test_gemm_broadcast_rcr_to_ccr(True, dtype="float")
         self._test_gemm_broadcast_rrr_to_crr(False, dtype="float")
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_permute_multiple_consumer(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -858,7 +824,14 @@ def test_permute_multiple_consumer(self, dtype):
         module.run_with_tensors(inputs, [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_permute_multiple_only_bmm_consumer(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -919,5 +892,6 @@ def test_permute_multiple_only_bmm_consumer(self, dtype):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
 
+filter_test_cases_by_test_env(FusePermuteBmmCase)
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_fuse_permute_gemm.py b/tests/unittest/compiler/test_fuse_permute_gemm.py
index 051ea6a01..720af1661 100644
--- a/tests/unittest/compiler/test_fuse_permute_gemm.py
+++ b/tests/unittest/compiler/test_fuse_permute_gemm.py
@@ -20,13 +20,24 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target, test_utils
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
 
 from parameterized import parameterized
 
 
 class FusePermuteGemmTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_no_fusion_odd_alignment(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -59,7 +70,14 @@ def test_no_fusion_odd_alignment(self, dtype):
         else:
             raise RuntimeError("invalid {dtype=}")
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_gemm_rrr_to_rcr(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -86,7 +104,14 @@ def test_gemm_rrr_to_rcr(self, dtype):
 
         torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_gemm_rcr_to_rrr(self, dtype):
         target = detect_target()
         if dtype == "float" and (int(target._arch) < 80 or target.name == "rocm"):
diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 902d77ee9..21b3cd463 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -24,8 +24,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import graph_utils
 
@@ -286,7 +288,14 @@ def test_fused_elementwise_non_elementwise_ops(self, dtype):
         self.assertTrue(torch.allclose(r2, r2_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_fused_elementwise_indirect_input_dependency(self, dtype):
         r"""
             X0   X1
@@ -365,11 +374,14 @@ def test_fused_elementwise_indirect_input_dependency(self, dtype):
         module.run_with_tensors(inputs, [r3])
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
     )
-    @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype):
         r"""
                 X0[M,K] X1[]
@@ -464,11 +476,14 @@ def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype)
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
     )
-    @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_multi_dependency(self, dtype):
         r"""
             X0   X1                X3
@@ -575,11 +590,14 @@ def test_fused_elementwise_multi_dependency(self, dtype):
         module.run_with_tensors(inputs, [r7])
         self.assertTrue(torch.allclose(r7, r7_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
     )
-    @parameterized.expand([("float16"), ("float")])
     def test_fused_elementwise_find_fusable_graph(self, dtype):
         r"""
                      X0
diff --git a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
index d8abf7a10..76d20ee07 100644
--- a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
+++ b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
@@ -24,8 +24,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 
 from parameterized import parameterized
@@ -33,7 +35,14 @@
 
 
 class FusedElementwiseOutOfOrderTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_fused_elementwise_out_of_order(self, dtype):
         r"""
            X0   X1

From 90cab80df4fc3f0c03122bba93b2fbd2e1faf7c6 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Thu, 9 Mar 2023 11:53:19 -0800
Subject: [PATCH 232/638] Move most test cases in test_serdes from A100 to V100
 (#383)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/383

ATT

Reviewed By: wushirong

Differential Revision: D43923044

fbshipit-source-id: 77a21ddf9a1a11180f9bde2b132dca43964e2a88
---
 tests/unittest/util/test_serdes.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/unittest/util/test_serdes.py b/tests/unittest/util/test_serdes.py
index 617bb599b..284391a72 100644
--- a/tests/unittest/util/test_serdes.py
+++ b/tests/unittest/util/test_serdes.py
@@ -25,6 +25,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import filter_test_cases_by_test_env
 from aitemplate.utils.serialization.serdes_code import (
     dump_program,
     get_inputs_from_graph,
@@ -209,12 +210,8 @@ def test_reshape(self):
         module.run_with_tensors([X_pt], [y])
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_group_gemm_rcr(self):
+    def test_group_gemm_rcr_sm80(self):
         target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Group Gemm need SM80 HW")
-            return
-
         M = 256
         K1 = 128
         N1 = 60
@@ -293,6 +290,9 @@ def test_dynamic_slice(self):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
 
+filter_test_cases_by_test_env(SerDesTestCase)
+filter_test_cases_by_test_env(SerDesSpecialOpTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From 43893e2b0ca5ea08523d92ba0cc45da27674116a Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Thu, 9 Mar 2023 11:53:19 -0800
Subject: [PATCH 233/638] Move some tests/unittest/ops/test_transpose_conv2d
 test cases from A100 to V100 (#384)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/384

ATT

Reviewed By: wushirong

Differential Revision: D43924250

fbshipit-source-id: 7b438ccc420d99352855b0e69088184df075afe2
---
 tests/unittest/ops/test_transpose_conv2d.py     | 17 +++++++----------
 .../unittest/ops/test_transpose_conv2d_bias.py  | 17 +++++++----------
 .../ops/test_transpose_conv2d_bias_relu.py      | 17 +++++++----------
 3 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/tests/unittest/ops/test_transpose_conv2d.py b/tests/unittest/ops/test_transpose_conv2d.py
index 2bb23d2bf..363418ece 100644
--- a/tests/unittest/ops/test_transpose_conv2d.py
+++ b/tests/unittest/ops/test_transpose_conv2d.py
@@ -19,13 +19,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skipIf(
-    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
-    "Not supported by CUDA arch < 80.",
-)
 class Conv2dTransposeTestCase(unittest.TestCase):
     def _test_transpose_conv2d(
         self,
@@ -81,11 +80,7 @@ def test_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fp32(self):
+    def test_fp32_sm80(self):
         self._test_transpose_conv2d(
             test_name="transpose_conv2d_fp32",
             dtype="float32",
@@ -97,6 +92,8 @@ def test_fp32(self):
         )
 
 
+filter_test_cases_by_test_env(Conv2dTransposeTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_transpose_conv2d_bias.py b/tests/unittest/ops/test_transpose_conv2d_bias.py
index 8cd97bde0..5172c8df2 100644
--- a/tests/unittest/ops/test_transpose_conv2d_bias.py
+++ b/tests/unittest/ops/test_transpose_conv2d_bias.py
@@ -19,14 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-@unittest.skipIf(
-    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
-    "Not supported by CUDA arch < 80.",
-)
 class Conv2dTransposeBiasTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -104,11 +103,7 @@ def test_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fp32(self):
+    def test_fp32_sm80(self):
         self._test_transpose_conv2d_bias(
             test_name="transpose_conv2d_bias_fp32",
             dtype="float32",
@@ -120,5 +115,7 @@ def test_fp32(self):
         )
 
 
+filter_test_cases_by_test_env(Conv2dTransposeBiasTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_transpose_conv2d_bias_relu.py b/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
index 95d7f6102..3ea8c63f6 100644
--- a/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
+++ b/tests/unittest/ops/test_transpose_conv2d_bias_relu.py
@@ -19,14 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-@unittest.skipIf(
-    (detect_target().name() == "cuda" and int(detect_target()._arch) < 80),
-    "Not supported by CUDA arch < 80.",
-)
 class Conv2dTransposeBiasReluTestCase(unittest.TestCase):
     def _test_transpose_conv2d_bias_relu(
         self,
@@ -96,11 +95,7 @@ def test_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fp32(self):
+    def test_fp32_sm80(self):
         self._test_transpose_conv2d_bias_relu(
             test_name="transpose_conv2d_bias_relu_fp32",
             dtype="float32",
@@ -112,6 +107,8 @@ def test_fp32(self):
         )
 
 
+filter_test_cases_by_test_env(Conv2dTransposeBiasReluTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From 190b13a903136ae6bc4f16c97fc0eaf7d501aaf8 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Thu, 9 Mar 2023 13:35:34 -0800
Subject: [PATCH 234/638] Add an option to shorten tensor names for plotting
 (#381)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/381

Setting AIT_PLOT_SHORTEN_TENSOR_NAMES=1 environment variable makes AITemplate to replace tensors with long names with shortened names (like URL shortener does) during building a plot.

Reviewed By: chenyang78

Differential Revision: D43918759

fbshipit-source-id: d820dfae8cbfdd5c9e0ac750709736a17a94ceeb
---
 docs/source/reference/env.rst                 | 2 ++
 python/aitemplate/utils/environ.py            | 8 ++++++++
 python/aitemplate/utils/misc.py               | 9 +++++++++
 python/aitemplate/utils/visualization/plot.py | 6 ++++++
 4 files changed, 25 insertions(+)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 86fcf78ac..e5616dbec 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -42,3 +42,5 @@ Miscellaneous
 -------------
 
 **LOGLEVEL**: It is used to control the logging level in Python. The default value is "INFO". "DEBUG" is useful for debugging.
+
+**AIT_PLOT_SHORTEN_TENSOR_NAMES**: If set to "1", shorten too long tensor names for a plot of a model graph, thus making a plot much easier to analyze visually. "0" by default.
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 24165ac46..44c5f40eb 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -58,3 +58,11 @@ def time_compilation() -> bool:
     Requires to install "time".
     """
     return os.getenv("AIT_TIME_COMPILATION", "0") == "1"
+
+
+def shorten_tensor_names_for_plots() -> bool:
+    """
+    When enabled, long tensor names will be replaced with a hash string,
+    making the graph representation significantly simpler.
+    """
+    return os.getenv("AIT_PLOT_SHORTEN_TENSOR_NAMES", "0") == "1"
diff --git a/python/aitemplate/utils/misc.py b/python/aitemplate/utils/misc.py
index a1b52babf..fa578c501 100644
--- a/python/aitemplate/utils/misc.py
+++ b/python/aitemplate/utils/misc.py
@@ -15,6 +15,7 @@
 """
 miscellaneous utilities
 """
+import hashlib
 import logging
 import os
 
@@ -41,3 +42,11 @@ def setup_logger(name):
     )
     root_logger.setLevel(LOG_LEVEL)
     return root_logger
+
+
+def short_str(s, length=8) -> str:
+    """
+    Returns a hashed string, somewhat similar to URL shortener.
+    """
+    hash_str = hashlib.sha256(s.encode()).hexdigest()
+    return hash_str[0:length]
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 00ebbfe7b..91550fa1e 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -19,6 +19,8 @@
 import os
 
 from aitemplate import compiler
+from aitemplate.utils.environ import shorten_tensor_names_for_plots
+from aitemplate.utils.misc import short_str
 from aitemplate.utils.visualization import op_attr_factory, pydot
 from aitemplate.utils.visualization.op_attr_factory import op_to_content
 from aitemplate.utils.visualization.web_template import (
@@ -124,6 +126,10 @@ def plot_graph(tensors, file_path: str) -> None:
     for tensor in sorted_graph:
         tensor_node = None
         tensor_name = tensor._attrs["name"]
+        if shorten_tensor_names_for_plots():
+            if tensor_name is not None and len(tensor_name) > 30:
+                tensor_name = short_str(tensor_name)
+
         if tensor in tensor_set:
             tensor_node = tensor_set[tensor]
         else:

From 65c607b39a992f96494cffbb9283caa8e33b8917 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Thu, 9 Mar 2023 17:23:45 -0800
Subject: [PATCH 235/638] Add json-based serialization of graph, which allows
 to re-import the graph in a third-party python code. (#388)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/388

`_graph.json` files will be generated in addition to `_graph.txt` files under the same circumstances. Such a file can be loaded using `json.loads()` call.

Reviewed By: chenyang78

Differential Revision: D43951586

fbshipit-source-id: 392ee5b43f4746f428a1d92ba2bcc5ab4cbf11bb
---
 python/aitemplate/utils/graph_utils.py | 20 +++++++
 python/aitemplate/utils/json_utils.py  | 72 ++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 python/aitemplate/utils/json_utils.py

diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index d3dcf6f52..cdda48714 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -49,6 +49,22 @@ def sorted_graph_debug_str(tensors) -> str:
     return "Tensors: {}\n\nOperators: {}\n\n".format(tensor_str, op_str)
 
 
+def sorted_graph_debug_json(tensors) -> str:
+    import json
+
+    from aitemplate.compiler.base import Tensor
+    from aitemplate.utils.json_utils import GraphJsonEncoder
+
+    if isinstance(tensors, Tensor):
+        tensors = [tensors]
+
+    json_dict = {}
+    json_dict["Tensors"] = tensors
+    json_dict["Operators"] = get_sorted_ops(tensors)
+
+    return json.dumps(json_dict, cls=GraphJsonEncoder)
+
+
 def sorted_graph_pseudo_code(tensors, with_shape=True) -> str:
     from aitemplate.compiler.base import Tensor
 
@@ -72,11 +88,15 @@ def dump_graph_debug_str_to_file(tensors, workdir, name):
         # Dump graph and pseudo code for debug only
         prefix = os.path.join(workdir, name)
         graph_path = prefix + "_graph.txt"
+        graph_json_path = prefix + "_graph.json"
         pseudo_code_path = prefix + "_pseudo_code.txt"
         graph_visual_path = prefix + "_graph_vis.html"
         with open(graph_path, "w") as f:
             f.write(sorted_graph_debug_str(tensors))
             _LOGGER.debug(f"Dumped {name} graph to {graph_path}")
+        with open(graph_json_path, "w") as f:
+            f.write(sorted_graph_debug_json(tensors))
+            _LOGGER.debug(f"Dumped {name} graph to {graph_json_path}")
         with open(pseudo_code_path, "w") as f:
             f.write(sorted_graph_pseudo_code(tensors))
             _LOGGER.debug(f"Dumped {name} pseudo code to {pseudo_code_path}")
diff --git a/python/aitemplate/utils/json_utils.py b/python/aitemplate/utils/json_utils.py
new file mode 100644
index 000000000..0b58072fe
--- /dev/null
+++ b/python/aitemplate/utils/json_utils.py
@@ -0,0 +1,72 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import json
+
+from aitemplate.compiler.base import (
+    _HostConstantTensorData,
+    _NumpyConstantTensorData,
+    _TorchConstantTensorData,
+    IntImm,
+    IntVar,
+    Operator,
+    Tensor,
+)
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+
+
+class GraphJsonEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, FuncEnum):
+            return obj.name
+        if isinstance(obj, Tensor):
+            return self._jsonize_tensor(obj)
+        if isinstance(obj, Operator):
+            return self._jsonize_operator(obj)
+        if isinstance(obj, TensorAccessor):
+            return obj.__dict__
+        if isinstance(obj, IntImm):
+            return obj.__dict__
+        if isinstance(obj, IntVar):
+            return obj.__dict__
+        if isinstance(obj, _HostConstantTensorData):
+            return "_HostConstantTensorData"
+        if isinstance(obj, _TorchConstantTensorData):
+            return "_TorchConstantTensorData"
+        if isinstance(obj, _NumpyConstantTensorData):
+            return "_NumpyConstantTensorData"
+
+        return str(obj)
+
+    def _jsonize_tensor(self, tensor: Tensor):
+        output = {}
+        for key in tensor._attrs.keys():
+            if key in ("src_ops", "dst_ops") and tensor._attrs[key] is not None:
+                output[key] = [x._attrs["name"] for x in tensor._attrs[key]]
+            else:
+                output[key] = tensor._attrs[key]
+        return output
+
+    def _jsonize_operator(self, op: Operator):
+        output = {}
+        for key in op._attrs.keys():
+            if (
+                key in ("inputs", "args", "outputs", "original_inputs")
+                and op._attrs[key] is not None
+            ):
+                output[key] = [x._attrs["name"] for x in op._attrs[key]]
+            else:
+                output[key] = op._attrs[key]
+        return output

From d352ff00a83b36eec85ce6c39e565901b1355c68 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 9 Mar 2023 18:20:00 -0800
Subject: [PATCH 236/638] moving bmm tests from a100 (#389)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/389

att

Reviewed By: chenyang78

Differential Revision: D43953467

fbshipit-source-id: 61dc27f91210bdf0984f6b0ba3645bd9daeed819
---
 tests/unittest/ops/test_attention.py       | 30 ++++++++--------------
 tests/unittest/ops/test_bmm.py             | 28 ++++++--------------
 tests/unittest/ops/test_bmm_add.py         | 28 ++++++--------------
 tests/unittest/ops/test_bmm_alpha.py       | 15 ++++-------
 tests/unittest/ops/test_bmm_permute.py     | 15 ++++-------
 tests/unittest/ops/test_bmm_rcr_n1.py      | 12 +++++----
 tests/unittest/ops/test_bmm_rrr_k1_tanh.py | 12 +++++----
 7 files changed, 50 insertions(+), 90 deletions(-)

diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index ecd77e2c3..8af709eaf 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -27,7 +27,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import benchmark_pt, detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 from einops import rearrange, repeat
@@ -144,7 +147,7 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-class attentionTestCase(unittest.TestCase):
+class AttentionTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         torch.manual_seed(0)
@@ -602,11 +605,7 @@ def test_mem_eff_attention_invalid_head_size_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_mem_eff_attention_fp32(self):
+    def test_mem_eff_attention_fp32_sm80(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
                 use_perm=use_perm,
@@ -621,10 +620,6 @@ def test_mem_eff_attention_fp32(self):
             )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
     def test_mem_eff_attention_bf16(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
@@ -775,11 +770,7 @@ def test_cross_attention_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_cross_attention_fp32(self):
+    def test_cross_attention_fp32_sm80(self):
         self._test_cross_attention(
             test_name="cross_attention_fp32",
             dtype="float32",
@@ -794,10 +785,6 @@ def test_cross_attention_fp32(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
     def test_cross_attention_bf16(self):
         self._test_cross_attention(
             test_name="cross_attention_bf16",
@@ -817,5 +804,8 @@ def test_cross_attention_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(AttentionTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index c0ec115ab..9bdb71e1c 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -160,11 +161,7 @@ def test_ccr(self):
             self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_float(self):
+    def test_bmm_fp32_sm80(self):
         self._test_rcr([128], [64], N=8, K=64, test_name="static_float", dtype="float")
         self._test_rcr(
             [1, 5, 77, 128],
@@ -190,11 +187,7 @@ def test_bmm_float(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_bfloat16(self):
+    def test_bmm_bf16(self):
         self._test_rcr(
             [128], [64], N=8, K=64, test_name="static_bfloat16", dtype="bfloat16"
         )
@@ -450,11 +443,7 @@ def test_ccr(self):
         self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_broadcast_float(self):
+    def test_bmm_broadcast_fp32_sm80(self):
         self._test_rcr_with_accessors(dtype="float")
         self._test_rcr_merge_with_accessors(dtype="float")
         self._test_rcr([2, 16, 8], [1, 32, 8], "broadcastable_b", dtype="float")
@@ -466,11 +455,7 @@ def test_bmm_broadcast_float(self):
         self._test_ccr([1, 8, 16], [2, 32, 8], "broadcastable_a", dtype="float")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b", dtype="float")
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_broadcast_bfloat16(self):
+    def test_bmm_broadcast_bf16(self):
         self._test_rcr_with_accessors(dtype="bfloat16")
         self._test_rcr_merge_with_accessors(dtype="bfloat16")
         self._test_rcr(
@@ -555,5 +540,8 @@ def test_rrr_fail(self, dtype="float16"):
             pass
 
 
+filter_test_cases_by_test_env(BMMBroadcastTestCase)
+filter_test_cases_by_test_env(BMMTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 1bd77cd82..a604e4877 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -20,6 +20,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -145,22 +146,14 @@ def test_ccr(self):
     def test_crr(self):
         self._test_crr(B=32, M=256, K=256, N=512)
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_add_float(self):
+    def test_bmm_add_fp32_sm80(self):
         self._test_rrr(B=8, M=32, K=8, N=64, dtype="float")
         self._test_ccr(
             B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_float", dtype="float"
         )
         self._test_crr(B=8, M=32, K=16, N=64, dtype="float")
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_add_bfloat16(self):
+    def test_bmm_add_bf16(self):
         self._test_rrr(B=8, M=32, K=8, N=64, dtype="bfloat16")
         self._test_ccr(
             B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_bfloat16", dtype="bfloat16"
@@ -320,11 +313,7 @@ def test_ccr(self):
             test_name="broadcastable_bias3d",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_add_broadcast_float(self):
+    def test_bmm_add_broadcast_fp32_sm80(self):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
@@ -347,11 +336,7 @@ def test_bmm_add_broadcast_float(self):
             dtype="float",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_add_broadcast_bfloat16(self):
+    def test_bmm_add_broadcast_bf16(self):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
@@ -375,5 +360,8 @@ def test_bmm_add_broadcast_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(BMMAddTestCase)
+filter_test_cases_by_test_env(BMMBroadcastTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_alpha.py b/tests/unittest/ops/test_bmm_alpha.py
index aa3440ebb..bf255c57f 100644
--- a/tests/unittest/ops/test_bmm_alpha.py
+++ b/tests/unittest/ops/test_bmm_alpha.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -282,11 +283,7 @@ def test_bmm_alpha(self):
             use_fp16_acc=False,
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_alpha_float(self):
+    def test_bmm_alpha_fp32_sm80(self):
         self._test_bmm_alpha(
             bmm_op=ops.bmm_rcr,
             is_div=False,
@@ -319,11 +316,7 @@ def test_bmm_alpha_float(self):
             dtype="float",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_alpha_bfloat16(self):
+    def test_bmm_alpha_bf16(self):
         self._test_bmm_alpha(
             bmm_op=ops.bmm_rcr,
             is_div=False,
@@ -357,5 +350,7 @@ def test_bmm_alpha_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(BMMAlphaTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index c989fdeae..9ee1038e7 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -116,11 +117,7 @@ def test_rcr(self):
             )
             self._test_rcr([24], [80], N=96, K=0, d1=12, test_name="permute1_zero_k")
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_permute_float(self):
+    def test_bmm_permute_fp32_sm80(self):
         self._test_rrr(
             [10], [8], N=88, K=64, d1=10, test_name="permute3_float", dtype="float"
         )
@@ -138,11 +135,7 @@ def test_bmm_permute_float(self):
             [10], [8], N=64, K=88, d1=10, test_name="permute3_float", dtype="float"
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_permute_bfloat16(self):
+    def test_bmm_permute_bf16(self):
         self._test_rrr(
             [10],
             [8],
@@ -173,6 +166,8 @@ def test_bmm_permute_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(BMMPermuteTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_rcr_n1.py b/tests/unittest/ops/test_bmm_rcr_n1.py
index 0e67119cf..27680952e 100644
--- a/tests/unittest/ops/test_bmm_rcr_n1.py
+++ b/tests/unittest/ops/test_bmm_rcr_n1.py
@@ -22,7 +22,10 @@
 from aitemplate.compiler.base import IntImm
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
@@ -122,10 +125,7 @@ def test_bmm_rcr_n1_float32(self):
             [1, 5, 8], [100], 1, 123, False, "static_float32", dtype="float32"
         )
 
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80, "bf16 is supported with CUDA sm80+"
-    )
-    def test_bmm_rcr_n1_bfloat16(self):
+    def test_bmm_rcr_n1_bf16(self):
         self._test_rcr_n1(
             [1],
             [1000000],
@@ -148,5 +148,7 @@ def test_bmm_rcr_n1_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(BMMRcrN1TestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
index 15ad23632..6114c1f93 100644
--- a/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
+++ b/tests/unittest/ops/test_bmm_rrr_k1_tanh.py
@@ -19,7 +19,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -58,14 +61,13 @@ def test_bmm_rrr_k1_tanh_float16(self):
     def test_bmm_rrr_k1_tanh_float32(self):
         self._test_rrr(B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1", dtype="float32")
 
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80, "bf16 is supported with CUDA sm80+"
-    )
-    def test_bmm_rrr_k1_tanh_bfloat16(self):
+    def test_bmm_rrr_k1_tanh_bf16(self):
         self._test_rrr(
             B=1024, M=32, K=1, N=32, test_name="bmm_rrr_k1", dtype="bfloat16"
         )
 
 
+filter_test_cases_by_test_env(BMMRrrK1TanhTestCase)
+
 if __name__ == "__main__":
     unittest.main()

From 17b601cec3bbee1c0167d2fcfe83a4d7c04430fe Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Thu, 9 Mar 2023 18:40:07 -0800
Subject: [PATCH 237/638] No need to reset internal constants (#385)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/385

Reduce assignment of constants that are not necessary.

Reviewed By: khabinov, morgendave, wushirong

Differential Revision: D43923768

fbshipit-source-id: 1ec6869dfa01964cd4ac0c3cdd7600c604ade9d5
---
 python/aitemplate/backend/codegen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 92fc24835..7ff934a58 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -515,7 +515,8 @@ def _codegen_param_setup(
                 )
             )
             self._codegen_bound_constant(tensor)
-            self.reset_constants.append(const_slice)
+            if not tensor._attrs.get("is_internal_constant", False):
+                self.reset_constants.append(const_slice)
             if self.constants_data_file is not None:
                 self._add_owned_constant(tensor)
         elif tensor._attrs["constant_folding_output_idx"] is not None:
@@ -526,7 +527,8 @@ def _codegen_param_setup(
                 )
             )
             self.tensor_slice.append(const_slice)
-            self.reset_constants.append(const_slice)
+            if not tensor._attrs.get("is_internal_constant", False):
+                self.reset_constants.append(const_slice)
         elif not isinstance(tensor, IntVarTensor):
             # Unbound constant. We will expect the user to set this via SetConstant.
             self.set_up_constant_names.append(

From f2eb69f4f204783cd742b0ed0adf0cadea2d4c06 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Thu, 9 Mar 2023 22:37:52 -0800
Subject: [PATCH 238/638] Add int_elementwise support in fx2ait (#392)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/392

We had int_elementwise support  for dynamic shape in aten2ait, but didn't add it to fx2ait. Fx2ait were able to calculate static shape, but recently IFR model requests dynamic shape calculation: https://fburl.com/code/7eag5h8a

Therefore added the support.

Reviewed By: khabinov, wushirong

Differential Revision: D43964418

fbshipit-source-id: 32e64e18e1acd1f6152b6448361fd472e4dbfe8d
---
 fx2ait/fx2ait/converters/utils.py             |  9 +++
 .../test/converters/test_ait_reshape.py       | 80 +++++++++++++------
 2 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index ea8894a0a..6d12804be 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -21,6 +21,7 @@
 from aitemplate.compiler.public import (
     elementwise,
     FuncEnum,
+    int_elementwise,
     permute,
     Tensor as AITTensor,
 )
@@ -83,6 +84,14 @@ def create_binary_op(
         )
         return res
 
+    if isinstance(lhs, IntVarTensor) or isinstance(rhs, IntVarTensor):
+        lhs = IntVarTensor(IntImm(lhs)) if isinstance(lhs, int) else lhs
+        rhs = IntVarTensor(IntImm(rhs)) if isinstance(rhs, int) else rhs
+
+        if not (isinstance(lhs, IntVarTensor) and isinstance(rhs, IntVarTensor)):
+            raise RuntimeError(f"Unexpected right operand {type(rhs)} on {name}: {rhs}")
+
+        return int_elementwise(op_type)(lhs, rhs)
     return elementwise(op_type)(lhs, rhs)
 
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_reshape.py b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
index 49cb6ccda..c5fd4e686 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_reshape.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
@@ -94,6 +94,48 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         )
 
     def test_with_getitem_reshape_dim0_dynamic(self) -> None:
+        class TestSimpleModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * d2
+                return x.reshape(d0, d)
+
+        model = TestSimpleModule().cuda()
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 4],
+            ],
+            inputs_max=[
+                [20, 3, 4],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
+        )
+
+        class TestComplexModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * (d2 + d1 - 3)  # d2+d1-3=d2
+                return x.reshape(d0, d)
+
+        model = TestComplexModule().cuda()
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
+        )
+
+    def test_with_getitem_reshape_dim01_dynamic(self) -> None:
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
                 d0 = x.size(dim=0)
@@ -108,7 +150,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 [2, 3, 4],
             ],
             inputs_max=[
-                [20, 3, 4],
+                [20, 30, 4],
             ],
             dtype_list=[
                 torch.float16,
@@ -120,25 +162,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
         )
 
-    ###TODO dim=0,1 dynamic has problem due to output size is not IntVar for dim1(P537903486).
-    # def test_with_getitem_reshape_dim01_dynamic(self) -> None:
-    #     class TestModule(torch.nn.Module):
-    #         def forward(self, x: torch.Tensor) -> torch.Tensor:
-    #             d0 = x.size(dim=0)
-    #             d1 = x.size(dim=1)
-    #             d2 = x.size(dim=2)
-    #             d = d1 * d2
-    #             return x.reshape(d0, d)
-
-    #     model = TestModule().cuda()
-    #     inputs = [
-    #         [
-    #             torch.randn(2, 3, 4).half().cuda(),
-    #         ],
-    #         [
-    #             torch.randn(20, 30, 4).half().cuda(),
-    #         ],
-    #     ]
-    #     self.run_test_with_dynamic_shape(
-    #         model, inputs, expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem}
-    #     )
+        class TestComplexModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d0 = x.size(dim=0)
+                d1 = x.size(dim=1)
+                d2 = x.size(dim=2)
+                d = d1 * (d2 - d0 + d0)
+                return x.reshape(d0, d)
+
+        model = TestComplexModule().cuda()
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
+        )

From cb7ae488b7c2a97a482d5c6d3083b23223a666d0 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 10 Mar 2023 07:23:57 -0800
Subject: [PATCH 239/638] Split A100 / V100 tests in test_dynamic_conv (#387)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/387

ATT

Reviewed By: wushirong

Differential Revision: D43949769

fbshipit-source-id: 3f6e44b5188a74611a3eb9669729b5870f84e4eb
---
 tests/unittest/ops/test_dynamic_conv.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tests/unittest/ops/test_dynamic_conv.py b/tests/unittest/ops/test_dynamic_conv.py
index b553c2616..9caf85523 100644
--- a/tests/unittest/ops/test_dynamic_conv.py
+++ b/tests/unittest/ops/test_dynamic_conv.py
@@ -22,7 +22,10 @@
 from aitemplate.compiler.base import DynamicProfileStrategy
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -79,12 +82,7 @@ def test_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fp32(self):
+    def test_fp32_sm80(self):
         self._test_conv_dynamic(
             test_name="conv_dynamic_fp32",
             dtype="float32",
@@ -231,16 +229,15 @@ def _test_conv3d_dynamic(
             y_transpose = y.permute((0, 4, 1, 2, 3))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=0.05, rtol=0.05))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_conv3d_fp16(self):
+    def test_conv3d_fp16_sm80(self):
         self._test_conv3d_dynamic(
             test_name="conv3d_dynamic_fp16",
             dtype="float16",
         )
 
 
+filter_test_cases_by_test_env(ConvDynamicTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()

From 935ec5a860e0dd1055a05eb07f3df84873ddbce7 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 10 Mar 2023 08:13:59 -0800
Subject: [PATCH 240/638] Split A100 / V100 tests in test_fused_elementwise_*
 (#390)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/390

ATT

Reviewed By: tenpercent

Differential Revision: D43959948

fbshipit-source-id: c898c2f986c33cf1cadba2f627a293d40baa2dc0
---
 tests/unittest/ops/test_fused_elementwise.py  | 457 +++++++++++-------
 .../ops/test_fused_elementwise_broadcast.py   |   4 -
 ..._fused_elementwise_with_strided_outputs.py |   8 -
 3 files changed, 275 insertions(+), 194 deletions(-)

diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 8c25bdcf6..030b2ce4f 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -30,18 +30,22 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
     get_torch_full_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 from parameterized import parameterized
 
-ait_dtype_to_pytorch = {"float16": torch.float16}
-if detect_target().name() != "rocm":
-    ait_dtype_to_pytorch["float32"] = torch.float32
-    if int(detect_target()._arch) >= 80:
-        ait_dtype_to_pytorch["bfloat16"] = torch.bfloat16
+
+_AIT_DTYPE_TO_PYTORCH_DTYPE = {
+    "float16": torch.float16,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
 
 
 class FusedElementwiseTestCase(unittest.TestCase):
@@ -103,12 +107,20 @@ def _test_fused_elementwise_constructor(self, ait_dtype):
         self.assertEqual(X1._attrs["depth"], 0)
         self.assertEqual(X4._attrs["depth"], 2)
 
-    def test_fused_elementwise_constructor(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_fused_elementwise_constructor(ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_fused_elementwise_constructor(self, ait_dtype):
+        self._test_fused_elementwise_constructor(ait_dtype)
 
     def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype):
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[
                 shape_utils.gen_int_var_min_max(batch_sizes),
@@ -149,43 +161,51 @@ def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype)
                     module.run_with_tensors([x1_pt], [x4])
                     self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
-    def test_fused_elementwise_e2e(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_fused_elementwise_e2e(
-                batch_sizes=[1024],
-                ms=[256],
-                ks=[128],
-                test_name=f"static_shapes_{ait_dtype}",
-                ait_dtype=ait_dtype,
-            )
-            self._test_fused_elementwise_e2e(
-                batch_sizes=[1, 99, 998, 1024],
-                ms=[256],
-                ks=[128],
-                test_name=f"dynamic_batch_size_{ait_dtype}",
-                ait_dtype=ait_dtype,
-            )
-            self._test_fused_elementwise_e2e(
-                batch_sizes=[1024],
-                ms=[1, 128, 256],
-                ks=[128],
-                test_name=f"dynamic_m_{ait_dtype}",
-                ait_dtype=ait_dtype,
-            )
-            self._test_fused_elementwise_e2e(
-                batch_sizes=[1024],
-                ms=[256],
-                ks=[1, 3, 8, 128],
-                test_name=f"dynamic_k_{ait_dtype}",
-                ait_dtype=ait_dtype,
-            )
-            self._test_fused_elementwise_e2e(
-                batch_sizes=[700, 80, 1024],
-                ms=[23, 78, 256],
-                ks=[10, 30, 128],
-                test_name=f"dynamic_all_{ait_dtype}",
-                ait_dtype=ait_dtype,
-            )
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_fused_elementwise_e2e(self, ait_dtype):
+        self._test_fused_elementwise_e2e(
+            batch_sizes=[1024],
+            ms=[256],
+            ks=[128],
+            test_name=f"static_shapes_{ait_dtype}",
+            ait_dtype=ait_dtype,
+        )
+        self._test_fused_elementwise_e2e(
+            batch_sizes=[1, 99, 998, 1024],
+            ms=[256],
+            ks=[128],
+            test_name=f"dynamic_batch_size_{ait_dtype}",
+            ait_dtype=ait_dtype,
+        )
+        self._test_fused_elementwise_e2e(
+            batch_sizes=[1024],
+            ms=[1, 128, 256],
+            ks=[128],
+            test_name=f"dynamic_m_{ait_dtype}",
+            ait_dtype=ait_dtype,
+        )
+        self._test_fused_elementwise_e2e(
+            batch_sizes=[1024],
+            ms=[256],
+            ks=[1, 3, 8, 128],
+            test_name=f"dynamic_k_{ait_dtype}",
+            ait_dtype=ait_dtype,
+        )
+        self._test_fused_elementwise_e2e(
+            batch_sizes=[700, 80, 1024],
+            ms=[23, 78, 256],
+            ks=[10, 30, 128],
+            test_name=f"dynamic_all_{ait_dtype}",
+            ait_dtype=ait_dtype,
+        )
 
     def _test_fused_elementwise_kernel1(self, ait_dtype):
         BATCH_SIZE = 1024
@@ -231,12 +251,20 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
         module.run_with_tensors(inputs, [x9])
         torch.testing.assert_close(x9, x9_pt, atol=1e-2, rtol=1e-2)
 
-    def test_fused_elementwise_kernel1(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_fused_elementwise_kernel1(ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_fused_elementwise_kernel1(self, ait_dtype):
+        self._test_fused_elementwise_kernel1(ait_dtype)
 
     def _test_sigmoid(self, input_size, test_name, ait_dtype):
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
             dtype=ait_dtype,
@@ -262,15 +290,23 @@ def _test_sigmoid(self, input_size, test_name, ait_dtype):
         self.assertEqual(torch.sum(x2 < 0), 0)
         self.assertEqual(torch.sum(x2 > 1), 0)
 
-    def test_sigmoid(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_sigmoid([1024, 2 * 1496], f"sigmoid_1_{ait_dtype}", ait_dtype)
-            self._test_sigmoid([1024, 23744], f"sigmoid_2_{ait_dtype}", ait_dtype)
-            self._test_sigmoid([1024, 70144], f"sigmoid_3_{ait_dtype}", ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_sigmoid(self, ait_dtype):
+        self._test_sigmoid([1024, 2 * 1496], f"sigmoid_1_{ait_dtype}", ait_dtype)
+        self._test_sigmoid([1024, 23744], f"sigmoid_2_{ait_dtype}", ait_dtype)
+        self._test_sigmoid([1024, 70144], f"sigmoid_3_{ait_dtype}", ait_dtype)
 
     def _test_tanh(self, input_size, test_name, ait_dtype):
         assert len(input_size) == 2
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
             dtype=ait_dtype,
@@ -291,15 +327,24 @@ def _test_tanh(self, input_size, test_name, ait_dtype):
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
-    def test_tanh(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_tanh([1024, 22400], f"tanh_1_{ait_dtype}", ait_dtype)
-            self._test_tanh([1024, 70144], f"tanh_2_{ait_dtype}", ait_dtype)
-            self._test_tanh([1024, 23744], f"tanh_3_{ait_dtype}", ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                # float16 device function is different for SM80 and lower
+                TestEnv.CUDA_SM80: [("float16"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_tanh(self, ait_dtype):
+        self._test_tanh([1024, 22400], f"tanh_1_{ait_dtype}", ait_dtype)
+        self._test_tanh([1024, 70144], f"tanh_2_{ait_dtype}", ait_dtype)
+        self._test_tanh([1024, 23744], f"tanh_3_{ait_dtype}", ait_dtype)
 
     def _test_gelu(self, input_size, test_name, ait_dtype, fast_gelu=False):
         assert len(input_size) == 2
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
             dtype=ait_dtype,
@@ -323,80 +368,18 @@ def _test_gelu(self, input_size, test_name, ait_dtype, fast_gelu=False):
         module.run_with_tensors([x1_pt], [x2])
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
-    def test_gelu(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_gelu([1024, 22400], f"gelu_1_{ait_dtype}", ait_dtype)
-            self._test_gelu([1024, 70144], f"fast_gelu_1_{ait_dtype}", ait_dtype, True)
-
-    def _test_power(self, input_size, exp, test_name, ait_dtype):
-        print(f"Running test {test_name} with exp = {exp}")
-        assert len(input_size) == 2
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
-        X1 = Tensor(
-            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-            dtype=ait_dtype,
-            name="input0",
-            is_input=True,
-        )
-        X2 = ops.elementwise(FuncEnum.POW)(X1, exp)
-        X2._attrs["is_output"] = True
-        X2._attrs["name"] = "output0"
-
-        target = detect_target()
-        module = compile_model(X2, target, "./tmp", test_name)
-
-        if abs(exp) < 1.0:
-            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype) + 0.5
-        else:
-            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
-        x2_pt = torch.pow(x1_pt, exp)
-
-        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
-        module.run_with_tensors([x1_pt], [x2])
-        # t, _, _ = module.benchmark_with_tensors([x1_pt], [x2], count=1000)
-        # bw = input_size[0] * input_size[1] * 2 * 2 / (t * 1e9 * 1e-3)
-        # print(f"BW: {bw} GB/s")
-        torch.testing.assert_close(x2, x2_pt, atol=1e-3, rtol=1e-3, equal_nan=True)
-
     @parameterized.expand(
-        itertools.product(
-            (0, 1, -1, 0.5, -0.5, 2, -2, 1.4, 3),
-            ([1024, 1024], [1025, 1025]),
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
     )
-    def test_power_float16(self, exp, shape):
-        dtype = "float16"
-        self._test_power(
-            shape,
-            exp,
-            f"pow_{shape[0]}_{shape[1]}_{exp}_{dtype}",
-            dtype,
-        )
-
-    @unittest.skipIf(
-        detect_target().name() != "cuda", "float32 dtype only supported in CUDA"
-    )
-    def test_power_float32(self):
-        self._test_power(
-            (1024, 1024),
-            2.5,
-            "pow_float32",
-            "float32",
-        )
-
-    @unittest.skipIf(
-        detect_target().name() != "cuda", "bfloat16 dtype only supported in CUDA"
-    )
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80, "bfloat16 dtype only supported in CUDA sm80+"
-    )
-    def test_power_bfloat16(self):
-        self._test_power(
-            (1024, 1024),
-            1.2,
-            "pow_bfloat16",
-            "bfloat16",
-        )
+    def test_gelu(self, ait_dtype):
+        self._test_gelu([1024, 22400], f"gelu_1_{ait_dtype}", ait_dtype)
+        self._test_gelu([1024, 70144], f"fast_gelu_1_{ait_dtype}", ait_dtype, True)
 
     def _test_min_max(
         self,
@@ -451,39 +434,55 @@ def _test_min_max(
 
         torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
 
-    def test_min(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_min_max(
-                [512, 512],
-                test_name=f"min_nonan_{ait_dtype}",
-                is_min=True,
-                add_nans=False,
-                ait_dtype=ait_dtype,
-            )
-            self._test_min_max(
-                [512, 512],
-                test_name=f"min_nan_{ait_dtype}",
-                is_min=True,
-                add_nans=True,
-                ait_dtype=ait_dtype,
-            )
-
-    def test_max(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_min_max(
-                [512, 512],
-                test_name=f"max_nonan_{ait_dtype}",
-                is_min=False,
-                add_nans=False,
-                ait_dtype=ait_dtype,
-            )
-            self._test_min_max(
-                [512, 512],
-                test_name=f"max_nan_{ait_dtype}",
-                is_min=False,
-                add_nans=True,
-                ait_dtype=ait_dtype,
-            )
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_min(self, ait_dtype):
+        self._test_min_max(
+            [512, 512],
+            test_name=f"min_nonan_{ait_dtype}",
+            is_min=True,
+            add_nans=False,
+            ait_dtype=ait_dtype,
+        )
+        self._test_min_max(
+            [512, 512],
+            test_name=f"min_nan_{ait_dtype}",
+            is_min=True,
+            add_nans=True,
+            ait_dtype=ait_dtype,
+        )
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_max(self, ait_dtype):
+        self._test_min_max(
+            [512, 512],
+            test_name=f"max_nonan_{ait_dtype}",
+            is_min=False,
+            add_nans=False,
+            ait_dtype=ait_dtype,
+        )
+        self._test_min_max(
+            [512, 512],
+            test_name=f"max_nan_{ait_dtype}",
+            is_min=False,
+            add_nans=True,
+            ait_dtype=ait_dtype,
+        )
 
     def _test_clamp(
         self,
@@ -494,7 +493,7 @@ def _test_clamp(
         ait_dtype,
     ) -> None:
         assert len(input_size) == 2 or len(input_size) == 0
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X0 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])] if input_size else [],
             dtype=ait_dtype,
@@ -517,17 +516,25 @@ def _test_clamp(
 
         self.assertTrue(torch.allclose(x1, x1_pt, atol=1e-2, rtol=1e-2))
 
-    def test_clamp(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_clamp([512, 106], -1, 1, f"clamp_0_{ait_dtype}", ait_dtype)
-            self._test_clamp([128, 46], None, 1, f"clamp_1_{ait_dtype}", ait_dtype)
-            self._test_clamp([56, 265], -1, None, f"clamp_2_{ait_dtype}", ait_dtype)
-            self._test_clamp([17, 123], 1, -1, f"clamp_3_{ait_dtype}", ait_dtype)
-            self._test_clamp([], 1, -1, f"clamp_4_{ait_dtype}", ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_clamp(self, ait_dtype):
+        self._test_clamp([512, 106], -1, 1, f"clamp_0_{ait_dtype}", ait_dtype)
+        self._test_clamp([128, 46], None, 1, f"clamp_1_{ait_dtype}", ait_dtype)
+        self._test_clamp([56, 265], -1, None, f"clamp_2_{ait_dtype}", ait_dtype)
+        self._test_clamp([17, 123], 1, -1, f"clamp_3_{ait_dtype}", ait_dtype)
+        self._test_clamp([], 1, -1, f"clamp_4_{ait_dtype}", ait_dtype)
 
     def _test_operator_overload(self, ait_dtype):
         input_size = [4, 2]
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=input_size,
             dtype=ait_dtype,
@@ -555,13 +562,21 @@ def _test_operator_overload(self, ait_dtype):
         module.run_with_tensors([x1_pt, x2_pt], [output])
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
-    def test_operator_overload(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_operator_overload(ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_operator_overload(self, ait_dtype):
+        self._test_operator_overload(ait_dtype)
 
     def _test_operator_overload_with_constant_number(self, ait_dtype):
         input_size = [4, 2]
-        torch_dtype = ait_dtype_to_pytorch[ait_dtype]
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=input_size,
             dtype=ait_dtype,
@@ -581,9 +596,87 @@ def _test_operator_overload_with_constant_number(self, ait_dtype):
         module.run_with_tensors([x1_pt], [output])
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
-    def test_operator_overload_with_constant_number(self):
-        for ait_dtype in ait_dtype_to_pytorch.keys():
-            self._test_operator_overload_with_constant_number(ait_dtype)
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_operator_overload_with_constant_number(self, ait_dtype):
+        self._test_operator_overload_with_constant_number(ait_dtype)
+
+
+class FusedElementwisePowerTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _test_power(self, input_size, exp, test_name, ait_dtype):
+        print(f"Running test {test_name} with exp = {exp}")
+        assert len(input_size) == 2
+        torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=ait_dtype,
+            name="input0",
+            is_input=True,
+        )
+        X2 = ops.elementwise(FuncEnum.POW)(X1, exp)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", test_name)
+
+        if abs(exp) < 1.0:
+            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype) + 0.5
+        else:
+            x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
+        x2_pt = torch.pow(x1_pt, exp)
+
+        x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
+        module.run_with_tensors([x1_pt], [x2])
+        # t, _, _ = module.benchmark_with_tensors([x1_pt], [x2], count=1000)
+        # bw = input_size[0] * input_size[1] * 2 * 2 / (t * 1e9 * 1e-3)
+        # print(f"BW: {bw} GB/s")
+        torch.testing.assert_close(x2, x2_pt, atol=1e-3, rtol=1e-3, equal_nan=True)
+
+    @parameterized.expand(
+        itertools.product(
+            (0, 1, -1, 0.5, -0.5, 2, -2, 1.4, 3),
+            ([1024, 1024], [1025, 1025]),
+        )
+    )
+    def test_power_float16(self, exp, shape):
+        dtype = "float16"
+        self._test_power(
+            shape,
+            exp,
+            f"pow_{shape[0]}_{shape[1]}_{exp}_{dtype}",
+            dtype,
+        )
+
+    def test_power_float32_sm80(self):
+        self._test_power(
+            (1024, 1024),
+            2.5,
+            "pow_float32",
+            "float32",
+        )
+
+    def test_power_bfloat16_bf16(self):
+        self._test_power(
+            (1024, 1024),
+            1.2,
+            "pow_bfloat16",
+            "bfloat16",
+        )
+
+
+filter_test_cases_by_test_env(FusedElementwisePowerTestCase)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_fused_elementwise_broadcast.py b/tests/unittest/ops/test_fused_elementwise_broadcast.py
index ace6989b7..65a91b404 100644
--- a/tests/unittest/ops/test_fused_elementwise_broadcast.py
+++ b/tests/unittest/ops/test_fused_elementwise_broadcast.py
@@ -99,10 +99,6 @@ def _test_different_dim(
             module.run_with_tensors(inputs, [x4])
             self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_different_dim_fp16(self):
         self._test_different_dim(
             batch_sizes=[1024],
diff --git a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
index 2419b3eb1..b5398eb38 100644
--- a/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
+++ b/tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py
@@ -122,10 +122,6 @@ def _test_fused_elementwise_with_strided_outputs(
                     # Do comparisons.
                     self.assertTrue(torch.allclose(x7, x7_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_all_aligned_fp16(self):
         self._test_fused_elementwise_with_strided_outputs(
             batch0_sizes=[1],
@@ -221,10 +217,6 @@ def test_all_aligned_fp32(self):
             dtype="float32",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_not_aligned_fp16(self):
         self._test_fused_elementwise_with_strided_outputs(
             batch0_sizes=[8],

From 01eacf9e0442f7068c958365335e9dc6b415cc56 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Fri, 10 Mar 2023 08:35:13 -0800
Subject: [PATCH 241/638] Replace pycuda with cuda-python in detect_target
 (#391)

Summary:
NVIDIA provides official python API for CUDA - [NVIDIA/cuda-python](https://github.com/NVIDIA/cuda-python).  We can use it instead of [inducer/pycuda](https://github.com/inducer/pycuda).

Benefits of `cuda-python`:
- Pre-installed in many NVIDIA containers (e.g. [tensorrt](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorrt) and [pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) containers)
- `pip install` is fast. **No compilation** during the installation.
- Versioning is more clear - it corresponds to cuda version (e.g. 11.6.1). In contrast `pycuda` uses date based versioning  - e.g. 2022.2.2 which does not correspond to cuda version.
- Official NVIDIA package

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/391

Reviewed By: chenyang78

Differential Revision: D43962921

Pulled By: tenpercent

fbshipit-source-id: a2a7a5c8fc627ba3f122f208ba38edc60afda396
---
 .circleci/config.yml                       |  2 +-
 docker/Dockerfile.cuda                     |  4 ++--
 python/aitemplate/testing/detect_target.py | 12 +++++++++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a9c258eff..e1bb1ba96 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -27,7 +27,7 @@ setup_env: &setup_env
           python3.8 setup.py bdist_wheel &&
           sudo python3.8 -m pip install --no-input dist/*.whl &&
           cd /home/circleci/project &&
-          python3.8 -m pip install pycuda &&
+          python3.8 -m pip install 'cuda-python<12.0.0' &&
           python3.8 -m pip install pytest &&
           python3.8 -m pip install torch &&
           python3.8 -m pip install numpy &&
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 4f75bf741..1d481809f 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -40,8 +40,8 @@ RUN bash /Install/install_doc_dep.sh
 # install Pytorch
 RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
 
-# install Pycuda
-RUN pip3 install pycuda
+# install NVIDIA cuda-python
+RUN pip3 install 'cuda-python<12.0.0'
 
 # for detection
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 1ec630016..f0731eea9 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -54,10 +54,16 @@ def _detect_cuda_with_nvidia_smi():
 
 def _detect_cuda():
     try:
-        import pycuda.driver as drv
+        from cuda import cuda
 
-        drv.init()
-        major, minor = drv.Device(0).compute_capability()
+        def assert_cuda(res):
+            if res[0].value != 0:
+                raise RuntimeError(f"CUDA error code={res[0].value}")
+            return res[1:]
+
+        assert_cuda(cuda.cuInit(0))
+        # Get Compute Capability of the first Visible device
+        major, minor = assert_cuda(cuda.cuDeviceComputeCapability(0))
         comp_cap = major * 10 + minor
         if comp_cap >= 90:
             return "90"

From 13ebc76c346c0d8b0564a7b66513a3a753c53b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serhat=20Varolg=C3=BCnes?= <svarolgunes@meta.com>
Date: Fri, 10 Mar 2023 09:38:19 -0800
Subject: [PATCH 242/638] Eliminate permute over singleton dims (#378)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/378

Since when permute operation moves only a (or multiple) singleton dimension(s) memory layout does not change, it can be replaced with a reshape operation. New transform function for detecting and replacing operators in such cases is implemented. It's added among compiler optimization functions list. Unittests for this new functionality is added.

Reviewed By: chenyang78

Differential Revision: D43878517

fbshipit-source-id: 521017f1e8610bca07e40e6e63f5c39bb33df0d6
---
 .../compiler/transform/optimize_graph.py      |   4 +
 .../transform/transform_permute_to_reshape.py | 124 ++++++++++++++++++
 .../test_transform_permute_to_reshape.py      | 117 +++++++++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 python/aitemplate/compiler/transform/transform_permute_to_reshape.py
 create mode 100644 tests/unittest/compiler/test_transform_permute_to_reshape.py

diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index ae68548d4..c227c2b14 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -40,6 +40,9 @@
 from aitemplate.compiler.transform.transform_odd_alignment import (
     transform_odd_alignment,
 )
+from aitemplate.compiler.transform.transform_permute_to_reshape import (
+    transform_permute_to_reshape,
+)
 from aitemplate.compiler.transform.transform_special_ops import transform_special_ops
 from aitemplate.compiler.transform.transform_strided_ops import transform_strided_ops
 
@@ -95,6 +98,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         split_large_slice_scatter_ops,
         split_large_concat_ops,
         split_large_split_ops,
+        transform_permute_to_reshape,
         transform_memory_ops,
     ]
 
diff --git a/python/aitemplate/compiler/transform/transform_permute_to_reshape.py b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
new file mode 100644
index 000000000..2c65bc677
--- /dev/null
+++ b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
@@ -0,0 +1,124 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Transform permute to reshape wherever applicable.
+"""
+from typing import List
+
+from ...utils import graph_utils
+from ..base import IntImm, Operator, Tensor
+from ..ops import reshape
+from . import transform_utils
+from .toposort import toposort
+
+
+def _check_permute_to_reshape(op: Operator) -> bool:
+    """Check if applicable to replace permute with reshape.
+
+    Args:
+        op (Operator): reshape op
+
+    Returns:
+        bool: False if operation is not a permute or a permute with memory
+            layout modification otherwise True.
+    """
+    if not op._attrs["op"].startswith("permute"):
+        return False
+
+    inputs = op._attrs["inputs"]
+
+    assert (
+        len(inputs) == 1
+    ), "Permute operation {} should have 1 input, got {} instead".format(
+        op._attrs["op"], len(inputs)
+    )
+
+    input_shape = inputs[0].shape()
+
+    if op._attrs["op"] == "permute":
+        permutation = list(op._attrs["dims"])
+    elif op._attrs["op"] == "permute021":
+        n_dims = len(input_shape)
+        permutation = list(range(n_dims - 2)) + [n_dims - 1, n_dims - 2]
+    elif op._attrs["op"] == "permute102":
+        permutation = [1, 0, 2]
+    elif op._attrs["op"] == "permute210":
+        permutation = [2, 1, 0]
+    elif op._attrs["op"] == "permute0213":
+        permutation = [0, 2, 1, 3]
+    else:
+        raise NotImplementedError(
+            f"Not implemented for permute operation: {op._attrs['op']}"
+        )
+
+    # Get non-singular dimension indices
+    permutation = [
+        dim_idx
+        for dim_idx in permutation
+        if not isinstance(input_shape[dim_idx], IntImm)
+        or input_shape[dim_idx].value() != 1
+    ]
+    is_reshape = permutation == sorted(permutation)
+    return is_reshape
+
+
+def transform_permute_to_reshape(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """Convert permute to reshape wherever applicable.
+
+    When permute op involves moving one or more dimensions with size
+    1 around where the order of non-singular dimensions is preserved,
+    it's basically a reshape op, i.e. the underlying memory layout
+    does not change.
+
+    Example:
+        [256x5x1x32] -> [256x5x32x1] (with 0132) is a reshape
+        [256x1x5x1x32] -> [256x5x32x1x1] (with 02431) is a reshape
+        [256x5x1x32] -> [256x32x5x1] (with 0312) is not a reshape
+
+    Args:
+        sorted_graph (List[Tensor]): input graph
+        workdir (str, optional): current workdir for dumping debug info. Defaults to None.
+
+    Returns:
+        List[Tensor]: optimized graph
+    """
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+
+    has_modified = False
+    for op in sorted_ops:
+        if not _check_permute_to_reshape(op):
+            continue
+
+        has_modified = True
+
+        permute_input = op._attrs["inputs"][0]
+        permute_output = op._attrs["outputs"][0]
+        output_shape = permute_output.shape()
+
+        transform_utils.remove_dst_op_from_tensor(permute_input, op)
+
+        reshape_op = reshape()
+        reshape_output = reshape_op(permute_input, output_shape)
+
+        transform_utils.replace_tensor(permute_output, reshape_output)
+
+        sorted_graph.append(reshape_output)
+
+    if has_modified:
+        sorted_graph = toposort(sorted_graph)
+        transform_utils.sanitize_sorted_graph(sorted_graph)
+    return sorted_graph
diff --git a/tests/unittest/compiler/test_transform_permute_to_reshape.py b/tests/unittest/compiler/test_transform_permute_to_reshape.py
new file mode 100644
index 000000000..8b846b9ac
--- /dev/null
+++ b/tests/unittest/compiler/test_transform_permute_to_reshape.py
@@ -0,0 +1,117 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import IntVar, Tensor
+from aitemplate.testing import detect_target, test_utils
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+from parameterized import parameterized
+
+_PERMUTE_OPS = (
+    "permute",
+    "permute021",
+    "permute102",
+    "permute210",
+    "permute0213",
+)
+
+
+def _generate_model_name(shape, permutation, is_reshape, dtype, is_complex):
+    model_name = "_".join(
+        [
+            ("test_permute_complex" if is_complex else "test_permute"),
+            ("to_reshape" if is_reshape else "not_to_reshape"),
+            "x".join([str(s) for s in shape]),
+            "".join([str(s) for s in permutation]),
+            dtype,
+        ]
+    )
+    return model_name
+
+
+class TransformPermuteToReshapeTestCase(unittest.TestCase):
+    @parameterized.expand(
+        [
+            # no singleton
+            ([32, 51, 12], [1, 2, 0], False, "float16"),
+            ([32, 51, 12], [1, 2, 0], False, "float32"),
+            # one singleton dimension
+            ([32, 51, 1], [0, 2, 1], True, "float16"),
+            ([32, 51, 1], [0, 2, 1], True, "float32"),
+            ([32, 51, 1], [1, 2, 0], False, "float16"),
+            ([32, 51, 1], [1, 2, 0], False, "float32"),
+            # two same sized dimensions
+            ([32, 32, 1], [2, 0, 1], True, "float16"),
+            ([32, 32, 1], [2, 0, 1], True, "float32"),
+            ([32, 32, 1], [1, 0, 2], False, "float16"),
+            ([32, 32, 1], [1, 0, 2], False, "float32"),
+            # double singleton dimension
+            ([32, 1, 51, 1], [3, 0, 2, 1], True, "float16"),
+            ([32, 1, 51, 1], [3, 0, 2, 1], True, "float32"),
+            ([32, 1, 51, 1], [2, 3, 1, 0], False, "float16"),
+            ([32, 1, 51, 1], [2, 3, 1, 0], False, "float32"),
+            # IntVar dimension
+            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, "float16"),
+            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, "float32"),
+            ([IntVar([1, 10]), 32, 1, 51], [2, 3, 0, 1], False, "float16"),
+            ([IntVar([1, 10]), 32, 1, 51], [2, 3, 0, 1], False, "float32"),
+            # other
+            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, "float16"),
+            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, "float32"),
+        ]
+    )
+    def test_permute_to_reshape(self, shape, permutation, is_reshape, dtype):
+        target = detect_target()
+
+        X = Tensor(shape, dtype=dtype, is_input=True, name="x")
+        Z = ops.softmax()(ops.permute()(X, dims=permutation), -1)
+        Z._attrs["is_output"] = True
+        Z._attrs["name"] = "z"
+
+        model_name = _generate_model_name(
+            shape, permutation, is_reshape, dtype, is_complex=False
+        )
+        module = compile_model(Z, target, "./tmp", model_name)
+        has_permute_op = any(
+            test_utils.graph_has_op(module.debug_sorted_graph, op_name)
+            for op_name in _PERMUTE_OPS
+        )
+        has_reshape_op = test_utils.graph_has_op(module.debug_sorted_graph, "reshape")
+
+        if is_reshape:
+            self.assertFalse(has_permute_op)
+            self.assertTrue(has_reshape_op)
+        else:
+            self.assertTrue(has_permute_op)
+            self.assertFalse(has_reshape_op)
+
+        shape = [dim.upper_bound() if isinstance(dim, IntVar) else dim for dim in shape]
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        z_pt = torch.softmax(torch.permute(x_pt, tuple(permutation)), dim=-1)
+        z_ait = torch.empty_like(z_pt)
+        module.run_with_tensors({"x": x_pt}, {"z": z_ait})
+
+        torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From e61db40175c493564e956d901a2cf1f406ac212f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serhat=20Varolg=C3=BCnes?= <svarolgunes@meta.com>
Date: Fri, 10 Mar 2023 10:21:41 -0800
Subject: [PATCH 243/638] Add CELU activation function (#373)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/373

New activation function CELU is implemented as follows:
1. Backend device-level implementations for all dtypes added
2. New enum member for new op added
3. By-type enum member added
4. Frontend function for op in Python added
5. Unittests are extended to include new op

Reviewed By: aakhundov

Differential Revision: D43852704

fbshipit-source-id: 1652bcd73187aa5b8f6db65e49a0b980459f3a3d
---
 python/aitemplate/backend/backend_spec.py     |  7 +++
 .../backend/cuda/elementwise/custom_math.cuh  | 32 ++++++++++++
 .../compiler/ops/common/epilogue.py           |  1 +
 python/aitemplate/compiler/ops/common/math.py |  4 ++
 tests/unittest/ops/test_activation.py         | 51 +++++++++++++++++++
 5 files changed, 95 insertions(+)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 94abf6a2c..0dfab0002 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -300,6 +300,13 @@ class GPUBackendSpec(BackendSpec):
                 "bfloat16": "floor_div",
                 "bfloat16_2": "floor_div",
             },
+            FuncEnum.CELU: {
+                "float": "fcelu",
+                "half": "hcelu",
+                "half2": "h2celu",
+                "bfloat16": "hcelu",
+                "bfloat16_2": "h2celu",
+            },
         }
     )
 
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 64d59f009..449dbfda8 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -978,4 +978,36 @@ __device__ bfloat16_2 floor_div(const bfloat16_2 a, const bfloat16_2 b) {
 #endif
 }
 
+__device__ float fcelu(const float a, const float alpha) {
+  return a > 0.f ? a : alpha * (expf(a / alpha) - 1.0f);
+}
+
+__device__ half hcelu(const half a, const half alpha) {
+  return __hgt(a, CUDA_FP16_ZERO)
+      ? a
+      : __hmul(alpha, __hsub(hexp(__hdiv(a, alpha)), CUDA_FP16_ONE));
+}
+
+__device__ bfloat16 hcelu(const bfloat16 a, const bfloat16 alpha) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hgt(a, CUDA_BF16_ZERO)
+      ? a
+      : __hmul(alpha, __hsub(hexp(__hdiv(a, alpha)), CUDA_BF16_ONE));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+__device__ half2 h2celu(const half2 a, const half2 alpha) {
+  return half2(hcelu(a.x, alpha.x), hcelu(a.y, alpha.y));
+}
+
+__device__ bfloat16_2 h2celu(const bfloat16_2 a, const bfloat16_2 alpha) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_2(hcelu(a.x, alpha.x), hcelu(a.y, alpha.y));
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
 #endif
diff --git a/python/aitemplate/compiler/ops/common/epilogue.py b/python/aitemplate/compiler/ops/common/epilogue.py
index 423b30626..fd684bf6e 100644
--- a/python/aitemplate/compiler/ops/common/epilogue.py
+++ b/python/aitemplate/compiler/ops/common/epilogue.py
@@ -64,3 +64,4 @@ class FuncEnum(Enum):
     ELU = 26
     SOFTSIGN = 27
     FLOOR_DIV = 28
+    CELU = 29
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index d79597682..016b4ddca 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -113,3 +113,7 @@ def softsign(tensor: Any) -> Tensor:
 
 def floor_div(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("FLOOR_DIV")(tensor)
+
+
+def celu(tensor: Any) -> Tensor:
+    return OP_REGISTRY.get("CELU")(tensor)
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index 73b78e629..b270e6981 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -38,6 +38,7 @@
     FuncEnum.SQRT: torch.sqrt,
     FuncEnum.SIGMOID: torch.sigmoid,
     FuncEnum.RELU: torch.relu,
+    FuncEnum.CELU: torch.celu,
 }
 
 TORCH_FP_DTYPES = [torch.float16]
@@ -327,6 +328,45 @@ def _test_softsign(
             module.run_with_tensors([x1_pt], [x2])
             self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
+    def _test_celu(
+        self,
+        input_size,
+        alpha=1.0,
+        test_name="celu",
+        copy_op=False,
+    ):
+        for torch_dtype in TORCH_FP_DTYPES:
+            dtype = torch_dtype_to_string(torch_dtype)
+            assert len(input_size) == 2
+            X1 = Tensor(
+                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+                dtype=dtype,
+                name="input0",
+                is_input=True,
+            )
+            X_alpha = Tensor(
+                shape=[],
+                dtype=dtype,
+                name="alpha",
+                value=alpha,
+            )
+            X2_op = ops.elementwise(FuncEnum.CELU)
+            if copy_op:
+                X2_op = ops.elementwise(**X2_op._get_op_attributes())
+            X2 = X2_op(X1, X_alpha)
+            X2._attrs["is_output"] = True
+            X2._attrs["name"] = "output0"
+
+            target = detect_target()
+            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
+            OP_pt = torch.nn.CELU(alpha=alpha)
+            x2_pt = OP_pt(x1_pt)
+
+            x2 = torch.empty_like(x2_pt)
+            module.run_with_tensors([x1_pt], [x2])
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+
     def test_lrelu(self):
         self._test_leaky_relu([512, 512], test_name="leaky_relu_1")
         self._test_leaky_relu(
@@ -454,6 +494,17 @@ def test_floor_div(self):
             copy_op=True,
         )
 
+    def test_celu(self):
+        self._test_celu([63, 63], alpha=1.0, test_name="celu_1")
+        self._test_celu([128, 128], alpha=4.0, test_name="celu_2")
+        self._test_celu([128, 256], alpha=0.4, test_name="celu_3")
+        self._test_celu(
+            [256, 128],
+            alpha=1.0,
+            test_name="celu_3_copy_op",
+            copy_op=True,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 987a1c9d853e1778f2c02b77a6a5d09e662f9b3a Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Fri, 10 Mar 2023 11:05:32 -0800
Subject: [PATCH 244/638] Adopt softmax to wider dimension with reshape (#395)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/395

AIT only support dim=rank-1 softmax. But sometimes softmax will happen in dim < dim, e.g. in IFR model: https://fburl.com/code/pjfety5f

Luckily, in most of these cases, all i in range(dim, rank) are 1s. So we can replace the original input with a reshaped input to flatten the last few dims with value 1 and perform softmax on the last dim.
Because reshape doesn't cost any memory I/O, such change won't result in degraded performance

Reviewed By: wushirong

Differential Revision: D43968603

fbshipit-source-id: 3aebd9234e4089b8b1d55d9df201bde872992a22
---
 fx2ait/fx2ait/converters/ait_converters.py    | 15 ++++
 .../test/converters/test_ait_softmax.py       | 90 +++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 7863b5b5b..29681278e 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -423,6 +423,21 @@ def acc_ops_softmax(
         raise RuntimeError(f"Unexpected input for {name}: {input_val}")
 
     dim = kwargs["dim"]
+    rank = len(input_val.shape())
+    if dim < 0:
+        dim = rank + dim
+    if dim != rank - 1:
+        for i in range(rank, dim):
+            if input_val.shape()[i].value() != 1:
+                raise RuntimeError(
+                    f"AIT softmax only supports dim=rank-1, got dim={dim}, rank={rank}"
+                )
+        reshape_dim = size()(input_val)[: dim + 1]
+        reshape_val = reshape()(input_val, reshape_dim)
+        softmax_val = softmax()(reshape_val, -1)
+        return reshape()(
+            softmax_val, reshape_dim + [IntVarTensor(IntImm(1))] * (rank - dim - 1)
+        )
 
     return softmax()(input_val, dim)
 
diff --git a/fx2ait/fx2ait/test/converters/test_ait_softmax.py b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
index aca9dc12a..a9f96e047 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_softmax.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
@@ -12,8 +12,11 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import unittest
+
 import torch
 from fx2ait.acc_tracer import acc_ops
+from fx2ait.tensor_spec import TensorSpec
 from fx2ait.tools.common_fx2ait import AITTestCase
 from parameterized import param, parameterized
 
@@ -36,3 +39,90 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         ]
 
         self.run_test(model, inputs, expected_ops={acc_ops.softmax})
+
+    @parameterized.expand(
+        [
+            param("default", dim=2),
+            param("neg", dim=-3),
+        ]
+    )
+    def test_softmax_not_last_dim(self, name, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.softmax(x, dim=dim)
+
+        model = TestModule().cuda().half()
+
+        # Test static use case
+        inputs = [
+            torch.randn(2, 3, 5, 1, 1).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.softmax})
+
+        # Test dynamic use case
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 5, 1, 1],
+            ],
+            inputs_max=[
+                [20, 10, 5, 1, 1],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.softmax},
+        )
+
+    @parameterized.expand(
+        [
+            param("default", dim=2),
+            param("neg", dim=-3),
+        ]
+    )
+    @unittest.expectedFailure
+    def test_softmax_expected_failure(self, name, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.softmax(x, dim=dim)
+
+        model = TestModule().cuda().half()
+
+        inputs = [
+            torch.randn(2, 3, 5, 2, 1).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={acc_ops.softmax})
+
+    @parameterized.expand(
+        [
+            param("default", dim=2),
+            param("neg", dim=-3),
+        ]
+    )
+    @unittest.expectedFailure
+    def test_softmax_expected_failure_dynamic(self, name, dim=None):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.softmax(x, dim=dim)
+
+        model = TestModule().cuda().half()
+
+        inputs_spec = TensorSpec.create_spec_from_shapes(
+            inputs_min=[
+                [2, 3, 5, 2, 1],
+            ],
+            inputs_max=[
+                [20, 10, 5, 4, 1],
+            ],
+            dtype_list=[
+                torch.float16,
+            ],
+        )
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={acc_ops.softmax},
+        )

From a6106b1d6372c362e6f07c3488fc22c9e362839b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 10 Mar 2023 12:33:15 -0800
Subject: [PATCH 245/638] Split A100 / V100 tests in test_gemm_* (first batch)
 (#400)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/400

ATT

Reviewed By: alexanderguzhva

Differential Revision: D43982098

fbshipit-source-id: 5e491aa0ad7fa4936094404a348f567d641c4818
---
 tests/unittest/ops/test_gemm.py               | 103 +++++++-------
 tests/unittest/ops/test_gemm_bias.py          |  58 ++++----
 .../unittest/ops/test_gemm_bias_broadcast.py  | 130 ++++++++++--------
 .../unittest/ops/test_gemm_bias_hardswish.py  |  40 +++---
 tests/unittest/ops/test_gemm_bias_relu.py     |  56 ++++----
 5 files changed, 188 insertions(+), 199 deletions(-)

diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index 9a390b19c..a3e8376b7 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -28,15 +29,11 @@
 from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-2, "rtol": 1e-2}
-    elif dtype in ("float", "float32"):
-        return {"atol": 3e-2, "rtol": 3e-2}
-    elif dtype == "bfloat16":
-        return {"atol": 2e-1, "rtol": 2e-1}
-    else:
-        return {}
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-2, "rtol": 1e-2},
+    "float32": {"atol": 3e-2, "rtol": 3e-2},
+    "bfloat16": {"atol": 2e-1, "rtol": 2e-1},
+}
 
 
 class GEMMTestCase(unittest.TestCase):
@@ -50,7 +47,7 @@ def __init__(self, *args, **kwargs):
 
     def _test_rcr(self, ms, k, n, test_name, dtype="float16"):
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
             dtype=dtype,
@@ -83,7 +80,9 @@ def _test_rcr(self, ms, k, n, test_name, dtype="float16"):
     def test_rcr_simple_static(self) -> None:
         self._test_rcr([1024], 256, 512, "static")
 
-    @unittest.skipIf(detect_target().name() != "cuda", "Only supported by CUDA.")
+    def test_rcr_simple_static_rocm(self) -> None:
+        self._test_rcr([1024], 256, 512, "static")
+
     @parameterized.expand(
         [
             ("dynamic1", [1, 1024], 256, 512),
@@ -101,7 +100,7 @@ def test_rcr_simple_dynamic(self, name, ms, k, n) -> None:
 
     def _test_rcr_dynamic_n(self, ms, k, ns, test_name, dtype="float16"):
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
             dtype=dtype,
@@ -144,9 +143,15 @@ def test_rcr_dynamic_n(self):
             [16, 1 * 29, 64], 256, [100000, 300000], "einsum_dynamic_n"
         )
 
+    def test_rcr_dynamic_n_rocm(self):
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1")
+        self._test_rcr_dynamic_n(
+            [16, 1 * 29, 64], 256, [100000, 300000], "einsum_dynamic_n"
+        )
+
     def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         if dtype == "float16":
             tolerance_limits["atol"] = 2e-2
             tolerance_limits["rtol"] = 2e-2
@@ -181,7 +186,6 @@ def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name, dtype="float16"):
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_3d_2d_rcr(self):
         self._test_3d_2d_rcr([1024], [2], 256, 512, "static")
         self._test_3d_2d_rcr([1, 1024], [2], 256, 512, "dynamic1")
@@ -190,7 +194,7 @@ def test_3d_2d_rcr(self):
 
     def _test_rrr(self, ms, k, n, test_name, dtype="float16"):
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         if dtype == "float16":
             tolerance_limits["atol"] = 2e-2
             tolerance_limits["rtol"] = 2e-2
@@ -221,8 +225,10 @@ def _test_rrr(self, ms, k, n, test_name, dtype="float16"):
 
     def test_rrr(self):
         self._test_rrr([256], 128, 32, "static")
-        if detect_target().name() == "cuda":
-            self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic")
+        self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic")
+
+    def test_rrr_rocm(self):
+        self._test_rrr([256], 128, 32, "static")
 
     def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         target = detect_target()
@@ -257,29 +263,17 @@ def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name, dtype="float16"):
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_3d_2d_rrr(self):
         self._test_3d_2d_rrr([256], [2], 128, 32, "static")
         self._test_3d_2d_rrr([1, 128], [3], 256, 16, "dynamic1")
         self._test_3d_2d_rrr([2], [24, 36], 256, 16, "dynamic2")
         self._test_3d_2d_rrr([2, 34, 48], [1, 3, 5], 256, 16, "dynamic3")
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
-    def test_h_rcr(self, ait_dtype):
+    def _test_h_rcr(self, ait_dtype):
         M = 256
         K = 256
         N = 512
         target = detect_target(use_fp16_acc=(ait_dtype == "float16"))
-        if target.name() != "cuda" and ait_dtype != "float16":
-            self.skipTest(
-                f"{ait_dtype} input type is not supported for {target.name()}"
-            )
-        if (
-            target.name() == "cuda"
-            and int(target._arch) < 80
-            and ait_dtype != "float16"
-        ):
-            self.skipTest(f"{ait_dtype} is not supported for cuda sm < 80")
         X = Tensor(shape=[M, K], dtype=ait_dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=ait_dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr()
@@ -299,37 +293,37 @@ def test_h_rcr(self, ait_dtype):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_float(self):
-        self._test_rcr([1024], 256, 512, "static_float", dtype="float")
-        self._test_rcr([1, 1024], 256, 512, "dynamic1_float", dtype="float")
-        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1_float", dtype="float")
+    def test_h_rcr_float16(self):
+        self._test_h_rcr(ait_dtype="float16")
+
+    def test_h_rcr_float16_rocm(self):
+        self._test_h_rcr(ait_dtype="float16")
+
+    def test_h_rcr_float32_sm80(self):
+        self._test_h_rcr(ait_dtype="float32")
+
+    def test_h_rcr_bfloat16_bf16(self):
+        self._test_h_rcr(ait_dtype="bfloat16")
+
+    def test_gemm_float32_sm80(self):
+        self._test_rcr([1024], 256, 512, "static_float", dtype="float32")
+        self._test_rcr([1, 1024], 256, 512, "dynamic1_float", dtype="float32")
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1_float", dtype="float32")
 
-        self._test_3d_2d_rcr([1024], [2], 256, 512, "static_float", dtype="float")
+        self._test_3d_2d_rcr([1024], [2], 256, 512, "static_float", dtype="float32")
         self._test_3d_2d_rcr(
-            [1, 99, 1024], [1, 2], 128, 8, "dynamic3_float", dtype="float"
+            [1, 99, 1024], [1, 2], 128, 8, "dynamic3_float", dtype="float32"
         )
 
-        self._test_rrr([256], 128, 32, "static_float", dtype="float")
-        self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic_float", dtype="float")
+        self._test_rrr([256], 128, 32, "static_float", dtype="float32")
+        self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic_float", dtype="float32")
 
-        self._test_3d_2d_rrr([256], [2], 128, 32, "static_float", dtype="float")
+        self._test_3d_2d_rrr([256], [2], 128, 32, "static_float", dtype="float32")
         self._test_3d_2d_rrr(
-            [2, 34, 48], [1, 3, 5], 256, 16, "dynamic3_float", dtype="float"
+            [2, 34, 48], [1, 3, 5], 256, 16, "dynamic3_float", dtype="float32"
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "rocm", "bfloat16 is not supported by ROCm."
-    )
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "bfloat16 is not supported by CUDA < SM80.",
-    )
-    def test_gemm_bfloat16(self):
+    def test_gemm_bfloat16_bf16(self):
         self._test_rcr([1024], 256, 512, "static_bfloat16", dtype="bfloat16")
         self._test_rcr([1, 1024], 256, 512, "dynamic1_bfloat16", dtype="bfloat16")
         self._test_rcr(
@@ -352,5 +346,8 @@ def test_gemm_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(GEMMTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index bb493ad37..3d969f490 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -21,32 +21,18 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
 from aitemplate.utils import shape_utils
-from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "bfloat16":
-        return {"atol": 3e-1, "rtol": 3e-1}
-    else:
-        return {}
-
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
 
 
 class GEMMBiasTestCase(unittest.TestCase):
@@ -56,7 +42,7 @@ def __init__(self, *args, **kwargs):
 
     def _test_rcr(self, Ms, N, K, test_name, dtype="float16"):
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
         X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
@@ -90,25 +76,26 @@ def _test_rcr(self, Ms, N, K, test_name, dtype="float16"):
 
     def test_rcr_zero_size(self):
         target = detect_target()
-        if target.name() == "cuda":
-            # This test triggered a c10 assertion failure internally
-            # caffe2/c10/util/SmallVector.h:338:
-            # Assertion `idx < size()' failed
-            if type(target).__name__ != "FBCUDA":
-                self._test_rcr([2], N=64, K=0, test_name="zero_k")
-            self._test_rcr([2], N=0, K=4, test_name="zero_n")
-            self._test_rcr([0], N=4, K=4, test_name="zero_m")
+        # This test triggered a c10 assertion failure internally
+        # caffe2/c10/util/SmallVector.h:338:
+        # Assertion `idx < size()' failed
+        if type(target).__name__ != "FBCUDA":
+            self._test_rcr([2], N=64, K=0, test_name="zero_k")
+        self._test_rcr([2], N=0, K=4, test_name="zero_n")
+        self._test_rcr([0], N=4, K=4, test_name="zero_m")
 
     def test_rcr_static(self):
         self._test_rcr([4096], N=4, K=4, test_name="static")
         self._test_rcr([1000], N=81, K=1024, test_name="static")
         self._test_rcr([67200], N=3, K=256, test_name="static")
 
-    @parameterized.expand(("bfloat16",))
-    def test_rcr_all_floats(self, dtype):
-        skipped_reason = _skip_target(detect_target(), dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
+    def test_rcr_static_rocm(self):
+        self._test_rcr([4096], N=4, K=4, test_name="static")
+        self._test_rcr([1000], N=81, K=1024, test_name="static")
+        self._test_rcr([67200], N=3, K=256, test_name="static")
+
+    def test_rcr_bfloat16_bf16(self):
+        dtype = "bfloat16"
         self._test_rcr([4], N=2, K=11, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcr([128], N=64, K=1024, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcr(
@@ -120,6 +107,9 @@ def test_rcr_all_floats(self, dtype):
         )
 
 
+filter_test_cases_by_test_env(GEMMBiasTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index a802b5eb4..0c0564c0e 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -20,6 +20,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -76,9 +77,11 @@ def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_mul_add(self):
         self._test_bias_rcr_mul_add(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_mul_add(None, 2, 32, 256, 128)
-            self._test_bias_rcr_mul_add(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128)
+        self._test_bias_rcr_mul_add(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_mul_add_rocm(self):
+        self._test_bias_rcr_mul_add(8, None, None, 8, 8)
 
     def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -104,9 +107,11 @@ def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_sigmoid_mul(self):
         self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128)
-            self._test_bias_rcr_sigmoid_mul(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128)
+        self._test_bias_rcr_sigmoid_mul(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_sigmoid_mul_rocm(self):
+        self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8)
 
     def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -132,10 +137,12 @@ def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_sigmoid_mul_tanh(self):
         self._test_bias_rcr_sigmoid_mul_tanh(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128)
-            self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 512)
-            self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 0)
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128)
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 0)
+
+    def test_bias_rcr_sigmoid_mul_tanh_rocm(self):
+        self._test_bias_rcr_sigmoid_mul_tanh(8, None, None, 8, 8)
 
     def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -159,9 +166,11 @@ def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_add(self):
         self._test_bias_rcr_add(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_add(None, 2, 32, 256, 128)
-            self._test_bias_rcr_add(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_add(None, 2, 32, 256, 128)
+        self._test_bias_rcr_add(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_add_rocm(self):
+        self._test_bias_rcr_add(8, None, None, 8, 8)
 
     def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -185,9 +194,11 @@ def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_add_relu(self):
         self._test_bias_rcr_add_relu(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_add_relu(None, 2, 32, 256, 128)
-            self._test_bias_rcr_add_relu(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128)
+        self._test_bias_rcr_add_relu(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_add_relu_rocm(self):
+        self._test_bias_rcr_add_relu(8, None, None, 8, 8)
 
     def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -211,17 +222,19 @@ def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
         self._test_and_verify(module, Y_pt, dtype, has_d1=True)
 
     def test_bias_rcr_add_add_relu(self):
+        self._test_bias_rcr_add_add_relu(8, None, None, 8, 8)
+        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128)
+        self._test_bias_rcr_add_add_relu(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_add_add_relu(None, 21, 5, 1024, 0)
+        # This test triggered a c10 assertion failure internally
+        # caffe2/c10/util/SmallVector.h:338:
+        # Assertion `idx < size()' failed
         target = detect_target()
+        if type(target).__name__ != "FBCUDA":
+            self._test_bias_rcr_add_add_relu(21, None, None, 0, 512)
+
+    def test_bias_rcr_add_add_relu_rocm(self):
         self._test_bias_rcr_add_add_relu(8, None, None, 8, 8)
-        if target.name() == "cuda":
-            self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128)
-            self._test_bias_rcr_add_add_relu(None, 21, 5, 1024, 512)
-            self._test_bias_rcr_add_add_relu(None, 21, 5, 1024, 0)
-            # This test triggered a c10 assertion failure internally
-            # caffe2/c10/util/SmallVector.h:338:
-            # Assertion `idx < size()' failed
-            if type(target).__name__ != "FBCUDA":
-                self._test_bias_rcr_add_add_relu(21, None, None, 0, 512)
 
     def _test_bias_rcr_mul(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -245,9 +258,11 @@ def _test_bias_rcr_mul(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_mul(self):
         self._test_bias_rcr_mul(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_mul(None, 2, 32, 256, 128)
-            self._test_bias_rcr_mul(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_mul(None, 2, 32, 256, 128)
+        self._test_bias_rcr_mul(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_mul_rocm(self):
+        self._test_bias_rcr_mul(8, None, None, 8, 8)
 
     def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -272,10 +287,12 @@ def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_add_add(self):
         self._test_bias_rcr_add_add(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_add_add(None, 2, 32, 256, 128)
-            self._test_bias_rcr_add_add(None, 21, 5, 1024, 512)
-            self._test_bias_rcr_add_add(None, 0, 5, 1024, 512)
+        self._test_bias_rcr_add_add(None, 2, 32, 256, 128)
+        self._test_bias_rcr_add_add(None, 21, 5, 1024, 512)
+        self._test_bias_rcr_add_add(None, 0, 5, 1024, 512)
+
+    def test_bias_rcr_add_add_rocm(self):
+        self._test_bias_rcr_add_add(8, None, None, 8, 8)
 
     def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
         target = detect_target()
@@ -299,33 +316,25 @@ def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
 
     def test_bias_rcr_mul_tanh(self):
         self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
-        if detect_target().name() == "cuda":
-            self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128)
-            self._test_bias_rcr_mul_tanh(None, 21, 5, 1024, 512)
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_bias_broadcast_float(self):
-        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="float")
-        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="float")
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_bias_broadcast_bfloat16(self):
+        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128)
+        self._test_bias_rcr_mul_tanh(None, 21, 5, 1024, 512)
+
+    def test_bias_rcr_mul_tanh_rocm(self):
+        self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
+
+    def test_gemm_bias_broadcast_float32_sm80(self):
+        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="float32")
+        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="float32")
+
+    def test_gemm_bias_broadcast_bfloat16_bf16(self):
         self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="bfloat16")
         self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="bfloat16")
         self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
@@ -338,5 +347,8 @@ def test_gemm_bias_broadcast_bfloat16(self):
         self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
 
 
+filter_test_cases_by_test_env(GEMMBiasBroadcastTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_hardswish.py b/tests/unittest/ops/test_gemm_bias_hardswish.py
index cef3443f8..602869532 100644
--- a/tests/unittest/ops/test_gemm_bias_hardswish.py
+++ b/tests/unittest/ops/test_gemm_bias_hardswish.py
@@ -19,31 +19,19 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "bfloat16":
-        return {"atol": 3e-1, "rtol": 3e-1}
-    else:
-        return {}
-
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
 
 
 def hard_swish(x):
@@ -79,13 +67,17 @@ def _test_rcr(self, dtype="float16"):
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, **_tolerance_limits(dtype)))
+        self.assertTrue(torch.allclose(Y_pt, y, **_TOLERANCE_LIMITS[dtype]))
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
+        )
+    )
     def test_rcr(self, dtype):
-        skipped_reason = _skip_target(detect_target(), dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
         self._test_rcr(dtype)
 
 
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index 9f43fed0d..c3977a228 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -19,31 +19,19 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "bfloat16":
-        return {"atol": 2e-1, "rtol": 2e-1}
-    else:
-        return {}
-
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 2e-1, "rtol": 2e-1},
+}
 
 
 class GEMMBiasReluTestCase(unittest.TestCase):
@@ -55,7 +43,7 @@ def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         M = 128
         K = 1024
         N = 64
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -77,19 +65,24 @@ def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_gemm_rcr_bias_relu(self, ait_dtype):
         target = detect_target()
-        skipped_reason = _skip_target(target, ait_dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
         self._test_gemm_rcr_bias_relu(ait_dtype, target)
 
     def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         M = 128
         K = 1024
         N = 64
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -113,12 +106,17 @@ def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_gemm_rcr_bias_add_relu(self, ait_dtype):
         target = detect_target()
-        skipped_reason = _skip_target(target, ait_dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
         self._test_gemm_rcr_bias_add_relu(ait_dtype, target)
 
 
From 5665fb822658966f6abb2e0dce9bfc5839c89b4c Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Fri, 10 Mar 2023 12:37:41 -0800
Subject: [PATCH 246/638] Move tests from A100 to V100 - second batch (#394)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/394

ATT

Reviewed By: wushirong

Differential Revision: D43968252

fbshipit-source-id: 14a3f9ae12db08436755ee1cba6ce2f4d5d11936
---
 tests/unittest/compiler/test_group_fusions.py |  15 +--
 .../test_pad_bmm_rrr_bias_with_cat.py         |  10 +-
 .../compiler/test_pad_gemm_rrr_with_cat.py    |   9 +-
 .../compiler/test_pad_gemm_with_cat.py        |  11 +-
 .../test_pad_gemm_with_elementwise.py         | 106 +++++++++++++-----
 .../compiler/test_parallel_gemm_fusions.py    |  21 +---
 tests/unittest/compiler/test_refine_graph.py  |  42 +++++--
 .../compiler/test_slice_elemwise_fusion.py    |  12 +-
 .../compiler/test_slice_gemm_fusion.py        |  20 +++-
 .../compiler/test_slice_reshape_scatter.py    |   9 +-
 10 files changed, 164 insertions(+), 91 deletions(-)

diff --git a/tests/unittest/compiler/test_group_fusions.py b/tests/unittest/compiler/test_group_fusions.py
index b7a218187..6a11e9661 100644
--- a/tests/unittest/compiler/test_group_fusions.py
+++ b/tests/unittest/compiler/test_group_fusions.py
@@ -24,6 +24,7 @@
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
     count_ops,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
     has_op,
@@ -657,11 +658,7 @@ def test_group_gemm_fusion_float16(self):
         self._test_group_gemm_fusion(1024, [[16, 44], [32, 32]], should_fail=True)
         self._test_group_gemm_fusion(1024, [[16, 13], [32, 1]], should_fail=True)
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_group_gemm_fusion_float32(self):
+    def test_group_gemm_fusion_float32_sm80(self):
         self._test_group_gemm_fusion(32, [[16, 64], [32, 32]], dtype="float32")
         self._test_group_gemm_fusion(
             32, [[16, 64], [32, 40]], has_bias=False, dtype="float32"
@@ -774,11 +771,7 @@ def test_split_group_gemm_fusion_float16(self):
             num_group_ops=1,
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_split_group_gemm_fusion_float32(self):
+    def test_split_group_gemm_fusion_float32_sm80(self):
         self._test_split_group_gemm_fusion(
             32,
             [[16, 64], [16, 40], [16, 128]],
@@ -797,5 +790,7 @@ def test_split_group_gemm_fusion_float32(self):
         )
 
 
+filter_test_cases_by_test_env(GroupOpTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py b/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
index 25909a2a0..b0626fa11 100644
--- a/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
+++ b/tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -103,11 +104,7 @@ def test_pad_bmm_rrr_bias_with_cat_float16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_pad_bmm_rrr_bias_with_cat_float32(self):
+    def test_pad_bmm_rrr_bias_with_cat_float32_sm80(self):
         self._test_pad_bmm_rrr_bias_with_cat(
             "static_odd_k", bs=[2], ms=[64], n=32, k1=3, k2=10, dtype="float32"
         )
@@ -122,6 +119,9 @@ def test_pad_bmm_rrr_bias_with_cat_float32(self):
         )
 
 
+filter_test_cases_by_test_env(PadBmmBiasWithCatTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py b/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
index d03e5bcae..2cfd80ae1 100644
--- a/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
+++ b/tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -85,11 +86,7 @@ def test_pad_gemm_rrr_with_cat_float16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_pad_gemm_rrr_with_cat_float32(self):
+    def test_pad_gemm_rrr_with_cat_float32_sm80(self):
         self._test_pad_gemm_rrr_with_cat(
             "static_odd_k", ms=[128], n=32, k1=3, k2=10, dtype="float32"
         )
@@ -103,6 +100,8 @@ def test_pad_gemm_rrr_with_cat_float32(self):
         )
 
 
+filter_test_cases_by_test_env(PadGemmWithCatTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_pad_gemm_with_cat.py b/tests/unittest/compiler/test_pad_gemm_with_cat.py
index d330f694d..b8bf4858d 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_cat.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_cat.py
@@ -23,8 +23,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 
 from parameterized import parameterized
@@ -34,7 +36,14 @@
 
 
 class PadGemmWithCatTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_pad_gemm_rcr_with_cat(self, dtype):
         target = detect_target()
         if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
diff --git a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
index 15ef14349..bd091301d 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
@@ -22,8 +22,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 from parameterized import param, parameterized
@@ -31,11 +33,17 @@
 
 class PadGemmWithElementwise(unittest.TestCase):
     @parameterized.expand(
-        [
-            param("static_M_float16", [23], 7, 3, "float16"),
-            param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
-            param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
-        ]
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    param("static_M_float16", [23], 7, 3, "float16"),
+                    param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
+                ],
+                TestEnv.CUDA_SM80: [
+                    param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
+                ],
+            }
+        )
     )
     def test_pad_gemm_rcr_bias_broadcast_with_elementwise(
         self, test_name, ms, n, k, dtype
@@ -84,14 +92,34 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        [
-            ("static_shape_float16", [3], [1], 5, 3, "float16"),
-            ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
-            ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
-            ("dynamic_BM_float16", [3, 5, 8], [3, 9, 10], 17, 21, "float16"),
-            ("static_shape_float32", [3], [1], 5, 3, "float32"),
-            ("dynamic_BM_float32", [3, 5, 8], [3, 9, 10], 17, 21, "float32"),
-        ]
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    ("static_shape_float16", [3], [1], 5, 3, "float16"),
+                    ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
+                    ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
+                    (
+                        "dynamic_BM_float16",
+                        [3, 5, 8],
+                        [3, 9, 10],
+                        17,
+                        21,
+                        "float16",
+                    ),
+                ],
+                TestEnv.CUDA_SM80: [
+                    ("static_shape_float32", [3], [1], 5, 3, "float32"),
+                    (
+                        "dynamic_BM_float32",
+                        [3, 5, 8],
+                        [3, 9, 10],
+                        17,
+                        21,
+                        "float32",
+                    ),
+                ],
+            }
+        )
     )
     def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k, dtype):
         target = detect_target()
@@ -133,14 +161,34 @@ def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k, dtype):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        [
-            ("static_shape_float16", [3], [1], 5, 3, "float16"),
-            ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
-            ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
-            ("dynamic_BM_float16", [3, 5, 8], [3, 9, 10], 17, 21, "float16"),
-            ("static_shape_float32", [3], [1], 5, 3, "float32"),
-            ("dynamic_BM_float32", [3, 5, 8], [3, 9, 10], 17, 21, "float32"),
-        ]
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    ("static_shape_float16", [3], [1], 5, 3, "float16"),
+                    ("dynamic_M_float16", [3], [1, 78, 99], 7, 3, "float16"),
+                    ("dynamic_B_float16", [3, 5, 8], [3], 11, 15, "float16"),
+                    (
+                        "dynamic_BM_float16",
+                        [3, 5, 8],
+                        [3, 9, 10],
+                        17,
+                        21,
+                        "float16",
+                    ),
+                ],
+                TestEnv.CUDA_SM80: [
+                    ("static_shape_float32", [3], [1], 5, 3, "float32"),
+                    (
+                        "dynamic_BM_float32",
+                        [3, 5, 8],
+                        [3, 9, 10],
+                        17,
+                        21,
+                        "float32",
+                    ),
+                ],
+            }
+        )
     )
     def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k, dtype):
         target = detect_target()
@@ -186,11 +234,17 @@ def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k, dty
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        [
-            param("static_M_float16", [23], 7, 3, "float16"),
-            param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
-            param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
-        ]
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    param("static_M_float16", [23], 7, 3, "float16"),
+                    param("dynamic_M_float16", [1, 78, 99], 7, 3, "float16"),
+                ],
+                TestEnv.CUDA_SM80: [
+                    param("dynamic_M_float32", [1, 78, 99], 7, 3, "float32"),
+                ],
+            }
+        )
     )
     def test_pad_gemm_rcr_bias_broadcast_with_elementwise_2(
         self, test_name, ms, n, k, dtype
diff --git a/tests/unittest/compiler/test_parallel_gemm_fusions.py b/tests/unittest/compiler/test_parallel_gemm_fusions.py
index f84d7e7b5..4eba28ea6 100644
--- a/tests/unittest/compiler/test_parallel_gemm_fusions.py
+++ b/tests/unittest/compiler/test_parallel_gemm_fusions.py
@@ -27,6 +27,7 @@
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
     count_ops,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
     has_op,
@@ -343,11 +344,7 @@ def test_fuse_parallel_gemm_cat_fp16(self):
         self._fuse_2_split_parallel_gemm_cat(b=4, ms=[256, 512], n=128, k=64)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fuse_parallel_gemm_cat_fp32(self):
+    def test_fuse_parallel_gemm_cat_fp32_sm80(self):
         # test n x gemms + cat
         self._fuse_parallel_gemm_cat(
             b=4,
@@ -573,11 +570,7 @@ def test_fuse_parallel_gemm_cat_partial_fp16(self):
         self._test_fuse_parallel_gemm_cat_partial(2, 2, [128, 256], 33, 55, True)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fuse_parallel_gemm_cat_partial_fp32(self):
+    def test_fuse_parallel_gemm_cat_partial_fp32_sm80(self):
         self._test_fuse_parallel_gemm_cat_partial(
             4, 4, [128, 256], 32, 64, True, dtype="float32"
         )
@@ -621,11 +614,7 @@ def test_multi_parallel_gemm_cat_groups_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_multi_parallel_gemm_cat_groups_fp32(self):
+    def test_multi_parallel_gemm_cat_groups_fp32_sm80(self):
         self._test_multi_parallel_gemm_cat_groups(
             256,
             [[128, 64]] * 2 + [[128, 120]] * 4 + [[128, 72]] * 2 + [[128, 64]] * 2,
@@ -633,6 +622,8 @@ def test_multi_parallel_gemm_cat_groups_fp32(self):
         )
 
 
+filter_test_cases_by_test_env(ParallelGemmCatFusionTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_refine_graph.py b/tests/unittest/compiler/test_refine_graph.py
index 2fddd99cd..dfc813f49 100644
--- a/tests/unittest/compiler/test_refine_graph.py
+++ b/tests/unittest/compiler/test_refine_graph.py
@@ -23,8 +23,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import graph_utils
 
@@ -35,7 +37,14 @@
 
 
 class RefineGraphTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
+        )
+    )
     def test_elementwise_ops(self, dtype):
         target = detect_target()
         if dtype == "float32" and target.name == "rocm":
@@ -116,10 +125,6 @@ def test_elementwise_ops_single_input_no_refine(self):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_elementwise_ops_single_input(self):
         dtype = "float16"
         M = 10
@@ -201,7 +206,14 @@ def _build_gemm_rcr_bias_mul(self, M, N, K, dtype, start_idx=0):
 
         return mul_tensor
 
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
+        )
+    )
     def test_gemm_ops(self, dtype):
         target = detect_target()
         if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -230,7 +242,14 @@ def test_gemm_ops(self, dtype):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] == sorted_ops[1]._attrs["name"]
 
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
+        )
+    )
     def test_bmm_ops_accessor(self, dtype):
         target = detect_target()
         if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
@@ -280,7 +299,14 @@ def test_bmm_ops_accessor(self, dtype):
         assert len(sorted_ops) == 2
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
+        )
+    )
     def test_refine_graph_group_gemms(self, dtype):
         target = detect_target()
         if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
diff --git a/tests/unittest/compiler/test_slice_elemwise_fusion.py b/tests/unittest/compiler/test_slice_elemwise_fusion.py
index f7e8ac3e5..fce4b2637 100644
--- a/tests/unittest/compiler/test_slice_elemwise_fusion.py
+++ b/tests/unittest/compiler/test_slice_elemwise_fusion.py
@@ -258,7 +258,7 @@ def _test_slice_elemwise_fusion_dynamic(
             is_input=True,
         )
 
-        Y1 = ops.elementwise(FuncEnum.TANH)(X2)
+        Y1 = ops.elementwise(FuncEnum.RELU)(X2)
         Y2 = ops.elementwise(FuncEnum.SUB)(Y1, X2)
         Y = ops.elementwise(FuncEnum.ADD)(slice_output, Y2)
         Y._attrs["name"] = "y"
@@ -305,7 +305,7 @@ def _test_slice_elemwise_fusion_dynamic(
                 slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
             ]
             slice_output_pt = x1_pt[slice_indices]
-            y1_pt = torch.tanh(x2_pt)
+            y1_pt = torch.relu(x2_pt)
             y2_pt = y1_pt - x2_pt
             y_pt = slice_output_pt + y2_pt
 
@@ -319,10 +319,6 @@ def _test_slice_elemwise_fusion_dynamic(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self.test_count += 1
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_slice_elemwise_fusion_dynamic(self):
         self._test_slice_elemwise_fusion_dynamic(
             slice_input_shape=([5, 16], 10),
@@ -352,10 +348,6 @@ def test_slice_elemwise_fusion_dynamic(self):
             expected_data_t="half",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_slice_elemwise_fusion_dynamic_broadcast(self):
         # slice_output broadcasts to input_x2
         self._test_slice_elemwise_fusion_dynamic(
diff --git a/tests/unittest/compiler/test_slice_gemm_fusion.py b/tests/unittest/compiler/test_slice_gemm_fusion.py
index 88f3bfb6d..5312f4d3a 100644
--- a/tests/unittest/compiler/test_slice_gemm_fusion.py
+++ b/tests/unittest/compiler/test_slice_gemm_fusion.py
@@ -23,8 +23,11 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import graph_utils, shape_utils
 
@@ -151,7 +154,14 @@ def test_slice_gemm_rcr_fusion_a(self):
 
     # This is a test for testing cases where we correctly update a/b_alignment
     # based on input_accessors
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float")],
+            }
+        )
+    )
     def test_slice_gemm_rcr_fusion_align(self, dtype):
         if dtype == "float" and int(detect_target()._arch) < 80:
             self.skipTest("gemm with float tensors requires CUDA sm >= 80")
@@ -774,11 +784,7 @@ def test_slice_multiple_gemm_rcr_fusion_a(self):
             test_name="slice_multiple_gemm_rcr_fusion_a",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_slice_gemm_fusion_float(self):
+    def test_slice_gemm_fusion_float_sm80(self):
         self._test_slice_gemm_rcr_fusion_a(
             N=4,
             K=8,
@@ -862,6 +868,8 @@ def test_slice_gemm_fusion_float(self):
         )
 
 
+filter_test_cases_by_test_env(SliceGemmFusionTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_slice_reshape_scatter.py b/tests/unittest/compiler/test_slice_reshape_scatter.py
index 4c5b9f591..c4a07d0ac 100644
--- a/tests/unittest/compiler/test_slice_reshape_scatter.py
+++ b/tests/unittest/compiler/test_slice_reshape_scatter.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -121,11 +122,7 @@ def _run_one_test(
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
-    def test_slice_scatter_reshape(self):
+    def test_slice_scatter_reshape_sm80(self):
         self._run_one_test(
             input_shapes=[[1, 2], [1, 2]],
             input_start_indices=[[0, 0], [0, 0]],
@@ -241,6 +238,8 @@ def test_slice_scatter_reshape_float16_2(self):
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
 
 
+filter_test_cases_by_test_env(SliceScatterReshapeCatTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From 32a846507b129003c26600b1bcbe98f93aebcefe Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Fri, 10 Mar 2023 13:52:44 -0800
Subject: [PATCH 247/638] Add .idea to .gitignore (#397)

Summary:
Add .idea to .gitignore

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/397

Reviewed By: aakhundov

Differential Revision: D43991515

Pulled By: khabinov

fbshipit-source-id: 3448791deb1d4f53059fffc2e9473796ab09409e
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index f3bbc0889..8897298b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,6 +136,9 @@ tags
 # macOS dir files
 .DS_Store
 
+# PyCharm files
+.idea
+
 # vscode
 .vscode
 

From 75619f9708f6edff46ba1fdd8a49360c214dc97a Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 10 Mar 2023 15:19:47 -0800
Subject: [PATCH 248/638] Split A100 / V100 tests in test_gemm_* (second batch)
 (#401)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/401

ATT

Reviewed By: muchulee8, chenyang78

Differential Revision: D43985366

fbshipit-source-id: b7abb134fda9956dce22c95c93230ce8160bf07c
---
 tests/unittest/ops/test_gemm_bias_permute.py | 19 ++++---
 tests/unittest/ops/test_gemm_bias_sigmoid.py | 41 +++++++--------
 tests/unittest/ops/test_gemm_bias_softmax.py | 19 ++++---
 tests/unittest/ops/test_gemm_bias_swish.py   | 52 ++++++++------------
 tests/unittest/ops/test_gemm_bias_tanh.py    | 43 +++++++---------
 5 files changed, 77 insertions(+), 97 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_bias_permute.py b/tests/unittest/ops/test_gemm_bias_permute.py
index 48637a18e..8bbfa24db 100644
--- a/tests/unittest/ops/test_gemm_bias_permute.py
+++ b/tests/unittest/ops/test_gemm_bias_permute.py
@@ -19,10 +19,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skipIf(detect_target().name() == "cuda", "Not supported by CUDA.")
 class GEMMBiasPermuteTestCase(unittest.TestCase):
     def _test_gemm_rcr_bias_permute_m2n3(
         self,
@@ -79,7 +81,7 @@ def _test_gemm_rcr_bias_permute_m2n3(
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_permute_m2n3_fp16(self):
+    def test_gemm_rcr_bias_permute_m2n3_fp16_rocm(self):
         self._test_gemm_rcr_bias_permute_m2n3(
             test_name="gemm_rcr_bias_permute_m2n3_fp16",
             dtype="float16",
@@ -144,7 +146,7 @@ def _test_gemm_rcr_bias_permute_m3n2(
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_bias_permute_m3n2_fp16(self):
+    def test_gemm_rcr_bias_permute_m3n2_fp16_rocm(self):
         self._test_gemm_rcr_bias_permute_m3n2(
             test_name="gemm_rcr_bias_permute_m3n2_fp16",
             dtype="float16",
@@ -203,7 +205,7 @@ def _test_gemm_rcr_permute_m2n3(
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_gemm_rcr_permute_m2n3_fp16(self):
+    def test_gemm_rcr_permute_m2n3_fp16_rocm(self):
         self._test_gemm_rcr_permute_m2n3(
             test_name="test_gemm_rcr_permute_m2n3_fp16",
             dtype="float16",
@@ -215,7 +217,7 @@ def test_gemm_rcr_permute_m2n3_fp16(self):
         )
 
     # ========== enable them after fix profiler =========
-    # def test_gemm_rcr_bias_relu(self):
+    # def test_gemm_rcr_bias_relu_rocm(self):
     #     M0 = 4
     #     M1 = 32
     #     M2 = 128
@@ -246,7 +248,7 @@ def test_gemm_rcr_permute_m2n3_fp16(self):
     #     module.run_with_tensors(inputs, [y])
     #     self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    # def test_gemm_rrr_bias_relu(self):
+    # def test_gemm_rrr_bias_relu_rocm(self):
     #     M0 = 4
     #     M1 = 32
     #     M2 = 128
@@ -278,5 +280,8 @@ def test_gemm_rcr_permute_m2n3_fp16(self):
     #     self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
 
+filter_test_cases_by_test_env(GEMMBiasPermuteTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_sigmoid.py b/tests/unittest/ops/test_gemm_bias_sigmoid.py
index 2985f2da3..5ab21e6c4 100644
--- a/tests/unittest/ops/test_gemm_bias_sigmoid.py
+++ b/tests/unittest/ops/test_gemm_bias_sigmoid.py
@@ -19,31 +19,19 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "bfloat16":
-        return {"atol": 3e-1, "rtol": 3e-1}
-    else:
-        return {}
-
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
 
 
 class GEMMBiasSigmoidTestCase(unittest.TestCase):
@@ -75,13 +63,18 @@ def _test_rcr(self, dtype="float16"):
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        torch.testing.assert_close(Y_pt, y, **_tolerance_limits(dtype))
+        torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_rcr(self, dtype):
-        skipped_reason = _skip_target(detect_target(), dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
         self._test_rcr(dtype)
 
 
diff --git a/tests/unittest/ops/test_gemm_bias_softmax.py b/tests/unittest/ops/test_gemm_bias_softmax.py
index 5bee4c91a..0c9d7b811 100644
--- a/tests/unittest/ops/test_gemm_bias_softmax.py
+++ b/tests/unittest/ops/test_gemm_bias_softmax.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -30,9 +31,8 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-# @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 @unittest.skip("GEMM + Softmax is disabled for now")
-class GEMMTestCase(unittest.TestCase):
+class GEMMBiasSoftmaxTestCase(unittest.TestCase):
     def _test_gemm_rcr_bias_softmax(
         self, M=16, K=64, N=24, rebuild=True, dtype="float16"
     ):
@@ -74,15 +74,14 @@ def _test_gemm_rcr_bias_softmax(
             rtol=1e-1,
         )
 
-    def test_gemm_bias_softmax(self):
-        self._test_gemm_rcr_bias_softmax(N=81)
+    def test_gemm_bias_softmax_float16(self):
+        self._test_gemm_rcr_bias_softmax(N=81, dtype="float16")
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_bias_softmax_float(self):
-        self._test_gemm_rcr_bias_softmax(N=81, dtype="float")
+    def test_gemm_bias_softmax_float32_sm80(self):
+        self._test_gemm_rcr_bias_softmax(N=81, dtype="float16")
+
+
+filter_test_cases_by_test_env(GEMMBiasSoftmaxTestCase)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias_swish.py b/tests/unittest/ops/test_gemm_bias_swish.py
index 0ce13bf91..d9c71780a 100644
--- a/tests/unittest/ops/test_gemm_bias_swish.py
+++ b/tests/unittest/ops/test_gemm_bias_swish.py
@@ -19,38 +19,23 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
-from parameterized import parameterized
 
 
-def swish(x):
-    return x * torch.sigmoid(x)
-
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "bfloat16":
-        return {"atol": 3e-1, "rtol": 3e-1}
-    else:
-        return {}
 
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+def swish(x):
+    return x * torch.sigmoid(x)
 
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMBiasSwishTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(GEMMBiasSwishTestCase, self).__init__(*args, **kwargs)
@@ -80,14 +65,19 @@ def _test_rcr(self, dtype="float16"):
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, **_tolerance_limits(dtype)))
-
-    @parameterized.expand(("float16", "float32", "bfloat16"))
-    def test_rcr(self, dtype):
-        skipped_reason = _skip_target(detect_target(), dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
-        self._test_rcr(dtype)
+        self.assertTrue(torch.allclose(Y_pt, y, **_TOLERANCE_LIMITS[dtype]))
+
+    def test_rcr_float16(self):
+        self._test_rcr(dtype="float16")
+
+    def test_rcr_float32_sm80(self):
+        self._test_rcr(dtype="float32")
+
+    def test_rcr_bfloat16_bf16(self):
+        self._test_rcr(dtype="bfloat16")
+
+
+filter_test_cases_by_test_env(GEMMBiasSwishTestCase)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias_tanh.py b/tests/unittest/ops/test_gemm_bias_tanh.py
index 089fbf10c..0a5b17c39 100644
--- a/tests/unittest/ops/test_gemm_bias_tanh.py
+++ b/tests/unittest/ops/test_gemm_bias_tanh.py
@@ -21,32 +21,20 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 from parameterized import parameterized
 
 
-def _tolerance_limits(dtype):
-    if dtype == "float16":
-        return {"atol": 1e-1, "rtol": 1e-1}
-    elif dtype == "float32":
-        return {"atol": 3e-2, "rtol": 2e-2}
-    elif dtype == "bfloat16":
-        return {"atol": 2e-1, "rtol": 2e-1}
-    else:
-        return {}
-
-
-def _skip_target(target, ait_dtype):
-    if ait_dtype == "float16":
-        return None
-    if target.name() != "cuda":
-        return "Not supported for non-CUDA target"
-    if int(target._arch) < 80:
-        return "Not supported for CUDA SM<80."
-    return None
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 3e-2, "rtol": 2e-2},
+    "bfloat16": {"atol": 2e-1, "rtol": 2e-1},
+}
 
 
 class GEMMBiasTanhTestCase(unittest.TestCase):
@@ -58,7 +46,7 @@ def _test_rcr(self, Ms, test_name, dtype="float16"):
         K = 1024
         N = 64
         target = detect_target()
-        tolerance_limits = _tolerance_limits(dtype)
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
         X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
@@ -86,12 +74,17 @@ def _test_rcr(self, Ms, test_name, dtype="float16"):
             )
             torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @parameterized.expand(("float16", "float32", "bfloat16"))
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_rcr_bias_tanh_floats(self, dtype):
-        skipped_reason = _skip_target(detect_target(), dtype)
-        if skipped_reason is not None:
-            self.skipTest(skipped_reason)
-        self._test_rcr([128], "static")
+        self._test_rcr([128], f"static_m_{dtype}", dtype=dtype)
         self._test_rcr([1, 7, 64, 127], f"dynamic_m_{dtype}", dtype=dtype)
 
 
From b8562141dcfd28cedfae83dc91e5ea5e55e9d0f5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 10 Mar 2023 16:21:45 -0800
Subject: [PATCH 249/638] move test_activation from a100 (#393)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/393

att

Reviewed By: muchulee8, chenyang78

Differential Revision: D43963876

fbshipit-source-id: 792caa58554e14ebcebb7b62a6d6e1871cf2251a
---
 tests/unittest/ops/test_activation.py | 898 ++++++++++++++++----------
 1 file changed, 563 insertions(+), 335 deletions(-)

diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index b270e6981..01581e07f 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -24,7 +24,12 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.utils.torch_utils import torch_dtype_to_string
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+from parameterized import parameterized
 
 
 TORCH_EQUIVALENTS = {
@@ -41,12 +46,6 @@
     FuncEnum.CELU: torch.celu,
 }
 
-TORCH_FP_DTYPES = [torch.float16]
-if detect_target().name() != "rocm":
-    TORCH_FP_DTYPES.append(torch.float32)
-    if int(detect_target()._arch) >= 80:
-        TORCH_FP_DTYPES.append(torch.bfloat16)
-
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FusedElementwiseTestCase(unittest.TestCase):
@@ -56,39 +55,38 @@ def _test_leaky_relu(
         negative_slope=0.01,
         test_name="leaky_relu",
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            slope = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="slope",
-                value=negative_slope,
-            )
-            X2_op = ops.elementwise(FuncEnum.LRELU)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, slope)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.LeakyReLU(negative_slope)
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        slope = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="slope",
+            value=negative_slope,
+        )
+        X2_op = ops.elementwise(FuncEnum.LRELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, slope)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype=dtype)
+        OP_pt = torch.nn.LeakyReLU(negative_slope)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     def _test_floor_div(
         self,
@@ -96,39 +94,38 @@ def _test_floor_div(
         test_name="floor_div",
         dividend=2,
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            slope = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="input1",
-                value=dividend,
-            )
-            X2_op = ops.elementwise(FuncEnum.FLOOR_DIV)
-
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, slope)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            x2_pt = torch.div(x1_pt, dividend, rounding_mode="floor")
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        slope = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="input1",
+            value=dividend,
+        )
+        X2_op = ops.elementwise(FuncEnum.FLOOR_DIV)
+
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, slope)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        x2_pt = torch.div(x1_pt, dividend, rounding_mode="floor")
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     def _test_hardtanh(
         self,
@@ -137,47 +134,46 @@ def _test_hardtanh(
         max_val=1,
         test_name="hard_tanh",
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            X_min = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="min_val",
-                value=min_val,
-                is_input=True,
-            )
-            X_max = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="max_val",
-                value=max_val,
-                is_input=True,
-            )
-            X2_op = ops.elementwise(FuncEnum.HARDTANH)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, X_min, X_max)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.Hardtanh(min_val=min_val, max_val=max_val)
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X_min = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="min_val",
+            value=min_val,
+            is_input=True,
+        )
+        X_max = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="max_val",
+            value=max_val,
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.HARDTANH)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_min, X_max)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.Hardtanh(min_val=min_val, max_val=max_val)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     def _test_softplus(
         self,
@@ -186,147 +182,137 @@ def _test_softplus(
         threshold=20.0,
         test_name="softplus",
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            X_beta = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="beta",
-                value=beta,
-                is_input=True,
-            )
-            X_threshold = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="threshold",
-                value=threshold,
-                is_input=True,
-            )
-            X2_op = ops.elementwise(FuncEnum.SOFTPLUS)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, X_beta, X_threshold)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.Softplus(beta=beta, threshold=threshold)
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
-
-    def _test_simple_function(self, input_size, function, test_name, copy_op=False):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            X2_op = ops.elementwise(function)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            x2_pt = TORCH_EQUIVALENTS[function](x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(
-                torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
-            )
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X_beta = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="beta",
+            value=beta,
+            is_input=True,
+        )
+        X_threshold = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="threshold",
+            value=threshold,
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.SOFTPLUS)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_beta, X_threshold)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.Softplus(beta=beta, threshold=threshold)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
+
+    def _test_simple_function(
+        self, input_size, function, test_name, copy_op=False, dtype="float16"
+    ):
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X2_op = ops.elementwise(function)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        x2_pt = TORCH_EQUIVALENTS[function](x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
 
     def _test_elu(
-        self,
-        input_size,
-        alpha=1.0,
-        test_name="elu",
-        copy_op=False,
+        self, input_size, alpha=1.0, test_name="elu", copy_op=False, dtype="float16"
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            X_alpha = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="alpha",
-                value=alpha,
-            )
-            X2_op = ops.elementwise(FuncEnum.ELU)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, X_alpha)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.ELU(alpha=alpha)
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X_alpha = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="alpha",
+            value=alpha,
+        )
+        X2_op = ops.elementwise(FuncEnum.ELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_alpha)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.ELU(alpha=alpha)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     def _test_softsign(
         self,
-        input_shape,
+        input_size,
         test_name="softsign",
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            X1 = Tensor(
-                shape=[IntImm(dim) for dim in input_shape],
-                dtype=dtype,
-                name="input",
-                is_input=True,
-            )
-            X2_op = ops.elementwise(FuncEnum.SOFTSIGN)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-
-            x1_pt = torch.randn(input_shape, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.Softsign()
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        X1 = Tensor(
+            shape=[IntImm(dim) for dim in input_size],
+            dtype=dtype,
+            name="input",
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.SOFTSIGN)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.Softsign()
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     def _test_celu(
         self,
@@ -334,58 +320,77 @@ def _test_celu(
         alpha=1.0,
         test_name="celu",
         copy_op=False,
+        dtype="float16",
     ):
-        for torch_dtype in TORCH_FP_DTYPES:
-            dtype = torch_dtype_to_string(torch_dtype)
-            assert len(input_size) == 2
-            X1 = Tensor(
-                shape=[IntImm(input_size[0]), IntImm(input_size[1])],
-                dtype=dtype,
-                name="input0",
-                is_input=True,
-            )
-            X_alpha = Tensor(
-                shape=[],
-                dtype=dtype,
-                name="alpha",
-                value=alpha,
-            )
-            X2_op = ops.elementwise(FuncEnum.CELU)
-            if copy_op:
-                X2_op = ops.elementwise(**X2_op._get_op_attributes())
-            X2 = X2_op(X1, X_alpha)
-            X2._attrs["is_output"] = True
-            X2._attrs["name"] = "output0"
-
-            target = detect_target()
-            module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
-            x1_pt = torch.randn(input_size, dtype=torch_dtype).cuda()
-            OP_pt = torch.nn.CELU(alpha=alpha)
-            x2_pt = OP_pt(x1_pt)
-
-            x2 = torch.empty_like(x2_pt)
-            module.run_with_tensors([x1_pt], [x2])
-            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
-
-    def test_lrelu(self):
-        self._test_leaky_relu([512, 512], test_name="leaky_relu_1")
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X_alpha = Tensor(
+            shape=[],
+            dtype=dtype,
+            name="alpha",
+            value=alpha,
+        )
+        X2_op = ops.elementwise(FuncEnum.CELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1, X_alpha)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.CELU(alpha=alpha)
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_lrelu(self, dtype):
+        self._test_leaky_relu([512, 512], test_name="leaky_relu_1", dtype=dtype)
         self._test_leaky_relu(
             [1024, 1024],
             negative_slope=0.5,
             test_name="leaky_relu_2",
+            dtype=dtype,
         )
         self._test_leaky_relu(
             [1024, 1024],
             negative_slope=0.5,
             test_name="leaky_relu_2_copy_op",
             copy_op=True,
+            dtype=dtype,
         )
-        self._test_leaky_relu([63, 63], test_name="leaky_relu_3")
-
-    def test_htanh(self):
-        self._test_hardtanh([511, 511], test_name="hard_tanh_1")
+        self._test_leaky_relu([63, 63], test_name="leaky_relu_3", dtype=dtype)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_htanh(self, dtype):
+        self._test_hardtanh([511, 511], test_name="hard_tanh_1", dtype=dtype)
         self._test_hardtanh(
-            [1024, 1024], min_val=-2, max_val=2, test_name="hard_tanh_2"
+            [1024, 1024], min_val=-2, max_val=2, test_name="hard_tanh_2", dtype=dtype
         )
         self._test_hardtanh(
             [1024, 1024],
@@ -393,116 +398,339 @@ def test_htanh(self):
             max_val=2,
             test_name="hard_tanh_2_copy_op",
             copy_op=True,
+            dtype=dtype,
         )
 
-    def test_softplus(self):
-        self._test_softplus([63, 63], test_name="softplus_1")
-        self._test_softplus([128, 128], beta=1.0, threshold=1.5, test_name="softplus_2")
-        self._test_softplus([128, 256], beta=2.0, threshold=0.5, test_name="softplus_3")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_softplus(self, dtype):
+        self._test_softplus([63, 63], test_name="softplus_1", dtype=dtype)
+        self._test_softplus(
+            [128, 128], beta=1.0, threshold=1.5, test_name="softplus_2", dtype=dtype
+        )
+        self._test_softplus(
+            [128, 256], beta=2.0, threshold=0.5, test_name="softplus_3", dtype=dtype
+        )
         self._test_softplus(
             [256, 128],
             beta=1.0,
             threshold=1.0,
             test_name="softplus_3_copy_op",
             copy_op=True,
+            dtype=dtype,
         )
 
-    def test_cos(self):
-        self._test_simple_function([511, 511], FuncEnum.COS, test_name="cos_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_cos(self, dtype):
+        self._test_simple_function(
+            [511, 511], FuncEnum.COS, test_name="cos_1", dtype=dtype
+        )
         self._test_simple_function(
-            [512, 512], FuncEnum.COS, test_name="cos_1_copy_op", copy_op=True
+            [512, 512],
+            FuncEnum.COS,
+            test_name="cos_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_sin(self):
-        self._test_simple_function([511, 511], FuncEnum.SIN, test_name="sin_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_sin(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.SIN, test_name="sin_1_copy_op", copy_op=True
+            [511, 511], FuncEnum.SIN, test_name="sin_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.SIN,
+            test_name="sin_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_tanh(self):
-        self._test_simple_function([512, 512], FuncEnum.TANH, test_name="tanh_1")
-        self._test_simple_function([1, 1], FuncEnum.TANH, test_name="tanh_2")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_tanh(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.TANH, test_name="tanh_1_copy_op", copy_op=True
+            [512, 512], FuncEnum.TANH, test_name="tanh_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [1, 1], FuncEnum.TANH, test_name="tanh_2", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.TANH,
+            test_name="tanh_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_sign(self):
-        self._test_simple_function([511, 511], FuncEnum.SIGN, test_name="sign_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_sign(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.SIGN, test_name="sign_1_copy_op", copy_op=True
+            [511, 511], FuncEnum.SIGN, test_name="sign_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.SIGN,
+            test_name="sign_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_abs(self):
-        self._test_simple_function([511, 511], FuncEnum.ABS, test_name="abs_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_abs(self, dtype):
+        self._test_simple_function(
+            [511, 511], FuncEnum.ABS, test_name="abs_1", dtype=dtype
+        )
         self._test_simple_function(
-            [512, 512], FuncEnum.ABS, test_name="abs_1_copy_op", copy_op=True
+            [512, 512],
+            FuncEnum.ABS,
+            test_name="abs_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_loge(self):
-        self._test_simple_function([511, 511], FuncEnum.LOGE, test_name="loge_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_loge(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.LOGE, test_name="loge_1_copy_op", copy_op=True
+            [511, 511], FuncEnum.LOGE, test_name="loge_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.LOGE,
+            test_name="loge_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_exp(self):
-        self._test_simple_function([511, 511], FuncEnum.EXP, test_name="exp_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_exp(self, dtype):
+        self._test_simple_function(
+            [511, 511], FuncEnum.EXP, test_name="exp_1", dtype=dtype
+        )
         self._test_simple_function(
-            [512, 512], FuncEnum.EXP, test_name="exp_1_copy_op", copy_op=True
+            [512, 512],
+            FuncEnum.EXP,
+            test_name="exp_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_sqrt(self):
-        self._test_simple_function([511, 511], FuncEnum.SQRT, test_name="sqrt_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_sqrt(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.SQRT, test_name="sqrt_1_copy_op", copy_op=True
+            [511, 511], FuncEnum.SQRT, test_name="sqrt_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.SQRT,
+            test_name="sqrt_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_sigmoid(self):
-        self._test_simple_function([511, 511], FuncEnum.SIGMOID, test_name="sigmoid_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_sigmoid(self, dtype):
         self._test_simple_function(
-            [512, 512], FuncEnum.SIGMOID, test_name="sigmoid_1_copy_op", copy_op=True
+            [511, 511], FuncEnum.SIGMOID, test_name="sigmoid_1", dtype=dtype
+        )
+        self._test_simple_function(
+            [512, 512],
+            FuncEnum.SIGMOID,
+            test_name="sigmoid_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_relu(self):
-        self._test_simple_function([511, 511], FuncEnum.RELU, test_name="relu_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_relu(self, dtype):
+        self._test_simple_function(
+            [511, 511], FuncEnum.RELU, test_name="relu_1", dtype=dtype
+        )
         self._test_simple_function(
-            [512, 512], FuncEnum.RELU, test_name="relu_1_copy_op", copy_op=True
+            [512, 512],
+            FuncEnum.RELU,
+            test_name="relu_1_copy_op",
+            copy_op=True,
+            dtype=dtype,
         )
 
-    def test_elu(self):
-        self._test_elu([63, 63], test_name="elu_1")
-        self._test_elu([128, 128], alpha=4.0, test_name="elu_2")
-        self._test_elu([128, 256], alpha=0.4, test_name="elu_3")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_elu(self, dtype):
+        self._test_elu([63, 63], test_name="elu_1", dtype=dtype)
+        self._test_elu([128, 128], alpha=4.0, test_name="elu_2", dtype=dtype)
+        self._test_elu([128, 256], alpha=0.4, test_name="elu_3", dtype=dtype)
         self._test_elu(
             [256, 128],
             alpha=1.0,
             test_name="elu_3_copy_op",
             copy_op=True,
+            dtype=dtype,
         )
 
-    def test_softsign(self):
-        self._test_softsign([63, 63], test_name="softsign_1")
-        self._test_softsign([128], test_name="softsign_2")
-        self._test_softsign([128], test_name="softsign_3", copy_op=True)
-        self._test_softsign([121, 128], test_name="softsign_4")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_softsign(self, dtype):
+        self._test_softsign(
+            [63, 63],
+            test_name="softsign_1",
+            dtype=dtype,
+        )
+        self._test_softsign(
+            [128],
+            test_name="softsign_2",
+            dtype=dtype,
+        )
+        self._test_softsign(
+            [128],
+            test_name="softsign_3",
+            copy_op=True,
+            dtype=dtype,
+        )
+        self._test_softsign(
+            [121, 128],
+            test_name="softsign_4",
+            dtype=dtype,
+        )
 
-    def test_floor_div(self):
-        self._test_floor_div([511, 511], test_name="floor_div_1")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_floor_div(self, dtype):
+        self._test_floor_div(
+            [511, 511],
+            test_name="floor_div_1",
+            dtype=dtype,
+        )
         self._test_floor_div(
             [1024, 1024],
             dividend=3,
             test_name="test_floor_div_2_copy_op",
             copy_op=True,
+            dtype=dtype,
         )
 
-    def test_celu(self):
-        self._test_celu([63, 63], alpha=1.0, test_name="celu_1")
-        self._test_celu([128, 128], alpha=4.0, test_name="celu_2")
-        self._test_celu([128, 256], alpha=0.4, test_name="celu_3")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_celu(self, dtype):
+        self._test_celu([63, 63], alpha=1.0, test_name="celu_1", dtype=dtype)
+        self._test_celu([128, 128], alpha=4.0, test_name="celu_2", dtype=dtype)
+        self._test_celu([128, 256], alpha=0.4, test_name="celu_3", dtype=dtype)
         self._test_celu(
-            [256, 128],
-            alpha=1.0,
-            test_name="celu_3_copy_op",
-            copy_op=True,
+            [256, 128], alpha=1.0, test_name="celu_3_copy_op", copy_op=True, dtype=dtype
         )
 
 
From 7007246d3039a72b72c329f9b0820c9ef45e8b4b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 10 Mar 2023 16:21:45 -0800
Subject: [PATCH 250/638] move a few compiler pass tests from a100 ci (#399)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/399

Reviewed By: muchulee8, alexanderguzhva

Differential Revision: D43966899

fbshipit-source-id: d8abfa4c9d1b535e912f70301f56068ff317eeec
---
 .../compiler/test_strided_view_cat.py         | 104 +++++++++++-------
 .../unittest/compiler/test_strided_view_op.py |  16 ---
 .../compiler/test_transform_special_op.py     |  26 ++++-
 .../unittest/compiler/test_view_strided_op.py |  15 +--
 4 files changed, 87 insertions(+), 74 deletions(-)

diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index ccf6ab607..b585e0771 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -22,8 +22,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target, test_utils
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import graph_utils
 from parameterized import param, parameterized
@@ -44,14 +46,6 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 expected_num_tensors=10,
                 expected_num_ops=9,
             ),
-            param(
-                "gemm_reshape_cat_fusible_expand_1",
-                n=2,
-                new_shape=[-1, 2, 1, 2],
-                cat_dim=3,
-                expected_num_tensors=10,
-                expected_num_ops=9,
-            ),
             param(
                 "gemm_reshape_cat_fusible_expand_2",
                 n=4,
@@ -92,32 +86,6 @@ class StridedViewCatOpTestCase(unittest.TestCase):
                 expected_num_tensors=14,
                 expected_num_ops=9,
             ),
-            param(
-                "gemm_reshape_cat_non_fusible_expand",
-                n=4,
-                new_shape=[-1, 4, 2, 2],
-                cat_dim=3,
-                expected_num_tensors=16,
-                expected_num_ops=9,
-            ),
-            param(
-                "gemm_reshape_cat_fusible_expand_float_1",
-                n=2,
-                new_shape=[-1, 2, 1, 2],
-                cat_dim=3,
-                expected_num_tensors=10,
-                expected_num_ops=9,
-                dtype="float",
-            ),
-            param(
-                "gemm_reshape_cat_non_fusible_expand_float",
-                n=4,
-                new_shape=[-1, 4, 2, 2],
-                cat_dim=3,
-                expected_num_tensors=16,
-                expected_num_ops=9,
-                dtype="float",
-            ),
         ],
         name_func=custom_name_func,
     )
@@ -130,10 +98,57 @@ def test_strided_gemm_view_cat_fusible(
         expected_num_tensors: int,
         expected_num_ops: int,
         dtype: str = "float16",
+    ):
+        self._test_strided_gemm_view_cat_fusible(
+            test_name,
+            n,
+            new_shape,
+            cat_dim,
+            expected_num_tensors,
+            expected_num_ops,
+            dtype,
+        )
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_strided_gemm_view_cat_fusible_dtype(self, dtype):
+        self._test_strided_gemm_view_cat_fusible(
+            f"gemm_reshape_cat_non_fusible_expand_{dtype}",
+            n=4,
+            new_shape=[-1, 4, 2, 2],
+            cat_dim=3,
+            expected_num_tensors=16,
+            expected_num_ops=9,
+            dtype=dtype,
+        )
+        self._test_strided_gemm_view_cat_fusible(
+            f"gemm_reshape_cat_fusible_expand_{dtype}",
+            n=2,
+            new_shape=[-1, 2, 1, 2],
+            cat_dim=3,
+            expected_num_tensors=10,
+            expected_num_ops=9,
+            dtype=dtype,
+        )
+
+    def _test_strided_gemm_view_cat_fusible(
+        self,
+        test_name: str,
+        n: int,
+        new_shape: List[int],
+        cat_dim: int,
+        expected_num_tensors: int,
+        expected_num_ops: int,
+        dtype: str = "float16",
     ):
         target = detect_target()
-        if dtype == "float" and (target.name() != "cuda" or int(target._arch) < 80):
-            self.skipTest("Only supported with CUDA >= 80")
 
         batch_dim = IntVar([1, 2, 3], "batch_size")
         input0 = test_utils.gen_input_tensor(
@@ -341,12 +356,17 @@ def _create_layernorm_sigmoid_mul(
                     f"batch_size: {batch_size}, z: {z}, z_pt: {z_pt}",
                 )
 
-    def test_strided_layernorm_view_cat_fusible(self):
-        self._test_strided_layernorm_view_cat_fusible()
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_strided_layernorm_view_cat_fusible_float(self):
-        self._test_strided_layernorm_view_cat_fusible(dtype="float")
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
+    def test_strided_layernorm_view_cat_fusible(self, dtype):
+        self._test_strided_layernorm_view_cat_fusible(dtype)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/compiler/test_strided_view_op.py b/tests/unittest/compiler/test_strided_view_op.py
index 6f841592a..d413b7a0f 100644
--- a/tests/unittest/compiler/test_strided_view_op.py
+++ b/tests/unittest/compiler/test_strided_view_op.py
@@ -172,10 +172,6 @@ def __init__(self, *args, **kwargs):
         super(StridedViewOpTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     @parameterized.expand(
         [
             param(f"single_gemm_{name}_fusion_{dtype}", func, dtype)
@@ -284,10 +280,6 @@ def test_single_bmm_and_multi_view_fusible(self, test_name, func, dtype):
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self._test_id += 1
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     @parameterized.expand(
         [
             param(
@@ -339,10 +331,6 @@ def test_single_op_and_view_fusible(
             self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
             self._test_id += 1
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     @parameterized.expand(
         [
             param(f"single_op_{name}_non_fusion_{dtype}", func, dtype)
@@ -469,10 +457,6 @@ def _test_two_parallel_views(self, dtype="float16"):
             self.assertTrue(torch.allclose(y1, y1_pt, atol=1e-2, rtol=1e-2))
             self.assertTrue(torch.allclose(y2, y2_pt, atol=1e-2, rtol=1e-2))
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
     def test_two_views(self):
         self._test_two_parallel_views()
         self._test_two_serial_view_outputs()
diff --git a/tests/unittest/compiler/test_transform_special_op.py b/tests/unittest/compiler/test_transform_special_op.py
index aed1a89b8..90528ce11 100644
--- a/tests/unittest/compiler/test_transform_special_op.py
+++ b/tests/unittest/compiler/test_transform_special_op.py
@@ -23,8 +23,10 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 from aitemplate.utils.graph_utils import get_sorted_ops
@@ -120,11 +122,17 @@ def test_small_nk_fp32(self):
             [100, 200], 6, 3, "test_small_nk_alignment_fp32", dtype="float32"
         )
 
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_small_nk_no_transform(self, dtype):
         target = detect_target()
-        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         M, K, N = 8, 8, 16
         _, _, output = self._create_gemm_rrr_graph(M, K, N, dtype)
@@ -228,11 +236,17 @@ def test_n1_k8_fp32(self):
         self._test_n1_k8(10, [8], 1, 8, dtype="float32")
         self._test_n1_k8(10, [8, 16], 1, 8, dtype="float32")
 
-    @parameterized.expand([("float16"), ("float32")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
     def test_n_non1_fail(self, dtype):
         target = detect_target()
-        if dtype == "float32" and (int(target._arch) < 80 or target.name == "rocm"):
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         B, M, K, N = 8, 8, 8, 8
         _, _, output = self._create_bmm_rcr_graph(B, M, K, N, dtype)
diff --git a/tests/unittest/compiler/test_view_strided_op.py b/tests/unittest/compiler/test_view_strided_op.py
index d5170a079..6f637eb81 100644
--- a/tests/unittest/compiler/test_view_strided_op.py
+++ b/tests/unittest/compiler/test_view_strided_op.py
@@ -23,6 +23,7 @@
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target, test_utils
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -396,11 +397,7 @@ def test_multi_view_and_multi_bmm_fusible(self):
         self._test_multi_view_and_multi_bmm_fusible()
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_multi_view_and_multi_bmm_fusible_float(self):
+    def test_multi_view_and_multi_bmm_fusible_fp32_sm80(self):
         self._test_multi_view_and_multi_bmm_fusible(dtype="float")
 
     @parameterized.expand(
@@ -585,13 +582,11 @@ def test_single_view_and_gemm_fusible(self):
         self._test_single_view_and_gemm_fusible()
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_single_view_and_gemm_fusible_float(self):
+    def test_single_view_and_gemm_fusible_fp32_sm80(self):
         self._test_single_view_and_gemm_fusible(dtype="float")
 
 
+filter_test_cases_by_test_env(ViewStridedOpTestCase)
+
 if __name__ == "__main__":
     unittest.main()

From 7e3eded95cd14db880ba7021736f318c84acf858 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Fri, 10 Mar 2023 18:04:53 -0800
Subject: [PATCH 251/638] export ScaledDotProductAttention from frontend.nn
 (#403)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/403

We need to export ScaledDotProductAttention. Otherwise
the converter would fail.

Reviewed By: alexanderguzhva

Differential Revision: D43988076

fbshipit-source-id: c0d2ecc1d6f0870878ac9ea4c29411c999604fc1
---
 python/aitemplate/frontend/nn/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index de3e91329..6c22f1a99 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -30,6 +30,7 @@
     CrossAttention,
     FlashAttention,
     MultiheadAttention,
+    ScaledDotProductAttention,
 )
 from aitemplate.frontend.nn.identity import Identity
 from aitemplate.frontend.nn.multiscale_attention import MultiScaleBlock

From 15a60b6d43b5c936f904c9c4a37fdaae2a6aedd4 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Fri, 10 Mar 2023 22:50:50 -0800
Subject: [PATCH 252/638] Move tests from A100 to V100 (#402)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/402

Moving tests from A100 to V100

Reviewed By: aakhundov

Differential Revision: D43985931

fbshipit-source-id: c1c2d343f32f0f160bf85880d989206fc83033ef
---
 tests/unittest/ops/test_conv.py               | 55 ++++++---------
 tests/unittest/ops/test_conv2d_bias_add.py    | 46 ++++++-------
 tests/unittest/ops/test_conv_bias.py          | 47 ++++++-------
 .../ops/test_conv_bias_act_few_channels.py    | 68 ++++++++-----------
 .../ops/test_conv_bias_add_hardswish.py       | 38 +++++------
 tests/unittest/ops/test_conv_bias_add_relu.py | 46 ++++++-------
 .../unittest/ops/test_conv_bias_hardswish.py  | 46 ++++++-------
 tests/unittest/ops/test_conv_bias_relu.py     | 47 ++++++-------
 tests/unittest/ops/test_conv_bias_sigmoid.py  | 47 ++++++-------
 9 files changed, 183 insertions(+), 257 deletions(-)

diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index 7cbd70a75..fbabf6fb8 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 class ConvTestCase(unittest.TestCase):
@@ -69,47 +75,24 @@ def _test_conv(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_fp16(self):
-        self._test_conv(
-            test_name="conv2d_fp16",
-            dtype="float16",
-        )
-        self._test_conv(
-            copy_op=True,
-            test_name="conv2d_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "fp32 is not supported by CUDA < SM80.",
     )
-    def test_conv2d_fp32(self):
-        self._test_conv(
-            test_name="conv2d_fp32",
-            dtype="float32",
-        )
+    def test_conv2d(self, dtype):
         self._test_conv(
-            copy_op=True,
-            test_name="conv2d_fp32_copy_op",
-            dtype="float32",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "bf16 is not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bf16(self):
-        self._test_conv(
-            test_name="conv2d_bf16",
-            dtype="bfloat16",
+            test_name=f"conv2d_{dtype}",
+            dtype=dtype,
         )
         self._test_conv(
             copy_op=True,
-            test_name="conv2d_bf16_copy_op",
-            dtype="bfloat16",
+            test_name=f"conv2d_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv2d_bias_add.py b/tests/unittest/ops/test_conv2d_bias_add.py
index 69adeafa5..24e232f23 100644
--- a/tests/unittest/ops/test_conv2d_bias_add.py
+++ b/tests/unittest/ops/test_conv2d_bias_add.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -92,37 +98,23 @@ def _test_conv_bias_add(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_bias_add_fp16(self):
-        self._test_conv_bias_add(
-            test_name="conv2d_bias_add_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_add(
-            copy_op=True,
-            test_name="conv2d_bias_add_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float"), ("bfloat16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_add_fp32(self):
+    def test_conv2d_bias_add(self, dtype):
         self._test_conv_bias_add(
-            test_name="conv2d_bias_add_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_add_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "bf16 is not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_add_bf16(self):
         self._test_conv_bias_add(
-            test_name="conv2d_bias_add_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_add_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index 94d541961..b6f18ec08 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 class ConvBiasTestCase(unittest.TestCase):
@@ -82,37 +88,24 @@ def _test_conv_bias(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_bias_fp16(self):
-        self._test_conv_bias(
-            test_name="conv2d_bias_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias(
-            copy_op=True,
-            test_name="conv2d_bias_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_fp32(self):
+    def test_conv2d_bias(self, dtype):
         self._test_conv_bias(
-            test_name="conv2d_bias_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_bf16(self):
         self._test_conv_bias(
-            test_name="conv2d_bias_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_act_few_channels.py b/tests/unittest/ops/test_conv_bias_act_few_channels.py
index 7c91ad840..1f2329248 100644
--- a/tests/unittest/ops/test_conv_bias_act_few_channels.py
+++ b/tests/unittest/ops/test_conv_bias_act_few_channels.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 def hard_swish(x):
@@ -87,31 +93,23 @@ def _test_conv_bias_relu_few_channels(
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_relu_fp16(self):
-        self._test_conv_bias_relu_few_channels(
-            test_name="conv_bias_relu_few_channels_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_relu_few_channels(
-            copy_op=True,
-            test_name="conv_bias_relu_few_channels_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_relu_fp32(self):
+    def test_relu(self, dtype):
         self._test_conv_bias_relu_few_channels(
-            test_name="conv_bias_relu_few_channels_fp32",
-            dtype="float32",
+            test_name=f"conv_bias_relu_few_channels_{dtype}",
+            dtype=dtype,
         )
         self._test_conv_bias_relu_few_channels(
             copy_op=True,
-            test_name="conv_bias_relu_few_channels_fp32_copy_op",
-            dtype="float32",
+            test_name="conv_bias_relu_few_channels_{dtype}_copy_op",
+            dtype=dtype,
         )
 
     def _test_conv_bias_hardswish_few_channels(
@@ -172,31 +170,23 @@ def _test_conv_bias_hardswish_few_channels(
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_hardswish_fp16(self):
-        self._test_conv_bias_hardswish_few_channels(
-            test_name="conv_bias_hardswish_few_channels_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_hardswish_few_channels(
-            copy_op=True,
-            test_name="conv_bias_hardswish_few_channels_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_hardswish_fp32(self):
+    def test_hardswish(self, dtype):
         self._test_conv_bias_hardswish_few_channels(
-            test_name="conv_bias_hardswish_few_channels_fp32",
-            dtype="float32",
+            test_name=f"conv_bias_hardswish_few_channels_{dtype}",
+            dtype=dtype,
         )
         self._test_conv_bias_hardswish_few_channels(
             copy_op=True,
-            test_name="conv_bias_hardswish_few_channels_fp32_copy_op",
-            dtype="float32",
+            test_name=f"conv_bias_hardswish_few_channels_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_add_hardswish.py b/tests/unittest/ops/test_conv_bias_add_hardswish.py
index ff7ce64a0..a769366ec 100644
--- a/tests/unittest/ops/test_conv_bias_add_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_add_hardswish.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 def hard_swish(x):
@@ -91,31 +97,23 @@ def _test_conv_bias_add_hardswish(
         else:
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_fp16(self):
-        self._test_conv_bias_add_hardswish(
-            test_name="conv2d_bias_add_hardswish_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_add_hardswish(
-            copy_op=True,
-            test_name="conv2d_bias_add_hardswish_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_fp32(self):
+    def test_conv_bias_add_hardswish(self, dtype):
         self._test_conv_bias_add_hardswish(
-            test_name="conv2d_bias_add_hardswish_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_add_hardswish_{dtype}",
+            dtype=dtype,
         )
         self._test_conv_bias_add_hardswish(
             copy_op=True,
-            test_name="conv2d_bias_add_hardswish_fp32_copy_op",
-            dtype="float32",
+            test_name="conv2d_bias_add_hardswish_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index 8fd22026b..1b4545bcc 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -93,37 +99,23 @@ def _test_conv_bias_add_relu(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_bias_add_relu_fp16(self):
-        self._test_conv_bias_add_relu(
-            test_name="conv2d_bias_add_relu_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_add_relu(
-            copy_op=True,
-            test_name="conv2d_bias_add_relu_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_add_relu_fp32(self):
+    def test_conv2d_bias_add_relu(self, dtype):
         self._test_conv_bias_add_relu(
-            test_name="conv2d_bias_add_relu_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_add_relu_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "bf16 is not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_add_relu_bf16(self):
         self._test_conv_bias_add_relu(
-            test_name="conv2d_bias_add_relu_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_add_relu_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_hardswish.py b/tests/unittest/ops/test_conv_bias_hardswish.py
index 4c138a2fc..0809c1e45 100644
--- a/tests/unittest/ops/test_conv_bias_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_hardswish.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 def hard_swish(x):
@@ -89,37 +95,23 @@ def _test_conv_bias_hardswish(
         elif dtype == "bfloat16":
             torch.testing.assert_close(Y_pt, y_transpose, atol=1, rtol=1)
 
-    def test_conv2d_bias_hardswish_fp16(self):
-        self._test_conv_bias_hardswish(
-            test_name="conv2d_bias_hardswish_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_hardswish(
-            copy_op=True,
-            test_name="conv2d_bias_hardswish_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_hardswish_fp32(self):
+    def test_conv2d_bias_hardswish(self, dtype):
         self._test_conv_bias_hardswish(
-            test_name="conv2d_bias_hardswish_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_hardswish_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_hardswish_bf16(self):
         self._test_conv_bias_hardswish(
-            test_name="conv2d_bias_hardswish_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_hardswish_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_relu.py b/tests/unittest/ops/test_conv_bias_relu.py
index f01063f6d..8ac41e071 100644
--- a/tests/unittest/ops/test_conv_bias_relu.py
+++ b/tests/unittest/ops/test_conv_bias_relu.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 class ConvBiasReluTestCase(unittest.TestCase):
@@ -85,37 +91,24 @@ def _test_conv_bias_relu(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_bias_relu_fp16(self):
-        self._test_conv_bias_relu(
-            test_name="conv2d_bias_relu_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_relu(
-            copy_op=True,
-            test_name="conv2d_bias_relu_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_relu_fp32(self):
+    def test_conv2d_bias_relu(self, dtype):
         self._test_conv_bias_relu(
-            test_name="conv2d_bias_relu_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_relu_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "bf16 is not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_relu_bf16(self):
         self._test_conv_bias_relu(
-            test_name="conv2d_bias_relu_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_relu_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_conv_bias_sigmoid.py b/tests/unittest/ops/test_conv_bias_sigmoid.py
index a9ff2be1d..206956534 100644
--- a/tests/unittest/ops/test_conv_bias_sigmoid.py
+++ b/tests/unittest/ops/test_conv_bias_sigmoid.py
@@ -19,7 +19,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+
+from parameterized import parameterized
 
 
 class ConvBiasSigmoidTestCase(unittest.TestCase):
@@ -79,37 +85,24 @@ def _test_conv_bias_sigmoid(
         else:
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
-    def test_conv2d_bias_sigmoid_fp16(self):
-        self._test_conv_bias_sigmoid(
-            test_name="conv2d_bias_sigmoid_fp16",
-            dtype="float16",
-        )
-        self._test_conv_bias_sigmoid(
-            copy_op=True,
-            test_name="conv2d_bias_sigmoid_fp16_copy_op",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
     )
-    def test_conv2d_bias_sigmoid_fp32(self):
+    def test_conv2d_bias_sigmoid(self, dtype):
         self._test_conv_bias_sigmoid(
-            test_name="conv2d_bias_sigmoid_fp32",
-            dtype="float32",
+            test_name=f"conv2d_bias_sigmoid_{dtype}",
+            dtype=dtype,
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "bf16 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_conv2d_bias_sigmoid_bf16(self):
         self._test_conv_bias_sigmoid(
-            test_name="conv2d_bias_sigmoid_bf16",
-            dtype="bfloat16",
+            copy_op=True,
+            test_name=f"conv2d_bias_sigmoid_{dtype}_copy_op",
+            dtype=dtype,
         )
 
 
From 19bcf97d5a85c295f664e63138c741c554fcfaa7 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Sat, 11 Mar 2023 01:50:36 -0800
Subject: [PATCH 253/638] enable torchscripting (#410)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/410

- map the operator.getitem since it is in scripting block list

Reviewed By: wushirong

Differential Revision: D43986271

fbshipit-source-id: 037db53f89a7b863ef0fbaa7b94425fd9a08dc96
---
 fx2ait/fx2ait/converters/ait_converters.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 29681278e..cfd2ea268 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -524,6 +524,7 @@ def acc_ops_unbind(
     return res
 
 
+@ait_converter(operator.getitem)
 @ait_converter(acc_ops.getitem)
 def acc_ops_getitem(
     target: Target,
@@ -531,8 +532,8 @@ def acc_ops_getitem(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    input_val = kwargs["input"]
-    idx = kwargs["idx"]
+    input_val = kwargs["input"] if "input" in kwargs else args[0]
+    idx = kwargs["idx"] if "idx" in kwargs else args[1]
     if isinstance(idx, Sequence) and any(isinstance(x, Sequence) for x in idx):
         count = 0
         dim = None
@@ -581,11 +582,12 @@ def acc_ops_getitem(
     if isinstance(input_val, AITTensor):
         return acc_ops_slice(target, args, kwargs, name)
 
-    if isinstance(kwargs["idx"], int):
+    idx_org = kwargs["idx"] if "idx" in kwargs else args[1]
+    if isinstance(idx_org, int):
         idx = get_positive_dim(idx, len(input_val))
 
     if all(isinstance(i, IntImm) for i in input_val):
-        return operator.getitem(input_val, kwargs["idx"])
+        return operator.getitem(input_val, idx_org)
     else:
         return getitem()(input_val, idx)
 

From e0c2d3ddcd8a1ba79a0026c5a8e151102a98e629 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Sat, 11 Mar 2023 22:04:20 -0800
Subject: [PATCH 254/638] fix getitem issue (#411)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/411

Fix an issue in getitem. The operator.getitem stores the param in `args` so we need to copy it into `kwargs`. In this way, the downstream operator like slice are invoked in getitem and can use the `kwargs` w/o modifications.
If this diff is landed, we do not have to land D44005383 for reversion.

Reviewed By: tissue3

Differential Revision: D44007352

fbshipit-source-id: a88c641e98b2a5f70161e4e866e2bfc52de69155
---
 fx2ait/fx2ait/converters/ait_converters.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index cfd2ea268..861539581 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -532,8 +532,15 @@ def acc_ops_getitem(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    input_val = kwargs["input"] if "input" in kwargs else args[0]
-    idx = kwargs["idx"] if "idx" in kwargs else args[1]
+    # operator.getitem does not have kwargs. We copy args to kwargs so the downstream like acc_ops_slice can use it.
+    new_kwargs = dict(kwargs)
+    if "input" not in kwargs:
+        new_kwargs["input"] = args[0]
+    if "idx" not in kwargs:
+        new_kwargs["idx"] = args[1]
+    kwargs = new_kwargs
+    input_val = kwargs["input"]
+    idx = kwargs["idx"]
     if isinstance(idx, Sequence) and any(isinstance(x, Sequence) for x in idx):
         count = 0
         dim = None
@@ -582,12 +589,11 @@ def acc_ops_getitem(
     if isinstance(input_val, AITTensor):
         return acc_ops_slice(target, args, kwargs, name)
 
-    idx_org = kwargs["idx"] if "idx" in kwargs else args[1]
-    if isinstance(idx_org, int):
+    if isinstance(kwargs["idx"], int):
         idx = get_positive_dim(idx, len(input_val))
 
     if all(isinstance(i, IntImm) for i in input_val):
-        return operator.getitem(input_val, idx_org)
+        return operator.getitem(input_val, kwargs["idx"])
     else:
         return getitem()(input_val, idx)
 

From 53008def993c814023362908a5f16e72c9e7efc5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Sat, 11 Mar 2023 22:21:21 -0800
Subject: [PATCH 255/638] move more compiler tests from a100 (#412)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/412

att

Reviewed By: wushirong

Differential Revision: D43997389

fbshipit-source-id: c60b521a08ac8a7b4576929e4d0da287eb2a925c
---
 .../compiler/test_split_bmm_fusion.py         | 17 ++++++----
 .../compiler/test_split_view_strided.py       |  9 +++--
 .../compiler/test_strided_op_cat_pattern.py   | 33 +++++--------------
 .../compiler/test_strided_reshape_cat.py      |  9 +++--
 4 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/tests/unittest/compiler/test_split_bmm_fusion.py b/tests/unittest/compiler/test_split_bmm_fusion.py
index c3921cbd8..741c0f5f3 100644
--- a/tests/unittest/compiler/test_split_bmm_fusion.py
+++ b/tests/unittest/compiler/test_split_bmm_fusion.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
     has_op,
@@ -30,6 +31,10 @@
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class SplitBmmFusionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
     def _test_split_bmm_rcr_fusion(
         self,
         bmm_rcr_op,
@@ -290,17 +295,13 @@ def _test_split_bmm_rcr_fusion_qkv(
 
         y = get_torch_empty_tensor(y_pt.size(), dtype)
         module.run_with_tensors({"input0": a}, [y])
-        self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+        torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
 
-    def test_split_bmm_rcr_fusion_qkv(self):
+    def test_split_bmm_rcr_fusion_qkv_sm80(self):
         self._test_split_bmm_rcr_fusion_qkv(3, 4096, 4096, 512, 1, 1)
         self._test_split_bmm_rcr_fusion_qkv(3 * 16, 1024, 1024, 256, 16, 16)
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_split_bmm_fusion_float(self):
+    def test_split_bmm_fusion_fp32_sm80(self):
         # bmm_rcr (K with an odd value) with padding:
         # in this case, split and bmm_rcr are not going to be fused actually because
         # of the padding applied to bmm_rcr.
@@ -379,5 +380,7 @@ def test_split_bmm_fusion_float(self):
         self._test_split_bmm_rcr_fusion_qkv(3 * 16, 10, 10, 8, 16, 16, dtype="float")
 
 
+filter_test_cases_by_test_env(SplitBmmFusionTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_split_view_strided.py b/tests/unittest/compiler/test_split_view_strided.py
index 62bcd6e7f..b11946f46 100644
--- a/tests/unittest/compiler/test_split_view_strided.py
+++ b/tests/unittest/compiler/test_split_view_strided.py
@@ -21,6 +21,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target, test_utils
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -177,11 +178,7 @@ def test_split_view_bmm_rcr_fusion(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_split_view_bmm_rcr_fusion_float(self):
+    def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
         b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
         m_dim = shape_utils.gen_int_var([100, 200], "emb_pool_size")
 
@@ -219,5 +216,7 @@ def test_split_view_bmm_rcr_fusion_float(self):
         )
 
 
+filter_test_cases_by_test_env(SplitViewStridedOpTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_op_cat_pattern.py b/tests/unittest/compiler/test_strided_op_cat_pattern.py
index 36de11705..8245b9e0a 100644
--- a/tests/unittest/compiler/test_strided_op_cat_pattern.py
+++ b/tests/unittest/compiler/test_strided_op_cat_pattern.py
@@ -28,6 +28,7 @@
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -541,11 +542,7 @@ def test_gemm(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_float(self):
+    def test_gemm_fp32_sm80(self):
         self._fused_gemm_e2e_helper(m=1024, k=256, n1=5, n2=32, n3=4, dtype="float")
         self._fused_gemm_e2e_helper(
             m=1024, k=256, n1=8, n2=16, n3=32, m2=8, cat_dim=2, dtype="float"
@@ -682,11 +679,7 @@ def test_gemm_alignment(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_alignment_float(self):
+    def test_gemm_alignment_fp32_sm80(self):
         self._fused_gemm_alignment_e2e_helper(
             gemm_op=ops.gemm_rcr_bias_add(), input_n=1, m=2, k=2, n=4, dtype="float"
         )
@@ -729,12 +722,8 @@ def test_gemm_update_epilogue_alignment(self):
                 os.environ["FORCE_PROFILE"] = old_force_ci
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
     # Tests to ensure that we correctly update epilogue alignment values
-    def test_gemm_update_epilogue_alignment_float(self):
+    def test_gemm_update_epilogue_alignment_fp32_sm80(self):
         # Note that we have to force profiling in ci. Otherwise, we would not
         # be able to fetch cached config.
         target = detect_target()
@@ -1611,11 +1600,7 @@ def test_bmm_crr_add_cat_fusion(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_cat_fusion_float(self):
+    def test_bmm_cat_fusion_fp32_sm80(self):
         self._test_bmm_xxx_cat_fusion(
             B=1,
             M=8,
@@ -1810,11 +1795,7 @@ def test_bmm_rcr_update_epilogue_alignment(self):
         self._test_bmm_rcr_update_epilogue_alignment_common()
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_bmm_rcr_update_epilogue_alignment_float(self):
+    def test_bmm_rcr_update_epilogue_alignment_fp32_sm80(self):
         self._test_bmm_rcr_update_epilogue_alignment_common(dtype="float")
 
     def _test_reduce_cat_fusion_1(
@@ -2407,5 +2388,7 @@ def test_reduce_cat_float(self):
         self._test_strided_op_multiple_cats_2(dtype="float")
 
 
+filter_test_cases_by_test_env(StridedOpCatPatternTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_strided_reshape_cat.py b/tests/unittest/compiler/test_strided_reshape_cat.py
index fec28a2c8..344b6d9d3 100644
--- a/tests/unittest/compiler/test_strided_reshape_cat.py
+++ b/tests/unittest/compiler/test_strided_reshape_cat.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -250,15 +251,13 @@ def test_strided_reshape_cat(self):
         self._test_strided_reshape_cat_bias()
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_strided_reshape_cat_float(self):
+    def test_strided_reshape_cat_fp32_sm80(self):
         self._test_strided_reshape_cat(num_cat_ops=2, dtype="float")
         self._test_strided_reshape_cat_bias(dtype="float")
 
 
+filter_test_cases_by_test_env(StridedReshapeCatTestCase)
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From dc0dca9b91c3820fd86cc04d82cd533972f84e64 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Sat, 11 Mar 2023 22:21:21 -0800
Subject: [PATCH 256/638] move test_slice_view_strided from a100 (#413)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/413

Reviewed By: wushirong

Differential Revision: D43999167

fbshipit-source-id: a1ebd3d5b967a1f3991f1728c88d0168e454a72f
---
 .../compiler/test_slice_view_strided.py       | 86 +++++++++++++------
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index 6c3267ad4..af91821ad 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -21,22 +21,27 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target, test_utils
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import graph_utils
 
 from parameterized import parameterized
 
 
-SKIP_FLOAT = detect_target().name() == "rocm" or int(detect_target()._arch) < 80
-
-
 class SliceViewStridedOpTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_view_gemm_fusible(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
 
@@ -87,10 +92,16 @@ def test_slice_view_gemm_fusible(self, dtype):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_view_gemm_non_fusible(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
@@ -123,7 +134,7 @@ def test_slice_view_gemm_non_fusible(self, dtype):
             input0_pt = get_random_torch_tensor([batch_size, N, 2 * N], dtype)
             x0_pt = input0_pt[:, :, :N]
             x1_pt = torch.reshape(x0_pt, [-1, N * N])
-            input1_pt = get_random_torch_tensor([N, N * N], dtype)
+            input1_pt = get_random_torch_tensor([N, N * N], dtype) * 0.5
             y_pt = torch.nn.functional.linear(x1_pt, input1_pt)
             y = get_torch_empty_tensor(y_pt.shape, dtype)
 
@@ -137,15 +148,18 @@ def test_slice_view_gemm_non_fusible(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
-
-    @parameterized.expand([("float16"), ("float")])
+            torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_flatten_concat_fusible_1(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         test_name = f"slice_flatten_concat_fusible_{dtype}"
         batch_dim = IntVar([3, 10], "batch_size")
@@ -219,10 +233,16 @@ def test_slice_flatten_concat_fusible_1(self, dtype):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_flatten_concat_fusible_2(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         test_name = f"slice_flatten_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 2], "batch_size")
@@ -290,10 +310,16 @@ def test_slice_flatten_concat_fusible_2(self, dtype):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_reshape_concat_fusible_1(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         test_name = f"slice_reshape_concat_fusible_{dtype}_1"
         batch_dim = IntVar([1, 2], "batch_size")
@@ -360,10 +386,16 @@ def test_slice_reshape_concat_fusible_1(self, dtype):
                 f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
             )
 
-    @parameterized.expand([("float16"), ("float")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
+                TestEnv.ROCM: [],
+            }
+        )
+    )
     def test_slice_reshape_concat_fusible_2(self, dtype):
-        if dtype == "float" and SKIP_FLOAT:
-            self.skipTest("gemm with float tensors requires CUDA sm >= 80")
 
         test_name = "slice_reshape_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 8], "batch_size")

From 611d4c24d4d1de8afd0086d6652da527feef7be7 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 12 Mar 2023 16:53:26 -0700
Subject: [PATCH 257/638] Return test_gemm_permute to A100 (#415)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/415

ATT

Reviewed By: alexanderguzhva

Differential Revision: D44010903

fbshipit-source-id: 905527d7eb6378125dff1310d4bcbb6dac1d7f8a
---
 tests/unittest/ops/test_gemm_permute.py | 133 +++++++++++++++---------
 1 file changed, 85 insertions(+), 48 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_permute.py b/tests/unittest/ops/test_gemm_permute.py
index 13fcd34bf..e4dad2d49 100644
--- a/tests/unittest/ops/test_gemm_permute.py
+++ b/tests/unittest/ops/test_gemm_permute.py
@@ -27,9 +27,25 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class GEMMTestCase(unittest.TestCase):
+class GEMMPermuteTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(GEMMPermuteTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
     def _test_rcr(
-        self, ms, k, n, shape, test_name, has_bias=False, copy_op=False, dtype="float16"
+        self,
+        ms,
+        k,
+        n,
+        shape,
+        test_name,
+        has_bias=False,
+        copy_op=False,
+        dtype="float16",
     ):
         target = detect_target()
         X = Tensor(
@@ -52,7 +68,8 @@ def _test_rcr(
             Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for m in ms:
             X_pt = get_random_torch_tensor([m, k], dtype)
@@ -80,7 +97,7 @@ def test_rcr(self):
                     32,
                     96,
                     (5, 3, 2),
-                    "permute1",
+                    f"test_rcr_float16_{has_bias}_{copy_op}_1",
                     has_bias=has_bias,
                     copy_op=copy_op,
                 )
@@ -89,7 +106,7 @@ def test_rcr(self):
                     64,
                     256,
                     (8, 4, 4),
-                    "permute2",
+                    f"test_rcr_float16_{has_bias}_{copy_op}_2",
                     has_bias=has_bias,
                     copy_op=copy_op,
                 )
@@ -127,7 +144,8 @@ def _test_rcr_0213(
             Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"gemm_rcr_{test_name}")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for m in ms:
             X_pt = get_random_torch_tensor([m, k], dtype)
@@ -168,7 +186,7 @@ def test_rcr_0213(self):
             256,
             4000000,
             [54, 1000000],
-            "permute_0213_1",
+            "test_rcr_0213_float16_1",
             has_bias=False,
             copy_op=False,
             layout="0213",
@@ -178,13 +196,22 @@ def test_rcr_0213(self):
             256,
             300000,
             [29, 100000],
-            "permute_0213_2",
+            "test_rcr_0213_float16_2",
             has_bias=False,
             copy_op=False,
             layout="0213",
         )
 
-    def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False, dtype="float16"):
+    def _test_rrr(
+        self,
+        ms,
+        k,
+        n,
+        shape,
+        test_name,
+        copy_op=False,
+        dtype="float16",
+    ):
         target = detect_target()
         X = Tensor(
             shape=[shape_utils.gen_int_var_min_max(ms), k],
@@ -199,7 +226,8 @@ def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False, dtype="float16"):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "gemm_rrr_{}".format(test_name))
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
 
         for m in ms:
             X_pt = get_random_torch_tensor([m, k], dtype)
@@ -213,47 +241,61 @@ def _test_rrr(self, ms, k, n, shape, test_name, copy_op=False, dtype="float16"):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     def test_rrr(self):
-        self._test_rrr([80], 32, 96, (5, 3, 2), "permute1")
-        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2")
-        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_copy_op", copy_op=True)
+        self._test_rrr(
+            [80],
+            32,
+            96,
+            (5, 3, 2),
+            "test_rrr_float16_1",
+        )
+        self._test_rrr(
+            [128],
+            64,
+            256,
+            (8, 4, 4),
+            "test_rrr_float16_2",
+        )
+        self._test_rrr(
+            [128],
+            64,
+            256,
+            (8, 4, 4),
+            "test_rrr_float16_2_copy_op",
+            copy_op=True,
+        )
 
     @unittest.skipIf(
         detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
         "Not supported by CUDA < SM80.",
     )
-    def test_permute_float(self):
+    def test_permute_float32(self):
         for has_bias in (True, False):
-            for copy_op in (True, False):
-                self._test_rcr(
-                    [80],
-                    32,
-                    96,
-                    (5, 3, 2),
-                    "permute1_float",
-                    has_bias=has_bias,
-                    copy_op=copy_op,
-                    dtype="float",
-                )
+            self._test_rcr(
+                [80],
+                32,
+                96,
+                (5, 3, 2),
+                f"test_rcr_float32_{has_bias}",
+                has_bias=has_bias,
+                dtype="float32",
+            )
         self._test_rcr_0213(
             [29, 29 * 8],
             256,
             300000,
             [29, 100000],
-            "permute_0213_2_float",
+            "test_rcr_0213_float32",
             has_bias=False,
-            copy_op=False,
             layout="0213",
-            dtype="float",
+            dtype="float32",
         )
-        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_float", dtype="float")
         self._test_rrr(
             [128],
             64,
             256,
             (8, 4, 4),
-            "permute2_copy_op_float",
-            copy_op=True,
-            dtype="float",
+            "test_rrr_float32",
+            dtype="float32",
         )
 
     @unittest.skipIf(
@@ -262,36 +304,31 @@ def test_permute_float(self):
     )
     def test_gemm_permute_bfloat16(self):
         for has_bias in (True, False):
-            for copy_op in (True, False):
-                self._test_rcr(
-                    [80],
-                    32,
-                    96,
-                    (5, 3, 2),
-                    "permute1_bfloat16",
-                    has_bias=has_bias,
-                    copy_op=copy_op,
-                    dtype="bfloat16",
-                )
+            self._test_rcr(
+                [80],
+                32,
+                96,
+                (5, 3, 2),
+                f"test_rcr_bfloat16_{has_bias}",
+                has_bias=has_bias,
+                dtype="bfloat16",
+            )
         self._test_rcr_0213(
             [29, 29 * 8],
             256,
             300000,
             [29, 100000],
-            "permute_0213_2_bfloat16",
+            "test_rcr_0213_bfloat16",
             has_bias=False,
-            copy_op=False,
             layout="0213",
             dtype="bfloat16",
         )
-        self._test_rrr([128], 64, 256, (8, 4, 4), "permute2_bfloat16", dtype="bfloat16")
         self._test_rrr(
             [128],
             64,
             256,
             (8, 4, 4),
-            "permute2_copy_op_bfloat16",
-            copy_op=True,
+            "test_rrr_bfloat16",
             dtype="bfloat16",
         )
 

From 6f2d8ad2619d5cee87ed7d9bbff6ba3f2d568188 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Mon, 13 Mar 2023 10:25:05 -0700
Subject: [PATCH 258/638] Move tests from A100 to V100 (#419)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/419

Reviewed By: muchulee8

Differential Revision: D43990177

fbshipit-source-id: 1b957f4a399c81a1a8dd67e1d89eb9421065ab40
---
 tests/unittest/ops/test_perm021fc_ccr.py      | 21 +++++++------------
 tests/unittest/ops/test_perm021fc_ccr_bias.py | 21 +++++++------------
 tests/unittest/ops/test_perm021fc_crc.py      | 21 +++++++------------
 3 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/tests/unittest/ops/test_perm021fc_ccr.py b/tests/unittest/ops/test_perm021fc_ccr.py
index 17daa41b5..4061fdb60 100644
--- a/tests/unittest/ops/test_perm021fc_ccr.py
+++ b/tests/unittest/ops/test_perm021fc_ccr.py
@@ -25,10 +25,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm021FCCCRTestCase(unittest.TestCase):
     def _test_perm021fc_ccr(
         self,
@@ -77,22 +79,12 @@ def test_perm021fc_ccr_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_ccr_fp32(self):
+    def test_perm021fc_ccr_float32_sm80(self):
         self._test_perm021fc_ccr(
             test_name="perm021fc_ccr_fp32",
             dtype="float32",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"bf16 BMM not supported in {detect_target()._arch}",
-    )
     def test_perm021fc_ccr_bf16(self):
         self._test_perm021fc_ccr(
             test_name="perm021fc_ccr_bf16",
@@ -100,6 +92,9 @@ def test_perm021fc_ccr_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(Perm021FCCCRTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias.py b/tests/unittest/ops/test_perm021fc_ccr_bias.py
index dbf91b245..b4e8525d6 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias.py
@@ -25,10 +25,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm021FCCCRBiasTestCase(unittest.TestCase):
     def _test_perm021fc_ccr_bias(
         self,
@@ -88,22 +90,12 @@ def test_perm021fc_ccr_bias_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_ccr_bias_fp32(self):
+    def test_perm021fc_ccr_bias_float32_sm80(self):
         self._test_perm021fc_ccr_bias(
             test_name="perm021fc_ccr_bias_fp32",
             dtype="float32",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"bf16 BMM not supported in {detect_target()._arch}",
-    )
     def test_perm021fc_ccr_bias_bf16(self):
         self._test_perm021fc_ccr_bias(
             test_name="perm021fc_ccr_bias_bf16",
@@ -111,6 +103,9 @@ def test_perm021fc_ccr_bias_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(Perm021FCCCRBiasTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_perm021fc_crc.py b/tests/unittest/ops/test_perm021fc_crc.py
index aac752c33..cc9b87215 100644
--- a/tests/unittest/ops/test_perm021fc_crc.py
+++ b/tests/unittest/ops/test_perm021fc_crc.py
@@ -25,10 +25,12 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm021FCCRCTestCase(unittest.TestCase):
     def _test_perm021fc_crc(
         self,
@@ -79,22 +81,12 @@ def test_perm021fc_crc_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_crc_fp32(self):
+    def test_perm021fc_crc_float32_sm80(self):
         self._test_perm021fc_crc(
             test_name="perm021fc_crc_fp32",
             dtype="float32",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"bf16 BMM not supported in {detect_target()._arch}",
-    )
     def test_perm021fc_crc_bf16(self):
         self._test_perm021fc_crc(
             test_name="perm021fc_crc_bf16",
@@ -102,6 +94,9 @@ def test_perm021fc_crc_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(Perm021FCCRCTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From d7e152fa1d5f12587f007e6724e38cd4d8607d85 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 13 Mar 2023 13:04:01 -0700
Subject: [PATCH 259/638] Extend jagged tensor support (#367)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/367

This diff extends the initial fx2ait functionality for handling the dynamic dimension of the jagged Tensor inputs to the AIT model subgraphs. Specifically, the main contributions are:

1. Multiple different fbgemm jagged ops are now detected besides `jagged_to_padded_dense`, with the respective arg position and kwarg name of the jagged Tensor parameter.

2. Jagged offsets Tensors, which are also parameters to fbgemm jagged ops and have a different dynamic dimension than the `batch_size` (normally: `batch_size` + 1) are now also detected and processed. A separate pass is added for this.

3. Different jagged Tensors in the model can have different batch dimensions: e.g., because they represent different kinds of sequential data for the same users in the batch. To this end, the `IntVar` names of the batch dimensions having different values in the inputs are set differently (otherwise the AIT runtime would interpret all those dynamic dimensions as equal, which is problematic).

Reviewed By: wushirong

Differential Revision: D43816291

fbshipit-source-id: 77f3dce76fd5407b08826f67213d8299d9d48542
---
 fx2ait/fx2ait/tensor_spec.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index b8337f163..4a9594b4b 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 import logging
-from typing import Any, List
+from typing import Any, List, Set
 
 import torch
 from aitemplate.compiler.public import IntImm, IntVar
@@ -196,8 +196,9 @@ def from_input_list_with_batch_size_jagged_tensor(
         cls,
         inputs: List[torch.Tensor],
         max_batch_size: int,
-        max_batch_size_jagged_tensor: int,
-        tag_val=None,
+        max_sequence_length: int,
+        jagged_tensor_batch_dims: Set[int],
+        jagged_offsets_batch_dims: Set[int],
     ) -> List["TensorSpec"]:
         """
         Most of the recommendation models will work fine using this function.
@@ -210,14 +211,30 @@ def from_input_list_with_batch_size_jagged_tensor(
         left_inputs: List = []
         left_inputs_ind: List = []
         for ind, t in enumerate(inputs):
-            if t.shape[0] == tag_val:
+            batch_dim: int = t.shape[0]
+            batch_dim_lower_bound: int = 0
+            batch_dim_upper_bound: int = 0
+            batch_dim_name: str = ""
+            if batch_dim in jagged_tensor_batch_dims:
+                batch_dim_lower_bound = 0  # when all sequences are empty
+                batch_dim_upper_bound = max_batch_size * max_sequence_length
+                batch_dim_name = f"batch_size_jagged_tensor_{batch_dim}"
+            elif batch_dim in jagged_offsets_batch_dims:
+                batch_dim_lower_bound = 2  # prefix 0 + at least one offset
+                batch_dim_upper_bound = max_batch_size + 1
+                batch_dim_name = f"batch_size_jagged_offsets_{batch_dim}"
+
+            if batch_dim_upper_bound > 0:
                 shape: List[IntVar] = []
                 for i, d in enumerate(t.shape):
                     if i == 0:
                         shape.append(
                             IntVar(
-                                [1, max_batch_size_jagged_tensor],
-                                "batch_size_jagged_tensor",
+                                values=[
+                                    batch_dim_lower_bound,
+                                    batch_dim_upper_bound,
+                                ],
+                                name=batch_dim_name,
                             )
                         )
                     else:

From 2d9ec1895f9b709ed45ae8d803f0a11aeb436240 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Mon, 13 Mar 2023 16:25:55 -0700
Subject: [PATCH 260/638] Return link to softmax experiment log (#405)

Summary:
Looks like fb internal only link to softmax experiment log was accidentally removed recently in https://github.com/facebookincubator/AITemplate/pull/379

This PR returns the link to softmax experiment log

Related issue: https://github.com/facebookincubator/AITemplate/issues/350

tissue3  khabinov

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/405

Reviewed By: wushirong

Differential Revision: D44029434

Pulled By: khabinov

fbshipit-source-id: fca2ccb8cbb1f7abe1b23cfaf742b0b5efcec749
---
 python/aitemplate/backend/cuda/softmax/softmax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index e8fcd0dab..ad8d493ab 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -36,7 +36,7 @@
 #
 # For each K, whether to use wrapReduce or blockReduce was done by experiment
 # Please refer to this post: https://github.com/facebookincubator/AITemplate/wiki/How-to-write-a-fast-Softmax-CUDA-kernel%3F
-# and this experiment log [fb internal only]: https://github.com/facebookincubator/AITemplate/wiki/How-to-write-a-fast-Softmax-CUDA-kernel%3F
+# and this experiment log [fb internal only]: https://docs.google.com/spreadsheets/d/1bl3GCLQ67p27kXOSVJikEob38fojqaZIS--mPdQxeo0/edit#gid=931264442
 FUNC_TEMPLATE = jinja2.Template(
     """
 {{custom_libs}}

From cbe8f483a46f6c5afddd05e798df802a38dc96e5 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 14 Mar 2023 12:08:33 +0800
Subject: [PATCH 261/638] fix bugs

---
 examples/03_bert/benchmark_pt.py           |  4 +--
 python/aitemplate/frontend/nn/embedding.py |  2 --
 tests/unittest/backend/test_profiler.py    | 36 ++++++++--------------
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/examples/03_bert/benchmark_pt.py b/examples/03_bert/benchmark_pt.py
index 586df4fea..b64800da6 100644
--- a/examples/03_bert/benchmark_pt.py
+++ b/examples/03_bert/benchmark_pt.py
@@ -20,7 +20,7 @@
 
 
 def benchmark_pt(pretrained=True, batchsize=0):
-    bert = BertBaseUncased(pretrained)
+    bert = BertBaseUncased(pretrained=pretrained)
     model = bert._model
     model.eval()
 
@@ -70,7 +70,7 @@ def benchmark_pt(pretrained=True, batchsize=0):
 
 
 def benchmark_pt_encoders_only(pretrained=True, batchsize=0):
-    model = BertBaseUncased(pretrained)
+    model = BertBaseUncased(pretrained=pretrained)
     pt_bert = model._model
     pt_bert.eval()
 
diff --git a/python/aitemplate/frontend/nn/embedding.py b/python/aitemplate/frontend/nn/embedding.py
index 6814fc359..d77c12382 100644
--- a/python/aitemplate/frontend/nn/embedding.py
+++ b/python/aitemplate/frontend/nn/embedding.py
@@ -60,8 +60,6 @@ def __init__(
         dtype="float16",
     ):
         super().__init__()
-        if BertEmbeddings.USE_CUDA is None:
-            BertEmbeddings.USE_CUDA = detect_target().name() == "cuda"
         assert (
             hidden_dropout_prob == 0.0
         ), "Dropout rate larger than 0 is not supported yet."
diff --git a/tests/unittest/backend/test_profiler.py b/tests/unittest/backend/test_profiler.py
index ea308c3bf..3e2e6f3f3 100644
--- a/tests/unittest/backend/test_profiler.py
+++ b/tests/unittest/backend/test_profiler.py
@@ -19,8 +19,6 @@
 
 from aitemplate.backend.profiler_runner import ProfilerRunner
 
-from aitemplate.testing import detect_target
-
 
 def dice():
     return randrange(1, 10) / 4
@@ -53,29 +51,21 @@ def test_profiler_runner(self):
             "aitemplate.backend.profiler_runner.extract_profile_result"
         ) as mock_extract_profile_result:
             mock_extract_profile_result.return_value = ("", False)
-            with detect_target() as _:
-                pr = ProfilerRunner(
-                    devices=[str(i) for i in range(12)],
-                    timeout=60,
-                    postprocessing_delegate=Delegate(test_instance=self),
+            pr = ProfilerRunner(
+                devices=[str(i) for i in range(12)],
+                timeout=60,
+                postprocessing_delegate=Delegate(test_instance=self),
+            )
+
+            for i, _ in enumerate(pr._postprocessing_delegate.results):
+                sleep_for = 0
+                pr.push(
+                    cmds=["sleep", f"{sleep_for}"],
+                    process_result_callback=delegate_cb_wrapper(i, sleep_for),
                 )
 
-                for i, _ in enumerate(pr._postprocessing_delegate.results):
-                    sleep_for = 0
-                    pr.push(
-                        cmds=["sleep", f"{sleep_for}"],
-                        process_result_callback=delegate_cb_wrapper(i, sleep_for),
-                    )
-
-                    for i, _ in enumerate(pr._postprocessing_delegate.results):
-                        sleep_for = 0
-                        pr.push(
-                            cmds=["sleep", f"{sleep_for}"],
-                            process_result_callback=delegate_cb_wrapper(i, sleep_for),
-                        )
-
-                    pr.join()
+            pr.join()
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()
\ No newline at end of file

From 4625717034fa1be8bc95e55598902b2ea3ccee1f Mon Sep 17 00:00:00 2001
From: Zhijing Li <173666635@qq.com>
Date: Mon, 13 Mar 2023 22:00:32 -0700
Subject: [PATCH 262/638] Use nighly pytorch package to test fx2ait (#424)

Summary:
Use latest pytorch package to test splitter

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/424

Reviewed By: wushirong

Differential Revision: D44048105

Pulled By: tissue3

fbshipit-source-id: 2d9059ee9d318966c1298c7a4dba08b434241c5d
---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e1bb1ba96..ed0c8585e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -54,7 +54,7 @@ setup_fx2ait_env: &setup_fx2ait_env
           sudo cp cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include
           sudo cp -P cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64
           sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
-          python3.8 -m pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+          python3.8 -m pip install --ignore-installed --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
           python3.8 fx2ait/setup.py install --prefix=/home/circleci/
           echo 'export PYTHONPATH=$PWD/fx2ait:$PYTHONPATH' >> $BASH_ENV
           break || sleep 5;

From d19dc04c5d0418c94f032069f1f4ac86d25c86b4 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Mon, 13 Mar 2023 22:14:35 -0700
Subject: [PATCH 263/638] Fix softmax dimension raise logic. (#422)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/422

Fix some issues in D43968603. The judgement of dimension with 1 wasn't correct. For AITTensor, dimension can be IntVar and IntImm, and judgement code should change accordingly.

Reviewed By: frank-wei

Differential Revision: D44040761

fbshipit-source-id: 1cc56a1454621fa0c36995e7d8e23bb6fa6f6527
---
 fx2ait/fx2ait/converters/ait_converters.py     | 18 +++++++++++++++---
 .../fx2ait/test/converters/test_ait_softmax.py | 17 ++++++++---------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 861539581..e5b291603 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -427,10 +427,22 @@ def acc_ops_softmax(
     if dim < 0:
         dim = rank + dim
     if dim != rank - 1:
-        for i in range(rank, dim):
-            if input_val.shape()[i].value() != 1:
+        for i in range(dim + 1, rank):
+            unsupported = False
+            if isinstance(input_val.shape()[i], IntImm):
+                if input_val.shape()[i].value() != 1:
+                    unsupported = True
+            elif isinstance(input_val.shape()[i], IntVar):
+                unsupported = True
+            else:
                 raise RuntimeError(
-                    f"AIT softmax only supports dim=rank-1, got dim={dim}, rank={rank}"
+                    f"unknown dimension type={type(i)} in AITTensor={input_val}"
+                )
+
+            if unsupported:
+                raise ValueError(
+                    f"AIT softmax only supports dim=rank-1, got AITTensor={input_val}, "
+                    f"where dim={dim}, rank={rank}"
                 )
         reshape_dim = size()(input_val)[: dim + 1]
         reshape_val = reshape()(input_val, reshape_dim)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_softmax.py b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
index a9f96e047..d1171f852 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_softmax.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_softmax.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import unittest
 
 import torch
 from fx2ait.acc_tracer import acc_ops
@@ -83,7 +82,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             param("neg", dim=-3),
         ]
     )
-    @unittest.expectedFailure
     def test_softmax_expected_failure(self, name, dim=None):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -94,7 +92,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         inputs = [
             torch.randn(2, 3, 5, 2, 1).half().cuda(),
         ]
-        self.run_test(model, inputs, expected_ops={acc_ops.softmax})
+        with self.assertRaises(ValueError):
+            self.run_test(model, inputs, expected_ops={acc_ops.softmax})
 
     @parameterized.expand(
         [
@@ -102,7 +101,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             param("neg", dim=-3),
         ]
     )
-    @unittest.expectedFailure
     def test_softmax_expected_failure_dynamic(self, name, dim=None):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -121,8 +119,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 torch.float16,
             ],
         )
-        self.run_test_with_dynamic_shape(
-            model,
-            inputs_spec,
-            expected_ops={acc_ops.softmax},
-        )
+        with self.assertRaises(ValueError):
+            self.run_test_with_dynamic_shape(
+                model,
+                inputs_spec,
+                expected_ops={acc_ops.softmax},
+            )

From 4bb43dc9b7cdce94a735b16b06dce5eb0fb56903 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Mon, 13 Mar 2023 23:47:52 -0700
Subject: [PATCH 264/638] Add allow op supports to AIT splitter (#421)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/421

Currently splitter only rejects nodes, but doesn't have allowed node list. Then there are some special ops hard to adapt to general rules.

Example:
torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output should be supported after AIT adds it, but the second input of the op is offset, which is int64 type. But AIT in general only support float32 and float64 calculation.
There we just want to enable that op.

We added an allow_list to splitter so that if any op is in the allow_list, the splitter will split it into acc graph, rather than gpu graph.

Now, we can write
```
    class JaggedOperatorSupport(OperatorSupportBase):
        def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
            return node.op == "call_function" and node.target in [
                torch.ops.fbgemm.jagged_to_padded_dense,
                torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output,
            ]

    op_lowering_disallow_list = []
    op_support = create_ait_operator_support(
        op_lowering_disallow_list=op_lowering_disallow_list,
        allow_op_supports=[JaggedOperatorSupport()],
    )

```
and we will get the same splitter but with one specific op get excluded!

Reviewed By: wushirong

Differential Revision: D44036284

fbshipit-source-id: 877b4510848bbca55183d7b0ab0085e8662ff46c
---
 fx2ait/fx2ait/ait_splitter.py           |  8 +++-
 fx2ait/fx2ait/test/test_ait_splitter.py | 55 ++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index b96ddf3e6..886da0beb 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -67,7 +67,10 @@ def _any_supported(nodes: Sequence[torch.fx.Node]) -> bool:
 
 
 def create_ait_operator_support(
-    use_implicit_batch_dim=True, op_lowering_disallow_list=None, allow_int_inputs=False
+    use_implicit_batch_dim=True,
+    op_lowering_disallow_list=None,
+    allow_int_inputs=False,
+    allow_op_supports=None,
 ) -> ops.OperatorSupportBase:
     """Creates an `OperatorSupportBase` instance used for AIT splitting purpose."""
     # Create an `OperatorSupport` that declares a node supported if it
@@ -102,7 +105,8 @@ def create_ait_operator_support(
         # optimization.
         _decline_if_would_trigger_extra_copies(supported_if_converter_registered),
     ]
-
+    if allow_op_supports:
+        return ops.any_chain(ops.chain(*chained_not_supported_ops), *allow_op_supports)
     return ops.chain(*chained_not_supported_ops)
 
 
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
index 77e3955c3..5b010c3a3 100644
--- a/fx2ait/fx2ait/test/test_ait_splitter.py
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 import torch
-from fx2ait.acc_tracer import acc_tracer
+from fx2ait.acc_tracer import acc_ops, acc_tracer
 from fx2ait.ait_splitter import (  # @manual=//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait
     AITSplitter,
     AITSplitterSettings,
@@ -21,6 +21,7 @@
 )
 from fx2ait.tools.common_fx2ait import AITTestCase
 from torch.fx.passes import operator_support as op_support
+from torch.fx.passes.operator_support import OperatorSupportBase
 
 
 class TestSplit(AITTestCase):
@@ -180,3 +181,55 @@ def forward(self, a):
             dict(split_results_int_allowed.split_module.named_children()).keys(),
             {"_run_on_acc_0"},
         )
+
+    def test_accept_if_allow_op_support(self):
+        operator_support = create_ait_operator_support()
+
+        class TestModule(torch.nn.Module):
+            def forward(self, a):
+                b = torch.relu(a)
+                return b
+
+        test_mod = TestModule().cuda().eval()
+        x = torch.randn(2, 3)
+        settings = AITSplitterSettings()
+        settings.min_acc_module_size = 0
+
+        # nodes w/ int input should not be lowered
+        mod = acc_tracer.trace(test_mod, [x])
+        splitter = AITSplitter(
+            mod,
+            (x.int().cuda(),),
+            operator_support,
+            settings,
+        )
+        split_results_int = splitter.generate_split_results()
+        self.assertTrue(len(split_results_int), 1)
+        self.assertEqual(
+            dict(split_results_int.split_module.named_children()).keys(),
+            {"_run_on_gpu_0"},
+        )
+
+        class JaggedOperatorSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    acc_ops.relu,
+                ]
+
+        operator_support = create_ait_operator_support(
+            allow_op_supports=[JaggedOperatorSupport()]
+        )
+        # node relu should be lowered
+        mod = acc_tracer.trace(test_mod, [x])
+        splitter = AITSplitter(
+            mod,
+            (x.int().cuda(),),
+            operator_support,
+            settings,
+        )
+        split_results_relu_allowed = splitter.generate_split_results()
+        self.assertTrue(len(split_results_relu_allowed), 1)
+        self.assertEqual(
+            dict(split_results_relu_allowed.split_module.named_children()).keys(),
+            {"_run_on_acc_0"},
+        )

From c8f6a045c6e5c34b42a087c3b2649dd080330ed1 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 14 Mar 2023 16:54:47 +0800
Subject: [PATCH 265/638] revert some changes

---
 python/aitemplate/backend/profiler_runner.py  | 9 ++++-----
 python/aitemplate/compiler/ops/conv/conv2d.py | 9 +++++++--
 python/aitemplate/compiler/ops/conv/conv3d.py | 9 +++++++--
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index f73b67567..24f4b2d5e 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -125,14 +125,12 @@ def process_task(task: Task) -> None:
         if not single_file_profiler:
             task._failed = True
             return
-        cmd = task._cmd
-        if Target.current().name() == "rocm":
-            cmd = " ".join(cmd)
+
         _LOGGER.debug(
             "Failed: [{name}][{algo}]\ncmd:\n{cmd}\nstderr:\n{stderr}".format(
                 name=task._name,
                 algo=task._idx,
-                cmd=cmd,
+                cmd=task._cmd,
                 stderr=stderr,
             ),
         )
@@ -313,7 +311,8 @@ def callback_when_done(fut):
                     )
             finally:
                 # unblock one future in `join()`
-                self._done_queue.put(stdout)
+                if stdout is not None:
+                    self._done_queue.put(stdout)
 
         future.add_done_callback(callback_when_done)
         self._futures.append(future)
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 6b9e91b4f..c43197be0 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -503,8 +503,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             raise RuntimeError(
                 "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        out = min(result, key=lambda x: x[1].duration)
-        best_algo = out[0]
+        if target.name() == "rocm":
+            out = min(result, key=lambda x: x[1].duration)
+            best_algo = out[0]
+        else:
+            from operator import itemgetter
+            out = min(result, key=itemgetter(1))
+            best_algo = out[1].op_config
         workspace = out[1].workspace
         ## cache
         cache_record = ConvRecordEntry(
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 4d955d677..9899f71a6 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -510,8 +510,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             raise RuntimeError(
                 "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
             )
-        out = min(result, key=lambda x: x[1].duration)
-        best_algo = out[0]
+        if target.name() == "rocm":
+            out = min(result, key=lambda x: x[1].duration)
+            best_algo = out[0]
+        else:
+            from operator import itemgetter
+            out = min(result, key=itemgetter(1))
+            best_algo = out[1].op_config
         workspace = out[1].workspace
         ## cache
         cache_record = Conv3dRecordEntry(

From 718d1f9fb117cb987df44c4c294bb8cff62d959c Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 14 Mar 2023 10:06:08 -0700
Subject: [PATCH 266/638] restore attention on a100 (#406)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/406

apparently all attention kernels require sm80

Reviewed By: ipiszy, khabinov

Differential Revision: D43993362

fbshipit-source-id: 97738748036c0e7611ffc5915d5cb4e28fbd8e14
---
 tests/unittest/ops/test_attention.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 8af709eaf..64c37fa67 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -27,10 +27,7 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import benchmark_pt, detect_target
-from aitemplate.testing.test_utils import (
-    filter_test_cases_by_test_env,
-    get_random_torch_tensor,
-)
+from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 from einops import rearrange, repeat
@@ -147,6 +144,10 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class AttentionTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -605,7 +606,7 @@ def test_mem_eff_attention_invalid_head_size_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_mem_eff_attention_fp32_sm80(self):
+    def test_mem_eff_attention_fp32(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
                 use_perm=use_perm,
@@ -770,7 +771,7 @@ def test_cross_attention_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_cross_attention_fp32_sm80(self):
+    def test_cross_attention_fp32(self):
         self._test_cross_attention(
             test_name="cross_attention_fp32",
             dtype="float32",
@@ -804,8 +805,5 @@ def test_cross_attention_bf16(self):
         )
 
 
-filter_test_cases_by_test_env(AttentionTestCase)
-
-
 if __name__ == "__main__":
     unittest.main()

From a9aa7294c0bdf16dcd1b677ae2d4e6de967a94d8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 14 Mar 2023 10:06:08 -0700
Subject: [PATCH 267/638] restore test_bmm_permute on a100 (#409)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/409

Reviewed By: ipiszy, khabinov

Differential Revision: D43994056

fbshipit-source-id: 0ef893f135a214fec0785200edda24f1ce5581b6
---
 tests/unittest/ops/test_bmm_permute.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index 9ee1038e7..b2b0b91ea 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -21,7 +21,6 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -29,6 +28,10 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class BMMPermuteTestCase(unittest.TestCase):
     def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False, dtype="float16"):
         target = detect_target()
@@ -117,7 +120,7 @@ def test_rcr(self):
             )
             self._test_rcr([24], [80], N=96, K=0, d1=12, test_name="permute1_zero_k")
 
-    def test_bmm_permute_fp32_sm80(self):
+    def test_bmm_permute_fp32(self):
         self._test_rrr(
             [10], [8], N=88, K=64, d1=10, test_name="permute3_float", dtype="float"
         )
@@ -166,8 +169,6 @@ def test_bmm_permute_bf16(self):
         )
 
 
-filter_test_cases_by_test_env(BMMPermuteTestCase)
-
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()

From 1b866e55c9b25306a2aa43b031c7792af60933b8 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Tue, 14 Mar 2023 11:13:05 -0700
Subject: [PATCH 268/638] Move perm and split_getitem tests to V100 (#398)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/398

Split the tests between A100 and V100 to avoid timeouts.

Reviewed By: alexanderguzhva

Differential Revision: D43975487

fbshipit-source-id: 251b2b1c227e0a65b1276e298d9555341e7e318a
---
 tests/unittest/ops/test_perm021fc_crc_bias.py |  40 +++----
 tests/unittest/ops/test_perm102_bmm_rcr.py    |  34 +++---
 tests/unittest/ops/test_perm102_bmm_rrr.py    |  28 +++--
 tests/unittest/ops/test_split_getitem.py      | 103 +++++++-----------
 4 files changed, 99 insertions(+), 106 deletions(-)

diff --git a/tests/unittest/ops/test_perm021fc_crc_bias.py b/tests/unittest/ops/test_perm021fc_crc_bias.py
index 8c9e719e6..48c67b878 100644
--- a/tests/unittest/ops/test_perm021fc_crc_bias.py
+++ b/tests/unittest/ops/test_perm021fc_crc_bias.py
@@ -25,7 +25,13 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -81,32 +87,18 @@ def _test_perm021fc_crc_bias(
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_perm021fc_crc_bias_fp16(self):
-        self._test_perm021fc_crc_bias(
-            test_name="perm021fc_crc_bias_fp16",
-            dtype="float16",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_crc_bias_fp32(self):
-        self._test_perm021fc_crc_bias(
-            test_name="perm021fc_crc_bias_fp32",
-            dtype="float32",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"bf16 BMM not supported in {detect_target()._arch}",
     )
-    def test_perm021fc_crc_bias_bf16(self):
+    def test_perm021fc_crc_bias(self, dtype):
         self._test_perm021fc_crc_bias(
-            test_name="perm021fc_crc_bias_bf16",
-            dtype="bfloat16",
+            test_name=f"perm021fc_crc_bias_{dtype}",
+            dtype=dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_perm102_bmm_rcr.py b/tests/unittest/ops/test_perm102_bmm_rcr.py
index 62f143035..16c0fd752 100644
--- a/tests/unittest/ops/test_perm102_bmm_rcr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rcr.py
@@ -28,21 +28,25 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
 from parameterized import parameterized
 
 
-def cuda_skip_condition(dtype, arch):
-    return dtype != "float16" and int(arch) < 80
-
-
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_TestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
+        )
+    )
     def test_perm102_bmm_rrr(self, dtype):
-        arch_ = detect_target()._arch
-        if cuda_skip_condition(dtype, arch_):
-            self.skipTest(f"BMM with float32 inputs not supported on CUDA SM{arch_}")
         B = 25
         M = 128
         K = 256
@@ -70,11 +74,15 @@ def test_perm102_bmm_rrr(self, dtype):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_BiasTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
+        )
+    )
     def test_perm102_bmm_rrr_bias(self, dtype):
-        arch_ = detect_target()._arch
-        if cuda_skip_condition(dtype, arch_):
-            self.skipTest(f"BMM with float32 inputs not supported on CUDA SM{arch_}")
         B = 25
         M = 128
         K = 256
diff --git a/tests/unittest/ops/test_perm102_bmm_rrr.py b/tests/unittest/ops/test_perm102_bmm_rrr.py
index 026e5333a..ba12ed1d1 100644
--- a/tests/unittest/ops/test_perm102_bmm_rrr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rrr.py
@@ -28,16 +28,25 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
 from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
+        )
+    )
     def test_perm102_bmm_rrr(self, dtype="float16"):
-        if dtype != "float16" and int(detect_target()._arch) < 80:
-            self.skipTest(f"{dtype} BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256
@@ -65,10 +74,15 @@ def test_perm102_bmm_rrr(self, dtype="float16"):
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMBiasTestCase(unittest.TestCase):
-    @parameterized.expand([("float16"), ("float32"), ("bfloat16")])
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+            }
+        )
+    )
     def test_perm102_bmm_rrr_bias(self, dtype="float16"):
-        if dtype != "float16" and int(detect_target()._arch) < 80:
-            self.skipTest(f"{dtype} BMM not supported in {detect_target()._arch}")
         B = 25
         M = 128
         K = 256
diff --git a/tests/unittest/ops/test_split_getitem.py b/tests/unittest/ops/test_split_getitem.py
index 770f1e273..5068d0112 100644
--- a/tests/unittest/ops/test_split_getitem.py
+++ b/tests/unittest/ops/test_split_getitem.py
@@ -20,8 +20,14 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
 from aitemplate.utils import shape_utils
+from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -87,30 +93,18 @@ def _test_split_getitem(
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_getitem_fp16(self):
-        self._test_split_getitem(
-            test_name="split_getitem_fp16",
-            dtype="float16",
-        )
-        self._test_split_getitem(
-            batch_size=[5],
-            X_shape=(16, 32),
-            split_sections=[8, 20, 4],
-            split_dim=2,
-            item_idx=1,
-            test_name="split_getitem_fp16",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
     )
-    def test_split_getitem_fp32(self):
+    def test_split_getitem(self, dtype):
         self._test_split_getitem(
-            test_name="split_getitem_fp32",
-            dtype="float32",
+            test_name=f"split_getitem_{dtype}",
+            dtype=dtype,
         )
         self._test_split_getitem(
             batch_size=[5],
@@ -118,8 +112,8 @@ def test_split_getitem_fp32(self):
             split_sections=[8, 20, 4],
             split_dim=2,
             item_idx=1,
-            test_name="split_getitem_fp32",
-            dtype="float32",
+            test_name=f"split_getitem_{dtype}",
+            dtype=dtype,
         )
 
     def _test_split_getitem_output(
@@ -161,26 +155,18 @@ def _test_split_getitem_output(
             module.run_with_tensors([X_pt], [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_getitem_output_fp16(self):
-        self._test_split_getitem_output(
-            test_name="split_getitem_output",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
+    )
+    def test_split_getitem_output(self, dtype):
         self._test_split_getitem_output(
-            batch_size=[10],
-            X_shape=(16, 31),
-            split_sections=[9, 19, 3],
-            split_dim=2,
-            item_idx=1,
             test_name="split_getitem_output",
-            dtype="float16",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_split_getitem_output_fp32(self):
-        self._test_split_getitem_output(
-            test_name="split_getitem_output_fp32",
-            dtype="float32",
+            dtype=dtype,
         )
         self._test_split_getitem_output(
             batch_size=[10],
@@ -188,8 +174,8 @@ def test_split_getitem_output_fp32(self):
             split_sections=[9, 19, 3],
             split_dim=2,
             item_idx=1,
-            test_name="split_getitem_output_fp32",
-            dtype="float32",
+            test_name="split_getitem_output",
+            dtype=dtype,
         )
 
     def _test_split_multiple_getitems(
@@ -259,33 +245,26 @@ def _test_split_multiple_getitems(
             module.run_with_tensors({"input_0": X_pt, "input_2": X2_pt}, [y])
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_split_mutiple_getitems_fp16(self):
-        self._test_split_multiple_getitems(
-            test_name="split_multiple_getitems_fp16",
-            dtype="float16",
-        )
-        self._test_split_multiple_getitems(
-            batch_size=[10],
-            X_shape=(16, 31),
-            split_sections=[9, 9, 13],
-            split_dim=2,
-            test_name="split_multiple_getitems_fp16",
-            dtype="float16",
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_split_mutiple_getitems_fp32(self):
+    )
+    def test_split_mutiple_getitems(self, dtype):
         self._test_split_multiple_getitems(
-            test_name="split_multiple_getitems_fp32",
-            dtype="float32",
+            test_name=f"split_multiple_getitems_{dtype}",
+            dtype=dtype,
         )
         self._test_split_multiple_getitems(
             batch_size=[10],
             X_shape=(16, 31),
             split_sections=[9, 9, 13],
             split_dim=2,
-            test_name="split_multiple_getitems_fp32",
-            dtype="float32",
+            test_name=f"split_multiple_getitems_{dtype}",
+            dtype=dtype,
         )
 
 
From dbda11b497f68de70b85c2c00cf1146da0ea7048 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 14 Mar 2023 15:17:06 -0700
Subject: [PATCH 269/638] Fix types info in compiler/model.py (#425)

Summary:
Fixes in `compiler/model.py`
- Move Data Classes on top in order to use their types in function signatures.
- fix wrong type info in some functions.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/425

Reviewed By: alexanderguzhva

Differential Revision: D44070874

Pulled By: khabinov

fbshipit-source-id: db404483e7bb3e5a056bc6ee17ec0f06b4b37965
---
 python/aitemplate/compiler/model.py | 90 ++++++++++++++---------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 6ee7fce41..420a567c8 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -39,6 +39,43 @@
 TorchTensor = TypeVar("TorchTensor")
 
 
+class AITemplateMemcpyKind(enum.Enum):
+    HostToDevice = 0
+    DeviceToHost = 1
+    DeviceToDevice = 2
+
+
+class AITemplateAllocatorKind(enum.Enum):
+    DEFAULT = 0
+    TRACKING = 1
+
+
+class AITData(NamedTuple):
+    """
+    Input or output tensor for Model.run. We require the extra data for safety
+    checks inside the runtime.
+    """
+
+    data_ptr: int
+    shape: List[int]
+    dtype: str
+
+
+class _AITemplateShape(ctypes.Structure):
+    _fields_ = [
+        ("shape_data", ctypes.POINTER(ctypes.c_longlong)),
+        ("size", ctypes.c_size_t),
+    ]
+
+
+class _CFormatAITData(ctypes.Structure):
+    _fields_ = [
+        ("pointer", ctypes.c_void_p),
+        ("shape", _AITemplateShape),
+        ("dtype", ctypes.c_int),
+    ]
+
+
 def _dlclose(dll: ctypes.CDLL):
     syms = ctypes.CDLL(None)
     if hasattr(syms, "dlclose"):
@@ -84,7 +121,7 @@ def is_bad_tensor(tensor: TorchTensor) -> bool:
     _check_tensors(tensors, is_bad_tensor, name, "contiguous and on host")
 
 
-def torch_to_ait_data(tensor):
+def torch_to_ait_data(tensor: TorchTensor) -> AITData:
     """
     Convert a torch Tensor to a AITData.
     """
@@ -93,7 +130,7 @@ def torch_to_ait_data(tensor):
     )
 
 
-def _convert_tensor_args(params):
+def _convert_tensor_args(params: Union[List[TorchTensor], Dict[str, TorchTensor]]):
     """
     Helper function for the WithTensors APIs.
     """
@@ -117,43 +154,6 @@ def _reshape_tensor(tensor: TorchTensor, shape: List[int]) -> TorchTensor:
     return new_tensor.reshape(shape)
 
 
-class AITemplateMemcpyKind(enum.Enum):
-    HostToDevice = 0
-    DeviceToHost = 1
-    DeviceToDevice = 2
-
-
-class AITemplateAllocatorKind(enum.Enum):
-    DEFAULT = 0
-    TRACKING = 1
-
-
-class AITData(NamedTuple):
-    """
-    Input or output tensor for Model.run. We require the extra data for safety
-    checks inside the runtime.
-    """
-
-    data_ptr: int
-    shape: List[int]
-    dtype: str
-
-
-class _AITemplateShape(ctypes.Structure):
-    _fields_ = [
-        ("shape_data", ctypes.POINTER(ctypes.c_longlong)),
-        ("size", ctypes.c_size_t),
-    ]
-
-
-class _CFormatAITData(ctypes.Structure):
-    _fields_ = [
-        ("pointer", ctypes.c_void_p),
-        ("shape", _AITemplateShape),
-        ("dtype", ctypes.c_int),
-    ]
-
-
 class Model(object):
     class _DLLWrapper:
         def __init__(
@@ -328,7 +328,7 @@ def _dict_to_ordered_list(self, params, is_inputs):
                 f"Did not get correct number of {'inputs' if is_inputs else 'outputs'} expected {len(index_map)}, got {len(params)}"
             )
 
-        result = [None for i in range(len(index_map))]
+        result = [None] * len(index_map)
         for name, tensor in params.items():
             if name not in index_map:
                 raise ValueError(
@@ -341,7 +341,7 @@ def _dict_to_ordered_list(self, params, is_inputs):
 
     def _make_ait_outputs(
         self, outputs: List[AITData], c_output_shapes
-    ) -> Dict[str, List[int]]:
+    ) -> Dict[str, AITData]:
         output_shapes = []
         for i, c_shape in enumerate(c_output_shapes):
             shape = []
@@ -545,7 +545,7 @@ def run_with_tensors(
     def _run_with_outputs_on_host(
         self,
         inputs: Union[Dict[str, AITData], List[AITData]],
-        outputs: Union[Dict[str, int], List[int]],
+        outputs: Union[Dict[str, AITData], List[AITData]],
         stream_ptr: Optional[int] = None,
         graph_mode: bool = False,
     ) -> Dict[str, AITData]:
@@ -592,7 +592,7 @@ def _run_with_tensors_outputs_on_host(
     def benchmark(
         self,
         inputs: Union[Dict[str, AITData], List[AITData]],
-        outputs: Union[Dict[str, int], List[int]],
+        outputs: Union[Dict[str, AITData], List[AITData]],
         stream_ptr: Optional[int] = None,
         graph_mode: bool = False,
         count: int = 10,
@@ -799,7 +799,7 @@ def set_many_double_buffer_constants(
             self.handle, ctypes.c_void_p(stream_ptr), c_names, c_tensors, num_tensors
         )
 
-    def set_many_constants_with_tensors(self, tensors: Dict[str, AITData]):
+    def set_many_constants_with_tensors(self, tensors: Dict[str, TorchTensor]):
         ait_tensors = {}
         for name, tensor in tensors.items():
             if not tensor.is_contiguous() or not tensor.is_cuda:
@@ -822,7 +822,7 @@ def set_double_buffer_constant_with_tensor(
         self.set_double_buffer_constant(name, torch_to_ait_data(tensor), stream_ptr)
 
     def set_many_double_buffer_constants_with_tensors(
-        self, tensors: Dict[str, AITData], stream_ptr: Optional[int] = None
+        self, tensors: Dict[str, TorchTensor], stream_ptr: Optional[int] = None
     ):
         ait_tensors = {}
         for name, tensor in tensors.items():

From a1d12d2846b96a05983a0e772a058084afe73429 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Tue, 14 Mar 2023 18:33:50 -0700
Subject: [PATCH 270/638] Add tensor accessor to permute021 (#348)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/348

Reviewed By: chenyang78

Differential Revision: D43682176

fbshipit-source-id: b974cc558ef517a53e3f37724b1a74ecf40bc67d
---
 .../common/tensor/permute021_common.py        |  40 ++++--
 .../compiler/ops/tensor/permute021.py         |   2 +
 .../transform/transform_strided_slice.py      |   4 +-
 .../compiler/test_parallel_gemm_fusions.py    |   6 +-
 .../compiler/test_slice_permute021_fusion.py  | 127 ++++++++++++++++++
 tests/unittest/ops/test_permute021.py         |  44 +++---
 6 files changed, 186 insertions(+), 37 deletions(-)
 create mode 100644 tests/unittest/compiler/test_slice_permute021_fusion.py

diff --git a/python/aitemplate/backend/common/tensor/permute021_common.py b/python/aitemplate/backend/common/tensor/permute021_common.py
index 7850e443c..b2dc5bfdb 100644
--- a/python/aitemplate/backend/common/tensor/permute021_common.py
+++ b/python/aitemplate/backend/common/tensor/permute021_common.py
@@ -24,6 +24,7 @@
 from typing import Any, Dict
 
 import jinja2
+from aitemplate.backend.common import tensor_accessor_codegen
 
 # pylint: disable=C0301,W0613,W0612
 
@@ -57,11 +58,13 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
+{{input_accessor_def}}
 {{indent}}permute021_launcher(
 {{indent}}    in_ptr,
 {{indent}}    out_ptr,
 {{indent}}    rank,
 {{indent}}    x_dims,
+{{indent}}    input_accessor,
 {{indent}}    stream
 {{indent}});
 {{indent}}return;
@@ -78,13 +81,17 @@
 #define CH_K 4
 
 namespace {
+
+{{tensor_accessor_libs}}
+
 template <typename T>
 __global__ void permute021_kernel(T *output,
                                   const T *input,
                                   const int64_t n,
                                   const int32_t h,
                                   const int32_t w,
-                                  const int32_t c) {
+                                  const int32_t c,
+                                  TensorAccessor input_accessor) {
 
   const int32_t hw = h * w;
   const int32_t hwc = hw * c;
@@ -98,24 +105,26 @@
   const int32_t hwi0 = blockIdx.y * TILE_SIZE;
   const int32_t ci0  = blockIdx.x * TILE_SIZE;
 
-  const size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0;
-  const T *A = input + input_idx;
+  size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0;
+
+  const T *A = input_accessor.get<const T, const T>(input, input_idx);
+
   if (ci0 + lid < c) {
     const int lid_x_33 = lid * (TILE_SIZE + 1);
     if ((hwi0 + TILE_SIZE) <= hw) {
       int hwi = wid;  // between 0 and 7
       #pragma unroll
       for (int cLoopIdx = 0; cLoopIdx < CH_K; cLoopIdx++) {
-        shbuf[lid_x_33 + hwi] = A[lid];
-        A                     = &A[TILE_SIZE / CH_K * c];
+        shbuf[lid_x_33 + hwi] = *input_accessor.get<const T, const T>(input, input_idx + lid);
+        input_idx += TILE_SIZE / CH_K * c;
         hwi += TILE_SIZE / CH_K;
       }
     } else {
       for (int hwi = wid; hwi < TILE_SIZE; hwi += TILE_SIZE / CH_K) {
         if (hwi + hwi0 < hw) {
-          shbuf[lid_x_33 + hwi] = A[lid];
+          shbuf[lid_x_33 + hwi] = *input_accessor.get<const T, const T>(input, input_idx + lid);
         }
-        A = &A[TILE_SIZE / CH_K * c];
+        input_idx += TILE_SIZE / CH_K * c;
       }
     }
   }
@@ -145,6 +154,7 @@
                          void* out_ptr,
                          int64_t rank,
                          const int64_t* x_dims,
+                         TensorAccessor input_accessor,
                          {{prefix}}Stream_t stream) {
   int64_t x_dim0 = 1;
   for (int i = 0; i < rank - 2; i++) {
@@ -174,7 +184,8 @@
     n,
     h,
     w,
-    c
+    c,
+    input_accessor
   );
 }
 } // namespace
@@ -223,14 +234,22 @@ def gen_function(
     """
     func_name = func_attrs["name"]
     x = func_attrs["inputs"][0]
+    tensor_accessor = func_attrs["input_accessors"][0]
     xdtype = x._attrs["dtype"]
-    exec_paths = EXEC_TEMPLATE.render()
+    tensor_accessor_libs = tensor_accessor_codegen.get_libs()
+    input_accessor_name = "input_accessor"
+    input_accessor = tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render(
+        name=input_accessor_name, tensor_accessor=tensor_accessor
+    )
+    exec_paths = EXEC_TEMPLATE.render(input_accessor_def=input_accessor)
+
     return SRC_TEMPLATE.render(
         function_name=func_name,
         exec_paths=exec_paths,
         header_files=header_files,
         lib_dtype=backend_spec.dtype_to_lib_type(xdtype),
         prefix=backend_spec.prefix,
+        tensor_accessor_libs=tensor_accessor_libs,
     )
 
 
@@ -281,7 +300,8 @@ def gen_function_call(
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
 
-    xshape = x._attrs["shape"]
+    input_accessor = func_attrs["input_accessors"][0]
+    xshape = input_accessor.original_shapes
     x_dims = [dim._attrs["name"] for dim in xshape]
 
     return FUNC_CALL_TEMPLATE.render(
diff --git a/python/aitemplate/compiler/ops/tensor/permute021.py b/python/aitemplate/compiler/ops/tensor/permute021.py
index 241823985..d775db8bc 100644
--- a/python/aitemplate/compiler/ops/tensor/permute021.py
+++ b/python/aitemplate/compiler/ops/tensor/permute021.py
@@ -20,6 +20,7 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0221
 
@@ -61,6 +62,7 @@ def __call__(self, x: Tensor) -> Tensor:
         assert len(x.shape()) > 2, "The input tensor must have at least 3 dimensions"
 
         self._attrs["inputs"] = [x]
+        self._attrs["input_accessors"] = [TensorAccessor(x)]
         self._set_depth()
         output_shape = self._infer_shapes(x)
         output = Tensor(output_shape, src_ops={self})
diff --git a/python/aitemplate/compiler/transform/transform_strided_slice.py b/python/aitemplate/compiler/transform/transform_strided_slice.py
index cb4996bfe..b1a2c888b 100644
--- a/python/aitemplate/compiler/transform/transform_strided_slice.py
+++ b/python/aitemplate/compiler/transform/transform_strided_slice.py
@@ -65,7 +65,7 @@ def _is_supported_op(op: Operator, slice_op: Operator) -> bool:
         return _is_supported_gemm(op, slice_op)
     if op_type == "concatenate":
         return _sanity_check_concatenate(op, slice_op)
-    if op_type == "fused_elementwise":
+    if op_type == "fused_elementwise" or op_type == "permute021":
         return True
     if op_type.startswith("layernorm") or op_type.startswith("group_layernorm"):
         return True
@@ -95,7 +95,7 @@ def _valid_alignment(
 ) -> bool:
     op_type = op._attrs["op"]
     if (
-        op_type in ("fused_elementwise", "concatenate")
+        op_type in ("fused_elementwise", "concatenate", "permute021")
         or op._attrs["op"].startswith("layernorm")
         or op._attrs["op"].startswith("group_layernorm")
     ):
diff --git a/tests/unittest/compiler/test_parallel_gemm_fusions.py b/tests/unittest/compiler/test_parallel_gemm_fusions.py
index 4eba28ea6..f5498b482 100644
--- a/tests/unittest/compiler/test_parallel_gemm_fusions.py
+++ b/tests/unittest/compiler/test_parallel_gemm_fusions.py
@@ -280,7 +280,7 @@ def _fuse_parallel_gemm_cat(
 
                 # Do comparisons.
                 self.assertTrue(
-                    torch.allclose(out, cat_output_pt, atol=1e-2, rtol=1e-2)
+                    torch.allclose(out, cat_output_pt, atol=5e-2, rtol=5e-2)
                 )
 
     def test_fuse_parallel_gemm_cat_fp16(self):
@@ -560,7 +560,7 @@ def _test_fuse_parallel_gemm_cat_partial(
 
                 # Do comparisons.
                 self.assertTrue(
-                    torch.allclose(out, cat_output_pt, atol=1e-2, rtol=1e-2)
+                    torch.allclose(out, cat_output_pt, atol=5e-2, rtol=5e-2)
                 )
 
     def test_fuse_parallel_gemm_cat_partial_fp16(self):
@@ -602,7 +602,7 @@ def _test_multi_parallel_gemm_cat_groups(
                 ys.append(y)
             pt_y = torch.cat(ys, dim=-1)
             module.run_with_tensors(inputs, outputs)
-            self.assertTrue(torch.allclose(pt_y, outputs[0], atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.allclose(pt_y, outputs[0], atol=5e-2, rtol=5e-2))
 
     def test_multi_parallel_gemm_cat_groups_fp16(self):
         self._test_multi_parallel_gemm_cat_groups(
diff --git a/tests/unittest/compiler/test_slice_permute021_fusion.py b/tests/unittest/compiler/test_slice_permute021_fusion.py
new file mode 100644
index 000000000..6a97f5c5d
--- /dev/null
+++ b/tests/unittest/compiler/test_slice_permute021_fusion.py
@@ -0,0 +1,127 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class SlicePermute021FusionTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SlicePermute021FusionTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_slice_permute021_fusion(
+        self,
+        N,
+        K,
+        slice_input_shape,
+        slice_start_indices,
+        slice_end_indices,
+        dims,
+        test_name,
+        dtype="float16",
+    ):
+        X = Tensor(
+            shape=slice_input_shape,
+            dtype=dtype,
+            name="input_x",
+            is_input=True,
+        )
+        slice_op = ops.dynamic_slice()
+        tensor_A = slice_op(
+            X, start_indices=slice_start_indices, end_indices=slice_end_indices
+        )
+        tensor_A._attrs["name"] = "slice_output"
+
+        permute_op = ops.permute021()
+        Y = permute_op(tensor_A)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+        target = detect_target()
+        with compile_model(
+            Y,
+            target,
+            "./tmp",
+            f"{test_name}_{self._test_id}",
+            dll_name=f"test_{self._test_id}.so",
+        ) as module:
+            self._test_id += 1
+
+            # Verify the generated graph.
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), 2)
+            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+            self.assertEqual(len(sorted_ops), 1)
+
+            # Run PyTorch
+            input_pt = get_random_torch_tensor(slice_input_shape, dtype)
+
+            slice_indices = [
+                slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
+            ]
+            a_pt = input_pt[slice_indices]
+            y_pt = torch.permute(a_pt, dims)
+
+            # Run AITemplate module.
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            module.run_with_tensors([input_pt], [y])
+            self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+
+    def test_slice_permute021_fusion(self):
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=(2, 2, 8),
+            slice_start_indices=(0, 0, 4),
+            slice_end_indices=(2, 2, 8),
+            dims=(0, 2, 1),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=(2, 3, 8, 62),
+            slice_start_indices=(0, 0, 0, 2),
+            slice_end_indices=(2, 3, 8, 50),
+            dims=(0, 1, 3, 2),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=(2, 3, 4, 4, 8),
+            slice_start_indices=(0, 0, 0, 0, 0),
+            slice_end_indices=(2, 3, 4, 4, 2),
+            dims=(0, 1, 2, 4, 3),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_permute021.py b/tests/unittest/ops/test_permute021.py
index 8fe7c56b7..df7b2be39 100644
--- a/tests/unittest/ops/test_permute021.py
+++ b/tests/unittest/ops/test_permute021.py
@@ -46,7 +46,7 @@ def _test_permute_021(
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
         target = detect_target()
-        module = compile_model(Y, target, "./tmp", f"perm021_{self._test_id}")
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
         self._test_id += 1
 
         batch_dim = input_shape[0]
@@ -64,54 +64,54 @@ def _test_permute_021(
 
     @parameterized.expand(
         [
-            param((2, 384, 262), (0, 2, 1)),
-            param((2, 3, 384, 262), (0, 1, 3, 2)),
-            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
-            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
-            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+            param(1, (2, 384, 262), (0, 2, 1)),
+            param(2, (2, 3, 384, 262), (0, 1, 3, 2)),
+            param(3, (2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param(4, (IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param(5, (IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
         ]
     )
-    def test_permute021_fp16(self, input_shape, dims):
+    def test_permute021_fp16(self, id, input_shape, dims):
         self._test_permute_021(
             input_shape=input_shape,
             dims=dims,
-            test_name="permute021_fp16",
+            test_name=f"permute021_fp16_{id}",
             dtype="float16",
         )
 
     @parameterized.expand(
         [
-            param((2, 384, 262), (0, 2, 1)),
-            param((2, 3, 384, 262), (0, 1, 3, 2)),
-            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
-            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
-            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+            param(1, (2, 384, 262), (0, 2, 1)),
+            param(2, (2, 3, 384, 262), (0, 1, 3, 2)),
+            param(3, (2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param(4, (IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param(5, (IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
         ]
     )
     @unittest.skipIf(detect_target().name() == "rocm", "FP32 is not supported on ROCm")
-    def test_permute021_fp32(self, input_shape, dims):
+    def test_permute021_fp32(self, id, input_shape, dims):
         self._test_permute_021(
             input_shape=input_shape,
             dims=dims,
-            test_name="permute021_fp32",
+            test_name=f"permute021_fp32_{id}",
             dtype="float32",
         )
 
     @parameterized.expand(
         [
-            param((2, 384, 262), (0, 2, 1)),
-            param((2, 3, 384, 262), (0, 1, 3, 2)),
-            param((2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
-            param((IntVar([2, 3]), 384, 262), (0, 2, 1)),
-            param((IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
+            param(1, (2, 384, 262), (0, 2, 1)),
+            param(2, (2, 3, 384, 262), (0, 1, 3, 2)),
+            param(3, (2, 3, 4, 384, 262), (0, 1, 2, 4, 3)),
+            param(4, (IntVar([2, 3]), 384, 262), (0, 2, 1)),
+            param(5, (IntVar([2, 3, 4]), 5, 384, 262), (0, 1, 3, 2)),
         ]
     )
     @unittest.skipIf(detect_target().name() == "rocm", "bf16 is not supported on ROCm")
-    def test_permute021_bf16(self, input_shape, dims):
+    def test_permute021_bf16(self, id, input_shape, dims):
         self._test_permute_021(
             input_shape=input_shape,
             dims=dims,
-            test_name="permute021_bf16",
+            test_name=f"permute021_bf16_{id}",
             dtype="bfloat16",
         )
 

From 6349654fa0cd45db6773b5fe2baa417c1080006b Mon Sep 17 00:00:00 2001
From: Dhawal Patel <dhawalkp@amazon.com>
Date: Tue, 14 Mar 2023 21:13:30 -0700
Subject: [PATCH 271/638] Adding a configurable directory for saving compiled
 model artifacts (#414)

Summary:
Issue: https://github.com/facebookincubator/AITemplate/issues/396
StableDiffusionAITPipeline and StableDiffusionImg2ImgAITPipeline in pipeline_stable_diffusion_ait.py and [pipeline_stable_diffusion_img2img_ait.py has workdir hardcoded to "tmp/" which might create problems where the file system locations are either read-only or users have restricted permissions in the current working directory. Examples include managed ML services in Cloud that might have file permission restrictions.

Desired behavior: User should be given flexibility to specify the custom location as workdir where AITemplate .so files are downloaded.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/414

Reviewed By: ipiszy

Differential Revision: D44036462

Pulled By: khabinov

fbshipit-source-id: c9379d0c53c6be5dc2ef2605a37cf3f836460a0b
---
 .../05_stable_diffusion/scripts/compile.py    |  10 +-
 examples/05_stable_diffusion/scripts/demo.py  |  11 +-
 .../scripts/demo_img2img.py                   |  10 +-
 .../scripts/download_pipeline.py              |  15 ++-
 examples/05_stable_diffusion/src/benchmark.py |  22 +++-
 .../src/compile_lib/compile_clip.py           |   7 +-
 .../src/compile_lib/compile_unet.py           |   7 +-
 .../src/compile_lib/compile_vae.py            |   8 +-
 .../src/compile_lib/util.py                   | 111 ++++++++++++++++++
 .../src/pipeline_stable_diffusion_ait.py      |   6 +-
 .../pipeline_stable_diffusion_img2img_ait.py  |   4 +-
 11 files changed, 174 insertions(+), 37 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 0018dafda..d946744d0 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -26,25 +26,23 @@
 from src.compile_lib.compile_clip import compile_clip
 from src.compile_lib.compile_unet import compile_unet
 from src.compile_lib.compile_vae import compile_vae
+from src.compile_lib.util import get_work_dir_location_diffusers
 
 
 @click.command()
-@click.option(
-    "--local-dir",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="the local diffusers pipeline directory",
-)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option("--batch-size", default=1, help="batch size")
 @click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
 @click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
 def compile_diffusers(
-    local_dir, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
+    width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
 ):
     logging.getLogger().setLevel(logging.INFO)
     torch.manual_seed(4896)
 
+    local_dir = get_work_dir_location_diffusers()
+
     if detect_target().name() == "rocm":
         convert_conv_to_gemm = False
 
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index d4f5dbb99..a5a92ad0c 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -23,22 +23,21 @@
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
 
+from src.compile_lib.util import get_work_dir_location_diffusers
 from src.pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
 
 
 @click.command()
-@click.option(
-    "--local-dir",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="the local diffusers pipeline directory",
-)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, prompt, benchmark):
+def run(width, height, prompt, benchmark):
+
+    local_dir = get_work_dir_location_diffusers()
+
     pipe = StableDiffusionAITPipeline.from_pretrained(
         local_dir,
         scheduler=EulerDiscreteScheduler.from_pretrained(
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index e4d96d865..e640beb3e 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -25,15 +25,11 @@
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
 
+from src.compile_lib.util import get_work_dir_location_diffusers
 from src.pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
 
 
 @click.command()
-@click.option(
-    "--local-dir",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="the local diffusers pipeline directory",
-)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option(
@@ -42,7 +38,9 @@
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, prompt, benchmark):
+def run(width, height, prompt, benchmark):
+
+    local_dir = get_work_dir_location_diffusers()
 
     # load the pipeline
     device = "cuda"
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 1128769da..6120fa8df 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -14,17 +14,20 @@
 #
 import click
 import torch
+from aitemplate.utils.import_path import import_parent
 from diffusers import StableDiffusionPipeline
 
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+from src.compile_lib.util import get_work_dir_location_diffusers
+
 
 @click.command()
 @click.option("--token", default="", help="access token")
-@click.option(
-    "--save_directory",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="pipeline files local directory",
-)
-def download_pipeline_files(token, save_directory) -> None:
+def download_pipeline_files(token) -> None:
+
+    save_directory = get_work_dir_location_diffusers()
+
     StableDiffusionPipeline.from_pretrained(
         "stabilityai/stable-diffusion-2-1-base",
         revision="fp16",
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index 5cac6a465..a08a00c12 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -22,11 +22,18 @@
 from aitemplate.compiler import Model
 from aitemplate.testing import detect_target
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from compile_lib.util import (
+    get_file_location_autoencoder,
+    get_file_location_clip,
+    get_file_location_unet,
+    get_work_dir_location_diffusers,
+)
 from diffusers import StableDiffusionPipeline
 
 from torch import autocast
 from transformers import CLIPTokenizer
 
+
 USE_CUDA = detect_target().name() == "cuda"
 
 
@@ -56,7 +63,9 @@ def benchmark_unet(
     verify=False,
 ):
 
-    exe_module = Model("./tmp/UNet2DConditionModel/test.so")
+    file_name = get_file_location_unet()
+
+    exe_module = Model(file_name)
     if exe_module is None:
         print("Error!! Cannot find compiled module for UNet2DConditionModel.")
         exit(-1)
@@ -131,7 +140,10 @@ def benchmark_clip(
 ):
     mask_seq = 0
 
-    exe_module = Model("./tmp/CLIPTextModel/test.so")
+    file_name = get_file_location_clip()
+
+    exe_module = Model(file_name)
+
     if exe_module is None:
         print("Error!! Cannot find compiled module for CLIPTextModel.")
         exit(-1)
@@ -205,7 +217,9 @@ def benchmark_vae(
 
     latent_channels = 4
 
-    exe_module = Model("./tmp/AutoencoderKL/test.so")
+    file_name = get_file_location_autoencoder()
+
+    exe_module = Model(file_name)
     if exe_module is None:
         print("Error!! Cannot find compiled module for AutoencoderKL.")
         exit(-1)
@@ -282,6 +296,8 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     np.random.seed(0)
     torch.manual_seed(4896)
 
+    local_dir = get_work_dir_location_diffusers()
+
     pipe = StableDiffusionPipeline.from_pretrained(
         local_dir,
         revision="fp16",
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index cfda48607..173866766 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -19,7 +19,7 @@
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
-from .util import mark_output
+from .util import get_work_dir_location, mark_output
 
 
 def map_clip_params(pt_mod, batch_size, seqlen, depth):
@@ -117,4 +117,7 @@ def compile_clip(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-    compile_model(Y, target, "./tmp", "CLIPTextModel", constants=params_ait)
+
+    workdir = get_work_dir_location()
+
+    compile_model(Y, target, workdir, "CLIPTextModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index 7cc2b41e4..96753a6f3 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -21,7 +21,7 @@
 from ..modeling.unet_2d_condition import (
     UNet2DConditionModel as ait_UNet2DConditionModel,
 )
-from .util import mark_output
+from .util import get_work_dir_location, mark_output
 
 
 def map_unet_params(pt_mod, dim):
@@ -85,4 +85,7 @@ def compile_unet(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
+
+    workdir = get_work_dir_location()
+
+    compile_model(Y, target, workdir, "UNet2DConditionModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index d01f320dc..2c28a431e 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -15,14 +15,13 @@
 from collections import OrderedDict
 
 import numpy as np
-
 import torch
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
-from .util import mark_output
+from .util import get_work_dir_location, mark_output
 
 
 def map_vae_params(ait_module, pt_module, batch_size, seq_len):
@@ -131,10 +130,13 @@ def compile_vae(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
+
+    workdir = get_work_dir_location()
+
     compile_model(
         Y,
         target,
-        "./tmp",
+        workdir,
         "AutoencoderKL",
         constants=params_ait,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 000e862e9..1497e8b7e 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -12,6 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import os
+
+
 def mark_output(y):
     if type(y) is not tuple:
         y = (y,)
@@ -20,3 +23,111 @@ def mark_output(y):
         y[i]._attrs["name"] = "output_%d" % (i)
         y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
+
+
+def get_work_dir_location():
+
+    """
+    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
+    path to a directory which has AITemplate compiled artifacts of the model(s).
+    Make sure the OS user running this script has read and write permissions to
+    this directory. By default, the artifacts will be saved under tmp/ folder of
+    the current working directory.
+    """
+
+    env_name = "AITEMPLATE_WORK_DIR"
+    workdir = "tmp/"
+    if env_name in os.environ:
+        workdir = os.environ[env_name]
+
+    print("The value of {} is {}".format(env_name, workdir))
+
+    return workdir
+
+
+def get_work_dir_location_diffusers():
+
+    """
+    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
+    path to a directory which has AITemplate compiled artifacts of the model(s).
+    Make sure the OS user running this script has read and write permissions to
+    this directory. By default, it will look for compiled artifacts under
+    tmp/ folder of the current working directory.
+    """
+
+    env_name = "AITEMPLATE_WORK_DIR"
+    local_dir = "./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2"
+
+    if env_name in os.environ:
+        local_dir = os.path.join(
+            os.environ[env_name],
+            "diffusers-pipeline",
+            "stabilityai",
+            "stable-diffusion-v2",
+        )
+
+    print("The value of {} is {}".format(env_name, local_dir))
+    return local_dir
+
+
+def get_file_location_clip():
+    """
+    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
+    path to a directory which has AITemplate compiled artifacts of the model(s).
+    Make sure the OS user running this script has read and write permissions to
+    this directory. By default, it will look for compiled artifacts under
+    tmp/ folder of the current working directory.
+    """
+
+    env_name = "AITEMPLATE_WORK_DIR"
+    file_name = "./tmp/CLIPTextModel/test.so"
+
+    if env_name in os.environ:
+        file_name = os.path.join(os.environ[env_name], "CLIPTextModel", "test.so")
+
+    print("The value of {} is {}".format(env_name, file_name))
+    return file_name
+
+
+def get_file_location_autoencoder():
+
+    """
+    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
+    path to a directory which has AITemplate compiled artifacts of the model(s).
+    Make sure the OS user running this script has read and write permissions to
+    this directory. By default, it will look for compiled artifacts under
+    tmp/ folder of the current working directory.
+    """
+
+    env_name = "AITEMPLATE_WORK_DIR"
+    file_name = "./tmp/AutoencoderKL/test.so"
+
+    if env_name in os.environ:
+        file_name = os.path.join(os.environ[env_name], "AutoencoderKL", "test.so")
+
+    print("The value of {} is {}".format(env_name, file_name))
+
+    return file_name
+
+
+def get_file_location_unet():
+
+    """
+    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
+    path to a directory which has AITemplate compiled artifacts of the model(s).
+    Make sure the OS user running this script has read and write permissions to
+    this directory. By default, it will look for compiled artifacts under
+    tmp/ folder of the current working directory.
+    """
+
+    env_name = "AITEMPLATE_WORK_DIR"
+    file_name = "./tmp/UNet2DConditionModel/test.so"
+
+    if env_name in os.environ:
+        file_name = os.path.join(
+            os.environ[env_name], "UNet2DConditionModel", "test.so"
+        )
+
+    print("The value of {} is {}".format(env_name, file_name))
+
+    return file_name
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 7dace1275..6818d6b91 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -13,8 +13,8 @@
 #  limitations under the License.
 #
 import inspect
-
 import os
+
 import warnings
 from typing import List, Optional, Union
 
@@ -37,6 +37,7 @@
     StableDiffusionPipelineOutput,
     StableDiffusionSafetyChecker,
 )
+from src.compile_lib.util import get_work_dir_location
 
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
@@ -98,7 +99,8 @@ def __init__(
             requires_safety_checker=requires_safety_checker,
         )
 
-        workdir = "tmp/"
+        workdir = get_work_dir_location()
+
         self.clip_ait_exe = self.init_ait_module(
             model_name="CLIPTextModel", workdir=workdir
         )
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
index ad2885086..084a23b51 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
@@ -36,6 +36,7 @@
     StableDiffusionPipelineOutput,
     StableDiffusionSafetyChecker,
 )
+from src.compile_lib.util import get_work_dir_location
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
@@ -109,7 +110,8 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
-        workdir = "tmp/"
+        workdir = get_work_dir_location()
+
         self.clip_ait_exe = self.init_ait_module(
             model_name="CLIPTextModel", workdir=workdir
         )

From 19f1e3a02e61a93c7423eb248b66ed0b3165b99b Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Wed, 15 Mar 2023 01:58:51 -0700
Subject: [PATCH 272/638] Add bmm_xxc and bmm_xxc_add ops (#369)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/369

T144417069: **Add bmm_xxr(_add) + permute021 ==> bmm_xxc(_add) fusion**

As a part of this task, we need to add `bmm_xxc` and `bmm_xxc_add` ops. The previous diff - D43621471 ([#341](https://github.com/facebookincubator/AITemplate/pull/341)) - added codegens for these ops. Current diff adds the ops themselves and the corresponding tests.

The class structure has been refactored: every individual op (`bmm_rrc`, `bmm_rcc` etc) inherits from `bmm_xxx` which inherits from `bmm` - this allows to minimize duplicated code. Same refactor is applied for `bmm_xxx_add` ops.

Reviewed By: chenyang78, aakhundov

Differential Revision: D43697976

fbshipit-source-id: 0bf28430a46ce9f4ef7bcaff7acbdee1b1be951e
---
 .../backend/cuda/gemm_universal/common.py     |  13 +
 .../gemm_epilogue_vistor/dual_bmm_rrr_div.py  |   2 +-
 .../compiler/ops/gemm_universal/__init__.py   |  27 +-
 .../compiler/ops/gemm_universal/bmm.py        |   4 +
 .../compiler/ops/gemm_universal/bmm_ccr.py    | 111 ----
 .../ops/gemm_universal/bmm_ccr_add.py         |  88 ----
 .../compiler/ops/gemm_universal/bmm_crr.py    | 111 ----
 .../ops/gemm_universal/bmm_crr_add.py         |  88 ----
 .../compiler/ops/gemm_universal/bmm_rcr.py    | 111 ----
 .../ops/gemm_universal/bmm_rcr_permute.py     |   2 +-
 .../compiler/ops/gemm_universal/bmm_rrr.py    | 109 ----
 .../ops/gemm_universal/bmm_rrr_add.py         |  84 ---
 .../ops/gemm_universal/bmm_rrr_permute.py     |   2 +-
 .../compiler/ops/gemm_universal/bmm_xxx.py    | 307 +++++++++++
 .../ops/gemm_universal/bmm_xxx_add.py         | 396 ++++++++++++++
 python/aitemplate/compiler/public/__init__.py |   3 +-
 .../transform/transform_special_ops.py        |   2 +-
 tests/unittest/ops/test_bmm.py                | 426 ++++++++++++---
 tests/unittest/ops/test_bmm_add.py            | 497 ++++++++++++++++--
 19 files changed, 1565 insertions(+), 818 deletions(-)
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
 delete mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index a0c54deb8..3624ee3da 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -1160,6 +1160,16 @@ def default_fproc(
     if "use_fp16_acc" in Target.current()._kwargs and data_type == "cutlass::half_t":
         if Target.current()._kwargs["use_fp16_acc"]:
             acc_type = cutlass_lib.library.DataType.f16
+
+    # For column-major C layouts, filter out GEMM tiling configs introducted by
+    # extra_cutlass_generator.py - those will cause a build error.
+    threadblock_mxn = op.tile_description.threadblock_shape[:2]
+    is_nonstandard_theadblock_shape = threadblock_mxn == [128, 32]
+    filter_extra_tile_configs = (
+        is_nonstandard_theadblock_shape
+        and c_layout == cutlass_lib.library.LayoutType.ColumnMajor
+    )
+
     if (
         cutlass_lib.library.DataTypeTag[op.A.element] == data_type
         and cutlass_lib.library.DataTypeTag[op.B.element] == data_type
@@ -1167,6 +1177,7 @@ def default_fproc(
         and op.accumulator_type() == acc_type
         and op.A.layout == a_layout
         and op.B.layout == b_layout
+        and not filter_extra_tile_configs
     ):
         op = copy.deepcopy(op)
         # set output major
@@ -1224,6 +1235,8 @@ def function_filter(cfg, func_attrs, ab_alignment):
     bool
         If input cfg should be filtered.
     """
+    # example:
+    # cfg="cutlass_tensorop_f16_s16816gemm_f16_128x32_64x4_nn_align_8_8"
     tmp = cfg.split("_")
     align_c = int(tmp[-1])
     align_ab = int(tmp[-2])
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
index 7e2a907ad..4be9a4585 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -16,7 +16,7 @@
 Batch GEMM specialization: BMM_RRR(A, B0) / BMM_RRR(A, B1)
 """
 from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
+from aitemplate.compiler.ops.gemm_universal import bmm_rrr
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
diff --git a/python/aitemplate/compiler/ops/gemm_universal/__init__.py b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
index 9e3ca05ab..24ae1ef82 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/__init__.py
@@ -13,19 +13,32 @@
 #  limitations under the License.
 #
 # flake8: noqa
-from aitemplate.compiler.ops.gemm_universal.bmm_ccr import bmm_ccr
-from aitemplate.compiler.ops.gemm_universal.bmm_ccr_add import bmm_ccr_add
-from aitemplate.compiler.ops.gemm_universal.bmm_crr import bmm_crr
-from aitemplate.compiler.ops.gemm_universal.bmm_crr_add import bmm_crr_add
-from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
 from aitemplate.compiler.ops.gemm_universal.bmm_rcr_permute import bmm_rcr_permute
-from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
-from aitemplate.compiler.ops.gemm_universal.bmm_rrr_add import bmm_rrr_add
 from aitemplate.compiler.ops.gemm_universal.bmm_rrr_permute import bmm_rrr_permute
 from aitemplate.compiler.ops.gemm_universal.bmm_softmax_bmm import bmm_softmax_bmm
 from aitemplate.compiler.ops.gemm_universal.bmm_softmax_bmm_permute import (
     bmm_softmax_bmm_permute,
 )
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import (
+    bmm_ccc,
+    bmm_ccr,
+    bmm_crc,
+    bmm_crr,
+    bmm_rcc,
+    bmm_rcr,
+    bmm_rrc,
+    bmm_rrr,
+)
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx_add import (
+    bmm_ccc_add,
+    bmm_ccr_add,
+    bmm_crc_add,
+    bmm_crr_add,
+    bmm_rcc_add,
+    bmm_rcr_add,
+    bmm_rrc_add,
+    bmm_rrr_add,
+)
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias import gemm_rcr_bias
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias_add import gemm_rcr_bias_add
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm.py b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
index 67fd3cc38..4a8a8e430 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm.py
@@ -20,6 +20,7 @@
 
 from aitemplate.compiler.base import IntImm, Tensor
 from aitemplate.compiler.dtype import is_same_dtype
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
 from aitemplate.compiler.ops.gemm_universal.gemm_common import gemm
 
 
@@ -113,3 +114,6 @@ def _sanity_check(self, a: Tensor, b: Tensor):
                     atype=a.dtype(), btype=b.dtype()
                 )
             )
+
+    def _invert_exec_key(self, key):
+        return common.gemm_inverse_key_func(key)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
deleted file mode 100644
index 250ae2c28..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor].
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import gemm_common as common
-from aitemplate.compiler.ops.gemm_universal.bmm import bmm
-
-# pylint: disable=C0103, W0223, W0221, W0613
-
-
-class bmm_ccr(bmm):
-    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor].
-
-    This operator is equivalent to following PyTorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-
-        XT = torch.transpose(X_pt, 2, 1)
-        Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
-    """
-
-    def __init__(self):
-        """Constructor for bmm_ccr"""
-        super().__init__()
-        self._attrs["op"] = "bmm_ccr"
-
-        def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, k, self._attrs["inputs"][0].dtype())
-
-        self._attrs["f_ab_alignment"] = cal_align_ab
-
-    def _infer_shapes(self, a: Tensor, b: Tensor):
-        batch_size = self._get_batch_size(a, b)
-        return [batch_size, a.shape()[-1], b.shape()[-2]]
-
-    def _extract_dims(self, for_profiling=False):
-        # (B, K, M) * (B, N, K) = (B, M, N)
-        a_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 0
-        )
-        b_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 1
-        )
-        output_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.OUTPUT, 0
-        )
-
-        B_dim = common.create_input_batch_diminfo(
-            [a_shapes, b_shapes], [0, 0], output_shapes[0]
-        )
-        B_dim.append(common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=0))
-
-        dim_info_dict = {
-            "B": B_dim,
-            "M": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 1
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=1),
-            ],
-            "N": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 2
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=2),
-            ],
-            "K": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 2
-                ),
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 1
-                ),
-            ],
-        }
-
-        return dim_info_dict
-
-    def _invert_exec_key(self, key):
-        return common.gemm_inverse_key_func(key)
-
-    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
-        def fbuild_cmd(exec_key):
-            B, M, N, K = self._invert_exec_key(exec_key)
-            cmd = []
-            cmd.append(B)  # m
-            cmd.append(M)  # m
-            cmd.append(N)  # n
-            cmd.append(K)  # k
-            return cmd
-
-        return super()._gen_profile_cmd(profiler_prefix, cfg, exec_key, fbuild_cmd)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
deleted file mode 100644
index 4f5ac9bd2..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor] with Add.
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import bmm_ccr
-from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
-from aitemplate.compiler.tensor_accessor import TensorAccessor
-
-# pylint: disable=C0103, W0223
-
-
-class bmm_ccr_add(bmm_ccr):
-    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor] with Add.
-    C can be the same size as the output or be broadcast as bias.
-
-    This operator is equivalent to following PyTorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-        D_pt = torch.randn(B, M, N).cuda().half()
-
-        XT = torch.transpose(X_pt, 2, 1)
-        WT = torch.transpose(W_pt, 2, 1)
-        Y_pt = torch.bmm(XT, WT)
-        Y_pt = Y_pt + D_pt
-    """
-
-    def __init__(self):
-        """Constructor for bmm_ccr_add"""
-        super().__init__()
-        self._attrs["op"] = "bmm_ccr_add"
-        self._attrs["has_d"] = True
-
-    @staticmethod
-    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
-        output_shapes = bmm_ccr()._infer_shapes(A, B)
-        c_shapes = C.shape()
-        return is_valid_inputs(output_shapes, c_shapes)
-
-    def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
-        """Call bmm_ccr_add with tensors a, b, c
-
-        Equivalent to the following PyTorch code:
-
-        .. highlight:: python
-        .. code-block:: python
-
-            y = bmm(a.transpose(2, 1), b.transpose(2, 1)) + c
-
-        Parameters
-        ----------
-        a : Tensor
-            Tensor in shape (B, K, M)
-        b : Tensor
-            Tensor in shape (B, N, K)
-        c : Tensor
-            Tensor in shape (B, M, N)
-
-        Returns
-        -------
-        Tensor
-            Tensor in shape (B, M, N)
-        """
-        output = super().__call__(a, b)
-        self._attrs["inputs"].append(c)
-        self._attrs["input_accessors"] = [
-            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
-        ]
-        self._set_depth()
-        return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
deleted file mode 100644
index 219da71fa..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor].
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import gemm_common as common
-from aitemplate.compiler.ops.gemm_universal.bmm import bmm
-
-# pylint: disable=C0103, W0223, W0221, W0613
-
-
-class bmm_crr(bmm):
-    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor].
-
-    This operator is equivalent to the following pytorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-
-        XT = torch.transpose(X_pt, 2, 1)
-        Y_pt = torch.bmm(XT, W_pt)
-
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._attrs["op"] = "bmm_crr"
-
-        def cal_align_ab(m, n, k):
-            return common.default_align_ab(m, n, self._attrs["inputs"][0].dtype())
-
-        self._attrs["f_ab_alignment"] = cal_align_ab
-
-    def _infer_shapes(self, a: Tensor, b: Tensor):
-        batch_size = self._get_batch_size(a, b)
-        return [batch_size, a.shape()[-1], b.shape()[-1]]
-
-    def _extract_dims(self, for_profiling=False):
-        # (B, K, M) * (B, K, N) = (B, M, N)
-        a_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 0
-        )
-        b_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 1
-        )
-        output_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.OUTPUT, 0
-        )
-
-        B_dim = common.create_input_batch_diminfo(
-            [a_shapes, b_shapes], [0, 0], output_shapes[0]
-        )
-        B_dim.append(common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=0))
-
-        dim_info_dict = {
-            "B": B_dim,
-            "M": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 1
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=1),
-            ],
-            "N": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 1
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=2),
-            ],
-            "K": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 2
-                ),
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 2
-                ),
-            ],
-        }
-
-        return dim_info_dict
-
-    def _invert_exec_key(self, key):
-        return common.gemm_inverse_key_func(key)
-
-    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
-        def fbuild_cmd(exec_key):
-            B, M, N, K = self._invert_exec_key(exec_key)
-            cmd = []
-            cmd.append(B)  # m
-            cmd.append(M)  # m
-            cmd.append(N)  # n
-            cmd.append(K)  # k
-            return cmd
-
-        return super()._gen_profile_cmd(profiler_prefix, cfg, exec_key, fbuild_cmd)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
deleted file mode 100644
index 93d69300e..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor] with Add.
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import bmm_crr
-from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
-from aitemplate.compiler.tensor_accessor import TensorAccessor
-
-# pylint: disable=C0103, W0223
-
-
-class bmm_crr_add(bmm_crr):
-    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor] with Add.
-    C can be the same size as the output or be broadcast as bias.
-
-    This operator is equivalent to the following PyTorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-        D_pt = torch.randn(B, M, N).cuda().half()
-
-        XT = torch.transpose(X_pt, 2, 1)
-        Y_pt = torch.bmm(XT, W_pt)
-        Y_pt = Y_pt + D_pt
-
-    """
-
-    def __init__(self):
-        """Constructor for bmm_crr_add"""
-        super().__init__()
-        self._attrs["op"] = "bmm_crr_add"
-        self._attrs["has_d"] = True
-
-    @staticmethod
-    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
-        output_shapes = bmm_crr()._infer_shapes(A, B)
-        c_shapes = C.shape()
-        return is_valid_inputs(output_shapes, c_shapes)
-
-    def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
-        """Call bmm_crr_add with tensors a, b, c
-
-        Equivalent to the following PyTorch code:
-
-        .. highlight:: python
-        .. code-block:: python
-
-            y = bmm(a.transpose(2, 1), b) + c
-
-        Parameters
-        ----------
-        a : Tensor
-            Tensor in shape (B, K, M)
-        b : Tensor
-            Tensor in shape (B, K, N)
-        c : Tensor
-            Tensor in shape (B, M, N)
-
-        Returns
-        -------
-        Tensor
-            Tensor in shape (B, M, N)
-        """
-        output = super().__call__(a, b)
-        self._attrs["inputs"].append(c)
-        self._attrs["input_accessors"] = [
-            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
-        ]
-        self._set_depth()
-        return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
deleted file mode 100644
index bc3d64e42..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[RowMajor], B[ColMajor], C[RowMajor].
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import gemm_common as common
-from aitemplate.compiler.ops.gemm_universal.bmm import bmm
-
-# pylint: disable=C0103, W0223, W0221, W0613
-
-
-class bmm_rcr(bmm):
-    """Batch GEMM specialization for A[RowMajor], B[ColMajor], C[RowMajor].
-
-    This operator is equivalent to the following pytorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, K, M).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
-
-        XT = torch.transpose(X_pt, 2, 1)
-        Y_pt = torch.bmm(XT, W_pt)
-
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._attrs["op"] = "bmm_rcr"
-
-        def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
-
-        self._attrs["f_ab_alignment"] = cal_align_ab
-
-    def _infer_shapes(self, a: Tensor, b: Tensor):
-        batch_size = self._get_batch_size(a, b)
-        return [batch_size, a.shape()[-2], b.shape()[-2]]
-
-    def _extract_dims(self, for_profiling=False):
-        # (B, M, K) * (B, N, K) = (B, M, N)
-        a_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 0
-        )
-        b_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 1
-        )
-        output_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.OUTPUT, 0
-        )
-
-        B_dim = common.create_input_batch_diminfo(
-            [a_shapes, b_shapes], [0, 0], output_shapes[0]
-        )
-        B_dim.append(common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=0))
-
-        dim_info_dict = {
-            "B": B_dim,
-            "M": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 2
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=1),
-            ],
-            "N": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 2
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=2),
-            ],
-            "K": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 1
-                ),
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 1
-                ),
-            ],
-        }
-
-        return dim_info_dict
-
-    def _invert_exec_key(self, key):
-        return common.gemm_inverse_key_func(key)
-
-    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
-        def fbuild_cmd(exec_key):
-            B, M, N, K = self._invert_exec_key(exec_key)
-            cmd = []
-            cmd.append(B)  # m
-            cmd.append(M)  # m
-            cmd.append(N)  # n
-            cmd.append(K)  # k
-            return cmd
-
-        return super()._gen_profile_cmd(profiler_prefix, cfg, exec_key, fbuild_cmd)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
index 4a987a498..a11ff5d34 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py
@@ -20,7 +20,7 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.common import reshape
-from aitemplate.compiler.ops.gemm_universal import bmm_rcr
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import bmm_rcr
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
deleted file mode 100644
index 37b65a8b4..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor]
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import gemm_common as common
-from aitemplate.compiler.ops.gemm_universal.bmm import bmm
-
-# pylint: disable=C0103, W0223, W0221, W0613
-
-
-class bmm_rrr(bmm):
-    """BBatch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor]
-
-    This operator is equivalent to the following pytorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, M, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-
-        Y_pt = torch.bmm(X_pt, W_pt)
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._attrs["op"] = "bmm_rrr"
-
-        def cal_align_ab(m, n, k):
-            return common.default_align_ab(k, n, self._attrs["inputs"][0].dtype())
-
-        self._attrs["f_ab_alignment"] = cal_align_ab
-
-    def _infer_shapes(self, a: Tensor, b: Tensor):
-        batch_size = self._get_batch_size(a, b)
-        return [batch_size, a.shape()[-2], b.shape()[-1]]
-
-    def _extract_dims(self, for_profiling=False):
-        # (B, M, K) * (B, K, N) = (B, M, N)
-        a_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 0
-        )
-        b_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.INPUT, 1
-        )
-        output_shapes = common.extract_shape_from_accessor(
-            self._attrs, common.Source.OUTPUT, 0
-        )
-
-        B_dim = common.create_input_batch_diminfo(
-            [a_shapes, b_shapes], [0, 0], output_shapes[0]
-        )
-        B_dim.append(common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=0))
-
-        dim_info_dict = {
-            "B": B_dim,
-            "M": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 2
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=1),
-            ],
-            "N": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 1
-                ),
-                common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=2),
-            ],
-            "K": [
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=0, dim_idx=len(a_shapes) - 1
-                ),
-                common.DimInfo(
-                    common.Source.INPUT, tensor_idx=1, dim_idx=len(b_shapes) - 2
-                ),
-            ],
-        }
-
-        return dim_info_dict
-
-    def _invert_exec_key(self, key):
-        return common.gemm_inverse_key_func(key)
-
-    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
-        def fbuild_cmd(exec_key):
-            B, M, N, K = self._invert_exec_key(exec_key)
-            cmd = []
-            cmd.append(B)  # m
-            cmd.append(M)  # m
-            cmd.append(N)  # n
-            cmd.append(K)  # k
-            return cmd
-
-        return super()._gen_profile_cmd(profiler_prefix, cfg, exec_key, fbuild_cmd)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
deleted file mode 100644
index d8062d14d..000000000
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor] with Add.
-"""
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.ops.gemm_universal import bmm_rrr
-from aitemplate.compiler.ops.gemm_universal.bmm import is_valid_inputs
-from aitemplate.compiler.tensor_accessor import TensorAccessor
-
-# pylint: disable=C0103, W0223
-
-
-class bmm_rrr_add(bmm_rrr):
-    """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor] with Add.
-    C can be the same size as the output or be broadcast as bias.
-
-    This operator is equivalent to the following pytorch code:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        X_pt = torch.randn(B, M, K).cuda().half()
-        W_pt = torch.randn(B, K, N).cuda().half()
-        D_pt = torch.randn(B, M, N).cuda().half()
-
-        Y_pt = torch.bmm(X_pt, W_pt) + D_pt
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._attrs["op"] = "bmm_rrr_add"
-        self._attrs["has_d"] = True
-
-    @staticmethod
-    def is_valid_inputs(A: Tensor, B: Tensor, C: Tensor):
-        output_shapes = bmm_rrr()._infer_shapes(A, B)
-        c_shapes = C.shape()
-        return is_valid_inputs(output_shapes, c_shapes)
-
-    def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
-        """Call bmm_rrr_add with tensors a, b, c
-
-        Equivalent to the following PyTorch code:
-
-        .. highlight:: python
-        .. code-block:: python
-
-            y = bmm(a, b) + c
-
-        Parameters
-        ----------
-        a : Tensor
-            Tensor with shape (B, M, K)
-        b : Tensor
-            Tensor with shape (B, K, N)
-        c : Tensor
-            Tensor with shape (B, M, N)
-
-        Returns
-        -------
-        Tensor
-            Tensor with shape (B, M, N)
-        """
-        output = super().__call__(a, b)
-        self._attrs["inputs"].append(c)
-        self._attrs["input_accessors"] = [
-            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
-        ]
-        self._set_depth()
-        return output
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
index 187db9f69..587cba9a2 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
@@ -20,7 +20,7 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.common import reshape
-from aitemplate.compiler.ops.gemm_universal import bmm_rrr
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import bmm_rrr
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0223,W0221,W0613
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
new file mode 100644
index 000000000..453aaa0ee
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
@@ -0,0 +1,307 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from ...base import Tensor
+from . import gemm_common as common
+from .bmm import bmm
+
+
+class bmm_xxx(bmm):
+    """Batch GEMM specialization"""
+
+    def __init__(self, a_layout, b_layout, c_layout):
+        super().__init__()
+        self._attrs["op"] = f"bmm_{a_layout}{b_layout}{c_layout}"
+        self.a_layout = a_layout
+        self.b_layout = b_layout
+        self.c_layout = c_layout
+
+        self.a_is_column_major = int(self.a_layout == "c")
+        self.b_is_column_major = int(self.b_layout == "c")
+        self.c_is_column_major = int(self.c_layout == "c")
+
+        def cal_align_ab(m, n, k):
+            return common.default_align_ab(
+                self._get_a_leading_dim(m, k),
+                self._get_b_leading_dim(n, k),
+                self._attrs["inputs"][0].dtype(),
+            )
+
+        self._attrs["f_ab_alignment"] = cal_align_ab
+
+    def _infer_shapes(self, a: Tensor, b: Tensor):
+        batch_size = self._get_batch_size(a, b)
+        m = a.shape()[self._get_m_idx_in_a(a.shape())]
+        n = b.shape()[self._get_n_idx_in_b(b.shape())]
+        return [batch_size, *self._get_output_shape(m, n)]
+
+    def _extract_dims(self, for_profiling=False):
+        # C = A * B
+        # A shape is (B, M, K) for row-major layout and (B, K, M) for column-major layout
+        # B shape is (B, K, N) for row-major layout and (B, N, K) for column-major layout
+        # C shape is (B, M, N) for row-major layout and (B, N, M) for column-major layout
+        a_shapes = common.extract_shape_from_accessor(
+            self._attrs, common.Source.INPUT, 0
+        )
+        b_shapes = common.extract_shape_from_accessor(
+            self._attrs, common.Source.INPUT, 1
+        )
+        output_shapes = common.extract_shape_from_accessor(
+            self._attrs, common.Source.OUTPUT, 0
+        )
+
+        B_dim = common.create_input_batch_diminfo(
+            [a_shapes, b_shapes], [0, 0], output_shapes[0]
+        )
+        B_dim.append(common.DimInfo(common.Source.OUTPUT, tensor_idx=0, dim_idx=0))
+
+        dim_info_dict = {
+            "B": B_dim,
+            "M": [
+                common.DimInfo(
+                    common.Source.INPUT,
+                    tensor_idx=0,
+                    dim_idx=self._get_m_idx_in_a(a_shapes),
+                ),
+                common.DimInfo(
+                    common.Source.OUTPUT,
+                    tensor_idx=0,
+                    dim_idx=self._get_m_idx_in_c(),
+                ),
+            ],
+            "N": [
+                common.DimInfo(
+                    common.Source.INPUT,
+                    tensor_idx=1,
+                    dim_idx=self._get_n_idx_in_b(b_shapes),
+                ),
+                common.DimInfo(
+                    common.Source.OUTPUT, tensor_idx=0, dim_idx=self._get_n_idx_in_c()
+                ),
+            ],
+            "K": [
+                common.DimInfo(
+                    common.Source.INPUT,
+                    tensor_idx=0,
+                    dim_idx=self._get_k_idx_in_a(a_shapes),
+                ),
+                common.DimInfo(
+                    common.Source.INPUT,
+                    tensor_idx=1,
+                    dim_idx=self._get_k_idx_in_b(b_shapes),
+                ),
+            ],
+        }
+
+        return dim_info_dict
+
+    def _get_a_leading_dim(self, m, k):
+        return [k, m][self.a_is_column_major]
+
+    def _get_b_leading_dim(self, n, k):
+        return [n, k][self.b_is_column_major]
+
+    def _get_m_idx_in_a(self, a_shapes):
+        return len(a_shapes) - 2 + self.a_is_column_major
+
+    def _get_m_idx_in_c(self):
+        return 1 + self.c_is_column_major
+
+    def _get_n_idx_in_b(self, b_shapes):
+        return len(b_shapes) - 1 - self.b_is_column_major
+
+    def _get_n_idx_in_c(self):
+        return 2 - self.c_is_column_major
+
+    def _get_k_idx_in_a(self, a_shapes):
+        return len(a_shapes) - 1 - self.a_is_column_major
+
+    def _get_k_idx_in_b(self, b_shapes):
+        return len(b_shapes) - 2 + self.b_is_column_major
+
+    def _get_output_shape(self, m, n):
+        if self.c_is_column_major:
+            return [n, m]
+        return [m, n]
+
+    def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
+        def fbuild_cmd(exec_key):
+            B, M, N, K = self._invert_exec_key(exec_key)
+            cmd = []
+            cmd.append(B)  # m
+            cmd.append(M)  # m
+            cmd.append(N)  # n
+            cmd.append(K)  # k
+            return cmd
+
+        return super()._gen_profile_cmd(profiler_prefix, cfg, exec_key, fbuild_cmd)
+
+
+class bmm_ccr(bmm_xxx):
+    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor].
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
+    """
+
+    def __init__(self):
+        super().__init__("c", "c", "r")
+
+
+class bmm_rrr(bmm_xxx):
+    """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor]
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+
+        Y_pt = torch.bmm(X_pt, W_pt)
+    """
+
+    def __init__(self):
+        super().__init__("r", "r", "r")
+
+
+class bmm_crr(bmm_xxx):
+    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor].
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt)
+
+    """
+
+    def __init__(self):
+        super().__init__("c", "r", "r")
+
+
+class bmm_rcr(bmm_xxx):
+    """Batch GEMM specialization for A[RowMajor], B[ColMajor], C[RowMajor].
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt)
+
+    """
+
+    def __init__(self):
+        super().__init__("r", "c", "r")
+
+
+class bmm_ccc(bmm_xxx):
+    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[ColMajor].
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        YT = torch.bmm(XT, W_pt.transpose(2, 1))
+        Y_pt = torch.transpose(YT, 2, 1)
+    """
+
+    def __init__(self):
+        super().__init__("c", "c", "c")
+
+
+class bmm_rrc(bmm_xxx):
+    """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[ColMajor]
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+
+        YT = torch.bmm(X_pt, W_pt)
+        Y_pt = torch.transpose(YT, 2, 1)
+    """
+
+    def __init__(self):
+        super().__init__("r", "r", "c")
+
+
+class bmm_crc(bmm_xxx):
+    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[ColMajor].
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        YT = torch.bmm(XT, W_pt)
+        Y_pt = torch.transpose(YT, 2, 1)
+
+    """
+
+    def __init__(self):
+        super().__init__("c", "r", "c")
+
+
+class bmm_rcc(bmm_xxx):
+    """Batch GEMM specialization for A[RowMajor], B[ColMajor], C[ColMajor].
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        YT = torch.bmm(XT, W_pt)
+        Y_pt = torch.transpose(YT, 2, 1)
+
+    """
+
+    def __init__(self):
+        super().__init__("r", "c", "c")
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py
new file mode 100644
index 000000000..00ae2a7ee
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py
@@ -0,0 +1,396 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+
+from ...base import Tensor
+from .bmm import is_valid_inputs as bmm_is_valid_inputs
+from .bmm_xxx import (
+    bmm_ccc,
+    bmm_ccr,
+    bmm_crc,
+    bmm_crr,
+    bmm_rcc,
+    bmm_rcr,
+    bmm_rrc,
+    bmm_rrr,
+    bmm_xxx,
+)
+
+
+class bmm_xxx_add(bmm_xxx):
+    """Batch GEMM specialization with Add.
+    C can be the same size as the output or be broadcast as bias.
+    """
+
+    def __init__(self, a_layout, b_layout, c_layout):
+        super().__init__(a_layout, b_layout, c_layout)
+        self._attrs["op"] = f"bmm_{a_layout}{b_layout}{c_layout}_add"
+        self._attrs["has_d"] = True
+
+    def __call__(self, a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        """Call bmm_rrr_add with tensors a, b, c"""
+        output = super().__call__(a, b)
+        self._attrs["inputs"].append(c)
+        self._attrs["input_accessors"] = [
+            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
+        ]
+        self._set_depth()
+        return output
+
+    def is_valid_inputs_unspecialized(self, A: Tensor, B: Tensor, C: Tensor):
+        # For base bmm_xxx_add class this method can't be static,
+        # since the class doesn't know about the layout (the object does).
+        output_shapes = bmm_xxx(
+            self.a_layout, self.b_layout, self.c_layout
+        )._infer_shapes(A, B)
+        c_shapes = C.shape()
+        return bmm_is_valid_inputs(output_shapes, c_shapes)
+
+    @classmethod
+    def is_valid_inputs(cls, A: Tensor, B: Tensor, C: Tensor):
+        """
+        This method should only be called from subclasses of bmm_xxx_add, since
+        _SpecializedBase is defined there. For the parent class bmm_xxx_add itself
+        call is_valid_inputs_unspecialized instead.
+        """
+        if not hasattr(cls, "_SpecializedBase"):
+            raise NotImplementedError(
+                "Call bmm_xxx_add.is_valid_inputs_unspecialized instead of bmm_xxx_add.is_valid_inputs. The latter is only defined for child classes of bmm_xxx_add."
+            )
+        output_shapes = cls._SpecializedBase()._infer_shapes(A, B)
+        c_shapes = C.shape()
+        return bmm_is_valid_inputs(output_shapes, c_shapes)
+
+
+class bmm_crr_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+        D_pt = torch.randn(B, M, N).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt)
+        Y_pt = Y_pt + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, K, M)
+        b : Tensor
+            Tensor in shape (B, K, N)
+        c : Tensor
+            Tensor in shape (B, M, N)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, M, N)
+
+    """
+
+    _SpecializedBase = bmm_crr
+
+    def __init__(self):
+        """Constructor for bmm_crr_add"""
+        super().__init__("c", "r", "r")
+
+
+class bmm_rcr_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[RowMajor], B[ColMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+        D_pt = torch.randn(B, M, N).cuda().half()
+
+        WT = torch.transpose(W_pt, 2, 1)
+        Y_pt = torch.bmm(X_pt, WT)
+        Y_pt = Y_pt + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, M, K)
+        b : Tensor
+            Tensor in shape (B, N, K)
+        c : Tensor
+            Tensor in shape (B, M, N)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, M, N)
+    """
+
+    _SpecializedBase = bmm_rcr
+
+    def __init__(self):
+        """Constructor for bmm_rcr_add"""
+        super().__init__("r", "c", "r")
+
+
+class bmm_ccr_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+        D_pt = torch.randn(B, M, N).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        WT = torch.transpose(W_pt, 2, 1)
+        Y_pt = torch.bmm(XT, WT)
+        Y_pt = Y_pt + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, K, M)
+        b : Tensor
+            Tensor in shape (B, N, K)
+        c : Tensor
+            Tensor in shape (B, M, N)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, M, N)
+    """
+
+    _SpecializedBase = bmm_ccr
+
+    def __init__(self):
+        """Constructor for bmm_ccr_add"""
+        super().__init__("c", "c", "r")
+
+
+class bmm_rrr_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[RowMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+        D_pt = torch.randn(B, M, N).cuda().half()
+
+        Y_pt = torch.bmm(X_pt, W_pt) + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor with shape (B, M, K)
+        b : Tensor
+            Tensor with shape (B, K, N)
+        c : Tensor
+            Tensor with shape (B, M, N)
+
+        Returns
+        -------
+        Tensor
+            Tensor with shape (B, M, N)
+    """
+
+    _SpecializedBase = bmm_rrr
+
+    def __init__(self):
+        super().__init__("r", "r", "r")
+
+
+class bmm_crc_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[ColMajor], B[RowMajor], C[ColMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+        D_pt = torch.randn(B, N, M).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        YT = torch.bmm(XT, W_pt)
+        Y_pt = YT.transpose(2, 1) + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, K, M)
+        b : Tensor
+            Tensor in shape (B, K, N)
+        c : Tensor
+            Tensor in shape (B, N, M)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, N, M)
+
+    """
+
+    _SpecializedBase = bmm_crc
+
+    def __init__(self):
+        """Constructor for bmm_crc_add"""
+        super().__init__("c", "r", "c")
+
+
+class bmm_rcc_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[RowMajor], B[ColMajor], C[ColMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+        D_pt = torch.randn(B, N, M).cuda().half()
+
+        WT = torch.transpose(W_pt, 2, 1)
+        YT = torch.bmm(X_pt, WT)
+        Y_pt = YT.transpose(2, 1) + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, M, K)
+        b : Tensor
+            Tensor in shape (B, N, K)
+        c : Tensor
+            Tensor in shape (B, N, M)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, N, M)
+
+    """
+
+    _SpecializedBase = bmm_rcc
+
+    def __init__(self):
+        """Constructor for bmm_rcc_add"""
+        super().__init__("r", "c", "c")
+
+
+class bmm_ccc_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[ColMajor], B[ColMajor], C[ColMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, K, M).cuda().half()
+        W_pt = torch.randn(B, N, K).cuda().half()
+        D_pt = torch.randn(B, N, M).cuda().half()
+
+        XT = torch.transpose(X_pt, 2, 1)
+        WT = torch.transpose(W_pt, 2, 1)
+        YT = torch.bmm(XT, WT)
+        Y_pt = YT.transpose(2, 1) + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor in shape (B, K, M)
+        b : Tensor
+            Tensor in shape (B, N, K)
+        c : Tensor
+            Tensor in shape (B, N, M)
+
+        Returns
+        -------
+        Tensor
+            Tensor in shape (B, N, M)
+
+    """
+
+    _SpecializedBase = bmm_ccc
+
+    def __init__(self):
+        """Constructor for bmm_ccc_add"""
+        super().__init__("c", "c", "c")
+
+
+class bmm_rrc_add(bmm_xxx_add):
+    """Batch GEMM specialization for A[RowMajor], B[RowMajor], C[ColMajor] with Add.
+    C can be the same size as the output or be broadcast as bias.
+
+    This operator is equivalent to the following PyTorch code:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        X_pt = torch.randn(B, M, K).cuda().half()
+        W_pt = torch.randn(B, K, N).cuda().half()
+        D_pt = torch.randn(B, N, M).cuda().half()
+        YT = torch.bmm(X_pt, W_pt)
+        Y_pt = YT.transpose(2, 1) + D_pt
+
+    __call__(a: Tensor, b: Tensor, c: Tensor) -> Tensor:
+        Parameters
+        ----------
+        a : Tensor
+            Tensor with shape (B, M, K)
+        b : Tensor
+            Tensor with shape (B, K, N)
+        c : Tensor
+            Tensor with shape (B, N, M)
+
+        Returns
+        -------
+        Tensor
+            Tensor with shape (B, N, M)
+    """
+
+    _SpecializedBase = bmm_rrc
+
+    def __init__(self):
+        super().__init__("r", "r", "c")
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index 3f5e77761..d8a6a5bf3 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -38,8 +38,7 @@
 from aitemplate.compiler.ops.common.int_elementwise import int_elementwise
 
 """GEMM"""
-from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
-from aitemplate.compiler.ops.gemm_universal.bmm_rrr import bmm_rrr
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import bmm_rcr, bmm_rrr
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr_bias import gemm_rcr_bias
 from aitemplate.compiler.ops.gemm_universal.gemm_rrr import gemm_rrr
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index 55edb037e..cf1f6ffa8 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -21,7 +21,7 @@
 from aitemplate.compiler import ops
 from aitemplate.compiler.base import Operator, Tensor
 from aitemplate.compiler.ops.gemm_special.gemm_rrr_small_nk import gemm_rrr_small_nk
-from aitemplate.compiler.ops.gemm_universal.bmm_rcr import bmm_rcr
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import bmm_rcr
 from aitemplate.compiler.ops.gemm_universal.gemm_rrr import gemm_rrr
 from aitemplate.compiler.transform.transform_utils import (
     copy_src_op_attributes,
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 9bdb71e1c..e390865f7 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -21,12 +21,21 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_test_env,
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 from aitemplate.utils import shape_utils
 
+from parameterized import parameterized
+
+
+_TEST_PARAMS = {
+    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+}
+
 
 class BMMTestCase(unittest.TestCase):
     def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
@@ -160,63 +169,191 @@ def test_ccr(self):
         if detect_target().name() == "cuda":
             self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
 
+    def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
+        m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
+        X = Tensor(
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rcc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_rcc_{}".format(test_name))
+
+        for (b, m) in itertools.product(bs, ms):
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype)
+
+            WT = torch.transpose(W_pt, 2, 1)
+            Y_pt = torch.bmm(X_pt, WT)
+            Y_pt = torch.transpose(Y_pt, 2, 1)
+
+            y = get_torch_empty_tensor([b, N, m], dtype)
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            if X_pt.nelement() == 0 or Y_pt.nelement() == 0:
+                pass
+            else:
+                self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_rcc(self):
+        self._test_rcc([1024], [128], N=512, K=256, test_name="static")
+        if detect_target().name() == "cuda":
+            self._test_rcc([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
+            self._test_rcc([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
+            self._test_rcc(
+                [1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm"
+            )
+            self._test_rcc([0], [128], N=512, K=256, test_name="zero_batch")
+            self._test_rcc([1], [128], N=512, K=0, test_name="zero_k")
+            self._test_rcc([1], [128], N=0, K=8, test_name="zero_n")
+
+    def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
+        k_dim = shape_utils.gen_int_var_min_max(ks, name="k")
+        X = Tensor(
+            shape=[batch_dim, k_dim, M], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(
+            shape=[batch_dim, k_dim, N], dtype=dtype, name="input_1", is_input=True
+        )
+        OP = ops.bmm_crc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_crc_{}".format(test_name))
+
+        for (b, k) in itertools.product(bs, ks):
+            X_pt = get_random_torch_tensor([b, k, M], dtype)
+            W_pt = get_random_torch_tensor([b, k, N], dtype)
+
+            XT = torch.transpose(X_pt, 2, 1)
+            Y_pt = torch.bmm(XT, W_pt)
+            Y_pt = torch.transpose(Y_pt, 2, 1)
+
+            y = get_torch_empty_tensor([b, N, M], dtype)
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_crc(self):
+        self._test_crc([1024], [128], M=256, N=512, test_name="static")
+        if detect_target().name() == "cuda":
+            self._test_crc([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
+            self._test_crc([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
+            self._test_crc([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
+
+    def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
+        m_dim = shape_utils.gen_int_var_min_max(ms, name="m")
+        X = Tensor(
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rrc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_rrc_{}".format(test_name))
+
+        for (b, m) in itertools.product(bs, ms):
+            X_pt = get_random_torch_tensor([b, m, K], dtype)
+            W_pt = get_random_torch_tensor([b, K, N], dtype)
+
+            Y_pt = torch.bmm(X_pt, W_pt)
+            Y_pt = torch.transpose(Y_pt, 2, 1)
+
+            y = get_torch_empty_tensor([b, N, m], dtype)
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_rrc(self):
+        self._test_rrc([87], [23], K=256, N=512, test_name="static")
+        if detect_target().name() == "cuda":
+            self._test_rrc([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
+            self._test_rrc([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
+            self._test_rrc([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+
+    def _test_ccc(self, bs, M, N, K, test_name, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max(bs, name="batch_size")
+        X = Tensor(shape=[batch_dim, K, M], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_ccc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_ccc_{}".format(test_name))
+
+        for b in bs:
+            X_pt = get_random_torch_tensor([b, K, M], dtype)
+            W_pt = get_random_torch_tensor([b, N, K], dtype)
+
+            XT = torch.transpose(X_pt, 2, 1)
+            Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
+            Y_pt = torch.transpose(Y_pt, 2, 1)
+            y = get_torch_empty_tensor([b, N, M], dtype)
+            # y = get_torch_empty_tensor([b, M, N], dtype)
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_ccc(self):
+        self._test_ccc([77], M=256, N=64, K=128, test_name="static")
+        if detect_target().name() == "cuda":
+            self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_bmm_fp32_sm80(self):
-        self._test_rcr([128], [64], N=8, K=64, test_name="static_float", dtype="float")
+    def test_bmm_dtype(self, dtype):
+        self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcr(
             [1, 5, 77, 128],
             [32],
             N=16,
             K=64,
-            test_name="dynamic_b_float",
-            dtype="float",
+            test_name=f"dynamic_b_{dtype}",
+            dtype=dtype,
         )
         self._test_crr(
             [1, 2, 5],
             [3, 6, 8],
             M=24,
             N=64,
-            test_name="dynamic_bk_float",
-            dtype="float",
+            test_name=f"dynamic_bk_{dtype}",
+            dtype=dtype,
         )
         self._test_rrr(
-            [8], [4, 7, 9], K=64, N=32, test_name="dynamic_m_float", dtype="float"
+            [8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
         )
         self._test_ccr(
-            [1, 9, 11], M=64, N=32, K=16, test_name="dynamic_b_float", dtype="float"
+            [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_bmm_bf16(self):
-        self._test_rcr(
-            [128], [64], N=8, K=64, test_name="static_bfloat16", dtype="bfloat16"
-        )
-        self._test_rcr(
+        self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rcc(
             [1, 5, 77, 128],
             [32],
             N=16,
             K=64,
-            test_name="dynamic_b_bfloat16",
-            dtype="bfloat16",
+            test_name=f"dynamic_b_{dtype}",
+            dtype=dtype,
         )
-        self._test_crr(
+        self._test_crc(
             [1, 2, 5],
             [3, 6, 8],
             M=24,
             N=64,
-            test_name="dynamic_bk_bfloat16",
-            dtype="bfloat16",
+            test_name=f"dynamic_bk_{dtype}",
+            dtype=dtype,
         )
-        self._test_rrr(
-            [8], [4, 7, 9], K=64, N=32, test_name="dynamic_m_bfloat16", dtype="bfloat16"
+        self._test_rrc(
+            [8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
         )
-        self._test_ccr(
-            [1, 9, 11],
-            M=64,
-            N=32,
-            K=16,
-            test_name="dynamic_b_bfloat16",
-            dtype="bfloat16",
+        self._test_ccc(
+            [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
 
@@ -443,45 +580,169 @@ def test_ccr(self):
         self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
-    def test_bmm_broadcast_fp32_sm80(self):
-        self._test_rcr_with_accessors(dtype="float")
-        self._test_rcr_merge_with_accessors(dtype="float")
-        self._test_rcr([2, 16, 8], [1, 32, 8], "broadcastable_b", dtype="float")
-        self._test_rcr([16, 8], [8, 32, 8], "2d_broadcastable_a", dtype="float")
-        self._test_crr([1, 8, 16], [2, 8, 32], "broadcastable_a", dtype="float")
-        self._test_crr([8, 8, 16], [8, 32], "2d_broadcastable_b", dtype="float")
-        self._test_rrr([2, 16, 8], [1, 8, 32], "broadcastable_b", dtype="float")
-        self._test_rrr([16, 8], [8, 8, 32], "2d_broadcastable_a", dtype="float")
-        self._test_ccr([1, 8, 16], [2, 32, 8], "broadcastable_a", dtype="float")
-        self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b", dtype="float")
-
-    def test_bmm_broadcast_bf16(self):
-        self._test_rcr_with_accessors(dtype="bfloat16")
-        self._test_rcr_merge_with_accessors(dtype="bfloat16")
-        self._test_rcr(
-            [2, 16, 8], [1, 32, 8], "broadcastable_b_bfloat16", dtype="bfloat16"
-        )
-        self._test_rcr(
-            [16, 8], [8, 32, 8], "2d_broadcastable_a_bfloat16", dtype="bfloat16"
-        )
-        self._test_crr(
-            [1, 8, 16], [2, 8, 32], "broadcastable_a_bfloat16", dtype="bfloat16"
-        )
-        self._test_crr(
-            [8, 8, 16], [8, 32], "2d_broadcastable_b_bfloat16", dtype="bfloat16"
-        )
-        self._test_rrr(
-            [2, 16, 8], [1, 8, 32], "broadcastable_b_bfloat16", dtype="bfloat16"
-        )
-        self._test_rrr(
-            [16, 8], [8, 8, 32], "2d_broadcastable_a_bfloat16", dtype="bfloat16"
-        )
-        self._test_ccr(
-            [1, 8, 16], [2, 32, 8], "broadcastable_a_bfloat16", dtype="bfloat16"
-        )
-        self._test_ccr(
-            [8, 8, 16], [32, 8], "2d_broadcastable_b_bfloat16", dtype="bfloat16"
-        )
+    def _test_rcc(self, A_shape, B_shape, test_name, dtype="float16"):
+        M, N = A_shape[-2], B_shape[-2]
+        if len(A_shape) == 2:
+            B = B_shape[0]
+        elif len(B_shape) == 2:
+            B = A_shape[0]
+        else:
+            B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rcc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_rcc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+
+        WT = torch.transpose(W_pt, -2, -1)
+        Y_pt = torch.matmul(X_pt, WT)
+        Y_pt = torch.transpose(Y_pt, -2, -1)
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rcc(self):
+        self._test_rcc([1, 16, 8], [2, 32, 8], "broadcastable_a")
+        self._test_rcc([2, 16, 8], [1, 32, 8], "broadcastable_b")
+        self._test_rcc([16, 8], [8, 32, 8], "2d_broadcastable_a")
+        self._test_rcc([8, 16, 8], [32, 8], "2d_broadcastable_b")
+
+    def _test_crc(self, A_shape, B_shape, test_name, dtype="float16"):
+        M, N = A_shape[-1], B_shape[-1]
+        if len(A_shape) == 2:
+            B = B_shape[0]
+        elif len(B_shape) == 2:
+            B = A_shape[0]
+        else:
+            B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_crc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_crc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+
+        XT = torch.transpose(X_pt, -2, -1)
+        Y_pt = torch.matmul(XT, W_pt)
+        Y_pt = torch.transpose(Y_pt, -2, -1)
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_crc(self):
+        self._test_crc([1, 8, 16], [2, 8, 32], "broadcastable_a")
+        self._test_crc([2, 8, 16], [1, 8, 32], "broadcastable_b")
+        self._test_crc([8, 16], [8, 8, 32], "2d_broadcastable_a")
+        self._test_crc([8, 8, 16], [8, 32], "2d_broadcastable_b")
+
+    def _test_rrc(self, A_shape, B_shape, test_name, dtype="float16"):
+        M, N = A_shape[-2], B_shape[-1]
+        if len(A_shape) == 2:
+            B = B_shape[0]
+        elif len(B_shape) == 2:
+            B = A_shape[0]
+        else:
+            B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rrc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_rrc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+
+        Y_pt = torch.matmul(X_pt, W_pt)
+        Y_pt = Y_pt.transpose(-2, -1)
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rrc(self):
+        self._test_rrc([1, 16, 8], [2, 8, 32], "broadcastable_a")
+        self._test_rrc([2, 16, 8], [1, 8, 32], "broadcastable_b")
+        self._test_rrc([16, 8], [8, 8, 32], "2d_broadcastable_a")
+        self._test_rrc([8, 16, 8], [8, 32], "2d_broadcastable_b")
+
+    def _test_ccc(self, A_shape, B_shape, test_name, dtype="float16"):
+        M, N = A_shape[-1], B_shape[-2]
+        if len(A_shape) == 2:
+            B = B_shape[0]
+        elif len(B_shape) == 2:
+            B = A_shape[0]
+        else:
+            B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_ccc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_ccc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+
+        XT = torch.transpose(X_pt, -2, -1)
+        WT = torch.transpose(W_pt, -2, -1)
+        Y_pt = torch.matmul(XT, WT)
+        Y_pt = torch.transpose(Y_pt, -2, -1)
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_ccc(self):
+        self._test_ccr([1, 8, 16], [2, 32, 8], "broadcastable_a")
+        self._test_ccr([2, 8, 16], [1, 32, 8], "broadcastable_b")
+        self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
+        self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
+
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_broadcast_dtype(self, dtype):
+        self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crr([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrr([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrr([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+
+        self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crc([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrc([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrc([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccc([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccc([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
 
     def test_rcr_fail(self, dtype="float16"):
         target = detect_target()
@@ -539,9 +800,34 @@ def test_rrr_fail(self, dtype="float16"):
         except RuntimeError:
             pass
 
+    def test_rcc_fail(self, dtype="float16"):
+        target = detect_target()
+        batch_dim = shape_utils.gen_int_var_min_max([1, 16], name="batch_size")
+        m_dim = shape_utils.gen_int_var_min_max([1, 10], name="m")
+        K = 3
+        N = 8
+        X = Tensor(
+            shape=[batch_dim, m_dim, K], dtype=dtype, name="input_0", is_input=True
+        )
+        W = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="input_1", is_input=True)
+        OP = ops.bmm_rcc()
+        Y = OP(X, W)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", "bmm_rcc_should_fail")
+
+        X_pt = get_random_torch_tensor([2, 10, K], dtype)
+        W_pt = get_random_torch_tensor([16, 8, K], dtype)
+        y = get_torch_empty_tensor([2, 8, 10], dtype)
+
+        try:
+            module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
+            raise AssertionError(
+                "Shouldn't be able to run be imcompatible tensor shape!"
+            )
+        except RuntimeError:
+            pass
 
-filter_test_cases_by_test_env(BMMBroadcastTestCase)
-filter_test_cases_by_test_env(BMMTestCase)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index a604e4877..86d1cbd4c 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -20,11 +20,19 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_test_env,
+    filter_test_cases_by_params,
     get_random_torch_tensor,
     get_torch_empty_tensor,
+    TestEnv,
 )
 
+from parameterized import parameterized
+
+_TEST_PARAMS = {
+    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+}
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMAddTestCase(unittest.TestCase):
@@ -92,6 +100,34 @@ def _test_ccr(self, B, M, N, K, test_name, dtype="float16"):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
+    def _test_rcr(self, B, M, N, K, test_name, dtype="float16"):
+        target = detect_target()
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, M, N], dtype=dtype, name="input_2", is_input=True)
+        OP = ops.bmm_rcr_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, M, K], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        D_pt = get_random_torch_tensor([B, M, N], dtype)
+
+        Y_pt = torch.bmm(X_pt, W_pt.transpose(2, 1))
+        Y_pt = Y_pt + D_pt
+
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        if X_pt.nelement() == 0 or W_pt.nelement == 0:
+            pass
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
     def _test_crr(self, B, M, K, N, dtype="float16"):
         target = detect_target()
         X = Tensor(
@@ -134,6 +170,133 @@ def _test_crr(self, B, M, K, N, dtype="float16"):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
+    def _test_rcc(self, B, M, K, N, test_name, dtype="float16"):
+        target = detect_target()
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, N, M], dtype=dtype, name="input_2", is_input=True)
+        OP = ops.bmm_rcc_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, M, K], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        D_pt = get_random_torch_tensor([B, N, M], dtype)
+
+        WT = W_pt.transpose(2, 1)
+        Y_pt = torch.bmm(X_pt, WT)
+        Y_pt = Y_pt.transpose(2, 1) + D_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        if X_pt.nelement() == 0 or W_pt.nelement == 0:
+            pass
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def _test_rrc(self, B, M, K, N, dtype="float16"):
+        target = detect_target()
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, N, M], dtype=dtype, name="input_2", is_input=True)
+        OP = ops.bmm_rrc_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(
+            Y, target, "./tmp", f"bmm_rrc_add_{dtype}", dll_name=dll_name
+        )
+        X_pt = get_random_torch_tensor([B, M, K], dtype)
+        W_pt = get_random_torch_tensor([B, K, N], dtype)
+        D_pt = get_random_torch_tensor([B, N, M], dtype)
+
+        Y_pt = torch.bmm(X_pt, W_pt)
+        Y_pt = Y_pt.transpose(2, 1) + D_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+        self.test_count += 1
+
+    def _test_crc(self, B, M, K, N, dtype="float16"):
+        target = detect_target()
+        X = Tensor(
+            shape=[B, K, M],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        W = Tensor(
+            shape=[B, K, N],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        D = Tensor(
+            shape=[B, N, M],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
+        )
+        OP = ops.bmm_crc_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        test_name = f"bmm_crc_add_{dtype}"
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, K, N], dtype)
+        D_pt = get_random_torch_tensor([B, N, M], dtype)
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt)
+        Y_pt = Y_pt.transpose(2, 1) + D_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
+    def _test_ccc(self, B, M, N, K, test_name, dtype="float16"):
+        target = detect_target()
+        X = Tensor(shape=[B, K, M], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
+        D = Tensor(shape=[B, N, M], dtype=dtype, name="input_2", is_input=True)
+        OP = ops.bmm_ccc_add()
+        Y = OP(X, W, D)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        X_pt = get_random_torch_tensor([B, K, M], dtype)
+        W_pt = get_random_torch_tensor([B, N, K], dtype)
+        D_pt = get_random_torch_tensor([B, N, M], dtype)
+
+        XT = torch.transpose(X_pt, 2, 1)
+        Y_pt = torch.bmm(XT, W_pt.transpose(2, 1))
+        Y_pt = Y_pt.transpose(2, 1) + D_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": D_pt}, [y]
+        )
+        if X_pt.nelement() == 0 or W_pt.nelement == 0:
+            pass
+        else:
+            self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+        self.test_count += 1
+
     def test_rrr(self):
         self._test_rrr(B=32, M=256, K=256, N=512)
 
@@ -143,22 +306,52 @@ def test_ccr(self):
         self._test_ccr(B=1, M=0, N=256, K=512, test_name="bmm_ccr_zero_m")
         self._test_ccr(B=1, M=256, N=256, K=0, test_name="bmm_ccr_zero_k")
 
+    def test_rcr(self):
+        self._test_rcr(B=32, M=256, N=256, K=512, test_name="bmm_rcr_add")
+        self._test_rcr(B=0, M=256, N=256, K=512, test_name="bmm_rcr_zero_batch")
+        self._test_rcr(B=1, M=0, N=256, K=512, test_name="bmm_rcr_zero_m")
+        self._test_rcr(B=1, M=256, N=256, K=0, test_name="bmm_rcr_zero_k")
+
     def test_crr(self):
         self._test_crr(B=32, M=256, K=256, N=512)
 
-    def test_bmm_add_fp32_sm80(self):
-        self._test_rrr(B=8, M=32, K=8, N=64, dtype="float")
+    def test_ccc(self):
+        self._test_ccc(B=32, M=256, N=256, K=512, test_name="bmm_ccc_add")
+        self._test_ccc(B=0, M=256, N=256, K=512, test_name="bmm_ccc_zero_batch")
+        self._test_ccc(B=1, M=0, N=256, K=512, test_name="bmm_ccc_zero_m")
+        self._test_ccc(B=1, M=256, N=256, K=0, test_name="bmm_ccc_zero_k")
+
+    def test_rcc(self):
+        self._test_rcc(B=32, M=256, N=256, K=512, test_name="bmm_rcc_add")
+        self._test_rcc(B=0, M=256, N=256, K=512, test_name="bmm_rcc_zero_batch")
+        self._test_rcc(B=1, M=0, N=256, K=512, test_name="bmm_rcc_zero_m")
+        self._test_rcc(B=1, M=256, N=256, K=0, test_name="bmm_rcc_zero_k")
+
+    def test_rrc(self):
+        self._test_rrc(B=32, M=256, K=256, N=512)
+
+    def test_crc(self):
+        self._test_crc(B=32, M=256, K=256, N=512)
+
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_add_dtype(self, dtype):
+        self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccr(
-            B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_float", dtype="float"
+            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+        )
+        self._test_crr(B=8, M=32, K=16, N=64, dtype=dtype)
+        self._test_rcr(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
         )
-        self._test_crr(B=8, M=32, K=16, N=64, dtype="float")
 
-    def test_bmm_add_bf16(self):
-        self._test_rrr(B=8, M=32, K=8, N=64, dtype="bfloat16")
-        self._test_ccr(
-            B=8, M=32, N=64, K=16, test_name="bmm_ccr_add_bfloat16", dtype="bfloat16"
+        self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_ccc(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+        )
+        self._test_crc(B=8, M=32, K=16, N=64, dtype=dtype)
+        self._test_rcc(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_rcc_add_{dtype}", dtype=dtype
         )
-        self._test_crr(B=8, M=32, K=16, N=64, dtype="bfloat16")
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
@@ -262,6 +455,56 @@ def test_rrr(self):
             test_name="broadcastable_bias3d",
         )
 
+    def _test_rcr(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
+        M, N = A_shape[-2], B_shape[-2]
+        B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
+        Y = ops.bmm_rcr_add()(X, W, bias)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
+
+        WT = torch.transpose(W_pt, -2, -1)
+        Y_pt = torch.matmul(X_pt, WT) + bias_pt
+
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rcr(self):
+        self._test_rcr(
+            [1, 16, 8], [2, 32, 8], bias_shape=[32], test_name="broadcastable_bias1d"
+        )
+        self._test_rcr(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 32],
+            test_name="broadcastable_bias1d_2",
+        )
+        self._test_rcr(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[16, 32],
+            test_name="broadcastable_bias2d",
+        )
+        self._test_rcr(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name="broadcastable_bias3d",
+        )
+
     def _test_ccr(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
         M, N = A_shape[-1], B_shape[-2]
         B = max(A_shape[0], B_shape[0])
@@ -313,55 +556,243 @@ def test_ccr(self):
             test_name="broadcastable_bias3d",
         )
 
-    def test_bmm_add_broadcast_fp32_sm80(self):
-        self._test_crr(
-            [1, 8, 16],
+    def _test_crc(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
+        M, N = A_shape[-1], B_shape[-1]
+        B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
+        Y = ops.bmm_crc_add()(X, W, bias)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_crc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
+
+        XT = torch.transpose(X_pt, -2, -1)
+        Y_pt = torch.matmul(XT, W_pt).transpose(-2, -1) + bias_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def _test_rrc(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
+        M, N = A_shape[-2], B_shape[-1]
+        B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
+        Y = ops.bmm_rrc_add()(X, W, bias)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
+
+        Y_pt = torch.matmul(X_pt, W_pt).transpose(-2, -1) + bias_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rrc(self):
+        self._test_rrc(
+            [1, 16, 8], [2, 8, 32], bias_shape=[16], test_name="broadcastable_bias1d"
+        )
+        self._test_rrc(
+            [1, 16, 8],
             [2, 8, 32],
-            bias_shape=[16, 32],
-            test_name="broadcastable_bias2d_float",
-            dtype="float",
+            bias_shape=[1, 16],
+            test_name="broadcastable_bias1d_2",
         )
-        self._test_rrr(
+        self._test_rrc(
             [1, 16, 8],
             [2, 8, 32],
-            bias_shape=[1, 32],
-            test_name="broadcastable_bias1d_2_float",
-            dtype="float",
+            bias_shape=[32, 16],
+            test_name="broadcastable_bias2d",
         )
-        self._test_ccr(
+        self._test_rrc(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 32, 16],
+            test_name="broadcastable_bias3d",
+        )
+
+    def _test_rcc(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
+        M, N = A_shape[-2], B_shape[-2]
+        B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
+        Y = ops.bmm_rcc_add()(X, W, bias)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_rcc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
+
+        WT = torch.transpose(W_pt, -2, -1)
+        Y_pt = torch.matmul(X_pt, WT).transpose(-2, -1) + bias_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_rcc(self):
+        self._test_rcc(
+            [1, 16, 8], [2, 32, 8], bias_shape=[16], test_name="broadcastable_bias1d"
+        )
+        self._test_rcc(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 16],
+            test_name="broadcastable_bias1d_2",
+        )
+        self._test_rcc(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[32, 16],
+            test_name="broadcastable_bias2d",
+        )
+        self._test_rcc(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name="broadcastable_bias3d",
+        )
+
+    def _test_ccc(self, A_shape, B_shape, bias_shape, test_name, dtype="float16"):
+        M, N = A_shape[-1], B_shape[-2]
+        B = max(A_shape[0], B_shape[0])
+
+        X = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        bias = Tensor(shape=bias_shape, dtype=dtype, name="input_2", is_input=True)
+        Y = ops.bmm_ccc_add()(X, W, bias)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", "bmm_ccc_{}".format(test_name))
+
+        X_pt = get_random_torch_tensor(A_shape, dtype)
+        W_pt = get_random_torch_tensor(B_shape, dtype)
+        bias_pt = get_random_torch_tensor(bias_shape, dtype)
+
+        XT = torch.transpose(X_pt, -2, -1)
+        WT = torch.transpose(W_pt, -2, -1)
+        Y_pt = torch.matmul(XT, WT).transpose(-2, -1) + bias_pt
+
+        y = get_torch_empty_tensor([B, N, M], dtype)
+        module.run_with_tensors(
+            {"input_0": X_pt, "input_1": W_pt, "input_2": bias_pt}, [y]
+        )
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
+
+    def test_ccc(self):
+        self._test_ccc(
+            [1, 8, 16], [2, 32, 8], bias_shape=[16], test_name="broadcastable_bias1d"
+        )
+        self._test_ccc(
             [1, 8, 16],
             [2, 32, 8],
-            bias_shape=[1, 16, 32],
-            test_name="broadcastable_bias3d_float",
-            dtype="float",
+            bias_shape=[1, 16],
+            test_name="broadcastable_bias1d_2",
+        )
+        self._test_ccc(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[32, 16],
+            test_name="broadcastable_bias2d",
+        )
+        self._test_ccc(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name="broadcastable_bias3d",
         )
 
-    def test_bmm_add_broadcast_bf16(self):
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_add_broadcast_dtype(self, dtype):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
             bias_shape=[16, 32],
-            test_name="broadcastable_bias2d_bfloat16",
-            dtype="bfloat16",
+            test_name=f"broadcastable_bias2d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rcr(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
         )
         self._test_rrr(
             [1, 16, 8],
             [2, 8, 32],
             bias_shape=[1, 32],
-            test_name="broadcastable_bias1d_2_bfloat16",
-            dtype="bfloat16",
+            test_name=f"broadcastable_bias1d_2_{dtype}",
+            dtype=dtype,
         )
         self._test_ccr(
             [1, 8, 16],
             [2, 32, 8],
             bias_shape=[1, 16, 32],
-            test_name="broadcastable_bias3d_bfloat16",
-            dtype="bfloat16",
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
         )
 
+        self._test_crc(
+            [1, 8, 16],
+            [2, 8, 32],
+            bias_shape=[32, 16],
+            test_name=f"broadcastable_bias2d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rcc(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rrc(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 16],
+            test_name=f"broadcastable_bias1d_2_{dtype}",
+            dtype=dtype,
+        )
+        self._test_ccc(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
 
-filter_test_cases_by_test_env(BMMAddTestCase)
-filter_test_cases_by_test_env(BMMBroadcastTestCase)
 
 if __name__ == "__main__":
     unittest.main()

From 2400fb3bbeeea10c2eed46834eab918b62bd8a9a Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 03:12:42 -0700
Subject: [PATCH 273/638] Remove redundant DeviceToDeviceCopies call (#429)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/429

As `model->DeviceToDeviceCopies()` is called from the `ModelBase::Run`, in this diff we remove the redundant call from the end of the generated `Model::RunImpl()` function body in the `MODEL_TEMPLATE`. Redundant call has caused an overhead of unnecessary double copying of the same data in cases when the generated body of `DeviceToDeviceCopies` wasn't empty.

Reviewed By: chenyang78

Differential Revision: D44074465

fbshipit-source-id: 58b6a37062566e34a0787ef8438e5c2bd306c43b
---
 python/aitemplate/backend/main_templates.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 366bc411d..27f580f0d 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -99,7 +99,6 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
   {{ func }}
       DeviceCheckLastError(__FILE__, __LINE__);
   {% endfor %}
-      DeviceToDeviceCopies(stream);
     }
 
     void ProfileImpl(StreamType stream, size_t iters, const std::string& filename) {

From 89773223aceaa1b27c08781c7fe940dcd5a3e3b4 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 15 Mar 2023 06:54:19 -0700
Subject: [PATCH 274/638] Fix json serialization & visualization of multiple
 operators with the same name (#427)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/427

This change handles the situation when multiple Operator instances have the same name, because the same operator is used several times for graph operations. Say, if operator named `foo_123` was used twice, then two nodes `foo_123 0` and `foo_123 1` will be generated for the json serialization and the visualization.

Json-serialized Operator instances gain an additional field called `_original_op_name`.

Also, this change finally allows to properly traverse the json-serialized graph using various graph-based algorithms.

Reviewed By: chenyang78

Differential Revision: D44066254

fbshipit-source-id: 624324ecde1d574f59a35cd54e61a0eb27d9b8c8
---
 python/aitemplate/utils/graph_utils.py        |  9 ++-
 python/aitemplate/utils/json_utils.py         | 78 ++++++++++++++++++-
 python/aitemplate/utils/visualization/plot.py | 24 +++++-
 3 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index cdda48714..549a6d821 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -50,10 +50,8 @@ def sorted_graph_debug_str(tensors) -> str:
 
 
 def sorted_graph_debug_json(tensors) -> str:
-    import json
-
     from aitemplate.compiler.base import Tensor
-    from aitemplate.utils.json_utils import GraphJsonEncoder
+    from aitemplate.utils.json_utils import gen_unique_op_names, GraphJsonEncoder
 
     if isinstance(tensors, Tensor):
         tensors = [tensors]
@@ -62,7 +60,10 @@ def sorted_graph_debug_json(tensors) -> str:
     json_dict["Tensors"] = tensors
     json_dict["Operators"] = get_sorted_ops(tensors)
 
-    return json.dumps(json_dict, cls=GraphJsonEncoder)
+    op_names = gen_unique_op_names(tensors)
+    encoder = GraphJsonEncoder(op_names)
+
+    return encoder.encode(json_dict)
 
 
 def sorted_graph_pseudo_code(tensors, with_shape=True) -> str:
diff --git a/python/aitemplate/utils/json_utils.py b/python/aitemplate/utils/json_utils.py
index 0b58072fe..8b28e42f9 100644
--- a/python/aitemplate/utils/json_utils.py
+++ b/python/aitemplate/utils/json_utils.py
@@ -14,6 +14,8 @@
 
 import json
 
+from typing import Dict, List
+
 from aitemplate.compiler.base import (
     _HostConstantTensorData,
     _NumpyConstantTensorData,
@@ -27,7 +29,66 @@
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 
+def gen_unique_op_names(sorted_graph: List[Tensor]) -> Dict[Operator, str]:
+    # List is used here, not Set, in order to maintain the order of operators,
+    # depending on memory locations, which may vary from run to run.
+    # Additionally, I don't expect to have too usages for a single op.
+    tmp: Dict[str, List[Operator]] = {}
+    for tensor in sorted_graph:
+        for src_op in tensor.src_ops():
+            op_name = src_op._attrs["name"]
+            if op_name is None:
+                continue
+
+            if op_name not in tmp:
+                tmp[op_name] = []
+            sub_dict = tmp[op_name]
+
+            if src_op not in sub_dict:
+                sub_dict.append(src_op)
+
+        for dst_op in tensor.dst_ops():
+            op_name = dst_op._attrs["name"]
+            if op_name is None:
+                continue
+
+            if op_name not in tmp:
+                tmp[op_name] = []
+            sub_dict = tmp[op_name]
+
+            if dst_op not in sub_dict:
+                sub_dict.append(dst_op)
+
+    # assemble the result
+    op_names: Dict[Operator, str] = {}
+
+    for op_name, ops in tmp.items():
+        if len(ops) == 1:
+            # the provided operator is unique, do not add one to the dict
+            continue
+
+        # add several unique names
+        for idx, op in enumerate(ops):
+            op_names[op] = f"{op_name} {idx}"
+
+    # done
+    return op_names
+
+
 class GraphJsonEncoder(json.JSONEncoder):
+    def __init__(self, op_names: Dict[Operator, str], *args, **kwargs):
+        super(GraphJsonEncoder, self).__init__(*args, **kwargs)
+
+        # This is a Dict that provides custom names for operators.
+        # It is possible that two instances of the same operator,
+        # say, 'fused_elementwise_123' is used twice in the graph,
+        # but with different inputs and/or outputs.
+        # As a result, there will be two instances of Operator object,
+        # holding the same name, which leads to invalid graph
+        # visualization / serialization.
+        # So, this diff allows to overcome this problem.
+        self.op_names: Dict[Operator, str] = op_names
+
     def default(self, obj):
         if isinstance(obj, FuncEnum):
             return obj.name
@@ -54,7 +115,13 @@ def _jsonize_tensor(self, tensor: Tensor):
         output = {}
         for key in tensor._attrs.keys():
             if key in ("src_ops", "dst_ops") and tensor._attrs[key] is not None:
-                output[key] = [x._attrs["name"] for x in tensor._attrs[key]]
+                op_names = []
+                for op in tensor._attrs[key]:
+                    # check whether a name for an op is provided
+                    op_name = self.op_names.get(op, op._attrs["name"])
+                    op_names.append(op_name)
+
+                output[key] = op_names
             else:
                 output[key] = tensor._attrs[key]
         return output
@@ -67,6 +134,15 @@ def _jsonize_operator(self, op: Operator):
                 and op._attrs[key] is not None
             ):
                 output[key] = [x._attrs["name"] for x in op._attrs[key]]
+            elif key == "name":
+                # check whether a name for an op is provided.
+
+                # save the original name
+                op_name = op._attrs[key]
+                output["_original_op_name"] = op_name
+                # save the key
+                op_name = self.op_names.get(op, op._attrs[key])
+                output[key] = op_name
             else:
                 output[key] = op._attrs[key]
         return output
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 91550fa1e..50076e44f 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -29,7 +29,6 @@
     TABLE_TEMPLATE,
 )
 
-
 COLOR_SCHEME = {
     "default_tensor": "lightskyblue1",
     "view": "plum1",
@@ -118,6 +117,19 @@ def plot_graph(tensors, file_path: str) -> None:
     sorted_graph = compiler.transform.toposort(tensors)
     compiler.transform.name_graph(sorted_graph)
 
+    # Before doing the further processing, it is needed
+    # to find whether there is an Operator instance with the same
+    # name like 'fused_elementwise_123' that is used
+    # several times, but with different input and/or outputs.
+    # In such a case, every Operator instance should get its unique
+    # name.
+    #
+    # The following dict will be used to store such unique names,
+    # such as 'fused_elementwise_123 0' and 'fused_elementwise_123 1'.
+    from aitemplate.utils.json_utils import gen_unique_op_names
+
+    op_names = gen_unique_op_names(sorted_graph)
+
     op_set = {}
     tensor_set = {}
     modal_set = []
@@ -155,6 +167,11 @@ def plot_graph(tensors, file_path: str) -> None:
         for src_op in tensor.src_ops():
             op_node = None
             op_name = src_op._attrs["name"]
+
+            # replace op_name with an unique name, if provided
+            if op_name is not None:
+                op_name = op_names.get(src_op, op_name)
+
             if src_op in op_set:
                 op_node = op_set[src_op]
             else:
@@ -176,6 +193,11 @@ def plot_graph(tensors, file_path: str) -> None:
         for dst_op in tensor.dst_ops():
             op_node = None
             op_name = dst_op._attrs["name"]
+
+            # replace op_name with an unique name, if provided
+            if op_name is not None:
+                op_name = op_names.get(dst_op, op_name)
+
             if dst_op in op_set:
                 op_node = op_set[dst_op]
             else:

From 34a0516fc6a7e1f8f11532ca05d7baa35ad9cc10 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Wed, 15 Mar 2023 10:23:37 -0700
Subject: [PATCH 275/638] Add more test cases (#434)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/434

ATT

Reviewed By: chenyang78

Differential Revision: D44080255

fbshipit-source-id: cce38174b010c62b94b64e750753b91871e480a0
---
 .../compiler/test_slice_permute021_fusion.py  | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/unittest/compiler/test_slice_permute021_fusion.py b/tests/unittest/compiler/test_slice_permute021_fusion.py
index 6a97f5c5d..ea6d57c0c 100644
--- a/tests/unittest/compiler/test_slice_permute021_fusion.py
+++ b/tests/unittest/compiler/test_slice_permute021_fusion.py
@@ -100,6 +100,46 @@ def test_slice_permute021_fusion(self):
             test_name="slice_permute021",
             dtype="float16",
         )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=(2, 2, 8),
+            slice_start_indices=(0, 1, 0),
+            slice_end_indices=(2, 3, 8),
+            dims=(0, 2, 1),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=[2, 9, 4],
+            slice_start_indices=[0, 0, 1],
+            slice_end_indices=[None, None, 3],
+            dims=(0, 2, 1),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=[120, 1211, 1200],
+            slice_start_indices=[0, 0, 3],
+            slice_end_indices=[None, None, 1100],
+            dims=(0, 2, 1),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
+        self._test_slice_permute021_fusion(
+            N=2,
+            K=2,
+            slice_input_shape=[123, 1211, 1200],
+            slice_start_indices=[0, 5, 0],
+            slice_end_indices=[None, 1200, None],
+            dims=(0, 2, 1),
+            test_name="slice_permute021",
+            dtype="float16",
+        )
         self._test_slice_permute021_fusion(
             N=2,
             K=2,

From 8df68fff84ba3eb4fa4cf8a2dd34ac61764fab45 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 10:41:25 -0700
Subject: [PATCH 276/638] Add batched_dense_vec_jagged_2d_mul_ref to jagged
 utils (#433)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/433

This diff adds a new reference function to the `testing.jagged_utils` module: `batched_dense_vec_jagged_2d_mul_ref`. The new reference function will be used in the unit tests of the upcoming AIT equivalent for the [`fbgemm.batched_dense_vec_jagged_2d_mul`](https://pytorch.org/FBGEMM/python-api/jagged_tensor_ops.html#torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul) operator.

Reviewed By: brad-mengchi

Differential Revision: D44091823

fbshipit-source-id: d662f9b3b88b8a63bbf240677622648677f38f5d
---
 python/aitemplate/testing/jagged_utils.py | 61 +++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/python/aitemplate/testing/jagged_utils.py b/python/aitemplate/testing/jagged_utils.py
index e3bac12ed..dd1a35ed4 100644
--- a/python/aitemplate/testing/jagged_utils.py
+++ b/python/aitemplate/testing/jagged_utils.py
@@ -312,3 +312,64 @@ def generate_offsets(
 
     torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
     return torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+
+
+def batched_dense_vec_jagged_2d_mul_ref(
+    vectors: torch.Tensor,  # [B, H, N]
+    matrices: torch.Tensor,  # [sum_B(N_B), H, D]
+    offsets: torch.Tensor,  # [B + 1]
+):
+    """
+    Reference function for fbgemm batched_dense_vec_jagged_2d_mul.
+    https://pytorch.org/FBGEMM/python-api/jagged_tensor_ops.html#torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul
+
+    Parameters
+    ----------
+    vecrors: torch.Tensor
+        Batch of vectors of the shape [B, H, N]. N is the maximum
+        sequence length in the jagged Tensor `matrices`. Each vector
+        in the batch is N-sized. The effective batch size is B * H.
+    matrices: torch.Tensor
+        Batch of jagged matrices (in a jagged Tensor) of the shape
+        [sum_B(N_B), H, D]. The first dimension encodes the batch
+        B of sequneces of variable length: from 0 to N. The matrices
+        have variable number of rows (determined by the variable
+        sequence lengths) and fixed number of columns: D. H is a
+        factor of the effective batch size, just pulled to the
+        right of the sum_B(N_B) dimension.
+    offsets: torch.Tensor
+        Rank-1 offsets Tensor describing the single jagged dimension
+        (from 0 to N) in the jagged `matrices`.
+
+    Returns
+    -------
+    torch.Tensor
+        Batch of vectors resulting from the batched vector x jagged
+        matrix multiplication. Shape: [B, H, D] (as N in the `vectors`
+        is contracted with the variable sequence length encoded in the
+        sum_B(N_B) dimension of the `matrices`).
+    """
+    assert vectors.dim() == 3
+    B, H, N = vectors.size()
+
+    assert matrices.dim() == 3
+    assert matrices.size(1) == H
+    D = matrices.size(2)
+
+    assert offsets.dim() == 1
+    assert offsets.size(0) == B + 1
+
+    # pad the jagged matrices with zeros
+    padded_matrices = jagged_to_dense(
+        jagged=matrices,
+        offsets_list=[offsets],
+        dense_shape=[B, N, H, D],
+        padding_value=0.0,
+    )  # [B, N, H, D]
+
+    return torch.matmul(
+        vectors.unsqueeze(dim=2),  # [B, H, 1, N]
+        padded_matrices.permute([0, 2, 1, 3]),  # [B, H, N, D]
+    ).squeeze(
+        dim=2
+    )  # [B, H, D]

From e53a60e4c43b673e5691ecb9162fda47706e2685 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 11:06:02 -0700
Subject: [PATCH 277/638] Add dense_to_jagged op (#386)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/386

This diff adds a new op, `dense_to_jagged`, to convert the input dense Tensor to the output jagged Tensor, given the offsets of the resulting jagged Tensor.

The op also takes the `total_length` dynamic (`IntVar`) dimension of the resulting jagged Tensor, as it is not inferable from the dense input Tensor's shape (only the `batch_dim` and jagged dims are). If the `total_length` is not initialized from any model input Tensor's shape, it is set within the `dense_to_jagged` back-end from the very last offset value in the very last offsets Tensor in the `offsets_list`.

Due to the current design aspect: the Tensors in the graph which are not outputs of any op and not model inputs are constant-folded, it doesn't seem possible to create the `source` of the resulting jagged Tensor "out of thin air" and pass it through the `make_jagged` op's back-end (validating the offsets content) *before* the `dense_to_jagged` op's back-end. As a workaround, we declare the `source` Tensor as the output of `dense_to_jagged`, then pass it (together with the `offsets_list`) through the `make_jagged`, and then return from the `dense_to_jagged.__call__` (see the details in the function body). Although this conforms to the above-mentioned aspect (i.e., `source` is not constant-folded, as it is, formally, the output of the `dense_to_jagged` op), we have to take a leap of faith in operating on not-yet-validated offsets Tensor content in the `dense_to_jagged` back-end. On the upside, the offsets are validated in the subsequent invocation of the `make_jagged`, before the resulting jagged Tensor can be processed further.

Reviewed By: ipiszy

Differential Revision: D43944424

fbshipit-source-id: 41960503d7f0d9203f5886835b8f5c82727a469e
---
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../backend/cuda/tensor/dense_to_jagged.py    | 504 ++++++++++++++++++
 python/aitemplate/compiler/compiler.py        |  45 +-
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/dense_to_jagged.py    | 135 +++++
 tests/unittest/ops/test_dense_to_jagged.py    | 346 ++++++++++++
 6 files changed, 1032 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/backend/cuda/tensor/dense_to_jagged.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/dense_to_jagged.py
 create mode 100644 tests/unittest/ops/test_dense_to_jagged.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index ea7cd5d26..ccb8698a0 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -20,6 +20,7 @@
     batch_gather,
     concatenate,
     concatenate_tanh,
+    dense_to_jagged,
     dynamic_slice,
     expand,
     gather,
@@ -41,6 +42,7 @@
     "batch_gather",
     "concatenate",
     "concatenate_tanh",
+    "dense_to_jagged",
     "dynamic_slice",
     "expand",
     "gather",
diff --git a/python/aitemplate/backend/cuda/tensor/dense_to_jagged.py b/python/aitemplate/backend/cuda/tensor/dense_to_jagged.py
new file mode 100644
index 000000000..6e601c0a4
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/dense_to_jagged.py
@@ -0,0 +1,504 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+The back-end bindings of the dense_to_jagged op.
+"""
+from typing import Any, Dict, List, Optional
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm, IntVar, JaggedIntVar, Tensor
+from aitemplate.utils import shape_utils
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define FUSED_ELE_THREAD_SIZE 256
+
+const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
+    """
+)
+
+KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE = jinja2.Template(
+    """
+  // first compute the dense_idx from the blockIdx and threadIdx
+  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
+  if (dense_idx_elem >= n_elements) {
+    return;
+  }
+
+  // then compute the jagged_idx from the dense_idx_elem
+  {{index_type}} jagged_idx;
+  {
+    // dense_coord is along consecutive dense dimensions
+    // jagged_coord is along the total_length of the jagged Tensor
+    {{index_type}} dense_coord = dense_idx_elem / ({{strides[0]}});
+    {{index_type}} running_idx = dense_idx_elem % ({{strides[0]}});
+    {{offsets_type}} jagged_coord = 0, prev_offset, next_offset;
+
+{% for i in range(num_offsets) %}
+    prev_offset = offsets.data[{{i}}][jagged_coord + dense_coord];
+    next_offset = offsets.data[{{i}}][jagged_coord + dense_coord + 1];
+    dense_coord = running_idx / ({{strides[i+1]}});
+    running_idx = running_idx % ({{strides[i+1]}});
+    if (dense_coord >= next_offset - prev_offset) {
+        // this element of the dense volume is
+        // out of bounds of the jagged Tensor
+        return;
+    }
+    jagged_coord = prev_offset;
+
+{% endfor %}
+    jagged_coord += dense_coord;
+    jagged_idx = (jagged_coord * ({{strides[num_offsets]}}) + running_idx) / N_ELEMENTS_PER_THREAD;
+  }
+    """
+)
+
+KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE = jinja2.Template(
+    """
+  // first compute the jagged_idx from the blockIdx and threadIdx
+  const {{index_type}} jagged_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
+  const {{index_type}} jagged_idx_elem = jagged_idx * N_ELEMENTS_PER_THREAD;
+  if (jagged_idx_elem >= n_elements) {
+    return;
+  }
+
+  // then compute the dense_idx from the jagged_idx_elem
+  {{index_type}} dense_idx = jagged_idx_elem % ({{strides[num_offsets]}});
+  {
+    {{offsets_type}} left, right, mid, tmp_value, offset_idx, offset_value;
+    {{index_type}} running_idx = jagged_idx_elem / ({{strides[num_offsets]}});
+
+    // binary search to determine the dense coord along the current jagged dimension
+    // the goal is to find the index of the maximum offset value in offsets.data[{{i}}]
+    // which is <= the running_idx. the (running_idx - offset_value) will then indicate
+    // the dense cooord along the current jagged dimension.
+{% for i in range(num_offsets - 1, -1, -1) %}
+    left = 0;
+    right = offsets.lengths[{{i}}] - 1;
+    while (left <= right) {
+        mid = (left + right) >> 1;
+        tmp_value = offsets.data[{{i}}][mid];
+        if (tmp_value <= running_idx) {
+            offset_idx = mid;
+            offset_value = tmp_value;
+            left = mid + 1;
+        } else {
+            right = mid - 1;
+        }
+    }
+    dense_idx += (running_idx - offset_value) * ({{strides[i+1]}});
+    running_idx = offset_idx;
+
+{% endfor %}
+    dense_idx = (dense_idx + running_idx * ({{strides[0]}})) / N_ELEMENTS_PER_THREAD;
+  }
+    """
+)
+
+KERNEL_TEMPLATE = jinja2.Template(
+    """
+__global__ void {{func_name}}(
+    {{read_t}}* y,
+    const {{read_t}}* x,
+    {{dynamic_dims}}
+    {{offsets}}
+    {{index_type}} n_elements
+) {
+  {{compute_idx}}
+
+  y[jagged_idx] = x[dense_idx];
+}
+    """
+)
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{head}}
+
+#include <iostream>
+#include "jagged.h"
+
+namespace {
+
+{{constant}}
+
+{{kernel_function}}
+
+}  // namespace
+
+void {{func_name}}(
+    void* y,
+    const void* x,
+{% for idx in range(num_offsets) %}
+    {{index_type}} offsets_length_{{idx}},
+    const void* offsets_data_{{idx}},
+{% endfor %}
+    {{dynamic_dims_decl}}
+    {{prefix}}Stream_t stream
+) {
+    {{index_type}} n_elements = {{calculate_n}};
+    if (n_elements == 0) {
+      return;
+    }
+
+    // we define local offsets here, because the resulting jagged Tensor's offsets
+    // haven't been initialized by make_jagged yet, which is invoked after this op
+    {{offsets_struct_type}} local_offsets;
+{% for idx in range(num_offsets) %}
+    local_offsets.lengths[{{idx}}] = offsets_length_{{idx}};
+    local_offsets.data[{{idx}}] = reinterpret_cast<const {{offsets_type}}*>(offsets_data_{{idx}});
+{% endfor %}
+
+    int block_size = static_cast<int>(std::ceil(static_cast<double>(n_elements) / N_ELEMENTS_PER_THREAD / FUSED_ELE_THREAD_SIZE));
+    {{func_name}}<<<block_size, FUSED_ELE_THREAD_SIZE, 0, stream>>>(
+        reinterpret_cast<{{read_t}}*>(y),
+        reinterpret_cast<const {{read_t}}*>(x),
+        {{dynamic_dims_call}}
+        local_offsets,
+        n_elements
+    );
+}
+    """
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    void* y,
+    const void* x,
+{% for idx in range(num_offsets) %}
+    {{index_type}},
+    const void*,
+{% endfor %}
+    {{dynamic_dims}}
+    {{prefix}}Stream_t stream
+);
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{y}},
+{{indent}}    {{x}},
+{% for idx in range(num_offsets) %}
+{{indent}}    {{offsets_first_dim_names[idx]}},
+{{indent}}    {{offsets_data_names[idx]}},
+{% endfor %}
+{{indent}}    {{dynamic_dims}}
+{{indent}}    {{stream}}
+{{indent}});
+    """
+)
+
+
+def _get_strides(shape: List[IntVar]) -> List[str]:
+    """
+    Generate the stride expressions for each of the dimensions
+    of the shape. A stride expression here means the
+    product of all dimensions following the given dimension.
+    The order of the stride expressions in the returned list
+    is the same as of the dimensions of the shape.
+    """
+    strides = []
+    for dim in reversed(shape[1:]):
+        str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
+        if strides:
+            strides.append(f"{strides[-1]} * {str_dim}")
+        else:
+            strides.append(str_dim)
+    strides.reverse()
+    return strides
+
+
+def _get_dynamic_dims(x: Tensor, y: Tensor) -> List[IntVar]:
+    res = {}
+    for dim in list(x.shape()) + list(y.shape()):
+        if not isinstance(dim, IntImm):
+            res[dim._attrs["name"]] = dim
+
+    return list(res.values())
+
+
+def _gen_dynamic_dim_str(
+    index_type: str,
+    dynamic_dims: List[IntVar],
+    has_type: bool,
+) -> str:
+    type_str = index_type + " " if has_type else ""
+    res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims])
+    if res:
+        res += ", "
+
+    return res
+
+
+def _gen_offsets_str(
+    jagged_int_var: JaggedIntVar,
+    has_type: bool,
+    const_ref: bool,
+    name: Optional[str] = None,
+) -> str:
+    offsets_var_name = jagged_int_var.offsets_var_name()
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+
+    ref_prefix = "const " if const_ref else ""
+    ref_suffix = "&" if const_ref else ""
+    arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
+    arg_name = name if name is not None else offsets_var_name
+    offsets = f"{arg_type}{arg_name}, "
+
+    return offsets
+
+
+def _gen_int_var_product_str(
+    int_vars: List[IntVar],
+) -> str:
+    res = []
+    for int_var in int_vars:
+        if isinstance(int_var, IntImm):
+            res.append(str(int_var._attrs["values"][0]))
+        elif isinstance(int_var, IntVar):
+            res.append(int_var._attrs["name"])
+        else:
+            raise RuntimeError(
+                "A dim must be an IntVar! Current type: {}".format(type(int_var))
+            )
+
+    return " * ".join(res) if res else "1"
+
+
+def _detect_read_type(
+    inner_size: int,
+    dtype: str,
+) -> str:
+    if dtype in ("bfloat16", "half"):
+        if inner_size % 8 == 0:
+            return "uint4"
+        elif inner_size % 4 == 0:
+            return "uint2"
+        elif inner_size % 2 == 0:
+            return "uint"
+    elif dtype == "float":
+        if inner_size % 4 == 0:
+            return "uint4"
+        elif inner_size % 2 == 0:
+            return "uint2"
+
+    return dtype
+
+
+def _gen_compute_idx_str(
+    input_shape: List[IntVar],
+    output_shape: List[IntVar],
+    index_type: str,
+    jagged_int_var: JaggedIntVar,
+) -> str:
+    use_jagged_space_indexing = Target.current()._kwargs.get(
+        "use_jagged_space_indexing", False
+    )
+    compute_idx_template = (
+        KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE
+        if use_jagged_space_indexing
+        else KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE
+    )
+
+    return compute_idx_template.render(
+        index_type=index_type,
+        num_offsets=len(jagged_int_var.jagged_dims()),
+        strides=_get_strides(input_shape),
+        offsets_type=jagged_int_var.offsets_type(),
+    )
+
+
+def _gen_calculate_n(
+    input_shape: List[IntVar],
+    output_shape: List[IntVar],
+) -> str:
+    use_jagged_space_indexing = Target.current()._kwargs.get(
+        "use_jagged_space_indexing", False
+    )
+    # we use jagged output's volume in case of the jagged space indexing
+    # and dense input's volume in case of the dense space indexing
+    index_space = output_shape if use_jagged_space_indexing else input_shape
+
+    return _gen_int_var_product_str(index_space)
+
+
+def _gen_kernel_function(
+    func_attrs: Dict[str, Any],
+    index_type: str,
+    data_type: str,
+    read_type: str,
+) -> str:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = func_attrs["jagged_int_var"]
+    backend_spec = CUDASpec()
+
+    return KERNEL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        index_type=index_type,
+        compute_idx=_gen_compute_idx_str(
+            input_shape=x.shape(),
+            output_shape=y.shape(),
+            index_type=index_type,
+            jagged_int_var=jagged_int_var,
+        ),
+        read_t=read_type,
+        dynamic_dims=_gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=_get_dynamic_dims(x, y),
+            has_type=True,
+        ),
+        offsets=_gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=True,
+            # the offsets are passed
+            # by value to the kernel
+            const_ref=False,
+            name="offsets",
+        ),
+    )
+
+
+@registry.reg("cuda.dense_to_jagged.gen_function")
+def dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """Generates dense_to_jagged function definition."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = func_attrs["jagged_int_var"]
+    backend_spec = CUDASpec()
+
+    dtype = x.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+    read_inner_size = shape_utils.get_num_rightmost_static_elements(y.shape())
+    read_type = _detect_read_type(read_inner_size, data_type)
+
+    kernel_function = _gen_kernel_function(
+        func_attrs,
+        backend_spec.index_type,
+        data_type,
+        read_type,
+    )
+
+    constant = CONSTANT_TEMPLATE.render(
+        read_t=read_type,
+        data_t=data_type,
+    )
+
+    func_name = func_attrs["name"]
+    dynamic_dims = _get_dynamic_dims(x, y)
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+    total_length = jagged_int_var.total_length()
+
+    if total_length._attrs.get("isolated", False):
+        raise ValueError(
+            f"The {total_length._attrs['name']} (total_length) dimension "
+            f"of the jagged Tensor output of {func_name} must be present in "
+            "one of the input shapes, but it isn't."
+        )
+
+    return FUNC_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        head=backend_spec.header_src_template.render(),
+        offsets_struct_type=offsets_struct_type,
+        offsets_type=jagged_int_var.offsets_type(),
+        num_offsets=len(jagged_int_var.jagged_dims()),
+        constant=constant,
+        kernel_function=kernel_function,
+        func_name=func_name,
+        calculate_n=_gen_calculate_n(
+            input_shape=x.shape(),
+            output_shape=y.shape(),
+        ),
+        dynamic_dims_decl=_gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=dynamic_dims,
+            has_type=True,
+        ),
+        dynamic_dims_call=_gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=dynamic_dims,
+            has_type=False,
+        ),
+        read_t=read_type,
+    )
+
+
+@registry.reg("cuda.dense_to_jagged.func_decl")
+def dense_to_jagged_gen_function_decl(func_attrs) -> str:
+    """Generate dense_to_jagged function declaration."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = func_attrs["jagged_int_var"]
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    return FUNC_DECL_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        func_name=func_name,
+        num_offsets=len(jagged_int_var.jagged_dims()),
+        dynamic_dims=_gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=_get_dynamic_dims(x, y),
+            has_type=True,
+        ),
+    )
+
+
+@registry.reg("cuda.dense_to_jagged.func_call")
+def dense_to_jagged_gen_function_call(
+    func_attrs,
+    indent: str,
+) -> str:
+    """Generate dense_to_jagged function call."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = func_attrs["jagged_int_var"]
+    backend_spec = CUDASpec()
+
+    offsets_list = func_attrs["inputs"][1:]
+    offsets_first_dim_names = [
+        offsets._attrs["shape"][0]._attrs["name"] for offsets in offsets_list
+    ]
+    offsets_data_names = [offsets._attrs["name"] for offsets in offsets_list]
+
+    return FUNC_CALL_TEMPLATE.render(
+        stream=backend_spec.stream,
+        func_name=func_attrs["name"],
+        num_offsets=len(jagged_int_var.jagged_dims()),
+        offsets_first_dim_names=offsets_first_dim_names,
+        offsets_data_names=offsets_data_names,
+        y=y._attrs["name"],
+        x=x._attrs["name"],
+        dynamic_dims=_gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=_get_dynamic_dims(x, y),
+            has_type=False,
+        ),
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 3a87e0112..147f30428 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -22,7 +22,12 @@
 
 from aitemplate import backend, compiler
 
-from aitemplate.compiler.base import DynamicProfileStrategy, Tensor
+from aitemplate.compiler.base import (
+    DynamicProfileStrategy,
+    IntImm,
+    JaggedIntVar,
+    Tensor,
+)
 
 from aitemplate.compiler.model import (
     AIT_DEFAULT_NUM_RUNTIMES,
@@ -91,6 +96,43 @@ def _verify_outputs_still_in_graph(sorted_graph: List[Tensor], outputs: List[Ten
             )
 
 
+def _mark_isolated_int_vars(sorted_graph: List[Tensor]):
+    """
+    Mark the IntVars that are not present in any input's shape
+    with the _attrs["isolated"] = True flag. The purpose is to
+    be able to distinguish these dynamic dims in the codegen
+    of some of the functions which should set them instead of
+    relying on / validating the pre-set value. To this end,
+    this function must be invoked right before the back-end
+    code generation of the ops.
+
+    One example is the jagged_to_dense op that must set the
+    total_length dimension of the resulting jagged Tensor if
+    it hasn't been set from any of the model input's shape.
+    Another example is the make_jagged op that should set the
+    batch_dim within the JaggedIntVar of the resulting jagged
+    Tensor, unless it has been set already from the inputs.
+    """
+    int_vars = {}
+    int_var_names_in_input_shapes = set()
+    for tensor in sorted_graph:
+        for dim in tensor._attrs["shape"]:
+            if not isinstance(dim, IntImm):
+                name = dim._attrs["name"]
+                int_vars[name] = dim
+                if isinstance(dim, JaggedIntVar):
+                    batch_dim = dim.batch_dim()
+                    int_vars[batch_dim._attrs["name"]] = batch_dim
+                    total_length = dim.total_length()
+                    int_vars[total_length._attrs["name"]] = total_length
+                if tensor._attrs["is_input"]:
+                    int_var_names_in_input_shapes.add(name)
+
+    for name, dim in int_vars.items():
+        if name not in int_var_names_in_input_shapes:
+            dim._attrs["isolated"] = True
+
+
 _DEBUG_SETTINGS = AITDebugSettings()
 
 
@@ -225,6 +267,7 @@ def compile_model(
                 workspace,
             ) = compiler.transform.memory_planning(graph)
             _verify_outputs_still_in_graph(graph, output_tensors)
+            _mark_isolated_int_vars(graph)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "memory_planning")
 
             file_pairs = backend.codegen.gen_function_src(graph, workdir, test_name)
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index d826578ef..04b2835f1 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -21,6 +21,7 @@
 from aitemplate.compiler.ops.tensor.chunk import chunk
 from aitemplate.compiler.ops.tensor.concatenate import concatenate
 from aitemplate.compiler.ops.tensor.concatenate_tanh import concatenate_tanh
+from aitemplate.compiler.ops.tensor.dense_to_jagged import dense_to_jagged
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.ops.tensor.expand import expand
 from aitemplate.compiler.ops.tensor.gather import gather
diff --git a/python/aitemplate/compiler/ops/tensor/dense_to_jagged.py b/python/aitemplate/compiler/ops/tensor/dense_to_jagged.py
new file mode 100644
index 000000000..422183b37
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/dense_to_jagged.py
@@ -0,0 +1,135 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+The front-end definition of the dense_to_jagged op.
+"""
+from typing import List
+
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm, IntVar, JaggedDim, Operator, Tensor
+from aitemplate.compiler.ops import make_jagged
+
+
+class dense_to_jagged(Operator):
+    """
+    Returns a jagged Tensor "extracted" from the input dense Tensor,
+    given the offsets list. The resulting jagged Tensor contains the
+    subset of values of the input dense Tensor specified by the rank-1
+    offset Tensors in the offsets_list.
+
+    Args:
+        x (Tensor): input dense tensor.
+        offsets_list (List[Tensor]): the list of offsets of the resulting jagged Tensor.
+        total_length (IntVar): the total length dimension of the resulting jagged Tensor.
+    Returns:
+        y (Tensor): a jagged Tensor extracted from the input dense Tensor x.
+    """
+
+    def __init__(
+        self,
+        total_length: IntVar,
+    ):
+        if type(total_length) != IntVar:
+            raise TypeError(
+                f"total_length must be IntVar, but got {type(total_length).__name__}."
+            )
+
+        super().__init__()
+        self._attrs["op"] = "dense_to_jagged"
+        self._attrs["total_length"] = total_length
+
+    def _infer_shape(
+        self,
+        x: Tensor,
+        offsets_list: List[Tensor],
+    ) -> List[IntVar]:
+        inner_shape = x.shape()[1 + len(offsets_list) :]
+        return [self._attrs["total_length"]] + inner_shape
+
+    def _get_op_attributes(self):
+        return {
+            "total_length": self._attrs["total_length"],
+        }
+
+    def _args_for_pseudo_code(self):
+        return [f"total_length={self._attrs['total_length']}"]
+
+    def __call__(
+        self,
+        x: Tensor,
+        offsets_list: List[Tensor],
+    ) -> Tensor:
+        x_shape = x.shape()
+        if not offsets_list:
+            raise ValueError("At least one offsets Tensor must be specified.")
+        if len(x_shape) < len(offsets_list) + 2:
+            raise ValueError(
+                "The input dense Tensor x must have at least len(offsets_list) + 2 dimensions: "
+                "one batch dimension, as many sequence dimensions as len(offsets_list), and "
+                f"at least one inner dimension, but {len(offsets_list)=}, {x_shape=}."
+            )
+        if type(x_shape[0]) != IntVar:
+            raise TypeError(
+                f"x.shape()[0] must be IntVar, but got {type(x_shape[0]).__name__}."
+            )
+        for i, dim in enumerate(x_shape[1 : 1 + len(offsets_list)], start=1):
+            if not isinstance(dim, IntImm):
+                raise TypeError(
+                    "All sequence dimensions in the x.shape() (corresponding to the "
+                    "jagged dimensions of the output jagged Tensor) must be IntImm, "
+                    f"but got type(x_shape()[{i}]) == {type(dim).__name__}."
+                )
+
+        self._attrs["inputs"] = [x, *offsets_list]
+        self._set_depth()
+        output_shape = self._infer_shape(x, offsets_list)
+
+        # the source Tensor of the resulting jagged Tensor
+        source = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
+        self._attrs["outputs"] = [source]
+
+        # in the AIT graph, the output of the dense_to_jagged op is set to the
+        # source Tensor, which is still not a jagged Tensor. The source Tensor
+        # is passed through the make_jagged op to obtain the jagged Tensor returned
+        # from the __call__: this way, the chain of ops in the graph looks like:
+        #
+        #      x --> dense_to_jagged --> source --> make_jagged --> y
+        #                    \------ offsets_list -------/
+
+        # the resulting jagged Tensor
+        jagged_output = make_jagged(
+            batch_dim=x_shape[0],
+            jagged_dims=[
+                JaggedDim(min_value=0, max_value=dim.value())
+                for dim in x_shape[1 : 1 + len(offsets_list)]
+            ],
+        )(
+            source=source,
+            offsets_list=offsets_list,
+        )
+
+        # we keep the resulting jagged Tensor's JaggedIntVar around,
+        # as we'll need it for the back-end code generation of the
+        # dense_to_jagged op
+        self._attrs["jagged_int_var"] = jagged_output._attrs["shape"][0]
+
+        return jagged_output
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_dense_to_jagged.py b/tests/unittest/ops/test_dense_to_jagged.py
new file mode 100644
index 000000000..4609d013d
--- /dev/null
+++ b/tests/unittest/ops/test_dense_to_jagged.py
@@ -0,0 +1,346 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for the jagged_to_dense op.
+"""
+
+import json
+import random
+import tempfile
+import unittest
+from typing import List
+
+import aitemplate.testing.jagged_utils as jagged_utils_ref
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import JaggedDim
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm, IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+class DenseToJaggedTestCase(unittest.TestCase):
+    def _test_dense_to_jagged(
+        self,
+        jagged_max_shape: List[int],
+        offsets_list: List[List[int]],
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        use_jagged_space_indexing: bool = False,
+        test_suffix: str = "",
+    ):
+        batch_size = jagged_max_shape[0]
+        batch_dim = IntVar(values=[1, batch_size * 2], name="batch_size")
+        sequence_shape = jagged_max_shape[1 : 1 + len(offsets_list)]
+        sequence_dims = [IntImm(value=dim) for dim in sequence_shape]
+        inner_shape = jagged_max_shape[1 + len(offsets_list) :]
+        inner_dims = [IntImm(value=dim) for dim in inner_shape]
+
+        total_length = offsets_list[-1][-1]
+        total_length_dim = IntVar(values=[1, total_length * 2], name="total_length")
+        jagged_dims = [JaggedDim(min_value=0, max_value=N) for N in sequence_shape]
+
+        offsets_dims = [
+            IntVar(values=[2, len(offsets) * 2]) for offsets in offsets_list
+        ]
+
+        DENSE = Tensor(
+            shape=[
+                batch_dim,
+                *sequence_dims,
+                *inner_dims,
+            ],
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name=f"offsets{i}",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+            for i, offsets_dim in enumerate(offsets_dims)
+        ]
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                *inner_dims,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        JAGGED = ops.dense_to_jagged(total_length=total_length_dim)(
+            x=DENSE,
+            offsets_list=OFFSETS_LIST,
+        )
+        ANOTHER = ops.make_jagged(batch_dim=batch_dim, jagged_dims=jagged_dims)(
+            source=SOURCE,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, ANOTHER)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        assert not DENSE.is_jagged()
+        assert JAGGED.is_jagged()
+        assert ANOTHER.is_jagged()
+        assert RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
+            "./tmp",
+            f"test_dense_to_jagged_{test_suffix}",
+        )
+
+        torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = {
+            f"offsets{i}": torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+            for i, offsets in enumerate(offsets_list)
+        }
+        dense_pt = get_random_torch_tensor(jagged_max_shape, dtype)
+        result_pt = jagged_utils_ref.dense_to_jagged(
+            dense=dense_pt,
+            offsets_list=list(offsets_pt.values()),
+        )
+
+        source = torch.zeros_like(result_pt)
+        result = torch.empty_like(result_pt)
+
+        inputs = {"dense": dense_pt, "source": source, **offsets_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt)
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [4, 3, 8], "float16"),
+            param(2, "int32", [4, 3, 4], "float16"),
+            param(3, "int32", [4, 3, 2], "float16"),
+            param(4, "int32", [4, 3, 1], "float16"),
+            param(5, "int64", [4, 3, 4], "float32"),
+            param(6, "int64", [4, 3, 2], "float32"),
+            param(7, "int64", [4, 3, 1], "float32"),
+        ]
+    )
+    def test_dense_to_jagged_single_offsets(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dtype,
+    ):
+        for use_jagged_space_indexing in [False, True]:
+            self._test_dense_to_jagged(
+                jagged_max_shape=jagged_max_shape,
+                offsets_list=[[0, 1, 4, 6, 7]],
+                dtype=dtype,
+                offsets_dtype=offsets_dtype,
+                use_jagged_space_indexing=use_jagged_space_indexing,
+                test_suffix=f"single_offsets_{dtype}_{i}",
+            )
+
+    @parameterized.expand(
+        [
+            param(1, "int32", [3, 4, 5, 150, 3, 8], "float16"),
+            param(2, "int32", [3, 4, 5, 150, 1, 4], "float16"),
+            param(3, "int32", [3, 4, 5, 150, 3, 2], "float16"),
+            param(4, "int32", [3, 4, 5, 150, 1, 1], "float16"),
+            param(5, "int64", [3, 4, 5, 150, 1, 4], "float32"),
+            param(6, "int64", [3, 4, 5, 150, 3, 2], "float32"),
+            param(7, "int64", [3, 4, 5, 150, 3, 1], "float32"),
+        ]
+    )
+    def test_dense_to_jagged_multiple_offsets(
+        self,
+        i,
+        offsets_dtype,
+        jagged_max_shape,
+        dtype,
+    ):
+        for use_jagged_space_indexing in [False, True]:
+            self._test_dense_to_jagged(
+                jagged_max_shape=jagged_max_shape,
+                offsets_list=[
+                    [0, 1, 3, 5],
+                    [0, 2, 4, 7, 9, 10],
+                    [0, 6, 8, 19, 23, 45, 67, 98, 123, 256, 321],
+                ],
+                dtype=dtype,
+                offsets_dtype=offsets_dtype,
+                use_jagged_space_indexing=use_jagged_space_indexing,
+                test_suffix=f"multiple_offsets_{dtype}_{i}",
+            )
+
+    def _benchmark_dense_to_jagged(
+        self,
+        B: int,
+        N: int,
+        D: int,
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        use_jagged_space_indexing: bool = False,
+        test_suffix: str = "",
+        num_iters: int = 1000,
+    ):
+        batch_dim = IntVar(values=[1, B], name="batch_size")
+        sequence_dim = IntImm(value=N, name="sequence_dim")
+        total_length_dim = IntVar(values=[1, B * N], name="total_length")
+        embedding_dim = IntImm(value=D, name="embedding_dim")
+        offsets_dim = IntVar(values=[2, B + 1], name="offsets_dim")
+
+        DENSE = Tensor(
+            shape=[
+                batch_dim,
+                sequence_dim,
+                embedding_dim,
+            ],
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+
+        JAGGED = ops.dense_to_jagged(total_length=total_length_dim)(
+            x=DENSE,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                embedding_dim,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        ANOTHER = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[JaggedDim(min_value=0, max_value=N)],
+        )(
+            source=SOURCE,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, ANOTHER)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        model = compile_model(
+            [RESULT],
+            detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
+            "./tmp",
+            f"benchmark_dense_to_jagged_{test_suffix}",
+        )
+
+        random.seed(0)
+        load_factors = [i / 20 for i in range(1, 21)]
+        offset_tensors = [
+            jagged_utils_ref.generate_offsets(
+                batch_size=B,
+                max_seq_len=N,
+                load_factor=load_factor,
+                offsets_dtype=offsets_dtype,
+            )
+            for load_factor in load_factors
+        ]
+
+        results = []
+        for load_factor, offsets_pt in zip(load_factors, offset_tensors):
+            total_length = offsets_pt[-1].item()
+            dense_pt = get_random_torch_tensor([B, N, D], dtype)
+            inputs = {"dense": dense_pt, "offsets": offsets_pt}
+            outputs = [get_torch_empty_tensor([total_length, D], dtype)]
+            source_pt = get_random_torch_tensor([total_length, D], dtype)
+            inputs["source"] = source_pt
+
+            with tempfile.NamedTemporaryFile("r") as f:
+                model.profile_with_tensors(
+                    inputs=inputs,
+                    outputs=outputs,
+                    num_iters=num_iters,
+                    filename=f.name,
+                )
+                profiling_data = json.loads(f.read())
+                dense_to_jagged_records = [
+                    profiling_data[func_name]
+                    for func_name in profiling_data
+                    if func_name.startswith("dense_to_jagged")
+                ]
+                assert len(dense_to_jagged_records) == 1
+                runtime_ms = dense_to_jagged_records[0]["ms_per_iter"]
+
+            dense_item = total_length * D  # total items to read: the jagged volume
+            jagged_item = total_length * D  # total items to read: the jagged volume
+            size = 2 if dtype == "float16" else 4  # size of individual data value
+            bandwidth = (
+                (jagged_item + dense_item) * size / (runtime_ms * 1e-3 * 1e9)
+            )  # GB/s
+            results.append([load_factor, runtime_ms, bandwidth])
+
+        print()
+        print(f"{B=}, {N=}, {D=}, {dtype=}:")
+        print()
+
+        for load_factor, runtime_ms, bandwidth in results:
+            print(
+                f"load factor: {int(load_factor * 100)}%, "
+                f"runtime: {round(runtime_ms, 6)} ms, "
+                f"bandwidth: {round(bandwidth, 3)} GB/s"
+            )
+
+    def _test_benchmark_dense_to_jagged(self):
+        self._benchmark_dense_to_jagged(
+            B=1024,
+            N=260,
+            D=256,
+            dtype="float16",
+            offsets_dtype="int32",
+            use_jagged_space_indexing=False,
+            isolated_total_length=True,
+            test_suffix="esuhm",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From f9b77009cc792ffb3415c36418bd76d9b6f3d6f3 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005232357 <generatedunixname89002005232357@fb.com>
Date: Wed, 15 Mar 2023 11:54:22 -0700
Subject: [PATCH 278/638] Revert D44036462: Multisect successfully blamed
 D44036462 for test or build failures (#435)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/435

Reviewed By: khabinov

Differential Revision: D44085816

fbshipit-source-id: 6ccd94b1c48f29441956936bd3062567980565b6
---
 .../05_stable_diffusion/scripts/compile.py    |  10 +-
 examples/05_stable_diffusion/scripts/demo.py  |  11 +-
 .../scripts/demo_img2img.py                   |  10 +-
 .../scripts/download_pipeline.py              |  15 +--
 examples/05_stable_diffusion/src/benchmark.py |  22 +---
 .../src/compile_lib/compile_clip.py           |   7 +-
 .../src/compile_lib/compile_unet.py           |   7 +-
 .../src/compile_lib/compile_vae.py            |   8 +-
 .../src/compile_lib/util.py                   | 111 ------------------
 .../src/pipeline_stable_diffusion_ait.py      |   6 +-
 .../pipeline_stable_diffusion_img2img_ait.py  |   4 +-
 11 files changed, 37 insertions(+), 174 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index d946744d0..0018dafda 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -26,23 +26,25 @@
 from src.compile_lib.compile_clip import compile_clip
 from src.compile_lib.compile_unet import compile_unet
 from src.compile_lib.compile_vae import compile_vae
-from src.compile_lib.util import get_work_dir_location_diffusers
 
 
 @click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option("--batch-size", default=1, help="batch size")
 @click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
 @click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
 def compile_diffusers(
-    width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
+    local_dir, width, height, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True
 ):
     logging.getLogger().setLevel(logging.INFO)
     torch.manual_seed(4896)
 
-    local_dir = get_work_dir_location_diffusers()
-
     if detect_target().name() == "rocm":
         convert_conv_to_gemm = False
 
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index a5a92ad0c..d4f5dbb99 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -23,21 +23,22 @@
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
 
-from src.compile_lib.util import get_work_dir_location_diffusers
 from src.pipeline_stable_diffusion_ait import StableDiffusionAITPipeline
 
 
 @click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(width, height, prompt, benchmark):
-
-    local_dir = get_work_dir_location_diffusers()
-
+def run(local_dir, width, height, prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         local_dir,
         scheduler=EulerDiscreteScheduler.from_pretrained(
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index e640beb3e..e4d96d865 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -25,11 +25,15 @@
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
 
-from src.compile_lib.util import get_work_dir_location_diffusers
 from src.pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline
 
 
 @click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="the local diffusers pipeline directory",
+)
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
 @click.option(
@@ -38,9 +42,7 @@
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(width, height, prompt, benchmark):
-
-    local_dir = get_work_dir_location_diffusers()
+def run(local_dir, width, height, prompt, benchmark):
 
     # load the pipeline
     device = "cuda"
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 6120fa8df..1128769da 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -14,20 +14,17 @@
 #
 import click
 import torch
-from aitemplate.utils.import_path import import_parent
 from diffusers import StableDiffusionPipeline
 
-if __name__ == "__main__":
-    import_parent(filepath=__file__, level=1)
-from src.compile_lib.util import get_work_dir_location_diffusers
-
 
 @click.command()
 @click.option("--token", default="", help="access token")
-def download_pipeline_files(token) -> None:
-
-    save_directory = get_work_dir_location_diffusers()
-
+@click.option(
+    "--save_directory",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="pipeline files local directory",
+)
+def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
         "stabilityai/stable-diffusion-2-1-base",
         revision="fp16",
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index a08a00c12..5cac6a465 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -22,18 +22,11 @@
 from aitemplate.compiler import Model
 from aitemplate.testing import detect_target
 from aitemplate.testing.benchmark_pt import benchmark_torch_function
-from compile_lib.util import (
-    get_file_location_autoencoder,
-    get_file_location_clip,
-    get_file_location_unet,
-    get_work_dir_location_diffusers,
-)
 from diffusers import StableDiffusionPipeline
 
 from torch import autocast
 from transformers import CLIPTokenizer
 
-
 USE_CUDA = detect_target().name() == "cuda"
 
 
@@ -63,9 +56,7 @@ def benchmark_unet(
     verify=False,
 ):
 
-    file_name = get_file_location_unet()
-
-    exe_module = Model(file_name)
+    exe_module = Model("./tmp/UNet2DConditionModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for UNet2DConditionModel.")
         exit(-1)
@@ -140,10 +131,7 @@ def benchmark_clip(
 ):
     mask_seq = 0
 
-    file_name = get_file_location_clip()
-
-    exe_module = Model(file_name)
-
+    exe_module = Model("./tmp/CLIPTextModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for CLIPTextModel.")
         exit(-1)
@@ -217,9 +205,7 @@ def benchmark_vae(
 
     latent_channels = 4
 
-    file_name = get_file_location_autoencoder()
-
-    exe_module = Model(file_name)
+    exe_module = Model("./tmp/AutoencoderKL/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for AutoencoderKL.")
         exit(-1)
@@ -296,8 +282,6 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     np.random.seed(0)
     torch.manual_seed(4896)
 
-    local_dir = get_work_dir_location_diffusers()
-
     pipe = StableDiffusionPipeline.from_pretrained(
         local_dir,
         revision="fp16",
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index 173866766..cfda48607 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -19,7 +19,7 @@
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
-from .util import get_work_dir_location, mark_output
+from .util import mark_output
 
 
 def map_clip_params(pt_mod, batch_size, seqlen, depth):
@@ -117,7 +117,4 @@ def compile_clip(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-
-    workdir = get_work_dir_location()
-
-    compile_model(Y, target, workdir, "CLIPTextModel", constants=params_ait)
+    compile_model(Y, target, "./tmp", "CLIPTextModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index 96753a6f3..7cc2b41e4 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -21,7 +21,7 @@
 from ..modeling.unet_2d_condition import (
     UNet2DConditionModel as ait_UNet2DConditionModel,
 )
-from .util import get_work_dir_location, mark_output
+from .util import mark_output
 
 
 def map_unet_params(pt_mod, dim):
@@ -85,7 +85,4 @@ def compile_unet(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-
-    workdir = get_work_dir_location()
-
-    compile_model(Y, target, workdir, "UNet2DConditionModel", constants=params_ait)
+    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index 2c28a431e..d01f320dc 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -15,13 +15,14 @@
 from collections import OrderedDict
 
 import numpy as np
+
 import torch
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
-from .util import get_work_dir_location, mark_output
+from .util import mark_output
 
 
 def map_vae_params(ait_module, pt_module, batch_size, seq_len):
@@ -130,13 +131,10 @@ def compile_vae(
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-
-    workdir = get_work_dir_location()
-
     compile_model(
         Y,
         target,
-        workdir,
+        "./tmp",
         "AutoencoderKL",
         constants=params_ait,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 1497e8b7e..000e862e9 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -12,9 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import os
-
-
 def mark_output(y):
     if type(y) is not tuple:
         y = (y,)
@@ -23,111 +20,3 @@ def mark_output(y):
         y[i]._attrs["name"] = "output_%d" % (i)
         y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
-
-
-def get_work_dir_location():
-
-    """
-    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
-    path to a directory which has AITemplate compiled artifacts of the model(s).
-    Make sure the OS user running this script has read and write permissions to
-    this directory. By default, the artifacts will be saved under tmp/ folder of
-    the current working directory.
-    """
-
-    env_name = "AITEMPLATE_WORK_DIR"
-    workdir = "tmp/"
-    if env_name in os.environ:
-        workdir = os.environ[env_name]
-
-    print("The value of {} is {}".format(env_name, workdir))
-
-    return workdir
-
-
-def get_work_dir_location_diffusers():
-
-    """
-    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
-    path to a directory which has AITemplate compiled artifacts of the model(s).
-    Make sure the OS user running this script has read and write permissions to
-    this directory. By default, it will look for compiled artifacts under
-    tmp/ folder of the current working directory.
-    """
-
-    env_name = "AITEMPLATE_WORK_DIR"
-    local_dir = "./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2"
-
-    if env_name in os.environ:
-        local_dir = os.path.join(
-            os.environ[env_name],
-            "diffusers-pipeline",
-            "stabilityai",
-            "stable-diffusion-v2",
-        )
-
-    print("The value of {} is {}".format(env_name, local_dir))
-    return local_dir
-
-
-def get_file_location_clip():
-    """
-    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
-    path to a directory which has AITemplate compiled artifacts of the model(s).
-    Make sure the OS user running this script has read and write permissions to
-    this directory. By default, it will look for compiled artifacts under
-    tmp/ folder of the current working directory.
-    """
-
-    env_name = "AITEMPLATE_WORK_DIR"
-    file_name = "./tmp/CLIPTextModel/test.so"
-
-    if env_name in os.environ:
-        file_name = os.path.join(os.environ[env_name], "CLIPTextModel", "test.so")
-
-    print("The value of {} is {}".format(env_name, file_name))
-    return file_name
-
-
-def get_file_location_autoencoder():
-
-    """
-    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
-    path to a directory which has AITemplate compiled artifacts of the model(s).
-    Make sure the OS user running this script has read and write permissions to
-    this directory. By default, it will look for compiled artifacts under
-    tmp/ folder of the current working directory.
-    """
-
-    env_name = "AITEMPLATE_WORK_DIR"
-    file_name = "./tmp/AutoencoderKL/test.so"
-
-    if env_name in os.environ:
-        file_name = os.path.join(os.environ[env_name], "AutoencoderKL", "test.so")
-
-    print("The value of {} is {}".format(env_name, file_name))
-
-    return file_name
-
-
-def get_file_location_unet():
-
-    """
-    Set the OS environment variable AITEMPLATE_WORK_DIR to point to an absolute
-    path to a directory which has AITemplate compiled artifacts of the model(s).
-    Make sure the OS user running this script has read and write permissions to
-    this directory. By default, it will look for compiled artifacts under
-    tmp/ folder of the current working directory.
-    """
-
-    env_name = "AITEMPLATE_WORK_DIR"
-    file_name = "./tmp/UNet2DConditionModel/test.so"
-
-    if env_name in os.environ:
-        file_name = os.path.join(
-            os.environ[env_name], "UNet2DConditionModel", "test.so"
-        )
-
-    print("The value of {} is {}".format(env_name, file_name))
-
-    return file_name
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 6818d6b91..7dace1275 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -13,8 +13,8 @@
 #  limitations under the License.
 #
 import inspect
-import os
 
+import os
 import warnings
 from typing import List, Optional, Union
 
@@ -37,7 +37,6 @@
     StableDiffusionPipelineOutput,
     StableDiffusionSafetyChecker,
 )
-from src.compile_lib.util import get_work_dir_location
 
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
@@ -99,8 +98,7 @@ def __init__(
             requires_safety_checker=requires_safety_checker,
         )
 
-        workdir = get_work_dir_location()
-
+        workdir = "tmp/"
         self.clip_ait_exe = self.init_ait_module(
             model_name="CLIPTextModel", workdir=workdir
         )
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
index 084a23b51..ad2885086 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
@@ -36,7 +36,6 @@
     StableDiffusionPipelineOutput,
     StableDiffusionSafetyChecker,
 )
-from src.compile_lib.util import get_work_dir_location
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
@@ -110,8 +109,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
-        workdir = get_work_dir_location()
-
+        workdir = "tmp/"
         self.clip_ait_exe = self.init_ait_module(
             model_name="CLIPTextModel", workdir=workdir
         )

From 1a6c00fd189dccc13e66c164a518b4cce2de30d0 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 13:14:07 -0700
Subject: [PATCH 279/638] Make fuse_expand_elementwise jagged-aware (#426)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/426

The `_fused_expand_elementwise` function within the `remove_no_ops` graph optimization pass eliminates `expand` ops preceding the `elementwise` ops. However, the elimination doesn't work with mixed jagged / dense `elementwise` inputs out of the box. Due to the fact that the `JaggedIntVar` in the jagged input shape is treated as a single dimension, against potentially more than one dimension in the dense input shape, the shapes don't match and the pass is not applied.

This diff expands the `JaggedIntVar` into its maximum dense shape. As a result, dense input shapes are comparable with the jagged shape "extended" like this. We rely on the mixed jagged / dense input compatibility invariant established by the `elementwise` op's front-end, and don't check it again in the pass.

Reviewed By: ipiszy

Differential Revision: D44063504

fbshipit-source-id: 8d9c04b230a838817797133ca14947c6ec29a84a
---
 .../compiler/transform/remove_no_ops.py       | 21 +++++++-
 tests/unittest/ops/test_expand.py             | 53 ++++++++++++++++++-
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/compiler/transform/remove_no_ops.py b/python/aitemplate/compiler/transform/remove_no_ops.py
index cd0a8b81f..b1c876a8d 100644
--- a/python/aitemplate/compiler/transform/remove_no_ops.py
+++ b/python/aitemplate/compiler/transform/remove_no_ops.py
@@ -31,7 +31,7 @@
 """
 from typing import List
 
-from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.base import IntVar, JaggedIntVar, Operator, Tensor
 from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
 
 from aitemplate.compiler.transform import transform_utils
@@ -110,6 +110,15 @@ def _is_compatible_with_broadcasting(
             expand_output_dim
         )
 
+    def _replace_jagged_int_var(shape: List[IntVar]):
+        """
+        If shape[0] is a JaggedIntVar, replace it with
+        the corresponding maximum dense shape.
+        """
+        if shape and isinstance(shape[0], JaggedIntVar):
+            return shape[0].get_max_dense_shape() + shape[1:]
+        return shape
+
     for op in graph_utils.get_sorted_ops(sorted_graph):
         if op._attrs["op"] != "expand":
             continue
@@ -121,6 +130,8 @@ def _is_compatible_with_broadcasting(
         if expand_output._attrs["is_output"]:
             continue
 
+        expand_output_shape = _replace_jagged_int_var(expand_output._attrs["shape"])
+
         def _can_fuse_with(dst_op: Operator) -> bool:
             if dst_op._attrs["op"] != "elementwise":
                 return False
@@ -128,10 +139,16 @@ def _can_fuse_with(dst_op: Operator) -> bool:
             for elementwise_input in dst_op._attrs["inputs"]:
                 if elementwise_input is expand_output:
                     continue
+
+                elementwise_input_shape = _replace_jagged_int_var(
+                    elementwise_input._attrs["shape"]
+                )
+
                 if not all(
                     _is_compatible_with_broadcasting(dim_a, dim_b)
                     for dim_a, dim_b in zip(
-                        expand_output._attrs["shape"], elementwise_input._attrs["shape"]
+                        expand_output_shape,
+                        elementwise_input_shape,
                     )
                 ):
                     return False
diff --git a/tests/unittest/ops/test_expand.py b/tests/unittest/ops/test_expand.py
index 7a6aee2c5..4f6f0639b 100644
--- a/tests/unittest/ops/test_expand.py
+++ b/tests/unittest/ops/test_expand.py
@@ -18,11 +18,13 @@
 
 import torch
 
+from aitemplate import compiler
 from aitemplate.compiler import compile_model, ops
-from aitemplate.compiler.base import IntVar, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, JaggedDim, Tensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op
+from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op, has_op
+from aitemplate.utils import graph_utils
 from parameterized import param, parameterized
 
 
@@ -166,6 +168,53 @@ def test_no_op_expands_removed_size_op_fp32(self):
             dtype="float32",
         )
 
+    def test_no_op_expand_elementwise_jagged_dense_inputs(self):
+        total_length = IntVar([1, 100])
+        batch_dim = IntVar([1, 10])
+        offsets_dim = IntVar([2, 11])
+        embedding_dim = IntImm(128)
+        max_seq_len = 10
+
+        X = Tensor(
+            [batch_dim, 1, embedding_dim],
+            name="x",
+            is_input=True,
+            dtype="float16",
+        )
+        SOURCE = Tensor(
+            [total_length, embedding_dim],
+            name="source",
+            is_input=True,
+            dtype="float16",
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[offsets_dim],
+                name="offsets",
+                is_input=True,
+                dtype="int32",
+            )
+        ]
+
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[
+                JaggedDim(0, max_seq_len),
+            ],
+        )(
+            source=SOURCE,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        Y = ops.expand()(X, [batch_dim, max_seq_len, -1])
+        Z = ops.elementwise(FuncEnum.MUL)(JAGGED, Y)
+
+        graph = compiler.transform.toposort([Z])
+        compiler.transform.remove_no_ops(graph)
+        sorted_ops = graph_utils.get_sorted_ops(graph)
+
+        assert not has_op(sorted_ops, "expand")
+
     @parameterized.expand(
         [
             param("fp32_small_noadd_1", "float32", [10, 1, 5], [-1, 10, 5]),

From e39a90c24722cd3b2cca1223dfda3d9e04b24ca5 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Wed, 15 Mar 2023 13:56:36 -0700
Subject: [PATCH 280/638] move group/gemm_xxx from a100 to v100 (#423)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/423

move group/gemm_xxx from a100 to v100

Reviewed By: tenpercent

Differential Revision: D44042494

fbshipit-source-id: 94c86e8b89cd9e6ab60c94c53d0075b94981e5d3
---
 .../unittest/ops/test_gemm_rcr_bias_fast_gelu.py  | 15 +++++----------
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py     | 15 +++++----------
 tests/unittest/ops/test_gemm_rrr_small_nk.py      | 15 ++++++++-------
 tests/unittest/ops/test_group_gemm_rcr.py         |  7 ++++++-
 tests/unittest/ops/test_group_gemm_rcr_bias.py    | 12 +++++++-----
 .../ops/test_group_gemm_rcr_bias_activation.py    | 14 ++++++++------
 .../unittest/ops/test_group_gemm_rcr_bias_cat.py  | 12 +++++++-----
 tests/unittest/ops/test_group_gemm_rcr_cat.py     | 12 +++++++-----
 8 files changed, 53 insertions(+), 49 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index abfc65e88..52a0d7f1e 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -83,11 +84,7 @@ def test_rcr(self):
             self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_rcr_float(self):
+    def test_rcr_float_sm80(self):
         self._test_rcr(
             [1, 7, 64, 127], "fast_dynamic_m_float", use_fast_gelu=True, dtype="float"
         )
@@ -96,11 +93,7 @@ def test_rcr_float(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_bias_fast_gelu_bfloat16(self):
+    def test_gemm_rcr_bias_fast_gelu_bfloat16_sm80(self):
         self._test_rcr(
             [1, 7, 64, 127],
             "fast_dynamic_m_bfloat16",
@@ -114,5 +107,7 @@ def test_gemm_rcr_bias_fast_gelu_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(GEMMRcrBiasFastGeluTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
index b86cce950..c17ac02f7 100644
--- a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -23,6 +23,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
@@ -95,22 +96,14 @@ def test_rcr(self):
             self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_fast_gelu_float(self):
+    def test_gemm_rcr_fast_gelu_float_sm80(self):
         self._test_rcr([128], "static_float", use_fast_gelu=True, dtype="float")
         self._test_rcr(
             [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="float"
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_gemm_rcr_fast_gelu_bfloat16(self):
+    def test_gemm_rcr_fast_gelu_bfloat16_sm80(self):
         self._test_rcr(
             [128],
             "static_float",
@@ -124,5 +117,7 @@ def test_gemm_rcr_fast_gelu_bfloat16(self):
         )
 
 
+filter_test_cases_by_test_env(GEMMRcrFastGeluTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rrr_small_nk.py b/tests/unittest/ops/test_gemm_rrr_small_nk.py
index 94f35a00b..d3ef0f0c8 100644
--- a/tests/unittest/ops/test_gemm_rrr_small_nk.py
+++ b/tests/unittest/ops/test_gemm_rrr_small_nk.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from aitemplate.utils import shape_utils
 
 
@@ -81,18 +84,16 @@ def test_rrr(self):
         # self._test_rrr([1000000], 8, 16)
         # self._test_rrr([1000000], 6, 3, False)
 
-    def test_gemm_rrr_small_nk_float32(self):
+    def test_gemm_rrr_small_nk_float_sm80(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
         self._test_rrr([100001], 7, 10, False, dtype="float32", atol=1e-5, rtol=1.3e-6)
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by cuda sm<80",
-    )
-    def test_gemm_rrr_small_nk_bfloat16(self):
+    def test_gemm_rrr_small_nk_bfloat16_sm80(self):
         self._test_rrr([0, 1], 6, 3, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
         self._test_rrr([100001], 7, 10, False, dtype="bfloat16", atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GEMMRrrSmallNKTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr.py b/tests/unittest/ops/test_group_gemm_rcr.py
index 4c1775f75..9ff58d9b9 100644
--- a/tests/unittest/ops/test_group_gemm_rcr.py
+++ b/tests/unittest/ops/test_group_gemm_rcr.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from parameterized import param, parameterized
 
 
@@ -93,5 +96,7 @@ def test_group_gemm_rcr(self, run_twice: bool, test_name: str, dtype: str):
             torch.testing.assert_close(Y1_pt, outputs["y3"], atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GroupGEMMRcrTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias.py b/tests/unittest/ops/test_group_gemm_rcr_bias.py
index a29a7b841..64f3fcb57 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 from parameterized import param, parameterized
 
@@ -33,7 +36,7 @@ class GroupGEMMRcrBiasTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             param("group_gemm_rcr_bias_fp16", "float16"),
-            param("group_gemm_rcr_bias_fp32", "float32"),
+            param("group_gemm_rcr_bias_fp32_sm80", "float32"),
             param("group_gemm_rcr_bias_bf16", "bfloat16"),
         ]
     )
@@ -44,9 +47,6 @@ def test_group_gemm_rcr_bias(self, test_name, dtype):
         K2 = 192
         N2 = 64
         target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Group Gemm need SM80 HW")
-            return
         X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
         X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
         W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
@@ -84,5 +84,7 @@ def test_group_gemm_rcr_bias(self, test_name, dtype):
         torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GroupGEMMRcrBiasTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
index 6da1ed164..6b4e69f13 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_activation.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from parameterized import param, parameterized
 
 
@@ -32,10 +35,10 @@ class GroupGEMMRcrBiasActTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             param("group_gemm_rcr_bias_relu_fp16", "float16", "relu"),
-            param("group_gemm_rcr_bias_relu_fp32", "float32", "relu"),
+            param("group_gemm_rcr_bias_relu_fp32_sm80", "float32", "relu"),
             param("group_gemm_rcr_bias_relu_bf16", "bfloat16", "relu"),
             param("group_gemm_rcr_bias_sigmoid_fp16", "float16", "sigmoid"),
-            param("group_gemm_rcr_bias_sigmoid_fp32", "float32", "sigmoid"),
+            param("group_gemm_rcr_bias_sigmoid_fp32_sm80", "float32", "sigmoid"),
             param("group_gemm_rcr_bias_sigmoid_bf16", "bfloat16", "sigmoid"),
         ]
     )
@@ -46,9 +49,6 @@ def test_rcr_activation(self, test_name, dtype, activation):
         K2 = 192
         N2 = 64
         target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Group Gemm need SM80 HW")
-            return
         X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
         X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
         W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
@@ -93,5 +93,7 @@ def test_rcr_activation(self, test_name, dtype, activation):
         torch.testing.assert_close(Y2_pt, y2, atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GroupGEMMRcrBiasActTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
index 5c5a0773d..3752c3adc 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_bias_cat.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from parameterized import param, parameterized
 
 
@@ -32,7 +35,7 @@ class GroupGEMMRcrBiasCatTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             param("group_gemm_rcr_bias_cat_fp16", "float16"),
-            param("group_gemm_rcr_bias_cat_fp32", "float32"),
+            param("group_gemm_rcr_bias_cat_fp32_sm80", "float32"),
             param("group_gemm_rcr_bias_cat_bf16", "bfloat16"),
         ]
     )
@@ -43,9 +46,6 @@ def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
         K2 = 192
         N2 = 64
         target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Group Gemm need SM80 HW")
-            return
         X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
         X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
         W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
@@ -84,5 +84,7 @@ def test_group_gemm_rcr_bias_cat(self, test_name, dtype):
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GroupGEMMRcrBiasCatTestCase)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_group_gemm_rcr_cat.py b/tests/unittest/ops/test_group_gemm_rcr_cat.py
index c3ff44a93..4b3646e94 100644
--- a/tests/unittest/ops/test_group_gemm_rcr_cat.py
+++ b/tests/unittest/ops/test_group_gemm_rcr_cat.py
@@ -20,7 +20,10 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from parameterized import param, parameterized
 
 
@@ -32,7 +35,7 @@ class GroupGEMMRcrCatTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             param("group_gemm_rcr_cat_fp16", "float16"),
-            param("group_gemm_rcr_cat_fp32", "float32"),
+            param("group_gemm_rcr_cat_fp32_sm80", "float32"),
             param("group_gemm_rcr_cat_bf16", "bfloat16"),
         ]
     )
@@ -43,9 +46,6 @@ def test_group_gemm_rcr_cat(self, test_name, dtype):
         K2 = 192
         N2 = 64
         target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Group Gemm need SM80 HW")
-            return
         X1 = Tensor(shape=[M, K1], dtype=dtype, name="x1", is_input=True)
         X2 = Tensor(shape=[M, K2], dtype=dtype, name="x2", is_input=True)
         W1 = Tensor(shape=[N1, K1], dtype=dtype, name="w1", is_input=True)
@@ -79,5 +79,7 @@ def test_group_gemm_rcr_cat(self, test_name, dtype):
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
 
+filter_test_cases_by_test_env(GroupGEMMRcrCatTestCase)
+
 if __name__ == "__main__":
     unittest.main()

From 3bec0d47558566f9499c6b0b87b8e98c65bd4c09 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 15:44:20 -0700
Subject: [PATCH 281/638] Consolidate jagged ops (#416)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/416

In this diff, the code of the newly added jagged Tensor-related ops---`jagged_to_dense` and `dense_to_jagged`---undergoes some consolidation. In particular:

- The `jagged_to_dense` is renamed to `jagged_to_padded_dense`.
- The `dense_to_jagged` is renamed to `padded_dense_to_jagged`.
- The back-end code of the two ops is heavily de-duplicated and aligned.
- Minor cleanu-up here and there.

Reviewed By: ipiszy

Differential Revision: D44006524

fbshipit-source-id: 8fb2859a06a15a097aaf4504e96fb601c679340a
---
 .../backend/common/elementwise_common.py      | 104 +++--
 .../backend/cuda/tensor/__init__.py           |   8 +-
 .../backend/cuda/tensor/jagged_to_dense.py    | 390 ------------------
 .../cuda/tensor/jagged_to_padded_dense.py     | 284 +++++++++++++
 ...to_jagged.py => padded_dense_to_jagged.py} | 267 +++---------
 python/aitemplate/compiler/compiler.py        |   6 +-
 .../compiler/ops/tensor/__init__.py           |   4 +-
 ..._to_dense.py => jagged_to_padded_dense.py} |  28 +-
 ...to_jagged.py => padded_dense_to_jagged.py} |  16 +-
 tests/unittest/ops/test_gemm_permute.py       |  12 +-
 ...ense.py => test_jagged_to_padded_dense.py} |  62 ++-
 ...gged.py => test_padded_dense_to_jagged.py} |  44 +-
 12 files changed, 485 insertions(+), 740 deletions(-)
 delete mode 100644 python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/jagged_to_padded_dense.py
 rename python/aitemplate/backend/cuda/tensor/{dense_to_jagged.py => padded_dense_to_jagged.py} (51%)
 rename python/aitemplate/compiler/ops/tensor/{jagged_to_dense.py => jagged_to_padded_dense.py} (67%)
 rename python/aitemplate/compiler/ops/tensor/{dense_to_jagged.py => padded_dense_to_jagged.py} (90%)
 rename tests/unittest/ops/{test_jagged_to_dense.py => test_jagged_to_padded_dense.py} (82%)
 rename tests/unittest/ops/{test_dense_to_jagged.py => test_padded_dense_to_jagged.py} (89%)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 6d3c8e806..737b1d744 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -29,6 +29,7 @@
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.utils import alignment as alignment_utils, shape_utils
 
+
 CONSTANT_TEMPLATE = jinja2.Template(
     """
 #define FUSED_ELE_THREAD_SIZE 256
@@ -103,6 +104,7 @@
     if (dense_coord >= next_offset - prev_offset) {
         // this element of the dense volume is
         // out of bounds of the jagged Tensor
+        {{out_of_bounds_action}}
         return;
     }
     jagged_coord = prev_offset;
@@ -663,10 +665,10 @@ def _get_alignments_and_sizes_and_dtype(
     return alignments, input_broadcast_sizes, dtype
 
 
-def _get_dynamic_dims(output_accessors: List[TensorAccessor]) -> List[IntVar]:
+def get_dynamic_dims(*shapes: List[List[IntVar]]) -> List[IntVar]:
     res = {}
-    for output_accessor in output_accessors:
-        for dim in output_accessor.original_shapes:
+    for shape in shapes:
+        for dim in shape:
             if not isinstance(dim, IntImm):
                 res[dim._attrs["name"]] = dim
                 if isinstance(dim, JaggedIntVar):
@@ -769,7 +771,7 @@ def _parse_func_metadata(
     sub_func_metadata, op_type = _get_sub_func_metadata(
         ops, data_type, op_type, backend_spec
     )
-    dynamic_dims = _get_dynamic_dims(output_accessors)
+    dynamic_dims = get_dynamic_dims(*[acc.original_shapes for acc in output_accessors])
 
     return FusedElementwiseMetaData(
         inputs,
@@ -791,7 +793,7 @@ def _parse_func_metadata(
     )
 
 
-def _gen_int_var_product_str(
+def gen_int_var_product_str(
     int_vars: List[IntVar],
 ) -> str:
     res = []
@@ -804,6 +806,7 @@ def _gen_int_var_product_str(
             raise RuntimeError(
                 "A dim must be an IntVar! Current type: {}".format(type(int_var))
             )
+
     return " * ".join(res) if res else "1"
 
 
@@ -845,9 +848,9 @@ def _gen_input_broadcast_calculator_str(
         res.append(
             "{} % ({}) / ({}) * ({})".format(
                 idx_str,
-                _gen_int_var_product_str(output_num_element),
-                _gen_int_var_product_str(output_stride),
-                _gen_int_var_product_str(input_stride),
+                gen_int_var_product_str(output_num_element),
+                gen_int_var_product_str(output_stride),
+                gen_int_var_product_str(input_stride),
             )
         )
 
@@ -890,37 +893,56 @@ def _gen_input_broadcast_size_str(
     return res
 
 
-def _gen_dynamic_dim_str(
-    index_type: str, dynamic_dims: List[IntVar], has_type: bool
+def gen_dynamic_dim_str(
+    index_type: str,
+    dynamic_dims: List[IntVar],
+    has_type: bool,
 ) -> str:
     type_str = index_type + " " if has_type else ""
     res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims])
     if res:
         res += ", "
+
     return res
 
 
-def _gen_offsets_str(
-    fused_elementwise_metadata: FusedElementwiseMetaData,
+def gen_offsets_str(
+    jagged_int_var: JaggedIntVar,
     has_type: bool,
     const_ref: bool,
     name: Optional[str] = None,
 ) -> str:
-    offsets = ""
+    offsets_var_name = jagged_int_var.offsets_var_name()
+    offsets_struct_type = jagged_int_var.offsets_struct_type()
+
+    ref_prefix = "const " if const_ref else ""
+    ref_suffix = "&" if const_ref else ""
+    arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
+    arg_name = name if name is not None else offsets_var_name
+    offsets = f"{arg_type}{arg_name}, "
+
+    return offsets
+
+
+def _gen_offsets_str_from_metadata(
+    fused_elementwise_metadata: FusedElementwiseMetaData,
+    has_type: bool,
+    const_ref: bool,
+    name: Optional[str] = None,
+):
     if fused_elementwise_metadata.mixed_jagged_dense_indexing:
         inputs = fused_elementwise_metadata.inputs
         jagged_input = [t for t in inputs if t.is_jagged()][0]
         jagged_int_var = jagged_input._attrs["shape"][0]
-        offsets_var_name = jagged_int_var.offsets_var_name()
-        offsets_struct_type = jagged_int_var.offsets_struct_type()
 
-        ref_prefix = "const " if const_ref else ""
-        ref_suffix = "&" if const_ref else ""
-        arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
-        arg_name = name if name is not None else offsets_var_name
-        offsets = f"{arg_type}{arg_name}, "
-
-    return offsets
+        return gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=has_type,
+            const_ref=const_ref,
+            name=name,
+        )
+    else:
+        return ""
 
 
 def _gen_num_elements_calculator(
@@ -931,7 +953,7 @@ def _gen_num_elements_calculator(
             # for the jagged space indexing, the num_elements
             # is the number of elements in the output jagged Tensor, hence
             # the usage of the output shape here, not the output volume
-            return _gen_int_var_product_str(
+            return gen_int_var_product_str(
                 fused_elementwise_metadata.output_accessors[0].original_shapes,
             )
         else:
@@ -939,13 +961,13 @@ def _gen_num_elements_calculator(
             # is the number of elements in the output volume: the smallest
             # rectangular volume that fits the output jagged Tensor, hence
             # the usage of the output volume here, not the output shape
-            return _gen_int_var_product_str(
+            return gen_int_var_product_str(
                 fused_elementwise_metadata.output_volume,
             )
     else:
         # all inputs and outputs are treated as dense:
         # use the output shape for computing num_elements
-        return _gen_int_var_product_str(
+        return gen_int_var_product_str(
             fused_elementwise_metadata.output_accessors[0].original_shapes,
         )
 
@@ -1033,18 +1055,16 @@ def _gen_write_outputs_str(
     return write_outputs_str
 
 
-def _get_output_volume_strides(
-    output_volume: List[IntVar],
-) -> List[str]:
+def get_stride_expressions(shape: List[IntVar]) -> List[str]:
     """
     Generate the stride expressions for each of the dimensions
-    of the output volume. A stride expression here means the
+    of the shape. A stride expression here means the
     product of all dimensions following the given dimension.
     The order of the stride expressions in the returned list
-    is the same as of the dimensions of the output volume.
+    is the same as of the dimensions of the shape.
     """
     strides = []
-    for dim in reversed(output_volume[1:]):
+    for dim in reversed(shape[1:]):
         str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
         if strides:
             strides.append(f"{strides[-1]} * {str_dim}")
@@ -1076,7 +1096,7 @@ def _gen_compute_idx(
         return compute_idx_template.render(
             index_type=index_type,
             num_offsets=num_offsets,
-            strides=_get_output_volume_strides(
+            strides=get_stride_expressions(
                 fused_elementwise_metadata.output_volume,
             ),
             offsets_type=jagged_int_var.offsets_type(),
@@ -1152,12 +1172,12 @@ def _gen_kernel_function(
         index_type=index_type,
         output_params=output_params_decl,
         input_params=input_params_decl,
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             index_type,
             fused_elementwise_metadata.dynamic_dims,
             has_type=True,
         ),
-        offsets=_gen_offsets_str(
+        offsets=_gen_offsets_str_from_metadata(
             fused_elementwise_metadata,
             has_type=True,
             # the offsets are passed
@@ -1267,17 +1287,17 @@ def fused_elementwise_gen_function(
         func_name=func_attrs["name"],
         output_params=output_params_decl,
         input_params=input_params_decl,
-        dynamic_dims_decl=_gen_dynamic_dim_str(
+        dynamic_dims_decl=gen_dynamic_dim_str(
             backend_spec.index_type,
             fused_elementwise_metadata.dynamic_dims,
             has_type=True,
         ),
-        dynamic_dims_call=_gen_dynamic_dim_str(
+        dynamic_dims_call=gen_dynamic_dim_str(
             backend_spec.index_type,
             fused_elementwise_metadata.dynamic_dims,
             has_type=False,
         ),
-        offsets_decl=_gen_offsets_str(
+        offsets_decl=_gen_offsets_str_from_metadata(
             fused_elementwise_metadata,
             has_type=True,
             # the offsets are passed
@@ -1285,7 +1305,7 @@ def fused_elementwise_gen_function(
             const_ref=True,
             name="offsets",
         ),
-        offsets_call=_gen_offsets_str(
+        offsets_call=_gen_offsets_str_from_metadata(
             fused_elementwise_metadata,
             has_type=False,
             const_ref=False,
@@ -1340,12 +1360,12 @@ def fused_elementwise_gen_function_decl(
         func_name=func_name,
         output_params=output_params_decl,
         input_params=input_params_decl,
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             backend_spec.index_type,
             fused_elementwise_metadata.dynamic_dims,
             has_type=True,
         ),
-        offsets=_gen_offsets_str(
+        offsets=_gen_offsets_str_from_metadata(
             fused_elementwise_metadata,
             has_type=True,
             const_ref=True,
@@ -1392,12 +1412,12 @@ def fused_elementwise_gen_function_call(
         ),
         output_params=output_params,
         input_params=input_params,
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             backend_spec.index_type,
             fused_elementwise_metadata.dynamic_dims,
             has_type=False,
         ),
-        offsets=_gen_offsets_str(
+        offsets=_gen_offsets_str_from_metadata(
             fused_elementwise_metadata,
             has_type=False,
             const_ref=False,
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index ccb8698a0..cc8bceeb6 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -20,12 +20,12 @@
     batch_gather,
     concatenate,
     concatenate_tanh,
-    dense_to_jagged,
     dynamic_slice,
     expand,
     gather,
-    jagged_to_dense,
+    jagged_to_padded_dense,
     masked_select,
+    padded_dense_to_jagged,
     permute,
     permute021,
     permute0213,
@@ -42,12 +42,12 @@
     "batch_gather",
     "concatenate",
     "concatenate_tanh",
-    "dense_to_jagged",
     "dynamic_slice",
     "expand",
     "gather",
-    "jagged_to_dense",
+    "jagged_to_padded_dense",
     "masked_select",
+    "padded_dense_to_jagged",
     "permute",
     "permute021",
     "permute0213",
diff --git a/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py b/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
deleted file mode 100644
index c20e475a3..000000000
--- a/python/aitemplate/backend/cuda/tensor/jagged_to_dense.py
+++ /dev/null
@@ -1,390 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Define jagged_to_dense codegen and CUDA kernel
-"""
-from typing import Any, Dict, List, Optional
-
-import jinja2
-
-from aitemplate.backend import registry
-from aitemplate.backend.backend_spec import CUDASpec
-from aitemplate.compiler.base import IntImm, IntVar, Tensor
-from aitemplate.utils import shape_utils
-
-
-CONSTANT_TEMPLATE = jinja2.Template(
-    """
-#define FUSED_ELE_THREAD_SIZE 256
-
-const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
-    """
-)
-
-KERNEL_TEMPLATE = jinja2.Template(
-    """
-__global__ void {{func_name}}({{read_t}}* y, const {{read_t}}* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements) {
-  // first compute the dense_idx from the blockIdx and threadIdx
-  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
-  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
-  if (dense_idx_elem >= n_elements) {
-    return;
-  }
-
-  // then compute the jagged_idx from the dense_idx_elem
-  {{index_type}} jagged_idx;
-  {
-    // dense_coord is along consecutive dense dimensions
-    // jagged_coord is along the total_length of the jagged Tensor
-    {{index_type}} dense_coord = dense_idx_elem / ({{strides[0]}});
-    {{index_type}} running_idx = dense_idx_elem % ({{strides[0]}});
-    {{offsets_type}} jagged_coord = 0, prev_offset, next_offset;
-
-{% for i in range(num_offsets) %}
-    prev_offset = offsets.data[{{i}}][jagged_coord + dense_coord];
-    next_offset = offsets.data[{{i}}][jagged_coord + dense_coord + 1];
-    dense_coord = running_idx / ({{strides[i+1]}});
-    running_idx = running_idx % ({{strides[i+1]}});
-    if (dense_coord >= next_offset - prev_offset) {
-        // this element of the dense volume is
-        // out of bounds of the jagged Tensor
-        {{read_t}} padded_vector;
-        {{data_t}}* cursor = reinterpret_cast<{{data_t}}*>(&padded_vector);
-
-        #pragma unroll
-        for (int i = 0; i < N_ELEMENTS_PER_THREAD; i++) {
-            cursor[i] = {{data_t}}({{padding_value}});
-        }
-
-        y[dense_idx] = padded_vector;
-        return;
-    }
-    jagged_coord = prev_offset;
-
-{% endfor %}
-    jagged_coord += dense_coord;
-    jagged_idx = (jagged_coord * ({{strides[num_offsets]}}) + running_idx) / N_ELEMENTS_PER_THREAD;
-  }
-  y[dense_idx] = x[jagged_idx];
-}
-    """
-)
-
-FUNC_TEMPLATE = jinja2.Template(
-    """
-{{head}}
-
-#include "jagged.h"
-
-namespace {
-
-{{constant}}
-
-{{kernel_function}}
-
-}  // namespace
-
-void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims_decl}} {{offsets_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
-    if (n_elements == 0) {
-      return;
-    }
-    int block_size = static_cast<int>(std::ceil(static_cast<double>(n_elements) / N_ELEMENTS_PER_THREAD / FUSED_ELE_THREAD_SIZE));
-    {{func_name}}<<<block_size, FUSED_ELE_THREAD_SIZE, 0, stream>>>(
-        reinterpret_cast<{{read_t}}*>(y),
-        reinterpret_cast<const {{read_t}}*>(x),
-        {{dynamic_dims_call}}
-        {{offsets_call}}
-        n_elements
-    );
-}
-    """
-)
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
-    """
-)
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{
-    {{indent}}{{index_type}} {{func_name}}_n_elements = {{calculate_n}};
-    {{indent}}invoke_{{func_name}}({{y}}, {{x}}, {{dynamic_dims}} {{offsets}} {{func_name}}_n_elements, {{stream}});
-{{indent}}}
-    """
-)
-
-
-def _get_output_volume_strides(
-    output_volume: List[IntVar],
-) -> List[str]:
-    """
-    Generate the stride expressions for each of the dimensions
-    of the y volume. A stride expression here means the
-    product of all dimensions following the given dimension.
-    The order of the stride expressions in the returned list
-    is the same as of the dimensions of the y volume.
-    """
-    strides = []
-    for dim in reversed(output_volume[1:]):
-        str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
-        if strides:
-            strides.append(f"{strides[-1]} * {str_dim}")
-        else:
-            strides.append(str_dim)
-    strides.reverse()
-    return strides
-
-
-def _get_dynamic_dims(y: Tensor) -> List[IntVar]:
-    res = {}
-
-    for dim in y.shape():
-        if not isinstance(dim, IntImm):
-            res[dim._attrs["name"]] = dim
-    return list(res.values())
-
-
-def _gen_dynamic_dim_str(
-    index_type: str, dynamic_dims: List[IntVar], has_type: bool
-) -> str:
-    type_str = index_type + " " if has_type else ""
-    res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims])
-    if res:
-        res += ", "
-    return res
-
-
-def _gen_offsets_str(
-    x: Tensor,
-    has_type: bool,
-    const_ref: bool,
-    name: Optional[str] = None,
-) -> str:
-    jagged_int_var = x._attrs["shape"][0]
-    offsets_var_name = jagged_int_var.offsets_var_name()
-    offsets_struct_type = jagged_int_var.offsets_struct_type()
-
-    ref_prefix = "const " if const_ref else ""
-    ref_suffix = "&" if const_ref else ""
-    arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
-    arg_name = name if name is not None else offsets_var_name
-    offsets = f"{arg_type}{arg_name}, "
-
-    return offsets
-
-
-def _gen_int_var_product_str(
-    int_vars: List[IntVar],
-) -> str:
-    res = []
-    for int_var in int_vars:
-        if isinstance(int_var, IntImm):
-            res.append(str(int_var._attrs["values"][0]))
-        elif isinstance(int_var, IntVar):
-            res.append(int_var._attrs["name"])
-        else:
-            raise RuntimeError(
-                "A dim must be an IntVar! Current type: {}".format(type(int_var))
-            )
-    return " * ".join(res) if res else "1"
-
-
-def _detect_read_type(inner_size: int, dtype: str) -> str:
-    if dtype in ("bfloat16", "half"):
-        if inner_size % 8 == 0:
-            return "uint4"
-        elif inner_size % 4 == 0:
-            return "uint2"
-        elif inner_size % 2 == 0:
-            return "uint"
-    elif dtype == "float":
-        if inner_size % 4 == 0:
-            return "uint4"
-        elif inner_size % 2 == 0:
-            return "uint2"
-
-    return dtype
-
-
-def _gen_kernel_function(
-    func_attrs: Dict[str, Any],
-    index_type: str,
-    data_type: str,
-    read_type: str,
-) -> str:
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    padding_value = func_attrs["padding_value"]
-    jagged_int_var = x.shape()[0]
-    num_offsets = len(jagged_int_var.jagged_dims())
-    backend_spec = CUDASpec()
-
-    dynamic_dims = _get_dynamic_dims(y)
-
-    kernel_func = KERNEL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        index_type=index_type,
-        num_offsets=num_offsets,
-        strides=_get_output_volume_strides(
-            y.shape(),
-        ),
-        offsets_type=jagged_int_var.offsets_type(),
-        data_t=data_type,
-        read_t=read_type,
-        padding_value=padding_value,
-        dynamic_dims=_gen_dynamic_dim_str(
-            backend_spec.index_type,
-            dynamic_dims,
-            has_type=True,
-        ),
-        offsets=_gen_offsets_str(
-            x,
-            has_type=True,
-            # the offsets are passed
-            # by value to the kernel
-            const_ref=False,
-            name="offsets",
-        ),
-    )
-    return kernel_func
-
-
-@registry.reg("cuda.jagged_to_dense.gen_function")
-def jagged_to_dense_gen_function(func_attrs: Dict[str, Any]) -> str:
-    """Generates jagged_to_dense function definition."""
-
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    backend_spec = CUDASpec()
-
-    dtype = x.dtype()
-    data_type = backend_spec.dtype_to_backend_type(dtype)
-    read_inner_size = shape_utils.get_num_rightmost_static_elements(x.shape())
-    read_type = _detect_read_type(read_inner_size, data_type)
-
-    kernel_function = _gen_kernel_function(
-        func_attrs,
-        backend_spec.index_type,
-        data_type,
-        read_type,
-    )
-
-    constant = CONSTANT_TEMPLATE.render(
-        read_t=read_type,
-        data_t=data_type,
-    )
-
-    dynamic_dims = _get_dynamic_dims(y)
-
-    function = FUNC_TEMPLATE.render(
-        prefix=backend_spec.prefix,
-        index_type=backend_spec.index_type,
-        head=backend_spec.header_src_template.render(),
-        constant=constant,
-        kernel_function=kernel_function,
-        func_name=func_attrs["name"],
-        dynamic_dims_decl=_gen_dynamic_dim_str(
-            backend_spec.index_type,
-            dynamic_dims,
-            has_type=True,
-        ),
-        dynamic_dims_call=_gen_dynamic_dim_str(
-            backend_spec.index_type,
-            dynamic_dims,
-            has_type=False,
-        ),
-        offsets_decl=_gen_offsets_str(
-            x,
-            has_type=True,
-            # the offsets are passed
-            # by const reference to the function
-            const_ref=True,
-            name="offsets",
-        ),
-        offsets_call=_gen_offsets_str(
-            x,
-            has_type=False,
-            const_ref=False,
-            name="offsets",
-        ),
-        read_t=read_type,
-    )
-    return function
-
-
-@registry.reg("cuda.jagged_to_dense.func_decl")
-def jagged_to_dense_gen_function_decl(func_attrs) -> str:
-    """Generate jagged_to_dense function declaration."""
-
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    func_name = func_attrs["name"]
-    backend_spec = CUDASpec()
-
-    dynamic_dims = _get_dynamic_dims(y)
-
-    return FUNC_DECL_TEMPLATE.render(
-        prefix=backend_spec.prefix,
-        index_type=backend_spec.index_type,
-        func_name=func_name,
-        dynamic_dims=_gen_dynamic_dim_str(
-            backend_spec.index_type,
-            dynamic_dims,
-            has_type=True,
-        ),
-        offsets=_gen_offsets_str(
-            x,
-            has_type=True,
-            const_ref=True,
-            name="offsets",
-        ),
-    )
-
-
-@registry.reg("cuda.jagged_to_dense.func_call")
-def jagged_to_dense_gen_function_call(
-    func_attrs,
-    indent: str,
-) -> str:
-    """Generate jagged_to_dense function call."""
-
-    x = func_attrs["inputs"][0]
-    y = func_attrs["outputs"][0]
-    backend_spec = CUDASpec()
-    dynamic_dims = _get_dynamic_dims(y)
-
-    return FUNC_CALL_TEMPLATE.render(
-        stream=backend_spec.stream,
-        func_name=func_attrs["name"],
-        index_type=backend_spec.index_type,
-        calculate_n=_gen_int_var_product_str(
-            y.shape(),
-        ),
-        y=y._attrs["name"],
-        x=x._attrs["name"],
-        dynamic_dims=_gen_dynamic_dim_str(
-            backend_spec.index_type,
-            dynamic_dims,
-            has_type=False,
-        ),
-        offsets=_gen_offsets_str(
-            x,
-            has_type=False,
-            const_ref=False,
-        ),
-        indent=indent,
-    )
diff --git a/python/aitemplate/backend/cuda/tensor/jagged_to_padded_dense.py b/python/aitemplate/backend/cuda/tensor/jagged_to_padded_dense.py
new file mode 100644
index 000000000..bdafac544
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/jagged_to_padded_dense.py
@@ -0,0 +1,284 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+The back-end bindings of the jagged_to_padded_dense op.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import (
+    CONSTANT_TEMPLATE,
+    gen_dynamic_dim_str,
+    gen_int_var_product_str,
+    gen_offsets_str,
+    get_dynamic_dims,
+    get_stride_expressions,
+    KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE,
+)
+from aitemplate.utils import shape_utils
+
+
+KERNEL_PADDING_TEMPLATE = jinja2.Template(
+    """
+        {{read_t}} padded_vector;
+        {{data_t}}* cursor = reinterpret_cast<{{data_t}}*>(&padded_vector);
+
+        #pragma unroll
+        for (int i = 0; i < N_ELEMENTS_PER_THREAD; i++) {
+            cursor[i] = {{data_t}}({{padding_value}});
+        }
+
+        y[dense_idx] = padded_vector;
+    """
+)
+
+
+KERNEL_TEMPLATE = jinja2.Template(
+    """
+__global__ void {{func_name}}({{read_t}}* y, const {{read_t}}* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements) {
+  {{compute_idx}}
+
+  y[dense_idx] = x[jagged_idx];
+}
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{head}}
+
+#include "jagged.h"
+
+namespace {
+
+{{constant}}
+
+{{kernel_function}}
+
+}  // namespace
+
+void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims_decl}} {{offsets_decl}} {{index_type}} n_elements, {{prefix}}Stream_t stream) {
+    if (n_elements == 0) {
+      return;
+    }
+    int block_size = static_cast<int>(std::ceil(static_cast<double>(n_elements) / N_ELEMENTS_PER_THREAD / FUSED_ELE_THREAD_SIZE));
+    {{func_name}}<<<block_size, FUSED_ELE_THREAD_SIZE, 0, stream>>>(
+        reinterpret_cast<{{read_t}}*>(y),
+        reinterpret_cast<const {{read_t}}*>(x),
+        {{dynamic_dims_call}}
+        {{offsets_call}}
+        n_elements
+    );
+}
+    """
+)
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void invoke_{{func_name}}(void* y, const void* x, {{dynamic_dims}} {{offsets}} {{index_type}} n_elements, {{prefix}}Stream_t stream);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}{{index_type}} {{func_name}}_n_elements = {{calculate_n}};
+    {{indent}}invoke_{{func_name}}({{y}}, {{x}}, {{dynamic_dims}} {{offsets}} {{func_name}}_n_elements, {{stream}});
+{{indent}}}
+    """
+)
+
+
+def _gen_kernel_function(
+    func_attrs: Dict[str, Any],
+    index_type: str,
+    data_type: str,
+    read_type: str,
+) -> str:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    padding_value = func_attrs["padding_value"]
+    jagged_int_var = x.shape()[0]
+    num_offsets = len(jagged_int_var.jagged_dims())
+    backend_spec = CUDASpec()
+
+    padding_str = KERNEL_PADDING_TEMPLATE.render(
+        data_t=data_type,
+        read_t=read_type,
+        padding_value=padding_value,
+    )
+
+    compute_idx_str = KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE.render(
+        index_type=index_type,
+        num_offsets=num_offsets,
+        strides=get_stride_expressions(y.shape()),
+        offsets_type=jagged_int_var.offsets_type(),
+        out_of_bounds_action=padding_str,
+    )
+
+    return KERNEL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        index_type=index_type,
+        read_t=read_type,
+        compute_idx=compute_idx_str,
+        dynamic_dims=gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=get_dynamic_dims(y.shape()),
+            has_type=True,
+        ),
+        offsets=gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=True,
+            # the offsets are passed
+            # by value to the kernel
+            const_ref=False,
+            name="offsets",
+        ),
+    )
+
+
+@registry.reg("cuda.jagged_to_padded_dense.gen_function")
+def jagged_to_padded_dense_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """Generates jagged_to_padded_dense function definition."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = x.shape()[0]
+    backend_spec = CUDASpec()
+
+    dtype = x.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+
+    # inner size of the input jagged Tensor: can't use the output dense Tensor
+    # shape here, as some the dimensions in it may overlap with the jagged
+    # dimensions of the input jagged Tensor
+    inner_size = shape_utils.get_num_rightmost_static_elements(x.shape())
+    read_type = backend_spec.get_elementwise_read_backend_type(inner_size, dtype)
+
+    kernel_function = _gen_kernel_function(
+        func_attrs=func_attrs,
+        index_type=backend_spec.index_type,
+        data_type=data_type,
+        read_type=read_type,
+    )
+
+    constant = CONSTANT_TEMPLATE.render(
+        read_t=read_type,
+        data_t=data_type,
+        op_t=data_type,
+    )
+
+    dynamic_dims = get_dynamic_dims(y.shape())
+
+    return FUNC_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        head=backend_spec.header_src_template.render(),
+        constant=constant,
+        kernel_function=kernel_function,
+        func_name=func_attrs["name"],
+        dynamic_dims_decl=gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=dynamic_dims,
+            has_type=True,
+        ),
+        dynamic_dims_call=gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=dynamic_dims,
+            has_type=False,
+        ),
+        offsets_decl=gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=True,
+            # the offsets are passed
+            # by const reference to the function
+            const_ref=True,
+            name="offsets",
+        ),
+        offsets_call=gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=False,
+            const_ref=False,
+            name="offsets",
+        ),
+        read_t=read_type,
+    )
+
+
+@registry.reg("cuda.jagged_to_padded_dense.func_decl")
+def jagged_to_padded_dense_gen_function_decl(func_attrs) -> str:
+    """Generate jagged_to_padded_dense function declaration."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = x.shape()[0]
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    return FUNC_DECL_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        func_name=func_name,
+        dynamic_dims=gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=get_dynamic_dims(y.shape()),
+            has_type=True,
+        ),
+        offsets=gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=True,
+            const_ref=True,
+            name="offsets",
+        ),
+    )
+
+
+@registry.reg("cuda.jagged_to_padded_dense.func_call")
+def jagged_to_padded_dense_gen_function_call(
+    func_attrs,
+    indent: str,
+) -> str:
+    """Generate jagged_to_padded_dense function call."""
+
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    jagged_int_var = x.shape()[0]
+    backend_spec = CUDASpec()
+
+    return FUNC_CALL_TEMPLATE.render(
+        stream=backend_spec.stream,
+        func_name=func_attrs["name"],
+        index_type=backend_spec.index_type,
+        calculate_n=gen_int_var_product_str(y.shape()),
+        y=y._attrs["name"],
+        x=x._attrs["name"],
+        dynamic_dims=gen_dynamic_dim_str(
+            index_type=backend_spec.index_type,
+            dynamic_dims=get_dynamic_dims(y.shape()),
+            has_type=False,
+        ),
+        offsets=gen_offsets_str(
+            jagged_int_var=jagged_int_var,
+            has_type=False,
+            const_ref=False,
+        ),
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/cuda/tensor/dense_to_jagged.py b/python/aitemplate/backend/cuda/tensor/padded_dense_to_jagged.py
similarity index 51%
rename from python/aitemplate/backend/cuda/tensor/dense_to_jagged.py
rename to python/aitemplate/backend/cuda/tensor/padded_dense_to_jagged.py
index 6e601c0a4..73d4e9e19 100644
--- a/python/aitemplate/backend/cuda/tensor/dense_to_jagged.py
+++ b/python/aitemplate/backend/cuda/tensor/padded_dense_to_jagged.py
@@ -13,106 +13,29 @@
 #  limitations under the License.
 #
 """
-The back-end bindings of the dense_to_jagged op.
+The back-end bindings of the padded_dense_to_jagged op.
 """
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 import jinja2
 
 from aitemplate.backend import registry
 from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import (
+    CONSTANT_TEMPLATE,
+    gen_dynamic_dim_str,
+    gen_int_var_product_str,
+    gen_offsets_str,
+    get_dynamic_dims,
+    get_stride_expressions,
+    KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE,
+    KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE,
+)
 from aitemplate.backend.target import Target
-from aitemplate.compiler.base import IntImm, IntVar, JaggedIntVar, Tensor
+from aitemplate.compiler.base import IntVar, JaggedIntVar
 from aitemplate.utils import shape_utils
 
 
-CONSTANT_TEMPLATE = jinja2.Template(
-    """
-#define FUSED_ELE_THREAD_SIZE 256
-
-const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
-    """
-)
-
-KERNEL_COMPUTE_DENSE_IDX_THEN_JAGGED_IDX_TEMPLATE = jinja2.Template(
-    """
-  // first compute the dense_idx from the blockIdx and threadIdx
-  const {{index_type}} dense_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
-  const {{index_type}} dense_idx_elem = dense_idx * N_ELEMENTS_PER_THREAD;
-  if (dense_idx_elem >= n_elements) {
-    return;
-  }
-
-  // then compute the jagged_idx from the dense_idx_elem
-  {{index_type}} jagged_idx;
-  {
-    // dense_coord is along consecutive dense dimensions
-    // jagged_coord is along the total_length of the jagged Tensor
-    {{index_type}} dense_coord = dense_idx_elem / ({{strides[0]}});
-    {{index_type}} running_idx = dense_idx_elem % ({{strides[0]}});
-    {{offsets_type}} jagged_coord = 0, prev_offset, next_offset;
-
-{% for i in range(num_offsets) %}
-    prev_offset = offsets.data[{{i}}][jagged_coord + dense_coord];
-    next_offset = offsets.data[{{i}}][jagged_coord + dense_coord + 1];
-    dense_coord = running_idx / ({{strides[i+1]}});
-    running_idx = running_idx % ({{strides[i+1]}});
-    if (dense_coord >= next_offset - prev_offset) {
-        // this element of the dense volume is
-        // out of bounds of the jagged Tensor
-        return;
-    }
-    jagged_coord = prev_offset;
-
-{% endfor %}
-    jagged_coord += dense_coord;
-    jagged_idx = (jagged_coord * ({{strides[num_offsets]}}) + running_idx) / N_ELEMENTS_PER_THREAD;
-  }
-    """
-)
-
-KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE = jinja2.Template(
-    """
-  // first compute the jagged_idx from the blockIdx and threadIdx
-  const {{index_type}} jagged_idx = blockIdx.x * FUSED_ELE_THREAD_SIZE + threadIdx.x;
-  const {{index_type}} jagged_idx_elem = jagged_idx * N_ELEMENTS_PER_THREAD;
-  if (jagged_idx_elem >= n_elements) {
-    return;
-  }
-
-  // then compute the dense_idx from the jagged_idx_elem
-  {{index_type}} dense_idx = jagged_idx_elem % ({{strides[num_offsets]}});
-  {
-    {{offsets_type}} left, right, mid, tmp_value, offset_idx, offset_value;
-    {{index_type}} running_idx = jagged_idx_elem / ({{strides[num_offsets]}});
-
-    // binary search to determine the dense coord along the current jagged dimension
-    // the goal is to find the index of the maximum offset value in offsets.data[{{i}}]
-    // which is <= the running_idx. the (running_idx - offset_value) will then indicate
-    // the dense cooord along the current jagged dimension.
-{% for i in range(num_offsets - 1, -1, -1) %}
-    left = 0;
-    right = offsets.lengths[{{i}}] - 1;
-    while (left <= right) {
-        mid = (left + right) >> 1;
-        tmp_value = offsets.data[{{i}}][mid];
-        if (tmp_value <= running_idx) {
-            offset_idx = mid;
-            offset_value = tmp_value;
-            left = mid + 1;
-        } else {
-            right = mid - 1;
-        }
-    }
-    dense_idx += (running_idx - offset_value) * ({{strides[i+1]}});
-    running_idx = offset_idx;
-
-{% endfor %}
-    dense_idx = (dense_idx + running_idx * ({{strides[0]}})) / N_ELEMENTS_PER_THREAD;
-  }
-    """
-)
-
 KERNEL_TEMPLATE = jinja2.Template(
     """
 __global__ void {{func_name}}(
@@ -129,6 +52,7 @@
     """
 )
 
+
 FUNC_TEMPLATE = jinja2.Template(
     """
 {{head}}
@@ -144,7 +68,7 @@
 
 }  // namespace
 
-void {{func_name}}(
+void invoke_{{func_name}}(
     void* y,
     const void* x,
 {% for idx in range(num_offsets) %}
@@ -179,9 +103,10 @@
     """
 )
 
+
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
-void {{func_name}}(
+void invoke_{{func_name}}(
     void* y,
     const void* x,
 {% for idx in range(num_offsets) %}
@@ -194,9 +119,10 @@
     """
 )
 
+
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
-{{indent}}{{func_name}}(
+{{indent}}invoke_{{func_name}}(
 {{indent}}    {{y}},
 {{indent}}    {{x}},
 {% for idx in range(num_offsets) %}
@@ -210,102 +136,6 @@
 )
 
 
-def _get_strides(shape: List[IntVar]) -> List[str]:
-    """
-    Generate the stride expressions for each of the dimensions
-    of the shape. A stride expression here means the
-    product of all dimensions following the given dimension.
-    The order of the stride expressions in the returned list
-    is the same as of the dimensions of the shape.
-    """
-    strides = []
-    for dim in reversed(shape[1:]):
-        str_dim = str(dim.value()) if isinstance(dim, IntImm) else dim._attrs["name"]
-        if strides:
-            strides.append(f"{strides[-1]} * {str_dim}")
-        else:
-            strides.append(str_dim)
-    strides.reverse()
-    return strides
-
-
-def _get_dynamic_dims(x: Tensor, y: Tensor) -> List[IntVar]:
-    res = {}
-    for dim in list(x.shape()) + list(y.shape()):
-        if not isinstance(dim, IntImm):
-            res[dim._attrs["name"]] = dim
-
-    return list(res.values())
-
-
-def _gen_dynamic_dim_str(
-    index_type: str,
-    dynamic_dims: List[IntVar],
-    has_type: bool,
-) -> str:
-    type_str = index_type + " " if has_type else ""
-    res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims])
-    if res:
-        res += ", "
-
-    return res
-
-
-def _gen_offsets_str(
-    jagged_int_var: JaggedIntVar,
-    has_type: bool,
-    const_ref: bool,
-    name: Optional[str] = None,
-) -> str:
-    offsets_var_name = jagged_int_var.offsets_var_name()
-    offsets_struct_type = jagged_int_var.offsets_struct_type()
-
-    ref_prefix = "const " if const_ref else ""
-    ref_suffix = "&" if const_ref else ""
-    arg_type = f"{ref_prefix}{offsets_struct_type}{ref_suffix} " if has_type else ""
-    arg_name = name if name is not None else offsets_var_name
-    offsets = f"{arg_type}{arg_name}, "
-
-    return offsets
-
-
-def _gen_int_var_product_str(
-    int_vars: List[IntVar],
-) -> str:
-    res = []
-    for int_var in int_vars:
-        if isinstance(int_var, IntImm):
-            res.append(str(int_var._attrs["values"][0]))
-        elif isinstance(int_var, IntVar):
-            res.append(int_var._attrs["name"])
-        else:
-            raise RuntimeError(
-                "A dim must be an IntVar! Current type: {}".format(type(int_var))
-            )
-
-    return " * ".join(res) if res else "1"
-
-
-def _detect_read_type(
-    inner_size: int,
-    dtype: str,
-) -> str:
-    if dtype in ("bfloat16", "half"):
-        if inner_size % 8 == 0:
-            return "uint4"
-        elif inner_size % 4 == 0:
-            return "uint2"
-        elif inner_size % 2 == 0:
-            return "uint"
-    elif dtype == "float":
-        if inner_size % 4 == 0:
-            return "uint4"
-        elif inner_size % 2 == 0:
-            return "uint2"
-
-    return dtype
-
-
 def _gen_compute_idx_str(
     input_shape: List[IntVar],
     output_shape: List[IntVar],
@@ -324,7 +154,7 @@ def _gen_compute_idx_str(
     return compute_idx_template.render(
         index_type=index_type,
         num_offsets=len(jagged_int_var.jagged_dims()),
-        strides=_get_strides(input_shape),
+        strides=get_stride_expressions(input_shape),
         offsets_type=jagged_int_var.offsets_type(),
     )
 
@@ -340,7 +170,7 @@ def _gen_calculate_n(
     # and dense input's volume in case of the dense space indexing
     index_space = output_shape if use_jagged_space_indexing else input_shape
 
-    return _gen_int_var_product_str(index_space)
+    return gen_int_var_product_str(index_space)
 
 
 def _gen_kernel_function(
@@ -364,12 +194,12 @@ def _gen_kernel_function(
             jagged_int_var=jagged_int_var,
         ),
         read_t=read_type,
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             index_type=backend_spec.index_type,
-            dynamic_dims=_get_dynamic_dims(x, y),
+            dynamic_dims=get_dynamic_dims(x.shape(), y.shape()),
             has_type=True,
         ),
-        offsets=_gen_offsets_str(
+        offsets=gen_offsets_str(
             jagged_int_var=jagged_int_var,
             has_type=True,
             # the offsets are passed
@@ -380,9 +210,9 @@ def _gen_kernel_function(
     )
 
 
-@registry.reg("cuda.dense_to_jagged.gen_function")
-def dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
-    """Generates dense_to_jagged function definition."""
+@registry.reg("cuda.padded_dense_to_jagged.gen_function")
+def padded_dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """Generates padded_dense_to_jagged function definition."""
 
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
@@ -391,23 +221,28 @@ def dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
 
     dtype = x.dtype()
     data_type = backend_spec.dtype_to_backend_type(dtype)
-    read_inner_size = shape_utils.get_num_rightmost_static_elements(y.shape())
-    read_type = _detect_read_type(read_inner_size, data_type)
+
+    # inner size of the output jagged Tensor: can't use the input dense Tensor
+    # shape here, as some the dimensions in it may overlap with the jagged
+    # dimensions of the output jagged Tensor
+    inner_size = shape_utils.get_num_rightmost_static_elements(y.shape())
+    read_type = backend_spec.get_elementwise_read_backend_type(inner_size, dtype)
 
     kernel_function = _gen_kernel_function(
-        func_attrs,
-        backend_spec.index_type,
-        data_type,
-        read_type,
+        func_attrs=func_attrs,
+        index_type=backend_spec.index_type,
+        data_type=data_type,
+        read_type=read_type,
     )
 
     constant = CONSTANT_TEMPLATE.render(
         read_t=read_type,
         data_t=data_type,
+        op_t=data_type,
     )
 
     func_name = func_attrs["name"]
-    dynamic_dims = _get_dynamic_dims(x, y)
+    dynamic_dims = get_dynamic_dims(x.shape(), y.shape())
     offsets_struct_type = jagged_int_var.offsets_struct_type()
     total_length = jagged_int_var.total_length()
 
@@ -432,12 +267,12 @@ def dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
             input_shape=x.shape(),
             output_shape=y.shape(),
         ),
-        dynamic_dims_decl=_gen_dynamic_dim_str(
+        dynamic_dims_decl=gen_dynamic_dim_str(
             index_type=backend_spec.index_type,
             dynamic_dims=dynamic_dims,
             has_type=True,
         ),
-        dynamic_dims_call=_gen_dynamic_dim_str(
+        dynamic_dims_call=gen_dynamic_dim_str(
             index_type=backend_spec.index_type,
             dynamic_dims=dynamic_dims,
             has_type=False,
@@ -446,9 +281,9 @@ def dense_to_jagged_gen_function(func_attrs: Dict[str, Any]) -> str:
     )
 
 
-@registry.reg("cuda.dense_to_jagged.func_decl")
-def dense_to_jagged_gen_function_decl(func_attrs) -> str:
-    """Generate dense_to_jagged function declaration."""
+@registry.reg("cuda.padded_dense_to_jagged.func_decl")
+def padded_dense_to_jagged_gen_function_decl(func_attrs) -> str:
+    """Generate padded_dense_to_jagged function declaration."""
 
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
@@ -461,20 +296,20 @@ def dense_to_jagged_gen_function_decl(func_attrs) -> str:
         index_type=backend_spec.index_type,
         func_name=func_name,
         num_offsets=len(jagged_int_var.jagged_dims()),
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             index_type=backend_spec.index_type,
-            dynamic_dims=_get_dynamic_dims(x, y),
+            dynamic_dims=get_dynamic_dims(x.shape(), y.shape()),
             has_type=True,
         ),
     )
 
 
-@registry.reg("cuda.dense_to_jagged.func_call")
-def dense_to_jagged_gen_function_call(
+@registry.reg("cuda.padded_dense_to_jagged.func_call")
+def padded_dense_to_jagged_gen_function_call(
     func_attrs,
     indent: str,
 ) -> str:
-    """Generate dense_to_jagged function call."""
+    """Generate padded_dense_to_jagged function call."""
 
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
@@ -495,9 +330,9 @@ def dense_to_jagged_gen_function_call(
         offsets_data_names=offsets_data_names,
         y=y._attrs["name"],
         x=x._attrs["name"],
-        dynamic_dims=_gen_dynamic_dim_str(
+        dynamic_dims=gen_dynamic_dim_str(
             index_type=backend_spec.index_type,
-            dynamic_dims=_get_dynamic_dims(x, y),
+            dynamic_dims=get_dynamic_dims(x.shape(), y.shape()),
             has_type=False,
         ),
         indent=indent,
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 147f30428..99131090f 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -106,9 +106,9 @@ def _mark_isolated_int_vars(sorted_graph: List[Tensor]):
     this function must be invoked right before the back-end
     code generation of the ops.
 
-    One example is the jagged_to_dense op that must set the
-    total_length dimension of the resulting jagged Tensor if
-    it hasn't been set from any of the model input's shape.
+    One example is the padded_dense_to_jagged op that must set
+    the total_length dimension of the resulting jagged Tensor
+    if it hasn't been set from any of the model input's shape.
     Another example is the make_jagged op that should set the
     batch_dim within the JaggedIntVar of the resulting jagged
     Tensor, unless it has been set already from the inputs.
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 04b2835f1..0cabdee7b 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -21,12 +21,12 @@
 from aitemplate.compiler.ops.tensor.chunk import chunk
 from aitemplate.compiler.ops.tensor.concatenate import concatenate
 from aitemplate.compiler.ops.tensor.concatenate_tanh import concatenate_tanh
-from aitemplate.compiler.ops.tensor.dense_to_jagged import dense_to_jagged
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.ops.tensor.expand import expand
 from aitemplate.compiler.ops.tensor.gather import gather
-from aitemplate.compiler.ops.tensor.jagged_to_dense import jagged_to_dense
+from aitemplate.compiler.ops.tensor.jagged_to_padded_dense import jagged_to_padded_dense
 from aitemplate.compiler.ops.tensor.masked_select import masked_select
+from aitemplate.compiler.ops.tensor.padded_dense_to_jagged import padded_dense_to_jagged
 from aitemplate.compiler.ops.tensor.permute import permute
 from aitemplate.compiler.ops.tensor.permute021 import permute021
 from aitemplate.compiler.ops.tensor.permute0213 import permute0213
diff --git a/python/aitemplate/compiler/ops/tensor/jagged_to_dense.py b/python/aitemplate/compiler/ops/tensor/jagged_to_padded_dense.py
similarity index 67%
rename from python/aitemplate/compiler/ops/tensor/jagged_to_dense.py
rename to python/aitemplate/compiler/ops/tensor/jagged_to_padded_dense.py
index aeb713f8d..a9dbf4aa9 100644
--- a/python/aitemplate/compiler/ops/tensor/jagged_to_dense.py
+++ b/python/aitemplate/compiler/ops/tensor/jagged_to_padded_dense.py
@@ -14,7 +14,7 @@
 #
 
 """
-Define jagged_to_dense op
+Define jagged_to_padded_dense op
 """
 import logging
 from typing import List
@@ -28,14 +28,24 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class jagged_to_dense(Operator):
+class jagged_to_padded_dense(Operator):
     """
-    Returns a tensor containing the dense format of the input jagged tensor.
+    Returns a dense Tensor "expanded" from the input jagged Tensor.
+    For each of the jagged dimensions (JaggedDims) in the jagged
+    Tensor's first dimension (JaggedIntVar), a separate static
+    dimension (IntImm) equal to the max_value of the jagged
+    dimension is created in the output dense Tensor's shape.
+
+    The values in the output dense Tensor that don't have corresponding
+    values in the input jagged Tensor are set to the padding_value.
+
     Args:
-        x (Tensor): input jagged tensor
-        padding_value (float): the padding value for elements out of jagged shape.
+        x (Tensor): input jagged Tensor.
+        padding_value (float): the padding value for the output dense
+            Tensor's elements that don't have counterparts in the input
+            jagged Tensor.
     Returns:
-        y: a tensor containing the dense format of input jagged tensor.
+        y (Tensor): a dense Tensor expanded from the input jagged Tensor x.
     """
 
     def __init__(
@@ -43,7 +53,7 @@ def __init__(
         padding_value: float = 0,
     ):
         super().__init__()
-        self._attrs["op"] = "jagged_to_dense"
+        self._attrs["op"] = "jagged_to_padded_dense"
         self._attrs["padding_value"] = padding_value
 
     def _infer_shape(self, x: Tensor) -> List[IntVar]:
@@ -64,9 +74,7 @@ def __call__(
         x: Tensor,
     ) -> Tensor:
         if not x.is_jagged():
-            raise RuntimeError(
-                "Input tensor x is expected to be jagged, but actually dense for jagged_to_dense."
-            )
+            raise RuntimeError("Input tensor x must be jagged.")
 
         self._attrs["inputs"] = [x]
         self._set_depth()
diff --git a/python/aitemplate/compiler/ops/tensor/dense_to_jagged.py b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
similarity index 90%
rename from python/aitemplate/compiler/ops/tensor/dense_to_jagged.py
rename to python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
index 422183b37..95bec33eb 100644
--- a/python/aitemplate/compiler/ops/tensor/dense_to_jagged.py
+++ b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
@@ -14,7 +14,7 @@
 #
 
 """
-The front-end definition of the dense_to_jagged op.
+The front-end definition of the padded_dense_to_jagged op.
 """
 from typing import List
 
@@ -24,7 +24,7 @@
 from aitemplate.compiler.ops import make_jagged
 
 
-class dense_to_jagged(Operator):
+class padded_dense_to_jagged(Operator):
     """
     Returns a jagged Tensor "extracted" from the input dense Tensor,
     given the offsets list. The resulting jagged Tensor contains the
@@ -49,7 +49,7 @@ def __init__(
             )
 
         super().__init__()
-        self._attrs["op"] = "dense_to_jagged"
+        self._attrs["op"] = "padded_dense_to_jagged"
         self._attrs["total_length"] = total_length
 
     def _infer_shape(
@@ -102,13 +102,13 @@ def __call__(
         source = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
         self._attrs["outputs"] = [source]
 
-        # in the AIT graph, the output of the dense_to_jagged op is set to the
-        # source Tensor, which is still not a jagged Tensor. The source Tensor
+        # in the AIT graph, the output of the padded_dense_to_jagged op is set to
+        # the source Tensor, which is still not a jagged Tensor. The source Tensor
         # is passed through the make_jagged op to obtain the jagged Tensor returned
         # from the __call__: this way, the chain of ops in the graph looks like:
         #
-        #      x --> dense_to_jagged --> source --> make_jagged --> y
-        #                    \------ offsets_list -------/
+        #      x --> padded_dense_to_jagged --> source --> make_jagged --> y
+        #                    \--------- offsets_list ----------/
 
         # the resulting jagged Tensor
         jagged_output = make_jagged(
@@ -124,7 +124,7 @@ def __call__(
 
         # we keep the resulting jagged Tensor's JaggedIntVar around,
         # as we'll need it for the back-end code generation of the
-        # dense_to_jagged op
+        # padded_dense_to_jagged op
         self._attrs["jagged_int_var"] = jagged_output._attrs["shape"][0]
 
         return jagged_output
diff --git a/tests/unittest/ops/test_gemm_permute.py b/tests/unittest/ops/test_gemm_permute.py
index e4dad2d49..0c9b96453 100644
--- a/tests/unittest/ops/test_gemm_permute.py
+++ b/tests/unittest/ops/test_gemm_permute.py
@@ -27,6 +27,10 @@
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class GEMMPermuteTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -264,10 +268,6 @@ def test_rrr(self):
             copy_op=True,
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
     def test_permute_float32(self):
         for has_bias in (True, False):
             self._test_rcr(
@@ -298,10 +298,6 @@ def test_permute_float32(self):
             dtype="float32",
         )
 
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
     def test_gemm_permute_bfloat16(self):
         for has_bias in (True, False):
             self._test_rcr(
diff --git a/tests/unittest/ops/test_jagged_to_dense.py b/tests/unittest/ops/test_jagged_to_padded_dense.py
similarity index 82%
rename from tests/unittest/ops/test_jagged_to_dense.py
rename to tests/unittest/ops/test_jagged_to_padded_dense.py
index bebd50094..cf774ff23 100644
--- a/tests/unittest/ops/test_jagged_to_dense.py
+++ b/tests/unittest/ops/test_jagged_to_padded_dense.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-Unittests for jagged Convert Operator.
+Unittests for the jagged_to_padded_dense op.
 """
 
 import json
@@ -22,7 +22,7 @@
 import unittest
 from typing import List
 
-import aitemplate.testing.jagged_utils as jagged_utils_ref
+import aitemplate.testing.jagged_utils as jagged_utils
 
 import torch
 
@@ -35,8 +35,8 @@
 from parameterized import param, parameterized
 
 
-class JaggedToDenseTestCase(unittest.TestCase):
-    def _test_jagged_to_dense(
+class JaggedToPaddedDenseTestCase(unittest.TestCase):
+    def _test_jagged_to_padded_dense(
         self,
         jagged_max_shape: List[int],
         offsets_list: List[List[int]],
@@ -89,7 +89,7 @@ def _test_jagged_to_dense(
             jagged_dims=jagged_dims,
         )(SOURCE, OFFSETS_LIST)
 
-        RESULT = ops.jagged_to_dense(padding_value=padding_value)(JAGGED)
+        RESULT = ops.jagged_to_padded_dense(padding_value=padding_value)(JAGGED)
 
         RESULT._attrs["name"] = "result"
         RESULT._attrs["is_output"] = True
@@ -102,7 +102,7 @@ def _test_jagged_to_dense(
             [RESULT],
             detect_target(),
             "./tmp",
-            f"test_jagged_to_dense_{test_suffix}",
+            f"test_jagged_to_padded_dense_{test_suffix}",
         )
 
         torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
@@ -111,7 +111,7 @@ def _test_jagged_to_dense(
             for i, offsets in enumerate(offsets_list)
         }
         source_pt = get_random_torch_tensor(jagged_input_shape, dtype)
-        result_pt = jagged_utils_ref.jagged_to_dense(
+        result_pt = jagged_utils.jagged_to_dense(
             jagged=source_pt,
             offsets_list=list(offsets_pt.values()),
             dense_shape=jagged_max_shape,
@@ -130,16 +130,12 @@ def _test_jagged_to_dense(
             param(2, "int32", [4, 3, 4], "float16", 1e2),
             param(3, "int32", [4, 3, 2], "float16", 0.0),
             param(4, "int32", [4, 3, 1], "float16", 1e2),
-            param(5, "int32", [4, 3, 8], "bfloat16", 0.0),
-            param(6, "int32", [4, 3, 4], "bfloat16", 1e2),
-            param(7, "int32", [4, 3, 2], "bfloat16", 0.0),
-            param(8, "int32", [4, 3, 1], "bfloat16", 1e2),
-            param(9, "int64", [4, 3, 4], "float32", 0.0),
-            param(10, "int64", [4, 3, 2], "float32", 1e5),
-            param(11, "int64", [4, 3, 1], "float32", 1e5),
+            param(5, "int64", [4, 3, 4], "float32", 0.0),
+            param(6, "int64", [4, 3, 2], "float32", 1e5),
+            param(7, "int64", [4, 3, 1], "float32", 1e5),
         ]
     )
-    def test_jagged_to_dense_single_offsets(
+    def test_jagged_to_padded_dense_single_offsets(
         self,
         i,
         offsets_dtype,
@@ -147,7 +143,7 @@ def test_jagged_to_dense_single_offsets(
         dtype,
         padding_value,
     ):
-        self._test_jagged_to_dense(
+        self._test_jagged_to_padded_dense(
             jagged_max_shape=jagged_max_shape,
             offsets_list=[[0, 1, 4, 6, 7]],
             dtype=dtype,
@@ -162,16 +158,12 @@ def test_jagged_to_dense_single_offsets(
             param(2, "int32", [3, 4, 5, 150, 1, 4], "float16", 1e2),
             param(3, "int32", [3, 4, 5, 150, 3, 2], "float16", 0.0),
             param(4, "int32", [3, 4, 5, 150, 1, 1], "float16", 1e2),
-            param(5, "int32", [3, 4, 5, 150, 1, 8], "bfloat16", 0.0),
-            param(6, "int32", [3, 4, 5, 150, 3, 4], "bfloat16", 1e2),
-            param(7, "int32", [3, 4, 5, 150, 1, 2], "bfloat16", 0.0),
-            param(8, "int32", [3, 4, 5, 150, 3, 1], "bfloat16", 1e2),
-            param(9, "int64", [3, 4, 5, 150, 1, 4], "float32", 0.0),
-            param(10, "int64", [3, 4, 5, 150, 3, 2], "float32", 1e5),
-            param(11, "int64", [3, 4, 5, 150, 3, 1], "float32", 1e5),
+            param(5, "int64", [3, 4, 5, 150, 1, 4], "float32", 0.0),
+            param(6, "int64", [3, 4, 5, 150, 3, 2], "float32", 1e5),
+            param(7, "int64", [3, 4, 5, 150, 3, 1], "float32", 1e5),
         ]
     )
-    def test_jagged_to_dense_multiple_offsets(
+    def test_jagged_to_padded_dense_multiple_offsets(
         self,
         i,
         offsets_dtype,
@@ -179,7 +171,7 @@ def test_jagged_to_dense_multiple_offsets(
         dtype,
         padding_value,
     ):
-        self._test_jagged_to_dense(
+        self._test_jagged_to_padded_dense(
             jagged_max_shape=jagged_max_shape,
             offsets_list=[
                 [0, 1, 3, 5],
@@ -192,7 +184,7 @@ def test_jagged_to_dense_multiple_offsets(
             padding_value=padding_value,
         )
 
-    def _benchmark_jagged_to_dense(
+    def _benchmark_jagged_to_padded_dense(
         self,
         B: int,
         N: int,
@@ -232,7 +224,7 @@ def _benchmark_jagged_to_dense(
             jagged_dims=[jagged_dim],
         )(SOURCE, OFFSETS_LIST)
 
-        RESULT = ops.jagged_to_dense()(JAGGED)
+        RESULT = ops.jagged_to_padded_dense()(JAGGED)
 
         RESULT._attrs["name"] = "result"
         RESULT._attrs["is_output"] = True
@@ -241,13 +233,13 @@ def _benchmark_jagged_to_dense(
             [RESULT],
             detect_target(),
             "./tmp",
-            f"benchmark_jagged_to_dense_{test_suffix}",
+            f"benchmark_jagged_to_padded_dense_{test_suffix}",
         )
 
         random.seed(0)
         load_factors = [i / 20 for i in range(1, 21)]
         offset_tensors = [
-            jagged_utils_ref.generate_offsets(
+            jagged_utils.generate_offsets(
                 batch_size=B,
                 max_seq_len=N,
                 load_factor=load_factor,
@@ -275,13 +267,13 @@ def _benchmark_jagged_to_dense(
                     filename=f.name,
                 )
                 profiling_data = json.loads(f.read())
-                jagged_to_dense_records = [
+                jagged_to_padded_dense_records = [
                     profiling_data[func_name]
                     for func_name in profiling_data
-                    if func_name.startswith("jagged_to_dense")
+                    if func_name.startswith("jagged_to_padded_dense")
                 ]
-                assert len(jagged_to_dense_records) == 1
-                runtime_ms = jagged_to_dense_records[0]["ms_per_iter"]
+                assert len(jagged_to_padded_dense_records) == 1
+                runtime_ms = jagged_to_padded_dense_records[0]["ms_per_iter"]
 
             jagged_item = total_length * D  # total items to read: the jagged volume
             dense_item = B * N * D  # total items to write: the dense volume
@@ -302,8 +294,8 @@ def _benchmark_jagged_to_dense(
                 f"bandwidth: {round(bandwidth, 3)} GB/s"
             )
 
-    def _test_benchmark_jagged_to_dense(self):
-        self._benchmark_jagged_to_dense(
+    def _test_benchmark_jagged_to_padded_dense(self):
+        self._benchmark_jagged_to_padded_dense(
             B=1024,
             N=260,
             D=256,
diff --git a/tests/unittest/ops/test_dense_to_jagged.py b/tests/unittest/ops/test_padded_dense_to_jagged.py
similarity index 89%
rename from tests/unittest/ops/test_dense_to_jagged.py
rename to tests/unittest/ops/test_padded_dense_to_jagged.py
index 4609d013d..7951551cd 100644
--- a/tests/unittest/ops/test_dense_to_jagged.py
+++ b/tests/unittest/ops/test_padded_dense_to_jagged.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-Unittests for the jagged_to_dense op.
+Unittests for the padded_dense_to_jagged op.
 """
 
 import json
@@ -22,7 +22,7 @@
 import unittest
 from typing import List
 
-import aitemplate.testing.jagged_utils as jagged_utils_ref
+import aitemplate.testing.jagged_utils as jagged_utils
 
 import torch
 
@@ -39,8 +39,8 @@
 from parameterized import param, parameterized
 
 
-class DenseToJaggedTestCase(unittest.TestCase):
-    def _test_dense_to_jagged(
+class PaddedDenseToJaggedTestCase(unittest.TestCase):
+    def _test_padded_dense_to_jagged(
         self,
         jagged_max_shape: List[int],
         offsets_list: List[List[int]],
@@ -95,7 +95,7 @@ def _test_dense_to_jagged(
             is_input=True,
         )
 
-        JAGGED = ops.dense_to_jagged(total_length=total_length_dim)(
+        JAGGED = ops.padded_dense_to_jagged(total_length=total_length_dim)(
             x=DENSE,
             offsets_list=OFFSETS_LIST,
         )
@@ -118,7 +118,7 @@ def _test_dense_to_jagged(
             [RESULT],
             detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
             "./tmp",
-            f"test_dense_to_jagged_{test_suffix}",
+            f"test_padded_dense_to_jagged_{test_suffix}",
         )
 
         torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
@@ -127,7 +127,7 @@ def _test_dense_to_jagged(
             for i, offsets in enumerate(offsets_list)
         }
         dense_pt = get_random_torch_tensor(jagged_max_shape, dtype)
-        result_pt = jagged_utils_ref.dense_to_jagged(
+        result_pt = jagged_utils.dense_to_jagged(
             dense=dense_pt,
             offsets_list=list(offsets_pt.values()),
         )
@@ -151,7 +151,7 @@ def _test_dense_to_jagged(
             param(7, "int64", [4, 3, 1], "float32"),
         ]
     )
-    def test_dense_to_jagged_single_offsets(
+    def test_padded_dense_to_jagged_single_offsets(
         self,
         i,
         offsets_dtype,
@@ -159,7 +159,7 @@ def test_dense_to_jagged_single_offsets(
         dtype,
     ):
         for use_jagged_space_indexing in [False, True]:
-            self._test_dense_to_jagged(
+            self._test_padded_dense_to_jagged(
                 jagged_max_shape=jagged_max_shape,
                 offsets_list=[[0, 1, 4, 6, 7]],
                 dtype=dtype,
@@ -179,7 +179,7 @@ def test_dense_to_jagged_single_offsets(
             param(7, "int64", [3, 4, 5, 150, 3, 1], "float32"),
         ]
     )
-    def test_dense_to_jagged_multiple_offsets(
+    def test_padded_dense_to_jagged_multiple_offsets(
         self,
         i,
         offsets_dtype,
@@ -187,7 +187,7 @@ def test_dense_to_jagged_multiple_offsets(
         dtype,
     ):
         for use_jagged_space_indexing in [False, True]:
-            self._test_dense_to_jagged(
+            self._test_padded_dense_to_jagged(
                 jagged_max_shape=jagged_max_shape,
                 offsets_list=[
                     [0, 1, 3, 5],
@@ -200,7 +200,7 @@ def test_dense_to_jagged_multiple_offsets(
                 test_suffix=f"multiple_offsets_{dtype}_{i}",
             )
 
-    def _benchmark_dense_to_jagged(
+    def _benchmark_padded_dense_to_jagged(
         self,
         B: int,
         N: int,
@@ -238,7 +238,7 @@ def _benchmark_dense_to_jagged(
             )
         ]
 
-        JAGGED = ops.dense_to_jagged(total_length=total_length_dim)(
+        JAGGED = ops.padded_dense_to_jagged(total_length=total_length_dim)(
             x=DENSE,
             offsets_list=OFFSETS_LIST,
         )
@@ -269,13 +269,13 @@ def _benchmark_dense_to_jagged(
             [RESULT],
             detect_target(use_jagged_space_indexing=use_jagged_space_indexing),
             "./tmp",
-            f"benchmark_dense_to_jagged_{test_suffix}",
+            f"benchmark_padded_dense_to_jagged_{test_suffix}",
         )
 
         random.seed(0)
         load_factors = [i / 20 for i in range(1, 21)]
         offset_tensors = [
-            jagged_utils_ref.generate_offsets(
+            jagged_utils.generate_offsets(
                 batch_size=B,
                 max_seq_len=N,
                 load_factor=load_factor,
@@ -301,13 +301,13 @@ def _benchmark_dense_to_jagged(
                     filename=f.name,
                 )
                 profiling_data = json.loads(f.read())
-                dense_to_jagged_records = [
+                padded_dense_to_jagged_records = [
                     profiling_data[func_name]
                     for func_name in profiling_data
-                    if func_name.startswith("dense_to_jagged")
+                    if func_name.startswith("padded_dense_to_jagged")
                 ]
-                assert len(dense_to_jagged_records) == 1
-                runtime_ms = dense_to_jagged_records[0]["ms_per_iter"]
+                assert len(padded_dense_to_jagged_records) == 1
+                runtime_ms = padded_dense_to_jagged_records[0]["ms_per_iter"]
 
             dense_item = total_length * D  # total items to read: the jagged volume
             jagged_item = total_length * D  # total items to read: the jagged volume
@@ -328,8 +328,8 @@ def _benchmark_dense_to_jagged(
                 f"bandwidth: {round(bandwidth, 3)} GB/s"
             )
 
-    def _test_benchmark_dense_to_jagged(self):
-        self._benchmark_dense_to_jagged(
+    def _test_benchmark_padded_dense_to_jagged(self):
+        self._benchmark_padded_dense_to_jagged(
             B=1024,
             N=260,
             D=256,
@@ -337,7 +337,7 @@ def _test_benchmark_dense_to_jagged(self):
             offsets_dtype="int32",
             use_jagged_space_indexing=False,
             isolated_total_length=True,
-            test_suffix="esuhm",
+            test_suffix="benchmark",
         )
 
 
From 2ad553c307f132e8955749b8dcbf31ddacbcac4b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 15 Mar 2023 18:33:24 -0700
Subject: [PATCH 282/638] Set batch_dim in make_jagged if isolated (#436)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/436

Previously, the `batch_dim` of the `JaggedIntVar` was set in the `make_jagged` back-end function when the value of `batch_dim` was zero. When the `batch_dim` is isolated (i.e., not set from any of the input shapes), this may be problematic when multiple batches with different batch size enter the same compiled model as inputs: the first batch will set the value of `batch_dim` by its size, but the very next batch with different size will trigger an error (because the value of `batch_dim` is no longer zero).

In this diff, the logic of setting the `batch_dim` value is changed to rely on the property of the symbolic graph (whether the dimension is isolated or not) instead of the rumtime (whether the value is zero). For a compiled model, this former property doesn't change in the runtime. Hence consecutive batches with different sizes can rewrite the value of of the `batch_dim` dimension in the runtime without triggering an error.

Reviewed By: chenyang78

Differential Revision: D44113704

fbshipit-source-id: bd4c6d350ff9c6955af9c8c5005ccf39ba651a46
---
 .../backend/cuda/view_ops/make_jagged.py      | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index 0fc06ec2f..f09d8c79e 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -146,13 +146,16 @@
     offsets.data[{{idx}}] = reinterpret_cast<const {{offsets_type}}*>(offsets_data_{{idx}});
 {% endfor %}
 
-    if (*batch_dim == 0) {
-      // batch_dim must be set by this code
-      *batch_dim = offsets.lengths[0] - 1;
-    } else if (*batch_dim != offsets.lengths[0] - 1) {
-      // batch_dim must have been set before this code
-      throw std::runtime_error("batch_dim != len(offsets[0]) - 1");
+{% if isolated_batch_dim %}
+    // batch_dim is not present in any input shape
+    // we should set it here from the offsets length
+    *batch_dim = offsets.lengths[0] - 1;
+{% else %}
+    if (*batch_dim != offsets.lengths[0] - 1) {
+        // batch_dim must have been set before this code
+        throw std::runtime_error("batch_dim != len(offsets[0]) - 1");
     }
+{% endif %}
 
     int64_t max_offset_length = 0;
     for (int i = 0; i < {{num_offsets}}; ++i) {
@@ -225,6 +228,9 @@ def make_jagged_gen_function(func_attrs):
     jagged_dim_min_values = [dim.min_value() for dim in jagged_int_var.jagged_dims()]
     jagged_dim_max_values = [dim.max_value() for dim in jagged_int_var.jagged_dims()]
 
+    batch_dim = jagged_int_var.batch_dim()
+    isolated_batch_dim = batch_dim._attrs.get("isolated", False)
+
     return SRC_TEMPLATE.render(
         func_name=func_name,
         num_offsets=len(offsets_list),
@@ -232,6 +238,7 @@ def make_jagged_gen_function(func_attrs):
         jagged_dim_min_values=jagged_dim_min_values,
         jagged_dim_max_values=jagged_dim_max_values,
         offsets_type=jagged_int_var.offsets_type(),
+        isolated_batch_dim=isolated_batch_dim,
     )
 
 
From 6a6222f1954e3e9cf0b59e8c516f2deabe2fd9dd Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 16 Mar 2023 09:41:30 -0700
Subject: [PATCH 283/638] make input accessors for all inputs for
 perm021_fc_ccr_bias (#437)

Summary:
This changes make the op have the consistent behavior with all other ops with input accessors, i.e. we have an input accessor for each input. It would simplify our input-accessor-related passes because we don't have to check the input index for accessing input accessors.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/437

Reviewed By: wushirong

Differential Revision: D44123947

Pulled By: chenyang78

fbshipit-source-id: 0dbe9044aa2149c93407f6d2f48dae2b16567bf5
---
 .../compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
index f9e05c116..c57c3d0d1 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -55,7 +55,9 @@ def __init__(self, layout="021"):
     def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         a, b = self._align_ab(a, b)
         self._attrs["inputs"] = [a, b, bias]
-        self._attrs["input_accessors"] = [TensorAccessor(a), TensorAccessor(b)]
+        self._attrs["input_accessors"] = [
+            TensorAccessor(tensor) for tensor in self._attrs["inputs"]
+        ]
         self._set_depth()
         self._sanity_check(a, b)
         output_shape = self._infer_shapes(a, b, bias)

From d3722e41652218b20338e7988dfdfc675b23b558 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 15:59:39 -0700
Subject: [PATCH 284/638] Update ait_ci.yml

Switch from rocm to facebook repo.
---
 .github/workflows/ait_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/ait_ci.yml
index c111bb404..89e04547d 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/ait_ci.yml
@@ -33,7 +33,7 @@ jobs:
         rocm-smi
         rocminfo | grep "gfx"
         export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
-        git clone --recursive -b $GIT_BRANCH https://github.com/ROCmSoftwarePlatform/AITemplate.git
+        git clone --recursive -b $GIT_BRANCH https://github.com/facebookincubator/AITemplate.git
         cd AITemplate
         DOCKER_BUILDKIT=1 ./docker/build.sh rocm
         docker run --network=host --device=/dev/kfd --device=/dev/dri --ipc=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $HOME:/dockerx/ ait:latest

From f2452a88a7e41b2d8d7d6b7e6504a8e3cfe58e69 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 17 Mar 2023 13:58:35 +0800
Subject: [PATCH 285/638] enable rocm ci

---
 .github/workflows/{ait_ci.yml => rocm_ci.yml} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename .github/workflows/{ait_ci.yml => rocm_ci.yml} (99%)

diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/rocm_ci.yml
similarity index 99%
rename from .github/workflows/ait_ci.yml
rename to .github/workflows/rocm_ci.yml
index 89e04547d..92219871d 100644
--- a/.github/workflows/ait_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -1,4 +1,4 @@
-name: AITemplate_ci
+name: ROCM_CI
 
 on:
   push:

From 550900a8370cf53e8933fafe6c7ac8891de6fc28 Mon Sep 17 00:00:00 2001
From: Yanxing-Shi <shiyanxing2008@qq.com>
Date: Fri, 17 Mar 2023 08:33:57 +0000
Subject: [PATCH 286/638] fix fx2ait bug

---
 .../compiler/transform/transform_strided_op_and_view_op.py     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index d97e07e12..64e6ab085 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -83,8 +83,7 @@ def _fuse_strided_op_and_view_op_single_pass(
                     tensor._attrs["is_view_of"] = None
                     src_op._attrs["outputs"][idx] = tensor
                     tensor._attrs["src_ops"] = StableSet({src_op})
-                    for view_op_input in view_op._attrs["inputs"]:
-                        transform_utils.remove_tensor_from_sorted_graph(view_op_input)
+                    transform_utils.remove_tensor_from_sorted_graph(view_input_tensor)
                     break
             assert (
                 found_tensor

From f1c2a62919f42801cd50d5912e63d6b3af62bc2d Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Fri, 17 Mar 2023 05:12:44 -0700
Subject: [PATCH 287/638] Replace cudaMemcpy with async copy + stream sync in
 masked_select (#438)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/438

`masked_select` op needs to copy the number of non-masked elements from device to host, since it determines the shape of the output. This causes a CUDA sync.
By using `cudaMemcpyAsync` + `cudaStreamSynchronize` we only synchronize a given stream instead of the whole device. #Thanks frank-wei, ipiszy for the idea.

Also factored out CUDA status checks into a separate macro to avoid duplicate code.

Reviewed By: ipiszy, chenyang78

Differential Revision: D44129126

fbshipit-source-id: b1eff02734b07958083cea5b417a603e2e89c930
---
 .../backend/cuda/tensor/masked_select.py      | 48 ++++++++++---------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/python/aitemplate/backend/cuda/tensor/masked_select.py b/python/aitemplate/backend/cuda/tensor/masked_select.py
index 534f5746c..abf053431 100644
--- a/python/aitemplate/backend/cuda/tensor/masked_select.py
+++ b/python/aitemplate/backend/cuda/tensor/masked_select.py
@@ -48,6 +48,18 @@
     """
 {{header_files}}
 
+#ifndef CUDA_CHECK_MASKED_SELECT
+#define CUDA_CHECK_MASKED_SELECT(expr, msg)                   \\
+  do {                                                        \\
+    cudaError_t status = (expr);                              \\
+    if (status != cudaSuccess) {                              \\
+        std::cerr << msg << " at " << __FILE__                \\
+                  << ": " << __LINE__ << std::endl;           \\
+        throw std::runtime_error(cudaGetErrorString(status)); \\
+    }                                                         \\
+  } while (0)
+#endif // CUDA_CHECK_MASKED_SELECT
+
 void {{func_name}}(
     {{input_type}}* output,
     const {{input_type}}* input,
@@ -80,44 +92,34 @@
     // Get needed temporary storage size and reallocate if necessary
     void* d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    cudaError_t err = cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, input, mask, output, num_nonmasked_device, num_elems, stream);
-    if (err != cudaSuccess) {
-        std::cerr << "Error when checking the required buffer size!" << std::endl;
-        throw std::runtime_error(cudaGetErrorString(err));
-    }
-    cudaStreamSynchronize(stream);
+    CUDA_CHECK_MASKED_SELECT(cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, input, mask, output, num_nonmasked_device, num_elems, stream),
+                             "Error when checking the required buffer size!");
+    CUDA_CHECK_MASKED_SELECT(cudaStreamSynchronize(stream), "Error when synchronizing the stream!");
+
     if (allocated_storage < temp_storage_bytes + NUM_NONMASKED_SIZE) {
         auto msg = "Got pre-allocated buffer of size " + std::to_string(allocated_storage) + ", but need " + std::to_string(temp_storage_bytes)
                 + ". Allocating a new buffer, expect performance degradation.";
         std::cerr << msg << std::endl;
         // Allocate temporary storage
         temp_storage_bytes += NUM_NONMASKED_SIZE;
-        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
-        if (err != cudaSuccess) {
-            std::cerr << "Error when trying to allocate a new buffer!" << std::endl;
-            throw std::runtime_error(cudaGetErrorString(err));
-        }
+        CUDA_CHECK_MASKED_SELECT(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream), "Error when trying to allocate a new buffer!");
+        CUDA_CHECK_MASKED_SELECT(cudaStreamSynchronize(stream), "Error when synchronizing the stream!");
         workspace = d_temp_storage;
         allocated_storage = temp_storage_bytes;
     }
     allocated_storage -= NUM_NONMASKED_SIZE;  // First NUM_NONMASKED_SIZE bytes are reserved
 
     // Select nonmasked elements. First NUM_NONMASKED_SIZE bytes of workspace are reserved for num_nonmasked_device
-    err = cub::DeviceSelect::Flagged(workspace + NUM_NONMASKED_SIZE, allocated_storage, input, mask, output,
-        num_nonmasked_device, num_elems, stream);
-    if (err != cudaSuccess) {
-        std::cerr << "Error when selecting nonmasked elements!" << std::endl;
-        throw std::runtime_error(cudaGetErrorString(err));
-    }
+    CUDA_CHECK_MASKED_SELECT(cub::DeviceSelect::Flagged(workspace + NUM_NONMASKED_SIZE, allocated_storage, input, mask, output,
+        num_nonmasked_device, num_elems, stream),  "Error when selecting nonmasked elements!");
 
     // Extract number of nonmasked elements (size of the output)
-    err = cudaMemcpy(num_nonmasked, num_nonmasked_device, NUM_NONMASKED_SIZE, cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess) {
-        std::cerr << "Error when copying the number of nonmasked elements from device to host!" << std::endl;
-        throw std::runtime_error(cudaGetErrorString(err));
-    }
+    CUDA_CHECK_MASKED_SELECT(cudaMemcpyAsync(num_nonmasked, num_nonmasked_device, NUM_NONMASKED_SIZE, cudaMemcpyDeviceToHost, stream),
+                             "Error when copying the number of nonmasked elements from device to host!");
+    CUDA_CHECK_MASKED_SELECT(cudaStreamSynchronize(stream), "Error when synchronizing the stream!");
+
     if (d_temp_storage != nullptr) {
-        cudaFree(d_temp_storage);
+        CUDA_CHECK_MASKED_SELECT(cudaFreeAsync(d_temp_storage, stream), "Error when freeing GPU memory allocated by masked_select!");
     }
 }
 """

From 7e4470e49456c5e19c26aa1b92e35f2beff1f755 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Fri, 17 Mar 2023 12:12:27 -0700
Subject: [PATCH 288/638] Add a back-to-back batched gemm kernel into AIT
 (#444)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/444

As the tile. The kernel is adapted from cutlass b2b gemm kernel.

Support `causal_mask(alpha1(activation(alpha0(A[B, M, K] @ B[B, N0, K]) + bias))) @ C[B, N0, N1]`

Reviewed By: rkindi

Differential Revision: D43865351

fbshipit-source-id: a300f75a819ad7f8344d5700131831e5b197cb69
---
 .../classic_b2b_bmm/device/b2b_batched_gemm.h | 416 +++++++++
 .../classic_b2b_bmm/kernel/b2b_batched_gemm.h | 407 ++++++++
 .../kernel/default_b2b_batched_gemm.h         | 222 +++++
 .../thread/linear_combination_triu.h          | 136 +++
 .../threadblock/b2b_mma_base.h                | 241 +++++
 .../threadblock/b2b_mma_multistage.h          | 874 ++++++++++++++++++
 .../threadblock/b2b_mma_pipelined.h           | 559 +++++++++++
 .../threadblock/default_b2b_mma.h             | 376 ++++++++
 .../default_gmem_to_accum_loader_tensor_op.h  | 201 ++++
 .../threadblock/gmem_to_accum_loader.h        | 361 ++++++++
 ...mem_to_accum_loader_shared_load_iterator.h | 274 ++++++
 ...accum_loader_fragment_iterator_tensor_op.h | 315 +++++++
 .../triu_mma_tensor_op_fragment_iterator.h    | 235 +++++
 13 files changed, 4617 insertions(+)
 create mode 100644 static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/thread/linear_combination_triu.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_base.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
 create mode 100644 static/include/kernels/classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h

diff --git a/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
new file mode 100644
index 000000000..55646bd44
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
@@ -0,0 +1,416 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+    This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+
+#include "classic_b2b_bmm/kernel/b2b_batched_gemm.h"
+#include "classic_b2b_bmm/kernel/default_b2b_batched_gemm.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0 = false,
+    /// Stage accumulator in shared memory
+    bool SmemAccumulator = false,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class B2bGemmBatched {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using LayoutB1 = LayoutB1_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape0 = ThreadblockShape0_;
+  using ThreadblockShape1 = ThreadblockShape1_;
+  using WarpShape0 = WarpShape0_;
+  using WarpShape1 = WarpShape1_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp0 = EpilogueOutputOp0_;
+  using EpilogueOutputOp1 = EpilogueOutputOp1_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp1::kCount;
+  static bool const kCausalMaskAfterGemm0 = CausalMaskAfterGemm0;
+
+  /// Derived types
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor;
+
+  /// Define the kernel
+  using B2bGemmBatchedKernel = typename kernel::DefaultB2bGemmBatched<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    LayoutB1,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    ThreadblockSwizzle,
+    kStages,
+    Operator,
+    CausalMaskAfterGemm0,
+    SmemAccumulator
+  >::B2bGemmBatchedKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size_0;
+    GemmCoord problem_size_1;
+    TensorRef<ElementA const, LayoutA> ref_A0;
+    int64_t stride_A0;
+    TensorRef<ElementB const, LayoutB> ref_B0;
+    int64_t stride_B0;
+    TensorRef<ElementC const, LayoutC> ref_C0;
+    int64_t stride_C0;
+    TensorRef<ElementB const, LayoutB1> ref_B1;
+    int64_t stride_B1;
+    TensorRef<ElementC const, LayoutC> ref_C1;
+    int64_t stride_C1;
+    TensorRef<ElementC, LayoutC> ref_D1;
+    int64_t stride_D1;
+    int batch_count;
+    typename EpilogueOutputOp0::Params epilogue0;
+    typename EpilogueOutputOp1::Params epilogue1;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() {
+
+    }
+
+    /// Constructs an Arguments structure
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_0_,
+      GemmCoord problem_size_1_,
+      TensorRef<ElementA const, LayoutA> ref_A0_,
+      int64_t stride_A0_,
+      TensorRef<ElementB const, LayoutB> ref_B0_,
+      int64_t stride_B0_,
+      TensorRef<ElementC const, LayoutC> ref_C0_,
+      int64_t stride_C0_,
+      TensorRef<ElementB const, LayoutB1> ref_B1_,
+      int64_t stride_B1_,
+      TensorRef<ElementC const, LayoutC> ref_C1_,
+      int64_t stride_C1_,
+      TensorRef<ElementC, LayoutC> ref_D1_,
+      int64_t stride_D1_,
+      int batch_count_,
+      typename EpilogueOutputOp0::Params epilogue0_ =
+        typename EpilogueOutputOp0::Params(),
+      typename EpilogueOutputOp1::Params epilogue1_ =
+        typename EpilogueOutputOp1::Params()
+    ):
+      problem_size_0(problem_size_0_),
+      problem_size_1(problem_size_1_),
+      ref_A0(ref_A0_),
+      stride_A0(stride_A0_),
+      ref_B0(ref_B0_),
+      stride_B0(stride_B0_),
+      ref_C0(ref_C0_),
+      stride_C0(stride_C0_),
+      ref_B1(ref_B1_),
+      stride_B1(stride_B1_),
+      ref_C1(ref_C1_),
+      stride_C1(stride_C1_),
+      ref_D1(ref_D1_),
+      stride_D1(stride_D1_),
+      batch_count(batch_count_),
+      epilogue0(epilogue0_),
+      epilogue1(epilogue1_) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename B2bGemmBatchedKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  B2bGemmBatched() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    Status status = B2bGemmBatchedKernel::can_implement(
+      args.problem_size_0,
+      args.problem_size_1,
+      args.ref_A0.non_const_ref(),
+      args.ref_B0.non_const_ref(),
+      args.ref_C0.non_const_ref(),
+      args.ref_B1.non_const_ref(),
+      args.ref_C1.non_const_ref(),
+      args.ref_D1
+    );
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    return 0;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size_0,
+      {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+      args.batch_count);
+
+    // Initialize the Params structure
+    params_ = typename B2bGemmBatchedKernel::Params{
+      args.problem_size_0,
+      args.problem_size_1,
+      grid_shape,
+      args.ref_A0.non_const_ref(),
+      args.stride_A0,
+      args.ref_B0.non_const_ref(),
+      args.stride_B0,
+      args.ref_C0.non_const_ref(),
+      args.stride_C0,
+      args.ref_B1.non_const_ref(),
+      args.stride_B1,
+      args.ref_C1.non_const_ref(),
+      args.stride_C1,
+      args.ref_D1,
+      args.stride_D1,
+      args.batch_count,
+      args.epilogue0,
+      args.epilogue1
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    params_.ref_A0.reset(args.ref_A0.non_const_ref().data());
+    params_.ref_B0.reset(args.ref_B0.non_const_ref().data());
+    params_.ref_C0.reset(args.ref_C0.non_const_ref().data());
+    params_.ref_B1.reset(args.ref_B1.non_const_ref().data());
+    params_.ref_C1.reset(args.ref_C1.non_const_ref().data());
+    params_.ref_D1.reset(args.ref_D1.data());
+    params_.output_op_0 = args.epilogue0;
+    params_.output_op_1 = args.epilogue1;
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(B2bGemmBatchedKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+
+    int smem_size = int(sizeof(typename B2bGemmBatchedKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<B2bGemmBatchedKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::Kernel<B2bGemmBatchedKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
new file mode 100644
index 000000000..10325a165
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
@@ -0,0 +1,407 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+    This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename B2bMma_,               ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  typename GmemToAccumLoader_
+>
+struct B2bGemmBatched {
+
+  using B2bMma = B2bMma_;
+  using Epilogue = Epilogue_;
+  using GmemToAccumLoader = GmemToAccumLoader_;
+  using OutputOp0 = typename B2bMma::OutputOp;
+  using OutputOp1 = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount0 = typename B2bMma::WarpCount0;
+  static int const kThreadCount = 32 * WarpCount0::kCount;
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size_0;
+    cutlass::gemm::GemmCoord problem_size_1;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    typename B2bMma::IteratorA0::Params params_A0;
+    typename B2bMma::IteratorA0::TensorRef ref_A0;
+    int64_t stride_A0;
+    typename B2bMma::IteratorB0::Params params_B0;
+    typename B2bMma::IteratorB0::TensorRef ref_B0;
+    int64_t stride_B0;
+    typename GmemToAccumLoader::OutputTileIterator::Params params_C0;
+    typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0;
+    int64_t stride_C0;
+    typename B2bMma::IteratorB1::Params params_B1;
+    typename B2bMma::IteratorB1::TensorRef ref_B1;
+    int64_t stride_B1;
+    typename Epilogue::OutputTileIterator::Params params_C1;
+    typename Epilogue::OutputTileIterator::TensorRef ref_C1;
+    int64_t stride_C1;
+    typename Epilogue::OutputTileIterator::Params params_D1;
+    typename Epilogue::OutputTileIterator::TensorRef ref_D1;
+    int64_t stride_D1;
+    int batch_count;
+    typename OutputOp0::Params output_op_0;
+    typename OutputOp1::Params output_op_1;
+    int gemm_k_iterations_0;
+    int gemm_k_iterations_1;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): swizzle_log_tile(0) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size_0,
+      cutlass::gemm::GemmCoord const & problem_size_1,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename B2bMma::IteratorA0::TensorRef ref_A0,
+      int64_t stride_A0,
+      typename B2bMma::IteratorB0::TensorRef ref_B0,
+      int64_t stride_B0,
+      typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0,
+      int64_t stride_C0,
+      typename B2bMma::IteratorB1::TensorRef ref_B1,
+      int64_t stride_B1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+      int64_t stride_C1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D1,
+      int64_t stride_D1,
+      int batch_count,
+      typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
+      typename OutputOp1::Params output_op_1 = typename OutputOp1::Params()
+    ):
+      problem_size_0(problem_size_0),
+      problem_size_1(problem_size_1),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A0(ref_A0.layout()),
+      ref_A0(ref_A0),
+      stride_A0(stride_A0),
+      params_B0(ref_B0.layout()),
+      ref_B0(ref_B0),
+      stride_B0(stride_B0),
+      params_C0(ref_C0.layout()),
+      ref_C0(ref_C0),
+      stride_C0(stride_C0),
+      params_B1(ref_B1.layout()),
+      ref_B1(ref_B1),
+      stride_B1(stride_B1),
+      params_C1(ref_C1.layout()),
+      ref_C1(ref_C1),
+      stride_C1(stride_C1),
+      params_D1(ref_D1.layout()),
+      ref_D1(ref_D1),
+      stride_D1(stride_D1),
+      output_op_0(output_op_0),
+      output_op_1(output_op_1),
+      batch_count(batch_count),
+      gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK),
+      gemm_k_iterations_1((problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK) {}
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename B2bMma::B2bMmaSharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+    typename GmemToAccumLoader::SharedStorage gmem_to_accum_loader;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  B2bGemmBatched() { }
+
+  /// Determines whether kernel satisfies alignment
+    static Status can_implement(
+      cutlass::gemm::GemmCoord const & problem_size_0,
+      cutlass::gemm::GemmCoord const & problem_size_1,
+      typename B2bMma::IteratorA0::TensorRef ref_A0,
+      typename B2bMma::IteratorB0::TensorRef ref_B0,
+      typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0,
+      typename B2bMma::IteratorB1::TensorRef ref_B1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D1) {
+
+    static int const kAlignmentA = B2bMma::IteratorA0::AccessType::kElements;
+    static int const kAlignmentB = B2bMma::IteratorB0::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if (!TensorRef_aligned(ref_A0, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B0, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C0, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B1, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C1, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D1, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if ((problem_size_0.m() % kAlignmentA) || (problem_size_0.k() % kAlignmentA) ||
+      (problem_size_0.n() % kAlignmentB) || (problem_size_0.k() % kAlignmentB) ||
+      (problem_size_0.m() % kAlignmentC) || (problem_size_0.n() % kAlignmentC) ||
+      (problem_size_1.m() % kAlignmentA) || (problem_size_1.k() % kAlignmentA) ||
+      (problem_size_1.n() % kAlignmentB) || (problem_size_1.k() % kAlignmentB) ||
+      (problem_size_1.m() % kAlignmentC) || (problem_size_1.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // Determine if fusion sizes are valid
+    if(problem_size_0.m() != problem_size_1.m())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() != problem_size_1.k())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() > B2bMma::Shape0::kN)
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_1.n() > B2bMma::Shape1::kN)
+      return Status::kErrorInvalidProblem;
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
+    for (int batch_idx = threadblock_swizzle.get_batch_idx(); batch_idx < params.batch_count; batch_idx += gridDim.z) {
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A0{
+        threadblock_tile_offset.m() * B2bMma::Shape0::kM,
+        0,
+      };
+
+      cutlass::MatrixCoord tb_offset_B0{
+        0,
+        threadblock_tile_offset.n() * B2bMma::Shape0::kN
+      };
+
+      cutlass::MatrixCoord tb_offset_B1{
+        0,
+        threadblock_tile_offset.n() * B2bMma::Shape1::kN
+      };
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Construct iterators to A and B operands
+      typename B2bMma::IteratorA0 iterator_A0(
+        params.params_A0,
+        params.ref_A0.data(),
+        params.problem_size_0.mk(),
+        thread_idx,
+        tb_offset_A0);
+
+      iterator_A0.add_pointer_offset(params.stride_A0 * batch_idx);
+
+      typename B2bMma::IteratorB0 iterator_B0(
+        params.params_B0,
+        params.ref_B0.data(),
+        params.problem_size_0.kn(),
+        thread_idx,
+        tb_offset_B0);
+
+      iterator_B0.add_pointer_offset(params.stride_B0 * batch_idx);
+
+      typename B2bMma::IteratorB1 iterator_B1(
+        params.params_B1,
+        params.ref_B1.data(),
+        params.problem_size_1.kn(),
+        thread_idx,
+        tb_offset_B1);
+
+      iterator_B1.add_pointer_offset(params.stride_B1 * batch_idx);
+
+
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+      int lane_idx = threadIdx.x % 32;
+
+      // assume identity swizzle
+      MatrixCoord tb_offset_C0(
+        threadblock_tile_offset.m() * B2bMma::Shape0::kM,
+        threadblock_tile_offset.n() * B2bMma::Shape0::kN
+      );
+
+      // Tile iterator loading from source tensor.
+      typename GmemToAccumLoader::OutputTileIterator iterator_C0(
+        params.params_C0,
+        params.ref_C0.data(),
+        params.problem_size_0.mn(),
+        thread_idx,
+        tb_offset_C0
+      );
+
+      iterator_C0.add_pointer_offset(params.stride_C0 * batch_idx);
+
+
+      //
+      // Main loop
+      //
+
+      OutputOp0 output_op_0(params.output_op_0);
+
+      // Construct thread-scoped matrix multiply
+      B2bMma b2bMma(shared_storage.main_loop, shared_storage.gmem_to_accum_loader, thread_idx, warp_idx, lane_idx, params.problem_size_0.n());
+
+      typename B2bMma::FragmentC0 src_accum;
+      typename B2bMma::FragmentC1 accumulators;
+
+      src_accum.clear();
+      accumulators.clear();
+
+      // Compute threadblock-scoped matrix multiply-add
+      b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_C0,
+        iterator_B1, src_accum, output_op_0);
+
+      //
+      // Epilogue
+      //
+
+      OutputOp1 output_op_1(params.output_op_1);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      threadblock_tile_offset =
+          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      //assume identity swizzle
+      MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * B2bMma::Shape1::kM,
+        threadblock_tile_offset.n() * B2bMma::Shape1::kN
+      );
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C1(
+        params.params_C1,
+        params.ref_C1.data(),
+        params.problem_size_1.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      iterator_C1.add_pointer_offset(params.stride_C1 * batch_idx);
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D1(
+        params.params_D1,
+        params.ref_D1.data(),
+        params.problem_size_1.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      iterator_D1.add_pointer_offset(params.stride_D1 * batch_idx);
+
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
new file mode 100644
index 000000000..6edb43260
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
@@ -0,0 +1,222 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+      This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "classic_b2b_bmm/kernel/b2b_batched_gemm.h"
+#include "classic_b2b_bmm/threadblock/default_b2b_mma.h"
+#include "classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Layout type for B1 matrix operand
+  typename LayoutB1_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape0,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape0,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp0,
+  /// Epilogue output operator
+  typename EpilogueOutputOp1,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Apply upper triangular causal mask after first gemm
+  bool CausalMaskAfterGemm0 = false,
+  /// Stage accumulator in shared memory
+  bool SmemAccumulator = false
+>
+struct DefaultB2bGemmBatched;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0>
+struct DefaultB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape0, ThreadblockShape1,
+                   WarpShape0, WarpShape1, InstructionShape,
+                   EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages,
+                   Operator, CausalMaskAfterGemm0> {
+
+  // TODO: Make pipelined (i.e. stages == 2) work.
+  static_assert((Stages >= 3), "Currently, only multistage is supported (not pipelined).");
+
+  // While we ought to debug it, the warp shape M restriction is not considered
+  // high-priority as we do not want to make warp M much larger anyway.
+  static_assert(
+    !CausalMaskAfterGemm0 || (WarpShape0::kM == 16),
+    "Currently, causal mask is only supported with warp shape M of 16."
+  );
+
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+      InstructionShape, Stages, Operator, CausalMaskAfterGemm0, EpilogueOutputOp0>::ThreadblockB2bMma;
+
+  static const int kPartitionsK0 = ThreadblockShape0::kK / WarpShape0::kK;
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          EpilogueOutputOp1::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmBatchedKernel = kernel::B2bGemmBatched<B2bMma, Epilogue, ThreadblockSwizzle, typename B2bMma::GmemToAccumLoader>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/static/include/kernels/classic_b2b_bmm/thread/linear_combination_triu.h b/static/include/kernels/classic_b2b_bmm/thread/linear_combination_triu.h
new file mode 100644
index 000000000..b513f958a
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/thread/linear_combination_triu.h
@@ -0,0 +1,136 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include <cutlass/half.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+
+namespace thread {
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  int ThreadBlockShapeM,
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationTriu {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  static int const kThreadBlockShapeM = ThreadBlockShapeM;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = false;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationTriu() {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return false;
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, int index, int n, int m) const {
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate = converted_accumulator;
+
+    for (int i = 0; i < kCount; i++) {
+
+      int row = (
+        (kThreadBlockShapeM * blockIdx.x) +
+        (16 * (threadIdx.x / 32)) +
+        (8 * (i / 2)) +
+        ((threadIdx.x % 32) / 4)
+      );
+      int col = (
+        (16 * index) +
+        (8 * n) +
+        (2 * (threadIdx.x % 4)) +
+        (i % 2)
+      );
+
+      intermediate[i] = intermediate[i] * ElementCompute(row <= col);
+
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+} // namespace thread
+}
+}
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_base.h b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_base.h
new file mode 100644
index 000000000..d2460cce9
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_base.h
@@ -0,0 +1,241 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  using Shape1 = Shape1_;
+
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+  using Policy1 = Policy1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+  using Operator1 = typename Policy1::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm0 = typename Policy0::Operator::Shape;
+  using WarpGemm1 = typename Policy1::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount0 = GemmShape<Shape0::kM / WarpGemm0::kM,
+                               Shape0::kN / WarpGemm0::kN,
+                               Shape0::kK / WarpGemm0::kK>;
+  using WarpCount1 = GemmShape<Shape1::kM / WarpGemm1::kM,
+                               Shape1::kN / WarpGemm1::kN,
+                               Shape1::kK / WarpGemm1::kK>;
+
+  /// Number of warp-level GEMM oeprations
+  static int const kWarpGemmIterations0 =
+      (WarpGemm0::kK / Operator0::Policy::MmaShape::kK);
+  static int const kWarpGemmIterations1 =
+      (WarpGemm1::kK / Operator1::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  template<
+    typename Shape_,
+    typename Policy_
+  >
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+    using Shape = Shape_;
+    using Policy = Policy_;
+    using Operator = typename Policy::Operator;
+
+    /// Tensor reference to the A operand
+    using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+    /// Tensor reference to the B operand
+    using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+
+    /// Shape of the A matrix operand in shared memory
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix
+    CUTLASS_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+  using SharedStorage0 = SharedStorage<Shape0, Policy0>;
+  using SharedStorage1 = SharedStorage<Shape1, Policy1>;
+  union B2bMmaSharedStorage {
+    SharedStorage0 shared_storage0;
+    SharedStorage1 shared_storage1;
+  };
+
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A0 operand from shared memory
+  typename Operator0::IteratorA warp_tile_iterator_A0_;
+
+  /// Iterator to load a warp-scoped tile of B0 operand from shared memory
+  typename Operator0::IteratorB warp_tile_iterator_B0_;
+
+  /// Iterator to load a warp-scoped tile of B1 operand from shared memory
+  typename Operator1::IteratorB warp_tile_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      B2bMmaSharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      warp_tile_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), lane_idx),
+      warp_tile_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), lane_idx),
+      warp_tile_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), lane_idx) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
new file mode 100644
index 000000000..bbea76d44
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
@@ -0,0 +1,874 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "classic_b2b_bmm/threadblock/b2b_mma_base.h"
+#include "classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
+#include "classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA0_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA0_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA0,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB0_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB0_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB0,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile
+    //  (concept::MmaTensorOpFragmentIterator)
+    typename FragmentIteratorA1_,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: VectorIterator)
+    typename IteratorAccumulatorScaleBias_,
+    /// WarpIterator to load Scale or Bias vector from threadblock fragment
+    typename FragmentIteratorA1ScaleBias_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
+    typename OutputOp_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    bool CausalMaskAfterGemm0,
+    typename WarpShape0_,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bMmaMultistage :
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
+public:
+  ///< Base class
+  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA0 = IteratorA0_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB0 = IteratorB0_;
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over intermediate accumulator tile
+  using FragmentIteratorA1 = FragmentIteratorA1_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
+  ///< WarpIterator to load Scale or Bias vector from threadblock fragment
+  using FragmentIteratorA1ScaleBias = FragmentIteratorA1ScaleBias_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  ///< Epilogue after 1st Gemm
+  using OutputOp = OutputOp_;
+
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  static const int kPartitionsK0 = Shape0_::kK / WarpShape0_::kK;
+
+  using GmemToAccumLoader =
+      typename cutlass::epilogue::threadblock::DefaultGmemToAccumLoaderTensorOp<
+          Shape0_, Operator0, kPartitionsK0, OutputOp,
+          OutputOp::kCount>::GmemToAccumLoader;
+
+  using IteratorC0 = typename GmemToAccumLoader::OutputTileIterator;
+
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations0 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+    static_assert(Base::kWarpGemmIterations1 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const TBLDGSTSIterationsA0 =
+        IteratorA0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB0 =
+        IteratorB0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA0 =
+        (TBLDGSTSIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB0 =
+        (TBLDGSTSIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLDGSTSIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
+  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
+  using WarpLoadedFragmentA1ScaleBias =
+      typename FragmentIteratorA1ScaleBias::Fragment;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
+  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+  GmemToAccumLoader gmem_to_accum_loader;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::B2bMmaSharedStorage &shared_storage,
+      typename GmemToAccumLoader::SharedStorage &bias_add_shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx,
+      ///< GEMM0 N is used for accumulator extent
+      int problem_size_0_n
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+      smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+      smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx),
+      gmem_to_accum_loader(bias_add_shared_storage, thread_idx, warp_idx, lane_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
+    this->warp_tile_iterator_B0_.add_tile_offset(
+        {Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
+                              int group_start_A0 = 0, int group_start_B0 = 0) {
+    iterator_A0.set_iteration_index(group_start_A0 *
+                                   IteratorA0::kAccessesPerVector);
+    this->smem_iterator_A0_.set_iteration_index(group_start_A0);
+
+    // LDGSTS for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
+      if (group_start_A0 + j < Detail::TBLDGSTSIterationsA0) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+                              IteratorA0::ThreadMap::kElementsPerAccess /
+                              IteratorA0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, gmem_ptr, iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+    }
+
+    iterator_B0.set_iteration_index(group_start_B0 *
+                                   IteratorB0::kAccessesPerVector);
+    this->smem_iterator_B0_.set_iteration_index(group_start_B0);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
+      if (group_start_B0 + j < Detail::TBLDGSTSIterationsB0) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+                              IteratorB0::ThreadMap::kElementsPerAccess /
+                              IteratorB0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, gmem_ptr, iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+        ++this->smem_iterator_B0_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
+                              int group_start_B1 = 0) {
+    iterator_B1.set_iteration_index(group_start_B1 *
+                                   IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::TBLDGSTSIterationsB1) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+                              IteratorB1::ThreadMap::kElementsPerAccess /
+                              IteratorB1::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B1.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_0,
+      ///< destination accumulator tile
+      FragmentC1 &accum,
+      ///< iterator over A0 operand in global memory
+      IteratorA0 iterator_A0,
+      ///< iterator over B0 operand in global memory
+      IteratorB0 iterator_B0,
+      ///< iterator over C0 operand in global memory
+      IteratorC0 iterator_C0,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC0 const &src_accum,
+      ///< epilogue operation after 1st Gemm
+      OutputOp output_op_0)
+    {
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_0) {
+
+      iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+      iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+      iterator_A0.set_iteration_index(0);
+      this->smem_iterator_A0_.set_iteration_index(0);
+
+      // LDGSTS for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsA0; ++j) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA0::Element>::value *
+              IteratorA0::ThreadMap::kElementsPerAccess /
+              IteratorA0::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+
+      iterator_B0.set_iteration_index(0);
+      this->smem_iterator_B0_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB0; ++j) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB0::Element>::value *
+              IteratorB0::ThreadMap::kElementsPerAccess /
+              IteratorB0::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+
+        ++this->smem_iterator_B0_;
+      }
+
+      // Move to the next stage
+      iterator_A0.add_tile_offset({0, 1});
+      iterator_B0.add_tile_offset({1, 0});
+
+      this->smem_iterator_A0_.add_tile_offset({0, 1});
+      this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
+    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
+    WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
+    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
+
+    Operator0 warp_mma0;
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+    iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
+                       warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+
+        this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k > 0)
+          warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
+                             warp_transformed_frag_B0[warp_mma_k % 2],
+                             warp_loaded_frag_A0[warp_mma_k % 2],
+                             warp_loaded_frag_B0[warp_mma_k % 2]);
+
+        warp_mma0(
+          accum0,
+          warp_transformed_frag_A0[warp_mma_k % 2],
+          warp_transformed_frag_B0[warp_mma_k % 2],
+          accum0
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations0 - 1) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+
+          group_start_iteration_A0 = warp_mma_k * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 = warp_mma_k * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+          group_start_iteration_A0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A0.add_tile_offset({0, 1});
+          iterator_B0.add_tile_offset({1, 0});
+
+          this->smem_iterator_A0_.add_tile_offset({0, 1});
+          this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK *
+                        Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK *
+                     Base::kWarpGemmIterations0,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations_0;
+          iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+          iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
+          warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    // Apply bias add
+    gmem_to_accum_loader(output_op_0, accum0, iterator_C0);
+    __syncthreads();
+
+
+    // 2nd Gemm
+
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+    typename FragmentIteratorA1::OutputOp noop_output_op_0({}); // Is noop LinearCombination (see default_b2b_mma.h)
+    TriuMmaTensorOpFragmentIterator<FragmentIteratorA1, Shape0::kM> triu_warp_tile_iterator_A1_;
+
+    //
+    // Prologue
+    //
+    int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_1) {
+
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      iterator_B1.set_iteration_index(0);
+      this->smem_iterator_B1_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB1; ++j) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB1::Element>::value *
+              IteratorB1::ThreadMap::kElementsPerAccess /
+              IteratorB1::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+
+        ++this->smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+
+      this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], noop_output_op_0);
+    if (CausalMaskAfterGemm0) {
+      triu_warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+      ++triu_warp_tile_iterator_A1_;
+    }
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B1_;
+
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+    smem_write_stage_idx = Base::kStages - 1;
+    smem_read_stage_idx = 0;
+
+    warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
+                       warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
+
+    //
+    // Mainloop
+    //
+
+    gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1);
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+        // Load warp-level tile from accumulator fragment
+        warp_tile_iterator_A1_.load(
+            warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+            noop_output_op_0
+        );
+        if (CausalMaskAfterGemm0) {
+          triu_warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+          ++triu_warp_tile_iterator_A1_;
+        }
+        ++warp_tile_iterator_A1_;
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
+                             warp_transformed_frag_B1[warp_mma_k % 2],
+                             warp_loaded_frag_A1[warp_mma_k % 2],
+                             warp_loaded_frag_B1[warp_mma_k % 2]);
+
+
+        warp_mma1(
+          accum,
+          warp_transformed_frag_A1[warp_mma_k % 2],
+          warp_transformed_frag_B1[warp_mma_k % 2],
+          accum
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
new file mode 100644
index 000000000..d8ffe67ad
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
@@ -0,0 +1,559 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "classic_b2b_bmm/threadblock/b2b_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape0_,
+  /// Iterates over tiles of A operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA0_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA0_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB0_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB0_,
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape1_,
+  /// Iterates over the intermediate accumulator tile
+  //  (concept::MmaTensorOpFragmentIterator)
+  typename FragmentIteratorA1_,
+  /// Iterates over vectors of scale and bias vector in global memory
+  //  (concept: VectorIterator)
+  typename IteratorAccumulatorScaleBias_,
+  /// FragmentIterator to load Scale or Bias vector from threadblock fragment
+  typename FragmentIteratorA1ScaleBias_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB1_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB1_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Data type of accumulator matrix
+  typename LayoutC_,
+  /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
+  typename OutputOp_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy0_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy1_,
+  /// Transformation applied to A0 operand
+  typename TransformA0_ = NumericArrayConverter<
+    typename SmemIteratorA0_::Element,
+    typename IteratorA0_::Element,
+    IteratorA0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B0 operand
+  typename TransformB0_ = NumericArrayConverter<
+    typename SmemIteratorB0_::Element,
+    typename IteratorB0_::Element,
+    IteratorB0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B1 operand
+  typename TransformB1_ = NumericArrayConverter<
+    typename SmemIteratorB1_::Element,
+    typename IteratorB1_::Element,
+    IteratorB1_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class B2bMmaPipelined :
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
+public:
+
+  ///< Base class
+  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
+
+  using Shape0 = Shape0_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA0 = IteratorA0_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB0 = IteratorB0_;     ///< Iterates over tiles of B operand in global memory
+  using Policy0 = Policy0_;           ///< Policy describing tuning details
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  using Shape1 = Shape1_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;   ///< Iterates over tiles of the scale and bias vectors in global memory
+  using FragmentIteratorA1ScaleBias =
+    FragmentIteratorA1ScaleBias_;     ///< WarpIterator to load Scale or Bias vector from the threadblock fragment
+  using IteratorB1 = IteratorB1_;     ///< Iterates over tiles of B operand in global memory
+  using Policy1 = Policy1_;           ///< Policy describing tuning details
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+
+  using OutputOp = OutputOp_;       ///< Epilogue after 1st Gemm
+
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+
+  using TransformA0 = TransformA0_;
+  using TransformB0 = TransformB0_;
+  using TransformB1 = TransformB1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA0 = typename IteratorA0::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB0 = typename IteratorB0::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB1 = typename IteratorB1::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy0::Operator::ArchTag;
+
+  /// Complex transform on A0 operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B0 operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B1 operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA0 = typename Operator0::FragmentA;
+  using WarpFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
+  /// Warp Fragment of operand A1 scale and bias loaded from threadblock fragment
+  using WarpFragmentA1ScaleBias =
+      typename FragmentIteratorA1ScaleBias::Fragment;
+  using WarpFragmentB1 = typename Operator1::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaPipelined(
+    typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx,                                       ///< ID of each thread within a warp
+    int problem_size_0_n                                ///< GEMM0 N is used for accumulator extent
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+    smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+    smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
+
+
+    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    //These should stay the same across different GEMM layers
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    //These may change across different GEMM layers
+    int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
+    int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations_0,                             ///< number of iterations of the mainloop
+    FragmentC1 &accum,                                   ///< destination accumulator tile
+    IteratorA0 iterator_A,                               ///< iterator over A operand in global memory
+    IteratorB0 iterator_B0,                              ///< iterator over B0 operand in global memory
+    IteratorAccumulatorScaleBias iterator_A1_scale,    ///< iterator over A1 operand scale vectors in global memory
+    IteratorAccumulatorScaleBias iterator_A1_bias,     ///< iterator over A1 operand bias vectors in global memory
+    IteratorB1 iterator_B1,                              ///< iterator over B1 operand in global memory
+    FragmentC0 const &src_accum,                         ///< source accumualtor tile
+    OutputOp output_op_0,                                ///< epilogue operation after 1st Gemm
+    TransformA0 transform_A0 = TransformA0(),            ///< transformation applied to A0 fragment
+    TransformB0 transform_B0 = TransformB0(),            ///< transformation applied to B0 fragment
+    TransformB1 transform_B1 = TransformB1()) {          ///< transformation applied to B1 fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    FragmentA0 tb_frag_A;
+    FragmentB0 tb_frag_B0;
+
+    tb_frag_A.clear();
+    tb_frag_B0.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B0.load(tb_frag_B0);
+
+    ++iterator_A;
+    ++iterator_B0;
+
+    this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+    this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B0_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA0 warp_frag_A0[2];
+    WarpFragmentB0 warp_frag_B0[2];
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    Operator0 warp_mma0;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
+    iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
+    // shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+
+          this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+          __syncthreads();
+
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B0_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+
+        this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B0.load(tb_frag_B0);
+          ++iterator_A;
+          ++iterator_B0;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
+          iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
+        }
+
+        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
+                  warp_frag_B0[warp_mma_k % 2], accum0);
+      }
+    }
+
+    //2nd Gemm
+
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+
+    //
+    // Prologue
+    //
+
+    FragmentA1ScaleBias tb_frag_A1_scale;
+    FragmentA1ScaleBias tb_frag_A1_bias;
+    FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
+    FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
+    FragmentB1 tb_frag_B1;
+
+    if(PerChannelScale)
+        tb_frag_A1_scale.clear();
+    tb_frag_A1_bias.clear();
+    tb_frag_B1.clear();
+
+    // The last kblock is loaded in the prolog
+    if(PerChannelScale)
+        iterator_A1_scale.load(tb_frag_A1_scale);
+    iterator_A1_bias.load(tb_frag_A1_bias);
+    iterator_B1.load(tb_frag_B1);
+
+    if(PerChannelScale)
+        ++iterator_A1_scale;
+    ++iterator_A1_bias;
+    ++iterator_B1;
+
+    this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+    ++this->smem_iterator_B1_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA1ScaleBias warp_frag_A1_scale[2];
+    WarpFragmentA1ScaleBias warp_frag_A1_bias[2];
+    WarpFragmentA1 warp_frag_A1[2];
+    WarpFragmentB1 warp_frag_B1[2];
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+    if(PerChannelScale)
+        warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[0]);
+    warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[0]);
+    warp_tile_iterator_A1_.load(warp_frag_A1[0], warp_frag_A1_scale[0],
+        warp_frag_A1_bias[0], output_op_0);
+    this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
+
+    ++warp_tile_iterator_A1_;
+    if(PerChannelScale)
+        ++warp_tile_iterator_A1_scale_;
+    ++warp_tile_iterator_A1_bias_;
+    ++this->warp_tile_iterator_B1_;
+
+    Operator1 warp_mma1;
+
+    smem_write_stage_idx = 1;
+
+    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
+
+    // Avoid reading out of bounds
+    iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::WarpGemmIterations == 2.
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
+
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+          __syncthreads();
+          ++this->smem_iterator_B1_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+
+          if(PerChannelScale) {
+              tb_frag_A1_scale.clear();
+              iterator_A1_scale.load(tb_frag_A1_scale);
+              ++iterator_A1_scale;
+            }
+            tb_frag_A1_bias.clear();
+            iterator_A1_bias.load(tb_frag_A1_bias);
+            ++iterator_A1_bias;
+        }
+
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+
+        if(PerChannelScale)
+          warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[(warp_mma_k + 1) % 2]);
+        warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[(warp_mma_k + 1) % 2]);
+        warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2],
+            warp_frag_A1_scale[(warp_mma_k + 1) % 2],
+            warp_frag_A1_bias[(warp_mma_k + 1) % 2],
+            output_op_0);
+        this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
+
+        if(PerChannelScale)
+          ++warp_tile_iterator_A1_scale_;
+        ++warp_tile_iterator_A1_bias_;
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_B1.load(tb_frag_B1);
+          ++iterator_B1;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
+        }
+
+        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
+                  warp_frag_B1[warp_mma_k % 2], accum);
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h b/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
new file mode 100644
index 000000000..ad915009c
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
@@ -0,0 +1,376 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "classic_b2b_bmm/threadblock/b2b_mma_pipelined.h"
+#include "classic_b2b_bmm/threadblock/b2b_mma_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Staging the accumulators in shared memory.
+    bool SmemAccumulator = false>
+struct DefaultB2bMma;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output with 2-stage pipeline
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, LayoutB1, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag,
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, 2, Operator, CausalMaskAfterGemm0, EpilogueOutputOp, false> {
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB1, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 =
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
+
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
+          ElementB, LayoutB1, 0, typename MmaCore1::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      IteratorB0, typename MmaCore0::SmemIteratorB,
+      typename MmaCore1::Shape, FragmentIteratorA1,
+      IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
+      IteratorB1, typename MmaCore1::SmemIteratorB,
+      ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output for multi-stage
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, LayoutB1, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag,
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, Stages, Operator, CausalMaskAfterGemm0, EpilogueOutputOp, false> {
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB1, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+          ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+          ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  // FragmentIteratorA1 should just load A1 fragments from the intermediate
+  // accumulator tile without modification, so LinearCombination is used to
+  // apply a no-op to the accumulator tile.
+  using LinearCombinationOutputOp = epilogue::thread::LinearCombination<
+    typename EpilogueOutputOp::ElementOutput,
+    EpilogueOutputOp::kCount,
+    typename EpilogueOutputOp::ElementOutput,
+    typename EpilogueOutputOp::ElementCompute
+  >;
+  using FragmentIteratorA1 =
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, LinearCombinationOutputOp>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB1, 0, ThreadMapB1, AccessTypeB1>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      MmaCore0::kCacheOpA,
+      IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
+      typename MmaCore1::Shape, FragmentIteratorA1,
+      IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
+      IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
+      ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages,
+      CausalMaskAfterGemm0, typename MmaCore0::WarpShape>;
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h b/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
new file mode 100644
index 000000000..6379eb435
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
@@ -0,0 +1,201 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  NOTE: Copied from cutlass/epilogue/threadblock/default_epilogue_tensor_op.h but
+  modified to use GmemToAccumLoader, GmemToAccumLoaderFragmentIteratorTensorOp, and
+  GmemToAccumLoaderSharedLoadIterator.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_relu0.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+
+#include "cutlass/layout/permute.h"
+
+#include "classic_b2b_bmm/threadblock/gmem_to_accum_loader.h"
+#include "classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h"
+#include "classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultGmemToAccumLoaderTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::GmemToAccumLoaderFragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename cutlass::epilogue::threadblock::GmemToAccumLoaderSharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using GmemToAccumLoader = cutlass::epilogue::threadblock::GmemToAccumLoader<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
new file mode 100644
index 000000000..f2403f43b
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
@@ -0,0 +1,361 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  The shared memory resource is time-sliced across warps.
+
+  NOTE: Copied from cutlass/epilogue/threadblock/epilogue.h and modified to essentially
+  inverse the direction of the epilogue. See https://github.com/NVIDIA/cutlass/issues/784
+  for details.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/util/index_sequence.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
+>
+class GmemToAccumLoader :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>,
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Number of warps per block
+  using WarpCount = typename Base::WarpCount;
+
+  /// Number of threads per block
+  static int const kBlockThreads = 32 * WarpCount::kCount;
+
+  /// Per-thread accumulator tile type
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Numerical accumulation element type
+  using ElementAccumulator = typename WarpMmaOperator::ElementC;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Vector type used by the global output iterator
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Vector type used by the shared output iterator
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+public:
+
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
+    "Divisibility");
+
+  static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
+
+  static_assert(kPartitionsK == 1, "Must be exactly 1.");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index in the threadblock
+  int thread_idx;
+
+  /// Warp index in the threadblock
+  int warp_idx;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  GmemToAccumLoader(
+      typename Base::SharedStorage &shared_storage,   ///< Shared storage object
+      int thread_idx,                                 ///< ID of a thread within the threadblock
+      int warp_idx,                                   ///< ID of warp within threadblock
+      int lane_idx)                                   ///< Id of thread within warp
+  :
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      BaseStreamK(thread_idx),
+      shared_load_iterator_(shared_storage.reference(), thread_idx),
+      thread_idx(thread_idx),
+      warp_idx(warp_idx)
+  {}
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                ///< Output operator
+    AccumulatorTile &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )      ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
+  {
+    if (!output_op.is_source_needed())
+    {
+      source_iterator.clear_mask();
+      __syncthreads();  // Dummy (CUDA 11.0)
+    }
+
+    // Source-fragment data (zero-initialized for scenarios where the
+    // output operator allows us to skip loading it from global input)
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    // Iterator over warp-level accumulator fragment
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration)
+    {
+
+      //
+      // Load fragments from shared memory
+      //
+
+      __syncthreads();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p)
+      {
+        // Load addend source fragment from global memory to aligned register fragment.
+        source_iterator.load(source_fragment);
+        ++source_iterator;
+
+        // Store data in register fragment to shared memory.
+        shared_load_iterator_.store(source_fragment);
+
+        if (p < Base::kFragmentsPerIteration - 1)
+        {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p)
+      {
+
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+        typename AccumulatorFragmentIterator::Fragment source_accum_fragment;
+        typename AccumulatorFragmentIterator::Fragment output_accum_fragment;
+
+        // Load from shared memory to "unaligned" accumulator fragment.
+        this->warp_tile_iterator_.load(source_accum_fragment);
+
+        // Load from accumulators to accumulator fragment.
+        accum_fragment_iterator.load(accum_fragment);
+
+        // Store result of computation to accumulators.
+        apply_output_operator(output_accum_fragment, output_op, accum_fragment, source_accum_fragment);
+        accum_fragment_iterator.store(output_accum_fragment);
+
+        ++accum_fragment_iterator;
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+
+    }
+
+  }
+
+private:
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator(
+    typename AccumulatorFragmentIterator::Fragment &output_fragment,
+    OutputOp const &output_op,                    ///< Output operator
+    typename AccumulatorFragmentIterator::Fragment const &accum_fragment,
+    typename AccumulatorFragmentIterator::Fragment const &source_fragment)
+  {
+
+    OutputAccessType *output_frag_ptr =
+      reinterpret_cast<OutputAccessType *>(&output_fragment);
+
+    AccumulatorAccessType const *compute_frag_ptr =
+      reinterpret_cast<AccumulatorAccessType const *>(&accum_fragment);
+
+    OutputAccessType const *source_frag_ptr =
+      reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+    int const kOutputOpIterations =
+      AccumulatorFragmentIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i)
+    {
+      // Call the output operator
+      output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
+    }
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
new file mode 100644
index 000000000..f5ecb1bc7
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
@@ -0,0 +1,274 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
+
+  This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in
+/// GmemToAccumLoader.
+///
+/// Satisfies: ReadableTileIterator
+///
+template <
+    typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
+    typename Element_, ///< Element data type
+    int MaxAlignment =
+        ThreadMap_::kElementsPerAccess* sizeof_bits<Element_>::value / 8>
+class GmemToAccumLoaderSharedLoadIterator {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::TileShape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static int const kMinAlignment =
+      ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
+
+  static int const kAlignment =
+      (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+      Element,
+      ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow *
+          ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster *
+          ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType =
+      AlignedArray<Element, ThreadMap::kElementsPerAccess, kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType = AlignedArray<
+      Element,
+      const_min(
+          128 / sizeof_bits<Element>::value,
+          ThreadMap::kElementsPerAccess),
+      const_min(16, kAlignment)>;
+
+  static int const kLoadsPerAccess =
+      AccessType::kElements / LoadType::kElements;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Byte-level pointer
+  uint8_t* byte_pointer_;
+
+  /// Stride along adjacent rows
+  int stride_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  GmemToAccumLoaderSharedLoadIterator(TensorRef ref, int thread_idx)
+      : byte_pointer_(reinterpret_cast<uint8_t*>(ref.data())),
+        stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    // Initialize pointer
+    byte_pointer_ += thread_offset.row() * stride_ +
+        thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const& offset) {
+    byte_pointer_ += offset.row() * Shape::kRow * stride_ +
+        offset.column() * Shape::kColumn * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          uint8_t const* byte_pointer = byte_pointer_ +
+              row * ThreadMap::Delta::kRow * stride_ +
+              group * ThreadMap::Delta::kGroup * stride_ +
+              cluster * ThreadMap::Delta::kCluster * stride_ +
+              pointer_offset * sizeof_bits<Element>::value / 8;
+
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType* frag_ptr = reinterpret_cast<LoadType*>(&frag);
+          LoadType const* memory_pointer =
+              reinterpret_cast<LoadType const*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx =
+                frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer
+                  [(column * ThreadMap::Delta::kColumn / kElementsPerAccess) *
+                       kLoadsPerAccess +
+                   v];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {}
+
+  /// Loads a fragment
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment from memory.
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(const Fragment& frag, Index pointer_offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          uint8_t* byte_pointer = byte_pointer_ +
+              row * ThreadMap::Delta::kRow * stride_ +
+              group * ThreadMap::Delta::kGroup * stride_ +
+              cluster * ThreadMap::Delta::kCluster * stride_ +
+              pointer_offset * sizeof_bits<Element>::value / 8;
+
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType const* frag_ptr = reinterpret_cast<LoadType const*>(&frag);
+          LoadType* memory_pointer = reinterpret_cast<LoadType*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx =
+                frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+              int memory_pointer_idx =
+                  (column * ThreadMap::Delta::kColumn / kElementsPerAccess) *
+                      kLoadsPerAccess +
+                  v;
+              memory_pointer[memory_pointer_idx] =
+                  frag_ptr[frag_idx * kLoadsPerAccess + v];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment
+  CUTLASS_DEVICE
+  void store(const Fragment& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h b/static/include/kernels/classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
new file mode 100644
index 000000000..3fb47da01
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
@@ -0,0 +1,315 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+
+      NOTE: Copied from cutlass/epilogue/warp/fragment_iterator_tensor_op.h but modified
+      to make the accumulators non-const type so the accumulators can be modified.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
+  typename Layout             ///< target shared memory layout
+>
+class GmemToAccumLoaderFragmentIteratorTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for row-major shared memory
+template <
+  typename WarpShape_,         ///< shape of the warp-level GEMM tile
+  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: Array)
+>
+class GmemToAccumLoaderFragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    OperatorElementC,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    OperatorElementC,
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile
+  AccessType *accumulators_;
+
+  /// Internal index
+  int index_;
+
+public:
+
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp(AccumulatorTile &accum):
+    accumulators_(reinterpret_cast<AccessType *>(&accum)),
+    index_(0) {
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int index = index_ + index_offset;
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+
+      int accumulator_access_offset =
+        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      frag_ptr[n] = accumulators_[accumulator_access_offset];
+    }
+  }
+
+  /// Stores a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment& frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      int accumulator_access_offset = index +
+          n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      accumulators_[accumulator_access_offset] = frag_ptr[n];
+    }
+  }
+
+  /// Adds a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void add(Fragment& frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      int accumulator_access_offset = index +
+          n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      accumulators_[accumulator_access_offset] = accumulators_[accumulator_access_offset] + frag_ptr[n];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Dedicated to interleaved layout
+template <
+    /// shape of the warp-level GEMM tile
+    typename WarpShape_,
+    /// matrix multiply operator shape (concept: gemm::GemmShape)
+    typename OperatorShape_,
+    /// matrix multiply operator data type (concept: data type)
+    typename OperatorElementC_,
+    /// matrix multiply operator fragment (concept: Array)
+    typename OperatorFragmentC_,
+    /// number of interleaved k
+    int InterleavedK>
+class GmemToAccumLoaderFragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_,
+                               layout::ColumnMajorInterleaved<InterleavedK>> {
+ public:
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment =
+      Array<OperatorElementC,
+            Policy::kElementsPerAccess * InterleavedK / OperatorShape::kN>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile =
+      Array<OperatorElementC, OperatorFragmentC::kElements *
+                                  Policy::OperatorCount::kRow *
+                                  Policy::OperatorCount::kColumn>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+ private:
+  /// Internal access type
+  using AccessType =
+      Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Accumulator tile
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+ public:
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp(AccumulatorTile const &accum)
+      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+        index_(0) {}
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < (InterleavedK / OperatorShape::kN); ++n) {
+      int index_m = index % (Policy::OperatorCount::kRow *
+                             Policy::kIterationsPerInstruction);
+      int index_n = index / (Policy::OperatorCount::kRow *
+                             Policy::kIterationsPerInstruction);
+      int accumulator_access_offset =
+          (index_m / Policy::kIterationsPerInstruction) *
+              (Policy::OperatorCount::kColumn *
+               Policy::kIterationsPerInstruction) +
+          (index_m % Policy::kIterationsPerInstruction) +
+          index_n * (InterleavedK / OperatorShape::kN) *
+              Policy::kIterationsPerInstruction +
+          n * Policy::kIterationsPerInstruction;
+
+      frag_ptr[n] = accumulators_[accumulator_access_offset];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h b/static/include/kernels/classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h
new file mode 100644
index 000000000..7b6903486
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h
@@ -0,0 +1,235 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "classic_b2b_bmm/thread/linear_combination_triu.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Modified version of MmaTensorOpFragmentIterator that can zero out upper triangular
+// portion of output matrix.
+template <typename MmaTensorOpFragmentIterator_, int ThreadBlockShapeM_>
+class TriuMmaTensorOpFragmentIterator {
+ public:
+
+  /// Shape of warp tile to load (concept: MatrixShape)
+  using Shape = typename MmaTensorOpFragmentIterator_::Shape;
+
+  /// Shape of the warp accumulation tile (concept: MatrixShape)
+  using AccumulatorShape = typename MmaTensorOpFragmentIterator_::AccumulatorShape;
+
+  /// KBlocks columns to compute residual
+  static int const kKBlockColumn = MmaTensorOpFragmentIterator_::kKBlockColumn;
+
+  /// Accumulator Element type
+  using ElementAccumulator = typename MmaTensorOpFragmentIterator_::ElementAccumulator;
+
+  /// Element type
+  using Element = typename MmaTensorOpFragmentIterator_::Element;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = typename MmaTensorOpFragmentIterator_::InstructionShape;
+
+  /// Output operation on fragment
+  using OutputOp = thread::LinearCombinationTriu<
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementOutput,
+    MmaTensorOpFragmentIterator_::OutputOp::kCount,
+    ThreadBlockShapeM_,
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementOutput,
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementCompute
+  >;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    static_assert(
+        AccumulatorShape::kRow == Shape::kRow,
+        "Rows of Warp Accumulator must be the same as rows of warp");
+    static_assert(
+        !(AccumulatorShape::kColumn % Shape::kColumn),
+        "Shape of Warp Accumulator must be divisible by warp shape.");
+    static_assert(
+        !(kKBlockColumn % Shape::kColumn),
+        "KBlock size must be divisible by warp shape.");
+
+    /// Number of times this iterator can be incremented
+    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
+  };
+
+private:
+
+  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
+
+  /// Number of mma operations performed by a warp
+  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                    Shape::kColumn / InstructionShape::kN>;
+  /// Number of mma operations performed by the entire accumulator
+  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
+                                              AccumulatorShape::kColumn / InstructionShape::kN>;
+
+  /// Number of K iterations
+  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
+  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
+  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+  static int const kResidualIndex = kResidualColumn / Shape::kColumn
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+  /// Accumulator Fragment object
+  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
+
+  /// Scale Bias Element Type
+  using ElementScaleBias = typename OutputOp::ElementCompute;
+
+  /// Scale Bias Fragment object
+  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
+
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentAccessType = Array<Element, kElementsPerAccess>;
+
+  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerAccess>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Internal index
+  int index_;
+
+  /// Used to access residual tile first
+  bool is_residual_tile_;
+
+  OutputOp output_op;
+
+public:
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator()
+      : index_(0), is_residual_tile_(true), output_op() {}
+
+  /// Add offset
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    index_ += index_offset;
+    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
+      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
+      is_residual_tile_ = false;
+    }
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator &operator--() {
+    add_offset(-1);
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < MmaIterations::kColumn; n++) {
+      for (int m = 0; m < MmaIterations::kRow; m++) {
+        if(!(is_residual_tile_ && index_ >= kResidualIndex)) {
+            frag_ptr[m * MmaIterations::kColumn + n] = output_op(
+              frag_ptr[m * MmaIterations::kColumn + n],
+              index_,
+              n,
+              m
+            );
+        }
+      }
+    }
+  }
+
+};
+
+}
+}
+}

From 3c689e12418922348cbc345d26948a79ce97219d Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Fri, 17 Mar 2023 16:55:29 -0700
Subject: [PATCH 289/638] Add aitemplate/static into compilation path (#447)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/447

ATT, so that AIT kernel could rely on kernels under static/

Reviewed By: tenpercent

Differential Revision: D44156615

fbshipit-source-id: 77195b36cb1490b83c5313ab05d6809ea4a202c9
---
 python/aitemplate/backend/cuda/target_def.py | 73 +++++++++-----------
 1 file changed, 33 insertions(+), 40 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 040c81bc8..c8e3bc714 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -106,15 +106,13 @@ def _build_compile_options(self):
                 flash_attention_path,
                 "fmha",
             ),
-            os.path.join(self._template_path, "../cub"),
         ]
-
+        ait_static_path = os.path.join(self._ait_include_path, "include/kernels")
         options = [
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
             "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
             "-w",
-            "-gencode=arch=compute_%s,code=[sm_%s,compute_%s]"
-            % (self._arch, self._arch, self._arch),
+            f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
             "-Xcompiler=-fPIC",
             "-Xcompiler=-Wconversion",
             "-Xcompiler=-fno-strict-aliasing",
@@ -123,14 +121,8 @@ def _build_compile_options(self):
             "-std=c++17",
             "--expt-relaxed-constexpr",
             "--use_fast_math",
-            "-I" + cutlass_path[0],
-            "-I" + cutlass_path[1],
-            "-I" + cutlass_path[2],
-            "-I" + cutlass_path[3],
-            "-I" + cutlass_path[4],
-            "-I" + cutlass_path[5],
-            "-I" + cutlass_path[6],
-        ]
+            f"-I{ait_static_path}",
+        ] + ["-I" + path for path in cutlass_path]
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         return " ".join(options)
@@ -220,6 +212,10 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             )
             attention_include_path = self._include_path + "/att_include"
             shutil.copytree(attention_src_path, attention_include_path)
+            ait_static_include_path = self._include_path + "/static"
+            shutil.copytree(
+                static_files_path + "/include/kernels", ait_static_include_path
+            )
         self.cutlass_path_ = FBCUDA.cutlass_path_
 
         cutlass_lib_path = parutil.get_dir_path(
@@ -251,41 +247,38 @@ def _build_compile_options(self):
                 os.path.join(self._template_path, "examples/45_dual_gemm"),
                 os.path.join(self._template_path, "../att_include"),
                 os.path.join(self._template_path, "../att_include/fmha"),
-                os.path.join(self._template_path, "../cub"),
             ]
+            ait_static_path = os.path.join(self._include_path, "static")
             fb_include_path = os.path.join(self._include_path, "fb_include")
             pp_args = self.nvcc_options_json["pp_args"]
             with open(fb_include_path, "w") as fb_include:
                 for arg in pp_args:
                     fb_include.write(pipes.quote(arg) + "\n")
 
-            options = self.nvcc_options_json["args"] + [
-                "-I" + cutlass_path[0],
-                "-I" + cutlass_path[1],
-                "-I" + cutlass_path[2],
-                "-I" + cutlass_path[3],
-                "-I" + cutlass_path[4],
-                "-I" + cutlass_path[5],
-                "-I" + cutlass_path[6],
-                f"-Xcompiler '-Wp\,@{fb_include_path}'",  # noqa: W605
-                "-Xcompiler -Wno-strict-aliasing",
-                "-Xcompiler -Wno-narrowing",
-                "-Xcompiler -Wno-error=maybe-uninitialized",
-                "-Xcompiler -Wno-uninitialized",
-                "-Xcompiler -Wno-error=array-bounds",
-                "-Xcompiler -fPIC",
-                "-Xcompiler -fvisibility=hidden",
-                "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
-                "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
-                "-w",
-                "--expt-relaxed-constexpr",
-                "--use_fast_math",
-                "-gencode=arch=compute_%s,code=[sm_%s,compute_%s]"
-                % (self._arch, self._arch, self._arch),
-                "-Xcompiler=-Wconversion",
-                environ.get_compiler_opt_level(),
-                "-std=c++17",
-            ]
+            options = (
+                self.nvcc_options_json["args"]
+                + ["-I" + path for path in cutlass_path]
+                + [
+                    f"-I{ait_static_path}",
+                    f"-Xcompiler '-Wp\,@{fb_include_path}'",  # noqa: W605
+                    "-Xcompiler -Wno-strict-aliasing",
+                    "-Xcompiler -Wno-narrowing",
+                    "-Xcompiler -Wno-error=maybe-uninitialized",
+                    "-Xcompiler -Wno-uninitialized",
+                    "-Xcompiler -Wno-error=array-bounds",
+                    "-Xcompiler -fPIC",
+                    "-Xcompiler -fvisibility=hidden",
+                    "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+                    "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
+                    "-w",
+                    "--expt-relaxed-constexpr",
+                    "--use_fast_math",
+                    f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
+                    "-Xcompiler=-Wconversion",
+                    environ.get_compiler_opt_level(),
+                    "-std=c++17",
+                ]
+            )
             if self._ndebug == 1:
                 options.append("-DNDEBUG")
             FBCUDA.compile_options_ = " ".join(options)

From 5151d6e6e3918b84be6d0fd75bb32d38b608cb3c Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Fri, 17 Mar 2023 18:18:22 -0700
Subject: [PATCH 290/638] Add a basic op for b2b bmm. (#445)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/445

Add a b2b bmm operator into AIT.

Reviewed By: chenyang78

Differential Revision: D43865461

fbshipit-source-id: fa878bc45f06f3d463b58cba5887b22015927f48
---
 python/aitemplate/backend/cuda/__init__.py    |   1 +
 .../backend/cuda/b2b_bmm/__init__.py          |  21 ++
 .../backend/cuda/b2b_bmm/classic_b2b_bmm.py   | 292 ++++++++++++++++++
 python/aitemplate/compiler/ops/__init__.py    |   1 +
 .../compiler/ops/b2b_bmm/__init__.py          |  20 ++
 .../compiler/ops/b2b_bmm/classic_b2b_bmm.py   | 189 ++++++++++++
 .../utils/mk_cutlass_lib/extra_enum.py        |   6 +-
 tests/unittest/ops/test_b2b_bmm.py            | 181 +++++++++++
 8 files changed, 708 insertions(+), 3 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/b2b_bmm/__init__.py
 create mode 100644 python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/__init__.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
 create mode 100644 tests/unittest/ops/test_b2b_bmm.py

diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py
index e2124c3a5..6c1df2038 100644
--- a/python/aitemplate/backend/cuda/__init__.py
+++ b/python/aitemplate/backend/cuda/__init__.py
@@ -36,3 +36,4 @@
 from aitemplate.backend.cuda.vision_ops import *
 from aitemplate.backend.cuda.attention import *
 from aitemplate.backend.cuda.groupnorm import *
+from aitemplate.backend.cuda.b2b_bmm import *
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
new file mode 100644
index 000000000..121c3ffc4
--- /dev/null
+++ b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
@@ -0,0 +1,21 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# flake8: noqa
+
+"""
+b2b bmm module init
+"""
+
+from . import classic_b2b_bmm
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
new file mode 100644
index 000000000..6edec81ec
--- /dev/null
+++ b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
@@ -0,0 +1,292 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+classic_b2b_bmm kernel codegen for CUDA.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "classic_b2b_bmm/device/b2b_batched_gemm.h"
+
+namespace {
+
+// Hardcode these sizes for now until we get profiling ready.
+constexpr int ThreadblockM = 64;
+constexpr int ThreadblockK = 32;
+constexpr int WarpK = 32;
+constexpr int InstructionM = 16;
+constexpr int InstructionN = 8;
+constexpr int InstructionK = 16;
+
+// Currently, causal mask is only supported with warp shape M of 16.
+// While we ought to debug it, the warp shape M restriction is not considered
+// high-priority as we do not want to make warp M much larger anyway. If you want
+// to explore the perf-impact of tuning this, then you can turn off causal mask after
+// gemm 0 and see what the perf result is.
+constexpr int WarpM = 16;
+
+constexpr int N0 = {{n0}};
+constexpr int N1 = {{n1}};
+
+void check_status(cutlass::Status status, int64_t m0, int64_t k0, const std::string& message) {
+  if (status != cutlass::Status::kSuccess) {
+      throw std::runtime_error(
+        message +
+        "Function: {{function_name}}. "
+        "m0: " + std::to_string(m0) +
+        ", k0: " + std::to_string(k0) +
+        ", n0: " + std::to_string({{n0}}) +
+        ", n1: " + std::to_string({{n1}}) + "."
+      );
+  }
+  return;
+}
+
+}  // end namespace
+
+{{func_signature}} {
+  using ElementOutput = {{elem_output_type}};
+  using ElementAccumulator = {{elem_accum_type}};
+  using ElementCompute = {{elem_input_type}};
+
+  ElementCompute alpha0 = ElementCompute({{alpha0}});
+  ElementCompute beta0 = ElementCompute(1);
+  ElementCompute activation_alpha = ElementCompute({{alpha1}});
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<ThreadblockM, N0, ThreadblockK>;
+  using WarpShape0 = cutlass::gemm::GemmShape<WarpM, N0, WarpK>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<ThreadblockM, N1, ThreadblockK>;
+  using WarpShape1 = cutlass::gemm::GemmShape<WarpM, N1, WarpK>;
+  using InstructionShape = cutlass::gemm::GemmShape<InstructionM, InstructionN, InstructionK>;
+
+  using EpilogueOutputOp0 =
+    cutlass::epilogue::thread::LinearCombinationGeneric<
+      {{epilogue_math}},
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      // Saves a little time in the epilogue by not multiplying the source by beta.
+      cutlass::epilogue::thread::ScaleType::NoBetaScaling
+    >;
+
+  using EpilogueOutputOp1 =
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::Nothing
+    >;
+
+  using B2bGemmBatched = cutlass::gemm::device::B2bGemmBatched<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3,
+    {{has_causal}} // enable causal mask after gemm0
+  >;
+
+  cutlass::gemm::GemmCoord problem_size_0(m0, {{n0}}, k0);
+  cutlass::gemm::GemmCoord problem_size_1(m0, {{n1}}, {{n0}});
+  typename B2bGemmBatched::Arguments arguments{
+    problem_size_0,
+    problem_size_1,
+    {static_cast<ElementCompute*>(query), typename B2bGemmBatched::LayoutA::Stride(problem_size_0.k())},
+    problem_size_0.m() * problem_size_0.k(),
+    {static_cast<ElementCompute*>(key), typename B2bGemmBatched::LayoutB::Stride(problem_size_0.k())},
+    problem_size_0.n() * problem_size_0.k(),
+    {static_cast<ElementCompute*>(bias), typename B2bGemmBatched::LayoutC::Stride(problem_size_0.n())},
+    problem_size_0.m() * problem_size_0.n(),
+    {static_cast<ElementCompute*>(value), typename B2bGemmBatched::LayoutB1::Stride(problem_size_1.n())},
+    problem_size_1.n() * problem_size_1.k(),
+    {static_cast<ElementCompute*>(nullptr), typename B2bGemmBatched::LayoutScaleBias::Stride(0)},
+    0,
+    {static_cast<ElementOutput*>(output), typename B2bGemmBatched::LayoutC::Stride(problem_size_1.n())},
+    problem_size_1.m() * problem_size_1.n(),
+    batch_size,
+    {alpha0, beta0, activation_alpha},
+    {alpha1, beta1},
+  };
+
+  B2bGemmBatched b2b_gemm_op;
+  check_status(
+    b2b_gemm_op.can_implement(arguments),
+    m0, k0,
+    "Problem sizes are not supported."
+  );
+  check_status(
+    b2b_gemm_op.initialize(arguments),
+    m0, k0,
+    "classic_b2b_bmm initialization failed!"
+  );
+  check_status(
+    b2b_gemm_op(stream),
+    m0, k0,
+    "classic_b2b_bmm failed to execute!"
+  );
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   void* query,
+                   void* key,
+                   void* value,
+                   void* bias,
+                   int64_t batch_size,
+                   int64_t m0,
+                   int64_t k0,
+                   cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{m0}},
+{{indent}}    {{k0}},
+{{indent}}    stream /* default stream */
+{{indent}});
+    """
+)
+
+
+@registry.reg("cuda.classic_b2b_bmm.gen_function")
+def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    q, k, v, bias = func_attrs["inputs"]
+    n0 = k._attrs["shape"][1]
+    n1 = v._attrs["shape"][2]
+    if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
+        raise RuntimeError(
+            f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
+        )
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    if (
+        "use_fp16_acc" in Target.current()._kwargs
+        and Target.current()._kwargs["use_fp16_acc"]
+    ):
+        elem_accum_type = "cutlass::half_t"
+    else:
+        elem_accum_type = "float"
+
+    import cutlass_lib
+
+    epilogue_math = cutlass_lib.library.EpilogueMathTag[
+        cutlass_lib.library.EpilogueMathName[func_attrs["epilogue_math_name"]]
+    ]
+
+    return FUNC_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        elem_accum_type=elem_accum_type,
+        n0=str(n0.value()),
+        n1=str(n1.value()),
+        has_causal="true" if func_attrs["causal"] else "false",
+        alpha0=str(func_attrs["alpha0"]),
+        alpha1=str(func_attrs["alpha1"]),
+        epilogue_math=epilogue_math,
+    )
+
+
+@registry.reg("cuda.classic_b2b_bmm.func_decl")
+def classic_b2b_bmm_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("cuda.classic_b2b_bmm.func_call")
+def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) == 4
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+    bias_name = func_attrs["inputs"][3]._attrs["name"]
+
+    q_shape = func_attrs["inputs"][0]._attrs["shape"]
+    batch_size = q_shape[0]._attrs["name"]
+    m0 = q_shape[1]._attrs["name"]
+    k0 = q_shape[2]._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        query=q_name,
+        key=k_name,
+        value=v_name,
+        bias=bias_name,
+        batch_size=batch_size,
+        m0=m0,
+        k0=k0,
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/__init__.py b/python/aitemplate/compiler/ops/__init__.py
index 99c3584dd..d9cfc6d7a 100644
--- a/python/aitemplate/compiler/ops/__init__.py
+++ b/python/aitemplate/compiler/ops/__init__.py
@@ -32,3 +32,4 @@
 from aitemplate.compiler.ops.vision_ops import *
 from aitemplate.compiler.ops.attention import *
 from aitemplate.compiler.ops.groupnorm import *
+from aitemplate.compiler.ops.b2b_bmm import *
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
new file mode 100644
index 000000000..22f9a5d0d
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
@@ -0,0 +1,20 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# flake8: noqa
+"""
+B2B Bmm ops.
+"""
+
+from aitemplate.compiler.ops.b2b_bmm.classic_b2b_bmm import classic_b2b_bmm
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
new file mode 100644
index 000000000..85c18e866
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
@@ -0,0 +1,189 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Back-to-back batched gemm fused kernel.
+Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) + beta0 * bias))), V),
+
+where:
+Q: [B, M0, K0] (row_major), K: [B, N0, K0] (column_major), V: [B, N0, N1] (row_major), bias: [B, M0, N0] (row_major).
+Layouts are fixed for now.
+
+causal_masks can be disabled.
+When casual_masks is enabled, only the left bottom triangular part of the matrix is valid,
+and the other part is set to 0.
+
+Only supports M0 <= 512.
+"""
+
+from aitemplate.backend import registry, target
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.utils.alignment import find_max_alignment, get_alignments
+
+
+def _check_max_alignment(shape: IntVar, dtype: str, error_msg: str) -> None:
+    if not isinstance(shape, IntImm):
+        raise RuntimeError(f"{shape=} must be IntImm! ", error_msg)
+    res = find_max_alignment(shape.value(), dtype) == max(get_alignments(dtype))
+    if not res:
+        raise RuntimeError(
+            f"{shape=} does not satisfy {dtype=} max alignment requirements! ",
+            error_msg,
+        )
+
+
+class classic_b2b_bmm(Operator):
+    def __init__(
+        self, causal: bool, epilogue_math_name: str, alpha0: float, alpha1: float
+    ) -> None:
+        """Initialize classic_b2b_bmm op."""
+        super().__init__()
+        self._attrs["op"] = "classic_b2b_bmm"
+        self._attrs["has_profiler"] = False
+        self._attrs["causal"] = causal
+        self._attrs["alpha0"] = alpha0
+        self._attrs["alpha1"] = alpha1
+
+        import cutlass_lib
+
+        if epilogue_math_name not in cutlass_lib.library.EpilogueMathName:
+            raise RuntimeError(
+                "Unsupported epilogue function! Please check "
+                "python/aitemplate/utils/mk_cutlass_lib/extra_enum.py for a list of supported epilogue functions."
+            )
+        self._attrs["epilogue_math_name"] = epilogue_math_name
+
+    def _check_alignment(self) -> None:
+        q, k, v, bias = self._attrs["inputs"]
+        if (
+            q._attrs["dtype"] != k._attrs["dtype"]
+            or q._attrs["dtype"] != v._attrs["dtype"]
+        ):
+            raise RuntimeError(
+                "QKV dtypes must be the same! "
+                f"QKV dtypes: {q._attrs['dtype']=}, {k._attrs['dtype']=}, {v._attrs['dtype']=}"
+            )
+        dtype = q._attrs["dtype"]
+
+        _check_max_alignment(q._attrs["shape"][2], dtype, f"{q._attrs['shape']=}")
+        _check_max_alignment(k._attrs["shape"][2], dtype, f"{k._attrs['shape']=}")
+        _check_max_alignment(v._attrs["shape"][2], dtype, f"{v._attrs['shape']=}")
+
+    def _infer_shapes(self):
+        """infer the output shape for classic_b2b_bmm."""
+        q, k, v, bias = self._attrs["inputs"]
+        q_shape = q._attrs["shape"]
+        k_shape = k._attrs["shape"]
+        v_shape = v._attrs["shape"]
+        if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
+            raise RuntimeError(
+                f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if len(q_shape) != 3:
+            raise RuntimeError(
+                f"QKV must have rank == 3! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
+            raise RuntimeError(
+                f"QKV must have same batch size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        batch_size = q_shape[0]
+        M0 = q_shape[1]
+        if M0.upper_bound() > 512:
+            raise RuntimeError(
+                f"classic_b2b_bmm only supports <=512 seq_length. Current length: {M0}"
+            )
+        K0 = q_shape[2]
+        if K0 != k_shape[2]:
+            raise RuntimeError(
+                f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N0 = k_shape[1]
+        if N0 != v_shape[1]:
+            raise RuntimeError(
+                f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N1 = v_shape[2]
+
+        output_shape = [batch_size, M0, N1]
+
+        bias_shape = bias._attrs["shape"]
+        if bias_shape != [batch_size, M0, N0]:
+            raise RuntimeError(
+                f"bias shape is not compatible with Q K! "
+                f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
+                f"bias shapes: {bias_shape=}."
+            )
+        return output_shape
+
+    def __call__(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        bias: Tensor,
+    ) -> Tensor:
+        """call the op
+
+        Parameters
+        ----------
+        q: Tensor, shape(B, M0, K0)
+        k: Tensor, shape(B, N0, K0)
+        v: Tensor, shape(B, N0, N1)
+        bias: Tensor, shape(B, M0, N0)
+
+        Returns
+        ----------
+        Tensor, shape(B, M0, N1)
+        """
+
+        self._attrs["inputs"] = [q, k, v, bias]
+        self._set_depth()
+        self._check_alignment()
+        output_shape = self._infer_shapes()
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
+        self._attrs["outputs"] = [output]
+
+        return output
+
+    def _get_op_attributes(self):
+        target_attrs = ["causal", "epilogue_math_name", "alpha0", "alpha1"]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def gen_function(self) -> str:
+        """call backend functions"""
+        current_target = target.Target.current()
+        if current_target.name() == "rocm" or (
+            current_target.name() == "cuda" and int(current_target._arch) < 80
+        ):
+            raise NotImplementedError(
+                "classic_b2b_bmm is only supported by CUDA>=SM80 devices."
+            )
+        func_key = "{target}.{op}.gen_function".format(
+            target=current_target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index 54c1a4d9d..de6211628 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -92,7 +92,7 @@ class EpilogueMath(enum.Enum):
   Plus = enum_auto()
   Gelu = enum_auto()
   FastGelu = enum_auto()
-  Silu = enum_auto()
+  SiLu = enum_auto()
   ELUp1 = enum_auto()
 
 
@@ -105,7 +105,7 @@ class EpilogueMath(enum.Enum):
   EpilogueMath.Plus: 'cutlass::plus',
   EpilogueMath.Gelu: 'GELU',
   EpilogueMath.FastGelu: 'GELU_taylor',
-  EpilogueMath.Silu: 'cutlass::epilogue::thread::Silu',
+  EpilogueMath.SiLu: 'cutlass::epilogue::thread::SiLu',
   EpilogueMath.ELUp1: 'cutlass::epilogue::thread::ELUp1',
 }
 
@@ -119,7 +119,7 @@ class EpilogueMath(enum.Enum):
   "Add": EpilogueMath.Plus,
   "Gelu": EpilogueMath.Gelu,
   "FastGelu": EpilogueMath.FastGelu,
-  "Silu": EpilogueMath.Silu,
+  "SiLu": EpilogueMath.SiLu,
   "ELUp1": EpilogueMath.ELUp1
 }
 
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
new file mode 100644
index 000000000..2385187b1
--- /dev/null
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -0,0 +1,181 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for b2b bmm Operators.
+"""
+import logging
+import unittest
+from typing import List, Tuple
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class ClassicB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def _test_classic_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        m=256,
+        k0=128,
+        n0=256,
+        n1=256,
+        epilogue_math_name="Identity",
+        causal=False,
+        dtype="float16",
+        test_name="classic_b2b_bmm",
+        copy_op=True,
+        atol=1e-2,
+        rtol=1e-2,
+    ):
+        # Initialize AIT classic_b2b_bmm operator.
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes]
+        alpha0 = 1.0 / (k0**0.5)
+        alpha1 = 1.0 / m
+        batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+
+        Q = Tensor(
+            shape=[batch_size_dim, m, k0],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size_dim, n0, k0],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size_dim, n0, n1],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        Bias = Tensor(
+            shape=[batch_size_dim, m, n0],
+            dtype=dtype,
+            name="bias",
+            is_input=True,
+        )
+        classic_b2b_bmm_op = ops.classic_b2b_bmm(
+            causal=causal,
+            alpha0=alpha0,
+            alpha1=alpha1,
+            epilogue_math_name=epilogue_math_name,
+        )
+        if copy_op:
+            classic_b2b_bmm_op = ops.classic_b2b_bmm(
+                **classic_b2b_bmm_op._get_op_attributes()
+            )
+        Y = classic_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=True)
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        for batch_size in batch_sizes:
+            # Initialize inputs
+            q_pt = torch.rand(batch_size, m, k0, dtype=torch_dtype).cuda()
+            k_pt = torch.rand(batch_size, n0, k0, dtype=torch_dtype).cuda()
+            v_pt = torch.rand(batch_size, n0, n1, dtype=torch_dtype).cuda()
+            bias_pt = torch.rand(batch_size, m, n0, dtype=torch_dtype).cuda()
+
+            # Run PT reference.
+            attn = alpha0 * (q_pt @ k_pt.transpose(-2, -1)) + bias_pt
+            if epilogue_math_name == "Identity":
+                pass
+            elif epilogue_math_name == "Sigmoid":
+                attn = torch.sigmoid(attn)
+            elif epilogue_math_name == "SiLu":
+                attn = torch.nn.functional.silu(attn)
+            else:
+                raise NotImplementedError(f"Unsupported {epilogue_math_name=}!")
+            attn = alpha1 * attn
+            if causal:
+                invalid_attn_mask: torch.Tensor = 1.0 - torch.tril(
+                    torch.ones(
+                        (m, n0),
+                        dtype=torch.bool,
+                        device="cuda",
+                    )
+                ).fill_diagonal_(False).to(torch_dtype)
+                attn = attn * invalid_attn_mask
+            output = attn @ v_pt
+            y_pt = output.detach()
+
+            # Run AIT.
+            inputs = {"q": q_pt, "k": k_pt, "v": v_pt, "bias": bias_pt}
+            y = torch.empty(
+                [batch_size, m, n1],
+                dtype=torch_dtype,
+                device="cuda",
+            )
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_b2b_bmm_fp16(self):
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_basic",
+            dtype="float16",
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_dynamic_batch",
+            dtype="float16",
+            batch_sizes=[3, 8, 10],
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_causal",
+            dtype="float16",
+            batch_sizes=5,
+            causal=True,
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_sigmoid",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_complex",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="SiLu",
+            causal=True,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From cd9694b3b16cbc4a1083f3bbcc02da0e2cf4f307 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 17 Mar 2023 21:23:22 -0700
Subject: [PATCH 291/638] Add logics in elementwise to handle the case that a
 jagged tensor is treated as a dense tensor in AIT (#449)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/449

ATT, it looks that it's possible that a jagged tensor is identified as a dense
tensor in fx2ait lowering if this jagged tensor is never used in any fbgemm
jagged operators. e.g. Adding two jagged tensors together do not require any
special fbgemm jagged operators. This diff handles this case for elementwise.

We need to think systematically on how to solve this issue for other operators
as well. i.e. When jagged_int_var.total_length() == dense_int_var, sometimes we
want them to be treated equally (e.g. when calculating the broadcasted dim),
but sometimes we want to always pick up the jagged_int_var (e.g. when inferring
output shapes).

Reviewed By: chenyang78

Differential Revision: D44191261

fbshipit-source-id: 8dc0fd07e618d85e9c0d21072cc7d1f369b78d43
---
 .../backend/common/elementwise_common.py           | 14 +++++++++++++-
 .../aitemplate/compiler/ops/common/elementwise.py  |  6 ++++--
 python/aitemplate/utils/shape_utils.py             |  8 +++++++-
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 737b1d744..7f4991114 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -713,7 +713,16 @@ def _get_mixed_jagged_dense_config(
         # dense inputs' shapes) will be treated as a single dense dim
         return False, None, False
 
+    # If all dense inputs' first dim is equal to jagged_int_var's total_length(),
+    # treat all these dense inputs as jagged inputs as well.
     jagged_int_var = output_shape[0]
+    all_dense_jagged = True
+    for dense_input_shape in dense_input_shapes:
+        if dense_input_shape[0] != jagged_int_var.total_length():
+            all_dense_jagged = False
+    if all_dense_jagged:
+        return False, None, False
+
     jagged_max_dense_prefix_shape = jagged_int_var.get_max_dense_shape()
     jagged_suffix_shape = output_shape[1:]
     output_volume = jagged_max_dense_prefix_shape + jagged_suffix_shape
@@ -821,7 +830,10 @@ def _gen_input_broadcast_calculator_str(
 
     start_idx = 0
     for i, (input_dim, output_dim) in enumerate(zip(input_shape, output_shape)):
-        if input_dim != output_dim:
+        if input_dim != output_dim and not (
+            isinstance(output_dim, JaggedIntVar)
+            and input_dim == output_dim.total_length()
+        ):
             assert input_dim == IntImm(
                 1
             ), "Unexpected shapes! Input: {}, output: {}".format(
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index 8c16ecf77..c2f8e6983 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -112,10 +112,12 @@ def _broadcast_dense_and_jagged_shape(
                 "higher than the rank of the jagged inputs (when treating "
                 "the jagged dims as separate dims)."
             )
-
-        broadcastable, _ = shape_utils.get_broadcast_max_shape(
+        broadcastable_jagged_dense, _ = shape_utils.get_broadcast_max_shape(
             jagged_max_dense_prefix_shape, dense_prefix_shape
         )
+        broadcastable_jagged_jagged, _ = shape_utils.get_broadcast_max_shape(
+            [jagged_first_dim.total_length()], dense_prefix_shape
+        )
         if not broadcastable:
             raise ValueError(
                 f"JaggedIntVar of the jagged inputs ({jagged_first_dim}) is not compatible "
diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index 7816b81fe..f67a9978c 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -51,7 +51,7 @@ def get_broadcast_max_shape(shape1, shape2):
     Note that two shapes are not required to have the same number of dimensions.
     For example, shape [5, 2, 3] and shape [3] are also broadcastable.
     """
-    from aitemplate.compiler.base import IntImm
+    from aitemplate.compiler.base import IntImm, JaggedIntVar
 
     min_len = min(len(shape1), len(shape2))
     if len(shape1) > len(shape2):
@@ -65,6 +65,12 @@ def get_broadcast_max_shape(shape1, shape2):
         if dim1 == dim2:
             res_shape[idx] = dim1
             continue
+        if isinstance(dim1, JaggedIntVar) and dim1.total_length() == dim2:
+            res_shape[idx] = dim1
+            continue
+        if isinstance(dim2, JaggedIntVar) and dim2.total_length() == dim1:
+            res_shape[idx] = dim2
+            continue
         if dim1 == IntImm(1):
             res_shape[idx] = dim2
         elif dim2 == IntImm(1):

From 14f1374618f2c2b3ce4ddd76f016f840d12a96f6 Mon Sep 17 00:00:00 2001
From: Mengchi Zhang <mengchi@meta.com>
Date: Fri, 17 Mar 2023 22:56:06 -0700
Subject: [PATCH 292/638] Add batched_dense_vec_jagged_2d_mul op (#448)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/448

In this diff, batched_dense_vec_jagged_2d_mul front-end and back-end op is added. A unit test is included.

Reviewed By: aakhundov

Differential Revision: D43921286

fbshipit-source-id: 2914ec2fc1442fde13fe1ecf720da88699ee3a6f
---
 .../backend/cuda/gemm_special/__init__.py     |   8 +-
 .../batched_dense_vec_jagged_2d_mul.py        | 251 ++++++++++++++++++
 .../compiler/ops/gemm_special/__init__.py     |  10 +-
 .../batched_dense_vec_jagged_2d_mul.py        | 108 ++++++++
 .../test_batched_dense_vec_jagged_2d_mul.py   | 197 ++++++++++++++
 5 files changed, 572 insertions(+), 2 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/gemm_special/batched_dense_vec_jagged_2d_mul.py
 create mode 100644 python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
 create mode 100644 tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py

diff --git a/python/aitemplate/backend/cuda/gemm_special/__init__.py b/python/aitemplate/backend/cuda/gemm_special/__init__.py
index 7f582d9ab..00f99c3c7 100644
--- a/python/aitemplate/backend/cuda/gemm_special/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_special/__init__.py
@@ -16,10 +16,16 @@
 special gemm ops
 """
 from aitemplate.backend.cuda.gemm_special import (
+    batched_dense_vec_jagged_2d_mul,
     bmm_rcr_n1,
     bmm_rrr_k1_tanh,
     gemm_rrr_small_nk,
 )
 
 
-__all__ = ["bmm_rcr_n1", "bmm_rrr_k1_tanh", "gemm_rrr_small_nk"]
+__all__ = [
+    "batched_dense_vec_jagged_2d_mul",
+    "bmm_rcr_n1",
+    "bmm_rrr_k1_tanh",
+    "gemm_rrr_small_nk",
+]
diff --git a/python/aitemplate/backend/cuda/gemm_special/batched_dense_vec_jagged_2d_mul.py b/python/aitemplate/backend/cuda/gemm_special/batched_dense_vec_jagged_2d_mul.py
new file mode 100644
index 000000000..aa4267dd5
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_special/batched_dense_vec_jagged_2d_mul.py
@@ -0,0 +1,251 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define batched_dense_vec_jagged_2d_mul codegen and CUDA kernel
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import gen_offsets_str
+from aitemplate.backend.target import Target
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define WARP_SIZE 32
+#define MAX_THREADS 1024
+    """
+)
+
+KERNEL_TEMPLATE = jinja2.Template(
+    """
+__global__ void {{func_name}}(
+    {{data_t}}* output,
+    const {{data_t}}* vectors,
+    const {{data_t}}* matrices,
+    {{offsets}}
+    {{index_type}} b, {{index_type}} h,
+    {{index_type}} n, {{index_type}} d
+) {
+  const int b_h_begin = blockIdx.x * blockDim.y + threadIdx.y;
+  const int b_h_step = gridDim.x * blockDim.y;
+  for (int b_h = b_h_begin; b_h < b * h; b_h += b_h_step) {
+    const int b_idx = b_h / h;
+    const int h_idx = b_h % h;
+
+    const {{index_type}} row_start = offsets.data[0][b_idx];
+    const {{index_type}} row_end = offsets.data[0][b_idx + 1];
+    const {{index_type}} length = min(row_end - row_start, n);
+    if (length == 0) {
+      for (int d_idx = threadIdx.x; d_idx < d; d_idx += blockDim.x) {
+        output[b_h * d + d_idx] = 0;
+      }
+    } else {
+      for (int d_idx = threadIdx.x; d_idx < d; d_idx += blockDim.x) {
+        {{acc_t}} acc =
+            {{acc_t}}(vectors[b_h * n] * matrices[row_start * h * d + h_idx * d + d_idx]);
+        for (int l = 1; l < length; ++l) {
+          acc += {{acc_t}}(vectors[b_h * n + l] * matrices[(row_start + l) * h * d + h_idx * d + d_idx]);
+        }
+        output[b_h * d + d_idx] = {{data_t}}(acc);
+      }
+    }
+  }
+}
+    """
+)
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{head}}
+
+#include "jagged.h"
+
+namespace {
+
+{{constant}}
+
+{{kernel_function}}
+
+}  // namespace
+
+void invoke_{{func_name}}(void* output, const void* vectors, const void* matrices, {{index_type}} b, {{index_type}} h, {{index_type}} n, {{index_type}} d, {{offsets_decl}} {{prefix}}Stream_t stream) {
+    if (b == 0 || d == 0) {
+      return;
+    }
+    int block_dim_x = std::min(static_cast<int>(std::ceil(static_cast<double>(d) / WARP_SIZE) * WARP_SIZE), MAX_THREADS);
+    int block_dim_y = MAX_THREADS / block_dim_x;
+    int block_size = static_cast<int>(std::ceil(static_cast<double>(b * h) / block_dim_y));
+    {{func_name}}<<<block_size, dim3(block_dim_x, block_dim_y), 0, stream>>>(
+        reinterpret_cast<{{data_t}}*>(output),
+        reinterpret_cast<const {{data_t}}*>(vectors),
+        reinterpret_cast<const {{data_t}}*>(matrices),
+        {{offsets_call}}
+        b,
+        h,
+        n,
+        d
+    );
+}
+    """
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void invoke_{{func_name}}(void* output, const void* vectors, const void* matrices, {{index_type}} b, {{index_type}} h, {{index_type}} n, {{index_type}} d, {{offsets}} {{prefix}}Stream_t stream);
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}invoke_{{func_name}}({{output}}, {{vectors}}, {{matrices}}, {{b}}, {{h}}, {{n}}, {{d}}, {{offsets}} {{stream}});
+{{indent}}}
+    """
+)
+
+
+def _gen_kernel_function(
+    func_attrs: Dict[str, Any],
+    index_type: str,
+    data_type: str,
+) -> str:
+    matrices = func_attrs["inputs"][1]
+
+    acc_t = "float"
+    if (
+        data_type in ["half", "bfloat16"]
+        and "use_fp16_acc" in Target.current()._kwargs
+        and Target.current()._kwargs["use_fp16_acc"]
+    ):
+        acc_t = data_type
+
+    kernel_func = KERNEL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        index_type=index_type,
+        data_t=data_type,
+        offsets=gen_offsets_str(
+            matrices._attrs["shape"][0],
+            has_type=True,
+            # the offsets are passed
+            # by value to the kernel
+            const_ref=False,
+            name="offsets",
+        ),
+        acc_t=acc_t,
+    )
+    return kernel_func
+
+
+@registry.reg("cuda.batched_dense_vec_jagged_2d_mul.gen_function")
+def jagged_to_dense_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """Generates jagged_to_dense function definition."""
+
+    vectors = func_attrs["inputs"][0]
+    matrices = func_attrs["inputs"][1]
+    backend_spec = CUDASpec()
+
+    dtype = vectors.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+
+    kernel_function = _gen_kernel_function(
+        func_attrs,
+        backend_spec.index_type,
+        data_type,
+    )
+
+    constant = CONSTANT_TEMPLATE.render()
+
+    function = FUNC_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        head=backend_spec.header_src_template.render(),
+        constant=constant,
+        kernel_function=kernel_function,
+        func_name=func_attrs["name"],
+        offsets_decl=gen_offsets_str(
+            matrices._attrs["shape"][0],
+            has_type=True,
+            # the offsets are passed
+            # by const reference to the function
+            const_ref=True,
+            name="offsets",
+        ),
+        offsets_call=gen_offsets_str(
+            matrices._attrs["shape"][0],
+            has_type=False,
+            const_ref=False,
+            name="offsets",
+        ),
+        data_t=data_type,
+    )
+    return function
+
+
+@registry.reg("cuda.batched_dense_vec_jagged_2d_mul.func_decl")
+def jagged_to_dense_gen_function_decl(func_attrs) -> str:
+    """Generate jagged_to_dense function declaration."""
+
+    matrices = func_attrs["inputs"][1]
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    return FUNC_DECL_TEMPLATE.render(
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        func_name=func_name,
+        offsets=gen_offsets_str(
+            matrices._attrs["shape"][0],
+            has_type=True,
+            const_ref=True,
+            name="offsets",
+        ),
+    )
+
+
+@registry.reg("cuda.batched_dense_vec_jagged_2d_mul.func_call")
+def jagged_to_dense_gen_function_call(
+    func_attrs,
+    indent: str,
+) -> str:
+    """Generate jagged_to_dense function call."""
+
+    vectors = func_attrs["inputs"][0]
+    vshape = vectors._attrs["shape"]
+    matrices = func_attrs["inputs"][1]
+    jshape = matrices._attrs["shape"]
+    output = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+
+    return FUNC_CALL_TEMPLATE.render(
+        stream=backend_spec.stream,
+        func_name=func_attrs["name"],
+        matrices=matrices._attrs["name"],
+        vectors=vectors._attrs["name"],
+        b=vshape[0]._attrs["name"],
+        h=vshape[1]._attrs["name"],
+        n=vshape[2]._attrs["name"],
+        d=jshape[2]._attrs["name"],
+        output=output._attrs["name"],
+        offsets=gen_offsets_str(
+            matrices._attrs["shape"][0],
+            has_type=False,
+            const_ref=False,
+        ),
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/gemm_special/__init__.py b/python/aitemplate/compiler/ops/gemm_special/__init__.py
index 71b3a2922..b577f5ae0 100644
--- a/python/aitemplate/compiler/ops/gemm_special/__init__.py
+++ b/python/aitemplate/compiler/ops/gemm_special/__init__.py
@@ -15,9 +15,17 @@
 """
 special gemm ops
 """
+from aitemplate.compiler.ops.gemm_special.batched_dense_vec_jagged_2d_mul import (
+    batched_dense_vec_jagged_2d_mul,
+)
 from aitemplate.compiler.ops.gemm_special.bmm_rcr_n1 import bmm_rcr_n1
 from aitemplate.compiler.ops.gemm_special.bmm_rrr_k1_tanh import bmm_rrr_k1_tanh
 from aitemplate.compiler.ops.gemm_special.gemm_rrr_small_nk import gemm_rrr_small_nk
 
 
-__all__ = ["bmm_rcr_n1", "bmm_rrr_k1_tanh", "gemm_rrr_small_nk"]
+__all__ = [
+    "batched_dense_vec_jagged_2d_mul",
+    "bmm_rcr_n1",
+    "bmm_rrr_k1_tanh",
+    "gemm_rrr_small_nk",
+]
diff --git a/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py b/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
new file mode 100644
index 000000000..f1b092e3c
--- /dev/null
+++ b/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
@@ -0,0 +1,108 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Define batched_dense_vec_jagged_2d_mul op
+"""
+from typing import List
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+
+class batched_dense_vec_jagged_2d_mul(Operator):
+    """
+    Returns a dense tensor containing batched matrix multiplication of batched vector and batched jagged tensor.
+    Args:
+        vectors (Tensor): batched vector tensor
+        matrices (Tensor): batched jagged tensor
+    Returns:
+        output (Tensor): a dense tensor containing the batched matrix multiplication result.
+    """
+
+    def __init__(
+        self,
+    ):
+        super().__init__()
+        self._attrs["op"] = "batched_dense_vec_jagged_2d_mul"
+
+    def _infer_shape(self, vectors: Tensor, matrices: Tensor) -> List[IntVar]:
+        jagged_int_var = matrices.shape()[0]
+        return [jagged_int_var.batch_dim(), matrices.shape()[1], matrices.shape()[2]]
+
+    def __call__(self, vectors: Tensor, matrices: Tensor) -> Tensor:
+        # Check matrices is jagged tensor
+        if not matrices.is_jagged():
+            matrices_name = matrices._attrs["name"]
+            raise RuntimeError(
+                f"Input tensor {matrices_name} is expected to be jagged, but actually dense."
+            )
+
+        # Check input tensor's dimension is 3
+        if len(vectors.shape()) != 3:
+            vectors_name = vectors._attrs["name"]
+            raise RuntimeError(f"Input tensor {vectors_name} dim should be 3.")
+
+        if len(matrices.shape()) != 3:
+            matrices_name = matrices._attrs["name"]
+            raise RuntimeError(f"Input tensor {matrices_name} dim should be 3.")
+
+        jagged_int_var = matrices.shape()[0]
+        # Check first dim B
+        if jagged_int_var.batch_dim() != vectors.shape()[0]:
+            raise RuntimeError(
+                f"Batch dim B of input tensors are expected to be the same, but actually first is {vectors.shape()[0]} and second is {jagged_int_var.batch_dim()}."
+            )
+
+        # Check second dim H
+        if vectors.shape()[1] != matrices.shape()[1]:
+            raise RuntimeError(
+                f"Second dim H of input tensors are expected to be the same, but actually first is {vectors.shape()[1]} and second is {matrices.shape()[1]}."
+            )
+
+        # Check tensor types
+        if vectors.dtype() != matrices.dtype():
+            raise RuntimeError(
+                f"Input tensors sare expected to have the same type, but actually first is {vectors.dtype()} and second is {matrices.dtype()}."
+            )
+
+        # Check Jagged dims
+        num_jagged_dims = len(jagged_int_var.jagged_dims())
+        if num_jagged_dims != 1:
+            raise RuntimeError(
+                f"Jagged dims for second jagged inputs should be 1, but actually is {num_jagged_dims}."
+            )
+        else:
+            jagged_max_values = jagged_int_var.jagged_dims()[0].max_value()
+            if jagged_max_values != vectors.shape()[2].value():
+                raise RuntimeError(
+                    f"max value is expected to be {vectors.shape()[2].value()} , but actually is {jagged_max_values}."
+                )
+
+        self._attrs["inputs"] = [vectors, matrices]
+        self._set_depth()
+        output_shape = self._infer_shape(vectors, matrices)
+        output = Tensor(output_shape, src_ops={self}, dtype=vectors.dtype())
+
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py b/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py
new file mode 100644
index 000000000..5a0f6708a
--- /dev/null
+++ b/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py
@@ -0,0 +1,197 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for batched_dense_vec_jagged_2d_mul Operator.
+"""
+import unittest
+from typing import List
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm, IntVar, JaggedDim, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.jagged_utils import batched_dense_vec_jagged_2d_mul_ref
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    TestEnv,
+)
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import parameterized
+
+
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 3e-2, "rtol": 2e-2},
+    "bfloat16": {"atol": 2e-1, "rtol": 2e-1},
+}
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class BatchedDenseVecJagged2DMulTestCase(unittest.TestCase):
+    def _test_batched_dense_vec_jagged_2d_mul(
+        self,
+        B: int,
+        N: int,
+        H: int,
+        D: int,
+        offsets: List[int],
+        dtype: str = "float16",
+        offsets_dtype: str = "int32",
+        use_fp16_acc: bool = False,
+        test_suffix: str = "",
+    ):
+        # jagged shape is equal to (B, N, H, D)
+        batch_size = B
+        batch_dim = IntVar(values=[1, batch_size * 2], name="batch_size")
+        jagged_dims = [JaggedDim(min_value=0, max_value=N)]
+
+        total_length = offsets[-1]
+        total_length_dim = IntVar(values=[1, total_length * 2], name="total_length")
+        jagged_inner_shape = [H, D]
+        jagged_inner_dims = [IntImm(dim) for dim in jagged_inner_shape]
+        jagged_input_shape = [total_length] + jagged_inner_shape
+
+        offsets_dim = IntVar(values=[2, len(offsets) * 2])
+
+        # dense shape is (B, H, N)
+        dense_shape = [batch_size, H, N]
+        dense_dims = [batch_dim, IntImm(H), IntImm(N)]
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                *jagged_inner_dims,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=jagged_dims,
+        )(SOURCE, OFFSETS_LIST)
+
+        DENSE = Tensor(
+            shape=dense_dims,
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        RESULT = ops.batched_dense_vec_jagged_2d_mul()(DENSE, JAGGED)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        assert not SOURCE.is_jagged()
+        assert not DENSE.is_jagged()
+        assert JAGGED.is_jagged()
+        assert not RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(use_fp16_acc=use_fp16_acc),
+            "./tmp",
+            f"test_batched_dense_vec_jagged_2d_mul_{test_suffix}",
+        )
+
+        torch_offsets_dtype = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = torch.tensor(offsets, dtype=torch_offsets_dtype).cuda()
+        source_pt = get_random_torch_tensor(jagged_input_shape, dtype)
+        dense_pt = get_random_torch_tensor(dense_shape, dtype)
+        result_pt = batched_dense_vec_jagged_2d_mul_ref(
+            vectors=dense_pt,
+            matrices=source_pt,
+            offsets=offsets_pt,
+        )
+        result = get_torch_empty_tensor([batch_size, H, D], dtype)
+
+        inputs = {"dense": dense_pt, "source": source_pt, "offsets": offsets_pt}
+        model.run_with_tensors(inputs, [result])
+
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
+        torch.testing.assert_close(result, result_pt, **tolerance_limits)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                # TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_batched_dense_vesc_jagged_2d_mul(self, ait_dtype):
+        # test with different combination of offsets_dtype, use_fp16_acc and shapes
+        self._test_batched_dense_vec_jagged_2d_mul(
+            4,
+            260,
+            10,
+            32,
+            [0, 1, 4, 6, 7],
+            dtype=ait_dtype,
+            offsets_dtype="int32",
+            use_fp16_acc=True,
+            test_suffix=f"{ait_dtype}_int32_True",
+        )
+        self._test_batched_dense_vec_jagged_2d_mul(
+            6,
+            130,
+            15,
+            39,
+            [0, 1, 4, 6, 7, 9, 10],
+            dtype=ait_dtype,
+            offsets_dtype="int32",
+            use_fp16_acc=False,
+            test_suffix=f"{ait_dtype}_int32_False",
+        )
+        self._test_batched_dense_vec_jagged_2d_mul(
+            8,
+            52,
+            21,
+            32,
+            [0, 1, 4, 6, 7, 8, 12, 20, 29],
+            dtype=ait_dtype,
+            offsets_dtype="int64",
+            use_fp16_acc=True,
+            test_suffix=f"{ait_dtype}_int64_True",
+        )
+        self._test_batched_dense_vec_jagged_2d_mul(
+            10,
+            10,
+            32,
+            8,
+            [0, 1, 4, 6, 7, 11, 15, 19, 23, 26, 28],
+            dtype=ait_dtype,
+            offsets_dtype="int64",
+            use_fp16_acc=False,
+            test_suffix=f"{ait_dtype}_int64_False",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 4ea634c00bee47b3bd04e756be50ecdde2e0d723 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 19 Mar 2023 04:34:10 -0700
Subject: [PATCH 293/638] Add dynamic JaggedDim bounds support (#452)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/452

Previously, `JaggedDim`---a jagged dimension encoded in the first `total_length` / `sum_B(N_B)` dimension of a jagged Tensor---could only have static (`int`) `min_value` and `max_value`. This is problematic when the bounds (in particular, the upper bound: `max_value`) of a jagged dimension in a jagged Tensor shape, as well as the corresponding `max_seq_len` dimension in a dense Tensor shape, are dynamic (`IntVar`). The latter can happen, as the maximum sequence length can, in principle, be dynamic within a single compiled AIT model.

This diff promotes the `min_value` / `max_value` of `JaggedDim` to either `IntImm` for static cases (backward-compatible with previously supported `int` use cases) or `IntVar` for dynamic cases.

Reviewed By: frank-wei

Differential Revision: D44198794

fbshipit-source-id: 1650ba4832c62cd4c9ca923602117caacd5f16c6
---
 .../backend/common/elementwise_common.py      |  13 ++-
 .../backend/cuda/view_ops/make_jagged.py      |  89 +++++++++++++---
 python/aitemplate/compiler/base.py            |  23 ++--
 python/aitemplate/compiler/compiler.py        |   7 ++
 .../batched_dense_vec_jagged_2d_mul.py        |  59 ++++++-----
 .../ops/tensor/padded_dense_to_jagged.py      |  11 +-
 python/aitemplate/testing/jagged_utils.py     |  28 +++++
 tests/unittest/ops/test_jagged_elementwise.py |  36 +------
 tests/unittest/ops/test_make_jagged.py        | 100 ++++++++++++++++++
 9 files changed, 274 insertions(+), 92 deletions(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 7f4991114..d8b60d90d 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -672,10 +672,19 @@ def get_dynamic_dims(*shapes: List[List[IntVar]]) -> List[IntVar]:
             if not isinstance(dim, IntImm):
                 res[dim._attrs["name"]] = dim
                 if isinstance(dim, JaggedIntVar):
-                    # the batch_dim within the JaggedIntVar may not be present directly
-                    # in other input / output shapes, so we're adding it here separately
+                    # the batch_dim and the JaggedDim bounds within the JaggedIntVar
+                    # may not be present directly in other input / output shapes,
+                    # so we're adding it here separately
                     batch_dim = dim.batch_dim()
                     res[batch_dim._attrs["name"]] = batch_dim
+                    for jagged_dim in dim.jagged_dims():
+                        min_value = jagged_dim.min_value()
+                        if not isinstance(min_value, IntImm):
+                            res[min_value._attrs["name"]] = min_value
+                        max_value = jagged_dim.max_value()
+                        if not isinstance(max_value, IntImm):
+                            res[max_value._attrs["name"]] = max_value
+
     return list(res.values())
 
 
diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index f09d8c79e..40ce1d538 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -28,9 +28,13 @@
   of the constraints can be checked on the device, in which
   case an std::runtime_error is thrown on violation.
 """
+from typing import Set
+
 import jinja2
 
 from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.compiler.base import IntImm, IntVar, JaggedIntVar
 
 
 SRC_TEMPLATE = jinja2.Template(
@@ -56,10 +60,10 @@
   {{offsets_struct_type}} offsets,
   OffsetBounds bounds
 ) {
-  int64_t dim_id = blockIdx.y;
-  int64_t offset_id = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+  {{index_type}} dim_id = blockIdx.y;
+  {{index_type}} offset_id = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
 
-  int64_t length = offsets.lengths[dim_id];
+  {{index_type}} length = offsets.lengths[dim_id];
   const {{offsets_type}}* data = offsets.data[dim_id];
 
   if (offset_id >= length - 1) {
@@ -133,12 +137,15 @@
 
 void {{func_name}}(
 {% for idx in range(num_offsets) %}
-  int64_t offsets_length_{{idx}},
+  {{index_type}} offsets_length_{{idx}},
   const void* offsets_data_{{idx}},
+{% endfor %}
+{% for name in jagged_dynamic_bound_names %}
+  {{index_type}} {{name}},
 {% endfor %}
   {{offsets_struct_type}}& offsets,
-  int64_t* batch_dim,
-  int64_t total_length,
+  {{index_type}}* batch_dim,
+  {{index_type}} total_length,
   cudaStream_t stream
 ) {
 {% for idx in range(num_offsets) %}
@@ -157,7 +164,7 @@
     }
 {% endif %}
 
-    int64_t max_offset_length = 0;
+    {{index_type}} max_offset_length = 0;
     for (int i = 0; i < {{num_offsets}}; ++i) {
         if (offsets.lengths[i] <= 1) {
             throw std::runtime_error("offset array's length must be at least 2");
@@ -186,12 +193,15 @@
     """
 void {{func_name}}(
 {% for idx in range(num_offsets) %}
-  int64_t,
+  {{index_type}},
   const void*,
+{% endfor %}
+{% for _ in range(num_jagged_dynamic_bound_dims) %}
+  {{index_type}},
 {% endfor %}
   {{offsets_struct_type}}&,
-  int64_t*,
-  int64_t,
+  {{index_type}}*,
+  {{index_type}},
   cudaStream_t
 );
 """,
@@ -206,6 +216,9 @@
 {{indent}}  {{offsets_first_dim_names[idx]}},
 {{indent}}  {{offsets_data_names[idx]}},
 {% endfor %}
+{% for name in jagged_dynamic_bound_names %}
+{{indent}}  {{name}},
+{% endfor %}
 {{indent}}  {{offsets_var_name}},
 {{indent}}  &{{batch_dim_name}},
 {{indent}}  {{source_first_dim_name}},
@@ -217,16 +230,57 @@
 )
 
 
+def _get_jagged_dynamic_bound_dims(jagged_int_var: JaggedIntVar) -> Set[IntVar]:
+    """Get the set of dynamic dims in JaggedIntVar's JaggedDims' min / max values."""
+    return set(
+        [
+            dim.min_value()
+            for dim in jagged_int_var.jagged_dims()
+            if type(dim.min_value()) == IntVar
+        ]
+        + [
+            dim.max_value()
+            for dim in jagged_int_var.jagged_dims()
+            if type(dim.max_value()) == IntVar
+        ]
+    )
+
+
 @registry.reg("cuda.make_jagged.gen_function")
 def make_jagged_gen_function(func_attrs):
     func_name = func_attrs["name"]
     offsets_list = func_attrs["inputs"][1:]
+    backend_spec = CUDASpec()
 
     output = func_attrs["outputs"][0]
     jagged_int_var = output._attrs["shape"][0]
     offsets_struct_type = jagged_int_var.offsets_struct_type()
-    jagged_dim_min_values = [dim.min_value() for dim in jagged_int_var.jagged_dims()]
-    jagged_dim_max_values = [dim.max_value() for dim in jagged_int_var.jagged_dims()]
+
+    jagged_dim_min_values = [
+        dim.min_value().value()
+        if isinstance(dim.min_value(), IntImm)
+        else dim.min_value()._attrs["name"]
+        for dim in jagged_int_var.jagged_dims()
+    ]
+    jagged_dim_max_values = [
+        dim.max_value().value()
+        if isinstance(dim.max_value(), IntImm)
+        else dim.max_value()._attrs["name"]
+        for dim in jagged_int_var.jagged_dims()
+    ]
+
+    jagged_dynamic_bound_dims = _get_jagged_dynamic_bound_dims(jagged_int_var)
+    jagged_dynamic_bound_names = [
+        dim._attrs["name"] for dim in jagged_dynamic_bound_dims
+    ]
+
+    for dim in jagged_dynamic_bound_dims:
+        if dim._attrs.get("isolated", False):
+            raise ValueError(
+                "Dynamic dimension (IntVar) in the min / max value "
+                "of a JaggedDim in the JaggedIntVar is isolated "
+                f"(not present in any input shape): {jagged_int_var}."
+            )
 
     batch_dim = jagged_int_var.batch_dim()
     isolated_batch_dim = batch_dim._attrs.get("isolated", False)
@@ -239,6 +293,8 @@ def make_jagged_gen_function(func_attrs):
         jagged_dim_max_values=jagged_dim_max_values,
         offsets_type=jagged_int_var.offsets_type(),
         isolated_batch_dim=isolated_batch_dim,
+        jagged_dynamic_bound_names=jagged_dynamic_bound_names,
+        index_type=backend_spec.index_type,
     )
 
 
@@ -246,15 +302,19 @@ def make_jagged_gen_function(func_attrs):
 def make_jagged_gen_function_decl(func_attrs):
     func_name = func_attrs["name"]
     offsets_list = func_attrs["inputs"][1:]
+    backend_spec = CUDASpec()
 
     output = func_attrs["outputs"][0]
     jagged_int_var = output._attrs["shape"][0]
     offsets_struct_type = jagged_int_var.offsets_struct_type()
+    jagged_dynamic_bound_dims = _get_jagged_dynamic_bound_dims(jagged_int_var)
 
     return FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
         num_offsets=len(offsets_list),
         offsets_struct_type=offsets_struct_type,
+        num_jagged_dynamic_bound_dims=len(jagged_dynamic_bound_dims),
+        index_type=backend_spec.index_type,
     )
 
 
@@ -273,6 +333,10 @@ def make_jagged_gen_function_call(func_attrs, indent="  "):
     batch_dim_name = jagged_int_var.batch_dim()._attrs["name"]
     source_first_dim_name = source._attrs["shape"][0]._attrs["name"]
 
+    jagged_dynamic_bound_names = [
+        dim._attrs["name"] for dim in _get_jagged_dynamic_bound_dims(jagged_int_var)
+    ]
+
     return FUNC_CALL_TEMPLATE.render(
         indent="      ",
         func_name=func_name,
@@ -282,4 +346,5 @@ def make_jagged_gen_function_call(func_attrs, indent="  "):
         offsets_data_names=offsets_data_names,
         batch_dim_name=batch_dim_name,
         source_first_dim_name=source_first_dim_name,
+        jagged_dynamic_bound_names=jagged_dynamic_bound_names,
     )
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 1d2606b6b..89676f0db 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -220,21 +220,26 @@ class JaggedDim(Node):
 
     def __init__(
         self,
-        min_value: int,
-        max_value: int,
+        min_value: IntVar,
+        max_value: IntVar,
     ):
         """Initializes a JaggedDim.
 
         Parameters
         ----------
-        min_value : int
+        min_value : IntVar
             Minimum possible value of the jagged dimension.
-        max_value : int
+        max_value : IntVar
             Maximum possible value of the jagged dimension.
         """
-        if min_value < 0:
+        if isinstance(min_value, int):
+            min_value = IntImm(min_value)
+        if isinstance(max_value, int):
+            max_value = IntImm(max_value)
+
+        if min_value.lower_bound() < 0:
             raise ValueError(f"{min_value=}, but must be non-negative.")
-        if min_value > max_value:
+        if min_value.lower_bound() > max_value.upper_bound():
             raise ValueError(f"{min_value=} can't be larger than {max_value=}.")
 
         super().__init__()
@@ -256,11 +261,11 @@ def __str__(self) -> str:
             attrs["offsets"] = {"name": self._attrs["offsets"]._attrs["name"]}
         return str(attrs)
 
-    def min_value(self) -> int:
+    def min_value(self) -> IntVar:
         """The minimum possible value of the JaggedDim."""
         return self._attrs["values"][0]
 
-    def max_value(self) -> int:
+    def max_value(self) -> IntVar:
         """The maximum possible value of the JaggedDim."""
         return self._attrs["values"][1]
 
@@ -427,7 +432,7 @@ def get_max_dense_shape(self) -> List[IntVar]:
         """
         result = [self.batch_dim()]
         for dim in self.jagged_dims():
-            result.append(IntImm(dim.max_value()))
+            result.append(dim.max_value())
         return result
 
 
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 99131090f..215068c71 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -125,6 +125,13 @@ def _mark_isolated_int_vars(sorted_graph: List[Tensor]):
                     int_vars[batch_dim._attrs["name"]] = batch_dim
                     total_length = dim.total_length()
                     int_vars[total_length._attrs["name"]] = total_length
+                    for jagged_dim in dim.jagged_dims():
+                        min_value = jagged_dim.min_value()
+                        if not isinstance(min_value, IntImm):
+                            int_vars[min_value._attrs["name"]] = min_value
+                        max_value = jagged_dim.max_value()
+                        if not isinstance(max_value, IntImm):
+                            int_vars[max_value._attrs["name"]] = max_value
                 if tensor._attrs["is_input"]:
                     int_var_names_in_input_shapes.add(name)
 
diff --git a/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py b/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
index f1b092e3c..9ea46d170 100644
--- a/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
+++ b/python/aitemplate/compiler/ops/gemm_special/batched_dense_vec_jagged_2d_mul.py
@@ -27,12 +27,17 @@
 
 class batched_dense_vec_jagged_2d_mul(Operator):
     """
-    Returns a dense tensor containing batched matrix multiplication of batched vector and batched jagged tensor.
+    Compute a dense tensor containing batched matrix
+    multiplication of a batched dense vector and
+    a batched jagged matrix.
+
     Args:
-        vectors (Tensor): batched vector tensor
-        matrices (Tensor): batched jagged tensor
+        vectors (Tensor): batched dense vector of shape [B, H, N].
+        matrices (Tensor): batched jagged matrix of shape [sum_B(N_B), H, D].
+
     Returns:
-        output (Tensor): a dense tensor containing the batched matrix multiplication result.
+        output (Tensor): dense tensor containing the batched vector /
+        jagged matrix multiplication result of shape [B, H, D].
     """
 
     def __init__(
@@ -46,52 +51,54 @@ def _infer_shape(self, vectors: Tensor, matrices: Tensor) -> List[IntVar]:
         return [jagged_int_var.batch_dim(), matrices.shape()[1], matrices.shape()[2]]
 
     def __call__(self, vectors: Tensor, matrices: Tensor) -> Tensor:
-        # Check matrices is jagged tensor
         if not matrices.is_jagged():
-            matrices_name = matrices._attrs["name"]
-            raise RuntimeError(
-                f"Input tensor {matrices_name} is expected to be jagged, but actually dense."
+            raise TypeError(
+                f"matrices must be a jagged Tensor, but got a dense Tensor {matrices}."
+            )
+        if vectors.is_jagged():
+            raise TypeError(
+                f"vectors must be a jagged Tensor, but got a jagged Tensor {vectors}."
             )
 
-        # Check input tensor's dimension is 3
         if len(vectors.shape()) != 3:
-            vectors_name = vectors._attrs["name"]
-            raise RuntimeError(f"Input tensor {vectors_name} dim should be 3.")
+            raise ValueError(f"vectors must be rank-3, but got {vectors}.")
 
         if len(matrices.shape()) != 3:
-            matrices_name = matrices._attrs["name"]
-            raise RuntimeError(f"Input tensor {matrices_name} dim should be 3.")
+            raise ValueError(f"matrices must be rank-3, but got {matrices}.")
 
         jagged_int_var = matrices.shape()[0]
-        # Check first dim B
         if jagged_int_var.batch_dim() != vectors.shape()[0]:
             raise RuntimeError(
-                f"Batch dim B of input tensors are expected to be the same, but actually first is {vectors.shape()[0]} and second is {jagged_int_var.batch_dim()}."
+                "The batch dim B of the jagged matrices tensor and "
+                "dense vectors tensor must be the same, but got "
+                f"{jagged_int_var.batch_dim()=} != {vectors.shape()[0]=}."
             )
 
-        # Check second dim H
         if vectors.shape()[1] != matrices.shape()[1]:
             raise RuntimeError(
-                f"Second dim H of input tensors are expected to be the same, but actually first is {vectors.shape()[1]} and second is {matrices.shape()[1]}."
+                f"The second dim H of the jagged matrices tensor and "
+                "dense vectors tensor must be the same, but got "
+                f"{matrices.shape()[1]=} != {vectors.shape()[1]}."
             )
 
-        # Check tensor types
         if vectors.dtype() != matrices.dtype():
             raise RuntimeError(
-                f"Input tensors sare expected to have the same type, but actually first is {vectors.dtype()} and second is {matrices.dtype()}."
+                "vectors and matrices must have the same type, but got "
+                f"{vectors.dtype()=} != {matrices.dtype()=}."
             )
 
-        # Check Jagged dims
-        num_jagged_dims = len(jagged_int_var.jagged_dims())
-        if num_jagged_dims != 1:
+        if len(jagged_int_var.jagged_dims()) != 1:
             raise RuntimeError(
-                f"Jagged dims for second jagged inputs should be 1, but actually is {num_jagged_dims}."
+                "Jagged matrices tensor must have a "
+                f"single JaggedDim, but got {matrices}."
             )
         else:
-            jagged_max_values = jagged_int_var.jagged_dims()[0].max_value()
-            if jagged_max_values != vectors.shape()[2].value():
+            max_value = jagged_int_var.jagged_dims()[0].max_value()
+            if max_value != vectors.shape()[2]:
                 raise RuntimeError(
-                    f"max value is expected to be {vectors.shape()[2].value()} , but actually is {jagged_max_values}."
+                    "Upper bound (max_value) of the jagged dim in matrices "
+                    "must be equal to the last dim N in vectors, but got "
+                    f"{max_value=} != {vectors.shape()[2].value()=}."
                 )
 
         self._attrs["inputs"] = [vectors, matrices]
diff --git a/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
index 95bec33eb..fcfd6b181 100644
--- a/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
+++ b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
@@ -20,7 +20,7 @@
 
 from aitemplate.backend import registry
 from aitemplate.backend.target import Target
-from aitemplate.compiler.base import IntImm, IntVar, JaggedDim, Operator, Tensor
+from aitemplate.compiler.base import IntVar, JaggedDim, Operator, Tensor
 from aitemplate.compiler.ops import make_jagged
 
 
@@ -86,13 +86,6 @@ def __call__(
             raise TypeError(
                 f"x.shape()[0] must be IntVar, but got {type(x_shape[0]).__name__}."
             )
-        for i, dim in enumerate(x_shape[1 : 1 + len(offsets_list)], start=1):
-            if not isinstance(dim, IntImm):
-                raise TypeError(
-                    "All sequence dimensions in the x.shape() (corresponding to the "
-                    "jagged dimensions of the output jagged Tensor) must be IntImm, "
-                    f"but got type(x_shape()[{i}]) == {type(dim).__name__}."
-                )
 
         self._attrs["inputs"] = [x, *offsets_list]
         self._set_depth()
@@ -114,7 +107,7 @@ def __call__(
         jagged_output = make_jagged(
             batch_dim=x_shape[0],
             jagged_dims=[
-                JaggedDim(min_value=0, max_value=dim.value())
+                JaggedDim(min_value=0, max_value=dim)
                 for dim in x_shape[1 : 1 + len(offsets_list)]
             ],
         )(
diff --git a/python/aitemplate/testing/jagged_utils.py b/python/aitemplate/testing/jagged_utils.py
index dd1a35ed4..f8f4cb3d9 100644
--- a/python/aitemplate/testing/jagged_utils.py
+++ b/python/aitemplate/testing/jagged_utils.py
@@ -373,3 +373,31 @@ def batched_dense_vec_jagged_2d_mul_ref(
     ).squeeze(
         dim=2
     )  # [B, H, D]
+
+
+def add_jagged_dense_ref(
+    jagged: torch.Tensor,
+    offsets_list: List[torch.Tensor],
+    dense: torch.Tensor,
+    jagged_max_shape: List[int] = None,
+) -> torch.Tensor:
+    """The reference function for jagged / dense elementwise add."""
+    if jagged_max_shape is None:
+        jagged_max_shape = dense.shape
+
+    assert len(jagged.shape) + len(offsets_list) >= len(dense.shape)
+    assert len(jagged_max_shape) == len(jagged.shape) + len(offsets_list)
+
+    return dense_to_jagged(
+        dense=(
+            dense
+            + jagged_to_dense(
+                jagged=jagged,
+                offsets_list=offsets_list,
+                dense_shape=jagged_max_shape,
+                padding_value=0.0,
+            )
+        ),
+        offsets_list=offsets_list,
+        padding_value=-1.0,
+    )
diff --git a/tests/unittest/ops/test_jagged_elementwise.py b/tests/unittest/ops/test_jagged_elementwise.py
index 60232d8ac..0da6a467f 100644
--- a/tests/unittest/ops/test_jagged_elementwise.py
+++ b/tests/unittest/ops/test_jagged_elementwise.py
@@ -25,44 +25,12 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.jagged_utils import (
-    dense_to_jagged,
-    generate_offsets,
-    jagged_to_dense,
-)
+from aitemplate.testing.jagged_utils import add_jagged_dense_ref, generate_offsets
 from aitemplate.testing.test_utils import get_random_torch_tensor
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 from parameterized import param, parameterized
 
 
-def _add_jagged_dense_ref(
-    jagged: torch.Tensor,
-    offsets_list: List[torch.Tensor],
-    dense: torch.Tensor,
-    jagged_max_shape: List[int] = None,
-) -> torch.Tensor:
-    """The reference function for jagged / dense elementwise add."""
-    if jagged_max_shape is None:
-        jagged_max_shape = dense.shape
-
-    assert len(jagged.shape) + len(offsets_list) >= len(dense.shape)
-    assert len(jagged_max_shape) == len(jagged.shape) + len(offsets_list)
-
-    return dense_to_jagged(
-        dense=(
-            dense
-            + jagged_to_dense(
-                jagged=jagged,
-                offsets_list=offsets_list,
-                dense_shape=jagged_max_shape,
-                padding_value=0.0,
-            )
-        ),
-        offsets_list=offsets_list,
-        padding_value=-1.0,
-    )
-
-
 class JaggedElementwiseTestCase(unittest.TestCase):
     def _test_jagged_dense_elementwise_add(
         self,
@@ -156,7 +124,7 @@ def _test_jagged_dense_elementwise_add(
         }
         source_pt = get_random_torch_tensor(jagged_input_shape, dtype)
         dense_pt = get_random_torch_tensor(dense_shape, dtype)
-        result_pt = _add_jagged_dense_ref(
+        result_pt = add_jagged_dense_ref(
             jagged=source_pt,
             offsets_list=list(offsets_pt.values()),
             jagged_max_shape=jagged_max_shape,
diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
index 63dd7a148..f68d43357 100644
--- a/tests/unittest/ops/test_make_jagged.py
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -18,12 +18,15 @@
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import JaggedDim, JaggedIntVar
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.jagged_utils import add_jagged_dense_ref
 from aitemplate.testing.test_utils import (
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 class MakeJaggedTestCase(unittest.TestCase):
@@ -109,6 +112,103 @@ def test_make_jagged(
         torch.testing.assert_close(y, x_pt)
         torch.testing.assert_close(z, z_pt)
 
+    def test_make_jagged_with_dynamic_bounds(
+        self,
+        dtype="float16",
+        offsets_dtype="int32",
+    ):
+        B = 4
+        N_min = 1
+        N_max = 32
+        N = 3
+        D = 64
+
+        batch_dim = IntVar(name="batch_size", values=[1, B])
+        max_seq_dim = IntVar(name="max_seq_len", values=[N_min, N_max])
+        embedding_dim = IntImm(name="embedding", value=D)
+
+        total_length_dim = IntVar(name="total_length", values=[0, B * N_max])
+        offsets_dim = IntVar(name="offsets_size", values=[2, B + 1])
+
+        SOURCE = Tensor(
+            shape=[
+                total_length_dim,
+                embedding_dim,
+            ],
+            name="source",
+            dtype=dtype,
+            is_input=True,
+        )
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        DENSE = Tensor(
+            shape=[
+                batch_dim,
+                max_seq_dim,
+                embedding_dim,
+            ],
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        JAGGED = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[
+                JaggedDim(
+                    min_value=0,
+                    max_value=max_seq_dim,
+                )
+            ],
+        )(
+            source=SOURCE,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, DENSE)
+
+        assert not SOURCE.is_jagged()
+        assert not DENSE.is_jagged()
+        assert JAGGED.is_jagged()
+        assert RESULT.is_jagged()
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            "test_make_jagged_with_dynamic_bounds",
+        )
+
+        offsets = [0, 1, 4, 6, 7]
+        torch_offsets_type = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = torch.tensor(offsets, dtype=torch_offsets_type).cuda()
+        source_pt = get_random_torch_tensor([offsets[-1], D], dtype=dtype)
+        dense_pt = get_random_torch_tensor([B, N, D], dtype=dtype)
+
+        result_pt = add_jagged_dense_ref(
+            jagged=source_pt,
+            offsets_list=[offsets_pt],
+            jagged_max_shape=[B, N, D],
+            dense=dense_pt,
+        )
+        result = torch.empty_like(result_pt)
+
+        inputs = {"source": source_pt, "offsets": offsets_pt, "dense": dense_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From d22178f035dd26d3e691b66b4f58ca1c75339d0a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Sun, 19 Mar 2023 16:40:44 -0700
Subject: [PATCH 294/638] add knob to turn off optimization passes (#443)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/443

in preparation for benchmarks, add an option to run the original model without optimizations

Reviewed By: chenyang78

Differential Revision: D44154322

fbshipit-source-id: 39db560cf15f3c524d80b3f00c3fe41e3fb4cf98
---
 .../aitemplate/compiler/transform/fuse_ops.py | 41 +++++++++++-
 .../compiler/transform/optimize_graph.py      | 26 ++++++--
 .../test_fused_elementwise_singleton.py       | 64 +++++++++++++++++++
 3 files changed, 123 insertions(+), 8 deletions(-)
 create mode 100644 tests/unittest/compiler/test_fused_elementwise_singleton.py

diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index 3dd39e3a3..37b57fab6 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -300,7 +300,7 @@ def _detect_cycle(group: List[Operator]) -> bool:
     return False
 
 
-def _fuse_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
+def fuse_elementwise(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
     """
     Given a sorted graph, returns a sorted graph with fused_elementwise ops on fusable elementwise ops.
     """
@@ -311,7 +311,10 @@ def _fuse_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
             continue
         src_op = list(src_ops)[0]
         if src_op._attrs["op"] == "elementwise":
-            disjoint_set.add(src_op, _find_fusable_elementwise_ops(src_op))
+            disjoint_set.add(
+                src_op,
+                _find_fusable_elementwise_ops(src_op),
+            )
 
     to_be_fused_op_groups = disjoint_set.get_node_groups()
 
@@ -327,6 +330,39 @@ def _fuse_elementwise(sorted_graph: List[Tensor]) -> List[Tensor]:
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
 
+def process_singleton_elementwise(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """
+    A dummy pass which enables codegen for any elementwise op without fusing it with neighbors
+    """
+    disjoint_set = SimpleDisjointSet()
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if src_ops is None or len(src_ops) != 1:
+            continue
+        src_op = list(src_ops)[0]
+        if src_op._attrs["op"] == "elementwise":
+            disjoint_set.add(
+                src_op,
+                {src_op},
+            )
+
+    to_be_fused_op_groups = disjoint_set.get_node_groups()
+
+    for ops in to_be_fused_op_groups:
+        # Partition subgraph based on output shape.
+        # output_op_map = {op._attrs["op"]: set(op) for op in ops}
+        output_op_map = _partition_subgraphs(ops)
+        # Collect information to create fuse ops.
+        info_list = _collect_info(output_op_map, set(ops), sorted_graph)
+        # Create fuse ops.
+        _create_fuse_ops(info_list)
+
+    sorted_graph = toposort(sorted_graph)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
 def _fuse_layernorm_sigmoid_mul(sorted_graph: List[Tensor]) -> List[Tensor]:
     to_be_fused_op_groups = []
     for tensor in sorted_graph:
@@ -396,7 +432,6 @@ def fuse_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
     funcs = [
         _fuse_layernorm_sigmoid_mul,
         _fuse_groupnorm_sigmoid_mul,
-        _fuse_elementwise,  # this pass should be left in the last one
     ]
     for func in funcs:
         sorted_graph = func(sorted_graph)
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index c227c2b14..b5687b74a 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -15,7 +15,6 @@
 """
 Applies graph transformations.
 """
-
 from typing import List
 
 from aitemplate.compiler.base import Tensor
@@ -26,7 +25,11 @@
 from aitemplate.compiler.transform.fuse_mm_reshape_permute import (
     fuse_mm_reshape_permute,
 )
-from aitemplate.compiler.transform.fuse_ops import fuse_ops
+from aitemplate.compiler.transform.fuse_ops import (
+    fuse_elementwise,
+    fuse_ops,
+    process_singleton_elementwise,
+)
 from aitemplate.compiler.transform.fuse_parallel_gemms import fuse_parallel_gemms
 from aitemplate.compiler.transform.fuse_permute_bmm_and_gemm import (
     fuse_permute_bmm_and_gemm,
@@ -49,7 +52,9 @@
 from aitemplate.utils import graph_utils
 
 
-def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
+def optimize_graph(
+    sorted_graph: List[Tensor], workdir: str, optimize=True
+) -> List[Tensor]:
     """Applies graph optimizations, including
 
     - fuse permute and bmm
@@ -86,6 +91,7 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         fuse_mm_reshape_permute,
         transform_memory_ops,
         fuse_ops,
+        fuse_elementwise,
         # need to run before transform_strided_ops to fuse strided ops + concat
         # and transform_memory_ops to fuse split + concat
         fuse_parallel_gemms,
@@ -102,8 +108,18 @@ def optimize_graph(sorted_graph: List[Tensor], workdir: str) -> List[Tensor]:
         transform_memory_ops,
     ]
 
-    for func in funcs:
+    if not optimize:
+        # 1 - Convert elementwise ops to singleton fused_elementwise ops
+        # 2 - Padding also needs to be done for the model to be executable.
+        funcs = [
+            process_singleton_elementwise,
+            apply_padding,
+        ]
+
+    for i, func in enumerate(funcs):
         sorted_graph = func(sorted_graph, workdir)
-        graph_utils.dump_graph_debug_str_to_file(sorted_graph, workdir, func.__name__)
+        graph_utils.dump_graph_debug_str_to_file(
+            sorted_graph, workdir, f"{i:02}-{func.__name__}"
+        )
 
     return sorted_graph
diff --git a/tests/unittest/compiler/test_fused_elementwise_singleton.py b/tests/unittest/compiler/test_fused_elementwise_singleton.py
new file mode 100644
index 000000000..4c0fc241c
--- /dev/null
+++ b/tests/unittest/compiler/test_fused_elementwise_singleton.py
@@ -0,0 +1,64 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+from aitemplate import compiler
+
+from aitemplate.compiler import ops
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.transform.fuse_ops import (
+    fuse_elementwise,
+    process_singleton_elementwise,
+)
+from aitemplate.testing import detect_target
+
+
+def _make_graph():
+    X0 = Tensor(
+        shape=[3, 5, 7, 9],
+        dtype="float16",
+        name="X0",
+        is_input=True,
+    )
+
+    Y = ops.elementwise(FuncEnum.ABS)(ops.elementwise(FuncEnum.SIN)(X0))
+
+    Y._attrs["is_output"] = True
+    Y._attrs["name"] = "Y"
+    return Y
+
+
+class FusedElementwiseSingletonTestCase(unittest.TestCase):
+    def test_singleton_elementwise(self):
+        Y = _make_graph()
+
+        with detect_target():
+            graph = compiler.transform.toposort(Y)
+            compiler.transform.name_graph(graph)
+            g1 = process_singleton_elementwise(graph)
+
+        self.assertEqual(3, len(g1))  # x, sin(x), abs(sin(x))
+
+    def test_fused_elementwise(self):
+        Y = _make_graph()
+
+        with detect_target():
+            graph = compiler.transform.toposort(Y)
+            compiler.transform.name_graph(graph)
+            g1 = fuse_elementwise(graph)
+
+        self.assertEqual(2, len(g1))  # x, abs(sin(x))

From d0c8b136603e5436ae7c206e6fda42e8e718568f Mon Sep 17 00:00:00 2001
From: Cheng Cai <chengcai@meta.com>
Date: Mon, 20 Mar 2023 14:40:06 -0700
Subject: [PATCH 295/638] swap MultiScaleBlock (#376)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/376

Module swap for MultiScaleBlock

Reviewed By: mortzur

Differential Revision: D43739657

fbshipit-source-id: 615f70d5c2820cfac8931b4bc9e949ce4ddbc8dc
---
 python/aitemplate/frontend/nn/multiscale_attention.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index ebc2b2d87..bd6c6c7ea 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -684,13 +684,15 @@ def __init__(
             self.pool_skip, has_cls_embed=self.has_cls_embed, norm=None
         )
 
-    def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
+    def forward(
+        self, x: Tensor, t_shape: int, h_shape: int, w_shape: int
+    ) -> Tuple[Tensor, List[int]]:
         """
         Args:
             x (Tensor): Input tensor.
             thw_shape (List): The shape of the input tensor (before flattening).
         """
-
+        thw_shape = [t_shape, h_shape, w_shape]
         x_block, thw_shape_new = self.attn(x, thw_shape)
 
         x_res, _ = self._attention_pool(x, thw_shape)

From c0a751ec3659f87982d5aff91dc2d6870edcf9ef Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 21 Mar 2023 13:31:39 +0800
Subject: [PATCH 296/638] format code

---
 python/aitemplate/backend/codegen.py          |  6 +++++-
 .../backend/cuda/conv2d/conv2d_depthwise.py   |  1 -
 .../aitemplate/backend/rocm/conv2d/common.py  |  1 -
 .../backend/rocm/gemm/gemm_rcr_bias_swish.py  |  1 +
 .../backend/rocm/normalization/norm_common.py |  1 -
 python/aitemplate/backend/task_runner.py      |  1 +
 python/aitemplate/compiler/model.py           | 21 ++++++++++++++++---
 .../ops/conv/common_conv2d_bias_activation.py |  1 +
 python/aitemplate/compiler/ops/conv/conv2d.py |  1 +
 .../ops/conv/conv2d_bias_few_channels.py      |  1 +
 .../conv2d_bias_hardswish_few_channels.py     |  1 +
 .../ops/conv/conv2d_bias_relu_few_channels.py |  1 +
 .../compiler/ops/conv/conv2d_depthwise.py     |  1 +
 .../ops/conv/conv2d_depthwise_bias.py         |  1 +
 python/aitemplate/compiler/ops/conv/conv3d.py |  1 +
 .../conv/special_conv2d_bias_activation.py    |  1 +
 .../compiler/ops/conv/transposed_conv2d.py    |  1 +
 .../ops/conv/transposed_conv2d_bias.py        |  1 +
 .../ops/conv/transposed_conv2d_bias_relu.py   |  1 +
 .../compiler/ops/layernorm/group_layernorm.py |  2 +-
 .../compiler/ops/layernorm/layernorm.py       |  2 +-
 python/aitemplate/compiler/tensor_accessor.py |  2 +-
 .../compiler/transform/fuse_group_ops.py      |  1 +
 .../aitemplate/compiler/transform/profile.py  |  1 -
 .../transform/split_large_split_ops.py        |  2 +-
 .../transform/transform_special_ops.py        |  3 +--
 .../transform_strided_op_and_view_op.py       |  2 --
 python/aitemplate/frontend/nn/module.py       |  1 -
 .../frontend/nn/multiscale_attention.py       |  3 ---
 .../utils/mk_ck_lib/conv2d_operation.py       |  1 -
 .../aitemplate/utils/mk_ck_lib/generator.py   |  3 +--
 python/aitemplate/utils/mk_ck_lib/library.py  |  1 +
 python/aitemplate/utils/mk_ck_lib/manifest.py |  1 -
 python/aitemplate/utils/torch_utils.py        |  4 ++--
 tests/unittest/backend/test_profiler.py       |  2 +-
 .../benchmark/test_group_gemm_benchmark.py    |  2 +-
 .../test_strided_layernorm_benchmark.py       |  5 ++++-
 .../compiler/test_slice_gemm_fusion.py        |  1 -
 .../compiler/test_slice_view_strided.py       |  5 -----
 .../compiler/test_strided_layernorm.py        |  1 -
 .../unittest/compiler/test_strided_view_op.py | 18 ++++++++--------
 tests/unittest/frontend/test_module.py        |  8 +++----
 tests/unittest/ops/test_batch_gather.py       |  2 --
 tests/unittest/ops/test_bmm.py                | 12 +++++------
 tests/unittest/ops/test_bmm_permute.py        |  4 ++--
 tests/unittest/ops/test_bmm_softmax.py        |  1 -
 tests/unittest/ops/test_bmm_softmax_bmm.py    |  4 ++--
 tests/unittest/ops/test_efficient_nms.py      |  4 +---
 tests/unittest/ops/test_nms.py                |  4 +---
 tests/unittest/ops/test_topk.py               |  1 -
 50 files changed, 78 insertions(+), 69 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 7ff934a58..c15e64d74 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -374,7 +374,11 @@ def __init__(
         self.graph = graph
 
         self.num_inputs, self.num_outputs = count_inputs_outputs(graph)
-        (self.max_blob_size, self.max_constant_blob_size, self.workspace,) = (
+        (
+            self.max_blob_size,
+            self.max_constant_blob_size,
+            self.workspace,
+        ) = (
             max_blob_size,
             max_constant_blob_size,
             workspace,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
index 1b5de0758..b7097e417 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
@@ -85,7 +85,6 @@ def f_proc_op_special(op):
             and op.accumulator_type() == acc_type
             and op.group_mode == cutlass_lib.library.GroupMode.NoneGroup
         ):
-
             op = copy.deepcopy(op)
             # set epilogue
             epilogue_name = func_attrs["epilogue"]
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 03250e7f5..f2d5da5b9 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -570,7 +570,6 @@ def gen_profiler(
     src_template=SRC_TEMPLATE,
     prob_args_template=PROBLEM_ARGS_TEMPLATE,
 ):
-
     """Generates standalone executables for profiler.
 
     Parameters
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
index 21d8664ce..218a6e6b5 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
@@ -20,6 +20,7 @@
 """
 
 import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.rocm.gemm import common
 from aitemplate.backend.rocm.gemm.layout import RCR
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index 7a65ebed4..c0b5796c7 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -340,7 +340,6 @@ def gen_profiler(
     op_instance = func_attrs["op_instance"]
     file_pairs = []
     for op_name, op in op_instance.items():
-
         config = emit_instance(op)
         config_name = extract_config_name(config)
         instances = INSTANCE_TEMPLATE.render(
diff --git a/python/aitemplate/backend/task_runner.py b/python/aitemplate/backend/task_runner.py
index a4714715b..ac8f5eb3d 100644
--- a/python/aitemplate/backend/task_runner.py
+++ b/python/aitemplate/backend/task_runner.py
@@ -24,6 +24,7 @@
 import typing
 from collections import OrderedDict
 
+
 # pylint: disable=R1732,R1710,R1721
 class Task(object):
     """Task is an object containing a bash command,
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 420a567c8..9a9de0190 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -367,7 +367,12 @@ def _run_impl(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
+        (
+            c_inputs,
+            c_outputs,
+            c_stream,
+            c_output_shapes_out,
+        ) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
@@ -451,7 +456,12 @@ def profile(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
+        (
+            c_inputs,
+            c_outputs,
+            c_stream,
+            c_output_shapes_out,
+        ) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
@@ -607,7 +617,12 @@ def benchmark(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
+        (
+            c_inputs,
+            c_outputs,
+            c_stream,
+            c_output_shapes_out,
+        ) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index fae05dad9..ce2024559 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
+
 # pylint: disable=C0103
 class conv2d_bias_activation(conv2d):
     """Base class of conv2d with bias + activation."""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index f8378b12e..6e3961656 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -533,6 +533,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             best_algo = out[0]
         else:
             from operator import itemgetter
+
             out = min(result, key=itemgetter(1))
             best_algo = out[1].op_config
         workspace = out[1].workspace
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
index 71a17819f..fb34f4625 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py
@@ -19,6 +19,7 @@
     special_conv2d_bias_activation,
 )
 
+
 # pylint: disable=C0103
 class conv2d_bias_few_channels(special_conv2d_bias_activation):
     """conv2d_bias_few_channels.
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
index bbe2a879c..ac79c62ac 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py
@@ -19,6 +19,7 @@
     special_conv2d_bias_activation,
 )
 
+
 # pylint: disable=C0103
 class conv2d_bias_hardswish_few_channels(special_conv2d_bias_activation):
     """conv2d_bias_hardswish_few_channels.
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
index 84b9bea70..d915b80fe 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py
@@ -19,6 +19,7 @@
     special_conv2d_bias_activation,
 )
 
+
 # pylint: disable=C0103
 class conv2d_bias_relu_few_channels(special_conv2d_bias_activation):
     """conv2d_bias_relu_few_channels.
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
index d191dfb95..ca2117e05 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
+
 # pylint: disable=C0103
 class conv2d_depthwise(conv2d):
     """Base class of conv2d with groups."""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
index c6d026b84..505f0b976 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d_depthwise_bias.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
+
 # pylint: disable=C0103
 class conv2d_depthwise_bias(conv2d):
     """Base class of conv2d with groups."""
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 612597c48..e458ce15f 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -547,6 +547,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             best_algo = out[0]
         else:
             from operator import itemgetter
+
             out = min(result, key=itemgetter(1))
             best_algo = out[1].op_config
         workspace = out[1].workspace
diff --git a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
index 4be6e637a..f0b402820 100644
--- a/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py
@@ -19,6 +19,7 @@
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 from aitemplate.compiler.ops.padding import nhwc3to4, nhwc3to8
 
+
 # pylint: disable=C0103
 class special_conv2d_bias_activation(conv2d):
     """Special_conv2d_bias_activation.
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
index 533b7e88a..cd08f5b9c 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
@@ -49,6 +49,7 @@
 """
 )
 
+
 # pylint: disable=C0103
 class transposed_conv2d(conv2d):
     r"""Transposed conv2d.
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
index c3423eb7c..caeb948dc 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py
@@ -25,6 +25,7 @@
 )
 from aitemplate.compiler.ops.conv.transposed_conv2d import transposed_conv2d
 
+
 # pylint: disable=C0103
 class transposed_conv2d_bias(transposed_conv2d):
     r"""Transposed conv2d with bias.
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
index 2f0a57de6..81ea0f61e 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py
@@ -17,6 +17,7 @@
 """
 from aitemplate.compiler.ops.conv.transposed_conv2d_bias import transposed_conv2d_bias
 
+
 # pylint: disable=C0103
 class transposed_conv2d_bias_relu(transposed_conv2d_bias):
     r"""Transposed conv2d with bias + relu.
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
index aee3458e5..dc39712bb 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
@@ -64,7 +64,7 @@ def _sanity_check(self, all_inputs):
             == len(self._attrs["normalized_shape"])
         )
 
-        for (x, gamma, beta, normalized_shape) in zip(
+        for x, gamma, beta, normalized_shape in zip(
             inputs, gammas, betas, self._attrs["normalized_shape"]
         ):
             (x_shape, gamma_shape, beta_shape) = layernorm.get_input_shapes(
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm.py b/python/aitemplate/compiler/ops/layernorm/layernorm.py
index f97eb74cb..2882ad835 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm.py
@@ -133,7 +133,7 @@ def _sanity_check(self, x, gamma, beta):
         (x_shape, gamma_shape, beta_shape) = layernorm.get_input_shapes(x, gamma, beta)
 
         expected_dtype = x.dtype()
-        for (param, name) in ((gamma, "gamma"), (beta, "beta")):
+        for param, name in ((gamma, "gamma"), (beta, "beta")):
             if param is not None and param.dtype() != expected_dtype:
                 raise NotImplementedError(
                     f"Layernorm doesn't support type promotions; expected {expected_dtype} but got {name} with dtype {param.dtype()}"
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index d248d526b..a19e2c1fb 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -262,7 +262,7 @@ def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
         # Loop through self._dim_mapping to generate stride_strs.
         found_original_dim_group = False
         res = []
-        for (original_group, actual_group) in self._dim_mapping:
+        for original_group, actual_group in self._dim_mapping:
             if not found_original_dim_group:
                 if dim in original_group:
                     found_original_dim_group = True
diff --git a/python/aitemplate/compiler/transform/fuse_group_ops.py b/python/aitemplate/compiler/transform/fuse_group_ops.py
index 48ff13cc3..455f83459 100644
--- a/python/aitemplate/compiler/transform/fuse_group_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_group_ops.py
@@ -397,6 +397,7 @@ def _get_sorted_candidate_ops(
 # the arguments to gpu memory with sync memcpy, which is bad for perf
 _MAX_LAYERNORM_GROUP = 39
 
+
 # TODO: remove after switching to async copy for group layernorm args
 def _break_layernorm_groups(group: List[Operator]) -> List[List[Operator]]:
     if len(group) <= _MAX_LAYERNORM_GROUP:
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 54b8ca79c..35232f80c 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -56,7 +56,6 @@ def profile(
     devices=None,
     dynamic_profiling_strategy=DynamicProfileStrategy.MAX,
 ):
-
     """Profiles kernels.
 
     Parameters
diff --git a/python/aitemplate/compiler/transform/split_large_split_ops.py b/python/aitemplate/compiler/transform/split_large_split_ops.py
index 8b0323cb8..beab11ca6 100644
--- a/python/aitemplate/compiler/transform/split_large_split_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_split_ops.py
@@ -101,7 +101,7 @@ def split_large_split_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
             sorted_graph += list(new_outputs)
             output_mapping += list(zip(outputs[start:end], new_outputs))
 
-        for (old_output, new_output) in output_mapping:
+        for old_output, new_output in output_mapping:
             transform_utils.replace_tensor(old_output, new_output)
 
     if not modified:
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index 9961a8928..41577cec3 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -18,8 +18,7 @@
 """
 from typing import Callable, List, Tuple, Type, Union
 
-from aitemplate.utils.shape_utils import is_singleton_dimension
-from ...backend.target import Target
+from aitemplate.backend.target import Target
 
 from aitemplate.compiler import ops
 from aitemplate.compiler.base import Operator, Tensor
diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index 64e6ab085..b7be3a6ba 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -29,8 +29,6 @@
 
 
 def _is_supported_strided_op(op: Operator) -> bool:
-    from ...backend.target import Target
-
     op_kind = op._attrs["op"]
     return not op_kind.startswith("group_gemm")
 
diff --git a/python/aitemplate/frontend/nn/module.py b/python/aitemplate/frontend/nn/module.py
index c51a49db9..391d9d5d7 100644
--- a/python/aitemplate/frontend/nn/module.py
+++ b/python/aitemplate/frontend/nn/module.py
@@ -296,7 +296,6 @@ def get_submodule(self, target: str) -> "Module":
         mod: Module = self
 
         for item in atoms:
-
             if not hasattr(mod, item):
                 raise AttributeError(
                     mod._get_name() + " has no " "attribute `" + item + "`"
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index ebc2b2d87..bb06112bf 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -208,7 +208,6 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         tensor = ops.reshape()(tensor, [B, N, L_pooled, C])
 
         if self.has_norm and not self.norm_before_pool:
-
             # TODO: add support for norm before pool
             # tensor = self.norm(tensor)
             _LOGGER.warning("Unsupport norm before pool")
@@ -347,7 +346,6 @@ def __init__(
         ## TODO: add pool mode support for {"max", "avg"}
 
         elif pool_mode == "conv":
-
             self.pool_q = (
                 Conv3d(
                     head_dim,
@@ -513,7 +511,6 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
             q, k, v = self._reshape_qkv_to_seq(q, k, v, q_N, v_N, k_N, B, C)
             q, k, v = self._qkv_proj(q, q_N, k, k_N, v, v_N, B, C)
         else:
-
             if self.separate_qkv:
                 q = k = v = x
                 pass
diff --git a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
index 568035e0a..4c46deeb2 100644
--- a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
@@ -266,7 +266,6 @@ def accumulator_type(self):
         return library.DataType.f32
 
     def emit(self) -> str:
-
         template = jinja2.Template(
             """
 using {{name}} = {{xdl_op_type}}<
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index a6b675be4..e8f89f666 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -24,6 +24,7 @@
     softmax_operation as softmax,
 )
 
+
 ###########################################################################################################
 # Convolution for 2D Fwd operations
 def CreateConv2dFwdOperator(manifest, operation_kind, out_element_op, out_data_op=""):
@@ -1390,7 +1391,6 @@ def CreateBmmSoftmaxBmmOperator(
     ]
     c_block_descriptions, b1_block_descriptions = [], []
     for i in range(len(tile_descriptions)):
-
         if i in [0, 2, 4, 5, 9, 11]:
             block_transfer = [16, 16, 1]
         else:
@@ -1505,7 +1505,6 @@ def CreateBmmSoftmaxBmmPermOperator(
 
     c_block_descriptions, b1_block_descriptions = [], []
     for i in range(len(tile_descriptions)):
-
         if i in [0, 2, 4, 5, 9, 11]:
             block_transfer = [16, 16, 1]
         else:
diff --git a/python/aitemplate/utils/mk_ck_lib/library.py b/python/aitemplate/utils/mk_ck_lib/library.py
index e4de8af38..4b6a357b9 100644
--- a/python/aitemplate/utils/mk_ck_lib/library.py
+++ b/python/aitemplate/utils/mk_ck_lib/library.py
@@ -201,6 +201,7 @@ class LayoutType(enum.Enum):
     LayoutType.GNWK: "GNWK",
 }
 
+
 #
 class OperationKind(enum.Enum):
     Gemm = auto()
diff --git a/python/aitemplate/utils/mk_ck_lib/manifest.py b/python/aitemplate/utils/mk_ck_lib/manifest.py
index 077ee9103..c572737d8 100644
--- a/python/aitemplate/utils/mk_ck_lib/manifest.py
+++ b/python/aitemplate/utils/mk_ck_lib/manifest.py
@@ -87,7 +87,6 @@ def get_kernel_filters(self, kernelListFile):
             return []
 
     def filter_out_kernels(self, kernel_name, kernel_filter_list):
-
         for kernel_filter_re in kernel_filter_list:
             if kernel_filter_re.search(kernel_name) is not None:
                 return True
diff --git a/python/aitemplate/utils/torch_utils.py b/python/aitemplate/utils/torch_utils.py
index 078d40557..387d9ed28 100644
--- a/python/aitemplate/utils/torch_utils.py
+++ b/python/aitemplate/utils/torch_utils.py
@@ -34,7 +34,7 @@ def types_mapping():
 
 
 def torch_dtype_to_string(dtype):
-    for (torch_dtype, ait_dtype) in types_mapping():
+    for torch_dtype, ait_dtype in types_mapping():
         if dtype == torch_dtype:
             return ait_dtype
     raise ValueError(
@@ -49,7 +49,7 @@ def string_to_torch_dtype(string_dtype):
         # handling None is useful here.
         return None
 
-    for (torch_dtype, ait_dtype) in types_mapping():
+    for torch_dtype, ait_dtype in types_mapping():
         if string_dtype == ait_dtype:
             return torch_dtype
     raise ValueError(
diff --git a/tests/unittest/backend/test_profiler.py b/tests/unittest/backend/test_profiler.py
index 3e2e6f3f3..438df946d 100644
--- a/tests/unittest/backend/test_profiler.py
+++ b/tests/unittest/backend/test_profiler.py
@@ -68,4 +68,4 @@ def test_profiler_runner(self):
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/unittest/benchmark/test_group_gemm_benchmark.py b/tests/unittest/benchmark/test_group_gemm_benchmark.py
index 03b0034fc..7222ae4ef 100644
--- a/tests/unittest/benchmark/test_group_gemm_benchmark.py
+++ b/tests/unittest/benchmark/test_group_gemm_benchmark.py
@@ -84,7 +84,7 @@ def _prepare_inputs(m, nk_groups, repeats=10, has_bias=True):
     inputs = []
     for _ in range(repeats):
         inputs.append([])
-        for (n, k) in nk_groups:
+        for n, k in nk_groups:
             x_pt = torch.randn(m, k).half().cuda()
             w_pt = torch.randn(n, k).half().cuda()
             b_pt = torch.randn(n).half().cuda()
diff --git a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
index 18d751a07..b7c32c2be 100644
--- a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
+++ b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
@@ -34,7 +34,10 @@ def __init__(self, *args, **kwargs):
 
     @unittest.skipIf(detect_target().in_ci_env(), "don't run benchmark in CI")
     def test_benchmark(self):
-        for (input_nonbatch_shape, (start_indices, end_indices),) in itertools.product(
+        for (
+            input_nonbatch_shape,
+            (start_indices, end_indices),
+        ) in itertools.product(
             ((2048, 256), (2048, 512), (2048, 1024), (2048, 2048)),
             (((0, 0, 4), (None, None, 224)), ((0, 0, 3), (None, None, 223))),
         ):
diff --git a/tests/unittest/compiler/test_slice_gemm_fusion.py b/tests/unittest/compiler/test_slice_gemm_fusion.py
index 5312f4d3a..b5ad9e0e1 100644
--- a/tests/unittest/compiler/test_slice_gemm_fusion.py
+++ b/tests/unittest/compiler/test_slice_gemm_fusion.py
@@ -348,7 +348,6 @@ def _test_slice_gemm_rcr_fusion_a_2(
         no_fusion=False,
         dtype="float16",
     ):
-
         X = Tensor(
             shape=slice_input_shape,
             dtype=dtype,
diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index af91821ad..1a866788f 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -102,7 +102,6 @@ def test_slice_view_gemm_fusible(self, dtype):
         )
     )
     def test_slice_view_gemm_non_fusible(self, dtype):
-
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
 
@@ -160,7 +159,6 @@ def test_slice_view_gemm_non_fusible(self, dtype):
         )
     )
     def test_slice_flatten_concat_fusible_1(self, dtype):
-
         test_name = f"slice_flatten_concat_fusible_{dtype}"
         batch_dim = IntVar([3, 10], "batch_size")
         X0 = test_utils.gen_input_tensor([batch_dim, 12, 1], dtype=dtype, name="x0")
@@ -243,7 +241,6 @@ def test_slice_flatten_concat_fusible_1(self, dtype):
         )
     )
     def test_slice_flatten_concat_fusible_2(self, dtype):
-
         test_name = f"slice_flatten_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 2], "batch_size")
         X0 = test_utils.gen_input_tensor([batch_dim, 2, 1], dtype=dtype, name="x0")
@@ -320,7 +317,6 @@ def test_slice_flatten_concat_fusible_2(self, dtype):
         )
     )
     def test_slice_reshape_concat_fusible_1(self, dtype):
-
         test_name = f"slice_reshape_concat_fusible_{dtype}_1"
         batch_dim = IntVar([1, 2], "batch_size")
         M = 2
@@ -396,7 +392,6 @@ def test_slice_reshape_concat_fusible_1(self, dtype):
         )
     )
     def test_slice_reshape_concat_fusible_2(self, dtype):
-
         test_name = "slice_reshape_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 8], "batch_size")
         M = 8
diff --git a/tests/unittest/compiler/test_strided_layernorm.py b/tests/unittest/compiler/test_strided_layernorm.py
index c119c85d9..552e89b13 100644
--- a/tests/unittest/compiler/test_strided_layernorm.py
+++ b/tests/unittest/compiler/test_strided_layernorm.py
@@ -152,7 +152,6 @@ def _test_slice_layer_norm(
         end_indices: List[int] = (None,),
         dtype: str = "float16",
     ):
-
         input_rank = 1 + len(input_nonbatch_shape)
         if 1 == len(start_indices) and len(start_indices) != input_rank:
             start_indices = [start_indices[0]] * input_rank
diff --git a/tests/unittest/compiler/test_strided_view_op.py b/tests/unittest/compiler/test_strided_view_op.py
index d413b7a0f..2f56ca01f 100644
--- a/tests/unittest/compiler/test_strided_view_op.py
+++ b/tests/unittest/compiler/test_strided_view_op.py
@@ -87,9 +87,9 @@ def _gen_simple_strided_ops(
     return test_cases
 
 
-def _gen_fusible_view_ops_after_strided_op() -> List[
-    Tuple[str, Callable[[Tensor], Tensor], str]
-]:
+def _gen_fusible_view_ops_after_strided_op() -> (
+    List[Tuple[str, Callable[[Tensor], Tensor], str]]
+):
     def reshape_op(input_tensor: Tensor):
         shape = input_tensor._attrs["shape"]
         return ops.reshape()(
@@ -110,9 +110,9 @@ def flatten_op(input_tensor: Tensor):
     return test_cases
 
 
-def _gen_non_fusible_view_ops_after_strided_op() -> List[
-    Tuple[str, Callable[[Tensor], Tensor], str]
-]:
+def _gen_non_fusible_view_ops_after_strided_op() -> (
+    List[Tuple[str, Callable[[Tensor], Tensor], str]]
+):
     def reshape_op(input_tensor: Tensor):
         n2 = input_tensor._attrs["shape"][2].value()
         return ops.reshape()(input_tensor, [-1, n2])
@@ -130,9 +130,9 @@ def flatten_op(input_tensor: Tensor):
     return test_cases
 
 
-def _gen_multiple_fusible_view_ops_after_strided_op() -> List[
-    Tuple[str, Callable[[Tensor], Tensor], str]
-]:
+def _gen_multiple_fusible_view_ops_after_strided_op() -> (
+    List[Tuple[str, Callable[[Tensor], Tensor], str]]
+):
     def _get_shape(input_tensor: Tensor):
         return (
             input_tensor._attrs["shape"][1].value(),
diff --git a/tests/unittest/frontend/test_module.py b/tests/unittest/frontend/test_module.py
index 7d0db2104..1169cfb84 100644
--- a/tests/unittest/frontend/test_module.py
+++ b/tests/unittest/frontend/test_module.py
@@ -59,7 +59,7 @@ def forward(self, x):
         b = PTModule()
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_sequential_1(self):
@@ -100,7 +100,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_sequential_2(self):
@@ -153,7 +153,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_module_dict(self):
@@ -238,7 +238,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
 
diff --git a/tests/unittest/ops/test_batch_gather.py b/tests/unittest/ops/test_batch_gather.py
index 4c210af1a..f25331739 100644
--- a/tests/unittest/ops/test_batch_gather.py
+++ b/tests/unittest/ops/test_batch_gather.py
@@ -46,7 +46,6 @@ def _test_batch_gather(
         test_name="gather",
         dtype="float16",
     ):
-
         in_shape = shape
 
         o_shape = list(in_shape)
@@ -133,7 +132,6 @@ def _test_batch_gather_topk(
         test_name="topk",
         dtype="float16",
     ):
-
         m_shape = (N,) + shape
         n_shape = (topK,) + shape
 
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index e390865f7..f9d727460 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -52,7 +52,7 @@ def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, N, K], dtype)
 
@@ -94,7 +94,7 @@ def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_crr_{}".format(test_name))
 
-        for (b, k) in itertools.product(bs, ks):
+        for b, k in itertools.product(bs, ks):
             X_pt = get_random_torch_tensor([b, k, M], dtype)
             W_pt = get_random_torch_tensor([b, k, N], dtype)
 
@@ -126,7 +126,7 @@ def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, K, N], dtype)
 
@@ -183,7 +183,7 @@ def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rcc_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, N, K], dtype)
 
@@ -226,7 +226,7 @@ def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_crc_{}".format(test_name))
 
-        for (b, k) in itertools.product(bs, ks):
+        for b, k in itertools.product(bs, ks):
             X_pt = get_random_torch_tensor([b, k, M], dtype)
             W_pt = get_random_torch_tensor([b, k, N], dtype)
 
@@ -259,7 +259,7 @@ def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rrc_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, K, N], dtype)
 
diff --git a/tests/unittest/ops/test_bmm_permute.py b/tests/unittest/ops/test_bmm_permute.py
index b2b0b91ea..b8266c99c 100644
--- a/tests/unittest/ops/test_bmm_permute.py
+++ b/tests/unittest/ops/test_bmm_permute.py
@@ -49,7 +49,7 @@ def _test_rrr(self, bs, ms, N, K, d1, test_name, copy_op=False, dtype="float16")
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rrr_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, K, N], dtype)
 
@@ -91,7 +91,7 @@ def _test_rcr(self, bs, ms, N, K, d1, test_name, copy_op=False, dtype="float16")
         Y._attrs["is_output"] = True
         module = compile_model(Y, target, "./tmp", "bmm_rcr_{}".format(test_name))
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = get_random_torch_tensor([b, m, K], dtype)
             W_pt = get_random_torch_tensor([b, N, K], dtype)
 
diff --git a/tests/unittest/ops/test_bmm_softmax.py b/tests/unittest/ops/test_bmm_softmax.py
index 0aeaa6fe2..e2ef4b998 100644
--- a/tests/unittest/ops/test_bmm_softmax.py
+++ b/tests/unittest/ops/test_bmm_softmax.py
@@ -31,7 +31,6 @@ class BMMSoftmaxTestCase(unittest.TestCase):
     def _test_bmm_rcr_softmax(
         self, B=16, M=16, K=64, N=24, test_name="bmm_rcr_softmax"
     ):
-
         X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
         W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
         OP = ops.bmm_rcr_softmax()
diff --git a/tests/unittest/ops/test_bmm_softmax_bmm.py b/tests/unittest/ops/test_bmm_softmax_bmm.py
index 7afad1c85..7422a6e13 100644
--- a/tests/unittest/ops/test_bmm_softmax_bmm.py
+++ b/tests/unittest/ops/test_bmm_softmax_bmm.py
@@ -83,7 +83,7 @@ def _test_bmm_permute(
             Y, target, "./tmp", f"bmm_{test_name}_permute", dll_name=dll_name
         )
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = torch.randn(b, m, K).cuda().half()  # Q
             W_pt = torch.randn(b, N, K).cuda().half()  # K
             B1_pt = torch.randn(b, N, D).cuda().half()  # V
@@ -156,7 +156,7 @@ def _test_b2b(
             Y, target, "./tmp", f"bmm_{test_name}_permute", dll_name=dll_name
         )
 
-        for (b, m) in itertools.product(bs, ms):
+        for b, m in itertools.product(bs, ms):
             X_pt = torch.randn(b, m, K).cuda().half()  # Q
             W_pt = torch.randn(b, N, K).cuda().half()  # K
             B1_pt = torch.randn(b, N, D).cuda().half()  # V
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index d2625a8d7..dbcf514ce 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -188,9 +188,7 @@ def _test_nms(
             ref_box = boxes_pt[keep].cpu()
         else:
             ref_box = torch.zeros(nmsMaxOut, 4)
-            ref_box[
-                : keep.shape[0],
-            ] = boxes_pt[keep].cpu()
+            ref_box[: keep.shape[0],] = boxes_pt[keep].cpu()
         ref_box = ref_box.cuda().to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 1, 4)).copy()
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index 430967af3..b2294c5db 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -144,9 +144,7 @@ def _test_nms(
             ref_box = boxes[keep]
         else:
             ref_box = torch.zeros(nmsMaxOut, 4)
-            ref_box[
-                : keep.shape[0],
-            ] = boxes[keep]
+            ref_box[: keep.shape[0],] = boxes[keep]
         ref_box = ref_box.to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 4)).contiguous()
diff --git a/tests/unittest/ops/test_topk.py b/tests/unittest/ops/test_topk.py
index 3a3353d02..12f2fa746 100644
--- a/tests/unittest/ops/test_topk.py
+++ b/tests/unittest/ops/test_topk.py
@@ -46,7 +46,6 @@ def _test_topk(
         copy_op=False,
         dtype="float16",
     ):
-
         o_shape = list(shape)
         o_shape[-1] = topK
 

From 9d9a577c12462a40b07642f9c12fcbb4744e5c4e Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Mon, 20 Mar 2023 23:48:11 -0700
Subject: [PATCH 297/638] changes to improve jagged tensor and add b2b bmm
 (#455)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/455

- fix some typo in b2b bmm
- add `from_two_input_lists_jagged_tensor` to help jagged tensor batch reasoning

Reviewed By: aakhundov

Differential Revision: D44193623

fbshipit-source-id: a08bad1ac74a6d1409bb3f2e96953ed0c149d006
---
 fx2ait/fx2ait/ait_module.py  |   2 +
 fx2ait/fx2ait/tensor_spec.py | 100 +++++++++++++++++++++++++++++++----
 2 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
index 54d212685..8cf48b842 100644
--- a/fx2ait/fx2ait/ait_module.py
+++ b/fx2ait/fx2ait/ait_module.py
@@ -64,4 +64,6 @@ def create_ait_module_wrapper(engine, interp_result, trace_ait_module, *inputs):
         This is turned in by passing allow_scripting=True.
         """
         mod = AITModule(engine, interp_result)
+        # sanity test before tracing
+        mod(*inputs)
         return torch.jit.trace(mod, inputs) if trace_ait_module else mod
diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index 4a9594b4b..bf7cb6e23 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -89,6 +89,70 @@ def from_two_input_lists(
 
         return result
 
+    @classmethod
+    def from_two_input_lists_jagged_tensor(
+        cls, inputs1: List[torch.Tensor], inputs2: List[torch.Tensor]
+    ) -> List["TensorSpec"]:
+        """
+        This function is useful when we expect multiple dynamic dims.
+
+        The parent graph can receive two sets of inputs:
+        1. with min dynamic dim values,
+        2. with max dynamic dim values.
+
+        After FX splitter logic is applied and lowerable subgraph sample inputs
+        are inferred, we make two assumptions:
+        1. two lists of inferred inputs will differ at dynamic dimensions,
+        2. the difference numbers will be the dynamic ranges, i.e. min and max.
+
+        TODO: The assumptions above are not ideal, and, in theory, we should do
+        symbolic shape propagation using something like SymPy.
+        """
+        if len(inputs1) != len(inputs2):
+            raise ValueError(
+                f"Different number of inputs: {len(inputs1)} vs {len(inputs2)}"
+            )
+
+        result: List[TensorSpec] = []
+        dynamic_dict = {}
+        num_dynamic = 0
+        for t1, t2 in zip(inputs1, inputs2):
+            if t1.dtype != t2.dtype:
+                raise ValueError(f"Different types: {t1.dtype} vs {t2.dtype}")
+            if len(t1.shape) != len(t2.shape):
+                raise ValueError(
+                    f"Different tensor sizes: {len(t1.shape)} vs {len(t2.shape)}"
+                )
+            shape: List[IntVar] = []
+            for _, (d1, d2) in enumerate(zip(t1.shape, t2.shape)):
+                if d1 == d2:
+                    shape.append(IntImm(d1))
+                else:
+                    dynamic_range = [min(d1, d2), max(d1, d2)]
+                    tuple_range = tuple(dynamic_range)
+                    dynamic_name = dynamic_dict.get(tuple_range, None)
+                    # The rule here we record the dynamic range+dynamic name in a dict
+                    # and extract it if it exists. If not, we will create the pair and append
+                    # the pair to the dict
+                    if dynamic_name:
+                        shape.append(IntVar(dynamic_range, dynamic_name))
+                    else:
+                        if num_dynamic == 0:
+                            shape.append(IntVar(dynamic_range, "batch_size"))
+                            dynamic_dict[tuple_range] = "batch_size"
+                        else:
+                            shape.append(
+                                IntVar(
+                                    dynamic_range,
+                                    f"batch_size_{num_dynamic}",
+                                )
+                            )
+                            dynamic_dict[tuple_range] = f"batch_size_{num_dynamic}"
+                        num_dynamic = num_dynamic + 1
+            result.append(TensorSpec(shape, t1.dtype))
+
+        return result
+
     @classmethod
     def gen_int_var_min_max(cls, vmin: int, vmax: int, name: str = None):  # noqa [B902]
         values = [vmin, vmax]
@@ -199,6 +263,7 @@ def from_input_list_with_batch_size_jagged_tensor(
         max_sequence_length: int,
         jagged_tensor_batch_dims: Set[int],
         jagged_offsets_batch_dims: Set[int],
+        additional_inputs: List[torch.Tensor] = None,
     ) -> List["TensorSpec"]:
         """
         Most of the recommendation models will work fine using this function.
@@ -210,6 +275,7 @@ def from_input_list_with_batch_size_jagged_tensor(
         result_unsorted: List = []
         left_inputs: List = []
         left_inputs_ind: List = []
+        left_additional_inputs: List = []
         for ind, t in enumerate(inputs):
             batch_dim: int = t.shape[0]
             batch_dim_lower_bound: int = 0
@@ -244,15 +310,31 @@ def from_input_list_with_batch_size_jagged_tensor(
                 left_inputs.append(t)
                 left_inputs_ind.append(ind)
 
-        bs_dim = cls.find_batch_size_dim(left_inputs)
-        for index, t in enumerate(left_inputs):
-            shape: List[IntVar] = []
-            for i, d in enumerate(t.shape):
-                if i == bs_dim[index]:
-                    shape.append(IntVar([1, max_batch_size], "batch_size"))
-                else:
-                    shape.append(IntImm(d))
-            result_unsorted.append((left_inputs_ind[index], TensorSpec(shape, t.dtype)))
+        if additional_inputs:
+            for ind in left_inputs_ind:
+                left_additional_inputs.append(additional_inputs[ind])
+            input_specs_left = TensorSpec.from_two_input_lists_jagged_tensor(
+                left_inputs, left_additional_inputs
+            )
+            assert len(input_specs_left) == len(
+                left_inputs_ind
+            ), "Unexpected length for left inputs"
+
+            for index, ind_value in enumerate(left_inputs_ind):
+                result_unsorted.append((ind_value, input_specs_left[index]))
+
+        else:
+            bs_dim = cls.find_batch_size_dim(left_inputs)
+            for index, t in enumerate(left_inputs):
+                shape: List[IntVar] = []
+                for i, d in enumerate(t.shape):
+                    if i == bs_dim[index]:
+                        shape.append(IntVar([1, max_batch_size], "batch_size"))
+                    else:
+                        shape.append(IntImm(d))
+                result_unsorted.append(
+                    (left_inputs_ind[index], TensorSpec(shape, t.dtype))
+                )
         result = sorted(result_unsorted, key=lambda num: num[0])
         result = [r[1] for r in result]
         return result

From 9f54dc7ae370e527ad7f43467169ffceb562dfd5 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 21 Mar 2023 03:59:51 -0700
Subject: [PATCH 298/638] Add full operator (#458)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/458

This diff is adding a new operator: `full`. The op fills the output Tensor of a given `shape` and `dtype` with a (float scalar) `fill_value`. As the memory planning is done in the maximum shape of the op output, the kernel fills the maximum memory with the `fill_value` to allow constant-folding the output even if the shape contains `IntVar`s.

Reviewed By: wushirong

Differential Revision: D43909589

fbshipit-source-id: 6c9c522efcbf183cacc5dd8fa4946c50fb65eb2b
---
 .../backend/cuda/tensor/__init__.py           |   3 +-
 python/aitemplate/backend/cuda/tensor/full.py | 148 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 python/aitemplate/compiler/ops/tensor/full.py |  61 ++++++++
 tests/unittest/ops/test_full.py               |  91 +++++++++++
 5 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/backend/cuda/tensor/full.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/full.py
 create mode 100644 tests/unittest/ops/test_full.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index cc8bceeb6..6b08eec93 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -22,6 +22,7 @@
     concatenate_tanh,
     dynamic_slice,
     expand,
+    full,
     gather,
     jagged_to_padded_dense,
     masked_select,
@@ -44,6 +45,7 @@
     "concatenate_tanh",
     "dynamic_slice",
     "expand",
+    "full",
     "gather",
     "jagged_to_padded_dense",
     "masked_select",
@@ -56,6 +58,5 @@
     "slice_reshape_scatter",
     "slice_scatter",
     "split",
-    "argmax",
     "topk",
 ]
diff --git a/python/aitemplate/backend/cuda/tensor/full.py b/python/aitemplate/backend/cuda/tensor/full.py
new file mode 100644
index 000000000..a35167311
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/full.py
@@ -0,0 +1,148 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+
+
+CUDA_HEADER_FILES = """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/util/host_tensor.h"
+"""
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+
+const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
+    """
+)
+
+
+FUNC_DECL = jinja2.Template(
+    """
+void invoke_{{func_name}}(
+    void*,  /* output */
+    {{prefix}}Stream_t  /* stream */
+);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}invoke_{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    stream
+{{indent}});
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+__global__  void full(
+    {{read_type}}* output,
+    {{index_type}} num_elements
+) {
+  const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx * N_ELEMENTS_PER_THREAD >= num_elements) {
+    return;
+  }
+
+  {{read_type}} tmp;
+  {{data_type}}* p = reinterpret_cast<{{data_type}}*>(&tmp);
+
+  #pragma unroll
+  for (int i=0; i < N_ELEMENTS_PER_THREAD; i++) {
+      p[i] = ({{data_type}}) ({{fill_value}});
+  }
+
+  output[idx] = tmp;
+}
+
+}  // namespace
+
+void invoke_{{func_name}}(
+    void* output,
+    {{prefix}}Stream_t stream
+){
+    int grid_size = static_cast<int>(std::ceil(static_cast<double>({{num_elements}}) / N_ELEMENTS_PER_THREAD / N_THREADS_PER_BLOCK));
+    full<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(reinterpret_cast<{{read_type}}*> (output), {{num_elements}});
+}
+    """
+)
+
+
+@registry.reg("cuda.full.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    y = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+
+    # fill the maximum output Tensor size with the fill_value
+    # any shape within the maximum bounds will be a subset
+    num_elements = 1
+    for dim in y.shape():
+        num_elements *= dim.upper_bound()
+
+    dtype = y.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+    read_type = backend_spec.get_elementwise_read_backend_type(num_elements, dtype)
+
+    return FUNC_TEMPLATE.render(
+        header_files=CUDA_HEADER_FILES,
+        constant=CONSTANT_TEMPLATE.render(
+            read_t=read_type,
+            data_t=data_type,
+        ),
+        func_name=func_attrs["name"],
+        read_type=read_type,
+        data_type=data_type,
+        index_type=backend_spec.index_type,
+        fill_value=func_attrs["fill_value"],
+        num_elements=num_elements,
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("cuda.full.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = CUDASpec()
+    return FUNC_DECL.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("cuda.full.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=func_attrs["outputs"][0]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 0cabdee7b..755710886 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -23,6 +23,7 @@
 from aitemplate.compiler.ops.tensor.concatenate_tanh import concatenate_tanh
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.ops.tensor.expand import expand
+from aitemplate.compiler.ops.tensor.full import full
 from aitemplate.compiler.ops.tensor.gather import gather
 from aitemplate.compiler.ops.tensor.jagged_to_padded_dense import jagged_to_padded_dense
 from aitemplate.compiler.ops.tensor.masked_select import masked_select
diff --git a/python/aitemplate/compiler/ops/tensor/full.py b/python/aitemplate/compiler/ops/tensor/full.py
new file mode 100644
index 000000000..e88a19c4a
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/full.py
@@ -0,0 +1,61 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import List
+
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+
+class full(Operator):
+    """
+    Creates a tensor of a given `shape` and `dtype` filled
+    with the specified `fill_value` (float scalar).
+
+    Args:
+        shape (List[IntVar]): the shape of the output Tensor.
+        fill_Value (float): the value to fill the output Tensor with.
+        dtype (str): the dtype of the output Tensor.
+
+    Returns:
+        Tensor: a tensor of `shape` and `dtype` filled with `fill_value`.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self._attrs["op"] = "full"
+        self._attrs["has_profiler"] = False
+
+    def __call__(
+        self,
+        shape: List[IntVar],
+        fill_value: float,
+        dtype: str = "float16",
+    ) -> Tensor:
+        self._attrs["inputs"] = []
+        self._attrs["fill_value"] = fill_value
+
+        self._set_depth()
+        output = Tensor(shape, src_ops={self}, dtype=dtype)
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = f"{target.name()}.{self._attrs['op']}.gen_function"
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_full.py b/tests/unittest/ops/test_full.py
new file mode 100644
index 000000000..2064d35e0
--- /dev/null
+++ b/tests/unittest/ops/test_full.py
@@ -0,0 +1,91 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.public import FuncEnum
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+class TestFull(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_full(
+        self,
+        shape,
+        fill_value,
+        dtype="float16",
+        test_name="full",
+    ) -> None:
+        X = Tensor(
+            shape=shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        Y = ops.full()(X.shape(), fill_value, dtype)
+        Y._attrs["name"] = "Y"
+
+        Z = ops.elementwise(FuncEnum.ADD)(X, Y)
+        Z._attrs["name"] = "Z"
+        Z._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Z, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+
+        if isinstance(shape[0], IntVar):
+            shapes = [[val] + shape[1:] for val in shape[0]._attrs["values"]]
+        else:
+            shapes = [shape]
+
+        for shape in shapes:
+            x_pt = get_random_torch_tensor(shape, dtype=dtype)
+            z_pt = x_pt + fill_value
+
+            z = torch.empty_like(z_pt)
+
+            module.run_with_tensors([x_pt], [z])
+            torch.testing.assert_close(z, z_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        [
+            param(1, [1], 1, "float16"),
+            param(2, [10, 20, 30], 3.14, "float16"),
+            param(3, [IntVar([10, 20]), 30], 0, "float16"),
+            param(4, [20, 30], 2.71, "float32"),
+            param(5, [IntVar([1, 128]), 10], -1.23, "float32"),
+        ]
+    )
+    def test_full(self, i, shape, fill_value, dtype):
+        self._test_full(
+            shape=shape,
+            fill_value=fill_value,
+            dtype=dtype,
+            test_name=f"test_full_{i}",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 631f5791f118216ce109727ef539d74380b67cdc Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 21 Mar 2023 05:09:29 -0700
Subject: [PATCH 299/638] Handle implicit + explicit jagged Tensor inputs in
 elementwise (#453)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/453

When `elementwise` has both explicit (with `JaggedIntVar` as the first dim) and implicit (with `total_length: IntVar` as the first dim) jagged Tensor inputs, current broadcasting doesn't work, as it can't broadcast `total_length: IntVar` against the `JaggedIntVar`. In this diff, we're making the implicit jagged Tensor inputs explicit before the broadcasting takes place, by replacing the `total_length: IntVar` in their shape by the corresponding `JaggedIntVar` in the jagged input's shape.

Reviewed By: frank-wei, wushirong

Differential Revision: D44199071

fbshipit-source-id: 5c4cd726b93090723b0e9cb167052b14eb353682
---
 .../backend/common/elementwise_common.py      | 14 +-----
 .../compiler/ops/common/elementwise.py        | 45 ++++++++++++++++---
 .../compiler/ops/common/view_ops.py           |  4 ++
 python/aitemplate/utils/shape_utils.py        |  8 +---
 tests/unittest/ops/test_jagged_elementwise.py | 42 +++++++++++------
 5 files changed, 75 insertions(+), 38 deletions(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index d8b60d90d..89ee5cdf5 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -722,16 +722,7 @@ def _get_mixed_jagged_dense_config(
         # dense inputs' shapes) will be treated as a single dense dim
         return False, None, False
 
-    # If all dense inputs' first dim is equal to jagged_int_var's total_length(),
-    # treat all these dense inputs as jagged inputs as well.
     jagged_int_var = output_shape[0]
-    all_dense_jagged = True
-    for dense_input_shape in dense_input_shapes:
-        if dense_input_shape[0] != jagged_int_var.total_length():
-            all_dense_jagged = False
-    if all_dense_jagged:
-        return False, None, False
-
     jagged_max_dense_prefix_shape = jagged_int_var.get_max_dense_shape()
     jagged_suffix_shape = output_shape[1:]
     output_volume = jagged_max_dense_prefix_shape + jagged_suffix_shape
@@ -839,10 +830,7 @@ def _gen_input_broadcast_calculator_str(
 
     start_idx = 0
     for i, (input_dim, output_dim) in enumerate(zip(input_shape, output_shape)):
-        if input_dim != output_dim and not (
-            isinstance(output_dim, JaggedIntVar)
-            and input_dim == output_dim.total_length()
-        ):
+        if input_dim != output_dim:
             assert input_dim == IntImm(
                 1
             ), "Unexpected shapes! Input: {}, output: {}".format(
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index c2f8e6983..721782592 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -18,7 +18,7 @@
 import functools
 from typing import Any, List
 
-from aitemplate.compiler.base import IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
 from aitemplate.compiler.dtype import normalize_dtype
 from aitemplate.compiler.op_registry import OP_REGISTRY
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
@@ -28,6 +28,41 @@
 # pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
 
 
+def _discover_implicit_jagged_inputs(inputs: List[Tensor]):
+    """
+    Convert implicit jagged Tensor inputs into explicit jagged Tensors.
+
+    There may be cases when elementwise has both explicit jagged Tensor
+    inputs (i.e. with a JaggedIntVar as the first dimension in the shape)
+    and "implicit" jagged Tensor inputs (i.e. dense Tensors with the first
+    dimension == the JaggedIntVar.total_length() in the jagged Tensor
+    inputs). Here we detect such implicit jagged Tensor inputs and replace
+    the total_length: IntVar in the dense input's shape by the corresponding
+    JaggedIntVar from the jagged input's shape. Importantly, this must be
+    done before the mixed jagged / dense broadcasting takes place.
+    """
+    total_length_map = {}
+    for tensor in inputs:
+        if tensor.is_jagged():
+            jagged_int_var = tensor._attrs["shape"][0]
+            total_length = jagged_int_var.total_length()
+            total_length_map[total_length] = jagged_int_var
+
+    if total_length_map:
+        # there are explicit jagged Tensors among the inputs:
+        # we check if there are implict ones and make them explicit
+        for tensor in inputs:
+            shape = tensor._attrs["shape"]
+            if not tensor.is_jagged() and shape and not isinstance(shape[0], IntImm):
+                if shape[0] in total_length_map:
+                    # the dense Tensor input's first dimension is the total_length
+                    # dimension in the JaggedIntVar of one of the jagged Tensor
+                    # inputs: we replace the dense Tensor input's first dimension
+                    # by the corresponding JaggedIntVar, hence giving it a
+                    # jagged Tensor semantics for further processing.
+                    shape[0] = total_length_map[shape[0]]
+
+
 def _broadcast_dense_shapes(shapes: List[List[IntVar]]) -> List[IntVar]:
     if len(shapes) == 1:
         return list(shapes[0])
@@ -112,12 +147,10 @@ def _broadcast_dense_and_jagged_shape(
                 "higher than the rank of the jagged inputs (when treating "
                 "the jagged dims as separate dims)."
             )
-        broadcastable_jagged_dense, _ = shape_utils.get_broadcast_max_shape(
+
+        broadcastable, _ = shape_utils.get_broadcast_max_shape(
             jagged_max_dense_prefix_shape, dense_prefix_shape
         )
-        broadcastable_jagged_jagged, _ = shape_utils.get_broadcast_max_shape(
-            [jagged_first_dim.total_length()], dense_prefix_shape
-        )
         if not broadcastable:
             raise ValueError(
                 f"JaggedIntVar of the jagged inputs ({jagged_first_dim}) is not compatible "
@@ -159,6 +192,8 @@ def _infer_shapes(self, *args: Tensor) -> List[IntVar]:
                 "Elementwise op {} doesn't have inputs!".format(self._attrs["func"])
             )
 
+        _discover_implicit_jagged_inputs(args)
+
         dense_shapes = [arg._attrs["shape"] for arg in args if not arg.is_jagged()]
         jagged_shapes = [arg._attrs["shape"] for arg in args if arg.is_jagged()]
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 118c7a7c6..0cf30564e 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -661,6 +661,10 @@ def _infer_shapes(self, source: Tensor) -> List[IntVar]:
         return [jagged_int_var] + source._attrs["shape"][1:]
 
     def __call__(self, source: Tensor, offsets_list: List[Tensor]) -> Tensor:
+        if source.is_jagged():
+            # already a jagged Tensor
+            return source
+
         jagged_dims = self._attrs["jagged_dims"]
         if len(offsets_list) != len(jagged_dims):
             raise ValueError(
diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index f67a9978c..7816b81fe 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -51,7 +51,7 @@ def get_broadcast_max_shape(shape1, shape2):
     Note that two shapes are not required to have the same number of dimensions.
     For example, shape [5, 2, 3] and shape [3] are also broadcastable.
     """
-    from aitemplate.compiler.base import IntImm, JaggedIntVar
+    from aitemplate.compiler.base import IntImm
 
     min_len = min(len(shape1), len(shape2))
     if len(shape1) > len(shape2):
@@ -65,12 +65,6 @@ def get_broadcast_max_shape(shape1, shape2):
         if dim1 == dim2:
             res_shape[idx] = dim1
             continue
-        if isinstance(dim1, JaggedIntVar) and dim1.total_length() == dim2:
-            res_shape[idx] = dim1
-            continue
-        if isinstance(dim2, JaggedIntVar) and dim2.total_length() == dim1:
-            res_shape[idx] = dim2
-            continue
         if dim1 == IntImm(1):
             res_shape[idx] = dim2
         elif dim2 == IntImm(1):
diff --git a/tests/unittest/ops/test_jagged_elementwise.py b/tests/unittest/ops/test_jagged_elementwise.py
index 0da6a467f..af1205040 100644
--- a/tests/unittest/ops/test_jagged_elementwise.py
+++ b/tests/unittest/ops/test_jagged_elementwise.py
@@ -256,6 +256,7 @@ def _test_jagged_jagged_elementwise_add(
         jagged_max_prefix_shape: List[int],
         jagged1_inner_shape: List[int],
         jagged2_inner_shape: List[int],
+        implicit_jagged_input: bool,
         offsets_list: List[List[int]],
         dtype: str = "float16",
         offsets_dtype: str = "int32",
@@ -314,10 +315,14 @@ def _test_jagged_jagged_elementwise_add(
             for i, offsets_dim in enumerate(offsets_dims)
         ]
 
-        JAGGED1 = ops.make_jagged(
-            batch_dim=batch_dim,
-            jagged_dims=jagged_dims,
-        )(SOURCE1, OFFSETS_LIST)
+        if implicit_jagged_input:
+            JAGGED1 = SOURCE1
+        else:
+            JAGGED1 = ops.make_jagged(
+                batch_dim=batch_dim,
+                jagged_dims=jagged_dims,
+            )(SOURCE1, OFFSETS_LIST)
+
         JAGGED2 = ops.make_jagged(
             batch_dim=batch_dim,
             jagged_dims=jagged_dims,
@@ -328,7 +333,13 @@ def _test_jagged_jagged_elementwise_add(
         RESULT._attrs["name"] = "result"
         RESULT._attrs["is_output"] = True
 
-        assert not SOURCE1.is_jagged()
+        if implicit_jagged_input:
+            # SOURCE1 is "converted" into a jagged Tensor
+            # in the ops.elementwise by replacing its first
+            # dim with the JaggedIntVar from JAGGED 2
+            assert SOURCE1.is_jagged()
+        else:
+            assert not SOURCE1.is_jagged()
         assert not SOURCE2.is_jagged()
         assert JAGGED1.is_jagged()
         assert JAGGED2.is_jagged()
@@ -358,10 +369,11 @@ def _test_jagged_jagged_elementwise_add(
 
     @parameterized.expand(
         [
-            param(1, "int32", [4, 3], [5], [5]),
-            param(2, "int32", [4, 3], [5], [1]),
-            param(3, "int64", [4, 3], [1], [5]),
-            param(4, "int64", [4, 3], [5, 1, 7], [1, 6, 1]),
+            param(1, "int32", [4, 3], [5], [5], False),
+            param(2, "int32", [4, 3], [5], [1], False),
+            param(3, "int64", [4, 3], [1], [5], True),
+            param(4, "int64", [4, 3], [5, 1, 7], [1, 6, 1], False),
+            param(5, "int64", [4, 3], [5, 6, 7], [1, 6, 7], True),
         ]
     )
     def test_jagged_jagged_elementise_add_single_offsets_fp16(
@@ -371,11 +383,13 @@ def test_jagged_jagged_elementise_add_single_offsets_fp16(
         jagged_max_prefix_shape,
         jagged1_inner_shape,
         jagged2_inner_shape,
+        implicit_jagged_input,
     ):
         self._test_jagged_jagged_elementwise_add(
             jagged_max_prefix_shape=jagged_max_prefix_shape,
             jagged1_inner_shape=jagged1_inner_shape,
             jagged2_inner_shape=jagged2_inner_shape,
+            implicit_jagged_input=implicit_jagged_input,
             offsets_list=[[0, 1, 4, 6, 7]],
             dtype="float16",
             offsets_dtype=offsets_dtype,
@@ -384,10 +398,10 @@ def test_jagged_jagged_elementise_add_single_offsets_fp16(
 
     @parameterized.expand(
         [
-            param(1, "int32", [3, 4, 5, 200], [10], [10]),
-            param(2, "int32", [3, 4, 5, 200], [1, 2], [2, 1]),
-            param(3, "int64", [3, 4, 5, 150], [6, 7, 8], [6, 7, 8]),
-            param(4, "int64", [3, 4, 5, 150], [6, 1, 8], [1, 7, 1]),
+            param(1, "int32", [3, 4, 5, 200], [10], [10], False),
+            param(2, "int32", [3, 4, 5, 200], [1, 2], [2, 1], True),
+            param(3, "int64", [3, 4, 5, 150], [6, 7, 8], [6, 7, 8], False),
+            param(4, "int64", [3, 4, 5, 150], [6, 1, 8], [1, 7, 1], True),
         ]
     )
     def test_jagged_jagged_elementise_add_multiple_offsets_fp16(
@@ -397,11 +411,13 @@ def test_jagged_jagged_elementise_add_multiple_offsets_fp16(
         jagged_max_prefix_shape,
         jagged1_inner_shape,
         jagged2_inner_shape,
+        implicit_jagged_input,
     ):
         self._test_jagged_jagged_elementwise_add(
             jagged_max_prefix_shape=jagged_max_prefix_shape,
             jagged1_inner_shape=jagged1_inner_shape,
             jagged2_inner_shape=jagged2_inner_shape,
+            implicit_jagged_input=implicit_jagged_input,
             offsets_list=[
                 [0, 1, 3, 5],
                 [0, 2, 4, 7, 9, 10],

From e2b6f9f7181b874ca91f20f1e1ee205c7be45234 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Tue, 21 Mar 2023 07:59:47 -0700
Subject: [PATCH 300/638] Add bmm + permute transform (#420)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/420

Fuse `bmm_xxc + permute021` into `bmm_xxr` and the other way round. This diff is a continuation of D43697976 where `bmm_xxc` ops were introduced.

Reviewed By: chenyang78, aakhundov

Differential Revision: D44032396

fbshipit-source-id: 2bbd329003aec44ac6db4b1afc0575003fbb9464
---
 python/aitemplate/backend/profiler_cache.py   |   2 +-
 .../compiler/transform/fuse_bmm_permute.py    |  65 +++++++
 .../compiler/transform/fuse_utils.py          |   8 +-
 .../compiler/transform/optimize_graph.py      |   3 +
 .../compiler/test_fuse_bmm_permute.py         | 171 ++++++++++++++++++
 5 files changed, 246 insertions(+), 3 deletions(-)
 create mode 100644 python/aitemplate/compiler/transform/fuse_bmm_permute.py
 create mode 100644 tests/unittest/compiler/test_fuse_bmm_permute.py

diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index e76ba38a0..63be95837 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -457,7 +457,7 @@ class CacheMode(enum.Enum):
 )
 
 
-__AIT_CACHE_VERSION__ = 2
+__AIT_CACHE_VERSION__ = 3
 
 
 def ait_cache_version() -> int:
diff --git a/python/aitemplate/compiler/transform/fuse_bmm_permute.py b/python/aitemplate/compiler/transform/fuse_bmm_permute.py
new file mode 100644
index 000000000..a563a0cab
--- /dev/null
+++ b/python/aitemplate/compiler/transform/fuse_bmm_permute.py
@@ -0,0 +1,65 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Perform fusions for bmm + permute021 operators:
+    bmm_xxc + permute021 -> bmm_xxr
+    bmm_xxr + permute021 -> bmm_xxc
+"""
+from typing import List
+
+from aitemplate.compiler.base import Tensor
+
+from aitemplate.compiler.ops.gemm_universal import (
+    bmm_ccc,
+    bmm_ccr,
+    bmm_crc,
+    bmm_crr,
+    bmm_rcc,
+    bmm_rcr,
+    bmm_rrc,
+    bmm_rrr,
+)
+
+from aitemplate.compiler.ops.tensor import permute021
+
+from aitemplate.compiler.transform.fuse_utils import transform_simple_fusion_patterns
+
+
+def fuse_bmm_permute(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
+    """
+    Fuse bmm + permute021 ops. The second argument is unused, it's only
+    here to make the type of this function the same as the others called in optimize_graph.
+    """
+    ops_r = [
+        bmm_ccr,
+        bmm_crr,
+        bmm_rcr,
+        bmm_rrr,
+    ]
+
+    ops_c = [
+        bmm_ccc,
+        bmm_crc,
+        bmm_rcc,
+        bmm_rrc,
+    ]
+    patterns_cr = [((c_op(), permute021()), r_op) for c_op, r_op in zip(ops_c, ops_r)]
+    patterns_rc = [((r_op(), permute021()), c_op) for c_op, r_op in zip(ops_c, ops_r)]
+
+    sorted_graph = transform_simple_fusion_patterns(
+        sorted_graph, patterns_cr + patterns_rc
+    )
+
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/fuse_utils.py b/python/aitemplate/compiler/transform/fuse_utils.py
index 39f4c2e73..dae123d4c 100644
--- a/python/aitemplate/compiler/transform/fuse_utils.py
+++ b/python/aitemplate/compiler/transform/fuse_utils.py
@@ -100,6 +100,7 @@ def transform_simple_fusion_patterns(
 ) -> List[Tensor]:
     output_tensors = []
     to_remove = set()
+    has_modified = False
     for tensor in sorted_graph:
         if tensor in to_remove:
             to_remove.remove(tensor)
@@ -178,6 +179,7 @@ def transform_simple_fusion_patterns(
 
         # inputs here might not be ready in graph. But we will toposort again
         # at end of pass so it's okay.
+        has_modified = True
         new_tensor = new_op(**src_op._get_op_attributes())(*inputs)
         copy_tensor_attributes(new_tensor, last_tensor)
         if new_tensor._attrs["is_output"]:
@@ -187,5 +189,7 @@ def transform_simple_fusion_patterns(
             remove_dst_op_from_tensor(tensors, dst_op)
         to_remove |= to_remove_candidate
 
-    new_sorted_graph = toposort(output_tensors)
-    return sanitize_sorted_graph(new_sorted_graph)
+    if has_modified:
+        sorted_graph = toposort(output_tensors)
+        sorted_graph = sanitize_sorted_graph(sorted_graph)
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index b5687b74a..35903bc8e 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -19,6 +19,7 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.transform.apply_padding import apply_padding
+from aitemplate.compiler.transform.fuse_bmm_permute import fuse_bmm_permute
 from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
 from aitemplate.compiler.transform.fuse_group_ops import fuse_group_ops
 from aitemplate.compiler.transform.fuse_mm_elementwise import fuse_mm_elementwise
@@ -67,6 +68,7 @@ def optimize_graph(
     - fuse group ops
     - transform special ops
     - transform strided ops
+    - fuse bmm and permute
     - transform memory ops
     - apply padding
 
@@ -85,6 +87,7 @@ def optimize_graph(
 
     funcs = [
         fuse_permute_bmm_and_gemm,
+        fuse_bmm_permute,
         transform_odd_alignment,
         fuse_conv_elementwise,
         fuse_mm_elementwise,
diff --git a/tests/unittest/compiler/test_fuse_bmm_permute.py b/tests/unittest/compiler/test_fuse_bmm_permute.py
new file mode 100644
index 000000000..d00d02b61
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_bmm_permute.py
@@ -0,0 +1,171 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import itertools
+import unittest
+from typing import Tuple
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import IntVar
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    TestEnv,
+)
+from aitemplate.utils import shape_utils
+
+from parameterized import parameterized
+
+
+class FuseBmmPermuteCase(unittest.TestCase):
+    def _create_bmm_permute_graph(
+        self,
+        A_shape: Tuple[IntVar, int, int],
+        B_shape: Tuple[IntVar, int, int],
+        bmm_type: str,
+        dtype: str,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Create a graph consisting of bmm with given layout + permute021.
+        """
+        OP = getattr(ops, bmm_type, None)
+        assert OP is not None
+
+        A = Tensor(shape=A_shape, dtype=dtype, name="input_0", is_input=True)
+        B = Tensor(shape=B_shape, dtype=dtype, name="input_1", is_input=True)
+        inputs = [A, B]
+
+        Y = OP()(*inputs)
+        Y = ops.permute021()(Y)
+        Y._attrs["name"] = "target_bmm_tensor"
+        return A, B, Y
+
+    def _test_bmm_permute(
+        self,
+        B: int,
+        A_shape: Tuple[IntVar, int, int],
+        B_shape: Tuple[IntVar, int, int],
+        orig_layout: str,
+        dtype: str = "float16",
+    ):
+
+        is_row_major_a = orig_layout[0] == "r"
+        is_row_major_b = orig_layout[1] == "r"
+        is_row_major_c = orig_layout[2] == "r"
+
+        new_layout = orig_layout[:2] + ("c" if is_row_major_c else "r")
+        testname = f"{orig_layout}_to_{new_layout}_{dtype}"
+
+        original_bmm = f"bmm_{orig_layout}"
+        new_bmm = f"bmm_{new_layout}"
+
+        X, W, bmm_tensor = self._create_bmm_permute_graph(
+            A_shape,
+            B_shape,
+            original_bmm,
+            dtype,
+        )
+
+        output = ops.elementwise(FuncEnum.COS)(bmm_tensor)
+        output._attrs["name"] = "output_0"
+        output._attrs["is_output"] = True
+
+        # Check value correctness
+        target = detect_target()
+        module = compile_model(output, target, "./tmp", testname)
+
+        # Check that the new bmm is present and the original is not
+        exist_new_bmm = False
+        for tensor in module.debug_sorted_graph:
+            src_ops = tensor.src_ops()
+            if len(src_ops) == 0:
+                continue
+            assert (
+                len(src_ops) == 1
+            ), "Constructed graph should only have single-source op tensors."
+            src_op = list(tensor.src_ops())[0]
+            assert src_op._attrs["op"] != original_bmm
+
+            if src_op._attrs["op"] == new_bmm:
+                exist_new_bmm = True
+
+        assert exist_new_bmm, "Can't find converted bmm op in the graph."
+
+        m = A_shape[-2] if is_row_major_a else A_shape[-1]
+        n = B_shape[-1] if is_row_major_b else B_shape[-2]
+        k = B_shape[-2] if is_row_major_b else B_shape[-1]
+
+        # Check that fused graph produces correct output
+        for b in B:
+            # Compute PyTorch output
+            X_pt = get_random_torch_tensor((b, m, k), dtype)
+            W_pt = get_random_torch_tensor((b, k, n), dtype)
+            Y_pt = torch.matmul(X_pt, W_pt)
+            if is_row_major_c:
+                Y_pt = torch.transpose(Y_pt, 2, 1)
+            Y_pt = torch.cos(Y_pt)
+
+            # Compute AIT output
+            out_shape = [b, m, n] if not is_row_major_c else [b, n, m]
+            y = get_torch_empty_tensor(out_shape, dtype)
+            input_name_to_index = module.get_input_name_to_index_map()
+            inputs = [0, 0]
+            if not is_row_major_a:
+                X_pt = torch.transpose(X_pt, 2, 1).contiguous()
+            if not is_row_major_b:
+                W_pt = torch.transpose(W_pt, 2, 1).contiguous()
+            inputs[input_name_to_index["input_0"]] = X_pt
+            inputs[input_name_to_index["input_1"]] = W_pt
+            module.run_with_tensors(inputs, [y])
+
+            torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
+
+    @parameterized.expand(
+        itertools.product(
+            [[1, 4]],  # Batch size
+            ["r", "c"],  # Layout of A
+            ["r", "c"],  # Layout of B
+            ["r", "c"],  # Layout of output
+            filter_test_cases_by_params(
+                {
+                    TestEnv.CUDA_LESS_THAN_SM80: ["float16"],
+                }
+            ),
+        )
+    )
+    def test_xxr_to_xxс(self, B, layout_a, layout_b, layout_c, dtype):
+        """
+        Test that bmm_xxr + permute021 is fused into bmm_xxc and the other way round.
+        """
+        M, N, K = 4, 6, 8
+        batch_dim = shape_utils.gen_int_var_min_max(B)
+
+        shape_a = [batch_dim, K, M] if layout_a == "c" else [batch_dim, M, K]
+        shape_b = [batch_dim, N, K] if layout_b == "c" else [batch_dim, K, N]
+
+        self._test_bmm_permute(
+            B,
+            shape_a,
+            shape_b,
+            layout_a + layout_b + layout_c,
+            dtype=dtype,
+        )

From 4a235dbd8da4c86a8b4bf8be3b27383c8add4881 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 21 Mar 2023 14:38:54 -0700
Subject: [PATCH 301/638] Basis of symbolic shape (#320)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/320

Add Symbol into AITemplate for shape deducing in upcoming diffs.

Reviewed By: chenyang78

Differential Revision: D42918454

fbshipit-source-id: b7a62ffc082734e7f99bd1538e1218af7db2176c
---
 .circleci/config.yml                          |   1 +
 python/aitemplate/backend/codegen.py          |   2 +
 python/aitemplate/compiler/__init__.py        |   1 +
 python/aitemplate/compiler/base.py            |  15 ++-
 .../compiler/ops/common/elementwise.py        |   9 ++
 .../compiler/ops/common/int_elementwise.py    |  16 ++-
 python/aitemplate/compiler/symbolic.py        | 113 ++++++++++++++++++
 python/setup.py                               |   2 +-
 .../compiler/test_split_view_strided.py       |   8 +-
 .../compiler/test_strided_view_cat.py         |   3 +
 tests/unittest/compiler/test_symbolic.py      | 105 ++++++++++++++++
 .../unittest/compiler/test_view_strided_op.py |   1 +
 tests/unittest/ops/test_vanilla_attention.py  |   3 +
 13 files changed, 273 insertions(+), 6 deletions(-)
 create mode 100644 python/aitemplate/compiler/symbolic.py
 create mode 100644 tests/unittest/compiler/test_symbolic.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ed0c8585e..8ab59e610 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -32,6 +32,7 @@ setup_env: &setup_env
           python3.8 -m pip install torch &&
           python3.8 -m pip install numpy &&
           python3.8 -m pip install jinja2 &&
+          python3.8 -m pip install sympy &&
           python3.8 -m pip install recordtype &&
           python3.8 -m pip install parameterized &&
           python3.8 -m pip install einops &&
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 7ff934a58..c44435af9 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -778,6 +778,8 @@ def append_tensor(self, node: Tensor) -> None:
                 )
             else:
                 self.tensor_decl.append(self.f_var_decl(name=name))
+            # IntVarTensor could be used as dim too, add to visited to prevent duplicated declaration.
+            self.visited_dims.add(name)
         else:
             self.tensor_decl.append(self.f_ptr_decl(name=name, dtype=dtype))
 
diff --git a/python/aitemplate/compiler/__init__.py b/python/aitemplate/compiler/__init__.py
index c3752028a..78581152b 100644
--- a/python/aitemplate/compiler/__init__.py
+++ b/python/aitemplate/compiler/__init__.py
@@ -21,6 +21,7 @@
     "dtype",
     "op_registry",
     "ops",
+    "symbolic",
     "tensor_accessor",
     "transform",
     "compile_model",
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 89676f0db..5707c9446 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -26,6 +26,7 @@
 
 import numpy as np
 
+from aitemplate.compiler import symbolic
 from aitemplate.compiler.dtype import get_dtype_size, normalize_dtype
 from aitemplate.compiler.op_registry import OP_REGISTRY
 
@@ -124,7 +125,12 @@ def __init__(
             )
         self._attrs["values"] = sorted(set(values))
         if len(self._attrs["values"]) == 1:
+            self._attrs["symbolic_value"] = self._attrs["values"][0]
             self._attrs["values"] = self._attrs["values"] * 2
+        else:
+            symbolic_value = symbolic.create_new_symbol(name, values)
+            self._attrs["symbolic_value"] = symbolic_value
+            symbolic.store_intvar(symbolic_value.name, self)
 
     def __str__(self) -> str:
         return pformat(self._attrs, indent=2)
@@ -132,8 +138,7 @@ def __str__(self) -> str:
     def __eq__(self, another: Any) -> bool:
         return (
             isinstance(another, IntVar)
-            and self._attrs["values"] == another._attrs["values"]
-            and self._attrs["name"] == another._attrs["name"]
+            and self._attrs["symbolic_value"] == another._attrs["symbolic_value"]
         )
 
     def __hash__(self) -> int:
@@ -147,6 +152,10 @@ def upper_bound(self) -> int:
         """Returns upper bound of this dynamic dim."""
         return self._attrs["values"][-1]
 
+    def symbolic_value(self):
+        """Returns the symbolic value of this dynamic dim."""
+        return self._attrs["symbolic_value"]
+
     def pseudo_code(self, with_shape=False) -> str:
         return (
             self._attrs["name"]
@@ -188,6 +197,7 @@ def __init__(
         Node.__init__(self)  # pylint: disable=W0233
         self._attrs["name"] = name
         self._attrs["values"] = [value]
+        self._attrs["symbolic_value"] = value
 
     def __eq__(self, another: Union[int, IntVar]) -> bool:
         if isinstance(another, int):
@@ -868,6 +878,7 @@ def __init__(
             is_output=is_output,
         )
         self._attrs["int_var"] = int_var
+        self._attrs["symbolic_value"] = int_var._attrs["symbolic_value"]
 
     def pseudo_code(self, with_shape=True) -> str:
         return f"IntVarTensor({self._attrs['int_var'].pseudo_code()})"
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index 721782592..4e2692750 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -22,6 +22,7 @@
 from aitemplate.compiler.dtype import normalize_dtype
 from aitemplate.compiler.op_registry import OP_REGISTRY
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.ops.common.int_elementwise import INT_ELEMENTWISE_FUNC
 
 from aitemplate.utils import shape_utils
 
@@ -209,16 +210,19 @@ def _infer_shapes(self, *args: Tensor) -> List[IntVar]:
 
     def __call__(self, *args: Tensor) -> Tensor:
         converted_args = []
+        symbolic_args = []
         common_dtype = None
         assert len(args) > 0, "Elementwise ops must take at least one argument."
         for arg in args:
             if isinstance(arg, int) or isinstance(arg, float):
                 converted_args.append(Tensor(shape=[], value=arg))
+                symbolic_args.append(arg)
             elif isinstance(arg, IntVarTensor) and self._attrs["func"] == FuncEnum.SQRT:
                 assert len(arg._attrs["int_var"]._attrs["values"]) == 1
                 converted_args.append(
                     Tensor(shape=[], value=arg._attrs["int_var"]._attrs["values"][0])
                 )
+                symbolic_args.append(arg._attrs["int_var"].symbolic_value())
             elif isinstance(arg, Tensor):
                 converted_args.append(arg)
                 if common_dtype is None:
@@ -227,6 +231,7 @@ def __call__(self, *args: Tensor) -> Tensor:
                     raise NotImplementedError(
                         f"Type promotions are not supported; got dtype {arg.dtype()}, but expected {common_dtype}"
                     )
+                symbolic_args.append(arg._attrs.get("symbolic_value", None))
             else:
                 raise RuntimeError(
                     f"Unsupported data type {arg} in elementwise {self}!"
@@ -248,6 +253,10 @@ def __call__(self, *args: Tensor) -> Tensor:
         self._set_depth()
         output_shape = self._infer_shapes(*converted_args)
         output = Tensor(output_shape, src_ops={self}, dtype=common_dtype)
+        if self._attrs["func"] in INT_ELEMENTWISE_FUNC and None not in symbolic_args:
+            output._attrs["symbolic_value"] = functools.reduce(
+                INT_ELEMENTWISE_FUNC[self._attrs["func"]], symbolic_args
+            )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
index 81292fea0..e56cc1c37 100644
--- a/python/aitemplate/compiler/ops/common/int_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -28,7 +28,12 @@
 
 # pylint: disable=C0103,W0221,W0102,C0301,W0223,R1724
 
-INT_ELEMENTWISE_FUNC_COVERAGE = [FuncEnum.MUL, FuncEnum.DIV, FuncEnum.SUB, FuncEnum.ADD]
+INT_ELEMENTWISE_FUNC = {
+    FuncEnum.MUL: lambda x, y: x * y,
+    FuncEnum.DIV: lambda x, y: x / y,
+    FuncEnum.SUB: lambda x, y: x - y,
+    FuncEnum.ADD: lambda x, y: x + y,
+}
 
 
 class int_elementwise(Operator):
@@ -43,7 +48,7 @@ def __init__(self, func_enum: FuncEnum) -> None:
 
         super().__init__()
         self._attrs["op"] = "int_elementwise"
-        if func_enum not in INT_ELEMENTWISE_FUNC_COVERAGE:
+        if func_enum not in INT_ELEMENTWISE_FUNC:
             raise RuntimeError(f"Not such FuncEnum {func_enum} in int_elementwise!")
         self._attrs["func"] = func_enum
         self._attrs["has_profiler"] = False
@@ -59,12 +64,15 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
                 )
         max_vars = [max(v._attrs["values"]) for v in int_vars]
         min_vars = [min(v._attrs["values"]) for v in int_vars]
+        sym_vars = [v._attrs["symbolic_value"] for v in int_vars]
         assert len(max_vars) == len(min_vars) and len(max_vars) >= 2
         values = []
         if self._attrs["func"] == FuncEnum.MUL:
             values += [reduce(lambda x, y: x * y, lis) for lis in [min_vars, max_vars]]
+            sym_values = reduce(INT_ELEMENTWISE_FUNC[FuncEnum.MUL], sym_vars)
         elif self._attrs["func"] == FuncEnum.ADD:
             values += [reduce(lambda x, y: x + y, lis) for lis in [min_vars, max_vars]]
+            sym_values = reduce(INT_ELEMENTWISE_FUNC[FuncEnum.ADD], sym_vars)
         elif self._attrs["func"] == FuncEnum.SUB:
             inp_range = [(a, b) for a, b in zip(min_vars, max_vars)]
             # For an inputs of range [(4,9), (1,8)],
@@ -85,6 +93,7 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
                         )
                     a = (lower_bound, upper_bound)
             values = list(a)
+            sym_values = reduce(INT_ELEMENTWISE_FUNC[FuncEnum.SUB], sym_vars)
         elif self._attrs["func"] == FuncEnum.DIV:  # floordiv
             inp_range = [(a, b) for a, b in zip(min_vars, max_vars)]
             # For an inputs of range [(4,9), (1,8)],
@@ -105,9 +114,11 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
                         )
                     a = (lower_bound, upper_bound)
             values = list(a)
+            sym_values = reduce(INT_ELEMENTWISE_FUNC[FuncEnum.DIV], sym_vars)
         else:
             raise RuntimeError(f"Unsupported calculation type {self._attrs['func']}!")
         dim = shape_utils.gen_int_var_min_max(values)
+        dim._attrs["symbolic_value"] = sym_values
         for arg, iv in zip(args, int_vars):
             arg._attrs["int_var"] = iv
             assert not arg.is_a_const_num(), f"{arg} cannot be constant"
@@ -115,6 +126,7 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
         self._attrs["inputs"] = list(args)
         self._set_depth()
         output = IntVarTensor(dim, src_ops={self})
+        output._attrs["symbolic_value"] = sym_values
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/symbolic.py b/python/aitemplate/compiler/symbolic.py
new file mode 100644
index 000000000..2d5d81024
--- /dev/null
+++ b/python/aitemplate/compiler/symbolic.py
@@ -0,0 +1,113 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Symbolic helpers for AITemplate.
+
+For interesting how to use Sympy, check: https://docs.sympy.org/latest/tutorials/intro-tutorial/intro.html
+"""
+from __future__ import annotations
+
+from numbers import Number
+from typing import Any, List, Optional, Set
+
+import sympy
+
+
+_k_symbolic_to_intvar = {}
+_k_symbolic_index = 0
+_k_symbolic_value = {}
+
+
+def create_new_symbol(
+    name: Optional[str] = None,
+    values: Optional[List[int]] = None,
+    check_duplicate: bool = False,
+) -> sympy.Symbol:
+    """
+    Creates and memoizing symbols.
+
+    Parameters
+    ----------
+    name : Optional[str]
+        The symbol name that is going to be used. If None is provided, an unused
+        name would be created.
+    values : Optional[List[int]]
+        The values for IntVar, which indicates the range of which the symbol could
+        represent.
+    check_duplicate : bool
+        If set as True and name is provided, we check whether the name and values
+        provided matches the corresponding symbol recorded.
+    """
+    global _k_symbolic_index
+    global _k_symbolic_value
+
+    if name is None:
+        while True:
+            name = f"_sym_{_k_symbolic_index}"
+            _k_symbolic_index += 1
+
+            if name not in _k_symbolic_value:
+                break
+
+    values = sorted(set(values)) if values is not None else values
+    if (
+        check_duplicate
+        and name in _k_symbolic_value
+        and _k_symbolic_value[name] != values
+    ):
+        raise ValueError(
+            f"Symbol ({name}) has different values! New value is {values}, stored value is {_k_symbolic_value[name]}"
+        )
+
+    _k_symbolic_value[name] = values
+    return sympy.Symbol(name)
+
+
+def is_symbol(sym_val: Any) -> bool:
+    return isinstance(sym_val, sympy.Symbol)
+
+
+def is_symbolic(sym_val: Any) -> bool:
+    """
+    Check whether sym_val is a sympy class.
+    """
+    return isinstance(sym_val, sympy.Basic)
+
+
+def is_integer(sym_val: Any) -> bool:
+    # We wrap this since None is returned if sympy can't determine the property.
+    if is_symbolic(sym_val):
+        return sym_val.is_number and int(sym_val) - sym_val == 0
+    elif isinstance(sym_val, Number):
+        return int(sym_val) - sym_val == 0
+
+    return False
+
+
+def get_global_symbol_set() -> Set:
+    global _k_symbolic_value
+    return set(_k_symbolic_value.keys())
+
+
+def get_intvar(sym_name: str):
+    global _k_symbolic_to_intvar
+
+    return _k_symbolic_to_intvar.get(sym_name, None)
+
+
+def store_intvar(sym_name: str, int_var) -> None:
+    global _k_symbolic_to_intvar
+
+    _k_symbolic_to_intvar[sym_name] = int_var
diff --git a/python/setup.py b/python/setup.py
index 53eaa8063..1f5b14103 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -153,7 +153,7 @@ def gen_license_file_list():
     version=__version__,
     description="AITemplate: Make Templates Great for AI",
     zip_safe=True,
-    install_requires=["jinja2", "numpy"],
+    install_requires=["jinja2", "numpy", "sympy"],
     packages=find_packages(),
     package_data={
         "aitemplate": [
diff --git a/tests/unittest/compiler/test_split_view_strided.py b/tests/unittest/compiler/test_split_view_strided.py
index b11946f46..fb759de21 100644
--- a/tests/unittest/compiler/test_split_view_strided.py
+++ b/tests/unittest/compiler/test_split_view_strided.py
@@ -162,6 +162,8 @@ def test_split_view_bmm_rcr_fusion(self):
         )
 
         # bmm_rcr dynamic M, B unfusible
+        # TODO: Reactivate after reshape is supported for symbolic shape
+        """
         self._test_split_view_bmm_rcr(
             ops.bmm_rcr,
             Bs=[2, 4, 5, 10],
@@ -176,10 +178,11 @@ def test_split_view_bmm_rcr_fusion(self):
             expected_num_ops=17,
             testname="test_split_bmm_rcr_dynamic_bm_non_fusible",
         )
+        """
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
-        b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
+        # b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
         m_dim = shape_utils.gen_int_var([100, 200], "emb_pool_size")
 
         # bmm_rcr dynamic M fusible
@@ -199,6 +202,8 @@ def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
             dtype="float",
         )
         # bmm_rcr dynamic M, B unfusible
+        # TODO: Reactivate after reshape is supported for symbolic shape
+        """
         self._test_split_view_bmm_rcr(
             ops.bmm_rcr,
             Bs=[2, 4, 5, 10],
@@ -214,6 +219,7 @@ def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
             testname="test_split_bmm_rcr_dynamic_bm_non_fusible_float",
             dtype="float",
         )
+        """
 
 
 filter_test_cases_by_test_env(SplitViewStridedOpTestCase)
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index b585e0771..87d876c97 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -148,6 +148,9 @@ def _test_strided_gemm_view_cat_fusible(
         expected_num_ops: int,
         dtype: str = "float16",
     ):
+        if test_name == "gemm_reshape_cat_non_fusible_dynamic_dim":
+            # TODO: Reactivate when reshape is ready for symbolic shapes.
+            self.skipTest("")
         target = detect_target()
 
         batch_dim = IntVar([1, 2, 3], "batch_size")
diff --git a/tests/unittest/compiler/test_symbolic.py b/tests/unittest/compiler/test_symbolic.py
new file mode 100644
index 000000000..e91e6d412
--- /dev/null
+++ b/tests/unittest/compiler/test_symbolic.py
@@ -0,0 +1,105 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import unittest
+
+import sympy
+
+from aitemplate.compiler import ops, symbolic
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+
+
+class SymbolTestCase(unittest.TestCase):
+    def test_symbolic_values_existence(self):
+        imm = IntImm(value=7)
+        self.assertIsNotNone(imm._attrs.get("symbolic_value", None))
+
+        var = IntVar(values=[1, 256])
+        self.assertIsNotNone(var._attrs.get("symbolic_value", None))
+
+    def test_imm_equal(self):
+        imm1 = IntImm(value=7)
+        imm2 = IntImm(value=8)
+        imm3 = IntImm(value=7, name="dummy_name")
+
+        self.assertNotEqual(imm1, imm2)
+        self.assertEqual(imm1, imm3)
+
+    def test_var_equal(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        var2 = IntVar(values=[1, 256], name="var_2")
+        var3 = IntVar(values=[1, 256])
+
+        var1_dup = IntVar(values=[1, 256], name="var_1")
+
+        self.assertNotEqual(var1, var2)
+        self.assertNotEqual(var1, var3)
+        self.assertEqual(var1, var1_dup)
+
+    def test_new_symbol(self):
+        sym1 = symbolic.create_new_symbol(name="sym_1")  # noqa: F841
+        # Create same symbol
+        sym1_dup = symbolic.create_new_symbol(name="sym_1")  # noqa: F841
+        # Capture error if 2 symbols share the same name but different value
+        with self.assertRaises(ValueError):
+            _ = symbolic.create_new_symbol(
+                name="sym_1", values=[1, 256], check_duplicate=True
+            )
+
+        sym2 = symbolic.create_new_symbol(name="sym_2", values=[2, 32])  # noqa: F841
+        sym2_dup = symbolic.create_new_symbol(  # noqa: F841
+            name="sym_2", values=[2, 32]
+        )
+        with self.assertRaises(ValueError):
+            _ = symbolic.create_new_symbol(
+                name="sym_2", values=[1, 256], check_duplicate=True
+            )
+
+    def test_is_integer(self):
+        self.assertTrue(symbolic.is_integer(3))
+        self.assertFalse(symbolic.is_integer(3.5))
+        self.assertFalse(symbolic.is_integer("string"))
+        self.assertFalse(symbolic.is_integer([3, 4, 5]))
+
+        sym1 = sympy.Symbol("sym_1")
+        self.assertTrue(symbolic.is_integer(sym1 / sym1))
+        sym2 = 2 * sym1
+        self.assertTrue(symbolic.is_integer(sym2 / sym1))
+        sym3 = 1.5 * sym1
+        self.assertFalse(symbolic.is_integer(sym3 / sym1))
+        self.assertTrue(symbolic.is_integer(sym3 / sym1 * 2))
+
+    def test_elementwise_symbolic(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        var2 = IntVar(values=[1, 256], name="var_2")
+        sym2 = var2.symbolic_value()
+
+        tensor1 = IntVarTensor(int_var=var1)
+        tensor2 = IntVarTensor(int_var=var2)
+
+        add = ops.elementwise(FuncEnum.ADD)(tensor1, tensor2)
+        self.assertEqual(add._attrs["symbolic_value"], sym1 + sym2)
+        sub = ops.elementwise(FuncEnum.SUB)(tensor1, tensor2)
+        self.assertEqual(sub._attrs["symbolic_value"], sym1 - sym2)
+        mul = ops.elementwise(FuncEnum.MUL)(tensor1, tensor2)
+        self.assertEqual(mul._attrs["symbolic_value"], sym1 * sym2)
+        div = ops.elementwise(FuncEnum.DIV)(tensor1, tensor2)
+        self.assertEqual(div._attrs["symbolic_value"], sym1 / sym2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_view_strided_op.py b/tests/unittest/compiler/test_view_strided_op.py
index 6f637eb81..4d1736f17 100644
--- a/tests/unittest/compiler/test_view_strided_op.py
+++ b/tests/unittest/compiler/test_view_strided_op.py
@@ -485,6 +485,7 @@ def test_multiple_view_and_bmm_fusible(
         ],
         name_func=custom_name_func,
     )
+    @unittest.skip("TODO: Add back when reshape/flatten for symbolic is complete")
     def test_non_fusible_view_and_bmm(
         self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
index bf1012edd..1c065239f 100644
--- a/tests/unittest/ops/test_vanilla_attention.py
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -323,12 +323,15 @@ def _test_mha(
 
     def test_cross_attn(self):
         self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
+        # TODO: Activate after reshape is completed for symbolic shapes
+        """
         self._test_mha(
             batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
         )
         self._test_mha(
             batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
         )
+        """
 
 
 if __name__ == "__main__":

From 88f00b1bf932aa12753a95f1ed228807cd957566 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 21 Mar 2023 14:38:54 -0700
Subject: [PATCH 302/638] Add Symbolic Shape for view_ops (#440)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/440

Add Symbolic Shape for view_ops

Reviewed By: chenyang78

Differential Revision: D43845771

fbshipit-source-id: 1dddb771a4c6018539ecff078d71ac70db433c5a
---
 .../compiler/ops/common/view_ops.py           | 267 +++++++++++-------
 python/aitemplate/compiler/symbolic.py        |  23 ++
 .../compiler/test_split_view_strided.py       |   8 +-
 .../compiler/test_strided_view_cat.py         |   3 -
 .../unittest/compiler/test_view_strided_op.py |   1 -
 tests/unittest/ops/test_flatten.py            |  69 +++++
 tests/unittest/ops/test_reshape.py            |  81 +++++-
 tests/unittest/ops/test_size_getitem_ops.py   |   2 +
 tests/unittest/ops/test_vanilla_attention.py  |   3 -
 9 files changed, 333 insertions(+), 124 deletions(-)

diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 0cf30564e..8c55ae30b 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -16,10 +16,10 @@
 View ops.
 """
 
-import itertools
 import logging
 import math
-from typing import Any, List, Optional, Tuple, Union
+from functools import reduce
+from typing import Any, List, Optional, Union
 
 import jinja2
 
@@ -34,8 +34,15 @@
     Operator,
     Tensor,
 )
-from aitemplate.utils.shape_utils import convert_shape_to_IntVar
-
+from aitemplate.compiler.symbolic import (
+    get_global_symbol_set,
+    get_intvar,
+    is_integer,
+    is_symbol,
+    is_symbolic,
+    simplify_intvar_values,
+)
+from aitemplate.utils.shape_utils import convert_shape_to_IntVar, gen_int_var_min_max
 from aitemplate.utils.tensor_utils import wrap_dim
 
 
@@ -139,7 +146,9 @@ def make_output_shape_from_int_vars(
             else:
                 # dynamic dimension
                 dim_name = int_var._attrs["name"]
-                output_shape.append(IntVar(name=dim_name, values=dim_values))
+                var = IntVar(name=dim_name, values=dim_values)
+                var._attrs["symbolic_value"] = int_var._attrs["symbolic_value"]
+                output_shape.append(var)
         return output_shape
 
     def make_output_shape(
@@ -185,6 +194,16 @@ def _is_dynamic_dim_reused(x_shape_values, y_shape_values) -> bool:
     )
 
 
+def _get_shape_values(symbolic_shape_values, shape_values):
+    new_shape_values = []
+    for sym, var in zip(symbolic_shape_values, shape_values):
+        if is_integer(sym):
+            new_shape_values.append([int(sym)])
+        else:
+            new_shape_values.append(var._attrs["values"])
+    return new_shape_values
+
+
 class reshape(_reshape_base):
     """
     Returns a tensor with the same data and number of elements as input, but with the
@@ -201,84 +220,135 @@ def __init__(self) -> None:
         self.shape_eval_template = RESHAPE_FUNC_TEMPLATE
         self.dynamic_eval_template = DYNAMIC_RESHAPE_FUNC_TEMPLATE
 
-    def _infer_shape(self, x: Tuple[int], shape: Tuple[int]):
-        new_shape = list(shape)
-        cur_shape = x
-        unknown_idx = -1
-        prod = 1
-        for idx, v in enumerate(new_shape):
-            if v == -1:
-                # no multiple -1s
-                assert unknown_idx == -1
-                unknown_idx = idx
-            else:
-                prod *= v
-        numel = 1
-        for dim in cur_shape:
-            numel *= dim
-
-        if unknown_idx == -1:
-            assert (
-                numel == prod
-            ), f"When there is no unknown index, we expect dim products to be equal, got current shape {numel=} != new shape {prod=}"
-        else:
-            # FIXME: note that this RuntimeError rules out some "valid" PyTorch
-            # code like:
-            # t = torch.arange(0).reshape(4, 0)
-            # this is valid in PT but would trigger RuntimeError below
-            # t.reshape(2, 2, -1)
-            # We can fix it later.
-            if prod <= 0:
-                raise RuntimeError(f"cannot reshape tensor {x} with shape {shape}")
-            assert numel % prod == 0
-            new_shape[unknown_idx] = numel // prod
-        return new_shape
-
     def _infer_shapes(self, x: Tensor):
         # There are two cases:
         # 1) there is only one unknown shape.
         # 2) there is no unkown shape and all shape dimensions are represented as IntVarTensor
-        # For 1), the view op will deduce the shape of if one dim is labeled as -1,
+        # For 1), the view op will deduce the shape of the dim that is labeled as -1,
         #         but it can't do so with more than 1 dynamic dimension
         # For 2), when all dynamic shapes are known, we should be able to pass the input shape to out.
         #         i.e. we should skip the deduction when all shapes are known.
         is_intvar = all([isinstance(var, IntVarTensor) for var in self._attrs["shape"]])
         self._attrs["is_intvar"] = is_intvar
+
         if not is_intvar:
-            x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-            x_dynamic_dims = [
-                var for var in x._attrs["shape"] if 1 < len(var._attrs["values"])
+            # x_symbolic_shapes is a list of symbolic_values
+            x_symbolic_shapes = [
+                var._attrs["symbolic_value"] for var in x._attrs["shape"]
+            ]
+            x_symbolic_shapes_mapping = {
+                var._attrs["symbolic_value"]: var for var in x._attrs["shape"]
+            }
+            # x_shape_values is a list of valid IntVar _attrs["values"]
+            x_shape_values = _get_shape_values(x_symbolic_shapes, x._attrs["shape"])
+            # x_shape_symbolic_values is a list of valid _attrs["symbolic_values"]
+            x_shape_symbolic_values = [
+                shape_values[0] if len(shape_values) < 2 else sym
+                for sym, shape_values in zip(x_symbolic_shapes, x_shape_values)
             ]
-            x_shapes = list(itertools.product(*x_shape_values))
 
             self._attrs["shape"] = convert_shape_to_IntVar(self._attrs["shape"])
-            new_shape_vals = [var._attrs["values"] for var in self._attrs["shape"]]
-            new_shapes = list(itertools.product(*new_shape_vals))
-
-            # len(x_shapes) > 1 means that at least 1 dim in the shapes of x is dynamic.
-            # len(new_shapes) > 1 means that the dynamic dim is retained; otherwise, it would
-            # have been replaced with -1 or a concrete number.
-            if len(x_shapes) > len(new_shapes):
-                # we only support two cases here, when len(x_shapes) > 1, len(x_shapes) must
-                # be either len(new_shapes) (the dynamic dim is retained) or 1 (use -1 to
-                # mark the dynamic or unknown index and no other dim is dynamic).
-                assert len(new_shapes) == 1
-                new_shapes = new_shapes * len(x_shapes)
-            # run infershape for each
-            y_shapes = [
-                self._infer_shape(x_shape, new_shape)
-                for x_shape, new_shape in zip(x_shapes, new_shapes)
+            to_symbolic_shapes = [
+                var._attrs["symbolic_value"] for var in self._attrs["shape"]
+            ]
+            # new_shape_values is a list of valid IntVar _attrs["values"] with the
+            # only exception being it including an -1.
+            new_shape_values = _get_shape_values(
+                to_symbolic_shapes, self._attrs["shape"]
+            )
+            new_shape_symbolic_values = [
+                shape_values[0] if len(shape_values) < 2 else sym
+                for sym, shape_values in zip(to_symbolic_shapes, new_shape_values)
             ]
 
-            def unique(vector):
-                return sorted(set(vector))
+            # Check whether we have -1 that needs to be deduced
+            neg_dim = None
+            for idx, s in enumerate(new_shape_values):
+                if len(s) == 1 and s[0] == -1:
+                    assert neg_dim is None, "Multiple -1 detected in reshape"
+                    neg_dim = idx
 
-            y_shape_values = list(map(unique, zip(*y_shapes)))
-            reuse_dynamic_dim = _is_dynamic_dim_reused(x_shape_values, y_shape_values)
-            return self.make_output_shape(
-                y_shape_values,
-                dynamic_dim=x_dynamic_dims[0] if reuse_dynamic_dim else None,
+            x_prod = reduce(
+                lambda x, y: x * y, [val for val in x_shape_symbolic_values if val != 0]
             )
+            new_prod = reduce(
+                lambda x, y: x * y,
+                [val for val in new_shape_symbolic_values if val != 0],
+            )
+            quotient = x_prod / new_prod
+            if neg_dim is not None and is_integer(quotient):
+                # We check whether the negative -1 is static.
+                val = int(quotient * -1)
+                new_shape_symbolic_values[neg_dim] = val
+                self._attrs["shape"][neg_dim] = IntImm(val)
+                neg_dim = None
+
+            if neg_dim is None:
+                # We try to simplify symbols before returning the shapes.
+                symbol_idx = [
+                    idx
+                    for idx, s in enumerate(new_shape_symbolic_values)
+                    if is_symbolic(s)
+                ]
+
+                if len(symbol_idx) == 1:
+                    # Check if we can reuse shapes and if the shape belongs to
+                    # unknown_idx and need to be determined during runtime.
+                    new_prod = 1
+                    for idx, val in enumerate(new_shape_symbolic_values):
+                        if idx == symbol_idx[0]:
+                            continue
+                        if val != 0:
+                            new_prod *= val
+                    dynamic_symbol = x_prod / new_prod
+                    if is_symbol(dynamic_symbol):
+                        self._attrs["shape"][symbol_idx[0]] = get_intvar(
+                            dynamic_symbol.name
+                        )
+                    elif is_integer(dynamic_symbol):
+                        self._attrs["shape"][symbol_idx[0]] = IntImm(
+                            int(dynamic_symbol)
+                        )
+                    else:
+                        self._attrs["unknown_idx"] = symbol_idx[0]
+                # TODO: Handle len(symbol_idx) > 1 with recording previous symbols.
+
+                return self._attrs["shape"]
+            else:
+                # We try to deduce the dynamic dimensions for new_shapes.
+                self._attrs["unknown_idx"] = neg_dim
+
+                y_shapes = []
+                for idx, val in enumerate(new_shape_symbolic_values):
+                    if idx == self._attrs["unknown_idx"]:
+                        dynamic_symbol = x_prod / new_prod * -1
+                        if is_symbol(dynamic_symbol):
+                            y_shapes.append(get_intvar(dynamic_symbol.name))
+                        elif is_integer(dynamic_symbol):
+                            y_shapes.append(IntImm(int(dynamic_symbol)))
+                        else:
+                            symbol_names = {s.name for s in dynamic_symbol.free_symbols}
+                            assert (
+                                len(symbol_names - get_global_symbol_set()) == 0
+                            ), "Unable to deduce dynamic symbol"
+
+                            values = simplify_intvar_values(dynamic_symbol)
+                            new_var = IntVar(values)
+                            new_var._attrs["symbolic_value"] = dynamic_symbol
+
+                            y_shapes.append(new_var)
+                    elif isinstance(val, int):
+                        y_shapes.append(IntImm(val))
+                    elif val in x_symbolic_shapes_mapping:
+                        y_shapes.append(x_symbolic_shapes_mapping[val])
+                    elif is_symbolic(val):
+                        val_var = gen_int_var_min_max(new_shape_values[idx])
+                        val_var._attrs["symbolic_value"] = val
+                        y_shapes.append(val_var)
+                    else:
+                        raise ValueError(f"Unknown sym type for handling {val}")
+            return y_shapes
+
         else:
             return self.make_output_shape_from_int_vars(self._attrs["shape"])
 
@@ -334,46 +404,43 @@ def __init__(self, start_dim=0, end_dim=-1) -> None:
         self._attrs["start"] = start_dim
         self._attrs["end"] = end_dim
 
-    def _infer_shape(self, x: List[int]):
-        start = self._attrs["start"]
-        end = self._attrs["end"]
-
-        start = wrap_dim(start, len(x))
-        end = wrap_dim(end, len(x))
-
-        new_shape = []
-        for idx in range(start):
-            new_shape.append(x[idx])
-
-        prod = 1
-        for dim in x[start : end + 1]:
-            prod *= dim
-        new_shape.append(prod)
+    def _infer_shapes(self, x: Tensor):
+        # x_symbolic_shapes is a list of symbolic_values
+        x_symbolic_shapes = [var._attrs["symbolic_value"] for var in x._attrs["shape"]]
+        # x_shape_values is a list of valid IntVar _attrs["values"]
+        x_shape_values = _get_shape_values(x_symbolic_shapes, x._attrs["shape"])
+        # x_shape_symbolic_values is a list of valid _attrs["symbolic_values"]
+        x_shape_symbolic_values = [
+            shape_values[0] if len(shape_values) < 2 else sym
+            for sym, shape_values in zip(x_symbolic_shapes, x_shape_values)
+        ]
 
-        for dim in x[end + 1 :]:
-            new_shape.append(dim)
+        start = wrap_dim(self._attrs["start"], len(x_symbolic_shapes))
+        end = wrap_dim(self._attrs["end"], len(x_symbolic_shapes))
+        self._attrs["unknown_idx"] = start
 
-        return new_shape
+        # Computed shape after flatten.
+        new_shapes = []
 
-    def _infer_shapes(self, x: Tensor):
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        x_dynamic_dims = [
-            var for var in x._attrs["shape"] if 1 < len(var._attrs["values"])
-        ]
+        for var in x._attrs["shape"][:start]:
+            new_shapes.append(var)
 
-        # run infershape for each
-        y_shapes = [self._infer_shape(x_shape) for x_shape in x_shapes]
+        min_val, max_val, sym_val = 1, 1, 1
+        for idx in range(start, end + 1):
+            min_val *= min(x_shape_values[idx])
+            max_val *= max(x_shape_values[idx])
+            sym_val *= x_shape_symbolic_values[idx]
+        if min_val == max_val:
+            flatten_shape = IntImm(value=min_val)
+        else:
+            flatten_shape = IntVar(values=[min_val, max_val])
+            flatten_shape._attrs["symbolic_value"] = sym_val
+        new_shapes.append(flatten_shape)
 
-        def unique(vector):
-            return sorted(set(vector))
+        for var in x._attrs["shape"][end + 1 :]:
+            new_shapes.append(var)
 
-        y_shape_values = list(map(unique, zip(*y_shapes)))
-        reuse_dynamic_dim = _is_dynamic_dim_reused(x_shape_values, y_shape_values)
-        return self.make_output_shape(
-            y_shape_values,
-            dynamic_dim=x_dynamic_dims[0] if reuse_dynamic_dim else None,
-        )
+        return new_shapes
 
     def _sanity_check(self, x_shape):
         x_rank = len(x_shape)
diff --git a/python/aitemplate/compiler/symbolic.py b/python/aitemplate/compiler/symbolic.py
index 2d5d81024..9ccd019f6 100644
--- a/python/aitemplate/compiler/symbolic.py
+++ b/python/aitemplate/compiler/symbolic.py
@@ -19,6 +19,8 @@
 """
 from __future__ import annotations
 
+import itertools
+
 from numbers import Number
 from typing import Any, List, Optional, Set
 
@@ -111,3 +113,24 @@ def store_intvar(sym_name: str, int_var) -> None:
     global _k_symbolic_to_intvar
 
     _k_symbolic_to_intvar[sym_name] = int_var
+
+
+def simplify_intvar_values(sym_val: sympy.Basic):
+    """
+    Given a symbolic value, resolve the symbol's value range.
+
+    Example:
+    'symbol_A' has value range of [10, 20]
+    simplify_intvar_values(symbol_A * 3 + 4) returns [34, 64]
+    """
+    global _k_symbolic_value
+
+    symbols = list(sym_val.free_symbols)
+    symbol_shapes = [_k_symbolic_value[s.name] for s in symbols]
+    symbol_shapes = [s for s in symbol_shapes if s is not None]
+    shape_perms = list(itertools.product(*symbol_shapes))
+
+    new_shape = [int(sym_val.subs(zip(symbols, s))) for s in shape_perms]
+    new_shape = sorted(set(new_shape))
+
+    return new_shape
diff --git a/tests/unittest/compiler/test_split_view_strided.py b/tests/unittest/compiler/test_split_view_strided.py
index fb759de21..b11946f46 100644
--- a/tests/unittest/compiler/test_split_view_strided.py
+++ b/tests/unittest/compiler/test_split_view_strided.py
@@ -162,8 +162,6 @@ def test_split_view_bmm_rcr_fusion(self):
         )
 
         # bmm_rcr dynamic M, B unfusible
-        # TODO: Reactivate after reshape is supported for symbolic shape
-        """
         self._test_split_view_bmm_rcr(
             ops.bmm_rcr,
             Bs=[2, 4, 5, 10],
@@ -178,11 +176,10 @@ def test_split_view_bmm_rcr_fusion(self):
             expected_num_ops=17,
             testname="test_split_bmm_rcr_dynamic_bm_non_fusible",
         )
-        """
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
-        # b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
+        b_dim = shape_utils.gen_int_var([1, 1024], "batch_size")
         m_dim = shape_utils.gen_int_var([100, 200], "emb_pool_size")
 
         # bmm_rcr dynamic M fusible
@@ -202,8 +199,6 @@ def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
             dtype="float",
         )
         # bmm_rcr dynamic M, B unfusible
-        # TODO: Reactivate after reshape is supported for symbolic shape
-        """
         self._test_split_view_bmm_rcr(
             ops.bmm_rcr,
             Bs=[2, 4, 5, 10],
@@ -219,7 +214,6 @@ def test_split_view_bmm_rcr_fusion_fp32_sm80(self):
             testname="test_split_bmm_rcr_dynamic_bm_non_fusible_float",
             dtype="float",
         )
-        """
 
 
 filter_test_cases_by_test_env(SplitViewStridedOpTestCase)
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index 87d876c97..b585e0771 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -148,9 +148,6 @@ def _test_strided_gemm_view_cat_fusible(
         expected_num_ops: int,
         dtype: str = "float16",
     ):
-        if test_name == "gemm_reshape_cat_non_fusible_dynamic_dim":
-            # TODO: Reactivate when reshape is ready for symbolic shapes.
-            self.skipTest("")
         target = detect_target()
 
         batch_dim = IntVar([1, 2, 3], "batch_size")
diff --git a/tests/unittest/compiler/test_view_strided_op.py b/tests/unittest/compiler/test_view_strided_op.py
index 4d1736f17..6f637eb81 100644
--- a/tests/unittest/compiler/test_view_strided_op.py
+++ b/tests/unittest/compiler/test_view_strided_op.py
@@ -485,7 +485,6 @@ def test_multiple_view_and_bmm_fusible(
         ],
         name_func=custom_name_func,
     )
-    @unittest.skip("TODO: Add back when reshape/flatten for symbolic is complete")
     def test_non_fusible_view_and_bmm(
         self, test_name: str, input0: Tensor, input1: Tensor, dtype: str
     ):
diff --git a/tests/unittest/ops/test_flatten.py b/tests/unittest/ops/test_flatten.py
index cbf1920f2..ca2dca531 100644
--- a/tests/unittest/ops/test_flatten.py
+++ b/tests/unittest/ops/test_flatten.py
@@ -175,6 +175,75 @@ def test_flatten_fp32(self):
             dtype="float32",
         )
 
+    def _test_flatten_shape(self, in_shape, out_shape, start_dim, end_dim):
+        X = Tensor(
+            shape=in_shape,
+            name="input_0",
+            is_input=True,
+        )
+
+        OP = nn.Flatten(start_dim, end_dim)
+        Y = OP(X)
+
+        y_shape = Y.shape()
+        self.assertEqual(len(y_shape), len(out_shape))
+        for y, o in zip(y_shape, out_shape):
+            self.assertEqual(y, o)
+
+    def test_flatten_shape_imm(self):
+        in_shape = [IntImm(17), IntImm(19), IntImm(23)]
+
+        self._test_flatten_shape(in_shape, [IntImm(17 * 19 * 23)], 0, 2)
+        self._test_flatten_shape(in_shape, [IntImm(17 * 19 * 23)], 0, -1)
+        self._test_flatten_shape(in_shape, [IntImm(17 * 19), IntImm(23)], 0, 1)
+        self._test_flatten_shape(in_shape, [IntImm(17), IntImm(19 * 23)], 1, 2)
+        self._test_flatten_shape(in_shape, [IntImm(17), IntImm(19 * 23)], 1, -1)
+
+    def test_flatten_shape_var(self):
+        var1 = IntVar(values=[1, 2], name="var1")
+        var2 = IntVar(values=[3, 5], name="var2")
+        var3 = IntVar(values=[7, 11], name="var3")
+        sym1 = var1._attrs["symbolic_value"]
+        sym2 = var2._attrs["symbolic_value"]
+        sym3 = var3._attrs["symbolic_value"]
+        in_shape = [var1, var2, var3]
+
+        ovar1 = IntVar(values=[21, 110])
+        ovar1._attrs["symbolic_value"] = sym1 * sym2 * sym3
+        self._test_flatten_shape(in_shape, [ovar1], 0, 2)
+        self._test_flatten_shape(in_shape, [ovar1], 0, -1)
+        ovar1 = IntVar(values=[3, 10])
+        ovar1._attrs["symbolic_value"] = sym1 * sym2
+        self._test_flatten_shape(in_shape, [ovar1, var3], 0, 1)
+        ovar1 = IntVar(values=[21, 55])
+        ovar1._attrs["symbolic_value"] = sym2 * sym3
+        self._test_flatten_shape(in_shape, [var1, ovar1], 1, 2)
+        self._test_flatten_shape(in_shape, [var1, ovar1], 1, -1)
+
+    def test_flatten_shape_mix(self):
+        var1 = IntVar(values=[1, 2], name="var1")
+        var2 = IntVar(values=[3, 5], name="var2")
+        var3 = IntVar(values=[7, 11], name="var3")
+        imm1 = IntImm(17)
+        imm2 = IntImm(19)
+        sym1 = var1._attrs["symbolic_value"]
+        sym2 = var2._attrs["symbolic_value"]
+        sym3 = var3._attrs["symbolic_value"]
+        in_shape = [var1, imm1, var2, var3, imm2]
+
+        ovar1 = IntVar(values=[51, 170])
+        ovar1._attrs["symbolic_value"] = sym1 * 17 * sym2
+        self._test_flatten_shape(in_shape, [ovar1, var3, imm2], 0, 2)
+        ovar1 = IntVar(values=[6783, 35530])
+        ovar1._attrs["symbolic_value"] = 323 * sym1 * sym2 * sym3
+        self._test_flatten_shape(in_shape, [ovar1], 0, -1)
+        ovar1 = IntVar(values=[357, 935])
+        ovar1._attrs["symbolic_value"] = 17 * sym2 * sym3
+        self._test_flatten_shape(in_shape, [var1, ovar1, imm2], 1, 3)
+        ovar1 = IntVar(values=[6783, 17765])
+        ovar1._attrs["symbolic_value"] = 323 * sym2 * sym3
+        self._test_flatten_shape(in_shape, [var1, ovar1], 1, -1)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_reshape.py b/tests/unittest/ops/test_reshape.py
index a7252b635..724beb583 100644
--- a/tests/unittest/ops/test_reshape.py
+++ b/tests/unittest/ops/test_reshape.py
@@ -16,8 +16,8 @@
 import unittest
 
 import torch
-from aitemplate.compiler import compile_model
-from aitemplate.compiler.ops.common.view_ops import reshape
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntVarTensor
 
 from aitemplate.frontend import IntImm, IntVar, nn, Tensor
 from aitemplate.testing import detect_target
@@ -25,6 +25,39 @@
 
 
 class ReshapeTestCase(unittest.TestCase):
+    def _infer_shape(self, x, shape):
+        new_shape = list(shape)
+        cur_shape = x
+        unknown_idx = -1
+        prod = 1
+        for idx, v in enumerate(new_shape):
+            if v == -1:
+                # no multiple -1s
+                assert unknown_idx == -1
+                unknown_idx = idx
+            else:
+                prod *= v
+        numel = 1
+        for dim in cur_shape:
+            numel *= dim
+
+        if unknown_idx == -1:
+            assert (
+                numel == prod
+            ), f"When there is no unknown index, we expect dim products to be equal, got current shape {numel=} != new shape {prod=}"
+        else:
+            # FIXME: note that this RuntimeError rules out some "valid" PyTorch
+            # code like:
+            # t = torch.arange(0).reshape(4, 0)
+            # this is valid in PT but would trigger RuntimeError below
+            # t.reshape(2, 2, -1)
+            # We can fix it later.
+            if prod <= 0:
+                raise RuntimeError(f"cannot reshape tensor {x} with shape {shape}")
+            assert numel % prod == 0
+            new_shape[unknown_idx] = numel // prod
+        return new_shape
+
     def _test_reshape(
         self,
         batch_size=(1, 3),
@@ -89,7 +122,6 @@ def _test_reshape_single_op(
         )
 
         OP = nn.Reshape()
-        OP_backend = reshape()
         Y = OP(X, Y_shape)
 
         Y._attrs["name"] = "output_0"
@@ -103,8 +135,9 @@ def _test_reshape_single_op(
         if len(x_shapes) > len(new_shapes):
             assert len(new_shapes) == 1
             new_shapes = new_shapes * len(x_shapes)
+
         y_shapes = [
-            OP_backend._infer_shape(x_shape, new_shape)
+            self._infer_shape(x_shape, new_shape)
             for x_shape, new_shape in zip(x_shapes, new_shapes)
         ]
 
@@ -145,7 +178,7 @@ def test_reshape(self):
         )
         self._test_reshape_single_op(
             X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
-            Y_shape=(5, 4, IntVar(values=(2, 4)), 3, -1),
+            Y_shape=(5, 4, IntVar(values=(2, 4), name="input_batch"), 3, -1),
             test_name="reshape_name_unknown_static_dim",
             check_name_retention=True,
         )
@@ -155,11 +188,6 @@ def test_reshape(self):
             test_name="reshape_name_no_unknown_dims",
             check_name_retention=True,
         )
-        self._test_reshape_single_op(
-            X_shape=(IntVar(values=(2, 4), name="input_batch"), 1, 120),
-            Y_shape=(IntVar(values=(10, 20)), 4, 2, 3, -1),
-            test_name="reshape_squeeze_intvar_dim",
-        )
         self._test_reshape_single_op(
             X_shape=(IntVar(values=(20, 40), name="input_batch"), 1, 12),
             Y_shape=(4, 2, IntVar(values=(2, 4)), 3, 5),
@@ -170,6 +198,39 @@ def test_reshape(self):
     def test_reshape_float32(self):
         self._test_reshape_single_op(input_type="float32", test_name="reshape_float32")
 
+    def _test_reshape_shape(self, in_shape, out_shape, target_shape):
+        X = Tensor(
+            shape=in_shape,
+            name="input_0",
+            is_input=True,
+        )
+
+        OP = nn.Reshape()
+        Y = OP(X, target_shape)
+
+        y_shape = Y.shape()
+        self.assertEqual(len(y_shape), len(out_shape))
+        for y, o in zip(y_shape, out_shape):
+            self.assertEqual(y, o)
+
+    def test_reshape_shape_symbolic(self):
+        dummy_shape = Tensor(
+            shape=[1, 2],
+            name="dummy_shape",
+            is_input=True,
+        )
+        var1 = IntVar(values=[2, 4], name="var1")
+        tensor1 = IntVarTensor(var1)
+        X_shape = [var1, IntImm(256)]
+
+        intvar = [ops.size()(dummy_shape, idx) for idx in range(2)]
+
+        target_shape = [intvar[1] * tensor1, IntImm(-1)]
+        outdim0 = IntVar(values=[4, 8])
+        outdim0._attrs["symbolic_value"] = var1._attrs["symbolic_value"] * 2
+        answer_shape = [outdim0, IntImm(128)]
+        self._test_reshape_shape(X_shape, answer_shape, target_shape)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_size_getitem_ops.py b/tests/unittest/ops/test_size_getitem_ops.py
index fdb08f346..173516bff 100644
--- a/tests/unittest/ops/test_size_getitem_ops.py
+++ b/tests/unittest/ops/test_size_getitem_ops.py
@@ -148,6 +148,7 @@ def _test_tensor_size_op(
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
+    @unittest.skip("TODO: Activate after symbolic shape for concatenate is ready")
     def test_tensor_size_op_fp16(self):
         self._test_tensor_size_op(
             test_name="tensor_size_op_fp16",
@@ -160,6 +161,7 @@ def test_tensor_size_op_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    @unittest.skip("TODO: Activate after symbolic shape for concatenate is ready")
     def test_tensor_size_op_fp32(self):
         self._test_tensor_size_op(
             test_name="tensor_size_op_fp32",
diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
index 1c065239f..bf1012edd 100644
--- a/tests/unittest/ops/test_vanilla_attention.py
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -323,15 +323,12 @@ def _test_mha(
 
     def test_cross_attn(self):
         self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
-        # TODO: Activate after reshape is completed for symbolic shapes
-        """
         self._test_mha(
             batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
         )
         self._test_mha(
             batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
         )
-        """
 
 
 if __name__ == "__main__":

From c1bbd4f2d0d2afb7a43b76939345d8d2c76acee0 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 21 Mar 2023 14:38:54 -0700
Subject: [PATCH 303/638] Add symbolic shape support for concatenate (#439)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/439

Add symbolic support for concatenate

Reviewed By: chenyang78

Differential Revision: D44079718

fbshipit-source-id: 201fbba7c9c05b1a2c58f8dc91e0224d4fb90417
---
 .../compiler/ops/tensor/concatenate.py        | 11 +++-
 tests/unittest/ops/test_concatenate.py        | 62 +++++++++++++++++++
 tests/unittest/ops/test_size_getitem_ops.py   |  2 -
 3 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index 51c792d93..15a02fabc 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -15,6 +15,7 @@
 """
 Concatenate.
 """
+from functools import reduce
 from typing import List, Sequence, Union
 
 from aitemplate import backend
@@ -90,9 +91,15 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             if idx == dim:
                 min_value_sum = sum(value[0] for value in lst)
                 max_value_sum = sum(value[-1] for value in lst)
-                output_shape.append(
-                    shape_utils.gen_int_var([min_value_sum, max_value_sum])
+                shape_var = shape_utils.gen_int_var([min_value_sum, max_value_sum])
+                shape_var._attrs["symbolic_value"] = reduce(
+                    lambda x, y: x + y,
+                    [
+                        input_shape[idx]._attrs["symbolic_value"]
+                        for input_shape in input_shapes
+                    ],
                 )
+                output_shape.append(shape_var)
             else:
                 output_dim = input_shapes[0][idx]
                 for shape in input_shapes:
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index d9bc93a6e..87e686f23 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -17,6 +17,7 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm, IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import get_random_torch_tensor
@@ -371,6 +372,67 @@ def test_floats(self, dtype):
             input_type=dtype,
         )
 
+    def _test_concatenate_shape(self, in_shapes, out_shape, dim):
+        Xs = [
+            Tensor(
+                shape=in_shape,
+                name=f"input_{idx}",
+                is_input=True,
+            )
+            for idx, in_shape in enumerate(in_shapes)
+        ]
+
+        Y = ops.concatenate()(Xs, dim)
+
+        y_shape = Y.shape()
+        self.assertEqual(len(y_shape), len(out_shape))
+        for y, o in zip(y_shape, out_shape):
+            self.assertEqual(y, o)
+
+    def test_concatenate_shape_var(self):
+        var1 = IntVar(values=[1, 2], name="var1")
+        var2 = IntVar(values=[3, 5], name="var2")
+        var3 = IntVar(values=[7, 11], name="var3")
+        sym1 = var1._attrs["symbolic_value"]
+        sym2 = var2._attrs["symbolic_value"]
+        sym3 = var3._attrs["symbolic_value"]
+
+        in_shapes = [[var, 2, 3] for var in [var1, var2, var3]]
+        ovar1 = IntVar(values=[11, 18])
+        ovar1._attrs["symbolic_value"] = sym1 + sym2 + sym3
+        self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], 0)
+        self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], -3)
+
+    def test_concatenate_shape_mix(self):
+        var1 = IntVar(values=[1, 2], name="var1")
+        var2 = IntVar(values=[3, 5], name="var2")
+        imm1 = IntImm(17)
+        imm2 = IntImm(19)
+        sym1 = var1._attrs["symbolic_value"]
+        sym2 = var2._attrs["symbolic_value"]
+
+        in_shapes = [[var1, 2, 3], [imm1, 2, 3], [imm2, 2, 3], [var2, 2, 3]]
+        ovar1 = IntVar(values=[40, 43])
+        ovar1._attrs["symbolic_value"] = sym1 + sym2 + 17 + 19
+        self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], 0)
+
+    def test_concatenate_shape_compatible(self):
+        var1 = IntVar(values=[1, 2])
+        sym1 = var1._attrs["symbolic_value"]
+
+        in_shapes = [[var1, 2, 3], [var1, 2, 3]]
+        self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+
+        dup_var1 = IntVar(values=[1, 2])
+        dup_var1._attrs["symbolic_value"] = sym1
+        in_shapes = [[var1, 2, 3], [dup_var1, 2, 3]]
+        self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+
+        var2 = IntVar(values=[1, 2])
+        with self.assertRaises(RuntimeError):
+            in_shapes = [[var1, 2, 3], [var2, 2, 3]]
+            self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_size_getitem_ops.py b/tests/unittest/ops/test_size_getitem_ops.py
index 173516bff..fdb08f346 100644
--- a/tests/unittest/ops/test_size_getitem_ops.py
+++ b/tests/unittest/ops/test_size_getitem_ops.py
@@ -148,7 +148,6 @@ def _test_tensor_size_op(
 
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
-    @unittest.skip("TODO: Activate after symbolic shape for concatenate is ready")
     def test_tensor_size_op_fp16(self):
         self._test_tensor_size_op(
             test_name="tensor_size_op_fp16",
@@ -161,7 +160,6 @@ def test_tensor_size_op_fp16(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skip("TODO: Activate after symbolic shape for concatenate is ready")
     def test_tensor_size_op_fp32(self):
         self._test_tensor_size_op(
             test_name="tensor_size_op_fp32",

From 6fcc155ef06774a3da19b55f999d2aaa30c45dd8 Mon Sep 17 00:00:00 2001
From: Ying Zhang <ipiszy@users.noreply.github.com>
Date: Tue, 21 Mar 2023 22:46:37 -0700
Subject: [PATCH 304/638] Add rocm ci into AIT (#441)

Summary:
ATT, add a placeholder to enable rocm ci in github first.
No actual tests will be triggered. Need this PR to be landed first to unblock https://github.com/facebookincubator/AITemplate/pull/146.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/441

Reviewed By: yinghai

Differential Revision: D44285053

Pulled By: ipiszy

fbshipit-source-id: ee6f4bb106c8617eeead41d8369ceb149339c884
---
 .github/workflows/rocm_ci.yml | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 .github/workflows/rocm_ci.yml

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
new file mode 100644
index 000000000..4e0f2c92c
--- /dev/null
+++ b/.github/workflows/rocm_ci.yml
@@ -0,0 +1,30 @@
+name: ROCM_CI
+
+on:
+  push:
+
+jobs:
+  build:
+    runs-on: rocm
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Get CPU info on Ubuntu
+      if: contains(runner.os, 'linux')
+      run: |
+        cat /proc/cpuinfo
+    - name: Get env vars
+      run: |
+        echo GITHUB_WORKFLOW   = $GITHUB_WORKFLOW
+        echo HOME              = $HOME
+        echo GITHUB_ACTION     = $GITHUB_ACTION
+        echo GITHUB_ACTIONS    = $GITHUB_ACTIONS
+        echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
+        echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
+        echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
+        echo GITHUB_WORKSPACE  = $GITHUB_WORKSPACE
+        echo GITHUB_SHA        = $GITHUB_SHA
+        echo GITHUB_REF        = $GITHUB_REF
+        export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
+        echo GIT_BRANCH        = $GIT_BRANCH
+        c++ --verbose

From 6c9bf5405894b4fa93e4abd4118ee7b694993557 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 02:29:08 -0700
Subject: [PATCH 305/638] Fix typo remplate (#450)

Summary:
Fix typo remplate -> template

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/450

Reviewed By: tenpercent

Differential Revision: D44278987

Pulled By: aakhundov

fbshipit-source-id: 68a6d824ceaac18d4ec1667b718b4153d2c180c3
---
 python/aitemplate/backend/cuda/conv2d/common.py             | 4 ++--
 .../backend/cuda/conv2d/common_conv2d_bias_activation.py    | 4 ++--
 .../cuda/conv2d/common_conv2d_bias_add_activation.py        | 4 ++--
 python/aitemplate/backend/cuda/conv2d/conv2d.py             | 4 ++--
 python/aitemplate/backend/cuda/conv2d/conv2d_bias.py        | 4 ++--
 python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py    | 4 ++--
 .../backend/cuda/conv2d/conv2d_bias_add_hardswish.py        | 4 ++--
 .../aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py  | 4 ++--
 .../backend/cuda/conv2d/conv2d_bias_few_channels.py         | 6 +++---
 .../aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py | 4 ++--
 .../cuda/conv2d/conv2d_bias_hardswish_few_channels.py       | 6 +++---
 python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py   | 4 ++--
 .../backend/cuda/conv2d/conv2d_bias_relu_few_channels.py    | 6 +++---
 .../aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py   | 4 ++--
 python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py   | 4 ++--
 .../aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py | 4 ++--
 python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py  | 4 ++--
 .../backend/cuda/conv2d/transposed_conv2d_bias.py           | 4 ++--
 python/aitemplate/backend/cuda/conv3d/common.py             | 4 ++--
 python/aitemplate/backend/cuda/conv3d/common_bias.py        | 4 ++--
 python/aitemplate/backend/cuda/conv3d/conv3d.py             | 4 ++--
 python/aitemplate/backend/cuda/conv3d/conv3d_bias.py        | 4 ++--
 .../backend/cuda/gemm_universal/group_gemm_rcr_bias.py      | 4 ++--
 .../backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py | 4 ++--
 .../cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py      | 4 ++--
 python/aitemplate/backend/cuda/pool2d/avg_pool2d.py         | 4 ++--
 python/aitemplate/backend/cuda/pool2d/max_pool2d.py         | 4 ++--
 python/aitemplate/backend/cuda/upsample/upsampling2d.py     | 4 ++--
 python/aitemplate/backend/cuda/upsample/upsampling2d_add.py | 4 ++--
 .../cuda/vision_ops/roi_ops/multi_level_roi_align.py        | 4 ++--
 .../aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py | 4 ++--
 python/aitemplate/backend/rocm/conv2d/common.py             | 6 +++---
 python/aitemplate/backend/rocm/conv2d/conv2d.py             | 6 +++---
 python/aitemplate/backend/rocm/conv2d/conv2d_bias.py        | 6 +++---
 .../aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py  | 6 +++---
 python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py   | 6 +++---
 .../aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py   | 6 +++---
 python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py  | 6 +++---
 .../backend/rocm/conv2d/transposed_conv2d_bias_relu.py      | 6 +++---
 python/aitemplate/backend/rocm/pool2d/avg_pool2d.py         | 4 ++--
 python/aitemplate/backend/rocm/pool2d/max_pool2d.py         | 4 ++--
 python/aitemplate/backend/rocm/pool2d/pool2d.py             | 4 ++--
 python/aitemplate/backend/rocm/upsample/upsampling2d.py     | 4 ++--
 python/aitemplate/backend/rocm/upsample/upsampling2d_add.py | 4 ++--
 .../rocm/vision_ops/roi_ops/multi_level_roi_align.py        | 4 ++--
 .../aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py | 4 ++--
 46 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 01e076d03..04a7e2c69 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -752,7 +752,7 @@ def extract_config_name(config):
 
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
     f_emit_instance=emit_instance,
@@ -822,7 +822,7 @@ def gen_function(
             instance_name=fname,
             dtype=dtype,
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
 
     function = FUNCTION_TEMPLATE.render(
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
index 7025a45c6..28b92127e 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py
@@ -44,13 +44,13 @@ def gen_profiler(
 
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         is_bias=True,
diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
index 75655dcaf..64641ba2f 100644
--- a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
+++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py
@@ -68,13 +68,13 @@ def gen_profiler(
 
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         is_bias_add=True,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d.py b/python/aitemplate/backend/cuda/conv2d/conv2d.py
index d4c54c2e0..01bb30105 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d.py
@@ -52,14 +52,14 @@ def conv2d_gen_profiler(
 @registry.reg("cuda.conv2d.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     """Codegen for conv2d function."""
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
index 3c010c4dc..adc7b0253 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py
@@ -52,14 +52,14 @@ def conv2d_bias_gen_profiler(
 @registry.reg("cuda.conv2d_bias.gen_function")
 def conv2d_bias_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     """Codegen for conv2d function."""
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
index c6db62b6e..c9762f6ec 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py
@@ -56,13 +56,13 @@ def conv2d_bias_add_identity_gen_profiler(
 @registry.reg("cuda.conv2d_bias_add_identity.gen_function")
 def conv2d_bias_add_identity_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cbaa.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
index 968b605e0..defcae4a4 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py
@@ -56,13 +56,13 @@ def conv2d_bias_add_hardswish_gen_profiler(
 @registry.reg("cuda.conv2d_bias_add_hardswish.gen_function")
 def conv2d_bias_add_hardswish_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cbaa.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
index 1a7fe093a..cbbe02038 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py
@@ -56,13 +56,13 @@ def conv2d_bias_add_relu_gen_profiler(
 @registry.reg("cuda.conv2d_bias_add_relu.gen_function")
 def conv2d_bias_add_relu_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cbaa.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
index 33fdbb989..5c618d87c 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py
@@ -69,7 +69,7 @@ def conv2d_bias_few_channels_gen_profiler(
 @registry.reg("cuda.conv2d_bias_few_channels.gen_function")
 def conv2d_bias_few_channels_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -79,7 +79,7 @@ def conv2d_bias_few_channels_gen_function(
     ----------
     func_attrs : Dict
         [description] attributes of conv2d op
-    exec_cond_remplate : [type]
+    exec_cond_template : [type]
         [description]
     shape_eval_template : [type]
         [description]
@@ -93,7 +93,7 @@ def conv2d_bias_few_channels_gen_function(
     """
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
index f883312c6..9ab085cfd 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py
@@ -50,13 +50,13 @@ def conv2d_bias_hardswish_gen_profiler(
 @registry.reg("cuda.conv2d_bias_hardswish.gen_function")
 def conv2d_bias_hardswish_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
index 36a2dc35f..6f139a3ae 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py
@@ -69,7 +69,7 @@ def conv2d_bias_hardswish_few_channels_gen_profiler(
 @registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_function")
 def conv2d_bias_hardswish_few_channels_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -79,7 +79,7 @@ def conv2d_bias_hardswish_few_channels_gen_function(
     ----------
     func_attrs : Dict
         [description] attributes of conv2d op
-    exec_cond_remplate : [type]
+    exec_cond_template : [type]
         [description]
     shape_eval_template : [type]
         [description]
@@ -93,7 +93,7 @@ def conv2d_bias_hardswish_few_channels_gen_function(
     """
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
index a55895970..1b3726f66 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py
@@ -50,13 +50,13 @@ def conv2d_bias_relu_gen_profiler(
 @registry.reg("cuda.conv2d_bias_relu.gen_function")
 def conv2d_bias_relu_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
index 5659c8d0a..d1663a7f5 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py
@@ -69,7 +69,7 @@ def conv2d_bias_relu_few_channels_gen_profiler(
 @registry.reg("cuda.conv2d_bias_relu_few_channels.gen_function")
 def conv2d_bias_relu_few_channels_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -79,7 +79,7 @@ def conv2d_bias_relu_few_channels_gen_function(
     ----------
     func_attrs : Dict
         [description] attributes of conv2d op
-    exec_cond_remplate : [type]
+    exec_cond_template : [type]
         [description]
     shape_eval_template : [type]
         [description]
@@ -93,7 +93,7 @@ def conv2d_bias_relu_few_channels_gen_function(
     """
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
index 3977c6355..0fef123cf 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py
@@ -51,13 +51,13 @@ def conv2d_bias_sigmoid_gen_profiler(
 @registry.reg("cuda.conv2d_bias_sigmoid.gen_function")
 def conv2d_bias_sigmoid_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return cba.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
index 1b5de0758..3546cb823 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise.py
@@ -144,14 +144,14 @@ def gen_profiler(
 @registry.reg("cuda.conv2d_depthwise.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     """Codegen for conv2d_depthwise function."""
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         is_depthwise=True,
diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
index 82f7ffbc7..dcfe362cb 100644
--- a/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/conv2d_depthwise_bias.py
@@ -46,14 +46,14 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.conv2d_depthwise_bias.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     """Codegen for conv2d_depthwise_bias function."""
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         f_emit_instance=cdw.emit_instance,
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
index f03edc5fe..e7186ebce 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py
@@ -53,13 +53,13 @@ def transposed_conv2d_gen_profiler(
 @registry.reg("cuda.transposed_conv2d.gen_function")
 def transposed_conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         f_emit_instance=ctc.emit_instance,
diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
index 30503992b..54f298cdc 100644
--- a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
+++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py
@@ -57,13 +57,13 @@ def transposed_conv2d_bias_gen_profiler(
 @registry.reg("cuda.transposed_conv2d_bias_relu.gen_function")
 def transposed_conv2d_bias_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return common.gen_function(
         func_attrs=func_attrs,
-        exec_cond_remplate=exec_cond_remplate,
+        exec_cond_template=exec_cond_template,
         shape_eval_template=shape_eval_template,
         shape_save_template=shape_save_template,
         f_emit_instance=ctc.emit_instance,
diff --git a/python/aitemplate/backend/cuda/conv3d/common.py b/python/aitemplate/backend/cuda/conv3d/common.py
index 7bdc77158..2dd9f9f66 100644
--- a/python/aitemplate/backend/cuda/conv3d/common.py
+++ b/python/aitemplate/backend/cuda/conv3d/common.py
@@ -186,7 +186,7 @@ def gen_function(
     instance_template,
     exec_template,
     src_template,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
     f_emit_instance=emit_instance,
@@ -251,7 +251,7 @@ def gen_function(
     for key in instances:
         fname = "f" + sha1(key.encode()).hexdigest()
         program = exec_template.render(indent="    ", instance=fname, dtype=dtype)
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return src_template.render(
         instances=instance_decl,
diff --git a/python/aitemplate/backend/cuda/conv3d/common_bias.py b/python/aitemplate/backend/cuda/conv3d/common_bias.py
index be2a6aab9..9ecda801b 100644
--- a/python/aitemplate/backend/cuda/conv3d/common_bias.py
+++ b/python/aitemplate/backend/cuda/conv3d/common_bias.py
@@ -191,7 +191,7 @@ def gen_function(
     instance_template,
     exec_template,
     src_template,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
     f_emit_instance=emit_instance,
@@ -256,7 +256,7 @@ def gen_function(
     for key in instances:
         fname = "f" + sha1(key.encode()).hexdigest()
         program = exec_template.render(indent="    ", instance=fname, dtype=dtype)
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return src_template.render(
         instances=instance_decl,
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d.py b/python/aitemplate/backend/cuda/conv3d/conv3d.py
index 1fd781957..602cc8ef0 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d.py
@@ -533,7 +533,7 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.conv3d.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -543,7 +543,7 @@ def gen_function(
         INSTANCE_TEMPLATE,
         EXEC_TEMPLATE,
         SRC_TEMPLATE,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
index fa2f248eb..a442b472b 100644
--- a/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
+++ b/python/aitemplate/backend/cuda/conv3d/conv3d_bias.py
@@ -538,7 +538,7 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.conv3d_bias.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -548,7 +548,7 @@ def gen_function(
         INSTANCE_TEMPLATE,
         EXEC_TEMPLATE,
         SRC_TEMPLATE,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
index 631306f4e..e2a9589bd 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py
@@ -40,12 +40,12 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.group_gemm_rcr_bias.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
 ):
     return group_common_bias.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
     )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
index eb5eaa8bb..df4fccb31 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py
@@ -40,12 +40,12 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.group_gemm_rcr_bias_relu.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
 ):
     return group_common_bias.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
     )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
index 29f0e76bc..4e6a6a15f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py
@@ -40,12 +40,12 @@ def gen_profiler(func_attrs, workdir, profiler_filename, shape_template):
 @registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
 ):
     return group_common_bias.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
     )
 
diff --git a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
index 745fc62cb..4b9b1c26f 100644
--- a/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/avg_pool2d.py
@@ -160,7 +160,7 @@
 @registry.reg("cuda.avg_pool2d.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -197,7 +197,7 @@ def gen_function(
             stride=func_attrs["stride"],
             dtype=dtype,
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return SRC_TEMPLATE.render(
         function_name=func_name,
diff --git a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
index ad6d8d761..5462d0134 100644
--- a/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/cuda/pool2d/max_pool2d.py
@@ -198,7 +198,7 @@
 @registry.reg("cuda.max_pool2d.gen_function")
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -235,7 +235,7 @@ def gen_function(
             stride=func_attrs["stride"],
             dtype=dtype,
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return SRC_TEMPLATE.render(
         function_name=func_name,
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d.py b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
index 31a7ec55e..ebb30dab9 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
@@ -34,7 +34,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -65,7 +65,7 @@ def gen_function(
     exec_paths = ""
     for key in exec_path:
         program = upsampling2d_common.EXEC_TEMPLATE.render(dtype=input_type)
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
         header_files=Header_Files,
diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
index 76fdcdc8a..5daa3b74c 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d_add.py
@@ -34,7 +34,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -67,7 +67,7 @@ def gen_function(
         program = upsampling2d_common.EXEC_TEMPLATE.render(
             bias_add=True, dtype=input_type
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
         header_files=Header_Files,
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
index 64c604f96..564e01086 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
@@ -36,7 +36,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -59,7 +59,7 @@ def gen_function(
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return multi_level_roi_align_common.SRC_TEMPLATE.render(
         function_name=func_name,
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
index 3726fdbf0..3754f660e 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py
@@ -37,7 +37,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -80,7 +80,7 @@ def gen_function(
             continuous_coordinate=func_attrs["continuous_coordinate"],
             dtype=dtype,
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return roi_align_common.SRC_TEMPLATE.render(
         function_name=func_name,
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 0dc1a98bb..894818ae9 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -678,7 +678,7 @@ def gen_profiler(
 
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
     conv2d_flag,
@@ -691,7 +691,7 @@ def gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -766,7 +766,7 @@ def gen_function(
             problem_args=problem_args,
             is_profiler=False,
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return src_template.render(
         instances=instance_decl,
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d.py b/python/aitemplate/backend/rocm/conv2d/conv2d.py
index e724d8e54..8c9df0f5f 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d.py
@@ -70,7 +70,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.conv2d.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -86,7 +86,7 @@ def conv2d_gen_function(
         Execution statements in main function.
     src_template : jinja2.Template
         Full main.cpp with headers, embedding all templates.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -103,7 +103,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "",
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
index b9956922b..91506f2f9 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias.py
@@ -70,7 +70,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.conv2d_bias.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -80,7 +80,7 @@ def conv2d_gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -96,7 +96,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "bias",
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
index fc424e43a..79d19bf1b 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
@@ -115,7 +115,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.conv2d_bias_add_relu.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -125,7 +125,7 @@ def conv2d_gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -141,7 +141,7 @@ def conv2d_gen_function(
     extra_code = EXTRA_CODE.render()
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "bias_add_relu",
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
index 0a48bf6e3..b33561394 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py
@@ -71,7 +71,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.conv2d_bias_relu.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -81,7 +81,7 @@ def conv2d_gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -97,7 +97,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "bias_relu",
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index 7458226e9..8449dc1de 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -119,7 +119,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.conv2d_bias_sigmoid.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -129,7 +129,7 @@ def conv2d_gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -145,7 +145,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "bias_sigmoid",
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
index b4dfc7c7e..f14a6f57e 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
@@ -98,7 +98,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.transposed_conv2d.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -114,7 +114,7 @@ def conv2d_gen_function(
         Execution statements in main function.
     src_template : jinja2.Template
         Full main.cpp with headers, embedding all templates.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -131,7 +131,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "",
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
index 5053e58aa..a6c5a3bd9 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py
@@ -80,7 +80,7 @@ def conv2d_gen_profiler(func_attrs, workdir, shape_template):
 @registry.reg("rocm.transposed_conv2d_bias_relu.gen_function")
 def conv2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -90,7 +90,7 @@ def conv2d_gen_function(
     ----------
     func_attrs : Dict
         Operation attributes.
-    exec_cond_remplate : jinja2.Template
+    exec_cond_template : jinja2.Template
         Generates if statement to execute kernel.
     shape_eval_template : jinja2.Template
         Generates shape calculation.
@@ -106,7 +106,7 @@ def conv2d_gen_function(
     """
     return common.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
         "bias_relu",
diff --git a/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py b/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
index 4196eb37f..cf1fffef8 100644
--- a/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/avg_pool2d.py
@@ -22,13 +22,13 @@
 @registry.reg("rocm.avg_pool2d.gen_function")
 def max_pool2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return pool2d.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
     )
diff --git a/python/aitemplate/backend/rocm/pool2d/max_pool2d.py b/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
index 25199c946..9f67236f7 100644
--- a/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/max_pool2d.py
@@ -22,13 +22,13 @@
 @registry.reg("rocm.max_pool2d.gen_function")
 def max_pool2d_gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
     return pool2d.gen_function(
         func_attrs,
-        exec_cond_remplate,
+        exec_cond_template,
         shape_eval_template,
         shape_save_template,
     )
diff --git a/python/aitemplate/backend/rocm/pool2d/pool2d.py b/python/aitemplate/backend/rocm/pool2d/pool2d.py
index 197621abb..ca09ce7c1 100644
--- a/python/aitemplate/backend/rocm/pool2d/pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/pool2d.py
@@ -157,7 +157,7 @@
 
 def gen_function(
     func_attrs,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -227,7 +227,7 @@ def gen_function(
     for key in instances:
         fname = "f" + sha1(key.encode()).hexdigest()
         program = EXEC_TEMPLATE.render(indent="    ", instance=fname)
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return SRC_TEMPLATE.render(
         instances=instance_decl,
diff --git a/python/aitemplate/backend/rocm/upsample/upsampling2d.py b/python/aitemplate/backend/rocm/upsample/upsampling2d.py
index e4c592ac0..94cfa007e 100644
--- a/python/aitemplate/backend/rocm/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/rocm/upsample/upsampling2d.py
@@ -34,7 +34,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -67,7 +67,7 @@ def gen_function(
     exec_paths = ""
     for key in exec_path:
         program = upsampling2d_common.EXEC_TEMPLATE.render()
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
         header_files=Header_Files,
diff --git a/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py b/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
index 798c2317d..2e10ae1a5 100644
--- a/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
+++ b/python/aitemplate/backend/rocm/upsample/upsampling2d_add.py
@@ -34,7 +34,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -67,7 +67,7 @@ def gen_function(
     exec_paths = ""
     for key in exec_path:
         program = upsampling2d_common.EXEC_TEMPLATE.render(bias_add=True)
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return upsampling2d_common.SRC_TEMPLATE.render(
         header_files=Header_Files,
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
index 94bb11d52..284fc2336 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
@@ -37,7 +37,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -60,7 +60,7 @@ def gen_function(
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return multi_level_roi_align_common.SRC_TEMPLATE.render(
         function_name=func_name,
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
index 6a48f6e15..76d3d7eae 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py
@@ -37,7 +37,7 @@
 def gen_function(
     func_attrs,
     template_path,
-    exec_cond_remplate,
+    exec_cond_template,
     shape_eval_template,
     shape_save_template,
 ):
@@ -81,7 +81,7 @@ def gen_function(
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
         )
-        exec_inst = exec_cond_remplate.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return roi_align_common.SRC_TEMPLATE.render(
         function_name=func_name,

From b603e18a60b320495d3eed794bab33e2f9418a4b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 22 Mar 2023 04:34:00 -0700
Subject: [PATCH 306/638] Add flag to make_jagged to skip seq len check (#462)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/462

Currently, `make_jagged` op's back-end always checks the offset differences to be within the corresponding `JaggedDim`'s bounds. Theoretically, there can be cases where a jagged Tensor can have sequences beyond the `JaggedDim.max_value()` (e.g., when the latter is defined by the corresponding dense Tensor's shape among the `elementwise` operands).

Here we add a flag to skip checking the sequence lengths in such cases (by default, the check is enabled).

Importantly, a guard is added to the `KERNEL_COMPUTE_JAGGED_IDX_THEN_DENSE_IDX_TEMPLATE` in `elementwise_common.py` to `return` prematurely from the CUDA thread handling the flat jagged index if the latter happens to be ouside the sequence bounds of the corresponding dimension of the dense volume. Previously, this was not needed, as the jagged sequences were always assumed to be covered by the dense volume.

Reviewed By: tissue3

Differential Revision: D44278023

fbshipit-source-id: 43bb607d63c92080186feceae33a8a5c1cf7fc5b
---
 .../backend/common/elementwise_common.py      |  6 ++++
 .../backend/cuda/view_ops/make_jagged.py      |  4 +++
 .../compiler/ops/common/view_ops.py           |  3 ++
 tests/unittest/ops/test_make_jagged.py        | 31 ++++++++++++++++---
 4 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 89ee5cdf5..eeee08634 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -150,6 +150,12 @@
             right = mid - 1;
         }
     }
+    if (running_idx - offset_value >= (({{strides[i]}}) / ({{strides[i+1]}}))) {
+        // this element of the jagged volume is
+        // out of bounds of the dense Tensor
+        // i.e., the sequence is longer than max_seq_len
+        return;
+    }
     dense_idx += (running_idx - offset_value) * ({{strides[i+1]}});
     running_idx = offset_idx;
 
diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index 40ce1d538..d2e248360 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -71,6 +71,7 @@
     return;
   }
 
+{% if check_sequence_lengths %}
   {{offsets_type}} group_size = data[offset_id + 1] - data[offset_id];
   if (group_size < bounds.min_values[dim_id] || group_size > bounds.max_values[dim_id]) {
     printf(
@@ -89,6 +90,7 @@
     );
     __trap();
   }
+{% endif %}
 
   if (offset_id == 0) {
     {{offsets_type}} first_offset = data[0];
@@ -284,6 +286,7 @@ def make_jagged_gen_function(func_attrs):
 
     batch_dim = jagged_int_var.batch_dim()
     isolated_batch_dim = batch_dim._attrs.get("isolated", False)
+    check_sequence_lengths = func_attrs["check_sequence_lengths"]
 
     return SRC_TEMPLATE.render(
         func_name=func_name,
@@ -295,6 +298,7 @@ def make_jagged_gen_function(func_attrs):
         isolated_batch_dim=isolated_batch_dim,
         jagged_dynamic_bound_names=jagged_dynamic_bound_names,
         index_type=backend_spec.index_type,
+        check_sequence_lengths=check_sequence_lengths,
     )
 
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 8c55ae30b..a2e76dfd2 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -685,6 +685,7 @@ def __init__(
         self,
         batch_dim: IntVar,
         jagged_dims: List[JaggedDim],
+        check_sequence_lengths: bool = True,
     ) -> None:
         if type(batch_dim) != IntVar:
             raise TypeError(
@@ -704,6 +705,7 @@ def __init__(
         self._attrs["op"] = "make_jagged"
         self._attrs["batch_dim"] = batch_dim
         self._attrs["jagged_dims"] = list(jagged_dims)
+        self._attrs["check_sequence_lengths"] = check_sequence_lengths
 
     def _set_jagged_dim_offsets(self, offsets_list: List[Tensor]):
         jagged_dims = self._attrs["jagged_dims"]
@@ -775,6 +777,7 @@ def _get_op_attributes(self):
         return {
             "batch_dim": self._attrs["batch_dim"],
             "jagged_dims": self._attrs["jagged_dims"],
+            "check_sequence_lengths": self._attrs["check_sequence_lengths"],
         }
 
     def gen_function(self) -> str:
diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
index f68d43357..ba3c3dacc 100644
--- a/tests/unittest/ops/test_make_jagged.py
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -30,8 +30,10 @@
 
 
 class MakeJaggedTestCase(unittest.TestCase):
-    def test_make_jagged(
+    def _test_make_jagged(
         self,
+        check_sequence_lengths=True,
+        test_name="make_jagged",
     ):
         offsets1 = Tensor(
             shape=[
@@ -70,11 +72,12 @@ def test_make_jagged(
         )
 
         batch_dim = IntVar(values=[1, 128])
-        jd0 = JaggedDim(min_value=0, max_value=10)
-        jd1 = JaggedDim(min_value=0, max_value=15)
+        jd0 = JaggedDim(min_value=0, max_value=2)
+        jd1 = JaggedDim(min_value=0, max_value=3)
         Y = ops.make_jagged(
             batch_dim=batch_dim,
             jagged_dims=[jd0, jd1],
+            check_sequence_lengths=check_sequence_lengths,
         )(X, [offsets1, offsets2])
         Z = ops.gemm_rrr()(Y, W)
 
@@ -95,10 +98,16 @@ def test_make_jagged(
         Z._attrs["name"] = "Z"
         Z._attrs["is_output"] = True
 
-        model = compile_model([Y, Z], detect_target(), "./tmp", "test_make_jagged")
+        model = compile_model([Y, Z], detect_target(), "./tmp", test_name)
 
         offsets1_pt = torch.tensor([0, 1, 3, 5], dtype=torch.int32).cuda()
-        offsets2_pt = torch.tensor([0, 2, 4, 4, 9, 10], dtype=torch.int32).cuda()
+        offsets2_pt = torch.tensor([0, 2, 4, 4, 7, 10], dtype=torch.int32).cuda()
+
+        if not check_sequence_lengths:
+            # extend seq lens beyond the JaggedDim bounds
+            offsets1_pt[2] = 4
+            offsets2_pt[4] = 9
+
         x_pt = get_random_torch_tensor([10, 128], "float16")
         w_pt = get_random_torch_tensor([128, 64], "float16")
         z_pt = torch.matmul(x_pt, w_pt)
@@ -112,6 +121,18 @@ def test_make_jagged(
         torch.testing.assert_close(y, x_pt)
         torch.testing.assert_close(z, z_pt)
 
+    def test_make_jagged(self):
+        self._test_make_jagged(
+            check_sequence_lengths=True,
+            test_name="make_jagged",
+        )
+
+    def test_make_jagged_no_seq_len_check(self):
+        self._test_make_jagged(
+            check_sequence_lengths=False,
+            test_name="make_jagged_no_seq_len_check",
+        )
+
     def test_make_jagged_with_dynamic_bounds(
         self,
         dtype="float16",

From 3c14119d750ac361d8ebcfd56fd7c4fad60d3b57 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 07:52:20 -0700
Subject: [PATCH 307/638] Remove inheritance from object (#465)

Summary:
Inheritance from `object` was needed in Python2.
However, in Python3 classes are implicitly inherit from `object`.

More [info](https://stackoverflow.com/questions/4015417/why-do-python-classes-inherit-object) on it

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/465

Reviewed By: alexanderguzhva

Differential Revision: D44293772

Pulled By: aakhundov

fbshipit-source-id: 581a8fa49405d71c4c29152a2f38c864cec384a7
---
 examples/01_resnet-50/weight_utils.py            | 2 +-
 python/aitemplate/backend/builder.py             | 2 +-
 python/aitemplate/backend/cuda/utils.py          | 2 +-
 python/aitemplate/backend/profiler_cache.py      | 2 +-
 python/aitemplate/backend/rocm/utils.py          | 2 +-
 python/aitemplate/backend/target.py              | 2 +-
 python/aitemplate/backend/task_runner.py         | 6 +++---
 python/aitemplate/compiler/model.py              | 2 +-
 python/aitemplate/compiler/tensor_accessor.py    | 2 +-
 python/aitemplate/compiler/transform/fuse_ops.py | 2 +-
 python/aitemplate/frontend/nn/parameter.py       | 2 +-
 python/aitemplate/frontend/parameter.py          | 2 +-
 python/aitemplate/utils/visualization/pydot.py   | 2 +-
 13 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/01_resnet-50/weight_utils.py b/examples/01_resnet-50/weight_utils.py
index 252583192..4dc455a3d 100644
--- a/examples/01_resnet-50/weight_utils.py
+++ b/examples/01_resnet-50/weight_utils.py
@@ -30,7 +30,7 @@
 CONV_WEIGHT_PATTERN = re.compile(r"conv\d+\.weight")
 
 
-class timm_export(object):
+class timm_export:
     def __init__(self, model_name, pretrained=True):
         self.model_name = model_name
         if model_name != "resnet50":
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index af91a2f25..32a3e092e 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -268,7 +268,7 @@ def pull(self) -> list[None]:
         return ret
 
 
-class Builder(object):
+class Builder:
     """Builder is a module to compile generated source code
     files into binary objects.
     """
diff --git a/python/aitemplate/backend/cuda/utils.py b/python/aitemplate/backend/cuda/utils.py
index 751a3b7cf..2178ea375 100644
--- a/python/aitemplate/backend/cuda/utils.py
+++ b/python/aitemplate/backend/cuda/utils.py
@@ -27,7 +27,7 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class Args(object):
+class Args:
     def __init__(self, arch):
         self.operations = "all"
         self.build_dir = ""
diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 63be95837..89da74a68 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -464,7 +464,7 @@ def ait_cache_version() -> int:
     return __AIT_CACHE_VERSION__
 
 
-class ProfileCacheDB(object):
+class ProfileCacheDB:
     r"""Local SQLite profile cache database."""
 
     def __init__(
diff --git a/python/aitemplate/backend/rocm/utils.py b/python/aitemplate/backend/rocm/utils.py
index e1d3107b3..39cc9e0b4 100644
--- a/python/aitemplate/backend/rocm/utils.py
+++ b/python/aitemplate/backend/rocm/utils.py
@@ -28,7 +28,7 @@
 # pylint: disable=C0103,C0415,W0707
 
 
-class Args(object):
+class Args:
     def __init__(self, arch):
         self.operations = "all"
         self.build_dir = ""
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index d94826ddc..29c834269 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -49,7 +49,7 @@ class TargetType(IntEnum):
     rocm = 2
 
 
-class Target(object):
+class Target:
     def __init__(self, static_files_path: str):
         """
         Parameters
diff --git a/python/aitemplate/backend/task_runner.py b/python/aitemplate/backend/task_runner.py
index a4714715b..7f5e1e144 100644
--- a/python/aitemplate/backend/task_runner.py
+++ b/python/aitemplate/backend/task_runner.py
@@ -25,7 +25,7 @@
 from collections import OrderedDict
 
 # pylint: disable=R1732,R1710,R1721
-class Task(object):
+class Task:
     """Task is an object containing a bash command,
     process for the command, and output of the process.
     """
@@ -187,7 +187,7 @@ def __del__(self) -> None:
                 self._proc.stderr.close()
 
 
-class DeviceFarm(object):
+class DeviceFarm:
     """Device Farm is a stateful object to
     schedule and assigns a task to the available devices.
     Devices are logical devices, can be CPUs or GPUs.
@@ -240,7 +240,7 @@ def reset_all(self) -> None:
             self._dev_stats[dev] = False
 
 
-class BaseRunner(object):
+class BaseRunner:
     """Genetic subprocess task runner for different purposes"""
 
     def __init__(self, devs: list[int], tag: str, timeout: int = 10) -> None:
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 420a567c8..413a64fea 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -154,7 +154,7 @@ def _reshape_tensor(tensor: TorchTensor, shape: List[int]) -> TorchTensor:
     return new_tensor.reshape(shape)
 
 
-class Model(object):
+class Model:
     class _DLLWrapper:
         def __init__(
             self,
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index d248d526b..c90e7bfd2 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -30,7 +30,7 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class TensorAccessor(object):
+class TensorAccessor:
     """
     A tensor accessor which manages how to access a Tensor.
     Must always be used together with a Tensor.
diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index 37b57fab6..40af68f35 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -36,7 +36,7 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class SimpleDisjointSet(object):
+class SimpleDisjointSet:
     def __init__(self):
         self.node_to_list_mapping: Dict[Any, List[Any]] = {}
 
diff --git a/python/aitemplate/frontend/nn/parameter.py b/python/aitemplate/frontend/nn/parameter.py
index 5c5e1af9d..660dd65b6 100644
--- a/python/aitemplate/frontend/nn/parameter.py
+++ b/python/aitemplate/frontend/nn/parameter.py
@@ -18,7 +18,7 @@
 from aitemplate.compiler.base import Tensor
 
 
-class Parameter(object):
+class Parameter:
     def __init__(self, shape, dtype, name=None, value=None):
         self._tensor = Tensor(shape=shape, dtype=dtype, name=name)
         self._value = value
diff --git a/python/aitemplate/frontend/parameter.py b/python/aitemplate/frontend/parameter.py
index 5c5e1af9d..660dd65b6 100644
--- a/python/aitemplate/frontend/parameter.py
+++ b/python/aitemplate/frontend/parameter.py
@@ -18,7 +18,7 @@
 from aitemplate.compiler.base import Tensor
 
 
-class Parameter(object):
+class Parameter:
     def __init__(self, shape, dtype, name=None, value=None):
         self._tensor = Tensor(shape=shape, dtype=dtype, name=name)
         self._value = value
diff --git a/python/aitemplate/utils/visualization/pydot.py b/python/aitemplate/utils/visualization/pydot.py
index 6e33aec91..e580fc611 100644
--- a/python/aitemplate/utils/visualization/pydot.py
+++ b/python/aitemplate/utils/visualization/pydot.py
@@ -601,7 +601,7 @@ def graph_from_incidence_matrix(matrix, node_prefix="", directed=False):
     return graph
 
 
-class Common(object):
+class Common:
     """Common information to several classes.
 
     Should not be directly used, several classes are derived from

From 5e48d5d458eed5c1ea200bb0ae551f07095dff05 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 08:11:57 -0700
Subject: [PATCH 308/638] Use List[T] instead of list[T] in type info (#466)

Summary:
Use `List[T]` instead of `list[T]` in type info

Until Python 3.9 [added support for type hinting using standard collections](https://docs.python.org/3/whatsnew/3.9.html#type-hinting-generics-in-standard-collections), you had to use `typing.Tuple` and `typing.List` if you wanted to document what type the contents of the containers needed to be:

Up until Python 3.8, `tuple` and `list` did not support being [used as generic types](https://docs.python.org/3/library/typing.html#generics).

More [info](https://stackoverflow.com/questions/39458193/using-list-tuple-etc-from-typing-vs-directly-referring-type-as-list-tuple-etc) on it

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/466

Reviewed By: alexanderguzhva

Differential Revision: D44294019

Pulled By: aakhundov

fbshipit-source-id: 062740486b17c6e783a22da0dc03324bdadfc9af
---
 python/aitemplate/backend/builder.py         | 19 ++++++++--------
 python/aitemplate/backend/codegen.py         | 24 ++++++++++----------
 python/aitemplate/backend/profiler_runner.py | 15 ++++++------
 python/aitemplate/backend/task_runner.py     | 16 ++++++-------
 python/aitemplate/compiler/base.py           |  2 +-
 tests/lint/check_meta_header.py              |  9 ++++----
 6 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 32a3e092e..ddc5da699 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -25,10 +25,9 @@
 import re
 import shlex
 import subprocess
-import typing
 from hashlib import sha1
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Tuple, Union
 
 import jinja2
 
@@ -224,12 +223,12 @@ class Runner(BaseRunner):
     Runner is inherited from BaseRunner.
     """
 
-    def __init__(self, devs: list[int], timeout: int = 10):
+    def __init__(self, devs: List[int], timeout: int = 10):
         """Initialize a parallel runner for building
 
         Parameters
         ----------
-        devs : list[int]
+        devs : List[int]
             CPU ids for compiling
         timeout : int, optional
             Compiling timeout, by default 10 (seconds)
@@ -241,7 +240,7 @@ def __init__(self, devs: list[int], timeout: int = 10):
         self._ftask_proc = process_task
         self._fret_proc = process_return
 
-    def push(self, idx: typing.Union[int, str], cmd: str, target: Target) -> None:
+    def push(self, idx: Union[int, str], cmd: str, target: Target) -> None:
         """Push a building task into runner
 
         Parameters
@@ -255,7 +254,7 @@ def push(self, idx: typing.Union[int, str], cmd: str, target: Target) -> None:
         """
         self._queue.append(Task(idx, cmd, target, shell=True))
 
-    def pull(self) -> list[None]:
+    def pull(self) -> List:
         """Pull building results.
         Check whether all building tasks are successful.
 
@@ -296,7 +295,7 @@ def __init__(self, n_jobs: int = -1, timeout: int = 180) -> None:
 
     def build_objs(
         self,
-        files: list[typing.Tuple[str, str]],
+        files: List[Tuple[str, str]],
         cc_cmd: str,
         binary_cc_cmd: Optional[str] = None,
     ):
@@ -304,7 +303,7 @@ def build_objs(
 
         Parameters
         ----------
-        files : list[Tuple[str, str]]
+        files : List[Tuple[str, str]]
             list of tuples of source code path and object file path
         cc_cmd : str
             command line template for building objects
@@ -345,14 +344,14 @@ def build_objs(
         self._runner.join()
         self._runner.pull()
 
-    def build_so(self, target: Target, objs: list[str]):
+    def build_so(self, target: Target, objs: List[str]):
         """Generate a task to build all objects into a dynamic library
 
         Parameters
         ----------
         target : Target
             Device target of dynamic library
-        objs : list[str]
+        objs : List[str]
             List of all object file paths for building the dynamic library.
         """
         _LOGGER.info("Building " + target)
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index c44435af9..faa616eaf 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -60,12 +60,12 @@
 MODEL_NAME = "Model"
 
 
-def gen_profiler(sorted_graph: list[Tensor], workdir: str, dynamic_profiling_strategy):
+def gen_profiler(sorted_graph: List[Tensor], workdir: str, dynamic_profiling_strategy):
     """Generate operator profiler source code files for the given graph
 
     Parameters
     ----------
-    sorted_graph : list[Tensor]
+    sorted_graph : List[Tensor]
         The network after running toposort transformation
     workdir : str
         Target directory for generated C++ source code files
@@ -83,13 +83,13 @@ def gen_profiler(sorted_graph: list[Tensor], workdir: str, dynamic_profiling_str
 
 
 def gen_function_src(
-    sorted_graph: list[Tensor], workdir: str, model_name: str = ""
-) -> list[Tuple[str, str]]:
+    sorted_graph: List[Tensor], workdir: str, model_name: str = ""
+) -> List[Tuple[str, str]]:
     """Generate functions source code files for the given graph
 
     Parameters
     ----------
-    sorted_graph : list[Tensor]
+    sorted_graph : List[Tensor]
         The network after running toposort transformation
     workdir : str
         Target directory for generated C++ source code files
@@ -98,7 +98,7 @@ def gen_function_src(
 
     Returns
     -------
-    list[Tuple[str, str]]
+    List[Tuple[str, str]]
         List of tuple (source file path, object file path)
     """
     target = Target.current()
@@ -321,7 +321,7 @@ def __init__(
         graph: List[Tensor],
         output_tensors: List[Tensor],
         model_name: str = MODEL_NAME,
-        additional_unbound_constants: Optional[list[Tensor]] = None,
+        additional_unbound_constants: Optional[List[Tensor]] = None,
         debug_settings: Optional[AITDebugSettings] = None,
     ):
         self.target = Target.current()
@@ -986,7 +986,7 @@ def append_all_tensors(self) -> None:
 
 
 def gen_library_src(  # noqa: C901
-    sorted_graph: list[Tensor],
+    sorted_graph: List[Tensor],
     max_blob_size: int,
     max_constant_blob_size: int,
     workspace: Workspace,
@@ -994,13 +994,13 @@ def gen_library_src(  # noqa: C901
     output_tensors: List[Tensor],
     model_name: str = "",
     debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
-    additional_unbound_constants: Optional[list[Tensor]] = None,
-) -> list[Tuple[str, str]]:
+    additional_unbound_constants: Optional[List[Tensor]] = None,
+) -> List[Tuple[str, str]]:
     """Generate model driver source code files for the given graph
 
     Parameters
     ----------
-    sorted_graph : list[Tensor]
+    sorted_graph : List[Tensor]
         The network after running toposort transformation
     max_blob_size : int
         Total memory for input/output tensor and intermediate results,
@@ -1016,7 +1016,7 @@ def gen_library_src(  # noqa: C901
 
     Returns
     -------
-    list[Tuple[str, str]]
+    List[Tuple[str, str]]
         List of tuple (source file path, object file path)
     """
 
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 204e09e1b..9b9d38095 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -23,10 +23,9 @@
 
 import re
 import subprocess
-import typing
 from collections import namedtuple
 from queue import Queue
-from typing import Callable, List, Tuple
+from typing import Callable, List, Tuple, Union
 
 from aitemplate.backend.target import Target
 from aitemplate.backend.task_runner import BaseRunner, Task
@@ -57,7 +56,7 @@ def optimization_key(result):
 def extract_profile_result(
     stdout,
     return_ops=None,
-) -> Tuple[ProfileResult | List[ProfileResult], bool]:
+) -> Tuple[Union[ProfileResult, List[ProfileResult]], bool]:
     failed = False
     try:
         runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
@@ -148,7 +147,7 @@ def process_task(task: Task) -> None:
             )
 
 
-def process_return(task: Task) -> typing.Tuple[typing.Union[int, str], ProfileResult]:
+def process_return(task: Task) -> Tuple[Union[int, str], ProfileResult]:
     """Generate profile result from a profiling task
 
     Parameters
@@ -169,14 +168,14 @@ class Runner(BaseRunner):
     Runner is inherited from BaseRunner.
     """
 
-    def __init__(self, devs: list[int], op_name: str, timeout: int = 30):
+    def __init__(self, devs: List[int], op_name: str, timeout: int = 30):
         _LOGGER.info("Using {n} GPU for profiling {op}".format(n=len(devs), op=op_name))
         super().__init__(devs, op_name, timeout)
         self._dev_flag = Target.current().dev_select_flag()
         self._ftask_proc = process_task
         self._fret_proc = process_return
 
-    def push(self, idx: typing.Union[int, str], cmd: str, return_ops: List[str] = None):
+    def push(self, idx: Union[int, str], cmd: str, return_ops: List[str] = None):
         """Push a new profiling task into runner's queue
 
         Parameters
@@ -185,7 +184,7 @@ def push(self, idx: typing.Union[int, str], cmd: str, return_ops: List[str] = No
             Profiling task id (usually is algorithm id or name)
         cmd : str
             Bash command to execute the profiling task
-        return_ops : list[str]
+        return_ops : List[str]
             Names of the ops to return the profiling results for. If specified,
             instead of a single (best) ProfileResult instance, a list with the
             ProfileResults for each op in the return_ops is returned from `pull`.
@@ -205,7 +204,7 @@ def pull(self):
 
         Returns
         -------
-        list[Tuple[Union[int, str], ProfileResult]]
+        List[Tuple[Union[int, str], ProfileResult]]
             Profiling results of all successful tasks.
         """
         ret = super().pull(self._ftask_proc, self._fret_proc)
diff --git a/python/aitemplate/backend/task_runner.py b/python/aitemplate/backend/task_runner.py
index 7f5e1e144..fd0c3d0dd 100644
--- a/python/aitemplate/backend/task_runner.py
+++ b/python/aitemplate/backend/task_runner.py
@@ -23,6 +23,8 @@
 import time
 import typing
 from collections import OrderedDict
+from typing import List
+
 
 # pylint: disable=R1732,R1710,R1721
 class Task:
@@ -193,12 +195,12 @@ class DeviceFarm:
     Devices are logical devices, can be CPUs or GPUs.
     """
 
-    def __init__(self, devs: list[int]) -> None:
+    def __init__(self, devs: List[int]) -> None:
         """Initialize a Device Farm given a list of device ids.
 
         Parameters
         ----------
-        devs : list[int]
+        devs : List[int]
             List of device ids in int
         """
         if isinstance(devs, int):
@@ -243,11 +245,11 @@ def reset_all(self) -> None:
 class BaseRunner:
     """Genetic subprocess task runner for different purposes"""
 
-    def __init__(self, devs: list[int], tag: str, timeout: int = 10) -> None:
+    def __init__(self, devs: List[int], tag: str, timeout: int = 10) -> None:
         """
         Parameters
         ----------
-        devs : list[int]
+        devs : List[int]
             List of device ids for tasks.
         tag : str
             Runner's name tag
@@ -287,9 +289,7 @@ def reset(self) -> None:
         self._finished_tasks = set()
         self._queue = []
 
-    def pull(
-        self, ftask_proc: typing.Callable, fret_proc: typing.Callable
-    ) -> list[object]:
+    def pull(self, ftask_proc: typing.Callable, fret_proc: typing.Callable) -> List:
         """Pull results from all tasks executed on the runner.
 
         Parameters
@@ -301,7 +301,7 @@ def pull(
 
         Returns
         -------
-        list
+        List
             Aggregated returns from all tasks
         """
         ret = []
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 5707c9446..3ab794588 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -94,7 +94,7 @@ def __init__(
 
         Parameters
         ----------
-        values : list[int]
+        values : List[int]
             A list of possible values of this dynamic dimension.
             len(values) must be >= 2.
 
diff --git a/tests/lint/check_meta_header.py b/tests/lint/check_meta_header.py
index dd69d74fb..da385fa58 100644
--- a/tests/lint/check_meta_header.py
+++ b/tests/lint/check_meta_header.py
@@ -19,6 +19,7 @@
 
 import os
 import sys
+from typing import List
 
 import click
 
@@ -36,7 +37,7 @@ def process_header(header, comment):
 CPP_HEADER = process_header(HEADER, "//")
 
 
-def dfs(root_path: str) -> list[str]:
+def dfs(root_path: str) -> List[str]:
     """DFS source code tree to find python files missing header
 
     Parameters
@@ -46,7 +47,7 @@ def dfs(root_path: str) -> list[str]:
 
     Returns
     -------
-    list[str]
+    List[str]
         file list missing header
     """
     ret = []
@@ -66,12 +67,12 @@ def dfs(root_path: str) -> list[str]:
     return ret
 
 
-def fix_header(file_list: list[str]) -> None:
+def fix_header(file_list: List[str]) -> None:
     """Adding Meta header to to source files
 
     Parameters
     ----------
-    file_list : list[str]
+    file_list : List[str]
         file list missing header
     """
     for path in file_list:

From f790c51bd945282274068b60139cbf0be73e942f Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 22 Mar 2023 08:43:39 -0700
Subject: [PATCH 309/638] Improve graph visualization (#457)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/457

Creates an additional graph visualization in case of profiling. Becomes available once `LOGLEVEL=DEBUG` is used, and it is recommended to enable  `AIT_PLOT_SHORTEN_TENSOR_NAMES=1` as well.

Highlights ops with the duration higher than 70% percentile.

Every op gets marked with an execution duration. Size is proportional to the duration for the above 70% percentile.

Every tensor gets two timestamps in the form of 'A ms / B ms'. A is the timestamp when the tensor becomes computed using a single stream execution. B is the timestamp when the tensor becomes computed using unlimited number of independent streams, thus serving as a theoretically minimal timestamp at which the tensor may be computed.

The ratio of `A/B - 1` for the output tensor is how much speed improvement one may gain in case of unlimited amount of GPUs available and zero copy cost.

Reviewed By: ipiszy, chenyang78

Differential Revision: D44183018

fbshipit-source-id: 8568d06dd090086d743fc6c0c4eb68ddb3d2d392
---
 python/aitemplate/utils/graph_utils.py        | 286 +++++++++++++++++-
 python/aitemplate/utils/visualization/plot.py |  84 ++++-
 2 files changed, 361 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index 549a6d821..4539c9f6b 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -12,9 +12,11 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import json
 import logging
 import os
-from typing import Any, List
+from collections import deque
+from typing import Any, Dict, List, Set
 
 from aitemplate.utils.misc import is_debug
 from aitemplate.utils.visualization import plot_graph
@@ -24,6 +26,11 @@
 
 
 def get_sorted_ops(tensors) -> List[Any]:
+    """
+    Produces the exact execution sequence of operators.
+    This matches backend/codegen.py, ModelContainerGenerator.append_all_tensors()
+    """
+
     from aitemplate.compiler.base import Tensor
 
     visited = set()
@@ -31,7 +38,7 @@ def get_sorted_ops(tensors) -> List[Any]:
     if isinstance(tensors, Tensor):
         tensors = [tensors]
     for tensor in tensors:
-        for src_op in tensor._attrs["src_ops"]:
+        for src_op in tensor.src_ops():
             if src_op in visited:
                 continue
             visited.add(src_op)
@@ -84,7 +91,7 @@ def sorted_op_pseudo_code(ops, with_shape=True) -> str:
     return op_str
 
 
-def dump_graph_debug_str_to_file(tensors, workdir, name):
+def dump_graph_debug_str_to_file(tensors, workdir, name, file_with_time_profiles=None):
     if is_debug():
         # Dump graph and pseudo code for debug only
         prefix = os.path.join(workdir, name)
@@ -101,5 +108,276 @@ def dump_graph_debug_str_to_file(tensors, workdir, name):
         with open(pseudo_code_path, "w") as f:
             f.write(sorted_graph_pseudo_code(tensors))
             _LOGGER.debug(f"Dumped {name} pseudo code to {pseudo_code_path}")
-        plot_graph(tensors, graph_visual_path)
+        plot_graph(tensors, graph_visual_path, file_with_time_profiles)
         _LOGGER.debug(f"Dumped {name} visualization to {graph_visual_path}")
+
+
+class TimestampTracking:
+    def __init__(
+        self, execution_start: float = 0, duration: float = 0, execution_order: int = 0
+    ):
+        self.execution_order = execution_order
+        self.execution_start = execution_start
+        self.duration = duration
+
+    @property
+    def execution_end(self):
+        return self.execution_start + self.duration
+
+
+class ProfiledTimeStatistics:
+    def __init__(self):
+        # Dict[Operator, float]
+        self.op_durations = {}
+
+        # Dict[Operator, TimestampTracking]
+        self.op_parallel_trackers = {}
+        # Dict[Operator, TimestampTracking]
+        self.op_sequential_trackers = {}
+
+        # Dict[Trnsor, TimestampTracking]
+        self.tensor_parallel_trackers = {}
+        # Dict[Trnsor, TimestampTracking]
+        self.tensor_sequential_trackers = {}
+
+        # 0.7 percentile of op times
+        self.duration_p70 = 0.0
+        # 0.9 percentile of op times
+        self.duration_p90 = 0.0
+        # 0.95 percentile of op times
+        self.duration_p95 = 0.0
+        # max time spent among operators
+        self.duration_max = 0.0
+        # total time spent by operators
+        self.total_duration = 0.0
+
+
+def track_graph_timings(
+    tensors, file_path_profiler_output: str
+) -> ProfiledTimeStatistics:
+    """
+    Traverses the graph of tensors and uses the statistics from the profiler
+    to evaluate execution times in case of sequential execution (1 stream)
+    and parallel execution (unlimited number of streams).
+    """
+
+    from aitemplate.compiler.base import Operator, Tensor
+
+    output = ProfiledTimeStatistics()
+
+    # the exact sequence of non-constant tensors that need to be evaluated
+    #   within a single execution stream.
+    unprocessed_tensors: List[Tensor] = []
+
+    # Sequence_of_ops contains an exact execution sequence of ops
+    #   within a single execution stream.
+    # Similar to graph_utils.py, get_sorted_ops() call.
+    sequence_of_ops: List[Operator] = []
+    visited_ops: Set[Operator] = set()
+
+    for tensor in tensors:
+        src_ops = tensor.src_ops()
+
+        if len(src_ops) == 0:
+            # This tensor depends on no operator.
+            # So, add the final statistics for it.
+            output.tensor_parallel_trackers[tensor] = TimestampTracking()
+            output.tensor_sequential_trackers[tensor] = TimestampTracking()
+        else:
+            for op in src_ops:
+                if op not in visited_ops:
+                    visited_ops.add(op)
+                    sequence_of_ops.append(op)
+
+            # this tensor needs to be evaluated
+            unprocessed_tensors.append(tensor)
+
+    # ok, we've got ops. Load the file with the profile.
+    with open(file_path_profiler_output, "r") as f:
+        perf_per_op_str = f.read()
+
+    # parse file
+    perf_per_op_str_dict = json.loads(perf_per_op_str)
+
+    op_durations: Dict[str, float] = {}
+    for op_name, op_data in perf_per_op_str_dict.items():
+        op_durations[op_name] = op_data["ms_per_iter"]
+
+    # map timings to ops
+    for op in visited_ops:
+        # profiler records the results under the original_name
+        op_name = op._attrs["original_name"]
+
+        # replace op_name with an unique name, if provided
+        if op_name is not None:
+            if op_name not in op_durations:
+                # op_name was not found in the profiler report
+                output.op_durations[op] = 0
+            else:
+                time_cost = op_durations[op_name]
+                output.op_durations[op] = time_cost
+        else:
+            # op_name is None, idk what to do
+            output.op_durations[op] = 0
+
+    # compute statistics
+    sorted_op_durations = sorted(op_durations.values())
+    if len(sorted_op_durations) > 0:
+        output.duration_p70 = sorted_op_durations[int(len(sorted_op_durations) * 0.7)]
+        output.duration_p90 = sorted_op_durations[int(len(sorted_op_durations) * 0.9)]
+        output.duration_p95 = sorted_op_durations[int(len(sorted_op_durations) * 0.95)]
+        output.duration_max = sorted_op_durations[-1]
+        output.total_duration = sum(sorted_op_durations)
+
+    # proceed with sequential execution:
+    unprocessed_seq_ops = deque(sequence_of_ops)
+    unprocessed_seq_tensors = deque(unprocessed_tensors)
+
+    global_timestamp = 0.0
+    execution_step = 0
+    while len(unprocessed_seq_ops) > 0 or len(unprocessed_seq_tensors) > 0:
+        # process operators
+        n_local_processed_ops = 0
+        for op in unprocessed_seq_ops:
+            depends_on = op._attrs["inputs"]
+
+            # are all prereqs complete?
+            can_proceed = all(
+                tensor in output.tensor_sequential_trackers for tensor in depends_on
+            )
+            if can_proceed:
+                # yes. This operator is ready to be executed.
+                execution_step += 1
+
+                op_duration = output.op_durations[op]
+
+                output.op_sequential_trackers[op] = TimestampTracking(
+                    execution_start=global_timestamp,
+                    duration=op_duration,
+                    execution_order=execution_step,
+                )
+
+                # modify global clock
+                global_timestamp += op_duration
+
+                n_local_processed_ops += 1
+            else:
+                # cannot go ahead, some tensors need to be marked as processed
+                break
+
+        for _ in range(0, n_local_processed_ops):
+            unprocessed_seq_ops.popleft()
+
+        # process tensors
+        n_local_processed_tensors = 0
+        for tensor in unprocessed_seq_tensors:
+            depends_on = tensor.src_ops()
+
+            # are all prereqs complete?
+            can_proceed = all(op in output.op_sequential_trackers for op in depends_on)
+            if can_proceed:
+                # yes. The tensor computation is finished.
+                max_execution_end = max(
+                    output.op_sequential_trackers[op].execution_end for op in depends_on
+                )
+                max_execution_order = max(
+                    output.op_sequential_trackers[op].execution_order
+                    for op in depends_on
+                )
+
+                output.tensor_sequential_trackers[tensor] = TimestampTracking(
+                    execution_start=max_execution_end,
+                    duration=0.0,
+                    execution_order=max_execution_order,
+                )
+
+                n_local_processed_tensors += 1
+            else:
+                # cannot proceed, some ops needs to be run first
+                break
+
+        for _ in range(0, n_local_processed_tensors):
+            unprocessed_seq_tensors.popleft()
+
+        # are we done?
+        if n_local_processed_ops == 0 and n_local_processed_tensors == 0:
+            # yes, no operators or tensors were processed on the current step.
+            # This does not imply that all operators and tensors were processed.
+            # Basically, this is a kinda early termination verification that
+            # indicates that there is some invalid profiler / graph data.
+            # So, we're trying to avoid infinite loops.
+            break
+
+    # process with parallel execution
+    unprocessed_par_ops = set(sequence_of_ops)
+    unprocessed_par_tensors = set(unprocessed_tensors)
+
+    execution_step = 0
+    while len(unprocessed_par_ops) > 0 or len(unprocessed_par_tensors) > 0:
+        # process operators
+        new_processed_ops: Set[Operator] = set()
+        for op in unprocessed_par_ops:
+            depends_on = op._attrs["inputs"]
+
+            # are all prereqs complete?
+            can_proceed = all(
+                tensor in output.tensor_parallel_trackers for tensor in depends_on
+            )
+            if can_proceed:
+                # yes. This operator is ready to be executed.
+                op_duration = output.op_durations[op]
+
+                max_execution_end = max(
+                    output.tensor_parallel_trackers[tensor].execution_end
+                    for tensor in depends_on
+                )
+
+                output.op_parallel_trackers[op] = TimestampTracking(
+                    execution_start=max_execution_end,
+                    duration=op_duration,
+                    execution_order=execution_step,
+                )
+
+                new_processed_ops.add(op)
+
+        # ok, there were some processed operators
+        if len(new_processed_ops) > 0:
+            for op in new_processed_ops:
+                unprocessed_par_ops.remove(op)
+
+            execution_step += 1
+
+        # process tensors
+        new_processed_tensors: Set[Tensor] = set()
+        for tensor in unprocessed_par_tensors:
+            depends_on = tensor.src_ops()
+
+            # are all prereqs complete?
+            can_proceed = all(op in output.op_parallel_trackers for op in depends_on)
+            if can_proceed:
+                # yes. The tensor computation is finished.
+                max_execution_end = max(
+                    output.op_parallel_trackers[op].execution_end for op in depends_on
+                )
+                max_execution_order = max(
+                    output.op_parallel_trackers[op].execution_order for op in depends_on
+                )
+
+                output.tensor_parallel_trackers[tensor] = TimestampTracking(
+                    execution_start=max_execution_end,
+                    duration=0.0,
+                    execution_order=max_execution_order,
+                )
+
+                new_processed_tensors.add(tensor)
+
+        for tensor in new_processed_tensors:
+            unprocessed_par_tensors.remove(tensor)
+
+        # are we done?
+        if len(new_processed_ops) == 0 and len(new_processed_tensors) == 0:
+            # Same story: we're trying to avoid infinite loops.
+            break
+
+    # done
+    return output
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 50076e44f..02e414261 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -18,6 +18,8 @@
 import json
 import os
 
+from typing import Optional
+
 from aitemplate import compiler
 from aitemplate.utils.environ import shorten_tensor_names_for_plots
 from aitemplate.utils.misc import short_str
@@ -92,7 +94,31 @@ def _gen_op_modal(op) -> str:
     return modal_src
 
 
-def plot_graph(tensors, file_path: str) -> None:
+def _highlight_op_node(op_node, op, time_stats):
+    if op in time_stats.op_durations:
+        perf_op = time_stats.op_durations[op]
+        scale_factor = float(perf_op) / float(time_stats.total_duration)
+
+        if perf_op > time_stats.duration_p95:
+            op_node.set("color", "maroon1")
+            op_node.set("penwidth", 9)
+            op_node.set("width", 1 + scale_factor * 100)
+            op_node.set("height", 1 + scale_factor * 50)
+        elif perf_op > time_stats.duration_p90:
+            op_node.set("color", "magenta1")
+            op_node.set("penwidth", 6)
+            op_node.set("width", 1 + scale_factor * 100)
+            op_node.set("height", 1 + scale_factor * 50)
+        elif perf_op > time_stats.duration_p70:
+            op_node.set("color", "mediumorchid1")
+            op_node.set("penwidth", 3)
+            op_node.set("width", 1 + scale_factor * 100)
+            op_node.set("height", 1 + scale_factor * 50)
+
+
+def plot_graph(
+    tensors, file_path: str, file_with_time_profiles: Optional[str] = None
+) -> None:
     """
     Plot AIT graph.
 
@@ -104,6 +130,9 @@ def plot_graph(tensors, file_path: str) -> None:
         Output file path, currently we support the following extension:
             - html
             - format supported by graphviz
+    file_with_time_profile : Optional[str]
+        Adds time for every node, if provided
+
     """
     dot_graph = pydot.Dot(graph_type="digraph")
     _, ext = os.path.splitext(file_path)
@@ -130,6 +159,12 @@ def plot_graph(tensors, file_path: str) -> None:
 
     op_names = gen_unique_op_names(sorted_graph)
 
+    from aitemplate.utils.graph_utils import ProfiledTimeStatistics, track_graph_timings
+
+    time_stats = ProfiledTimeStatistics()
+    if file_with_time_profiles is not None:
+        time_stats = track_graph_timings(sorted_graph, file_with_time_profiles)
+
     op_set = {}
     tensor_set = {}
     modal_set = []
@@ -138,9 +173,6 @@ def plot_graph(tensors, file_path: str) -> None:
     for tensor in sorted_graph:
         tensor_node = None
         tensor_name = tensor._attrs["name"]
-        if shorten_tensor_names_for_plots():
-            if tensor_name is not None and len(tensor_name) > 30:
-                tensor_name = short_str(tensor_name)
 
         if tensor in tensor_set:
             tensor_node = tensor_set[tensor]
@@ -152,17 +184,42 @@ def plot_graph(tensors, file_path: str) -> None:
                 color = COLOR_SCHEME["output"]
             if tensor._attrs["is_param"] is True:
                 color = COLOR_SCHEME["param"]
+
+            label = tensor_name
+
+            if shorten_tensor_names_for_plots():
+                if tensor_name is not None and len(tensor_name) > 30:
+                    label = short_str(tensor_name)
+
+            # add a label with time
+            label_with_time = ""
+            seq_tracker = time_stats.tensor_sequential_trackers.get(tensor, None)
+            if seq_tracker is not None and seq_tracker.execution_end != 0:
+                label_with_time += f"{seq_tracker.execution_end:.3f} ms"
+
+            par_tracker = time_stats.tensor_parallel_trackers.get(tensor, None)
+            if par_tracker is not None and par_tracker.execution_end != 0:
+                if label_with_time:
+                    label_with_time += " / "
+                label_with_time += f"{par_tracker.execution_end:.3f} ms"
+
+            if label_with_time:
+                label = f"{tensor_name}\\n{label_with_time}"
+
+            # add a node
             tensor_node = pydot.Node(
                 name=tensor_name,
                 shape="note",
                 id=tensor_name,
+                label=label,
                 color=color,
             )
             tensor_set[tensor] = tensor_node
             dot_graph.add_node(tensor_node)
             modal_set.append(_gen_tensor_modal(tensor))
             items.append(tensor_name)
-            popover_data[tensor_name] = "shape: " + _get_tensor_shape_str(tensor)
+
+            popover_data[tensor_name] = f"shape: {_get_tensor_shape_str(tensor)}"
 
         for src_op in tensor.src_ops():
             op_node = None
@@ -175,12 +232,20 @@ def plot_graph(tensors, file_path: str) -> None:
             if src_op in op_set:
                 op_node = op_set[src_op]
             else:
+                label = (
+                    f"{op_name}\\n{str(time_stats.op_durations[src_op])} ms"
+                    if src_op in time_stats.op_durations
+                    else op_name
+                )
                 op_node = pydot.Node(
                     name=op_name,
                     shape="folder",
                     id=op_name,
+                    label=label,
                     color="mediumpurple1",
                 )
+                _highlight_op_node(op_node, src_op, time_stats)
+
                 op_set[src_op] = op_node
                 dot_graph.add_node(op_node)
                 modal_set.append(_gen_op_modal(src_op))
@@ -201,15 +266,24 @@ def plot_graph(tensors, file_path: str) -> None:
             if dst_op in op_set:
                 op_node = op_set[dst_op]
             else:
+                label = (
+                    f"{op_name}\\n{str(time_stats.op_durations[dst_op])} ms"
+                    if dst_op in time_stats.op_durations
+                    else op_name
+                )
                 op_node = pydot.Node(
                     name=op_name,
                     shape="folder",
                     id=op_name,
+                    label=label,
                     color="mediumpurple1",
                 )
+                _highlight_op_node(op_node, dst_op, time_stats)
+
                 op_set[dst_op] = op_node
                 dot_graph.add_node(op_node)
                 items.append(op_name)
+
                 popover_data[op_name] = ", ".join(
                     [f"{x}: {y}" for x, y in op_to_content(dst_op).items()]
                 )

From b0ca796ad6deabdc297fb69adbe9a5bfec9712af Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 08:51:07 -0700
Subject: [PATCH 310/638] Example 05 - Add use_linear_projection param support
 (#451)

Summary:
### Description
stable-diffusion-2-1-base UNet model has [config](https://huggingface.co/stabilityai/stable-diffusion-2-1-base/blob/main/unet/config.json#L44) parameter `use_linear_projection`.

If it is True then internally `proj_in` and `proj_out` use Linear layer instead of 1x1 Convolution.
- Transformer2DModel [init](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py#L146)
- Transformer2DModel [forward](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py#L250)

We can do similar logic in corresponding AIT model - `SpatialTransformer` which is used by `UNet2DConditionModel`.

### Testing
Here is the [Test image](https://ibb.co/8zycmQB) generated with `use_linear_projection=True`

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/451

Reviewed By: tenpercent

Differential Revision: D44280295

Pulled By: aakhundov

fbshipit-source-id: 1bfffc842d23f743b0c85311fec67b60267cfeb1
---
 examples/05_stable_diffusion/.gitignore       |  1 +
 .../05_stable_diffusion/scripts/compile.py    |  1 +
 .../src/compile_lib/compile_unet.py           |  2 +
 .../05_stable_diffusion/src/modeling/clip.py  | 46 ++++++++++++++-----
 .../src/modeling/unet_2d_condition.py         |  9 +++-
 .../src/modeling/unet_blocks.py               | 10 ++++
 .../05_stable_diffusion/src/modeling/vae.py   |  2 +-
 7 files changed, 57 insertions(+), 14 deletions(-)
 create mode 100644 examples/05_stable_diffusion/.gitignore

diff --git a/examples/05_stable_diffusion/.gitignore b/examples/05_stable_diffusion/.gitignore
new file mode 100644
index 000000000..e33609d25
--- /dev/null
+++ b/examples/05_stable_diffusion/.gitignore
@@ -0,0 +1 @@
+*.png
diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 0018dafda..896b2432c 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -82,6 +82,7 @@ def compile_diffusers(
         convert_conv_to_gemm=convert_conv_to_gemm,
         hidden_dim=pipe.unet.config.cross_attention_dim,
         attention_head_dim=pipe.unet.config.attention_head_dim,
+        use_linear_projection=pipe.unet.config.get("use_linear_projection", False),
     )
     # VAE
     compile_vae(
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index 7cc2b41e4..3c2f59603 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -58,12 +58,14 @@ def compile_unet(
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
+    use_linear_projection=False,
 ):
 
     ait_mod = ait_UNet2DConditionModel(
         sample_size=64,
         cross_attention_dim=hidden_dim,
         attention_head_dim=attention_head_dim,
+        use_linear_projection=use_linear_projection,
     )
     ait_mod.name_parameter_tensor()
 
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 1a95314d4..30afcd051 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -195,16 +195,27 @@ class SpatialTransformer(nn.Module):
     """
 
     def __init__(
-        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        context_dim=None,
+        use_linear_projection=False,
     ):
         super().__init__()
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)  # Group Norm
+        self.use_linear_projection = use_linear_projection
 
-        self.proj_in = nn.Conv2dBias(
-            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
-        )
+        if use_linear_projection:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_in = nn.Conv2dBias(
+                in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+            )
 
         self.transformer_blocks = nn.ModuleList(
             [
@@ -215,21 +226,34 @@ def __init__(
             ]
         )
 
-        self.proj_out = nn.Conv2dBias(
-            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
-        )
+        if use_linear_projection:
+            self.proj_out = nn.Linear(inner_dim, in_channels)
+        else:
+            self.proj_out = nn.Conv2dBias(
+                inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+            )
 
     def forward(self, x, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
         b, h, w, c = get_shape(x)
         x_in = x
         x = self.norm(x)
-        x = self.proj_in(x)
-        x = ops.reshape()(x, [b, -1, c])
+        if self.use_linear_projection:
+            x = ops.reshape()(x, [b, -1, c])
+            x = self.proj_in(x)
+        else:
+            x = self.proj_in(x)
+            x = ops.reshape()(x, [b, -1, c])
+
         for block in self.transformer_blocks:
             x = block(x, context=context)
-        x = ops.reshape()(x, [b, h, w, c])
-        x = self.proj_out(x)
+
+        if self.use_linear_projection:
+            x = self.proj_out(x)
+            x = ops.reshape()(x, [b, h, w, c])
+        else:
+            x = ops.reshape()(x, [b, h, w, c])
+            x = self.proj_out(x)
         return x + x_in
 
 
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
index 770156ff9..eb28a076a 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -50,6 +50,7 @@ class UNet2DConditionModel(nn.Module):
         norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
         cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
         attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        use_linear_projection (`bool`, *optional*, defaults to False): Use linear projection instead of 1x1 convolution.
     """
 
     def __init__(
@@ -81,6 +82,7 @@ def __init__(
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1280,
         attention_head_dim: Union[int, Tuple[int]] = 8,
+        use_linear_projection: bool = False,
     ):
         super().__init__()
         self.center_input_sample = center_input_sample
@@ -117,9 +119,10 @@ def __init__(
                 add_downsample=not is_final_block,
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
                 attn_num_head_channels=attention_head_dim[i],
+                cross_attention_dim=cross_attention_dim,
                 downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
             )
             self.down_blocks.append(down_block)
 
@@ -134,6 +137,7 @@ def __init__(
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attention_head_dim[-1],
             resnet_groups=norm_num_groups,
+            use_linear_projection=use_linear_projection,
         )
 
         # up
@@ -159,8 +163,9 @@ def __init__(
                 add_upsample=not is_final_block,
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
                 attn_num_head_channels=reversed_attention_head_dim[i],
+                cross_attention_dim=cross_attention_dim,
+                use_linear_projection=use_linear_projection,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
index 7b6e3e6e6..9eaa6e0b1 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_blocks.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -51,6 +51,7 @@ def get_down_block(
     attn_num_head_channels,
     cross_attention_dim=None,
     downsample_padding=None,
+    use_linear_projection=False,
 ):
     down_block_type = (
         down_block_type[7:]
@@ -96,6 +97,7 @@ def get_down_block(
             downsample_padding=downsample_padding,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            use_linear_projection=use_linear_projection,
         )
     elif down_block_type == "SkipDownBlock2D":
         return SkipDownBlock2D(
@@ -144,6 +146,7 @@ def get_up_block(
     resnet_act_fn,
     attn_num_head_channels,
     cross_attention_dim=None,
+    use_linear_projection=False,
 ):
     up_block_type = (
         up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
@@ -175,6 +178,7 @@ def get_up_block(
             resnet_act_fn=resnet_act_fn,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            use_linear_projection=use_linear_projection,
         )
     elif up_block_type == "AttnUpBlock2D":
         return AttnUpBlock2D(
@@ -239,6 +243,7 @@ def __init__(
         attention_type="default",
         output_scale_factor=1.0,
         cross_attention_dim=1280,
+        use_linear_projection=False,
         **kwargs,
     ):
         super().__init__()
@@ -274,6 +279,7 @@ def __init__(
                     in_channels // attn_num_head_channels,
                     depth=1,
                     context_dim=cross_attention_dim,
+                    use_linear_projection=use_linear_projection,
                 )
             )
             resnets.append(
@@ -322,6 +328,7 @@ def __init__(
         output_scale_factor=1.0,
         downsample_padding=1,
         add_downsample=True,
+        use_linear_projection=False,
     ):
         super().__init__()
 
@@ -354,6 +361,7 @@ def __init__(
                     out_channels // attn_num_head_channels,
                     depth=1,
                     context_dim=cross_attention_dim,
+                    use_linear_projection=use_linear_projection,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
@@ -481,6 +489,7 @@ def __init__(
         output_scale_factor=1.0,
         downsample_padding=1,
         add_upsample=True,
+        use_linear_projection=False,
     ):
         super().__init__()
 
@@ -515,6 +524,7 @@ def __init__(
                     out_channels // attn_num_head_channels,
                     depth=1,
                     context_dim=cross_attention_dim,
+                    use_linear_projection=use_linear_projection,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
index 1cd25aa19..924c7257a 100644
--- a/examples/05_stable_diffusion/src/modeling/vae.py
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -73,11 +73,11 @@ def __init__(
                 in_channels=prev_output_channel,
                 out_channels=output_channel,
                 prev_output_channel=None,
+                temb_channels=None,
                 add_upsample=not is_final_block,
                 resnet_eps=1e-6,
                 resnet_act_fn=act_fn,
                 attn_num_head_channels=None,
-                temb_channels=None,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel

From ad80787df8a02e6c306bfa658b17803c175cdc1e Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 09:07:14 -0700
Subject: [PATCH 311/638] Replace OrderedDict with dict in Examples (#463)

Summary:
### Description
From Python 3.6 onwards, the standard `dict` type maintains insertion order by default. [link](https://stackoverflow.com/questions/1867861/how-to-keep-keys-values-in-same-order-as-declared)

We can replace `collections.OrderedDict` with built-in python `dict` in AIT Examples and docs

### More info about OrderedDict vs dict
Changed in version python 3.7: Dictionary order is guaranteed to be insertion order. This behavior was an implementation detail of CPython from 3.6. - [link](https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/)

Regular python `dict` preserves insertion order - That's a new feature in 3.6 (3.7 is when it's first guaranteed by the language standard) - [link](https://stackoverflow.com/questions/60446154/python-dictionary-insertion-and-deletion)

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/463

Reviewed By: tenpercent

Differential Revision: D44293712

Pulled By: aakhundov

fbshipit-source-id: 273c519672300588d8935c2ff1142c5519a09a46
---
 docs/source/tutorial/how_to_infer_pt.rst                    | 4 +---
 examples/03_bert/benchmark_ait.py                           | 3 +--
 examples/05_stable_diffusion/src/compile_lib/compile_vae.py | 4 +---
 examples/07_how_to_run_pt_model/how_to_run_pt_model.py      | 4 +---
 4 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/docs/source/tutorial/how_to_infer_pt.rst b/docs/source/tutorial/how_to_infer_pt.rst
index 8aa68c9c7..8b0535ce0 100644
--- a/docs/source/tutorial/how_to_infer_pt.rst
+++ b/docs/source/tutorial/how_to_infer_pt.rst
@@ -11,8 +11,6 @@ We need to import necessary Python modules:
 
 .. code-block:: python
 
-  from collections import OrderedDict
-
   import torch
 
   from aitemplate.compiler import compile_model
@@ -85,7 +83,7 @@ In AIT, all names must follow the C variable naming standard, because the names
   def map_pt_params(ait_model, pt_model):
     ait_model.name_parameter_tensor()
     pt_params = dict(pt_model.named_parameters())
-    mapped_pt_params = OrderedDict()
+    mapped_pt_params = {}
     for name, _ in ait_model.named_parameters():
       ait_name = name.replace(".", "_")
       assert name in pt_params
diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 624588d18..a16244a9a 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 #
 import os
-from collections import OrderedDict
 
 from typing import Dict, List
 
@@ -100,7 +99,7 @@ def map_pt_params(
     ait_bert, pt_bert, batch_size: int, seq_length: int
 ) -> Dict[str, torch.Tensor]:
     pt_params = dict(pt_bert.named_parameters())
-    mapped_pt_params = OrderedDict()
+    mapped_pt_params = {}
     for name, _ in ait_bert.named_parameters():
         ait_name = name.replace(".", "_")
         if name in pt_params:
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index d01f320dc..7352740d0 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -12,8 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from collections import OrderedDict
-
 import numpy as np
 
 import torch
@@ -27,7 +25,7 @@
 
 def map_vae_params(ait_module, pt_module, batch_size, seq_len):
     pt_params = dict(pt_module.named_parameters())
-    mapped_pt_params = OrderedDict()
+    mapped_pt_params = {}
     for name, _ in ait_module.named_parameters():
         ait_name = name.replace(".", "_")
         if name in pt_params:
diff --git a/examples/07_how_to_run_pt_model/how_to_run_pt_model.py b/examples/07_how_to_run_pt_model/how_to_run_pt_model.py
index 993b7c69f..f860f2c1b 100644
--- a/examples/07_how_to_run_pt_model/how_to_run_pt_model.py
+++ b/examples/07_how_to_run_pt_model/how_to_run_pt_model.py
@@ -12,8 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from collections import OrderedDict
-
 import torch
 
 from aitemplate.compiler import compile_model
@@ -58,7 +56,7 @@ def forward(self, input):
 def map_pt_params(ait_model, pt_model):
     ait_model.name_parameter_tensor()
     pt_params = dict(pt_model.named_parameters())
-    mapped_pt_params = OrderedDict()
+    mapped_pt_params = {}
     for name, _ in ait_model.named_parameters():
         ait_name = name.replace(".", "_")
         assert name in pt_params

From 196cc6d331f7389429720d7d0c305712b15fdb83 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Wed, 22 Mar 2023 09:08:40 -0700
Subject: [PATCH 312/638] Add ait full op converters (#461)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/461

ATT
* ops like torch.ones/torch.zeros will be const_fold by lowering pass, thus not adding converter for them

Reviewed By: frank-wei, qxy11

Differential Revision: D44274432

fbshipit-source-id: ba33951ae2d2ebc99794aff8026a01a31f9ad8da
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py           | 84 +++++++++++++++----
 fx2ait/fx2ait/converters/ait_converters.py    | 57 +++++++++++++
 .../fx2ait/test/converters/test_ait_full.py   | 79 +++++++++++++++++
 python/aitemplate/compiler/public/__init__.py |  1 +
 4 files changed, 205 insertions(+), 16 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_full.py

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index b653e64cc..ac55f8c65 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -2966,22 +2966,6 @@ def tensor_split(*, input, indices_or_sections, dim=0):
         )
 
 
-@register_acc_op_mapping(
-    op_and_target=("call_method", "new_ones"),
-    arg_replacement_tuples=[
-        ("input", "input"),
-        ("size", "size"),
-        ("dtype", "dtype", this_arg_is_optional),
-        ("device", "device", this_arg_is_optional),
-        ("requires_grad", "requires_grad", this_arg_is_optional),
-    ],
-)
-@register_acc_op
-def new_ones(*, input, size, dtype=None, device=None, requires_grad=False):
-    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
-    return input.new_ones(size, dtype=dtype, device=device)
-
-
 @register_acc_op_mapping(
     op_and_target=("call_method", "new_empty"),
     arg_replacement_tuples=[
@@ -3280,6 +3264,74 @@ def long(*, input):
     return input.long()
 
 
+@register_acc_op_mapping(
+    op_and_target=("call_method", "new_full"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("fill_value", "fill_value"),
+        ("dtype", "dtype", this_arg_is_optional),
+        ("device", "device", this_arg_is_optional),
+        ("requires_grad", "requires_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def new_full(*, input, size, fill_value, dtype=None, device=None, requires_grad=False):
+    return input.new_full(size, fill_value=fill_value, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.full_like))
+@register_acc_op
+def full_like(*, input, fill_value, dtype=None, device=None):
+    return torch.full_like(
+        input=input, fill_value=fill_value, dtype=dtype, device=device
+    )
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_method", "new_ones"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("dtype", "dtype", this_arg_is_optional),
+        ("device", "device", this_arg_is_optional),
+        ("requires_grad", "requires_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def new_ones(*, input, size, dtype=None, device=None, requires_grad=False):
+    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
+    return input.new_ones(size, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.ones_like))
+@register_acc_op
+def ones_like(*, input, dtype=None, device=None):
+    return torch.ones_like(input=input, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(
+    op_and_target=("call_method", "new_zeros"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("size", "size"),
+        ("dtype", "dtype", this_arg_is_optional),
+        ("device", "device", this_arg_is_optional),
+        ("requires_grad", "requires_grad", this_arg_is_optional),
+    ],
+)
+@register_acc_op
+def new_zeros(*, input, size, dtype=None, device=None, requires_grad=False):
+    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
+    return input.new_zeros(size, dtype=dtype, device=device)
+
+
+@register_acc_op_mapping(op_and_target=("call_function", torch.zeros_like))
+@register_acc_op
+def zeros_like(*, input, dtype=None, device=None):
+    return torch.zeros_like(input=input, dtype=dtype, device=device)
+
+
 ###############################################################################
 
 # Set ops as side-effectul, this prevents them from being optimized away or
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index e5b291603..f068c247a 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -34,6 +34,7 @@
     elementwise,
     expand,
     flatten,
+    full,
     FuncEnum,
     gemm_rcr,
     gemm_rrr,
@@ -1663,3 +1664,59 @@ def acc_ops_neg(
         raise ValueError(f"Unexpected input dtype {dt}")
 
     return create_binary_op(FuncEnum.MUL, args, new_kwargs, name)
+
+
+@ait_converter(acc_ops.new_full)
+def acc_ops_new_full(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    size = kwargs["size"]
+    fill_value = kwargs["fill_value"]
+    return full()(size, fill_value=fill_value, dtype="float16")
+
+
+@ait_converter(acc_ops.full_like)
+def acc_ops_full_like(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    fill_value = kwargs["fill_value"]
+    return full()(input_val.shape(), fill_value=fill_value, dtype="float16")
+
+
+@ait_converter(acc_ops.new_ones)
+def acc_ops_new_ones(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    size = kwargs["size"]
+    return full()(size, 1, dtype="float16")
+
+
+@ait_converter(acc_ops.ones_like)
+def acc_ops_ones_like(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    return full()(input_val.shape(), 1, dtype="float16")
+
+
+@ait_converter(acc_ops.new_zeros)
+def acc_ops_new_zeros(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    size = kwargs["size"]
+    return full()(size, 0, dtype="float16")
+
+
+@ait_converter(acc_ops.zeros_like)
+def acc_ops_zeros_like(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    return full()(input_val.shape(), 0, dtype="float16")
diff --git a/fx2ait/fx2ait/test/converters/test_ait_full.py b/fx2ait/fx2ait/test/converters/test_ait_full.py
new file mode 100644
index 000000000..438fc4d1f
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_full.py
@@ -0,0 +1,79 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+
+
+class TestFullConverter(AITTestCase):
+    def test_new_full(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                full = x.new_full((2, 6), 2.2)
+                return torch.cat([full, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.new_full})
+
+    def test_full_like(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                full = torch.full_like(x, 2.2)
+                return torch.cat([full, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.full_like})
+
+    def test_new_ones(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                full = x.new_ones((2, 6))
+                return torch.cat([full, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.cat, acc_ops.new_ones})
+
+    def test_ones_like(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                ones = torch.ones_like(x)
+                return torch.cat([ones, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.cat, acc_ops.ones_like})
+
+    def test_new_zeros(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                zeros = x.new_zeros((2, 6))
+                return torch.cat([zeros, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.cat, acc_ops.new_zeros})
+
+    def test_zeros_like(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                zeros = torch.zeros_like(x)
+                return torch.cat([zeros, x], dim=1)
+
+        model = TestModule().cuda().half()
+        input = [torch.randn([2, 3]).cuda().half()]
+        self.run_test(model, input, expected_ops={acc_ops.zeros_like})
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index d8a6a5bf3..9b8be9e6c 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -79,6 +79,7 @@
 from aitemplate.compiler.ops.tensor.concatenate import concatenate
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.ops.tensor.expand import expand
+from aitemplate.compiler.ops.tensor.full import full
 from aitemplate.compiler.ops.tensor.permute import permute
 from aitemplate.compiler.ops.tensor.split import split
 

From 93153294354b456715426c7f0eb948efb04ad34a Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 09:40:58 -0700
Subject: [PATCH 313/638] Fix type info in constant_folding.py (#467)

Summary:
Fix output type info in `constant_folding.py`

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/467

Reviewed By: alexanderguzhva

Differential Revision: D44294103

Pulled By: aakhundov

fbshipit-source-id: 2471075d82ea5998b26220b28ba64264099b3a9a
---
 python/aitemplate/compiler/transform/constant_folding.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index 20266ad5e..d86406961 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -44,7 +44,7 @@ def _make_op_names_unique(graph: List[Tensor]) -> Dict[str, str]:
     """
     To avoid ODR issues, we rename all ops in the constant folding subgraph.
     ODR issues can arise if two ops end up sharing the same name & implementation (which
-    can actualy happen, e.g. in the proposal op).
+    can actually happen, e.g. in the proposal op).
     """
     new_name_to_old = {}
     for tensor in graph:
@@ -123,7 +123,7 @@ def _fix_op_inputs_outputs(
 
 def _extract_foldable_subgraph(
     sorted_graph: List[Tensor],
-) -> List[Tensor]:
+) -> Tuple[List[Tensor], Dict[str, Tensor], List[Tensor]]:
     """
     Extract a list of foldable nodes. A node is foldable if:
     * It has bound data, or
@@ -281,7 +281,7 @@ def constant_folding(
     sorted_graph: List[Tensor],
     workdir: str,
     model_name: str,
-) -> Tuple[List[Tensor], Tuple[str, str]]:
+) -> Tuple[List[Tensor], List[Tuple[str, str]], List[Tensor]]:
     """
     Fold and propagate constants.
 

From 48f953fe0a704f407d62ba02a896290d1c5f8db9 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 10:15:55 -0700
Subject: [PATCH 314/638] Fix type info in compiler/base.py (#431)

Summary:
`Tensor __init__` takes Sequence of src_ops and stores them as StableSet internally.
- Update `src_ops` and `dst_ops` type in `__init__` to `Sequence[Node]`
- `StableSet __init__` can accept `None`. Remove unnecessary IFs
- Fix typo
- If the list of function parameters is empty we can remove "Parameters" section from the description

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/431

Reviewed By: aakhundov

Differential Revision: D44299738

Pulled By: wushirong

fbshipit-source-id: a966d5e1fca38497c80fcce43feb8358c8ed198e
---
 python/aitemplate/compiler/base.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 3ab794588..79e404632 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -22,7 +22,7 @@
 from enum import Enum
 from functools import reduce
 from pprint import pformat
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Union
 
 import numpy as np
 
@@ -584,8 +584,8 @@ def __init__(
         self,
         shape: List[IntVar],
         name: str = None,
-        src_ops: StableSet[Node] = None,
-        dst_ops: StableSet[Node] = None,
+        src_ops: Sequence[Node] = None,
+        dst_ops: Sequence[Node] = None,
         dtype: str = "float16",
         is_input: bool = False,
         is_output: bool = False,
@@ -632,12 +632,8 @@ def __init__(
         super().__init__()
         self._attrs["shape"] = self._convert_shape(shape)
         self._attrs["name"] = name
-        self._attrs["src_ops"] = (
-            StableSet(src_ops) if src_ops is not None else StableSet()
-        )
-        self._attrs["dst_ops"] = (
-            StableSet(dst_ops) if dst_ops is not None else StableSet()
-        )
+        self._attrs["src_ops"] = StableSet(src_ops)
+        self._attrs["dst_ops"] = StableSet(dst_ops)
         self._attrs["dtype"] = dtype
         self._attrs["is_output"] = is_output
         self._attrs["is_input"] = is_input
@@ -743,7 +739,7 @@ def is_jagged(self) -> bool:
         )
 
     def size_bytes(self, alignment: int = 1) -> int:
-        """Returns acutal size (in bytes) of this Tensor."""
+        """Returns actual size (in bytes) of this Tensor."""
         return get_aligned_size(self._attrs["shape"], self.dtype(), alignment)
 
     def pseudo_code(self, with_shape=True) -> str:
@@ -1063,10 +1059,6 @@ def _get_op_attributes(self) -> Dict[str, Any]:
 
         This is used when we need to copy the op with identical behaviour.
 
-        Parameters
-        ----------
-        None
-
         Returns
         -------
         Dict of attributes

From 3cfae1b24a3054ffe274d02d09400fac850e4505 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 10:34:08 -0700
Subject: [PATCH 315/638] =?UTF-8?q?Add=20COMBINE=5FPROFILER=5FMULTI=5FSOUR?=
 =?UTF-8?q?CES=20and=20FORCE=5FONE=5FPROFILER=5FSOURCE=5FPER=5F=E2=80=A6?=
 =?UTF-8?q?=20(#428)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Add `COMBINE_PROFILER_MULTI_SOURCES` and `FORCE_ONE_PROFILER_SOURCE_PER_TARGET` envs description to docs.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/428

Reviewed By: aakhundov

Differential Revision: D44299786

Pulled By: wushirong

fbshipit-source-id: 46059bf3cfc542a52ffa0e340b8b8a1b6b49c464
---
 docs/source/reference/env.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index e5616dbec..6420fb5e1 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -31,6 +31,10 @@ Profiling
 
 **FORCE_PROFILE**: If set to "1", it will do profiling regarless in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile.
 
+**COMBINE_PROFILER_MULTI_SOURCES**: Whether to combine multiple profiler sources per target. "0" - Disabled, "1" - Enabled (default).
+
+**FORCE_ONE_PROFILER_SOURCE_PER_TARGET**: Whether to combine multiple profiler sources per target into one. "0" - Disabled (default), "1" - Enabled.
+
 OSS CI
 ------
 

From 2c822c081399834371c2fe22b428b134fca2db9b Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Wed, 22 Mar 2023 10:46:38 -0700
Subject: [PATCH 316/638] stable diffusion unet ait converter (#460)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/460

Adding stable diffusion modeling files to AIT frontend.

Reviewed By: yinghai

Differential Revision: D44106679

fbshipit-source-id: 17fd22c012bb46f0b1b689440f28a89b0fe6fccf
---
 fx2ait/fx2ait/tools/common_fx2ait.py          |   4 +
 python/aitemplate/frontend/nn/ldm/__init__.py |  17 +
 .../aitemplate/frontend/nn/ldm/attention.py   | 105 +++
 python/aitemplate/frontend/nn/ldm/clip.py     | 628 +++++++++++++++
 .../aitemplate/frontend/nn/ldm/embeddings.py  | 101 +++
 python/aitemplate/frontend/nn/ldm/resnet.py   | 238 ++++++
 .../frontend/nn/ldm/unet_2d_condition.py      | 255 ++++++
 .../aitemplate/frontend/nn/ldm/unet_blocks.py | 762 ++++++++++++++++++
 python/aitemplate/frontend/nn/ldm/vae.py      | 153 ++++
 9 files changed, 2263 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/ldm/__init__.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/attention.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/clip.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/embeddings.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/resnet.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/unet_blocks.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/vae.py

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 90d18fcd3..b9aeb8009 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -97,6 +97,7 @@ def run_test(
             leaf_module_list.append(leaf_module)
 
         orig_mod = copy.deepcopy(mod)
+        orig_mod.eval()
         mod = acc_tracer.trace(
             mod,
             inputs,
@@ -110,6 +111,9 @@ def run_test(
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
             inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+
+        mod.half()
+        inputs = [inp.half().contiguous() for inp in inputs]
         interp = AITInterpreter(
             mod,
             inputs,
diff --git a/python/aitemplate/frontend/nn/ldm/__init__.py b/python/aitemplate/frontend/nn/ldm/__init__.py
new file mode 100644
index 000000000..b14195e81
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/__init__.py
@@ -0,0 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# flake8: noqa
+
+from aitemplate.frontend.nn.ldm.unet_2d_condition import UNet2DConditionModel
diff --git a/python/aitemplate/frontend/nn/ldm/attention.py b/python/aitemplate/frontend/nn/ldm/attention.py
new file mode 100644
index 000000000..14993e6d9
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/attention.py
@@ -0,0 +1,105 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
+"""
+
+from typing import Optional
+
+from aitemplate.compiler.ops import reshape
+
+from aitemplate.frontend import nn, Tensor
+
+
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
+    to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    Uses three q, k, v linear layers to compute attention.
+    Parameters:
+        batch_size (:obj:`int`): The number of examples per batch.
+        height (:obj:`int`): Height of each image example.
+        width (:obj:`int`): Width of each image example.
+        channels (:obj:`int`): The number of channels in the input and output.
+        num_head_channels (:obj:`int`, *optional*):
+            The number of channels in each head. If None, then `num_heads` = 1.
+        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
+        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        channels: int,
+        num_head_channels: Optional[int] = None,
+        num_groups: int = 32,
+        rescale_output_factor: float = 1.0,
+        eps: float = 1e-5,
+    ):
+        super().__init__()
+        self.batch_size = batch_size
+        self.height = height
+        self.width = width
+        self.channels = channels
+        self.num_heads = (
+            channels // num_head_channels if num_head_channels is not None else 1
+        )
+        self.num_head_size = num_head_channels
+        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
+        self.attention = nn.MultiheadAttention(
+            channels,
+            batch_size,
+            height * width,
+            self.num_heads,
+            qkv_bias=True,
+            has_residual=True,
+            use_mem_eff=True,
+        )
+        self.rescale_output_factor = rescale_output_factor
+
+    def forward(self, hidden_states) -> Tensor:
+        """
+        input hidden_states shape: [batch, height, width, channel]
+        output shape: [batch, height, width, channel]
+        """
+        residual = hidden_states
+
+        # norm
+        hidden_states = self.group_norm(hidden_states)
+
+        hidden_states = reshape()(
+            hidden_states, [self.batch_size, self.height * self.width, self.channels]
+        )
+
+        batch, hw, channel = hidden_states.shape()
+        if (
+            batch.value() != self.batch_size
+            or hw.value() != self.width * self.height
+            or channel.value() != self.channels
+        ):
+            raise RuntimeError(
+                "nchw params do not match! "
+                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
+                f"actual: {batch}, {channel}, {hw}."
+            )
+
+        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
+        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
+
+        return res
diff --git a/python/aitemplate/frontend/nn/ldm/clip.py b/python/aitemplate/frontend/nn/ldm/clip.py
new file mode 100644
index 000000000..1a95314d4
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/clip.py
@@ -0,0 +1,628 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from inspect import isfunction
+from typing import Optional
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+# pylint: disable=W0102
+
+USE_CUDA = detect_target().name() == "cuda"
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class CrossAttention(nn.Module):
+    def __init__(
+        self,
+        query_dim,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        dtype="float16",
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
+        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+
+    def forward(self, x, context=None, mask=None, residual=None):
+        nheads = self.heads
+        d = self.dim_head
+
+        layout = "20314" if USE_CUDA else "m2n3"
+
+        bs, seqlen, _ = get_shape(x)
+        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
+        )
+        context = default(context, x)
+
+        seqlen = get_shape(context)[1]
+        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
+        )
+        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
+        )
+
+        if USE_CUDA:
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
+            )
+        else:
+            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
+            out = OP(
+                (ops.reshape()(q, [bs * nheads, -1, d])),
+                (ops.reshape()(k, [bs * nheads, -1, d])),
+                (ops.reshape()(v, [bs * nheads, -1, d])),
+            )
+        out = ops.reshape()(out, [bs, -1, nheads * d])
+        proj = self.to_out(out)
+        proj = ops.reshape()(proj, [bs, -1, nheads * d])
+        if residual is not None:
+            return proj + residual
+        else:
+            return proj
+
+
+class GEGLU(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
+        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
+
+    def forward(self, x):
+        return self.proj(x, self.gate(x))
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        project_in = (
+            nn.Sequential(
+                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
+            )
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )
+
+        self.net = nn.Sequential(
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x, residual=None):
+        shape = ops.size()(x)
+        x = self.net(x)
+        x = ops.reshape()(x, shape)
+        if residual is not None:
+            return x + residual
+        else:
+            return x
+
+
+class BasicTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
+        super().__init__()
+        self.attn1 = CrossAttention(
+            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )  # is a self-attention
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+
+        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
+
+    def forward(self, x, context=None):
+        x = self.attn1(self.norm1(x), residual=x)
+        x = self.attn2(self.norm2(x), context=context, residual=x)
+        x = self.ff(self.norm3(x), residual=x)
+        return x
+
+
+def Normalize(in_channels):
+    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+
+    def __init__(
+        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)  # Group Norm
+
+        self.proj_in = nn.Conv2dBias(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
+                )
+                for d in range(depth)
+            ]
+        )
+
+        self.proj_out = nn.Conv2dBias(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x, context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, h, w, c = get_shape(x)
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = ops.reshape()(x, [b, -1, c])
+        for block in self.transformer_blocks:
+            x = block(x, context=context)
+        x = ops.reshape()(x, [b, h, w, c])
+        x = self.proj_out(x)
+        return x + x_in
+
+
+class CLIPAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        batch_size=1,
+        seq_len=16,
+        layer_norm_eps=1e-5,
+        hidden_dropout_prob=0.0,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=hidden_dropout_prob,
+            has_residual=False,
+            causal=causal,
+            mask_seq=mask_seq,
+        )
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        residual: Optional[Tensor] = None,
+    ):
+        if residual is not None:
+            self_output = self.attn(hidden_states, residual)
+        else:
+            self_output = self.attn(hidden_states)
+        return self_output
+
+
+class QuickGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
+    """
+
+    def forward(self, x):
+        x1 = x * 1.702
+        x1 = ops.sigmoid(x1)
+        x = x * x1
+        return x
+
+
+class CLIPMLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer="GELU",
+        drop=0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+            specialization="gelu",
+        )
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
+class CLIPMLPQuickGelu(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+        )
+        self.activation_fn = QuickGELUActivation()
+
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.activation_fn(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
+class CLIPEncoderLayer(nn.Module):
+    ACT_LAYER_TO_CLIP_MLP_MAP = {
+        "gelu": CLIPMLP,
+        "quick_gelu": CLIPMLPQuickGelu,
+    }
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        mlp_ratio=4.0,
+        batch_size=1,
+        seq_len=16,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.self_attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=0,
+            has_residual=True,
+            causal=causal,
+            mask_seq=mask_seq,
+            use_mem_eff=True,
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
+            hidden_size, int(hidden_size * mlp_ratio)
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(hidden_states, residual)
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states, residual)
+
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`CLIPEncoderLayer`].
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(
+        self,
+        num_hidden_layers=12,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        hidden_size=768,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    hidden_size=hidden_size,
+                    num_attention_heads=num_attention_heads,
+                    batch_size=batch_size,
+                    seq_len=seq_len,
+                    causal=causal,
+                    mask_seq=mask_seq,
+                    act_layer=act_layer,
+                )
+                for _ in range(num_hidden_layers)
+            ]
+        )
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        # all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for _, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(hidden_states)
+            hidden_states = layer_outputs
+
+        return hidden_states
+
+
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        vocab_size=49408,
+        max_position_embeddings=77,
+        dtype="float16",
+    ):
+        super().__init__()
+        embed_dim = hidden_size
+
+        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
+        self.position_embedding = nn.Embedding(
+            shape=[max_position_embeddings, embed_dim], dtype=dtype
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        position_ids: Tensor,
+        inputs_embeds: Optional[Tensor] = None,
+    ) -> Tensor:
+
+        input_shape = ops.size()(input_ids)
+
+        # [B * S]
+        input_ids = ops.reshape()(input_ids, [-1])
+
+        position_ids = ops.reshape()(position_ids, [-1])
+
+        if inputs_embeds is None:
+            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
+
+        position_embeddings = ops.batch_gather()(
+            self.position_embedding.tensor(), position_ids
+        )
+
+        embeddings = inputs_embeds + position_embeddings
+
+        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
+
+        return embeddings
+
+
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
+        self.encoder = CLIPEncoder(
+            num_hidden_layers=num_hidden_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            causal=causal,
+            mask_seq=mask_seq,
+            act_layer=act_layer,
+        )
+        self.final_layer_norm = nn.LayerNorm(hidden_size)
+
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+        )
+
+        last_hidden_state = encoder_outputs
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+        return last_hidden_state
diff --git a/python/aitemplate/frontend/nn/ldm/embeddings.py b/python/aitemplate/frontend/nn/ldm/embeddings.py
new file mode 100644
index 000000000..36b96a4fb
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/embeddings.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import math
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def get_timestep_embedding(
+    timesteps: Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
+    embeddings. :return: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+
+    exponent = (-math.log(max_period)) * Tensor(
+        shape=[half_dim], dtype="float16", name="arange"
+    )
+
+    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
+
+    emb = ops.exp(exponent)
+    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = ops.concatenate()(
+            [ops.cos(emb), ops.sin(emb)],
+            dim=-1,
+        )
+    else:
+        emb = ops.concatenate()(
+            [ops.sin(emb), ops.cos(emb)],
+            dim=-1,
+        )
+    return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+    def forward(self, sample):
+        sample = self.linear_1(sample)
+        sample = self.linear_2(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(
+        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+
+    def forward(self, timesteps):
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+        )
+        return t_emb
diff --git a/python/aitemplate/frontend/nn/ldm/resnet.py b/python/aitemplate/frontend/nn/ldm/resnet.py
new file mode 100644
index 000000000..03e4f8023
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/resnet.py
@@ -0,0 +1,238 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+class Upsample2D(nn.Module):
+    """
+    An upsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 upsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self,
+        channels,
+        use_conv=False,
+        use_conv_transpose=False,
+        out_channels=None,
+        name="conv",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        conv = None
+        if use_conv_transpose:
+            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(x)
+
+        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if self.use_conv:
+            if self.name == "conv":
+                x = self.conv(x)
+            else:
+                x = self.Conv2d_0(x)
+
+        return x
+
+
+class Downsample2D(nn.Module):
+    """
+    A downsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 downsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if use_conv:
+            conv = nn.Conv2dBias(
+                self.channels, self.out_channels, 3, stride=stride, padding=padding
+            )
+        else:
+            assert self.channels == self.out_channels
+            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.Conv2d_0 = conv
+            self.conv = conv
+        elif name == "Conv2d_0":
+            self.conv = conv
+        else:
+            self.conv = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        x = self.conv(x)
+
+        return x
+
+
+class ResnetBlock2D(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        non_linearity="swish",
+        time_embedding_norm="default",
+        kernel=None,
+        output_scale_factor=1.0,
+        use_nin_shortcut=None,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = nn.GroupNorm(
+            num_groups=groups,
+            num_channels=in_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+
+        self.conv1 = nn.Conv2dBias(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = nn.GroupNorm(
+            num_groups=groups_out,
+            num_channels=out_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2dBias(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        self.upsample = self.downsample = None
+
+        self.use_nin_shortcut = (
+            self.in_channels != self.out_channels
+            if use_nin_shortcut is None
+            else use_nin_shortcut
+        )
+
+        if self.use_nin_shortcut:
+            self.conv_shortcut = nn.Conv2dBias(
+                in_channels, out_channels, 1, 1, 0
+            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
+        else:
+            self.conv_shortcut = None
+
+    def forward(self, x, temb=None):
+        hidden_states = x
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm1(
+            hidden_states
+        )  # .float()).type(hidden_states.dtype) # fused swish
+        # hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            x = self.upsample(x)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            x = self.downsample(x)
+            hidden_states = self.downsample(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            temb = self.time_emb_proj(ops.silu(temb))
+            bs, dim = get_shape(temb)
+            temb = ops.reshape()(temb, [bs, 1, 1, dim])
+            hidden_states = hidden_states + temb
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            x = self.conv_shortcut(x)
+
+        out = hidden_states + x
+
+        return out
diff --git a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
new file mode 100644
index 000000000..770156ff9
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
@@ -0,0 +1,255 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Optional, Tuple, Union
+
+from aitemplate.frontend import nn
+
+from .embeddings import TimestepEmbedding, Timesteps
+from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
+
+
+class UNet2DConditionModel(nn.Module):
+    r"""
+    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int`, *optional*): The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+    """
+
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+    ):
+        super().__init__()
+        self.center_input_sample = center_input_sample
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=norm_num_groups,
+            eps=norm_eps,
+            use_swish=True,
+        )
+
+        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
+
+    def forward(
+        self,
+        sample,
+        timesteps,
+        encoder_hidden_states,
+        return_dict: bool = True,
+    ):
+        """r
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+
+        # 1. time
+        t_emb = self.time_proj(timesteps)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "attentions")
+                and downsample_block.attentions is not None
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states
+        )
+
+        # 5. up
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+
+            if (
+                hasattr(upsample_block, "attentions")
+                and upsample_block.attentions is not None
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
+                )
+
+        # 6. post-process
+        # make sure hidden states is in float32
+        # when running in half-precision
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+        return sample
diff --git a/python/aitemplate/frontend/nn/ldm/unet_blocks.py b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
new file mode 100644
index 000000000..7b6e3e6e6
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
@@ -0,0 +1,762 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# flake8: noqa
+from aitemplate.compiler import ops
+
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+from .attention import AttentionBlock
+
+from .clip import SpatialTransformer
+from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
+
+# pylint: disable=W0102
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+    downsample_padding=None,
+):
+    down_block_type = (
+        down_block_type[7:]
+        if down_block_type.startswith("UNetRes")
+        else down_block_type
+    )
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
+            )
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+):
+    up_block_type = (
+        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    )
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
+            )
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                SpatialTransformer(
+                    in_channels,
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_upsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+    ):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb=temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UNetMidBlock2D(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if attention_type != "default":
+            raise NotImplementedError(
+                f"attention_type must be default! current value: {attention_type}"
+            )
+
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                AttentionBlock(
+                    batch_size,
+                    height,
+                    width,
+                    in_channels,
+                    num_head_channels=attn_num_head_channels,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    num_groups=resnet_groups,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
diff --git a/python/aitemplate/frontend/nn/ldm/vae.py b/python/aitemplate/frontend/nn/ldm/vae.py
new file mode 100644
index 000000000..1cd25aa19
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/vae.py
@@ -0,0 +1,153 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
+"""
+
+from typing import Tuple
+
+from aitemplate.frontend import nn, Tensor
+
+from .unet_blocks import get_up_block, UNetMidBlock2D
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels=3,
+        out_channels=3,
+        up_block_types=("UpDecoderBlock2D",),
+        block_out_channels=(64,),
+        layers_per_block=2,
+        act_fn="silu",
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2dBias(
+            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
+        )
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            batch_size,
+            height,
+            width,
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attn_num_head_channels=None,
+            resnet_groups=32,
+            temb_channels=None,
+        )
+
+        # up
+        self.up_blocks = nn.ModuleList([])
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=None,
+                temb_channels=None,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        num_groups_out = 32
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=num_groups_out,
+            eps=1e-6,
+            use_swish=True,
+        )
+        self.conv_out = nn.Conv2dBias(
+            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
+        )
+
+    def forward(self, z) -> Tensor:
+        sample = z
+        sample = self.conv_in(sample)
+
+        # middle
+        sample = self.mid_block(sample)
+
+        # up
+        for up_block in self.up_blocks:
+            sample = up_block(sample)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class AutoencoderKL(nn.Module):
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        sample_size: int = 32,
+    ):
+        super().__init__()
+        self.decoder = Decoder(
+            batch_size,
+            height,
+            width,
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+        )
+        self.post_quant_conv = nn.Conv2dBias(
+            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def decode(self, z: Tensor, return_dict: bool = True):
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+
+    def forward(self):
+        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")

From 89a46912aa49ad981f6ceee5660b513a459124a8 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 22 Mar 2023 10:55:51 -0700
Subject: [PATCH 317/638] Update rocm_ci.yml

Trigger rocm_ci on PRs from rocm repo.
---
 .github/workflows/rocm_ci.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 92219871d..7c55cabb1 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -1,7 +1,11 @@
 name: ROCM_CI
 
-on:
-  push:
+on: 
+  pull_request: 
+    types: 
+      - [opened, reopened]
+    branches: 
+      - 'ROCmSoftwarePlatform/AITemplate/**'
 
 jobs:
   build:

From 57ab373e20af14420a9602b45418613f37d28831 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 22 Mar 2023 11:44:43 -0700
Subject: [PATCH 318/638] Fix type info in compiler/tensor_accessor.py (#468)

Summary:
Type info and other minor fixes in `tensor_accessor.py`

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/468

Reviewed By: alexanderguzhva

Differential Revision: D44294199

Pulled By: aakhundov

fbshipit-source-id: 2e6e43853dee61385e28da8cb7756f52fb58db62
---
 python/aitemplate/compiler/tensor_accessor.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index c90e7bfd2..7df290512 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -22,7 +22,7 @@
 # pylint: disable=C0103,C0301,W0612
 
 from pprint import pformat
-from typing import Any, List, Optional
+from typing import Any, Iterable, List, Optional
 
 from aitemplate.compiler.base import IntImm, IntVar, Tensor
 
@@ -49,7 +49,7 @@ def __init__(self, original_tensor: Tensor) -> None:
         # This strictly means that the tensor's memory itself is contiguous
         self.is_contiguous = True
 
-        ## These variables are only set when self.stride_dim != None.
+        # These variables are only set when self.stride_dim != None.
         # A tensor can be contiguous and still come from a strided tensor,
         # e.g., when stride_dim == 0
         self.is_from_strided_tensor = False
@@ -79,7 +79,7 @@ def __init__(self, original_tensor: Tensor) -> None:
         # between self.original_shapes and self.actual_shapes.
         # e.g. The original tensor is in shape [2, 3, 2], and it's reshaped to [2, 6].
         # In this case, self._dim_mapping = [([0], [0]), ([1, 2], [1])], which represents
-        # that self.orignal_shapes[0] and self.actual_shapes[0] are in the same group,
+        # that self.original_shapes[0] and self.actual_shapes[0] are in the same group,
         # and self.original_shapes[1:2] and self.actual_shapes[1] are in the same group.
         #
         # It's possible that such a mapping cannot be calculated (e.g. because of
@@ -235,7 +235,9 @@ def try_get_stride_strs(
                 f"dim_names: {dim_names}, shapes: {self.original_shapes}"
             )
 
-        def _get_value_or_names(shape: List[IntVar], indices: List[int]) -> List[str]:
+        def _get_value_or_names(
+            shape: List[IntVar], indices: Iterable[int]
+        ) -> List[str]:
             res = []
             for index in indices:
                 d = shape[index]
@@ -329,7 +331,7 @@ def stride(self, dim: int) -> int:
             stride *= int(s)
         return stride
 
-    def gen_stride_str(self, dim: int, dim_names: List[str]) -> int:
+    def gen_stride_str(self, dim: int, dim_names: List[str]) -> str:
         """
         Returns the str to calculate the stride of a certain dim. This is
         a temporary solution to get around dynamic shapes problems with
@@ -351,7 +353,7 @@ def update_base_tensor(
         """
         Updates the TensorAccessor with a new base tensor.
         This API is useful to handle ops with a stride dim, e.g. split, cat.
-        It can also used by slice if slice is only operated on one dim.
+        It can also be used by slice if slice is only operated on one dim.
         """
 
         assert (

From 81699d1a426125c67a54b6092cc782c895126b5c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Wed, 22 Mar 2023 12:45:10 -0700
Subject: [PATCH 319/638] added a pass to move view ops (#454)

Summary:
Added a pass move_view_op_before_concat that turns "cat + view_op + cat" into "view_op + cat + cat" whenever possible. The yielded pattern may be optimized further by the transform_memory_ops pass, which assumes no view ops between two fusible concat ops. Note that this pass must be invoked before transform_strided_op_and_view_op and transform_strided_ops.

For view ops, we only enable reshape and flatten at the moment. We will extend it to others such as squeeze and unsqueeze.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/454

Reviewed By: ipiszy

Differential Revision: D44200888

Pulled By: chenyang78

fbshipit-source-id: ce36734817a65a2fb244e9c3b620ffd119b3d300
---
 .../aitemplate/compiler/transform/__init__.py |    1 +
 .../compiler/transform/move_view_ops.py       |  284 ++++
 .../compiler/transform/optimize_graph.py      |    3 +
 .../transform/transform_memory_ops.py         |    3 +-
 .../compiler/transform/transform_utils.py     |    3 +-
 python/aitemplate/utils/shape_utils.py        |   17 +-
 tests/unittest/compiler/test_move_view_ops.py | 1378 +++++++++++++++++
 .../compiler/test_slice_reshape_scatter.py    |   41 +-
 .../test_split_large_slice_scatter.py         |   12 +-
 .../compiler/test_strided_reshape_cat.py      |   12 +-
 10 files changed, 1722 insertions(+), 32 deletions(-)
 create mode 100644 python/aitemplate/compiler/transform/move_view_ops.py
 create mode 100644 tests/unittest/compiler/test_move_view_ops.py

diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index d618161b0..32cf17cdd 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -31,6 +31,7 @@
     mark_special_views,
 )
 from aitemplate.compiler.transform.memory_planning import memory_planning
+from aitemplate.compiler.transform.move_view_ops import move_view_op_before_concat
 from aitemplate.compiler.transform.name_graph import name_graph
 from aitemplate.compiler.transform.optimize_graph import optimize_graph
 from aitemplate.compiler.transform.profile import profile
diff --git a/python/aitemplate/compiler/transform/move_view_ops.py b/python/aitemplate/compiler/transform/move_view_ops.py
new file mode 100644
index 000000000..5f9c1e2db
--- /dev/null
+++ b/python/aitemplate/compiler/transform/move_view_ops.py
@@ -0,0 +1,284 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This pass move any view op between two concatenate ops to the front of the
+first concatenate op if possible.
+"""
+import copy
+from typing import Callable, List, Optional, Tuple
+
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+
+from aitemplate.utils import shape_utils
+
+
+# TODO: support other view ops such as squeeze and unsqueeze
+_SUPPORTED_VIEW_OPS = ["reshape", "flatten"]
+
+
+def _make_input_view_shape(
+    cat_input: Tensor,
+    original_view_shape: List[IntVar],
+    cat_dim: int,
+    input_idx: int,
+) -> Optional[List[IntVar]]:
+    """
+    Assumes that there is a pattern like concat + view_op in the graph, we tries
+    to transform it into view_op + concat. However, it's not always valid to
+    perform such a transformation, because the concat's original inputs may
+    not be shape-compatible with the moved view_op. Currently, we only support
+    cases where a view_op only changes the dims after cat_dim and the dims after
+    the cat_dim must be static.
+
+    For example, for the following code:
+
+        x1 = Tensor(batch, 3 * 4)
+        x2 = Tensor(batch, 5 * 4)
+        concat_0 = concat([x1, x2], cat_dim=1)
+        reshape_1 = reshape(concat_0, [batch, 8, 4])
+
+    This function will generate shape [batch, 3, 4] for x1 and [batch, 5, 4]
+    for x2, respectively.
+
+    In contrast, if we have code like below:
+
+        x1 = tensor([batch, 16])
+        x2 = tensor([batch, 8])
+        cat_1 = concatenate([x1, x2], cat_dim=1)
+        reshape_2 = reshape(cat_1, [batch, 4, 6])
+
+    We would return None for both x1 and x2, because we cannot make valid reshape
+    ops for x1 and x2 while keeping the original semantics.
+
+    Parameters
+    ----------
+    cat_input: Tensor
+        a concat op's input for which we will generate a view op, e.g. the x1 or
+        x2 tensor in the example above
+    original_view_shape: List[IntVar]
+        the shape of the view op's output, where the view op consumes the concat's
+        output, e.g. the reshape op in the example above
+    cat_dim: int
+        the value of the cat_dim attribute of the concate op
+    input_idx: int
+        the index of the cat_input in the concat's inputs list
+    """
+    cat_input_shape = cat_input.shape()
+    if cat_dim >= len(cat_input_shape) or cat_dim >= len(original_view_shape):
+        return None
+    # make sure each dimension at the same index in front of the cat_dim is the same for
+    # both cat_input_shape and original_view_shape
+    for curr_cat_dim, orig_dim in zip(
+        cat_input_shape[:cat_dim], original_view_shape[:cat_dim]
+    ):
+        if curr_cat_dim != orig_dim:
+            return None
+    input_stride_at_cat_dim = shape_utils.get_static_stride(cat_input_shape, cat_dim)
+    # make sure all dimensions are static after cat_dim
+    if input_stride_at_cat_dim is None:
+        return None
+    orig_view_stride_at_cat_dim = shape_utils.get_static_stride(
+        original_view_shape, cat_dim
+    )
+    # make sure all dimensions are static after cat_dim
+    if orig_view_stride_at_cat_dim is None:
+        return None
+    new_input_view_shape = copy.deepcopy(original_view_shape)
+    cat_stride = cat_input_shape[cat_dim].value() * input_stride_at_cat_dim
+    if cat_stride % orig_view_stride_at_cat_dim != 0:
+        return None
+    orig_dim_name = original_view_shape[cat_dim]._attrs["name"]
+    new_input_view_shape[cat_dim] = IntImm(
+        cat_stride // orig_view_stride_at_cat_dim,
+        name=f'{orig_dim_name}_{cat_input._attrs["name"]}_{input_idx}',
+    )
+    return new_input_view_shape
+
+
+def _call_view_op(
+    view_op: Callable, view_output_shape: List[IntVar], input_tensor: Tensor
+) -> Tensor:
+    """
+    call the view_op with suitable arguments and return the output tensor
+    """
+    view_op_type = view_op._attrs["op"]
+    if view_op_type == "reshape":
+        output = view_op(input_tensor, view_output_shape)
+    elif view_op_type == "flatten":
+        output = view_op(input_tensor)
+    else:
+        raise AssertionError(f"unsupported {view_op_type=}")
+    return output
+
+
+def _try_move_view_op(
+    first_cat: Operator, second_cat: Operator, view_op: Operator
+) -> bool:
+    """
+    Try to move the view_op to the front of the first_cat.
+    Return true if the transformation is successful, False otherwise.
+    """
+    cat_dim = first_cat._attrs["concat_dim"]
+    first_cat_output = first_cat._attrs["outputs"][0]
+    first_cat_output_shape = first_cat_output.shape()
+    # we might be able to support dynamic cat_dim, but let's be conservative
+    # for now
+    if not shape_utils.is_static_dimension(first_cat_output_shape, cat_dim):
+        return False
+    if second_cat._attrs["concat_dim"] != cat_dim:
+        return False
+    second_cat_output = second_cat._attrs["outputs"][0]
+    if not shape_utils.is_static_dimension(second_cat_output.shape(), cat_dim):
+        return False
+    # We are not always able to move the view op. For example, we cannot
+    # move the reshape to the front of cat_1 in the following code:
+    #    x1 = tensor([batch, 16])
+    #    x2 = tensor([batch, 8])
+    #    cat_1 = concatenate([x1, x2], cat_dim=1)
+    #    reshape_2 = reshape(cat_1, [batch, 4, 6])
+    #    x3 = tensor([batch, 2, 6])
+    #    cat_2 = concatenate([reshape_2, x3], cat_dim=1)
+    # Basically, we cannot reshape either x1 or x2 to a shape while
+    # keep cat_dim = 1, i.e. we cannot form a shape [batch, -1, 6] from
+    # either [batch, 16] or [batch, 8].
+    new_view_output_shapes = []
+    view_op_output = view_op._attrs["outputs"][0]
+    original_view_shape = view_op_output.shape()
+    for input_idx, first_cat_input in enumerate(first_cat._attrs["inputs"]):
+        input_view_shape = _make_input_view_shape(
+            first_cat_input, original_view_shape, cat_dim, input_idx
+        )
+        if input_view_shape is None:
+            return False
+        new_view_output_shapes.append(input_view_shape)
+    # Now we start modifying the graph.
+    # make a new output tensor for the first cat
+    new_first_cat_output = Tensor(original_view_shape, first_cat_output._attrs["name"])
+    transform_utils.replace_tensor(first_cat_output, new_first_cat_output)
+    first_cat._attrs["outputs"][0] = new_first_cat_output
+    new_first_cat_output._attrs["src_ops"].add(first_cat)
+
+    # remove the old view op
+    transform_utils.remove_view_op_from_sorted_graph(view_op)
+    # make a new view op for each first_cat's original input and place it between
+    # the original input and the first cat
+    new_first_cat_inputs = []
+    # The same tensor may be used multiple times by the first cat.
+    # We don't want to make one view op for each use, because it would
+    # prevent us from propagating those view ops to an upper level.
+    first_cat_input_to_view_output = {}
+    for first_cat_input, input_view_shape in zip(
+        first_cat._attrs["inputs"], new_view_output_shapes
+    ):
+        new_view_output = first_cat_input_to_view_output.get(first_cat_input, None)
+        if new_view_output is None:
+            new_view_op = type(view_op)(**view_op._get_op_attributes())
+            new_view_output = _call_view_op(
+                new_view_op, input_view_shape, first_cat_input
+            )
+            first_cat_input_to_view_output[first_cat_input] = new_view_output
+            new_view_output._attrs["dst_ops"].add(first_cat)
+            first_cat_input._attrs["dst_ops"].remove(first_cat)
+        new_first_cat_inputs.append(new_view_output)
+    first_cat._attrs["inputs"] = new_first_cat_inputs
+    first_cat._attrs["original_inputs"] = list(new_first_cat_inputs)
+    first_cat._attrs["input_accessors"] = [
+        TensorAccessor(inp) for inp in new_first_cat_inputs
+    ]
+    return True
+
+
+def _is_valid_cat_op(cat: Operator) -> bool:
+    """
+    Return true if the cat op is valid for moving the view op.
+    """
+    if cat._attrs["op"] != "concatenate":
+        return False
+    # skip if the cat has any fused strided op
+    if any(mask is False for mask in cat._attrs["input_masks"]):
+        return False
+    # If cat carries strided input_accessors or fused view ops, we skip it
+    if "input_accessors" in cat._attrs:
+        if any(
+            input_accessor.stride_dim is not None
+            or input_accessor.actual_shapes is not None
+            for input_accessor in cat._attrs["input_accessors"]
+        ):
+            return False
+    return True
+
+
+def _move_view_op_before_concat(
+    sorted_graph: List[Tensor],
+) -> Tuple[bool, List[Tensor]]:
+    """
+    Return a tuple of (bool, List[Tensor]), where True indicates the
+    graph has been successfully changed.
+    """
+    changed = False
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) == 0:
+            continue
+        first_cat = list(src_ops)[0]
+        if not _is_valid_cat_op(first_cat):
+            continue
+        first_cat_outputs = first_cat._attrs["outputs"]
+        if len(first_cat_outputs) != 1:
+            continue
+        first_cat_output = first_cat_outputs[0]
+        # If the first cat is a graph output, we cannot fuse it
+        if first_cat_output._attrs["is_output"]:
+            continue
+        next_ops = first_cat_output._attrs["dst_ops"]
+        if len(next_ops) != 1:
+            continue
+        view_op = list(next_ops)[0]
+        # skip if the next op is not one of the supported view ops
+        if view_op._attrs["op"] not in _SUPPORTED_VIEW_OPS:
+            continue
+        view_op_output = view_op._attrs["outputs"][0]
+        if view_op_output._attrs["is_output"]:
+            continue
+        next_next_ops = view_op_output._attrs["dst_ops"]
+        if len(next_next_ops) != 1:
+            continue
+        second_cat = list(next_next_ops)[0]
+        if not _is_valid_cat_op(second_cat):
+            continue
+        if _try_move_view_op(first_cat, second_cat, view_op):
+            changed = True
+    return (changed, sorted_graph)
+
+
+def move_view_op_before_concat(
+    sorted_graph: List[Tensor], wordir: str = None
+) -> List[Tensor]:
+    """
+    This transformation turns "cat + view_op + cat" into "view_op + cat + cat".
+    The yielded pattern may be optimized further by the transform_memory_ops pass.
+    Note that this pass must be invoked before transform_strided_op_and_view_op
+    and transform_strided_ops.
+    """
+    changed = True
+    while changed:
+        changed, sorted_graph = _move_view_op_before_concat(sorted_graph)
+        if changed:
+            sorted_graph = toposort(sorted_graph)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 35903bc8e..3b8ab2468 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -35,6 +35,7 @@
 from aitemplate.compiler.transform.fuse_permute_bmm_and_gemm import (
     fuse_permute_bmm_and_gemm,
 )
+from aitemplate.compiler.transform.move_view_ops import move_view_op_before_concat
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
 from aitemplate.compiler.transform.split_large_slice_scatter_ops import (
     split_large_slice_scatter_ops,
@@ -92,6 +93,8 @@ def optimize_graph(
         fuse_conv_elementwise,
         fuse_mm_elementwise,
         fuse_mm_reshape_permute,
+        # make sure we run move_view_op_before_concat before transform_memory_ops
+        move_view_op_before_concat,
         transform_memory_ops,
         fuse_ops,
         fuse_elementwise,
diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index 6cebfec2f..bb5904565 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -121,7 +121,8 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     cat._attrs["original_inputs"] = list(new_cat_original_inputs)
     cat._attrs["input_masks"] = [True] * len(new_cat_inputs)
     for tensor in first_op_inputs:
-        tensor._attrs["dst_ops"].remove(first_op)
+        # the same tensor may be used multiple times
+        tensor._attrs["dst_ops"].discard(first_op)
         tensor._attrs["dst_ops"].add(cat)
     for tensor in first_op_outputs:
         transform_utils.remove_tensor_from_sorted_graph(tensor)
diff --git a/python/aitemplate/compiler/transform/transform_utils.py b/python/aitemplate/compiler/transform/transform_utils.py
index 705587db1..353e5c9de 100644
--- a/python/aitemplate/compiler/transform/transform_utils.py
+++ b/python/aitemplate/compiler/transform/transform_utils.py
@@ -227,7 +227,8 @@ def remove_view_op_from_sorted_graph(op: Operator) -> None:
     input_tensor = op._attrs["inputs"][0]
     output_tensor = op._attrs["outputs"][0]
 
-    input_tensor._attrs["dst_ops"] = output_tensor._attrs["dst_ops"]
+    input_tensor._attrs["dst_ops"].remove(op)
+    input_tensor._attrs["dst_ops"].update(output_tensor._attrs["dst_ops"])
     for dst_op in output_tensor._attrs["dst_ops"]:
         dst_op.replace_input_tensor(output_tensor, input_tensor)
     if output_tensor._attrs["is_output"]:
diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index 7816b81fe..3c22da90b 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -16,7 +16,7 @@
 Util functions to handle shapes.
 """
 
-from typing import List
+from typing import List, Optional
 
 
 def gen_int_var(values: List[int], name: str = None):
@@ -185,3 +185,18 @@ def is_same_shape(shapes1, shapes2) -> bool:
         if dim1 != dim2:
             return False
     return True
+
+
+def get_static_stride(shape, dim) -> Optional[int]:
+    """
+    This is a helper function that returns the static stride for dim.
+    It returns None if it cannot generate a static stride.
+    """
+    from aitemplate.compiler.base import IntImm
+
+    stride = 1
+    for d in shape[dim + 1 :]:
+        if not isinstance(d, IntImm):
+            return None
+        stride *= d.value()
+    return stride
diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
new file mode 100644
index 000000000..1a8cde5e8
--- /dev/null
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -0,0 +1,1378 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+class MoveViewOpsTestCase(unittest.TestCase):
+    BATCH_SIZE = 1024
+
+    def __init__(self, *args, **kwargs):
+        super(MoveViewOpsTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_non_movable_reshape_cat(self, M0, M1, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1, dim=1)
+        # reshape_1 = reshape(concat_0)
+        # y = concatenate(reshape_1, x2, dim=2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        M2 = M0 + M1
+        X2 = Tensor(
+            shape=[batch_dim, M2, IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim_1 = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim_1)
+        reshape_to_shape_1 = [-1, M2, N]
+        reshape_1 = ops.reshape()(concat_0, reshape_to_shape_1)
+        cat_dim_2 = 2
+        Y = ops.concatenate()([reshape_1, X2], dim=cat_dim_2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), 5)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 2)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim_1)
+            reshape_1_pt = torch.reshape(concat_0_pt, reshape_to_shape_1)
+            y_pt = torch.cat([reshape_1_pt, x2_pt], dim=cat_dim_2)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_non_movable_reshape_cat(self):
+        self._test_non_movable_reshape_cat(
+            M0=4,
+            M1=2,
+            N=4,
+            test_name="test_non_movable_reshape_cat",
+            dtype="float16",
+        )
+
+    def _test_move_reshape_cat_basic(
+        self, M0, M1, M2, N, test_name, dtype="float16", non_movable=False
+    ):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # reshape_1 = reshape(concat_0)
+        # y = concatenate(reshape_1, x2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        if non_movable is True:
+            assert (M0 + M1) % 3 == 0, "(M0 + M1) * N must be divisible by 3"
+            X2_M = (M0 + M1) * N // 3
+            X2_N = 3
+            reshape_to_shape_1 = [-1, X2_M, X2_N]
+        else:
+            reshape_to_shape_1 = [-1, M0 + M1, N]
+            X2_M = M2
+            X2_N = N
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(X2_M), IntImm(X2_N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        reshape_1 = ops.reshape()(concat_0, reshape_to_shape_1)
+        Y = ops.concatenate()([reshape_1, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        if non_movable is True:
+            expected_num_tensors = 5
+            # reshape can be fused into the second cat
+            expected_num_ops = 2
+        else:
+            expected_num_tensors = 4
+            expected_num_ops = 1
+        self.assertEqual(len(sorted_graph), expected_num_tensors)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_num_ops)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, X2_M, X2_N], dtype)
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            reshape_1_pt = torch.reshape(concat_0_pt, reshape_to_shape_1)
+            y_pt = torch.cat([reshape_1_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_move_reshape_cat_basic(self):
+        self._test_move_reshape_cat_basic(
+            M0=4,
+            M1=2,
+            M2=6,
+            N=4,
+            test_name="test_move_reshape_cat_basic_non_movable",
+            dtype="float16",
+            non_movable=True,
+        )
+        self._test_move_reshape_cat_basic(
+            M0=1,
+            M1=5,
+            M2=7,
+            N=3,
+            test_name="test_move_reshape_cat_basic",
+            dtype="float16",
+        )
+        self._test_move_reshape_cat_basic(
+            M0=2,
+            M1=2,
+            M2=6,
+            N=8,
+            test_name="test_move_reshape_cat_basic",
+            dtype="float16",
+        )
+
+    def _test_move_reshape_cat_basic_2(self, M0, M1, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # reshape_0 = reshape(x0)
+        # reshape_1 = reshape(x1)
+        # concat_2 = concatenate(reshape_0, x3, reshape_1)
+        # reshape_3 = reshape(concat_2)
+        # y = concatenate(x2, reshape_3, x2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        assert M0 % 2 == 0, f"{M0=} must be divisible by 2"
+        assert N % 2 == 0, f"{N=} must be divisible by 2"
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 // 2), IntImm(N * 2)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N // 2)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        cat_dim = 1
+        reshape_0 = ops.reshape()(X0, [-1, M0, N])
+        reshape_1 = ops.reshape()(X1, [-1, M1, N])
+        concat_2 = ops.concatenate()([reshape_0, X3, reshape_1], dim=cat_dim)
+        reshape_to_shape_3 = [-1, (M0 + M0 + M1) * 2, N // 2]
+        reshape_3 = ops.reshape()(concat_2, reshape_to_shape_3)
+        Y = ops.concatenate()([X2, reshape_3, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), 5)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 // 2, N * 2], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N // 2], dtype)
+            x3_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            reshape_0_pt = torch.reshape(x0_pt, [-1, M0, N])
+            reshape_1_pt = torch.reshape(x1_pt, [-1, M1, N])
+            concat_2_pt = torch.cat([reshape_0_pt, x3_pt, reshape_1_pt], dim=cat_dim)
+            reshape_3_pt = torch.reshape(concat_2_pt, reshape_to_shape_3)
+            y_pt = torch.cat([x2_pt, reshape_3_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_move_reshape_cat_basic_2(self):
+        self._test_move_reshape_cat_basic_2(
+            M0=2,
+            M1=2,
+            M2=6,
+            N=8,
+            test_name="test_move_reshape_cat_basic_2",
+            dtype="float16",
+        )
+
+    def _test_move_reshape_cat_basic_3(self, M0, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x0)
+        # reshape_1 = reshape(concat_0)
+        # y = concatenate(reshape_1, x2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X0], dim=cat_dim)
+        reshape_to_shape_1 = [-1, M0 + M0, N]
+        reshape_1 = ops.reshape()(concat_0, reshape_to_shape_1)
+        Y = ops.concatenate()([reshape_1, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), 3)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+            concat_0_pt = torch.cat([x0_pt, x0_pt], dim=cat_dim)
+            reshape_1_pt = torch.reshape(concat_0_pt, reshape_to_shape_1)
+            y_pt = torch.cat([reshape_1_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_move_reshape_cat_basic_3(self):
+        self._test_move_reshape_cat_basic_3(
+            M0=1,
+            M2=7,
+            N=3,
+            test_name="test_move_reshape_cat_basic_3",
+            dtype="float16",
+        )
+
+    def _test_move_reshape_cat_1(self, M0, M1, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # reshape_2 = reshape(concat_0)
+        # concat_4 = concatenate(x2, reshape_2)
+        # flatten_5 = flatten(concat_4)
+        # concat_6 = concatenate(x0, flatten_5)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_0, [-1, M0 + M1, N])
+        concat_4 = ops.concatenate()([X2, reshape_2], dim=cat_dim)
+        flatten_5 = ops.flatten(start_dim=1, end_dim=-1)(concat_4)
+        Y = ops.concatenate()([X0, flatten_5], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_0_pt, [-1, M0 + M1, N])
+            concat_4_pt = torch.cat([x2_pt, reshape_2_pt], dim=cat_dim)
+            flatten_5_pt = torch.flatten(concat_4_pt, 1, -1)
+            y_pt = torch.cat([x0_pt, flatten_5_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_reshape_cat_1(self):
+        self._test_move_reshape_cat_1(
+            M0=2,
+            M1=2,
+            M2=6,
+            N=8,
+            test_name="test_move_reshape_cat_1",
+            dtype="float16",
+        )
+
+    def _test_move_reshape_cat_2(self, M0, M1, M2, M3, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # concat_1 = concatenate(x0, x1)
+        # reshape_2 = reshape(concat_0)
+        # reshape_3 = reshape(concat_1)
+        # concat_4 = concatenate(x2, reshape_2, reshape_3, x3, reshape_2)
+        # flatten_5 = flatten(concat_4)
+        # concat_6 = concatenate(x0, flatten_5, x1, flatten_5)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        concat_1 = ops.concatenate()([X0, X1], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_0, [-1, M0 + M1, N])
+        reshape_3 = ops.reshape()(concat_1, [-1, M0 + M1, N])
+        concat_4 = ops.concatenate()(
+            [X2, reshape_2, reshape_3, X3, reshape_2], dim=cat_dim
+        )
+        flatten_5 = ops.flatten(start_dim=1, end_dim=-1)(concat_4)
+        Y = ops.concatenate()([X0, flatten_5, X1, flatten_5], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "concatenate")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            concat_1_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_0_pt, [-1, M0 + M1, N])
+            reshape_3_pt = torch.reshape(concat_1_pt, [-1, M0 + M1, N])
+            concat_4_pt = torch.cat(
+                [x2_pt, reshape_2_pt, reshape_3_pt, x3_pt, reshape_2_pt], dim=cat_dim
+            )
+            flatten_5_pt = torch.flatten(concat_4_pt, 1, -1)
+            y_pt = torch.cat([x0_pt, flatten_5_pt, x1_pt, flatten_5_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_reshape_cat_2(self):
+        self._test_move_reshape_cat_2(
+            M0=2,
+            M1=2,
+            M2=6,
+            M3=4,
+            N=8,
+            test_name="test_move_reshape_cat_2",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)
+        # concat_1 = concatenate(add_0, x2)
+        # reshape_2 = reshape(concat_1)
+        # y = concatenate(reshape_2, x3)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_1, [-1, M0 + M2, N])
+        Y = ops.concatenate()([reshape_2, X3], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 2)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, M0 + M2, N])
+            y_pt = torch.cat([reshape_2_pt, x3_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat(self):
+        self._test_move_strided_reshape_cat(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=3,
+            N=8,
+            test_name="test_move_strided_reshape_cat",
+            dtype="float16",
+        )
+        self._test_move_strided_reshape_cat(
+            M0=4,
+            M1=4,
+            M2=5,
+            M3=10,
+            N=7,
+            test_name="test_move_strided_reshape_cat",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_2(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x0)
+        # reshape_1 = reshape(add_0)
+        # add_2 = add(x1, x1)
+        # concat_3 = concatenate(x2, reshape_1, x2, add_2)
+        # reshape_4 = reshape(concat_3)
+        # y = concatenate(x3, reshape_4, x3)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        reshape_1 = ops.reshape()(add_0, [-1, M0 * N])
+        add_2 = ops.elementwise(FuncEnum.ADD)(X1, X1)
+        concat_3 = ops.concatenate()([X2, reshape_1, X2, add_2], dim=cat_dim)
+        reshape_to_shape_4 = (
+            sum([t.shape()[cat_dim].value() for t in [X2, reshape_1, X2, add_2]]) // N
+        )
+        reshape_4 = ops.reshape()(concat_3, [-1, reshape_to_shape_4, N])
+        Y = ops.concatenate()([X3, reshape_4, X3], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+        output_tensors = {op._attrs["outputs"][0] for op in sorted_ops}
+        self.assertEqual(len(output_tensors), 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            add_0_pt = x0_pt + x0_pt
+            reshape_1_pt = torch.reshape(add_0_pt, [batch, M0 * N])
+            add_2_pt = x1_pt + x1_pt
+            concat_3_pt = torch.cat([x2_pt, reshape_1_pt, x2_pt, add_2_pt], dim=cat_dim)
+            reshape_4_pt = torch.reshape(concat_3_pt, [-1, reshape_to_shape_4, N])
+            y_pt = torch.cat([x3_pt, reshape_4_pt, x3_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_2(self):
+        self._test_move_strided_reshape_cat_2(
+            M0=4,
+            M1=6,
+            M2=9,
+            M3=16,
+            N=8,
+            test_name="test_move_strided_reshape_cat_2",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_3(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # slice_0 = slice(x4)
+        # slice_1 = slice(x4)
+        # slice_2 = slice(x4)
+        # add_0 = add(x0, x0)
+        # reshape_1 = reshape(add_0)
+        # add_2 = add(x1, x1)
+        # flatten_3 = flatten(add_2)
+        # concat_4 = concatenate(x2, slice_0, slice_1, reshape_1, slice_2, flatten_3) # 2d
+        # add_5 = add(x3, x3)
+        # reshape_6 = reshape(add_5)
+        # reshape_7 = reshape(concat_4)
+        # concat_8 = concatenate(x0, reshape_7, reshape_6) # 3d
+        # add_9 = add(x0, x0)
+        # flatten_10 = flatten(concat_8) # 2d
+        # reshape_11 = reshape(add_9) # 2d
+        # y = concatenate(x1, reshape_11, flatten_10, x2) # 2d
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = 10 * M0
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4 * N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+
+        slice_start_indices_0 = [None, 0]
+        slice_end_indices_0 = [None, N]
+        slice_start_indices_1 = [None, 3 * N]
+        slice_end_indices_1 = [None, 4 * N]
+        slice_start_indices_2 = [None, 4 * N]
+        slice_end_indices_2 = [None, 8 * N]
+        slice_0 = ops.dynamic_slice()(X4, slice_start_indices_0, slice_end_indices_0)
+        slice_1 = ops.dynamic_slice()(X4, slice_start_indices_1, slice_end_indices_1)
+        slice_2 = ops.dynamic_slice()(X4, slice_start_indices_2, slice_end_indices_2)
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        reshape_1 = ops.reshape()(add_0, [-1, M0 * N])
+        add_2 = ops.elementwise(FuncEnum.ADD)(X1, X1)
+        flatten_3 = ops.flatten(start_dim=1, end_dim=-1)(add_2)
+        concat_4 = ops.concatenate()(
+            [X2, slice_0, slice_1, reshape_1, slice_2, flatten_3], dim=cat_dim
+        )
+        add_5 = ops.elementwise(FuncEnum.ADD)(X3, X3)
+        reshape_6 = ops.reshape()(add_5, [-1, M3, N])
+        reshape_to_shape_7 = (
+            sum(
+                [
+                    t.shape()[cat_dim].value()
+                    for t in [X2, slice_0, slice_1, reshape_1, slice_2, flatten_3]
+                ]
+            )
+            // N
+        )
+        reshape_7 = ops.reshape()(concat_4, [-1, reshape_to_shape_7, N])
+        concat_8 = ops.concatenate()([X0, reshape_7, reshape_6], dim=cat_dim)
+        add_9 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        flatten_10 = ops.flatten(start_dim=1, end_dim=-1)(concat_8)
+        reshape_11 = ops.reshape()(add_9, [-1, M0 * N])
+        Y = ops.concatenate()([X1, reshape_11, flatten_10, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 5)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+        output_tensors = {op._attrs["outputs"][0] for op in sorted_ops}
+        self.assertEqual(len(output_tensors), 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4 * N], dtype)
+
+            slice_indices_0 = [
+                slice(i, j) for i, j in zip(slice_start_indices_0, slice_end_indices_0)
+            ]
+            slice_indices_1 = [
+                slice(i, j) for i, j in zip(slice_start_indices_1, slice_end_indices_1)
+            ]
+            slice_indices_2 = [
+                slice(i, j) for i, j in zip(slice_start_indices_2, slice_end_indices_2)
+            ]
+            slice_0_pt = x4_pt[slice_indices_0]
+            slice_1_pt = x4_pt[slice_indices_1]
+            slice_2_pt = x4_pt[slice_indices_2]
+
+            add_0_pt = x0_pt + x0_pt
+            reshape_1_pt = torch.reshape(add_0_pt, [batch, M0 * N])
+            add_2_pt = x1_pt + x1_pt
+            flatten_3_pt = torch.flatten(add_2_pt, 1, -1)
+            concat_4_pt = torch.cat(
+                [x2_pt, slice_0_pt, slice_1_pt, reshape_1_pt, slice_2_pt, flatten_3_pt],
+                dim=cat_dim,
+            )
+            add_5_pt = x3_pt + x3_pt
+            reshape_6_pt = torch.reshape(add_5_pt, [-1, M3, N])
+            reshape_7_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_7, N])
+            concat_8_pt = torch.cat([x0_pt, reshape_7_pt, reshape_6_pt], dim=cat_dim)
+            add_9_pt = x0_pt + x0_pt
+            flatten_10_pt = torch.flatten(concat_8_pt, 1, -1)
+            reshape_11_pt = torch.reshape(add_9_pt, [-1, M0 * N])
+            y_pt = torch.cat([x1_pt, reshape_11_pt, flatten_10_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_3(self):
+        self._test_move_strided_reshape_cat_3(
+            M0=4,
+            M1=6,
+            M2=9,
+            M3=16,
+            N=8,
+            test_name="test_move_strided_reshape_cat_3",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_4(self, M0, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # slice_0 = slice(x4)
+        # concat_4 = concatenate(x2, slice_0) # 2d
+        # reshape_7 = reshape(concat_4)
+        # y = concatenate(x0, reshape_7) # 3d
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        M4 = 10 * M0
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4 * N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+
+        slice_start_indices_0 = [None, 0]
+        slice_end_indices_0 = [None, N]
+        slice_0 = ops.dynamic_slice()(X4, slice_start_indices_0, slice_end_indices_0)
+        cat_dim = 1
+        concat_4 = ops.concatenate()([X2, slice_0], dim=cat_dim)
+        reshape_to_shape_7 = (
+            sum([t.shape()[cat_dim].value() for t in [X2, slice_0]]) // N
+        )
+        reshape_7 = ops.reshape()(concat_4, [-1, reshape_to_shape_7, N])
+        Y = ops.concatenate()([X0, reshape_7], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+        output_tensors = {op._attrs["outputs"][0] for op in sorted_ops}
+        self.assertEqual(len(output_tensors), 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4 * N], dtype)
+
+            slice_indices_0 = [
+                slice(i, j) for i, j in zip(slice_start_indices_0, slice_end_indices_0)
+            ]
+            slice_0_pt = x4_pt[slice_indices_0]
+
+            concat_4_pt = torch.cat([x2_pt, slice_0_pt], dim=cat_dim)
+            reshape_7_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_7, N])
+            y_pt = torch.cat([x0_pt, reshape_7_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x2": x2_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_4(self):
+        self._test_move_strided_reshape_cat_4(
+            M0=4,
+            M2=9,
+            N=8,
+            test_name="test_move_strided_reshape_cat_4",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_5(self, M0, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # slice_0 = slice(x4)
+        # concat_4 = concatenate(x2, slice_0) # 2d
+        # reshape_7 = reshape(concat_4)
+        # concat_8 = concatenate(x0, reshape_7) # 3d
+        # flatten_10 = reshape(concat_8) # 2d
+        # y = concatenate(flatten_10, x2) # 2d
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        M4 = 10 * M0
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4 * N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+
+        slice_start_indices_0 = [None, 0]
+        slice_end_indices_0 = [None, N]
+        slice_0 = ops.dynamic_slice()(X4, slice_start_indices_0, slice_end_indices_0)
+        cat_dim = 1
+        concat_4 = ops.concatenate()([X2, slice_0], dim=cat_dim)
+        reshape_to_shape_7 = (
+            sum([t.shape()[cat_dim].value() for t in [X2, slice_0]]) // N
+        )
+        reshape_7 = ops.reshape()(concat_4, [-1, reshape_to_shape_7, N])
+        concat_8 = ops.concatenate()([X0, reshape_7], dim=cat_dim)
+        flatten_10 = ops.flatten(start_dim=1, end_dim=-1)(concat_8)
+        Y = ops.concatenate()([flatten_10, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+        output_tensors = {op._attrs["outputs"][0] for op in sorted_ops}
+        self.assertEqual(len(output_tensors), 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4 * N], dtype)
+
+            slice_indices_0 = [
+                slice(i, j) for i, j in zip(slice_start_indices_0, slice_end_indices_0)
+            ]
+            slice_0_pt = x4_pt[slice_indices_0]
+
+            concat_4_pt = torch.cat([x2_pt, slice_0_pt], dim=cat_dim)
+            reshape_7_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_7, N])
+            concat_8_pt = torch.cat([x0_pt, reshape_7_pt], dim=cat_dim)
+            flatten_10_pt = torch.flatten(concat_8_pt, 1, -1)
+            y_pt = torch.cat([flatten_10_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x2": x2_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_5(self):
+        self._test_move_strided_reshape_cat_5(
+            M0=4,
+            M2=9,
+            N=8,
+            test_name="test_move_strided_reshape_cat_5",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_6(self, M0, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # add_0 = add(x4, x4)
+        # concat_4 = concatenate(x2, add_0) # 2d
+        # reshape_7 = reshape(concat_4)
+        # concat_8 = concatenate(x0, reshape_7) # 3d
+        # flatten_10 = reshape(concat_8) # 2d
+        # y = concatenate(flatten_10, x2) # 2d
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        M4 = 10 * M0
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4 * N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X4, X4)
+        cat_dim = 1
+        concat_4 = ops.concatenate()([X2, add_0], dim=cat_dim)
+        reshape_to_shape_7 = sum([t.shape()[cat_dim].value() for t in [X2, add_0]]) // N
+        reshape_7 = ops.reshape()(concat_4, [-1, reshape_to_shape_7, N])
+        concat_8 = ops.concatenate()([X0, reshape_7], dim=cat_dim)
+        flatten_10 = ops.flatten(start_dim=1, end_dim=-1)(concat_8)
+        Y = ops.concatenate()([flatten_10, X2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 2)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+        output_tensors = {op._attrs["outputs"][0] for op in sorted_ops}
+        self.assertEqual(len(output_tensors), 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4 * N], dtype)
+
+            add_0_pt = x4_pt + x4_pt
+            concat_4_pt = torch.cat([x2_pt, add_0_pt], dim=cat_dim)
+            reshape_7_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_7, N])
+            concat_8_pt = torch.cat([x0_pt, reshape_7_pt], dim=cat_dim)
+            flatten_10_pt = torch.flatten(concat_8_pt, 1, -1)
+            y_pt = torch.cat([flatten_10_pt, x2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x2": x2_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_6(self):
+        self._test_move_strided_reshape_cat_6(
+            M0=4,
+            M2=9,
+            N=8,
+            test_name="test_move_strided_reshape_cat_6",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_7(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)
+        # concat_1 = concatenate(add_0, x2)
+        # reshape_2 = reshape(concat_1)
+        # add_3 = add(x4, reshape_2)
+        # concat_4 = concatenate(x3, reshape_2, x3)
+        # reduce_5 = reduce_sum(add_3)
+        # reduce_6 = reduce_sum(concat_5)
+        # y = add(reduce_5, reduce_6)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M0 + M2), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_1, [-1, M0 + M2, N])
+        add_3 = ops.elementwise(FuncEnum.ADD)(X4, reshape_2)
+        concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)
+        reduce_dim = cat_dim
+        reduce_5 = ops.reduce_sum(reduce_dim)(add_3)
+        reduce_6 = ops.reduce_sum(reduce_dim)(concat_4)
+        Y = ops.elementwise(FuncEnum.ADD)(reduce_5, reduce_6)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 7)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 2)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4, N], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, M0 + M2, N])
+            add_3_pt = x4_pt + reshape_2_pt
+            concat_4_pt = torch.cat([x3_pt, reshape_2_pt, x3_pt], dim=cat_dim)
+            reduce_5_pt = torch.sum(add_3_pt, reduce_dim)
+            reduce_6_pt = torch.sum(concat_4_pt, reduce_dim)
+            y_pt = reduce_5_pt + reduce_6_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_strided_reshape_cat_7(self):
+        self._test_move_strided_reshape_cat_7(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=3,
+            N=8,
+            test_name="test_move_strided_reshape_cat_7",
+            dtype="float16",
+        )
+        self._test_move_strided_reshape_cat_7(
+            M0=4,
+            M1=4,
+            M2=5,
+            M3=3,
+            N=7,
+            test_name="test_move_strided_reshape_cat_7",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_8(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # bmm_crr_add_3 = bmm_crr_add(reshape_2, x4, x5) # 3d
+        # concat_4 = concatenate(x3, reshape_2, x3) # 3d
+        # reshape_5 = reshape(concat_4) # 2d
+        # add_6 = add(reshape_5, x6) # 2d
+        # concat_7 = concatenate(x0, reshape_5, x0)
+        # reshape_8 = reshape(bmm_crr_add_3) # 2d
+        # reduce_9 = reduce_sum(reshape_8)
+        # reduce_10 = reduce_sum(add_6)
+        # reduce_11 = reduce_sum(concat_7)
+        # add_12 = add(reduce_9, reduce_10)
+        # y = add(add_12, reduce_11)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        X5 = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="x5",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        bmm_K = M0 + M2
+        reshape_2 = ops.reshape()(concat_1, [-1, bmm_K, N])
+        # bmm_crr_add_3[batch, N, N] = bmm_crr_add(
+        #     reshape_2[batch, bmm_K, N], X4[bmm_K, N], X5[N]
+        # )
+        bmm_crr_add_3 = ops.bmm_crr_add()(reshape_2, X4, X5)
+        concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)  # 3d
+        reshape_to_shape_5 = (
+            sum([t.shape()[cat_dim].value() for t in [X3, reshape_2, X3]]) * N
+        )
+        reshape_5 = ops.reshape()(concat_4, [-1, reshape_to_shape_5])  # 2d
+        X6 = Tensor(
+            shape=[batch_dim, IntImm(reshape_to_shape_5)],
+            dtype=dtype,
+            name="x6",
+            is_input=True,
+        )
+        add_6 = ops.elementwise(FuncEnum.ADD)(reshape_5, X6)
+        concat_7 = ops.concatenate()([X0, reshape_5, X0], dim=cat_dim)  # 2d
+        reshape_8 = ops.reshape()(bmm_crr_add_3, [-1, N * N])  # 2d
+        reduce_dim = cat_dim
+        reduce_9 = ops.reduce_sum(reduce_dim)(reshape_8)
+        reduce_10 = ops.reduce_sum(reduce_dim)(add_6)
+        reduce_11 = ops.reduce_sum(reduce_dim)(concat_7)
+        add_12 = ops.elementwise(FuncEnum.ADD)(reduce_9, reduce_10)
+        Y = ops.elementwise(FuncEnum.ADD)(add_12, reduce_11)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 10)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 3)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([M4, N], dtype)
+            x5_pt = get_random_torch_tensor([N], dtype)
+            x6_pt = get_random_torch_tensor([batch, reshape_to_shape_5], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, bmm_K, N])
+            reshape_2_trans_pt = torch.transpose(reshape_2_pt, -2, -1)
+            bmm_crr_add_3_pt = torch.matmul(reshape_2_trans_pt, x4_pt) + x5_pt
+            concat_4_pt = torch.cat([x3_pt, reshape_2_pt, x3_pt], dim=cat_dim)
+            reshape_5_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_5])
+            add_6_pt = reshape_5_pt + x6_pt
+            concat_7_pt = torch.cat([x0_pt, reshape_5_pt, x0_pt], dim=cat_dim)
+            reshape_8_pt = torch.reshape(bmm_crr_add_3_pt, [-1, N * N])
+            reduce_9_pt = torch.sum(reshape_8_pt, reduce_dim)
+            reduce_10_pt = torch.sum(add_6_pt, reduce_dim)
+            reduce_11_pt = torch.sum(concat_7_pt, reduce_dim)
+            add_12_pt = reduce_9_pt + reduce_10_pt
+            y_pt = add_12_pt + reduce_11_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x5": x5_pt,
+                "x6": x6_pt,
+            }
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_move_strided_reshape_cat_8(self):
+        self._test_move_strided_reshape_cat_8(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=4,
+            N=4,
+            test_name="test_move_strided_reshape_cat_8",
+            dtype="float16",
+        )
+        self._test_move_strided_reshape_cat_8(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=3,
+            N=4,
+            test_name="test_move_strided_reshape_cat_8",
+            dtype="float16",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_slice_reshape_scatter.py b/tests/unittest/compiler/test_slice_reshape_scatter.py
index c4a07d0ac..d430e2c2b 100644
--- a/tests/unittest/compiler/test_slice_reshape_scatter.py
+++ b/tests/unittest/compiler/test_slice_reshape_scatter.py
@@ -45,6 +45,9 @@ def _run_one_test(
         reshape_to,
         input_x_shape,
         dim,
+        # when it's true, it means that the reshape can be moved to the front
+        # of the first concat op so that we can fuse all ops into a single concat
+        reshape_movable=False,
         add_tanh=False,
         dtype="float16",
     ):
@@ -100,17 +103,22 @@ def _run_one_test(
         module = compile_model(
             Y, target, "./tmp", "slice_scatter_reshape_cat", dll_name=dll_name
         )
-        Y_src_ops = Y._attrs["src_ops"]
-        np.testing.assert_equal(len(Y_src_ops), 2)
-        np.testing.assert_equal(concat_op_2 in Y_src_ops, True)
-        np.testing.assert_equal(concat_op_2._attrs["input_masks"], [True, False, True])
-        Y_src_ops_list = list(Y_src_ops)
-        slice_reshape_scatter_op = (
-            Y_src_ops_list[1] if concat_op_2 == Y_src_ops_list[0] else Y_src_ops_list[0]
-        )
-        np.testing.assert_equal(
-            slice_reshape_scatter_op._attrs["op"], "slice_reshape_scatter"
-        )
+        Y_src_ops = list(Y._attrs["src_ops"])
+        if reshape_movable:
+            np.testing.assert_equal(len(Y_src_ops), 1)
+            np.testing.assert_equal(Y_src_ops[0]._attrs["op"], "concatenate")
+        else:
+            np.testing.assert_equal(len(Y_src_ops), 2)
+            np.testing.assert_equal(concat_op_2 in Y_src_ops, True)
+            np.testing.assert_equal(
+                concat_op_2._attrs["input_masks"], [True, False, True]
+            )
+            slice_reshape_scatter_op = (
+                Y_src_ops[1] if concat_op_2 == Y_src_ops[0] else Y_src_ops[0]
+            )
+            np.testing.assert_equal(
+                slice_reshape_scatter_op._attrs["op"], "slice_reshape_scatter"
+            )
 
         input_name_to_index = module.get_input_name_to_index_map()
         inputs = [0 for i in range(len(Xs_pt) + 1)]
@@ -130,6 +138,7 @@ def test_slice_scatter_reshape_sm80(self):
             reshape_to=[1, 2, 2],
             input_x_shape=[1, 1, 2],
             dim=1,
+            reshape_movable=True,
         )
         self._run_one_test(
             input_shapes=[[10, 20], [15, 44]],
@@ -207,13 +216,9 @@ def test_slice_scatter_reshape_float16_2(self):
         dll_name = "test.so"
         test_name = "slice_scatter_reshape_cat_float16_2"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
-        Y_src_ops = Y._attrs["src_ops"]
-        self.assertEqual(len(Y_src_ops), 3)
-        slice_reshape_scatter_cnt = 0
-        for op in Y_src_ops:
-            if op._attrs["op"] == "slice_reshape_scatter":
-                slice_reshape_scatter_cnt += 1
-        self.assertEqual(slice_reshape_scatter_cnt, 2)
+        Y_src_ops = list(Y._attrs["src_ops"])
+        self.assertEqual(len(Y_src_ops), 1)
+        self.assertEqual(Y_src_ops[0]._attrs["op"], "concatenate")
 
         slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
 
diff --git a/tests/unittest/compiler/test_split_large_slice_scatter.py b/tests/unittest/compiler/test_split_large_slice_scatter.py
index 582091eb4..d55d322eb 100644
--- a/tests/unittest/compiler/test_split_large_slice_scatter.py
+++ b/tests/unittest/compiler/test_split_large_slice_scatter.py
@@ -68,15 +68,9 @@ def _test_slice_scatter_reshape_float16(
         test_name = "slice_scatter_large_inputs"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         self.test_count += 1
-        Y_src_ops = Y._attrs["src_ops"]
-        # We have a single concat op. All the rest are slice_reshape_scatter ops
-        concat_cnt = 0
-        for op in Y_src_ops:
-            if op._attrs["op"] == "concatenate":
-                concat_cnt += 1
-                continue
-            self.assertEqual(op._attrs["op"], "slice_reshape_scatter")
-        self.assertEqual(concat_cnt, 1)
+        Y_src_ops = list(Y._attrs["src_ops"])
+        self.assertEqual(len(Y_src_ops), 5)
+        self.assertTrue(all(op._attrs["op"] == "concatenate" for op in Y_src_ops))
 
         input0_pt = get_random_torch_tensor(input0_shape, dtype)
         input1_pt = get_random_torch_tensor(input1_shape, dtype)
diff --git a/tests/unittest/compiler/test_strided_reshape_cat.py b/tests/unittest/compiler/test_strided_reshape_cat.py
index 344b6d9d3..9027035a4 100644
--- a/tests/unittest/compiler/test_strided_reshape_cat.py
+++ b/tests/unittest/compiler/test_strided_reshape_cat.py
@@ -114,8 +114,16 @@ def _test_strided_reshape_cat(self, num_cat_ops=1, dtype="float16"):
                 concat_op._attrs["input_masks"], [False, False, True, False]
             )
         else:
-            np.testing.assert_equal(concat_op_1._attrs["input_masks"], [False, False])
-            np.testing.assert_equal(concat_op_2._attrs["input_masks"], [True, False])
+            Y_src_ops = list(Y_src_ops)
+            np.testing.assert_equal(len(Y_src_ops), 2)
+            concat_op = (
+                Y_src_ops[0]
+                if Y_src_ops[0]._attrs["op"] == "concatenate"
+                else Y_src_ops[1]
+            )
+            np.testing.assert_equal(
+                concat_op._attrs["input_masks"], [False, False, True, False]
+            )
 
         expected_inputs_group_gemm_op = [X1, W1, X2, W2, X3, W3]
         np.testing.assert_equal(

From cce9405273d66887ca41a0dbd5aac067dca8094c Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 22 Mar 2023 15:26:14 -0700
Subject: [PATCH 320/638] Fix flaky tests (#469)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/469

Fix two flaky tests:

1. Move `test_conv_bias_act_few_channels` completely to A100, as even `float16` doesn't seem to work properly on V100.

2. In `test_slice_view_strided`, fix the seed and set wider tolerance bound for `bfloat16`.

Reviewed By: tenpercent

Differential Revision: D44308111

fbshipit-source-id: 5dfdbe2642d31f53db074273f3f6ae44b0718b29
---
 .../compiler/test_slice_view_strided.py       | 39 ++++++++-----------
 .../ops/test_conv_bias_act_few_channels.py    | 30 +++++++-------
 2 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index af91821ad..5b97fbf1f 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -31,7 +31,18 @@
 from parameterized import parameterized
 
 
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-2, "rtol": 1e-2},
+    "float32": {"atol": 1e-2, "rtol": 1e-2},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
+
+
 class SliceViewStridedOpTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     @parameterized.expand(
         filter_test_cases_by_params(
             {
@@ -87,10 +98,7 @@ def test_slice_view_gemm_fusible(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(
@@ -148,7 +156,7 @@ def test_slice_view_gemm_non_fusible(self, dtype):
             )
 
             # Do comparisons.
-            torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(
@@ -228,10 +236,7 @@ def test_slice_flatten_concat_fusible_1(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(
@@ -305,10 +310,7 @@ def test_slice_flatten_concat_fusible_2(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(
@@ -381,10 +383,7 @@ def test_slice_reshape_concat_fusible_1(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(
@@ -460,12 +459,8 @@ def test_slice_reshape_concat_fusible_2(self, dtype):
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(y, y_pt, atol=5e-2, rtol=5e-2),
-                f"batch_size: {batch_size}, y: {y}, y_pt: {y_pt}",
-            )
+            torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_conv_bias_act_few_channels.py b/tests/unittest/ops/test_conv_bias_act_few_channels.py
index 1f2329248..2511e6883 100644
--- a/tests/unittest/ops/test_conv_bias_act_few_channels.py
+++ b/tests/unittest/ops/test_conv_bias_act_few_channels.py
@@ -19,11 +19,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
-    get_random_torch_tensor,
-    TestEnv,
-)
+from aitemplate.testing.test_utils import get_random_torch_tensor
 
 from parameterized import parameterized
 
@@ -34,6 +30,10 @@ def hard_swish(x):
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class ConvBiasActFewChannelsTestCase(unittest.TestCase):
     def _test_conv_bias_relu_few_channels(
         self,
@@ -94,12 +94,10 @@ def _test_conv_bias_relu_few_channels(
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32")],
-            }
-        )
+        [
+            ("float16"),
+            ("float32"),
+        ]
     )
     def test_relu(self, dtype):
         self._test_conv_bias_relu_few_channels(
@@ -171,12 +169,10 @@ def _test_conv_bias_hardswish_few_channels(
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32")],
-            }
-        )
+        [
+            ("float16"),
+            ("float32"),
+        ]
     )
     def test_hardswish(self, dtype):
         self._test_conv_bias_hardswish_few_channels(

From 7a3602601cd4531c7b0bef5f478e0740ab7641d4 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 22 Mar 2023 16:25:33 -0700
Subject: [PATCH 321/638] Easier API and additional example for symbolic usage.
 (#470)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/470

Easier API and additional example for symbolic usage.

Reviewed By: ipiszy, chenyang78

Differential Revision: D44286113

fbshipit-source-id: e26e5a966b5bf5505e60e5b48cdb2972053b9178
---
 python/aitemplate/compiler/base.py            | 10 +++++++--
 .../compiler/ops/common/int_elementwise.py    |  4 +---
 .../compiler/ops/common/view_ops.py           | 18 ++++++++-------
 .../compiler/ops/tensor/concatenate.py        |  6 +++--
 python/aitemplate/compiler/symbolic.py        | 22 +++++++++++++++++--
 python/aitemplate/utils/shape_utils.py        | 16 ++++++++++----
 tests/unittest/ops/test_concatenate.py        |  9 +++-----
 tests/unittest/ops/test_flatten.py            | 18 +++++----------
 8 files changed, 64 insertions(+), 39 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 79e404632..d6b47a5d5 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -25,6 +25,7 @@
 from typing import Any, Dict, List, Optional, Sequence, Set, Union
 
 import numpy as np
+import sympy
 
 from aitemplate.compiler import symbolic
 from aitemplate.compiler.dtype import get_dtype_size, normalize_dtype
@@ -89,6 +90,7 @@ def __init__(
         self,
         values: List[int],
         name: str = None,
+        symbolic_value: Optional[sympy.Basic] = None,
     ) -> None:
         """Initializes an IntVar.
 
@@ -109,6 +111,9 @@ def __init__(
         name : str, optional
             Name of this dimension, by default None.
             This field must be set for dims which are used by input tensors.
+
+        symbolic_value: sympy.Basic, optional
+            The symbolic value for this IntVar. If None is provided, we will generate a symbol for this IntVar.
         """
         super().__init__()
         self._attrs["name"] = name
@@ -128,9 +133,10 @@ def __init__(
             self._attrs["symbolic_value"] = self._attrs["values"][0]
             self._attrs["values"] = self._attrs["values"] * 2
         else:
-            symbolic_value = symbolic.create_new_symbol(name, values)
+            if symbolic_value is None:
+                symbolic_value = symbolic.create_new_symbol(name, values)
+                symbolic.store_intvar(symbolic_value.name, self)
             self._attrs["symbolic_value"] = symbolic_value
-            symbolic.store_intvar(symbolic_value.name, self)
 
     def __str__(self) -> str:
         return pformat(self._attrs, indent=2)
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
index e56cc1c37..7a2bfe8ca 100644
--- a/python/aitemplate/compiler/ops/common/int_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -117,8 +117,7 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
             sym_values = reduce(INT_ELEMENTWISE_FUNC[FuncEnum.DIV], sym_vars)
         else:
             raise RuntimeError(f"Unsupported calculation type {self._attrs['func']}!")
-        dim = shape_utils.gen_int_var_min_max(values)
-        dim._attrs["symbolic_value"] = sym_values
+        dim = shape_utils.gen_int_var_min_max(values, symbolic_value=sym_values)
         for arg, iv in zip(args, int_vars):
             arg._attrs["int_var"] = iv
             assert not arg.is_a_const_num(), f"{arg} cannot be constant"
@@ -126,7 +125,6 @@ def __call__(self, *args: IntVarTensor) -> Tensor:
         self._attrs["inputs"] = list(args)
         self._set_depth()
         output = IntVarTensor(dim, src_ops={self})
-        output._attrs["symbolic_value"] = sym_values
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index a2e76dfd2..01a91ed16 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -146,8 +146,11 @@ def make_output_shape_from_int_vars(
             else:
                 # dynamic dimension
                 dim_name = int_var._attrs["name"]
-                var = IntVar(name=dim_name, values=dim_values)
-                var._attrs["symbolic_value"] = int_var._attrs["symbolic_value"]
+                var = IntVar(
+                    name=dim_name,
+                    values=dim_values,
+                    symbolic_value=int_var._attrs["symbolic_value"],
+                )
                 output_shape.append(var)
         return output_shape
 
@@ -333,8 +336,7 @@ def _infer_shapes(self, x: Tensor):
                             ), "Unable to deduce dynamic symbol"
 
                             values = simplify_intvar_values(dynamic_symbol)
-                            new_var = IntVar(values)
-                            new_var._attrs["symbolic_value"] = dynamic_symbol
+                            new_var = IntVar(values, symbolic_value=dynamic_symbol)
 
                             y_shapes.append(new_var)
                     elif isinstance(val, int):
@@ -342,8 +344,9 @@ def _infer_shapes(self, x: Tensor):
                     elif val in x_symbolic_shapes_mapping:
                         y_shapes.append(x_symbolic_shapes_mapping[val])
                     elif is_symbolic(val):
-                        val_var = gen_int_var_min_max(new_shape_values[idx])
-                        val_var._attrs["symbolic_value"] = val
+                        val_var = gen_int_var_min_max(
+                            new_shape_values[idx], symbolic_value=val
+                        )
                         y_shapes.append(val_var)
                     else:
                         raise ValueError(f"Unknown sym type for handling {val}")
@@ -433,8 +436,7 @@ def _infer_shapes(self, x: Tensor):
         if min_val == max_val:
             flatten_shape = IntImm(value=min_val)
         else:
-            flatten_shape = IntVar(values=[min_val, max_val])
-            flatten_shape._attrs["symbolic_value"] = sym_val
+            flatten_shape = IntVar(values=[min_val, max_val], symbolic_value=sym_val)
         new_shapes.append(flatten_shape)
 
         for var in x._attrs["shape"][end + 1 :]:
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index 15a02fabc..d207bdee1 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -91,14 +91,16 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             if idx == dim:
                 min_value_sum = sum(value[0] for value in lst)
                 max_value_sum = sum(value[-1] for value in lst)
-                shape_var = shape_utils.gen_int_var([min_value_sum, max_value_sum])
-                shape_var._attrs["symbolic_value"] = reduce(
+                sym_val = reduce(
                     lambda x, y: x + y,
                     [
                         input_shape[idx]._attrs["symbolic_value"]
                         for input_shape in input_shapes
                     ],
                 )
+                shape_var = shape_utils.gen_int_var(
+                    [min_value_sum, max_value_sum], symbolic_value=sym_val
+                )
                 output_shape.append(shape_var)
             else:
                 output_dim = input_shapes[0][idx]
diff --git a/python/aitemplate/compiler/symbolic.py b/python/aitemplate/compiler/symbolic.py
index 9ccd019f6..2edcfb195 100644
--- a/python/aitemplate/compiler/symbolic.py
+++ b/python/aitemplate/compiler/symbolic.py
@@ -14,8 +14,26 @@
 #
 """
 Symbolic helpers for AITemplate.
-
-For interesting how to use Sympy, check: https://docs.sympy.org/latest/tutorials/intro-tutorial/intro.html
+AITemplate leverages Sympy to do symbolic computations for shapes.
+The core of Sympy is surrounded around the class "Symbol". We could apply operations
+on Symbols (i.e. add/mul/power/etc.) Which could help us do basic arithmetic with
+unknown values.
+The symbolic-ness comes from representation that includes Symbol (i.e. sym_1 + 100.)
+
+Example Usage:
+A = IntVar(...)
+sym_A = A.symbolic_value() # equivalent of A._attrs["symbolic_value"]
+
+# do something about sym_A, some common usage include:
+new_sym = sym_A + 100
+new_sym = sym_A - 100
+new_sym = sym_A * 2
+new_sym = sym_A * sym_B
+
+# We could then assign the symbolic value to a new IntVar.
+new_var = IntVar(..., symbolic_value=new_sym)
+
+For more advanced usage on Sympy, check: https://docs.sympy.org/latest/tutorials/intro-tutorial/intro.html
 """
 from __future__ import annotations
 
diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index 3c22da90b..7be8df950 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -18,8 +18,12 @@
 
 from typing import List, Optional
 
+import sympy
 
-def gen_int_var(values: List[int], name: str = None):
+
+def gen_int_var(
+    values: List[int], name: str = None, symbolic_value: Optional[sympy.Basic] = None
+):
     """
     A helper function to generate IntImm or IntVar depending on the length of values.
     """
@@ -29,17 +33,21 @@ def gen_int_var(values: List[int], name: str = None):
     if len(values) == 1:
         return IntImm(values[0], name=name)
     elif len(values) > 1:
-        return IntVar(values, name=name)
+        return IntVar(values, name=name, symbolic_value=symbolic_value)
     else:
         raise RuntimeError("Unsupported dim definition: {}".format(values))
 
 
-def gen_int_var_min_max(values: List[int], name: str = None):
+def gen_int_var_min_max(
+    values: List[int], name: str = None, symbolic_value: Optional[sympy.Basic] = None
+):
     """
     A helper function to generate IntImm or IntVar depending on the length of values.
     Only keeps [min, max] pairs if there are more than 2 values.
     """
-    return gen_int_var([min(values), max(values)], name=name)
+    return gen_int_var(
+        [min(values), max(values)], name=name, symbolic_value=symbolic_value
+    )
 
 
 def get_broadcast_max_shape(shape1, shape2):
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index 87e686f23..94073251e 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -398,8 +398,7 @@ def test_concatenate_shape_var(self):
         sym3 = var3._attrs["symbolic_value"]
 
         in_shapes = [[var, 2, 3] for var in [var1, var2, var3]]
-        ovar1 = IntVar(values=[11, 18])
-        ovar1._attrs["symbolic_value"] = sym1 + sym2 + sym3
+        ovar1 = IntVar(values=[11, 18], symbolic_value=sym1 + sym2 + sym3)
         self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], 0)
         self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], -3)
 
@@ -412,8 +411,7 @@ def test_concatenate_shape_mix(self):
         sym2 = var2._attrs["symbolic_value"]
 
         in_shapes = [[var1, 2, 3], [imm1, 2, 3], [imm2, 2, 3], [var2, 2, 3]]
-        ovar1 = IntVar(values=[40, 43])
-        ovar1._attrs["symbolic_value"] = sym1 + sym2 + 17 + 19
+        ovar1 = IntVar(values=[40, 43], symbolic_value=sym1 + sym2 + 17 + 19)
         self._test_concatenate_shape(in_shapes, [ovar1, 2, 3], 0)
 
     def test_concatenate_shape_compatible(self):
@@ -423,8 +421,7 @@ def test_concatenate_shape_compatible(self):
         in_shapes = [[var1, 2, 3], [var1, 2, 3]]
         self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
-        dup_var1 = IntVar(values=[1, 2])
-        dup_var1._attrs["symbolic_value"] = sym1
+        dup_var1 = IntVar(values=[1, 2], symbolic_value=sym1)
         in_shapes = [[var1, 2, 3], [dup_var1, 2, 3]]
         self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
diff --git a/tests/unittest/ops/test_flatten.py b/tests/unittest/ops/test_flatten.py
index ca2dca531..d63261e0c 100644
--- a/tests/unittest/ops/test_flatten.py
+++ b/tests/unittest/ops/test_flatten.py
@@ -208,15 +208,12 @@ def test_flatten_shape_var(self):
         sym3 = var3._attrs["symbolic_value"]
         in_shape = [var1, var2, var3]
 
-        ovar1 = IntVar(values=[21, 110])
-        ovar1._attrs["symbolic_value"] = sym1 * sym2 * sym3
+        ovar1 = IntVar(values=[21, 110], symbolic_value=sym1 * sym2 * sym3)
         self._test_flatten_shape(in_shape, [ovar1], 0, 2)
         self._test_flatten_shape(in_shape, [ovar1], 0, -1)
-        ovar1 = IntVar(values=[3, 10])
-        ovar1._attrs["symbolic_value"] = sym1 * sym2
+        ovar1 = IntVar(values=[3, 10], symbolic_value=sym1 * sym2)
         self._test_flatten_shape(in_shape, [ovar1, var3], 0, 1)
-        ovar1 = IntVar(values=[21, 55])
-        ovar1._attrs["symbolic_value"] = sym2 * sym3
+        ovar1 = IntVar(values=[21, 55], symbolic_value=sym2 * sym3)
         self._test_flatten_shape(in_shape, [var1, ovar1], 1, 2)
         self._test_flatten_shape(in_shape, [var1, ovar1], 1, -1)
 
@@ -234,14 +231,11 @@ def test_flatten_shape_mix(self):
         ovar1 = IntVar(values=[51, 170])
         ovar1._attrs["symbolic_value"] = sym1 * 17 * sym2
         self._test_flatten_shape(in_shape, [ovar1, var3, imm2], 0, 2)
-        ovar1 = IntVar(values=[6783, 35530])
-        ovar1._attrs["symbolic_value"] = 323 * sym1 * sym2 * sym3
+        ovar1 = IntVar(values=[6783, 35530], symbolic_value=323 * sym1 * sym2 * sym3)
         self._test_flatten_shape(in_shape, [ovar1], 0, -1)
-        ovar1 = IntVar(values=[357, 935])
-        ovar1._attrs["symbolic_value"] = 17 * sym2 * sym3
+        ovar1 = IntVar(values=[357, 935], symbolic_value=17 * sym2 * sym3)
         self._test_flatten_shape(in_shape, [var1, ovar1, imm2], 1, 3)
-        ovar1 = IntVar(values=[6783, 17765])
-        ovar1._attrs["symbolic_value"] = 323 * sym2 * sym3
+        ovar1 = IntVar(values=[6783, 17765], symbolic_value=323 * sym2 * sym3)
         self._test_flatten_shape(in_shape, [var1, ovar1], 1, -1)
 
 
From f40a17922e092344e41d1f16cb23ae44b4644cdd Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Wed, 22 Mar 2023 17:40:14 -0700
Subject: [PATCH 322/638] Add an FMHA-style-b2b-bmm kernel into
 aitemplate/static (#446)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/446

ATT, this diff only has the C++ CUDA kernel implementation.

It's adapted from the CUTLASS example https://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention, with several changes:

1) replaces softmax with arbitrary scale - activation - scale functions;
2) causal masks: supports both lower left and upper right causal masks;
3) accum_t: templatize it;

Reviewed By: chenyang78

Differential Revision: D44156978

fbshipit-source-id: c900fa27069430b574d5d8995857897c19b471aa
---
 .../attention_scaling_coefs_updater.h         |  384 +++
 .../kernels/fmha_style_b2b_bmm/debug_utils.h  |  201 ++
 .../fmha_style_b2b_bmm/epilogue_pipelined.h   |  635 +++++
 .../epilogue_rescale_output.h                 |  267 +++
 .../fmha_style_b2b_bmm/find_default_mma.h     |  193 ++
 .../fmha_style_b2b_bmm/gemm_kernel_utils.h    |  254 ++
 .../epilogue_predicated_tile_iterator.h       |  753 ++++++
 .../iterators/make_residual_last.h            |  103 +
 ...cated_tile_access_iterator_residual_last.h | 2118 ++++++++++++++++
 .../predicated_tile_iterator_residual_last.h  | 2123 +++++++++++++++++
 .../iterators/transpose_warp_iterator.h       |   59 +
 .../iterators/warp_iterator_from_smem.h       |  281 +++
 .../fmha_style_b2b_bmm/kernel_forward.h       |  856 +++++++
 .../fmha_style_b2b_bmm/mma_from_smem.h        | 1691 +++++++++++++
 .../transform/tile_smem_loader.h              |   94 +
 15 files changed, 10012 insertions(+)
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/attention_scaling_coefs_updater.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/debug_utils.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/epilogue_pipelined.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/epilogue_rescale_output.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/find_default_mma.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/gemm_kernel_utils.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/epilogue_predicated_tile_iterator.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/make_residual_last.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_access_iterator_residual_last.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_iterator_residual_last.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/transpose_warp_iterator.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/mma_from_smem.h
 create mode 100644 static/include/kernels/fmha_style_b2b_bmm/transform/tile_smem_loader.h

diff --git a/static/include/kernels/fmha_style_b2b_bmm/attention_scaling_coefs_updater.h b/static/include/kernels/fmha_style_b2b_bmm/attention_scaling_coefs_updater.h
new file mode 100644
index 000000000..170fa894a
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/attention_scaling_coefs_updater.h
@@ -0,0 +1,384 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Mostly copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#pragma once
+
+#include "cutlass/functional.h"
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/matrix_shape.h"
+#include "fmha_style_b2b_bmm/gemm_kernel_utils.h"
+
+/* Iterates on the accumulator and corresponding position on result matrix
+
+All of this is done on registers, before we store all of this
+on shared memory for the next matmul with Value.
+
+We have multiple implementations, because each configuration has a different way
+of iterating in the accumulators.
+*/
+
+template <typename BASE, typename T, typename accum_t, int kWarpSize>
+struct RegisterOps {};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterSm80
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterSm80<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    return cutlass::MatrixCoord(
+        quad + tile_offset.row() * Shape::kRow,
+        lane_in_quad * kElementsPerAccess +
+            tile_offset.column() * Shape::kColumn);
+  }
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < kAccumulatorRows; ++row) {
+        int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+            row * kRowsPerTile + lane_offset.row();
+        beginRow(accum_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+              (mma_n * Policy::MmaIterations::kRow + mma_m);
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn +
+                col + lane_offset.column();
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+            op(accum_m, accum_n, idx);
+          }
+        }
+
+        endRow(accum_m);
+      }
+    }
+  }
+
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    // In each warp, 4 threads will work on the same row
+    // - the ones with the same `quad`
+    auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1);
+    myValue = fn(myValue, otherV);
+    otherV = __shfl_xor_sync(0xffffffff, myValue, 2);
+    myValue = fn(myValue, otherV);
+    int lane_in_quad = (lane_id & 3);
+    return lane_in_quad == 0;
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterVolta
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterVolta<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  using Element = accum_t;
+
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n;
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    return cutlass::MatrixCoord(
+        accum_m + tile_offset.row() * Shape::kRow,
+        accum_n + tile_offset.column() * Shape::kColumn);
+  }
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+          int accum_m = tile_m * Policy::InterleavedTile::kRow +
+              mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row();
+          beginRow(accum_m);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn;
+               ++tile_n) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn;
+                 ++mma_n) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int p = 0; p < kAccumulatorPatials; ++p) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int mma_accum_start =
+                      (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                            Policy::MmaIterations::kColumn +
+                        mma_n) *
+                           Policy::MmaIterations::kRow +
+                       mma_m) *
+                      kElementsPerMma;
+                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                      mma_n * QuadShapePerPatialMma::kColumn +
+                      p * Policy::InterleavedTile::kColumn / 2 + n +
+                      lane_offset.column();
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  op(accum_m, accum_n, idx);
+                }
+              }
+            }
+          }
+          endRow(accum_m);
+        }
+      }
+    }
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterSimt
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterSimt<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  using Policy = typename T::Policy;
+  using Iterations = typename T::Iterations;
+  using Element = typename T::Element;
+  using Delta = typename T::Delta;
+  using Shape = typename T::Shape;
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+        int accum_m = mma_m * Delta::kRow + m + lane_offset.row();
+        beginRow(accum_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+          int accum_n =
+              mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN +
+              lane_offset.column();
+          CUTLASS_PRAGMA_UNROLL
+          for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+            int idx = n +
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            op(accum_m, accum_n + n, idx);
+          }
+        }
+        endRow(accum_m);
+      }
+    }
+  }
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    static_assert(
+        cutlass::platform::is_same<
+            typename Policy::LaneLayout,
+            cutlass::layout::RowMajorInterleaved<1>>::value,
+        "");
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        cutlass::MatrixCoord(Policy::LaneMmaShape::kM,
+                             Policy::LaneMmaShape::kN);
+    return lane_offset +
+        tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn);
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater;
+
+// Simt
+template <typename S, typename P, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        S,
+        cutlass::gemm::Operand::kC,
+        accum_t,
+        cutlass::layout::RowMajor,
+        P,
+        1,
+        1>,
+    accum_t,
+    kWarpSize> {
+  using Iterator = typename cutlass::gemm::warp::MmaSimtTileIterator<
+      S,
+      cutlass::gemm::Operand::kC,
+      accum_t,
+      cutlass::layout::RowMajor,
+      P,
+      1,
+      1>;
+  using Updater =
+      AttentionScalingCoefsUpdaterSimt<Iterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Volta
+template <typename S1, typename S2, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        cutlass::MatrixShape<1, 1>>,
+    accum_t,
+    kWarpSize> {
+  using Iterator =
+      typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          cutlass::MatrixShape<1, 1>>;
+  using Updater =
+      AttentionScalingCoefsUpdaterVolta<Iterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Sm75+
+template <
+    typename S1,
+    typename S2,
+    typename S3,
+    typename accum_t,
+    int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        S3>,
+    accum_t,
+    kWarpSize> {
+  using Iterator =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          S3>;
+  using Updater =
+      AttentionScalingCoefsUpdaterSm80<Iterator, accum_t, kWarpSize>;
+};
diff --git a/static/include/kernels/fmha_style_b2b_bmm/debug_utils.h b/static/include/kernels/fmha_style_b2b_bmm/debug_utils.h
new file mode 100644
index 000000000..90766c775
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/debug_utils.h
@@ -0,0 +1,201 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#pragma once
+#include <float.h>
+#include <stdio.h>
+#include <cmath>
+
+////////////////////////////////////////////////////////////////////////////////
+// Debugging functions
+////////////////////////////////////////////////////////////////////////////////
+// Nans & inf detection
+#define NANCHECK(frag)                         \
+  {                                            \
+    for (int _i = 0; _i < frag.size(); ++_i) { \
+      assert(std::isfinite(float(frag[_i])));  \
+      assert(!std::isnan(float(frag[_i])));    \
+    }                                          \
+  }
+
+// Print on the first thread of the first block
+#if 1
+#define PRINT_WARP_ID 0
+#define PRINT_LANE_ID 0
+#define PRINT_T0_L0(msg, ...)                                         \
+  if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&        \
+      threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \
+      threadIdx.z == 0) {                                             \
+    printf(msg "\n", ##__VA_ARGS__);                                  \
+  }
+#define PRINT_TX_LX(msg, ...)                                                 \
+  for (int bx = 0; bx < gridDim.x; ++bx) {                                    \
+    for (int by = 0; by < gridDim.y; ++by) {                                  \
+      for (int bz = 0; bz < gridDim.z; ++bz) {                                \
+        for (int tx = 0; tx < blockDim.x; ++tx) {                             \
+          for (int ty = 0; ty < blockDim.y; ++ty) {                           \
+            for (int tz = 0; tz < blockDim.z; ++tz) {                         \
+              __syncthreads();                                                \
+              if (blockIdx.x == bx && blockIdx.y == by && blockIdx.z == bz && \
+                  threadIdx.x == tx && threadIdx.y == ty &&                   \
+                  threadIdx.z == tz) {                                        \
+                printf(                                                       \
+                    "[%d,%d,%d][%d,%d,%d]" msg "\n",                          \
+                    bx,                                                       \
+                    by,                                                       \
+                    bz,                                                       \
+                    tx,                                                       \
+                    ty,                                                       \
+                    tz,                                                       \
+                    ##__VA_ARGS__);                                           \
+              }                                                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+  }
+#else
+#define PRINT_T0_L0
+#define PRINT_TX_LX
+#endif
+
+struct __string_view {
+  char const* data;
+  std::size_t size;
+};
+#if __cplusplus >= 201402L
+template <class T>
+constexpr __string_view __get_type_name() {
+  char const* p = __PRETTY_FUNCTION__;
+  while (*p++ != '=')
+    ;
+  for (; *p == ' '; ++p)
+    ;
+  char const* p2 = p;
+  int count = 1;
+  for (;; ++p2) {
+    switch (*p2) {
+      case '[':
+        ++count;
+        break;
+      case ']':
+        --count;
+        if (!count)
+          return {p, std::size_t(p2 - p)};
+    }
+  }
+  return {};
+}
+#else
+template <class T>
+constexpr __string_view __get_type_name() {
+  return {"unsupported", 11};
+}
+#endif
+
+// Print a given array
+#define PRINT_ACCUM8_T0_L0_START(name, accum, start)  \
+  PRINT_T0_L0(                                        \
+      "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \
+      name,                                           \
+      int(start),                                     \
+      int(start + 8),                                 \
+      float(accum[start + 0]),                        \
+      float(accum[start + 1]),                        \
+      float(accum[start + 2]),                        \
+      float(accum[start + 3]),                        \
+      float(accum[start + 4]),                        \
+      float(accum[start + 5]),                        \
+      float(accum[start + 6]),                        \
+      float(accum[start + 7]));
+#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0)
+#define PRINT_FRAG_T0_L0(name, frag)                          \
+  {                                                           \
+    auto typeStr = __get_type_name<decltype(frag)>();         \
+    PRINT_T0_L0("printing %s (%s)", name, typeStr.data);      \
+    for (int _start = 0; _start < frag.size(); _start += 8) { \
+      PRINT_ACCUM8_T0_L0_START("  ", frag, _start);           \
+    }                                                         \
+    /*__syncthreads();                                        \
+    NANCHECK(frag); */                                        \
+  }
+#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr)   \
+  {                                                         \
+    PRINT_T0_L0("printing %s (len=%d)", name, int(length)); \
+    for (int _start = 0; _start < length; _start += incr) { \
+      PRINT_ACCUM8_T0_L0_START("  ", array, _start);        \
+    }                                                       \
+  }
+#define PRINT_ARRAY_T0_L0(name, array, length) \
+  PRINT_ARRAY_T0_L0_INCR(name, array, length, 8)
+
+// Print a 4x4 matrix
+#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y)                                           \
+  PRINT_T0_L0(                                                                                             \
+      "%s[%d:%d, %d:%d]:\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f", \
+      name,                                                                                                \
+      int(start_x),                                                                                        \
+      int(start_x + 4),                                                                                    \
+      int(start_y),                                                                                        \
+      int(start_y + 4),                                                                                    \
+      float(ref.at({start_x + 0, start_y + 0})),                                                           \
+      float(ref.at({start_x + 0, start_y + 1})),                                                           \
+      float(ref.at({start_x + 0, start_y + 2})),                                                           \
+      float(ref.at({start_x + 0, start_y + 3})),                                                           \
+      float(ref.at({start_x + 1, start_y + 0})),                                                           \
+      float(ref.at({start_x + 1, start_y + 1})),                                                           \
+      float(ref.at({start_x + 1, start_y + 2})),                                                           \
+      float(ref.at({start_x + 1, start_y + 3})),                                                           \
+      float(ref.at({start_x + 2, start_y + 0})),                                                           \
+      float(ref.at({start_x + 2, start_y + 1})),                                                           \
+      float(ref.at({start_x + 2, start_y + 2})),                                                           \
+      float(ref.at({start_x + 2, start_y + 3})),                                                           \
+      float(ref.at({start_x + 3, start_y + 0})),                                                           \
+      float(ref.at({start_x + 3, start_y + 1})),                                                           \
+      float(ref.at({start_x + 3, start_y + 2})),                                                           \
+      float(ref.at({start_x + 3, start_y + 3})));
+#define PRINT_TENSOR4x4_T0_L0(name, ref) \
+  PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0)
+
+#define PRINT_PROBLEM_SIZE(name, ps)            \
+  PRINT_T0_L0(                                  \
+      "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \
+      name,                                     \
+      int(ps.m()),                              \
+      int(ps.n()),                              \
+      int(ps.k()))
diff --git a/static/include/kernels/fmha_style_b2b_bmm/epilogue_pipelined.h b/static/include/kernels/fmha_style_b2b_bmm/epilogue_pipelined.h
new file mode 100644
index 000000000..6d36dc8b1
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/epilogue_pipelined.h
@@ -0,0 +1,635 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  File copied from "cutlass/epilogue/threadblock/epilogue.h"
+  then modified to:
+  (1) load 2 source fragments at the same time (pipelining)
+  (2) support reading from a different dtype
+  (3) pass the row id to the OutputOp if it takes it
+    (see MemoryEfficientAttentionNormalize)
+  Note that in general the fragment passed to the OutputOp could
+  span multiple rows but it does not happen with the configurations we have
+
+  Copied from
+  http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template <typename Op>
+struct ApplyEpilogueOp {
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum,
+      typename Op::FragmentSource const& source) {
+    return output_op(accum, source);
+  }
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum) {
+    return output_op(accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+    typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
+    typename WarpMmaOperator_, ///< Warp-level MMA operator (concept:
+                               ///< gemm::warp::MmaTensorOp)
+    int PartitionsK, ///< Number of partitions of the K dimension
+    typename OutputTileIterator_, ///< Tile iterator writing output tensors
+    typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting
+                                           ///< accumulators
+    typename WarpTileIterator_, ///< Warp-scoped tile iterator writing
+                                ///< accumulators to SMEM
+    typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading
+                                  ///< from SMEM
+    typename OutputOp_, ///< Output operator
+    typename Padding_, ///< Padding added to SMEM allocation to avoid bank
+                       ///< conflicts (concept: MatrixShape)
+    int FragmentsPerPartition =
+        1, ///< Used to coarsten the epilogue granularity
+    int IterationsUnroll = ///< Used to reduce binary size when epilogue op is
+                           ///< large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
+    typename OutputTileSourceIterator_ =
+        OutputTileIterator_ ///< Tile iterator reading tensors
+    >
+class EpiloguePipelined : public EpilogueBase<
+                              Shape_,
+                              typename WarpMmaOperator_::Shape,
+                              PartitionsK,
+                              AccumulatorFragmentIterator_,
+                              WarpTileIterator_,
+                              Padding_,
+                              FragmentsPerPartition> {
+ public:
+  using Base = EpilogueBase<
+      Shape_,
+      typename WarpMmaOperator_::Shape,
+      PartitionsK,
+      AccumulatorFragmentIterator_,
+      WarpTileIterator_,
+      Padding_,
+      FragmentsPerPartition>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using OutputTileSourceIterator = OutputTileSourceIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+  using ElementSource = typename OutputTileSourceIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef =
+      typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+      typename OutputTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+  using SourceAccessType = Array<
+      typename OutputTileSourceIterator::Element,
+      OutputTileSourceIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+      typename WarpTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1
+      ? Base::kFragmentsPerIteration
+      : kPartitionsK;
+  static int constexpr kSmemPointerOffset =
+      Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+ public:
+  static_assert(
+      OutputTileSourceIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between input tile and output tile iterator (kElements)");
+  static_assert(
+      OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations,
+      "Mismatch between input tile and output tile iterator (kIterations)");
+  static_assert(
+      SharedLoadIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(
+      OutputTileIterator::kElementsPerAccess,
+      "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(
+      !(OutputTileIterator::Fragment::kElements %
+        OutputTileIterator::kElementsPerAccess),
+      "Divisibility");
+
+ private:
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+ public:
+  /// Constructor
+  CUTLASS_DEVICE
+  EpiloguePipelined(
+      typename Base::SharedStorage& shared_storage, ///< Shared storage object
+      int thread_idx, ///< ID of a thread within the threadblock
+      int warp_idx, ///< ID of warp within threadblock
+      int lane_idx ///< Id of thread within warp
+      )
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        shared_load_iterator_(shared_storage.reference(), thread_idx) {}
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator) { ///< Threadblock tile coordinate in GEMM (in units
+                             ///< of threadblock tiles)
+
+    if (!output_op.is_source_needed()) {
+      compute_source_not_needed_(output_op, destination_iterator, accumulators);
+    } else {
+      compute_source_needed_(
+          output_op, destination_iterator, accumulators, source_iterator);
+    }
+  }
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators) { ///< Complete warp-level accumulator tile
+    compute_source_not_needed_(output_op, destination_iterator, accumulators);
+  }
+
+ private:
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+
+        warp_tile_iterator.store(accum_fragment);
+        if (p < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(
+               iterator_begin, warp_tile_iterator),
+           0)...};
+
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  static_assert(
+      kPartitionsK == 1 || Base::kFragmentsPerIteration == 1,
+      "One of these must be exactly 1.");
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators ///< Complete warp-level accumulator tile
+  ) {
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+#pragma unroll(                                                          \
+    IterationsUnroll                                                     \
+        ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \
+        : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations;
+         iter += Base::kFragmentsPerIteration) {
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      acc2smem_source_not_needed<cutlass::make_index_sequence<
+          OutputTileIterator::kIterations / Base::kFragmentsPerIteration>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename SharedLoadIterator::Fragment
+            aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        } else if (kPartitionsK > 1) {
+          plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(
+                aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset(
+              (1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Compute the output result
+        //
+
+        typename OutputTileIterator::Fragment output_fragment;
+
+        apply_output_operator_source_not_needed_(
+            destination_iterator.thread_start_row(),
+            output_fragment,
+            output_op,
+            aligned_accum_fragment[0]);
+
+        //
+        // Store the final result
+        //
+
+        destination_iterator.store(output_fragment);
+        ++destination_iterator;
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      int dummy[] = {
+          (pos == Seq) &&
+          (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator ///< Threadblock tile coordinate in GEMM (in units of
+                          ///< threadblock tiles)
+  ) {
+    typename OutputTileSourceIterator::Fragment source_fragment[2];
+
+    source_fragment[0].clear();
+    source_iterator.load(source_fragment[0]);
+    ++source_iterator;
+    source_fragment[1].clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+      if (iter > 0) {
+        __syncthreads();
+      }
+      //
+      // Load the source for next iteration (pipelining)
+      //
+
+      if (iter + 1 < OutputTileIterator::kIterations) {
+        source_iterator.load(source_fragment[(iter + 1) % 2]);
+      }
+      ++source_iterator;
+      acc2smem_source_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment
+          aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the
+      // k-slices
+      if (kPartitionsK > 1) {
+        plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(
+              aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_pointer_offset(
+            (1 - kPartitionsK) * kSmemPointerOffset);
+      }
+
+      //
+      // Compute the output result
+      //
+
+      typename OutputTileIterator::Fragment output_fragment;
+
+      apply_output_operator_(
+          destination_iterator.thread_start_row(),
+          output_fragment,
+          output_op,
+          aligned_accum_fragment[0],
+          source_fragment[iter % 2]);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment,
+      typename OutputTileSourceIterator::Fragment const& source_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    SourceAccessType const* source_frag_ptr =
+        reinterpret_cast<SourceAccessType const*>(&source_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Call the output operator
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i],
+          source_frag_ptr[i]);
+    }
+  }
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Call the output operator
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i]);
+    }
+  }
+
+  // This should be constexpr, but it's only supported on c++14
+  static int CUTLASS_HOST_DEVICE getRowOffset(int i) {
+    using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx = ThreadMap::kElementsPerAccess *
+                (frag_row_idx * ThreadMap::Iterations::kColumn + column);
+            if (i < frag_idx + ThreadMap::kElementsPerAccess) {
+              return row_offset;
+            }
+          }
+        }
+      }
+    }
+    return -1;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/epilogue_rescale_output.h b/static/include/kernels/fmha_style_b2b_bmm/epilogue_rescale_output.h
new file mode 100644
index 000000000..73a7aee9a
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/epilogue_rescale_output.h
@@ -0,0 +1,267 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
+
+  This is a copy of cutlass/epilogue/threadblock/epilogue.h that can
+  handle "row_id" as a first argument, as uses it to get the corresponding
+  `m_prime` / `s_prime` to rescale the output.
+
+  Copied from
+  http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+
+#include "fmha_style_b2b_bmm/epilogue_pipelined.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+// output <- alpha * accumulator + beta * source
+//   with:
+//     alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise)
+//     beta = alpha / m_prime (renormalize the output when the max changes)
+//     source is the current output
+template <
+    typename ElementOutput_, ///< Data type used to store tensors
+    typename ElementSource_, //< Data type for source (usually matches
+                             //`ElementOutput`)
+    int Count, ///< Number of elements computed per operation.
+               ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+               ///< but we use 64 or 32 sometimes when there are not enough data
+               ///< to store
+    typename ElementAccumulator_, ///< Accumulator data type
+    typename ElementCompute_, ///< Data type used to compute linear combination
+    bool isFirst,
+    bool isLast,
+    typename FragmentAlphaBeta_,
+    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
+class MemoryEfficientAttentionNormalize {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentAlphaBeta = FragmentAlphaBeta_;
+
+  static FloatRoundStyle const kRound = Round;
+
+ private:
+  //
+  // Data members
+  //
+
+  FragmentAlphaBeta const& s_prime_;
+  FragmentAlphaBeta const& m_prime_;
+
+ public:
+  /// Constructs the function object, possibly loading from pointers in host
+  /// memory
+  CUTLASS_HOST_DEVICE
+  MemoryEfficientAttentionNormalize(
+      FragmentAlphaBeta const& s_prime,
+      FragmentAlphaBeta const& m_prime)
+      : s_prime_(s_prime), m_prime_(m_prime) {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return !isFirst;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      int row,
+      FragmentAccumulator const& accumulator,
+      FragmentSource const& source) const {
+    assert(!isFirst);
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round>
+        source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_source = source_converter(source);
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_source;
+    multiply_add<ComputeFragment> mul_add_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+    ElementCompute beta = alpha * m_prime_[row];
+
+    intermediate = mul_add_source(beta, converted_source); // X =  beta * C
+
+    intermediate = mul_add_accumulator(
+        alpha, converted_accumulator, intermediate); // D = alpha * Accum + X
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(int row, FragmentAccumulator const& accumulator)
+      const {
+    assert(isFirst);
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    ComputeFragment intermediate;
+    multiplies<ComputeFragment> mul_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+
+    intermediate = mul_accumulator(
+        alpha, converted_accumulator); // X =  alpha * C + uniform
+
+    return destination_converter(intermediate);
+  }
+};
+
+} // namespace thread
+
+namespace threadblock {
+template <
+    typename EO,
+    typename ES,
+    int Count,
+    typename EA,
+    typename EC,
+    bool F,
+    bool L,
+    typename FAB,
+    FloatRoundStyle R>
+struct ApplyEpilogueOp<thread::MemoryEfficientAttentionNormalize<
+    EO,
+    ES,
+    Count,
+    EA,
+    EC,
+    F,
+    L,
+    FAB,
+    R>> {
+  using Op = thread::
+      MemoryEfficientAttentionNormalize<EO, ES, Count, EA, EC, F, L, FAB, R>;
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum,
+      typename Op::FragmentSource const& source) {
+    return output_op(row_id, accum, source);
+  }
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum) {
+    return output_op(row_id, accum);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/find_default_mma.h b/static/include/kernels/fmha_style_b2b_bmm/find_default_mma.h
new file mode 100644
index 000000000..a39d1956e
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/find_default_mma.h
@@ -0,0 +1,193 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Cutlass provides helper template functions to figure out the right
+   datastructures to instanciate to run a GEMM with various parameters (see
+   `cutlass/gemm/threadblock/default_mma.h`). However, due to template
+   instantiation priority rules, it will only create an MmaMultiStage with
+   kStages=3 (otherwise creates an MmePipelined - which is not compatible with
+   FastF32). kStages=3 uses too much shared memory and we want to use kStages=2,
+   so we just copy-pasted some code from `default_mma.h` and
+   `default_mma_core.h` files and wrapped this template to allow our usecase.
+
+    This is really only for the FastF32 case - aka using TensorCores with fp32.
+
+    Copied from
+    http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator,
+    typename Enable_ = void>
+struct FindDefaultMma {
+  static constexpr bool AccumulatorsInRowMajor = false;
+  static constexpr SharedMemoryClearOption SharedMemoryClear =
+      SharedMemoryClearOption::kNone;
+  using DefaultMma = cutlass::gemm::threadblock::DefaultMma<
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      Stages,
+      Operator,
+      AccumulatorsInRowMajor,
+      SharedMemoryClear>;
+};
+
+/// Specialization for sm80 / FastF32 / multistage with kStages=2
+template <
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    int kStages,
+    typename Operator>
+struct FindDefaultMma<
+    ElementA_,
+    LayoutA_,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm80,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    kStages,
+    Operator,
+    typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> {
+  using LayoutC = layout::RowMajor;
+  using OperatorClass = arch::OpClassTensorOp;
+  using ArchTag = arch::Sm80;
+
+  using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma<
+      ElementA_,
+      LayoutA_,
+      kAlignmentA,
+      ElementB_,
+      LayoutB_,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      3,
+      Operator>;
+  struct DefaultMma : DefaultMma_ {
+    using MmaCore_ = typename DefaultMma_::MmaCore;
+    // Define the threadblock-scoped multistage matrix multiply
+    using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+        typename MmaCore_::Shape,
+        typename DefaultMma_::IteratorA,
+        typename MmaCore_::SmemIteratorA,
+        MmaCore_::kCacheOpA,
+        typename DefaultMma_::IteratorB,
+        typename MmaCore_::SmemIteratorB,
+        MmaCore_::kCacheOpB,
+        ElementAccumulator,
+        LayoutC,
+        typename MmaCore_::MmaPolicy,
+        kStages>;
+  };
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/fmha_style_b2b_bmm/gemm_kernel_utils.h b/static/include/kernels/fmha_style_b2b_bmm/gemm_kernel_utils.h
new file mode 100644
index 000000000..70f1883f6
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/gemm_kernel_utils.h
@@ -0,0 +1,254 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Some helper functions
+////////////////////////////////////////////////////////////////////////////////
+#define DISPATCH_TYPES(tensor, func)                                           \
+  {                                                                            \
+    if (query.scalar_type() == at::ScalarType::Float) {                        \
+      using scalar_t = float;                                                  \
+      func();                                                                  \
+    } else if (query.scalar_type() == at::ScalarType::Half) {                  \
+      using scalar_t = cutlass::half_t;                                        \
+      func();                                                                  \
+    } else if (query.scalar_type() == at::ScalarType::BFloat16) {              \
+      using scalar_t = cutlass::bfloat16_t;                                    \
+      func();                                                                  \
+    } else {                                                                   \
+      XFORMERS_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \
+    }                                                                          \
+  }
+
+#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \
+  {                                         \
+    if (BOOL_V) {                           \
+      constexpr bool BOOL_NAME = true;      \
+      F();                                  \
+    } else {                                \
+      constexpr bool BOOL_NAME = false;     \
+      F();                                  \
+    }                                       \
+  }
+#define DISPATCH_ARCHTAG(CC, func)                                        \
+  {                                                                       \
+    if (CC >= 80) {                                                       \
+      using ArchTag = cutlass::arch::Sm80;                                \
+      func();                                                             \
+    } else if (CC >= 75) {                                                \
+      using ArchTag = cutlass::arch::Sm75;                                \
+      func();                                                             \
+    } else if (CC >= 70) {                                                \
+      using ArchTag = cutlass::arch::Sm70;                                \
+      func();                                                             \
+    } else if (CC >= 50) {                                                \
+      using ArchTag = cutlass::arch::Sm50;                                \
+      func();                                                             \
+    } else {                                                              \
+      XFORMERS_CHECK(                                                     \
+          false,                                                          \
+          "Your device is too old. We require compute capability >= 50"); \
+    }                                                                     \
+  }
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                            \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous());
+
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(                                                         \
+      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+#ifdef TORCH_CHECK
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \
+  XFORMERS_CHECK(                         \
+      uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned")
+#define XFORMERS_CHECK TORCH_CHECK
+#elif defined(__CUDACC_RTC__)
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)  \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \
+    return false;                          \
+  }
+#define XFORMERS_CHECK(COND, ERR) \
+  if (!(COND)) {                  \
+    return false;                 \
+  }
+#else
+#include <iostream>
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)            \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) {           \
+    std::cerr << #PTR " is not correctly aligned\n"; \
+    return false;                                    \
+  }
+#define XFORMERS_CHECK(COND, ERR)   \
+  if (!(COND)) {                    \
+    std::cerr << #COND " failed\n"; \
+    return false;                   \
+  }
+#endif
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                    \
+  {                                                                    \
+    A = B;                                                             \
+    XFORMERS_CHECK(                                                    \
+        B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
+  }
+
+namespace gemm_kernel_utils {
+
+template <typename integer>
+constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) {
+  return (n + m - 1) / m;
+}
+
+template <typename integer>
+constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) {
+  return ((n + m - 1) / m) * m;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Determine the type of GEMM we do (TensorCores or not, Shapes ...)
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates
+////////////////////////////////////////////////////////////////////////////////
+
+// Fallback to Simt (FMA on cuda cores) if not in a special case below
+template <typename ArchTag, typename scalar_t_, typename Enable = void>
+struct DefaultGemmType {
+  static constexpr int ThreadK = 8;
+  static constexpr int WarpK = 8;
+  static constexpr int kMinimumAlignment = 1;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Specialization for tensorcores with f32
+template <typename ArchTag>
+struct DefaultGemmType<
+    ArchTag,
+    float,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 80>::type> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 4;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAddFastF32;
+};
+
+// Specialization for tensorcores with f16/bf16 - Sm75+
+template <typename ArchTag, typename scalar_t>
+struct DefaultGemmType<
+    ArchTag,
+    scalar_t,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 75 &&
+        cutlass::sizeof_bits<scalar_t>::value == 16>::type> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 4;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Specialization for tensorcores with f16 - Volta
+template <>
+struct DefaultGemmType<cutlass::arch::Sm70, cutlass::half_t, void> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 2;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Enables to do
+// `auto x = kCondition ? fa(arg) : fb(arg)`
+// when `fa` and `fb` have different types
+template <bool kVal, typename TA, typename TB>
+struct call_conditional;
+
+template <typename TA, typename TB>
+struct call_conditional<true, TA, TB> {
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(ta(arg)) {
+    return ta(arg);
+  }
+};
+
+template <typename TA, typename TB>
+struct call_conditional<false, TA, TB> {
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(tb(arg)) {
+    return tb(arg);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Mark a variable as warp-uniform - enables some compiler optimizations
+// The cheapest way to do it is just to broadcast it from lane 0
+////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_DEVICE int32_t warp_uniform(int32_t value) {
+  return (int32_t)__shfl_sync(0xffffffff, (unsigned)value, 0);
+}
+
+template <typename T>
+CUTLASS_DEVICE T* warp_uniform(T* ptr) {
+  struct {
+    union {
+      T* ptr;
+      uint32_t asInt[2];
+    };
+  } p;
+  p.ptr = ptr;
+  p.asInt[0] = warp_uniform(p.asInt[0]);
+  p.asInt[1] = warp_uniform(p.asInt[1]);
+  return p.ptr;
+}
+} // namespace gemm_kernel_utils
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/epilogue_predicated_tile_iterator.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/epilogue_predicated_tile_iterator.h
new file mode 100644
index 000000000..d09e86727
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/epilogue_predicated_tile_iterator.h
@@ -0,0 +1,753 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue iterator that supports prefetching
+
+  Copied from
+  http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in
+/// epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator |
+/// ForwardTileIterator
+///
+template <
+    typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
+    typename Element_, ///< Element data type
+    bool ScatterD = false, ///< Scatter D operand or not
+    bool UseCUDAStore = false>
+class PredicatedTileIteratorPrefetch {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert(
+      ThreadMap::Iterations::kRow > 0,
+      "ThreadMap::Iterations::kRow must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kGroup > 0,
+      "ThreadMap::Iterations::kGroup must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kCluster > 0,
+      "ThreadMap::Iterations::kCluster must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kColumn > 0,
+      "ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+      Element,
+      ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow *
+          ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster *
+          ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : PredicatedTileIteratorParams(
+              layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+              make_OutputTileThreadMapDesc<ThreadMap>()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+  /// Mask object
+  struct Mask {
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    ///< Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer
+  uint8_t* byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Scatter indices
+  int const* indices_;
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(
+      sizeof(PredicatedTileIteratorParams::stride) == 8,
+      "Expected 64b strides");
+
+ private:
+  //
+  // Methods
+  //
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  PredicatedTileIteratorPrefetch(
+      PredicatedTileIteratorParams const& params,
+      Element* pointer,
+      TensorCoord extent,
+      int thread_idx,
+      TensorCoord threadblock_offset = TensorCoord(),
+      int const* indices = nullptr)
+      : params_(params), indices_(indices) {
+    TensorCoord thread_offset =
+        ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+
+    // Initialize predicates
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+      mask_.predicates[c] =
+          ((thread_offset.column() + ThreadMap::Delta::kColumn * c) <
+           extent.column());
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer) {
+      mask_.clear();
+    }
+
+    if (ScatterD && !indices) {
+      mask_.clear();
+    }
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+        LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
+        LongIndex(thread_offset.column()) * sizeof(AccessType) /
+            kElementsPerAccess;
+
+    if (ScatterD) {
+      byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+          LongIndex(thread_offset.column()) * sizeof(AccessType) /
+              kElementsPerAccess;
+    }
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_all() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < kIterations; ++iter) {
+      prefetch();
+      ++(*this);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void prefetch() {
+    uint8_t* byte_pointer = byte_pointer_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            // on windows using unsigned long here gives the error
+            // error: asm operand type size(4) does not match
+            // type/size implied by constraint 'l'
+            uint64_t addr = (uint64_t)((void*)&memory_pointer
+                                           [column * ThreadMap::Delta::kColumn /
+                                            kElementsPerAccess]);
+            asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            if (UseCUDAStore) {
+              if (guard) {
+                memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
+                        frag_ptr
+                            [frag_row_idx * ThreadMap::Iterations::kColumn +
+                             column];
+              }
+            } else {
+              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                  frag_ptr
+                      [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                  (void*)&memory_pointer
+                      [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                  guard);
+            }
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) const {
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void downsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+
+          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
+              (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void upsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+          int row_add_P = add_P;
+          int row_add_Q = add_Q;
+          if (output_P > convolution_P - 2)
+            row_add_P = 0;
+          if (output_Q > convolution_Q - 2)
+            row_add_Q = 0;
+
+          int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) +
+              ((output_P + row_add_P) / 2) * (convolution_Q / 2) +
+              (output_Q + row_add_Q) / 2;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return thread_start_row_;
+  }
+
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
+  /// Extent of the matrix in rows
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return extent_row_;
+  }
+
+  /// Extent of the matrix in columns
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
+  /// Advances to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorPrefetch& operator++() {
+    ++state_[0];
+
+    if (!ScatterD) {
+      byte_pointer_ += params_.advance_row;
+    }
+
+    thread_start_row_ += ThreadMap::Shape::kRow;
+
+    if (state_[0] == ThreadMap::Count::kRow) {
+      state_[0] = 0;
+      ++state_[1];
+      byte_pointer_ += params_.advance_group;
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+          ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+        state_[1] = 0;
+        ++state_[2];
+        byte_pointer_ += params_.advance_cluster;
+
+        thread_start_row_ += ThreadMap::Count::kGroup *
+            ThreadMap::Shape::kGroup * ThreadMap::Count::kRow *
+            ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+          byte_pointer_ += params_.advance_tile;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  ///< Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  ///< Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void get_mask(Mask& mask) const {
+    mask = mask_;
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void set_mask(Mask const& mask) {
+    mask_ = mask;
+  }
+};
+
+template <typename IT>
+struct MakePrefetchableIterator {
+  using Iterator = PredicatedTileIteratorPrefetch<
+      typename IT::ThreadMap,
+      typename IT::Element>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/make_residual_last.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/make_residual_last.h
new file mode 100644
index 000000000..bfabc5875
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/make_residual_last.h
@@ -0,0 +1,103 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#pragma once
+
+#include "fmha_style_b2b_bmm/iterators/predicated_tile_access_iterator_residual_last.h"
+#include "fmha_style_b2b_bmm/iterators/predicated_tile_iterator_residual_last.h"
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+template <typename BaseIterator>
+struct MakeIteratorResidualLast;
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    int AccessSize,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessSize,
+    Gather>> {
+  using Iterator = PredicatedTileIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessSize,
+      Gather>;
+};
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileAccessIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessType,
+    Gather>> {
+  using Iterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+};
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_access_iterator_residual_last.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_access_iterator_residual_last.h
new file mode 100644
index 000000000..8c41720d8
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_access_iterator_residual_last.h
@@ -0,0 +1,2118 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile
+    this iterator visits maybe partial, then the remaining tiles are complete.
+    So, we only need to compute the predicates twice, once before the first tile
+    and once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+
+    Copied from
+    http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIteratorResidualLast
+///
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather = false>
+class PredicatedTileAccessIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear
+/// data.
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+    using Base = PredicatedTileAccessIteratorParams;
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : Base(
+              layout.stride(0),
+              MakePredicatedTileAccessIteratorDesc<
+                  Shape,
+                  Element,
+                  Layout,
+                  kAdvanceRank,
+                  ThreadMap>()()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+  /// Parameters object with precomputed internal state
+  Params const& params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+  ///
+
+  /// Gather indices
+  int const* indices_;
+
+  Index gather_offset_strided;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent),
+        indices_(indices) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    the_predicates.get_mask(residual_tile_mask);
+
+    // Working around a weird compiler bug happening on P100 for the backward.
+    // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15)
+    // residual_tile_mask[0] = 15 (correct)
+    //
+    // Adding prints when the value is calculated (in `compute_predicates_`)
+    // sometimes removes the bug. The consequence is that we skip some
+    // element of a tensor, leading to wrong results
+    // Setting `compute_predicates_`'s second argument (`is_steady_state`) to
+    // true also seems to get rid of the bug - at the cost of twice as many
+    // comparisons.
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+    constexpr bool kWorkAroundCompilerBug = false;
+#else
+    constexpr bool kWorkAroundCompilerBug = true;
+#endif
+    the_predicates.compute_predicates_(extent, true && !kWorkAroundCompilerBug);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+
+    if (!Gather) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      gather_offset_strided = the_predicates.thread_offset_.strided();
+      add_pointer_offset(
+          layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0)));
+    }
+  }
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    if (is_residual_tile) {
+      the_predicates.set_mask(residual_tile_mask);
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (!Gather) {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+      gather_offset_strided += Shape::kStrided * tile_offset.strided();
+    }
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    if (Gather) {
+      assert(indices_);
+
+      if (!valid()) {
+        return nullptr;
+      }
+
+      LongIndex contiguous_offset = the_predicates.iteration_contiguous_ *
+              (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value /
+               8) +
+          the_predicates.iteration_vector_;
+      int strided_index = gather_offset_strided +
+          the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+
+      LongIndex strided_offset = indices_[strided_index] *
+          LongIndex(params_.stride_) * sizeof_bits<Element>::value / 8;
+
+      return reinterpret_cast<AccessType*>(
+          pointer_ + contiguous_offset + strided_offset);
+    }
+
+    return reinterpret_cast<AccessType*>(
+               pointer_ +
+               the_predicates.iteration_contiguous_ *
+                   (ThreadMap::Delta::kContiguous *
+                    sizeof_bits<Element>::value) /
+                   8) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather) {
+        pointer_ += params_.inc_strided_;
+      }
+
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    if (!Gather) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+
+      // now return to start tile - if the iterator is subsequently advanced,
+      // this subtraction as well as the subsequent integer addition are both
+      // elided by the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      layout::PitchLinear,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in byte) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params()
+        : stride_(0),
+          inc_contiguous_(0),
+          inc_strided_(0),
+          inc_next_(0),
+          inc_advance_(0) {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ =
+          (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+          sizeof_bits<Element>::value / 8;
+
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+          sizeof_bits<Element>::value / 8;
+
+      inc_next_strided_ = inc_strided_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ = Shape::kStrided * LongIndex(stride_[1]) *
+            sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ =
+            Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ -
+          LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    };
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const& params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    if (is_residual_tile) {
+      the_predicates.set_mask(residual_tile_mask);
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (kAdvanceRank) {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+      pointer_ += Shape::kContiguous * tile_offset[0];
+    } else {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+      pointer_ += Shape::kStrided * tile_offset[1];
+    }
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(pointer_) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2
+/// row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// interleaved data. It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// interleaved data.
+//  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_iterator_residual_last.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_iterator_residual_last.h
new file mode 100644
index 000000000..53a7fc6a3
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/predicated_tile_iterator_residual_last.h
@@ -0,0 +1,2123 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Templates implementing loading of tiles from pitch-linear rank=2
+  tensors.
+
+  This iterator uses masks to guard out-of-bounds accesses. The first tile
+  this iterator visits maybe partial, then the remaining tiles are complete.
+  So, we only need to compute the predicates twice, once before the first tile
+  and once for the remaining full tiles which can share the same predicates.
+
+  A precomputed "Params" object minimizes the amount of state that must be
+  stored in registers, and integer addition is used to advance the pointer
+  through memory.
+
+  Copied from
+  http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIteratorResidualLast
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize
+/// register liveness and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params"
+/// object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is
+/// constructed. Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator
+/// is constructed. Subsequent additions to logical coordinate offset may be
+/// performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be
+/// partially full in both the advance dimension and the steady-state dimension.
+/// This is assumed to be the last tile in the iteration sequence. Advancing an
+/// iterator that has just been constructed moves to the first tile that is full
+/// in the advance dimension and recomputes predicates. Subsequent accesses may
+/// be performed without updating internal predicates and are efficient in terms
+/// of live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced
+/// at least once outside any looping structure to minimize integer arithmetic.
+///
+/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to
+/// dereferencing the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update
+//   internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks -
+//       subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to
+//     steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator =
+//   transform::threadblock::PredicatedTileIteratorResidualLast;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    int AccessSize = ThreadMap::kElementsPerAccess,
+    bool Gather = false>
+class PredicatedTileIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : params_(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset,
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    load_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const* byte_ptr =
+              reinterpret_cast<char const*>(address_iterator_.get()) +
+              byte_offset;
+
+          AccessType const* access_ptr =
+              reinterpret_cast<AccessType const*>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    store_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char* byte_ptr =
+              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
+          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices = nullptr ///< Gather indices
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset(make_Coord(0, 1));
+    else
+      address_iterator_.add_tile_offset(make_Coord(1, 0));
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    load_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const* byte_ptr =
+              reinterpret_cast<char const*>(address_iterator_.get()) +
+              byte_offset;
+
+          AccessType const* access_ptr =
+              reinterpret_cast<AccessType const*>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    store_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char* byte_ptr =
+              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
+          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32
+/// data.  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/transpose_warp_iterator.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/transpose_warp_iterator.h
new file mode 100644
index 000000000..cbf917afa
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/transpose_warp_iterator.h
@@ -0,0 +1,59 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#pragma once
+
+#include "fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h"
+
+template <typename WarpIterator>
+struct TransposeWarpIterator {
+  using Iterator = char;
+  static bool constexpr kSupportsTranspose = false;
+};
+
+template <
+    /// Operand identity
+    cutlass::gemm::Operand Operand,
+    /// Data type of A elements
+    typename Element,
+    bool kTranspose>
+struct TransposeWarpIterator<
+    cutlass::gemm::warp::WarpIteratorFromSmem<Operand, Element, kTranspose>> {
+  using Iterator =
+      cutlass::gemm::warp::WarpIteratorFromSmem<Operand, Element, !kTranspose>;
+  static bool constexpr kSupportsTranspose = true;
+};
diff --git a/static/include/kernels/fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h b/static/include/kernels/fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h
new file mode 100644
index 000000000..1d77f44d2
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h
@@ -0,0 +1,281 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Inspired from
+   "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" Loads tiles of GEMM
+   operands from a RowMajor shared-memory layout into registers to use by A100
+   TensorCores.
+
+    The difference with "mma_tensor_op_tile_access_iterator.h" is that:
+    (1) We use "ldmatrix" to load tiles, rather than manual loads (slightly
+   faster) (2) We support to transpose the operand (eg read `A.transpose()` when
+   the shared memory holds `A`)
+
+    This is only implemented for the specific shapes.
+
+    Mostly copied from
+    http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+#pragma once
+
+#include <cutlass/gemm/gemm.h>
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+template <
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of A elements
+    typename Element_,
+    bool kTranspose = false>
+class WarpIteratorFromSmem {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = cutlass::MatrixShape<32, 32>;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  /// Basic check
+  static_assert(
+      kOperand == Operand::kA || kOperand == Operand::kB,
+      "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+  static_assert(sizeof_bits<Element>::value == 16, "Only supported for half");
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = cutlass::MatrixShape<16, 8>;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = 1;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Number of elements accessed per Shared Memory load
+  static int const kElementsPerAccess =
+      (sizeof_bits<Element>::value >= 32 ? 1
+                                         : 32 / sizeof_bits<Element>::value);
+
+  using InstructionCount = MatrixShape<
+      Shape::kRow / InstructionShape::kRow,
+      Shape::kColumn / InstructionShape::kColumn>;
+
+  static int const kIterations = (kOperand == Operand::kA)
+      ? InstructionCount::kColumn
+      : InstructionCount::kRow;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+      Element,
+      (kOperand == Operand::kA)
+          ? (Shape::kRow* InstructionShape::kColumn / kThreads)
+          : (Shape::kColumn* InstructionShape::kRow / kThreads)>;
+
+  /// Memory access type
+  // using AccessType = AlignedArray<Element, kElementsPerAccess>;
+  using AccessType = Array<unsigned, 4>;
+
+  static int constexpr kWarpShapeDivisibleInner =
+      (kOperand == Operand::kA ? InstructionShape::kColumn
+                               : InstructionShape::kRow);
+  static int constexpr kAccessesInner =
+      (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
+  static int const kTilesPerInstruction = InstructionShape::kRow / 8;
+
+ private:
+  /// Underlying tensor reference
+  TensorRef ref_;
+
+  /// Origin
+  MatrixCoord origin_;
+
+  /// Iterations in a tile
+  int iterations_;
+
+ public:
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem(TensorRef const& ref, int lane_id)
+      : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) {}
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id)
+      : ref_(ref), iterations_(0) {
+    int ldsm_vec_num = (lane_id >> 3);
+    if (kOperand == Operand::kA) {
+      origin_ = MatrixCoord(lane_id % 8, 0);
+      static_assert(
+          InstructionCount::kRow * kAccessesInner * kTilesPerInstruction == 4,
+          "");
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow;
+           ++inst_m_idx) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction;
+               ++access_m_idx) {
+            int access_idx = access_m_idx +
+                kTilesPerInstruction *
+                    (inner_idx + kAccessesInner * inst_m_idx);
+
+            MatrixCoord offset(
+                access_m_idx * 8 + inst_m_idx * InstructionShape::kRow,
+                inner_idx * 4 * kElementsPerAccess);
+
+            if (access_idx == ldsm_vec_num) {
+              if (kTranspose) {
+                offset = MatrixCoord(offset.column(), offset.row());
+              }
+              origin_ += offset;
+            }
+          }
+        }
+      }
+    } else {
+      origin_ = MatrixCoord(0, lane_id % 8);
+      static_assert(InstructionCount::kColumn * kAccessesInner == 4, "");
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn;
+           ++inst_n_idx) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
+
+          MatrixCoord offset(
+              inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8);
+
+          if (access_idx == ldsm_vec_num) {
+            if (kTranspose) {
+              offset = MatrixCoord(offset.column(), offset.row());
+            }
+            origin_ += offset;
+          }
+        }
+      }
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) {
+    TensorCoord coord_offset(
+        tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    if (kTranspose) {
+      coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()};
+    }
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  void advance() {
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, 1});
+    } else {
+      add_tile_offset({1, 0});
+    }
+
+    iterations_ = 0;
+  }
+
+  /// increase iterations in a tile
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem& operator++() {
+    iterations_++;
+
+    if (iterations_ >= kIterations)
+      advance();
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    AccessType* access_ptr = reinterpret_cast<AccessType*>(&frag);
+    using LoadLayout = typename platform::
+        conditional<kTranspose, layout::ColumnMajor, layout::RowMajor>::type;
+
+    MatrixCoord offset;
+    if (kOperand == Operand::kA) {
+      offset = MatrixCoord(0, iterations_ * InstructionShape::kColumn);
+    } else {
+      offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0);
+    }
+    if (kTranspose) {
+      offset = MatrixCoord(offset.column(), offset.row());
+    }
+    cutlass::arch::ldsm<LoadLayout, 4>(
+        access_ptr[0], ref_.data() + ref_.offset(offset));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h b/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
new file mode 100644
index 000000000..c7e51e385
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
@@ -0,0 +1,856 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * This implementation is adapted from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#include <cmath>
+#include <vector>
+
+#include "cutlass/bfloat16.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+
+#include "attention_scaling_coefs_updater.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "fmha_style_b2b_bmm/debug_utils.h"
+#include "fmha_style_b2b_bmm/epilogue_pipelined.h"
+#include "fmha_style_b2b_bmm/epilogue_rescale_output.h"
+#include "fmha_style_b2b_bmm/find_default_mma.h"
+#include "fmha_style_b2b_bmm/gemm_kernel_utils.h"
+#include "fmha_style_b2b_bmm/mma_from_smem.h"
+#include "fmha_style_b2b_bmm/transform/tile_smem_loader.h"
+
+#include <inttypes.h>
+
+using namespace gemm_kernel_utils;
+
+namespace {
+template <typename scalar_t, typename Arch>
+constexpr int getWarpsPerSm() {
+  return (
+      Arch::kMinComputeCapability >= 80 &&
+              !cutlass::platform::is_same<scalar_t, float>::value
+          ? 16
+          : 12);
+}
+} // namespace
+
+template <
+    // The datatype of Q/K/V/output
+    typename scalar_t_,
+    // The datatype for accumulation
+    typename accum_t_,
+    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
+    typename ArchTag,
+    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
+    bool isAligned_,
+    int kQueriesPerBlock,
+    int kKeysPerBlock,
+    // = `value.shape[-1] <= kKeysPerBlock`
+    bool kSingleValueIteration,
+    // Activation functor
+    template <typename T>
+    class ActivationFunctor>
+struct AttentionKernel {
+  using scalar_t = scalar_t_;
+  using accum_t = accum_t_;
+  using output_t = scalar_t;
+  // Accumulator between 2 iterations
+  using output_accum_t = accum_t;
+  static constexpr bool kIsAligned = isAligned_;
+  static constexpr bool kPreloadV = ArchTag::kMinComputeCapability >= 80 &&
+      cutlass::sizeof_bits<scalar_t>::value == 16;
+  static constexpr bool kKeepOutputInRF = kSingleValueIteration;
+  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
+      !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  static_assert(kQueriesPerBlock % 32 == 0, "");
+  static_assert(kKeysPerBlock % 32 == 0, "");
+  static constexpr int kNumWarpsPerBlock =
+      kQueriesPerBlock * kKeysPerBlock / (32 * 32);
+  static constexpr int kWarpSize = 32;
+
+  // Launch bounds
+  static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock;
+  static constexpr int kMinBlocksPerSm =
+      getWarpsPerSm<scalar_t, ArchTag>() / kNumWarpsPerBlock;
+
+  struct Params {
+    // Input tensors
+    scalar_t* query_ptr; // [num_queries, num_heads, head_dim]
+    scalar_t* key_ptr; // [num_keys, num_heads, head_dim]
+    scalar_t* value_ptr; // [num_keys, num_heads, head_dim_value]
+    scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys]
+    int32_t* cu_seqlens_q_ptr = nullptr;
+    int32_t* cu_seqlens_k_ptr = nullptr;
+
+    // Output tensors
+    output_t* output_ptr; // [num_queries, num_heads, head_dim_value]
+    output_accum_t*
+        output_accum_ptr; // [num_queries, num_heads, head_dim_value]
+
+    // Scale
+    accum_t scale;
+    accum_t activation_scale;
+
+    // Dimensions/strides
+    int32_t head_dim;
+    int32_t head_dim_value;
+    int32_t seq_length;
+    int32_t num_queries;
+    int32_t num_keys;
+
+    enum CausalType {
+      NO_CAUSAL = 0,
+      UPPER_RIGHT_EMPTY = 1,
+      LOWER_LEFT_EMPTY = 2
+    };
+    CausalType causal_type;
+
+    int32_t q_strideM;
+    int32_t k_strideM;
+    int32_t v_strideM;
+    int32_t bias_strideM;
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int32_t q_strideH;
+    int32_t k_strideH;
+    int32_t v_strideH;
+    int32_t bias_strideH;
+    int64_t q_strideB;
+    int64_t k_strideB;
+    int64_t v_strideB;
+    int32_t bias_strideB;
+    int32_t num_batches;
+    int32_t num_heads;
+
+    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
+      return head_dim_value * num_heads;
+    }
+    // Moves pointers to what we should process
+    // Returns "false" if there is no work to do
+    CUTLASS_DEVICE bool advance_to_block() {
+      auto batch_id = blockIdx.z;
+      auto head_id = blockIdx.y;
+      auto query_start = blockIdx.x * kQueriesPerBlock;
+
+      int64_t q_start, k_start;
+      // Advance to current batch - in case of different sequence lengths
+      if (cu_seqlens_q_ptr != nullptr) {
+        assert(cu_seqlens_k_ptr != nullptr);
+        cu_seqlens_q_ptr += batch_id;
+        cu_seqlens_k_ptr += batch_id;
+        q_start = cu_seqlens_q_ptr[0];
+        k_start = cu_seqlens_k_ptr[0];
+        int64_t q_next_start = cu_seqlens_q_ptr[1];
+        int64_t k_next_start = cu_seqlens_k_ptr[1];
+        num_queries = q_next_start - q_start;
+        num_keys = k_next_start - k_start;
+
+        if (query_start >= num_queries) {
+          return false;
+        }
+      } else {
+        query_ptr += batch_id * q_strideB;
+        key_ptr += batch_id * k_strideB;
+        value_ptr += batch_id * v_strideB;
+        output_ptr += int64_t(batch_id * num_queries) * o_strideM();
+        if (output_accum_ptr != nullptr) {
+          output_accum_ptr += int64_t(batch_id * num_queries) * o_strideM();
+        }
+        q_start = 0;
+        k_start = 0;
+      }
+
+      // Advance to the current batch / head / query_start
+      query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH;
+      key_ptr += k_start * k_strideM + head_id * k_strideH;
+      value_ptr += k_start * v_strideM + head_id * v_strideH;
+      output_ptr += int64_t(q_start + query_start) * o_strideM() +
+          head_id * head_dim_value;
+      if (attn_bias_ptr != nullptr) {
+        attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH);
+      }
+
+      if (output_accum_ptr != nullptr) {
+        output_accum_ptr += int64_t(q_start + query_start) * o_strideM() +
+            head_id * head_dim_value;
+      } else {
+        // Accumulate directly in the destination buffer (eg for f32)
+        output_accum_ptr = (accum_t*)output_ptr;
+      }
+      num_queries -= query_start;
+      if (causal_type == CausalType::UPPER_RIGHT_EMPTY) {
+        num_keys = cutlass::fast_min(
+            int32_t(query_start + kQueriesPerBlock), num_keys);
+      }
+      num_batches = 0; // no longer used after
+
+      // Make sure the compiler knows these variables are the same on all
+      // the threads of the warp.
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      attn_bias_ptr = warp_uniform(attn_bias_ptr);
+      output_ptr = warp_uniform(output_ptr);
+      output_accum_ptr = warp_uniform(output_accum_ptr);
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      head_dim = warp_uniform(head_dim);
+      head_dim_value = warp_uniform(head_dim_value);
+      return true;
+    }
+
+    __host__ dim3 getBlocksGrid() const {
+      return dim3(
+          ceil_div(num_queries, (int32_t)kQueriesPerBlock),
+          num_heads,
+          num_batches);
+    }
+    __host__ dim3 getThreadsGrid() const {
+      return dim3(kWarpSize, kNumWarpsPerBlock, 1);
+    }
+  };
+
+  struct MM0 {
+    /*
+      In this first matmul, we compute a block of `Q @ K.T`.
+      While the calculation result is still hot in registers, we load bias
+      into shared_memory and then add it to registers, and apply scaling and
+      causal masks. We then store this value into a shared-memory
+      ("AccumulatorSharedStorage") that is used later as operand A for the
+      second matmul (see MM1)
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            scalar_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA =
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment;
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        cutlass::layout::ColumnMajor, // LayoutB,
+        kAlignmentB,
+        accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        OpClass,
+        ArchTag, // ArchTag
+        ThreadblockShape, // ThreadblockShape
+        WarpShape, // WarpShape
+        typename GemmType::InstructionShape, // InstructionShape
+        DefaultConfig::kStages, // Should use `DefaultConfig::kStages`, but that
+                                // uses too much smem
+        typename GemmType::Operator // Operator
+        >::DefaultMma;
+    using MmaCore = typename DefaultMma::MmaCore;
+    using IteratorA = typename DefaultMma::IteratorA;
+    using IteratorB = typename DefaultMma::IteratorB;
+    using Mma = typename DefaultMma::ThreadblockMma;
+    using ScalingCoefsUpdater = typename DefaultAttentionScalingCoefsUpdater<
+        typename Mma::Operator::IteratorC,
+        accum_t,
+        kWarpSize>::Updater;
+    static_assert(
+        MmaCore::WarpCount::kM * MmaCore::WarpCount::kN *
+                MmaCore::WarpCount::kK ==
+            kNumWarpsPerBlock,
+        "");
+
+    // used for efficient load of bias tile Bij from global to shared memory
+    using BiasLoader = TileSmemLoader<
+        scalar_t,
+        cutlass::MatrixShape<kQueriesPerBlock, kKeysPerBlock>,
+        MmaCore::kThreads,
+        // input restriction: kv_len has to be a multiple of this value
+        128 / cutlass::sizeof_bits<scalar_t>::value>;
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MM1 {
+    /**
+      Second matmul: perform `attn @ V` where `attn` is the result from MM0
+      and stored in shared memory
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            scalar_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using LayoutB = cutlass::layout::RowMajor;
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        LayoutB, // LayoutB,
+        kAlignmentB,
+        output_accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            typename MM0::AccumulatorSharedStorage>;
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+    static_assert(
+        WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock,
+        "");
+
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_t>;
+    using OutputTileIteratorAccum =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_accum_t>;
+
+    struct SharedStorageMM1 {
+      typename Mma::SharedStorage mm;
+    };
+  };
+
+  static constexpr int64_t kAlignmentQ = MM0::kAlignmentA;
+  static constexpr int64_t kAlignmentK = MM0::kAlignmentB;
+  static constexpr int64_t kAlignmentV = 1;
+
+  // Shared storage - depends on kernel params
+  struct ScalingCoefs {};
+
+  struct SharedStorageEpilogueAtEnd : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      // For some reason union doesn't work. Need more debugging.
+      // volatile union {
+      typename MM0::BiasLoader::SmemTile bias;
+      typename MM0::AccumulatorSharedStorage si;
+      // };
+      typename MM1::SharedStorageMM1 mm1;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return epilogue;
+    }
+  };
+
+  struct SharedStorageEpilogueInLoop : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      // union {
+      typename MM0::BiasLoader::SmemTile bias;
+      typename MM0::AccumulatorSharedStorage si;
+      // };
+      typename MM1::SharedStorageMM1 mm1;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return after_mm0.epilogue;
+    }
+  };
+
+  using SharedStorage = typename cutlass::platform::conditional<
+      kSingleValueIteration || kKeepOutputInRF,
+      SharedStorageEpilogueAtEnd,
+      SharedStorageEpilogueInLoop>::type;
+
+  static bool __host__ check_supported(Params const& p) {
+    CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ);
+    CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK);
+    CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV);
+    XFORMERS_CHECK(
+        p.q_strideM % kAlignmentQ == 0, "query is not correctly aligned");
+    XFORMERS_CHECK(
+        p.k_strideM % kAlignmentK == 0, "key is not correctly aligned");
+    XFORMERS_CHECK(
+        p.v_strideM % kAlignmentV == 0, "value is not correctly aligned");
+    XFORMERS_CHECK(
+        p.q_strideH % kAlignmentQ == 0, "query is not correctly aligned");
+    XFORMERS_CHECK(
+        p.k_strideH % kAlignmentK == 0, "key is not correctly aligned");
+    XFORMERS_CHECK(
+        p.v_strideH % kAlignmentV == 0, "value is not correctly aligned");
+    return true;
+  }
+
+  static void CUTLASS_DEVICE attention_kernel(Params& p) {
+    // In this block, we will only ever:
+    // - read query[query_start:query_end, :]
+    // - write to output[query_start:query_end, :]
+
+    extern __shared__ char smem_buffer[];
+    SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
+    [[maybe_unused]] auto& si = shared_storage.after_mm0.si;
+
+    const uint32_t query_start = blockIdx.x * kQueriesPerBlock;
+    uint32_t key_start = (p.causal_type == Params::CausalType::LOWER_LEFT_EMPTY)
+        ? query_start
+        : 0;
+
+    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
+    typename MM1::Mma::FragmentC accum_o;
+    accum_o.clear();
+
+    auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator {
+      using OutputTileIterator = typename MM1::OutputTileIterator;
+      return OutputTileIterator(
+          typename OutputTileIterator::Params{(int32_t)p.o_strideM()},
+          p.output_ptr,
+          typename OutputTileIterator::TensorCoord{
+              p.num_queries, p.head_dim_value},
+          thread_id(),
+          {0, col});
+    };
+
+    auto createOutputAccumIter = [&](int col) ->
+        typename MM1::OutputTileIteratorAccum {
+          using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
+          return OutputTileIteratorAccum(
+              typename OutputTileIteratorAccum::Params{(int32_t)p.o_strideM()},
+              p.output_accum_ptr,
+              typename OutputTileIteratorAccum::TensorCoord{
+                  p.num_queries, p.head_dim_value},
+              thread_id(),
+              {0, col});
+        };
+
+    // Iterate through keys
+    for (int32_t iter_key_start = key_start; iter_key_start < p.num_keys;
+         iter_key_start += kKeysPerBlock) {
+      int32_t problem_size_0_m =
+          cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries);
+      int32_t problem_size_0_n = cutlass::fast_min(
+          int32_t(kKeysPerBlock), p.num_keys - iter_key_start);
+      int32_t const& problem_size_0_k = p.head_dim;
+      int32_t const& problem_size_1_n = p.head_dim_value;
+      int32_t const& problem_size_1_k = problem_size_0_n;
+
+      auto prologueV = [&](int blockN) {
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        MM1::Mma::prologue(
+            shared_storage.after_mm0.mm1.mm,
+            iterator_V,
+            thread_id(),
+            problem_size_1_k);
+      };
+
+      __syncthreads(); // Need to have shared memory initialized
+      //
+      // MATMUL: Q.K_t
+      //
+      // Computes the block-matrix product of:
+      // (a) query[query_start:query_end, :]
+      // with
+      // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
+      // and stores that into `shared_storage.si`
+      //
+
+      // Compute threadblock location
+      cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0};
+
+      cutlass::MatrixCoord tb_offset_A{
+          tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()};
+
+      cutlass::MatrixCoord tb_offset_B{
+          tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN};
+
+      // Construct iterators to A and B operands
+      typename MM0::IteratorA iterator_A(
+          typename MM0::IteratorA::Params(
+              typename MM0::MmaCore::LayoutA(p.q_strideM)),
+          p.query_ptr,
+          {problem_size_0_m, problem_size_0_k},
+          thread_id(),
+          tb_offset_A);
+
+      typename MM0::IteratorB iterator_B(
+          typename MM0::IteratorB::Params(
+              typename MM0::MmaCore::LayoutB(p.k_strideM)),
+          p.key_ptr + iter_key_start * p.k_strideM,
+          {problem_size_0_k, problem_size_0_n},
+          thread_id(),
+          tb_offset_B);
+
+      auto my_warp_id = warp_id();
+      auto my_lane_id = lane_id();
+
+      // Construct thread-scoped matrix multiply
+      typename MM0::Mma mma(
+          shared_storage.mm0, thread_id(), my_warp_id, my_lane_id);
+
+      typename MM0::Mma::FragmentC accum;
+
+      accum.clear();
+
+      auto gemm_k_iterations =
+          (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+      __syncthreads();
+
+      if (kPreloadV) {
+        prologueV(0);
+      }
+
+      typename MM0::Mma::Operator::IteratorC::TensorCoord
+          iteratorC_tile_offset = {
+              (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) +
+                  (my_warp_id % MM0::Mma::WarpCount::kM),
+              (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) +
+                  (my_warp_id / MM0::Mma::WarpCount::kM)};
+
+      // load bias tile Bij into shared memory
+      typename MM0::BiasLoader::GmemTileIterator bias_iter(
+          {cutlass::layout::RowMajor(p.bias_strideM)},
+          // attn_bias_pointer points to matrix of size (n_queries, n_keys)
+          // for the relevant batch_id and head_id
+          p.attn_bias_ptr + query_start * p.bias_strideM + iter_key_start,
+          {problem_size_0_m, problem_size_0_n},
+          thread_id());
+      cutlass::TensorRef<scalar_t, cutlass::layout::RowMajor> bias_tensor_ref(
+          shared_storage.after_mm0.bias.data(),
+          cutlass::layout::RowMajor(MM0::ThreadblockShape::kN));
+      typename MM0::BiasLoader::SmemTileIterator smem_tile_iter(
+          bias_tensor_ref, thread_id());
+      if (p.attn_bias_ptr != nullptr) {
+        MM0::BiasLoader::load(bias_iter, smem_tile_iter);
+      }
+
+      // apply scale, attention bias, activation_scale if applicable
+      // Pij += Bij, Pij is in register fragment and Bij is in shared memory
+      auto lane_offset = MM0::ScalingCoefsUpdater::get_lane_offset(
+          lane_id(), warp_id(), iteratorC_tile_offset);
+      MM0::ScalingCoefsUpdater::iterateRows(
+          lane_offset,
+          [&](int accum_m) {},
+          [&](int accum_m, int accum_n, int idx) {
+            if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) {
+              int x = accum_m + query_start;
+              int y = accum_n + iter_key_start;
+              accum[idx] = accum[idx] * p.scale;
+              if (p.attn_bias_ptr != nullptr) {
+                accum[idx] = accum[idx] +
+                    bias_tensor_ref.at(
+                        {(p.bias_strideM == 0 ? 0 : accum_m), accum_n});
+              }
+              accum[idx] = ActivationFunctor<accum_t>()(accum[idx]) *
+                  (accum_t)(p.activation_scale);
+            }
+          },
+          [&](int accum_m) {});
+
+      // Mask out last if causal
+      //      if (p.causal && p.num_keys - iter_key_start <= kKeysPerBlock) {
+      if (p.causal_type != Params::CausalType::NO_CAUSAL) {
+        int32_t last_col;
+        MM0::ScalingCoefsUpdater::iterateRows(
+            lane_offset,
+            [&](int accum_m) {
+              last_col = query_start + accum_m - iter_key_start;
+            },
+            [&](int accum_m, int accum_n, int idx) {
+              switch (p.causal_type) {
+                case Params::CausalType::UPPER_RIGHT_EMPTY:
+                  if (accum_n > last_col && accum_m < problem_size_0_m &&
+                      accum_n < problem_size_0_n) {
+                    accum[idx] = accum_t(0);
+                  }
+                  break;
+                case Params::CausalType::LOWER_LEFT_EMPTY:
+                  if (accum_n < last_col && accum_m < problem_size_0_m) {
+                    accum[idx] = accum_t(0);
+                  }
+                  break;
+              }
+            },
+            [&](int accum_m) {});
+      }
+
+      // Output results to shared-memory
+      int warp_idx_mn_0 = my_warp_id %
+          (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
+      auto output_tile_coords = cutlass::MatrixCoord{
+          warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
+          warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
+
+      MM0::B2bGemm::accumToSmem(
+          shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords);
+
+      __syncthreads();
+
+      //
+      // MATMUL: Attn . V
+      // Run the matmul `attn @ V` for a block of attn and V.
+      // `attn` is read from shared memory (in `shared_storage_si`)
+      // `V` is read from global memory (with iterator_B)
+      //
+
+      const int64_t nBlockN = kSingleValueIteration
+          ? 1
+          : ceil_div(
+                (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN));
+      for (int blockN = 0; blockN < nBlockN; ++blockN) {
+        int gemm_k_iterations =
+            (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK;
+
+        // Compute threadblock-scoped matrix multiply-add and store it in accum
+        // (in registers)
+        if (!kPreloadV) {
+          __syncthreads(); // we share shmem between mma and epilogue
+        }
+
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        typename MM1::Mma mma_pv(
+            shared_storage.after_mm0.mm1.mm,
+            shared_storage.after_mm0.si,
+            (int)thread_id(),
+            (int)warp_id(),
+            (int)lane_id(),
+            (int)problem_size_1_k);
+        mma_pv.set_prologue_done(kPreloadV);
+        if (!kKeepOutputInRF) {
+          accum_o.clear();
+        }
+        mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
+        __syncthreads();
+
+        if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) {
+          prologueV(blockN + 1);
+        }
+
+        if (!kKeepOutputInRF) {
+          DISPATCH_BOOL(
+              iter_key_start == key_start, kIsFirst, ([&] {
+                DISPATCH_BOOL(
+                    (iter_key_start + kKeysPerBlock) >= p.num_keys,
+                    kIsLast,
+                    ([&] {
+                      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+                      using DefaultOp =
+                          typename MM1::DefaultConfig::EpilogueOutputOp;
+                      using ElementCompute = typename DefaultOp::ElementCompute;
+                      using EpilogueOutputOp =
+                          typename cutlass::epilogue::thread::LinearCombination<
+                              typename cutlass::platform::conditional<
+                                  kIsLast,
+                                  output_t,
+                                  output_accum_t>::type,
+                              DefaultOp::kCount,
+                              typename DefaultOp::ElementAccumulator,
+                              ElementCompute,
+                              (kIsFirst ? cutlass::epilogue::thread::ScaleType::
+                                              Nothing
+                                        : cutlass::epilogue::thread::ScaleType::
+                                              NoBetaScaling),
+                              cutlass::FloatRoundStyle::round_to_nearest,
+                              output_accum_t>;
+                      using Epilogue = typename cutlass::epilogue::threadblock::
+                          EpiloguePipelined<
+                              typename DefaultEpilogue::Shape,
+                              typename MM1::Mma::Operator,
+                              DefaultEpilogue::kPartitionsK,
+                              typename cutlass::platform::conditional<
+                                  kIsLast,
+                                  typename MM1::OutputTileIterator,
+                                  typename MM1::OutputTileIteratorAccum>::type,
+                              typename DefaultEpilogue::
+                                  AccumulatorFragmentIterator,
+                              typename DefaultEpilogue::WarpTileIterator,
+                              typename DefaultEpilogue::SharedLoadIterator,
+                              EpilogueOutputOp,
+                              typename DefaultEpilogue::Padding,
+                              DefaultEpilogue::kFragmentsPerIteration,
+                              true, // IterationsUnroll
+                              typename MM1::OutputTileIteratorAccum // Read
+                                                                    // iterator
+                              >;
+                      int col = blockN * MM1::Mma::Shape::kN;
+                      auto source_iter = createOutputAccumIter(col);
+                      auto dest_iter = call_conditional<
+                          kIsLast,
+                          decltype(createOutputIter),
+                          decltype(createOutputAccumIter)>::
+                          apply(createOutputIter, createOutputAccumIter, col);
+                      EpilogueOutputOp epilogue_op({});
+                      Epilogue epilogue(
+                          shared_storage.epilogue_shared_storage(),
+                          thread_id(),
+                          warp_id(),
+                          lane_id());
+                      epilogue(epilogue_op, dest_iter, accum_o, source_iter);
+                    }));
+              }));
+          if (!kSingleValueIteration) {
+            __syncthreads();
+          }
+        }
+      }
+      __syncthreads();
+    }
+
+    if (kKeepOutputInRF) {
+      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+      using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+      using ElementCompute = typename DefaultOp::ElementCompute;
+      auto dest_iter = createOutputIter(0);
+      DefaultOp epilogue_op({});
+      DefaultEpilogue epilogue(
+          shared_storage.epilogue_shared_storage(),
+          thread_id(),
+          warp_id(),
+          lane_id());
+      epilogue(epilogue_op, dest_iter, accum_o);
+    }
+  }
+
+  static CUTLASS_DEVICE int8_t lane_id() {
+    return threadIdx.x;
+  }
+  static CUTLASS_DEVICE int8_t warp_id() {
+    return threadIdx.y;
+  }
+  static CUTLASS_DEVICE int16_t thread_id() {
+    return threadIdx.x + threadIdx.y * blockDim.x;
+  }
+};
+
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_batched_impl(typename AK::Params p) {
+  if (!p.advance_to_block()) {
+    return;
+  }
+  AK::attention_kernel(p);
+}
diff --git a/static/include/kernels/fmha_style_b2b_bmm/mma_from_smem.h b/static/include/kernels/fmha_style_b2b_bmm/mma_from_smem.h
new file mode 100644
index 000000000..366420278
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/mma_from_smem.h
@@ -0,0 +1,1691 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    Mostly copied from
+    http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+
+#include "fmha_style_b2b_bmm/attention_scaling_coefs_updater.h"
+#include "fmha_style_b2b_bmm/gemm_kernel_utils.h"
+#include "fmha_style_b2b_bmm/iterators/make_residual_last.h"
+#include "fmha_style_b2b_bmm/iterators/transpose_warp_iterator.h"
+#include "fmha_style_b2b_bmm/iterators/warp_iterator_from_smem.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/// Shared storage object needed by accumulator
+/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+template <
+    typename Shape_,
+    typename Element_,
+    typename Layout_,
+    typename Padding_>
+class AccumulatorSharedStorage {
+ public:
+  //
+  // Type definitions
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using Padding = Padding_;
+
+  /// Tensor reference to the accumulator
+  using TensorRefAccum = cutlass::TensorRef<Element, Layout>;
+
+  /// Shape of the accumulator matrix in shared memory
+  using ShapeAccum = cutlass::
+      MatrixShape<Shape::kM + Padding::kRow, Shape::kN + Padding::kColumn>;
+
+ public:
+  //
+  // Data members
+  //
+
+  /// Buffer for accumulator
+  cutlass::AlignedBuffer<Element, ShapeAccum::kCount> accum;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Returns a layout object for the Accum matrix
+  CUTLASS_DEVICE
+  static Layout LayoutAccum() {
+    return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn});
+  }
+
+  /// Returns a TensorRef to the Accumulator
+  CUTLASS_HOST_DEVICE
+  TensorRefAccum accum_ref() {
+    return TensorRefAccum{accum.data(), LayoutAccum()};
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // Maximum value for K
+    int kMaxK,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBaseFromSharedMemory {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<
+      Shape::kM / WarpGemm::kM,
+      Shape::kN / WarpGemm::kN,
+      Shape::kK / WarpGemm::kK>;
+  using WarpCount1 = WarpCount;
+
+  /// Number of warp-level GEMM oeprations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+  static int const kWarpGemmIterations1 = kWarpGemmIterations;
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// If this is true, we fill the entire shmem buffer at start
+  /// and don't need to iterate through it in a circular fashion
+  static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA =
+      TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB =
+      TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB = MatrixShape<
+        Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+        Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  // /// Iterator to load a warp-scoped tile of A operand from shared memory
+  // typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaBaseFromSharedMemory(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage& shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // BEGIN smem
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA,
+    // Accumulator type
+    typename AccumulatorSharedStorage,
+    // END smem
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to B operand
+    typename TransformB_ = NumericArrayConverter<
+        typename SmemIteratorB_::Element,
+        typename IteratorB_::Element,
+        IteratorB_::Fragment::kElements>,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
+                                         Shape_,
+                                         AccumulatorSharedStorage::Shape::kN,
+                                         Policy_,
+                                         2> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<
+      Shape_,
+      AccumulatorSharedStorage::Shape::kN,
+      Policy_,
+      2>;
+
+  using Shape =
+      Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorB =
+      IteratorB_; ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_; ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+  using Policy = Policy_; ///< Policy describing tuning details
+
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert(
+      (Base::kStages == 2),
+      "MmaPipelined requires kStages set to value 2");
+
+ private:
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ protected:
+  // /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  // SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  /// Iterator to load a warp-scoped tile of A operand from intermediate
+  /// accumulator tile
+  WarpIteratorA warp_tile_iterator_A_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaPipelinedFromSharedMemory(
+      typename Base::SharedStorage&
+          shared_storage, ///< Shared storage needed for internal use by
+                          ///< threadblock-scoped GEMM
+      AccumulatorSharedStorage& accumulator_shared_storage,
+      int thread_idx, ///< ID within the threadblock
+      int warp_idx, ///< ID of warp
+      int lane_idx, ///< ID of each thread within a warp
+      int problem_size_0_n)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A_(accumulator_shared_storage.accum_ref(), lane_idx),
+        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  // For API compatibility with MmaMultistageFromSharedMemory
+  // but not supported as it worsens perf: older gpus < sm80 don't
+  // support async tranfers and have to waste registers
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {}
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {}
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      int gemm_k_iterations, ///< number of iterations of the mainloop
+      FragmentC& accum, ///< destination accumulator tile
+      // IteratorA iterator_A,                             ///< iterator over A
+      // operand in global memory
+      IteratorB iterator_B, ///< iterator over B operand in global memory
+      FragmentC const& src_accum, ///< source accumulator tile
+      // TransformA transform_A = TransformA(),            ///< transformation
+      // applied to A fragment
+      TransformB transform_B =
+          TransformB()) { ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentB tb_frag_B;
+
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_B.set_residual_tile(gemm_k_iterations == 1);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_B;
+
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+    warp_frag_A[0].clear();
+    warp_frag_B[0].clear();
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_B.set_residual_tile(gemm_k_iterations == 2);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER*
+    // issuing shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        bool hasNext = true;
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          // Write fragments to shared memory
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory SMEM: Don't reset iterator A, as
+          // we are continuing our iteration at this point
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          } else {
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+          hasNext = gemm_k_iterations > 1;
+        }
+
+        // Only read the next if we need to
+        if (hasNext) {
+          this->warp_tile_iterator_B_.set_kgroup_index(
+              (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+          this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+          ++this->warp_tile_iterator_A_;
+          ++this->warp_tile_iterator_B_;
+
+          if (warp_mma_k == 0) {
+            iterator_B.load(tb_frag_B);
+
+            ++iterator_B;
+
+            // Avoid reading out of bounds if this was the last loop iteration
+            iterator_B.set_residual_tile(gemm_k_iterations == 3);
+            iterator_B.clear_mask(gemm_k_iterations <= 2);
+          }
+        }
+
+        warp_mma(
+            accum,
+            warp_frag_A[warp_mma_k % 2],
+            warp_frag_B[warp_mma_k % 2],
+            accum);
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA1_,
+    // Accumulator type
+    typename AccumulatorSharedStorage,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages_,
+    int kMaxK_,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMultistageFromSharedMemory
+    : public MmaBaseFromSharedMemory<Shape1_, kMaxK_, Policy1_, Stages_> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<Shape1_, kMaxK_, Policy1_, Stages_>;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  using IteratorB = IteratorB1;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate
+                                          ///< accumulator tile in shared memory
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+  static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+  using FragmentC = FragmentC1;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+    static_assert(
+        Base::kWarpGemmIterations1 > 1,
+        "The pipelined structure requires at least two warp-level "
+        "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLoadIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) /
+        Base::kWarpGemmIterations1;
+  };
+
+  static constexpr int kNumStagesConcurrentLoad =
+      kSmemContainsEntireB ? Base::kStages : Base::kStages - 1;
+
+ private:
+  using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate
+  /// accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+  bool prologue_done_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaMultistageFromSharedMemory(
+      typename Base::SharedStorage&
+          shared_storage, ///< Shared storage needed for internal use by
+                          ///< threadblock-scoped GEMM
+      AccumulatorSharedStorage& accumulator_shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx,
+      ///< GEMM0 N is used for accumulator extent
+      int problem_size_0_n)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A1_(
+            accumulator_shared_storage.accum_ref(),
+            lane_idx),
+        smem_iterator_B1_(shared_storage.operand_B_ref(), thread_idx),
+        prologue_done_(false) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn_1 =
+        warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
+    int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
+
+    int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
+    int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    warp_tile_iterator_A1_.add_tile_offset(
+        {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
+  }
+
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {
+    prologue_done_ = value;
+  }
+
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {
+    SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx);
+    _prologue(
+        iterator_B1,
+        (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK,
+        smem_iterator_B1);
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(
+      IteratorB1& iterator_B1,
+      int group_start_B1 = 0) {
+    iterator_B1.set_iteration_index(
+        group_start_B1 * IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    // Load for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::TBLoadIterationsB1) {
+        typename IteratorB1::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType*>(
+                this->smem_iterator_B1_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+            IteratorB1::ThreadMap::kElementsPerAccess /
+            IteratorB1::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B1.get();
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  static void _prologue(
+      IteratorB& iterator_B1,
+      int32_t gemm_k_iterations_1,
+      SmemIteratorB1& smem_iterator_B1_) {
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < kNumStagesConcurrentLoad;
+         ++stage, --gemm_k_iterations_1) {
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      iterator_B1.set_iteration_index(0);
+      smem_iterator_B1_.set_iteration_index(0);
+
+      // Load for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
+        typename IteratorB1::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType*>(
+                smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB1::Element>::value *
+              IteratorB1::ThreadMap::kElementsPerAccess /
+              IteratorB1::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+
+        ++smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+
+      smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+    iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_1_,
+      ///< destination accumulator tile
+      FragmentC1& accum,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC1 const& src_accum) {
+    // 2nd Gemm
+
+    //
+    // Prologue
+    //
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    if (!prologue_done_) {
+      _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_);
+    } else if (!kSmemContainsEntireB) {
+      // Restore the iterators increments
+
+      int gemm_k_iterations_1 = gemm_k_iterations_1_;
+      // Issue several complete stages
+      CUTLASS_PRAGMA_UNROLL
+      for (int stage = 0; stage < kNumStagesConcurrentLoad;
+           ++stage, --gemm_k_iterations_1) {
+        iterator_B1.set_iteration_index(0);
+        this->smem_iterator_B1_.set_iteration_index(0);
+
+        // Load for operand B
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+            ++iterator_B1;
+          }
+          ++this->smem_iterator_B1_;
+        }
+        iterator_B1.add_tile_offset({1, 0});
+        this->smem_iterator_B1_.add_tile_offset({1, 0});
+      }
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 <= 0);
+    }
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B_;
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma1.transform(
+        warp_transformed_frag_A1[0],
+        warp_transformed_frag_B1[0],
+        warp_loaded_frag_A1[0],
+        warp_loaded_frag_B1[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC1> plus_accum;
+
+    FragmentC1 tmp_accum;
+
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1);
+         gemm_k_iterations_1 > (-Base::kStages + 1);
+         gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+        // Load warp-level tile from accumulator fragment (A)
+        // or shared memory (operand B)
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        // skip warp tile loading for the last kgroup (we are out of the buf)
+        if (gemm_k_iterations_1 > (-Base::kStages + 2) ||
+            warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          warp_tile_iterator_A1_.load(
+              warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        }
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              warp_loaded_frag_A1[warp_mma_k % 2],
+              warp_loaded_frag_B1[warp_mma_k % 2]);
+
+        if (platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddFastF32>::value ||
+            platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddComplexFastF32>::value) {
+          warp_mma1(
+              tmp_accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              tmp_accum);
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma1(
+              accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              accum);
+        }
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (!kSmemContainsEntireB) {
+            if (smem_write_stage_idx == (Base::kStages - 1)) {
+              this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+              smem_write_stage_idx = 0;
+            } else {
+              ++smem_write_stage_idx;
+            }
+
+            if (smem_read_stage_idx == (Base::kStages - 1)) {
+              this->warp_tile_iterator_B_.add_tile_offset(
+                  {-Base::kStages * Policy1::kPartitionsK *
+                       Base::kWarpGemmIterations1,
+                   0});
+              smem_read_stage_idx = 0;
+            } else {
+              ++smem_read_stage_idx;
+            }
+          }
+
+          iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2);
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+              warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+    }
+
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+  }
+};
+
+template <
+    typename WarpShape,
+    typename InstructionShape,
+    typename RegularWarpIterator,
+    typename Policy,
+    typename Enable = void>
+struct DefaultWarpIteratorAFromSharedMemory {};
+
+// TensorOp - Ampere half
+template <typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    RegularWarpIterator,
+    Policy,
+    typename platform::enable_if<(
+        sizeof_bits<typename RegularWarpIterator::Element>::value == 16 &&
+        Policy::Operator::Policy::OpDelta::kRow == 1)>::type> {
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+  using WarpShape = cutlass::MatrixShape<32, 32>;
+
+  using WarpIterator = cutlass::gemm::warp::WarpIteratorFromSmem<
+      cutlass::gemm::Operand::kA,
+      typename RegularWarpIterator::Element>;
+};
+
+// TensorOp - Ampere half
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    RegularWarpIterator,
+    Policy,
+    typename platform::enable_if<(
+        sizeof_bits<typename RegularWarpIterator::Element>::value == 16 &&
+        Policy::Operator::Policy::OpDelta::kRow == 1)>::type> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator<
+          cutlass::MatrixShape<WarpShape::kM, WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajor,
+          cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// TensorOp - Ampere f32
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    RegularWarpIterator,
+    Policy,
+    typename platform::enable_if<(
+        sizeof_bits<typename RegularWarpIterator::Element>::value != 16 ||
+        Policy::Operator::Policy::OpDelta::kRow != 1)>::type> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator<
+          cutlass::MatrixShape<WarpShape::kM, WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajor,
+          cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// TensorOp - Volta
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 16, 4>,
+    RegularWarpIterator,
+    Policy> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>;
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator<
+          cutlass::MatrixShape<32, 32>, // MatrixShape<WarpShape::kM,
+                                        // WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>,
+          cutlass::MatrixShape<16, 4>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// Simt
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    RegularWarpIterator,
+    Policy> {
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  static constexpr auto kWarpSize = 32;
+
+  // We just use the same iterator, as we reproduced the same shared-memory
+  // schema. Just modify it to handle non-complete tiles.
+  using WarpIterator = RegularWarpIterator;
+};
+
+// Converts a "regular" Mma into their counterpart from shared memory
+template <
+    typename Mma_,
+    typename AccumulatorSharedStorage,
+    bool kTransposeA = false>
+struct DefaultMmaFromSharedMemory;
+
+// Mma pipelined
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to A operand
+    typename TransformA_,
+    /// Transformation applied to B operand
+    typename TransformB_,
+    typename AccumulatorSharedStorage_,
+    bool kTransposeA>
+struct DefaultMmaFromSharedMemory<
+    MmaPipelined<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        IteratorB_,
+        SmemIteratorB_,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        TransformA_,
+        TransformB_>,
+    AccumulatorSharedStorage_,
+    kTransposeA> {
+  static constexpr int kWarpSize = 32;
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+
+  using RegularMma = MmaPipelined<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      IteratorB_,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      TransformA_,
+      TransformB_>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using ArchMmaOperator = typename Policy_::Operator;
+
+  static constexpr bool kIsTransposedA = false;
+  using WarpIteratorA = typename DefaultWarpIteratorAFromSharedMemory<
+      WarpShape,
+      InstructionShape,
+      typename RegularMma::Operator::IteratorA,
+      Policy_>::WarpIterator;
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+
+  using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory<
+      Shape_,
+      WarpIteratorA,
+      AccumulatorSharedStorage_,
+      IteratorB,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_>;
+};
+
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    typename AccumulatorSharedStorage_,
+    bool kTransposeA>
+struct DefaultMmaFromSharedMemory<
+    MmaMultistage<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        CacheOpA,
+        IteratorB_,
+        SmemIteratorB_,
+        CacheOpB,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        Stages,
+        SharedMemoryClear>,
+    AccumulatorSharedStorage_,
+    kTransposeA> {
+  static constexpr int kWarpSize = 32;
+
+  using RegularMma = MmaMultistage<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      CacheOpA,
+      IteratorB_,
+      SmemIteratorB_,
+      CacheOpB,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      Stages,
+      SharedMemoryClear>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using WarpIteratorA_ = typename DefaultWarpIteratorAFromSharedMemory<
+      WarpShape,
+      InstructionShape,
+      typename RegularMma::Operator::IteratorA,
+      Policy_>::WarpIterator;
+  using WarpIteratorTranspose = TransposeWarpIterator<WarpIteratorA_>;
+  static constexpr bool kIsTransposedA =
+      WarpIteratorTranspose::kSupportsTranspose && kTransposeA;
+  using WarpIteratorA = typename platform::conditional<
+      kIsTransposedA,
+      typename WarpIteratorTranspose::Iterator,
+      WarpIteratorA_>::type;
+
+  static int constexpr kMaxK = kIsTransposedA
+      ? AccumulatorSharedStorage_::Shape::kM
+      : AccumulatorSharedStorage_::Shape::kN;
+  // Reduce the number of stages if we don't need that many
+  static int constexpr kStagesMax =
+      (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK);
+  static int constexpr kStages = cutlass::const_min(Stages, kStagesMax);
+
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+  using Mma =
+      typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory<
+          Shape_,
+          WarpIteratorA,
+          AccumulatorSharedStorage_,
+          IteratorB,
+          SmemIteratorB_,
+          RegularMma::kCacheOpB,
+          ElementC_,
+          LayoutC_,
+          Policy_,
+          kStages,
+          kMaxK>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename IteratorC,
+    typename Operator,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm;
+
+// Tensor Cores >= Sm75 specialization (Ampere ...)
+template < /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_,
+    typename Operator,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        Shape_,
+        Element_,
+        Layout_,
+        InstructionShape_,
+        OpDelta_>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          Shape_,
+          Element_,
+          Layout_,
+          InstructionShape_,
+          OpDelta_>;
+  using FragmentC = typename IteratorC::Fragment;
+  using InstructionShape = InstructionShape_;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using accum_t = Element_;
+
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+
+  // Iterator to load accumulators (results of matmul in registers)
+  using FragmentIteratorAccumulator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape,
+          InstructionShape,
+          accum_t,
+          typename Operator::Policy::Operator::FragmentC,
+          cutlass::layout::RowMajor>;
+
+  // Iterator to store to shared-memory
+  using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp<
+      WarpShape,
+      InstructionShape,
+      scalar_t, // accum_t,
+      SmemAccumulatorLayout>;
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          typename SmemIteratorD0::Element,
+          typename SmemIteratorD0::TensorLayout,
+          typename SmemIteratorD0::Padding>;
+  // We need to provide an operation for the epilogue. Let's create an
+  // operation that does nothing (ScaleType::Nothing), just converts
+  // from accum_t (float) -> scalar_t (can be half)
+  using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination<
+      typename SmemIteratorD0::Element, // ElementOutput
+      FragmentIteratorAccumulator::Fragment::kElements,
+      accum_t, // ElementAccumulator
+      typename SmemIteratorD0::Element, // ElementCompute
+      cutlass::epilogue::thread::ScaleType::Nothing>;
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator<
+      SmemIteratorD0,
+      FragmentIteratorAccumulator,
+      SmemIteratorD0, // ScaleBiasIterator - not used
+      OutputOpNoOp>;
+
+  static int const kElementsPerAccess = 2; // TODO: Why 2?
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id);
+    smem_iterator_attn.add_tile_offset(
+        tile_coords *
+        cutlass::MatrixCoord{
+            SmemIteratorD0::TileIterations::kRow,
+            SmemIteratorD0::TileIterations::kColumn});
+    Epilogue epilogue;
+    epilogue(OutputOpNoOp({}), smem_iterator_attn, accum);
+  }
+};
+
+// Volta Specialization
+// only supported for f16
+template <typename Operator, typename WarpShape_, typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        float,
+        cutlass::layout::RowMajor,
+        cutlass::gemm::GemmShape<16, 16, 4>,
+        cutlass::MatrixShape<1, 1>>,
+    Operator,
+    cutlass::half_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          cutlass::MatrixShape<32, 32>,
+          float,
+          cutlass::layout::RowMajor,
+          cutlass::gemm::GemmShape<16, 16, 4>,
+          cutlass::MatrixShape<1, 1>>;
+  using scalar_t = cutlass::half_t;
+  using accum_t = IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = IteratorC::Fragment;
+
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using SmemIteratorD0 = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
+      WarpShape,
+      cutlass::gemm::GemmShape<32, 32, 4>,
+      scalar_t,
+      SmemAccumulatorLayout>;
+
+  // // Storage in shared-memory for Q.Kt
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+              16,
+              32>, // typename SmemIteratorD0::TensorLayout,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+
+  using OutputLayout =
+      cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>;
+  using TensorRef = cutlass::TensorRef<scalar_t, OutputLayout>;
+  using Policy = typename IteratorC::Policy;
+  using Element = accum_t;
+  // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields
+  // Let's copy their values
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // ctor - from MmaVoltaTensorOpAccumulatorTileIterator
+    TensorRef ref_(shared_storage.accum_ref());
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n;
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    cutlass::MatrixCoord lane_offset(accum_m, accum_n);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    using AccessType = cutlass::Array<scalar_t, EleShapePerPatial::kColumn>;
+
+    // store - from MmaVoltaTensorOpAccumulatorTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+            int mma_accum_start =
+                (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                      Policy::MmaIterations::kColumn +
+                  mma_n) *
+                     Policy::MmaIterations::kRow +
+                 mma_m) *
+                kElementsPerMma;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int p = 0; p < kAccumulatorPatials; ++p) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+                int accum_m = tile_m * Policy::InterleavedTile::kRow +
+                    mma_m * QuadShapePerPatialMma::kRow + m * 2;
+                int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                    mma_n * QuadShapePerPatialMma::kColumn +
+                    p * Policy::InterleavedTile::kColumn / 2;
+                int r = (accum_m + lane_offset.row());
+                AccessType to_store;
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  int c = (accum_n + n + lane_offset.column());
+                  to_store[n] = scalar_t(accum[idx]);
+                }
+                int c = (accum_n + lane_offset.column());
+                assert(r < 32);
+                assert(c < 32);
+                *reinterpret_cast<AccessType*>(
+                    ref_.data() + ref_.offset({r, c})) = to_store;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Simt Specialization
+// for f32 on Sm70-Sm75 and f16/f32 below
+
+template <
+    typename Operator,
+    typename OperatorPolicy,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        cutlass::gemm::Operand::kC,
+        float,
+        cutlass::layout::RowMajor,
+        OperatorPolicy,
+        1,
+        1>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator<
+      cutlass::MatrixShape<32, 32>,
+      cutlass::gemm::Operand::kC,
+      float,
+      cutlass::layout::RowMajor,
+      OperatorPolicy,
+      1,
+      1>;
+  using accum_t = typename IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = typename IteratorC::Fragment;
+
+  // Storage in shared-memory for Q.Kt
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          cutlass::layout::ColumnMajor,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    using Policy = typename IteratorC::Policy;
+    using Element = typename IteratorC::Element;
+    using Iterations = typename IteratorC::Iterations;
+    using Delta = typename IteratorC::Delta;
+
+    auto ref_ = shared_storage.accum_ref();
+    // ctor - MmaSimtTileIterator
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
+
+    ref_.add_coord_offset(lane_offset);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    // store - MmaSimtTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+            int r =
+                Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) +
+                m;
+            int c = mma_n * Delta::kColumn + n;
+            int idx = n +
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            ref_.at({r, c}) = scalar_t(accum[idx]);
+          }
+        }
+      }
+    }
+  }
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/fmha_style_b2b_bmm/transform/tile_smem_loader.h b/static/include/kernels/fmha_style_b2b_bmm/transform/tile_smem_loader.h
new file mode 100644
index 000000000..5faded20f
--- /dev/null
+++ b/static/include/kernels/fmha_style_b2b_bmm/transform/tile_smem_loader.h
@@ -0,0 +1,94 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+ * Copied from
+ * http://github.com/NVIDIA/cutlass/tree/master/examples/41_fused_multi_head_attention
+ */
+
+#include <cutlass/cutlass.h>
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+template <
+    typename scalar_t, // scalar type
+    typename ThreadblockTileShape, // size of tile to load
+    int Threads, // number of participating threads
+    int ElementsPerAccess> // thread access width in elements
+class TileSmemLoader {
+ public:
+  using SmemTile =
+      cutlass::AlignedBuffer<scalar_t, ThreadblockTileShape::kCount>;
+
+  using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
+      cutlass::layout::PitchLinearShape<
+          ThreadblockTileShape::kColumn, // contiguous
+          ThreadblockTileShape::kRow>, // strided
+      Threads, // Threads
+      ElementsPerAccess>; // ElementsPerAccess
+
+  using GmemTileIterator =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          ThreadblockTileShape, // Shape
+          scalar_t, // Element
+          cutlass::layout::RowMajor, // Layout
+          0, // AdvanceRank
+          ThreadMap>; // ThreadMap
+
+  using SmemTileIterator = cutlass::transform::threadblock::RegularTileIterator<
+      ThreadblockTileShape, // Shape
+      scalar_t, // Element
+      cutlass::layout::RowMajor, // Layout
+      0, // AdvanceRank
+      ThreadMap>; // ThreadMap
+
+  using Fragment = typename GmemTileIterator::Fragment;
+
+  /// load a tile from global memory into shared memory
+  CUTLASS_DEVICE
+  static void load(
+      GmemTileIterator tile_load_iter,
+      SmemTileIterator tile_store_iter) {
+    Fragment tb_frag;
+    tb_frag.clear();
+    tile_load_iter.load(tb_frag);
+    tile_store_iter.store(tb_frag);
+
+    __syncthreads();
+  }
+};

From e77881862eed84a636d86eac2fec22e1f9742c7b Mon Sep 17 00:00:00 2001
From: Amir Shimoni <amirshim@fb.com>
Date: Wed, 22 Mar 2023 20:07:47 -0700
Subject: [PATCH 323/638] Revert D44106679: stable diffusion unet ait converter

Differential Revision:
D44106679

Original commit changeset: 17fd22c012bb

Original Phabricator Diff: D44106679

fbshipit-source-id: 9655726b14827b8485ea9447051392f0f46a4e57
---
 fx2ait/fx2ait/tools/common_fx2ait.py          |   4 -
 python/aitemplate/frontend/nn/ldm/__init__.py |  17 -
 .../aitemplate/frontend/nn/ldm/attention.py   | 105 ---
 python/aitemplate/frontend/nn/ldm/clip.py     | 628 ---------------
 .../aitemplate/frontend/nn/ldm/embeddings.py  | 101 ---
 python/aitemplate/frontend/nn/ldm/resnet.py   | 238 ------
 .../frontend/nn/ldm/unet_2d_condition.py      | 255 ------
 .../aitemplate/frontend/nn/ldm/unet_blocks.py | 762 ------------------
 python/aitemplate/frontend/nn/ldm/vae.py      | 153 ----
 9 files changed, 2263 deletions(-)
 delete mode 100644 python/aitemplate/frontend/nn/ldm/__init__.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/attention.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/clip.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/embeddings.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/resnet.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/unet_blocks.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/vae.py

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index b9aeb8009..90d18fcd3 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -97,7 +97,6 @@ def run_test(
             leaf_module_list.append(leaf_module)
 
         orig_mod = copy.deepcopy(mod)
-        orig_mod.eval()
         mod = acc_tracer.trace(
             mod,
             inputs,
@@ -111,9 +110,6 @@ def run_test(
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
             inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
-
-        mod.half()
-        inputs = [inp.half().contiguous() for inp in inputs]
         interp = AITInterpreter(
             mod,
             inputs,
diff --git a/python/aitemplate/frontend/nn/ldm/__init__.py b/python/aitemplate/frontend/nn/ldm/__init__.py
deleted file mode 100644
index b14195e81..000000000
--- a/python/aitemplate/frontend/nn/ldm/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-# flake8: noqa
-
-from aitemplate.frontend.nn.ldm.unet_2d_condition import UNet2DConditionModel
diff --git a/python/aitemplate/frontend/nn/ldm/attention.py b/python/aitemplate/frontend/nn/ldm/attention.py
deleted file mode 100644
index 14993e6d9..000000000
--- a/python/aitemplate/frontend/nn/ldm/attention.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
-"""
-
-from typing import Optional
-
-from aitemplate.compiler.ops import reshape
-
-from aitemplate.frontend import nn, Tensor
-
-
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
-    to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    Uses three q, k, v linear layers to compute attention.
-    Parameters:
-        batch_size (:obj:`int`): The number of examples per batch.
-        height (:obj:`int`): Height of each image example.
-        width (:obj:`int`): Width of each image example.
-        channels (:obj:`int`): The number of channels in the input and output.
-        num_head_channels (:obj:`int`, *optional*):
-            The number of channels in each head. If None, then `num_heads` = 1.
-        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
-        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
-    """
-
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        channels: int,
-        num_head_channels: Optional[int] = None,
-        num_groups: int = 32,
-        rescale_output_factor: float = 1.0,
-        eps: float = 1e-5,
-    ):
-        super().__init__()
-        self.batch_size = batch_size
-        self.height = height
-        self.width = width
-        self.channels = channels
-        self.num_heads = (
-            channels // num_head_channels if num_head_channels is not None else 1
-        )
-        self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.MultiheadAttention(
-            channels,
-            batch_size,
-            height * width,
-            self.num_heads,
-            qkv_bias=True,
-            has_residual=True,
-            use_mem_eff=True,
-        )
-        self.rescale_output_factor = rescale_output_factor
-
-    def forward(self, hidden_states) -> Tensor:
-        """
-        input hidden_states shape: [batch, height, width, channel]
-        output shape: [batch, height, width, channel]
-        """
-        residual = hidden_states
-
-        # norm
-        hidden_states = self.group_norm(hidden_states)
-
-        hidden_states = reshape()(
-            hidden_states, [self.batch_size, self.height * self.width, self.channels]
-        )
-
-        batch, hw, channel = hidden_states.shape()
-        if (
-            batch.value() != self.batch_size
-            or hw.value() != self.width * self.height
-            or channel.value() != self.channels
-        ):
-            raise RuntimeError(
-                "nchw params do not match! "
-                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
-                f"actual: {batch}, {channel}, {hw}."
-            )
-
-        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
-        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
-
-        return res
diff --git a/python/aitemplate/frontend/nn/ldm/clip.py b/python/aitemplate/frontend/nn/ldm/clip.py
deleted file mode 100644
index 1a95314d4..000000000
--- a/python/aitemplate/frontend/nn/ldm/clip.py
+++ /dev/null
@@ -1,628 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from inspect import isfunction
-from typing import Optional
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-
-# pylint: disable=W0102
-
-USE_CUDA = detect_target().name() == "cuda"
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-class CrossAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        dtype="float16",
-    ):
-        super().__init__()
-        inner_dim = dim_head * heads
-        context_dim = default(context_dim, query_dim)
-
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        self.dim_head = dim_head
-
-        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
-        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
-        )
-
-    def forward(self, x, context=None, mask=None, residual=None):
-        nheads = self.heads
-        d = self.dim_head
-
-        layout = "20314" if USE_CUDA else "m2n3"
-
-        bs, seqlen, _ = get_shape(x)
-        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
-        )
-        context = default(context, x)
-
-        seqlen = get_shape(context)[1]
-        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
-        )
-        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
-        )
-
-        if USE_CUDA:
-            attn_op = ops.mem_eff_attention(causal=False)
-            out = attn_op(
-                (ops.reshape()(q, [bs, nheads, -1, d])),
-                (ops.reshape()(k, [bs, nheads, -1, d])),
-                (ops.reshape()(v, [bs, nheads, -1, d])),
-            )
-        else:
-            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
-            out = OP(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
-                (ops.reshape()(v, [bs * nheads, -1, d])),
-            )
-        out = ops.reshape()(out, [bs, -1, nheads * d])
-        proj = self.to_out(out)
-        proj = ops.reshape()(proj, [bs, -1, nheads * d])
-        if residual is not None:
-            return proj + residual
-        else:
-            return proj
-
-
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
-        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
-
-    def forward(self, x):
-        return self.proj(x, self.gate(x))
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = (
-            nn.Sequential(
-                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
-            )
-            if not glu
-            else GEGLU(dim, inner_dim)
-        )
-
-        self.net = nn.Sequential(
-            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x, residual=None):
-        shape = ops.size()(x)
-        x = self.net(x)
-        x = ops.reshape()(x, shape)
-        if residual is not None:
-            return x + residual
-        else:
-            return x
-
-
-class BasicTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        dropout=0.0,
-        context_dim=None,
-        gated_ff=True,
-        checkpoint=True,
-    ):
-        super().__init__()
-        self.attn1 = CrossAttention(
-            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
-        )  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(
-            query_dim=dim,
-            context_dim=context_dim,
-            heads=n_heads,
-            dim_head=d_head,
-            dropout=dropout,
-        )
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
-
-        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
-
-    def forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), residual=x)
-        x = self.attn2(self.norm2(x), context=context, residual=x)
-        x = self.ff(self.norm3(x), residual=x)
-        return x
-
-
-def Normalize(in_channels):
-    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class SpatialTransformer(nn.Module):
-    """
-    Transformer block for image-like data.
-    First, project the input (aka embedding)
-    and reshape to b, t, d.
-    Then apply standard transformer action.
-    Finally, reshape to image
-    """
-
-    def __init__(
-        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)  # Group Norm
-
-        self.proj_in = nn.Conv2dBias(
-            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
-        )
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock(
-                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
-                )
-                for d in range(depth)
-            ]
-        )
-
-        self.proj_out = nn.Conv2dBias(
-            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def forward(self, x, context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = get_shape(x)
-        x_in = x
-        x = self.norm(x)
-        x = self.proj_in(x)
-        x = ops.reshape()(x, [b, -1, c])
-        for block in self.transformer_blocks:
-            x = block(x, context=context)
-        x = ops.reshape()(x, [b, h, w, c])
-        x = self.proj_out(x)
-        return x + x_in
-
-
-class CLIPAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        batch_size=1,
-        seq_len=16,
-        layer_norm_eps=1e-5,
-        hidden_dropout_prob=0.0,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        self.attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=hidden_dropout_prob,
-            has_residual=False,
-            causal=causal,
-            mask_seq=mask_seq,
-        )
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        residual: Optional[Tensor] = None,
-    ):
-        if residual is not None:
-            self_output = self.attn(hidden_states, residual)
-        else:
-            self_output = self.attn(hidden_states)
-        return self_output
-
-
-class QuickGELUActivation(nn.Module):
-    """
-    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
-    """
-
-    def forward(self, x):
-        x1 = x * 1.702
-        x1 = ops.sigmoid(x1)
-        x = x * x1
-        return x
-
-
-class CLIPMLP(nn.Module):
-    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
-
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        act_layer="GELU",
-        drop=0,
-    ):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-
-        self.fc1 = nn.Linear(
-            in_features,
-            hidden_features,
-            specialization="gelu",
-        )
-        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
-
-    def forward(self, x, res):
-        shape = get_shape(x)
-        x = self.fc1(x)
-        x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
-
-
-class CLIPMLPQuickGelu(nn.Module):
-    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
-
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-    ):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-
-        self.fc1 = nn.Linear(
-            in_features,
-            hidden_features,
-        )
-        self.activation_fn = QuickGELUActivation()
-
-        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
-
-    def forward(self, x, res):
-        shape = get_shape(x)
-        x = self.fc1(x)
-        x = self.activation_fn(x)
-        x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
-
-
-class CLIPEncoderLayer(nn.Module):
-    ACT_LAYER_TO_CLIP_MLP_MAP = {
-        "gelu": CLIPMLP,
-        "quick_gelu": CLIPMLPQuickGelu,
-    }
-
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        mlp_ratio=4.0,
-        batch_size=1,
-        seq_len=16,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.embed_dim = hidden_size
-        self.self_attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=0,
-            has_residual=True,
-            causal=causal,
-            mask_seq=mask_seq,
-            use_mem_eff=True,
-        )
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
-        self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
-            hidden_size, int(hidden_size * mlp_ratio)
-        )
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        output_attentions: Optional[bool] = False,
-    ):
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-                `(config.encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-
-        hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states, residual)
-
-        residual = hidden_states
-        hidden_states = self.layer_norm2(hidden_states)
-        hidden_states = self.mlp(hidden_states, residual)
-
-        return hidden_states
-
-
-class CLIPEncoder(nn.Module):
-    """
-    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
-    [`CLIPEncoderLayer`].
-    Args:
-        config: CLIPConfig
-    """
-
-    def __init__(
-        self,
-        num_hidden_layers=12,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        hidden_size=768,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [
-                CLIPEncoderLayer(
-                    hidden_size=hidden_size,
-                    num_attention_heads=num_attention_heads,
-                    batch_size=batch_size,
-                    seq_len=seq_len,
-                    causal=causal,
-                    mask_seq=mask_seq,
-                    act_layer=act_layer,
-                )
-                for _ in range(num_hidden_layers)
-            ]
-        )
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        inputs_embeds,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Args:
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Causal mask for the text model. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        encoder_states = () if output_hidden_states else None
-        # all_attentions = () if output_attentions else None
-
-        hidden_states = inputs_embeds
-        for _, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            layer_outputs = encoder_layer(hidden_states)
-            hidden_states = layer_outputs
-
-        return hidden_states
-
-
-class CLIPTextEmbeddings(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        vocab_size=49408,
-        max_position_embeddings=77,
-        dtype="float16",
-    ):
-        super().__init__()
-        embed_dim = hidden_size
-
-        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
-        self.position_embedding = nn.Embedding(
-            shape=[max_position_embeddings, embed_dim], dtype=dtype
-        )
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        inputs_embeds: Optional[Tensor] = None,
-    ) -> Tensor:
-
-        input_shape = ops.size()(input_ids)
-
-        # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-
-        position_ids = ops.reshape()(position_ids, [-1])
-
-        if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
-
-        position_embeddings = ops.batch_gather()(
-            self.position_embedding.tensor(), position_ids
-        )
-
-        embeddings = inputs_embeds + position_embeddings
-
-        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
-
-        return embeddings
-
-
-class CLIPTextTransformer(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
-        self.encoder = CLIPEncoder(
-            num_hidden_layers=num_hidden_layers,
-            hidden_size=hidden_size,
-            num_attention_heads=num_attention_heads,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            causal=causal,
-            mask_seq=mask_seq,
-            act_layer=act_layer,
-        )
-        self.final_layer_norm = nn.LayerNorm(hidden_size)
-
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        position_ids: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Returns:
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        if input_ids is None:
-            raise ValueError("You have to specify either input_ids")
-
-        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
-
-        encoder_outputs = self.encoder(
-            inputs_embeds=hidden_states,
-        )
-
-        last_hidden_state = encoder_outputs
-        last_hidden_state = self.final_layer_norm(last_hidden_state)
-        return last_hidden_state
diff --git a/python/aitemplate/frontend/nn/ldm/embeddings.py b/python/aitemplate/frontend/nn/ldm/embeddings.py
deleted file mode 100644
index 36b96a4fb..000000000
--- a/python/aitemplate/frontend/nn/ldm/embeddings.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import math
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def get_timestep_embedding(
-    timesteps: Tensor,
-    embedding_dim: int,
-    flip_sin_to_cos: bool = False,
-    downscale_freq_shift: float = 1,
-    scale: float = 1,
-    max_period: int = 10000,
-):
-    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
-
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    embeddings. :return: an [N x dim] Tensor of positional embeddings.
-    """
-    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
-
-    half_dim = embedding_dim // 2
-
-    exponent = (-math.log(max_period)) * Tensor(
-        shape=[half_dim], dtype="float16", name="arange"
-    )
-
-    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
-
-    emb = ops.exp(exponent)
-    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
-
-    # scale embeddings
-    emb = scale * emb
-
-    # concat sine and cosine embeddings
-    if flip_sin_to_cos:
-        emb = ops.concatenate()(
-            [ops.cos(emb), ops.sin(emb)],
-            dim=-1,
-        )
-    else:
-        emb = ops.concatenate()(
-            [ops.sin(emb), ops.cos(emb)],
-            dim=-1,
-        )
-    return emb
-
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
-        super().__init__()
-
-        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
-
-    def forward(self, sample):
-        sample = self.linear_1(sample)
-        sample = self.linear_2(sample)
-        return sample
-
-
-class Timesteps(nn.Module):
-    def __init__(
-        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
-    ):
-        super().__init__()
-        self.num_channels = num_channels
-        self.flip_sin_to_cos = flip_sin_to_cos
-        self.downscale_freq_shift = downscale_freq_shift
-
-    def forward(self, timesteps):
-        t_emb = get_timestep_embedding(
-            timesteps,
-            self.num_channels,
-            flip_sin_to_cos=self.flip_sin_to_cos,
-            downscale_freq_shift=self.downscale_freq_shift,
-        )
-        return t_emb
diff --git a/python/aitemplate/frontend/nn/ldm/resnet.py b/python/aitemplate/frontend/nn/ldm/resnet.py
deleted file mode 100644
index 03e4f8023..000000000
--- a/python/aitemplate/frontend/nn/ldm/resnet.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-class Upsample2D(nn.Module):
-    """
-    An upsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 upsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self,
-        channels,
-        use_conv=False,
-        use_conv_transpose=False,
-        out_channels=None,
-        name="conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-
-        conv = None
-        if use_conv_transpose:
-            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.conv = conv
-        else:
-            self.Conv2d_0 = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(x)
-
-        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if self.use_conv:
-            if self.name == "conv":
-                x = self.conv(x)
-            else:
-                x = self.Conv2d_0(x)
-
-        return x
-
-
-class Downsample2D(nn.Module):
-    """
-    A downsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 downsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-
-        if use_conv:
-            conv = nn.Conv2dBias(
-                self.channels, self.out_channels, 3, stride=stride, padding=padding
-            )
-        else:
-            assert self.channels == self.out_channels
-            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.Conv2d_0 = conv
-            self.conv = conv
-        elif name == "Conv2d_0":
-            self.conv = conv
-        else:
-            self.conv = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        x = self.conv(x)
-
-        return x
-
-
-class ResnetBlock2D(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout=0.0,
-        temb_channels=512,
-        groups=32,
-        groups_out=None,
-        pre_norm=True,
-        eps=1e-6,
-        non_linearity="swish",
-        time_embedding_norm="default",
-        kernel=None,
-        output_scale_factor=1.0,
-        use_nin_shortcut=None,
-        up=False,
-        down=False,
-    ):
-        super().__init__()
-        self.pre_norm = pre_norm
-        self.pre_norm = True
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-        self.time_embedding_norm = time_embedding_norm
-        self.up = up
-        self.down = down
-        self.output_scale_factor = output_scale_factor
-
-        if groups_out is None:
-            groups_out = groups
-
-        self.norm1 = nn.GroupNorm(
-            num_groups=groups,
-            num_channels=in_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-
-        self.conv1 = nn.Conv2dBias(
-            in_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        if temb_channels is not None:
-            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
-        else:
-            self.time_emb_proj = None
-
-        self.norm2 = nn.GroupNorm(
-            num_groups=groups_out,
-            num_channels=out_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.conv2 = nn.Conv2dBias(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        self.upsample = self.downsample = None
-
-        self.use_nin_shortcut = (
-            self.in_channels != self.out_channels
-            if use_nin_shortcut is None
-            else use_nin_shortcut
-        )
-
-        if self.use_nin_shortcut:
-            self.conv_shortcut = nn.Conv2dBias(
-                in_channels, out_channels, 1, 1, 0
-            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
-        else:
-            self.conv_shortcut = None
-
-    def forward(self, x, temb=None):
-        hidden_states = x
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm1(
-            hidden_states
-        )  # .float()).type(hidden_states.dtype) # fused swish
-        # hidden_states = self.nonlinearity(hidden_states)
-
-        if self.upsample is not None:
-            x = self.upsample(x)
-            hidden_states = self.upsample(hidden_states)
-        elif self.downsample is not None:
-            x = self.downsample(x)
-            hidden_states = self.downsample(hidden_states)
-
-        hidden_states = self.conv1(hidden_states)
-
-        if temb is not None:
-            temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = get_shape(temb)
-            temb = ops.reshape()(temb, [bs, 1, 1, dim])
-            hidden_states = hidden_states + temb
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm2(hidden_states)
-
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.conv_shortcut is not None:
-            x = self.conv_shortcut(x)
-
-        out = hidden_states + x
-
-        return out
diff --git a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
deleted file mode 100644
index 770156ff9..000000000
--- a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from typing import Optional, Tuple, Union
-
-from aitemplate.frontend import nn
-
-from .embeddings import TimestepEmbedding, Timesteps
-from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
-
-
-class UNet2DConditionModel(nn.Module):
-    r"""
-    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
-    and returns sample shaped output.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
-    implements for all the model (such as downloading or saving, etc.)
-
-    Parameters:
-        sample_size (`int`, *optional*): The size of the input sample.
-        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
-        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
-            The tuple of upsample blocks to use.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
-        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
-    """
-
-    def __init__(
-        self,
-        sample_size: Optional[int] = None,
-        in_channels: int = 4,
-        out_channels: int = 4,
-        center_input_sample: bool = False,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        up_block_types: Tuple[str] = (
-            "UpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-        ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: int = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 1280,
-        attention_head_dim: Union[int, Tuple[int]] = 8,
-    ):
-        super().__init__()
-        self.center_input_sample = center_input_sample
-        self.sample_size = sample_size
-        time_embed_dim = block_out_channels[0] * 4
-
-        # input
-        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
-        # time
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-        timestep_input_dim = block_out_channels[0]
-
-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
-
-        self.down_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
-        # down
-        output_channel = block_out_channels[0]
-        for i, down_block_type in enumerate(down_block_types):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final_block = i == len(block_out_channels) - 1
-
-            down_block = get_down_block(
-                down_block_type,
-                num_layers=layers_per_block,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                temb_channels=time_embed_dim,
-                add_downsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim[i],
-                downsample_padding=downsample_padding,
-            )
-            self.down_blocks.append(down_block)
-
-        # mid
-        self.mid_block = UNetMidBlock2DCrossAttn(
-            in_channels=block_out_channels[-1],
-            temb_channels=time_embed_dim,
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            output_scale_factor=mid_block_scale_factor,
-            resnet_time_scale_shift="default",
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim[-1],
-            resnet_groups=norm_num_groups,
-        )
-
-        # up
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        reversed_attention_head_dim = list(reversed(attention_head_dim))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[
-                min(i + 1, len(block_out_channels) - 1)
-            ]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=layers_per_block + 1,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                prev_output_channel=prev_output_channel,
-                temb_channels=time_embed_dim,
-                add_upsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=reversed_attention_head_dim[i],
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=norm_num_groups,
-            eps=norm_eps,
-            use_swish=True,
-        )
-
-        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
-
-    def forward(
-        self,
-        sample,
-        timesteps,
-        encoder_hidden_states,
-        return_dict: bool = True,
-    ):
-        """r
-        Args:
-            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
-            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is the sample tensor.
-        """
-
-        # 1. time
-        t_emb = self.time_proj(timesteps)
-        emb = self.time_embedding(t_emb)
-
-        # 2. pre-process
-        sample = self.conv_in(sample)
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if (
-                hasattr(downsample_block, "attentions")
-                and downsample_block.attentions is not None
-            ):
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-            down_block_res_samples += res_samples
-
-        # 4. mid
-        sample = self.mid_block(
-            sample, emb, encoder_hidden_states=encoder_hidden_states
-        )
-
-        # 5. up
-        for upsample_block in self.up_blocks:
-            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-            down_block_res_samples = down_block_res_samples[
-                : -len(upsample_block.resnets)
-            ]
-
-            if (
-                hasattr(upsample_block, "attentions")
-                and upsample_block.attentions is not None
-            ):
-                sample = upsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    res_hidden_states_tuple=res_samples,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample = upsample_block(
-                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
-                )
-
-        # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-        return sample
diff --git a/python/aitemplate/frontend/nn/ldm/unet_blocks.py b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
deleted file mode 100644
index 7b6e3e6e6..000000000
--- a/python/aitemplate/frontend/nn/ldm/unet_blocks.py
+++ /dev/null
@@ -1,762 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-# flake8: noqa
-from aitemplate.compiler import ops
-
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-
-from .attention import AttentionBlock
-
-from .clip import SpatialTransformer
-from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
-
-# pylint: disable=W0102
-
-
-def get_down_block(
-    down_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    temb_channels,
-    add_downsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-    downsample_padding=None,
-):
-    down_block_type = (
-        down_block_type[7:]
-        if down_block_type.startswith("UNetRes")
-        else down_block_type
-    )
-    if down_block_type == "DownBlock2D":
-        return DownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnDownBlock2D":
-        return AttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "CrossAttnDownBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
-            )
-        return CrossAttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "SkipDownBlock2D":
-        return SkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnSkipDownBlock2D":
-        return AttnSkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "DownEncoderBlock2D":
-        return DownEncoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-
-
-def get_up_block(
-    up_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    prev_output_channel,
-    temb_channels,
-    add_upsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-):
-    up_block_type = (
-        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
-    )
-    if up_block_type == "UpBlock2D":
-        return UpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "CrossAttnUpBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
-            )
-        return CrossAttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "AttnUpBlock2D":
-        return AttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "SkipUpBlock2D":
-        return SkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "AttnSkipUpBlock2D":
-        return AttnSkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "UpDecoderBlock2D":
-        return UpDecoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    raise ValueError(f"{up_block_type} does not exist.")
-
-
-class UNetMidBlock2DCrossAttn(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        cross_attention_dim=1280,
-        **kwargs,
-    ):
-        super().__init__()
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                SpatialTransformer(
-                    in_channels,
-                    attn_num_head_channels,
-                    in_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states, encoder_hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
-
-
-class CrossAttnDownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_downsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        output_states = ()
-
-        for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class DownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_downsample=True,
-        downsample_padding=1,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None):
-        output_states = ()
-
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class CrossAttnUpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        prev_output_channel: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_upsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(
-        self,
-        hidden_states,
-        res_hidden_states_tuple,
-        temb=None,
-        encoder_hidden_states=None,
-    ):
-        for resnet, attn in zip(self.resnets, self.attentions):
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb=temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        prev_output_channel: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
-        for resnet in self.resnets:
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpDecoderBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            input_channels = in_channels if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=input_channels,
-                    out_channels=out_channels,
-                    temb_channels=None,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states):
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UNetMidBlock2D(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        **kwargs,
-    ):
-        super().__init__()
-
-        if attention_type != "default":
-            raise NotImplementedError(
-                f"attention_type must be default! current value: {attention_type}"
-            )
-
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                AttentionBlock(
-                    batch_size,
-                    height,
-                    width,
-                    in_channels,
-                    num_head_channels=attn_num_head_channels,
-                    rescale_output_factor=output_scale_factor,
-                    eps=resnet_eps,
-                    num_groups=resnet_groups,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
diff --git a/python/aitemplate/frontend/nn/ldm/vae.py b/python/aitemplate/frontend/nn/ldm/vae.py
deleted file mode 100644
index 1cd25aa19..000000000
--- a/python/aitemplate/frontend/nn/ldm/vae.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""
-Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
-"""
-
-from typing import Tuple
-
-from aitemplate.frontend import nn, Tensor
-
-from .unet_blocks import get_up_block, UNetMidBlock2D
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels=3,
-        out_channels=3,
-        up_block_types=("UpDecoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        act_fn="silu",
-    ):
-        super().__init__()
-        self.layers_per_block = layers_per_block
-
-        self.conv_in = nn.Conv2dBias(
-            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
-        )
-
-        # mid
-        self.mid_block = UNetMidBlock2D(
-            batch_size,
-            height,
-            width,
-            in_channels=block_out_channels[-1],
-            resnet_eps=1e-6,
-            resnet_act_fn=act_fn,
-            output_scale_factor=1,
-            resnet_time_scale_shift="default",
-            attn_num_head_channels=None,
-            resnet_groups=32,
-            temb_channels=None,
-        )
-
-        # up
-        self.up_blocks = nn.ModuleList([])
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=self.layers_per_block + 1,
-                in_channels=prev_output_channel,
-                out_channels=output_channel,
-                prev_output_channel=None,
-                add_upsample=not is_final_block,
-                resnet_eps=1e-6,
-                resnet_act_fn=act_fn,
-                attn_num_head_channels=None,
-                temb_channels=None,
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        num_groups_out = 32
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=num_groups_out,
-            eps=1e-6,
-            use_swish=True,
-        )
-        self.conv_out = nn.Conv2dBias(
-            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
-        )
-
-    def forward(self, z) -> Tensor:
-        sample = z
-        sample = self.conv_in(sample)
-
-        # middle
-        sample = self.mid_block(sample)
-
-        # up
-        for up_block in self.up_blocks:
-            sample = up_block(sample)
-
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-
-        return sample
-
-
-class AutoencoderKL(nn.Module):
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        in_channels: int = 3,
-        out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
-        layers_per_block: int = 1,
-        act_fn: str = "silu",
-        latent_channels: int = 4,
-        sample_size: int = 32,
-    ):
-        super().__init__()
-        self.decoder = Decoder(
-            batch_size,
-            height,
-            width,
-            in_channels=latent_channels,
-            out_channels=out_channels,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-        )
-        self.post_quant_conv = nn.Conv2dBias(
-            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def decode(self, z: Tensor, return_dict: bool = True):
-
-        z = self.post_quant_conv(z)
-        dec = self.decoder(z)
-        return dec
-
-    def forward(self):
-        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")

From ff84ae5492d9eb459a1e86b37a126e95ef838e22 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 23 Mar 2023 03:17:01 -0700
Subject: [PATCH 324/638] Split test_attention (#473)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/473

The test functions in `test_attention.py` are given `_sm80`, `_bf16`, or `_rocm` suffixes and split with the `filter_test_cases_by_test_env` at the class level.

Reviewed By: chenyang78

Differential Revision: D44317162

fbshipit-source-id: 8d61c37485f988c732e76ef9dcaf280e945d2658
---
 tests/unittest/ops/test_attention.py | 35 +++++++++++-----------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 64c37fa67..5a4bf54e0 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -27,7 +27,10 @@
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import benchmark_pt, detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 from einops import rearrange, repeat
@@ -144,10 +147,6 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-@unittest.skipIf(
-    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-    "Not supported by CUDA < SM80.",
-)
 class AttentionTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -292,8 +291,7 @@ def _test_flash_attention(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_flash_attention(self):
+    def test_flash_attention_sm80(self):
         self._test_flash_attention(
             test_name="flash_attention_fp16",
             dtype="float16",
@@ -395,8 +393,7 @@ def _test_attention(
             )
             _LOGGER.info(f"benchmark compiler model time: {time_per_iter_ms}")
 
-    @unittest.skipIf(detect_target().name() == "cuda", "Not supported by CUDA.")
-    def test_rocm_attention(self):
+    def test_attention_rocm(self):
         self._test_attention(
             test_name="attention_fp16",
             dtype="float16",
@@ -564,8 +561,7 @@ def _test_mem_eff_attention(
                 f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
             )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_mem_eff_attention_fp16(self):
+    def test_mem_eff_attention_fp16_sm80(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
                 use_perm=use_perm,
@@ -593,9 +589,8 @@ def test_mem_eff_attention_fp16(self):
             # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
             # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     @unittest.expectedFailure
-    def test_mem_eff_attention_invalid_head_size_fp16(self):
+    def test_mem_eff_attention_invalid_head_size_fp16_sm80(self):
         self._test_mem_eff_attention(
             batch_size=16,
             nheads=8,
@@ -605,8 +600,7 @@ def test_mem_eff_attention_invalid_head_size_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_mem_eff_attention_fp32(self):
+    def test_mem_eff_attention_fp32_sm80(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
                 use_perm=use_perm,
@@ -620,7 +614,6 @@ def test_mem_eff_attention_fp32(self):
                 dtype="float32",
             )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_mem_eff_attention_bf16(self):
         for use_perm in [False, True]:
             self._test_mem_eff_attention(
@@ -755,8 +748,7 @@ def _test_cross_attention(
 
         torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_cross_attention_fp16(self):
+    def test_cross_attention_fp16_sm80(self):
         self._test_cross_attention(
             test_name="cross_attention_fp16",
             dtype="float16",
@@ -770,8 +762,7 @@ def test_cross_attention_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_cross_attention_fp32(self):
+    def test_cross_attention_fp32_sm80(self):
         self._test_cross_attention(
             test_name="cross_attention_fp32",
             dtype="float32",
@@ -785,7 +776,6 @@ def test_cross_attention_fp32(self):
             dtype="float32",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_cross_attention_bf16(self):
         self._test_cross_attention(
             test_name="cross_attention_bf16",
@@ -805,5 +795,8 @@ def test_cross_attention_bf16(self):
         )
 
 
+filter_test_cases_by_test_env(AttentionTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()

From 484d4558d8d1a5320c4ed8e5ad0f74ba51a89d98 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 23 Mar 2023 21:34:07 +0800
Subject: [PATCH 325/638] format code

---
 python/aitemplate/backend/codegen.py          |  6 +-----
 python/aitemplate/compiler/model.py           | 21 +++----------------
 .../test_strided_layernorm_benchmark.py       |  5 +----
 tests/unittest/ops/test_efficient_nms.py      |  4 +++-
 tests/unittest/ops/test_nms.py                |  4 +++-
 5 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 4748ff68e..c44435af9 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -374,11 +374,7 @@ def __init__(
         self.graph = graph
 
         self.num_inputs, self.num_outputs = count_inputs_outputs(graph)
-        (
-            self.max_blob_size,
-            self.max_constant_blob_size,
-            self.workspace,
-        ) = (
+        (self.max_blob_size, self.max_constant_blob_size, self.workspace,) = (
             max_blob_size,
             max_constant_blob_size,
             workspace,
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 9a9de0190..420a567c8 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -367,12 +367,7 @@ def _run_impl(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (
-            c_inputs,
-            c_outputs,
-            c_stream,
-            c_output_shapes_out,
-        ) = self._prepare_run(
+        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
@@ -456,12 +451,7 @@ def profile(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (
-            c_inputs,
-            c_outputs,
-            c_stream,
-            c_output_shapes_out,
-        ) = self._prepare_run(
+        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
@@ -617,12 +607,7 @@ def benchmark(
             inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
         if isinstance(outputs, dict):
             outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
-        (
-            c_inputs,
-            c_outputs,
-            c_stream,
-            c_output_shapes_out,
-        ) = self._prepare_run(
+        (c_inputs, c_outputs, c_stream, c_output_shapes_out,) = self._prepare_run(
             inputs,
             outputs,
             stream_ptr,
diff --git a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
index b7c32c2be..18d751a07 100644
--- a/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
+++ b/tests/unittest/benchmark/test_strided_layernorm_benchmark.py
@@ -34,10 +34,7 @@ def __init__(self, *args, **kwargs):
 
     @unittest.skipIf(detect_target().in_ci_env(), "don't run benchmark in CI")
     def test_benchmark(self):
-        for (
-            input_nonbatch_shape,
-            (start_indices, end_indices),
-        ) in itertools.product(
+        for (input_nonbatch_shape, (start_indices, end_indices),) in itertools.product(
             ((2048, 256), (2048, 512), (2048, 1024), (2048, 2048)),
             (((0, 0, 4), (None, None, 224)), ((0, 0, 3), (None, None, 223))),
         ):
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index dbcf514ce..d2625a8d7 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -188,7 +188,9 @@ def _test_nms(
             ref_box = boxes_pt[keep].cpu()
         else:
             ref_box = torch.zeros(nmsMaxOut, 4)
-            ref_box[: keep.shape[0],] = boxes_pt[keep].cpu()
+            ref_box[
+                : keep.shape[0],
+            ] = boxes_pt[keep].cpu()
         ref_box = ref_box.cuda().to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 1, 4)).copy()
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index b2294c5db..430967af3 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -144,7 +144,9 @@ def _test_nms(
             ref_box = boxes[keep]
         else:
             ref_box = torch.zeros(nmsMaxOut, 4)
-            ref_box[: keep.shape[0],] = boxes[keep]
+            ref_box[
+                : keep.shape[0],
+            ] = boxes[keep]
         ref_box = ref_box.to(dtype=torch_dtype)
 
         x = boxes.reshape((1, N, 4)).contiguous()

From 4aa59f80c3ea1d6afba727568291797933ea5329 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 23 Mar 2023 09:41:33 -0700
Subject: [PATCH 326/638] Fix flaky tests 2 (#477)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/477

Fix one flaky test:

1. In `test_strided_view_cat`, fix the seed and set wider tolerance bound for bfloat16.

Reviewed By: tenpercent

Differential Revision: D44333841

fbshipit-source-id: 65b5b2243556a9e33b78301f51cb6473ed2b33b1
---
 .../compiler/test_strided_view_cat.py         | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index b585e0771..a4c9f942c 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -31,11 +31,22 @@
 from parameterized import param, parameterized
 
 
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-2, "rtol": 1e-2},
+    "float32": {"atol": 1e-2, "rtol": 1e-2},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
+
+
 def custom_name_func(testcase_func, param_num, param):
     return f"{testcase_func.__name__}_{param_num}_{param.args[0]}"
 
 
 class StridedViewCatOpTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     @parameterized.expand(
         [
             param(
@@ -243,10 +254,7 @@ def _test_strided_gemm_view_cat_fusible(
             )
 
             # Do comparisons.
-            self.assertTrue(
-                torch.allclose(z, z_pt, atol=1e-2, rtol=1e-2),
-                f"batch_size: {batch_size}, z: {z}, z_pt: {z_pt}, input5_pt: {input5_pt}",
-            )
+            torch.testing.assert_close(z, z_pt, **_TOLERANCE_LIMITS[dtype])
 
     def _test_strided_layernorm_view_cat_fusible(self, dtype="float16"):
         def _create_layernorm_sigmoid_mul(
@@ -351,10 +359,7 @@ def _create_layernorm_sigmoid_mul(
 
             # Do comparisons.
             for x, x_pt in zip(z, z_pt):
-                self.assertTrue(
-                    torch.allclose(x, x_pt, atol=1e-2, rtol=1e-2),
-                    f"batch_size: {batch_size}, z: {z}, z_pt: {z_pt}",
-                )
+                torch.testing.assert_close(x, x_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
         filter_test_cases_by_params(

From 61a3587a7a4a92bd9271586cac8c7c369a8075e1 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 23 Mar 2023 11:09:22 -0700
Subject: [PATCH 327/638] Fix map_clip_params in 05_stable_diffusion example
 (#442)

Summary:
Fixes:
- Remove unnecessary code lines around `params_pt`.
- The block which adds `encoder_layers_%d_self_attn_cu_length` to `params_ait` was accidentally added under "for loop" which maps `pt_params` to `params_ait`. This PR moves the block out from the for loop.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/442

Reviewed By: wushirong

Differential Revision: D44331237

Pulled By: aakhundov

fbshipit-source-id: 00b9873bdf0e4bc3185310d4dfb28ff53b86162b
---
 .../src/compile_lib/compile_clip.py             | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index cfda48607..9f68e827a 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -23,14 +23,7 @@
 
 
 def map_clip_params(pt_mod, batch_size, seqlen, depth):
-
-    params_pt = list(pt_mod.named_parameters())
-
     params_ait = {}
-    pt_params = {}
-    for key, arr in params_pt:
-        pt_params[key.replace("text_model.", "")] = arr
-
     pt_params = dict(pt_mod.named_parameters())
     for key, arr in pt_params.items():
         name = key.replace("text_model.", "")
@@ -67,11 +60,11 @@ def map_clip_params(pt_mod, batch_size, seqlen, depth):
             continue
         params_ait[ait_name] = arr
 
-        if detect_target().name() == "cuda":
-            for i in range(depth):
-                prefix = "encoder_layers_%d_self_attn_cu_length" % (i)
-                cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
-                params_ait[prefix] = torch.from_numpy(cu_len).cuda()
+    if detect_target().name() == "cuda":
+        for i in range(depth):
+            prefix = f"encoder_layers_{i}_self_attn_cu_length"
+            cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
+            params_ait[prefix] = torch.from_numpy(cu_len).cuda()
 
     return params_ait
 

From 3aa0b6c98be728ec9a7b0476876138ed25318e49 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Thu, 23 Mar 2023 12:50:45 -0700
Subject: [PATCH 328/638] propagate graph optimization setting from lowering
 context to model compilation (#475)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/475

it also turns out we need to split large memory ops to make the model compilable (otherwise cuda limit for stack args is exceeded)

Reviewed By: chenyang78

Differential Revision: D44322911

fbshipit-source-id: 3cee492253e04c9a12680880946d802c1231840a
---
 fx2ait/fx2ait/fx2ait.py                                | 3 +++
 python/aitemplate/compiler/compiler.py                 | 5 ++++-
 python/aitemplate/compiler/transform/optimize_graph.py | 3 +++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index a0fb2224f..e70a6c3f8 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -66,6 +66,7 @@ def __init__(
         load_ait_dir: Optional[str] = None,
         remote_cache_file_path: Optional[str] = None,
         save_remote_cache: Optional[bool] = False,
+        do_optimize_graph: bool = True,
     ):
         """
         Args:
@@ -122,6 +123,7 @@ def __init__(
         self.dump_ait_dir = dump_ait_dir
         self.keep_constants = keep_constants
         self.load_ait_dir = load_ait_dir
+        self.do_optimize_graph = do_optimize_graph
 
     def _create_target(self):
         """Detect GPU target"""
@@ -208,6 +210,7 @@ def run(self) -> AITInterpreterResult:
             "dynamic_profiling_strategy": self.dynamic_profile_strategy,
             "dll_name": self.dll_name,
             "profile_dir": profile_dir,
+            "do_optimize_graph": self.do_optimize_graph,
         }
         if self.dump_ait_dir:
             dump_ait_path = os.path.join(self.dump_ait_dir, self.name + ".py")
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 215068c71..3c4038e1b 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -156,6 +156,7 @@ def compile_model(
     constants: Optional[Dict[str, TorchTensor]] = None,
     allocator_kind: Optional[AITemplateAllocatorKind] = None,
     debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
+    do_optimize_graph: bool = True,
 ) -> Model:
     """Compiles a model and generates a .so file.
 
@@ -236,7 +237,9 @@ def compile_model(
             )
 
             start_t = datetime.now()
-            graph = compiler.transform.optimize_graph(graph, test_dir)
+            graph = compiler.transform.optimize_graph(
+                graph, test_dir, optimize=do_optimize_graph
+            )
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "optimize_graph")
             _LOGGER.info(f"optimized graph elapsed time: {elapsed_dt_sec(start_t)}")
 
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 3b8ab2468..45061a40c 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -120,6 +120,9 @@ def optimize_graph(
         funcs = [
             process_singleton_elementwise,
             apply_padding,
+            split_large_slice_scatter_ops,
+            split_large_concat_ops,
+            split_large_split_ops,
         ]
 
     for i, func in enumerate(funcs):

From 88854fb69523386f541330ae28f4ab6f9ec99313 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Thu, 23 Mar 2023 19:37:44 -0700
Subject: [PATCH 329/638] Accept scalar shape in full op (#480)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/480

Full op's front-end was missing validation and some flexibility around the arguments. E.g., `torch.Tensor.new_ones` happens to accept int as a size, and so do we now.

Reviewed By: aakhundov

Differential Revision: D44351353

fbshipit-source-id: 4993303df887ba5b023da440cbea3b83a50ca5d6
---
 python/aitemplate/compiler/ops/tensor/full.py | 14 ++++++++++++--
 tests/unittest/ops/test_full.py               | 15 ++++++++++-----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/full.py b/python/aitemplate/compiler/ops/tensor/full.py
index e88a19c4a..774fa054a 100644
--- a/python/aitemplate/compiler/ops/tensor/full.py
+++ b/python/aitemplate/compiler/ops/tensor/full.py
@@ -26,8 +26,8 @@ class full(Operator):
     with the specified `fill_value` (float scalar).
 
     Args:
-        shape (List[IntVar]): the shape of the output Tensor.
-        fill_Value (float): the value to fill the output Tensor with.
+        shape (int or IntVar or List[IntVar]): the shape of the output Tensor.
+        fill_value (int or float): the value to fill the output Tensor with.
         dtype (str): the dtype of the output Tensor.
 
     Returns:
@@ -46,6 +46,16 @@ def __call__(
         fill_value: float,
         dtype: str = "float16",
     ) -> Tensor:
+        if isinstance(shape, (int, IntVar)):
+            shape = [shape]
+        if not isinstance(shape, (list, tuple)):
+            raise TypeError(f"shape must be List[IntVar], but got {shape}.")
+        shape = list(shape)
+
+        if not isinstance(fill_value, (int, float)):
+            raise TypeError(f"fill_value must be a scalar, but got {fill_value}.")
+        fill_value = float(fill_value)
+
         self._attrs["inputs"] = []
         self._attrs["fill_value"] = fill_value
 
diff --git a/tests/unittest/ops/test_full.py b/tests/unittest/ops/test_full.py
index 2064d35e0..356e49b81 100644
--- a/tests/unittest/ops/test_full.py
+++ b/tests/unittest/ops/test_full.py
@@ -36,6 +36,12 @@ def _test_full(
         dtype="float16",
         test_name="full",
     ) -> None:
+        Y = ops.full()(shape, fill_value, dtype)
+        Y._attrs["name"] = "Y"
+
+        if not isinstance(shape, list):
+            shape = [shape]
+
         X = Tensor(
             shape=shape,
             name="X",
@@ -43,9 +49,6 @@ def _test_full(
             is_input=True,
         )
 
-        Y = ops.full()(X.shape(), fill_value, dtype)
-        Y._attrs["name"] = "Y"
-
         Z = ops.elementwise(FuncEnum.ADD)(X, Y)
         Z._attrs["name"] = "Z"
         Z._attrs["is_output"] = True
@@ -73,8 +76,10 @@ def _test_full(
             param(1, [1], 1, "float16"),
             param(2, [10, 20, 30], 3.14, "float16"),
             param(3, [IntVar([10, 20]), 30], 0, "float16"),
-            param(4, [20, 30], 2.71, "float32"),
-            param(5, [IntVar([1, 128]), 10], -1.23, "float32"),
+            param(4, 123, -5, "float16"),
+            param(5, [20, 30], 2.71, "float32"),
+            param(6, [IntVar([1, 128]), 10], -1.23, "float32"),
+            param(7, IntVar([1, 128]), 1234, "float32"),
         ]
     )
     def test_full(self, i, shape, fill_value, dtype):

From 8d8a87db2d5e2760a8bcbffc3c61474ff7bf8537 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Fri, 24 Mar 2023 04:31:24 -0700
Subject: [PATCH 330/638] Use absolute imports in aitemplate pkg (#479)

Summary:
Couple weeks ago it was decided to use absolute imports in aitemplate package. - [PR354](https://github.com/facebookincubator/AITemplate/pull/354)

This PR fixes recently added files to use absolute imports too.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/479

Reviewed By: ipiszy

Differential Revision: D44351089

Pulled By: aakhundov

fbshipit-source-id: 2ce6c55697abc0212a62bcd26241c7d8492dc1df
---
 python/aitemplate/backend/cuda/b2b_bmm/__init__.py    |  2 +-
 .../aitemplate/compiler/ops/gemm_universal/bmm_xxx.py |  6 +++---
 .../compiler/ops/gemm_universal/bmm_xxx_add.py        | 11 ++++++-----
 .../compiler/ops/gemm_universal/group_gemm_rcr.py     |  2 +-
 .../compiler/ops/tensor/slice_reshape_scatter.py      |  2 +-
 .../compiler/transform/fuse_conv_elementwise.py       |  2 +-
 .../aitemplate/compiler/transform/fuse_group_ops.py   |  2 +-
 python/aitemplate/compiler/transform/fuse_split.py    |  2 +-
 .../transform/transform_permute_to_reshape.py         | 11 ++++++-----
 .../compiler/transform/transform_special_ops.py       |  2 +-
 .../transform/transform_strided_op_and_view_op.py     |  2 +-
 11 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
index 121c3ffc4..5e0c9d41f 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
@@ -18,4 +18,4 @@
 b2b bmm module init
 """
 
-from . import classic_b2b_bmm
+from aitemplate.backend.cuda.b2b_bmm import classic_b2b_bmm
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
index 453aaa0ee..6f0caace0 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx.py
@@ -13,9 +13,9 @@
 #  limitations under the License.
 #
 
-from ...base import Tensor
-from . import gemm_common as common
-from .bmm import bmm
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal import gemm_common as common
+from aitemplate.compiler.ops.gemm_universal.bmm import bmm
 
 
 class bmm_xxx(bmm):
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py
index 00ae2a7ee..8d311b6e7 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_xxx_add.py
@@ -14,11 +14,11 @@
 #
 
 
-from aitemplate.compiler.tensor_accessor import TensorAccessor
-
-from ...base import Tensor
-from .bmm import is_valid_inputs as bmm_is_valid_inputs
-from .bmm_xxx import (
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.gemm_universal.bmm import (
+    is_valid_inputs as bmm_is_valid_inputs,
+)
+from aitemplate.compiler.ops.gemm_universal.bmm_xxx import (
     bmm_ccc,
     bmm_ccr,
     bmm_crc,
@@ -29,6 +29,7 @@
     bmm_rrr,
     bmm_xxx,
 )
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 
 class bmm_xxx_add(bmm_xxx):
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
index 28f5cc6d7..c755610b6 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
@@ -166,7 +166,7 @@ def _concat_strided_outputs(self, outputs, output_stride_dim):
             offset += output_tensor._attrs["shape"][output_stride_dim]._attrs["values"][
                 0
             ]
-            from ...transform import transform_utils
+            from aitemplate.compiler.transform import transform_utils
 
             transform_utils.remove_tensor_from_sorted_graph(output_tensor)
         return cat_output
diff --git a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
index 8e54ea182..a109084d9 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py
@@ -80,7 +80,7 @@ def is_valid(cat_op: Operator, reshape_op: Operator, cat_op_2: Operator) -> bool
         )
 
     def _update_inputs_outputs(self, cat_op, reshape_op, cat_op_2):
-        from ...transform import transform_utils
+        from aitemplate.compiler.transform import transform_utils
 
         idx = -1
         for i, input_tensor in enumerate(cat_op_2._attrs["inputs"]):
diff --git a/python/aitemplate/compiler/transform/fuse_conv_elementwise.py b/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
index 9700cc9e8..9c192bc92 100644
--- a/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
+++ b/python/aitemplate/compiler/transform/fuse_conv_elementwise.py
@@ -61,7 +61,7 @@ def fuse_conv_elementwise(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
     for func in funcs:
         sorted_graph = func(sorted_graph)
 
-    from ...backend.target import Target
+    from aitemplate.backend.target import Target
 
     if Target.current().name() == "cuda":
         funcs = [
diff --git a/python/aitemplate/compiler/transform/fuse_group_ops.py b/python/aitemplate/compiler/transform/fuse_group_ops.py
index 48ff13cc3..b31abedd7 100644
--- a/python/aitemplate/compiler/transform/fuse_group_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_group_ops.py
@@ -753,7 +753,7 @@ def fuse_group_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tens
     """
     # gemms need to be fused first
     # TODO: enable after adding heuristics and fixing dynamic shapes
-    from ...backend.target import Target
+    from aitemplate.backend.target import Target
 
     if Target.current().name() == "cuda":
         if "fuse_group_gemm" in Target.current()._kwargs:
diff --git a/python/aitemplate/compiler/transform/fuse_split.py b/python/aitemplate/compiler/transform/fuse_split.py
index e81c0eed9..a2daf99ac 100644
--- a/python/aitemplate/compiler/transform/fuse_split.py
+++ b/python/aitemplate/compiler/transform/fuse_split.py
@@ -138,7 +138,7 @@ def _valid_input(input_tensor):
 
 
 def _is_supported_op(op_type: str):
-    from ...backend.target import Target
+    from aitemplate.backend.target import Target
 
     if Target.current().name() == "rocm":
         return op_type == "bmm_softmax_bmm_permute"
diff --git a/python/aitemplate/compiler/transform/transform_permute_to_reshape.py b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
index 2c65bc677..67bb456e4 100644
--- a/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
+++ b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
@@ -17,11 +17,12 @@
 """
 from typing import List
 
-from ...utils import graph_utils
-from ..base import IntImm, Operator, Tensor
-from ..ops import reshape
-from . import transform_utils
-from .toposort import toposort
+from aitemplate.compiler.base import IntImm, Operator, Tensor
+from aitemplate.compiler.ops import reshape
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+
+from aitemplate.utils import graph_utils
 
 
 def _check_permute_to_reshape(op: Operator) -> bool:
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index cf1f6ffa8..81ee5b1be 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -292,7 +292,7 @@ def transform_special_ops(
         _transform_1x1_conv_gemm_rcr,
     ]
 
-    from ...backend.target import Target
+    from aitemplate.backend.target import Target
 
     if "transform_conv_to_gemm" in Target.current()._kwargs:
         if Target.current()._kwargs["transform_conv_to_gemm"]:
diff --git a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
index 3d2604582..ec6f533b0 100644
--- a/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
+++ b/python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py
@@ -29,7 +29,7 @@
 
 
 def _is_supported_strided_op(op: Operator) -> bool:
-    from ...backend.target import Target
+    from aitemplate.backend.target import Target
 
     op_kind = op._attrs["op"]
     if Target.current().name() == "rocm":

From f4da5c36437a6a88690376880b10e7ffd4288ccb Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Fri, 24 Mar 2023 11:51:29 -0700
Subject: [PATCH 331/638] Refactor frontend.nn.Linear forward func (#481)

Summary:
Refactor frontend.nn.Linear forward func

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/481

Reviewed By: wushirong

Differential Revision: D44368169

Pulled By: aakhundov

fbshipit-source-id: b222c67d1022852c5e5fca62854ab9ea37bcf800
---
 python/aitemplate/frontend/nn/linear.py | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/python/aitemplate/frontend/nn/linear.py b/python/aitemplate/frontend/nn/linear.py
index 61740b281..a6a6e1793 100644
--- a/python/aitemplate/frontend/nn/linear.py
+++ b/python/aitemplate/frontend/nn/linear.py
@@ -88,19 +88,12 @@ def __init__(
     def forward(self, *args):
         assert len(args) >= 1
         x = args[0]
-        if not self.USE_CUDA:
-            shape = x._attrs["shape"]
-            x = x if len(shape) == 2 else ops.reshape()(x, [-1, self.in_channels])
+        if not self.USE_CUDA and len(x._attrs["shape"]) != 2:
+            x = ops.reshape()(x, [-1, self.in_channels])
+        inputs = [x, self.weight.tensor()]
+        if self.use_bias:
+            inputs.append(self.bias.tensor())
         if len(args) == 2:
-            if self.use_bias:
-                inputs = [x, self.weight.tensor(), self.bias.tensor(), args[1]]
-            else:
-                inputs = [x, self.weight.tensor(), args[1]]
-            output = self.op(*inputs)
-            return output
-        output = (
-            self.op(x, self.weight.tensor(), bias=self.bias.tensor())
-            if self.use_bias
-            else self.op(x, self.weight.tensor())
-        )
+            inputs.append(args[1])
+        output = self.op(*inputs)
         return output

From 2d6dc0122771b78165b00f9062c0a77b774aaedf Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Fri, 24 Mar 2023 17:04:35 -0700
Subject: [PATCH 332/638] Make fill op read dtype from input/kwarg (#485)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/485

Make fill op read dtype from input/kwarg rather than hard code.

Reviewed By: tissue3

Differential Revision: D44375547

fbshipit-source-id: a965bafc517afc81591052e355fd34062b028a89
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py        |  1 -
 fx2ait/fx2ait/converters/ait_converters.py | 24 ++++++++++++++++------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index ac55f8c65..26a4cdd10 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -3322,7 +3322,6 @@ def ones_like(*, input, dtype=None, device=None):
 )
 @register_acc_op
 def new_zeros(*, input, size, dtype=None, device=None, requires_grad=False):
-    assert requires_grad is False, f"requires_grad != False, it is {requires_grad}"
     return input.new_zeros(size, dtype=dtype, device=device)
 
 
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index f068c247a..c01613ef4 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1670,9 +1670,13 @@ def acc_ops_neg(
 def acc_ops_new_full(
     target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
 ) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
+    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
     fill_value = kwargs["fill_value"]
-    return full()(size, fill_value=fill_value, dtype="float16")
+    return full()(size, fill_value=fill_value, dtype=dtype)
 
 
 @ait_converter(acc_ops.full_like)
@@ -1683,15 +1687,19 @@ def acc_ops_full_like(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     fill_value = kwargs["fill_value"]
-    return full()(input_val.shape(), fill_value=fill_value, dtype="float16")
+    return full()(input_val.shape(), fill_value=fill_value, dtype=input_val.dtype())
 
 
 @ait_converter(acc_ops.new_ones)
 def acc_ops_new_ones(
     target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
 ) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
-    return full()(size, 1, dtype="float16")
+    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
+    return full()(size, 1, dtype=dtype)
 
 
 @ait_converter(acc_ops.ones_like)
@@ -1701,15 +1709,19 @@ def acc_ops_ones_like(
     input_val = kwargs["input"]
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
-    return full()(input_val.shape(), 1, dtype="float16")
+    return full()(input_val.shape(), 1, dtype=input_val.dtype())
 
 
 @ait_converter(acc_ops.new_zeros)
 def acc_ops_new_zeros(
     target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
 ) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
-    return full()(size, 0, dtype="float16")
+    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
+    return full()(size, 0, dtype=dtype)
 
 
 @ait_converter(acc_ops.zeros_like)
@@ -1719,4 +1731,4 @@ def acc_ops_zeros_like(
     input_val = kwargs["input"]
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
-    return full()(input_val.shape(), 0, dtype="float16")
+    return full()(input_val.shape(), 0, dtype=input_val.dtype())

From c3b50dfe3b04495ddb4a0f0777ee84b89cf3237d Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Fri, 24 Mar 2023 22:57:37 -0700
Subject: [PATCH 333/638] added slice + bmm fusion (#489)

Summary:
This PR added support to fuse slice and all bmm ops.

It also fixed a couple of latent issues:

* correctly check alignment requirement for fuse slice and gemm
* correctly check the tensor accessor for the bias

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/489

Reviewed By: ipiszy

Differential Revision: D44384414

Pulled By: chenyang78

fbshipit-source-id: 4f0d4a816d5e09790c5f561c59693c0739ec7e89
---
 .../backend/cuda/gemm_universal/bmm_common.py |   3 +-
 .../transform/transform_strided_slice.py      |  63 ++-
 .../compiler/test_slice_bmm_fusion.py         | 530 ++++++++++++++++++
 3 files changed, 575 insertions(+), 21 deletions(-)
 create mode 100644 tests/unittest/compiler/test_slice_bmm_fusion.py

diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index b88c8c395..96b826032 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -300,8 +300,9 @@ def make_function_strided_args(
         if has_bias:
             # FIXME: we don't suppor strided bias yet. Will enable it once
             # we support it.
+            input_bias_accessor = func_attrs["input_accessors"][2]
             assert (
-                not input_b_accessor.is_from_strided_tensor
+                not input_bias_accessor.is_from_strided_tensor
             ), f'strided bias is not supported for op {func_attrs["name"]}'
 
     input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
diff --git a/python/aitemplate/compiler/transform/transform_strided_slice.py b/python/aitemplate/compiler/transform/transform_strided_slice.py
index b1a2c888b..f9c69b6d0 100644
--- a/python/aitemplate/compiler/transform/transform_strided_slice.py
+++ b/python/aitemplate/compiler/transform/transform_strided_slice.py
@@ -23,24 +23,23 @@
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice, MAX_INT32
 from aitemplate.compiler.transform import transform_strided_ops_utils, transform_utils
 
-from aitemplate.utils import graph_utils
+from aitemplate.utils import alignment as utils_alignment, graph_utils, shape_utils
 
 
-def _is_supported_gemm(gemm_op: Operator, slice_op: Operator) -> bool:
+def _is_supported_gemm_or_bmm(gemm_or_bmm_op: Operator, slice_op: Operator) -> bool:
+    if not gemm_or_bmm_op._attrs["op"].startswith(("gemm_rcr", "bmm")):
+        return False
     slice_output_tensor = slice_op._attrs["outputs"][0]
     slice_output_rank = slice_output_tensor._rank()
-    # TODO: support other gemm kinds
-    if gemm_op._attrs["op"].startswith("gemm_rcr"):
-        # TODO: support cases where slice_input_tensor is used by non-A/B
-        # matrices, e.g. bias/d1/d2 in gemm_rcr_bias_add_add
-        gemm_inputs = gemm_op._attrs["inputs"]
-        if (
-            gemm_inputs[0] is not slice_output_tensor
-            and gemm_inputs[1] is not slice_output_tensor
-        ):
-            return False
-        return slice_output_rank >= 2
-    return False
+    # TODO: support cases where slice_input_tensor is used by non-A/B
+    # matrices, e.g. bias/d1/d2 in gemm_rcr_bias_add_add
+    op_inputs = gemm_or_bmm_op._attrs["inputs"]
+    if (
+        op_inputs[0] is not slice_output_tensor
+        and op_inputs[1] is not slice_output_tensor
+    ):
+        return False
+    return slice_output_rank >= 2
 
 
 def _sanity_check_concatenate(concat_op: Operator, slice_op: Operator) -> bool:
@@ -61,8 +60,8 @@ def _sanity_check_concatenate(concat_op: Operator, slice_op: Operator) -> bool:
 
 def _is_supported_op(op: Operator, slice_op: Operator) -> bool:
     op_type = op._attrs["op"]
-    if op_type.startswith("gemm"):
-        return _is_supported_gemm(op, slice_op)
+    if op_type.startswith(("bmm", "gemm")):
+        return _is_supported_gemm_or_bmm(op, slice_op)
     if op_type == "concatenate":
         return _sanity_check_concatenate(op, slice_op)
     if op_type == "fused_elementwise" or op_type == "permute021":
@@ -89,6 +88,8 @@ def _is_slice_full_range(dim: IntVar, start_idx: int, end_idx: int) -> bool:
 
 def _valid_alignment(
     op: Operator,
+    slice_dim: int,
+    slice_output_tensor: Tensor,
     slice_input_shape: List[IntVar],
     start_indices: List[int],
     end_indices: List[int],
@@ -96,11 +97,17 @@ def _valid_alignment(
     op_type = op._attrs["op"]
     if (
         op_type in ("fused_elementwise", "concatenate", "permute021")
-        or op._attrs["op"].startswith("layernorm")
-        or op._attrs["op"].startswith("group_layernorm")
+        or op_type.startswith("layernorm")
+        or op_type.startswith("group_layernorm")
     ):
         return True
 
+    dtype = slice_output_tensor.dtype()
+    stride = shape_utils.get_static_stride(slice_input_shape, slice_dim)
+    assert (
+        stride is not None
+    ), f"expected non-None stride for {slice_input_shape=} at {slice_dim=}"
+    start_offset = start_indices[slice_dim] * stride
     if op_type.startswith("gemm_rcr"):
         # for n-d * 2-d cases, we are only able to support a special case
         # where we fully slice all axes except the last one (i.e. -1), because
@@ -122,8 +129,22 @@ def _valid_alignment(
         k_dim = slice_input_shape[-1]
         if not isinstance(k_dim, IntImm):
             return False
-        alignment = math.gcd(k_dim.value(), start_indices[-1])
-        return alignment % 2 == 0
+        alignment = math.gcd(k_dim.value(), start_offset)
+        return utils_alignment.valid_alignment(alignment, dtype)
+
+    if op_type.startswith("bmm"):
+        bmm_inputs = op._attrs["inputs"]
+        if bmm_inputs[0] is slice_output_tensor:
+            leading_dim = op._get_a_leading_dim(
+                slice_input_shape[-2], slice_input_shape[-1]
+            )
+        elif bmm_inputs[1] is slice_output_tensor:
+            leading_dim = op._get_b_leading_dim(
+                slice_input_shape[-2], slice_input_shape[-1]
+            )
+        alignment = math.gcd(leading_dim.value(), start_offset)
+        return utils_alignment.valid_alignment(alignment, dtype)
+
     return False
 
 
@@ -202,6 +223,8 @@ def _process_one_slice_dst(
     # Now let's check alignment
     if not _valid_alignment(
         strided_op,
+        slice_dim,
+        slice_output_tensor,
         slice_input_shape,
         normalized_start_indices,
         normalized_end_indices,
diff --git a/tests/unittest/compiler/test_slice_bmm_fusion.py b/tests/unittest/compiler/test_slice_bmm_fusion.py
new file mode 100644
index 000000000..63c2fa6a2
--- /dev/null
+++ b/tests/unittest/compiler/test_slice_bmm_fusion.py
@@ -0,0 +1,530 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class SliceBMMFusionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(SliceBMMFusionTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _bmm_parameters(self, bmm_op_name, batch_sizes, M, N, K):
+        """
+        Return a dict of parameters used for constructing bmm ops
+        """
+        B_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+        M_dim = IntImm(M)
+        N_dim = IntImm(N)
+        K_dim = IntImm(K)
+        a_shape = {
+            "r": [B_dim, M_dim, K_dim],
+            "c": [B_dim, K_dim, M_dim],
+        }
+        b_shape = {
+            "r": [B_dim, K_dim, N_dim],
+            "c": [B_dim, N_dim, K_dim],
+        }
+        c_shape = {
+            "r": [B_dim, M_dim, N_dim],
+            "c": [B_dim, N_dim, M_dim],
+        }
+        permute = {
+            "r": None,
+            "c": [0, 2, 1],
+        }
+        bmm_op_name = bmm_op_name[:7]
+        a_layout = bmm_op_name[4]
+        b_layout = bmm_op_name[5]
+        c_layout = bmm_op_name[6]
+        bmm_dict = {}
+        bmm_dict["a_shape"] = a_shape.get(a_layout)
+        bmm_dict["b_shape"] = b_shape.get(b_layout)
+        bmm_dict["c_shape"] = c_shape.get(c_layout)
+        bmm_dict["a_permute"] = permute.get(a_layout)
+        bmm_dict["b_permute"] = permute.get(b_layout)
+        bmm_dict["c_permute"] = permute.get(c_layout)
+        return bmm_dict
+
+    def _test_slice_bmm_xxx_fusion_a(
+        self,
+        bmm_op_fn,
+        M,
+        N,
+        K,
+        slice_input_shape,
+        slice_start_indices,
+        slice_end_indices,
+        expected_num_tensors,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # bmm(slice_output, B)
+        assert (
+            len(slice_input_shape) == 3
+        ), f"expected {slice_input_shape=} to have a rank of 3"
+        Batch = slice_input_shape[0]
+        batch_sizes = [1, Batch]
+        bmm_op = bmm_op_fn()
+        bmm_params = self._bmm_parameters(bmm_op._attrs["op"], batch_sizes, M, N, K)
+        a_shape = bmm_params["a_shape"]
+
+        slice_input_tensor_shape = [a_shape[0]] + [
+            IntImm(d) for d in slice_input_shape[1:]
+        ]
+        X = Tensor(
+            shape=slice_input_tensor_shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        slice_op = ops.dynamic_slice()
+        A = slice_op(
+            X, start_indices=slice_start_indices, end_indices=slice_end_indices
+        )
+        assert shape_utils.is_same_shape(
+            a_shape, A.shape()
+        ), f"expected {a_shape=} and {A.shape()=} are the same shape"
+        b_shape = bmm_params["b_shape"]
+        B = Tensor(
+            shape=b_shape,
+            dtype=dtype,
+            name="b",
+            is_input=True,
+        )
+        input_tensors = [A, B]
+        c_shape = bmm_params["c_shape"]
+        has_add = "_add" in bmm_op._attrs["op"]
+        if has_add:
+            D = Tensor(
+                shape=c_shape,
+                dtype=dtype,
+                name="d",
+                is_input=True,
+            )
+            input_tensors.append(D)
+        Y = bmm_op(*input_tensors)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = "test_{}.so".format(self.test_count)
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        # Verify the generated graph.
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), expected_num_tensors)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_num_ops)
+
+        for batch in batch_sizes:
+            x_pt = get_random_torch_tensor([batch] + list(slice_input_shape[1:]), dtype)
+            b_pt = get_random_torch_tensor(
+                [batch, b_shape[1].value(), b_shape[2].value()], dtype
+            )
+            slice_indices = [
+                slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
+            ]
+            a_pt = x_pt[slice_indices]
+
+            a_permute = bmm_params["a_permute"]
+            bmm_a_pt = a_pt
+            if a_permute is not None:
+                bmm_a_pt = a_pt.permute(a_permute)
+            b_permute = bmm_params["b_permute"]
+            bmm_b_pt = b_pt
+            if b_permute is not None:
+                bmm_b_pt = b_pt.permute(b_permute)
+            y_pt = torch.bmm(bmm_a_pt, bmm_b_pt)
+            c_permute = bmm_params["c_permute"]
+            bmm_y_pt = y_pt
+            if c_permute is not None:
+                bmm_y_pt = y_pt.permute(c_permute)
+
+            inputs = {"x": x_pt, "b": b_pt}
+            if has_add:
+                d_pt = get_random_torch_tensor(
+                    [batch, c_shape[-2].value(), c_shape[-1].value()], dtype
+                )
+                inputs["d"] = d_pt
+                bmm_y_pt = bmm_y_pt + d_pt
+            y = get_torch_empty_tensor(bmm_y_pt.size(), dtype)
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(bmm_y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_slice_bmm_rcr_fusion_a(self):
+        # non-fusible due to the odd K
+        slice_start_indices = [0, 1, 0]
+        slice_end_indices = [None, 7, None]
+        K = 5
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rcr,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=4,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=7,
+            expected_num_ops=3,
+            test_name="slice_bmm_rcr_fusion_a",
+        )
+
+        slice_start_indices = [0, 0, 0]
+        slice_end_indices = [None, 4, None]
+        K = 8
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rcr,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=4,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=3,
+            expected_num_ops=1,
+            test_name="slice_bmm_rcr_fusion_a",
+        )
+
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 7, None]
+        K = 2
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rcr_add,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=4,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_rcr_fusion_a",
+        )
+
+    def test_slice_bmm_rrr_fusion_a(self):
+        # non-fusible
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        K = 7
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rrr_add,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=4,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=8,
+            expected_num_ops=3,
+            test_name="slice_bmm_rrr_fusion_a",
+        )
+
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        K = 4
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rrr_add,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=8,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_rrr_fusion_a",
+        )
+
+    def test_slice_bmm_rrc_fusion_a(self):
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        K = 2
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rrc_add,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=8,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_rrc_fusion_a",
+        )
+
+    def test_slice_bmm_crr_fusion_a(self):
+        # non-fusible
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        M = 3
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_crr_add,
+            M=M,
+            N=6,
+            K=(slice_end_indices[1] - slice_start_indices[1]),
+            slice_input_shape=(2, 10, M),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=9,
+            expected_num_ops=4,
+            test_name="slice_bmm_crr_fusion_a",
+        )
+
+        slice_start_indices = [0, 3, 0]
+        slice_end_indices = [None, 6, None]
+        M = 8
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_crr_add,
+            M=M,
+            N=6,
+            K=(slice_end_indices[1] - slice_start_indices[1]),
+            slice_input_shape=(2, 10, M),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_crr_fusion_a",
+        )
+
+    def test_slice_bmm_rcc_fusion_a(self):
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 7, None]
+        K = 8
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rcc,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=4,
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=3,
+            expected_num_ops=1,
+            test_name="slice_bmm_rcc_fusion_a",
+        )
+
+    def _test_slice_bmm_xxx_fusion_b(
+        self,
+        bmm_op_fn,
+        M,
+        N,
+        K,
+        slice_input_shape,
+        slice_start_indices,
+        slice_end_indices,
+        expected_num_tensors,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # bmm(A, slice_output)
+        assert (
+            len(slice_input_shape) == 3
+        ), f"expected {slice_input_shape=} to have a rank of 3"
+        Batch = slice_input_shape[0]
+        batch_sizes = [1, Batch]
+        bmm_op = bmm_op_fn()
+        bmm_params = self._bmm_parameters(bmm_op._attrs["op"], batch_sizes, M, N, K)
+        b_shape = bmm_params["b_shape"]
+
+        slice_input_tensor_shape = [b_shape[0]] + [
+            IntImm(d) for d in slice_input_shape[1:]
+        ]
+        X = Tensor(
+            shape=slice_input_tensor_shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        slice_op = ops.dynamic_slice()
+        B = slice_op(
+            X, start_indices=slice_start_indices, end_indices=slice_end_indices
+        )
+        a_shape = bmm_params["a_shape"]
+        A = Tensor(
+            shape=a_shape,
+            dtype=dtype,
+            name="a",
+            is_input=True,
+        )
+        assert shape_utils.is_same_shape(
+            b_shape, B.shape()
+        ), f"expected {a_shape=} and {A.shape()=} are the same shape"
+        input_tensors = [A, B]
+        c_shape = bmm_params["c_shape"]
+        has_add = "_add" in bmm_op._attrs["op"]
+        if has_add:
+            D = Tensor(
+                shape=c_shape,
+                dtype=dtype,
+                name="d",
+                is_input=True,
+            )
+            input_tensors.append(D)
+        Y = bmm_op(*input_tensors)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = "test_{}.so".format(self.test_count)
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        # Verify the generated graph.
+        sorted_graph = module.debug_sorted_graph
+        self.assertEqual(len(sorted_graph), expected_num_tensors)
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_num_ops)
+
+        for batch in batch_sizes:
+            x_pt = get_random_torch_tensor([batch] + list(slice_input_shape[1:]), dtype)
+            a_pt = get_random_torch_tensor(
+                [batch, a_shape[1].value(), a_shape[2].value()], dtype
+            )
+            slice_indices = [
+                slice(i, j) for i, j in zip(slice_start_indices, slice_end_indices)
+            ]
+            b_pt = x_pt[slice_indices]
+
+            a_permute = bmm_params["a_permute"]
+            bmm_a_pt = a_pt
+            if a_permute is not None:
+                bmm_a_pt = a_pt.permute(a_permute)
+            b_permute = bmm_params["b_permute"]
+            bmm_b_pt = b_pt
+            if b_permute is not None:
+                bmm_b_pt = b_pt.permute(b_permute)
+            y_pt = torch.bmm(bmm_a_pt, bmm_b_pt)
+            c_permute = bmm_params["c_permute"]
+            bmm_y_pt = y_pt
+            if c_permute is not None:
+                bmm_y_pt = y_pt.permute(c_permute)
+
+            inputs = {"x": x_pt, "a": a_pt}
+            if has_add:
+                d_pt = get_random_torch_tensor(
+                    [batch, c_shape[-2].value(), c_shape[-1].value()], dtype
+                )
+                inputs["d"] = d_pt
+                bmm_y_pt = bmm_y_pt + d_pt
+            y = get_torch_empty_tensor(bmm_y_pt.size(), dtype)
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(bmm_y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_slice_bmm_rrc_fusion_b(self):
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        N = 2
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_rrc_add,
+            M=8,
+            N=N,
+            K=(slice_end_indices[1] - slice_start_indices[1]),
+            slice_input_shape=(2, 10, N),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_rrc_fusion_b",
+        )
+
+    def test_slice_bmm_crc_fusion_b(self):
+        slice_start_indices = [0, 1, 0]
+        slice_end_indices = [None, 6, None]
+        N = 4
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_crc_add,
+            M=8,
+            N=N,
+            K=(slice_end_indices[1] - slice_start_indices[1]),
+            slice_input_shape=(2, 10, N),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_crc_fusion_b",
+        )
+
+    def test_slice_bmm_ccr_fusion_b(self):
+        slice_start_indices = [0, 1, 0]
+        slice_end_indices = [None, 6, None]
+        K = 4
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_ccr_add,
+            M=8,
+            N=(slice_end_indices[1] - slice_start_indices[1]),
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_ccr_fusion_b",
+        )
+
+    def test_slice_bmm_ccc_fusion_b(self):
+        # non-fusible
+        slice_start_indices = [0, 1, 0]
+        slice_end_indices = [None, 6, None]
+        K = 4
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_ccc_add,
+            M=5,
+            N=(slice_end_indices[1] - slice_start_indices[1]),
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=9,
+            expected_num_ops=4,
+            test_name="slice_bmm_ccc_fusion_b",
+        )
+
+        slice_start_indices = [0, 1, 0]
+        slice_end_indices = [None, 6, None]
+        K = 4
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_ccc_add,
+            M=8,
+            N=(slice_end_indices[1] - slice_start_indices[1]),
+            K=K,
+            slice_input_shape=(2, 10, K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=4,
+            expected_num_ops=1,
+            test_name="slice_bmm_ccc_fusion_b",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From ee64ffd512dc99e8b12af6d838471ed1d8179cf1 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 25 Mar 2023 02:05:56 -0700
Subject: [PATCH 334/638] Some typo and type info fixes (#487)

Summary:
PR to fix some typos and type info

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/487

Reviewed By: khabinov, alexanderguzhva

Differential Revision: D44383021

Pulled By: chenyang78

fbshipit-source-id: cf6af427132382c79e1b5454fb57cca7ba5687b0
---
 docs/source/reference/env.rst                      |  2 +-
 python/aitemplate/backend/target.py                | 14 ++++++++------
 .../compiler/ops/gemm_universal/gemm_common.py     |  5 ++---
 .../ops/gemm_universal/gemm_rcr_bias_broadcast.py  |  2 +-
 python/aitemplate/frontend/nn/identity.py          |  2 +-
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 6420fb5e1..db3f6604c 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -29,7 +29,7 @@ Profiling
 
 **HIP_VISIBLE_DEVICES**: This one is from ROCm itself. It's used to set the number of GPU devices available for profiling. Set to "0,1,2,3,4,5,6,7" to speed up profiling. For benchmarking, it's useful to set to a particular device to lower noise.
 
-**FORCE_PROFILE**: If set to "1", it will do profiling regarless in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile.
+**FORCE_PROFILE**: If set to "1", it will do profiling regardless in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile.
 
 **COMBINE_PROFILER_MULTI_SOURCES**: Whether to combine multiple profiler sources per target. "0" - Disabled, "1" - Enabled (default).
 
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 29c834269..36e55a4c0 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -268,7 +268,7 @@ def disable_profiler_codegen(self) -> bool:
     def force_profile(self) -> bool:
         """Whether to force profile.
 
-        Force profiling regarless in_ci_env, disable_profiler_codegen
+        Force profiling regardless in_ci_env, disable_profiler_codegen
 
         Returns
         -------
@@ -363,20 +363,22 @@ def get_profile_cache_version(self, op_class: str) -> int:
             return self._profile_cache.conv3d_cache_version
         raise NotImplementedError
 
-    def query_profile_cache(self, op_class: str, args: str) -> Tuple[str]:
+    def query_profile_cache(
+        self, op_class: str, args: Dict[str, Any]
+    ) -> Tuple[str, int]:
         """Query the profile cache for the given op class and args.
 
         Parameters
         ----------
         op_class : str
             Op class name. gemm, conv or normalization
-        args : str
+        args : Dict[str, Any]
             Op arguments.
 
         Returns
         -------
-        Tuple[str]
-            Queried best profile results.
+        Tuple[str, int]
+            Queried best profiling results.
 
         Raises
         ------
@@ -393,7 +395,7 @@ def query_profile_cache(self, op_class: str, args: str) -> Tuple[str]:
             return self._profile_cache.query_normalization(args)
         raise NotImplementedError
 
-    def insert_profile_cache(self, op_class: str, args: str):
+    def insert_profile_cache(self, op_class: str, args: Dict[str, Any]):
         """Insert the profile cache for the given op class and args."""
         if op_class == "gemm":
             self._profile_cache.insert_gemm(args)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 4744c069c..0bd5e4639 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -221,7 +221,6 @@ def _extract_epilogue_alignment(
 
         dtype = self._attrs["inputs"][0].dtype()
         self._attrs["epilogue_alignment"] = alignment.find_max_alignment(shape, dtype)
-        return
 
     def _infer_shapes(self, a: Tensor, b: Tensor):
         raise NotImplementedError("_infer_shapes() is not implemented!")
@@ -306,7 +305,7 @@ def _extract_exec_path(self, dynamic_profiling_strategy):
         """
 
         dim_info_dict: Dict[str, List[DimInfo]] = self._extract_dims()
-        dim_dict: Dict[str, IntVar] = {}
+        dim_dict: Dict[str, List[IntVar]] = {}
         for name, dim_infos in dim_info_dict.items():
             dim_info = None
             for d in dim_infos:
@@ -409,7 +408,7 @@ def _should_build_profiler(
         entry for this gemm instance, we update this gemm op's
         relevant attributes with the cached result and return False.
         """
-        # We are forced to use the cache so we skip building profilers.
+        # We are forced to use the cache, so we skip building profilers.
         if environ.force_profiler_cache():
             return False
         target = backend.target.Target.current()
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
index 4f1388834..8944a0113 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py
@@ -49,7 +49,7 @@ def is_valid_inputs(*inputs):
             for d in inputs[3:]:
                 d_shape = d.shape()
                 if d_shape != base_shape:
-                    msg = "Additional elementwise shape {d_shape} doesn't match gemm_bias' shape {base_shape}"
+                    msg = f"Additional elementwise shape {d_shape} doesn't match gemm_bias' shape {base_shape}"
                     return False, msg
 
         return True, msg
diff --git a/python/aitemplate/frontend/nn/identity.py b/python/aitemplate/frontend/nn/identity.py
index 272d8c320..31d1efb68 100644
--- a/python/aitemplate/frontend/nn/identity.py
+++ b/python/aitemplate/frontend/nn/identity.py
@@ -21,7 +21,7 @@
 
 
 class Identity(Module):
-    """The identify of the input."""
+    """The identity of the input."""
 
     def __init__(
         self,

From b9bafc5e920f728ca303854d90cbd139e914b62b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Sat, 25 Mar 2023 08:29:34 -0700
Subject: [PATCH 335/638] fix interaction of use_fp16_acc with input dtype in
 gemm_bias_broadcast (#491)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/491

att

Reviewed By: aakhundov

Differential Revision: D44389500

fbshipit-source-id: 190fdb2afafbb17eaf1e8d516262e1f2600d5851
---
 .../cuda/gemm_universal/common_bias_broadcast.py     |  1 +
 tests/unittest/ops/test_gemm_bias_broadcast.py       | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 4c19576fa..51a29bc2a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -365,6 +365,7 @@ def gemm_bias_broadcast_instance(
     if (
         "use_fp16_acc" in Target.current()._kwargs
         and Target.current()._kwargs["use_fp16_acc"]
+        and elem_type == "cutlass::half_t"
     ):
         acc_type = "cutlass::half_t"
     else:
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index 0c0564c0e..3c562e65a 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -236,8 +236,8 @@ def test_bias_rcr_add_add_relu(self):
     def test_bias_rcr_add_add_relu_rocm(self):
         self._test_bias_rcr_add_add_relu(8, None, None, 8, 8)
 
-    def _test_bias_rcr_mul(self, m, m0, m1, k, n, dtype="float16"):
-        target = detect_target()
+    def _test_bias_rcr_mul(self, m, m0, m1, k, n, use_fp16_acc=False, dtype="float16"):
+        target = detect_target(use_fp16_acc=use_fp16_acc)
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul()
         Y = OP(self.X, self.W, self.B, self.D0)
@@ -346,6 +346,14 @@ def test_gemm_bias_broadcast_bfloat16_bf16(self):
         self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="bfloat16")
         self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
 
+    def test_gemm_bias_broadcast_use_fp16_acc_sm80(self):
+        self._test_bias_rcr_mul(
+            None, 2, 32, 256, 128, use_fp16_acc=True, dtype="float32"
+        )
+        self._test_bias_rcr_mul(
+            None, 2, 32, 256, 128, use_fp16_acc=True, dtype="bfloat16"
+        )
+
 
 filter_test_cases_by_test_env(GEMMBiasBroadcastTestCase)
 

From 5781e7191c670b3f5577f6e81229b784db626fc7 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 25 Mar 2023 13:58:46 -0700
Subject: [PATCH 336/638] Fix epiligue typo (#494)

Summary:
Fix epiligue typo

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/494

Reviewed By: aakhundov

Differential Revision: D44395404

Pulled By: chenyang78

fbshipit-source-id: 5694133d8037af55946989c1c962b18516d69ca2
---
 .../cuda/gemm_epilogue_vistor/common_dual_gemm.py    | 12 ++++++------
 .../backend/cuda/gemm_universal/bmm_rcr_permute.py   |  2 +-
 .../backend/cuda/gemm_universal/bmm_rrr_permute.py   |  2 +-
 .../backend/cuda/gemm_universal/bmm_xxx.py           |  2 +-
 .../aitemplate/backend/cuda/gemm_universal/common.py |  6 +++---
 .../backend/cuda/gemm_universal/gemm_rcr_permute.py  |  2 +-
 .../backend/cuda/gemm_universal/gemm_rrr.py          |  2 +-
 .../backend/cuda/gemm_universal/gemm_rrr_permute.py  |  2 +-
 .../backend/cuda/gemm_universal/perm021fc_ccr.py     |  2 +-
 .../gemm_universal/perm021fc_ccr_bias_permute.py     |  2 +-
 .../backend/cuda/gemm_universal/perm021fc_crc.py     |  2 +-
 .../backend/cuda/gemm_universal/perm102_bmm_rcr.py   |  2 +-
 .../backend/cuda/gemm_universal/perm102_bmm_rrr.py   |  2 +-
 13 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 60b69b285..1cbb62c5f 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -182,8 +182,8 @@ def default_fproc(
     a_layout,
     b_layout,
     c_layout,
-    epiligue_name,
-    epiligue2_name,
+    epilogue_name,
+    epilogue2_name,
     permute_layout=None,
     dtype="float16",
 ):
@@ -228,8 +228,8 @@ def default_fproc(
         # set output major
         op.C.layout = c_layout
         # set epilogue
-        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name]
-        op.epilogue_functor2 = cutlass_lib.library.EpilogueFunctorName[epiligue2_name]
+        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
+        op.epilogue_functor2 = cutlass_lib.library.EpilogueFunctorName[epilogue2_name]
         op.element_epilogue = acc_type
         if permute_layout is not None:
             op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
@@ -261,8 +261,8 @@ def fproc(op):
             a_layout=a_layout,
             b_layout=b_layout,
             c_layout=c_layout,
-            epiligue_name=func_attrs["epilogue"],
-            epiligue2_name=func_attrs["epilogue2"],
+            epilogue_name=func_attrs["epilogue"],
+            epilogue2_name=func_attrs["epilogue2"],
             dtype=dtype,
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
index 083f47f86..16451f4de 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
@@ -54,7 +54,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
index 379f0faf9..30b066b34 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
@@ -54,7 +54,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
index c10837f42..ca99405cf 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
@@ -61,7 +61,7 @@ def fproc(op):
                 b_layout=layout_choice[b_layout],
                 c_layout=layout_choice[c_layout],
                 dtype=func_attrs["inputs"][0].dtype(),
-                epiligue_name=func_attrs["epilogue"],
+                epilogue_name=func_attrs["epilogue"],
             )
 
         func_attrs["op_instance"] = common.extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 3624ee3da..46e1bca4d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -1131,7 +1131,7 @@ def gen_function_call(func_attrs, indent="  ", bias_ptr_arg=None):
 
 
 def default_fproc(
-    *, op, a_layout, b_layout, c_layout, dtype, epiligue_name, permute_layout=None
+    *, op, a_layout, b_layout, c_layout, dtype, epilogue_name, permute_layout=None
 ):
     import copy
 
@@ -1183,7 +1183,7 @@ def default_fproc(
         # set output major
         op.C.layout = c_layout
         # set epilogue
-        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name]
+        op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
         op.element_epilogue = acc_type
         if permute_layout is not None:
             op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
@@ -1212,7 +1212,7 @@ def fproc(op):
             b_layout=b_layout,
             c_layout=c_layout,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
index 8814248de..91605d166 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
@@ -76,7 +76,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 37e2bd064..170450290 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -76,7 +76,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = common.extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
index e8bd44869..8ff8d7acb 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
@@ -77,7 +77,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
index dd20aed8f..e6b51647d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py
@@ -52,7 +52,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = common.extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
index 1641a2b95..4bbc994ed 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py
@@ -93,7 +93,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
             permute_layout=func_attrs["layout"],
         )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
index cbe218e54..f760809e6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
@@ -53,7 +53,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = common.extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
index cafdf96ac..63b1ef34b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
@@ -109,7 +109,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.ColumnMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = common.extract_config(fproc)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
index ff2103afe..8fb2fb8f2 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
@@ -80,7 +80,7 @@ def fproc(op):
             b_layout=cutlass_lib.library.LayoutType.RowMajor,
             c_layout=cutlass_lib.library.LayoutType.RowMajor,
             dtype=func_attrs["inputs"][0].dtype(),
-            epiligue_name=func_attrs["epilogue"],
+            epilogue_name=func_attrs["epilogue"],
         )
 
     func_attrs["op_instance"] = common.extract_config(fproc)

From b7c1cbf6d80832dccdf78cdaacefa8e815659e38 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 25 Mar 2023 13:59:53 -0700
Subject: [PATCH 337/638] Add is_input to visualization plot (#492)

Summary:
Currently visualization html shows `is_output` and `is_param` flags.
We can also show `is_input` flag.

Screenshot: https://ibb.co/bKrxtWX

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/492

Reviewed By: aakhundov

Differential Revision: D44395413

Pulled By: chenyang78

fbshipit-source-id: 836d07d3f4ad165e732aa9efdfbfcc3163bbca19
---
 python/aitemplate/utils/graph_utils.py        | 2 +-
 python/aitemplate/utils/visualization/plot.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index 4539c9f6b..a2292fe92 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -208,7 +208,7 @@ def track_graph_timings(
         # profiler records the results under the original_name
         op_name = op._attrs["original_name"]
 
-        # replace op_name with an unique name, if provided
+        # replace op_name with a unique name, if provided
         if op_name is not None:
             if op_name not in op_durations:
                 # op_name was not found in the profiler report
diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 02e414261..757466598 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -69,6 +69,7 @@ def _gen_tensor_modal(tensor) -> str:
         if tensor._attrs["is_view_of"] is None
         else tensor._attrs["is_view_of"]._attrs["name"]
     )
+    content["is_input"] = str(tensor._attrs["is_input"])
     content["is_output"] = str(tensor._attrs["is_output"])
     content["is_param"] = str(tensor._attrs["is_param"])
     content["dtype"] = str(tensor._attrs["dtype"])
@@ -225,7 +226,7 @@ def plot_graph(
             op_node = None
             op_name = src_op._attrs["name"]
 
-            # replace op_name with an unique name, if provided
+            # replace op_name with a unique name, if provided
             if op_name is not None:
                 op_name = op_names.get(src_op, op_name)
 
@@ -259,7 +260,7 @@ def plot_graph(
             op_node = None
             op_name = dst_op._attrs["name"]
 
-            # replace op_name with an unique name, if provided
+            # replace op_name with a unique name, if provided
             if op_name is not None:
                 op_name = op_names.get(dst_op, op_name)
 

From 8c472bdeadfb40c9f86f519b0670eebdf2441841 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sun, 26 Mar 2023 01:28:47 -0700
Subject: [PATCH 338/638] adjust rtol/atol for float32 tests (#497)

Summary:
test_slice_reshape_concat_fusible_2_2_float32 failed locally. Adjust rtol/atol to 5e-2 for float32 tests

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/497

Reviewed By: aakhundov

Differential Revision: D44395294

Pulled By: chenyang78

fbshipit-source-id: 413d989e752934c5650f717739cbc4f390b59ef3
---
 tests/unittest/compiler/test_slice_view_strided.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index 5b97fbf1f..8d89b38ba 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -32,8 +32,8 @@
 
 
 _TOLERANCE_LIMITS = {
-    "float16": {"atol": 1e-2, "rtol": 1e-2},
-    "float32": {"atol": 1e-2, "rtol": 1e-2},
+    "float16": {"atol": 5e-2, "rtol": 5e-2},
+    "float32": {"atol": 5e-2, "rtol": 5e-2},
     "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
 }
 
@@ -110,7 +110,6 @@ def test_slice_view_gemm_fusible(self, dtype):
         )
     )
     def test_slice_view_gemm_non_fusible(self, dtype):
-
         N = 4
         batch_dim = IntVar([1, 2, 3], "batch_size")
 
@@ -168,7 +167,6 @@ def test_slice_view_gemm_non_fusible(self, dtype):
         )
     )
     def test_slice_flatten_concat_fusible_1(self, dtype):
-
         test_name = f"slice_flatten_concat_fusible_{dtype}"
         batch_dim = IntVar([3, 10], "batch_size")
         X0 = test_utils.gen_input_tensor([batch_dim, 12, 1], dtype=dtype, name="x0")
@@ -248,7 +246,6 @@ def test_slice_flatten_concat_fusible_1(self, dtype):
         )
     )
     def test_slice_flatten_concat_fusible_2(self, dtype):
-
         test_name = f"slice_flatten_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 2], "batch_size")
         X0 = test_utils.gen_input_tensor([batch_dim, 2, 1], dtype=dtype, name="x0")
@@ -322,7 +319,6 @@ def test_slice_flatten_concat_fusible_2(self, dtype):
         )
     )
     def test_slice_reshape_concat_fusible_1(self, dtype):
-
         test_name = f"slice_reshape_concat_fusible_{dtype}_1"
         batch_dim = IntVar([1, 2], "batch_size")
         M = 2
@@ -395,8 +391,7 @@ def test_slice_reshape_concat_fusible_1(self, dtype):
         )
     )
     def test_slice_reshape_concat_fusible_2(self, dtype):
-
-        test_name = "slice_reshape_concat_fusible_{dtype}_2"
+        test_name = f"slice_reshape_concat_fusible_{dtype}_2"
         batch_dim = IntVar([1, 8], "batch_size")
         M = 8
         N = 64

From 1dd93562627a48a45838b4d5c25eebe9022b32a3 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sun, 26 Mar 2023 04:07:53 -0700
Subject: [PATCH 339/638] fixed how we fetch leading_a_dim and leading_b_dim
 for bmm ops (#498)

Summary:
This PR fixed the incorrect uses of bmm's methods _get_a_leading_dim and _get_b_leading_dim

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/498

Reviewed By: aakhundov

Differential Revision: D44395398

Pulled By: chenyang78

fbshipit-source-id: bc13da9c8a45bf8510070778a6d443a054d3290a
---
 .../backend/cuda/gemm_universal/common.py     |  11 +-
 .../transform/transform_strided_slice.py      |  13 +-
 .../compiler/test_slice_bmm_fusion.py         | 113 ++++++++++++++++--
 .../compiler/test_slice_gemm_fusion.py        |   2 -
 4 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 46e1bca4d..8f24d2a7d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -1093,11 +1093,12 @@ def gen_local_dim_defs(func_attrs, indent="  "):
             # skip dynamic dims
             if isinstance(dim, IntImm):
                 input_shape = func_attrs["inputs"][input_idx]._attrs["shape"]
-                name = input_shape[idx]._attrs["name"]
-                if name in dims:
-                    assert dims[name] == dim.value(), "bmm inputs shape mismatch"
-                else:
-                    dims[name] = dim.value()
+                if idx < len(input_shape):
+                    name = input_shape[idx]._attrs["name"]
+                    if name in dims:
+                        assert dims[name] == dim.value(), "bmm inputs shape mismatch"
+                    else:
+                        dims[name] = dim.value()
     return DIM_DEFS_TEMPLATE.render(dims=dims, indent=indent)
 
 
diff --git a/python/aitemplate/compiler/transform/transform_strided_slice.py b/python/aitemplate/compiler/transform/transform_strided_slice.py
index f9c69b6d0..ee556b819 100644
--- a/python/aitemplate/compiler/transform/transform_strided_slice.py
+++ b/python/aitemplate/compiler/transform/transform_strided_slice.py
@@ -135,13 +135,22 @@ def _valid_alignment(
     if op_type.startswith("bmm"):
         bmm_inputs = op._attrs["inputs"]
         if bmm_inputs[0] is slice_output_tensor:
+            # _get_a_leading_dim(m, k)
             leading_dim = op._get_a_leading_dim(
-                slice_input_shape[-2], slice_input_shape[-1]
+                slice_input_shape[op._get_m_idx_in_a(slice_input_shape)],
+                slice_input_shape[op._get_k_idx_in_a(slice_input_shape)],
             )
         elif bmm_inputs[1] is slice_output_tensor:
+            # _get_a_leading_dim(n, k)
             leading_dim = op._get_b_leading_dim(
-                slice_input_shape[-2], slice_input_shape[-1]
+                slice_input_shape[op._get_n_idx_in_b(slice_input_shape)],
+                slice_input_shape[op._get_k_idx_in_b(slice_input_shape)],
             )
+        else:
+            # TODO: support strided access for other inputs
+            return False
+        if not isinstance(leading_dim, IntImm):
+            return False
         alignment = math.gcd(leading_dim.value(), start_offset)
         return utils_alignment.valid_alignment(alignment, dtype)
 
diff --git a/tests/unittest/compiler/test_slice_bmm_fusion.py b/tests/unittest/compiler/test_slice_bmm_fusion.py
index 63c2fa6a2..223011bab 100644
--- a/tests/unittest/compiler/test_slice_bmm_fusion.py
+++ b/tests/unittest/compiler/test_slice_bmm_fusion.py
@@ -42,9 +42,9 @@ def _bmm_parameters(self, bmm_op_name, batch_sizes, M, N, K):
         Return a dict of parameters used for constructing bmm ops
         """
         B_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
-        M_dim = IntImm(M)
-        N_dim = IntImm(N)
-        K_dim = IntImm(K)
+        M_dim = shape_utils.gen_int_var_min_max(M) if isinstance(M, list) else IntImm(M)
+        N_dim = shape_utils.gen_int_var_min_max(N) if isinstance(N, list) else IntImm(N)
+        K_dim = shape_utils.gen_int_var_min_max(K) if isinstance(K, list) else IntImm(K)
         a_shape = {
             "r": [B_dim, M_dim, K_dim],
             "c": [B_dim, K_dim, M_dim],
@@ -99,7 +99,8 @@ def _test_slice_bmm_xxx_fusion_a(
         a_shape = bmm_params["a_shape"]
 
         slice_input_tensor_shape = [a_shape[0]] + [
-            IntImm(d) for d in slice_input_shape[1:]
+            shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
+            for d in slice_input_shape[1:]
         ]
         X = Tensor(
             shape=slice_input_tensor_shape,
@@ -146,8 +147,19 @@ def _test_slice_bmm_xxx_fusion_a(
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), expected_num_ops)
 
-        for batch in batch_sizes:
-            x_pt = get_random_torch_tensor([batch] + list(slice_input_shape[1:]), dtype)
+        dynamic_dim = [d for d in slice_input_shape[1:] if isinstance(d, list)]
+        assert (
+            len(dynamic_dim) == 0 or len(dynamic_dim) == 1
+        ), f"expected at most one dynamic dim besides batch dim in {slice_input_shape=}"
+        if len(dynamic_dim) == 1:
+            assert len(dynamic_dim[0]) == len(
+                batch_sizes
+            ), f"expected {dynamic_dim[0]} and {batch_sizes=} have the same rank"
+        for idx, batch in enumerate(batch_sizes):
+            input_shape_pt = [batch] + [
+                d[idx] if isinstance(d, list) else d for d in slice_input_shape[1:]
+            ]
+            x_pt = get_random_torch_tensor(input_shape_pt, dtype)
             b_pt = get_random_torch_tensor(
                 [batch, b_shape[1].value(), b_shape[2].value()], dtype
             )
@@ -266,6 +278,23 @@ def test_slice_bmm_rrr_fusion_a(self):
         )
 
     def test_slice_bmm_rrc_fusion_a(self):
+        # non-fusible due to dynamic dimension
+        slice_start_indices = [0, 2, 0]
+        slice_end_indices = [None, 6, None]
+        K = 2
+        self._test_slice_bmm_xxx_fusion_a(
+            bmm_op_fn=ops.bmm_rrc_add,
+            M=(slice_end_indices[1] - slice_start_indices[1]),
+            N=8,
+            K=K,
+            slice_input_shape=(2, [10, 20], K),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=5,
+            expected_num_ops=2,
+            test_name="slice_bmm_rrc_fusion_dynamic_a",
+        )
+
         slice_start_indices = [0, 2, 0]
         slice_end_indices = [None, 6, None]
         K = 2
@@ -358,7 +387,8 @@ def _test_slice_bmm_xxx_fusion_b(
         b_shape = bmm_params["b_shape"]
 
         slice_input_tensor_shape = [b_shape[0]] + [
-            IntImm(d) for d in slice_input_shape[1:]
+            shape_utils.gen_int_var_min_max(d) if isinstance(d, list) else IntImm(d)
+            for d in slice_input_shape[1:]
         ]
         X = Tensor(
             shape=slice_input_tensor_shape,
@@ -379,7 +409,7 @@ def _test_slice_bmm_xxx_fusion_b(
         )
         assert shape_utils.is_same_shape(
             b_shape, B.shape()
-        ), f"expected {a_shape=} and {A.shape()=} are the same shape"
+        ), f"expected {b_shape=} and {B.shape()=} are the same shape"
         input_tensors = [A, B]
         c_shape = bmm_params["c_shape"]
         has_add = "_add" in bmm_op._attrs["op"]
@@ -405,8 +435,19 @@ def _test_slice_bmm_xxx_fusion_b(
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
         self.assertEqual(len(sorted_ops), expected_num_ops)
 
-        for batch in batch_sizes:
-            x_pt = get_random_torch_tensor([batch] + list(slice_input_shape[1:]), dtype)
+        dynamic_dim = [d for d in slice_input_shape[1:] if isinstance(d, list)]
+        assert (
+            len(dynamic_dim) == 0 or len(dynamic_dim) == 1
+        ), f"expected at most one dynamic dim besides batch dim in {slice_input_shape=}"
+        if len(dynamic_dim) == 1:
+            assert len(dynamic_dim[0]) == len(
+                batch_sizes
+            ), f"expected {dynamic_dim[0]} and {batch_sizes=} have the same rank"
+        for idx, batch in enumerate(batch_sizes):
+            input_shape_pt = [batch] + [
+                d[idx] if isinstance(d, list) else d for d in slice_input_shape[1:]
+            ]
+            x_pt = get_random_torch_tensor(input_shape_pt, dtype)
             a_pt = get_random_torch_tensor(
                 [batch, a_shape[1].value(), a_shape[2].value()], dtype
             )
@@ -475,6 +516,23 @@ def test_slice_bmm_crc_fusion_b(self):
         )
 
     def test_slice_bmm_ccr_fusion_b(self):
+        # non-fusible
+        slice_start_indices = [0, 0, 2]
+        slice_end_indices = [None, None, 6]
+        N = 8
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_ccr_add,
+            M=6,
+            N=N,
+            K=(slice_end_indices[-1] - slice_start_indices[-1]),
+            slice_input_shape=(2, N, 7),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=5,
+            expected_num_ops=2,
+            test_name="slice_bmm_ccr_fusion_b",
+        )
+
         slice_start_indices = [0, 1, 0]
         slice_end_indices = [None, 6, None]
         K = 4
@@ -525,6 +583,41 @@ def test_slice_bmm_ccc_fusion_b(self):
             test_name="slice_bmm_ccc_fusion_b",
         )
 
+    def test_slice_bmm_rrr_fusion_b(self):
+        # non-fusible
+        slice_start_indices = [0, 0, 0]
+        slice_end_indices = [None, None, 4]
+        K = 8
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_rrr_add,
+            M=9,
+            N=(slice_end_indices[-1] - slice_start_indices[-1]),
+            K=K,
+            slice_input_shape=(2, K, 7),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=5,
+            expected_num_ops=2,
+            test_name="slice_bmm_rrr_fusion_b",
+        )
+
+        # non-fusible due to dynamic cim
+        slice_start_indices = [0, 0, 0]
+        slice_end_indices = [None, None, 4]
+        K = 8
+        self._test_slice_bmm_xxx_fusion_b(
+            bmm_op_fn=ops.bmm_rrr_add,
+            M=4,
+            N=(slice_end_indices[-1] - slice_start_indices[-1]),
+            K=K,
+            slice_input_shape=(2, K, [10, 20]),
+            slice_start_indices=slice_start_indices,
+            slice_end_indices=slice_end_indices,
+            expected_num_tensors=5,
+            expected_num_ops=2,
+            test_name="slice_bmm_rrr_fusion_dynamic_b",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_slice_gemm_fusion.py b/tests/unittest/compiler/test_slice_gemm_fusion.py
index 5312f4d3a..7465a68b5 100644
--- a/tests/unittest/compiler/test_slice_gemm_fusion.py
+++ b/tests/unittest/compiler/test_slice_gemm_fusion.py
@@ -348,7 +348,6 @@ def _test_slice_gemm_rcr_fusion_a_2(
         no_fusion=False,
         dtype="float16",
     ):
-
         X = Tensor(
             shape=slice_input_shape,
             dtype=dtype,
@@ -826,7 +825,6 @@ def test_slice_gemm_fusion_float_sm80(self):
             slice_start_indices=(0, 8),
             slice_end_indices=(None, 16),
             test_name="slice_gemm_rcr_fusion_a_2_float",
-            no_fusion=True,
             dtype="float",
         )
         self._test_slice_gemm_rcr_bias_add(

From a4fcc00fc83285ca56e58a351a474252d4a28817 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sun, 26 Mar 2023 10:37:54 -0700
Subject: [PATCH 340/638] Allow file name without dir to dump ait_to_py (#490)

Summary:
`AITDebugSettings` allow to specify py file to which generated `AITProgram` will be dumped.
Currently it expects at least one dir before the file name.
This PR allows to use file name without dir.
e.g.
```
debug_settings = AITDebugSettings()
debug_settings.dump_ait_to_py = "ait_prog.py"

m = compiler.compile_model(...,
    debug_settings=debug_settings,
)
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/490

Reviewed By: alexanderguzhva

Differential Revision: D44397289

Pulled By: aakhundov

fbshipit-source-id: afcf832ea5a1f1e1ca30545e4711aa262e894501
---
 docs/static/ait_model.html                            | 2 +-
 python/aitemplate/compiler/compiler.py                | 3 ++-
 python/aitemplate/utils/serialization/ait_program.py  | 2 +-
 python/aitemplate/utils/serialization/serdes_code.py  | 4 +++-
 python/aitemplate/utils/visualization/web_template.py | 2 +-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/static/ait_model.html b/docs/static/ait_model.html
index 9a93d717f..3f414b67b 100644
--- a/docs/static/ait_model.html
+++ b/docs/static/ait_model.html
@@ -661,7 +661,7 @@ <h5 class="modal-title" id="Y_label">Y</h5>
           /*make the matching letters bold:*/
           b.innerHTML = "<strong>" + arr[i].substr(0, val.length) + "</strong>";
           b.innerHTML += arr[i].substr(val.length);
-          /*insert a input field that will hold the current array item's value:*/
+          /*insert an input field that will hold the current array item's value:*/
           b.innerHTML += "<input type='hidden' value='" + arr[i] + "'>";
           /*execute a function when someone clicks on the item value (DIV element):*/
               b.addEventListener("click", function(e) {
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 3c4038e1b..8d5ac2274 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -203,7 +203,8 @@ def compile_model(
     # arguments (even if we put quotes around it)!!
     test_name = test_name.replace(",", "_")
     test_dir = os.path.join(workdir, test_name)
-    profile_dir = workdir if profile_dir is None else profile_dir
+    if profile_dir is None:
+        profile_dir = workdir
 
     if debug_settings.dump_ait_to_py:
         dump_program(tensor, debug_settings.dump_ait_to_py)
diff --git a/python/aitemplate/utils/serialization/ait_program.py b/python/aitemplate/utils/serialization/ait_program.py
index 12e3068a4..0c24052cc 100644
--- a/python/aitemplate/utils/serialization/ait_program.py
+++ b/python/aitemplate/utils/serialization/ait_program.py
@@ -86,6 +86,6 @@ def set_all_random_constants(self, dtype="float16"):
     def model(self) -> Union[Tensor, Tuple[Tensor]]:
         """
         This function defines the AIT program.
-        Returns a output tensor, or a tuple of output tensors.
+        Returns an output tensor, or a tuple of output tensors.
         """
         pass
diff --git a/python/aitemplate/utils/serialization/serdes_code.py b/python/aitemplate/utils/serialization/serdes_code.py
index c263bca9f..6c58299d3 100644
--- a/python/aitemplate/utils/serialization/serdes_code.py
+++ b/python/aitemplate/utils/serialization/serdes_code.py
@@ -360,7 +360,9 @@ def dump_program(
     )
 
     if file_path != "":
-        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        dirs = os.path.dirname(file_path)
+        if dirs != "":
+            os.makedirs(dirs, exist_ok=True)
         with open(file_path, "w") as f:
             f.write(program)
 
diff --git a/python/aitemplate/utils/visualization/web_template.py b/python/aitemplate/utils/visualization/web_template.py
index 3f1f1c920..488e14d1f 100644
--- a/python/aitemplate/utils/visualization/web_template.py
+++ b/python/aitemplate/utils/visualization/web_template.py
@@ -162,7 +162,7 @@
           /*make the matching letters bold:*/
           b.innerHTML = "<strong>" + arr[i].substr(0, val.length) + "</strong>";
           b.innerHTML += arr[i].substr(val.length);
-          /*insert a input field that will hold the current array item's value:*/
+          /*insert an input field that will hold the current array item's value:*/
           b.innerHTML += "<input type='hidden' value='" + arr[i] + "'>";
           /*execute a function when someone clicks on the item value (DIV element):*/
               b.addEventListener("click", function(e) {

From 3ecd0d61b7867ecd2a9ee4535cf57dbe2b46fff0 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Mon, 27 Mar 2023 01:23:30 -0700
Subject: [PATCH 341/638] Inherit StableSet from abc.MutableSet (#488)

Summary:
Issues:

1. `StableSet.__init__` takes input items as `Sequence`
However, we do not use any `Sequence` class methods when we work with the input items.
We can use `Iterable` instead of `Sequence` for input items in `StableSet.__init__`.

2. `Tensor.__init__`  has input param `src_ops` which is marked as `Sequence`, but in all places where we create `Tensor` we pass `set` to  `src_ops` param, e.g
```
output = Tensor(output_shape, src_ops={self}, dtype="int64")
```
`set` is not a `Sequence`. We can mark `src_ops` as `Iterable` instead of `Sequence` in `Tensor.__init__` . `set` is `Iterable`.

3. `StableSet` implements all required methods of abstract class [MutableSet](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes) but is it not formally inherits from it.
We can mark `StableSet` as `MutableSet` by inheriting `StableSet` from  `abc.MutableSet`.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/488

Reviewed By: alexanderguzhva

Differential Revision: D44397350

Pulled By: aakhundov

fbshipit-source-id: 6aa5880ace9b546093fc645e6a432d9b31c18fef
---
 python/aitemplate/compiler/base.py       | 20 ++++++++++----------
 python/aitemplate/compiler/stable_set.py |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index d6b47a5d5..b9deffd71 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -22,7 +22,7 @@
 from enum import Enum
 from functools import reduce
 from pprint import pformat
-from typing import Any, Dict, List, Optional, Sequence, Set, Union
+from typing import Any, Dict, Iterable, List, Optional, Set, Union
 
 import numpy as np
 import sympy
@@ -590,8 +590,8 @@ def __init__(
         self,
         shape: List[IntVar],
         name: str = None,
-        src_ops: Sequence[Node] = None,
-        dst_ops: Sequence[Node] = None,
+        src_ops: Iterable[Node] = None,
+        dst_ops: Iterable[Node] = None,
         dtype: str = "float16",
         is_input: bool = False,
         is_output: bool = False,
@@ -608,16 +608,16 @@ def __init__(
         shape : List[IntVar]
             Shape of this Tensor.
         name : str, optional
-            Name of this Tensor. By default it's None.
-        src_ops : Set[Node], optional
+            Name of this Tensor. By default, it's None.
+        src_ops : Iterable[Node], optional
             Source operators of this Tensor which write to this Tensor.
-            By default it's an empty set.
-        dst_ops : Set[Node], optional
+            By default, it's an empty set.
+        dst_ops : Iterable[Node], optional
             Destination operators of this Tensor which take this Tensor as
             one of their inputs.
-            By default it's an empty set.
+            By default, it's an empty set.
         dtype : str, optional
-            Date type of this Tensor. By default it's "float16".
+            Date type of this Tensor. By default, it's "float16".
         is_input : bool, optional
             Whether this Tensor is an input Tensor of a graph.
             Note that constant Tensors (e.g. weights) are NOT input Tensors.
@@ -1018,7 +1018,7 @@ def profile(
             A list of device ids which can be used for profiling.
         dynamic_profiling_strategy: DynamicProfileStrategy, optional
             Profiling strategy used when there are dynamic dims.
-            By default MAX is used, i.e. to profile a dynamic range, an upper bound will be used.
+            By default, MAX is used, i.e. to profile a dynamic range, an upper bound will be used.
         """
 
         return
diff --git a/python/aitemplate/compiler/stable_set.py b/python/aitemplate/compiler/stable_set.py
index 84a5704d7..82f945078 100644
--- a/python/aitemplate/compiler/stable_set.py
+++ b/python/aitemplate/compiler/stable_set.py
@@ -19,12 +19,12 @@
 potentially make debugging (e.g. comparison with the original graph, comparison between
 AIT GPU trace and other GPU traces) easier.
 """
+from collections import abc
+from typing import Any, Iterable
 
-from typing import Any, Sequence
 
-
-class StableSet:
-    def __init__(self, s: Sequence[Any] = None):
+class StableSet(abc.MutableSet):
+    def __init__(self, s: Iterable[Any] = None):
         if s is None:
             s = []
         self._d = {item: None for item in s}

From 47503a2fc1902118165de188e56dc9ce7d0c5906 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 27 Mar 2023 12:26:39 -0700
Subject: [PATCH 342/638] Resurrect gemm_ / bmm_softmax (#500)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/500

The `gemm_` / `bmm_softmax` ops are broken for ~10 months, with the corresponding tests disabled, due to an inconsistency with the CUTLASS API update in 2.10. As a result, the major changes in the GEMM back-end did not cover these ops.

In this diff, the following `softmax` ops are resurrected:

- `gemm_rcr_softmax`
- `gemm_rcr_bias_softmax`
- `bmm_rcr_softmax`

More specifically, the CUTLASS API inconsistency is resolved, the ops' GEMM back-end code is modernized to be compatible with the outer GEMM ecosystem, the unit tests are made functional again. Importantly, the diff introduces minimal changes necessary to make the above `softmax` ops functional again. Further improvements (some listed below) are *not introduced* here.

What is *done* in this diff (based on terrychenism's [unpublished branch](https://github.com/fairinternal/AITemplate/compare/main...terrychenism:AITemplate:softmax_update) from 8 months ago):

- Update the internal `GemmSoftmaxUnversal` CUTLASS-based operator to be consistent with the 3.0 APIs.
- Fix the bug in `GemmSoftmaxUnversal` operator (also present in the upstream `GemmSoftmax`).
    - To allow SMEM > `(48 << 10)` bytes required by some of the generated op instances.
- Use op workspace instead of *all* temporary inputs.
- Generalize `dtype` instead of the hard-coded `half`.
- Combine profilers into a single file, as required by the outer `gemm_universal` code.
- Remove `split_k` support from the codegen, as the CUTLASS op uses `kBatched` mode hard-coded.

What is *not done* in this diff (hence, should probably be done in the future):

- Performance tuning (e.g., based on tweaking the `ApplyShape` as suggested [here](https://github.com/NVIDIA/cutlass/blob/master/examples/35_gemm_softmax/gemm_softmax.cu#L221-L224)).
- Use upstream CUTLASS [`GemmSoftmax`](https://github.com/NVIDIA/cutlass/blob/master/examples/35_gemm_softmax/gemm_with_softmax.h#L317) operator instead of the internal `GemmSoftmaxUniversal`.
    - Due to the SMEM bug. Will submit a PR to `nvidia/cutlass` soon, can switch when merged.
- Support arbitrary number of `a` dimensions instead of M in `gemm_` ops.
- Support input TensorAccessors (output TAs seem to be supported).
- Other things that are possibly missed.

Reviewed By: ipiszy

Differential Revision: D44406815

fbshipit-source-id: 39077a19e391f9054532c661a60f5065b9f48123
---
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   | 103 ++--
 .../gemm_epilogue_vistor/common_softmax.py    | 461 ++++++++++++------
 .../gemm_rcr_bias_softmax.py                  |  74 ++-
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  | 135 ++---
 .../include/gemm_with_softmax.h               | 147 ++++--
 .../gemm_epilogue_vistor/bmm_rcr_softmax.py   |  21 +-
 .../gemm_rcr_bias_softmax.py                  |  11 +-
 .../gemm_epilogue_vistor/gemm_rcr_softmax.py  |  20 +-
 tests/unittest/ops/test_bmm_softmax.py        |  89 ++--
 tests/unittest/ops/test_gemm_bias_softmax.py  | 108 ++--
 tests/unittest/ops/test_gemm_softmax.py       | 102 ++--
 11 files changed, 796 insertions(+), 475 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 2965eb1c2..1fc726a6c 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -22,8 +22,8 @@
 
 from aitemplate.backend import registry
 from aitemplate.backend.cuda.gemm_epilogue_vistor import (
-    bmm_common_softmax as bmm_common,
     common_softmax,
+    gemm_rcr_softmax,
 )
 from aitemplate.backend.cuda.gemm_universal import common
 from aitemplate.backend.cuda.gemm_universal.layout import RCR
@@ -53,63 +53,57 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     /*
-        A: B*M*K (RowMajor)
-        B: B*N*K (ColumnMajor)
-        C/D/sofmax: B*M*N (RowMajor)
-        N: B*M*1 (RowMajor)
+        A: (B, M, K) (RowMajor)
+        B: (B, N, K) (ColumnMajor)
+        C, D, Soft: (B, M, N) (RowMajor)
+        N, S: (B, block_num, M) (RowMajor)
     */
 
-    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
-    B,                       // int32_t batch_count_
-    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
-    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
-    {c_ptr, LayoutC(N)},     // TensorRefC ref_C_
-    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {M, N, K},                                                                                                                             // cutlass::gemm::GemmCoord problem_size
+    B,                                                                                                                                     // int32_t batch_count_
+    {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                           // TensorRefA ref_A_
+    {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                           // TensorRefB ref_B_
+    {reinterpret_cast<{{elem_output_type}}*>(workspace), LayoutC(N)},                                                                      // TensorRefC ref_C_
+    {reinterpret_cast<{{elem_output_type}}*>(workspace + B * M * N * sizeof({{elem_output_type}})), LayoutC(N)},                           // TensorRefC ref_D_
     {
         float(1.0),
         float(0.0)
-    },                       // typename EpilogueFunctorOp::Params linear_scaling
-    {n_ptr, LayoutC(1)},     // ???
-    {soft_ptr, LayoutC(N)},  // ???
-    M*K,                     // int64_t batch_stride_A_
-    N*K,                     // int64_t batch_stride_B_
-    M*N,                     // int64_t batch_stride_C_
-    M*N,                     // int64_t batch_stride_D_
-    M*N,                     // ???
-    M*N,                     // ???
+    },                                                                                                                                     // typename EpilogueFunctorOp::Params linear_scaling
+    {reinterpret_cast<float*>(workspace + 2 * B * M * N * sizeof({{elem_output_type}})), LayoutC(1)},                                      // TensorRefN ref_N_
+    {reinterpret_cast<float*>(workspace + 2 * B * M * N * sizeof({{elem_output_type}}) + B * M * block_num * sizeof(float)), LayoutC(1)},  // TensorRefSum ref_S_
+    {reinterpret_cast<{{elem_output_type}}*>(soft_ptr) + output_offset, LayoutC(output_stride)},                                           // TensorRefSoft ref_Softmax_
+    M * K,                                                                                                                                 // int64_t batch_stride_A_
+    N * K,                                                                                                                                 // int64_t batch_stride_B_
+    M * N,                                                                                                                                 // int64_t batch_stride_C_
+    M * N,                                                                                                                                 // int64_t batch_stride_D_
+    M * block_num,                                                                                                                         // int64_t batch_stride_Max_
+    M * block_num,                                                                                                                         // int64_t batch_stride_Sum_
+    M * N                                                                                                                                  // int64_t batch_stride_Softmax_
 """
 )
 
 
 @registry.reg("cuda.bmm_rcr_softmax.config")
 def bmm_rcr_softmax_config(func_attrs, dtype="float16"):
-    """This function sets a callback for processing the epilogue of the kernel
-    associated with func_attrs.
-
-    Parameters
-    ----------
-    func_attrs: Dictionary
-        kernel attributes dictionary
-    layout: layout object
-        kernel layout
-    Returns
-    -------
-    None
-    """
     common.make_fproc(func_attrs, RCR)
 
 
 @registry.reg("cuda.bmm_rcr_softmax.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
-    """Generate code for profiling"""
-    return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        dim_info_dict,
-        common_softmax.SRC_TEMPLATE,
-        PROBLEM_ARGS_TEMPLATE,
-        ARGS_PARSER_TEMPLATE,
-        emit_kernel=True,
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+):
+    return gemm_rcr_softmax.common_gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_softmax.SRC_TEMPLATE,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
+        ndims=3,
     )
 
 
@@ -119,26 +113,27 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
 ):
-    """Generate the code for main function"""
-    return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE.render(),
+    return gemm_rcr_softmax.gen_function(
+        func_attrs=func_attrs,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
     )
 
 
 @registry.reg("cuda.bmm_rcr_softmax.func_decl")
 def gen_function_decl(func_attrs):
-    """Rendering argument to function declaration template"""
-    func_name = func_attrs["name"]
-    return bmm_common.FUNC_DECL_TEMPLATE.render(func_name=func_name, ndims=3)
+    return gemm_rcr_softmax.gen_function_decl(
+        func_attrs=func_attrs,
+    )
 
 
 @registry.reg("cuda.bmm_rcr_softmax.func_call")
 def gen_function_call(func_attrs, indent="  "):
-    """Rendering the code to function call template"""
-    return bmm_common.gen_function_call(func_attrs, indent)
+    return gemm_rcr_softmax.gen_function_call(
+        func_attrs=func_attrs,
+        indent=indent,
+    )
 
 
 @registry.reg("cuda.bmm_rcr_softmax.filter")
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index 56e580df8..15c8c0642 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -21,6 +21,7 @@
 
 import jinja2
 
+from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.common import gemm_common
 from aitemplate.backend.cuda.gemm_universal import common
 from aitemplate.backend.target import Target
@@ -34,6 +35,7 @@
 #include <memory>
 #include <random>
 #include <vector>
+
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/gemm/kernel/gemm_grouped.h"
@@ -67,21 +69,20 @@
 using LayoutB = cutlass::layout::ColumnMajor;
 using LayoutC = cutlass::layout::RowMajor;
 
-
-void {{function_name}} (
-    cutlass::half_t* a_ptr,
-    cutlass::half_t* b_ptr,
-{% if has_d %}
-    cutlass::half_t* d_ptr,
+{% if is_profiler %}
+template <typename {{instance_name_base}}>
+void {{func_name}} (
+    {{instance_name_base}}& gemm_op,
+{% else %}
+void {{func_name}} (
 {% endif %}
-    cutlass::half_t* c_ptr,
-    cutlass::half_t* d_ptr,
-    float* n_ptr,
-    cutlass::half_t* soft_ptr,
-    uint8_t* workspace,
-{% if support_split_k %}
-    int split_k,
+    void* a_ptr,
+    void* b_ptr,
+{% if has_bias %}
+    void* bias_ptr,
 {% endif %}
+    void* soft_ptr,
+    uint8_t* workspace,
 {% for idx in range(input_ndims) %}
     int64_t* a_dim{{idx}},
 {% endfor %}
@@ -94,10 +95,13 @@
     cudaStream_t stream
   ) {
   {{shape_eval}}
+
   {{output_addr_calculator}}
+
   {{extra_shape}}
 
   {{exec_paths}}
+
   throw std::runtime_error(
       "Unsupported workload for this gemm specialization."
   );
@@ -110,41 +114,40 @@
 
 EXEC_TEMPLATE = jinja2.Template(
     """
-{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}int block_num = (N + {{instance}}::ThreadblockShape::kN - 1) / {{instance}}::ThreadblockShape::kN;
 
-{{problem_args}}
-
-{{indent}}};
-{{indent}}{{instance}} gemm_op;
 {% if is_profiler %}
-{{indent}}size_t workspace_size = 0; //gemm_op.get_workspace_size(arguments);
+{{indent}}size_t workspace_size = 2 * M * N * sizeof({{elem_output_type}}) + 2 * block_num * M * sizeof(float);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();
 {{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size;
+{% else %}
+{{indent}}{{instance}} gemm_op;
 {% endif %}
 
+{{indent}}typename {{instance}}::Arguments arguments{
+{{problem_args}}
+{{indent}}};
+
 {{indent}}auto status = gemm_op.initialize(arguments);
 {{indent}}CUTLASS_CHECK(status);
 {{indent}}status = gemm_op(stream);
 {{indent}}CUTLASS_CHECK(status);
 {{indent}}return;
-
 """
 )
 
+
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  cutlass::half_t*,
-  float*,
-  cutlass::half_t*,
-  uint8_t*,
-{% if support_split_k %}
-  int,
+  void*,
+  void*,
+{% if has_bias %}
+  void*,
 {% endif %}
+  void*,
+  uint8_t*,
 {% for idx in range(input_ndims) %}
   int64_t*,
 {% endfor %}
@@ -163,17 +166,16 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
+{% if is_profiler %}
+{{indent}}    gemm_op,
+{% endif %}
 {{indent}}    {{a_ptr}},
 {{indent}}    {{b_ptr}},
 {% if has_bias %}
 {{indent}}    {{bias_ptr}},
 {% endif %}
-{{indent}}    {{c_ptr}},
-{{indent}}    {{d_ptr}},
-{{indent}}    {{n_ptr}},
 {{indent}}    {{soft_ptr}},
 {{indent}}    global_workspace_,
-{{indent}}    {{split_k}},
 {% for dim in adims %}
 {{indent}}    {{dim}},
 {% endfor %}
@@ -189,23 +191,67 @@
 )
 
 
+BENCHMARK_INSTANCE_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}
+{{indent}}{{instance_name}} {{gemm_op}};
+{{indent}}const char *gemm_op_name = "{{gemm_op_name}}";
+{{indent}}int ret = 0;
+{{indent}}try {
+{{indent}}ret = {{func_name}}(
+{{indent}}    {{gemm_op}},
+{{indent}}    gemm_op_name,
+{{indent}}    {{a_ptr}},
+{{indent}}    {{b_ptr}},
+{% if has_bias %}
+{{indent}}    {{bias_ptr}},
+{% endif %}
+{{indent}}    {{soft_ptr}},
+{{indent}}    global_workspace_,
+{% for dim in adims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{% for dim in bdims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{% for dim in cdims %}
+{{indent}}    {{dim}},
+{% endfor %}
+{{indent}}    stream
+{{indent}});
+{{indent}}} catch (...) {}
+{{indent}}if (ret != 0)
+{{indent}}  return ret;
+{{indent}}
+{{indent}}}
+"""
+)
+
+
 TENSOR_DECL_TEMPLATE = jinja2.Template(
     """
-  // cast to int64_t to avoid overflow
-  int64_t a_ptr_sz = static_cast<int64_t>(a_dim0) * static_cast<int64_t>(a_dim1);
-  int64_t b_ptr_sz = static_cast<int64_t>(b_dim0) * static_cast<int64_t>(b_dim1);
-  int64_t c_ptr_sz = static_cast<int64_t>(c_dim0) * static_cast<int64_t>(c_dim1);
-  int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
-
-  memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // d_ptr: index 3
-  memory_pool->AllocateFloatTensor(c_dim0,  mem_pool_sz);  // n_ptr: index 4
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // soft_ptr: index 5
+  int64_t a_ptr_sz = a_dim0 * a_dim1;
+  int64_t b_ptr_sz = b_dim0 * b_dim1;
+  int64_t c_ptr_sz = c_dim0 * c_dim1;
+
+  // The value 1 is used to force ptr_max_sz to be non-zero
+  int64_t ptr_max_sz = std::max<int64_t>({1, a_ptr_sz, b_ptr_sz, c_ptr_sz});
+
+  size_t one_copy_sz = a_ptr_sz + b_ptr_sz + c_ptr_sz;
+{% if has_bias %}
+  one_copy_sz += c_dim1;
+{%endif%}
+
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+
+  memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);                      // a_ptr: index 0
+  memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);                      // b_ptr: index 1
+  memory_pool->AllocateTensor(c_ptr_sz, mem_pool_sz, /*is_output*/ true);  // soft_ptr: index 2
+
+{% if has_bias %}
+  memory_pool->AllocateTensor(c_dim1, mem_pool_sz);                        // bias_ptr: index 3
+{% endif %}
 """
 )
 
@@ -227,8 +273,62 @@
 #include <sstream>
 {{op_func}}
 
+template <typename GemmInstance>
+int benchmark_{{func_name}} (
+    GemmInstance &gemm_op,
+    const char *gemm_op_name,
+    void* a_ptr,
+    void* b_ptr,
+{% if has_bias %}
+    void* bias_ptr,
+{% endif %}
+    void* soft_ptr,
+    uint8_t* global_workspace_,
+{% for idx in range(input_ndims) %}
+    int64_t* a_dim{{idx}},
+{% endfor %}
+{% for idx in range(weight_ndims) %}
+    int64_t* b_dim{{idx}},
+{% endfor %}
+{% for idx in range(input_ndims) %}
+    int64_t* c_dim{{idx}},
+{% endfor %}
+    cudaStream_t stream
+  ) {
+  // warmup
+  for (int i = 0; i < 5; ++i) {
+    {{func_call}}
+  }
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0], stream);
+  for (int i = 0; i < 10; ++i) {
+    {{func_call}}
+  }
+  cudaEventRecord(events[1], stream);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+      throw std::runtime_error(
+      "OOB in cutlass."
+    );
+  }
+  std::cout << "OP:" << gemm_op_name << ",";
+  std::cout << "TIME:" << runtime_ms << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  return 0;
+}
+
+template <typename DType>
 struct ProfilerMemoryPool {
-  ProfilerMemoryPool() {
+  ProfilerMemoryPool() : shared_input_tensor(false) {
     std::random_device rd;
     gen = std::mt19937(rd());
     uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
@@ -240,7 +340,50 @@
   }
   ~ProfilerMemoryPool() {}
 
-  template <typename DType>
+  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz) {
+    // TODO: special pool size for A100 L2 cache 40M
+    // need to tune it for other devices
+    int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+    size_t free_global_mem = 0;
+    size_t total_global_mem = 0;
+    cudaError_t cuda_error = cudaMemGetInfo(&free_global_mem, &total_global_mem);
+    if (cuda_error != cudaSuccess) {
+      auto error_msg = std::string("Failed to invoke cudaMemGetInfo: ") +
+          cudaGetErrorName(cuda_error) + ", at " + __FILE__;
+      throw std::runtime_error(error_msg);
+    }
+    size_t single_copy_nbytes = one_copy_sz * sizeof(DType);
+    while (mem_pool_sz > 0) {
+      size_t nbytes = single_copy_nbytes * mem_pool_sz;
+      if (nbytes < free_global_mem) {
+        break;
+      }
+      mem_pool_sz--;
+    }
+
+    if (mem_pool_sz <= 1) {
+      size_t minimal_required_nbytes = ptr_max_sz * sizeof(DType);
+      if (minimal_required_nbytes > free_global_mem) {
+        // We absolutely run out of memory
+        auto error_msg = std::string("no enough GPU memory: requested ") +
+            std::to_string(minimal_required_nbytes) + ", available: " +
+            std::to_string(free_global_mem) + ", ptr_max_sz: " +
+            std::to_string(ptr_max_sz) + ", at " + __FILE__;
+        throw std::runtime_error(error_msg);
+      } else {
+        // Let's try to allocate a single blob that is large enough to hold
+        // all input tensors. Note that this is still an approximation, because
+        // we may still hit cudaErrorMemoryAllocation error while allocating
+        // memory for the output. We will rely on cudaMalloc to throw out
+        // an exception in such a case.
+        shared_input_tensor = true;
+        AllocateGaussianTensor(ptr_max_sz);
+      }
+      return 1;
+    }
+    return mem_pool_sz;
+  }
+
   DType* AllocateGaussianTensor(int64_t size) {
     size_t length = size * sizeof(DType);
     blobs.emplace_back(length);
@@ -256,41 +399,25 @@
     return ptr;
   }
 
-
-  cutlass::half_t* AllocateHalfGaussianTensor(int64_t size) {
-    return reinterpret_cast<cutlass::half_t*>(
-        AllocateGaussianTensor<__half>(size));
-  }
-
-  int AllocateHalfTensor(int64_t size, int64_t copy) {
-    offsets.push_back(0);
-    strides.push_back(size);
-    copies.push_back(copy);
-    auto ptr = AllocateHalfGaussianTensor(size * copy);
-    ptrs.push_back(reinterpret_cast<void*>(ptr));
-    return ptrs.size() - 1;
-  }
-
-  float* AllocateFloatGaussianTensor(int64_t size) {
-    return reinterpret_cast<float*>(
-        AllocateGaussianTensor<float>(size));
-  }
-
-  int AllocateFloatTensor(int64_t size, int64_t copy) {
+  int AllocateTensor(int64_t size, int64_t copy, bool is_output = false) {
     offsets.push_back(0);
     strides.push_back(size);
     copies.push_back(copy);
-    auto ptr = AllocateFloatGaussianTensor(size * copy);
+    DType *ptr;
+    if (!is_output && shared_input_tensor) {
+      ptr = reinterpret_cast<DType*>(blobs.back().get());
+    } else {
+      ptr = AllocateGaussianTensor(size * copy);
+    }
     ptrs.push_back(reinterpret_cast<void*>(ptr));
     return ptrs.size() - 1;
   }
 
-  template <typename T>
-  T* RequestTensorByIdx(int idx) {
+  DType* RequestTensorByIdx(int idx) {
     auto copy = copies.at(idx);
     auto offset = offsets.at(idx);
     auto stride = strides.at(idx);
-    T* ptr = reinterpret_cast<T*>(ptrs.at(idx));
+    DType* ptr = reinterpret_cast<DType*>(ptrs.at(idx));
     ptr += offset;
     offset += stride;
     if (offset == copy * stride) {
@@ -307,13 +434,16 @@
   std::vector<cutlass::DeviceAllocation<uint8_t> > blobs;
   std::mt19937 gen;
   std::uniform_int_distribution<int64_t> uniform_dist;
+  // make a shared blob to hold all inputs in cases we do not have
+  // enough GPU memory
+  bool shared_input_tensor;
 };
 
 int main(int argc, char** argv) {
   int device_idx;
   cudaDeviceProp device_properties;
   cudaError_t result = cudaGetDevice(&device_idx);
-  auto memory_pool = std::make_unique<ProfilerMemoryPool>();
+  auto memory_pool = std::make_unique<ProfilerMemoryPool<{{elem_type}}>>();
   if (result != cudaSuccess) {
     std::ostringstream errorStream;
     errorStream << "cudaGetDevice() call failed! "
@@ -332,44 +462,16 @@
     throw std::runtime_error(errorStream.str());
   }
 
-
-
   {{args_parse}}
 
-  using ElementOutput = typename {{name}}::ElementC;
-  using ElementInputA = typename {{name}}::ElementA;
-  using ElementInputB = typename {{name}}::ElementB;
-  using ElementInputN = typename {{name}}::ElementN;
-  uint8_t* global_workspace = nullptr;
+  uint8_t* global_workspace_ = nullptr;
   cudaStream_t stream = nullptr;
 
   {{tensor_decl}}
 
-  // warmup
-  {{func_call}}
-  cudaEvent_t events[2];
-  for (auto & event : events) {
-    cudaEventCreate(&event);
-  }
-  cudaEventRecord(events[0], stream);
-  for (int i = 0; i < 5; ++i) {
-    {{func_call}}
-  }
-  cudaEventRecord(events[1], stream);
-  cudaEventSynchronize(events[1]);
-  float runtime_ms = 0;
-  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
-  for (auto event : events) {
-    (void)cudaEventDestroy(event);
-  }
-  // TODO: output workspace
-  if (runtime_ms < 0.00001) {
-      throw std::runtime_error(
-      "OOB in cutlass."
-    );
-  }
-  std::cout << "TIME:" << runtime_ms << std::endl;
-  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
+  {{benchmark_instances}}
+
+  return 0;
 }
 """
 )
@@ -412,9 +514,9 @@ def gen_function(
     input_ndims,
     weight_ndims,
     dim_info_dict,
+    has_bias=False,
     f_instance_convertor=_gemm_softmax_instance,
     emit_kernel=False,
-    support_split_k=False,
     output_addr_calculator="",
     extra_code="",
 ):
@@ -447,44 +549,51 @@ def gen_function(
             indent="    ",
             instance=fname,
             problem_args=problem_args,
-            support_split_k=support_split_k,
         )
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     return src_template.render(
         custom_libs=gen_custom_libs(),
         instances=instance_decl,
-        function_name=func_name,
-        dtype="cutlass::half_t",
+        func_name=func_name,
         shape_eval=shape_eval_func,
         output_addr_calculator=output_addr_calculator,
         exec_paths=exec_paths,
         input_ndims=input_ndims,
         weight_ndims=weight_ndims,
-        support_split_k=support_split_k,
-        has_d=common.has_d(func_attrs),
-        has_d1=common.has_d1(func_attrs),
         extra_code=extra_code,
+        has_bias=has_bias,
     )
 
 
 def gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
     args_parser_template,
     emit_kernel=False,
-    support_split_k=False,
     output_addr_calculator="",
     bias_ptr_arg=None,
     extra_code="",
+    ndims=2,
 ):
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
 
-    ndims = 2
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_type = backend_spec.dtype_to_backend_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+
     adims = ["&a_dim" + str(i) for i in range(ndims)]
     bdims = ["&b_dim" + str(i) for i in range(ndims)]
     cdims = ["&c_dim" + str(i) for i in range(ndims)]
@@ -492,56 +601,94 @@ def gen_profiler(
         indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
-    file_pairs = []
     has_bias = bias_ptr_arg is not None
-    for op_name, op in op_instance.items():
+    instance_name_base = "GemmSoftmaxInstance"
+    exec_program = EXEC_TEMPLATE.render(
+        indent="  ",
+        instance=instance_name_base,
+        is_profiler=True,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+        elem_output_type=elem_output_type,
+    )
+
+    instances = []
+    benchmark_instances = []
+    func_name = "gemm_softmax"
+    for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = emit_instance(op, emit_kernel=emit_kernel)
         config_name = common.extract_config_name(config)
-        name = "GemmInstance"
+        instance_name = f"{instance_name_base}_{instance_idx}"
+        gemm_op = f"gemm_softmax_op_{instance_idx}"
         instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
+            config_name=config_name, name=instance_name, config=config
         )
-        exec_program = EXEC_TEMPLATE.render(
+        benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
-            instance=name,
-            is_profiler=True,
-            support_split_k=support_split_k,
-            problem_args=problem_args_template.render(),
-        )
-        op_func = src_template.render(
-            custom_libs=gen_custom_libs(),
-            instances=instance,
-            function_name="gemm",
-            input_ndims=2,
-            weight_ndims=2,
-            shape_eval=shape_func,
-            exec_paths=exec_program,
-            output_addr_calculator=output_addr_calculator,
-            support_split_k=support_split_k,
-            extra_code=extra_code,
-        )
-        func_call = FUNC_CALL_TEMPLATE.render(
-            func_name="gemm",
-            a_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(0)",
-            b_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(1)",
+            instance_name=instance_name,
+            gemm_op=gemm_op,
+            gemm_op_name=op_name,
+            func_name=f"benchmark_{func_name}",
+            a_ptr="memory_pool->RequestTensorByIdx(0)",
+            b_ptr="memory_pool->RequestTensorByIdx(1)",
             has_bias=has_bias,
             bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(2)",
-            d_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(3)",
-            n_ptr="memory_pool->RequestTensorByIdx<float>(4)",
-            soft_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(5)",
-            split_k="split_k",
+            soft_ptr="memory_pool->RequestTensorByIdx(2)",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
         )
-        code = PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parser_template.render(),
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias),
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
+        instances.append(instance)
+        benchmark_instances.append(benchmark_instance)
+
+    op_func = src_template.render(
+        is_profiler=True,
+        instances="\n".join(instances),
+        func_name=func_name,
+        instance_name_base=instance_name_base,
+        custom_libs=gen_custom_libs(),
+        has_bias=has_bias,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        shape_eval=shape_func,
+        exec_paths=exec_program,
+        output_addr_calculator=output_addr_calculator,
+        extra_code=extra_code,
+    )
+    benchmark_adims = ["a_dim" + str(i) for i in range(ndims)]
+    benchmark_bdims = ["b_dim" + str(i) for i in range(ndims)]
+    benchmark_cdims = ["c_dim" + str(i) for i in range(ndims)]
+    func_call = FUNC_CALL_TEMPLATE.render(
+        is_profiler=True,
+        func_name=func_name,
+        a_ptr="a_ptr",
+        b_ptr="b_ptr",
+        has_bias=has_bias,
+        bias_ptr="bias_ptr",
+        soft_ptr="soft_ptr",
+        adims=benchmark_adims,
+        bdims=benchmark_bdims,
+        cdims=benchmark_cdims,
+    )
+    code = PROFILER_TEMPLATE.render(
+        op_func=op_func,
+        has_bias=has_bias,
+        args_parse=args_parser_template.render(),
+        func_name=func_name,
+        input_ndims=ndims,
+        weight_ndims=ndims,
+        func_call=func_call,
+        name=instance_name_base,
+        tensor_decl=TENSOR_DECL_TEMPLATE.render(
+            has_bias=has_bias,
+        ),
+        benchmark_instances="\n".join(benchmark_instances),
+        elem_output_type=elem_output_type,
+        elem_type=elem_type,
+    )
+
+    file_pairs = []
+    common.add_profiler(file_pairs, workdir, op_type, profiler_filename, code)
     return common.build_profiler(file_pairs)
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index 9ad034e81..40b6496ea 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -34,41 +34,53 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     /*
-        A: M*K (RowMajor)
-        B: N*K (ColumnMajor)
-        C/D/sofmax: M*N (RowMajor)
-        N: M*1 (RowMajor)
+        A: (M, K) (RowMajor)
+        B: (N, K) (ColumnMajor)
+        C, D, Soft: (M, N) (RowMajor)
+        N, S: (block_num, M) (RowMajor)
     */
 
-    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
-    1,                       // int32_t batch_count_
-    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
-    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
-    {c_ptr, 0},              // TensorRefC ref_C_
-    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {M, N, K},                                                                                                                     // cutlass::gemm::GemmCoord problem_size
+    1,                                                                                                                             // int32_t batch_count_
+    {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                   // TensorRefA ref_A_
+    {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                   // TensorRefB ref_B_
+    {reinterpret_cast<{{elem_output_type}}*>(bias_ptr), 0},                                                                        // TensorRefC ref_C_
+    {reinterpret_cast<{{elem_output_type}}*>(workspace + M * N * sizeof({{elem_output_type}})), LayoutC(N)},                       // TensorRefC ref_D_
     {
         float(1.0),
         float(1.0)
-    },                       // typename EpilogueFunctorOp::Params linear_scaling
-    {n_ptr, LayoutC(1)},     // ???
-    {soft_ptr, LayoutC(N)},  // ???
+    },                                                                                                                             // typename EpilogueFunctorOp::Params linear_scaling
+    {reinterpret_cast<float*>(workspace + 2 * M * N * sizeof({{elem_output_type}})), LayoutC(1)},                                  // TensorRefN ref_N_
+    {reinterpret_cast<float*>(workspace + 2 * M * N * sizeof({{elem_output_type}}) + M * block_num * sizeof(float)), LayoutC(1)},  // TensorRefSum ref_S_
+    {reinterpret_cast<{{elem_output_type}}*>(soft_ptr) + output_offset, LayoutC(output_stride)},                                   // TensorRefSoft ref_Softmax_
 """
 )
 
 
 @registry.reg("cuda.gemm_rcr_bias_softmax.config")
 def gemm_rcr_bias_softmax_config(func_attrs, dtype="float16"):
-    return gemm_rcr_softmax.gemm_rcr_softmax_config(func_attrs, dtype)
+    gemm_rcr_softmax.gemm_rcr_softmax_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_softmax.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+):
     return gemm_rcr_softmax.common_gen_profiler(
-        func_attrs,
-        workdir,
-        dim_info_dict,
-        common_softmax.SRC_TEMPLATE,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_softmax.SRC_TEMPLATE,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        args_parser_template=gemm_rcr_softmax.ARGS_PARSER_TEMPLATE,
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
 
@@ -79,23 +91,31 @@ def gen_function(
     dim_info_dict,
 ):
     return gemm_rcr_softmax.gen_function(
-        func_attrs,
-        exec_cond_template,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        has_bias=True,
     )
 
 
 @registry.reg("cuda.gemm_rcr_bias_softmax.func_decl")
 def gen_function_decl(func_attrs):
-    return gemm_rcr_softmax.gen_function_decl(func_attrs)
+    return gemm_rcr_softmax.gen_function_decl(
+        func_attrs=func_attrs,
+        has_bias=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_softmax.func_call")
 def gen_function_call(func_attrs, indent="  "):
+    bias = func_attrs["inputs"][2]
+
     return gemm_rcr_softmax.gen_function_call(
-        func_attrs,
-        indent,
+        func_attrs=func_attrs,
+        indent=indent,
+        has_bias=True,
+        bias_ptr=bias._attrs["name"],
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
index b417e2c94..a5fac4e53 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -21,6 +21,7 @@
 import jinja2
 
 from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.cuda.gemm_epilogue_vistor import common_softmax
 from aitemplate.backend.cuda.gemm_universal import common
 from aitemplate.backend.cuda.gemm_universal.layout import RCR
@@ -33,7 +34,6 @@
   int64_t M = std::atoi(argv[1]);
   int64_t N = std::atoi(argv[2]);
   int64_t K = std::atoi(argv[3]);
-  int64_t split_k = std::atoi(argv[4]);
 
   int64_t a_dim0 = M;
   int64_t a_dim1 = K;
@@ -47,68 +47,77 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     /*
-        A: M*K (RowMajor)
-        B: N*K (ColumnMajor)
-        C/D/sofmax: M*N (RowMajor)
-        N: M*1 (RowMajor)
+        A: (M, K) (RowMajor)
+        B: (N, K) (ColumnMajor)
+        C, D, Soft: (M, N) (RowMajor)
+        N, S: (block_num, M) (RowMajor)
     */
 
-    {M, N, K},               // cutlass::gemm::GemmCoord problem_size
-    1,                       // int32_t batch_count_
-    {a_ptr, LayoutA(K)},     // TensorRefA ref_A_
-    {b_ptr, LayoutB(K)},     // TensorRefB ref_B_
-    {c_ptr, LayoutC(N)},     // TensorRefC ref_C_
-    {d_ptr, LayoutC(N)},     // TensorRefC ref_D_
+    {M, N, K},                                                                                                                     // cutlass::gemm::GemmCoord problem_size
+    1,                                                                                                                             // int32_t batch_count_
+    {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                   // TensorRefA ref_A_
+    {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                   // TensorRefB ref_B_
+    {reinterpret_cast<{{elem_output_type}}*>(workspace), LayoutC(N)},                                                              // TensorRefC ref_C_
+    {reinterpret_cast<{{elem_output_type}}*>(workspace + M * N * sizeof({{elem_output_type}})), LayoutC(N)},                       // TensorRefC ref_D_
     {
         float(1.0),
         float(0.0)
-    },                       // typename EpilogueFunctorOp::Params linear_scaling
-    {n_ptr, LayoutC(1)},     // ???
-    {soft_ptr, LayoutC(N)},  // ???
+    },                                                                                                                             // typename EpilogueFunctorOp::Params linear_scaling
+    {reinterpret_cast<float*>(workspace + 2 * M * N * sizeof({{elem_output_type}})), LayoutC(1)},                                  // TensorRefN ref_N_
+    {reinterpret_cast<float*>(workspace + 2 * M * N * sizeof({{elem_output_type}}) + M * block_num * sizeof(float)), LayoutC(1)},  // TensorRefSum ref_S_
+    {reinterpret_cast<{{elem_output_type}}*>(soft_ptr) + output_offset, LayoutC(output_stride)},                                   // TensorRefSoft ref_Softmax_
 """
 )
 
 
 @registry.reg("cuda.gemm_rcr_softmax.config")
 def gemm_rcr_softmax_config(func_attrs, dtype="float16"):
-    common.make_fproc_f16(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR)
 
 
 def common_gen_profiler(
     func_attrs,
     workdir,
+    profiler_filename,
     dim_info_dict,
     src_template,
     problem_args_template,
-    bias_ptr_arg=None,
-    extra_code="",
+    args_parser_template,
+    **kwargs,
 ):
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
         stride_dim="*b_dim0"
     )
+
     return common_softmax.gen_profiler(
-        func_attrs,
-        workdir,
-        dim_info_dict,
-        src_template,
-        problem_args_template,
-        ARGS_PARSER_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=src_template,
+        problem_args_template=problem_args_template,
+        args_parser_template=args_parser_template,
         emit_kernel=True,
-        support_split_k=True,
         output_addr_calculator=output_addr_calculator,
-        bias_ptr_arg=bias_ptr_arg,
-        extra_code=extra_code,
+        **kwargs,
     )
 
 
 @registry.reg("cuda.gemm_rcr_softmax.gen_profiler")
-def gen_profiler(func_attrs, workdir, dim_info_dict):
+def gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+):
     return common_gen_profiler(
-        func_attrs,
-        workdir,
-        dim_info_dict,
-        common_softmax.SRC_TEMPLATE,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_softmax.SRC_TEMPLATE,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
     )
 
 
@@ -118,53 +127,68 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
     problem_args_template=None,
+    **kwargs,
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
     if problem_args_template is None:
-        problem_args = PROBLEM_ARGS_TEMPLATE.render()
-    else:
-        problem_args = problem_args_template.render()
+        problem_args_template = PROBLEM_ARGS_TEMPLATE
+
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+
     return common_softmax.gen_function(
-        func_attrs,
-        common_softmax.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common_softmax.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args_template.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        ),
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        dim_info_dict=dim_info_dict,
         emit_kernel=True,
-        support_split_k=True,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
         ),
+        **kwargs,
     )
 
 
 @registry.reg("cuda.gemm_rcr_softmax.func_decl")
-def gen_function_decl(func_attrs):
+def gen_function_decl(
+    func_attrs,
+    **kwargs,
+):
     func_name = func_attrs["name"]
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+
     return common_softmax.FUNC_DECL_TEMPLATE.render(
         func_name=func_name,
         input_ndims=input_ndims,
         weight_ndims=weight_ndims,
-        support_split_k=True,
+        **kwargs,
     )
 
 
 @registry.reg("cuda.gemm_rcr_softmax.func_call")
-def gen_function_call(func_attrs, indent="  "):
+def gen_function_call(
+    func_attrs,
+    indent="  ",
+    **kwargs,
+):
     a = func_attrs["inputs"][0]
     b = func_attrs["inputs"][1]
-
-    tmp_c = func_attrs["inputs"][2]
-    tmp_d = func_attrs["inputs"][3]
-    tmp_n = func_attrs["inputs"][4]
-
     soft = func_attrs["outputs"][0]
-    has_bias = False
+
     adims = [
         "&" + dim._attrs["name"]
         for dim in func_attrs["input_accessors"][0].original_shapes
@@ -177,20 +201,17 @@ def gen_function_call(func_attrs, indent="  "):
         "&" + dim._attrs["name"]
         for dim in func_attrs["output_accessors"][0].original_shapes
     ]
+
     return common_softmax.FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         a_ptr=a._attrs["name"],
         b_ptr=b._attrs["name"],
-        has_bias=has_bias,
-        c_ptr=tmp_c._attrs["name"],
-        d_ptr=tmp_d._attrs["name"],
-        n_ptr=tmp_n._attrs["name"],
         soft_ptr=soft._attrs["name"],
-        split_k=func_attrs["split_k"],
         adims=adims,
         bdims=bdims,
         cdims=cdims,
         indent=indent,
+        **kwargs,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
index 4ad8ee10b..5d3ea3d0e 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h
@@ -28,12 +28,13 @@ template <
     typename ArchTag,
     typename ElementAccumulator,
     int kStages,
-    typename ThreadblockShape,
+    typename ThreadblockShape_,
     typename WarpShape,
     typename InstructionShape,
     typename EpilogueFunctorOp,
     typename ThreadblockSwizzle,
-    typename ElementSum_ = ElementAccumulator,
+    typename ElementNorm_ = float,
+    typename ElementSum_ = float,
     typename ElementSoftmax_ = ElementC_>
 
 class GemmSoftmaxUniversal {
@@ -50,10 +51,13 @@ class GemmSoftmaxUniversal {
   using ElementCompute = ElementAccumulator;
   using ElementSum = ElementSum_;
   using ElementSoft = ElementSoftmax_;
+  using ElementSoftmaxCompute = float;
 
   using LayoutA = LayoutA_;
   using LayoutB = LayoutB_;
 
+  using ThreadblockShape = ThreadblockShape_;
+
   static int const kAlignment = kAlignmentA;
 
   ///////////////////////////////////////////////////////////////////////////////////////////////
@@ -71,35 +75,34 @@ class GemmSoftmaxUniversal {
   // This is a mandatory data type for the atomic reduction in the GEMM epilogue
   // to function.
 
-  using ElementN = float;
+  // using ElementN = float;
+  using ElementNorm = ElementNorm_;
+
+  using ApplyShape = MatrixShape<1, 1024>;
 
   // These are mandatory layouts.
   using LayoutC = cutlass::layout::RowMajor;
   using LayoutN = cutlass::layout::RowMajor;
+  using LayoutS = cutlass::layout::RowMajor;
   using LayoutSoft = cutlass::layout::RowMajor;
 
   using TensorRefA = TensorRef<ElementA, LayoutA>;
   using TensorRefB = TensorRef<ElementB, LayoutB>;
   using TensorRefC = TensorRef<ElementC, LayoutC>;
-  using TensorRefN = TensorRef<ElementN, LayoutN>;
+  using TensorRefN = TensorRef<ElementNorm, LayoutN>;
+  using TensorRefSum = TensorRef<ElementSum, LayoutS>;
   using TensorRefSoft = TensorRef<ElementSoft, LayoutSoft>;
 
-  // using OperatorClass       = cutlass::arch::OpClassTensorOp;
-  // using ArchTag             = cutlass::arch::Sm80;
-  // static int const kStages  = Stages;
-  // using ThreadblockSwizzle =
-  // cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
-
   ///////////////////////////////////////////////////////////////////////////////////////////////
 
   // basic GEMM kernel
   using DefaultGemmKernel = typename cutlass::gemm::kernel::DefaultGemm<
       ElementA,
       LayoutA,
-      kAlignment,
+      kAlignmentA,
       ElementB,
       LayoutB,
-      kAlignment,
+      kAlignmentB,
       ElementC,
       LayoutC,
       ElementCompute,
@@ -124,12 +127,16 @@ class GemmSoftmaxUniversal {
   ///////////////////////////////////////////////////////////////////////////////////////////////
 
   // Epilogue visitor
-  using EpilogueVisitor = kernel::EpilogueVisitorBiasMax<
-      ThreadblockShape,
-      DefaultGemmKernel::kThreadCount,
-      typename DefaultGemmKernel::Epilogue::OutputTileIterator,
-      ElementCompute,
-      EpilogueFunctorOp>;
+  using EpilogueVisitor =
+      typename cutlass::epilogue::threadblock::EpilogueVisitorSoftmax<
+          ThreadblockShape,
+          DefaultGemmKernel::kThreadCount,
+          typename DefaultGemmKernel::Epilogue::OutputTileIterator,
+          ElementCompute,
+          ElementNorm,
+          ElementSum,
+          ElementSoftmaxCompute,
+          EpilogueFunctorOp>;
 
   /// Epilogue
   using Epilogue = typename cutlass::epilogue::threadblock::
@@ -146,11 +153,19 @@ class GemmSoftmaxUniversal {
   // Softmax kernel
   using SoftmaxApplyKernel = kernel::ApplySoftmax<
       ElementC,
-      ElementN,
+      ElementNorm,
       ElementSum,
       ElementSoft,
+      ElementSoftmaxCompute,
       kAlignmentC,
-      MatrixShape<1, 1024>>;
+      ApplyShape>;
+
+  using ApplyFinalReductionKernel =
+      cutlass::reduction::kernel::ApplySoftmaxFinalReduction<
+          ElementNorm,
+          ElementSum,
+          ElementSoftmaxCompute,
+          ThreadblockShape>;
 
  public:
   /// Arguments class
@@ -158,6 +173,8 @@ class GemmSoftmaxUniversal {
     typename GemmKernel::Arguments gemm;
 
     typename SoftmaxApplyKernel::Arguments softmax;
+    typename ApplyFinalReductionKernel::Arguments reduction;
+    cutlass::gemm::GemmCoord extent;
 
     //
     // Methods
@@ -173,12 +190,14 @@ class GemmSoftmaxUniversal {
         TensorRefC ref_D_,
         typename EpilogueFunctorOp::Params linear_scaling,
         TensorRefN ref_N_,
+        TensorRefSum ref_S_,
         TensorRefSoft ref_Softmax_,
         int64_t batch_stride_A_ = 0,
         int64_t batch_stride_B_ = 0,
         int64_t batch_stride_C_ = 0,
         int64_t batch_stride_D_ = 0,
         int64_t batch_stride_Max_ = 0,
+        int64_t batch_stride_Sum_ = 0,
         int64_t batch_stride_Softmax_ = 0)
         : gemm(
               cutlass::gemm::GemmUniversalMode::kBatched,
@@ -186,38 +205,55 @@ class GemmSoftmaxUniversal {
               batch_count_,
               ref_A_,
               ref_B_,
+              ref_C_,
+              ref_D_,
+              ref_N_.data(),
+              ref_S_.data(),
               batch_stride_A_,
               batch_stride_B_,
               typename EpilogueVisitor::Arguments(
                   linear_scaling,
-                  ref_C_,
-                  ref_D_,
-                  ref_N_.data(),
                   batch_stride_C_,
                   batch_stride_D_,
-                  batch_stride_Max_)),
+                  batch_stride_Max_,
+                  batch_stride_Sum_)),
+          reduction(
+              problem_size,
+              ref_N_.data(),
+              ref_S_.data(),
+              batch_stride_Max_,
+              batch_stride_Sum_),
           softmax(
               MatrixCoord(problem_size.m(), problem_size.n()),
               batch_count_,
               ref_D_,
               ref_N_,
+              ref_S_,
               ref_Softmax_,
               batch_stride_D_,
               batch_stride_Max_,
-              batch_stride_Softmax_) {}
+              batch_stride_Sum_,
+              batch_stride_Softmax_),
+          extent(problem_size) {}
   };
 
   struct Params {
     typename GemmKernel::Params gemm;
 
     typename SoftmaxApplyKernel::Params softmax;
+    typename ApplyFinalReductionKernel::Params reduction;
+    MatrixCoord extent;
 
     //
     // Methods
     //
     Params() {}
 
-    Params(Arguments const& args) : gemm(args.gemm), softmax(args.softmax) {}
+    Params(Arguments const& args)
+        : gemm(args.gemm),
+          reduction(args.reduction),
+          softmax(args.softmax),
+          extent(MatrixCoord(args.extent.m(), args.extent.n())) {}
   };
 
  public:
@@ -254,10 +290,52 @@ class GemmSoftmaxUniversal {
 
     int gemm_smem_size = int(sizeof(typename GemmKernel::SharedStorage));
 
+    cudaError_t result;
+
+    if (gemm_smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(
+          cutlass::Kernel<GemmKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          gemm_smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
     cutlass::Kernel<GemmKernel>
         <<<gemm_grid, gemm_block, gemm_smem_size, stream>>>(params_.gemm);
 
-    cudaError_t result = cudaGetLastError();
+    result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      return cutlass::Status::kErrorInternal;
+    }
+
+    //
+    // Launch the ApplyFinalReductionKernel
+    //
+
+    int thread_per_block = 128;
+    int block_per_row =
+        (params_.extent.row() + thread_per_block - 1) / thread_per_block;
+    if (block_per_row < 4) {
+      thread_per_block = 32;
+      block_per_row =
+          (params_.extent.row() + thread_per_block - 1) / thread_per_block;
+    }
+
+    dim3 final_reduction_grid(
+        block_per_row, 1, params_.softmax.args.batch_count);
+    dim3 final_reduction_block(thread_per_block);
+
+    Kernel<ApplyFinalReductionKernel>
+        <<<final_reduction_grid,
+           final_reduction_block,
+           sizeof(typename ApplyFinalReductionKernel::SharedStorage),
+           stream>>>(params_.reduction);
+
+    result = cudaGetLastError();
 
     if (result != cudaSuccess) {
       return cutlass::Status::kErrorInternal;
@@ -268,15 +346,18 @@ class GemmSoftmaxUniversal {
     //
 
     dim3 apply_block(
-        SoftmaxApplyKernel::Shape::kColumn, SoftmaxApplyKernel::Shape::kRow);
+        SoftmaxApplyKernel::ApplyShape::kColumn,
+        SoftmaxApplyKernel::ApplyShape::kRow);
 
-    int cta_rows = SoftmaxApplyKernel::Shape::kRow;
-    int cta_columns =
-        SoftmaxApplyKernel::Shape::kColumn * SoftmaxApplyKernel::kAlignment;
+    int threadblock_rows = SoftmaxApplyKernel::ApplyShape::kRow;
+    int threadblock_columns = SoftmaxApplyKernel::ApplyShape::kColumn *
+        SoftmaxApplyKernel::kAlignment;
 
     dim3 apply_grid(
-        (params_.softmax.args.extent.row() + cta_rows - 1) / cta_rows,
-        (params_.softmax.args.extent.column() + cta_columns - 1) / cta_columns,
+        (params_.softmax.args.extent.row() + threadblock_rows - 1) /
+            threadblock_rows,
+        (params_.softmax.args.extent.column() + threadblock_columns - 1) /
+            threadblock_columns,
         params_.softmax.args.batch_count);
 
     Kernel<SoftmaxApplyKernel>
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
index b30aba29f..6eb2e387c 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -13,10 +13,7 @@
 #  limitations under the License.
 #
 """
-GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor]
-This is special in template based gemm solution
-This is used for `torch.nn.functional.linear`
-When use for `linear`, need set A->Data, B->Weight
+Operator definition for bmm_rcr_softmax.
 """
 
 from aitemplate.compiler.base import IntImm, Tensor
@@ -36,7 +33,6 @@ class bmm_rcr_softmax(bmm):
     def __init__(self):
         super().__init__()
         self._attrs["op"] = "bmm_rcr_softmax"
-        raise Exception("BMM + Softmax is disabled for now")
 
         def cal_align_ab(m, n, k):
             return common.default_align_ab(k, k, self._attrs["inputs"][0].dtype())
@@ -68,24 +64,13 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         output_shape = self._infer_shapes(a, b)
         self._extract_epilogue_alignment(output_shape)
 
-        temp_c = Tensor(output_shape, dst_ops={self})
-        temp_d = Tensor(output_shape, dst_ops={self})
-        temp_n = Tensor(
-            [output_shape[0], output_shape[1], IntImm(1)],
-            dtype="float32",
-            dst_ops={self},
-        )
-
-        self._attrs["inputs"].append(temp_c)
-        self._attrs["inputs"].append(temp_d)
-        self._attrs["inputs"].append(temp_n)
         self._attrs["input_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["inputs"]
         ]
 
         self._set_depth()
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
@@ -118,7 +103,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, exec_key):
         def fbuild_cmd(exec_key):
             B, M, N, K = self._invert_exec_key(exec_key)
             cmd = []
-            cmd.append(B)  # m
+            cmd.append(B)  # b
             cmd.append(M)  # m
             cmd.append(N)  # n
             cmd.append(K)  # k
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index f09339f98..76aa740dd 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -15,7 +15,7 @@
 """
 Operator definition for gemm_rcr_bias_softmax.
 """
-from aitemplate.compiler.base import _create_host_zero_tensor, Tensor
+from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.gemm_epilogue_vistor.gemm_rcr_softmax import (
     gemm_rcr_softmax,
 )
@@ -59,20 +59,13 @@ def __call__(self, a: Tensor, b: Tensor, bias: Tensor) -> Tensor:
         output_shape = self._infer_shapes(a, b, bias)
         self._extract_epilogue_alignment(output_shape)
 
-        temp_d = _create_host_zero_tensor(output_shape, dst_ops={self})
-        temp_n = _create_host_zero_tensor(
-            [output_shape[0], 1], dtype="float32", dst_ops={self}
-        )
-
-        self._attrs["inputs"].append(temp_d)
-        self._attrs["inputs"].append(temp_n)
         self._attrs["input_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["inputs"]
         ]
 
         self._set_depth()
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
index 630762e42..929a8a72d 100644
--- a/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -13,13 +13,10 @@
 #  limitations under the License.
 #
 """
-GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor]
-This is special in template based gemm solution
-This is used for `torch.nn.functional.linear`
-When use for `linear`, need set A->Data, B->Weight
+Operator definition for gemm_rcr_softmax.
 """
 
-from aitemplate.compiler.base import _create_host_zero_tensor, IntImm, Tensor
+from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.gemm_universal.gemm_rcr import gemm_rcr
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 
@@ -31,9 +28,9 @@ class gemm_rcr_softmax(gemm_rcr):
 
     def __init__(self):
         """Initializes gemm_rcr_softmax."""
+
         super().__init__()
         self._attrs["op"] = "gemm_rcr_softmax"
-        raise Exception("GEMM + Softmax is disabled for now")
 
     def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         """Performs sanity checks, offline shape inference and returns an output tensor."""
@@ -46,22 +43,13 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         output_shape = self._infer_shapes(a, b)
         self._extract_epilogue_alignment(output_shape)
 
-        temp_c = _create_host_zero_tensor(output_shape, dst_ops={self})
-        temp_d = _create_host_zero_tensor(output_shape, dst_ops={self})
-        temp_n = _create_host_zero_tensor(
-            [output_shape[0], IntImm(1)], dtype="float32", dst_ops={self}
-        )
-
-        self._attrs["inputs"].append(temp_c)
-        self._attrs["inputs"].append(temp_d)
-        self._attrs["inputs"].append(temp_n)
         self._attrs["input_accessors"] = [
             TensorAccessor(tensor) for tensor in self._attrs["inputs"]
         ]
 
         self._set_depth()
 
-        output = Tensor(output_shape, src_ops={self})
+        output = Tensor(output_shape, src_ops={self}, dtype=a._attrs["dtype"])
         self._attrs["outputs"] = [output]
         self._attrs["output_accessors"] = [TensorAccessor(output)]
         return output
diff --git a/tests/unittest/ops/test_bmm_softmax.py b/tests/unittest/ops/test_bmm_softmax.py
index 0aeaa6fe2..a56e9cc52 100644
--- a/tests/unittest/ops/test_bmm_softmax.py
+++ b/tests/unittest/ops/test_bmm_softmax.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
 import unittest
 
 import torch
@@ -20,49 +19,79 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 
 
-_LOGGER = logging.getLogger(__name__)
-
-
-# @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-@unittest.skip("BMM + Softmax is disabled for now")
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMSoftmaxTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_bmm_rcr_softmax(
-        self, B=16, M=16, K=64, N=24, test_name="bmm_rcr_softmax"
+        self,
+        B=16,
+        M=16,
+        K=64,
+        N=24,
+        dtype="float16",
+        test_name="bmm_rcr_softmax",
     ):
-
-        X = Tensor(shape=[B, M, K], dtype="float16", name="input_0", is_input=True)
-        W = Tensor(shape=[B, N, K], dtype="float16", name="input_1", is_input=True)
+        X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.bmm_rcr_softmax()
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        target = detect_target()
-        if int(target._arch) < 80:
-            _LOGGER.warning("Skip this test on SM75")
-            return
-        if type(target).__name__ == "FBCUDA":
-            _LOGGER.warning("Skip this test for special profiling requirement")
-            return
-        module = compile_model(Y, target, "./tmp", test_name)
-        X_pt = torch.randn(B, M, K).cuda().half()
-        W_pt = torch.randn(B, N, K).cuda().half()
+        x_pt = get_random_torch_tensor([B, M, K], dtype)
+        w_pt = get_random_torch_tensor([B, N, K], dtype)
+        wt_pt = torch.transpose(w_pt, 2, 1)
+        y_pt = torch.bmm(x_pt, wt_pt)
+        y_pt = torch.softmax(y_pt, dim=-1)
+
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
+
+        inputs = {"input_0": x_pt, "input_1": w_pt}
+        y = get_torch_empty_tensor([B, M, N], dtype)
+        module.run_with_tensors(inputs, [y])
+
+        torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+
+        torch.testing.assert_close(
+            torch.argmax(y, axis=2),
+            torch.argmax(y_pt, axis=2),
+            atol=1e-1,
+            rtol=1e-1,
+        )
+
+    def test_bmm_rcr_softmax_float16(self):
+        self._test_bmm_rcr_softmax(
+            B=16,
+            M=16,
+            K=64,
+            N=24,
+            dtype="float16",
+            test_name="bmm_rcr_softmax_fp16_1",
+        )
 
-        WT = torch.transpose(W_pt, 2, 1)
-        Y_pt = torch.bmm(X_pt, WT)
-        Y_pt = torch.softmax(Y_pt, dim=-1)
+    def test_bmm_rcr_softmax_float32_sm80(self):
+        self._test_bmm_rcr_softmax(
+            B=16,
+            M=16,
+            K=64,
+            N=24,
+            dtype="float32",
+            test_name="bmm_rcr_softmax_fp32_1",
+        )
 
-        y = torch.empty([B, M, N]).cuda().half()
-        module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
-        eps = 1e-1
-        self.assertTrue(torch.allclose(Y_pt, y, atol=eps, rtol=eps))
 
-    def test_bmm_softmax(self):
-        self._test_bmm_rcr_softmax()
+filter_test_cases_by_test_env(BMMSoftmaxTestCase)
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_softmax.py b/tests/unittest/ops/test_gemm_bias_softmax.py
index 0c9d7b811..1c34f297f 100644
--- a/tests/unittest/ops/test_gemm_bias_softmax.py
+++ b/tests/unittest/ops/test_gemm_bias_softmax.py
@@ -12,13 +12,10 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
-import os
 import unittest
 
-import numpy as np
 import torch
-from aitemplate.compiler import compile_model, Model, ops
+from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
@@ -28,19 +25,23 @@
 )
 
 
-_LOGGER = logging.getLogger(__name__)
-
-
-@unittest.skip("GEMM + Softmax is disabled for now")
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMBiasSoftmaxTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_gemm_rcr_bias_softmax(
-        self, M=16, K=64, N=24, rebuild=True, dtype="float16"
+        self,
+        M=16,
+        K=64,
+        N=24,
+        dtype="float16",
+        test_name="gemm_rcr_bias_softmax",
+        atol=1e-2,
+        rtol=1e-2,
+        assert_argmax=True,
     ):
-        target = detect_target()
-        if type(target).__name__ == "FBCUDA":
-            _LOGGER.warning("Skip this test for special profiling requirement")
-            return
-
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -49,36 +50,67 @@ def _test_gemm_rcr_bias_softmax(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        X_pt = get_random_torch_tensor([M, K], dtype)
-        W_pt = get_random_torch_tensor([N, K], dtype)
-        B_pt = get_random_torch_tensor([N], dtype)
-        Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
-        Y_pt = torch.softmax(Y_pt, dim=1)
-        Y_np = Y_pt.cpu().numpy()
-
-        test_name = f"gemm_bias_softmax_{dtype}"
-        if rebuild:
-            target = detect_target()
-            module = compile_model(Y, target, "./tmp", test_name)
-        else:
-            module = Model(os.path.join("./tmp", test_name, "test.so"))
-        inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
+        x_pt = get_random_torch_tensor([M, K], dtype)
+        w_pt = get_random_torch_tensor([N, K], dtype)
+        b_pt = get_random_torch_tensor([N], dtype)
+        y_pt = torch.nn.functional.linear(x_pt, w_pt, bias=b_pt)
+        y_pt = torch.softmax(y_pt, dim=1)
+
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
+
+        inputs = {"input_0": x_pt, "input_1": w_pt, "input_2": b_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-        np.testing.assert_allclose(
-            np.argmax(Y_np, axis=1),
-            np.argmax(y.cpu().numpy(), axis=1),
-            atol=1e-1,
-            rtol=1e-1,
+        torch.testing.assert_close(y, y_pt, atol=atol, rtol=rtol)
+
+        if assert_argmax:
+            torch.testing.assert_close(
+                torch.argmax(y, axis=1),
+                torch.argmax(y_pt, axis=1),
+                atol=1e-1,
+                rtol=1e-1,
+            )
+
+    def test_gemm_rcr_bias_softmax_float16(self):
+        self._test_gemm_rcr_bias_softmax(
+            M=16,
+            K=64,
+            N=24,
+            dtype="float16",
+            test_name="gemm_rcr_bias_softmax_fp16_1",
         )
 
-    def test_gemm_bias_softmax_float16(self):
-        self._test_gemm_rcr_bias_softmax(N=81, dtype="float16")
+        if not detect_target().use_dummy_profiling_results():
+            # dummy workspace size (10240 bytes) is insufficient for
+            # these tests: run them only locally where profiler is
+            # executed and detects the necessary workspace size
+            self._test_gemm_rcr_bias_softmax(
+                M=1024,
+                K=512,
+                N=4096,
+                dtype="float16",
+                test_name="gemm_rcr_bias_softmax_fp16_2",
+            )
+            self._test_gemm_rcr_bias_softmax(
+                M=2048,
+                K=1024,
+                N=4096,
+                dtype="float16",
+                test_name="gemm_rcr_bias_softmax_fp16_3",
+                atol=3e-2,
+                rtol=3e-2,
+                assert_argmax=False,
+            )
 
-    def test_gemm_bias_softmax_float32_sm80(self):
-        self._test_gemm_rcr_bias_softmax(N=81, dtype="float16")
+    def test_gemm_rcr_bias_softmax_float32_sm80(self):
+        self._test_gemm_rcr_bias_softmax(
+            M=16,
+            K=64,
+            N=24,
+            dtype="float32",
+            test_name="gemm_rcr_bias_softmax_fp32_1",
+        )
 
 
 filter_test_cases_by_test_env(GEMMBiasSoftmaxTestCase)
diff --git a/tests/unittest/ops/test_gemm_softmax.py b/tests/unittest/ops/test_gemm_softmax.py
index f189a39e6..d9f28f1f9 100644
--- a/tests/unittest/ops/test_gemm_softmax.py
+++ b/tests/unittest/ops/test_gemm_softmax.py
@@ -12,33 +12,33 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
-import os
 import unittest
 
-import numpy as np
 import torch
-from aitemplate.compiler import compile_model, Model, ops
+from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
 
 
-_LOGGER = logging.getLogger(__name__)
-
-
-# @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-@unittest.skip("GEMM + Softmax is disabled for now")
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class GEMMSoftmaxTestCase(unittest.TestCase):
-    def _test_gemm_rcr_softmax(self, M=16, K=64, N=24, rebuild=True, dtype="float16"):
-        target = detect_target()
-        if type(target).__name__ == "FBCUDA":
-            _LOGGER.warning("Skip this test for special profiling requirement")
-            return
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
 
+    def _test_gemm_rcr_softmax(
+        self,
+        M=16,
+        K=64,
+        N=24,
+        dtype="float16",
+        test_name="gemm_rcr_softmax",
+    ):
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         OP = ops.gemm_rcr_softmax()
@@ -46,35 +46,65 @@ def _test_gemm_rcr_softmax(self, M=16, K=64, N=24, rebuild=True, dtype="float16"
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        X_pt = get_random_torch_tensor([M, K], dtype)
-        W_pt = get_random_torch_tensor([N, K], dtype)
-        Y_pt = torch.nn.functional.linear(X_pt, W_pt)
-        Y_pt = torch.softmax(Y_pt, dim=1)
-        Y_np = Y_pt.cpu().numpy()
-
-        test_name = f"gemm_softmax_{dtype}"
-        if rebuild:
-            target = detect_target()
-            module = compile_model(Y, target, "./tmp", test_name)
-        else:
-            module = Model(os.path.join("./tmp", test_name, "test.so"))
-        inputs = {"input_0": X_pt, "input_1": W_pt}
+        x_pt = get_random_torch_tensor([M, K], dtype)
+        w_pt = get_random_torch_tensor([N, K], dtype)
+        y_pt = torch.nn.functional.linear(x_pt, w_pt)
+        y_pt = torch.softmax(y_pt, dim=1)
+
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
+
+        inputs = {"input_0": x_pt, "input_1": w_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        y_ait_np = y.cpu().numpy()
-        np.testing.assert_allclose(Y_np, y_ait_np, atol=1e-1, rtol=1e-1)
-        np.testing.assert_allclose(
-            np.argmax(Y_np, axis=1),
-            np.argmax(y_ait_np, axis=1),
+
+        torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+
+        torch.testing.assert_close(
+            torch.argmax(y, axis=1),
+            torch.argmax(y_pt, axis=1),
             atol=1e-1,
             rtol=1e-1,
         )
 
-    def test_gemm_softmax(self):
-        self._test_gemm_rcr_softmax()
+    def test_gemm_rcr_softmax_float16(self):
+        self._test_gemm_rcr_softmax(
+            M=16,
+            K=64,
+            N=24,
+            dtype="float16",
+            test_name="gemm_rcr_softmax_fp16_1",
+        )
+
+        if not detect_target().use_dummy_profiling_results():
+            # dummy workspace size (10240 bytes) is insufficient for
+            # these tests: run them only locally where profiler is
+            # executed and detects the necessary workspace size
+            self._test_gemm_rcr_softmax(
+                M=1024,
+                K=512,
+                N=4096,
+                dtype="float16",
+                test_name="gemm_rcr_softmax_fp16_2",
+            )
+            self._test_gemm_rcr_softmax(
+                M=2048,
+                K=1024,
+                N=4096,
+                dtype="float16",
+                test_name="gemm_rcr_softmax_fp16_3",
+            )
+
+    def test_gemm_rcr_softmax_float32_sm80(self):
+        self._test_gemm_rcr_softmax(
+            M=16,
+            K=64,
+            N=24,
+            dtype="float32",
+            test_name="gemm_rcr_softmax_fp32_1",
+        )
+
 
-    def test_gemm_softmax_float(self):
-        self._test_gemm_rcr_softmax(dtype="float")
+filter_test_cases_by_test_env(GEMMSoftmaxTestCase)
 
 
 if __name__ == "__main__":

From ac7ce281c67a45a954d866b824a9c7d9ca7c3ce2 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 27 Mar 2023 14:25:56 -0700
Subject: [PATCH 343/638] Reorganize skipped tests (#501)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/501

A few tests are currently being skipped (with `unittest.skip`) which causes issues in the internal CI. In this diff, those tests are reorganized to avoid skipping them. More specifically:

- `test_bmm_softmax_bmm`: supported on ROCM only: split with `filter_test_cases_by_test_env`.
- `test_split_bmm_softmax_bmm`: supported on ROCM only: split with `filter_test_cases_by_test_env`.
- `test_conv3d`: fp32 op instances aren't generated by CUTLASS: commented out.
- `test_perm021fc_ccr_bias_perm021`: necessary CUTLAS permute layout not available: commented out.
- `test_transform_special_op`: ROCM-based tests have issue in CK; commented out.
- `test_efficient_nms`: benchmark function is supposed to be enabled manually; disabled with `_` prefix.

Reviewed By: chenyang78

Differential Revision: D44416401

fbshipit-source-id: 2fe8a4fe65edc5998a276e2c75cd56fd456b3bbb
---
 .../compiler/test_split_bmm_softmax_bmm.py    |  7 +-
 .../compiler/test_transform_special_op.py     | 68 +++++++++---------
 tests/unittest/ops/test_bmm_softmax_bmm.py    |  7 +-
 tests/unittest/ops/test_conv3d.py             | 70 ++++++++++---------
 tests/unittest/ops/test_efficient_nms.py      | 62 ++++++++--------
 .../ops/test_perm021fc_ccr_bias_perm021.py    | 60 ++++++++--------
 6 files changed, 144 insertions(+), 130 deletions(-)

diff --git a/tests/unittest/compiler/test_split_bmm_softmax_bmm.py b/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
index 2e3d846de..43017853c 100644
--- a/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
+++ b/tests/unittest/compiler/test_split_bmm_softmax_bmm.py
@@ -24,13 +24,13 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
 from aitemplate.utils import graph_utils, shape_utils
 
 
-@unittest.skipIf(detect_target().name() == "cuda", "Only supported by ROCM.")
 class SplitBMMTestCase(unittest.TestCase):
     def _test_split_reshape_bmm_permute(
         self, bs, nheads, seq_len, hidden_size, test_name, dtype="float16"
@@ -82,7 +82,7 @@ def _test_split_reshape_bmm_permute(
             module.run_with_tensors([x_pt], [y])
             self.assertTrue(torch.allclose(y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_split_reshape_bmm_permute(self):
+    def test_split_reshape_bmm_permute_rocm(self):
         self._test_split_reshape_bmm_permute(
             bs=[1], nheads=12, seq_len=256, hidden_size=768, test_name="static"
         )
@@ -91,6 +91,9 @@ def test_split_reshape_bmm_permute(self):
         )
 
 
+filter_test_cases_by_test_env(SplitBMMTestCase)
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/compiler/test_transform_special_op.py b/tests/unittest/compiler/test_transform_special_op.py
index 90528ce11..34137ea35 100644
--- a/tests/unittest/compiler/test_transform_special_op.py
+++ b/tests/unittest/compiler/test_transform_special_op.py
@@ -266,7 +266,6 @@ def test_n_non1_fail(self, dtype):
         self.assertEqual(src_op._attrs["op"], "bmm_rcr")
 
 
-@unittest.skip("enable it when ck fix")
 class OneByOneConvTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -376,38 +375,41 @@ def _test_simple_1x1_conv(
                     Y_pt, Y_ait.permute(0, 3, 1, 2), atol=1e-1, rtol=1e-1
                 )
 
-    def test_1x1_conv_no_bias(self):
-        self._test_simple_1x1_conv(batch=1, CO=256, HH=3, WW=4, CI=2)
-        self._test_simple_1x1_conv(
-            batch=3, CO=100, HH=200, WW=4, CI=2, activation="relu"
-        )
-        self._test_simple_1x1_conv(
-            batch=2, CO=128, HH=10, WW=42, CI=3, activation="sigmoid"
-        )
-        self._test_simple_1x1_conv(batch=5, CO=256, HH=15, WW=5, CI=13)
-        self._test_simple_1x1_conv(batch=(1, 10), CO=128, HH=2, WW=2, CI=10)
-
-    def test_1x1_conv_with_bias(self):
-        self._test_simple_1x1_conv(batch=1, CO=256, HH=3, WW=4, CI=2, with_bias=True)
-        self._test_simple_1x1_conv(
-            batch=3,
-            CO=100,
-            HH=200,
-            WW=4,
-            CI=2,
-            activation="relu",
-            with_bias=True,
-        )
-        self._test_simple_1x1_conv(
-            batch=2, CO=128, HH=10, WW=42, CI=3, activation="sigmoid", with_bias=True
-        )
-        self._test_simple_1x1_conv(
-            batch=2, CO=64, HH=10, WW=42, CI=3, activation="hardswish", with_bias=True
-        )
-        self._test_simple_1x1_conv(batch=5, CO=256, HH=15, WW=5, CI=13, with_bias=True)
-        self._test_simple_1x1_conv(
-            batch=(1, 10), CO=128, HH=2, WW=2, CI=10, with_bias=True
-        )
+    # !!! SKIPPED TESTS BELOW !!!
+    # TODO: enable the tests when ck is fixed
+
+    # def test_1x1_conv_no_bias(self):
+    #     self._test_simple_1x1_conv(batch=1, CO=256, HH=3, WW=4, CI=2)
+    #     self._test_simple_1x1_conv(
+    #         batch=3, CO=100, HH=200, WW=4, CI=2, activation="relu"
+    #     )
+    #     self._test_simple_1x1_conv(
+    #         batch=2, CO=128, HH=10, WW=42, CI=3, activation="sigmoid"
+    #     )
+    #     self._test_simple_1x1_conv(batch=5, CO=256, HH=15, WW=5, CI=13)
+    #     self._test_simple_1x1_conv(batch=(1, 10), CO=128, HH=2, WW=2, CI=10)
+
+    # def test_1x1_conv_with_bias(self):
+    #     self._test_simple_1x1_conv(batch=1, CO=256, HH=3, WW=4, CI=2, with_bias=True)
+    #     self._test_simple_1x1_conv(
+    #         batch=3,
+    #         CO=100,
+    #         HH=200,
+    #         WW=4,
+    #         CI=2,
+    #         activation="relu",
+    #         with_bias=True,
+    #     )
+    #     self._test_simple_1x1_conv(
+    #         batch=2, CO=128, HH=10, WW=42, CI=3, activation="sigmoid", with_bias=True
+    #     )
+    #     self._test_simple_1x1_conv(
+    #         batch=2, CO=64, HH=10, WW=42, CI=3, activation="hardswish", with_bias=True
+    #     )
+    #     self._test_simple_1x1_conv(batch=5, CO=256, HH=15, WW=5, CI=13, with_bias=True)
+    #     self._test_simple_1x1_conv(
+    #         batch=(1, 10), CO=128, HH=2, WW=2, CI=10, with_bias=True
+    #     )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_bmm_softmax_bmm.py b/tests/unittest/ops/test_bmm_softmax_bmm.py
index 7afad1c85..d0693eda6 100644
--- a/tests/unittest/ops/test_bmm_softmax_bmm.py
+++ b/tests/unittest/ops/test_bmm_softmax_bmm.py
@@ -24,6 +24,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import filter_test_cases_by_test_env
 from aitemplate.utils import shape_utils
 
 
@@ -37,7 +38,6 @@ def build_causal_attention_mask(bsz, seq_len, dtype):
     return mask
 
 
-@unittest.skipIf(detect_target().name() == "cuda", "Only supported by ROCM.")
 class BMMSoftmaxBMMTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(BMMSoftmaxBMMTestCase, self).__init__(*args, **kwargs)
@@ -172,7 +172,7 @@ def _test_b2b(
             else:
                 self.assertTrue(torch.allclose(Y2_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_rcr(self):
+    def test_rcr_rocm(self):
         # FIXME: re-enable it after we fix the missing parameter for bmm_softmax_bmm
         # self._test_b2b([16], [576], N=576, K=64, D=64, test_name="static")
         self._test_bmm_permute([24], [256], N=256, K=64, D=64, test_name="static")
@@ -206,5 +206,8 @@ def test_rcr(self):
         )
 
 
+filter_test_cases_by_test_env(BMMSoftmaxBMMTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_conv3d.py b/tests/unittest/ops/test_conv3d.py
index 974428de9..42e8e05a9 100644
--- a/tests/unittest/ops/test_conv3d.py
+++ b/tests/unittest/ops/test_conv3d.py
@@ -22,7 +22,11 @@
 from aitemplate.testing.test_utils import get_random_torch_tensor
 
 
-@unittest.skipIf(detect_target()._arch == "75", "Conv3d not supported on sm75.")
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
 class Conv3dTestCase(unittest.TestCase):
     def test_conv3d_bias_padding(
         self,
@@ -248,39 +252,37 @@ def test_fp16(self):
             dtype="float16",
         )
 
-    @unittest.skip("no fp32 kernels are available for conv3d")
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    @unittest.skipIf(
-        detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-        "Not supported by CUDA < SM80.",
-    )
-    def test_fp32(self):
-        self._test_conv3d(
-            4,
-            224,
-            224,
-            8,
-            96,
-            3,
-            5,
-            5,
-            stride=(2, 4, 4),
-            pad=(1, 2, 2),
-            test_name="conv3d_fp32_1",
-            dtype="float32",
-        )
-        self._test_conv3d(
-            56,
-            56,
-            56,
-            64,
-            256,
-            1,
-            1,
-            1,
-            test_name="conv3d_fp32_2",
-            dtype="float32",
-        )
+    # !!! SKIPPED TESTS BELOW !!!
+    # CUTLASS generator doesn't provide conv3d ops for fp32
+    # TODO: enable the tests after the issue is resolved
+
+    # def test_fp32(self):
+    #     self._test_conv3d(
+    #         4,
+    #         224,
+    #         224,
+    #         8,
+    #         96,
+    #         3,
+    #         5,
+    #         5,
+    #         stride=(2, 4, 4),
+    #         pad=(1, 2, 2),
+    #         test_name="conv3d_fp32_1",
+    #         dtype="float32",
+    #     )
+    #     self._test_conv3d(
+    #         56,
+    #         56,
+    #         56,
+    #         64,
+    #         256,
+    #         1,
+    #         1,
+    #         1,
+    #         test_name="conv3d_fp32_2",
+    #         dtype="float32",
+    #     )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_efficient_nms.py b/tests/unittest/ops/test_efficient_nms.py
index ce2962dd5..7be2609dd 100644
--- a/tests/unittest/ops/test_efficient_nms.py
+++ b/tests/unittest/ops/test_efficient_nms.py
@@ -322,36 +322,38 @@ def test_nms_fp32(self):
             dtype="float32",
         )
 
-    @unittest.skip("manually enable it for benchmarking")
-    def test_nms_benchmark_shapes(self):
-        self._test_nms(
-            N=3350,
-            preNmsTop=2000,
-            nmsMaxOut=100,
-            iouThreshold=0.5,
-            minBoxSize=0,
-            batch_size=16,
-            num_classes=1,
-            rand_box=True,
-            test_name="nms_fcos_shape",
-            benchmark_shapes=True,
-        )
-
-        for bz in (1, 4, 16):
-            for N in (6000, 12000, 20000, 60000):
-                for maxout in (100, 300, 1000):
-                    self._test_nms(
-                        N=N,
-                        preNmsTop=6000,
-                        nmsMaxOut=maxout,
-                        iouThreshold=0.5,
-                        minBoxSize=0,
-                        batch_size=bz,
-                        num_classes=1,
-                        rand_box=True,
-                        test_name="nms_" + str(bz) + "_" + str(N) + "_" + str(maxout),
-                        benchmark_shapes=True,
-                    )
+    # !!! SKIPPED TESTS BELOW !!!
+    # manually enable for benchmarking
+
+    # def test_nms_benchmark_shapes(self):
+    #     self._test_nms(
+    #         N=3350,
+    #         preNmsTop=2000,
+    #         nmsMaxOut=100,
+    #         iouThreshold=0.5,
+    #         minBoxSize=0,
+    #         batch_size=16,
+    #         num_classes=1,
+    #         rand_box=True,
+    #         test_name="nms_fcos_shape",
+    #         benchmark_shapes=True,
+    #     )
+
+    #     for bz in (1, 4, 16):
+    #         for N in (6000, 12000, 20000, 60000):
+    #             for maxout in (100, 300, 1000):
+    #                 self._test_nms(
+    #                     N=N,
+    #                     preNmsTop=6000,
+    #                     nmsMaxOut=maxout,
+    #                     iouThreshold=0.5,
+    #                     minBoxSize=0,
+    #                     batch_size=bz,
+    #                     num_classes=1,
+    #                     rand_box=True,
+    #                     test_name="nms_" + str(bz) + "_" + str(N) + "_" + str(maxout),
+    #                     benchmark_shapes=True,
+    #                 )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
index 3f11392b9..5ba303241 100644
--- a/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
+++ b/tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py
@@ -25,12 +25,18 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_test_env,
+    get_random_torch_tensor,
+)
 
 
-@unittest.skip("Re-enable after cutlass fix")
-# @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm021FCCCRBiasPerm021TestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def _test_perm021fc_ccr_bias_perm021(
         self,
         test_name="perm021fc_ccr_bias_perm021",
@@ -83,35 +89,31 @@ def _test_perm021fc_ccr_bias_perm021(
 
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
-    def test_perm021fc_ccr_bias_perm021_fp16(self):
-        self._test_perm021fc_ccr_bias_perm021(
-            test_name="perm021fc_ccr_bias_perm021_fp16",
-            dtype="float16",
-        )
+    # !!! SKIPPED TESTS BELOW !!!
+    # Permute3DBMM_021 layout not currently present in CUTLASS
+    # TODO: enable the tests after this layout becomes available
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"fp32 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_ccr_bias_perm021_fp32(self):
-        self._test_perm021fc_ccr_bias_perm021(
-            test_name="perm021fc_ccr_bias_perm021_fp32",
-            dtype="float32",
-        )
+    # def test_perm021fc_ccr_bias_perm021_fp16(self):
+    #     self._test_perm021fc_ccr_bias_perm021(
+    #         test_name="perm021fc_ccr_bias_perm021_fp16",
+    #         dtype="float16",
+    #     )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    @unittest.skipIf(
-        int(detect_target()._arch) < 80,
-        f"bf16 BMM not supported in {detect_target()._arch}",
-    )
-    def test_perm021fc_ccr_bias_perm021_bf16(self):
-        self._test_perm021fc_ccr_bias_perm021(
-            test_name="perm021fc_ccr_bias_perm021_bf16",
-            dtype="bfloat16",
-        )
+    # def test_perm021fc_ccr_bias_perm021_fp32_sm80(self):
+    #     self._test_perm021fc_ccr_bias_perm021(
+    #         test_name="perm021fc_ccr_bias_perm021_fp32",
+    #         dtype="float32",
+    #     )
+
+    # def test_perm021fc_ccr_bias_perm021_bf16_sm80(self):
+    #     self._test_perm021fc_ccr_bias_perm021(
+    #         test_name="perm021fc_ccr_bias_perm021_bf16",
+    #         dtype="bfloat16",
+    #     )
+
+
+filter_test_cases_by_test_env(Perm021FCCCRBiasPerm021TestCase)
 
 
 if __name__ == "__main__":
-    torch.manual_seed(0)
     unittest.main()

From 846b41d11005e0c8c70b35c72aa13a262f7bd3ce Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Mon, 27 Mar 2023 19:19:33 -0700
Subject: [PATCH 344/638] full: pass input dtype when dtype is None (#504)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/504

When the `fx2ait` converters for the `full` op pass `kwargs["dtype"]` to the op, apparently, it can be `None`. This breaks the downstream graph.

This diff adds a check to the converter and a validation to the `full` op's front-end to prevent this from happening.

Reviewed By: amateurcoffee

Differential Revision: D44433380

fbshipit-source-id: bc59ca7eabd796c985f7af59e8bd25f6349ef038
---
 fx2ait/fx2ait/converters/ait_converters.py    | 22 ++++++++++++++-----
 python/aitemplate/compiler/ops/tensor/full.py |  4 ++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index c01613ef4..73bf06851 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1623,11 +1623,11 @@ def acc_ops_tile(
     input_dim_len = len(input_val.shape())
     result = input_val
     if len(shape_dims) < input_dim_len:
-        for i in range(input_dim_len - len(shape_dims)):
+        for _ in range(input_dim_len - len(shape_dims)):
             shape_dims.insert(0, 1)
     if input_dim_len < len(shape_dims):
         shape = input_val.shape()
-        for i in range(len(shape_dims) - input_dim_len):
+        for _ in range(len(shape_dims) - input_dim_len):
             shape.insert(0, IntImm(1))
         result = expand()(input_val, shape)
 
@@ -1674,7 +1674,11 @@ def acc_ops_new_full(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
-    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
+    dtype = (
+        kwargs["dtype"]
+        if "dtype" in kwargs and kwargs["dtype"] is not None
+        else input_val.dtype()
+    )
     fill_value = kwargs["fill_value"]
     return full()(size, fill_value=fill_value, dtype=dtype)
 
@@ -1698,7 +1702,11 @@ def acc_ops_new_ones(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
-    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
+    dtype = (
+        kwargs["dtype"]
+        if "dtype" in kwargs and kwargs["dtype"] is not None
+        else input_val.dtype()
+    )
     return full()(size, 1, dtype=dtype)
 
 
@@ -1720,7 +1728,11 @@ def acc_ops_new_zeros(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     size = kwargs["size"]
-    dtype = kwargs["dtype"] if "dtype" in kwargs else input_val.dtype()
+    dtype = (
+        kwargs["dtype"]
+        if "dtype" in kwargs and kwargs["dtype"] is not None
+        else input_val.dtype()
+    )
     return full()(size, 0, dtype=dtype)
 
 
diff --git a/python/aitemplate/compiler/ops/tensor/full.py b/python/aitemplate/compiler/ops/tensor/full.py
index 774fa054a..2f2ae515b 100644
--- a/python/aitemplate/compiler/ops/tensor/full.py
+++ b/python/aitemplate/compiler/ops/tensor/full.py
@@ -18,6 +18,7 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.dtype import get_dtype_size
 
 
 class full(Operator):
@@ -56,6 +57,9 @@ def __call__(
             raise TypeError(f"fill_value must be a scalar, but got {fill_value}.")
         fill_value = float(fill_value)
 
+        # validation inside
+        get_dtype_size(dtype)
+
         self._attrs["inputs"] = []
         self._attrs["fill_value"] = fill_value
 

From 9bd086cd80902a42fa1bfad740ca8c578fcc3277 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Mon, 27 Mar 2023 23:56:18 -0700
Subject: [PATCH 345/638] Remove links to youtube debug background music (#496)

Summary:
Found interesting links in gemm profiling templates: https://www.youtube.com/watch?v=rRwxfYlgG-M

Let's add the description - Namewee - China REGGAETON

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/496

Reviewed By: frank-wei

Differential Revision: D44395422

Pulled By: chenyang78

fbshipit-source-id: 5dfd7516f599573e505f24a57c1fa2ad0be56ebb
---
 .../backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py        | 1 -
 python/aitemplate/backend/cuda/gemm_universal/common.py          | 1 -
 python/aitemplate/backend/cuda/gemm_universal/group_common.py    | 1 -
 3 files changed, 3 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 1cbb62c5f..c5d128967 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -100,7 +100,6 @@
 
 {{indent}}};
 {% if is_profiler %}
-{{indent}}// https://youtu.be/-Rp7UPbhErE
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 8f24d2a7d..a20c54bda 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -222,7 +222,6 @@
 
 {{indent}}};
 {% if is_profiler %}
-{{indent}}// https://www.youtube.com/watch?v=rRwxfYlgG-M
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index f04abba51..65da16a52 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -297,7 +297,6 @@
 
 {{indent}}};
 {% if is_profiler %}
-{{indent}}// Debug BGM: https://www.youtube.com/watch?v=rRwxfYlgG-M
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
 {{indent}}workspace = local_workspace.get();

From 5509cd1dec154c85d8a0871b6d4d1f91967c34ea Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Mar 2023 02:31:27 -0700
Subject: [PATCH 346/638] Visualize more Operators attrs (#493)

Summary:
Visualize more Operators attrs

Screenshot: https://ibb.co/VBsDfnb

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/493

Reviewed By: ipiszy

Differential Revision: D44432329

Pulled By: aakhundov

fbshipit-source-id: fd6a3ca32fb587349c3b6abbf7ba1c98d8405735
---
 .../utils/visualization/op_attr_factory.py      | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/utils/visualization/op_attr_factory.py b/python/aitemplate/utils/visualization/op_attr_factory.py
index 53505259c..c0d91a3c8 100644
--- a/python/aitemplate/utils/visualization/op_attr_factory.py
+++ b/python/aitemplate/utils/visualization/op_attr_factory.py
@@ -13,11 +13,26 @@
 #  limitations under the License.
 #
 
+KEYS = [
+    "op",
+    "depth",
+    "nop",
+    "has_profiler",
+    "epilogue",
+    "epilogue_alignment",
+    "split_k",
+    "permute_shape",
+]
+
 
 def op_to_content(op):
     # TODO (XXX): Add op specialized attrs here, like gemm/conv
     content = {}
-    content["op_type"] = op._attrs["op"]
+    for k in KEYS:
+        v = op._attrs.get(k)
+        if v is not None and v != "":
+            content[k] = v
+
     if op._attrs["op"] == "fused_elementwise":
         content["func"] = ", ".join(
             [str(x._attrs["func"]) for x in op._attrs["elementwise_ops"]]

From cdefe2588d8538b47c9d9c013193b390966648fc Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 28 Mar 2023 07:12:12 -0700
Subject: [PATCH 347/638] Fix warnings from MSVC compiler (#499)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/499

Reviewed By: wushirong

Differential Revision: D44404746

fbshipit-source-id: 87a0cb7ec0eb7884ed626e4f2e8996880a32df4f
---
 static/csrc/debug_utility.cpp    | 2 +-
 static/include/model_interface.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/static/csrc/debug_utility.cpp b/static/csrc/debug_utility.cpp
index 3ebaf1879..3d9f3b60d 100644
--- a/static/csrc/debug_utility.cpp
+++ b/static/csrc/debug_utility.cpp
@@ -35,7 +35,7 @@ __global__ void inf_and_nan_checker(const half* tensor, int64_t elem_cnt) {
   }
   if (nan_num > 0 || pos_inf > 0 || neg_inf > 0) {
     printf(
-        "contains NaN: %ld, +INF: %ld, -INF: %ld, total elements: %ld\n",
+        "contains NaN: %lld, +INF: %lld, -INF: %lld, total elements: %lld\n",
         nan_num,
         pos_inf,
         neg_inf,
diff --git a/static/include/model_interface.h b/static/include/model_interface.h
index 1415c5f2f..6ef12a2da 100644
--- a/static/include/model_interface.h
+++ b/static/include/model_interface.h
@@ -60,7 +60,7 @@ struct AITemplateParamShape {
 
   size_t Numel() const {
     return std::accumulate(
-        shape_data, shape_data + size, 1, std::multiplies<int64_t>());
+        shape_data, shape_data + size, (int64_t)1, std::multiplies<int64_t>());
   }
 };
 
@@ -104,6 +104,7 @@ inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
     case AITemplateDtype::kUnset:
       throw std::runtime_error("Unset dtype has no size!");
   }
+  throw std::runtime_error("dtype handling is not implemented!");
 }
 
 struct AITemplateStreamOpaque {};

From d2ed4dceebaaf55313c3aa034bc092584b2ed039 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Mar 2023 12:15:05 -0700
Subject: [PATCH 348/638] Format json arrays in vis html (#506)

Summary:
Problem:
When we visualize Large models the html files will have very long lines for particular data arrays.
For example, [test model](https://apivovarov-tmp.s3.us-west-2.amazonaws.com/www/toposort_graph_vis_1000_new.html) with 1000 Linear operators has two long lines. The biggest one is 254,594 chars!

It does not work well with text editors or other html tools.

We can apply pretty formatting to large json arrays in html file. In that case the dumped arrays will grow vertically but not horizontally in html.

I identified two arrays with this problem - `popover_data` and `items`.
This PR uses `indent=2` in `json.dumps()` to format the array.

The result of the arrays formatting in html:

```
    var popover_data = {
  "X": "shape: [(1, 8), 3]",
  "gemm_rcr_bias_0": "op: gemm_rcr_bias, depth: 0, nop: False, has_profiler: True, epilogue: LinearCombination, epilogue_alignment: 1, split_k: 1",
  "tensor_0": "shape: [3, 3]",
  "tensor_1": "shape: [3]",
  "gemm_rcr_bias_0_0": "shape: [(1, 8), 3]",
  "gemm_rcr_bias_1": "op: gemm_rcr_bias, depth: 1, nop: False, has_profiler: True, epilogue: LinearCombination, epilogue_alignment: 1, split_k: 1",
  "tensor_2": "shape: [3, 3]",
  "tensor_3": "shape: [3]",
...
  "gemm_rcr_bias_99": "op: gemm_rcr_bias, depth: 99, nop: False, has_profiler: True, epilogue: LinearCombination, epilogue_alignment: 1, split_k: 1",
  "tensor_198": "shape: [3, 3]",
  "tensor_199": "shape: [3]",
  "Y": "shape: [(1, 8), 3]"
};
```
```
  items = [
  "X",
  "gemm_rcr_bias_0",
  "tensor_0",
  "tensor_1",
  "gemm_rcr_bias_0_0",
  "gemm_rcr_bias_1",
  "tensor_2",
  "tensor_3",
  "gemm_rcr_bias_1_0",
  "gemm_rcr_bias_2",
  "tensor_4",
  "tensor_5",
...
  "tensor_198",
  "tensor_199",
  "Y"
];
```

[Test report](https://apivovarov-tmp.s3.us-west-2.amazonaws.com/www/toposort_graph_vis_100_new_formatted.html) with arrays formatting

`items` array - at line 18,860
`popover_data` array - at line 20,201

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/506

Reviewed By: alexanderguzhva

Differential Revision: D44454768

Pulled By: aakhundov

fbshipit-source-id: 5f33e23f1e3db3682b63773a924e5994100e67aa
---
 python/aitemplate/utils/visualization/plot.py         | 4 ++--
 python/aitemplate/utils/visualization/web_template.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/utils/visualization/plot.py b/python/aitemplate/utils/visualization/plot.py
index 757466598..d0c21999a 100644
--- a/python/aitemplate/utils/visualization/plot.py
+++ b/python/aitemplate/utils/visualization/plot.py
@@ -300,8 +300,8 @@ def plot_graph(
         basename = os.path.splitext(os.path.basename(file_path))[0]
         dot_src = dot_graph.to_string()
         modal_src = "\n".join(modal_set)
-        items_src = [f'"{item}"' for item in items]
-        popover_src = json.dumps(popover_data)
+        items_src = json.dumps(items, indent=2)
+        popover_src = json.dumps(popover_data, indent=2)
         index = INDEX_TEMPLATE.render(
             dot_src=dot_src,
             modals=modal_src,
diff --git a/python/aitemplate/utils/visualization/web_template.py b/python/aitemplate/utils/visualization/web_template.py
index 488e14d1f..fe98c074f 100644
--- a/python/aitemplate/utils/visualization/web_template.py
+++ b/python/aitemplate/utils/visualization/web_template.py
@@ -135,7 +135,7 @@
   
 
   <script>
-  items = [{{items|join(", ")}}];
+  items = {{items}};
   function autocomplete(inp, arr) {
   /*the autocomplete function takes two arguments,
   the text field element and an array of possible autocompleted values:*/

From 439fd0b1e4dd6a85aa4906aab0c4f30f8b5645e3 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Mar 2023 12:31:47 -0700
Subject: [PATCH 349/638] Fix OrderedDict comprehensions usage (#508)

Summary:
1. When we create `OrderedDict` from another collection or iterable there is not need to wrap it with `list` or `dict`. Wrapping with `dict` is especially strange because it could change the items order before python 3.6.

More info on it - [OrderedDict comprehensions](https://stackoverflow.com/questions/21103732/ordereddict-comprehensions)

2. Another improvement is to import `OrderedDict` from `collections` instead of from `typing`. It is recommended to import `OrderedDict` from `collections` rather than from other places.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/508

Reviewed By: alexanderguzhva

Differential Revision: D44457567

Pulled By: aakhundov

fbshipit-source-id: 089df36a79fd343e64f9026661bacbfc8d820f9e
---
 fx2ait/fx2ait/converters/ait_module_converters.py |  3 ++-
 .../backend/rocm/normalization/groupnorm.py       |  4 ++--
 .../backend/rocm/normalization/layernorm.py       |  4 ++--
 .../backend/rocm/normalization/norm_common.py     |  3 ++-
 .../compiler/ops/gemm_universal/gemm_common.py    |  8 +++-----
 .../compiler/ops/gemm_universal/group_gemm_rcr.py |  8 +++-----
 python/aitemplate/compiler/transform/profile.py   | 15 +++++++--------
 .../compiler/transform/profile_dynamic_dim.py     | 13 ++++++-------
 python/aitemplate/frontend/nn/container.py        |  4 ++--
 tests/unittest/frontend/test_module.py            | 10 +++++-----
 10 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index a869b18ae..be41bf4ba 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -12,7 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from typing import Any, Dict, OrderedDict, Tuple
+from collections import OrderedDict
+from typing import Any, Dict, Tuple
 
 import numpy as np
 
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index 978e3bef8..ab8dced5e 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -15,9 +15,9 @@
 """
 Groupnorm codegen for ROCM.
 """
-
+from collections import OrderedDict
 from hashlib import sha1
-from typing import Any, Dict, OrderedDict
+from typing import Any, Dict
 
 import jinja2
 
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index b559a621e..93d2216aa 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -15,9 +15,9 @@
 """
 Layernorm codegen for ROCM.
 """
-
+from collections import OrderedDict
 from hashlib import sha1
-from typing import Any, Dict, OrderedDict
+from typing import Any, Dict
 
 import jinja2
 
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index a21bdb96c..4f0da20e9 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -18,8 +18,9 @@
 
 import os
 import re
+from collections import OrderedDict
 from hashlib import sha1
-from typing import Any, Dict, OrderedDict
+from typing import Any, Dict
 
 import jinja2
 
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 0bd5e4639..a285ee431 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -495,11 +495,9 @@ def gen_profiler(
         filter_func = registry.get(func_key)
         # run compile-time filter
         new_op_instance = OrderedDict(
-            {
-                k: v
-                for k, v in self._attrs["op_instance"].items()
-                if filter_func(k, self._attrs, ab_alignments[0])
-            }
+            (k, v)
+            for k, v in self._attrs["op_instance"].items()
+            if filter_func(k, self._attrs, ab_alignments[0])
         )
         _LOGGER.debug(
             f"Filtered profiler kernels for {self._attrs['op']}: reduced the "
diff --git a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
index c755610b6..39e6bdfe1 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py
@@ -278,11 +278,9 @@ def gen_profiler(
         filter_func = registry.get(func_key)
         # run compile-time filter
         new_op_instance = OrderedDict(
-            {
-                k: v
-                for k, v in self._attrs["op_instance"].items()
-                if filter_func(k, self._attrs, ab_alignments[0])
-            }
+            (k, v)
+            for k, v in self._attrs["op_instance"].items()
+            if filter_func(k, self._attrs, ab_alignments[0])
         )
         _LOGGER.debug(
             f"Filtered profiler kernels for {self._attrs['op']}: reduced the "
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index c020004b7..4e908fee9 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -17,9 +17,10 @@
 """
 import logging
 import os
+from collections import OrderedDict
 from copy import deepcopy
 from datetime import datetime
-from typing import List, OrderedDict
+from typing import List
 
 from aitemplate.backend import builder, codegen
 
@@ -55,7 +56,6 @@ def profile(
     devices=None,
     dynamic_profiling_strategy=DynamicProfileStrategy.MAX,
 ):
-
     """Profiles kernels.
 
     Parameters
@@ -89,13 +89,12 @@ def profile(
     compile_engine.make_profilers(generated_profilers, profiler_dir)
     _LOGGER.info(f"compiled profilers elapsed time: {elapsed_dt_sec(start_t)}")
     funcs_to_profile = OrderedDict(
-        {
-            func._attrs["name"]: func
-            for node in sorted_graph
-            for func in node.src_ops()
-            if func._attrs["has_profiler"]
-        }
+        (func._attrs["name"], func)
+        for node in sorted_graph
+        for func in node.src_ops()
+        if func._attrs["has_profiler"]
     )
+
     start_t = datetime.now()
     gemms, non_gemms = _splitter(
         funcs_to_profile.values(), lambda f: isinstance(f, gemm)
diff --git a/python/aitemplate/compiler/transform/profile_dynamic_dim.py b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
index 493d2f54a..c398c5034 100644
--- a/python/aitemplate/compiler/transform/profile_dynamic_dim.py
+++ b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
@@ -16,8 +16,9 @@
 Graph pass to invoke profiling with dynamic shapes.
 """
 import logging
+from collections import OrderedDict
 from copy import deepcopy
-from typing import List, OrderedDict
+from typing import List
 
 from aitemplate.backend import builder, codegen
 from aitemplate.compiler.base import Tensor
@@ -35,12 +36,10 @@ def profile_dynamic_dim(sorted_graph: List[Tensor], workdir="./tmp"):
     compile_engine = builder.Builder()
     compile_engine.make_profilers(generated_profilers, workdir)
     funcs_to_profile = OrderedDict(
-        {
-            func._attrs["name"]: func
-            for node in sorted_graph
-            for func in node.src_ops()
-            if func._attrs["has_profiler"]
-        }
+        (func._attrs["name"], func)
+        for node in sorted_graph
+        for func in node.src_ops()
+        if func._attrs["has_profiler"]
     )
     for f in funcs_to_profile.values():
         f.profile_dynamic_dim(
diff --git a/python/aitemplate/frontend/nn/container.py b/python/aitemplate/frontend/nn/container.py
index 83769e80d..78d13be7f 100644
--- a/python/aitemplate/frontend/nn/container.py
+++ b/python/aitemplate/frontend/nn/container.py
@@ -131,7 +131,7 @@ def __delitem__(self, idx: Union[slice, int]) -> None:
             delattr(self, key)
         # To preserve numbering
         str_indices = [str(i) for i in range(len(self._modules))]
-        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))
+        self._modules = OrderedDict(zip(str_indices, self._modules.values()))
 
     def __len__(self) -> int:
         return len(self._modules)
@@ -309,7 +309,7 @@ def __delitem__(self, idx: Union[int, slice]) -> None:
             delattr(self, self._get_abs_string_index(idx))
         # To preserve numbering, self._modules is being reconstructed with modules after deletion
         str_indices = [str(i) for i in range(len(self._modules))]
-        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))
+        self._modules = OrderedDict(zip(str_indices, self._modules.values()))
 
     def __len__(self) -> int:
         return len(self._modules)
diff --git a/tests/unittest/frontend/test_module.py b/tests/unittest/frontend/test_module.py
index 7d0db2104..1c397eaf3 100644
--- a/tests/unittest/frontend/test_module.py
+++ b/tests/unittest/frontend/test_module.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 import unittest
-from typing import OrderedDict
+from collections import OrderedDict
 
 import torch
 import torch as pt
@@ -59,7 +59,7 @@ def forward(self, x):
         b = PTModule()
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_sequential_1(self):
@@ -100,7 +100,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_sequential_2(self):
@@ -153,7 +153,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
     def test_module_dict(self):
@@ -238,7 +238,7 @@ def forward(self, x):
         ait_param_names = [x[0] for x in a.named_parameters()]
         pt_param_names = [x[0] for x in b.named_parameters()]
 
-        for (x, y) in zip(ait_param_names, pt_param_names):
+        for x, y in zip(ait_param_names, pt_param_names):
             assert x == y
 
 
From b042f0405d44433c8d3017103e74b48c63b9d35a Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Mar 2023 12:44:23 -0700
Subject: [PATCH 350/638] Use .exe ext on Windows only (#509)

Summary:
`compile_model()` allows to build standalone model app.
```
debug_settings.gen_standalone = True
m = aitemplate.compiler.compile_model(...
    debug_settings=debug_settings,
)
```

Currently the generated app will have `.exe.` extension regardless of the OS.

Lets add `.exe.` extension on Windows platform only

This PR also adds `is_windows()` to `aitemplate.utils.misc`

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/509

Reviewed By: alexanderguzhva

Differential Revision: D44457551

Pulled By: aakhundov

fbshipit-source-id: 5a4068d9e5dcc192f63904b116ed2aa678b4319a
---
 python/aitemplate/backend/builder.py          | 6 ++++--
 python/aitemplate/utils/misc.py               | 4 ++++
 tests/unittest/backend/test_gen_standalone.py | 8 +++++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index ddc5da699..6ec0cf4a7 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -38,7 +38,7 @@
 
 from aitemplate.utils.debug_settings import AITDebugSettings
 
-from aitemplate.utils.misc import is_debug
+from aitemplate.utils.misc import is_debug, is_windows
 
 # pylint: disable=W0221,C0103
 
@@ -457,7 +457,9 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings)
         build_standalone_rules = ""
         if debug_settings.gen_standalone:
             build_exe_cmd = f"$(CC) $(CFLAGS) -o $@ {standalone_obj} {dll_name}"
-            exe_name = os.path.splitext(dll_name)[0] + ".exe"
+            exe_name = os.path.splitext(dll_name)[0]
+            if is_windows():
+                exe_name += ".exe"
             exe_target_deps = f"{dll_name} {standalone_obj}"
             build_standalone_rules = standalone_rules_template.render(
                 standalone_src=standalone_src,
diff --git a/python/aitemplate/utils/misc.py b/python/aitemplate/utils/misc.py
index fa578c501..e9429b7c1 100644
--- a/python/aitemplate/utils/misc.py
+++ b/python/aitemplate/utils/misc.py
@@ -25,6 +25,10 @@ def is_debug():
     return logger.level == logging.DEBUG
 
 
+def is_windows() -> bool:
+    return os.name == "nt"
+
+
 def setup_logger(name):
     root_logger = logging.getLogger(name)
     info_handle = logging.StreamHandler()
diff --git a/tests/unittest/backend/test_gen_standalone.py b/tests/unittest/backend/test_gen_standalone.py
index a322efa75..50221f51b 100644
--- a/tests/unittest/backend/test_gen_standalone.py
+++ b/tests/unittest/backend/test_gen_standalone.py
@@ -30,6 +30,7 @@
     get_torch_empty_tensor,
 )
 from aitemplate.utils.debug_settings import AITDebugSettings
+from aitemplate.utils.misc import is_windows
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -115,7 +116,7 @@ def _test_gen_standalone(self, test_name, dtype):
         )
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
-        # Now we run the generate executable
+        # Now we run the generated executable
         cwd = os.getcwd()
         workdir = os.path.join(cwd, "tmp", test_name)
         working_env = os.environ.copy()
@@ -126,8 +127,9 @@ def _test_gen_standalone(self, test_name, dtype):
         else:
             working_env["LD_LIBRARY_PATH"] = workdir
         _LOGGER.info(f"work dir: {workdir}")
+        exe_name = "./test.exe" if is_windows() else "./test"
         with subprocess.Popen(
-            ["./test.exe"],
+            [exe_name],
             shell=True,
             cwd=workdir,
             env=working_env,
@@ -147,7 +149,7 @@ def _test_gen_standalone(self, test_name, dtype):
                 if proc.returncode != 0:
                     _LOGGER.info(f"stdout:\n\n{stdout}")
                     _LOGGER.info(f"stderr:\n\n{stderr}")
-                    raise RuntimeError("failed to execute test.exe")
+                    raise RuntimeError(f"failed to execute {exe_name}")
                 else:
                     _LOGGER.info(f"stdout:\n\n{stdout}")
                     all_output_lines = stdout.split("\n")

From d54dddd1036c99555e7dfb48cacf2be4ed44ca57 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 28 Mar 2023 13:02:17 -0700
Subject: [PATCH 351/638] Skip bind_constants if constants dict is empty (#507)

Summary:
`constants` parameter is Optional in `compiler.compile_model()`.
We can skip iterating over the graph Tensors in `bind_constants()` transform pass if `constants` dict is empty.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/507

Reviewed By: alexanderguzhva

Differential Revision: D44454431

Pulled By: aakhundov

fbshipit-source-id: 1e986cffa7154b2ac38e891526f9a429facd1a28
---
 python/aitemplate/compiler/transform/bind_constants.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/transform/bind_constants.py b/python/aitemplate/compiler/transform/bind_constants.py
index 7ff6fe9c1..3f100ce6c 100644
--- a/python/aitemplate/compiler/transform/bind_constants.py
+++ b/python/aitemplate/compiler/transform/bind_constants.py
@@ -35,7 +35,8 @@ def bind_constants(graph: List[Tensor], constants: Dict[str, TorchTensor]) -> No
         Constants to bind
 
     """
-
+    if not constants:
+        return
     for tensor in graph:
         name = tensor._attrs["name"]
         if name not in constants:

From 3f1004b290622d3feb369fc2a9a9d339501fd5f2 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 28 Mar 2023 15:54:26 -0700
Subject: [PATCH 352/638] Add a pass that dedups all shape name that are
 identical. (#495)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/495

We add a graph pass that remove all unnecessary variables that are identical.

Reviewed By: terrychenism

Differential Revision: D44392335

fbshipit-source-id: 139ca3b554659aa2984d4e9bcbcd8ee5930e31e7
---
 python/aitemplate/compiler/compiler.py        |  5 +++
 .../aitemplate/compiler/transform/__init__.py |  2 +-
 .../compiler/transform/name_graph.py          | 32 +++++++++++++++++++
 tests/unittest/compiler/test_symbolic.py      | 16 +++++++++-
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 8d5ac2274..1e5fdcb43 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -232,6 +232,11 @@ def compile_model(
             compiler.transform.name_graph(graph)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "name_graph")
 
+            compiler.transform.dedup_symbolic_name(graph)
+            graph_utils.dump_graph_debug_str_to_file(
+                graph, test_dir, "dedup_symbolic_name"
+            )
+
             compiler.transform.mark_param_tensor(graph)
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "mark_param_tensor"
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index 32cf17cdd..083a7e853 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -32,7 +32,7 @@
 )
 from aitemplate.compiler.transform.memory_planning import memory_planning
 from aitemplate.compiler.transform.move_view_ops import move_view_op_before_concat
-from aitemplate.compiler.transform.name_graph import name_graph
+from aitemplate.compiler.transform.name_graph import dedup_symbolic_name, name_graph
 from aitemplate.compiler.transform.optimize_graph import optimize_graph
 from aitemplate.compiler.transform.profile import profile
 from aitemplate.compiler.transform.refine_graph import refine_graph
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 4f95a9281..71896d9a3 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -28,6 +28,7 @@
 func_name_to_tensor_cnt = {}
 
 MEMO = set()
+user_provided_dim = set()
 
 
 def valid_c_name(name):
@@ -55,6 +56,7 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
     global func_cnt
     global tensor_cnt
     global func_name_to_tensor_cnt
+    global user_provided_dim
     for node in sorted_graph:
         funcs = node.src_ops()
         if len(funcs) == 0:
@@ -95,6 +97,8 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
 
         tensor_name = node._attrs["name"]
         for i, dim in enumerate(node._attrs["shape"]):
+            if dim._attrs["name"] is not None:
+                user_provided_dim.add(dim._attrs["name"])
             if dim._attrs["name"] is None and not isinstance(dim, JaggedIntVar):
                 dim_name = "{tname}_dim_{idx}".format(tname=tensor_name, idx=i)
                 dim._attrs["name"] = dim_name
@@ -118,3 +122,31 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                 # the batch_dim wasn't named above, so we name it here
                 jagged_int_var_name = jagged_int_var._attrs["name"]
                 batch_dim._attrs["name"] = f"{jagged_int_var_name}_jagged_batch_dim"
+
+
+def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
+    """Rename all shape variable that are identical to the same name.
+
+    Parameters
+    ----------
+    sorted_graph : List[Tensor]
+        Input graph to be simplified
+    """
+    symbolic_to_name = {}
+    global user_provided_dim
+    for node in sorted_graph:
+        for dim in node._attrs["shape"]:
+            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
+                dim_sym = dim.symbolic_value()
+                if (
+                    dim_sym not in symbolic_to_name
+                    or dim_sym in symbolic_to_name
+                    and dim._attrs["name"] in user_provided_dim
+                ):
+                    symbolic_to_name[dim_sym] = dim._attrs["name"]
+
+    for node in sorted_graph:
+        for dim in node._attrs["shape"]:
+            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
+                dim_sym = dim.symbolic_value()
+                dim._attrs["name"] = symbolic_to_name[dim_sym]
diff --git a/tests/unittest/compiler/test_symbolic.py b/tests/unittest/compiler/test_symbolic.py
index e91e6d412..44b8b11e0 100644
--- a/tests/unittest/compiler/test_symbolic.py
+++ b/tests/unittest/compiler/test_symbolic.py
@@ -17,9 +17,10 @@
 
 import sympy
 
-from aitemplate.compiler import ops, symbolic
+from aitemplate.compiler import ops, symbolic, transform
 from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
 
 
 class SymbolTestCase(unittest.TestCase):
@@ -100,6 +101,19 @@ def test_elementwise_symbolic(self):
         div = ops.elementwise(FuncEnum.DIV)(tensor1, tensor2)
         self.assertEqual(div._attrs["symbolic_value"], sym1 / sym2)
 
+    def test_dedup_symbolic_name(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        var2 = IntVar(
+            values=[1, 256], name="var_2", symbolic_value=var1.symbolic_value()
+        )
+        X_shape = [var1, var2]
+
+        X = Tensor(shape=X_shape, name="input_0", is_input=True)
+
+        self.assertNotEqual(X.shape()[0]._attrs["name"], X.shape()[1]._attrs["name"])
+        transform.dedup_symbolic_name([X])
+        self.assertEqual(X.shape()[0]._attrs["name"], X.shape()[1]._attrs["name"])
+
 
 if __name__ == "__main__":
     unittest.main()

From f82ceb9d84c10bc6f20db7a2120af642bc8e1516 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Tue, 28 Mar 2023 18:04:45 -0700
Subject: [PATCH 353/638] Replace mem_eff_attention with a custom attention for
 a lowering path (#505)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/505

Reviewed By: ipiszy, wushirong

Differential Revision: D44437799

fbshipit-source-id: 7ae4ffe7e48b3755a7f676e0a4e7981da372e8a1
---
 .../aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
index 587cba9a2..9d52103b8 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py
@@ -95,7 +95,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
         if self._attrs["layout"] == "Permute4DBMM_0213":
             b, m, n = output_shape
             d1 = self._attrs["shape"][0]
-            output_shape = [b.value() // d1, m, d1, n]
+            output_shape = [-1, m, d1, n]
             self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:

From 163bad0a018769b2ae2d41e12c2437cb1a41aa04 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 29 Mar 2023 10:35:36 -0700
Subject: [PATCH 354/638] Overload IntVar arithmetic operations (#478)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/478

Overload IntVar arithmetic operations, so we could directly do add/sub/mul/div on IntVars.

Reviewed By: chenyang78

Differential Revision: D44327420

fbshipit-source-id: bdee4be0057e0dd978457bc646606eb3e94220bc
---
 python/aitemplate/compiler/base.py       | 150 ++++++++++++++++++++++-
 tests/unittest/compiler/test_symbolic.py | 134 ++++++++++++++++++++
 2 files changed, 283 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index b9deffd71..80e1451ec 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -17,10 +17,13 @@
 """
 from __future__ import annotations
 
+import math
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
 from functools import reduce
+from numbers import Number
 from pprint import pformat
 from typing import Any, Dict, Iterable, List, Optional, Set, Union
 
@@ -84,6 +87,9 @@ class IntVar(Node):
     """
     An IntVar represents a dynamic dimension.
     IntVar and IntImm (see below) are used together to represent a Tensor's shape.
+
+    IntVar supports basic arithmetic operations, and returns the most conservative
+    IntVar w.r.t. range of _attrs["values"].
     """
 
     def __init__(
@@ -148,7 +154,149 @@ def __eq__(self, another: Any) -> bool:
         )
 
     def __hash__(self) -> int:
-        return hash((self._attrs["name"], tuple(self._attrs["values"])))
+        return hash(
+            (
+                self._attrs["name"],
+                tuple(self._attrs["values"]),
+                self._attrs["symbolic_value"],
+            )
+        )
+
+    def __add__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = new_sym + other._attrs["symbolic_value"]
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = new_sym + other
+        else:
+            raise NotImplementedError(f"Unable to do addition on {self} and {other}")
+
+        new_values = [
+            self_values[0] + other_values[0],
+            self_values[-1] + other_values[-1],
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
+
+    def __radd__(self, other: Union[Any, IntVar]) -> IntVar:
+        return self + other
+
+    def __sub__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = new_sym - other._attrs["symbolic_value"]
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = new_sym - other
+        else:
+            raise NotImplementedError(f"Unable to do subtraction on {self} and {other}")
+
+        new_values = [
+            max(0, self_values[0] - other_values[-1]),
+            max(0, self_values[-1] - other_values[0]),
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
+
+    def __rsub__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = other._attrs["symbolic_value"] - new_sym
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = other - new_sym
+        else:
+            raise NotImplementedError(
+                f"Unable to do r-subtraction on {self} and {other}"
+            )
+
+        new_values = [
+            max(0, other_values[0] - self_values[-1]),
+            max(0, other_values[-1] - self_values[0]),
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(value=new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
+
+    def __mul__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = new_sym * other._attrs["symbolic_value"]
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = new_sym * other
+        else:
+            raise NotImplementedError(
+                f"Unable to do multiplication on {self} and {other}"
+            )
+
+        new_values = [
+            self_values[0] * other_values[0],
+            self_values[-1] * other_values[-1],
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(value=new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
+
+    def __rmul__(self, other: Union[Any, IntVar]) -> IntVar:
+        return self * other
+
+    def __truediv__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = new_sym / other._attrs["symbolic_value"]
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = new_sym / other
+        else:
+            raise NotImplementedError(f"Unable to do division on {self} and {other}")
+
+        new_values = [
+            math.floor(self_values[0] / max(1, other_values[-1])),
+            math.ceil(self_values[-1] / max(1, other_values[0])),
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(value=new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
+
+    def __rtruediv__(self, other: Union[Any, IntVar]) -> IntVar:
+        self_values = self._attrs["values"]
+        new_sym = self._attrs["symbolic_value"]
+        if isinstance(other, IntVar):
+            other_values = other._attrs["values"]
+            new_sym = other._attrs["symbolic_value"] / new_sym
+        elif isinstance(other, Number):
+            other_values = [other]
+            new_sym = other / new_sym
+        else:
+            raise NotImplementedError(f"Unable to do r-division on {self} and {other}")
+
+        new_values = [
+            math.floor(other_values[0] / max(1, self_values[-1])),
+            math.ceil(other_values[-1] / max(1, self_values[0])),
+        ]
+        if new_values[0] == new_values[1]:
+            return IntImm(value=new_values[0])
+
+        return IntVar(values=new_values, symbolic_value=new_sym)
 
     def lower_bound(self) -> int:
         """Returns lower bound of this dynamic dim."""
diff --git a/tests/unittest/compiler/test_symbolic.py b/tests/unittest/compiler/test_symbolic.py
index 44b8b11e0..e98c85e74 100644
--- a/tests/unittest/compiler/test_symbolic.py
+++ b/tests/unittest/compiler/test_symbolic.py
@@ -115,5 +115,139 @@ def test_dedup_symbolic_name(self):
         self.assertEqual(X.shape()[0]._attrs["name"], X.shape()[1]._attrs["name"])
 
 
+class IntVarSymbolTestCase(unittest.TestCase):
+    def test_add(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        var2 = IntVar(values=[1, 256], name="var_2")
+        sym2 = var2.symbolic_value()
+        imm1 = IntImm(value=37)
+        imm2 = IntImm(value=41)
+
+        var3 = var1 + var2
+        self.assertEqual(var3._attrs["values"], [1 + 1, 256 + 256])
+        self.assertEqual(var3.symbolic_value(), sym1 + sym2)
+
+        var4 = var1 + imm1
+        self.assertEqual(var4._attrs["values"], [1 + 37, 256 + 37])
+        self.assertEqual(var4.symbolic_value(), sym1 + 37)
+
+        imm3 = imm1 + imm2
+        self.assertEqual(imm3._attrs["values"], [37 + 41])
+        self.assertEqual(imm3.symbolic_value(), 37 + 41)
+
+    def test_radd(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        imm1 = IntImm(value=37)
+
+        var2 = 3 + var1
+        self.assertEqual(var2._attrs["values"], [3 + 1, 3 + 256])
+        self.assertEqual(var2.symbolic_value(), 3 + sym1)
+
+        imm2 = 7 + imm1
+        self.assertEqual(imm2._attrs["values"], [7 + 37])
+        self.assertEqual(imm2.symbolic_value(), 7 + 37)
+
+    def test_sub(self):
+        var1 = IntVar(values=[1, 512], name="var_1")
+        sym1 = var1.symbolic_value()
+        var2 = IntVar(values=[1, 256], name="var_2")
+        sym2 = var2.symbolic_value()
+        imm1 = IntImm(value=37)
+        imm2 = IntImm(value=31)
+
+        var3 = var1 - var2
+        self.assertEqual(var3._attrs["values"], [0, 511])
+        self.assertEqual(var3.symbolic_value(), sym1 - sym2)
+
+        var4 = var1 - imm1
+        self.assertEqual(var4._attrs["values"], [0, 512 - 37])
+        self.assertEqual(var4.symbolic_value(), sym1 - 37)
+
+        imm3 = imm1 - imm2
+        self.assertEqual(imm3._attrs["values"], [37 - 31])
+        self.assertEqual(imm3.symbolic_value(), 37 - 31)
+
+    def test_rsub(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        imm1 = IntImm(value=37)
+
+        var2 = 31 - var1
+        self.assertEqual(var2._attrs["values"], [0, 30])
+        self.assertEqual(var2.symbolic_value(), 31 - sym1)
+
+        imm2 = 47 - imm1
+        self.assertEqual(imm2._attrs["values"], [47 - 37])
+        self.assertEqual(imm2.symbolic_value(), 47 - 37)
+
+    def test_mul(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        var2 = IntVar(values=[1, 256], name="var_2")
+        sym2 = var2.symbolic_value()
+        imm1 = IntImm(value=37)
+        imm2 = IntImm(value=41)
+
+        var3 = var1 * var2
+        self.assertEqual(var3._attrs["values"], [1 * 1, 256 * 256])
+        self.assertEqual(var3.symbolic_value(), sym1 * sym2)
+
+        var4 = var1 * imm1
+        self.assertEqual(var4._attrs["values"], [1 * 37, 256 * 37])
+        self.assertEqual(var4.symbolic_value(), sym1 * 37)
+
+        imm3 = imm1 * imm2
+        self.assertEqual(imm3._attrs["values"], [37 * 41])
+        self.assertEqual(imm3.symbolic_value(), 37 * 41)
+
+    def test_rmul(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        imm1 = IntImm(value=37)
+
+        var2 = 3 * var1
+        self.assertEqual(var2._attrs["values"], [3 * 1, 3 * 256])
+        self.assertEqual(var2.symbolic_value(), 3 * sym1)
+
+        imm2 = 7 * imm1
+        self.assertEqual(imm2._attrs["values"], [7 * 37])
+        self.assertEqual(imm2.symbolic_value(), 7 * 37)
+
+    def test_div(self):
+        var1 = IntVar(values=[4, 512], name="var_1")
+        sym1 = var1.symbolic_value()
+        var2 = IntVar(values=[2, 256], name="var_2")
+        sym2 = var2.symbolic_value()
+        imm1 = IntImm(value=4)
+        imm2 = IntImm(value=2)
+
+        var3 = var1 / var2
+        self.assertEqual(var3._attrs["values"], [0, 256])
+        self.assertEqual(var3.symbolic_value(), sym1 / sym2)
+
+        var4 = var1 / imm1
+        self.assertEqual(var4._attrs["values"], [1, 128])
+        self.assertEqual(var4.symbolic_value(), sym1 / 4)
+
+        imm3 = imm1 / imm2
+        self.assertEqual(imm3._attrs["values"], [2])
+        self.assertEqual(imm3.symbolic_value(), 2)
+
+    def test_rdiv(self):
+        var1 = IntVar(values=[1, 256], name="var_1")
+        sym1 = var1.symbolic_value()
+        imm1 = IntImm(value=4)
+
+        var2 = 512 / var1
+        self.assertEqual(var2._attrs["values"], [2, 512])
+        self.assertEqual(var2.symbolic_value(), 512 / sym1)
+
+        imm2 = 32 / imm1
+        self.assertEqual(imm2._attrs["values"], [8])
+        self.assertEqual(imm2.symbolic_value(), 8)
+
+
 if __name__ == "__main__":
     unittest.main()

From da84008fcd8f269cfdf7b46de52547285b74ec8a Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 29 Mar 2023 15:23:18 -0700
Subject: [PATCH 355/638] Replace an outdated comment in model-generated.h
 (#515)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/515

Reviewed By: khabinov, aakhundov

Differential Revision: D44505127

fbshipit-source-id: 7bca60c3c27dc9f1a500bb9984b61c26368bba30
---
 python/aitemplate/backend/main_templates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 27f580f0d..5979df5a1 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -249,7 +249,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 }
 
 ModelContainer* CreateModelContainer(size_t num_runtimes, AITemplateAllocator& allocator) {
-  // num_runtimes, blob_size, workspace_size, num_inputs, num_outputs, num_unbound_constants, param_size, allocator
+  // num_runtimes, num_inputs, num_outputs, num_bound_constants, num_unbound_constants, params_size, allocator
   return new ModelContainer(num_runtimes, {{num_inputs}}, {{num_outputs}}, {{num_bound_constants}}, {{num_unbound_constants}}, {{param_size}}, allocator);
 }
 } // namespace ait

From 9f38a423cce31dd26daeba76091362a22b24606d Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 30 Mar 2023 00:22:11 -0700
Subject: [PATCH 356/638] Fix default number of runtimes 2->1 in comments
 (#513)

Summary:
`AIT_DEFAULT_NUM_RUNTIMES=1`, however, `compile_model()`, and `Model.__init__()` descriptions say that it is 2.

This PR fixes the description for `num_runtimes` param. Plus fixes some typos in other places.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/513

Reviewed By: muchulee8

Differential Revision: D44517324

Pulled By: chenyang78

fbshipit-source-id: 2fe0b06fc782854906cc29110f0ac0afb5109867
---
 python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh | 2 +-
 python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh       | 2 +-
 python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh    | 2 +-
 python/aitemplate/compiler/compiler.py                        | 2 +-
 python/aitemplate/compiler/model.py                           | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index 59bbf792e..b868849e4 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -198,7 +198,7 @@ __forceinline__ __device__ bfloat16 Rsqrt<bfloat16>(bfloat16 x) {
 
 template <typename T>
 inline __device__ void WelfordCombine(T val, T* mean, T* m2, int* count) {
-  // Use Welford Online algorithem to compute mean and variance
+  // Use Welford Online algorithm to compute mean and variance
   // For more details you can refer to:
   // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
   *count += 1;
diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index af64e2a76..387be808a 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -227,7 +227,7 @@ struct DirectStore {
 
 template <typename T>
 inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) {
-  // Use Welford Online algorithem to compute mean and variance
+  // Use Welford Online algorithm to compute mean and variance
   // For more details you can refer to:
   // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
   *count += 1;
diff --git a/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
index da8b26996..cb61f326b 100644
--- a/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
+++ b/python/aitemplate/backend/cuda/tensor/concatenate_fast.cuh
@@ -99,7 +99,7 @@ Tensor ConcatKernelDimN(const TestCase & tc, int64_t concatDim) {
 */
 
 ////////////////////////////////////////////////////////////
-// Here go the facilities that are resposible for post-processing,
+// Here go the facilities that are responsible for post-processing,
 //   such as applying tanh on top of values on a concatenated tensor.
 
 // does no processing
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 1e5fdcb43..43d9acf9f 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -181,7 +181,7 @@ def compile_model(
     num_runtimes: int
         How many runtimes should be stored in the internal pool. This
         determines how many inferences can happen concurrently. By
-        default, set to 2. Must be positive.
+        default, set to 1. Must be positive.
     allocator_kind: AITemplateAllocatorKind, optional
         The GPU allocator to use. If none is specified, use the default allocator.
     debug_settings: AITDebugSettings
diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 413a64fea..3d22c3748 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -198,7 +198,7 @@ def __init__(
         num_runtimes : int, optional
             How many runtimes should be stored in the internal pool. This
             determines how many inferences can happen concurrently. By
-            default, set to 2. Must be positive.
+            default, set to 1. Must be positive.
         allocator_kind : AITemplateAllocatorKind, optional
             What type of allocator to use when allocating GPU memory.
         """

From 56318c48a06e307561388fe20d8435c6ba14e9f4 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 30 Mar 2023 00:26:30 -0700
Subject: [PATCH 357/638] Apply json formatting to debug graph (#512)

Summary:
`compile_model()` function dumps model graph to txt, json and html after each transformation.

Currently `xxx_graph.json` file contains just one huge json line
For better readability we can format the json.

Before:
```json
{"Tensors": [{"name": "X", "depth": 0, "nop": false, "shape": [{"_attrs": {"name": "batch", "depth": 0,
```

After:
```json
{
  "Tensors": [
    {
      "name": "X",
      "depth": 0,
      "nop": false,
      "shape": [
        {
          "_attrs": {
            "name": "batch",
            "depth": 0,
            "nop": false,
            "values": [
              1,
              8
            ],
            "symbolic_value": "batch"
          }
        },
        {
          "_attrs": {
            "name": "dim1",
            "depth": 0,
            "nop": false,
            "values": [
              3
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/512

Reviewed By: aakhundov

Differential Revision: D44517013

Pulled By: chenyang78

fbshipit-source-id: 19daffd7469829c80f18bb24029cd204cd11950c
---
 python/aitemplate/utils/graph_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index a2292fe92..6fa0a86d7 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -68,7 +68,7 @@ def sorted_graph_debug_json(tensors) -> str:
     json_dict["Operators"] = get_sorted_ops(tensors)
 
     op_names = gen_unique_op_names(tensors)
-    encoder = GraphJsonEncoder(op_names)
+    encoder = GraphJsonEncoder(op_names, indent=2)
 
     return encoder.encode(json_dict)
 

From 4ffbb2b44e89a9d45da148a8e173b086dac20d58 Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Thu, 30 Mar 2023 00:36:20 -0700
Subject: [PATCH 358/638] reland D44106679: stable diffusion unet ait converter
 (#510)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/510

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/460

Adding stable diffusion modeling files to AIT frontend.

Reviewed By: yinghai

Differential Revision: D44425856

fbshipit-source-id: 5b95441e788e2677562677882c9f75984af218c2
---
 fx2ait/fx2ait/tools/common_fx2ait.py          |   4 +
 python/aitemplate/frontend/nn/ldm/__init__.py |  17 +
 .../aitemplate/frontend/nn/ldm/attention.py   | 105 +++
 python/aitemplate/frontend/nn/ldm/clip.py     | 628 +++++++++++++++
 .../aitemplate/frontend/nn/ldm/embeddings.py  | 101 +++
 python/aitemplate/frontend/nn/ldm/resnet.py   | 238 ++++++
 .../frontend/nn/ldm/unet_2d_condition.py      | 255 ++++++
 .../aitemplate/frontend/nn/ldm/unet_blocks.py | 762 ++++++++++++++++++
 python/aitemplate/frontend/nn/ldm/vae.py      | 153 ++++
 9 files changed, 2263 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/ldm/__init__.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/attention.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/clip.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/embeddings.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/resnet.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/unet_blocks.py
 create mode 100644 python/aitemplate/frontend/nn/ldm/vae.py

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 90d18fcd3..b9aeb8009 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -97,6 +97,7 @@ def run_test(
             leaf_module_list.append(leaf_module)
 
         orig_mod = copy.deepcopy(mod)
+        orig_mod.eval()
         mod = acc_tracer.trace(
             mod,
             inputs,
@@ -110,6 +111,9 @@ def run_test(
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
             inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+
+        mod.half()
+        inputs = [inp.half().contiguous() for inp in inputs]
         interp = AITInterpreter(
             mod,
             inputs,
diff --git a/python/aitemplate/frontend/nn/ldm/__init__.py b/python/aitemplate/frontend/nn/ldm/__init__.py
new file mode 100644
index 000000000..b14195e81
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/__init__.py
@@ -0,0 +1,17 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# flake8: noqa
+
+from aitemplate.frontend.nn.ldm.unet_2d_condition import UNet2DConditionModel
diff --git a/python/aitemplate/frontend/nn/ldm/attention.py b/python/aitemplate/frontend/nn/ldm/attention.py
new file mode 100644
index 000000000..14993e6d9
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/attention.py
@@ -0,0 +1,105 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
+"""
+
+from typing import Optional
+
+from aitemplate.compiler.ops import reshape
+
+from aitemplate.frontend import nn, Tensor
+
+
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
+    to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    Uses three q, k, v linear layers to compute attention.
+    Parameters:
+        batch_size (:obj:`int`): The number of examples per batch.
+        height (:obj:`int`): Height of each image example.
+        width (:obj:`int`): Width of each image example.
+        channels (:obj:`int`): The number of channels in the input and output.
+        num_head_channels (:obj:`int`, *optional*):
+            The number of channels in each head. If None, then `num_heads` = 1.
+        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
+        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        channels: int,
+        num_head_channels: Optional[int] = None,
+        num_groups: int = 32,
+        rescale_output_factor: float = 1.0,
+        eps: float = 1e-5,
+    ):
+        super().__init__()
+        self.batch_size = batch_size
+        self.height = height
+        self.width = width
+        self.channels = channels
+        self.num_heads = (
+            channels // num_head_channels if num_head_channels is not None else 1
+        )
+        self.num_head_size = num_head_channels
+        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
+        self.attention = nn.MultiheadAttention(
+            channels,
+            batch_size,
+            height * width,
+            self.num_heads,
+            qkv_bias=True,
+            has_residual=True,
+            use_mem_eff=True,
+        )
+        self.rescale_output_factor = rescale_output_factor
+
+    def forward(self, hidden_states) -> Tensor:
+        """
+        input hidden_states shape: [batch, height, width, channel]
+        output shape: [batch, height, width, channel]
+        """
+        residual = hidden_states
+
+        # norm
+        hidden_states = self.group_norm(hidden_states)
+
+        hidden_states = reshape()(
+            hidden_states, [self.batch_size, self.height * self.width, self.channels]
+        )
+
+        batch, hw, channel = hidden_states.shape()
+        if (
+            batch.value() != self.batch_size
+            or hw.value() != self.width * self.height
+            or channel.value() != self.channels
+        ):
+            raise RuntimeError(
+                "nchw params do not match! "
+                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
+                f"actual: {batch}, {channel}, {hw}."
+            )
+
+        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
+        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
+
+        return res
diff --git a/python/aitemplate/frontend/nn/ldm/clip.py b/python/aitemplate/frontend/nn/ldm/clip.py
new file mode 100644
index 000000000..1a95314d4
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/clip.py
@@ -0,0 +1,628 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from inspect import isfunction
+from typing import Optional
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+# pylint: disable=W0102
+
+USE_CUDA = detect_target().name() == "cuda"
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class CrossAttention(nn.Module):
+    def __init__(
+        self,
+        query_dim,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        dtype="float16",
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
+        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+
+    def forward(self, x, context=None, mask=None, residual=None):
+        nheads = self.heads
+        d = self.dim_head
+
+        layout = "20314" if USE_CUDA else "m2n3"
+
+        bs, seqlen, _ = get_shape(x)
+        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
+        )
+        context = default(context, x)
+
+        seqlen = get_shape(context)[1]
+        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
+        )
+        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
+        )
+
+        if USE_CUDA:
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
+            )
+        else:
+            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
+            out = OP(
+                (ops.reshape()(q, [bs * nheads, -1, d])),
+                (ops.reshape()(k, [bs * nheads, -1, d])),
+                (ops.reshape()(v, [bs * nheads, -1, d])),
+            )
+        out = ops.reshape()(out, [bs, -1, nheads * d])
+        proj = self.to_out(out)
+        proj = ops.reshape()(proj, [bs, -1, nheads * d])
+        if residual is not None:
+            return proj + residual
+        else:
+            return proj
+
+
+class GEGLU(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
+        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
+
+    def forward(self, x):
+        return self.proj(x, self.gate(x))
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        project_in = (
+            nn.Sequential(
+                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
+            )
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )
+
+        self.net = nn.Sequential(
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x, residual=None):
+        shape = ops.size()(x)
+        x = self.net(x)
+        x = ops.reshape()(x, shape)
+        if residual is not None:
+            return x + residual
+        else:
+            return x
+
+
+class BasicTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
+        super().__init__()
+        self.attn1 = CrossAttention(
+            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )  # is a self-attention
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+
+        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
+
+    def forward(self, x, context=None):
+        x = self.attn1(self.norm1(x), residual=x)
+        x = self.attn2(self.norm2(x), context=context, residual=x)
+        x = self.ff(self.norm3(x), residual=x)
+        return x
+
+
+def Normalize(in_channels):
+    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+
+    def __init__(
+        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)  # Group Norm
+
+        self.proj_in = nn.Conv2dBias(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
+                )
+                for d in range(depth)
+            ]
+        )
+
+        self.proj_out = nn.Conv2dBias(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x, context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, h, w, c = get_shape(x)
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = ops.reshape()(x, [b, -1, c])
+        for block in self.transformer_blocks:
+            x = block(x, context=context)
+        x = ops.reshape()(x, [b, h, w, c])
+        x = self.proj_out(x)
+        return x + x_in
+
+
+class CLIPAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        batch_size=1,
+        seq_len=16,
+        layer_norm_eps=1e-5,
+        hidden_dropout_prob=0.0,
+        causal=False,
+        mask_seq=0,
+    ):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=hidden_dropout_prob,
+            has_residual=False,
+            causal=causal,
+            mask_seq=mask_seq,
+        )
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        residual: Optional[Tensor] = None,
+    ):
+        if residual is not None:
+            self_output = self.attn(hidden_states, residual)
+        else:
+            self_output = self.attn(hidden_states)
+        return self_output
+
+
+class QuickGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
+    """
+
+    def forward(self, x):
+        x1 = x * 1.702
+        x1 = ops.sigmoid(x1)
+        x = x * x1
+        return x
+
+
+class CLIPMLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer="GELU",
+        drop=0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+            specialization="gelu",
+        )
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
+class CLIPMLPQuickGelu(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+        )
+        self.activation_fn = QuickGELUActivation()
+
+        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
+
+    def forward(self, x, res):
+        shape = get_shape(x)
+        x = self.fc1(x)
+        x = self.activation_fn(x)
+        x = self.fc2(x, res)
+        return ops.reshape()(x, shape)
+
+
+class CLIPEncoderLayer(nn.Module):
+    ACT_LAYER_TO_CLIP_MLP_MAP = {
+        "gelu": CLIPMLP,
+        "quick_gelu": CLIPMLPQuickGelu,
+    }
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_attention_heads=12,
+        attention_dropout=0.0,
+        mlp_ratio=4.0,
+        batch_size=1,
+        seq_len=16,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.self_attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
+            qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=0,
+            has_residual=True,
+            causal=causal,
+            mask_seq=mask_seq,
+            use_mem_eff=True,
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
+            hidden_size, int(hidden_size * mlp_ratio)
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(hidden_states, residual)
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states, residual)
+
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`CLIPEncoderLayer`].
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(
+        self,
+        num_hidden_layers=12,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        hidden_size=768,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    hidden_size=hidden_size,
+                    num_attention_heads=num_attention_heads,
+                    batch_size=batch_size,
+                    seq_len=seq_len,
+                    causal=causal,
+                    mask_seq=mask_seq,
+                    act_layer=act_layer,
+                )
+                for _ in range(num_hidden_layers)
+            ]
+        )
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[Tensor] = None,
+        causal_attention_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        # all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for _, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(hidden_states)
+            hidden_states = layer_outputs
+
+        return hidden_states
+
+
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        vocab_size=49408,
+        max_position_embeddings=77,
+        dtype="float16",
+    ):
+        super().__init__()
+        embed_dim = hidden_size
+
+        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
+        self.position_embedding = nn.Embedding(
+            shape=[max_position_embeddings, embed_dim], dtype=dtype
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        position_ids: Tensor,
+        inputs_embeds: Optional[Tensor] = None,
+    ) -> Tensor:
+
+        input_shape = ops.size()(input_ids)
+
+        # [B * S]
+        input_ids = ops.reshape()(input_ids, [-1])
+
+        position_ids = ops.reshape()(position_ids, [-1])
+
+        if inputs_embeds is None:
+            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
+
+        position_embeddings = ops.batch_gather()(
+            self.position_embedding.tensor(), position_ids
+        )
+
+        embeddings = inputs_embeds + position_embeddings
+
+        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
+
+        return embeddings
+
+
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=False,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        batch_size=1,
+        seq_len=64,
+        causal=False,
+        mask_seq=0,
+        act_layer="gelu",
+    ):
+        super().__init__()
+        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
+        self.encoder = CLIPEncoder(
+            num_hidden_layers=num_hidden_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            causal=causal,
+            mask_seq=mask_seq,
+            act_layer=act_layer,
+        )
+        self.final_layer_norm = nn.LayerNorm(hidden_size)
+
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+        )
+
+        last_hidden_state = encoder_outputs
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+        return last_hidden_state
diff --git a/python/aitemplate/frontend/nn/ldm/embeddings.py b/python/aitemplate/frontend/nn/ldm/embeddings.py
new file mode 100644
index 000000000..36b96a4fb
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/embeddings.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import math
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn, Tensor
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+def get_timestep_embedding(
+    timesteps: Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
+    embeddings. :return: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+
+    exponent = (-math.log(max_period)) * Tensor(
+        shape=[half_dim], dtype="float16", name="arange"
+    )
+
+    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
+
+    emb = ops.exp(exponent)
+    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = ops.concatenate()(
+            [ops.cos(emb), ops.sin(emb)],
+            dim=-1,
+        )
+    else:
+        emb = ops.concatenate()(
+            [ops.sin(emb), ops.cos(emb)],
+            dim=-1,
+        )
+    return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+    def forward(self, sample):
+        sample = self.linear_1(sample)
+        sample = self.linear_2(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(
+        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+
+    def forward(self, timesteps):
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+        )
+        return t_emb
diff --git a/python/aitemplate/frontend/nn/ldm/resnet.py b/python/aitemplate/frontend/nn/ldm/resnet.py
new file mode 100644
index 000000000..03e4f8023
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/resnet.py
@@ -0,0 +1,238 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn
+
+
+def get_shape(x):
+    shape = [it.value() for it in x._attrs["shape"]]
+    return shape
+
+
+class Upsample2D(nn.Module):
+    """
+    An upsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 upsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self,
+        channels,
+        use_conv=False,
+        use_conv_transpose=False,
+        out_channels=None,
+        name="conv",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        conv = None
+        if use_conv_transpose:
+            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(x)
+
+        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if self.use_conv:
+            if self.name == "conv":
+                x = self.conv(x)
+            else:
+                x = self.Conv2d_0(x)
+
+        return x
+
+
+class Downsample2D(nn.Module):
+    """
+    A downsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 downsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if use_conv:
+            conv = nn.Conv2dBias(
+                self.channels, self.out_channels, 3, stride=stride, padding=padding
+            )
+        else:
+            assert self.channels == self.out_channels
+            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
+
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.Conv2d_0 = conv
+            self.conv = conv
+        elif name == "Conv2d_0":
+            self.conv = conv
+        else:
+            self.conv = conv
+
+    def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
+        x = self.conv(x)
+
+        return x
+
+
+class ResnetBlock2D(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        non_linearity="swish",
+        time_embedding_norm="default",
+        kernel=None,
+        output_scale_factor=1.0,
+        use_nin_shortcut=None,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = nn.GroupNorm(
+            num_groups=groups,
+            num_channels=in_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+
+        self.conv1 = nn.Conv2dBias(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = nn.GroupNorm(
+            num_groups=groups_out,
+            num_channels=out_channels,
+            eps=eps,
+            affine=True,
+            use_swish=True,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2dBias(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        self.upsample = self.downsample = None
+
+        self.use_nin_shortcut = (
+            self.in_channels != self.out_channels
+            if use_nin_shortcut is None
+            else use_nin_shortcut
+        )
+
+        if self.use_nin_shortcut:
+            self.conv_shortcut = nn.Conv2dBias(
+                in_channels, out_channels, 1, 1, 0
+            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
+        else:
+            self.conv_shortcut = None
+
+    def forward(self, x, temb=None):
+        hidden_states = x
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm1(
+            hidden_states
+        )  # .float()).type(hidden_states.dtype) # fused swish
+        # hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            x = self.upsample(x)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            x = self.downsample(x)
+            hidden_states = self.downsample(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            temb = self.time_emb_proj(ops.silu(temb))
+            bs, dim = get_shape(temb)
+            temb = ops.reshape()(temb, [bs, 1, 1, dim])
+            hidden_states = hidden_states + temb
+
+        # make sure hidden states is in float32
+        # when running in half-precision
+        hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            x = self.conv_shortcut(x)
+
+        out = hidden_states + x
+
+        return out
diff --git a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
new file mode 100644
index 000000000..770156ff9
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
@@ -0,0 +1,255 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Optional, Tuple, Union
+
+from aitemplate.frontend import nn
+
+from .embeddings import TimestepEmbedding, Timesteps
+from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
+
+
+class UNet2DConditionModel(nn.Module):
+    r"""
+    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int`, *optional*): The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+    """
+
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+    ):
+        super().__init__()
+        self.center_input_sample = center_input_sample
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=norm_num_groups,
+            eps=norm_eps,
+            use_swish=True,
+        )
+
+        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
+
+    def forward(
+        self,
+        sample,
+        timesteps,
+        encoder_hidden_states,
+        return_dict: bool = True,
+    ):
+        """r
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+
+        # 1. time
+        t_emb = self.time_proj(timesteps)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "attentions")
+                and downsample_block.attentions is not None
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states
+        )
+
+        # 5. up
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+
+            if (
+                hasattr(upsample_block, "attentions")
+                and upsample_block.attentions is not None
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
+                )
+
+        # 6. post-process
+        # make sure hidden states is in float32
+        # when running in half-precision
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+        return sample
diff --git a/python/aitemplate/frontend/nn/ldm/unet_blocks.py b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
new file mode 100644
index 000000000..7b6e3e6e6
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
@@ -0,0 +1,762 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# flake8: noqa
+from aitemplate.compiler import ops
+
+from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
+
+from .attention import AttentionBlock
+
+from .clip import SpatialTransformer
+from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
+
+# pylint: disable=W0102
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+    downsample_padding=None,
+):
+    down_block_type = (
+        down_block_type[7:]
+        if down_block_type.startswith("UNetRes")
+        else down_block_type
+    )
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
+            )
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+        )
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    cross_attention_dim=None,
+):
+    up_block_type = (
+        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    )
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
+            )
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                SpatialTransformer(
+                    in_channels,
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        in_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_upsample=True,
+    ):
+        super().__init__()
+
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                SpatialTransformer(
+                    out_channels,
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    depth=1,
+                    context_dim=cross_attention_dim,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+    ):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb=temb)
+            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = ops.concatenate()(
+                [hidden_states, res_hidden_states], dim=-1
+            )
+
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+
+    def forward(self, hidden_states):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class UNetMidBlock2D(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if attention_type != "default":
+            raise NotImplementedError(
+                f"attention_type must be default! current value: {attention_type}"
+            )
+
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            attentions.append(
+                AttentionBlock(
+                    batch_size,
+                    height,
+                    width,
+                    in_channels,
+                    num_head_channels=attn_num_head_channels,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    num_groups=resnet_groups,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None, encoder_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
diff --git a/python/aitemplate/frontend/nn/ldm/vae.py b/python/aitemplate/frontend/nn/ldm/vae.py
new file mode 100644
index 000000000..1cd25aa19
--- /dev/null
+++ b/python/aitemplate/frontend/nn/ldm/vae.py
@@ -0,0 +1,153 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
+"""
+
+from typing import Tuple
+
+from aitemplate.frontend import nn, Tensor
+
+from .unet_blocks import get_up_block, UNetMidBlock2D
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels=3,
+        out_channels=3,
+        up_block_types=("UpDecoderBlock2D",),
+        block_out_channels=(64,),
+        layers_per_block=2,
+        act_fn="silu",
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = nn.Conv2dBias(
+            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
+        )
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            batch_size,
+            height,
+            width,
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attn_num_head_channels=None,
+            resnet_groups=32,
+            temb_channels=None,
+        )
+
+        # up
+        self.up_blocks = nn.ModuleList([])
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=None,
+                temb_channels=None,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        num_groups_out = 32
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=num_groups_out,
+            eps=1e-6,
+            use_swish=True,
+        )
+        self.conv_out = nn.Conv2dBias(
+            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
+        )
+
+    def forward(self, z) -> Tensor:
+        sample = z
+        sample = self.conv_in(sample)
+
+        # middle
+        sample = self.mid_block(sample)
+
+        # up
+        for up_block in self.up_blocks:
+            sample = up_block(sample)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class AutoencoderKL(nn.Module):
+    def __init__(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        sample_size: int = 32,
+    ):
+        super().__init__()
+        self.decoder = Decoder(
+            batch_size,
+            height,
+            width,
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+        )
+        self.post_quant_conv = nn.Conv2dBias(
+            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def decode(self, z: Tensor, return_dict: bool = True):
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+
+    def forward(self):
+        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")

From 86c4f9cd8c9eb4384ad9b6abec6fbde9f0c20dd0 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Thu, 30 Mar 2023 09:27:23 -0700
Subject: [PATCH 359/638] AIT build cache (#456)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/456

The compilation stage of AIT's generated source files takes a considerable amount of time. It would be nice if we could speed up the build using a build cache, for example using ccache. This would be especially beneficial for speeding up
Unit tests, possibly with a huge cost benefit to Meta,
considering how much time is spent in AIT unit tests on GPU build servers ( see https://www.internalfb.com/sevmanager/view/322359 ) - also, we as developers might wait unneccessarily long for unit tests to complete, so developer productivity could be improved.

Unfortunately, ccache does not work with AIT build scripts for nvcc as it destroys the options that need to be passed down to the host compiler ( I tried that).

But luckily, AIT's codegen has a nice property that we can exploit: The generated sources are rather compact, and all in a single directory. So instead of caching build results for a single file, we could attempt to hash *all* of the sources of a build directory, and if nothing changed, to retrieve the build results for that directory instead of invoking make.

This would likely speed up CI builds considerably.

This diff has a small POC to test this hypothesis.

I was able to, for example, bring down the execution time of this small set of unit tests down from approx. 60 to 15 seconds  after having warmed the build cache:

  buck2 run @//mode/dev-nosan AITemplate:test_gemm --local-only -- -r 3d_2d_rcr

or this set, from approx. 530 seconds, down to approx. 100 seconds.

  buck2 run @//mode/dev-nosan AITemplate:test_expand --local-only

It works rather simple: Define an environment variable
like this:

  export AIT_BUILD_CACHE_DIR=/tmp/ait_build_cache

and that's it. If it is not defined, it is disabled.

### Note:

This is a quick PoC to test the idea and prove we can speed up our CI and dev workflow considerably - to actually productionalize it on our CI, a few points would have to be taken care of:

### TO DO

for production readiness in CI ( in followup diff(s)?)

 * ~~Old cache entries need to be cleaned up~~ (Done!)
 * ~~Make cache updates atomic ( write to temporary directory, then rename to final directory name = hash ) to prevent race conditions in distributed / parallel builds~~ (Done!)
 * ~~Ensure that no environment variables play a role in the build (beyond being included in the Makefile verbatim) - if they do, include in hash~~ ( Checked and tested manually)
 * ~~Filter html, json and possible other debug / trace info, so that these do not contribute to hash calculation~~ (Done!)
 * Run entire test suite twice, and see if there are tests where the cache does not hit on second run ( if yes, investigate why -> Likely the code generation was not generating exactly the same sources twice )
 * ~~(Store the cached data in a distributed manner on the build servers ( network fs, memcached? ccache seems to use memcached there ) and that it is properly expired.~~) (Separate task created)
 * ~~Integrate with CI build system~~ (Separate task created)
 * ~~Add unit tests, which prevent regressions ( e.g. allow to check that future changes in code generation do not break the caching mechanism, for example by not generating deterministically the same code on repeated invocation with same inputs).~~(Done)
 * ~~Ensure that differences in compiler versions are reflected in the hash, so either ensure that used compiler versions are part of the path they are being referred to in the Makefile, or hash these versions explicitly ( they could just be written to a file in the build directory that's included in the hash by executing something like "nvcc --version >>compiler_versions.hashinfo" etc)~~ (Done)

Reviewed By: chenyang78

Differential Revision: D44229622

fbshipit-source-id: 287a7bec3fc285af8c2d14544a338b6cd5752cb8
---
 python/aitemplate/backend/build_cache.py      |  44 ++
 python/aitemplate/backend/build_cache_base.py | 458 ++++++++++++++++++
 python/aitemplate/backend/builder.py          | 128 +++--
 python/aitemplate/backend/cuda/target_def.py  |  51 +-
 python/aitemplate/compiler/compiler.py        |   2 +
 .../compiler/transform/name_graph.py          |  13 +
 python/aitemplate/utils/environ.py            |  15 +
 python/aitemplate/utils/io.py                 | 209 ++++++++
 tests/unittest/backend/test_build_cache.py    | 240 +++++++++
 9 files changed, 1121 insertions(+), 39 deletions(-)
 create mode 100644 python/aitemplate/backend/build_cache.py
 create mode 100644 python/aitemplate/backend/build_cache_base.py
 create mode 100644 python/aitemplate/utils/io.py
 create mode 100644 tests/unittest/backend/test_build_cache.py

diff --git a/python/aitemplate/backend/build_cache.py b/python/aitemplate/backend/build_cache.py
new file mode 100644
index 000000000..5b19af1bc
--- /dev/null
+++ b/python/aitemplate/backend/build_cache.py
@@ -0,0 +1,44 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+The build_cache functionality is split into
+this file and build_cache_base.py
+
+This file is part of the AITemplate OSS distribution.
+For Meta-internal use, there can be an alternative
+to this file which allows to instantiate build caches
+with Meta-internal backing infrastructure.
+"""
+
+from aitemplate.backend.build_cache_base import (
+    BuildCache,
+    FileBasedBuildCache,
+    NoBuildCache,
+)
+from aitemplate.utils import environ as aitemplate_env
+
+__all__ = ["BUILD_CACHE", "BuildCache"]
+
+
+def create_build_cache() -> BuildCache:
+    build_cache_dir = aitemplate_env.ait_build_cache_dir()
+    if build_cache_dir is None or build_cache_dir == "":
+        return NoBuildCache()
+    else:
+        return FileBasedBuildCache(build_cache_dir)
+
+
+BUILD_CACHE: BuildCache = create_build_cache()
diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
new file mode 100644
index 000000000..3e5223f47
--- /dev/null
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -0,0 +1,458 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import hashlib
+import logging
+import os
+import secrets
+import shutil
+
+from abc import ABC, abstractmethod
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Callable, List, Optional, Tuple
+
+from aitemplate.utils.io import file_age, touch
+
+_LOGGER = logging.getLogger(__name__)
+
+# File extensions to be considered source files
+source_extensions = {
+    "cpp",
+    "h",
+    "cu",
+    "cuh",
+    "c",
+    "hpp",
+    "hxx",
+    "py",
+    "cxx",
+    "cc",
+    "version",
+    "binhash",
+    "hash",
+}
+
+source_filenames = {
+    # needs to be lowercase, because everything is lowercased before comparison
+    # Filenames in here are considered source files, even if their extension would
+    # suggest they are cache artifacts
+    "makefile"
+}
+
+# File extensions of files to be considered cache artifacts ( unless they are considered source files )
+cache_extensions = {"obj", "so", "dll", "exe", ""}
+
+
+def filename_norm_split(filename: str) -> Tuple[str, str]:
+    """
+    Splits filename into basename and extension
+    and lowercases results to enable simple lookup
+    in a case-insensitive manner.
+
+    Args:
+        filename (str): Filename/Path to split
+
+    Returns:
+        Tuple[str,str]: file basename, file extension
+    """
+    file_basename = os.path.basename(filename).lower()
+    file_parts = file_basename.split(".")
+    if len(file_parts) > 1:
+        file_ext = file_parts[-1]
+    else:
+        file_ext = ""
+    return file_basename, file_ext
+
+
+def is_source(filename: str) -> bool:
+    """
+    Simple filter function, returns true if the passed filename is considered
+    to be a source file (used to build the cache key) for the purpose of caching
+
+    Args:
+        filename (str): File path as a string
+
+    Returns:
+        bool: Whether the filename is a source file
+    """
+    file_basename, file_ext = filename_norm_split(filename)
+    return (file_basename in source_filenames) or (file_ext in source_extensions)
+
+
+def is_cache_artifact(filename: str) -> bool:
+    """
+    Simple filter function, returns true if the passed filename is considered
+    to be a cacheable artifact (not used to build cache key, but stored in cache)
+    for the purpose of caching
+
+    Args:
+        filename (str): File path as a string
+
+    Returns:
+        bool: Whether the filename is a cache artifact
+    """
+    file_basename, file_ext = filename_norm_split(filename)
+    return not is_source(filename) and file_ext in cache_extensions
+
+
+def is_bin_file(filename: str) -> bool:
+    """
+    Simple filter function, returns true if the passed filename is considered
+    to be a bin file which needs to be considered for the purpose of creating
+    a cache-key, but may be deleted after an initial build.
+
+    bin files are hashed, and their hashes are kept in a small separete file
+    for future use when building the cache key. So the hash is not lost, even if the binary
+    file is deleted.
+
+    Args:
+        filename (str): File path as a string
+
+    Returns:
+        bool: Whether the filename is a binary file in the above sense
+    """
+    return filename.lower().endswith(".bin")
+
+
+def create_dir_hash(
+    cmds: List[str],
+    build_dir: str,
+    filter_func: Callable[[str], bool] = is_source,
+    debug=False,
+) -> str:
+    """Create a hash of the (source file) contents of a build directory, used for
+    creating a cache key of an entire directory along with the build commands.
+
+    Args:
+        cmds (List[str]): Build commands to be incorporated in hash key computation
+        build_dir (str): Path to build directory ( not part of hash )
+        filter_func (Callable[[str], bool], optional): Filter function which determines whether a given file is considered a source file or not. Defaults to is_source(path).
+        debug (bool, optional): Whether to write a 'cache_key.log' file into the build directory, so that cache misses can be debugged more easily. Defaults to False.
+
+    Returns:
+        str: SHA256 Hash of the build directory contents in the form of a hexdigest string.
+    """
+
+    try:
+        hash_log = None
+        if debug:
+            hash_log = open(  # noqa: P201 - this is actually closed properly in the finally close below
+                os.path.join(build_dir, "cache_key.log"), mode="w", encoding="utf8"
+            )
+            hash_log.write(f"Building dir hash of {build_dir}\n")
+        basepath = Path(build_dir)
+        files = [p.relative_to(basepath) for p in basepath.rglob("*") if not p.is_dir()]
+        hash_object = hashlib.sha256()
+        for cmd in cmds:
+            _cmd = cmd.replace(
+                build_dir, "${BUILD_DIR}"
+            )  # Make sure we can cache regardless of the build directory location.
+            hash_object.update(_cmd.encode("utf-8"))
+        for fpath in sorted(files):
+            if not filter_func(str(fpath)):
+                continue
+            hash_object.update(str(fpath).encode("utf-8"))
+            fullpath = str(basepath / fpath)
+            with open(fullpath, "rb") as f:
+                # read file in chunks of 32kb
+                # in order to support large files ( constants.obj )
+                while True:
+                    chunk = f.read(1024 * 32)
+                    if not chunk:
+                        break
+                    hash_object.update(chunk)
+                if debug:
+                    hash_log.write(f"\t{str(fpath)} -> {hash_object.hexdigest()}\n")
+        if debug:
+            hash_log.write(
+                f"Final hash of {build_dir} is {hash_object.hexdigest().lower()}\n"
+            )
+        return hash_object.hexdigest().lower()
+    finally:
+        if hash_log:
+            hash_log.close()
+
+
+def write_binhash_file(
+    build_dir,
+    binhash_filename="constants.hash",
+    filter_func: Callable[[str], bool] = is_bin_file,
+):
+    """Hash all binary input files, so we don't have to keep them ( Usecase: constants.obj / constants.bin )
+
+    Args:
+        build_dir (str): Path to build directory
+        binhash_filename (str, optional): File to be written within build_dir, defaults to "constants.hash".
+        filter_func (Callable[[str], bool], optional): Filter function to determine which files to hash. Defaults to is_bin_file.
+    """
+    binhash = create_dir_hash([binhash_filename], build_dir, filter_func=filter_func)
+    with open(os.path.join(build_dir, binhash_filename), "w", encoding="utf-8") as f:
+        f.write(binhash)
+
+
+class BuildCache(ABC):
+    """
+    Abstract base class for build cache implementations
+    """
+
+    @abstractmethod
+    def retrieve_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        from_sources_filter_func: Callable[[str], bool] = is_source,
+    ) -> Tuple[bool, Optional[str]]:
+        """
+        Retrieves the build cache artifacts for the given build directory,
+        so that ideally no compilation needs to take place.
+
+        Args:
+            cmds (_type_): Build commands, these will be part of the hash used to calculate a lookup key
+            build_dir (str): Build directory. The source files, Makefile and some other files will be hashed and used to
+                             determine the build cache key.
+            from_sources_filter_func (Callable[[str], bool], optional): Filter function, which may be used to determine which files are being considered source files. Defaults to is_source.
+
+        Returns:
+            Tuple[bool, Optional[str]]: A tuple indicating whether the build cache was successfully retrieved, and a cache key (which should be passed on to store_build_cache on rebuild )
+        """
+        ...
+
+    @abstractmethod
+    def store_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        cache_key: str,
+        filter_func: Callable[[str], bool] = is_cache_artifact,
+    ) -> bool:
+        """
+        Store the build cache artifacts
+
+        Args:
+            cmds ( List[str]): Build commands, these will be part of the hash used to calculate a lookup key
+            build_dir (str): Path to build directory to retrieve build artifacts from
+            cache_key (str): Cache key, as returned from retrieve_build_cache
+            filter_func (Callable[[str], bool], optional): Filter function, which may be used to determine which files are being considered cacheable artifact files. Defaults to is_cache_artifact.
+
+        Returns:
+            bool: Whether the artifacts were successfully stored
+        """
+        ...
+
+    def maybe_cleanup(
+        self, lru_retention_hours: int = 72, cleanup_max_age_seconds: int = 3600
+    ):
+        """
+        Maybe clean up the build cache if its been longer than `cleanup_max_age_seconds` that it has been cleaned up
+
+        Args:
+            lru_retention_hours (int, optional): How many hours should unused elements be retained in the cache? Defaults to 72.
+            cleanup_max_age_seconds (int, optional): Cleanup interval in seconds. Defaults to 3600.
+        """
+        pass
+
+    def cleanup(self, retention_hours: int = 72):
+        """Do a cache cleanup.
+
+        Args:
+            retention_hours (int, optional): How many hours should unused elements be retained in the cache? Defaults to 72.
+        """
+        pass
+
+
+class NoBuildCache(BuildCache):
+    def __init__(self):
+        """
+        Dummy build cache implementation which does nothing.
+
+        For method docstrings, see parent class.
+        """
+        _LOGGER.info("Build cache disabled")
+
+    def retrieve_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        from_sources_filter_func: Callable[[str], bool] = is_source,
+    ) -> Tuple[bool, Optional[str]]:
+        return False, None
+
+    def store_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        cache_key: str,
+        filter_func: Callable[[str], bool] = is_cache_artifact,
+    ) -> bool:
+        pass
+
+
+class FileBasedBuildCache(BuildCache):
+    def __init__(
+        self,
+        cache_dir,
+        lru_retention_hours=72,
+        cleanup_max_age_seconds=3600,
+        debug=True,
+    ):
+        """Filesystem based build cache.
+
+        For method docstrings, see parent class.
+
+        Args:
+            cache_dir (str): Path to store cache data below. Should be an empty, temporary directory with enough space to hold the cache contents. Will be written to and deleted in!
+            lru_retention_hours (int, optional): Retention time for *unused* cache entries. Defaults to 72.
+            cleanup_max_age_seconds (int, optional): Minimum time between cache cleanups in seconds. After this time, a new cleanup gets triggered on next cache retrieval. Defaults to 3600.
+            debug (bool, optional): Whether to enable debugging cache key creation ( see debug parameter of create_dir_hash). Defaults to True. May be left at True, as it is usually helpful and  does not hurt performance.
+        """
+        self.cache_dir = cache_dir
+        self.lru_retention_hours = lru_retention_hours
+        self.cleanup_max_age_seconds = cleanup_max_age_seconds
+        self.debug = debug
+        _LOGGER.info(
+            f"Using file-based build cache, cache directory = {self.cache_dir}"
+        )
+
+    def retrieve_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        from_sources_filter_func: Callable[[str], bool] = is_source,
+    ) -> Tuple[bool, Optional[str]]:
+        """See docstring of implemented method interface in parent class"""
+
+        self.maybe_cleanup(self.lru_retention_hours, self.cleanup_max_age_seconds)
+        cache_dir = self.cache_dir
+        dir_hash = create_dir_hash(
+            cmds, build_dir, filter_func=from_sources_filter_func, debug=self.debug
+        )
+        key_cache_dir = os.path.join(cache_dir, dir_hash)
+        if os.path.exists(key_cache_dir):
+            _LOGGER.info(f"CACHE: Using cached build results for {build_dir}")
+            target_basepath = Path(build_dir)
+            src_basepath = Path(key_cache_dir)
+            copy_files = [
+                p.relative_to(src_basepath)
+                for p in src_basepath.rglob("*")
+                if not p.is_dir()
+            ]
+            for filepath in copy_files:
+                target_path = target_basepath / filepath
+                target_parent = target_path.parent
+                src_path = src_basepath / filepath
+                if target_parent != target_basepath:
+                    os.makedirs(str(target_parent), exist_ok=True)
+                shutil.copy(
+                    str(src_path),
+                    str(target_path),
+                    follow_symlinks=True,
+                )  # Using shutil.copy intentionally instead of copy2, so the file modification time is updated, and file owner
+                # is not copied. When you retrieve the file from cache, it is yours.
+                _LOGGER.debug(f"CACHE: retrieved {filepath}")
+            # make sure the last modified timestamp is updated, so we can
+            # evict cache directories which are too old using a separate script
+            os.utime(key_cache_dir)
+            return True, dir_hash
+        _LOGGER.info(f"CACHE: No results found for {build_dir}")
+        return False, dir_hash
+
+    def store_build_cache(
+        self,
+        cmds: List[str],
+        build_dir: str,
+        cache_key: str,
+        filter_func: Callable[[str], bool] = is_cache_artifact,
+    ) -> bool:
+        """See docstring of implemented method interface in parent class"""
+        cache_dir = self.cache_dir
+        key_cache_dir = os.path.join(cache_dir, cache_key)
+
+        # We create a temporary directory first, so we can do an
+        # atomic update later to prevent race conditions
+        # in a distributed / parallel build setting
+        random_str = secrets.token_hex(16)
+
+        # the temp_cache_dir will be renamed to key_cache_dir
+        # atomically later. It needs to be on same file system
+        # for atomic rename, so we put it into the same folder.
+        temp_cache_dir = key_cache_dir + f".{random_str}.tmp"
+        try:
+            os.makedirs(temp_cache_dir, exist_ok=False)
+        except OSError:
+            _LOGGER.warn(
+                f"CACHE: Failed to create tempdir {temp_cache_dir}. Cannot write cache entries."
+            )
+            return False
+        basepath = Path(build_dir)
+        target_basepath = Path(temp_cache_dir)
+        copy_files = [
+            p.relative_to(basepath) for p in basepath.rglob("*") if not p.is_dir()
+        ]
+        for filepath in copy_files:
+            src_path = basepath / filepath
+            if not filter_func(str(filepath)):
+                continue
+
+            target_path = target_basepath / filepath
+            target_parent = target_path.parent
+            if target_parent != target_basepath:
+                os.makedirs(str(target_parent), exist_ok=True)
+            shutil.copy2(
+                str(src_path),
+                str(target_path),
+                follow_symlinks=True,
+            )  # Use copy2, so the file metadata (incl. last modified time) is preserved
+            _LOGGER.info(f"CACHE: storing {filepath} into {key_cache_dir}: ")
+        try:
+            os.rename(
+                temp_cache_dir, key_cache_dir
+            )  # Atomic update to prevent race condition
+            return True
+        except OSError:
+            _LOGGER.info(
+                f"CACHE: update race conflict - {key_cache_dir} already exists. (Note: No error! This can be expected to happen occasionally.))"
+            )
+            shutil.rmtree(temp_cache_dir, ignore_errors=True)
+            return False
+
+    def maybe_cleanup(
+        self, lru_retention_hours: int = 72, cleanup_max_age_seconds: int = 3600
+    ):
+        """See docstring of implemented method interface in parent class"""
+        last_cleaned_seconds = file_age(os.path.join(self.cache_dir, ".last_cleaned"))
+        if last_cleaned_seconds > cleanup_max_age_seconds:
+            self.cleanup(lru_retention_hours)
+
+    def cleanup(self, lru_retention_hours: int = 72):
+        """See docstring of implemented method interface in parent class"""
+        _LOGGER.info(
+            f"CACHE: Cleaning up build cache below {self.cache_dir}. Folders last used more than {lru_retention_hours} hours ago will be deleted."
+        )
+        touch(os.path.join(self.cache_dir, ".last_cleaned"))
+        if os.path.isdir(self.cache_dir):
+            now = datetime.now()
+            age_limit = timedelta(hours=lru_retention_hours)
+
+            for dirpath in os.scandir(self.cache_dir):
+                if os.path.isdir(dirpath):
+                    # Get the modification time of the directory and convert it to a datetime object
+                    mtime = os.path.getmtime(dirpath)
+                    modification_time = datetime.fromtimestamp(mtime)
+
+                    # Check if the directory is older than N hours
+                    if now - modification_time > age_limit:
+                        _LOGGER.info(f"CACHE: Deleting {dirpath}")
+                        shutil.rmtree(dirpath)
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 6ec0cf4a7..fed10dd4d 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -20,7 +20,6 @@
 
 import logging
 import multiprocessing
-
 import os
 import re
 import shlex
@@ -31,6 +30,9 @@
 
 import jinja2
 
+from aitemplate.backend.build_cache import BUILD_CACHE
+from aitemplate.backend.build_cache_base import write_binhash_file
+
 from aitemplate.backend.target import Target
 from aitemplate.backend.task_runner import BaseRunner, Task
 
@@ -141,34 +143,43 @@ def _log_error_context(
                 _LOGGER.info(f"{path}:\n\n{summary}")
 
 
-def _run_make_cmds(cmds, timeout, build_dir):
+def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
     _LOGGER.debug(f"make {cmds=}")
-    proc = subprocess.Popen(
-        [" && ".join(cmds)],
-        shell=True,
-        env=os.environ.copy(),
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    try:
-        out, err = proc.communicate(timeout)
-    except subprocess.TimeoutExpired as e:
-        proc.kill()
-        out, err = proc.communicate()
-        raise e
-    finally:
-        stdout = out.decode()
-        stderr = err.decode()
-        if proc.returncode != 0:
-            _LOGGER.info(f"make stdout:\n\n{stdout}")
-            _LOGGER.info(f"make stderr:\n\n{stderr}")
-
-            _log_error_context(stderr, build_dir)
-
-            raise RuntimeError("Build has failed.")
-        else:
-            _LOGGER.debug(f"make stdout:\n\n{stdout}")
-            _LOGGER.debug(f"make stderr:\n\n{stderr}")
+    if allow_cache:
+        cached_results_available, store_cache_key = BUILD_CACHE.retrieve_build_cache(
+            cmds, build_dir
+        )
+    else:
+        cached_results_available, store_cache_key = False, None
+    if not cached_results_available:
+        proc = subprocess.Popen(  # noqa: P204
+            [" && ".join(cmds)],
+            shell=True,
+            env=os.environ.copy(),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        try:
+            out, err = proc.communicate(timeout)
+        except subprocess.TimeoutExpired as e:
+            proc.kill()
+            out, err = proc.communicate()
+            raise e
+        finally:
+            stdout = out.decode()
+            stderr = err.decode()
+            if proc.returncode != 0:
+                _LOGGER.info(f"make stdout:\n\n{stdout}")
+                _LOGGER.info(f"make stderr:\n\n{stderr}")
+
+                _log_error_context(stderr, build_dir)
+
+                raise RuntimeError("Build has failed.")
+            else:
+                _LOGGER.debug(f"make stdout:\n\n{stdout}")
+                _LOGGER.debug(f"make stderr:\n\n{stderr}")
+        if store_cache_key is not None:
+            BUILD_CACHE.store_build_cache(cmds, build_dir, store_cache_key)
 
 
 def process_task(task: Task) -> None:
@@ -779,6 +790,13 @@ def make_profilers(self, generated_profilers, workdir):
             return
         build_dir = shlex.quote(os.path.join(workdir, "profiler"))
         self._gen_makefile_for_profilers(file_pairs, build_dir)
+        # Write compiler version string(s) into build directory, so these can be used as part of cache key
+        self._gen_compiler_version_files(build_dir)
+
+        # hash all .bin files and write hash into it, so we can use their hash to build the cache key,
+        # even if we delete the actual .bin file afterwards
+        write_binhash_file(build_dir)
+
         make_path = shlex.quote(Target.current().make())
         make_flags = " ".join(
             [
@@ -789,12 +807,60 @@ def make_profilers(self, generated_profilers, workdir):
         make_clean_cmd = f" {make_path} {make_flags} clean "
         make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
         cmds = [make_clean_cmd, make_all_cmd]
-        _run_make_cmds(cmds, self._timeout, build_dir)
+        _run_make_cmds(cmds, self._timeout, build_dir, allow_cache=True)
+
+    def _gen_compiler_version_files(self, target_dir):
+        # Write compiler version string(s) into build directory
+        # for cache invalidation purposes (different compiler versions
+        # should not reuse same cached build artifacts )
+        cc = Target.current().cc()
+        compilers = {"main_compiler": cc}
+        if "nvcc" in cc:
+            ccbin_match = re.search(r'-ccbin "?([^ "]+)', cc)
+            if ccbin_match:
+                nvcc_host_compiler = ccbin_match.group(1)
+            else:
+                nvcc_host_compiler = "g++"  # default, using PATH resolution
+            compilers["nvcc_host_compiler"] = nvcc_host_compiler
+
+        # Write compiler version string(s)
+        # into the build directory, to enable using them for cache hash determination
+        for compiler_name, compiler_cmd in compilers.items():
+            try:
+                version_bytes = subprocess.check_output([compiler_cmd, "--version"])
+                with open(
+                    os.path.join(target_dir, compiler_name + ".version"),
+                    "wb",  # version_bytes is bytes obj
+                ) as fh:
+                    fh.write(version_bytes)
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                _LOGGER.warn("CACHE: Could not determine version of host compiler.")
+                # This will always invalidate the cache, due to the inclusion of a timestamp
+                with open(
+                    os.path.join(target_dir, compiler_name + ".error.version"),
+                    "w",
+                    encoding="utf-8",
+                ) as fh:
+                    fh.write(f"Could not determine version of {compiler_cmd}\n")
 
     def make(
-        self, file_pairs, dll_name, workdir, test_name, debug_settings=_DEBUG_SETTINGS
+        self,
+        file_pairs,
+        dll_name,
+        workdir,
+        test_name,
+        debug_settings=_DEBUG_SETTINGS,
+        allow_cache=True,
     ):
         self.gen_makefile(file_pairs, dll_name, workdir, test_name, debug_settings)
+
+        # Write compiler version string(s) into build directory, so these can be used as part of cache key
+        self._gen_compiler_version_files(os.path.join(workdir, test_name))
+
+        # hash all .bin files and write hash into it, so we can use their hash to build the cache key,
+        # even if we delete the actual .bin file afterwards
+        write_binhash_file(os.path.join(workdir, test_name))
+
         make_path = shlex.quote(Target.current().make())
         build_dir = shlex.quote(os.path.join(workdir, test_name))
         make_flags = " ".join(
@@ -809,4 +875,4 @@ def make(
         cmds = [make_clean_cmd, make_all_cmd]
         if not is_debug():
             cmds.append(make_clean_constants_cmd)
-        _run_make_cmds(cmds, self._timeout, build_dir)
+        _run_make_cmds(cmds, self._timeout, build_dir, allow_cache=allow_cache)
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index c8e3bc714..ee8e489cd 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -15,11 +15,13 @@
 """
 CUDA target specialization
 """
+import hashlib
 import json
 import logging
 import os
 import pipes
 import re
+import secrets
 import shutil
 import sys
 import tempfile
@@ -39,8 +41,10 @@
 )
 
 from aitemplate.utils import environ
+from aitemplate.utils.io import copytree_with_hash
 from aitemplate.utils.misc import is_debug
 
+
 # pylint: disable=C0415,W0707,W0611,W0702,W1401
 
 
@@ -200,22 +204,53 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         static_files_path = parutil.get_dir_path("aitemplate/AITemplate/static")
         self._include_path = None
         if not FBCUDA.cutlass_path_:
-            self._include_path = tempfile.mkdtemp()
-
+            # Copy all of the includes over into an include directory
+            random_key = secrets.token_hex(16)
+            # the random_key part of this path will later be renamed to the content hash
+            self._include_path = os.path.join(
+                tempfile.gettempdir(), "aitemplate_tmp", random_key, "includes"
+            )
+            includes_content_hash = hashlib.sha256()
             FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
             self.cub_path_ = self._include_path + "/cub"
-            shutil.copytree(cutlass_src_path, FBCUDA.cutlass_path_)
-            shutil.copytree(cub_src_path, self.cub_path_)
-
+            # copy recursively, and update a content hash in one go
+            copytree_with_hash(
+                cutlass_src_path, FBCUDA.cutlass_path_, hash=includes_content_hash
+            )
+            copytree_with_hash(cub_src_path, self.cub_path_, hash=includes_content_hash)
             attention_src_path = parutil.get_dir_path(
                 "aitemplate/AITemplate/python/aitemplate/backend/cuda/attention/src"
             )
             attention_include_path = self._include_path + "/att_include"
-            shutil.copytree(attention_src_path, attention_include_path)
+            copytree_with_hash(
+                attention_src_path, attention_include_path, hash=includes_content_hash
+            )
             ait_static_include_path = self._include_path + "/static"
-            shutil.copytree(
-                static_files_path + "/include/kernels", ait_static_include_path
+            copytree_with_hash(
+                static_files_path + "/include/kernels",
+                ait_static_include_path,
+                hash=includes_content_hash,
             )
+            # Now we have a content hash over all include contents
+            include_hash_digest = includes_content_hash.hexdigest()
+            # Prepare to rename atomically
+            old_path = os.path.join(tempfile.gettempdir(), "aitemplate_tmp", random_key)
+            new_path = os.path.join(
+                tempfile.gettempdir(), "aitemplate_tmp", include_hash_digest
+            )
+            # if it already exists, we don't want to overwrite it
+            # we can just delete our copy.
+            try:
+                os.rename(old_path, new_path)
+            except OSError:
+                # target directory with identical contents already exists
+                shutil.rmtree(old_path)  # No need to keep out copy
+
+            # set the include paths to the final variant
+            self._include_path = os.path.join(new_path, "includes")
+            self.cub_path_ = self._include_path + "/cub"
+            FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
+
         self.cutlass_path_ = FBCUDA.cutlass_path_
 
         cutlass_lib_path = parutil.get_dir_path(
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 43d9acf9f..fb0d9dba9 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -35,6 +35,7 @@
     Model,
     TorchTensor,
 )
+from aitemplate.compiler.transform.name_graph import reset_name_counters
 from aitemplate.compiler.transform.profile import elapsed_dt_sec
 from aitemplate.utils import graph_utils
 from aitemplate.utils.debug_settings import AITDebugSettings
@@ -212,6 +213,7 @@ def compile_model(
     if int(recompile) == 1:
         os.makedirs(test_dir, exist_ok=True)
         with target:
+            reset_name_counters()
             graph = compiler.transform.toposort(tensor)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "toposort")
 
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 71896d9a3..94fea4a45 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -31,6 +31,17 @@
 user_provided_dim = set()
 
 
+def reset_name_counters():
+    global func_cnt
+    global tensor_cnt
+    global func_name_to_tensor_cnt
+    global MEMO
+    func_cnt = 0
+    tensor_cnt = 0
+    func_name_to_tensor_cnt = {}
+    MEMO = set()
+
+
 def valid_c_name(name):
     return re.sub(r"\W|^(?=\d)", "_", name)
 
@@ -52,6 +63,8 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
     ----------
     sorted_graph : List[Tensor]
         Input graph to be named
+    reset_counters : bool
+        If True, reset counters which are used to name tensors and functions. (Default: False)
     """
     global func_cnt
     global tensor_cnt
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 44c5f40eb..739606684 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -17,6 +17,7 @@
 """
 import logging
 import os
+from typing import Optional
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -66,3 +67,17 @@ def shorten_tensor_names_for_plots() -> bool:
     making the graph representation significantly simpler.
     """
     return os.getenv("AIT_PLOT_SHORTEN_TENSOR_NAMES", "0") == "1"
+
+
+def ait_build_cache_dir() -> Optional[str]:
+    """
+    When set to a non-empty string, cache the build artifacts
+    below this directory for significantly faster builds.
+
+    See aitemplate.backend.build_cache
+
+    Returns:
+        Optional[str]: Value of AIT_BUILD_CACHE_DIR environment variable,
+        or None if not set.
+    """
+    return os.environ.get("AIT_BUILD_CACHE_DIR", None)
diff --git a/python/aitemplate/utils/io.py b/python/aitemplate/utils/io.py
new file mode 100644
index 000000000..91caa39eb
--- /dev/null
+++ b/python/aitemplate/utils/io.py
@@ -0,0 +1,209 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Util functions to handle file or network io
+"""
+import hashlib
+import os
+import tarfile
+import time
+from io import BytesIO
+from pathlib import Path
+from typing import Optional, Union
+
+
+def touch(file_path):
+    """
+    Emulates the Linux 'touch' command by creating an empty file if it doesn't exist, or updating the modified timestamp if it does.
+
+    :param file_path: str: The path to the file to be created or updated.
+    :return: None
+    """
+    if not os.path.exists(file_path):
+        p = Path(file_path)
+        # ensure parent directory exists
+        os.makedirs(str(p.parent), exist_ok=True)
+        open(file_path, "w").close()
+
+    # Update the modified timestamp
+    os.utime(file_path)
+
+
+def file_age(file_path):
+    """
+    Returns the age of a file in seconds since its last modified timestamp.
+
+    :param file_path: str: The path to the file.
+    :return: float: The age of the file in seconds.
+    """
+    if not os.path.isfile(file_path):
+        return 3600 * 24 * 1000.0
+
+    # Get the current time and the file's last modification time
+    current_time = time.time()
+    file_mtime = os.path.getmtime(file_path)
+
+    # Calculate the file age in seconds
+    file_age_seconds = current_time - file_mtime
+
+    return file_age_seconds
+
+
+# Utility functions to be used by (not yet existing) distributed cache implementations
+# to minimize the amount of network roundtrips and network bandwidth needed
+
+
+def create_archive(directory_path: str, filter_func=None) -> bytes:
+    """Create tar.gz archive in-memory and return the archive contents as
+    a bytes object.
+
+    Args:
+        directory_path (str): Directory to create archive of.
+        filter_func (_type_, optional): A function which, being passed a filename,
+                                        returns whether to include it or not.
+                                        Defaults to None (include all).
+
+    Returns:
+        bytes: Archive contents as a bytes object.
+    """
+    # Archive files in a directory.
+
+    # Create an in-memory bytes buffer
+    buffer = BytesIO()
+
+    # Determine the appropriate compression mode
+    compression_mode = None
+    compression_mode = "w:gz"
+
+    # Create a new archive file
+    with tarfile.open(fileobj=buffer, mode=compression_mode) as archive:
+        # Walk through the directory tree and add each file to the archive
+        for root, _, files in os.walk(directory_path):
+            for _file in files:
+                # Check if the file should be included based on the filter function
+                if filter_func is not None:
+                    file_basename = os.path.basename(_file)
+                    file_root, file_extension = os.path.splitext(_file)
+                    if not filter_func(file_basename, file_extension):
+                        continue
+
+                # Calculate the relative path of the file
+                relative_path = os.path.relpath(
+                    os.path.join(root, _file), directory_path
+                )
+
+                # Add the file to the archive with the relative path
+                archive.add(os.path.join(root, _file), arcname=relative_path)
+
+    # Get the bytes from the buffer
+    buffer.seek(0)
+    compressed_bytes = buffer.read()
+
+    return compressed_bytes
+
+
+def extract_archive(
+    archive_bytes: bytes, target_directory: str, overwrite: bool = False
+):
+    """Extract a tar.gz archive (written for example via create_archive) from a bytes buffer
+    into a target directory.
+
+    Args:
+        archive_bytes (bytes): Byte contents of the tar.gz archive to be extracted.
+        target_directory (str): Target directory to extract to.
+        overwrite (bool, optional): Whether to overwrite files or not.
+                                    If False, files will be silently skipped
+                                    if they already exist. Defaults to False.
+    """
+    # Create an in-memory bytes buffer
+    buffer = BytesIO(archive_bytes)
+
+    archive = tarfile.open(fileobj=buffer, mode="r:gz")
+
+    # Extract the archive contents into the target directory
+    for member in archive.getmembers():
+        # Calculate the full path of the extracted file or directory
+        target_path = os.path.join(target_directory, member.name)
+
+        # Check if the file or directory already exists
+        if os.path.exists(target_path):
+            if not overwrite:
+                continue
+            else:
+                os.remove(target_path)
+
+        # Extract the file or directory from the archive
+        archive.extract(member, target_directory)
+
+    # Close the archive object
+    archive.close()
+
+
+def copytree_with_hash(
+    src_path: Union[Path, str],
+    dst_path: Union[Path, str],
+    buffer_size=1024 * 1024,
+    hash: Optional[hashlib.sha256] = None,
+    max_depth: int = 20,
+) -> Optional[str]:
+    """Copy a directory and its contents recursively, while at the same time calculating a hash over each file and filename.
+
+    :param src_path: Path: The path to the source directory.
+    :param dst_path: Path: The path to the destination directory.
+    :param buffer_size: int: The buffer size to read and write data in.
+    :param hash: Optional[hashlib.sha256]: The hash to use for calculating the hash. ( Default: None)
+    :max_depth: int : The maximum recursion depth. Default: 20
+    :return: None, if a hash instance was passed. Otherwise, the hash of the copied data and path names.
+    """
+
+    if hash is None:
+        hash_obj = hashlib.sha256()
+    else:
+        hash_obj = hash
+    if isinstance(src_path, str):
+        src_path = Path(src_path)
+    if isinstance(dst_path, str):
+        dst_path = Path(dst_path)
+    if dst_path.exists():
+        dst_path = dst_path.resolve()
+        if not dst_path.is_dir():
+            raise OSError("Target path exists and is not a directory.")
+        dst_path = dst_path / src_path.name
+    hash_obj.update(dst_path.name.encode("utf-8"))
+    if src_path.is_file():
+        # Copy the file to the destination
+        with open(dst_path, "wb") as dst_file:
+            with open(src_path, "rb") as src_file:
+                while True:
+                    data = src_file.read(buffer_size)
+                    if not data:
+                        break
+                    hash_obj.update(data)
+                    dst_file.write(data)
+    elif src_path.is_symlink():
+        new_src_path = src_path.resolve()
+        copytree_with_hash(new_src_path, dst_path, buffer_size, hash_obj, max_depth - 1)
+    elif src_path.is_dir():
+        # Recursively copy the directory contents
+        os.makedirs(dst_path, exist_ok=True)
+        for sub_path in src_path.iterdir():
+            sub_dst_path = dst_path / sub_path.name
+            copytree_with_hash(
+                sub_path, sub_dst_path, buffer_size, hash_obj, max_depth - 1
+            )
+    else:
+        raise OSError(f"Source path {src_path} is neither file, directory nor symlink.")
+    if hash is None:
+        return hash_obj.hexdigest()
diff --git a/tests/unittest/backend/test_build_cache.py b/tests/unittest/backend/test_build_cache.py
new file mode 100644
index 000000000..79b543675
--- /dev/null
+++ b/tests/unittest/backend/test_build_cache.py
@@ -0,0 +1,240 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import os
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+
+import torch
+from aitemplate.backend.build_cache_base import (
+    create_dir_hash,
+    FileBasedBuildCache,
+    is_source,
+)
+from aitemplate.backend.cuda.target_def import FBCUDA
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import filter_test_cases_by_test_env
+from aitemplate.utils.debug_settings import AITDebugSettings
+from aitemplate.utils.io import file_age
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class BuildCacheTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _create_model_graph(self):
+        dtype = "float32"
+        X1 = Tensor(
+            shape=[IntImm(1), IntImm(10)],
+            dtype=dtype,
+            name="X1",
+            is_input=True,
+        )
+        Y = ops.expand()(X1, shape=(10, 10))
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        return Y
+
+    def test_file_build_cache(self):
+
+        with tempfile.TemporaryDirectory() as parent_dir:
+            cache_dir = os.path.join(parent_dir, "build_cache")
+            shutil.rmtree(cache_dir, ignore_errors=True)
+            cache = FileBasedBuildCache(
+                cache_dir,
+                lru_retention_hours=0,
+                cleanup_max_age_seconds=1000,
+                debug=True,
+            )
+            cache.maybe_cleanup()
+            assert os.path.exists(cache_dir + "/.last_cleaned")
+            assert (
+                file_age(cache_dir + "/.last_cleaned") < 10.0
+            ), "Last clean time should  than 10 seconds"
+
+            build_dir_1 = os.path.join(parent_dir, "build_1")
+            build_dir_2 = os.path.join(parent_dir, "build_2")
+
+            os.makedirs(build_dir_1, exist_ok=False)
+            os.makedirs(build_dir_2, exist_ok=False)
+            for build_dir in [build_dir_1, build_dir_2]:
+                bp = Path(build_dir)
+                (bp / "Makefile").write_text("test.exe: test.cu")
+                (bp / "test.cu").write_text("printf('Hello, World!');")
+            assert create_dir_hash(
+                [f"make {build_dir_1}"], build_dir_1
+            ) == create_dir_hash([f"make {build_dir_2}"], build_dir_2)
+            found_entry1, cache_key1 = cache.retrieve_build_cache(
+                [f"make {build_dir_1}"], build_dir_1
+            )
+            found_entry2, cache_key2 = cache.retrieve_build_cache(
+                [f"make {build_dir_2}"], build_dir_2
+            )
+            assert not found_entry1
+            assert not found_entry2
+            assert cache_key1 == cache_key2
+            assert cache_key1 == create_dir_hash([f"make {build_dir_1}"], build_dir_1)
+            (Path(build_dir_2) / "test.obj").write_bytes("ELF1234".encode("ascii"))
+            cache.store_build_cache([f"make {build_dir_2}"], build_dir_2, cache_key2)
+            assert os.path.exists(os.path.join(cache_dir, cache_key2))
+            found_entry1, cache_key1 = cache.retrieve_build_cache(
+                [f"make {build_dir_1}"], build_dir_1
+            )
+            assert os.path.exists(os.path.join(build_dir_1, "test.obj"))
+            assert (
+                Path(os.path.join(build_dir_1, "test.obj")).read_bytes()
+                == Path(os.path.join(build_dir_2, "test.obj")).read_bytes()
+            )
+
+    def test_deterministic_codegen(self, dtype="float32"):
+        # Tests, whether repeated invocation of compilation results in identical generated source files
+        test_name = "test_deterministic_codegen"
+        basepath = "./tmp"
+
+        # Clean previous test results. These are usually kept for debugging purposes
+        # but we need a clean slate here.
+        if os.path.exists(basepath):
+            existing_dirs = [
+                d
+                for d in os.listdir(basepath)
+                if d.startswith(test_name) and os.path.isdir(os.path.join(basepath, d))
+            ]
+            for d in existing_dirs:
+                oldpath = os.path.join(basepath, d)
+                if os.path.exists(oldpath) and test_name in oldpath:
+                    shutil.rmtree(oldpath)
+        else:
+            os.mkdir(basepath)
+
+        Y = self._create_model_graph()
+        target = detect_target()
+        debug_settings = AITDebugSettings(gen_standalone=False)
+        dll_name = "test.so"
+        build_dir = os.path.join("./tmp", test_name)
+        compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name + "_1",
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+        hash1 = create_dir_hash(["test_name"], build_dir + "_1", is_source, debug=True)
+        Y = self._create_model_graph()
+        target = detect_target()
+        # Variant 2: Clean build
+        compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name + "_2",
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+        hash2 = create_dir_hash(["test_name"], build_dir + "_2", is_source, debug=True)
+        assert (
+            hash1 == hash2
+        ), "Code generation was not deterministic. Cache key mismatch between first and second code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+        # Variant 3: Build over existing build dir
+        Y = self._create_model_graph()
+        target = detect_target()
+        compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name + "_2",
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+        hash3 = create_dir_hash(["test_name"], build_dir + "_2", is_source, debug=True)
+        assert (
+            hash2 == hash3
+        ), "Code generation was not deterministic. Cache key mismatch between second and third code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+
+        # Variant 4: Let's provoke to copy the includes again, maybe to a new path?
+        Y = self._create_model_graph()
+        FBCUDA.cutlass_path_ = None
+        compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name + "_4",
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+        hash4 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
+
+        assert (
+            hash3 == hash4
+        ), "Code generation was not deterministic. Cache key mismatch between third and fourth code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+
+        with open(
+            os.path.join(build_dir + "_4", "Makefile"), "a", encoding="utf-8"
+        ) as f:
+            f.write("\n")
+
+        hash5 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
+        assert (
+            hash4 != hash5
+        ), "Directory hash was not sensitive to a change in the Makefile, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+        with open(
+            os.path.join(build_dir + "_4", "anything.cu"), "w", encoding="utf-8"
+        ) as f:
+            f.write("// Nothing, really\n")
+
+        hash6 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
+        assert (
+            hash6 != hash5
+        ), "Directory hash was not sensitive to a change in a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+
+        os.rename(
+            os.path.join(build_dir + "_4", "anything.cu"),
+            os.path.join(build_dir + "_4", "anything_.cu"),
+        )
+        hash7 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
+        assert (
+            hash7 != hash6
+        ), "Directory hash was not sensitive to a change of name of a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+
+        Y = self._create_model_graph()
+        target = detect_target()
+        debug_settings = AITDebugSettings(gen_standalone=True)
+        compile_model(
+            Y,
+            target,
+            "./tmp",
+            test_name + "_8",
+            dll_name=dll_name,
+            debug_settings=debug_settings,
+        )
+        hash8 = create_dir_hash(["test_name"], build_dir + "_8", is_source, debug=True)
+
+        assert (
+            hash8 != hash1
+        ), "Directory hash was not sensitive to a change of Makefile (standalone codegen) and possibly source code, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+
+
+filter_test_cases_by_test_env(BuildCacheTestCase)
+
+if __name__ == "__main__":
+    unittest.main()

From dd65d68863d78449ff43ea4d59dd0fc69a6d399e Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 30 Mar 2023 09:33:14 -0700
Subject: [PATCH 360/638] Accept multiple sources in make_jagged (#519)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/519

`make_jagged` currently accepts only a single `source` and `offsets_list`, returning a jagged Tensor made from those. This may be inefficient for the cases where there are multiple jagged Tensors in the graph with the same `total_length` dimension / same offsets (e.g., in the group b2b bmm). In this case, `make_jagged` would have to be invoked as many times, doing the same offset validation on the GPU in every run. The latter is a waste of time.

In this diff, `make_jagged` is extended to accept multiple source Tensors (with the same first `total_length` dimension) and convert each to a jagged Tensor.

The extension is necessary for the upcoming graph transformation pass eliminating multiple `make_jagged` ops within the graph by pulling them upstream / applying `make_jagged` to the sets of source input Tensors with the same `total_length` first dimension. The pass will be added in a follow-up diff.

Reviewed By: chenyang78

Differential Revision: D44508562

fbshipit-source-id: 9424ae3428b2b05a0bea04af7c7c42d2e5de0a92
---
 .../backend/cuda/view_ops/make_jagged.py      |  17 +--
 .../compiler/ops/common/view_ops.py           |  97 +++++++++------
 tests/unittest/ops/test_make_jagged.py        | 112 +++++++++++++++++-
 3 files changed, 183 insertions(+), 43 deletions(-)

diff --git a/python/aitemplate/backend/cuda/view_ops/make_jagged.py b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
index d2e248360..9e17a0503 100644
--- a/python/aitemplate/backend/cuda/view_ops/make_jagged.py
+++ b/python/aitemplate/backend/cuda/view_ops/make_jagged.py
@@ -223,7 +223,7 @@
 {% endfor %}
 {{indent}}  {{offsets_var_name}},
 {{indent}}  &{{batch_dim_name}},
-{{indent}}  {{source_first_dim_name}},
+{{indent}}  {{total_length_name}},
 {{indent}}  stream
 {{indent}});
 """,
@@ -251,7 +251,8 @@ def _get_jagged_dynamic_bound_dims(jagged_int_var: JaggedIntVar) -> Set[IntVar]:
 @registry.reg("cuda.make_jagged.gen_function")
 def make_jagged_gen_function(func_attrs):
     func_name = func_attrs["name"]
-    offsets_list = func_attrs["inputs"][1:]
+    num_sources = func_attrs["num_sources"]
+    offsets_list = func_attrs["inputs"][num_sources:]
     backend_spec = CUDASpec()
 
     output = func_attrs["outputs"][0]
@@ -305,7 +306,8 @@ def make_jagged_gen_function(func_attrs):
 @registry.reg("cuda.make_jagged.func_decl")
 def make_jagged_gen_function_decl(func_attrs):
     func_name = func_attrs["name"]
-    offsets_list = func_attrs["inputs"][1:]
+    num_sources = func_attrs["num_sources"]
+    offsets_list = func_attrs["inputs"][num_sources:]
     backend_spec = CUDASpec()
 
     output = func_attrs["outputs"][0]
@@ -325,8 +327,9 @@ def make_jagged_gen_function_decl(func_attrs):
 @registry.reg("cuda.make_jagged.func_call")
 def make_jagged_gen_function_call(func_attrs, indent="  "):
     func_name = func_attrs["name"]
-    source = func_attrs["inputs"][0]
-    offsets_list = func_attrs["inputs"][1:]
+    num_sources = func_attrs["num_sources"]
+    total_length = func_attrs["inputs"][0]._attrs["shape"][0]
+    offsets_list = func_attrs["inputs"][num_sources:]
     output = func_attrs["outputs"][0]
     jagged_int_var = output._attrs["shape"][0]
 
@@ -335,7 +338,7 @@ def make_jagged_gen_function_call(func_attrs, indent="  "):
     ]
     offsets_data_names = [offsets._attrs["name"] for offsets in offsets_list]
     batch_dim_name = jagged_int_var.batch_dim()._attrs["name"]
-    source_first_dim_name = source._attrs["shape"][0]._attrs["name"]
+    total_length_name = total_length._attrs["name"]
 
     jagged_dynamic_bound_names = [
         dim._attrs["name"] for dim in _get_jagged_dynamic_bound_dims(jagged_int_var)
@@ -349,6 +352,6 @@ def make_jagged_gen_function_call(func_attrs, indent="  "):
         offsets_first_dim_names=offsets_first_dim_names,
         offsets_data_names=offsets_data_names,
         batch_dim_name=batch_dim_name,
-        source_first_dim_name=source_first_dim_name,
+        total_length_name=total_length_name,
         jagged_dynamic_bound_names=jagged_dynamic_bound_names,
     )
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 01a91ed16..8eaace3c6 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -629,7 +629,7 @@ def _infer_shapes(self, x: Tensor) -> List[IntVar]:
 
 class make_jagged(_view):
     """
-    Creates a jagged Tensor from a normal Tensor, offsets, and metadata.
+    Creates jagged Tensors from normal Tensors, offsets, and metadata.
 
     Jagged Tensors are normal Tensors with the first dynamic dimensions
     represented with a JaggedIntVar instance (as opposed to a vanilla
@@ -668,12 +668,14 @@ class make_jagged(_view):
             docstrings for the details.
 
     __call__ Args:
-        source : Tensor
-            The source Tensor of the jagged Tensor created by this op.
-            The jagged Tensor is a view of the source Tensor. The main
-            difference is that the resulting jagged Tensor's first
-            dimension is set to a JaggedIntVar, constructed from the
-            batch_dim, jagged_dims, and the offsets_list.
+        source : Union[Tensor, List[Tensor]]
+            One or more source Tensors of the jagged Tensor(s) created by this op.
+            The jagged Tensor is a view of the source Tensor. The main difference
+            is that the resulting jagged Tensor's first dimension is set to a
+            JaggedIntVar, constructed from the batch_dim, jagged_dims, and the
+            offsets_list. The same JaggedIntVar instance is set as the first
+            dimension of every resulting jagged Tensor: one for each source
+            Tensor in the `source`.
         offsets_list : List[Tensor]
             The list of rank-1 offsets Tensors describing the variable-length
             layout of each of the jagged_dims. There must be exactly as many
@@ -681,6 +683,11 @@ class make_jagged(_view):
             the jagged_dims list. Each offsets Tensor is associated with the
             corresponding JaggedDim before constructing a JaggedIntVar from
             them for the resulting jagged Tensor.
+
+    Returns:
+        Union[Tensor, List[Tensor]]
+            The resulting jagged Tensor or a list thereof, depending on whether
+            the `source` argument is a Tensor or a List[Tensor].
     """
 
     def __init__(
@@ -722,18 +729,37 @@ def _set_jagged_dim_offsets(self, offsets_list: List[Tensor]):
                     )
             jagged_dim._attrs["offsets"] = offsets
 
-    def _infer_shapes(self, source: Tensor) -> List[IntVar]:
-        jagged_int_var = JaggedIntVar(
-            batch_dim=self._attrs["batch_dim"],
-            jagged_dims=self._attrs["jagged_dims"],
-            total_length=source._attrs["shape"][0],
-        )
+    def __call__(
+        self,
+        source: Union[Tensor, List[Tensor]],
+        offsets_list: List[Tensor],
+    ) -> Tensor:
+        sources_list = [source] if isinstance(source, Tensor) else source
+
+        if not sources_list:
+            raise ValueError("There must be at least one source Tensor in the list.")
 
-        return [jagged_int_var] + source._attrs["shape"][1:]
+        for s in sources_list:
+            if len(s._attrs["shape"]) == 0:
+                raise ValueError(
+                    "The source Tensors must be at least rank-1, but given rank-0."
+                )
+            if type(s._attrs["shape"][0]) != IntVar:
+                raise ValueError(
+                    "The source Tensor's first dim (total_length) must be "
+                    f"dynamic (IntVar), but given {s._attrs['shape']=}."
+                )
+
+        total_length = sources_list[0]._attrs["shape"][0]
+        for s in sources_list[1:]:
+            if s._attrs["shape"][0] != total_length:
+                raise ValueError(
+                    "All source Tensors must have the same first (total_length) dimension, "
+                    f"but got {s[0]._attrs['shape']=}, {s._attrs['shape']=}."
+                )
 
-    def __call__(self, source: Tensor, offsets_list: List[Tensor]) -> Tensor:
-        if source.is_jagged():
-            # already a jagged Tensor
+        if isinstance(total_length, JaggedIntVar):
+            # already jagged Tensors
             return source
 
         jagged_dims = self._attrs["jagged_dims"]
@@ -752,28 +778,31 @@ def __call__(self, source: Tensor, offsets_list: List[Tensor]) -> Tensor:
                     "The offsets Tensors can be either int32 or int64, "
                     f"but given the Tensor of type {offsets._attrs['dtype']}."
                 )
-        if len(source._attrs["shape"]) == 0:
-            raise ValueError(
-                "The source Tensor must be at least rank-1, but given rank-0."
-            )
-        if type(source._attrs["shape"][0]) != IntVar:
-            raise ValueError(
-                "The source Tensor's first dim (total_length) must be dynamic (IntVar), "
-                f"but given {type(source._attrs['shape'][0]).__name__}."
-            )
 
-        self._attrs["inputs"] = [source, *offsets_list]
+        self._attrs["num_sources"] = len(sources_list)
+        self._attrs["inputs"] = sources_list + offsets_list
         self._set_depth()
         self._set_jagged_dim_offsets(offsets_list)
-        output_shape = self._infer_shapes(source)
-        output = Tensor(
-            shape=output_shape,
-            src_ops={self},
-            is_view_of=source,
+
+        jagged_int_var = JaggedIntVar(
+            batch_dim=self._attrs["batch_dim"],
+            jagged_dims=self._attrs["jagged_dims"],
+            total_length=total_length,
         )
-        self._attrs["outputs"] = [output]
 
-        return output
+        outputs = [
+            Tensor(
+                shape=[jagged_int_var] + s._attrs["shape"][1:],
+                src_ops={self},
+                is_view_of=s,
+            )
+            for s in sources_list
+        ]
+        self._attrs["outputs"] = outputs
+        if isinstance(source, Tensor):
+            outputs = outputs[0]
+
+        return outputs
 
     def _get_op_attributes(self):
         return {
diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
index ba3c3dacc..f82d3a970 100644
--- a/tests/unittest/ops/test_make_jagged.py
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -124,13 +124,13 @@ def _test_make_jagged(
     def test_make_jagged(self):
         self._test_make_jagged(
             check_sequence_lengths=True,
-            test_name="make_jagged",
+            test_name="test_make_jagged",
         )
 
     def test_make_jagged_no_seq_len_check(self):
         self._test_make_jagged(
             check_sequence_lengths=False,
-            test_name="make_jagged_no_seq_len_check",
+            test_name="test_make_jagged_no_seq_len_check",
         )
 
     def test_make_jagged_with_dynamic_bounds(
@@ -230,6 +230,114 @@ def test_make_jagged_with_dynamic_bounds(
 
         torch.testing.assert_close(result, result_pt)
 
+    def test_make_jagged_multiple_sources(
+        self,
+        num_sources=3,
+        dtype="float16",
+        offsets_dtype="int32",
+    ):
+        B = 4
+        N = 3
+        D = 64
+
+        batch_dim = IntVar(name="batch_size", values=[1, B])
+        max_seq_dim = IntImm(name="max_seq_len", value=N)
+        embedding_dim = IntImm(name="embedding", value=D)
+
+        total_length_dim = IntVar(name="total_length", values=[0, B * N])
+        offsets_dim = IntVar(name="offsets_size", values=[2, B + 1])
+
+        SOURCES = [
+            Tensor(
+                shape=[
+                    total_length_dim,
+                    embedding_dim,
+                ],
+                name=f"source_{i}",
+                dtype=dtype,
+                is_input=True,
+            )
+            for i in range(num_sources)
+        ]
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        DENSE = Tensor(
+            shape=[
+                batch_dim,
+                max_seq_dim,
+                embedding_dim,
+            ],
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        JAGGEDS = ops.make_jagged(
+            batch_dim=batch_dim,
+            jagged_dims=[
+                JaggedDim(
+                    min_value=0,
+                    max_value=max_seq_dim,
+                )
+            ],
+        )(
+            source=SOURCES,
+            offsets_list=OFFSETS_LIST,
+        )
+
+        RESULT = DENSE
+        for JAGGED in JAGGEDS:
+            RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, RESULT)
+
+        assert all(not SOURCE.is_jagged() for SOURCE in SOURCES)
+        assert not DENSE.is_jagged()
+        assert all(JAGGED.is_jagged() for JAGGED in JAGGEDS)
+        assert RESULT.is_jagged()
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            "test_make_jagged_multiple_sources",
+        )
+
+        offsets = [0, 1, 4, 6, 7]
+        torch_offsets_type = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = torch.tensor(offsets, dtype=torch_offsets_type).cuda()
+        sources_pt = {
+            f"source_{i}": get_random_torch_tensor([offsets[-1], D], dtype=dtype)
+            for i in range(num_sources)
+        }
+        dense_pt = get_random_torch_tensor([B, N, D], dtype=dtype)
+
+        sources_list_pt = list(sources_pt.values())
+        summed_sources_pt = torch.clone(sources_list_pt[0])
+        for source_pt in sources_list_pt[1:]:
+            summed_sources_pt += source_pt
+        result_pt = add_jagged_dense_ref(
+            jagged=summed_sources_pt,
+            offsets_list=[offsets_pt],
+            jagged_max_shape=[B, N, D],
+            dense=dense_pt,
+        )
+        result = torch.empty_like(result_pt)
+
+        inputs = {**sources_pt, "offsets": offsets_pt, "dense": dense_pt}
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt, rtol=1e-2, atol=1e-2)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 88c13054061bf48c320ca1209fe4d5931117f0b9 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 30 Mar 2023 17:13:18 -0700
Subject: [PATCH 361/638] Update rocm_ci.yml

change the CI trigger to PR with label 'rocm_ci'
---
 .github/workflows/rocm_ci.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 7c55cabb1..650746839 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -1,11 +1,9 @@
 name: ROCM_CI
 
 on: 
-  pull_request: 
-    types: 
-      - [opened, reopened]
-    branches: 
-      - 'ROCmSoftwarePlatform/AITemplate/**'
+  pull_request:
+     label:
+       - rocm_ci
 
 jobs:
   build:

From e59d7bda0163822ce9725940cb5bd06a74567726 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 30 Mar 2023 17:25:18 -0700
Subject: [PATCH 362/638] Update rocm_ci.yml

Change the label to "module: rocm"
---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 650746839..8fe351719 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -3,7 +3,7 @@ name: ROCM_CI
 on: 
   pull_request:
      label:
-       - rocm_ci
+       - module: rocm
 
 jobs:
   build:

From a799ed25b2d6f02dadcffb36c1b6d6f784b2d1f3 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 30 Mar 2023 18:32:07 -0700
Subject: [PATCH 363/638] Check if PR label contains rocm

---
 .github/workflows/rocm_ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 8fe351719..2ac66a2a3 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -2,11 +2,11 @@ name: ROCM_CI
 
 on: 
   pull_request:
-     label:
-       - module: rocm
+    types: [labeled]
 
 jobs:
   build:
+    if: contains(github.event.pull_request.labels.*.name, 'rocm')
     runs-on: rocm
 
     steps:

From c291450a5777317d7bb2c18b6dafcf9628fd7cbf Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 30 Mar 2023 19:21:34 -0700
Subject: [PATCH 364/638] enabled to merge concat ops with multiple dst_ops
 (#471)

Summary:
Previously, our transform_memory_ops pass was only able to merge concat ops with a single dst_op. This PR extended the pass to hanlde cases where the first concat may take multiple dst ops. The basic idea is that we generate a slice op for each non-concat dst op of the first concat's output. This slice op consumes the first concat's output and feeds into the original non-concat dst op.

Currently, we restrict our implementation to elementwise ops and bmm_ccr being extra dst ops as they may be fused with the input slice. We will support more strided ops with corresponding tests later.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/471

Reviewed By: ipiszy

Differential Revision: D44313936

Pulled By: chenyang78

fbshipit-source-id: daa5e737ad98f552e9e44fa6fb02dd7b2a63fcef
---
 .../aitemplate/compiler/transform/__init__.py |   1 +
 .../compiler/transform/move_view_ops.py       |  77 ++-
 .../compiler/transform/optimize_graph.py      |   3 +
 .../transform/transform_memory_ops.py         | 298 ++++++++-
 .../transform/transform_merge_slice_ops.py    | 138 +++++
 .../unittest/compiler/test_merge_slice_ops.py | 519 ++++++++++++++++
 tests/unittest/compiler/test_move_view_ops.py | 377 +++++++++++-
 .../compiler/test_transform_memory_ops.py     | 563 +++++++++++++++++-
 8 files changed, 1922 insertions(+), 54 deletions(-)
 create mode 100644 python/aitemplate/compiler/transform/transform_merge_slice_ops.py
 create mode 100644 tests/unittest/compiler/test_merge_slice_ops.py

diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index 083a7e853..ff080c52f 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -42,6 +42,7 @@
 from aitemplate.compiler.transform.split_large_split_ops import split_large_split_ops
 from aitemplate.compiler.transform.toposort import toposort
 from aitemplate.compiler.transform.transform_memory_ops import transform_memory_ops
+from aitemplate.compiler.transform.transform_merge_slice_ops import merge_slice_ops
 from aitemplate.compiler.transform.transform_odd_alignment import (
     transform_odd_alignment,
 )
diff --git a/python/aitemplate/compiler/transform/move_view_ops.py b/python/aitemplate/compiler/transform/move_view_ops.py
index 5f9c1e2db..294340848 100644
--- a/python/aitemplate/compiler/transform/move_view_ops.py
+++ b/python/aitemplate/compiler/transform/move_view_ops.py
@@ -19,8 +19,8 @@
 import copy
 from typing import Callable, List, Optional, Tuple
 
+from aitemplate.compiler import ops
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
-
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.compiler.transform import transform_utils
 from aitemplate.compiler.transform.toposort import toposort
@@ -128,7 +128,9 @@ def _call_view_op(
 
 
 def _try_move_view_op(
-    first_cat: Operator, second_cat: Operator, view_op: Operator
+    first_cat: Operator,
+    second_cat: Operator,
+    view_op: Operator,
 ) -> bool:
     """
     Try to move the view_op to the front of the first_cat.
@@ -169,13 +171,30 @@ def _try_move_view_op(
         new_view_output_shapes.append(input_view_shape)
     # Now we start modifying the graph.
     # make a new output tensor for the first cat
-    new_first_cat_output = Tensor(original_view_shape, first_cat_output._attrs["name"])
+    new_first_cat_output = Tensor(
+        original_view_shape,
+        first_cat_output._attrs["name"],
+        dtype=first_cat_output.dtype(),
+    )
     transform_utils.replace_tensor(first_cat_output, new_first_cat_output)
     first_cat._attrs["outputs"][0] = new_first_cat_output
     new_first_cat_output._attrs["src_ops"].add(first_cat)
 
-    # remove the old view op
-    transform_utils.remove_view_op_from_sorted_graph(view_op)
+    for dst_op in new_first_cat_output._attrs["dst_ops"]:
+        dst_op_type = dst_op._attrs["op"]
+        if dst_op_type in _SUPPORTED_VIEW_OPS:
+            # we've ensured all view ops have the same output shape before entering
+            # this function, so it's safe to remove the old view ops
+            transform_utils.remove_view_op_from_sorted_graph(dst_op)
+        else:
+            # we need to place a view op as we've changed the concat's output shape
+            new_view_output = ops.reshape()(
+                new_first_cat_output, first_cat_output.shape()
+            )
+            transform_utils.replace_tensor_for_op(
+                dst_op, new_first_cat_output, new_view_output
+            )
+
     # make a new view op for each first_cat's original input and place it between
     # the original input and the first cat
     new_first_cat_inputs = []
@@ -224,6 +243,28 @@ def _is_valid_cat_op(cat: Operator) -> bool:
     return True
 
 
+def _get_valid_view_op_and_second_cat(
+    view_ops: List[Operator],
+) -> Tuple[Operator, Operator]:
+    """
+    Return the view op and the second cat if we can find such a pair
+    """
+    view_op = None
+    second_cat = None
+    for a_view_op in view_ops:
+        view_op_output = a_view_op._attrs["outputs"][0]
+        next_next_ops = view_op_output._attrs["dst_ops"]
+        next_concats = [n for n in next_next_ops if n._attrs["op"] == "concatenate"]
+        # only allow a single concat in the view_op's dst_ops
+        if len(next_concats) != 1:
+            continue
+        if _is_valid_cat_op(next_concats[0]):
+            view_op = a_view_op
+            second_cat = next_concats[0]
+            break
+    return (view_op, second_cat)
+
+
 def _move_view_op_before_concat(
     sorted_graph: List[Tensor],
 ) -> Tuple[bool, List[Tensor]]:
@@ -247,20 +288,26 @@ def _move_view_op_before_concat(
         if first_cat_output._attrs["is_output"]:
             continue
         next_ops = first_cat_output._attrs["dst_ops"]
-        if len(next_ops) != 1:
+        if len(next_ops) == 0:
             continue
-        view_op = list(next_ops)[0]
-        # skip if the next op is not one of the supported view ops
-        if view_op._attrs["op"] not in _SUPPORTED_VIEW_OPS:
+        view_ops = [op for op in next_ops if op._attrs["op"] in _SUPPORTED_VIEW_OPS]
+        # skip if none of the next ops is one of the supported view ops
+        if len(view_ops) == 0:
             continue
-        view_op_output = view_op._attrs["outputs"][0]
-        if view_op_output._attrs["is_output"]:
+        a_view_op = view_ops[0]
+        view_output_shape = a_view_op._attrs["outputs"][0].shape()
+        # handle a special case where the all view_ops have the same output shape
+        if len(view_ops) > 1 and not all(
+            shape_utils.is_same_shape(
+                vop._attrs["outputs"][0].shape(), view_output_shape
+            )
+            for vop in view_ops
+        ):
             continue
-        next_next_ops = view_op_output._attrs["dst_ops"]
-        if len(next_next_ops) != 1:
+        if any(vop._attrs["outputs"][0]._attrs["is_output"] for vop in view_ops):
             continue
-        second_cat = list(next_next_ops)[0]
-        if not _is_valid_cat_op(second_cat):
+        view_op, second_cat = _get_valid_view_op_and_second_cat(view_ops)
+        if second_cat is None:
             continue
         if _try_move_view_op(first_cat, second_cat, view_op):
             changed = True
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 45061a40c..2b0803b85 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -106,6 +106,9 @@ def optimize_graph(
         # op directly. After fuse_ops, there are only FusedElementwise ops.
         transform_special_ops,
         apply_padding,
+        # apply_padding may introduce new concats that can be fused
+        move_view_op_before_concat,
+        transform_memory_ops,
         transform_strided_ops,
         split_large_slice_scatter_ops,
         split_large_concat_ops,
diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index bb5904565..49d6cff3d 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -20,8 +20,11 @@
 
 from aitemplate.compiler.base import Operator, Tensor
 
+from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.tensor_accessor import TensorAccessor
-from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform import transform_strided_ops_utils, transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_merge_slice_ops import merge_slice_ops
 
 from aitemplate.utils import graph_utils, shape_utils
 
@@ -56,7 +59,106 @@ def _eliminate_cat(sorted_graph: List[Tensor]) -> List[Tensor]:
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
 
-def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
+def _update_cat_dst_ops(
+    first_cat: Operator, second_cat: Operator, cat_dim_offset: int
+) -> None:
+    """
+    Add all the strided dst_ops of the first cat to the second and
+    make an appropriate slice op between the second cat and each dst_ops.
+    cat_dim_offset represents the offset of the first cat output appearing
+    in the second cat along the cat_dim dimension.
+    """
+    first_cat_output = first_cat._attrs["outputs"][0]
+    first_cat_dst_ops = first_cat_output._attrs["dst_ops"]
+    # the first cat does not have any strided ops
+    if len(first_cat_dst_ops) <= 1:
+        return
+    first_cat_shape = first_cat_output.shape()
+    rank = len(first_cat_shape)
+    cat_dim = first_cat._attrs["concat_dim"]
+    assert transform_strided_ops_utils.cat_split_dim_is_static(
+        first_cat, cat_dim
+    ), f"expected the {cat_dim=} of {first_cat=} to be static"
+    second_cat_output = second_cat._attrs["outputs"][0]
+    # make start_indices and end_indices for the slice
+    for idx, first_cat_dst_op in enumerate(first_cat_dst_ops):
+        if first_cat_dst_op is second_cat:
+            continue
+        else:
+            # Make a new slice op. Note that it's fine we make a new slice op from
+            # another slice op, because consecutive slice ops will be merged
+            # by the merge_slice_ops pass
+            slice_start_indices = [0] * rank
+            slice_end_indices = [None] * rank
+            slice_start_indices[cat_dim] = cat_dim_offset
+            slice_end_indices[cat_dim] = (
+                cat_dim_offset + first_cat_shape[cat_dim].value()
+            )
+            slice_op = dynamic_slice()
+            slice_op_name = f'dynamic_slice_{idx}_{first_cat._attrs["name"]}'
+            slice_op._attrs["name"] = slice_op_name
+            slice_op._attrs["original_name"] = slice_op_name
+            slice_output = slice_op(
+                second_cat_output, slice_start_indices, slice_end_indices
+            )
+            slice_output._attrs["name"] = f"{slice_op_name}_0"
+            slice_output._attrs["dst_ops"].add(first_cat_dst_op)
+            # remove the old strided op from first cat's dst_ops
+            first_cat_dst_ops.remove(first_cat_dst_op)
+            # update the strided op's input to the newly-created slice output
+            first_cat_dst_op.replace_input_tensor(first_cat_output, slice_output)
+
+
+def _is_supported_dst_op_for_first_cat(
+    dst_op: Operator,
+) -> bool:
+    """
+    A helper function that returns True if the given dst_op is
+    * a supported strided op; or
+    * a view op that is only used by a supported stride op; or
+    * a view op that is indirectly (via another single-dst view op) used
+      by a supported strided op.
+    Note that technically, this checking is not necessary, because we could
+    let other passes process the likely fusion patterns related to
+    concat + strided_op. However, it seems to be safer if we could add
+    more tests similar to test_fuse_strided_cat_reshape_cat but with different
+    strided ops such as gemm/layernorm/etc. To be conservative, we only
+    enable the following patterns and will remove the restriction once we
+    have more test coverage.
+    """
+    view_ops = ["reshape", "flatten", "dynamic_slice", "squeeze", "unsqueeze"]
+    # FIXME: enable other ops with input_accessors
+    supported_strided_ops = ["elementwise", "fused_elementwise"]
+
+    def _supported_op_type(op_type):
+        if op_type in supported_strided_ops:
+            return True
+        return op_type.startswith("bmm_crr")
+
+    dst_op_type = dst_op._attrs["op"]
+    if _supported_op_type(dst_op_type):
+        return True
+    while dst_op_type in view_ops:
+        dst_op_outputs = dst_op._attrs["outputs"]
+        if len(dst_op_outputs) != 1:
+            return False
+        dst_op_output = dst_op_outputs[0]
+        if dst_op_output._attrs["is_output"]:
+            return False
+        next_dst_ops = dst_op_output._attrs["dst_ops"]
+        if len(next_dst_ops) != 1:
+            return False
+        dst_op = next_dst_ops[0]
+        dst_op_type = dst_op._attrs["op"]
+        if _supported_op_type(dst_op_type):
+            return True
+    return False
+
+
+def _check_first_cat(first_cat: Operator, second_cat: Operator) -> bool:
+    """
+    return True if the first cat is valid for fusion
+    """
     # Make sure input_accessors do not carry any strided information.
     # It may happen. For example, an input of the cat can be of a strided
     # tensor generated by slice, which takes another concat's output.
@@ -65,12 +167,125 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     #     y2 = slice(y1)
     #     y = cat(y1, y2)
     # In such a case, we cannot merge those two concat ops.
+    if not all(
+        accessor.actual_shapes is None
+        for accessor in first_cat._attrs["input_accessors"]
+    ):
+        return False
+    if not all(first_cat._attrs["input_masks"]):
+        return False
+
+    # we need to make sure all other dst ops except the second cat have input
+    # accessors for which we may generate valid strided information. We will
+    # leverage the input accessor by injecting a slice op between the merged
+    # cat and the strided op (e.g. add).
+    cat_dim = first_cat._attrs["concat_dim"]
+    first_cat_outputs = first_cat._attrs["outputs"]
+    assert (
+        len(first_cat_outputs) == 1
+    ), f"expected {first_cat_outputs=} to have a single output"
+    first_cat_output = first_cat_outputs[0]
+    first_cat_dst_ops = first_cat_output._attrs["dst_ops"]
+    if len(first_cat_dst_ops) == 1:
+        return True
+    if not transform_strided_ops_utils.cat_split_dim_is_static(first_cat, cat_dim):
+        return False
+    # we cannot leverage slice if any of the dimensions after cat_dim is dynamic
+    if not shape_utils.all_static_dimensions(first_cat_output.shape(), cat_dim):
+        return False
+
+    # we can fuse the first cat into the second only if all of the first cat's
+    # dst ops are valid
+    for dst_op in first_cat_dst_ops:
+        if dst_op is second_cat:
+            continue
+        if not _is_supported_dst_op_for_first_cat(dst_op):
+            return False
+        # merging first_cat and second_cat may introduce a cycle
+        if transform_utils.is_ancestor(dst_op, second_cat):
+            return False
+    return True
+
+
+def _check_second_cat(cat: Operator) -> bool:
+    """
+    return True if the second cat is valid for fusion
+    """
+    if len(cat._attrs["outputs"]) != 1:
+        return False
+    # Similar to the first cat, make sure the second cat's input_accessors
+    # do not carry any strided information.
     if not all(
         accessor.actual_shapes is None for accessor in cat._attrs["input_accessors"]
     ):
         return False
-    first_op_inputs = first_op._attrs["inputs"]
-    first_op_outputs = first_op._attrs["outputs"]
+    if not all(cat._attrs["input_masks"]):
+        return False
+    return True
+
+
+def _try_merge_cat_cat(first_cat: Operator, second_cat: Operator) -> bool:
+    if not _check_first_cat(first_cat, second_cat):
+        return False
+    if not _check_second_cat(second_cat):
+        return False
+    first_cat_inputs = first_cat._attrs["inputs"]
+    first_cat_outputs = first_cat._attrs["outputs"]
+    first_cat_output = first_cat_outputs[0]
+    second_cat_inputs = second_cat._attrs["inputs"]
+    second_cat_original_inputs = second_cat._attrs["original_inputs"]
+    new_cat_inputs = []
+    new_cat_original_inputs = []
+    new_cat_input_accessors = []
+    for i, second_cat_input in enumerate(second_cat_inputs):
+        if second_cat_input is first_cat_output:
+            new_cat_inputs.extend(first_cat._attrs["inputs"])
+            first_cat_original_inputs = first_cat._attrs["inputs"]
+            new_cat_original_inputs.extend(first_cat_original_inputs)
+            new_cat_input_accessors.extend(
+                copy.deepcopy(first_cat._attrs["input_accessors"])
+            )
+        else:
+            new_cat_inputs.append(second_cat_input)
+            new_cat_original_inputs.append(second_cat_original_inputs[i])
+            new_cat_input_accessors.append(second_cat._attrs["input_accessors"][i])
+
+    for tensor in new_cat_inputs:
+        if tensor in first_cat_outputs:
+            return False
+
+    # note that we have to compute cat_dim_offset before updating cat's inputs,
+    # because we determine the cat_dim_offset based on its old inputs
+    cat_dim_offset = 0
+    cat_dim = second_cat._attrs["concat_dim"]
+    for second_cat_input in second_cat._attrs["inputs"]:
+        if second_cat_input is first_cat_output:
+            break
+        cat_dim_offset += second_cat_input._size(cat_dim).value()
+
+    second_cat._attrs["inputs"] = new_cat_inputs
+    # make sure all of the input_masks values are True. We may need to
+    # change this part later when we have TensorAccessors, depending on
+    # the order of the transformations.
+    assert all(second_cat._attrs["input_masks"])
+    second_cat._attrs["input_accessors"] = new_cat_input_accessors
+    second_cat._attrs["original_inputs"] = list(new_cat_original_inputs)
+    second_cat._attrs["input_masks"] = [True] * len(new_cat_inputs)
+    for tensor in first_cat_inputs:
+        # the same tensor may be used multiple times
+        tensor._attrs["dst_ops"].discard(first_cat)
+        tensor._attrs["dst_ops"].add(second_cat)
+    # now we can move strided ops from the first cat to the merged cat with
+    # an appropriate slice op between the merged cat and each strided op
+    _update_cat_dst_ops(first_cat, second_cat, cat_dim_offset)
+    transform_utils.remove_tensor_from_sorted_graph(first_cat_output)
+    return True
+
+
+def _try_merge_split_cat(split_op: Operator, cat: Operator) -> bool:
+    # If split_op carries strided input_accessors, we skip it
+    split_op_inputs = split_op._attrs["inputs"]
+    split_op_outputs = split_op._attrs["outputs"]
     cat_inputs = cat._attrs["inputs"]
     cat_original_inputs = cat._attrs["original_inputs"]
     new_cat_inputs = []
@@ -79,29 +294,19 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     i = 0
     while i < len(cat_inputs):
         matched = True
-        for j, _ in enumerate(first_op_outputs):
+        for j, _ in enumerate(split_op_outputs):
             if (i + j >= len(cat_inputs)) or (
-                cat_inputs[i + j] is not first_op_outputs[j]
+                cat_inputs[i + j] is not split_op_outputs[j]
             ):
                 matched = False
                 break
         if matched:
-            new_cat_inputs.extend(first_op._attrs["inputs"])
-            # we may not have original_inputs/input_accessors, e.g. if first_op is split
-            if "original_inputs" in first_op._attrs:
-                original_inputs = first_op._attrs["original_inputs"]
-            else:
-                original_inputs = first_op._attrs["inputs"]
-            new_cat_original_inputs.extend(original_inputs)
-            if "input_accessors" in first_op._attrs:
-                new_cat_input_accessors.extend(
-                    copy.deepcopy(first_op._attrs["input_accessors"])
-                )
-            else:
-                new_cat_input_accessors.extend(
-                    [TensorAccessor(t) for t in original_inputs]
-                )
-            i += len(first_op_outputs)
+            # split doens't have "original_inputs" attribute
+            split_op_inputs = split_op._attrs["inputs"]
+            new_cat_inputs.extend(split_op_inputs)
+            new_cat_original_inputs.extend(split_op_inputs)
+            new_cat_input_accessors.extend([TensorAccessor(t) for t in split_op_inputs])
+            i += len(split_op_outputs)
         else:
             new_cat_inputs.append(cat_inputs[i])
             new_cat_original_inputs.append(cat_original_inputs[i])
@@ -109,7 +314,7 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
             i += 1
 
     for tensor in new_cat_inputs:
-        if tensor in first_op_outputs:
+        if tensor in split_op_outputs:
             return False
 
     cat._attrs["inputs"] = new_cat_inputs
@@ -120,11 +325,10 @@ def _try_merge_split_cat(first_op: Operator, cat: Operator) -> bool:
     cat._attrs["input_accessors"] = new_cat_input_accessors
     cat._attrs["original_inputs"] = list(new_cat_original_inputs)
     cat._attrs["input_masks"] = [True] * len(new_cat_inputs)
-    for tensor in first_op_inputs:
-        # the same tensor may be used multiple times
-        tensor._attrs["dst_ops"].discard(first_op)
+    for tensor in split_op_inputs:
+        tensor._attrs["dst_ops"].discard(split_op)
         tensor._attrs["dst_ops"].add(cat)
-    for tensor in first_op_outputs:
+    for tensor in split_op_outputs:
         transform_utils.remove_tensor_from_sorted_graph(tensor)
     return True
 
@@ -149,7 +353,12 @@ def _merge_split_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noqa: C
         cat = None
         found_cat_op = True
         for output_t in first_op._attrs["outputs"]:
-            if len(output_t._attrs["dst_ops"]) > 1:
+            # TODO: currently, we only allow concatenate output with multiple dst_ops.
+            # We may need to extend it to split ops.
+            if (
+                len(output_t._attrs["dst_ops"]) > 1
+                and first_op._attrs["op"] != "concatenate"
+            ):
                 found_cat_op = False
                 break
             # If first op is output, it can't be fused.
@@ -157,12 +366,14 @@ def _merge_split_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noqa: C
                 found_cat_op = False
                 continue
             next_ops = output_t._attrs["dst_ops"]
-            if len(next_ops) != 1:
+            if len(next_ops) == 0:
                 break
-            next_op = list(next_ops)[0]
-            if next_op._attrs["op"] != "concatenate":
+            next_concats = [n for n in next_ops if n._attrs["op"] == "concatenate"]
+            # only support cases where first_cat is consumed by a single concat
+            if len(next_concats) != 1:
                 found_cat_op = False
                 break
+            next_op = next_concats[0]
             if cat is None:
                 cat = next_op
             if next_op is not cat:
@@ -181,11 +392,31 @@ def _merge_split_and_cat(sorted_graph: List[Tensor]) -> List[Tensor]:  # noqa: C
             continue
 
         to_be_merged_ops.append([first_op, cat])
+        # only add first_op to the visited set to cases where
+        # we may have chained concat cases:
+        #     concat_0 = concat(x0...)
+        #     concat_1 = concat(concat_0...)
+        #     concat_2 = concat(concat_1...)
+        # where merging concat_0 and concat_1 is invalid but merging concat_1
+        # and concat_2 is valid. If we include both first_op and cat into
+        # the visited set, we would miss the opportunity of merging concat_1
+        # and concat_2.
         visited.add(first_op)
-        visited.add(cat)
 
+    updated_cat_cat = False
     for ops in to_be_merged_ops:
-        _try_merge_split_cat(ops[0], ops[1])
+        first_op_type = ops[0]._attrs["op"]
+        if first_op_type == "split":
+            _try_merge_split_cat(ops[0], ops[1])
+        elif first_op_type == "concatenate":
+            if _try_merge_cat_cat(ops[0], ops[1]):
+                updated_cat_cat = True
+        else:
+            raise AssertionError(f"unsupported {first_op_type=} for merging with cat")
+
+    # we adjusted input/output dependencies so need to run toposort again
+    if updated_cat_cat:
+        sorted_graph = toposort(sorted_graph)
 
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
@@ -229,6 +460,7 @@ def transform_memory_ops(
     funcs = [
         _eliminate_split_full_idx,
         _merge_split_and_cat,
+        merge_slice_ops,
         _eliminate_cat,
     ]
     num_ops = None
diff --git a/python/aitemplate/compiler/transform/transform_merge_slice_ops.py b/python/aitemplate/compiler/transform/transform_merge_slice_ops.py
new file mode 100644
index 000000000..ec6dbf004
--- /dev/null
+++ b/python/aitemplate/compiler/transform/transform_merge_slice_ops.py
@@ -0,0 +1,138 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This file implements a pass that merges consecutive slice ops if possible.
+"""
+from typing import List, Optional
+
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+
+from aitemplate.compiler.ops.tensor.dynamic_slice import MAX_INT32
+from aitemplate.compiler.transform import transform_utils
+
+from aitemplate.utils import shape_utils
+
+
+def _try_merge_slice_slice(
+    first_slice: Operator, second_slice: Operator, slice_dim: int
+) -> bool:
+    """
+    This function tries to merge two consecutive slice ops with the following
+    steps:
+        * update the start_indices and end_indices fields of the second_slice
+        * remove the first slice
+    """
+    first_slice_output = first_slice._attrs["outputs"][0]
+    first_slice_input_shape = first_slice._attrs["inputs"][0].shape()
+    second_slice_output = second_slice._attrs["outputs"][0]
+    second_slice_output_shape = second_slice_output.shape()
+    # note that all the dims of input_shape[slice_dim:] and output_shape[slice_dim:]
+    # are static at this point
+    for idx in range(slice_dim, first_slice_output._rank()):
+        first_slice_dim_offset = first_slice._attrs["start_indices"][idx]
+        # update the start and end indices of the second slice op
+        new_start = second_slice._attrs["start_indices"][idx] + first_slice_dim_offset
+        first_slice_input_dim = first_slice_input_shape[idx].value()
+        # new start index exceeds the corresponding dim value of the first slice input shape
+        if new_start >= first_slice_input_dim:
+            return False
+        new_end = new_start + second_slice_output_shape[idx].value()
+        # new end index exceeds the corresponding dim value of the first slice input shape
+        if new_end > first_slice_input_dim:
+            return False
+        first_slice_end = first_slice._attrs["end_indices"][idx]
+        second_slice_end = second_slice._attrs["end_indices"][idx]
+        if first_slice_end == MAX_INT32 == second_slice_end:
+            new_end = MAX_INT32
+        second_slice._attrs["start_indices"][idx] = new_start
+        second_slice._attrs["end_indices"][idx] = new_end
+    # remove the old strided op from the first cat's dst_ops
+    transform_utils.remove_single_tensor_op_from_sorted_graph(first_slice)
+    return True
+
+
+def _check_slice_op(slice_op: Operator, slice_dim: int) -> bool:
+    """
+    Return True if the slice_op's indices are valid for being merged
+    """
+    slice_shape = slice_op._attrs["outputs"][0].shape()
+    if not shape_utils.all_static_dimensions(slice_shape, slice_dim):
+        return False
+    # we expect normalized start_indices and end_indices
+    start_index = slice_op._attrs["start_indices"][slice_dim]
+    if start_index is None or start_index < 0:
+        return False
+    end_index = slice_op._attrs["end_indices"][slice_dim]
+    if end_index is None or end_index < 0 or end_index <= start_index:
+        return False
+    return True
+
+
+def _get_rightmost_non_dynamic_dim(shape: List[IntVar]) -> Optional[int]:
+    """
+    Return the index of the rightmost non-dynamic dim. For example, given
+    a shape [3, dyn_dim, 4, 1], it would return 2, which is the index of the
+    third dim.
+    Return None if shape[-1] is dynamic.
+    """
+    idx = 0
+    for dim in reversed(shape):
+        if not isinstance(dim, IntImm):
+            break
+        idx += 1
+    if idx == 0:
+        return None
+    return len(shape) - idx
+
+
+def merge_slice_ops(sorted_graph: List[Tensor]) -> List[Tensor]:
+    # a list of tuple(first_slice, second_slice, slice_dim)
+    to_be_merged = []
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) != 1:
+            continue
+        src_op = list(src_ops)[0]
+        if src_op._attrs["op"] != "dynamic_slice":
+            continue
+        first_slice = src_op
+        first_slice_output = first_slice._attrs["outputs"][0]
+        if first_slice_output._attrs["is_output"]:
+            continue
+        slice_dim = _get_rightmost_non_dynamic_dim(first_slice_output.shape())
+        if slice_dim is None:
+            continue
+        if not _check_slice_op(first_slice, slice_dim):
+            continue
+        next_ops = first_slice_output._attrs["dst_ops"]
+        if len(next_ops) != 1:
+            continue
+        next_op = next_ops[0]
+        if next_op._attrs["op"] != "dynamic_slice":
+            continue
+        second_slice = next_op
+        second_slice_output = second_slice._attrs["outputs"][0]
+        if first_slice_output._rank() != second_slice_output._rank():
+            continue
+        second_slice_dim = _get_rightmost_non_dynamic_dim(second_slice_output.shape())
+        if slice_dim != second_slice_dim:
+            continue
+        if not _check_slice_op(second_slice, slice_dim):
+            continue
+        to_be_merged.append([first_slice, second_slice, slice_dim])
+
+    for first_slice, second_slice, slice_dim in to_be_merged:
+        _try_merge_slice_slice(first_slice, second_slice, slice_dim)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/tests/unittest/compiler/test_merge_slice_ops.py b/tests/unittest/compiler/test_merge_slice_ops.py
new file mode 100644
index 000000000..18c5c1469
--- /dev/null
+++ b/tests/unittest/compiler/test_merge_slice_ops.py
@@ -0,0 +1,519 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+class MergeSliceOpsTestCase(unittest.TestCase):
+    BATCH_SIZE = 1024
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(MergeSliceOpsTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _test_slice_slice_basic(
+        self,
+        M0,
+        N0,
+        first_slice_start_indices,
+        first_slice_end_indices,
+        second_slice_start_indices,
+        second_slice_end_indices,
+        expected_ops_cnt,
+        expected_slice_cnt,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below
+        # add_0 = add(x0, x1)
+        # slice_1 = slice(add_0)
+        # slice_2 = slice(slice_1)
+        # y = concat(x2, slice_2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        slice_1 = ops.dynamic_slice()(
+            add_0,
+            start_indices=first_slice_start_indices,
+            end_indices=first_slice_end_indices,
+        )
+        slice_2 = ops.dynamic_slice()(
+            slice_1,
+            start_indices=second_slice_start_indices,
+            end_indices=second_slice_end_indices,
+        )
+        M2 = 3
+        N2 = slice_2.shape()[-1].value()
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N2)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        Y = ops.concatenate()([X2, slice_2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_ops_cnt)
+        slice_ops = [op for op in sorted_ops if op._attrs["op"] == "dynamic_slice"]
+        self.assertEqual(len(slice_ops), expected_slice_cnt)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x1_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N2], dtype)
+
+            first_slice_indices = [
+                slice(i, j)
+                for i, j in zip(first_slice_start_indices, first_slice_end_indices)
+            ]
+            second_slice_indices = [
+                slice(i, j)
+                for i, j in zip(second_slice_start_indices, second_slice_end_indices)
+            ]
+            add_0_pt = x0_pt + x1_pt
+            slice_1_pt = add_0_pt[first_slice_indices]
+            slice_2_pt = slice_1_pt[second_slice_indices]
+            y_pt = torch.cat([x2_pt, slice_2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
+
+    def test_slice_slice_basic(self):
+        self._test_slice_slice_basic(
+            M0=10,
+            N0=18,
+            first_slice_start_indices=[0, 2, 1],
+            first_slice_end_indices=[None, None, 15],
+            second_slice_start_indices=[0, 1, 3],
+            second_slice_end_indices=[None, None, 5],
+            expected_ops_cnt=3,
+            expected_slice_cnt=1,
+            test_name="slice_slice_basic_0",
+            dtype="float16",
+        )
+        self._test_slice_slice_basic(
+            M0=10,
+            N0=18,
+            first_slice_start_indices=[0, 2, 0],
+            first_slice_end_indices=[None, 10, None],
+            second_slice_start_indices=[0, 2, 0],
+            second_slice_end_indices=[None, 4, None],
+            expected_ops_cnt=2,
+            expected_slice_cnt=0,
+            test_name="slice_slice_basic_1",
+            dtype="float16",
+        )
+        self._test_slice_slice_basic(
+            M0=10,
+            N0=18,
+            first_slice_start_indices=[0, 2, 3],
+            first_slice_end_indices=[None, 10, 12],
+            second_slice_start_indices=[0, 2, 1],
+            second_slice_end_indices=[None, None, 6],
+            expected_ops_cnt=3,
+            expected_slice_cnt=1,
+            test_name="slice_slice_basic_2",
+            dtype="float16",
+        )
+
+    def _test_slice_slice_2(
+        self,
+        M0,
+        N0,
+        first_slice_start_indices,
+        first_slice_end_indices,
+        second_slice_start_indices,
+        second_slice_end_indices,
+        third_slice_start_indices,
+        third_slice_end_indices,
+        expected_ops_cnt,
+        expected_slice_cnt,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below
+        # add_0 = add(x0, x1)
+        # slice_1 = slice(add_0)
+        # slice_2 = slice(slice_1)
+        # slice_3 = slice(slice_2)
+        # y = add(slice_3, x2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        slice_1 = ops.dynamic_slice()(
+            add_0,
+            start_indices=first_slice_start_indices,
+            end_indices=first_slice_end_indices,
+        )
+        slice_2 = ops.dynamic_slice()(
+            slice_1,
+            start_indices=second_slice_start_indices,
+            end_indices=second_slice_end_indices,
+        )
+        slice_3 = ops.dynamic_slice()(
+            slice_2,
+            start_indices=third_slice_start_indices,
+            end_indices=third_slice_end_indices,
+        )
+        M2 = slice_3.shape()[-2].value()
+        N2 = slice_3.shape()[-1].value()
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N2)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        Y = ops.elementwise(FuncEnum.ADD)(slice_3, X2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_ops_cnt)
+        slice_ops = [op for op in sorted_ops if op._attrs["op"] == "dynamic_slice"]
+        self.assertEqual(len(slice_ops), expected_slice_cnt)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x1_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N2], dtype)
+
+            first_slice_indices = [
+                slice(i, j)
+                for i, j in zip(first_slice_start_indices, first_slice_end_indices)
+            ]
+            second_slice_indices = [
+                slice(i, j)
+                for i, j in zip(second_slice_start_indices, second_slice_end_indices)
+            ]
+            third_slice_indices = [
+                slice(i, j)
+                for i, j in zip(third_slice_start_indices, third_slice_end_indices)
+            ]
+            add_0_pt = x0_pt + x1_pt
+            slice_1_pt = add_0_pt[first_slice_indices]
+            slice_2_pt = slice_1_pt[second_slice_indices]
+            slice_3_pt = slice_2_pt[third_slice_indices]
+            y_pt = slice_3_pt + x2_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
+
+    def test_slice_slice_2(self):
+        self._test_slice_slice_2(
+            M0=20,
+            N0=30,
+            first_slice_start_indices=[0, 1, 2],
+            first_slice_end_indices=[None, 15, 28],
+            second_slice_start_indices=[0, 2, 2],
+            second_slice_end_indices=[None, 10, 9],
+            third_slice_start_indices=[0, 2, 1],
+            third_slice_end_indices=[None, 5, 3],
+            expected_ops_cnt=3,
+            expected_slice_cnt=1,
+            test_name="slice_slice_2",
+            dtype="float16",
+        )
+        self._test_slice_slice_2(
+            M0=20,
+            N0=30,
+            first_slice_start_indices=[0, 1, 2],
+            first_slice_end_indices=[None, 15, 28],
+            second_slice_start_indices=[0, 2, 2],
+            second_slice_end_indices=[None, None, 9],
+            third_slice_start_indices=[0, 2, 1],
+            third_slice_end_indices=[None, 5, None],
+            expected_ops_cnt=3,
+            expected_slice_cnt=1,
+            test_name="slice_slice_2",
+            dtype="float16",
+        )
+
+    def _test_slice_slice_3(
+        self,
+        input_shape,
+        first_slice_start_indices,
+        first_slice_end_indices,
+        second_slice_start_indices,
+        second_slice_end_indices,
+        expected_ops_cnt,
+        expected_slice_cnt,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below
+        # add_0 = add(x0, x0)
+        # slice_1 = slice(add_0)
+        # Y = slice(slice_1)
+        X0 = Tensor(
+            shape=input_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        slice_1 = ops.dynamic_slice()(
+            add_0,
+            start_indices=first_slice_start_indices,
+            end_indices=first_slice_end_indices,
+        )
+        Y = ops.dynamic_slice()(
+            slice_1,
+            start_indices=second_slice_start_indices,
+            end_indices=second_slice_end_indices,
+        )
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_ops_cnt)
+        slice_ops = [op for op in sorted_ops if op._attrs["op"] == "dynamic_slice"]
+        self.assertEqual(len(slice_ops), expected_slice_cnt)
+
+        x0_pt = get_random_torch_tensor(input_shape, dtype)
+
+        first_slice_indices = [
+            slice(i, j)
+            for i, j in zip(first_slice_start_indices, first_slice_end_indices)
+        ]
+        second_slice_indices = [
+            slice(i, j)
+            for i, j in zip(second_slice_start_indices, second_slice_end_indices)
+        ]
+        add_0_pt = x0_pt + x0_pt
+        slice_1_pt = add_0_pt[first_slice_indices]
+        y_pt = slice_1_pt[second_slice_indices]
+
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
+        inputs = {"x0": x0_pt}
+        outputs = [y]
+        module.run_with_tensors(inputs, outputs)
+        torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
+
+    def test_slice_slice_3(self):
+        self._test_slice_slice_3(
+            input_shape=[2, 3, 2],
+            first_slice_start_indices=[0, 1, 0],
+            first_slice_end_indices=[None, 2, None],
+            second_slice_start_indices=[0, 0, 1],
+            second_slice_end_indices=[None, None, 2],
+            expected_ops_cnt=2,
+            expected_slice_cnt=1,
+            test_name="slice_slice_3",
+            dtype="float16",
+        )
+        self._test_slice_slice_3(
+            input_shape=[2, 1, 10, 10, 10],
+            first_slice_start_indices=[0, 0, 1, 0, 0],
+            first_slice_end_indices=[None, None, -1, None, None],
+            second_slice_start_indices=[0, 0, 0, 1, 0],
+            second_slice_end_indices=[None, None, None, 2, None],
+            expected_ops_cnt=2,
+            expected_slice_cnt=1,
+            test_name="slice_slice_3",
+            dtype="float16",
+        )
+
+    def _test_non_fusible_slice_slice(
+        self,
+        M0,
+        N0,
+        first_slice_start_indices,
+        first_slice_end_indices,
+        second_slice_start_indices,
+        second_slice_end_indices,
+        expected_ops_cnt,
+        expected_slice_cnt,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below
+        # add_0 = add(x0, x1)
+        # slice_1 = slice(add_0)
+        # slice_2 = slice(slice_1)
+        # y = concat(x2, slice_1, slice_2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N0)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        slice_1 = ops.dynamic_slice()(
+            add_0,
+            start_indices=first_slice_start_indices,
+            end_indices=first_slice_end_indices,
+        )
+        slice_1_N = slice_1.shape()[-1].value()
+        slice_2 = ops.dynamic_slice()(
+            slice_1,
+            start_indices=second_slice_start_indices,
+            end_indices=second_slice_end_indices,
+        )
+        M2 = 3
+        N2 = slice_2.shape()[-1].value()
+        assert N0 == slice_1_N == N2, f"expected {N0=} == {slice_1_N=} == {N2=}"
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N2)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        Y = ops.concatenate()([X2, slice_1, slice_2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_ops_cnt)
+        slice_ops = [op for op in sorted_ops if op._attrs["op"] == "dynamic_slice"]
+        self.assertEqual(len(slice_ops), expected_slice_cnt)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x1_pt = get_random_torch_tensor([batch, M0, N0], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N2], dtype)
+
+            first_slice_indices = [
+                slice(i, j)
+                for i, j in zip(first_slice_start_indices, first_slice_end_indices)
+            ]
+            second_slice_indices = [
+                slice(i, j)
+                for i, j in zip(second_slice_start_indices, second_slice_end_indices)
+            ]
+            add_0_pt = x0_pt + x1_pt
+            slice_1_pt = add_0_pt[first_slice_indices]
+            slice_2_pt = slice_1_pt[second_slice_indices]
+            y_pt = torch.cat([x2_pt, slice_1_pt, slice_2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
+
+    def test_non_fusible_slice_slice(self):
+        self._test_non_fusible_slice_slice(
+            M0=10,
+            N0=18,
+            first_slice_start_indices=[0, 2, 0],
+            first_slice_end_indices=[None, 10, None],
+            second_slice_start_indices=[0, 2, 0],
+            second_slice_end_indices=[None, 4, None],
+            expected_ops_cnt=3,
+            expected_slice_cnt=1,
+            test_name="slice_slice_non_fusible",
+            dtype="float16",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
index 1a8cde5e8..94b933565 100644
--- a/tests/unittest/compiler/test_move_view_ops.py
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -504,6 +504,79 @@ def test_move_reshape_cat_2(self):
             dtype="float16",
         )
 
+    def _test_move_reshape_cat_3(self, M0, M1, M2, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # reshape_1 = reshape(concat_0)
+        # reshape_2 = reshape(x2)
+        # y = concatenate(reshape_2, reshape_1, reshape_2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        reshape_1 = ops.reshape()(concat_0, [-1, (M0 + M1) * N])
+        reshape_2 = ops.reshape()(X2, [-1, M2 * N])
+        Y = ops.concatenate()([reshape_2, reshape_1, reshape_2], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            reshape_1_pt = torch.reshape(concat_0_pt, [-1, (M0 + M1) * N])
+            reshape_2_pt = torch.reshape(x2_pt, [-1, M2 * N])
+            y_pt = torch.cat([reshape_2_pt, reshape_1_pt, reshape_2_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.05, rtol=0.05)
+
+    def test_move_reshape_cat_3(self):
+        self._test_move_reshape_cat_3(
+            M0=4,
+            M1=6,
+            M2=3,
+            N=4,
+            test_name="test_move_reshape_cat_3",
+            dtype="float16",
+        )
+
     def _test_move_strided_reshape_cat(
         self, M0, M1, M2, M3, N, test_name, dtype="float16"
     ):
@@ -1165,12 +1238,12 @@ def _test_move_strided_reshape_cat_7(
         self.test_count += 1
         sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-        self.assertEqual(len(sorted_ops), 7)
+        self.assertEqual(len(sorted_ops), 6)
         concat_cnt = 0
         for sorted_op in sorted_ops:
             if sorted_op._attrs["op"] == "concatenate":
                 concat_cnt += 1
-        self.assertEqual(concat_cnt, 2)
+        self.assertEqual(concat_cnt, 1)
 
         for batch in [1, self.BATCH_SIZE]:
             x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
@@ -1309,12 +1382,15 @@ def _test_move_strided_reshape_cat_8(
         self.test_count += 1
         sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-        self.assertEqual(len(sorted_ops), 10)
+        # dynamic_slice + bmm cannot be fused because we can't generate
+        # any valid strided access
+        self.assertEqual(len(sorted_ops), 9)
         concat_cnt = 0
         for sorted_op in sorted_ops:
-            if sorted_op._attrs["op"] == "concatenate":
+            op_type = sorted_op._attrs["op"]
+            if op_type == "concatenate":
                 concat_cnt += 1
-        self.assertEqual(concat_cnt, 3)
+        self.assertEqual(concat_cnt, 1)
 
         for batch in [1, self.BATCH_SIZE]:
             x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
@@ -1363,6 +1439,7 @@ def test_move_strided_reshape_cat_8(self):
             test_name="test_move_strided_reshape_cat_8",
             dtype="float16",
         )
+        return
         self._test_move_strided_reshape_cat_8(
             M0=4,
             M1=4,
@@ -1373,6 +1450,296 @@ def test_move_strided_reshape_cat_8(self):
             dtype="float16",
         )
 
+    def _test_move_strided_reshape_cat_9(
+        self, M0, M1, M2, M3, M7, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # bmm_crr_add_3 = bmm_crr_add(reshape_2, x4, x5) # 3d
+        # concat_4 = concatenate(x3, concat_1, x3) # 2d
+        # reshape_5 = reshape(concat_4) # 3d
+        # add_6 = add(reshape_5, x6) # 3d
+        # concat_7 = concatenate(x7, reshape_5, x7) # 3d
+        # reduce_8 = reduce_sum(bmm_crr_add_3)
+        # reduce_9 = reduce_sum(add_6)
+        # reduce_10 = reduce_sum(concat_7)
+        # add_11 = add(reduce_8, reduce_9)
+        # y = add(add_11, reduce_10)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3 * N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        X5 = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="x5",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)  # 2d
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)  # 2d
+        bmm_K = M0 + M2
+        reshape_2 = ops.reshape()(concat_1, [-1, bmm_K, N])
+        # bmm_crr_add_3[batch, N, N] = bmm_crr_add(
+        #     reshape_2[batch, bmm_K, N], X4[bmm_K, N], X5[N]
+        # )
+        bmm_crr_add_3 = ops.bmm_crr_add()(reshape_2, X4, X5)
+        concat_4 = ops.concatenate()([X3, concat_1, X3], dim=cat_dim)  # 2d
+        M6 = sum([t.shape()[cat_dim].value() for t in [X3, concat_1, X3]])
+        assert M6 % N == 0, f"expected {M6=} is divisible by {N=}"
+        M6 = M6 // N
+        reshape_5 = ops.reshape()(concat_4, [-1, M6, N])  # 3d
+        X6 = Tensor(
+            shape=[batch_dim, IntImm(M6), IntImm(N)],
+            dtype=dtype,
+            name="x6",
+            is_input=True,
+        )
+        add_6 = ops.elementwise(FuncEnum.ADD)(reshape_5, X6)
+        X7 = Tensor(
+            shape=[batch_dim, IntImm(M7), IntImm(N)],
+            dtype=dtype,
+            name="x7",
+            is_input=True,
+        )
+        concat_7 = ops.concatenate()([X7, reshape_5, X7], dim=cat_dim)  # 3d
+        reduce_dim = cat_dim
+        reduce_8 = ops.reduce_sum(reduce_dim)(bmm_crr_add_3)
+        reduce_9 = ops.reduce_sum(reduce_dim)(add_6)
+        reduce_10 = ops.reduce_sum(reduce_dim)(concat_7)
+        add_11 = ops.elementwise(FuncEnum.ADD)(reduce_8, reduce_9)
+        Y = ops.elementwise(FuncEnum.ADD)(add_11, reduce_10)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 8)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            # dynamic_slice is fused into add
+            self.assertTrue(op_type != "dynamic_slice")
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3 * N], dtype)
+            x4_pt = get_random_torch_tensor([M4, N], dtype)
+            x5_pt = get_random_torch_tensor([N], dtype)
+            x6_pt = get_random_torch_tensor([batch, M6, N], dtype)
+            x7_pt = get_random_torch_tensor([batch, M7, N], dtype)
+
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, bmm_K, N])
+            reshape_2_trans_pt = torch.transpose(reshape_2_pt, -2, -1)
+            bmm_crr_add_3_pt = torch.matmul(reshape_2_trans_pt, x4_pt) + x5_pt
+            concat_4_pt = torch.cat([x3_pt, concat_1_pt, x3_pt], dim=cat_dim)
+            reshape_5_pt = torch.reshape(concat_4_pt, [-1, M6, N])
+            add_6_pt = reshape_5_pt + x6_pt
+            concat_7_pt = torch.cat([x7_pt, reshape_5_pt, x7_pt], dim=cat_dim)
+            reduce_8_pt = torch.sum(bmm_crr_add_3_pt, reduce_dim)
+            reduce_9_pt = torch.sum(add_6_pt, reduce_dim)
+            reduce_10_pt = torch.sum(concat_7_pt, reduce_dim)
+            add_11_pt = reduce_8_pt + reduce_9_pt
+            y_pt = add_11_pt + reduce_10_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x5": x5_pt,
+                "x6": x6_pt,
+                "x7": x7_pt,
+            }
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_move_strided_reshape_cat_9(self):
+        self._test_move_strided_reshape_cat_9(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=4,
+            M7=8,
+            N=4,
+            test_name="test_move_strided_reshape_cat_9",
+            dtype="float16",
+        )
+
+    def _test_move_strided_reshape_cat_multi_dsts(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # bmm_crr_add_3 = bmm_crr_add(reshape_2, x4, x5) # 3d
+        # reshape_4 = reshape(concat_1) # 3d
+        # concat_5 = concatenate(x3, reshape_4, x3) # 3d
+        # reduce_8 = reduce_sum(bmm_crr_add_3)
+        # reduce_9 = reduce_sum(concat_5)
+        # y = add(reduce_8, reduce_9)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        X5 = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="x5",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)  # 2d
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)  # 2d
+        bmm_K = M0 + M2
+        reshape_2 = ops.reshape()(concat_1, [-1, bmm_K, N])
+        # bmm_crr_add_3[batch, N, N] = bmm_crr_add(
+        #     reshape_2[batch, bmm_K, N], X4[bmm_K, N], X5[N]
+        # )
+        bmm_crr_add_3 = ops.bmm_crr_add()(reshape_2, X4, X5)
+        reshape_to_shape_4 = M0 + M2
+        reshape_4 = ops.reshape()(concat_1, [-1, reshape_to_shape_4, N])  # 3d
+        concat_5 = ops.concatenate()([X3, reshape_4, X3], dim=cat_dim)  # 2d
+        reduce_dim = cat_dim
+        reduce_8 = ops.reduce_sum(reduce_dim)(bmm_crr_add_3)
+        reduce_9 = ops.reduce_sum(reduce_dim)(concat_5)
+        Y = ops.elementwise(FuncEnum.ADD)(reduce_8, reduce_9)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 6)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([M4, N], dtype)
+            x5_pt = get_random_torch_tensor([N], dtype)
+
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, bmm_K, N])
+            reshape_2_trans_pt = torch.transpose(reshape_2_pt, -2, -1)
+            bmm_crr_add_3_pt = torch.matmul(reshape_2_trans_pt, x4_pt) + x5_pt
+            reshape_4_pt = torch.reshape(concat_1_pt, [-1, reshape_to_shape_4, N])
+            concat_5_pt = torch.cat([x3_pt, reshape_4_pt, x3_pt], dim=cat_dim)
+            reduce_8_pt = torch.sum(bmm_crr_add_3_pt, reduce_dim)
+            reduce_9_pt = torch.sum(concat_5_pt, reduce_dim)
+            y_pt = reduce_8_pt + reduce_9_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x5": x5_pt,
+            }
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_move_strided_reshape_cat_multi_dsts(self):
+        self._test_move_strided_reshape_cat_multi_dsts(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=4,
+            N=4,
+            test_name="test_move_strided_reshape_cat_multi_dsts",
+            dtype="float16",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_transform_memory_ops.py b/tests/unittest/compiler/test_transform_memory_ops.py
index e069c26e1..8e54a05d3 100644
--- a/tests/unittest/compiler/test_transform_memory_ops.py
+++ b/tests/unittest/compiler/test_transform_memory_ops.py
@@ -17,13 +17,14 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops, transform
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
-from aitemplate.utils import graph_utils
+from aitemplate.utils import graph_utils, shape_utils
 
 from parameterized import parameterized
 
@@ -34,6 +35,10 @@ class MemoryOpTransformationTestCase(unittest.TestCase):
     N = 128
     USE_DYNAMIC_BATCH = False
 
+    def __init__(self, *args, **kwargs):
+        super(MemoryOpTransformationTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
     def _prepare_cat_elimination_graph(self, dtype="float16"):
         X0 = Tensor(
             shape=[
@@ -470,6 +475,562 @@ def test_skip_cat_cat_elimination_e2e(self, dtype):
         self.assertTrue(torch.allclose(out_pt0, out0, atol=1e-1, rtol=1e-2))
         self.assertTrue(torch.allclose(out_pt1, out1, atol=1e-1, rtol=1e-2))
 
+    def _test_fuse_strided_cat_cat(self, M0, M1, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # add_1 = add(concat_0, x2)
+        # concat_2 = concatenate(x0, concat_0)
+        # reduce_3 = reduce_sum(add_1)
+        # reduce_4 = reduce_sum(concat_2)
+        # y = add(reduce_3, reduce_4)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        M2 = M0 + M1
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        add_1 = ops.elementwise(FuncEnum.ADD)(concat_0, X2)
+        concat_2 = ops.concatenate()([X0, concat_0], dim=cat_dim)
+        reduce_dim = cat_dim
+        reduce_3 = ops.reduce_sum(reduce_dim)(add_1)
+        reduce_4 = ops.reduce_sum(reduce_dim)(concat_2)
+        Y = ops.elementwise(FuncEnum.ADD)(reduce_3, reduce_4)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 5)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            # dynamic_slice is fused into add
+            self.assertTrue(op_type != "dynamic_slice")
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2, N], dtype)
+
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            add_1_pt = concat_0_pt + x2_pt
+            concat_2_pt = torch.cat([x0_pt, concat_0_pt], dim=cat_dim)
+            reduce_3_pt = torch.sum(add_1_pt, reduce_dim)
+            reduce_4_pt = torch.sum(concat_2_pt, reduce_dim)
+            y_pt = reduce_3_pt + reduce_4_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_strided_cat_cat(self):
+        self._test_fuse_strided_cat_cat(
+            M0=3,
+            M1=4,
+            N=9,
+            test_name="test_fuse_strided_cat_cat",
+        )
+        self._test_fuse_strided_cat_cat(
+            M0=2,
+            M1=4,
+            N=8,
+            test_name="test_fuse_strided_cat_cat",
+        )
+
+    def _test_fuse_strided_cat_reshape_cat(
+        self, M0, M1, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # reshape_1 = reshape(concat_0)
+        # add_2 = add(reshape_1, x2)
+        # concat_3 = concatenate(x0, reshape_1)
+        # reduce_4 = reduce_sum(add_2)
+        # reduce_5 = reduce_sum(concat_3)
+        # y = add(reduce_4, reduce_5)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        M2 = M0 + M1
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3 * N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        reshape_to_shape_1 = [-1, M2 * N]
+        reshape_1 = ops.reshape()(concat_0, reshape_to_shape_1)
+        add_2 = ops.elementwise(FuncEnum.ADD)(reshape_1, X2)
+        concat_3 = ops.concatenate()([X3, reshape_1], dim=cat_dim)
+        reduce_dim = cat_dim
+        reduce_4 = ops.reduce_sum(reduce_dim)(add_2)
+        reduce_5 = ops.reduce_sum(reduce_dim)(concat_3)
+        Y = ops.elementwise(FuncEnum.ADD)(reduce_4, reduce_5)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 5)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            # dynamic_slice is fused into add
+            self.assertTrue(op_type != "dynamic_slice")
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3 * N], dtype)
+
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            reshape_1_pt = torch.reshape(concat_0_pt, reshape_to_shape_1)
+            add_2_pt = reshape_1_pt + x2_pt
+            concat_3_pt = torch.cat([x3_pt, reshape_1_pt], dim=cat_dim)
+            reduce_4_pt = torch.sum(add_2_pt, reduce_dim)
+            reduce_5_pt = torch.sum(concat_3_pt, reduce_dim)
+            y_pt = reduce_4_pt + reduce_5_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_strided_cat_reshape_cat(self):
+        self._test_fuse_strided_cat_reshape_cat(
+            M0=2,
+            M1=4,
+            M3=3,
+            N=8,
+            test_name="test_fuse_strided_cat_reshape_cat",
+        )
+
+    def _test_fuse_strided_cat_reshape_cat_2(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # add_3 = add(reshape_2, x4) # 3d
+        # concat_4 = concatenate(x3, reshape_2, x3) # 3d
+        # reshape_5 = reshape(concat_4) # 2d
+        # add_6 = add(reshape_5, x6) # 2d
+        # concat_7 = concatenate(x0, reshape_5, x0)
+        # reshape_8 = reshape(add_3) # 2d
+        # reduce_9 = reduce_sum(reshape_8)
+        # reduce_10 = reduce_sum(add_6)
+        # reduce_11 = reduce_sum(concat_7)
+        # add_12 = add(reduce_9, reduce_10)
+        # y = add(add_12, reduce_11)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_1, [-1, M0 + M2, N])
+        add_3 = ops.elementwise(FuncEnum.ADD)(reshape_2, X4)
+        concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)  # 3d
+        reshape_to_shape_5 = (
+            sum([t.shape()[cat_dim].value() for t in [X3, reshape_2, X3]]) * N
+        )
+        reshape_5 = ops.reshape()(concat_4, [-1, reshape_to_shape_5])  # 2d
+        X6 = Tensor(
+            shape=[batch_dim, IntImm(reshape_to_shape_5)],
+            dtype=dtype,
+            name="x6",
+            is_input=True,
+        )
+        add_6 = ops.elementwise(FuncEnum.ADD)(reshape_5, X6)
+        concat_7 = ops.concatenate()([X0, reshape_5, X0], dim=cat_dim)  # 2d
+        reshape_8 = ops.reshape()(add_3, [-1, (M0 + M2) * N])  # 2d
+        reduce_dim = cat_dim
+        reduce_9 = ops.reduce_sum(reduce_dim)(reshape_8)
+        reduce_10 = ops.reduce_sum(reduce_dim)(add_6)
+        reduce_11 = ops.reduce_sum(reduce_dim)(concat_7)
+        add_12 = ops.elementwise(FuncEnum.ADD)(reduce_9, reduce_10)
+        Y = ops.elementwise(FuncEnum.ADD)(add_12, reduce_11)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 8)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            # dynamic_slice is fused into add
+            self.assertTrue(op_type != "dynamic_slice")
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4, N], dtype)
+            x6_pt = get_random_torch_tensor([batch, reshape_to_shape_5], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, M0 + M2, N])
+            add_3_pt = reshape_2_pt + x4_pt
+            concat_4_pt = torch.cat([x3_pt, reshape_2_pt, x3_pt], dim=cat_dim)
+            reshape_5_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_5])
+            add_6_pt = reshape_5_pt + x6_pt
+            concat_7_pt = torch.cat([x0_pt, reshape_5_pt, x0_pt], dim=cat_dim)
+            reshape_8_pt = torch.reshape(add_3_pt, [-1, (M0 + M2) * N])
+            reduce_9_pt = torch.sum(reshape_8_pt, reduce_dim)
+            reduce_10_pt = torch.sum(add_6_pt, reduce_dim)
+            reduce_11_pt = torch.sum(concat_7_pt, reduce_dim)
+            add_12_pt = reduce_9_pt + reduce_10_pt
+            y_pt = add_12_pt + reduce_11_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x6": x6_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_strided_cat_reshape_cat_2(self):
+        self._test_fuse_strided_cat_reshape_cat_2(
+            M0=2,
+            M1=2,
+            M2=2,
+            M3=1,
+            N=2,
+            test_name="test_fuse_strided_cat_reshape_cat_2",
+        )
+
+    def _test_fuse_strided_cat_reshape_cat_3(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # add_3 = add(reshape_2, x4) # 3d
+        # concat_4 = concatenate(x3, concat_1, x3) # 2d
+        # reshape_5 = reshape(concat_4) # 2d
+        # add_6 = add(reshape_5, x6) # 2d
+        # concat_7 = concatenate(x0, reshape_5, x0)
+        # reshape_8 = reshape(add_3) # 2d
+        # reduce_9 = reduce_sum(reshape_8)
+        # reduce_10 = reduce_sum(add_6)
+        # reduce_11 = reduce_sum(concat_7)
+        # add_12 = add(reduce_9, reduce_10)
+        # y = add(add_12, reduce_11)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[batch_dim, IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        reshape_2 = ops.reshape()(concat_1, [-1, M0 + M2, N])
+        add_3 = ops.elementwise(FuncEnum.ADD)(reshape_2, X4)
+        concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)  # 3d
+        reshape_to_shape_5 = (
+            sum([t.shape()[cat_dim].value() for t in [X3, reshape_2, X3]]) * N
+        )
+        reshape_5 = ops.reshape()(concat_4, [-1, reshape_to_shape_5])  # 2d
+        X6 = Tensor(
+            shape=[batch_dim, IntImm(reshape_to_shape_5)],
+            dtype=dtype,
+            name="x6",
+            is_input=True,
+        )
+        add_6 = ops.elementwise(FuncEnum.ADD)(reshape_5, X6)
+        concat_7 = ops.concatenate()([X0, reshape_5, X0], dim=cat_dim)  # 2d
+        reshape_8 = ops.reshape()(add_3, [-1, (M0 + M2) * N])  # 2d
+        reduce_dim = cat_dim
+        reduce_9 = ops.reduce_sum(reduce_dim)(reshape_8)
+        reduce_10 = ops.reduce_sum(reduce_dim)(add_6)
+        reduce_11 = ops.reduce_sum(reduce_dim)(concat_7)
+        add_12 = ops.elementwise(FuncEnum.ADD)(reduce_9, reduce_10)
+        Y = ops.elementwise(FuncEnum.ADD)(add_12, reduce_11)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 8)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            # dynamic_slice is fused into add
+            self.assertTrue(op_type != "dynamic_slice")
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([batch, M4, N], dtype)
+            x6_pt = get_random_torch_tensor([batch, reshape_to_shape_5], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, M0 + M2, N])
+            add_3_pt = reshape_2_pt + x4_pt
+            concat_4_pt = torch.cat([x3_pt, reshape_2_pt, x3_pt], dim=cat_dim)
+            reshape_5_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_5])
+            add_6_pt = reshape_5_pt + x6_pt
+            concat_7_pt = torch.cat([x0_pt, reshape_5_pt, x0_pt], dim=cat_dim)
+            reshape_8_pt = torch.reshape(add_3_pt, [-1, (M0 + M2) * N])
+            reduce_9_pt = torch.sum(reshape_8_pt, reduce_dim)
+            reduce_10_pt = torch.sum(add_6_pt, reduce_dim)
+            reduce_11_pt = torch.sum(concat_7_pt, reduce_dim)
+            add_12_pt = reduce_9_pt + reduce_10_pt
+            y_pt = add_12_pt + reduce_11_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x6": x6_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_strided_cat_reshape_cat_3(self):
+        self._test_fuse_strided_cat_reshape_cat_3(
+            M0=2,
+            M1=2,
+            M2=2,
+            M3=1,
+            N=2,
+            test_name="test_fuse_strided_cat_reshape_cat_3",
+        )
+
+    def _test_non_fusible_strided_cat_cat(self, M0, N, test_name, dtype="float16"):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x1)
+        # add_1 = add(concat_0, x2)
+        # y = concatenate(concat_0, add_1)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M0), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M0 + M0), IntImm(N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X1], dim=cat_dim)
+        add_1 = ops.elementwise(FuncEnum.ADD)(concat_0, X2)
+        Y = ops.concatenate()([concat_0, add_1], dim=cat_dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model([Y], target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+        concat_cnt = 0
+        output_cat = None
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            if op_type == "concatenate":
+                concat_cnt += 1
+                if sorted_op._attrs["outputs"][0] == Y:
+                    output_cat = sorted_op
+        self.assertEqual(concat_cnt, 2)
+        self.assertEqual(output_cat._attrs["input_masks"], [True, False])
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M0, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M0 + M0, N], dtype)
+
+            concat_0_pt = torch.cat([x0_pt, x1_pt], dim=cat_dim)
+            add_1_pt = concat_0_pt + x2_pt
+            y_pt = torch.cat([concat_0_pt, add_1_pt], dim=cat_dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_non_fusible_strided_cat_cat(self):
+        self._test_non_fusible_strided_cat_cat(
+            M0=2,
+            N=8,
+            test_name="test_non_fusible_strided_cat_cat",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From bd43a9931b3f69a7fafef2b84a6a318cb298d216 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 30 Mar 2023 19:54:36 -0700
Subject: [PATCH 365/638] Support models with 2GB+ params (#520)

Summary:
I tried to compile test model having 2GB+ params.
Large (>2GB) `constants.obj` was generated successfully, but `model.so` creation failed. (Linux x86_64 platform)

Currently we convert `constants.bin` to `constants.obj` using
```
ld -r -b binary -o constants.obj constants.bin
```

`objdump` shows that `_binary_constants_bin` array was placed to `.data` section.
`.data` and `.rodata` sections can not allocate more that 2GB.

To solve the issue with 2GB limit we can put `_binary_constants_bin` array to `.lrodata` read-only section. It does not have 2GB limit.
We can do it by using
```
objcopy --rename-section .data=.lrodata,alloc,load,readonly,data,contents constants.obj constants.obj
```
to rename `.data` section in `constants.obj` file to `.lrodata`.

Before
```
$ objdump -x constants_old.obj

architecture: i386:x86-64, flags 0x00000010:
HAS_SYMS
start address 0x0000000000000000

Sections:
Idx Name          Size      VMA               LMA               File off  Algn
  0 .data         8b8577e0  0000000000000000  0000000000000000  00000040  2**0
                  CONTENTS, ALLOC, LOAD, DATA
SYMBOL TABLE:
0000000000000000 l    d  .data	0000000000000000 .data
000000008b8577e0 g       *ABS*	0000000000000000 _binary_constants_bin_size
000000008b8577e0 g       .data	0000000000000000 _binary_constants_bin_end
0000000000000000 g       .data	0000000000000000 _binary_constants_bin_start
```

After:
```
$ objdump -x constants.obj

architecture: i386:x86-64, flags 0x00000010:
HAS_SYMS
start address 0x0000000000000000

Sections:
Idx Name          Size      VMA               LMA               File off  Algn
  0 .lrodata      8b8577e0  0000000000000000  0000000000000000  00000040  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, DATA
SYMBOL TABLE:
0000000000000000 l    d  .lrodata	0000000000000000 .lrodata
000000008b8577e0 g       *ABS*	0000000000000000 _binary_constants_bin_size
000000008b8577e0 g       .lrodata	0000000000000000 _binary_constants_bin_end
0000000000000000 g       .lrodata	0000000000000000 _binary_constants_bin_start
```

Large (>2GB) `model.so`
```
-rwxrwxr-x 1 ubuntu ubuntu 2.2G Mar 30 06:24 my_model.so

```

Related Links:
- [Embedding Binary Blobs With GCC](https://www.burtonini.com/blog/2007/07/13/embedding-binary-blobs-with-gcc/)
- [TVM PR - [LLVM] Support CodeGenBlob for large >2GB models](https://github.com/apache/tvm/pull/10882)

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/520

Reviewed By: alexanderguzhva

Differential Revision: D44533582

Pulled By: chenyang78

fbshipit-source-id: 75cc9d07bacd1a74124dafd21a9d64101f8cb96d
---
 python/aitemplate/backend/target.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 36e55a4c0..ab8b3d937 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -18,6 +18,7 @@
 import logging
 import os
 import pathlib
+import platform
 import shutil
 import tempfile
 from enum import IntEnum
@@ -172,7 +173,15 @@ def binary_compile_cmd(self):
         A command that turns a raw binary file into an object file that
         can be linked into the executable.
         """
-        return "ld -r -b binary -o {target} {src}"
+        cmd = "ld -r -b binary -o {target} {src}"
+        # Support models with >2GB constants on Linux only
+        if platform.system() == "Linux":
+            cmd += (
+                " && objcopy --rename-section"
+                " .data=.lrodata,alloc,load,readonly,data,contents"
+                " {target} {target}"
+            )
+        return cmd
 
     def compile_options(self) -> str:
         """Options for compiling the target.

From 383f516af7079bd874b49dc9f34185c87710ba8f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 31 Mar 2023 13:45:24 +0800
Subject: [PATCH 366/638] print op name when raise runtime error

---
 python/aitemplate/compiler/ops/groupnorm/groupnorm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 2093dcb89..0d2270c32 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -265,7 +265,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
 
         if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " f"{exec_key}" " failed. " f"Results: {result}."
+                "Profile workload: " f"{self._attrs['op']}" f"{exec_key}" " failed. " f"Results: {result}."
             )
 
         out = min(result, key=lambda x: x[1].duration)

From c29de3c324083a6550c1612d6dddbd5ad91981d1 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 31 Mar 2023 14:30:07 +0800
Subject: [PATCH 367/638] fix a bug

---
 python/aitemplate/compiler/transform/transform_special_ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index d0a51e73b..cc9b5264a 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -297,9 +297,7 @@ def transform_special_ops(
     funcs = [
         _transform_1x1_conv_gemm_rcr,
     ]
-
-    from aitemplate.backend.target import Target
-
+    
     if "transform_conv_to_gemm" in Target.current()._kwargs:
         if Target.current()._kwargs["transform_conv_to_gemm"]:
             for func in funcs:

From bec4e659ce910d7f56f4e7285a4c72113cb30c56 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Fri, 31 Mar 2023 12:12:05 -0700
Subject: [PATCH 368/638] Revert hack for bmm (#517)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/517

Revert hack for bmm. Use symbolic shape to support bmm cases.

Reviewed By: khabinov, chenyang78

Differential Revision: D44484990

fbshipit-source-id: a41440079595e1698d4b1f38a2deb91b0d7a8bbe
---
 fx2ait/fx2ait/converters/ait_converters.py | 25 +++-------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 73bf06851..5f5deb86b 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -981,25 +981,6 @@ def acc_ops_flatten(
     return flatten(start_dim=start_dim, end_dim=end_dim)(input_val)
 
 
-def acc_ops_bmm(name: str, lhs: AITTensor, rhs: AITTensor) -> ConverterOutput:
-    lhs_shape = lhs.shape()
-    rhs_shape = rhs.shape()
-    if (
-        lhs_shape[0] == rhs_shape[0]
-        and lhs_shape[0]._attrs["name"] is None
-        and rhs_shape[0]._attrs["name"] is None
-    ):
-        lhs_shape[0]._attrs["name"] = f"acc_{name}_batch_size"
-        rhs_shape[0]._attrs["name"] = f"acc_{name}_batch_size"
-    elif lhs_shape[0] != rhs_shape[0]:
-        if lhs_shape[0]._attrs["values"] == rhs_shape[0]._attrs["values"]:
-            if lhs_shape[0]._attrs["name"] is None:
-                lhs_shape[0] = rhs_shape[0]
-            else:
-                rhs_shape[0] = lhs_shape[0]
-    return bmm_rrr()(lhs, rhs)
-
-
 @ait_converter(acc_ops.matmul)
 def acc_ops_matmul(
     target: Target,
@@ -1026,7 +1007,7 @@ def acc_ops_matmul(
     if len(rhs_shape) == 2:
         return gemm_rrr()(lhs, rhs)
     elif len(lhs_shape) <= 3 and len(rhs_shape) <= 3:
-        return acc_ops_bmm(name, lhs, rhs)
+        return bmm_rrr()(lhs, rhs)
     elif len(lhs_shape) == 4 and len(rhs_shape) == 4 and lhs_shape[1] == rhs_shape[1]:
         assert all(isinstance(i, IntImm) for i in lhs_shape[1:])
         assert all(isinstance(i, IntImm) for i in rhs_shape[1:])
@@ -1045,7 +1026,7 @@ def acc_ops_matmul(
             shape_1 = (batch_size * channel, K, N)
             shape_2 = (batch_size, channel, M, N)
         elif isinstance(lhs_shape[0], IntVar) and isinstance(rhs_shape[0], IntVar):
-            if lhs_shape[0]._attrs["values"] != rhs_shape[0]._attrs["values"]:
+            if lhs_shape[0] != rhs_shape[0]:
                 raise ValueError(
                     f"Batch size mismatch on matmul. Expected: {lhs_shape[0]} == {rhs_shape[0]}"
                 )
@@ -1060,7 +1041,7 @@ def acc_ops_matmul(
             )
         reshape_op_0 = reshape()(lhs, shape_0)
         reshape_op_1 = reshape()(rhs, shape_1)
-        return reshape()(acc_ops_bmm(name, reshape_op_0, reshape_op_1), shape_2)
+        return reshape()(bmm_rrr()(reshape_op_0, reshape_op_1), shape_2)
     else:
         raise NotImplementedError(
             f"This case is unsupported in {name}: {len(lhs_shape)} and {len(rhs_shape)}"

From 58c4e77dd14cf08bca5dfc4d43b4e2c2f0108788 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 31 Mar 2023 13:01:06 -0700
Subject: [PATCH 369/638] Update rocm_ci.yml

Fix the label check logic.
---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 2ac66a2a3..a3db6e76b 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build:
-    if: contains(github.event.pull_request.labels.*.name, 'rocm')
+    if: contains(github.event.label.name, 'rocm')
     runs-on: rocm
 
     steps:

From e86ee2f69bbad93b50b61f4a1c91d5a73678d64a Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang (Meta Employee)" <ezyang@meta.com>
Date: Fri, 31 Mar 2023 13:54:33 -0700
Subject: [PATCH 370/638] Refactor dynamic dims api, stateless internals,
 higher level export API (#96699) (#96699)

Summary:
The purpose of this API is to execute a few large components of work:

1) Refactor all the internals of plumbing dynamic dimension information after dynamo to be stateless
2) Decouple allocation controls around dynamic dimensions from verification
3) For (2), for allocation, create an enum that dictates whether we are in DUCK (default today), STATIC (aka assume_static_default in the past), or DYNAMIC (aka user constrained, do not duck shape)
4) For (2), for verification, we separate out the list of dynamic ranges entirely from allocation. This means shape_env does not tracking for what we verify on, and instead, it is the callers job to invoke produce_guards() with the various things they want verified, specifically, with the valid ranges. We do use constrain ranges to refine value ranges when doing analysis.
5) We have decided, therefore, as an extension of (4) to double down on "late" checks versus "eager" checks, primarily because the mechanisms for gathering what actually matters happens during guards, and should be a purview of the caller seeking guards, not the shape env. However, for dynamo, these structures are essentially one and the same.

X-link: https://github.com/pytorch/pytorch/pull/96699
Approved by: https://github.com/avikchaudhuri, https://github.com/ezyang

bypass-github-export-checks

Reviewed By: frank-wei

Differential Revision: D44552069

Pulled By: ezyang

fbshipit-source-id: 0c40717e76471bb3d4af1db323803487684381e2
---
 fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py | 6 +++---
 fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py  | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
index 47c8e38c2..fc215d96f 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_flatten_aten.py
@@ -38,7 +38,7 @@ def forward(self, x):
                 return torch.flatten(x, self.start, self.end)
 
         model = TestModule(start_dim, end_dim).cuda().half()
-        inputs = (torch.randn(1, 2, 3, 1).half().cuda(),)
+        inputs = (torch.randn(2, 3, 4, 5).half().cuda(),)
 
         self.run_test(model, inputs, expected_ops={torch.ops.aten.view.default})
 
@@ -61,10 +61,10 @@ def forward(self, x):
         model = TestModule(start_dim, end_dim).cuda().half()
         inputs_spec = TensorSpec.create_spec_from_shapes(
             inputs_min=[
-                [1, 2, 3, 4],
+                [2, 3, 4, 5],
             ],
             inputs_max=[
-                [10, 20, 3, 4],
+                [10, 20, 4, 5],
             ],
             dtype_list=[
                 torch.float16,
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
index e652f1a07..4819280d7 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_matmul_aten.py
@@ -65,7 +65,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
             # Only M can be dynamic: https://github.com/fairinternal/AITemplate/blob/main/tests/unittest/ops/test_gemm.py
             [[[2, 3], [3, 3], [6, 6]], torch.ops.aten.mm.default],
             [[[2, 3], [2, 3], [3, 3], [6, 6]], aten_compose_mm_2d],
-            [[[1, 3], [2, 3], [6, 8], [3, 3], [6, 6]], torch.ops.aten.mm.default],
+            # Cannot test with size=1, we will one specialize
+            # [[[1, 3], [2, 3], [6, 8], [3, 3], [6, 6]], torch.ops.aten.mm.default],
             # FIXME: batch_size cannot be dynamic because the permutation of shape change the names: P544607056
             # b, m, k, n
             [[[2, 2], [6, 8], [3, 3], [6, 6]], aten_compose_bmm_3d, True],

From 492262b84d983f82dbdd5b9ef23ff4d7cf715c9f Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sat, 1 Apr 2023 10:08:30 -0700
Subject: [PATCH 371/638] Add pass for make_jaged deduplication (#523)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/523

This diff introduces a new graph transformation pass to deduplicate multiple `make_jagged` ops in the graph which operate on the `source` tensors with the same `total_length` dimension and on the same `offsets_list`s. After removing those ops, a single new `make_jagged` op is applied to *all* graph inputs with the same `total_length` dimension, hence achieving two goals:

1. Guarantee that the offsets validation will be performed before any of the downstream jagged-aware ops can act on the jagged tensors / offsets. This should improve reliability.

2. Perform offsets validation only once per `offsets_list` (instead of multiple times in multiple `make_jagged` ops that are deduplicated). This should save time.

The docstrings of the new pass's helper functions provide more details on how it works. Applying `make_jagged` op to multiple source inputs with the same `total_length` dimension relies on the `make_jagged` extension introduced in D44508562.

Reviewed By: chenyang78

Differential Revision: D44559189

fbshipit-source-id: f099574da1e035424cb3e786d4813706cbc5bacc
---
 python/aitemplate/compiler/base.py            |   4 +
 .../transform/dedup_make_jagged_ops.py        | 312 ++++++++++++++++++
 .../compiler/transform/optimize_graph.py      |   2 +
 .../compiler/transform/transform_utils.py     |   2 +-
 tests/unittest/ops/test_make_jagged.py        | 155 +++++++++
 5 files changed, 474 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/compiler/transform/dedup_make_jagged_ops.py

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 80e1451ec..31bd2f262 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -541,6 +541,7 @@ def __init__(
         super().__init__(
             values=total_length._attrs["values"],
             name=total_length._attrs["name"],
+            symbolic_value=total_length._attrs["symbolic_value"],
         )
 
         self._attrs["batch_dim"] = batch_dim
@@ -911,6 +912,9 @@ def pseudo_code(self, with_shape=True) -> str:
         if data is not None:
             args.append(f"data=({data.size()} bytes)")
 
+        if self.is_jagged():
+            args.append("jagged=True")
+
         return f"Tensor({', '.join(args)})"
 
     def _bind_data(self, data: _ConstantTensorData) -> None:
diff --git a/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py b/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py
new file mode 100644
index 000000000..982a2c59f
--- /dev/null
+++ b/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py
@@ -0,0 +1,312 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Deduplicate make_jagged ops in the graph.
+"""
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Set
+
+from aitemplate.compiler.base import IntVar, JaggedIntVar, Operator, Tensor
+
+from aitemplate.compiler.ops import make_jagged
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_utils import (
+    remove_dst_op_from_tensor,
+    replace_tensor,
+    replace_tensor_for_op,
+    sanitize_sorted_graph,
+)
+from aitemplate.utils.graph_utils import get_sorted_ops
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class MakeJaggedMetaData:
+    op: Operator
+    sources_list: List[Tensor]
+    offsets_list: List[Tensor]
+    outputs: List[Tensor]
+    jagged_int_var: JaggedIntVar
+
+
+def _get_make_jagged_metadata(
+    sorted_graph: List[Tensor],
+) -> Dict[IntVar, List[MakeJaggedMetaData]]:
+    """Collect metadata about the existing make_jagged ops in the graph.
+
+    The MakeJaggedMetaData instances, one per make_jagged op, are grouped
+    by the total_length dimension in the source input Tensors of the ops.
+    In case of multiple inputs, total_length dimension is the same in
+    every input. The metadata is used further to inform the transformation.
+    """
+    metadata = {}
+    for op in get_sorted_ops(sorted_graph):
+        if op._attrs["op"] == "make_jagged":
+            outputs = op._attrs["outputs"]
+            jagged_int_var = outputs[0]._attrs["shape"][0]
+            total_length = jagged_int_var.total_length()
+            num_sources = op._attrs["num_sources"]
+            if total_length not in metadata:
+                metadata[total_length] = []
+            metadata[total_length].append(
+                MakeJaggedMetaData(
+                    op=op,
+                    sources_list=op._attrs["inputs"][:num_sources],
+                    offsets_list=op._attrs["inputs"][num_sources:],
+                    outputs=outputs,
+                    jagged_int_var=jagged_int_var,
+                )
+            )
+
+    return metadata
+
+
+def _remove_make_jagged_ops(
+    make_jagged_metadata: Dict[IntVar, List[MakeJaggedMetaData]],
+    graph_inputs: Set[Tensor],
+    graph_outputs: Set[Tensor],
+):
+    """Remove the make_jagged ops from the graph where possible.
+
+    The individual make_jagged ops scattered over the graph are removed,
+    to be further replaced by a single make_jagged instance, per total_length
+    dimension, applied to all inputs with the total_length dimension at once.
+    The ops are considered group by group, where group is formed from
+    the ops with the same total_length dimension in the source Tensors.
+
+    The make_jagged ops in the group are not removed (and the respective
+    total_length key is popped from the make_jagged_metadata) if:
+
+        1. There is only one make_jagged op in the group.
+
+        2. There is a make_jagged op in the group connecting a
+           graph input to a graph output: can't be eliminated.
+
+        3. The total_length dimension representing the group is
+           not present in any of the graph inputs' shape.
+
+    In other cases, all make_jagged ops in the grpup are removed from the graph
+    (and the respective total_length key is kept in the make_jagged_metadata).
+    """
+    for total_length in list(make_jagged_metadata.keys()):
+        make_jagged_group = make_jagged_metadata[total_length]
+        assert len({d.jagged_int_var for d in make_jagged_group}) == 1, (
+            "All make_jagged ops applied to the sources with the "
+            "same total_length must produce the same jagged_int_var."
+        )  # this includes offsets identity check internally
+
+        if len(make_jagged_group) == 1:
+            _LOGGER.debug(
+                "There is only one make_jagged op in the group "
+                f"with {total_length=}: skipping the group."
+            )
+            make_jagged_metadata.pop(total_length)
+            continue
+
+        has_input_to_output_op = False
+        for data in make_jagged_group:
+            if any(s in graph_inputs for s in data.sources_list) and any(
+                o in graph_outputs for o in data.outputs
+            ):
+                has_input_to_output_op = True
+                break
+        if has_input_to_output_op:
+            _LOGGER.debug(
+                "There is a make_jagged op in the group with "
+                f"{total_length=} that maps a graph input to "
+                "a graph output: skipping the group."
+            )
+            make_jagged_metadata.pop(total_length)
+            continue
+
+        graph_input_with_total_length = False
+        for inp in graph_inputs:
+            shape = inp._attrs["shape"]
+            if shape and shape[0] == total_length:
+                graph_input_with_total_length = True
+                break
+        if not graph_input_with_total_length:
+            _LOGGER.debug(
+                "None of the graph inputs has the first dimension "
+                f"equal to {total_length=}: skipping the group."
+            )
+            make_jagged_metadata.pop(total_length)
+            continue
+
+        _LOGGER.debug(
+            f"Removing {len(make_jagged_group)} make_jagged ops "
+            f"in the group with {total_length=} from the graph."
+        )
+        for data in make_jagged_group:
+            for source, output in zip(data.sources_list, data.outputs):
+                replace_tensor(output, source)
+                remove_dst_op_from_tensor(source, data.op)
+
+
+def _apply_make_jagged_to_inputs(
+    make_jagged_metadata: Dict[IntVar, List[MakeJaggedMetaData]],
+    sorted_graph: List[Tensor],
+    graph_inputs: Set[Tensor],
+) -> Dict[IntVar, JaggedIntVar]:
+    """Apply new make_jagged ops to the (bundled) input source Tensors.
+
+    For each group of make_jagged ops that removed from the graph,
+    a new make_jagged op is applied to all graph inputs with the
+    corresponding total_length dimension. This way, the source Tensors
+    are converted to jagged Tensors right from the "beginning" of the
+    graph and can be used as jagged Tensors downstream.
+
+    Two points are worth mentioning:
+
+        1. Due to the fact that the new make_jagged op is applied to
+           *all* source inputs with the total_length dimension, it is
+           guaranteed that the offsets validation performed by the
+           make_jagged op's back-end will run before any of the
+           resulting jagged Tensors can be used downstream.
+
+        2. Because a single make_jagged op is applied to multiple
+           graph inputs, the make_jagged op's back-end kernel will
+           be launched only once to validate the offsets (the latter
+           are the same for every source input). This optimizes out
+           redundant validation of the same offsets.
+
+    The mapping of each total_length to the new JaggedIntVar (produced
+    by the corresponding new make_jagged op) is returned.
+    """
+    new_jagged_int_vars = {}
+    for total_length, make_jagged_group in make_jagged_metadata.items():
+        sources_list = []
+        for inp in graph_inputs:
+            shape = inp._attrs["shape"]
+            if shape and shape[0] == total_length:
+                sources_list.append(inp)
+
+        _LOGGER.debug(
+            "Adding a single make_jagged op for the source inputs "
+            f"{[source._attrs['name'] for source in sources_list]}."
+        )
+
+        data = make_jagged_group[0]
+        new_make_jagged_op = make_jagged(
+            batch_dim=data.jagged_int_var.batch_dim(),
+            jagged_dims=data.jagged_int_var.jagged_dims(),
+        )
+        jagged_tensors = new_make_jagged_op(
+            source=sources_list,
+            offsets_list=data.offsets_list,
+        )
+        jagged_int_var = jagged_tensors[0]._attrs["shape"][0]
+        new_jagged_int_vars[total_length] = jagged_int_var
+
+        for source, jagged in zip(sources_list, jagged_tensors):
+            for op in source._attrs["dst_ops"]:
+                if op is not new_make_jagged_op:
+                    replace_tensor_for_op(op, source, jagged)
+
+        sorted_graph.extend(jagged_tensors)
+
+    return new_jagged_int_vars
+
+
+def _replace_total_length_with_jagged_int_var(
+    new_jagged_int_vars: Dict[IntVar, JaggedIntVar],
+    sorted_graph: List[Tensor],
+    graph_inputs: Set[Tensor],
+):
+    """Replace total_length dimensions by the new JaggedIntVars.
+
+    As we've removed the internal make_jagged ops from the graph and
+    replaced their output jagged Tensors by the input source Tensors,
+    the latter have lost their JaggedIntVars. Here we replace the
+    total_length dimension in *every* non-input Tensor in the graph
+    by the corresponding new JaggedIntVar (produced by the new
+    make_jagged op applied to the bundled source inputs). This includes,
+    but is not limited to, the source inputs of the make_jagged ops
+    removed from within the graph in the beginning of the pass.
+    """
+    for total_length, new_jagged_int_var in new_jagged_int_vars.items():
+        for tensor in sorted_graph:
+            if tensor not in graph_inputs:
+                shape = tensor._attrs["shape"]
+                if shape and shape[0] == total_length:
+                    shape[0] = new_jagged_int_var
+
+
+def dedup_make_jagged_ops(
+    sorted_graph: List[Tensor],
+    workdir: str = None,
+) -> List[Tensor]:
+    """Deduplicate make_jagged ops in the graph.
+
+    The rationale is to eliminate redundant offset validation as
+    well as make the implicit jagged Tensors (sources) in the graph
+    explicit, by replacing their total_length dimension with the
+    corresponding JaggedIntVar.
+
+    The pass is performed in the following steps:
+
+        1. Collect the metadata of the existing make_jagged ops.
+        2. Remove make_jagged ops from the graph where possible.
+        3. Apply new make_jagged ops to the (bundled) source inputs.
+        4. Replace total_length dimensions with new JaggedIntVars.
+
+    See the docstrings of the individual steps' helper functions
+    above for more details.
+    """
+    make_jagged_metadata = _get_make_jagged_metadata(sorted_graph)
+
+    if not make_jagged_metadata:
+        _LOGGER.debug("No make_jagged ops in the graph: skipping.")
+        return sorted_graph
+
+    graph_inputs = {t for t in sorted_graph if t._attrs["is_input"]}
+    graph_outputs = {t for t in sorted_graph if t._attrs["is_output"]}
+
+    _remove_make_jagged_ops(
+        make_jagged_metadata,
+        graph_inputs,
+        graph_outputs,
+    )
+
+    if not make_jagged_metadata:
+        _LOGGER.debug(
+            "There are make_jagged ops in the graph, "
+            "but nothing to deduplicate: skipping."
+        )
+        return sorted_graph
+
+    # drop the removed make_jagged outputs
+    sorted_graph = sanitize_sorted_graph(sorted_graph)
+
+    new_jagged_int_vars = _apply_make_jagged_to_inputs(
+        make_jagged_metadata,
+        sorted_graph,
+        graph_inputs,
+    )
+    _replace_total_length_with_jagged_int_var(
+        new_jagged_int_vars,
+        sorted_graph,
+        graph_inputs,
+    )
+
+    # sort the new make_jagged outputs
+    sorted_graph = toposort(sorted_graph)
+    # name the new tensors + do sanity check
+    sorted_graph = sanitize_sorted_graph(sorted_graph)
+
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 2b0803b85..01ea8913b 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -19,6 +19,7 @@
 
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.transform.apply_padding import apply_padding
+from aitemplate.compiler.transform.dedup_make_jagged_ops import dedup_make_jagged_ops
 from aitemplate.compiler.transform.fuse_bmm_permute import fuse_bmm_permute
 from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
 from aitemplate.compiler.transform.fuse_group_ops import fuse_group_ops
@@ -87,6 +88,7 @@ def optimize_graph(
     """
 
     funcs = [
+        dedup_make_jagged_ops,
         fuse_permute_bmm_and_gemm,
         fuse_bmm_permute,
         transform_odd_alignment,
diff --git a/python/aitemplate/compiler/transform/transform_utils.py b/python/aitemplate/compiler/transform/transform_utils.py
index 353e5c9de..1c5caf4d9 100644
--- a/python/aitemplate/compiler/transform/transform_utils.py
+++ b/python/aitemplate/compiler/transform/transform_utils.py
@@ -260,7 +260,7 @@ def sanitize_sorted_graph(sorted_graph: List[Tensor]) -> List[Tensor]:
     """
     Removes tensors whose src_op and dst_ops are empty.
     Inputs and outputs are always kept in the graph.
-    Names unamed tensors.
+    Names unnamed tensors.
     """
 
     if len(sorted_graph) == 1:
diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
index f82d3a970..7ba2ffc53 100644
--- a/tests/unittest/ops/test_make_jagged.py
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -26,6 +26,7 @@
     get_random_torch_tensor,
     get_torch_empty_tensor,
 )
+from aitemplate.utils.graph_utils import get_sorted_ops
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
@@ -338,6 +339,160 @@ def test_make_jagged_multiple_sources(
 
         torch.testing.assert_close(result, result_pt, rtol=1e-2, atol=1e-2)
 
+    def test_make_jagged_dedup(
+        self,
+        dtype="float16",
+        offsets_dtype="int32",
+    ):
+        B = 4
+        N = 3
+        D = 64
+        W = 32
+
+        batch_dim = IntVar(name="batch_size", values=[1, B])
+        max_seq_dim = IntImm(name="max_seq_len", value=N)
+        embedding_dim = IntImm(name="embedding", value=D)
+        weights_dim = IntImm(name="weight", value=W)
+
+        total_length_dim = IntVar(name="total_length", values=[0, B * N])
+        offsets_dim = IntVar(name="offsets_size", values=[2, B + 1])
+        jagged_dims = [JaggedDim(min_value=0, max_value=max_seq_dim)]
+        num_sources = 4
+
+        X1, X2, X3, X4 = [
+            Tensor(
+                shape=[
+                    total_length_dim,
+                    embedding_dim,
+                ],
+                name=f"x_{i}",
+                dtype=dtype,
+                is_input=True,
+            )
+            for i in range(num_sources)
+        ]
+        OFFSETS_LIST = [
+            Tensor(
+                shape=[
+                    offsets_dim,
+                ],
+                name="offsets",
+                dtype=offsets_dtype,
+                is_input=True,
+            )
+        ]
+        DENSE = Tensor(
+            shape=[
+                batch_dim,
+                max_seq_dim,
+                weights_dim,
+            ],
+            name="dense",
+            dtype=dtype,
+            is_input=True,
+        )
+        WEIGHTS = Tensor(
+            shape=[
+                embedding_dim,
+                weights_dim,
+            ],
+            name="weights",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        Y1, Y2 = (
+            ops.make_jagged(batch_dim=batch_dim, jagged_dims=jagged_dims)(
+                source=SOURCE,
+                offsets_list=OFFSETS_LIST,
+            )
+            for SOURCE in (X1, X2)
+        )
+        Y3, Y4 = (ops.gemm_rrr()(SOURCE, WEIGHTS) for SOURCE in (X3, X4))
+        Z1, Z2 = (ops.gemm_rrr()(SOURCE, WEIGHTS) for SOURCE in (Y1, Y2))
+        Z3, Z4 = (
+            ops.make_jagged(batch_dim=batch_dim, jagged_dims=jagged_dims)(
+                source=SOURCE,
+                offsets_list=OFFSETS_LIST,
+            )
+            for SOURCE in (Y3, Y4)
+        )
+        RESULT = DENSE
+        for Z in (Z1, Z2, Z3, Z4):
+            RESULT = ops.elementwise(FuncEnum.ADD)(RESULT, Z)
+
+        RESULT._attrs["name"] = "result"
+        RESULT._attrs["is_output"] = True
+
+        for X in (X1, X2, X3, X4):
+            assert not X.is_jagged()
+        assert Y1.is_jagged()
+        assert Y2.is_jagged()
+        assert not Y3.is_jagged()
+        assert not Y4.is_jagged()
+        for Z in (Z1, Z2, Z3, Z4):
+            assert Z.is_jagged()
+        assert not DENSE.is_jagged()
+        assert RESULT.is_jagged()
+
+        model = compile_model(
+            [RESULT],
+            detect_target(),
+            "./tmp",
+            "test_make_jagged_dedup",
+        )
+
+        make_jagged_ops = [
+            op
+            for op in get_sorted_ops(model.debug_sorted_graph)
+            if op._attrs["op"] == "make_jagged"
+        ]
+        assert len(make_jagged_ops) == 1
+        make_jagged_inputs = set(make_jagged_ops[0]._attrs["inputs"])
+        assert make_jagged_ops[0]._attrs["num_sources"] == num_sources
+        for X in (X1, X2, X3, X4):
+            assert not X.is_jagged()
+            assert X in make_jagged_inputs
+        assert OFFSETS_LIST[0] in make_jagged_inputs
+        for Y in (Y1, Y2, Y3, Y4):
+            assert Y.is_jagged()
+        for Z in (Z1, Z2, Z3, Z4):
+            assert Z.is_jagged()
+        assert not DENSE.is_jagged()
+        assert RESULT.is_jagged()
+
+        offsets = [0, 1, 4, 6, 7]
+        torch_offsets_type = string_to_torch_dtype(offsets_dtype)
+        offsets_pt = torch.tensor(offsets, dtype=torch_offsets_type).cuda()
+        xs_pt = {
+            f"x_{i}": get_random_torch_tensor([offsets[-1], D], dtype=dtype)
+            for i in range(num_sources)
+        }
+        weights_pt = get_random_torch_tensor([D, W], dtype=dtype)
+        dense_pt = get_random_torch_tensor([B, N, W], dtype=dtype)
+
+        ys_pt = [torch.matmul(x_pt, weights_pt) for x_pt in xs_pt.values()]
+        summed_ys_pt = torch.clone(ys_pt[0])
+        for y_pt in ys_pt[1:]:
+            summed_ys_pt += y_pt
+        result_pt = add_jagged_dense_ref(
+            jagged=summed_ys_pt,
+            offsets_list=[offsets_pt],
+            jagged_max_shape=[B, N, W],
+            dense=dense_pt,
+        )
+
+        inputs = {
+            **xs_pt,
+            "offsets": offsets_pt,
+            "dense": dense_pt,
+            "weights": weights_pt,
+        }
+        result = torch.empty_like(result_pt)
+        model.run_with_tensors(inputs, [result])
+
+        torch.testing.assert_close(result, result_pt, rtol=1e-2, atol=1e-2)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 984a2ccb934d87b681ce0f0105ccba0e2a527081 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Sun, 2 Apr 2023 02:28:42 -0700
Subject: [PATCH 372/638] Add an FMHA-style-b2b-bmm op into AIT (#474)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/474

Add an FMHA-style b2b bmm op into AIT.

Reviewed By: chenyang78

Differential Revision: D44007284

fbshipit-source-id: 7ba206dc84b9a20d6bfc995422b6a88a79fad7e6
---
 .../cuda/attention/mem_eff_attention.py       |   2 +-
 .../backend/cuda/b2b_bmm/__init__.py          |   2 +-
 .../backend/cuda/b2b_bmm/classic_b2b_bmm.py   |   5 +-
 .../cuda/b2b_bmm/fmha_style_b2b_bmm.py        | 301 ++++++++++++++++++
 .../compiler/ops/b2b_bmm/__init__.py          |   1 +
 .../compiler/ops/b2b_bmm/b2b_bmm_base.py      |  94 ++++++
 .../compiler/ops/b2b_bmm/classic_b2b_bmm.py   |  87 ++---
 .../ops/b2b_bmm/fmha_style_b2b_bmm.py         | 199 ++++++++++++
 python/aitemplate/testing/test_utils.py       |  15 +
 tests/unittest/ops/test_b2b_bmm.py            | 281 ++++++++++++++--
 10 files changed, 909 insertions(+), 78 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py

diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index d306ad0e9..37d349ee3 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -156,7 +156,7 @@
     }
     if (!Attention::check_supported(p)) {
       std::string error_msg = std::string("Got error: kernel does not support these inputs") +
-           " at " + __FILE__ + ": " + std::to_string(__LINE__);          
+           " at " + __FILE__ + ": " + std::to_string(__LINE__);
       throw std::runtime_error(error_msg);
     }
     kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
index 5e0c9d41f..369380b45 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
@@ -18,4 +18,4 @@
 b2b bmm module init
 """
 
-from aitemplate.backend.cuda.b2b_bmm import classic_b2b_bmm
+from aitemplate.backend.cuda.b2b_bmm import classic_b2b_bmm, fmha_style_b2b_bmm
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
index 6edec81ec..e5d91f8dd 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
@@ -23,6 +23,7 @@
 from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.target import Target
 from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
 
 # pylint: disable=C0301
 
@@ -247,7 +248,9 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         elem_accum_type=elem_accum_type,
         n0=str(n0.value()),
         n1=str(n1.value()),
-        has_causal="true" if func_attrs["causal"] else "false",
+        has_causal=(
+            "true" if func_attrs["causal_type"] != CausalType.NO_CAUSAL else "false"
+        ),
         alpha0=str(func_attrs["alpha0"]),
         alpha1=str(func_attrs["alpha1"]),
         epilogue_math=epilogue_math,
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
new file mode 100644
index 000000000..2ed23b652
--- /dev/null
+++ b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
@@ -0,0 +1,301 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+fmha_style_b2b_bmm kernel codegen for CUDA.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+#include "fmha_style_b2b_bmm/kernel_forward.h"
+
+namespace {
+// Hardcode these sizes for now until we get profiling ready.
+constexpr int kQueriesPerBlock = 64;
+constexpr int kKeysPerBlock = ({{n1}} <= 64 ? 64 : 128);
+constexpr bool kSingleValueIteration = ({{n1}} <= kKeysPerBlock);
+}  // end namespace
+
+{{func_signature}} {
+  using ElementOutput = {{elem_output_type}};
+  using ElementAccumulator = {{elem_accum_type}};
+  using ElementCompute = {{elem_input_type}};
+
+  using Attention = AttentionKernel<
+    ElementCompute,
+    ElementAccumulator,
+    cutlass::arch::Sm80,  // ArchTag
+    true,                 // Memory is aligned
+    kQueriesPerBlock,
+    kKeysPerBlock,
+    kSingleValueIteration,
+    {{activation_functor}}
+  >;
+
+  ElementAccumulator alpha0 = ElementAccumulator({{alpha0}});
+  ElementAccumulator alpha1 = ElementAccumulator({{alpha1}});
+
+  int64_t seq_length = m0;
+  int64_t seq_length_kv = {{n0}};
+  int64_t head_dim = k0;
+  int64_t head_dim_value = {{n1}};
+
+  typename Attention::Params p;
+  { // set parameters
+    p.query_ptr = static_cast<ElementCompute*>(query);
+    p.key_ptr = static_cast<ElementCompute*>(key);
+    p.value_ptr = static_cast<ElementCompute*>(value);
+    if (bias) {
+      p.attn_bias_ptr = static_cast<ElementCompute*>(bias);
+    }
+    p.output_accum_ptr = nullptr;
+    if (Attention::kNeedsOutputAccumulatorBuffer) {
+      p.output_accum_ptr = reinterpret_cast<ElementAccumulator*>(accum_ptr);
+    }
+    p.output_ptr = static_cast<ElementOutput*>(output);
+
+    p.scale = alpha0;
+    p.activation_scale = alpha1;
+
+    p.num_heads = {{num_heads}};
+    p.num_batches = batch_size;
+
+    p.head_dim = head_dim;
+    p.head_dim_value = head_dim_value;
+    p.seq_length = seq_length;
+    p.num_queries = seq_length;
+    p.num_keys = seq_length_kv;
+    p.causal_type = Attention::Params::{{causal_type}};
+
+    // All tensors are in BMHK shapes
+    p.q_strideH = head_dim;
+    p.k_strideH = head_dim;
+    p.v_strideH = head_dim_value;
+
+    p.q_strideM = p.q_strideH * p.num_heads;
+    p.k_strideM = p.k_strideH * p.num_heads;
+    p.v_strideM = p.v_strideH * p.num_heads;
+
+    p.q_strideB = p.q_strideM * seq_length;
+    p.k_strideB = p.k_strideM * seq_length_kv;
+    p.v_strideB = p.v_strideM * seq_length_kv;
+
+    int32_t bias_stride = {{n0}};
+    {% if bias_broadcast[2] %}
+    p.bias_strideM = 0;
+    {% else %}
+    p.bias_strideM = bias_stride;
+    bias_stride *= seq_length;
+    {% endif %}
+
+    {% if bias_broadcast[1] %}
+    p.bias_strideH = 0;
+    {% else %}
+    p.bias_strideH = bias_stride;
+    bias_stride *= p.num_heads;
+    {% endif %}
+
+    {% if bias_broadcast[0] %}
+    p.bias_strideB = 0;
+    {% else %}
+    p.bias_strideB = bias_stride;
+    {% endif %}
+  }
+
+  // launch kernel :)
+  constexpr auto kernel_fn = attention_kernel_batched_impl<Attention>;
+  int smem_bytes = sizeof(typename Attention::SharedStorage);
+  if (smem_bytes > 0xc000) {
+    auto result = cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    auto error_code = cudaGetLastError();
+    if (result != cudaSuccess) {
+        throw std::runtime_error(
+            "Failed to set attribute! Error: " + std::string(cudaGetErrorString(error_code)) +
+            ", error code: " + std::to_string(error_code)
+        );
+    }
+  }
+  if (!Attention::check_supported(p)) {
+    throw std::runtime_error(
+      std::string("Kernel does not support these inputs. ") +
+      "Function: {{function_name}}. " +
+      "m0: " + std::to_string(m0) +
+      ", k0: " + std::to_string(k0) +
+      ", n0: " + std::to_string({{n0}}) +
+      ", n1: " + std::to_string({{n1}}) + "."
+    );
+  }
+  kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(
+  void* output,
+  void* query,
+  void* key,
+  void* value,
+  void* bias,
+  void* accum_ptr,
+  int64_t batch_size,
+  int64_t m0,
+  int64_t k0,
+  cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+{{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
+{{indent}}    {{accum_ptr}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{m0}},
+{{indent}}    {{k0}},
+{{indent}}    stream
+{{indent}});
+    """
+)
+
+
+def _causal_type_to_kernel_str(causal_type: CausalType) -> str:
+    if causal_type == CausalType.NO_CAUSAL:
+        return "CausalType::NO_CAUSAL"
+    elif causal_type == CausalType.UPPER_RIGHT_EMPTY:
+        return "CausalType::UPPER_RIGHT_EMPTY"
+    elif causal_type == CausalType.LOWER_LEFT_EMPTY:
+        return "CausalType::LOWER_LEFT_EMPTY"
+    else:
+        raise RuntimeError(f"Unsupported causal type {causal_type=}")
+
+
+@registry.reg("cuda.fmha_style_b2b_bmm.gen_function")
+def fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    q, k, v = func_attrs["inputs"][0:3]
+
+    bias_broadcast = [False] * 4
+    if len(func_attrs["inputs"]) > 3:
+        bias = func_attrs["inputs"][3]
+        bias_broadcast = [var == IntImm(1) for var in bias.shape()]
+
+    n0 = k._attrs["shape"][1]
+    n1 = v._attrs["shape"][3]
+    if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
+        raise RuntimeError(
+            f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
+        )
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_accum_type = elem_input_type
+    if elem_input_type == "cutlass:half_t" and not Target.current()._kwargs.get(
+        "use_fp16_acc", False
+    ):
+        elem_accum_type = "float"
+
+    import cutlass_lib
+
+    activation_functor = cutlass_lib.library.EpilogueMathTag[
+        cutlass_lib.library.EpilogueMathName[func_attrs["epilogue_math_name"]]
+    ]
+
+    return FUNC_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        elem_accum_type=elem_accum_type,
+        n0=str(n0.value()),
+        n1=str(n1.value()),
+        causal_type=_causal_type_to_kernel_str(func_attrs["causal_type"]),
+        num_heads=str(func_attrs["num_heads"]),
+        alpha0=str(func_attrs["alpha0"]),
+        alpha1=str(func_attrs["alpha1"]),
+        activation_functor=activation_functor,
+        bias_broadcast=bias_broadcast,
+    )
+
+
+@registry.reg("cuda.fmha_style_b2b_bmm.func_decl")
+def fmha_style_b2b_bmm_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("cuda.fmha_style_b2b_bmm.func_call")
+def fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1, f"{len(func_attrs['outputs'])=} != 1"
+    assert len(func_attrs["inputs"]) in (
+        3,
+        4,
+    ), f"{len(func_attrs['inputs'])=} != 3 or 4"
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+
+    bias_name = "nullptr"
+    if len(func_attrs["inputs"]) == 4:
+        bias_name = func_attrs["inputs"][3]._attrs["name"]
+
+    q_shape = func_attrs["inputs"][0]._attrs["shape"]
+    batch_size = q_shape[0]._attrs["name"]
+    m0 = q_shape[1]._attrs["name"]
+    k0 = q_shape[3]._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        query=q_name,
+        key=k_name,
+        value=v_name,
+        bias=bias_name,
+        accum_ptr="global_workspace_",
+        batch_size=batch_size,
+        m0=m0,
+        k0=k0,
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
index 22f9a5d0d..b93f2c205 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
@@ -18,3 +18,4 @@
 """
 
 from aitemplate.compiler.ops.b2b_bmm.classic_b2b_bmm import classic_b2b_bmm
+from aitemplate.compiler.ops.b2b_bmm.fmha_style_b2b_bmm import fmha_style_b2b_bmm
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py b/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
new file mode 100644
index 000000000..7b275f1c0
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
@@ -0,0 +1,94 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Base class for back-to-back batched gemm fused kernels.
+Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) + bias))), V),
+
+where:
+Q: [B, M0, (H,) K0] (row_major),
+K: [B, N0, (H,) K0] (column_major),
+V: [B, N0, (H,) N1] (row_major),
+bias: [B, (H,) M0, N0] (row_major).
+Layouts are fixed for now.
+
+causal_masks have 3 types:
+NO_CAUSAL: no causal masks
+UPPER_RIGHT_EMPTY: the upper right triangular part of the matrix is 0
+LOWER_LEFT_EMPTY: the bottom left triangular part of the matrix is 0
+"""
+
+from enum import Enum
+
+from aitemplate.compiler.base import IntImm, IntVar, Operator
+from aitemplate.utils.alignment import find_max_alignment, get_alignments
+
+
+def _check_max_alignment(shape: IntVar, dtype: str, error_msg: str) -> None:
+    if not isinstance(shape, IntImm):
+        raise RuntimeError(f"{shape=} must be IntImm! ", error_msg)
+    res = find_max_alignment(shape.value(), dtype) == max(get_alignments(dtype))
+    if not res:
+        raise RuntimeError(
+            f"{shape=} does not satisfy {dtype=} max alignment requirements! ",
+            error_msg,
+        )
+
+
+class CausalType(Enum):
+    NO_CAUSAL = 0
+    UPPER_RIGHT_EMPTY = 1
+    LOWER_LEFT_EMPTY = 2
+
+
+class b2b_bmm_base(Operator):
+    def __init__(
+        self,
+        causal_type: CausalType,
+        epilogue_math_name: str,
+        alpha0: float,
+        alpha1: float,
+    ) -> None:
+        """Initialize classic_b2b_bmm op."""
+        super().__init__()
+        self._attrs["has_profiler"] = False
+        self._attrs["causal_type"] = causal_type
+        self._attrs["alpha0"] = alpha0
+        self._attrs["alpha1"] = alpha1
+
+        import cutlass_lib
+
+        if epilogue_math_name not in cutlass_lib.library.EpilogueMathName:
+            raise RuntimeError(
+                "Unsupported epilogue function! Please check "
+                "python/aitemplate/utils/mk_cutlass_lib/extra_enum.py for a list of supported epilogue functions."
+            )
+        self._attrs["epilogue_math_name"] = epilogue_math_name
+
+    def _check_alignment(self) -> None:
+        q, k, v = self._attrs["inputs"][0:3]
+        if (
+            q._attrs["dtype"] != k._attrs["dtype"]
+            or q._attrs["dtype"] != v._attrs["dtype"]
+        ):
+            raise RuntimeError(
+                "QKV dtypes must be the same! "
+                f"QKV dtypes: {q._attrs['dtype']=}, {k._attrs['dtype']=}, {v._attrs['dtype']=}"
+            )
+        dtype = q._attrs["dtype"]
+
+        _check_max_alignment(q._attrs["shape"][-1], dtype, f"{q._attrs['shape']=}")
+        _check_max_alignment(k._attrs["shape"][-1], dtype, f"{k._attrs['shape']=}")
+        _check_max_alignment(v._attrs["shape"][-1], dtype, f"{v._attrs['shape']=}")
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
index 85c18e866..cab3250e7 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
@@ -15,71 +15,45 @@
 
 """
 Back-to-back batched gemm fused kernel.
-Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) + beta0 * bias))), V),
+Computes bmm(causal_mask(alpha1 * (activation(alpha0 * bmm(Q, K) + bias))), V),
 
 where:
-Q: [B, M0, K0] (row_major), K: [B, N0, K0] (column_major), V: [B, N0, N1] (row_major), bias: [B, M0, N0] (row_major).
+Q: [B, M0, K0] (row_major),
+K: [B, N0, K0] (column_major),
+V: [B, N0, N1] (row_major),
+bias: [B, M0, N0] (row_major).
 Layouts are fixed for now.
 
-causal_masks can be disabled.
-When casual_masks is enabled, only the left bottom triangular part of the matrix is valid,
-and the other part is set to 0.
+Only supports NO_CAUSAL or LOWER_LEFT_EMPTY for now.
+When causal_mask is enabled, M0 must be equal to N0.
 
-Only supports M0 <= 512.
+Internally, it stores the results of Q@K in registers without writing them to shared memory, which is faster.
+However, N0 / N1 must be <= 512.
 """
 
 from aitemplate.backend import registry, target
-from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
-from aitemplate.utils.alignment import find_max_alignment, get_alignments
-
-
-def _check_max_alignment(shape: IntVar, dtype: str, error_msg: str) -> None:
-    if not isinstance(shape, IntImm):
-        raise RuntimeError(f"{shape=} must be IntImm! ", error_msg)
-    res = find_max_alignment(shape.value(), dtype) == max(get_alignments(dtype))
-    if not res:
-        raise RuntimeError(
-            f"{shape=} does not satisfy {dtype=} max alignment requirements! ",
-            error_msg,
-        )
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
 
 
-class classic_b2b_bmm(Operator):
+class classic_b2b_bmm(b2b_bmm_base):
     def __init__(
-        self, causal: bool, epilogue_math_name: str, alpha0: float, alpha1: float
+        self,
+        causal_type: CausalType,
+        epilogue_math_name: str,
+        alpha0: float,
+        alpha1: float,
     ) -> None:
         """Initialize classic_b2b_bmm op."""
-        super().__init__()
+        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1)
         self._attrs["op"] = "classic_b2b_bmm"
-        self._attrs["has_profiler"] = False
-        self._attrs["causal"] = causal
-        self._attrs["alpha0"] = alpha0
-        self._attrs["alpha1"] = alpha1
-
-        import cutlass_lib
-
-        if epilogue_math_name not in cutlass_lib.library.EpilogueMathName:
-            raise RuntimeError(
-                "Unsupported epilogue function! Please check "
-                "python/aitemplate/utils/mk_cutlass_lib/extra_enum.py for a list of supported epilogue functions."
-            )
-        self._attrs["epilogue_math_name"] = epilogue_math_name
-
-    def _check_alignment(self) -> None:
-        q, k, v, bias = self._attrs["inputs"]
         if (
-            q._attrs["dtype"] != k._attrs["dtype"]
-            or q._attrs["dtype"] != v._attrs["dtype"]
+            causal_type != CausalType.NO_CAUSAL
+            and causal_type != CausalType.LOWER_LEFT_EMPTY
         ):
-            raise RuntimeError(
-                "QKV dtypes must be the same! "
-                f"QKV dtypes: {q._attrs['dtype']=}, {k._attrs['dtype']=}, {v._attrs['dtype']=}"
+            raise NotImplementedError(
+                f"classic_b2b_bmm only supports NO_CAUSAL or LOWER_LEFT_EMPTY. Current causal type: {causal_type}"
             )
-        dtype = q._attrs["dtype"]
-
-        _check_max_alignment(q._attrs["shape"][2], dtype, f"{q._attrs['shape']=}")
-        _check_max_alignment(k._attrs["shape"][2], dtype, f"{k._attrs['shape']=}")
-        _check_max_alignment(v._attrs["shape"][2], dtype, f"{v._attrs['shape']=}")
 
     def _infer_shapes(self):
         """infer the output shape for classic_b2b_bmm."""
@@ -102,10 +76,6 @@ def _infer_shapes(self):
             )
         batch_size = q_shape[0]
         M0 = q_shape[1]
-        if M0.upper_bound() > 512:
-            raise RuntimeError(
-                f"classic_b2b_bmm only supports <=512 seq_length. Current length: {M0}"
-            )
         K0 = q_shape[2]
         if K0 != k_shape[2]:
             raise RuntimeError(
@@ -117,6 +87,15 @@ def _infer_shapes(self):
                 f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
         N1 = v_shape[2]
+        if N0.upper_bound() > 512 or N1.upper_bound() > 512:
+            raise RuntimeError(
+                f"classic_b2b_bmm only supports <=512 N0 / N1. Current length: {N0=}, {N1=}"
+            )
+        if self._attrs["causal_type"] != CausalType.NO_CAUSAL:
+            if M0 != N0:
+                raise RuntimeError(
+                    f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
+                )
 
         output_shape = [batch_size, M0, N1]
 
@@ -152,8 +131,8 @@ def __call__(
 
         self._attrs["inputs"] = [q, k, v, bias]
         self._set_depth()
-        self._check_alignment()
         output_shape = self._infer_shapes()
+        self._check_alignment()
         output = Tensor(
             output_shape,
             src_ops={self},
@@ -164,7 +143,7 @@ def __call__(
         return output
 
     def _get_op_attributes(self):
-        target_attrs = ["causal", "epilogue_math_name", "alpha0", "alpha1"]
+        target_attrs = ["causal_type", "epilogue_math_name", "alpha0", "alpha1"]
         attr = {}
 
         for target_attr in target_attrs:
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
new file mode 100644
index 000000000..aaecec9d6
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
@@ -0,0 +1,199 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Back-to-back batched gemm fused kernel, implemented in FMHA style.
+Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) [+ bias]))), V),
+
+where:
+Q: [B, M0, H, K0] (row_major),
+K: [B, N0, H, K0] (column_major),
+V: [B, N0, H, N1] (row_major),
+bias: [B, H, M0, N0] (row_major). Bias can be omitted.
+Layouts are fixed for now.
+
+causal_masks have 3 types:
+NO_CAUSAL: no causal masks
+UPPER_RIGHT_EMPTY: the upper right triangular part of the matrix is 0
+LOWER_LEFT_EMPTY: the bottom left triangular part of the matrix is 0
+When causal_masks is enabled, M0 must be equal to N0.
+
+Internally this implementation stores the results of Q@K in shared memory.
+It supports larger N0 / N1 compared to the classic_b2b_bmm implementation.
+"""
+
+from typing import Optional
+
+import numpy as np
+
+from aitemplate.backend import registry, target
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
+from aitemplate.utils import shape_utils
+
+
+class fmha_style_b2b_bmm(b2b_bmm_base):
+    def __init__(
+        self,
+        causal_type: CausalType,
+        epilogue_math_name: str,
+        alpha0: float,
+        alpha1: float,
+        num_heads: int,
+    ) -> None:
+        """Initialize fmha_style_b2b_bmm op."""
+        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1)
+        self._attrs["op"] = "fmha_style_b2b_bmm"
+        self._attrs["num_heads"] = num_heads
+        self._attrs["workspace"] = 0
+
+    def _infer_shapes(self):
+        """infer the output shape for classic_b2b_bmm."""
+        q, k, v = self._attrs["inputs"][0:3]
+        q_shape = q._attrs["shape"]
+        k_shape = k._attrs["shape"]
+        v_shape = v._attrs["shape"]
+        if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
+            raise RuntimeError(
+                f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if len(q_shape) != 4:
+            raise RuntimeError(
+                f"QKV must have rank == 4! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
+            raise RuntimeError(
+                f"QKV must have same batch size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if q_shape[2] != k_shape[2] or q_shape[2] != v_shape[2]:
+            raise RuntimeError(
+                f"QKV must have same head size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        batch_size = q_shape[0]
+        if q_shape[2] != IntImm(self._attrs["num_heads"]):
+            raise RuntimeError(
+                f"num_heads are not equal! {self._attrs['num_heads']=}, {q_shape[2]=}"
+            )
+        M0 = q_shape[1]
+        K0 = q_shape[3]
+        if K0 != k_shape[3]:
+            raise RuntimeError(
+                f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N0 = k_shape[1]
+        if N0 != v_shape[1]:
+            raise RuntimeError(
+                f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N1 = v_shape[3]
+
+        if self._attrs["causal_type"] != CausalType.NO_CAUSAL:
+            if M0 != N0:
+                raise RuntimeError(
+                    f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
+                )
+
+        head_size = IntImm(self._attrs["num_heads"])
+
+        output_shape = [batch_size, M0, head_size, N1]
+
+        if len(self._attrs["inputs"]) == 4:
+            bias = self._attrs["inputs"][3]
+            bias_shape = bias._attrs["shape"]
+            bias_expected_shape = [batch_size, head_size, M0, N0]
+            broadcastable, _ = shape_utils.get_broadcast_max_shape(
+                bias_shape, bias_expected_shape
+            )
+            if len(bias_shape) != 4:
+                raise RuntimeError(
+                    f"Expected bias rank 4. Current bias rank: {len(bias)}."
+                )
+            if not broadcastable:
+                raise RuntimeError(
+                    f"bias shape is not compatible with Q K! "
+                    f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
+                    f"bias shapes: {bias_shape=}, {bias_expected_shape=}."
+                )
+            if bias_shape[-1] != N0:
+                raise RuntimeError(
+                    f"Bias last dim is not broadcastable! Expected shape: {N0}, current bias shape: {bias_shape}"
+                )
+        return output_shape
+
+    def __call__(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        bias: Optional[Tensor] = None,
+    ) -> Tensor:
+        """call the op
+
+        Parameters
+        ----------
+        q: Tensor, shape(B, M0, H, K0)
+        k: Tensor, shape(B, N0, H, K0)
+        v: Tensor, shape(B, N0, H, N1)
+        bias: Tensor, shape(B, H, M0, N0), optional
+
+        Returns
+        ----------
+        Tensor, shape(B, H, M0, N1)
+        """
+
+        if bias is not None:
+            self._attrs["inputs"] = [q, k, v, bias]
+        else:
+            self._attrs["inputs"] = [q, k, v]
+        self._set_depth()
+        output_shape = self._infer_shapes()
+        self._check_alignment()
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
+        self._attrs["outputs"] = [output]
+        o_shape = [var.upper_bound() for var in output_shape]
+        if o_shape[-1] > 128:
+            self._attrs["workspace"] = 4 * np.prod(o_shape)
+
+        return output
+
+    def _get_op_attributes(self):
+        target_attrs = [
+            "causal_type",
+            "epilogue_math_name",
+            "alpha0",
+            "alpha1",
+            "num_heads",
+        ]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def gen_function(self) -> str:
+        """call backend functions"""
+        current_target = target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=current_target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index b871792b4..a3988c416 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -176,3 +176,18 @@ def get_shape(shape: List[IntVar], dim_to_value_dict: Dict[str, int]):
         for dim in shape
     ]
     return res
+
+
+def epilogue_math_name_to_torch_fn(epilogue_math_name: str) -> Callable[[Any], Any]:
+    if epilogue_math_name == "Identity":
+        return lambda x: x
+    elif epilogue_math_name == "Sigmoid":
+        return torch.sigmoid
+    elif epilogue_math_name == "SiLu":
+        return torch.nn.functional.silu
+    elif epilogue_math_name == "ReLu":
+        return torch.nn.functional.relu
+    elif epilogue_math_name == "Tanh":
+        return torch.nn.functional.tanh
+    else:
+        raise NotImplementedError(f"Unsupported {epilogue_math_name=}!")
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index 2385187b1..6547aaf7d 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -22,8 +22,10 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import epilogue_math_name_to_torch_fn
 from aitemplate.utils import shape_utils
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
@@ -31,6 +33,32 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+def _get_attn_mask_per_causal_type(
+    m: int, n: int, causal_type: CausalType, torch_dtype: str
+) -> torch.Tensor:
+    if causal_type == CausalType.NO_CAUSAL:
+        invalid_attn_mask = torch.ones((m, n), dtype=torch_dtype, device="cuda")
+    elif causal_type == CausalType.LOWER_LEFT_EMPTY:
+        invalid_attn_mask: torch.Tensor = 1.0 - torch.tril(
+            torch.ones(
+                (m, n),
+                dtype=torch.bool,
+                device="cuda",
+            )
+        ).fill_diagonal_(False).to(torch_dtype)
+    elif causal_type == CausalType.UPPER_RIGHT_EMPTY:
+        invalid_attn_mask: torch.Tensor = torch.tril(
+            torch.ones(
+                (m, n),
+                dtype=torch_dtype,
+                device="cuda",
+            )
+        )
+    else:
+        raise NotImplementedError(f"Unsupported {causal_type=}!")
+    return invalid_attn_mask
+
+
 @unittest.skipIf(
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
@@ -48,7 +76,7 @@ def _test_classic_b2b_bmm(
         n0=256,
         n1=256,
         epilogue_math_name="Identity",
-        causal=False,
+        causal_type=CausalType.NO_CAUSAL,
         dtype="float16",
         test_name="classic_b2b_bmm",
         copy_op=True,
@@ -87,7 +115,7 @@ def _test_classic_b2b_bmm(
             is_input=True,
         )
         classic_b2b_bmm_op = ops.classic_b2b_bmm(
-            causal=causal,
+            causal_type=causal_type,
             alpha0=alpha0,
             alpha1=alpha1,
             epilogue_math_name=epilogue_math_name,
@@ -114,24 +142,12 @@ def _test_classic_b2b_bmm(
 
             # Run PT reference.
             attn = alpha0 * (q_pt @ k_pt.transpose(-2, -1)) + bias_pt
-            if epilogue_math_name == "Identity":
-                pass
-            elif epilogue_math_name == "Sigmoid":
-                attn = torch.sigmoid(attn)
-            elif epilogue_math_name == "SiLu":
-                attn = torch.nn.functional.silu(attn)
-            else:
-                raise NotImplementedError(f"Unsupported {epilogue_math_name=}!")
+            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
             attn = alpha1 * attn
-            if causal:
-                invalid_attn_mask: torch.Tensor = 1.0 - torch.tril(
-                    torch.ones(
-                        (m, n0),
-                        dtype=torch.bool,
-                        device="cuda",
-                    )
-                ).fill_diagonal_(False).to(torch_dtype)
-                attn = attn * invalid_attn_mask
+            invalid_attn_mask = _get_attn_mask_per_causal_type(
+                m, n0, causal_type, torch_dtype
+            )
+            attn = attn * invalid_attn_mask
             output = attn @ v_pt
             y_pt = output.detach()
 
@@ -150,17 +166,26 @@ def test_classic_b2b_bmm_fp16(self):
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_basic",
             dtype="float16",
+            batch_sizes=1,
         )
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_dynamic_batch",
             dtype="float16",
             batch_sizes=[3, 8, 10],
         )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_rectangular",
+            dtype="float16",
+            batch_sizes=[2],
+            m=512,
+            n0=128,
+            n1=128,
+        )
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_causal",
             dtype="float16",
             batch_sizes=5,
-            causal=True,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
         )
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_sigmoid",
@@ -172,8 +197,222 @@ def test_classic_b2b_bmm_fp16(self):
             test_name="classic_b2b_bmm_fp16_complex",
             dtype="float16",
             batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+        )
+
+
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class FMHAStyleB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def _test_fmha_style_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        m=256,
+        k0=128,
+        n0=256,
+        n1=256,
+        num_heads=1,
+        has_bias=False,
+        bias_broadcast=None,
+        epilogue_math_name="Identity",
+        causal_type=CausalType.NO_CAUSAL,
+        dtype="float16",
+        test_name="fmha_style_b2b_bmm",
+        copy_op=True,
+        atol=1e-3,
+        rtol=1e-2,
+        use_fp16_acc=True,
+    ):
+        # Initialize AIT fmha_style_b2b_bmm operator.
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes]
+        alpha0 = 1.0 / (k0**0.5)
+        alpha1 = 1.0 / m
+        batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+
+        Q = Tensor(
+            shape=[batch_size_dim, m, num_heads, k0],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size_dim, n0, num_heads, k0],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size_dim, n0, num_heads, n1],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        Bias = None
+        if has_bias:
+            shape = [batch_size_dim, num_heads, m, n0]
+            if bias_broadcast:
+                for i, broadcast in enumerate(bias_broadcast):
+                    if broadcast:
+                        shape[i] = 1
+            Bias = Tensor(
+                shape=shape,
+                dtype=dtype,
+                name="bias",
+                is_input=True,
+            )
+        fmha_style_b2b_bmm_op = ops.fmha_style_b2b_bmm(
+            causal_type=causal_type,
+            alpha0=alpha0,
+            alpha1=alpha1,
+            epilogue_math_name=epilogue_math_name,
+            num_heads=num_heads,
+        )
+        if copy_op:
+            fmha_style_b2b_bmm_op = ops.fmha_style_b2b_bmm(
+                **fmha_style_b2b_bmm_op._get_op_attributes()
+            )
+        Y = fmha_style_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=use_fp16_acc)
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = use_fp16_acc
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        for batch_size in batch_sizes:
+            # Initialize inputs
+            q_pt = torch.rand(batch_size, m, num_heads, k0, dtype=torch_dtype).cuda()
+            k_pt = torch.rand(batch_size, n0, num_heads, k0, dtype=torch_dtype).cuda()
+            v_pt = torch.rand(batch_size, n0, num_heads, n1, dtype=torch_dtype).cuda()
+            shape = [batch_size, num_heads, m, n0]
+            if bias_broadcast:
+                for i, broadcast in enumerate(bias_broadcast):
+                    if broadcast:
+                        shape[i] = 1
+            bias_pt = torch.rand(shape, dtype=torch_dtype).cuda()
+
+            # Run PT reference.
+            attn = alpha0 * (
+                q_pt.transpose(1, 2) @ k_pt.transpose(1, 2).transpose(-2, -1)
+            )
+            if has_bias:
+                attn = attn + bias_pt
+            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
+            attn = alpha1 * attn
+            invalid_attn_mask = _get_attn_mask_per_causal_type(
+                m, n0, causal_type, torch_dtype
+            )
+            attn = attn * invalid_attn_mask
+            output = (attn @ v_pt.transpose(1, 2)).transpose(1, 2)
+            y_pt = output.detach()
+
+            # Run AIT.
+            inputs = {
+                "q": q_pt,
+                "k": k_pt,
+                "v": v_pt,
+            }
+            if has_bias:
+                inputs["bias"] = bias_pt
+            y = torch.empty(
+                [batch_size, m, num_heads, n1],
+                dtype=torch_dtype,
+                device="cuda",
+            )
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_fmha_style_b2b_bmm_fp16(self):
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_dynamic_batch",
+            dtype="float16",
+            batch_sizes=[3, 8, 10],
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_rectangular",
+            dtype="float16",
+            batch_sizes=[2],
+            m=512,
+            n0=128,
+            n1=128,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_causal_upper_right_empty",
+            dtype="float16",
+            batch_sizes=2,
+            causal_type=CausalType.UPPER_RIGHT_EMPTY,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_causal_lower_left_empty",
+            dtype="float16",
+            batch_sizes=3,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_bias",
+            dtype="float16",
+            batch_sizes=2,
+            has_bias=True,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_bias_broadcast",
+            dtype="float16",
+            batch_sizes=3,
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_sigmoid",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_multi_head",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            has_bias=True,
+            num_heads=2,
+            bias_broadcast=[True, True, True, False],
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_complex",
+            dtype="float16",
+            batch_sizes=[1, 4],
             epilogue_math_name="SiLu",
-            causal=True,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+            num_heads=4,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_complex_fp32_acc",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, False, True, False],
+            num_heads=2,
+            use_fp16_acc=False,
+            m=512,
+            n0=512,
         )
 
 
From baefeece029f4b4420a1ad9b36474dc8dfe7c027 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Mon, 3 Apr 2023 00:38:20 -0700
Subject: [PATCH 373/638] Mem efficient attention with variable lengths (#472)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/472

Modify `mem_eff_attention` op to handle variable sequence lengths: each element in the batch can specify different source and target sequence lengths.
Two backends are available:
- (A) pre-existing backend code relying on `kernel_forward.h` is based on [fused_multihead_attention_fixed_seqlen.cu](https://github.com/NVIDIA/cutlass/blob/209faf7b94ce4ba573d27389fb643962e75d0581/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu). It has been modified to accept variable lengths
- (B) new backend code based on [fused_multihead_attention_variable_seqlen.cu](https://github.com/NVIDIA/cutlass/blob/209faf7b94ce4ba573d27389fb643962e75d0581/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu) has been added. It uses grouped FHMA.

Currently the op expects inputs to be padded, e.g.
```
s1 0  0
s2 s2 s2
s3 s3 0
```
instead of  `s1 s2 s2 s2 s3 s3`. Source and target sequence lengths are passed into the op as tensors `lengths_kv` and `lengths_q`, each of containing `batch_size` elements.

Benchmarking has shown that the for this setup backend A is almost always faster than backend B (see the test plan). Because of this, the op uses backend A by default. Backend B can be turned on by passing `use_grouped_fmha=True` at op creation.

In the future we might consider switching to jagged tensors as representation of variable-length inputs Q, K, V. It's likely that in that case backend B would be faster for large enough inputs and then we can use profiler to find optimal backend for each input shape.

Reviewed By: ipiszy, terrychenism

Differential Revision: D44027012

fbshipit-source-id: a3b1c7152eff97e7effeddf9a27e655a3fe8d895
---
 .../cuda/attention/mem_eff_attention.py       |  553 ++++++++-
 .../ops/attention/mem_eff_attention.py        |   95 +-
 static/include/cuda_device_functions.h        |    3 +-
 .../mem_eff_attention/kernel_forward.h        |  952 +++++++++++++++
 static/include/model.h                        |    3 +-
 tests/unittest/ops/test_attention.py          | 1059 ++++++++++-------
 6 files changed, 2198 insertions(+), 467 deletions(-)
 create mode 100644 static/include/kernels/mem_eff_attention/kernel_forward.h

diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 37d349ee3..66d00a75b 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-attention kernel codegen for CUDA.
+Attention kernel codegen for CUDA.
 """
 from typing import Any, Dict
 
@@ -24,35 +24,51 @@
 
 # pylint: disable=C0301
 
-FUNC_TEMPLATE = jinja2.Template(
+CUDA_CHECK = """
+#ifndef CUDA_CHECK_ME_ATTN
+#define CUDA_CHECK_ME_ATTN(expr, msg)                                          \\
+  do {                                                                         \\
+    cudaError_t status = (expr);                                               \\
+    if (status != cudaSuccess) {                                               \\
+      std::cerr << msg << " at " << __FILE__ << ": " << __LINE__ << std::endl; \\
+      throw std::runtime_error(cudaGetErrorString(status));                    \\
+    }                                                                          \\
+  } while (0)
+#endif // CUDA_CHECK_ME_ATTN
+"""
+
+FUNC_TEMPLATE_KERNEL_FWD = jinja2.Template(
     """
 #include <iostream>
 #include <cuda_fp16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/default_gemm_configuration.h"
-// TODO: this include should be removed. There's a bug in CUTLASS, the
-// header containing cutlass::gemm::warp::WarpSize is not being included.
-// Until the fix is upstreamed, just inject it here instead.
-#include "cutlass/gemm/warp/mma.h"
 #include "gemm_kernel_utils.h"
-#include "kernel_forward.h"
+
+#include "mem_eff_attention/kernel_forward.h"
+
 
 using namespace gemm_kernel_utils;
 
+{{cuda_check}}
+
 {{func_signature}}
 {
 
     /*
+    The code is based on fused_multihead_attention_fixed_seqlen.cu example in CUTLASS repo:
+    https://github.com/NVIDIA/cutlass/blob/209faf7b94ce4ba573d27389fb643962e75d0581/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
+
     problem_sizes0 [b, m, n, k]
     [head_number * batch_size, m, mkv, k0]
-    [head_number * batch_size, seq_length, seq_length_kv, head_size]
+    [head_number * batch_size, seq_length_q, seq_length_kv, head_size]
 
     problem_sizes1
     [head_number * batch_size, m, k1, mkv]
-    [head_number * batch_size, seq_length, head_size_v, seq_length_kv]
+    [head_number * batch_size, seq_length_q, head_size_v, seq_length_kv]
 
-    m = seq_len
-    n = seq_len
+    m = seq_len_q
+    n = seq_len_kv
     k = head_size
 
     Q: B, M, K
@@ -60,11 +76,11 @@
     P: B, M, N
     V: B, N, K
     O: B, M, K
-    output: bs, num_head, seq_len, head_size
+    output: bs, seq_len_q, num_head, head_size
     */
 
 
-    using ArchTag = cutlass::arch::Sm80;
+    using ArchTag = cutlass::arch::Sm{{arch}};
     constexpr bool kIs64x64 = {{kIs64x64}};
     constexpr bool kSingleValueIteration = {{kSingleValueIteration}};
 
@@ -111,7 +127,6 @@
         kSingleValueIteration
     >;
 
-    int block_O_size = (*batch_size) * seq_len * num_heads * head_size_v;
     typename Attention::Params p;
     {
         // set parameters
@@ -120,8 +135,16 @@
         p.value_ptr = static_cast<{{elem_input_type}}*>(value);
         p.logsumexp_ptr = nullptr; // Only needed for bw
         p.output_accum_ptr = nullptr;
+
+        if (!fixed_seq_length_q) {
+            p.seqlens_q_ptr = lengths_q;
+        }
+        if (!fixed_seq_length_kv) {
+            p.seqlens_k_ptr = lengths_kv;
+        }
+
         if (Attention::kNeedsOutputAccumulatorBuffer) {
-          p.output_accum_ptr = accum_ptr;
+          p.output_accum_ptr = static_cast<float*>(workspace);
         }
         p.output_ptr = static_cast<{{elem_input_type}}*>(output);
 
@@ -129,7 +152,7 @@
         p.num_batches = *batch_size;
         p.head_dim = head_size;
         p.head_dim_value = head_size_v;
-        p.num_queries = seq_len;
+        p.num_queries = seq_len_q;
         p.num_keys = seq_len_kv;
         p.causal = is_causal;
 
@@ -138,14 +161,14 @@
         p.k_strideM = head_size;
         p.v_strideM = head_size_v;
 
-        p.q_strideH = p.q_strideM * seq_len;
+        p.q_strideH = p.q_strideM * seq_len_q;
         p.k_strideH = p.k_strideM * seq_len_kv;
         p.v_strideH = p.v_strideM * seq_len_kv;
         p.o_strideH = head_size_v;
         p.q_strideB = p.q_strideH * num_heads;
         p.k_strideB = p.k_strideH * num_heads;
         p.v_strideB = p.v_strideH * num_heads;
-        p.o_strideB = head_size_v * seq_len * num_heads;
+        p.o_strideB = head_size_v * seq_len_q * num_heads;
     }
 
     // launch kernel
@@ -173,22 +196,467 @@
 )
 
 
+FUNC_TEMPLATE_GROUPED_FMHA = jinja2.Template(
+    """
+#include <vector>
+#include <iostream>
+#include <cuda_fp16.h>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm_complex.h"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "gemm_kernel_utils.h"
+#include "cutlass/gemm/device/gemm_grouped.h"
+
+#include "cutlass/fast_math.h"
+
+#include "default_fmha_grouped.h"
+
+using namespace gemm_kernel_utils;
+
+{{cuda_check}}
+
+{{func_signature}}
+
+{
+  /*
+  The code is based on fused_multihead_attention_variable_seqlen.cu example in CUTLASS repo:
+  https://github.com/NVIDIA/cutlass/blob/209faf7b94ce4ba573d27389fb643962e75d0581/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
+
+  problem_sizes0 [b, m, n, k]
+  [head_number * batch_size, mq, mkv, k0]
+  [head_number * batch_size, seq_length_q, seq_length_kv, head_size]
+
+  problem_sizes1
+  [head_number * batch_size, mq, k1, mkv]
+  [head_number * batch_size, seq_length_q, head_size_v, seq_length_kv]
+
+  m = seq_len_q
+  n = seq_len_kv
+  k = head_size
+
+  Q: B, M, K
+  K: B, N, K
+  P: B, M, N
+  V: B, N, K
+  O: B, M, K
+  output: bs, seq_len_q, num_head, head_size
+
+  Note that the output shape is different from the CUTLASS example.
+  */
+  //
+  int problem_count = (*batch_size) * num_heads;
+
+  /////// Calculate offsets of FMHA arguments in the workspace //////
+
+  int used_memory = 0;
+  // Space for problem sizes for each problem
+  int size_problem_sizes = sizeof(cutlass::gemm::GemmCoord) * problem_count;
+  cutlass::gemm::GemmCoord* problem_sizes_device0 =
+      static_cast<cutlass::gemm::GemmCoord*>(workspace + used_memory);
+  used_memory += size_problem_sizes;
+  cutlass::gemm::GemmCoord* problem_sizes_device1 =
+      static_cast<cutlass::gemm::GemmCoord*>(workspace + used_memory);
+  used_memory += size_problem_sizes;
+  // Space for leading dimensions of tensors in each problem
+  int size_ld = sizeof(int64_t) * problem_count;
+  int64_t* ldq = static_cast<int64_t*>(workspace + used_memory);
+  used_memory += size_ld;
+  int64_t* ldk = static_cast<int64_t*>(workspace + used_memory);
+  used_memory += size_ld;
+  int64_t* ldv = static_cast<int64_t*>(workspace + used_memory);
+  used_memory += size_ld;
+  int64_t* ldo = static_cast<int64_t*>(workspace + used_memory);
+  used_memory += size_ld;
+
+  using ArchTag = cutlass::arch::Sm{{arch}};
+  constexpr bool kIs64x64 = {{kIs64x64}};
+  constexpr bool kSingleValueIteration = {{kSingleValueIteration}};
+
+  // Set grid size
+  constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32;
+  constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128;
+  if (kIs64x64 && head_size_v > kKeysPerBlock) {
+    std::cerr
+        << "WARNING: you will get better performance with `kIs64x64=false`";
+  }
+  if (kSingleValueIteration && head_size_v > kKeysPerBlock) {
+    std::cerr << "ERROR  : Use kSingleValueIteration to keep output in RF. "
+                 "This requires to have `head_size <= kKeysPerBlock` "
+                 "but head_size_v="
+              << head_size_v << " and kKeysPerBlock=" << kKeysPerBlock << "";
+    return;
+  }
+  if (!kSingleValueIteration && head_size_v <= kKeysPerBlock) {
+    std::cerr
+        << "WARNING: you will get better performance with `kSingleValueIteration=true` (keeps the output in RF rather than GMEM)";
+  }
+
+  using GemmType = DefaultGemmType<ArchTag, {{elem_input_type}}>;
+  using OpClass = typename GemmType::OpClass;
+  using DefaultConfig =
+      typename cutlass::gemm::device::DefaultGemmConfiguration<
+          OpClass,
+          ArchTag,
+          {{elem_input_type}},
+          {{elem_input_type}},
+          {{elem_input_type}}, // ElementC
+          float // ElementAccumulator
+          >;
+
+  // If the head_size already meets the alignment requirement, then
+  // it's safe to mark mem_align to be true to maximize the alignment
+  // benefit. Otherwise, assign false to it to use the minimal alignment.
+  constexpr const bool mem_align = ({{head_size}} % DefaultConfig::kAlignmentA == 0) &&
+      ({{head_size}} % DefaultConfig::kAlignmentB == 0);
+
+  cutlass::gemm::kernel::GroupScheduleMode const GroupScheduleMode_ =
+      cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly;
+
+  using AttentionKernel = typename cutlass::gemm::kernel::DefaultFMHAGrouped<
+      {{elem_input_type}}, // scalar_t
+      ArchTag,
+      mem_align,
+      kQueriesPerBlock,
+      kKeysPerBlock,
+      kSingleValueIteration,
+      GroupScheduleMode_>::FMHAKernel;
+  using Attention = cutlass::gemm::device::GemmGrouped<AttentionKernel>;
+
+  if (({{head_size}} % AttentionKernel::kAlignmentQ != 0) ||
+      ({{head_size}} % AttentionKernel::kAlignmentK != 0)) {
+    std::cerr << "Error at " << __FILE__ << ": " << __LINE__ <<
+        "head_size not aligned! head_size has to be divisible by " <<
+        std::to_string(AttentionKernel::kAlignmentQ) << " and " <<
+        std::to_string(AttentionKernel::kAlignmentK) + ", but got {{head_size}}."
+        << std::endl;
+    return;
+  }
+
+  // If we need a separate buffer for output accumulation
+  static bool const kNeedsOutputAccumulatorBuffer =
+      Attention::GemmKernel::kNeedsOutputAccumulatorBuffer;
+
+  // Problem sizes with actual sequence lengths
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1;
+  // Problem sizes with "full" sequence lengths
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0_full;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1_full;
+
+  problem_sizes0.reserve(problem_count);
+  problem_sizes1.reserve(problem_count);
+  problem_sizes0_full.reserve(problem_count);
+  problem_sizes1_full.reserve(problem_count);
+
+  // Copy sequence lengths from device to host, if they are not fixed
+  std::vector<int> mq_real_buf; // Target sequence lengths
+  std::vector<int> mkv_real_buf; // Source sequence lengths
+  if (!fixed_seq_length_q) {
+    mq_real_buf.resize(*batch_size);
+    CUDA_CHECK_ME_ATTN(
+      cudaMemcpyAsync(
+        mq_real_buf.data(), lengths_q, *batch_size * sizeof(int), cudaMemcpyDeviceToHost, stream),
+      "Error when copying target sequence lengths from device!");
+  }
+  if (!fixed_seq_length_kv) {
+    mkv_real_buf.resize(*batch_size);
+    CUDA_CHECK_ME_ATTN(
+      cudaMemcpyAsync(
+        mkv_real_buf.data(), lengths_kv,  *batch_size * sizeof(int), cudaMemcpyDeviceToHost, stream),
+        "Error when copying source sequence lengths from device!");
+  }
+  if (!fixed_seq_length_q || !fixed_seq_length_kv) {
+    CUDA_CHECK_ME_ATTN(cudaStreamSynchronize(stream),
+          "Error when synchronizing stream after copying sequence lengths from device!");
+  }
+
+  int mq_full = seq_len_q;
+  int mkv_full = seq_len_kv;
+
+  for (int i = 0; i < *batch_size; ++i) {
+    // Problems belonging to the same batch share the same seq len
+    // Source sequence length
+    int mkv_real = fixed_seq_length_kv ? mkv_full : mkv_real_buf.at(i);
+    // Target sequence length
+    int mq_real = fixed_seq_length_q ? mq_full : mq_real_buf.at(i);
+
+    int k0 = head_size;
+    int k1 = head_size_v;
+
+    // Create sizes of two GEMM problems for each of batch_size * num_heads attention problems
+    for (int j = 0; j < num_heads; ++j) {
+      cutlass::gemm::GemmCoord problem0(mq_real, mkv_real, k0);
+      cutlass::gemm::GemmCoord problem1(mq_real, k1, mkv_real);
+      problem_sizes0.push_back(problem0);
+      problem_sizes1.push_back(problem1);
+
+      cutlass::gemm::GemmCoord problem0_full(mq_full, mkv_full, k0);
+      cutlass::gemm::GemmCoord problem1_full(mq_full, k1, mkv_full);
+      problem_sizes0_full.push_back(problem0_full);
+      problem_sizes1_full.push_back(problem1_full);
+    }
+  }
+
+  // Move problem sizes to the device
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      problem_sizes_device0,
+      problem_sizes0.data(),
+      size_problem_sizes,
+      cudaMemcpyHostToDevice,
+      stream),
+    "Error when copying problem sizes 0 to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      problem_sizes_device1,
+      problem_sizes1.data(),
+      size_problem_sizes,
+      cudaMemcpyHostToDevice,
+      stream),
+    "Error when copying problem sizes 1 to device!");
+
+  // Offsets of input, buffer, and output matrices in memory
+  std::vector<int64_t> offset_Q_full;
+  std::vector<int64_t> offset_K_full;
+  std::vector<int64_t> offset_V_full;
+  std::vector<int64_t> offset_O_full;
+
+  // Leading dimensions of matrices of each problem
+  std::vector<int64_t> ldq_host;
+  std::vector<int64_t> ldk_host;
+  std::vector<int64_t> ldv_host;
+  std::vector<int64_t> ldo_host;
+  ldq_host.resize(problem_count);
+  ldk_host.resize(problem_count);
+  ldv_host.resize(problem_count);
+  ldo_host.resize(problem_count);
+
+  using scalar_t = typename Attention::GemmKernel::scalar_t;
+  using accum_t = typename Attention::GemmKernel::accum_t;
+  using output_t = typename Attention::GemmKernel::output_t;
+  using output_accum_t = typename Attention::GemmKernel::output_accum_t;
+
+  using ElementQ = scalar_t;
+  using ElementK = scalar_t;
+  using ElementP = accum_t;
+  using ElementAccumulator = accum_t;
+  using ElementV = scalar_t;
+  using ElementO = output_t;
+  using ElementOAccum = output_accum_t;
+
+  // Arrays of pointers to matrices for each problem
+  int size_ptrs = sizeof(ElementQ*) * problem_count;
+  ElementQ** ptr_Q = static_cast<ElementQ**>(workspace + used_memory);
+  used_memory += size_ptrs;
+  ElementK** ptr_K = static_cast<ElementK**>(workspace + used_memory);
+  used_memory += size_ptrs;
+  ElementV** ptr_V = static_cast<ElementV**>(workspace + used_memory);
+  used_memory += size_ptrs;
+  ElementO** ptr_O = static_cast<ElementO**>(workspace + used_memory);
+  used_memory += size_ptrs;
+  ElementOAccum** ptr_O_accumulate =
+      static_cast<ElementOAccum**>(workspace + used_memory);
+  used_memory += size_ptrs;
+
+  int64_t total_elements_Q_full = 0;
+  int64_t total_elements_K_full = 0;
+  int64_t total_elements_V_full = 0;
+  //int64_t total_elements_O_full = 0;
+  int64_t total_elements_O_at_batch_start = 0;
+
+  // Pointers to matrices and leading dimensions for each problem are first
+  // formed on the host and then copied to the device.
+
+  for (int32_t i_batch = 0; i_batch < *batch_size; ++i_batch) {
+    int64_t total_elements_O_in_current_batch = 0;
+    for (int32_t i_heads = 0; i_heads < num_heads; ++i_heads) {
+      int64_t i = i_batch * num_heads + i_heads;
+      auto problem0 = problem_sizes0.at(i);
+      auto problem1 = problem_sizes1.at(i);
+
+      auto problem0_full = problem_sizes0_full.at(i);
+      auto problem1_full = problem_sizes1_full.at(i);
+
+      /*
+      Below we specify leading dimensions of each matix, assuming the following
+      layouts and dimensions:
+
+      using LayoutQ = cutlass::layout::RowMajor;
+      using LayoutK = cutlass::layout::ColumnMajor;
+      using LayoutV = cutlass::layout::RowMajor;
+      using LayoutO = cutlass::layout::RowMajor;
+
+      ldq_host.at(i) = LayoutQ::packed({problem0.m(), problem0.k()}).stride(0);
+      ldk_host.at(i) = LayoutK::packed({problem0.k(), problem0.n()}).stride(0);
+      ldv_host.at(i) = LayoutV::packed({problem1.k(), problem1.n()}).stride(0);
+      ldo_host.at(i) = LayoutO::packed({problem1.m(), problem1.n()}).stride(0);
+      */
+
+      ldq_host.at(i) = problem0.k(); // K, rowmajor
+      ldk_host.at(i) = problem0.k(); // K, columnmajor
+      ldv_host.at(i) = problem1.n(); // K, rowmajor
+      // Since we want output in shape [b, seq_len_q, num_head, head_size] and
+      // not [b, num_head, seq_len_q, head_size], ldo is different from the
+      // CUTLASS example. Each next row of O is now separated from the previous
+      // one by head_size * num_heads, instead of just head_size.
+      ldo_host.at(i) = problem1.n() * num_heads; // K * num_heads, rowmajor
+
+      offset_Q_full.push_back(total_elements_Q_full);
+      offset_K_full.push_back(total_elements_K_full);
+      offset_V_full.push_back(total_elements_V_full);
+      // To write the output in shape [b, seq_len_q, num_head, head_size]
+      // instead of [b, num_head, seq_len_q, head_size], we place rows of O
+      // from the same batch but different heads at stride head_size from
+      // each other (and not seq_len_q * head_size).
+      offset_O_full.push_back(
+          total_elements_O_at_batch_start + i_heads * problem1_full.n());
+
+      int64_t elements_Q_full = problem0_full.m() * problem0_full.k();
+      int64_t elements_K_full = problem0_full.k() * problem0_full.n();
+      int64_t elements_V_full = problem1_full.k() * problem1_full.n();
+      int64_t elements_O_full = problem1_full.m() * problem1_full.n();
+
+      total_elements_Q_full += elements_Q_full;
+      total_elements_K_full += elements_K_full;
+      total_elements_V_full += elements_V_full;
+      total_elements_O_in_current_batch += elements_O_full;
+    }
+    total_elements_O_at_batch_start += total_elements_O_in_current_batch;
+  }
+
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(ldq, ldq_host.data(), size_ld, cudaMemcpyHostToDevice, stream),
+    "Error when copying leading dimensions of Q matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(ldk, ldk_host.data(), size_ld, cudaMemcpyHostToDevice, stream),
+    "Error when copying leading dimensions of K matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(ldv, ldv_host.data(), size_ld, cudaMemcpyHostToDevice, stream),
+    "Error when copying leading dimensions of V matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(ldo, ldo_host.data(), size_ld, cudaMemcpyHostToDevice, stream),
+    "Error when copying leading dimensions of O matrices to device!");
+
+  // Buffer for output accumulation, if necessary
+  float* accum_ptr = static_cast<float*>(workspace + used_memory);
+
+  std::vector<ElementQ*> ptr_Q_host(problem_count);
+  std::vector<ElementK*> ptr_K_host(problem_count);
+  std::vector<ElementV*> ptr_V_host(problem_count);
+  std::vector<ElementO*> ptr_O_host(problem_count);
+  std::vector<ElementOAccum*> ptr_O_accumulate_host(problem_count);
+
+  for (int32_t i = 0; i < problem_count; ++i) {
+    ptr_Q_host.at(i) = static_cast<ElementQ*>(query) + offset_Q_full.at(i);
+    ptr_K_host.at(i) = static_cast<ElementK*>(key) + offset_K_full.at(i);
+    ptr_V_host.at(i) = static_cast<ElementV*>(value) + offset_V_full.at(i);
+    ptr_O_host.at(i) = static_cast<ElementO*>(output) + offset_O_full.at(i);
+
+    if (kNeedsOutputAccumulatorBuffer) {
+      ptr_O_accumulate_host.at(i) =
+        static_cast<ElementOAccum*>(accum_ptr) + offset_O_full.at(i);
+    }
+  }
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      ptr_Q, ptr_Q_host.data(), size_ptrs, cudaMemcpyHostToDevice, stream),
+    "Error when copying pointers to Q matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      ptr_K, ptr_K_host.data(), size_ptrs, cudaMemcpyHostToDevice, stream),
+    "Error when copying pointers to K matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      ptr_V, ptr_V_host.data(), size_ptrs, cudaMemcpyHostToDevice, stream),
+    "Error when copying pointers to V matrices to device!");
+  CUDA_CHECK_ME_ATTN(
+    cudaMemcpyAsync(
+      ptr_O, ptr_O_host.data(), size_ptrs, cudaMemcpyHostToDevice, stream),
+    "Error when copying pointers to O matrices to device!");
+
+  if (kNeedsOutputAccumulatorBuffer) {
+    CUDA_CHECK_ME_ATTN(
+      cudaMemcpyAsync(
+        ptr_O_accumulate,
+        ptr_O_accumulate_host.data(),
+        size_ptrs,
+        cudaMemcpyHostToDevice,
+        stream),
+      "Error when copying pointers to accumulator buffers to device!");
+  }
+
+  int threadblock_count =
+      Attention::sufficient(problem_sizes1.data(), problem_count);
+  typename Attention::Arguments args(
+      problem_sizes_device0,
+      problem_sizes_device1,
+      problem_count,
+      threadblock_count,
+      ptr_Q,
+      ptr_K,
+      nullptr, // ptr_P isn't used by grouped FMHA
+      ptr_V,
+      ptr_O,
+      ptr_O_accumulate,
+      ldq,
+      ldk,
+      nullptr, // ldp isn't used by grouped FMHA
+      ldv,
+      ldo,
+      is_causal,
+      problem_sizes1.data());
+
+  Attention fmha;
+  cutlass::Status status = fmha.initialize(args, nullptr, stream);
+  if (status != cutlass::Status::kSuccess) {
+    std::cerr << "Failed to initialize CUTLASS Grouped FMHA kernel."
+              << std::endl;
+    return;
+  }
+
+  // Run the grouped FMHA object
+  status = fmha.run(stream);
+  if (status != cutlass::Status::kSuccess) {
+    std::cerr << "Failed to run CUTLASS Grouped FMHA kernel." << std::endl;
+    return;
+  }
+}
+
+    """
+)
+
+
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(void* output,
                    void* query,
                    void* key,
                    void* value,
-                   float* accum_ptr,
                    int64_t* batch_size,
-                   int seq_len,
                    int seq_len_kv,
+                   int seq_len_q,
                    int num_heads,
                    int head_size,
                    int head_size_v,
                    float p_dropout,
                    float softmax_scale,
                    bool is_causal,
+                   bool fixed_seq_length_kv,
+                   int32_t* lengths_kv,
+                   bool fixed_seq_length_q,
+                   int32_t* lengths_q,
+                   void* workspace,
                    cudaStream_t stream)
     """
 )
@@ -204,16 +672,21 @@
 {{indent}}{{func_name}}(
 {{indent}}    {{output}},
 {{indent}}    {{query}}, {{key}}, {{value}},
-{{indent}}    {{accum_ptr}},
 {{indent}}    {{batch_size}},
-{{indent}}    {{seq_len}},
 {{indent}}    {{seq_len_kv}},
+{{indent}}    {{seq_len_q}},
 {{indent}}    {{num_heads}},
 {{indent}}    {{head_size}},
 {{indent}}    {{head_size_v}},
 {{indent}}    {{p_dropout}},
 {{indent}}    {{softmax_scale}},
-{{indent}}    {{is_causal}}, stream /* default stream */
+{{indent}}    {{is_causal}},
+{{indent}}    {{fixed_seq_length_kv}},
+{{indent}}    {{lengths_kv}},
+{{indent}}    {{fixed_seq_length_q}},
+{{indent}}    {{lengths_q}},
+{{indent}}    global_workspace_,
+{{indent}}    stream /* default stream */
 {{indent}});
     """
 )
@@ -226,12 +699,19 @@ def mem_eff_attention_gen_function(func_attrs: Dict[str, Any]) -> str:
     elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
-    return FUNC_TEMPLATE.render(
+    if func_attrs["use_grouped_fmha"]:
+        func_template = FUNC_TEMPLATE_GROUPED_FMHA
+    else:
+        func_template = FUNC_TEMPLATE_KERNEL_FWD
+
+    return func_template.render(
         elem_input_type=elem_input_type,
         head_size=func_attrs["head_size"],
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         kIs64x64="true" if func_attrs["head_size"] <= 64 else "false",
         kSingleValueIteration="true" if func_attrs["head_size"] <= 128 else "false",
+        cuda_check=CUDA_CHECK,
+        arch=func_attrs["arch"],
     )
 
 
@@ -247,7 +727,7 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
     """the function for generating a function call for attention"""
     output_name = ""
     assert len(func_attrs["outputs"]) == 1
-    assert len(func_attrs["inputs"]) == 3
+    assert len(func_attrs["inputs"]) in [3, 4, 5]
 
     output_name = func_attrs["outputs"][0]._attrs["name"]
 
@@ -255,10 +735,24 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
     k_name = func_attrs["inputs"][1]._attrs["name"]
     v_name = func_attrs["inputs"][2]._attrs["name"]
 
+    variable_seq_length_kv = func_attrs["variable_seq_length_kv"]
+    variable_seq_length_q = func_attrs["variable_seq_length_q"]
+
+    lengths_name_kv = "nullptr"
+    lengths_name_q = "nullptr"
+
+    if variable_seq_length_kv:
+        assert len(func_attrs["inputs"]) > 3
+        lengths_name_kv = func_attrs["inputs"][3]._attrs["name"]
+    if variable_seq_length_q:
+        idx_len_q = 3 + variable_seq_length_kv
+        assert len(func_attrs["inputs"]) > idx_len_q
+        lengths_name_q = func_attrs["inputs"][idx_len_q]._attrs["name"]
+
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
     batch_size = "&" + xshape[0]._attrs["name"]
-    seq_len = x._attrs["shape"][2]._attrs["values"][0]
+    seq_len_q = x._attrs["shape"][2]._attrs["values"][0]
 
     num_heads = x._attrs["shape"][1]._attrs["values"][0]
     head_size = x._attrs["shape"][3]._attrs["values"][0]
@@ -276,15 +770,18 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
         query=q_name,
         key=k_name,
         value=v_name,
-        accum_ptr="reinterpret_cast<float*>(global_workspace_)",
         batch_size=batch_size,
-        seq_len=seq_len,
         seq_len_kv=seq_len_kv,
+        seq_len_q=seq_len_q,
         num_heads=num_heads,
         head_size=head_size,
         head_size_v=head_size_v,
         p_dropout=p_dropout,
         softmax_scale=softmax_scale,
         is_causal="true" if is_causal else "false",
+        fixed_seq_length_kv="false" if variable_seq_length_kv else "true",
+        lengths_kv=f"static_cast<int32_t*>({lengths_name_kv})",
+        fixed_seq_length_q="false" if variable_seq_length_q else "true",
+        lengths_q=f"static_cast<int32_t*>({lengths_name_q})",
         indent=indent,
     )
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index 8ac4f5fe8..cb9b86832 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -16,17 +16,20 @@
 Flash attention.
 """
 import itertools
+import logging
 from collections import OrderedDict
-from typing import List
+from typing import List, Optional, Tuple
 
 import jinja2
 import numpy as np
 
 from aitemplate import backend
 from aitemplate.backend import registry
-from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.base import IntVar, Operator, Tensor
 from aitemplate.utils import shape_utils
 
+_LOGGER = logging.getLogger(__name__)
+
 # pylint: disable=C0103,W0221,W0102,W0223
 
 SHAPE_FUNC_TEMPLATE = jinja2.Template(
@@ -58,7 +61,14 @@ class mem_eff_attention(Operator):
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
     """
 
-    def __init__(self, causal, dropout=0) -> None:
+    def __init__(
+        self,
+        causal,
+        dropout=0,
+        variable_seq_length_kv=False,
+        variable_seq_length_q=False,
+        use_grouped_fmha=False,
+    ) -> None:
         """Initialize attention module"""
         super().__init__()
         assert dropout == 0
@@ -66,8 +76,11 @@ def __init__(self, causal, dropout=0) -> None:
         self._attrs["has_profiler"] = False
         self._attrs["dropout"] = dropout
         self._attrs["causal"] = causal
+        self._attrs["variable_seq_length_kv"] = variable_seq_length_kv
+        self._attrs["variable_seq_length_q"] = variable_seq_length_q
         self._attrs["head_size"] = -1
         self._attrs["workspace"] = 0
+        self._attrs["use_grouped_fmha"] = use_grouped_fmha
         self.exec_key_template = EXEC_KEY_TEMPLATE
         self.shape_eval_template = SHAPE_FUNC_TEMPLATE
 
@@ -113,7 +126,14 @@ def unique(vector):
         ]
         return output_shape
 
-    def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+    def __call__(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        lengths_kv: Optional[Tensor] = None,
+        lengths_q: Optional[Tensor] = None,
+    ) -> Tensor:
         """call the op
 
         Parameters
@@ -131,13 +151,21 @@ def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
         self._attrs["head_size"] = head_size_v
 
         self._attrs["inputs"] = [q, k, v]
+        if self._attrs["variable_seq_length_kv"]:
+            assert lengths_kv is not None
+            self._attrs["inputs"].append(lengths_kv)
+        if self._attrs["variable_seq_length_q"]:
+            assert lengths_q is not None
+            self._attrs["inputs"].append(lengths_q)
         self._set_depth()
         self._extract_exec_path(q)
         output_shape = self._infer_shapes(q, v)
 
-        o_shape = [var._attrs["values"][-1] for var in output_shape]
-        if o_shape[-1] > 128:
-            self._attrs["workspace"] = 4 * np.prod(o_shape)
+        required_workspace_size = self._compute_required_workspace(
+            output_shape, q._attrs["shape"], k._attrs["shape"]
+        )
+        self._attrs["workspace"] = required_workspace_size
+        _LOGGER.debug(f"Required workspace size: {required_workspace_size}")
         output = Tensor(
             output_shape,
             src_ops={self},
@@ -146,6 +174,58 @@ def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
         self._attrs["outputs"] = [output]
         return output
 
+    def _compute_required_workspace(
+        self,
+        output_shape: Tuple[IntVar, IntVar, IntVar, IntVar],
+        q_shape: Tuple[IntVar, IntVar, IntVar, IntVar],
+        k_shape: Tuple[IntVar, IntVar, IntVar, IntVar],
+    ) -> int:
+        """
+        Compute workspace size required for attention op.
+        """
+        is_float32 = self._attrs["inputs"][0]._attrs["dtype"] not in [
+            "float16",
+            "bfloat16",
+        ]
+
+        o_shape = [var._attrs["values"][-1] for var in output_shape]
+        # We need a separate buffer of output accumulation
+        # - when the intermediate output can't fit into the register file.
+        # - when the accumulation type (float) is different from the output type.
+        # See https://github.com/NVIDIA/cutlass/blob/209faf7b94ce4ba573d27389fb643962e75d0581/examples/41_fused_multi_head_attention/fmha_grouped.h#L79-L95
+        needs_output_accum_buffer = (o_shape[-1] > 128) or not is_float32
+        if needs_output_accum_buffer:  # Needs output accumulator buffer
+            size_of_accum_element = 4  # Accumulation is always in float
+            accu_size = size_of_accum_element * np.prod(o_shape)
+        else:
+            accu_size = 0
+
+        # The backend which uses kernel_forward.h only needs accumulator buffer
+        if not self._attrs["use_grouped_fmha"]:
+            return accu_size
+
+        # Number of problems is batch_size * num_heads
+        problem_count = q_shape[0].upper_bound() * q_shape[1].upper_bound()
+
+        size_of_int = 4
+        size_of_int64 = 8
+        # GEMM size is specified by 3 ints: m, n, k
+        size_of_gemm_coord = 3 * size_of_int
+
+        # There are two GEMM sizes for each problem, corresponding to 2 matrix
+        # multiplications in attention
+        problem_sizes_size = 2 * size_of_gemm_coord * problem_count
+
+        # For each problem, need space for leading dimensions of 5 matrices:
+        # Q, K, V, O. Leading dimensions are in int64.
+        ld_sizes = 4 * size_of_int64 * problem_count
+
+        # For each problem, pointers to 5 matrices: Q, K, V, O, O_accum
+        size_of_ptr = 8  # 64-bit arch
+        ptrs_sizes = 5 * size_of_ptr * problem_count
+        total_size = problem_sizes_size + accu_size + ld_sizes + ptrs_sizes
+        return total_size
+
     def _get_op_attributes(self):
         target_attrs = ["causal"]
         attr = {}
@@ -176,6 +256,7 @@ def _extract_exec_path(self, x: Tensor):
     def gen_function(self) -> str:
         """call backend functions"""
         target = backend.target.Target.current()
+        self._attrs["arch"] = target._arch
         func_key = "{target}.{op}.gen_function".format(
             target=target.name(), op=self._attrs["op"]
         )
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 4db230ee4..b1505b47d 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -365,7 +365,8 @@ inline DeviceError GetLastError() {
 }
 
 inline std::string GetLastErrorString() {
-  return cudaGetErrorString(cudaGetLastError());
+  auto err = cudaGetLastError();
+  return cudaGetErrorString(err);
 }
 
 inline DeviceError StreamSynchronize(StreamType stream) {
diff --git a/static/include/kernels/mem_eff_attention/kernel_forward.h b/static/include/kernels/mem_eff_attention/kernel_forward.h
new file mode 100644
index 000000000..33f777cc7
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/kernel_forward.h
@@ -0,0 +1,952 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// This code has been adapted from
+// https://github.com/NVIDIA/cutlass/blob/77549ae6c8cf31c7ac4c8b88180a8708a8683da4/examples/41_fused_multi_head_attention/kernel_forward.h
+
+#pragma once
+
+#ifdef HAS_PYTORCH
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+#endif
+
+#include <cmath>
+#include <vector>
+
+#include "cutlass/bfloat16.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+
+#include "attention_scaling_coefs_updater.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+// From
+// fbcode/aitemplate/AITemplate/fb/3rdparty/cutlass/examples/41_fused_multi_head_attention/
+#include "debug_utils.h"
+#include "epilogue_pipelined.h"
+#include "epilogue_rescale_output.h"
+#include "find_default_mma.h"
+#include "gemm_kernel_utils.h"
+#include "mma_from_smem.h"
+
+#include <inttypes.h>
+
+using namespace gemm_kernel_utils;
+
+namespace {
+template <typename scalar_t, typename Arch>
+constexpr int getWarpsPerSm() {
+  return (
+      Arch::kMinComputeCapability >= 80 &&
+              !cutlass::platform::is_same<scalar_t, float>::value
+          ? 16
+          : 12);
+}
+} // namespace
+
+template <
+    // The datatype of Q/K/V
+    typename scalar_t_,
+    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
+    typename ArchTag,
+    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
+    bool isAligned_,
+    int kQueriesPerBlock,
+    int kKeysPerBlock,
+    bool kSingleValueIteration // = `value.shape[-1] <= kKeysPerBlock`
+    >
+struct AttentionKernel {
+  using scalar_t = scalar_t_;
+  using accum_t = float;
+  using lse_scalar_t = float;
+  using output_t = scalar_t;
+  // Accumulator between 2 iterations
+  // Using `accum_t` improves perf on f16 at the cost of
+  // numerical errors
+  using output_accum_t = accum_t;
+  static constexpr bool kIsAligned = isAligned_;
+  static constexpr int32_t kAlignLSE = 32; // block size of backward
+  static constexpr bool kPreloadV = ArchTag::kMinComputeCapability >= 80 &&
+      cutlass::sizeof_bits<scalar_t>::value == 16;
+  static constexpr bool kKeepOutputInRF = kSingleValueIteration;
+  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
+      !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  static_assert(kQueriesPerBlock % 32 == 0, "");
+  static_assert(kKeysPerBlock % 32 == 0, "");
+  static constexpr int kNumWarpsPerBlock =
+      kQueriesPerBlock * kKeysPerBlock / (32 * 32);
+  static constexpr int kWarpSize = 32;
+
+  // Launch bounds
+  static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock;
+  static constexpr int kMinBlocksPerSm =
+      getWarpsPerSm<scalar_t, ArchTag>() / kNumWarpsPerBlock;
+
+  struct Params {
+    // Input tensors
+    scalar_t* query_ptr; // [num_queries, num_heads, head_dim]
+    scalar_t* key_ptr; // [num_keys, num_heads, head_dim]
+    scalar_t* value_ptr; // [num_keys, num_heads, head_dim_value]
+    int32_t* seqlens_q_ptr = nullptr;
+    int32_t* seqlens_k_ptr = nullptr;
+
+    // Output tensors
+    output_t* output_ptr; // [num_queries, num_heads, head_dim_value]
+    output_accum_t*
+        output_accum_ptr; // [num_queries, num_heads, head_dim_value]
+    lse_scalar_t* logsumexp_ptr; // [num_heads, num_queries] - can be null
+
+    // Dimensions/strides
+    int32_t head_dim;
+    int32_t head_dim_value;
+    int32_t num_queries;
+    int32_t num_keys;
+
+    bool causal;
+
+    int32_t q_strideM;
+    int32_t k_strideM;
+    int32_t v_strideM;
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int32_t q_strideH;
+    int32_t k_strideH;
+    int32_t v_strideH;
+    int32_t o_strideH;
+    int64_t q_strideB;
+    int64_t k_strideB;
+    int64_t v_strideB;
+    int64_t o_strideB;
+    int32_t num_batches;
+    int32_t num_heads;
+
+    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
+      // Note: Note in sync with cutlass' main branch!! Make sure to apply
+      // when updating cutlass.
+      return head_dim_value * num_heads;
+    }
+
+    // Moves pointers to what we should process
+    // Returns "false" if there is no work to do
+    CUTLASS_DEVICE bool advance_to_block() {
+      auto batch_id = blockIdx.z;
+      auto head_id = blockIdx.y;
+      auto query_start = blockIdx.x * kQueriesPerBlock;
+
+      auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE;
+
+      // In case of variable target sequence lengths, get real sequence length
+      // for this batch
+      if (seqlens_q_ptr != nullptr) {
+        num_queries = seqlens_q_ptr[batch_id];
+        if (query_start >= num_queries) {
+          return false;
+        }
+      }
+      // In case of variable source sequence lengths, get real sequence length
+      // for this batch
+      if (seqlens_k_ptr != nullptr) {
+        num_keys = seqlens_k_ptr[batch_id];
+      }
+
+      query_ptr += batch_id * q_strideB;
+      key_ptr += batch_id * k_strideB;
+      value_ptr += batch_id * v_strideB;
+      output_ptr += batch_id * o_strideB;
+      if (output_accum_ptr != nullptr) {
+        output_accum_ptr += batch_id * o_strideB;
+      }
+
+      // Advance to the current batch / head / query_start
+      query_ptr += query_start * q_strideM + head_id * q_strideH;
+      key_ptr += head_id * k_strideH;
+      value_ptr += head_id * v_strideH;
+      output_ptr += int64_t(query_start) * o_strideM() + head_id * o_strideH;
+
+      if (output_accum_ptr != nullptr) {
+        output_accum_ptr +=
+            int64_t(query_start) * o_strideM() + head_id * o_strideH;
+      } else {
+        // Accumulate directly in the destination buffer (eg for f32)
+        output_accum_ptr = (accum_t*)output_ptr;
+      }
+      if (logsumexp_ptr != nullptr) {
+        // lse[batch_id, head_id, query_start]
+        logsumexp_ptr +=
+            batch_id * lse_dim * num_heads + head_id * lse_dim + query_start;
+      }
+
+      num_queries -= query_start;
+      if (causal) {
+        num_keys = cutlass::fast_min(
+            int32_t(query_start + kQueriesPerBlock), num_keys);
+      }
+      num_batches = 0; // no longer used after
+
+      // Make sure the compiler knows these variables are the same on all
+      // the threads of the warp.
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      output_ptr = warp_uniform(output_ptr);
+      output_accum_ptr = warp_uniform(output_accum_ptr);
+      logsumexp_ptr = warp_uniform(logsumexp_ptr);
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      head_dim = warp_uniform(head_dim);
+      head_dim_value = warp_uniform(head_dim_value);
+      return true;
+    }
+
+    __host__ dim3 getBlocksGrid() const {
+      return dim3(
+          ceil_div(num_queries, (int32_t)kQueriesPerBlock),
+          num_heads,
+          num_batches);
+    }
+    __host__ dim3 getThreadsGrid() const {
+      return dim3(kWarpSize, kNumWarpsPerBlock, 1);
+    }
+  };
+
+  struct MM0 {
+    /*
+      In this first matmul, we compute a block of `Q @ K.T`.
+      While the calculation result is still hot in registers, we update
+      `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value
+      into a shared-memory ("AccumulatorSharedStorage") that is used later as
+      operand A for the second matmul (see MM1)
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            scalar_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA =
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment;
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        cutlass::layout::ColumnMajor, // LayoutB,
+        kAlignmentB,
+        accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        OpClass,
+        ArchTag, // ArchTag
+        ThreadblockShape, // ThreadblockShape
+        WarpShape, // WarpShape
+        typename GemmType::InstructionShape, // InstructionShape
+        DefaultConfig::kStages, // Should use `DefaultConfig::kStages`, but that
+                                // uses too much smem
+        typename GemmType::Operator // Operator
+        >::DefaultMma;
+    using MmaCore = typename DefaultMma::MmaCore;
+    using IteratorA = typename DefaultMma::IteratorA;
+    using IteratorB = typename DefaultMma::IteratorB;
+    using Mma = typename DefaultMma::ThreadblockMma;
+    using ScalingCoefsUpdater = typename DefaultAttentionScalingCoefsUpdater<
+        typename Mma::Operator::IteratorC,
+        accum_t,
+        kWarpSize>::Updater;
+    static_assert(
+        MmaCore::WarpCount::kM * MmaCore::WarpCount::kN *
+                MmaCore::WarpCount::kK ==
+            kNumWarpsPerBlock,
+        "");
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MM1 {
+    /**
+      Second matmul: perform `attn @ V` where `attn` is the attention (not
+      normalized) and stored in shared memory
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            output_accum_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using LayoutB = cutlass::layout::RowMajor;
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        LayoutB, // LayoutB,
+        kAlignmentB,
+        output_accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            typename MM0::AccumulatorSharedStorage>;
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+    static_assert(
+        WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock,
+        "");
+
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_t>;
+    using OutputTileIteratorAccum =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_accum_t>;
+
+    struct SharedStorageMM1 {
+      typename Mma::SharedStorage mm;
+    };
+  };
+
+  static constexpr int64_t kAlignmentQ = MM0::kAlignmentA;
+  static constexpr int64_t kAlignmentK = MM0::kAlignmentB;
+  static constexpr int64_t kAlignmentV = 1;
+
+  // Shared storage - depends on kernel params
+  struct ScalingCoefs {
+    cutlass::Array<accum_t, kQueriesPerBlock> m_prime;
+    cutlass::Array<accum_t, kQueriesPerBlock> s_prime;
+    cutlass::Array<accum_t, kQueriesPerBlock> mi;
+  };
+
+  struct SharedStorageEpilogueAtEnd : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::SharedStorageMM1 mm1;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return epilogue;
+    }
+  };
+
+  struct SharedStorageEpilogueInLoop : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::SharedStorageMM1 mm1;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return after_mm0.epilogue;
+    }
+  };
+
+  using SharedStorage = typename cutlass::platform::conditional<
+      kSingleValueIteration || kKeepOutputInRF,
+      SharedStorageEpilogueAtEnd,
+      SharedStorageEpilogueInLoop>::type;
+
+  static bool __host__ check_supported(Params const& p) {
+    CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ);
+    CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK);
+    CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV);
+    XFORMERS_CHECK(
+        p.q_strideM % kAlignmentQ == 0, "query is not correctly aligned");
+    XFORMERS_CHECK(
+        p.k_strideM % kAlignmentK == 0, "key is not correctly aligned");
+    XFORMERS_CHECK(
+        p.v_strideM % kAlignmentV == 0, "value is not correctly aligned");
+    XFORMERS_CHECK(
+        p.q_strideH % kAlignmentQ == 0, "query is not correctly aligned");
+    XFORMERS_CHECK(
+        p.k_strideH % kAlignmentK == 0, "key is not correctly aligned");
+    XFORMERS_CHECK(
+        p.v_strideH % kAlignmentV == 0, "value is not correctly aligned");
+    return true;
+  }
+
+  static void CUTLASS_DEVICE attention_kernel(Params& p) {
+    // In this block, we will only ever:
+    // - read query[query_start:query_end, :]
+    // - write to output[query_start:query_end, :]
+
+    extern __shared__ char smem_buffer[];
+    SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
+    auto& m_prime = shared_storage.m_prime;
+    auto& s_prime = shared_storage.s_prime;
+    [[maybe_unused]] auto& si = shared_storage.after_mm0.si;
+    auto& mi = shared_storage.mi;
+
+    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
+    if (thread_id() < kQueriesPerBlock) {
+      s_prime[thread_id()] = accum_t(0);
+      m_prime[thread_id()] =
+          -cutlass::platform::numeric_limits<accum_t>::infinity();
+      mi[thread_id()] = -cutlass::platform::numeric_limits<accum_t>::infinity();
+    }
+    typename MM1::Mma::FragmentC accum_o;
+    accum_o.clear();
+
+    auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator {
+      using OutputTileIterator = typename MM1::OutputTileIterator;
+      return OutputTileIterator(
+          typename OutputTileIterator::Params{(int32_t)p.o_strideM()},
+          p.output_ptr,
+          typename OutputTileIterator::TensorCoord{
+              p.num_queries, p.head_dim_value},
+          thread_id(),
+          {0, col});
+    };
+
+    auto createOutputAccumIter = [&](int col) ->
+        typename MM1::OutputTileIteratorAccum {
+          using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
+          return OutputTileIteratorAccum(
+              typename OutputTileIteratorAccum::Params{(int32_t)p.o_strideM()},
+              p.output_accum_ptr,
+              typename OutputTileIteratorAccum::TensorCoord{
+                  p.num_queries, p.head_dim_value},
+              thread_id(),
+              {0, col});
+        };
+
+    // Iterate through keys
+    for (int32_t iter_key_start = 0; iter_key_start < p.num_keys;
+         iter_key_start += kKeysPerBlock) {
+      int32_t problem_size_0_m =
+          cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries);
+      int32_t problem_size_0_n = cutlass::fast_min(
+          int32_t(kKeysPerBlock), p.num_keys - iter_key_start);
+      int32_t const& problem_size_0_k = p.head_dim;
+      int32_t const& problem_size_1_n = p.head_dim_value;
+      int32_t const& problem_size_1_k = problem_size_0_n;
+
+      auto prologueV = [&](int blockN) {
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        MM1::Mma::prologue(
+            shared_storage.after_mm0.mm1.mm,
+            iterator_V,
+            thread_id(),
+            problem_size_1_k);
+      };
+
+      __syncthreads(); // Need to have shared memory initialized, and `m_prime`
+                       // updated from end of prev iter
+      //
+      // MATMUL: Q.K_t
+      //
+      // Computes the block-matrix product of:
+      // (a) query[query_start:query_end, :]
+      // with
+      // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
+      // and stores that into `shared_storage.si`
+      //
+
+      // Compute threadblock location
+      cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0};
+
+      cutlass::MatrixCoord tb_offset_A{
+          tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()};
+
+      cutlass::MatrixCoord tb_offset_B{
+          tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN};
+
+      // Construct iterators to A and B operands
+      typename MM0::IteratorA iterator_A(
+          typename MM0::IteratorA::Params(
+              typename MM0::MmaCore::LayoutA(p.q_strideM)),
+          p.query_ptr,
+          {problem_size_0_m, problem_size_0_k},
+          thread_id(),
+          tb_offset_A);
+
+      typename MM0::IteratorB iterator_B(
+          typename MM0::IteratorB::Params(
+              typename MM0::MmaCore::LayoutB(p.k_strideM)),
+          p.key_ptr + iter_key_start * p.k_strideM,
+          {problem_size_0_k, problem_size_0_n},
+          thread_id(),
+          tb_offset_B);
+
+      auto my_warp_id = warp_id();
+      auto my_lane_id = lane_id();
+
+      // Construct thread-scoped matrix multiply
+      typename MM0::Mma mma(
+          shared_storage.mm0, thread_id(), my_warp_id, my_lane_id);
+
+      typename MM0::Mma::FragmentC accum;
+
+      accum.clear();
+
+      auto gemm_k_iterations =
+          (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+      __syncthreads();
+
+      if (kPreloadV) {
+        prologueV(0);
+      }
+
+      typename MM0::Mma::Operator::IteratorC::TensorCoord
+          iteratorC_tile_offset = {
+              (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) +
+                  (my_warp_id % MM0::Mma::WarpCount::kM),
+              (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) +
+                  (my_warp_id / MM0::Mma::WarpCount::kM)};
+
+      // Mask out last if causal
+      if (p.causal && p.num_keys - iter_key_start <= kKeysPerBlock) {
+        auto query_start = blockIdx.x * kQueriesPerBlock;
+        auto lane_offset = MM0::ScalingCoefsUpdater::get_lane_offset(
+            lane_id(), warp_id(), iteratorC_tile_offset);
+        int32_t last_col;
+        MM0::ScalingCoefsUpdater::iterateRows(
+            lane_offset,
+            [&](int accum_m) {
+              last_col = query_start + accum_m - iter_key_start;
+            },
+            [&](int accum_m, int accum_n, int idx) {
+              if (accum_n > last_col) {
+                accum[idx] =
+                    -cutlass::platform::numeric_limits<accum_t>::infinity();
+              }
+            },
+            [&](int accum_m) {});
+      }
+      DISPATCH_BOOL(iter_key_start == 0, kIsFirst, ([&] {
+                      DISPATCH_BOOL(
+                          p.num_keys - iter_key_start >= kKeysPerBlock,
+                          kFullColumns,
+                          ([&] {
+                            // Update `mi` from accum stored in registers
+                            // Also updates `accum` with accum[i] <-
+                            // exp(accum[i] * scale
+                            // - mi)
+                            MM0::ScalingCoefsUpdater::update<
+                                kQueriesPerBlock,
+                                kFullColumns,
+                                kIsFirst,
+                                kKeepOutputInRF>(
+                                accum_o,
+                                accum,
+                                mi,
+                                m_prime,
+                                s_prime,
+                                lane_id(),
+                                thread_id(),
+                                warp_id(),
+                                p.num_keys - iter_key_start,
+                                iteratorC_tile_offset,
+                                1.0f / cutlass::fast_sqrt(float(p.head_dim)));
+                          }));
+                    }));
+
+      // Output results to shared-memory
+      int warp_idx_mn_0 = my_warp_id %
+          (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
+      auto output_tile_coords = cutlass::MatrixCoord{
+          warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
+          warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
+
+      MM0::B2bGemm::accumToSmem(
+          shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords);
+
+      __syncthreads();
+
+      //
+      // MATMUL: Attn . V
+      // Run the matmul `attn @ V` for a block of attn and V.
+      // `attn` is read from shared memory (in `shared_storage_si`)
+      // `V` is read from global memory (with iterator_B)
+      //
+
+      const int64_t nBlockN = kSingleValueIteration
+          ? 1
+          : ceil_div(
+                (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN));
+      for (int blockN = 0; blockN < nBlockN; ++blockN) {
+        int gemm_k_iterations =
+            (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK;
+
+        // Compute threadblock-scoped matrix multiply-add and store it in accum
+        // (in registers)
+        if (!kPreloadV) {
+          __syncthreads(); // we share shmem between mma and epilogue
+        }
+
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        typename MM1::Mma mma_pv(
+            shared_storage.after_mm0.mm1.mm,
+            shared_storage.after_mm0.si,
+            (int)thread_id(),
+            (int)warp_id(),
+            (int)lane_id(),
+            (int)problem_size_1_k);
+        mma_pv.set_prologue_done(kPreloadV);
+        if (!kKeepOutputInRF) {
+          accum_o.clear();
+        }
+        mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
+        __syncthreads();
+
+        if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) {
+          prologueV(blockN + 1);
+        }
+
+        if (!kKeepOutputInRF) {
+          DISPATCH_BOOL(
+              iter_key_start == 0, kIsFirst, ([&] {
+                DISPATCH_BOOL(
+                    (iter_key_start + kKeysPerBlock) >= p.num_keys,
+                    kIsLast,
+                    ([&] {
+                      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+                      using DefaultOp =
+                          typename MM1::DefaultConfig::EpilogueOutputOp;
+                      using ElementCompute = typename DefaultOp::ElementCompute;
+                      using EpilogueOutputOp = typename cutlass::epilogue::
+                          thread::MemoryEfficientAttentionNormalize<
+                              typename cutlass::platform::conditional<
+                                  kIsLast,
+                                  output_t,
+                                  output_accum_t>::type,
+                              output_accum_t,
+                              DefaultOp::kCount,
+                              typename DefaultOp::ElementAccumulator,
+                              ElementCompute,
+                              kIsFirst,
+                              kIsLast,
+                              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+                      using Epilogue = typename cutlass::epilogue::threadblock::
+                          EpiloguePipelined<
+                              typename DefaultEpilogue::Shape,
+                              typename MM1::Mma::Operator,
+                              DefaultEpilogue::kPartitionsK,
+                              typename cutlass::platform::conditional<
+                                  kIsLast,
+                                  typename MM1::OutputTileIterator,
+                                  typename MM1::OutputTileIteratorAccum>::type,
+                              typename DefaultEpilogue::
+                                  AccumulatorFragmentIterator,
+                              typename DefaultEpilogue::WarpTileIterator,
+                              typename DefaultEpilogue::SharedLoadIterator,
+                              EpilogueOutputOp,
+                              typename DefaultEpilogue::Padding,
+                              DefaultEpilogue::kFragmentsPerIteration,
+                              true, // IterationsUnroll
+                              typename MM1::OutputTileIteratorAccum // Read
+                                                                    // iterator
+                              >;
+
+                      int col = blockN * MM1::Mma::Shape::kN;
+                      auto source_iter = createOutputAccumIter(col);
+                      auto dest_iter = call_conditional<
+                          kIsLast,
+                          decltype(createOutputIter),
+                          decltype(createOutputAccumIter)>::
+                          apply(createOutputIter, createOutputAccumIter, col);
+                      EpilogueOutputOp rescale(s_prime, m_prime);
+                      Epilogue epilogue(
+                          shared_storage.epilogue_shared_storage(),
+                          thread_id(),
+                          warp_id(),
+                          lane_id());
+                      epilogue(rescale, dest_iter, accum_o, source_iter);
+                    }));
+              }));
+          if (!kSingleValueIteration) {
+            __syncthreads();
+          }
+        }
+      }
+      __syncthreads(); // we modify `m_prime` after
+    }
+
+    if (kKeepOutputInRF) {
+      constexpr bool kIsFirst = true;
+      constexpr bool kIsLast = true;
+      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+      using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+      using ElementCompute = typename DefaultOp::ElementCompute;
+      using EpilogueOutputOp =
+          typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize<
+              output_t, // output
+              output_accum_t, // source
+              DefaultOp::kCount,
+              typename DefaultOp::ElementAccumulator, // accum
+              output_accum_t, // compute
+              kIsFirst,
+              kIsLast,
+              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+      using Epilogue =
+          typename cutlass::epilogue::threadblock::EpiloguePipelined<
+              typename DefaultEpilogue::Shape,
+              typename MM1::Mma::Operator,
+              DefaultEpilogue::kPartitionsK,
+              typename MM1::OutputTileIterator, // destination
+              typename DefaultEpilogue::AccumulatorFragmentIterator,
+              typename DefaultEpilogue::WarpTileIterator,
+              typename DefaultEpilogue::SharedLoadIterator,
+              EpilogueOutputOp,
+              typename DefaultEpilogue::Padding,
+              DefaultEpilogue::kFragmentsPerIteration,
+              true, // IterationsUnroll
+              typename MM1::OutputTileIteratorAccum // source tile
+              >;
+      auto dest_iter = createOutputIter(0);
+      EpilogueOutputOp rescale(s_prime, m_prime);
+      Epilogue epilogue(
+          shared_storage.epilogue_shared_storage(),
+          thread_id(),
+          warp_id(),
+          lane_id());
+      epilogue(rescale, dest_iter, accum_o);
+    }
+
+    // 7. Calculate logsumexp
+    // To make the backward easier, we pad logsumexp with `inf`
+    // this avoids a few bound checks, and is not more expensive during fwd
+    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
+    if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) {
+      auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE;
+      if (thread_id() < p.num_queries) {
+        p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()]) +
+            cutlass::fast_log(accum_t(s_prime[thread_id()]));
+      } else if (thread_id() < lse_dim) {
+        p.logsumexp_ptr[thread_id()] =
+            cutlass::platform::numeric_limits<accum_t>::infinity();
+      }
+    }
+  }
+
+  static CUTLASS_DEVICE int8_t lane_id() {
+    return threadIdx.x;
+  }
+  static CUTLASS_DEVICE int8_t warp_id() {
+    return threadIdx.y;
+  }
+  static CUTLASS_DEVICE int16_t thread_id() {
+    return threadIdx.x + threadIdx.y * blockDim.x;
+  }
+};
+
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_batched_impl(typename AK::Params p) {
+  if (!p.advance_to_block()) {
+    return;
+  }
+  AK::attention_kernel(p);
+}
+
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_batched(typename AK::Params params);
+
+#define _ATTENTION_KERNEL_FORWARD_BEGIN(...)                                  \
+  template <>                                                                 \
+  __global__ void __launch_bounds__(                                          \
+      __VA_ARGS__::kNumThreads, __VA_ARGS__::kMinBlocksPerSm)                 \
+      attention_kernel_batched<__VA_ARGS__>(typename __VA_ARGS__::Params p) { \
+    using Kernel = __VA_ARGS__;
+#define _ATTENTION_KERNEL_FORWARD_END() }
+
+#ifdef __CUDA_ARCH__
+#define __CUDA_ARCH_OR_ZERO__ __CUDA_ARCH__
+#else
+#define __CUDA_ARCH_OR_ZERO__ 0
+#endif
+
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD(              \
+    ARCH,                                                  \
+    SCALAR_T,                                              \
+    IS_ALIGNED,                                            \
+    QUERIES_PER_BLOCK,                                     \
+    KEYS_PER_BLOCK,                                        \
+    SINGLE_VALUE_ITER)                                     \
+  _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel<         \
+                                  SCALAR_T,                \
+                                  cutlass::arch::Sm##ARCH, \
+                                  IS_ALIGNED,              \
+                                  QUERIES_PER_BLOCK,       \
+                                  KEYS_PER_BLOCK,          \
+                                  SINGLE_VALUE_ITER>)      \
+  if (!p.advance_to_block()) {                             \
+    return;                                                \
+  }                                                        \
+  Kernel::attention_kernel(p);                             \
+  _ATTENTION_KERNEL_FORWARD_END();
+
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(              \
+    ARCH,                                                           \
+    SCALAR_T,                                                       \
+    IS_ALIGNED,                                                     \
+    QUERIES_PER_BLOCK,                                              \
+    KEYS_PER_BLOCK,                                                 \
+    SINGLE_VALUE_ITER)                                              \
+  _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel<                  \
+                                  SCALAR_T,                         \
+                                  cutlass::arch::Sm##ARCH,          \
+                                  IS_ALIGNED,                       \
+                                  QUERIES_PER_BLOCK,                \
+                                  KEYS_PER_BLOCK,                   \
+                                  SINGLE_VALUE_ITER>)               \
+  printf(                                                           \
+      "FATAL: this function is for sm%d, but was built for sm%d\n", \
+      int(ARCH),                                                    \
+      int(__CUDA_ARCH_OR_ZERO__));                                  \
+  _ATTENTION_KERNEL_FORWARD_END();
+
+// All kernels are disabled by default
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(50, __VA_ARGS__)
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(70, __VA_ARGS__)
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(75, __VA_ARGS__)
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(80, __VA_ARGS__)
+
+// Enable the right one based on __CUDA_ARCH__
+#ifndef __CUDA_ARCH__
+#elif __CUDA_ARCH__ < 500
+#error "Need cuda arch at least 5.0"
+#elif __CUDA_ARCH__ < 700
+#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD(50, __VA_ARGS__)
+#elif __CUDA_ARCH__ < 750
+#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD(70, __VA_ARGS__)
+#elif __CUDA_ARCH__ < 800
+#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD(75, __VA_ARGS__)
+#elif __CUDA_ARCH__ >= 800
+#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80
+#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \
+  INSTANTIATE_ATTENTION_KERNEL_FORWARD(80, __VA_ARGS__)
+#endif
diff --git a/static/include/model.h b/static/include/model.h
index ac9b1c30e..7ccd9f3ad 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -22,7 +22,8 @@ namespace ait {
 inline void DeviceCheckLastError(const char* file, int line) {
   auto device_error = GetLastError();
   if (device_error != GetDeviceSuccess()) {
-    std::string msg = std::string("Got error: ") + GetLastErrorString() +
+    std::string msg = std::string("Got error: ") +
+        cudaGetErrorString(device_error) +
         " enum: " + std::to_string(device_error) + " at " + file + ": " +
         std::to_string(line);
     LOG(ERROR) << msg;
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 5a4bf54e0..1509d0371 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -15,6 +15,7 @@
 """
 Unittests for flash_attention Operator.
 """
+import itertools
 import logging
 import math
 import os
@@ -28,13 +29,16 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import benchmark_pt, detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_test_env,
+    filter_test_cases_by_params,
     get_random_torch_tensor,
+    TestEnv,
 )
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 from einops import rearrange, repeat
 
+from parameterized import parameterized
+
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -68,6 +72,21 @@ def index_first_axis(x, indices):
 
 
 def attention_ref(qkv, attn_mask, dropout_p, upcast=False, causal=False):
+    """
+    Reference implementation of scaled dot-product attention. For benchmarking
+    purposes, when possible we use torch.nn.functional.scaled_dot_product_attention,
+    which calls optimized mem.effient and flash attention kernels.
+    """
+    if True or (causal and attn_mask is not None):
+        # SDPA doesn't support causal and custom masks simultaneously,
+        # fall back on manual implementation
+        return attention_ref_math(
+            qkv, attn_mask, dropout_p, upcast=upcast, causal=causal
+        )
+    return attention_ref_sdpa(qkv, attn_mask, dropout_p, upcast=upcast, causal=causal)
+
+
+def attention_ref_sdpa(qkv, attn_mask, dropout_p, upcast=False, causal=False):
     """
     Arguments:
         qkv: (batch_size, seqlen, 3, nheads, head_dim)
@@ -78,10 +97,39 @@ def attention_ref(qkv, attn_mask, dropout_p, upcast=False, causal=False):
         attention: softmax after dropout
     """
     q, k, v = (qkv.float() if upcast else qkv).unbind(dim=2)
+    q = q.transpose(1, 2)  # to (batch_size, nheads, seqlen, head_dim)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    if attn_mask is not None:
+        # to (batch_size, nheads, seqlen, seqlen)
+        attn_mask = attn_mask.reshape(q.shape[0], 1, 1, q.shape[2])
+    output = torch.nn.functional.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=causal
+    )
+    output = output.transpose(1, 2)  # to (batch_size, seqlen, nheads, head_dim)
+    return output.to(dtype=qkv.dtype)
+
+
+def attention_ref_math(qkv, attn_mask, dropout_p, upcast=False, causal=False):
+    """
+    Arguments:
+        qkv: (batch_size, seqlen, 3, nheads, head_dim)
+        attn_mask: (batch_size, seqlen), or (batch_size, target_len, seqlen),
+            or (batch_size, nheads, target_len, seqlen), or broadcastable to that shape.
+        dropout_p: float
+    Output:
+        output: (batch_size, seqlen, nheads, head_dim)
+        attention: softmax after dropout
+    """
+    q, k, v = (qkv.float() if upcast else qkv).unbind(dim=2)
     seqlen = qkv.shape[1]
     d = qkv.shape[-1]
     scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
-    scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf"))
+    if len(attn_mask.shape) == 2:
+        attn_mask_expanded = rearrange(attn_mask, "b s -> b 1 1 s")
+    elif len(attn_mask.shape) == 3:
+        attn_mask_expanded = rearrange(attn_mask, "b t s -> b 1 t s")
+    scores.masked_fill_(~attn_mask_expanded, float("-inf"))
     if causal:
         causal_mask = torch.triu(
             torch.ones(seqlen, seqlen, dtype=torch.bool, device=qkv.device), 1
@@ -170,136 +218,138 @@ def _test_flash_attention(
         torch_dtype = string_to_torch_dtype(dtype)
         d = n // nheads
 
-        x = torch.randn(
-            batch_size,
-            seqlen,
-            n,
-            device="cuda",
-            dtype=torch_dtype,
-            requires_grad=True,
-        )
-        Wqkv = torch.nn.Linear(
-            nheads * d,
-            3 * nheads * d,
-            device=device,
-            dtype=torch_dtype,
-        )
+        with torch.no_grad():
+            x = torch.randn(
+                batch_size,
+                seqlen,
+                n,
+                device="cuda",
+                dtype=torch_dtype,
+            )
+            Wqkv = torch.nn.Linear(
+                nheads * d,
+                3 * nheads * d,
+                device=device,
+                dtype=torch_dtype,
+            )
 
-        lengths = torch.tensor(
-            [seqlen] * batch_size, dtype=torch.int, device="cuda"
-        ).reshape(-1, 1)
-        attention_mask_bool = (
-            repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
-            < lengths
-        )
-        attention_mask = torch.zeros(
-            batch_size,
-            seqlen,
-            device="cuda",
-            dtype=torch_dtype,
-        )
-        attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
+            lengths = torch.tensor(
+                [seqlen] * batch_size, dtype=torch.int, device="cuda"
+            ).reshape(-1, 1)
+            attention_mask_bool = (
+                repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
+                < lengths
+            )
+            attention_mask = torch.zeros(
+                batch_size,
+                seqlen,
+                device="cuda",
+                dtype=torch_dtype,
+            )
+            attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
 
-        x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
-            x, attention_mask_bool
-        )
-        qkv_unpad = (
-            rearrange(Wqkv(x_unpad), "nnz (t h d) -> nnz t h d", t=3, h=nheads)
-            .detach()
-            .requires_grad_()
-        )
-        qkv = (
-            rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
-            .detach()
-            .requires_grad_()
-        )
-        output = attention_ref(qkv, attention_mask_bool, dropout_p, causal=causal)
-        y_pt = output.detach()
+            x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
+                x, attention_mask_bool
+            )
+            qkv_unpad = rearrange(
+                Wqkv(x_unpad), "nnz (t h d) -> nnz t h d", t=3, h=nheads
+            )
+            qkv = rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
+            y_pt = attention_ref(qkv, attention_mask_bool, dropout_p, causal=causal)
 
-        total, _, num_heads, head_size = qkv_unpad.shape
+            total, _, num_heads, head_size = qkv_unpad.shape
 
-        X1 = Tensor(
-            shape=[total, 3, num_heads, head_size],
-            dtype=dtype,
-            name="qkv",
-            is_input=True,
-        )
-        X2 = Tensor(
-            shape=[batch_size + 1],
-            dtype="int32",
-            name="cu_seqlens",
-            is_input=True,
-        )
+            X1 = Tensor(
+                shape=[total, 3, num_heads, head_size],
+                dtype=dtype,
+                name="qkv",
+                is_input=True,
+            )
+            X2 = Tensor(
+                shape=[batch_size + 1],
+                dtype="int32",
+                name="cu_seqlens",
+                is_input=True,
+            )
 
-        flash_attention_op = ops.flash_attention(
-            batch_size=batch_size,
-            dropout=dropout_p,
-            max_seq_len=max_seqlen_in_batch,
-            causal=causal,
-        )
-        if copy_op:
             flash_attention_op = ops.flash_attention(
-                **flash_attention_op._get_op_attributes()
+                batch_size=batch_size,
+                dropout=dropout_p,
+                max_seq_len=max_seqlen_in_batch,
+                causal=causal,
+            )
+            if copy_op:
+                flash_attention_op = ops.flash_attention(
+                    **flash_attention_op._get_op_attributes()
+                )
+            Y = flash_attention_op(X1, X2)
+            Y._attrs["is_output"] = True
+            Y._attrs["name"] = "output"
+
+            if rebuild:
+                target = detect_target()
+                module = compile_model(Y, target, "./tmp", test_name)
+            else:
+                module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+            x1 = qkv_unpad.to(torch_dtype).cuda()
+            x2 = cu_seqlens.to(torch.int32).cuda()
+            inputs = {"qkv": x1, "cu_seqlens": x2}
+            y = torch.empty(
+                [total, num_heads, head_size],
+                dtype=torch_dtype,
+                device="cuda",
             )
-        Y = flash_attention_op(X1, X2)
-        Y._attrs["is_output"] = True
-        Y._attrs["name"] = "output"
-
-        if rebuild:
-            target = detect_target()
-            module = compile_model(Y, target, "./tmp", test_name)
-        else:
-            module = Model(os.path.join("./tmp", test_name, "test.so"))
-
-        x1 = qkv_unpad.detach().to(torch_dtype).cuda()
-        x2 = cu_seqlens.detach().to(torch.int32).cuda()
-        inputs = {"qkv": x1, "cu_seqlens": x2}
-        y = torch.empty(
-            [total, num_heads, head_size],
-            dtype=torch_dtype,
-            device="cuda",
-        )
-        module.run_with_tensors(inputs, [y])
-
-        # Warm up.
-        for _ in range(5):
             module.run_with_tensors(inputs, [y])
-        # Benchmark.
-        time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
-            inputs,
-            [y],
-            count=100,
-        )
-        _LOGGER.info(f"benchmark flash-attn time: {time_per_iter_ms}")
-
-        y = y.reshape((batch_size, -1, nheads, d))
-        torch.testing.assert_close(y, y_pt, atol=1e-3, rtol=1e-3)
-
-        if benchmark_pt:
-            from aitemplate.testing.benchmark_pt import benchmark_torch_function
 
-            func = attention_ref
-            args = (
-                qkv.to(torch_dtype).cuda(),
-                attention_mask_bool.cuda(),
-                dropout_p,
-                False,
-                False,
-            )
-            duration = benchmark_torch_function(100, func, *args)
-            print(
-                f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors(inputs, [y])
+            # Benchmark.
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                inputs,
+                [y],
+                count=100,
             )
-
-    def test_flash_attention_sm80(self):
+            _LOGGER.info(f"benchmark flash-attn time: {time_per_iter_ms}")
+
+            y = y.reshape((batch_size, -1, nheads, d))
+            torch.testing.assert_close(y, y_pt, atol=1e-3, rtol=1e-3)
+
+            if benchmark_pt:
+                from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+                func = attention_ref
+                args = (
+                    qkv.to(torch_dtype).cuda(),
+                    attention_mask_bool.cuda(),
+                    dropout_p,
+                    False,
+                    False,
+                )
+                duration = benchmark_torch_function(100, func, *args)
+                print(
+                    f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
+                )
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                # Flash attention requires A100
+                TestEnv.CUDA_SM80: [("float16")],
+            }
+        ),
+        skip_on_empty=True,
+    )
+    def test_flash_attention(self, dtype):
         self._test_flash_attention(
-            test_name="flash_attention_fp16",
-            dtype="float16",
+            test_name=f"flash_attention_{dtype}",
+            dtype=dtype,
         )
         self._test_flash_attention(
-            test_name="flash_attention_fp16_copy_op",
+            test_name=f"flash_attention_{dtype}_copy_op",
             copy_op=True,
-            dtype="float16",
+            dtype=dtype,
         )
 
     def _test_attention(
@@ -393,10 +443,18 @@ def _test_attention(
             )
             _LOGGER.info(f"benchmark compiler model time: {time_per_iter_ms}")
 
-    def test_attention_rocm(self):
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.ROCM: [("float16")],
+            }
+        ),
+        skip_on_empty=True,
+    )
+    def test_attention_rocm(self, dtype):
         self._test_attention(
-            test_name="attention_fp16",
-            dtype="float16",
+            test_name=f"attention_{dtype}",
+            dtype=dtype,
         )
 
     def _test_mem_eff_attention(
@@ -415,223 +473,380 @@ def _test_mem_eff_attention(
         benchmark_pt=False,
         copy_op=False,
         use_perm=True,
+        variable_seq_length_kv=False,
+        variable_seq_length_q=False,
+        skip_pt=False,
+        use_grouped_fmha=False,
         atol=1e-3,
         rtol=1e-3,
     ):
+        """
+        Use skip_pt to avoid CUDA OOM when benchmarking with problem sizes
+        which are too large for the PT implementation.
+        """
+        # Can't skip PT computation if we are benchmarking it
+        assert not (benchmark_pt and skip_pt)
+
         torch_dtype = string_to_torch_dtype(dtype)
         d = n // nheads
 
-        x = torch.randn(
-            batch_size,
-            seqlen,
-            n,
-            device="cuda",
-            dtype=torch_dtype,
-            requires_grad=True,
-        )
-        Wqkv = torch.nn.Linear(
-            nheads * d,
-            3 * nheads * d,
-            device=device,
-            dtype=torch_dtype,
-        )
-
-        lengths = torch.tensor(
-            [seqlen] * batch_size, dtype=torch.int, device="cuda"
-        ).reshape(-1, 1)
-        attention_mask_bool = (
-            repeat(torch.arange(seqlen, device="cuda"), "s -> b s", b=batch_size)
-            < lengths
-        )
-        attention_mask = torch.zeros(
-            batch_size,
-            seqlen,
-            device="cuda",
-            dtype=torch_dtype,
-        )
-        attention_mask = rearrange(attention_mask, "b s -> b 1 1 s")
-
-        x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
-            x, attention_mask_bool
-        )
-        qkv_unpad = (
-            rearrange(Wqkv(x_unpad), "nnz (t h d) -> nnz t h d", t=3, h=nheads)
-            .detach()
-            .requires_grad_()
-        )
-        qkv = (
-            rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
-            .detach()
-            .requires_grad_()
-        )
-        q, k, v = torch.split(qkv, 1, dim=2)
-        output = attention_ref(qkv, attention_mask_bool, dropout_p, causal=causal)
-        y_pt = output.detach()
+        with torch.no_grad():
+            x = torch.randn(
+                batch_size,
+                seqlen,
+                n,
+                device="cuda",
+                dtype=torch_dtype,
+            )
+            Wqkv = torch.nn.Linear(
+                nheads * d,
+                3 * nheads * d,
+                device=device,
+                dtype=torch_dtype,
+            )
 
-        total, _, num_heads, head_size = qkv_unpad.shape
+            if variable_seq_length_kv:
+                lengths_kv = torch.randint(0, seqlen + 1, size=(batch_size, 1))
+                lengths_kv = lengths_kv.to(device="cuda")
+            else:
+                lengths_kv = torch.tensor(
+                    [seqlen] * batch_size, dtype=torch.int, device="cuda"
+                ).reshape(-1, 1)
+
+            if variable_seq_length_q:
+                lengths_q = torch.randint(0, seqlen + 1, size=(batch_size, 1))
+                lengths_q = lengths_q.to(device="cuda")
+            else:
+                lengths_q = torch.tensor(
+                    [seqlen] * batch_size, dtype=torch.int, device="cuda"
+                ).reshape(-1, 1)
+
+            seq_range = torch.arange(seqlen, device="cuda")
+            attention_mask_bool_kv = (
+                seq_range.unsqueeze(0).expand((batch_size, seqlen)) < lengths_kv
+            ).unsqueeze(1)
+
+            attention_mask_bool_q = (
+                seq_range.unsqueeze(0).expand((batch_size, seqlen)) < lengths_q
+            ).unsqueeze(2)
+
+            x_unpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
+                x, attention_mask_bool_kv
+            )
+            qkv_unpad = rearrange(
+                Wqkv(x_unpad), "nnz (t h d) -> nnz t h d", t=3, h=nheads
+            )
+            qkv = rearrange(Wqkv(x), "b s (t h d) -> b s t h d", t=3, h=nheads)
+            q, k, v = torch.split(qkv, 1, dim=2)
+            if not skip_pt:
+                y_pt = attention_ref(
+                    qkv, attention_mask_bool_kv, dropout_p, causal=causal
+                )
+
+            total, _, num_heads, head_size = qkv_unpad.shape
+
+            Q = Tensor(
+                shape=[batch_size, num_heads, seqlen, head_size],
+                dtype=dtype,
+                name="q",
+                is_input=True,
+            )
+            K = Tensor(
+                shape=[batch_size, num_heads, seqlen, head_size],
+                dtype=dtype,
+                name="k",
+                is_input=True,
+            )
+            V = Tensor(
+                shape=[batch_size, num_heads, seqlen, head_size],
+                dtype=dtype,
+                name="v",
+                is_input=True,
+            )
 
-        Q = Tensor(
-            shape=[batch_size, num_heads, seqlen, head_size],
-            dtype=dtype,
-            name="q",
-            is_input=True,
-        )
-        K = Tensor(
-            shape=[batch_size, num_heads, seqlen, head_size],
-            dtype=dtype,
-            name="k",
-            is_input=True,
-        )
-        V = Tensor(
-            shape=[batch_size, num_heads, seqlen, head_size],
-            dtype=dtype,
-            name="v",
-            is_input=True,
-        )
+            if variable_seq_length_kv:
+                L_kv = Tensor(
+                    shape=[batch_size, 1],
+                    dtype="int",
+                    name="lengths_kv",
+                    is_input=True,
+                )
+            if variable_seq_length_q:
+                L_q = Tensor(
+                    shape=[batch_size, 1],
+                    dtype="int",
+                    name="lengths_q",
+                    is_input=True,
+                )
 
-        mem_eff_attention_op = ops.mem_eff_attention(
-            causal=causal,
-        )
-        if copy_op:
             mem_eff_attention_op = ops.mem_eff_attention(
-                **mem_eff_attention_op._get_op_attributes()
+                causal=causal,
+                variable_seq_length_kv=variable_seq_length_kv,
+                variable_seq_length_q=variable_seq_length_q,
+                use_grouped_fmha=use_grouped_fmha,
             )
-
-        Y = mem_eff_attention_op(Q, K, V)
-
-        Y._attrs["is_output"] = True
-        Y._attrs["name"] = "output"
-
-        if rebuild:
-            target = detect_target()
-            module = compile_model(Y, target, "./tmp", test_name)
-        else:
-            module = Model(os.path.join("./tmp", test_name, "test.so"))
-
-        q = torch.permute(q, (0, 3, 2, 1, 4)).reshape(
-            batch_size, num_heads, seqlen, head_size
-        )
-        k = torch.permute(k, (0, 3, 2, 1, 4)).reshape(
-            batch_size, num_heads, seqlen, head_size
-        )
-        v = torch.permute(v, (0, 3, 2, 1, 4)).reshape(
-            batch_size, num_heads, seqlen, head_size
-        )
-
-        inputs = {
-            "q": q.detach().to(torch_dtype).cuda().contiguous(),
-            "k": k.detach().to(torch_dtype).cuda().contiguous(),
-            "v": v.detach().to(torch_dtype).cuda().contiguous(),
-        }
-
-        y = torch.empty(
-            [batch_size, seqlen, num_heads, head_size],
-            dtype=torch_dtype,
-            device="cuda",
-        )
-        module.run_with_tensors(inputs, [y])
-
-        if benchmark_ait:
-            # Warm up.
-            for _ in range(5):
-                module.run_with_tensors(inputs, [y])
-            # Benchmark AIT
-            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
-                inputs,
-                [y],
-                count=100,
+            if copy_op:
+                mem_eff_attention_op = ops.mem_eff_attention(
+                    **mem_eff_attention_op._get_op_attributes()
+                )
+
+            Y = mem_eff_attention_op(
+                Q,
+                K,
+                V,
+                L_kv if variable_seq_length_kv else None,
+                L_q if variable_seq_length_q else None,
             )
-            _LOGGER.info(f"benchmark eff-mem-attn time: {time_per_iter_ms}")
 
-        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+            Y._attrs["is_output"] = True
+            Y._attrs["name"] = "output"
 
-        if benchmark_pt:
-            from aitemplate.testing.benchmark_pt import benchmark_torch_function
+            if rebuild:
+                target = detect_target()
+                module = compile_model(Y, target, "./tmp", test_name)
+            else:
+                module = Model(os.path.join("./tmp", test_name, "test.so"))
 
-            func = attention_ref
-            args = (
-                qkv.to(torch_dtype).cuda(),
-                attention_mask_bool.cuda(),
-                dropout_p,
-                False,
-                False,
+            q = torch.permute(q, (0, 3, 2, 1, 4)).reshape(
+                batch_size, num_heads, seqlen, head_size
             )
-            duration = benchmark_torch_function(100, func, *args)
-            print(
-                f"PT:  BS: {batch_size}, Time per iter: {duration:.2f}ms, QPS: {batch_size / duration:.2f}"
+            k = torch.permute(k, (0, 3, 2, 1, 4)).reshape(
+                batch_size, num_heads, seqlen, head_size
             )
-
-    def test_mem_eff_attention_fp16_sm80(self):
-        for use_perm in [False, True]:
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                test_name=f"mem_eff_attention_fp16_{use_perm}",
-                dtype="float16",
+            v = torch.permute(v, (0, 3, 2, 1, 4)).reshape(
+                batch_size, num_heads, seqlen, head_size
             )
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                causal=True,
-                test_name=f"mem_eff_attention_fp16_{use_perm}_causal",
-                dtype="float16",
+
+            inputs = {
+                "q": q.to(torch_dtype).contiguous(),
+                "k": k.to(torch_dtype).contiguous(),
+                "v": v.to(torch_dtype).contiguous(),
+            }
+            if variable_seq_length_kv:
+                inputs["lengths_kv"] = lengths_kv.to(torch.int).contiguous()
+            if variable_seq_length_q:
+                inputs["lengths_q"] = lengths_q.to(torch.int).contiguous()
+
+            y = torch.empty(
+                [batch_size, seqlen, num_heads, head_size],
+                dtype=torch_dtype,
+                device="cuda",
             )
+            module.run_with_tensors(inputs, [y])
+
+            ret = {}
+
+            if benchmark_ait or benchmark_pt:
+                print(
+                    f"batch_size = {batch_size}, nheads = {nheads}, seqlen = {seqlen}, n = {n}, causal = {causal}, dtype = {dtype}"
+                )
+                print(
+                    f"variable_seq_length_kv = {variable_seq_length_kv}, variable_seq_length_q = {variable_seq_length_q}"
+                )
+
+            if benchmark_ait:
+                # Warm up.
+                for _ in range(5):
+                    module.run_with_tensors(inputs, [y])
+                # Benchmark AIT
+                time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                    inputs,
+                    [y],
+                    count=100,
+                )
+                print(
+                    f"AIT benchmark eff-mem-attn time per iter: {time_per_iter_ms:.2f}ms"
+                )
+
+                ret["ait_time_per_iter_ms"] = time_per_iter_ms
+
+            # y ~ [batch_size, seqlen, num_heads, head_size]
+            # attention_mask_bool_q ~ [batch_size, seqlen, 1]
+            if variable_seq_length_q:
+                y = y.masked_fill_(~attention_mask_bool_q.unsqueeze(2), 0.0)
+                if not skip_pt:
+                    y_pt = y_pt.masked_fill_(~attention_mask_bool_q.unsqueeze(2), 0.0)
+
+            if not skip_pt:
+                torch.testing.assert_close(
+                    y, y_pt.to(torch_dtype), atol=atol, rtol=rtol, equal_nan=True
+                )
+
+            if benchmark_pt:
+                from aitemplate.testing.benchmark_pt import benchmark_torch_function
+
+                func = attention_ref
+                args = (
+                    qkv.to(torch_dtype).cuda(),
+                    attention_mask_bool_kv.cuda(),
+                    dropout_p,
+                    False,
+                    False,
+                )
+                duration = benchmark_torch_function(100, func, *args)
+                print(
+                    f"PT benchmark eff-mem-attn time per iter: {duration:.2f}ms, BS: {batch_size}, QPS: {batch_size / duration:.2f}"
+                )
+
+                ret["pt_time_per_iter_ms"] = duration
+
+        return ret
+
+    @parameterized.expand(
+        itertools.product(
+            filter_test_cases_by_params(
+                {
+                    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                }
+            ),
+            [False, True],  # variable_seq_length_kv
+            [False, True],  # variable_seq_length_q
+            [False, True],  # causal
+        ),
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_mem_eff_attention(
+        self,
+        dtype: str,
+        variable_seq_length_kv: bool,
+        variable_seq_length_q: bool,
+        causal: bool,
+    ):
+        if dtype == "bfloat16":
+            atol = 1e-2
+            rtol = 1e-2
+        else:
+            atol = 1e-3
+            rtol = 1e-3
+        for use_grouped_fmha in [True, False]:
             self._test_mem_eff_attention(
                 batch_size=16,
                 nheads=4,
                 seqlen=8,
                 n=80,
-                use_perm=use_perm,
-                test_name="mem_eff_attention_fp16_nheads_20",
-                dtype="float16",
+                variable_seq_length_kv=variable_seq_length_kv,
+                variable_seq_length_q=variable_seq_length_q,
+                causal=causal,
+                use_grouped_fmha=use_grouped_fmha,
+                test_name=f"mem_eff_attention_{dtype}_{causal}_{variable_seq_length_kv}_{variable_seq_length_q}_small",
+                dtype=dtype,
+                atol=atol,
+                rtol=rtol,
+            )
+            self._test_mem_eff_attention(
+                variable_seq_length_kv=variable_seq_length_kv,
+                variable_seq_length_q=variable_seq_length_q,
+                causal=causal,
+                test_name=f"mem_eff_attention_{dtype}_{causal}_{variable_seq_length_kv}_{variable_seq_length_q}",
+                dtype=dtype,
+                atol=atol,
+                rtol=rtol,
             )
-            # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
-            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
-            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
-            # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
-            # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
 
+        # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=8, n=64, use_perm=use_perm, test_name="mem_eff_attention1")
+        # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=512, use_perm=use_perm, test_name="mem_eff_attention2")
+        # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=8, n=1024, use_perm=use_perm, test_name="mem_eff_attention3")
+        # self._test_mem_eff_attention(batch_size=16, nheads=8, seqlen=16, n=1024, use_perm=use_perm, test_name="mem_eff_attention4")
+        # self._test_mem_eff_attention(batch_size=1, nheads=8, seqlen=16, n=64, use_perm=use_perm, test_name="mem_eff_attention5")
+
+    @unittest.skip("Skip benchmarking in CI.")
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_mem_eff_attention_benchmark(
+        self,
+    ):
+        causal = False
+        res = []
+        for num_heads in [16]:
+            for batch_size, nheads, seqlen, n, skip_pt in (
+                # Larger head dimension
+                (1, num_heads, 1024, 1024, False),
+                (16, num_heads, 1024, 1024, False),
+                (64, num_heads, 1024, 1024, False),
+                (128, num_heads, 1024, 1024, False),
+                (1, num_heads, 1024, 2048, False),
+                (16, num_heads, 1024, 2048, False),
+                (64, num_heads, 1024, 2048, False),
+                (128, num_heads, 1024, 2048, False),
+                # Larger batch size
+                (1024, num_heads, 128, 512, True),
+                (1024, num_heads, 256, 512, True),
+                (1024, num_heads, 512, 512, True),
+                # Larger seq len
+                (128, num_heads, 1024, 512, True),
+                (128, num_heads, 2048, 512, True),
+                (128, num_heads, 4096, 512, True),
+            ):
+                for use_grouped_fmha in [True, False]:
+                    for dtype in ("float16", "float32"):
+                        for variable_seq_length_kv in [False, True]:
+                            for variable_seq_length_q in [False, True]:
+                                print("---------------------------------------------")
+                                run_res = self._test_mem_eff_attention(
+                                    batch_size=batch_size,
+                                    nheads=nheads,
+                                    seqlen=seqlen,
+                                    n=n,
+                                    variable_seq_length_kv=variable_seq_length_kv,
+                                    variable_seq_length_q=variable_seq_length_q,
+                                    causal=causal,
+                                    use_grouped_fmha=use_grouped_fmha,
+                                    benchmark_ait=True,
+                                    benchmark_pt=not skip_pt,
+                                    skip_pt=skip_pt,
+                                    test_name=f"mem_eff_attention_{dtype}_{causal}_{variable_seq_length_kv}_{variable_seq_length_q}_small",
+                                    dtype=dtype,
+                                )
+
+                                run_res.update(
+                                    {
+                                        "dtype": dtype,
+                                        "batch_size": batch_size,
+                                        "nheads": nheads,
+                                        "seqlen": seqlen,
+                                        "n": n,
+                                        "variable_seq_length_kv": variable_seq_length_kv,
+                                        "variable_seq_length_q": variable_seq_length_q,
+                                        "causal": causal,
+                                        "use_grouped_fmha": use_grouped_fmha,
+                                    }
+                                )
+                                res.append(run_res)
+                                print("Intermediate result:")
+                                print(res)
+            print("Final result:")
+            print(res)
+
+    @parameterized.expand(
+        itertools.product(
+            filter_test_cases_by_params(
+                {
+                    # Don't run this test on V100: the binary crashes
+                    # with 'misaligned address' error.
+                    TestEnv.CUDA_SM80: [("float16")],
+                }
+            ),
+            [False, True],  # variable_seq_length_kv
+            [False, True],  # variable_seq_length_q
+        ),
+        skip_on_empty=True,
+    )
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     @unittest.expectedFailure
-    def test_mem_eff_attention_invalid_head_size_fp16_sm80(self):
+    def test_mem_eff_attention_invalid_head_size(
+        self, dtype, variable_seq_length_kv, variable_seq_length_q
+    ):
         self._test_mem_eff_attention(
             batch_size=16,
             nheads=8,
             seqlen=8,
             n=80,
-            test_name="mem_eff_attention_fp16_invalid_head_size",
-            dtype="float16",
+            variable_seq_length_kv=variable_seq_length_kv,
+            variable_seq_length_q=variable_seq_length_q,
+            test_name=f"mem_eff_attention_invalid_head_size_{dtype}_{variable_seq_length_kv}_{variable_seq_length_q}",
+            dtype=dtype,
         )
 
-    def test_mem_eff_attention_fp32_sm80(self):
-        for use_perm in [False, True]:
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                test_name=f"mem_eff_attention_fp32_{use_perm}",
-                dtype="float32",
-            )
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                causal=True,
-                test_name=f"mem_eff_attention_fp32_{use_perm}_causal",
-                dtype="float32",
-            )
-
-    def test_mem_eff_attention_bf16(self):
-        for use_perm in [False, True]:
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                test_name=f"mem_eff_attention_bf16_{use_perm}",
-                dtype="bfloat16",
-                atol=1e-2,
-                rtol=1e-2,
-            )
-            self._test_mem_eff_attention(
-                use_perm=use_perm,
-                causal=True,
-                test_name=f"mem_eff_attention_bf16_{use_perm}_causal",
-                dtype="bfloat16",
-                atol=1e-2,
-                rtol=1e-2,
-            )
-
     def _test_cross_attention(
         self,
         batch_size=16,
@@ -654,149 +869,133 @@ def _test_cross_attention(
     ):
         torch_dtype = string_to_torch_dtype(dtype)
 
-        q = torch.randn(
-            batch_size,
-            seqlen,
-            num_heads,
-            head_size,
-            device="cuda",
-            dtype=torch_dtype,
-        )
-        k = torch.randn(
-            batch_size,
-            seqlen_kv,
-            num_heads,
-            head_size,
-            device="cuda",
-            dtype=torch_dtype,
-        )
-        v = torch.randn(
-            batch_size,
-            seqlen_kv,
-            num_heads,
-            head_size_v,
-            device="cuda",
-            dtype=torch_dtype,
-        )
+        with torch.no_grad():
+            q = torch.randn(
+                batch_size,
+                seqlen,
+                num_heads,
+                head_size,
+                device="cuda",
+                dtype=torch_dtype,
+            )
+            k = torch.randn(
+                batch_size,
+                seqlen_kv,
+                num_heads,
+                head_size,
+                device="cuda",
+                dtype=torch_dtype,
+            )
+            v = torch.randn(
+                batch_size,
+                seqlen_kv,
+                num_heads,
+                head_size_v,
+                device="cuda",
+                dtype=torch_dtype,
+            )
 
-        output = ref_cross_attention(q, k, v)
-        y_pt = output.detach()
+            y_pt = ref_cross_attention(q, k, v)
 
-        Q = Tensor(
-            shape=[batch_size, num_heads, seqlen, head_size],
-            dtype=dtype,
-            name="q",
-            is_input=True,
-        )
-        K = Tensor(
-            shape=[batch_size, num_heads, seqlen_kv, head_size],
-            dtype=dtype,
-            name="k",
-            is_input=True,
-        )
-        V = Tensor(
-            shape=[batch_size, num_heads, seqlen_kv, head_size_v],
-            dtype=dtype,
-            name="v",
-            is_input=True,
-        )
+            Q = Tensor(
+                shape=[batch_size, num_heads, seqlen, head_size],
+                dtype=dtype,
+                name="q",
+                is_input=True,
+            )
+            K = Tensor(
+                shape=[batch_size, num_heads, seqlen_kv, head_size],
+                dtype=dtype,
+                name="k",
+                is_input=True,
+            )
+            V = Tensor(
+                shape=[batch_size, num_heads, seqlen_kv, head_size_v],
+                dtype=dtype,
+                name="v",
+                is_input=True,
+            )
 
-        mem_eff_attention_op = ops.mem_eff_attention(
-            causal=causal,
-        )
-        if copy_op:
             mem_eff_attention_op = ops.mem_eff_attention(
-                **mem_eff_attention_op._get_op_attributes()
+                causal=causal,
             )
-        Y = mem_eff_attention_op(Q, K, V)
-        Y._attrs["is_output"] = True
-        Y._attrs["name"] = "output"
-
-        if rebuild:
-            target = detect_target()
-            module = compile_model(Y, target, "./tmp", test_name)
-        else:
-            module = Model(os.path.join("./tmp", test_name, "test.so"))
-
-        q = torch.permute(q, (0, 2, 1, 3))
-        k = torch.permute(k, (0, 2, 1, 3))
-        v = torch.permute(v, (0, 2, 1, 3))
-
-        inputs = {
-            "q": q.detach().to(torch_dtype).cuda().contiguous(),
-            "k": k.detach().to(torch_dtype).cuda().contiguous(),
-            "v": v.detach().to(torch_dtype).cuda().contiguous(),
-        }
-        y = torch.empty(
-            [batch_size, seqlen, num_heads, head_size_v],
-            dtype=torch_dtype,
-            device="cuda",
-        )
-        module.run_with_tensors(inputs, [y])
-
-        if benchmark_ait:
-            # Warm up.
-            for _ in range(5):
-                module.run_with_tensors(inputs, [y])
-            # Benchmark AIT
-            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
-                inputs,
-                [y],
-                count=100,
+            if copy_op:
+                mem_eff_attention_op = ops.mem_eff_attention(
+                    **mem_eff_attention_op._get_op_attributes()
+                )
+            Y = mem_eff_attention_op(Q, K, V)
+            Y._attrs["is_output"] = True
+            Y._attrs["name"] = "output"
+
+            if rebuild:
+                target = detect_target()
+                module = compile_model(Y, target, "./tmp", test_name)
+            else:
+                module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+            q = torch.permute(q, (0, 2, 1, 3))
+            k = torch.permute(k, (0, 2, 1, 3))
+            v = torch.permute(v, (0, 2, 1, 3))
+
+            inputs = {
+                "q": q.to(torch_dtype).contiguous(),
+                "k": k.to(torch_dtype).contiguous(),
+                "v": v.to(torch_dtype).contiguous(),
+            }
+            y = torch.empty(
+                [batch_size, seqlen, num_heads, head_size_v],
+                dtype=torch_dtype,
+                device="cuda",
             )
-            _LOGGER.info(f"benchmark cross-attn time: {time_per_iter_ms}")
-
-        torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
-
-    def test_cross_attention_fp16_sm80(self):
-        self._test_cross_attention(
-            test_name="cross_attention_fp16",
-            dtype="float16",
-        )
-        self._test_cross_attention(
-            seqlen=1024,
-            seqlen_kv=768,
-            head_size=64,
-            head_size_v=64,
-            test_name="cross_attention2_fp16",
-            dtype="float16",
-        )
+            module.run_with_tensors(inputs, [y])
 
-    def test_cross_attention_fp32_sm80(self):
-        self._test_cross_attention(
-            test_name="cross_attention_fp32",
-            dtype="float32",
-        )
-        self._test_cross_attention(
-            seqlen=1024,
-            seqlen_kv=768,
-            head_size=64,
-            head_size_v=64,
-            test_name="cross_attention2_fp32",
-            dtype="float32",
-        )
+            if benchmark_ait:
+                # Warm up.
+                for _ in range(5):
+                    module.run_with_tensors(inputs, [y])
+                # Benchmark AIT
+                time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                    inputs,
+                    [y],
+                    count=100,
+                )
+                _LOGGER.info(f"benchmark cross-attn time: {time_per_iter_ms}")
+
+            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_SM80: [("float16"), ("float32"), ("bfloat16")],
+            }
+        ),
+        skip_on_empty=True,
+    )
+    def test_cross_attention(self, dtype):
+        if dtype == "bfloat16":
+            atol = 1e-2
+            rtol = 1e-2
+        else:
+            atol = 1e-3
+            rtol = 1e-3
 
-    def test_cross_attention_bf16(self):
         self._test_cross_attention(
-            test_name="cross_attention_bf16",
-            dtype="bfloat16",
-            atol=1e-2,
-            rtol=1e-2,
+            test_name=f"cross_attention_{dtype}",
+            dtype=dtype,
+            atol=atol,
+            rtol=rtol,
         )
         self._test_cross_attention(
             seqlen=1024,
             seqlen_kv=768,
             head_size=64,
             head_size_v=64,
-            test_name="cross_attention2_bf16",
-            dtype="bfloat16",
-            atol=1e-2,
-            rtol=1e-2,
+            test_name=f"cross_attention2_{dtype}",
+            dtype=dtype,
+            atol=atol,
+            rtol=rtol,
         )
 
 
-filter_test_cases_by_test_env(AttentionTestCase)
-
-
 if __name__ == "__main__":
     unittest.main()

From a6479852dee900addcde47301fd6778fc33216fc Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Mon, 3 Apr 2023 17:45:38 +0800
Subject: [PATCH 374/638] enable mask-rcnn

---
 .../common/tensor/permute102_common.py        |  2 +
 .../multi_level_roi_align_common.py           | 50 +++++++++----------
 .../roi_ops/multi_level_roi_align.py          |  2 +
 .../backend/rocm/normalization/softmax.py     |  4 +-
 python/aitemplate/backend/rocm/target_def.py  |  4 +-
 .../roi_ops/multi_level_roi_align.py          |  3 +-
 6 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py
index cd705c9ce..880afae82 100644
--- a/python/aitemplate/backend/common/tensor/permute102_common.py
+++ b/python/aitemplate/backend/common/tensor/permute102_common.py
@@ -193,7 +193,9 @@
 #define DIRECT_BLOCK_Z 2
 
 namespace {
+#ifndef __HIP_PLATFORM_HCC__
 using bfloat16 = __nv_bfloat16;
+#endif
 
 template<typename T>
 __global__ void permute102_tiled_kernel(T* output,
diff --git a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
index 5756aa59c..5211cda90 100644
--- a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
@@ -24,12 +24,12 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}FPNRoiAlign<float, {{num_rois}}, {{pooled_size}}>(
-{{indent}}    in_ptr_p2,
-{{indent}}    in_ptr_p3,
-{{indent}}    in_ptr_p4,
-{{indent}}    in_ptr_p5,
-{{indent}}    rois_ptr,
-{{indent}}    out_ptr,
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr_p2),
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr_p3),
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr_p4),
+{{indent}}    static_cast<{{elem_input_type}}*>(in_ptr_p5),
+{{indent}}    static_cast<{{elem_input_type}}*>(rois_ptr),
+{{indent}}    static_cast<{{elem_output_type}}*>(out_ptr),
 {{indent}}    batchSize,
 {{indent}}    featureCount,
 {{indent}}    imageSize,
@@ -143,7 +143,7 @@
   const Trois* roi = rois + 5 * (batch * roiCount + roiIdx);
   float hw;
 
-{% if elem_input_type == "half" %}
+{% if elem_input_type in ["half", "ck::half_t"] %}
   float x1 = __half2float(roi[1]);
   float y1 = __half2float(roi[2]);
   float x2 = __half2float(roi[3]);
@@ -290,12 +290,12 @@
 } // namespace
 
 void {{function_name}} (
-    {{elem_input_type}}* in_ptr_p2,
-    {{elem_input_type}}* in_ptr_p3,
-    {{elem_input_type}}* in_ptr_p4,
-    {{elem_input_type}}* in_ptr_p5,
-    {{elem_input_type}}* rois_ptr,
-    {{elem_output_type}}* out_ptr,
+    void* in_ptr_p2,
+    void* in_ptr_p3,
+    void* in_ptr_p4,
+    void* in_ptr_p5,
+    void* rois_ptr,
+    void* out_ptr,
     {{index_type}}* batch, {{index_type}}* in_ch,
     {{index_type}}* p2_h, {{index_type}}* p2_w,
     {{index_type}}* p3_h, {{index_type}}* p3_w,
@@ -329,12 +329,12 @@
 FUNC_DECL_TEMPLATE = jinja2.Template(
     """
 void {{func_name}}(
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_input_type}}*,
-  {{elem_output_type}}*,
+  void*,
+  void*,
+  void*,
+  void*,
+  void*,
+  void*,
   {{index_type}}*,
   {{index_type}}*,
   {{index_type}}*,
@@ -360,12 +360,12 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr_p2}}),
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr_p3}}),
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr_p4}}),
-{{indent}}    static_cast<{{elem_input_type}}*>({{in_ptr_p5}}),
-{{indent}}    static_cast<{{elem_input_type}}*>({{rois_ptr}}),
-{{indent}}    static_cast<{{elem_output_type}}*>({{out_ptr}}),
+{{indent}}    {{in_ptr_p2}},
+{{indent}}    {{in_ptr_p3}},
+{{indent}}    {{in_ptr_p4}},
+{{indent}}    {{in_ptr_p5}},
+{{indent}}    {{rois_ptr}},
+{{indent}}    {{out_ptr}},
 {{indent}}    {{p_batch}},
 {{indent}}    {{p_in_ch}},
 {{indent}}    {{p2_h}}, {{p2_w}},
diff --git a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
index 564e01086..a2aa16bb7 100644
--- a/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py
@@ -58,6 +58,8 @@ def gen_function(
             spatial_scale=func_attrs["spatial_scale"],
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
+            elem_input_type=input_type,
+            elem_output_type=output_type,
         )
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
diff --git a/python/aitemplate/backend/rocm/normalization/softmax.py b/python/aitemplate/backend/rocm/normalization/softmax.py
index 65fb63491..bc10c5e09 100644
--- a/python/aitemplate/backend/rocm/normalization/softmax.py
+++ b/python/aitemplate/backend/rocm/normalization/softmax.py
@@ -62,8 +62,8 @@
     auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths,
                                                             i_inStrides,
                                                             reduceDims,
-                                                            &alpha,
-                                                            &beta,
+                                                            alpha,
+                                                            beta,
                                                             static_cast<ck::half_t *>(input),
                                                             static_cast<ck::half_t *>(output),
                                                             ck::tensor_operation::element_wise::PassThrough{},
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index b8e138687..a0c1ec289 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -124,10 +124,10 @@ def _build_compile_options(self):
         ]
         if self._arch in {"GFX908", "gfx908"}:
             options.append("-DCK_AMD_GPU_GFX908")
-            options.append("--amdgpu-target=gfx908")
+            options.append("--offload-arch=gfx908")
         elif self._arch in {"GFX90a", "gfx90a"}:
             options.append("-DCK_AMD_GPU_GFX90A")
-            options.append("--amdgpu-target=gfx90a")
+            options.append("--offload-arch=gfx90a")
         else:
             raise RuntimeError("Unsupported GPU Arch")
         for path in ck_paths:
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
index 284fc2336..179e40c74 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
@@ -48,7 +48,6 @@ def gen_function(
     backend_spec = ROCMSpec()
     input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
     output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
-
     exec_paths = ""
     for key, _ in exec_path.items():
         program = multi_level_roi_align_common.EXEC_TEMPLATE.render(
@@ -59,6 +58,8 @@ def gen_function(
             spatial_scale=func_attrs["spatial_scale"],
             position_sensitive=func_attrs["position_sensitive"],
             continuous_coordinate=func_attrs["continuous_coordinate"],
+            elem_input_type=input_type,
+            elem_output_type=output_type,
         )
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst

From 9bb4d0e50b3c3be3e554037c8ebec0242d18105e Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Mon, 3 Apr 2023 12:36:12 -0700
Subject: [PATCH 375/638] Make utils init empty to solve circular import issue
 (#521)

Summary:
If I try to import `aitemplate.utils.misc` in `aitemplate/backend/target.py` then I get the following error:
```
ImportError: cannot import name 'Target' from partially initialized module 'aitemplate.backend.target'
(most likely due to a circular import) (/home/ubuntu/workspace/AITemplate/python/aitemplate/backend/target.py)
```

Internally some `aitemplate.utils` submodules import `aitemplate.compiler` which causes circular import issue above.
To solve the issue we can make `utils.__init__.py` empty to stop importing all `utils` sumbodiles upfront.
It will allow us to use `utils.misc` module in any other modules of the project.

Looks like importing `utils` submodules in the `__init__.py` is not actually required. It will be no negative consequences of making `__init__.py` empty.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/521

Reviewed By: ipiszy

Differential Revision: D44605435

Pulled By: chenyang78

fbshipit-source-id: 5c5af10ad818e82c5286dc8ff08f5cf7384e2dfb
---
 python/aitemplate/utils/__init__.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/python/aitemplate/utils/__init__.py b/python/aitemplate/utils/__init__.py
index 001e62070..fd1123eef 100644
--- a/python/aitemplate/utils/__init__.py
+++ b/python/aitemplate/utils/__init__.py
@@ -13,17 +13,4 @@
 #  limitations under the License.
 #
 
-# flake8: noqa
-
-from aitemplate.utils import (
-    alignment,
-    environ,
-    graph_utils,
-    import_path,
-    markdown_table,
-    misc,
-    shape_utils,
-    tensor_utils,
-    torch_utils,
-    visualization,
-)
+# Let's keep this file empty to resolve circular import issues

From 48c960ee666ebcb62fa53add46de501c32dc0801 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Mon, 3 Apr 2023 12:50:44 -0700
Subject: [PATCH 376/638] migrate to StorageImpl::mutable_unsafe_data() (#522)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/522

See D44409928 for motivation.

Reviewed By: chenyang78

Differential Revision: D44554227

fbshipit-source-id: c80355a333e640ff8ac1e57de474be125094ea1e
---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 2bd041d7b..5c8feed11 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -311,8 +311,7 @@ void AITModelImpl::allocateOutputs(
         allocator->allocate(size_bytes),
         allocator,
         /*resizable=*/true);
-    ait_outputs.emplace_back(
-        storage_impl->unsafe_data<void>(), shape, ait_dtype);
+    ait_outputs.emplace_back(storage_impl->mutable_data(), shape, ait_dtype);
     output_index_to_output_storage_impl[output_index] = std::move(storage_impl);
   }
 }

From 254e8c09e92b8e571a93e223ff187689edf86a1d Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Mon, 3 Apr 2023 17:28:57 -0700
Subject: [PATCH 377/638] Skip acc normalization of repeat_interleave if input
 dims aren't integral (#528)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/528

There's cases where we have torch.repeat_interleave(x, repeats=y), where the y is a tensor. In these cases, don't do the mapping since we can't run tile on those dims.

To do this, skip creating a new replacement node in the mapper, and add an option to skip the node mapping in acc_normalizer called "skip_normalization_if_none". If true, then following normalization the returns none, it'll set the args/kwargs back so that the graph recompilation can succeed.

Reviewed By: frank-wei

Differential Revision: D44568748

fbshipit-source-id: 3e5c2aac8a7b9e50efe04fcae361a3c0ee1777a7
---
 fx2ait/fx2ait/acc_tracer/acc_normalizer.py | 11 +++++++++++
 fx2ait/fx2ait/acc_tracer/acc_ops.py        | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_normalizer.py b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
index f1a96d0c4..9295dffc6 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_normalizer.py
@@ -83,6 +83,7 @@ class NormalizationInfo(NamedTuple):
         List[Union[Tuple[str, str, bool], Tuple[str, str]]]
     ]
     needs_shapes_for_normalization: bool
+    skip_normalization_if_none: bool
 
 
 # Dict from (op, target) to NormalizationInfo for that op.
@@ -102,6 +103,7 @@ def _insert_fun(
     ] = None,
     needs_shapes_for_normalization=False,
     allow_normalize_from_torch_package=False,
+    skip_normalization_if_none=False,
 ):
     if op_and_target[0] == "call_function":
         assert callable(op_and_target[1])
@@ -143,6 +145,7 @@ def _insert_fun(
         custom_mapping_fn=custom_mapping_fn,
         kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty,
         needs_shapes_for_normalization=needs_shapes_for_normalization,
+        skip_normalization_if_none=skip_normalization_if_none,
     )
     _normalization_dict[op_and_target] = norm_info
 
@@ -231,6 +234,7 @@ def register_custom_acc_mapper_fn(
     ],
     needs_shapes_for_normalization=False,
     allow_normalize_from_torch_package=False,
+    skip_normalization_if_none=False,
 ):
     def insert(custom_mapping_fn: Callable):
         _insert_fun(
@@ -239,6 +243,7 @@ def insert(custom_mapping_fn: Callable):
             arg_replacement_tuples=arg_replacement_tuples,  # type: ignore[arg-type]
             needs_shapes_for_normalization=needs_shapes_for_normalization,
             allow_normalize_from_torch_package=allow_normalize_from_torch_package,
+            skip_normalization_if_none=skip_normalization_if_none,
         )
         return custom_mapping_fn
 
@@ -377,12 +382,18 @@ def normalize_to_acc_op(
         if normalization_info.custom_mapping_fn is not None:
             # For custom mapping, the normalized_kwargs are used for the original op,
             # i.e. *before* custom acc_ops normalization. Do that now.
+            if normalization_info.skip_normalization_if_none:
+                original_args = node.args
+                original_kwargs = node.kwargs
             node.args = normalized_args
             node.kwargs = normalized_kwargs
             new_node = normalization_info.custom_mapping_fn(node, mod)
             # If a new node is returned then use it to replace the old node. Otherwise
             # the custom mapping function did its own replacement, so return early.
             if new_node is None:
+                if normalization_info.skip_normalization_if_none:
+                    node.args = original_args
+                    node.kwargs = original_kwargs
                 return
         else:
             # If there's kwargs_to_move_to_acc_out_ty then use it to setup acc_out_ty in
diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 26a4cdd10..8e185d2bc 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 #
 # encoding: utf-8
+import logging
 import operator
 
 import torch  # isort:skip
@@ -29,6 +30,8 @@
 )
 from .acc_op_properties import AccOpProperty, register_acc_op_properties
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 this_arg_is_optional = True
 move_to_qparams = True
 dont_move_to_qparams = False
@@ -485,6 +488,7 @@ def repeat_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
         ("dim", "dim", this_arg_is_optional),
         ("output_size", "output_size", this_arg_is_optional),
     ],
+    skip_normalization_if_none=True,
 )
 @register_custom_acc_mapper_fn(
     op_and_target=("call_function", torch.repeat_interleave),
@@ -494,11 +498,17 @@ def repeat_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
         ("dim", "dim", this_arg_is_optional),
         ("output_size", "output_size", this_arg_is_optional),
     ],
+    skip_normalization_if_none=True,
 )
 def repeat_interleave_mapper(node: torch.fx.Node, _: nn.Module):
     input_node = node.kwargs["input"]
     repeats = cast(int, node.kwargs["repeats"])
     dim = node.kwargs["dim"]
+    if not (type(repeats) is int):
+        logger.info(
+            "Not mapping repeat_interleave to an acc op. We currently only support `repeat_interleave` with int repeats"
+        )
+        return
     assert (
         type(repeats) is int
     ), "We currently only support `repeat_interleave` with int repeats"

From 4c8a28c49cb776e4ef62f66ae642dd8810186c1f Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Mon, 3 Apr 2023 17:28:57 -0700
Subject: [PATCH 378/638] Skip acc normalization of repeat if dims aren't ints
 (#532)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/532

Skip acc_normalization if repeat dims aren't all ints. Otws, it gets mapped to tile on w/ some variable dim inputs, and the dim is treated as a proxy node rather than an int and the acc tracing fails w/ a type error.

Differential Revision: D44568745

fbshipit-source-id: f412f35baeee9a1b17f67b7749ca1f9b8cbbe77b
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 8e185d2bc..963b7b975 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -17,7 +17,7 @@
 import operator
 
 import torch  # isort:skip
-from typing import cast, Iterable, List, Sequence
+from typing import cast, Iterable, List, Optional, Sequence
 
 import torch.nn as nn
 from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata
@@ -462,14 +462,27 @@ def tile(*, input, dims):
         ("input", "input"),
         ("*", "sizes"),
     ],
+    skip_normalization_if_none=True,
 )
-def repeat_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+def repeat_mapper(node: torch.fx.Node, _: nn.Module) -> Optional[torch.fx.Node]:
     """
     Map repeat to tile.
     """
     with node.graph.inserting_before(node):
         inputs = node.kwargs["input"]
         dims = node.kwargs["sizes"]
+        # Skip repeat mapping when the list of dims is not all ints (ie. contains
+        # some calculated value). torch.tile cannot support cases where dims
+        # are Proxy nodes
+        if (
+            isinstance(dims, (list, tuple))
+            and len(dims) > 0
+            and not all(isinstance(x, int) for x in dims)
+        ):
+            logger.info(
+                "Not mapping repeat to an acc op. We can't handle variable dims."
+            )
+            return
         new_node = node.graph.create_node(
             "call_function",
             tile,

From f0f458d4580fdd77938ca1048807760f2e95cb7a Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 3 Apr 2023 18:21:12 -0700
Subject: [PATCH 379/638] Support simple variable_seq_length in
 FMHA_style_b2b_bmm (#486)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/486

ATT, the block size / thread  size is still based on max_seq_length, but skip elements which are not within actual_seq_length.
It's not as optimal as the solution with group scheduling algorithm (especially when seq_lengths differ a lot for different examples in a batch), but it could help avoid jagged / padded conversion and should work fine when seq_lengths do not differ a lot.

Reviewed By: aakhundov

Differential Revision: D44364087

fbshipit-source-id: 2ca7f96e2a02f3edb660b61af7cc258aba99139b
---
 .../backend/cuda/b2b_bmm/__init__.py          |   6 +-
 .../cuda/b2b_bmm/fmha_style_b2b_bmm.py        |  43 ++-
 .../b2b_bmm/grouped_fmha_style_b2b_bmm.py     | 180 ++++++++++
 .../compiler/ops/b2b_bmm/__init__.py          |   3 +
 .../ops/b2b_bmm/fmha_style_b2b_bmm.py         |   2 +-
 .../ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py | 181 ++++++++++
 python/aitemplate/testing/test_utils.py       |  27 ++
 .../fmha_style_b2b_bmm/kernel_forward.h       |  82 +++--
 tests/unittest/ops/test_b2b_bmm.py            |  35 +-
 tests/unittest/ops/test_grouped_b2b_bmm.py    | 337 ++++++++++++++++++
 10 files changed, 820 insertions(+), 76 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
 create mode 100644 tests/unittest/ops/test_grouped_b2b_bmm.py

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
index 369380b45..c7ac172ee 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
@@ -18,4 +18,8 @@
 b2b bmm module init
 """
 
-from aitemplate.backend.cuda.b2b_bmm import classic_b2b_bmm, fmha_style_b2b_bmm
+from aitemplate.backend.cuda.b2b_bmm import (
+    classic_b2b_bmm,
+    fmha_style_b2b_bmm,
+    grouped_fmha_style_b2b_bmm,
+)
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
index 2ed23b652..c3764e564 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
@@ -37,8 +37,8 @@
 namespace {
 // Hardcode these sizes for now until we get profiling ready.
 constexpr int kQueriesPerBlock = 64;
-constexpr int kKeysPerBlock = ({{n1}} <= 64 ? 64 : 128);
-constexpr bool kSingleValueIteration = ({{n1}} <= kKeysPerBlock);
+constexpr int kKeysPerBlock = ({{head_dim_value}} <= 64 ? 64 : 128);
+constexpr bool kSingleValueIteration = ({{head_dim_value}} <= kKeysPerBlock);
 }  // end namespace
 
 {{func_signature}} {
@@ -54,16 +54,17 @@
     kQueriesPerBlock,
     kKeysPerBlock,
     kSingleValueIteration,
-    {{activation_functor}}
+    {{activation_functor}},
+    {{offset_t}}
   >;
 
   ElementAccumulator alpha0 = ElementAccumulator({{alpha0}});
   ElementAccumulator alpha1 = ElementAccumulator({{alpha1}});
 
-  int64_t seq_length = m0;
-  int64_t seq_length_kv = {{n0}};
-  int64_t head_dim = k0;
-  int64_t head_dim_value = {{n1}};
+  int64_t seq_length = {{seq_length}};
+  int64_t seq_length_kv = {{seq_length_kv}};
+  int64_t head_dim = {{head_dim}};
+  int64_t head_dim_value = {{head_dim_value}};
 
   typename Attention::Params p;
   { // set parameters
@@ -81,6 +82,7 @@
 
     p.scale = alpha0;
     p.activation_scale = alpha1;
+    p.activation_scale_divide_by_seq_len = {{alpha1_divide_by_seq_len}};
 
     p.num_heads = {{num_heads}};
     p.num_batches = batch_size;
@@ -105,7 +107,7 @@
     p.k_strideB = p.k_strideM * seq_length_kv;
     p.v_strideB = p.v_strideM * seq_length_kv;
 
-    int32_t bias_stride = {{n0}};
+    int32_t bias_stride = {{seq_length_kv}};
     {% if bias_broadcast[2] %}
     p.bias_strideM = 0;
     {% else %}
@@ -125,6 +127,8 @@
     {% else %}
     p.bias_strideB = bias_stride;
     {% endif %}
+
+    p.offset_ptr = static_cast<const {{offset_t}}*>({{offset_ptr}});
   }
 
   // launch kernel :)
@@ -143,11 +147,11 @@
   if (!Attention::check_supported(p)) {
     throw std::runtime_error(
       std::string("Kernel does not support these inputs. ") +
-      "Function: {{function_name}}. " +
-      "m0: " + std::to_string(m0) +
-      ", k0: " + std::to_string(k0) +
-      ", n0: " + std::to_string({{n0}}) +
-      ", n1: " + std::to_string({{n1}}) + "."
+      "Function: {{func_name}}. " +
+      "m0: " + std::to_string({{seq_length}}) +
+      ", k0: " + std::to_string({{head_dim}}) +
+      ", n0: " + std::to_string({{seq_length_kv}}) +
+      ", n1: " + std::to_string({{head_dim_value}}) + "."
     );
   }
   kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
@@ -193,7 +197,7 @@
 )
 
 
-def _causal_type_to_kernel_str(causal_type: CausalType) -> str:
+def causal_type_to_kernel_str(causal_type: CausalType) -> str:
     if causal_type == CausalType.NO_CAUSAL:
         return "CausalType::NO_CAUSAL"
     elif causal_type == CausalType.UPPER_RIGHT_EMPTY:
@@ -245,14 +249,19 @@ def fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
         elem_accum_type=elem_accum_type,
-        n0=str(n0.value()),
-        n1=str(n1.value()),
-        causal_type=_causal_type_to_kernel_str(func_attrs["causal_type"]),
+        offset_t="int64_t",
+        seq_length="m0",
+        seq_length_kv=str(n0.value()),
+        head_dim="k0",
+        head_dim_value=str(n1.value()),
+        causal_type=causal_type_to_kernel_str(func_attrs["causal_type"]),
         num_heads=str(func_attrs["num_heads"]),
         alpha0=str(func_attrs["alpha0"]),
         alpha1=str(func_attrs["alpha1"]),
+        alpha1_divide_by_seq_len="false",
         activation_functor=activation_functor,
         bias_broadcast=bias_broadcast,
+        offset_ptr="nullptr",
     )
 
 
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
new file mode 100644
index 000000000..9ec0c70e5
--- /dev/null
+++ b/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
@@ -0,0 +1,180 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+grouped_fmha_style_b2b_bmm kernel codegen for CUDA.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.b2b_bmm import fmha_style_b2b_bmm
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm
+
+from ... import registry
+
+# pylint: disable=C0301
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(
+  void* output,
+  void* query,
+  void* key,
+  void* value,
+  void* bias,
+  void* accum_ptr,
+  int64_t batch_size,
+  int64_t max_seq_length,
+  const void* offset,
+  cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+{{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
+{{indent}}    {{accum_ptr}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{max_seq_length}},
+{{indent}}    {{offset}},
+{{indent}}    stream
+{{indent}});
+    """
+)
+
+
+@registry.reg("cuda.grouped_fmha_style_b2b_bmm.gen_function")
+def grouped_fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    q, k, v = func_attrs["inputs"][0:3]
+
+    bias_broadcast = [False] * 4
+    if len(func_attrs["inputs"]) > 3:
+        bias = func_attrs["inputs"][3]
+        bias_broadcast = [var == IntImm(1) for var in bias.shape()]
+
+    jagged_dim = q._attrs["shape"][0]
+    head_dim = q._attrs["shape"][2]
+    head_dim_value = v._attrs["shape"][2]
+    if not isinstance(head_dim, IntImm) or not isinstance(head_dim_value, IntImm):
+        raise RuntimeError(
+            f"head_dim and head_dim_value must be static dims. {func_attrs['name']=}, {head_dim=}, {head_dim_value=}"
+        )
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    elem_accum_type = elem_input_type
+    if (
+        elem_input_type == "cutlass::half_t"
+        and "use_fp16_acc" in Target.current()._kwargs
+        and not Target.current()._kwargs["use_fp16_acc"]
+    ):
+        elem_accum_type = "float"
+
+    import cutlass_lib
+
+    activation_functor = cutlass_lib.library.EpilogueMathTag[
+        cutlass_lib.library.EpilogueMathName[func_attrs["epilogue_math_name"]]
+    ]
+    return fmha_style_b2b_bmm.FUNC_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        elem_accum_type=elem_accum_type,
+        offset_t=jagged_dim.offsets_type(),
+        seq_length="max_seq_length",
+        seq_length_kv="max_seq_length",
+        head_dim=str(head_dim.value()),
+        head_dim_value=str(head_dim_value.value()),
+        causal_type=fmha_style_b2b_bmm.causal_type_to_kernel_str(
+            func_attrs["causal_type"]
+        ),
+        num_heads=str(func_attrs["num_heads"]),
+        alpha0=str(func_attrs["alpha0"]),
+        alpha1=str(func_attrs["alpha1"]),
+        alpha1_divide_by_seq_len="true"
+        if func_attrs["alpha1_divide_by_seq_len"]
+        else "false",
+        activation_functor=activation_functor,
+        bias_broadcast=bias_broadcast,
+        offset_ptr="offset",
+    )
+
+
+@registry.reg("cuda.grouped_fmha_style_b2b_bmm.func_decl")
+def grouped_fmha_style_b2b_bmm_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("cuda.grouped_fmha_style_b2b_bmm.func_call")
+def grouped_fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) in (3, 4)
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+
+    bias_name = "nullptr"
+    if len(func_attrs["inputs"]) == 4:
+        bias_name = func_attrs["inputs"][3]._attrs["name"]
+
+    jagged_intvar = func_attrs["inputs"][0]._attrs["shape"][0]
+    batch_size = jagged_intvar.batch_dim()._attrs["name"]
+    if len(jagged_intvar.jagged_dims()) != 1:
+        raise RuntimeError(
+            "Only support 1 jagged dim in grouped_fmha_style_b2b_bmm for now! "
+            f"Current jagged intvar: {jagged_intvar}"
+        )
+    max_seq_length_dim = jagged_intvar.jagged_dims()[0].max_value()
+    max_seq_length = (
+        str(max_seq_length_dim.value())
+        if isinstance(max_seq_length_dim, IntImm)
+        else max_seq_length_dim._attrs["name"]
+    )
+    offset = f"{jagged_intvar.offsets_var_name()}.data[0]"
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        query=q_name,
+        key=k_name,
+        value=v_name,
+        bias=bias_name,
+        accum_ptr="global_workspace_",
+        batch_size=batch_size,
+        max_seq_length=max_seq_length,
+        offset=offset,
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
index b93f2c205..e0bad2b1d 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
@@ -19,3 +19,6 @@
 
 from aitemplate.compiler.ops.b2b_bmm.classic_b2b_bmm import classic_b2b_bmm
 from aitemplate.compiler.ops.b2b_bmm.fmha_style_b2b_bmm import fmha_style_b2b_bmm
+from aitemplate.compiler.ops.b2b_bmm.grouped_fmha_style_b2b_bmm import (
+    grouped_fmha_style_b2b_bmm,
+)
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
index aaecec9d6..91eede72b 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
@@ -60,7 +60,7 @@ def __init__(
         self._attrs["workspace"] = 0
 
     def _infer_shapes(self):
-        """infer the output shape for classic_b2b_bmm."""
+        """infer the output shape for fmha_style_b2b_bmm."""
         q, k, v = self._attrs["inputs"][0:3]
         q_shape = q._attrs["shape"]
         k_shape = k._attrs["shape"]
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
new file mode 100644
index 000000000..e649733d4
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
@@ -0,0 +1,181 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Grouped back-to-back batched gemm fused kernel, implemented in FMHA style.
+Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) [+ bias]))), V),
+
+where:
+Q: [B_M0, H, K0] (row_major),
+K: [B_N0, H, K0] (column_major),
+V: [B_N0, H, N1] (row_major),
+bias: [B, H, M0, N0] (row_major). Bias can be omitted.
+B_M0, B_N0 are jagged dims.
+Layouts are fixed for now.
+
+causal_masks have 3 types:
+NO_CAUSAL: no causal masks
+UPPER_RIGHT_EMPTY: the upper right triangular part of the matrix is 0
+LOWER_LEFT_EMPTY: the bottom left triangular part of the matrix is 0
+When causal_masks is enabled, M0 must be equal to N0.
+
+Internally this implementation stores the results of Q@K in shared memory.
+It supports larger N0 / N1 compared to the classic_b2b_bmm implementation.
+"""
+
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.b2b_bmm.fmha_style_b2b_bmm import (
+    CausalType,
+    fmha_style_b2b_bmm,
+)
+from aitemplate.utils import shape_utils
+
+
+class grouped_fmha_style_b2b_bmm(fmha_style_b2b_bmm):
+    def __init__(
+        self,
+        causal_type: CausalType,
+        epilogue_math_name: str,
+        alpha0: float,
+        alpha1: float,
+        alpha1_divide_by_seq_len: bool,
+        num_heads: int,
+    ) -> None:
+        """Initialize grouped_fmha_style_b2b_bmm op."""
+        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1, num_heads)
+        self._attrs["op"] = "grouped_fmha_style_b2b_bmm"
+        self._attrs["alpha1_divide_by_seq_len"] = alpha1_divide_by_seq_len
+
+    def _infer_shapes(self):
+        """infer the output shape for grouped_fmha_style_b2b_bmm."""
+        q, k, v = self._attrs["inputs"][0:3]
+        if not (q.is_jagged() and k.is_jagged() and v.is_jagged()):
+            raise RuntimeError(f"{q=}, {k=}, {v=} must be jagged!")
+        q_shape = q._attrs["shape"]
+        k_shape = k._attrs["shape"]
+        v_shape = v._attrs["shape"]
+        if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
+            raise RuntimeError(
+                f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if len(q_shape) != 3:
+            raise RuntimeError(
+                f"QKV must have rank == 3! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
+            raise RuntimeError(
+                f"QKV must have same jagged_dim (batch_size and seq_length)! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if len(q_shape[0].jagged_dims()) != 1:
+            raise RuntimeError(f"{len(q_shape[0].jagged_dims())=} must be 1!")
+
+        if q_shape[1] != k_shape[1] or q_shape[1] != v_shape[1]:
+            raise RuntimeError(
+                f"QKV must have same head size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if q_shape[1] != IntImm(self._attrs["num_heads"]):
+            raise RuntimeError(
+                f"num_heads are not equal! {self._attrs['num_heads']=}, {q_shape[1]=}"
+            )
+        K0 = q_shape[2]
+        if K0 != k_shape[2]:
+            raise RuntimeError(
+                f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        head_size = IntImm(self._attrs["num_heads"])
+
+        output_shape = [q_shape[0], head_size, v_shape[2]]
+
+        if len(self._attrs["inputs"]) == 4:
+            batch_size = q_shape[0].batch_dim()
+            max_seq_length = q_shape[0].jagged_dims()[0].max_value()
+            bias = self._attrs["inputs"][3]
+            bias_shape = bias._attrs["shape"]
+            bias_expected_shape = [
+                batch_size,
+                head_size,
+                max_seq_length,
+                max_seq_length,
+            ]
+            bias_max_shape = shape_utils.get_broadcast_max_shape(
+                bias_shape, bias_expected_shape
+            )
+            if len(bias_shape) != 4:
+                raise RuntimeError(
+                    f"Expected bias rank 4. Current bias rank: {len(bias)}."
+                )
+            if not bias_max_shape[0]:
+                raise RuntimeError(
+                    f"bias shape is not compatible with Q K! "
+                    f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
+                    f"bias shapes: {bias_shape=}, {bias_expected_shape=}."
+                )
+            if bias_shape[-1] != max_seq_length:
+                raise RuntimeError(
+                    f"Bias last dim is not broadcastable! Expected shape: {max_seq_length}, current bias shape: {bias_shape}"
+                )
+            # See comments below.
+            if not isinstance(q_shape[0].jagged_dims()[0].min_value(), IntImm):
+                raise RuntimeError(
+                    "Jagged dim' min value must be constant!"
+                    f"Current value: {q_shape[0].jagged_dims()=}"
+                )
+        else:
+            # Note: jagged_dims min / max values cannot be IntVar, as AIT lacks the feature to set
+            # "attributes" dynamically at runtime in general.
+            #
+            # Assuming the case: Q @ K @ V, Q / K / V are all dense tensor inputs.
+            # As a result, Q / K / V have total_length IntVar to represent the first dimension.
+            # Then there are make_jagged() ops which take Q / K / V as well as
+            # min_seq_len / max_seq_len IntVars as inputs.
+            # At runtime, Q / K / V are inputs passed to AIT runtime. However, since
+            # min_seq_len / max_seq_len is not bound to any input dimensions,
+            # there are no ways for AIT to infer these values. As a result, AIT compilation would
+            # fail.
+            #
+            # To support min_seq_len / max_seq_len IntVars, there must be a way dynamically set
+            # them at runtime.
+            #
+            # When bias is set, max_seq_len can be inferred from bias input.
+
+            if (not isinstance(q_shape[0].jagged_dims()[0].min_value(), IntImm)) or (
+                not isinstance(q_shape[0].jagged_dims()[0].max_value(), IntImm)
+            ):
+                raise RuntimeError(
+                    "Jagged dim' min / max values must be constant!"
+                    f"Current value: {q_shape[0].jagged_dims()=}"
+                )
+
+        return output_shape
+
+    def _get_op_attributes(self):
+        target_attrs = [
+            "causal_type",
+            "epilogue_math_name",
+            "alpha0",
+            "alpha1",
+            "alpha1_divide_by_seq_len",
+            "num_heads",
+        ]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index a3988c416..283794a0e 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -24,6 +24,7 @@
 
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
 from aitemplate.compiler.dtype import normalize_dtype
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
 from aitemplate.testing.detect_target import detect_target
 from aitemplate.utils.graph_utils import get_sorted_ops
 from aitemplate.utils.torch_utils import string_to_torch_dtype
@@ -191,3 +192,29 @@ def epilogue_math_name_to_torch_fn(epilogue_math_name: str) -> Callable[[Any], A
         return torch.nn.functional.tanh
     else:
         raise NotImplementedError(f"Unsupported {epilogue_math_name=}!")
+
+
+def get_attn_mask_per_causal_type(
+    m: int, n: int, causal_type: CausalType, torch_dtype: str
+) -> torch.Tensor:
+    if causal_type == CausalType.NO_CAUSAL:
+        invalid_attn_mask = torch.ones((m, n), dtype=torch_dtype, device="cuda")
+    elif causal_type == CausalType.LOWER_LEFT_EMPTY:
+        invalid_attn_mask: torch.Tensor = 1.0 - torch.tril(
+            torch.ones(
+                (m, n),
+                dtype=torch.bool,
+                device="cuda",
+            )
+        ).fill_diagonal_(False).to(torch_dtype)
+    elif causal_type == CausalType.UPPER_RIGHT_EMPTY:
+        invalid_attn_mask: torch.Tensor = torch.tril(
+            torch.ones(
+                (m, n),
+                dtype=torch_dtype,
+                device="cuda",
+            )
+        )
+    else:
+        raise NotImplementedError(f"Unsupported {causal_type=}!")
+    return invalid_attn_mask
diff --git a/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h b/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
index c7e51e385..47396ed60 100644
--- a/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
+++ b/static/include/kernels/fmha_style_b2b_bmm/kernel_forward.h
@@ -102,13 +102,15 @@ template <
     bool kSingleValueIteration,
     // Activation functor
     template <typename T>
-    class ActivationFunctor>
+    class ActivationFunctor,
+    typename offset_t_ = int64_t>
 struct AttentionKernel {
   using scalar_t = scalar_t_;
   using accum_t = accum_t_;
   using output_t = scalar_t;
   // Accumulator between 2 iterations
   using output_accum_t = accum_t;
+  using offset_t = offset_t_;
   static constexpr bool kIsAligned = isAligned_;
   static constexpr bool kPreloadV = ArchTag::kMinComputeCapability >= 80 &&
       cutlass::sizeof_bits<scalar_t>::value == 16;
@@ -133,8 +135,6 @@ struct AttentionKernel {
     scalar_t* key_ptr; // [num_keys, num_heads, head_dim]
     scalar_t* value_ptr; // [num_keys, num_heads, head_dim_value]
     scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys]
-    int32_t* cu_seqlens_q_ptr = nullptr;
-    int32_t* cu_seqlens_k_ptr = nullptr;
 
     // Output tensors
     output_t* output_ptr; // [num_queries, num_heads, head_dim_value]
@@ -144,6 +144,7 @@ struct AttentionKernel {
     // Scale
     accum_t scale;
     accum_t activation_scale;
+    bool activation_scale_divide_by_seq_len{false};
 
     // Dimensions/strides
     int32_t head_dim;
@@ -151,6 +152,22 @@ struct AttentionKernel {
     int32_t seq_length;
     int32_t num_queries;
     int32_t num_keys;
+    int32_t num_batches;
+    int32_t num_heads;
+
+    // When offset_ptr is not null, support variable sequence length.
+    // offset is a vector of offset of sequence length per batch.
+    // len(offset) = batch_size + 1
+    // In this case, seq_length / num_queries / num_keys are max_seq_length.
+    // offset is applied to queries, keys and values.
+    //
+    // e.g. If input tensor shape is:
+    // num_batches = 3
+    // batch0: seq_length_0=4, H, head_dim
+    // batch1: seq_length_0=2, H, head_dim
+    // batch2: seq_length_0=10, H, head_dim
+    // offset=[0, 4, 6, 16]
+    const offset_t* offset_ptr;
 
     enum CausalType {
       NO_CAUSAL = 0,
@@ -174,8 +191,6 @@ struct AttentionKernel {
     int64_t k_strideB;
     int64_t v_strideB;
     int32_t bias_strideB;
-    int32_t num_batches;
-    int32_t num_heads;
 
     CUTLASS_HOST_DEVICE int32_t o_strideM() const {
       return head_dim_value * num_heads;
@@ -186,20 +201,18 @@ struct AttentionKernel {
       auto batch_id = blockIdx.z;
       auto head_id = blockIdx.y;
       auto query_start = blockIdx.x * kQueriesPerBlock;
+      int64_t q_start = 0;
+      int64_t k_start = 0;
 
-      int64_t q_start, k_start;
       // Advance to current batch - in case of different sequence lengths
-      if (cu_seqlens_q_ptr != nullptr) {
-        assert(cu_seqlens_k_ptr != nullptr);
-        cu_seqlens_q_ptr += batch_id;
-        cu_seqlens_k_ptr += batch_id;
-        q_start = cu_seqlens_q_ptr[0];
-        k_start = cu_seqlens_k_ptr[0];
-        int64_t q_next_start = cu_seqlens_q_ptr[1];
-        int64_t k_next_start = cu_seqlens_k_ptr[1];
-        num_queries = q_next_start - q_start;
-        num_keys = k_next_start - k_start;
-
+      if (offset_ptr) {
+        auto start = offset_ptr[batch_id];
+        auto end = offset_ptr[batch_id + 1];
+        q_start = static_cast<int64_t>(start);
+        k_start = static_cast<int64_t>(start);
+        auto actual_seq_length = static_cast<int32_t>(end - start);
+        num_queries = actual_seq_length;
+        num_keys = actual_seq_length;
         if (query_start >= num_queries) {
           return false;
         }
@@ -232,11 +245,10 @@ struct AttentionKernel {
         // Accumulate directly in the destination buffer (eg for f32)
         output_accum_ptr = (accum_t*)output_ptr;
       }
-      num_queries -= query_start;
       if (causal_type == CausalType::UPPER_RIGHT_EMPTY) {
-        num_keys = cutlass::fast_min(
-            int32_t(query_start + kQueriesPerBlock), num_keys);
+        num_keys = cutlass::fast_min(num_queries, num_keys);
       }
+      num_queries -= query_start;
       num_batches = 0; // no longer used after
 
       // Make sure the compiler knows these variables are the same on all
@@ -652,8 +664,8 @@ struct AttentionKernel {
           [&](int accum_m) {},
           [&](int accum_m, int accum_n, int idx) {
             if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) {
-              int x = accum_m + query_start;
-              int y = accum_n + iter_key_start;
+              // int x = accum_m + query_start;
+              // int y = accum_n + iter_key_start;
               accum[idx] = accum[idx] * p.scale;
               if (p.attn_bias_ptr != nullptr) {
                 accum[idx] = accum[idx] +
@@ -662,12 +674,21 @@ struct AttentionKernel {
               }
               accum[idx] = ActivationFunctor<accum_t>()(accum[idx]) *
                   (accum_t)(p.activation_scale);
+              if (p.activation_scale_divide_by_seq_len) {
+                // Divide by max_seq_len instead of actual_seq_len.
+                // Might beed to be configured to use either max_seq_len or
+                // actual_seq_len in the future.
+                accum[idx] = accum[idx] / (accum_t)(p.seq_length);
+              }
+            } else {
+              // Need to set out-of-bound elements to 0 as these elements
+              // will also be used in the accum@V MMA.
+              accum[idx] = (accum_t)(0);
             }
           },
           [&](int accum_m) {});
 
       // Mask out last if causal
-      //      if (p.causal && p.num_keys - iter_key_start <= kKeysPerBlock) {
       if (p.causal_type != Params::CausalType::NO_CAUSAL) {
         int32_t last_col;
         MM0::ScalingCoefsUpdater::iterateRows(
@@ -678,14 +699,19 @@ struct AttentionKernel {
             [&](int accum_m, int accum_n, int idx) {
               switch (p.causal_type) {
                 case Params::CausalType::UPPER_RIGHT_EMPTY:
-                  if (accum_n > last_col && accum_m < problem_size_0_m &&
-                      accum_n < problem_size_0_n) {
-                    accum[idx] = accum_t(0);
+                  // Need to set out-of-bound elements to 0 as these elements
+                  // will also be used in the accum@V MMA.
+                  if (accum_n > last_col) {
+                    accum[idx] = (accum_t)(0);
                   }
                   break;
                 case Params::CausalType::LOWER_LEFT_EMPTY:
-                  if (accum_n < last_col && accum_m < problem_size_0_m) {
-                    accum[idx] = accum_t(0);
+                  // Need to set out-of-bound elements to 0 as these elements
+                  // will also be used in the accum@V MMA.
+                  if ((accum_n < last_col && accum_m < problem_size_0_m) ||
+                      accum_m >= problem_size_0_m ||
+                      accum_n >= problem_size_0_n) {
+                    accum[idx] = (accum_t)(0);
                   }
                   break;
               }
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index 6547aaf7d..ae04287e6 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -25,7 +25,10 @@
 from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import epilogue_math_name_to_torch_fn
+from aitemplate.testing.test_utils import (
+    epilogue_math_name_to_torch_fn,
+    get_attn_mask_per_causal_type,
+)
 from aitemplate.utils import shape_utils
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 
@@ -33,32 +36,6 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-def _get_attn_mask_per_causal_type(
-    m: int, n: int, causal_type: CausalType, torch_dtype: str
-) -> torch.Tensor:
-    if causal_type == CausalType.NO_CAUSAL:
-        invalid_attn_mask = torch.ones((m, n), dtype=torch_dtype, device="cuda")
-    elif causal_type == CausalType.LOWER_LEFT_EMPTY:
-        invalid_attn_mask: torch.Tensor = 1.0 - torch.tril(
-            torch.ones(
-                (m, n),
-                dtype=torch.bool,
-                device="cuda",
-            )
-        ).fill_diagonal_(False).to(torch_dtype)
-    elif causal_type == CausalType.UPPER_RIGHT_EMPTY:
-        invalid_attn_mask: torch.Tensor = torch.tril(
-            torch.ones(
-                (m, n),
-                dtype=torch_dtype,
-                device="cuda",
-            )
-        )
-    else:
-        raise NotImplementedError(f"Unsupported {causal_type=}!")
-    return invalid_attn_mask
-
-
 @unittest.skipIf(
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
@@ -144,7 +121,7 @@ def _test_classic_b2b_bmm(
             attn = alpha0 * (q_pt @ k_pt.transpose(-2, -1)) + bias_pt
             attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
             attn = alpha1 * attn
-            invalid_attn_mask = _get_attn_mask_per_causal_type(
+            invalid_attn_mask = get_attn_mask_per_causal_type(
                 m, n0, causal_type, torch_dtype
             )
             attn = attn * invalid_attn_mask
@@ -309,7 +286,7 @@ def _test_fmha_style_b2b_bmm(
                 attn = attn + bias_pt
             attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
             attn = alpha1 * attn
-            invalid_attn_mask = _get_attn_mask_per_causal_type(
+            invalid_attn_mask = get_attn_mask_per_causal_type(
                 m, n0, causal_type, torch_dtype
             )
             attn = attn * invalid_attn_mask
diff --git a/tests/unittest/ops/test_grouped_b2b_bmm.py b/tests/unittest/ops/test_grouped_b2b_bmm.py
new file mode 100644
index 000000000..49c1c5def
--- /dev/null
+++ b/tests/unittest/ops/test_grouped_b2b_bmm.py
@@ -0,0 +1,337 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for grouped b2b bmm Operators.
+"""
+import itertools
+import logging
+import unittest
+from typing import List, Tuple
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntVar, JaggedDim
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    epilogue_math_name_to_torch_fn,
+    get_attn_mask_per_causal_type,
+)
+from aitemplate.utils import shape_utils
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class GroupedFMHAStyleB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def _test_grouped_fmha_style_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        max_seq_lens: Tuple[int, List[int]] = 256,
+        head_dim=128,
+        head_dim_value=256,
+        num_heads=1,
+        has_bias=False,
+        bias_broadcast=None,
+        epilogue_math_name="Identity",
+        causal_type=CausalType.NO_CAUSAL,
+        dtype="float16",
+        offsets_dtype="int32",
+        test_name="grouped_fmha_style_b2b_bmm",
+        alpha1_divide_by_seq_len=True,
+        copy_op=True,
+        atol=1e-3,
+        rtol=1e-3,
+        use_fp16_acc=False,
+    ):
+        # Initialize AIT fmha_style_b2b_bmm operator.
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes, batch_sizes]
+        if isinstance(max_seq_lens, int):
+            max_seq_lens = [max_seq_lens, max_seq_lens]
+        alpha0 = 1.0 / (head_dim**0.5)
+        batch_size_dim = IntVar(
+            values=[min(batch_sizes), max(batch_sizes)], name="batch_size"
+        )
+        max_seq_len_dim = shape_utils.gen_int_var_min_max(
+            max_seq_lens, name="max_seq_len"
+        )
+        jagged_dims = [JaggedDim(min_value=0, max_value=max_seq_len_dim)]
+        total_length_dim = IntVar(
+            values=[0, batch_size_dim.upper_bound() * max_seq_len_dim.upper_bound()],
+            name="total_length",
+        )
+        offsets_dim = IntVar(
+            values=[batch_size_dim.lower_bound() + 1, batch_size_dim.upper_bound() + 1],
+            name="offset_length",
+        )
+        Q_dense = Tensor(
+            shape=[total_length_dim, num_heads, head_dim],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K_dense = Tensor(
+            shape=[total_length_dim, num_heads, head_dim],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V_dense = Tensor(
+            shape=[total_length_dim, num_heads, head_dim_value],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        offsets = [
+            Tensor(
+                shape=[offsets_dim], name="offsets", dtype=offsets_dtype, is_input=True
+            )
+        ]
+        Q = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            Q_dense, offsets
+        )
+        K = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            K_dense, offsets
+        )
+        V = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            V_dense, offsets
+        )
+        Bias = None
+        if has_bias:
+            shape = [batch_size_dim, num_heads, max_seq_len_dim, max_seq_len_dim]
+            if bias_broadcast:
+                for i, broadcast in enumerate(bias_broadcast):
+                    if broadcast:
+                        shape[i] = 1
+            Bias = Tensor(
+                shape=shape,
+                dtype=dtype,
+                name="bias",
+                is_input=True,
+            )
+        grouped_fmha_style_b2b_bmm_op = ops.grouped_fmha_style_b2b_bmm(
+            causal_type=causal_type,
+            alpha0=alpha0,
+            alpha1=1.0,
+            alpha1_divide_by_seq_len=alpha1_divide_by_seq_len,
+            epilogue_math_name=epilogue_math_name,
+            num_heads=num_heads,
+        )
+        if copy_op:
+            grouped_fmha_style_b2b_bmm_op = ops.grouped_fmha_style_b2b_bmm(
+                **grouped_fmha_style_b2b_bmm_op._get_op_attributes()
+            )
+        Y = grouped_fmha_style_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=use_fp16_acc)
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = use_fp16_acc
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        offsets_torch_dtype = string_to_torch_dtype(offsets_dtype)
+        for batch_size, max_seq_len in itertools.product(
+            sorted(set(batch_sizes)), sorted(set(max_seq_lens))
+        ):
+            # Initialize inputs
+            lengths = torch.randint(
+                1, max_seq_len, (batch_size + 1,), dtype=offsets_torch_dtype
+            )
+            lengths[0] = 0
+            offsets = torch.cumsum(lengths, dim=0).to(dtype=offsets_torch_dtype)
+            # print(f"{batch_size=}, {offsets=}")
+            total_length = offsets[-1]
+            offsets_pt = offsets.cuda()
+            q_pt = torch.rand(
+                (total_length, num_heads, head_dim), dtype=torch_dtype
+            ).cuda()
+            k_pt = torch.rand(
+                (total_length, num_heads, head_dim), dtype=torch_dtype
+            ).cuda()
+            v_pt = torch.rand(
+                (total_length, num_heads, head_dim_value), dtype=torch_dtype
+            ).cuda()
+            bias_shape = [batch_size, num_heads, max_seq_len, max_seq_len]
+            if bias_broadcast:
+                for i, broadcast in enumerate(bias_broadcast):
+                    if broadcast:
+                        bias_shape[i] = 1
+            bias_pt = torch.rand(bias_shape, dtype=torch_dtype).cuda()
+
+            # Run AIT.
+            inputs = {
+                "q": q_pt,
+                "k": k_pt,
+                "v": v_pt,
+                "offsets": offsets_pt,
+            }
+            if has_bias:
+                inputs["bias"] = bias_pt
+            y = torch.empty(
+                [total_length, num_heads, head_dim_value],
+                dtype=torch_dtype,
+                device="cuda",
+            )
+            module.run_with_tensors(inputs, [y])
+
+            # Run PT reference and verify results.
+            for row in range(batch_size):
+                start = offsets[row]
+                end = offsets[row + 1]
+                length = end - start
+                q_pt_row = q_pt[start:end, :, :]
+                k_pt_row = k_pt[start:end, :, :]
+                v_pt_row = v_pt[start:end, :, :]
+                attn = alpha0 * (
+                    q_pt_row.transpose(0, 1)
+                    @ k_pt_row.transpose(0, 1).transpose(-2, -1)
+                )
+                if has_bias:
+                    bias_row = (
+                        0 if (bias_broadcast is not None and bias_broadcast[0]) else row
+                    )
+                    bias_pt_row = bias_pt[
+                        bias_row : bias_row + 1, :, :length, :length
+                    ].squeeze(dim=0)
+                    attn = attn + bias_pt_row
+                attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
+                if alpha1_divide_by_seq_len:
+                    attn /= max_seq_len
+                invalid_attn_mask = get_attn_mask_per_causal_type(
+                    length, length, causal_type, torch_dtype
+                )
+                attn = attn * invalid_attn_mask
+                output = (attn @ v_pt_row.transpose(0, 1)).transpose(0, 1)
+                y_pt_row = output.detach()
+                # print(
+                #     f"{batch_size=}, {row=}, {y[start:end, :, :]=}, {y_pt_row.to(torch_dtype)=}"
+                # )
+                torch.testing.assert_close(
+                    y[start:end, :, :], y_pt_row.to(torch_dtype), atol=atol, rtol=rtol
+                )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_fmha_style_b2b_bmm_fp16(self):
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_dynamic_batch",
+            dtype="float16",
+            batch_sizes=[3, 8, 10],
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_dynamic_batch_fp16_acc",
+            dtype="float16",
+            batch_sizes=[3, 8, 10],
+            use_fp16_acc=True,
+            # Need to use a larger threshold for fp16 accum, it seems that
+            # torch always generates the same result regardless of
+            # how allow_fp16_reduced_precision_reduction is set.
+            atol=1e-2,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_causal_upper_right_empty",
+            dtype="float16",
+            batch_sizes=2,
+            causal_type=CausalType.UPPER_RIGHT_EMPTY,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_causal_lower_left_empty",
+            dtype="float16",
+            batch_sizes=3,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_bias",
+            dtype="float16",
+            batch_sizes=2,
+            has_bias=True,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_bias_broadcast",
+            dtype="float16",
+            batch_sizes=3,
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_dynamic_seq_len",
+            dtype="float16",
+            max_seq_lens=[128, 256],
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_sigmoid",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_multi_head",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            has_bias=True,
+            num_heads=2,
+            bias_broadcast=[True, True, True, False],
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_complex",
+            dtype="float16",
+            offsets_dtype="int64",
+            batch_sizes=[3, 4],
+            epilogue_math_name="SiLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+            num_heads=4,
+        )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_complex_fp16_acc",
+            dtype="float16",
+            batch_sizes=[1, 4, 10, 512, 1024],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, False, True, False],
+            num_heads=2,
+            use_fp16_acc=True,
+            max_seq_lens=1024,
+            # Need to use a larger threshold for fp16 accum, it seems that
+            # torch always generates the same result regardless of
+            # how allow_fp16_reduced_precision_reduction is set.
+            atol=1e-2,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From c829c06dd912235ffbd2d377d90049c0c15d2df7 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Mon, 3 Apr 2023 23:26:49 -0700
Subject: [PATCH 380/638] Support models with 2GB+ params for internal
 pipelines (#527)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/527

This PR applied the same approach in D44533582 to internal
compilation flow.

Reviewed By: hl475

Differential Revision: D44588867

fbshipit-source-id: 1a1d3783eff2e3023a3ff82fd5c524a0f83e488e
---
 python/aitemplate/backend/cuda/target_def.py | 12 +++++++++++-
 python/aitemplate/backend/rocm/target_def.py | 12 +++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index ee8e489cd..1606dfc13 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -20,6 +20,7 @@
 import logging
 import os
 import pipes
+import platform
 import re
 import secrets
 import shutil
@@ -329,7 +330,16 @@ def binary_compile_cmd(self):
         There is no ld by default in the prod env. Instead, we use ld from the gvfs path.
         """
         ld = self.nvcc_options_json["ld"]
-        return " ".join([ld, "-r -b binary -o {target} {src}"])
+        objcopy = self.nvcc_options_json["objcopy"]
+        cmd = " ".join([ld, "-r -b binary -o {target} {src}"])
+        # Support models with >2GB constants on Linux only
+        if platform.system() == "Linux":
+            cmd += (
+                f" && {objcopy} --rename-section"
+                " .data=.lrodata,alloc,load,readonly,data,contents"
+                " {target} {target}"
+            )
+        return cmd
 
     def cc(self):
         return self.nvcc_options_json["nvcc_bin"]
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index 7055b843e..be9359391 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -20,6 +20,7 @@
 import json
 import logging
 import os
+import platform
 import re
 import shutil
 import sys
@@ -365,7 +366,16 @@ def binary_compile_cmd(self):
         There is no ld by default in the prod env. Instead, we use ld from the gvfs path.
         """
         ld = self.hipcc_options_json["ld"]
-        return " ".join([ld, "-r -b binary -o {target} {src}"])
+        objcopy = self.hipcc_options_json["objcopy"]
+        cmd = " ".join([ld, "-r -b binary -o {target} {src}"])
+        # Support models with >2GB constants on Linux only
+        if platform.system() == "Linux":
+            cmd += (
+                f" && {objcopy} --rename-section"
+                " .data=.lrodata,alloc,load,readonly,data,contents"
+                " {target} {target}"
+            )
+        return cmd
 
     def cc(self):
         return self.hipcc_options_json["hipcc_bin"]

From b6f4368b9fb212267d4c7824b05984612a4c929d Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 4 Apr 2023 15:15:46 +0800
Subject: [PATCH 381/638] fix bugs

---
 .../backend/common/vision_ops/multi_level_roi_align_common.py | 4 +++-
 python/aitemplate/backend/rocm/embedding/bert_embeddings.py   | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
index 5211cda90..1dc9ac608 100644
--- a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
@@ -233,7 +233,9 @@
             interpolateBilinear(src, srcDims, ySample, xSample, featureCount);
       }
     }
-{% if elem_output_type == "half" %}
+{% if elem_output_type == "ck::half_t" %}
+    *out = __half(result) / __float2half_rn(samplingCount);
+{% elif elem_output_type == "half" %}
     *out = result / __float2half_rn(samplingCount);
 {% elif elem_output_type == "float" %}
     *out = result / samplingCount;
diff --git a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
index 5c10eedc1..de169c70c 100644
--- a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
@@ -133,9 +133,9 @@ def python_int_dtype_to_c_dtype(dtype):
 @registry.reg("rocm.bert_embeddings.gen_function")
 def bert_embeddings_gen_function(func_attrs: Dict[str, Any]) -> str:
     backend_spec = ROCMSpec()
-    elem_input_type = backend_spec.dtype_to_ck_type[
+    elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][3]._attrs["dtype"]
-    ]
+    )
     (
         input_ids,
         token_type_ids,

From 70da6996a414f40d96d69f3e1b504a95c6a493d4 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 4 Apr 2023 15:56:50 +0800
Subject: [PATCH 382/638] remove useless chamges

---
 examples/03_bert/benchmark_ait.py             | 40 -------------------
 .../common/tensor/permute102_common.py        |  4 --
 .../backend/cuda/tensor/permute102.py         |  1 +
 3 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index dc3738260..2864f5b60 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -228,33 +228,6 @@ def compile_module(
     return mod
 
 
-def load_module(
-    batch_size: int,
-    seq_length: int,
-    hidden_size: int,
-    activation: str,
-    use_fp16_acc: bool,
-    encoders_only: bool,
-    pt_model: torch.nn.Module,
-) -> None:
-    model_name = f"BERT_{activation}_{batch_size}_{seq_length}"
-
-    if encoders_only:
-        model = BertBaseEncodersOnly(batch_size, seq_length, hidden_act=activation)
-    else:
-        model = BertBaseUncased(batch_size, seq_length, hidden_act=activation)
-
-    # Mark all parameters with name same to PyTorch name convention
-    model.name_parameter_tensor()
-
-    params = map_pt_params(model, pt_model, batch_size, seq_length)
-
-    mod = Model(os.path.join("./tmp", model_name, "test.so"))
-
-    for k, v in params.items():
-        mod.set_constant_with_tensor(k, v)
-
-    return mod
 
 
 @click.command()
@@ -309,19 +282,6 @@ def compile_and_benchmark(
     pt_model.eval()
     hidden_size = pt_model.config.hidden_size
 
-    if batch_size >= 1 and seq_length >= 1:
-        mod = load_module(
-            batch_size,
-            seq_length,
-            hidden_size,
-            activation,
-            use_fp16_acc,
-            encoders_only,
-            pt_model,
-        )
-        benchmark(batch_size, seq_length, hidden_size, mod, graph_mode, encoders_only)
-        return
-
     if batch_size < 1:
         batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
     else:
diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py
index 880afae82..1f83b5884 100644
--- a/python/aitemplate/backend/common/tensor/permute102_common.py
+++ b/python/aitemplate/backend/common/tensor/permute102_common.py
@@ -193,10 +193,6 @@
 #define DIRECT_BLOCK_Z 2
 
 namespace {
-#ifndef __HIP_PLATFORM_HCC__
-using bfloat16 = __nv_bfloat16;
-#endif
-
 template<typename T>
 __global__ void permute102_tiled_kernel(T* output,
                                         const T *input,
diff --git a/python/aitemplate/backend/cuda/tensor/permute102.py b/python/aitemplate/backend/cuda/tensor/permute102.py
index a1457521b..498cf4eb2 100644
--- a/python/aitemplate/backend/cuda/tensor/permute102.py
+++ b/python/aitemplate/backend/cuda/tensor/permute102.py
@@ -29,6 +29,7 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/util/host_tensor.h"
 
+using bfloat16 = __nv_bfloat16;
 """
 
 
From 9c104ecad928f1cc6a0257789537b1faba4c5827 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 4 Apr 2023 01:36:19 -0700
Subject: [PATCH 383/638] Added Meta Copyright header (#531)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/531

Reviewed By: alexanderguzhva

Differential Revision: D44617243

Pulled By: chenyang78

fbshipit-source-id: f2b4decd8076e9643377488e58f3c59c9cef7e0f
---
 fx2ait/fx2ait/csrc/AITModel.cpp                    | 14 ++++++++++++++
 fx2ait/fx2ait/csrc/AITModel.h                      | 14 ++++++++++++++
 fx2ait/fx2ait/csrc/AITModelImpl.cpp                | 14 ++++++++++++++
 fx2ait/fx2ait/csrc/AITModelImpl.h                  | 14 ++++++++++++++
 .../test/converters/test_ait_convtranspose2d.py    |  4 +---
 .../test_ait_convtranspose2d_aten.py               |  4 +---
 6 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModel.cpp b/fx2ait/fx2ait/csrc/AITModel.cpp
index 6ba3eaba6..eed7855c4 100644
--- a/fx2ait/fx2ait/csrc/AITModel.cpp
+++ b/fx2ait/fx2ait/csrc/AITModel.cpp
@@ -1,3 +1,17 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
 #include "AITModel.h"
 
 #include "picojson.h"
diff --git a/fx2ait/fx2ait/csrc/AITModel.h b/fx2ait/fx2ait/csrc/AITModel.h
index 8949758db..4780b7ed1 100644
--- a/fx2ait/fx2ait/csrc/AITModel.h
+++ b/fx2ait/fx2ait/csrc/AITModel.h
@@ -1,3 +1,17 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
 #pragma once
 
 #include <torch/torch.h> // @manual=//caffe2:torch-cpp
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 5c8feed11..0193c3c2b 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -1,3 +1,17 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
 #include "AITModelImpl.h" // @manual
 
 #include <type_traits>
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.h b/fx2ait/fx2ait/csrc/AITModelImpl.h
index 6b78d735f..56924a420 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.h
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.h
@@ -1,3 +1,17 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
 #pragma once
 
 #include "model_interface.h" // @manual=//aitemplate/AITemplate/static/include:aitemplate
diff --git a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
index 33b9e9d6e..7e9972296 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_convtranspose2d.py
@@ -1,4 +1,4 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,8 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
 import torch
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.tools.common_fx2ait import AITTestCase
diff --git a/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
index ebf6dd83c..50c13008e 100644
--- a/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
+++ b/fx2ait/fx2ait/test/converters_aten/test_ait_convtranspose2d_aten.py
@@ -1,4 +1,4 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,8 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
 import torch
 from fx2ait.tools.common_aten2ait import DispatchTestCase
 from parameterized import param, parameterized

From 4fd9a5c162bb6f70177f3afb448b80bfe29c0533 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 4 Apr 2023 10:54:04 -0700
Subject: [PATCH 384/638] append the name to the front of each op (#535)

Summary:
With the change, we will have something like:

(Tensor(name=elementwise_1_0, shape=[batch_size, 1920])) = gemm_rcr_bias()(
   ...
)

This change would make it much easier to map the pseudo code to the profiling results.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/535

Reviewed By: alexanderguzhva

Differential Revision: D44661845

Pulled By: chenyang78

fbshipit-source-id: 3c6998987d3f2161664fb7591c840f9a896cdfe5
---
 python/aitemplate/compiler/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 31bd2f262..3f4bbc75f 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -1248,4 +1248,5 @@ def pseudo_code(self, with_shape=True):
         args = self._pseudo_code_helper(self._args_for_pseudo_code(), with_shape)
         inputs = self._pseudo_code_helper(self._inputs_for_pseudo_code(), with_shape)
         outputs = self._pseudo_code_helper(self._outputs_for_pseudo_code(), with_shape)
-        return f"({outputs}) \n= {self._attrs['op']}({args})(\n{inputs})\n"
+        name = self._attrs.get("name", None)
+        return f"# {name}\n({outputs}) \n= {self._attrs['op']}({args})(\n{inputs})\n"

From e230819e20cb6193014077729cda7e681fbbc190 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 4 Apr 2023 18:03:46 -0700
Subject: [PATCH 385/638] Add doc for missing compile_model parameters. Fix
 typos (#537)

Summary:
Fixes:
- Add doc for missing `compile_model()` parameters
- Fix output type of `in_ci_env()`
- Fix op**it**mizations typo

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/537

Reviewed By: amateurcoffee

Differential Revision: D44690194

Pulled By: hl475

fbshipit-source-id: 1533b78ea9703c7025a4fb95fa0598d6e1f7327b
---
 python/aitemplate/backend/cuda/target_def.py | 2 +-
 python/aitemplate/backend/target.py          | 8 ++++----
 python/aitemplate/compiler/compiler.py       | 9 ++++++++-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 1606dfc13..08ba8811a 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -369,7 +369,7 @@ def list_rindex(input_list, x):
             res = f.read()
             return res
 
-    def in_ci_env(self):
+    def in_ci_env(self) -> bool:
         return (
             os.environ.get("INSIDE_RE_WORKER", None) == "1" and not self.trick_ci_env()
         )
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index ab8b3d937..ab34a6dda 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -22,7 +22,7 @@
 import shutil
 import tempfile
 from enum import IntEnum
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple
 
 from aitemplate.backend import registry
 from aitemplate.backend.profiler_cache import ProfileCacheDB
@@ -249,13 +249,13 @@ def trick_ci_env(self) -> bool:
         """
         return os.environ.get("TRICK_CI_ENV", None) == "1"
 
-    def in_ci_env(self) -> Union[None, str]:
+    def in_ci_env(self) -> bool:
         """Check if the current environment is CI.
 
         Returns
         -------
-        Union[None, str]
-            CI environment name if in CI environment, otherwise None.
+        bool
+            Returns True if env CI_FLAG=CIRCLECI and TRICK_CI_ENV is not set (or 0).
         """
         return os.environ.get("CI_FLAG", None) == "CIRCLECI" and not self.trick_ci_env()
 
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index fb0d9dba9..427b63f7d 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -93,7 +93,7 @@ def _verify_outputs_still_in_graph(sorted_graph: List[Tensor], outputs: List[Ten
     for tensor, was_seen in seen.items():
         if not was_seen:
             raise ValueError(
-                f"Output {tensor} was not found in the graph after opitmizations."
+                f"Output {tensor} was not found in the graph after optimizations."
             )
 
 
@@ -183,10 +183,17 @@ def compile_model(
         How many runtimes should be stored in the internal pool. This
         determines how many inferences can happen concurrently. By
         default, set to 1. Must be positive.
+    profile_dir: str
+        The base dir to generate profiling source codes. By default, workdir/test_name
+    constants: Dict[str, TorchTensor], optional
+        User-provided constants to bind to the graph. The constants can be folded and packaged into
+        the final *.so.
     allocator_kind: AITemplateAllocatorKind, optional
         The GPU allocator to use. If none is specified, use the default allocator.
     debug_settings: AITDebugSettings
         specify debug settings such as where to dump AITemplate model Python file, etc.
+    do_optimize_graph: bool
+        Apply full list of graph optimizations. Default: True
 
     Returns
     -------

From b75bfa1f24cf95262d5931397436f64b28f027a9 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 4 Apr 2023 22:11:16 -0700
Subject: [PATCH 386/638] Add is_linux() to utils.misc (#534)

Summary:
Add `is_linux()` to `utils.misc`. And use it in `target.py` instead of checking the `platform.system()` result.
alexanderguzhva

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/534

Reviewed By: chenyang78

Differential Revision: D44695103

Pulled By: hl475

fbshipit-source-id: e87b6480b9ea6cf76d9f7c35ccf135e662019de3
---
 python/aitemplate/backend/cuda/target_def.py | 5 ++---
 python/aitemplate/backend/rocm/target_def.py | 4 ++--
 python/aitemplate/backend/target.py          | 4 ++--
 python/aitemplate/utils/misc.py              | 5 +++++
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 08ba8811a..8c84aeea4 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -20,7 +20,6 @@
 import logging
 import os
 import pipes
-import platform
 import re
 import secrets
 import shutil
@@ -43,7 +42,7 @@
 
 from aitemplate.utils import environ
 from aitemplate.utils.io import copytree_with_hash
-from aitemplate.utils.misc import is_debug
+from aitemplate.utils.misc import is_debug, is_linux
 
 
 # pylint: disable=C0415,W0707,W0611,W0702,W1401
@@ -333,7 +332,7 @@ def binary_compile_cmd(self):
         objcopy = self.nvcc_options_json["objcopy"]
         cmd = " ".join([ld, "-r -b binary -o {target} {src}"])
         # Support models with >2GB constants on Linux only
-        if platform.system() == "Linux":
+        if is_linux():
             cmd += (
                 f" && {objcopy} --rename-section"
                 " .data=.lrodata,alloc,load,readonly,data,contents"
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index be9359391..ec3bcc134 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -20,7 +20,6 @@
 import json
 import logging
 import os
-import platform
 import re
 import shutil
 import sys
@@ -35,6 +34,7 @@
 )
 
 from aitemplate.utils import environ
+from aitemplate.utils.misc import is_linux
 
 # pylint: disable=W0613
 
@@ -369,7 +369,7 @@ def binary_compile_cmd(self):
         objcopy = self.hipcc_options_json["objcopy"]
         cmd = " ".join([ld, "-r -b binary -o {target} {src}"])
         # Support models with >2GB constants on Linux only
-        if platform.system() == "Linux":
+        if is_linux():
             cmd += (
                 f" && {objcopy} --rename-section"
                 " .data=.lrodata,alloc,load,readonly,data,contents"
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index ab34a6dda..6207cf554 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -18,7 +18,6 @@
 import logging
 import os
 import pathlib
-import platform
 import shutil
 import tempfile
 from enum import IntEnum
@@ -26,6 +25,7 @@
 
 from aitemplate.backend import registry
 from aitemplate.backend.profiler_cache import ProfileCacheDB
+from aitemplate.utils.misc import is_linux
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -175,7 +175,7 @@ def binary_compile_cmd(self):
         """
         cmd = "ld -r -b binary -o {target} {src}"
         # Support models with >2GB constants on Linux only
-        if platform.system() == "Linux":
+        if is_linux():
             cmd += (
                 " && objcopy --rename-section"
                 " .data=.lrodata,alloc,load,readonly,data,contents"
diff --git a/python/aitemplate/utils/misc.py b/python/aitemplate/utils/misc.py
index e9429b7c1..52afc582e 100644
--- a/python/aitemplate/utils/misc.py
+++ b/python/aitemplate/utils/misc.py
@@ -18,6 +18,7 @@
 import hashlib
 import logging
 import os
+import platform
 
 
 def is_debug():
@@ -25,6 +26,10 @@ def is_debug():
     return logger.level == logging.DEBUG
 
 
+def is_linux() -> bool:
+    return platform.system() == "Linux"
+
+
 def is_windows() -> bool:
     return os.name == "nt"
 

From 2799c64759836d3105df9ca549c4e6598f1e7cc8 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 5 Apr 2023 11:34:26 -0700
Subject: [PATCH 387/638] Hotfix: Include path write conflict (#536)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/536

Fix for include path write conflict reported here:
https://fb.workplace.com/groups/757073672259175/permalink/885310439435497/

It works by incorporating the user name into the generated include path to avoid conflicts.

Reviewed By: chenyang78, aakhundov

Differential Revision: D44670444

fbshipit-source-id: aa052ab4724559ded5b3d99be5dbc93764038890
---
 python/aitemplate/backend/build_cache_base.py |  40 ++-
 python/aitemplate/backend/builder.py          |  11 +-
 python/aitemplate/backend/cuda/target_def.py  |  20 +-
 tests/unittest/backend/test_build_cache.py    | 289 ++++++++++--------
 4 files changed, 217 insertions(+), 143 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index 3e5223f47..3c8c56888 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -18,6 +18,7 @@
 import os
 import secrets
 import shutil
+import tempfile
 
 from abc import ABC, abstractmethod
 from datetime import datetime, timedelta
@@ -28,6 +29,7 @@
 
 _LOGGER = logging.getLogger(__name__)
 
+
 # File extensions to be considered source files
 source_extensions = {
     "cpp",
@@ -127,6 +129,19 @@ def is_bin_file(filename: str) -> bool:
     return filename.lower().endswith(".bin")
 
 
+def makefile_normalizer(makefile_content_orig: bytes) -> bytes:
+    """Normalize the content of the makefile for hashing purposes (nothing else!),
+    so that it can be compared to other Makefiles
+    generated by different users on different systems"""
+    makefile_content = makefile_content_orig.decode("utf-8")
+    tmpdir = tempfile.gettempdir()
+    userid = str(os.getuid())
+    user_tmpdir = os.path.join(tmpdir, userid)
+    makefile_content = makefile_content.replace(user_tmpdir, "/tmp/$USER")
+    makefile_content = makefile_content.replace(tmpdir, "/tmp")
+    return makefile_content.encode("utf-8")
+
+
 def create_dir_hash(
     cmds: List[str],
     build_dir: str,
@@ -166,16 +181,21 @@ def create_dir_hash(
                 continue
             hash_object.update(str(fpath).encode("utf-8"))
             fullpath = str(basepath / fpath)
-            with open(fullpath, "rb") as f:
-                # read file in chunks of 32kb
-                # in order to support large files ( constants.obj )
-                while True:
-                    chunk = f.read(1024 * 32)
-                    if not chunk:
-                        break
-                    hash_object.update(chunk)
-                if debug:
-                    hash_log.write(f"\t{str(fpath)} -> {hash_object.hexdigest()}\n")
+            if fpath.name.lower() == "makefile":
+                makefile_content = (basepath / fpath).read_bytes()
+                makefile_content = makefile_normalizer(makefile_content)
+                hash_object.update(makefile_content)
+            else:
+                with open(fullpath, "rb") as f:
+                    # read file in chunks of 32kb
+                    # in order to support large files ( constants.obj )
+                    while True:
+                        chunk = f.read(1024 * 32)
+                        if not chunk:
+                            break
+                        hash_object.update(chunk)
+            if debug:
+                hash_log.write(f"\t{str(fpath)} -> {hash_object.hexdigest()}\n")
         if debug:
             hash_log.write(
                 f"Final hash of {build_dir} is {hash_object.hexdigest().lower()}\n"
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index fed10dd4d..e4c7cf8c8 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -30,7 +30,7 @@
 
 import jinja2
 
-from aitemplate.backend.build_cache import BUILD_CACHE
+from aitemplate.backend import build_cache
 from aitemplate.backend.build_cache_base import write_binhash_file
 
 from aitemplate.backend.target import Target
@@ -146,9 +146,10 @@ def _log_error_context(
 def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
     _LOGGER.debug(f"make {cmds=}")
     if allow_cache:
-        cached_results_available, store_cache_key = BUILD_CACHE.retrieve_build_cache(
-            cmds, build_dir
-        )
+        (
+            cached_results_available,
+            store_cache_key,
+        ) = build_cache.BUILD_CACHE.retrieve_build_cache(cmds, build_dir)
     else:
         cached_results_available, store_cache_key = False, None
     if not cached_results_available:
@@ -179,7 +180,7 @@ def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
                 _LOGGER.debug(f"make stdout:\n\n{stdout}")
                 _LOGGER.debug(f"make stderr:\n\n{stderr}")
         if store_cache_key is not None:
-            BUILD_CACHE.store_build_cache(cmds, build_dir, store_cache_key)
+            build_cache.BUILD_CACHE.store_build_cache(cmds, build_dir, store_cache_key)
 
 
 def process_task(task: Task) -> None:
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 8c84aeea4..6162518ec 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -203,12 +203,25 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         cub_src_path = parutil.get_dir_path("aitemplate/AITemplate/fb/3rdparty/cub")
         static_files_path = parutil.get_dir_path("aitemplate/AITemplate/static")
         self._include_path = None
+        try:
+            self.tmp_path = os.path.join(
+                tempfile.gettempdir(), f"{os.getuid()}_aitemplate_tmp"
+            )
+        except OSError:
+            _LOGGER.warning(
+                "FBCUDA Target: Failed to create user-specific temp directory path."
+            )
+            self.tmp_path = self.tmp_path = os.path.join(
+                tempfile.gettempdir(), f"{secrets.token_hex(16)}_aitemplate_tmp"
+            )
         if not FBCUDA.cutlass_path_:
             # Copy all of the includes over into an include directory
             random_key = secrets.token_hex(16)
             # the random_key part of this path will later be renamed to the content hash
             self._include_path = os.path.join(
-                tempfile.gettempdir(), "aitemplate_tmp", random_key, "includes"
+                self.tmp_path,
+                random_key,
+                "includes",
             )
             includes_content_hash = hashlib.sha256()
             FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
@@ -234,9 +247,10 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             # Now we have a content hash over all include contents
             include_hash_digest = includes_content_hash.hexdigest()
             # Prepare to rename atomically
-            old_path = os.path.join(tempfile.gettempdir(), "aitemplate_tmp", random_key)
+            old_path = os.path.join(self.tmp_path, random_key)
             new_path = os.path.join(
-                tempfile.gettempdir(), "aitemplate_tmp", include_hash_digest
+                self.tmp_path,
+                include_hash_digest,
             )
             # if it already exists, we don't want to overwrite it
             # we can just delete our copy.
diff --git a/tests/unittest/backend/test_build_cache.py b/tests/unittest/backend/test_build_cache.py
index 79b543675..1be768b14 100644
--- a/tests/unittest/backend/test_build_cache.py
+++ b/tests/unittest/backend/test_build_cache.py
@@ -20,13 +20,17 @@
 from pathlib import Path
 
 import torch
+
+from aitemplate.backend import build_cache
 from aitemplate.backend.build_cache_base import (
     create_dir_hash,
     FileBasedBuildCache,
     is_source,
+    makefile_normalizer,
+    NoBuildCache,
 )
-from aitemplate.backend.cuda.target_def import FBCUDA
 
+from aitemplate.backend.cuda.target_def import FBCUDA
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
@@ -107,131 +111,166 @@ def test_file_build_cache(self):
             )
 
     def test_deterministic_codegen(self, dtype="float32"):
-        # Tests, whether repeated invocation of compilation results in identical generated source files
-        test_name = "test_deterministic_codegen"
-        basepath = "./tmp"
-
-        # Clean previous test results. These are usually kept for debugging purposes
-        # but we need a clean slate here.
-        if os.path.exists(basepath):
-            existing_dirs = [
-                d
-                for d in os.listdir(basepath)
-                if d.startswith(test_name) and os.path.isdir(os.path.join(basepath, d))
-            ]
-            for d in existing_dirs:
-                oldpath = os.path.join(basepath, d)
-                if os.path.exists(oldpath) and test_name in oldpath:
-                    shutil.rmtree(oldpath)
-        else:
-            os.mkdir(basepath)
-
-        Y = self._create_model_graph()
-        target = detect_target()
-        debug_settings = AITDebugSettings(gen_standalone=False)
-        dll_name = "test.so"
-        build_dir = os.path.join("./tmp", test_name)
-        compile_model(
-            Y,
-            target,
-            "./tmp",
-            test_name + "_1",
-            dll_name=dll_name,
-            debug_settings=debug_settings,
-        )
-        hash1 = create_dir_hash(["test_name"], build_dir + "_1", is_source, debug=True)
-        Y = self._create_model_graph()
-        target = detect_target()
-        # Variant 2: Clean build
-        compile_model(
-            Y,
-            target,
-            "./tmp",
-            test_name + "_2",
-            dll_name=dll_name,
-            debug_settings=debug_settings,
-        )
-        hash2 = create_dir_hash(["test_name"], build_dir + "_2", is_source, debug=True)
-        assert (
-            hash1 == hash2
-        ), "Code generation was not deterministic. Cache key mismatch between first and second code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
-        # Variant 3: Build over existing build dir
-        Y = self._create_model_graph()
-        target = detect_target()
-        compile_model(
-            Y,
-            target,
-            "./tmp",
-            test_name + "_2",
-            dll_name=dll_name,
-            debug_settings=debug_settings,
-        )
-        hash3 = create_dir_hash(["test_name"], build_dir + "_2", is_source, debug=True)
-        assert (
-            hash2 == hash3
-        ), "Code generation was not deterministic. Cache key mismatch between second and third code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
-
-        # Variant 4: Let's provoke to copy the includes again, maybe to a new path?
-        Y = self._create_model_graph()
-        FBCUDA.cutlass_path_ = None
-        compile_model(
-            Y,
-            target,
-            "./tmp",
-            test_name + "_4",
-            dll_name=dll_name,
-            debug_settings=debug_settings,
-        )
-        hash4 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
-
-        assert (
-            hash3 == hash4
-        ), "Code generation was not deterministic. Cache key mismatch between third and fourth code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
-
-        with open(
-            os.path.join(build_dir + "_4", "Makefile"), "a", encoding="utf-8"
-        ) as f:
-            f.write("\n")
-
-        hash5 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
-        assert (
-            hash4 != hash5
-        ), "Directory hash was not sensitive to a change in the Makefile, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
-        with open(
-            os.path.join(build_dir + "_4", "anything.cu"), "w", encoding="utf-8"
-        ) as f:
-            f.write("// Nothing, really\n")
-
-        hash6 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
-        assert (
-            hash6 != hash5
-        ), "Directory hash was not sensitive to a change in a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
-
-        os.rename(
-            os.path.join(build_dir + "_4", "anything.cu"),
-            os.path.join(build_dir + "_4", "anything_.cu"),
-        )
-        hash7 = create_dir_hash(["test_name"], build_dir + "_4", is_source, debug=True)
-        assert (
-            hash7 != hash6
-        ), "Directory hash was not sensitive to a change of name of a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
-
-        Y = self._create_model_graph()
-        target = detect_target()
-        debug_settings = AITDebugSettings(gen_standalone=True)
-        compile_model(
-            Y,
-            target,
-            "./tmp",
-            test_name + "_8",
-            dll_name=dll_name,
-            debug_settings=debug_settings,
-        )
-        hash8 = create_dir_hash(["test_name"], build_dir + "_8", is_source, debug=True)
+        old_build_cache = build_cache.BUILD_CACHE
+        try:
+            build_cache.BUILD_CACHE = NoBuildCache()
+
+            # Tests, whether repeated invocation of compilation results in identical generated source files
+            test_name = "test_deterministic_codegen"
+            basepath = "./tmp"
+
+            # Clean previous test results. These are usually kept for debugging purposes
+            # but we need a clean slate here.
+            if os.path.exists(basepath):
+                existing_dirs = [
+                    d
+                    for d in os.listdir(basepath)
+                    if d.startswith(test_name)
+                    and os.path.isdir(os.path.join(basepath, d))
+                ]
+                for d in existing_dirs:
+                    oldpath = os.path.join(basepath, d)
+                    if os.path.exists(oldpath) and test_name in oldpath:
+                        shutil.rmtree(oldpath)
+            else:
+                os.mkdir(basepath)
+
+            Y = self._create_model_graph()
+            target = detect_target()
+            debug_settings = AITDebugSettings(gen_standalone=False)
+            dll_name = "test.so"
+            build_dir = os.path.join("./tmp", test_name)
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name + "_1",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            hash1 = create_dir_hash(
+                ["test_name"], build_dir + "_1", is_source, debug=True
+            )
+            Y = self._create_model_graph()
+            target = detect_target()
+            # Variant 2: Clean build
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name + "_2",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            hash2 = create_dir_hash(
+                ["test_name"], build_dir + "_2", is_source, debug=True
+            )
+            assert (
+                hash1 == hash2
+            ), "Code generation was not deterministic. Cache key mismatch between first and second code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+            # Variant 3: Build over existing build dir
+            Y = self._create_model_graph()
+            target = detect_target()
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name + "_2",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            hash3 = create_dir_hash(
+                ["test_name"], build_dir + "_2", is_source, debug=True
+            )
+            assert (
+                hash2 == hash3
+            ), "Code generation was not deterministic. Cache key mismatch between second and third code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+
+            # Variant 4: Let's provoke to copy the includes again, maybe to a new path?
+            Y = self._create_model_graph()
+            FBCUDA.cutlass_path_ = None
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name + "_4",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            hash4 = create_dir_hash(
+                ["test_name"], build_dir + "_4", is_source, debug=True
+            )
+
+            assert (
+                hash3 == hash4
+            ), "Code generation was not deterministic. Cache key mismatch between third and fourth code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+
+            with open(
+                os.path.join(build_dir + "_4", "Makefile"), "a", encoding="utf-8"
+            ) as f:
+                f.write("\n")
+
+            hash5 = create_dir_hash(
+                ["test_name"], build_dir + "_4", is_source, debug=True
+            )
+            assert (
+                hash4 != hash5
+            ), "Directory hash was not sensitive to a change in the Makefile, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+            with open(
+                os.path.join(build_dir + "_4", "anything.cu"), "w", encoding="utf-8"
+            ) as f:
+                f.write("// Nothing, really\n")
+
+            hash6 = create_dir_hash(
+                ["test_name"], build_dir + "_4", is_source, debug=True
+            )
+            assert (
+                hash6 != hash5
+            ), "Directory hash was not sensitive to a change in a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
 
-        assert (
-            hash8 != hash1
-        ), "Directory hash was not sensitive to a change of Makefile (standalone codegen) and possibly source code, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+            os.rename(
+                os.path.join(build_dir + "_4", "anything.cu"),
+                os.path.join(build_dir + "_4", "anything_.cu"),
+            )
+            hash7 = create_dir_hash(
+                ["test_name"], build_dir + "_4", is_source, debug=True
+            )
+            assert (
+                hash7 != hash6
+            ), "Directory hash was not sensitive to a change of name of a source file, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+
+            Y = self._create_model_graph()
+            target = detect_target()
+            debug_settings = AITDebugSettings(gen_standalone=True)
+            compile_model(
+                Y,
+                target,
+                "./tmp",
+                test_name + "_8",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            hash8 = create_dir_hash(
+                ["test_name"], build_dir + "_8", is_source, debug=True
+            )
+
+            assert (
+                hash8 != hash1
+            ), "Directory hash was not sensitive to a change of Makefile (standalone codegen) and possibly source code, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
+        finally:
+            build_cache.BUILD_CACHE = old_build_cache
+
+    def test_makefile_rewrite(self):
+        tmpdir = os.path.join(tempfile.gettempdir(), f"{os.getuid()}_aitemplate_tmp")
+        makefile = f"""
+                TMPDIR: {tmpdir}
+        """
+        assert tmpdir in makefile
+        rewritten_makefile = makefile_normalizer(makefile.encode("utf-8")).decode(
+            "utf-8"
+        )
+        assert tmpdir not in rewritten_makefile
+        assert "$USER" in rewritten_makefile
 
 
 filter_test_cases_by_test_env(BuildCacheTestCase)

From 2540aaa84e74580e8a4a3d42b70ad59446293cdd Mon Sep 17 00:00:00 2001
From: Hung-Ju Chen <robert501128@meta.com>
Date: Wed, 5 Apr 2023 18:35:57 -0700
Subject: [PATCH 388/638] add priority toposort (#408)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/408

Following the suggestion in T147269828,
Use Kahn's algorithm with a priority queue, the priority definition of the priority queue would be the order of the sorted graph.

Following this note: https://fb.workplace.com/notes/1322142898565804 , added a "SizePriTensorHelper" , which larger size tensor would be in the top rank of the sorted graph.

In the future if we'd like to test different types of priority, simply add a new class inherited from "PriTensorHelper", overwrite the function "get_priority"

Reviewed By: tenpercent

Differential Revision: D43994537

fbshipit-source-id: 95012bc231e1212d36ce3481349212e0faa6a942
---
 .../aitemplate/compiler/transform/toposort.py | 82 ++++++++++++++++++-
 .../compiler/test_transform_toposort.py       | 40 +++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/transform/toposort.py b/python/aitemplate/compiler/transform/toposort.py
index c71b44937..8fb6411e8 100644
--- a/python/aitemplate/compiler/transform/toposort.py
+++ b/python/aitemplate/compiler/transform/toposort.py
@@ -15,7 +15,8 @@
 """
 Graph pass for topological sort.
 """
-from typing import List, Union
+import heapq
+from typing import List, Tuple, Union
 
 from aitemplate.compiler.base import Tensor
 
@@ -35,6 +36,10 @@ def toposort(nodes: Union[Tensor, List[Tensor]]) -> List[Tensor]:
     List[Tensor]
         Sorted graph
     """
+    return _priSort(nodes, SizePriTensorHelper())
+
+
+def _dfsSort(nodes: Union[Tensor, List[Tensor]]) -> List[Tensor]:
     visited = set()
     sorted_graph = []
     stack = []
@@ -68,5 +73,80 @@ def toposort(nodes: Union[Tensor, List[Tensor]]) -> List[Tensor]:
             for idx in visit_seq:
                 arg = args[idx]
                 stack.append((arg, False))
+    return sorted_graph
+
+
+class PriTensorHelper:
+    def __init__(self) -> None:
+        self.entry_cnt = -1
+
+    def get_heap_input(self, node: Tensor) -> Tuple[float, int, Tensor]:
+        # input is built based on heapq doc suggestion:
+        # https://docs.python.org/3/library/heapq.html#priority-queue-implementation-notes
+        # the return tuple is: (
+        #   priority_ (less is more important),
+        #   entry_cnt (so earlier entered item is chosen if same priority),
+        #   element (here is tensor)
+        # )
+        self.entry_cnt += 1
+        return (
+            self.get_priority(node),
+            self.entry_cnt,
+            node,
+        )
+
+    def get_tensor_from_heap_output(
+        self, heap_output: Tuple[float, int, Tensor]
+    ) -> Tensor:
+        return heap_output[2]
+
+    def get_priority(self, node: Tensor) -> float:
+        # please implement your own priority function
+        # note that smaller value would be in higher-pri
+        pass
+
+
+class SizePriTensorHelper(PriTensorHelper):
+    def get_priority(self, node: Tensor) -> float:
+        # use negative byte size since
+        # we'd like to pop larger size first
+        return -node.size_bytes()
+
+
+def _priSort(
+    nodes: Union[Tensor, List[Tensor]], pri_tensor_helper: PriTensorHelper
+) -> List[Tensor]:
+    # do a DFS to get all nodes in a list
+    nodes = _dfsSort(nodes)
+    # number of src tensors
+    in_degree = {}
+    for node in nodes:
+        in_degree[node] = 0
+        for src_op in node.src_ops():
+            # sometimes it'd have 2 same nodes in one list
+            # change to set to de-dupe these nodes
+            in_degree[node] += len(set(src_op._attrs["inputs"]))
+
+    queue = []
+    sorted_graph = []
+    for node in nodes:
+        if in_degree[node] == 0:
+            # input nodes need to be in the original order,
+            # hence add them to the sorted graph here
+            # instead of going through the pri heap
+            sorted_graph.append(node)
+            heapq.heappush(queue, pri_tensor_helper.get_heap_input(node))
+
+    while queue:
+        node = pri_tensor_helper.get_tensor_from_heap_output(heapq.heappop(queue))
+        if node not in sorted_graph:
+            sorted_graph.append(node)
 
+        for dst_op in node.dst_ops():
+            for next_node in set(dst_op._attrs["outputs"]):
+                if next_node not in in_degree:
+                    continue
+                in_degree[next_node] -= 1
+                if in_degree[next_node] == 0:
+                    heapq.heappush(queue, pri_tensor_helper.get_heap_input(next_node))
     return sorted_graph
diff --git a/tests/unittest/compiler/test_transform_toposort.py b/tests/unittest/compiler/test_transform_toposort.py
index 918a5e40c..0642e88b3 100644
--- a/tests/unittest/compiler/test_transform_toposort.py
+++ b/tests/unittest/compiler/test_transform_toposort.py
@@ -18,10 +18,29 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.transform.toposort import (
+    _dfsSort,
+    _priSort,
+    SizePriTensorHelper,
+)
 from aitemplate.testing import detect_target
 
 
 class TestTopoSort(unittest.TestCase):
+    def _get_diff_size_graph(self):
+        X1 = Tensor(shape=[10, 50], dtype="float16", name="in_10_50")
+        X2 = Tensor(shape=[50, 1000], dtype="float16", name="in_50_1000")
+        X3 = Tensor(shape=[1000, 5], dtype="float16", name="in_1000_5")
+        X4 = Tensor(shape=[5, 5], dtype="float16", name="in_5_5")
+        X5 = ops.gemm_rrr()(X1, X2)
+        X5._attrs["name"] = "MUL_10_1000"
+        X6 = ops.gemm_rrr()(X3, X4)
+        X6._attrs["name"] = "MUL_1000_5"
+        X7 = ops.gemm_rrr()(X5, X6)
+        X7._attrs["name"] = "MUL_10_5"
+        X7._attrs["is_output"] = True
+        return X7
+
     def test_very_deep_toposort(self):
         x = Tensor(
             [2, 10],
@@ -46,6 +65,27 @@ def test_very_deep_toposort(self):
 
         self.assertTrue(torch.equal(out_ait, out_pt))
 
+    def test_size_pri_toposort(self):
+        tensor = self._get_diff_size_graph()
+        expected_order = [
+            "in_10_50",
+            "in_50_1000",
+            "in_1000_5",
+            "in_5_5",
+            "MUL_10_1000",
+            "MUL_1000_5",
+            "MUL_10_5",
+        ]
+        self.assertEqual(
+            [node._attrs["name"] for node in _priSort(tensor, SizePriTensorHelper())],
+            expected_order,
+        )
+
+        # dfs don't follow size pri order
+        self.assertNotEqual(
+            [node._attrs["name"] for node in _dfsSort(tensor)], expected_order
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From f4dcb10f948a94a5c1bb591908053e34a8c35947 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Wed, 5 Apr 2023 23:43:40 -0700
Subject: [PATCH 389/638] skip fusing split + strided_concat (#538)

Summary:
Fixed a bug where I accidentally dropped a previous check that prevented us from fusing split + strided_concat.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/538

Reviewed By: singlaiiit, khabinov, alexanderguzhva

Differential Revision: D44704343

Pulled By: chenyang78

fbshipit-source-id: f8290b7370458af678642cfc3c5d98ad0fa57c83
---
 .../transform/transform_memory_ops.py         |  8 ++-
 .../compiler/test_transform_memory_ops.py     | 62 +++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index 49d6cff3d..c11944fe8 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -283,7 +283,13 @@ def _try_merge_cat_cat(first_cat: Operator, second_cat: Operator) -> bool:
 
 
 def _try_merge_split_cat(split_op: Operator, cat: Operator) -> bool:
-    # If split_op carries strided input_accessors, we skip it
+    # If split_op carries strided input_accessors, we skip it.
+    if not all(
+        accessor.actual_shapes is None for accessor in cat._attrs["input_accessors"]
+    ):
+        return False
+    if not all(cat._attrs["input_masks"]):
+        return False
     split_op_inputs = split_op._attrs["inputs"]
     split_op_outputs = split_op._attrs["outputs"]
     cat_inputs = cat._attrs["inputs"]
diff --git a/tests/unittest/compiler/test_transform_memory_ops.py b/tests/unittest/compiler/test_transform_memory_ops.py
index 8e54a05d3..2212cf1e0 100644
--- a/tests/unittest/compiler/test_transform_memory_ops.py
+++ b/tests/unittest/compiler/test_transform_memory_ops.py
@@ -1031,6 +1031,68 @@ def test_non_fusible_strided_cat_cat(self):
             test_name="test_non_fusible_strided_cat_cat",
         )
 
+    def _test_non_fusible_split_reshape_cat(self, M, test_name, dtype="float16"):
+        # make the following graph
+        # split_0, split_1 = split(x0)
+        # unsqueeze_2 = unsqueeze(dim=1)(split_0)
+        # unsqueeze_3 = unsqueeze(dim=1)(split_1)
+        # add_4 = add(x1, x1)
+        # y = concat([unsqueeze_2, unsqueeze_3, add_4], dim=1)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        assert M % 2 == 0, f"expected {M=} % 2 == 0"
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(2), IntImm(M // 2)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        dim = 1
+        split_0, split_1 = ops.split()(X0, [M // 2, M // 2], dim=dim)
+        unsqueeze_2 = ops.unsqueeze(dim=dim)(split_0)
+        unsqueeze_3 = ops.unsqueeze(dim=dim)(split_1)
+        add_4 = ops.elementwise(FuncEnum.ADD)(X1, X1)
+        Y = ops.concatenate()([unsqueeze_2, unsqueeze_3, add_4], dim=dim)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", test_name)
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, 2, M // 2], dtype)
+
+            split_0_pt, split_1_pt = torch.split(x0_pt, [M // 2, M // 2], dim=dim)
+            unsqueeze_2_pt = torch.unsqueeze(split_0_pt, dim)
+            unsqueeze_3_pt = torch.unsqueeze(split_1_pt, dim)
+            add_4_pt = x1_pt + x1_pt
+            y_pt = torch.cat([unsqueeze_2_pt, unsqueeze_3_pt, add_4_pt], dim=dim)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+            }
+            outputs = [y]
+            module.run_with_tensors(inputs, outputs)
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_non_fusible_split_reshape_cat(self):
+        self._test_non_fusible_split_reshape_cat(
+            M=32,
+            test_name="test_non_fusible_split_reshape_cat",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 85fb7007bdfae812fd2c80fbfe04ac8191fdcb13 Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Thu, 6 Apr 2023 11:38:48 -0700
Subject: [PATCH 390/638] Add back CVV Preset, remove MHA from AIT Lowering
 (#539)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/539

Add a check on MHA converter for the case where head_size isn't divisible by 4.

Reviewed By: chenyang78

Differential Revision: D44724952

fbshipit-source-id: b30ea1ed30c3b8fccf6f4e77c3aeb19d23b05e4a
---
 fx2ait/fx2ait/converters/ait_module_converters.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index be41bf4ba..892f05bfc 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -41,6 +41,17 @@ def multi_head_attention_module(
     value = kwargs["value"] if "value" in kwargs else args[2]
     bsz, seq_len_q, dim = query.shape()
     _, seq_len, _ = key.shape()
+
+    assert (
+        submod.embed_dim % submod.num_heads == 0
+    ), f"embed_dim {submod.embed_dim} must be divisible by num_heads {submod.num_heads}"
+    head_size = submod.embed_dim // submod.num_heads
+    if head_size % 4 != 0:
+        raise ValueError(
+            f"The head size {head_size} (ie. embed_dim ({submod.embed_dim}) / num_heads ({submod.num_heads}) "
+            " must be divisible by 4. Please fix the model or consider using the complete_video_view_all_page_types preset",
+        )
+
     attn = nn.CrossAttention(
         dim=submod.embed_dim,
         seq_len=seq_len_q.value(),

From f534aa0a15553de8d8eb359ad6176a82b65465fb Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 7 Apr 2023 10:35:22 -0700
Subject: [PATCH 391/638] Use parameterized.expand for
 test_gemm_rcr_bias_add_float_sm80 and test_gemm_rcr_bias_float_sm80 (#542)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/542

This would help to reduce the test duration

Reviewed By: houseroad

Differential Revision: D44778881

fbshipit-source-id: 74d251750dcc19ee5f5dcc5d6d7d8b8511ffafcc
---
 .../compiler/test_fuse_mm_elementwise.py      | 275 ++++++++++++------
 1 file changed, 180 insertions(+), 95 deletions(-)

diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index 002c910bd..02b900c5a 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -32,6 +32,13 @@
 from parameterized import parameterized
 
 
+def custom_name_func(testcase_func, param_num, param):
+    return "%s_%s_sm80" % (
+        testcase_func.__name__[:-5],
+        param.args[-2],
+    )
+
+
 class FuseGemmRcrBiasCase(unittest.TestCase):
     def _build_gemm_rcr_bias(self, M, N, K, decomposed, dtype):
         X_shape = [M, K]
@@ -761,44 +768,96 @@ def test_gemm_rcr_bias_mul_tanh(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_rcr_bias_add_float_sm80(self):
-        self._test_gemm_rcr_bias(
-            [8], 16, 8, True, "gemm_rcr_bias_basic_decomposed_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_add(
-            [8], 16, 8, False, "gemm_rcr_bias_add_basic_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_add_add(
-            [8, 32], 16, 8, False, "gemm_rcr_bias_add_add_dynamic_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_add_add_relu(
-            [8],
-            16,
-            3,
-            False,
-            "gemm_rcr_bias_add_add_relu_need_align_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_add_relu(
-            [8],
-            16,
-            8,
-            True,
-            "gemm_rcr_bias_add_relu_basic_decomposed_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_tanh(
-            [8], 16, 8, False, "gemm_rcr_bias_tanh_basic_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_mul(
-            [8, 32], 16, 8, False, "gemm_rcr_bias_mul_dynamic_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_mul_add(
-            [8], 16, 3, False, "gemm_rcr_bias_mul_add_need_align_float", dtype="float"
-        )
-        self._test_gemm_rcr_bias_mul_tanh(
-            [8], 16, 3, False, "gemm_rcr_bias_mul_tanh_need_align_float", dtype="float"
-        )
+    @parameterized.expand(
+        [
+            (
+                _test_gemm_rcr_bias,
+                [8],
+                16,
+                8,
+                True,
+                "gemm_rcr_bias_basic_decomposed_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_add_add,
+                [8],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_add_basic_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_add_add,
+                [8, 32],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_add_add_dynamic_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_add_add_relu,
+                [8],
+                16,
+                3,
+                False,
+                "gemm_rcr_bias_add_add_relu_need_align_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_add_relu,
+                [8],
+                16,
+                8,
+                True,
+                "gemm_rcr_bias_add_relu_basic_decomposed_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_tanh,
+                [8],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_tanh_basic_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_mul,
+                [8, 32],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_mul_dynamic_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_mul_add,
+                [8],
+                16,
+                3,
+                False,
+                "gemm_rcr_bias_mul_add_need_align_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_mul_tanh,
+                [8],
+                16,
+                3,
+                False,
+                "gemm_rcr_bias_mul_tanh_need_align_float",
+                "float",
+            ),
+        ],
+        name_func=custom_name_func,
+    )
+    def test_gemm_rcr_bias_add_float_sm80(
+        self, func, Ms, N, K, decomposed, testname, dtype
+    ):
+        func(self, Ms, N, K, decomposed, testname, dtype)
 
 
 filter_test_cases_by_test_env(FuseGemmRcrBiasCase)
@@ -1144,63 +1203,89 @@ def test_gemm_rcr_bias_gelu(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_rcr_bias_float_sm80(self):
-        self._test_gemm_rcr_bias_activation(
-            [8],
-            16,
-            8,
-            "relu",
-            "gemm_rcr_bias_relu",
-            True,
-            "gemm_rcr_bias_relu_basic_decomposed_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_activation(
-            [8],
-            16,
-            8,
-            "sigmoid",
-            "gemm_rcr_bias_sigmoid",
-            False,
-            "gemm_rcr_bias_sigmoid_basic_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_sigmoid_mul(
-            [8],
-            16,
-            8,
-            False,
-            "gemm_rcr_bias_sigmoid_mul_basic_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_sigmoid_mul_tanh(
-            [8],
-            16,
-            3,
-            False,
-            "gemm_rcr_bias_sigmoid_mul_tanh_need_align_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_activation(
-            [8],
-            16,
-            8,
-            "tanh",
-            "gemm_rcr_bias_tanh",
-            False,
-            "gemm_rcr_bias_tanh_basic_float",
-            dtype="float",
-        )
-        self._test_gemm_rcr_bias_activation(
-            [8, 32],
-            16,
-            8,
-            "fast_gelu",
-            "gemm_rcr_bias_fast_gelu",
-            True,
-            "gemm_rcr_bias_fast_gelu_basic_decomposed_float",
-            dtype="float",
-        )
+    @parameterized.expand(
+        [
+            (
+                _test_gemm_rcr_bias_activation,
+                [8],
+                16,
+                8,
+                True,
+                "gemm_rcr_bias_relu_basic_decomposed_float",
+                "float",
+                "relu",
+                "gemm_rcr_bias_relu",
+            ),
+            (
+                _test_gemm_rcr_bias_activation,
+                [8],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_sigmoid_basic_float",
+                "float",
+                "sigmoid",
+                "gemm_rcr_bias_sigmoid",
+            ),
+            (
+                _test_gemm_rcr_bias_sigmoid_mul,
+                [8],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_sigmoid_mul_basic_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_sigmoid_mul_tanh,
+                [8],
+                16,
+                3,
+                False,
+                "gemm_rcr_bias_sigmoid_mul_tanh_need_align_float",
+                "float",
+            ),
+            (
+                _test_gemm_rcr_bias_activation,
+                [8],
+                16,
+                8,
+                False,
+                "gemm_rcr_bias_tanh_basic_float",
+                "float",
+                "tanh",
+                "gemm_rcr_bias_tanh",
+            ),
+            (
+                _test_gemm_rcr_bias_activation,
+                [8, 32],
+                16,
+                8,
+                True,
+                "gemm_rcr_bias_fast_gelu_basic_decomposed_float",
+                "float",
+                "fast_gelu",
+                "gemm_rcr_bias_fast_gelu",
+            ),
+        ],
+        name_func=custom_name_func,
+    )
+    def test_gemm_rcr_bias_float_sm80(
+        self,
+        func,
+        Ms,
+        N,
+        K,
+        decomposed,
+        testname,
+        dtype,
+        activation=None,
+        target_ait=None,
+    ):
+        if activation and target_ait:
+            func(self, Ms, N, K, activation, target_ait, decomposed, testname, dtype)
+        else:
+            func(self, Ms, N, K, decomposed, testname, dtype)
 
 
 filter_test_cases_by_test_env(FuseGemmRcrBiasActivationCase)

From 7d9781fb003a332cf19e9531a76d3ff3ea1c877f Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 7 Apr 2023 10:52:06 -0700
Subject: [PATCH 392/638] naively split test_bmm_dtype and
 test_bmm_broadcast_dtype (#546)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/546

This would help to reduce the test duration

Reviewed By: houseroad

Differential Revision: D44782015

fbshipit-source-id: 3f4ce8d3bb07766eaef866ec19d41ba990ae5b38
---
 tests/unittest/ops/test_bmm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index e390865f7..9c1d42be3 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -307,7 +307,7 @@ def test_ccc(self):
 
     @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_bmm_dtype(self, dtype):
+    def test_bmm_0_dtype(self, dtype):
         self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcr(
             [1, 5, 77, 128],
@@ -332,6 +332,9 @@ def test_bmm_dtype(self, dtype):
             [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_bmm_1_dtype(self, dtype):
         self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcc(
             [1, 5, 77, 128],
@@ -725,7 +728,7 @@ def test_ccc(self):
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
     @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_broadcast_dtype(self, dtype):
+    def test_bmm_broadcast_0_dtype(self, dtype):
         self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
         self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
@@ -735,6 +738,8 @@ def test_bmm_broadcast_dtype(self, dtype):
         self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
         self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
 
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_broadcast_1_dtype(self, dtype):
         self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
         self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)

From 64911f0e406fa6de220a34aec38989525d6db947 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 7 Apr 2023 10:52:06 -0700
Subject: [PATCH 393/638] naively split test_bmm_add_dtype and
 test_bmm_add_broadcast_dtype (#548)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/548

This would help to reduce the test duration

Reviewed By: houseroad

Differential Revision: D44782077

fbshipit-source-id: bc69c15b67de543ce3c23fabf205c8050b646590
---
 tests/unittest/ops/test_bmm_add.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 86d1cbd4c..e94849583 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -334,7 +334,7 @@ def test_crc(self):
         self._test_crc(B=32, M=256, K=256, N=512)
 
     @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_dtype(self, dtype):
+    def test_bmm_add_0_dtype(self, dtype):
         self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccr(
             B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
@@ -344,6 +344,8 @@ def test_bmm_add_dtype(self, dtype):
             B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
         )
 
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_add_1_dtype(self, dtype):
         self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccc(
             B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
@@ -734,7 +736,7 @@ def test_ccc(self):
         )
 
     @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_broadcast_dtype(self, dtype):
+    def test_bmm_add_broadcast_0_dtype(self, dtype):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
@@ -764,6 +766,8 @@ def test_bmm_add_broadcast_dtype(self, dtype):
             dtype=dtype,
         )
 
+    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    def test_bmm_add_broadcast_1_dtype(self, dtype):
         self._test_crc(
             [1, 8, 16],
             [2, 8, 32],

From a0ff21b67948153efacff4b1080a6fa5ce445af6 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 7 Apr 2023 14:46:55 -0700
Subject: [PATCH 394/638] Use parameterized.expand for
 test_fuse_permute_bmm_sm80 (#545)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/545

This would help to reduce the test duration

Reviewed By: houseroad

Differential Revision: D44781395

fbshipit-source-id: 438062d52630e86e3346a4c9ac7f8ed6bcb34d7d
---
 .../compiler/test_fuse_permute_bmm.py         | 158 +++++++++++-------
 1 file changed, 100 insertions(+), 58 deletions(-)

diff --git a/tests/unittest/compiler/test_fuse_permute_bmm.py b/tests/unittest/compiler/test_fuse_permute_bmm.py
index 687772871..7587b5581 100644
--- a/tests/unittest/compiler/test_fuse_permute_bmm.py
+++ b/tests/unittest/compiler/test_fuse_permute_bmm.py
@@ -32,6 +32,20 @@
 from parameterized import parameterized
 
 
+def custom_name_func_with_testname(testcase_func, param_num, param):
+    return "%s_%s_sm80" % (
+        testcase_func.__name__[:-5],
+        param.args[-2],
+    )
+
+
+def custom_name_func_with_funcname(testcase_func, param_num, param):
+    return "%s_%s_sm80" % (
+        testcase_func.__name__[:-5],
+        str(param.args[0].__name__),
+    )
+
+
 class FusePermuteBmmCase(unittest.TestCase):
     def _create_permute_bmm_graph(
         self, A_shape, B_shape, bmm_type, permA, permB, dtype, bias_shape=None
@@ -122,61 +136,75 @@ def test_misalign_b_bmm(self):
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_misalign_bmm_float_sm80(self):
-        self._test_missing_alignment_bmm(
-            [2, 4, 7],
-            [2, 7, 8],
-            "bmm_crr",
-            True,
-            False,
-            "bmm_crr_misalign_a",
-            dtype="float",
-        )
-        self._test_missing_alignment_bmm(
-            [2, 4, 7],
-            [2, 8, 4],
-            "bmm_rcr",
-            True,
-            False,
-            "bmm_rcr_misalign_a",
-            dtype="float",
-        )
-        self._test_missing_alignment_bmm(
-            [2, 4, 7],
-            [2, 4, 8],
-            "bmm_rrr",
-            True,
-            False,
-            "bmm_rrr_misalign_a",
-            dtype="float",
-        )
-        self._test_missing_alignment_bmm(
-            [2, 8, 4],
-            [2, 8, 7],
-            "bmm_ccr",
-            False,
-            True,
-            "bmm_ccr_misalign_b",
-            dtype="float",
-        )
-        self._test_missing_alignment_bmm(
-            [2, 7, 8],
-            [2, 8, 7],
-            "bmm_crr",
-            False,
-            True,
-            "bmm_crr_misalign_b",
-            dtype="float",
-        )
-        self._test_missing_alignment_bmm(
-            [2, 4, 8],
-            [2, 8, 7],
-            "bmm_rcr",
-            False,
-            True,
-            "bmm_rcr_misalign_b",
-            dtype="float",
-        )
+    @parameterized.expand(
+        [
+            (
+                _test_missing_alignment_bmm,
+                [2, 4, 7],
+                [2, 7, 8],
+                "bmm_crr",
+                True,
+                False,
+                "bmm_crr_misalign_a",
+                "float",
+            ),
+            (
+                _test_missing_alignment_bmm,
+                [2, 4, 7],
+                [2, 8, 4],
+                "bmm_rcr",
+                True,
+                False,
+                "bmm_rcr_misalign_a",
+                "float",
+            ),
+            (
+                _test_missing_alignment_bmm,
+                [2, 4, 7],
+                [2, 4, 8],
+                "bmm_rrr",
+                True,
+                False,
+                "bmm_rrr_misalign_a",
+                "float",
+            ),
+            (
+                _test_missing_alignment_bmm,
+                [2, 8, 4],
+                [2, 8, 7],
+                "bmm_ccr",
+                False,
+                True,
+                "bmm_ccr_misalign_b",
+                "float",
+            ),
+            (
+                _test_missing_alignment_bmm,
+                [2, 7, 8],
+                [2, 8, 7],
+                "bmm_crr",
+                False,
+                True,
+                "bmm_crr_misalign_b",
+                "float",
+            ),
+            (
+                _test_missing_alignment_bmm,
+                [2, 4, 8],
+                [2, 8, 7],
+                "bmm_rcr",
+                False,
+                True,
+                "bmm_rcr_misalign_b",
+                "float",
+            ),
+        ],
+        name_func=custom_name_func_with_testname,
+    )
+    def test_misalign_bmm_float_sm80(
+        self, func, A_shape, B_shape, bmm_type, permA, permB, testname, dtype
+    ):
+        func(self, A_shape, B_shape, bmm_type, permA, permB, testname, dtype)
 
     def _test_permute_bmm(
         self,
@@ -758,9 +786,23 @@ def test_gemm_broadcast_rrr_to_crr(self):
         self._test_gemm_broadcast_rrr_to_crr(False)
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_broadcast_float_sm80(self):
-        self._test_gemm_broadcast_rcr_to_ccr(True, dtype="float")
-        self._test_gemm_broadcast_rrr_to_crr(False, dtype="float")
+    @parameterized.expand(
+        [
+            (
+                _test_gemm_broadcast_rcr_to_ccr,
+                True,
+                "float",
+            ),
+            (
+                _test_gemm_broadcast_rrr_to_crr,
+                False,
+                "float",
+            ),
+        ],
+        name_func=custom_name_func_with_funcname,
+    )
+    def test_gemm_broadcast_float_sm80(self, func, test_bias, dtype):
+        func(self, test_bias, dtype)
 
     @parameterized.expand(
         filter_test_cases_by_params(

From e1344efad011ec211c2b2f8a03168aa384f20274 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Fri, 7 Apr 2023 14:46:55 -0700
Subject: [PATCH 395/638] Use parameterized.expand for
 test_gemm_bias_broadcast_sm80 (#547)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/547

This would help to reduce the test duration

Reviewed By: khabinov, houseroad

Differential Revision: D44781804

fbshipit-source-id: 02501b800663bfc5a75f4c684ec9c61ebe8ba750
---
 .../unittest/ops/test_gemm_bias_broadcast.py  | 66 ++++++++++++-------
 1 file changed, 43 insertions(+), 23 deletions(-)

diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index 3c562e65a..2e6320094 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -25,6 +25,16 @@
     get_torch_empty_tensor,
 )
 
+from parameterized import parameterized
+
+
+def custom_name_func_with_funcname(testcase_func, param_num, param):
+    return "%s_%s_%s" % (
+        testcase_func.__name__[:-5],
+        str(param.args[0].__name__),
+        testcase_func.__name__[-4:],
+    )
+
 
 class GEMMBiasBroadcastTestCase(unittest.TestCase):
     def _init_tensors(self, m, k, n, m0=None, m1=None, dtype="float16"):
@@ -322,29 +332,39 @@ def test_bias_rcr_mul_tanh(self):
     def test_bias_rcr_mul_tanh_rocm(self):
         self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
 
-    def test_gemm_bias_broadcast_float32_sm80(self):
-        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="float32")
-        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="float32")
-
-    def test_gemm_bias_broadcast_bfloat16_bf16(self):
-        self._test_bias_rcr_mul_add(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_sigmoid_mul(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_sigmoid_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_add(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_add_add_relu(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_mul(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_add_add(None, 2, 32, 256, 128, dtype="bfloat16")
-        self._test_bias_rcr_mul_tanh(None, 2, 32, 256, 128, dtype="bfloat16")
+    @parameterized.expand(
+        [
+            (_test_bias_rcr_mul_add, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_sigmoid_mul, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_sigmoid_mul_tanh, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_add, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_add_relu, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_add_add_relu, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_mul, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_add_add, None, 2, 32, 256, 128, "float32"),
+            (_test_bias_rcr_mul_tanh, None, 2, 32, 256, 128, "float32"),
+        ],
+        name_func=custom_name_func_with_funcname,
+    )
+    def test_gemm_bias_broadcast_float32_sm80(self, func, m, m0, m1, k, n, dtype):
+        func(self, m, m0, m1, k, n, dtype)
+
+    @parameterized.expand(
+        [
+            (_test_bias_rcr_mul_add, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_sigmoid_mul, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_sigmoid_mul_tanh, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_add, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_add_relu, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_add_add_relu, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_mul, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_add_add, None, 2, 32, 256, 128, "bfloat16"),
+            (_test_bias_rcr_mul_tanh, None, 2, 32, 256, 128, "bfloat16"),
+        ],
+        name_func=custom_name_func_with_funcname,
+    )
+    def test_gemm_bias_broadcast_bfloat16_bf16(self, func, m, m0, m1, k, n, dtype):
+        func(self, m, m0, m1, k, n, dtype)
 
     def test_gemm_bias_broadcast_use_fp16_acc_sm80(self):
         self._test_bias_rcr_mul(

From 5f23f3fc5ce123efbee3903298a73adda3baa8b6 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 7 Apr 2023 17:22:48 -0700
Subject: [PATCH 396/638] Carry over check_sequence_lengths in
 dedup_make_jagged_ops pass (#549)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/549

The `check_sequence_lengths` attribute of the `make_jagged` ops is not carried over in the `dedup_make_jagged_ops` from the old ops to the new one. This is a bug that causes problems in the setting where `check_sequence_lengths` is set to `False` in the existing ops, as the default value is `True`. The diff fixes the bug by carrying over the attribute in the pass.

Reviewed By: amateurcoffee, tissue3

Differential Revision: D44808132

fbshipit-source-id: 5bdd8d8d764cafbf06e0bacfaffe973db2aecf25
---
 python/aitemplate/compiler/transform/dedup_make_jagged_ops.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py b/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py
index 982a2c59f..c07185184 100644
--- a/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py
+++ b/python/aitemplate/compiler/transform/dedup_make_jagged_ops.py
@@ -205,6 +205,9 @@ def _apply_make_jagged_to_inputs(
         new_make_jagged_op = make_jagged(
             batch_dim=data.jagged_int_var.batch_dim(),
             jagged_dims=data.jagged_int_var.jagged_dims(),
+            check_sequence_lengths=all(
+                d.op._attrs["check_sequence_lengths"] for d in make_jagged_group
+            ),
         )
         jagged_tensors = new_make_jagged_op(
             source=sources_list,

From 0222f451ad1cd488f7e7a1a22280fc291e04cf56 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sat, 8 Apr 2023 11:08:41 -0700
Subject: [PATCH 397/638] Fix "Profiler is not executable" error (#544)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/544

Previously, the same `Makefile` was used (and reused) for building all different kinds of profilers for different ops and op configurations. This has caused an issue when running parallel unit tests in a local environment, as contention for the same `./tmp/profiler/Makefile` has led to different tests rewriting it before being read by others.

As a result, the tests were building each others' profilers and were left without their own. The latter manifested itself in the following error, as the profiler executable that should have been built wouldn't have been there by the time the compilation would have ended:

```
Profiler ./tmp/profiler/gemm_rcr/gemm_rcr_9e46850d5286ecc7e078b5b7f76afbcac62967b4_3 is not executable
```

In this diff, the built profiler target names are included in the per-profiler `Makefile` name, hence excluding the possibility of different tests rewriting each other profiler `Makefile`s. This resolves the issue and the above error is no longer raised. Importantly, it is acceptable for the tests to rewrite the `Makefile` of the same profiler targets, as the content will also be the same.

Additionally, a few retries (with a delay) are made to check if the profiler binary is executable in the `gemm_universal` front-end. This is to handle the cases where the same binary is being compiled in parallel by more than one unit test, so that by the time one tries to check executability, the other is in process of writing the compiled result.

Reviewed By: kadeng

Differential Revision: D44788627

fbshipit-source-id: 3080fadb7d3114615a49b214bb4bb65abca15ef7
---
 python/aitemplate/backend/build_cache_base.py | 10 +++++--
 python/aitemplate/backend/builder.py          | 20 +++++++++-----
 .../ops/gemm_universal/gemm_common.py         | 26 +++++++++++++++++--
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index 3c8c56888..03bef207f 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -54,6 +54,8 @@
     "makefile"
 }
 
+source_filename_prefixes = ["makefile"]
+
 # File extensions of files to be considered cache artifacts ( unless they are considered source files )
 cache_extensions = {"obj", "so", "dll", "exe", ""}
 
@@ -91,7 +93,11 @@ def is_source(filename: str) -> bool:
         bool: Whether the filename is a source file
     """
     file_basename, file_ext = filename_norm_split(filename)
-    return (file_basename in source_filenames) or (file_ext in source_extensions)
+    return (
+        (file_basename in source_filenames)
+        or (file_ext in source_extensions)
+        or any(file_basename.startswith(p) for p in source_filename_prefixes)
+    )
 
 
 def is_cache_artifact(filename: str) -> bool:
@@ -181,7 +187,7 @@ def create_dir_hash(
                 continue
             hash_object.update(str(fpath).encode("utf-8"))
             fullpath = str(basepath / fpath)
-            if fpath.name.lower() == "makefile":
+            if fpath.name.lower().startswith("makefile"):
                 makefile_content = (basepath / fpath).read_bytes()
                 makefile_content = makefile_normalizer(makefile_content)
                 hash_object.update(makefile_content)
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index e4c7cf8c8..15a5e4b1a 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -750,7 +750,7 @@ def _gen_makefile_for_profilers(self, file_pairs, profiler_dir):
 
         commands = []
         num_compiled_sources = 0
-        num_linked_executables = 0
+        target_names = set()
         for target, srcs in dependencies.items():
             # for each "target: srcs" pair,
             # generate two lines for the Makefile
@@ -769,28 +769,35 @@ def _gen_makefile_for_profilers(self, file_pairs, profiler_dir):
             command = f"{dep_line}\n\t{cmd_line}\n"
             commands.append(command)
 
-            # increment compilation statistics
+            # update compilation statistics
             num_compiled_sources += sum(1 for s in srcs if s.endswith(".cu"))
-            num_linked_executables += 0 if target.endswith(".obj") else 1
+            if not target.endswith(".obj"):
+                target_names.add(os.path.split(target)[-1])
 
         _LOGGER.info(f"compiling {num_compiled_sources} profiler sources")
-        _LOGGER.info(f"linking {num_linked_executables} profiler executables")
+        _LOGGER.info(f"linking {len(target_names)} profiler executables")
 
         makefile_str = makefile_template.render(
             targets=" ".join(set(targets)),
             commands="\n".join(commands),
         )
 
-        dumpfile = os.path.join(profiler_dir, "Makefile")
+        # make the Makefile name dependent on the built target names
+        target_names_str = "_".join(sorted(target_names))  # stable order
+        makefile_suffix = sha1(target_names_str.encode("utf-8")).hexdigest()
+        makefile_name = f"Makefile_{makefile_suffix}"
+        dumpfile = os.path.join(profiler_dir, makefile_name)
         with open(dumpfile, "w+") as f:
             f.write(makefile_str)
 
+        return makefile_name
+
     def make_profilers(self, generated_profilers, workdir):
         file_pairs = [f for gp in generated_profilers for f in gp]
         if not file_pairs:
             return
         build_dir = shlex.quote(os.path.join(workdir, "profiler"))
-        self._gen_makefile_for_profilers(file_pairs, build_dir)
+        makefile_name = self._gen_makefile_for_profilers(file_pairs, build_dir)
         # Write compiler version string(s) into build directory, so these can be used as part of cache key
         self._gen_compiler_version_files(build_dir)
 
@@ -801,6 +808,7 @@ def make_profilers(self, generated_profilers, workdir):
         make_path = shlex.quote(Target.current().make())
         make_flags = " ".join(
             [
+                f"-f {makefile_name}",
                 "--output-sync",
                 f"-C {build_dir}",
             ]
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index a285ee431..4f3858254 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -25,7 +25,8 @@
 from enum import Enum
 from hashlib import sha1
 from operator import itemgetter
-from typing import Any, Dict, List, Union
+from time import sleep
+from typing import Any, Callable, Dict, List, Union
 
 import jinja2
 
@@ -179,6 +180,22 @@ def _to_list(elem):
         return [elem]
 
 
+def _check_with_retries(
+    condition: Callable[[], bool],
+    max_attempts: int = 3,
+    delay_seconds: int = 5,
+) -> bool:
+    """Check a condition with retries."""
+    attempts = 0
+    while True:
+        if condition():
+            return True
+        attempts += 1
+        if attempts >= max_attempts:
+            return False
+        sleep(delay_seconds)
+
+
 class gemm(Operator):
     """Base gemm operators"""
 
@@ -526,8 +543,13 @@ def _gen_profile_cmd(
         self, profiler_prefix, profiler_filename, exec_key, fbuild_cmd
     ):
         exe_path = os.path.join(profiler_prefix, profiler_filename)
-        if not os.access(exe_path, os.X_OK):
+        if not _check_with_retries(
+            condition=lambda: os.access(exe_path, os.X_OK),
+            max_attempts=3,
+            delay_seconds=5,
+        ):
             raise RuntimeError("Profiler %s is not executable" % exe_path)
+
         cmd_args = fbuild_cmd(exec_key)
         cmd = [exe_path]
         # mnk

From 85307943ec85627417f025ec2216fb2b7e530785 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 9 Apr 2023 05:43:25 -0700
Subject: [PATCH 398/638] Increase tolerance of test_make_jagged_dedup (#554)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/554

`test_make_jagged_dedup` fails in some CircleCI jobs (mostly `main`), due to a minor discrepancy:

```
AssertionError: Tensor-likes are not close!

Mismatched elements: 1 / 224 (0.4%)
Greatest absolute difference: 0.01708984375 at index (0, 16) (up to 0.01 allowed)
Greatest relative difference: 0.03127792672028597 at index (0, 16) (up to 0.01 allowed)
```

The error, probably, accumulates due to the two gemm ops being applied back-to-back in the test. Here we increase the tolerance to `5e-2` to avoid the test failure in CircleCI.

Reviewed By: alexanderguzhva

Differential Revision: D44815979

fbshipit-source-id: 02b73c45487cc5a300e04e4f131a7664bcccb6a4
---
 tests/unittest/ops/test_make_jagged.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unittest/ops/test_make_jagged.py b/tests/unittest/ops/test_make_jagged.py
index 7ba2ffc53..4c91d3d5d 100644
--- a/tests/unittest/ops/test_make_jagged.py
+++ b/tests/unittest/ops/test_make_jagged.py
@@ -491,7 +491,7 @@ def test_make_jagged_dedup(
         result = torch.empty_like(result_pt)
         model.run_with_tensors(inputs, [result])
 
-        torch.testing.assert_close(result, result_pt, rtol=1e-2, atol=1e-2)
+        torch.testing.assert_close(result, result_pt, rtol=5e-2, atol=5e-2)
 
 
 if __name__ == "__main__":

From eec6882187c0d4a036b761c30d62f4a47e35302d Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 9 Apr 2023 08:49:38 -0700
Subject: [PATCH 399/638] Add profiler execution retries (#553)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/553

When multiple unit tests are running in parallel, a few can be building the same profiler binary (e.g., for the same op configuration from both tests). In such cases, it may happen that, by the time one test attempts to execute the build profiler binary, another test is in the middle of writing the compilation result. This triggers an error, which before this diff has caused a failure of the async task running profiler commands and eventual profiler timeout.

The diff adds retries to profiler execution, hence remediating the problem described above.

Reviewed By: alexanderguzhva

Differential Revision: D44815907

fbshipit-source-id: c9082e8bc9c59ad1f629373e156ba4661cc89795
---
 python/aitemplate/backend/profiler_runner.py | 35 +++++++++++++++-----
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 9b9d38095..5381b5794 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -25,6 +25,7 @@
 import subprocess
 from collections import namedtuple
 from queue import Queue
+from time import sleep
 from typing import Callable, List, Tuple, Union
 
 from aitemplate.backend.target import Target
@@ -44,6 +45,9 @@
 RUNTIME_PATTERN = re.compile(r"TIME:([\d\.]+)")
 WORKSPACE_PATTERN = re.compile(r"WS:([\d]+)")
 
+PROFILER_RUN_MAX_ATTEMPTS = 3
+PROFILER_RUN_RETRY_DELAY_SECONDS = 5
+
 ProfileResult = namedtuple("ProfileResult", "op_config duration workspace")
 """Object to store profiling result
 """
@@ -216,14 +220,29 @@ def run_task(cmds, queue, dev_select_flag):
     device = queue.get()
     _LOGGER.debug(f"running profiler {cmds=} on GPU #{device}")
 
-    completed_process = subprocess.run(
-        cmds,
-        env=update_inplace(os.environ.copy(), {dev_select_flag: device}),
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
-        shell=False,
-    )
+    attempts = 0
+    while True:
+        try:
+            completed_process = subprocess.run(
+                cmds,
+                env=update_inplace(os.environ.copy(), {dev_select_flag: device}),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                shell=False,
+            )
+            break
+        except Exception as ex:
+            attempts += 1
+            if attempts >= PROFILER_RUN_MAX_ATTEMPTS:
+                raise
+            _LOGGER.debug(
+                f"[{attempts} / {PROFILER_RUN_MAX_ATTEMPTS}] "
+                f"Failed to run profiler {cmds=} due to exception: {ex}. "
+                f"Will retry in {PROFILER_RUN_RETRY_DELAY_SECONDS} seconds."
+            )
+            sleep(PROFILER_RUN_RETRY_DELAY_SECONDS)
+
     queue.put(device)
     return completed_process.stdout, completed_process.stderr
 

From edc89f20ad3f5c2e3c7be3e45d198faea5a87852 Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Mon, 10 Apr 2023 09:18:28 -0700
Subject: [PATCH 400/638] Skip fuse parallel gemm for output tensors (#550)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/550

Currently the fuse parallel gemm pass doesn't check if tensors being fused and eliminated are output tensors. This results in errors like
```
"ValueError: Output output188 was not found in the graph after optimizations."
```
during AIT compilation.

This diff adds the check in to make sure these aren't removed from the optimized graph.

Reviewed By: frank-wei, houseroad

Differential Revision: D44806086

fbshipit-source-id: a1e1f286c5377afe8464aba1cb0c5d7f83de9984
---
 .../compiler/transform/fuse_parallel_gemms.py |   4 +
 .../compiler/test_parallel_gemm_fusions.py    | 110 ++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
index 9d2525e18..baf298e2f 100644
--- a/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
+++ b/python/aitemplate/compiler/transform/fuse_parallel_gemms.py
@@ -58,6 +58,10 @@ def _is_valid_gemm_op(tensor: Tensor, f_check_src_op: Callable) -> bool:
     if len(tensor.dst_ops()) != 1 or len(tensor.src_ops()) != 1:
         return False
 
+    # Don't fuse if tensor is an output tensor
+    if tensor._attrs["is_output"]:
+        return False
+
     gemm_op = list(tensor.src_ops())[0]
     if gemm_op._attrs["op"] != "gemm_rcr_bias":
         return False
diff --git a/tests/unittest/compiler/test_parallel_gemm_fusions.py b/tests/unittest/compiler/test_parallel_gemm_fusions.py
index f5498b482..929d06150 100644
--- a/tests/unittest/compiler/test_parallel_gemm_fusions.py
+++ b/tests/unittest/compiler/test_parallel_gemm_fusions.py
@@ -621,6 +621,116 @@ def test_multi_parallel_gemm_cat_groups_fp32_sm80(self):
             dtype="float32",
         )
 
+    def _skip_fuse_parallel_gemm_output_cat(
+        self,
+        b: int,
+        ms: Sequence[int],
+        n: int,
+        k: int,
+        perm102_bmm_op: str,
+        dtype: str = "float16",
+    ):
+        _LOGGER.info(f"_skip_fuse_parallel_gemm_cat, b: {b}, ms: {ms}, n: {n}, k: {k}")
+        X = Tensor(
+            shape=[IntVar(ms, "input_batch"), IntImm(b * k)],
+            dtype=dtype,
+            name="X",
+            is_input=True,
+        )
+        Ws = []
+        Bs = []
+        for i in range(b):
+            W = Tensor(
+                shape=[IntImm(n), IntImm(k)],
+                dtype=dtype,
+                name=f"W{i}",
+            )
+
+            Ws.append(W)
+            B = Tensor(
+                shape=[IntImm(n)],
+                dtype=dtype,
+                name=f"B{i}",
+            )
+            Bs.append(B)
+
+        X1 = ops.split()(X, k, dim=-1)
+        cat_inputs = []
+        for i in range(b):
+            X2 = X1[i]
+            X3 = ops.gemm_rcr_bias()(X2, Ws[i], Bs[i])
+            cat_inputs.append(X3)
+            X3._attrs["name"] = f"output{i+1}"
+            X3._attrs["is_output"] = True
+
+        cat_output = ops.concatenate()(cat_inputs, dim=-1)
+
+        cat_output._attrs["name"] = "output0"
+        cat_output._attrs["is_output"] = True
+
+        constants = {}
+        for i in range(b):
+            constants[f"W{i}"] = get_random_torch_tensor([n, k], dtype)
+            constants[f"B{i}"] = get_random_torch_tensor([n], dtype)
+
+        # Gen module.
+        target = detect_target()
+        with compile_model(
+            [cat_output, *cat_inputs],
+            target,
+            "./tmp",
+            f"fuse_parallel_gemm_cat_{dtype}",
+            dll_name=f"test_{self._test_id}.so",
+            constants=constants,
+        ) as module:
+            self._test_id += 1
+            # Verify the generated graph.
+            sorted_graph = module.debug_sorted_graph
+            sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+            assert not has_op(
+                sorted_ops, perm102_bmm_op
+            ), f"the final graph has op {perm102_bmm_op}"
+            assert has_op(
+                sorted_ops, "gemm_rcr_bias"
+            ), "the final graph does not have op gemm_rcr_bias"
+
+            for m in ms:
+                x_pt = get_random_torch_tensor([m, b * k], dtype)
+                x1_pt = torch.split(x_pt, k, dim=-1)
+
+                cat_inputs_pt = []
+                for i in range(b):
+                    x2_pt = x1_pt[i]
+                    x3_pt = torch.nn.functional.linear(
+                        x2_pt, constants[f"W{i}"], constants[f"B{i}"]
+                    )
+                    cat_inputs_pt.append(x3_pt)
+                cat_output_pt = (torch.cat(cat_inputs_pt, dim=-1), *cat_inputs_pt)
+
+                # Run AITemplate module.
+
+                cat_out = get_torch_empty_tensor([m, b * n], dtype)
+                out_other = [
+                    get_torch_empty_tensor(x.shape, dtype) for x in cat_inputs_pt
+                ]
+                out = [cat_out, *out_other]
+                module.run_with_tensors([x_pt], out)
+
+                # Do comparisons.
+                for (out_ait, out_pt) in zip(out, cat_output_pt):
+                    self.assertTrue(
+                        torch.allclose(out_ait, out_pt, atol=5e-2, rtol=5e-2)
+                    )
+
+    def test_skip_parallel_gemm_cat_groups(self):
+        self._skip_fuse_parallel_gemm_output_cat(
+            b=4,
+            ms=[256, 512],
+            n=128,
+            k=64,
+            perm102_bmm_op="perm102_bmm_rrr_bias",
+        )
+
 
 filter_test_cases_by_test_env(ParallelGemmCatFusionTestCase)
 

From 57d182af4d51e90eb6e6132ba13a7d16b7de4a34 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 10 Apr 2023 10:27:03 -0700
Subject: [PATCH 401/638] Fix memory pool using in GEMM profiler (#556)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/556

There is a bug in the current GEMM profiler's way of using the memory pool: the tensors are requested only once for the entire GEMM kernel's profiling loop. The fact that the same tensors / memory regions / pointers are used in all iterations of the kernel's profiling loop render the memory pool virtually useless. The risk is that small inputs may stick in the GPU's L2 cache, leading to unreliable profiling results.

In this diff we fix the bug by modifying the GEMM back-end profiler templates in a way that the `memory_pool->RequestTensorByIdx(...)` calls are made *within* the profiling loop, hence rotating the inputs for every call and eschewing L2 caching. Experiments with simple GEMM on small problem sizes (e.g., `M=1024, N=512, K=256`) have shown that, after the fix, the runtimes measured in profiling can grow up to 30% for some of the kernels. The selected best kernel can also change as a result.

Reviewed By: tenpercent

Differential Revision: D44816867

fbshipit-source-id: 27259671614422cbe3072d578842b5bc617dc830
---
 .../bmm_common_softmax.py                     | 256 ------------------
 .../gemm_epilogue_vistor/common_dual_gemm.py  |  13 +-
 .../gemm_epilogue_vistor/common_softmax.py    |  31 +--
 .../backend/cuda/gemm_universal/bmm_common.py |  17 +-
 .../cuda/gemm_universal/bmm_permute_common.py |  15 +-
 .../backend/cuda/gemm_universal/common.py     |  43 +--
 .../gemm_universal/common_bias_broadcast.py   |  21 +-
 .../cuda/gemm_universal/common_permute.py     |  13 +-
 tests/unittest/ops/test_dual_bmm.py           |  24 +-
 tests/unittest/ops/test_dual_gemm.py          |  12 +-
 tests/unittest/ops/test_gemm.py               |  19 +-
 .../unittest/ops/test_gemm_bias_broadcast.py  | 162 ++++++++---
 12 files changed, 199 insertions(+), 427 deletions(-)
 delete mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
deleted file mode 100644
index b770f9556..000000000
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Common functions and templates for bmm-family ops
-"""
-import jinja2
-
-from aitemplate.backend.common import gemm_common
-
-from aitemplate.backend.cuda.gemm_epilogue_vistor import common_softmax
-from aitemplate.backend.cuda.gemm_universal import common
-
-# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
-
-FUNC_DECL_TEMPLATE = jinja2.Template(
-    """
-void {{func_name}}(
-  cutlass::half_t*,
-  cutlass::half_t*,
-{% if has_bias %}
-  cutlass::half_t*,
-{% endif %}
-  cutlass::half_t*,
-  cutlass::half_t*,
-  float*,
-  cutlass::half_t*,
-  uint8_t*,
-{% if support_split_k %}
-  int,
-{% endif %}
-{% for idx in range(ndims) %}
-  int64_t*,
-{% endfor %}
-{% for idx in range(ndims) %}
-  int64_t*,
-{% endfor %}
-{% for idx in range(ndims) %}
-  int64_t*,
-{% endfor %}
-  cudaStream_t
-);
-"""
-)
-
-
-FUNC_CALL_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{func_name}}(
-{{indent}}    {{a_ptr}},
-{{indent}}    {{b_ptr}},
-{% if has_bias %}
-{{indent}}    {{bias_ptr}},
-{% endif %}
-{{indent}}    {{c_ptr}},
-{{indent}}    {{d_ptr}},
-{{indent}}    {{n_ptr}},
-{{indent}}    {{soft_ptr}},
-{{indent}}    global_workspace_,
-{{indent}}    {{a_dim0_ptr}},
-{{indent}}    {{a_dim1_ptr}},
-{{indent}}    {{a_dim2_ptr}},
-{{indent}}    {{b_dim0_ptr}},
-{{indent}}    {{b_dim1_ptr}},
-{{indent}}    {{b_dim2_ptr}},
-{{indent}}    {{c_dim0_ptr}},
-{{indent}}    {{c_dim1_ptr}},
-{{indent}}    {{c_dim2_ptr}},
-{{indent}}    stream
-{{indent}});
-"""
-)
-
-TENSOR_DECL_TEMPLATE = jinja2.Template(
-    """
-  // cast to int64_t to avoid overflow
-  int64_t a_ptr_sz = static_cast<int64_t>(a_dim0) * static_cast<int64_t>(a_dim1) * static_cast<int64_t>(a_dim2);
-  int64_t b_ptr_sz = static_cast<int64_t>(b_dim0) * static_cast<int64_t>(b_dim1) * static_cast<int64_t>(b_dim2);
-  int64_t c_ptr_sz = static_cast<int64_t>(c_dim0) * static_cast<int64_t>(c_dim1) * static_cast<int64_t>(c_dim2);
-  int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
-  // TODO: special pool size for A100 L2 cache 40M
-  // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
-
-
-  memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
-  memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // c_ptr: index 2
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // d_ptr: index 3
-  memory_pool->AllocateFloatTensor(c_dim0 * c_dim1,  mem_pool_sz);  // n_ptr: index 4
-  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // soft_ptr: index 5
-"""
-)
-
-
-def gen_profiler(
-    func_attrs,
-    workdir,
-    dim_info_dict,
-    src_template,
-    problem_args_template,
-    args_parser_template,
-    emit_kernel=False,
-    bias_ptr_arg=None,
-):
-    """Generate code for profiling"""
-    op_type = func_attrs["op"]
-    op_instance = func_attrs["op_instance"]
-    has_d = False
-    if "has_d" in func_attrs:
-        has_d = func_attrs["has_d"]
-    shape_func = gemm_common.gen_shape_eval_code(
-        indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
-    )
-
-    file_pairs = []
-    has_bias = bias_ptr_arg is not None
-    assert not (has_d and has_bias)
-    for op_name, op in op_instance.items():
-        config = common_softmax.emit_instance(op, emit_kernel=emit_kernel)
-        config_name = common.extract_config_name(config)
-        name = "GemmInstance"
-        instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=name, config=config
-        )
-        exec_program = common_softmax.EXEC_TEMPLATE.render(
-            indent="  ",
-            instance=name,
-            is_profiler=True,
-            problem_args=problem_args_template.render(),
-        )
-        op_func = src_template.render(
-            custom_libs=common_softmax.gen_custom_libs(),
-            instances=instance,
-            function_name="bmm",
-            input_ndims=3,
-            weight_ndims=3,
-            shape_eval=shape_func,
-            exec_paths=exec_program,
-            has_d=has_d,
-        )
-        func_call = FUNC_CALL_TEMPLATE.render(
-            func_name="bmm",
-            a_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(0)",
-            b_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(2)",
-            d_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(3)",
-            n_ptr="memory_pool->RequestTensorByIdx<float>(4)",
-            soft_ptr="memory_pool->RequestTensorByIdx<cutlass::half_t>(5)",
-            has_d=has_d,
-            a_dim0_ptr="&a_dim0",
-            a_dim1_ptr="&a_dim1",
-            a_dim2_ptr="&a_dim2",
-            b_dim0_ptr="&b_dim0",
-            b_dim1_ptr="&b_dim1",
-            b_dim2_ptr="&b_dim2",
-            c_dim0_ptr="&c_dim0",
-            c_dim1_ptr="&c_dim1",
-            c_dim2_ptr="&c_dim2",
-        )
-        code = common_softmax.PROFILER_TEMPLATE.render(
-            op_func=op_func,
-            args_parse=args_parser_template.render(),
-            func_call=func_call,
-            name=name,
-            tensor_decl=TENSOR_DECL_TEMPLATE.render(
-                name=name, has_d=has_d, has_bias=has_bias
-            ),
-        )
-        common.add_profiler(file_pairs, workdir, op_type, op_name, code)
-    # build
-    return common.build_profiler(file_pairs)
-
-
-def gen_function_decl(func_attrs):
-    """Rendering argument to function declaration template"""
-    func_name = func_attrs["name"]
-    has_d = False
-    if "has_d" in func_attrs:
-        has_d = func_attrs["has_d"]
-    return FUNC_DECL_TEMPLATE.render(func_name=func_name, ndims=3, has_d=has_d)
-
-
-def gen_function(
-    func_attrs,
-    exec_cond_template,
-    dim_info_dict,
-    problem_args,
-):
-    """Generate the code for main function"""
-    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
-    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
-    return common_softmax.gen_function(
-        func_attrs,
-        common_softmax.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims=input_ndims,
-        weight_ndims=weight_ndims,
-        dim_info_dict=dim_info_dict,
-        emit_kernel=True,
-    )
-
-
-def gen_function_call(func_attrs, indent="  ", bias_ptr_arg=None):
-    """Rendering the code to function call template"""
-
-    a = func_attrs["inputs"][0]
-    ashape = func_attrs["input_accessors"][0].original_shapes
-    b = func_attrs["inputs"][1]
-    bshape = func_attrs["input_accessors"][1].original_shapes
-
-    c = func_attrs["inputs"][2]
-    d = func_attrs["inputs"][3]
-    n = func_attrs["inputs"][4]
-
-    soft = func_attrs["outputs"][0]
-    cshape = func_attrs["output_accessors"][0].original_shapes
-    has_d = False
-    has_bias = bias_ptr_arg is not None
-    assert not (has_d and has_bias)
-    return FUNC_CALL_TEMPLATE.render(
-        func_name=func_attrs["name"],
-        a_ptr=a._attrs["name"],
-        b_ptr=b._attrs["name"],
-        has_bias=has_bias,
-        bias_ptr=bias_ptr_arg,
-        c_ptr=c._attrs["name"],
-        d_ptr=d._attrs["name"],
-        n_ptr=n._attrs["name"],
-        soft_ptr=soft._attrs["name"],
-        has_d=has_d,
-        a_dim0_ptr="&" + ashape[0]._attrs["name"],
-        a_dim1_ptr="&" + ashape[1]._attrs["name"],
-        a_dim2_ptr="&" + ashape[2]._attrs["name"],
-        b_dim0_ptr="&" + bshape[0]._attrs["name"],
-        b_dim1_ptr="&" + bshape[1]._attrs["name"],
-        b_dim2_ptr="&" + bshape[2]._attrs["name"],
-        c_dim0_ptr="&" + cshape[0]._attrs["name"],
-        c_dim1_ptr="&" + cshape[1]._attrs["name"],
-        c_dim2_ptr="&" + cshape[2]._attrs["name"],
-        indent=indent,
-    )
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index c5d128967..9bac19ac6 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -434,11 +434,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
             support_split_k=support_split_k,
             split_k="split_k",
             adims=adims,
@@ -467,11 +462,11 @@ def gen_profiler(
     func_call = common.FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=function_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
-        bias_ptr="bias_ptr",
-        c_ptr="c_ptr",
+        bias_ptr=bias_ptr_arg,
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
         split_k="split_k",
         adims=benchmark_adims,
         bdims=benchmark_bdims,
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index 15c8c0642..11ab6cfb9 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -202,12 +202,7 @@
 {{indent}}ret = {{func_name}}(
 {{indent}}    {{gemm_op}},
 {{indent}}    gemm_op_name,
-{{indent}}    {{a_ptr}},
-{{indent}}    {{b_ptr}},
-{% if has_bias %}
-{{indent}}    {{bias_ptr}},
-{% endif %}
-{{indent}}    {{soft_ptr}},
+{{indent}}    memory_pool.get(),
 {{indent}}    global_workspace_,
 {% for dim in adims %}
 {{indent}}    {{dim}},
@@ -271,18 +266,17 @@
 size_t GLOBAL_WORKSPACE_SIZE = 0;
 
 #include <sstream>
+
 {{op_func}}
 
+template <typename DType>
+struct ProfilerMemoryPool;
+
 template <typename GemmInstance>
 int benchmark_{{func_name}} (
     GemmInstance &gemm_op,
     const char *gemm_op_name,
-    void* a_ptr,
-    void* b_ptr,
-{% if has_bias %}
-    void* bias_ptr,
-{% endif %}
-    void* soft_ptr,
+    ProfilerMemoryPool<{{elem_type}}>* memory_pool,
     uint8_t* global_workspace_,
 {% for idx in range(input_ndims) %}
     int64_t* a_dim{{idx}},
@@ -631,11 +625,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{func_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            soft_ptr="memory_pool->RequestTensorByIdx(2)",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
@@ -663,11 +652,11 @@ def gen_profiler(
     func_call = FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=func_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
-        bias_ptr="bias_ptr",
-        soft_ptr="soft_ptr",
+        bias_ptr=bias_ptr_arg,
+        soft_ptr="memory_pool->RequestTensorByIdx(2)",
         adims=benchmark_adims,
         bdims=benchmark_bdims,
         cdims=benchmark_cdims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 96b826032..8eb5a24b9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -433,13 +433,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
-            d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
-            has_d=has_d,
             adims=a_dims_ptr,
             bdims=b_dims_ptr,
             cdims=c_dims_ptr,
@@ -464,12 +457,12 @@ def gen_profiler(
     func_call = FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=function_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
-        bias_ptr="bias_ptr",
-        c_ptr="c_ptr",
-        d_ptr="d_ptr",
+        bias_ptr=bias_ptr_arg,
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
+        d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
         has_d=has_d,
         a_dims_ptr=benchmark_adims,
         b_dims_ptr=benchmark_bdims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
index 568da302d..7fdfa98b4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py
@@ -98,13 +98,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
-            d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
-            has_d=has_d,
             adims=a_dims_ptr,
             bdims=b_dims_ptr,
             cdims=c_dims_ptr,
@@ -130,12 +123,12 @@ def gen_profiler(
     func_call = bmm_common.FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=function_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
         bias_ptr=bias_ptr_arg,
-        c_ptr="c_ptr",
-        d_ptr="d_ptr",
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
+        d_ptr="memory_pool->RequestTensorByIdx(%d)" % (4 if has_bias else 3),
         has_d=has_d,
         a_dims_ptr=benchmark_adims,
         b_dims_ptr=benchmark_bdims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index a20c54bda..efd3b6b4d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -308,18 +308,7 @@
 {{indent}}ret = {{func_name}}(
 {{indent}}    {{gemm_op}},
 {{indent}}    gemm_op_name,
-{{indent}}    {{a_ptr}},
-{{indent}}    {{b_ptr}},
-{% if has_bias %}
-{{indent}}    {{bias_ptr}},
-{% endif %}
-{% if has_d %}
-{{indent}}    {{d_ptr}},
-{% endif %}
-{% if has_d1 %}
-{{indent}}    {{d1_ptr}},
-{% endif %}
-{{indent}}    {{c_ptr}},
+{{indent}}    memory_pool.get(),
 {{indent}}    global_workspace_,
 {% if support_split_k %}
 {{indent}}    {{split_k}},
@@ -376,8 +365,12 @@
 size_t GLOBAL_WORKSPACE_SIZE = 0;
 
 #include <sstream>
+
 {{op_func}}
 
+template <typename DType>
+struct ProfilerMemoryPool;
+
 template <typename GemmInstance>
 int benchmark_{{function_name}} (
 {% if is_group_gemm %}
@@ -407,18 +400,7 @@
 
     GemmInstance &gemm_op,
     const char *gemm_op_name,
-    void* a_ptr,
-    void* b_ptr,
-{% if has_bias %}
-    void* bias_ptr,
-{% endif %}
-{% if has_d %}
-    void* d_ptr,
-{% endif %}
-{% if has_d1 %}
-    void* d1_ptr,
-{% endif %}
-    void* c_ptr,
+    ProfilerMemoryPool<{{elem_type}}>* memory_pool,
     uint8_t* global_workspace_,
 {% if support_split_k %}
     int split_k,
@@ -998,11 +980,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
             support_split_k=support_split_k,
             split_k="split_k",
             adims=adims,
@@ -1037,11 +1014,11 @@ def gen_profiler(
     func_call = FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=function_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
-        bias_ptr="bias_ptr",
-        c_ptr="c_ptr",
+        bias_ptr=bias_ptr_arg,
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
         split_k="split_k",
         adims=benchmark_adims,
         bdims=benchmark_bdims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 51a29bc2a..bc702add6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -485,20 +485,11 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
-            d_ptr="memory_pool->RequestTensorByIdx(4)",
-            d1_ptr="memory_pool->RequestTensorByIdx(5)",
-            bias_ptr="memory_pool->RequestTensorByIdx(3)",
             adims=adims,
             bdims=bdims,
             cdims=cdims,
             support_split_k=support_split_k,
             split_k="split_k",
-            has_bias=True,
-            has_d=True,
-            has_d1=has_d1,
         )
         instances.append(instance)
         benchmark_instances.append(benchmark_instance)
@@ -525,12 +516,12 @@ def gen_profiler(
     func_call = FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name="gemm",
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
-        c_ptr="c_ptr",
-        d0_ptr="d_ptr",
-        d1_ptr="d1_ptr",
-        bias_ptr="bias_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
+        d0_ptr="memory_pool->RequestTensorByIdx(4)",
+        d1_ptr="memory_pool->RequestTensorByIdx(5)",
+        bias_ptr="memory_pool->RequestTensorByIdx(3)",
         adims=benchmark_adims,
         bdims=benchmark_bdims,
         cdims=benchmark_cdims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
index 7625580fa..4b767535e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py
@@ -279,11 +279,6 @@ def gen_profiler(
             gemm_op=gemm_op,
             gemm_op_name=op_name,
             func_name=f"benchmark_{function_name}",
-            a_ptr="memory_pool->RequestTensorByIdx(0)",
-            b_ptr="memory_pool->RequestTensorByIdx(1)",
-            has_bias=has_bias,
-            bias_ptr=bias_ptr_arg,
-            c_ptr="memory_pool->RequestTensorByIdx(2)",
             support_split_k=support_split_k,
             split_k="split_k",
             adims=adims,
@@ -312,11 +307,11 @@ def gen_profiler(
     func_call = common.FUNC_CALL_TEMPLATE.render(
         is_profiler=True,
         func_name=function_name,
-        a_ptr="a_ptr",
-        b_ptr="b_ptr",
+        a_ptr="memory_pool->RequestTensorByIdx(0)",
+        b_ptr="memory_pool->RequestTensorByIdx(1)",
         has_bias=has_bias,
-        bias_ptr="bias_ptr",
-        c_ptr="c_ptr",
+        bias_ptr=bias_ptr_arg,
+        c_ptr="memory_pool->RequestTensorByIdx(2)",
         split_k="split_k",
         adims=benchmark_adims,
         bdims=benchmark_bdims,
diff --git a/tests/unittest/ops/test_dual_bmm.py b/tests/unittest/ops/test_dual_bmm.py
index 90292d315..6772adc8e 100644
--- a/tests/unittest/ops/test_dual_bmm.py
+++ b/tests/unittest/ops/test_dual_bmm.py
@@ -114,7 +114,7 @@ def test_dual_bmm_rrr_div_fp16(self):
             N=64,
             K=128,
             broadcast_b1=False,
-            test_name="dual_bmm_rrr_div_fp16",
+            test_name="dual_bmm_rrr_div_fp16_1",
             dtype="float16",
         )
         self._test_dual_bmm_rrr_div(
@@ -123,7 +123,7 @@ def test_dual_bmm_rrr_div_fp16(self):
             N=512,
             K=512,
             broadcast_b1=False,
-            test_name="dual_bmm_rrr_div_fp16",
+            test_name="dual_bmm_rrr_div_fp16_2",
             dtype="float16",
         )
         self._test_dual_bmm_rrr_div(
@@ -132,7 +132,7 @@ def test_dual_bmm_rrr_div_fp16(self):
             N=1024,
             K=2048,
             broadcast_b1=False,
-            test_name="dual_bmm_rrr_div_fp16",
+            test_name="dual_bmm_rrr_div_fp16_3",
             dtype="float16",
         )
 
@@ -143,7 +143,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp16(self):
             N=64,
             K=128,
             broadcast_b1=True,
-            test_name="dual_bmm_rrr_div_fp16",
+            test_name="dual_bmm_rrr_div_broadcast_b1_fp16_1",
             dtype="float16",
         )
         # self._test_dual_bmm_rrr_div(
@@ -152,7 +152,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp16(self):
         #     N=512,
         #     K=512,
         #     broadcast_b1=True,
-        #     test_name="dual_bmm_rrr_div_fp16",
+        #     test_name="dual_bmm_rrr_div_broadcast_b1_fp16_2",
         #     dtype="float16",
         # )
         # self._test_dual_bmm_rrr_div(
@@ -161,7 +161,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp16(self):
         #     N=1024,
         #     K=2048,
         #     broadcast_b1=True,
-        #     test_name="dual_bmm_rrr_div_fp16",
+        #     test_name="dual_bmm_rrr_div_broadcast_b1_fp16_3",
         #     dtype="float16",
         # )
 
@@ -173,7 +173,7 @@ def test_dual_bmm_rrr_div_fp32(self):
             N=64,
             K=128,
             broadcast_b1=False,
-            test_name="dual_bmm_rrr_div_fp32",
+            test_name="dual_bmm_rrr_div_fp32_1",
             dtype="float32",
         )
         # self._test_dual_bmm_rrr_div(
@@ -182,7 +182,7 @@ def test_dual_bmm_rrr_div_fp32(self):
         #     N=512,
         #     K=512,
         #     broadcast_b1=False,
-        #     test_name="dual_bmm_rrr_div_fp32",
+        #     test_name="dual_bmm_rrr_div_fp32_2",
         #     dtype="float32",
         # )
         # self._test_dual_bmm_rrr_div(
@@ -191,7 +191,7 @@ def test_dual_bmm_rrr_div_fp32(self):
         #     N=1024,
         #     K=2048,
         #     broadcast_b1=False,
-        #     test_name="dual_bmm_rrr_div_fp32",
+        #     test_name="dual_bmm_rrr_div_fp32_3",
         #     dtype="float32",
         # )
 
@@ -203,7 +203,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp32(self):
             N=64,
             K=128,
             broadcast_b1=True,
-            test_name="dual_bmm_rrr_div_fp32",
+            test_name="dual_bmm_rrr_div_broadcast_b1_fp32_1",
             dtype="float32",
         )
         # self._test_dual_bmm_rrr_div(
@@ -212,7 +212,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp32(self):
         #     N=512,
         #     K=512,
         #     broadcast_b1=True,
-        #     test_name="dual_bmm_rrr_div_fp32",
+        #     test_name="dual_bmm_rrr_div_broadcast_b1_fp32_2",
         #     dtype="float32",
         # )
         # self._test_dual_bmm_rrr_div(
@@ -221,7 +221,7 @@ def test_dual_bmm_rrr_div_broadcast_b1_fp32(self):
         #     N=1024,
         #     K=2048,
         #     broadcast_b1=True,
-        #     test_name="dual_bmm_rrr_div_fp32",
+        #     test_name="dual_bmm_rrr_div_broadcast_b1_fp32_3",
         #     dtype="float32",
         # )
 
diff --git a/tests/unittest/ops/test_dual_gemm.py b/tests/unittest/ops/test_dual_gemm.py
index 2d6ce76d5..569910ed8 100644
--- a/tests/unittest/ops/test_dual_gemm.py
+++ b/tests/unittest/ops/test_dual_gemm.py
@@ -174,7 +174,7 @@ def test_dual_gemm_silu_fp16(self):
             K=256,
             fast_gelu=False,
             broadcast_b1=False,
-            test_name="dual_gemm_silu_fp16",
+            test_name="dual_gemm_silu_fp16_1",
             dtype="float16",
         )
         self._test_dual_gemm(
@@ -183,7 +183,7 @@ def test_dual_gemm_silu_fp16(self):
             K=2048,
             fast_gelu=False,
             broadcast_b1=False,
-            test_name="dual_gemm_silu_fp16",
+            test_name="dual_gemm_silu_fp16_2",
             dtype="float16",
         )
         self._test_dual_gemm(
@@ -192,7 +192,7 @@ def test_dual_gemm_silu_fp16(self):
             K=8192,
             fast_gelu=False,
             broadcast_b1=False,
-            test_name="dual_gemm_silu_fp16",
+            test_name="dual_gemm_silu_fp16_3",
             dtype="float16",
         )
 
@@ -214,7 +214,7 @@ def test_dual_gemm_fast_gelu_fp16(self):
             K=256,
             fast_gelu=True,
             broadcast_b1=False,
-            test_name="dual_gemm_fast_gelu_fp16",
+            test_name="dual_gemm_fast_gelu_fp16_1",
             dtype="float16",
         )
         self._test_dual_gemm(
@@ -223,7 +223,7 @@ def test_dual_gemm_fast_gelu_fp16(self):
             K=2048,
             fast_gelu=True,
             broadcast_b1=False,
-            test_name="dual_gemm_fast_gelu_fp16",
+            test_name="dual_gemm_fast_gelu_fp16_2",
             dtype="float16",
         )
         self._test_dual_gemm(
@@ -232,7 +232,7 @@ def test_dual_gemm_fast_gelu_fp16(self):
             K=8192,
             fast_gelu=True,
             broadcast_b1=False,
-            test_name="dual_gemm_fast_gelu_fp16",
+            test_name="dual_gemm_fast_gelu_fp16_3",
             dtype="float16",
         )
 
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index a3e8376b7..542e9e50e 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -81,7 +81,7 @@ def test_rcr_simple_static(self) -> None:
         self._test_rcr([1024], 256, 512, "static")
 
     def test_rcr_simple_static_rocm(self) -> None:
-        self._test_rcr([1024], 256, 512, "static")
+        self._test_rcr([1024], 256, 512, "static_rocm")
 
     @parameterized.expand(
         [
@@ -144,9 +144,9 @@ def test_rcr_dynamic_n(self):
         )
 
     def test_rcr_dynamic_n_rocm(self):
-        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1")
+        self._test_rcr([16, 1 * 29, 64], 256, 300000, "einsum_1_rocm")
         self._test_rcr_dynamic_n(
-            [16, 1 * 29, 64], 256, [100000, 300000], "einsum_dynamic_n"
+            [16, 1 * 29, 64], 256, [100000, 300000], "einsum_dynamic_n_rocm"
         )
 
     def _test_3d_2d_rcr(self, m0s, m1s, k, n, test_name, dtype="float16"):
@@ -228,7 +228,7 @@ def test_rrr(self):
         self._test_rrr([1, 99, 1024, 2048], 256, 16, "dynamic")
 
     def test_rrr_rocm(self):
-        self._test_rrr([256], 128, 32, "static")
+        self._test_rrr([256], 128, 32, "static_rocm")
 
     def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         target = detect_target()
@@ -249,7 +249,7 @@ def _test_3d_2d_rrr(self, m0s, m1s, k, n, test_name, dtype="float16"):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", f"gemm_rrr_{test_name}_{self._test_id}"
+            Y, target, "./tmp", f"gemm_3d_2d_rrr_{test_name}_{self._test_id}"
         )
         self._test_id += 1
 
@@ -269,7 +269,10 @@ def test_3d_2d_rrr(self):
         self._test_3d_2d_rrr([2], [24, 36], 256, 16, "dynamic2")
         self._test_3d_2d_rrr([2, 34, 48], [1, 3, 5], 256, 16, "dynamic3")
 
-    def _test_h_rcr(self, ait_dtype):
+    def _test_h_rcr(self, ait_dtype, test_name=None):
+        if test_name is None:
+            test_name = ait_dtype
+
         M = 256
         K = 256
         N = 512
@@ -281,7 +284,7 @@ def _test_h_rcr(self, ait_dtype):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", f"hgemm_rcr_{ait_dtype}_{self._test_id}"
+            Y, target, "./tmp", f"hgemm_rcr_{test_name}_{self._test_id}"
         )
         self._test_id += 1
         X_pt = get_random_torch_tensor((M, K), ait_dtype)
@@ -297,7 +300,7 @@ def test_h_rcr_float16(self):
         self._test_h_rcr(ait_dtype="float16")
 
     def test_h_rcr_float16_rocm(self):
-        self._test_h_rcr(ait_dtype="float16")
+        self._test_h_rcr(ait_dtype="float16", test_name="float16_rocm")
 
     def test_h_rcr_float32_sm80(self):
         self._test_h_rcr(ait_dtype="float32")
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index 2e6320094..eade402ae 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -51,7 +51,12 @@ def _init_tensors(self, m, k, n, m0=None, m1=None, dtype="float16"):
         self.D1_pt = get_random_torch_tensor([*m_shape, n], dtype)
 
     def _test_and_verify(
-        self, module, torch_output, dtype, has_d1=False, module_output_name="output_0"
+        self,
+        module,
+        torch_output,
+        dtype,
+        has_d1=False,
+        module_output_name="output_0",
     ):
         inputs = {
             "input_0": self.X_pt,
@@ -68,7 +73,16 @@ def _test_and_verify(
         else:
             torch.testing.assert_close(torch_output, y, atol=1e-1, rtol=1e-1)
 
-    def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
+    def _test_bias_rcr_mul_add(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul_add()
@@ -76,7 +90,10 @@ def _test_bias_rcr_mul_add(self, m, m0, m1, k, n, dtype="float16"):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         module = compile_model(
-            Y, target, "./tmp", f"gemm_rcr_bias_mul_add_k_{k}_n_{n}_{dtype}"
+            Y,
+            target,
+            "./tmp",
+            f"gemm_rcr_bias_mul_add_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
         Y_pt = (
             torch.nn.functional.linear(self.X_pt, self.W_pt, bias=self.B_pt)
@@ -91,9 +108,18 @@ def test_bias_rcr_mul_add(self):
         self._test_bias_rcr_mul_add(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_mul_add_rocm(self):
-        self._test_bias_rcr_mul_add(8, None, None, 8, 8)
-
-    def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_mul_add(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_sigmoid_mul(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_sigmoid_mul()
@@ -104,7 +130,7 @@ def _test_bias_rcr_sigmoid_mul(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_sigmoid_mul_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_sigmoid_mul_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = (
@@ -121,9 +147,18 @@ def test_bias_rcr_sigmoid_mul(self):
         self._test_bias_rcr_sigmoid_mul(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_sigmoid_mul_rocm(self):
-        self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8)
-
-    def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_sigmoid_mul(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_sigmoid_mul_tanh(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_sigmoid_mul_tanh()
@@ -134,7 +169,7 @@ def _test_bias_rcr_sigmoid_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_sigmoid_mul_tanh_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_sigmoid_mul_tanh_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = torch.tanh(
@@ -152,9 +187,20 @@ def test_bias_rcr_sigmoid_mul_tanh(self):
         self._test_bias_rcr_sigmoid_mul_tanh(None, 21, 5, 1024, 0)
 
     def test_bias_rcr_sigmoid_mul_tanh_rocm(self):
-        self._test_bias_rcr_sigmoid_mul_tanh(8, None, None, 8, 8)
+        self._test_bias_rcr_sigmoid_mul_tanh(
+            8, None, None, 8, 8, test_name_suffix="_rocm"
+        )
 
-    def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
+    def _test_bias_rcr_add(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add()
@@ -165,7 +211,7 @@ def _test_bias_rcr_add(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_add_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_add_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = (
@@ -180,9 +226,18 @@ def test_bias_rcr_add(self):
         self._test_bias_rcr_add(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_add_rocm(self):
-        self._test_bias_rcr_add(8, None, None, 8, 8)
-
-    def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_add(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_add_relu(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_relu()
@@ -193,7 +248,7 @@ def _test_bias_rcr_add_relu(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_add_relu_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_add_relu_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = torch.relu(
@@ -208,9 +263,18 @@ def test_bias_rcr_add_relu(self):
         self._test_bias_rcr_add_relu(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_add_relu_rocm(self):
-        self._test_bias_rcr_add_relu(8, None, None, 8, 8)
-
-    def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_add_relu(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_add_add_relu(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_add_relu()
@@ -221,7 +285,7 @@ def _test_bias_rcr_add_add_relu(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_add_add_relu_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_add_add_relu_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = torch.relu(
@@ -244,9 +308,19 @@ def test_bias_rcr_add_add_relu(self):
             self._test_bias_rcr_add_add_relu(21, None, None, 0, 512)
 
     def test_bias_rcr_add_add_relu_rocm(self):
-        self._test_bias_rcr_add_add_relu(8, None, None, 8, 8)
-
-    def _test_bias_rcr_mul(self, m, m0, m1, k, n, use_fp16_acc=False, dtype="float16"):
+        self._test_bias_rcr_add_add_relu(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_mul(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        use_fp16_acc=False,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul()
@@ -257,7 +331,7 @@ def _test_bias_rcr_mul(self, m, m0, m1, k, n, use_fp16_acc=False, dtype="float16
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_mul_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_mul_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = (
@@ -272,9 +346,18 @@ def test_bias_rcr_mul(self):
         self._test_bias_rcr_mul(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_mul_rocm(self):
-        self._test_bias_rcr_mul(8, None, None, 8, 8)
-
-    def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_mul(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_add_add(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_add_add()
@@ -285,7 +368,7 @@ def _test_bias_rcr_add_add(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_add_add_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_add_add_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = (
@@ -302,9 +385,18 @@ def test_bias_rcr_add_add(self):
         self._test_bias_rcr_add_add(None, 0, 5, 1024, 512)
 
     def test_bias_rcr_add_add_rocm(self):
-        self._test_bias_rcr_add_add(8, None, None, 8, 8)
-
-    def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
+        self._test_bias_rcr_add_add(8, None, None, 8, 8, test_name_suffix="_rocm")
+
+    def _test_bias_rcr_mul_tanh(
+        self,
+        m,
+        m0,
+        m1,
+        k,
+        n,
+        dtype="float16",
+        test_name_suffix="",
+    ):
         target = detect_target()
         self._init_tensors(m, k, n, m0, m1, dtype)
         OP = ops.gemm_rcr_bias_mul_tanh()
@@ -315,7 +407,7 @@ def _test_bias_rcr_mul_tanh(self, m, m0, m1, k, n, dtype="float16"):
             Y,
             target,
             "./tmp",
-            f"gemm_rcr_bias_mul_tanh_k_{k}_n_{n}_{dtype}",
+            f"gemm_rcr_bias_mul_tanh_k_{k}_n_{n}_{dtype}{test_name_suffix}",
         )
 
         Y_pt = torch.tanh(
@@ -330,7 +422,7 @@ def test_bias_rcr_mul_tanh(self):
         self._test_bias_rcr_mul_tanh(None, 21, 5, 1024, 512)
 
     def test_bias_rcr_mul_tanh_rocm(self):
-        self._test_bias_rcr_mul_tanh(8, None, None, 8, 8)
+        self._test_bias_rcr_mul_tanh(8, None, None, 8, 8, test_name_suffix="_rocm")
 
     @parameterized.expand(
         [

From fda4c6006342030c450ce9ddb83b25cabc63c375 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@meta.com>
Date: Mon, 10 Apr 2023 20:21:47 -0700
Subject: [PATCH 402/638] dynamic seq (#560)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/560

Reviewed By: henryhu6

Differential Revision: D44854358

Pulled By: terrychenism

fbshipit-source-id: a80e704f35aea69ba57c1b0d7bf1785312aa88bf
---
 .../cuda/attention/mem_eff_attention.py       | 26 ++---
 tests/unittest/ops/test_attention.py          | 97 +++++++++++--------
 2 files changed, 71 insertions(+), 52 deletions(-)

diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 66d00a75b..6a69f817c 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -152,8 +152,8 @@
         p.num_batches = *batch_size;
         p.head_dim = head_size;
         p.head_dim_value = head_size_v;
-        p.num_queries = seq_len_q;
-        p.num_keys = seq_len_kv;
+        p.num_queries = *seq_len_q;
+        p.num_keys = *seq_len_kv;
         p.causal = is_causal;
 
 
@@ -161,14 +161,14 @@
         p.k_strideM = head_size;
         p.v_strideM = head_size_v;
 
-        p.q_strideH = p.q_strideM * seq_len_q;
-        p.k_strideH = p.k_strideM * seq_len_kv;
-        p.v_strideH = p.v_strideM * seq_len_kv;
+        p.q_strideH = p.q_strideM * (*seq_len_q);
+        p.k_strideH = p.k_strideM * (*seq_len_kv);
+        p.v_strideH = p.v_strideM * (*seq_len_kv);
         p.o_strideH = head_size_v;
         p.q_strideB = p.q_strideH * num_heads;
         p.k_strideB = p.k_strideH * num_heads;
         p.v_strideB = p.v_strideH * num_heads;
-        p.o_strideB = head_size_v * seq_len_q * num_heads;
+        p.o_strideB = head_size_v * (*seq_len_q) * num_heads;
     }
 
     // launch kernel
@@ -381,8 +381,8 @@
           "Error when synchronizing stream after copying sequence lengths from device!");
   }
 
-  int mq_full = seq_len_q;
-  int mkv_full = seq_len_kv;
+  int mq_full = *seq_len_q;
+  int mkv_full = *seq_len_kv;
 
   for (int i = 0; i < *batch_size; ++i) {
     // Problems belonging to the same batch share the same seq len
@@ -644,8 +644,8 @@
                    void* key,
                    void* value,
                    int64_t* batch_size,
-                   int seq_len_kv,
-                   int seq_len_q,
+                   int64_t* seq_len_kv,
+                   int64_t* seq_len_q,
                    int num_heads,
                    int head_size,
                    int head_size_v,
@@ -752,7 +752,7 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
     x = func_attrs["inputs"][0]
     xshape = x._attrs["shape"]
     batch_size = "&" + xshape[0]._attrs["name"]
-    seq_len_q = x._attrs["shape"][2]._attrs["values"][0]
+    seq_len_q = "&" + xshape[2]._attrs["name"]
 
     num_heads = x._attrs["shape"][1]._attrs["values"][0]
     head_size = x._attrs["shape"][3]._attrs["values"][0]
@@ -761,7 +761,9 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
     softmax_scale = head_size ** (-0.5)
 
     v = func_attrs["inputs"][2]
-    seq_len_kv = v._attrs["shape"][2]._attrs["values"][0]
+    vshape = v._attrs["shape"]
+    seq_len_kv = "&" + vshape[2]._attrs["name"]
+
     head_size_v = v._attrs["shape"][3]._attrs["values"][0]
 
     return FUNC_CALL_TEMPLATE.render(
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 1509d0371..75260d32d 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -26,7 +26,7 @@
 
 from aitemplate.compiler import compile_model, Model, ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import benchmark_pt, detect_target
 from aitemplate.testing.test_utils import (
     filter_test_cases_by_params,
@@ -864,12 +864,64 @@ def _test_cross_attention(
         benchmark_ait=False,
         benchmark_pt=False,
         copy_op=False,
+        cache_size=1,
         atol=1e-3,
         rtol=1e-3,
     ):
         torch_dtype = string_to_torch_dtype(dtype)
 
-        with torch.no_grad():
+        Q = Tensor(
+            shape=[
+                batch_size,
+                num_heads,
+                IntVar(values=[1, 1024], name="seq_q"),
+                head_size,
+            ],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[
+                batch_size,
+                num_heads,
+                IntVar(values=[1, 1024], name="seq_kv"),
+                head_size,
+            ],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[
+                batch_size,
+                num_heads,
+                IntVar(values=[1, 1024], name="seq_kv"),
+                head_size_v,
+            ],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+
+        mem_eff_attention_op = ops.mem_eff_attention(
+            causal=causal,
+        )
+        if copy_op:
+            mem_eff_attention_op = ops.mem_eff_attention(
+                **mem_eff_attention_op._get_op_attributes()
+            )
+        Y = mem_eff_attention_op(Q, K, V)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        if rebuild:
+            target = detect_target()
+            module = compile_model(Y, target, "./tmp", test_name)
+        else:
+            module = Model(os.path.join("./tmp", test_name, "test.so"))
+
+        for i in range(cache_size):
             q = torch.randn(
                 batch_size,
                 seqlen,
@@ -880,7 +932,7 @@ def _test_cross_attention(
             )
             k = torch.randn(
                 batch_size,
-                seqlen_kv,
+                seqlen_kv + i,
                 num_heads,
                 head_size,
                 device="cuda",
@@ -888,7 +940,7 @@ def _test_cross_attention(
             )
             v = torch.randn(
                 batch_size,
-                seqlen_kv,
+                seqlen_kv + i,
                 num_heads,
                 head_size_v,
                 device="cuda",
@@ -897,42 +949,6 @@ def _test_cross_attention(
 
             y_pt = ref_cross_attention(q, k, v)
 
-            Q = Tensor(
-                shape=[batch_size, num_heads, seqlen, head_size],
-                dtype=dtype,
-                name="q",
-                is_input=True,
-            )
-            K = Tensor(
-                shape=[batch_size, num_heads, seqlen_kv, head_size],
-                dtype=dtype,
-                name="k",
-                is_input=True,
-            )
-            V = Tensor(
-                shape=[batch_size, num_heads, seqlen_kv, head_size_v],
-                dtype=dtype,
-                name="v",
-                is_input=True,
-            )
-
-            mem_eff_attention_op = ops.mem_eff_attention(
-                causal=causal,
-            )
-            if copy_op:
-                mem_eff_attention_op = ops.mem_eff_attention(
-                    **mem_eff_attention_op._get_op_attributes()
-                )
-            Y = mem_eff_attention_op(Q, K, V)
-            Y._attrs["is_output"] = True
-            Y._attrs["name"] = "output"
-
-            if rebuild:
-                target = detect_target()
-                module = compile_model(Y, target, "./tmp", test_name)
-            else:
-                module = Model(os.path.join("./tmp", test_name, "test.so"))
-
             q = torch.permute(q, (0, 2, 1, 3))
             k = torch.permute(k, (0, 2, 1, 3))
             v = torch.permute(v, (0, 2, 1, 3))
@@ -991,6 +1007,7 @@ def test_cross_attention(self, dtype):
             head_size=64,
             head_size_v=64,
             test_name=f"cross_attention2_{dtype}",
+            cache_size=16,
             dtype=dtype,
             atol=atol,
             rtol=rtol,

From ca2d572d90f547e2a340ef5980bedb60c09074f1 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 11 Apr 2023 06:38:07 -0700
Subject: [PATCH 403/638] Fix MSVC compiler complaints (#551)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/551

Reviewed By: tenpercent

Differential Revision: D44814768

fbshipit-source-id: 71184eeb0c95bafbd853ea4685e2135423c7df8b
---
 python/aitemplate/backend/main_templates.py | 1 +
 static/include/cuda_device_functions.h      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 5979df5a1..3d4e48eb0 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -37,6 +37,7 @@
 #include <string>
 #include <unordered_map>
 #include <math.h>
+#include <iomanip>
 
 {{ function_decl }}
 
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index b1505b47d..88cc305e0 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -14,6 +14,7 @@
 //
 #pragma once
 
+#include <sstream>
 #include <string>
 
 #include "cutlass/conv/conv2d_problem_size.h"

From 216cd17e186056056d8415242ec43628c62762fd Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 11 Apr 2023 08:27:48 -0700
Subject: [PATCH 404/638] Fix MSVC compiler narrowing conversion errors for
 cuda/gemm_epilogue_vistor (#552)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/552

cutlass::gemm::GemmCoord uses int values as coordinates under the hood, while AIT might use int64_t variables in {M, N, K} constructor. So, narrowing conversion is needed.

Reviewed By: tenpercent

Differential Revision: D44814784

fbshipit-source-id: 521fb91570fea19c4a651e71ea93e2e0c787eb48
---
 .../backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py    | 6 +++++-
 .../backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py   | 1 +
 .../backend/cuda/gemm_epilogue_vistor/common_softmax.py     | 1 +
 .../backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py   | 6 +++++-
 .../cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py    | 6 +++++-
 .../backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py | 6 +++++-
 .../cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py      | 6 +++++-
 .../backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py   | 6 +++++-
 8 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
index 1fc726a6c..570986ce6 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py
@@ -59,7 +59,11 @@
         N, S: (B, block_num, M) (RowMajor)
     */
 
-    {M, N, K},                                                                                                                             // cutlass::gemm::GemmCoord problem_size
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                                                                                                     // cutlass::gemm::GemmCoord problem_size
     B,                                                                                                                                     // int32_t batch_count_
     {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                           // TensorRefA ref_A_
     {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                           // TensorRefB ref_B_
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 9bac19ac6..1fcfdc774 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -94,6 +94,7 @@
 //{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator;
 {{indent}}using ElementCompute = typename {{instance}}::DualGemmKernel::Epilogue0::OutputOp::ElementCompute;
 
+{{indent}}using coord_t = cutlass::gemm::GemmCoord::Index;
 {{indent}}typename {{instance}}::Arguments arguments{
 
 {{problem_args}}
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index 11ab6cfb9..ba284906d 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -125,6 +125,7 @@
 {{indent}}{{instance}} gemm_op;
 {% endif %}
 
+{{indent}}using coord_t = cutlass::gemm::GemmCoord::Index;
 {{indent}}typename {{instance}}::Arguments arguments{
 {{problem_args}}
 {{indent}}};
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
index 6bf8e2071..61b11bd5a 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_bmm_rrr_div.py
@@ -31,7 +31,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::DualGemmMode::kBatched,         // DualGemmMode mode
-    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                             // GemmCoord problem_size_
     {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
     {({{elem_input_type}}*)b_ptr, LayoutB(N)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
     nullptr_ref,                                   // TensorRef<ElementC const, LayoutC> ref_C0_
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
index 769978a15..283aaab72 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_fast_gelu.py
@@ -32,7 +32,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
-    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                             // GemmCoord problem_size_
     {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
     {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
     ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
index 0e9c26d0a..a2cd67f60 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/dual_gemm_rcr_silu.py
@@ -32,7 +32,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::DualGemmMode::kGemm,            // DualGemmMode mode
-    cutlass::gemm::GemmCoord{M, N, K},             // GemmCoord problem_size_
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                             // GemmCoord problem_size_
     {({{elem_input_type}}*)a_ptr, LayoutA(K)},     // TensorRef<ElementA const, LayoutA> ref_A0_
     {({{elem_input_type}}*)b_ptr, LayoutB(K)},     // TensorRef<ElementB const, LayoutB0> ref_B0_
     ref_B0,                                        // TensorRef<ElementC const, LayoutC> ref_C0_
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
index 40b6496ea..bf3a4d0c0 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py
@@ -40,7 +40,11 @@
         N, S: (block_num, M) (RowMajor)
     */
 
-    {M, N, K},                                                                                                                     // cutlass::gemm::GemmCoord problem_size
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                                                                                             // cutlass::gemm::GemmCoord problem_size
     1,                                                                                                                             // int32_t batch_count_
     {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                   // TensorRefA ref_A_
     {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                   // TensorRefB ref_B_
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
index a5fac4e53..35ef2e467 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py
@@ -53,7 +53,11 @@
         N, S: (block_num, M) (RowMajor)
     */
 
-    {M, N, K},                                                                                                                     // cutlass::gemm::GemmCoord problem_size
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                                                                                             // cutlass::gemm::GemmCoord problem_size
     1,                                                                                                                             // int32_t batch_count_
     {reinterpret_cast<{{elem_input_type}}*>(a_ptr), LayoutA(K)},                                                                   // TensorRefA ref_A_
     {reinterpret_cast<{{elem_input_type}}*>(b_ptr), LayoutB(K)},                                                                   // TensorRefB ref_B_

From d67c97f23fc3c9f41005387dc27d2e9ba4155d33 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 11 Apr 2023 16:25:51 -0700
Subject: [PATCH 405/638] Add dynamic_seq_len and dynamic_num_head support in
 b2b bmm. (#530)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/530

ATT.
Also updated b2b bmm kernels to support alpha1_divide_by_seq_len.

Reviewed By: aakhundov, kadeng

Differential Revision: D44451037

fbshipit-source-id: dc104bed4edff38d99d2117815d700b516a50c73
---
 python/aitemplate/backend/codegen.py          |  7 +-
 .../backend/common/elementwise_common.py      |  3 +-
 .../backend/cuda/b2b_bmm/classic_b2b_bmm.py   |  6 ++
 .../cuda/b2b_bmm/fmha_style_b2b_bmm.py        | 55 +++++++------
 .../b2b_bmm/grouped_fmha_style_b2b_bmm.py     | 38 +++++++--
 python/aitemplate/compiler/base.py            |  5 --
 python/aitemplate/compiler/compiler.py        |  3 +-
 .../compiler/ops/b2b_bmm/b2b_bmm_base.py      | 31 ++++---
 .../compiler/ops/b2b_bmm/classic_b2b_bmm.py   | 26 +++++-
 .../ops/b2b_bmm/fmha_style_b2b_bmm.py         | 27 ++++---
 .../ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py | 43 +++-------
 .../compiler/ops/common/view_ops.py           |  5 --
 tests/unittest/ops/test_b2b_bmm.py            | 80 +++++++++++++------
 tests/unittest/ops/test_grouped_b2b_bmm.py    | 33 +++++---
 14 files changed, 224 insertions(+), 138 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index faa616eaf..18980c4b3 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -682,7 +682,12 @@ def _process_jagged_dims(self, node: Tensor) -> None:
 
         batch_dim_name = jagged_int_var.batch_dim()._attrs["name"]
         if batch_dim_name not in self.visited_dims:
-            self.dim_decl.append(self.f_var_decl(batch_dim_name, 0))
+            batch_dim_value = (
+                0
+                if not isinstance(jagged_int_var.batch_dim(), IntImm)
+                else jagged_int_var.batch_dim().value()
+            )
+            self.dim_decl.append(self.f_var_decl(batch_dim_name, batch_dim_value))
             self.visited_dims.add(batch_dim_name)
 
     def _process_dims_for_tensor(self, node: Tensor) -> None:
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index eeee08634..adbc2d2ad 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -682,7 +682,8 @@ def get_dynamic_dims(*shapes: List[List[IntVar]]) -> List[IntVar]:
                     # may not be present directly in other input / output shapes,
                     # so we're adding it here separately
                     batch_dim = dim.batch_dim()
-                    res[batch_dim._attrs["name"]] = batch_dim
+                    if not isinstance(batch_dim, IntImm):
+                        res[batch_dim._attrs["name"]] = batch_dim
                     for jagged_dim in dim.jagged_dims():
                         min_value = jagged_dim.min_value()
                         if not isinstance(min_value, IntImm):
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
index e5d91f8dd..bc1aab11b 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
@@ -80,6 +80,9 @@
   ElementCompute alpha0 = ElementCompute({{alpha0}});
   ElementCompute beta0 = ElementCompute(1);
   ElementCompute activation_alpha = ElementCompute({{alpha1}});
+  {% if alpha1_divide_by_seq_len %}
+  activation_alpha = activation_alpha / (ElementCompute)(static_cast<int32_t>(m0));
+  {% endif %}
   ElementCompute alpha1 = ElementCompute(1);
   ElementCompute beta1 = ElementCompute(0);
 
@@ -253,6 +256,9 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         ),
         alpha0=str(func_attrs["alpha0"]),
         alpha1=str(func_attrs["alpha1"]),
+        alpha1_divide_by_seq_len="true"
+        if func_attrs["alpha1_divide_by_seq_len"]
+        else "false",
         epilogue_math=epilogue_math,
     )
 
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
index c3764e564..078b88f25 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
@@ -61,8 +61,6 @@
   ElementAccumulator alpha0 = ElementAccumulator({{alpha0}});
   ElementAccumulator alpha1 = ElementAccumulator({{alpha1}});
 
-  int64_t seq_length = {{seq_length}};
-  int64_t seq_length_kv = {{seq_length_kv}};
   int64_t head_dim = {{head_dim}};
   int64_t head_dim_value = {{head_dim_value}};
 
@@ -84,7 +82,7 @@
     p.activation_scale = alpha1;
     p.activation_scale_divide_by_seq_len = {{alpha1_divide_by_seq_len}};
 
-    p.num_heads = {{num_heads}};
+    p.num_heads = num_heads;
     p.num_batches = batch_size;
 
     p.head_dim = head_dim;
@@ -107,7 +105,7 @@
     p.k_strideB = p.k_strideM * seq_length_kv;
     p.v_strideB = p.v_strideM * seq_length_kv;
 
-    int32_t bias_stride = {{seq_length_kv}};
+    int32_t bias_stride = seq_length_kv;
     {% if bias_broadcast[2] %}
     p.bias_strideM = 0;
     {% else %}
@@ -148,10 +146,10 @@
     throw std::runtime_error(
       std::string("Kernel does not support these inputs. ") +
       "Function: {{func_name}}. " +
-      "m0: " + std::to_string({{seq_length}}) +
-      ", k0: " + std::to_string({{head_dim}}) +
-      ", n0: " + std::to_string({{seq_length_kv}}) +
-      ", n1: " + std::to_string({{head_dim_value}}) + "."
+      "seq_length: " + std::to_string(seq_length) +
+      ", head_dim: " + std::to_string({{head_dim}}) +
+      ", seq_length_kv: " + std::to_string(seq_length_kv) +
+      ", head_dim_value: " + std::to_string({{head_dim_value}}) + "."
     );
   }
   kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
@@ -170,8 +168,9 @@
   void* bias,
   void* accum_ptr,
   int64_t batch_size,
-  int64_t m0,
-  int64_t k0,
+  int64_t seq_length,
+  int64_t seq_length_kv,
+  int64_t num_heads,
   cudaStream_t stream)
     """
 )
@@ -186,11 +185,15 @@
     """
 {{indent}}{{func_name}}(
 {{indent}}    {{output}},
-{{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
+{{indent}}    {{query}},
+{{indent}}    {{key}},
+{{indent}}    {{value}},
+{{indent}}    {{bias}},
 {{indent}}    {{accum_ptr}},
 {{indent}}    {{batch_size}},
-{{indent}}    {{m0}},
-{{indent}}    {{k0}},
+{{indent}}    {{seq_length}},
+{{indent}}    {{seq_length_kv}},
+{{indent}}    {{num_heads}},
 {{indent}}    stream
 {{indent}});
     """
@@ -218,11 +221,11 @@ def fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         bias = func_attrs["inputs"][3]
         bias_broadcast = [var == IntImm(1) for var in bias.shape()]
 
-    n0 = k._attrs["shape"][1]
+    k0 = k._attrs["shape"][3]
     n1 = v._attrs["shape"][3]
-    if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
+    if not isinstance(k0, IntImm) or not isinstance(n1, IntImm):
         raise RuntimeError(
-            f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
+            f"k0 and n1 must be static dims. {func_attrs['name']=}, {k0=}, {n1=}"
         )
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
@@ -250,15 +253,14 @@ def fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         elem_output_type=elem_output_type,
         elem_accum_type=elem_accum_type,
         offset_t="int64_t",
-        seq_length="m0",
-        seq_length_kv=str(n0.value()),
-        head_dim="k0",
+        head_dim=str(k0.value()),
         head_dim_value=str(n1.value()),
         causal_type=causal_type_to_kernel_str(func_attrs["causal_type"]),
-        num_heads=str(func_attrs["num_heads"]),
         alpha0=str(func_attrs["alpha0"]),
         alpha1=str(func_attrs["alpha1"]),
-        alpha1_divide_by_seq_len="false",
+        alpha1_divide_by_seq_len="true"
+        if func_attrs["alpha1_divide_by_seq_len"]
+        else "false",
         activation_functor=activation_functor,
         bias_broadcast=bias_broadcast,
         offset_ptr="nullptr",
@@ -291,9 +293,11 @@ def fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
         bias_name = func_attrs["inputs"][3]._attrs["name"]
 
     q_shape = func_attrs["inputs"][0]._attrs["shape"]
+    k_shape = func_attrs["inputs"][1]._attrs["shape"]
     batch_size = q_shape[0]._attrs["name"]
-    m0 = q_shape[1]._attrs["name"]
-    k0 = q_shape[3]._attrs["name"]
+    seq_length = q_shape[1]._attrs["name"]
+    seq_length_kv = k_shape[1]._attrs["name"]
+    num_heads = q_shape[2]._attrs["name"]
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
@@ -304,7 +308,8 @@ def fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
         bias=bias_name,
         accum_ptr="global_workspace_",
         batch_size=batch_size,
-        m0=m0,
-        k0=k0,
+        seq_length=seq_length,
+        seq_length_kv=seq_length_kv,
+        num_heads=num_heads,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
index 9ec0c70e5..46f6e310f 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/grouped_fmha_style_b2b_bmm.py
@@ -36,10 +36,26 @@
   void* key,
   void* value,
   void* bias,
+
+  // Used as an internal cache to compute output values when the output is too
+  // large to be computed in a single iteration.
   void* accum_ptr,
+
   int64_t batch_size,
-  int64_t max_seq_length,
+
+  // Max sequence lengths of the query, key and values.
+  // This kernel always assumes that seq_length == seq_length_kv.
+  int64_t seq_length,
+  int64_t seq_length_kv,
+
+  int64_t num_heads,
+
+  // A pointer to the offset of the variable sequence lengths
+  // of the query and key tensors.
+  // e.g. when batch_size=4, seq_length is [2, 1, 4, 5]
+  // offset array is [0, 2, 3, 7, 12].
   const void* offset,
+
   cudaStream_t stream)
     """
 )
@@ -57,7 +73,9 @@
 {{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
 {{indent}}    {{accum_ptr}},
 {{indent}}    {{batch_size}},
-{{indent}}    {{max_seq_length}},
+{{indent}}    {{seq_length}},
+{{indent}}    {{seq_length_kv}},
+{{indent}}    {{num_heads}},
 {{indent}}    {{offset}},
 {{indent}}    stream
 {{indent}});
@@ -116,7 +134,7 @@ def grouped_fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         causal_type=fmha_style_b2b_bmm.causal_type_to_kernel_str(
             func_attrs["causal_type"]
         ),
-        num_heads=str(func_attrs["num_heads"]),
+        num_heads="num_heads",
         alpha0=str(func_attrs["alpha0"]),
         alpha1=str(func_attrs["alpha1"]),
         alpha1_divide_by_seq_len="true"
@@ -150,19 +168,21 @@ def grouped_fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
     if len(func_attrs["inputs"]) == 4:
         bias_name = func_attrs["inputs"][3]._attrs["name"]
 
-    jagged_intvar = func_attrs["inputs"][0]._attrs["shape"][0]
-    batch_size = jagged_intvar.batch_dim()._attrs["name"]
+    q_shape = func_attrs["inputs"][0]._attrs["shape"]
+    jagged_intvar = q_shape[0]
+    batch_size_str = jagged_intvar.batch_dim()._attrs["name"]
     if len(jagged_intvar.jagged_dims()) != 1:
         raise RuntimeError(
             "Only support 1 jagged dim in grouped_fmha_style_b2b_bmm for now! "
             f"Current jagged intvar: {jagged_intvar}"
         )
     max_seq_length_dim = jagged_intvar.jagged_dims()[0].max_value()
-    max_seq_length = (
+    max_seq_length_str = (
         str(max_seq_length_dim.value())
         if isinstance(max_seq_length_dim, IntImm)
         else max_seq_length_dim._attrs["name"]
     )
+    num_heads_str = q_shape[1]._attrs["name"]
     offset = f"{jagged_intvar.offsets_var_name()}.data[0]"
 
     return FUNC_CALL_TEMPLATE.render(
@@ -173,8 +193,10 @@ def grouped_fmha_style_b2b_bmm_gen_function_call(func_attrs, indent="  "):
         value=v_name,
         bias=bias_name,
         accum_ptr="global_workspace_",
-        batch_size=batch_size,
-        max_seq_length=max_seq_length,
+        batch_size=batch_size_str,
+        seq_length=max_seq_length_str,
+        seq_length_kv=max_seq_length_str,
+        num_heads=num_heads_str,
         offset=offset,
         indent=indent,
     )
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 3f4bbc75f..6bb71b4b2 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -504,11 +504,6 @@ def __init__(
                 "total_length must be dynamic (IntVar), "
                 f"but given {type(total_length).__name__}."
             )
-        if batch_dim is None or type(batch_dim) != IntVar:
-            raise TypeError(
-                "batch_dim must be dynamic (IntVar), "
-                f"but given {type(batch_dim).__name__}."
-            )
         if not jagged_dims or not all(
             isinstance(dim, JaggedDim) for dim in jagged_dims
         ):
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 427b63f7d..07eabe9e5 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -123,7 +123,8 @@ def _mark_isolated_int_vars(sorted_graph: List[Tensor]):
                 int_vars[name] = dim
                 if isinstance(dim, JaggedIntVar):
                     batch_dim = dim.batch_dim()
-                    int_vars[batch_dim._attrs["name"]] = batch_dim
+                    if not isinstance(batch_dim, IntImm):
+                        int_vars[batch_dim._attrs["name"]] = batch_dim
                     total_length = dim.total_length()
                     int_vars[total_length._attrs["name"]] = total_length
                     for jagged_dim in dim.jagged_dims():
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py b/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
index 7b275f1c0..9b979a307 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/b2b_bmm_base.py
@@ -15,7 +15,7 @@
 
 """
 Base class for back-to-back batched gemm fused kernels.
-Computes bmm(causal_masks(alpha1(activation(alpha0 * bmm(Q, K) + bias))), V),
+Computes bmm(causal_mask(alpha1 * (activation(alpha0 * bmm(Q, K) + bias))), V),
 
 where:
 Q: [B, M0, (H,) K0] (row_major),
@@ -23,11 +23,6 @@
 V: [B, N0, (H,) N1] (row_major),
 bias: [B, (H,) M0, N0] (row_major).
 Layouts are fixed for now.
-
-causal_masks have 3 types:
-NO_CAUSAL: no causal masks
-UPPER_RIGHT_EMPTY: the upper right triangular part of the matrix is 0
-LOWER_LEFT_EMPTY: the bottom left triangular part of the matrix is 0
 """
 
 from enum import Enum
@@ -48,18 +43,35 @@ def _check_max_alignment(shape: IntVar, dtype: str, error_msg: str) -> None:
 
 
 class CausalType(Enum):
-    NO_CAUSAL = 0
-    UPPER_RIGHT_EMPTY = 1
-    LOWER_LEFT_EMPTY = 2
+    NO_CAUSAL = 0  # no causal mask
+    UPPER_RIGHT_EMPTY = 1  # upper right triangular part of the matrix is 0
+    LOWER_LEFT_EMPTY = 2  # bottom left triangular part of the matrix is 0
 
 
 class b2b_bmm_base(Operator):
+    r"""Base class for back-to-back batched gemm fused kernels.
+
+    Computes bmm(causal_mask(alpha1 * (activation(alpha0 * bmm(Q, K) + bias))), V),
+
+    Args:
+    * causal_type (CausalType): Type of causal_mask. See comments above.
+    * epilogue_math_name (str): Name of the activation function.
+      Supported epilogue functions can be found from
+      python/aitemplate/utils/mk_cutlass_lib/extra_enum.py.
+    * alpha0 (float): See the math function above.
+    * alpha1 (float): See the math function above.
+    * alpha1_divide_by_seq_len (bool) Whether divide alpha1 by seq_len.
+      Useful when seq_len is a dynamic value so that alpah1 cannot be
+      computed in advance.
+    """
+
     def __init__(
         self,
         causal_type: CausalType,
         epilogue_math_name: str,
         alpha0: float,
         alpha1: float,
+        alpha1_divide_by_seq_len: bool = False,
     ) -> None:
         """Initialize classic_b2b_bmm op."""
         super().__init__()
@@ -67,6 +79,7 @@ def __init__(
         self._attrs["causal_type"] = causal_type
         self._attrs["alpha0"] = alpha0
         self._attrs["alpha1"] = alpha1
+        self._attrs["alpha1_divide_by_seq_len"] = alpha1_divide_by_seq_len
 
         import cutlass_lib
 
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
index cab3250e7..154af58a0 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
@@ -32,20 +32,28 @@
 """
 
 from aitemplate.backend import registry, target
-from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.base import IntImm, Tensor
 from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
 
 
 class classic_b2b_bmm(b2b_bmm_base):
+    """See comments at the head of this file."""
+
     def __init__(
         self,
         causal_type: CausalType,
         epilogue_math_name: str,
         alpha0: float,
         alpha1: float,
+        alpha1_divide_by_seq_len: bool = False,
     ) -> None:
-        """Initialize classic_b2b_bmm op."""
-        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1)
+        """Initialize classic_b2b_bmm op.
+        Check aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base for more details
+        about these args.
+        """
+        super().__init__(
+            causal_type, epilogue_math_name, alpha0, alpha1, alpha1_divide_by_seq_len
+        )
         self._attrs["op"] = "classic_b2b_bmm"
         if (
             causal_type != CausalType.NO_CAUSAL
@@ -91,6 +99,10 @@ def _infer_shapes(self):
             raise RuntimeError(
                 f"classic_b2b_bmm only supports <=512 N0 / N1. Current length: {N0=}, {N1=}"
             )
+        if not isinstance(N0, IntImm) or not isinstance(N1, IntImm):
+            raise RuntimeError(
+                f"classic_b2b_bmm only supports static N0 / N1. Current {N0=}, {N1=}."
+            )
         if self._attrs["causal_type"] != CausalType.NO_CAUSAL:
             if M0 != N0:
                 raise RuntimeError(
@@ -143,7 +155,13 @@ def __call__(
         return output
 
     def _get_op_attributes(self):
-        target_attrs = ["causal_type", "epilogue_math_name", "alpha0", "alpha1"]
+        target_attrs = [
+            "causal_type",
+            "epilogue_math_name",
+            "alpha0",
+            "alpha1",
+            "alpha1_divide_by_seq_len",
+        ]
         attr = {}
 
         for target_attr in target_attrs:
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
index 91eede72b..85023e164 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/fmha_style_b2b_bmm.py
@@ -39,24 +39,30 @@
 import numpy as np
 
 from aitemplate.backend import registry, target
-from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
 from aitemplate.utils import shape_utils
 
 
 class fmha_style_b2b_bmm(b2b_bmm_base):
+    """See comments at the head of this file."""
+
     def __init__(
         self,
         causal_type: CausalType,
         epilogue_math_name: str,
         alpha0: float,
         alpha1: float,
-        num_heads: int,
+        alpha1_divide_by_seq_len: bool = False,
     ) -> None:
-        """Initialize fmha_style_b2b_bmm op."""
-        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1)
+        """Initialize fmha_style_b2b_bmm op.
+        Check aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base for more details
+        about these args.
+        """
+        super().__init__(
+            causal_type, epilogue_math_name, alpha0, alpha1, alpha1_divide_by_seq_len
+        )
         self._attrs["op"] = "fmha_style_b2b_bmm"
-        self._attrs["num_heads"] = num_heads
         self._attrs["workspace"] = 0
 
     def _infer_shapes(self):
@@ -83,10 +89,6 @@ def _infer_shapes(self):
                 f"QKV must have same head size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
         batch_size = q_shape[0]
-        if q_shape[2] != IntImm(self._attrs["num_heads"]):
-            raise RuntimeError(
-                f"num_heads are not equal! {self._attrs['num_heads']=}, {q_shape[2]=}"
-            )
         M0 = q_shape[1]
         K0 = q_shape[3]
         if K0 != k_shape[3]:
@@ -106,8 +108,7 @@ def _infer_shapes(self):
                     f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
                 )
 
-        head_size = IntImm(self._attrs["num_heads"])
-
+        head_size = q_shape[2]
         output_shape = [batch_size, M0, head_size, N1]
 
         if len(self._attrs["inputs"]) == 4:
@@ -119,7 +120,7 @@ def _infer_shapes(self):
             )
             if len(bias_shape) != 4:
                 raise RuntimeError(
-                    f"Expected bias rank 4. Current bias rank: {len(bias)}."
+                    f"Expected bias rank 4. Current bias rank: {len(bias_shape)}."
                 )
             if not broadcastable:
                 raise RuntimeError(
@@ -179,7 +180,7 @@ def _get_op_attributes(self):
             "epilogue_math_name",
             "alpha0",
             "alpha1",
-            "num_heads",
+            "alpha1_divide_by_seq_len",
         ]
         attr = {}
 
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
index e649733d4..b1b2b957b 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
@@ -44,19 +44,24 @@
 
 
 class grouped_fmha_style_b2b_bmm(fmha_style_b2b_bmm):
+    """See comments at the head of this file."""
+
     def __init__(
         self,
         causal_type: CausalType,
         epilogue_math_name: str,
         alpha0: float,
         alpha1: float,
-        alpha1_divide_by_seq_len: bool,
-        num_heads: int,
+        alpha1_divide_by_seq_len: bool = False,
     ) -> None:
-        """Initialize grouped_fmha_style_b2b_bmm op."""
-        super().__init__(causal_type, epilogue_math_name, alpha0, alpha1, num_heads)
+        """Initialize grouped_fmha_style_b2b_bmm op.
+        Check aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base for more details
+        about these args.
+        """
+        super().__init__(
+            causal_type, epilogue_math_name, alpha0, alpha1, alpha1_divide_by_seq_len
+        )
         self._attrs["op"] = "grouped_fmha_style_b2b_bmm"
-        self._attrs["alpha1_divide_by_seq_len"] = alpha1_divide_by_seq_len
 
     def _infer_shapes(self):
         """infer the output shape for grouped_fmha_style_b2b_bmm."""
@@ -87,19 +92,14 @@ def _infer_shapes(self):
             raise RuntimeError(
                 f"QKV must have same head size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        if q_shape[1] != IntImm(self._attrs["num_heads"]):
-            raise RuntimeError(
-                f"num_heads are not equal! {self._attrs['num_heads']=}, {q_shape[1]=}"
-            )
         K0 = q_shape[2]
         if K0 != k_shape[2]:
             raise RuntimeError(
                 f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
 
-        head_size = IntImm(self._attrs["num_heads"])
-
-        output_shape = [q_shape[0], head_size, v_shape[2]]
+        num_heads = q_shape[1]
+        output_shape = [q_shape[0], num_heads, v_shape[2]]
 
         if len(self._attrs["inputs"]) == 4:
             batch_size = q_shape[0].batch_dim()
@@ -108,7 +108,7 @@ def _infer_shapes(self):
             bias_shape = bias._attrs["shape"]
             bias_expected_shape = [
                 batch_size,
-                head_size,
+                num_heads,
                 max_seq_length,
                 max_seq_length,
             ]
@@ -162,20 +162,3 @@ def _infer_shapes(self):
                 )
 
         return output_shape
-
-    def _get_op_attributes(self):
-        target_attrs = [
-            "causal_type",
-            "epilogue_math_name",
-            "alpha0",
-            "alpha1",
-            "alpha1_divide_by_seq_len",
-            "num_heads",
-        ]
-        attr = {}
-
-        for target_attr in target_attrs:
-            if target_attr in self._attrs:
-                attr[target_attr] = self._attrs[target_attr]
-
-        return attr
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index 8eaace3c6..b48a94519 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -696,11 +696,6 @@ def __init__(
         jagged_dims: List[JaggedDim],
         check_sequence_lengths: bool = True,
     ) -> None:
-        if type(batch_dim) != IntVar:
-            raise TypeError(
-                "batch_dim must be dynamic (IntVar), "
-                f"but given {type(batch_dim).__name__}."
-            )
         if not jagged_dims or not all(
             isinstance(dim, JaggedDim) for dim in jagged_dims
         ):
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index ae04287e6..1d0e0bcd3 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -15,6 +15,7 @@
 """
 Unittests for b2b bmm Operators.
 """
+import itertools
 import logging
 import unittest
 from typing import List, Tuple
@@ -64,7 +65,7 @@ def _test_classic_b2b_bmm(
         if isinstance(batch_sizes, int):
             batch_sizes = [batch_sizes]
         alpha0 = 1.0 / (k0**0.5)
-        alpha1 = 1.0 / m
+        alpha1 = 1.0
         batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
 
         Q = Tensor(
@@ -95,6 +96,7 @@ def _test_classic_b2b_bmm(
             causal_type=causal_type,
             alpha0=alpha0,
             alpha1=alpha1,
+            alpha1_divide_by_seq_len=True,
             epilogue_math_name=epilogue_math_name,
         )
         if copy_op:
@@ -120,7 +122,7 @@ def _test_classic_b2b_bmm(
             # Run PT reference.
             attn = alpha0 * (q_pt @ k_pt.transpose(-2, -1)) + bias_pt
             attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
-            attn = alpha1 * attn
+            attn = alpha1 / m * attn
             invalid_attn_mask = get_attn_mask_per_causal_type(
                 m, n0, causal_type, torch_dtype
             )
@@ -191,11 +193,11 @@ def setUpClass(cls):
     def _test_fmha_style_b2b_bmm(
         self,
         batch_sizes: Tuple[int, List[int]] = 1024,
-        m=256,
+        seq_lens: Tuple[int, List[int]] = 256,
         k0=128,
-        n0=256,
+        seq_lens_kv: Tuple[int, List[int]] = 256,
         n1=256,
-        num_heads=1,
+        num_heads: Tuple[int, List[int]] = 1,
         has_bias=False,
         bias_broadcast=None,
         epilogue_math_name="Identity",
@@ -210,31 +212,40 @@ def _test_fmha_style_b2b_bmm(
         # Initialize AIT fmha_style_b2b_bmm operator.
         if isinstance(batch_sizes, int):
             batch_sizes = [batch_sizes]
+        if isinstance(seq_lens, int):
+            seq_lens = [seq_lens]
+        if isinstance(seq_lens_kv, int):
+            seq_lens_kv = [seq_lens_kv]
+        if isinstance(num_heads, int):
+            num_heads = [num_heads]
         alpha0 = 1.0 / (k0**0.5)
-        alpha1 = 1.0 / m
+        alpha1 = 1.0
         batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+        seq_lens_dim = shape_utils.gen_int_var_min_max(seq_lens, "seq_len")
+        seq_lens_kv_dim = shape_utils.gen_int_var_min_max(seq_lens_kv, "seq_len_kv")
+        num_heads_dim = shape_utils.gen_int_var_min_max(num_heads, "num_heads")
 
         Q = Tensor(
-            shape=[batch_size_dim, m, num_heads, k0],
+            shape=[batch_size_dim, seq_lens_dim, num_heads_dim, k0],
             dtype=dtype,
             name="q",
             is_input=True,
         )
         K = Tensor(
-            shape=[batch_size_dim, n0, num_heads, k0],
+            shape=[batch_size_dim, seq_lens_kv_dim, num_heads_dim, k0],
             dtype=dtype,
             name="k",
             is_input=True,
         )
         V = Tensor(
-            shape=[batch_size_dim, n0, num_heads, n1],
+            shape=[batch_size_dim, seq_lens_kv_dim, num_heads_dim, n1],
             dtype=dtype,
             name="v",
             is_input=True,
         )
         Bias = None
         if has_bias:
-            shape = [batch_size_dim, num_heads, m, n0]
+            shape = [batch_size_dim, num_heads_dim, seq_lens_dim, seq_lens_kv_dim]
             if bias_broadcast:
                 for i, broadcast in enumerate(bias_broadcast):
                     if broadcast:
@@ -249,8 +260,8 @@ def _test_fmha_style_b2b_bmm(
             causal_type=causal_type,
             alpha0=alpha0,
             alpha1=alpha1,
+            alpha1_divide_by_seq_len=True,
             epilogue_math_name=epilogue_math_name,
-            num_heads=num_heads,
         )
         if copy_op:
             fmha_style_b2b_bmm_op = ops.fmha_style_b2b_bmm(
@@ -266,12 +277,20 @@ def _test_fmha_style_b2b_bmm(
 
         # Run tests.
         torch_dtype = string_to_torch_dtype(dtype)
-        for batch_size in batch_sizes:
+        for batch_size, seq_len, seq_len_kv, num_head in itertools.product(
+            batch_sizes, seq_lens, seq_lens_kv, num_heads
+        ):
             # Initialize inputs
-            q_pt = torch.rand(batch_size, m, num_heads, k0, dtype=torch_dtype).cuda()
-            k_pt = torch.rand(batch_size, n0, num_heads, k0, dtype=torch_dtype).cuda()
-            v_pt = torch.rand(batch_size, n0, num_heads, n1, dtype=torch_dtype).cuda()
-            shape = [batch_size, num_heads, m, n0]
+            q_pt = torch.rand(
+                batch_size, seq_len, num_head, k0, dtype=torch_dtype
+            ).cuda()
+            k_pt = torch.rand(
+                batch_size, seq_len_kv, num_head, k0, dtype=torch_dtype
+            ).cuda()
+            v_pt = torch.rand(
+                batch_size, seq_len_kv, num_head, n1, dtype=torch_dtype
+            ).cuda()
+            shape = [batch_size, num_head, seq_len, seq_len_kv]
             if bias_broadcast:
                 for i, broadcast in enumerate(bias_broadcast):
                     if broadcast:
@@ -285,9 +304,9 @@ def _test_fmha_style_b2b_bmm(
             if has_bias:
                 attn = attn + bias_pt
             attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
-            attn = alpha1 * attn
+            attn = alpha1 / seq_len * attn
             invalid_attn_mask = get_attn_mask_per_causal_type(
-                m, n0, causal_type, torch_dtype
+                seq_len, seq_len_kv, causal_type, torch_dtype
             )
             attn = attn * invalid_attn_mask
             output = (attn @ v_pt.transpose(1, 2)).transpose(1, 2)
@@ -302,7 +321,7 @@ def _test_fmha_style_b2b_bmm(
             if has_bias:
                 inputs["bias"] = bias_pt
             y = torch.empty(
-                [batch_size, m, num_heads, n1],
+                [batch_size, seq_len, num_head, n1],
                 dtype=torch_dtype,
                 device="cuda",
             )
@@ -321,12 +340,27 @@ def test_fmha_style_b2b_bmm_fp16(self):
             dtype="float16",
             batch_sizes=[3, 8, 10],
         )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_dynamic_seq_len",
+            dtype="float16",
+            seq_lens=[128, 256],
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_dynamic_seq_len_kv",
+            dtype="float16",
+            seq_lens_kv=[128, 256],
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_dynamic_num_heads",
+            dtype="float16",
+            num_heads=[1, 2],
+        )
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_rectangular",
             dtype="float16",
             batch_sizes=[2],
-            m=512,
-            n0=128,
+            seq_lens=512,
+            seq_lens_kv=128,
             n1=128,
         )
         self._test_fmha_style_b2b_bmm(
@@ -388,8 +422,8 @@ def test_fmha_style_b2b_bmm_fp16(self):
             bias_broadcast=[False, False, True, False],
             num_heads=2,
             use_fp16_acc=False,
-            m=512,
-            n0=512,
+            seq_lens=512,
+            seq_lens_kv=512,
         )
 
 
diff --git a/tests/unittest/ops/test_grouped_b2b_bmm.py b/tests/unittest/ops/test_grouped_b2b_bmm.py
index 49c1c5def..54b3396e6 100644
--- a/tests/unittest/ops/test_grouped_b2b_bmm.py
+++ b/tests/unittest/ops/test_grouped_b2b_bmm.py
@@ -53,7 +53,7 @@ def _test_grouped_fmha_style_b2b_bmm(
         max_seq_lens: Tuple[int, List[int]] = 256,
         head_dim=128,
         head_dim_value=256,
-        num_heads=1,
+        num_heads: Tuple[int, List[int]] = 1,
         has_bias=False,
         bias_broadcast=None,
         epilogue_math_name="Identity",
@@ -72,6 +72,8 @@ def _test_grouped_fmha_style_b2b_bmm(
             batch_sizes = [batch_sizes, batch_sizes]
         if isinstance(max_seq_lens, int):
             max_seq_lens = [max_seq_lens, max_seq_lens]
+        if isinstance(num_heads, int):
+            num_heads = [num_heads, num_heads]
         alpha0 = 1.0 / (head_dim**0.5)
         batch_size_dim = IntVar(
             values=[min(batch_sizes), max(batch_sizes)], name="batch_size"
@@ -79,6 +81,7 @@ def _test_grouped_fmha_style_b2b_bmm(
         max_seq_len_dim = shape_utils.gen_int_var_min_max(
             max_seq_lens, name="max_seq_len"
         )
+        num_heads_dim = shape_utils.gen_int_var_min_max(num_heads, name="num_heads")
         jagged_dims = [JaggedDim(min_value=0, max_value=max_seq_len_dim)]
         total_length_dim = IntVar(
             values=[0, batch_size_dim.upper_bound() * max_seq_len_dim.upper_bound()],
@@ -89,19 +92,19 @@ def _test_grouped_fmha_style_b2b_bmm(
             name="offset_length",
         )
         Q_dense = Tensor(
-            shape=[total_length_dim, num_heads, head_dim],
+            shape=[total_length_dim, num_heads_dim, head_dim],
             dtype=dtype,
             name="q",
             is_input=True,
         )
         K_dense = Tensor(
-            shape=[total_length_dim, num_heads, head_dim],
+            shape=[total_length_dim, num_heads_dim, head_dim],
             dtype=dtype,
             name="k",
             is_input=True,
         )
         V_dense = Tensor(
-            shape=[total_length_dim, num_heads, head_dim_value],
+            shape=[total_length_dim, num_heads_dim, head_dim_value],
             dtype=dtype,
             name="v",
             is_input=True,
@@ -122,7 +125,7 @@ def _test_grouped_fmha_style_b2b_bmm(
         )
         Bias = None
         if has_bias:
-            shape = [batch_size_dim, num_heads, max_seq_len_dim, max_seq_len_dim]
+            shape = [batch_size_dim, num_heads_dim, max_seq_len_dim, max_seq_len_dim]
             if bias_broadcast:
                 for i, broadcast in enumerate(bias_broadcast):
                     if broadcast:
@@ -139,7 +142,6 @@ def _test_grouped_fmha_style_b2b_bmm(
             alpha1=1.0,
             alpha1_divide_by_seq_len=alpha1_divide_by_seq_len,
             epilogue_math_name=epilogue_math_name,
-            num_heads=num_heads,
         )
         if copy_op:
             grouped_fmha_style_b2b_bmm_op = ops.grouped_fmha_style_b2b_bmm(
@@ -156,8 +158,8 @@ def _test_grouped_fmha_style_b2b_bmm(
         # Run tests.
         torch_dtype = string_to_torch_dtype(dtype)
         offsets_torch_dtype = string_to_torch_dtype(offsets_dtype)
-        for batch_size, max_seq_len in itertools.product(
-            sorted(set(batch_sizes)), sorted(set(max_seq_lens))
+        for batch_size, max_seq_len, num_head in itertools.product(
+            sorted(set(batch_sizes)), sorted(set(max_seq_lens)), sorted(set(num_heads))
         ):
             # Initialize inputs
             lengths = torch.randint(
@@ -169,15 +171,15 @@ def _test_grouped_fmha_style_b2b_bmm(
             total_length = offsets[-1]
             offsets_pt = offsets.cuda()
             q_pt = torch.rand(
-                (total_length, num_heads, head_dim), dtype=torch_dtype
+                (total_length, num_head, head_dim), dtype=torch_dtype
             ).cuda()
             k_pt = torch.rand(
-                (total_length, num_heads, head_dim), dtype=torch_dtype
+                (total_length, num_head, head_dim), dtype=torch_dtype
             ).cuda()
             v_pt = torch.rand(
-                (total_length, num_heads, head_dim_value), dtype=torch_dtype
+                (total_length, num_head, head_dim_value), dtype=torch_dtype
             ).cuda()
-            bias_shape = [batch_size, num_heads, max_seq_len, max_seq_len]
+            bias_shape = [batch_size, num_head, max_seq_len, max_seq_len]
             if bias_broadcast:
                 for i, broadcast in enumerate(bias_broadcast):
                     if broadcast:
@@ -194,7 +196,7 @@ def _test_grouped_fmha_style_b2b_bmm(
             if has_bias:
                 inputs["bias"] = bias_pt
             y = torch.empty(
-                [total_length, num_heads, head_dim_value],
+                [total_length, num_head, head_dim_value],
                 dtype=torch_dtype,
                 device="cuda",
             )
@@ -304,6 +306,11 @@ def test_grouped_fmha_style_b2b_bmm_fp16(self):
             num_heads=2,
             bias_broadcast=[True, True, True, False],
         )
+        self._test_grouped_fmha_style_b2b_bmm(
+            test_name="grouped_fmha_style_b2b_bmm_fp16_dynamic_multi_head",
+            dtype="float16",
+            num_heads=[2, 4],
+        )
         self._test_grouped_fmha_style_b2b_bmm(
             test_name="grouped_fmha_style_b2b_bmm_fp16_complex",
             dtype="float16",

From c360bf85d258e972e72becda4700613fc8ef7693 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 12 Apr 2023 10:23:19 -0700
Subject: [PATCH 406/638] Fix reduce ops with last input dim IntVar (#563)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/563

The `recude_*` ops seem to fail [this assertion](https://github.com/facebookincubator/AITemplate/blob/main/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py#L316) when the last input dimension is `IntVar`. The problem seems to be that the reduction axis is assumed to be -1 in the `_get_read_vector_type` function, even if it's actually not. Hence the check [here](https://github.com/facebookincubator/AITemplate/blob/main/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py#L413) against the actual reduction axis passes, but the subsequent aforementioned assertion fails.

This diff replaces the assertion by using the `input_type` as the `read_vector_type` if the last input dim is `IntVar`, as the `IntVar` reduction dim's value can be odd in the runtime. Instead of failing the assertion the code compilation successfully completes.

Reviewed By: chenyang78

Differential Revision: D44915126

fbshipit-source-id: 34a8d9b8f0b678468ed1e80f4ae56b34aafc1c5e
---
 .../backend/cuda/reduce/reduce_small_axis.py         |  7 ++++++-
 tests/unittest/ops/test_reduce.py                    | 12 ++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
index 14c7c8584..2db0f5524 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_small_axis.py
@@ -313,7 +313,12 @@ def _size_to_vector_type(sz_in_byte) -> str:
         raise NotImplementedError("Unsupported vector size: {}".format(sz_in_byte))
 
     reduction_axis = -1
-    assert isinstance(input_shape[reduction_axis], IntImm)
+
+    if not isinstance(input_shape[reduction_axis], IntImm):
+        # the last dimension is IntVar, so the best we can do in
+        # terms of the read vector type is the input_type iteself
+        return input_type
+
     rank = len(input_shape)
     reduction_dim_val = input_shape[reduction_axis]._attrs["values"][0]
     input_type_sz_in_bit = type_to_size_in_bit.get(input_type)
diff --git a/tests/unittest/ops/test_reduce.py b/tests/unittest/ops/test_reduce.py
index 00d9af551..ed0b1a0d2 100644
--- a/tests/unittest/ops/test_reduce.py
+++ b/tests/unittest/ops/test_reduce.py
@@ -99,7 +99,7 @@ def _run_reduce_sum(
         atol=1e-2,
     ):
         self._run_reduce(
-            test_name="reduce_sum",
+            test_name=f"reduce_sum_{input_type}_{output_type}",
             reduce_op=ops.reduce_sum,
             torch_reduce_op=torch.sum,
             dim=dim,
@@ -219,7 +219,7 @@ def _run_reduce_mean(
         output_type=None,
     ):
         self._run_reduce(
-            test_name="reduce_mean",
+            test_name=f"reduce_mean_{input_type}_{output_type}",
             reduce_op=ops.reduce_mean,
             torch_reduce_op=torch.mean,
             dim=dim,
@@ -231,7 +231,11 @@ def _run_reduce_mean(
 
     def test_reduce_mean(self):
         self._run_reduce_mean(
-            dim=0, input_shape=[1], keepdim=True, input_type="float16", output_type=None
+            dim=0,
+            input_shape=[1],
+            keepdim=True,
+            input_type="float16",
+            output_type=None,
         )
         self._run_reduce_mean(
             dim=1,
@@ -445,7 +449,7 @@ def _run_batched_reduce_sum(
         output_type=None,
     ):
         self._run_batched_reduce(
-            test_name="reduce_sum_batched",
+            test_name=f"reduce_sum_batched_{input_type}_{output_type}",
             reduce_op=ops.reduce_sum,
             torch_reduce_op=torch.sum,
             dim=dim,

From d5e65388613a86e7b8329903b044943a0b890fe5 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 12 Apr 2023 13:18:46 -0700
Subject: [PATCH 407/638] Build Cache CI Integration (#541)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/541

See T148695911

With D44229622 we could prove that it should be possible to speed up unit tests and therefore also CI runs considerably.

The task was to integrate the build cache with Sandcastle CI
in order to speed up our CI process.

For reference about considered options, tradeoffs and decision process:

Original design doc at https://docs.google.com/document/d/1GHuhIJ83CsS3hgB8bV53TDTIqavqpPl4guP_kDcWdII/edit
Final design review meeting slides & notes: https://docs.google.com/presentation/d/1bICc-OtCp1kgisL3SOCN7XYN4ZRn9a6JX62eMjFUI68/edit#slide=id.g1e0053f1f88_0_53

Implementation:

 [x] Created a Manifold-based build cache implementation
 [x] incorporated it into the non-OSS part of the codebase, similar to fb/detect_model.py in fb/build_cache.py
 [x] Sets TTL on stored objects. Resets this TTL on read (  asynchronously, no need to wait for this before continuing )
 [x]Archiving and storing of files to be cached happen asynchronously in order not to delay the tests.
 [x]Investigated whether we can get Manifold latency down by creating a new bucket with different settings ( did not work for me)

 Add features and config options to:

 [x] Disabled caching for a compile_model call, entire unit test or globally ( env var )

 [x]Disabled the build cache for profiling only ( env var )
Not use the cache with a certain probability (in order to keep the build system and cache under test)
I
 [x]Incorporated info from question on Manifold Users Workplace group, whether we can use the official Manifold Client for this usecase ( https://fb.workplace.com/groups/ManifoldUsers/permalink/1682913152123392/ )

(Unless we quickly get an answer, the first implementation should use the deprecated manifold client, because that is proven to work and safe in multiprocessing. )

 [x] Does not cache .obj files ( unneccessary, and takes up large amount of storage in many cases )

 [x] Added unit test ( mock Manifold client )

Reviewed By: ipiszy, aakhundov

Differential Revision: D44642328

fbshipit-source-id: 9d2ec65e953d7f513d4325a7d1cc834f1b5afb75
---
 python/aitemplate/backend/build_cache_base.py |  62 ++++++-
 python/aitemplate/backend/builder.py          |  13 +-
 python/aitemplate/utils/environ.py            |  36 ++++
 python/aitemplate/utils/io.py                 |  60 +++++--
 tests/unittest/backend/test_build_cache.py    | 169 ++++++++++++------
 .../compiler/test_compilation_failure.py      |  17 +-
 .../test_transform_permute_to_reshape.py      |   9 +-
 .../ops/test_conv_bias_act_few_channels.py    |   2 +-
 .../ops/test_conv_bias_add_hardswish.py       |   2 +-
 9 files changed, 276 insertions(+), 94 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index 03bef207f..ccb914c4b 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -16,6 +16,7 @@
 import hashlib
 import logging
 import os
+import random
 import secrets
 import shutil
 import tempfile
@@ -25,6 +26,8 @@
 from pathlib import Path
 from typing import Callable, List, Optional, Tuple
 
+from aitemplate.utils import environ as aitemplate_env
+
 from aitemplate.utils.io import file_age, touch
 
 _LOGGER = logging.getLogger(__name__)
@@ -57,7 +60,51 @@
 source_filename_prefixes = ["makefile"]
 
 # File extensions of files to be considered cache artifacts ( unless they are considered source files )
-cache_extensions = {"obj", "so", "dll", "exe", ""}
+# note: we're not caching .obj files anymore as these are not strictly necessary to keep.
+cache_extensions = {"so", "dll", "exe", ""}
+
+skip_cache_flag = False  # Global flag that cache implementations should check whether
+# the cache is enabled or not. Used by skip_build_cache decorator
+
+
+class SkipBuildCache:
+    def __init__(self, context_skip_cache_flag: bool = True):
+        """
+        Context manager to temporarily disable the build cache within an execution context.
+        """
+        self.context_skip_cache_flag = context_skip_cache_flag
+
+    def __enter__(self):
+        global skip_cache_flag
+        self.old_skip_cache_flag = skip_cache_flag
+        skip_cache_flag = self.context_skip_cache_flag
+
+    def __exit__(self, *args, **kwargs):
+        global skip_cache_flag
+        skip_cache_flag = self.old_skip_cache_flag
+
+
+def should_skip_build_cache():
+    """
+    This function should be called by cache implementations to determine whether the cache should be skipped or not
+    """
+    global skip_cache_flag
+    if skip_cache_flag:
+        return True
+    skip_percentage = aitemplate_env.ait_build_cache_skip_percentage()
+    if skip_percentage is not None:
+        skip_percentage = int(skip_percentage)
+        assert (
+            skip_percentage >= 0 and skip_percentage <= 100
+        ), f"Skip percentage has to be in the range [0,100]. Actual value: {skip_percentage}"
+        if skip_percentage == 100:
+            return True
+        if skip_percentage == 0:
+            return False
+        rndi = random.randint(0, 99)
+        if rndi < skip_percentage:
+            return True
+    return False
 
 
 def filename_norm_split(filename: str) -> Tuple[str, str]:
@@ -166,12 +213,13 @@ def create_dir_hash(
     Returns:
         str: SHA256 Hash of the build directory contents in the form of a hexdigest string.
     """
-
+    hash_log = None
     try:
-        hash_log = None
+        if not os.path.isdir(build_dir):
+            return "empty_dir"
         if debug:
             hash_log = open(  # noqa: P201 - this is actually closed properly in the finally close below
-                os.path.join(build_dir, "cache_key.log"), mode="w", encoding="utf8"
+                os.path.join(build_dir, "cache_key.log"), mode="a", encoding="utf8"
             )
             hash_log.write(f"Building dir hash of {build_dir}\n")
         basepath = Path(build_dir)
@@ -182,6 +230,8 @@ def create_dir_hash(
                 build_dir, "${BUILD_DIR}"
             )  # Make sure we can cache regardless of the build directory location.
             hash_object.update(_cmd.encode("utf-8"))
+            if debug:
+                hash_log.write(f"\tCOMMAND: {_cmd} -> {hash_object.hexdigest()}\n")
         for fpath in sorted(files):
             if not filter_func(str(fpath)):
                 continue
@@ -359,7 +409,9 @@ def retrieve_build_cache(
         from_sources_filter_func: Callable[[str], bool] = is_source,
     ) -> Tuple[bool, Optional[str]]:
         """See docstring of implemented method interface in parent class"""
-
+        if should_skip_build_cache():
+            _LOGGER.info(f"CACHE: Skipped build cache for {build_dir}")
+            return False, None
         self.maybe_cleanup(self.lru_retention_hours, self.cleanup_max_age_seconds)
         cache_dir = self.cache_dir
         dir_hash = create_dir_hash(
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 15a5e4b1a..66244b127 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -162,6 +162,10 @@ def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
         )
         try:
             out, err = proc.communicate(timeout)
+            if store_cache_key is not None:
+                build_cache.BUILD_CACHE.store_build_cache(
+                    cmds, build_dir, store_cache_key
+                )
         except subprocess.TimeoutExpired as e:
             proc.kill()
             out, err = proc.communicate()
@@ -179,8 +183,6 @@ def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
             else:
                 _LOGGER.debug(f"make stdout:\n\n{stdout}")
                 _LOGGER.debug(f"make stderr:\n\n{stderr}")
-        if store_cache_key is not None:
-            build_cache.BUILD_CACHE.store_build_cache(cmds, build_dir, store_cache_key)
 
 
 def process_task(task: Task) -> None:
@@ -816,7 +818,12 @@ def make_profilers(self, generated_profilers, workdir):
         make_clean_cmd = f" {make_path} {make_flags} clean "
         make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all "
         cmds = [make_clean_cmd, make_all_cmd]
-        _run_make_cmds(cmds, self._timeout, build_dir, allow_cache=True)
+        _run_make_cmds(
+            cmds,
+            self._timeout,
+            build_dir,
+            allow_cache=(not environ.ait_build_cache_skip_profiler()),
+        )
 
     def _gen_compiler_version_files(self, target_dir):
         # Write compiler version string(s) into build directory
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 739606684..8cd67b6e3 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -81,3 +81,39 @@ def ait_build_cache_dir() -> Optional[str]:
         or None if not set.
     """
     return os.environ.get("AIT_BUILD_CACHE_DIR", None)
+
+
+def ait_build_cache_skip_percentage() -> int:
+    """
+    When set to a non-empty string, and if AIT_BUILD_CACHE_DIR
+    is set, the build cache will be skipped randomly with
+    a probability correspinding to the specified percentage
+
+    Returns:
+        int: Integer value of AIT_BUILD_CACHE_SKIP_PERCENTAGE environment variable,
+        or 5 if not set.
+    """
+    return int(os.environ.get("AIT_BUILD_CACHE_SKIP_PERCENTAGE", "30"))
+
+
+def ait_build_cache_skip_profiler() -> bool:
+    """
+    boolean value of AIT_BUILD_CACHE_SKIP_PROFILER environment variable.
+    Will return True if that variable is not set, if it is equal to "0",
+    an empty string or "False" ( case insensitive ). Will return True
+    in all other cases.
+    """
+    ret = os.environ.get("AIT_BUILD_CACHE_SKIP_PROFILER", "1")
+    if ret is None or ret == "" or ret == "0" or ret.lower() == "false":
+        return False
+    return True
+
+
+def ait_build_cache_max_mb() -> int:
+    """
+    boolean value of AIT_BUILD_CACHE_MAX_MB environment variable.
+    This determines the maximum size of the artifact data to be cached
+    in MB. For larger (raw, uncompressed) data the build cache will
+    be skipped. Defaults to 30.
+    """
+    return int(os.environ.get("AIT_BUILD_CACHE_MAX_MB", "30"))
diff --git a/python/aitemplate/utils/io.py b/python/aitemplate/utils/io.py
index 91caa39eb..f6dc0e1f7 100644
--- a/python/aitemplate/utils/io.py
+++ b/python/aitemplate/utils/io.py
@@ -16,12 +16,15 @@
 Util functions to handle file or network io
 """
 import hashlib
+import logging
 import os
 import tarfile
 import time
-from io import BytesIO
+from io import BytesIO, FileIO
 from pathlib import Path
-from typing import Optional, Union
+from typing import BinaryIO, Callable, Optional, Union
+
+_LOGGER = logging.getLogger(__name__)
 
 
 def touch(file_path):
@@ -61,11 +64,26 @@ def file_age(file_path):
     return file_age_seconds
 
 
+def file_sizes(directory, filter_function=None):
+    total_size = 0
+    for root, _dirs, files in os.walk(directory):
+        for _file in files:
+            file_path = os.path.join(root, _file)
+            if filter_function is not None and filter_function(file_path):
+                total_size += os.path.getsize(file_path)
+
+    return total_size
+
+
 # Utility functions to be used by (not yet existing) distributed cache implementations
 # to minimize the amount of network roundtrips and network bandwidth needed
 
 
-def create_archive(directory_path: str, filter_func=None) -> bytes:
+def create_archive(
+    directory_path: str,
+    filter_func: Callable[[str], bool] = None,
+    output_file: Optional[str] = None,
+) -> Optional[bytes]:
     """Create tar.gz archive in-memory and return the archive contents as
     a bytes object.
 
@@ -74,14 +92,20 @@ def create_archive(directory_path: str, filter_func=None) -> bytes:
         filter_func (_type_, optional): A function which, being passed a filename,
                                         returns whether to include it or not.
                                         Defaults to None (include all).
+        output_file (str): Output filename to write the archive to. Usually it ends on .tar.gz.
+                           If set to None ( default), the archive will not be written to
+                           file but returned as a bytes object.
 
     Returns:
-        bytes: Archive contents as a bytes object.
+        Optional[bytes]: Archive contents as a bytes object if output_file was not None
     """
     # Archive files in a directory.
 
     # Create an in-memory bytes buffer
-    buffer = BytesIO()
+    if output_file is None:
+        buffer = BytesIO()
+    else:
+        buffer = FileIO(output_file, mode="w+")
 
     # Determine the appropriate compression mode
     compression_mode = None
@@ -94,9 +118,7 @@ def create_archive(directory_path: str, filter_func=None) -> bytes:
             for _file in files:
                 # Check if the file should be included based on the filter function
                 if filter_func is not None:
-                    file_basename = os.path.basename(_file)
-                    file_root, file_extension = os.path.splitext(_file)
-                    if not filter_func(file_basename, file_extension):
+                    if not filter_func(_file):
                         continue
 
                 # Calculate the relative path of the file
@@ -108,6 +130,9 @@ def create_archive(directory_path: str, filter_func=None) -> bytes:
                 archive.add(os.path.join(root, _file), arcname=relative_path)
 
     # Get the bytes from the buffer
+    if output_file is not None:
+        buffer.close()
+        return None
     buffer.seek(0)
     compressed_bytes = buffer.read()
 
@@ -115,22 +140,19 @@ def create_archive(directory_path: str, filter_func=None) -> bytes:
 
 
 def extract_archive(
-    archive_bytes: bytes, target_directory: str, overwrite: bool = False
+    archive_data: BinaryIO, target_directory: str, overwrite: bool = False
 ):
     """Extract a tar.gz archive (written for example via create_archive) from a bytes buffer
     into a target directory.
 
     Args:
-        archive_bytes (bytes): Byte contents of the tar.gz archive to be extracted.
+        archive_data (BinaryIO): BinaryIO object ( typicall BytesIO or FileIO ) of the tar.gz archive to be extracted.
         target_directory (str): Target directory to extract to.
         overwrite (bool, optional): Whether to overwrite files or not.
                                     If False, files will be silently skipped
                                     if they already exist. Defaults to False.
     """
-    # Create an in-memory bytes buffer
-    buffer = BytesIO(archive_bytes)
-
-    archive = tarfile.open(fileobj=buffer, mode="r:gz")
+    archive = tarfile.open(fileobj=archive_data, mode="r:gz")
 
     # Extract the archive contents into the target directory
     for member in archive.getmembers():
@@ -140,8 +162,14 @@ def extract_archive(
         # Check if the file or directory already exists
         if os.path.exists(target_path):
             if not overwrite:
+                _LOGGER.debug(
+                    f"extract_archive: Skipping extraction of file to {os.path.abspath(target_path)}: A file at that path already exists, and overwrite is not enabled."
+                )
                 continue
             else:
+                _LOGGER.debug(
+                    f"extract_archive: Replacing existing file at {os.path.abspath(target_path)} with file from archive."
+                )
                 os.remove(target_path)
 
         # Extract the file or directory from the archive
@@ -181,8 +209,8 @@ def copytree_with_hash(
         if not dst_path.is_dir():
             raise OSError("Target path exists and is not a directory.")
         dst_path = dst_path / src_path.name
-    hash_obj.update(dst_path.name.encode("utf-8"))
     if src_path.is_file():
+        hash_obj.update(dst_path.name.encode("utf-8"))
         # Copy the file to the destination
         with open(dst_path, "wb") as dst_file:
             with open(src_path, "rb") as src_file:
@@ -198,7 +226,7 @@ def copytree_with_hash(
     elif src_path.is_dir():
         # Recursively copy the directory contents
         os.makedirs(dst_path, exist_ok=True)
-        for sub_path in src_path.iterdir():
+        for sub_path in sorted(src_path.iterdir()):
             sub_dst_path = dst_path / sub_path.name
             copytree_with_hash(
                 sub_path, sub_dst_path, buffer_size, hash_obj, max_depth - 1
diff --git a/tests/unittest/backend/test_build_cache.py b/tests/unittest/backend/test_build_cache.py
index 1be768b14..dce49c9b0 100644
--- a/tests/unittest/backend/test_build_cache.py
+++ b/tests/unittest/backend/test_build_cache.py
@@ -18,23 +18,22 @@
 import tempfile
 import unittest
 from pathlib import Path
+from unittest.mock import patch
 
 import torch
 
-from aitemplate.backend import build_cache
 from aitemplate.backend.build_cache_base import (
     create_dir_hash,
     FileBasedBuildCache,
     is_source,
     makefile_normalizer,
-    NoBuildCache,
+    SkipBuildCache,
 )
 
 from aitemplate.backend.cuda.target_def import FBCUDA
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import filter_test_cases_by_test_env
 from aitemplate.utils.debug_settings import AITDebugSettings
 from aitemplate.utils.io import file_age
 
@@ -59,62 +58,79 @@ def _create_model_graph(self):
         Y._attrs["is_output"] = True
         return Y
 
-    def test_file_build_cache(self):
+    def _create_model_graph2(self):
+        dtype = "float32"
+        Z1 = Tensor(
+            shape=[IntImm(10), IntImm(1)],
+            dtype=dtype,
+            name="Z1",
+            is_input=True,
+        )
+        Y = ops.expand()(Z1, shape=(10, 10))
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        return Y
 
-        with tempfile.TemporaryDirectory() as parent_dir:
-            cache_dir = os.path.join(parent_dir, "build_cache")
-            shutil.rmtree(cache_dir, ignore_errors=True)
-            cache = FileBasedBuildCache(
-                cache_dir,
-                lru_retention_hours=0,
-                cleanup_max_age_seconds=1000,
-                debug=True,
-            )
-            cache.maybe_cleanup()
-            assert os.path.exists(cache_dir + "/.last_cleaned")
-            assert (
-                file_age(cache_dir + "/.last_cleaned") < 10.0
-            ), "Last clean time should  than 10 seconds"
+    def test_file_build_cache(self):
+        with patch(
+            "aitemplate.backend.build_cache_base.should_skip_build_cache"
+        ) as should_skip_build_cache_mock:
+            should_skip_build_cache_mock.return_value = False
+            with tempfile.TemporaryDirectory() as parent_dir:
+                cache_dir = os.path.join(parent_dir, "build_cache")
+                shutil.rmtree(cache_dir, ignore_errors=True)
+                cache = FileBasedBuildCache(
+                    cache_dir,
+                    lru_retention_hours=0,
+                    cleanup_max_age_seconds=1000,
+                    debug=True,
+                )
+                cache.maybe_cleanup()
+                assert os.path.exists(cache_dir + "/.last_cleaned")
+                assert (
+                    file_age(cache_dir + "/.last_cleaned") < 10.0
+                ), "Last clean time should  than 10 seconds"
 
-            build_dir_1 = os.path.join(parent_dir, "build_1")
-            build_dir_2 = os.path.join(parent_dir, "build_2")
+                build_dir_1 = os.path.join(parent_dir, "build_1")
+                build_dir_2 = os.path.join(parent_dir, "build_2")
 
-            os.makedirs(build_dir_1, exist_ok=False)
-            os.makedirs(build_dir_2, exist_ok=False)
-            for build_dir in [build_dir_1, build_dir_2]:
-                bp = Path(build_dir)
-                (bp / "Makefile").write_text("test.exe: test.cu")
-                (bp / "test.cu").write_text("printf('Hello, World!');")
-            assert create_dir_hash(
-                [f"make {build_dir_1}"], build_dir_1
-            ) == create_dir_hash([f"make {build_dir_2}"], build_dir_2)
-            found_entry1, cache_key1 = cache.retrieve_build_cache(
-                [f"make {build_dir_1}"], build_dir_1
-            )
-            found_entry2, cache_key2 = cache.retrieve_build_cache(
-                [f"make {build_dir_2}"], build_dir_2
-            )
-            assert not found_entry1
-            assert not found_entry2
-            assert cache_key1 == cache_key2
-            assert cache_key1 == create_dir_hash([f"make {build_dir_1}"], build_dir_1)
-            (Path(build_dir_2) / "test.obj").write_bytes("ELF1234".encode("ascii"))
-            cache.store_build_cache([f"make {build_dir_2}"], build_dir_2, cache_key2)
-            assert os.path.exists(os.path.join(cache_dir, cache_key2))
-            found_entry1, cache_key1 = cache.retrieve_build_cache(
-                [f"make {build_dir_1}"], build_dir_1
-            )
-            assert os.path.exists(os.path.join(build_dir_1, "test.obj"))
-            assert (
-                Path(os.path.join(build_dir_1, "test.obj")).read_bytes()
-                == Path(os.path.join(build_dir_2, "test.obj")).read_bytes()
-            )
+                os.makedirs(build_dir_1, exist_ok=False)
+                os.makedirs(build_dir_2, exist_ok=False)
+                for build_dir in [build_dir_1, build_dir_2]:
+                    bp = Path(build_dir)
+                    (bp / "Makefile").write_text("test.exe: test.cu")
+                    (bp / "test.cu").write_text("printf('Hello, World!');")
+                assert create_dir_hash(
+                    [f"make {build_dir_1}"], build_dir_1
+                ) == create_dir_hash([f"make {build_dir_2}"], build_dir_2)
+                found_entry1, cache_key1 = cache.retrieve_build_cache(
+                    [f"make {build_dir_1}"], build_dir_1
+                )
+                found_entry2, cache_key2 = cache.retrieve_build_cache(
+                    [f"make {build_dir_2}"], build_dir_2
+                )
+                assert not found_entry1
+                assert not found_entry2
+                assert cache_key1 == cache_key2
+                assert cache_key1 == create_dir_hash(
+                    [f"make {build_dir_1}"], build_dir_1
+                )
+                (Path(build_dir_2) / "test.so").write_bytes("ELF1234".encode("ascii"))
+                cache.store_build_cache(
+                    [f"make {build_dir_2}"], build_dir_2, cache_key2
+                )
+                assert os.path.exists(os.path.join(cache_dir, cache_key2))
+                found_entry1, cache_key1 = cache.retrieve_build_cache(
+                    [f"make {build_dir_1}"], build_dir_1
+                )
+                assert os.path.exists(os.path.join(build_dir_1, "test.so"))
+                assert (
+                    Path(os.path.join(build_dir_1, "test.so")).read_bytes()
+                    == Path(os.path.join(build_dir_2, "test.so")).read_bytes()
+                )
 
     def test_deterministic_codegen(self, dtype="float32"):
-        old_build_cache = build_cache.BUILD_CACHE
-        try:
-            build_cache.BUILD_CACHE = NoBuildCache()
-
+        with SkipBuildCache():
             # Tests, whether repeated invocation of compilation results in identical generated source files
             test_name = "test_deterministic_codegen"
             basepath = "./tmp"
@@ -151,6 +167,7 @@ def test_deterministic_codegen(self, dtype="float32"):
             hash1 = create_dir_hash(
                 ["test_name"], build_dir + "_1", is_source, debug=True
             )
+            cache_key_log_1 = (Path(build_dir + "_1") / "cache_key.log").read_text()
             Y = self._create_model_graph()
             target = detect_target()
             # Variant 2: Clean build
@@ -165,9 +182,11 @@ def test_deterministic_codegen(self, dtype="float32"):
             hash2 = create_dir_hash(
                 ["test_name"], build_dir + "_2", is_source, debug=True
             )
+            cache_key_log_2 = (Path(build_dir + "_2") / "cache_key.log").read_text()
+
             assert (
                 hash1 == hash2
-            ), "Code generation was not deterministic. Cache key mismatch between first and second code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)"
+            ), f"Code generation was not deterministic. Cache key mismatch between first and second code generation pass. Hint: Debug this with the help of the debug option of function create_dir_hash(...)\nLOG 1:\n{cache_key_log_1}\n------\nLOG 1:\n{cache_key_log_2}\n-----"
             # Variant 3: Build over existing build dir
             Y = self._create_model_graph()
             target = detect_target()
@@ -257,8 +276,6 @@ def test_deterministic_codegen(self, dtype="float32"):
             assert (
                 hash8 != hash1
             ), "Directory hash was not sensitive to a change of Makefile (standalone codegen) and possibly source code, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
-        finally:
-            build_cache.BUILD_CACHE = old_build_cache
 
     def test_makefile_rewrite(self):
         tmpdir = os.path.join(tempfile.gettempdir(), f"{os.getuid()}_aitemplate_tmp")
@@ -272,8 +289,44 @@ def test_makefile_rewrite(self):
         assert tmpdir not in rewritten_makefile
         assert "$USER" in rewritten_makefile
 
+    def test_repeated_build_dir_usage(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            Y = self._create_model_graph()
+            target = detect_target()
+            debug_settings = AITDebugSettings(gen_standalone=False)
+            dll_name = "test.so"
+            build_dir = Path(tempdir) / "build_dir"
+            compile_model(
+                Y,
+                target,
+                tempdir,
+                "build_dir",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            test_so_path = build_dir / "test.so"
+            assert test_so_path.exists()
+            test_so_content = test_so_path.read_bytes()
+
+            # Compile a slightly different model into the same directory
+            Y2 = self._create_model_graph2()
+            debug_settings = AITDebugSettings(gen_standalone=False)
+            compile_model(
+                Y2,
+                target,
+                tempdir,
+                "build_dir",
+                dll_name=dll_name,
+                debug_settings=debug_settings,
+            )
+            assert test_so_path.exists()
+            test_so_new_content = test_so_path.read_bytes()
+            self.assertNotEqual(
+                test_so_content,
+                test_so_new_content,
+                "The test.so should have been overwritten and different. Maybe a build cache was used and did not overwrite the file properly?",
+            )
 
-filter_test_cases_by_test_env(BuildCacheTestCase)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_compilation_failure.py b/tests/unittest/compiler/test_compilation_failure.py
index 23a968134..6c2d7f56f 100644
--- a/tests/unittest/compiler/test_compilation_failure.py
+++ b/tests/unittest/compiler/test_compilation_failure.py
@@ -17,6 +17,7 @@
 from unittest.mock import patch
 
 import jinja2
+from aitemplate.backend.build_cache_base import SkipBuildCache
 
 from aitemplate.compiler import compile_model, ops
 from aitemplate.compiler.base import DynamicProfileStrategy
@@ -67,14 +68,14 @@ def _test_compilation_failure(
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-
-        compile_model(
-            Y,
-            target,
-            f"./tmp/{test_name}",
-            test_name,
-            dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
-        )
+        with SkipBuildCache():
+            compile_model(
+                Y,
+                target,
+                f"./tmp/{test_name}",
+                test_name,
+                dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
+            )
 
     def test_compilation_failure_profiler(self):
         target = detect_target().name()
diff --git a/tests/unittest/compiler/test_transform_permute_to_reshape.py b/tests/unittest/compiler/test_transform_permute_to_reshape.py
index 8b846b9ac..c72aa147b 100644
--- a/tests/unittest/compiler/test_transform_permute_to_reshape.py
+++ b/tests/unittest/compiler/test_transform_permute_to_reshape.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import re
 import unittest
 
 import torch
@@ -38,11 +39,15 @@ def _generate_model_name(shape, permutation, is_reshape, dtype, is_complex):
         [
             ("test_permute_complex" if is_complex else "test_permute"),
             ("to_reshape" if is_reshape else "not_to_reshape"),
-            "x".join([str(s) for s in shape]),
-            "".join([str(s) for s in permutation]),
+            "x".join([str(s) for s in shape]),  #  these  can contain characters
+            "".join([str(s) for s in permutation]),  #  unsafe for usage in filenames
             dtype,
         ]
     )
+    # replace non-alphanumeric characters with underscores
+    # The ^ within the [^a-zA-Z0-9_] is a negation of the
+    # character class so it matches every character not in that class,
+    model_name = re.sub(r"[^a-zA-Z0-9_]", "_", model_name)
     return model_name
 
 
diff --git a/tests/unittest/ops/test_conv_bias_act_few_channels.py b/tests/unittest/ops/test_conv_bias_act_few_channels.py
index 2511e6883..f886b7fd1 100644
--- a/tests/unittest/ops/test_conv_bias_act_few_channels.py
+++ b/tests/unittest/ops/test_conv_bias_act_few_channels.py
@@ -106,7 +106,7 @@ def test_relu(self, dtype):
         )
         self._test_conv_bias_relu_few_channels(
             copy_op=True,
-            test_name="conv_bias_relu_few_channels_{dtype}_copy_op",
+            test_name=f"conv_bias_relu_few_channels_{dtype}_copy_op",
             dtype=dtype,
         )
 
diff --git a/tests/unittest/ops/test_conv_bias_add_hardswish.py b/tests/unittest/ops/test_conv_bias_add_hardswish.py
index a769366ec..48ba7fd2b 100644
--- a/tests/unittest/ops/test_conv_bias_add_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_add_hardswish.py
@@ -112,7 +112,7 @@ def test_conv_bias_add_hardswish(self, dtype):
         )
         self._test_conv_bias_add_hardswish(
             copy_op=True,
-            test_name="conv2d_bias_add_hardswish_{dtype}_copy_op",
+            test_name=f"conv2d_bias_add_hardswish_{dtype}_copy_op",
             dtype=dtype,
         )
 

From 1517314eeacfbe382281b952eee880748bf18bcb Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 12 Apr 2023 14:45:49 -0700
Subject: [PATCH 408/638] More robust cutlass include dir generation in FBCUDA
 (#565)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/565

There were reports of corrupted CUTLASS include directories which led to build failures which could only be resolved by manually deleting a directory generated by the FBCUDA target below /tmp. This fix attempts to make the corresponding logic more robust against edge cases and errors, as well as fail early if assertions are violated.

Reviewed By: aakhundov

Differential Revision: D44918599

fbshipit-source-id: e02e8f272ac8c625522c069a98a679383bbff883
---
 python/aitemplate/backend/cuda/target_def.py | 82 ++++++++++++++++----
 1 file changed, 67 insertions(+), 15 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 6162518ec..898875648 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -15,6 +15,7 @@
 """
 CUDA target specialization
 """
+import errno
 import hashlib
 import json
 import logging
@@ -50,6 +51,8 @@
 
 _LOGGER = logging.getLogger(__name__)
 
+_NUM_DIR_CREATE_ATTEMPTS = 20
+
 
 class CUDA(Target):
     """CUDA target."""
@@ -190,9 +193,14 @@ def comp_func(name):
 class FBCUDA(CUDA):
     """FBCUDA target. Used in Meta internal env only."""
 
+    # @TODO: instead of using multiple class properties
+    # which can go out of sync, we should refactor this
+    # to use a proper singleton instance that can be returned by detect_target
+
     nvcc_option_json = None
     cutlass_path_ = None
     compile_options_ = None
+    include_path_ = None
 
     def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         from libfb.py import parutil
@@ -214,31 +222,52 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             self.tmp_path = self.tmp_path = os.path.join(
                 tempfile.gettempdir(), f"{secrets.token_hex(16)}_aitemplate_tmp"
             )
-        if not FBCUDA.cutlass_path_:
+        if FBCUDA.cutlass_path_ is None:
+            FBCUDA.compile_options_ = None  # If we rebuild the cutlass path
+            # we also need to rebuild the compile options
             # Copy all of the includes over into an include directory
+
+            os.makedirs(self.tmp_path, exist_ok=True)
+            # find an unused random temporary directory within our base tmp_path
             random_key = secrets.token_hex(16)
+            errcount = 0
+            while True:
+                try:
+                    os.makedirs(os.path.join(self.tmp_path, random_key), exist_ok=False)
+                    break
+                except OSError as error:
+                    errcount += 1
+                    if errcount > _NUM_DIR_CREATE_ATTEMPTS:
+                        raise OSError(
+                            f"Failed to create user-specific temp directory path below {self.tmp_path}. Giving up."
+                        ) from error
+                    if error.errno != errno.EEXIST:
+                        raise
+                    else:
+                        random_key = secrets.token_hex(16)
+
             # the random_key part of this path will later be renamed to the content hash
-            self._include_path = os.path.join(
+            _tmp_include_path = os.path.join(
                 self.tmp_path,
                 random_key,
                 "includes",
             )
             includes_content_hash = hashlib.sha256()
-            FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
-            self.cub_path_ = self._include_path + "/cub"
+            _tmp_cutlass_path_ = os.path.join(_tmp_include_path, "cutlass")
+            _tmp_cub_path_ = os.path.join(_tmp_include_path, "cub")
             # copy recursively, and update a content hash in one go
             copytree_with_hash(
-                cutlass_src_path, FBCUDA.cutlass_path_, hash=includes_content_hash
+                cutlass_src_path, _tmp_cutlass_path_, hash=includes_content_hash
             )
-            copytree_with_hash(cub_src_path, self.cub_path_, hash=includes_content_hash)
+            copytree_with_hash(cub_src_path, _tmp_cub_path_, hash=includes_content_hash)
             attention_src_path = parutil.get_dir_path(
                 "aitemplate/AITemplate/python/aitemplate/backend/cuda/attention/src"
             )
-            attention_include_path = self._include_path + "/att_include"
+            attention_include_path = os.path.join(_tmp_include_path, "att_include")
             copytree_with_hash(
                 attention_src_path, attention_include_path, hash=includes_content_hash
             )
-            ait_static_include_path = self._include_path + "/static"
+            ait_static_include_path = os.path.join(_tmp_include_path, "static")
             copytree_with_hash(
                 static_files_path + "/include/kernels",
                 ait_static_include_path,
@@ -255,17 +284,36 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             # if it already exists, we don't want to overwrite it
             # we can just delete our copy.
             try:
+                if os.path.exists(new_path):
+                    # new version should replace old version. But this replacement
+                    # should happen ideally atomically. renames are much faster than
+                    # a recursive delete.
+                    os.rename(new_path, old_path + ".bak")
                 os.rename(old_path, new_path)
-            except OSError:
+            except OSError as e:
                 # target directory with identical contents already exists
-                shutil.rmtree(old_path)  # No need to keep out copy
-
+                _LOGGER.error(
+                    f"FBCUDA: Rename of old {old_path} to {new_path} failed.",
+                    exc_info=e,
+                )
+            try:
+                if os.path.exists(old_path):
+                    shutil.rmtree(old_path)
+            except OSError:
+                pass
+            try:
+                if os.path.exists(old_path + ".bak"):
+                    shutil.rmtree(old_path + ".bak")
+            except OSError:
+                pass
             # set the include paths to the final variant
             self._include_path = os.path.join(new_path, "includes")
-            self.cub_path_ = self._include_path + "/cub"
-            FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
+            self.cub_path_ = os.path.join(self._include_path, "cub")
+            FBCUDA.include_path_ = self._include_path
+            FBCUDA.cutlass_path_ = os.path.join(self._include_path, "cutlass")
 
         self.cutlass_path_ = FBCUDA.cutlass_path_
+        self._include_path = FBCUDA.include_path_
 
         cutlass_lib_path = parutil.get_dir_path(
             "aitemplate/AITemplate/python/aitemplate/utils/mk_cutlass_lib"
@@ -274,7 +322,7 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
 
         if not FBCUDA.nvcc_option_json:
             convert_nvcc_json = parutil.get_file_path(
-                os.path.join("aitemplate/testing", "convert_nvcc_cmd")
+                os.path.join("aitemplate", "testing", "convert_nvcc_cmd")
             )
             _LOGGER.info(f"Load the nvcc compile option from {convert_nvcc_json}")
             with open(convert_nvcc_json, "r") as nvcc_option_json:
@@ -285,7 +333,10 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         super().__init__(self.cutlass_path_, static_files_path, arch, **kwargs)
 
     def _build_compile_options(self):
-        if not FBCUDA.compile_options_:
+        if FBCUDA.compile_options_ is None:
+            assert self._template_path is not None
+            assert self._include_path is not None
+            assert self.cutlass_path_ is not None
             cutlass_path = [
                 os.path.join(self._template_path, "include"),
                 os.path.join(self._template_path, "tools/util/include"),
@@ -332,6 +383,7 @@ def _build_compile_options(self):
                 options.append("-DNDEBUG")
             FBCUDA.compile_options_ = " ".join(options)
         compile_options = FBCUDA.compile_options_
+        assert compile_options is not None
         _LOGGER.info(f"The compile options are: {compile_options}")
         return compile_options
 

From c9f0f4dd791d9b682f51b5b09fdbef1914e88c50 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Wed, 12 Apr 2023 22:14:29 -0700
Subject: [PATCH 409/638] add conv1d op (#562)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/562

conv1d can be expressed in terms of conv2d, so I didn't introduce any new kernel, but customized conv2d kernel generation

Reviewed By: terrychenism

Differential Revision: D44894688

fbshipit-source-id: c6e1d8894498302cf43bfe8c07ee9779b94fe3d2
---
 .../aitemplate/backend/cuda/conv2d/common.py  | 114 +++++++++++++-----
 python/aitemplate/backend/profiler_cache.py   |  45 ++++---
 .../compiler/ops/conv/cache_entry.py          |  18 ++-
 python/aitemplate/compiler/ops/conv/conv2d.py |  71 ++++++-----
 .../compiler/ops/conv/transposed_conv2d.py    |  16 ++-
 python/aitemplate/frontend/nn/__init__.py     |   1 +
 python/aitemplate/frontend/nn/conv1d.py       |  90 ++++++++++++++
 tests/unittest/ops/test_conv.py               |  91 +++++++++++++-
 8 files changed, 351 insertions(+), 95 deletions(-)
 create mode 100644 python/aitemplate/frontend/nn/conv1d.py

diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 04a7e2c69..67d27ebbb 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -150,9 +150,12 @@
     int64_t* out_batch,
     int64_t* out_h,
     int64_t* out_w,
-    int stride,
-    int dilation,
-    int pad,
+    int strideh,
+    int dilationh,
+    int padh,
+    int stridew,
+    int dilationw,
+    int padw,
     cudaStream_t stream
   ) {
 
@@ -193,9 +196,9 @@
 {% else %}
     {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch},  // cutlass::Tensor4DCoord filter_size
 {% endif %}
-    {pad, pad, pad, pad},                                 // cutlass::Tensor4DCoord padding
-    {stride, stride},                                     // cutlass::MatrixCoord stride
-    {dilation, dilation},                                 // cutlass::MatrixCoord dilation
+    {padh, padh, padw, padw},                                 // cutlass::Tensor4DCoord padding
+    {strideh, stridew},                                     // cutlass::MatrixCoord stride
+    {dilationh, dilationw},                                 // cutlass::MatrixCoord dilation
 {% if is_transpose %}
     {i32_batch, i32_in_h, i32_in_w, i32_in_ch},           // cutlass::Tensor4DCoord output_size
 {% else %}
@@ -232,9 +235,12 @@
 {{indent}}      {{no}},
 {{indent}}      {{ho}},
 {{indent}}      {{wo}},
-{{indent}}      {{stride}},
-{{indent}}      {{dilation}},
-{{indent}}      {{pad}},
+{{indent}}      {{strideh}},
+{{indent}}      {{dilationh}},
+{{indent}}      {{padh}},
+{{indent}}      {{stridew}},
+{{indent}}      {{dilationw}},
+{{indent}}      {{padw}},
 {{indent}}      global_workspace_,
 {{indent}}      stream
 {{indent}}    );
@@ -269,6 +275,9 @@
   int,
   int,
   int,
+  int,
+  int,
+  int,
   uint8_t*,
   cudaStream_t
 );
@@ -290,9 +299,12 @@
   int64_t NO,
   int64_t HO,
   int64_t WO,
-  int stride,
-  int dilation,
-  int pad,
+  int strideh,
+  int dilationh,
+  int padh,
+  int stridew,
+  int dilationw,
+  int padw,
   uint8_t* global_workspace_,
   cudaStream_t stream
 ) {
@@ -367,9 +379,12 @@
   int64_t kernel_h = std::stoi(argv[5]);
   int64_t kernel_w = std::stoi(argv[6]);
   int64_t out_ch = std::stoi(argv[7]);
-  int stride = std::stoi(argv[8]);
-  int pad = std::stoi(argv[9]);
-  int dilation = std::stoi(argv[10]);
+  int strideh = std::stoi(argv[8]);
+  int padh = std::stoi(argv[9]);
+  int dilationh = std::stoi(argv[10]);
+  int stridew = std::stoi(argv[11]);
+  int padw = std::stoi(argv[12]);
+  int dilationw = std::stoi(argv[13]);
 
 {{shape_func}}
 
@@ -411,6 +426,9 @@
   int,
   int,
   int,
+  int,
+  int,
+  int,
   cudaStream_t
 );
 """
@@ -439,9 +457,12 @@
 {{indent}}    {{p_out_batch}},
 {{indent}}    {{p_out_h}},
 {{indent}}    {{p_out_w}},
-{{indent}}    {{stride}},
-{{indent}}    {{dilation}},
-{{indent}}    {{pad}},
+{{indent}}    {{strideh}},
+{{indent}}    {{dilationh}},
+{{indent}}    {{padh}},
+{{indent}}    {{stridew}},
+{{indent}}    {{dilationw}},
+{{indent}}    {{padw}},
 {{indent}}    stream
 {{indent}});
 """
@@ -662,9 +683,12 @@ def gen_profiler(
             p_out_batch="&NO",
             p_out_h="&HO",
             p_out_w="&WO",
-            stride="stride",
-            dilation="dilation",
-            pad="pad",
+            strideh="strideh",
+            dilationh="dilationh",
+            padh="padh",
+            stridew="stridew",
+            dilationw="dilationw",
+            padw="padw",
         )
         benchmark = BENCHMARK_TEMPLATE.render(
             is_bias=is_bias,
@@ -695,9 +719,12 @@ def gen_profiler(
             no="NO",
             ho="HO",
             wo="WO",
-            stride="stride",
-            dilation="dilation",
-            pad="pad",
+            strideh="SH",
+            dilationh="DH",
+            padh="PH",
+            stridew="SW",
+            dilationw="DW",
+            padw="PW",
         )
         benchmark_instances.append(benchmark_instance)
 
@@ -717,9 +744,12 @@ def gen_profiler(
         w_dim0="out_ch",
         w_dim1="kernel_h",
         w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="strideh",
+        dilateh="dilationh",
+        padh="padh",
+        stridew="stridew",
+        dilatew="dilationw",
+        padw="padw",
     )
     profiler_main_code = PROFILER_MAIN_TEMPLATE.render(
         shape_func=shape_func,
@@ -798,9 +828,12 @@ def gen_function(
         w_dim0="*out_ch",
         w_dim1="*kernel_h",
         w_dim2="*kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="strideh",
+        dilateh="dilationh",
+        padh="padh",
+        stridew="stridew",
+        dilatew="dilationw",
+        padw="padw",
         div="/",
     )
     shape_save_func = shape_save_template.render(
@@ -905,9 +938,24 @@ def gen_function_call(
         p_out_batch="&" + yshape[0]._attrs["name"],
         p_out_h="&" + yshape[1]._attrs["name"],
         p_out_w="&" + yshape[2]._attrs["name"],
-        stride=func_attrs["stride"],
-        dilation=func_attrs["dilate"],
-        pad=func_attrs["pad"],
+        strideh=func_attrs["stride"]
+        if isinstance(func_attrs["stride"], int)
+        else func_attrs["stride"][0],
+        dilationh=func_attrs["dilate"]
+        if isinstance(func_attrs["dilate"], int)
+        else func_attrs["dilate"][0],
+        padh=func_attrs["pad"]
+        if isinstance(func_attrs["pad"], int)
+        else func_attrs["pad"][0],
+        stridew=func_attrs["stride"]
+        if isinstance(func_attrs["stride"], int)
+        else func_attrs["stride"][1],
+        dilationw=func_attrs["dilate"]
+        if isinstance(func_attrs["dilate"], int)
+        else func_attrs["dilate"][1],
+        padw=func_attrs["pad"]
+        if isinstance(func_attrs["pad"], int)
+        else func_attrs["pad"][1],
         indent=indent,
     )
 
diff --git a/python/aitemplate/backend/profiler_cache.py b/python/aitemplate/backend/profiler_cache.py
index 89da74a68..e2170ba04 100644
--- a/python/aitemplate/backend/profiler_cache.py
+++ b/python/aitemplate/backend/profiler_cache.py
@@ -147,9 +147,12 @@ class CacheMode(enum.Enum):
   kh INTEGER NOT NULL,
   kw INTEGER NOT NULL,
   co INTEGER NOT NULL,
-  stride INTEGER NOT NULL,
-  pad INTEGER NOT NULL,
-  dilate INTEGER NOT NULL,
+  strideh INTEGER NOT NULL,
+  padh INTEGER NOT NULL,
+  dilateh INTEGER NOT NULL,
+  stridew INTEGER NOT NULL,
+  padw INTEGER NOT NULL,
+  dilatew INTEGER NOT NULL,
   op_type VARCHAR(512) NOT NULL,
   epilogue VARCHAR(512) NOT NULL,
   device VARCHAR(16) NOT NULL,
@@ -178,9 +181,12 @@ class CacheMode(enum.Enum):
 kh={{kh}} AND
 kw={{kw}} AND
 co={{co}} AND
-stride={{stride}} AND
-pad={{pad}} AND
-dilate={{dilate}} AND
+strideh={{strideh}} AND
+padh={{padh}} AND
+dilateh={{dilateh}} AND
+stridew={{stridew}} AND
+padw={{padw}} AND
+dilatew={{dilatew}} AND
 op_type='{{op_type}}' AND
 device='{{device}}' AND
 epilogue={{epilogue}} AND
@@ -204,9 +210,12 @@ class CacheMode(enum.Enum):
     kh,
     kw,
     co,
-    stride,
-    pad,
-    dilate,
+    strideh,
+    padh,
+    dilateh,
+    stridew,
+    padw,
+    dilatew,
     op_type,
     epilogue,
     device,
@@ -227,9 +236,12 @@ class CacheMode(enum.Enum):
     {{kh}},
     {{kw}},
     {{co}},
-    {{stride}},
-    {{pad}},
-    {{dilate}},
+    {{strideh}},
+    {{padh}},
+    {{dilateh}},
+    {{stridew}},
+    {{padw}},
+    {{dilatew}},
     '{{op_type}}',
     {{epilogue}},
     '{{device}}',
@@ -818,9 +830,12 @@ def insert_conv(self, args: Dict[str, Any]) -> None:
             kh=args["kh"],
             kw=args["kw"],
             co=args["co"],
-            stride=args["stride"],
-            pad=args["pad"],
-            dilate=args["dilate"],
+            strideh=args["strideh"],
+            padh=args["padh"],
+            dilateh=args["dilateh"],
+            stridew=args["stridew"],
+            padw=args["padw"],
+            dilatew=args["dilatew"],
             op_type=args["op_type"],
             device=args["device"],
             epilogue=args["epilogue"],
diff --git a/python/aitemplate/compiler/ops/conv/cache_entry.py b/python/aitemplate/compiler/ops/conv/cache_entry.py
index 5f08fe215..efe4b58e0 100644
--- a/python/aitemplate/compiler/ops/conv/cache_entry.py
+++ b/python/aitemplate/compiler/ops/conv/cache_entry.py
@@ -34,9 +34,12 @@ class ConvQueryEntry:
     kh: int
     kw: int
     co: int
-    stride: int
-    pad: int
-    dilate: int
+    strideh: int
+    stridew: int
+    padh: int
+    padw: int
+    dilateh: int
+    dilatew: int
     op_type: str
     device: str
     epilogue: int
@@ -60,9 +63,12 @@ class ConvRecordEntry:
     kh: int
     kw: int
     co: int
-    stride: int
-    pad: int
-    dilate: int
+    strideh: int
+    stridew: int
+    padh: int
+    padw: int
+    dilateh: int
+    dilatew: int
     op_type: str
     epilogue: int
     device: str
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 98e93f36d..b620e8f3b 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -58,12 +58,12 @@
 {{indent}}{{dtype}}CO = {{w_dim0}};
 {{indent}}{{dtype}}KH = {{w_dim1}};
 {{indent}}{{dtype}}KW = {{w_dim2}};
-{{indent}}{{dtype}}SH = {{stride}};
-{{indent}}{{dtype}}SW = {{stride}};
-{{indent}}{{dtype}}DH = {{dilate}};
-{{indent}}{{dtype}}DW = {{dilate}};
-{{indent}}{{dtype}}PH = {{pad}};
-{{indent}}{{dtype}}PW = {{pad}};
+{{indent}}{{dtype}}SH = {{strideh}};
+{{indent}}{{dtype}}SW = {{stridew}};
+{{indent}}{{dtype}}DH = {{dilateh}};
+{{indent}}{{dtype}}DW = {{dilatew}};
+{{indent}}{{dtype}}PH = {{padh}};
+{{indent}}{{dtype}}PW = {{padw}};
 {{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
 {{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
 {{indent}}{{dtype}}NO = NI;
@@ -193,16 +193,36 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
         self.exec_dyn_key_template = EXEC_DYN_KEY_TEMPLATE
         self.exec_cond_template = EXEC_COND_TEMPLATE
 
+    def _get_params_factory(self):
+        params_factory = {}
+        if isinstance(self._attrs["stride"], int):
+            params_factory["strideh"] = self._attrs["stride"]
+            params_factory["stridew"] = self._attrs["stride"]
+        else:
+            params_factory["strideh"] = self._attrs["stride"][0]
+            params_factory["stridew"] = self._attrs["stride"][1]
+        if isinstance(self._attrs["pad"], int):
+            params_factory["padh"] = self._attrs["pad"]
+            params_factory["padw"] = self._attrs["pad"]
+        else:
+            params_factory["padh"] = self._attrs["pad"][0]
+            params_factory["padw"] = self._attrs["pad"][1]
+        if isinstance(self._attrs["dilate"], int):
+            params_factory["dilateh"] = self._attrs["dilate"]
+            params_factory["dilatew"] = self._attrs["dilate"]
+        else:
+            params_factory["dilateh"] = self._attrs["dilate"][0]
+            params_factory["dilatew"] = self._attrs["dilate"][1]
+        return params_factory
+
     def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
         if x[3] != w[3] * self._attrs["group"]:
             raise RuntimeError("X/W Shape mismatch for conv2d")
+
         eval_func = self.shape_eval_template.render(
             indent="",
             dtype="",
             div="//",
-            stride=self._attrs["stride"],
-            pad=self._attrs["pad"],
-            dilate=self._attrs["dilate"],
             x_dim0=x[0],
             x_dim1=x[1],
             x_dim2=x[2],
@@ -210,6 +230,7 @@ def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
             w_dim0=w[0],
             w_dim1=w[1],
             w_dim2=w[2],
+            **self._get_params_factory(),
         )
         output = {}
         exec(eval_func, output)  # noqa: P204
@@ -282,16 +303,6 @@ def _extract_exec_path(self, x: Tensor):
             key = self._gen_exec_key(x_shape)
             self._attrs["exec_path"][key] = ""
 
-    def _signature(self):
-        signature = "conv2d: K=[{kh}, {kw}], S=[{s}], P=[{p}], CO=[{co}]".format(
-            kh=self._attrs["KH"],
-            kw=self._attrs["KW"],
-            s=self._attrs["stride"],
-            p=self._attrs["pad"],
-            co=self._attrs["CO"],
-        )
-        return signature
-
     def _extract_epilogue_alignment(self, output_shape: List[IntVar]) -> None:
         epilogue_dim = output_shape[-1]
         if not isinstance(epilogue_dim, IntImm):
@@ -381,14 +392,12 @@ def _should_build_profiler(self) -> bool:
                     kh=self._attrs["KH"],
                     kw=self._attrs["KW"],
                     co=self._attrs["CO"],
-                    stride=self._attrs["stride"],
-                    pad=self._attrs["pad"],
-                    dilate=self._attrs["dilate"],
                     op_type=self._attrs["op"],
                     device=target._arch,
                     epilogue=tmp_op.epilogue_functor.value,
                     split_k=split_k,
                     exec_entry_sha1=exec_entry_sha1,
+                    **self._get_params_factory(),
                 )
                 cache_value = target.query_profile_cache("conv", query.__dict__)
                 if cache_value is not None and not target.force_profile():
@@ -446,6 +455,7 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         if not os.access(exe_path, os.X_OK):
             raise RuntimeError("Profiler %s is not executable" % exe_path)
         cmd = [exe_path]
+        params = self._get_params_factory()
         cmd.append(x_shape[0])
         cmd.append(x_shape[1])
         cmd.append(x_shape[2])
@@ -453,9 +463,12 @@ def _gen_profile_cmd(self, profiler_prefix, cfg, x_shape):
         cmd.append(self._attrs["KH"])
         cmd.append(self._attrs["KW"])
         cmd.append(self._attrs["CO"])
-        cmd.append(self._attrs["stride"])
-        cmd.append(self._attrs["pad"])
-        cmd.append(self._attrs["dilate"])
+        cmd.append(params["strideh"])
+        cmd.append(params["padh"])
+        cmd.append(params["dilateh"])
+        cmd.append(params["stridew"])
+        cmd.append(params["padw"])
+        cmd.append(params["dilatew"])
         cmd.append(self._attrs["group"])
         command = [str(x) for x in cmd]
         return command
@@ -478,14 +491,12 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             kh=self._attrs["KH"],
             kw=self._attrs["KW"],
             co=self._attrs["CO"],
-            stride=self._attrs["stride"],
-            pad=self._attrs["pad"],
-            dilate=self._attrs["dilate"],
             op_type=self._attrs["op"],
             device=target._arch,
             epilogue=tmp_op.epilogue_functor.value,
             split_k=split_k,
             exec_entry_sha1=exec_entry_sha1,
+            **self._get_params_factory(),
         )
         cache_value = target.query_profile_cache("conv", query.__dict__)
         if cache_value is not None and not target.force_profile():
@@ -536,15 +547,13 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
             kh=self._attrs["KH"],
             kw=self._attrs["KW"],
             co=self._attrs["CO"],
-            stride=self._attrs["stride"],
-            pad=self._attrs["pad"],
-            dilate=self._attrs["dilate"],
             op_type=self._attrs["op"],
             epilogue=tmp_op.epilogue_functor.value,
             device=target._arch,
             algo=best_algo,
             workspace=workspace,
             split_k=split_k,  # todo add into profile
+            **self._get_params_factory(),
         )
         Target.current().insert_profile_cache("conv", cache_record.__dict__)
         return (best_algo, workspace)
diff --git a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
index 533b7e88a..e2e9a8739 100644
--- a/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/transposed_conv2d.py
@@ -35,12 +35,12 @@
 {{indent}}{{dtype}}CO = {{w_dim0}};
 {{indent}}{{dtype}}KH = {{w_dim1}};
 {{indent}}{{dtype}}KW = {{w_dim2}};
-{{indent}}{{dtype}}SH = {{stride}};
-{{indent}}{{dtype}}SW = {{stride}};
-{{indent}}{{dtype}}DH = {{dilate}};
-{{indent}}{{dtype}}DW = {{dilate}};
-{{indent}}{{dtype}}PH = {{pad}};
-{{indent}}{{dtype}}PW = {{pad}};
+{{indent}}{{dtype}}SH = {{strideh}};
+{{indent}}{{dtype}}SW = {{stridew}};
+{{indent}}{{dtype}}DH = {{dilateh}};
+{{indent}}{{dtype}}DW = {{dilatew}};
+{{indent}}{{dtype}}PH = {{padh}};
+{{indent}}{{dtype}}PW = {{padw}};
 {{indent}}{{dtype}}KHEff = (KH - 1) * DH + 1;
 {{indent}}{{dtype}}KWEff = (KW - 1) * DW + 1;
 {{indent}}{{dtype}}NO = NI;
@@ -124,9 +124,6 @@ def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
             indent="",
             dtype="",
             div="//",
-            stride=self._attrs["stride"],
-            pad=self._attrs["pad"],
-            dilate=self._attrs["dilate"],
             x_dim0=x[0],
             x_dim1=x[1],
             x_dim2=x[2],
@@ -134,6 +131,7 @@ def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
             w_dim0=w[3],  # for conv_transpose w = [c_in, kh, kw, c_out]
             w_dim1=w[1],
             w_dim2=w[2],
+            **self._get_params_factory(),
         )
         output = {}
         exec(eval_func, output)  # noqa: P204
diff --git a/python/aitemplate/frontend/nn/__init__.py b/python/aitemplate/frontend/nn/__init__.py
index 6c22f1a99..16e597a84 100644
--- a/python/aitemplate/frontend/nn/__init__.py
+++ b/python/aitemplate/frontend/nn/__init__.py
@@ -16,6 +16,7 @@
 from aitemplate.frontend.nn.container import ModuleDict, ModuleList, Sequential
 from aitemplate.frontend.nn.embedding import BertEmbeddings, Embedding
 from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.conv1d import *
 from aitemplate.frontend.nn.conv2d import *
 from aitemplate.frontend.nn.conv3d import *
 from aitemplate.frontend.nn.linear import *
diff --git a/python/aitemplate/frontend/nn/conv1d.py b/python/aitemplate/frontend/nn/conv1d.py
new file mode 100644
index 000000000..1ce285bd2
--- /dev/null
+++ b/python/aitemplate/frontend/nn/conv1d.py
@@ -0,0 +1,90 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Conv1d Module.
+"""
+from aitemplate.compiler.ops import conv2d, conv2d_bias, squeeze, unsqueeze
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
+
+
+class Conv1d(Module):
+    r"""
+    Conv1d module applies a 1D convolution over an input signal composed of several input planes.
+
+    .. math::
+        \text{out}\left(B_i, \text{:}, \text{channels\_out}_j\right) = \text{bias}\left(\text{channels\_out}_j\right) +
+        \sum_{k = 0}^{\text{channels\_in} - 1} \text{weight}\left(\text{channels\_out}_j, \text{:}, k\right)
+        \star \text{input}\left(B_i, \text{:}, k\right)
+
+    The semantics are similar to `PyTorch`_ with the following exception:
+    dims 1 and 2 of the weight, input and output are swapped (while dim 0 remains the same).
+
+    .. _PyTorch:
+        https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        dtype: str = "float16",
+        bias: bool = False,
+        name: str = "conv1d",
+    ):
+        super().__init__()
+
+        self.weight = Parameter(
+            shape=[out_channels, kernel_size, in_channels // groups],
+            dtype=dtype,
+            name=f"{name}_weight",
+        )
+        if bias:
+            self.bias = Parameter(
+                shape=[out_channels], dtype=dtype, name=f"{name}_bias"
+            )
+        else:
+            self.bias = None
+
+        # note that conv1d is functionally equivalent to conv2d,
+        # but we need to reshape the input, weight and output tensors,
+        # as well as use the correct stride, padding and dilation for the conv2d op.
+        fwd_func = conv2d_bias if bias else conv2d
+        self.op = fwd_func(
+            stride=(stride, 1), pad=(padding, 0), dilate=(dilation, 1), group=groups
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        r"""Applies Conv1d on the input tensor of shape :math:`(B, \text{seq\_in}, \text{channels\_in})`.
+        The output has shape :math:`(B, \text{seq\_out}, \text{channels\_out})`, where
+        .. math::
+            \text{seq\_out} = \left\lfloor\frac{\text{seq\_in} + 2 \times \text{padding} - \text{dilation}
+                             \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+        """
+        # make the conv2d inputs 4d
+        xu = unsqueeze(dim=2)(x)
+        wu = unsqueeze(dim=2)(self.weight.tensor())
+        if self.bias is None:
+            c2d = self.op(xu, wu)
+        else:
+            c2d = self.op(xu, wu, self.bias.tensor())
+        # make the result 3d again
+        return squeeze(dim=2)(c2d)
diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index fbabf6fb8..cace970e0 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -17,7 +17,7 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import IntImm, Tensor
+from aitemplate.frontend import IntImm, nn, Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
     filter_test_cases_by_params,
@@ -95,6 +95,95 @@ def test_conv2d(self, dtype):
             dtype=dtype,
         )
 
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_conv1d(self, dtype):
+        self._test_conv1d(dtype=dtype, bias=False)
+
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
+                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_conv1d_bias(self, dtype):
+        self._test_conv1d(dtype=dtype, bias=True)
+
+    def _test_conv1d(self, dtype, bias):
+        target = detect_target()
+        batch = 4
+        C_in = 80
+        C_out = 512
+        K = 3
+        L = 28
+        stride = 1
+        padding = 1
+        dilation = 1
+        test_name = "test_conv1d"
+
+        X_pt = get_random_torch_tensor([batch, C_in, L], dtype=dtype)
+        W_pt = get_random_torch_tensor([C_out, C_in, K], dtype=dtype)
+        bias_pt = get_random_torch_tensor([C_out], dtype=dtype) if bias else None
+
+        X = Tensor(
+            shape=[IntImm(batch), L, C_in],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
+        mod = nn.Conv1d(
+            in_channels=C_in,
+            out_channels=C_out,
+            kernel_size=K,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            dtype=dtype,
+            bias=bias,
+        )
+
+        Y = mod(X)
+
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(Y, target, "./tmp", test_name)
+        module.set_constant_with_tensor(
+            "conv1d_weight", W_pt.permute((0, 2, 1)).contiguous()
+        )
+        if bias:
+            module.set_constant_with_tensor("conv1d_bias", bias_pt)
+        Y_pt = torch.nn.functional.conv1d(
+            X_pt.float(),
+            W_pt.float(),
+            bias=bias_pt.float() if bias else None,
+            padding=padding,
+            stride=stride,
+            dilation=dilation,
+        ).to(dtype=X_pt.dtype)
+
+        x = X_pt.permute((0, 2, 1)).contiguous()
+
+        y = torch.empty_like(Y_pt).permute((0, 2, 1)).contiguous()
+        module.run_with_tensors({"input_0": x}, [y])
+        y_transpose = y.permute((0, 2, 1))
+        if target.name() == "cuda":
+            if dtype == "float32":
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1.5e-1, rtol=1e-1)
+            else:
+                torch.testing.assert_close(Y_pt, y_transpose, atol=1e-2, rtol=1e-2)
+        else:
+            torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From b4b01401565f62fe7654129e3169513f0f14662a Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Thu, 13 Apr 2023 09:35:18 -0700
Subject: [PATCH 410/638] arange as model param (#566)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/566

Refactoring "arange" tensor used in time embeddings to be model parameter.

Reviewed By: henryhu6

Differential Revision: D44903108

fbshipit-source-id: 227a2d4d2fee126dab02393af71ba35bef82936d
---
 fx2ait/fx2ait/tools/common_fx2ait.py          |  2 +
 .../aitemplate/frontend/nn/ldm/embeddings.py  | 84 +++++++------------
 2 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index b9aeb8009..bc99d258a 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -87,6 +87,7 @@ def run_test(
         passes: List[Callable] = [],  # noqa: B006
         leaf_module: Callable = None,  # one leaf module
         apply_passes_to_lowered_module_only=False,
+        use_fp16_acc=True,
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -119,6 +120,7 @@ def run_test(
             inputs,
             "/tmp",
             f"test-fx2ait-{uuid.uuid1()}",
+            use_fp16_acc=use_fp16_acc,
         )
         with torch.no_grad():
             cuda_inputs = []
diff --git a/python/aitemplate/frontend/nn/ldm/embeddings.py b/python/aitemplate/frontend/nn/ldm/embeddings.py
index 36b96a4fb..20519e661 100644
--- a/python/aitemplate/frontend/nn/ldm/embeddings.py
+++ b/python/aitemplate/frontend/nn/ldm/embeddings.py
@@ -15,7 +15,7 @@
 import math
 
 from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
+from aitemplate.frontend import nn
 
 
 def get_shape(x):
@@ -23,52 +23,6 @@ def get_shape(x):
     return shape
 
 
-def get_timestep_embedding(
-    timesteps: Tensor,
-    embedding_dim: int,
-    flip_sin_to_cos: bool = False,
-    downscale_freq_shift: float = 1,
-    scale: float = 1,
-    max_period: int = 10000,
-):
-    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
-
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    embeddings. :return: an [N x dim] Tensor of positional embeddings.
-    """
-    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
-
-    half_dim = embedding_dim // 2
-
-    exponent = (-math.log(max_period)) * Tensor(
-        shape=[half_dim], dtype="float16", name="arange"
-    )
-
-    exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
-
-    emb = ops.exp(exponent)
-    emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
-
-    # scale embeddings
-    emb = scale * emb
-
-    # concat sine and cosine embeddings
-    if flip_sin_to_cos:
-        emb = ops.concatenate()(
-            [ops.cos(emb), ops.sin(emb)],
-            dim=-1,
-        )
-    else:
-        emb = ops.concatenate()(
-            [ops.sin(emb), ops.cos(emb)],
-            dim=-1,
-        )
-    return emb
-
-
 class TimestepEmbedding(nn.Module):
     def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
         super().__init__()
@@ -90,12 +44,34 @@ def __init__(
         self.num_channels = num_channels
         self.flip_sin_to_cos = flip_sin_to_cos
         self.downscale_freq_shift = downscale_freq_shift
+        self.scale = 1
+        self.max_period = 10000
+        half_dim = self.num_channels // 2
+        self.arange = nn.Parameter(shape=[half_dim], dtype="float16", name="arange")
 
     def forward(self, timesteps):
-        t_emb = get_timestep_embedding(
-            timesteps,
-            self.num_channels,
-            flip_sin_to_cos=self.flip_sin_to_cos,
-            downscale_freq_shift=self.downscale_freq_shift,
-        )
-        return t_emb
+        assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+
+        half_dim = self.num_channels // 2
+
+        exponent = (-math.log(self.max_period)) * self.arange.tensor()
+        exponent = exponent * (1.0 / (half_dim - self.downscale_freq_shift))
+
+        emb = ops.exp(exponent)
+        emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
+
+        # scale embeddings
+        emb = self.scale * emb
+
+        # concat sine and cosine embeddings
+        if self.flip_sin_to_cos:
+            emb = ops.concatenate()(
+                [ops.cos(emb), ops.sin(emb)],
+                dim=-1,
+            )
+        else:
+            emb = ops.concatenate()(
+                [ops.sin(emb), ops.cos(emb)],
+                dim=-1,
+            )
+        return emb

From f4416e73ea73a5f6d8a4e985b72982bc79cfa4ee Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 13 Apr 2023 09:44:29 -0700
Subject: [PATCH 411/638] fixed an infinite loop in move_view_ops
 transformation (#570)

Summary:
Consider we have a following graph:

  concat_0 = concatenate(x0, x0)
  reshape_1 = reshape(concat_0)
  concat_2 = concat(reshape_1, x1)
  concat_3 = concatenate(concat_0, x2)

Previously, our move_view_ops pass would end up with an infinite loop, because it turned the graph into forms that were always valid for another iteration, e.g.

  (1) after the first iteration:

  concat_0 = concatenate(x0, x0)
  concat_2 = concat(concat_0, x1)
  new_reshape = reshape(concat_2)
  concat_3 = concatenate(new_reshape, x2)

  (2) after the second iteration:

  concat_0 = concatenate(x0, x0)
  new_reshape = reshape(concat_0)
  concat_2 = concat(new_reshape, x1)
  concat_3 = concatenate(concat_0, x2)

  and so on.

  This PR fixed the issue by skipping the pattern.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/570

Reviewed By: hl475

Differential Revision: D44946922

Pulled By: chenyang78

fbshipit-source-id: ff91fef90218feb4679e5b073979a8de02d912a8
---
 .../compiler/transform/move_view_ops.py       | 10 ++
 tests/unittest/compiler/test_move_view_ops.py | 93 +++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/python/aitemplate/compiler/transform/move_view_ops.py b/python/aitemplate/compiler/transform/move_view_ops.py
index 294340848..e49e1bc07 100644
--- a/python/aitemplate/compiler/transform/move_view_ops.py
+++ b/python/aitemplate/compiler/transform/move_view_ops.py
@@ -290,6 +290,16 @@ def _move_view_op_before_concat(
         next_ops = first_cat_output._attrs["dst_ops"]
         if len(next_ops) == 0:
             continue
+        # skip cases where the first cat op is directly connected with another cat op,
+        # because moving a view op between other two cat ops would insert a view op
+        # between the directly-connected cat ops. The transformed graph would contain
+        # a valid rewrite pattern which could trigger another re-write, and so on.
+        # Consequently, we would end up with an infinite rewriting loop, e.g.
+        # cat1 + reshape + cat2, cat1 + cat3 => cat1 + cat2, cat1 + reshape + cat3 =>
+        # cat1 + reshape + cat2, cat1 + cat3 => ...
+        concat_ops = [op for op in next_ops if op._attrs["op"] == "concatenate"]
+        if len(concat_ops) > 0:
+            continue
         view_ops = [op for op in next_ops if op._attrs["op"] in _SUPPORTED_VIEW_OPS]
         # skip if none of the next ops is one of the supported view ops
         if len(view_ops) == 0:
diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
index 94b933565..0d3a473d5 100644
--- a/tests/unittest/compiler/test_move_view_ops.py
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -1740,6 +1740,99 @@ def test_move_strided_reshape_cat_multi_dsts(self):
             dtype="float16",
         )
 
+    def _test_non_movable_cat_reshape_cat_2(
+        self, M0, M1, M2, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # concat_0 = concatenate(x0, x0) # 2d
+        # reshape_1 = reshape(concat_0) # 3d
+        # concat_2 = concat(reshape_1, x1) # 3d
+        # concat_3 = concatenate(concat_0, x2) # 2d
+        # reduce_4 = reduce_sum(concat_2)
+        # reduce_5 = reduce_sum(concat_3)
+        # y = add(reduce_4, reduce_5)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1), IntImm(N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        cat_dim = 1
+        concat_0 = ops.concatenate()([X0, X0], dim=cat_dim)  # 2d
+        reshape_1_to_shape = [-1, M0 + M0, N]
+        reshape_1 = ops.reshape()(concat_0, reshape_1_to_shape)
+        concat_2 = ops.concatenate()([reshape_1, X1], dim=cat_dim)  # 3d
+        concat_3 = ops.concatenate()([concat_0, X2], dim=cat_dim)  # 2d
+        reduce_dim = cat_dim
+        reduce_4 = ops.reduce_sum(reduce_dim)(concat_2)
+        reduce_4_2 = ops.reduce_sum(reduce_dim)(reduce_4)
+        reduce_5 = ops.reduce_sum(reduce_dim)(concat_3)
+        Y = ops.elementwise(FuncEnum.ADD)(reduce_4_2, reduce_5)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 7)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 3)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+
+            concat_0_pt = torch.cat([x0_pt, x0_pt], dim=cat_dim)
+            reshape_1_pt = torch.reshape(concat_0_pt, reshape_1_to_shape)
+            concat_2_pt = torch.cat([reshape_1_pt, x1_pt], dim=cat_dim)
+            concat_3_pt = torch.cat([concat_0_pt, x2_pt], dim=cat_dim)
+            reduce_4_pt = torch.sum(concat_2_pt, reduce_dim)
+            reduce_4_2_pt = torch.sum(reduce_4_pt, reduce_dim)
+            reduce_5_pt = torch.sum(concat_3_pt, reduce_dim)
+            y_pt = reduce_4_2_pt + reduce_5_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+            }
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_non_movable_cat_reshape_cat_2(self):
+        self._test_non_movable_cat_reshape_cat_2(
+            M0=3,
+            M1=4,
+            M2=6,
+            N=4,
+            test_name="test_non_movable_cat_reshape_cat_2",
+            dtype="float16",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 0f1165e62ae2835fe6f8a754c8b0db12c4a6acf9 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Thu, 13 Apr 2023 10:47:25 -0700
Subject: [PATCH 412/638] Remove hack in [fx/aten]2ait (#516)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/516

Symbolic shape support has landed, remove hacks that were used.

Reviewed By: tissue3

Differential Revision: D44482705

fbshipit-source-id: 685c74efa0b4a2cec6a2f963fff4b0437b44a32e
---
 fx2ait/fx2ait/converters/ait_converters.py      |  5 -----
 fx2ait/fx2ait/converters/aten2ait_converters.py |  3 ---
 fx2ait/fx2ait/converters/utils.py               | 16 ----------------
 3 files changed, 24 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 5f5deb86b..b414ac1fe 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -84,7 +84,6 @@
     identical_elem_tuple_to_int,
     ncdhw2ndhwc,
     nchw2nhwc,
-    unify_dynamic_shape_name,
     weight_ncdhw2ndhwc,
     weight_nchw2nhwc,
 )
@@ -346,10 +345,6 @@ def acc_ops_cat(
     if not isinstance(dim, int):
         raise ValueError(f"Unexpected {type(dim)} dim for {name}: {dim}")
 
-    # TODO:  unify_dynamic_shape_name is a hack to workaround AIT's dynamic shape requirement.
-    # We will remove it after AIT provides vanilla support.
-    for i in range(len(tensors) - 1):
-        unify_dynamic_shape_name(tensors[i], tensors[i + 1])
     return concatenate()(tensors, dim=dim)
 
 
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index aef1c91de..4c4935d02 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -60,7 +60,6 @@
     get_positive_dim,
     identical_elem_tuple_to_int,
     nchw2nhwc,
-    unify_dynamic_shape_name,
 )
 from fx2ait.passes.lower_basic_pass_aten import (
     aten_compose_bmm_2d,
@@ -168,7 +167,6 @@ def aten_binary_ops_add(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    unify_dynamic_shape_name(args[0], args[1])
     kwargs = {
         "input": args[0],
         "other": args[1],
@@ -587,7 +585,6 @@ def aten_ops_matmul(
     if len(weight_shape) == 2:
         result = gemm_rrr()(input_val, weight)
     elif len(input_shape) == 3 and len(weight_shape) == 3:
-        unify_dynamic_shape_name(input_val, weight)
         result = bmm_rrr()(input_val, weight)
     else:
         raise NotImplementedError(
diff --git a/fx2ait/fx2ait/converters/utils.py b/fx2ait/fx2ait/converters/utils.py
index 6d12804be..c94f62102 100644
--- a/fx2ait/fx2ait/converters/utils.py
+++ b/fx2ait/fx2ait/converters/utils.py
@@ -203,19 +203,3 @@ def ait_ncdhw2ndhwc(ait_tensor: AITTensor) -> AITTensor:
 
 def ait_ndhwc2ncdhw(ait_tensor: AITTensor) -> AITTensor:
     return permute()(ait_tensor, [0, 4, 1, 2, 3])
-
-
-# TODO:  This is a hack to workaround AIT's dynamic shape requirement.
-# Detailed explanation can be found in D41743385 (aten2ait) D41974191(fx2ait).
-# We will throw this one after AIT provides vanilla support.
-def unify_dynamic_shape_name(input_val, weight):
-    input_shape = input_val.shape()
-    weight_shape = weight.shape()
-    if len(input_shape) == len(weight_shape):
-        for a, b in zip(input_shape, weight_shape):
-            if a._attrs["values"] == b._attrs["values"]:
-                if a._attrs["name"] is None:
-                    a._attrs["name"] = b._attrs["name"]
-                elif b._attrs["name"] is None:
-                    b._attrs["name"] = a._attrs["name"]
-    return input_shape, weight_shape

From e6ad08a97bb7b7b1c3c4974cc7ae915ad95d3911 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Thu, 13 Apr 2023 10:59:04 -0700
Subject: [PATCH 413/638] Fix _fuse_strided_op_and_cat: no GEMM+concat fusion
 with dim>=rank (#559)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/559

`_fuse_strided_op_and_cat` pass inside `transform_strided_ops` shouldn't fuse GEMM and concat if concatenation is happening along a dimension >= rank of the original shape. This happens, for example, when GEMM output of shape `(M, N)` is unsqueezed to `(M, N, 1)` and concatenated with another `(M, N, 1)`. Such fusion would require GEMM to write the last dimension into memory in a non-contiguous way, which is not supported for row-major output (only one stride is supported).
However, fusion is possible when unsqueezed dimension is internal - e.g. when final shape is `(M, 1, N)`.
Method `TensorAccessor.is_rightmost_dim_contiguous` checks if fusion is possible based on these criteria.

Reviewed By: tissue3, aakhundov

Differential Revision: D44747795

fbshipit-source-id: 4fbb005ce27d32654bda68f8405ec06b23f17a1a
---
 fx2ait/fx2ait/test/converters/test_ait_cat.py | 20 +++++++++++++
 python/aitemplate/compiler/tensor_accessor.py | 30 +++++++++++++++++++
 .../transform/transform_strided_ops_utils.py  | 10 +++++--
 .../compiler/test_strided_view_cat.py         | 18 +++++++++++
 4 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_cat.py b/fx2ait/fx2ait/test/converters/test_ait_cat.py
index e739ac22e..870e54cdf 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_cat.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_cat.py
@@ -78,3 +78,23 @@ def forward(
         )
 
         self.run_test_with_dynamic_shape(model, inputs_spec, expected_ops={acc_ops.cat})
+
+    def test_linear_stack(self):
+        """Pass _fuse_strided_op_and_cat shouldn't try to fuse GEMM and concat
+        in this graph, because that would require GEMM last dimension to be
+        non-contiguous, which it doesn't support. This is checked by
+        _dim_is_inside_original_shape in transform_strided_ops.
+        """
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                self.w = torch.randn(4, 3).half().cuda()
+                y = torch.nn.functional.linear(x, self.w)
+                z = torch.randn(2, 4).half().cuda()
+                return torch.stack([y, z], dim=2)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(2, 3).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={})
diff --git a/python/aitemplate/compiler/tensor_accessor.py b/python/aitemplate/compiler/tensor_accessor.py
index 7df290512..d9d684dc1 100644
--- a/python/aitemplate/compiler/tensor_accessor.py
+++ b/python/aitemplate/compiler/tensor_accessor.py
@@ -448,3 +448,33 @@ def update_base_tensor_shape(self, new_tensor: Tensor) -> None:
             f"actual tensor: {self.actual_shapes}!"
         )
         self._try_gen_dim_mapping()
+
+    def is_rightmost_dim_contiguous(self, cat_dim: int) -> bool:
+        """Check if the rightmost diminsion would be contiguous after
+        concatenation along a given cat_dim. This is a necessary condition for
+        GEMM+concat fusion, since GEMM doesn't support discontinuous rightmost
+        dimension for row-major outout. Rightmost diminsion is contiguous iff
+        the concat dimension corresponds to one of the dimensions in the
+        original shape and it's the first dimension in its group of actual
+        dimensions.
+        """
+        num_groups = len(self._dim_mapping)
+        for group_idx in range(num_groups):
+            original_group, actual_group = self._dim_mapping[group_idx]
+            if cat_dim in actual_group:
+                if actual_group.index(cat_dim):
+                    # Concat dimension isn't the first in its group
+                    return False
+                # Check that there is at least one non-empty original group to the
+                # right of the group where cat_dim found (inclusive)
+                while (group_idx < num_groups) and not len(
+                    self._dim_mapping[group_idx][0]
+                ):
+                    group_idx += 1
+                if group_idx >= num_groups:
+                    # There are no original dimensions to the right (inclusive) of concat
+                    # dimension. Concat dimension is an unsqueezed dimension at the end
+                    # of the shape, fusion is impossible.
+                    return False
+                return True
+        return False
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops_utils.py b/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
index cd040155c..da56a4cd9 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops_utils.py
@@ -78,8 +78,14 @@ def gemm_stride_checker(
 
     # Need to make sure that the new stride dim doesn't break
     # last dim's continuity.
-    # This is because CUTLASS gemm API assumes that gemm stride
-    # only operates on the last dim.
+    # This is because CUTLASS GEMM API assumes that GEMM stride
+    # only operates on the last dim for row-major output.
+    # For example, concatenations of GEMMs along dimensions to the right of the
+    # original shape can't be fused. A particular case of this is when GEMM
+    # output of shape (M, N) is unsqueezed to (M, N, 1) and concatenated with
+    # another (M, N, 1).
+    if not original_ta.is_rightmost_dim_contiguous(dim):
+        return False
 
     if get_stride_at_dim is None:
         # The dim before the last dim
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index a4c9f942c..1d58fe097 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -97,6 +97,24 @@ def setUpClass(cls) -> None:
                 expected_num_tensors=14,
                 expected_num_ops=9,
             ),
+            param(
+                # Concat along rightmost unsqueezed dim - not fusible.
+                "gemm_reshape_cat_non_fusible_stride_dim_rightmost_unsqueezed",
+                n=2,
+                new_shape=[-1, 2, 2, 1],
+                cat_dim=3,
+                expected_num_tensors=16,
+                expected_num_ops=9,
+            ),
+            param(
+                # Concat along inner unsqueezed dim - fusible.
+                "gemm_reshape_cat_fusible_stride_dim_inner_unsqueezed",
+                n=2,
+                new_shape=[-1, 2, 1, 2],
+                cat_dim=2,
+                expected_num_tensors=10,
+                expected_num_ops=9,
+            ),
         ],
         name_func=custom_name_func,
     )

From a58d75736e3e1c4175a1191ea3189cc9bba0f3b6 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 14 Apr 2023 13:27:05 +0800
Subject: [PATCH 414/638] refactor

---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp           | 91 +++++++++++--------
 fx2ait/fx2ait/csrc/AITModelImpl.h             |  1 +
 .../multi_level_roi_align_common.py           |  7 +-
 python/aitemplate/backend/profiler_runner.py  |  7 +-
 .../roi_ops/multi_level_roi_align.py          |  4 +-
 5 files changed, 60 insertions(+), 50 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index b75ec49cc..8eb008d5c 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -146,27 +146,33 @@ AITModelImpl::AITModelImpl(
   // It's not clear what stream we want to use yet. Create a new one.
   // We could alternatively use the default stream, but that could cause extra
   // synchronization.
-#ifdef __HIP_PLATFORM_HCC__
-  hipStream_t creation_stream;
-  TORCH_CHECK(
-      hipStreamCreateWithFlags(&creation_stream, hipStreamNonBlocking) ==
-      hipSuccess);
-
-  using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<hipStream_t>,
-      decltype(&hipStreamDestroy)>;
-  StreamGuard creation_stream_guard{creation_stream, hipStreamDestroy};
-#else
-  cudaStream_t creation_stream;
-  TORCH_CHECK(
-      cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
-      cudaSuccess);
-
+  StreamType creation_stream;
+  StreamCreate(&creation_stream, true);
   using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<cudaStream_t>,
-      decltype(&cudaStreamDestroy)>;
-  StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
-#endif
+      std::remove_pointer_t<StreamType>,
+      decltype(&StreamDestroy)>;
+  StreamGuard creation_stream_guard{creation_stream, StreamDestroy};
+// #ifdef __HIP_PLATFORM_HCC__
+//   hipStream_t creation_stream;
+//   TORCH_CHECK(
+//       hipStreamCreateWithFlags(&creation_stream, hipStreamNonBlocking) ==
+//       hipSuccess);
+
+//   using StreamGuard = std::unique_ptr<
+//       std::remove_pointer_t<hipStream_t>,
+//       decltype(&hipStreamDestroy)>;
+//   StreamGuard creation_stream_guard{creation_stream, hipStreamDestroy};
+// #else
+//   cudaStream_t creation_stream;
+//   TORCH_CHECK(
+//       cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
+//       cudaSuccess);
+
+//   using StreamGuard = std::unique_ptr<
+//       std::remove_pointer_t<cudaStream_t>,
+//       decltype(&cudaStreamDestroy)>;
+//   StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
+// #endif
 
 #define LOAD_SYMBOL(var, name_str)                                       \
   var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
@@ -630,27 +636,34 @@ void AITModelImpl::updateConstantsWithWeights(
         "failing this round of weight update");
     constants.emplace_back(torchToAitData(it->second));
   }
-#ifdef __HIP_PLATFORM_HCC__
-  hipStream_t constants_stream;
-  TORCH_CHECK(
-      hipStreamCreateWithFlags(&constants_stream, hipStreamNonBlocking) ==
-      hipSuccess);
 
+  StreamType constants_stream;
+  StreamCreate(&constants_stream, true);
   using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<hipStream_t>,
-      decltype(&hipStreamDestroy)>;
-  StreamGuard constants_stream_guard{constants_stream, hipStreamDestroy};
-#else
-  cudaStream_t constants_stream;
-  TORCH_CHECK(
-      cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
-      cudaSuccess);
-
-  using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<cudaStream_t>,
-      decltype(&cudaStreamDestroy)>;
-  StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
-#endif
+      std::remove_pointer_t<StreamType>,
+      decltype(&StreamDestroy)>;
+  StreamGuard constants_stream_guard{constants_stream, StreamDestroy};
+// #ifdef __HIP_PLATFORM_HCC__
+//   hipStream_t constants_stream;
+//   TORCH_CHECK(
+//       hipStreamCreateWithFlags(&constants_stream, hipStreamNonBlocking) ==
+//       hipSuccess);
+
+//   using StreamGuard = std::unique_ptr<
+//       std::remove_pointer_t<hipStream_t>,
+//       decltype(&hipStreamDestroy)>;
+//   StreamGuard constants_stream_guard{constants_stream, hipStreamDestroy};
+// #else
+//   cudaStream_t constants_stream;
+//   TORCH_CHECK(
+//       cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
+//       cudaSuccess);
+
+//   using StreamGuard = std::unique_ptr<
+//       std::remove_pointer_t<cudaStream_t>,
+//       decltype(&cudaStreamDestroy)>;
+//   StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
+// #endif
   AIT_CHECK(setManyConstantsDoubleBufferFunc_(
       model_handle_,
       /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(constants_stream),
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.h b/fx2ait/fx2ait/csrc/AITModelImpl.h
index 6b78d735f..9c8f27916 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.h
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "model_interface.h" // @manual=//aitemplate/AITemplate/static/include:aitemplate
+#include "utility.h"
 
 #include <dlfcn.h>
 #include <torch/torch.h> // @manual=//caffe2:torch-cpp
diff --git a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
index 1dc9ac608..d010e0b82 100644
--- a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
+++ b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py
@@ -143,7 +143,7 @@
   const Trois* roi = rois + 5 * (batch * roiCount + roiIdx);
   float hw;
 
-{% if elem_input_type in ["half", "ck::half_t"] %}
+{% if elem_input_type == "half" %}
   float x1 = __half2float(roi[1]);
   float y1 = __half2float(roi[2]);
   float x2 = __half2float(roi[3]);
@@ -233,10 +233,9 @@
             interpolateBilinear(src, srcDims, ySample, xSample, featureCount);
       }
     }
-{% if elem_output_type == "ck::half_t" %}
+
+{% if elem_output_type == "half" %}
     *out = __half(result) / __float2half_rn(samplingCount);
-{% elif elem_output_type == "half" %}
-    *out = result / __float2half_rn(samplingCount);
 {% elif elem_output_type == "float" %}
     *out = result / samplingCount;
 {% endif %}
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index e4d528b3d..37c0ab698 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -304,11 +304,8 @@ def callback_when_done(fut):
                     _LOGGER.debug(
                         f"Profiler failure!\nProfiler stdout: {stdout}\nProfiler stderr: {stderr}",
                     )
-                    _LOGGER.debug(f"Failed to extract profiler result for {cmds}")
-                else:
-                    process_result_callback(
-                        profile_result, self._postprocessing_delegate
-                    )
+                    raise RuntimeError(f"Failed to extract profiler result for {cmds}")
+                process_result_callback(profile_result, self._postprocessing_delegate)
             finally:
                 # unblock one future in `join()`
                 if stdout is not None:
diff --git a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
index 179e40c74..38c226ca9 100644
--- a/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
+++ b/python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py
@@ -46,8 +46,8 @@ def gen_function(
     x = func_attrs["inputs"][0]
     y = func_attrs["outputs"][0]
     backend_spec = ROCMSpec()
-    input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"])
-    output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
     exec_paths = ""
     for key, _ in exec_path.items():
         program = multi_level_roi_align_common.EXEC_TEMPLATE.render(

From 2d8511c83f92ee5734963a7d6bd17a499504f761 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 14 Apr 2023 11:29:38 -0700
Subject: [PATCH 415/638] Accept JaggedIntVar in padded_dense_to_jagged (#577)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/577

It may happen that `total_length` passed to the `padded_dense_to_jagged` op is actually a `JaggedIntVar`. In such cases, the `total_length` is fetched from the `shape[0]` of a tensor that already happens to be jagged. Before this diff, this has caused an exception in the `padded_dense_to_jagged` front-end validation. The diff fixes this by fetching the `total_length` from within the passed `JaggedIntVar`.

Reviewed By: muchulee8

Differential Revision: D44997496

fbshipit-source-id: cebc005569c66c43fcf6443547ace1332e6df050
---
 .../ops/tensor/padded_dense_to_jagged.py      |  8 ++++-
 .../ops/test_padded_dense_to_jagged.py        | 34 +++++++++++++------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
index fcfd6b181..8010454eb 100644
--- a/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
+++ b/python/aitemplate/compiler/ops/tensor/padded_dense_to_jagged.py
@@ -20,7 +20,7 @@
 
 from aitemplate.backend import registry
 from aitemplate.backend.target import Target
-from aitemplate.compiler.base import IntVar, JaggedDim, Operator, Tensor
+from aitemplate.compiler.base import IntVar, JaggedDim, JaggedIntVar, Operator, Tensor
 from aitemplate.compiler.ops import make_jagged
 
 
@@ -43,6 +43,12 @@ def __init__(
         self,
         total_length: IntVar,
     ):
+        if isinstance(total_length, JaggedIntVar):
+            # the total_length dimension may be fetched from the
+            # jagged tensor's shape[0]. in such cases, the total_length
+            # would already be a JaggedIntVar and we fetch the real
+            # total_length from inside it.
+            total_length = total_length.total_length()
         if type(total_length) != IntVar:
             raise TypeError(
                 f"total_length must be IntVar, but got {type(total_length).__name__}."
diff --git a/tests/unittest/ops/test_padded_dense_to_jagged.py b/tests/unittest/ops/test_padded_dense_to_jagged.py
index 7951551cd..1b58c1b3c 100644
--- a/tests/unittest/ops/test_padded_dense_to_jagged.py
+++ b/tests/unittest/ops/test_padded_dense_to_jagged.py
@@ -47,6 +47,7 @@ def _test_padded_dense_to_jagged(
         dtype: str = "float16",
         offsets_dtype: str = "int32",
         use_jagged_space_indexing: bool = False,
+        pass_jagged_int_var_as_total_length: bool = False,
         test_suffix: str = "",
     ):
         batch_size = jagged_max_shape[0]
@@ -95,15 +96,24 @@ def _test_padded_dense_to_jagged(
             is_input=True,
         )
 
-        JAGGED = ops.padded_dense_to_jagged(total_length=total_length_dim)(
-            x=DENSE,
-            offsets_list=OFFSETS_LIST,
-        )
         ANOTHER = ops.make_jagged(batch_dim=batch_dim, jagged_dims=jagged_dims)(
             source=SOURCE,
             offsets_list=OFFSETS_LIST,
         )
 
+        total_length_to_pass = total_length_dim
+        if pass_jagged_int_var_as_total_length:
+            # we pass JaggedIntVar as the total_length to the
+            # padded_dense_to_jagged op, as this may happen in
+            # some cases where the total_length is fetched from
+            # the shape of a jagged tensor
+            total_length_to_pass = ANOTHER._attrs["shape"][0]
+
+        JAGGED = ops.padded_dense_to_jagged(total_length=total_length_to_pass)(
+            x=DENSE,
+            offsets_list=OFFSETS_LIST,
+        )
+
         RESULT = ops.elementwise(FuncEnum.ADD)(JAGGED, ANOTHER)
 
         RESULT._attrs["name"] = "result"
@@ -142,13 +152,13 @@ def _test_padded_dense_to_jagged(
 
     @parameterized.expand(
         [
-            param(1, "int32", [4, 3, 8], "float16"),
-            param(2, "int32", [4, 3, 4], "float16"),
-            param(3, "int32", [4, 3, 2], "float16"),
-            param(4, "int32", [4, 3, 1], "float16"),
-            param(5, "int64", [4, 3, 4], "float32"),
-            param(6, "int64", [4, 3, 2], "float32"),
-            param(7, "int64", [4, 3, 1], "float32"),
+            param(1, "int32", [4, 3, 8], "float16", False),
+            param(2, "int32", [4, 3, 4], "float16", False),
+            param(3, "int32", [4, 3, 2], "float16", False),
+            param(4, "int32", [4, 3, 1], "float16", True),
+            param(5, "int64", [4, 3, 4], "float32", False),
+            param(6, "int64", [4, 3, 2], "float32", False),
+            param(7, "int64", [4, 3, 1], "float32", True),
         ]
     )
     def test_padded_dense_to_jagged_single_offsets(
@@ -157,6 +167,7 @@ def test_padded_dense_to_jagged_single_offsets(
         offsets_dtype,
         jagged_max_shape,
         dtype,
+        pass_jagged_int_var_as_total_length,
     ):
         for use_jagged_space_indexing in [False, True]:
             self._test_padded_dense_to_jagged(
@@ -165,6 +176,7 @@ def test_padded_dense_to_jagged_single_offsets(
                 dtype=dtype,
                 offsets_dtype=offsets_dtype,
                 use_jagged_space_indexing=use_jagged_space_indexing,
+                pass_jagged_int_var_as_total_length=pass_jagged_int_var_as_total_length,
                 test_suffix=f"single_offsets_{dtype}_{i}",
             )
 

From 27ec1e1122f2888180526c6924f7da7ce5740dc7 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Fri, 14 Apr 2023 11:55:04 -0700
Subject: [PATCH 416/638] implement per op benchmark backbone (#555)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/555

Ship the module profiling utility externally

Reviewed By: wushirong

Differential Revision: D44567322

fbshipit-source-id: 4f8ca36dbdc72dfa60e667c3592d0a2bc466b994
---
 python/aitemplate/testing/__init__.py |   7 +-
 python/aitemplate/testing/profile.py  | 101 ++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 5 deletions(-)
 create mode 100644 python/aitemplate/testing/profile.py

diff --git a/python/aitemplate/testing/__init__.py b/python/aitemplate/testing/__init__.py
index 5f2eca031..746641f05 100644
--- a/python/aitemplate/testing/__init__.py
+++ b/python/aitemplate/testing/__init__.py
@@ -17,9 +17,6 @@
 """
 from aitemplate.testing import benchmark_ait, benchmark_pt
 from aitemplate.testing.detect_target import detect_target
+from aitemplate.testing.profile import profile_callable
 
-__all__ = [
-    "benchmark_pt",
-    "benchmark_ait",
-    "detect_target",
-]
+__all__ = ["benchmark_pt", "benchmark_ait", "detect_target", "profile_callable"]
diff --git a/python/aitemplate/testing/profile.py b/python/aitemplate/testing/profile.py
new file mode 100644
index 000000000..67e8dad27
--- /dev/null
+++ b/python/aitemplate/testing/profile.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Torch module profiling utility.
+"""
+import logging
+from operator import itemgetter
+from typing import Callable, List, Tuple
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def profile_callable(
+    func: Callable,
+    cache_flush_slab: torch.Tensor,
+    n_iter: int,
+) -> Tuple[List[int], List[int]]:
+    """
+    Profile the callable and return the device and wall time for each iteration.
+    We assume the iterations happen sequentially, not concurrently.
+    Example usage:
+    .. code-block:: python
+        x = torch.randn((4096, 2048), device='cuda')
+        y = torch.randn((8192, 2048), device='cuda')
+        xy = torch.empty((4096, 8192), device='cuda')
+        slab = torch.empty(40 * 1024 * 1024, dtype=torch.int8, device='cuda')
+        def _f():
+            torch.nn.functional.linear(x, y, out=xy)
+        profile_callable(_f, slab, 100)
+    Parameters
+    ----------
+    func: Callable
+        The callable to profile.
+    cache_flush_slab: torch.Tensor
+        A slab of GPU memory. We flush the device L2 cache by filling the slab.
+    n_iter: int
+        The number of iterations to call the callable.
+    Returns
+    -------
+        device_times: List[int]
+            Sum of the kernel device times (µs) for each iteration.
+        wall_times: List[int]
+            Times (µs) from the start of the first kernel
+            until the end of the last kernel for each iteration.
+    """
+    if n_iter <= 0:
+        return [], []
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CUDA],
+        record_shapes=True,
+    ) as prof:
+        for _ in range(n_iter):
+            cache_flush_slab.fill_(3.7)
+            func()
+    # log the invoked kernels
+    results = prof.key_averages().table(
+        sort_by="self_cuda_time_total",
+        max_name_column_width=None,
+        row_limit=-1,
+    )
+    logger.info(results)
+
+    events = [
+        {
+            "name": e.name,
+            "cuda_time": e.cuda_time,
+            "start": e.time_range.start,
+            "end": e.time_range.end,
+        }
+        for e in prof.events()
+        if e.cuda_time != 0
+    ]
+
+    sorted_events = sorted(events, key=itemgetter("start"))
+    assert 0 == len(sorted_events) % n_iter
+    n_groups = len(sorted_events) // n_iter
+    # in each group (corresponding to a profiling iteration),
+    # skip measuring the first kernel, which is the l2 cache flush
+    event_groups = [g[1:] for g in zip(*([iter(sorted_events)] * n_groups))]
+    logger.info(
+        f"First kernel sequence: {list(map(itemgetter('name'), event_groups[0]))}"
+    )
+    device_times = [sum(map(itemgetter("cuda_time"), g)) for g in event_groups]
+    wall_times = [
+        g[-1]["end"] - g[0]["start"] if len(g) > 0 else 0 for g in event_groups
+    ]
+    return device_times, wall_times

From c7d36cbba1ff222ade3fbfe2ad22bb7f37e14449 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 14 Apr 2023 12:07:16 -0700
Subject: [PATCH 417/638] Add SM90 kernel foundations (#575)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/575

Initial foundations are added for further support of the CUTLASS SM90 kernels. With these changes, under CUDA 12 (arch 90), the SM90 kernels will be generated, but not considered anywhere in the GEMM back-end (due to the special `GemmKind.Universal3x` not being matched against).

Reviewed By: chenyang78, tenpercent

Differential Revision: D44985884

fbshipit-source-id: 527848875f686fd582a28d7b1575734e2b1e66e6
---
 .../backend/cuda/elementwise/custom_math.cuh  | 24 +++++------
 python/aitemplate/backend/cuda/target_def.py  |  7 +++-
 python/aitemplate/backend/cuda/utils.py       | 42 +++++++++++++------
 python/aitemplate/utils/environ.py            | 21 ++++++++++
 4 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index 449dbfda8..f88a84427 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -15,12 +15,12 @@
 #ifndef CUSTOM_MATH
 #define CUSTOM_MATH
 
-#ifndef __HALF2_TO_UI
-#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
+#ifndef __TO_UI
+#define __TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
 #endif
 
-#ifndef __HALF_TO_US
-#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
+#ifndef __TO_US
+#define __TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
 #endif
 
 #define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
@@ -92,8 +92,8 @@ __device__ half2 fast_tanh(half2 x) {
     (__CUDA_ARCH__ >= 750)
 
   asm volatile("tanh.approx.f16x2 %0, %1;"
-               : "=r"(__HALF2_TO_UI(x))
-               : "r"(__HALF2_TO_UI(x)));
+               : "=r"(__TO_UI(x))
+               : "r"(__TO_UI(x)));
   return x;
 
 #else
@@ -106,9 +106,7 @@ __device__ half fast_tanh(half x) {
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
     (__CUDA_ARCH__ >= 750)
 
-  asm volatile("tanh.approx.f16 %0, %1;"
-               : "=h"(__HALF_TO_US(x))
-               : "h"(__HALF_TO_US(x)));
+  asm volatile("tanh.approx.f16 %0, %1;" : "=h"(__TO_US(x)) : "h"(__TO_US(x)));
   return x;
 
 #else
@@ -121,8 +119,8 @@ __device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
     (__CUDA_ARCH__ >= 900)
 
   asm volatile("tanh.approx.bf16x2 %0, %1;"
-               : "=r"(__HALF_TO_UI(x))
-               : "r"(__HALF_TO_UI(x)));
+               : "=r"(__TO_UI(x))
+               : "r"(__TO_UI(x)));
   return x;
 
 #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
@@ -136,9 +134,7 @@ __device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
 __device__ bfloat16 fast_tanh(bfloat16 x) {
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
     (__CUDA_ARCH__ >= 900)
-  asm volatile("tanh.approx.bf16 %0, %1;"
-               : "=h"(__HALF_TO_US(x))
-               : "h"(__HALF_TO_US(x)));
+  asm volatile("tanh.approx.bf16 %0, %1;" : "=h"(__TO_US(x)) : "h"(__TO_US(x)));
   return x;
 
 #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 898875648..d1eeeeeb2 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -355,6 +355,11 @@ def _build_compile_options(self):
                 for arg in pp_args:
                     fb_include.write(pipes.quote(arg) + "\n")
 
+            nvcc_arch = self._arch
+            if nvcc_arch == "90":
+                # required by CUTLASS SM90 TMA kernels
+                nvcc_arch = "90a"
+
             options = (
                 self.nvcc_options_json["args"]
                 + ["-I" + path for path in cutlass_path]
@@ -373,7 +378,7 @@ def _build_compile_options(self):
                     "-w",
                     "--expt-relaxed-constexpr",
                     "--use_fast_math",
-                    f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
+                    f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
                     "-Xcompiler=-Wconversion",
                     environ.get_compiler_opt_level(),
                     "-std=c++17",
diff --git a/python/aitemplate/backend/cuda/utils.py b/python/aitemplate/backend/cuda/utils.py
index 2178ea375..98dc8f9fd 100644
--- a/python/aitemplate/backend/cuda/utils.py
+++ b/python/aitemplate/backend/cuda/utils.py
@@ -18,7 +18,10 @@
 import logging
 
 from aitemplate.backend import registry
-
+from aitemplate.utils.environ import (
+    allow_cutlass_sm90_kernels,
+    force_cutlass_sm90_kernels,
+)
 from aitemplate.utils.mk_cutlass_lib.mk_cutlass_lib import mk_cutlass_lib
 
 # pylint: disable=C0103,C0415,W0707
@@ -53,16 +56,29 @@ def gen_ops(arch):
 
     args = Args(arch)
     manifest = cutlass_lib.manifest.Manifest(args)
-    try:
-        func = getattr(cutlass_lib.generator, "GenerateSM" + arch)
-        func(manifest, args.cuda_version)
-    except AttributeError as e:
-        raise NotImplementedError(
-            "Arch " + arch + " is not supported by current cutlass lib."
-        ) from e
-    try:
-        func = getattr(cutlass_lib.extra_operation, "GenerateSM" + arch)
-        func(manifest, args)
-    except AttributeError:
-        _LOGGER.warning("Arch " + arch + " is not supported by extra ops.")
+
+    if arch == "90":
+        if force_cutlass_sm90_kernels():
+            cutlass_lib.generator.GenerateSM90(manifest, cuda_version="12.0.0")
+        elif allow_cutlass_sm90_kernels():
+            cutlass_lib.generator.GenerateSM90(manifest, cuda_version="12.0.0")
+            cutlass_lib.generator.GenerateSM80(manifest, args.cuda_version)
+            cutlass_lib.extra_operation.GenerateSM80(manifest, args)
+        else:
+            cutlass_lib.generator.GenerateSM80(manifest, args.cuda_version)
+            cutlass_lib.extra_operation.GenerateSM80(manifest, args)
+    else:
+        try:
+            func = getattr(cutlass_lib.generator, "GenerateSM" + arch)
+            func(manifest, args.cuda_version)
+        except AttributeError as e:
+            raise NotImplementedError(
+                "Arch " + arch + " is not supported by current cutlass lib."
+            ) from e
+        try:
+            func = getattr(cutlass_lib.extra_operation, "GenerateSM" + arch)
+            func(manifest, args)
+        except AttributeError:
+            _LOGGER.warning("Arch " + arch + " is not supported by extra ops.")
+
     return manifest.operations
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 8cd67b6e3..36793d005 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -117,3 +117,24 @@ def ait_build_cache_max_mb() -> int:
     be skipped. Defaults to 30.
     """
     return int(os.environ.get("AIT_BUILD_CACHE_MAX_MB", "30"))
+
+
+def allow_cutlass_sm90_kernels() -> bool:
+    """
+    Whether the SM90 CUTLASS kernels should to be considered
+    alongside the SM80 CUTLASS kernels on the CUDA arch 90
+    (for the CUDA back-end of the GEMM ops). Default: False.
+    """
+    return (
+        force_cutlass_sm90_kernels()
+        or os.getenv("AIT_ALLOW_CUTLASS_SM90_KERNELS", "0") == "1"
+    )
+
+
+def force_cutlass_sm90_kernels() -> bool:
+    """
+    Whether only the SM90 CUTLASS kernels (and not the SM80 ones)
+    should be considered on the CUDA arch 90 (for the CUDA
+    back-end of the GEMM ops). Default: False.
+    """
+    return os.getenv("AIT_FORCE_CUTLASS_SM90_KERNELS", "0") == "1"

From 0e71c60c923c1a1200d21a5b544d657f7e656329 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 14 Apr 2023 12:55:51 -0700
Subject: [PATCH 418/638] add reshape for `to` converter (#576)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/576

We suppose to bypass this op but in extreme case like
a = placeholder(); return a.to()

It introduces a node in AIT graph which has is_input=True and is_output=True. The node name is output_xx
fx2ait throws error when doing the input name binding. So we add an extra reshape layer here which brings no computation.

Reviewed By: hl475, chenyang78

Differential Revision: D44991256

fbshipit-source-id: afc951c23d205351166a0407ff4b9f218b075bff
---
 fx2ait/fx2ait/converters/ait_converters.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index b414ac1fe..ec9db5fa6 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1538,7 +1538,10 @@ def acc_ops_contiguous(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    return kwargs["input"]
+    # Add a reshape. The reason is listed in acc_ops_to_dtype
+    input_val = kwargs["input"]
+    reshape_shape = size()(input_val)
+    return reshape()(input_val, reshape_shape)
 
 
 @ait_converter(acc_ops.to_dtype)
@@ -1548,7 +1551,13 @@ def acc_ops_to_dtype(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    return kwargs["input"]
+    # We suppose to bypass this op but in extreme case like
+    # a = placeholder(); return a.to()
+    # It introduces a node in AIT graph which has is_input=True and is_output=True. The node name is output_xx
+    # fx2ait throws error when doing the input name binding. So we add an extra reshape layer which brings no compuation
+    input_val = kwargs["input"]
+    reshape_shape = size()(input_val)
+    return reshape()(input_val, reshape_shape)
 
 
 @ait_converter(acc_ops.gelu)

From a4ba50a6acdf73a773b5b6a1ca0495fa0eb13e3b Mon Sep 17 00:00:00 2001
From: Zhengkai Zhang <zzhengkai@meta.com>
Date: Fri, 14 Apr 2023 16:14:06 -0700
Subject: [PATCH 419/638] Allow split without dim as input (#578)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/578

To allow split op with no dim as input

Reviewed By: qxy11, Gavin-Cheng

Differential Revision: D45011263

fbshipit-source-id: 45f57064588245ad3bfe73dcbba5a92d07eb3bc3
---
 fx2ait/fx2ait/acc_tracer/ait_acc_ops.py       |  6 ++-
 fx2ait/fx2ait/converters/ait_converters.py    |  3 ++
 .../fx2ait/test/converters/test_ait_split.py  | 40 +++++++++++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
index f162520e5..56ceee311 100644
--- a/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/ait_acc_ops.py
@@ -18,13 +18,15 @@
 
 from fx2ait.acc_tracer.ait_acc_ops_registry import ait_register_acc_op_mapping
 
+this_arg_is_optional: bool = True
+
 
 @ait_register_acc_op_mapping(
     op_and_target=("call_method", "split"),
     arg_replacement_tuples=[
         ("tensor", "input"),
         ("split_size_or_sections", "split_size_or_sections"),
-        ("dim", "dim"),
+        ("dim", "dim", this_arg_is_optional),
     ],
 )
 @ait_register_acc_op_mapping(
@@ -32,7 +34,7 @@
     arg_replacement_tuples=[
         ("tensor", "input"),
         ("split_size_or_sections", "split_size_or_sections"),
-        ("dim", "dim"),
+        ("dim", "dim", this_arg_is_optional),
     ],
 )
 @register_acc_op
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index ec9db5fa6..d3f6e6716 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1081,6 +1081,9 @@ def ait_acc_ops_split(
             f"Unexpected value for split_size_or_sections in {name}: {split_size_or_sections}"
         )
 
+    if "dim" not in kwargs:
+        return split()(input_val, split_size_or_sections)
+
     dim = kwargs["dim"]
     if not isinstance(dim, int):
         raise ValueError(f"Unexpected value for dim in {name}: {dim}")
diff --git a/fx2ait/fx2ait/test/converters/test_ait_split.py b/fx2ait/fx2ait/test/converters/test_ait_split.py
index 6930d5425..ab0d81032 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_split.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_split.py
@@ -64,6 +64,46 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         ]
         self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
 
+    @parameterized.expand(
+        [
+            [[2, 10], [2, 3, 5]],
+            [[2, 10], 2],
+            [[2, 10], 3],
+        ]
+    )
+    def test_tensor_split_with_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return x.split(split_size_or_sections, dim=1)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
+    @parameterized.expand(
+        [
+            [[10], [2, 3, 5]],
+            [[10], 2],
+            [[10], 3],
+        ]
+    )
+    def test_tensor_split_without_dim(
+        self, input_shape: List[int], split_size_or_sections: Union[int, List[int]]
+    ) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return x.split(split_size_or_sections)
+
+        model = TestModule().cuda()
+        inputs = [
+            torch.randn(*input_shape).half().cuda(),
+        ]
+        self.run_test(model, inputs, expected_ops={ait_acc_ops.split})
+
     def test_with_dim_dynamic_shape(self) -> None:
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:

From 7fa1852ed348351cab0ad2ff5d96e3b6a6999f59 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Sat, 15 Apr 2023 14:14:54 -0700
Subject: [PATCH 420/638] Replace infer_shape for split op. (#568)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/568

Replace infer_shape for split op with symbolic shape.

Reviewed By: frank-wei

Differential Revision: D44906484

fbshipit-source-id: 48215334ef0df7e0535d9d909a9e799362c66ec6
---
 .../aitemplate/compiler/ops/tensor/split.py   | 34 ++-----------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/split.py b/python/aitemplate/compiler/ops/tensor/split.py
index d175099cb..e94dd2cf2 100644
--- a/python/aitemplate/compiler/ops/tensor/split.py
+++ b/python/aitemplate/compiler/ops/tensor/split.py
@@ -15,13 +15,11 @@
 """
 Split.
 """
-import itertools
 from typing import List, Sequence, Union
 
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
-from aitemplate.utils import shape_utils
 from aitemplate.utils.tensor_utils import wrap_dim
 
 # pylint: disable=C0103,W0221
@@ -77,37 +75,11 @@ def _infer_shapes(
                 f"sum of split_sizes ({split_sizes}) does not match split_dim_size ({split_dim_size})"
             )
 
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        y_shapes = []
-        for x_shape_vals in x_shapes:
-            y_shape = [list(x_shape_vals) for _ in range(num_splits)]
-            for split_size, shape in zip(split_sizes, y_shape):
-                shape[dim] = split_size
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
         output_shapes = []
-        for idx, shapes in enumerate(zip(*y_shapes)):
-            assert all(split_sizes[idx] == dims[dim] for dims in shapes)
-            output_shape = []
-            for i in range(len(shapes[0])):
-                dim_vals = unique(dims[i] for dims in shapes)
-                # propagate the name of each non-split-dim dynamic axis, which
-                # may be used later by some shape checks.
-                if i != dim:
-                    new_dim_val = shape_utils.gen_int_var(
-                        dim_vals, x_shape[i]._attrs["name"]
-                    )
-                else:
-                    # FIXME: we might want to create a new unique name for this
-                    # new_dim_val. We would do this once we have a mechanism
-                    # to create a unique dim name
-                    new_dim_val = shape_utils.gen_int_var(dim_vals)
-                output_shape.append(new_dim_val)
+        for split_size in split_sizes:
+            output_shape = x_shape[:dim] + [IntImm(split_size)] + x_shape[dim + 1 :]
             output_shapes.append(output_shape)
+
         return output_shapes
 
     def __call__(self, x: Tensor, split_size_or_sections, dim=0) -> List[Tensor]:

From c27702d84ab23ffef3b487ebd213f6f70a840a83 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Sat, 15 Apr 2023 14:23:25 -0700
Subject: [PATCH 421/638] Add Identity op. (#579)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/579

Add Identity op in AIT.
This ops provides a way to "duplicate" tensors.
Possible usage include, but not limited to wanting to have 2 names for 1 tensor.

The backend does a memcpy if the resulting tensor is an output. Otherwise, the implementation is only a view.

Reviewed By: chenyang78

Differential Revision: D44993567

fbshipit-source-id: 37368398e68d8b527c33a9f13f141566e2a77c30
---
 .../backend/common/tensor/identity_common.py  | 155 ++++++++++++++++++
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../backend/cuda/tensor/identity.py           |  77 +++++++++
 .../backend/rocm/tensor/__init__.py           |   1 +
 .../backend/rocm/tensor/identity.py           |  77 +++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/identity.py           |  56 +++++++
 tests/unittest/ops/test_identity.py           |  86 ++++++++++
 8 files changed, 455 insertions(+)
 create mode 100644 python/aitemplate/backend/common/tensor/identity_common.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/identity.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/identity.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/identity.py
 create mode 100644 tests/unittest/ops/test_identity.py

diff --git a/python/aitemplate/backend/common/tensor/identity_common.py b/python/aitemplate/backend/common/tensor/identity_common.py
new file mode 100644
index 000000000..a5fcebf6a
--- /dev/null
+++ b/python/aitemplate/backend/common/tensor/identity_common.py
@@ -0,0 +1,155 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+identity kernel codegen.
+"""
+
+from typing import Any, Dict
+
+import jinja2
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.dtype import get_dtype_size
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{func_signature}}
+{
+{% if is_copy %}
+    {{prefix}}MemcpyAsync(*output, input, size, {{prefix}}MemcpyDeviceToDevice, stream);
+{% else %}
+    *output = input;
+{% endif %}
+}
+    """
+)
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void** output, void* input, size_t size, {{prefix}}Stream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}   &{{output}},
+{{indent}}   {{input}},
+{{indent}}   {{size}},
+{{indent}}   stream
+{{indent}});
+    """
+)
+
+
+def gen_function(func_attrs: Dict[str, Any], backend_spec) -> str:
+    """Generates function.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    header_files : str
+        Includes the header files for a backend.
+    backend_spec : class
+        Specifies the backend configurations.
+
+    Returns
+    -------
+    str
+        Rendered function.
+    """
+    is_copy = func_attrs["outputs"][0]._attrs["is_output"]
+
+    return FUNC_TEMPLATE.render(
+        func_signature=FUNC_SIGNATURE.render(
+            func_name=func_attrs["name"],
+            prefix=backend_spec.prefix,
+        ),
+        prefix=backend_spec.prefix,
+        is_copy=is_copy,
+    )
+
+
+def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str:
+    """Generates function decl.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    backend_spec : class
+        Specifies the backend configurations.
+
+    Returns
+    -------
+    str
+        Rendered function decl.
+    """
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(
+            func_name=func_attrs["name"],
+            prefix=backend_spec.prefix,
+        ),
+    ).strip()
+
+
+def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") -> str:
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    backend_spec : class
+        Specifies the backend configurations.
+    indent : str, optional
+        Indent for template, by default "  ".
+
+    Returns
+    -------
+    str
+        Rendered function call.
+    """
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) == 1
+
+    input_name = func_attrs["inputs"][0]._attrs["name"]
+
+    output_node = func_attrs["outputs"][0]
+    output_name = output_node._attrs["name"]
+    shape = ["1"]
+    for dim in output_node._attrs["shape"]:
+        if isinstance(dim, IntImm):
+            shape.append(str(dim._attrs["values"][0]))
+        else:
+            shape.append(dim._attrs["name"])
+    shape = "*".join(shape)
+    size = f"{shape} * {get_dtype_size(output_node._attrs['dtype'])}"
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        input=input_name,
+        size=size,
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index 6b08eec93..ab5f5ffe8 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -24,6 +24,7 @@
     expand,
     full,
     gather,
+    identity,
     jagged_to_padded_dense,
     masked_select,
     padded_dense_to_jagged,
@@ -47,6 +48,7 @@
     "expand",
     "full",
     "gather",
+    "identity",
     "jagged_to_padded_dense",
     "masked_select",
     "padded_dense_to_jagged",
diff --git a/python/aitemplate/backend/cuda/tensor/identity.py b/python/aitemplate/backend/cuda/tensor/identity.py
new file mode 100644
index 000000000..08c70b346
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/identity.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA identity function
+"""
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.tensor import identity_common
+
+
+@registry.reg("cuda.identity.func_decl")
+def gen_function_decl(func_attrs):
+    """Generate function declaration.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    Returns
+    -------
+    str
+        Rendered function declaration.
+    """
+    return identity_common.gen_function_decl(
+        func_attrs=func_attrs, backend_spec=CUDASpec()
+    )
+
+
+@registry.reg("cuda.identity.gen_function")
+def gen_function(func_attrs):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+
+    Returns
+    -------
+    str
+        Rendered function body.
+    """
+    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=CUDASpec())
+
+
+@registry.reg("cuda.identity.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for template, by default "  ".
+
+    Returns
+    -------
+    str
+        Rendered function call.
+    """
+    return identity_common.gen_function_call(
+        func_attrs=func_attrs, backend_spec=CUDASpec()
+    )
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 90181b170..2993c648c 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -21,6 +21,7 @@
     concatenate,
     concatenate_tanh,
     dynamic_slice,
+    identity,
     permute021,
     permute0213,
     permute102,
diff --git a/python/aitemplate/backend/rocm/tensor/identity.py b/python/aitemplate/backend/rocm/tensor/identity.py
new file mode 100644
index 000000000..9bbab569a
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/identity.py
@@ -0,0 +1,77 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+ROCM identity function
+"""
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.common.tensor import identity_common
+
+
+@registry.reg("rocm.identity.func_decl")
+def gen_function_decl(func_attrs):
+    """Generate function declaration.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    Returns
+    -------
+    str
+        Rendered function declaration.
+    """
+    return identity_common.gen_function_decl(
+        func_attrs=func_attrs, backend_spec=ROCMSpec()
+    )
+
+
+@registry.reg("rocm.identity.gen_function")
+def gen_function(func_attrs):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+
+    Returns
+    -------
+    str
+        Rendered function body.
+    """
+    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=ROCMSpec())
+
+
+@registry.reg("rocm.identity.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict[str, Any]
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for template, by default "  ".
+
+    Returns
+    -------
+    str
+        Rendered function call.
+    """
+    return identity_common.gen_function_call(
+        func_attrs=func_attrs, backend_spec=ROCMSpec()
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 755710886..569a82aef 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -25,6 +25,7 @@
 from aitemplate.compiler.ops.tensor.expand import expand
 from aitemplate.compiler.ops.tensor.full import full
 from aitemplate.compiler.ops.tensor.gather import gather
+from aitemplate.compiler.ops.tensor.identity import identity
 from aitemplate.compiler.ops.tensor.jagged_to_padded_dense import jagged_to_padded_dense
 from aitemplate.compiler.ops.tensor.masked_select import masked_select
 from aitemplate.compiler.ops.tensor.padded_dense_to_jagged import padded_dense_to_jagged
diff --git a/python/aitemplate/compiler/ops/tensor/identity.py b/python/aitemplate/compiler/ops/tensor/identity.py
new file mode 100644
index 000000000..c6c691f54
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/identity.py
@@ -0,0 +1,56 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+identity op
+"""
+from typing import List
+
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+
+class identity(Operator):
+    """
+    Returns the input tensor. This could be useful for only name changes etc.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "identity"
+
+    def _infer_shapes(self, x: Tensor) -> List[IntVar]:
+        return x.shape()
+
+    def __call__(self, x: Tensor) -> Tensor:
+        self._attrs["inputs"] = [x]
+        self._set_depth()
+
+        output_shapes = self._infer_shapes(x)
+        output = Tensor(output_shapes, src_ops={self})
+        self._attrs["outputs"] = [output]
+        output._attrs["dtype"] = x.dtype()
+
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = "{target}.{op}.gen_function".format(
+            target=target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(
+            self._attrs,
+        )
diff --git a/tests/unittest/ops/test_identity.py b/tests/unittest/ops/test_identity.py
new file mode 100644
index 000000000..a690750d7
--- /dev/null
+++ b/tests/unittest/ops/test_identity.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.public import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import param, parameterized
+
+
+class TestIdentity(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_identity(
+        self,
+        shape,
+        elementwise,
+        dtype="float16",
+        test_name="identity",
+    ) -> None:
+        X = Tensor(
+            shape=shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+        Y = ops.identity()(X)
+        if elementwise:
+            Y = ops.elementwise(FuncEnum.ADD)(X, Y)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self.assertEqual(len(module.debug_sorted_graph), 3 if elementwise else 2)
+        self._test_id += 1
+
+        x_pt = get_random_torch_tensor(shape, dtype=dtype)
+        if elementwise:
+            y_pt = 2 * x_pt
+        else:
+            y_pt = x_pt
+
+        y = torch.empty_like(y_pt)
+
+        module.run_with_tensors([x_pt], [y])
+        torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        [
+            param(1, [3, 4], True, "float16"),
+            param(2, [3, 4], True, "float32"),
+            param(3, [3, 4], False, "float16"),
+            param(4, [3, 4], False, "float32"),
+        ]
+    )
+    def test_identity(self, i, shape, elementwise, dtype):
+        self._test_identity(
+            shape=shape,
+            elementwise=elementwise,
+            dtype=dtype,
+            test_name=f"test_identity_{i}",
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From b0f91168cd7ea70016efb73e56b82e225ec174d7 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Mon, 17 Apr 2023 04:15:55 -0700
Subject: [PATCH 422/638] Build cache: Do not cache failed builds (#586)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/586

Currently the build cache is also caching the results of failed builds. While not neccessarily wrong, the fact that build errors are not shown on repeated invocation is both irritating and a problem when attempting to pinpoint errors.

This is a minor code change which fixes that.

Reviewed By: aakhundov

Differential Revision: D45043453

fbshipit-source-id: f44fd399aa24f45178f89b667218d042cd966676
---
 python/aitemplate/backend/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 66244b127..8c1b44166 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -162,7 +162,7 @@ def _run_make_cmds(cmds, timeout, build_dir, allow_cache=True):
         )
         try:
             out, err = proc.communicate(timeout)
-            if store_cache_key is not None:
+            if proc.returncode == 0 and store_cache_key is not None:
                 build_cache.BUILD_CACHE.store_build_cache(
                     cmds, build_dir, store_cache_key
                 )

From 3b537f9db81ecce0994248e3abc37708fe6fce54 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Mon, 17 Apr 2023 12:09:02 -0700
Subject: [PATCH 423/638] Add BatchNorm FE module (#587)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/587

Adds AIT BatchNorm1d, BatchNorm2d, and BatchNorm3d FE module

Reviewed By: terrychenism

Differential Revision: D44922251

fbshipit-source-id: d2ae030fea244a1a398609209b52798d4444201c
---
 python/aitemplate/frontend/nn/batch_norm.py | 142 ++++++++++++++++++++
 tests/unittest/ops/test_batch_norm.py       |  99 ++++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/batch_norm.py
 create mode 100644 tests/unittest/ops/test_batch_norm.py

diff --git a/python/aitemplate/frontend/nn/batch_norm.py b/python/aitemplate/frontend/nn/batch_norm.py
new file mode 100644
index 000000000..d2440f178
--- /dev/null
+++ b/python/aitemplate/frontend/nn/batch_norm.py
@@ -0,0 +1,142 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Frontend for attention module
+"""
+from aitemplate.compiler.public import elementwise, FuncEnum, permute
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
+
+
+class _BatchNorm(Module):
+    """BatchNorm nn module"""
+
+    def __init__(
+        self,
+        num_features,
+        eps=1e-5,
+        dtype="float16",
+        **kwargs,
+    ):
+        super().__init__()
+        self.dim = (num_features,)
+        self.dtype = dtype
+        self.num_features = num_features
+        self.eps = eps
+        self.weight = Parameter(shape=self.dim, name="weight", dtype=dtype)
+        self.bias = Parameter(shape=self.dim, name="bias", dtype=dtype)
+        self.running_mean = Parameter(shape=self.dim, name="running_mean", dtype=dtype)
+        self.running_var = Parameter(shape=self.dim, name="running_var", dtype=dtype)
+
+    def forward(self, *args):
+        assert len(args) == 1
+        x = args[0]
+        self._check_input_dim(x)
+        x = self._convert_input(x)
+
+        x_normalized = elementwise(FuncEnum.DIV)(
+            elementwise(FuncEnum.SUB)(x, self.running_mean.tensor()),
+            elementwise(FuncEnum.SQRT)(
+                elementwise(FuncEnum.ADD)(self.running_var.tensor(), self.eps)
+            ),
+        )
+
+        y = elementwise(FuncEnum.ADD)(
+            elementwise(FuncEnum.MUL)(self.weight.tensor(), x_normalized),
+            self.bias.tensor(),
+        )
+
+        y = self._convert_output(y)
+        return y
+
+    def _check_input_dim(self):
+        raise NotImplementedError()
+
+    def _convert_input(self):
+        raise NotImplementedError()
+
+    def _convert_output(self):
+        raise NotImplementedError()
+
+
+class BatchNorm1d(_BatchNorm):
+    def __init__(
+        self,
+        num_features,
+        eps=1e-5,
+        dtype="float16",
+        **kwargs,
+    ):
+        super().__init__(num_features, eps, dtype, **kwargs)
+
+    def _check_input_dim(self, x):
+        if len(x.shape()) != 2 and len(x.shape()) != 3:
+            raise ValueError(
+                "expected 2D or 3D input (got {}D input)".format(x.shape())
+            )
+
+    def _convert_input(self, x):
+        if len(x.shape()) == 3:
+            return permute()(x, [0, 2, 1])
+        else:
+            return x
+
+    def _convert_output(self, y):
+        if len(y.shape()) == 3:
+            return permute()(y, [0, 2, 1])
+        else:
+            return y
+
+
+class BatchNorm2d(_BatchNorm):
+    def __init__(
+        self,
+        num_features,
+        eps=1e-5,
+        dtype="float16",
+        **kwargs,
+    ):
+        super().__init__(num_features, eps, dtype, **kwargs)
+
+    def _check_input_dim(self, x):
+        if len(x.shape()) != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(x.shape()))
+
+    def _convert_input(self, x):
+        return permute()(x, [0, 2, 3, 1])
+
+    def _convert_output(self, y):
+        return permute()(y, [0, 3, 1, 2])
+
+
+class BatchNorm3d(_BatchNorm):
+    def __init__(
+        self,
+        num_features,
+        eps=1e-5,
+        dtype="float16",
+        **kwargs,
+    ):
+        super().__init__(num_features, eps, dtype, **kwargs)
+
+    def _check_input_dim(self, x):
+        if len(x.shape()) != 5:
+            raise ValueError("expected 5D input (got {}D input)".format(x.shape()))
+
+    def _convert_input(self, x):
+        return permute()(x, [0, 2, 3, 4, 1])
+
+    def _convert_output(self, y):
+        return permute()(y, [0, 4, 1, 2, 3])
diff --git a/tests/unittest/ops/test_batch_norm.py b/tests/unittest/ops/test_batch_norm.py
new file mode 100644
index 000000000..81c341844
--- /dev/null
+++ b/tests/unittest/ops/test_batch_norm.py
@@ -0,0 +1,99 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model
+
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn import batch_norm
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class BatchnormTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(BatchnormTestCase, self).__init__(*args, **kwargs)
+        self.test_id = 0
+
+    def _test_batchnorm(
+        self,
+        num_features,
+        bn_op,
+        input_shape,
+        input_type="float16",
+    ):
+        pt_op = getattr(torch.nn, bn_op)(num_features).cuda().half().eval()
+        ait_op = getattr(batch_norm, bn_op)(num_features, eps=pt_op.eps)
+
+        pt_params = dict(pt_op.named_parameters())
+        pt_buffers = dict(pt_op.named_buffers())
+        params_ait = {}
+        for key, arr in pt_params.items():
+            print(key, arr.shape)
+            params_ait[key] = arr
+        for key, arr in pt_buffers.items():
+            print(key, arr.shape)
+            if key in ["running_mean", "running_var"]:
+                params_ait[key] = arr
+
+        X_pt = get_random_torch_tensor(input_shape, input_type)
+        Y_pt = pt_op(X_pt)
+        X_ait = Tensor(
+            shape=input_shape, dtype=input_type, name="input0", is_input=True
+        )
+        Y_ait = ait_op(X_ait)
+
+        Ys_ait = [var._attrs["values"][0] for var in Y_ait._attrs["shape"]]
+        self.assertEqual(list(Y_pt.shape), Ys_ait)
+
+        Y_ait._attrs["is_output"] = True
+        Y_ait._attrs["name"] = "output"
+
+        target = detect_target()
+        module = compile_model(Y_ait, target, "./tmp", f"batch_norm_{self.test_id}")
+        for name, weight in params_ait.items():
+            module.set_constant_with_tensor(name, weight)
+
+        y = get_torch_empty_tensor(Ys_ait, dtype=input_type)
+        inputs = {"input0": X_pt}
+        module.run_with_tensors(inputs, [y])
+
+        print(f"PT output: {Y_pt=}")
+        print(f"AIT output: {y=}")
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-5, rtol=1e-5))
+
+    def test_batch_norm(self):
+        self._test_batchnorm(num_features=3, bn_op="BatchNorm1d", input_shape=[5, 3])
+        self._test_batchnorm(
+            num_features=3, bn_op="BatchNorm1d", input_shape=[5, 3, 234]
+        )
+        self._test_batchnorm(
+            num_features=3, bn_op="BatchNorm2d", input_shape=[1, 3, 244, 244]
+        )
+        self._test_batchnorm(
+            num_features=6, bn_op="BatchNorm3d", input_shape=[4, 6, 24, 24, 11]
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4d6d4695440a4868f7eaa2e736cbaa0e215b200a Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Mon, 17 Apr 2023 15:13:41 -0700
Subject: [PATCH 424/638] add identity for `to` and `contiguous` (#588)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/588

as titled

Reviewed By: qxy11, wushirong

Differential Revision: D45055690

fbshipit-source-id: 3930eec56f56bd9253c7aeac9562d8131493710f
---
 fx2ait/fx2ait/converters/ait_converters.py    | 10 ++++----
 .../test/converters/test_ait_unary_ops.py     | 24 +++++++++++++++++++
 python/aitemplate/compiler/public/__init__.py |  1 +
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index d3f6e6716..88333df63 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -40,6 +40,7 @@
     gemm_rrr,
     getitem,
     group_norm,
+    identity,
     IntImm,
     IntVar,
     IntVarTensor,
@@ -1541,10 +1542,8 @@ def acc_ops_contiguous(
     kwargs: Dict[str, Argument],
     name: str,
 ) -> ConverterOutput:
-    # Add a reshape. The reason is listed in acc_ops_to_dtype
     input_val = kwargs["input"]
-    reshape_shape = size()(input_val)
-    return reshape()(input_val, reshape_shape)
+    return identity()(input_val)
 
 
 @ait_converter(acc_ops.to_dtype)
@@ -1557,10 +1556,9 @@ def acc_ops_to_dtype(
     # We suppose to bypass this op but in extreme case like
     # a = placeholder(); return a.to()
     # It introduces a node in AIT graph which has is_input=True and is_output=True. The node name is output_xx
-    # fx2ait throws error when doing the input name binding. So we add an extra reshape layer which brings no compuation
+    # fx2ait throws error when doing the input name binding. So we need an identity layer.
     input_val = kwargs["input"]
-    reshape_shape = size()(input_val)
-    return reshape()(input_val, reshape_shape)
+    return identity()(input_val)
 
 
 @ait_converter(acc_ops.gelu)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index a743000c1..ffdbd06f7 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -65,3 +65,27 @@ def forward(self, y):
         ]
 
         self.run_test(model, inputs, expected_ops={})
+
+    def test_to(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, y):
+                return y.to(dtype=torch.float16)
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.to_dtype})
+
+    def test_contiguous(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, y):
+                return y.contiguous()
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn(1, 2, 3).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.contiguous})
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index 9b8be9e6c..d4894e027 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -80,6 +80,7 @@
 from aitemplate.compiler.ops.tensor.dynamic_slice import dynamic_slice
 from aitemplate.compiler.ops.tensor.expand import expand
 from aitemplate.compiler.ops.tensor.full import full
+from aitemplate.compiler.ops.tensor.identity import identity
 from aitemplate.compiler.ops.tensor.permute import permute
 from aitemplate.compiler.ops.tensor.split import split
 

From 535d064533c5f67999bb6449a5dc613f7e7e8a79 Mon Sep 17 00:00:00 2001
From: Oleg Khabinov <khabinov@meta.com>
Date: Mon, 17 Apr 2023 18:52:53 -0700
Subject: [PATCH 425/638] In-place weight updates tracing (#569)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/569

Reviewed By: wfanzju, wushirong

Differential Revision: D44942651

fbshipit-source-id: af5b6c66d578d7460e297309bcc21920a632489a
---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 0193c3c2b..b2a636637 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -574,6 +574,8 @@ void AITModelImpl::setDeserializePickledModel(bool deserializePickledModel) {
 // will not take effect until swapConstants is being called
 void AITModelImpl::updateConstantsWithWeights(
     const std::unordered_map<std::string, torch::Tensor>& weights) {
+  RECORD_USER_SCOPE("AITModel::updateConstantsWithWeights");
+
   TORCH_CHECK(
       getNumConstantsFunc_,
       "getNumConstantsFunc_ not loaded, can not do in place update");

From 5c4800ed069806f3b7eb6368b0a112825ecb926e Mon Sep 17 00:00:00 2001
From: Fei Kou <feikou@meta.com>
Date: Mon, 17 Apr 2023 20:04:03 -0700
Subject: [PATCH 426/638] Remove hardcoded dtype float16 in acc_ops_clone
 (#585)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/585

Now that bfloat16 and float32 are also supported, we don't need to hardcode to float16.

Reviewed By: tenpercent

Differential Revision: D45028307

fbshipit-source-id: 2fb3d2d22f2bb3761505cecf1d7f220497e529cc
---
 fx2ait/fx2ait/converters/ait_converters.py    |  6 ++-
 .../test/converters/test_ait_unary_ops.py     | 49 ++++++++++++++++---
 fx2ait/fx2ait/tools/common_fx2ait.py          | 25 ++++++++--
 3 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 88333df63..070e001fc 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -210,7 +210,9 @@ def acc_ops_clone(
     input_val = kwargs["input"]
     # deepcopy results with an error. replace with Idnetity multiplication by 1.
     # TODO: implement __deepcopy__ / clone for AITTensor.
-    one_const = AITTensor(shape=[], dtype="float16", name="one_const", value=1.0)
+    one_const = AITTensor(
+        shape=[], dtype=input_val.dtype(), name="one_const", value=1.0
+    )
     identity_mul_result = elementwise(FuncEnum.MUL)(input_val, one_const)
     return identity_mul_result
 
@@ -1642,7 +1644,7 @@ def acc_ops_neg(
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     new_kwargs = kwargs.copy()
     dt = new_kwargs["input"]._attrs["dtype"]
-    if dt == "float16" or dt == "float32":
+    if dt == "float16" or dt == "float32" or dt == "bfloat16":
         new_kwargs["other"] = float(-1)
     elif dt == "int32" or dt == "int64":
         new_kwargs["other"] = int(-1)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index ffdbd06f7..5c8f1c509 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -12,12 +12,18 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import itertools
 import math
-from typing import Callable
+from typing import Callable, Dict, Set
 
 import torch
+from aitemplate.testing.test_utils import filter_test_cases_by_params, TestEnv
 from fx2ait.acc_tracer import acc_ops
-from fx2ait.tools.common_fx2ait import AITTestCase
+from fx2ait.tools.common_fx2ait import (
+    AITTestCase,
+    lower_precision_to_torch_type,
+    LowerPrecision,
+)
 from parameterized import parameterized
 
 
@@ -33,21 +39,48 @@
     (torch.neg, acc_ops.neg),
 ]
 
+TestEnvToPrecision: Dict[TestEnv, Set[LowerPrecision]] = {
+    TestEnv.CUDA_LESS_THAN_SM80: [LowerPrecision.FP16, LowerPrecision.FP32],
+    TestEnv.CUDA_SM80: [LowerPrecision.BF16],
+    TestEnv.ROCM: [LowerPrecision.FP16],
+}
+
 
 class TestUnaryOpsConverter(AITTestCase):
-    @parameterized.expand([(op[0].__name__, op[0], op[1]) for op in unary_ops])
-    def test_unary_ops(self, name, orig_op: Callable, expected_op):
+    @parameterized.expand(
+        filter_test_cases_by_params(
+            {
+                env: [
+                    (
+                        f"{env}_{op[0].__name__}_{precision.value}",
+                        op[0],
+                        op[1],
+                        precision,
+                    )
+                    for op, precision in itertools.product(unary_ops, precisions)
+                ]
+                for env, precisions in TestEnvToPrecision.items()
+            }
+        )
+    )
+    def test_unary_ops(
+        self, name: str, orig_op: Callable, expected_op, precision: LowerPrecision
+    ):
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return orig_op(x) * 2
+                return orig_op(x) * 2.0
 
-        model = TestModule().cuda().half()
+        torch_dtype = lower_precision_to_torch_type(precision)
+        model = TestModule().cuda().to(torch_dtype)
         inputs = [
-            torch.randn(1, 2, 3).half().cuda(),
+            torch.randn(1, 2, 3).cuda().to(torch_dtype),
         ]
 
         self.run_test(
-            model, inputs, expected_ops={expected_op} if expected_op is not None else {}
+            model,
+            inputs,
+            expected_ops={expected_op} if expected_op is not None else {},
+            precision=precision,
         )
 
     def test_sqrt(self):
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index bc99d258a..7ed11362b 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -42,9 +42,25 @@
 class LowerPrecision(Enum):
     FP32 = "fp32"
     FP16 = "fp16"
+    BF16 = "bf16"
     INT8 = "int8"
 
 
+def lower_precision_to_torch_type(
+    precision: LowerPrecision,
+) -> torch.dtype:
+    if precision == LowerPrecision.FP16:
+        return torch.float16
+    elif precision == LowerPrecision.BF16:
+        return torch.bfloat16
+    elif precision == LowerPrecision.FP32:
+        return torch.float
+    elif precision == LowerPrecision.INT8:
+        return torch.int8
+    else:
+        raise ValueError(f"Unsupported precision: {precision}")
+
+
 def fetch_attr(mod, target):
     """
     Fetch an attribute from the ``Module`` hierarchy of ``mod.module``.
@@ -113,8 +129,9 @@ def run_test(
         if permute_inputs:
             inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
 
-        mod.half()
-        inputs = [inp.half().contiguous() for inp in inputs]
+        torch_dtype = lower_precision_to_torch_type(precision)
+        mod.to(torch_dtype)
+        inputs = [inp.to(torch_dtype).contiguous() for inp in inputs]
         interp = AITInterpreter(
             mod,
             inputs,
@@ -146,7 +163,7 @@ def run_test(
                         interp_result.engine.lib_path,
                         interp_result.input_names,
                         interp_result.output_names,
-                        torch.float16,
+                        torch_dtype,
                         torch.float,
                         1,  #  num_runtimes
                     ),
@@ -158,7 +175,7 @@ def run_test(
                         interp_result.engine.lib_path,
                         interp_result.input_names,
                         interp_result.output_names,
-                        torch.float16,
+                        torch_dtype,
                         torch.float,
                         1,  #  num_runtimes
                     ),

From 55e150f83a8b149bc70f7197376af804c8401c16 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Tue, 18 Apr 2023 21:28:13 -0700
Subject: [PATCH 427/638] Fix acc_ops converter on std when keepdim=False
 (#593)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/593

As titled. std in fx2ait is represented by combinations of arithmetic ops and 2 reduce ops.:
```
Y = sqrt( mean(pow(X-mean(X))) )
```

However issue occurs when the first deduce op uses `keepdim=False`. It caused issue with subsequent sub op where X has old dimension while mean(X) only has 1 dim less than that.

Also, it seems there is no unittest for acc_ops when tracing std. This diff added the test

Reviewed By: amateurcoffee

Differential Revision: D45105942

fbshipit-source-id: 04f9c1105a2a6a711d025d5c85b95147343d0ecd
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py           |  6 +++---
 .../fx2ait/test/converters/test_ait_reduce.py | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 963b7b975..4ef422ce4 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -1382,7 +1382,7 @@ def std_mapper(node, mod):
         mean_kwargs = {
             "input": input_node,
             "dim": dim,
-            "keepdim": keepdim,
+            "keepdim": True,
         }
         mean_node = node.graph.call_function(mean, kwargs=mean_kwargs)
         mean_node.meta["type"] = torch.Tensor
@@ -1400,7 +1400,7 @@ def std_mapper(node, mod):
         }
         pow_node = node.graph.call_function(pow, kwargs=pow_kwargs)
         pow_node.meta["type"] = torch.Tensor
-        # sum(pow(X-mean(X))))/N
+        # mean(pow(X-mean(X)))
         post_mean_kwargs = {
             "input": pow_node,
             "dim": dim,
@@ -1408,7 +1408,7 @@ def std_mapper(node, mod):
         }
         post_mean_node = node.graph.call_function(mean, kwargs=post_mean_kwargs)
         post_mean_node.meta["type"] = torch.Tensor
-        # sqrt(sum(pow(X-mean(X))))/N)
+        # sqrt( mean(pow(X-mean(X))) )
         sqrt_kwargs = {
             "input": post_mean_node,
         }
diff --git a/fx2ait/fx2ait/test/converters/test_ait_reduce.py b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
index b79a2c9e7..009b7cfa4 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_reduce.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_reduce.py
@@ -92,3 +92,24 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         model = TestModule().cuda()
         inputs = [torch.randn(2, 3, 5).half().cuda() + 1] * 2
         self.run_test(model, inputs, expected_ops={acc_ops.mean})
+
+    @parameterized.expand(
+        [
+            ["keepdim_false", (1,), False],
+            ["keepdim_true", (1,), True],
+            ["keepdim_false", (0,), False],
+            ["keepdim_true", (2,), True],
+        ]
+    )
+    # std is a combo of basic binary and mean ops
+    def test_std(self, name, dim, keepdim) -> None:
+        class TestModule(torch.nn.Module):
+            def forward(self, input: torch.Tensor) -> torch.Tensor:
+                return torch.std(input, dim=dim, keepdim=keepdim)
+
+        model = TestModule().cuda()
+        self.run_test(
+            model,
+            [torch.randn(2, 3, 4).half().cuda()],
+            expected_ops={},
+        )

From 7b973174bf5c4ec70da4095c987636682d5c49a2 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 19 Apr 2023 22:11:50 -0700
Subject: [PATCH 428/638] Fix race condition in FBCUDA (#591)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/591

A race condition in FBCUDA could be triggered when it was launched in a parallel.
See https://www.internalfb.com/phabricator/paste/view/P697768916

Update:

After further investigation, the source of the race condition appears to be between multiple processes and target_def.py CUDA.__exit__(...) code deleting things, and FBCUDA.__init__(...) code copying and writing files into directories that the other process is busy deleting. It was not obvious that CUDA.__exit__ would delete paths that FBCUDA.__init__() created.

After short discussions, we concluded that there is no way to clean this up with minimal risk code changes - a larger rewrite of target_def.py would be neccessary, also to prevent resource leaks, but is out of scope for a hotfix.

So the safest way to go, and what is now implemented here is the following:

 * Revert target_def.py to the pre-build-cache state where these issues were avoided by creating new randomly named temp directories every time.
 * In order to not get new  build cache keys everytime which would make caching impossible, the build cache makefile normalization was adapted instead.

The impact on build times appears neglible ( approx. 0.08 seconds )

Unit tests were added and modified accordingly as well.

Reviewed By: wushirong, aakhundov

Differential Revision: D45093471

fbshipit-source-id: 9473d96ac33178dcddc98cf988fa4560c7e85f96
---
 python/aitemplate/backend/build_cache_base.py |  86 ++++++++---
 python/aitemplate/backend/cuda/target_def.py  | 135 +++---------------
 tests/unittest/backend/test_build_cache.py    |  13 --
 3 files changed, 83 insertions(+), 151 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index ccb914c4b..b6de8f6a9 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -18,6 +18,7 @@
 import os
 import random
 import secrets
+import shlex
 import shutil
 import tempfile
 
@@ -26,6 +27,8 @@
 from pathlib import Path
 from typing import Callable, List, Optional, Tuple
 
+from aitemplate.backend.target import Target
+
 from aitemplate.utils import environ as aitemplate_env
 
 from aitemplate.utils.io import file_age, touch
@@ -182,24 +185,12 @@ def is_bin_file(filename: str) -> bool:
     return filename.lower().endswith(".bin")
 
 
-def makefile_normalizer(makefile_content_orig: bytes) -> bytes:
-    """Normalize the content of the makefile for hashing purposes (nothing else!),
-    so that it can be compared to other Makefiles
-    generated by different users on different systems"""
-    makefile_content = makefile_content_orig.decode("utf-8")
-    tmpdir = tempfile.gettempdir()
-    userid = str(os.getuid())
-    user_tmpdir = os.path.join(tmpdir, userid)
-    makefile_content = makefile_content.replace(user_tmpdir, "/tmp/$USER")
-    makefile_content = makefile_content.replace(tmpdir, "/tmp")
-    return makefile_content.encode("utf-8")
-
-
 def create_dir_hash(
     cmds: List[str],
     build_dir: str,
     filter_func: Callable[[str], bool] = is_source,
     debug=False,
+    content_replacer: Callable[[str], Optional[bytes]] = None,
 ) -> str:
     """Create a hash of the (source file) contents of a build directory, used for
     creating a cache key of an entire directory along with the build commands.
@@ -209,7 +200,8 @@ def create_dir_hash(
         build_dir (str): Path to build directory ( not part of hash )
         filter_func (Callable[[str], bool], optional): Filter function which determines whether a given file is considered a source file or not. Defaults to is_source(path).
         debug (bool, optional): Whether to write a 'cache_key.log' file into the build directory, so that cache misses can be debugged more easily. Defaults to False.
-
+        content_replacer (Callable[[Path], Optional[bytes]], optional): Content replacer is an optional function that may replace content of a file for hashing purposes. If None, or if this function returns None,
+                                                                        then no content replacement is done   on the file.
     Returns:
         str: SHA256 Hash of the build directory contents in the form of a hexdigest string.
     """
@@ -237,10 +229,11 @@ def create_dir_hash(
                 continue
             hash_object.update(str(fpath).encode("utf-8"))
             fullpath = str(basepath / fpath)
-            if fpath.name.lower().startswith("makefile"):
-                makefile_content = (basepath / fpath).read_bytes()
-                makefile_content = makefile_normalizer(makefile_content)
-                hash_object.update(makefile_content)
+            replaced_content = None
+            if content_replacer is not None:
+                replaced_content = content_replacer(fullpath)
+            if replaced_content is not None:
+                hash_object.update(replaced_content)
             else:
                 with open(fullpath, "rb") as f:
                     # read file in chunks of 32kb
@@ -348,6 +341,55 @@ def cleanup(self, retention_hours: int = 72):
         """
         pass
 
+    def makefile_normalizer(self, path, memoize_replacements=True) -> Optional[bytes]:
+        """
+        Normalizes the content of the makefile for hashing purposes (nothing else!),
+        so that it can be compared to other Makefiles
+        generated by different users on different systems.
+        """
+        p = Path(path)
+        if not p.name.lower().startswith("makefile"):
+            return None
+        makefile_content_orig = p.read_bytes()
+        target: Target = None
+        try:
+            target = Target.current()
+        except RuntimeError:
+            # No current target, returning Makefile content unchanged
+            return makefile_content_orig
+        if target is None:
+            return makefile_content_orig
+        if not hasattr(target, "_compile_options"):  #
+            return makefile_content_orig
+        if not hasattr(self, "_include_path_hash_cache"):
+            self._include_path_hash_cache = {}
+        makefile_content = makefile_content_orig.decode("utf-8")
+        compile_options = list(shlex.split(target._compile_options))
+        tmpdir = tempfile.gettempdir()
+        replacements = {}
+        for i in range(len(compile_options)):
+            if compile_options[i] == "-I":
+                if i < len(compile_options) - 1:
+                    inc_path = compile_options[i + 1]
+
+            elif compile_options[i].startswith("-I"):
+                inc_path = compile_options[i][2:]
+            else:
+                continue
+            # We are creating hashes of all include directories in a temp dir
+            if inc_path.startswith(tmpdir):
+                if memoize_replacements and inc_path in self._include_path_hash_cache:
+                    inc_path_hash = self._include_path_hash_cache[inc_path]
+                else:
+                    inc_path_hash = create_dir_hash([], inc_path, is_source)
+                    if memoize_replacements:
+                        self._include_path_hash_cache[inc_path] = inc_path_hash
+                replacements[inc_path] = inc_path_hash
+
+        for search, replace in replacements.items():
+            makefile_content = makefile_content.replace(search, replace)
+        return makefile_content.encode("utf-8")
+
 
 class NoBuildCache(BuildCache):
     def __init__(self):
@@ -415,7 +457,13 @@ def retrieve_build_cache(
         self.maybe_cleanup(self.lru_retention_hours, self.cleanup_max_age_seconds)
         cache_dir = self.cache_dir
         dir_hash = create_dir_hash(
-            cmds, build_dir, filter_func=from_sources_filter_func, debug=self.debug
+            cmds,
+            build_dir,
+            filter_func=from_sources_filter_func,
+            debug=self.debug,
+            content_replacer=lambda path: self.makefile_normalizer(
+                path, memoize_replacements=True
+            ),
         )
         key_cache_dir = os.path.join(cache_dir, dir_hash)
         if os.path.exists(key_cache_dir):
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index d1eeeeeb2..47f77199f 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -15,14 +15,11 @@
 """
 CUDA target specialization
 """
-import errno
-import hashlib
 import json
 import logging
 import os
 import pipes
 import re
-import secrets
 import shutil
 import sys
 import tempfile
@@ -42,17 +39,13 @@
 )
 
 from aitemplate.utils import environ
-from aitemplate.utils.io import copytree_with_hash
 from aitemplate.utils.misc import is_debug, is_linux
 
-
 # pylint: disable=C0415,W0707,W0611,W0702,W1401
 
 
 _LOGGER = logging.getLogger(__name__)
 
-_NUM_DIR_CREATE_ATTEMPTS = 20
-
 
 class CUDA(Target):
     """CUDA target."""
@@ -193,14 +186,9 @@ def comp_func(name):
 class FBCUDA(CUDA):
     """FBCUDA target. Used in Meta internal env only."""
 
-    # @TODO: instead of using multiple class properties
-    # which can go out of sync, we should refactor this
-    # to use a proper singleton instance that can be returned by detect_target
-
     nvcc_option_json = None
     cutlass_path_ = None
     compile_options_ = None
-    include_path_ = None
 
     def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         from libfb.py import parutil
@@ -211,109 +199,24 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         cub_src_path = parutil.get_dir_path("aitemplate/AITemplate/fb/3rdparty/cub")
         static_files_path = parutil.get_dir_path("aitemplate/AITemplate/static")
         self._include_path = None
-        try:
-            self.tmp_path = os.path.join(
-                tempfile.gettempdir(), f"{os.getuid()}_aitemplate_tmp"
-            )
-        except OSError:
-            _LOGGER.warning(
-                "FBCUDA Target: Failed to create user-specific temp directory path."
-            )
-            self.tmp_path = self.tmp_path = os.path.join(
-                tempfile.gettempdir(), f"{secrets.token_hex(16)}_aitemplate_tmp"
-            )
-        if FBCUDA.cutlass_path_ is None:
-            FBCUDA.compile_options_ = None  # If we rebuild the cutlass path
-            # we also need to rebuild the compile options
-            # Copy all of the includes over into an include directory
-
-            os.makedirs(self.tmp_path, exist_ok=True)
-            # find an unused random temporary directory within our base tmp_path
-            random_key = secrets.token_hex(16)
-            errcount = 0
-            while True:
-                try:
-                    os.makedirs(os.path.join(self.tmp_path, random_key), exist_ok=False)
-                    break
-                except OSError as error:
-                    errcount += 1
-                    if errcount > _NUM_DIR_CREATE_ATTEMPTS:
-                        raise OSError(
-                            f"Failed to create user-specific temp directory path below {self.tmp_path}. Giving up."
-                        ) from error
-                    if error.errno != errno.EEXIST:
-                        raise
-                    else:
-                        random_key = secrets.token_hex(16)
-
-            # the random_key part of this path will later be renamed to the content hash
-            _tmp_include_path = os.path.join(
-                self.tmp_path,
-                random_key,
-                "includes",
-            )
-            includes_content_hash = hashlib.sha256()
-            _tmp_cutlass_path_ = os.path.join(_tmp_include_path, "cutlass")
-            _tmp_cub_path_ = os.path.join(_tmp_include_path, "cub")
-            # copy recursively, and update a content hash in one go
-            copytree_with_hash(
-                cutlass_src_path, _tmp_cutlass_path_, hash=includes_content_hash
-            )
-            copytree_with_hash(cub_src_path, _tmp_cub_path_, hash=includes_content_hash)
+        if not FBCUDA.cutlass_path_:
+            self._include_path = tempfile.mkdtemp()
+
+            FBCUDA.cutlass_path_ = self._include_path + "/cutlass"
+            self.cub_path_ = self._include_path + "/cub"
+            shutil.copytree(cutlass_src_path, FBCUDA.cutlass_path_)
+            shutil.copytree(cub_src_path, self.cub_path_)
+
             attention_src_path = parutil.get_dir_path(
                 "aitemplate/AITemplate/python/aitemplate/backend/cuda/attention/src"
             )
-            attention_include_path = os.path.join(_tmp_include_path, "att_include")
-            copytree_with_hash(
-                attention_src_path, attention_include_path, hash=includes_content_hash
-            )
-            ait_static_include_path = os.path.join(_tmp_include_path, "static")
-            copytree_with_hash(
-                static_files_path + "/include/kernels",
-                ait_static_include_path,
-                hash=includes_content_hash,
-            )
-            # Now we have a content hash over all include contents
-            include_hash_digest = includes_content_hash.hexdigest()
-            # Prepare to rename atomically
-            old_path = os.path.join(self.tmp_path, random_key)
-            new_path = os.path.join(
-                self.tmp_path,
-                include_hash_digest,
+            attention_include_path = self._include_path + "/att_include"
+            shutil.copytree(attention_src_path, attention_include_path)
+            ait_static_include_path = self._include_path + "/static"
+            shutil.copytree(
+                static_files_path + "/include/kernels", ait_static_include_path
             )
-            # if it already exists, we don't want to overwrite it
-            # we can just delete our copy.
-            try:
-                if os.path.exists(new_path):
-                    # new version should replace old version. But this replacement
-                    # should happen ideally atomically. renames are much faster than
-                    # a recursive delete.
-                    os.rename(new_path, old_path + ".bak")
-                os.rename(old_path, new_path)
-            except OSError as e:
-                # target directory with identical contents already exists
-                _LOGGER.error(
-                    f"FBCUDA: Rename of old {old_path} to {new_path} failed.",
-                    exc_info=e,
-                )
-            try:
-                if os.path.exists(old_path):
-                    shutil.rmtree(old_path)
-            except OSError:
-                pass
-            try:
-                if os.path.exists(old_path + ".bak"):
-                    shutil.rmtree(old_path + ".bak")
-            except OSError:
-                pass
-            # set the include paths to the final variant
-            self._include_path = os.path.join(new_path, "includes")
-            self.cub_path_ = os.path.join(self._include_path, "cub")
-            FBCUDA.include_path_ = self._include_path
-            FBCUDA.cutlass_path_ = os.path.join(self._include_path, "cutlass")
-
         self.cutlass_path_ = FBCUDA.cutlass_path_
-        self._include_path = FBCUDA.include_path_
 
         cutlass_lib_path = parutil.get_dir_path(
             "aitemplate/AITemplate/python/aitemplate/utils/mk_cutlass_lib"
@@ -322,7 +225,7 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
 
         if not FBCUDA.nvcc_option_json:
             convert_nvcc_json = parutil.get_file_path(
-                os.path.join("aitemplate", "testing", "convert_nvcc_cmd")
+                os.path.join("aitemplate/testing", "convert_nvcc_cmd")
             )
             _LOGGER.info(f"Load the nvcc compile option from {convert_nvcc_json}")
             with open(convert_nvcc_json, "r") as nvcc_option_json:
@@ -333,10 +236,7 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         super().__init__(self.cutlass_path_, static_files_path, arch, **kwargs)
 
     def _build_compile_options(self):
-        if FBCUDA.compile_options_ is None:
-            assert self._template_path is not None
-            assert self._include_path is not None
-            assert self.cutlass_path_ is not None
+        if not FBCUDA.compile_options_:
             cutlass_path = [
                 os.path.join(self._template_path, "include"),
                 os.path.join(self._template_path, "tools/util/include"),
@@ -354,12 +254,10 @@ def _build_compile_options(self):
             with open(fb_include_path, "w") as fb_include:
                 for arg in pp_args:
                     fb_include.write(pipes.quote(arg) + "\n")
-
             nvcc_arch = self._arch
             if nvcc_arch == "90":
                 # required by CUTLASS SM90 TMA kernels
                 nvcc_arch = "90a"
-
             options = (
                 self.nvcc_options_json["args"]
                 + ["-I" + path for path in cutlass_path]
@@ -378,7 +276,7 @@ def _build_compile_options(self):
                     "-w",
                     "--expt-relaxed-constexpr",
                     "--use_fast_math",
-                    f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
+                    f"-gencode=arch=compute_{self._arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
                     "-Xcompiler=-Wconversion",
                     environ.get_compiler_opt_level(),
                     "-std=c++17",
@@ -388,7 +286,6 @@ def _build_compile_options(self):
                 options.append("-DNDEBUG")
             FBCUDA.compile_options_ = " ".join(options)
         compile_options = FBCUDA.compile_options_
-        assert compile_options is not None
         _LOGGER.info(f"The compile options are: {compile_options}")
         return compile_options
 
diff --git a/tests/unittest/backend/test_build_cache.py b/tests/unittest/backend/test_build_cache.py
index dce49c9b0..921be28ba 100644
--- a/tests/unittest/backend/test_build_cache.py
+++ b/tests/unittest/backend/test_build_cache.py
@@ -26,7 +26,6 @@
     create_dir_hash,
     FileBasedBuildCache,
     is_source,
-    makefile_normalizer,
     SkipBuildCache,
 )
 
@@ -277,18 +276,6 @@ def test_deterministic_codegen(self, dtype="float32"):
                 hash8 != hash1
             ), "Directory hash was not sensitive to a change of Makefile (standalone codegen) and possibly source code, the hashes should be different. Hint: Debug this with the help of the debug option of function create_dir_hash"
 
-    def test_makefile_rewrite(self):
-        tmpdir = os.path.join(tempfile.gettempdir(), f"{os.getuid()}_aitemplate_tmp")
-        makefile = f"""
-                TMPDIR: {tmpdir}
-        """
-        assert tmpdir in makefile
-        rewritten_makefile = makefile_normalizer(makefile.encode("utf-8")).decode(
-            "utf-8"
-        )
-        assert tmpdir not in rewritten_makefile
-        assert "$USER" in rewritten_makefile
-
     def test_repeated_build_dir_usage(self):
         with tempfile.TemporaryDirectory() as tempdir:
             Y = self._create_model_graph()

From be47ae561795403fce5e36caafc6bb5a4c604e20 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 20 Apr 2023 03:35:32 -0700
Subject: [PATCH 429/638] Upgrade CUTLASS to 3.1 (#584)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/584

ATT

Reviewed By: muchulee8, chenyang78, alexanderguzhva, tenpercent

Differential Revision: D45028066

fbshipit-source-id: 049ffee0f4e8f5f7e6fbd0b3517e2db7e9520c9f
---
 .gitmodules                                   |    2 +-
 3rdparty/cutlass                              |    2 +-
 .../cuda/attention/mem_eff_attention.py       |    6 +-
 .../utils/mk_cutlass_lib/extra_enum.py        |    6 +-
 .../attention_scaling_coefs_updater.h         |  514 ++++
 .../kernels/mem_eff_attention/debug_utils.h   |  160 ++
 .../mem_eff_attention/default_fmha_grouped.h  |  283 +++
 .../mem_eff_attention/epilogue_pipelined.h    |  632 +++++
 .../epilogue_rescale_output.h                 |  263 ++
 .../epilogue_thread_apply_logsumexp.h         |  175 ++
 .../mem_eff_attention/find_default_mma.h      |  190 ++
 .../kernels/mem_eff_attention/fmha_grouped.h  |  859 +++++++
 .../fmha_grouped_problem_visitor.h            |  186 ++
 .../mem_eff_attention/gemm/custom_mma.h       |  125 +
 .../mem_eff_attention/gemm/custom_mma_base.h  |  183 ++
 .../gemm/custom_mma_multistage.h              |  767 ++++++
 .../gemm/custom_mma_pipelined.h               |  401 ++++
 .../mem_eff_attention/gemm_kernel_utils.h     |  296 +++
 .../epilogue_predicated_tile_iterator.h       |  752 ++++++
 .../iterators/make_residual_last.h            |   98 +
 ...cated_tile_access_iterator_residual_last.h | 2115 ++++++++++++++++
 .../predicated_tile_iterator_residual_last.h  | 2120 +++++++++++++++++
 .../mem_eff_attention/kernel_forward.h        |    2 -
 .../kernels/mem_eff_attention/mma_from_smem.h | 1780 ++++++++++++++
 24 files changed, 11907 insertions(+), 10 deletions(-)
 create mode 100644 static/include/kernels/mem_eff_attention/attention_scaling_coefs_updater.h
 create mode 100644 static/include/kernels/mem_eff_attention/debug_utils.h
 create mode 100644 static/include/kernels/mem_eff_attention/default_fmha_grouped.h
 create mode 100644 static/include/kernels/mem_eff_attention/epilogue_pipelined.h
 create mode 100644 static/include/kernels/mem_eff_attention/epilogue_rescale_output.h
 create mode 100644 static/include/kernels/mem_eff_attention/epilogue_thread_apply_logsumexp.h
 create mode 100644 static/include/kernels/mem_eff_attention/find_default_mma.h
 create mode 100644 static/include/kernels/mem_eff_attention/fmha_grouped.h
 create mode 100644 static/include/kernels/mem_eff_attention/fmha_grouped_problem_visitor.h
 create mode 100644 static/include/kernels/mem_eff_attention/gemm/custom_mma.h
 create mode 100644 static/include/kernels/mem_eff_attention/gemm/custom_mma_base.h
 create mode 100644 static/include/kernels/mem_eff_attention/gemm/custom_mma_multistage.h
 create mode 100644 static/include/kernels/mem_eff_attention/gemm/custom_mma_pipelined.h
 create mode 100644 static/include/kernels/mem_eff_attention/gemm_kernel_utils.h
 create mode 100644 static/include/kernels/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
 create mode 100644 static/include/kernels/mem_eff_attention/iterators/make_residual_last.h
 create mode 100644 static/include/kernels/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h
 create mode 100644 static/include/kernels/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h
 create mode 100644 static/include/kernels/mem_eff_attention/mma_from_smem.h

diff --git a/.gitmodules b/.gitmodules
index e439953e9..1272127de 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
-	url = https://github.com/AITemplate/cutlass.git
+	url = https://github.com/facebookincubator/cutlass-fork.git
 [submodule "3rdparty/cub"]
 	path = 3rdparty/cub
 	url = https://github.com/NVIDIA/cub.git
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 5d7be1ac1..77f07619c 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 5d7be1ac1b0dae1e9b8ccbe98d494ccaa437ddc0
+Subproject commit 77f07619c0b4899aa1ce076300258eb3a27ffad6
diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 6a69f817c..379010662 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -43,8 +43,8 @@
 #include <cuda_fp16.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "gemm_kernel_utils.h"
 
+#include "mem_eff_attention/gemm_kernel_utils.h"
 #include "mem_eff_attention/kernel_forward.h"
 
 
@@ -215,12 +215,12 @@
 #include "cutlass/util/reference/host/tensor_norm.h"
 
 #include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "gemm_kernel_utils.h"
 #include "cutlass/gemm/device/gemm_grouped.h"
 
 #include "cutlass/fast_math.h"
 
-#include "default_fmha_grouped.h"
+#include "mem_eff_attention/gemm_kernel_utils.h"
+#include "mem_eff_attention/default_fmha_grouped.h"
 
 using namespace gemm_kernel_utils;
 
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index de6211628..c07e70959 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -131,9 +131,9 @@ class EpiloguePermuteLayout(enum.Enum):
   NoPermute = enum_auto()
 
 EpiloguePermuteLayoutTag = {
-  EpiloguePermuteLayout.Permute5D_20314: 'cutlass::layout::Tensor5DPermute20314',
-  EpiloguePermuteLayout.Permute4D_0213: 'cutlass::layout::Tensor4DPermute0213',
-  EpiloguePermuteLayout.Permute4DBMM_0213: 'cutlass::layout::Tensor4DPermuteBMM0213',
+  EpiloguePermuteLayout.Permute5D_20314: 'cutlass::layout::Tensor5DPermute20314RowMajor',
+  EpiloguePermuteLayout.Permute4D_0213: 'cutlass::layout::Tensor4DPermute0213RowMajor',
+  EpiloguePermuteLayout.Permute4DBMM_0213: 'cutlass::layout::Tensor4DPermuteBMM0213RowMajor',
   EpiloguePermuteLayout.NoPermute: 'cutlass::layout::NoPermute',
   # EpiloguePermuteLayout.Permute3DBMM_021: 'cutlass::layout::Tensor3DPermute021BMM',
 }
diff --git a/static/include/kernels/mem_eff_attention/attention_scaling_coefs_updater.h b/static/include/kernels/mem_eff_attention/attention_scaling_coefs_updater.h
new file mode 100644
index 000000000..457fbc49e
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/attention_scaling_coefs_updater.h
@@ -0,0 +1,514 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/functional.h"
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/matrix_shape.h"
+#include "gemm_kernel_utils.h"
+
+namespace {
+
+static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) {
+  // source: https://stackoverflow.com/a/51549250
+  return (value >= 0)
+      ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
+      : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+}
+} // namespace
+
+/* Iterates on the accumulator and corresponding position on result matrix
+
+(1) Update `mi[r]` to the max value of the row `r`
+(2) In a second iteration do the following:
+    (a) accum   <- exp(accum - mi)
+    (b) m_prime <- exp(m_prime - mi)
+    (c) s_prime <- s_prime * m_prime + sum(accum)
+
+All of this is done on registers, before we store all of this
+on shared memory for the next matmul with Value.
+
+We have multiple implementations, because each configuration has a different way
+of iterating in the accumulators.
+*/
+
+template <typename BASE, typename T, typename accum_t, int kWarpSize>
+struct RegisterOps {
+  template <
+      int kQueriesPerBlock,
+      bool kFullColumns,
+      bool kIsFirst,
+      bool kKeepOutputInRF>
+  CUTLASS_DEVICE static void update(
+      typename T::Fragment& frag_o, // output so far
+      typename T::Fragment& frag,
+      cutlass::Array<accum_t, kQueriesPerBlock>& mi,
+      cutlass::Array<accum_t, kQueriesPerBlock>& m_prime,
+      cutlass::Array<accum_t, kQueriesPerBlock>& s_prime,
+      int8_t lane_id,
+      int8_t thread_id,
+      int8_t warp_id,
+      int16_t max_col,
+      typename T::TensorCoord const& tile_offset,
+      float scaling) {
+    // Convert to `accum_t` (rather than double)
+    constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
+    if (!kIsFirst) {
+      if (thread_id < kQueriesPerBlock) {
+        m_prime[thread_id] = mi[thread_id];
+      }
+      __syncthreads();
+    }
+
+    auto lane_offset = BASE::get_lane_offset(lane_id, warp_id, tile_offset);
+
+    // First update `mi` to the max per-row
+    {
+      accum_t max;
+      BASE::iterateRows(
+          lane_offset,
+          [&](int accum_m) {
+            max = -cutlass::platform::numeric_limits<accum_t>::infinity();
+          },
+          [&](int accum_m, int accum_n, int idx) {
+            if (kFullColumns || accum_n < max_col) {
+              max = cutlass::fast_max(max, frag[idx]);
+            }
+          },
+          [&](int accum_m) {
+            // Having 4x atomicMax seems faster than reduce within warp
+            // first...
+            atomicMaxFloat(&mi[accum_m], max * scaling);
+          });
+    }
+    frag = cutlass::multiplies<typename T::Fragment>()(scaling * kLog2e, frag);
+
+    // Make sure we all share the update values for `mi`
+    __syncthreads();
+
+    if (thread_id < kQueriesPerBlock) {
+      auto m_prime_exp = exp2f(kLog2e * (m_prime[thread_id] - mi[thread_id]));
+      m_prime[thread_id] = m_prime_exp;
+      s_prime[thread_id] *= m_prime_exp;
+    }
+    __syncthreads(); // Update output fragments
+    if (kKeepOutputInRF && !kIsFirst) {
+      accum_t mp;
+      BASE::iterateRows(
+          lane_offset,
+          [&](int accum_m) { mp = m_prime[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) { frag_o[idx] *= mp; },
+          [&](int accum_m) {});
+      __syncthreads();
+    }
+    // Update accum_m, accum_n, ...
+    {
+      accum_t mi_row, total_row;
+      BASE::iterateRows(
+          lane_offset,
+          [&](int accum_m) { mi_row = kLog2e * mi[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) {
+            frag[idx] = (kFullColumns || accum_n < max_col)
+                ? exp2f(frag[idx] - mi_row)
+                : accum_t(0.0);
+          },
+          [&](int accum_m) {});
+      BASE::iterateRows(
+          lane_offset,
+          [&](int accum_m) { total_row = 0.0; },
+          [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; },
+          [&](int accum_m) {
+            if (BASE::reduceSameRow(
+                    lane_id, total_row, [](accum_t a, accum_t b) {
+                      return a + b;
+                    })) {
+              atomicAdd(&s_prime[accum_m], total_row);
+            }
+          });
+    }
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterSm80
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterSm80<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    return cutlass::MatrixCoord(
+        quad + tile_offset.row() * Shape::kRow,
+        lane_in_quad * kElementsPerAccess +
+            tile_offset.column() * Shape::kColumn);
+  }
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < kAccumulatorRows; ++row) {
+        int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+            row * kRowsPerTile + lane_offset.row();
+        beginRow(accum_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+              (mma_n * Policy::MmaIterations::kRow + mma_m);
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn +
+                col + lane_offset.column();
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+            op(accum_m, accum_n, idx);
+          }
+        }
+
+        endRow(accum_m);
+      }
+    }
+  }
+
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    // In each warp, 4 threads will work on the same row
+    // - the ones with the same `quad`
+    auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1);
+    myValue = fn(myValue, otherV);
+    otherV = __shfl_xor_sync(0xffffffff, myValue, 2);
+    myValue = fn(myValue, otherV);
+    int lane_in_quad = (lane_id & 3);
+    return lane_in_quad == 0;
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterVolta
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterVolta<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  using Element = accum_t;
+
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n;
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    return cutlass::MatrixCoord(
+        accum_m + tile_offset.row() * Shape::kRow,
+        accum_n + tile_offset.column() * Shape::kColumn);
+  }
+
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    static_assert(
+        cutlass::platform::is_same<Element, float>::value,
+        "update to support non-float accum");
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16
+    // T0 & T2 share same line within a quad
+    auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1);
+    myValue = fn(myValue, otherV);
+    // quad 0 and quad 2 are on the same lines
+    otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3);
+    myValue = fn(myValue, otherV);
+    return (lane_id & ((1 << 1) | (1 << 3))) == 0;
+  }
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+          int accum_m = tile_m * Policy::InterleavedTile::kRow +
+              mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row();
+          beginRow(accum_m);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn;
+               ++tile_n) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn;
+                 ++mma_n) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int p = 0; p < kAccumulatorPatials; ++p) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int mma_accum_start =
+                      (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                            Policy::MmaIterations::kColumn +
+                        mma_n) *
+                           Policy::MmaIterations::kRow +
+                       mma_m) *
+                      kElementsPerMma;
+                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                      mma_n * QuadShapePerPatialMma::kColumn +
+                      p * Policy::InterleavedTile::kColumn / 2 + n +
+                      lane_offset.column();
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  op(accum_m, accum_n, idx);
+                }
+              }
+            }
+          }
+          endRow(accum_m);
+        }
+      }
+    }
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AttentionScalingCoefsUpdaterSimt
+    : RegisterOps<
+          AttentionScalingCoefsUpdaterSimt<T, accum_t, kWarpSize>,
+          T,
+          accum_t,
+          kWarpSize> {
+  using Policy = typename T::Policy;
+  using Iterations = typename T::Iterations;
+  using Element = typename T::Element;
+  using Delta = typename T::Delta;
+  using Shape = typename T::Shape;
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) {
+      auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit);
+      myValue = fn(myValue, otherV);
+    }
+    return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0;
+  }
+
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+        int accum_m = mma_m * Delta::kRow + m + lane_offset.row();
+        beginRow(accum_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+          int accum_n =
+              mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN +
+              lane_offset.column();
+          CUTLASS_PRAGMA_UNROLL
+          for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+            int idx = n +
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            op(accum_m, accum_n + n, idx);
+          }
+        }
+        endRow(accum_m);
+      }
+    }
+  }
+
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    static_assert(
+        cutlass::platform::is_same<
+            typename Policy::LaneLayout,
+            cutlass::layout::RowMajorInterleaved<1>>::value,
+        "");
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        cutlass::MatrixCoord(Policy::LaneMmaShape::kM,
+                             Policy::LaneMmaShape::kN);
+    return lane_offset +
+        tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn);
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater;
+
+// Simt
+template <typename S, typename P, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        S,
+        cutlass::gemm::Operand::kC,
+        accum_t,
+        cutlass::layout::RowMajor,
+        P,
+        1,
+        1>,
+    accum_t,
+    kWarpSize> {
+  using Iterator = typename cutlass::gemm::warp::MmaSimtTileIterator<
+      S,
+      cutlass::gemm::Operand::kC,
+      accum_t,
+      cutlass::layout::RowMajor,
+      P,
+      1,
+      1>;
+  using Updater =
+      AttentionScalingCoefsUpdaterSimt<Iterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Volta
+template <typename S1, typename S2, typename accum_t, int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        cutlass::MatrixShape<1, 1>>,
+    accum_t,
+    kWarpSize> {
+  using Iterator =
+      typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          cutlass::MatrixShape<1, 1>>;
+  using Updater =
+      AttentionScalingCoefsUpdaterVolta<Iterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Sm75+
+template <
+    typename S1,
+    typename S2,
+    typename S3,
+    typename accum_t,
+    int kWarpSize>
+struct DefaultAttentionScalingCoefsUpdater<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        S3>,
+    accum_t,
+    kWarpSize> {
+  using Iterator =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          S3>;
+  using Updater =
+      AttentionScalingCoefsUpdaterSm80<Iterator, accum_t, kWarpSize>;
+};
diff --git a/static/include/kernels/mem_eff_attention/debug_utils.h b/static/include/kernels/mem_eff_attention/debug_utils.h
new file mode 100644
index 000000000..ccdff22de
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/debug_utils.h
@@ -0,0 +1,160 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+#include <float.h>
+#include <stdio.h>
+#include <cmath>
+
+////////////////////////////////////////////////////////////////////////////////
+// Debugging functions
+////////////////////////////////////////////////////////////////////////////////
+// Nans & inf detection
+#define NANCHECK(frag)                         \
+  {                                            \
+    for (int _i = 0; _i < frag.size(); ++_i) { \
+      assert(std::isfinite(float(frag[_i])));  \
+      assert(!std::isnan(float(frag[_i])));    \
+    }                                          \
+  }
+
+// Print on the first thread of the first block
+#if 0
+#define PRINT_WARP_ID 0
+#define PRINT_LANE_ID 0
+#define PRINT_T0_L0(msg, ...)                                         \
+  if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&        \
+      threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \
+      threadIdx.z == 0) {                                             \
+    printf(msg "\n", __VA_ARGS__);                                    \
+  }
+struct __string_view {
+  char const* data;
+  std::size_t size;
+};
+template <class T>
+constexpr __string_view __get_type_name() {
+  char const* p = __PRETTY_FUNCTION__;
+  while (*p++ != '=')
+    ;
+  for (; *p == ' '; ++p)
+    ;
+  char const* p2 = p;
+  int count = 1;
+  for (;; ++p2) {
+    switch (*p2) {
+      case '[':
+        ++count;
+        break;
+      case ']':
+        --count;
+        if (!count)
+          return {p, std::size_t(p2 - p)};
+    }
+  }
+  return {};
+}
+#else
+#define PRINT_T0_L0
+#endif
+
+// Print a given array
+#define PRINT_ACCUM8_T0_L0_START(name, accum, start)  \
+  PRINT_T0_L0(                                        \
+      "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \
+      name,                                           \
+      int(start),                                     \
+      int(start + 8),                                 \
+      float(accum[start + 0]),                        \
+      float(accum[start + 1]),                        \
+      float(accum[start + 2]),                        \
+      float(accum[start + 3]),                        \
+      float(accum[start + 4]),                        \
+      float(accum[start + 5]),                        \
+      float(accum[start + 6]),                        \
+      float(accum[start + 7]));
+#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0)
+#define PRINT_FRAG_T0_L0(name, frag)                          \
+  {                                                           \
+    auto typeStr = __get_type_name<decltype(frag)>();         \
+    PRINT_T0_L0("printing %s (%s)", name, typeStr.data);      \
+    for (int _start = 0; _start < frag.size(); _start += 8) { \
+      PRINT_ACCUM8_T0_L0_START("  ", frag, _start);           \
+    }                                                         \
+    /*__syncthreads();                                        \
+    NANCHECK(frag); */                                        \
+  }
+#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr)   \
+  {                                                         \
+    PRINT_T0_L0("printing %s (len=%d)", name, int(length)); \
+    for (int _start = 0; _start < length; _start += incr) { \
+      PRINT_ACCUM8_T0_L0_START("  ", array, _start);        \
+    }                                                       \
+  }
+#define PRINT_ARRAY_T0_L0(name, array, length) \
+  PRINT_ARRAY_T0_L0_INCR(name, array, length, 8)
+
+// Print a 4x4 matrix
+#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y)                                           \
+  PRINT_T0_L0(                                                                                             \
+      "%s[%d:%d, %d:%d]:\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f", \
+      name,                                                                                                \
+      int(start_x),                                                                                        \
+      int(start_x + 4),                                                                                    \
+      int(start_y),                                                                                        \
+      int(start_y + 4),                                                                                    \
+      float(ref.at({start_x + 0, start_y + 0})),                                                           \
+      float(ref.at({start_x + 0, start_y + 1})),                                                           \
+      float(ref.at({start_x + 0, start_y + 2})),                                                           \
+      float(ref.at({start_x + 0, start_y + 3})),                                                           \
+      float(ref.at({start_x + 1, start_y + 0})),                                                           \
+      float(ref.at({start_x + 1, start_y + 1})),                                                           \
+      float(ref.at({start_x + 1, start_y + 2})),                                                           \
+      float(ref.at({start_x + 1, start_y + 3})),                                                           \
+      float(ref.at({start_x + 2, start_y + 0})),                                                           \
+      float(ref.at({start_x + 2, start_y + 1})),                                                           \
+      float(ref.at({start_x + 2, start_y + 2})),                                                           \
+      float(ref.at({start_x + 2, start_y + 3})),                                                           \
+      float(ref.at({start_x + 3, start_y + 0})),                                                           \
+      float(ref.at({start_x + 3, start_y + 1})),                                                           \
+      float(ref.at({start_x + 3, start_y + 2})),                                                           \
+      float(ref.at({start_x + 3, start_y + 3})));
+#define PRINT_TENSOR4x4_T0_L0(name, ref) \
+  PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0)
+
+#define PRINT_PROBLEM_SIZE(name, ps)            \
+  PRINT_T0_L0(                                  \
+      "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \
+      name,                                     \
+      int(ps.m()),                              \
+      int(ps.n()),                              \
+      int(ps.k()))
diff --git a/static/include/kernels/mem_eff_attention/default_fmha_grouped.h b/static/include/kernels/mem_eff_attention/default_fmha_grouped.h
new file mode 100644
index 000000000..f33f8a98b
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/default_fmha_grouped.h
@@ -0,0 +1,283 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix
+   multiply-add with the appropriate threadblock-scoped epilogue.
+
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major
+   outputs are accommodated by exchanging A and B operands and assuming
+   transposed layouts. Partial specializations here choose
+   'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "attention_scaling_coefs_updater.h"
+#include "find_default_mma.h"
+#include "fmha_grouped.h"
+#include "gemm_kernel_utils.h"
+#include "mma_from_smem.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    // The datatype of Q/K/V
+    typename scalar_t_,
+    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
+    typename ArchTag_,
+    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
+    bool isAligned_,
+    int kQueriesPerBlock,
+    int kKeysPerBlock,
+    bool kSingleValueIteration,
+    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly>
+struct DefaultFMHAGrouped {
+  using scalar_t = scalar_t_;
+  using accum_t = float;
+  using output_t = scalar_t;
+
+  // Accumulator between 2 iterations
+  // Using `accum_t` improves perf on f16 at the cost of
+  // numerical errors
+  using output_accum_t = accum_t;
+
+  using ArchTag = ArchTag_;
+  static bool const kIsAligned = isAligned_;
+  static int const kWarpSize = 32;
+  static int const kNumWarpsPerBlock =
+      kQueriesPerBlock * kKeysPerBlock / (kWarpSize * kWarpSize);
+
+  struct MM0 {
+    /*
+      In this first matmul, we compute a block of `Q @ K.T`.
+      While the calculation result is still hot in registers, we update
+      `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value
+      into a shared-memory ("AccumulatorSharedStorage") that is used later as
+      operand A for the second matmul (see MM1)
+    */
+
+    using GemmType = gemm_kernel_utils::DefaultGemmType<ArchTag, scalar_t>;
+    using OpClass = typename GemmType::OpClass;
+
+    using ElementA = scalar_t;
+    using ElementB = scalar_t;
+    using ElementC = scalar_t;
+    using ElementAccumulator = accum_t;
+
+    using LayoutA = cutlass::layout::RowMajor;
+    using LayoutB = cutlass::layout::ColumnMajor;
+    using LayoutC = cutlass::layout::RowMajor;
+
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            ElementA,
+            ElementB,
+            ElementC,
+            ElementAccumulator>;
+
+    static int const kAlignmentA =
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment;
+    static int const kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    static int const kStages = DefaultConfig::kStages;
+    using Operator = typename GemmType::Operator;
+
+    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
+        ElementA,
+        LayoutA,
+        kAlignmentA,
+        ElementB,
+        LayoutB,
+        kAlignmentB,
+        ElementAccumulator,
+        LayoutC,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        InstructionShape,
+        kStages,
+        Operator>::DefaultMma;
+
+    using MmaCore = typename DefaultMma::MmaCore;
+    using IteratorA = typename DefaultMma::IteratorA;
+    using IteratorB = typename DefaultMma::IteratorB;
+    using Mma = typename DefaultMma::ThreadblockMma;
+    using ScalingCoefsUpdater = typename DefaultAttentionScalingCoefsUpdater<
+        typename Mma::Operator::IteratorC,
+        ElementAccumulator,
+        kWarpSize>::Updater;
+
+    static_assert(MmaCore::WarpCount::kCount == kNumWarpsPerBlock, "");
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MM1 {
+    /*
+      Second matmul: perform `attn @ V` where `attn` is the attention (not
+      normalized) and stored in shared memory
+    */
+
+    using GemmType = typename MM0::GemmType;
+    using OpClass = typename GemmType::OpClass;
+
+    using ElementA = scalar_t;
+    using ElementB = scalar_t;
+    using ElementC = output_accum_t;
+    using ElementAccumulator = accum_t;
+
+    using LayoutA = cutlass::layout::RowMajor;
+    using LayoutB = cutlass::layout::RowMajor;
+    using LayoutC = cutlass::layout::RowMajor;
+
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            ElementA,
+            ElementB,
+            ElementC,
+            ElementAccumulator>;
+
+    static int const kAlignmentA = DefaultConfig::kAlignmentA;
+    static int const kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+
+    using ThreadblockShape = typename MM0::ThreadblockShape;
+    using WarpShape = typename MM0::WarpShape;
+    using InstructionShape = typename MM0::InstructionShape;
+
+    using EpilogueOutputOp = typename DefaultConfig::EpilogueOutputOp;
+
+    static int const kStages = DefaultConfig::kStages;
+    using Operator = typename GemmType::Operator;
+
+    using ThreadblockSwizzle = void; // Swizzling is unused
+    static bool const kSplitKSerial = false;
+
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        ElementA,
+        LayoutA,
+        kAlignmentA,
+        ElementB,
+        LayoutB,
+        kAlignmentB,
+        ElementC,
+        LayoutC,
+        ElementAccumulator,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        InstructionShape,
+        EpilogueOutputOp,
+        ThreadblockSwizzle,
+        kStages,
+        kSplitKSerial,
+        Operator>;
+
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            typename MM0::AccumulatorSharedStorage>;
+
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+    static_assert(WarpCount::kCount == kNumWarpsPerBlock, "");
+
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_t>;
+    using OutputTileIteratorAccum =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_accum_t>;
+
+    struct SharedStorageMM1 {
+      typename Mma::SharedStorage mm;
+    };
+  };
+
+  /// Define the kernel in terms of the default kernel
+  using FMHAKernel = kernel::FMHAGrouped<
+      MM0,
+      MM1,
+      scalar_t,
+      accum_t,
+      output_t,
+      output_accum_t,
+      kSingleValueIteration,
+      GroupScheduleMode_>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/epilogue_pipelined.h b/static/include/kernels/mem_eff_attention/epilogue_pipelined.h
new file mode 100644
index 000000000..2a574e71f
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/epilogue_pipelined.h
@@ -0,0 +1,632 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  File copied from "cutlass/epilogue/threadblock/epilogue.h"
+  then modified to:
+  (1) load 2 source fragments at the same time (pipelining)
+  (2) support reading from a different dtype
+  (3) pass the row id to the OutputOp if it takes it
+    (see MemoryEfficientAttentionNormalize)
+  Note that in general the fragment passed to the OutputOp could
+  span multiple rows but it does not happen with the configurations we have
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template <typename Op>
+struct ApplyEpilogueOp {
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum,
+      typename Op::FragmentOutput const& source) {
+    return output_op(accum, source);
+  }
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum) {
+    return output_op(accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+    typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
+    typename WarpMmaOperator_, ///< Warp-level MMA operator (concept:
+                               ///< gemm::warp::MmaTensorOp)
+    int PartitionsK, ///< Number of partitions of the K dimension
+    typename OutputTileIterator_, ///< Tile iterator writing output tensors
+    typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting
+                                           ///< accumulators
+    typename WarpTileIterator_, ///< Warp-scoped tile iterator writing
+                                ///< accumulators to SMEM
+    typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading
+                                  ///< from SMEM
+    typename OutputOp_, ///< Output operator
+    typename Padding_, ///< Padding added to SMEM allocation to avoid bank
+                       ///< conflicts (concept: MatrixShape)
+    int FragmentsPerPartition =
+        1, ///< Used to coarsten the epilogue granularity
+    int IterationsUnroll = ///< Used to reduce binary size when epilogue op is
+                           ///< large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
+    typename OutputTileSourceIterator_ =
+        OutputTileIterator_ ///< Tile iterator reading tensors
+    >
+class EpiloguePipelined : public EpilogueBase<
+                              Shape_,
+                              typename WarpMmaOperator_::Shape,
+                              PartitionsK,
+                              AccumulatorFragmentIterator_,
+                              WarpTileIterator_,
+                              Padding_,
+                              FragmentsPerPartition> {
+ public:
+  using Base = EpilogueBase<
+      Shape_,
+      typename WarpMmaOperator_::Shape,
+      PartitionsK,
+      AccumulatorFragmentIterator_,
+      WarpTileIterator_,
+      Padding_,
+      FragmentsPerPartition>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using OutputTileSourceIterator = OutputTileSourceIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+  using ElementSource = typename OutputTileSourceIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef =
+      typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+      typename OutputTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+  using SourceAccessType = Array<
+      typename OutputTileSourceIterator::Element,
+      OutputTileSourceIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+      typename WarpTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1
+      ? Base::kFragmentsPerIteration
+      : kPartitionsK;
+  static int constexpr kSmemPointerOffset =
+      Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+ public:
+  static_assert(
+      OutputTileSourceIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between input tile and output tile iterator (kElements)");
+  static_assert(
+      OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations,
+      "Mismatch between input tile and output tile iterator (kIterations)");
+  static_assert(
+      SharedLoadIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(
+      OutputTileIterator::kElementsPerAccess,
+      "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(
+      !(OutputTileIterator::Fragment::kElements %
+        OutputTileIterator::kElementsPerAccess),
+      "Divisibility");
+
+ private:
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+ public:
+  /// Constructor
+  CUTLASS_DEVICE
+  EpiloguePipelined(
+      typename Base::SharedStorage& shared_storage, ///< Shared storage object
+      int thread_idx, ///< ID of a thread within the threadblock
+      int warp_idx, ///< ID of warp within threadblock
+      int lane_idx ///< Id of thread within warp
+      )
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        shared_load_iterator_(shared_storage.reference(), thread_idx) {}
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator) { ///< Threadblock tile coordinate in GEMM (in units
+                             ///< of threadblock tiles)
+
+    if (!output_op.is_source_needed()) {
+      compute_source_not_needed_(output_op, destination_iterator, accumulators);
+    } else {
+      compute_source_needed_(
+          output_op, destination_iterator, accumulators, source_iterator);
+    }
+  }
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators) { ///< Complete warp-level accumulator tile
+    compute_source_not_needed_(output_op, destination_iterator, accumulators);
+  }
+
+ private:
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+
+        warp_tile_iterator.store(accum_fragment);
+        if (p < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(
+               iterator_begin, warp_tile_iterator),
+           0)...};
+
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  static_assert(
+      kPartitionsK == 1 || Base::kFragmentsPerIteration == 1,
+      "One of these must be exactly 1.");
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators ///< Complete warp-level accumulator tile
+  ) {
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+#pragma unroll(                                                          \
+    IterationsUnroll                                                     \
+        ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \
+        : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations;
+         iter += Base::kFragmentsPerIteration) {
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      acc2smem_source_not_needed<cutlass::make_index_sequence<
+          OutputTileIterator::kIterations / Base::kFragmentsPerIteration>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename SharedLoadIterator::Fragment
+            aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        } else if (kPartitionsK > 1) {
+          plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(
+                aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset(
+              (1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Compute the output result
+        //
+
+        typename OutputTileIterator::Fragment output_fragment;
+
+        apply_output_operator_source_not_needed_(
+            destination_iterator.thread_start_row(),
+            output_fragment,
+            output_op,
+            aligned_accum_fragment[0]);
+
+        //
+        // Store the final result
+        //
+
+        destination_iterator.store(output_fragment);
+        ++destination_iterator;
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      int dummy[] = {
+          (pos == Seq) &&
+          (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator ///< Threadblock tile coordinate in GEMM (in units of
+                          ///< threadblock tiles)
+  ) {
+    typename OutputTileSourceIterator::Fragment source_fragment[2];
+
+    source_fragment[0].clear();
+    source_iterator.load(source_fragment[0]);
+    ++source_iterator;
+    source_fragment[1].clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+      if (iter > 0) {
+        __syncthreads();
+      }
+      //
+      // Load the source for next iteration (pipelining)
+      //
+
+      if (iter + 1 < OutputTileIterator::kIterations) {
+        source_iterator.load(source_fragment[(iter + 1) % 2]);
+      }
+      ++source_iterator;
+      acc2smem_source_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment
+          aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the
+      // k-slices
+      if (kPartitionsK > 1) {
+        plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(
+              aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_pointer_offset(
+            (1 - kPartitionsK) * kSmemPointerOffset);
+      }
+
+      //
+      // Compute the output result
+      //
+
+      typename OutputTileIterator::Fragment output_fragment;
+
+      apply_output_operator_(
+          destination_iterator.thread_start_row(),
+          output_fragment,
+          output_op,
+          aligned_accum_fragment[0],
+          source_fragment[iter % 2]);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment,
+      typename OutputTileSourceIterator::Fragment const& source_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    SourceAccessType const* source_frag_ptr =
+        reinterpret_cast<SourceAccessType const*>(&source_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Call the output operator
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i],
+          source_frag_ptr[i]);
+    }
+  }
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Call the output operator
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i]);
+    }
+  }
+
+  // This should be constexpr, but it's only supported on c++14
+  static int CUTLASS_HOST_DEVICE getRowOffset(int i) {
+    using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx = ThreadMap::kElementsPerAccess *
+                (frag_row_idx * ThreadMap::Iterations::kColumn + column);
+            if (i < frag_idx + ThreadMap::kElementsPerAccess) {
+              return row_offset;
+            }
+          }
+        }
+      }
+    }
+    return -1;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/epilogue_rescale_output.h b/static/include/kernels/mem_eff_attention/epilogue_rescale_output.h
new file mode 100644
index 000000000..fce9c431b
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/epilogue_rescale_output.h
@@ -0,0 +1,263 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
+
+  This is a copy of cutlass/epilogue/threadblock/epilogue.h that can
+  handle "row_id" as a first argument, as uses it to get the corresponding
+  `m_prime` / `s_prime` to rescale the output.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "epilogue_pipelined.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+// output <- alpha * accumulator + beta * source
+//   with:
+//     alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise)
+//     beta = alpha / m_prime (renormalize the output when the max changes)
+//     source is the current output
+template <
+    typename ElementOutput_, ///< Data type used to store tensors
+    typename ElementSource_, //< Data type for source (usually matches
+                             //`ElementOutput`)
+    int Count, ///< Number of elements computed per operation.
+               ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+               ///< but we use 64 or 32 sometimes when there are not enough data
+               ///< to store
+    typename ElementAccumulator_, ///< Accumulator data type
+    typename ElementCompute_, ///< Data type used to compute linear combination
+    bool isFirst,
+    bool isLast,
+    typename FragmentAlphaBeta_,
+    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
+class MemoryEfficientAttentionNormalize {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentAlphaBeta = FragmentAlphaBeta_;
+
+  static FloatRoundStyle const kRound = Round;
+
+ private:
+  //
+  // Data members
+  //
+
+  FragmentAlphaBeta const& s_prime_;
+  FragmentAlphaBeta const& m_prime_;
+
+ public:
+  /// Constructs the function object, possibly loading from pointers in host
+  /// memory
+  CUTLASS_HOST_DEVICE
+  MemoryEfficientAttentionNormalize(
+      FragmentAlphaBeta const& s_prime,
+      FragmentAlphaBeta const& m_prime)
+      : s_prime_(s_prime), m_prime_(m_prime) {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return !isFirst;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      int row,
+      FragmentAccumulator const& accumulator,
+      FragmentSource const& source) const {
+    assert(!isFirst);
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round>
+        source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_source = source_converter(source);
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_source;
+    multiply_add<ComputeFragment> mul_add_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+    ElementCompute beta = alpha * m_prime_[row];
+
+    intermediate = mul_add_source(beta, converted_source); // X =  beta * C
+
+    intermediate = mul_add_accumulator(
+        alpha, converted_accumulator, intermediate); // D = alpha * Accum + X
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(int row, FragmentAccumulator const& accumulator)
+      const {
+    assert(isFirst);
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    ComputeFragment intermediate;
+    multiplies<ComputeFragment> mul_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+
+    intermediate = mul_accumulator(
+        alpha, converted_accumulator); // X =  alpha * C + uniform
+
+    return destination_converter(intermediate);
+  }
+};
+
+} // namespace thread
+
+namespace threadblock {
+template <
+    typename EO,
+    typename ES,
+    int Count,
+    typename EA,
+    typename EC,
+    bool F,
+    bool L,
+    typename FAB,
+    FloatRoundStyle R>
+struct ApplyEpilogueOp<thread::MemoryEfficientAttentionNormalize<
+    EO,
+    ES,
+    Count,
+    EA,
+    EC,
+    F,
+    L,
+    FAB,
+    R>> {
+  using Op = thread::
+      MemoryEfficientAttentionNormalize<EO, ES, Count, EA, EC, F, L, FAB, R>;
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum,
+      typename Op::FragmentSource const& source) {
+    return output_op(row_id, accum, source);
+  }
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum) {
+    return output_op(row_id, accum);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/epilogue_thread_apply_logsumexp.h b/static/include/kernels/mem_eff_attention/epilogue_thread_apply_logsumexp.h
new file mode 100644
index 000000000..2e286d3f4
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/epilogue_thread_apply_logsumexp.h
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include <cuda_fp16.h>
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename Element, int ElementsPerAccess>
+struct ArrayExponential {
+  CUTLASS_HOST_DEVICE
+  Array<Element, ElementsPerAccess> operator()(
+      Array<Element, ElementsPerAccess> const& input) const {
+    Array<Element, ElementsPerAccess> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      result[i] = expf(input[i]);
+    }
+
+    return result;
+  }
+};
+
+template <int ElementsPerAccess>
+struct ArrayExponential<half_t, ElementsPerAccess> {
+  CUTLASS_DEVICE
+  Array<half_t, ElementsPerAccess> operator()(
+      Array<half_t, ElementsPerAccess> const& input) const {
+    Array<half_t, ElementsPerAccess> result;
+
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    __half2 const* input_ptr =
+        reinterpret_cast<__half2 const*>(input.raw_data());
+    __half2* res_ptr = reinterpret_cast<__half2*>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = h2exp(input_ptr[i]);
+    }
+
+    return result;
+  }
+};
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies:
+/// output <- (input - lse).exp()
+template <
+    typename ElementOutput_, // output
+    typename ElementLSE_, // accumulator from LSE
+    typename ElementAccumulator_, // accumulator from matmul
+    typename ElementCompute_, // intermediate compute (and exp calculation)
+    int ElementsPerAccess>
+class ApplyLogSumExp {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementLSE = ElementLSE_;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+  static const ScaleType::Kind kScale =
+      cutlass::epilogue::thread::ScaleType::NoBetaScaling;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentLSE = Array<ElementLSE, kElementsPerAccess>;
+  using FragmentScaleBias = FragmentLSE; // Used by epilogue_smem_accumulator.h
+
+ public:
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ApplyLogSumExp() {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& AB,
+      FragmentLSE const& scale_unused,
+      // bias used as LSE
+      FragmentLSE const& bias) const {
+    FragmentCompute frag_AB = NumericArrayConverter<
+        ElementCompute,
+        ElementAccumulator,
+        kElementsPerAccess>()(AB);
+    FragmentCompute frag_lse_compute =
+        NumericArrayConverter<ElementCompute, ElementLSE, kElementsPerAccess>()(
+            bias);
+    FragmentCompute frag_compute;
+
+    minus<FragmentCompute> minus_lse;
+    detail::ArrayExponential<ElementCompute, kElementsPerAccess> apply_exp;
+    frag_compute = minus_lse(frag_AB, frag_lse_compute);
+    frag_compute = apply_exp(frag_compute);
+
+    return NumericArrayConverter<
+        ElementOutput,
+        ElementCompute,
+        kElementsPerAccess>()(frag_compute);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/find_default_mma.h b/static/include/kernels/mem_eff_attention/find_default_mma.h
new file mode 100644
index 000000000..7f9c99732
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/find_default_mma.h
@@ -0,0 +1,190 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Cutlass provides helper template functions to figure out the right
+   datastructures to instanciate to run a GEMM with various parameters (see
+   `cutlass/gemm/threadblock/default_mma.h`). However, due to template
+   instantiation priority rules, it will only create an MmaMultiStage with
+   kStages=3 (otherwise creates an MmePipelined - which is not compatible with
+   FastF32). kStages=3 uses too much shared memory and we want to use kStages=2,
+   so we just copy-pasted some code from `default_mma.h` and
+   `default_mma_core.h` files and wrapped this template to allow our usecase.
+
+    This is really only for the FastF32 case - aka using TensorCores with fp32.
+*/
+
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator,
+    typename Enable_ = void>
+struct FindDefaultMma {
+  static constexpr bool AccumulatorsInRowMajor = false;
+  static constexpr SharedMemoryClearOption SharedMemoryClear =
+      SharedMemoryClearOption::kNone;
+  using DefaultMma = cutlass::gemm::threadblock::DefaultMma<
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      Stages,
+      Operator,
+      AccumulatorsInRowMajor,
+      SharedMemoryClear>;
+};
+
+/// Specialization for sm80 / FastF32 / multistage with kStages=2
+template <
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    int kStages,
+    typename Operator>
+struct FindDefaultMma<
+    ElementA_,
+    LayoutA_,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm80,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    kStages,
+    Operator,
+    typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> {
+  using LayoutC = layout::RowMajor;
+  using OperatorClass = arch::OpClassTensorOp;
+  using ArchTag = arch::Sm80;
+
+  using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma<
+      ElementA_,
+      LayoutA_,
+      kAlignmentA,
+      ElementB_,
+      LayoutB_,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      3,
+      Operator>;
+  struct DefaultMma : DefaultMma_ {
+    using MmaCore_ = typename DefaultMma_::MmaCore;
+    // Define the threadblock-scoped multistage matrix multiply
+    using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+        typename MmaCore_::Shape,
+        typename DefaultMma_::IteratorA,
+        typename MmaCore_::SmemIteratorA,
+        MmaCore_::kCacheOpA,
+        typename DefaultMma_::IteratorB,
+        typename MmaCore_::SmemIteratorB,
+        MmaCore_::kCacheOpB,
+        ElementAccumulator,
+        LayoutC,
+        typename MmaCore_::MmaPolicy,
+        kStages>;
+  };
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/mem_eff_attention/fmha_grouped.h b/static/include/kernels/mem_eff_attention/fmha_grouped.h
new file mode 100644
index 000000000..d48258569
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/fmha_grouped.h
@@ -0,0 +1,859 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Grouped FMHA kernel
+*/
+
+#pragma once
+
+#include "cutlass/complex.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/trace.h"
+
+#include "epilogue_rescale_output.h"
+#include "fmha_grouped_problem_visitor.h"
+#include "gemm_kernel_utils.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename MM0_, ///! Structure for computing P = Q @ K
+    typename MM1_, ///! Structure for computing O = P @ V
+    typename scalar_t_,
+    typename accum_t_,
+    typename output_t_,
+    typename output_accum_t_,
+    bool kKeepOutputInRF, ///! Whether the intermediate output from MM0_ should
+                          /// be kept in the register file
+    GroupScheduleMode GroupScheduleMode_ ///! Type of scheduling to perform
+    >
+struct FMHAGrouped {
+ public:
+  using MM0 = MM0_;
+  using MM1 = MM1_;
+
+  using scalar_t = scalar_t_;
+  using accum_t = accum_t_;
+  using output_t = output_t_;
+  using output_accum_t = output_accum_t_;
+
+  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
+
+  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
+      !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  // Parameters to satisfy BaseGrouped
+  using ElementA = scalar_t;
+  using ElementB = scalar_t;
+  using ElementC = accum_t;
+  using LayoutA = typename MM0::LayoutA;
+  using LayoutB = typename MM0::ElementB;
+  using LayoutC = typename MM1::ElementC;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+  static int const kAlignmentA = MM0::kAlignmentA;
+  static int const kAlignmentB = MM0::kAlignmentB;
+  static int const kAlignmentC = 1;
+  using Mma = typename MM1::Mma;
+  using EpilogueOutputOp = typename MM1::EpilogueOutputOp;
+  using ThreadblockSwizzle = void;
+  using Operator = typename MM1::Operator;
+  using WarpShape = typename MM1::WarpShape;
+  using InstructionShape = typename MM1::InstructionShape;
+
+  using ElementQ = scalar_t;
+  using ElementK = scalar_t;
+  using ElementP = accum_t;
+  using ElementV = scalar_t;
+  using ElementO = output_t;
+  using ElementOAccum = output_accum_t;
+  using ElementAccumulator = accum_t;
+
+  using LayoutQ = typename MM0::LayoutA;
+  using LayoutK = typename MM0::LayoutB;
+  using LayoutP = typename MM0::LayoutC;
+  using LayoutV = typename MM1::LayoutB;
+  using LayoutO = typename MM1::LayoutC;
+
+  static bool const kPreloadV =
+      (MM1::Mma::ArchTag::kMinComputeCapability >= 80 &&
+       cutlass::sizeof_bits<ElementV>::value == 16);
+
+  static int const kAlignmentQ = MM0::kAlignmentA;
+  static int const kAlignmentK = MM0::kAlignmentB;
+  static int const kAlignmentV = 1;
+
+  using ThreadblockShape = typename MM0::ThreadblockShape;
+
+  static int const kQueriesPerBlock = ThreadblockShape::kM;
+  static int const kKeysPerBlock = ThreadblockShape::kN;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename MM1::WarpCount;
+  static int const kThreadsPerWarp = 32;
+  static int const kThreadCount = kThreadsPerWarp * WarpCount::kCount;
+
+  using ProblemVisitor = FMHAGroupedProblemVisitor<
+      ThreadblockShape,
+      kGroupScheduleMode,
+      kThreadCount,
+      kThreadCount>;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+    //
+    // Data members
+    //
+
+    GemmCoord* problem_sizes0;
+    GemmCoord* problem_sizes1;
+
+    int problem_count;
+    int threadblock_count;
+
+    ElementQ** ptr_Q;
+    ElementK** ptr_K;
+    ElementP** ptr_P;
+    ElementV** ptr_V;
+    ElementO** ptr_O;
+    ElementOAccum** ptr_O_accum;
+
+    typename LayoutQ::Stride::LongIndex* ldq;
+    typename LayoutK::Stride::LongIndex* ldk;
+    typename LayoutP::Stride::LongIndex* ldv;
+    typename LayoutO::Stride::LongIndex* ldo;
+
+    // Whether causal masking is to be performed
+    bool causal;
+
+    // Only used by device-level operator
+    GemmCoord* host_problem_sizes;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments()
+        : problem_count(0),
+          threadblock_count(0),
+          ptr_Q(nullptr),
+          ptr_K(nullptr),
+          ptr_P(nullptr),
+          ptr_V(nullptr),
+          ptr_O(nullptr),
+          ptr_O_accum(nullptr),
+          ldq(nullptr),
+          ldk(nullptr),
+          ldv(nullptr),
+          ldo(nullptr),
+          causal(false),
+          host_problem_sizes(nullptr) {}
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    Arguments(
+        GemmCoord* problem_sizes0,
+        GemmCoord* problem_sizes1,
+        int problem_count,
+        int threadblock_count,
+        ElementQ** ptr_Q,
+        ElementK** ptr_K,
+        ElementP** ptr_P,
+        ElementV** ptr_V,
+        ElementO** ptr_O,
+        ElementOAccum** ptr_O_accum,
+        typename LayoutQ::Stride::LongIndex* ldq,
+        typename LayoutK::Stride::LongIndex* ldk,
+        typename LayoutP::Stride::LongIndex* ldp,
+        typename LayoutV::Stride::LongIndex* ldv,
+        typename LayoutO::Stride::LongIndex* ldo,
+        bool causal,
+        GemmCoord* host_problem_sizes = nullptr)
+        : problem_sizes0(problem_sizes0),
+          problem_sizes1(problem_sizes1),
+          problem_count(problem_count),
+          threadblock_count(threadblock_count),
+          ptr_Q(ptr_Q),
+          ptr_K(ptr_K),
+          ptr_P(ptr_P),
+          ptr_V(ptr_V),
+          ptr_O(ptr_O),
+          ptr_O_accum(
+              kNeedsOutputAccumulatorBuffer ? ptr_O_accum : (accum_t**)ptr_O),
+          ldq(ldq),
+          ldk(ldk),
+          ldv(ldv),
+          ldo(ldo),
+          causal(causal),
+          host_problem_sizes(host_problem_sizes) {}
+
+    bool __host__ check_supported() {
+      CHECK_ALIGNED_PTR(ptr_Q, kAlignmentQ);
+      CHECK_ALIGNED_PTR(ptr_K, kAlignmentK);
+      CHECK_ALIGNED_PTR(ptr_V, kAlignmentV);
+      XFORMERS_CHECK(ldq % kAlignmentQ == 0, "query is not correctly aligned");
+      XFORMERS_CHECK(ldk % kAlignmentK == 0, "key is not correctly aligned");
+      XFORMERS_CHECK(ldv % kAlignmentV == 0, "value is not correctly aligned");
+      return true;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+    typename ProblemVisitor::Params problem_visitor;
+    int threadblock_count;
+
+    ElementQ** ptr_Q;
+    ElementK** ptr_K;
+    ElementP** ptr_P;
+    ElementV** ptr_V;
+    ElementO** ptr_O;
+    ElementOAccum** ptr_O_accum;
+
+    typename LayoutQ::Stride::LongIndex* ldq;
+    typename LayoutK::Stride::LongIndex* ldk;
+    typename LayoutP::Stride::LongIndex* ldv;
+    typename LayoutO::Stride::LongIndex* ldo;
+
+    bool causal;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params()
+        : ptr_Q(nullptr),
+          ptr_K(nullptr),
+          ptr_P(nullptr),
+          ptr_V(nullptr),
+          ptr_O(nullptr),
+          ptr_O_accum(nullptr),
+          ldq(nullptr),
+          ldk(nullptr),
+          ldv(nullptr),
+          ldo(nullptr),
+          causal(false) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0)
+        : problem_visitor(
+              args.problem_sizes0,
+              args.problem_sizes1,
+              args.problem_count,
+              workspace,
+              tile_count),
+          threadblock_count(args.threadblock_count),
+          ptr_Q(args.ptr_Q),
+          ptr_K(args.ptr_K),
+          ptr_P(args.ptr_P),
+          ptr_V(args.ptr_V),
+          ptr_O(args.ptr_O),
+          ptr_O_accum(
+              kNeedsOutputAccumulatorBuffer ? args.ptr_O_accum
+                                            : (accum_t**)args.ptr_O),
+          ldq(args.ldq),
+          ldk(args.ldk),
+          ldv(args.ldv),
+          ldo(args.ldo),
+          causal(args.causal) {}
+
+    CUTLASS_HOST_DEVICE
+    void update(
+        Arguments const& args,
+        void* workspace = nullptr,
+        int tile_count = 0) {
+      problem_visitor = typename ProblemVisitor::Params(
+          args.problem_sizes0,
+          args.problem_sizes1,
+          args.problem_count,
+          workspace,
+          tile_count);
+      threadblock_count = args.threadblock_count;
+      ptr_Q = args.ptr_Q;
+      ptr_K = args.ptr_K;
+      ptr_P = args.ptr_P;
+      ptr_V = args.ptr_V;
+      ptr_O = args.ptr_O;
+      ptr_O_accum = kNeedsOutputAccumulatorBuffer ? args.ptr_O_accum
+                                                  : (accum_t**)args.ptr_O;
+      ldq = args.ldq;
+      ldk = args.ldk;
+      ldv = args.ldv;
+      ldo = args.ldo;
+      causal = args.causal;
+    }
+  };
+
+  // Shared storage - depends on kernel params
+  struct ScalingCoefs {
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> m_prime;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> s_prime;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> mi;
+  };
+
+  struct SharedStorageEpilogueAtEnd : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::SharedStorageMM1 mm1;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return epilogue;
+    }
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+  struct SharedStorageEpilogueInLoop : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::SharedStorageMM1 mm1;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+    };
+
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return after_mm0.epilogue;
+    }
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+  using SharedStorage = typename cutlass::platform::conditional<
+      kKeepOutputInRF,
+      SharedStorageEpilogueAtEnd,
+      SharedStorageEpilogueInLoop>::type;
+
+ private:
+  // Parameters to be used by an individual tile
+  struct TileParams {
+    CUTLASS_HOST_DEVICE
+    static int query_start(int threadblock_idx) {
+      return threadblock_idx * kQueriesPerBlock;
+    }
+
+    // Returns whether this threadblock computes within the number of queries,
+    // which is determined by the M dimension of problem 0
+    CUTLASS_HOST_DEVICE
+    static bool can_compute(
+        int threadblock_idx,
+        const GemmCoord& problem_size0) {
+      return query_start(threadblock_idx) < problem_size0.m();
+    }
+
+    CUTLASS_HOST_DEVICE
+    static int num_queries(
+        int threadblock_idx,
+        const GemmCoord& problem_size0) {
+      return problem_size0.m() - query_start(threadblock_idx);
+    }
+
+    CUTLASS_HOST_DEVICE
+    static int num_keys(
+        int threadblock_idx,
+        const GemmCoord& problem_size0,
+        bool causal) {
+      int nk = problem_size0.n();
+      if (causal) {
+        nk = cutlass::fast_min(
+            int32_t(query_start(threadblock_idx) + kQueriesPerBlock), nk);
+      }
+      return nk;
+    }
+  };
+
+ public:
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  FMHAGrouped() {}
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) {
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const& args) {
+    return Status::kSuccess;
+  }
+
+  static CUTLASS_DEVICE int16_t thread_id() {
+    return threadIdx.x;
+  }
+
+  static CUTLASS_DEVICE int8_t warp_id() {
+    return threadIdx.x / kThreadsPerWarp;
+  }
+
+  static CUTLASS_DEVICE int8_t lane_id() {
+    return threadIdx.x % kThreadsPerWarp;
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const& params, SharedStorage& shared_storage) {
+    auto& m_prime = shared_storage.m_prime;
+    auto& s_prime = shared_storage.s_prime;
+    [[maybe_unused]] auto& si = shared_storage.after_mm0.si;
+    auto& mi = shared_storage.mi;
+
+    ProblemVisitor problem_visitor(
+        params.problem_visitor, shared_storage.problem_visitor, blockIdx.x);
+
+    // Outer 'persistent' loop to iterate over tiles
+    while (problem_visitor.next_tile()) {
+      GemmCoord problem_size0 = problem_visitor.problem_size0();
+      GemmCoord problem_size1 = problem_visitor.problem_size1();
+      const int32_t threadblock_idx =
+          int32_t(problem_visitor.threadblock_idx());
+
+      if (!TileParams::can_compute(threadblock_idx, problem_size0)) {
+        problem_visitor.advance(gridDim.x);
+        continue;
+      }
+
+      const int32_t problem_idx = problem_visitor.problem_index();
+
+      if (thread_id() < kQueriesPerBlock) {
+        s_prime[thread_id()] = ElementAccumulator(0);
+        m_prime[thread_id()] =
+            -cutlass::platform::numeric_limits<ElementAccumulator>::infinity();
+        mi[thread_id()] =
+            -cutlass::platform::numeric_limits<ElementAccumulator>::infinity();
+      }
+
+      ElementO* ptr_O = params.ptr_O[problem_idx] +
+          TileParams::query_start(threadblock_idx) * params.ldo[problem_idx];
+      ElementOAccum* ptr_O_accum = params.ptr_O_accum[problem_idx] +
+          TileParams::query_start(threadblock_idx) * params.ldo[problem_idx];
+      const int num_queries =
+          TileParams::num_queries(threadblock_idx, problem_size0);
+
+      auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator {
+        using OutputTileIterator = typename MM1::OutputTileIterator;
+        return OutputTileIterator(
+            typename OutputTileIterator::Params{
+                (int32_t)params.ldo[problem_idx]},
+            ptr_O,
+            typename OutputTileIterator::TensorCoord{
+                num_queries, problem_size1.n()},
+            thread_id(),
+            {0, col});
+      };
+
+      auto createOutputAccumIter =
+          [&](int col) -> typename MM1::OutputTileIteratorAccum {
+        using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
+        return OutputTileIteratorAccum(
+            typename OutputTileIteratorAccum::Params{
+                (int32_t)params.ldo[problem_idx]},
+            ptr_O_accum,
+            typename OutputTileIteratorAccum::TensorCoord{
+                num_queries, problem_size1.n()},
+            thread_id(),
+            {0, col});
+      };
+
+      typename MM1::Mma::FragmentC accum_o;
+      accum_o.clear();
+
+      const int num_keys =
+          TileParams::num_keys(threadblock_idx, problem_size0, params.causal);
+
+      for (int32_t iter_key_start = 0; iter_key_start < num_keys;
+           iter_key_start += kKeysPerBlock) {
+        int32_t problem_size_0_m =
+            cutlass::fast_min((int32_t)kQueriesPerBlock, num_queries);
+        int32_t problem_size_0_n = cutlass::fast_min(
+            (int32_t)kKeysPerBlock, num_keys - iter_key_start);
+        int32_t const& problem_size_0_k = problem_size0.k();
+        int32_t const& problem_size_1_n = problem_size1.n();
+        int32_t const& problem_size_1_k = problem_size_0_n;
+
+        auto prologueV = [&](int blockN) {
+          typename MM1::Mma::IteratorB iterator_V(
+              typename MM1::IteratorB::Params{
+                  MM1::LayoutB(params.ldv[problem_idx])},
+              params.ptr_V[problem_idx] +
+                  iter_key_start * params.ldv[problem_idx],
+              {problem_size_1_k, problem_size_1_n},
+              thread_id(),
+              cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+
+          MM1::Mma::prologue(
+              shared_storage.after_mm0.mm1.mm,
+              iterator_V,
+              thread_id(),
+              problem_size_1_k);
+        };
+
+        __syncthreads(); // Need to have shared memory initialized, and
+                         // `m_prime` updated from end of prev iter
+
+        //
+        // MATMUL: Q.K_t
+        //
+        // Computes the block-matrix product of:
+        // (a) query[query_start:query_end, :]
+        // with
+        // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
+        // and stores that into `shared_storage.si`
+        //
+
+        ElementQ* ptr_Q = params.ptr_Q[problem_idx] +
+            TileParams::query_start(threadblock_idx) * params.ldq[problem_idx];
+
+        // Construct iterators to A and B operands
+        typename MM0::IteratorA iterator_A(
+            typename MM0::IteratorA::Params(
+                typename MM0::MmaCore::LayoutA(params.ldq[problem_idx])),
+            ptr_Q,
+            {problem_size_0_m, problem_size_0_k},
+            thread_id(),
+            {0, 0});
+
+        typename MM0::IteratorB iterator_B(
+            typename MM0::IteratorB::Params(
+                typename MM0::MmaCore::LayoutB(params.ldk[problem_idx])),
+            params.ptr_K[problem_idx] +
+                iter_key_start * params.ldk[problem_idx],
+            {problem_size_0_k, problem_size_0_n},
+            thread_id(),
+            {0, 0});
+
+        // Construct thread-scoped matrix multiply
+        typename MM0::Mma mma(
+            shared_storage.mm0, thread_id(), warp_id(), lane_id());
+
+        typename MM0::Mma::FragmentC accum;
+
+        accum.clear();
+
+        auto gemm_k_iterations =
+            (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
+
+        // Compute threadblock-scoped matrix multiply-add
+        mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+        __syncthreads();
+
+        if (kPreloadV) {
+          prologueV(0);
+        }
+
+        typename MM0::Mma::Operator::IteratorC::TensorCoord
+            iteratorC_tile_offset = {
+                (warp_id() % MM0::Mma::WarpCount::kM),
+                (warp_id() / MM0::Mma::WarpCount::kM)};
+
+        // Mask out last if causal
+        if (params.causal && num_keys - iter_key_start <= kKeysPerBlock) {
+          auto lane_offset = MM0::ScalingCoefsUpdater::get_lane_offset(
+              lane_id(), warp_id(), iteratorC_tile_offset);
+          int32_t last_col;
+          MM0::ScalingCoefsUpdater::iterateRows(
+              lane_offset,
+              [&](int accum_m) {
+                last_col = TileParams::query_start(threadblock_idx) + accum_m -
+                    iter_key_start;
+              },
+              [&](int accum_m, int accum_n, int idx) {
+                if (accum_n > last_col) {
+                  accum[idx] =
+                      -cutlass::platform::numeric_limits<accum_t>::infinity();
+                }
+              },
+              [&](int accum_m) {});
+        }
+        DISPATCH_BOOL(
+            iter_key_start == 0, kIsFirst, ([&] {
+              DISPATCH_BOOL(
+                  num_keys - iter_key_start >= kKeysPerBlock,
+                  kFullColumns,
+                  ([&] {
+                    // Update `mi` from accum stored in registers
+                    // Also updates `accum` with accum[i] <-
+                    // exp(accum[i] * scale
+                    // - mi)
+                    MM0::ScalingCoefsUpdater::update<
+                        kQueriesPerBlock,
+                        kFullColumns,
+                        kIsFirst,
+                        kKeepOutputInRF>(
+                        accum_o,
+                        accum,
+                        mi,
+                        m_prime,
+                        s_prime,
+                        lane_id(),
+                        thread_id(),
+                        warp_id(),
+                        num_keys - iter_key_start,
+                        iteratorC_tile_offset,
+                        1.0f / cutlass::fast_sqrt(float(problem_size0.k())));
+                  }));
+            }));
+
+        // Output results to shared-memory
+        int warp_idx_mn_0 = warp_id() %
+            (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
+        auto output_tile_coords = cutlass::MatrixCoord{
+            warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
+            warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
+
+        MM0::B2bGemm::accumToSmem(
+            shared_storage.after_mm0.si, accum, lane_id(), output_tile_coords);
+
+        __syncthreads();
+
+        //
+        // MATMUL: Attn . V
+        // Run the matmul `attn @ V` for a block of attn and V.
+        // `attn` is read from shared memory (in `shared_storage_si`)
+        // `V` is read from global memory (with iterator_B)
+        //
+
+        const int64_t nBlockN = kKeepOutputInRF
+            ? 1
+            : ceil_div(
+                  (int64_t)problem_size_1_n,
+                  int64_t(MM1::ThreadblockShape::kN));
+
+        // Iterate over the N dimension of GEMM1
+        for (int blockN = 0; blockN < nBlockN; ++blockN) {
+          int gemm_k_iterations = (problem_size_1_k + MM1::Mma::Shape::kK - 1) /
+              MM1::Mma::Shape::kK;
+
+          // Compute threadblock-scoped matrix multiply-add and store it in
+          // accum (in registers)
+          if (!kPreloadV) {
+            __syncthreads(); // we share shmem between mma and epilogue
+          }
+
+          typename MM1::Mma::IteratorB iterator_V(
+              typename MM1::IteratorB::Params{
+                  MM1::LayoutB(params.ldv[problem_idx])},
+              params.ptr_V[problem_idx] +
+                  iter_key_start * params.ldv[problem_idx],
+              {problem_size_1_k, problem_size_1_n},
+              thread_id(),
+              cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+
+          typename MM1::Mma mma_pv(
+              shared_storage.after_mm0.mm1.mm,
+              shared_storage.after_mm0.si,
+              (int)thread_id(),
+              (int)warp_id(),
+              (int)lane_id(),
+              (int)problem_size_1_k);
+
+          mma_pv.set_prologue_done(kPreloadV);
+          if (!kKeepOutputInRF) {
+            accum_o.clear();
+          }
+
+          mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
+          __syncthreads();
+
+          if (kPreloadV && !kKeepOutputInRF && blockN + 1 < nBlockN) {
+            prologueV(blockN + 1);
+          }
+
+          if (!kKeepOutputInRF) {
+            DISPATCH_BOOL(
+                iter_key_start == 0, kIsFirst, ([&] {
+                  DISPATCH_BOOL(
+                      (iter_key_start + kKeysPerBlock) >= num_keys,
+                      kIsLast,
+                      ([&] {
+                        using DefaultEpilogue = typename MM1::DefaultEpilogue;
+                        using DefaultOp =
+                            typename MM1::DefaultConfig::EpilogueOutputOp;
+                        using ElementCompute =
+                            typename DefaultOp::ElementCompute;
+                        using EpilogueOutputOp = typename cutlass::epilogue::
+                            thread::MemoryEfficientAttentionNormalize<
+                                typename cutlass::platform::conditional<
+                                    kIsLast,
+                                    output_t,
+                                    output_accum_t>::type,
+                                output_accum_t,
+                                DefaultOp::kCount,
+                                typename DefaultOp::ElementAccumulator,
+                                output_accum_t,
+                                kIsFirst,
+                                kIsLast,
+                                cutlass::
+                                    Array<ElementCompute, kQueriesPerBlock>>;
+                        using Epilogue = typename cutlass::epilogue::
+                            threadblock::EpiloguePipelined<
+                                typename DefaultEpilogue::Shape,
+                                typename MM1::Mma::Operator,
+                                DefaultEpilogue::kPartitionsK,
+                                typename cutlass::platform::conditional<
+                                    kIsLast,
+                                    typename MM1::OutputTileIterator,
+                                    typename MM1::OutputTileIteratorAccum>::
+                                    type,
+                                typename DefaultEpilogue::
+                                    AccumulatorFragmentIterator,
+                                typename DefaultEpilogue::WarpTileIterator,
+                                typename DefaultEpilogue::SharedLoadIterator,
+                                EpilogueOutputOp,
+                                typename DefaultEpilogue::Padding,
+                                DefaultEpilogue::kFragmentsPerIteration,
+                                true, // IterationsUnroll
+                                typename MM1::
+                                    OutputTileIteratorAccum // Read
+                                                            // iterator
+                                >;
+
+                        int col = blockN * MM1::Mma::Shape::kN;
+                        auto source_iter = createOutputAccumIter(col);
+                        auto dest_iter = gemm_kernel_utils::call_conditional<
+                            kIsLast,
+                            decltype(createOutputIter),
+                            decltype(createOutputAccumIter)>::
+                            apply(createOutputIter, createOutputAccumIter, col);
+                        EpilogueOutputOp rescale(s_prime, m_prime);
+                        Epilogue epilogue(
+                            shared_storage.epilogue_shared_storage(),
+                            thread_id(),
+                            warp_id(),
+                            lane_id());
+                        epilogue(rescale, dest_iter, accum_o, source_iter);
+                      }));
+                }));
+            if (!kKeepOutputInRF) {
+              __syncthreads();
+            }
+          }
+        }
+        __syncthreads(); // we modify `m_prime` after
+      }
+
+      if (kKeepOutputInRF) {
+        const bool kIsFirst = true;
+        const bool kIsLast = true;
+        using DefaultEpilogue = typename MM1::DefaultEpilogue;
+        using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+        using ElementCompute = typename DefaultOp::ElementCompute;
+        using EpilogueOutputOp = typename cutlass::epilogue::thread::
+            MemoryEfficientAttentionNormalize<
+                output_t, // output
+                output_accum_t, // source
+                DefaultOp::kCount,
+                typename DefaultOp::ElementAccumulator, // accum
+                output_accum_t, // compute
+                kIsFirst,
+                kIsLast,
+                cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+        using Epilogue =
+            typename cutlass::epilogue::threadblock::EpiloguePipelined<
+                typename DefaultEpilogue::Shape,
+                typename MM1::Mma::Operator,
+                DefaultEpilogue::kPartitionsK,
+                typename MM1::OutputTileIterator, // destination
+                typename DefaultEpilogue::AccumulatorFragmentIterator,
+                typename DefaultEpilogue::WarpTileIterator,
+                typename DefaultEpilogue::SharedLoadIterator,
+                EpilogueOutputOp,
+                typename DefaultEpilogue::Padding,
+                DefaultEpilogue::kFragmentsPerIteration,
+                true, // IterationsUnroll
+                typename MM1::OutputTileIteratorAccum // source tile
+                >;
+        auto dest_iter = createOutputIter(0);
+        EpilogueOutputOp rescale(s_prime, m_prime);
+        Epilogue epilogue(
+            shared_storage.epilogue_shared_storage(),
+            thread_id(),
+            warp_id(),
+            lane_id());
+        epilogue(rescale, dest_iter, accum_o);
+      }
+
+      // Next tile
+      problem_visitor.advance(gridDim.x);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/fmha_grouped_problem_visitor.h b/static/include/kernels/mem_eff_attention/fmha_grouped_problem_visitor.h
new file mode 100644
index 000000000..70be8e589
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/fmha_grouped_problem_visitor.h
@@ -0,0 +1,186 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Scheduler for grouped FMHA
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
+#include "cutlass/matrix_coord.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+// Helper for correctly representing problem sizes in grouped kernels
+template <typename ThreadblockShape>
+struct FMHAGroupedProblemSizeHelper {
+  CUTLASS_HOST_DEVICE
+  static cutlass::gemm::GemmCoord grid_shape(
+      const cutlass::gemm::GemmCoord& problem) {
+    // FMHA only partitions tiles across the M dimension.
+    return cutlass::gemm::GemmCoord(
+        ((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM),
+        1,
+        1);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {}
+
+  CUTLASS_HOST_DEVICE
+  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
+    return grid.m() * grid.n();
+  }
+};
+
+} // namespace detail
+
+/// Visitor class to abstract away the algorithm for iterating over tiles
+template <
+    typename ThreadblockShape,
+    GroupScheduleMode GroupScheduleMode_,
+    int PrefetchTileCount,
+    int ThreadCount,
+    bool Transposed = false>
+struct FMHAGroupedProblemVisitor
+    : public GroupedProblemVisitor<
+          detail::FMHAGroupedProblemSizeHelper<ThreadblockShape>,
+          ThreadblockShape,
+          GroupScheduleMode_,
+          PrefetchTileCount,
+          ThreadCount> {
+  using ProblemSizeHelper =
+      detail::FMHAGroupedProblemSizeHelper<ThreadblockShape>;
+  using Base = GroupedProblemVisitor<
+      ProblemSizeHelper,
+      ThreadblockShape,
+      GroupScheduleMode_,
+      PrefetchTileCount,
+      ThreadCount>;
+  using BaseParams = typename Base::Params;
+  using SharedStorage = typename Base::SharedStorage;
+
+  cutlass::gemm::GemmCoord const* problem_sizes0;
+  cutlass::gemm::GemmCoord const* problem_sizes1;
+
+  struct Params {
+    cutlass::gemm::GemmCoord const* problem_sizes0;
+    cutlass::gemm::GemmCoord const* problem_sizes1;
+    int32_t problem_count;
+    void const* workspace;
+    int32_t tile_count;
+
+    //
+    // Methods
+    //
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    Params()
+        : problem_sizes0(nullptr),
+          problem_sizes1(nullptr),
+          problem_count(0),
+          workspace(nullptr),
+          tile_count(0) {}
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    Params(
+        cutlass::gemm::GemmCoord const* problem_sizes0,
+        cutlass::gemm::GemmCoord const* problem_sizes1,
+        int32_t problem_count,
+        void const* workspace = nullptr,
+        int32_t tile_count = 0)
+        : problem_sizes0(problem_sizes0),
+          problem_sizes1(problem_sizes1),
+          problem_count(problem_count),
+          workspace(workspace),
+          tile_count(tile_count) {}
+
+    /// Convert the FMHA-specific parameters to those used by the base class
+    CUTLASS_HOST_DEVICE
+    BaseParams to_base() const {
+      return BaseParams( // Set problem_sizes as problem_sizes1 because these
+                         // determine shape of the final output of FMHA
+          problem_sizes1,
+          problem_count,
+          workspace,
+          tile_count);
+    }
+  };
+
+  //
+  // Methods
+  //
+  CUTLASS_DEVICE
+  FMHAGroupedProblemVisitor(
+      Params const& params_,
+      SharedStorage& shared_storage_,
+      int32_t block_idx)
+      : Base(params_.to_base(), shared_storage_, block_idx),
+        problem_sizes0(params_.problem_sizes0),
+        problem_sizes1(params_.problem_sizes1) {}
+
+  /// Returns the problem size 0 for the current problem
+  CUTLASS_HOST_DEVICE
+  cutlass::gemm::GemmCoord problem_size0() const {
+    GemmCoord problem = problem_sizes0[this->problem_idx];
+    ProblemSizeHelper::possibly_transpose_problem(problem);
+    return problem;
+  }
+
+  /// Returns the problem size 1 for the current problem
+  CUTLASS_HOST_DEVICE
+  cutlass::gemm::GemmCoord problem_size1() const {
+    GemmCoord problem = problem_sizes1[this->problem_idx];
+    ProblemSizeHelper::possibly_transpose_problem(problem);
+    return problem;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/gemm/custom_mma.h b/static/include/kernels/mem_eff_attention/gemm/custom_mma.h
new file mode 100644
index 000000000..a85c5b2fa
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/gemm/custom_mma.h
@@ -0,0 +1,125 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "custom_mma_multistage.h"
+#include "custom_mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_multistage.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+
+template <typename Mma, int kMaxK>
+struct MakeCustomMma;
+
+template <
+    typename Shape,
+    typename IteratorA,
+    typename SmemIteratorA,
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    typename IteratorB,
+    typename SmemIteratorB,
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    typename ElementC,
+    typename LayoutC,
+    typename Policy,
+    int Stages,
+    cutlass::gemm::SharedMemoryClearOption SharedMemoryClear,
+    int kMaxK>
+struct MakeCustomMma<
+    cutlass::gemm::threadblock::MmaMultistage<
+        Shape,
+        IteratorA,
+        SmemIteratorA,
+        CacheOpA,
+        IteratorB,
+        SmemIteratorB,
+        CacheOpB,
+        ElementC,
+        LayoutC,
+        Policy,
+        Stages,
+        SharedMemoryClear>,
+    kMaxK> {
+  // Reduce the number of stages if we don't need that many
+  static int constexpr kStages =
+      kMaxK == cutlass::platform::numeric_limits<int>::max()
+      ? Stages
+      : cutlass::const_min(
+            Stages,
+            (kMaxK + int(Shape::kK) - 1) / int(Shape::kK));
+  using Mma = cutlass::gemm::threadblock::CustomMmaMultistage<
+      Shape,
+      IteratorA,
+      SmemIteratorA,
+      CacheOpA,
+      IteratorB,
+      SmemIteratorB,
+      CacheOpB,
+      ElementC,
+      LayoutC,
+      Policy,
+      kStages,
+      SharedMemoryClear,
+      kMaxK>;
+};
+
+template <
+    typename Shape,
+    typename IteratorA,
+    typename SmemIteratorA,
+    typename IteratorB,
+    typename SmemIteratorB,
+    typename ElementC,
+    typename LayoutC,
+    typename Policy,
+    int kMaxK>
+struct MakeCustomMma<
+    cutlass::gemm::threadblock::MmaPipelined<
+        Shape,
+        IteratorA,
+        SmemIteratorA,
+        IteratorB,
+        SmemIteratorB,
+        ElementC,
+        LayoutC,
+        Policy>,
+    kMaxK> {
+  using Mma = cutlass::gemm::threadblock::CustomMmaPipelined<
+      Shape,
+      IteratorA,
+      SmemIteratorA,
+      IteratorB,
+      SmemIteratorB,
+      ElementC,
+      LayoutC,
+      Policy>;
+};
diff --git a/static/include/kernels/mem_eff_attention/gemm/custom_mma_base.h b/static/include/kernels/mem_eff_attention/gemm/custom_mma_base.h
new file mode 100644
index 000000000..6c6d07819
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/gemm/custom_mma_base.h
@@ -0,0 +1,183 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<
+      Shape::kM / WarpGemm::kM,
+      Shape::kN / WarpGemm::kN,
+      Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM oeprations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  template <typename Element, typename OperandShape, typename OperandLayout>
+  struct OperandSharedStorage {
+    AlignedBuffer<Element, OperandShape::kCount> buffer;
+    using TensorRef = TensorRef<Element, OperandLayout>;
+
+    CUTLASS_DEVICE
+    static OperandLayout Layout() {
+      return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn});
+    }
+
+    /// Returns a TensorRef to the operand
+    CUTLASS_HOST_DEVICE
+    TensorRef ref() {
+      return TensorRef{buffer.data(), Layout()};
+    }
+  };
+
+  /// Shape of the A matrix operand in shared memory
+  using ShapeA = MatrixShape<
+      Shape::kM + Policy::SmemPaddingA::kRow,
+      Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
+
+  /// Shape of the B matrix operand in shared memory
+  using ShapeB = MatrixShape<
+      Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+      Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+  using SharedStorageA = OperandSharedStorage<
+      typename Operator::ElementA,
+      ShapeA,
+      typename Operator::LayoutA>;
+  using SharedStorageB = OperandSharedStorage<
+      typename Operator::ElementB,
+      ShapeB,
+      typename Operator::LayoutB>;
+  using TensorRefA = typename SharedStorageA::TensorRef;
+  using TensorRefB = typename SharedStorageB::TensorRef;
+
+  struct SharedStorage {
+    /// Buffer for A operand
+    SharedStorageA operand_A;
+
+    /// Buffer for B operand
+    SharedStorageB operand_B;
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  CustomMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorageA& shared_storageA,
+      SharedStorageB& shared_storageB,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/gemm/custom_mma_multistage.h b/static/include/kernels/mem_eff_attention/gemm/custom_mma_multistage.h
new file mode 100644
index 000000000..e5cdc88fa
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/gemm/custom_mma_multistage.h
@@ -0,0 +1,767 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "custom_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Upper boundon the K dimension
+    int kMaxK = cutlass::platform::numeric_limits<int>::max(),
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaMultistage : public CustomMmaBase<Shape_, Policy_, Stages> {
+ public:
+  ///< Base class
+  using Base = CustomMmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+    static_assert(
+        Base::kWarpGemmIterations > 1,
+        "The pipelined structure requires at least two warp-level "
+        "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) /
+        Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) /
+        Base::kWarpGemmIterations;
+  };
+
+  static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages;
+  static constexpr int kNumStagesConcurrentLoad =
+      kSmemContainsEntireMat ? Stages : Stages - 1;
+
+ private:
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  bool prologue_done_;
+
+  // Set to `True` to ensure the accumulator will be zero outside the GEMM
+  // footprint
+  bool zero_outside_bounds_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  CustomMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storageA.ref(), thread_idx),
+        smem_iterator_B_(shared_storageB.ref(), thread_idx),
+        prologue_done_(false),
+        zero_outside_bounds_(false) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+  CUTLASS_DEVICE
+  CustomMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage& st,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : CustomMmaMultistage(
+            st.operand_A,
+            st.operand_B,
+            thread_idx,
+            warp_idx,
+            lane_idx) {}
+
+  CUTLASS_DEVICE
+  bool set_prologue_done(bool value) {
+    prologue_done_ = value;
+  }
+
+  CUTLASS_DEVICE
+  bool set_zero_outside_bounds(bool value) {
+    zero_outside_bounds_ = value;
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    prologue<kLoadA, kLoadB>(
+        shared_storage.operand_A,
+        shared_storage.operand_B,
+        iterator_A,
+        iterator_B,
+        thread_idx,
+        problem_size_k);
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx);
+    SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx);
+    int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK;
+    _prologue<kLoadA, kLoadB>(
+        iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B);
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(
+      IteratorA& iterator_A,
+      IteratorB& iterator_B,
+      int group_start_A = 0,
+      int group_start_B = 0) {
+    iterator_A.set_iteration_index(
+        group_start_A * IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+            IteratorA::ThreadMap::kElementsPerAccess /
+            IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          if (zero_outside_bounds_ ||
+              SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(
+        group_start_B * IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+            IteratorB::ThreadMap::kElementsPerAccess /
+            IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          if (zero_outside_bounds_ ||
+              SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void _prologue(
+      IteratorA& iterator_A,
+      IteratorB& iterator_B,
+      int32_t& gemm_k_iterations,
+      SmemIteratorA& smem_iterator_A_,
+      SmemIteratorB& smem_iterator_B_) {
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < kNumStagesConcurrentLoad;
+         ++stage, --gemm_k_iterations) {
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
+
+          if (kLoadA) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, iterator_A.get(), iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          if (kLoadB) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, iterator_B.get(), iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+
+        ++smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      smem_iterator_A_.add_tile_offset({0, 1});
+      smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC& accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const& src_accum) {
+    //
+    // Prologue
+    //
+
+    if (!prologue_done_) {
+      _prologue<true, true>(
+          iterator_A,
+          iterator_B,
+          gemm_k_iterations,
+          smem_iterator_A_,
+          smem_iterator_B_);
+    } else if (!kSmemContainsEntireMat) {
+      _prologue<false, false>(
+          iterator_A,
+          iterator_B,
+          gemm_k_iterations,
+          smem_iterator_A_,
+          smem_iterator_B_);
+    } else {
+      gemm_k_iterations -= kNumStagesConcurrentLoad;
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    //
+    // Clear the remaining tiles of SMEM. This is a functional requirement for
+    // some kernels so that all accumulator elements outside the GEMM footprint
+    // are zero.
+    //
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
+      /// Iterator to write threadblock-scoped tile of A operand to shared
+      /// memory
+      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
+
+      typename IteratorA::AccessType zero_A;
+      zero_A.clear();
+
+      last_smem_iterator_A.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                last_smem_iterator_A.get());
+
+        *dst_ptr = zero_A;
+
+        ++last_smem_iterator_A;
+      }
+
+      /// Iterator to write threadblock-scoped tile of B operand to shared
+      /// memory
+      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
+      typename IteratorB::AccessType zero_B;
+
+      zero_B.clear();
+      last_smem_iterator_B.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                last_smem_iterator_B.get());
+
+        *dst_ptr = zero_B;
+
+        ++last_smem_iterator_B;
+      }
+    }
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(
+        warp_transformed_frag_A[0],
+        warp_transformed_frag_B[0],
+        warp_loaded_frag_A[0],
+        warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        // In case of a non-circular buffer ("kSmemContainsEntireMat")
+        // make sure we don't load out of bounds data.
+        if (!kSmemContainsEntireMat ||
+            gemm_k_iterations > (-kNumStagesConcurrentLoad) ||
+            warp_mma_k < Base::kWarpGemmIterations - 1) {
+          this->warp_tile_iterator_A_.load(
+              warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+        }
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              warp_loaded_frag_A[warp_mma_k % 2],
+              warp_loaded_frag_B[warp_mma_k % 2]);
+
+        if (platform::is_same<
+                typename Operator::MathOperator,
+                arch::OpMultiplyAddFastF32>::value ||
+            platform::is_same<
+                typename Operator::MathOperator,
+                arch::OpMultiplyAddComplexFastF32>::value) {
+          warp_mma(
+              tmp_accum,
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              tmp_accum);
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+              accum,
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              accum);
+        }
+
+        // Issue global->shared copies for the this stage
+        if (!kSmemContainsEntireMat &&
+            warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(
+              iterator_A,
+              iterator_B,
+              group_start_iteration_A,
+              group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          if (!kSmemContainsEntireMat) {
+            int group_start_iteration_A, group_start_iteration_B;
+            group_start_iteration_A =
+                (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+            group_start_iteration_B =
+                (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+            copy_tiles_and_advance(
+                iterator_A,
+                iterator_B,
+                group_start_iteration_A,
+                group_start_iteration_B);
+          }
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (!kSmemContainsEntireMat &&
+              smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0,
+                 -Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(
+              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+    }
+
+    if (platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+      // commit and drain all pending and predicated cp.async pnz from the GEMM
+      // mainloop
+      cutlass::arch::cp_async_fence();
+      cutlass::arch::cp_async_wait<0>();
+      __syncthreads();
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/gemm/custom_mma_pipelined.h b/static/include/kernels/mem_eff_attention/gemm/custom_mma_pipelined.h
new file mode 100644
index 000000000..73112e9a2
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/gemm/custom_mma_pipelined.h
@@ -0,0 +1,401 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "custom_mma_base.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to A operand
+    typename TransformA_ = NumericArrayConverter<
+        typename SmemIteratorA_::Element,
+        typename IteratorA_::Element,
+        IteratorA_::Fragment::kElements>,
+    ///
+    /// Transformation applied to B operand
+    typename TransformB_ = NumericArrayConverter<
+        typename SmemIteratorB_::Element,
+        typename IteratorB_::Element,
+        IteratorB_::Fragment::kElements>,
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaPipelined : public CustomMmaBase<Shape_, Policy_, 2> {
+ public:
+  ///< Base class
+  using Base = CustomMmaBase<Shape_, Policy_, 2>;
+
+  using Shape =
+      Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA =
+      IteratorA_; ///< Iterates over tiles of A operand in global memory
+  using IteratorB =
+      IteratorB_; ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_; ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+  using Policy = Policy_; ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert(
+      (Base::kStages == 2),
+      "MmaPipelined requires kStages set to value 2");
+
+  static bool const kSmemContainsEntireMat = false;
+
+ private:
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ protected:
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  CustomMmaPipelined(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      int thread_idx, ///< ID within the threadblock
+      int warp_idx, ///< ID of warp
+      int lane_idx ///< ID of each thread within a warp
+      )
+      : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storageA.ref(), thread_idx),
+        smem_iterator_B_(shared_storageB.ref(), thread_idx) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+  CUTLASS_DEVICE
+  CustomMmaPipelined(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage& st,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : CustomMmaPipelined(
+            st.operand_A,
+            st.operand_B,
+            thread_idx,
+            warp_idx,
+            lane_idx) {}
+
+  CUTLASS_DEVICE
+  bool set_prologue_done(bool value) {
+    // NOT IMPLEMENTED FOR PIPELINED
+  }
+
+  CUTLASS_DEVICE
+  bool set_zero_outside_bounds(bool value) {
+    // NOT NEEDED FOR PIPELINED
+    // shared memory will always be zero-filled
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    prologue<kLoadA, kLoadB>(
+        shared_storage.operand_A,
+        shared_storage.operand_B,
+        iterator_A,
+        iterator_B,
+        thread_idx,
+        problem_size_k);
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    // NOT IMPLEMENTED FOR PIPELINED
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      int gemm_k_iterations, ///< number of iterations of the mainloop
+      FragmentC& accum, ///< destination accumulator tile
+      IteratorA iterator_A, ///< iterator over A operand in global memory
+      IteratorB iterator_B, ///< iterator over B operand in global memory
+      FragmentC const& src_accum, ///< source accumulator tile
+      TransformA transform_A =
+          TransformA(), ///< transformation applied to A fragment
+      TransformB transform_B =
+          TransformB()) { ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_A;
+    ++iterator_B;
+
+    this->smem_iterator_A_.store(transform_A(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations <= 1);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER*
+    // issuing shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          } else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0,
+                 -Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+          iterator_A.load(tb_frag_A);
+          iterator_B.load(tb_frag_B);
+
+          ++iterator_A;
+          ++iterator_B;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations <= 2);
+          iterator_B.clear_mask(gemm_k_iterations <= 2);
+        }
+
+        warp_mma(
+            accum,
+            warp_frag_A[warp_mma_k % 2],
+            warp_frag_B[warp_mma_k % 2],
+            accum);
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/gemm_kernel_utils.h b/static/include/kernels/mem_eff_attention/gemm_kernel_utils.h
new file mode 100644
index 000000000..931a4bb2a
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/gemm_kernel_utils.h
@@ -0,0 +1,296 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Some helper functions
+////////////////////////////////////////////////////////////////////////////////
+#define DISPATCH_TYPES(tensor, func)                                        \
+  {                                                                         \
+    if (query.scalar_type() == at::ScalarType::Float) {                     \
+      using scalar_t = float;                                               \
+      func();                                                               \
+    } else if (query.scalar_type() == at::ScalarType::Half) {               \
+      using scalar_t = cutlass::half_t;                                     \
+      func();                                                               \
+    } else if (query.scalar_type() == at::ScalarType::BFloat16) {           \
+      using scalar_t = cutlass::bfloat16_t;                                 \
+      func();                                                               \
+    } else {                                                                \
+      TORCH_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \
+    }                                                                       \
+  }
+
+#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \
+  {                                         \
+    if (BOOL_V) {                           \
+      constexpr bool BOOL_NAME = true;      \
+      F();                                  \
+    } else {                                \
+      constexpr bool BOOL_NAME = false;     \
+      F();                                  \
+    }                                       \
+  }
+#define DISPATCH_ARCHTAG(CC, func)                                        \
+  {                                                                       \
+    if (CC >= 80) {                                                       \
+      using ArchTag = cutlass::arch::Sm80;                                \
+      func();                                                             \
+    } else if (CC >= 75) {                                                \
+      using ArchTag = cutlass::arch::Sm75;                                \
+      func();                                                             \
+    } else if (CC >= 70) {                                                \
+      using ArchTag = cutlass::arch::Sm70;                                \
+      func();                                                             \
+    } else if (CC >= 50) {                                                \
+      using ArchTag = cutlass::arch::Sm50;                                \
+      func();                                                             \
+    } else {                                                              \
+      TORCH_CHECK(                                                        \
+          false,                                                          \
+          "Your device is too old. We require compute capability >= 50"); \
+    }                                                                     \
+  }
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                         \
+  TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  TORCH_CHECK(TENSOR.is_contiguous());
+
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                     \
+  TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  TORCH_CHECK(                                                         \
+      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+#ifdef HAS_PYTORCH
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \
+  TORCH_CHECK(uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned")
+#define XFORMERS_CHECK TORCH_CHECK
+#elif defined(__CUDACC_RTC__)
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)  \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \
+    return false;                          \
+  }
+#define XFORMERS_CHECK(COND, ERR) \
+  if (!(COND)) {                  \
+    return false;                 \
+  }
+#else
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)            \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) {           \
+    std::cerr << #PTR " is not correctly aligned\n"; \
+    return false;                                    \
+  }
+#define XFORMERS_CHECK(COND, ERR)   \
+  if (!(COND)) {                    \
+    std::cerr << #COND " failed\n"; \
+    return false;                   \
+  }
+#endif
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                \
+  {                                                                \
+    A = B;                                                         \
+    TORCH_CHECK(                                                   \
+        B < cutlass::platform::numeric_limits<decltype(A)>::max(), \
+        #B " overflows");                                          \
+  }
+
+namespace gemm_kernel_utils {
+
+#ifdef HAS_PYTORCH
+template <typename scalar_t>
+struct TypeTraits;
+
+template <>
+struct TypeTraits<cutlass::half_t> {
+  using scalar_t = cutlass::half_t;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Half;
+  }
+  template <int nDim>
+  static __host__ at::PackedTensorAccessor32<scalar_t, nDim> packed_accessor(
+      at::Tensor const& tensor) {
+    return at::PackedTensorAccessor32<scalar_t, nDim>(
+        (scalar_t*)(tensor.data_ptr()),
+        tensor.sizes().data(),
+        tensor.strides().data());
+  }
+};
+
+template <>
+struct TypeTraits<cutlass::bfloat16_t> {
+  using scalar_t = cutlass::bfloat16_t;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::BFloat16;
+  }
+  template <int nDim>
+  static __host__ at::PackedTensorAccessor32<scalar_t, nDim> packed_accessor(
+      at::Tensor const& tensor) {
+    return at::PackedTensorAccessor32<scalar_t, nDim>(
+        (scalar_t*)(tensor.data_ptr()),
+        tensor.sizes().data(),
+        tensor.strides().data());
+  }
+};
+
+template <>
+struct TypeTraits<float> {
+  using scalar_t = float;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Float;
+  }
+  template <int nDim>
+  static __host__ at::PackedTensorAccessor32<scalar_t, nDim> packed_accessor(
+      at::Tensor const& tensor) {
+    return tensor.packed_accessor32<scalar_t, nDim>();
+  }
+};
+#endif
+
+template <typename integer>
+constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) {
+  return (n + m - 1) / m;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Determine the type of GEMM we do (TensorCores or not, Shapes ...)
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates
+////////////////////////////////////////////////////////////////////////////////
+
+// Fallback to Simt (FMA on cuda cores) if not in a special case below
+template <typename ArchTag, typename scalar_t_, typename Enable = void>
+struct DefaultGemmType {
+  static constexpr int ThreadK = 8;
+  static constexpr int WarpK = 8;
+  static constexpr int kMinimumAlignment = 1;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Specialization for tensorcores with f32
+template <typename ArchTag>
+struct DefaultGemmType<
+    ArchTag,
+    float,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 80>::type> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 4;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAddFastF32;
+};
+
+// Specialization for tensorcores with f16/bf16 - Sm75+
+template <typename ArchTag, typename scalar_t>
+struct DefaultGemmType<
+    ArchTag,
+    scalar_t,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 75 &&
+        cutlass::sizeof_bits<scalar_t>::value == 16>::type> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 4;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Specialization for tensorcores with f16 - Volta
+template <>
+struct DefaultGemmType<cutlass::arch::Sm70, cutlass::half_t, void> {
+  static constexpr int ThreadK = 32;
+  static constexpr int WarpK = 32;
+  static constexpr int kMinimumAlignment = 2;
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+};
+
+// Enables to do
+// `auto x = kCondition ? fa(arg) : fb(arg)`
+// when `fa` and `fb` have different types
+template <bool kVal, typename TA, typename TB>
+struct call_conditional;
+
+template <typename TA, typename TB>
+struct call_conditional<true, TA, TB> {
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(ta(arg)) {
+    return ta(arg);
+  }
+};
+
+template <typename TA, typename TB>
+struct call_conditional<false, TA, TB> {
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(tb(arg)) {
+    return tb(arg);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Mark a variable as warp-uniform - enables some compiler optimizations
+// The cheapest way to do it is just to broadcast it from lane 0
+////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_DEVICE int32_t warp_uniform(int32_t value) {
+  return (int32_t)__shfl_sync(0xffffffff, (unsigned)value, 0);
+}
+
+template <typename T>
+CUTLASS_DEVICE T* warp_uniform(T* ptr) {
+  struct {
+    union {
+      T* ptr;
+      uint32_t asInt[2];
+    };
+  } p;
+  p.ptr = ptr;
+  p.asInt[0] = warp_uniform(p.asInt[0]);
+  p.asInt[1] = warp_uniform(p.asInt[1]);
+  return p.ptr;
+}
+} // namespace gemm_kernel_utils
diff --git a/static/include/kernels/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h b/static/include/kernels/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
new file mode 100644
index 000000000..44f38dbcb
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
@@ -0,0 +1,752 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue iterator that supports prefetching
+
+  Mostly copied from "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+*/
+
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in
+/// epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator |
+/// ForwardTileIterator
+///
+template <
+    typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
+    typename Element_, ///< Element data type
+    bool ScatterD = false, ///< Scatter D operand or not
+    bool UseCUDAStore = false>
+class PredicatedTileIteratorPrefetch {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert(
+      ThreadMap::Iterations::kRow > 0,
+      "ThreadMap::Iterations::kRow must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kGroup > 0,
+      "ThreadMap::Iterations::kGroup must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kCluster > 0,
+      "ThreadMap::Iterations::kCluster must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kColumn > 0,
+      "ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+      Element,
+      ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow *
+          ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster *
+          ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : PredicatedTileIteratorParams(
+              layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+              make_OutputTileThreadMapDesc<ThreadMap>()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+  /// Mask object
+  struct Mask {
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    ///< Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer
+  uint8_t* byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Scatter indices
+  int const* indices_;
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(
+      sizeof(PredicatedTileIteratorParams::stride) == 8,
+      "Expected 64b strides");
+
+ private:
+  //
+  // Methods
+  //
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  PredicatedTileIteratorPrefetch(
+      PredicatedTileIteratorParams const& params,
+      Element* pointer,
+      TensorCoord extent,
+      int thread_idx,
+      TensorCoord threadblock_offset = TensorCoord(),
+      int const* indices = nullptr)
+      : params_(params), indices_(indices) {
+    TensorCoord thread_offset =
+        ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+
+    // Initialize predicates
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+      mask_.predicates[c] =
+          ((thread_offset.column() + ThreadMap::Delta::kColumn * c) <
+           extent.column());
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer) {
+      mask_.clear();
+    }
+
+    if (ScatterD && !indices) {
+      mask_.clear();
+    }
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+        LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
+        LongIndex(thread_offset.column()) * sizeof(AccessType) /
+            kElementsPerAccess;
+
+    if (ScatterD) {
+      byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+          LongIndex(thread_offset.column()) * sizeof(AccessType) /
+              kElementsPerAccess;
+    }
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_all() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < kIterations; ++iter) {
+      prefetch();
+      ++(*this);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void prefetch() {
+    uint8_t* byte_pointer = byte_pointer_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            // on windows using unsigned long here gives the error
+            // error: asm operand type size(4) does not match
+            // type/size implied by constraint 'l'
+            uint64_t addr = (uint64_t)((void*)&memory_pointer
+                                           [column * ThreadMap::Delta::kColumn /
+                                            kElementsPerAccess]);
+            asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            if (UseCUDAStore) {
+              if (guard) {
+                memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
+                        frag_ptr
+                            [frag_row_idx * ThreadMap::Iterations::kColumn +
+                             column];
+              }
+            } else {
+              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                  frag_ptr
+                      [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                  (void*)&memory_pointer
+                      [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                  guard);
+            }
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) const {
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void downsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+
+          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
+              (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void upsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+          int row_add_P = add_P;
+          int row_add_Q = add_Q;
+          if (output_P > convolution_P - 2)
+            row_add_P = 0;
+          if (output_Q > convolution_Q - 2)
+            row_add_Q = 0;
+
+          int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) +
+              ((output_P + row_add_P) / 2) * (convolution_Q / 2) +
+              (output_Q + row_add_Q) / 2;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return thread_start_row_;
+  }
+
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
+  /// Extent of the matrix in rows
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return extent_row_;
+  }
+
+  /// Extent of the matrix in columns
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
+  /// Advances to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorPrefetch& operator++() {
+    ++state_[0];
+
+    if (!ScatterD) {
+      byte_pointer_ += params_.advance_row;
+    }
+
+    thread_start_row_ += ThreadMap::Shape::kRow;
+
+    if (state_[0] == ThreadMap::Count::kRow) {
+      state_[0] = 0;
+      ++state_[1];
+      byte_pointer_ += params_.advance_group;
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+          ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+        state_[1] = 0;
+        ++state_[2];
+        byte_pointer_ += params_.advance_cluster;
+
+        thread_start_row_ += ThreadMap::Count::kGroup *
+            ThreadMap::Shape::kGroup * ThreadMap::Count::kRow *
+            ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+          byte_pointer_ += params_.advance_tile;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  ///< Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  ///< Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void get_mask(Mask& mask) const {
+    mask = mask_;
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void set_mask(Mask const& mask) {
+    mask_ = mask;
+  }
+};
+
+template <typename IT>
+struct MakePrefetchableIterator {
+  using Iterator = PredicatedTileIteratorPrefetch<
+      typename IT::ThreadMap,
+      typename IT::Element>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/iterators/make_residual_last.h b/static/include/kernels/mem_eff_attention/iterators/make_residual_last.h
new file mode 100644
index 000000000..3926cc1a0
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/iterators/make_residual_last.h
@@ -0,0 +1,98 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holdvr nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "predicated_tile_access_iterator_residual_last.h"
+#include "predicated_tile_iterator_residual_last.h"
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+template <typename BaseIterator>
+struct MakeIteratorResidualLast;
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    int AccessSize,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessSize,
+    Gather>> {
+  using Iterator = PredicatedTileIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessSize,
+      Gather>;
+};
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileAccessIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessType,
+    Gather>> {
+  using Iterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+};
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
diff --git a/static/include/kernels/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h b/static/include/kernels/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h
new file mode 100644
index 000000000..d49bf83e9
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h
@@ -0,0 +1,2115 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile
+   this iterator visits maybe partial, then the remaining tiles are complete.
+   So, we only need to compute the predicates twice, once before the first tile
+   and once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIteratorResidualLast
+///
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather = false>
+class PredicatedTileAccessIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear
+/// data.
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+    using Base = PredicatedTileAccessIteratorParams;
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : Base(
+              layout.stride(0),
+              MakePredicatedTileAccessIteratorDesc<
+                  Shape,
+                  Element,
+                  Layout,
+                  kAdvanceRank,
+                  ThreadMap>()()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+  /// Parameters object with precomputed internal state
+  Params const& params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+  ///
+
+  /// Gather indices
+  int const* indices_;
+
+  Index gather_offset_strided;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent),
+        indices_(indices) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    the_predicates.get_mask(residual_tile_mask);
+
+    // Working around a weird compiler bug happening on P100 for the backward.
+    // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15)
+    // residual_tile_mask[0] = 15 (correct)
+    //
+    // Adding prints when the value is calculated (in `compute_predicates_`)
+    // sometimes removes the bug. The consequence is that we skip some
+    // element of a tensor, leading to wrong results
+    // Setting `compute_predicates_`'s second argument (`is_steady_state`) to
+    // true also seems to get rid of the bug - at the cost of twice as many
+    // comparisons.
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+    constexpr bool kWorkAroundCompilerBug = false;
+#else
+    constexpr bool kWorkAroundCompilerBug = true;
+#endif
+    the_predicates.compute_predicates_(extent, true && !kWorkAroundCompilerBug);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+
+    if (!Gather) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      gather_offset_strided = the_predicates.thread_offset_.strided();
+      add_pointer_offset(
+          layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0)));
+    }
+  }
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    if (is_residual_tile) {
+      the_predicates.set_mask(residual_tile_mask);
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (!Gather) {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+      gather_offset_strided += Shape::kStrided * tile_offset.strided();
+    }
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    if (Gather) {
+      assert(indices_);
+
+      if (!valid()) {
+        return nullptr;
+      }
+
+      LongIndex contiguous_offset = the_predicates.iteration_contiguous_ *
+              (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value /
+               8) +
+          the_predicates.iteration_vector_;
+      int strided_index = gather_offset_strided +
+          the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+
+      LongIndex strided_offset = indices_[strided_index] *
+          LongIndex(params_.stride_) * sizeof_bits<Element>::value / 8;
+
+      return reinterpret_cast<AccessType*>(
+          pointer_ + contiguous_offset + strided_offset);
+    }
+
+    return reinterpret_cast<AccessType*>(
+               pointer_ +
+               the_predicates.iteration_contiguous_ *
+                   (ThreadMap::Delta::kContiguous *
+                    sizeof_bits<Element>::value) /
+                   8) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather) {
+        pointer_ += params_.inc_strided_;
+      }
+
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    if (!Gather) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+
+      // now return to start tile - if the iterator is subsequently advanced,
+      // this subtraction as well as the subsequent integer addition are both
+      // elided by the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      layout::PitchLinear,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in byte) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params()
+        : stride_(0),
+          inc_contiguous_(0),
+          inc_strided_(0),
+          inc_next_(0),
+          inc_advance_(0) {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ =
+          (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+          sizeof_bits<Element>::value / 8;
+
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+          sizeof_bits<Element>::value / 8;
+
+      inc_next_strided_ = inc_strided_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ = Shape::kStrided * LongIndex(stride_[1]) *
+            sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ =
+            Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ -
+          LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    };
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const& params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    if (is_residual_tile) {
+      the_predicates.set_mask(residual_tile_mask);
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (kAdvanceRank) {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+      pointer_ += Shape::kContiguous * tile_offset[0];
+    } else {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+      pointer_ += Shape::kStrided * tile_offset[1];
+    }
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(pointer_) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2
+/// row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// interleaved data. It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// interleaved data.
+//  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h b/static/include/kernels/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h
new file mode 100644
index 000000000..4bb96a139
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h
@@ -0,0 +1,2120 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2
+   tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile
+   this iterator visits maybe partial, then the remaining tiles are complete.
+   So, we only need to compute the predicates twice, once before the first tile
+   and once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIteratorResidualLast
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize
+/// register liveness and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params"
+/// object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is
+/// constructed. Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator
+/// is constructed. Subsequent additions to logical coordinate offset may be
+/// performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be
+/// partially full in both the advance dimension and the steady-state dimension.
+/// This is assumed to be the last tile in the iteration sequence. Advancing an
+/// iterator that has just been constructed moves to the first tile that is full
+/// in the advance dimension and recomputes predicates. Subsequent accesses may
+/// be performed without updating internal predicates and are efficient in terms
+/// of live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced
+/// at least once outside any looping structure to minimize integer arithmetic.
+///
+/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to
+/// dereferencing the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update
+//   internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks -
+//       subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to
+//     steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator =
+//   transform::threadblock::PredicatedTileIteratorResidualLast;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    int AccessSize = ThreadMap::kElementsPerAccess,
+    bool Gather = false>
+class PredicatedTileIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : params_(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset,
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    load_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const* byte_ptr =
+              reinterpret_cast<char const*>(address_iterator_.get()) +
+              byte_offset;
+
+          AccessType const* access_ptr =
+              reinterpret_cast<AccessType const*>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    store_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char* byte_ptr =
+              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
+          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices = nullptr ///< Gather indices
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset(make_Coord(0, 1));
+    else
+      address_iterator_.add_tile_offset(make_Coord(1, 0));
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    load_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const* byte_ptr =
+              reinterpret_cast<char const*>(address_iterator_.get()) +
+              byte_offset;
+
+          AccessType const* access_ptr =
+              reinterpret_cast<AccessType const*>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    store_with_byte_offset(
+        frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char* byte_ptr =
+              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
+          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32
+/// data.  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices, note no support for
+                  ///< gather/scatter at this specialization
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/mem_eff_attention/kernel_forward.h b/static/include/kernels/mem_eff_attention/kernel_forward.h
index 33f777cc7..3ca842acb 100644
--- a/static/include/kernels/mem_eff_attention/kernel_forward.h
+++ b/static/include/kernels/mem_eff_attention/kernel_forward.h
@@ -67,8 +67,6 @@
 #include "cutlass/platform/platform.h"
 #include "cutlass/transform/threadblock/predicated_tile_iterator.h"
 
-// From
-// fbcode/aitemplate/AITemplate/fb/3rdparty/cutlass/examples/41_fused_multi_head_attention/
 #include "debug_utils.h"
 #include "epilogue_pipelined.h"
 #include "epilogue_rescale_output.h"
diff --git a/static/include/kernels/mem_eff_attention/mma_from_smem.h b/static/include/kernels/mem_eff_attention/mma_from_smem.h
new file mode 100644
index 000000000..21ac4d104
--- /dev/null
+++ b/static/include/kernels/mem_eff_attention/mma_from_smem.h
@@ -0,0 +1,1780 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+
+#include "attention_scaling_coefs_updater.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h"
+#include "epilogue_thread_apply_logsumexp.h"
+#include "gemm_kernel_utils.h"
+#include "iterators/make_residual_last.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/// Shared storage object needed by accumulator
+/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+template <
+    typename Shape_,
+    typename Element_,
+    typename Layout_,
+    typename Padding_>
+class AccumulatorSharedStorage {
+ public:
+  //
+  // Type definitions
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using Padding = Padding_;
+
+  /// Tensor reference to the accumulator
+  using TensorRefAccum = cutlass::TensorRef<Element, Layout>;
+
+  /// Shape of the accumulator matrix in shared memory
+  using ShapeAccum = cutlass::
+      MatrixShape<Shape::kM + Padding::kRow, Shape::kN + Padding::kColumn>;
+
+ public:
+  //
+  // Data members
+  //
+
+  /// Buffer for accumulator
+  cutlass::AlignedBuffer<Element, ShapeAccum::kCount> accum;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Returns a layout object for the Accum matrix
+  CUTLASS_DEVICE
+  static Layout LayoutAccum() {
+    return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn});
+  }
+
+  /// Returns a TensorRef to the Accumulator
+  CUTLASS_HOST_DEVICE
+  TensorRefAccum accum_ref() {
+    return TensorRefAccum{accum.data(), LayoutAccum()};
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // Maximum value for K
+    int kMaxK,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBaseFromSharedMemory {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<
+      Shape::kM / WarpGemm::kM,
+      Shape::kN / WarpGemm::kN,
+      Shape::kK / WarpGemm::kK>;
+  using WarpCount1 = WarpCount;
+
+  /// Number of warp-level GEMM oeprations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+  static int const kWarpGemmIterations1 = kWarpGemmIterations;
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// If this is true, we fill the entire shmem buffer at start
+  /// and don't need to iterate through it in a circular fashion
+  static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA =
+      TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB =
+      TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB = MatrixShape<
+        Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+        Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  // /// Iterator to load a warp-scoped tile of A operand from shared memory
+  // typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaBaseFromSharedMemory(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage& shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // BEGIN smem
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA,
+    // Accumulator type
+    typename AccumulatorSharedStorage,
+    // END smem
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to B operand
+    typename TransformB_ = NumericArrayConverter<
+        typename SmemIteratorB_::Element,
+        typename IteratorB_::Element,
+        IteratorB_::Fragment::kElements>,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
+                                         Shape_,
+                                         AccumulatorSharedStorage::Shape::kN,
+                                         Policy_,
+                                         2> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<
+      Shape_,
+      AccumulatorSharedStorage::Shape::kN,
+      Policy_,
+      2>;
+
+  using Shape =
+      Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorB =
+      IteratorB_; ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_; ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+  using Policy = Policy_; ///< Policy describing tuning details
+
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert(
+      (Base::kStages == 2),
+      "MmaPipelined requires kStages set to value 2");
+
+ private:
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ protected:
+  // /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  // SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  /// Iterator to load a warp-scoped tile of A operand from intermediate
+  /// accumulator tile
+  WarpIteratorA warp_tile_iterator_A_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaPipelinedFromSharedMemory(
+      typename Base::SharedStorage&
+          shared_storage, ///< Shared storage needed for internal use by
+                          ///< threadblock-scoped GEMM
+      AccumulatorSharedStorage& accumulator_shared_storage,
+      int thread_idx, ///< ID within the threadblock
+      int warp_idx, ///< ID of warp
+      int lane_idx, ///< ID of each thread within a warp
+      int problem_size_0_n)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A_(accumulator_shared_storage.accum_ref(), lane_idx),
+        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  // For API compatibility with MmaMultistageFromSharedMemory
+  // but not supported as it worsens perf: older gpus < sm80 don't
+  // support async tranfers and have to waste registers
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {}
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {}
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      int gemm_k_iterations, ///< number of iterations of the mainloop
+      FragmentC& accum, ///< destination accumulator tile
+      // IteratorA iterator_A,                             ///< iterator over A
+      // operand in global memory
+      IteratorB iterator_B, ///< iterator over B operand in global memory
+      FragmentC const& src_accum, ///< source accumulator tile
+      // TransformA transform_A = TransformA(),            ///< transformation
+      // applied to A fragment
+      TransformB transform_B =
+          TransformB()) { ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentB tb_frag_B;
+
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_B.set_residual_tile(gemm_k_iterations == 1);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_B;
+
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+    warp_frag_A[0].clear();
+    warp_frag_B[0].clear();
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_B.set_residual_tile(gemm_k_iterations == 2);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER*
+    // issuing shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        bool hasNext = true;
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          // Write fragments to shared memory
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory SMEM: Don't reset iterator A, as
+          // we are continuing our iteration at this point
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          } else {
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+          hasNext = gemm_k_iterations > 1;
+        }
+
+        // Only read the next if we need to
+        if (hasNext) {
+          this->warp_tile_iterator_B_.set_kgroup_index(
+              (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+          this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+          ++this->warp_tile_iterator_A_;
+          ++this->warp_tile_iterator_B_;
+
+          if (warp_mma_k == 0) {
+            iterator_B.load(tb_frag_B);
+
+            ++iterator_B;
+
+            // Avoid reading out of bounds if this was the last loop iteration
+            iterator_B.set_residual_tile(gemm_k_iterations == 3);
+            iterator_B.clear_mask(gemm_k_iterations <= 2);
+          }
+        }
+
+        warp_mma(
+            accum,
+            warp_frag_A[warp_mma_k % 2],
+            warp_frag_B[warp_mma_k % 2],
+            accum);
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA1_,
+    // Accumulator type
+    typename AccumulatorSharedStorage,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages_,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory<
+                                          Shape1_,
+                                          AccumulatorSharedStorage::Shape::kN,
+                                          Policy1_,
+                                          Stages_> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<
+      Shape1_,
+      AccumulatorSharedStorage::Shape::kN,
+      Policy1_,
+      Stages_>;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  using IteratorB = IteratorB1;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate
+                                          ///< accumulator tile in shared memory
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+  static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+  using FragmentC = FragmentC1;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+    static_assert(
+        Base::kWarpGemmIterations1 > 1,
+        "The pipelined structure requires at least two warp-level "
+        "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLoadIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) /
+        Base::kWarpGemmIterations1;
+  };
+
+  static constexpr int kNumStagesConcurrentLoad =
+      kSmemContainsEntireB ? Base::kStages : Base::kStages - 1;
+
+ private:
+  using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate
+  /// accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+  bool prologue_done_;
+
+ public:
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaMultistageFromSharedMemory(
+      typename Base::SharedStorage&
+          shared_storage, ///< Shared storage needed for internal use by
+                          ///< threadblock-scoped GEMM
+      AccumulatorSharedStorage& accumulator_shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx,
+      ///< GEMM0 N is used for accumulator extent
+      int problem_size_0_n)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A1_(
+            accumulator_shared_storage.accum_ref(),
+            lane_idx),
+        smem_iterator_B1_(shared_storage.operand_B_ref(), thread_idx),
+        prologue_done_(false) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn_1 =
+        warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
+    int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
+
+    int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
+    int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    warp_tile_iterator_A1_.add_tile_offset(
+        {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
+  }
+
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {
+    prologue_done_ = value;
+  }
+
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {
+    SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx);
+    _prologue(
+        iterator_B1,
+        (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK,
+        smem_iterator_B1);
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(
+      IteratorB1& iterator_B1,
+      int group_start_B1 = 0) {
+    iterator_B1.set_iteration_index(
+        group_start_B1 * IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    // Load for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::TBLoadIterationsB1) {
+        typename IteratorB1::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType*>(
+                this->smem_iterator_B1_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+            IteratorB1::ThreadMap::kElementsPerAccess /
+            IteratorB1::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B1.get();
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  static void _prologue(
+      IteratorB& iterator_B1,
+      int32_t gemm_k_iterations_1,
+      SmemIteratorB1& smem_iterator_B1_) {
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < kNumStagesConcurrentLoad;
+         ++stage, --gemm_k_iterations_1) {
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      iterator_B1.set_iteration_index(0);
+      smem_iterator_B1_.set_iteration_index(0);
+
+      // Load for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
+        typename IteratorB1::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType*>(
+                smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB1::Element>::value *
+              IteratorB1::ThreadMap::kElementsPerAccess /
+              IteratorB1::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+
+        ++smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+
+      smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+    iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_1_,
+      ///< destination accumulator tile
+      FragmentC1& accum,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC1 const& src_accum) {
+    // 2nd Gemm
+
+    //
+    // Prologue
+    //
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    if (!prologue_done_) {
+      _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_);
+    } else if (!kSmemContainsEntireB) {
+      // Restore the iterators increments
+
+      int gemm_k_iterations_1 = gemm_k_iterations_1_;
+      // Issue several complete stages
+      CUTLASS_PRAGMA_UNROLL
+      for (int stage = 0; stage < kNumStagesConcurrentLoad;
+           ++stage, --gemm_k_iterations_1) {
+        iterator_B1.set_iteration_index(0);
+        this->smem_iterator_B1_.set_iteration_index(0);
+
+        // Load for operand B
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+            ++iterator_B1;
+          }
+          ++this->smem_iterator_B1_;
+        }
+        iterator_B1.add_tile_offset({1, 0});
+        this->smem_iterator_B1_.add_tile_offset({1, 0});
+      }
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 <= 0);
+    }
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B_;
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma1.transform(
+        warp_transformed_frag_A1[0],
+        warp_transformed_frag_B1[0],
+        warp_loaded_frag_A1[0],
+        warp_loaded_frag_B1[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC1> plus_accum;
+
+    FragmentC1 tmp_accum;
+
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1);
+         gemm_k_iterations_1 > (-Base::kStages + 1);
+         gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+        // Load warp-level tile from accumulator fragment (A)
+        // or shared memory (operand B)
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        // skip warp tile loading for the last kgroup (we are out of the buf)
+        if (gemm_k_iterations_1 > (-Base::kStages + 2) ||
+            warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          warp_tile_iterator_A1_.load(
+              warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        }
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              warp_loaded_frag_A1[warp_mma_k % 2],
+              warp_loaded_frag_B1[warp_mma_k % 2]);
+
+        if (platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddFastF32>::value ||
+            platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddComplexFastF32>::value) {
+          warp_mma1(
+              tmp_accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              tmp_accum);
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma1(
+              accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              accum);
+        }
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (!kSmemContainsEntireB) {
+            if (smem_write_stage_idx == (Base::kStages - 1)) {
+              this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+              smem_write_stage_idx = 0;
+            } else {
+              ++smem_write_stage_idx;
+            }
+
+            if (smem_read_stage_idx == (Base::kStages - 1)) {
+              this->warp_tile_iterator_B_.add_tile_offset(
+                  {-Base::kStages * Policy1::kPartitionsK *
+                       Base::kWarpGemmIterations1,
+                   0});
+              smem_read_stage_idx = 0;
+            } else {
+              ++smem_read_stage_idx;
+            }
+          }
+
+          iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2);
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+              warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+    }
+
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+  }
+};
+
+template <
+    typename WarpShape,
+    typename InstructionShape,
+    typename RegularWarpIterator,
+    typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory {};
+
+// TensorOp - Ampere
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    RegularWarpIterator,
+    Policy> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator<
+          cutlass::MatrixShape<WarpShape::kM, WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajor,
+          cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// TensorOp - Volta
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 16, 4>,
+    RegularWarpIterator,
+    Policy> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>;
+  static constexpr auto kWarpSize = 32;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator<
+          cutlass::MatrixShape<32, 32>, // MatrixShape<WarpShape::kM,
+                                        // WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>,
+          cutlass::MatrixShape<16, 4>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// Simt
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    RegularWarpIterator,
+    Policy> {
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  static constexpr auto kWarpSize = 32;
+
+  // We just use the same iterator, as we reproduced the same shared-memory
+  // schema. Just modify it to handle non-complete tiles.
+  using WarpIterator = RegularWarpIterator;
+};
+
+// Converts a "regular" Mma into their counterpart from shared memory
+template <typename Mma_, typename AccumulatorSharedStorage>
+struct DefaultMmaFromSharedMemory;
+
+// Mma pipelined
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to A operand
+    typename TransformA_,
+    /// Transformation applied to B operand
+    typename TransformB_,
+    typename AccumulatorSharedStorage_>
+struct DefaultMmaFromSharedMemory<
+    MmaPipelined<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        IteratorB_,
+        SmemIteratorB_,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        TransformA_,
+        TransformB_>,
+    AccumulatorSharedStorage_> {
+  static constexpr int kWarpSize = 32;
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+
+  using RegularMma = MmaPipelined<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      IteratorB_,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      TransformA_,
+      TransformB_>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using ArchMmaOperator = typename Policy_::Operator;
+
+  using WarpIteratorA = typename DefaultWarpIteratorAFromSharedMemory<
+      WarpShape,
+      InstructionShape,
+      typename RegularMma::Operator::IteratorA,
+      Policy_>::WarpIterator;
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+
+  using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory<
+      Shape_,
+      WarpIteratorA,
+      AccumulatorSharedStorage_,
+      IteratorB,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_>;
+};
+
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    typename AccumulatorSharedStorage_>
+struct DefaultMmaFromSharedMemory<
+    MmaMultistage<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        CacheOpA,
+        IteratorB_,
+        SmemIteratorB_,
+        CacheOpB,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        Stages,
+        SharedMemoryClear>,
+    AccumulatorSharedStorage_> {
+  static constexpr int kWarpSize = 32;
+
+  using RegularMma = MmaMultistage<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      CacheOpA,
+      IteratorB_,
+      SmemIteratorB_,
+      CacheOpB,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      Stages,
+      SharedMemoryClear>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using WarpIteratorA = typename DefaultWarpIteratorAFromSharedMemory<
+      WarpShape,
+      InstructionShape,
+      typename RegularMma::Operator::IteratorA,
+      Policy_>::WarpIterator;
+
+  static int constexpr kMaxK = AccumulatorSharedStorage_::Shape::kN;
+  // Reduce the number of stages if we don't need that many
+  static int constexpr kStagesMax =
+      (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK);
+  static int constexpr kStages = cutlass::const_min(Stages, kStagesMax);
+
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+  using Mma =
+      typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory<
+          Shape_,
+          WarpIteratorA,
+          AccumulatorSharedStorage_,
+          IteratorB,
+          SmemIteratorB_,
+          RegularMma::kCacheOpB,
+          ElementC_,
+          LayoutC_,
+          Policy_,
+          kStages>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename IteratorC,
+    typename Operator,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm;
+
+// Tensor Cores >= Sm75 specialization (Ampere ...)
+template < /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_,
+    typename Operator,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        Shape_,
+        Element_,
+        Layout_,
+        InstructionShape_,
+        OpDelta_>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          Shape_,
+          Element_,
+          Layout_,
+          InstructionShape_,
+          OpDelta_>;
+  using FragmentC = typename IteratorC::Fragment;
+  using InstructionShape = InstructionShape_;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using accum_t = Element_;
+  using lse_scalar_t = float;
+
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+
+  // Iterator to load accumulators (results of matmul in registers)
+  using FragmentIteratorAccumulator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape,
+          InstructionShape,
+          accum_t,
+          typename Operator::Policy::Operator::FragmentC,
+          cutlass::layout::RowMajor>;
+
+  // Iterator to store to shared-memory
+  using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp<
+      WarpShape,
+      InstructionShape,
+      scalar_t, // accum_t,
+      SmemAccumulatorLayout>;
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          typename SmemIteratorD0::Element,
+          typename SmemIteratorD0::TensorLayout,
+          typename SmemIteratorD0::Padding>;
+  // We need to provide an operation for the epilogue. Let's create an
+  // operation that does nothing (ScaleType::Nothing), just converts
+  // from accum_t (float) -> scalar_t (can be half)
+  using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination<
+      typename SmemIteratorD0::Element, // ElementOutput
+      FragmentIteratorAccumulator::Fragment::kElements,
+      accum_t, // ElementAccumulator
+      typename SmemIteratorD0::Element, // ElementCompute
+      cutlass::epilogue::thread::ScaleType::Nothing>;
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator<
+      SmemIteratorD0,
+      FragmentIteratorAccumulator,
+      SmemIteratorD0, // ScaleBiasIterator - not used
+      OutputOpNoOp>;
+
+  // Epilogue 2: with LSE (for backwards pass)
+  static int const kElementsPerAccess = 2; // TODO: Why 2?
+  using IteratorAccumulatorLSE =
+      cutlass::transform::threadblock::VectorIterator<
+          cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+              // Shape
+              cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kN>,
+              // WarpShape
+              cutlass::MatrixShape<WarpShape::kM, WarpShape::kN>,
+              lse_scalar_t,
+              cutlass::layout::RowMajor,
+              kElementsPerAccess>>;
+  using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp<
+      scalar_t, // ElementOutput_
+      lse_scalar_t, // ElementLSE_
+      accum_t, // ElementAccumulator_
+      accum_t, // ElementCompute_
+      128 / cutlass::sizeof_bits<scalar_t>::value
+      // FragmentIteratorAccumulator::Fragment::kElements
+      // InstructionShape::kM * InstructionShape::kN / 32
+      >;
+  using EpilogueWithLSE =
+      cutlass::epilogue::threadblock::EpilogueSmemAccumulator<
+          SmemIteratorD0,
+          FragmentIteratorAccumulator,
+          IteratorAccumulatorLSE,
+          EpilogueOpApplyLSE>;
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id);
+    smem_iterator_attn.add_tile_offset(
+        tile_coords *
+        cutlass::MatrixCoord{
+            SmemIteratorD0::TileIterations::kRow,
+            SmemIteratorD0::TileIterations::kColumn});
+    Epilogue epilogue;
+    epilogue(OutputOpNoOp({}), smem_iterator_attn, accum);
+  }
+
+  static void CUTLASS_DEVICE accumApplyLSEToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC& accum,
+      lse_scalar_t const* lse,
+      int32_t lse_extents,
+      int thread_id,
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    constexpr int32_t kAlignLSE = 32;
+    IteratorAccumulatorLSE iterator_lse(
+        lse,
+        {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE},
+        thread_id,
+        warp_id,
+        cutlass::MatrixCoord{0, 0} // offset
+    );
+
+    SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id);
+    smem_iterator_attn.add_tile_offset(
+        tile_coords *
+        cutlass::MatrixCoord{
+            SmemIteratorD0::TileIterations::kRow,
+            SmemIteratorD0::TileIterations::kColumn});
+    EpilogueWithLSE epilogue;
+    EpilogueOpApplyLSE minus_lse_exp({});
+    epilogue(
+        minus_lse_exp,
+        smem_iterator_attn,
+        accum,
+        // scale - unused
+        iterator_lse,
+        // bias
+        iterator_lse);
+  }
+};
+
+// Volta Specialization
+// only supported for f16
+template <typename Operator, typename WarpShape_, typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        float,
+        cutlass::layout::RowMajor,
+        cutlass::gemm::GemmShape<16, 16, 4>,
+        cutlass::MatrixShape<1, 1>>,
+    Operator,
+    cutlass::half_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          cutlass::MatrixShape<32, 32>,
+          float,
+          cutlass::layout::RowMajor,
+          cutlass::gemm::GemmShape<16, 16, 4>,
+          cutlass::MatrixShape<1, 1>>;
+  using scalar_t = cutlass::half_t;
+  using accum_t = IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = IteratorC::Fragment;
+  using lse_scalar_t = float;
+
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using SmemIteratorD0 = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
+      WarpShape,
+      cutlass::gemm::GemmShape<32, 32, 4>,
+      scalar_t,
+      SmemAccumulatorLayout>;
+
+  // // Storage in shared-memory for Q.Kt
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+              16,
+              32>, // typename SmemIteratorD0::TensorLayout,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+
+  using OutputLayout =
+      cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>;
+  using TensorRef = cutlass::TensorRef<scalar_t, OutputLayout>;
+  using Policy = typename IteratorC::Policy;
+  using Element = accum_t;
+  // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields
+  // Let's copy their values
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // ctor - from MmaVoltaTensorOpAccumulatorTileIterator
+    TensorRef ref_(shared_storage.accum_ref());
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n;
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    cutlass::MatrixCoord lane_offset(accum_m, accum_n);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    using AccessType = cutlass::Array<scalar_t, EleShapePerPatial::kColumn>;
+
+    // store - from MmaVoltaTensorOpAccumulatorTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+            int mma_accum_start =
+                (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                      Policy::MmaIterations::kColumn +
+                  mma_n) *
+                     Policy::MmaIterations::kRow +
+                 mma_m) *
+                kElementsPerMma;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int p = 0; p < kAccumulatorPatials; ++p) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+                int accum_m = tile_m * Policy::InterleavedTile::kRow +
+                    mma_m * QuadShapePerPatialMma::kRow + m * 2;
+                int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                    mma_n * QuadShapePerPatialMma::kColumn +
+                    p * Policy::InterleavedTile::kColumn / 2;
+                int r = (accum_m + lane_offset.row());
+                AccessType to_store;
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  int c = (accum_n + n + lane_offset.column());
+                  to_store[n] = scalar_t(accum[idx]);
+                }
+                int c = (accum_n + lane_offset.column());
+                assert(r < 32);
+                assert(c < 32);
+                *reinterpret_cast<AccessType*>(
+                    ref_.data() + ref_.offset({r, c})) = to_store;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  static void CUTLASS_DEVICE accumApplyLSEToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      typename IteratorC::Fragment& accum,
+      lse_scalar_t const* lse,
+      int lse_extent,
+      int thread_id,
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // Non-optimized way to apply LSE to registers
+    // NOTE: accum is attn.T
+    // TODO: Optimize for each architecture
+    static constexpr int WarpSize = 32;
+    using RegistersIter = typename DefaultAttentionScalingCoefsUpdater<
+        IteratorC,
+        accum_t,
+        WarpSize>::Updater;
+    auto lane_offset =
+        RegistersIter::get_lane_offset(lane_id, warp_id, tile_coords);
+
+    cutlass::Array<lse_scalar_t, IteratorC::Fragment::kElements> lse_prefetched;
+    lse_prefetched.clear();
+    int rowIdx = 0;
+    int colIdx = 0;
+    RegistersIter::iterateRows(
+        lane_offset,
+        [&](int accum_m) {
+          ++rowIdx;
+          colIdx = 0;
+        },
+        [&](int accum_m, int accum_n, int idx) {
+          if (rowIdx == 1) {
+            lse_prefetched[colIdx] = accum_n < lse_extent
+                ? lse[accum_n]
+                : platform::numeric_limits<accum_t>::infinity();
+          }
+          accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]);
+          ++colIdx;
+        },
+        [&](int accum_m) {});
+    accumToSmem(shared_storage, accum, lane_id, tile_coords);
+  }
+};
+
+// Simt Specialization
+// for f32 on Sm70-Sm75 and f16/f32 below
+
+template <
+    typename Operator,
+    typename OperatorPolicy,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        cutlass::gemm::Operand::kC,
+        float,
+        cutlass::layout::RowMajor,
+        OperatorPolicy,
+        1,
+        1>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator<
+      cutlass::MatrixShape<32, 32>,
+      cutlass::gemm::Operand::kC,
+      float,
+      cutlass::layout::RowMajor,
+      OperatorPolicy,
+      1,
+      1>;
+  using accum_t = typename IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = typename IteratorC::Fragment;
+  using lse_scalar_t = float;
+
+  // Storage in shared-memory for Q.Kt
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          cutlass::layout::ColumnMajor,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    using Policy = typename IteratorC::Policy;
+    using Element = typename IteratorC::Element;
+    using Iterations = typename IteratorC::Iterations;
+    using Delta = typename IteratorC::Delta;
+
+    auto ref_ = shared_storage.accum_ref();
+    // ctor - MmaSimtTileIterator
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
+
+    ref_.add_coord_offset(lane_offset);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    // store - MmaSimtTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+            int r =
+                Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) +
+                m;
+            int c = mma_n * Delta::kColumn + n;
+            int idx = n +
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            ref_.at({r, c}) = scalar_t(accum[idx]);
+          }
+        }
+      }
+    }
+  }
+
+  static void CUTLASS_DEVICE accumApplyLSEToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      typename IteratorC::Fragment& accum,
+      lse_scalar_t const* lse,
+      int lse_extent,
+      int thread_id,
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // Non-optimized way to apply LSE to registers
+    // NOTE: accum is attn.T
+    // TODO: Optimize for each architecture
+    static constexpr int WarpSize = 32;
+    using RegistersIter = typename DefaultAttentionScalingCoefsUpdater<
+        IteratorC,
+        accum_t,
+        WarpSize>::Updater;
+    auto lane_offset =
+        RegistersIter::get_lane_offset(lane_id, warp_id, tile_coords);
+
+    cutlass::Array<lse_scalar_t, IteratorC::Fragment::kElements> lse_prefetched;
+    lse_prefetched.clear();
+    int rowIdx = 0;
+    int colIdx = 0;
+    RegistersIter::iterateRows(
+        lane_offset,
+        [&](int accum_m) {
+          ++rowIdx;
+          colIdx = 0;
+        },
+        [&](int accum_m, int accum_n, int idx) {
+          if (rowIdx == 1) {
+            lse_prefetched[colIdx] = accum_n < lse_extent
+                ? lse[accum_n]
+                : platform::numeric_limits<accum_t>::infinity();
+          }
+          accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]);
+          ++colIdx;
+        },
+        [&](int accum_m) {});
+    accumToSmem(shared_storage, accum, lane_id, tile_coords);
+  }
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////

From 714204654c89b7a45e4595445b5c239ced7a1caa Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Thu, 20 Apr 2023 13:00:05 -0700
Subject: [PATCH 430/638] benchmark: concat (#592)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/592

Reviewed By: aakhundov

Differential Revision: D45067694

fbshipit-source-id: 3020ff4e48ff450059f6c317fb08e2a57501a0af
---
 python/aitemplate/backend/codegen.py        | 6 ++++++
 python/aitemplate/backend/main_templates.py | 5 ++++-
 python/aitemplate/utils/tensor_utils.py     | 4 ++--
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 18980c4b3..6febf6cea 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -339,6 +339,7 @@ def __init__(
         self.func_seq = []
         self._input_shape_seq = []
         self._output_shape_seq = []
+        self.func_prop_seq = []
         self.tensor_decl = []
         self.dim_decl = []
         self.jagged_decl = []
@@ -734,6 +735,10 @@ def _process_src_ops(self, node: Tensor) -> None:
                 input_shape, output_shape = extract_input_output_shapes(func._attrs)
                 self._input_shape_seq.append(input_shape)
                 self._output_shape_seq.append(output_shape)
+                props = {}
+                if "concat_dim" in func._attrs:
+                    props["dim"] = func._attrs["concat_dim"]
+                self.func_prop_seq.append(props)
 
             if "int_state_flag" in func._attrs:
                 if func._attrs["name"] not in self.state_record:
@@ -835,6 +840,7 @@ def generate_model(self) -> str:
             self.func_seq,
             self._input_shape_seq,
             self._output_shape_seq,
+            self.func_prop_seq,
         )
         return MODEL_TEMPLATE.render(
             model_name=self.model_name,
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 3d4e48eb0..1f71f6e19 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -117,7 +117,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       DeviceMalloc((void**) &L2CacheSlab, L2SizeInBytes);
 
       ss << "{\\n";
-      {% for func_name, func, input_sizes, output_sizes in per_op_profiler_seq %}
+      {% for func_name, func, input_sizes, output_sizes, func_properties in per_op_profiler_seq %}
       {
         std::cout << "Profiling: " << "{{ func_name }}" << " (" << iters << " iterations)" << std::endl;
         std::vector<std::pair<EventType, EventType>> call_events(iters);
@@ -146,6 +146,9 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
            << ", \\"qps\\": " << 1000 * iters / milliseconds
            << ", \\"input_sizes\\": " << "{{ input_sizes | replace("'", '\\\\"') }}"
            << ", \\"output_sizes\\": " << "{{ output_sizes | replace("'", '\\\\"') }}"
+        {% for prop_name, prop_value in func_properties.items() %}
+          << ", \\"{{ prop_name }}\\": " << "{{ prop_value }}"
+        {% endfor %}
            << " } ";
         {% if loop.last %}
           ss << "\\n";
diff --git a/python/aitemplate/utils/tensor_utils.py b/python/aitemplate/utils/tensor_utils.py
index 66996010f..2042a44ec 100644
--- a/python/aitemplate/utils/tensor_utils.py
+++ b/python/aitemplate/utils/tensor_utils.py
@@ -21,8 +21,8 @@ def wrap_dim(idx, rank):
     """
     Wrap tensor index, idx, if it's negative.
     """
-    assert isinstance(idx, int)
+    assert isinstance(idx, int), "idx must be int, but got {}".format(type(idx))
     if idx < 0:
         idx = idx + rank
-    assert idx < rank
+    assert idx < rank, "idx {} out of range; rank {}".format(idx, rank)
     return idx

From aa84406e5e0ef37dc75ad5d0cd8d348c75d34b77 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@fb.com>
Date: Thu, 20 Apr 2023 23:58:37 -0700
Subject: [PATCH 431/638] Ldm update (#573)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/573

Reviewed By: henryhu6

Differential Revision: D44948785

Pulled By: terrychenism

fbshipit-source-id: f760a883d5da01534156e3ef8093c2ecbedbc4bf
---
 .../05_stable_diffusion/scripts/compile.py    |   1 +
 examples/05_stable_diffusion/scripts/demo.py  |   9 +-
 .../scripts/demo_img2img.py                   |   1 -
 .../scripts/download_pipeline.py              |   4 +-
 examples/05_stable_diffusion/src/benchmark.py |  25 ++---
 .../05_stable_diffusion/src/benchmark_pt.py   |  14 +--
 .../src/compile_lib/compile_clip.py           |  44 ++------
 .../src/compile_lib/compile_unet.py           |  24 +++--
 .../src/compile_lib/compile_vae.py            |  53 +++++----
 .../src/compile_lib/util.py                   |   2 +-
 .../src/modeling/attention.py                 |  33 ++----
 .../05_stable_diffusion/src/modeling/clip.py  | 102 +++++++++---------
 .../src/modeling/embeddings.py                |   2 +-
 .../src/modeling/resnet.py                    |   5 +-
 .../src/modeling/unet_2d_condition.py         |  19 +++-
 .../src/modeling/unet_blocks.py               |   1 -
 .../05_stable_diffusion/src/modeling/vae.py   |   1 -
 .../src/pipeline_stable_diffusion_ait.py      |  12 ++-
 .../backend/common/concatenate_common.py      |   2 +
 .../common/tensor/batch_gather_common.py      |   8 +-
 .../backend/common/upsampling2d_common.py     |   2 +
 .../cuda/groupnorm/groupnorm_common.py        |  22 ++--
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  64 +++++------
 python/aitemplate/compiler/compiler.py        |   5 +
 .../ops/attention/mem_eff_attention.py        |   7 +-
 .../compiler/ops/common/view_ops.py           |   5 +-
 .../ops/conv/common_conv2d_bias_activation.py |   1 +
 python/aitemplate/compiler/ops/conv/conv2d.py |  74 ++++++++-----
 .../compiler/ops/groupnorm/groupnorm.py       |  61 ++++++++++-
 .../compiler/ops/tensor/batch_gather.py       |   8 +-
 .../compiler/ops/tensor/concatenate.py        |   3 +-
 .../ops/upsample/upsampling_common.py         |  11 +-
 .../compiler/transform/name_graph.py          |  62 ++++++++---
 python/aitemplate/frontend/nn/attention.py    |  17 ++-
 .../frontend/nn/conv2d/conv2d_bias.py         |   2 +-
 tests/unittest/ops/test_batch_gather.py       |  41 +++----
 tests/unittest/ops/test_concatenate.py        |   8 +-
 tests/unittest/ops/test_nms.py                |  46 ++++----
 38 files changed, 467 insertions(+), 334 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 896b2432c..65032e34f 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -65,6 +65,7 @@ def compile_diffusers(
     compile_clip(
         pipe.text_encoder,
         batch_size=batch_size,
+        seqlen=77,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
         depth=pipe.text_encoder.config.num_hidden_layers,
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index d4f5dbb99..9ae7db46a 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -34,11 +34,13 @@
 )
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
+@click.option("--batch", default=1, help="Batch size of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, prompt, benchmark):
+def run(local_dir, width, height, batch, prompt, negative_prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         local_dir,
         scheduler=EulerDiscreteScheduler.from_pretrained(
@@ -48,11 +50,14 @@ def run(local_dir, width, height, prompt, benchmark):
         torch_dtype=torch.float16,
     ).to("cuda")
 
+    prompt = [prompt] * batch
     with torch.autocast("cuda"):
         image = pipe(prompt, height, width).images[0]
         if benchmark:
             t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
-            print(f"sd e2e: {t} ms")
+            print(
+                f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
+            )
 
     image.save("example_ait.png")
 
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index e4d96d865..31e1c33df 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -43,7 +43,6 @@
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
 def run(local_dir, width, height, prompt, benchmark):
-
     # load the pipeline
     device = "cuda"
     pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 1128769da..b072e694f 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -21,12 +21,12 @@
 @click.option("--token", default="", help="access token")
 @click.option(
     "--save_directory",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
     help="pipeline files local directory",
 )
 def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2-1-base",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index 5cac6a465..f0f595122 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -55,7 +55,6 @@ def benchmark_unet(
     benchmark_pt=False,
     verify=False,
 ):
-
     exe_module = Model("./tmp/UNet2DConditionModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for UNet2DConditionModel.")
@@ -65,7 +64,7 @@ def benchmark_unet(
     pt_mod = pt_mod.eval()
 
     latent_model_input_pt = torch.randn(batch_size, 4, height, width).cuda().half()
-    text_embeddings_pt = torch.randn(batch_size, 64, hidden_dim).cuda().half()
+    text_embeddings_pt = torch.randn(batch_size, 77, hidden_dim).cuda().half()
     timesteps_pt = torch.Tensor([1, 1]).cuda().half()
 
     with autocast("cuda"):
@@ -83,8 +82,6 @@ def benchmark_unet(
             with open("sd_pt_benchmark.txt", "a") as f:
                 f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n")
 
-    print("pt output:", pt_ys.shape)
-
     # run AIT unet model
     inputs = {
         "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(),
@@ -96,6 +93,8 @@ def benchmark_unet(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
+        shape[1] = height
+        shape[2] = width
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -124,13 +123,11 @@ def benchmark_unet(
 def benchmark_clip(
     pt_mod,
     batch_size=1,
-    seqlen=64,
+    seqlen=77,
     tokenizer=None,
     benchmark_pt=False,
     verify=False,
 ):
-    mask_seq = 0
-
     exe_module = Model("./tmp/CLIPTextModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for CLIPTextModel.")
@@ -142,7 +139,7 @@ def benchmark_clip(
     if tokenizer is None:
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
     text_input = tokenizer(
-        ["a photo of an astronaut riding a horse on mars"],
+        ["a photo of an astronaut riding a horse on mars"] * batch_size,
         padding="max_length",
         max_length=seqlen,
         truncation=True,
@@ -150,8 +147,6 @@ def benchmark_clip(
     )
     input_ids = text_input["input_ids"].cuda()
 
-    attention_mask = torch.ones((batch_size, seqlen))
-    attention_mask[-1, -mask_seq:] = 0
     attention_mask = None
 
     position_ids = torch.arange(seqlen).expand((batch_size, -1)).cuda()
@@ -175,6 +170,7 @@ def benchmark_clip(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
+        shape[0] = batch_size
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -202,7 +198,6 @@ def benchmark_clip(
 def benchmark_vae(
     pt_vae, batch_size=1, height=64, width=64, benchmark_pt=False, verify=False
 ):
-
     latent_channels = 4
 
     exe_module = Model("./tmp/AutoencoderKL/test.so")
@@ -239,9 +234,8 @@ def benchmark_vae(
         .cuda()
         .half()
     )
+
     ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous()
-    print("input pt tensor size: ", ait_input_pt_tensor.shape)
-    print("output pt tensor size: ", y.shape)
     exe_module.run_with_tensors([ait_input_pt_tensor], [y])
 
     # verification
@@ -305,7 +299,10 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     )
     # VAE
     benchmark_vae(
-        pipe.vae, batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify
+        pipe.vae,
+        batch_size=batch_size,
+        benchmark_pt=benchmark_pt,
+        verify=verify,
     )
 
 
diff --git a/examples/05_stable_diffusion/src/benchmark_pt.py b/examples/05_stable_diffusion/src/benchmark_pt.py
index 95bfb725f..c12877897 100644
--- a/examples/05_stable_diffusion/src/benchmark_pt.py
+++ b/examples/05_stable_diffusion/src/benchmark_pt.py
@@ -26,22 +26,24 @@
     default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="the local diffusers pipeline directory",
 )
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, prompt, benchmark):
+def run(local_dir, width, height, prompt, negative_prompt, benchmark):
     pipe = StableDiffusionPipeline.from_pretrained(
         local_dir,
         revision="fp16",
         torch_dtype=torch.float16,
     ).to("cuda")
 
-    with torch.autocast("cuda"):
-        image = pipe(prompt).images[0]
-        if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt)
-            print(f"sd pt e2e: {t} ms")
+    image = pipe(prompt, height, width, negative_prompt=negative_prompt).images[0]
+    if benchmark:
+        t = benchmark_torch_function(10, pipe, prompt)
+        print(f"sd pt e2e: {t} ms")
 
     image.save("example_pt.png")
 
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index 9f68e827a..a85aee84f 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -12,10 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import numpy as np
-import torch
+
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
@@ -32,40 +31,14 @@ def map_clip_params(pt_mod, batch_size, seqlen, depth):
             ait_name = ait_name.replace("out_proj", "proj")
         elif name.endswith("out_proj.bias"):
             ait_name = ait_name.replace("out_proj", "proj")
-        elif name.endswith("q_proj.weight"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.weight")]
-            q = pt_params[prefix + "q_proj.weight"]
-            k = pt_params[prefix + "k_proj.weight"]
-            v = pt_params[prefix + "v_proj.weight"]
-            qkv_weight = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_weight
-            continue
-        elif name.endswith("q_proj.bias"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.bias")]
-            q = pt_params[prefix + "q_proj.bias"]
-            k = pt_params[prefix + "k_proj.bias"]
-            v = pt_params[prefix + "v_proj.bias"]
-            qkv_bias = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_bias
-            continue
-        elif name.endswith("k_proj.weight"):
-            continue
-        elif name.endswith("k_proj.bias"):
-            continue
-        elif name.endswith("v_proj.weight"):
-            continue
-        elif name.endswith("v_proj.bias"):
-            continue
+        elif "q_proj" in name:
+            ait_name = ait_name.replace("q_proj", "proj_q")
+        elif "k_proj" in name:
+            ait_name = ait_name.replace("k_proj", "proj_k")
+        elif "v_proj" in name:
+            ait_name = ait_name.replace("v_proj", "proj_v")
         params_ait[ait_name] = arr
 
-    if detect_target().name() == "cuda":
-        for i in range(depth):
-            prefix = f"encoder_layers_{i}_self_attn_cu_length"
-            cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
-            params_ait[prefix] = torch.from_numpy(cu_len).cuda()
-
     return params_ait
 
 
@@ -97,6 +70,7 @@ def compile_clip(
 
     pt_mod = pt_mod.eval()
     params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
+    batch_size = IntVar(values=[1, 8], name="batch_size")
 
     input_ids_ait = Tensor(
         [batch_size, seqlen], name="input0", dtype="int64", is_input=True
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index 3c2f59603..c4233c1e4 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -15,7 +15,7 @@
 import torch
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.unet_2d_condition import (
@@ -58,9 +58,9 @@ def compile_unet(
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
+    model_name="UNet2DConditionModel",
     use_linear_projection=False,
 ):
-
     ait_mod = ait_UNet2DConditionModel(
         sample_size=64,
         cross_attention_dim=hidden_dim,
@@ -72,19 +72,31 @@ def compile_unet(
     # set AIT parameters
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
+    # batch_size = IntVar(values=[1, 8], name="batch_size")
+    height_d = IntVar(values=[32, 64], name="height")
+    width_d = IntVar(values=[32, 64], name="width")
 
     latent_model_input_ait = Tensor(
-        [batch_size, height, width, 4], name="input0", is_input=True
+        [batch_size, height_d, width_d, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
     text_embeddings_pt_ait = Tensor(
-        [batch_size, 64, hidden_dim], name="input2", is_input=True
+        [batch_size, 77, hidden_dim], name="input2", is_input=True
     )
 
-    Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
+    mid_block_additional_residual = None
+    down_block_additional_residuals = None
+
+    Y = ait_mod(
+        latent_model_input_ait,
+        timesteps_ait,
+        text_embeddings_pt_ait,
+        down_block_additional_residuals,
+        mid_block_additional_residual,
+    )
     mark_output(Y)
 
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
+    compile_model(Y, target, "./tmp", model_name, constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index 7352740d0..e9c2d4964 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -12,11 +12,10 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import numpy as np
 
 import torch
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
@@ -40,20 +39,6 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
                 ).contiguous()
             else:
                 mapped_pt_params[ait_name] = pt_params[name]
-        elif name.endswith("attention.qkv.weight"):
-            prefix = name[: -len("attention.qkv.weight")]
-            q_weight = pt_params[prefix + "query.weight"]
-            k_weight = pt_params[prefix + "key.weight"]
-            v_weight = pt_params[prefix + "value.weight"]
-            qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
-            mapped_pt_params[ait_name] = qkv_weight
-        elif name.endswith("attention.qkv.bias"):
-            prefix = name[: -len("attention.qkv.bias")]
-            q_bias = pt_params[prefix + "query.bias"]
-            k_bias = pt_params[prefix + "key.bias"]
-            v_bias = pt_params[prefix + "value.bias"]
-            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
-            mapped_pt_params[ait_name] = qkv_bias
         elif name.endswith("attention.proj.weight"):
             prefix = name[: -len("attention.proj.weight")]
             pt_name = prefix + "proj_attn.weight"
@@ -63,8 +48,31 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
             pt_name = prefix + "proj_attn.bias"
             mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.cu_length"):
-            cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32")
-            mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda()
+            ...
+        elif name.endswith("attention.proj_q.weight"):
+            prefix = name[: -len("attention.proj_q.weight")]
+            pt_name = prefix + "query.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_q.bias"):
+            prefix = name[: -len("attention.proj_q.bias")]
+            pt_name = prefix + "query.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.weight"):
+            prefix = name[: -len("attention.proj_k.weight")]
+            pt_name = prefix + "key.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.bias"):
+            prefix = name[: -len("attention.proj_k.bias")]
+            pt_name = prefix + "key.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.weight"):
+            prefix = name[: -len("attention.proj_v.weight")]
+            pt_name = prefix + "value.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.bias"):
+            prefix = name[: -len("attention.proj_v.bias")]
+            pt_name = prefix + "value.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
         else:
             pt_param = pt_module.get_parameter(name)
             mapped_pt_params[ait_name] = pt_param
@@ -79,6 +87,7 @@ def compile_vae(
     width=64,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
+    name="AutoencoderKL",
 ):
     in_channels = 3
     out_channels = 3
@@ -114,8 +123,12 @@ def compile_vae(
         latent_channels=latent_channels,
         sample_size=sample_size,
     )
+    # batch_size = IntVar(values=[1, 8], name="batch_size")
+    height_d = IntVar(values=[32, 64], name="height")
+    width_d = IntVar(values=[32, 64], name="width")
+
     ait_input = Tensor(
-        shape=[batch_size, height, width, latent_channels],
+        shape=[batch_size, height_d, width_d, latent_channels],
         name="vae_input",
         is_input=True,
     )
@@ -133,6 +146,6 @@ def compile_vae(
         Y,
         target,
         "./tmp",
-        "AutoencoderKL",
+        name,
         constants=params_ait,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 000e862e9..90cc1bc32 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -18,5 +18,5 @@ def mark_output(y):
     for i in range(len(y)):
         y[i]._attrs["is_output"] = True
         y[i]._attrs["name"] = "output_%d" % (i)
-        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        y_shape = [d._attrs["values"] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
diff --git a/examples/05_stable_diffusion/src/modeling/attention.py b/examples/05_stable_diffusion/src/modeling/attention.py
index 14993e6d9..06ab5f1bd 100644
--- a/examples/05_stable_diffusion/src/modeling/attention.py
+++ b/examples/05_stable_diffusion/src/modeling/attention.py
@@ -20,7 +20,6 @@
 from typing import Optional
 
 from aitemplate.compiler.ops import reshape
-
 from aitemplate.frontend import nn, Tensor
 
 
@@ -54,22 +53,18 @@ def __init__(
     ):
         super().__init__()
         self.batch_size = batch_size
-        self.height = height
-        self.width = width
         self.channels = channels
         self.num_heads = (
             channels // num_head_channels if num_head_channels is not None else 1
         )
         self.num_head_size = num_head_channels
         self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.MultiheadAttention(
+        self.attention = nn.CrossAttention(
             channels,
-            batch_size,
+            height * width,
             height * width,
             self.num_heads,
             qkv_bias=True,
-            has_residual=True,
-            use_mem_eff=True,
         )
         self.rescale_output_factor = rescale_output_factor
 
@@ -78,28 +73,22 @@ def forward(self, hidden_states) -> Tensor:
         input hidden_states shape: [batch, height, width, channel]
         output shape: [batch, height, width, channel]
         """
+
         residual = hidden_states
 
         # norm
         hidden_states = self.group_norm(hidden_states)
+        o_shape = hidden_states.shape()
+        batch_dim = o_shape[0]
 
         hidden_states = reshape()(
-            hidden_states, [self.batch_size, self.height * self.width, self.channels]
+            hidden_states,
+            [batch_dim, -1, self.channels],
         )
 
-        batch, hw, channel = hidden_states.shape()
-        if (
-            batch.value() != self.batch_size
-            or hw.value() != self.width * self.height
-            or channel.value() != self.channels
-        ):
-            raise RuntimeError(
-                "nchw params do not match! "
-                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
-                f"actual: {batch}, {channel}, {hw}."
-            )
-
-        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
-        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
+        res = self.attention(hidden_states, hidden_states, hidden_states, residual) * (
+            1 / self.rescale_output_factor
+        )
 
+        res = reshape()(res, o_shape)
         return res
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 30afcd051..ff0ce792a 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -57,9 +57,9 @@ def __init__(
         self.heads = heads
         self.dim_head = dim_head
 
-        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
-        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_out = nn.Sequential(
             nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
         )
@@ -68,36 +68,26 @@ def forward(self, x, context=None, mask=None, residual=None):
         nheads = self.heads
         d = self.dim_head
 
-        layout = "20314" if USE_CUDA else "m2n3"
-
-        bs, seqlen, _ = get_shape(x)
-        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
-        )
+        q = self.to_q(x)
         context = default(context, x)
-
-        seqlen = get_shape(context)[1]
-        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        bs = q.shape()[0]
+
+        q = ops.reshape()(q, [bs, -1, self.heads, self.dim_head])
+        k = ops.reshape()(k, [bs, -1, self.heads, self.dim_head])
+        v = ops.reshape()(v, [bs, -1, self.heads, self.dim_head])
+        q = ops.permute()(q, [0, 2, 1, 3])
+        k = ops.permute()(k, [0, 2, 1, 3])
+        v = ops.permute()(v, [0, 2, 1, 3])
+
+        attn_op = ops.mem_eff_attention(causal=False)
+        out = attn_op(
+            (ops.reshape()(q, [bs, nheads, -1, d])),
+            (ops.reshape()(k, [bs, nheads, -1, d])),
+            (ops.reshape()(v, [bs, nheads, -1, d])),
         )
-        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
-        )
-
-        if USE_CUDA:
-            attn_op = ops.mem_eff_attention(causal=False)
-            out = attn_op(
-                (ops.reshape()(q, [bs, nheads, -1, d])),
-                (ops.reshape()(k, [bs, nheads, -1, d])),
-                (ops.reshape()(v, [bs, nheads, -1, d])),
-            )
-        else:
-            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
-            out = OP(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
-                (ops.reshape()(v, [bs * nheads, -1, d])),
-            )
         out = ops.reshape()(out, [bs, -1, nheads * d])
         proj = self.to_out(out)
         proj = ops.reshape()(proj, [bs, -1, nheads * d])
@@ -235,7 +225,7 @@ def __init__(
 
     def forward(self, x, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = get_shape(x)
+        b, h, w, c = x.shape()
         x_in = x
         x = self.norm(x)
         if self.use_linear_projection:
@@ -336,7 +326,7 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        shape = get_shape(x)
+        shape = x.shape()
         x = self.fc1(x)
         x = self.fc2(x, res)
         return ops.reshape()(x, shape)
@@ -364,11 +354,11 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        shape = get_shape(x)
+        # shape = get_shape(x)
         x = self.fc1(x)
         x = self.activation_fn(x)
         x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
+        return ops.reshape()(x, x.shape())
 
 
 class CLIPEncoderLayer(nn.Module):
@@ -391,19 +381,15 @@ def __init__(
     ):
         super().__init__()
         self.embed_dim = hidden_size
-        self.self_attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
+        self.self_attn = nn.CrossAttention(
+            hidden_size,
+            seq_len,
+            seq_len,
+            num_attention_heads,
             qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=0,
-            has_residual=True,
             causal=causal,
-            mask_seq=mask_seq,
-            use_mem_eff=True,
         )
+
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
         self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
             hidden_size, int(hidden_size * mlp_ratio)
@@ -428,7 +414,9 @@ def forward(
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states, residual)
+        hidden_states = self.self_attn(
+            hidden_states, hidden_states, hidden_states, residual
+        )
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -547,6 +535,9 @@ def __init__(
     ):
         super().__init__()
         embed_dim = hidden_size
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_dim = hidden_size
+        self.vocab_size = vocab_size
 
         self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
         self.position_embedding = nn.Embedding(
@@ -559,20 +550,25 @@ def forward(
         position_ids: Tensor,
         inputs_embeds: Optional[Tensor] = None,
     ) -> Tensor:
-
         input_shape = ops.size()(input_ids)
 
         # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-
-        position_ids = ops.reshape()(position_ids, [-1])
+        token_embedding = self.token_embedding.tensor()
+        token_embedding = ops.reshape()(
+            token_embedding, [1, self.vocab_size, self.embed_dim]
+        )
+        token_embedding = ops.expand()(token_embedding, [input_shape[0], -1, -1])
 
         if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
+            inputs_embeds = ops.batch_gather()(token_embedding, input_ids)
 
-        position_embeddings = ops.batch_gather()(
-            self.position_embedding.tensor(), position_ids
+        position_embedding = self.position_embedding.tensor()
+        position_embedding = ops.reshape()(
+            position_embedding, [1, self.max_position_embeddings, self.embed_dim]
         )
+        position_embedding = ops.expand()(position_embedding, [input_shape[0], -1, -1])
+
+        position_embeddings = ops.batch_gather()(position_embedding, position_ids)
 
         embeddings = inputs_embeds + position_embeddings
 
diff --git a/examples/05_stable_diffusion/src/modeling/embeddings.py b/examples/05_stable_diffusion/src/modeling/embeddings.py
index 36b96a4fb..cab7c033f 100644
--- a/examples/05_stable_diffusion/src/modeling/embeddings.py
+++ b/examples/05_stable_diffusion/src/modeling/embeddings.py
@@ -39,7 +39,7 @@ def get_timestep_embedding(
     :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
     embeddings. :return: an [N x dim] Tensor of positional embeddings.
     """
-    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+    assert timesteps._rank() == 1, "Timesteps should be a 1d-array"
 
     half_dim = embedding_dim // 2
 
diff --git a/examples/05_stable_diffusion/src/modeling/resnet.py b/examples/05_stable_diffusion/src/modeling/resnet.py
index 03e4f8023..c15bf26d2 100644
--- a/examples/05_stable_diffusion/src/modeling/resnet.py
+++ b/examples/05_stable_diffusion/src/modeling/resnet.py
@@ -58,7 +58,6 @@ def __init__(
             self.Conv2d_0 = conv
 
     def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
         if self.use_conv_transpose:
             return self.conv(x)
 
@@ -112,9 +111,7 @@ def __init__(
             self.conv = conv
 
     def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
         x = self.conv(x)
-
         return x
 
 
@@ -219,7 +216,7 @@ def forward(self, x, temb=None):
 
         if temb is not None:
             temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = get_shape(temb)
+            bs, dim = temb.shape()
             temb = ops.reshape()(temb, [bs, 1, 1, dim])
             hidden_states = hidden_states + temb
 
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
index eb28a076a..2ad4d9718 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -14,7 +14,7 @@
 #
 from typing import Optional, Tuple, Union
 
-from aitemplate.frontend import nn
+from aitemplate.frontend import nn, Tensor
 
 from .embeddings import TimestepEmbedding, Timesteps
 from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
@@ -185,6 +185,8 @@ def forward(
         sample,
         timesteps,
         encoder_hidden_states,
+        down_block_additional_residuals: Optional[Tuple[Tensor]] = None,
+        mid_block_additional_residual: Optional[Tensor] = None,
         return_dict: bool = True,
     ):
         """r
@@ -224,12 +226,27 @@ def forward(
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
 
             down_block_res_samples += res_samples
+            # return sample
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
 
         # 4. mid
         sample = self.mid_block(
             sample, emb, encoder_hidden_states=encoder_hidden_states
         )
 
+        if mid_block_additional_residual is not None:
+            sample += mid_block_additional_residual
+
         # 5. up
         for upsample_block in self.up_blocks:
             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
index 9eaa6e0b1..897025660 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_blocks.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -731,7 +731,6 @@ def __init__(
             )
         ]
         attentions = []
-
         for _ in range(num_layers):
             attentions.append(
                 AttentionBlock(
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
index 924c7257a..f2bea6a43 100644
--- a/examples/05_stable_diffusion/src/modeling/vae.py
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -144,7 +144,6 @@ def __init__(
         )
 
     def decode(self, z: Tensor, return_dict: bool = True):
-
         z = self.post_quant_conv(z)
         dec = self.decoder(z)
         return dec
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 7dace1275..a89f43109 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -108,6 +108,7 @@ def __init__(
         self.vae_ait_exe = self.init_ait_module(
             model_name="AutoencoderKL", workdir=workdir
         )
+        self.batch = 1
 
     def init_ait_module(
         self,
@@ -132,12 +133,13 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         noise_pred = ys[0].permute((0, 3, 1, 2)).float()
         return noise_pred
 
-    def clip_inference(self, input_ids, seqlen=64):
+    def clip_inference(self, input_ids, seqlen=77):
         exe_module = self.clip_ait_exe
         bs = input_ids.shape[0]
         position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
@@ -149,6 +151,7 @@ def clip_inference(self, input_ids, seqlen=64):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         return ys[0].float()
@@ -160,6 +163,7 @@ def vae_inference(self, vae_input):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         vae_out = ys[0].permute((0, 3, 1, 2)).float()
@@ -196,7 +200,7 @@ def __call__(
                 expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                `guidance_scale` is defined  as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
@@ -254,11 +258,13 @@ def __call__(
                 f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
             )
 
+        self.batch = batch_size
+
         # get prompt text embeddings
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
-            max_length=64,  # self.tokenizer.model_max_length,
+            max_length=self.tokenizer.model_max_length,
             truncation=True,
             return_tensors="pt",
         )
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index d6f3013ce..025688309 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -445,9 +445,11 @@
     throw std::runtime_error("the number of inputs must >= 1!");
   }
 
+
   for ({{index_type}} i = 0; i < rank; i++) {
     if (i == concat_dim) continue;
     {{index_type}} dim = real_input_shapes[0][i];
+
     for ({{index_type}} j = 1; j < num_real_inputs; j++) {
       if (real_input_shapes[j][i] != dim) {
         throw std::runtime_error(
diff --git a/python/aitemplate/backend/common/tensor/batch_gather_common.py b/python/aitemplate/backend/common/tensor/batch_gather_common.py
index 97e8aee77..2e05c0997 100644
--- a/python/aitemplate/backend/common/tensor/batch_gather_common.py
+++ b/python/aitemplate/backend/common/tensor/batch_gather_common.py
@@ -36,7 +36,8 @@
 
 {{func_signature}}
 {
-    batch_gather_launcher<{{dtype}}, int64_t>(stream, batch_num, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
+    const int64_t gather_size = *batch_size * batch_num;
+    batch_gather_launcher<{{dtype}}, int64_t>(stream, gather_size, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
 }
     """
 )
@@ -46,6 +47,7 @@
 void {{func_name}}(void* output,
                    const void* input,
                    const int64_t* indices,
+                   const {{index_type}}* batch_size,
                    const {{index_type}} batch_num,
                    const {{index_type}} indices_num,
                    const {{index_type}} instance_size,
@@ -65,6 +67,7 @@
     """
 {{indent}}{{func_name}}(
 {{indent}}   {{output}}, {{input}}, {{indices}},
+{{indent}}    {{batch_size}},
 {{indent}}    {{batch_num}},
 {{indent}}    {{indices_num}},
 {{indent}}    {{instance_size}},
@@ -168,7 +171,7 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
 
     axis = len(ind_shape) - 1
     batch_num = 1
-    for i in range(axis):
+    for i in range(1, axis):
         batch_num *= yshape[i]._attrs["values"][0]
 
     indices_num = yshape[axis]._attrs["values"][0]
@@ -184,6 +187,7 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
         output=output_name,
         input=input_name,
         indices=indices_name,
+        batch_size="&" + xshape[0]._attrs["name"],
         batch_num=batch_num,
         indices_num=indices_num,
         instance_size=instance_size,
diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py
index 8e8310229..c1b94a217 100644
--- a/python/aitemplate/backend/common/upsampling2d_common.py
+++ b/python/aitemplate/backend/common/upsampling2d_common.py
@@ -313,7 +313,9 @@
     {{index_type}}* out_w,
     {{prefix}}Stream_t stream
 ) {
+
   {{shape_function}}
+
   {{exec_paths}}
   throw std::runtime_error(
       "Unsupported workload for this bilinear upsampling specialization."
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 39a44ebd3..5db6982aa 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -31,6 +31,10 @@
                           void* gamma,
                           void* beta,
                           int N,
+                          int64_t* H,
+                          int64_t* W,
+                          int64_t* HO,
+                          int64_t* WO,
                           const float eps,
                           const int max_smem_size,
                           void* workspace,
@@ -49,6 +53,8 @@
 {{indent}}{
 {{indent}}  {{func_name}}(
 {{indent}}     {{output}}, {{input}}, {{gamma}}, {{beta}}, {{N}},
+{{indent}}     {{H}}, {{W}},
+{{indent}}     {{HO}}, {{WO}},
 {{indent}}     {{eps}}, max_smem_size_, global_workspace_,
 {{indent}}  stream /* default stream */
 {{indent}}  );
@@ -87,13 +93,16 @@
 
 {{func_signature}}
 {
-
-    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
+    *HO = *H;
+    *WO = *W;
+    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{C}}, {{G}}>(
             static_cast<{{elem_input_type}}*>(output),
             static_cast<{{elem_input_type}}*>(input),
             static_cast<{{elem_input_type}}*>(gamma),
             static_cast<{{elem_input_type}}*>(beta),
             N,
+            H,
+            W,
             eps,
             max_smem_size,
             workspace,
@@ -138,8 +147,6 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     use_swish = True if "swish" in func_attrs["name"] else False
     input_shape = func_attrs["inputs"][0].shape()
 
-    H = input_shape[1].value()
-    W = input_shape[2].value()
     C = input_shape[3].value()
     G = func_attrs["num_groups"]
 
@@ -157,8 +164,6 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         elem_input_type=elem_input_type,
         FuseSwish="true" if use_swish else "false",
-        H=H,
-        W=W,
         C=C,
         G=G,
     )
@@ -180,6 +185,7 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
     output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = get_input_names(func_attrs)
     input_shape = func_attrs["inputs"][0]._attrs["shape"]
+    output_shape = func_attrs["outputs"][0]._attrs["shape"]
     eps = func_attrs["eps"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
@@ -188,6 +194,10 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
         gamma=gamma_name,
         beta=beta_name,
         N=input_shape[0]._attrs["name"],
+        H="&" + input_shape[1]._attrs["name"],
+        W="&" + input_shape[2]._attrs["name"],
+        HO="&" + output_shape[1]._attrs["name"],
+        WO="&" + output_shape[2]._attrs["name"],
         eps=eps,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index b868849e4..0ccc9e105 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -477,8 +477,6 @@ struct TInputHelper<bfloat16> {
 template <
     typename TInput,
     bool FuseSwish,
-    int H,
-    int W,
     int C,
     int C_G,
     int ILP = 8,
@@ -490,6 +488,8 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
     TInput* gamma,
     TInput* beta,
     int N,
+    int H,
+    int W,
     float epsilon) {
   constexpr int C_G_2 = C_G / 2;
   constexpr int C_G_stride = C_G_2 + BANK_CONFLICT;
@@ -983,13 +983,15 @@ void DispatchGroupNormForwardGpu(
   }
 }
 
-template <typename TInput, bool FuseSwish, int H, int W, int C, int G>
+template <typename TInput, bool FuseSwish, int C, int G>
 cudaError_t invokeGroupNorm(
     TInput* output,
     TInput* input,
     TInput* gamma,
     TInput* beta,
     int N,
+    const int64_t* height,
+    const int64_t* width,
     const float eps,
     const int max_smem_size,
     void* workspace,
@@ -998,6 +1000,9 @@ cudaError_t invokeGroupNorm(
   constexpr auto C_G_2 = C_G / 2;
   constexpr int ILP = 8;
 
+  int64_t H = *height;
+  int64_t W = *width;
+
   const int64_t num_instances = N * G;
   const int64_t norm_size = H * W * C / G;
   const int64_t spatial_size = H * W;
@@ -1011,46 +1016,25 @@ cudaError_t invokeGroupNorm(
   // Bank conflict doesn't seem to matter to perf
   constexpr int BANK_CONFLICT = 0;
 
-  constexpr auto smem =
-      H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
+  const auto smem = H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
 
   // C_G must be even, or we can have misaligned address for cp.async
   // reserve some shared_mem for block reduction
-  if (H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
-    constexpr int num_threads = std::min(1024, H / ILP * W * C_G_2);
-
-    if constexpr (num_threads > 0) {
-      auto kernel_func = group_norm_smem<
-          TInput,
-          FuseSwish,
-          H,
-          W,
-          C,
-          C_G,
-          ILP,
-          BANK_CONFLICT,
-          num_threads>;
-      GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
-          kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
-      dim3 block(num_threads);
-      kernel_func<<<dim3(G, N), block, smem, stream>>>(
-          input, output, gamma, beta, N, eps);
-    } else {
-      DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
-          stream,
-          num_instances,
-          norm_size,
-          channel_size,
-          spatial_size,
-          epsilon,
-          input,
-          gamma,
-          beta,
-          output,
-          static_cast<float*>(workspace),
-          static_cast<float*>(workspace) + num_instances,
-          channels_first);
-    }
+  if (H > 0 && H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
+    constexpr int num_threads = 1024;
+    auto kernel_func = group_norm_smem<
+        TInput,
+        FuseSwish,
+        C,
+        C_G,
+        ILP,
+        BANK_CONFLICT,
+        num_threads>;
+    GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
+        kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+    dim3 block(num_threads);
+    kernel_func<<<dim3(G, N), block, smem, stream>>>(
+        input, output, gamma, beta, N, H, W, eps);
   } else {
     DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
         stream,
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 07eabe9e5..5ee4f09fd 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -287,6 +287,11 @@ def compile_model(
             )
             _LOGGER.info(f"folded constants elapsed time: {elapsed_dt_sec(start_t)}")
 
+            compiler.transform.dedup_symbolic_name(graph)
+            graph_utils.dump_graph_debug_str_to_file(
+                graph, test_dir, "dedup_symbolic_name"
+            )
+
             (
                 max_blob,
                 max_constant_blob,
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index cb9b86832..55d2e9e28 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -26,7 +26,6 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
-from aitemplate.utils import shape_utils
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -120,9 +119,9 @@ def unique(vector):
         batch_info = x._attrs["shape"][0]
         output_shape = [
             batch_info,
-            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            x._attrs["shape"][2],
+            x._attrs["shape"][1],
+            w._attrs["shape"][-1],
         ]
         return output_shape
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index b48a94519..c5c174629 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -331,9 +331,10 @@ def _infer_shapes(self, x: Tensor):
                             y_shapes.append(IntImm(int(dynamic_symbol)))
                         else:
                             symbol_names = {s.name for s in dynamic_symbol.free_symbols}
+                            unknown_symbols = symbol_names - get_global_symbol_set()
                             assert (
-                                len(symbol_names - get_global_symbol_set()) == 0
-                            ), "Unable to deduce dynamic symbol"
+                                not unknown_symbols
+                            ), f"Unable to deduce dynamic symbol, because the following symbols are not in global symbol set: {unknown_symbols}"
 
                             values = simplify_intvar_values(dynamic_symbol)
                             new_var = IntVar(values, symbolic_value=dynamic_symbol)
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index fae05dad9..ce2024559 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
+
 # pylint: disable=C0103
 class conv2d_bias_activation(conv2d):
     """Base class of conv2d with bias + activation."""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index b620e8f3b..23a999ab2 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -22,7 +22,7 @@
 from collections import OrderedDict
 from hashlib import sha1
 from operator import itemgetter
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple, Union
 
 import jinja2
 
@@ -166,12 +166,18 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
         Parameters
         ----------
-        stride : int
-            Stride of the convolution
-        pad : int
-            Size of padding to add to the input
-        dilate : int, optional
-            Size of spacing between kernel elements, by default 1
+        stride : int or tuple of two ints
+            Stride of the convolution. If tuple is
+            provided, the elements correspond to height and width stride
+            respectively
+        pad : int or tuple of two ints
+            Size of padding to add to the input. If tuple is
+            provided, the elements correspond to height and width padding
+            respectively
+        dilate : int or tuple of two ints, optional
+            Size of spacing between kernel elements, by default 1. If tuple is
+            provided, the elements correspond to height and width dilation
+            respectively
         group : int, optional
            Number of blocked connections from input
             channels to output channels, by default 1
@@ -195,24 +201,19 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
     def _get_params_factory(self):
         params_factory = {}
-        if isinstance(self._attrs["stride"], int):
-            params_factory["strideh"] = self._attrs["stride"]
-            params_factory["stridew"] = self._attrs["stride"]
-        else:
-            params_factory["strideh"] = self._attrs["stride"][0]
-            params_factory["stridew"] = self._attrs["stride"][1]
-        if isinstance(self._attrs["pad"], int):
-            params_factory["padh"] = self._attrs["pad"]
-            params_factory["padw"] = self._attrs["pad"]
-        else:
-            params_factory["padh"] = self._attrs["pad"][0]
-            params_factory["padw"] = self._attrs["pad"][1]
-        if isinstance(self._attrs["dilate"], int):
-            params_factory["dilateh"] = self._attrs["dilate"]
-            params_factory["dilatew"] = self._attrs["dilate"]
-        else:
-            params_factory["dilateh"] = self._attrs["dilate"][0]
-            params_factory["dilatew"] = self._attrs["dilate"][1]
+        # Ensure convolutional parameters are in form (val_h, val_w)
+        params_factory["strideh"], params_factory["stridew"] = _maybe_int_to_tuple(
+            self._attrs["stride"],
+            "Stride",
+        )
+        params_factory["padh"], params_factory["padw"] = _maybe_int_to_tuple(
+            self._attrs["pad"],
+            "Pad",
+        )
+        params_factory["dilateh"], params_factory["dilatew"] = _maybe_int_to_tuple(
+            self._attrs["dilate"],
+            "Dilation",
+        )
         return params_factory
 
     def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
@@ -263,6 +264,21 @@ def unique(vector):
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+
+        # Ensure convolutional parameters are in form (val_h, val_w)
+        dilate_h, dilate_w = _maybe_int_to_tuple(self._attrs["dilate"], "Dilation")
+        stride_h, stride_w = _maybe_int_to_tuple(self._attrs["stride"], "Stride")
+        pad_h, pad_w = _maybe_int_to_tuple(self._attrs["pad"], "Pad")
+        KHEff = (w_shape[1] - 1) * dilate_h + 1
+        KWEff = (w_shape[2] - 1) * dilate_w + 1
+        out_h = (in_h + 2 * pad_h - KHEff) // stride_h + 1
+        out_w = (in_w + 2 * pad_w - KWEff) // stride_w + 1
+        output_shape[1]._attrs["symbolic_value"] = out_h
+        output_shape[2]._attrs["symbolic_value"] = out_w
+
         return output_shape
 
     def _invert_exec_key(self, key):
@@ -770,3 +786,11 @@ def gen_function(self) -> str:
             self.shape_eval_template,
             self.shape_save_template,
         )
+
+
+def _maybe_int_to_tuple(x: Union[int, Tuple[int, int]], name: str) -> Tuple[int, int]:
+    if isinstance(x, int):
+        return x, x
+    if isinstance(x, tuple) and len(x) == 2:
+        return x
+    raise ValueError(f"{name} should be either int or tuple of 2 ints, but got {x}")
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index fd31bb5d7..47fb2dde9 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -15,6 +15,7 @@
 """
 Operator definition for groupnorm.
 """
+import itertools
 import logging
 import os
 import re
@@ -39,6 +40,7 @@
 from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
 from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
@@ -53,6 +55,19 @@
 """
 )
 
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}HI = {{x_dim1}};
+{{indent}}{{dtype}}WI = {{x_dim2}};
+{{indent}}{{dtype}}CI = {{x_dim3}};
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}HO = HI;
+{{indent}}{{dtype}}WO = WI;
+{{indent}}{{dtype}}CO = {{x_dim3}};
+"""
+)
+
 
 class group_norm(Operator):
     """Standalone group norm op.
@@ -68,6 +83,7 @@ def __init__(self, num_groups: int, num_channels: int) -> None:
             self._attrs["has_profiler"] = True
         self._attrs["num_channels"] = num_channels
         self._attrs["workspace"] = 0
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
 
     @staticmethod
     def check_shapes(x_shapes, gamma_shapes, beta_shapes, num_groups):
@@ -112,9 +128,52 @@ def _sanity_check(self, x, gamma, beta):
 
     def _infer_shapes(self, x: Tensor):
         """Infer shapes for groupnorm."""
-
         return x._attrs["shape"]
 
+    def _infer_shape(self, x: List[int]):
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes_v2(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            x.shape()[0],
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+        ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+        output_shape[1]._attrs["symbolic_value"] = in_h
+        output_shape[2]._attrs["symbolic_value"] = in_w
+        return output_shape
+
     def __call__(
         self,
         x: Tensor,
diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index 1182e32b5..c522eb9c2 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -55,7 +55,7 @@ def __init__(self) -> None:
 
     def _infer_shape(self, x: List[int], indices: List[int]):
         rank = len(indices)
-        for r in range(rank - 1):
+        for r in range(1, rank - 1):
             assert x[r] == indices[r]
         output = list(x)
         output[rank - 1] = indices[-1]
@@ -81,6 +81,12 @@ def unique(vector):
             output_shape.append(
                 shape_utils.gen_int_var(unique([d[idx] for d in y_shapes]))
             )
+        if len(indices.shape()) > 1:
+            # Generally output has the same batch dimension as input
+            output_shape[0] = x.shape()[0]
+        else:
+            # Special case: gather happens along batch dimension
+            output_shape[0] = indices.shape()[0]
         return output_shape
 
     def __call__(self, x: Tensor, indices: Tensor) -> Tensor:
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index d207bdee1..0d36a2c37 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -105,7 +105,8 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             else:
                 output_dim = input_shapes[0][idx]
                 for shape in input_shapes:
-                    if output_dim != shape[idx]:
+                    # if output_dim != shape[idx]:
+                    if output_dim._attrs["values"] != shape[idx]._attrs["values"]:
                         raise RuntimeError(
                             "tensors expected to have the same dimensions "
                             "except concat_dim! dim: {}, shape1: {}, shape2: {}, inputs: {}".format(
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index 59b94e0b0..edf87bba0 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -121,11 +121,20 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x.shape()[0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+        out_h = in_h * int(self._attrs["scale_factor"])
+        out_w = in_w * int(self._attrs["scale_factor"])
+
+        output_shape[1]._attrs["symbolic_value"] = out_h
+        output_shape[2]._attrs["symbolic_value"] = out_w
+
         return output_shape
 
     def _invert_exec_key(self, key):
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 94fea4a45..7faa8da64 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,8 @@
 import re
 from typing import List
 
-from aitemplate.compiler.base import IntImm, IntVarTensor, JaggedIntVar, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, JaggedIntVar, Tensor
+from aitemplate.utils import graph_utils
 
 # pylint: disable=C0103
 
@@ -139,7 +140,6 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
 
 def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """Rename all shape variable that are identical to the same name.
-
     Parameters
     ----------
     sorted_graph : List[Tensor]
@@ -147,19 +147,49 @@ def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """
     symbolic_to_name = {}
     global user_provided_dim
+    # First pass - build symbolic_to_name map
+    for i, dim in _all_dims_in_graph(sorted_graph):
+        if not _dim_qualified_for_sym_dedup(dim):
+            continue
+        dim_sym = dim.symbolic_value()
+        if (
+            dim_sym not in symbolic_to_name
+            or dim_sym in symbolic_to_name
+            and dim._attrs["name"] in user_provided_dim
+        ):
+            symbolic_to_name[dim_sym] = dim._attrs["name"] or f"dim_{i}"
+
+    # Second pass - use symbolic_to_name map
+    for _, dim in _all_dims_in_graph(sorted_graph):
+        if not _dim_qualified_for_sym_dedup(dim):
+            continue
+        dim_sym = dim.symbolic_value()
+        dim._attrs["name"] = symbolic_to_name[dim_sym]
+
+
+def _all_dims_in_graph(sorted_graph: List[Tensor]):
+    dim_idx = 0
     for node in sorted_graph:
         for dim in node._attrs["shape"]:
-            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
-                dim_sym = dim.symbolic_value()
-                if (
-                    dim_sym not in symbolic_to_name
-                    or dim_sym in symbolic_to_name
-                    and dim._attrs["name"] in user_provided_dim
-                ):
-                    symbolic_to_name[dim_sym] = dim._attrs["name"]
-
-    for node in sorted_graph:
-        for dim in node._attrs["shape"]:
-            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
-                dim_sym = dim.symbolic_value()
-                dim._attrs["name"] = symbolic_to_name[dim_sym]
+            yield dim_idx, dim
+            dim_idx += 1
+
+    # In case some dimensions are not encountered in any nodes in the graph,
+    # only in input/output accessors - iterate over all ops and dimensions
+    # in tensor accessors, if any.
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in sorted_ops:
+        input_accessors = op._attrs.get("input_accessors", None)
+        output_accessors = op._attrs.get("output_accessors", None)
+        for accessors in (input_accessors, output_accessors):
+            if accessors is None:
+                continue
+            for ta in accessors:
+                if ta.original_shapes:
+                    for dim in ta.original_shapes:
+                        yield dim_idx, dim
+                        dim_idx += 1
+
+
+def _dim_qualified_for_sym_dedup(dim: IntVar) -> bool:
+    return not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar)
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 091f7d81a..a1a7075b8 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -327,8 +327,6 @@ def __init__(
         self.causal = causal
         self.has_residual = has_residual
         self.dim = dim
-        self.seqlen = seq_len
-        self.seqlen_kv = seq_len_kv
 
         self.op = ops.mem_eff_attention(causal=causal)
 
@@ -353,8 +351,7 @@ def __init__(
         self.proj_drop = Dropout(proj_drop)
 
     def attention(self, q, k, v):
-        seqlen = self.seqlen
-        seqlen_kv = self.seqlen_kv
+        batch = q.shape()[0]
         head_dim = self.dim // self.num_heads
 
         query = self.proj_q(q)
@@ -362,13 +359,13 @@ def attention(self, q, k, v):
         value = self.proj_v(v)
 
         query = ops.permute()(
-            ops.reshape()(query, [-1, seqlen, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         key = ops.permute()(
-            ops.reshape()(key, [-1, seqlen_kv, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         value = ops.permute()(
-            ops.reshape()(value, [-1, seqlen_kv, self.num_heads, head_dim]),
+            ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
             [0, 2, 1, 3],
         )
         return self.op(query, key, value)
@@ -377,9 +374,9 @@ def forward(self, *args):
         """forward pass for calling mha module"""
         assert len(args) >= 3
         x = args[0]
-        seq = self.seqlen
+        batch = x.shape()[0]
         attn_output = self.attention(args[0], args[1], args[2])
-        attn_output = ops.reshape()(attn_output, [-1, seq, self.dim])
+        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
 
         if self.has_residual:
             assert len(args) == 4
@@ -387,7 +384,7 @@ def forward(self, *args):
         else:
             x = self.proj(attn_output)
         x = self.proj_drop(x)
-        x = ops.reshape()(x, [-1, seq, self.dim])
+        x = ops.reshape()(x, [batch, -1, self.dim])
         return x
 
 
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
index 68c9aefdf..2a1e0779e 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
@@ -65,7 +65,7 @@ def __init__(
         in_channels,
         out_channels,
         kernel_size,
-        stride,
+        stride=1,
         padding=0,
         dilation=1,
         groups=1,
diff --git a/tests/unittest/ops/test_batch_gather.py b/tests/unittest/ops/test_batch_gather.py
index 4c210af1a..21dbd1618 100644
--- a/tests/unittest/ops/test_batch_gather.py
+++ b/tests/unittest/ops/test_batch_gather.py
@@ -103,21 +103,14 @@ def test_batch_gather(self):
         self._test_batch_gather(
             shape=(2, 2), ind_shape=(2, 1), dim=1, max_ind=2, test_name="batch_gather3"
         )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    def test_float32(self):
         self._test_batch_gather(
             shape=(8, 4, 4, 2, 2),
             ind_shape=(8, 4, 1),
             dim=2,
             max_ind=4,
-            test_name="batch_gather4",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
-    def test_float32(self):
-        self._test_batch_gather(
-            shape=(8, 2, 2),
-            ind_shape=(2,),
-            dim=0,
-            max_ind=8,
             test_name="batch_gather_f32",
             dtype="float32",
         )
@@ -180,20 +173,20 @@ def _test_batch_gather_topk(
 
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
 
-    def test_batch_gather_topk(self):
-        self._test_batch_gather_topk(
-            shape=(4, 1, 1), N=2000, topK=300, test_name="batch_gather_topk"
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
-    def test_float32(self):
-        self._test_batch_gather_topk(
-            shape=(4, 1, 1),
-            N=2000,
-            topK=300,
-            test_name="batch_gather_topk_f32",
-            dtype="float32",
-        )
+    # def test_batch_gather_topk(self):
+    #     self._test_batch_gather_topk(
+    #         shape=(4, 1, 1), N=2000, topK=300, test_name="batch_gather_topk"
+    #     )
+
+    # @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    # def test_float32(self):
+    #     self._test_batch_gather_topk(
+    #         shape=(4, 1, 1),
+    #         N=2000,
+    #         topK=300,
+    #         test_name="batch_gather_topk_f32",
+    #         dtype="float32",
+    #     )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index 94073251e..a7560f661 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -425,10 +425,10 @@ def test_concatenate_shape_compatible(self):
         in_shapes = [[var1, 2, 3], [dup_var1, 2, 3]]
         self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
-        var2 = IntVar(values=[1, 2])
-        with self.assertRaises(RuntimeError):
-            in_shapes = [[var1, 2, 3], [var2, 2, 3]]
-            self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+        # var2 = IntVar(values=[1, 2])
+        # with self.assertRaises(RuntimeError):
+        #     in_shapes = [[var1, 2, 3], [var2, 2, 3]]
+        #     self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index 430967af3..cc67761ca 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -242,29 +242,29 @@ def model():
         y = score_inds[index]
         np.testing.assert_allclose(y_np, y, atol=1e-2, rtol=1e-2)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "not supported in ROCm")
-    def test_topk_nms_fp16(self):
-        self._test_topk_nms(
-            test_name="topk_nms_fp16",
-            dtype="float16",
-        )
-        self._test_topk_nms(
-            test_name="topk_nms_copy_op_fp16",
-            copy_op=True,
-            dtype="float16",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
-    def test_topk_nms_fp32(self):
-        self._test_topk_nms(
-            test_name="topk_nms_fp32",
-            dtype="float32",
-        )
-        self._test_topk_nms(
-            test_name="topk_nms_copy_op_fp32",
-            copy_op=True,
-            dtype="float32",
-        )
+    # @unittest.skipIf(detect_target().name() == "rocm", "not supported in ROCm")
+    # def test_topk_nms_fp16(self):
+    #     self._test_topk_nms(
+    #         test_name="topk_nms_fp16",
+    #         dtype="float16",
+    #     )
+    #     self._test_topk_nms(
+    #         test_name="topk_nms_copy_op_fp16",
+    #         copy_op=True,
+    #         dtype="float16",
+    #     )
+
+    # @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
+    # def test_topk_nms_fp32(self):
+    #     self._test_topk_nms(
+    #         test_name="topk_nms_fp32",
+    #         dtype="float32",
+    #     )
+    #     self._test_topk_nms(
+    #         test_name="topk_nms_copy_op_fp32",
+    #         copy_op=True,
+    #         dtype="float32",
+    #     )
 
 
 if __name__ == "__main__":

From d227f60cb0687a49427c9e8e17ec1a047debcf93 Mon Sep 17 00:00:00 2001
From: Aleksandr Pivovar <apivovar@meta.com>
Date: Fri, 21 Apr 2023 02:06:48 -0700
Subject: [PATCH 432/638] Eliminated redundant permute pairs (#529)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/529

added transformation for eliminating redundant permute pairs
added unit tests for tesing

Reviewed By: chenyang78

Differential Revision: D44414108

fbshipit-source-id: f133bc025a90421e702794ebed87111d4928c486
---
 .../compiler/transform/optimize_graph.py      |   2 +
 .../transform/transform_permutations.py       | 109 ++++++++++
 .../compiler/test_eliminate_permutations.py   | 190 ++++++++++++++++++
 3 files changed, 301 insertions(+)
 create mode 100644 python/aitemplate/compiler/transform/transform_permutations.py
 create mode 100644 tests/unittest/compiler/test_eliminate_permutations.py

diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 01ea8913b..2e8f5a483 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -46,6 +46,7 @@
 from aitemplate.compiler.transform.transform_odd_alignment import (
     transform_odd_alignment,
 )
+from aitemplate.compiler.transform.transform_permutations import eliminate_permutations
 from aitemplate.compiler.transform.transform_permute_to_reshape import (
     transform_permute_to_reshape,
 )
@@ -117,6 +118,7 @@ def optimize_graph(
         split_large_split_ops,
         transform_permute_to_reshape,
         transform_memory_ops,
+        eliminate_permutations,
     ]
 
     if not optimize:
diff --git a/python/aitemplate/compiler/transform/transform_permutations.py b/python/aitemplate/compiler/transform/transform_permutations.py
new file mode 100644
index 000000000..cfda924eb
--- /dev/null
+++ b/python/aitemplate/compiler/transform/transform_permutations.py
@@ -0,0 +1,109 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import List
+
+import numpy as np
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.transform import transform_utils
+
+
+NAME_TO_DIM = {
+    "permute021": [0, 2, 1],
+    "permute210": [2, 1, 0],
+    "permute102": [1, 0, 2],
+    "permute0213": [0, 2, 1, 3],
+}
+
+
+def get_permutation(op: Operator):
+    if op._attrs["op"] == "permute":
+        permutation = list(op._attrs["dims"])
+    elif op._attrs["op"] in NAME_TO_DIM:
+        permutation = NAME_TO_DIM[op._attrs["op"]]
+    else:
+        raise NotImplementedError(
+            f"Not implemented for permute operation: {op._attrs['op']}"
+        )
+    return permutation
+
+
+def remove_second_permutation_from_graph(
+    permutation_1: Operator, permutation_2: Operator
+):
+    input_tensor_p1 = permutation_1._attrs["inputs"][0]
+    input_tensor_p2 = permutation_2._attrs["inputs"][0]
+    output_tensor = permutation_2._attrs["outputs"][0]
+
+    input_tensor_p1._attrs["dst_ops"].update(output_tensor._attrs["dst_ops"])
+    input_tensor_p2._attrs["dst_ops"].discard(permutation_2)
+
+    for dst_op in output_tensor._attrs["dst_ops"]:
+        dst_op.replace_input_tensor(output_tensor, input_tensor_p1)
+
+    if output_tensor._attrs["is_output"]:
+        input_tensor_p1._attrs["is_output"] = True
+        input_tensor_p1._attrs["name"] = output_tensor._attrs["name"]
+
+    transform_utils.remove_tensor_from_sorted_graph(output_tensor)
+
+
+def eliminate_permutations(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    if len(sorted_graph) < 2:
+        return sorted_graph
+    removed_op = set()
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        for cur_op in src_ops:
+            if cur_op in removed_op:
+                continue
+            if not cur_op._attrs["op"].startswith("permute"):
+                continue
+            input_accessors = cur_op._attrs.get("input_accessors", None)
+            if (
+                input_accessors is not None
+                and hasattr(input_accessors[0], "strided_dim")
+                and input_accessors[0].strided_dim is not None
+            ):
+                continue
+            curr_op_output = cur_op._attrs["outputs"][0]
+            dst_ops = curr_op_output._attrs["dst_ops"]
+            n_dst_ops = len(dst_ops)
+            if n_dst_ops == 0:
+                continue
+            remove_list = []
+            for next_op in dst_ops:
+                if not next_op._attrs["op"].startswith("permute"):
+                    continue
+                p1 = get_permutation(cur_op)
+                p2 = get_permutation(next_op)
+                if not np.all(np.array(p1)[p2] == np.arange(0, len(p1))):
+                    continue
+                is_input = cur_op._attrs["inputs"][0]._attrs["is_input"]
+                is_output = next_op._attrs["outputs"][0]._attrs["is_output"]
+                if is_input and is_output:
+                    continue
+                remove_list.append(next_op)
+
+            for next_op in remove_list:
+                remove_second_permutation_from_graph(cur_op, next_op)
+                removed_op.add(next_op)
+
+            if len(remove_list) == n_dst_ops:
+                transform_utils.remove_single_tensor_op_from_sorted_graph(cur_op)
+                removed_op.add(cur_op)
+
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/tests/unittest/compiler/test_eliminate_permutations.py b/tests/unittest/compiler/test_eliminate_permutations.py
new file mode 100644
index 000000000..8c1603ec0
--- /dev/null
+++ b/tests/unittest/compiler/test_eliminate_permutations.py
@@ -0,0 +1,190 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    graph_has_op,
+)
+
+
+class EliminatePermutationTestCase(unittest.TestCase):
+    def test_eliminate_permutation(self):
+        dtype = "float"
+        shape = [32, 64, 112, 112]
+        new_shape = [32, 64 * 112 * 112]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 3, 1])
+        p2 = ops.permute()(p1, dims=[0, 3, 1, 2])
+        z = ops.reshape()(p2, new_shape)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = get_torch_empty_tensor(new_shape, dtype)
+
+        module = compile_model(z, target, "./tmp", "test_eliminate_permutation")
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x": x_pt}, {"z": y_pt})
+
+        self.assertEqual(len(result_graph), 2)
+        self.assertFalse(graph_has_op(result_graph, "permute"))
+        self.assertTrue(torch.equal(torch.reshape(x_pt, new_shape), y_pt))
+
+    def test_eliminate_last_permutation(self):
+        dtype = "float"
+        shape = [32, 64, 112, 112]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 3, 1])
+        p2 = ops.permute()(p1, dims=[0, 2, 3, 1])
+        z = ops.permute()(p2, dims=[0, 3, 1, 2])
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = get_torch_empty_tensor(shape, dtype)
+
+        module = compile_model(z, target, "./tmp", "test_eliminate_permutation")
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x": x_pt}, {"z": y_pt})
+
+        self.assertEqual(len(result_graph), 2)
+        self.assertTrue(graph_has_op(result_graph, "permute"))
+
+    def test_eliminate_permutation_names(self):
+        dtype = "float"
+        shape = [32, 64, 112]
+        new_shape = [32, 64 * 112]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute021()(x)
+        p2 = ops.permute021()(p1)
+        z = ops.reshape()(p2, new_shape)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = get_torch_empty_tensor(new_shape, dtype)
+
+        module = compile_model(z, target, "./tmp", "test_eliminate_permutation_names")
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x": x_pt}, {"z": y_pt})
+        self.assertEqual(len(result_graph), 2)
+        self.assertFalse(graph_has_op(result_graph, "permute"))
+        self.assertTrue(torch.equal(torch.reshape(x_pt, new_shape), y_pt))
+
+    def test_eliminate_permutation_multiple_operations(self):
+        dtype = "float"
+        shape = [2, 4]
+        target = detect_target()
+
+        x0 = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x0, dims=[1, 0])
+        p2 = ops.permute()(p1, dims=[1, 0])
+        r = ops.reshape()(p1, shape)
+        z = ops.elementwise(FuncEnum.ADD)(r, p2)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = get_torch_empty_tensor(shape, dtype)
+
+        module = compile_model(z, target, "./tmp", "test_eliminate_permutation")
+        module.run_with_tensors({"x": x_pt}, {"z": y_pt})
+        self.assertTrue(
+            torch.equal(
+                torch.reshape(torch.permute(x_pt, (1, 0)), shape) + x_pt,
+                y_pt,
+            )
+        )
+
+    def test_eliminate_permutation_multiple_operations_2(self):
+        dtype = "float"
+        shape = [2, 4]
+        target = detect_target()
+
+        x0 = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x0, dims=[1, 0])
+        p2 = ops.permute()(p1, dims=[1, 0])
+        r = ops.reshape()(p1, shape)
+        a1 = ops.elementwise(FuncEnum.ADD)(r, p2)
+        a2 = ops.elementwise(FuncEnum.ADD)(x0, p2)
+        z = ops.elementwise(FuncEnum.MUL)(a1, a2)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        x_pt = get_random_torch_tensor(shape, dtype)
+        y_pt = get_torch_empty_tensor(shape, dtype)
+
+        module = compile_model(z, target, "./tmp", "test_eliminate_permutation")
+        module.run_with_tensors({"x": x_pt}, {"z": y_pt})
+        self.assertTrue(
+            torch.equal(
+                (torch.reshape(torch.permute(x_pt, (1, 0)), shape) + x_pt) * 2 * x_pt,
+                y_pt,
+            )
+        )
+
+    def test_eliminate_permutation_different_shapes(self):
+        dtype = "float"
+        shape = [32, 64, 112, 112]
+        new_shape = [32, 64 * 112 * 112]
+        target = detect_target()
+
+        x = Tensor(shape, dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 3, 1])
+        p2 = ops.permute()(p1, dims=[0, 2, 3, 1])
+        z = ops.reshape()(p2, new_shape)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+        module = compile_model(
+            z, target, "./tmp", "test_eliminate_permutation_different_shapes"
+        )
+        result_graph = module.debug_sorted_graph
+        self.assertEqual(len(result_graph), 4)
+        self.assertTrue(graph_has_op(result_graph, "permute"))
+
+    def test_eliminate_permutation_all_permutations(self):
+        dtype = "float"
+        target = detect_target()
+        shape = [32, 64, 112, 112]
+
+        x = Tensor(shape, dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 3, 1])
+        p2 = ops.permute()(p1, dims=[0, 3, 1, 2])
+        p2._attrs["is_output"] = True
+
+        module = compile_model(
+            p2,
+            target,
+            "./tmp",
+            "test_eliminate_permutation_all_permutations",
+        )
+        result_graph = module.debug_sorted_graph
+        self.assertEqual(len(result_graph), 3)
+        self.assertTrue(graph_has_op(result_graph, "permute"))

From 95053be9eef2a13dfe9f61e16c123b175b0917f7 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Sat, 22 Apr 2023 05:46:39 -0700
Subject: [PATCH 433/638] Back out "Ldm update" (#603)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/603

Original commit changeset: f760a883d5da

Original Phabricator Diff: D44948785

Reviewed By: chenyang78

Differential Revision: D45207132

fbshipit-source-id: 8af9c86b1a8cf31af23d86816bd53787c3e8cc3f
---
 .../05_stable_diffusion/scripts/compile.py    |   1 -
 examples/05_stable_diffusion/scripts/demo.py  |   9 +-
 .../scripts/demo_img2img.py                   |   1 +
 .../scripts/download_pipeline.py              |   4 +-
 examples/05_stable_diffusion/src/benchmark.py |  25 +++--
 .../05_stable_diffusion/src/benchmark_pt.py   |  14 ++-
 .../src/compile_lib/compile_clip.py           |  44 ++++++--
 .../src/compile_lib/compile_unet.py           |  24 ++---
 .../src/compile_lib/compile_vae.py            |  53 ++++-----
 .../src/compile_lib/util.py                   |   2 +-
 .../src/modeling/attention.py                 |  33 ++++--
 .../05_stable_diffusion/src/modeling/clip.py  | 102 +++++++++---------
 .../src/modeling/embeddings.py                |   2 +-
 .../src/modeling/resnet.py                    |   5 +-
 .../src/modeling/unet_2d_condition.py         |  19 +---
 .../src/modeling/unet_blocks.py               |   1 +
 .../05_stable_diffusion/src/modeling/vae.py   |   1 +
 .../src/pipeline_stable_diffusion_ait.py      |  12 +--
 .../backend/common/concatenate_common.py      |   2 -
 .../common/tensor/batch_gather_common.py      |   8 +-
 .../backend/common/upsampling2d_common.py     |   2 -
 .../cuda/groupnorm/groupnorm_common.py        |  22 ++--
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  64 ++++++-----
 python/aitemplate/compiler/compiler.py        |   5 -
 .../ops/attention/mem_eff_attention.py        |   7 +-
 .../compiler/ops/common/view_ops.py           |   5 +-
 .../ops/conv/common_conv2d_bias_activation.py |   1 -
 python/aitemplate/compiler/ops/conv/conv2d.py |  74 +++++--------
 .../compiler/ops/groupnorm/groupnorm.py       |  61 +----------
 .../compiler/ops/tensor/batch_gather.py       |   8 +-
 .../compiler/ops/tensor/concatenate.py        |   3 +-
 .../ops/upsample/upsampling_common.py         |  11 +-
 .../compiler/transform/name_graph.py          |  62 +++--------
 .../transform/transform_permutations.py       |   1 +
 python/aitemplate/frontend/nn/attention.py    |  17 +--
 .../frontend/nn/conv2d/conv2d_bias.py         |   2 +-
 tests/unittest/ops/test_batch_gather.py       |  41 ++++---
 tests/unittest/ops/test_concatenate.py        |   8 +-
 tests/unittest/ops/test_nms.py                |  46 ++++----
 39 files changed, 335 insertions(+), 467 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 65032e34f..896b2432c 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -65,7 +65,6 @@ def compile_diffusers(
     compile_clip(
         pipe.text_encoder,
         batch_size=batch_size,
-        seqlen=77,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
         depth=pipe.text_encoder.config.num_hidden_layers,
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index 9ae7db46a..d4f5dbb99 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -34,13 +34,11 @@
 )
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
-@click.option("--batch", default=1, help="Batch size of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
-@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, batch, prompt, negative_prompt, benchmark):
+def run(local_dir, width, height, prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         local_dir,
         scheduler=EulerDiscreteScheduler.from_pretrained(
@@ -50,14 +48,11 @@ def run(local_dir, width, height, batch, prompt, negative_prompt, benchmark):
         torch_dtype=torch.float16,
     ).to("cuda")
 
-    prompt = [prompt] * batch
     with torch.autocast("cuda"):
         image = pipe(prompt, height, width).images[0]
         if benchmark:
             t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
-            print(
-                f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
-            )
+            print(f"sd e2e: {t} ms")
 
     image.save("example_ait.png")
 
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index 31e1c33df..e4d96d865 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -43,6 +43,7 @@
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
 def run(local_dir, width, height, prompt, benchmark):
+
     # load the pipeline
     device = "cuda"
     pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index b072e694f..1128769da 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -21,12 +21,12 @@
 @click.option("--token", default="", help="access token")
 @click.option(
     "--save_directory",
-    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="pipeline files local directory",
 )
 def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        "stabilityai/stable-diffusion-2-1-base",
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index f0f595122..5cac6a465 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -55,6 +55,7 @@ def benchmark_unet(
     benchmark_pt=False,
     verify=False,
 ):
+
     exe_module = Model("./tmp/UNet2DConditionModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for UNet2DConditionModel.")
@@ -64,7 +65,7 @@ def benchmark_unet(
     pt_mod = pt_mod.eval()
 
     latent_model_input_pt = torch.randn(batch_size, 4, height, width).cuda().half()
-    text_embeddings_pt = torch.randn(batch_size, 77, hidden_dim).cuda().half()
+    text_embeddings_pt = torch.randn(batch_size, 64, hidden_dim).cuda().half()
     timesteps_pt = torch.Tensor([1, 1]).cuda().half()
 
     with autocast("cuda"):
@@ -82,6 +83,8 @@ def benchmark_unet(
             with open("sd_pt_benchmark.txt", "a") as f:
                 f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n")
 
+    print("pt output:", pt_ys.shape)
+
     # run AIT unet model
     inputs = {
         "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(),
@@ -93,8 +96,6 @@ def benchmark_unet(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
-        shape[1] = height
-        shape[2] = width
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -123,11 +124,13 @@ def benchmark_unet(
 def benchmark_clip(
     pt_mod,
     batch_size=1,
-    seqlen=77,
+    seqlen=64,
     tokenizer=None,
     benchmark_pt=False,
     verify=False,
 ):
+    mask_seq = 0
+
     exe_module = Model("./tmp/CLIPTextModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for CLIPTextModel.")
@@ -139,7 +142,7 @@ def benchmark_clip(
     if tokenizer is None:
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
     text_input = tokenizer(
-        ["a photo of an astronaut riding a horse on mars"] * batch_size,
+        ["a photo of an astronaut riding a horse on mars"],
         padding="max_length",
         max_length=seqlen,
         truncation=True,
@@ -147,6 +150,8 @@ def benchmark_clip(
     )
     input_ids = text_input["input_ids"].cuda()
 
+    attention_mask = torch.ones((batch_size, seqlen))
+    attention_mask[-1, -mask_seq:] = 0
     attention_mask = None
 
     position_ids = torch.arange(seqlen).expand((batch_size, -1)).cuda()
@@ -170,7 +175,6 @@ def benchmark_clip(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
-        shape[0] = batch_size
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -198,6 +202,7 @@ def benchmark_clip(
 def benchmark_vae(
     pt_vae, batch_size=1, height=64, width=64, benchmark_pt=False, verify=False
 ):
+
     latent_channels = 4
 
     exe_module = Model("./tmp/AutoencoderKL/test.so")
@@ -234,8 +239,9 @@ def benchmark_vae(
         .cuda()
         .half()
     )
-
     ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous()
+    print("input pt tensor size: ", ait_input_pt_tensor.shape)
+    print("output pt tensor size: ", y.shape)
     exe_module.run_with_tensors([ait_input_pt_tensor], [y])
 
     # verification
@@ -299,10 +305,7 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     )
     # VAE
     benchmark_vae(
-        pipe.vae,
-        batch_size=batch_size,
-        benchmark_pt=benchmark_pt,
-        verify=verify,
+        pipe.vae, batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify
     )
 
 
diff --git a/examples/05_stable_diffusion/src/benchmark_pt.py b/examples/05_stable_diffusion/src/benchmark_pt.py
index c12877897..95bfb725f 100644
--- a/examples/05_stable_diffusion/src/benchmark_pt.py
+++ b/examples/05_stable_diffusion/src/benchmark_pt.py
@@ -26,24 +26,22 @@
     default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="the local diffusers pipeline directory",
 )
-@click.option("--width", default=512, help="Width of generated image")
-@click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
-@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, prompt, negative_prompt, benchmark):
+def run(local_dir, prompt, benchmark):
     pipe = StableDiffusionPipeline.from_pretrained(
         local_dir,
         revision="fp16",
         torch_dtype=torch.float16,
     ).to("cuda")
 
-    image = pipe(prompt, height, width, negative_prompt=negative_prompt).images[0]
-    if benchmark:
-        t = benchmark_torch_function(10, pipe, prompt)
-        print(f"sd pt e2e: {t} ms")
+    with torch.autocast("cuda"):
+        image = pipe(prompt).images[0]
+        if benchmark:
+            t = benchmark_torch_function(10, pipe, prompt)
+            print(f"sd pt e2e: {t} ms")
 
     image.save("example_pt.png")
 
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index a85aee84f..9f68e827a 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -12,9 +12,10 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
+import numpy as np
+import torch
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
@@ -31,14 +32,40 @@ def map_clip_params(pt_mod, batch_size, seqlen, depth):
             ait_name = ait_name.replace("out_proj", "proj")
         elif name.endswith("out_proj.bias"):
             ait_name = ait_name.replace("out_proj", "proj")
-        elif "q_proj" in name:
-            ait_name = ait_name.replace("q_proj", "proj_q")
-        elif "k_proj" in name:
-            ait_name = ait_name.replace("k_proj", "proj_k")
-        elif "v_proj" in name:
-            ait_name = ait_name.replace("v_proj", "proj_v")
+        elif name.endswith("q_proj.weight"):
+            ait_name = ait_name.replace("q_proj", "qkv")
+            prefix = key[: -len("q_proj.weight")]
+            q = pt_params[prefix + "q_proj.weight"]
+            k = pt_params[prefix + "k_proj.weight"]
+            v = pt_params[prefix + "v_proj.weight"]
+            qkv_weight = torch.cat([q, k, v], dim=0)
+            params_ait[ait_name] = qkv_weight
+            continue
+        elif name.endswith("q_proj.bias"):
+            ait_name = ait_name.replace("q_proj", "qkv")
+            prefix = key[: -len("q_proj.bias")]
+            q = pt_params[prefix + "q_proj.bias"]
+            k = pt_params[prefix + "k_proj.bias"]
+            v = pt_params[prefix + "v_proj.bias"]
+            qkv_bias = torch.cat([q, k, v], dim=0)
+            params_ait[ait_name] = qkv_bias
+            continue
+        elif name.endswith("k_proj.weight"):
+            continue
+        elif name.endswith("k_proj.bias"):
+            continue
+        elif name.endswith("v_proj.weight"):
+            continue
+        elif name.endswith("v_proj.bias"):
+            continue
         params_ait[ait_name] = arr
 
+    if detect_target().name() == "cuda":
+        for i in range(depth):
+            prefix = f"encoder_layers_{i}_self_attn_cu_length"
+            cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
+            params_ait[prefix] = torch.from_numpy(cu_len).cuda()
+
     return params_ait
 
 
@@ -70,7 +97,6 @@ def compile_clip(
 
     pt_mod = pt_mod.eval()
     params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
-    batch_size = IntVar(values=[1, 8], name="batch_size")
 
     input_ids_ait = Tensor(
         [batch_size, seqlen], name="input0", dtype="int64", is_input=True
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index c4233c1e4..3c2f59603 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -15,7 +15,7 @@
 import torch
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.unet_2d_condition import (
@@ -58,9 +58,9 @@ def compile_unet(
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
-    model_name="UNet2DConditionModel",
     use_linear_projection=False,
 ):
+
     ait_mod = ait_UNet2DConditionModel(
         sample_size=64,
         cross_attention_dim=hidden_dim,
@@ -72,31 +72,19 @@ def compile_unet(
     # set AIT parameters
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
-    # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
 
     latent_model_input_ait = Tensor(
-        [batch_size, height_d, width_d, 4], name="input0", is_input=True
+        [batch_size, height, width, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
     text_embeddings_pt_ait = Tensor(
-        [batch_size, 77, hidden_dim], name="input2", is_input=True
+        [batch_size, 64, hidden_dim], name="input2", is_input=True
     )
 
-    mid_block_additional_residual = None
-    down_block_additional_residuals = None
-
-    Y = ait_mod(
-        latent_model_input_ait,
-        timesteps_ait,
-        text_embeddings_pt_ait,
-        down_block_additional_residuals,
-        mid_block_additional_residual,
-    )
+    Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
     mark_output(Y)
 
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-    compile_model(Y, target, "./tmp", model_name, constants=params_ait)
+    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index e9c2d4964..7352740d0 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -12,10 +12,11 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import numpy as np
 
 import torch
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
@@ -39,6 +40,20 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
                 ).contiguous()
             else:
                 mapped_pt_params[ait_name] = pt_params[name]
+        elif name.endswith("attention.qkv.weight"):
+            prefix = name[: -len("attention.qkv.weight")]
+            q_weight = pt_params[prefix + "query.weight"]
+            k_weight = pt_params[prefix + "key.weight"]
+            v_weight = pt_params[prefix + "value.weight"]
+            qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
+            mapped_pt_params[ait_name] = qkv_weight
+        elif name.endswith("attention.qkv.bias"):
+            prefix = name[: -len("attention.qkv.bias")]
+            q_bias = pt_params[prefix + "query.bias"]
+            k_bias = pt_params[prefix + "key.bias"]
+            v_bias = pt_params[prefix + "value.bias"]
+            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
+            mapped_pt_params[ait_name] = qkv_bias
         elif name.endswith("attention.proj.weight"):
             prefix = name[: -len("attention.proj.weight")]
             pt_name = prefix + "proj_attn.weight"
@@ -48,31 +63,8 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
             pt_name = prefix + "proj_attn.bias"
             mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.cu_length"):
-            ...
-        elif name.endswith("attention.proj_q.weight"):
-            prefix = name[: -len("attention.proj_q.weight")]
-            pt_name = prefix + "query.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_q.bias"):
-            prefix = name[: -len("attention.proj_q.bias")]
-            pt_name = prefix + "query.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.weight"):
-            prefix = name[: -len("attention.proj_k.weight")]
-            pt_name = prefix + "key.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.bias"):
-            prefix = name[: -len("attention.proj_k.bias")]
-            pt_name = prefix + "key.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.weight"):
-            prefix = name[: -len("attention.proj_v.weight")]
-            pt_name = prefix + "value.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.bias"):
-            prefix = name[: -len("attention.proj_v.bias")]
-            pt_name = prefix + "value.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32")
+            mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda()
         else:
             pt_param = pt_module.get_parameter(name)
             mapped_pt_params[ait_name] = pt_param
@@ -87,7 +79,6 @@ def compile_vae(
     width=64,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
-    name="AutoencoderKL",
 ):
     in_channels = 3
     out_channels = 3
@@ -123,12 +114,8 @@ def compile_vae(
         latent_channels=latent_channels,
         sample_size=sample_size,
     )
-    # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
-
     ait_input = Tensor(
-        shape=[batch_size, height_d, width_d, latent_channels],
+        shape=[batch_size, height, width, latent_channels],
         name="vae_input",
         is_input=True,
     )
@@ -146,6 +133,6 @@ def compile_vae(
         Y,
         target,
         "./tmp",
-        name,
+        "AutoencoderKL",
         constants=params_ait,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 90cc1bc32..000e862e9 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -18,5 +18,5 @@ def mark_output(y):
     for i in range(len(y)):
         y[i]._attrs["is_output"] = True
         y[i]._attrs["name"] = "output_%d" % (i)
-        y_shape = [d._attrs["values"] for d in y[i]._attrs["shape"]]
+        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
diff --git a/examples/05_stable_diffusion/src/modeling/attention.py b/examples/05_stable_diffusion/src/modeling/attention.py
index 06ab5f1bd..14993e6d9 100644
--- a/examples/05_stable_diffusion/src/modeling/attention.py
+++ b/examples/05_stable_diffusion/src/modeling/attention.py
@@ -20,6 +20,7 @@
 from typing import Optional
 
 from aitemplate.compiler.ops import reshape
+
 from aitemplate.frontend import nn, Tensor
 
 
@@ -53,18 +54,22 @@ def __init__(
     ):
         super().__init__()
         self.batch_size = batch_size
+        self.height = height
+        self.width = width
         self.channels = channels
         self.num_heads = (
             channels // num_head_channels if num_head_channels is not None else 1
         )
         self.num_head_size = num_head_channels
         self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.CrossAttention(
+        self.attention = nn.MultiheadAttention(
             channels,
-            height * width,
+            batch_size,
             height * width,
             self.num_heads,
             qkv_bias=True,
+            has_residual=True,
+            use_mem_eff=True,
         )
         self.rescale_output_factor = rescale_output_factor
 
@@ -73,22 +78,28 @@ def forward(self, hidden_states) -> Tensor:
         input hidden_states shape: [batch, height, width, channel]
         output shape: [batch, height, width, channel]
         """
-
         residual = hidden_states
 
         # norm
         hidden_states = self.group_norm(hidden_states)
-        o_shape = hidden_states.shape()
-        batch_dim = o_shape[0]
 
         hidden_states = reshape()(
-            hidden_states,
-            [batch_dim, -1, self.channels],
+            hidden_states, [self.batch_size, self.height * self.width, self.channels]
         )
 
-        res = self.attention(hidden_states, hidden_states, hidden_states, residual) * (
-            1 / self.rescale_output_factor
-        )
+        batch, hw, channel = hidden_states.shape()
+        if (
+            batch.value() != self.batch_size
+            or hw.value() != self.width * self.height
+            or channel.value() != self.channels
+        ):
+            raise RuntimeError(
+                "nchw params do not match! "
+                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
+                f"actual: {batch}, {channel}, {hw}."
+            )
+
+        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
+        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
 
-        res = reshape()(res, o_shape)
         return res
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index ff0ce792a..30afcd051 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -57,9 +57,9 @@ def __init__(
         self.heads = heads
         self.dim_head = dim_head
 
-        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
+        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
         self.to_out = nn.Sequential(
             nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
         )
@@ -68,26 +68,36 @@ def forward(self, x, context=None, mask=None, residual=None):
         nheads = self.heads
         d = self.dim_head
 
-        q = self.to_q(x)
+        layout = "20314" if USE_CUDA else "m2n3"
+
+        bs, seqlen, _ = get_shape(x)
+        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
+        )
         context = default(context, x)
-        k = self.to_k(context)
-        v = self.to_v(context)
-
-        bs = q.shape()[0]
-
-        q = ops.reshape()(q, [bs, -1, self.heads, self.dim_head])
-        k = ops.reshape()(k, [bs, -1, self.heads, self.dim_head])
-        v = ops.reshape()(v, [bs, -1, self.heads, self.dim_head])
-        q = ops.permute()(q, [0, 2, 1, 3])
-        k = ops.permute()(k, [0, 2, 1, 3])
-        v = ops.permute()(v, [0, 2, 1, 3])
-
-        attn_op = ops.mem_eff_attention(causal=False)
-        out = attn_op(
-            (ops.reshape()(q, [bs, nheads, -1, d])),
-            (ops.reshape()(k, [bs, nheads, -1, d])),
-            (ops.reshape()(v, [bs, nheads, -1, d])),
+
+        seqlen = get_shape(context)[1]
+        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
         )
+        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
+            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
+        )
+
+        if USE_CUDA:
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
+            )
+        else:
+            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
+            out = OP(
+                (ops.reshape()(q, [bs * nheads, -1, d])),
+                (ops.reshape()(k, [bs * nheads, -1, d])),
+                (ops.reshape()(v, [bs * nheads, -1, d])),
+            )
         out = ops.reshape()(out, [bs, -1, nheads * d])
         proj = self.to_out(out)
         proj = ops.reshape()(proj, [bs, -1, nheads * d])
@@ -225,7 +235,7 @@ def __init__(
 
     def forward(self, x, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = x.shape()
+        b, h, w, c = get_shape(x)
         x_in = x
         x = self.norm(x)
         if self.use_linear_projection:
@@ -326,7 +336,7 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        shape = x.shape()
+        shape = get_shape(x)
         x = self.fc1(x)
         x = self.fc2(x, res)
         return ops.reshape()(x, shape)
@@ -354,11 +364,11 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        # shape = get_shape(x)
+        shape = get_shape(x)
         x = self.fc1(x)
         x = self.activation_fn(x)
         x = self.fc2(x, res)
-        return ops.reshape()(x, x.shape())
+        return ops.reshape()(x, shape)
 
 
 class CLIPEncoderLayer(nn.Module):
@@ -381,15 +391,19 @@ def __init__(
     ):
         super().__init__()
         self.embed_dim = hidden_size
-        self.self_attn = nn.CrossAttention(
-            hidden_size,
-            seq_len,
-            seq_len,
-            num_attention_heads,
+        self.self_attn = nn.MultiheadAttention(
+            dim=hidden_size,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            num_heads=num_attention_heads,
             qkv_bias=True,
+            attn_drop=attention_dropout,
+            proj_drop=0,
+            has_residual=True,
             causal=causal,
+            mask_seq=mask_seq,
+            use_mem_eff=True,
         )
-
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
         self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
             hidden_size, int(hidden_size * mlp_ratio)
@@ -414,9 +428,7 @@ def forward(
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states, hidden_states, hidden_states, residual
-        )
+        hidden_states = self.self_attn(hidden_states, residual)
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -535,9 +547,6 @@ def __init__(
     ):
         super().__init__()
         embed_dim = hidden_size
-        self.max_position_embeddings = max_position_embeddings
-        self.embed_dim = hidden_size
-        self.vocab_size = vocab_size
 
         self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
         self.position_embedding = nn.Embedding(
@@ -550,25 +559,20 @@ def forward(
         position_ids: Tensor,
         inputs_embeds: Optional[Tensor] = None,
     ) -> Tensor:
+
         input_shape = ops.size()(input_ids)
 
         # [B * S]
-        token_embedding = self.token_embedding.tensor()
-        token_embedding = ops.reshape()(
-            token_embedding, [1, self.vocab_size, self.embed_dim]
-        )
-        token_embedding = ops.expand()(token_embedding, [input_shape[0], -1, -1])
+        input_ids = ops.reshape()(input_ids, [-1])
+
+        position_ids = ops.reshape()(position_ids, [-1])
 
         if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(token_embedding, input_ids)
+            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
 
-        position_embedding = self.position_embedding.tensor()
-        position_embedding = ops.reshape()(
-            position_embedding, [1, self.max_position_embeddings, self.embed_dim]
+        position_embeddings = ops.batch_gather()(
+            self.position_embedding.tensor(), position_ids
         )
-        position_embedding = ops.expand()(position_embedding, [input_shape[0], -1, -1])
-
-        position_embeddings = ops.batch_gather()(position_embedding, position_ids)
 
         embeddings = inputs_embeds + position_embeddings
 
diff --git a/examples/05_stable_diffusion/src/modeling/embeddings.py b/examples/05_stable_diffusion/src/modeling/embeddings.py
index cab7c033f..36b96a4fb 100644
--- a/examples/05_stable_diffusion/src/modeling/embeddings.py
+++ b/examples/05_stable_diffusion/src/modeling/embeddings.py
@@ -39,7 +39,7 @@ def get_timestep_embedding(
     :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
     embeddings. :return: an [N x dim] Tensor of positional embeddings.
     """
-    assert timesteps._rank() == 1, "Timesteps should be a 1d-array"
+    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
 
     half_dim = embedding_dim // 2
 
diff --git a/examples/05_stable_diffusion/src/modeling/resnet.py b/examples/05_stable_diffusion/src/modeling/resnet.py
index c15bf26d2..03e4f8023 100644
--- a/examples/05_stable_diffusion/src/modeling/resnet.py
+++ b/examples/05_stable_diffusion/src/modeling/resnet.py
@@ -58,6 +58,7 @@ def __init__(
             self.Conv2d_0 = conv
 
     def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
         if self.use_conv_transpose:
             return self.conv(x)
 
@@ -111,7 +112,9 @@ def __init__(
             self.conv = conv
 
     def forward(self, x):
+        assert get_shape(x)[-1] == self.channels
         x = self.conv(x)
+
         return x
 
 
@@ -216,7 +219,7 @@ def forward(self, x, temb=None):
 
         if temb is not None:
             temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = temb.shape()
+            bs, dim = get_shape(temb)
             temb = ops.reshape()(temb, [bs, 1, 1, dim])
             hidden_states = hidden_states + temb
 
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
index 2ad4d9718..eb28a076a 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -14,7 +14,7 @@
 #
 from typing import Optional, Tuple, Union
 
-from aitemplate.frontend import nn, Tensor
+from aitemplate.frontend import nn
 
 from .embeddings import TimestepEmbedding, Timesteps
 from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
@@ -185,8 +185,6 @@ def forward(
         sample,
         timesteps,
         encoder_hidden_states,
-        down_block_additional_residuals: Optional[Tuple[Tensor]] = None,
-        mid_block_additional_residual: Optional[Tensor] = None,
         return_dict: bool = True,
     ):
         """r
@@ -226,27 +224,12 @@ def forward(
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
 
             down_block_res_samples += res_samples
-            # return sample
-
-        if down_block_additional_residuals is not None:
-            new_down_block_res_samples = ()
-
-            for down_block_res_sample, down_block_additional_residual in zip(
-                down_block_res_samples, down_block_additional_residuals
-            ):
-                down_block_res_sample += down_block_additional_residual
-                new_down_block_res_samples += (down_block_res_sample,)
-
-            down_block_res_samples = new_down_block_res_samples
 
         # 4. mid
         sample = self.mid_block(
             sample, emb, encoder_hidden_states=encoder_hidden_states
         )
 
-        if mid_block_additional_residual is not None:
-            sample += mid_block_additional_residual
-
         # 5. up
         for upsample_block in self.up_blocks:
             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
index 897025660..9eaa6e0b1 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_blocks.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -731,6 +731,7 @@ def __init__(
             )
         ]
         attentions = []
+
         for _ in range(num_layers):
             attentions.append(
                 AttentionBlock(
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
index f2bea6a43..924c7257a 100644
--- a/examples/05_stable_diffusion/src/modeling/vae.py
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -144,6 +144,7 @@ def __init__(
         )
 
     def decode(self, z: Tensor, return_dict: bool = True):
+
         z = self.post_quant_conv(z)
         dec = self.decoder(z)
         return dec
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index a89f43109..7dace1275 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -108,7 +108,6 @@ def __init__(
         self.vae_ait_exe = self.init_ait_module(
             model_name="AutoencoderKL", workdir=workdir
         )
-        self.batch = 1
 
     def init_ait_module(
         self,
@@ -133,13 +132,12 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
-            shape[0] = self.batch * 2
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         noise_pred = ys[0].permute((0, 3, 1, 2)).float()
         return noise_pred
 
-    def clip_inference(self, input_ids, seqlen=77):
+    def clip_inference(self, input_ids, seqlen=64):
         exe_module = self.clip_ait_exe
         bs = input_ids.shape[0]
         position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
@@ -151,7 +149,6 @@ def clip_inference(self, input_ids, seqlen=77):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
-            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         return ys[0].float()
@@ -163,7 +160,6 @@ def vae_inference(self, vae_input):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
-            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         vae_out = ys[0].permute((0, 3, 1, 2)).float()
@@ -200,7 +196,7 @@ def __call__(
                 expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined  as `w` of equation 2. of [Imagen
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
@@ -258,13 +254,11 @@ def __call__(
                 f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
             )
 
-        self.batch = batch_size
-
         # get prompt text embeddings
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
-            max_length=self.tokenizer.model_max_length,
+            max_length=64,  # self.tokenizer.model_max_length,
             truncation=True,
             return_tensors="pt",
         )
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index 025688309..d6f3013ce 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -445,11 +445,9 @@
     throw std::runtime_error("the number of inputs must >= 1!");
   }
 
-
   for ({{index_type}} i = 0; i < rank; i++) {
     if (i == concat_dim) continue;
     {{index_type}} dim = real_input_shapes[0][i];
-
     for ({{index_type}} j = 1; j < num_real_inputs; j++) {
       if (real_input_shapes[j][i] != dim) {
         throw std::runtime_error(
diff --git a/python/aitemplate/backend/common/tensor/batch_gather_common.py b/python/aitemplate/backend/common/tensor/batch_gather_common.py
index 2e05c0997..97e8aee77 100644
--- a/python/aitemplate/backend/common/tensor/batch_gather_common.py
+++ b/python/aitemplate/backend/common/tensor/batch_gather_common.py
@@ -36,8 +36,7 @@
 
 {{func_signature}}
 {
-    const int64_t gather_size = *batch_size * batch_num;
-    batch_gather_launcher<{{dtype}}, int64_t>(stream, gather_size, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
+    batch_gather_launcher<{{dtype}}, int64_t>(stream, batch_num, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
 }
     """
 )
@@ -47,7 +46,6 @@
 void {{func_name}}(void* output,
                    const void* input,
                    const int64_t* indices,
-                   const {{index_type}}* batch_size,
                    const {{index_type}} batch_num,
                    const {{index_type}} indices_num,
                    const {{index_type}} instance_size,
@@ -67,7 +65,6 @@
     """
 {{indent}}{{func_name}}(
 {{indent}}   {{output}}, {{input}}, {{indices}},
-{{indent}}    {{batch_size}},
 {{indent}}    {{batch_num}},
 {{indent}}    {{indices_num}},
 {{indent}}    {{instance_size}},
@@ -171,7 +168,7 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
 
     axis = len(ind_shape) - 1
     batch_num = 1
-    for i in range(1, axis):
+    for i in range(axis):
         batch_num *= yshape[i]._attrs["values"][0]
 
     indices_num = yshape[axis]._attrs["values"][0]
@@ -187,7 +184,6 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
         output=output_name,
         input=input_name,
         indices=indices_name,
-        batch_size="&" + xshape[0]._attrs["name"],
         batch_num=batch_num,
         indices_num=indices_num,
         instance_size=instance_size,
diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py
index c1b94a217..8e8310229 100644
--- a/python/aitemplate/backend/common/upsampling2d_common.py
+++ b/python/aitemplate/backend/common/upsampling2d_common.py
@@ -313,9 +313,7 @@
     {{index_type}}* out_w,
     {{prefix}}Stream_t stream
 ) {
-
   {{shape_function}}
-
   {{exec_paths}}
   throw std::runtime_error(
       "Unsupported workload for this bilinear upsampling specialization."
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 5db6982aa..39a44ebd3 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -31,10 +31,6 @@
                           void* gamma,
                           void* beta,
                           int N,
-                          int64_t* H,
-                          int64_t* W,
-                          int64_t* HO,
-                          int64_t* WO,
                           const float eps,
                           const int max_smem_size,
                           void* workspace,
@@ -53,8 +49,6 @@
 {{indent}}{
 {{indent}}  {{func_name}}(
 {{indent}}     {{output}}, {{input}}, {{gamma}}, {{beta}}, {{N}},
-{{indent}}     {{H}}, {{W}},
-{{indent}}     {{HO}}, {{WO}},
 {{indent}}     {{eps}}, max_smem_size_, global_workspace_,
 {{indent}}  stream /* default stream */
 {{indent}}  );
@@ -93,16 +87,13 @@
 
 {{func_signature}}
 {
-    *HO = *H;
-    *WO = *W;
-    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{C}}, {{G}}>(
+
+    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
             static_cast<{{elem_input_type}}*>(output),
             static_cast<{{elem_input_type}}*>(input),
             static_cast<{{elem_input_type}}*>(gamma),
             static_cast<{{elem_input_type}}*>(beta),
             N,
-            H,
-            W,
             eps,
             max_smem_size,
             workspace,
@@ -147,6 +138,8 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     use_swish = True if "swish" in func_attrs["name"] else False
     input_shape = func_attrs["inputs"][0].shape()
 
+    H = input_shape[1].value()
+    W = input_shape[2].value()
     C = input_shape[3].value()
     G = func_attrs["num_groups"]
 
@@ -164,6 +157,8 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         elem_input_type=elem_input_type,
         FuseSwish="true" if use_swish else "false",
+        H=H,
+        W=W,
         C=C,
         G=G,
     )
@@ -185,7 +180,6 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
     output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = get_input_names(func_attrs)
     input_shape = func_attrs["inputs"][0]._attrs["shape"]
-    output_shape = func_attrs["outputs"][0]._attrs["shape"]
     eps = func_attrs["eps"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
@@ -194,10 +188,6 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
         gamma=gamma_name,
         beta=beta_name,
         N=input_shape[0]._attrs["name"],
-        H="&" + input_shape[1]._attrs["name"],
-        W="&" + input_shape[2]._attrs["name"],
-        HO="&" + output_shape[1]._attrs["name"],
-        WO="&" + output_shape[2]._attrs["name"],
         eps=eps,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index 0ccc9e105..b868849e4 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -477,6 +477,8 @@ struct TInputHelper<bfloat16> {
 template <
     typename TInput,
     bool FuseSwish,
+    int H,
+    int W,
     int C,
     int C_G,
     int ILP = 8,
@@ -488,8 +490,6 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
     TInput* gamma,
     TInput* beta,
     int N,
-    int H,
-    int W,
     float epsilon) {
   constexpr int C_G_2 = C_G / 2;
   constexpr int C_G_stride = C_G_2 + BANK_CONFLICT;
@@ -983,15 +983,13 @@ void DispatchGroupNormForwardGpu(
   }
 }
 
-template <typename TInput, bool FuseSwish, int C, int G>
+template <typename TInput, bool FuseSwish, int H, int W, int C, int G>
 cudaError_t invokeGroupNorm(
     TInput* output,
     TInput* input,
     TInput* gamma,
     TInput* beta,
     int N,
-    const int64_t* height,
-    const int64_t* width,
     const float eps,
     const int max_smem_size,
     void* workspace,
@@ -1000,9 +998,6 @@ cudaError_t invokeGroupNorm(
   constexpr auto C_G_2 = C_G / 2;
   constexpr int ILP = 8;
 
-  int64_t H = *height;
-  int64_t W = *width;
-
   const int64_t num_instances = N * G;
   const int64_t norm_size = H * W * C / G;
   const int64_t spatial_size = H * W;
@@ -1016,25 +1011,46 @@ cudaError_t invokeGroupNorm(
   // Bank conflict doesn't seem to matter to perf
   constexpr int BANK_CONFLICT = 0;
 
-  const auto smem = H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
+  constexpr auto smem =
+      H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
 
   // C_G must be even, or we can have misaligned address for cp.async
   // reserve some shared_mem for block reduction
-  if (H > 0 && H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
-    constexpr int num_threads = 1024;
-    auto kernel_func = group_norm_smem<
-        TInput,
-        FuseSwish,
-        C,
-        C_G,
-        ILP,
-        BANK_CONFLICT,
-        num_threads>;
-    GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
-        kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
-    dim3 block(num_threads);
-    kernel_func<<<dim3(G, N), block, smem, stream>>>(
-        input, output, gamma, beta, N, H, W, eps);
+  if (H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
+    constexpr int num_threads = std::min(1024, H / ILP * W * C_G_2);
+
+    if constexpr (num_threads > 0) {
+      auto kernel_func = group_norm_smem<
+          TInput,
+          FuseSwish,
+          H,
+          W,
+          C,
+          C_G,
+          ILP,
+          BANK_CONFLICT,
+          num_threads>;
+      GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
+          kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+      dim3 block(num_threads);
+      kernel_func<<<dim3(G, N), block, smem, stream>>>(
+          input, output, gamma, beta, N, eps);
+    } else {
+      DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
+          stream,
+          num_instances,
+          norm_size,
+          channel_size,
+          spatial_size,
+          epsilon,
+          input,
+          gamma,
+          beta,
+          output,
+          static_cast<float*>(workspace),
+          static_cast<float*>(workspace) + num_instances,
+          channels_first);
+    }
   } else {
     DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
         stream,
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 5ee4f09fd..07eabe9e5 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -287,11 +287,6 @@ def compile_model(
             )
             _LOGGER.info(f"folded constants elapsed time: {elapsed_dt_sec(start_t)}")
 
-            compiler.transform.dedup_symbolic_name(graph)
-            graph_utils.dump_graph_debug_str_to_file(
-                graph, test_dir, "dedup_symbolic_name"
-            )
-
             (
                 max_blob,
                 max_constant_blob,
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index 55d2e9e28..cb9b86832 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -26,6 +26,7 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.utils import shape_utils
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -119,9 +120,9 @@ def unique(vector):
         batch_info = x._attrs["shape"][0]
         output_shape = [
             batch_info,
-            x._attrs["shape"][2],
-            x._attrs["shape"][1],
-            w._attrs["shape"][-1],
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
         return output_shape
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index c5c174629..b48a94519 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -331,10 +331,9 @@ def _infer_shapes(self, x: Tensor):
                             y_shapes.append(IntImm(int(dynamic_symbol)))
                         else:
                             symbol_names = {s.name for s in dynamic_symbol.free_symbols}
-                            unknown_symbols = symbol_names - get_global_symbol_set()
                             assert (
-                                not unknown_symbols
-                            ), f"Unable to deduce dynamic symbol, because the following symbols are not in global symbol set: {unknown_symbols}"
+                                len(symbol_names - get_global_symbol_set()) == 0
+                            ), "Unable to deduce dynamic symbol"
 
                             values = simplify_intvar_values(dynamic_symbol)
                             new_var = IntVar(values, symbolic_value=dynamic_symbol)
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index ce2024559..fae05dad9 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -20,7 +20,6 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
-
 # pylint: disable=C0103
 class conv2d_bias_activation(conv2d):
     """Base class of conv2d with bias + activation."""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 23a999ab2..b620e8f3b 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -22,7 +22,7 @@
 from collections import OrderedDict
 from hashlib import sha1
 from operator import itemgetter
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List
 
 import jinja2
 
@@ -166,18 +166,12 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
         Parameters
         ----------
-        stride : int or tuple of two ints
-            Stride of the convolution. If tuple is
-            provided, the elements correspond to height and width stride
-            respectively
-        pad : int or tuple of two ints
-            Size of padding to add to the input. If tuple is
-            provided, the elements correspond to height and width padding
-            respectively
-        dilate : int or tuple of two ints, optional
-            Size of spacing between kernel elements, by default 1. If tuple is
-            provided, the elements correspond to height and width dilation
-            respectively
+        stride : int
+            Stride of the convolution
+        pad : int
+            Size of padding to add to the input
+        dilate : int, optional
+            Size of spacing between kernel elements, by default 1
         group : int, optional
            Number of blocked connections from input
             channels to output channels, by default 1
@@ -201,19 +195,24 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
     def _get_params_factory(self):
         params_factory = {}
-        # Ensure convolutional parameters are in form (val_h, val_w)
-        params_factory["strideh"], params_factory["stridew"] = _maybe_int_to_tuple(
-            self._attrs["stride"],
-            "Stride",
-        )
-        params_factory["padh"], params_factory["padw"] = _maybe_int_to_tuple(
-            self._attrs["pad"],
-            "Pad",
-        )
-        params_factory["dilateh"], params_factory["dilatew"] = _maybe_int_to_tuple(
-            self._attrs["dilate"],
-            "Dilation",
-        )
+        if isinstance(self._attrs["stride"], int):
+            params_factory["strideh"] = self._attrs["stride"]
+            params_factory["stridew"] = self._attrs["stride"]
+        else:
+            params_factory["strideh"] = self._attrs["stride"][0]
+            params_factory["stridew"] = self._attrs["stride"][1]
+        if isinstance(self._attrs["pad"], int):
+            params_factory["padh"] = self._attrs["pad"]
+            params_factory["padw"] = self._attrs["pad"]
+        else:
+            params_factory["padh"] = self._attrs["pad"][0]
+            params_factory["padw"] = self._attrs["pad"][1]
+        if isinstance(self._attrs["dilate"], int):
+            params_factory["dilateh"] = self._attrs["dilate"]
+            params_factory["dilatew"] = self._attrs["dilate"]
+        else:
+            params_factory["dilateh"] = self._attrs["dilate"][0]
+            params_factory["dilatew"] = self._attrs["dilate"][1]
         return params_factory
 
     def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
@@ -264,21 +263,6 @@ def unique(vector):
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
-
-        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
-        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
-
-        # Ensure convolutional parameters are in form (val_h, val_w)
-        dilate_h, dilate_w = _maybe_int_to_tuple(self._attrs["dilate"], "Dilation")
-        stride_h, stride_w = _maybe_int_to_tuple(self._attrs["stride"], "Stride")
-        pad_h, pad_w = _maybe_int_to_tuple(self._attrs["pad"], "Pad")
-        KHEff = (w_shape[1] - 1) * dilate_h + 1
-        KWEff = (w_shape[2] - 1) * dilate_w + 1
-        out_h = (in_h + 2 * pad_h - KHEff) // stride_h + 1
-        out_w = (in_w + 2 * pad_w - KWEff) // stride_w + 1
-        output_shape[1]._attrs["symbolic_value"] = out_h
-        output_shape[2]._attrs["symbolic_value"] = out_w
-
         return output_shape
 
     def _invert_exec_key(self, key):
@@ -786,11 +770,3 @@ def gen_function(self) -> str:
             self.shape_eval_template,
             self.shape_save_template,
         )
-
-
-def _maybe_int_to_tuple(x: Union[int, Tuple[int, int]], name: str) -> Tuple[int, int]:
-    if isinstance(x, int):
-        return x, x
-    if isinstance(x, tuple) and len(x) == 2:
-        return x
-    raise ValueError(f"{name} should be either int or tuple of 2 ints, but got {x}")
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 47fb2dde9..fd31bb5d7 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -15,7 +15,6 @@
 """
 Operator definition for groupnorm.
 """
-import itertools
 import logging
 import os
 import re
@@ -40,7 +39,6 @@
 from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
 from aitemplate.testing import detect_target
-from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
@@ -55,19 +53,6 @@
 """
 )
 
-SHAPE_FUNC_TEMPLATE = jinja2.Template(
-    """
-{{indent}}{{dtype}}NI = {{x_dim0}};
-{{indent}}{{dtype}}HI = {{x_dim1}};
-{{indent}}{{dtype}}WI = {{x_dim2}};
-{{indent}}{{dtype}}CI = {{x_dim3}};
-{{indent}}{{dtype}}NO = NI;
-{{indent}}{{dtype}}HO = HI;
-{{indent}}{{dtype}}WO = WI;
-{{indent}}{{dtype}}CO = {{x_dim3}};
-"""
-)
-
 
 class group_norm(Operator):
     """Standalone group norm op.
@@ -83,7 +68,6 @@ def __init__(self, num_groups: int, num_channels: int) -> None:
             self._attrs["has_profiler"] = True
         self._attrs["num_channels"] = num_channels
         self._attrs["workspace"] = 0
-        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
 
     @staticmethod
     def check_shapes(x_shapes, gamma_shapes, beta_shapes, num_groups):
@@ -128,51 +112,8 @@ def _sanity_check(self, x, gamma, beta):
 
     def _infer_shapes(self, x: Tensor):
         """Infer shapes for groupnorm."""
-        return x._attrs["shape"]
 
-    def _infer_shape(self, x: List[int]):
-        eval_func = self.shape_eval_template.render(
-            indent="",
-            dtype="",
-            div="//",
-            x_dim0=x[0],
-            x_dim1=x[1],
-            x_dim2=x[2],
-            x_dim3=x[3],
-        )
-        output = {}
-        exec(eval_func, output)  # noqa: P204
-        return [
-            int(output["NO"]),
-            int(output["HO"]),
-            int(output["WO"]),
-            int(output["CO"]),
-        ]
-
-    def _infer_shapes_v2(self, x: Tensor):
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        # run infershape for each
-        y_shapes = []
-        for x_shape in x_shapes:
-            y_shape = self._infer_shape(x_shape)
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
-        output_shape = [
-            x.shape()[0],
-            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
-        ]
-
-        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
-        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
-        output_shape[1]._attrs["symbolic_value"] = in_h
-        output_shape[2]._attrs["symbolic_value"] = in_w
-        return output_shape
+        return x._attrs["shape"]
 
     def __call__(
         self,
diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index c522eb9c2..1182e32b5 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -55,7 +55,7 @@ def __init__(self) -> None:
 
     def _infer_shape(self, x: List[int], indices: List[int]):
         rank = len(indices)
-        for r in range(1, rank - 1):
+        for r in range(rank - 1):
             assert x[r] == indices[r]
         output = list(x)
         output[rank - 1] = indices[-1]
@@ -81,12 +81,6 @@ def unique(vector):
             output_shape.append(
                 shape_utils.gen_int_var(unique([d[idx] for d in y_shapes]))
             )
-        if len(indices.shape()) > 1:
-            # Generally output has the same batch dimension as input
-            output_shape[0] = x.shape()[0]
-        else:
-            # Special case: gather happens along batch dimension
-            output_shape[0] = indices.shape()[0]
         return output_shape
 
     def __call__(self, x: Tensor, indices: Tensor) -> Tensor:
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index 0d36a2c37..d207bdee1 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -105,8 +105,7 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             else:
                 output_dim = input_shapes[0][idx]
                 for shape in input_shapes:
-                    # if output_dim != shape[idx]:
-                    if output_dim._attrs["values"] != shape[idx]._attrs["values"]:
+                    if output_dim != shape[idx]:
                         raise RuntimeError(
                             "tensors expected to have the same dimensions "
                             "except concat_dim! dim: {}, shape1: {}, shape2: {}, inputs: {}".format(
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index edf87bba0..59b94e0b0 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -121,20 +121,11 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            x.shape()[0],
+            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
-
-        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
-        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
-        out_h = in_h * int(self._attrs["scale_factor"])
-        out_w = in_w * int(self._attrs["scale_factor"])
-
-        output_shape[1]._attrs["symbolic_value"] = out_h
-        output_shape[2]._attrs["symbolic_value"] = out_w
-
         return output_shape
 
     def _invert_exec_key(self, key):
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 7faa8da64..94fea4a45 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,8 +18,7 @@
 import re
 from typing import List
 
-from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, JaggedIntVar, Tensor
-from aitemplate.utils import graph_utils
+from aitemplate.compiler.base import IntImm, IntVarTensor, JaggedIntVar, Tensor
 
 # pylint: disable=C0103
 
@@ -140,6 +139,7 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
 
 def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """Rename all shape variable that are identical to the same name.
+
     Parameters
     ----------
     sorted_graph : List[Tensor]
@@ -147,49 +147,19 @@ def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """
     symbolic_to_name = {}
     global user_provided_dim
-    # First pass - build symbolic_to_name map
-    for i, dim in _all_dims_in_graph(sorted_graph):
-        if not _dim_qualified_for_sym_dedup(dim):
-            continue
-        dim_sym = dim.symbolic_value()
-        if (
-            dim_sym not in symbolic_to_name
-            or dim_sym in symbolic_to_name
-            and dim._attrs["name"] in user_provided_dim
-        ):
-            symbolic_to_name[dim_sym] = dim._attrs["name"] or f"dim_{i}"
-
-    # Second pass - use symbolic_to_name map
-    for _, dim in _all_dims_in_graph(sorted_graph):
-        if not _dim_qualified_for_sym_dedup(dim):
-            continue
-        dim_sym = dim.symbolic_value()
-        dim._attrs["name"] = symbolic_to_name[dim_sym]
-
-
-def _all_dims_in_graph(sorted_graph: List[Tensor]):
-    dim_idx = 0
     for node in sorted_graph:
         for dim in node._attrs["shape"]:
-            yield dim_idx, dim
-            dim_idx += 1
-
-    # In case some dimensions are not encountered in any nodes in the graph,
-    # only in input/output accessors - iterate over all ops and dimensions
-    # in tensor accessors, if any.
-    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-    for op in sorted_ops:
-        input_accessors = op._attrs.get("input_accessors", None)
-        output_accessors = op._attrs.get("output_accessors", None)
-        for accessors in (input_accessors, output_accessors):
-            if accessors is None:
-                continue
-            for ta in accessors:
-                if ta.original_shapes:
-                    for dim in ta.original_shapes:
-                        yield dim_idx, dim
-                        dim_idx += 1
-
-
-def _dim_qualified_for_sym_dedup(dim: IntVar) -> bool:
-    return not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar)
+            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
+                dim_sym = dim.symbolic_value()
+                if (
+                    dim_sym not in symbolic_to_name
+                    or dim_sym in symbolic_to_name
+                    and dim._attrs["name"] in user_provided_dim
+                ):
+                    symbolic_to_name[dim_sym] = dim._attrs["name"]
+
+    for node in sorted_graph:
+        for dim in node._attrs["shape"]:
+            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
+                dim_sym = dim.symbolic_value()
+                dim._attrs["name"] = symbolic_to_name[dim_sym]
diff --git a/python/aitemplate/compiler/transform/transform_permutations.py b/python/aitemplate/compiler/transform/transform_permutations.py
index cfda924eb..ca6488e3e 100644
--- a/python/aitemplate/compiler/transform/transform_permutations.py
+++ b/python/aitemplate/compiler/transform/transform_permutations.py
@@ -15,6 +15,7 @@
 from typing import List
 
 import numpy as np
+
 from aitemplate.compiler.base import Operator, Tensor
 from aitemplate.compiler.transform import transform_utils
 
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index a1a7075b8..091f7d81a 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -327,6 +327,8 @@ def __init__(
         self.causal = causal
         self.has_residual = has_residual
         self.dim = dim
+        self.seqlen = seq_len
+        self.seqlen_kv = seq_len_kv
 
         self.op = ops.mem_eff_attention(causal=causal)
 
@@ -351,7 +353,8 @@ def __init__(
         self.proj_drop = Dropout(proj_drop)
 
     def attention(self, q, k, v):
-        batch = q.shape()[0]
+        seqlen = self.seqlen
+        seqlen_kv = self.seqlen_kv
         head_dim = self.dim // self.num_heads
 
         query = self.proj_q(q)
@@ -359,13 +362,13 @@ def attention(self, q, k, v):
         value = self.proj_v(v)
 
         query = ops.permute()(
-            ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(query, [-1, seqlen, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         key = ops.permute()(
-            ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(key, [-1, seqlen_kv, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         value = ops.permute()(
-            ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
+            ops.reshape()(value, [-1, seqlen_kv, self.num_heads, head_dim]),
             [0, 2, 1, 3],
         )
         return self.op(query, key, value)
@@ -374,9 +377,9 @@ def forward(self, *args):
         """forward pass for calling mha module"""
         assert len(args) >= 3
         x = args[0]
-        batch = x.shape()[0]
+        seq = self.seqlen
         attn_output = self.attention(args[0], args[1], args[2])
-        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
+        attn_output = ops.reshape()(attn_output, [-1, seq, self.dim])
 
         if self.has_residual:
             assert len(args) == 4
@@ -384,7 +387,7 @@ def forward(self, *args):
         else:
             x = self.proj(attn_output)
         x = self.proj_drop(x)
-        x = ops.reshape()(x, [batch, -1, self.dim])
+        x = ops.reshape()(x, [-1, seq, self.dim])
         return x
 
 
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
index 2a1e0779e..68c9aefdf 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
@@ -65,7 +65,7 @@ def __init__(
         in_channels,
         out_channels,
         kernel_size,
-        stride=1,
+        stride,
         padding=0,
         dilation=1,
         groups=1,
diff --git a/tests/unittest/ops/test_batch_gather.py b/tests/unittest/ops/test_batch_gather.py
index 21dbd1618..4c210af1a 100644
--- a/tests/unittest/ops/test_batch_gather.py
+++ b/tests/unittest/ops/test_batch_gather.py
@@ -103,14 +103,21 @@ def test_batch_gather(self):
         self._test_batch_gather(
             shape=(2, 2), ind_shape=(2, 1), dim=1, max_ind=2, test_name="batch_gather3"
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
-    def test_float32(self):
         self._test_batch_gather(
             shape=(8, 4, 4, 2, 2),
             ind_shape=(8, 4, 1),
             dim=2,
             max_ind=4,
+            test_name="batch_gather4",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    def test_float32(self):
+        self._test_batch_gather(
+            shape=(8, 2, 2),
+            ind_shape=(2,),
+            dim=0,
+            max_ind=8,
             test_name="batch_gather_f32",
             dtype="float32",
         )
@@ -173,20 +180,20 @@ def _test_batch_gather_topk(
 
         self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
 
-    # def test_batch_gather_topk(self):
-    #     self._test_batch_gather_topk(
-    #         shape=(4, 1, 1), N=2000, topK=300, test_name="batch_gather_topk"
-    #     )
-
-    # @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
-    # def test_float32(self):
-    #     self._test_batch_gather_topk(
-    #         shape=(4, 1, 1),
-    #         N=2000,
-    #         topK=300,
-    #         test_name="batch_gather_topk_f32",
-    #         dtype="float32",
-    #     )
+    def test_batch_gather_topk(self):
+        self._test_batch_gather_topk(
+            shape=(4, 1, 1), N=2000, topK=300, test_name="batch_gather_topk"
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported by ROCM.")
+    def test_float32(self):
+        self._test_batch_gather_topk(
+            shape=(4, 1, 1),
+            N=2000,
+            topK=300,
+            test_name="batch_gather_topk_f32",
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index a7560f661..94073251e 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -425,10 +425,10 @@ def test_concatenate_shape_compatible(self):
         in_shapes = [[var1, 2, 3], [dup_var1, 2, 3]]
         self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
-        # var2 = IntVar(values=[1, 2])
-        # with self.assertRaises(RuntimeError):
-        #     in_shapes = [[var1, 2, 3], [var2, 2, 3]]
-        #     self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+        var2 = IntVar(values=[1, 2])
+        with self.assertRaises(RuntimeError):
+            in_shapes = [[var1, 2, 3], [var2, 2, 3]]
+            self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index cc67761ca..430967af3 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -242,29 +242,29 @@ def model():
         y = score_inds[index]
         np.testing.assert_allclose(y_np, y, atol=1e-2, rtol=1e-2)
 
-    # @unittest.skipIf(detect_target().name() == "rocm", "not supported in ROCm")
-    # def test_topk_nms_fp16(self):
-    #     self._test_topk_nms(
-    #         test_name="topk_nms_fp16",
-    #         dtype="float16",
-    #     )
-    #     self._test_topk_nms(
-    #         test_name="topk_nms_copy_op_fp16",
-    #         copy_op=True,
-    #         dtype="float16",
-    #     )
-
-    # @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
-    # def test_topk_nms_fp32(self):
-    #     self._test_topk_nms(
-    #         test_name="topk_nms_fp32",
-    #         dtype="float32",
-    #     )
-    #     self._test_topk_nms(
-    #         test_name="topk_nms_copy_op_fp32",
-    #         copy_op=True,
-    #         dtype="float32",
-    #     )
+    @unittest.skipIf(detect_target().name() == "rocm", "not supported in ROCm")
+    def test_topk_nms_fp16(self):
+        self._test_topk_nms(
+            test_name="topk_nms_fp16",
+            dtype="float16",
+        )
+        self._test_topk_nms(
+            test_name="topk_nms_copy_op_fp16",
+            copy_op=True,
+            dtype="float16",
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float23 not supported in ROCm")
+    def test_topk_nms_fp32(self):
+        self._test_topk_nms(
+            test_name="topk_nms_fp32",
+            dtype="float32",
+        )
+        self._test_topk_nms(
+            test_name="topk_nms_copy_op_fp32",
+            copy_op=True,
+            dtype="float32",
+        )
 
 
 if __name__ == "__main__":

From 7f305888a66a1d0275f7a2d0a2c004d208d0f3fe Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Sat, 22 Apr 2023 16:42:35 -0700
Subject: [PATCH 434/638] Add name for directory created by
 test_vanilla_attention (#602)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/602

We do not re-use names for tests that creates directories.

Reviewed By: chenyang78

Differential Revision: D45201126

fbshipit-source-id: 65e418a23019f3c7b6e6c5796318a8b4a165339f
---
 tests/unittest/ops/test_vanilla_attention.py | 26 +++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/ops/test_vanilla_attention.py b/tests/unittest/ops/test_vanilla_attention.py
index bf1012edd..63149aa45 100644
--- a/tests/unittest/ops/test_vanilla_attention.py
+++ b/tests/unittest/ops/test_vanilla_attention.py
@@ -236,6 +236,7 @@ def _test_mha(
         num_heads=2,
         use_fp16_acc=False,
         benchmark_ait=False,
+        name="cross_attn_dynamic",
     ):
         pt_mod = (
             torch.nn.MultiheadAttention(
@@ -289,7 +290,7 @@ def _test_mha(
         Y = Y + inputs_ait
         mark_output(Y)
         target = detect_target(use_fp16_acc=False)
-        exe_module = compile_model(Y, target, "./tmp", "cross_attn_dynamic")
+        exe_module = compile_model(Y, target, "./tmp", name)
         for name, weight in params_ait.items():
             exe_module.set_constant_with_tensor(name, weight)
 
@@ -322,12 +323,29 @@ def _test_mha(
                 _LOGGER.info("benchmark cross-attn time: {0}".format(time_per_iter_ms))
 
     def test_cross_attn(self):
-        self._test_mha(batch_sizes=[1], seqlen=2, seqlen_kv=32, dim=512, num_heads=8)
         self._test_mha(
-            batch_sizes=[128, 256, 512], seqlen=1, seqlen_kv=62, dim=512, num_heads=8
+            batch_sizes=[1],
+            seqlen=2,
+            seqlen_kv=32,
+            dim=512,
+            num_heads=8,
+            name="single_batch",
         )
         self._test_mha(
-            batch_sizes=[1, 32, 64], seqlen=128, seqlen_kv=62, dim=512, num_heads=8
+            batch_sizes=[128, 256, 512],
+            seqlen=1,
+            seqlen_kv=62,
+            dim=512,
+            num_heads=8,
+            name="batches_seq_1",
+        )
+        self._test_mha(
+            batch_sizes=[1, 32, 64],
+            seqlen=128,
+            seqlen_kv=62,
+            dim=512,
+            num_heads=8,
+            name="batches_seq_128",
         )
 
 
From d17d5f2491b0359ec6fbd150816335e34d520ad4 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Sun, 23 Apr 2023 05:18:16 -0700
Subject: [PATCH 435/638] Fix MSVC compiler narrowing conversion errors for
 cuda/gemm_epilogue_universal (#604)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/604

cutlass::gemm::GemmCoord uses int values as coordinates under the hood, while AIT might use int64_t variables in {M, N, K} constructor. So, narrowing conversion is needed.

Reviewed By: chenyang78

Differential Revision: D45213076

fbshipit-source-id: f0c68cacf222fbcaf6632bd3d2d7ea65c65c2571
---
 .../aitemplate/backend/cuda/gemm_universal/common.py |  1 +
 .../backend/cuda/gemm_universal/gemm_rcr.py          | 12 ++++++++++--
 .../backend/cuda/gemm_universal/gemm_rcr_bias.py     | 12 ++++++++++--
 .../cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py   |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py        |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_hardswish.py   |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py        |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_sigmoid.py     |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_swish.py       |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py        |  6 +++++-
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py        |  6 +++++-
 .../backend/cuda/gemm_universal/gemm_rcr_permute.py  |  6 +++++-
 .../backend/cuda/gemm_universal/gemm_rrr.py          |  6 +++++-
 .../backend/cuda/gemm_universal/gemm_rrr_permute.py  |  6 +++++-
 .../backend/cuda/gemm_universal/group_common.py      |  2 +-
 15 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index efd3b6b4d..0f23cb1ee 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -216,6 +216,7 @@
 //  TODO: cast to right dtype
 {{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator;
 
+{{indent}}using coord_t = cutlass::gemm::GemmCoord::Index;
 {{indent}}typename {{instance}}::Arguments arguments{
 
 {{problem_args}}
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index ba598b965..9ea72e04d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -48,7 +48,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
@@ -71,7 +75,11 @@
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index 7f12c6961..b78fed2d7 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -31,7 +31,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
@@ -54,7 +58,11 @@
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index f0757d202..c13197ffa 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -63,7 +63,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index b7dcfb475..24f7691b3 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -27,7 +27,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index dd8ab1177..f565b897b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -27,7 +27,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index 83643889b..3907d35dc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -28,7 +28,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                                            // GemmUniversalMode mode
-    {M, N, K},                                                                          // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                                                  // GemmCoord problem_size
     split_k,                                                                            // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                                                      // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index 8c2adb852..a74f57aca 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -28,7 +28,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index 358616679..5b39629ea 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -28,7 +28,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 1828195de..70b5f495e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -63,7 +63,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
index f324370ec..da92c6201 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -69,7 +69,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*) a_ptr,                            // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
index 91605d166..30740cdf7 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py
@@ -46,7 +46,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 170450290..bd7cb247e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -46,7 +46,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
index 8ff8d7acb..2e4d1ed0a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py
@@ -47,7 +47,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    {M, N, K},                                               // GemmCoord problem_size
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
     split_k,                                                 // int batch_count
     {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename EpilogueOutputOp::Params epilogue
     ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index 65da16a52..b8854694a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -290,7 +290,7 @@
 {{indent}}  );
 {{indent}}}
 
-
+{{indent}}using coord_t = cutlass::gemm::GemmCoord::Index;
 {{indent}}typename {{instance}}::Arguments arguments{
 
 {{problem_args}}

From e139f40cc8157d2e8ffadd41847448869e3468e7 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Sun, 23 Apr 2023 15:30:01 -0700
Subject: [PATCH 436/638] Mask select converter (#601)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/601

ATT

Reviewed By: frank-wei, qxy11

Differential Revision: D45164678

fbshipit-source-id: afb4df5e84571f466b0f385472493aefb89344cc
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py           |  6 +++
 fx2ait/fx2ait/converters/ait_converters.py    | 13 +++++++
 .../test/converters/test_ait_masked_select.py | 38 +++++++++++++++++++
 fx2ait/fx2ait/tools/common_fx2ait.py          | 10 ++++-
 python/aitemplate/compiler/public/__init__.py |  1 +
 5 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_masked_select.py

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 4ef422ce4..a6e0c9bc7 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -3354,6 +3354,12 @@ def zeros_like(*, input, dtype=None, device=None):
     return torch.zeros_like(input=input, dtype=dtype, device=device)
 
 
+@register_acc_op_mapping(op_and_target=("call_function", torch.masked_select))
+@register_acc_op
+def masked_select(*, input, mask):
+    return torch.masked_select(input=input, mask=mask)
+
+
 ###############################################################################
 
 # Set ops as side-effectul, this prevents them from being optimized away or
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 070e001fc..9058b38da 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -45,6 +45,7 @@
     IntVar,
     IntVarTensor,
     layernorm,
+    masked_select,
     max_pool2d,
     ndhwc3to8,
     pad_last_dim,
@@ -1732,3 +1733,15 @@ def acc_ops_zeros_like(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
     return full()(input_val.shape(), 0, dtype=input_val.dtype())
+
+
+@ait_converter(acc_ops.masked_select)
+def acc_ops_masked_select(
+    target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
+    mask = kwargs["mask"]
+
+    return masked_select()(input_val, mask)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_masked_select.py b/fx2ait/fx2ait/test/converters/test_ait_masked_select.py
new file mode 100644
index 000000000..4b606194a
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_masked_select.py
@@ -0,0 +1,38 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestMaskedSelectConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param("random", torch.randn(5, 10), torch.randn(5, 10)),
+            param("all_neg", torch.zeros(5, 10), torch.ones(5, 10)),
+            param("all_pos", torch.ones(5, 10), torch.zeros(5, 10)),
+        ]
+    )
+    def test_masked_select(self, _, a, b):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+                return torch.masked_select(input=x, mask=mask)
+
+        model = TestModule().eval().half().cuda()
+        boolTensor = a > b
+
+        inputs = [torch.randn(5, 10).half().cuda(), boolTensor.cuda()]
+        self.run_test(model, inputs, expected_ops={acc_ops.masked_select})
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 7ed11362b..31356befc 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -131,7 +131,12 @@ def run_test(
 
         torch_dtype = lower_precision_to_torch_type(precision)
         mod.to(torch_dtype)
-        inputs = [inp.to(torch_dtype).contiguous() for inp in inputs]
+        inputs = [
+            inp.to(torch_dtype).contiguous()
+            if inp.dtype is not torch.bool
+            else inp.contiguous()
+            for inp in inputs
+        ]
         interp = AITInterpreter(
             mod,
             inputs,
@@ -228,7 +233,8 @@ def run_test_with_dynamic_shape(
         for use_lower_bound in [True, False]:
             inputs_list.append(
                 TensorSpec.create_inputs_from_specs(
-                    inputs_spec, use_lower_bound=use_lower_bound
+                    inputs_spec,
+                    use_lower_bound=use_lower_bound,
                 )
             )
 
diff --git a/python/aitemplate/compiler/public/__init__.py b/python/aitemplate/compiler/public/__init__.py
index d4894e027..fcd5b2e45 100644
--- a/python/aitemplate/compiler/public/__init__.py
+++ b/python/aitemplate/compiler/public/__init__.py
@@ -71,6 +71,7 @@
 from aitemplate.compiler.ops.pool.avg_pool2d import avg_pool2d
 from aitemplate.compiler.ops.pool.max_pool2d import max_pool2d
 from aitemplate.compiler.ops.softmax.softmax import softmax
+from aitemplate.compiler.ops.tensor.masked_select import masked_select
 from aitemplate.compiler.ops.tensor.size import size
 from aitemplate.compiler.ops.tensor.topk import topk
 

From 26afef719b548e61931d6d3bd0f9387177f3d346 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 24 Apr 2023 01:28:36 -0700
Subject: [PATCH 437/638] Skip incompatible tests outside CI (#606)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/606

Currently, when running the unit tests outside the CI (e.g., on a devgpu or locally), test filtering is not applied. This doesn't create visible issues on A100, as all the current tests are runnable on A100 (or skipped at the class level when they are ROCm-specific). But it does create an issue on e.g. V100 (CC 70) or T4 (CC 75) hosts outside CI: some tests meant to be run on A100 are failing. E.g., see P701981848 for a `test_gemm` run on T4. As we're planning to add H100 / SM90 unit tests in the near future, this will become an issue on A100, too.

This diff extends the test filtering functions in `test_utils.py` to filter out the tests *not runnable* in the local env, even when outside the CI. E.g., `_sm80` and `_bf16` tests are filtered out on V100 and T4, and `_rocm` tests are filtered out everywhere except on ROCm. When `SM90` arch will be added to the `test_utils.py`, only the `_rocm` tests will be skipped in it, and `_sm90` tests will be skipped everywhere else.

Reviewed By: alexanderguzhva

Differential Revision: D45217518

fbshipit-source-id: 390f8a2e7324d9d753cee073e8e9155e0b63e006
---
 python/aitemplate/testing/test_utils.py | 57 ++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 283794a0e..bdbc08478 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -18,7 +18,7 @@
 import itertools
 import unittest
 from enum import Enum
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import torch
 
@@ -53,6 +53,16 @@ def _SM80_filter(method_name: str) -> bool:
 }
 
 
+# maps each test env (key) to the set of all test envs compatible with
+# it (value). "compatible" means that a tests that can run in *any*
+# env in the value Set[TestEnv] can also run in the key TestEnv.
+_COMPATIBLE_TEST_ENVS: Dict[TestEnv, Set[TestEnv]] = {
+    TestEnv.ROCM: {TestEnv.ROCM},
+    TestEnv.CUDA_LESS_THAN_SM80: {TestEnv.CUDA_LESS_THAN_SM80},
+    TestEnv.CUDA_SM80: {TestEnv.CUDA_LESS_THAN_SM80, TestEnv.CUDA_SM80},
+}
+
+
 def _get_test_env(target) -> str:
     test_env = ""
     if target.name() == "cuda":
@@ -70,31 +80,58 @@ def _get_test_env(target) -> str:
         raise RuntimeError(f"Unknown test env, target: {target.name}, {target._arch}")
     if test_env not in _TEST_ENV_TO_FILTER_METHOD:
         raise RuntimeError(f"{test_env=} not defined in _TEST_ENV_TO_FILTER_METHOD")
+    if test_env not in _COMPATIBLE_TEST_ENVS:
+        raise RuntimeError(f"{test_env=} not defined in _COMPATIBLE_TEST_ENVS")
     return test_env
 
 
+def _test_runnable_in_env(test_name: str, env: TestEnv) -> bool:
+    """Whether the test with the given name can run in the given test env."""
+    for test_env in _COMPATIBLE_TEST_ENVS[env]:
+        if _TEST_ENV_TO_FILTER_METHOD[test_env](test_name):
+            return True
+    return False
+
+
 def filter_test_cases_by_params(params: Dict[TestEnv, List[Tuple[Any]]]):
-    """Filters test cases to run by given params. Only takes effect in CI env."""
+    """Filters test cases to run by given params.
+
+    In CI, only the params corresponding to the CI's test env are kept.
+    Outside CI, the params corresponding to any test env compatible with
+    the local test env are kept.
+    """
     target = detect_target()
     test_env = _get_test_env(target)
     return (
         params.get(test_env, [])
         if target.in_ci_env()
-        else list(itertools.chain.from_iterable(params.values()))
+        else list(
+            itertools.chain.from_iterable(
+                values
+                for env, values in params.items()
+                if env in _COMPATIBLE_TEST_ENVS[test_env]
+            )
+        )
     )
 
 
 def filter_test_cases_by_test_env(cls: Type[unittest.TestCase]):
-    """Filters test cases to run by test case names implicitly. Only takes effect in CI env."""
+    """Filters test cases to run by test case names implicitly.
+
+    In CI, only the test cases filtered by the CI's test env are kept.
+    Outside CI, the test cases filtered by any test env compatible with
+    the local test env are kept.
+    """
     target = detect_target()
     test_env = _get_test_env(target)
     for attr in list(cls.__dict__.keys()):
-        if (
-            attr.startswith("test_")
-            and target.in_ci_env()
-            and (not _TEST_ENV_TO_FILTER_METHOD.get(test_env)(attr))
-        ):
-            delattr(cls, attr)
+        if attr.startswith("test_"):
+            test_name = attr
+            if target.in_ci_env():
+                if not _TEST_ENV_TO_FILTER_METHOD[test_env](test_name):
+                    delattr(cls, attr)
+            elif not _test_runnable_in_env(test_name, test_env):
+                delattr(cls, attr)
 
 
 def _get_torch_tensor(torch_fn, shape, dtype):

From 3b19c9b067d32bae41e3bb5aabc3500f9c782e43 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 24 Apr 2023 06:45:34 -0700
Subject: [PATCH 438/638] Add env_variables context manager to test_utils
 (#607)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/607

A new context manager, `env_variables`, is added to the `test_utils` to facilitate temporary setting (or unsetting) of one or more environment variables for the block of code within the context manager. On exit from the context manager, the state (as set or not set) of the environment variables before entering the context manager is restored.

To temporarily set the environment variable `FIRST_ENV_VAR=1` for the code in `do_something()`:

```
with env_variables(FIRST_ENV_VAR="1"):
    do_something()
```

To temporarily unset (== remove) the environment variable `SECOND_ENV_VAR` for the code in `do_something()`:

```
with env_variables(SECOND_ENV_VAR=None):
    do_something()
```

To do both of the above simultaneously:

```
with env_variables(
    FIRST_ENV_VAR="1",
    SECOND_ENV_VAR=None,
):
    do_something()
```

On exit from the `with env_variables(...): ...` block, both `FIRST_ENV_VAR` and `SECOND_ENV_VAR` are returned to their previous state.

Reviewed By: alexanderguzhva

Differential Revision: D45218884

fbshipit-source-id: 4dddee7bcbd7e7e9829ef1b43ac0ca446b763167
---
 python/aitemplate/testing/test_utils.py       | 22 ++++++
 .../ops/test_conv3d_profiler_cache.py         | 68 +++++++------------
 .../unittest/ops/test_conv_profiler_cache.py  | 66 +++++++-----------
 .../unittest/ops/test_gemm_profiler_cache.py  | 66 +++++++-----------
 4 files changed, 98 insertions(+), 124 deletions(-)

diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index bdbc08478..16321d547 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -15,7 +15,9 @@
 """
 Utils for unit tests.
 """
+import contextlib
 import itertools
+import os
 import unittest
 from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
@@ -134,6 +136,26 @@ def filter_test_cases_by_test_env(cls: Type[unittest.TestCase]):
                 delattr(cls, attr)
 
 
+@contextlib.contextmanager
+def env_variables(**kwargs):
+    """CM for temporarily setting (or removing) environment variables."""
+    old_values = {name: os.environ.get(name, None) for name in kwargs}
+
+    try:
+        for name, new_value in kwargs.items():
+            if new_value is not None:
+                os.environ[name] = str(new_value)
+            elif name in os.environ:
+                os.environ.pop(name)
+        yield
+    finally:
+        for name, old_value in old_values.items():
+            if old_value is not None:
+                os.environ[name] = old_value
+            elif name in os.environ:
+                os.environ.pop(name)
+
+
 def _get_torch_tensor(torch_fn, shape, dtype):
     dtype = normalize_dtype(dtype)
     return torch_fn(shape, device="cuda", dtype=string_to_torch_dtype(dtype))
diff --git a/tests/unittest/ops/test_conv3d_profiler_cache.py b/tests/unittest/ops/test_conv3d_profiler_cache.py
index a10c20eb1..48e7622f8 100644
--- a/tests/unittest/ops/test_conv3d_profiler_cache.py
+++ b/tests/unittest/ops/test_conv3d_profiler_cache.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 #
 import logging
-import os
 import tempfile
 import unittest
 from unittest.mock import patch
@@ -24,6 +23,7 @@
 from aitemplate.compiler.base import DynamicProfileStrategy
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import env_variables
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -74,7 +74,7 @@ def _test(
                 Y,
                 target,
                 "./tmp",
-                "conv3d",
+                test_name,
                 dynamic_profiling_strategy=DynamicProfileStrategy.HINTS,
             )
 
@@ -87,25 +87,15 @@ def _run_test(
         logger,
         cache_dir,
     ):
-        old_trick = os.environ.get("TRICK_CI_ENV", None)
-        old_cache = os.environ.get("CACHE_DIR", None)
-        try:
-            os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
+        with env_variables(
+            TRICK_CI_ENV="1",
+            CACHE_DIR=f"{cache_dir}/{test_name}",
+        ):
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
                 test_name=test_name,
             )
-        finally:
-            if old_trick is not None:
-                os.environ["TRICK_CI_ENV"] = old_trick
-            else:
-                os.environ.pop("TRICK_CI_ENV")
-            if old_cache is not None:
-                os.environ["CACHE_DIR"] = old_cache
-            else:
-                os.environ.pop("CACHE_DIR")
 
     def test_conv3d_profiler_cache(self):
         first_dim = IntImm(4)
@@ -199,7 +189,6 @@ def test_conv3d_profiler_force_cache(self):
         test_name = "conv3d_profiler_force_cache"
         cache_version_property = "conv3d_cache_version"
 
-        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
         logger = "aitemplate.backend.profiler_cache"
         _LOGGER.info(f"running {test_name=}")
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -210,10 +199,19 @@ def test_conv3d_profiler_force_cache(self):
                 new=1,  # version
             ):
                 _LOGGER.info("force cache with no cache 1")
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
-                with self.assertRaisesRegex(
-                    RuntimeError, "force_cache is enabled but we could not find"
-                ):
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    with self.assertRaisesRegex(
+                        RuntimeError, "force_cache is enabled but we could not find"
+                    ):
+                        self._run_test(
+                            first_dim=first_dim,
+                            test_name=test_name,
+                            logger=logger,
+                            cache_dir=tmp_dirname,
+                        )
+
+                _LOGGER.info("make cache 1")
+                with env_variables(AIT_FORCE_PROFILER_CACHE=None):
                     self._run_test(
                         first_dim=first_dim,
                         test_name=test_name,
@@ -221,28 +219,14 @@ def test_conv3d_profiler_force_cache(self):
                         cache_dir=tmp_dirname,
                     )
 
-                del os.environ["AIT_FORCE_PROFILER_CACHE"]
-                _LOGGER.info("make cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
                 _LOGGER.info("force cache with no cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-        if old_force_cache is not None:
-            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
-        else:
-            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
 
     def test_conv3d_profiler_cache_dynamic(self):
         first_dim = IntVar([2, 8])
diff --git a/tests/unittest/ops/test_conv_profiler_cache.py b/tests/unittest/ops/test_conv_profiler_cache.py
index 406a2a442..94202c991 100644
--- a/tests/unittest/ops/test_conv_profiler_cache.py
+++ b/tests/unittest/ops/test_conv_profiler_cache.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 #
 import logging
-import os
 import tempfile
 import unittest
 from unittest.mock import patch
@@ -24,6 +23,7 @@
 from aitemplate.compiler.base import DynamicProfileStrategy
 from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import env_variables
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -76,25 +76,15 @@ def _run_test(
         logger,
         cache_dir,
     ):
-        old_trick = os.environ.get("TRICK_CI_ENV", None)
-        old_cache = os.environ.get("CACHE_DIR", None)
-        try:
-            os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
+        with env_variables(
+            TRICK_CI_ENV="1",
+            CACHE_DIR=f"{cache_dir}/{test_name}",
+        ):
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
                 test_name=test_name,
             )
-        finally:
-            if old_trick is not None:
-                os.environ["TRICK_CI_ENV"] = old_trick
-            else:
-                os.environ.pop("TRICK_CI_ENV")
-            if old_cache is not None:
-                os.environ["CACHE_DIR"] = old_cache
-            else:
-                os.environ.pop("CACHE_DIR")
 
     def test_conv_profiler_cache(self):
         first_dim = IntImm(4)
@@ -188,7 +178,6 @@ def test_conv_profiler_force_cache(self):
         test_name = "conv2d_profiler_force_cache"
         cache_version_property = "conv_cache_version"
 
-        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
         logger = "aitemplate.backend.profiler_cache"
         _LOGGER.info(f"running {test_name=}")
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -199,10 +188,19 @@ def test_conv_profiler_force_cache(self):
                 new=1,  # version
             ):
                 _LOGGER.info("force cache with no cache 1")
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
-                with self.assertRaisesRegex(
-                    RuntimeError, "force_cache is enabled but we could not find"
-                ):
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    with self.assertRaisesRegex(
+                        RuntimeError, "force_cache is enabled but we could not find"
+                    ):
+                        self._run_test(
+                            first_dim=first_dim,
+                            test_name=test_name,
+                            logger=logger,
+                            cache_dir=tmp_dirname,
+                        )
+
+                _LOGGER.info("make cache 1")
+                with env_variables(AIT_FORCE_PROFILER_CACHE=None):
                     self._run_test(
                         first_dim=first_dim,
                         test_name=test_name,
@@ -210,28 +208,14 @@ def test_conv_profiler_force_cache(self):
                         cache_dir=tmp_dirname,
                     )
 
-                del os.environ["AIT_FORCE_PROFILER_CACHE"]
-                _LOGGER.info("make cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
                 _LOGGER.info("force cache with no cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-        if old_force_cache is not None:
-            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
-        else:
-            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
 
     def test_conv_profiler_cache_dynamic(self):
         first_dim = IntVar([2, 8])
diff --git a/tests/unittest/ops/test_gemm_profiler_cache.py b/tests/unittest/ops/test_gemm_profiler_cache.py
index 597e2d2f8..eec488bf8 100644
--- a/tests/unittest/ops/test_gemm_profiler_cache.py
+++ b/tests/unittest/ops/test_gemm_profiler_cache.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 #
 import logging
-import os
 import tempfile
 import unittest
 from unittest.mock import patch
@@ -23,6 +22,7 @@
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import IntImm, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import env_variables
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -76,25 +76,15 @@ def _run_test(
         logger,
         cache_dir,
     ):
-        old_trick = os.environ.get("TRICK_CI_ENV", None)
-        old_cache = os.environ.get("CACHE_DIR", None)
-        try:
-            os.environ["TRICK_CI_ENV"] = "1"
-            os.environ["CACHE_DIR"] = f"{cache_dir}/{test_name}"
+        with env_variables(
+            TRICK_CI_ENV="1",
+            CACHE_DIR=f"{cache_dir}/{test_name}",
+        ):
             return self._test(
                 first_dim=first_dim,
                 logger=logger,
                 test_name=test_name,
             )
-        finally:
-            if old_trick is not None:
-                os.environ["TRICK_CI_ENV"] = old_trick
-            else:
-                os.environ.pop("TRICK_CI_ENV")
-            if old_cache is not None:
-                os.environ["CACHE_DIR"] = old_cache
-            else:
-                os.environ.pop("CACHE_DIR")
 
     def test_gemm_profiler_cache(self):
         first_dim = IntImm(4)
@@ -189,7 +179,6 @@ def test_gemm_profiler_force_cache(self):
         test_name = "gemm_rcr_profiler_force_cache"
         cache_version_property = "gemm_cache_version"
 
-        old_force_cache = os.environ.get("AIT_FORCE_PROFILER_CACHE", None)
         logger = "aitemplate.backend.profiler_cache"
         _LOGGER.info(f"running {test_name=}")
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -200,10 +189,19 @@ def test_gemm_profiler_force_cache(self):
                 new=1,  # version
             ):
                 _LOGGER.info("force cache with no cache 1")
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
-                with self.assertRaisesRegex(
-                    RuntimeError, "force_cache is enabled but we could not find"
-                ):
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    with self.assertRaisesRegex(
+                        RuntimeError, "force_cache is enabled but we could not find"
+                    ):
+                        self._run_test(
+                            first_dim=first_dim,
+                            test_name=test_name,
+                            logger=logger,
+                            cache_dir=tmp_dirname,
+                        )
+
+                _LOGGER.info("make cache 1")
+                with env_variables(AIT_FORCE_PROFILER_CACHE=None):
                     self._run_test(
                         first_dim=first_dim,
                         test_name=test_name,
@@ -211,28 +209,14 @@ def test_gemm_profiler_force_cache(self):
                         cache_dir=tmp_dirname,
                     )
 
-                del os.environ["AIT_FORCE_PROFILER_CACHE"]
-                _LOGGER.info("make cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-                os.environ["AIT_FORCE_PROFILER_CACHE"] = "1"
                 _LOGGER.info("force cache with no cache 1")
-                self._run_test(
-                    first_dim=first_dim,
-                    test_name=test_name,
-                    logger=logger,
-                    cache_dir=tmp_dirname,
-                )
-
-        if old_force_cache is not None:
-            os.environ["AIT_FORCE_PROFILER_CACHE"] = old_force_cache
-        else:
-            del os.environ["AIT_FORCE_PROFILER_CACHE"]
+                with env_variables(AIT_FORCE_PROFILER_CACHE="1"):
+                    self._run_test(
+                        first_dim=first_dim,
+                        test_name=test_name,
+                        logger=logger,
+                        cache_dir=tmp_dirname,
+                    )
 
 
 if __name__ == "__main__":

From ae495ab0ff2a2ec60d4e50881194becd710a0bce Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 24 Apr 2023 07:23:29 -0700
Subject: [PATCH 439/638] Add CUDA_SM90 test env to test_utils (#608)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/608

ATT

Reviewed By: alexanderguzhva

Differential Revision: D45218993

fbshipit-source-id: 3a34dcd48be97b1a1e74c63baacd18ebb284bb18
---
 python/aitemplate/testing/test_utils.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 16321d547..88926622b 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -35,6 +35,7 @@
 class TestEnv(Enum):
     CUDA_LESS_THAN_SM80 = 1
     CUDA_SM80 = 2
+    CUDA_SM90 = 3
     ROCM = 100
 
 
@@ -46,11 +47,20 @@ def _SM80_filter(method_name: str) -> bool:
     return method_name.endswith("bf16") or method_name.endswith("sm80")
 
 
+def _SM90_filter(method_name: str) -> bool:
+    return method_name.endswith("sm90")
+
+
 _TEST_ENV_TO_FILTER_METHOD: Dict[str, Callable[[str], bool]] = {
     TestEnv.CUDA_LESS_THAN_SM80: (
-        lambda method_name: not (_SM80_filter(method_name) or _ROCM_filter(method_name))
+        lambda method_name: not (
+            _SM80_filter(method_name)
+            or _SM90_filter(method_name)
+            or _ROCM_filter(method_name)
+        )
     ),
     TestEnv.CUDA_SM80: _SM80_filter,
+    TestEnv.CUDA_SM90: _SM90_filter,
     TestEnv.ROCM: _ROCM_filter,
 }
 
@@ -62,6 +72,11 @@ def _SM80_filter(method_name: str) -> bool:
     TestEnv.ROCM: {TestEnv.ROCM},
     TestEnv.CUDA_LESS_THAN_SM80: {TestEnv.CUDA_LESS_THAN_SM80},
     TestEnv.CUDA_SM80: {TestEnv.CUDA_LESS_THAN_SM80, TestEnv.CUDA_SM80},
+    TestEnv.CUDA_SM90: {
+        TestEnv.CUDA_LESS_THAN_SM80,
+        TestEnv.CUDA_SM80,
+        TestEnv.CUDA_SM90,
+    },
 }
 
 
@@ -72,6 +87,8 @@ def _get_test_env(target) -> str:
             test_env = TestEnv.CUDA_LESS_THAN_SM80
         elif int(target._arch) == 80:
             test_env = TestEnv.CUDA_SM80
+        elif int(target._arch) == 90:
+            test_env = TestEnv.CUDA_SM90
         else:
             raise RuntimeError(
                 f"Unknown test env, target: {target.name}, {target._arch}"

From c0a97af0197cdeff63ba97f8edaac660a9ddf85c Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@fb.com>
Date: Mon, 24 Apr 2023 10:27:12 -0700
Subject: [PATCH 440/638] Ldm update (#611)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/611

Reviewed By: henryhu6

Differential Revision: D45221299

Pulled By: terrychenism

fbshipit-source-id: 423e5e9afefaf37ccb0dfcfaf474c787469230ac
---
 .../05_stable_diffusion/scripts/compile.py    |   1 +
 examples/05_stable_diffusion/scripts/demo.py  |   9 +-
 .../scripts/demo_img2img.py                   |   1 -
 .../scripts/download_pipeline.py              |   4 +-
 examples/05_stable_diffusion/src/benchmark.py |  25 ++---
 .../05_stable_diffusion/src/benchmark_pt.py   |  14 +--
 .../src/compile_lib/compile_clip.py           |  44 ++------
 .../src/compile_lib/compile_unet.py           |  24 +++--
 .../src/compile_lib/compile_vae.py            |  53 +++++----
 .../src/compile_lib/util.py                   |   2 +-
 .../src/modeling/attention.py                 |  33 ++----
 .../05_stable_diffusion/src/modeling/clip.py  | 102 +++++++++---------
 .../src/modeling/embeddings.py                |   2 +-
 .../src/modeling/resnet.py                    |   5 +-
 .../src/modeling/unet_2d_condition.py         |  19 +++-
 .../src/modeling/unet_blocks.py               |   1 -
 .../05_stable_diffusion/src/modeling/vae.py   |   1 -
 .../src/pipeline_stable_diffusion_ait.py      |  12 ++-
 .../backend/common/concatenate_common.py      |   2 +
 .../common/tensor/batch_gather_common.py      |  11 +-
 .../backend/common/upsampling2d_common.py     |   2 +
 .../cuda/groupnorm/groupnorm_common.py        |  22 ++--
 .../cuda/groupnorm/groupnorm_kernel.cuh       |  64 +++++------
 python/aitemplate/compiler/compiler.py        |   5 +
 .../ops/attention/mem_eff_attention.py        |   7 +-
 .../compiler/ops/common/view_ops.py           |   5 +-
 .../ops/conv/common_conv2d_bias_activation.py |   1 +
 python/aitemplate/compiler/ops/conv/conv2d.py |  74 ++++++++-----
 .../compiler/ops/groupnorm/groupnorm.py       |  61 ++++++++++-
 .../compiler/ops/tensor/batch_gather.py       |   8 +-
 .../compiler/ops/tensor/concatenate.py        |   3 +-
 .../ops/upsample/upsampling_common.py         |  11 +-
 .../compiler/transform/name_graph.py          |  62 ++++++++---
 python/aitemplate/frontend/nn/attention.py    |  17 ++-
 .../frontend/nn/conv2d/conv2d_bias.py         |   2 +-
 tests/unittest/ops/test_concatenate.py        |   8 +-
 36 files changed, 430 insertions(+), 287 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile.py b/examples/05_stable_diffusion/scripts/compile.py
index 896b2432c..65032e34f 100644
--- a/examples/05_stable_diffusion/scripts/compile.py
+++ b/examples/05_stable_diffusion/scripts/compile.py
@@ -65,6 +65,7 @@ def compile_diffusers(
     compile_clip(
         pipe.text_encoder,
         batch_size=batch_size,
+        seqlen=77,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
         depth=pipe.text_encoder.config.num_hidden_layers,
diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index d4f5dbb99..9ae7db46a 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -34,11 +34,13 @@
 )
 @click.option("--width", default=512, help="Width of generated image")
 @click.option("--height", default=512, help="Height of generated image")
+@click.option("--batch", default=1, help="Batch size of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, width, height, prompt, benchmark):
+def run(local_dir, width, height, batch, prompt, negative_prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         local_dir,
         scheduler=EulerDiscreteScheduler.from_pretrained(
@@ -48,11 +50,14 @@ def run(local_dir, width, height, prompt, benchmark):
         torch_dtype=torch.float16,
     ).to("cuda")
 
+    prompt = [prompt] * batch
     with torch.autocast("cuda"):
         image = pipe(prompt, height, width).images[0]
         if benchmark:
             t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
-            print(f"sd e2e: {t} ms")
+            print(
+                f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
+            )
 
     image.save("example_ait.png")
 
diff --git a/examples/05_stable_diffusion/scripts/demo_img2img.py b/examples/05_stable_diffusion/scripts/demo_img2img.py
index e4d96d865..31e1c33df 100644
--- a/examples/05_stable_diffusion/scripts/demo_img2img.py
+++ b/examples/05_stable_diffusion/scripts/demo_img2img.py
@@ -43,7 +43,6 @@
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
 def run(local_dir, width, height, prompt, benchmark):
-
     # load the pipeline
     device = "cuda"
     pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 1128769da..b072e694f 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -21,12 +21,12 @@
 @click.option("--token", default="", help="access token")
 @click.option(
     "--save_directory",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
     help="pipeline files local directory",
 )
 def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2-1-base",
+        "runwayml/stable-diffusion-v1-5",
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``
diff --git a/examples/05_stable_diffusion/src/benchmark.py b/examples/05_stable_diffusion/src/benchmark.py
index 5cac6a465..f0f595122 100644
--- a/examples/05_stable_diffusion/src/benchmark.py
+++ b/examples/05_stable_diffusion/src/benchmark.py
@@ -55,7 +55,6 @@ def benchmark_unet(
     benchmark_pt=False,
     verify=False,
 ):
-
     exe_module = Model("./tmp/UNet2DConditionModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for UNet2DConditionModel.")
@@ -65,7 +64,7 @@ def benchmark_unet(
     pt_mod = pt_mod.eval()
 
     latent_model_input_pt = torch.randn(batch_size, 4, height, width).cuda().half()
-    text_embeddings_pt = torch.randn(batch_size, 64, hidden_dim).cuda().half()
+    text_embeddings_pt = torch.randn(batch_size, 77, hidden_dim).cuda().half()
     timesteps_pt = torch.Tensor([1, 1]).cuda().half()
 
     with autocast("cuda"):
@@ -83,8 +82,6 @@ def benchmark_unet(
             with open("sd_pt_benchmark.txt", "a") as f:
                 f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n")
 
-    print("pt output:", pt_ys.shape)
-
     # run AIT unet model
     inputs = {
         "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(),
@@ -96,6 +93,8 @@ def benchmark_unet(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
+        shape[1] = height
+        shape[2] = width
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -124,13 +123,11 @@ def benchmark_unet(
 def benchmark_clip(
     pt_mod,
     batch_size=1,
-    seqlen=64,
+    seqlen=77,
     tokenizer=None,
     benchmark_pt=False,
     verify=False,
 ):
-    mask_seq = 0
-
     exe_module = Model("./tmp/CLIPTextModel/test.so")
     if exe_module is None:
         print("Error!! Cannot find compiled module for CLIPTextModel.")
@@ -142,7 +139,7 @@ def benchmark_clip(
     if tokenizer is None:
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
     text_input = tokenizer(
-        ["a photo of an astronaut riding a horse on mars"],
+        ["a photo of an astronaut riding a horse on mars"] * batch_size,
         padding="max_length",
         max_length=seqlen,
         truncation=True,
@@ -150,8 +147,6 @@ def benchmark_clip(
     )
     input_ids = text_input["input_ids"].cuda()
 
-    attention_mask = torch.ones((batch_size, seqlen))
-    attention_mask[-1, -mask_seq:] = 0
     attention_mask = None
 
     position_ids = torch.arange(seqlen).expand((batch_size, -1)).cuda()
@@ -175,6 +170,7 @@ def benchmark_clip(
     num_outputs = len(exe_module.get_output_name_to_index_map())
     for i in range(num_outputs):
         shape = exe_module.get_output_maximum_shape(i)
+        shape[0] = batch_size
         ys.append(torch.empty(shape).cuda().half())
     exe_module.run_with_tensors(inputs, ys)
 
@@ -202,7 +198,6 @@ def benchmark_clip(
 def benchmark_vae(
     pt_vae, batch_size=1, height=64, width=64, benchmark_pt=False, verify=False
 ):
-
     latent_channels = 4
 
     exe_module = Model("./tmp/AutoencoderKL/test.so")
@@ -239,9 +234,8 @@ def benchmark_vae(
         .cuda()
         .half()
     )
+
     ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous()
-    print("input pt tensor size: ", ait_input_pt_tensor.shape)
-    print("output pt tensor size: ", y.shape)
     exe_module.run_with_tensors([ait_input_pt_tensor], [y])
 
     # verification
@@ -305,7 +299,10 @@ def benchmark_diffusers(local_dir, batch_size, verify, benchmark_pt):
     )
     # VAE
     benchmark_vae(
-        pipe.vae, batch_size=batch_size, benchmark_pt=benchmark_pt, verify=verify
+        pipe.vae,
+        batch_size=batch_size,
+        benchmark_pt=benchmark_pt,
+        verify=verify,
     )
 
 
diff --git a/examples/05_stable_diffusion/src/benchmark_pt.py b/examples/05_stable_diffusion/src/benchmark_pt.py
index 95bfb725f..c12877897 100644
--- a/examples/05_stable_diffusion/src/benchmark_pt.py
+++ b/examples/05_stable_diffusion/src/benchmark_pt.py
@@ -26,22 +26,24 @@
     default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="the local diffusers pipeline directory",
 )
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(local_dir, prompt, benchmark):
+def run(local_dir, width, height, prompt, negative_prompt, benchmark):
     pipe = StableDiffusionPipeline.from_pretrained(
         local_dir,
         revision="fp16",
         torch_dtype=torch.float16,
     ).to("cuda")
 
-    with torch.autocast("cuda"):
-        image = pipe(prompt).images[0]
-        if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt)
-            print(f"sd pt e2e: {t} ms")
+    image = pipe(prompt, height, width, negative_prompt=negative_prompt).images[0]
+    if benchmark:
+        t = benchmark_torch_function(10, pipe, prompt)
+        print(f"sd pt e2e: {t} ms")
 
     image.save("example_pt.png")
 
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index 9f68e827a..a85aee84f 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -12,10 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import numpy as np
-import torch
+
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
@@ -32,40 +31,14 @@ def map_clip_params(pt_mod, batch_size, seqlen, depth):
             ait_name = ait_name.replace("out_proj", "proj")
         elif name.endswith("out_proj.bias"):
             ait_name = ait_name.replace("out_proj", "proj")
-        elif name.endswith("q_proj.weight"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.weight")]
-            q = pt_params[prefix + "q_proj.weight"]
-            k = pt_params[prefix + "k_proj.weight"]
-            v = pt_params[prefix + "v_proj.weight"]
-            qkv_weight = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_weight
-            continue
-        elif name.endswith("q_proj.bias"):
-            ait_name = ait_name.replace("q_proj", "qkv")
-            prefix = key[: -len("q_proj.bias")]
-            q = pt_params[prefix + "q_proj.bias"]
-            k = pt_params[prefix + "k_proj.bias"]
-            v = pt_params[prefix + "v_proj.bias"]
-            qkv_bias = torch.cat([q, k, v], dim=0)
-            params_ait[ait_name] = qkv_bias
-            continue
-        elif name.endswith("k_proj.weight"):
-            continue
-        elif name.endswith("k_proj.bias"):
-            continue
-        elif name.endswith("v_proj.weight"):
-            continue
-        elif name.endswith("v_proj.bias"):
-            continue
+        elif "q_proj" in name:
+            ait_name = ait_name.replace("q_proj", "proj_q")
+        elif "k_proj" in name:
+            ait_name = ait_name.replace("k_proj", "proj_k")
+        elif "v_proj" in name:
+            ait_name = ait_name.replace("v_proj", "proj_v")
         params_ait[ait_name] = arr
 
-    if detect_target().name() == "cuda":
-        for i in range(depth):
-            prefix = f"encoder_layers_{i}_self_attn_cu_length"
-            cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
-            params_ait[prefix] = torch.from_numpy(cu_len).cuda()
-
     return params_ait
 
 
@@ -97,6 +70,7 @@ def compile_clip(
 
     pt_mod = pt_mod.eval()
     params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
+    batch_size = IntVar(values=[1, 8], name="batch_size")
 
     input_ids_ait = Tensor(
         [batch_size, seqlen], name="input0", dtype="int64", is_input=True
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index 3c2f59603..c4233c1e4 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -15,7 +15,7 @@
 import torch
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.unet_2d_condition import (
@@ -58,9 +58,9 @@ def compile_unet(
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
+    model_name="UNet2DConditionModel",
     use_linear_projection=False,
 ):
-
     ait_mod = ait_UNet2DConditionModel(
         sample_size=64,
         cross_attention_dim=hidden_dim,
@@ -72,19 +72,31 @@ def compile_unet(
     # set AIT parameters
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
+    # batch_size = IntVar(values=[1, 8], name="batch_size")
+    height_d = IntVar(values=[32, 64], name="height")
+    width_d = IntVar(values=[32, 64], name="width")
 
     latent_model_input_ait = Tensor(
-        [batch_size, height, width, 4], name="input0", is_input=True
+        [batch_size, height_d, width_d, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
     text_embeddings_pt_ait = Tensor(
-        [batch_size, 64, hidden_dim], name="input2", is_input=True
+        [batch_size, 77, hidden_dim], name="input2", is_input=True
     )
 
-    Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait)
+    mid_block_additional_residual = None
+    down_block_additional_residuals = None
+
+    Y = ait_mod(
+        latent_model_input_ait,
+        timesteps_ait,
+        text_embeddings_pt_ait,
+        down_block_additional_residuals,
+        mid_block_additional_residual,
+    )
     mark_output(Y)
 
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
-    compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait)
+    compile_model(Y, target, "./tmp", model_name, constants=params_ait)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index 7352740d0..e9c2d4964 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -12,11 +12,10 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import numpy as np
 
 import torch
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
@@ -40,20 +39,6 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
                 ).contiguous()
             else:
                 mapped_pt_params[ait_name] = pt_params[name]
-        elif name.endswith("attention.qkv.weight"):
-            prefix = name[: -len("attention.qkv.weight")]
-            q_weight = pt_params[prefix + "query.weight"]
-            k_weight = pt_params[prefix + "key.weight"]
-            v_weight = pt_params[prefix + "value.weight"]
-            qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
-            mapped_pt_params[ait_name] = qkv_weight
-        elif name.endswith("attention.qkv.bias"):
-            prefix = name[: -len("attention.qkv.bias")]
-            q_bias = pt_params[prefix + "query.bias"]
-            k_bias = pt_params[prefix + "key.bias"]
-            v_bias = pt_params[prefix + "value.bias"]
-            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
-            mapped_pt_params[ait_name] = qkv_bias
         elif name.endswith("attention.proj.weight"):
             prefix = name[: -len("attention.proj.weight")]
             pt_name = prefix + "proj_attn.weight"
@@ -63,8 +48,31 @@ def map_vae_params(ait_module, pt_module, batch_size, seq_len):
             pt_name = prefix + "proj_attn.bias"
             mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.cu_length"):
-            cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32")
-            mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda()
+            ...
+        elif name.endswith("attention.proj_q.weight"):
+            prefix = name[: -len("attention.proj_q.weight")]
+            pt_name = prefix + "query.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_q.bias"):
+            prefix = name[: -len("attention.proj_q.bias")]
+            pt_name = prefix + "query.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.weight"):
+            prefix = name[: -len("attention.proj_k.weight")]
+            pt_name = prefix + "key.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.bias"):
+            prefix = name[: -len("attention.proj_k.bias")]
+            pt_name = prefix + "key.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.weight"):
+            prefix = name[: -len("attention.proj_v.weight")]
+            pt_name = prefix + "value.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.bias"):
+            prefix = name[: -len("attention.proj_v.bias")]
+            pt_name = prefix + "value.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
         else:
             pt_param = pt_module.get_parameter(name)
             mapped_pt_params[ait_name] = pt_param
@@ -79,6 +87,7 @@ def compile_vae(
     width=64,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
+    name="AutoencoderKL",
 ):
     in_channels = 3
     out_channels = 3
@@ -114,8 +123,12 @@ def compile_vae(
         latent_channels=latent_channels,
         sample_size=sample_size,
     )
+    # batch_size = IntVar(values=[1, 8], name="batch_size")
+    height_d = IntVar(values=[32, 64], name="height")
+    width_d = IntVar(values=[32, 64], name="width")
+
     ait_input = Tensor(
-        shape=[batch_size, height, width, latent_channels],
+        shape=[batch_size, height_d, width_d, latent_channels],
         name="vae_input",
         is_input=True,
     )
@@ -133,6 +146,6 @@ def compile_vae(
         Y,
         target,
         "./tmp",
-        "AutoencoderKL",
+        name,
         constants=params_ait,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 000e862e9..90cc1bc32 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -18,5 +18,5 @@ def mark_output(y):
     for i in range(len(y)):
         y[i]._attrs["is_output"] = True
         y[i]._attrs["name"] = "output_%d" % (i)
-        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
+        y_shape = [d._attrs["values"] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
diff --git a/examples/05_stable_diffusion/src/modeling/attention.py b/examples/05_stable_diffusion/src/modeling/attention.py
index 14993e6d9..06ab5f1bd 100644
--- a/examples/05_stable_diffusion/src/modeling/attention.py
+++ b/examples/05_stable_diffusion/src/modeling/attention.py
@@ -20,7 +20,6 @@
 from typing import Optional
 
 from aitemplate.compiler.ops import reshape
-
 from aitemplate.frontend import nn, Tensor
 
 
@@ -54,22 +53,18 @@ def __init__(
     ):
         super().__init__()
         self.batch_size = batch_size
-        self.height = height
-        self.width = width
         self.channels = channels
         self.num_heads = (
             channels // num_head_channels if num_head_channels is not None else 1
         )
         self.num_head_size = num_head_channels
         self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.MultiheadAttention(
+        self.attention = nn.CrossAttention(
             channels,
-            batch_size,
+            height * width,
             height * width,
             self.num_heads,
             qkv_bias=True,
-            has_residual=True,
-            use_mem_eff=True,
         )
         self.rescale_output_factor = rescale_output_factor
 
@@ -78,28 +73,22 @@ def forward(self, hidden_states) -> Tensor:
         input hidden_states shape: [batch, height, width, channel]
         output shape: [batch, height, width, channel]
         """
+
         residual = hidden_states
 
         # norm
         hidden_states = self.group_norm(hidden_states)
+        o_shape = hidden_states.shape()
+        batch_dim = o_shape[0]
 
         hidden_states = reshape()(
-            hidden_states, [self.batch_size, self.height * self.width, self.channels]
+            hidden_states,
+            [batch_dim, -1, self.channels],
         )
 
-        batch, hw, channel = hidden_states.shape()
-        if (
-            batch.value() != self.batch_size
-            or hw.value() != self.width * self.height
-            or channel.value() != self.channels
-        ):
-            raise RuntimeError(
-                "nchw params do not match! "
-                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
-                f"actual: {batch}, {channel}, {hw}."
-            )
-
-        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
-        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
+        res = self.attention(hidden_states, hidden_states, hidden_states, residual) * (
+            1 / self.rescale_output_factor
+        )
 
+        res = reshape()(res, o_shape)
         return res
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 30afcd051..ff0ce792a 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -57,9 +57,9 @@ def __init__(
         self.heads = heads
         self.dim_head = dim_head
 
-        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
-        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_out = nn.Sequential(
             nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
         )
@@ -68,36 +68,26 @@ def forward(self, x, context=None, mask=None, residual=None):
         nheads = self.heads
         d = self.dim_head
 
-        layout = "20314" if USE_CUDA else "m2n3"
-
-        bs, seqlen, _ = get_shape(x)
-        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
-        )
+        q = self.to_q(x)
         context = default(context, x)
-
-        seqlen = get_shape(context)[1]
-        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        bs = q.shape()[0]
+
+        q = ops.reshape()(q, [bs, -1, self.heads, self.dim_head])
+        k = ops.reshape()(k, [bs, -1, self.heads, self.dim_head])
+        v = ops.reshape()(v, [bs, -1, self.heads, self.dim_head])
+        q = ops.permute()(q, [0, 2, 1, 3])
+        k = ops.permute()(k, [0, 2, 1, 3])
+        v = ops.permute()(v, [0, 2, 1, 3])
+
+        attn_op = ops.mem_eff_attention(causal=False)
+        out = attn_op(
+            (ops.reshape()(q, [bs, nheads, -1, d])),
+            (ops.reshape()(k, [bs, nheads, -1, d])),
+            (ops.reshape()(v, [bs, nheads, -1, d])),
         )
-        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
-        )
-
-        if USE_CUDA:
-            attn_op = ops.mem_eff_attention(causal=False)
-            out = attn_op(
-                (ops.reshape()(q, [bs, nheads, -1, d])),
-                (ops.reshape()(k, [bs, nheads, -1, d])),
-                (ops.reshape()(v, [bs, nheads, -1, d])),
-            )
-        else:
-            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
-            out = OP(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
-                (ops.reshape()(v, [bs * nheads, -1, d])),
-            )
         out = ops.reshape()(out, [bs, -1, nheads * d])
         proj = self.to_out(out)
         proj = ops.reshape()(proj, [bs, -1, nheads * d])
@@ -235,7 +225,7 @@ def __init__(
 
     def forward(self, x, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = get_shape(x)
+        b, h, w, c = x.shape()
         x_in = x
         x = self.norm(x)
         if self.use_linear_projection:
@@ -336,7 +326,7 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        shape = get_shape(x)
+        shape = x.shape()
         x = self.fc1(x)
         x = self.fc2(x, res)
         return ops.reshape()(x, shape)
@@ -364,11 +354,11 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        shape = get_shape(x)
+        # shape = get_shape(x)
         x = self.fc1(x)
         x = self.activation_fn(x)
         x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
+        return ops.reshape()(x, x.shape())
 
 
 class CLIPEncoderLayer(nn.Module):
@@ -391,19 +381,15 @@ def __init__(
     ):
         super().__init__()
         self.embed_dim = hidden_size
-        self.self_attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
+        self.self_attn = nn.CrossAttention(
+            hidden_size,
+            seq_len,
+            seq_len,
+            num_attention_heads,
             qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=0,
-            has_residual=True,
             causal=causal,
-            mask_seq=mask_seq,
-            use_mem_eff=True,
         )
+
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
         self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
             hidden_size, int(hidden_size * mlp_ratio)
@@ -428,7 +414,9 @@ def forward(
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states, residual)
+        hidden_states = self.self_attn(
+            hidden_states, hidden_states, hidden_states, residual
+        )
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -547,6 +535,9 @@ def __init__(
     ):
         super().__init__()
         embed_dim = hidden_size
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_dim = hidden_size
+        self.vocab_size = vocab_size
 
         self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
         self.position_embedding = nn.Embedding(
@@ -559,20 +550,25 @@ def forward(
         position_ids: Tensor,
         inputs_embeds: Optional[Tensor] = None,
     ) -> Tensor:
-
         input_shape = ops.size()(input_ids)
 
         # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-
-        position_ids = ops.reshape()(position_ids, [-1])
+        token_embedding = self.token_embedding.tensor()
+        token_embedding = ops.reshape()(
+            token_embedding, [1, self.vocab_size, self.embed_dim]
+        )
+        token_embedding = ops.expand()(token_embedding, [input_shape[0], -1, -1])
 
         if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
+            inputs_embeds = ops.batch_gather()(token_embedding, input_ids)
 
-        position_embeddings = ops.batch_gather()(
-            self.position_embedding.tensor(), position_ids
+        position_embedding = self.position_embedding.tensor()
+        position_embedding = ops.reshape()(
+            position_embedding, [1, self.max_position_embeddings, self.embed_dim]
         )
+        position_embedding = ops.expand()(position_embedding, [input_shape[0], -1, -1])
+
+        position_embeddings = ops.batch_gather()(position_embedding, position_ids)
 
         embeddings = inputs_embeds + position_embeddings
 
diff --git a/examples/05_stable_diffusion/src/modeling/embeddings.py b/examples/05_stable_diffusion/src/modeling/embeddings.py
index 36b96a4fb..cab7c033f 100644
--- a/examples/05_stable_diffusion/src/modeling/embeddings.py
+++ b/examples/05_stable_diffusion/src/modeling/embeddings.py
@@ -39,7 +39,7 @@ def get_timestep_embedding(
     :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
     embeddings. :return: an [N x dim] Tensor of positional embeddings.
     """
-    assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
+    assert timesteps._rank() == 1, "Timesteps should be a 1d-array"
 
     half_dim = embedding_dim // 2
 
diff --git a/examples/05_stable_diffusion/src/modeling/resnet.py b/examples/05_stable_diffusion/src/modeling/resnet.py
index 03e4f8023..c15bf26d2 100644
--- a/examples/05_stable_diffusion/src/modeling/resnet.py
+++ b/examples/05_stable_diffusion/src/modeling/resnet.py
@@ -58,7 +58,6 @@ def __init__(
             self.Conv2d_0 = conv
 
     def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
         if self.use_conv_transpose:
             return self.conv(x)
 
@@ -112,9 +111,7 @@ def __init__(
             self.conv = conv
 
     def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
         x = self.conv(x)
-
         return x
 
 
@@ -219,7 +216,7 @@ def forward(self, x, temb=None):
 
         if temb is not None:
             temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = get_shape(temb)
+            bs, dim = temb.shape()
             temb = ops.reshape()(temb, [bs, 1, 1, dim])
             hidden_states = hidden_states + temb
 
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
index eb28a076a..2ad4d9718 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -14,7 +14,7 @@
 #
 from typing import Optional, Tuple, Union
 
-from aitemplate.frontend import nn
+from aitemplate.frontend import nn, Tensor
 
 from .embeddings import TimestepEmbedding, Timesteps
 from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
@@ -185,6 +185,8 @@ def forward(
         sample,
         timesteps,
         encoder_hidden_states,
+        down_block_additional_residuals: Optional[Tuple[Tensor]] = None,
+        mid_block_additional_residual: Optional[Tensor] = None,
         return_dict: bool = True,
     ):
         """r
@@ -224,12 +226,27 @@ def forward(
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
 
             down_block_res_samples += res_samples
+            # return sample
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
 
         # 4. mid
         sample = self.mid_block(
             sample, emb, encoder_hidden_states=encoder_hidden_states
         )
 
+        if mid_block_additional_residual is not None:
+            sample += mid_block_additional_residual
+
         # 5. up
         for upsample_block in self.up_blocks:
             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
index 9eaa6e0b1..897025660 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_blocks.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -731,7 +731,6 @@ def __init__(
             )
         ]
         attentions = []
-
         for _ in range(num_layers):
             attentions.append(
                 AttentionBlock(
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
index 924c7257a..f2bea6a43 100644
--- a/examples/05_stable_diffusion/src/modeling/vae.py
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -144,7 +144,6 @@ def __init__(
         )
 
     def decode(self, z: Tensor, return_dict: bool = True):
-
         z = self.post_quant_conv(z)
         dec = self.decoder(z)
         return dec
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
index 7dace1275..a89f43109 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait.py
@@ -108,6 +108,7 @@ def __init__(
         self.vae_ait_exe = self.init_ait_module(
             model_name="AutoencoderKL", workdir=workdir
         )
+        self.batch = 1
 
     def init_ait_module(
         self,
@@ -132,12 +133,13 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         noise_pred = ys[0].permute((0, 3, 1, 2)).float()
         return noise_pred
 
-    def clip_inference(self, input_ids, seqlen=64):
+    def clip_inference(self, input_ids, seqlen=77):
         exe_module = self.clip_ait_exe
         bs = input_ids.shape[0]
         position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
@@ -149,6 +151,7 @@ def clip_inference(self, input_ids, seqlen=64):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         return ys[0].float()
@@ -160,6 +163,7 @@ def vae_inference(self, vae_input):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         vae_out = ys[0].permute((0, 3, 1, 2)).float()
@@ -196,7 +200,7 @@ def __call__(
                 expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                `guidance_scale` is defined  as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
@@ -254,11 +258,13 @@ def __call__(
                 f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
             )
 
+        self.batch = batch_size
+
         # get prompt text embeddings
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
-            max_length=64,  # self.tokenizer.model_max_length,
+            max_length=self.tokenizer.model_max_length,
             truncation=True,
             return_tensors="pt",
         )
diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py
index d6f3013ce..025688309 100644
--- a/python/aitemplate/backend/common/concatenate_common.py
+++ b/python/aitemplate/backend/common/concatenate_common.py
@@ -445,9 +445,11 @@
     throw std::runtime_error("the number of inputs must >= 1!");
   }
 
+
   for ({{index_type}} i = 0; i < rank; i++) {
     if (i == concat_dim) continue;
     {{index_type}} dim = real_input_shapes[0][i];
+
     for ({{index_type}} j = 1; j < num_real_inputs; j++) {
       if (real_input_shapes[j][i] != dim) {
         throw std::runtime_error(
diff --git a/python/aitemplate/backend/common/tensor/batch_gather_common.py b/python/aitemplate/backend/common/tensor/batch_gather_common.py
index 97e8aee77..df8b5e935 100644
--- a/python/aitemplate/backend/common/tensor/batch_gather_common.py
+++ b/python/aitemplate/backend/common/tensor/batch_gather_common.py
@@ -36,7 +36,8 @@
 
 {{func_signature}}
 {
-    batch_gather_launcher<{{dtype}}, int64_t>(stream, batch_num, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
+    const int64_t gather_size = (gather_dim != 0) ? (*batch_size * batch_num) : batch_num;
+    batch_gather_launcher<{{dtype}}, int64_t>(stream, gather_size, indices_num, instance_size, gather_dim_size, static_cast<const {{dtype}}*>(input), indices, workspace, static_cast<{{dtype}}*>(output));
 }
     """
 )
@@ -46,9 +47,11 @@
 void {{func_name}}(void* output,
                    const void* input,
                    const int64_t* indices,
+                   const {{index_type}}* batch_size,
                    const {{index_type}} batch_num,
                    const {{index_type}} indices_num,
                    const {{index_type}} instance_size,
+                   const {{index_type}} gather_dim,
                    const {{index_type}} gather_dim_size,
                    uint8_t* workspace,
                    {{prefix}}Stream_t stream)
@@ -65,9 +68,11 @@
     """
 {{indent}}{{func_name}}(
 {{indent}}   {{output}}, {{input}}, {{indices}},
+{{indent}}    {{batch_size}},
 {{indent}}    {{batch_num}},
 {{indent}}    {{indices_num}},
 {{indent}}    {{instance_size}},
+{{indent}}    {{gather_dim}},
 {{indent}}    {{gather_dim_size}},
 {{indent}}    global_workspace_, stream /* default stream */
 {{indent}});
@@ -168,7 +173,7 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
 
     axis = len(ind_shape) - 1
     batch_num = 1
-    for i in range(axis):
+    for i in range(1, axis):
         batch_num *= yshape[i]._attrs["values"][0]
 
     indices_num = yshape[axis]._attrs["values"][0]
@@ -184,9 +189,11 @@ def gen_function_call(func_attrs: Dict[str, Any], indent="  ", is_cuda=False) ->
         output=output_name,
         input=input_name,
         indices=indices_name,
+        batch_size="&" + xshape[0]._attrs["name"],
         batch_num=batch_num,
         indices_num=indices_num,
         instance_size=instance_size,
+        gather_dim=axis,
         gather_dim_size=gather_dim_size,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py
index 8e8310229..c1b94a217 100644
--- a/python/aitemplate/backend/common/upsampling2d_common.py
+++ b/python/aitemplate/backend/common/upsampling2d_common.py
@@ -313,7 +313,9 @@
     {{index_type}}* out_w,
     {{prefix}}Stream_t stream
 ) {
+
   {{shape_function}}
+
   {{exec_paths}}
   throw std::runtime_error(
       "Unsupported workload for this bilinear upsampling specialization."
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
index 39a44ebd3..5db6982aa 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -31,6 +31,10 @@
                           void* gamma,
                           void* beta,
                           int N,
+                          int64_t* H,
+                          int64_t* W,
+                          int64_t* HO,
+                          int64_t* WO,
                           const float eps,
                           const int max_smem_size,
                           void* workspace,
@@ -49,6 +53,8 @@
 {{indent}}{
 {{indent}}  {{func_name}}(
 {{indent}}     {{output}}, {{input}}, {{gamma}}, {{beta}}, {{N}},
+{{indent}}     {{H}}, {{W}},
+{{indent}}     {{HO}}, {{WO}},
 {{indent}}     {{eps}}, max_smem_size_, global_workspace_,
 {{indent}}  stream /* default stream */
 {{indent}}  );
@@ -87,13 +93,16 @@
 
 {{func_signature}}
 {
-
-    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
+    *HO = *H;
+    *WO = *W;
+    return invokeGroupNorm<{{elem_input_type}}, {{FuseSwish}}, {{C}}, {{G}}>(
             static_cast<{{elem_input_type}}*>(output),
             static_cast<{{elem_input_type}}*>(input),
             static_cast<{{elem_input_type}}*>(gamma),
             static_cast<{{elem_input_type}}*>(beta),
             N,
+            H,
+            W,
             eps,
             max_smem_size,
             workspace,
@@ -138,8 +147,6 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     use_swish = True if "swish" in func_attrs["name"] else False
     input_shape = func_attrs["inputs"][0].shape()
 
-    H = input_shape[1].value()
-    W = input_shape[2].value()
     C = input_shape[3].value()
     G = func_attrs["num_groups"]
 
@@ -157,8 +164,6 @@ def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         elem_input_type=elem_input_type,
         FuseSwish="true" if use_swish else "false",
-        H=H,
-        W=W,
         C=C,
         G=G,
     )
@@ -180,6 +185,7 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
     output_name = func_attrs["outputs"][0]._attrs["name"]
     (input_name, gamma_name, beta_name) = get_input_names(func_attrs)
     input_shape = func_attrs["inputs"][0]._attrs["shape"]
+    output_shape = func_attrs["outputs"][0]._attrs["shape"]
     eps = func_attrs["eps"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
@@ -188,6 +194,10 @@ def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent="  ") -> str:
         gamma=gamma_name,
         beta=beta_name,
         N=input_shape[0]._attrs["name"],
+        H="&" + input_shape[1]._attrs["name"],
+        W="&" + input_shape[2]._attrs["name"],
+        HO="&" + output_shape[1]._attrs["name"],
+        WO="&" + output_shape[2]._attrs["name"],
         eps=eps,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
index b868849e4..0ccc9e105 100644
--- a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -477,8 +477,6 @@ struct TInputHelper<bfloat16> {
 template <
     typename TInput,
     bool FuseSwish,
-    int H,
-    int W,
     int C,
     int C_G,
     int ILP = 8,
@@ -490,6 +488,8 @@ __global__ __launch_bounds__(NUM_THREADS) void group_norm_smem(
     TInput* gamma,
     TInput* beta,
     int N,
+    int H,
+    int W,
     float epsilon) {
   constexpr int C_G_2 = C_G / 2;
   constexpr int C_G_stride = C_G_2 + BANK_CONFLICT;
@@ -983,13 +983,15 @@ void DispatchGroupNormForwardGpu(
   }
 }
 
-template <typename TInput, bool FuseSwish, int H, int W, int C, int G>
+template <typename TInput, bool FuseSwish, int C, int G>
 cudaError_t invokeGroupNorm(
     TInput* output,
     TInput* input,
     TInput* gamma,
     TInput* beta,
     int N,
+    const int64_t* height,
+    const int64_t* width,
     const float eps,
     const int max_smem_size,
     void* workspace,
@@ -998,6 +1000,9 @@ cudaError_t invokeGroupNorm(
   constexpr auto C_G_2 = C_G / 2;
   constexpr int ILP = 8;
 
+  int64_t H = *height;
+  int64_t W = *width;
+
   const int64_t num_instances = N * G;
   const int64_t norm_size = H * W * C / G;
   const int64_t spatial_size = H * W;
@@ -1011,46 +1016,25 @@ cudaError_t invokeGroupNorm(
   // Bank conflict doesn't seem to matter to perf
   constexpr int BANK_CONFLICT = 0;
 
-  constexpr auto smem =
-      H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
+  const auto smem = H * W * (C_G_2 + MEM_BANK_CONFLICT) * 2 * sizeof(TInput);
 
   // C_G must be even, or we can have misaligned address for cp.async
   // reserve some shared_mem for block reduction
-  if (H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
-    constexpr int num_threads = std::min(1024, H / ILP * W * C_G_2);
-
-    if constexpr (num_threads > 0) {
-      auto kernel_func = group_norm_smem<
-          TInput,
-          FuseSwish,
-          H,
-          W,
-          C,
-          C_G,
-          ILP,
-          BANK_CONFLICT,
-          num_threads>;
-      GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
-          kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
-      dim3 block(num_threads);
-      kernel_func<<<dim3(G, N), block, smem, stream>>>(
-          input, output, gamma, beta, N, eps);
-    } else {
-      DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
-          stream,
-          num_instances,
-          norm_size,
-          channel_size,
-          spatial_size,
-          epsilon,
-          input,
-          gamma,
-          beta,
-          output,
-          static_cast<float*>(workspace),
-          static_cast<float*>(workspace) + num_instances,
-          channels_first);
-    }
+  if (H > 0 && H % 8 == 0 && C_G % 2 == 0 && smem <= max_smem_size - 1000) {
+    constexpr int num_threads = 1024;
+    auto kernel_func = group_norm_smem<
+        TInput,
+        FuseSwish,
+        C,
+        C_G,
+        ILP,
+        BANK_CONFLICT,
+        num_threads>;
+    GROUP_NORM_CUDA_CHECK(cudaFuncSetAttribute(
+        kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+    dim3 block(num_threads);
+    kernel_func<<<dim3(G, N), block, smem, stream>>>(
+        input, output, gamma, beta, N, H, W, eps);
   } else {
     DispatchGroupNormForwardGpu<TInput, float, FuseSwish>(
         stream,
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 07eabe9e5..5ee4f09fd 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -287,6 +287,11 @@ def compile_model(
             )
             _LOGGER.info(f"folded constants elapsed time: {elapsed_dt_sec(start_t)}")
 
+            compiler.transform.dedup_symbolic_name(graph)
+            graph_utils.dump_graph_debug_str_to_file(
+                graph, test_dir, "dedup_symbolic_name"
+            )
+
             (
                 max_blob,
                 max_constant_blob,
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index cb9b86832..55d2e9e28 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -26,7 +26,6 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
-from aitemplate.utils import shape_utils
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -120,9 +119,9 @@ def unique(vector):
         batch_info = x._attrs["shape"][0]
         output_shape = [
             batch_info,
-            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
-            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+            x._attrs["shape"][2],
+            x._attrs["shape"][1],
+            w._attrs["shape"][-1],
         ]
         return output_shape
 
diff --git a/python/aitemplate/compiler/ops/common/view_ops.py b/python/aitemplate/compiler/ops/common/view_ops.py
index b48a94519..c5c174629 100644
--- a/python/aitemplate/compiler/ops/common/view_ops.py
+++ b/python/aitemplate/compiler/ops/common/view_ops.py
@@ -331,9 +331,10 @@ def _infer_shapes(self, x: Tensor):
                             y_shapes.append(IntImm(int(dynamic_symbol)))
                         else:
                             symbol_names = {s.name for s in dynamic_symbol.free_symbols}
+                            unknown_symbols = symbol_names - get_global_symbol_set()
                             assert (
-                                len(symbol_names - get_global_symbol_set()) == 0
-                            ), "Unable to deduce dynamic symbol"
+                                not unknown_symbols
+                            ), f"Unable to deduce dynamic symbol, because the following symbols are not in global symbol set: {unknown_symbols}"
 
                             values = simplify_intvar_values(dynamic_symbol)
                             new_var = IntVar(values, symbolic_value=dynamic_symbol)
diff --git a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
index fae05dad9..ce2024559 100644
--- a/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
+++ b/python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py
@@ -20,6 +20,7 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.compiler.ops.conv.conv2d import conv2d
 
+
 # pylint: disable=C0103
 class conv2d_bias_activation(conv2d):
     """Base class of conv2d with bias + activation."""
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index b620e8f3b..23a999ab2 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -22,7 +22,7 @@
 from collections import OrderedDict
 from hashlib import sha1
 from operator import itemgetter
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple, Union
 
 import jinja2
 
@@ -166,12 +166,18 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
         Parameters
         ----------
-        stride : int
-            Stride of the convolution
-        pad : int
-            Size of padding to add to the input
-        dilate : int, optional
-            Size of spacing between kernel elements, by default 1
+        stride : int or tuple of two ints
+            Stride of the convolution. If tuple is
+            provided, the elements correspond to height and width stride
+            respectively
+        pad : int or tuple of two ints
+            Size of padding to add to the input. If tuple is
+            provided, the elements correspond to height and width padding
+            respectively
+        dilate : int or tuple of two ints, optional
+            Size of spacing between kernel elements, by default 1. If tuple is
+            provided, the elements correspond to height and width dilation
+            respectively
         group : int, optional
            Number of blocked connections from input
             channels to output channels, by default 1
@@ -195,24 +201,19 @@ def __init__(self, stride, pad, dilate=1, group=1) -> None:
 
     def _get_params_factory(self):
         params_factory = {}
-        if isinstance(self._attrs["stride"], int):
-            params_factory["strideh"] = self._attrs["stride"]
-            params_factory["stridew"] = self._attrs["stride"]
-        else:
-            params_factory["strideh"] = self._attrs["stride"][0]
-            params_factory["stridew"] = self._attrs["stride"][1]
-        if isinstance(self._attrs["pad"], int):
-            params_factory["padh"] = self._attrs["pad"]
-            params_factory["padw"] = self._attrs["pad"]
-        else:
-            params_factory["padh"] = self._attrs["pad"][0]
-            params_factory["padw"] = self._attrs["pad"][1]
-        if isinstance(self._attrs["dilate"], int):
-            params_factory["dilateh"] = self._attrs["dilate"]
-            params_factory["dilatew"] = self._attrs["dilate"]
-        else:
-            params_factory["dilateh"] = self._attrs["dilate"][0]
-            params_factory["dilatew"] = self._attrs["dilate"][1]
+        # Ensure convolutional parameters are in form (val_h, val_w)
+        params_factory["strideh"], params_factory["stridew"] = _maybe_int_to_tuple(
+            self._attrs["stride"],
+            "Stride",
+        )
+        params_factory["padh"], params_factory["padw"] = _maybe_int_to_tuple(
+            self._attrs["pad"],
+            "Pad",
+        )
+        params_factory["dilateh"], params_factory["dilatew"] = _maybe_int_to_tuple(
+            self._attrs["dilate"],
+            "Dilation",
+        )
         return params_factory
 
     def _infer_shape(self, x: List[int], w: List[int]) -> List[int]:
@@ -263,6 +264,21 @@ def unique(vector):
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+
+        # Ensure convolutional parameters are in form (val_h, val_w)
+        dilate_h, dilate_w = _maybe_int_to_tuple(self._attrs["dilate"], "Dilation")
+        stride_h, stride_w = _maybe_int_to_tuple(self._attrs["stride"], "Stride")
+        pad_h, pad_w = _maybe_int_to_tuple(self._attrs["pad"], "Pad")
+        KHEff = (w_shape[1] - 1) * dilate_h + 1
+        KWEff = (w_shape[2] - 1) * dilate_w + 1
+        out_h = (in_h + 2 * pad_h - KHEff) // stride_h + 1
+        out_w = (in_w + 2 * pad_w - KWEff) // stride_w + 1
+        output_shape[1]._attrs["symbolic_value"] = out_h
+        output_shape[2]._attrs["symbolic_value"] = out_w
+
         return output_shape
 
     def _invert_exec_key(self, key):
@@ -770,3 +786,11 @@ def gen_function(self) -> str:
             self.shape_eval_template,
             self.shape_save_template,
         )
+
+
+def _maybe_int_to_tuple(x: Union[int, Tuple[int, int]], name: str) -> Tuple[int, int]:
+    if isinstance(x, int):
+        return x, x
+    if isinstance(x, tuple) and len(x) == 2:
+        return x
+    raise ValueError(f"{name} should be either int or tuple of 2 ints, but got {x}")
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index fd31bb5d7..47fb2dde9 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -15,6 +15,7 @@
 """
 Operator definition for groupnorm.
 """
+import itertools
 import logging
 import os
 import re
@@ -39,6 +40,7 @@
 from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
 
 from aitemplate.testing import detect_target
+from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
@@ -53,6 +55,19 @@
 """
 )
 
+SHAPE_FUNC_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{dtype}}NI = {{x_dim0}};
+{{indent}}{{dtype}}HI = {{x_dim1}};
+{{indent}}{{dtype}}WI = {{x_dim2}};
+{{indent}}{{dtype}}CI = {{x_dim3}};
+{{indent}}{{dtype}}NO = NI;
+{{indent}}{{dtype}}HO = HI;
+{{indent}}{{dtype}}WO = WI;
+{{indent}}{{dtype}}CO = {{x_dim3}};
+"""
+)
+
 
 class group_norm(Operator):
     """Standalone group norm op.
@@ -68,6 +83,7 @@ def __init__(self, num_groups: int, num_channels: int) -> None:
             self._attrs["has_profiler"] = True
         self._attrs["num_channels"] = num_channels
         self._attrs["workspace"] = 0
+        self.shape_eval_template = SHAPE_FUNC_TEMPLATE
 
     @staticmethod
     def check_shapes(x_shapes, gamma_shapes, beta_shapes, num_groups):
@@ -112,9 +128,52 @@ def _sanity_check(self, x, gamma, beta):
 
     def _infer_shapes(self, x: Tensor):
         """Infer shapes for groupnorm."""
-
         return x._attrs["shape"]
 
+    def _infer_shape(self, x: List[int]):
+        eval_func = self.shape_eval_template.render(
+            indent="",
+            dtype="",
+            div="//",
+            x_dim0=x[0],
+            x_dim1=x[1],
+            x_dim2=x[2],
+            x_dim3=x[3],
+        )
+        output = {}
+        exec(eval_func, output)  # noqa: P204
+        return [
+            int(output["NO"]),
+            int(output["HO"]),
+            int(output["WO"]),
+            int(output["CO"]),
+        ]
+
+    def _infer_shapes_v2(self, x: Tensor):
+        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
+        x_shapes = itertools.product(*x_shape_values)
+        # run infershape for each
+        y_shapes = []
+        for x_shape in x_shapes:
+            y_shape = self._infer_shape(x_shape)
+            y_shapes.append(y_shape)
+
+        def unique(vector):
+            return sorted(set(vector))
+
+        output_shape = [
+            x.shape()[0],
+            shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
+            shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
+        ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+        output_shape[1]._attrs["symbolic_value"] = in_h
+        output_shape[2]._attrs["symbolic_value"] = in_w
+        return output_shape
+
     def __call__(
         self,
         x: Tensor,
diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index 1182e32b5..c522eb9c2 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -55,7 +55,7 @@ def __init__(self) -> None:
 
     def _infer_shape(self, x: List[int], indices: List[int]):
         rank = len(indices)
-        for r in range(rank - 1):
+        for r in range(1, rank - 1):
             assert x[r] == indices[r]
         output = list(x)
         output[rank - 1] = indices[-1]
@@ -81,6 +81,12 @@ def unique(vector):
             output_shape.append(
                 shape_utils.gen_int_var(unique([d[idx] for d in y_shapes]))
             )
+        if len(indices.shape()) > 1:
+            # Generally output has the same batch dimension as input
+            output_shape[0] = x.shape()[0]
+        else:
+            # Special case: gather happens along batch dimension
+            output_shape[0] = indices.shape()[0]
         return output_shape
 
     def __call__(self, x: Tensor, indices: Tensor) -> Tensor:
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index d207bdee1..0d36a2c37 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -105,7 +105,8 @@ def _infer_shapes(self, inputs: List[Tensor], dim) -> List[IntVar]:
             else:
                 output_dim = input_shapes[0][idx]
                 for shape in input_shapes:
-                    if output_dim != shape[idx]:
+                    # if output_dim != shape[idx]:
+                    if output_dim._attrs["values"] != shape[idx]._attrs["values"]:
                         raise RuntimeError(
                             "tensors expected to have the same dimensions "
                             "except concat_dim! dim: {}, shape1: {}, shape2: {}, inputs: {}".format(
diff --git a/python/aitemplate/compiler/ops/upsample/upsampling_common.py b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
index 59b94e0b0..edf87bba0 100644
--- a/python/aitemplate/compiler/ops/upsample/upsampling_common.py
+++ b/python/aitemplate/compiler/ops/upsample/upsampling_common.py
@@ -121,11 +121,20 @@ def unique(vector):
             return sorted(set(vector))
 
         output_shape = [
-            shape_utils.gen_int_var(unique([d[0] for d in y_shapes])),
+            x.shape()[0],
             shape_utils.gen_int_var(unique([d[1] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[2] for d in y_shapes])),
             shape_utils.gen_int_var(unique([d[3] for d in y_shapes])),
         ]
+
+        in_h = x._attrs["shape"][1]._attrs["symbolic_value"]
+        in_w = x._attrs["shape"][2]._attrs["symbolic_value"]
+        out_h = in_h * int(self._attrs["scale_factor"])
+        out_w = in_w * int(self._attrs["scale_factor"])
+
+        output_shape[1]._attrs["symbolic_value"] = out_h
+        output_shape[2]._attrs["symbolic_value"] = out_w
+
         return output_shape
 
     def _invert_exec_key(self, key):
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 94fea4a45..7faa8da64 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -18,7 +18,8 @@
 import re
 from typing import List
 
-from aitemplate.compiler.base import IntImm, IntVarTensor, JaggedIntVar, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, JaggedIntVar, Tensor
+from aitemplate.utils import graph_utils
 
 # pylint: disable=C0103
 
@@ -139,7 +140,6 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
 
 def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """Rename all shape variable that are identical to the same name.
-
     Parameters
     ----------
     sorted_graph : List[Tensor]
@@ -147,19 +147,49 @@ def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """
     symbolic_to_name = {}
     global user_provided_dim
+    # First pass - build symbolic_to_name map
+    for i, dim in _all_dims_in_graph(sorted_graph):
+        if not _dim_qualified_for_sym_dedup(dim):
+            continue
+        dim_sym = dim.symbolic_value()
+        if (
+            dim_sym not in symbolic_to_name
+            or dim_sym in symbolic_to_name
+            and dim._attrs["name"] in user_provided_dim
+        ):
+            symbolic_to_name[dim_sym] = dim._attrs["name"] or f"dim_{i}"
+
+    # Second pass - use symbolic_to_name map
+    for _, dim in _all_dims_in_graph(sorted_graph):
+        if not _dim_qualified_for_sym_dedup(dim):
+            continue
+        dim_sym = dim.symbolic_value()
+        dim._attrs["name"] = symbolic_to_name[dim_sym]
+
+
+def _all_dims_in_graph(sorted_graph: List[Tensor]):
+    dim_idx = 0
     for node in sorted_graph:
         for dim in node._attrs["shape"]:
-            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
-                dim_sym = dim.symbolic_value()
-                if (
-                    dim_sym not in symbolic_to_name
-                    or dim_sym in symbolic_to_name
-                    and dim._attrs["name"] in user_provided_dim
-                ):
-                    symbolic_to_name[dim_sym] = dim._attrs["name"]
-
-    for node in sorted_graph:
-        for dim in node._attrs["shape"]:
-            if not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar):
-                dim_sym = dim.symbolic_value()
-                dim._attrs["name"] = symbolic_to_name[dim_sym]
+            yield dim_idx, dim
+            dim_idx += 1
+
+    # In case some dimensions are not encountered in any nodes in the graph,
+    # only in input/output accessors - iterate over all ops and dimensions
+    # in tensor accessors, if any.
+    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in sorted_ops:
+        input_accessors = op._attrs.get("input_accessors", None)
+        output_accessors = op._attrs.get("output_accessors", None)
+        for accessors in (input_accessors, output_accessors):
+            if accessors is None:
+                continue
+            for ta in accessors:
+                if ta.original_shapes:
+                    for dim in ta.original_shapes:
+                        yield dim_idx, dim
+                        dim_idx += 1
+
+
+def _dim_qualified_for_sym_dedup(dim: IntVar) -> bool:
+    return not isinstance(dim, IntImm) and not isinstance(dim, JaggedIntVar)
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 091f7d81a..a1a7075b8 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -327,8 +327,6 @@ def __init__(
         self.causal = causal
         self.has_residual = has_residual
         self.dim = dim
-        self.seqlen = seq_len
-        self.seqlen_kv = seq_len_kv
 
         self.op = ops.mem_eff_attention(causal=causal)
 
@@ -353,8 +351,7 @@ def __init__(
         self.proj_drop = Dropout(proj_drop)
 
     def attention(self, q, k, v):
-        seqlen = self.seqlen
-        seqlen_kv = self.seqlen_kv
+        batch = q.shape()[0]
         head_dim = self.dim // self.num_heads
 
         query = self.proj_q(q)
@@ -362,13 +359,13 @@ def attention(self, q, k, v):
         value = self.proj_v(v)
 
         query = ops.permute()(
-            ops.reshape()(query, [-1, seqlen, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         key = ops.permute()(
-            ops.reshape()(key, [-1, seqlen_kv, self.num_heads, head_dim]), [0, 2, 1, 3]
+            ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
         )
         value = ops.permute()(
-            ops.reshape()(value, [-1, seqlen_kv, self.num_heads, head_dim]),
+            ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
             [0, 2, 1, 3],
         )
         return self.op(query, key, value)
@@ -377,9 +374,9 @@ def forward(self, *args):
         """forward pass for calling mha module"""
         assert len(args) >= 3
         x = args[0]
-        seq = self.seqlen
+        batch = x.shape()[0]
         attn_output = self.attention(args[0], args[1], args[2])
-        attn_output = ops.reshape()(attn_output, [-1, seq, self.dim])
+        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
 
         if self.has_residual:
             assert len(args) == 4
@@ -387,7 +384,7 @@ def forward(self, *args):
         else:
             x = self.proj(attn_output)
         x = self.proj_drop(x)
-        x = ops.reshape()(x, [-1, seq, self.dim])
+        x = ops.reshape()(x, [batch, -1, self.dim])
         return x
 
 
diff --git a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
index 68c9aefdf..2a1e0779e 100644
--- a/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
+++ b/python/aitemplate/frontend/nn/conv2d/conv2d_bias.py
@@ -65,7 +65,7 @@ def __init__(
         in_channels,
         out_channels,
         kernel_size,
-        stride,
+        stride=1,
         padding=0,
         dilation=1,
         groups=1,
diff --git a/tests/unittest/ops/test_concatenate.py b/tests/unittest/ops/test_concatenate.py
index 94073251e..a7560f661 100644
--- a/tests/unittest/ops/test_concatenate.py
+++ b/tests/unittest/ops/test_concatenate.py
@@ -425,10 +425,10 @@ def test_concatenate_shape_compatible(self):
         in_shapes = [[var1, 2, 3], [dup_var1, 2, 3]]
         self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
-        var2 = IntVar(values=[1, 2])
-        with self.assertRaises(RuntimeError):
-            in_shapes = [[var1, 2, 3], [var2, 2, 3]]
-            self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
+        # var2 = IntVar(values=[1, 2])
+        # with self.assertRaises(RuntimeError):
+        #     in_shapes = [[var1, 2, 3], [var2, 2, 3]]
+        #     self._test_concatenate_shape(in_shapes, [var1, 2, 6], -1)
 
 
 if __name__ == "__main__":

From 5a73b0096b640280d566b7c50b5ecba03bd26c38 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Mon, 24 Apr 2023 11:24:45 -0700
Subject: [PATCH 441/638] Add testname for test_var:test_batched_var (#612)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/612

Add test_name for test_batched_var to prevent test residing in the same directory.

Reviewed By: wushirong

Differential Revision: D45220860

fbshipit-source-id: 32265bb219cf37437aa7dac095cff8065e02c8b3
---
 tests/unittest/ops/test_var.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/tests/unittest/ops/test_var.py b/tests/unittest/ops/test_var.py
index 58c999876..a8068e09c 100644
--- a/tests/unittest/ops/test_var.py
+++ b/tests/unittest/ops/test_var.py
@@ -105,7 +105,14 @@ def test_var_float16(self):
         )
 
     def _run_batched_var(
-        self, *, dim, unbiased, keepdim=False, input_type="float16", output_type=None
+        self,
+        *,
+        dim,
+        unbiased,
+        keepdim=False,
+        input_type="float16",
+        output_type=None,
+        test_name="batched_var",
     ):
         torch.manual_seed(0)
         logging.info("Test batched_var with reduction_axes={dim}".format(dim=dim))
@@ -128,7 +135,6 @@ def _run_batched_var(
 
         logging.info("AITemplate output_type: {}".format(y_dtype))
 
-        test_name = "batched_var"
         module = compile_model(Y, target, "./tmp", test_name)
 
         for B in [5, 128, 1024, 1237, 2002]:
@@ -146,10 +152,18 @@ def _run_batched_var(
         self.test_count += 1
 
     def test_batched_var(self):
-        self._run_batched_var(dim=0, unbiased=False, keepdim=True)
-        self._run_batched_var(dim=1, unbiased=True, keepdim=False)
-        self._run_batched_var(dim=1, unbiased=False, keepdim=True)
-        self._run_batched_var(dim=2, unbiased=True, keepdim=False)
+        self._run_batched_var(
+            dim=0, unbiased=False, keepdim=True, test_name="batched_var_0"
+        )
+        self._run_batched_var(
+            dim=1, unbiased=True, keepdim=False, test_name="batched_var_1"
+        )
+        self._run_batched_var(
+            dim=1, unbiased=False, keepdim=True, test_name="batched_var_2"
+        )
+        self._run_batched_var(
+            dim=2, unbiased=True, keepdim=False, test_name="batched_var_3"
+        )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
     def test_var_float32(self):

From d5c84d2cb214092e527d202c7cc409ddce10128e Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 24 Apr 2023 12:26:22 -0700
Subject: [PATCH 442/638] Prepare for CUTLASS 3.x integration (#614)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/614

A few random fixes in preparation for integrating CUTLASS 3.x.

Reviewed By: hl475

Differential Revision: D45231655

fbshipit-source-id: 09687df2a10cf15bca58491a259eb8afd8ae4c20
---
 python/aitemplate/backend/build_cache_base.py | 1 +
 python/aitemplate/backend/cuda/target_def.py  | 4 +++-
 python/setup.py                               | 5 ++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index b6de8f6a9..227b54804 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -45,6 +45,7 @@
     "c",
     "hpp",
     "hxx",
+    "inl",
     "py",
     "cxx",
     "cc",
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 47f77199f..8ba5f2afb 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -254,10 +254,12 @@ def _build_compile_options(self):
             with open(fb_include_path, "w") as fb_include:
                 for arg in pp_args:
                     fb_include.write(pipes.quote(arg) + "\n")
+
             nvcc_arch = self._arch
             if nvcc_arch == "90":
                 # required by CUTLASS SM90 TMA kernels
                 nvcc_arch = "90a"
+
             options = (
                 self.nvcc_options_json["args"]
                 + ["-I" + path for path in cutlass_path]
@@ -276,7 +278,7 @@ def _build_compile_options(self):
                     "-w",
                     "--expt-relaxed-constexpr",
                     "--use_fast_math",
-                    f"-gencode=arch=compute_{self._arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
+                    f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
                     "-Xcompiler=-Wconversion",
                     environ.get_compiler_opt_level(),
                     "-std=c++17",
diff --git a/python/setup.py b/python/setup.py
index 1f5b14103..92e1e24af 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -81,7 +81,10 @@ def gen_cutlass_list():
     ]
     f_cond = (
         lambda x: True
-        if x.endswith(".h") or x.endswith(".cuh") or x.endswith(".hpp")
+        if x.endswith(".h")
+        or x.endswith(".cuh")
+        or x.endswith(".hpp")
+        or x.endswith(".inl")
         else False
     )
     return gen_file_list(srcs, f_cond)

From e1ec7403d46a89fb9a31496c47c35f9c922398d6 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 25 Apr 2023 14:24:55 -0700
Subject: [PATCH 443/638] Set skip_on_empty=True from
 filter_test_cases_by_params (#609)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/609

Currently, if the test env that the tests are running on (in CI) is not found among the keys of the `params` dict argument of the `filter_test_cases_by_params` function, an empty list will be returned by the function:

https://www.internalfb.com/code/fbsource/[db44bc42edbac0af6b277aca542026005759f338]/fbcode/aitemplate/AITemplate/python/aitemplate/testing/test_utils.py?lines=81

When empty list is passed to the `parameterized.expand` decorator, without additionally passing the `skip_on_empty=True` argument, `parameterized.expand` will raise an exception. This is currently not visible, as the tests are running in the test envs that exist in the `params`. However, if we add ROCm or SM90 CI, this will become a problem.

In this diff, the `filter_test_cases_by_params`function is refactored to return a dict of `kwargs` for the `parameterized.expand` decorator instead of mere list for the first (`input`) argument. The `skip_on_empty=True` argument is included into the returned `kwargs` dict, along with the `input`. This will lead to empty `input` list to be processed normally by `parameterized.expand`.

All call sites of `filter_test_cases_by_params` in the unit tests are prepended with `**` to properly pass the `kwargs` to the decorator. Except a few cases where `itertools.product` is applied on the `input` list: in such cases (e.g., in `test_attention.py`) the value of the `"input"` key is fetched from the dict + `skip_on_empty=True` is added manually.

Reviewed By: chenyang78

Differential Revision: D45219129

fbshipit-source-id: 8fd737bbe32c84afd39c6f822fe46ebd63ab8674
---
 .../test/converters/test_ait_unary_ops.py     |  2 +-
 python/aitemplate/testing/test_utils.py       |  6 +++-
 .../compiler/test_constant_folding.py         |  8 ++---
 .../compiler/test_fuse_bmm_permute.py         |  5 +--
 .../compiler/test_fuse_mm_elementwise.py      |  8 ++---
 .../compiler/test_fuse_permute_bmm.py         |  4 +--
 .../compiler/test_fuse_permute_gemm.py        |  6 ++--
 ...st_fused_elementwise_complex_dependency.py |  8 ++---
 .../test_fused_elementwise_out_of_order.py    |  2 +-
 .../compiler/test_pad_gemm_with_cat.py        |  2 +-
 .../test_pad_gemm_with_elementwise.py         |  8 ++---
 tests/unittest/compiler/test_refine_graph.py  |  8 ++---
 .../compiler/test_slice_gemm_fusion.py        |  2 +-
 .../compiler/test_slice_view_strided.py       | 12 +++----
 .../compiler/test_strided_view_cat.py         |  4 +--
 .../compiler/test_transform_special_op.py     |  4 +--
 tests/unittest/ops/test_activation.py         | 34 +++++++++----------
 tests/unittest/ops/test_attention.py          | 14 ++++----
 .../test_batched_dense_vec_jagged_2d_mul.py   |  2 +-
 tests/unittest/ops/test_bmm.py                |  8 ++---
 tests/unittest/ops/test_bmm_add.py            |  8 ++---
 tests/unittest/ops/test_conv.py               |  6 ++--
 tests/unittest/ops/test_conv2d_bias_add.py    |  2 +-
 tests/unittest/ops/test_conv_bias.py          |  2 +-
 .../ops/test_conv_bias_add_hardswish.py       |  2 +-
 tests/unittest/ops/test_conv_bias_add_relu.py |  2 +-
 .../unittest/ops/test_conv_bias_hardswish.py  |  2 +-
 tests/unittest/ops/test_conv_bias_relu.py     |  2 +-
 tests/unittest/ops/test_conv_bias_sigmoid.py  |  2 +-
 tests/unittest/ops/test_fused_elementwise.py  | 22 ++++++------
 .../unittest/ops/test_gemm_bias_hardswish.py  |  2 +-
 tests/unittest/ops/test_gemm_bias_relu.py     |  4 +--
 tests/unittest/ops/test_gemm_bias_sigmoid.py  |  2 +-
 tests/unittest/ops/test_gemm_bias_tanh.py     |  2 +-
 tests/unittest/ops/test_perm021fc_crc_bias.py |  2 +-
 tests/unittest/ops/test_perm102_bmm_rcr.py    |  4 +--
 tests/unittest/ops/test_perm102_bmm_rrr.py    |  4 +--
 tests/unittest/ops/test_softmax.py            |  2 +-
 tests/unittest/ops/test_split_getitem.py      |  6 ++--
 39 files changed, 114 insertions(+), 111 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index 5c8f1c509..23be04bc5 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -48,7 +48,7 @@
 
 class TestUnaryOpsConverter(AITTestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 env: [
                     (
diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 88926622b..88fb8b999 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -121,7 +121,7 @@ def filter_test_cases_by_params(params: Dict[TestEnv, List[Tuple[Any]]]):
     """
     target = detect_target()
     test_env = _get_test_env(target)
-    return (
+    input_ = (
         params.get(test_env, [])
         if target.in_ci_env()
         else list(
@@ -132,6 +132,10 @@ def filter_test_cases_by_params(params: Dict[TestEnv, List[Tuple[Any]]]):
             )
         )
     )
+    return {
+        "input": input_,
+        "skip_on_empty": True,
+    }
 
 
 def filter_test_cases_by_test_env(cls: Type[unittest.TestCase]):
diff --git a/tests/unittest/compiler/test_constant_folding.py b/tests/unittest/compiler/test_constant_folding.py
index c4354a48e..048fee2d5 100644
--- a/tests/unittest/compiler/test_constant_folding.py
+++ b/tests/unittest/compiler/test_constant_folding.py
@@ -84,7 +84,7 @@ def test_simple_constant_fold(self, dtype):
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=3)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -138,7 +138,7 @@ def test_pad_constant_weight(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -188,7 +188,7 @@ def test_fold_long_chain(self, dtype):
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -227,7 +227,7 @@ def test_constant_folding_through_views(self, dtype):
         self._verify_graph(mod, expected_num_constants=1, expected_num_nodes=1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_fuse_bmm_permute.py b/tests/unittest/compiler/test_fuse_bmm_permute.py
index d00d02b61..5360f4082 100644
--- a/tests/unittest/compiler/test_fuse_bmm_permute.py
+++ b/tests/unittest/compiler/test_fuse_bmm_permute.py
@@ -149,8 +149,9 @@ def _test_bmm_permute(
                 {
                     TestEnv.CUDA_LESS_THAN_SM80: ["float16"],
                 }
-            ),
-        )
+            )["input"],
+        ),
+        skip_on_empty=True,
     )
     def test_xxr_to_xxс(self, B, layout_a, layout_b, layout_c, dtype):
         """
diff --git a/tests/unittest/compiler/test_fuse_mm_elementwise.py b/tests/unittest/compiler/test_fuse_mm_elementwise.py
index 02b900c5a..bd64e5fd0 100644
--- a/tests/unittest/compiler/test_fuse_mm_elementwise.py
+++ b/tests/unittest/compiler/test_fuse_mm_elementwise.py
@@ -272,7 +272,7 @@ def _test_gemm_rcr_bias_add_add_relu(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -324,7 +324,7 @@ def test_gemm_rcr_bias_add_fail(self, dtype):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -391,7 +391,7 @@ def test_gemm_rcr_bias_chained(self, dtype):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -1556,7 +1556,7 @@ def test_bmm_ccr_add_float_sm80(self):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_fuse_permute_bmm.py b/tests/unittest/compiler/test_fuse_permute_bmm.py
index 7587b5581..47dd69e7c 100644
--- a/tests/unittest/compiler/test_fuse_permute_bmm.py
+++ b/tests/unittest/compiler/test_fuse_permute_bmm.py
@@ -805,7 +805,7 @@ def test_gemm_broadcast_float_sm80(self, func, test_bias, dtype):
         func(self, test_bias, dtype)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -867,7 +867,7 @@ def test_permute_multiple_consumer(self, dtype):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_fuse_permute_gemm.py b/tests/unittest/compiler/test_fuse_permute_gemm.py
index 720af1661..96478fb59 100644
--- a/tests/unittest/compiler/test_fuse_permute_gemm.py
+++ b/tests/unittest/compiler/test_fuse_permute_gemm.py
@@ -31,7 +31,7 @@
 
 class FusePermuteGemmTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -71,7 +71,7 @@ def test_no_fusion_odd_alignment(self, dtype):
             raise RuntimeError("invalid {dtype=}")
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -105,7 +105,7 @@ def test_gemm_rrr_to_rcr(self, dtype):
         torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
index 21b3cd463..4ef635e7e 100644
--- a/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
+++ b/tests/unittest/compiler/test_fused_elementwise_complex_dependency.py
@@ -289,7 +289,7 @@ def test_fused_elementwise_non_elementwise_ops(self, dtype):
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -375,7 +375,7 @@ def test_fused_elementwise_indirect_input_dependency(self, dtype):
         self.assertTrue(torch.allclose(r3, r3_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -477,7 +477,7 @@ def test_fused_elementwise_indirect_input_dependency_split_subgraph(self, dtype)
         self.assertTrue(torch.allclose(r4, r4_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
@@ -591,7 +591,7 @@ def test_fused_elementwise_multi_dependency(self, dtype):
         self.assertTrue(torch.allclose(r7, r7_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
index 76d20ee07..011226c67 100644
--- a/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
+++ b/tests/unittest/compiler/test_fused_elementwise_out_of_order.py
@@ -36,7 +36,7 @@
 
 class FusedElementwiseOutOfOrderTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_pad_gemm_with_cat.py b/tests/unittest/compiler/test_pad_gemm_with_cat.py
index b8bf4858d..4eccd457b 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_cat.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_cat.py
@@ -37,7 +37,7 @@
 
 class PadGemmWithCatTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
index bd091301d..ea905af78 100644
--- a/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
+++ b/tests/unittest/compiler/test_pad_gemm_with_elementwise.py
@@ -33,7 +33,7 @@
 
 class PadGemmWithElementwise(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     param("static_M_float16", [23], 7, 3, "float16"),
@@ -92,7 +92,7 @@ def test_pad_gemm_rcr_bias_broadcast_with_elementwise(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     ("static_shape_float16", [3], [1], 5, 3, "float16"),
@@ -161,7 +161,7 @@ def test_pad_bmm_rrr_add_with_elementwise(self, test_name, bs, ms, n, k, dtype):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     ("static_shape_float16", [3], [1], 5, 3, "float16"),
@@ -234,7 +234,7 @@ def test_pad_perm102_bmm_rrr_with_elementwise(self, test_name, bs, ms, n, k, dty
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     param("static_M_float16", [23], 7, 3, "float16"),
diff --git a/tests/unittest/compiler/test_refine_graph.py b/tests/unittest/compiler/test_refine_graph.py
index dfc813f49..856ae5687 100644
--- a/tests/unittest/compiler/test_refine_graph.py
+++ b/tests/unittest/compiler/test_refine_graph.py
@@ -38,7 +38,7 @@
 
 class RefineGraphTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
@@ -207,7 +207,7 @@ def _build_gemm_rcr_bias_mul(self, M, N, K, dtype, start_idx=0):
         return mul_tensor
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
@@ -243,7 +243,7 @@ def test_gemm_ops(self, dtype):
         assert sorted_ops[0]._attrs["name"] == sorted_ops[1]._attrs["name"]
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
@@ -300,7 +300,7 @@ def test_bmm_ops_accessor(self, dtype):
         assert sorted_ops[0]._attrs["name"] != sorted_ops[1]._attrs["name"]
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
diff --git a/tests/unittest/compiler/test_slice_gemm_fusion.py b/tests/unittest/compiler/test_slice_gemm_fusion.py
index 7465a68b5..0559ec97b 100644
--- a/tests/unittest/compiler/test_slice_gemm_fusion.py
+++ b/tests/unittest/compiler/test_slice_gemm_fusion.py
@@ -155,7 +155,7 @@ def test_slice_gemm_rcr_fusion_a(self):
     # This is a test for testing cases where we correctly update a/b_alignment
     # based on input_accessors
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float")],
diff --git a/tests/unittest/compiler/test_slice_view_strided.py b/tests/unittest/compiler/test_slice_view_strided.py
index 8d89b38ba..0630f52e6 100644
--- a/tests/unittest/compiler/test_slice_view_strided.py
+++ b/tests/unittest/compiler/test_slice_view_strided.py
@@ -44,7 +44,7 @@ def setUpClass(cls) -> None:
         torch.manual_seed(0)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -101,7 +101,7 @@ def test_slice_view_gemm_fusible(self, dtype):
             torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -158,7 +158,7 @@ def test_slice_view_gemm_non_fusible(self, dtype):
             torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -237,7 +237,7 @@ def test_slice_flatten_concat_fusible_1(self, dtype):
             torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -310,7 +310,7 @@ def test_slice_flatten_concat_fusible_2(self, dtype):
             torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -382,7 +382,7 @@ def test_slice_reshape_concat_fusible_1(self, dtype):
             torch.testing.assert_close(y, y_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/compiler/test_strided_view_cat.py b/tests/unittest/compiler/test_strided_view_cat.py
index 1d58fe097..fa270820d 100644
--- a/tests/unittest/compiler/test_strided_view_cat.py
+++ b/tests/unittest/compiler/test_strided_view_cat.py
@@ -139,7 +139,7 @@ def test_strided_gemm_view_cat_fusible(
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -380,7 +380,7 @@ def _create_layernorm_sigmoid_mul(
                 torch.testing.assert_close(x, x_pt, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/compiler/test_transform_special_op.py b/tests/unittest/compiler/test_transform_special_op.py
index 34137ea35..68e385262 100644
--- a/tests/unittest/compiler/test_transform_special_op.py
+++ b/tests/unittest/compiler/test_transform_special_op.py
@@ -123,7 +123,7 @@ def test_small_nk_fp32(self):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
@@ -237,7 +237,7 @@ def test_n1_k8_fp32(self):
         self._test_n1_k8(10, [8, 16], 1, 8, dtype="float32")
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index 01581e07f..aa7cf5227 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -353,7 +353,7 @@ def _test_celu(
         torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -379,7 +379,7 @@ def test_lrelu(self, dtype):
         self._test_leaky_relu([63, 63], test_name="leaky_relu_3", dtype=dtype)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -402,7 +402,7 @@ def test_htanh(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -428,7 +428,7 @@ def test_softplus(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -449,7 +449,7 @@ def test_cos(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -470,7 +470,7 @@ def test_sin(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -494,7 +494,7 @@ def test_tanh(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -515,7 +515,7 @@ def test_sign(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -536,7 +536,7 @@ def test_abs(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -557,7 +557,7 @@ def test_loge(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -578,7 +578,7 @@ def test_exp(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -599,7 +599,7 @@ def test_sqrt(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -620,7 +620,7 @@ def test_sigmoid(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -641,7 +641,7 @@ def test_relu(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -662,7 +662,7 @@ def test_elu(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -694,7 +694,7 @@ def test_softsign(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -717,7 +717,7 @@ def test_floor_div(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
diff --git a/tests/unittest/ops/test_attention.py b/tests/unittest/ops/test_attention.py
index 75260d32d..18010ed99 100644
--- a/tests/unittest/ops/test_attention.py
+++ b/tests/unittest/ops/test_attention.py
@@ -333,13 +333,12 @@ def _test_flash_attention(
                 )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 # Flash attention requires A100
                 TestEnv.CUDA_SM80: [("float16")],
             }
         ),
-        skip_on_empty=True,
     )
     def test_flash_attention(self, dtype):
         self._test_flash_attention(
@@ -444,12 +443,11 @@ def _test_attention(
             _LOGGER.info(f"benchmark compiler model time: {time_per_iter_ms}")
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.ROCM: [("float16")],
             }
         ),
-        skip_on_empty=True,
     )
     def test_attention_rocm(self, dtype):
         self._test_attention(
@@ -699,11 +697,12 @@ def _test_mem_eff_attention(
                     TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                     TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
                 }
-            ),
+            )["input"],
             [False, True],  # variable_seq_length_kv
             [False, True],  # variable_seq_length_q
             [False, True],  # causal
         ),
+        skip_on_empty=True,
     )
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_mem_eff_attention(
@@ -825,7 +824,7 @@ def test_mem_eff_attention_benchmark(
                     # with 'misaligned address' error.
                     TestEnv.CUDA_SM80: [("float16")],
                 }
-            ),
+            )["input"],
             [False, True],  # variable_seq_length_kv
             [False, True],  # variable_seq_length_q
         ),
@@ -980,12 +979,11 @@ def _test_cross_attention(
             torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_SM80: [("float16"), ("float32"), ("bfloat16")],
             }
         ),
-        skip_on_empty=True,
     )
     def test_cross_attention(self, dtype):
         if dtype == "bfloat16":
diff --git a/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py b/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py
index 5a0f6708a..b07d5c7cd 100644
--- a/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py
+++ b/tests/unittest/ops/test_batched_dense_vec_jagged_2d_mul.py
@@ -136,7 +136,7 @@ def _test_batched_dense_vec_jagged_2d_mul(
         torch.testing.assert_close(result, result_pt, **tolerance_limits)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 9c1d42be3..1073a590f 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -305,7 +305,7 @@ def test_ccc(self):
         if detect_target().name() == "cuda":
             self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_bmm_0_dtype(self, dtype):
         self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
@@ -332,7 +332,7 @@ def test_bmm_0_dtype(self, dtype):
             [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_bmm_1_dtype(self, dtype):
         self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
@@ -727,7 +727,7 @@ def test_ccc(self):
         self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_broadcast_0_dtype(self, dtype):
         self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
@@ -738,7 +738,7 @@ def test_bmm_broadcast_0_dtype(self, dtype):
         self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
         self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_broadcast_1_dtype(self, dtype):
         self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index e94849583..10f531a04 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -333,7 +333,7 @@ def test_rrc(self):
     def test_crc(self):
         self._test_crc(B=32, M=256, K=256, N=512)
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_add_0_dtype(self, dtype):
         self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccr(
@@ -344,7 +344,7 @@ def test_bmm_add_0_dtype(self, dtype):
             B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
         )
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_add_1_dtype(self, dtype):
         self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccc(
@@ -735,7 +735,7 @@ def test_ccc(self):
             test_name="broadcastable_bias3d",
         )
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_add_broadcast_0_dtype(self, dtype):
         self._test_crr(
             [1, 8, 16],
@@ -766,7 +766,7 @@ def test_bmm_add_broadcast_0_dtype(self, dtype):
             dtype=dtype,
         )
 
-    @parameterized.expand(filter_test_cases_by_params(_TEST_PARAMS))
+    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
     def test_bmm_add_broadcast_1_dtype(self, dtype):
         self._test_crc(
             [1, 8, 16],
diff --git a/tests/unittest/ops/test_conv.py b/tests/unittest/ops/test_conv.py
index cace970e0..db5621174 100644
--- a/tests/unittest/ops/test_conv.py
+++ b/tests/unittest/ops/test_conv.py
@@ -76,7 +76,7 @@ def _test_conv(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
@@ -96,7 +96,7 @@ def test_conv2d(self, dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
@@ -108,7 +108,7 @@ def test_conv1d(self, dtype):
         self._test_conv1d(dtype=dtype, bias=False)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_conv2d_bias_add.py b/tests/unittest/ops/test_conv2d_bias_add.py
index 24e232f23..a81fa9295 100644
--- a/tests/unittest/ops/test_conv2d_bias_add.py
+++ b/tests/unittest/ops/test_conv2d_bias_add.py
@@ -99,7 +99,7 @@ def _test_conv_bias_add(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_conv_bias.py b/tests/unittest/ops/test_conv_bias.py
index b6f18ec08..bb0e774c4 100644
--- a/tests/unittest/ops/test_conv_bias.py
+++ b/tests/unittest/ops/test_conv_bias.py
@@ -89,7 +89,7 @@ def _test_conv_bias(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_conv_bias_add_hardswish.py b/tests/unittest/ops/test_conv_bias_add_hardswish.py
index 48ba7fd2b..d97e397a8 100644
--- a/tests/unittest/ops/test_conv_bias_add_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_add_hardswish.py
@@ -98,7 +98,7 @@ def _test_conv_bias_add_hardswish(
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
diff --git a/tests/unittest/ops/test_conv_bias_add_relu.py b/tests/unittest/ops/test_conv_bias_add_relu.py
index 1b4545bcc..bdc78e650 100644
--- a/tests/unittest/ops/test_conv_bias_add_relu.py
+++ b/tests/unittest/ops/test_conv_bias_add_relu.py
@@ -100,7 +100,7 @@ def _test_conv_bias_add_relu(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/ops/test_conv_bias_hardswish.py b/tests/unittest/ops/test_conv_bias_hardswish.py
index 0809c1e45..67fe90872 100644
--- a/tests/unittest/ops/test_conv_bias_hardswish.py
+++ b/tests/unittest/ops/test_conv_bias_hardswish.py
@@ -96,7 +96,7 @@ def _test_conv_bias_hardswish(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1, rtol=1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/ops/test_conv_bias_relu.py b/tests/unittest/ops/test_conv_bias_relu.py
index 8ac41e071..4feb7279a 100644
--- a/tests/unittest/ops/test_conv_bias_relu.py
+++ b/tests/unittest/ops/test_conv_bias_relu.py
@@ -92,7 +92,7 @@ def _test_conv_bias_relu(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/ops/test_conv_bias_sigmoid.py b/tests/unittest/ops/test_conv_bias_sigmoid.py
index 206956534..b4567cacd 100644
--- a/tests/unittest/ops/test_conv_bias_sigmoid.py
+++ b/tests/unittest/ops/test_conv_bias_sigmoid.py
@@ -86,7 +86,7 @@ def _test_conv_bias_sigmoid(
             torch.testing.assert_close(Y_pt, y_transpose, atol=1.25e-1, rtol=1e-1)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("bfloat16"), ("float32")],
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 030b2ce4f..94b628a0b 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -108,7 +108,7 @@ def _test_fused_elementwise_constructor(self, ait_dtype):
         self.assertEqual(X4._attrs["depth"], 2)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -162,7 +162,7 @@ def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype)
                     self.assertTrue(torch.allclose(x4, x4_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -252,7 +252,7 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
         torch.testing.assert_close(x9, x9_pt, atol=1e-2, rtol=1e-2)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -291,7 +291,7 @@ def _test_sigmoid(self, input_size, test_name, ait_dtype):
         self.assertEqual(torch.sum(x2 > 1), 0)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -328,7 +328,7 @@ def _test_tanh(self, input_size, test_name, ait_dtype):
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 # float16 device function is different for SM80 and lower
@@ -369,7 +369,7 @@ def _test_gelu(self, input_size, test_name, ait_dtype, fast_gelu=False):
         self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -435,7 +435,7 @@ def _test_min_max(
         torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2, equal_nan=True)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -460,7 +460,7 @@ def test_min(self, ait_dtype):
         )
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -517,7 +517,7 @@ def _test_clamp(
         self.assertTrue(torch.allclose(x1, x1_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -563,7 +563,7 @@ def _test_operator_overload(self, ait_dtype):
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
@@ -597,7 +597,7 @@ def _test_operator_overload_with_constant_number(self, ait_dtype):
         self.assertTrue(torch.allclose(output, output_pt, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
                 TestEnv.CUDA_SM80: [("bfloat16")],
diff --git a/tests/unittest/ops/test_gemm_bias_hardswish.py b/tests/unittest/ops/test_gemm_bias_hardswish.py
index 602869532..cdae28a08 100644
--- a/tests/unittest/ops/test_gemm_bias_hardswish.py
+++ b/tests/unittest/ops/test_gemm_bias_hardswish.py
@@ -70,7 +70,7 @@ def _test_rcr(self, dtype="float16"):
         self.assertTrue(torch.allclose(Y_pt, y, **_TOLERANCE_LIMITS[dtype]))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index c3977a228..a44937da5 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -66,7 +66,7 @@ def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
@@ -107,7 +107,7 @@ def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_gemm_bias_sigmoid.py b/tests/unittest/ops/test_gemm_bias_sigmoid.py
index 5ab21e6c4..689c29497 100644
--- a/tests/unittest/ops/test_gemm_bias_sigmoid.py
+++ b/tests/unittest/ops/test_gemm_bias_sigmoid.py
@@ -66,7 +66,7 @@ def _test_rcr(self, dtype="float16"):
         torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_gemm_bias_tanh.py b/tests/unittest/ops/test_gemm_bias_tanh.py
index 0a5b17c39..8e3e5dd60 100644
--- a/tests/unittest/ops/test_gemm_bias_tanh.py
+++ b/tests/unittest/ops/test_gemm_bias_tanh.py
@@ -75,7 +75,7 @@ def _test_rcr(self, Ms, test_name, dtype="float16"):
             torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_perm021fc_crc_bias.py b/tests/unittest/ops/test_perm021fc_crc_bias.py
index 48c67b878..bb21b7771 100644
--- a/tests/unittest/ops/test_perm021fc_crc_bias.py
+++ b/tests/unittest/ops/test_perm021fc_crc_bias.py
@@ -88,7 +88,7 @@ def _test_perm021fc_crc_bias(
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_perm102_bmm_rcr.py b/tests/unittest/ops/test_perm102_bmm_rcr.py
index 16c0fd752..3797fcffc 100644
--- a/tests/unittest/ops/test_perm102_bmm_rcr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rcr.py
@@ -39,7 +39,7 @@
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_TestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
@@ -75,7 +75,7 @@ def test_perm102_bmm_rrr(self, dtype):
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMM_RCR_BiasTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_perm102_bmm_rrr.py b/tests/unittest/ops/test_perm102_bmm_rrr.py
index ba12ed1d1..a466d29f9 100644
--- a/tests/unittest/ops/test_perm102_bmm_rrr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rrr.py
@@ -39,7 +39,7 @@
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
@@ -75,7 +75,7 @@ def test_perm102_bmm_rrr(self, dtype="float16"):
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class Perm102BMMBiasTestCase(unittest.TestCase):
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
diff --git a/tests/unittest/ops/test_softmax.py b/tests/unittest/ops/test_softmax.py
index 9462d82dd..7f31604e9 100644
--- a/tests/unittest/ops/test_softmax.py
+++ b/tests/unittest/ops/test_softmax.py
@@ -64,7 +64,7 @@ def _test_softmax(
             torch.testing.assert_close(y_pt, y, atol=1e-2, rtol=1e-2)
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     ("dim_1_fp16", "float16", (1, 1024), (6,), 1),
diff --git a/tests/unittest/ops/test_split_getitem.py b/tests/unittest/ops/test_split_getitem.py
index 5068d0112..c0f2437b4 100644
--- a/tests/unittest/ops/test_split_getitem.py
+++ b/tests/unittest/ops/test_split_getitem.py
@@ -94,7 +94,7 @@ def _test_split_getitem(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
@@ -156,7 +156,7 @@ def _test_split_getitem_output(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],
@@ -246,7 +246,7 @@ def _test_split_multiple_getitems(
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
 
     @parameterized.expand(
-        filter_test_cases_by_params(
+        **filter_test_cases_by_params(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
                 TestEnv.CUDA_SM80: [("float32")],

From 4a1503739c697dcdab31b7ea73e7ea44f599a783 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Tue, 25 Apr 2023 18:16:38 -0700
Subject: [PATCH 444/638] Add MaxPool3d FE module (#595)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/595

Adds AIT MaxPool3d FE module

Reviewed By: terrychenism

Differential Revision: D45164152

fbshipit-source-id: 4e6628470c4751a3d62a1122372bccd31733a76e
---
 python/aitemplate/frontend/nn/pool3d.py | 115 ++++++++++++++++++++++++
 tests/unittest/ops/test_max_pool3d.py   | 107 ++++++++++++++++++++++
 2 files changed, 222 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/pool3d.py
 create mode 100644 tests/unittest/ops/test_max_pool3d.py

diff --git a/python/aitemplate/frontend/nn/pool3d.py b/python/aitemplate/frontend/nn/pool3d.py
new file mode 100644
index 000000000..db663a3f0
--- /dev/null
+++ b/python/aitemplate/frontend/nn/pool3d.py
@@ -0,0 +1,115 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+pool3d-family modules.
+"""
+from aitemplate.compiler.ops import max_pool2d
+from aitemplate.compiler.ops.common import reshape
+from aitemplate.frontend.nn.module import Module
+
+
+def identical_elem_tuple_to_int(param):
+    """
+    Convert tuples with all the same int elem to
+    a single int (ex. (3, 3, 3) --> 3)
+    """
+    if isinstance(param, int):
+        return param
+
+    if not isinstance(param, (list, tuple)) or not all(x == param[0] for x in param):
+        raise RuntimeError(f"AIT supports square param values only, but got {param}")
+    return param[0]
+
+
+class MaxPool3d(Module):
+    r"""Applies a 3D max pooling over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, D, H, W, C)`,
+    output :math:`(N, D_{out}, H_{out}, W_{out}, C)` and :attr:`kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \begin{aligned}
+            \text{out}(N_i, d, h, w, C_j) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
+                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
+                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window
+        padding: implicit zero padding to be added on both sides
+    """
+
+    def __init__(self, kernel_size, stride=None, padding=0):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if stride is not None else kernel_size
+        self.padding = padding
+
+    def forward(self, *args):
+        assert len(args) == 1
+        input_val = args[0]
+
+        if (
+            isinstance(self.kernel_size, tuple)
+            and isinstance(self.stride, tuple)
+            and isinstance(self.padding, tuple)
+        ):
+            kernel_size_tuple = self.kernel_size
+            stride_tuple = self.stride
+            padding_tuple = self.padding
+
+            assert (
+                kernel_size_tuple[0] == 1
+            ), "max_pool3d only supports kT == 1 currently"
+            assert stride_tuple[0] == 1, "max_pool3d only supports sT == 1 currently"
+            assert (
+                padding_tuple[0] == 0
+            ), "max_pool3d only supports T_padding == 0 currently"
+
+            kernel_size = identical_elem_tuple_to_int(kernel_size_tuple[1:])
+            stride = identical_elem_tuple_to_int(stride_tuple[1:])
+            padding = identical_elem_tuple_to_int(padding_tuple[1:])
+        elif (
+            isinstance(self.kernel_size, int)
+            and isinstance(self.stride, int)
+            and isinstance(self.padding, int)
+        ):
+            kernel_size = self.kernel_size
+            stride = self.stride
+            padding = self.padding
+        else:
+            raise RuntimeError("Only int or tuple types are supported")
+
+        N, D, H, W, C = input_val.shape()
+
+        reshape_op_0 = reshape()
+        shape_0 = (-1, H, W, C)
+        input_val = reshape_op_0(input_val, shape_0)
+
+        output = max_pool2d(kernel_size=kernel_size, stride=stride, pad=padding)(
+            input_val
+        )
+
+        _, H_o, W_o, _ = output.shape()
+        reshape_op_1 = reshape()
+        shape_1 = (N, D, H_o, W_o, C)
+
+        output = reshape_op_1(output, shape_1)
+        return output
diff --git a/tests/unittest/ops/test_max_pool3d.py b/tests/unittest/ops/test_max_pool3d.py
new file mode 100644
index 000000000..79be2b634
--- /dev/null
+++ b/tests/unittest/ops/test_max_pool3d.py
@@ -0,0 +1,107 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model
+
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.pool3d import MaxPool3d
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+
+
+class MaxPool3dTestCase(unittest.TestCase):
+    def _test_max_pool_3d(
+        self,
+        kernel_size,
+        stride,
+        padding,
+        pt_input_shape,
+        ait_input_shape,
+        dtype="float16",
+    ):
+        X_pt = get_random_torch_tensor(pt_input_shape, dtype=dtype)
+        OP_pt = (
+            torch.nn.MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+            .cuda()
+            .half()
+        )
+        Y_pt = OP_pt(X_pt)
+        X_ait = Tensor(
+            shape=ait_input_shape,
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        OP_ait = MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+        Y_ait = OP_ait(X_ait)
+
+        Y_ait._attrs["name"] = "output_0"
+        Y_ait._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y_ait, target, "./tmp", "max_pool3d")
+
+        x = X_pt.permute((0, 2, 3, 4, 1)).contiguous()
+        y = torch.empty_like(Y_pt).permute(0, 2, 3, 4, 1).contiguous()
+        module.run_with_tensors([x], [y])
+        y_transpose = y.permute((0, 4, 1, 2, 3))
+
+        self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
+
+    def test_max_pool_3d_fp16(self):
+        for batch in [1, 3]:
+            self._test_max_pool_3d(
+                kernel_size=(1, 3, 3),
+                stride=(1, 2, 2),
+                padding=(0, 1, 1),
+                pt_input_shape=[batch, 4, 8, 256, 256],
+                ait_input_shape=[batch, 8, 256, 256, 4],
+                dtype="float16",
+            )
+            self._test_max_pool_3d(
+                kernel_size=(1, 3, 3),
+                stride=(1, 2, 2),
+                padding=(0, 1, 1),
+                pt_input_shape=[batch, 4, 8, 256, 256],
+                ait_input_shape=[batch, 8, 256, 256, 4],
+                dtype="float16",
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
+    def test_max_pool_3d_fp32(self):
+        for batch in [1, 3]:
+            self._test_max_pool_3d(
+                kernel_size=(1, 3, 3),
+                stride=(1, 2, 2),
+                padding=(0, 1, 1),
+                pt_input_shape=[batch, 4, 8, 256, 256],
+                ait_input_shape=[batch, 8, 256, 256, 4],
+                dtype="float32",
+            )
+            self._test_max_pool_3d(
+                kernel_size=(1, 3, 3),
+                stride=(1, 2, 2),
+                padding=(0, 1, 1),
+                pt_input_shape=[batch, 4, 8, 256, 256],
+                ait_input_shape=[batch, 8, 256, 256, 4],
+                dtype="float32",
+            )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 24a5603dc802ed5a3157080504e78ce556132725 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Tue, 25 Apr 2023 20:15:19 -0700
Subject: [PATCH 445/638] Add GELU FE module (#600)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/600

Adds AIT GELU FE module

Reviewed By: terrychenism

Differential Revision: D45190335

fbshipit-source-id: ed39b63175b1aacb548ede6909042e95f9174c69
---
 python/aitemplate/frontend/nn/activation.py | 53 +++++++++++++
 tests/unittest/ops/test_activation.py       | 86 +++++++++++++++++++++
 tests/unittest/ops/test_nn_gelu.py          | 67 ++++++++++++++++
 3 files changed, 206 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/activation.py
 create mode 100644 tests/unittest/ops/test_nn_gelu.py

diff --git a/python/aitemplate/frontend/nn/activation.py b/python/aitemplate/frontend/nn/activation.py
new file mode 100644
index 000000000..7251d2906
--- /dev/null
+++ b/python/aitemplate/frontend/nn/activation.py
@@ -0,0 +1,53 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+activation modules.
+"""
+
+from aitemplate.compiler.public import elementwise, FuncEnum
+from aitemplate.frontend.nn.module import Module
+
+
+class GELU(Module):
+    r"""Applies the Gaussian Error Linear Units function:
+
+    .. math:: \text{GELU}(x) = x * \Phi(x)
+
+    where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution.
+
+    When the approximate argument is 'tanh', Gelu is estimated with:
+
+    .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt(2 / \pi) * (x + 0.044715 * x^3)))
+
+    Args:
+        approximate (str, optional): the gelu approximation algorithm to use:
+            ``'none'`` | ``'tanh'``. Default: ``'none'``
+    """
+
+    def __init__(self, approximate: str = "none"):
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, *args):
+        assert len(args) == 1
+        input_val = args[0]
+
+        # For extra speedup, lower to fast_gelu
+        if self.approximate == "tanh":
+            result = elementwise(FuncEnum.FASTGELU)(input_val)
+        else:
+            result = elementwise(FuncEnum.GELU)(input_val)
+
+        return result
diff --git a/tests/unittest/ops/test_activation.py b/tests/unittest/ops/test_activation.py
index aa7cf5227..8f78dcdcc 100644
--- a/tests/unittest/ops/test_activation.py
+++ b/tests/unittest/ops/test_activation.py
@@ -352,6 +352,58 @@ def _test_celu(
         module.run_with_tensors([x1_pt], [x2])
         torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
 
+    def _test_gelu(self, input_size, test_name="gelu", copy_op=False, dtype="float16"):
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.GELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.GELU()
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
+
+    def _test_fast_gelu(
+        self, input_size, test_name="fast_gelu", copy_op=False, dtype="float16"
+    ):
+        assert len(input_size) == 2
+        X1 = Tensor(
+            shape=[IntImm(input_size[0]), IntImm(input_size[1])],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        X2_op = ops.elementwise(FuncEnum.FASTGELU)
+        if copy_op:
+            X2_op = ops.elementwise(**X2_op._get_op_attributes())
+        X2 = X2_op(X1)
+        X2._attrs["is_output"] = True
+        X2._attrs["name"] = "output0"
+
+        target = detect_target()
+        module = compile_model(X2, target, "./tmp", f"{test_name}_{dtype}")
+        x1_pt = get_random_torch_tensor(input_size, dtype)
+        OP_pt = torch.nn.GELU(approximate="tanh")
+        x2_pt = OP_pt(x1_pt)
+
+        x2 = torch.empty_like(x2_pt)
+        module.run_with_tensors([x1_pt], [x2])
+        torch.testing.assert_close(x2, x2_pt, atol=1e-2, rtol=1e-2)
+
     @parameterized.expand(
         **filter_test_cases_by_params(
             {
@@ -733,6 +785,40 @@ def test_celu(self, dtype):
             [256, 128], alpha=1.0, test_name="celu_3_copy_op", copy_op=True, dtype=dtype
         )
 
+    @parameterized.expand(
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_gelu(self, dtype):
+        self._test_gelu([63, 63], test_name="gelu_1", dtype=dtype)
+        self._test_gelu([128, 128], test_name="gelu_2", dtype=dtype)
+        self._test_gelu([128, 256], test_name="gelu_3", dtype=dtype)
+        self._test_gelu(
+            [256, 128], test_name="gelu_4_copy_op", copy_op=True, dtype=dtype
+        )
+
+    @parameterized.expand(
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
+        )
+    )
+    def test_fast_gelu(self, dtype):
+        self._test_fast_gelu([63, 63], test_name="fast_gelu_1", dtype=dtype)
+        self._test_fast_gelu([128, 128], test_name="fast_gelu_2", dtype=dtype)
+        self._test_fast_gelu([128, 256], test_name="fast_gelu_3", dtype=dtype)
+        self._test_fast_gelu(
+            [256, 128], test_name="fast_gelu_4_copy_op", copy_op=True, dtype=dtype
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_nn_gelu.py b/tests/unittest/ops/test_nn_gelu.py
new file mode 100644
index 000000000..c252d6c87
--- /dev/null
+++ b/tests/unittest/ops/test_nn_gelu.py
@@ -0,0 +1,67 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model
+
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.activation import GELU
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class GELUTestCase(unittest.TestCase):
+    def _test_gelu(self, approximate, dtype="float16"):
+        input_shape = (3, 10, 20)
+
+        X_pt = get_random_torch_tensor(input_shape, dtype=dtype)
+        OP_pt = torch.nn.GELU(approximate=approximate).cuda().half()
+        Y_pt = OP_pt(X_pt)
+        X_ait = Tensor(
+            shape=input_shape,
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        OP_ait = GELU(approximate=approximate)
+        Y_ait = OP_ait(X_ait)
+
+        Ys_ait = Ys_ait = [var._attrs["values"][0] for var in Y_ait._attrs["shape"]]
+        self.assertEqual(list(Y_pt.shape), Ys_ait)
+
+        Y_ait._attrs["name"] = "output_0"
+        Y_ait._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y_ait, target, "./tmp", "gelu")
+
+        y = get_torch_empty_tensor(Ys_ait, dtype=dtype)
+        inputs = {"input0": X_pt}
+        module.run_with_tensors(inputs, [y])
+
+        self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
+
+    def test_gelu(self):
+        self._test_gelu(approximate="none")
+        self._test_gelu(approximate="tanh")
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From d6af8b048e8a5248b5721d9e31e04b068c37e657 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Wed, 26 Apr 2023 05:07:26 -0700
Subject: [PATCH 446/638] Fill MultiScaleBlock gap (#622)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/622

Fill MultiScaleBlock gap using pytorch benchmark.

Reviewed By: terrychenism

Differential Revision: D45088513

fbshipit-source-id: 20198db4663708a87e02f369ed34560493b82d2b
---
 .../frontend/nn/multiscale_attention.py       | 117 ++++++++++++------
 1 file changed, 81 insertions(+), 36 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index bd6c6c7ea..2c484f14c 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -19,18 +19,23 @@
 """
 
 import logging
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple
 
 import numpy
 
 from aitemplate.compiler import ops
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.public import permute
 from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.activation import GELU
+from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
 from aitemplate.frontend.nn.conv3d import Conv3d
 from aitemplate.frontend.nn.dropout import Dropout, DropPath
 from aitemplate.frontend.nn.identity import Identity
+from aitemplate.frontend.nn.layer_norm import LayerNorm
 from aitemplate.frontend.nn.linear import Linear
 from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.pool3d import MaxPool3d
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -40,6 +45,31 @@ def get_shape(x):
     return shape
 
 
+def ait_ncl2nlc(x):
+    return permute()(x, [0, 2, 1])
+
+
+def _unsqueeze_dims(x):
+    tensor_dim = len(get_shape(x))
+    if tensor_dim == 4:
+        pass
+    elif tensor_dim == 3:
+        x = ops.unsqueeze(dim=1)(x)
+    else:
+        raise NotImplementedError(f"Unsupported input dimension {get_shape(x)}")
+    return x, tensor_dim
+
+
+def _squeeze_dims(x, tensor_dim):
+    if tensor_dim == 4:
+        pass
+    elif tensor_dim == 3:
+        x = ops.squeeze(dim=1)(x)
+    else:
+        raise NotImplementedError(f"Unsupported input dimension {get_shape(x)}")
+    return x
+
+
 class Mlp(Module):
     """
     A MLP block that contains two linear layers with a normalization layer. The MLP
@@ -63,7 +93,7 @@ def __init__(
         in_features: int,
         hidden_features: Optional[int] = None,
         out_features: Optional[int] = None,
-        act_layer: str = "gelu",
+        act_layer: Module = GELU,
         dropout_rate: float = 0.0,
         bias_on: bool = True,
     ) -> None:
@@ -90,6 +120,7 @@ def __init__(
             hidden_features,
             bias=bias_on,
         )
+        self.act = act_layer()
         self.fc2 = Linear(hidden_features, out_features, bias=bias_on)
 
         if self.dropout_rate > 0.0:
@@ -103,14 +134,13 @@ def forward(self, x: Tensor) -> Tensor:
             x (tensor): Input tensor.
         """
         x = self.fc1(x)
+        x = self.act(x)
 
         assert self.dropout_rate == 0.0
 
         if self.dropout_rate > 0.0:
             x = self.dropout(x)
 
-        x = ops.elementwise(FuncEnum.GELU)(x)
-
         x = self.fc2(x)
 
         if self.dropout_rate > 0.0:
@@ -124,7 +154,7 @@ def __init__(
         self,
         pool: Optional[Module],
         has_cls_embed: bool,
-        norm: Optional[str],
+        norm: Optional[Module],
     ) -> None:
         """Apply pool to a flattened input (given pool operation and the unflattened shape).
 
@@ -154,13 +184,13 @@ def __init__(
 
         self.has_cls_embed = has_cls_embed
         if norm is not None:
-            self.norm_before_pool = norm == "BatchNorm3d" or norm == "Identity"
+            self.norm_before_pool = isinstance(norm, (BatchNorm3d, Identity))
             self.has_norm = True
             self.norm = norm
         else:
             self.norm_before_pool = False
             self.has_norm = False
-            self.norm = "Identity"
+            self.norm = Identity
 
     def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
         """
@@ -175,6 +205,8 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         if not self.has_pool:
             return tensor, thw_shape
 
+        tensor, tensor_dim = _unsqueeze_dims(tensor)
+
         assert not self.has_cls_embed
 
         if self.has_cls_embed:
@@ -193,12 +225,10 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         )
 
         if self.norm_before_pool:
-            # TODO: add batchnorm3d
-            # # If use BN, we apply norm before pooling instead of after pooling.
-            # tensor = self.norm(tensor)
-            # # We also empirically find that adding a GELU here is beneficial.
+            # If use BN, we apply norm before pooling instead of after pooling.
+            tensor = self.norm(tensor)
+            # We also empirically find that adding a GELU here is beneficial.
             tensor = ops.elementwise(FuncEnum.GELU)(tensor)
-            _LOGGER.warning(f"Unsupport batchnorm3d when {self.norm_before_pool}")
 
         tensor = self.pool(ops.permute()(tensor, [0, 2, 3, 4, 1]))
 
@@ -208,10 +238,9 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         tensor = ops.reshape()(tensor, [B, N, L_pooled, C])
 
         if self.has_norm and not self.norm_before_pool:
+            tensor = self.norm(tensor)
 
-            # TODO: add support for norm before pool
-            # tensor = self.norm(tensor)
-            _LOGGER.warning("Unsupport norm before pool")
+        tensor = _squeeze_dims(tensor, tensor_dim)
 
         return tensor, thw_shape
 
@@ -256,7 +285,7 @@ def __init__(
         kernel_kv=(1, 1, 1),
         stride_q=(1, 1, 1),
         stride_kv=(1, 1, 1),
-        norm_layer: str = "LayerNorm",
+        norm_layer: Callable = LayerNorm,
         has_cls_embed: bool = True,
         pool_mode: str = "conv",
         pool_first: bool = False,
@@ -362,7 +391,7 @@ def __init__(
                 else None
             )
 
-            self.norm_q = norm_layer if kernel_q is not None else None
+            self.norm_q = norm_layer(head_dim) if kernel_q is not None else None
             self.pool_k = (
                 Conv3d(
                     head_dim,
@@ -376,7 +405,7 @@ def __init__(
                 if kernel_kv is not None
                 else None
             )
-            self.norm_k = norm_layer if kernel_kv is not None else None
+            self.norm_k = norm_layer(head_dim) if kernel_kv is not None else None
             self.pool_v = (
                 Conv3d(
                     head_dim,
@@ -391,7 +420,7 @@ def __init__(
                 else None
             )
 
-            self.norm_v = norm_layer if kernel_kv is not None else None
+            self.norm_v = norm_layer(head_dim) if kernel_kv is not None else None
         else:
             raise NotImplementedError(f"Unsupported model {pool_mode}")
 
@@ -513,7 +542,6 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
             q, k, v = self._reshape_qkv_to_seq(q, k, v, q_N, v_N, k_N, B, C)
             q, k, v = self._qkv_proj(q, q_N, k, k_N, v, v_N, B, C)
         else:
-
             if self.separate_qkv:
                 q = k = v = x
                 pass
@@ -535,13 +563,16 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
             )
 
         # attention
+        q_shape = get_shape(q)
         B, num_heads, seqlen, head_dim = get_shape(q)
         score = ops.mem_eff_attention(causal=False)(q, k, v)
-        score = ops.reshape()(score, [B, seqlen, -1])
+        score = ops.reshape()(score, [B, seqlen, head_dim])
 
         if self.residual_pool:
             score = ops.elementwise(FuncEnum.ADD)(score, q)
 
+        score = ops.reshape()(ops.permute()(score, [0, 2, 1, 3]), [B, q_shape[-2], -1])
+
         score = self.proj(score)
         assert self.dropout_rate == 0.0
         if self.dropout_rate > 0.0:
@@ -590,9 +621,9 @@ def __init__(
         qkv_bias: bool = False,
         dropout_rate: float = 0.0,
         droppath_rate: float = 0.0,
-        act_layer: str = "gelu",
-        norm_layer: str = "LayerNorm",
-        attn_norm_layer: str = "LayerNorm",
+        act_layer: Module = GELU,
+        norm_layer: Module = LayerNorm,
+        attn_norm_layer: Module = LayerNorm,
         kernel_q=(1, 1, 1),
         kernel_kv=(1, 1, 1),
         stride_q=(1, 1, 1),
@@ -642,8 +673,11 @@ def __init__(
         super().__init__()
         self.dim = dim
         self.dim_out = dim_out
-        self.norm1 = norm_layer
+        self.norm1 = norm_layer(dim)
+        self.norm1_is_batchnorm_1d = isinstance(self.norm1, BatchNorm1d)
+        kernel_skip = [s + 1 if s > 1 else s for s in stride_q]
         stride_skip = stride_q
+        padding_skip = [int(skip // 2) for skip in kernel_skip]
         self.attn = MultiScaleAttention(
             dim,
             num_heads=num_heads,
@@ -663,9 +697,9 @@ def __init__(
             separate_qkv=separate_qkv,
             max_seq_len=seq_len,
         )
-        assert droppath_rate == 0.0
         self.drop_path = DropPath(droppath_rate) if droppath_rate > 0.0 else Identity()
-
+        self.norm2 = norm_layer(dim)
+        self.norm2_is_batchnorm_1d = isinstance(self.norm2, BatchNorm1d)
         mlp_hidden_dim = int(dim * mlp_ratio)
         self.has_cls_embed = has_cls_embed
         self.mlp = Mlp(
@@ -676,10 +710,16 @@ def __init__(
             dropout_rate=dropout_rate,
             bias_on=bias_on,
         )
+        if dim != dim_out:
+            self.proj = Linear(dim, dim_out, bias=bias_on)
+        else:
+            self.proj = Identity()
 
-        # TODO: Add maxpool3d
-        assert numpy.prod(stride_skip) == 1
-        self.pool_skip = None
+        self.pool_skip = (
+            MaxPool3d(tuple(kernel_skip), tuple(stride_skip), tuple(padding_skip))
+            if len(stride_skip) > 0 and numpy.prod(stride_skip) > 1
+            else None
+        )
         self._attention_pool = _AttentionPool(
             self.pool_skip, has_cls_embed=self.has_cls_embed, norm=None
         )
@@ -693,14 +733,19 @@ def forward(
             thw_shape (List): The shape of the input tensor (before flattening).
         """
         thw_shape = [t_shape, h_shape, w_shape]
-        x_block, thw_shape_new = self.attn(x, thw_shape)
-
+        x_norm = (
+            ait_ncl2nlc(self.norm1(ait_ncl2nlc(x)))
+            if self.norm1_is_batchnorm_1d
+            else self.norm1(x)
+        )
+        x_block, thw_shape_new = self.attn(x_norm, thw_shape)
         x_res, _ = self._attention_pool(x, thw_shape)
         x = x_res + self.drop_path(x_block)
-
-        # TODO: batchnorm 1d
-
-        x_norm = x
+        x_norm = (
+            ait_ncl2nlc(self.norm2(ait_ncl2nlc(x)))
+            if self.norm2_is_batchnorm_1d
+            else self.norm2(x)
+        )
         x_mlp = self.mlp(x_norm)
         if self.dim != self.dim_out:
             x = self.proj(x_norm)

From d761c944ca832a231c86505f1b38f886a8a5ea65 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Wed, 26 Apr 2023 11:03:17 -0700
Subject: [PATCH 447/638] Update infer_shape for argmax (#623)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/623

update infer_shape for argmax with symbolic_shape.

Reviewed By: terrychenism

Differential Revision: D45256173

fbshipit-source-id: bc6a6facd3085db8265c50ce71389957ba5adfea
---
 .../aitemplate/compiler/ops/tensor/argmax.py  | 24 +------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/argmax.py b/python/aitemplate/compiler/ops/tensor/argmax.py
index 2bf5922c5..e72e79a0c 100644
--- a/python/aitemplate/compiler/ops/tensor/argmax.py
+++ b/python/aitemplate/compiler/ops/tensor/argmax.py
@@ -29,7 +29,6 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import Operator, Tensor
-from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
@@ -65,30 +64,9 @@ def __init__(self, dim=0) -> None:
         self._attrs["workspace"] = 0
         self.exec_key_template = EXEC_KEY_TEMPLATE
 
-    def _infer_shape(self, x: List[int]):
-        """Infer the output shape"""
-        output = list(x)[:-1]
-        return output
-
     def _infer_shapes(self, x: Tensor):
         """Infer the output shape"""
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        # run infershape for each
-        y_shapes = []
-        for x_shape in x_shapes:
-            y_shape = self._infer_shape(x_shape)
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
-        output_shape = []
-        for idx in range(len(y_shapes[0])):
-            output_shape.append(
-                shape_utils.gen_int_var(values=unique([d[idx] for d in y_shapes]))
-            )
-        return output_shape
+        return x._attrs["shape"][:-1]
 
     def __call__(self, x: Tensor) -> Tensor:
         """call the op

From 1e60f7b28f2f6ddea4db660a9adb75c9adb00631 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Wed, 26 Apr 2023 12:44:08 -0700
Subject: [PATCH 448/638] move use_cuda from global to class CrossAttention
 level (#626)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/626

Reviewed By: wushirong

Differential Revision: D45319471

fbshipit-source-id: caa363d6068d3da578594c19ce24c5a2e6807348
---
 python/aitemplate/frontend/nn/ldm/clip.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/frontend/nn/ldm/clip.py b/python/aitemplate/frontend/nn/ldm/clip.py
index 1a95314d4..b00e2782a 100644
--- a/python/aitemplate/frontend/nn/ldm/clip.py
+++ b/python/aitemplate/frontend/nn/ldm/clip.py
@@ -21,8 +21,6 @@
 
 # pylint: disable=W0102
 
-USE_CUDA = detect_target().name() == "cuda"
-
 
 def get_shape(x):
     shape = [it.value() for it in x._attrs["shape"]]
@@ -56,6 +54,7 @@ def __init__(
         self.scale = dim_head**-0.5
         self.heads = heads
         self.dim_head = dim_head
+        self.use_cuda = detect_target().name() == "cuda"
 
         self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
         self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
@@ -68,7 +67,7 @@ def forward(self, x, context=None, mask=None, residual=None):
         nheads = self.heads
         d = self.dim_head
 
-        layout = "20314" if USE_CUDA else "m2n3"
+        layout = "20314" if self.use_cuda else "m2n3"
 
         bs, seqlen, _ = get_shape(x)
         q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
@@ -84,7 +83,7 @@ def forward(self, x, context=None, mask=None, residual=None):
             ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
         )
 
-        if USE_CUDA:
+        if self.use_cuda:
             attn_op = ops.mem_eff_attention(causal=False)
             out = attn_op(
                 (ops.reshape()(q, [bs, nheads, -1, d])),

From ad45691306e5108b5f31de9ee49c1c6cc75ef9e5 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 26 Apr 2023 16:44:41 -0700
Subject: [PATCH 449/638] Introduce AIT_USE_FAST_MATH environment flag (#627)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/627

Whether the fast math option should be used for the device code generation. Fast math implies the use of approximate math operations (say, a division operation), allowing to gain speed at the cost of accuracy. Default value is "1".

Reviewed By: aakhundov

Differential Revision: D45321954

fbshipit-source-id: 9df3583eef5f7284176d338100f6702bace07a90
---
 docs/source/reference/env.rst                |  2 ++
 python/aitemplate/backend/cuda/target_def.py |  6 ++++--
 python/aitemplate/utils/environ.py           | 10 ++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index db3f6604c..012f3247e 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -48,3 +48,5 @@ Miscellaneous
 **LOGLEVEL**: It is used to control the logging level in Python. The default value is "INFO". "DEBUG" is useful for debugging.
 
 **AIT_PLOT_SHORTEN_TENSOR_NAMES**: If set to "1", shorten too long tensor names for a plot of a model graph, thus making a plot much easier to analyze visually. "0" by default.
+
+**AIT_USE_FAST_MATH**: If set to "0", no fast math option will be used for the device code generation. Default value is "1".
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 8ba5f2afb..7ceef2323 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -120,11 +120,12 @@ def _build_compile_options(self):
             environ.get_compiler_opt_level(),
             "-std=c++17",
             "--expt-relaxed-constexpr",
-            "--use_fast_math",
             f"-I{ait_static_path}",
         ] + ["-I" + path for path in cutlass_path]
         if self._ndebug == 1:
             options.append("-DNDEBUG")
+        if environ.use_fast_math():
+            options.append("--use_fast_math")
         return " ".join(options)
 
     def src_extension(self):
@@ -277,7 +278,6 @@ def _build_compile_options(self):
                     "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
                     "-w",
                     "--expt-relaxed-constexpr",
-                    "--use_fast_math",
                     f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
                     "-Xcompiler=-Wconversion",
                     environ.get_compiler_opt_level(),
@@ -286,6 +286,8 @@ def _build_compile_options(self):
             )
             if self._ndebug == 1:
                 options.append("-DNDEBUG")
+            if environ.use_fast_math():
+                options.append("--use_fast_math")
             FBCUDA.compile_options_ = " ".join(options)
         compile_options = FBCUDA.compile_options_
         _LOGGER.info(f"The compile options are: {compile_options}")
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 36793d005..511e589e7 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -36,6 +36,16 @@ def get_compiler_opt_level() -> str:
     return compiler_opt
 
 
+def use_fast_math() -> str:
+    """
+    Whether the fast math option should be used for the device code generation.
+    Fast math implies the use of approximate math operations (say,
+    a division operation), allowing to gain speed at the cost of accuracy.
+    Default value is "1".
+    """
+    return os.getenv("AIT_USE_FAST_MATH", "1") == "1"
+
+
 def force_profiler_cache() -> bool:
     """
     Force the profiler to use the cached results. The profiler will throw

From c0dd23ec7f34ca27f5e88cf9750f125cd5721081 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@meta.com>
Date: Thu, 27 Apr 2023 03:58:51 -0700
Subject: [PATCH 450/638] Apply `transform_permute_to_reshape` when input
 tensor has smaller than required rank (#619)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/619

After `transform_strided_ops` pass, shape of tensor consumed by permute ops might not match the length of the permutation. For example:
- initial graph is `permute021(unsqueeze(X, 2))` with `X.shape == (B, N)`
- `transform_strided_ops` pass eliminates `unsqueeze`, changing permute input shape from `(B, N, 1)` to `(B, N)`
- as a result, `permute021` is applied on rank-2 tensor
- **problem**: subsequent `transorm_permute_to_reshape` can't convert such permute to reshape

This diff extends `transorm_permute_to_reshape` pass to also convert such permutes to reshape. The rule is:
- if permute op has input accessors, use its original shape instead of current input shape to infer if the op can be converted to reshape or not
- If strided dimension is touched by the permutation, don't convert to reshape

Note that such situations can't happen when permute first included into the graph with `__call__`, as it would fail on [assert](https://github.com/facebookincubator/AITemplate/blob/d5c84d2cb214092e527d202c7cc409ddce10128e/python/aitemplate/compiler/ops/tensor/permute021.py#L62). But `transform_strided_ops` most probably changes ops input without re-executing `__call__`.

Reviewed By: chenyang78, aakhundov

Differential Revision: D44668349

fbshipit-source-id: 50f77c6ea2eaee172437b687aac3636e53b0c65a
---
 .../transform/transform_permute_to_reshape.py | 21 +++++-
 .../test_transform_permute_to_reshape.py      | 64 ++++++++++++-------
 2 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_permute_to_reshape.py b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
index 67bb456e4..1aa21242e 100644
--- a/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
+++ b/python/aitemplate/compiler/transform/transform_permute_to_reshape.py
@@ -46,7 +46,10 @@ def _check_permute_to_reshape(op: Operator) -> bool:
         op._attrs["op"], len(inputs)
     )
 
-    input_shape = inputs[0].shape()
+    if "input_accessors" in op._attrs:
+        input_shape = op._attrs["input_accessors"][0].original_shapes
+    else:
+        input_shape = inputs[0].shape()
 
     if op._attrs["op"] == "permute":
         permutation = list(op._attrs["dims"])
@@ -63,7 +66,12 @@ def _check_permute_to_reshape(op: Operator) -> bool:
         raise NotImplementedError(
             f"Not implemented for permute operation: {op._attrs['op']}"
         )
-
+    if "input_accessors" in op._attrs:
+        # Can't convert permute to reshape if one of the dimensions included
+        # in permutation is strided
+        ta = op._attrs["input_accessors"][0]
+        if ta.is_from_strided_tensor and ta.stride_dim in permutation:
+            return False
     # Get non-singular dimension indices
     permutation = [
         dim_idx
@@ -85,8 +93,15 @@ def transform_permute_to_reshape(
     it's basically a reshape op, i.e. the underlying memory layout
     does not change.
 
-    Example:
+    If a permute op has a non-empty input tensor accessor, its original shape
+    should be used to determine whether it can be converted to reshape.
+    In this case the shape of the actual input tensor might not match the rank
+    of the permutation (but the original shape does) - see the second
+    example below.
+
+    Examples:
         [256x5x1x32] -> [256x5x32x1] (with 0132) is a reshape
+        [256x5x32] -> [256x5x1x32] (with 0132) is a reshape
         [256x1x5x1x32] -> [256x5x32x1x1] (with 02431) is a reshape
         [256x5x1x32] -> [256x32x5x1] (with 0312) is not a reshape
 
diff --git a/tests/unittest/compiler/test_transform_permute_to_reshape.py b/tests/unittest/compiler/test_transform_permute_to_reshape.py
index c72aa147b..46f8b28d6 100644
--- a/tests/unittest/compiler/test_transform_permute_to_reshape.py
+++ b/tests/unittest/compiler/test_transform_permute_to_reshape.py
@@ -14,6 +14,7 @@
 #
 import re
 import unittest
+from typing import List
 
 import torch
 
@@ -55,37 +56,53 @@ class TransformPermuteToReshapeTestCase(unittest.TestCase):
     @parameterized.expand(
         [
             # no singleton
-            ([32, 51, 12], [1, 2, 0], False, "float16"),
-            ([32, 51, 12], [1, 2, 0], False, "float32"),
+            ([32, 51, 12], [1, 2, 0], False, False, "float16"),
+            ([32, 51, 12], [1, 2, 0], False, False, "float32"),
             # one singleton dimension
-            ([32, 51, 1], [0, 2, 1], True, "float16"),
-            ([32, 51, 1], [0, 2, 1], True, "float32"),
-            ([32, 51, 1], [1, 2, 0], False, "float16"),
-            ([32, 51, 1], [1, 2, 0], False, "float32"),
+            ([32, 51, 1], [0, 2, 1], True, False, "float16"),
+            ([32, 51, 1], [0, 2, 1], True, False, "float32"),
+            ([32, 51, 1], [1, 2, 0], False, False, "float16"),
+            ([32, 51, 1], [0, 2, 1], True, True, "float16"),
+            ([32, 51, 1], [1, 2, 0], False, True, "float16"),
             # two same sized dimensions
-            ([32, 32, 1], [2, 0, 1], True, "float16"),
-            ([32, 32, 1], [2, 0, 1], True, "float32"),
-            ([32, 32, 1], [1, 0, 2], False, "float16"),
-            ([32, 32, 1], [1, 0, 2], False, "float32"),
+            ([32, 32, 1], [2, 0, 1], True, False, "float16"),
+            ([32, 32, 1], [1, 0, 2], False, False, "float16"),
             # double singleton dimension
-            ([32, 1, 51, 1], [3, 0, 2, 1], True, "float16"),
-            ([32, 1, 51, 1], [3, 0, 2, 1], True, "float32"),
-            ([32, 1, 51, 1], [2, 3, 1, 0], False, "float16"),
-            ([32, 1, 51, 1], [2, 3, 1, 0], False, "float32"),
+            ([32, 1, 51, 1], [3, 0, 2, 1], True, False, "float16"),
+            ([32, 1, 51, 1], [2, 3, 1, 0], False, False, "float16"),
             # IntVar dimension
-            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, "float16"),
-            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, "float32"),
-            ([IntVar([1, 10]), 32, 1, 51], [2, 3, 0, 1], False, "float16"),
-            ([IntVar([1, 10]), 32, 1, 51], [2, 3, 0, 1], False, "float32"),
+            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, False, "float16"),
+            ([IntVar([1, 10]), 32, 51, 1], [0, 1, 3, 2], True, True, "float16"),
+            ([IntVar([1, 10]), 32, 1, 51], [0, 2, 1, 3], True, False, "float32"),
+            ([IntVar([1, 10]), 32, 1, 51], [2, 3, 0, 1], False, False, "float16"),
             # other
-            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, "float16"),
-            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, "float32"),
+            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, False, "float16"),
+            ([3, 1, 113, 15, 64], [0, 1, 2, 4, 3], False, False, "float32"),
         ]
     )
-    def test_permute_to_reshape(self, shape, permutation, is_reshape, dtype):
+    def test_permute_to_reshape(
+        self,
+        shape: List[int],
+        permutation: List[int],
+        is_reshape: bool,
+        squeeze_trailing_dim: bool,
+        dtype: str,
+    ):
         target = detect_target()
 
-        X = Tensor(shape, dtype=dtype, is_input=True, name="x")
+        if squeeze_trailing_dim:
+            # Simulate situation when the rank of the input tensor doesn't
+            # match the permutation length, and transform_permute_to_reshape
+            # needs to take into account the original shape of the
+            # corresponsing tensor accessor. This could happen after fusion of
+            # permute and view op by transform_strided_ops pass.
+            # We test it by providing an input tensor with last dimension 1 and
+            # unsqueezing it before passing to permute
+            assert shape[-1] == 1
+            X0 = Tensor(shape[:-1], dtype=dtype, is_input=True, name="x")
+            X = ops.unsqueeze(len(shape) - 1)(X0)
+        else:
+            X = Tensor(shape, dtype=dtype, is_input=True, name="x")
         Z = ops.softmax()(ops.permute()(X, dims=permutation), -1)
         Z._attrs["is_output"] = True
         Z._attrs["name"] = "z"
@@ -112,6 +129,9 @@ def test_permute_to_reshape(self, shape, permutation, is_reshape, dtype):
         x_pt = get_random_torch_tensor(shape, dtype)
         z_pt = torch.softmax(torch.permute(x_pt, tuple(permutation)), dim=-1)
         z_ait = torch.empty_like(z_pt)
+        if squeeze_trailing_dim:
+            # Same as what we did with AIT input tensor X above
+            x_pt = x_pt.squeeze(-1)
         module.run_with_tensors({"x": x_pt}, {"z": z_ait})
 
         torch.testing.assert_close(z_ait, z_pt, atol=1e-1, rtol=1e-1)

From 73624b8878b045664a5cb95205010df84083a5ba Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Thu, 27 Apr 2023 04:36:05 -0700
Subject: [PATCH 451/638] use current stream in mem_eff_attention kernel (#628)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/628

propagate the current CUDA stream to mem_eff_attention kernel.

Reviewed By: terrychenism

Differential Revision: D45316111

fbshipit-source-id: 47e9a5f0a074ab34a5cf7c9a61211852ef0b5907
---
 .../aitemplate/backend/cuda/attention/mem_eff_attention.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
index 379010662..dafe5bdee 100644
--- a/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/cuda/attention/mem_eff_attention.py
@@ -182,7 +182,7 @@
            " at " + __FILE__ + ": " + std::to_string(__LINE__);
       throw std::runtime_error(error_msg);
     }
-    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
 
     cudaError_t err = cudaDeviceSynchronize();
 
@@ -686,7 +686,7 @@
 {{indent}}    {{fixed_seq_length_q}},
 {{indent}}    {{lengths_q}},
 {{indent}}    global_workspace_,
-{{indent}}    stream /* default stream */
+{{indent}}    {{stream}}
 {{indent}});
     """
 )
@@ -766,7 +766,10 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
 
     head_size_v = v._attrs["shape"][3]._attrs["values"][0]
 
+    backend_spec = CUDASpec()
+
     return FUNC_CALL_TEMPLATE.render(
+        stream=backend_spec.stream,
         func_name=func_attrs["name"],
         output=output_name,
         query=q_name,

From 62426ccad001cfc3bd846559bd4c22972a271c9e Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Thu, 27 Apr 2023 08:55:43 -0700
Subject: [PATCH 452/638] Minor fix to makefile normalization (#630)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/630

The recently changed build cache makefile normalization did not correctly take into account that a fb_include file is written at a random path and referenced in the Makefile. This can lead to cache misses. This diff fixes that.

Reviewed By: alexanderguzhva

Differential Revision: D45353978

fbshipit-source-id: 1f85de4015f6b203fcef0a3562a36b3aab8ae828
---
 python/aitemplate/backend/build_cache_base.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/build_cache_base.py b/python/aitemplate/backend/build_cache_base.py
index 227b54804..a3b78d06a 100644
--- a/python/aitemplate/backend/build_cache_base.py
+++ b/python/aitemplate/backend/build_cache_base.py
@@ -17,6 +17,7 @@
 import logging
 import os
 import random
+import re
 import secrets
 import shlex
 import shutil
@@ -342,7 +343,9 @@ def cleanup(self, retention_hours: int = 72):
         """
         pass
 
-    def makefile_normalizer(self, path, memoize_replacements=True) -> Optional[bytes]:
+    def makefile_normalizer(
+        self, path, memoize_replacements=True, debug=False
+    ) -> Optional[bytes]:
         """
         Normalizes the content of the makefile for hashing purposes (nothing else!),
         so that it can be compared to other Makefiles
@@ -389,7 +392,13 @@ def makefile_normalizer(self, path, memoize_replacements=True) -> Optional[bytes
 
         for search, replace in replacements.items():
             makefile_content = makefile_content.replace(search, replace)
-        return makefile_content.encode("utf-8")
+        makefile_content = re.sub(
+            r"[^/\\]+[/\\]fb_include", "fb_include", makefile_content
+        )
+        makefile_bytes = makefile_content.encode("utf-8")
+        if debug:
+            (p.parent / (p.name + ".normalized")).write_bytes(makefile_bytes)
+        return makefile_bytes
 
 
 class NoBuildCache(BuildCache):

From 7e930fd84989a7513880acfea49e089ee60e9a62 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 27 Apr 2023 09:03:05 -0700
Subject: [PATCH 453/638] Add SM90 CUTLASS 3.x kernels to gemm_rcr (#617)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/617

CUTLASS 3.x SM90 kernel support added to the `gemm_rcr` op. Infrastructure for gradual adoption of the SM90 kernels is established in the CUDA GEMM backend (mostly, `backend/cuda/gemm_universal/common.py`. Enabling SM90 kernel support for the following ops is supposed to reuse (and, possibly extend) this infrastructure.

Reviewed By: ipiszy, chenyang78

Differential Revision: D45193614

fbshipit-source-id: 83cbd22ed8a47928e1cc3bb1fe7878736308e855
---
 .../backend/cuda/gemm_universal/common.py     | 242 +++++++++++++++---
 .../gemm_universal/common_bias_broadcast.py   |   1 +
 .../backend/cuda/gemm_universal/gemm_rcr.py   | 103 ++++++--
 .../cuda/gemm_universal/group_common.py       |   7 +-
 .../ops/gemm_universal/gemm_common.py         |  11 +
 tests/unittest/ops/test_gemm.py               |  63 +++++
 6 files changed, 363 insertions(+), 64 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 0f23cb1ee..4cfa4d64c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -123,6 +123,12 @@
 """
 )
 
+INSTANCE_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+{{config}}
+using {{name}} = cutlass::gemm::device::GemmUniversalAdapter<{{config_name}}>;
+"""
+)
 
 SRC_TEMPLATE = jinja2.Template(
     """
@@ -130,6 +136,7 @@
 #include <memory>
 #include <random>
 #include <vector>
+
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/gemm/kernel/gemm_grouped.h"
@@ -140,6 +147,13 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
 using bfloat16 = nv_bfloat16;
 
 {{extra_code}}
@@ -217,11 +231,18 @@
 {{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator;
 
 {{indent}}using coord_t = cutlass::gemm::GemmCoord::Index;
-{{indent}}typename {{instance}}::Arguments arguments{
+{{indent}}typename {{instance}}::Arguments arguments;
 
+{{indent}}if constexpr (cutlass::gemm::detail::IsCutlass3GemmKernel<typename {{instance}}::GemmKernel>::value) {
+{{indent}}arguments = {
+{{problem_args_cutlass_3x}}
+{{indent}}};
+{{indent}}} else {
+{{indent}}arguments = {
 {{problem_args}}
-
 {{indent}}};
+{{indent}}}
+
 {% if is_profiler %}
 {{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments);
 {{indent}}cutlass::device_memory::allocation<uint8_t> local_workspace(workspace_size);
@@ -602,7 +623,7 @@
 
 KERNEL_KEY_TEMPLATE = jinja2.Template(
     """
-cutlass_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}}
+cutlass{{prefix}}_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}}
 """
 )
 
@@ -643,6 +664,27 @@ def get_gemm_instance_template_params(
     return gemm_universal_params
 
 
+def get_tensor_accessor_alignments(func_attrs):
+    """Infer the A, B, and epilogue alignments from the respective TAs."""
+    input_accessors = func_attrs["input_accessors"]
+    a_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
+        input_accessors[0]
+    )
+    b_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
+        input_accessors[1]
+    )
+    output_accessor = func_attrs["output_accessors"][0]
+    epilogue_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
+        output_accessor
+    )
+
+    # if the last dim is dynamic, force align=1
+    if not isinstance(output_accessor.original_shapes[-1], IntImm):
+        epilogue_alignment = 1
+
+    return a_alignment, b_alignment, epilogue_alignment
+
+
 def update_alignments_in_gemm_instance(
     op_def: str,
     func_attrs: Dict[str, Any],
@@ -660,22 +702,10 @@ def update_alignments_in_gemm_instance(
     if for_profiler:
         return op_def
 
-    input_accessors = func_attrs["input_accessors"]
-    a_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
-        input_accessors[0]
-    )
-    b_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
-        input_accessors[1]
-    )
-    output_accessor = func_attrs["output_accessors"][0]
-    epilogue_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor(
-        output_accessor
+    a_alignment, b_alignment, epilogue_alignment = get_tensor_accessor_alignments(
+        func_attrs
     )
 
-    # if the last dim is dynamic, force align=1
-    if not isinstance(output_accessor.original_shapes[-1], IntImm):
-        epilogue_alignment = 1
-
     gemm_params = get_gemm_instance_template_params(op_def, kernel_config)
     epilogue_align_idx = 11
     a_align_idx = 17
@@ -706,8 +736,19 @@ def _replace_align(align_idx, curr_align, alignment):
 
 
 def universal_gemm_instance(
-    op_def: str, func_attrs: Dict[str, Any], for_profiler: bool
+    op_def: str,
+    func_attrs: Dict[str, Any],
+    for_profiler: bool,
+    cutlass_3x: bool = False,
 ) -> str:
+    if cutlass_3x:
+        # We don't need to make any adjustments to the emitted
+        # CUTLASS 3.x op definitions. In particular, the alignments
+        # should not be updated, as the op instances incompatible
+        # with the TA-specified alignments have been removed from
+        # consideration by the filter_cutlass_3x_ops function.
+        return op_def
+
     op_def = update_alignments_in_gemm_instance(op_def, func_attrs, for_profiler)
     tmp = op_def.replace(
         "cutlass::gemm::device::Gemm", "cutlass::gemm::device::GemmUniversal"
@@ -728,7 +769,13 @@ def kernel_name(op):
     layout = op.layout_name()
     align_ab = op.A.alignment
     align_c = op.C.alignment
+    prefix = ""
+    if op.prefix != "":
+        kernel_schedule = library.KernelScheduleSuffixes[op.kernel_schedule]
+        epilogue_schedule = library.EpilogueScheduleSuffixes[op.epilogue_schedule]
+        prefix = f"{op.prefix}{kernel_schedule}{epilogue_schedule}"
     name = KERNEL_KEY_TEMPLATE.render(
+        prefix=prefix,
         threadblock=threadblock,
         extended_name=extended_name,
         opcode_class_name=opcode_class_name,
@@ -748,25 +795,42 @@ def emit_instance(
 ):
     import cutlass_lib
 
-    emitter = cutlass_lib.gemm_operation.EmitGemmInstance()
-    if emit_kernel:
-        emitter = cutlass_lib.gemm_operation.EmitGemmUniversalInstance()
+    cutlass_3x = op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x
+    if cutlass_3x:
+        emitter = cutlass_lib.gemm_operation.EmitGemmUniversal3xInstance()
+    else:
+        emitter = cutlass_lib.gemm_operation.EmitGemmInstance()
+        if emit_kernel:
+            emitter = cutlass_lib.gemm_operation.EmitGemmUniversalInstance()
+
     op_def = emitter.emit(op)
-    op_def = f_instance_convertor(op_def, func_attrs, for_profiler)
+    op_def = f_instance_convertor(
+        op_def=op_def,
+        func_attrs=func_attrs,
+        for_profiler=for_profiler,
+        cutlass_3x=cutlass_3x,
+    )
+
     return op_def
 
 
-def extract_config(f_proc_op, f_kernel_name=kernel_name):
+def extract_config(
+    f_proc_op,
+    f_kernel_name=kernel_name,
+    include_cutlass_3x_ops=False,
+):
     import cutlass_lib
 
     op_kind = cutlass_lib.library.OperationKind.Gemm
-    gemm_kind = cutlass_lib.library.GemmKind.Universal
+    gemm_kinds = {cutlass_lib.library.GemmKind.Universal}
+    if include_cutlass_3x_ops:
+        gemm_kinds.add(cutlass_lib.library.GemmKind.Universal3x)
     gemm_ops = OrderedDict()
     extract_ops = list(Target.current()._operators[op_kind].items())
 
     for _, value in extract_ops:
         op = value[0]
-        if op.gemm_kind == gemm_kind:
+        if op.gemm_kind in gemm_kinds:
             ret = f_proc_op(op)
             if len(ret) > 0:
                 for op_inst in ret:
@@ -775,9 +839,16 @@ def extract_config(f_proc_op, f_kernel_name=kernel_name):
     return gemm_ops
 
 
-def extract_config_name(config):
-    pattern = re.compile(r"\s*using\s(.*?)\s=")
-    decl = config.split("\n")[2]
+def extract_config_name(
+    config,
+    cutlass_3x=False,
+):
+    if cutlass_3x:
+        pattern = re.compile(r"\s*struct\s(.*?)\s:")
+        decl = [line for line in config.split("\n") if "struct " in line][-1]
+    else:
+        pattern = re.compile(r"\s*using\s(.*?)\s=")
+        decl = config.split("\n")[2]
     match = pattern.match(decl)
     if match is None:
         raise RuntimeError("Invalid config: \n" + config)
@@ -799,6 +870,7 @@ def gen_function(
     input_addr_calculator="",
     output_addr_calculator="",
     extra_code="",
+    problem_args_cutlass_3x="",
 ):
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
@@ -813,9 +885,11 @@ def gen_function(
     inst_def_flag = set()
     instances = {}
     instance_decl = ""
+    exec_cond_to_cutlass_3x = {}
     for exec_item in exec_path.values():
         fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest()
         algo = exec_item.algo
+        cutlass_3x = algo.startswith("cutlass3x")
         if algo not in inst_def_flag:
             config = emit_instance(
                 op_instance[algo],
@@ -827,25 +901,43 @@ def gen_function(
             inst_def_flag.add(algo)
         else:
             config = ""
-        inst = INSTANCE_TEMPLATE.render(
-            config=config, name=fname, config_name=extract_config_name(config)
+        instance_template = (
+            INSTANCE_TEMPLATE_CUTLASS_3X if cutlass_3x else INSTANCE_TEMPLATE
+        )
+        inst = instance_template.render(
+            config=config,
+            name=fname,
+            config_name=extract_config_name(
+                config,
+                cutlass_3x=cutlass_3x,
+            ),
         )
         instances[exec_item.exec_cond] = inst
+        exec_cond_to_cutlass_3x[exec_item.exec_cond] = cutlass_3x
         instance_decl += inst
     shape_eval_func = gemm_common.gen_shape_eval_code(
         indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
     exec_paths = ""
-    for key in instances:
-        fname = "f" + sha1(key.encode()).hexdigest()
+    for exec_cond in instances:
+        fname = "f" + sha1(exec_cond.encode()).hexdigest()
+        cutlass_3x = exec_cond_to_cutlass_3x[exec_cond]
         program = EXEC_TEMPLATE.render(
             indent="    ",
             instance=fname,
-            problem_args=problem_args,
+            # need to omit irrelevant problem_args here as in
+            # non-templated function both CUTLASS 2.x and 3.x
+            # code branches are syntactically checked
+            problem_args=(problem_args if not cutlass_3x else ""),
+            problem_args_cutlass_3x=(problem_args_cutlass_3x if cutlass_3x else ""),
             support_split_k=support_split_k,
         )
-        exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
+        exec_inst = exec_cond_template.render(
+            indent="  ",
+            cond=exec_cond,
+            program=program,
+        )
         exec_paths += exec_inst
     input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render(
         input_ndims=input_ndims,
@@ -913,6 +1005,45 @@ def add_profiler(file_pairs, workdir, op_type, output_name, code):
         file_pairs.append((src_path, obj_path))
 
 
+def filter_cutlass_3x_ops(op_instance, func_attrs):
+    """Filter out CUTLASS 3.x ops with incompatible alignment requirements.
+
+    The CUTLASS 3.x ops have stricter alignment requirements compared to
+    the CUTLASS 2.x ops (due to TMA). These alignment requirements are used
+    to initially filter them out in the `function_filter` below. However, the
+    required alignments of the GEMM op inputs and outputs may change due to
+    TensorAccessor-related optimizations, which are introduced to the model
+    graph *after* the initial filtering.
+
+    In this function, the (possible) TA-related alignment updates are checked
+    once again and the CUTLASS 3.x ops not satisfying these requirements are
+    filtered out. Importantly, due to input/output alignment flexibilit of the
+    CUTLASS 2.x ops, their alignment requirements are corrected using the
+    TA-imposed alignments in the `update_alignments_in_gemm_instance` function
+    above. But this correction is not possible for the CUTLASS 3.x ops, as they
+    won't work with the lower alignment values. That's why the CUTLASS 3.x ops
+    are filtered out by this function in such cases.
+    """
+    import cutlass_lib
+
+    a_alignment, b_alignment, epilogue_alignment = get_tensor_accessor_alignments(
+        func_attrs
+    )
+
+    result = {}
+    for op_name, op in op_instance.items():
+        if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
+            if (
+                op.A.alignment > a_alignment
+                or op.B.alignment > b_alignment
+                or op.C.alignment > epilogue_alignment
+            ):
+                continue
+        result[op_name] = op
+
+    return result
+
+
 def gen_profiler(
     func_attrs,
     workdir,
@@ -925,9 +1056,14 @@ def gen_profiler(
     output_addr_calculator="",
     bias_ptr_arg=None,
     extra_code="",
+    problem_args_template_cutlass_3x=None,
 ):
+    import cutlass_lib
+
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    op_instance = filter_cutlass_3x_ops(op_instance, func_attrs)
+
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][0]._attrs["dtype"]
@@ -957,6 +1093,14 @@ def gen_profiler(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
         ),
+        problem_args_cutlass_3x=(
+            problem_args_template_cutlass_3x.render(
+                elem_input_type=elem_input_type,
+                elem_output_type=elem_output_type,
+            )
+            if problem_args_template_cutlass_3x is not None
+            else ""
+        ),
     )
     input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render(
         input_ndims=ndims,
@@ -969,11 +1113,19 @@ def gen_profiler(
     benchmark_instances = []
     for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = emit_instance(op, for_profiler=True)
-        config_name = extract_config_name(config)
         instance_name = f"{instance_name_base}_{instance_idx}"
         gemm_op = f"gemm_op_{instance_idx}"
-        instance = INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=instance_name, config=config
+        cutlass_3x = op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x
+        instance_template = (
+            INSTANCE_TEMPLATE_CUTLASS_3X if cutlass_3x else INSTANCE_TEMPLATE
+        )
+        instance = instance_template.render(
+            config_name=extract_config_name(
+                config,
+                cutlass_3x=cutlass_3x,
+            ),
+            name=instance_name,
+            config=config,
         )
         benchmark_instance = BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
@@ -1152,6 +1304,7 @@ def default_fproc(
         cutlass_lib.library.DataTypeTag[op.A.element] == data_type
         and cutlass_lib.library.DataTypeTag[op.B.element] == data_type
         and cutlass_lib.library.DataTypeTag[op.C.element] == data_type
+        and cutlass_lib.library.DataTypeTag[op.D.element] == data_type
         and op.accumulator_type() == acc_type
         and op.A.layout == a_layout
         and op.B.layout == b_layout
@@ -1160,6 +1313,7 @@ def default_fproc(
         op = copy.deepcopy(op)
         # set output major
         op.C.layout = c_layout
+        op.D.layout = c_layout
         # set epilogue
         op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
         op.element_epilogue = acc_type
@@ -1167,16 +1321,21 @@ def default_fproc(
             op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
                 permute_layout
             ]
-        # set C alignment
+        # set C and D alignment
         alignments = alignment.get_alignments(dtype)
         for i in alignments:
             op = copy.deepcopy(op)
             op.C.alignment = i
+            op.D.alignment = i
             ret.append(op)
     return ret
 
 
-def make_fproc(func_attrs, layout):
+def make_fproc(
+    func_attrs,
+    layout,
+    include_cutlass_3x_ops=False,
+):
     """
     This function sets a callback for processing the epilogue of the kernel
     associated with func_attrs.
@@ -1193,7 +1352,10 @@ def fproc(op):
             epilogue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = extract_config(fproc)
+    func_attrs["op_instance"] = extract_config(
+        f_proc_op=fproc,
+        include_cutlass_3x_ops=include_cutlass_3x_ops,
+    )
 
 
 def function_filter(cfg, func_attrs, ab_alignment):
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index bc702add6..c35772d12 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -350,6 +350,7 @@ def gemm_bias_broadcast_instance(
     binary_op2,
     unary_op2,
     elem_type,
+    cutlass_3x=False,
 ):
     """
     adjust gemm instance with respect to input_accessors, layout and epilogue ops
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index 9ea72e04d..c81b00f5c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -71,6 +71,30 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,              // ElementA const* ptr_A
+    {input_a_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,              // ElementB const* ptr_B
+    {input_b_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementC const* ptr_C
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
@@ -98,9 +122,33 @@
 )
 
 
+PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_output_type}}*)(c_ptr),                          // ElementC const* ptr_C
+        {N, cute::Int<1>{}, cute::Int<0>{}},                     // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc(func_attrs, RCR)
+    common.make_fproc(func_attrs, RCR, include_cutlass_3x_ops=True)
 
 
 def common_gen_profiler(
@@ -110,6 +158,7 @@ def common_gen_profiler(
     dim_info_dict,
     src_template,
     problem_args_template,
+    problem_args_template_cutlass_3x=None,
     bias_ptr_arg=None,
     extra_code="",
 ):
@@ -117,13 +166,14 @@ def common_gen_profiler(
         stride_dim="*b_dim0"
     )
     return common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        src_template,
-        problem_args_template,
-        ARGS_PARSER_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=src_template,
+        problem_args_template=problem_args_template,
+        problem_args_template_cutlass_3x=problem_args_template_cutlass_3x,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
         support_split_k=True,
         output_addr_calculator=output_addr_calculator,
         bias_ptr_arg=bias_ptr_arg,
@@ -134,12 +184,13 @@ def common_gen_profiler(
 @registry.reg("cuda.gemm_rcr.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        PROFILER_PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common.SRC_TEMPLATE,
+        problem_args_template=PROFILER_PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -196,19 +247,25 @@ def gen_function(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
     )
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
-        func_attrs,
-        common.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         support_split_k=True,
         input_addr_calculator=input_addr_calculator,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
-            stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
+            stride_dim="N",
+            output_accessor=func_attrs["output_accessors"][0],
         ),
     )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
index b8854694a..c06445e74 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/group_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py
@@ -766,7 +766,12 @@ def update_alignments_in_group_gemm_instance(
     return "\n".join(instance_lines)
 
 
-def group_gemm_instance(op_def: str, func_attrs: Dict[str, Any], for_profiler: bool):
+def group_gemm_instance(
+    op_def: str,
+    func_attrs: Dict[str, Any],
+    for_profiler: bool,
+    cutlass_3x: bool = False,
+):
     # TODO: This is a dirty thing need to add an extra emitter to clean this up
     op_def = update_alignments_in_group_gemm_instance(op_def, func_attrs, for_profiler)
     tmp = op_def.replace("DefaultGemmUniversal", "DefaultGemmGrouped")
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 4f3858254..992281a6a 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -509,6 +509,11 @@ def gen_profiler(
             output_shape = self._attrs["output_accessors"][0].original_shapes
             self._extract_epilogue_alignment(output_shape, dynamic_profiling_strategy)
 
+        if not self._attrs["op_instance"]:
+            raise RuntimeError(
+                f"No GEMM op instances were generated for {self._attrs['op']}."
+            )
+
         filter_func = registry.get(func_key)
         # run compile-time filter
         new_op_instance = OrderedDict(
@@ -523,6 +528,12 @@ def gen_profiler(
         )
         self._attrs["op_instance"] = new_op_instance
 
+        if not self._attrs["op_instance"]:
+            raise RuntimeError(
+                f"No GEMM op instances are left after filtering for {self._attrs['op']}. "
+                "This is probably due to incompatible alignment requirements."
+            )
+
         build_profiler = self._should_build_profiler(workloads, new_op_instance)
         if build_profiler:
             # generate profiler
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index 542e9e50e..68352582a 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -348,6 +349,68 @@ def test_gemm_bfloat16_bf16(self):
             [2, 34, 48], [1, 3, 5], 256, 16, "dynamic3_bfloat16", dtype="bfloat16"
         )
 
+    def test_rcr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcr(
+                    ms=[1, 1024],
+                    k=252,
+                    n=512,
+                    test_name="dynamic_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcr(
+                ms=[1, 1024],
+                k=256,
+                n=512,
+                test_name="dynamic_force_sm90",
+                dtype="float16",
+            )
+
+            self._test_rcr_dynamic_n(
+                ms=[16, 1 * 29, 64],
+                k=256,
+                ns=[100000, 300000],
+                test_name="einsum_dynamic_n_force_sm90",
+                dtype="float16",
+            )
+            self._test_3d_2d_rcr(
+                m0s=[1, 99, 1024],
+                m1s=[1, 2],
+                k=128,
+                n=8,
+                test_name="dynamic3_force_sm90",
+                dtype="float16",
+            )
+            self._test_h_rcr(
+                ait_dtype="float16",
+                test_name="float16_force_sm90",
+            )
+
+            self._test_rcr(
+                ms=[1024],
+                k=256,
+                n=512,
+                test_name="static_float_forse_sm90",
+                dtype="float32",
+            )
+            self._test_rcr(
+                ms=[1024],
+                k=256,
+                n=512,
+                test_name="static_bfloat16_forse_sm90",
+                dtype="bfloat16",
+            )
+
 
 filter_test_cases_by_test_env(GEMMTestCase)
 

From 747852a26a7d9f1aba0692133711931818931f06 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 27 Apr 2023 09:46:37 -0700
Subject: [PATCH 454/638] Add SM90 CUTLASS 3.x kernels to gemm_rrr (#621)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/621

ATT

Reviewed By: chenyang78

Differential Revision: D45271616

fbshipit-source-id: d623744f8cf4d72a09d475f29e7f6f366cfc1747
---
 .../backend/cuda/gemm_universal/gemm_rrr.py   | 66 ++++++++++++++-----
 tests/unittest/ops/test_gemm.py               | 51 ++++++++++++++
 2 files changed, 101 insertions(+), 16 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index bd7cb247e..c9bad8c46 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -43,6 +43,7 @@
 """
 )
 
+
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
@@ -69,6 +70,30 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {cute::Int<1>{}, N, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_output_type}}*)(c_ptr),                          // ElementC const* ptr_C
+        {N, cute::Int<1>{}, cute::Int<0>{}},                     // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rrr.config")
 def gemm_rrr_config(func_attrs, dtype="float16"):
     def fproc(op):
@@ -83,7 +108,10 @@ def fproc(op):
             epilogue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc)
+    func_attrs["op_instance"] = common.extract_config(
+        f_proc_op=fproc,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rrr.gen_profiler")
@@ -92,13 +120,14 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         stride_dim="N"
     )
     return common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        PROBLEM_ARGS_TEMPLATE,
-        ARGS_PARSER_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common.SRC_TEMPLATE,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
         support_split_k=True,
         output_addr_calculator=output_addr_calculator,
     )
@@ -124,15 +153,20 @@ def gen_function(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
     )
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
-        func_attrs,
-        common.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         support_split_k=True,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="*b_dim1", output_accessor=func_attrs["output_accessors"][0]
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index 68352582a..5402dd934 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -411,6 +411,57 @@ def test_rcr_sm90(self) -> None:
                 dtype="bfloat16",
             )
 
+    def test_rrr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rrr(
+                    ms=[1, 99, 1024, 2048],
+                    k=252,
+                    n=16,
+                    test_name="dynamic_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rrr(
+                ms=[1, 99, 1024, 2048],
+                k=256,
+                n=16,
+                test_name="dynamic_force_sm90",
+                dtype="float16",
+            )
+
+            self._test_3d_2d_rrr(
+                m0s=[2, 34, 48],
+                m1s=[1, 3, 5],
+                k=256,
+                n=16,
+                test_name="dynamic3_force_sm90",
+                dtype="float16",
+            )
+
+            self._test_rrr(
+                ms=[256],
+                k=128,
+                n=32,
+                test_name="static_float_force_sm90",
+                dtype="float32",
+            )
+            self._test_rrr(
+                ms=[256],
+                k=128,
+                n=32,
+                test_name="static_bfloat16_force_sm90",
+                dtype="bfloat16",
+            )
+
 
 filter_test_cases_by_test_env(GEMMTestCase)
 

From 1d9d2edb58cd281d9f8b454b24e9ad23ae629ba9 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 27 Apr 2023 10:46:58 -0700
Subject: [PATCH 455/638] Add SM90 CUTLASS 3.x kernels to gemm_rcr_bias (#620)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/620

ATT

Reviewed By: chenyang78

Differential Revision: D45272960

fbshipit-source-id: 08a7127974a45272ecc93d2accbdd97263894f2d
---
 .../cuda/gemm_universal/common_bias.py        |  8 ++
 .../cuda/gemm_universal/gemm_rcr_bias.py      | 85 +++++++++++++++----
 tests/unittest/ops/test_gemm.py               |  4 +-
 tests/unittest/ops/test_gemm_bias.py          | 42 +++++++++
 4 files changed, 122 insertions(+), 17 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
index bae48543f..a464d50d0 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py
@@ -36,6 +36,7 @@
 #include <vector>
 #include <iostream>
 #include <cuda_bf16.h>
+
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/util/host_tensor.h"
@@ -44,6 +45,13 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
 using bfloat16 = nv_bfloat16;
 
 {{extra_code}}
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index b78fed2d7..8d9d25cea 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -23,6 +23,7 @@
 
 from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.cuda.gemm_universal import common, common_bias, gemm_rcr
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
@@ -54,6 +55,30 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,              // ElementA const* ptr_A
+    {input_a_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,              // ElementB const* ptr_B
+    {input_b_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
@@ -81,20 +106,45 @@
 )
 
 
+PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return gemm_rcr.gemm_rcr_config(func_attrs, dtype)
+    common.make_fproc(func_attrs, RCR, include_cutlass_3x_ops=True)
 
 
 @registry.reg("cuda.gemm_rcr_bias.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return gemm_rcr.common_gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common_bias.SRC_TEMPLATE,
-        PROFILER_PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args_template=PROFILER_PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
@@ -120,15 +170,20 @@ def gen_function(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
     )
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
-        func_attrs,
-        common_bias.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         support_split_k=True,
         input_addr_calculator=input_addr_calculator,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
diff --git a/tests/unittest/ops/test_gemm.py b/tests/unittest/ops/test_gemm.py
index 5402dd934..d9585c249 100644
--- a/tests/unittest/ops/test_gemm.py
+++ b/tests/unittest/ops/test_gemm.py
@@ -364,7 +364,7 @@ def test_rcr_sm90(self) -> None:
                     ms=[1, 1024],
                     k=252,
                     n=512,
-                    test_name="dynamic_force_sm90",
+                    test_name="wrong_alignment_force_sm90",
                     dtype="float16",
                 )
 
@@ -426,7 +426,7 @@ def test_rrr_sm90(self) -> None:
                     ms=[1, 99, 1024, 2048],
                     k=252,
                     n=16,
-                    test_name="dynamic_force_sm90",
+                    test_name="wrong_alignment_force_sm90",
                     dtype="float16",
                 )
 
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index 3d969f490..b5cc9a217 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -106,6 +107,47 @@ def test_rcr_bfloat16_bf16(self):
             dtype=dtype,
         )
 
+    def test_rcr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcr(
+                    Ms=[128],
+                    N=32,
+                    K=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcr(
+                Ms=[128],
+                N=32,
+                K=32,
+                test_name="static_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rcr(
+                Ms=[128],
+                N=32,
+                K=32,
+                test_name="static_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rcr(
+                Ms=[128],
+                N=32,
+                K=32,
+                test_name="static_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
 
 filter_test_cases_by_test_env(GEMMBiasTestCase)
 

From 948714330f14e0003152547d043a862a956c9150 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Thu, 27 Apr 2023 10:53:31 -0700
Subject: [PATCH 456/638] Small fixes to b2b bmm kernels (#564)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/564

Small fixes to b2b bmm kernels.

Reviewed By: frank-wei, aakhundov

Differential Revision: D44451023

fbshipit-source-id: 2644f7c6f7172db6c13a3bfb6e0a35f1c18386f7
---
 .../compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py    | 8 ++++----
 python/aitemplate/testing/benchmark_trt.py                | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
index b1b2b957b..50d7be6d1 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/grouped_fmha_style_b2b_bmm.py
@@ -112,22 +112,22 @@ def _infer_shapes(self):
                 max_seq_length,
                 max_seq_length,
             ]
-            bias_max_shape = shape_utils.get_broadcast_max_shape(
+            broadcastable, _ = shape_utils.get_broadcast_max_shape(
                 bias_shape, bias_expected_shape
             )
             if len(bias_shape) != 4:
                 raise RuntimeError(
                     f"Expected bias rank 4. Current bias rank: {len(bias)}."
                 )
-            if not bias_max_shape[0]:
+            if not broadcastable:
                 raise RuntimeError(
                     f"bias shape is not compatible with Q K! "
                     f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
                     f"bias shapes: {bias_shape=}, {bias_expected_shape=}."
                 )
-            if bias_shape[-1] != max_seq_length:
+            if bias_shape[-1] != bias_expected_shape[-1]:
                 raise RuntimeError(
-                    f"Bias last dim is not broadcastable! Expected shape: {max_seq_length}, current bias shape: {bias_shape}"
+                    f"Bias last dim is not broadcastable! Expected shape: {bias_expected_shape[-1]}, current bias shape: {bias_shape}"
                 )
             # See comments below.
             if not isinstance(q_shape[0].jagged_dims()[0].min_value(), IntImm):
diff --git a/python/aitemplate/testing/benchmark_trt.py b/python/aitemplate/testing/benchmark_trt.py
index 2b7222ba0..ebd22e841 100644
--- a/python/aitemplate/testing/benchmark_trt.py
+++ b/python/aitemplate/testing/benchmark_trt.py
@@ -26,6 +26,7 @@ def make_trt_module(
     max_batch_size=256,
     max_workspace_size=2 << 31,
     dtype="float16",
+    dynamic_batch=False,
 ):
     if dtype == "float16":
         lower_precision = LowerPrecision.FP16
@@ -45,7 +46,7 @@ def make_trt_module(
         timing_cache_prefix=True,
         save_timing_cache=True,
         explicit_batch_dimension=True,
-        dynamic_batch=False,
+        dynamic_batch=dynamic_batch,
     )
 
 
From 1d5a94253b0e27ab2fedf09dd7f8e43a92dec796 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Thu, 27 Apr 2023 18:24:24 -0700
Subject: [PATCH 457/638] Fix AIT topk converter (#631)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/631

AIT topk was return only the indice tensor. Adding the value tensor now to make it match torch.topk behavior.

Reviewed By: terrychenism

Differential Revision: D45334595

fbshipit-source-id: 9d6bd1370928d64963e19371f6b488b94e533a91
---
 .../modeling/proposal_generator/rpn.py        |  2 +-
 fx2ait/fx2ait/converters/ait_converters.py    | 13 +-----
 .../fx2ait/test/converters/test_ait_topk.py   |  3 +-
 .../backend/common/tensor/topk_common.py      | 44 +++++++++++++------
 python/aitemplate/compiler/ops/tensor/topk.py | 10 +++--
 tests/unittest/ops/test_batch_gather.py       |  2 +-
 tests/unittest/ops/test_nms.py                |  2 +-
 tests/unittest/ops/test_topk.py               | 26 ++++++++---
 8 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/examples/02_detectron2/modeling/proposal_generator/rpn.py b/examples/02_detectron2/modeling/proposal_generator/rpn.py
index ce7a0f2bc..545105936 100644
--- a/examples/02_detectron2/modeling/proposal_generator/rpn.py
+++ b/examples/02_detectron2/modeling/proposal_generator/rpn.py
@@ -123,7 +123,7 @@ def forward(self, features):
         for rois, logit in zip(pred_rois, pred_logits):
             rois = ops.reshape()(rois, [N, -1, 4])
             if self.topk > 0 and rois.shape()[1].value() > self.topk:
-                score_inds = ops.topk(k=self.topk)(ops.reshape()(logit, [N, -1]))
+                _, score_inds = ops.topk(k=self.topk)(ops.reshape()(logit, [N, -1]))
                 boxes_topk = ops.batch_gather()(rois, score_inds)
                 scores_topk = ops.batch_gather()(
                     ops.reshape()(logit, [N, -1, 1]), score_inds
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 9058b38da..66aaff6c5 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -743,17 +743,8 @@ def acc_ops_topk(
     if sorted is not None:
         logger.warning("Ignoring the value of 'sorted': %s", sorted)
 
-    result_indices = topk(k=k)(input_val)
-    # current AIT implementation only returns indices. to match the torch topk return types, create dummy values
-    #
-    # TODO remove the hard coded dtype below, once we know whether AIT will support fp32 (thus providing an option of
-    # fp16 or fp32 for values)
-    return (
-        AITTensor(
-            shape=result_indices.shape(), dtype="float16", name=f"{name}_result_values"
-        ),
-        result_indices,
-    )
+    result = topk(k=k)(input_val)
+    return result
 
 
 @ait_converter(acc_ops.tuple_construct)
diff --git a/fx2ait/fx2ait/test/converters/test_ait_topk.py b/fx2ait/fx2ait/test/converters/test_ait_topk.py
index 0da25aac0..be04e214b 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_topk.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_topk.py
@@ -32,8 +32,7 @@ class TestTopkConverter(AITTestCase):
     def test_simple(self, input: List[int], k: int) -> None:
         class TestModule(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                values, indices = torch.topk(x, k)
-                return indices
+                return torch.topk(x, k)
 
         model = TestModule().cuda()
         inputs = [
diff --git a/python/aitemplate/backend/common/tensor/topk_common.py b/python/aitemplate/backend/common/tensor/topk_common.py
index f546795d1..e6d89d714 100644
--- a/python/aitemplate/backend/common/tensor/topk_common.py
+++ b/python/aitemplate/backend/common/tensor/topk_common.py
@@ -37,7 +37,7 @@
 
 {{func_signature}}
 {
-    topk_launcher<{{dtype}}>(stream, elem_cnt, instance_size, instance_num, top_k, input, workspace, output);
+    topk_launcher<{{dtype}}>(stream, elem_cnt, instance_size, instance_num, top_k, input, workspace, output_index, output_value);
 }
     """
 )
@@ -74,7 +74,8 @@
 
 FUNC_SIGNATURE = jinja2.Template(
     """
-void {{func_name}}(int64_t* output,
+void {{func_name}}(int64_t* output_index,
+                   void* output_value,
                    const void* input,
                    const {{index_type}} elem_cnt,
                    const {{index_type}} instance_size,
@@ -94,7 +95,9 @@
 FUNC_CALL_TEMPLATE = jinja2.Template(
     """
 {{indent}}{{func_name}}(
-{{indent}}   {{output}}, {{input}},
+{{indent}}   {{output_index}},
+{{indent}}   {{output_value}},
+{{indent}}   {{input}},
 {{indent}}    {{elem_cnt}},
 {{indent}}    {{instance_size}},
 {{indent}}    {{instance_num}},
@@ -508,7 +511,8 @@ class TmpBufferManager final {
     const int64_t heap_size,
     const int64_t init_index,
     const T init_value,
-    int64_t* out_ptr) {
+    int64_t* out_index_ptr,
+    T* out_value_ptr) {
   extern __shared__ char smem[];
   auto* shared_entries = reinterpret_cast<Entry<T>*>(smem);
 
@@ -539,7 +543,8 @@ class TmpBufferManager final {
 
   // Write top_k elements in sorted array to output
   for (int64_t i = threadIdx.x; i < k; i += blockDim.x) {
-    (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex();
+    (out_index_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex();
+    (out_value_ptr + blockIdx.x * k)[i] = shared_entries[i].GetValue();
   }
 }
 // ALIGNPTR
@@ -566,7 +571,8 @@ class TmpBufferManager final {
     const int top_k,
     const void* input,
     void* workspace,
-    void* output) {
+    void* output_index,
+    void* output_value) {
   const int32_t k = std::min(top_k, instance_size);
 
   if (top_k < 100) {
@@ -593,7 +599,8 @@ class TmpBufferManager final {
             heap_size,
             std::numeric_limits<int64_t>::max(),
             NumericTraits<T>::min(),
-            (int64_t*)output);
+            (int64_t*)output_index,
+            (T*)output_value);
 
   } else {
     const uintptr_t ALIGNMENT = 32;
@@ -621,7 +628,7 @@ class TmpBufferManager final {
         stream);
 
     {{prefix}}Memcpy2DAsync(
-        (int64_t*)output,
+        (int64_t*)output_index,
         k * sizeof(int64_t),
         buf_manager.SortedIndicesPtr(),
         instance_size * sizeof(int64_t),
@@ -629,6 +636,16 @@ class TmpBufferManager final {
         instance_num,
         {{prefix}}MemcpyDefault,
         stream);
+
+    {{prefix}}Memcpy2DAsync(
+        (T*)output_value,
+        k * sizeof(T),
+        buf_manager.SortedInPtr(),
+        instance_size * sizeof(T),
+        k * sizeof(T),
+        instance_num,
+        {{prefix}}MemcpyDefault,
+        stream);
   }
 }
     """
@@ -706,12 +723,12 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
     str
         Rendered function call.
     """
-    output_name = ""
-    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["outputs"]) == 2
     assert len(func_attrs["inputs"]) == 1
 
-    output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
-        name=func_attrs["outputs"][0]._attrs["name"]
+    output_value_name = func_attrs["outputs"][0]._attrs["name"]
+    output_index_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render(
+        name=func_attrs["outputs"][1]._attrs["name"]
     )
     input_name = func_attrs["inputs"][0]._attrs["name"]
 
@@ -726,7 +743,8 @@ def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent="  ") ->
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
-        output=output_name,
+        output_index=output_index_name,
+        output_value=output_value_name,
         input=input_name,
         elem_cnt=elem_cnt,
         instance_size=instance_size,
diff --git a/python/aitemplate/compiler/ops/tensor/topk.py b/python/aitemplate/compiler/ops/tensor/topk.py
index 6a3cdfbe7..fb058751a 100644
--- a/python/aitemplate/compiler/ops/tensor/topk.py
+++ b/python/aitemplate/compiler/ops/tensor/topk.py
@@ -59,8 +59,8 @@ class topk(Operator):
         .. code-block:: python
 
             X = Tensor(shape=[2, 800], name="X", is_input=True)
-            Y = ops.topk(k=300)(X)
-            y_shape = [d._attrs["values"][0] for d in Y.shape()]
+            value, indice = ops.topk(k=300)(X)
+            y_shape = [d._attrs["values"][0] for d in indice.shape()]
             print(y_shape)
 
             Outs:
@@ -87,8 +87,10 @@ def __call__(self, x: Tensor) -> Tensor:
         self._set_depth()
         output_shape = self._infer_shapes(x)
         self._extract_exec_path(x)
-        output = Tensor(output_shape, src_ops={self}, dtype="int64")
-        self._attrs["outputs"] = [output]
+        output_index = Tensor(output_shape, src_ops={self}, dtype="int64")
+        output_value = Tensor(output_shape, src_ops={self}, dtype=x._attrs["dtype"])
+        output = (output_value, output_index)
+        self._attrs["outputs"] = [output_value, output_index]
         return output
 
     def _get_op_attributes(self):
diff --git a/tests/unittest/ops/test_batch_gather.py b/tests/unittest/ops/test_batch_gather.py
index 4c210af1a..a62484130 100644
--- a/tests/unittest/ops/test_batch_gather.py
+++ b/tests/unittest/ops/test_batch_gather.py
@@ -149,7 +149,7 @@ def _test_batch_gather_topk(
             name="scores",
             is_input=True,
         )
-        X3 = ops.topk(k=topK)(X2)
+        _, X3 = ops.topk(k=topK)(X2)
         X4 = ops.batch_gather()(X1, X3)
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
diff --git a/tests/unittest/ops/test_nms.py b/tests/unittest/ops/test_nms.py
index 430967af3..e6cdd6b4b 100644
--- a/tests/unittest/ops/test_nms.py
+++ b/tests/unittest/ops/test_nms.py
@@ -205,7 +205,7 @@ def model():
                 name="scores",
                 is_input=True,
             )
-            score_inds = ops.topk(k=topK)(X_scores)
+            _, score_inds = ops.topk(k=topK)(X_scores)
             bboxes = ops.batch_gather()(X_boxes, score_inds)
             OP = ops.batched_nms(iou_threshold=iou, keep_n=N)
             if copy_op:
diff --git a/tests/unittest/ops/test_topk.py b/tests/unittest/ops/test_topk.py
index 3a3353d02..cb62e7e6b 100644
--- a/tests/unittest/ops/test_topk.py
+++ b/tests/unittest/ops/test_topk.py
@@ -56,23 +56,37 @@ def _test_topk(
             name="X",
             is_input=True,
         )
+        X5 = Tensor(
+            shape=shape,
+            dtype=dtype,
+            name="Y",
+            is_input=True,
+        )
         OP = ops.topk(k=topK)
         if copy_op:
             OP = ops.topk(**OP._get_op_attributes())
-        X4 = OP(X1)
+        X4, X5 = OP(X1)
+        X4._attrs["is_output"] = True
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
+        X5._attrs["is_output"] = True
+        X5._attrs["is_output"] = True
+        X5._attrs["name"] = "output2"
 
         target = detect_target()
-        module = compile_model(X4, target, "./tmp", f"{test_name}_{self.test_count}")
+        module = compile_model(
+            (X4, X5), target, "./tmp", f"{test_name}_{self.test_count}"
+        )
 
         scores = self._create_tensors(shape, dtype)
         (values, y_pt) = torch.topk(scores, k=topK, dim=dim)
-
+        torch_dtype = string_to_torch_dtype(dtype)
         x = scores.reshape(shape).contiguous()
-        y = torch.empty(o_shape).cuda().to(torch.int64)
-        module.run_with_tensors([x], [y])
-        torch.testing.assert_close(y_pt, y, atol=0, rtol=0)
+        y2 = torch.empty(o_shape).cuda().to(torch.int64)
+        y = torch.empty(o_shape).cuda().to(torch_dtype)
+        module.run_with_tensors([x], [y, y2])
+        torch.testing.assert_close(values, y, atol=0, rtol=0)
+        torch.testing.assert_close(y_pt, y2, atol=0, rtol=0)
         self.test_count += 1
 
     def test_topk_heap(self):

From 7ec28ff6c0ac69b6ed5df52ff58957978eb4d0aa Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Fri, 28 Apr 2023 03:53:57 -0700
Subject: [PATCH 458/638] Support fp32 accumulation (#632)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/632

Previous implementation only supports fp16 accumulation. Update it to support fp32 accumulation as well.

Done: fixed numeric accuracy issues.
TODO: Sync the change into cutlass repo. (Not part of this diff, and not neccessary to work with the fixed customized version )

Reviewed By: ipiszy, aakhundov

Differential Revision: D43865503

fbshipit-source-id: a7fd5d454c1346dda5453201114d3d9c53ca8210
---
 .../kernel/default_b2b_batched_gemm.h         |   4 +-
 .../threadblock/b2b_mma_multistage.h          |   8 +-
 .../threadblock/b2b_mma_pipelined.h           |   3 +
 .../threadblock/custom_epilogue_tensor_op.h   | 858 ++++++++++++++++++
 .../threadblock/default_b2b_mma.h             |  23 +-
 .../default_gmem_to_accum_loader_tensor_op.h  |   7 +-
 .../threadblock/gmem_to_accum_loader.h        |  14 +-
 tests/unittest/ops/test_b2b_bmm.py            |  19 +-
 8 files changed, 913 insertions(+), 23 deletions(-)
 create mode 100644 static/include/kernels/classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h

diff --git a/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
index 6edb43260..7a4a3cc12 100644
--- a/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
+++ b/static/include/kernels/classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
@@ -196,7 +196,7 @@ struct DefaultB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   /// Define the threadblock-scoped matrix multiply-accumulate
   using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
       ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ElementC, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
       ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
       InstructionShape, Stages, Operator, CausalMaskAfterGemm0, EpilogueOutputOp0>::ThreadblockB2bMma;
 
@@ -205,7 +205,7 @@ struct DefaultB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
 
   /// Define the epilogue
   using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+      typename cutlass::epilogue::threadblock::classic_b2b_bmm::DefaultEpilogueTensorOp<
           ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
           EpilogueOutputOp1::kCount>::Epilogue;
 
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
index bbea76d44..931b10c70 100644
--- a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_multistage.h
@@ -103,6 +103,8 @@ template <
     typename SmemIteratorB1_,
     /// Cache operation for operand B
     cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of output matrix
+    typename ElementOutput_,
     /// Data type of accumulator matrix
     typename ElementC_,
     /// Data type of accumulator matrix
@@ -151,7 +153,9 @@ class B2bMmaMultistage :
 
   using SmemIteratorB1 = SmemIteratorB1_;
 
-  ///< Data type of accumulator matrix
+  ///< Data type of output matrix
+  using ElementOutput = ElementOutput_;
+ ///< Data type of accumulator matrix
   using ElementC = ElementC_;
   ///< Layout of accumulator matrix
   using LayoutC = LayoutC_;
@@ -670,7 +674,7 @@ class B2bMmaMultistage :
 
     /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
     FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
-    typename FragmentIteratorA1::OutputOp noop_output_op_0({}); // Is noop LinearCombination (see default_b2b_mma.h)
+    typename FragmentIteratorA1::OutputOp noop_output_op_0({});
     TriuMmaTensorOpFragmentIterator<FragmentIteratorA1, Shape0::kM> triu_warp_tile_iterator_A1_;
 
     //
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
index d8ffe67ad..c91309c75 100644
--- a/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
@@ -92,6 +92,8 @@ template <
   /// Iterates over tiles of B operand in shared memory
   /// (concept: WriteableTileIterator | RandomAccessTileIterator)
   typename SmemIteratorB1_,
+  /// Data type of output matrix
+  typename ElementOutput_,
   /// Data type of accumulator matrix
   typename ElementC_,
   /// Data type of accumulator matrix
@@ -148,6 +150,7 @@ class B2bMmaPipelined :
   using SmemIteratorB1 = SmemIteratorB1_;
 
 
+  using ElementOutput = ElementOutput_;       ///< Data type of output matrix
   using ElementC = ElementC_;       ///< Data type of accumulator matrix
   using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
 
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h b/static/include/kernels/classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h
new file mode 100644
index 000000000..958cc8843
--- /dev/null
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h
@@ -0,0 +1,858 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+/***************************************************************************************************
+* Customized version of Cutlass 3.1 cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+* removed problematic specialization of DefaultIteratorsTensorOp for fp16 -> fp32 accumulation
+* which had numeric issues due to the usage of SharedLoadIteratorMixed.
+* Introduces the cutlass::epilogue::threadblock::classic_b2b_bmm namespace, which is a customized
+* variant of the cutlass::epilogue::threadblock namespace.
+*
+**************************************************************************************************/
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_relu0.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+namespace classic_b2b_bmm {
+////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  typename ElementOutput,
+  typename ElementAccumulator,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    ElementAccumulator,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    ElementAccumulator
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= float x 4
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int32_t <= int32_t
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= int32_t
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+
+/*
+/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t,
+  float,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+
+};
+*/
+
+/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t,
+  int32_t,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  typename ElementOutput,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  ElementOutput,
+  int32_t,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  static_assert(platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
+                platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
+                platform::is_same<ElementOutput, int8_t>::value ||
+                platform::is_same<ElementOutput, uint8_t>::value,
+                "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
+
+   static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+                "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e4m3_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e4m3_t,
+  float,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e4m3_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+              "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e5m2_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e5m2_t,
+  float,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e5m2_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+              "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpStridedDgrad {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  int Rank,
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpAffineRankN {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
+    OutputTileThreadMap,
+    ElementOutput,
+    Rank
+  >;
+
+  // Map to the row major iterator since the iterator selection for affineN is the same.
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Defines sensible defaults for epilogues for TensorOps which uses
+/// intereleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedEpilogueTensorOp {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          LayoutC>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps which uses
+/// intereleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedConvEpilogue {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedConvThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          // can reuse the gemm version here to do element selection
+          layout::ColumnMajorInterleaved<InterleavedK>>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+} // namespace classic_b2b_bmm
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h b/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
index ad915009c..abcc65025 100644
--- a/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/default_b2b_mma.h
@@ -82,6 +82,8 @@ template <
     int kAlignmentB,
     /// Layout type for B1 matrix operand
     typename LayoutB1_,
+    /// Element type for C matrix
+    typename ElementC_,
     /// Element type for internal accumulation
     typename ElementAccumulator_,
     /// Layout type for C and D matrix operands
@@ -132,6 +134,8 @@ template <
     int kAlignmentB,
     /// Layout type for B1 matrix operand
     typename LayoutB1,
+    /// Element type forAC matrix operand
+    typename ElementC,
     /// Element type for internal accumulation
     typename ElementAccumulator,
     /// Tag indicating architecture to tune for
@@ -153,7 +157,7 @@ template <
     /// Epilogue output operator
     typename EpilogueOutputOp>
 struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, LayoutB1, ElementAccumulator, layout::RowMajor,
+                  kAlignmentB, LayoutB1, ElementC, ElementAccumulator, layout::RowMajor,
                   arch::OpClassTensorOp, ArchTag,
                   ThreadblockShape0, ThreadblockShape1,
                   WarpShape0, WarpShape1,
@@ -218,7 +222,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
       typename MmaCore1::Shape, FragmentIteratorA1,
       IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
       IteratorB1, typename MmaCore1::SmemIteratorB,
-      ElementAccumulator, layout::RowMajor,
+      ElementC, ElementAccumulator, layout::RowMajor,
       EpilogueOutputOp,
       typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
 
@@ -241,6 +245,8 @@ template <
     int kAlignmentB,
     /// Layout type for B1 matrix operand
     typename LayoutB1,
+    /// Element type for output
+    typename ElementC,
     /// Element type for internal accumulation
     typename ElementAccumulator,
     /// Tag indicating architecture to tune for
@@ -264,7 +270,7 @@ template <
     /// Epilogue output operator
     typename EpilogueOutputOp>
 struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, LayoutB1, ElementAccumulator, layout::RowMajor,
+                  kAlignmentB, LayoutB1, ElementC, ElementAccumulator, layout::RowMajor,
                   arch::OpClassTensorOp, ArchTag,
                   ThreadblockShape0, ThreadblockShape1,
                   WarpShape0, WarpShape1,
@@ -311,11 +317,12 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   // FragmentIteratorA1 should just load A1 fragments from the intermediate
   // accumulator tile without modification, so LinearCombination is used to
   // apply a no-op to the accumulator tile.
-  using LinearCombinationOutputOp = epilogue::thread::LinearCombination<
-    typename EpilogueOutputOp::ElementOutput,
+  using LinearCombinationOutputOp = cutlass::epilogue::thread::LinearCombination<
+    ElementC,
     EpilogueOutputOp::kCount,
-    typename EpilogueOutputOp::ElementOutput,
-    typename EpilogueOutputOp::ElementCompute
+    ElementAccumulator,
+    ElementC,
+    cutlass::epilogue::thread::ScaleType::Nothing
   >;
   using FragmentIteratorA1 =
       cutlass::gemm::warp::MmaTensorOpFragmentIterator<
@@ -358,7 +365,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
       typename MmaCore1::Shape, FragmentIteratorA1,
       IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
       IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
-      ElementAccumulator, layout::RowMajor,
+      ElementC, ElementAccumulator, layout::RowMajor,
       EpilogueOutputOp,
       typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages,
       CausalMaskAfterGemm0, typename MmaCore0::WarpShape>;
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h b/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
index 6379eb435..c40e6d43f 100644
--- a/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
@@ -86,6 +86,7 @@
 #include "classic_b2b_bmm/threadblock/gmem_to_accum_loader.h"
 #include "classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h"
 #include "classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h"
+#include "classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -154,9 +155,9 @@ struct DefaultGmemToAccumLoaderTensorOp {
                                         LayoutC> >::type;
 
   /// Support several implementations depending on structure of epilogue
-  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+  using DefaultIterators = classic_b2b_bmm::detail::DefaultIteratorsTensorOp<
+    ElementOutput,
     ElementOutput,
-    ElementAccumulator,
     kElementsPerAccess,
     Shape,
     typename WarpMmaTensorOp::Shape,
@@ -167,7 +168,7 @@ struct DefaultGmemToAccumLoaderTensorOp {
   using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
   using SharedLoadIterator = typename cutlass::epilogue::threadblock::GmemToAccumLoaderSharedLoadIterator<
     typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
+    ElementOutput
   >;
 
   /// Hard-coded padding elements added
diff --git a/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
index f2403f43b..87d413344 100644
--- a/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
+++ b/static/include/kernels/classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
@@ -175,7 +175,7 @@ class GmemToAccumLoader :
     typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
 
   /// Vector type used by the shared output iterator
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+  using AccumulatorAccessType = Array<ElementAccumulator, OutputTileIterator::kElementsPerAccess>;
 
   static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
 
@@ -290,7 +290,7 @@ class GmemToAccumLoader :
       {
 
         typename AccumulatorFragmentIterator::Fragment accum_fragment;
-        typename AccumulatorFragmentIterator::Fragment source_accum_fragment;
+        typename OutputTileIterator::Fragment source_accum_fragment;
         typename AccumulatorFragmentIterator::Fragment output_accum_fragment;
 
         // Load from shared memory to "unaligned" accumulator fragment.
@@ -327,11 +327,11 @@ class GmemToAccumLoader :
     typename AccumulatorFragmentIterator::Fragment &output_fragment,
     OutputOp const &output_op,                    ///< Output operator
     typename AccumulatorFragmentIterator::Fragment const &accum_fragment,
-    typename AccumulatorFragmentIterator::Fragment const &source_fragment)
+    typename OutputTileIterator::Fragment const &source_fragment)
   {
 
-    OutputAccessType *output_frag_ptr =
-      reinterpret_cast<OutputAccessType *>(&output_fragment);
+    AccumulatorAccessType *output_frag_ptr =
+      reinterpret_cast<AccumulatorAccessType *>(&output_fragment);
 
     AccumulatorAccessType const *compute_frag_ptr =
       reinterpret_cast<AccumulatorAccessType const *>(&accum_fragment);
@@ -341,12 +341,12 @@ class GmemToAccumLoader :
 
     int const kOutputOpIterations =
       AccumulatorFragmentIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
+    NumericArrayConverter<typename AccumulatorAccessType::Element, typename OutputAccessType::Element, OutputOp::kCount, OutputOp::kRound> converter;
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < kOutputOpIterations; ++i)
     {
       // Call the output operator
-      output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
+      output_frag_ptr[i] = converter(output_op(compute_frag_ptr[i], source_frag_ptr[i]));
     }
   }
 
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index 1d0e0bcd3..b1bcebb00 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -60,6 +60,7 @@ def _test_classic_b2b_bmm(
         copy_op=True,
         atol=1e-2,
         rtol=1e-2,
+        use_fp16_acc=True,
     ):
         # Initialize AIT classic_b2b_bmm operator.
         if isinstance(batch_sizes, int):
@@ -107,7 +108,7 @@ def _test_classic_b2b_bmm(
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
 
-        target = detect_target(use_fp16_acc=True)
+        target = detect_target(use_fp16_acc=use_fp16_acc)
         module = compile_model(Y, target, "./tmp", test_name)
 
         # Run tests.
@@ -140,6 +141,22 @@ def _test_classic_b2b_bmm(
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_b2b_bmm_fp16_fp32acc(self):
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_basic_fp32acc",
+            dtype="float16",
+            batch_sizes=1,
+            use_fp16_acc=False,
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_fp16_sigmoid_fp32acc",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+            use_fp16_acc=False,
+        )
+
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_b2b_bmm_fp16(self):
         self._test_classic_b2b_bmm(

From e6f12ebc0f04d98ae29ae7e2adb7dfe99425a4e3 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 28 Apr 2023 09:33:39 -0700
Subject: [PATCH 459/638] bias support (#633)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/633

Reviewed By: ipiszy

Differential Revision: D45345940

fbshipit-source-id: dfd4165f00f6dc7f8d77017376047693fb3d665d
---
 fx2ait/fx2ait/tools/common_fx2ait.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 31356befc..f1b89e2f9 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -133,7 +133,7 @@ def run_test(
         mod.to(torch_dtype)
         inputs = [
             inp.to(torch_dtype).contiguous()
-            if inp.dtype is not torch.bool
+            if inp.dtype not in (torch.bool, torch.int64)
             else inp.contiguous()
             for inp in inputs
         ]

From 5661170641c86e3b7c7d656cac61d9a1d8878a06 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Fri, 28 Apr 2023 13:24:37 -0700
Subject: [PATCH 460/638] Add PatchEmbed FE (#629)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/629

Add PatchEmbed FE module

Reviewed By: terrychenism

Differential Revision: D45319936

fbshipit-source-id: b41cca65337d8142d123ff2a0832e0b0936adc65
---
 python/aitemplate/frontend/nn/conv3d.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/frontend/nn/conv3d.py b/python/aitemplate/frontend/nn/conv3d.py
index 214f75018..ed271d0a4 100644
--- a/python/aitemplate/frontend/nn/conv3d.py
+++ b/python/aitemplate/frontend/nn/conv3d.py
@@ -16,6 +16,7 @@
 conv3d Module.
 """
 from aitemplate.compiler.ops import conv3d, conv3d_bias, depthwise_conv3d
+from aitemplate.compiler.ops.padding.ndhwc3to8 import ndhwc3to8
 from aitemplate.frontend.nn.module import Module
 from aitemplate.frontend.nn.parameter import Parameter
 
@@ -94,21 +95,26 @@ def __init__(
         bias=False,
     ):
         super().__init__()
+        self.has_bias = bias
 
         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size, kernel_size)
         self.weight = Parameter(
-            shape=[out_channels, *kernel_size, in_channels // groups], dtype=dtype
+            shape=[out_channels, *kernel_size, in_channels // groups],
+            dtype=dtype,
         )
-        if groups != 1 and bias:
-            self.bias = Parameter(shape=[out_channels], dtype=dtype)
+        if self.has_bias:
+            self.bias = Parameter(shape=[out_channels], dtype=dtype, name="bias")
 
         if groups == 1:
-            if bias:
+            if self.has_bias:
                 self.op = conv3d_bias(
                     stride=stride, pad=padding, dilate=dilation, group=groups
                 )
-            self.op = conv3d(stride=stride, pad=padding, dilate=dilation, group=groups)
+            else:
+                self.op = conv3d(
+                    stride=stride, pad=padding, dilate=dilation, group=groups
+                )
         else:
             self.op = depthwise_conv3d(
                 stride=stride, pad=padding, dilate=dilation, group=groups, bias=bias
@@ -118,4 +124,10 @@ def forward(self, *args):
         """Applies Conv3d on the input tensor."""
         assert len(args) == 1
         x = args[0]
-        return self.op(x, self.weight.tensor())
+
+        if self.has_bias:
+            x = ndhwc3to8()(x)
+            weight = ndhwc3to8()(self.weight.tensor())
+            return self.op(x, weight, self.bias.tensor())
+        else:
+            return self.op(x, self.weight.tensor())

From 6ad335187006781d01d86ab5acdf2636dad6a8b8 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 28 Apr 2023 14:53:36 -0700
Subject: [PATCH 461/638] Refactor test_bmm(_add) with
 filter_test_cases_by_test_env (#635)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/635

Almost all unit tests from `test_bmm` and `test_bmm_add` are running on *both* V100 and A100 hosts. This is inefficient, as only a small fraction of the tests must run on A100. In this diff, the tests are refactored to rely on `filter_test_cases_by_test_env` instead of `filter_test_cases_by_params`, which leads to a more frugal use of A100 hosts. See the test plan for the before / after numbers.

Reviewed By: hl475

Differential Revision: D45405007

fbshipit-source-id: 98ce1c39d09b057586f759f22b9eedd6bcd4dbd5
---
 tests/unittest/ops/test_bmm.py     | 193 ++++++++++++++++++++---------
 tests/unittest/ops/test_bmm_add.py | 106 +++++++++++++---
 2 files changed, 221 insertions(+), 78 deletions(-)

diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index 1073a590f..a4b3a5b7d 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -21,21 +21,12 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
 from aitemplate.utils import shape_utils
 
-from parameterized import parameterized
-
-
-_TEST_PARAMS = {
-    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-}
-
 
 class BMMTestCase(unittest.TestCase):
     def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
@@ -68,15 +59,15 @@ def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
 
     def test_rcr(self):
         self._test_rcr([1024], [128], N=512, K=256, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_rcr([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
-            self._test_rcr([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
-            self._test_rcr(
-                [1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm"
-            )
-            self._test_rcr([0], [128], N=512, K=256, test_name="zero_batch")
-            self._test_rcr([1], [128], N=512, K=0, test_name="zero_k")
-            self._test_rcr([1], [128], N=0, K=8, test_name="zero_n")
+        self._test_rcr([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
+        self._test_rcr([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
+        self._test_rcr([1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm")
+        self._test_rcr([0], [128], N=512, K=256, test_name="zero_batch")
+        self._test_rcr([1], [128], N=512, K=0, test_name="zero_k")
+        self._test_rcr([1], [128], N=0, K=8, test_name="zero_n")
+
+    def test_rcr_rocm(self):
+        self._test_rcr([1024], [128], N=512, K=256, test_name="static")
 
     def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):
         target = detect_target()
@@ -107,10 +98,12 @@ def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):
 
     def test_crr(self):
         self._test_crr([1024], [128], M=256, N=512, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_crr([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
-            self._test_crr([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
-            self._test_crr([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
+        self._test_crr([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
+        self._test_crr([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
+        self._test_crr([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
+
+    def test_crr_rocm(self):
+        self._test_crr([1024], [128], M=256, N=512, test_name="static")
 
     def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):
         target = detect_target()
@@ -138,10 +131,12 @@ def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):
 
     def test_rrr(self):
         self._test_rrr([87], [23], K=256, N=512, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_rrr([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
-            self._test_rrr([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
-            self._test_rrr([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+        self._test_rrr([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
+        self._test_rrr([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
+        self._test_rrr([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+
+    def test_rrr_rocm(self):
+        self._test_rrr([87], [23], K=256, N=512, test_name="static")
 
     def _test_ccr(self, bs, M, N, K, test_name, dtype="float16"):
         target = detect_target()
@@ -166,8 +161,10 @@ def _test_ccr(self, bs, M, N, K, test_name, dtype="float16"):
 
     def test_ccr(self):
         self._test_ccr([77], M=256, N=64, K=128, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+        self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+
+    def test_ccr_rocm(self):
+        self._test_ccr([77], M=256, N=64, K=128, test_name="static")
 
     def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):
         target = detect_target()
@@ -200,15 +197,15 @@ def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):
 
     def test_rcc(self):
         self._test_rcc([1024], [128], N=512, K=256, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_rcc([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
-            self._test_rcc([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
-            self._test_rcc(
-                [1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm"
-            )
-            self._test_rcc([0], [128], N=512, K=256, test_name="zero_batch")
-            self._test_rcc([1], [128], N=512, K=0, test_name="zero_k")
-            self._test_rcc([1], [128], N=0, K=8, test_name="zero_n")
+        self._test_rcc([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
+        self._test_rcc([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
+        self._test_rcc([1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm")
+        self._test_rcc([0], [128], N=512, K=256, test_name="zero_batch")
+        self._test_rcc([1], [128], N=512, K=0, test_name="zero_k")
+        self._test_rcc([1], [128], N=0, K=8, test_name="zero_n")
+
+    def test_rcc_rocm(self):
+        self._test_rcc([1024], [128], N=512, K=256, test_name="static")
 
     def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):
         target = detect_target()
@@ -240,10 +237,12 @@ def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):
 
     def test_crc(self):
         self._test_crc([1024], [128], M=256, N=512, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_crc([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
-            self._test_crc([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
-            self._test_crc([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
+        self._test_crc([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
+        self._test_crc([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
+        self._test_crc([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
+
+    def test_crc_rocm(self):
+        self._test_crc([1024], [128], M=256, N=512, test_name="static")
 
     def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):
         target = detect_target()
@@ -272,10 +271,12 @@ def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):
 
     def test_rrc(self):
         self._test_rrc([87], [23], K=256, N=512, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_rrc([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
-            self._test_rrc([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
-            self._test_rrc([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+        self._test_rrc([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
+        self._test_rrc([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
+        self._test_rrc([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
+
+    def test_rrc_rocm(self):
+        self._test_rrc([87], [23], K=256, N=512, test_name="static")
 
     def _test_ccc(self, bs, M, N, K, test_name, dtype="float16"):
         target = detect_target()
@@ -302,12 +303,37 @@ def _test_ccc(self, bs, M, N, K, test_name, dtype="float16"):
 
     def test_ccc(self):
         self._test_ccc([77], M=256, N=64, K=128, test_name="static")
-        if detect_target().name() == "cuda":
-            self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+        self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
+
+    def test_ccc_rocm(self):
+        self._test_ccc([77], M=256, N=64, K=128, test_name="static")
+
+    def test_bmm_0_fp32_sm80(self, dtype="float32"):
+        self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rcr(
+            [1, 5, 77, 128],
+            [32],
+            N=16,
+            K=64,
+            test_name=f"dynamic_b_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crr(
+            [1, 2, 5],
+            [3, 6, 8],
+            M=24,
+            N=64,
+            test_name=f"dynamic_bk_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rrr(
+            [8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
+        )
+        self._test_ccr(
+            [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
+        )
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_bmm_0_dtype(self, dtype):
+    def test_bmm_0_bf16(self, dtype="bfloat16"):
         self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcr(
             [1, 5, 77, 128],
@@ -332,9 +358,32 @@ def test_bmm_0_dtype(self, dtype):
             [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_bmm_1_dtype(self, dtype):
+    def test_bmm_1_fp32_sm80(self, dtype="float32"):
+        self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rcc(
+            [1, 5, 77, 128],
+            [32],
+            N=16,
+            K=64,
+            test_name=f"dynamic_b_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crc(
+            [1, 2, 5],
+            [3, 6, 8],
+            M=24,
+            N=64,
+            test_name=f"dynamic_bk_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rrc(
+            [8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
+        )
+        self._test_ccc(
+            [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
+        )
+
+    def test_bmm_1_bf16(self, dtype="bfloat16"):
         self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
         self._test_rcc(
             [1, 5, 77, 128],
@@ -727,8 +776,7 @@ def test_ccc(self):
         self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
         self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_broadcast_0_dtype(self, dtype):
+    def test_bmm_broadcast_0_fp32_sm80(self, dtype="float32"):
         self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
         self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
@@ -738,8 +786,27 @@ def test_bmm_broadcast_0_dtype(self, dtype):
         self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
         self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_broadcast_1_dtype(self, dtype):
+    def test_bmm_broadcast_0_bf16(self, dtype="bfloat16"):
+        self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crr([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrr([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrr([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+
+    def test_bmm_broadcast_1_fp32_sm80(self, dtype="float32"):
+        self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_crc([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrc([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
+        self._test_rrc([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccc([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
+        self._test_ccc([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)
+
+    def test_bmm_broadcast_1_bf16(self, dtype="bfloat16"):
         self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
         self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
         self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
@@ -772,7 +839,7 @@ def test_rcr_fail(self, dtype="float16"):
         try:
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             raise AssertionError(
-                "Shouldn't be able to run be imcompatible tensor shape!"
+                "Shouldn't be able to run be incompatible tensor shape!"
             )
         except RuntimeError:
             pass
@@ -800,7 +867,7 @@ def test_rrr_fail(self, dtype="float16"):
         try:
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             raise AssertionError(
-                "Shouldn't be able to run be imcompatible tensor shape!"
+                "Shouldn't be able to run be incompatible tensor shape!"
             )
         except RuntimeError:
             pass
@@ -828,11 +895,15 @@ def test_rcc_fail(self, dtype="float16"):
         try:
             module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
             raise AssertionError(
-                "Shouldn't be able to run be imcompatible tensor shape!"
+                "Shouldn't be able to run be incompatible tensor shape!"
             )
         except RuntimeError:
             pass
 
 
+filter_test_cases_by_test_env(BMMTestCase)
+filter_test_cases_by_test_env(BMMBroadcastTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index 10f531a04..f396abb64 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -20,19 +20,11 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
 
-from parameterized import parameterized
-
-_TEST_PARAMS = {
-    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-}
-
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMAddTestCase(unittest.TestCase):
@@ -333,8 +325,17 @@ def test_rrc(self):
     def test_crc(self):
         self._test_crc(B=32, M=256, K=256, N=512)
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_0_dtype(self, dtype):
+    def test_bmm_add_0_fp32_sm80(self, dtype="float32"):
+        self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_ccr(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+        )
+        self._test_crr(B=8, M=32, K=16, N=64, dtype=dtype)
+        self._test_rcr(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
+        )
+
+    def test_bmm_add_0_bf16(self, dtype="bfloat16"):
         self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccr(
             B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
@@ -344,8 +345,17 @@ def test_bmm_add_0_dtype(self, dtype):
             B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
         )
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_1_dtype(self, dtype):
+    def test_bmm_add_1_fp32_sm80(self, dtype="float32"):
+        self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_ccc(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+        )
+        self._test_crc(B=8, M=32, K=16, N=64, dtype=dtype)
+        self._test_rcc(
+            B=8, M=32, N=64, K=16, test_name=f"bmm_rcc_add_{dtype}", dtype=dtype
+        )
+
+    def test_bmm_add_1_bf16(self, dtype="bfloat16"):
         self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
         self._test_ccc(
             B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
@@ -735,8 +745,37 @@ def test_ccc(self):
             test_name="broadcastable_bias3d",
         )
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_broadcast_0_dtype(self, dtype):
+    def test_bmm_add_broadcast_0_fp32_sm80(self, dtype="float32"):
+        self._test_crr(
+            [1, 8, 16],
+            [2, 8, 32],
+            bias_shape=[16, 32],
+            test_name=f"broadcastable_bias2d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rcr(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rrr(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 32],
+            test_name=f"broadcastable_bias1d_2_{dtype}",
+            dtype=dtype,
+        )
+        self._test_ccr(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 16, 32],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
+
+    def test_bmm_add_broadcast_0_bf16(self, dtype="bfloat16"):
         self._test_crr(
             [1, 8, 16],
             [2, 8, 32],
@@ -766,8 +805,37 @@ def test_bmm_add_broadcast_0_dtype(self, dtype):
             dtype=dtype,
         )
 
-    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
-    def test_bmm_add_broadcast_1_dtype(self, dtype):
+    def test_bmm_add_broadcast_1_fp32_sm80(self, dtype="float32"):
+        self._test_crc(
+            [1, 8, 16],
+            [2, 8, 32],
+            bias_shape=[32, 16],
+            test_name=f"broadcastable_bias2d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rcc(
+            [1, 16, 8],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
+        self._test_rrc(
+            [1, 16, 8],
+            [2, 8, 32],
+            bias_shape=[1, 16],
+            test_name=f"broadcastable_bias1d_2_{dtype}",
+            dtype=dtype,
+        )
+        self._test_ccc(
+            [1, 8, 16],
+            [2, 32, 8],
+            bias_shape=[1, 32, 16],
+            test_name=f"broadcastable_bias3d_{dtype}",
+            dtype=dtype,
+        )
+
+    def test_bmm_add_broadcast_1_bf16(self, dtype="bfloat16"):
         self._test_crc(
             [1, 8, 16],
             [2, 8, 32],
@@ -798,5 +866,9 @@ def test_bmm_add_broadcast_1_dtype(self, dtype):
         )
 
 
+filter_test_cases_by_test_env(BMMAddTestCase)
+filter_test_cases_by_test_env(BMMBroadcastTestCase)
+
+
 if __name__ == "__main__":
     unittest.main()

From bace6d8043746df619d02c20654556530ecb4cc8 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Fri, 28 Apr 2023 23:10:49 -0700
Subject: [PATCH 462/638] Update infer_shape for batch_gather (#624)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/624

update infer_shape for batch_gather with symbolic_shape.

Reviewed By: terrychenism

Differential Revision: D45251814

fbshipit-source-id: 4ffea13bc38f5c6afb81e9ffbaeec5504ab77aa5
---
 .../compiler/ops/tensor/batch_gather.py       | 44 ++++++-------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/batch_gather.py b/python/aitemplate/compiler/ops/tensor/batch_gather.py
index c522eb9c2..f6586affc 100644
--- a/python/aitemplate/compiler/ops/tensor/batch_gather.py
+++ b/python/aitemplate/compiler/ops/tensor/batch_gather.py
@@ -24,7 +24,6 @@
 from aitemplate import backend
 from aitemplate.backend import registry
 from aitemplate.compiler.base import IntVar, Operator, Tensor
-from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221,W0102,W0223
 
@@ -53,41 +52,24 @@ def __init__(self) -> None:
         self._attrs["has_profiler"] = False
         self.exec_key_template = EXEC_KEY_TEMPLATE
 
-    def _infer_shape(self, x: List[int], indices: List[int]):
-        rank = len(indices)
-        for r in range(1, rank - 1):
-            assert x[r] == indices[r]
-        output = list(x)
-        output[rank - 1] = indices[-1]
-        return output
-
     def _infer_shapes(self, x: Tensor, indices: Tensor) -> List[IntVar]:
         """Infers shapes for batch_gather."""
 
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
+        rank = len(indices._attrs["shape"])
+
+        # TODO: remove this when we're sure we support non-static batch_gather
+        x_shape_values = [var._attrs["values"][0] for var in x._attrs["shape"]]
         indices_shape = [var._attrs["values"][0] for var in indices._attrs["shape"]]
-        # run infershape for each
-        y_shapes = []
-        for x_shape in x_shapes:
-            y_shape = self._infer_shape(x_shape, indices_shape)
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
-        output_shape = []
-        for idx in range(len(y_shapes[0])):
-            output_shape.append(
-                shape_utils.gen_int_var(unique([d[idx] for d in y_shapes]))
-            )
-        if len(indices.shape()) > 1:
-            # Generally output has the same batch dimension as input
-            output_shape[0] = x.shape()[0]
-        else:
+        for r in range(1, rank - 1):
+            assert x_shape_values[r] == indices_shape[r]
+
+        out_shapes = x._attrs["shape"][:]
+        if rank <= 1:
             # Special case: gather happens along batch dimension
-            output_shape[0] = indices.shape()[0]
-        return output_shape
+            out_shapes[0] = indices.shape()[0]
+        out_shapes[rank - 1] = indices._attrs["shape"][-1]
+
+        return out_shapes
 
     def __call__(self, x: Tensor, indices: Tensor) -> Tensor:
         dtype = indices._attrs["dtype"]

From d9ced25d324820029136c625e4f79e28620f01f4 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 30 Apr 2023 04:35:56 -0700
Subject: [PATCH 463/638] Add SM90 CUTLASS 3.x kernels to bmm_xxx and
 bmm_xxx_add (#637)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/637

SM90 CUTLASS 3.x kernels are added to the following `bmm_xxx` ops:

- `bmm_rrr`
- `bmm_rcr`
- `bmm_crr`
- `bmm_ccr`
- `bmm_rrc`
- `bmm_rcc`
- `bmm_crc`
- `bmm_ccc`

and the following `bmm_xxx_add` ops:

- `bmm_rrr_add`
- `bmm_rcr_add`
- `bmm_crr_add`
- `bmm_ccr_add`
- `bmm_rrc_add`
- `bmm_rcc_add`
- `bmm_crc_add`
- `bmm_ccc_add`

The bmm + permute ops, `bmm_rcr_permute` and `bmm_rrr_permute` are not extended in this diff, as they require special treatment of the output layout / strides.

Reviewed By: chenyang78

Differential Revision: D45400720

fbshipit-source-id: 344421fbd62b6f4c070ed806b3cfd69224446689
---
 .../backend/cuda/gemm_universal/bmm_common.py | 134 ++++-
 .../cuda/gemm_universal/bmm_rcr_permute.py    |   1 +
 .../cuda/gemm_universal/bmm_rrr_permute.py    |   1 +
 .../backend/cuda/gemm_universal/bmm_xxx.py    |  23 +-
 .../cuda/gemm_universal/bmm_xxx_add.py        |  16 +-
 .../cuda/gemm_universal/perm021fc_crc.py      |   4 +-
 .../cuda/gemm_universal/perm021fc_crc_bias.py |   4 +-
 tests/unittest/compiler/test_move_view_ops.py |   5 +
 tests/unittest/ops/test_bmm.py                | 361 ++++++++++++
 tests/unittest/ops/test_bmm_add.py            | 515 ++++++++++++++++--
 10 files changed, 1002 insertions(+), 62 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 8eb5a24b9..138f72310 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -161,7 +161,9 @@
 class Bmm_problem_info:
     alpha_value: float = 1
     beta_value: float = 0
-    problem_size: str = "{M, N, K}"
+    problem_dim_0: str = "M"
+    problem_dim_1: str = "N"
+    problem_dim_2: str = "K"
     batch_size: str = "B"
     a_ptr: str = "a_ptr"
     b_ptr: str = "b_ptr"
@@ -175,6 +177,9 @@ class Bmm_problem_info:
     ldb: str = "0"
     ldbias: str = "0"
     ldc: str = "0"
+    a_row_major: bool = True
+    b_row_major: bool = False
+    c_row_major: bool = True
 
 
 def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None):
@@ -195,7 +200,11 @@ def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None):
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kBatched,                                                         // GemmUniversalMode mode
-    {{mm_info.problem_size}},                                                                           // GemmCoord problem_size
+    {
+        static_cast<coord_t>({{mm_info.problem_dim_0}}),
+        static_cast<coord_t>({{mm_info.problem_dim_1}}),
+        static_cast<coord_t>({{mm_info.problem_dim_2}})
+    },                                                                                                  // GemmCoord problem_size
     {{mm_info.batch_size}},                                                                             // int batch_count
     {ElementComputeEpilogue({{mm_info.alpha_value}}), ElementComputeEpilogue({{mm_info.beta_value}})},  // typename EpilogueOutputOp::Params epilogue
     {{mm_info.a_ptr}},                                                                                  // void const * ptr_A
@@ -214,6 +223,49 @@ def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None):
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kBatched,                                 // GemmUniversalMode mode
+    {
+        static_cast<coord_t>({{mm_info.problem_dim_0}}),
+        static_cast<coord_t>({{mm_info.problem_dim_1}}),
+        static_cast<coord_t>({{mm_info.problem_dim_2}}),
+        static_cast<coord_t>({{mm_info.batch_size}})
+    },                                                                          // ProblemShape problem_shape
+    {{mm_info.a_ptr}},                                                          // ElementA const* ptr_A
+{% if mm_info.a_row_major %}
+    { {{mm_info.lda}}, cute::Int<1>{}, {{mm_info.a_batch_stride}} },            // StrideA dA
+{% else %}
+    { cute::Int<1>{}, {{mm_info.lda}}, {{mm_info.a_batch_stride}} },            // StrideA dA
+{% endif %}
+    {{mm_info.b_ptr}},                                                          // ElementB const* ptr_B
+{% if mm_info.b_row_major %}
+    { cute::Int<1>{}, {{mm_info.ldb}}, {{mm_info.b_batch_stride}} },            // StrideB dB
+{% else %}
+    { {{mm_info.ldb}}, cute::Int<1>{}, {{mm_info.b_batch_stride}} },            // StrideB dB
+{% endif %}
+    {
+        {
+            ElementComputeEpilogue({{mm_info.alpha_value}}),
+            ElementComputeEpilogue({{mm_info.beta_value}})
+        },                                                                      // typename ThreadEpilogueOp::Params thread
+        {{mm_info.bias_ptr}},                                                   // ElementC const* ptr_C
+{% if mm_info.c_row_major %}
+        { {{mm_info.ldbias}}, cute::Int<1>{}, {{mm_info.bias_batch_stride}} },  // StrideC dC
+{% else %}
+        { cute::Int<1>{}, {{mm_info.ldbias}}, {{mm_info.bias_batch_stride}} },  // StrideC dC
+{% endif %}
+        {{mm_info.c_ptr}},                                                      // ElementD const* ptr_D
+{% if mm_info.c_row_major %}
+        { {{mm_info.ldc}}, cute::Int<1>{}, {{mm_info.c_batch_stride}} },        // StrideD dD
+{% else %}
+        { cute::Int<1>{}, {{mm_info.ldc}}, {{mm_info.c_batch_stride}} },        // StrideD dD
+{% endif %}
+    },                                                                          // EpilogueArguments epilogue
+"""
+)
+
+
 def reverse_dim_info_mapping(dim_info_dict, source, tensor_idx):
     def _fill(arr, idx, val):
         if len(arr) <= idx:
@@ -357,6 +409,9 @@ def make_function_strided_args(
         ldb="input_b_stride",
         ldbias=f"{default_mm_info.ldbias}",
         ldc="output_stride",
+        a_row_major=default_mm_info.a_row_major,
+        b_row_major=default_mm_info.b_row_major,
+        c_row_major=default_mm_info.c_row_major,
     )
     a_shapes = func_attrs["input_accessors"][0].original_shapes
     b_shapes = func_attrs["input_accessors"][1].original_shapes
@@ -368,7 +423,15 @@ def make_function_strided_args(
     problem_args = PROBLEM_ARGS_TEMPLATE.render(
         mm_info=bmm_problem_info,
     )
-    return (problem_args, input_addr_calculator, output_addr_calculator)
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_problem_info,
+    )
+    return (
+        problem_args,
+        problem_args_cutlass_3x,
+        input_addr_calculator,
+        output_addr_calculator,
+    )
 
 
 def gen_profiler(
@@ -380,9 +443,14 @@ def gen_profiler(
     problem_args,
     args_parser,
     bias_ptr_arg=None,
+    problem_args_cutlass_3x="",
 ):
+    import cutlass_lib
+
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
+    op_instance = common.filter_cutlass_3x_ops(op_instance, func_attrs)
+
     backend_spec = CUDASpec()
     elem_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
@@ -409,6 +477,7 @@ def gen_profiler(
         instance=instance_name_base,
         is_profiler=True,
         problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
     )
     input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
         input_ndims=a_ndims,
@@ -421,11 +490,21 @@ def gen_profiler(
     benchmark_instances = []
     for instance_idx, (op_name, op) in enumerate(op_instance.items()):
         config = common.emit_instance(op, for_profiler=True)
-        config_name = common.extract_config_name(config)
         instance_name = f"{instance_name_base}_{instance_idx}"
         gemm_op = f"gemm_op_{instance_idx}"
-        instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=instance_name, config=config
+        cutlass_3x = op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x
+        instance_template = (
+            common.INSTANCE_TEMPLATE_CUTLASS_3X
+            if cutlass_3x
+            else common.INSTANCE_TEMPLATE
+        )
+        instance = instance_template.render(
+            config_name=common.extract_config_name(
+                config,
+                cutlass_3x=cutlass_3x,
+            ),
+            name=instance_name,
+            config=config,
         )
         benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
@@ -525,14 +604,33 @@ def default_gen_profiler(
         mm_info=default_mm_info,
     )
 
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    # CUTLASS 3.x problem args require explicit I/O pointer types (not void*)
+    default_mm_info.a_ptr = f"({elem_input_type}*)({default_mm_info.a_ptr})"
+    default_mm_info.b_ptr = f"({elem_input_type}*)({default_mm_info.b_ptr})"
+    default_mm_info.bias_ptr = f"({elem_output_type}*)({default_mm_info.bias_ptr})"
+    default_mm_info.c_ptr = f"({elem_output_type}*)({default_mm_info.c_ptr})"
+
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=default_mm_info,
+    )
+
     return gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common.SRC_TEMPLATE,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        args_parser=args_parser,
     )
 
 
@@ -557,12 +655,14 @@ def gen_function(
     dim_info_dict,
     input_addr_calculator="",
     output_addr_calculator="",
+    problem_args_cutlass_3x="",
 ):
     return common.gen_function(
-        func_attrs,
-        common.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
+        func_attrs=func_attrs,
+        src_template=common.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
         input_ndims=len(func_attrs["input_accessors"][0].original_shapes),
         weight_ndims=len(func_attrs["input_accessors"][1].original_shapes),
         output_ndims=len(func_attrs["output_accessors"][0].original_shapes),
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
index 16451f4de..14dee69a3 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py
@@ -114,6 +114,7 @@ def gen_function(
     )
     (
         problem_args,
+        _,  # problem_args_cutlass_3x
         input_addr_calculator,
         output_addr_calculator,
     ) = bmm_common.make_function_strided_args(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
index 30b066b34..31551e49c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py
@@ -114,6 +114,7 @@ def gen_function(
     )
     (
         problem_args,
+        _,  # problem_args_cutlass_3x
         input_addr_calculator,
         output_addr_calculator,
     ) = bmm_common.make_function_strided_args(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
index ca99405cf..8d92571b8 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx.py
@@ -38,6 +38,9 @@ def _get_problem_args(a_layout, b_layout, c_layout):
         "ldb": "K" if b_layout == "c" else "N",
         "ldbias": "M" if c_layout == "c" else "N",
         "ldc": "M" if c_layout == "c" else "N",
+        "a_row_major": a_layout == "r",
+        "b_row_major": b_layout == "r",
+        "c_row_major": c_layout == "r",
     }
 
 
@@ -64,7 +67,10 @@ def fproc(op):
                 epilogue_name=func_attrs["epilogue"],
             )
 
-        func_attrs["op_instance"] = common.extract_config(fproc)
+        func_attrs["op_instance"] = common.extract_config(
+            f_proc_op=fproc,
+            include_cutlass_3x_ops=True,
+        )
 
     return config
 
@@ -105,17 +111,22 @@ def gen_function(
         )
         (
             problem_args,
+            problem_args_cutlass_3x,
             input_addr_calculator,
             output_addr_calculator,
         ) = bmm_common.make_function_strided_args(
-            func_attrs, dim_info_dict, default_mm_info, is_permute=False
+            func_attrs=func_attrs,
+            dim_info_dict=dim_info_dict,
+            default_mm_info=default_mm_info,
+            is_permute=False,
         )
 
         return bmm_common.gen_function(
-            func_attrs,
-            exec_cond_template,
-            problem_args,
-            dim_info_dict,
+            func_attrs=func_attrs,
+            exec_cond_template=exec_cond_template,
+            problem_args=problem_args,
+            problem_args_cutlass_3x=problem_args_cutlass_3x,
+            dim_info_dict=dim_info_dict,
             input_addr_calculator=input_addr_calculator,
             output_addr_calculator=output_addr_calculator,
         )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
index 3f2aaedbd..69f02599e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
@@ -49,16 +49,22 @@ def gen_function(
         )
         (
             problem_args,
+            problem_args_cutlass_3x,
             input_addr_calculator,
             output_addr_calculator,
         ) = bmm_common.make_function_strided_args(
-            func_attrs, dim_info_dict, default_mm_info, is_permute=False
+            func_attrs=func_attrs,
+            dim_info_dict=dim_info_dict,
+            default_mm_info=default_mm_info,
+            is_permute=False,
         )
+
         return bmm_common.gen_function(
-            func_attrs,
-            exec_cond_template,
-            problem_args,
-            dim_info_dict,
+            func_attrs=func_attrs,
+            exec_cond_template=exec_cond_template,
+            problem_args=problem_args,
+            problem_args_cutlass_3x=problem_args_cutlass_3x,
+            dim_info_dict=dim_info_dict,
             input_addr_calculator=input_addr_calculator,
             output_addr_calculator=output_addr_calculator,
         )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
index f760809e6..db645f28f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py
@@ -24,7 +24,9 @@
 
 def _get_problem_info(**kwargs):
     problem_args = {
-        "problem_size": "{N, M, K}",
+        "problem_dim_0": "N",
+        "problem_dim_1": "M",
+        "problem_dim_2": "K",
         "bias_ptr": "c_ptr",
         "a_batch_stride": "0",
         "b_batch_stride": "K * M",
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
index 75abc8b6d..3546cea7c 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py
@@ -30,7 +30,9 @@
 def _get_problem_info(**kwargs):
     problem_args = {
         "beta_value": 1,
-        "problem_size": "{N, M, K}",
+        "problem_dim_0": "N",
+        "problem_dim_1": "M",
+        "problem_dim_2": "K",
         "bias_ptr": "bias_ptr",
         "a_batch_stride": "0",
         "b_batch_stride": "K * M",
diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
index 0d3a473d5..48dda1c3d 100644
--- a/tests/unittest/compiler/test_move_view_ops.py
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -30,6 +30,10 @@
 class MoveViewOpsTestCase(unittest.TestCase):
     BATCH_SIZE = 1024
 
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
     def __init__(self, *args, **kwargs):
         super(MoveViewOpsTestCase, self).__init__(*args, **kwargs)
         self.test_count = 0
@@ -1835,4 +1839,5 @@ def test_non_movable_cat_reshape_cat_2(self):
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_bmm.py b/tests/unittest/ops/test_bmm.py
index a4b3a5b7d..60c22d4d4 100644
--- a/tests/unittest/ops/test_bmm.py
+++ b/tests/unittest/ops/test_bmm.py
@@ -21,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -408,6 +409,366 @@ def test_bmm_1_bf16(self, dtype="bfloat16"):
             [1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
         )
 
+    def test_rrr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rrr(
+                    bs=[2, 5, 7],
+                    ms=[1, 7, 9],
+                    K=60,
+                    N=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rrr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=64,
+                N=32,
+                test_name="dynamic_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rrr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=60,
+                N=28,
+                test_name="dynamic_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rrr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=64,
+                N=32,
+                test_name="dynamic_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rcr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcr(
+                    bs=[2, 5, 7],
+                    ms=[1, 7, 9],
+                    N=60,
+                    K=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=64,
+                K=32,
+                test_name="dynamic_bm_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rcr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=60,
+                K=28,
+                test_name="dynamic_bm_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rcr(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=64,
+                K=32,
+                test_name="dynamic_bm_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_ccr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_ccr(
+                    bs=[1, 5, 11],
+                    M=60,
+                    N=7,
+                    K=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_ccr(
+                bs=[1, 5, 11],
+                M=64,
+                N=7,
+                K=32,
+                test_name="dynamic_b_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_ccr(
+                bs=[1, 5, 11],
+                M=60,
+                N=7,
+                K=28,
+                test_name="dynamic_b_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_ccr(
+                bs=[1, 5, 11],
+                M=64,
+                N=7,
+                K=32,
+                test_name="dynamic_b_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_crr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_crr(
+                    bs=[1, 2, 5],
+                    ks=[3, 6, 8],
+                    M=28,
+                    N=60,
+                    test_name="dynamic_bk_fp16_forse_sm90",
+                    dtype="float16",
+                )
+
+            self._test_crr(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=32,
+                N=64,
+                test_name="dynamic_bk_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_crr(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=28,
+                N=60,
+                test_name="dynamic_bk_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_crr(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=32,
+                N=64,
+                test_name="dynamic_bk_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rrc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rrc(
+                    bs=[2, 5, 7],
+                    ms=[1, 7, 9],
+                    K=60,
+                    N=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rrc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=64,
+                N=32,
+                test_name="dynamic_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rrc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=60,
+                N=28,
+                test_name="dynamic_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rrc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                K=64,
+                N=32,
+                test_name="dynamic_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rcc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcc(
+                    bs=[2, 5, 7],
+                    ms=[1, 7, 9],
+                    N=60,
+                    K=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=64,
+                K=32,
+                test_name="dynamic_bm_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rcc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=60,
+                K=28,
+                test_name="dynamic_bm_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rcc(
+                bs=[2, 5, 7],
+                ms=[1, 7, 9],
+                N=64,
+                K=32,
+                test_name="dynamic_bm_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_ccc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_ccc(
+                    bs=[1, 5, 11],
+                    M=60,
+                    N=7,
+                    K=28,
+                    test_name="wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_ccc(
+                bs=[1, 5, 11],
+                M=64,
+                N=7,
+                K=32,
+                test_name="dynamic_b_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_ccc(
+                bs=[1, 5, 11],
+                M=60,
+                N=7,
+                K=28,
+                test_name="dynamic_b_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_ccc(
+                bs=[1, 5, 11],
+                M=64,
+                N=7,
+                K=32,
+                test_name="dynamic_b_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_crc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_crc(
+                    bs=[1, 2, 5],
+                    ks=[3, 6, 8],
+                    M=28,
+                    N=60,
+                    test_name="dynamic_bk_fp16_forse_sm90",
+                    dtype="float16",
+                )
+
+            self._test_crc(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=32,
+                N=64,
+                test_name="dynamic_bk_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_crc(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=28,
+                N=60,
+                test_name="dynamic_bk_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_crc(
+                bs=[1, 2, 5],
+                ks=[3, 6, 8],
+                M=32,
+                N=64,
+                test_name="dynamic_bk_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):
diff --git a/tests/unittest/ops/test_bmm_add.py b/tests/unittest/ops/test_bmm_add.py
index f396abb64..b46ad142b 100644
--- a/tests/unittest/ops/test_bmm_add.py
+++ b/tests/unittest/ops/test_bmm_add.py
@@ -20,6 +20,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -36,7 +37,7 @@ def __init__(self, *args, **kwargs):
         super(BMMAddTestCase, self).__init__(*args, **kwargs)
         self.test_count = 0
 
-    def _test_rrr(self, B, M, K, N, dtype="float16"):
+    def _test_rrr(self, B, M, K, N, test_name, dtype="float16"):
         target = detect_target()
         X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
@@ -46,9 +47,7 @@ def _test_rrr(self, B, M, K, N, dtype="float16"):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         dll_name = f"test_{self.test_count}.so"
-        module = compile_model(
-            Y, target, "./tmp", f"bmm_rrr_add_{dtype}", dll_name=dll_name
-        )
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         X_pt = get_random_torch_tensor([B, M, K], dtype)
         W_pt = get_random_torch_tensor([B, K, N], dtype)
         D_pt = get_random_torch_tensor([B, M, N], dtype)
@@ -120,7 +119,7 @@ def _test_rcr(self, B, M, N, K, test_name, dtype="float16"):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
-    def _test_crr(self, B, M, K, N, dtype="float16"):
+    def _test_crr(self, B, M, K, N, test_name, dtype="float16"):
         target = detect_target()
         X = Tensor(
             shape=[B, K, M],
@@ -144,7 +143,6 @@ def _test_crr(self, B, M, K, N, dtype="float16"):
         Y = OP(X, W, D)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"bmm_crr_add_{dtype}"
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         X_pt = get_random_torch_tensor([B, K, M], dtype)
@@ -191,7 +189,7 @@ def _test_rcc(self, B, M, K, N, test_name, dtype="float16"):
             self.assertTrue(torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2))
         self.test_count += 1
 
-    def _test_rrc(self, B, M, K, N, dtype="float16"):
+    def _test_rrc(self, B, M, K, N, test_name, dtype="float16"):
         target = detect_target()
         X = Tensor(shape=[B, M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
@@ -201,9 +199,7 @@ def _test_rrc(self, B, M, K, N, dtype="float16"):
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
         dll_name = f"test_{self.test_count}.so"
-        module = compile_model(
-            Y, target, "./tmp", f"bmm_rrc_add_{dtype}", dll_name=dll_name
-        )
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         X_pt = get_random_torch_tensor([B, M, K], dtype)
         W_pt = get_random_torch_tensor([B, K, N], dtype)
         D_pt = get_random_torch_tensor([B, N, M], dtype)
@@ -218,7 +214,7 @@ def _test_rrc(self, B, M, K, N, dtype="float16"):
         self.assertTrue(torch.allclose(Y_pt, y, atol=1e-1, rtol=1e-1))
         self.test_count += 1
 
-    def _test_crc(self, B, M, K, N, dtype="float16"):
+    def _test_crc(self, B, M, K, N, test_name, dtype="float16"):
         target = detect_target()
         X = Tensor(
             shape=[B, K, M],
@@ -242,7 +238,6 @@ def _test_crc(self, B, M, K, N, dtype="float16"):
         Y = OP(X, W, D)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"bmm_crc_add_{dtype}"
         dll_name = f"test_{self.test_count}.so"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         X_pt = get_random_torch_tensor([B, K, M], dtype)
@@ -290,7 +285,7 @@ def _test_ccc(self, B, M, N, K, test_name, dtype="float16"):
         self.test_count += 1
 
     def test_rrr(self):
-        self._test_rrr(B=32, M=256, K=256, N=512)
+        self._test_rrr(B=32, M=256, K=256, N=512, test_name="bmm_rrr_add")
 
     def test_ccr(self):
         self._test_ccr(B=32, M=256, N=256, K=512, test_name="bmm_ccr_add")
@@ -305,7 +300,7 @@ def test_rcr(self):
         self._test_rcr(B=1, M=256, N=256, K=0, test_name="bmm_rcr_zero_k")
 
     def test_crr(self):
-        self._test_crr(B=32, M=256, K=256, N=512)
+        self._test_crr(B=32, M=256, K=256, N=512, test_name="bmm_crr_add")
 
     def test_ccc(self):
         self._test_ccc(B=32, M=256, N=256, K=512, test_name="bmm_ccc_add")
@@ -320,51 +315,507 @@ def test_rcc(self):
         self._test_rcc(B=1, M=256, N=256, K=0, test_name="bmm_rcc_zero_k")
 
     def test_rrc(self):
-        self._test_rrc(B=32, M=256, K=256, N=512)
+        self._test_rrc(B=32, M=256, K=256, N=512, test_name="bmm_rrc_add")
 
     def test_crc(self):
-        self._test_crc(B=32, M=256, K=256, N=512)
+        self._test_crc(B=32, M=256, K=256, N=512, test_name="bmm_crc_add")
 
     def test_bmm_add_0_fp32_sm80(self, dtype="float32"):
-        self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_rrr(
+            B=8,
+            M=32,
+            K=8,
+            N=64,
+            test_name=f"bmm_rrr_add_{dtype}",
+            dtype=dtype,
+        )
         self._test_ccr(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_ccr_add_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crr(
+            B=8,
+            M=32,
+            K=16,
+            N=64,
+            test_name=f"bmm_crr_add_{dtype}",
+            dtype=dtype,
         )
-        self._test_crr(B=8, M=32, K=16, N=64, dtype=dtype)
         self._test_rcr(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_rcr_add_{dtype}",
+            dtype=dtype,
         )
 
     def test_bmm_add_0_bf16(self, dtype="bfloat16"):
-        self._test_rrr(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_rrr(
+            B=8,
+            M=32,
+            K=8,
+            N=64,
+            test_name=f"bmm_rrr_add_{dtype}",
+            dtype=dtype,
+        )
         self._test_ccr(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_ccr_add_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crr(
+            B=8,
+            M=32,
+            K=16,
+            N=64,
+            test_name=f"bmm_crr_add_{dtype}",
+            dtype=dtype,
         )
-        self._test_crr(B=8, M=32, K=16, N=64, dtype=dtype)
         self._test_rcr(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_rcr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_rcr_add_{dtype}",
+            dtype=dtype,
         )
 
     def test_bmm_add_1_fp32_sm80(self, dtype="float32"):
-        self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_rrc(
+            B=8,
+            M=32,
+            K=8,
+            N=64,
+            test_name=f"bmm_rrc_add_{dtype}",
+            dtype=dtype,
+        )
         self._test_ccc(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_ccc_add_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crc(
+            B=8,
+            M=32,
+            K=16,
+            N=64,
+            test_name=f"bmm_crc_add_{dtype}",
+            dtype=dtype,
         )
-        self._test_crc(B=8, M=32, K=16, N=64, dtype=dtype)
         self._test_rcc(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_rcc_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_rcc_add_{dtype}",
+            dtype=dtype,
         )
 
     def test_bmm_add_1_bf16(self, dtype="bfloat16"):
-        self._test_rrc(B=8, M=32, K=8, N=64, dtype=dtype)
+        self._test_rrc(
+            B=8,
+            M=32,
+            K=8,
+            N=64,
+            test_name=f"bmm_rrc_add_{dtype}",
+            dtype=dtype,
+        )
         self._test_ccc(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_ccr_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_ccc_add_{dtype}",
+            dtype=dtype,
+        )
+        self._test_crc(
+            B=8,
+            M=32,
+            K=16,
+            N=64,
+            test_name=f"bmm_crc_add_{dtype}",
+            dtype=dtype,
         )
-        self._test_crc(B=8, M=32, K=16, N=64, dtype=dtype)
         self._test_rcc(
-            B=8, M=32, N=64, K=16, test_name=f"bmm_rcc_add_{dtype}", dtype=dtype
+            B=8,
+            M=32,
+            N=64,
+            K=16,
+            test_name=f"bmm_rcc_add_{dtype}",
+            dtype=dtype,
         )
 
+    def test_rrr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rrr(
+                    B=5,
+                    M=7,
+                    K=60,
+                    N=28,
+                    test_name="bmm_rrr_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rrr(
+                B=5,
+                M=7,
+                K=64,
+                N=32,
+                test_name="bmm_rrr_add_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rrr(
+                B=5,
+                M=7,
+                K=60,
+                N=28,
+                test_name="bmm_rrr_add_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rrr(
+                B=5,
+                M=7,
+                K=64,
+                N=32,
+                test_name="bmm_rrr_add_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rcr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcr(
+                    B=5,
+                    M=7,
+                    N=60,
+                    K=28,
+                    test_name="bmm_rcr_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcr(
+                B=5,
+                M=7,
+                N=64,
+                K=32,
+                test_name="bmm_rcr_add_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rcr(
+                B=5,
+                M=7,
+                N=60,
+                K=28,
+                test_name="bmm_rcr_add_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rcr(
+                B=5,
+                M=7,
+                N=64,
+                K=32,
+                test_name="bmm_rcr_add_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_ccr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_ccr(
+                    B=5,
+                    M=60,
+                    N=7,
+                    K=28,
+                    test_name="bmm_ccr_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_ccr(
+                B=5,
+                M=64,
+                N=7,
+                K=32,
+                test_name="bmm_ccr_add_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_ccr(
+                B=5,
+                M=60,
+                N=7,
+                K=28,
+                test_name="bmm_ccr_add_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_ccr(
+                B=5,
+                M=64,
+                N=7,
+                K=32,
+                test_name="bmm_ccr_add_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_crr_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_crr(
+                    B=5,
+                    K=7,
+                    M=28,
+                    N=60,
+                    test_name="bmm_crr_add_wrong_alignment_forse_sm90",
+                    dtype="float16",
+                )
+
+            self._test_crr(
+                B=5,
+                K=7,
+                M=32,
+                N=64,
+                test_name="bmm_crr_add_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_crr(
+                B=5,
+                K=7,
+                M=28,
+                N=60,
+                test_name="bmm_crr_add_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_crr(
+                B=5,
+                K=7,
+                M=32,
+                N=64,
+                test_name="bmm_crr_add_bk_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rrc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rrc(
+                    B=5,
+                    M=7,
+                    K=60,
+                    N=28,
+                    test_name="bmm_rrc_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rrc(
+                B=5,
+                M=7,
+                K=64,
+                N=32,
+                test_name="bmm_rrc_add_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rrc(
+                B=5,
+                M=7,
+                K=60,
+                N=28,
+                test_name="bmm_rrc_add_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rrc(
+                B=5,
+                M=7,
+                K=64,
+                N=32,
+                test_name="bmm_rrc_add_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_rcc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_rcc(
+                    B=5,
+                    M=7,
+                    N=60,
+                    K=28,
+                    test_name="bmm_rcc_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_rcc(
+                B=5,
+                M=7,
+                N=64,
+                K=32,
+                test_name="bmm_rcc_add_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_rcc(
+                B=5,
+                M=7,
+                N=60,
+                K=28,
+                test_name="bmm_rcc_add_fp32_force_sm90",
+                dtype="float32",
+            )
+            self._test_rcc(
+                B=5,
+                M=7,
+                N=64,
+                K=32,
+                test_name="bmm_rcc_add_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_ccc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_ccc(
+                    B=5,
+                    M=60,
+                    N=7,
+                    K=28,
+                    test_name="bmm_ccc_add_wrong_alignment_force_sm90",
+                    dtype="float16",
+                )
+
+            self._test_ccc(
+                B=5,
+                M=64,
+                N=7,
+                K=32,
+                test_name="bmm_ccc_add_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_ccc(
+                B=5,
+                M=60,
+                N=7,
+                K=28,
+                test_name="bmm_ccc_add_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_ccc(
+                B=5,
+                M=64,
+                N=7,
+                K=32,
+                test_name="bmm_ccc_add_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
+    def test_crc_sm90(self) -> None:
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_crc(
+                    B=5,
+                    K=7,
+                    M=28,
+                    N=60,
+                    test_name="bmm_crc_add_wrong_alignment_forse_sm90",
+                    dtype="float16",
+                )
+
+            self._test_crc(
+                B=5,
+                K=7,
+                M=32,
+                N=64,
+                test_name="bmm_crc_add_fp16_forse_sm90",
+                dtype="float16",
+            )
+            self._test_crc(
+                B=5,
+                K=7,
+                M=28,
+                N=60,
+                test_name="bmm_crc_add_fp32_forse_sm90",
+                dtype="float32",
+            )
+            self._test_crc(
+                B=5,
+                K=7,
+                M=32,
+                N=64,
+                test_name="bmm_crc_add_bk_bf16_forse_sm90",
+                dtype="bfloat16",
+            )
+
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class BMMBroadcastTestCase(unittest.TestCase):

From 32dd05c441a60c1a2e487a1090203f79ab3f0ac1 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Sun, 30 Apr 2023 11:01:47 -0700
Subject: [PATCH 464/638] dynamic_slice infer_shape (#598)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/598

Adopt symbolic shapes for dynamic_slice in AIT.

Reviewed By: chenyang78

Differential Revision: D45175474

fbshipit-source-id: 9db65142a9be0f2e9dd9c64215f0ea919cef2532
---
 .../compiler/ops/tensor/dynamic_slice.py      | 69 +++++++++++--------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
index b7d966ee2..7c832b012 100644
--- a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
+++ b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
@@ -15,12 +15,13 @@
 """
 Dynamic_slice.
 """
-import itertools
 from typing import List, Optional, Union
 
+import sympy
+
 from aitemplate import backend
 from aitemplate.backend import registry
-from aitemplate.compiler.base import IntVar, IntVarTensor, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, Operator, Tensor
 from aitemplate.utils import shape_utils
 
 # pylint: disable=C0103,W0221
@@ -69,15 +70,28 @@ def normalize_start_end_indices(dim_val: int, start: int, end: int) -> List[int]
         start = end if start > end else start
         return [start, end]
 
-    def _infer_shape(
-        self, x_shape: List[int], start_indices: List[int], end_indices: List[int]
-    ) -> List[int]:
-        y_shape = []
-        for dim_val, start, end in zip(x_shape, start_indices, end_indices):
-            # handle negative indices
-            start, end = dynamic_slice.normalize_start_end_indices(dim_val, start, end)
-            y_shape.append(end - start)
-        return y_shape
+    def _infer_dynamic_dim(self, dim: IntVar, start_index: int, end_index: int):
+        values = dim._attrs["values"]
+        new_values = []
+
+        for value in values:
+            start, end = dynamic_slice.normalize_start_end_indices(
+                value, start_index, end_index
+            )
+            new_values.append(end - start)
+        new_values = sorted(set(new_values))
+
+        start_sym = (
+            start_index if start_index >= 0 else dim.symbolic_value() + start_index
+        )
+        end_sym = end_index if end_index >= 0 else dim.symbolic_value() + end_index
+
+        start_sym = sympy.Min(dim.symbolic_value(), sympy.Max(0, start_sym))
+        end_sym = sympy.Min(dim.symbolic_value(), sympy.Max(0, end_sym))
+
+        symbolic_value = sympy.Max(0, end_sym - start_sym)
+
+        return shape_utils.gen_int_var(new_values, symbolic_value=symbolic_value)
 
     def _infer_shapes(
         self,
@@ -86,25 +100,26 @@ def _infer_shapes(
         end_indices: List[Union[IntVar, IntVarTensor, Optional[int]]],
     ) -> List[IntVar]:
         """Infers shape for dynamic_slice."""
+        # TODO: Handle start_indices/end_indices that are not int.
 
         x_shape = x._attrs["shape"]
-        x_shape_values = [var._attrs["values"] for var in x._attrs["shape"]]
-        x_shapes = itertools.product(*x_shape_values)
-        y_shapes = []
-        for x_shape in x_shapes:
-            y_shape = self._infer_shape(x_shape, start_indices, end_indices)
-            y_shapes.append(y_shape)
-
-        def unique(vector):
-            return sorted(set(vector))
-
         output_shape = []
-        for idx in range(len(y_shapes[0])):
-            output_shape.append(
-                x._attrs["shape"][idx]
-                if (start_indices[idx] == 0 and end_indices[idx] == MAX_INT32)
-                else shape_utils.gen_int_var(unique(d[idx] for d in y_shapes))
-            )
+        for dim_val, start, end in zip(x_shape, start_indices, end_indices):
+            if start == 0 and end == MAX_INT32:
+                # Slicing along the whole dim.
+                output_shape.append(dim_val)
+            elif isinstance(dim_val, IntImm):
+                # Slicing a static dimension.
+                start, end = dynamic_slice.normalize_start_end_indices(
+                    dim_val.value(), start, end
+                )
+                output_shape.append(IntImm(end - start))
+            elif start >= 0 and end >= 0:
+                # Fixed size from start and end.
+                output_shape.append(IntImm(end - start))
+            else:
+                output_shape.append(self._infer_dynamic_dim(dim_val, start, end))
+
         return output_shape
 
     def __call__(

From aa7010f69ed9103327ca9a5728cf85248dfb45a9 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Mon, 1 May 2023 10:45:58 -0700
Subject: [PATCH 465/638] Replace hasattr with getattr in
 aitemplate/AITemplate/python/aitemplate/frontend/nn/multiscale_attention.py
 (#641)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/641

The pattern
```
X.Y if hasattr(X, "Y") else Z
```
can be replaced with
```
getattr(X, "Y", Z)
```

The [getattr](https://www.w3schools.com/python/ref_func_getattr.asp) function gives more succinct code than the [hasattr](https://www.w3schools.com/python/ref_func_hasattr.asp) function. Please use it when appropriate.

**This diff is very low risk. Green tests indicate that you can safely Accept & Ship.**

Differential Revision: D44886430

fbshipit-source-id: 6262fc6d6ae8a3a5dafd9373ab74451b7d20932c
---
 python/aitemplate/frontend/nn/multiscale_attention.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 2c484f14c..757477d78 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -428,17 +428,17 @@ def __init__(
         self._attention_pool_q = _AttentionPool(
             self.pool_q,
             has_cls_embed=self.has_cls_embed,
-            norm=self.norm_q if hasattr(self, "norm_q") else None,
+            norm=getattr(self, "norm_q", None),
         )
         self._attention_pool_k = _AttentionPool(
             self.pool_k,
             has_cls_embed=self.has_cls_embed,
-            norm=self.norm_k if hasattr(self, "norm_k") else None,
+            norm=getattr(self, "norm_k", None),
         )
         self._attention_pool_v = _AttentionPool(
             self.pool_v,
             has_cls_embed=self.has_cls_embed,
-            norm=self.norm_v if hasattr(self, "norm_v") else None,
+            norm=getattr(self, "norm_v", None),
         )
 
     def _qkv_proj(

From be49644288c0d8e0c52b553932f13256d924d881 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 1 May 2023 17:28:46 -0700
Subject: [PATCH 466/638] Fix circleci test errors (#639)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/639

ATT, from
https://app.circleci.com/pipelines/github/facebookincubator/AITemplate/2068/workflows/e8948454-1658-4245-92d1-d185c1c4fb69/jobs/3959,
there are OOM errors. I cannot reproduce this error by locally sshing to circleci
instance, and this error only happens on main branch, instead of a forked branch.
But I do see that these tests use more memory than others.

So in this diff, reduce input tensor size and enable DEBUG logging in circleCI.

Reviewed By: chenyang78

Differential Revision: D45436389

fbshipit-source-id: 49107e36f148baa2199579268345f326e4f5fcd8
---
 .circleci/config.yml                             | 1 +
 tests/unittest/compiler/test_split_bmm_fusion.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8ab59e610..2c52bd0ae 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -41,6 +41,7 @@ setup_env: &setup_env
           echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV &&
           echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV &&
           echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV &&
+          echo 'export LOGLEVEL=DEBUG' >> $BASH_ENV &&
           break || sleep 5;
         done
 
diff --git a/tests/unittest/compiler/test_split_bmm_fusion.py b/tests/unittest/compiler/test_split_bmm_fusion.py
index 741c0f5f3..c079bce4e 100644
--- a/tests/unittest/compiler/test_split_bmm_fusion.py
+++ b/tests/unittest/compiler/test_split_bmm_fusion.py
@@ -153,7 +153,7 @@ def test_split_bmm_rcr_fusion_static(self):
         )
         # bmm_rcr, split_dim = 1
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 1024, 512, 512, 256 * 2, 256, 1, "test_split_bmm_rcr"
+            ops.bmm_rcr, 10, 512, 512, 256 * 2, 256, 1, "test_split_bmm_rcr"
         )
 
     def _test_split_bmm_rcr_fusion_dynamic_M(
@@ -238,7 +238,7 @@ def test_split_bmm_rcr_fusion_dynamic_M(self):
         # bmm_rcr
         self._test_split_bmm_rcr_fusion_dynamic_M(
             ops.bmm_rcr,
-            1024,
+            10,
             [128, 256],
             512,
             256 * 2,

From b041e0e6a549bb79f6c06d62c67869854563610e Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 1 May 2023 17:28:46 -0700
Subject: [PATCH 467/638] Fix internal CI issue (#640)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/640

ATT, fix flacky / broken tests.

Reviewed By: chenyang78

Differential Revision: D45436462

fbshipit-source-id: 97d2781696ab11a4b1c2206b4a1f98ad75c085ef
---
 .../compiler/test_split_bmm_fusion.py         | 22 +++++++++----------
 .../compiler/test_transform_odd_alignment.py  |  8 ++++++-
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/tests/unittest/compiler/test_split_bmm_fusion.py b/tests/unittest/compiler/test_split_bmm_fusion.py
index c079bce4e..95db4f8b0 100644
--- a/tests/unittest/compiler/test_split_bmm_fusion.py
+++ b/tests/unittest/compiler/test_split_bmm_fusion.py
@@ -112,48 +112,48 @@ def test_split_bmm_rcr_fusion_static(self):
             5,
             [2, 3],
             2,
-            "test_split_bmm_rcr",
+            "test_split_bmm_rcr_with_padding",
             with_padding=True,
         )
         # bmm_rcr_n1
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr_n1, 1, 160, 1, 32, 8, 2, "test_split_bmm_rcr"
+            ops.bmm_rcr_n1, 1, 160, 1, 32, 8, 2, "test_split_bmm_rcr_n1"
         )
         # bmm_rcr_n1, split_dim = 2
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr_n1, 1, 10000, 1, 5, [2, 3], 2, "test_split_bmm_rcr"
+            ops.bmm_rcr_n1, 1, 10000, 1, 5, [2, 3], 2, "test_split_bmm_rcr_n1_0"
         )
         # bmm_rcr_n1, split_dim = 2
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr_n1, 1, 10000, 1, 5, [3, 2], 2, "test_split_bmm_rcr"
+            ops.bmm_rcr_n1, 1, 10000, 1, 5, [3, 2], 2, "test_split_bmm_rcr_n1_1"
         )
         # bmm_rcr_n1
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr_n1, 1, 10, 1, 32, [16, 8, 8], 2, "test_split_bmm_rcr"
+            ops.bmm_rcr_n1, 1, 10, 1, 32, [16, 8, 8], 2, "test_split_bmm_rcr_n1_2"
         )
         # bmm_rcr_n1, split_dim = 0
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 4, 10000, 1, 32, [2, 2], 0, "test_split_bmm_rcr"
+            ops.bmm_rcr, 4, 10000, 1, 32, [2, 2], 0, "test_split_bmm_rcr_n1_split_d_0"
         )
         # bmm_rcr_n1, split_dim = 1
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 64, 2, 2, 32, 1, 1, "test_split_bmm_rcr"
+            ops.bmm_rcr, 64, 2, 2, 32, 1, 1, "test_split_bmm_rcr_n1_split_d_1"
         )
         # bmm_rcr
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 1024, 128, 512, 256 * 2, 256, 2, "test_split_bmm_rcr"
+            ops.bmm_rcr, 1024, 128, 512, 256 * 2, 256, 2, "test_split_bmm_rcr_0"
         )
         # bmm_rcr
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 1, 10000, 3, 96, [32, 32, 32], 2, "test_split_bmm_rcr"
+            ops.bmm_rcr, 1, 10000, 3, 96, [32, 32, 32], 2, "test_split_bmm_rcr_1"
         )
         # bmm_rcr, split_dim = 0, can only be static
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 1024, 128, 512, 256 * 2, 512, 0, "test_split_bmm_rcr"
+            ops.bmm_rcr, 1024, 128, 512, 256 * 2, 512, 0, "test_split_bmm_rcr_split_d_0"
         )
         # bmm_rcr, split_dim = 1
         self._test_split_bmm_rcr_fusion(
-            ops.bmm_rcr, 10, 512, 512, 256 * 2, 256, 1, "test_split_bmm_rcr"
+            ops.bmm_rcr, 10, 512, 512, 256 * 2, 256, 1, "test_split_bmm_rcr_split_d_1"
         )
 
     def _test_split_bmm_rcr_fusion_dynamic_M(
diff --git a/tests/unittest/compiler/test_transform_odd_alignment.py b/tests/unittest/compiler/test_transform_odd_alignment.py
index 74f87e2ee..11ce33836 100644
--- a/tests/unittest/compiler/test_transform_odd_alignment.py
+++ b/tests/unittest/compiler/test_transform_odd_alignment.py
@@ -31,6 +31,10 @@ def _extract_shape(batch, shape):
 
 
 class TransformOddAlignmentCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
     def _create_permute_bmm_graph(
         self, A_shape, B_shape, bmm_type, const_A=None, const_B=None
     ):
@@ -72,6 +76,7 @@ def _test_permute_bmm_A(
         is_const,
         is_elementwise=False,
         strided_output=True,
+        test_prefix="",
     ):
         M = shape_A[-2] if origin_bmm[-3] == "r" else shape_A[-1]
         N = shape_B[-1] if origin_bmm[-2] == "r" else shape_B[-2]
@@ -107,7 +112,7 @@ def _test_permute_bmm_A(
                 output,
                 target,
                 "./tmp",
-                f"alignment_permute_bmm_A_{b}_{origin_bmm}_to_{target_bmm}_{is_const}",
+                f"{test_prefix}alignment_permute_bmm_A_{b}_{origin_bmm}_to_{target_bmm}_{is_const}",
             )
 
             exist_new_bmm = False
@@ -184,6 +189,7 @@ def test_permute_bmm_A(self):
             "bmm_rrr",
             "bmm_crr",
             is_const=True,
+            test_prefix="2d_broadcast_",
         )
         # non-const input misaligned on K, permute.
         self._test_permute_bmm_A(

From a4b533cca3dfc48b475dc56a6f2ca3e643e9f554 Mon Sep 17 00:00:00 2001
From: max <maksim@cs.fsu.edu>
Date: Tue, 2 May 2023 19:19:47 -0700
Subject: [PATCH 468/638] optionally print python callstack stats for model
 compilation (#642)

Summary:
e.g.

```
2023-04-30 23:41:00,967 DEBUG <aitemplate.compiler.compiler>          1960165 function calls (1638331 primitive calls) in 14.739 seconds

   Ordered by: cumulative time
   List reduced from 1012 to 30 due to restriction <30>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   14.740   14.740 /home/max/AITemplate-OSS/python/aitemplate/compiler/compiler.py:170(compile_model)
        1    0.000    0.000   13.826   13.826 /home/max/AITemplate-OSS/python/aitemplate/backend/builder.py:862(make)
        1    0.000    0.000   13.748   13.748 /home/max/AITemplate-OSS/python/aitemplate/backend/builder.py:146(_run_make_cmds)
        3    0.000    0.000   13.717    4.572 /home/max/miniconda3/envs/ait/lib/python3.9/subprocess.py:1090(communicate)
        1    0.000    0.000   13.715   13.715 /home/max/miniconda3/envs/ait/lib/python3.9/subprocess.py:1926(_communicate)
       12    0.000    0.000   13.715    1.143 /home/max/miniconda3/envs/ait/lib/python3.9/selectors.py:403(select)
       12   13.715    1.143   13.715    1.143 {method 'poll' of 'select.poll' objects}
        1    0.000    0.000    0.480    0.480 /home/max/AITemplate-OSS/python/aitemplate/backend/cuda/target_def.py:149(__enter__)
        1    0.000    0.000    0.479    0.479 /home/max/AITemplate-OSS/python/aitemplate/backend/cuda/utils.py:53(gen_ops)
        1    0.000    0.000    0.478    0.478 /tmp/tmp1d682fp8/cutlass_lib/generator.py:1854(GenerateSM75)
     2989    0.014    0.000    0.420    0.000 /tmp/tmp1d682fp8/cutlass_lib/manifest.py:333(append)
       37    0.001    0.000    0.419    0.011 /home/max/AITemplate-OSS/python/aitemplate/utils/graph_utils.py:94(dump_graph_debug_str_to_file)
        1    0.000    0.000    0.290    0.290 /home/max/AITemplate-OSS/python/aitemplate/compiler/transform/optimize_graph.py:59(optimize_graph)
       37    0.000    0.000    0.240    0.006 /home/max/AITemplate-OSS/python/aitemplate/utils/graph_utils.py:49(sorted_graph_debug_str)
 1877/113    0.004    0.000    0.238    0.002 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:55(pformat)
 1877/113    0.003    0.000    0.238    0.002 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:151(pformat)
14944/113    0.021    0.000    0.237    0.002 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:163(_format)
     6124    0.015    0.000    0.232    0.000 /tmp/tmp1d682fp8/cutlass_lib/conv2d_operation.py:95(configuration_name)
        1    0.000    0.000    0.230    0.230 /tmp/tmp1d682fp8/cutlass_lib/generator.py:1272(GenerateSM75_TensorOp_1688)
    18056    0.069    0.000    0.227    0.000 /tmp/tmp1d682fp8/cutlass_lib/library.py:548(SubstituteTemplate)
27423/4179    0.017    0.000    0.224    0.000 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:430(_repr)
27423/4179    0.010    0.000    0.220    0.000 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:439(format)
59967/4179    0.060    0.000    0.219    0.000 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:529(_safe_repr)
 1865/457    0.005    0.000    0.204    0.000 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:189(_pprint_dict)
51603/7923    0.010    0.000    0.203    0.000 {built-in method builtins.repr}
       16    0.003    0.000    0.198    0.012 /tmp/tmp1d682fp8/cutlass_lib/generator.py:392(CreateConv2dOperator)
 1865/457    0.020    0.000    0.194    0.000 /home/max/miniconda3/envs/ait/lib/python3.9/pprint.py:372(_format_dict_items)
     4595    0.002    0.000    0.177    0.000 /tmp/tmp1d682fp8/cutlass_lib/conv2d_operation.py:126(procedural_name)
       37    0.000    0.000    0.171    0.005 /home/max/AITemplate-OSS/python/aitemplate/utils/graph_utils.py:55(<listcomp>)
    97/37    0.001    0.000    0.171    0.005 /home/max/AITemplate-OSS/python/aitemplate/compiler/base.py:1124(__str__)

```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/642

Reviewed By: ipiszy

Differential Revision: D45475144

Pulled By: chenyang78

fbshipit-source-id: 52f497dc0629ff62e1d9dc4bc9120b2db3a30d11
---
 python/aitemplate/compiler/compiler.py |  3 ++-
 python/aitemplate/utils/misc.py        | 36 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 5ee4f09fd..e42754fec 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -21,7 +21,6 @@
 from typing import Dict, List, Optional, Union
 
 from aitemplate import backend, compiler
-
 from aitemplate.compiler.base import (
     DynamicProfileStrategy,
     IntImm,
@@ -39,6 +38,7 @@
 from aitemplate.compiler.transform.profile import elapsed_dt_sec
 from aitemplate.utils import graph_utils
 from aitemplate.utils.debug_settings import AITDebugSettings
+from aitemplate.utils.misc import callstack_stats
 from aitemplate.utils.serialization.serdes_code import dump_program
 
 # pylint: disable=W0102
@@ -145,6 +145,7 @@ def _mark_isolated_int_vars(sorted_graph: List[Tensor]):
 _DEBUG_SETTINGS = AITDebugSettings()
 
 
+@callstack_stats()
 def compile_model(
     tensor: Union[Tensor, List[Tensor]],
     target: backend.target.Target,
diff --git a/python/aitemplate/utils/misc.py b/python/aitemplate/utils/misc.py
index 52afc582e..5ad5d26fc 100644
--- a/python/aitemplate/utils/misc.py
+++ b/python/aitemplate/utils/misc.py
@@ -59,3 +59,39 @@ def short_str(s, length=8) -> str:
     """
     hash_str = hashlib.sha256(s.encode()).hexdigest()
     return hash_str[0:length]
+
+
+def callstack_stats(enable=False):
+    if enable:
+
+        def decorator(f):
+            import cProfile
+            import io
+            import pstats
+
+            logger = logging.getLogger(__name__)
+
+            def inner_function(*args, **kwargs):
+                pr = cProfile.Profile()
+                pr.enable()
+                result = f(*args, **kwargs)
+                pr.disable()
+                s = io.StringIO()
+                pstats.Stats(pr, stream=s).sort_stats(
+                    pstats.SortKey.CUMULATIVE
+                ).print_stats(30)
+                logger.debug(s.getvalue())
+                return result
+
+            return inner_function
+
+        return decorator
+    else:
+
+        def decorator(f):
+            def inner_function(*args, **kwargs):
+                return f(*args, **kwargs)
+
+            return inner_function
+
+        return decorator

From c43511410f58498094dcc489f78344fc5f87bb05 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Wed, 3 May 2023 00:55:22 -0700
Subject: [PATCH 469/638] Add VisionTransformerBasicHead (#636)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/636

Adds FE module for Softmax

Reviewed By: terrychenism

Differential Revision: D45411032

fbshipit-source-id: db5e7ebdfdbe5094f71481203dcaf2ebfbad97e8
---
 python/aitemplate/frontend/nn/softmax.py | 64 ++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/softmax.py

diff --git a/python/aitemplate/frontend/nn/softmax.py b/python/aitemplate/frontend/nn/softmax.py
new file mode 100644
index 000000000..2a8ff0e5a
--- /dev/null
+++ b/python/aitemplate/frontend/nn/softmax.py
@@ -0,0 +1,64 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+softmax Module.
+"""
+from typing import Optional
+
+from aitemplate.compiler import ops
+from aitemplate.frontend.nn.module import Module
+
+
+class Softmax(Module):
+    r"""Applies the Softmax function to an n-dimensional input Tensor
+    rescaling them so that the elements of the n-dimensional output Tensor
+    lie in the range [0,1] and sum to 1.
+
+    Softmax is defined as:
+
+    .. math::
+        \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
+
+    When the input Tensor is a sparse tensor then the unspecified
+    values are treated as ``-inf``.
+
+    Shape:
+        - Input: :math:`(*)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [0, 1]
+
+    Args:
+        dim (int): A dimension along which Softmax will be computed (so every slice
+            along dim will sum to 1).
+
+    """
+
+    def __init__(
+        self,
+        dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, *args):
+        """Applies Softmax on the input tensor."""
+        assert len(args) == 1
+        x = args[0]
+
+        return ops.softmax(x, self.dim)

From 60df7ce612f45117b7a141640d9ffb51f3f2b6ea Mon Sep 17 00:00:00 2001
From: Henry Hu <hhh@meta.com>
Date: Wed, 3 May 2023 09:36:12 -0700
Subject: [PATCH 470/638] Duplicate LDM from Example to FB Folder (#646)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/646

Duplicate LDM from Example to FB Folder

Reviewed By: mortzur, terrychenism

Differential Revision: D45290751

fbshipit-source-id: 67ae38e6f5f5f4eddefaf681833b667872c6c6e7
---
 python/aitemplate/frontend/nn/ldm/__init__.py |  17 -
 .../aitemplate/frontend/nn/ldm/attention.py   | 105 ---
 python/aitemplate/frontend/nn/ldm/clip.py     | 627 --------------
 .../aitemplate/frontend/nn/ldm/embeddings.py  |  77 --
 python/aitemplate/frontend/nn/ldm/resnet.py   | 238 ------
 .../frontend/nn/ldm/unet_2d_condition.py      | 255 ------
 .../aitemplate/frontend/nn/ldm/unet_blocks.py | 762 ------------------
 python/aitemplate/frontend/nn/ldm/vae.py      | 153 ----
 8 files changed, 2234 deletions(-)
 delete mode 100644 python/aitemplate/frontend/nn/ldm/__init__.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/attention.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/clip.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/embeddings.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/resnet.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/unet_blocks.py
 delete mode 100644 python/aitemplate/frontend/nn/ldm/vae.py

diff --git a/python/aitemplate/frontend/nn/ldm/__init__.py b/python/aitemplate/frontend/nn/ldm/__init__.py
deleted file mode 100644
index b14195e81..000000000
--- a/python/aitemplate/frontend/nn/ldm/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-# flake8: noqa
-
-from aitemplate.frontend.nn.ldm.unet_2d_condition import UNet2DConditionModel
diff --git a/python/aitemplate/frontend/nn/ldm/attention.py b/python/aitemplate/frontend/nn/ldm/attention.py
deleted file mode 100644
index 14993e6d9..000000000
--- a/python/aitemplate/frontend/nn/ldm/attention.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-"""
-Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py.
-"""
-
-from typing import Optional
-
-from aitemplate.compiler.ops import reshape
-
-from aitemplate.frontend import nn, Tensor
-
-
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
-    to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    Uses three q, k, v linear layers to compute attention.
-    Parameters:
-        batch_size (:obj:`int`): The number of examples per batch.
-        height (:obj:`int`): Height of each image example.
-        width (:obj:`int`): Width of each image example.
-        channels (:obj:`int`): The number of channels in the input and output.
-        num_head_channels (:obj:`int`, *optional*):
-            The number of channels in each head. If None, then `num_heads` = 1.
-        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
-        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
-    """
-
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        channels: int,
-        num_head_channels: Optional[int] = None,
-        num_groups: int = 32,
-        rescale_output_factor: float = 1.0,
-        eps: float = 1e-5,
-    ):
-        super().__init__()
-        self.batch_size = batch_size
-        self.height = height
-        self.width = width
-        self.channels = channels
-        self.num_heads = (
-            channels // num_head_channels if num_head_channels is not None else 1
-        )
-        self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
-        self.attention = nn.MultiheadAttention(
-            channels,
-            batch_size,
-            height * width,
-            self.num_heads,
-            qkv_bias=True,
-            has_residual=True,
-            use_mem_eff=True,
-        )
-        self.rescale_output_factor = rescale_output_factor
-
-    def forward(self, hidden_states) -> Tensor:
-        """
-        input hidden_states shape: [batch, height, width, channel]
-        output shape: [batch, height, width, channel]
-        """
-        residual = hidden_states
-
-        # norm
-        hidden_states = self.group_norm(hidden_states)
-
-        hidden_states = reshape()(
-            hidden_states, [self.batch_size, self.height * self.width, self.channels]
-        )
-
-        batch, hw, channel = hidden_states.shape()
-        if (
-            batch.value() != self.batch_size
-            or hw.value() != self.width * self.height
-            or channel.value() != self.channels
-        ):
-            raise RuntimeError(
-                "nchw params do not match! "
-                f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, "
-                f"actual: {batch}, {channel}, {hw}."
-            )
-
-        res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor)
-        res = reshape()(res, [self.batch_size, self.height, self.width, self.channels])
-
-        return res
diff --git a/python/aitemplate/frontend/nn/ldm/clip.py b/python/aitemplate/frontend/nn/ldm/clip.py
deleted file mode 100644
index b00e2782a..000000000
--- a/python/aitemplate/frontend/nn/ldm/clip.py
+++ /dev/null
@@ -1,627 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from inspect import isfunction
-from typing import Optional
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-
-# pylint: disable=W0102
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-class CrossAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        dtype="float16",
-    ):
-        super().__init__()
-        inner_dim = dim_head * heads
-        context_dim = default(context_dim, query_dim)
-
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        self.dim_head = dim_head
-        self.use_cuda = detect_target().name() == "cuda"
-
-        self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype)
-        self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype)
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
-        )
-
-    def forward(self, x, context=None, mask=None, residual=None):
-        nheads = self.heads
-        d = self.dim_head
-
-        layout = "20314" if self.use_cuda else "m2n3"
-
-        bs, seqlen, _ = get_shape(x)
-        q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor()
-        )
-        context = default(context, x)
-
-        seqlen = get_shape(context)[1]
-        k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor()
-        )
-        v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)(
-            ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor()
-        )
-
-        if self.use_cuda:
-            attn_op = ops.mem_eff_attention(causal=False)
-            out = attn_op(
-                (ops.reshape()(q, [bs, nheads, -1, d])),
-                (ops.reshape()(k, [bs, nheads, -1, d])),
-                (ops.reshape()(v, [bs, nheads, -1, d])),
-            )
-        else:
-            OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale)
-            out = OP(
-                (ops.reshape()(q, [bs * nheads, -1, d])),
-                (ops.reshape()(k, [bs * nheads, -1, d])),
-                (ops.reshape()(v, [bs * nheads, -1, d])),
-            )
-        out = ops.reshape()(out, [bs, -1, nheads * d])
-        proj = self.to_out(out)
-        proj = ops.reshape()(proj, [bs, -1, nheads * d])
-        if residual is not None:
-            return proj + residual
-        else:
-            return proj
-
-
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
-        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
-
-    def forward(self, x):
-        return self.proj(x, self.gate(x))
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = (
-            nn.Sequential(
-                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
-            )
-            if not glu
-            else GEGLU(dim, inner_dim)
-        )
-
-        self.net = nn.Sequential(
-            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x, residual=None):
-        shape = ops.size()(x)
-        x = self.net(x)
-        x = ops.reshape()(x, shape)
-        if residual is not None:
-            return x + residual
-        else:
-            return x
-
-
-class BasicTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        dropout=0.0,
-        context_dim=None,
-        gated_ff=True,
-        checkpoint=True,
-    ):
-        super().__init__()
-        self.attn1 = CrossAttention(
-            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
-        )  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(
-            query_dim=dim,
-            context_dim=context_dim,
-            heads=n_heads,
-            dim_head=d_head,
-            dropout=dropout,
-        )
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
-
-        self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
-
-    def forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), residual=x)
-        x = self.attn2(self.norm2(x), context=context, residual=x)
-        x = self.ff(self.norm3(x), residual=x)
-        return x
-
-
-def Normalize(in_channels):
-    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class SpatialTransformer(nn.Module):
-    """
-    Transformer block for image-like data.
-    First, project the input (aka embedding)
-    and reshape to b, t, d.
-    Then apply standard transformer action.
-    Finally, reshape to image
-    """
-
-    def __init__(
-        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)  # Group Norm
-
-        self.proj_in = nn.Conv2dBias(
-            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
-        )
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock(
-                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
-                )
-                for d in range(depth)
-            ]
-        )
-
-        self.proj_out = nn.Conv2dBias(
-            inner_dim, in_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def forward(self, x, context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        b, h, w, c = get_shape(x)
-        x_in = x
-        x = self.norm(x)
-        x = self.proj_in(x)
-        x = ops.reshape()(x, [b, -1, c])
-        for block in self.transformer_blocks:
-            x = block(x, context=context)
-        x = ops.reshape()(x, [b, h, w, c])
-        x = self.proj_out(x)
-        return x + x_in
-
-
-class CLIPAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        batch_size=1,
-        seq_len=16,
-        layer_norm_eps=1e-5,
-        hidden_dropout_prob=0.0,
-        causal=False,
-        mask_seq=0,
-    ):
-        super().__init__()
-        self.attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=hidden_dropout_prob,
-            has_residual=False,
-            causal=causal,
-            mask_seq=mask_seq,
-        )
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        residual: Optional[Tensor] = None,
-    ):
-        if residual is not None:
-            self_output = self.attn(hidden_states, residual)
-        else:
-            self_output = self.attn(hidden_states)
-        return self_output
-
-
-class QuickGELUActivation(nn.Module):
-    """
-    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
-    """
-
-    def forward(self, x):
-        x1 = x * 1.702
-        x1 = ops.sigmoid(x1)
-        x = x * x1
-        return x
-
-
-class CLIPMLP(nn.Module):
-    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
-
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        act_layer="GELU",
-        drop=0,
-    ):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-
-        self.fc1 = nn.Linear(
-            in_features,
-            hidden_features,
-            specialization="gelu",
-        )
-        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
-
-    def forward(self, x, res):
-        shape = get_shape(x)
-        x = self.fc1(x)
-        x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
-
-
-class CLIPMLPQuickGelu(nn.Module):
-    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
-
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-    ):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-
-        self.fc1 = nn.Linear(
-            in_features,
-            hidden_features,
-        )
-        self.activation_fn = QuickGELUActivation()
-
-        self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
-
-    def forward(self, x, res):
-        shape = get_shape(x)
-        x = self.fc1(x)
-        x = self.activation_fn(x)
-        x = self.fc2(x, res)
-        return ops.reshape()(x, shape)
-
-
-class CLIPEncoderLayer(nn.Module):
-    ACT_LAYER_TO_CLIP_MLP_MAP = {
-        "gelu": CLIPMLP,
-        "quick_gelu": CLIPMLPQuickGelu,
-    }
-
-    def __init__(
-        self,
-        hidden_size=768,
-        num_attention_heads=12,
-        attention_dropout=0.0,
-        mlp_ratio=4.0,
-        batch_size=1,
-        seq_len=16,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.embed_dim = hidden_size
-        self.self_attn = nn.MultiheadAttention(
-            dim=hidden_size,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            num_heads=num_attention_heads,
-            qkv_bias=True,
-            attn_drop=attention_dropout,
-            proj_drop=0,
-            has_residual=True,
-            causal=causal,
-            mask_seq=mask_seq,
-            use_mem_eff=True,
-        )
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
-        self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
-            hidden_size, int(hidden_size * mlp_ratio)
-        )
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        output_attentions: Optional[bool] = False,
-    ):
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-                `(config.encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-
-        hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states, residual)
-
-        residual = hidden_states
-        hidden_states = self.layer_norm2(hidden_states)
-        hidden_states = self.mlp(hidden_states, residual)
-
-        return hidden_states
-
-
-class CLIPEncoder(nn.Module):
-    """
-    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
-    [`CLIPEncoderLayer`].
-    Args:
-        config: CLIPConfig
-    """
-
-    def __init__(
-        self,
-        num_hidden_layers=12,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        hidden_size=768,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [
-                CLIPEncoderLayer(
-                    hidden_size=hidden_size,
-                    num_attention_heads=num_attention_heads,
-                    batch_size=batch_size,
-                    seq_len=seq_len,
-                    causal=causal,
-                    mask_seq=mask_seq,
-                    act_layer=act_layer,
-                )
-                for _ in range(num_hidden_layers)
-            ]
-        )
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        inputs_embeds,
-        attention_mask: Optional[Tensor] = None,
-        causal_attention_mask: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Args:
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Causal mask for the text model. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        encoder_states = () if output_hidden_states else None
-        # all_attentions = () if output_attentions else None
-
-        hidden_states = inputs_embeds
-        for _, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            layer_outputs = encoder_layer(hidden_states)
-            hidden_states = layer_outputs
-
-        return hidden_states
-
-
-class CLIPTextEmbeddings(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        vocab_size=49408,
-        max_position_embeddings=77,
-        dtype="float16",
-    ):
-        super().__init__()
-        embed_dim = hidden_size
-
-        self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype)
-        self.position_embedding = nn.Embedding(
-            shape=[max_position_embeddings, embed_dim], dtype=dtype
-        )
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        inputs_embeds: Optional[Tensor] = None,
-    ) -> Tensor:
-
-        input_shape = ops.size()(input_ids)
-
-        # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-
-        position_ids = ops.reshape()(position_ids, [-1])
-
-        if inputs_embeds is None:
-            inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids)
-
-        position_embeddings = ops.batch_gather()(
-            self.position_embedding.tensor(), position_ids
-        )
-
-        embeddings = inputs_embeds + position_embeddings
-
-        embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1])
-
-        return embeddings
-
-
-class CLIPTextTransformer(nn.Module):
-    def __init__(
-        self,
-        hidden_size=768,
-        output_attentions=False,
-        output_hidden_states=False,
-        use_return_dict=False,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        batch_size=1,
-        seq_len=64,
-        causal=False,
-        mask_seq=0,
-        act_layer="gelu",
-    ):
-        super().__init__()
-        self.embeddings = CLIPTextEmbeddings(hidden_size=hidden_size)
-        self.encoder = CLIPEncoder(
-            num_hidden_layers=num_hidden_layers,
-            hidden_size=hidden_size,
-            num_attention_heads=num_attention_heads,
-            batch_size=batch_size,
-            seq_len=seq_len,
-            causal=causal,
-            mask_seq=mask_seq,
-            act_layer=act_layer,
-        )
-        self.final_layer_norm = nn.LayerNorm(hidden_size)
-
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.use_return_dict = use_return_dict
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        position_ids: Optional[Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        r"""
-        Returns:
-        """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        if input_ids is None:
-            raise ValueError("You have to specify either input_ids")
-
-        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
-
-        encoder_outputs = self.encoder(
-            inputs_embeds=hidden_states,
-        )
-
-        last_hidden_state = encoder_outputs
-        last_hidden_state = self.final_layer_norm(last_hidden_state)
-        return last_hidden_state
diff --git a/python/aitemplate/frontend/nn/ldm/embeddings.py b/python/aitemplate/frontend/nn/ldm/embeddings.py
deleted file mode 100644
index 20519e661..000000000
--- a/python/aitemplate/frontend/nn/ldm/embeddings.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import math
-
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
-        super().__init__()
-
-        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
-
-    def forward(self, sample):
-        sample = self.linear_1(sample)
-        sample = self.linear_2(sample)
-        return sample
-
-
-class Timesteps(nn.Module):
-    def __init__(
-        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
-    ):
-        super().__init__()
-        self.num_channels = num_channels
-        self.flip_sin_to_cos = flip_sin_to_cos
-        self.downscale_freq_shift = downscale_freq_shift
-        self.scale = 1
-        self.max_period = 10000
-        half_dim = self.num_channels // 2
-        self.arange = nn.Parameter(shape=[half_dim], dtype="float16", name="arange")
-
-    def forward(self, timesteps):
-        assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array"
-
-        half_dim = self.num_channels // 2
-
-        exponent = (-math.log(self.max_period)) * self.arange.tensor()
-        exponent = exponent * (1.0 / (half_dim - self.downscale_freq_shift))
-
-        emb = ops.exp(exponent)
-        emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1])
-
-        # scale embeddings
-        emb = self.scale * emb
-
-        # concat sine and cosine embeddings
-        if self.flip_sin_to_cos:
-            emb = ops.concatenate()(
-                [ops.cos(emb), ops.sin(emb)],
-                dim=-1,
-            )
-        else:
-            emb = ops.concatenate()(
-                [ops.sin(emb), ops.cos(emb)],
-                dim=-1,
-            )
-        return emb
diff --git a/python/aitemplate/frontend/nn/ldm/resnet.py b/python/aitemplate/frontend/nn/ldm/resnet.py
deleted file mode 100644
index 03e4f8023..000000000
--- a/python/aitemplate/frontend/nn/ldm/resnet.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from aitemplate.compiler import ops
-from aitemplate.frontend import nn
-
-
-def get_shape(x):
-    shape = [it.value() for it in x._attrs["shape"]]
-    return shape
-
-
-class Upsample2D(nn.Module):
-    """
-    An upsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 upsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self,
-        channels,
-        use_conv=False,
-        use_conv_transpose=False,
-        out_channels=None,
-        name="conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-
-        conv = None
-        if use_conv_transpose:
-            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.conv = conv
-        else:
-            self.Conv2d_0 = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(x)
-
-        x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if self.use_conv:
-            if self.name == "conv":
-                x = self.conv(x)
-            else:
-                x = self.Conv2d_0(x)
-
-        return x
-
-
-class Downsample2D(nn.Module):
-    """
-    A downsampling layer with an optional convolution.
-
-    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 downsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(
-        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-
-        if use_conv:
-            conv = nn.Conv2dBias(
-                self.channels, self.out_channels, 3, stride=stride, padding=padding
-            )
-        else:
-            assert self.channels == self.out_channels
-            conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0)
-
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.Conv2d_0 = conv
-            self.conv = conv
-        elif name == "Conv2d_0":
-            self.conv = conv
-        else:
-            self.conv = conv
-
-    def forward(self, x):
-        assert get_shape(x)[-1] == self.channels
-        x = self.conv(x)
-
-        return x
-
-
-class ResnetBlock2D(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout=0.0,
-        temb_channels=512,
-        groups=32,
-        groups_out=None,
-        pre_norm=True,
-        eps=1e-6,
-        non_linearity="swish",
-        time_embedding_norm="default",
-        kernel=None,
-        output_scale_factor=1.0,
-        use_nin_shortcut=None,
-        up=False,
-        down=False,
-    ):
-        super().__init__()
-        self.pre_norm = pre_norm
-        self.pre_norm = True
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-        self.time_embedding_norm = time_embedding_norm
-        self.up = up
-        self.down = down
-        self.output_scale_factor = output_scale_factor
-
-        if groups_out is None:
-            groups_out = groups
-
-        self.norm1 = nn.GroupNorm(
-            num_groups=groups,
-            num_channels=in_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-
-        self.conv1 = nn.Conv2dBias(
-            in_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        if temb_channels is not None:
-            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
-        else:
-            self.time_emb_proj = None
-
-        self.norm2 = nn.GroupNorm(
-            num_groups=groups_out,
-            num_channels=out_channels,
-            eps=eps,
-            affine=True,
-            use_swish=True,
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.conv2 = nn.Conv2dBias(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-
-        self.upsample = self.downsample = None
-
-        self.use_nin_shortcut = (
-            self.in_channels != self.out_channels
-            if use_nin_shortcut is None
-            else use_nin_shortcut
-        )
-
-        if self.use_nin_shortcut:
-            self.conv_shortcut = nn.Conv2dBias(
-                in_channels, out_channels, 1, 1, 0
-            )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
-        else:
-            self.conv_shortcut = None
-
-    def forward(self, x, temb=None):
-        hidden_states = x
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm1(
-            hidden_states
-        )  # .float()).type(hidden_states.dtype) # fused swish
-        # hidden_states = self.nonlinearity(hidden_states)
-
-        if self.upsample is not None:
-            x = self.upsample(x)
-            hidden_states = self.upsample(hidden_states)
-        elif self.downsample is not None:
-            x = self.downsample(x)
-            hidden_states = self.downsample(hidden_states)
-
-        hidden_states = self.conv1(hidden_states)
-
-        if temb is not None:
-            temb = self.time_emb_proj(ops.silu(temb))
-            bs, dim = get_shape(temb)
-            temb = ops.reshape()(temb, [bs, 1, 1, dim])
-            hidden_states = hidden_states + temb
-
-        # make sure hidden states is in float32
-        # when running in half-precision
-        hidden_states = self.norm2(hidden_states)
-
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.conv_shortcut is not None:
-            x = self.conv_shortcut(x)
-
-        out = hidden_states + x
-
-        return out
diff --git a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py b/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
deleted file mode 100644
index 770156ff9..000000000
--- a/python/aitemplate/frontend/nn/ldm/unet_2d_condition.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-from typing import Optional, Tuple, Union
-
-from aitemplate.frontend import nn
-
-from .embeddings import TimestepEmbedding, Timesteps
-from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
-
-
-class UNet2DConditionModel(nn.Module):
-    r"""
-    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
-    and returns sample shaped output.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
-    implements for all the model (such as downloading or saving, etc.)
-
-    Parameters:
-        sample_size (`int`, *optional*): The size of the input sample.
-        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
-        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
-            The tuple of upsample blocks to use.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
-        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
-    """
-
-    def __init__(
-        self,
-        sample_size: Optional[int] = None,
-        in_channels: int = 4,
-        out_channels: int = 4,
-        center_input_sample: bool = False,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        up_block_types: Tuple[str] = (
-            "UpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-        ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: int = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 1280,
-        attention_head_dim: Union[int, Tuple[int]] = 8,
-    ):
-        super().__init__()
-        self.center_input_sample = center_input_sample
-        self.sample_size = sample_size
-        time_embed_dim = block_out_channels[0] * 4
-
-        # input
-        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
-        # time
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-        timestep_input_dim = block_out_channels[0]
-
-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
-
-        self.down_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
-        # down
-        output_channel = block_out_channels[0]
-        for i, down_block_type in enumerate(down_block_types):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final_block = i == len(block_out_channels) - 1
-
-            down_block = get_down_block(
-                down_block_type,
-                num_layers=layers_per_block,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                temb_channels=time_embed_dim,
-                add_downsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim[i],
-                downsample_padding=downsample_padding,
-            )
-            self.down_blocks.append(down_block)
-
-        # mid
-        self.mid_block = UNetMidBlock2DCrossAttn(
-            in_channels=block_out_channels[-1],
-            temb_channels=time_embed_dim,
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            output_scale_factor=mid_block_scale_factor,
-            resnet_time_scale_shift="default",
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim[-1],
-            resnet_groups=norm_num_groups,
-        )
-
-        # up
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        reversed_attention_head_dim = list(reversed(attention_head_dim))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[
-                min(i + 1, len(block_out_channels) - 1)
-            ]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=layers_per_block + 1,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                prev_output_channel=prev_output_channel,
-                temb_channels=time_embed_dim,
-                add_upsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=reversed_attention_head_dim[i],
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=norm_num_groups,
-            eps=norm_eps,
-            use_swish=True,
-        )
-
-        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
-
-    def forward(
-        self,
-        sample,
-        timesteps,
-        encoder_hidden_states,
-        return_dict: bool = True,
-    ):
-        """r
-        Args:
-            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
-            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is the sample tensor.
-        """
-
-        # 1. time
-        t_emb = self.time_proj(timesteps)
-        emb = self.time_embedding(t_emb)
-
-        # 2. pre-process
-        sample = self.conv_in(sample)
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if (
-                hasattr(downsample_block, "attentions")
-                and downsample_block.attentions is not None
-            ):
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-            down_block_res_samples += res_samples
-
-        # 4. mid
-        sample = self.mid_block(
-            sample, emb, encoder_hidden_states=encoder_hidden_states
-        )
-
-        # 5. up
-        for upsample_block in self.up_blocks:
-            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-            down_block_res_samples = down_block_res_samples[
-                : -len(upsample_block.resnets)
-            ]
-
-            if (
-                hasattr(upsample_block, "attentions")
-                and upsample_block.attentions is not None
-            ):
-                sample = upsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    res_hidden_states_tuple=res_samples,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample = upsample_block(
-                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
-                )
-
-        # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-        return sample
diff --git a/python/aitemplate/frontend/nn/ldm/unet_blocks.py b/python/aitemplate/frontend/nn/ldm/unet_blocks.py
deleted file mode 100644
index 7b6e3e6e6..000000000
--- a/python/aitemplate/frontend/nn/ldm/unet_blocks.py
+++ /dev/null
@@ -1,762 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-# flake8: noqa
-from aitemplate.compiler import ops
-
-from aitemplate.frontend import nn, Tensor
-from aitemplate.testing import detect_target
-
-from .attention import AttentionBlock
-
-from .clip import SpatialTransformer
-from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
-
-# pylint: disable=W0102
-
-
-def get_down_block(
-    down_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    temb_channels,
-    add_downsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-    downsample_padding=None,
-):
-    down_block_type = (
-        down_block_type[7:]
-        if down_block_type.startswith("UNetRes")
-        else down_block_type
-    )
-    if down_block_type == "DownBlock2D":
-        return DownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnDownBlock2D":
-        return AttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "CrossAttnDownBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
-            )
-        return CrossAttnDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "SkipDownBlock2D":
-        return SkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-    elif down_block_type == "AttnSkipDownBlock2D":
-        return AttnSkipDownBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif down_block_type == "DownEncoderBlock2D":
-        return DownEncoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            downsample_padding=downsample_padding,
-        )
-
-
-def get_up_block(
-    up_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    prev_output_channel,
-    temb_channels,
-    add_upsample,
-    resnet_eps,
-    resnet_act_fn,
-    attn_num_head_channels,
-    cross_attention_dim=None,
-):
-    up_block_type = (
-        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
-    )
-    if up_block_type == "UpBlock2D":
-        return UpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "CrossAttnUpBlock2D":
-        if cross_attention_dim is None:
-            raise ValueError(
-                "cross_attention_dim must be specified for CrossAttnUpBlock2D"
-            )
-        return CrossAttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "AttnUpBlock2D":
-        return AttnUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "SkipUpBlock2D":
-        return SkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    elif up_block_type == "AttnSkipUpBlock2D":
-        return AttnSkipUpBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            attn_num_head_channels=attn_num_head_channels,
-        )
-    elif up_block_type == "UpDecoderBlock2D":
-        return UpDecoderBlock2D(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-        )
-    raise ValueError(f"{up_block_type} does not exist.")
-
-
-class UNetMidBlock2DCrossAttn(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        cross_attention_dim=1280,
-        **kwargs,
-    ):
-        super().__init__()
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                SpatialTransformer(
-                    in_channels,
-                    attn_num_head_channels,
-                    in_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states, encoder_hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
-
-
-class CrossAttnDownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_downsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
-        output_states = ()
-
-        for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class DownBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_downsample=True,
-        downsample_padding=1,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        in_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-    def forward(self, hidden_states, temb=None):
-        output_states = ()
-
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
-            output_states += (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states += (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class CrossAttnUpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        prev_output_channel: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        cross_attention_dim=1280,
-        attention_type="default",
-        output_scale_factor=1.0,
-        downsample_padding=1,
-        add_upsample=True,
-    ):
-        super().__init__()
-
-        resnets = []
-        attentions = []
-
-        self.attention_type = attention_type
-        self.attn_num_head_channels = attn_num_head_channels
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            attentions.append(
-                SpatialTransformer(
-                    out_channels,
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                )
-            )
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(
-        self,
-        hidden_states,
-        res_hidden_states_tuple,
-        temb=None,
-        encoder_hidden_states=None,
-    ):
-        for resnet, attn in zip(self.resnets, self.attentions):
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb=temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        prev_output_channel: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
-        for resnet in self.resnets:
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = ops.concatenate()(
-                [hidden_states, res_hidden_states], dim=-1
-            )
-
-            hidden_states = resnet(hidden_states, temb)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UpDecoderBlock2D(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor=1.0,
-        add_upsample=True,
-    ):
-        super().__init__()
-        resnets = []
-
-        for i in range(num_layers):
-            input_channels = in_channels if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=input_channels,
-                    out_channels=out_channels,
-                    temb_channels=None,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
-            )
-        else:
-            self.upsamplers = None
-
-    def forward(self, hidden_states):
-        for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
-
-        return hidden_states
-
-
-class UNetMidBlock2D(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        attn_num_head_channels=1,
-        attention_type="default",
-        output_scale_factor=1.0,
-        **kwargs,
-    ):
-        super().__init__()
-
-        if attention_type != "default":
-            raise NotImplementedError(
-                f"attention_type must be default! current value: {attention_type}"
-            )
-
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-
-        for _ in range(num_layers):
-            attentions.append(
-                AttentionBlock(
-                    batch_size,
-                    height,
-                    width,
-                    in_channels,
-                    num_head_channels=attn_num_head_channels,
-                    rescale_output_factor=output_scale_factor,
-                    eps=resnet_eps,
-                    num_groups=resnet_groups,
-                )
-            )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-
-    def forward(self, hidden_states, temb=None, encoder_states=None):
-        hidden_states = self.resnets[0](hidden_states, temb)
-        for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states)
-            hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
diff --git a/python/aitemplate/frontend/nn/ldm/vae.py b/python/aitemplate/frontend/nn/ldm/vae.py
deleted file mode 100644
index 1cd25aa19..000000000
--- a/python/aitemplate/frontend/nn/ldm/vae.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""
-Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py.
-"""
-
-from typing import Tuple
-
-from aitemplate.frontend import nn, Tensor
-
-from .unet_blocks import get_up_block, UNetMidBlock2D
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        batch_size,
-        height,
-        width,
-        in_channels=3,
-        out_channels=3,
-        up_block_types=("UpDecoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        act_fn="silu",
-    ):
-        super().__init__()
-        self.layers_per_block = layers_per_block
-
-        self.conv_in = nn.Conv2dBias(
-            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
-        )
-
-        # mid
-        self.mid_block = UNetMidBlock2D(
-            batch_size,
-            height,
-            width,
-            in_channels=block_out_channels[-1],
-            resnet_eps=1e-6,
-            resnet_act_fn=act_fn,
-            output_scale_factor=1,
-            resnet_time_scale_shift="default",
-            attn_num_head_channels=None,
-            resnet_groups=32,
-            temb_channels=None,
-        )
-
-        # up
-        self.up_blocks = nn.ModuleList([])
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=self.layers_per_block + 1,
-                in_channels=prev_output_channel,
-                out_channels=output_channel,
-                prev_output_channel=None,
-                add_upsample=not is_final_block,
-                resnet_eps=1e-6,
-                resnet_act_fn=act_fn,
-                attn_num_head_channels=None,
-                temb_channels=None,
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        num_groups_out = 32
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=num_groups_out,
-            eps=1e-6,
-            use_swish=True,
-        )
-        self.conv_out = nn.Conv2dBias(
-            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
-        )
-
-    def forward(self, z) -> Tensor:
-        sample = z
-        sample = self.conv_in(sample)
-
-        # middle
-        sample = self.mid_block(sample)
-
-        # up
-        for up_block in self.up_blocks:
-            sample = up_block(sample)
-
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-
-        return sample
-
-
-class AutoencoderKL(nn.Module):
-    def __init__(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        in_channels: int = 3,
-        out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
-        layers_per_block: int = 1,
-        act_fn: str = "silu",
-        latent_channels: int = 4,
-        sample_size: int = 32,
-    ):
-        super().__init__()
-        self.decoder = Decoder(
-            batch_size,
-            height,
-            width,
-            in_channels=latent_channels,
-            out_channels=out_channels,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-        )
-        self.post_quant_conv = nn.Conv2dBias(
-            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def decode(self, z: Tensor, return_dict: bool = True):
-
-        z = self.post_quant_conv(z)
-        dec = self.decoder(z)
-        return dec
-
-    def forward(self):
-        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")

From 693f4b9a32d765b85c49fdc5b4bbcbe2f880a7eb Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Wed, 3 May 2023 10:37:10 -0700
Subject: [PATCH 471/638] Remove manually set Parameter names (#638)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/638

Remove manually set `Parameter` names and rely on `.name_parameter_tensor()` to set parameter names instead

Reviewed By: jingsh

Differential Revision: D45435099

fbshipit-source-id: 3d46b5abf647ca93dcae09ad39f23fc4aa9e6c35
---
 python/aitemplate/frontend/nn/batch_norm.py | 10 ++++++----
 python/aitemplate/frontend/nn/conv3d.py     |  2 +-
 tests/unittest/ops/test_batch_norm.py       | 10 +++++-----
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/python/aitemplate/frontend/nn/batch_norm.py b/python/aitemplate/frontend/nn/batch_norm.py
index d2440f178..68e5dc36c 100644
--- a/python/aitemplate/frontend/nn/batch_norm.py
+++ b/python/aitemplate/frontend/nn/batch_norm.py
@@ -35,10 +35,12 @@ def __init__(
         self.dtype = dtype
         self.num_features = num_features
         self.eps = eps
-        self.weight = Parameter(shape=self.dim, name="weight", dtype=dtype)
-        self.bias = Parameter(shape=self.dim, name="bias", dtype=dtype)
-        self.running_mean = Parameter(shape=self.dim, name="running_mean", dtype=dtype)
-        self.running_var = Parameter(shape=self.dim, name="running_var", dtype=dtype)
+        self.weight = Parameter(shape=self.dim, dtype=dtype)
+        self.bias = Parameter(shape=self.dim, dtype=dtype)
+        self.running_mean = Parameter(shape=self.dim, dtype=dtype)
+        self.running_var = Parameter(shape=self.dim, dtype=dtype)
+        # Placeholder for setting constants, won't be used
+        self.num_batches_tracked = Parameter(shape=[], value=0, dtype=dtype)
 
     def forward(self, *args):
         assert len(args) == 1
diff --git a/python/aitemplate/frontend/nn/conv3d.py b/python/aitemplate/frontend/nn/conv3d.py
index ed271d0a4..ea4256e46 100644
--- a/python/aitemplate/frontend/nn/conv3d.py
+++ b/python/aitemplate/frontend/nn/conv3d.py
@@ -104,7 +104,7 @@ def __init__(
             dtype=dtype,
         )
         if self.has_bias:
-            self.bias = Parameter(shape=[out_channels], dtype=dtype, name="bias")
+            self.bias = Parameter(shape=[out_channels], dtype=dtype)
 
         if groups == 1:
             if self.has_bias:
diff --git a/tests/unittest/ops/test_batch_norm.py b/tests/unittest/ops/test_batch_norm.py
index 81c341844..76633b201 100644
--- a/tests/unittest/ops/test_batch_norm.py
+++ b/tests/unittest/ops/test_batch_norm.py
@@ -44,6 +44,7 @@ def _test_batchnorm(
     ):
         pt_op = getattr(torch.nn, bn_op)(num_features).cuda().half().eval()
         ait_op = getattr(batch_norm, bn_op)(num_features, eps=pt_op.eps)
+        ait_op.name_parameter_tensor()
 
         pt_params = dict(pt_op.named_parameters())
         pt_buffers = dict(pt_op.named_buffers())
@@ -53,8 +54,7 @@ def _test_batchnorm(
             params_ait[key] = arr
         for key, arr in pt_buffers.items():
             print(key, arr.shape)
-            if key in ["running_mean", "running_var"]:
-                params_ait[key] = arr
+            params_ait[key] = arr
 
         X_pt = get_random_torch_tensor(input_shape, input_type)
         Y_pt = pt_op(X_pt)
@@ -70,9 +70,9 @@ def _test_batchnorm(
         Y_ait._attrs["name"] = "output"
 
         target = detect_target()
-        module = compile_model(Y_ait, target, "./tmp", f"batch_norm_{self.test_id}")
-        for name, weight in params_ait.items():
-            module.set_constant_with_tensor(name, weight)
+        module = compile_model(
+            Y_ait, target, "./tmp", f"batch_norm_{self.test_id}", constants=params_ait
+        )
 
         y = get_torch_empty_tensor(Ys_ait, dtype=input_type)
         inputs = {"input0": X_pt}

From bc16a15d281f3ec0c85d5a7d53d360931ae3adf2 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 3 May 2023 20:42:01 -0700
Subject: [PATCH 472/638] classic_b2b_bmm: add multi-head and strides (#625)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/625

The classic b2b bmm op was missing support for multi-head and dimension reordering through explicit strides. This diff ports the changes Rahul made in his private codebase ( D43946388 ) into AIT and adapts the classic_b2b_bmm op to support multihead input/output as well as strided inputs.

**Note**: There seem to be **preexisting issues with the classic b2b bmm op** which this diff does not address yet. I noticed them while developing this diff. See D45308351 for details and reproduction tests.

Reviewed By: ipiszy

Differential Revision: D45049609

fbshipit-source-id: 7b0b08d703cba905519066e12f4f591c1daa9bb6
---
 .../backend/cuda/b2b_bmm/classic_b2b_bmm.py   | 126 +++++++--
 .../compiler/ops/b2b_bmm/classic_b2b_bmm.py   | 121 ++++++--
 .../classic_b2b_bmm/device/b2b_batched_gemm.h |  78 ++++--
 .../classic_b2b_bmm/kernel/b2b_batched_gemm.h |  76 +++--
 tests/unittest/ops/test_b2b_bmm.py            | 262 +++++++++++++++++-
 5 files changed, 552 insertions(+), 111 deletions(-)

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
index bc1aab11b..73937c20f 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
@@ -113,9 +113,9 @@
     >;
 
   using B2bGemmBatched = cutlass::gemm::device::B2bGemmBatched<
-    cutlass::half_t,
+    ElementCompute,
     cutlass::layout::RowMajor,
-    cutlass::half_t,
+    ElementCompute,
     cutlass::layout::ColumnMajor,
     cutlass::layout::RowMajor,
     ElementOutput,
@@ -137,24 +137,51 @@
 
   cutlass::gemm::GemmCoord problem_size_0(m0, {{n0}}, k0);
   cutlass::gemm::GemmCoord problem_size_1(m0, {{n1}}, {{n0}});
+
+  // Assuming BMHD dim ordering for inputs and outputs, like in FHMA style op
+  // B = batch size
+  // M = sequence len
+  // H = num heads
+  // D = embedding dims per head
+  // --- Tensor shapes:
+  // GEMM PROBLEM 0:
+  // A=query : [ batch_size, M0, num_heads, K0 ]
+  // B=key : [ batch_size, N0, num_heads, K0 ]
+  // C0=bias : [ batch_size, num_heads, M0, N0 ] # Where the batch size, head and M0 dimension may be broadcasted over
+  // GEMM PROBLEM 1:
+  // B1=value : [ batch_size, K1==N0, num_heads, N1 ]
+  // C1=unused:  [ N1 ]
+  // D1=output : [ batch_size, M1==M0, num_heads, N1 ]
+
+  // Required equalities for B2B gemm:
+  // M1 = M0;
+  // K1 = N0;
+
   typename B2bGemmBatched::Arguments arguments{
-    problem_size_0,
-    problem_size_1,
-    {static_cast<ElementCompute*>(query), typename B2bGemmBatched::LayoutA::Stride(problem_size_0.k())},
-    problem_size_0.m() * problem_size_0.k(),
-    {static_cast<ElementCompute*>(key), typename B2bGemmBatched::LayoutB::Stride(problem_size_0.k())},
-    problem_size_0.n() * problem_size_0.k(),
-    {static_cast<ElementCompute*>(bias), typename B2bGemmBatched::LayoutC::Stride(problem_size_0.n())},
-    problem_size_0.m() * problem_size_0.n(),
-    {static_cast<ElementCompute*>(value), typename B2bGemmBatched::LayoutB1::Stride(problem_size_1.n())},
-    problem_size_1.n() * problem_size_1.k(),
-    {static_cast<ElementCompute*>(nullptr), typename B2bGemmBatched::LayoutScaleBias::Stride(0)},
-    0,
-    {static_cast<ElementOutput*>(output), typename B2bGemmBatched::LayoutC::Stride(problem_size_1.n())},
-    problem_size_1.m() * problem_size_1.n(),
-    batch_size,
-    {alpha0, beta0, activation_alpha},
-    {alpha1, beta1},
+    problem_size_0, // = GemmCoord problem_size_0;
+    problem_size_1, // = GemmCoord problem_size_1;
+    {static_cast<ElementCompute*>(query), typename B2bGemmBatched::LayoutA::Stride(num_heads * problem_size_0.k())},    // TensorRef<ElementA const, LayoutA> ref_A0;
+    problem_size_0.k(),                                                                                                 // int64_t head_stride_A0;
+    num_heads * problem_size_0.m() * problem_size_0.k(),                                                                // int64_t batch_stride_A0;
+    {static_cast<ElementCompute*>(key), typename B2bGemmBatched::LayoutB::Stride(num_heads * problem_size_0.k())},      // TensorRef<ElementB const, LayoutB> ref_B0;
+    problem_size_0.k(),                                                                                                 // int64_t head_stride_B0;
+    num_heads * problem_size_0.n() * problem_size_0.k(),                                                                // int64_t batch_stride_B0;
+    {static_cast<ElementCompute*>(bias), typename B2bGemmBatched::LayoutC::Stride({{bias_stride_n}})},                  // TensorRef<ElementC const, LayoutC> ref_C0;
+    {{bias_stride_mn}},                                                                                                 // int64_t head_stride_C0;
+    {{bias_stride_hmn}},                                                                                                // int64_t batch_stride_C0;
+    {static_cast<ElementCompute*>(value), typename B2bGemmBatched::LayoutB1::Stride(num_heads * problem_size_1.n())},   // TensorRef<ElementC const, LayoutC> ref_B1;
+    problem_size_1.n(),                                                                                                 // int64_t head_stride_B1;                                                                    //
+    num_heads * problem_size_1.n() * problem_size_1.k(),                                                                // int64_t batch_stride_B1;
+    {static_cast<ElementCompute*>(nullptr), typename B2bGemmBatched::LayoutScaleBias::Stride(0)},                       // Not used due to ScaleType::Nothing for output op 1
+    0,                                                                                                                  // not used: int64_t head_stride_C1;
+    0,                                                                                                                  // not used: int64_t batch_stride_C1;
+    {static_cast<ElementOutput*>(output), typename B2bGemmBatched::LayoutC::Stride(num_heads * problem_size_1.n())},    // TensorRef<ElementC, LayoutC> ref_D1;
+    problem_size_1.n(),                                                                                                 // int64_t head_stride_output;
+    num_heads * problem_size_1.m() * problem_size_1.n(),                                                                // int64_t batch_stride_output;
+    batch_size,                                                                                                         // int batch_count;
+    num_heads,                                                                                                          // int num_heads
+    {alpha0, beta0, activation_alpha},                                                                                  // typename EpilogueOutputOp0::Params epilogue0;
+    {alpha1, beta1},                                                                                                    // typename EpilogueOutputOp1::Params epilogue1;
   };
 
   B2bGemmBatched b2b_gemm_op;
@@ -186,6 +213,7 @@
                    void* value,
                    void* bias,
                    int64_t batch_size,
+                   int64_t num_heads,
                    int64_t m0,
                    int64_t k0,
                    cudaStream_t stream)
@@ -204,6 +232,7 @@
 {{indent}}    {{output}},
 {{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
 {{indent}}    {{batch_size}},
+{{indent}}    {{num_heads}},
 {{indent}}    {{m0}},
 {{indent}}    {{k0}},
 {{indent}}    stream /* default stream */
@@ -216,13 +245,18 @@
 def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
     """the function for generating attention kernel"""
     q, k, v, bias = func_attrs["inputs"]
-    n0 = k._attrs["shape"][1]
-    n1 = v._attrs["shape"][2]
+    seq_len_dim = 1
+    n0 = k._attrs["shape"][seq_len_dim]
+    n1 = v._attrs["shape"][-1]
     if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
         raise RuntimeError(
             f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
         )
     backend_spec = CUDASpec()
+    if func_attrs["inputs"][0]._attrs["dtype"] != "float16":
+        raise NotImplementedError(
+            "only float16 dtype supported for now in classic_b2b_bmm op"
+        )
     elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
@@ -243,6 +277,37 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         cutlass_lib.library.EpilogueMathName[func_attrs["epilogue_math_name"]]
     ]
 
+    bias_shape = bias._attrs["shape"]
+    bias_broadcast = [s == IntImm(1) for s in bias_shape]
+    if len(bias_broadcast) == 3:
+        # single head case: Add num heads dimension of size 1
+        bias_broadcast = [bias_broadcast[0], True, bias_broadcast[1], bias_broadcast[2]]
+    assert (
+        len(bias_broadcast) == 4
+    ), f"Bias shape should be of length 4, got {len(bias_broadcast)=}"
+
+    # Calculate stride expressions for bias tensor
+    # Last dimension of bias has implicit stride of 1,
+    # so cannot be broadcasted over
+    bias_stride_n = "problem_size_0.n()"
+    bias_shape_expr = [bias_stride_n]
+
+    # build stride expressions
+    if not bias_broadcast[-2]:
+        bias_shape_expr.append("problem_size_0.m()")
+    bias_stride_mn = "*".join(bias_shape_expr)
+    if not bias_broadcast[-3]:
+        bias_shape_expr.append("num_heads")
+    bias_stride_hmn = "*".join(bias_shape_expr)  # batch stride
+
+    # Strides for broadcasted dimensions are zero
+    if bias_broadcast[0]:  # query sequence len stride
+        bias_stride_hmn = "0"
+    if bias_broadcast[1]:  # head stride
+        bias_stride_mn = "0"
+    if bias_broadcast[2]:  # query sequence length stride
+        bias_stride_n = "0"
+
     return FUNC_TEMPLATE.render(
         func_name=func_attrs["name"],
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
@@ -260,6 +325,9 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         if func_attrs["alpha1_divide_by_seq_len"]
         else "false",
         epilogue_math=epilogue_math,
+        bias_stride_n=bias_stride_n,
+        bias_stride_mn=bias_stride_mn,
+        bias_stride_hmn=bias_stride_hmn,
     )
 
 
@@ -283,10 +351,19 @@ def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
     bias_name = func_attrs["inputs"][3]._attrs["name"]
 
     q_shape = func_attrs["inputs"][0]._attrs["shape"]
-    batch_size = q_shape[0]._attrs["name"]
-    m0 = q_shape[1]._attrs["name"]
-    k0 = q_shape[2]._attrs["name"]
 
+    batch_size = q_shape[0]._attrs["name"]
+    seq_len_dim = 1
+    head_dim = -2
+    m0 = q_shape[seq_len_dim]._attrs["name"]
+
+    if len(q_shape) == 3:
+        # single head case
+        k0 = q_shape[2]._attrs["name"]
+        num_heads = "1"
+    elif len(q_shape) == 4:
+        k0 = q_shape[3]._attrs["name"]
+        num_heads = q_shape[head_dim]._attrs["name"]
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         output=output_name,
@@ -295,6 +372,7 @@ def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
         value=v_name,
         bias=bias_name,
         batch_size=batch_size,
+        num_heads=num_heads,
         m0=m0,
         k0=k0,
         indent=indent,
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
index 154af58a0..fb1a93129 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/classic_b2b_bmm.py
@@ -17,18 +17,33 @@
 Back-to-back batched gemm fused kernel.
 Computes bmm(causal_mask(alpha1 * (activation(alpha0 * bmm(Q, K) + bias))), V),
 
-where:
+Notation:
+B: batch size
+H: number of heads
+
+If inputs/outputs have three dims ( singlehead case ):
 Q: [B, M0, K0] (row_major),
 K: [B, N0, K0] (column_major),
 V: [B, N0, N1] (row_major),
 bias: [B, M0, N0] (row_major).
-Layouts are fixed for now.
+output: [ B, M0, N1 ]
+
+If inputs/outputs have four dims ( multihead case ),
+the head dim is located at the dimension with index 2
+
+dimension order of the parameters is
 
-Only supports NO_CAUSAL or LOWER_LEFT_EMPTY for now.
+Q: [B, M0, H, K0] (row_major),
+K: [B, N0, H, K0] (column_major),
+V: [B, N0, H, N1] (row_major),
+bias: [B, H, M0, N0] (row_major).
+Output: [ B, M0, H, N1 ]
+
+Only supports NO_CAUSAL or LOWER_LEFT_EMPTY causal mask types.
 When causal_mask is enabled, M0 must be equal to N0.
 
 Internally, it stores the results of Q@K in registers without writing them to shared memory, which is faster.
-However, N0 / N1 must be <= 512.
+However, N0 and N1 must be <= 512.
 """
 
 from aitemplate.backend import registry, target
@@ -37,8 +52,6 @@
 
 
 class classic_b2b_bmm(b2b_bmm_base):
-    """See comments at the head of this file."""
-
     def __init__(
         self,
         causal_type: CausalType,
@@ -47,9 +60,20 @@ def __init__(
         alpha1: float,
         alpha1_divide_by_seq_len: bool = False,
     ) -> None:
-        """Initialize classic_b2b_bmm op.
-        Check aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base for more details
-        about these args.
+        r"""Back-to-back batched gemm fused kernels.
+
+        More detailed documentation at the top of this file.
+
+        Args:
+        * causal_type (CausalType): Type of causal_mask. See comments above.
+        * epilogue_math_name (str): Name of the activation function.
+        Supported epilogue functions can be found from
+        python/aitemplate/utils/mk_cutlass_lib/extra_enum.py.
+        * alpha0 (float): See the math function above.
+        * alpha1 (float): See the math function above.
+        * alpha1_divide_by_seq_len (bool) Whether divide alpha1 by seq_len.
+        Useful when seq_len is a dynamic value so that alpah1 cannot be
+        computed in advance.
         """
         super().__init__(
             causal_type, epilogue_math_name, alpha0, alpha1, alpha1_divide_by_seq_len
@@ -69,32 +93,35 @@ def _infer_shapes(self):
         q_shape = q._attrs["shape"]
         k_shape = k._attrs["shape"]
         v_shape = v._attrs["shape"]
+        head_dim = 2
+        seq_dim = 1
         if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
             raise RuntimeError(
                 f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        if len(q_shape) != 3:
+        if len(q_shape) != 3 and len(k_shape) != 4:
             raise RuntimeError(
-                f"QKV must have rank == 3! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+                f"QKV must have rank 3 or 4! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
 
         if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
             raise RuntimeError(
                 f"QKV must have same batch size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
+
         batch_size = q_shape[0]
-        M0 = q_shape[1]
-        K0 = q_shape[2]
-        if K0 != k_shape[2]:
+        M0 = q_shape[seq_dim]
+        K0 = q_shape[-1]
+        if K0 != k_shape[-1]:
             raise RuntimeError(
                 f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        N0 = k_shape[1]
-        if N0 != v_shape[1]:
+        N0 = k_shape[seq_dim]
+        if N0 != v_shape[seq_dim]:
             raise RuntimeError(
                 f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        N1 = v_shape[2]
+        N1 = v_shape[-1]
         if N0.upper_bound() > 512 or N1.upper_bound() > 512:
             raise RuntimeError(
                 f"classic_b2b_bmm only supports <=512 N0 / N1. Current length: {N0=}, {N1=}"
@@ -108,16 +135,48 @@ def _infer_shapes(self):
                 raise RuntimeError(
                     f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
                 )
+        bias_shape = bias._attrs["shape"]
 
-        output_shape = [batch_size, M0, N1]
+        is_multihead = len(q_shape) == 4
+        if is_multihead:
+            num_heads = q_shape[head_dim]
+
+            output_shape = [batch_size, M0, num_heads, N1]
+            if len(bias_shape) != 4:
+                raise RuntimeError(
+                    f"Was expecting 4-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
+                )
+            for bias_dim, expected_dim in zip(
+                bias_shape, [batch_size, num_heads, M0, N0]
+            ):
+                if bias_dim != IntImm(1) and bias_dim != expected_dim:
+                    raise RuntimeError(
+                        f"bias shape is not compatible with Q K! "
+                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
+                        f"bias shapes: {bias_shape=}."
+                    )
+            # key sequence length is identical to last shape dim of bias tensor
+            # so if it is also constant 1, it is not a real broadcast and permissible
+            if bias_shape[-1] == IntImm(1) and k_shape[seq_dim] != IntImm(1):
+                raise RuntimeError(
+                    "classic_b2b_bmm op does not support broadcasting of last dimension of bias tensor (e.g. over sequence length of key and value ). Use the expand op to emulate this broadcast behavior if you need it."
+                )
+        else:
+            num_heads = IntImm(1)
+            self._attrs["num_heads"] = num_heads
+            output_shape = [batch_size, M0, N1]
+            if len(bias_shape) != 3:
+                raise RuntimeError(
+                    f"Was expecting 3-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
+                )
+            for bias_dim, expected_dim in zip(bias_shape, [batch_size, M0, N0]):
+                if bias_dim != IntImm(1) and bias_dim != expected_dim:
+                    raise RuntimeError(
+                        f"bias shape is not compatible with Q K! "
+                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
+                        f"bias shapes: {bias_shape=}."
+                    )
 
-        bias_shape = bias._attrs["shape"]
-        if bias_shape != [batch_size, M0, N0]:
-            raise RuntimeError(
-                f"bias shape is not compatible with Q K! "
-                f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
-                f"bias shapes: {bias_shape=}."
-            )
         return output_shape
 
     def __call__(
@@ -129,16 +188,18 @@ def __call__(
     ) -> Tensor:
         """call the op
 
+        Note: [H,] means optional num-heads,
+        if it exists for one input tensor, all need to have it,
         Parameters
         ----------
-        q: Tensor, shape(B, M0, K0)
-        k: Tensor, shape(B, N0, K0)
-        v: Tensor, shape(B, N0, N1)
-        bias: Tensor, shape(B, M0, N0)
+        q: Tensor, shape(B, M0, [H,] K0)
+        k: Tensor, shape(B, N0, [H,] K0)
+        v: Tensor, shape(B, N0, [H,] N1)
+        bias: Tensor, shape(B, [H,] M0, N0)
 
         Returns
         ----------
-        Tensor, shape(B, M0, N1)
+        Tensor, shape(B, M0, [H,], N1)
         """
 
         self._attrs["inputs"] = [q, k, v, bias]
diff --git a/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
index 55646bd44..f5b4e6bdd 100644
--- a/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
+++ b/static/include/kernels/classic_b2b_bmm/device/b2b_batched_gemm.h
@@ -204,18 +204,25 @@ class B2bGemmBatched {
     GemmCoord problem_size_0;
     GemmCoord problem_size_1;
     TensorRef<ElementA const, LayoutA> ref_A0;
-    int64_t stride_A0;
+    int64_t head_stride_A0;
+    int64_t batch_stride_A0;
     TensorRef<ElementB const, LayoutB> ref_B0;
-    int64_t stride_B0;
+    int64_t head_stride_B0;
+    int64_t batch_stride_B0;
     TensorRef<ElementC const, LayoutC> ref_C0;
-    int64_t stride_C0;
+    int64_t head_stride_C0;
+    int64_t batch_stride_C0;
     TensorRef<ElementB const, LayoutB1> ref_B1;
-    int64_t stride_B1;
+    int64_t head_stride_B1;
+    int64_t batch_stride_B1;
     TensorRef<ElementC const, LayoutC> ref_C1;
-    int64_t stride_C1;
+    int64_t head_stride_C1;
+    int64_t batch_stride_C1;
     TensorRef<ElementC, LayoutC> ref_D1;
-    int64_t stride_D1;
+    int64_t head_stride_D1;
+    int64_t batch_stride_D1;
     int batch_count;
+    int num_heads;
     typename EpilogueOutputOp0::Params epilogue0;
     typename EpilogueOutputOp1::Params epilogue1;
 
@@ -235,18 +242,25 @@ class B2bGemmBatched {
       GemmCoord problem_size_0_,
       GemmCoord problem_size_1_,
       TensorRef<ElementA const, LayoutA> ref_A0_,
-      int64_t stride_A0_,
+      int64_t head_stride_A0_,
+      int64_t batch_stride_A0_,
       TensorRef<ElementB const, LayoutB> ref_B0_,
-      int64_t stride_B0_,
+      int64_t head_stride_B0_,
+      int64_t batch_stride_B0_,
       TensorRef<ElementC const, LayoutC> ref_C0_,
-      int64_t stride_C0_,
+      int64_t head_stride_C0_,
+      int64_t batch_stride_C0_,
       TensorRef<ElementB const, LayoutB1> ref_B1_,
-      int64_t stride_B1_,
+      int64_t head_stride_B1_,
+      int64_t batch_stride_B1_,
       TensorRef<ElementC const, LayoutC> ref_C1_,
-      int64_t stride_C1_,
+      int64_t head_stride_C1_,
+      int64_t batch_stride_C1_,
       TensorRef<ElementC, LayoutC> ref_D1_,
-      int64_t stride_D1_,
+      int64_t head_stride_D1_,
+      int64_t batch_stride_D1_,
       int batch_count_,
+      int num_heads_,
       typename EpilogueOutputOp0::Params epilogue0_ =
         typename EpilogueOutputOp0::Params(),
       typename EpilogueOutputOp1::Params epilogue1_ =
@@ -255,18 +269,25 @@ class B2bGemmBatched {
       problem_size_0(problem_size_0_),
       problem_size_1(problem_size_1_),
       ref_A0(ref_A0_),
-      stride_A0(stride_A0_),
+      head_stride_A0(head_stride_A0_),
+      batch_stride_A0(batch_stride_A0_),
       ref_B0(ref_B0_),
-      stride_B0(stride_B0_),
+      head_stride_B0(head_stride_B0_),
+      batch_stride_B0(batch_stride_B0_),
       ref_C0(ref_C0_),
-      stride_C0(stride_C0_),
+      head_stride_C0(head_stride_C0_),
+      batch_stride_C0(batch_stride_C0_),
       ref_B1(ref_B1_),
-      stride_B1(stride_B1_),
+      head_stride_B1(head_stride_B1_),
+      batch_stride_B1(batch_stride_B1_),
       ref_C1(ref_C1_),
-      stride_C1(stride_C1_),
+      head_stride_C1(head_stride_C1_),
+      batch_stride_C1(batch_stride_C1_),
       ref_D1(ref_D1_),
-      stride_D1(stride_D1_),
+      head_stride_D1(head_stride_D1_),
+      batch_stride_D1(batch_stride_D1_),
       batch_count(batch_count_),
+      num_heads(num_heads_),
       epilogue0(epilogue0_),
       epilogue1(epilogue1_) {
 
@@ -318,7 +339,7 @@ class B2bGemmBatched {
     cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
       args.problem_size_0,
       {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
-      args.batch_count);
+      args.batch_count * args.num_heads);
 
     // Initialize the Params structure
     params_ = typename B2bGemmBatchedKernel::Params{
@@ -326,18 +347,25 @@ class B2bGemmBatched {
       args.problem_size_1,
       grid_shape,
       args.ref_A0.non_const_ref(),
-      args.stride_A0,
+      args.head_stride_A0,
+      args.batch_stride_A0,
       args.ref_B0.non_const_ref(),
-      args.stride_B0,
+      args.head_stride_B0,
+      args.batch_stride_B0,
       args.ref_C0.non_const_ref(),
-      args.stride_C0,
+      args.head_stride_C0,
+      args.batch_stride_C0,
       args.ref_B1.non_const_ref(),
-      args.stride_B1,
+      args.head_stride_B1,
+      args.batch_stride_B1,
       args.ref_C1.non_const_ref(),
-      args.stride_C1,
+      args.head_stride_C1,
+      args.batch_stride_C1,
       args.ref_D1,
-      args.stride_D1,
+      args.head_stride_D1,
+      args.batch_stride_D1,
       args.batch_count,
+      args.num_heads,
       args.epilogue0,
       args.epilogue1
     };
diff --git a/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h b/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
index 10325a165..8828b0725 100644
--- a/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
+++ b/static/include/kernels/classic_b2b_bmm/kernel/b2b_batched_gemm.h
@@ -79,23 +79,30 @@ struct B2bGemmBatched {
     int swizzle_log_tile;
     typename B2bMma::IteratorA0::Params params_A0;
     typename B2bMma::IteratorA0::TensorRef ref_A0;
-    int64_t stride_A0;
+    int64_t head_stride_A0;
+    int64_t batch_stride_A0;
     typename B2bMma::IteratorB0::Params params_B0;
     typename B2bMma::IteratorB0::TensorRef ref_B0;
-    int64_t stride_B0;
+    int64_t head_stride_B0;
+    int64_t batch_stride_B0;
     typename GmemToAccumLoader::OutputTileIterator::Params params_C0;
     typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0;
-    int64_t stride_C0;
+    int64_t head_stride_C0;
+    int64_t batch_stride_C0;
     typename B2bMma::IteratorB1::Params params_B1;
     typename B2bMma::IteratorB1::TensorRef ref_B1;
-    int64_t stride_B1;
+    int64_t head_stride_B1;
+    int64_t batch_stride_B1;
     typename Epilogue::OutputTileIterator::Params params_C1;
     typename Epilogue::OutputTileIterator::TensorRef ref_C1;
-    int64_t stride_C1;
+    int64_t head_stride_C1;
+    int64_t batch_stride_C1;
     typename Epilogue::OutputTileIterator::Params params_D1;
     typename Epilogue::OutputTileIterator::TensorRef ref_D1;
-    int64_t stride_D1;
+    int64_t head_stride_D1;
+    int64_t batch_stride_D1;
     int batch_count;
+    int num_heads;
     typename OutputOp0::Params output_op_0;
     typename OutputOp1::Params output_op_1;
     int gemm_k_iterations_0;
@@ -114,18 +121,25 @@ struct B2bGemmBatched {
       cutlass::gemm::GemmCoord const & problem_size_1,
       cutlass::gemm::GemmCoord const & grid_tiled_shape,
       typename B2bMma::IteratorA0::TensorRef ref_A0,
-      int64_t stride_A0,
+      int64_t head_stride_A0,
+      int64_t batch_stride_A0,
       typename B2bMma::IteratorB0::TensorRef ref_B0,
-      int64_t stride_B0,
+      int64_t head_stride_B0,
+      int64_t batch_stride_B0,
       typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0,
-      int64_t stride_C0,
+      int64_t head_stride_C0,
+      int64_t batch_stride_C0,
       typename B2bMma::IteratorB1::TensorRef ref_B1,
-      int64_t stride_B1,
+      int64_t head_stride_B1,
+      int64_t batch_stride_B1,
       typename Epilogue::OutputTileIterator::TensorRef ref_C1,
-      int64_t stride_C1,
+      int64_t head_stride_C1,
+      int64_t batch_stride_C1,
       typename Epilogue::OutputTileIterator::TensorRef ref_D1,
-      int64_t stride_D1,
+      int64_t head_stride_D1,
+      int64_t batch_stride_D1,
       int batch_count,
+      int num_heads,
       typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
       typename OutputOp1::Params output_op_1 = typename OutputOp1::Params()
     ):
@@ -135,25 +149,32 @@ struct B2bGemmBatched {
       swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
       params_A0(ref_A0.layout()),
       ref_A0(ref_A0),
-      stride_A0(stride_A0),
+      head_stride_A0(head_stride_A0),
+      batch_stride_A0(batch_stride_A0),
       params_B0(ref_B0.layout()),
       ref_B0(ref_B0),
-      stride_B0(stride_B0),
+      head_stride_B0(head_stride_B0),
+      batch_stride_B0(batch_stride_B0),
       params_C0(ref_C0.layout()),
       ref_C0(ref_C0),
-      stride_C0(stride_C0),
+      head_stride_C0(head_stride_C0),
+      batch_stride_C0(batch_stride_C0),
       params_B1(ref_B1.layout()),
       ref_B1(ref_B1),
-      stride_B1(stride_B1),
+      head_stride_B1(head_stride_B1),
+      batch_stride_B1(batch_stride_B1),
       params_C1(ref_C1.layout()),
       ref_C1(ref_C1),
-      stride_C1(stride_C1),
+      head_stride_C1(head_stride_C1),
+      batch_stride_C1(batch_stride_C1),
       params_D1(ref_D1.layout()),
       ref_D1(ref_D1),
-      stride_D1(stride_D1),
+      head_stride_D1(head_stride_D1),
+      batch_stride_D1(batch_stride_D1),
       output_op_0(output_op_0),
       output_op_1(output_op_1),
       batch_count(batch_count),
+      num_heads(num_heads),
       gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK),
       gemm_k_iterations_1((problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK) {}
   };
@@ -255,7 +276,7 @@ struct B2bGemmBatched {
     }
 
     // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
-    for (int batch_idx = threadblock_swizzle.get_batch_idx(); batch_idx < params.batch_count; batch_idx += gridDim.z) {
+    for (int batch_head_idx = threadblock_swizzle.get_batch_idx(); batch_head_idx < params.batch_count * params.num_heads; batch_head_idx += gridDim.z) {
 
       // Compute initial location in logical coordinates
       cutlass::MatrixCoord tb_offset_A0{
@@ -276,6 +297,10 @@ struct B2bGemmBatched {
       // Compute position within threadblock
       int thread_idx = threadIdx.x;
 
+      // Convert blockIdx.z into (batch_idx, head_idx).
+      int batch_idx = batch_head_idx / params.num_heads;
+      int head_idx = batch_head_idx % params.num_heads;
+
       // Construct iterators to A and B operands
       typename B2bMma::IteratorA0 iterator_A0(
         params.params_A0,
@@ -284,7 +309,7 @@ struct B2bGemmBatched {
         thread_idx,
         tb_offset_A0);
 
-      iterator_A0.add_pointer_offset(params.stride_A0 * batch_idx);
+      iterator_A0.add_pointer_offset(params.batch_stride_A0 * batch_idx + params.head_stride_A0 * head_idx);
 
       typename B2bMma::IteratorB0 iterator_B0(
         params.params_B0,
@@ -293,7 +318,7 @@ struct B2bGemmBatched {
         thread_idx,
         tb_offset_B0);
 
-      iterator_B0.add_pointer_offset(params.stride_B0 * batch_idx);
+      iterator_B0.add_pointer_offset(params.batch_stride_B0 * batch_idx + params.head_stride_B0 * head_idx);
 
       typename B2bMma::IteratorB1 iterator_B1(
         params.params_B1,
@@ -302,7 +327,7 @@ struct B2bGemmBatched {
         thread_idx,
         tb_offset_B1);
 
-      iterator_B1.add_pointer_offset(params.stride_B1 * batch_idx);
+      iterator_B1.add_pointer_offset(params.batch_stride_B1 * batch_idx + params.head_stride_B1 * head_idx);
 
 
       // Broadcast the warp_id computed by lane 0 to ensure dependent code
@@ -325,8 +350,7 @@ struct B2bGemmBatched {
         tb_offset_C0
       );
 
-      iterator_C0.add_pointer_offset(params.stride_C0 * batch_idx);
-
+      iterator_C0.add_pointer_offset(params.batch_stride_C0 * batch_idx + params.head_stride_C0 * head_idx);
 
       //
       // Main loop
@@ -375,7 +399,7 @@ struct B2bGemmBatched {
         threadblock_offset
       );
 
-      iterator_C1.add_pointer_offset(params.stride_C1 * batch_idx);
+      iterator_C1.add_pointer_offset(params.batch_stride_C1 * batch_idx + params.head_stride_C1 * head_idx);
 
       // Tile iterator writing to destination tensor.
       typename Epilogue::OutputTileIterator iterator_D1(
@@ -386,7 +410,7 @@ struct B2bGemmBatched {
         threadblock_offset
       );
 
-      iterator_D1.add_pointer_offset(params.stride_D1 * batch_idx);
+      iterator_D1.add_pointer_offset(params.batch_stride_D1 * batch_idx + params.head_stride_D1 * head_idx);
 
       Epilogue epilogue(
         shared_storage.epilogue,
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index b1bcebb00..89bfda95b 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -23,6 +23,7 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntImm
 from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
@@ -198,6 +199,243 @@ def test_classic_b2b_bmm_fp16(self):
         )
 
 
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class ClassicMultiheadB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def _test_classic_multihead_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        m=256,
+        k0=128,
+        n0=256,
+        n1=256,
+        num_heads=2,
+        epilogue_math_name="Identity",
+        causal_type=CausalType.NO_CAUSAL,
+        dtype="float16",
+        test_name="classic_b2b_bmm",
+        copy_op=True,
+        atol=1e-2,
+        rtol=1e-2,
+        bias_broadcast=(False, False, False, False),
+    ):
+        # Initialize AIT classic_b2b_bmm operator.
+        assert len(bias_broadcast) == 4
+        assert (
+            bias_broadcast[3] is False
+        ), "Classic b2b bmm cannot broadcast bias on last dimension."
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes]
+        alpha0 = 1.0 / (k0**0.5)
+        alpha1 = 1.0
+        batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+
+        Q = Tensor(
+            shape=[batch_size_dim, m, num_heads, k0],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size_dim, n0, num_heads, k0],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size_dim, n0, num_heads, n1],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        bias_shape_full = [batch_size_dim, num_heads, m, n0]
+        bias_shape = [
+            IntImm(1) if bias_broadcast[i] else bias_shape_full[i] for i in range(4)
+        ]
+        Bias = Tensor(
+            shape=bias_shape,
+            dtype=dtype,
+            name="bias",
+            is_input=True,
+        )
+        classic_b2b_bmm_op = ops.classic_b2b_bmm(
+            causal_type=causal_type,
+            alpha0=alpha0,
+            alpha1=alpha1,
+            alpha1_divide_by_seq_len=True,
+            epilogue_math_name=epilogue_math_name,
+        )
+        if copy_op:
+            classic_b2b_bmm_op = ops.classic_b2b_bmm(
+                **classic_b2b_bmm_op._get_op_attributes()
+            )
+        Y = classic_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=True)
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        for batch_size in batch_sizes:
+            # Initialize inputs
+            # Initialized in BMHD dim order
+            q_pt = torch.rand(batch_size, m, num_heads, k0, dtype=torch_dtype).cuda()
+            k_pt = torch.rand(batch_size, n0, num_heads, k0, dtype=torch_dtype).cuda()
+            v_pt = torch.rand(batch_size, n0, num_heads, n1, dtype=torch_dtype).cuda()
+            bias_shape_full_pt = (batch_size, num_heads, m, n0)
+            bias_shape_pt = (
+                1 if bias_broadcast[i] else bias_shape_full_pt[i] for i in range(4)
+            )
+            bias_pt = torch.rand(*bias_shape_pt, dtype=torch_dtype).cuda()
+
+            # Permute to BHMD dim order
+            q_pt_hf = torch.permute(q_pt, [0, 2, 1, 3])
+            k_pt_hf = torch.permute(k_pt, [0, 2, 1, 3])
+            v_pt_hf = torch.permute(v_pt, [0, 2, 1, 3])
+
+            # Run PT reference.
+            attn = alpha0 * (q_pt_hf @ k_pt_hf.transpose(-2, -1)) + bias_pt
+            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
+            attn = alpha1 / m * attn
+            invalid_attn_mask = get_attn_mask_per_causal_type(
+                m, n0, causal_type, torch_dtype
+            )
+            attn = attn * invalid_attn_mask
+            second_mm = attn @ v_pt_hf
+            output = torch.permute(
+                second_mm, [0, 2, 1, 3]
+            )  # permute back to original dim order
+            y_pt = output.detach()
+
+            # Run AIT.
+            inputs = {"q": q_pt, "k": k_pt, "v": v_pt, "bias": bias_pt}
+            y = torch.empty(
+                [batch_size, m, num_heads, n1],
+                dtype=torch_dtype,
+                device="cuda",
+            )
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead1_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead1_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=1,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead2_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead2_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead1_b2b_bmm_bias_broadcast1(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead1_b2b_bmm_broadcast1_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=1,
+            bias_broadcast=[True, True, False, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead2_b2b_bmm_bias_broadcast1(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead2_b2b_bmm_broadcast1_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+            bias_broadcast=[True, True, False, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead2_b2b_bmm_bias_broadcast2(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead2_b2b_bmm_broadcast2_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+            bias_broadcast=[True, True, True, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead2_b2b_bmm_bias_broadcast3(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead2_b2b_bmm_broadcast3_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+            bias_broadcast=[True, False, False, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead4_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead4_b2b_bmm_fp16_dynamic_batch",
+            dtype="float16",
+            batch_sizes=[3, 8, 10],
+            num_heads=4,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead16_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead16_b2b_bmm_fp16_rectangular",
+            dtype="float16",
+            batch_sizes=[2],
+            m=512,
+            n0=128,
+            n1=128,
+            num_heads=16,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead3_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead3_b2b_bmm_fp16_causal",
+            dtype="float16",
+            batch_sizes=5,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            num_heads=3,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead8_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead8_b2b_bmm_fp16_sigmoid",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+            num_heads=8,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_multihead1_relu_b2b_bmm(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead1_b2b_bmm_fp16_complex",
+            dtype="float16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            num_heads=1,
+        )
+
+
 @unittest.skipIf(
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
@@ -260,6 +498,7 @@ def _test_fmha_style_b2b_bmm(
             name="v",
             is_input=True,
         )
+
         Bias = None
         if has_bias:
             shape = [batch_size_dim, num_heads_dim, seq_lens_dim, seq_lens_kv_dim]
@@ -280,6 +519,7 @@ def _test_fmha_style_b2b_bmm(
             alpha1_divide_by_seq_len=True,
             epilogue_math_name=epilogue_math_name,
         )
+
         if copy_op:
             fmha_style_b2b_bmm_op = ops.fmha_style_b2b_bmm(
                 **fmha_style_b2b_bmm_op._get_op_attributes()
@@ -308,12 +548,12 @@ def _test_fmha_style_b2b_bmm(
                 batch_size, seq_len_kv, num_head, n1, dtype=torch_dtype
             ).cuda()
             shape = [batch_size, num_head, seq_len, seq_len_kv]
-            if bias_broadcast:
-                for i, broadcast in enumerate(bias_broadcast):
-                    if broadcast:
-                        shape[i] = 1
-            bias_pt = torch.rand(shape, dtype=torch_dtype).cuda()
-
+            if has_bias:
+                if bias_broadcast:
+                    for i, broadcast in enumerate(bias_broadcast):
+                        if broadcast:
+                            shape[i] = 1
+                bias_pt = torch.rand(shape, dtype=torch_dtype).cuda()
             # Run PT reference.
             attn = alpha0 * (
                 q_pt.transpose(1, 2) @ k_pt.transpose(1, 2).transpose(-2, -1)
@@ -361,11 +601,13 @@ def test_fmha_style_b2b_bmm_fp16(self):
             test_name="fmha_style_b2b_bmm_fp16_dynamic_seq_len",
             dtype="float16",
             seq_lens=[128, 256],
+            # dynamic sequence length not supported by classic op
         )
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_dynamic_seq_len_kv",
             dtype="float16",
             seq_lens_kv=[128, 256],
+            # dynamic sequence length not supported by classic op
         )
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_dynamic_num_heads",
@@ -385,6 +627,7 @@ def test_fmha_style_b2b_bmm_fp16(self):
             dtype="float16",
             batch_sizes=2,
             causal_type=CausalType.UPPER_RIGHT_EMPTY,
+            # CausalType.UPPER_RIGHT_EMPTY not supported by classic op
         )
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_causal_lower_left_empty",
@@ -405,6 +648,13 @@ def test_fmha_style_b2b_bmm_fp16(self):
             has_bias=True,
             bias_broadcast=[False, True, False, False],
         )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_fp16_bias_broadcast_relative_pos",
+            dtype="float16",
+            batch_sizes=[1, 11],
+            has_bias=True,
+            bias_broadcast=[True, True, False, False],
+        )
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_sigmoid",
             dtype="float16",

From 2707974324df56f74812e2a42f179772ef62e9eb Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Thu, 4 May 2023 05:38:59 -0700
Subject: [PATCH 473/638] Fix naming error caused by dump_program(). (#649)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/649

fx2ait tries to dump an AIT program before AIT compilation by default
(https://fburl.com/code/z9lmvihn).

dump_program() internally calls name_graph(https://fburl.com/code/p94s8xkf),
which updates global naming counters (https://fburl.com/code/clkfj2g9).

However, at the beginning of AIT compile_model(), reset_name_counters() is
called to reset these global naming counters.

As a result, new tensors / operators generated by the AIT compiler may have
duplicate names which already exist in the AIT graph, and cause errors.

Reviewed By: muchulee8, wushirong

Differential Revision: D45467803

fbshipit-source-id: 0f81c67bd2ef6e2d1d2f9fa85dc5705d47391e1a
---
 python/aitemplate/compiler/base.py            | 20 +++++++++++++++++++
 python/aitemplate/compiler/compiler.py        |  6 +++---
 .../compiler/transform/name_graph.py          | 11 ++++++++++
 .../utils/serialization/serdes_code.py        |  2 ++
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 6bb71b4b2..1dda1f03a 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -17,6 +17,8 @@
 """
 from __future__ import annotations
 
+import copy
+
 import math
 
 from abc import ABC, abstractmethod
@@ -940,6 +942,12 @@ def _bind_data(self, data: _ConstantTensorData) -> None:
             )
         self._attrs["data"] = data
 
+    def __deepcopy__(self, memo):
+        result = Tensor(self.shape())
+        memo[id(self)] = result
+        result._attrs = copy.deepcopy(self._attrs, memo)
+        return result
+
     def __add__(self, other: Any) -> Tensor:
         return OP_REGISTRY.get("ADD")(self, other)
 
@@ -1032,6 +1040,12 @@ def __init__(
     def pseudo_code(self, with_shape=True) -> str:
         return f"IntVarTensor({self._attrs['int_var'].pseudo_code()})"
 
+    def __deepcopy__(self, memo):
+        result = IntVarTensor(self._attrs["int_var"])
+        memo[id(self)] = result
+        result._attrs = copy.deepcopy(self._attrs, memo)
+        return result
+
     def __add__(self, other: Any) -> Tensor:
         return OP_REGISTRY.get("INT_ADD")(self, other)
 
@@ -1108,6 +1122,12 @@ def __call__(self, *args: List[Tensor]) -> List[Tensor]:
         """
         raise NotImplementedError
 
+    def __deepcopy__(self, memo):
+        result = type(self)(**self._get_op_attributes())
+        memo[id(self)] = result
+        result._attrs = copy.deepcopy(self._attrs, memo)
+        return result
+
     def _set_depth(self) -> None:
         """
         Sets operator depth and dst_ops.
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index e42754fec..c601057dc 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -216,9 +216,6 @@ def compile_model(
     if profile_dir is None:
         profile_dir = workdir
 
-    if debug_settings.dump_ait_to_py:
-        dump_program(tensor, debug_settings.dump_ait_to_py)
-
     if int(recompile) == 1:
         os.makedirs(test_dir, exist_ok=True)
         with target:
@@ -243,6 +240,9 @@ def compile_model(
             compiler.transform.name_graph(graph)
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "name_graph")
 
+            if debug_settings.dump_ait_to_py:
+                dump_program(tensor, debug_settings.dump_ait_to_py)
+
             compiler.transform.dedup_symbolic_name(graph)
             graph_utils.dump_graph_debug_str_to_file(
                 graph, test_dir, "dedup_symbolic_name"
diff --git a/python/aitemplate/compiler/transform/name_graph.py b/python/aitemplate/compiler/transform/name_graph.py
index 7faa8da64..b2a13b359 100644
--- a/python/aitemplate/compiler/transform/name_graph.py
+++ b/python/aitemplate/compiler/transform/name_graph.py
@@ -15,12 +15,15 @@
 """
 Graph pass to assign names to a sorted graph.
 """
+import logging
 import re
 from typing import List
 
 from aitemplate.compiler.base import IntImm, IntVar, IntVarTensor, JaggedIntVar, Tensor
 from aitemplate.utils import graph_utils
 
+_LOGGER = logging.getLogger(__name__)
+
 # pylint: disable=C0103
 
 # Make these variables global to allow repeately calling name_graph().
@@ -71,6 +74,10 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
     global tensor_cnt
     global func_name_to_tensor_cnt
     global user_provided_dim
+
+    _LOGGER.debug(
+        f"before name_graph: {func_cnt=}, {tensor_cnt=}, {len(func_name_to_tensor_cnt)=}, {len(user_provided_dim)=}"
+    )
     for node in sorted_graph:
         funcs = node.src_ops()
         if len(funcs) == 0:
@@ -137,6 +144,10 @@ def name_graph(sorted_graph: List[Tensor]) -> None:
                 jagged_int_var_name = jagged_int_var._attrs["name"]
                 batch_dim._attrs["name"] = f"{jagged_int_var_name}_jagged_batch_dim"
 
+    _LOGGER.debug(
+        f"after name_graph: {func_cnt=}, {tensor_cnt=}, {len(func_name_to_tensor_cnt)=}, {len(user_provided_dim)=}"
+    )
+
 
 def dedup_symbolic_name(sorted_graph: List[Tensor]) -> None:
     """Rename all shape variable that are identical to the same name.
diff --git a/python/aitemplate/utils/serialization/serdes_code.py b/python/aitemplate/utils/serialization/serdes_code.py
index 6c58299d3..261001507 100644
--- a/python/aitemplate/utils/serialization/serdes_code.py
+++ b/python/aitemplate/utils/serialization/serdes_code.py
@@ -15,6 +15,7 @@
 """
 Dump/Read sorted_graph to/from python code.
 """
+import copy
 import os
 
 from typing import Dict, List, Optional, Tuple, Union
@@ -306,6 +307,7 @@ def dump_program(
     """
     if isinstance(sorted_graph, Tensor):
         sorted_graph = [sorted_graph]
+    sorted_graph = copy.deepcopy(sorted_graph)
 
     # Make sure the graph is in correct order and has names and param set correctly.
     sorted_graph = toposort(sorted_graph)

From 091ccdbd18698960947f32bf0e7e2d76821c70d2 Mon Sep 17 00:00:00 2001
From: Terry Chen <terrychen@fb.com>
Date: Thu, 4 May 2023 11:42:56 -0700
Subject: [PATCH 474/638] SD example fix (#654)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/654

Reviewed By: wushirong

Differential Revision: D45553520

Pulled By: terrychenism

fbshipit-source-id: 6fc5ea2f3988a3b2682ef4c3262d648bf440c8d9
---
 .../05_stable_diffusion/scripts/download_pipeline.py     | 4 ++--
 .../src/pipeline_stable_diffusion_img2img_ait.py         | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index b072e694f..e5ffe56f0 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -21,12 +21,12 @@
 @click.option("--token", default="", help="access token")
 @click.option(
     "--save_directory",
-    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="pipeline files local directory",
 )
 def download_pipeline_files(token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
+        "stabilityai/stable-diffusion-2",
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
index ad2885086..893db028d 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_img2img_ait.py
@@ -119,6 +119,7 @@ def __init__(
         self.vae_ait_exe = self.init_ait_module(
             model_name="AutoencoderKL", workdir=workdir
         )
+        self.batch = 1
 
     def init_ait_module(
         self,
@@ -143,12 +144,13 @@ def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         noise_pred = ys[0].permute((0, 3, 1, 2)).float()
         return noise_pred
 
-    def clip_inference(self, input_ids, seqlen=64):
+    def clip_inference(self, input_ids, seqlen=77):
         exe_module = self.clip_ait_exe
         bs = input_ids.shape[0]
         position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
@@ -160,6 +162,7 @@ def clip_inference(self, input_ids, seqlen=64):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         return ys[0].float()
@@ -171,6 +174,7 @@ def vae_inference(self, vae_input):
         num_outputs = len(exe_module.get_output_name_to_index_map())
         for i in range(num_outputs):
             shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
             ys.append(torch.empty(shape).cuda().half())
         exe_module.run_with_tensors(inputs, ys, graph_mode=False)
         vae_out = ys[0].permute((0, 3, 1, 2)).float()
@@ -241,6 +245,7 @@ def __call__(
             raise ValueError(
                 f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
             )
+        self.batch = batch_size
 
         if strength < 0 or strength > 1:
             raise ValueError(
@@ -294,7 +299,7 @@ def __call__(
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
-            max_length=64,  # self.tokenizer.model_max_length,
+            max_length=self.tokenizer.model_max_length,
             truncation=True,
             return_tensors="pt",
         )

From 109b98a867bd2e09b42c1281546ed858527bf356 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Thu, 4 May 2023 11:53:32 -0700
Subject: [PATCH 475/638] remove eliminate_permutations for now (#655)

Summary:
this pass triggered some accuracy issue. Let's remove it for now

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/655

Reviewed By: ipiszy, muchulee8, houseroad, wushirong

Differential Revision: D45554549

Pulled By: chenyang78

fbshipit-source-id: b1a8cb64642a5147649fe630a1d04fcee566fda5
---
 python/aitemplate/compiler/transform/optimize_graph.py | 4 ++--
 tests/unittest/compiler/test_eliminate_permutations.py | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 2e8f5a483..b5c99163b 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -46,7 +46,6 @@
 from aitemplate.compiler.transform.transform_odd_alignment import (
     transform_odd_alignment,
 )
-from aitemplate.compiler.transform.transform_permutations import eliminate_permutations
 from aitemplate.compiler.transform.transform_permute_to_reshape import (
     transform_permute_to_reshape,
 )
@@ -118,7 +117,8 @@ def optimize_graph(
         split_large_split_ops,
         transform_permute_to_reshape,
         transform_memory_ops,
-        eliminate_permutations,
+        # FIXME: temporarily disable this due to some accuracy issue
+        # eliminate_permutations,
     ]
 
     if not optimize:
diff --git a/tests/unittest/compiler/test_eliminate_permutations.py b/tests/unittest/compiler/test_eliminate_permutations.py
index 8c1603ec0..54ec64d6f 100644
--- a/tests/unittest/compiler/test_eliminate_permutations.py
+++ b/tests/unittest/compiler/test_eliminate_permutations.py
@@ -28,6 +28,7 @@
 )
 
 
+@unittest.skip("Skip until we fix the accuracy issue")
 class EliminatePermutationTestCase(unittest.TestCase):
     def test_eliminate_permutation(self):
         dtype = "float"
@@ -188,3 +189,7 @@ def test_eliminate_permutation_all_permutations(self):
         result_graph = module.debug_sorted_graph
         self.assertEqual(len(result_graph), 3)
         self.assertTrue(graph_has_op(result_graph, "permute"))
+
+
+if __name__ == "__main__":
+    unittest.main()

From ea7ed4ca924940cf657391dd76c1a880e2ab8178 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Thu, 4 May 2023 14:26:56 -0700
Subject: [PATCH 476/638] jagged SHA and MHA module support (#651)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/651

Reviewed By: aakhundov

Differential Revision: D45074178

fbshipit-source-id: 2c6f13ddcc52e8f833fcd164d0c479ca3398322e
---
 fx2ait/fx2ait/tools/common_fx2ait.py | 66 +++++++++++++++++++---------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index f1b89e2f9..81d4d0bc3 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -13,9 +13,9 @@
 #  limitations under the License.
 #
 import copy
+import logging
 import time
 import unittest
-
 import uuid
 from enum import Enum
 from typing import Callable, List, Optional, Set
@@ -29,13 +29,15 @@
 from fx2ait.fx2ait import AITInterpreter
 from fx2ait.tensor_spec import TensorSpec
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 OSS_AITModel = False
 try:
     torch.ops.load_library("//deeplearning/ait:AITModel")
-    print("===Load non-OSS AITModel===")
+    logger.info("===Load non-OSS AITModel===")
 except Exception:
     torch.ops.load_library("build/libait_model.so")
-    print("===Load OSS AITModel===")
+    logger.info("===Load OSS AITModel===")
     OSS_AITModel = True
 
 
@@ -123,7 +125,7 @@ def run_test(
         for p in passes:
             mod = p(mod, inputs)
 
-        print(mod.graph)
+        logger.info(mod.graph)
 
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
@@ -161,7 +163,7 @@ def run_test(
             start = time.perf_counter()
             interp_result = interp.run()
             sec = time.perf_counter() - start
-            print("Interpreter run time(s):", sec)
+            logger.info("Interpreter run time(s):", sec)
             if OSS_AITModel:
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
@@ -194,7 +196,9 @@ def run_test(
             outputs = ait_mod(*cuda_inputs)
             end_event.record()
             torch.cuda.synchronize()
-            print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+            logger.info(
+                "AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3)
+            )
             # PyTorch Transformer model would yield 2 output tensors, of which the second one is
             # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
             if leaf_module == torch.nn.MultiheadAttention:
@@ -227,21 +231,41 @@ def run_test_with_dynamic_shape(
         rtol: float = 1e-02,
         atol: float = 1e-02,
         precision: LowerPrecision = LowerPrecision.FP16,
+        passes: List[Callable] = [],  # noqa: B006
+        leaf_module: Callable = None,  # one leaf module
+        inputs_override: List[
+            List[torch.Tensor]
+        ] = None,  # For cases we can not generate inputs with existing tensor spec interface
     ):
         mod.eval()
-        inputs_list = []
-        for use_lower_bound in [True, False]:
-            inputs_list.append(
-                TensorSpec.create_inputs_from_specs(
-                    inputs_spec,
-                    use_lower_bound=use_lower_bound,
+        leaf_module_list = []
+        if leaf_module:
+            leaf_module_list.append(leaf_module)
+
+        if inputs_override:
+            inputs_min = inputs_override[0]
+            inputs_max = inputs_override[1]
+        else:
+            inputs_list = []
+            for use_lower_bound in [True, False]:
+                inputs_list.append(
+                    TensorSpec.create_inputs_from_specs(
+                        inputs_spec, use_lower_bound=use_lower_bound
+                    )
                 )
-            )
 
-        inputs_min = inputs_list[0]
-        inputs_max = inputs_list[1]
+            inputs_min = inputs_list[0]
+            inputs_max = inputs_list[1]
         mod.eval()
-        mod = acc_tracer.trace(mod, inputs_min)
+        mod = acc_tracer.trace(
+            mod,
+            inputs_min,
+            leaf_module_list=leaf_module_list,
+        )
+        for p in passes:
+            mod = p(mod, inputs_min)
+        logger.info(mod.graph)
+
         original_inputs = inputs_min
         # Trace and test with inputs_min
         interp = AITInterpreter(
@@ -263,7 +287,7 @@ def run_test_with_dynamic_shape(
             start = time.perf_counter()
             interp_result = interp.run()
             sec = time.perf_counter() - start
-            print("Interpreter run time(s):", sec)
+            logger.info("Interpreter run time(s):", sec)
             if OSS_AITModel:
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
@@ -298,7 +322,9 @@ def run_test_with_dynamic_shape(
             outputs = ait_mod(*cuda_inputs)
             end_event.record()
             torch.cuda.synchronize()
-            print("AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3))
+            logger.info(
+                "AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3)
+            )
 
             if isinstance(outputs, torch.Tensor):
                 ref_outputs = [ref_outputs]
@@ -379,14 +405,14 @@ def benchmark(f, args):
         torch.cuda.synchronize()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
-        print("== Start benchmark iterations")
+        logger.info("== Start benchmark iterations")
         with torch.inference_mode():
             start_event.record()
             for _ in range(iters):
                 f(*args)
             end_event.record()
         torch.cuda.synchronize()
-        print("== End benchmark iterations")
+        logger.info("== End benchmark iterations")
         time_per_iter_ms = (start_event.elapsed_time(end_event) * 1.0e-3) / iters
         return time_per_iter_ms
 

From 332eccd5157be0e603a225b0eea9b5b84577fd2b Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Thu, 4 May 2023 14:57:11 -0700
Subject: [PATCH 477/638] Add BF16 support for ads model (#656)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/656

Reviewed By: frank-wei

Differential Revision: D45494148

fbshipit-source-id: 2fe5c7cd3b763b839af3d1b05eecc73f1df05286
---
 fx2ait/fx2ait/converters/ait_converters.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 66aaff6c5..ab14f88b8 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -17,7 +17,7 @@
 import operator
 from typing import Dict, List, Sequence, Tuple, Union
 
-import numpy as np
+import torch
 
 from aitemplate.compiler.public import (
     avg_pool2d,
@@ -887,16 +887,22 @@ def acc_ops_nan_to_num(
 
     def _get_dtype(dtype: str):
         if dtype in ("float", "float32"):
-            return np.float32
-        elif dtype == "float16":
-            return np.float16
+            return torch.float32
+        elif dtype in ("half", "float16"):
+            return torch.float16
+        elif dtype == "bfloat16":
+            return torch.bfloat16
         else:
             raise NotImplementedError(f"Unsupported dtype {dtype} for nan_to_num")
 
     input_dtype = input_val.dtype()
-    np_dtype = _get_dtype(input_dtype)
-    posinf = np.finfo(np_dtype).max if kwargs["posinf"] is None else kwargs["posinf"]
-    neginf = np.finfo(np_dtype).min if kwargs["neginf"] is None else kwargs["neginf"]
+    torch_dtype = _get_dtype(input_dtype)
+    posinf = (
+        torch.finfo(torch_dtype).max if kwargs["posinf"] is None else kwargs["posinf"]
+    )
+    neginf = (
+        torch.finfo(torch_dtype).min if kwargs["neginf"] is None else kwargs["neginf"]
+    )
     return elementwise(FuncEnum.NAN_TO_NUM)(
         input_val,
         AITTensor(value=nan, shape=[], name="nan", dtype=input_dtype),

From 83b39e75e9eaf606ea293a493bb0bfb5e895a2c7 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Thu, 4 May 2023 15:25:35 -0700
Subject: [PATCH 478/638] _dlclose support for Windows (#657)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/657

Add support for closing a compiled library on Windows

Reviewed By: chenyang78

Differential Revision: D45577753

fbshipit-source-id: bd15ea2fcaf20308a6a91eaea786a361a617b8cd
---
 python/aitemplate/compiler/model.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index 3d22c3748..af76a7782 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -24,6 +24,7 @@
 import numpy as np
 
 from aitemplate.compiler.dtype import dtype_str_to_enum
+from aitemplate.utils.misc import is_linux, is_windows
 from aitemplate.utils.torch_utils import torch_dtype_to_string
 
 # Controls how many runtimes will be used in ModelContainer by default.
@@ -77,13 +78,26 @@ class _CFormatAITData(ctypes.Structure):
 
 
 def _dlclose(dll: ctypes.CDLL):
-    syms = ctypes.CDLL(None)
-    if hasattr(syms, "dlclose"):
-        f_dlclose = syms.dlclose
+    f_dlclose = None
+
+    if is_windows():
+        f_dlclose = ctypes.windll.kernel32.FreeLibrary
+    elif is_linux():
+        syms = ctypes.CDLL(None)
+        if not hasattr(syms, "dlclose"):
+            # Apline Linux
+            syms = ctypes.CDLL("libc.so")
+
+        if hasattr(syms, "dlclose"):
+            f_dlclose = syms.dlclose
+
+    if f_dlclose is not None:
         f_dlclose.argtypes = [ctypes.c_void_p]
         f_dlclose(dll._handle)
     else:
-        logging.warning("dlclose() not found, library may not be unloaded properly!")
+        logging.warning(
+            "dll unloading function was not found, library may not be unloaded properly!"
+        )
 
 
 def _check_tensors(

From 971c2c1ccb47258dcacf1269fd989fd8e3163181 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Thu, 4 May 2023 17:14:05 -0700
Subject: [PATCH 479/638] Add MultiscaleVisionTransformers FE + validate MVIT
 21 block config E2E (#643)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/643

Add `MultiscaleVisionTransformers` FE and use to validate parity between PT vs AIT base_21 MViT architecture at numerical tolerance of `1e-4`

Reviewed By: mortzur

Differential Revision: D45435107

fbshipit-source-id: b60cfce93d9f2c79cf5aea83b673f019109826e5
---
 python/aitemplate/frontend/nn/multiscale_attention.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 757477d78..5e49d6a6d 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -331,6 +331,7 @@ def __init__(
         self.pool_first = pool_first
         self.dropout_rate = dropout_rate
         self.num_heads = num_heads
+        self.dim = dim
         head_dim = dim // num_heads
         self.scale = head_dim**-0.5
         self.has_cls_embed = has_cls_embed
@@ -565,13 +566,12 @@ def forward(self, x: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[int]]:
         # attention
         q_shape = get_shape(q)
         B, num_heads, seqlen, head_dim = get_shape(q)
-        score = ops.mem_eff_attention(causal=False)(q, k, v)
-        score = ops.reshape()(score, [B, seqlen, head_dim])
+        score = ops.transpose()(ops.mem_eff_attention(causal=False)(q, k, v), 1, 2)
 
         if self.residual_pool:
             score = ops.elementwise(FuncEnum.ADD)(score, q)
 
-        score = ops.reshape()(ops.permute()(score, [0, 2, 1, 3]), [B, q_shape[-2], -1])
+        score = ops.reshape()(ops.transpose()(score, 1, 2), [B, -1, self.dim])
 
         score = self.proj(score)
         assert self.dropout_rate == 0.0

From 27d9cb6abede26bc85ea95fcfa5fbf95052df2b3 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 5 May 2023 02:59:02 -0700
Subject: [PATCH 480/638] Make GEMM ProfilerMemoryPool size computation generic
 (#653)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/653

Currently there are some hard-coded magic numbers like `(1 << 25)` in computing the `ProfilerMemoryPool`'s size (in terms of the number of fitting largest input sizes). In this diff, the magic number is replaced by the size of the actual L2 cache retrieved from the `device_properties.l2CacheSize`.

Also, the upper bound of the `mem_pool_sz` is increased from 64 to 512. The latter should be safe, given that the upper bound of the allocated total tensor memory per input / output is the size of the L2 cache.

Reviewed By: ipiszy, chenyang78

Differential Revision: D45530460

fbshipit-source-id: ccdf9ded7393ec6adabe61fc6e73d9c98c68bd74
---
 .../cuda/gemm_epilogue_vistor/common_dual_gemm.py        | 2 +-
 .../backend/cuda/gemm_epilogue_vistor/common_softmax.py  | 9 ++++-----
 .../aitemplate/backend/cuda/gemm_universal/bmm_common.py | 2 +-
 python/aitemplate/backend/cuda/gemm_universal/common.py  | 9 ++++-----
 .../backend/cuda/gemm_universal/common_bias_broadcast.py | 2 +-
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
index 1fcfdc774..72f7e86e1 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_dual_gemm.py
@@ -75,7 +75,7 @@
 {% if has_bias %}
   one_copy_sz += b1_ptr_sz;
 {%endif%}
-  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz, device_properties.l2CacheSize);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b0_ptr_sz, mem_pool_sz);  // b_ptr: index 1
diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
index ba284906d..0b3db0496 100644
--- a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
+++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py
@@ -239,7 +239,7 @@
   one_copy_sz += c_dim1;
 {%endif%}
 
-  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz, device_properties.l2CacheSize);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);                      // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);                      // b_ptr: index 1
@@ -335,10 +335,9 @@
   }
   ~ProfilerMemoryPool() {}
 
-  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz) {
-    // TODO: special pool size for A100 L2 cache 40M
-    // need to tune it for other devices
-    int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz, size_t l2_cache_bytes) {
+    int times_covers_l2_cache = (int)std::ceil(l2_cache_bytes / sizeof(DType) / ptr_max_sz);
+    int64_t mem_pool_sz = std::max(2, std::min(512, times_covers_l2_cache));
     size_t free_global_mem = 0;
     size_t total_global_mem = 0;
     cudaError_t cuda_error = cudaMemGetInfo(&free_global_mem, &total_global_mem);
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 138f72310..8995dc4f8 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -142,7 +142,7 @@
 {% if has_d %}
   one_copy_sz += c_ptr_sz;
 {%endif%}
-  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz, device_properties.l2CacheSize);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 4cfa4d64c..5e7d52972 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -368,7 +368,7 @@
 {% if has_bias %}
   one_copy_sz += c_dim1;
 {%endif%}
-  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz, device_properties.l2CacheSize);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1
@@ -484,10 +484,9 @@
   }
   ~ProfilerMemoryPool() {}
 
-  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz) {
-    // TODO: special pool size for A100 L2 cache 40M
-    // need to tune it for other devices
-    int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 25) / ptr_max_sz)));
+  int64_t ComputeMemPoolSize(size_t one_copy_sz, size_t ptr_max_sz, size_t l2_cache_bytes) {
+    int times_covers_l2_cache = (int)std::ceil(l2_cache_bytes / sizeof(DType) / ptr_max_sz);
+    int64_t mem_pool_sz = std::max(2, std::min(512, times_covers_l2_cache));
     size_t free_global_mem = 0;
     size_t total_global_mem = 0;
     cudaError_t cuda_error = cudaMemGetInfo(&free_global_mem, &total_global_mem);
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index c35772d12..043c6b98a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -322,7 +322,7 @@
 {% if has_d1 %}
   one_copy_sz += c_ptr_sz;
 {%endif%}
-  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz);
+  int64_t mem_pool_sz = memory_pool->ComputeMemPoolSize(one_copy_sz, ptr_max_sz, device_properties.l2CacheSize);
 
   memory_pool->AllocateTensor(a_ptr_sz, mem_pool_sz);  // a_ptr: index 0
   memory_pool->AllocateTensor(b_ptr_sz, mem_pool_sz);  // b_ptr: index 1

From 2f8c15f3497ad7fb3dffe036c3a5d13ce42c8cb8 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Fri, 5 May 2023 17:29:57 -0700
Subject: [PATCH 481/638] Add Windows support for owned constants (#658)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/658

Add the needed facilities for handling owned constants, which currently is a hack around linux linker.

Reviewed By: chenyang78

Differential Revision: D45579379

fbshipit-source-id: 0190a7331530ad0c8e8826110f68ff453d839a2d
---
 python/aitemplate/backend/builder.py        | 11 ++--
 python/aitemplate/backend/codegen.py        |  2 +
 python/aitemplate/backend/main_templates.py | 17 ++++-
 static/csrc/windll.cpp                      | 70 +++++++++++++++++++++
 static/include/owned_constants.h            |  1 +
 static/include/windll.h                     | 25 ++++++++
 6 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 static/csrc/windll.cpp
 create mode 100644 static/include/windll.h

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 8c1b44166..d90503bf5 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -430,14 +430,17 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings)
         build_so_cmd = "$(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)"
         standalone_src = "standalone.cu"
         standalone_obj = "standalone.obj"
+        windll_obj = "windll.obj"
         obj_files = []
-        # standalone.cu is an AITemplate internal file that is used for generating
-        # standalone executables. We only want to compile it when the relevant
-        # debug option is enabled.
+        # * standalone.cu is an AITemplate internal file that is used for generating
+        #   standalone executables. We only want to compile it when the relevant
+        #   debug option is enabled.
+        # * windll.cu and windll.obj are used in builder_cmake.py for MSVC compiler
+        #   and are not needed to be used in builder_make.py compiler engine.
         obj_files = [
             pair[1].split("/")[-1]
             for pair in file_pairs
-            if not pair[1].endswith(standalone_obj)
+            if not pair[1].endswith(standalone_obj) and not pair[1].endswith(windll_obj)
         ]
         obj_files = " ".join(obj_files)
 
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 6febf6cea..6dfcb0a92 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -941,6 +941,8 @@ def generate_source(self) -> Dict[str, str]:
             set_up_constant_folding_inputs="\n".join(
                 self.set_up_constant_folding_inputs
             ),
+            # # todo: enable once this feature is fully available
+            # is_windows=is_windows(),
         )
         result[model_container_src_fname] = model_container_base_src
         return result
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 1f71f6e19..b4c5e5fab 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -197,6 +197,11 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 
 namespace ait {
 namespace {
+
+{% if is_windows %}
+#include "windll.h"
+{% endif %}
+
 // Contains the metadata for each constant.
 constexpr std::array<ConstantInfo, {{ num_constants }}> owned_constants = {
   {{ owned_constants_init }}
@@ -241,14 +246,22 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 {{ set_up_constant_offsets }}
 {{ set_up_constant_folding_inputs }}
 
-  auto* constants_ptr = static_cast<uint8_t*>(constants_primary_.get());
+{% if is_windows %}
+  size_t binary_constants_bin_size = 0;
+  uint8_t* binary_constants_bin_start = nullptr;
+  GetConstantsBin((void**)&binary_constants_bin_start, &binary_constants_bin_size);
+{% else %}
   const auto binary_constants_bin_size = static_cast<size_t>(_binary_constants_bin_end - _binary_constants_bin_start);
+  const uint8_t* const binary_constants_bin_start = _binary_constants_bin_start;
+{% endif %}
+
+  auto* constants_ptr = static_cast<uint8_t*>(constants_primary_.get());
   for (auto& constant_info : owned_constants) {
     auto* dst = constants_ptr + constant_info.internal_offset;
     if (constant_info.data_offset + constant_info.num_bytes > binary_constants_bin_size) {
       throw std::runtime_error(std::string("Copying constant ") + constant_info.name + " would overflow constant buffer");
     }
-    DEVICE_CHECK(CopyToDevice(dst, _binary_constants_bin_start + constant_info.data_offset, constant_info.num_bytes));
+    DEVICE_CHECK(CopyToDevice(dst, binary_constants_bin_start + constant_info.data_offset, constant_info.num_bytes));
   }
 }
 
diff --git a/static/csrc/windll.cpp b/static/csrc/windll.cpp
new file mode 100644
index 000000000..5e0e19cf9
--- /dev/null
+++ b/static/csrc/windll.cpp
@@ -0,0 +1,70 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+#include <stdexcept>
+#include <string>
+
+#include <windows.h>
+
+HMODULE SavedDllHandle;
+
+BOOL WINAPI DllMain(
+    HINSTANCE hinstDLL, // handle to DLL module
+    DWORD fdwReason, // reason for calling function
+    LPVOID lpvReserved) // reserved
+{
+  switch (fdwReason) {
+    case DLL_PROCESS_ATTACH:
+      SavedDllHandle = hinstDLL;
+      break;
+  }
+  return TRUE;
+}
+
+namespace ait {
+
+#define TRIGGER_ERROR(message)                        \
+  throw std::runtime_error(                           \
+      (message) + " at file " + __FILE__ + ", line" + \
+      std::to_string(__LINE__));
+
+void GetConstantsBin(void** address, size_t* size) {
+  HRSRC hResource = FindResource(SavedDllHandle, "constant_bin", "CUSTOMDATA");
+  if (!hResource) {
+    // Could not find a resource. Return zero values, because
+    // linker won't include empty constant.bin file. So, this is an
+    // expected behavior.
+    *size = 0;
+    *address = nullptr;
+    return;
+  }
+
+  HGLOBAL hResourceData = LoadResource(SavedDllHandle, hResource);
+  if (!hResourceData) {
+    // could not load a resource
+    auto errorCode = GetLastError();
+    TRIGGER_ERROR(std::string(
+        "LoadResource() call in GetConstantsBin() has failed with error " +
+        std::to_string(errorCode)));
+  }
+
+  DWORD resourceSize = SizeofResource(SavedDllHandle, hResource);
+  void* resourceData = LockResource(hResourceData);
+
+  *size = resourceSize;
+  *address = resourceData;
+}
+
+} // namespace ait
diff --git a/static/include/owned_constants.h b/static/include/owned_constants.h
index 64ceeea47..298f1a21b 100644
--- a/static/include/owned_constants.h
+++ b/static/include/owned_constants.h
@@ -42,5 +42,6 @@ struct ConstantInfo {
 // For information on the binary format, see `man objcopy`, under
 // the "binary-architecture" flag:
 // https://man7.org/linux/man-pages/man1/objcopy.1.html
+// todo: use #embed in C++ 23 once available
 extern const uint8_t _binary_constants_bin_start[];
 extern const uint8_t _binary_constants_bin_end[];
diff --git a/static/include/windll.h b/static/include/windll.h
new file mode 100644
index 000000000..ecb73d2a2
--- /dev/null
+++ b/static/include/windll.h
@@ -0,0 +1,25 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+#pragma once
+
+#include <cstdint>
+
+namespace ait {
+
+// throws std::runtime_error in case of problems
+void GetConstantsBin(void** address, size_t* size);
+
+} // namespace ait

From 09ce75153d49a3fd8e0f281301de50900bd3d477 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Fri, 5 May 2023 21:52:20 -0700
Subject: [PATCH 482/638] simple multi-stream (#615)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/615

Adds multi-stream mode.
According to experiments, it speeds up computations in graph mode only.
Also, might it somewhat changes the amount of needed GPU memory. It may get higher or lower.

New environment flag: AIT_MULTISTREAM_MODE. Default value is 0.
* 0 - multi-stream is not used.
* 1 - simple multi-stream.

New environment flag: AIT_MULTISTREAM_EXTRA_STREAMS
* number of additional streams. Default value is 4.

New environment flag: AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS
* max number of parallel ops for memory planning. Default value is 99999999.

Reviewed By: ipiszy

Differential Revision: D45199079

fbshipit-source-id: 165f523ea1089d4c8462fc85f88fa30ec4ff8c6e
---
 docs/source/reference/env.rst                 |   8 +
 python/aitemplate/backend/codegen.py          | 126 +++++++++++-
 python/aitemplate/backend/main_templates.py   | 127 +++++++++++-
 .../compiler/transform/constant_folding.py    |   1 +
 .../compiler/transform/memory_planning.py     | 185 +++++++++++++++++-
 python/aitemplate/utils/environ.py            |  59 ++++++
 python/aitemplate/utils/graph_utils.py        | 139 +++++++++++--
 static/include/cuda_device_functions.h        |   9 +-
 static/include/model.h                        |   7 +-
 static/include/rocm_device_functions.h        |   6 +-
 10 files changed, 632 insertions(+), 35 deletions(-)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 012f3247e..f1055aeca 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -16,6 +16,14 @@ Codegen
 
 **AIT_TIME_COMPILATION**: If set to "1", time each make command at the compilation time. This helps us to do compilation time analysis. Requires to install `time <https://man7.org/linux/man-pages/man1/time.1.html>`_ package.
 
+**AIT_MULTISTREAM_MODE**: Controls multi-stream mode. Default mode is "0".
+* If set to "0", then no multistreaming is used.
+* If set to "1", then a simple multistreaming is used (iteratively track a wavefront of independent operators and execute ones).
+
+**AIT_MULTISTREAM_EXTRA_STREAMS**: Specifies the number of additional streams used. Default value is "4".
+
+**AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS**: Maximum number of parallel operators used in memory planning for simple multi-stream mode. Default value is "99999999" (basically, unlimited).
+
 Profiling
 ---------
 
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 6dfcb0a92..11c1b5741 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -23,8 +23,11 @@
 from __future__ import annotations
 
 import io
+import json
 import logging
 import os
+from collections import defaultdict
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import jinja2
@@ -40,6 +43,13 @@
 
 from aitemplate.compiler.transform.memory_planning import Workspace
 from aitemplate.utils.debug_settings import AITDebugSettings
+from aitemplate.utils.environ import (
+    multistream_additional_streams,
+    multistream_max_mem_parallel_ops,
+    multistream_mode,
+)
+from aitemplate.utils.graph_utils import split_simple_multistream_parallel_ops
+from aitemplate.utils.misc import is_debug
 
 # pylint: disable=C0103,W0613,C0301
 
@@ -323,6 +333,7 @@ def __init__(
         model_name: str = MODEL_NAME,
         additional_unbound_constants: Optional[List[Tensor]] = None,
         debug_settings: Optional[AITDebugSettings] = None,
+        model_dir: Optional[str] = None,
     ):
         self.target = Target.current()
         self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl")
@@ -390,6 +401,9 @@ def __init__(
 
         self.model_name = model_name
 
+        # This is needed for logging activities only.
+        self.model_dir = model_dir
+
         # additional_unbound_constants stores tensors that are used in constant folding
         # but are not used in the main graph. We need this info so we can codegen SetConstant
         # correctly; when we call SetConstant for one of these special names, we want to forward
@@ -402,6 +416,11 @@ def __init__(
         # size won't be found during memory planning.
         self.extra_owned_constant_size = 0
 
+        # This is a temporary dictionary that holds the rendered C++ code for operators.
+        self._rendered_func_code: Dict[Operator, str] = {}
+        # This is a temporary list that holds rendered C++ code for checks.
+        self._rendered_checks_func_code: List[str] = []
+
     def _tensor_slice_func(
         self,
         node: Tensor,
@@ -740,6 +759,9 @@ def _process_src_ops(self, node: Tensor) -> None:
                     props["dim"] = func._attrs["concat_dim"]
                 self.func_prop_seq.append(props)
 
+                # save the rendered code for the future
+                self._rendered_func_code[func] = seq
+
             if "int_state_flag" in func._attrs:
                 if func._attrs["name"] not in self.state_record:
                     self.function_state.append(
@@ -762,18 +784,20 @@ def _append_check_nan_and_inf(self, node: Tensor):
         tensor_name = node._attrs["name"]
         elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
         self.func_name_seq.append("nan_and_inf_check")
-        self.func_seq.append(
-            f'    InvokeInfAndNanChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
-        )
+
+        code_text = f'    InvokeInfAndNanChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
+        self.func_seq.append(code_text)
+        self._rendered_checks_func_code.append(code_text)
 
     def _append_check_outputs(self, node: Tensor):
         self.debug_header = True
         tensor_name = node._attrs["name"]
         elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
         self.func_name_seq.append("output_check")
-        self.func_seq.append(
-            f'    InvokeOutputsChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
-        )
+
+        code_text = f'    InvokeOutputsChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
+        self.func_seq.append(code_text)
+        self._rendered_checks_func_code.append(code_text)
 
     def append_tensor(self, node: Tensor) -> None:
         if node._attrs["nop"]:
@@ -830,11 +854,95 @@ def append_tensor(self, node: Tensor) -> None:
         if node.is_jagged():
             self._process_jagged_dims(node)
 
+    def _generate_simple_multistream_ops(
+        self,
+    ) -> List[List[Operator]]:
+        from aitemplate.utils.graph_utils import track_graph_timings
+
+        # track the sequence
+        time_stats = track_graph_timings(self.graph, {})
+
+        # sort all operators by parallel execution order
+        ops_by_order = defaultdict(list)
+        for (op, tracking) in time_stats.op_parallel_trackers.items():
+            ops_by_order[tracking.execution_order].append(op)
+
+        # convert Dict[int, List[Operator]] into List[List[Operator]]
+        max_parallel_ops = multistream_max_mem_parallel_ops()
+        ops = split_simple_multistream_parallel_ops(ops_by_order, max_parallel_ops)
+
+        # done
+        return ops
+
+    def _write_simple_multistream_debug_info(
+        self, par_ops_seq: List[List[Operator]]
+    ) -> None:
+        # store simple multistream information to log
+
+        # render ops into names
+        ops_names = [
+            [op._attrs["original_name"] for op in par_ops] for par_ops in par_ops_seq
+        ]
+
+        # write text
+        log_filename_txt = (
+            Path(self.model_dir) / f"simple_multistream_{self.model_name}.txt"
+        )
+        with open(log_filename_txt, "w") as log_f:
+            for idx, ops_list in enumerate(ops_names):
+                ops_string = " ".join(ops_list)
+                log_f.write(f"{idx}: {ops_string}\n")
+        _LOGGER.info(f"Wrote text simple multistream info into {log_filename_txt}")
+
+        # write json
+        log_filename_json = (
+            Path(self.model_dir) / f"simple_multistream_{self.model_name}.json"
+        )
+        with open(log_filename_json, "w") as log_f:
+            log_f.write(f"{json.dumps(ops_names)}\n")
+        _LOGGER.info(f"Wrote json simple multistream info into {log_filename_json}")
+
     def generate_model(self) -> str:
         # Disable graph mode on ROCM because the updating operations
         # are not supported
         target_has_graph_mode = "true" if self.target.name() == "cuda" else "false"
 
+        run_impl_mode = multistream_mode()
+        if run_impl_mode == 0:
+            # no multistream mode is used
+            n_additional_streams = 0
+            n_additional_events = 0
+            par_function_seq = None
+            par_check_function_seq = []
+        elif run_impl_mode == 1:
+            # spawn additional streams. Total number of streams will be
+            #   n_additional_streams + 1.
+            n_additional_streams = multistream_additional_streams()
+            n_additional_events = n_additional_streams
+
+            # generate List[List[Operator]]
+            par_ops_seq = self._generate_simple_multistream_ops()
+
+            for par_ops in par_ops_seq:
+                _LOGGER.info(
+                    f"Executing in parallel: {' '.join([op._attrs['original_name'] for op in par_ops])}"
+                )
+
+            # convert List[List[Operator]] into List[List[str]]
+            par_function_seq = [
+                [self._rendered_func_code[op] for op in par_ops]
+                for par_ops in par_ops_seq
+            ]
+
+            # prepare after-ops checks
+            par_check_function_seq = self._rendered_checks_func_code
+
+            # dump info to files for further debugging, if needed
+            if is_debug() and self.model_dir is not None:
+                self._write_simple_multistream_debug_info(par_ops_seq)
+        else:
+            raise Exception(f"Unsupported multistream mode ({run_impl_mode})")
+
         per_op_profiler_seq = zip(
             self.func_name_seq,
             self.func_seq,
@@ -868,6 +976,11 @@ def generate_model(self) -> str:
             num_unbound_constants=self.unbound_constant_idx,
             reset_constants="\n".join(self.reset_constants),
             profiler_annotation=self.debug_settings.gen_profiler_annotation,
+            n_additional_streams=n_additional_streams,
+            n_additional_events=n_additional_events,
+            par_function_seq=par_function_seq,
+            par_check_function_seq=par_check_function_seq,
+            run_impl_mode=run_impl_mode,
         )
 
     def _create_set_up_constant_offsets(self) -> str:
@@ -1050,6 +1163,7 @@ def to_obj_name(name: str):
         output_tensors,
         additional_unbound_constants=additional_unbound_constants,
         debug_settings=debug_settings,
+        model_dir=prefix,
     )
     model_container_generator.append_all_tensors()
     constants_data_file.close()
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index b4c5e5fab..6cf62a9fb 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -50,6 +50,19 @@
 // Once an inference run has started, it is not safe to re-use the Model
 // until the run has finished!
 class {{model_name}} : public ModelBase<{{model_name}}> {
+  {% if n_additional_streams > 0 %}
+  // Extra streams allocated for graph or fork-join streams.
+  static constexpr size_t N_SUB_STREAMS = {{n_additional_streams}};
+  StreamType sub_streams[N_SUB_STREAMS];
+  {% endif %}
+  {% if n_additional_events > 0 %}
+  // Extra events allocated for graph or fork-join streams.
+  static constexpr size_t N_SUB_EVENTS = {{n_additional_events}};
+  EventType sub_events[N_SUB_EVENTS];
+  // An event that guards the fork operation for the base stream.
+  EventType sub_event_base;
+  {% endif %}
+
   public:
     {{model_name}}(
         size_t blob_size,
@@ -74,7 +87,33 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
     {{ tensor_slice }}
     {{ tensor_map_set }}
     {{ set_up_param_dynamic_shapes }}
+
+      {% if n_additional_streams > 0 %}
+      for (size_t i = 0; i < N_SUB_STREAMS; i++) {
+        DEVICE_CHECK(StreamCreate(sub_streams + i, true));
+      }
+      {% endif %}
+      {% if n_additional_events > 0 %}
+      for (size_t i = 0; i < N_SUB_EVENTS; i++) {
+        DEVICE_CHECK(CreateEvent(sub_events + i, false));
+      }
+      DEVICE_CHECK(CreateEvent(&sub_event_base, false));
+      {% endif %}
+    }
+
+    ~{{model_name}}() {
+      {% if n_additional_streams > 0 %}
+      for (size_t i = 0; i < N_SUB_STREAMS; i++) {
+        DEVICE_CHECK(StreamDestroy(sub_streams[i]));
+      }
+      {% endif %}
+      {% if n_additional_events > 0 %}
+      for (size_t i = 0; i < N_SUB_EVENTS; i++) {
+        DEVICE_CHECK(DestroyEvent(sub_events[i]));
       }
+      DEVICE_CHECK(DestroyEvent(sub_event_base));
+      {% endif %}
+    }
 
     void SetUpInputsOutputs() {
         {{ set_inputs }}
@@ -92,6 +131,9 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
   {{ device_to_device_copies }}
     }
 
+{% if run_impl_mode == 0 %}
+    ///////////////////////////////////////////////////////////////////////////
+    // default RunImpl implemenation
     void RunImpl(StreamType stream) {
         {% if profiler_annotation %}
         RAII_ProfilerRange _raiiAITProfilerRange("main_start");
@@ -101,6 +143,87 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       DeviceCheckLastError(__FILE__, __LINE__);
   {% endfor %}
     }
+{% endif %}
+
+{% if run_impl_mode == 1 %}
+    ///////////////////////////////////////////////////////////////////////////
+    // simple multistream implementation
+    void RunImpl(StreamType baseStream) {
+      {% if profiler_annotation %}
+      RAII_ProfilerRange _raiiAITProfilerRange("main_start");
+      {% endif %}
+
+      {% for funcs in par_function_seq %}
+        {% if funcs|length == 1 %}
+          // do no parallel stream processing here
+          {
+            uint8_t* global_workspace_ = this->global_workspace_;
+            uint8_t* unique_workspace_ = this->unique_workspace_;
+
+            StreamType& stream = baseStream;
+            {{ funcs[0] }}
+            DeviceCheckLastError(__FILE__, __LINE__);
+          }
+        {% else %}
+          // do parallel stream processing here
+          // first function runs on the base stream, others are on extra ones.
+          // it is assumed that functions are independent.
+          {
+            // baseStream fork guard
+            DEVICE_CHECK(EventRecord(sub_event_base, baseStream));
+
+            // every substream forks
+            for (size_t i = 0; i < {{ n_additional_streams if funcs|length > n_additional_streams else funcs|length - 1 }}; i++) {
+              StreamType& stream = sub_streams[i];
+              DEVICE_CHECK(StreamWaitEvent(stream, sub_event_base));
+            }
+
+            // run kernels
+            // note that every stream may run spawn multiple kernel runs
+            {% for func in funcs %}
+              {% if (loop.index - 1) % (n_additional_streams + 1) == 0 %}
+                {
+                  uint8_t* global_workspace_ = this->global_workspace_;
+                  uint8_t* unique_workspace_ = this->unique_workspace_;
+
+                  StreamType& stream = baseStream;
+                  {{ func }}
+                  DeviceCheckLastError(__FILE__, __LINE__);
+                }
+              {% else %}
+                {
+                  uint8_t* global_workspace_ = this->global_workspace_ + this->workspace_size_ / {{1 + n_additional_events}} * {{ ((loop.index - 1) % (n_additional_streams + 1)) }};
+                  uint8_t* unique_workspace_ = this->unique_workspace_ + this->unique_workspace_size_ / {{1 + n_additional_events}} * {{ ((loop.index - 1) % (n_additional_streams + 1)) }};
+
+                  StreamType& stream = sub_streams[{{ ((loop.index - 1) % (n_additional_streams + 1)) - 1}}];
+                  {{ func }}
+                  DeviceCheckLastError(__FILE__, __LINE__);
+                }
+              {% endif %}
+            {% endfor %}
+
+            // substream join guards
+            for (size_t i = 0; i < {{ n_additional_streams if funcs|length > n_additional_streams else funcs|length - 1 }}; i++) {
+              DEVICE_CHECK(EventRecord(sub_events[i], sub_streams[i]));
+            }
+            // base stream joins
+            for (size_t i = 0; i < {{ n_additional_streams if funcs|length > n_additional_streams else funcs|length - 1 }}; i++) {
+              DEVICE_CHECK(StreamWaitEvent(baseStream, sub_events[i]));
+            }
+          }
+        {% endif %}
+      {% endfor %}
+
+      {
+        // run various checks, if needed
+        StreamType& stream = baseStream;
+        {% for func in par_check_function_seq %}
+          {{ func }}
+          DeviceCheckLastError(__FILE__, __LINE__);
+        {% endfor %}
+      }
+    }
+{% endif %}
 
     void ProfileImpl(StreamType stream, size_t iters, const std::string& filename) {
       std::ofstream ss(filename);
@@ -170,8 +293,8 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
     ) {
       return std::make_unique<{{model_name}}>(
           {{ blob_size }},
-          {{ workspace_size }},
-          {{ unique_workspace_size }},
+          {{ workspace_size }} * (1 + {{n_additional_streams}}),
+          {{ unique_workspace_size }} * (1 + {{n_additional_streams}}),
           {{ num_inputs }},
           {{ num_outputs }},
           {{ num_unbound_constants }},
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index d86406961..e7a69f725 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -250,6 +250,7 @@ def _constant_folding_impl(
         graph=subgraph,
         output_tensors=output_tensors,
         model_name=backend.codegen.CONSTANT_FOLDER_MODEL_NAME,
+        model_dir=model_dir,
     )
     model_container_generator.append_all_tensors()
     constant_folding_model_def = model_container_generator.generate_model()
diff --git a/python/aitemplate/compiler/transform/memory_planning.py b/python/aitemplate/compiler/transform/memory_planning.py
index 22ff201a9..f07567e0d 100644
--- a/python/aitemplate/compiler/transform/memory_planning.py
+++ b/python/aitemplate/compiler/transform/memory_planning.py
@@ -16,14 +16,19 @@
 Graph pass for memory planning.
 """
 import bisect
+import logging
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import List
 
 from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.utils.environ import multistream_max_mem_parallel_ops, multistream_mode
+from aitemplate.utils.graph_utils import split_simple_multistream_parallel_ops
 
 # pylint: disable=C0103
 
+_LOGGER = logging.getLogger(__name__)
+
 
 @dataclass
 class TensorUsageRecord:
@@ -188,7 +193,9 @@ def _compute_workspace(sorted_graph: List[Tensor]) -> Workspace:
     return Workspace(max_workspace, unique_workspace_size)
 
 
-def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
+def _greedy_by_size_memory_planning(
+    sorted_graph: List[Tensor], tensor_usage_records: List[TensorUsageRecord]
+):
     """
     based on the greedy-by-size algorithm for offset calculation described in
     the following paper:
@@ -196,11 +203,6 @@ def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
         Efficient Memory Management for Deep Neural Net Inference,
         https://arxiv.org/abs/2001.03288
     """
-    sorted_ops = []
-    for node in sorted_graph:
-        sorted_ops.extend(node.src_ops())
-    tensor_usage_records = _make_tensor_usage_records(sorted_ops)
-
     # sort tensor usage records in non-increasing order by their sizes
     sorted_tensor_usage_records = sorted(
         tensor_usage_records, key=lambda r: r.size, reverse=True
@@ -268,6 +270,22 @@ def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
     return (max_blob, constant_offset, workspace)
 
 
+def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
+    """
+    based on the greedy-by-size algorithm for offset calculation described in
+    the following paper:
+        Yury Pisarchyk, Juhyun Lee,
+        Efficient Memory Management for Deep Neural Net Inference,
+        https://arxiv.org/abs/2001.03288
+    """
+    sorted_ops = []
+    for node in sorted_graph:
+        sorted_ops.extend(node.src_ops())
+    tensor_usage_records = _make_tensor_usage_records(sorted_ops)
+
+    return _greedy_by_size_memory_planning(sorted_graph, tensor_usage_records)
+
+
 def naive_memory_planning(sorted_graph: List[Tensor]):
     max_blob = 0
     offset = 0
@@ -288,8 +306,161 @@ def naive_memory_planning(sorted_graph: List[Tensor]):
     # workspace
     workspace = _compute_workspace(sorted_graph)
     assign_offsets_to_views_and_outputs(sorted_graph)
+
+    return (max_blob, constant_offset, workspace)
+
+
+def _make_tensor_usage_records_simple_multistream(
+    par_ops_seq: List[List[Operator]],
+) -> List[TensorUsageRecord]:
+    """
+    Generalized version of _make_tensor_usage_records() which
+    assumes that several ops may be executed on every step.
+
+    Simple multistream algo iteratively tracks sets of operators
+    that can be run in parallel independently on each iteration.
+
+    par_ops_seq contains lists of operators that can be run
+    in parallel on every algorithm iteration.
+
+    Technically, the regular _make_tensor_usage_records() version
+    is similar to the following one:
+
+       def _make_tensor_usage_records(sorted_ops):
+         par_ops_seq = [sorted_ops]
+         return _make_tensor_usage_records_simple_multistream(par_ops_seq)
+
+    This version is kept as a separate one, because multistreaming
+    feature is still somewhat experimental.
+    """
+
+    num_of_ops = len(par_ops_seq)
+    tensor_records = defaultdict(
+        lambda: TensorUsageRecord(
+            tensor=None, first_op_idx=num_of_ops, last_op_idx=-1, size=None
+        )
+    )
+
+    for op_idx, par_ops in enumerate(par_ops_seq):
+        for op in par_ops:
+            for tensor in op._attrs["inputs"] + op._attrs["outputs"]:
+                # Skip weights and inputs since we don't overwrite them.
+                # Note that it might be OK to overwrite inputs, but let's be
+                # consertative for now and not surprise users. We could always
+                # make a flag to do that later if it's needed.
+                if tensor._attrs["is_param"]:
+                    continue
+                name = tensor._attrs["name"]
+                this_tensor = tensor_records[name].tensor
+                if this_tensor is None:
+                    tensor_records[name].tensor = tensor
+                else:
+                    # make sure we didn't screw up anything
+                    assert (
+                        tensor == this_tensor
+                    ), f"existing tensor: {this_tensor}, new tensor: {tensor}, op: {op}"
+
+                first_op_idx = tensor_records[name].first_op_idx
+                last_op_idx = tensor_records[name].last_op_idx
+                tensor_records[name].first_op_idx = min(first_op_idx, op_idx)
+                tensor_records[name].last_op_idx = max(last_op_idx, op_idx)
+                # An output tensor's lifetime extends to the last op.
+                if tensor._attrs["is_output"]:
+                    tensor_records[name].last_op_idx = num_of_ops - 1
+
+                size = tensor_records[name].size
+                tensor_size = tensor.size_bytes(alignment=64)
+                if size is None:
+                    tensor_records[name].size = tensor_size
+                else:
+                    # make sure we didn't screw up anything
+                    assert size == tensor_size
+
+    # tensor views extend the lifetime of the original tensors
+    tensor_views = []
+    for name, tensor_record in tensor_records.items():
+        this_tensor = tensor_record.tensor
+        if this_tensor._attrs["is_view_of"]:
+            orig_tensor = _find_original_tensor(this_tensor)
+            # view of input
+            if orig_tensor._attrs["is_param"]:
+                continue
+            orig_tensor_name = orig_tensor._attrs["name"]
+            assert orig_tensor_name in tensor_records
+            tensor_records[orig_tensor_name].last_op_idx = max(
+                tensor_records[orig_tensor_name].last_op_idx, tensor_record.last_op_idx
+            )
+            tensor_views.append(name)
+
+    # remove tensor views from tensor_records
+    for name in tensor_views:
+        del tensor_records[name]
+
+    # sanity checks
+    # make sure we have valid indices and sizes
+    records = tensor_records.values()
+    for tensor, first_op_idx, last_op_idx, size in records:
+        assert tensor is not None
+        assert 0 <= first_op_idx < num_of_ops
+        assert 0 <= last_op_idx < num_of_ops
+        assert first_op_idx <= last_op_idx
+        assert size is not None
+
+    return list(records)
+
+
+def simple_multistream_memory_planning(sorted_graph: List[Tensor]):
+    """
+    A specialized case for simple multi-stream execution.
+    It uses more or slightly more GPU memory than greedy_by_size_memory_planner,
+    depending on the input graph, but still significantly less
+    than naive_memory_planning.
+    """
+    from aitemplate.utils.graph_utils import track_graph_timings
+
+    # track the sequence
+    time_stats = track_graph_timings(sorted_graph, {})
+
+    # sort all operators by parallel execution order
+    ops_by_order = defaultdict(list)
+    for (op, tracking) in time_stats.op_parallel_trackers.items():
+        ops_by_order[tracking.execution_order].append(op)
+
+    # convert Dict[int, List[Operator]] into List[List[Operator]]
+    max_parallel_ops = multistream_max_mem_parallel_ops()
+    par_ops_seq = split_simple_multistream_parallel_ops(ops_by_order, max_parallel_ops)
+
+    tensor_usage_records = _make_tensor_usage_records_simple_multistream(par_ops_seq)
+
+    return _greedy_by_size_memory_planning(sorted_graph, tensor_usage_records)
+
+
+def proxy_memory_planning(sorted_graph: List[Tensor]):
+    run_mode = multistream_mode()
+    if run_mode == 0:
+        # no multistream
+        max_blob, constant_offset, workspace = greedy_by_size_memory_planning(
+            sorted_graph
+        )
+    elif run_mode == 1:
+        # simple multistream
+        max_blob, constant_offset, workspace = simple_multistream_memory_planning(
+            sorted_graph
+        )
+    else:
+        # unsupported
+        raise Exception(f"Unsupported multistream mode ({run_mode})")
+
+    # print some statistics
+    _LOGGER.info(
+        f"Workspace shared_size={workspace.shared_size} unique_size={workspace.unique_size}"
+    )
+    _LOGGER.info(f"max_blob={max_blob} constant_offset={constant_offset}")
+
+    # done
     return (max_blob, constant_offset, workspace)
 
 
-memory_planning = greedy_by_size_memory_planning
+# memory_planning = greedy_by_size_memory_planning
 # memory_planning = naive_memory_planning
+memory_planning = proxy_memory_planning
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 511e589e7..fd692ad90 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -148,3 +148,62 @@ def force_cutlass_sm90_kernels() -> bool:
     back-end of the GEMM ops). Default: False.
     """
     return os.getenv("AIT_FORCE_CUTLASS_SM90_KERNELS", "0") == "1"
+
+
+def multistream_mode() -> int:
+    """
+    Multi-stream mode. 0 - no multistream. 1 - simple multistream.
+    Default: 0.
+    """
+
+    # temporarily override it in order to test
+    return int(os.getenv("AIT_MULTISTREAM_MODE", "0"))
+
+
+def multistream_additional_streams() -> int:
+    """
+    Number of extra streams in multi-stream mode.
+
+    This option is independent from AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS.
+
+    For example, say, there are 100 ops that can be run in parallel.
+
+    Example 1: AIT_MULTISTREAM_EXTRA_STREAMS=4 and AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS=100.
+    In this case 5 streams will be used (1 base and 4 extra),
+    every stream gets 20 operators and no inter-stream barriers are used.
+    Memory planning is done for 100 parallel ops.
+
+    Example 2: AIT_MULTISTREAM_EXTRA_STREAMS=4 and AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS=5.
+    In this case 5 streams will be used (1 base and 4 extra),
+    there will be 20 waves separated by inter-stream barriers,
+    every stream gets 1 operator for every wave.
+    Memory planning is done for 20 waves of 5 parallel ops each.
+
+    """
+    return int(os.getenv("AIT_MULTISTREAM_EXTRA_STREAMS", "4"))
+
+
+def multistream_max_mem_parallel_ops() -> int:
+    """
+    Maximum number of parallel operators used in memory planning
+    for simple multi-stream mode.
+    Larger value imply higher level of possible parallelism, but
+    higher memory allocations.
+
+    This option is independent from AIT_MULTISTREAM_EXTRA_STREAMS.
+
+    For example, say, there are 100 ops that can be run in parallel.
+
+    Example 1: AIT_MULTISTREAM_EXTRA_STREAMS=4 and AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS=100.
+    In this case 5 streams will be used (1 base and 4 extra),
+    every stream gets 20 operators and no inter-stream barriers are used.
+    Memory planning is done for 100 parallel ops.
+
+    Example 2: AIT_MULTISTREAM_EXTRA_STREAMS=4 and AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS=5.
+    In this case 5 streams will be used (1 base and 4 extra),
+    there will be 20 waves separated by inter-stream barriers,
+    every stream gets 1 operator for every wave.
+    Memory planning is done for 20 waves of 5 parallel ops each.
+    """
+    # unlimited by default
+    return int(os.getenv("AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS", "99999999"))
diff --git a/python/aitemplate/utils/graph_utils.py b/python/aitemplate/utils/graph_utils.py
index 6fa0a86d7..7f6a1cf55 100644
--- a/python/aitemplate/utils/graph_utils.py
+++ b/python/aitemplate/utils/graph_utils.py
@@ -16,7 +16,8 @@
 import logging
 import os
 from collections import deque
-from typing import Any, Dict, List, Set
+from pathlib import Path
+from typing import Any, Dict, List, Set, Union
 
 from aitemplate.utils.misc import is_debug
 from aitemplate.utils.visualization import plot_graph
@@ -135,9 +136,9 @@ def __init__(self):
         # Dict[Operator, TimestampTracking]
         self.op_sequential_trackers = {}
 
-        # Dict[Trnsor, TimestampTracking]
+        # Dict[Tensor, TimestampTracking]
         self.tensor_parallel_trackers = {}
-        # Dict[Trnsor, TimestampTracking]
+        # Dict[Tensor, TimestampTracking]
         self.tensor_sequential_trackers = {}
 
         # 0.7 percentile of op times
@@ -152,13 +153,58 @@ def __init__(self):
         self.total_duration = 0.0
 
 
+def _load_op_durations_from_file(input: Union[str, Path]) -> Dict[str, float]:
+    """
+    Loads benchmarking results produced with a profiler from a .json file.
+    """
+
+    if isinstance(input, str):
+        input_path = Path(input)
+    elif isinstance(input, Path):
+        input_path = input
+    else:
+        raise ValueError("str or Path is needed as an input argument")
+
+    # load the file with the profile.
+    with input_path.open("r") as f:
+        perf_per_op_str = f.read()
+
+    # parse file
+    perf_per_op_str_dict = json.loads(perf_per_op_str)
+
+    op_durations: Dict[str, float] = {}
+    for op_name, op_data in perf_per_op_str_dict.items():
+        op_durations[op_name] = op_data["ms_per_iter"]
+
+    # done
+    return op_durations
+
+
 def track_graph_timings(
-    tensors, file_path_profiler_output: str
+    tensors, inputv: Union[str, Path, Dict[str, float]]
 ) -> ProfiledTimeStatistics:
     """
     Traverses the graph of tensors and uses the statistics from the profiler
     to evaluate execution times in case of sequential execution (1 stream)
     and parallel execution (unlimited number of streams).
+
+    The parallel execution tracking works in the following way.
+    1. Input tensors and constant tensors are marked as processed.
+    2. Other tensors are marked as unprocessed.
+    3. All operators are marked as unprocessed.
+    4. Repeat
+    4.1. Searches for unprocessed operators whose input tensors are marked
+    as processed and "executes" ones, then mark corresponding output tensors as processed.
+    4.2. Stop if the number of processed operators on step 4.1 is zero
+    5. If the total number of unprocessed operators is not zero, then the graph is invalid.
+
+    Parameters
+    ----------
+    tensors : List[Tensor]
+        a list of output Tensors of AIT graph
+    inputv : Union[str, Path, Dict[str, float]]
+        str or Path: a path to .json file with the results generated by a profiling procedure
+        Dict[str, float]: time costs of operators (key is op._attrs["original_name"])
     """
 
     from aitemplate.compiler.base import Operator, Tensor
@@ -193,15 +239,19 @@ def track_graph_timings(
             unprocessed_tensors.append(tensor)
 
     # ok, we've got ops. Load the file with the profile.
-    with open(file_path_profiler_output, "r") as f:
-        perf_per_op_str = f.read()
-
-    # parse file
-    perf_per_op_str_dict = json.loads(perf_per_op_str)
-
     op_durations: Dict[str, float] = {}
-    for op_name, op_data in perf_per_op_str_dict.items():
-        op_durations[op_name] = op_data["ms_per_iter"]
+    if isinstance(inputv, str) or isinstance(inputv, Path):
+        # str or Path
+        op_durations = _load_op_durations_from_file(inputv)
+    elif (
+        isinstance(inputv, dict)
+        and all(isinstance(x, str) for x in inputv.keys())
+        and all(isinstance(x, float) for x in inputv.values())
+    ):
+        # this is Dict[str, float]
+        op_durations = inputv
+    else:
+        raise ValueError("Invalid type of inputv")
 
     # map timings to ops
     for op in visited_ops:
@@ -327,10 +377,15 @@ def track_graph_timings(
                 # yes. This operator is ready to be executed.
                 op_duration = output.op_durations[op]
 
-                max_execution_end = max(
-                    output.tensor_parallel_trackers[tensor].execution_end
-                    for tensor in depends_on
-                )
+                if not depends_on:
+                    # a case of an operator that depends on no tensors
+                    max_execution_end = 0
+                else:
+                    # regular case
+                    max_execution_end = max(
+                        output.tensor_parallel_trackers[tensor].execution_end
+                        for tensor in depends_on
+                    )
 
                 output.op_parallel_trackers[op] = TimestampTracking(
                     execution_start=max_execution_end,
@@ -381,3 +436,55 @@ def track_graph_timings(
 
     # done
     return output
+
+
+def split_simple_multistream_parallel_ops(ops_by_order, max_parallel_ops: int):
+    """
+    Make sure that no more than max_parallel_ops operators are run in parallel.
+
+    Say, on the first step op1, op2 and op3 can be executed in parallel.
+    On the second one, it is op4 and op5.
+    On the third one it is op6, op7, op8, op9.
+    Then, ops_by_order is something like
+      { 1: [op1, op2, op3], 2: [op4, op5], 3: [op6, op7, op8, op9] }
+    Given max_parallel_ops=2, the output will be:
+      [[op1, op2], [op3], [op4, op5], [op6, op7], [op8, op9]]
+
+    Parameters
+    ----------
+    ops_by_order : Dict[int, List[Operator]]
+        A dictionary, its keys represent the execution order
+        and its values represent operators that are executed in parallel.
+    max_parallel_ops : int
+        Number of operators that are allowed to be run in parallel
+
+    Output : List[List[Operator]]
+        transformed sequence of operators to execute.
+
+    """
+    assert max_parallel_ops > 0
+
+    # todo: a better splitting algorithm can be implemented,
+    # the one that splits operators into max_parallel_ops buckets
+    # so that the amount of needed memory is about the same.
+    # use priority_queue for this and iteratively add to the
+    # bucket that has the lowest 'assigned' memory.
+
+    output = []
+
+    execution_orders = sorted(ops_by_order.keys())
+    for execution_order in execution_orders:
+        ops = ops_by_order[execution_order]
+
+        ops_parallel = []
+        for op in ops:
+            ops_parallel.append(op)
+            if len(ops_parallel) >= max_parallel_ops:
+                output.append(ops_parallel)
+                ops_parallel = []
+
+        if len(ops_parallel) > 0:
+            output.append(ops_parallel)
+
+    # done
+    return output
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 88cc305e0..b03e25f78 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -277,6 +277,10 @@ inline DeviceError StreamDestroy(StreamType stream) {
   return cudaStreamDestroy(stream);
 }
 
+inline DeviceError StreamWaitEvent(StreamType stream, EventType event) {
+  return cudaStreamWaitEvent(stream, event);
+}
+
 inline DeviceError GraphInstantiate(
     GraphExecType* graph_exec,
     GraphType graph) {
@@ -374,8 +378,9 @@ inline DeviceError StreamSynchronize(StreamType stream) {
   return cudaStreamSynchronize(stream);
 }
 
-inline DeviceError CreateEvent(EventType* event) {
-  return cudaEventCreate(event);
+inline DeviceError CreateEvent(EventType* event, bool measure_time = true) {
+  return cudaEventCreateWithFlags(
+      event, measure_time ? cudaEventDefault : cudaEventDisableTiming);
 }
 
 inline DeviceError DestroyEvent(EventType event) {
diff --git a/static/include/model.h b/static/include/model.h
index 7ccd9f3ad..759fd95c7 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -62,6 +62,8 @@ class ModelBase {
       : blob_(RAII_DeviceMalloc(blob_size, allocator)),
         workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
         params_(num_inputs + num_outputs + num_unbound_constants),
+        workspace_size_{workspace_size},
+        unique_workspace_size_{unique_workspace_size},
         num_inputs_(num_inputs),
         num_outputs_(num_outputs),
         constants_(constants) {
@@ -79,7 +81,7 @@ class ModelBase {
   }
 
  public:
-  ~ModelBase() {
+  virtual ~ModelBase() {
     if (run_finished_ != nullptr) {
       DestroyEvent(run_finished_);
     }
@@ -267,6 +269,9 @@ class ModelBase {
   size_t num_inputs_;
   size_t num_outputs_;
 
+  // These values are preserved for multi-stream needs.
+  size_t workspace_size_;
+  size_t unique_workspace_size_;
   // The workspace blob is used as scratch memory. See
   // _generate_workspace in memory planning for more information.
   GPUPtr workspace_;
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index db06c2351..8fc7adf3c 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -186,6 +186,10 @@ inline DeviceError StreamDestroy(StreamType stream) {
   return hipStreamDestroy(stream);
 }
 
+inline DeviceError StreamWaitEvent(StreamType stream, EventType event) {
+  return hipStreamWaitEvent(stream, event);
+}
+
 inline DeviceError GraphInstantiate(
     GraphExecType* graph_exec,
     GraphType graph) {
@@ -286,7 +290,7 @@ inline DeviceError StreamSynchronize(StreamType stream) {
   return hipStreamSynchronize(stream);
 }
 
-inline DeviceError CreateEvent(EventType* event) {
+inline DeviceError CreateEvent(EventType* event, bool measure_time = true) {
   return hipEventCreate(event);
 }
 

From b912ca9676cd2c1defe66c9946b0aaeac0e15739 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sat, 6 May 2023 14:06:58 -0700
Subject: [PATCH 483/638] Use device 0 for profiling if devices list is empty
 (#667)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/667

If the `devices` list passed to the `ProfilerRunner` is empty, currently an exception is raised on the `ThreadPoolExecutor` instantiation: `ValueError("max_workers must be greater than 0")`. We replace the `devices` list with `[0]` in such case to use the device 0 for profiling and avoid the exception.

Reviewed By: amateurcoffee, tissue3

Differential Revision: D45637622

fbshipit-source-id: 353053ebe55e7891363be2d5c018404ebc7aca2e
---
 python/aitemplate/backend/profiler_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 5381b5794..288fb5dd5 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -267,7 +267,8 @@ def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 3
         timeout : int
             timeout to wait for all profilers completion in seconds
         """
-        if devices is None:
+        if not devices:
+            # devices is either None or empty list: use device 0
             devices = [0]
         # This queue is used to ensure only one task is executed on a device at a time
         self._device_queue = Queue()

From 6bc3253ab29e056140640ca2d847749d73c2a243 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 7 May 2023 08:59:20 -0700
Subject: [PATCH 484/638] Allow setting jagged tensor total_length upper bound
 from offsets (#634)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/634

During shape inference in fx2ait, as an alternative to setting the `total_length: IntVar`'s upper bound based on a single globally configured single max sequence length value, it may be set from the content of the correspondibng offsets tensor in the sample input. The latter is more memory efficient in cases when there are multiple jagged tensors with (substantially) different max. sequence lenghts. The assumption is that the offsets tensors in the sample inputs actually contain at least one sequence with the maximum sequence length for the corresponding jagged tensor.

Reviewed By: qxy11, wushirong, tissue3

Differential Revision: D45393984

fbshipit-source-id: b1937ea75c80ff6ba9c84250cf09b8cad6c097c1
---
 fx2ait/fx2ait/tensor_spec.py | 64 ++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index bf7cb6e23..eb0f28a69 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 import logging
-from typing import Any, List, Set
+from typing import Any, Dict, List, Set
 
 import torch
 from aitemplate.compiler.public import IntImm, IntVar
@@ -255,6 +255,52 @@ def from_input_list_with_batch_size(
 
         return result
 
+    @staticmethod
+    def _get_max_seq_lens_from_offsets(
+        inputs: List[torch.Tensor],
+        jagged_offsets_batch_dims: Set[int],
+    ) -> Dict[int, int]:
+        """
+        Get the maximum sequence length encoded in each offsets tensor.
+
+        Offsets tensors encode the length of each sequence in the corresponding
+        jagged tensor. Here we extract the maximum sequence length in each offsets
+        tensor in the inputs and associate it with the total_length (== offsets[-1])
+        of the corresponding jagged tensor in the inputs.
+        """
+        offsets_inputs = [
+            inp
+            for inp in inputs
+            # offsets tensors are rakn-1 and have a specific first dimension
+            if len(inp.shape) == 1 and inp.shape[0] in jagged_offsets_batch_dims
+        ]
+
+        max_seq_lens = {}
+        for offsets in offsets_inputs:
+            offsets = offsets.cpu()
+            # the last value in the offsets tensor is the total_length
+            # dimension of the corresponding jagged tensor
+            total_length = offsets[-1].item()
+            # max. sequence length == max. consecutive offset difference
+            max_seq_len = torch.max(offsets[1:] - offsets[:-1]).item()
+            if total_length in max_seq_lens:
+                # if multiple jagged tensors have the same total length,
+                # set the max_seq_len to the maximum of all sequences
+                # in all corresponding offsets tensors
+                max_seq_lens[total_length] = max(
+                    max_seq_lens[total_length], max_seq_len
+                )
+            else:
+                max_seq_lens[total_length] = max_seq_len
+
+            logger.info(
+                f"Maximum sequence length {max_seq_lens[total_length]} "
+                f"for the input jagged tensor with {total_length=} "
+                "inferred from the offsets tensor."
+            )
+
+        return max_seq_lens
+
     @classmethod
     def from_input_list_with_batch_size_jagged_tensor(
         cls,
@@ -264,6 +310,7 @@ def from_input_list_with_batch_size_jagged_tensor(
         jagged_tensor_batch_dims: Set[int],
         jagged_offsets_batch_dims: Set[int],
         additional_inputs: List[torch.Tensor] = None,
+        infer_max_seq_lens_from_offsets: bool = False,
     ) -> List["TensorSpec"]:
         """
         Most of the recommendation models will work fine using this function.
@@ -271,6 +318,13 @@ def from_input_list_with_batch_size_jagged_tensor(
         We make an assumption that inferred lowerable subgraph inputs will have
         a single batch dimension with the same max batch size.
         """
+        max_seq_lens_from_offsets = {}
+        if infer_max_seq_lens_from_offsets:
+            max_seq_lens_from_offsets = cls._get_max_seq_lens_from_offsets(
+                inputs=inputs,
+                jagged_offsets_batch_dims=jagged_offsets_batch_dims,
+            )
+
         result: List = []
         result_unsorted: List = []
         left_inputs: List = []
@@ -283,7 +337,13 @@ def from_input_list_with_batch_size_jagged_tensor(
             batch_dim_name: str = ""
             if batch_dim in jagged_tensor_batch_dims:
                 batch_dim_lower_bound = 0  # when all sequences are empty
-                batch_dim_upper_bound = max_batch_size * max_sequence_length
+                # if the maximum sequence length for this jagged tensor was not
+                # inferred from the offsets, we use the globally configured
+                # max_sequence_length (passed as argument to this function)
+                max_seq_len = max_seq_lens_from_offsets.get(
+                    batch_dim, max_sequence_length
+                )
+                batch_dim_upper_bound = max_batch_size * max_seq_len
                 batch_dim_name = f"batch_size_jagged_tensor_{batch_dim}"
             elif batch_dim in jagged_offsets_batch_dims:
                 batch_dim_lower_bound = 2  # prefix 0 + at least one offset

From 6c3a88d9c12d122811c3f6fda244122eed32605d Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 7 May 2023 10:57:07 -0700
Subject: [PATCH 485/638] Increase CircleCI no_output_timeout to 20m (#668)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/668

ATT

Reviewed By: alexanderguzhva

Differential Revision: D45643580

fbshipit-source-id: 97163652f74647d478913ed561abd01a8145e63a
---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2c52bd0ae..0d04d5ad5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -65,6 +65,7 @@ setup_fx2ait_env: &setup_fx2ait_env
 basic_tests: &basic_tests
   - run:
       name: Run tests
+      no_output_timeout: 20m
       command: |
         set -e
         TEST_FILES=$(circleci tests glob "tests/unittest/**/test_*.py" | grep -v benchmark | circleci tests split --split-by=timings)

From 76b6b718a89417910dd66d02957ab6744fa409db Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 7 May 2023 14:12:54 -0700
Subject: [PATCH 486/638] Integrate CUDA 12.1 (#661)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/661

CUTLASS SM90 kernel generation depends not only on the arch, but also on the CUDA version. In this diff, the CUDA version is propagated to the `cuda/utils.py` to be passed to the CUTLASS generator functions.

Reviewed By: chenyang78, alexanderguzhva

Differential Revision: D45602855

fbshipit-source-id: 5f90ea354b94cf22f54d64ceb931c7e9435b7e43
---
 python/aitemplate/backend/cuda/target_def.py | 19 +++++++++++++++++--
 python/aitemplate/backend/cuda/utils.py      |  8 +++++---
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 7ceef2323..ed0760256 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -55,6 +55,7 @@ def __init__(
         template_path=CUTLASS_PATH,
         ait_static_files_path=AIT_STATIC_FILES_PATH,
         arch="80",
+        cuda_version=None,
         **kwargs,
     ):
         """CUDA target init.
@@ -73,6 +74,13 @@ def __init__(
         self._arch = arch
         self._kwargs = kwargs
         self._compile_options = self._build_compile_options()
+        if cuda_version is None:
+            # try to set default CUDA version based on the arch
+            if arch == "80":
+                cuda_version = "11.4.2"
+            elif arch == "90":
+                cuda_version = "12.0.0"
+        self._cuda_version = cuda_version
 
     def _build_compile_options(self):
         flash_attention_path = ""
@@ -150,7 +158,7 @@ def __enter__(self):
         super().__enter__()
         self._gen_cutlass_lib_pkg()
         f_gen_ops = registry.get("cuda.gen_cutlass_ops")
-        self._operators = f_gen_ops(self._arch)
+        self._operators = f_gen_ops(self._arch, self._cuda_version)
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
@@ -232,9 +240,16 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             with open(convert_nvcc_json, "r") as nvcc_option_json:
                 FBCUDA.nvcc_option_json = json.load(nvcc_option_json)
         self.nvcc_options_json = FBCUDA.nvcc_option_json
+        cuda_version = self.nvcc_option_json.get("cuda_version", None)
 
         self.remote_cache_bytes = remote_cache_bytes
-        super().__init__(self.cutlass_path_, static_files_path, arch, **kwargs)
+        super().__init__(
+            template_path=self.cutlass_path_,
+            ait_static_files_path=static_files_path,
+            arch=arch,
+            cuda_version=cuda_version,
+            **kwargs,
+        )
 
     def _build_compile_options(self):
         if not FBCUDA.compile_options_:
diff --git a/python/aitemplate/backend/cuda/utils.py b/python/aitemplate/backend/cuda/utils.py
index 98dc8f9fd..e87b2a4bb 100644
--- a/python/aitemplate/backend/cuda/utils.py
+++ b/python/aitemplate/backend/cuda/utils.py
@@ -51,17 +51,19 @@ def __init__(self, arch):
 
 
 @registry.reg("cuda.gen_cutlass_ops")
-def gen_ops(arch):
+def gen_ops(arch, cuda_version):
     import cutlass_lib
 
     args = Args(arch)
+    if cuda_version is not None:
+        args.cuda_version = cuda_version
     manifest = cutlass_lib.manifest.Manifest(args)
 
     if arch == "90":
         if force_cutlass_sm90_kernels():
-            cutlass_lib.generator.GenerateSM90(manifest, cuda_version="12.0.0")
+            cutlass_lib.generator.GenerateSM90(manifest, args.cuda_version)
         elif allow_cutlass_sm90_kernels():
-            cutlass_lib.generator.GenerateSM90(manifest, cuda_version="12.0.0")
+            cutlass_lib.generator.GenerateSM90(manifest, args.cuda_version)
             cutlass_lib.generator.GenerateSM80(manifest, args.cuda_version)
             cutlass_lib.extra_operation.GenerateSM80(manifest, args)
         else:

From aeaddeb32d7d124ce62ebf8eb74ec8328cde6023 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 7 May 2023 14:45:59 -0700
Subject: [PATCH 487/638] Sync CUTLASS with upstream (#662)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/662

A few issues in the CUTLASS codebase blocking the integration of the CUTLASS 3.x SM90 kernels in AITemplate have been fixed upstream (see, e.g., the merged PRs [#920](https://github.com/NVIDIA/cutlass/pull/920) and [#927](https://github.com/NVIDIA/cutlass/pull/927)). The CUTLASS version is synced with the upstream to proceed with the SM90 integration.

Reviewed By: chenyang78

Differential Revision: D45603657

fbshipit-source-id: 6b64f6ee0b9f87c2f379144d0fa568487aef8076
---
 3rdparty/cutlass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 77f07619c..5328b493f 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 77f07619c0b4899aa1ce076300258eb3a27ffad6
+Subproject commit 5328b493f7ed2c20109f8d53bcc2ed12f9ac457e

From d7c2877e635d9d5924f55779beb666ff5ce3d365 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 7 May 2023 15:15:01 -0700
Subject: [PATCH 488/638] Add SM90 CUTLASS 3.x kernels to gemm_rcr_bias_relu
 (#663)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/663

ATT.

Also, the foundations are added for extending the remaining `gemm_rcr_bias_<epilogue>` ops (except the `gemm_rcr_bias_broadcst` family) with the SM90 kernels.

Reviewed By: chenyang78

Differential Revision: D45608620

fbshipit-source-id: ddda0dd9b49d4fb09a04f542f9b98f2df5012f4e
---
 .../backend/cuda/gemm_universal/common.py     |  32 +++++
 .../gemm_universal/common_bias_activation.py  |  54 +++++---
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py |  50 ++++++--
 .../utils/mk_cutlass_lib/extra_enum.py        |  38 ++++++
 tests/unittest/ops/test_gemm_bias_relu.py     | 115 ++++++++++++------
 5 files changed, 226 insertions(+), 63 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 5e7d52972..47ddfb2c5 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -1310,23 +1310,55 @@ def default_fproc(
         and not filter_extra_tile_configs
     ):
         op = copy.deepcopy(op)
+
         # set output major
         op.C.layout = c_layout
         op.D.layout = c_layout
+
         # set epilogue
         op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name]
         op.element_epilogue = acc_type
+        if (
+            op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x
+            and op.epilogue_functor
+            != cutlass_lib.library.EpilogueFunctor.LinearCombination
+        ):
+            # need to substitute the epilogue schedule with
+            # the one parameterized by the epilogue functor
+            if op.epilogue_schedule in (
+                cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecialized,
+                cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedCooperative,
+            ):
+                op.epilogue_schedule = cutlass_lib.library.EpilogueScheduleMapping[
+                    op.epilogue_schedule
+                ][op.epilogue_functor]
+            else:
+                # epilogue functor parameterization unavailable
+                # for the rest of epilogue schedule types
+                return ret
+
+        # set permute layout
         if permute_layout is not None:
             op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[
                 permute_layout
             ]
+
+        has_tma_epilogue = False
+        if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
+            epilogue_schedule_str = str(op.epilogue_schedule).split(".")[-1]
+            has_tma_epilogue = epilogue_schedule_str.lower().startswith("tma")
+
         # set C and D alignment
         alignments = alignment.get_alignments(dtype)
         for i in alignments:
+            if has_tma_epilogue and i != max(alignments):
+                # TMA epilogues only support max. output alignment
+                continue
             op = copy.deepcopy(op)
             op.C.alignment = i
             op.D.alignment = i
             ret.append(op)
+
     return ret
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
index 06ed9ef3c..3bcd567cf 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
@@ -24,8 +24,16 @@
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
-def gemm_rcr_config(func_attrs, dtype="float16"):
-    common.make_fproc(func_attrs, RCR)
+def gemm_rcr_config(
+    func_attrs,
+    dtype="float16",
+    include_cutlass_3x_ops=False,
+):
+    common.make_fproc(
+        func_attrs=func_attrs,
+        layout=RCR,
+        include_cutlass_3x_ops=include_cutlass_3x_ops,
+    )
 
 
 def gen_profiler(
@@ -34,15 +42,17 @@ def gen_profiler(
     profiler_filename,
     dim_info_dict,
     problem_args_template,
+    problem_args_template_cutlass_3x=None,
     extra_code="",
 ):
     return gemm_rcr.common_gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common_bias.SRC_TEMPLATE,
-        problem_args_template,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args_template=problem_args_template,
+        problem_args_template_cutlass_3x=problem_args_template_cutlass_3x,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
         extra_code=extra_code,
     )
@@ -53,6 +63,7 @@ def gen_function(
     problem_args_template,
     exec_cond_template,
     dim_info_dict,
+    problem_args_template_cutlass_3x=None,
     extra_code="",
 ):
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
@@ -69,15 +80,22 @@ def gen_function(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
     )
+    problem_args_cutlass_3x = ""
+    if problem_args_template_cutlass_3x is not None:
+        problem_args_cutlass_3x = problem_args_template_cutlass_3x.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+        )
     return common.gen_function(
-        func_attrs,
-        common_bias.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         support_split_k=True,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N",
@@ -102,5 +120,7 @@ def gen_function_decl(func_attrs):
 def gen_function_call(func_attrs, indent="  "):
     bias = func_attrs["inputs"][2]
     return common.gen_function_call(
-        func_attrs, indent, bias_ptr_arg=bias._attrs["name"]
+        func_attrs=func_attrs,
+        indent=indent,
+        bias_ptr_arg=bias._attrs["name"],
     )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index 3907d35dc..496447cbf 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -51,19 +51,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),              // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),              // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_relu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_relu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -74,10 +103,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
     )
 
 
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index c07e70959..d58c618d3 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -146,6 +146,44 @@ class EpiloguePermuteLayout(enum.Enum):
   # "Permute3DBMM_021": EpiloguePermuteLayout.Permute3DBMM_021,
 }
 
+class EpilogueScheduleType(enum.Enum):
+  ScheduleAuto = enum_auto()
+  EpilogueTransposed = enum_auto()
+  NoSmemWarpSpecialized = enum_auto()
+  TmaWarpSpecialized = enum_auto()
+  TmaWarpSpecializedCooperative = enum_auto()
+  TmaWarpSpecializedElementwiseRelu = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseRelu = enum_auto()
+
+EpilogueScheduleTag = {
+  EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
+  EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed',
+  EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized',
+  EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::ReLu>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::ReLu>',
+}
+
+EpilogueScheduleSuffixes = {
+  EpilogueScheduleType.ScheduleAuto: '',
+  EpilogueScheduleType.EpilogueTransposed: '',
+  EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem',
+  EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu: '_epi_tma_relu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu: '_epi_tma_relu',
+}
+
+EpilogueScheduleMapping = {
+  EpilogueScheduleType.TmaWarpSpecialized: {
+    EpilogueFunctor.LinearCombinationRelu: EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu,
+  },
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: {
+    EpilogueFunctor.LinearCombinationRelu: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu,
+  },
+}
+
 """
 )
 
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index a44937da5..ab1fc3fbe 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -19,12 +19,11 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
-from parameterized import parameterized
 
 
 _TOLERANCE_LIMITS = {
@@ -39,10 +38,14 @@ def __init__(self, *args, **kwargs):
         super(GEMMBiasReluTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
-        M = 128
-        K = 1024
-        N = 64
+    def _test_gemm_rcr_bias_relu(
+        self,
+        M=128,
+        K=1024,
+        N=64,
+        dtype="float16",
+        test_suffix=None,
+    ):
         tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
@@ -51,9 +54,11 @@ def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"gemm_rcr_bias_relu_{dtype}_{self._test_id}"
+        if test_suffix is None:
+            test_suffix = dtype
+        test_name = f"gemm_rcr_bias_relu_{test_suffix}_{self._test_id}"
         self._test_id += 1
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
         X_pt = get_random_torch_tensor([M, K], dtype)
         W_pt = get_random_torch_tensor([N, K], dtype)
         B_pt = get_random_torch_tensor([N], dtype)
@@ -65,20 +70,56 @@ def _test_gemm_rcr_bias_relu(self, dtype="float16", target=None):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-                TestEnv.ROCM: [("float16")],
-            }
-        )
-    )
-    def test_gemm_rcr_bias_relu(self, ait_dtype):
-        target = detect_target()
-        self._test_gemm_rcr_bias_relu(ait_dtype, target)
-
-    def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
+    def test_gemm_rcr_bias_relu_fp16(self):
+        self._test_gemm_rcr_bias_relu(dtype="float16")
+
+    def test_gemm_rcr_bias_relu_fp16_rocm(self):
+        self._test_gemm_rcr_bias_relu(dtype="float16")
+
+    def test_gemm_rcr_bias_relu_fp32_sm80(self):
+        self._test_gemm_rcr_bias_relu(dtype="float32")
+
+    def test_gemm_rcr_bias_relu_bf16(self):
+        self._test_gemm_rcr_bias_relu(dtype="bfloat16")
+
+    def test_gemm_rcr_bias_relu_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_relu(
+                    K=1020,
+                    dtype="float16",
+                    test_suffix="wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_relu(
+                    N=63,
+                    dtype="float16",
+                    test_suffix="wrong_output_alignment_sm90",
+                )
+
+            self._test_gemm_rcr_bias_relu(
+                dtype="float16",
+                test_suffix="float16_force_sm90",
+            )
+            self._test_gemm_rcr_bias_relu(
+                dtype="bfloat16",
+                test_suffix="bfloat16_force_sm90",
+            )
+
+    def _test_gemm_rcr_bias_add_relu(self, dtype="float16"):
         M = 128
         K = 1024
         N = 64
@@ -93,7 +134,7 @@ def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         Y._attrs["is_output"] = True
         test_name = f"gemm_rcr_bias_add_relu_{dtype}_{self._test_id}"
         self._test_id += 1
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
         X_pt = get_random_torch_tensor([M, K], dtype)
         W_pt = get_random_torch_tensor([N, K], dtype)
         B_pt = get_random_torch_tensor([N], dtype)
@@ -106,18 +147,20 @@ def _test_gemm_rcr_bias_add_relu(self, dtype="float16", target=None):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, **tolerance_limits)
 
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-                TestEnv.ROCM: [("float16")],
-            }
-        )
-    )
-    def test_gemm_rcr_bias_add_relu(self, ait_dtype):
-        target = detect_target()
-        self._test_gemm_rcr_bias_add_relu(ait_dtype, target)
+    def test_gemm_rcr_bias_add_relu_fp16(self):
+        self._test_gemm_rcr_bias_add_relu(dtype="float16")
+
+    def test_gemm_rcr_bias_add_relu_fp16_rocm(self):
+        self._test_gemm_rcr_bias_add_relu(dtype="float16")
+
+    def test_gemm_rcr_bias_add_relu_fp32_sm80(self):
+        self._test_gemm_rcr_bias_add_relu(dtype="float32")
+
+    def test_gemm_rcr_bias_add_relu_bf16(self):
+        self._test_gemm_rcr_bias_add_relu(dtype="bfloat16")
+
+
+filter_test_cases_by_test_env(GEMMBiasReluTestCase)
 
 
 if __name__ == "__main__":

From cf3639b2899876266fb5dcd6c3084b81f4117adf Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Mon, 8 May 2023 18:03:03 -0700
Subject: [PATCH 489/638] Fix test logging errors (#664)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/664

1. Use logger f-strings so we stop running into `TypeError: not all arguments converted during string formatting` and makes test output easier to read

Reviewed By: jingsh, wushirong

Differential Revision: D45616742

fbshipit-source-id: 4a9e3f16263f3a7d2ac15c200087debd298cd48b
---
 fx2ait/fx2ait/tools/common_fx2ait.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 81d4d0bc3..d8e15b939 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -125,7 +125,7 @@ def run_test(
         for p in passes:
             mod = p(mod, inputs)
 
-        logger.info(mod.graph)
+        logger.info(f"{mod.graph=}")
 
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
@@ -163,7 +163,7 @@ def run_test(
             start = time.perf_counter()
             interp_result = interp.run()
             sec = time.perf_counter() - start
-            logger.info("Interpreter run time(s):", sec)
+            logger.info(f"Interpreter run time(s):{sec}")
             if OSS_AITModel:
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
@@ -197,7 +197,7 @@ def run_test(
             end_event.record()
             torch.cuda.synchronize()
             logger.info(
-                "AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3)
+                f"AIT run time(s)={start_event.elapsed_time(end_event) * 1.0e-3}"
             )
             # PyTorch Transformer model would yield 2 output tensors, of which the second one is
             # not useful. AIT model only output 1 output tensor, alter ref_output to match this.
@@ -264,7 +264,7 @@ def run_test_with_dynamic_shape(
         )
         for p in passes:
             mod = p(mod, inputs_min)
-        logger.info(mod.graph)
+        logger.info(f"{mod.graph=}")
 
         original_inputs = inputs_min
         # Trace and test with inputs_min
@@ -287,7 +287,7 @@ def run_test_with_dynamic_shape(
             start = time.perf_counter()
             interp_result = interp.run()
             sec = time.perf_counter() - start
-            logger.info("Interpreter run time(s):", sec)
+            logger.info(f"Interpreter run time(s):{sec}")
             if OSS_AITModel:
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
@@ -323,7 +323,7 @@ def run_test_with_dynamic_shape(
             end_event.record()
             torch.cuda.synchronize()
             logger.info(
-                "AIT run time(s)=", (start_event.elapsed_time(end_event) * 1.0e-3)
+                f"AIT run time(s)={start_event.elapsed_time(end_event) * 1.0e-3}"
             )
 
             if isinstance(outputs, torch.Tensor):

From d7af2cf4e34ffbd8ee4aad861ca948af56d1e3b0 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 9 May 2023 11:15:01 +0800
Subject: [PATCH 490/638] fix bugs

---
 examples/01_resnet-50/test_correctness.py      |  4 ++--
 .../05_stable_diffusion/src/modeling/clip.py   |  2 +-
 fx2ait/fx2ait/lower/lower.py                   |  2 +-
 .../aitemplate/backend/rocm/conv2d/common.py   | 18 ++++++++++++------
 .../backend/rocm/embedding/bert_embeddings.py  |  4 ++--
 5 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/examples/01_resnet-50/test_correctness.py b/examples/01_resnet-50/test_correctness.py
index 8c46ec769..4dcd8edea 100644
--- a/examples/01_resnet-50/test_correctness.py
+++ b/examples/01_resnet-50/test_correctness.py
@@ -21,8 +21,8 @@
 from aitemplate.compiler.base import Tensor
 from aitemplate.testing import detect_target
 
-from .modeling.resnet import build_resnet_backbone
-from .weight_utils import timm_export
+from modeling.resnet import build_resnet_backbone
+from weight_utils import timm_export
 
 
 def mark_output(y):
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 30afcd051..3f1df6ba5 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -239,8 +239,8 @@ def forward(self, x, context=None):
         x_in = x
         x = self.norm(x)
         if self.use_linear_projection:
-            x = ops.reshape()(x, [b, -1, c])
             x = self.proj_in(x)
+            x = ops.reshape()(x, [b, -1, c])
         else:
             x = self.proj_in(x)
             x = ops.reshape()(x, [b, -1, c])
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
index c006c9806..55746d49a 100644
--- a/fx2ait/fx2ait/lower/lower.py
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -187,7 +187,7 @@ def create(
 
         return cls(
             lower_settings=lower_settings,
-            lower_pass=default_lower_pass(create_ait_lower_interpreter),
+            lower_pass=default_lower_pass(interpreter_builder),
         )
 
     def lower_func(
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index a2e979c94..0d30e05c8 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -600,9 +600,12 @@ def gen_profiler(
         w_dim0="out_ch",
         w_dim1="kernel_h",
         w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="stride",
+        dilateh="dilation",
+        padh="pad",
+        stridew="stride",
+        dilatew="dilation",
+        padw="pad",
     )
     file_pairs = []
     for op_name, op in op_instance.items():
@@ -742,9 +745,12 @@ def gen_function(
         w_dim0="*out_ch",
         w_dim1="*kernel_h",
         w_dim2="*kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="stride",
+        dilateh="dilation",
+        padh="pad",
+        stridew="stride",
+        dilatew="dilation",
+        padw="pad",
         div="/",
     )
     shape_save_func = shape_save_template.render(
diff --git a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
index de169c70c..736845d30 100644
--- a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
@@ -21,8 +21,8 @@
 
 import jinja2
 
-from ... import registry
-from ...backend_spec import ROCMSpec
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
 
 # pylint: disable=C0301
 

From 2f782fb4dd48b8bdfeb59a3476cc7169802fa860 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 9 May 2023 11:18:55 +0800
Subject: [PATCH 491/638] revert some changes

---
 fx2ait/fx2ait/csrc/AITModelImpl.cpp | 92 +++++++++++++----------------
 fx2ait/fx2ait/csrc/AITModelImpl.h   |  1 -
 2 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.cpp b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
index 2005d33ac..b6e50b552 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.cpp
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.cpp
@@ -160,33 +160,27 @@ AITModelImpl::AITModelImpl(
   // It's not clear what stream we want to use yet. Create a new one.
   // We could alternatively use the default stream, but that could cause extra
   // synchronization.
-  StreamType creation_stream;
-  StreamCreate(&creation_stream, true);
+#ifdef __HIP_PLATFORM_HCC__
+  hipStream_t creation_stream;
+  TORCH_CHECK(
+      hipStreamCreateWithFlags(&creation_stream, hipStreamNonBlocking) ==
+      hipSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<hipStream_t>,
+      decltype(&hipStreamDestroy)>;
+  StreamGuard creation_stream_guard{creation_stream, hipStreamDestroy};
+#else
+  cudaStream_t creation_stream;
+  TORCH_CHECK(
+      cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
+      cudaSuccess);
+
   using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<StreamType>,
-      decltype(&StreamDestroy)>;
-  StreamGuard creation_stream_guard{creation_stream, StreamDestroy};
-// #ifdef __HIP_PLATFORM_HCC__
-//   hipStream_t creation_stream;
-//   TORCH_CHECK(
-//       hipStreamCreateWithFlags(&creation_stream, hipStreamNonBlocking) ==
-//       hipSuccess);
-
-//   using StreamGuard = std::unique_ptr<
-//       std::remove_pointer_t<hipStream_t>,
-//       decltype(&hipStreamDestroy)>;
-//   StreamGuard creation_stream_guard{creation_stream, hipStreamDestroy};
-// #else
-//   cudaStream_t creation_stream;
-//   TORCH_CHECK(
-//       cudaStreamCreateWithFlags(&creation_stream, cudaStreamNonBlocking) ==
-//       cudaSuccess);
-
-//   using StreamGuard = std::unique_ptr<
-//       std::remove_pointer_t<cudaStream_t>,
-//       decltype(&cudaStreamDestroy)>;
-//   StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
-// #endif
+      std::remove_pointer_t<cudaStream_t>,
+      decltype(&cudaStreamDestroy)>;
+  StreamGuard creation_stream_guard{creation_stream, cudaStreamDestroy};
+#endif
 
 #define LOAD_SYMBOL(var, name_str)                                       \
   var = reinterpret_cast<decltype(var)>(dlsym(handle_.get(), name_str)); \
@@ -650,33 +644,27 @@ void AITModelImpl::updateConstantsWithWeights(
     constants.emplace_back(torchToAitData(it->second));
   }
 
-  StreamType constants_stream;
-  StreamCreate(&constants_stream, true);
+#ifdef __HIP_PLATFORM_HCC__
+  hipStream_t constants_stream;
+  TORCH_CHECK(
+      hipStreamCreateWithFlags(&constants_stream, hipStreamNonBlocking) ==
+      hipSuccess);
+
+  using StreamGuard = std::unique_ptr<
+      std::remove_pointer_t<hipStream_t>,
+      decltype(&hipStreamDestroy)>;
+  StreamGuard constants_stream_guard{constants_stream, hipStreamDestroy};
+#else
+  cudaStream_t constants_stream;
+  TORCH_CHECK(
+      cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
+      cudaSuccess);
+
   using StreamGuard = std::unique_ptr<
-      std::remove_pointer_t<StreamType>,
-      decltype(&StreamDestroy)>;
-  StreamGuard constants_stream_guard{constants_stream, StreamDestroy};
-// #ifdef __HIP_PLATFORM_HCC__
-//   hipStream_t constants_stream;
-//   TORCH_CHECK(
-//       hipStreamCreateWithFlags(&constants_stream, hipStreamNonBlocking) ==
-//       hipSuccess);
-
-//   using StreamGuard = std::unique_ptr<
-//       std::remove_pointer_t<hipStream_t>,
-//       decltype(&hipStreamDestroy)>;
-//   StreamGuard constants_stream_guard{constants_stream, hipStreamDestroy};
-// #else
-//   cudaStream_t constants_stream;
-//   TORCH_CHECK(
-//       cudaStreamCreateWithFlags(&constants_stream, cudaStreamNonBlocking) ==
-//       cudaSuccess);
-
-//   using StreamGuard = std::unique_ptr<
-//       std::remove_pointer_t<cudaStream_t>,
-//       decltype(&cudaStreamDestroy)>;
-//   StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
-// #endif
+      std::remove_pointer_t<cudaStream_t>,
+      decltype(&cudaStreamDestroy)>;
+  StreamGuard constants_stream_guard{constants_stream, cudaStreamDestroy};
+#endif
   AIT_CHECK(setManyConstantsDoubleBufferFunc_(
       model_handle_,
       /*stream=*/reinterpret_cast<AITemplateStreamOpaque*>(constants_stream),
diff --git a/fx2ait/fx2ait/csrc/AITModelImpl.h b/fx2ait/fx2ait/csrc/AITModelImpl.h
index 250498ee0..56924a420 100644
--- a/fx2ait/fx2ait/csrc/AITModelImpl.h
+++ b/fx2ait/fx2ait/csrc/AITModelImpl.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "model_interface.h" // @manual=//aitemplate/AITemplate/static/include:aitemplate
-#include "utility.h"
 
 #include <dlfcn.h>
 #include <torch/torch.h> // @manual=//caffe2:torch-cpp

From 45b579e2aeed2db76134aa18c7284358fa807675 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 9 May 2023 13:18:37 +0800
Subject: [PATCH 492/638] format fx2ait code

---
 fx2ait/fx2ait/ait_splitter.py                     |  5 +++--
 fx2ait/fx2ait/converters/ait_converters.py        | 14 ++++++++++----
 fx2ait/fx2ait/converters/ait_module_converters.py |  4 ++--
 fx2ait/fx2ait/converters/aten2ait_converters.py   |  3 ++-
 fx2ait/fx2ait/fx2ait.py                           |  4 ++--
 fx2ait/fx2ait/test/test_ait_lower.py              |  1 +
 fx2ait/fx2ait/test/test_ait_splitter.py           |  5 +++--
 fx2ait/fx2ait/test/test_fx2ait.py                 |  1 +
 fx2ait/fx2ait/test/test_tensor_spec.py            |  3 ++-
 9 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index 886da0beb..5c3f87528 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -17,13 +17,14 @@
 import torch
 import torch.fx.passes.operator_support as ops
 import torch.fx.passes.splitter_base as splitter_base
+from torch.fx.passes.operator_support import create_op_support, OperatorSupportBase
+from torch.fx.passes.tools_common import get_acc_ops_name
+
 from fx2ait.acc_tracer import acc_ops
 from fx2ait.ait_module import AITModule
 
 from fx2ait.converters.converter_registry import AIT_CONVERTERS
 from fx2ait.fx2ait import AITInterpreter
-from torch.fx.passes.operator_support import create_op_support, OperatorSupportBase
-from torch.fx.passes.tools_common import get_acc_ops_name
 
 try:
     torch.ops.load_library("//deeplearning/ait:AITModel")
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index e13d9f7fe..b04f862e7 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -66,9 +66,9 @@
 )
 
 from aitemplate.testing import detect_target
+from torch.fx.node import Argument, Target
 
 from fx2ait.acc_tracer import acc_ops, ait_acc_ops
-from torch.fx.node import Argument, Target
 
 from .converter_registry import ait_converter
 
@@ -248,10 +248,12 @@ def acc_ops_linear(
     input_val = kwargs["input"]
     if USE_ROCM:
         shape = input_val._attrs["shape"]
-        input_val = input_val if len(shape) == 2 else reshape()(input_val, [-1, shape[-1]])
+        input_val = (
+            input_val if len(shape) == 2 else reshape()(input_val, [-1, shape[-1]])
+        )
     weight = kwargs["weight"]
     assert isinstance(weight, AITTensor)
-    
+
     result = gemm_rcr()(input_val, weight)
 
     bias = kwargs["bias"]
@@ -259,7 +261,11 @@ def acc_ops_linear(
         assert isinstance(bias, AITTensor)
         result = elementwise(FuncEnum.ADD)(result, bias)
     if USE_ROCM:
-        result = result if len(shape) == 2 else reshape()(result, [shape[0], -1, result._attrs["shape"][-1]])
+        result = (
+            result
+            if len(shape) == 2
+            else reshape()(result, [shape[0], -1, result._attrs["shape"][-1]])
+        )
     return result
 
 
diff --git a/fx2ait/fx2ait/converters/ait_module_converters.py b/fx2ait/fx2ait/converters/ait_module_converters.py
index 0f0330095..f2d508fd0 100644
--- a/fx2ait/fx2ait/converters/ait_module_converters.py
+++ b/fx2ait/fx2ait/converters/ait_module_converters.py
@@ -20,8 +20,8 @@
 import torch
 from aitemplate.backend.target import Target
 from aitemplate.compiler.base import _TorchConstantTensorData
-from aitemplate.testing import detect_target
 from aitemplate.frontend import nn
+from aitemplate.testing import detect_target
 from torch.fx.node import Argument
 
 from .ait_converters import ConverterOutput
@@ -70,7 +70,7 @@ def multi_head_attention_module(
             num_heads=submod.num_heads,
             qkv_bias=True,
             has_residual=False,
-            use_mem_eff=True
+            use_mem_eff=True,
         )
 
     # Bind constant tensor for MHA module
diff --git a/fx2ait/fx2ait/converters/aten2ait_converters.py b/fx2ait/fx2ait/converters/aten2ait_converters.py
index 4c4935d02..49322ce7f 100644
--- a/fx2ait/fx2ait/converters/aten2ait_converters.py
+++ b/fx2ait/fx2ait/converters/aten2ait_converters.py
@@ -55,6 +55,8 @@
     transposed_conv2d_bias,
     unsqueeze,
 )
+from torch.fx.node import Argument, Target
+
 from fx2ait.converters.utils import (
     create_binary_op,
     get_positive_dim,
@@ -69,7 +71,6 @@
     aten_compose_mm_2d,
     aten_operator_getitem,
 )
-from torch.fx.node import Argument, Target
 
 from .converter_registry import ait_converter
 
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index e70a6c3f8..5db958b19 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -20,12 +20,12 @@
 from datetime import datetime
 from typing import Any, Dict, List, NamedTuple, Optional, Sequence
 
-import fx2ait.cache as cache
-
 import torch
 
 # @manual=//aitemplate/AITemplate/python/aitemplate:aitemplate
 from aitemplate.testing import detect_target
+
+import fx2ait.cache as cache
 from .converters.ait_converters import *  # isort:skip # noqa: F401 F403
 from .converters.aten2ait_converters import *  # isort:skip # noqa: F401 F403
 from aitemplate.compiler import compile_model
diff --git a/fx2ait/fx2ait/test/test_ait_lower.py b/fx2ait/fx2ait/test/test_ait_lower.py
index eeb9a3610..214b71b04 100644
--- a/fx2ait/fx2ait/test/test_ait_lower.py
+++ b/fx2ait/fx2ait/test/test_ait_lower.py
@@ -15,6 +15,7 @@
 import unittest
 
 import torch
+
 from fx2ait.lower.lower import AitLowerer
 from fx2ait.lower.lower_settings import LowerSettings
 
diff --git a/fx2ait/fx2ait/test/test_ait_splitter.py b/fx2ait/fx2ait/test/test_ait_splitter.py
index 5b010c3a3..407f6ac86 100644
--- a/fx2ait/fx2ait/test/test_ait_splitter.py
+++ b/fx2ait/fx2ait/test/test_ait_splitter.py
@@ -13,6 +13,9 @@
 #  limitations under the License.
 #
 import torch
+from torch.fx.passes import operator_support as op_support
+from torch.fx.passes.operator_support import OperatorSupportBase
+
 from fx2ait.acc_tracer import acc_ops, acc_tracer
 from fx2ait.ait_splitter import (  # @manual=//aitemplate/AITemplate/fx2ait/fx2ait:fx2ait
     AITSplitter,
@@ -20,8 +23,6 @@
     create_ait_operator_support,
 )
 from fx2ait.tools.common_fx2ait import AITTestCase
-from torch.fx.passes import operator_support as op_support
-from torch.fx.passes.operator_support import OperatorSupportBase
 
 
 class TestSplit(AITTestCase):
diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
index 0154faa8f..8b281223e 100644
--- a/fx2ait/fx2ait/test/test_fx2ait.py
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -18,6 +18,7 @@
 import unittest
 
 import torch
+
 from fx2ait.acc_tracer import acc_tracer
 from fx2ait.ait_module import AITModule
 from fx2ait.fx2ait import AITInterpreter
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index c2f32c909..36426af8e 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -16,9 +16,10 @@
 
 import torch
 from aitemplate.compiler.public import IntImm, IntVar
-from fx2ait.tensor_spec import TensorSpec
 from parameterized import parameterized
 
+from fx2ait.tensor_spec import TensorSpec
+
 
 class TestTensorSpec(unittest.TestCase):
     def test_two_input_lists(self):

From c3122720a31836155794be5c9f1014ac2a852623 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 9 May 2023 01:27:34 -0700
Subject: [PATCH 493/638] Add SM90 CUTLASS 3.x kernels to remaining
 gemm_rcr_bias_activation ops (#669)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/669

ATT. SM90 kernels are added to the following ops:

- `gemm_rcr_bias_sigmoid`
- `gemm_rcr_bias_swish`
- `gemm_rcr_bias_tanh`
- `gemm_rcr_bias_hardswish`
- `gemm_rcr_bias_gelu`
- `gemm_rcr_bias_fast_gelu`
- `gemm_rcr_fast_gelu`

Reviewed By: ipiszy

Differential Revision: D45644967

fbshipit-source-id: d01dcce3535d3cacb9d40933cba958ac444c862b
---
 .../cuda/gemm_universal/common_no_bias.py     |   8 +
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py |  50 ++++-
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py |  52 +++--
 .../gemm_universal/gemm_rcr_bias_hardswish.py |  50 ++++-
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py |   8 +-
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   |  50 ++++-
 .../gemm_universal/gemm_rcr_bias_swish.py     |  50 ++++-
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py |  50 ++++-
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py |  62 ++++--
 .../utils/mk_cutlass_lib/extra_enum.py        |  48 +++++
 .../unittest/ops/test_gemm_bias_hardswish.py  |  86 +++++++--
 tests/unittest/ops/test_gemm_bias_relu.py     |   4 +-
 tests/unittest/ops/test_gemm_bias_sigmoid.py  |  87 +++++++--
 tests/unittest/ops/test_gemm_bias_swish.py    |  79 ++++++--
 tests/unittest/ops/test_gemm_bias_tanh.py     | 115 ++++++++---
 .../ops/test_gemm_rcr_bias_fast_gelu.py       | 180 ++++++++++++++----
 tests/unittest/ops/test_gemm_rcr_fast_gelu.py | 144 ++++++++++----
 17 files changed, 894 insertions(+), 229 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
index 8c1e80cc3..86eef6ffc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_no_bias.py
@@ -26,6 +26,7 @@
 #include <random>
 #include <vector>
 #include <iostream>
+
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/util/host_tensor.h"
@@ -34,6 +35,13 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
 {{extra_code}}
 
 #define CUTLASS_CHECK(status)                                                         \\
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index c13197ffa..709aee2e7 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -86,19 +86,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_fast_gelu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_fast_gelu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
         extra_code=EXTRA_CODE.render(),
     )
 
@@ -110,10 +139,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
         extra_code=EXTRA_CODE.render(),
     )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index 24f7691b3..610e50c26 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-GEMM Specialization for C = fast_gelu(GeMM(A, B) + bias)
+GEMM Specialization for C = gelu(GeMM(A, B) + bias)
 where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][K], C[RowMajor][M, N]
 """
 import jinja2
@@ -50,19 +50,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_gelu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_gelu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -73,10 +102,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index f565b897b..76ba16508 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -50,19 +50,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_hardswish.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_hardswish.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -73,10 +102,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index 496447cbf..a65d4f6a5 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -60,10 +60,10 @@
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),              // ElementA const* ptr_A
-    {K, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),              // ElementB const* ptr_B
-    {K, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
         {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
         ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index a74f57aca..dd079c87a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -51,19 +51,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_sigmoid.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_sigmoid.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -74,10 +103,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index 5b39629ea..4012e0d71 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -51,19 +51,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_swish.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_swish.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
     )
 
 
@@ -74,10 +103,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 70b5f495e..6d4f13005 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -86,19 +86,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_bias_tanh.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_bias_tanh.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
         extra_code=EXTRA_CODE.render(),
     )
 
@@ -110,10 +139,11 @@ def gen_function(
     dim_info_dict,
 ):
     return common_bias_activation.gen_function(
-        func_attrs,
-        PROBLEM_ARGS_TEMPLATE,
-        exec_cond_template,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
+        exec_cond_template=exec_cond_template,
+        dim_info_dict=dim_info_dict,
         extra_code=EXTRA_CODE.render(),
     )
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
index da92c6201..6b3719def 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -92,19 +92,48 @@
 )
 
 
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*) a_ptr,                                // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*) b_ptr,                                // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        ({{elem_output_type}}*) (c_ptr) + output_offset,         // ElementC const* ptr_C
+        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*) (c_ptr) + output_offset,         // ElementD const* ptr_D
+        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
+
 @registry.reg("cuda.gemm_rcr_fast_gelu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+    return common_bias_activation.gemm_rcr_config(
+        func_attrs=func_attrs,
+        dtype=dtype,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.gemm_rcr_fast_gelu.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     return common_bias_activation.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        PROBLEM_ARGS_TEMPLATE,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        problem_args_template=PROBLEM_ARGS_TEMPLATE,
+        problem_args_template_cutlass_3x=PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
         extra_code=EXTRA_CODE.render(),
     )
 
@@ -129,15 +158,20 @@ def gen_function(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
     )
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
-        func_attrs,
-        common_no_bias.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=common_no_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         support_split_k=True,
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N",
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index d58c618d3..ef46ad65d 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -154,6 +154,18 @@ class EpilogueScheduleType(enum.Enum):
   TmaWarpSpecializedCooperative = enum_auto()
   TmaWarpSpecializedElementwiseRelu = enum_auto()
   TmaWarpSpecializedCooperativeElementwiseRelu = enum_auto()
+  TmaWarpSpecializedElementwiseSigmoid = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseSigmoid = enum_auto()
+  TmaWarpSpecializedElementwiseSiLu = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseSiLu = enum_auto()
+  TmaWarpSpecializedElementwiseTanh = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseTanh = enum_auto()
+  TmaWarpSpecializedElementwiseHardSwish = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseHardSwish = enum_auto()
+  TmaWarpSpecializedElementwiseGELU = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseGELU = enum_auto()
+  TmaWarpSpecializedElementwiseFastGELU = enum_auto()
+  TmaWarpSpecializedCooperativeElementwiseFastGELU = enum_auto()
 
 EpilogueScheduleTag = {
   EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
@@ -163,6 +175,18 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
   EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::ReLu>',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::ReLu>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSigmoid: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::Sigmoid>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSigmoid: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::Sigmoid>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSiLu: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::SiLu>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSiLu: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::SiLu>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseTanh: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::Tanh>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseTanh: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::Tanh>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseHardSwish: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::HardSwish>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseHardSwish: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::HardSwish>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseGELU: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::GELU>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::GELU>',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::GELU_taylor>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::GELU_taylor>',
 }
 
 EpilogueScheduleSuffixes = {
@@ -173,14 +197,38 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
   EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu: '_epi_tma_relu',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu: '_epi_tma_relu',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSigmoid: '_epi_tma_sigmoid',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSigmoid: '_epi_tma_sigmoid',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSiLu: '_epi_tma_silu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSiLu: '_epi_tma_silu',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseTanh: '_epi_tma_tanh',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseTanh: '_epi_tma_tanh',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseHardSwish: '_epi_tma_hardswish',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseHardSwish: '_epi_tma_hardswish',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseGELU: '_epi_tma_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU: '_epi_tma_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU: '_epi_tma_fast_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: '_epi_tma_fast_gelu',
 }
 
 EpilogueScheduleMapping = {
   EpilogueScheduleType.TmaWarpSpecialized: {
     EpilogueFunctor.LinearCombinationRelu: EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu,
+    EpilogueFunctor.LinearCombinationSigmoid: EpilogueScheduleType.TmaWarpSpecializedElementwiseSigmoid,
+    EpilogueFunctor.LinearCombinationSilu: EpilogueScheduleType.TmaWarpSpecializedElementwiseSiLu,
+    EpilogueFunctor.LinearCombinationTanh: EpilogueScheduleType.TmaWarpSpecializedElementwiseTanh,
+    EpilogueFunctor.LinearCombinationHardSwish: EpilogueScheduleType.TmaWarpSpecializedElementwiseHardSwish,
+    EpilogueFunctor.LinearCombinationGELU: EpilogueScheduleType.TmaWarpSpecializedElementwiseGELU,
+    EpilogueFunctor.LinearCombinationFastGELU: EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU,
   },
   EpilogueScheduleType.TmaWarpSpecializedCooperative: {
     EpilogueFunctor.LinearCombinationRelu: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu,
+    EpilogueFunctor.LinearCombinationSigmoid: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSigmoid,
+    EpilogueFunctor.LinearCombinationSilu: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSiLu,
+    EpilogueFunctor.LinearCombinationTanh: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseTanh,
+    EpilogueFunctor.LinearCombinationHardSwish: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseHardSwish,
+    EpilogueFunctor.LinearCombinationGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU,
+    EpilogueFunctor.LinearCombinationFastGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU,
   },
 }
 
diff --git a/tests/unittest/ops/test_gemm_bias_hardswish.py b/tests/unittest/ops/test_gemm_bias_hardswish.py
index cdae28a08..b6127666e 100644
--- a/tests/unittest/ops/test_gemm_bias_hardswish.py
+++ b/tests/unittest/ops/test_gemm_bias_hardswish.py
@@ -15,16 +15,16 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
-from parameterized import parameterized
 
 
 _TOLERANCE_LIMITS = {
@@ -44,11 +44,14 @@ def __init__(self, *args, **kwargs):
         super(GEMMBiasHardSwishTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    def _test_rcr(self, dtype="float16"):
-        M = 128
-        K = 1024
-        N = 64
-        target = detect_target()
+    def _test_gemm_rcr_bias_hardswish(
+        self,
+        M=128,
+        K=1024,
+        N=64,
+        dtype="float16",
+        test_suffix=None,
+    ):
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -56,8 +59,10 @@ def _test_rcr(self, dtype="float16"):
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"gemm_rcr_bias_hardswish_{dtype}_{self._test_id}"
-        module = compile_model(Y, target, "./tmp", test_name)
+        if test_suffix is None:
+            test_suffix = dtype
+        test_name = f"gemm_rcr_bias_hardswish_{test_suffix}_{self._test_id}"
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
         X_pt = get_random_torch_tensor([M, K], dtype)
         W_pt = get_random_torch_tensor([N, K], dtype)
         B_pt = get_random_torch_tensor([N], dtype)
@@ -67,19 +72,58 @@ def _test_rcr(self, dtype="float16"):
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, **_TOLERANCE_LIMITS[dtype]))
+        torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
+
+    def test_gemm_rcr_bias_hardswish_fp16(self):
+        self._test_gemm_rcr_bias_hardswish(dtype="float16")
+
+    def test_gemm_rcr_bias_hardswish_fp32_sm80(self):
+        self._test_gemm_rcr_bias_hardswish(dtype="float32")
+
+    def test_gemm_rcr_bias_hardswish_bf16(self):
+        self._test_gemm_rcr_bias_hardswish(dtype="bfloat16")
+
+    def test_gemm_rcr_bias_hardswish_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_hardswish(
+                    K=1020,
+                    dtype="float16",
+                    test_suffix="wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_hardswish(
+                    N=63,
+                    dtype="float16",
+                    test_suffix="wrong_output_alignment_sm90",
+                )
+
+            self._test_gemm_rcr_bias_hardswish(
+                dtype="float16",
+                test_suffix="float16_force_sm90",
+            )
+            self._test_gemm_rcr_bias_hardswish(
+                dtype="bfloat16",
+                test_suffix="bfloat16_force_sm90",
+            )
+
 
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-            }
-        )
-    )
-    def test_rcr(self, dtype):
-        self._test_rcr(dtype)
+filter_test_cases_by_test_env(GEMMBiasHardSwishTestCase)
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_relu.py b/tests/unittest/ops/test_gemm_bias_relu.py
index ab1fc3fbe..c0d5efc43 100644
--- a/tests/unittest/ops/test_gemm_bias_relu.py
+++ b/tests/unittest/ops/test_gemm_bias_relu.py
@@ -15,6 +15,7 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
@@ -46,7 +47,6 @@ def _test_gemm_rcr_bias_relu(
         dtype="float16",
         test_suffix=None,
     ):
-        tolerance_limits = _TOLERANCE_LIMITS[dtype]
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -68,7 +68,7 @@ def _test_gemm_rcr_bias_relu(
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        torch.testing.assert_close(Y_pt, y, **tolerance_limits)
+        torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
 
     def test_gemm_rcr_bias_relu_fp16(self):
         self._test_gemm_rcr_bias_relu(dtype="float16")
diff --git a/tests/unittest/ops/test_gemm_bias_sigmoid.py b/tests/unittest/ops/test_gemm_bias_sigmoid.py
index 689c29497..bc8a79f70 100644
--- a/tests/unittest/ops/test_gemm_bias_sigmoid.py
+++ b/tests/unittest/ops/test_gemm_bias_sigmoid.py
@@ -15,16 +15,16 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
-from parameterized import parameterized
 
 
 _TOLERANCE_LIMITS = {
@@ -39,11 +39,14 @@ def __init__(self, *args, **kwargs):
         super(GEMMBiasSigmoidTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    def _test_rcr(self, dtype="float16"):
-        M = 128
-        K = 1024
-        N = 64
-        target = detect_target()
+    def _test_gemm_rcr_bias_sigmoid(
+        self,
+        M=128,
+        K=1024,
+        N=64,
+        dtype="float16",
+        test_suffix=None,
+    ):
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -51,9 +54,11 @@ def _test_rcr(self, dtype="float16"):
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"gemm_rcr_bias_sigmoid_{dtype}_{self._test_id}"
+        if test_suffix is None:
+            test_suffix = dtype
+        test_name = f"gemm_rcr_bias_sigmoid_{test_suffix}_{self._test_id}"
         self._test_id += 1
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
         X_pt = get_random_torch_tensor([M, K], dtype)
         W_pt = get_random_torch_tensor([N, K], dtype)
         B_pt = get_random_torch_tensor([N], dtype)
@@ -65,17 +70,57 @@ def _test_rcr(self, dtype="float16"):
         module.run_with_tensors(inputs, [y])
         torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
 
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-                TestEnv.ROCM: [("float16")],
-            }
-        )
-    )
-    def test_rcr(self, dtype):
-        self._test_rcr(dtype)
+    def test_gemm_rcr_bias_sigmoid_fp16(self):
+        self._test_gemm_rcr_bias_sigmoid(dtype="float16")
+
+    def test_gemm_rcr_bias_sigmoid_fp16_rocm(self):
+        self._test_gemm_rcr_bias_sigmoid(dtype="float16")
+
+    def test_gemm_rcr_bias_sigmoid_fp32_sm80(self):
+        self._test_gemm_rcr_bias_sigmoid(dtype="float32")
+
+    def test_gemm_rcr_bias_sigmoid_bf16(self):
+        self._test_gemm_rcr_bias_sigmoid(dtype="bfloat16")
+
+    def test_gemm_rcr_bias_sigmoid_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_sigmoid(
+                    K=1020,
+                    dtype="float16",
+                    test_suffix="wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_sigmoid(
+                    N=63,
+                    dtype="float16",
+                    test_suffix="wrong_output_alignment_sm90",
+                )
+
+            self._test_gemm_rcr_bias_sigmoid(
+                dtype="float16",
+                test_suffix="float16_force_sm90",
+            )
+            self._test_gemm_rcr_bias_sigmoid(
+                dtype="bfloat16",
+                test_suffix="bfloat16_force_sm90",
+            )
+
+
+filter_test_cases_by_test_env(GEMMBiasSigmoidTestCase)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_bias_swish.py b/tests/unittest/ops/test_gemm_bias_swish.py
index d9c71780a..c51c76d78 100644
--- a/tests/unittest/ops/test_gemm_bias_swish.py
+++ b/tests/unittest/ops/test_gemm_bias_swish.py
@@ -15,10 +15,12 @@
 import unittest
 
 import torch
+
 from aitemplate.compiler import compile_model, ops
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -41,11 +43,14 @@ def __init__(self, *args, **kwargs):
         super(GEMMBiasSwishTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    def _test_rcr(self, dtype="float16"):
-        M = 128
-        K = 1024
-        N = 64
-        target = detect_target()
+    def _test_gemm_rcr_bias_swish(
+        self,
+        M=128,
+        K=1024,
+        N=64,
+        dtype="float16",
+        test_suffix=None,
+    ):
         X = Tensor(shape=[M, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[N, K], dtype=dtype, name="input_1", is_input=True)
         B = Tensor(shape=[N], dtype=dtype, name="input_2", is_input=True)
@@ -53,9 +58,11 @@ def _test_rcr(self, dtype="float16"):
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        test_name = f"gemm_rcr_bias_swish_{dtype}_{self._test_id}"
+        if test_suffix is None:
+            test_suffix = dtype
+        test_name = f"gemm_rcr_bias_swish_{test_suffix}_{self._test_id}"
         self._test_id += 1
-        module = compile_model(Y, target, "./tmp", test_name)
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
         X_pt = get_random_torch_tensor([M, K], dtype)
         W_pt = get_random_torch_tensor([N, K], dtype)
         B_pt = get_random_torch_tensor([N], dtype)
@@ -65,20 +72,58 @@ def _test_rcr(self, dtype="float16"):
         inputs = {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt}
         y = get_torch_empty_tensor([M, N], dtype)
         module.run_with_tensors(inputs, [y])
-        self.assertTrue(torch.allclose(Y_pt, y, **_TOLERANCE_LIMITS[dtype]))
-
-    def test_rcr_float16(self):
-        self._test_rcr(dtype="float16")
-
-    def test_rcr_float32_sm80(self):
-        self._test_rcr(dtype="float32")
-
-    def test_rcr_bfloat16_bf16(self):
-        self._test_rcr(dtype="bfloat16")
+        torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
+
+    def test_gemm_rcr_bias_swish_fp16(self):
+        self._test_gemm_rcr_bias_swish(dtype="float16")
+
+    def test_gemm_rcr_bias_swish_fp32_sm80(self):
+        self._test_gemm_rcr_bias_swish(dtype="float32")
+
+    def test_gemm_rcr_bias_swish_bf16(self):
+        self._test_gemm_rcr_bias_swish(dtype="bfloat16")
+
+    def test_gemm_rcr_bias_swish_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_swish(
+                    K=1020,
+                    dtype="float16",
+                    test_suffix="wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_swish(
+                    N=63,
+                    dtype="float16",
+                    test_suffix="wrong_output_alignment_sm90",
+                )
+
+            self._test_gemm_rcr_bias_swish(
+                dtype="float16",
+                test_suffix="float16_force_sm90",
+            )
+            self._test_gemm_rcr_bias_swish(
+                dtype="bfloat16",
+                test_suffix="bfloat16_force_sm90",
+            )
 
 
 filter_test_cases_by_test_env(GEMMBiasSwishTestCase)
 
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_bias_tanh.py b/tests/unittest/ops/test_gemm_bias_tanh.py
index 8e3e5dd60..9e7cb59e1 100644
--- a/tests/unittest/ops/test_gemm_bias_tanh.py
+++ b/tests/unittest/ops/test_gemm_bias_tanh.py
@@ -21,13 +21,12 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
-    TestEnv,
 )
 from aitemplate.utils import shape_utils
-from parameterized import parameterized
 
 
 _TOLERANCE_LIMITS = {
@@ -42,11 +41,14 @@ def __init__(self, *args, **kwargs):
         super(GEMMBiasTanhTestCase, self).__init__(*args, **kwargs)
         self._test_id = 0
 
-    def _test_rcr(self, Ms, test_name, dtype="float16"):
-        K = 1024
-        N = 64
-        target = detect_target()
-        tolerance_limits = _TOLERANCE_LIMITS[dtype]
+    def _test_gemm_rcr_bias_tanh(
+        self,
+        Ms,
+        K=1024,
+        N=64,
+        dtype="float16",
+        test_suffix=None,
+    ):
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
         X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(
@@ -57,10 +59,11 @@ def _test_rcr(self, Ms, test_name, dtype="float16"):
         Y = OP(X, W, B)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(
-            Y, target, "./tmp", f"gemm_rcr_bias_tanh_{test_name}_{self._test_id}"
-        )
+        if test_suffix is None:
+            test_suffix = dtype
+        test_name = f"gemm_rcr_bias_tanh_{test_suffix}_{self._test_id}"
         self._test_id += 1
+        module = compile_model(Y, detect_target(), "./tmp", test_name)
 
         for M in Ms:
             X_pt = get_random_torch_tensor([M, K], dtype)
@@ -72,20 +75,84 @@ def _test_rcr(self, Ms, test_name, dtype="float16"):
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
             )
-            torch.testing.assert_close(Y_pt, y, **tolerance_limits)
-
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-                TestEnv.ROCM: [("float16")],
-            }
+            torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
+
+    def test_gemm_rcr_bias_tanh_fp16(self):
+        self._test_gemm_rcr_bias_tanh(
+            Ms=[128],
+            dtype="float16",
+            test_suffix="static_m_fp16",
+        )
+        self._test_gemm_rcr_bias_tanh(
+            Ms=[1, 7, 64, 127],
+            dtype="float16",
+            test_suffix="dynamic_m_fp16",
+        )
+
+    def test_gemm_rcr_bias_tanh_fp16_rocm(self):
+        self._test_gemm_rcr_bias_tanh(
+            Ms=[128],
+            dtype="float16",
+            test_suffix="static_m_fp16",
+        )
+
+    def test_gemm_rcr_bias_tanh_fp32_sm80(self):
+        self._test_gemm_rcr_bias_tanh(
+            Ms=[128],
+            dtype="float32",
+            test_suffix="static_m_fp32",
+        )
+
+    def test_gemm_rcr_bias_tanh_bf16(self):
+        self._test_gemm_rcr_bias_tanh(
+            Ms=[128],
+            dtype="bfloat16",
+            test_suffix="static_m_bf16",
         )
-    )
-    def test_rcr_bias_tanh_floats(self, dtype):
-        self._test_rcr([128], f"static_m_{dtype}", dtype=dtype)
-        self._test_rcr([1, 7, 64, 127], f"dynamic_m_{dtype}", dtype=dtype)
+
+    def test_gemm_rcr_bias_tanh_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_tanh(
+                    Ms=[128],
+                    K=1020,
+                    dtype="float16",
+                    test_suffix="wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_tanh(
+                    Ms=[128],
+                    N=63,
+                    dtype="float16",
+                    test_suffix="wrong_output_alignment_sm90",
+                )
+
+            self._test_gemm_rcr_bias_tanh(
+                Ms=[128],
+                dtype="float16",
+                test_suffix="float16_force_sm90",
+            )
+            self._test_gemm_rcr_bias_tanh(
+                Ms=[128],
+                dtype="bfloat16",
+                test_suffix="bfloat16_force_sm90",
+            )
+
+
+filter_test_cases_by_test_env(GEMMBiasTanhTestCase)
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
index 52a0d7f1e..e0592b725 100644
--- a/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py
@@ -22,6 +22,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -29,23 +30,46 @@
 from aitemplate.utils import shape_utils
 
 
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
+
+
 class GEMMRcrBiasFastGeluTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
         torch.manual_seed(0)
 
-    def _test_rcr(
-        self, Ms, test_name, use_fast_gelu=True, dtype="float16", atol=1e-1, rtol=1e-1
+    def _test_gemm_rcr_bias_fast_gelu(
+        self,
+        Ms,
+        test_name,
+        K=1024,
+        N=64,
+        use_fast_gelu=True,
+        dtype="float16",
     ):
-        K = 1024
-        N = 64
-        target = detect_target()
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
+        X = Tensor(
+            shape=[MDim, IntImm(K)],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
+        )
+        B = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="input_2",
+            is_input=True,
         )
-        B = Tensor(shape=[IntImm(N)], dtype=dtype, name="input_2", is_input=True)
         OP = (
             ops.gemm_rcr_bias_fast_gelu() if use_fast_gelu else ops.gemm_rcr_bias_gelu()
         )
@@ -55,7 +79,7 @@ def _test_rcr(
 
         module = compile_model(
             Y,
-            target,
+            detect_target(),
             "./tmp",
             f"gemm_rcr_bias_fast_gelu_{test_name}"
             if use_fast_gelu
@@ -74,40 +98,130 @@ def _test_rcr(
                 {"input_0": X_pt, "input_1": W_pt, "input_2": B_pt},
                 [y],
             )
-            torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
-
-    def test_rcr(self):
-        self._test_rcr([128], "static", use_fast_gelu=True)
-        if detect_target().name() == "cuda":
-            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=True)
-            self._test_rcr([128], "static", use_fast_gelu=False)
-            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_rcr_float_sm80(self):
-        self._test_rcr(
-            [1, 7, 64, 127], "fast_dynamic_m_float", use_fast_gelu=True, dtype="float"
+            torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
+
+    def test_gemm_rcr_bias_fast_gelu_fp16(self):
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[128],
+            test_name="static_fp16_fast_gelu",
+            use_fast_gelu=True,
+            dtype="float16",
         )
-        self._test_rcr(
-            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=False, dtype="float"
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp16_fast_gelu",
+            use_fast_gelu=True,
+            dtype="float16",
+        )
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[128],
+            test_name="static_fp16_gelu",
+            use_fast_gelu=False,
+            dtype="float16",
+        )
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp16_gelu",
+            use_fast_gelu=False,
+            dtype="float16",
+        )
+
+    def test_gemm_rcr_bias_fast_gelu_fp16_rocm(self):
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[128],
+            test_name="static_fp16_rocm_fast_gelu",
+            use_fast_gelu=True,
+            dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_rcr_bias_fast_gelu_bfloat16_sm80(self):
-        self._test_rcr(
-            [1, 7, 64, 127],
-            "fast_dynamic_m_bfloat16",
+    def test_gemm_rcr_bias_fast_gelu_fp32_sm80(self):
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp32_fast_gelu",
+            use_fast_gelu=True,
+            dtype="float32",
+        )
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp32_gelu",
+            use_fast_gelu=False,
+            dtype="float32",
+        )
+
+    def test_gemm_rcr_bias_fast_gelu_bf16(self):
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_bf16_fast_gelu",
             use_fast_gelu=True,
             dtype="bfloat16",
-            atol=2e-1,
-            rtol=2e-1,
         )
-        self._test_rcr(
-            [1, 7, 64, 127], "dynamic_m_bfloat16", use_fast_gelu=False, dtype="bfloat16"
+        self._test_gemm_rcr_bias_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_bf16_gelu",
+            use_fast_gelu=False,
+            dtype="bfloat16",
         )
 
+    def test_gemm_rcr_bias_fast_gelu_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_bias_fast_gelu(
+                    Ms=[1, 7, 64, 127],
+                    K=1020,
+                    test_name="wrong_input_alignment_sm90",
+                    use_fast_gelu=True,
+                    dtype="float16",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_bias_fast_gelu(
+                    Ms=[1, 7, 64, 127],
+                    N=63,
+                    test_name="wrong_output_alignment_sm90",
+                    use_fast_gelu=True,
+                    dtype="float16",
+                )
+
+            self._test_gemm_rcr_bias_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_fp16_fast_gelu_force_sm90",
+                use_fast_gelu=True,
+                dtype="float16",
+            )
+            self._test_gemm_rcr_bias_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_fp16_gelu_force_sm90",
+                use_fast_gelu=False,
+                dtype="float16",
+            )
+            self._test_gemm_rcr_bias_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_bf16_fast_gelu_force_sm90",
+                use_fast_gelu=True,
+                dtype="bfloat16",
+            )
+            self._test_gemm_rcr_bias_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_bf16_gelu_force_sm90",
+                use_fast_gelu=False,
+                dtype="bfloat16",
+            )
+
 
 filter_test_cases_by_test_env(GEMMRcrBiasFastGeluTestCase)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
index c17ac02f7..c3d51ffb5 100644
--- a/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
+++ b/tests/unittest/ops/test_gemm_rcr_fast_gelu.py
@@ -23,6 +23,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -30,6 +31,13 @@
 from aitemplate.utils import shape_utils
 
 
+_TOLERANCE_LIMITS = {
+    "float16": {"atol": 1e-1, "rtol": 1e-1},
+    "float32": {"atol": 1e-1, "rtol": 1e-1},
+    "bfloat16": {"atol": 3e-1, "rtol": 3e-1},
+}
+
+
 class NewGELUActivation(torch.nn.Module):
     def __init__(
         self,
@@ -55,16 +63,26 @@ class GEMMRcrFastGeluTestCase(unittest.TestCase):
     def setUpClass(cls) -> None:
         torch.manual_seed(10)
 
-    def _test_rcr(
-        self, Ms, test_name, use_fast_gelu=True, atol=1e-1, rtol=1e-1, dtype="float16"
+    def _test_gemm_rcr_fast_gelu(
+        self,
+        Ms,
+        test_name,
+        K=1024,
+        N=64,
+        dtype="float16",
     ):
-        K = 1024
-        N = 64
-        target = detect_target()
         MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
-        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
+        X = Tensor(
+            shape=[MDim, IntImm(K)],
+            dtype=dtype,
+            name="input_0",
+            is_input=True,
+        )
         W = Tensor(
-            shape=[IntImm(N), IntImm(K)], dtype=dtype, name="input_1", is_input=True
+            shape=[IntImm(N), IntImm(K)],
+            dtype=dtype,
+            name="input_1",
+            is_input=True,
         )
 
         OP = ops.gemm_rcr_fast_gelu()
@@ -73,7 +91,12 @@ def _test_rcr(
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
 
-        module = compile_model(Y, target, "./tmp", f"gemm_rcr_fast_gelu_{test_name}")
+        module = compile_model(
+            Y,
+            detect_target(),
+            "./tmp",
+            f"gemm_rcr_fast_gelu_{test_name}",
+        )
 
         for M in Ms:
             logging.info(f"Testing {M=}")
@@ -86,38 +109,95 @@ def _test_rcr(
                 {"input_0": X_pt, "input_1": W_pt},
                 [y],
             )
-            torch.testing.assert_close(Y_pt, y, atol=atol, rtol=rtol)
-
-    def test_rcr(self):
-        self._test_rcr([128], "static", use_fast_gelu=True)
-        if detect_target().name() == "cuda":
-            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=True)
-            self._test_rcr([128], "static", use_fast_gelu=False)
-            self._test_rcr([1, 7, 64, 127], "dynamic_m", use_fast_gelu=False)
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_rcr_fast_gelu_float_sm80(self):
-        self._test_rcr([128], "static_float", use_fast_gelu=True, dtype="float")
-        self._test_rcr(
-            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="float"
+            torch.testing.assert_close(Y_pt, y, **_TOLERANCE_LIMITS[dtype])
+
+    def test_gemm_rcr_fast_gelu_fp16(self):
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[128],
+            test_name="static_fp16",
+            dtype="float16",
+        )
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp16",
+            dtype="float16",
+        )
+
+    def test_gemm_rcr_fast_gelu_fp16_rocm(self):
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[128],
+            test_name="static_fp16_rocm",
+            dtype="float16",
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_gemm_rcr_fast_gelu_bfloat16_sm80(self):
-        self._test_rcr(
-            [128],
-            "static_float",
-            use_fast_gelu=True,
-            atol=3e-1,
-            rtol=3e-1,
+    def test_gemm_rcr_fast_gelu_fp32_sm80(self):
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[128],
+            test_name="static_fp32",
+            dtype="float32",
+        )
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_fp32",
+            dtype="float32",
+        )
+
+    def test_gemm_rcr_fast_gelu_bf16(self):
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[128],
+            test_name="static_bf16",
             dtype="bfloat16",
         )
-        self._test_rcr(
-            [1, 7, 64, 127], "dynamic_m_float", use_fast_gelu=True, dtype="bfloat16"
+        self._test_gemm_rcr_fast_gelu(
+            Ms=[1, 7, 64, 127],
+            test_name="dynamic_m_bf16",
+            dtype="bfloat16",
         )
 
+    def test_gemm_rcr_fast_gelu_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                self._test_gemm_rcr_fast_gelu(
+                    Ms=[1, 7, 64, 127],
+                    K=1020,
+                    test_name="wrong_input_alignment_sm90",
+                    dtype="float16",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                self._test_gemm_rcr_fast_gelu(
+                    Ms=[1, 7, 64, 127],
+                    N=63,
+                    test_name="wrong_output_alignment_sm90",
+                    dtype="float16",
+                )
+
+            self._test_gemm_rcr_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_fp16_force_sm90",
+                dtype="float16",
+            )
+            self._test_gemm_rcr_fast_gelu(
+                Ms=[1, 7, 64, 127],
+                test_name="dynamic_m_bf16_force_sm90",
+                dtype="bfloat16",
+            )
+
 
 filter_test_cases_by_test_env(GEMMRcrFastGeluTestCase)
 
+
 if __name__ == "__main__":
     unittest.main()

From 3fd5d109c2fcdb87b9e002b69edb9bdd929bc1de Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 9 May 2023 02:10:34 -0700
Subject: [PATCH 494/638] Add bf16 support to classic_b2b_bmm op (#665)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/665

ATT

Reviewed By: frank-wei, ipiszy, alexanderguzhva

Differential Revision: D45622610

fbshipit-source-id: 08b26721b3af8ec7e6755b88013732b249a20249
---
 .../backend/cuda/b2b_bmm/classic_b2b_bmm.py   | 39 +++++----
 tests/unittest/ops/test_b2b_bmm.py            | 81 +++++++++++++++----
 2 files changed, 88 insertions(+), 32 deletions(-)

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
index 73937c20f..5f6ab7c0e 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/classic_b2b_bmm.py
@@ -73,9 +73,10 @@
 }  // end namespace
 
 {{func_signature}} {
+  using ElementInput = {{elem_input_type}};
   using ElementOutput = {{elem_output_type}};
   using ElementAccumulator = {{elem_accum_type}};
-  using ElementCompute = {{elem_input_type}};
+  using ElementCompute = {{elem_accum_type}};
 
   ElementCompute alpha0 = ElementCompute({{alpha0}});
   ElementCompute beta0 = ElementCompute(1);
@@ -113,9 +114,9 @@
     >;
 
   using B2bGemmBatched = cutlass::gemm::device::B2bGemmBatched<
-    ElementCompute,
+    ElementInput,
     cutlass::layout::RowMajor,
-    ElementCompute,
+    ElementInput,
     cutlass::layout::ColumnMajor,
     cutlass::layout::RowMajor,
     ElementOutput,
@@ -160,19 +161,19 @@
   typename B2bGemmBatched::Arguments arguments{
     problem_size_0, // = GemmCoord problem_size_0;
     problem_size_1, // = GemmCoord problem_size_1;
-    {static_cast<ElementCompute*>(query), typename B2bGemmBatched::LayoutA::Stride(num_heads * problem_size_0.k())},    // TensorRef<ElementA const, LayoutA> ref_A0;
+    {static_cast<ElementInput*>(query), typename B2bGemmBatched::LayoutA::Stride(num_heads * problem_size_0.k())},      // TensorRef<ElementA const, LayoutA> ref_A0;
     problem_size_0.k(),                                                                                                 // int64_t head_stride_A0;
     num_heads * problem_size_0.m() * problem_size_0.k(),                                                                // int64_t batch_stride_A0;
-    {static_cast<ElementCompute*>(key), typename B2bGemmBatched::LayoutB::Stride(num_heads * problem_size_0.k())},      // TensorRef<ElementB const, LayoutB> ref_B0;
+    {static_cast<ElementInput*>(key), typename B2bGemmBatched::LayoutB::Stride(num_heads * problem_size_0.k())},        // TensorRef<ElementB const, LayoutB> ref_B0;
     problem_size_0.k(),                                                                                                 // int64_t head_stride_B0;
     num_heads * problem_size_0.n() * problem_size_0.k(),                                                                // int64_t batch_stride_B0;
-    {static_cast<ElementCompute*>(bias), typename B2bGemmBatched::LayoutC::Stride({{bias_stride_n}})},                  // TensorRef<ElementC const, LayoutC> ref_C0;
+    {static_cast<ElementInput*>(bias), typename B2bGemmBatched::LayoutC::Stride({{bias_stride_n}})},                    // TensorRef<ElementC const, LayoutC> ref_C0;
     {{bias_stride_mn}},                                                                                                 // int64_t head_stride_C0;
     {{bias_stride_hmn}},                                                                                                // int64_t batch_stride_C0;
-    {static_cast<ElementCompute*>(value), typename B2bGemmBatched::LayoutB1::Stride(num_heads * problem_size_1.n())},   // TensorRef<ElementC const, LayoutC> ref_B1;
+    {static_cast<ElementInput*>(value), typename B2bGemmBatched::LayoutB1::Stride(num_heads * problem_size_1.n())},     // TensorRef<ElementC const, LayoutC> ref_B1;
     problem_size_1.n(),                                                                                                 // int64_t head_stride_B1;                                                                    //
     num_heads * problem_size_1.n() * problem_size_1.k(),                                                                // int64_t batch_stride_B1;
-    {static_cast<ElementCompute*>(nullptr), typename B2bGemmBatched::LayoutScaleBias::Stride(0)},                       // Not used due to ScaleType::Nothing for output op 1
+    {static_cast<ElementInput*>(nullptr), typename B2bGemmBatched::LayoutScaleBias::Stride(0)},                         // Not used due to ScaleType::Nothing for output op 1
     0,                                                                                                                  // not used: int64_t head_stride_C1;
     0,                                                                                                                  // not used: int64_t batch_stride_C1;
     {static_cast<ElementOutput*>(output), typename B2bGemmBatched::LayoutC::Stride(num_heads * problem_size_1.n())},    // TensorRef<ElementC, LayoutC> ref_D1;
@@ -252,20 +253,24 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         raise RuntimeError(
             f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
         )
-    backend_spec = CUDASpec()
-    if func_attrs["inputs"][0]._attrs["dtype"] != "float16":
+
+    supported_types = ("float16", "bfloat16")
+    input_type = func_attrs["inputs"][0]._attrs["dtype"]
+    output_type = func_attrs["outputs"][0]._attrs["dtype"]
+    if input_type not in supported_types or output_type not in supported_types:
         raise NotImplementedError(
-            "only float16 dtype supported for now in classic_b2b_bmm op"
+            f"{supported_types=} for inputs and output "
+            f"but got {input_type=} and {output_type=}."
         )
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
+
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(input_type)
+    elem_output_type = backend_spec.dtype_to_lib_type(output_type)
+
     if (
         "use_fp16_acc" in Target.current()._kwargs
         and Target.current()._kwargs["use_fp16_acc"]
+        and input_type == "float16"
     ):
         elem_accum_type = "cutlass::half_t"
     else:
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index 89bfda95b..243fe2605 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -42,6 +42,7 @@
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
 )
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ClassicB2bBmmTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -142,7 +143,6 @@ def _test_classic_b2b_bmm(
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_b2b_bmm_fp16_fp32acc(self):
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_basic_fp32acc",
@@ -158,7 +158,38 @@ def test_classic_b2b_bmm_fp16_fp32acc(self):
             use_fp16_acc=False,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_classic_b2b_bmm_bf16_fp32acc(self):
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_bf16_basic_fp32acc",
+            dtype="bfloat16",
+            batch_sizes=1,
+            use_fp16_acc=False,
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_bf16_sigmoid_fp32acc",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+            use_fp16_acc=False,
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_bf16_complex",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            use_fp16_acc=False,
+        )
+        self._test_classic_b2b_bmm(
+            test_name="classic_b2b_bmm_bf16_rectangular",
+            dtype="bfloat16",
+            batch_sizes=[2],
+            m=512,
+            n0=128,
+            n1=128,
+            use_fp16_acc=False,
+        )
+
     def test_classic_b2b_bmm_fp16(self):
         self._test_classic_b2b_bmm(
             test_name="classic_b2b_bmm_fp16_basic",
@@ -203,6 +234,7 @@ def test_classic_b2b_bmm_fp16(self):
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
 )
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class ClassicMultiheadB2bBmmTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -224,6 +256,7 @@ def _test_classic_multihead_b2b_bmm(
         atol=1e-2,
         rtol=1e-2,
         bias_broadcast=(False, False, False, False),
+        use_fp16_acc=True,
     ):
         # Initialize AIT classic_b2b_bmm operator.
         assert len(bias_broadcast) == 4
@@ -279,7 +312,7 @@ def _test_classic_multihead_b2b_bmm(
         Y._attrs["is_output"] = True
         Y._attrs["name"] = "output"
 
-        target = detect_target(use_fp16_acc=True)
+        target = detect_target(use_fp16_acc=use_fp16_acc)
         module = compile_model(Y, target, "./tmp", test_name)
 
         # Run tests.
@@ -325,7 +358,6 @@ def _test_classic_multihead_b2b_bmm(
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead1_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead1_b2b_bmm_fp16_basic",
@@ -334,7 +366,6 @@ def test_classic_multihead1_b2b_bmm(self):
             num_heads=1,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead2_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead2_b2b_bmm_fp16_basic",
@@ -343,7 +374,6 @@ def test_classic_multihead2_b2b_bmm(self):
             num_heads=2,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead1_b2b_bmm_bias_broadcast1(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead1_b2b_bmm_broadcast1_fp16_basic",
@@ -353,7 +383,6 @@ def test_classic_multihead1_b2b_bmm_bias_broadcast1(self):
             bias_broadcast=[True, True, False, False],
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead2_b2b_bmm_bias_broadcast1(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead2_b2b_bmm_broadcast1_fp16_basic",
@@ -363,7 +392,6 @@ def test_classic_multihead2_b2b_bmm_bias_broadcast1(self):
             bias_broadcast=[True, True, False, False],
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead2_b2b_bmm_bias_broadcast2(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead2_b2b_bmm_broadcast2_fp16_basic",
@@ -373,7 +401,6 @@ def test_classic_multihead2_b2b_bmm_bias_broadcast2(self):
             bias_broadcast=[True, True, True, False],
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead2_b2b_bmm_bias_broadcast3(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead2_b2b_bmm_broadcast3_fp16_basic",
@@ -383,7 +410,6 @@ def test_classic_multihead2_b2b_bmm_bias_broadcast3(self):
             bias_broadcast=[True, False, False, False],
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead4_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead4_b2b_bmm_fp16_dynamic_batch",
@@ -392,7 +418,6 @@ def test_classic_multihead4_b2b_bmm(self):
             num_heads=4,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead16_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead16_b2b_bmm_fp16_rectangular",
@@ -404,7 +429,6 @@ def test_classic_multihead16_b2b_bmm(self):
             num_heads=16,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead3_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead3_b2b_bmm_fp16_causal",
@@ -414,7 +438,6 @@ def test_classic_multihead3_b2b_bmm(self):
             num_heads=3,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead8_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead8_b2b_bmm_fp16_sigmoid",
@@ -424,7 +447,6 @@ def test_classic_multihead8_b2b_bmm(self):
             num_heads=8,
         )
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_classic_multihead1_relu_b2b_bmm(self):
         self._test_classic_multihead_b2b_bmm(
             test_name="classic_multihead1_b2b_bmm_fp16_complex",
@@ -435,11 +457,41 @@ def test_classic_multihead1_relu_b2b_bmm(self):
             num_heads=1,
         )
 
+    def test_classic_multihead_b2b_bmm_bf16(self):
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead8_b2b_bmm_bf16_sigmoid",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="Sigmoid",
+            num_heads=8,
+            use_fp16_acc=False,
+        )
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead1_b2b_bmm_bf16_complex",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            num_heads=1,
+            use_fp16_acc=False,
+        )
+        self._test_classic_multihead_b2b_bmm(
+            test_name="classic_multihead16_b2b_bmm_bf16_rectangular",
+            dtype="bfloat16",
+            batch_sizes=[2],
+            m=512,
+            n0=128,
+            n1=128,
+            num_heads=16,
+            use_fp16_acc=False,
+        )
+
 
 @unittest.skipIf(
     detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
     "Not supported by CUDA < SM80.",
 )
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
 class FMHAStyleB2bBmmTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -585,7 +637,6 @@ def _test_fmha_style_b2b_bmm(
             module.run_with_tensors(inputs, [y])
             torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
 
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_fmha_style_b2b_bmm_fp16(self):
         self._test_fmha_style_b2b_bmm(
             test_name="fmha_style_b2b_bmm_fp16_basic",

From 4593c4e01cd97465ea062a83eaeb402d8bf21111 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 9 May 2023 02:23:16 -0700
Subject: [PATCH 495/638] Add bf16 support to fmha_style_b2b_bmm op (#666)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/666

ATT

Reviewed By: ipiszy, alexanderguzhva

Differential Revision: D45624154

fbshipit-source-id: cd3daf1ff8f0b2a474c7a033a62f685e6fe0b9fd
---
 .../cuda/b2b_bmm/fmha_style_b2b_bmm.py        | 22 ++++++----
 tests/unittest/ops/test_b2b_bmm.py            | 41 +++++++++++++++++++
 2 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
index 078b88f25..4f794fd13 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/fmha_style_b2b_bmm.py
@@ -42,12 +42,12 @@
 }  // end namespace
 
 {{func_signature}} {
+  using ElementInput = {{elem_input_type}};
   using ElementOutput = {{elem_output_type}};
   using ElementAccumulator = {{elem_accum_type}};
-  using ElementCompute = {{elem_input_type}};
 
   using Attention = AttentionKernel<
-    ElementCompute,
+    ElementInput,
     ElementAccumulator,
     cutlass::arch::Sm80,  // ArchTag
     true,                 // Memory is aligned
@@ -66,11 +66,11 @@
 
   typename Attention::Params p;
   { // set parameters
-    p.query_ptr = static_cast<ElementCompute*>(query);
-    p.key_ptr = static_cast<ElementCompute*>(key);
-    p.value_ptr = static_cast<ElementCompute*>(value);
+    p.query_ptr = static_cast<ElementInput*>(query);
+    p.key_ptr = static_cast<ElementInput*>(key);
+    p.value_ptr = static_cast<ElementInput*>(value);
     if (bias) {
-      p.attn_bias_ptr = static_cast<ElementCompute*>(bias);
+      p.attn_bias_ptr = static_cast<ElementInput*>(bias);
     }
     p.output_accum_ptr = nullptr;
     if (Attention::kNeedsOutputAccumulatorBuffer) {
@@ -234,10 +234,14 @@ def fmha_style_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
     elem_output_type = backend_spec.dtype_to_lib_type(
         func_attrs["outputs"][0]._attrs["dtype"]
     )
-    elem_accum_type = elem_input_type
-    if elem_input_type == "cutlass:half_t" and not Target.current()._kwargs.get(
-        "use_fp16_acc", False
+
+    if (
+        "use_fp16_acc" in Target.current()._kwargs
+        and Target.current()._kwargs["use_fp16_acc"]
+        and elem_input_type == "cutlass::half_t"
     ):
+        elem_accum_type = "cutlass::half_t"
+    else:
         elem_accum_type = "float"
 
     import cutlass_lib
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index 243fe2605..c645ab65b 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -744,6 +744,47 @@ def test_fmha_style_b2b_bmm_fp16(self):
             seq_lens_kv=512,
         )
 
+    def test_fmha_style_b2b_bmm_bf16(self):
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_bf16_basic",
+            dtype="bfloat16",
+            batch_sizes=1,
+            use_fp16_acc=False,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_bf16_rectangular",
+            dtype="bfloat16",
+            batch_sizes=[2],
+            seq_lens=512,
+            seq_lens_kv=128,
+            n1=128,
+            use_fp16_acc=False,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_bf16_complex",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="SiLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, True, False, False],
+            num_heads=4,
+            use_fp16_acc=False,
+        )
+        self._test_fmha_style_b2b_bmm(
+            test_name="fmha_style_b2b_bmm_bf16_complex_fp32_acc",
+            dtype="bfloat16",
+            batch_sizes=[1, 4],
+            epilogue_math_name="ReLu",
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            has_bias=True,
+            bias_broadcast=[False, False, True, False],
+            num_heads=2,
+            seq_lens=512,
+            seq_lens_kv=512,
+            use_fp16_acc=False,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From d915218bd70e0acf4658d5179034f4067e96c61d Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 9 May 2023 07:15:03 -0700
Subject: [PATCH 496/638] Windows test_standalone.py fix (#676)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/676

Reviewed By: aakhundov

Differential Revision: D45672031

fbshipit-source-id: a66199b1084fbdd39fb14e682dd4dca5e4f68e67
---
 tests/unittest/backend/test_gen_standalone.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unittest/backend/test_gen_standalone.py b/tests/unittest/backend/test_gen_standalone.py
index 50221f51b..61ec8dc47 100644
--- a/tests/unittest/backend/test_gen_standalone.py
+++ b/tests/unittest/backend/test_gen_standalone.py
@@ -127,7 +127,7 @@ def _test_gen_standalone(self, test_name, dtype):
         else:
             working_env["LD_LIBRARY_PATH"] = workdir
         _LOGGER.info(f"work dir: {workdir}")
-        exe_name = "./test.exe" if is_windows() else "./test"
+        exe_name = "test.exe" if is_windows() else "./test"
         with subprocess.Popen(
             [exe_name],
             shell=True,

From 7a57a026890e2c13918b8ee7d5df28f979c04627 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 9 May 2023 07:25:38 -0700
Subject: [PATCH 497/638] Refactor infrastructure (#671)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/671

Preparations for the CMake compiler engine

Reviewed By: aakhundov

Differential Revision: D45659112

fbshipit-source-id: cf5e52522e321d3a9f278004a8b96140f4ddb1fa
---
 python/aitemplate/backend/builder.py                        | 5 +++++
 python/aitemplate/compiler/compiler.py                      | 2 +-
 python/aitemplate/compiler/transform/profile.py             | 2 +-
 python/aitemplate/compiler/transform/profile_dynamic_dim.py | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index d90503bf5..930753403 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -895,3 +895,8 @@ def make(
         if not is_debug():
             cmds.append(make_clean_constants_cmd)
         _run_make_cmds(cmds, self._timeout, build_dir, allow_cache=allow_cache)
+
+
+def get_compile_engine():
+    compile_engine = Builder()
+    return compile_engine
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index c601057dc..dc29d6aee 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -332,7 +332,7 @@ def compile_model(
             file_pairs.extend(main_pairs)
 
             start_t = datetime.now()
-            compile_engine = backend.builder.Builder()
+            compile_engine = backend.builder.get_compile_engine()
             compile_engine.make(
                 file_pairs, dll_name, workdir, test_name, debug_settings
             )
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index 4e908fee9..a7683bcba 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -85,7 +85,7 @@ def profile(
         f"generated {len(generated_profilers)} profilers elapsed time: {elapsed_dt_sec(start_t)}",
     )
     start_t = datetime.now()
-    compile_engine = builder.Builder()
+    compile_engine = builder.get_compile_engine()
     compile_engine.make_profilers(generated_profilers, profiler_dir)
     _LOGGER.info(f"compiled profilers elapsed time: {elapsed_dt_sec(start_t)}")
     funcs_to_profile = OrderedDict(
diff --git a/python/aitemplate/compiler/transform/profile_dynamic_dim.py b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
index c398c5034..ecef60721 100644
--- a/python/aitemplate/compiler/transform/profile_dynamic_dim.py
+++ b/python/aitemplate/compiler/transform/profile_dynamic_dim.py
@@ -33,7 +33,7 @@ def profile_dynamic_dim(sorted_graph: List[Tensor], workdir="./tmp"):
     _LOGGER.info("Current dynamic profiler supports ONLY ONE dynamic dim.")
     generated_profilers = list(codegen.gen_profiler(sorted_graph, workdir))
     generated_profilers = [p for p in generated_profilers if p is not None]
-    compile_engine = builder.Builder()
+    compile_engine = builder.get_compile_engine()
     compile_engine.make_profilers(generated_profilers, workdir)
     funcs_to_profile = OrderedDict(
         (func._attrs["name"], func)

From cc5cbf2ec33e9e3eb749d2df6a94daddf77129df Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Tue, 9 May 2023 07:45:20 -0700
Subject: [PATCH 498/638] introduce get_include_directories() in target_def.py
 files (#672)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/672

Preparations for the CMake compiler engine

Reviewed By: aakhundov

Differential Revision: D45659642

fbshipit-source-id: 1c22eaa888bc5c0641d132dcc5f275b731d2c191
---
 python/aitemplate/backend/cuda/target_def.py | 51 +++++++++++++-------
 python/aitemplate/backend/rocm/target_def.py |  5 +-
 python/aitemplate/backend/target.py          | 11 +++++
 3 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index ed0760256..67caf624b 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -82,7 +82,7 @@ def __init__(
                 cuda_version = "12.0.0"
         self._cuda_version = cuda_version
 
-    def _build_compile_options(self):
+    def _build_include_directories(self) -> List[str]:
         flash_attention_path = ""
         if os.path.exists(
             os.path.join(
@@ -116,6 +116,17 @@ def _build_compile_options(self):
             ),
         ]
         ait_static_path = os.path.join(self._ait_include_path, "include/kernels")
+
+        output = [ait_static_path]
+        output.extend(cutlass_path)
+        return output
+
+    def get_include_directories(self) -> List[str]:
+        return self._build_include_directories()
+
+    def _build_compile_options(self):
+        include_paths = self._build_include_directories()
+
         options = [
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
             "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
@@ -128,8 +139,7 @@ def _build_compile_options(self):
             environ.get_compiler_opt_level(),
             "-std=c++17",
             "--expt-relaxed-constexpr",
-            f"-I{ait_static_path}",
-        ] + ["-I" + path for path in cutlass_path]
+        ] + ["-I" + path for path in include_paths]
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         if environ.use_fast_math():
@@ -251,20 +261,28 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             **kwargs,
         )
 
+    def _build_include_directories(self) -> List[str]:
+        cutlass_path = [
+            os.path.join(self._template_path, "include"),
+            os.path.join(self._template_path, "tools/util/include"),
+            os.path.join(self._template_path, "examples/35_gemm_softmax"),
+            os.path.join(self._template_path, "examples/41_fused_multi_head_attention"),
+            os.path.join(self._template_path, "examples/45_dual_gemm"),
+            os.path.join(self._template_path, "../att_include"),
+            os.path.join(self._template_path, "../att_include/fmha"),
+        ]
+        if self._include_path is not None:
+            ait_static_path = os.path.join(self._include_path, "static")
+            return [ait_static_path] + cutlass_path
+        else:
+            return cutlass_path
+
+    def get_include_directories(self) -> List[str]:
+        return self._build_include_directories()
+
     def _build_compile_options(self):
         if not FBCUDA.compile_options_:
-            cutlass_path = [
-                os.path.join(self._template_path, "include"),
-                os.path.join(self._template_path, "tools/util/include"),
-                os.path.join(self._template_path, "examples/35_gemm_softmax"),
-                os.path.join(
-                    self._template_path, "examples/41_fused_multi_head_attention"
-                ),
-                os.path.join(self._template_path, "examples/45_dual_gemm"),
-                os.path.join(self._template_path, "../att_include"),
-                os.path.join(self._template_path, "../att_include/fmha"),
-            ]
-            ait_static_path = os.path.join(self._include_path, "static")
+            include_paths = self._build_include_directories()
             fb_include_path = os.path.join(self._include_path, "fb_include")
             pp_args = self.nvcc_options_json["pp_args"]
             with open(fb_include_path, "w") as fb_include:
@@ -278,9 +296,8 @@ def _build_compile_options(self):
 
             options = (
                 self.nvcc_options_json["args"]
-                + ["-I" + path for path in cutlass_path]
+                + ["-I" + path for path in include_paths]
                 + [
-                    f"-I{ait_static_path}",
                     f"-Xcompiler '-Wp\,@{fb_include_path}'",  # noqa: W605
                     "-Xcompiler -Wno-strict-aliasing",
                     "-Xcompiler -Wno-narrowing",
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index ec3bcc134..8787a93ca 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -87,7 +87,7 @@ def _pkg_path(self):
         rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm")
         return rocm_path
 
-    def _get_ck_paths(self):
+    def _get_ck_paths(self) -> List[str]:
         ck_paths = [
             os.path.join(self._template_path),
             os.path.join(self._template_path, "include/"),
@@ -144,6 +144,9 @@ def _get_ck_paths(self):
         ]
         return ck_paths
 
+    def get_include_directories(self) -> List[str]:
+        return self._get_ck_paths()
+
     def _build_compile_options(self):
         """Build compilation commands, including compilation flag library and includes.
 
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 6207cf554..0bbc71139 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -468,6 +468,17 @@ def remote_logger(cls, record: Dict[str, Any]) -> None:
         """
         return
 
+    def get_include_directories(self) -> List[str]:
+        """
+        Returns a list of include directories for a compiler.
+
+        Raises
+        ------
+        NotImplementedError
+            Need to be implemented by subclass.
+        """
+        raise NotImplementedError
+
 
 def CUDA(template_path: str = CUTLASS_PATH, arch: str = "80", **kwargs):
     """Create a CUDA target."""

From 9a1b4f94665e95a2940318e8fdd91fe09f5bb1d4 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Tue, 9 May 2023 15:57:04 -0700
Subject: [PATCH 499/638] Add is_view_of for identity op (#677)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/677

Use view for identity op.

Reviewed By: aakhundov

Differential Revision: D45633948

fbshipit-source-id: 7e962bd0f0a041c1114efe79b428ad38c20c222d
---
 python/aitemplate/compiler/ops/tensor/identity.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/ops/tensor/identity.py b/python/aitemplate/compiler/ops/tensor/identity.py
index c6c691f54..262a32523 100644
--- a/python/aitemplate/compiler/ops/tensor/identity.py
+++ b/python/aitemplate/compiler/ops/tensor/identity.py
@@ -39,9 +39,8 @@ def __call__(self, x: Tensor) -> Tensor:
         self._set_depth()
 
         output_shapes = self._infer_shapes(x)
-        output = Tensor(output_shapes, src_ops={self})
+        output = Tensor(output_shapes, src_ops={self}, is_view_of=x)
         self._attrs["outputs"] = [output]
-        output._attrs["dtype"] = x.dtype()
 
         return output
 

From 47cdc5493a74d5180f660d61f47660b959249a0b Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 10 May 2023 06:47:08 -0700
Subject: [PATCH 500/638] MSVC fix (#675)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/675

Reviewed By: chenyang78, wushirong

Differential Revision: D45672030

fbshipit-source-id: d9854d72aa35a7d74605ee371beb64978418ea9e
---
 .../cuda/gemm_universal/common_bias_broadcast.py     | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index 043c6b98a..b4c53f60e 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -64,7 +64,11 @@
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    { {{layout.m}}, {{layout.n}}, {{layout.k}} },            // GemmCoord problem_size
+    {
+        static_cast<coord_t>({{layout.m}}),
+        static_cast<coord_t>({{layout.n}}),
+        static_cast<coord_t>({{layout.k}})
+    },                                                       // GemmCoord problem_size
 {% if support_split_k %}
     split_k,                                                 // int batch_count
 {% else %}
@@ -105,7 +109,11 @@
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
-    { {{layout.m}}, {{layout.n}}, {{layout.k}} },            // GemmCoord problem_size
+    {
+        static_cast<coord_t>({{layout.m}}),
+        static_cast<coord_t>({{layout.n}}),
+        static_cast<coord_t>({{layout.k}})
+    },                                                       // GemmCoord problem_size
 {% if support_split_k %}
     split_k,                                                 // int batch_count
 {% else %}

From cd5862eb659605eaab1a9d09d5570045b3cb2c06 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Wed, 10 May 2023 08:42:00 -0700
Subject: [PATCH 501/638] Fix get_positive_dim for IntVar (#680)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/680

This is an issue encountered during ig model lowering: https://fb.workplace.com/groups/757073672259175/permalink/903432870956587/
`get_positive_dim` is the function to deduce positive dim for slice_op.
e.g. `a[-2]` where `len(a)=3` denotes we should access `a[1]`
But this only meaningful when the size is IntImm, not dynamic dim.
Now we add the filter so that we do nothing when the size is dynamic dim. Notice this will result in error when idx is negative and result in lowering failure. (This is also true without this filter, basically whenever there is dynamic dim, lowering get error)
This is ideal error, because in that case AIT has undefined behavior and we would want to check the model owner if such lowering happens.

Reviewed By: amateurcoffee

Differential Revision: D45722873

fbshipit-source-id: 0478dceb5b8c69b7801759577a18ea9be851b417
---
 fx2ait/fx2ait/converters/ait_converters.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index ab14f88b8..e498b9722 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -660,7 +660,8 @@ def num_slice_types(slices):
             num_none_indices += 1
             continue
         if isinstance(i, int):
-            i = get_positive_dim(i, input_val.shape()[index].value())
+            if isinstance(input_val.shape()[index], IntImm):
+                i = get_positive_dim(i, input_val.shape()[index].value())
             # If we pass an int, we need to squeeze this dim.
             # Note that because we skip None-indices before, so we adjust
             # the index by subtracting the number of None-indices.

From 5f63da8a9ac43021bf99fcd7b9e5fdde02247c2d Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Wed, 10 May 2023 09:25:27 -0700
Subject: [PATCH 502/638] Add bf16 support to full op (#679)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/679

Full op doesn't have bf16 header, which would cause bfloat16 unknown issue during compilation.
```
full_0_constant_folding.cu(14): error: identifier "bfloat16" is undefined
full_0_constant_folding.cu(27): error: identifier "p" is undefined
full_0_constant_folding.cu(27): error: "bfloat16" is not a type name
```
This diff adds it.

Reviewed By: chenyang78

Differential Revision: D45720770

fbshipit-source-id: c3eace2ba9f7f4a03d8c01f8e2bfeb50d47873ff
---
 python/aitemplate/backend/cuda/tensor/full.py |  4 ++-
 tests/unittest/ops/test_full.py               | 33 +++++++++++++------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/python/aitemplate/backend/cuda/tensor/full.py b/python/aitemplate/backend/cuda/tensor/full.py
index a35167311..73c5bbaa9 100644
--- a/python/aitemplate/backend/cuda/tensor/full.py
+++ b/python/aitemplate/backend/cuda/tensor/full.py
@@ -115,7 +115,9 @@ def gen_function(func_attrs: Dict[str, Any]) -> str:
     read_type = backend_spec.get_elementwise_read_backend_type(num_elements, dtype)
 
     return FUNC_TEMPLATE.render(
-        header_files=CUDA_HEADER_FILES,
+        header_files=backend_spec.header_src_template.render(
+            extra_header=CUDA_HEADER_FILES
+        ),
         constant=CONSTANT_TEMPLATE.render(
             read_t=read_type,
             data_t=data_type,
diff --git a/tests/unittest/ops/test_full.py b/tests/unittest/ops/test_full.py
index 356e49b81..4c33d542a 100644
--- a/tests/unittest/ops/test_full.py
+++ b/tests/unittest/ops/test_full.py
@@ -20,7 +20,11 @@
 from aitemplate.compiler.public import FuncEnum
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
 from parameterized import param, parameterized
 
 
@@ -72,15 +76,24 @@ def _test_full(
             torch.testing.assert_close(z, z_pt, atol=1e-2, rtol=1e-2)
 
     @parameterized.expand(
-        [
-            param(1, [1], 1, "float16"),
-            param(2, [10, 20, 30], 3.14, "float16"),
-            param(3, [IntVar([10, 20]), 30], 0, "float16"),
-            param(4, 123, -5, "float16"),
-            param(5, [20, 30], 2.71, "float32"),
-            param(6, [IntVar([1, 128]), 10], -1.23, "float32"),
-            param(7, IntVar([1, 128]), 1234, "float32"),
-        ]
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    param(1, [1], 1, "float16"),
+                    param(2, [10, 20, 30], 3.14, "float16"),
+                    param(3, [IntVar([10, 20]), 30], 0, "float16"),
+                    param(4, 123, -5, "float16"),
+                    param(5, [20, 30], 2.71, "float32"),
+                    param(6, [IntVar([1, 128]), 10], -1.23, "float32"),
+                    param(7, IntVar([1, 128]), 1234, "float32"),
+                ],
+                TestEnv.CUDA_SM80: [
+                    param(8, [20, 30], 2.71, "bfloat16"),
+                    param(9, [IntVar([1, 128]), 10], -1.23, "bfloat16"),
+                    param(10, IntVar([1, 128]), 1234, "bfloat16"),
+                ],
+            }
+        )
     )
     def test_full(self, i, shape, fill_value, dtype):
         self._test_full(

From d8973420dce71bdaa42c3ca5f6fec240167b54b5 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Wed, 10 May 2023 19:53:44 -0700
Subject: [PATCH 503/638] introduce get_host_compiler_options() and
 get_device_compiler_options() for Target class (#673)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/673

Preparations for the CMake compiler engine

Reviewed By: aakhundov

Differential Revision: D45662415

fbshipit-source-id: 2f380754287f411d87b4b33c1383cb0ff7f2f9bd
---
 python/aitemplate/backend/cuda/target_def.py | 46 +++++++++++++++++---
 python/aitemplate/backend/target.py          | 22 ++++++++++
 2 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 67caf624b..d98b17fdf 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -124,26 +124,50 @@ def _build_include_directories(self) -> List[str]:
     def get_include_directories(self) -> List[str]:
         return self._build_include_directories()
 
-    def _build_compile_options(self):
-        include_paths = self._build_include_directories()
+    def _build_gnu_host_compiler_options(self) -> List[str]:
+        return [
+            "-fPIC",
+            "-Wconversion",
+            "-fno-strict-aliasing",
+            "-fvisibility=hidden",
+        ]
+
+    def get_host_compiler_options(self) -> List[str]:
+        return self._build_gnu_host_compiler_options()
 
+    def _build_nvcc_compiler_options(self) -> List[str]:
         options = [
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
             "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
             "-w",
             f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
-            "-Xcompiler=-fPIC",
-            "-Xcompiler=-Wconversion",
-            "-Xcompiler=-fno-strict-aliasing",
-            "-Xcompiler -fvisibility=hidden",
             environ.get_compiler_opt_level(),
             "-std=c++17",
             "--expt-relaxed-constexpr",
-        ] + ["-I" + path for path in include_paths]
+        ]
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         if environ.use_fast_math():
             options.append("--use_fast_math")
+        return options
+
+    def get_device_compiler_options(self) -> List[str]:
+        return self._build_nvcc_compiler_options()
+
+    def _build_compile_options(self):
+        include_paths = self._build_include_directories()
+        host_compiler_options = self._build_gnu_host_compiler_options()
+        nvcc_compiler_options = self._build_nvcc_compiler_options()
+
+        options = (
+            nvcc_compiler_options
+            + [
+                f"-Xcompiler {opt}" if "=" in opt else f"-Xcompiler={opt}"
+                for opt in host_compiler_options
+            ]
+            + ["-I" + path for path in include_paths]
+        )
+
         return " ".join(options)
 
     def src_extension(self):
@@ -280,6 +304,14 @@ def _build_include_directories(self) -> List[str]:
     def get_include_directories(self) -> List[str]:
         return self._build_include_directories()
 
+    def get_host_compiler_options(self) -> List[str]:
+        # a placeholder
+        raise NotImplementedError
+
+    def get_device_compiler_options(self) -> List[str]:
+        # a placeholder
+        raise NotImplementedError
+
     def _build_compile_options(self):
         if not FBCUDA.compile_options_:
             include_paths = self._build_include_directories()
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 0bbc71139..c8f790054 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -479,6 +479,28 @@ def get_include_directories(self) -> List[str]:
         """
         raise NotImplementedError
 
+    def get_host_compiler_options(self) -> List[str]:
+        """
+        Returns a list of options for the host compiler.
+
+        Raises
+        ------
+        NotImplementedError
+            Need to be implemented by subclass.
+        """
+        raise NotImplementedError
+
+    def get_device_compiler_options(self) -> List[str]:
+        """
+        Returns a list of options for the device compiler.
+
+        Raises
+        ------
+        NotImplementedError
+            Need to be implemented by subclass.
+        """
+        raise NotImplementedError
+
 
 def CUDA(template_path: str = CUTLASS_PATH, arch: str = "80", **kwargs):
     """Create a CUDA target."""

From 0481d1ba2c91dfc590f560f46330abd4bcfc8053 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Thu, 11 May 2023 00:10:11 -0700
Subject: [PATCH 504/638] speed up MultiScaleBlock (#681)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/681

Speed up multiscaleblock by removing unnecessary permutes that increase latency. No loss in correctness (still using 1e-4 numeric tolerance).

Also added option to enable/disable input/output shape conversion in batchnorm classes.

Reviewed By: terrychenism

Differential Revision: D45740293

fbshipit-source-id: 89e3a4bd82c73239702477ef8777ebb91096439e
---
 python/aitemplate/frontend/nn/batch_norm.py       | 15 ++++++++++-----
 .../frontend/nn/multiscale_attention.py           | 10 ++++------
 tests/unittest/ops/test_batch_norm.py             |  4 +++-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/python/aitemplate/frontend/nn/batch_norm.py b/python/aitemplate/frontend/nn/batch_norm.py
index 68e5dc36c..823954b4a 100644
--- a/python/aitemplate/frontend/nn/batch_norm.py
+++ b/python/aitemplate/frontend/nn/batch_norm.py
@@ -28,12 +28,14 @@ def __init__(
         num_features,
         eps=1e-5,
         dtype="float16",
+        permute_input_output=False,
         **kwargs,
     ):
         super().__init__()
         self.dim = (num_features,)
         self.dtype = dtype
         self.num_features = num_features
+        self.permute_input_output = permute_input_output
         self.eps = eps
         self.weight = Parameter(shape=self.dim, dtype=dtype)
         self.bias = Parameter(shape=self.dim, dtype=dtype)
@@ -46,7 +48,7 @@ def forward(self, *args):
         assert len(args) == 1
         x = args[0]
         self._check_input_dim(x)
-        x = self._convert_input(x)
+        x = self._convert_input(x) if self.permute_input_output else x
 
         x_normalized = elementwise(FuncEnum.DIV)(
             elementwise(FuncEnum.SUB)(x, self.running_mean.tensor()),
@@ -60,7 +62,7 @@ def forward(self, *args):
             self.bias.tensor(),
         )
 
-        y = self._convert_output(y)
+        y = self._convert_output(y) if self.permute_input_output else y
         return y
 
     def _check_input_dim(self):
@@ -79,9 +81,10 @@ def __init__(
         num_features,
         eps=1e-5,
         dtype="float16",
+        permute_input_output=False,
         **kwargs,
     ):
-        super().__init__(num_features, eps, dtype, **kwargs)
+        super().__init__(num_features, eps, dtype, permute_input_output, **kwargs)
 
     def _check_input_dim(self, x):
         if len(x.shape()) != 2 and len(x.shape()) != 3:
@@ -108,9 +111,10 @@ def __init__(
         num_features,
         eps=1e-5,
         dtype="float16",
+        permute_input_output=False,
         **kwargs,
     ):
-        super().__init__(num_features, eps, dtype, **kwargs)
+        super().__init__(num_features, eps, dtype, permute_input_output, **kwargs)
 
     def _check_input_dim(self, x):
         if len(x.shape()) != 4:
@@ -129,9 +133,10 @@ def __init__(
         num_features,
         eps=1e-5,
         dtype="float16",
+        permute_input_output=False,
         **kwargs,
     ):
-        super().__init__(num_features, eps, dtype, **kwargs)
+        super().__init__(num_features, eps, dtype, permute_input_output, **kwargs)
 
     def _check_input_dim(self, x):
         if len(x.shape()) != 5:
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 5e49d6a6d..7b431e4f7 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -220,9 +220,7 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
         # input shape: B, num_heads, seqlen, head_dim
         B, N, L, C = get_shape(tensor)
         T, H, W = thw_shape
-        tensor = ops.permute()(
-            ops.reshape()(tensor, [B * N, -1, H, W, C]), [0, 4, 1, 2, 3]
-        )
+        tensor = ops.reshape()(tensor, [B * N, -1, H, W, C])
 
         if self.norm_before_pool:
             # If use BN, we apply norm before pooling instead of after pooling.
@@ -230,7 +228,7 @@ def forward(self, tensor: Tensor, thw_shape: List[int]) -> Tuple[Tensor, List[in
             # We also empirically find that adding a GELU here is beneficial.
             tensor = ops.elementwise(FuncEnum.GELU)(tensor)
 
-        tensor = self.pool(ops.permute()(tensor, [0, 2, 3, 4, 1]))
+        tensor = self.pool(tensor)
 
         shape = get_shape(tensor)
         thw_shape = [shape[1], shape[2], shape[3]]
@@ -673,7 +671,7 @@ def __init__(
         super().__init__()
         self.dim = dim
         self.dim_out = dim_out
-        self.norm1 = norm_layer(dim)
+        self.norm1 = norm_layer(dim, permute_input_output=True)
         self.norm1_is_batchnorm_1d = isinstance(self.norm1, BatchNorm1d)
         kernel_skip = [s + 1 if s > 1 else s for s in stride_q]
         stride_skip = stride_q
@@ -698,7 +696,7 @@ def __init__(
             max_seq_len=seq_len,
         )
         self.drop_path = DropPath(droppath_rate) if droppath_rate > 0.0 else Identity()
-        self.norm2 = norm_layer(dim)
+        self.norm2 = norm_layer(dim, permute_input_output=True)
         self.norm2_is_batchnorm_1d = isinstance(self.norm2, BatchNorm1d)
         mlp_hidden_dim = int(dim * mlp_ratio)
         self.has_cls_embed = has_cls_embed
diff --git a/tests/unittest/ops/test_batch_norm.py b/tests/unittest/ops/test_batch_norm.py
index 76633b201..7a249e419 100644
--- a/tests/unittest/ops/test_batch_norm.py
+++ b/tests/unittest/ops/test_batch_norm.py
@@ -43,7 +43,9 @@ def _test_batchnorm(
         input_type="float16",
     ):
         pt_op = getattr(torch.nn, bn_op)(num_features).cuda().half().eval()
-        ait_op = getattr(batch_norm, bn_op)(num_features, eps=pt_op.eps)
+        ait_op = getattr(batch_norm, bn_op)(
+            num_features, eps=pt_op.eps, permute_input_output=True
+        )
         ait_op.name_parameter_tensor()
 
         pt_params = dict(pt_op.named_parameters())

From d1d66fb50bf27d4d012aace8795977171bf16da4 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Thu, 11 May 2023 12:29:56 -0700
Subject: [PATCH 505/638] Introduce a CMake compiler engine (#674)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/674

Reviewed By: ipiszy, chenyang78

Differential Revision: D45664779

fbshipit-source-id: 2220f1d30aa50df6b4dbbd11ea5ecf85b3079cf5
---
 docs/source/reference/env.rst                 |   2 +
 python/aitemplate/backend/builder.py          |   9 +-
 python/aitemplate/backend/cuda/__init__.py    |   8 +-
 .../aitemplate/backend/cuda/builder_cmake.py  | 476 ++++++++++++++++++
 python/aitemplate/backend/target.py           |   4 +
 python/aitemplate/utils/environ.py            |  10 +
 6 files changed, 507 insertions(+), 2 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/builder_cmake.py

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index f1055aeca..60a0a9c6d 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -24,6 +24,8 @@ Codegen
 
 **AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS**: Maximum number of parallel operators used in memory planning for simple multi-stream mode. Default value is "99999999" (basically, unlimited).
 
+**AIT_USE_CMAKE_COMPILATION**: (An experimental feature) If set to "1", then `cmake` will used instead of `make`. This allows to build AITemplate using MSVC Compiler + MSBuild on Windows, and it works for linux as well. This builder does not support many features (such as caching) yet. But it allows to generate a cmake project that can be loaded to a modern IDE. Default value is "0".
+
 Profiling
 ---------
 
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 930753403..2833f558f 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -40,6 +40,7 @@
 
 from aitemplate.utils.debug_settings import AITDebugSettings
 
+from aitemplate.utils.environ import is_cmake_compilation
 from aitemplate.utils.misc import is_debug, is_windows
 
 # pylint: disable=W0221,C0103
@@ -898,5 +899,11 @@ def make(
 
 
 def get_compile_engine():
-    compile_engine = Builder()
+    if is_cmake_compilation():
+        from aitemplate.backend.cuda import builder_cmake
+
+        compile_engine = builder_cmake.BuilderCMake()
+    else:
+        compile_engine = Builder()
+
     return compile_engine
diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py
index 6c1df2038..84bbb63c9 100644
--- a/python/aitemplate/backend/cuda/__init__.py
+++ b/python/aitemplate/backend/cuda/__init__.py
@@ -16,7 +16,13 @@
 """
 CUDA backend codegen functions.
 """
-from aitemplate.backend.cuda import cuda_common, lib_template, target_def, utils
+from aitemplate.backend.cuda import (
+    builder_cmake,
+    cuda_common,
+    lib_template,
+    target_def,
+    utils,
+)
 from aitemplate.backend.cuda.common import *
 from aitemplate.backend.cuda.conv2d import *
 from aitemplate.backend.cuda.conv3d import *
diff --git a/python/aitemplate/backend/cuda/builder_cmake.py b/python/aitemplate/backend/cuda/builder_cmake.py
new file mode 100644
index 000000000..d8c310e90
--- /dev/null
+++ b/python/aitemplate/backend/cuda/builder_cmake.py
@@ -0,0 +1,476 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+# A custom compile engine for CMake for CUDA backend. It can handle both Windows
+# and Linux use cases. Unlike the default make-based compiler engine, this one
+# is an experimental one. It was mostly needed to generate cpp/cu files for a
+# given model once and then do some custom debugging / research in an IDE.
+
+from __future__ import annotations
+
+import logging
+
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import jinja2
+
+from aitemplate.backend.target import Target
+
+from aitemplate.utils.debug_settings import AITDebugSettings
+
+from aitemplate.utils.misc import is_linux, is_windows, short_str
+
+
+# pylint: disable=W0221,C0103
+
+
+_LOGGER = logging.getLogger(__name__)
+_DEBUG_SETTINGS = AITDebugSettings()
+
+
+CMAKELISTS_TXT_TEMPLATE = """
+project({{CMAKE_PROJECT}})
+
+# idk which version is actually needed
+cmake_minimum_required(VERSION 3.20)
+
+set(SOURCE_FILES
+{{CMAKE_SOURCE_FILES}}
+)
+
+set(HEADER_FILES
+{{CMAKE_HEADER_FILES}}
+)
+
+set(STANDALONE_SOURCE_FILES
+{{CMAKE_STANDALONE_SOURCE_FILES}}
+)
+
+set(THIRD_PARTY_SOURCE_FILES
+{{CMAKE_THIRD_PARTY_SOURCE_FILES}}
+)
+
+set(THIRD_PARTY_HEADER_FILES
+{{CMAKE_THIRD_PARTY_HEADER_FILES}}
+)
+
+{% if is_linux %}
+# linux only
+add_custom_command(
+    OUTPUT {{CMAKE_CONSTANTS_OBJ}}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND ld -r -b binary -o ${CMAKE_BINARY_DIR}/{{CMAKE_CONSTANTS_OBJ}} {{CMAKE_CONSTANTS_BIN}}
+    COMMAND objcopy --rename-section .data=.lrodata,alloc,load,readonly,data,contents ${CMAKE_BINARY_DIR}/{{CMAKE_CONSTANTS_OBJ}} ${CMAKE_BINARY_DIR}/{{CMAKE_CONSTANTS_OBJ}}
+    DEPENDS {{CMAKE_CONSTANTS_BIN}}
+)
+{% endif %}
+
+enable_language(CUDA)
+set(CMAKE_CUDA_ARCHITECTURES {{CUDA_ARCH}})
+
+find_package(CUDAToolkit REQUIRED)
+
+{% if cuda_static %}
+set(CUDA_RUNTIME_LIBRARY Static)
+{% endif %}
+
+# this is needed to be able to pass \\ into command lline options
+set(WorkaroundCmakeCompileOptions {{CMAKE_COMPILE_OPTIONS}})
+
+# compile a supplemental library
+add_library(objlib OBJECT ${SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES} {% if is_linux %}{{CMAKE_CONSTANTS_OBJ}}{% endif %})
+target_include_directories(objlib PRIVATE ${HEADER_FILES} ${THIRD_PARTY_HEADER_FILES})
+target_compile_options(objlib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${WorkaroundCmakeCompileOptions}>)
+set_target_properties(objlib PROPERTIES LINKER_LANGUAGE CXX CXX_STANDARD 17)
+
+
+# compile model library
+add_library(model SHARED $<TARGET_OBJECTS:objlib> {% if is_linux %}{{CMAKE_CONSTANTS_OBJ}}{% endif %})
+target_include_directories(model PRIVATE ${HEADER_FILES} ${THIRD_PARTY_HEADER_FILES})
+target_link_libraries(model
+    {% if not cuda_static %}CUDA::cudart{% endif %}
+)
+target_compile_options(model PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${WorkaroundCmakeCompileOptions}>)
+set_target_properties(model PROPERTIES LINKER_LANGUAGE CXX CXX_STANDARD 17)
+
+
+{% if build_standalone %}
+# compile a standalone executable
+add_executable(standalone $<TARGET_OBJECTS:objlib> {% if is_linux %}{{CMAKE_CONSTANTS_OBJ}}{% endif %})
+target_sources(standalone PRIVATE ${STANDALONE_SOURCE_FILES})
+target_include_directories(standalone PRIVATE ${HEADER_FILES} ${THIRD_PARTY_HEADER_FILES})
+target_link_libraries(standalone
+    {% if not cuda_static %}CUDA::cudart{% endif %}
+)
+target_compile_options(standalone PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${WorkaroundCmakeCompileOptions}>)
+set_target_properties(standalone PROPERTIES LINKER_LANGUAGE CXX CXX_STANDARD 17)
+{% endif %}
+"""
+
+
+CMAKELISTS_TXT_PROFILER_TEMPLATE = """
+project({{CMAKE_PROJECT}})
+
+cmake_minimum_required(VERSION 3.20)
+
+set(SOURCE_FILES
+{{CMAKE_SOURCE_FILES}}
+)
+
+set(HEADER_FILES
+{{CMAKE_HEADER_FILES}}
+)
+
+set(THIRD_PARTY_SOURCE_FILES
+{{CMAKE_THIRD_PARTY_SOURCE_FILES}}
+)
+
+set(THIRD_PARTY_HEADER_FILES
+{{CMAKE_THIRD_PARTY_HEADER_FILES}}
+)
+
+enable_language(CUDA)
+set(CMAKE_CUDA_ARCHITECTURES {{CUDA_ARCH}})
+
+find_package(CUDAToolkit REQUIRED)
+
+{% if cuda_static %}
+set(CUDA_RUNTIME_LIBRARY Static)
+{% endif %}
+
+# this is needed to be able to pass \\ into command lline options
+set(WorkaroundCmakeCompileOptions {{CMAKE_COMPILE_OPTIONS}})
+
+# compile a binary
+add_executable(profiler ${SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES})
+target_include_directories(profiler PRIVATE ${HEADER_FILES} ${THIRD_PARTY_HEADER_FILES})
+target_compile_options(profiler PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${WorkaroundCmakeCompileOptions}>)
+set_target_properties(profiler PROPERTIES LINKER_LANGUAGE CXX CXX_STANDARD 17)
+"""
+
+
+def _run_cmd(command_line: str, timeout, custom_env: Optional[Dict[str, str]] = None):
+    _LOGGER.info(f"Executing {command_line}")
+    if custom_env is not None:
+        for key, value in custom_env.items():
+            _LOGGER.info(f"Extra environment var {key}={value}")
+        environ = {**os.environ, **custom_env}
+    else:
+        environ = os.environ.copy()
+    proc = subprocess.Popen(  # noqa: P204
+        command_line,
+        shell=True,
+        env=environ,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    try:
+        out, err = proc.communicate(timeout)
+    except subprocess.TimeoutExpired as e:
+        proc.kill()
+        out, err = proc.communicate()
+        raise e
+    finally:
+        stdout = out.decode()
+        stderr = err.decode()
+        if proc.returncode != 0:
+            _LOGGER.info(f"command stdout:\n\n{stdout}")
+            _LOGGER.info(f"command stderr:\n\n{stderr}")
+
+            raise RuntimeError("command has failed.")
+        else:
+            _LOGGER.debug(f"command stdout:\n\n{stdout}")
+            _LOGGER.debug(f"command stderr:\n\n{stderr}")
+
+
+def _render_path(path: Union[Path, str]) -> str:
+    # shlex.quote is designed for unit
+    p = Path(path).as_posix()
+    return '"' + str(p) + '"'
+
+
+def _files_as_str(filenames: Union[Path, str, List[Union[Path, str]]]) -> str:
+    if isinstance(filenames, str) or isinstance(filenames, Path):
+        return _render_path(filenames)
+    elif isinstance(filenames, list):
+        return "\n".join([f"\t{_render_path(filename)}" for filename in filenames])
+    else:
+        raise TypeError()
+
+
+class BuilderCMake:
+    """BuilderCMake is a module to compile generated source code
+    files into binary objects via CMake.
+    """
+
+    def __init__(self, n_cpus: int = -1, timeout: int = 180) -> None:
+        self._timeout = timeout
+        self._n_cpus = n_cpus
+
+    def _build_compile_options(self) -> List[str]:
+        # I don't want to move this functionality to target_def.py,
+        # because target_def.py is about GNU and GNU only.
+
+        device_compiler_options = Target.current().get_device_compiler_options()
+        if is_windows():
+            host_compiler_options = ["-Xcompiler=/Zc:__cplusplus"]
+        else:
+            host_compiler_options = [
+                f"-Xcompiler {opt}" if "=" in opt else f"-Xcompiler={opt}"
+                for opt in Target.current().get_host_compiler_options()
+            ]
+
+        compile_options = device_compiler_options + host_compiler_options
+
+        # this is a workaround around how cmake handles \ character
+        compile_options = [option.replace("\\,", "\\\\,") for option in compile_options]
+
+        # done
+        return compile_options
+
+    def make_profilers(self, generated_profilers, workdir: Path):
+        file_pairs = [f for gp in generated_profilers for f in gp]
+        if not file_pairs:
+            return
+
+        # todo: combine multiple profiler in a single CMake project?
+        cmake_template = jinja2.Template(CMAKELISTS_TXT_PROFILER_TEMPLATE)
+
+        include_directories = Target.current().get_include_directories()
+
+        compile_options = self._build_compile_options()
+
+        # go ahead
+        for source, profiler_binary in file_pairs:
+            test_name = short_str(str(source))
+
+            build_dir = Path(source).parent / test_name
+            build_dir.mkdir(exist_ok=True)
+
+            rendered = cmake_template.render(
+                CMAKE_PROJECT=test_name,
+                CMAKE_SOURCE_FILES=_files_as_str("../" + str(Path(source).name)),
+                # # todo: this can be done once we're able to track header files
+                # # properly
+                # CMAKE_HEADER_FILES=_files_as_str(
+                #     [Path(header).name for header in generated_sources.headers]
+                # ),
+                CMAKE_HEADER_FILES=_files_as_str([]),
+                CMAKE_THIRD_PARTY_HEADER_FILES=_files_as_str(include_directories),
+                CMAKE_THIRD_PARTY_SOURCE_FILES=_files_as_str([]),
+                CMAKE_COMPILE_OPTIONS=" ".join(compile_options),
+                CUDA_ARCH=Target.current()._arch,
+                cuda_static=is_windows(),
+                is_linux=is_linux(),
+            )
+
+            cmake_filename = build_dir / "CMakeLists.txt"
+            with cmake_filename.open("w") as f:
+                f.write(rendered)
+
+            # execute cmake
+            cmake_build_dir = build_dir / "build"
+            cmake_cmd = Target.current().cmake()
+            cmake_command_line = f"{_render_path(cmake_cmd)} -B {_render_path(cmake_build_dir)} -S {_render_path(build_dir)}"
+            _run_cmd(cmake_command_line, self._timeout)
+
+            # execute build system
+            if is_windows():
+                # use msbuild
+                msbuild_sln_filename = cmake_build_dir / f"{test_name}.sln"
+                msbuild_command_line = f"msbuild {_render_path(msbuild_sln_filename)}"
+                if self._n_cpus < 0:
+                    msbuild_command_line += " -m"
+                else:
+                    msbuild_command_line += f" -m:{self._n_cpus}"
+
+                if Target.current()._ndebug == 1:
+                    msbuild_command_line += " /property:Configuration=Release"
+                else:
+                    msbuild_command_line += " /property:Configuration=Debug"
+
+                _run_cmd(msbuild_command_line, self._timeout)
+
+                target_profiler_filename = profiler_binary
+                if Target.current()._ndebug == 1:
+                    compiled_profiler_filename = (
+                        cmake_build_dir / "Release" / "profiler.exe"
+                    )
+                    shutil.copy(compiled_profiler_filename, target_profiler_filename)
+                else:
+                    compiled_profiler_filename = (
+                        cmake_build_dir / "Debug" / "profiler.exe"
+                    )
+                    shutil.copy(compiled_profiler_filename, target_profiler_filename)
+            else:
+                # use make
+                make_cmd = Target.current().make()
+                make_command_line = f"{make_cmd} -C {_render_path(cmake_build_dir)}"
+                if self._n_cpus < 0:
+                    make_command_line += " -j"
+                else:
+                    make_command_line += f" -j{self._n_cpus}"
+
+                _run_cmd(make_command_line, self._timeout)
+
+                target_profiler_filename = profiler_binary
+                compiled_profiler_filename = cmake_build_dir / "profiler"
+                shutil.copy(compiled_profiler_filename, target_profiler_filename)
+
+    def make(
+        self,
+        file_pairs,
+        dll_name: str,
+        workdir: Path,
+        test_name: str,
+        debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
+        allow_cache=False,
+    ):
+        # Generates a CMakeLists.txt files and builds a model and a standalone project
+
+        if allow_cache:
+            _LOGGER.warning("Caching is not yet supported")
+
+        build_dir = Path(workdir) / test_name
+
+        cmake_template = jinja2.Template(CMAKELISTS_TXT_TEMPLATE)
+
+        include_directories = Target.current().get_include_directories()
+
+        compile_options = self._build_compile_options()
+
+        # check constants.bin
+        cmake_third_party_source_files = []
+        cmake_third_party_header_files = []
+        constants_bin_file = build_dir / "constants.bin"
+
+        if constants_bin_file.exists():
+            if is_windows():
+                resource_file = build_dir / "constants.rc"
+                with resource_file.open("w") as f:
+                    f.write('constant_bin CUSTOMDATA "constants.bin"')
+                cmake_third_party_source_files.append("constants.rc")
+
+                cmake_third_party_header_files.append("windll.h")
+                cmake_third_party_source_files.append("windll.cu")
+
+        # windows uses static CUDA build, linux uses dynamic one
+        rendered = cmake_template.render(
+            CMAKE_PROJECT=test_name,
+            CMAKE_SOURCE_FILES=_files_as_str(
+                [
+                    Path(source).name
+                    for (source, _) in file_pairs
+                    if Path(source).name not in ["standalone.cu", "windll.cu"]
+                ]
+            ),
+            # # todo: this can be done once we're able to track header files
+            # # properly
+            # CMAKE_HEADER_FILES=_files_as_str(
+            #     [Path(header).name for header in final_sources.headers]
+            # ),
+            CMAKE_HEADER_FILES=_files_as_str([]),
+            CMAKE_STANDALONE_SOURCE_FILES=_render_path("standalone.cu"),
+            CMAKE_THIRD_PARTY_SOURCE_FILES=_files_as_str(
+                cmake_third_party_source_files
+            ),
+            CMAKE_THIRD_PARTY_HEADER_FILES=_files_as_str(
+                include_directories + cmake_third_party_header_files
+            ),
+            CMAKE_CONSTANTS_BIN=_render_path("constants.bin"),
+            CMAKE_CONSTANTS_OBJ=_render_path("constants.obj"),
+            CMAKE_COMPILE_OPTIONS=" ".join(compile_options),
+            CUDA_ARCH=Target.current()._arch,
+            cuda_static=is_windows(),
+            is_linux=is_linux(),
+            build_standalone=debug_settings.gen_standalone,
+        )
+
+        cmake_filename = build_dir / "CMakeLists.txt"
+        with cmake_filename.open("w") as f:
+            f.write(rendered)
+
+        # execute cmake
+        cmake_build_dir = build_dir / "build"
+        cmake_cmd = Target.current().cmake()
+        cmake_command_line = f"{_render_path(cmake_cmd)} -B {_render_path(cmake_build_dir)} -S {_render_path(build_dir)}"
+        _run_cmd(cmake_command_line, self._timeout)
+
+        # execute build system
+        if is_windows():
+            # use msbuild
+            msbuild_sln_filename = cmake_build_dir / f"{test_name}.sln"
+            msbuild_command_line = f"msbuild {_render_path(msbuild_sln_filename)}"
+            if self._n_cpus < 0:
+                msbuild_command_line += " -m"
+            else:
+                msbuild_command_line += f" -m:{self._n_cpus}"
+
+            if Target.current()._ndebug == 1:
+                msbuild_command_line += " /property:Configuration=Release"
+            else:
+                msbuild_command_line += " /property:Configuration=Debug"
+
+            _run_cmd(msbuild_command_line, self._timeout)
+
+            # copy
+            target_library_filename = build_dir / dll_name
+            target_standalone_filename = build_dir / f"{Path(dll_name).stem}.exe"
+            if Target.current()._ndebug == 1:
+                # copy library to where it is supposed to be
+                compiled_library_filename = cmake_build_dir / "Release" / "model.dll"
+                shutil.copy(compiled_library_filename, target_library_filename)
+
+                if debug_settings.gen_standalone:
+                    # copy standalone file to where it is supposed to be
+                    compiled_standlone_filename = (
+                        cmake_build_dir / "Release" / "standalone.exe"
+                    )
+                    shutil.copy(compiled_standlone_filename, target_standalone_filename)
+            else:
+                # copy library to where it is supposed to be
+                compiled_library_filename = cmake_build_dir / "Debug" / "model.dll"
+                shutil.copy(compiled_library_filename, target_library_filename)
+
+                if debug_settings.gen_standalone:
+                    # copy standalone file to where it is supposed to be
+                    compiled_standlone_filename = (
+                        cmake_build_dir / "Debug" / "standalone.exe"
+                    )
+                    shutil.copy(compiled_standlone_filename, target_standalone_filename)
+        else:
+            # use make
+            make_cmd = Target.current().make()
+            make_command_line = f"{make_cmd} -C {_render_path(cmake_build_dir)}"
+            if self._n_cpus < 0:
+                make_command_line += " -j"
+            else:
+                make_command_line += f" -j{self._n_cpus}"
+
+            _run_cmd(make_command_line, self._timeout)
+
+            # copy library to where it is supposed to be
+            target_library_filename = build_dir / dll_name
+            compiled_library_filename = cmake_build_dir / "libmodel.so"
+            shutil.copy(compiled_library_filename, target_library_filename)
+
+            if debug_settings.gen_standalone:
+                # copy standalone file to where it is supposed to be
+                target_standalone_filename = build_dir / (Path(dll_name).stem)
+                compiled_standalone_filename = cmake_build_dir / "standalone"
+                shutil.copy(compiled_standalone_filename, target_standalone_filename)
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index c8f790054..46d5e3aa3 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -152,6 +152,10 @@ def make(self):
         make_path = shutil.which("make")
         return make_path if make_path is not None else "make"
 
+    def cmake(self):
+        cmake_path = shutil.which("cmake")
+        return cmake_path if cmake_path is not None else "cmake"
+
     def compile_cmd(self, executable: bool = False):
         """Compile command string template for this target.
 
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index fd692ad90..c2c8d57e8 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -207,3 +207,13 @@ def multistream_max_mem_parallel_ops() -> int:
     """
     # unlimited by default
     return int(os.getenv("AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS", "99999999"))
+
+
+def is_cmake_compilation() -> bool:
+    """
+    When enabled, compiles the model via invoking CMake rather than
+    invoking make directly.
+    """
+
+    # todo: replace with more builders?
+    return os.getenv("AIT_USE_CMAKE_COMPILATION", "0") == "1"

From f49b6e4fe0def03a5d2c8be6a0ae2d503cd22510 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 11 May 2023 16:48:38 -0700
Subject: [PATCH 506/638] update ci (#589)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/589

Reviewed By: aakhundov

Differential Revision: D45787819

Pulled By: ipiszy

fbshipit-source-id: 056817bce99bb54361c4d2b9653b964f1464ebd4
---
 .github/workflows/rocm_ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 4e0f2c92c..24445baa6 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -1,10 +1,12 @@
 name: ROCM_CI
 
 on:
-  push:
+  pull_request:
+     types: [labeled]
 
 jobs:
   build:
+    if: contains(github.event.label.name, 'rocm')
     runs-on: rocm
 
     steps:

From 87124bd472e0e43d82aade0d03886556cf29f503 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@meta.com>
Date: Thu, 11 May 2023 16:58:00 -0700
Subject: [PATCH 507/638] Fix codegen condition check issue

Summary: Should check whether key present in dict, not whether dict is empty.

Reviewed By: muchulee8

Differential Revision: D45759517

fbshipit-source-id: 325c7916eade66fa374f102308c2bbb74dfe53f7
---
 python/aitemplate/backend/codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 11c1b5741..bf3f5fe3a 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -602,7 +602,7 @@ def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None:
             self.set_inputs.append(set_value(name, view._attrs["name"]))
             return
         is_view = view is not None
-        if is_view and len(self.param_name_to_ptr_idx) > 0:
+        if is_view and (view._attrs["name"] in self.param_name_to_ptr_idx):
             ptr_idx = self.param_name_to_ptr_idx[view._attrs["name"]]
             self.set_inputs.append(set_value(name, view._attrs["name"]))
         else:

From 761574332dd63e14bf383c5cb5d03584814e2e66 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 12 May 2023 07:15:27 -0700
Subject: [PATCH 508/638] Fix the profiler bug in bmm_xxx_add SM90 kernels
 (#682)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/682

As `bmm_xxx_add` ops are forming the problem arguments for profiling on their own (in contrast to the `bmm_xxx` ops relying on `bmm_common.default_gen_profiler`), I've missed generation and passing of the CUTLASS 3.x problem arguments for their profiler generation. This has resulted in a really clandestine bug in the profiler that hasn't manifested itself in the CI (because profilers are compiled, which worked, but aren't run, which didn't).

Reviewed By: ipiszy, wushirong

Differential Revision: D45751431

fbshipit-source-id: e30c1e6b327a30812b719835f5b5cfb4827ce1be
---
 .../cuda/gemm_universal/bmm_xxx_add.py        | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
index 69f02599e..350e170ed 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
@@ -25,6 +25,7 @@
 
 
 from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.common import gemm_common
 from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 from aitemplate.backend.cuda.gemm_universal.bmm_xxx import _get_problem_args, get_config
@@ -108,14 +109,33 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
             mm_info=default_mm_info,
         )
 
+        backend_spec = CUDASpec()
+        elem_input_type = backend_spec.dtype_to_lib_type(
+            func_attrs["inputs"][0]._attrs["dtype"]
+        )
+        elem_output_type = backend_spec.dtype_to_lib_type(
+            func_attrs["outputs"][0]._attrs["dtype"]
+        )
+
+        # CUTLASS 3.x problem args require explicit I/O pointer types (not void*)
+        default_mm_info.a_ptr = f"({elem_input_type}*)({default_mm_info.a_ptr})"
+        default_mm_info.b_ptr = f"({elem_input_type}*)({default_mm_info.b_ptr})"
+        default_mm_info.bias_ptr = f"({elem_output_type}*)({default_mm_info.bias_ptr})"
+        default_mm_info.c_ptr = f"({elem_output_type}*)({default_mm_info.c_ptr})"
+
+        problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+            mm_info=default_mm_info,
+        )
+
         return bmm_common.gen_profiler(
-            func_attrs,
-            workdir,
-            profiler_filename,
-            dim_info_dict,
-            common.SRC_TEMPLATE,
-            problem_args,
-            args_parser,
+            func_attrs=func_attrs,
+            workdir=workdir,
+            profiler_filename=profiler_filename,
+            dim_info_dict=dim_info_dict,
+            src_template=common.SRC_TEMPLATE,
+            problem_args=problem_args,
+            problem_args_cutlass_3x=problem_args_cutlass_3x,
+            args_parser=args_parser,
         )
 
     return gen_profiler

From 7cdc6f5a533c9089200f4c885ebd39c502ef4ee3 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Fri, 12 May 2023 09:19:59 -0700
Subject: [PATCH 509/638] refactor op level benchmark (#660)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/660

1.refactor the op registration by extracting the ait,trt,pt2 lowering part. Reduce the new code workload for adding new op

2.In profile.py L96, we enabled g[0:] to capture the 1st kernel. It is used to fill in cache for the following computation. But in some op, it seems to be the only countable kernel. For ex.

split
{F975474668}

dynamic slice
{F975475144}

while for gemm_rcr, it is the case that this kernel used to fill value
{F975475533}

3.Report is here https://docs.google.com/document/d/1ngMa0BlETHv6d9CnpYQXsEjXwklyY6G0mbuKgXaVj24/edit#bookmark=id.5rt46dr52l04

4.The datasheet of the op benchmark is here: https://fburl.com/gsheet/hp0ekphn

Next step:
1. Consider the model with fusion. We need to capture the do_optimization=True in shape capture
2. We did not test LN since the params are missing.

Reviewed By: ipiszy, chenyang78

Differential Revision: D45345181

fbshipit-source-id: 47efcd1530ea6ca781c3fec650dc1ddde73349cd
---
 python/aitemplate/testing/profile.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/testing/profile.py b/python/aitemplate/testing/profile.py
index 67e8dad27..03be8ec95 100644
--- a/python/aitemplate/testing/profile.py
+++ b/python/aitemplate/testing/profile.py
@@ -59,6 +59,9 @@ def _f():
     """
     if n_iter <= 0:
         return [], []
+    # warmup
+    for _ in range(5):
+        func()
     with torch.profiler.profile(
         activities=[torch.profiler.ProfilerActivity.CUDA],
         record_shapes=True,
@@ -69,7 +72,7 @@ def _f():
     # log the invoked kernels
     results = prof.key_averages().table(
         sort_by="self_cuda_time_total",
-        max_name_column_width=None,
+        max_name_column_width=120,
         row_limit=-1,
     )
     logger.info(results)
@@ -90,7 +93,7 @@ def _f():
     n_groups = len(sorted_events) // n_iter
     # in each group (corresponding to a profiling iteration),
     # skip measuring the first kernel, which is the l2 cache flush
-    event_groups = [g[1:] for g in zip(*([iter(sorted_events)] * n_groups))]
+    event_groups = [g[0:] for g in zip(*([iter(sorted_events)] * n_groups))]
     logger.info(
         f"First kernel sequence: {list(map(itemgetter('name'), event_groups[0]))}"
     )

From 1ffbc1f89530975922dcdcba5f2af61f2a81abd3 Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Fri, 12 May 2023 11:23:05 -0700
Subject: [PATCH 510/638] Add attribute to allow tensor not to participate in
 constant folding (#686)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/686

Add attribute to allow tensor to skip constant folding.
We need this attribute to overwrite logics within constant folding, since not
all tensors that have no input are necessary constants, the constant-ness could
be embedded in the operation that generates the constant.

Reviewed By: wushirong

Differential Revision: D45790578

fbshipit-source-id: 60d64c73bf16382ea291e411c63cae3341381986
---
 python/aitemplate/compiler/base.py            |  4 +++
 python/aitemplate/compiler/ops/tensor/full.py |  7 ++--
 .../compiler/transform/constant_folding.py    |  2 +-
 tests/unittest/ops/test_full.py               | 36 +++++++++++++++++++
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 1dda1f03a..087300328 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -744,6 +744,7 @@ def __init__(
         value: Any = None,
         is_view_of: Any = None,
         is_internal_constant: bool = False,
+        skip_constant_folding: bool = False,
         check_nan_and_inf: bool = False,
         check_outputs: bool = False,
     ) -> None:
@@ -776,6 +777,8 @@ def __init__(
             Whether this Tensor is a view of another Tensor.
         is_internal_constant: bool, optional
             Whether this constant tensor could be modified.
+        skip_constant_folding: bool, optional
+            Whether this tensor participates in constant folding.
         check_nan_and_inf : bool, optional
             Whether or not to check this tensor is nan or inf during runtime.
         check_outputs : bool, optional
@@ -791,6 +794,7 @@ def __init__(
         self._attrs["is_input"] = is_input
         self._attrs["is_param"] = False
         self._attrs["is_internal_constant"] = is_internal_constant
+        self._attrs["skip_constant_folding"] = skip_constant_folding
 
         # True if this is an internal tensor that aliases an output through
         # a view. Set up in mark_param_tensor
diff --git a/python/aitemplate/compiler/ops/tensor/full.py b/python/aitemplate/compiler/ops/tensor/full.py
index 2f2ae515b..9d6addd3f 100644
--- a/python/aitemplate/compiler/ops/tensor/full.py
+++ b/python/aitemplate/compiler/ops/tensor/full.py
@@ -17,7 +17,7 @@
 
 from aitemplate import backend
 from aitemplate.backend import registry
-from aitemplate.compiler.base import IntVar, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
 from aitemplate.compiler.dtype import get_dtype_size
 
 
@@ -52,6 +52,7 @@ def __call__(
         if not isinstance(shape, (list, tuple)):
             raise TypeError(f"shape must be List[IntVar], but got {shape}.")
         shape = list(shape)
+        static_shape = all([isinstance(s, (int, IntImm)) for s in shape])
 
         if not isinstance(fill_value, (int, float)):
             raise TypeError(f"fill_value must be a scalar, but got {fill_value}.")
@@ -64,7 +65,9 @@ def __call__(
         self._attrs["fill_value"] = fill_value
 
         self._set_depth()
-        output = Tensor(shape, src_ops={self}, dtype=dtype)
+        output = Tensor(
+            shape, src_ops={self}, dtype=dtype, skip_constant_folding=not static_shape
+        )
         self._attrs["outputs"] = [output]
         return output
 
diff --git a/python/aitemplate/compiler/transform/constant_folding.py b/python/aitemplate/compiler/transform/constant_folding.py
index e7a69f725..e0dc7e6fc 100644
--- a/python/aitemplate/compiler/transform/constant_folding.py
+++ b/python/aitemplate/compiler/transform/constant_folding.py
@@ -143,7 +143,7 @@ def _extract_foldable_subgraph(
     subgraph = []
 
     for tensor in sorted_graph:
-        if tensor._attrs["is_input"]:
+        if tensor._attrs["is_input"] or tensor._attrs["skip_constant_folding"]:
             continue
 
         name = tensor._attrs["name"]
diff --git a/tests/unittest/ops/test_full.py b/tests/unittest/ops/test_full.py
index 4c33d542a..7759410bc 100644
--- a/tests/unittest/ops/test_full.py
+++ b/tests/unittest/ops/test_full.py
@@ -103,6 +103,42 @@ def test_full(self, i, shape, fill_value, dtype):
             test_name=f"test_full_{i}",
         )
 
+    def test_const_full_with_copy(self, dtype="float16"):
+        shape = [IntVar([1, 128]), 10]
+        full = ops.full()(shape, 1.0, dtype)
+        add = ops.elementwise(FuncEnum.ADD)(full, 1.0)
+        Y = ops.flatten()(add)
+
+        X = Tensor(
+            shape=shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        flattened_X = ops.flatten()(X)
+        Z = ops.elementwise(FuncEnum.ADD)(flattened_X, Y)
+        Z._attrs["name"] = "Z"
+        Z._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Z, target, "./tmp", "const_full_with_copy")
+
+        if isinstance(shape[0], IntVar):
+            shapes = [[val] + shape[1:] for val in shape[0]._attrs["values"]]
+        else:
+            shapes = [shape]
+
+        for shape in shapes:
+            x_pt = get_random_torch_tensor(shape, dtype=dtype)
+            tmp_pt = x_pt + 1.0 + 1.0
+            z_pt = torch.flatten(tmp_pt)
+
+            z = torch.empty_like(z_pt)
+
+            module.run_with_tensors([x_pt], [z])
+            torch.testing.assert_close(z, z_pt, atol=1e-2, rtol=1e-2)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From e429979744094c038b449e357ba47ab4d064480a Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Sat, 13 May 2023 22:39:49 -0700
Subject: [PATCH 511/638] Fix a tricky fused_elementwise alignment issue.
 (#693)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/693

fused_elementwise's max_alignment needs to be adjusted based on input broadcast
properly. Consider the following case:
A: [1, 2, 1]
B: [2, 1, 2]
To calculate A + B, max_alignment needs to be set to 2 instead of 4.
The current implementation doesn't consider the case when broadcast dim is in the
middle. This diff tries to fix it.

This diff also contains a minor refactoring to compute input alignments.

Reviewed By: frank-wei

Differential Revision: D45839276

fbshipit-source-id: 2cfdb29f819304df5cef85bc35dde941af62cc66
---
 .../test/converters/test_ait_binary_op.py     |   1 +
 .../backend/common/elementwise_common.py      | 283 +++++++++---------
 python/aitemplate/compiler/compiler.py        |   1 +
 tests/unittest/compiler/test_fuse_expand.py   |   2 +-
 .../ops/test_fused_elementwise_broadcast.py   |  35 ++-
 5 files changed, 169 insertions(+), 153 deletions(-)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
index e75f1b861..1a13daaa9 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_binary_op.py
@@ -30,6 +30,7 @@
     (torch.randn(2, 3, 4), torch.randn(1, 1, 1)),
     (torch.randn(1, 3, 4), torch.randn(5, 1, 4)),
     (torch.randn(1), torch.randn(2, 3, 4)),
+    (torch.randn(3, 2, 1), torch.randn(1, 2, 2)),
 ]
 
 
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index adbc2d2ad..fd3843e96 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -295,12 +295,19 @@ class FusedElementwiseMetaData:
     original_inputs: List[Tensor]
     original_outputs: List[Tensor]
 
-    # holding the largest read type for the fused kernel
+    # Holds the largest read type for the fused kernel.
+    # This is equivalent to write_t in the current implementation.
+    # This is used to determine N_ELEMENTS_PER_THREAD.
     max_read_t: str
-    # holding the read_t for each fused input
+
+    # Holds the read_t for each input of the fused kernel.
+    # Note: read_types is only used for a small optimization for last_dim input broadcasting.
+    # General mixed read_types are not supported (which requires multiple get_strided_inputs calls).
     read_types: List[str]
+
     op_t: str
     data_t: str
+
     input_broadcast_sizes: List[List[IntVar]]
     dynamic_dims: List[IntVar]
     sub_funcs: List[ElementwiseMetaData]
@@ -488,111 +495,74 @@ def _is_jagged_shape(shape: List[IntVar]) -> bool:
     return len(shape) > 0 and isinstance(shape[0], JaggedIntVar)
 
 
-def _get_alignments(
-    extended_input_shapes: List[List[IntVar]],
-    input_broadcast_sizes: List[int],
-    num_rightmost_non_broadcast_dims: List[int],
-    rightmost_broadcast_dim: int,
-    output_rank: int,
-    dtype: str,
-) -> Tuple[List[int], List[int]]:
-    """
-    A helper function that returns two alignments lists, where the first list
-    is the alignments for inputs and the second one contains the alignments
-    for those non-broadcasted inputs
-    """
-    # We track alignment for each input
-    alignments = []
-    non_broadcast_alignments = []
-    for extended_input_shape, input_broadcast_sz, num_rightmost_non_br_dims in zip(
-        extended_input_shapes,
-        input_broadcast_sizes,
-        num_rightmost_non_broadcast_dims,
-    ):
-        # make sure we are not going to wrongfully generate an larger vector read type
-        if input_broadcast_sz is None and rightmost_broadcast_dim is not None:
-            num_rightmost_non_br_dims = output_rank - rightmost_broadcast_dim
-        num_elements_for_alignments = shape_utils.get_num_rightmost_static_elements(
-            extended_input_shape, num_rightmost_non_br_dims
-        )
-        if num_elements_for_alignments > 1 or input_broadcast_sz is None:
-            non_broadcast_alignments.append(num_elements_for_alignments)
-        alignment = alignment_utils.find_max_alignment(
-            num_elements_for_alignments, dtype
-        )
-        alignments.append(alignment)
-    return (alignments, non_broadcast_alignments)
-
-
-def _refine_alignments_with_tensor_accessors(
-    non_broadcast_alignments: List[int],
-    alignments: List[int],
-    dtype: str,
+def _get_input_alignments(
     input_accessors: List[TensorAccessor],
-    output_accessors: List[TensorAccessor],
+    input_broadcast_sizes: List[Optional[List[IntVar]]],
+    max_num_rightmost_dims_considered_for_alignments: int,
+    output_shape: List[IntVar],
+    dtype: str,
+    global_max_alignment: int,
 ) -> List[int]:
-    """
-    This helper function returns the valid alignments based on the constrains
-    imposed on non_broadcast_alignments, input_accessors and output_accessors.
-    """
-    max_non_broadcast_alignment = None
-    if len(non_broadcast_alignments) > 1:
-        max_non_broadcast_alignment = alignment_utils.find_max_alignment_from(
-            non_broadcast_alignments, dtype
-        )
-    alignments = [
-        align
-        if align == 1 or max_non_broadcast_alignment is None
-        else max_non_broadcast_alignment
-        for align in alignments
-    ]
-    max_input_accessor_alignment = (
-        tensor_accessor_codegen.find_max_alignment_for_accessors(dtype, input_accessors)
-    )
-    # Note that we use the same alignment for accessing inputs and outputs, although
-    # they may have different alignment requirements. We may lose perf a little bit,
-    # but reduce the complexity of our jinja template. We can do some perf
-    # experiments later to determine if we want to chase more perf gains.
-    max_accessor_alignment = tensor_accessor_codegen.find_max_alignment(
-        max_input_accessor_alignment, dtype, output_accessors
-    )
-    # all alignments are capped by the max_accessor_alignment
+    # Broadcasts need to be handled carefully.
+    # We have a hacky optimization for last-dim broadcasting:
+    # The element is read once, and broadcasted multiple times.
+    # However, we don't support reading more than 1 element for broadcasting.
+    #
+    # Consider following cases:
+    # X1[2, 1, 1]
+    # X2[1, 1, 2]
+    # X3[1, 2, 1]
+    # We do not support global_max_alignment 8 (reading two X1, X2, X3 per thread).
+    # We only support global_max_alignment 2, so that we make sure each thread
+    # reads at most 1 element for broadcasting.
+
+    # Update global_max_alignment based on broadcasting rules,
+    # and find max_alignments for each input.
+    alignments = [None] * len(input_broadcast_sizes)
+    for i, input_broadcast_size in enumerate(input_broadcast_sizes):
+        if input_broadcast_size is not None:
+            prev_is_broadcast = None
+            for j in range(max_num_rightmost_dims_considered_for_alignments):
+                is_broadcast = input_broadcast_size[-j - 1] != output_shape[-j - 1]
+                if (
+                    not is_broadcast
+                    and input_broadcast_size[-j - 1] == IntImm(1)
+                    and prev_is_broadcast is None
+                ):
+                    # Skip last-dim 1s if the output shape is the same.
+                    is_broadcast = None
+                if prev_is_broadcast is None:
+                    prev_is_broadcast = is_broadcast
+                    if is_broadcast:
+                        # Update alignment for last-dim broadcasting cases.
+                        alignments[i] = 1
+                elif prev_is_broadcast != is_broadcast:
+                    alignment = alignment_utils.find_max_alignment(
+                        shape_utils.get_num_rightmost_static_elements(output_shape, j),
+                        dtype,
+                    )
+                    # Update global_max_alignment when is_broadcast is not the
+                    # same as prev_is_broadcast.
+                    global_max_alignment = min(global_max_alignment, alignment)
+                    if not prev_is_broadcast:
+                        # Update alignment for mid-dim broadcasting cases.
+                        alignments[i] = alignment
+                    break
+
+    # Cap alignments based on global_max_alignment.
     alignments = [
-        align if align <= max_accessor_alignment else max_accessor_alignment
-        for align in alignments
+        min(alignment, global_max_alignment)
+        if alignment is not None
+        else global_max_alignment
+        for alignment in alignments
     ]
     return alignments
 
 
-def _get_alignments_and_sizes_and_dtype(
-    inputs: List[Tensor],
-    input_accessors: List[TensorAccessor],
-    output_accessors: List[TensorAccessor],
-    backend_spec: BackendSpec,
-    mixed_jagged_dense_indexing: bool,
-    output_volume: Optional[List[IntVar]],
-) -> Tuple[List[int], List[List[IntVar]], str]:
-    """
-    Returns Tuple(alignments, input_broadcast_sizes, dtype)
-    """
-    # Handle input broadcast.
-    dtype = inputs[0]._attrs["dtype"]
-
-    # Determine the rightmost broadcast dim among all inputs.
-    # This value prevents us from wrongfully generating a larger alignment
-    # for cases such as X1[2, 2], X2[2, 1], where [2, 2] and [2, 1] are shapes.
-    # If we do not have a rightmost_broadcast_dim guard, we would
-    # end up generating alignment = 4 for X1. But, this would be wrong, because
-    # in the kernel, we might have a single effective thread that loads four
-    # elements from X1 and only one element from X2. Potentially, we could
-    # make this thread load two elements from X2, but it would make address
-    # indexing templates fairly complicated in general. Let's make simple
-    # cases work and extend it later if we had to, e.g. we saw large perf penalty
-    # without doing it.
-    rightmost_broadcast_dim = None
-    num_rightmost_non_broadcast_dims = []
+def _get_input_broadcast_sizes(
+    input_accessors, output_accessors, mixed_jagged_dense_indexing, output_volume
+) -> List[Optional[List[IntVar]]]:
     input_broadcast_sizes = []
-    extended_input_shapes = []
     for input_accessor in input_accessors:
         input_shape = input_accessor.original_shapes
 
@@ -620,55 +590,77 @@ def _get_alignments_and_sizes_and_dtype(
                 )
             )
         extended_input_shape = list(input_shape)
-        num_rightmost_non_br_dims = len(output_shape)
         if input_shape == output_shape:
             input_broadcast_sizes.append(None)
-        else:
+        if input_shape != output_shape:
             extended_input_shape = [IntImm(1)] * len(output_shape)
             extended_input_shape[len(output_shape) - len(input_shape) :] = input_shape
             input_broadcast_sizes.append(extended_input_shape)
-            for i in reversed(range(len(extended_input_shape))):
-                if extended_input_shape[i] != output_shape[i]:
-                    num_rightmost_non_br_dims -= i + 1
-                    if rightmost_broadcast_dim is None:
-                        rightmost_broadcast_dim = i
-                    else:
-                        rightmost_broadcast_dim = max(i, rightmost_broadcast_dim)
-                    break
+    return input_broadcast_sizes
 
-        if mixed_jagged_dense_indexing:
-            # in the mixed jagged / dense indexing case, the number of the
-            # rightmost non-broadcated static dimensions of the dense inputs
-            # to be considered for vectorization can't be larger than the
-            # number of the jagged output's inner dimensions (i.e., the
-            # dimensions following the JaggedIntVar). otherwise, there may
-            # be an overlap with the jagged dimensions, in which case the
-            # vectorization can break.
-            jagged_output_shape = output_accessors[0].original_shapes
-            num_inner_dims_in_jagged_shape = len(jagged_output_shape) - 1
-            num_rightmost_non_br_dims = min(
-                num_rightmost_non_br_dims,
-                num_inner_dims_in_jagged_shape,
-            )
 
-        extended_input_shapes.append(extended_input_shape)
-        num_rightmost_non_broadcast_dims.append(num_rightmost_non_br_dims)
-    (alignments, non_broadcast_alignments) = _get_alignments(
-        extended_input_shapes,
-        input_broadcast_sizes,
-        num_rightmost_non_broadcast_dims,
-        rightmost_broadcast_dim,
-        len(output_shape),
-        dtype,
+def _get_alignments_and_broadcast_sizes(
+    dtype: str,
+    input_accessors: List[TensorAccessor],
+    output_accessors: List[TensorAccessor],
+    mixed_jagged_dense_indexing: bool,
+    output_volume: Optional[List[IntVar]],
+) -> Tuple[List[int], List[Optional[List[IntVar]]]]:
+    """
+    Returns Tuple(input_alignments, input_broadcast_sizes)
+    """
+    # Handle input broadcast.
+    output_shape = output_accessors[0].original_shapes
+
+    input_broadcast_sizes = _get_input_broadcast_sizes(
+        input_accessors, output_accessors, mixed_jagged_dense_indexing, output_volume
     )
-    alignments = _refine_alignments_with_tensor_accessors(
-        non_broadcast_alignments,
-        alignments,
-        dtype,
+
+    # In the mixed jagged / dense indexing case, the number of the
+    # rightmost non-broadcated static dimensions of the dense inputs
+    # to be considered for vectorization can't be larger than the
+    # number of the jagged output's inner dimensions (i.e., the
+    # dimensions following the JaggedIntVar). Otherwise, there may
+    # be an overlap with the jagged dimensions, in which case the
+    # vectorization can break.
+    max_num_rightmost_dims_considered_for_alignments = (
+        len(output_shape) - 1 if mixed_jagged_dense_indexing else len(output_shape)
+    )
+
+    # We do not support mixed input / output alignments except for last dim broadcast.
+    # The global_max_alignment is the min value of:
+    #     1) input shape alignments (with input broadcast in consideration);
+    #     2) input tensor accessor alignments (strides, offsets);
+    #     3) output shape alignments;
+    #     4) output tensor accessor alignments (strides, offsets);
+
+    # Now calculate global_max_alignment based on 2), 3) and 4) first.
+    global_max_alignment = min(
+        alignment_utils.find_max_alignment(
+            shape_utils.get_num_rightmost_static_elements(
+                output_shape, max_num_rightmost_dims_considered_for_alignments
+            ),
+            dtype,
+        ),
+        tensor_accessor_codegen.find_max_alignment_for_accessors(
+            dtype, input_accessors
+        ),
+        tensor_accessor_codegen.find_max_alignment_for_accessors(
+            dtype, output_accessors
+        ),
+    )
+
+    # Now calculate global_max_alignment based on 1).
+    # Also calculate input alignments.
+    input_alignments = _get_input_alignments(
         input_accessors,
-        output_accessors,
+        input_broadcast_sizes,
+        max_num_rightmost_dims_considered_for_alignments,
+        output_shape,
+        dtype,
+        global_max_alignment,
     )
-    return alignments, input_broadcast_sizes, dtype
+    return input_alignments, input_broadcast_sizes
 
 
 def get_dynamic_dims(*shapes: List[List[IntVar]]) -> List[IntVar]:
@@ -698,7 +690,7 @@ def get_dynamic_dims(*shapes: List[List[IntVar]]) -> List[IntVar]:
 def _get_mixed_jagged_dense_config(
     input_accessors: List[TensorAccessor],
     output_accessors: List[TensorAccessor],
-) -> Tuple[bool, List[IntVar]]:
+) -> Tuple[bool, List[IntVar], bool]:
     """
     Returns Tuple(
         mixed_jagged_dense_indexing,
@@ -764,25 +756,26 @@ def _parse_func_metadata(
         input_accessors,
         output_accessors,
     )
-    alignments, input_broadcast_sizes, dtype = _get_alignments_and_sizes_and_dtype(
-        inputs,
+    dtype = inputs[0]._attrs["dtype"]
+    (input_alignments, input_broadcast_sizes) = _get_alignments_and_broadcast_sizes(
+        dtype,
         input_accessors,
         output_accessors,
-        backend_spec,
         mixed_jagged_dense_indexing,
         output_volume,
     )
     max_read_type = backend_spec.get_elementwise_read_backend_type(
-        max(alignments), dtype
+        max(input_alignments), dtype
     )
     read_types = [
         backend_spec.get_elementwise_read_backend_type(alignment, dtype)
-        for alignment in alignments
+        for alignment in input_alignments
     ]
+
     # It's safe to use the maximum alignment for determine op_type, because
     # smaller inputs (i.e. those being broadcasted) will be placed into a
     # larger tmp variable which is valid for selected op_type.
-    op_type = backend_spec.get_elementwise_op_backend_type(max(alignments), dtype)
+    op_type = backend_spec.get_elementwise_op_backend_type(max(input_alignments), dtype)
     data_type = backend_spec.dtype_to_backend_type(dtype)
     sub_func_metadata, op_type = _get_sub_func_metadata(
         ops, data_type, op_type, backend_spec
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index dc29d6aee..38d27d73b 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -213,6 +213,7 @@ def compile_model(
     # arguments (even if we put quotes around it)!!
     test_name = test_name.replace(",", "_")
     test_dir = os.path.join(workdir, test_name)
+    _LOGGER.info(f"Start to compile AIT model. {test_dir=}")
     if profile_dir is None:
         profile_dir = workdir
 
diff --git a/tests/unittest/compiler/test_fuse_expand.py b/tests/unittest/compiler/test_fuse_expand.py
index 64d8ac73e..655c944be 100644
--- a/tests/unittest/compiler/test_fuse_expand.py
+++ b/tests/unittest/compiler/test_fuse_expand.py
@@ -55,7 +55,7 @@ def test_fuse_expand_elementwise(self, exact_match: bool, name: str):
 
                 z_ait = torch.empty_like(z_pt)
                 mod.run_with_tensors({"x": x_pt, "y": y_pt}, {"z": z_ait})
-                self.assertTrue(torch.equal(z_ait, z_pt))
+                self.assertTrue(torch.equal(z_ait, z_pt), f"{z_ait=}\n{z_pt=}")
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/ops/test_fused_elementwise_broadcast.py b/tests/unittest/ops/test_fused_elementwise_broadcast.py
index 65a91b404..d0432bc46 100644
--- a/tests/unittest/ops/test_fused_elementwise_broadcast.py
+++ b/tests/unittest/ops/test_fused_elementwise_broadcast.py
@@ -755,28 +755,31 @@ def _test_vectorization(
         expected_read_types,
         expected_op_t,
         expected_data_t,
+        ns=None,
         dtype="float16",
     ):
         """
-        Test add(add(X0(B, M0, K0), X1(B, M1, K1)), X2(B, M2, K2))
+        Test add(add(X0(B, M0, K0, N0), X1(B, M1, K1, N1)), X2(B, M2, K2, N2))
         """
 
         batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, name="batch_dim")
+        if ns is None:
+            ns = [1, 1, 1]
 
         X0 = Tensor(
-            shape=[batch_dim, IntImm(ms[0]), IntImm(ks[0])],
+            shape=[batch_dim, IntImm(ms[0]), IntImm(ks[0]), IntImm(ns[0])],
             dtype=dtype,
             name="input0",
             is_input=True,
         )
         X1 = Tensor(
-            shape=[batch_dim, IntImm(ms[1]), IntImm(ks[1])],
+            shape=[batch_dim, IntImm(ms[1]), IntImm(ks[1]), IntImm(ns[1])],
             dtype=dtype,
             name="input1",
             is_input=True,
         )
         X2 = Tensor(
-            shape=[batch_dim, IntImm(ms[2]), IntImm(ks[2])],
+            shape=[batch_dim, IntImm(ms[2]), IntImm(ks[2]), IntImm(ns[2])],
             dtype=dtype,
             name="input2",
             is_input=True,
@@ -801,9 +804,15 @@ def _test_vectorization(
         self.assertEqual(sorted_ops[0]._attrs["data_t"], expected_data_t)
 
         for batch_size in batch_sizes:
-            x0_pt = get_random_torch_tensor([batch_size, ms[0], ks[0]], dtype=dtype)
-            x1_pt = get_random_torch_tensor([batch_size, ms[1], ks[1]], dtype=dtype)
-            x2_pt = get_random_torch_tensor([batch_size, ms[2], ks[2]], dtype=dtype)
+            x0_pt = get_random_torch_tensor(
+                [batch_size, ms[0], ks[0], ns[0]], dtype=dtype
+            )
+            x1_pt = get_random_torch_tensor(
+                [batch_size, ms[1], ks[1], ns[1]], dtype=dtype
+            )
+            x2_pt = get_random_torch_tensor(
+                [batch_size, ms[2], ks[2], ns[2]], dtype=dtype
+            )
             output_pt = (x0_pt + x1_pt) + x2_pt
             inputs = {"input0": x0_pt, "input1": x1_pt, "input2": x2_pt}
             output = torch.empty_like(output_pt)
@@ -888,6 +897,18 @@ def test_vectorization_fp16(self):
             expected_data_t="half",
             dtype="float16",
         )
+        self._test_vectorization(
+            batch_sizes=[1],
+            ms=[4, 1, 1],
+            ks=[2, 2, 1],
+            ns=[1, 2, 1],
+            test_name="fused_elementwise_vectorization_fp16_8",
+            expected_max_read_t="uint",
+            expected_read_types=["half", "uint", "half"],
+            expected_op_t="half2",
+            expected_data_t="half",
+            dtype="float16",
+        )
 
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_vectorization_fp32(self):

From d468fbc624b94ef32dae7df7505104fca2de447f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Mon, 15 May 2023 18:25:34 +0800
Subject: [PATCH 512/638] add ops

---
 3rdparty/composable_kernel                    |   2 +-
 .../backend/common/tensor/identity_common.py  |   4 +-
 python/aitemplate/backend/main_templates.py   |   2 +-
 .../backend/rocm/attention/__init__.py        |  18 +
 .../rocm/attention/mem_eff_attention.py       | 361 ++++++++++++++++
 .../backend/rocm/tensor/__init__.py           |   3 +
 .../aitemplate/backend/rocm/tensor/expand.py  | 308 ++++++++++++++
 .../rocm/tensor/expand_static_shape.py        | 386 ++++++++++++++++++
 python/aitemplate/backend/rocm/tensor/full.py | 148 +++++++
 .../backend/rocm/tensor/identity.py           |  10 +-
 .../ops/attention/mem_eff_attention.py        |   9 +-
 .../gemm_universal/bmm_softmax_bmm_permute.py |   3 +-
 python/aitemplate/frontend/nn/attention.py    |  32 +-
 static/include/rocm_device_functions.h        |   2 +-
 14 files changed, 1264 insertions(+), 24 deletions(-)
 create mode 100644 python/aitemplate/backend/rocm/attention/__init__.py
 create mode 100644 python/aitemplate/backend/rocm/attention/mem_eff_attention.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/expand.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/expand_static_shape.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/full.py

diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel
index 52abc2f37..db49fc437 160000
--- a/3rdparty/composable_kernel
+++ b/3rdparty/composable_kernel
@@ -1 +1 @@
-Subproject commit 52abc2f37112d49f85f31aa343a14bd92a83b07c
+Subproject commit db49fc43797f80be1db2399dcd1a082dbf447736
diff --git a/python/aitemplate/backend/common/tensor/identity_common.py b/python/aitemplate/backend/common/tensor/identity_common.py
index a5fcebf6a..34c75fec5 100644
--- a/python/aitemplate/backend/common/tensor/identity_common.py
+++ b/python/aitemplate/backend/common/tensor/identity_common.py
@@ -26,6 +26,7 @@
 
 FUNC_TEMPLATE = jinja2.Template(
     """
+{{extra_headers}}
 {{func_signature}}
 {
 {% if is_copy %}
@@ -61,7 +62,7 @@
 )
 
 
-def gen_function(func_attrs: Dict[str, Any], backend_spec) -> str:
+def gen_function(func_attrs: Dict[str, Any], backend_spec, extra_headers='') -> str:
     """Generates function.
 
     Parameters
@@ -81,6 +82,7 @@ def gen_function(func_attrs: Dict[str, Any], backend_spec) -> str:
     is_copy = func_attrs["outputs"][0]._attrs["is_output"]
 
     return FUNC_TEMPLATE.render(
+        extra_headers=extra_headers,
         func_signature=FUNC_SIGNATURE.render(
             func_name=func_attrs["name"],
             prefix=backend_spec.prefix,
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index aaa32c25a..7003ab3be 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -249,7 +249,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
           CreateEvent(&call_start);
           CreateEvent(&call_end);
         }
-        for (auto& [call_start, call_end]: call_events) {
+        for (auto& [call_start, call_end] : call_events) {
           DeviceMemset(L2CacheSlab, 0x73, L2SizeInBytes);
           EventRecord(call_start, stream);
             {{ func }}
diff --git a/python/aitemplate/backend/rocm/attention/__init__.py b/python/aitemplate/backend/rocm/attention/__init__.py
new file mode 100644
index 000000000..0be5c075d
--- /dev/null
+++ b/python/aitemplate/backend/rocm/attention/__init__.py
@@ -0,0 +1,18 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from aitemplate.backend.rocm.attention import mem_eff_attention
+
+__all__ = ["mem_eff_attention"]
diff --git a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
new file mode 100644
index 000000000..f726598c9
--- /dev/null
+++ b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
@@ -0,0 +1,361 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+attention kernel codegen for ROCM.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+# pylint: disable=C0301
+
+FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int*>({{name}})")
+
+FUNC_CALL_FP32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<float*>({{name}})")
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "logging.h"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using MaskingSpecialization = ck::tensor_operation::device::MaskingSpecialization;
+
+static constexpr auto MaskingSpec_default = 
+    MaskingSpecialization::MaskDisabled;
+static constexpr auto MaskingSpec_causal =
+    MaskingSpecialization::MaskOutUpperTriangle;
+
+using F32 = float;
+using InputType = {{elem_input_type}};
+
+using ADataType        = InputType;
+using B0DataType       = InputType;
+using B1DataType       = InputType;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = InputType;
+using GemmDataType     = InputType;
+using Acc0BiasDataType = ck::Tuple<>;
+using Acc1BiasDataType = ck::Tuple<>;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 1;
+static constexpr ck::index_t NumDimN = 1;
+static constexpr ck::index_t NumDimK = 1;
+static constexpr ck::index_t NumDimO = 1;
+
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+static constexpr auto TensorSpecA  = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;
+
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        NumDimO,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        Acc0BiasDataType,
+        Acc1BiasDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        Acc0ElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        TensorSpecA,
+        TensorSpecB0,
+        TensorSpecB1,
+        TensorSpecC,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+{% if is_causal %}
+        MaskingSpec_causal
+{% else %}
+        MaskingSpec_default
+{% endif %}
+    >;   
+
+{{func_signature}}
+{
+
+    bool input_permute = false;
+    bool output_permute = true;
+    
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{softmax_scale};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    std::vector<typename DeviceGemmInstance::ProblemDesc> problem_descs;
+
+    const char* q_ptr = reinterpret_cast<const char*>(q);
+    const char* k_ptr = reinterpret_cast<const char*>(k);
+    const char* v_ptr = reinterpret_cast<const char*>(v);
+    char* output_ptr = reinterpret_cast<char*>(output);
+
+    std::vector<const void*> q_ptrs;
+    std::vector<const void*> k_ptrs;
+    std::vector<const void*> v_ptrs;
+    std::vector<void*> output_ptrs;
+
+    for(int64_t i = 0; i < batch_size ; i++){
+        int M = seqlens[i];
+        int N = seqlens[i];
+        int K = head_dim;
+        int O = head_dim;
+        int G0 = 1;
+        int G1 = num_heads;
+
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides =
+            input_permute
+                ? std::vector<ck::index_t>{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K]
+                : std::vector<ck::index_t>{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K]
+
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides =
+            input_permute
+                ? std::vector<ck::index_t>{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K]
+                : std::vector<ck::index_t>{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K]
+
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides =
+            input_permute
+                ? std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O]
+                : std::vector<ck::index_t>{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O]
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides =
+            output_permute
+                ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
+                : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
+
+        problem_descs.push_back({a_gs_ms_ks_lengths,
+                                 a_gs_ms_ks_strides,
+                                 b0_gs_ns_ks_lengths,
+                                 b0_gs_ns_ks_strides,
+                                 b1_gs_os_ns_lengths,
+                                 b1_gs_os_ns_strides,
+                                 c_gs_ms_os_lengths,
+                                 c_gs_ms_os_strides,
+                                 {},   // acc0_biases_gs_ms_ns_lengths
+                                 {},   // acc0_biases_gs_ms_ns_strides
+                                 {},   // acc1_biases_gs_ms_os_lengths
+                                 {}}); // acc1_biases_gs_ms_os_strides
+
+        auto offset = i * K * G1 * M * sizeof(InputType);
+        q_ptrs.push_back(reinterpret_cast<const void*>(q_ptr + offset));                               
+        k_ptrs.push_back(reinterpret_cast<const void*>(k_ptr + offset));                               
+        v_ptrs.push_back(reinterpret_cast<const void*>(v_ptr + offset));                               
+        output_ptrs.push_back(reinterpret_cast<void*>(output_ptr + offset));                               
+    }
+
+    // do GEMM
+    auto gemm     = DeviceGemmInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(q_ptrs,
+                                      k_ptrs,
+                                      v_ptrs,
+                                      output_ptrs,
+                                      {}, // p_acc0_biases
+                                      {}, // p_acc1_biases
+                                      problem_descs,
+                                      a_element_op,
+                                      b0_element_op,
+                                      acc0_element_op,
+                                      b1_element_op,
+                                      c_element_op);
+
+    // specify workspace for problem_desc
+
+    gemm.SetWorkSpacePointer(&argument, workspace);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        LOG(FATAL) << "wrong! " << gemm.GetTypeString() << " with the specified compilation parameters does not support this Embedding problem.";
+    }
+
+    invoker.Run(argument, StreamConfig{stream, false});
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   const void* q,
+                   const void* k,
+                   const void* v,
+                   const int* seqlens,
+                   int64_t batch_size,
+                   int num_heads,
+                   int head_dim,
+                   float softmax_scale,
+                   void* workspace,
+                   hipStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}   {{output}}, {{q}}, {{k}}, {{v}}, {{seqlens}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{num_heads}},
+{{indent}}    {{head_dim}},
+{{indent}}    {{softmax_scale}},
+{{indent}}    global_workspace_,
+{{indent}}    stream /* default stream */
+{{indent}});
+    """
+)
+
+
+@registry.reg("rocm.mem_eff_attention.gen_function")
+def mem_eff_attention_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    backend_spec = ROCMSpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    is_causal = func_attrs["causal"]
+    return FUNC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        is_causal=is_causal,
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+    )
+
+
+@registry.reg("rocm.mem_eff_attention.func_decl")
+def mem_eff_attention_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("rocm.mem_eff_attention.func_call")
+def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) in [4, 5]
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+
+    seqlens_name = FUNC_CALL_INT32_PARAM_TEMPLATE.render(
+        name=func_attrs["inputs"][3]._attrs["name"]
+    )
+
+    q = func_attrs["inputs"][0]
+
+    batch_size = q.shape()[0]._attrs["name"]
+
+    num_heads = q._attrs["shape"][1]._attrs["values"][0]
+    head_dim = q._attrs["shape"][3]._attrs["values"][0]
+    
+    softmax_scale = head_dim ** (-0.5)
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        q=q_name,
+        k=k_name,
+        v=v_name,
+        seqlens=seqlens_name,
+        batch_size=batch_size,
+        num_heads=num_heads,
+        head_dim=head_dim,
+        softmax_scale=softmax_scale,
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 2993c648c..4c2e528be 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -21,6 +21,7 @@
     concatenate,
     concatenate_tanh,
     dynamic_slice,
+    full,
     identity,
     permute021,
     permute0213,
@@ -30,4 +31,6 @@
     slice_scatter,
     split,
     topk,
+    expand,
+    expand_static_shape,
 )
diff --git a/python/aitemplate/backend/rocm/tensor/expand.py b/python/aitemplate/backend/rocm/tensor/expand.py
new file mode 100644
index 000000000..64ccc0a29
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/expand.py
@@ -0,0 +1,308 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+expand op general CUDA implementation with complete dynamic shape support
+"""
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.rocm.tensor import expand_static_shape  # noqa: F401
+
+
+@registry.reg("rocm.expand.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    if func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]:
+        func = registry.get("rocm.expand.static.func_decl")
+        return func(func_attrs)
+    x = func_attrs["inputs"][0]
+    func_name = func_attrs["name"]
+    rocm_spec: ROCMSpec = ROCMSpec()
+    index_type = rocm_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
+    dt = x.dtype()
+    dtype = rocm_spec.dtype_to_backend_dtype.get(dt, None)
+    assert (
+        dtype is not None
+    ), f"ROCM implementation does not support dtype {x.dtype()} (yet)"
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,  # name of the function
+        dtype=dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float ))
+        index_type=index_type,
+    )
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  const void* src,
+  const {{index_type}}* input_dims,
+  const {{index_type}} input_rank,
+  void* dst,
+  {{index_type}}* output_dims, // written to ( runtime shape inference )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types,
+  hipStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+#include "logging.h"
+
+using bfloat16 = hip_bfloat16;
+
+{% if index_type=="int64_t" %}
+#define DIM_TYPE_ADD 0l
+#define DIM_TYPE_EXPAND 1l
+#define DIM_TYPE_KEEP 2l
+
+#define MAX_THREADS_PER_BLOCK 1024l
+#define MAX_BLOCKS 65535l
+#define MAX_X_BLOCKS 2147483647l
+{% else %}
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024
+#define MAX_BLOCKS 65535
+#define MAX_X_BLOCKS 2147483647
+{% endif %}
+
+// integer ceil division
+#define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
+#define INT_MIN(a,b) ((a) < (b)? (a) : (b))
+
+/**
+ * Sequential write expand kernel.
+ * This kernel deals with the general case ( strided copy ).
+ * It relies heavily on L2 cache for scattered read optimization and
+ * writes sequentially.
+ */
+__global__ void {{func_name}}_sequential_write_kernel(
+
+  const {{dtype}}* src, // source tensor
+  {{dtype}}* dst, // destination tensor
+  const {{index_type}} dst_numel // number of elements in dst
+  {% for i in range(output_rank) %}
+        ,const {{index_type}} output_strides_{{i}} // Stride for writing dimension {{i}} to dst
+        ,const {{index_type}} read_strides_{{i}} // Stride for reading dimension {{i}} from src
+  {% endfor %}
+  ) {
+    // determine our range of elements to read
+    {{index_type}} write_idx = threadIdx.x + blockDim.x * blockIdx.x;
+    const {{index_type}} grid_stride = gridDim.x*blockDim.x;
+    for (;write_idx<dst_numel;write_idx += grid_stride) {
+      {{index_type}} read_idx = 0;
+      {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
+      {% for i in range(output_rank) %}
+          read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
+          remaining_idx %= output_strides_{{i}};
+      {% endfor %}
+      dst[write_idx] = src[read_idx];
+    }
+}
+
+/**
+ * Expand Operator entry point with support for dynamic shapes
+ */
+void {{func_name}} (
+  const void* src, // input tensor
+  const {{index_type}}* input_dims, // input dimensions ( passed by value )
+  const {{index_type}} input_rank,
+  void* dst, // output tensor
+  {{index_type}}* output_dims, // output dimensions ( passed by value )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types, // Output dim types ( length=output_rank ). 2 = keep dimension, 1 = expand dimension, 0 = add dimension
+  hipStream_t stream)
+{
+  // Calculate number of input elements
+  {{index_type}} input_numel = 1;
+  {{index_type}} i;
+  for (i = 0; i < input_rank; ++i) {
+    input_numel *= input_dims[i];
+  }
+  if (input_numel==0) {
+    return;
+  }
+  {{index_type}} input_dim_pos = 0;
+
+  // Calculate number of output dimensions
+  {{index_type}} output_numel = 1;
+  for (i = 0; i < output_rank; ++i) {
+    output_numel *= output_dims[i];
+  }
+  if (output_numel==0) {
+    return;
+  }
+  // Determine stride for each input dimension
+  {{index_type}} input_strides[input_rank];
+  input_strides[input_rank-1] = 1;
+  for (i=input_rank-2;i>=0;--i) {
+    input_strides[i] = input_strides[i+1]*input_dims[i+1];
+  }
+  // Determine stride for each output dimension
+  {{index_type}} output_strides[output_rank];
+  output_strides[output_rank-1] = 1;
+  for (i=output_rank-2;i>=0;--i) {
+    output_strides[i] = output_strides[i+1]*(output_dims[i+1]);
+  }
+
+  // Determine read strides for each output dimension
+  // (0 for expand or add dims, otherwise the stride of
+  // of the corresponding input dim)
+  {{index_type}} read_strides[output_rank];
+
+  input_dim_pos = 0;
+  for (i = 0; i < output_rank; ++i) {
+    {{index_type}} dim_type =  output_dim_types[i];
+    if (dim_type == DIM_TYPE_KEEP ) { // keep
+      read_strides[i] = input_strides[input_dim_pos++];
+    } else {
+      read_strides[i] = 0;
+      if (dim_type==DIM_TYPE_EXPAND) {
+        input_dim_pos++;
+      }
+    }
+  }
+  assert(input_dim_pos==input_rank);
+
+  // Calculating tail dimension in order to determine whether we can do sequential batching
+  {{index_type}} tail_dim = 1;
+  for (i = output_rank-1; i >= 0; --i) {
+      if (output_dim_types[i]!=DIM_TYPE_KEEP) {
+         break;
+      }
+      tail_dim *= output_dims[i];
+  }
+
+  // determine CUDA kernel grid layout. Tuning numbers determined experimentally
+  {{index_type}} thread_size_x = INT_MIN(output_numel, MAX_THREADS_PER_BLOCK); // more threads per block maximize L1 cache utilization
+  {{index_type}} block_size_x = INT_MIN(INT_CEIL_DIV(output_numel, thread_size_x), 4096l ); //
+
+  // for very large dimensions, we rely on grid-stride loop and save the block launch overhead
+  dim3 dimGrid(block_size_x, 1, 1);
+  dim3 dimBlock(thread_size_x, 1, 1);
+  {{func_name}}_sequential_write_kernel<<<dimGrid,dimBlock,0,stream>>>(
+      static_cast<const {{dtype}}*>(src),
+      static_cast<{{dtype}}*>(dst),
+      output_numel
+      {% for i in range(output_rank) %}
+        ,output_strides[{{i}}]
+        ,read_strides[{{i}}]
+      {% endfor %}
+  );
+}
+"""
+)
+
+
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    rocm_spec: ROCMSpec = ROCMSpec()
+    dtype = rocm_spec.dtype_to_backend_dtype.get(x.dtype(), None)
+    assert (
+        dtype is not None
+    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    index_type = rocm_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
+    assert index_type is not None
+
+    input_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in xshape]
+    )
+    output_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in yshape]
+    )
+    input_rank = len(xshape)
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "input_dims": input_dims,  # list of input dimensions (as string of comma-separated variable names )
+        "output_dims": output_dims,  # output dimensions (as string of comma-separated variable names)
+        "input_rank": input_rank,  # number of input dimensions
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float ))
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+    }
+
+
+@registry.reg("rocm.expand.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+    else:
+        func = registry.get("rocm.expand.static.gen_function")
+        return func(func_attrs)
+
+
+@registry.reg("rocm.expand.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+    else:
+        func = registry.get("rocm.expand.static.func_call")
+        return func(func_attrs, indent)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}const {{index_type}} input_dims[] = { {{input_dims}} };
+    {{indent}}{{index_type}} output_dims[] = { {{output_dims}} };
+    {{indent}}const {{index_type}} output_dim_types[] = { {{dim_types}} };
+    {{indent}}{{func_name}}(
+    {{indent}}    {{src}},
+    {{indent}}    input_dims,
+    {{indent}}    {{input_rank}},
+    {{indent}}    {{dst}},
+    {{indent}}    output_dims,
+    {{indent}}    {{output_rank}},
+    {{indent}}    output_dim_types,
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
new file mode 100644
index 000000000..20fe5fa3f
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
@@ -0,0 +1,386 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Specialized and optimized CUDA kernel declarations for the `expand` operator
+dealing with the most common case that the input and target shapes are known at compile time,
+with the possible exception of leading dimensions.
+
+"""
+
+import math
+import os
+from itertools import accumulate
+from operator import mul
+from typing import Any, Dict, List
+
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
+
+
+@registry.reg("rocm.expand.static.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    return FUNC_DECL_TEMPLATE.render(create_template_args(func_attrs))
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  hipStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+#include "logging.h"
+
+
+using bfloat16 = hip_bfloat16;
+
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024l
+// integer ceil division
+#define INT_CEIL_DIV(a, b) (((a) + (b)-1) / (b))
+
+// Maximum amount of shared memory that the repeat copy kernel(s) should use.
+// (used within repeat.cuh, included below )
+// Note: 44kb is sufficient in this case to fully utilize the GPU parallelism
+#define SHM_MAX 1024 * 44
+
+{{custom_libs}}
+
+/**
+ * Get read base offset (e.g. excluding tail offset) in the middle part, given a write offset
+ * into the middle part
+ */
+__forceinline__ __device__ {{index_type}} {{func_name}}_get_read_offset(const {{index_type}} write_offset) {
+    {{index_type}} read_idx = 0;
+    {{index_type}} remaining_write_idx = write_offset; // assert < {{mid_size*tail_size}} ( i.e. < mid_size*tail_size)
+    {% for i in range(head_dim_count, head_dim_count+mid_dim_count-1) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+        remaining_write_idx %= {{output_strides[i]}}l;
+    {% endfor %}
+    {% for i in range(head_dim_count+mid_dim_count-1, head_dim_count+mid_dim_count) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+    {% endfor %}
+    return read_idx;
+}
+
+/**
+ *  Copies tail elements from a contiguous source memory region into a contiguous target memory region
+ *  Using a grid-stride loop and the vectorized dtype
+ *
+ * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ */
+__forceinline__ __device__ void tail_copy(
+        const {{dtype}} * const src, // base src tensor memory pointer
+        const {{index_type}} read_offset, // base offset into src, via {{dtype}}-typed indexing
+        {{dtype}} * const dst,  // base destination tensor memory pointer
+        const {{index_type}} write_offset, // Base offset into dst via {{dtype}}-typed indexing
+        const {{index_type}} block_thread_index,
+        const {{index_type}} block_thread_count,
+        const {{index_type}} copy_numel
+    ) {
+    for ({{index_type}} i=block_thread_index;i<copy_numel;i+=block_thread_count) {
+        dst[write_offset+i] = src[read_offset+i];
+    }
+}
+
+/**
+ * Implement the "middle" part of the kernel, dealing with strided reads/writes.
+ * Also utilizes grid-stride loop for efficiency and flexibility
+ * see
+ * * https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ * * https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#coalesced-access-to-global-memory
+ * * and https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#strided-accesses
+ * for a more detailed explanation of the reasons for the choice of this specific form.
+ *
+ * Performance notes:
+ *
+ * It is critical to calculate the block_thread_index passed to tail_copy(..) based on
+ * the x-dimension of the launch grid, in order to benefit from Warp memory access coalescing.
+ *
+ */
+__global__ void expand_strided_copy(
+
+  const {{dtype}}* const src, // source tensor
+  {{dtype}}* const dst // destination tensor
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_offset = (blockDim.y * blockIdx.y + threadIdx.y) * {{tail_size}}l;
+    const {{index_type}} read_offset = {{func_name}}_get_read_offset(write_offset);
+    const {{index_type}} grid_size_x = gridDim.x*blockDim.x;
+    const {{index_type}} grid_size_y = gridDim.y*blockDim.y;
+    const {{index_type}} step_size_y = grid_size_y * {{tail_size}}l;
+    const {{index_type}} thread_idx_x = blockDim.x * blockIdx.x + threadIdx.x;
+    for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=step_size_y) {
+        tail_copy(src, read_offset, dst, write_offset, thread_idx_x, grid_size_x, {{tail_size}}l);
+    }
+
+}
+
+/**
+ * Expand Operator entry point, optimized for static shapes. Only the head dimension may be dynamic.
+ */
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  hipStream_t stream)
+{
+  if ((({{mid_size*tail_size}})==0) || (head_size==0)) {
+    return;
+  }
+  {% if mid_dim_count>0 %}
+  // we have middle dimensions which involve non-contiguous reads
+  // so we need to invoke the middle kernel
+  dim3 dimGrid({{grid_blocks_x}}, {{grid_blocks_y}});
+  dim3 dimBlock({{grid_threads_x}}, {{grid_threads_y}});
+  expand_strided_copy<<<dimGrid,dimBlock,0,stream>>>(src, dst);
+  if (head_size>1l) {
+     // now repeat copy what we already built once, multiple times into the rest of the output tensor
+     cuda_repeat_head(dst, {{mid_size*tail_size}}l*sizeof({{dtype}}),head_size-1, stream);
+  }
+  {% else %}
+    // we have no middle dimensions, so strided copy is unneccessary.
+    // All we need to do is repeatedly copy the source multiple times
+    // repeat the entire thing a dynamic number of times ( e.g. head_size times )
+    cuda_repeat_src(src, dst, {{mid_size*tail_size}}l*sizeof({{dtype}}), head_size, stream);
+  {% endif %}
+}
+"""
+)
+
+
+def _ceil(num: float) -> int:
+    return int(math.ceil(num))
+
+
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    # Efficient vectorized & buffered repeat copy implementation,
+    # even for odd shapes
+    custom_libs = Target.current().get_custom_libs(
+        os.path.dirname(__file__), "repeat.cuh"
+    )
+    rocm_spec = ROCMSpec()
+    dtype = rocm_spec.dtype_to_backend_dtype[x.dtype()]
+    assert (
+        dtype is not None
+    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+    dtype2 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 2, None)
+    dtype4 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 4, None)
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    dim_types: List[ExpandDimensionType] = func_attrs["dim_types"]
+    index_type = "int64_t"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in xshape
+    ), "All input shapes need to be fixed"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in yshape
+    ), "All output shapes need to be fixed"
+
+    # Calculate number of times we can repeatedly copy the entire result, based on how many add, expand and singleton dimensions
+    # we have at the start
+    head_size_lower = 1  # Number of times we can batch-repeat the entire result in an efficient batch-copying manner
+    head_size_upper = 1
+    head_dim_count = 0  # Number of head dimensions
+
+    for dim_type, dim in zip(func_attrs["dim_types"], yshape):
+        if dim_type == ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        head_size_lower *= dim.lower_bound()
+        head_size_upper *= dim.upper_bound()
+        head_dim_count += 1
+
+    # Create a symbolic term for calculating head size ( e.g. repeat count )
+    if head_size_lower == head_size_upper:
+        head_size_symbolic = f"{head_size_upper}l"
+    else:
+        head_size_symbolic = "*".join(
+            [
+                f"static_cast<{index_type}>(" + dim._attrs["name"] + ")"
+                for dim in yshape[:head_dim_count]
+            ]
+        )
+
+    # Calculate number of tail elements, e.g. number of elements we can batch-copy in the inner loop
+    # via effective sequential reads & writes
+    tail_dim_count = 0  # number of tail dimensions
+    tail_size = 1  # Number of the elements in all these  tail dimensions
+    for dim_type, dim in reversed(
+        list(zip(dim_types[head_dim_count:], yshape[head_dim_count:]))
+    ):
+        if dim_type != ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        tail_dim_count += 1
+        tail_size *= dim.lower_bound()
+
+    input_strides = list(
+        reversed(
+            list(accumulate([1] + [d.lower_bound() for d in reversed(xshape)], mul))
+        )
+    )
+    output_strides = list(
+        reversed(
+            list(
+                accumulate(
+                    [1] + [d.lower_bound() for d in reversed(yshape[head_dim_count:])],
+                    mul,
+                )
+            )
+        )
+    )
+
+    output_numel = output_strides[
+        0
+    ]  # this does not include the number of elements obtained from head repetitions
+    # since we have excluded head dimensions above
+    input_numel = input_strides[0]
+    if tail_size > 0:
+        mid_size = output_numel // tail_size
+    else:
+        mid_size = 0
+    mid_dim_count = len(yshape) - tail_dim_count - head_dim_count
+    if input_numel > 0:
+        mid_expansion_rate = mid_size * tail_size // input_numel
+    else:
+        mid_expansion_rate = 1
+
+    # remove the first dimension, which is the total number of elements
+    # and prepend the head_dims with stride 0
+    output_strides = [0] * head_dim_count + output_strides[1:]
+    input_strides = input_strides[1:]
+
+    input_stride_pos = 0
+    read_strides = [0] * len(yshape)
+    for i in range(len(yshape)):
+        if dim_types[i] == ExpandDimensionType.ADD_DIM:
+            continue
+        if dim_types[i] == ExpandDimensionType.KEEP_DIM:
+            read_strides[i] = input_strides[input_stride_pos]
+        # For keep dim, read stride remains at zero
+        input_stride_pos += 1
+
+    assert input_stride_pos == len(
+        xshape
+    ), "Incorrect number of keep and expand dims. Something went wrong."
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+
+    # If tail size is aligned to 2 or 4 elements, we can vectorize reads/writes
+    # Note: Further vectorization not easily possible, given that it could happen that
+    # the read offset and the write offset can get different alignments within the expand op
+    #
+    if (tail_size % 4 == 0) and (dtype4 is not None):
+        dtype = dtype4
+        tail_size = tail_size // 4
+        output_strides = [s // 4 for s in output_strides]
+        read_strides = [s // 4 for s in read_strides]
+    elif tail_size % 2 == 0:
+        dtype = dtype2
+        tail_size = tail_size // 2
+        output_strides = [s // 2 for s in output_strides]
+        read_strides = [s // 2 for s in read_strides]
+
+    grid_blocks_x = 1
+    grid_threads_x = max(1, min(tail_size, 64))
+    max_y_threads = 1024 // grid_threads_x  # guaranteed to be >= 1
+    grid_threads_y = max(
+        1, min(max_y_threads, mid_size)
+    )  # so that  mid_grid_threads_x*max_x_threads <= 1024
+    grid_blocks_y = _ceil(mid_size / grid_threads_y)
+
+    if dtype == "bfloat16":
+        # bfloat16 is not available in model-generated.h as a type,
+        # so we can either just declare the input to be void*
+        # or  just use the fact that we don't care about how to interpret the value
+        # and just treat it like every other 16 bit type.
+        dtype = "half"
+
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "output_strides": output_strides,  # list of output stride values
+        "read_strides": read_strides,  # list of read stride values
+        "tail_dim_count": tail_dim_count,  # number of tail dimensions
+        "tail_size": tail_size,  # number of elements in all these tail dimensions
+        "head_dim_count": head_dim_count,  # number of head dimensions
+        "head_size": head_size_symbolic,  # number of elements in all these head dimensions
+        "mid_dim_count": mid_dim_count,
+        "mid_size": mid_size,
+        "mid_expansion_rate": mid_expansion_rate,  # How many times do we read the input for the middle
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid CUDA C type like float )
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+        "grid_blocks_y": grid_blocks_y,  # number of y grid blocks in the strided copy kernel
+        "grid_blocks_x": grid_blocks_x,  # number of x grid blocks in the strided copy kernel
+        "grid_threads_y": grid_threads_y,  # number of y threads per grid block in the strided copy kernel
+        "grid_threads_x": grid_threads_x,  # number of x threads per grid block in the strided copy kernel
+        "custom_libs": custom_libs,  # custom library path, e.g. path to repeat.cuh
+    }
+
+
+@registry.reg("rocm.expand.static.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+
+
+@registry.reg("rocm.expand.static.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}{{func_name}}(
+    {{indent}}    static_cast<{{dtype}}*>({{src}}),
+    {{indent}}    static_cast<{{dtype}}*>({{dst}}),
+    {{indent}}    {{head_size}},
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/rocm/tensor/full.py b/python/aitemplate/backend/rocm/tensor/full.py
new file mode 100644
index 000000000..da462a93f
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/full.py
@@ -0,0 +1,148 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+
+HIP_HEADER_FILES = """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+"""
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+
+const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
+    """
+)
+
+
+FUNC_DECL = jinja2.Template(
+    """
+void invoke_{{func_name}}(
+    void*,  /* output */
+    {{prefix}}Stream_t  /* stream */
+);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}invoke_{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    stream
+{{indent}});
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+__global__  void full(
+    {{read_type}}* output,
+    {{index_type}} num_elements
+) {
+  const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx * N_ELEMENTS_PER_THREAD >= num_elements) {
+    return;
+  }
+
+  {{read_type}} tmp;
+  {{data_type}}* p = reinterpret_cast<{{data_type}}*>(&tmp);
+
+  #pragma unroll
+  for (int i=0; i < N_ELEMENTS_PER_THREAD; i++) {
+      p[i] = ({{data_type}}) ({{fill_value}});
+  }
+
+  output[idx] = tmp;
+}
+
+}  // namespace
+
+void invoke_{{func_name}}(
+    void* output,
+    {{prefix}}Stream_t stream
+){
+    int grid_size = static_cast<int>(std::ceil(static_cast<double>({{num_elements}}) / N_ELEMENTS_PER_THREAD / N_THREADS_PER_BLOCK));
+    full<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(reinterpret_cast<{{read_type}}*> (output), {{num_elements}});
+}
+    """
+)
+
+
+@registry.reg("rocm.full.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    y = func_attrs["outputs"][0]
+    backend_spec = ROCMSpec()
+
+    # fill the maximum output Tensor size with the fill_value
+    # any shape within the maximum bounds will be a subset
+    num_elements = 1
+    for dim in y.shape():
+        num_elements *= dim.upper_bound()
+
+    dtype = y.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+    read_type = backend_spec.get_elementwise_read_backend_type(num_elements, dtype)
+
+    return FUNC_TEMPLATE.render(
+        header_files=backend_spec.header_src_template.render(
+            extra_header=HIP_HEADER_FILES
+        ),
+        constant=CONSTANT_TEMPLATE.render(
+            read_t=read_type,
+            data_t=data_type,
+        ),
+        func_name=func_attrs["name"],
+        read_type=read_type,
+        data_type=data_type,
+        index_type=backend_spec.index_type,
+        fill_value=func_attrs["fill_value"],
+        num_elements=num_elements,
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("rocm.full.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = ROCMSpec()
+    return FUNC_DECL.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("rocm.full.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=func_attrs["outputs"][0]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/tensor/identity.py b/python/aitemplate/backend/rocm/tensor/identity.py
index 9bbab569a..90a2c29ea 100644
--- a/python/aitemplate/backend/rocm/tensor/identity.py
+++ b/python/aitemplate/backend/rocm/tensor/identity.py
@@ -16,11 +16,19 @@
 ROCM identity function
 """
 
+import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.backend_spec import ROCMSpec
 from aitemplate.backend.common.tensor import identity_common
 
 
+EXTRA_HEADERS = jinja2.Template(
+    """
+#include <hip/hip_runtime.h>
+    """
+)
+
 @registry.reg("rocm.identity.func_decl")
 def gen_function_decl(func_attrs):
     """Generate function declaration.
@@ -53,7 +61,7 @@ def gen_function(func_attrs):
     str
         Rendered function body.
     """
-    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=ROCMSpec())
+    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=ROCMSpec(), extra_headers=EXTRA_HEADERS.render())
 
 
 @registry.reg("rocm.identity.func_call")
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index 55d2e9e28..f57aab840 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -113,9 +113,6 @@ def _infer_shapes(self, x: Tensor, w: Tensor):
             y_shape = self._infer_shape(x_shape, w_shape)
             y_shapes.append(y_shape)
 
-        def unique(vector):
-            return sorted(set(vector))
-
         batch_info = x._attrs["shape"][0]
         output_shape = [
             batch_info,
@@ -150,11 +147,9 @@ def __call__(
         self._attrs["head_size"] = head_size_v
 
         self._attrs["inputs"] = [q, k, v]
-        if self._attrs["variable_seq_length_kv"]:
-            assert lengths_kv is not None
+        if lengths_kv:
             self._attrs["inputs"].append(lengths_kv)
-        if self._attrs["variable_seq_length_q"]:
-            assert lengths_q is not None
+        if lengths_q:
             self._attrs["inputs"].append(lengths_q)
         self._set_depth()
         self._extract_exec_path(q)
diff --git a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
index 752676166..da989ce62 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py
@@ -175,14 +175,13 @@ def __call__(self, a: Tensor, b: Tensor, b1: Tensor) -> Tensor:
         if self._attrs["layout"] == "Permute4DBMM_0213":
             b, m, o = output_shape
             d1 = self._attrs["shape"][0]
-            output_shape = [b.value() // d1, m, d1, o]
+            output_shape = [-1, m, d1, o]
             self._extract_epilogue_alignment(output_shape)
             return reshape()(output, output_shape)
         else:
             raise NotImplementedError(
                 "{} is not implemented!".format(self._attrs["layout"])
             )
-        return output
 
     def _get_op_attributes(self):
         return {
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index a1a7075b8..4ee2ca290 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -330,17 +330,17 @@ def __init__(
 
         self.op = ops.mem_eff_attention(causal=causal)
 
-        self.proj_q = Linear(
+        self.query = Linear(
             dim,
             dim,
             bias=qkv_bias,
         )
-        self.proj_k = Linear(
+        self.key = Linear(
             dim,
             dim,
             bias=qkv_bias,
         )
-        self.proj_v = Linear(
+        self.value = Linear(
             dim,
             dim,
             bias=qkv_bias,
@@ -350,13 +350,13 @@ def __init__(
         self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
         self.proj_drop = Dropout(proj_drop)
 
-    def attention(self, q, k, v):
+    def attention(self, q, k, v, seqlens=None):
         batch = q.shape()[0]
         head_dim = self.dim // self.num_heads
 
-        query = self.proj_q(q)
-        key = self.proj_k(k)
-        value = self.proj_v(v)
+        query = self.query(q)
+        key = self.key(k)
+        value = self.value(v)
 
         query = ops.permute()(
             ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
@@ -368,14 +368,26 @@ def attention(self, q, k, v):
             ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
             [0, 2, 1, 3],
         )
-        return self.op(query, key, value)
+        return self.op(query, key, value, seqlens)
 
-    def forward(self, *args):
+    def forward(self, *args, seqlens=None):
         """forward pass for calling mha module"""
         assert len(args) >= 3
         x = args[0]
         batch = x.shape()[0]
-        attn_output = self.attention(args[0], args[1], args[2])
+        if detect_target().name() == "cuda":
+            attn_output = self.attention(args[0], args[1], args[2])
+        else:
+            if seqlens:
+                attn_output = self.attention(args[0], args[1], args[2], seqlens)
+            else:
+                OP = ops.bmm_softmax_bmm_permute(
+                    shape=(self.num_heads,),
+                    scale=(self.dim // self.num_heads)**-0.5,
+                    causal=self.causal,
+                )
+                attn_output = OP(*args)
+                
         attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
 
         if self.has_residual:
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 2a746b7e7..d37a16dfa 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -187,7 +187,7 @@ inline DeviceError StreamDestroy(StreamType stream) {
 }
 
 inline DeviceError StreamWaitEvent(StreamType stream, EventType event) {
-  return hipStreamWaitEvent(stream, event);
+  return hipStreamWaitEvent(stream, event, 0);
 }
 
 inline DeviceError GraphInstantiate(

From 5c926619017dcbaacdeef370e07f42f22e878d88 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Mon, 15 May 2023 08:58:50 -0700
Subject: [PATCH 513/638] Fix-forward - multiscaleblock test failure (#694)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/694

Fix broken `test_ait_msa` test: T153240274

**Issue**: `test_ait_msa` uses `Identity` as `norm_layer`:

https://www.internalfb.com/code/fbsource/[85fd2bff0b6f]/fbcode/aitemplate/AITemplate/fx2ait/fx2ait/fb/test/converters/test_ait_msa.py?lines=51-52

which doesn't support `permute_input_output` attribute.

**Solution**: Only set `permute_input_output=True` for `BatchNorm1d` `norm_layer`.

Reviewed By: wushirong

Differential Revision: D45843110

fbshipit-source-id: 35ab0e66ca3bcc4fd778269f89ca3fec32c57d94
---
 python/aitemplate/frontend/nn/multiscale_attention.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 7b431e4f7..848cb1637 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -671,8 +671,9 @@ def __init__(
         super().__init__()
         self.dim = dim
         self.dim_out = dim_out
-        self.norm1 = norm_layer(dim, permute_input_output=True)
+        self.norm1 = norm_layer(dim)
         self.norm1_is_batchnorm_1d = isinstance(self.norm1, BatchNorm1d)
+        self.norm1.permute_input_output = True if self.norm1_is_batchnorm_1d else False
         kernel_skip = [s + 1 if s > 1 else s for s in stride_q]
         stride_skip = stride_q
         padding_skip = [int(skip // 2) for skip in kernel_skip]
@@ -696,8 +697,9 @@ def __init__(
             max_seq_len=seq_len,
         )
         self.drop_path = DropPath(droppath_rate) if droppath_rate > 0.0 else Identity()
-        self.norm2 = norm_layer(dim, permute_input_output=True)
+        self.norm2 = norm_layer(dim)
         self.norm2_is_batchnorm_1d = isinstance(self.norm2, BatchNorm1d)
+        self.norm2.permute_input_output = True if self.norm2_is_batchnorm_1d else False
         mlp_hidden_dim = int(dim * mlp_ratio)
         self.has_cls_embed = has_cls_embed
         self.mlp = Mlp(

From 4332686ff6eaf2e16fe9b43722b98d32c17a18f7 Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Mon, 15 May 2023 10:20:15 -0700
Subject: [PATCH 514/638] Fix codegen duplicate dim decl issue (#692)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/692

Reviewed By: muchulee8, chenyang78

Differential Revision: D45827572

fbshipit-source-id: f67ae40dc5e951966bc930fd5823837edee65cd0
---
 .../test/converters/test_ait_reshape.py       | 24 +++++++++++++++++++
 python/aitemplate/backend/codegen.py          |  3 +++
 2 files changed, 27 insertions(+)

diff --git a/fx2ait/fx2ait/test/converters/test_ait_reshape.py b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
index c5fd4e686..28cf42287 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_reshape.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_reshape.py
@@ -176,3 +176,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             inputs_spec,
             expected_ops={acc_ops.reshape, acc_ops.size, acc_ops.getitem},
         )
+
+    def test_fx2ait_lower_shapes_duped(self):
+        class TestMod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                concat = torch.concat((x, x, x))
+                reshape_0 = concat.reshape((-1, 3))
+                a = x.size(0) * 3
+                reshape = concat.reshape(-1, a)
+                reshape_0 = reshape_0.reshape(-1, a)
+                return reshape + reshape_0
+
+        inputs_spec = TensorSpec.from_input_list_with_batch_size(
+            inputs=[torch.randn(3, 4).half()], max_batch_size=8
+        )
+        model = TestMod().cuda().half()
+
+        self.run_test_with_dynamic_shape(
+            model,
+            inputs_spec,
+            expected_ops={},
+        )
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index bf3f5fe3a..d4e6c521f 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -805,6 +805,9 @@ def append_tensor(self, node: Tensor) -> None:
         name = node._attrs["name"]
         dtype = node._attrs["dtype"]
         if isinstance(node, IntVarTensor):
+            # Check to prevent duplicate declaration in case IntVarTensor is already declared from dims for another tensor
+            if node._attrs["name"] in self.visited_dims:
+                return
             int_var = node._attrs["int_var"]
             if isinstance(int_var, IntImm):
                 self.tensor_decl.append(

From 5847ce1a3ccf5b59da145be7671ed59dc06b8031 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Mon, 15 May 2023 11:16:24 -0700
Subject: [PATCH 515/638] Add fast_math to LowerSettings and AIT Target (#695)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/695

ATT, so that it can be controlled when creating AIT Target.

Reviewed By: frank-wei

Differential Revision: D45856348

fbshipit-source-id: 59ed43fc15b7354d7d87b94eff22b964cf543d59
---
 fx2ait/fx2ait/fx2ait.py                      | 7 ++++++-
 fx2ait/fx2ait/lower/lower_settings.py        | 1 +
 python/aitemplate/backend/cuda/target_def.py | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index e70a6c3f8..ccf9e1c94 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -67,6 +67,7 @@ def __init__(
         remote_cache_file_path: Optional[str] = None,
         save_remote_cache: Optional[bool] = False,
         do_optimize_graph: bool = True,
+        use_fast_math: bool = True,
     ):
         """
         Args:
@@ -84,6 +85,7 @@ def __init__(
             load_ait_dir: location for existing ait files
             remote_cache_file_path: AITemplate profiling cache location
             save_remote_cache: whether to save the updated cache
+            use_fast_math: whether to use fast math in CUDA kernels
         """
         super().__init__(module)
 
@@ -106,6 +108,7 @@ def __init__(
             os.environ["CACHE_DIR"] = self.cache_dir
             _LOGGER.info(f"Set CACHE_DIR to {self.cache_dir}")
         self.use_fp16_acc = use_fp16_acc
+        self.use_fast_math = use_fast_math
         self.hardware_target = self._create_target()
         self.input_specs = input_specs
         self.input_specs_iter = 0
@@ -128,7 +131,9 @@ def __init__(
     def _create_target(self):
         """Detect GPU target"""
         return detect_target(
-            use_fp16_acc=self.use_fp16_acc, remote_cache_bytes=self.remote_cache_bytes
+            use_fp16_acc=self.use_fp16_acc,
+            remote_cache_bytes=self.remote_cache_bytes,
+            use_fast_math=self.use_fast_math,
         )
 
     def _load_profile_cache(self) -> bytes:
diff --git a/fx2ait/fx2ait/lower/lower_settings.py b/fx2ait/fx2ait/lower/lower_settings.py
index 80d5d55d8..e9bcfaea4 100644
--- a/fx2ait/fx2ait/lower/lower_settings.py
+++ b/fx2ait/fx2ait/lower/lower_settings.py
@@ -69,6 +69,7 @@ class LowerSettings:
     # If None, infer the dtypes from the sample inputs.
     precision: Optional[LowerPrecision] = LowerPrecision.FP16
     use_fp16_acc: bool = True  # only valid for precision == FP16
+    use_fast_math: bool = True  # Whether to use fast math in CUDA kernels
     allow_int_inputs: bool = False  # If AIT acc subgraph accept integer inputs
     ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None
     leaf_module_list: Optional[Set[Type[nn.Module]]] = None
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index d98b17fdf..6cdf39e0a 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -147,7 +147,10 @@ def _build_nvcc_compiler_options(self) -> List[str]:
         ]
         if self._ndebug == 1:
             options.append("-DNDEBUG")
-        if environ.use_fast_math():
+        if environ.use_fast_math() and (
+            "use_fast_math" not in Target.current()._kwargs
+            or Target.current()._kwargs["use_fast_math"]
+        ):
             options.append("--use_fast_math")
         return options
 

From e6a8dfca983411b56850012fe0a6a315efff5f97 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 16 May 2023 10:24:41 +0800
Subject: [PATCH 516/638] fix bugs

---
 python/aitemplate/backend/rocm/__init__.py     | 1 +
 python/aitemplate/backend/rocm/lib_template.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/python/aitemplate/backend/rocm/__init__.py b/python/aitemplate/backend/rocm/__init__.py
index fd177bb28..a687a1ee7 100644
--- a/python/aitemplate/backend/rocm/__init__.py
+++ b/python/aitemplate/backend/rocm/__init__.py
@@ -17,6 +17,7 @@
 Rocm backend init.
 """
 from aitemplate.backend.rocm import lib_template, target_def, utils
+from aitemplate.backend.rocm.attention import *
 from aitemplate.backend.rocm.common import *
 from aitemplate.backend.rocm.conv2d import *
 from aitemplate.backend.rocm.embedding import *
diff --git a/python/aitemplate/backend/rocm/lib_template.py b/python/aitemplate/backend/rocm/lib_template.py
index 4e01c6bef..89fa95c89 100644
--- a/python/aitemplate/backend/rocm/lib_template.py
+++ b/python/aitemplate/backend/rocm/lib_template.py
@@ -41,6 +41,8 @@ def void_ptr_decl(name, dtype="float16", indent="  "):
         type_string = "int64_t*"
     elif dtype == "bool":
         type_string = "bool*"
+    elif dtype == "int32":
+        type_string = "int*"
     else:
         raise NotImplementedError
     return PTR_TEMPLATE.render(name=name, dtype=type_string, indent=indent)

From 5968179d3631c7fa326da6261f890b40ae27d747 Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Tue, 16 May 2023 00:08:45 -0700
Subject: [PATCH 517/638] Fix use_fast_math (#697)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/697

Fix existing OSS failures caused by the previous diff;
Add a compilation option in AIT to disable fast_math when calculating TANH / SIGMOID;

Reviewed By: frank-wei, alexanderguzhva

Differential Revision: D45876348

fbshipit-source-id: e7455d7ecbe26974526600768942b0d61b95600c
---
 .../backend/cuda/elementwise/custom_math.cuh  | 46 ++++++++++++++++++-
 python/aitemplate/backend/cuda/target_def.py  | 38 +++++++++------
 tests/unittest/ops/test_fused_elementwise.py  | 32 ++++++++++---
 3 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
index f88a84427..acfaa2018 100644
--- a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
+++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -88,6 +88,7 @@ __device__ bfloat16_2 h2sign_custom(const bfloat16_2 a) {
 }
 
 __device__ half2 fast_tanh(half2 x) {
+#if defined(AIT_USE_FAST_MATH)
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
     (__CUDA_ARCH__ >= 750)
 
@@ -100,9 +101,13 @@ __device__ half2 fast_tanh(half2 x) {
   return half2(
       {cutlass::fast_tanh(float(x.x)), cutlass::fast_tanh(float(x.y))});
 #endif
+#else
+  return half2({tanhf(float(x.x)), tanhf(float(x.y))});
+#endif
 }
 
 __device__ half fast_tanh(half x) {
+#if defined(AIT_USE_FAST_MATH)
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
     (__CUDA_ARCH__ >= 750)
 
@@ -112,11 +117,14 @@ __device__ half fast_tanh(half x) {
 #else
   return half(cutlass::fast_tanh(float(x)));
 #endif
+#else
+  return half(tanhf(float(x)));
+#endif
 }
 
 __device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
-    (__CUDA_ARCH__ >= 900)
+    (__CUDA_ARCH__ >= 900) && defined(AIT_USE_FAST_MATH)
 
   asm volatile("tanh.approx.bf16x2 %0, %1;"
                : "=r"(__TO_UI(x))
@@ -124,8 +132,12 @@ __device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
   return x;
 
 #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+#if defined(AIT_USE_FAST_MATH)
   return bfloat16_2(
       {cutlass::fast_tanh(float(x.x)), cutlass::fast_tanh(float(x.y))});
+#else
+  return bfloat16_2({tanhf(float(x.x)), tanhf(float(x.y))});
+#endif
 #else
   NOT_IMPLEMENTED();
 #endif
@@ -133,38 +145,61 @@ __device__ bfloat16_2 fast_tanh(bfloat16_2 x) {
 
 __device__ bfloat16 fast_tanh(bfloat16 x) {
 #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
-    (__CUDA_ARCH__ >= 900)
+    (__CUDA_ARCH__ >= 900) && defined(AIT_USE_FAST_MATH)
   asm volatile("tanh.approx.bf16 %0, %1;" : "=h"(__TO_US(x)) : "h"(__TO_US(x)));
   return x;
 
 #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+#if defined(AIT_USE_FAST_MATH)
   return cutlass::fast_tanh(float(x));
+#else
+  return bfloat16(tanhf(float(x)));
+#endif
 #else
   NOT_IMPLEMENTED();
 #endif
 }
 
 __device__ float fsigmoid_custom(const float a) {
+#if defined(AIT_USE_FAST_MATH)
   return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;
+#else
+  return 1.0f / (1.0f + expf(-a));
+#endif
 }
 
 __device__ half hsigmoid_custom(const half a) {
+#if defined(AIT_USE_FAST_MATH)
   return __hmul(
       (__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
       CUDA_FP16_ONE_HALF);
+#else
+  return half(1.0f / (1.0f + expf(float(-a))));
+#endif
 }
 
 __device__ half2 h2sigmoid_custom(const half2 a) {
+#if defined(AIT_USE_FAST_MATH)
   const auto halfX2 = half2(CUDA_FP16_ONE_HALF, CUDA_FP16_ONE_HALF);
   const auto oneX2 = half2(CUDA_FP16_ONE, CUDA_FP16_ONE);
   return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
+#else
+  return half2(
+      1.0f / (1.0f + expf(float(-a.x))), 1.0f / (1.0f + expf(float(-a.y))));
+#endif
 }
 
 __device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+#if defined(AIT_USE_FAST_MATH)
   return __hmul(
       (__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
       CUDA_BF16_ONE_HALF);
+#else
+  return bfloat16(1.0f / (1.0f + expf(float(-a))));
+#endif
+
 #else
   NOT_IMPLEMENTED();
 #endif
@@ -172,9 +207,16 @@ __device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
 
 __device__ bfloat16_2 h2sigmoid_custom(const bfloat16_2 a) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+#if defined(AIT_USE_FAST_MATH)
   const auto halfX2 = bfloat16_2(CUDA_BF16_ONE_HALF, CUDA_BF16_ONE_HALF);
   const auto oneX2 = bfloat16_2(CUDA_BF16_ONE, CUDA_BF16_ONE);
   return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
+#else
+  return bfloat16_2(
+      1.0f / (1.0f + expf(float(-a.x))), 1.0f / (1.0f + expf(float(-a.y))));
+#endif
+
 #else
   NOT_IMPLEMENTED();
 #endif
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 6cdf39e0a..471d8d915 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -138,7 +138,6 @@ def get_host_compiler_options(self) -> List[str]:
     def _build_nvcc_compiler_options(self) -> List[str]:
         options = [
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
-            "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
             "-w",
             f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
             environ.get_compiler_opt_level(),
@@ -148,10 +147,15 @@ def _build_nvcc_compiler_options(self) -> List[str]:
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         if environ.use_fast_math() and (
-            "use_fast_math" not in Target.current()._kwargs
-            or Target.current()._kwargs["use_fast_math"]
+            "use_fast_math" not in self._kwargs or self._kwargs["use_fast_math"]
         ):
-            options.append("--use_fast_math")
+            options.extend(
+                [
+                    "--use_fast_math",
+                    "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
+                    "-DAIT_USE_FAST_MATH=1",
+                ]
+            )
         return options
 
     def get_device_compiler_options(self) -> List[str]:
@@ -234,7 +238,7 @@ class FBCUDA(CUDA):
 
     nvcc_option_json = None
     cutlass_path_ = None
-    compile_options_ = None
+    static_compile_options_ = None
 
     def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         from libfb.py import parutil
@@ -316,7 +320,7 @@ def get_device_compiler_options(self) -> List[str]:
         raise NotImplementedError
 
     def _build_compile_options(self):
-        if not FBCUDA.compile_options_:
+        if not FBCUDA.static_compile_options_:
             include_paths = self._build_include_directories()
             fb_include_path = os.path.join(self._include_path, "fb_include")
             pp_args = self.nvcc_options_json["pp_args"]
@@ -342,7 +346,6 @@ def _build_compile_options(self):
                     "-Xcompiler -fPIC",
                     "-Xcompiler -fvisibility=hidden",
                     "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
-                    "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
                     "-w",
                     "--expt-relaxed-constexpr",
                     f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
@@ -353,12 +356,21 @@ def _build_compile_options(self):
             )
             if self._ndebug == 1:
                 options.append("-DNDEBUG")
-            if environ.use_fast_math():
-                options.append("--use_fast_math")
-            FBCUDA.compile_options_ = " ".join(options)
-        compile_options = FBCUDA.compile_options_
-        _LOGGER.info(f"The compile options are: {compile_options}")
-        return compile_options
+            FBCUDA.static_compile_options_ = options
+        compile_options = list(FBCUDA.static_compile_options_)
+        if environ.use_fast_math() and (
+            "use_fast_math" not in self._kwargs or self._kwargs["use_fast_math"]
+        ):
+            compile_options.extend(
+                [
+                    "--use_fast_math",
+                    "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
+                    "-DAIT_USE_FAST_MATH=1",
+                ]
+            )
+        compile_options_str = " ".join(compile_options)
+        _LOGGER.info(f"The compile options are: {compile_options_str}")
+        return compile_options_str
 
     def __exit__(self, ptype, value, trace):
         super().__exit__(ptype, value, trace)
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index 94b628a0b..d7890fc89 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -263,7 +263,7 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
     def test_fused_elementwise_kernel1(self, ait_dtype):
         self._test_fused_elementwise_kernel1(ait_dtype)
 
-    def _test_sigmoid(self, input_size, test_name, ait_dtype):
+    def _test_sigmoid(self, input_size, test_name, ait_dtype, use_fast_math=True):
         torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[IntImm(input_size[0]), IntImm(input_size[1])],
@@ -275,7 +275,7 @@ def _test_sigmoid(self, input_size, test_name, ait_dtype):
         X2._attrs["is_output"] = True
         X2._attrs["name"] = "output0"
 
-        target = detect_target()
+        target = detect_target(use_fast_math=use_fast_math)
         module = compile_model(X2, target, "./tmp", test_name)
 
         x1_pt = (
@@ -285,7 +285,10 @@ def _test_sigmoid(self, input_size, test_name, ait_dtype):
 
         x2 = torch.empty_like(x2_pt)
         module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        if use_fast_math:
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.equal(x2, x2_pt), f"{x2=}\n{x2_pt=}")
         # sanity checks
         self.assertEqual(torch.sum(x2 < 0), 0)
         self.assertEqual(torch.sum(x2 > 1), 0)
@@ -303,8 +306,15 @@ def test_sigmoid(self, ait_dtype):
         self._test_sigmoid([1024, 2 * 1496], f"sigmoid_1_{ait_dtype}", ait_dtype)
         self._test_sigmoid([1024, 23744], f"sigmoid_2_{ait_dtype}", ait_dtype)
         self._test_sigmoid([1024, 70144], f"sigmoid_3_{ait_dtype}", ait_dtype)
+        # use_fast_math = False
+        self._test_sigmoid(
+            [1024, 70144],
+            f"sigmoid_no_fast_math_{ait_dtype}",
+            ait_dtype,
+            use_fast_math=False,
+        )
 
-    def _test_tanh(self, input_size, test_name, ait_dtype):
+    def _test_tanh(self, input_size, test_name, ait_dtype, use_fast_math=True):
         assert len(input_size) == 2
         torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
@@ -317,7 +327,7 @@ def _test_tanh(self, input_size, test_name, ait_dtype):
         X2._attrs["is_output"] = True
         X2._attrs["name"] = "output0"
 
-        target = detect_target()
+        target = detect_target(use_fast_math=use_fast_math)
         module = compile_model(X2, target, "./tmp", test_name)
 
         x1_pt = torch.randn(input_size).cuda().to(dtype=torch_dtype)
@@ -325,7 +335,10 @@ def _test_tanh(self, input_size, test_name, ait_dtype):
 
         x2 = torch.empty(input_size).cuda().to(dtype=torch_dtype)
         module.run_with_tensors([x1_pt], [x2])
-        self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        if use_fast_math:
+            self.assertTrue(torch.allclose(x2, x2_pt, atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.equal(x2, x2_pt))
 
     @parameterized.expand(
         **filter_test_cases_by_params(
@@ -341,6 +354,13 @@ def test_tanh(self, ait_dtype):
         self._test_tanh([1024, 22400], f"tanh_1_{ait_dtype}", ait_dtype)
         self._test_tanh([1024, 70144], f"tanh_2_{ait_dtype}", ait_dtype)
         self._test_tanh([1024, 23744], f"tanh_3_{ait_dtype}", ait_dtype)
+        # use_fast_math = False
+        self._test_tanh(
+            [1024, 23744],
+            f"tanh_no_fast_math_{ait_dtype}",
+            ait_dtype,
+            use_fast_math=False,
+        )
 
     def _test_gelu(self, input_size, test_name, ait_dtype, fast_gelu=False):
         assert len(input_size) == 2

From bc49d20d8b74b9335a95e96437345daf0441d81f Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 16 May 2023 09:19:55 -0700
Subject: [PATCH 518/638] Add layernorm CUDA kernel based on Welford's
 algorithm (#698)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/698

Welford's online [algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm) for computing the variance is single-pass and sufficiently numerically stable. This diff adds an alternative CUDA back-end kernel based on Welford's algorithm for the `layernorm` op (together with its `layernorm_sigmoid_mul` fusion and `TensorAccessor` support). The kernel is based on the OneFlow's `layer_norm` kernel implementation (more details [here](https://oneflow2020.medium.com/how-to-implement-an-efficient-layernorm-cuda-kernel-oneflow-performance-optimization-731e91a285b8)).

Reviewed By: ipiszy

Differential Revision: D45867506

fbshipit-source-id: 6b15a1044e7dbdf0f4d62a8826fe43944e0e8fc8
---
 .../cuda/layernorm_sigmoid_mul/layer_norm.cuh | 1246 +++++++++++++++++
 .../layernorm_sigmoid_mul.py                  |   41 +-
 .../layernorm_welford.cuh                     |  284 ++++
 .../compiler/test_strided_layernorm.py        |   47 +-
 tests/unittest/ops/test_layernorm.py          |   91 +-
 .../ops/test_layernorm_sigmoid_mul.py         |  105 +-
 6 files changed, 1714 insertions(+), 100 deletions(-)
 create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
 create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh

diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
new file mode 100644
index 000000000..e28fdd831
--- /dev/null
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
@@ -0,0 +1,1246 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+// Original OneFlow copyright notice:
+
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+namespace layer_norm {
+
+constexpr int kWarpSize = 32;
+
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return max(a, b);
+  }
+};
+
+template <
+    template <typename>
+    class ReductionOp,
+    typename T,
+    int thread_group_width = kWarpSize>
+__inline__ __device__ T WarpAllReduce(T val) {
+  for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    val = ReductionOp<T>()(
+        val, __shfl_xor_sync(0xffffffff, val, mask, thread_group_width));
+  }
+  return val;
+}
+
+template <template <typename> class ReductionOp, typename T, int block_size>
+__inline__ __device__ T BlockAllReduce(T val) {
+  typedef cub::BlockReduce<T, block_size> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T result_broadcast;
+  T result = BlockReduce(temp_storage).Reduce(val, ReductionOp<T>());
+  if (threadIdx.x == 0) {
+    result_broadcast = result;
+  }
+  __syncthreads();
+  return result_broadcast;
+}
+
+template <typename T>
+__inline__ __device__ T Div(T a, T b);
+
+template <>
+__inline__ __device__ float Div<float>(float a, float b) {
+#ifdef OF_LAYER_NORM_USE_FAST_MATH
+  return __fdividef(a, b);
+#else
+  return a / b;
+#endif
+}
+
+template <>
+__inline__ __device__ double Div<double>(double a, double b) {
+  return a / b;
+}
+
+template <typename T>
+__inline__ __device__ T Rsqrt(T x);
+
+template <>
+__inline__ __device__ float Rsqrt<float>(float x) {
+#ifdef OF_LAYER_NORM_USE_FAST_MATH
+  return __frsqrt_rn(x);
+#else
+  return rsqrt(x);
+#endif
+}
+
+template <>
+__inline__ __device__ double Rsqrt<double>(double x) {
+  return rsqrt(x);
+}
+
+template <class Func>
+inline cudaError_t GetNumBlocks(
+    Func func,
+    int64_t block_size,
+    size_t dynamic_smem_size,
+    int64_t max_blocks,
+    int64_t waves,
+    int* num_blocks) {
+  int dev;
+  {
+    cudaError_t err = cudaGetDevice(&dev);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  int sm_count;
+  {
+    cudaError_t err =
+        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  int max_active_blocks;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks, func, block_size, dynamic_smem_size);
+  }
+  *num_blocks = std::max<int>(
+      1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
+  return cudaSuccess;
+}
+
+template <typename T>
+class HasCanPackAs {
+  typedef char one;
+  struct two {
+    char x[2];
+  };
+
+  template <typename C>
+  static one test(decltype(&C::CanPackAs));
+  template <typename C>
+  static two test(...);
+
+ public:
+  enum { value = sizeof(test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+typename std::enable_if<HasCanPackAs<T>::value == true, bool>::type CanPackAs(
+    T t,
+    size_t pack_size) {
+  return t.CanPackAs(pack_size);
+}
+
+template <typename T>
+typename std::enable_if<HasCanPackAs<T>::value == false, bool>::type CanPackAs(
+    T t,
+    size_t pack_size) {
+  return true;
+}
+
+template <typename T, int N>
+struct GetPackType {
+  using type =
+      typename std::aligned_storage<N * sizeof(T), N * sizeof(T)>::type;
+};
+
+template <typename T, int N>
+using PackType = typename GetPackType<T, N>::type;
+
+template <typename T, int N>
+union Pack {
+  static_assert(sizeof(PackType<T, N>) == sizeof(T) * N, "");
+  __device__ Pack() {
+    // do nothing
+  }
+  PackType<T, N> storage;
+  T elem[N];
+};
+
+template <typename SRC, typename DST>
+struct DirectLoad {
+  DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {}
+  template <int N>
+  __device__ void load(DST* dst, int64_t row, int64_t col) const {
+    Pack<SRC, N> pack;
+    const int64_t offset = (row * row_size + col) / N;
+    pack.storage = *(reinterpret_cast<const PackType<SRC, N>*>(src) + offset);
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      dst[i] = static_cast<DST>(pack.elem[i]);
+    }
+  }
+  const SRC* src;
+  int64_t row_size;
+};
+
+template <typename SRC, typename DST>
+struct DirectStore {
+  DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {}
+  template <int N>
+  __device__ void store(const SRC* src, int64_t row, int64_t col) {
+    Pack<DST, N> pack;
+    const int64_t offset = (row * row_size + col) / N;
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      pack.elem[i] = static_cast<DST>(src[i]);
+    }
+    *(reinterpret_cast<PackType<DST, N>*>(dst) + offset) = pack.storage;
+  }
+  DST* dst;
+  int64_t row_size;
+};
+
+template <typename T>
+inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) {
+  // Use Welford Online algorithm to compute mean and variance
+  // For more details you can refer to:
+  // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+  *count += 1;
+  T delta1 = val - *mean;
+  *mean += Div(delta1, *count);
+  T delta2 = val - *mean;
+  *m2 += delta1 * delta2;
+}
+
+template <typename T>
+inline __device__ void WelfordCombine(
+    T b_mean,
+    T b_m2,
+    T b_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  if (b_count == 0) {
+    return;
+  }
+  T new_count = *count + b_count;
+  T nb_over_n = Div(b_count, new_count);
+  T delta = b_mean - *mean;
+  *mean += delta * nb_over_n;
+  *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
+  *count = new_count;
+}
+
+template <typename T, int thread_group_width = kWarpSize>
+__inline__ __device__ void WelfordWarpReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  *mean = thread_mean;
+  *m2 = thread_m2;
+  *count = thread_count;
+  for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
+    T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
+    T b_count = __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+    WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
+  }
+}
+
+template <typename T, int thread_group_width = kWarpSize>
+__inline__ __device__ void WelfordWarpAllReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* mean,
+    T* m2,
+    T* count) {
+  WelfordWarpReduce<T, thread_group_width>(
+      thread_mean, thread_m2, thread_count, mean, m2, count);
+  *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width);
+  *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width);
+  *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width);
+}
+
+template <typename T>
+__inline__ __device__ void WelfordBlockAllReduce(
+    T thread_mean,
+    T thread_m2,
+    T thread_count,
+    T* result_mean,
+    T* result_m2,
+    T* result_count) {
+  __shared__ T mean_shared[kWarpSize];
+  __shared__ T m2_shared[kWarpSize];
+  __shared__ T count_shared[kWarpSize];
+  __shared__ T mean_result_broadcast;
+  __shared__ T m2_result_broadcast;
+  __shared__ T count_result_broadcast;
+  const int lid = threadIdx.x % kWarpSize;
+  const int wid = threadIdx.x / kWarpSize;
+  T warp_mean = 0;
+  T warp_m2 = 0;
+  T warp_count = 0;
+  WelfordWarpReduce(
+      thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
+  __syncthreads();
+  if (lid == 0) {
+    mean_shared[wid] = warp_mean;
+    m2_shared[wid] = warp_m2;
+    count_shared[wid] = warp_count;
+  }
+  __syncthreads();
+  if (wid == 0) {
+    if (threadIdx.x < blockDim.x / kWarpSize) {
+      warp_mean = mean_shared[lid];
+      warp_m2 = m2_shared[lid];
+      warp_count = count_shared[lid];
+    } else {
+      warp_mean = static_cast<T>(0);
+      warp_m2 = static_cast<T>(0);
+      warp_count = static_cast<T>(0);
+    }
+    __syncwarp();
+    T block_mean = 0;
+    T block_m2 = 0;
+    T block_count = 0;
+    WelfordWarpReduce(
+        warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
+    if (lid == 0) {
+      mean_result_broadcast = block_mean;
+      m2_result_broadcast = block_m2;
+      count_result_broadcast = block_count;
+    }
+  }
+  __syncthreads();
+  *result_mean = mean_result_broadcast;
+  *result_m2 = m2_result_broadcast;
+  *result_count = count_result_broadcast;
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access,
+    bool padding>
+__global__ void LayerNormWarpImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  static_assert(max_cols_per_thread % pack_size == 0, "");
+  static_assert(min_cols_per_thread % pack_size == 0, "");
+  static_assert(thread_group_width <= kWarpSize, "");
+  static_assert(kWarpSize % thread_group_width == 0, "");
+  constexpr int max_num_packs = max_cols_per_thread / pack_size;
+  constexpr int min_num_packs = min_cols_per_thread / pack_size;
+  assert(cols <= max_cols_per_thread * thread_group_width);
+  ComputeType buf[rows_per_access][max_cols_per_thread];
+  const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t num_global_thread_group = gridDim.x * blockDim.y;
+  const int64_t lane_id = threadIdx.x;
+  const int64_t step = num_global_thread_group * rows_per_access;
+  for (int64_t row = global_thread_group_id * rows_per_access; row < rows;
+       row += step) {
+    ComputeType thread_mean[rows_per_access];
+    ComputeType thread_m2[rows_per_access];
+    ComputeType thread_count[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      thread_mean[row_id] = 0;
+      thread_m2[row_id] = 0;
+      thread_count[row_id] = 0;
+      ComputeType* row_buf = buf[row_id];
+#pragma unroll
+      for (int pack_id = 0; pack_id < min_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        load.template load<pack_size>(row_buf + pack_offset, row + row_id, col);
+#pragma unroll
+        for (int i = 0; i < pack_size; ++i) {
+          WelfordCombine(
+              row_buf[pack_offset + i],
+              thread_mean + row_id,
+              thread_m2 + row_id,
+              thread_count + row_id);
+        }
+      }
+      for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) {
+        const int col = (pack_id * thread_group_width + lane_id) * pack_size;
+        const int pack_offset = pack_id * pack_size;
+        if (!padding || col < cols) {
+          load.template load<pack_size>(
+              row_buf + pack_offset, row + row_id, col);
+#pragma unroll
+          for (int i = 0; i < pack_size; ++i) {
+            WelfordCombine(
+                row_buf[pack_offset + i],
+                thread_mean + row_id,
+                thread_m2 + row_id,
+                thread_count + row_id);
+          }
+        } else {
+#pragma unroll
+          for (int i = 0; i < pack_size; ++i) {
+            row_buf[pack_offset + i] = 0;
+          }
+        }
+      }
+    }
+    ComputeType warp_mean[rows_per_access];
+    ComputeType warp_m2[rows_per_access];
+    ComputeType warp_count[rows_per_access];
+#pragma unroll
+    for (int row_id = 0; row_id < rows_per_access; ++row_id) {
+      int global_row_id = row + row_id;
+      ComputeType* row_buf = buf[row_id];
+      WelfordWarpAllReduce<ComputeType, thread_group_width>(
+          thread_mean[row_id],
+          thread_m2[row_id],
+          thread_count[row_id],
+          warp_mean + row_id,
+          warp_m2 + row_id,
+          warp_count + row_id);
+      ComputeType row_mean = warp_mean[row_id];
+      ComputeType row_variance =
+          max(Div(warp_m2[row_id], warp_count[row_id]),
+              static_cast<ComputeType>(0.0));
+      ComputeType row_inv_var =
+          Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+      if (mean && inv_variance && lane_id == 0) {
+        mean[global_row_id] = row_mean;
+        inv_variance[global_row_id] = row_inv_var;
+      }
+#pragma unroll
+      for (int i = 0; i < max_cols_per_thread; ++i) {
+        row_buf[i] = (row_buf[i] - row_mean) * row_inv_var;
+      }
+#pragma unroll
+      for (int i = 0; i < min_num_packs; ++i) {
+        const int col = (i * thread_group_width + lane_id) * pack_size;
+        store.template store<pack_size>(
+            row_buf + i * pack_size, global_row_id, col);
+      }
+#pragma unroll
+      for (int i = min_num_packs; i < max_num_packs; ++i) {
+        const int col = (i * thread_group_width + lane_id) * pack_size;
+        if (!padding || col < cols) {
+          store.template store<pack_size>(
+              row_buf + i * pack_size, global_row_id, col);
+        }
+      }
+    }
+  }
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access,
+    bool padding>
+inline cudaError_t LaunchLayerNormWarpImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int block_size = 128;
+  constexpr int waves = 32;
+  static_assert(block_size % thread_group_width == 0, "");
+  constexpr int thread_groups_per_block = block_size / thread_group_width;
+  dim3 block_dim(thread_group_width, thread_groups_per_block);
+  const int64_t num_blocks =
+      (rows / rows_per_access + thread_groups_per_block - 1) /
+      thread_groups_per_block;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormWarpImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            max_cols_per_thread,
+            min_cols_per_thread,
+            thread_group_width,
+            rows_per_access,
+            padding>,
+        block_size,
+        0,
+        num_blocks,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormWarpImpl<
+      LOAD,
+      STORE,
+      ComputeType,
+      pack_size,
+      max_cols_per_thread,
+      min_cols_per_thread,
+      thread_group_width,
+      rows_per_access,
+      padding><<<grid_dim_x, block_dim, 0, stream>>>(
+      load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int max_cols_per_thread,
+    int min_cols_per_thread,
+    int thread_group_width,
+    int rows_per_access>
+inline cudaError_t DispatchLayerNormWarpImplPadding(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols == max_cols_per_thread * thread_group_width) {
+    // when not padding, min_cols_per_thread must equals to max_cols_per_thread,
+    // pass max_cols_per_thread as min_cols_per_thread and max_cols_per_thread
+    // param.
+    return LaunchLayerNormWarpImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        max_cols_per_thread,
+        thread_group_width,
+        rows_per_access,
+        false>(stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  } else {
+    return LaunchLayerNormWarpImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        max_cols_per_thread,
+        min_cols_per_thread,
+        thread_group_width,
+        rows_per_access,
+        true>(stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+typename std::enable_if<pack_size == 1, cudaError_t>::type
+DispatchLayerNormWarpImplCols(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols <= 0) {
+    return cudaErrorInvalidValue;
+  }
+#define DEFINE_ONE_ELIF(thread_group_width)                                 \
+  else if (cols <= (thread_group_width)*pack_size) {                        \
+    if (rows % 2 == 0) {                                                    \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          2>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    } else {                                                                \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    }                                                                       \
+  }
+  DEFINE_ONE_ELIF(4)
+  DEFINE_ONE_ELIF(8)
+  DEFINE_ONE_ELIF(16)
+  DEFINE_ONE_ELIF(32)
+#undef DEFINE_ONE_ELIF
+#define DEFINE_ONE_ELIF(max_col, min_col)                                 \
+  else if (cols <= (max_col)*kWarpSize) {                                 \
+    return DispatchLayerNormWarpImplPadding<                              \
+        LOAD,                                                             \
+        STORE,                                                            \
+        ComputeType,                                                      \
+        pack_size,                                                        \
+        max_col,                                                          \
+        min_col,                                                          \
+        kWarpSize,                                                        \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+  }
+  DEFINE_ONE_ELIF(2, 1)
+  DEFINE_ONE_ELIF(4, 2)
+  DEFINE_ONE_ELIF(8, 4)
+  DEFINE_ONE_ELIF(12, 8)
+  DEFINE_ONE_ELIF(16, 12)
+  DEFINE_ONE_ELIF(20, 16)
+  DEFINE_ONE_ELIF(24, 20)
+  DEFINE_ONE_ELIF(28, 24)
+  DEFINE_ONE_ELIF(32, 28)
+#undef DEFINE_ONE_ELIF
+  else {
+    return cudaErrorInvalidValue;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+typename std::enable_if<pack_size == 2, cudaError_t>::type
+DispatchLayerNormWarpImplCols(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  if (cols <= 0) {
+    return cudaErrorInvalidValue;
+  }
+#define DEFINE_ONE_ELIF(thread_group_width)                                 \
+  else if (cols <= (thread_group_width)*pack_size) {                        \
+    if (rows % 2 == 0) {                                                    \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          2>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    } else {                                                                \
+      return DispatchLayerNormWarpImplPadding<                              \
+          LOAD,                                                             \
+          STORE,                                                            \
+          ComputeType,                                                      \
+          pack_size,                                                        \
+          pack_size,                                                        \
+          0,                                                                \
+          thread_group_width,                                               \
+          1>(stream, load, store, rows, cols, epsilon, mean, inv_variance); \
+    }                                                                       \
+  }
+  DEFINE_ONE_ELIF(4)
+  DEFINE_ONE_ELIF(8)
+  DEFINE_ONE_ELIF(16)
+  DEFINE_ONE_ELIF(32)
+#undef DEFINE_ONE_ELIF
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) { \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
+  }
+  DEFINE_ONE_ELIF(4, 2)
+  DEFINE_ONE_ELIF(8, 4)
+  DEFINE_ONE_ELIF(12, 8)
+  DEFINE_ONE_ELIF(16, 12)
+  DEFINE_ONE_ELIF(20, 16)
+  DEFINE_ONE_ELIF(24, 20)
+  DEFINE_ONE_ELIF(28, 24)
+  DEFINE_ONE_ELIF(32, 28)
+#undef DEFINE_ONE_ELIF
+  else {
+    return cudaErrorInvalidValue;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct DispatchLayerNormWarpImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance) {
+    if (cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return DispatchLayerNormWarpImplCols<LOAD, STORE, ComputeType, 2>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else {
+      return DispatchLayerNormWarpImplCols<LOAD, STORE, ComputeType, 1>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t DispatchLayerNormWarpImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  return DispatchLayerNormWarpImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void LayerNormBlockSMemImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[];
+  auto* buf = reinterpret_cast<ComputeType*>(shared_buf);
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    ComputeType thread_mean = 0;
+    ComputeType thread_m2 = 0;
+    ComputeType thread_count = 0;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      load.template load<pack_size>(pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        buf[i * num_packs + pack_id] = pack[i];
+        WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count);
+      }
+    }
+    ComputeType row_mean = 0;
+    ComputeType row_m2 = 0;
+    ComputeType row_count = 0;
+    WelfordBlockAllReduce<ComputeType>(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+    ComputeType row_variance =
+        max(Div(row_m2, row_count), static_cast<ComputeType>(0.0));
+    ComputeType row_inv_var =
+        Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+    if (mean && inv_variance && threadIdx.x == 0) {
+      mean[row] = row_mean;
+      inv_variance[row] = row_inv_var;
+    }
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var;
+      }
+      store.template store<pack_size>(pack, row, pack_id * pack_size);
+    }
+  }
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+inline cudaError_t LaunchLayerNormBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    int smem,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>,
+        block_size,
+        smem,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>
+      <<<grid_dim_x, block_size, smem, stream>>>(
+          load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+inline cudaError_t TryDispatchLayerNormBlockSMemImplBlockSize(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance,
+    bool* success) {
+  constexpr int block_size_conf_1 = 128;
+  constexpr int block_size_conf_2 = 256;
+  constexpr int block_size_conf_3 = 512;
+  constexpr int block_size_conf_4 = 1024;
+  const size_t smem = cols * sizeof(ComputeType);
+  int max_active_blocks_conf_1;
+
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_1,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_1>,
+        block_size_conf_1,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  if (max_active_blocks_conf_1 <= 0) {
+    *success = false;
+    return cudaSuccess;
+  }
+  int max_active_blocks_conf_4;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_4,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_4>,
+        block_size_conf_4,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_4 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_4>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  int max_active_blocks_conf_3;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_3,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_3>,
+        block_size_conf_3,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_3 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_3>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  int max_active_blocks_conf_2;
+  {
+    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_conf_2,
+        LayerNormBlockSMemImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size_conf_2>,
+        block_size_conf_2,
+        smem);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+
+  if (max_active_blocks_conf_2 == max_active_blocks_conf_1) {
+    *success = true;
+    return LaunchLayerNormBlockSMemImpl<
+        LOAD,
+        STORE,
+        ComputeType,
+        pack_size,
+        block_size_conf_2>(
+        stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+  }
+  *success = true;
+  return LaunchLayerNormBlockSMemImpl<
+      LOAD,
+      STORE,
+      ComputeType,
+      pack_size,
+      block_size_conf_1>(
+      stream, load, store, smem, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct TryDispatchLayerNormBlockSMemImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance,
+      bool* success) {
+    if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
+        CanPackAs<STORE>(store, 4)) {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          4>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    } else if (
+        cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          2>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    } else {
+      return TryDispatchLayerNormBlockSMemImplBlockSize<
+          LOAD,
+          STORE,
+          ComputeType,
+          1>(
+          stream,
+          load,
+          store,
+          rows,
+          cols,
+          epsilon,
+          mean,
+          inv_variance,
+          success);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t TryDispatchLayerNormBlockSMemImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance,
+    bool* success) {
+  return TryDispatchLayerNormBlockSMemImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance, success);
+}
+
+template <
+    typename LOAD,
+    typename STORE,
+    typename ComputeType,
+    int pack_size,
+    int block_size>
+__global__ void __launch_bounds__(1024) LayerNormBlockUncachedImpl(
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  const int tid = threadIdx.x;
+  assert(cols % pack_size == 0);
+  const int num_packs = static_cast<int>(cols) / pack_size;
+  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
+    ComputeType thread_mean = 0;
+    ComputeType thread_m2 = 0;
+    ComputeType thread_count = 0;
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      load.template load<pack_size>(pack, row, pack_id * pack_size);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count);
+      }
+    }
+    ComputeType row_mean = 0;
+    ComputeType row_m2 = 0;
+    ComputeType row_count = 0;
+    WelfordBlockAllReduce<ComputeType>(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+    ComputeType row_variance =
+        max(Div(row_m2, row_count), static_cast<ComputeType>(0.0));
+    ComputeType row_inv_var =
+        Rsqrt(row_variance + static_cast<ComputeType>(epsilon));
+    if (mean && inv_variance && threadIdx.x == 0) {
+      mean[row] = row_mean;
+      inv_variance[row] = row_inv_var;
+    }
+    for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
+      ComputeType pack[pack_size];
+      const int pack_offset = pack_id * pack_size;
+      load.template load<pack_size>(pack, row, pack_offset);
+#pragma unroll
+      for (int i = 0; i < pack_size; ++i) {
+        pack[i] = (pack[i] - row_mean) * row_inv_var;
+      }
+      store.template store<pack_size>(pack, row, pack_offset);
+    }
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
+inline cudaError_t LaunchLayerNormBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  constexpr int block_size = 1024;
+  constexpr int waves = 32;
+  int grid_dim_x;
+  {
+    cudaError_t err = GetNumBlocks(
+        LayerNormBlockUncachedImpl<
+            LOAD,
+            STORE,
+            ComputeType,
+            pack_size,
+            block_size>,
+        block_size,
+        0,
+        rows,
+        waves,
+        &grid_dim_x);
+    if (err != cudaSuccess) {
+      return err;
+    }
+  }
+  LayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, pack_size, block_size>
+      <<<grid_dim_x, block_size, 0, stream>>>(
+          load, store, rows, cols, epsilon, mean, inv_variance);
+  return cudaPeekAtLastError();
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+struct DispatchLayerNormBlockUncachedImplPackSize {
+  cudaError_t operator()(
+      cudaStream_t stream,
+      LOAD load,
+      STORE store,
+      const int64_t rows,
+      const int64_t cols,
+      const double epsilon,
+      ComputeType* mean,
+      ComputeType* inv_variance) {
+    if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
+        CanPackAs<STORE>(store, 4)) {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 4>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else if (
+        cols % 2 == 0 && CanPackAs<LOAD>(load, 2) &&
+        CanPackAs<STORE>(store, 2)) {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 2>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    } else {
+      return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 1>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+  }
+};
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline cudaError_t DispatchLayerNormBlockUncachedImpl(
+    cudaStream_t stream,
+    LOAD load,
+    STORE store,
+    const int64_t rows,
+    const int64_t cols,
+    const double epsilon,
+    ComputeType* mean,
+    ComputeType* inv_variance) {
+  return DispatchLayerNormBlockUncachedImplPackSize<LOAD, STORE, ComputeType>()(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline typename std::
+    enable_if<!std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNorm(
+        cudaStream_t stream,
+        LOAD load,
+        STORE store,
+        const int64_t rows,
+        const int64_t cols,
+        const double epsilon,
+        ComputeType* mean,
+        ComputeType* inv_variance) {
+  if (cols <= 1024) {
+    return DispatchLayerNormWarpImpl<LOAD, STORE, ComputeType>(
+        stream, load, store, rows, cols, epsilon, mean, inv_variance);
+  } else {
+    bool dispatch_smem_impl_success;
+    {
+      cudaError_t err =
+          TryDispatchLayerNormBlockSMemImpl<LOAD, STORE, ComputeType>(
+              stream,
+              load,
+              store,
+              rows,
+              cols,
+              epsilon,
+              mean,
+              inv_variance,
+              &dispatch_smem_impl_success);
+      if (err != cudaSuccess) {
+        return err;
+      }
+    }
+    if (!dispatch_smem_impl_success) {
+      return DispatchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType>(
+          stream, load, store, rows, cols, epsilon, mean, inv_variance);
+    }
+    return cudaSuccess;
+  }
+}
+
+template <typename LOAD, typename STORE, typename ComputeType>
+inline typename std::
+    enable_if<std::is_same<ComputeType, double>::value, cudaError_t>::type
+    DispatchLayerNorm(
+        cudaStream_t stream,
+        LOAD load,
+        STORE store,
+        const int64_t rows,
+        const int64_t cols,
+        const double epsilon,
+        ComputeType* mean,
+        ComputeType* inv_variance) {
+  return DispatchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType>(
+      stream, load, store, rows, cols, epsilon, mean, inv_variance);
+}
+
+// gradient kernels are omitted
+
+} // namespace layer_norm
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
index ec791eb0f..84882affd 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py
@@ -31,13 +31,21 @@
 
 FUNC_TEMPLATE = jinja2.Template(
     """
+#include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda_runtime.h>
+
+#include <cub/cub.cuh>
+#include "cutlass/arch/memory_sm80.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/fast_math.h"
 #include "logging.h"
+#include <math_constants.h>
+#include <assert.h>
 
 using bfloat16 = __nv_bfloat16;
+using bfloat16_2 = __nv_bfloat162;
 
 {{gamma_beta_const_defs}}
 
@@ -94,6 +102,30 @@
 )
 
 
+def _get_custom_libs():
+    target = Target.current()
+    if target._kwargs.get("layernorm_use_welford_algorithm", False):
+        custom_libs = "\n\n".join(
+            [
+                target.get_custom_libs(
+                    absolute_dir=os.path.dirname(__file__),
+                    filename="layer_norm.cuh",
+                ),
+                target.get_custom_libs(
+                    absolute_dir=os.path.dirname(__file__),
+                    filename="layernorm_welford.cuh",
+                ),
+            ]
+        )
+    else:
+        custom_libs = target.get_custom_libs(
+            absolute_dir=os.path.dirname(__file__),
+            filename="layernorm_sigmoid_mul_kernel.cuh",
+        )
+
+    return custom_libs
+
+
 @registry.reg("cuda.layernorm.gen_function")
 def layernorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     gamma_beta_const_defs = layernorm_common.gamma_beta_const_defs(func_attrs)
@@ -101,10 +133,9 @@ def layernorm_gen_function(func_attrs: Dict[str, Any]) -> str:
     elem_input_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
+
     return FUNC_TEMPLATE.render(
-        custom_libs=Target.current().get_custom_libs(
-            os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
-        ),
+        custom_libs=_get_custom_libs(),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         elem_input_type=elem_input_type,
@@ -127,9 +158,7 @@ def layernorm_sigmoid_mul_gen_function(func_attrs: Dict[str, Any]) -> str:
         func_attrs["inputs"][0]._attrs["dtype"]
     )
     return FUNC_TEMPLATE.render(
-        custom_libs=Target.current().get_custom_libs(
-            os.path.dirname(__file__), "layernorm_sigmoid_mul_kernel.cuh"
-        ),
+        custom_libs=_get_custom_libs(),
         tensor_accessor_libs=tensor_accessor_codegen.get_libs(),
         func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
         elem_input_type=elem_input_type,
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh
new file mode 100644
index 000000000..29e7be6c6
--- /dev/null
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh
@@ -0,0 +1,284 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#ifndef LAYERNORM_KERNEL_CUH
+#define LAYERNORM_KERNEL_CUH
+
+constexpr uint32_t kFinalMask = 0xffffffff;
+
+#ifndef __HALF_TO_US
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
+#endif
+
+#define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
+
+__device__ half fast_tanh(half x) {
+#if defined(AIT_USE_FAST_MATH)
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 750)
+
+  asm volatile("tanh.approx.f16 %0, %1;"
+               : "=h"(__HALF_TO_US(x))
+               : "h"(__HALF_TO_US(x)));
+  return x;
+
+#else
+  return half(cutlass::fast_tanh(float(x)));
+#endif
+#else
+  return half(tanhf(float(x)));
+#endif
+}
+
+__device__ bfloat16 fast_tanh(bfloat16 x) {
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 900) && defined(AIT_USE_FAST_MATH)
+  asm volatile("tanh.approx.bf16 %0, %1;"
+               : "=h"(__HALF_TO_US(x))
+               : "h"(__HALF_TO_US(x)));
+  return x;
+
+#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+#if defined(AIT_USE_FAST_MATH)
+  return cutlass::fast_tanh(float(x));
+#else
+  return bfloat16(tanhf(float(x)));
+#endif
+#else
+  NOT_IMPLEMENTED();
+#endif
+}
+
+#define CUDA_FP16_ONE_HALF \
+  __half_raw {             \
+    0x3800u                \
+  }
+#define CUDA_FP16_ONE \
+  __half_raw {        \
+    0x3c00u           \
+  }
+#define CUDA_BF16_ONE_HALF \
+  __nv_bfloat16_raw {      \
+    0x3f00u                \
+  }
+#define CUDA_BF16_ONE \
+  __nv_bfloat16_raw { \
+    0x3f80u           \
+  }
+
+__device__ float sigmoid(const float a) {
+  return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;
+}
+
+__device__ half hsigmoid(const half a) {
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
+      CUDA_FP16_ONE_HALF);
+}
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+__device__ bfloat16 bf16sigmoid(const bfloat16 a) {
+  return __hmul(
+      (__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
+      CUDA_BF16_ONE_HALF);
+}
+#endif
+
+template <typename T>
+struct FSigmoid {
+  __inline__ __device__ T operator()(const T input) const;
+};
+
+template <>
+struct FSigmoid<half> {
+  __inline__ __device__ half operator()(const half a) const {
+    return hsigmoid(a);
+  }
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \
+    (__CUDA_ARCH__ >= 800)
+template <>
+struct FSigmoid<bfloat16> {
+  __inline__ __device__ bfloat16 operator()(const bfloat16 a) const {
+    return bf16sigmoid(a);
+  }
+};
+#endif
+
+template <>
+struct FSigmoid<float> {
+  __inline__ __device__ float operator()(const float a) const {
+    return sigmoid(a);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// The Layernorm implementation below is based on OneFlow's Layernorm
+// implementation at:
+// https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/core/cuda/layer_norm.cuh
+
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+template <typename SRC, typename DST>
+struct TensorAccessorLoad {
+  TensorAccessorLoad(
+      const SRC* src,
+      int64_t row_size,
+      const TensorAccessor input_accessor)
+      : src(src), row_size(row_size), input_accessor(input_accessor) {}
+
+  template <int N>
+  __device__ void load(DST* dst, int64_t row, int64_t col) const {
+    layer_norm::Pack<SRC, N> pack;
+    pack.storage =
+        *input_accessor.get<const SRC, const layer_norm::PackType<SRC, N>>(
+            reinterpret_cast<const layer_norm::PackType<SRC, N>*>(src),
+            (row * row_size + col) / N);
+
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      dst[i] = static_cast<DST>(pack.elem[i]);
+    }
+  }
+
+  bool CanPackAs(size_t pack_size) {
+    return row_size % pack_size == 0 &&
+        input_accessor.max_alignment() % pack_size == 0;
+  }
+
+  const SRC* src;
+  int64_t row_size;
+  const TensorAccessor input_accessor;
+};
+
+template <typename SRC, typename DST, bool FuseSigmoidMul>
+struct TensorAccessorStore {
+  TensorAccessorStore(
+      DST* y,
+      const DST* x,
+      const DST* gamma,
+      const DST* beta,
+      int64_t row_size,
+      const TensorAccessor input_accessor,
+      const TensorAccessor output_accessor)
+      : y(y),
+        x(x),
+        gamma(gamma),
+        beta(beta),
+        row_size(row_size),
+        input_accessor(input_accessor),
+        output_accessor(output_accessor) {}
+
+  template <int N>
+  __device__ void store(const SRC* src, int64_t row, int64_t col) {
+    layer_norm::Pack<DST, N> x_pack;
+    layer_norm::Pack<DST, N> y_pack;
+
+    if constexpr (FuseSigmoidMul) {
+      x_pack.storage =
+          *input_accessor.get<const DST, const layer_norm::PackType<DST, N>>(
+              reinterpret_cast<const layer_norm::PackType<DST, N>*>(x),
+              (row * row_size + col) / N);
+    }
+
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      SRC normalized_i = src[i];
+
+#ifdef AIT_LAYERNORM_CONST_GAMMA
+      const SRC gamma_val = AIT_LAYERNORM_CONST_GAMMA;
+#else
+      const SRC gamma_val = static_cast<SRC>(gamma[col + i]);
+#endif // AIT_LAYERNORM_CONST_GAMMA
+
+#ifdef AIT_LAYERNORM_CONST_BETA
+      const SRC beta_val = AIT_LAYERNORM_CONST_BETA;
+#else
+      const SRC beta_val = static_cast<SRC>(beta[col + i]);
+#endif // AIT_LAYERNORM_CONST_BETA
+
+      normalized_i = normalized_i * gamma_val + beta_val;
+
+      if constexpr (FuseSigmoidMul) {
+        FSigmoid<SRC> fsigmoid;
+        normalized_i =
+            static_cast<SRC>(x_pack.elem[i]) * fsigmoid(normalized_i);
+      }
+
+      y_pack.elem[i] = DST(normalized_i);
+    }
+
+    *output_accessor.get<DST, layer_norm::PackType<DST, N>>(
+        reinterpret_cast<layer_norm::PackType<DST, N>*>(y),
+        (row * row_size + col) / N) = y_pack.storage;
+  }
+
+  bool CanPackAs(size_t pack_size) {
+    return row_size % pack_size == 0 &&
+        output_accessor.max_alignment() % pack_size == 0;
+  }
+
+  DST* y;
+  const DST* x;
+  const DST* gamma;
+  const DST* beta;
+  int64_t row_size;
+  const TensorAccessor input_accessor;
+  const TensorAccessor output_accessor;
+};
+
+template <typename TInput, typename TCompute, bool FuseSigmoidMul>
+cudaError_t invokeLayernormSigmoidMul(
+    TInput* output,
+    const TInput* input,
+    const TInput* gamma,
+    const TInput* beta,
+    int m,
+    int n,
+    const float eps,
+    cudaStream_t stream,
+    const TensorAccessor& input_accessor,
+    const TensorAccessor& output_accessor) {
+  TensorAccessorLoad<TInput, TCompute> load(input, n, input_accessor);
+  TensorAccessorStore<TCompute, TInput, FuseSigmoidMul> store(
+      output, input, gamma, beta, n, input_accessor, output_accessor);
+
+  // mean and inv_variance are not required for forward pass, hence omitted
+  layer_norm::DispatchLayerNorm<decltype(load), decltype(store), TCompute>(
+      stream,
+      load,
+      store,
+      m /* rows */,
+      n /* cols */,
+      eps /* epsilon */,
+      nullptr /* mean */,
+      nullptr /* inv_variance */);
+
+  return cudaGetLastError();
+}
+
+#endif /* LAYERNORM_KERNEL_CUH */
diff --git a/tests/unittest/compiler/test_strided_layernorm.py b/tests/unittest/compiler/test_strided_layernorm.py
index c119c85d9..9bf86cbc3 100644
--- a/tests/unittest/compiler/test_strided_layernorm.py
+++ b/tests/unittest/compiler/test_strided_layernorm.py
@@ -39,8 +39,11 @@ def build_ait_module(
     ait_dtype="float16",
     workdir="./tmp",
     test_name="strided_layernorm",
+    use_welford_algorithm=False,
 ):
-    target = detect_target()
+    target = detect_target(
+        layernorm_use_welford_algorithm=use_welford_algorithm,
+    )
     X0 = Tensor(
         shape=[
             shape_utils.gen_int_var_min_max(values=batch_sizes, name="input_batch"),
@@ -88,7 +91,7 @@ def build_ait_module(
         output,
         target,
         workdir,
-        test_name,
+        f"{test_name}_{test_id}",
         dll_name=dll_name,
     )
 
@@ -151,6 +154,8 @@ def _test_slice_layer_norm(
         start_indices: List[int] = (0,),
         end_indices: List[int] = (None,),
         dtype: str = "float16",
+        test_name="test_slice_layer_norm",
+        use_welford_algorithm=False,
     ):
 
         input_rank = 1 + len(input_nonbatch_shape)
@@ -175,6 +180,8 @@ def _test_slice_layer_norm(
             **_layernorm_common_params,
             test_id=self._test_id,
             ait_dtype=dtype,
+            test_name=f"{test_name}_{dtype}",
+            use_welford_algorithm=use_welford_algorithm,
         )
         self._test_id += 1
         pt_dtype = torch_utils.string_to_torch_dtype(dtype)
@@ -187,29 +194,30 @@ def _test_slice_layer_norm(
             }
             ait_outputs = {"output": torch.empty_like(pt_tensors["output"])}
             ait_module.run_with_tensors(ait_inputs, ait_outputs)
-
-            self.assertTrue(
-                torch.allclose(
-                    ait_outputs["output"], pt_tensors["output"], atol=1e-3, rtol=1e-3
-                )
+            torch.testing.assert_close(
+                ait_outputs["output"],
+                pt_tensors["output"],
+                atol=1e-3,
+                rtol=1e-3,
             )
 
     def _test_slice_layer_norm_kernels(
         self,
         **kwargs,
     ):
-        for start_indices, end_indices, input_nonbatch_shape in (
+        for start_indices, end_indices, input_nonbatch_shape, use_welford_algorithm in (
             # (cuda-half4) kernel
-            ((0, 0, 0, 4), (None, None, None, 36), (4, 1, 40)),
+            ((0, 0, 0, 4), (None, None, None, 36), (4, 1, 40), False),
             # (generic n < 1024) kernel
-            ((0, 0, 0, 11), (None, None, None, 13), (4, 1, 15)),
+            ((0, 0, 0, 11), (None, None, None, 13), (4, 1, 15), False),
             # (cuda-half; block size = 512) kernel
-            ((0, 0, 0, 1), (None, None, None, 1026), (4, 1, 1027)),
+            ((0, 0, 0, 1), (None, None, None, 1026), (4, 1, 1027), True),
         ):
             self._test_slice_layer_norm(
                 start_indices=start_indices,
                 end_indices=end_indices,
                 input_nonbatch_shape=input_nonbatch_shape,
+                use_welford_algorithm=use_welford_algorithm,
                 **kwargs,
             )
 
@@ -217,18 +225,19 @@ def _test_middle_slice_layer_norm_kernels(
         self,
         **kwargs,
     ):
-        for start_indices, end_indices, input_nonbatch_shape in (
+        for start_indices, end_indices, input_nonbatch_shape, use_welford_algorithm in (
             # (cuda-half4) kernel
-            ((0, 0, 4, 0), (None, None, 36, None), (2, 40, 4)),
+            ((0, 0, 4, 0), (None, None, 36, None), (2, 40, 4), False),
             # (generic n < 1024) kernel
-            ((0, 0, 11, 0), (None, None, 13, None), (2, 15, 2)),
+            ((0, 0, 11, 0), (None, None, 13, None), (2, 15, 2), True),
             # (cuda-half; block size = 512) kernel
-            ((0, 0, 1, 0), (None, None, 1026, None), (2, 1027, 2)),
+            ((0, 0, 1, 0), (None, None, 1026, None), (2, 1027, 2), False),
         ):
             self._test_slice_layer_norm(
                 start_indices=start_indices,
                 end_indices=end_indices,
                 input_nonbatch_shape=input_nonbatch_shape,
+                use_welford_algorithm=use_welford_algorithm,
                 **kwargs,
             )
 
@@ -247,6 +256,7 @@ def test_slice_layer_norm_float16(self):
                 gamma_is_none=gamma_is_none,
                 beta_is_none=beta_is_none,
                 fuse_sigmoid_mul=False,
+                test_name="test_slice_layer_norm_float16",
             )
 
     def test_middle_slice_layer_norm_float16(self):
@@ -264,6 +274,7 @@ def test_middle_slice_layer_norm_float16(self):
                 gamma_is_none=gamma_is_none,
                 beta_is_none=beta_is_none,
                 fuse_sigmoid_mul=False,
+                test_name="test_middle_slice_layer_norm_float16",
             )
 
     def test_slice_layer_norm_fuse_sigmoid_mul_float16(self):
@@ -281,6 +292,7 @@ def test_slice_layer_norm_fuse_sigmoid_mul_float16(self):
                 gamma_is_none=gamma_is_none,
                 beta_is_none=beta_is_none,
                 fuse_sigmoid_mul=True,
+                test_name="test_slice_layer_norm_fuse_sigmoid_mul_float16",
             )
 
     def test_middle_slice_layer_norm_fuse_sigmoid_mul_float16(self):
@@ -298,6 +310,7 @@ def test_middle_slice_layer_norm_fuse_sigmoid_mul_float16(self):
                 gamma_is_none=gamma_is_none,
                 beta_is_none=beta_is_none,
                 fuse_sigmoid_mul=True,
+                test_name="test_middle_slice_layer_norm_fuse_sigmoid_mul_float16",
             )
 
     @unittest.skipIf(
@@ -310,6 +323,7 @@ def test_slice_layer_norm_float32(self):
             beta_is_none=True,
             fuse_sigmoid_mul=False,
             dtype="float32",
+            test_name="test_slice_layer_norm_float32_1",
         )
         self._test_middle_slice_layer_norm_kernels(
             n_normalize_over_last_dims=2,
@@ -317,6 +331,7 @@ def test_slice_layer_norm_float32(self):
             beta_is_none=False,
             fuse_sigmoid_mul=False,
             dtype="float32",
+            test_name="test_slice_layer_norm_float32_2",
         )
         self._test_slice_layer_norm_kernels(
             n_normalize_over_last_dims=3,
@@ -324,6 +339,7 @@ def test_slice_layer_norm_float32(self):
             beta_is_none=True,
             fuse_sigmoid_mul=True,
             dtype="float32",
+            test_name="test_slice_layer_norm_float32_3",
         )
         self._test_middle_slice_layer_norm_kernels(
             n_normalize_over_last_dims=2,
@@ -331,6 +347,7 @@ def test_slice_layer_norm_float32(self):
             beta_is_none=False,
             fuse_sigmoid_mul=True,
             dtype="float32",
+            test_name="test_slice_layer_norm_float32_4",
         )
 
 
diff --git a/tests/unittest/ops/test_layernorm.py b/tests/unittest/ops/test_layernorm.py
index 9dbf2e118..583f55fa6 100644
--- a/tests/unittest/ops/test_layernorm.py
+++ b/tests/unittest/ops/test_layernorm.py
@@ -44,6 +44,7 @@ def _test_layernorm(
         atol=1e-3,
         rtol=1e-3,
         dtype="float16",
+        use_welford_algorithm=False,
     ):
         torch_dtype = string_to_torch_dtype(dtype)
         BS = [1, 1024]
@@ -96,9 +97,17 @@ def _test_layernorm(
         X4._attrs["is_output"] = True
         X4._attrs["name"] = "output"
 
-        target = detect_target()
+        target = detect_target(
+            layernorm_use_welford_algorithm=use_welford_algorithm,
+        )
         dll_name = f"test_{self.test_count}.so"
-        module = compile_model(X4, target, "./tmp", "layernorm", dll_name=dll_name)
+        module = compile_model(
+            X4,
+            target,
+            "./tmp",
+            f"layernorm_{dtype}",
+            dll_name=dll_name,
+        )
 
         for batch_size in [50, 900, 1024]:
             x1_pt = torch.randn(batch_size, *MS, *NS, dtype=torch_dtype).cuda()
@@ -122,47 +131,53 @@ def _test_layernorm(
             torch.testing.assert_close(x4, x4_pt, atol=atol, rtol=rtol)
             self.test_count += 1
 
-    def test_layernorm(self):
-        if detect_target().name() == "rocm":
-            self._test_layernorm(use_size_op=False, MS=(256,), NS=(768,))
-            self._test_layernorm(use_size_op=False, MS=(), NS=(768,))
-            self._test_layernorm(
-                use_size_op=False,
-                MS=(
-                    256,
-                    3,
-                ),
-                NS=(256,),
-            )
-        else:
-            for use_size_op in (True, False):
-                self._test_layernorm(use_size_op=use_size_op)
-                self._test_layernorm(gamma_is_none=True, use_size_op=use_size_op)
-                self._test_layernorm(beta_is_none=True, use_size_op=use_size_op)
+    def test_layernorm_fp16(self):
+        for use_size_op in (True, False):
+            self._test_layernorm(use_size_op=use_size_op)
+            self._test_layernorm(gamma_is_none=True, use_size_op=use_size_op)
+            self._test_layernorm(use_size_op=use_size_op, eps=0.1)
+            self._test_layernorm(MS=(16, 64), NS=(4, 32), use_size_op=use_size_op)
+
+            for use_welford_algorithm in (True, False):
+                self._test_layernorm(
+                    beta_is_none=True,
+                    use_size_op=use_size_op,
+                    use_welford_algorithm=use_welford_algorithm,
+                )
                 self._test_layernorm(
-                    gamma_is_none=True, beta_is_none=True, use_size_op=use_size_op
+                    gamma_is_none=True,
+                    beta_is_none=True,
+                    use_size_op=use_size_op,
+                    use_welford_algorithm=use_welford_algorithm,
                 )
-                self._test_layernorm(use_size_op=use_size_op, eps=0.1)
-                self._test_layernorm(MS=(16, 64), NS=(4, 32), use_size_op=use_size_op)
                 self._test_layernorm(
-                    MS=(16, 8, 4), NS=(2, 4, 32), use_size_op=use_size_op
+                    MS=(16, 8, 4),
+                    NS=(2, 4, 32),
+                    use_size_op=use_size_op,
+                    use_welford_algorithm=use_welford_algorithm,
                 )
 
-    @unittest.skipIf(
-        detect_target().name() == "rocm", "fp32 layer norm is not supported on ROCm"
-    )
-    def test_layernorm_fp32(self):
+    def test_layernorm_rocm(self):
+        self._test_layernorm(use_size_op=False, MS=(256,), NS=(768,))
+        self._test_layernorm(use_size_op=False, MS=(), NS=(768,))
+        self._test_layernorm(use_size_op=False, MS=(256, 3), NS=(256,))
+
+    def test_layernorm_fp32_sm80(self):
         self._test_layernorm(dtype="float32")
         self._test_layernorm(gamma_is_none=True, dtype="float32")
         self._test_layernorm(beta_is_none=True, dtype="float32")
         self._test_layernorm(gamma_is_none=True, beta_is_none=True, dtype="float32")
         self._test_layernorm(eps=0.1, dtype="float32")
         self._test_layernorm(MS=(16, 64), NS=(4, 32), dtype="float32")
-        self._test_layernorm(MS=(16, 8, 4), NS=(2, 4, 32), dtype="float32")
 
-    @unittest.skipIf(
-        detect_target().name() == "rocm", "fp32 layer norm is not supported on ROCm"
-    )
+        for use_welford_algorithm in (True, False):
+            self._test_layernorm(
+                MS=(16, 8, 4),
+                NS=(2, 4, 32),
+                dtype="float32",
+                use_welford_algorithm=use_welford_algorithm,
+            )
+
     def test_layernorm_bf16(self):
         self._test_layernorm(dtype="bfloat16", atol=1e-2, rtol=1e-2)
         self._test_layernorm(gamma_is_none=True, dtype="bfloat16", atol=1e-2, rtol=1e-2)
@@ -178,12 +193,20 @@ def test_layernorm_bf16(self):
         self._test_layernorm(
             MS=(16, 64), NS=(4, 32), dtype="bfloat16", atol=1e-2, rtol=1e-2
         )
-        self._test_layernorm(
-            MS=(16, 8, 4), NS=(2, 4, 32), dtype="bfloat16", atol=1e-2, rtol=1e-2
-        )
+
+        for use_welford_algorithm in (True, False):
+            self._test_layernorm(
+                MS=(16, 8, 4),
+                NS=(2, 4, 32),
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+                use_welford_algorithm=use_welford_algorithm,
+            )
 
 
 filter_test_cases_by_test_env(LayernormTestCase)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_layernorm_sigmoid_mul.py b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
index 1c383393f..4d8b621f0 100644
--- a/tests/unittest/ops/test_layernorm_sigmoid_mul.py
+++ b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
@@ -46,6 +46,7 @@ def _test_fused_layernorm_sigmoid_mul(
         rtol=1e-2,
         eps=1e-5,
         dtype="float16",
+        use_welford_algorithm=False,
     ):
         logging.info(
             f"_test_fused_layernorm_sigmoid_mul: M={MS}, N={NS}, "
@@ -95,12 +96,14 @@ def _test_fused_layernorm_sigmoid_mul(
         X6._attrs["is_output"] = True
         X6._attrs["name"] = "output"
 
-        target = detect_target()
+        target = detect_target(
+            layernorm_use_welford_algorithm=use_welford_algorithm,
+        )
         with compile_model(
             X6,
             target,
             "./tmp",
-            f"fused_layernorm_sigmoid_mul_test_{self._test_id}",
+            f"fused_layernorm_sigmoid_mul_test_{dtype}_{self._test_id}",
         ) as module:
             self._test_id += 1
             for batch_size in [50, 900, 1024]:
@@ -184,20 +187,24 @@ def test_fused_layernorm_sigmoid_mul_fp16(self):
             eps=eps,
             dtype="float16",
         )
-        # block_size = 512 kernel
-        self._test_fused_layernorm_sigmoid_mul(
-            MS=(2, 4),
-            NS=(1055, 5),
-            eps=eps,
-            dtype="float16",
-        )
 
-        self._test_fused_layernorm_sigmoid_mul(
-            NS=(1496,),
-            gamma_is_none=True,
-            beta_is_none=True,
-            dtype="float16",
-        )
+        for use_welford_algorithm in (True, False):
+            # block_size = 512 kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                MS=(2, 4),
+                NS=(1055, 5),
+                eps=eps,
+                dtype="float16",
+                use_welford_algorithm=use_welford_algorithm,
+            )
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                gamma_is_none=True,
+                beta_is_none=True,
+                dtype="float16",
+                use_welford_algorithm=use_welford_algorithm,
+            )
+
         self._test_fused_layernorm_sigmoid_mul(
             NS=(515,),
             gamma_is_none=True,
@@ -271,20 +278,24 @@ def test_fused_layernorm_sigmoid_mul_fp32(self):
             eps=eps,
             dtype="float32",
         )
-        # block_size = 512 kernel
-        self._test_fused_layernorm_sigmoid_mul(
-            MS=(2, 4),
-            NS=(1055, 5),
-            eps=eps,
-            dtype="float32",
-        )
 
-        self._test_fused_layernorm_sigmoid_mul(
-            NS=(1496,),
-            gamma_is_none=True,
-            beta_is_none=True,
-            dtype="float32",
-        )
+        for use_welford_algorithm in (True, False):
+            # block_size = 512 kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                MS=(2, 4),
+                NS=(1055, 5),
+                eps=eps,
+                dtype="float32",
+                use_welford_algorithm=use_welford_algorithm,
+            )
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                gamma_is_none=True,
+                beta_is_none=True,
+                dtype="float32",
+                use_welford_algorithm=use_welford_algorithm,
+            )
+
         self._test_fused_layernorm_sigmoid_mul(
             NS=(515,),
             gamma_is_none=True,
@@ -345,24 +356,28 @@ def test_fused_layernorm_sigmoid_mul_bf16(self):
             atol=1e-2,
             rtol=1e-2,
         )
-        # block_size = 512 kernel
-        self._test_fused_layernorm_sigmoid_mul(
-            MS=(2, 4),
-            NS=(1055, 5),
-            eps=eps,
-            dtype="bfloat16",
-            atol=1e-2,
-            rtol=1e-2,
-        )
 
-        self._test_fused_layernorm_sigmoid_mul(
-            NS=(1496,),
-            gamma_is_none=True,
-            beta_is_none=True,
-            dtype="bfloat16",
-            atol=1e-2,
-            rtol=1e-2,
-        )
+        for use_welford_algorithm in (True, False):
+            # block_size = 512 kernel
+            self._test_fused_layernorm_sigmoid_mul(
+                MS=(2, 4),
+                NS=(1055, 5),
+                eps=eps,
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+                use_welford_algorithm=use_welford_algorithm,
+            )
+            self._test_fused_layernorm_sigmoid_mul(
+                NS=(1496,),
+                gamma_is_none=True,
+                beta_is_none=True,
+                dtype="bfloat16",
+                atol=1e-2,
+                rtol=1e-2,
+                use_welford_algorithm=use_welford_algorithm,
+            )
+
         self._test_fused_layernorm_sigmoid_mul(
             NS=(515,),
             gamma_is_none=True,

From f64cb6ea568cf168862a7450ac5bf8df5be2009c Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 17 May 2023 18:29:06 +0800
Subject: [PATCH 519/638] fix profiler group bug

---
 .../aitemplate/compiler/ops/gemm_universal/gemm_common.py   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 3abb2acc3..7016bc662 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -49,6 +49,7 @@
 )
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.utils import alignment, environ
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
@@ -856,6 +857,11 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
 
 
 def _profiler_results_groupby_key(instance):
+    if detect_target().name() == "rocm":
+        return (
+            instance[1]["op"],  # unique op name
+            instance[3],  # profiler key (gemm shape)
+        )
     return (
         instance[1]["name"],  # unique op name
         instance[2],  # profiler executable

From 3c2825890ec6029d20d9e880d78aa19fe328e6c0 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 17 May 2023 18:32:03 +0800
Subject: [PATCH 520/638] fix bugs

---
 .../rocm/attention/mem_eff_attention.py       |  4 +-
 .../ops/attention/mem_eff_attention.py        | 12 +++--
 .../ops/gemm_universal/gemm_common.py         |  6 +++
 python/aitemplate/frontend/nn/attention.py    | 51 ++++++++++++-------
 4 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
index f726598c9..d8c636684 100644
--- a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
@@ -158,7 +158,7 @@
 {{func_signature}}
 {
 
-    bool input_permute = false;
+    bool input_permute = true;
     bool output_permute = true;
     
     auto a_element_op    = AElementOp{};
@@ -341,7 +341,7 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
 
     batch_size = q.shape()[0]._attrs["name"]
 
-    num_heads = q._attrs["shape"][1]._attrs["values"][0]
+    num_heads = q._attrs["shape"][2]._attrs["values"][0]
     head_dim = q._attrs["shape"][3]._attrs["values"][0]
     
     softmax_scale = head_dim ** (-0.5)
diff --git a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
index f57aab840..9bc58de3a 100644
--- a/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
+++ b/python/aitemplate/compiler/ops/attention/mem_eff_attention.py
@@ -134,9 +134,15 @@ def __call__(
 
         Parameters
         ----------
-        qkv : float16
-            QKV tensor
-            shape: (b, seqlen, num_heads, Kv)
+        q : float16
+            Q tensor
+            shape: (b, seqlen, num_heads, head_dim)
+        k : float16
+            K tensor
+            shape: (b, seqlen, num_heads, head_dim)
+        v : float16
+            V tensor
+            shape: (b, seqlen, num_heads, head_dim)
 
         Returns
         ----------
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 4df8c827d..afe550661 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -49,6 +49,7 @@
 )
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.utils import alignment, environ
+from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
@@ -867,6 +868,11 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
 
 
 def _profiler_results_groupby_key(instance):
+    if detect_target().name() == "rocm":
+        return (
+            instance[1]["op"],  # unique op name
+            instance[3],  # profiler key (gemm shape)
+        )
     return (
         instance[1]["name"],  # unique op name
         instance[2],  # profiler executable
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 4ee2ca290..57048111c 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -357,17 +357,38 @@ def attention(self, q, k, v, seqlens=None):
         query = self.query(q)
         key = self.key(k)
         value = self.value(v)
+        if detect_target().name() == "cuda":
+            query = ops.permute()(
+                ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
+            )
+            key = ops.permute()(
+                ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
+            )
+            value = ops.permute()(
+                ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
+                [0, 2, 1, 3],
+            )
+        elif seqlens is None:
+            query = ops.reshape()(query, [batch, -1, self.num_heads, head_dim])
+            query = ops.transpose()(query, 1, 2)
+            query = ops.reshape()(query, [-1, query.shape()[2], head_dim])
+            key = ops.reshape()(key, [batch, -1, self.num_heads, head_dim])
+            key = ops.transpose()(key, 1, 2)
+            key = ops.reshape()(key, [-1, key.shape()[2], head_dim])
+            value = ops.reshape()(value, [batch, -1, self.num_heads, head_dim])
+            value = ops.transpose()(value, 1, 2)
+            value = ops.reshape()(value, [-1, value.shape()[2], head_dim])  
+            OP = ops.bmm_softmax_bmm_permute(
+                shape=(self.num_heads,),
+                scale=head_dim**-0.5,
+                causal=self.causal,
+            )
+            return OP(query, key, value)
+        else:
+            query = ops.reshape()(query, [batch, -1, self.num_heads, head_dim])
+            key = ops.reshape()(key, [batch, -1, self.num_heads, head_dim])
+            value = ops.reshape()(value, [batch, -1, self.num_heads, head_dim])
 
-        query = ops.permute()(
-            ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
-        )
-        key = ops.permute()(
-            ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
-        )
-        value = ops.permute()(
-            ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
-            [0, 2, 1, 3],
-        )
         return self.op(query, key, value, seqlens)
 
     def forward(self, *args, seqlens=None):
@@ -378,15 +399,7 @@ def forward(self, *args, seqlens=None):
         if detect_target().name() == "cuda":
             attn_output = self.attention(args[0], args[1], args[2])
         else:
-            if seqlens:
-                attn_output = self.attention(args[0], args[1], args[2], seqlens)
-            else:
-                OP = ops.bmm_softmax_bmm_permute(
-                    shape=(self.num_heads,),
-                    scale=(self.dim // self.num_heads)**-0.5,
-                    causal=self.causal,
-                )
-                attn_output = OP(*args)
+            attn_output = self.attention(args[0], args[1], args[2], seqlens)
                 
         attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
 

From 334877d209b0beecc725106369581070dd8d48b1 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 17 May 2023 05:12:03 -0700
Subject: [PATCH 521/638] CUDA debug log utility class (#688)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/688

It is extremely cumbersome to do printf debugging in CUDA Kernels. Part of the reason for that is that it's not easily possible to concatenate formatted strings to be logged, or to log array values. Writing multiple printf statements after each other is no alternative either, given that due to CUDA multithreading, these statements will not be directly following each other.

std::string, std::stringstream and sprintf or similar functions are not available in CUDA device code.

I wrote a small helper class ( utilizing some code from the BSD/Apache cuda-kat library ) which allows to comfortably log debug info within CUDA Kernel code, including

  * printf-style format strings
  * Float array values
  * Integer array values
  * Static type names ( important when logging Cutlass template arguments )

Reviewer Notes:

This is a debugging utility. It's not intended to be ever present in production-level code. Nevertheless I think we should add it to the codebase similar to debug_settings.py and DEBUG_LOG facilities etc.

The function "kat_printf.h" is a single-header version of relevant parts of the cuda-kat library, providing an implementation of snprintf.

The "debug_string.h" class is what should actually be included in the code to be debugged. It provides the class DebugString which accepts an optional integer template argument specifying the size of the  string.

The following is actual example code and example log output.

The following code was placed within a CUDA Kernel that's part of CUTLASS:

  if (threadIdx.x==0) {
            DebugString<512> debugstr;
            debugstr.append_str("[KDEBUG ");
            debugstr.append_threadinfo();
            debugstr.snprintf("]: batch_idx=%d, A0 data=", batch_idx);
            debugstr.append_float_array(params.ref_A0.data(), 0, 10);
            debugstr.append_str(" B2bMma::IteratorA0 type=");
            debugstr.append_types<B2bMma::IteratorA0>();
            debugstr.println();
  }

And resulted in log output like this:

KDEBUG thread(0/128) grid=(1/4, 0/1, 2/0))]: batch_idx=2, A0 data=[0.745117, 1.690430, 1.350586, 1.008789, 1.706055, 0.270508, 1.736328, 1.785156, 0.584961, 1.068359] B2bMma::IteratorA0 type=[with Args = cutlass::transform::threadblock::PredicatedTileAccessIterator<cutlass::MatrixShape<64, 32>, cutlass::half_t, cutlass::layout::RowMajor, 1, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<32, 64>, 128, cutlass::PitchLinearShape<4, 8>, 8>, cutlass::Array<cutlass::half_t, 8, false
[KDEBUG thread(0/128) grid=(1/4, 0/1, 1/0))]: batch_idx=1, A0 data=[0.745117, 1.690430, 1.350586, 1.008789, 1.706055, 0.270508, 1.736328, 1.785156, 0.584961, 1.068359] B2bMma::IteratorA0 type=[with Args = cutlass::transform::threadblock::PredicatedTileAccessIterator<cutlass::MatrixShape<64, 32>, cutlass::half_t, cutlass::layout::RowMajor, 1, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<32, 64>, 128, cutlass::PitchLinearShape<4, 8>, 8>, cutlass::Array<cutlass::half_t, 8, false

Reviewed By: chenyang78

Differential Revision: D45812964

fbshipit-source-id: 84918eb58828b94a1278cb78b48a94fb6c2c67ac
---
 static/include/kernels/debug_string.h |  202 ++++
 static/include/kernels/kat_printf.h   | 1262 +++++++++++++++++++++++++
 2 files changed, 1464 insertions(+)
 create mode 100644 static/include/kernels/debug_string.h
 create mode 100644 static/include/kernels/kat_printf.h

diff --git a/static/include/kernels/debug_string.h b/static/include/kernels/debug_string.h
new file mode 100644
index 000000000..e6b897cae
--- /dev/null
+++ b/static/include/kernels/debug_string.h
@@ -0,0 +1,202 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+#pragma once
+#include <cutlass/cutlass.h>
+#include "kat_printf.h"
+
+// Helper functions for debug logging, these
+// make it easier to create meaningful debug log entries, especially
+// from within CUDA code.
+template <size_t SIZE = 255>
+struct DebugString {
+  char buffer[SIZE + 1];
+  const size_t size;
+  size_t pos;
+
+  CUTLASS_HOST_DEVICE DebugString() : size{SIZE}, pos{0} {
+    buffer[size] = '\0';
+    buffer[0] = '\0';
+  }
+
+  CUTLASS_HOST_DEVICE void reset() {
+    pos = 0;
+    buffer[size] = '\0';
+    buffer[0] = '\0';
+  }
+
+  CUTLASS_HOST_DEVICE void terminate() {
+    if (pos < size) {
+      buffer[pos] = '\0';
+    }
+  }
+
+  CUTLASS_DEVICE int snprintf(const char* format, ...) {
+    va_list args;
+    va_start(args, format);
+    int ret = 0;
+    if (size > pos) {
+      ret = kat::vsnprintf(buffer + pos, size - pos - 1, format, args);
+      pos += ret;
+      if (pos >= size) {
+        pos = size - 1;
+      }
+    }
+    va_end(args);
+    this->terminate();
+    return ret;
+  }
+
+  CUTLASS_DEVICE int append_str(const char* str) {
+    int spos = 0;
+    while ((str[spos] != '\0') && (pos < size - 1)) {
+      buffer[pos++] = str[spos++];
+    }
+    this->terminate();
+    return spos;
+  }
+
+  CUTLASS_DEVICE int append_str(const char* str, int max_len) {
+    int spos = 0;
+    while ((str[spos] != '\0') && (pos < size - 1)) {
+      buffer[pos++] = str[spos++];
+      if (spos >= max_len) {
+        break;
+      }
+    }
+    this->terminate();
+    return spos;
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_float_array(T* arr, int start, int n) {
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%f", static_cast<float>(arr[i]));
+    }
+    this->append_str("]");
+    this->terminate();
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_int_array(T* arr, int start, int n) {
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%d", static_cast<int>(arr[i]));
+    }
+    this->append_str("]");
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_float_array(T& arr, int start, int n) {
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%f", static_cast<float>(arr[i]));
+    }
+    this->append_str("]");
+    this->terminate();
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_int_array(T& arr, int start, int n) {
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%d", static_cast<int>(arr[i]));
+    }
+    this->append_str("]");
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_float_array_from_ptr_to_array(
+      T* arr,
+      int start,
+      int n) {
+    // cutlass TileAccessIterator.get returns a pointer to a cutlass::Array,
+    // which cannot be passed to the above functions without copying
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%f", static_cast<float>((*arr)[i]));
+    }
+    this->append_str("]");
+    this->terminate();
+  }
+
+  template <typename T>
+  CUTLASS_DEVICE void append_int_array_from_ptr_to_array(
+      T* arr,
+      int start,
+      int n) {
+    // cutlass TileAccessIterator.get returns a pointer to a cutlass::Array,
+    // which cannot be passed to the above functions without copying
+    this->append_str("[");
+    for (std::size_t i = start; i < start + n; ++i) {
+      if (i != start)
+        this->append_str(", ");
+      this->snprintf("%d", static_cast<int>((*arr)[i]));
+    }
+    this->append_str("]");
+  }
+
+  CUTLASS_DEVICE
+  void append_threadinfo() {
+    this->snprintf("thread(%d/%d", (int)threadIdx.x, (int)blockDim.x);
+    if ((blockDim.y > 1) or (blockDim.z > 1)) {
+      this->snprintf(
+          ", %d/%d, %d/%d", threadIdx.y, blockDim.y, threadIdx.z, blockDim.z);
+    }
+    this->snprintf(") grid=(%d/%d", blockIdx.x, gridDim.x);
+    if ((gridDim.y > 1) or (gridDim.z > 1)) {
+      this->snprintf(
+          ", %d/%d, %d/%d)", blockIdx.y, gridDim.y, blockIdx.z, gridDim.z);
+    }
+    this->append_str(")");
+  }
+
+  template <typename... Args>
+  CUTLASS_DEVICE void append_arg_types(Args... args) {
+    const char* pretty = __PRETTY_FUNCTION__; // special compiler-defined macro
+
+    const char* start = kat::strchr(pretty, '[');
+    const char* end = kat::strrchr(pretty, ']');
+    size_t len = end - start;
+    len = (len > size - pos) ? size - pos : len;
+    this->append_str(start, len);
+  }
+
+  template <typename... Args>
+  CUTLASS_DEVICE void append_types() {
+    const char* pretty = __PRETTY_FUNCTION__; // special compiler-defined macro
+
+    const char* start = kat::strchr(pretty, '[');
+    const char* end = kat::strrchr(pretty, ']');
+    size_t len = end - start;
+    len = (len > size - pos) ? size - pos : len;
+    this->append_str(start, len);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void println() {
+    printf("%s\n", buffer);
+  }
+};
diff --git a/static/include/kernels/kat_printf.h b/static/include/kernels/kat_printf.h
new file mode 100644
index 000000000..8382c037d
--- /dev/null
+++ b/static/include/kernels/kat_printf.h
@@ -0,0 +1,1262 @@
+// Single-Header version of printf.cu from the cuda-kat library
+// implementing printf variants and string manipulation code
+// See
+// https://github.com/eyalroz/cuda-kat/blob/development/src/kat/on_device/c_standard_library/printf.cu
+// copied from revision f771b5d5906d0f49e7500d32c2af91234c1cebad
+
+/**
+ * @author (c) Eyal Rozenberg <eyalroz1@gmx.com>
+ *             2021-2022, Haifa, Palestine/Israel
+ * @author (c) Marco Paland (info@paland.com)
+ *             2014-2019, PALANDesign Hannover, Germany
+ *
+ * @note Others have made smaller contributions to this file: see the
+ * contributors page at https://github.com/eyalroz/printf/graphs/contributors
+ * or ask one of the authors. The original code for exponential specifiers was
+ * contributed by Martijn Jasperse <m.jasperse@gmail.com>.
+ *
+ * @brief Small stand-alone implementation of the printf family of functions
+ * (`(v)printf`, `(v)s(n)printf` etc., geared towards use on embedded systems
+ * with a very limited resources.
+ *
+ * @note the implementations are thread-safe; re-entrant; use no functions from
+ * the standard library; and do not dynamically allocate any memory.
+ *
+ * @license The MIT License (MIT)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma once
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdio> // for CUDA's builtin printf()
+
+namespace kat {
+
+/**
+ * @brief Search for a character within a nul-terminated string.
+ *
+ * @param s The string to search
+ * @param c A character value to search for
+ * @return address of the first character with the value @p c
+ * within string @p s, or nullptr if no character of @p s equals @p c .
+ */
+inline __device__ char* strchr(const char* s, int c) {
+  const char* p = s;
+  do {
+    if (*p == static_cast<char>(c)) {
+      return const_cast<char*>(p);
+    }
+  } while (*(p++) != '\0');
+  return nullptr;
+}
+
+/**
+ * @brief same as @ref std::strchr , except that the search begins
+ * at the end of the string
+ *
+ * @note If @p c is '\0', it _will_ match the nul character
+ * at the end of the string.
+ *
+ */
+inline __device__ char* strrchr(const char* s, int c) {
+  const char* last = nullptr;
+  const char* p = s;
+  do {
+    if (*p == c) {
+      last = p;
+    }
+  } while (*(p++) != '\0');
+  return const_cast<char*>(last);
+}
+
+/**
+ * An implementation of the C standard's snprintf/vsnprintf
+ *
+ * @param s An array in which to store the formatted string. It must be large
+ * enough to fit either the entire formatted output, or at least @p count
+ * characters. Alternatively, it can be `NULL`, in which case nothing will be
+ * printed, and only the number of characters which _could_ have been printed is
+ * tallied and returned.
+ * @param n The maximum number of characters to write to the array, including a
+ * terminating null character.
+ * @param format A string specifying the format of the output, with %-marked
+ * specifiers of how to interpret additional arguments.
+ * @param arg Additional arguments to the function, one for each specifier in @p
+ * format
+ * @return The number of characters that COULD have been written into @p s, not
+ * counting the terminating null character. A value equal or larger than @p
+ * count indicates truncation. Only when the returned value is non-negative and
+ * less than @p count, the null-terminated string has been fully and
+ * successfully printed. If `nullptr` was passed as `s`, the number of
+ * _intended_ characters will be returned without any characters being written
+ * anywhere.
+ */
+__attribute__((device)) int snprintf(
+    char* s,
+    size_t count,
+    const char* format,
+    ...) __attribute__((format(__printf__, (3), (4))));
+__attribute__((device)) int vsnprintf(
+    char* s,
+    size_t count,
+    const char* format,
+    va_list arg) __attribute__((format(__printf__, ((3)), (0))));
+
+/**
+ * An implementation of the C standard's printf/vprintf, via a self-allocated
+ * buffer, backed by CUDA's `printf()`
+ *
+ * @note These functions will allocate some scratch memory to format a string
+ * into, which will then be printed using CUDA's printf. This may be
+ * inconvenient or dangerous, so **use of these function is _not_ recommended.**
+ * Prefer @ref printf_with_scratch or @ref vprintf_with_scratch instead.
+ *
+ * @param format A string specifying the format of the output, with %-marked
+ * specifiers of how to interpret additional arguments.
+ * @param arg Additional arguments to the function, one for each %-specifier in
+ * @p format string
+ * @return The number of characters written to the output not counting the
+ * terminating null character.
+ */
+__attribute__((device)) int printf(const char* format, ...)
+    __attribute__((format(__printf__, (1), (2))));
+
+__attribute__((device)) int vprintf(const char* format, va_list arg)
+    __attribute__((format(__printf__, ((1)), (0))));
+
+/**
+ * An implementation of the C standard's printf/vprintf, backed by CUDA's
+ * `printf()`, with a user-provided sized scratch buffer.
+ *
+ * @note These functions will not allocate anything on the heap.
+ *
+ * @param scratch an array for staging the formatted output before passing it to
+ * CUDA's `printf()` function. The buffer must have at least @p count available
+ * bytes. If `nullptr` is passed for `scratch`, nothing is written, but the
+ * number of characters _to_ be written is returned.
+ * @param count size of the @p scratch buffer
+ * @param format A string specifying the format of the output, with %-marked
+ * specifiers of how to interpret additional arguments.
+ * @param arg additional arguments to the function, one for each %-specifier in
+ * @p format string
+ * @return The number of characters that COULD have been written into @p s, not
+ * counting the terminating null character. A value equal or larger than @p
+ * count indicates truncation. Only when the returned value is non-negative and
+ * less than @p count, the null-terminated string has been fully and
+ * successfully printed. If `nullptr` was passed as `s`, the number of
+ * _intended_ characters will be returned without any characters being written
+ * anywhere.
+ */
+__attribute__((device)) int vnprintf_with_scratch(
+    char* scratch,
+    size_t count,
+    const char* format,
+    va_list arg) __attribute__((format(__printf__, ((3)), (0))));
+__attribute__((device)) int nprintf_with_scratch(
+    char* scratch,
+    size_t count,
+    const char* format,
+    ...) __attribute__((format(__printf__, (3), (4))));
+
+/**
+ * An implementation of the C standard's sprintf/vsprintf
+ *
+ * @note For security considerations (the potential for exceeding the buffer
+ * bounds), please consider using the size-constrained variant, @ref
+ * kat::snprintf / @ref kat::vsnprintf , instead.
+ *
+ * @param s An array in which to store the formatted string. It must be large
+ * enough to fit the formatted output!
+ * @param format A string specifying the format of the output, with %-marked
+ * specifiers of how to interpret additional arguments.
+ * @param arg Additional arguments to the function, one for each specifier in @p
+ * format
+ * @return The number of characters written into @p s, not counting the
+ * terminating null character. If `nullptr` was passed as `s`, the number of
+ * _intended_ characters will be returned without any characters being written
+ * anywhere.
+ */
+__attribute__((device)) int sprintf(char* s, const char* format, ...)
+    __attribute__((format(__printf__, (2), (3))));
+__attribute__((device)) int vsprintf(char* s, const char* format, va_list arg)
+    __attribute__((format(__printf__, ((2)), (0))));
+
+} // namespace kat
+
+// ---------------------------------------------------------------------------------------------------------------------
+namespace kat {
+namespace detail_ {
+namespace printf {
+enum {
+  integer_buffer_size = 32,
+  decimal_buffer_size = 32,
+  default_float_precision = 6,
+  num_decimal_digits_in_int64_t = 18,
+  max_supported_precision = num_decimal_digits_in_int64_t - 1,
+};
+constexpr const double float_notation_threshold = 1e9;
+namespace flags {
+static_assert(sizeof(short) == 2, "Unexpected size of short");
+static_assert(sizeof(int) == 4, "Unexpected size of int");
+static_assert(sizeof(long) == 8, "Unexpected size of long");
+enum : unsigned {
+  zeropad = 1U << 0U,
+  left = 1U << 1U,
+  plus = 1U << 2U,
+  space = 1U << 3U,
+  hash = 1U << 4U,
+  uppercase = 1U << 5U,
+  char_ = 1U << 6U,
+  short_ = 1U << 7U,
+  int_ = 1U << 8U,
+  long_ = 1U << 9U,
+  long_long = 1U << 10U,
+  precision = 1U << 11U,
+  adapt_exp = 1U << 12U,
+  pointer = 1U << 13U,
+  signed_ = 1U << 14U,
+  int8 = char_,
+  int16 = short_,
+  int32 = int_,
+  int64 = long_
+};
+} // namespace flags
+typedef unsigned int flags_t;
+namespace base {
+enum { binary = 2, octal = 8, decimal = 10, hex = 16 };
+}
+typedef uint8_t numeric_base_t;
+typedef unsigned long long unsigned_value_t;
+typedef long long signed_value_t;
+typedef unsigned int printf_size_t;
+enum { max_possible_buffer_size = 0x7fffffff };
+namespace double_ {
+static_assert(
+    FLT_RADIX == 2,
+    "Non-binary-radix floating-point types are unsupported.");
+static_assert(DBL_MANT_DIG == 53, "Unsupported double type configuration");
+typedef uint64_t uint_t;
+enum {
+  size_in_bits = 64,
+  base_exponent = 1023,
+  stored_mantissa_bits = DBL_MANT_DIG - 1,
+};
+enum : unsigned { exponent_mask = 0x7FFU };
+union with_bit_access {
+  uint_t U;
+  double F;
+  static __attribute__((device)) constexpr with_bit_access wrap(double x) {
+    with_bit_access dwba = {.F = x};
+    return dwba;
+  }
+  __attribute__((device)) constexpr __attribute__((device)) int exp2() const {
+    return (int)((U >> stored_mantissa_bits) & exponent_mask) - base_exponent;
+  }
+};
+struct components {
+  int_fast64_t integral;
+  int_fast64_t fractional;
+  bool is_negative;
+};
+} // namespace double_
+__attribute__((device)) static inline constexpr int get_sign_bit(double x) {
+  return (
+      int)(double_::with_bit_access::wrap(x).U >> (double_::size_in_bits - 1));
+}
+__attribute__((device)) static inline int get_exp2(double x) {
+  return double_::with_bit_access::wrap(x).exp2();
+}
+template <typename T>
+__attribute__((device)) constexpr T abs(T x) {
+  return x > 0 ? x : -x;
+}
+template <typename T>
+__attribute__((device)) constexpr unsigned_value_t abs_for_printing(T x) {
+  return x > 0 ? x : -(signed_value_t)x;
+}
+typedef struct {
+  void (*function)(char c, void* extra_arg);
+  void* extra_function_arg;
+  char* buffer;
+  printf_size_t pos;
+  printf_size_t max_chars;
+} output_gadget_t;
+__attribute__((noinline)) __attribute__((device)) static inline void
+putchar_via_gadget(output_gadget_t* gadget, char c) {
+  printf_size_t write_pos = gadget->pos++;
+  if (write_pos >= gadget->max_chars) {
+    return;
+  }
+  if (gadget->function != nullptr) {
+    gadget->function(c, gadget->extra_function_arg);
+  } else {
+    gadget->buffer[write_pos] = c;
+  }
+}
+__attribute__((device)) static inline void append_termination_with_gadget(
+    output_gadget_t* gadget) {
+  if (gadget->function != nullptr || gadget->max_chars == 0) {
+    return;
+  }
+  if (gadget->buffer == nullptr) {
+    return;
+  }
+  printf_size_t null_char_pos =
+      gadget->pos < gadget->max_chars ? gadget->pos : gadget->max_chars - 1;
+  gadget->buffer[null_char_pos] = '\0';
+}
+__attribute__((device)) static inline output_gadget_t discarding_gadget() {
+  output_gadget_t gadget;
+  gadget.function = nullptr;
+  gadget.extra_function_arg = nullptr;
+  gadget.buffer = nullptr;
+  gadget.pos = 0;
+  gadget.max_chars = 0;
+  return gadget;
+}
+__attribute__((device)) static inline output_gadget_t buffer_gadget(
+    char* buffer,
+    size_t buffer_size) {
+  printf_size_t usable_buffer_size = (buffer_size > max_possible_buffer_size)
+      ? max_possible_buffer_size
+      : (printf_size_t)buffer_size;
+  output_gadget_t result = discarding_gadget();
+  if (buffer != nullptr) {
+    result.buffer = buffer;
+    result.max_chars = usable_buffer_size;
+  }
+  return result;
+}
+__attribute__((device)) static inline printf_size_t strnlen_s_(
+    const char* str,
+    printf_size_t maxsize) {
+  const char* s;
+  for (s = str; *s && maxsize--; ++s)
+    ;
+  return (printf_size_t)(s - str);
+}
+__attribute__((device)) static inline constexpr bool is_digit_(char ch) {
+  return (ch >= '0') && (ch <= '9');
+}
+__attribute__((device)) static printf_size_t atou_(const char** str) {
+  printf_size_t i = 0U;
+  while (is_digit_(**str)) {
+    i = i * 10U + (printf_size_t)(*((*str)++) - '0');
+  }
+  return i;
+}
+__attribute__((device)) static void out_rev_(
+    output_gadget_t* output,
+    const char* buf,
+    printf_size_t len,
+    printf_size_t width,
+    flags_t flags) {
+  const printf_size_t start_pos = output->pos;
+  if (!(flags & flags::left) && !(flags & flags::zeropad)) {
+    for (printf_size_t i = len; i < width; i++) {
+      putchar_via_gadget(output, ' ');
+    }
+  }
+  while (len) {
+    putchar_via_gadget(output, buf[--len]);
+  }
+  if (flags & flags::left) {
+    while (output->pos - start_pos < width) {
+      putchar_via_gadget(output, ' ');
+    }
+  }
+}
+__attribute__((device)) static void print_integer_finalization(
+    output_gadget_t* __restrict__ output,
+    char* __restrict__ buf,
+    printf_size_t len,
+    bool negative,
+    numeric_base_t base,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags) {
+  printf_size_t unpadded_len = len;
+  {
+    if (!(flags & flags::left)) {
+      if (width && (flags & flags::zeropad) &&
+          (negative || (flags & (flags::plus | flags::space)))) {
+        width--;
+      }
+      while ((flags & flags::zeropad) && (len < width) &&
+             (len < detail_::printf::integer_buffer_size)) {
+        buf[len++] = '0';
+      }
+    }
+    while ((len < precision) && (len < detail_::printf::integer_buffer_size)) {
+      buf[len++] = '0';
+    }
+    if (base == base::octal && (len > unpadded_len)) {
+      flags &= ~flags::hash;
+    }
+  }
+  if (flags & (flags::hash | flags::pointer)) {
+    if (!(flags & flags::precision) && len &&
+        ((len == precision) || (len == width))) {
+      if (unpadded_len < len) {
+        len--;
+      }
+      if (len && (base == base::hex || base == base::binary) &&
+          (unpadded_len < len)) {
+        len--;
+      }
+    }
+    if ((base == base::hex) && !(flags & flags::uppercase) &&
+        (len < detail_::printf::integer_buffer_size)) {
+      buf[len++] = 'x';
+    } else if (
+        (base == base::hex) && (flags & flags::uppercase) &&
+        (len < detail_::printf::integer_buffer_size)) {
+      buf[len++] = 'X';
+    } else if (
+        (base == base::binary) &&
+        (len < detail_::printf::integer_buffer_size)) {
+      buf[len++] = 'b';
+    }
+    if (len < detail_::printf::integer_buffer_size) {
+      buf[len++] = '0';
+    }
+  }
+  if (len < detail_::printf::integer_buffer_size) {
+    if (negative) {
+      buf[len++] = '-';
+    } else if (flags & flags::plus) {
+      buf[len++] = '+';
+    } else if (flags & flags::space) {
+      buf[len++] = ' ';
+    }
+  }
+  out_rev_(output, buf, len, width, flags);
+}
+__attribute__((device)) static void print_integer(
+    output_gadget_t* output,
+    unsigned_value_t value,
+    bool negative,
+    numeric_base_t base,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags) {
+  char buf[detail_::printf::integer_buffer_size];
+  printf_size_t len = 0U;
+  if (!value) {
+    if (!(flags & flags::precision)) {
+      buf[len++] = '0';
+      flags &= ~flags::hash;
+    } else if (base == base::hex) {
+      flags &= ~flags::hash;
+    }
+  } else {
+    do {
+      const char digit = (char)(value % base);
+      buf[len++] =
+          (char)(digit < 10 ? '0' + digit : (flags & flags::uppercase ? 'A' : 'a') + digit - 10);
+      value /= base;
+    } while (value && (len < detail_::printf::integer_buffer_size));
+  }
+  print_integer_finalization(
+      output, buf, len, negative, base, precision, width, flags);
+}
+__attribute__((device)) double power_of_10(int e) {
+  switch (e) {
+    case 0:
+      return 1e00;
+    case 1:
+      return 1e01;
+    case 2:
+      return 1e02;
+    case 3:
+      return 1e03;
+    case 4:
+      return 1e04;
+    case 5:
+      return 1e05;
+    case 6:
+      return 1e06;
+    case 7:
+      return 1e07;
+    case 8:
+      return 1e08;
+    case 9:
+      return 1e09;
+    case 10:
+      return 1e10;
+    case 11:
+      return 1e11;
+    case 12:
+      return 1e12;
+    case 13:
+      return 1e13;
+    case 14:
+      return 1e14;
+    case 15:
+      return 1e15;
+    case 16:
+      return 1e16;
+    case 17:
+      return 1e17;
+  }
+  return 1;
+}
+__attribute__((device)) static double_::components get_components(
+    double number,
+    printf_size_t precision) {
+  double_::components number_;
+  number_.is_negative = get_sign_bit(number);
+  double abs_number = (number_.is_negative) ? -number : number;
+  number_.integral = (int_fast64_t)abs_number;
+  double remainder =
+      (abs_number - (double)number_.integral) * power_of_10((int)precision);
+  number_.fractional = (int_fast64_t)remainder;
+  remainder -= (double)number_.fractional;
+  if (remainder > 0.5) {
+    ++number_.fractional;
+    if ((double)number_.fractional >= power_of_10((int)precision)) {
+      number_.fractional = 0;
+      ++number_.integral;
+    }
+  } else if (
+      (remainder == 0.5) &&
+      ((number_.fractional == 0U) || (number_.fractional & 1U))) {
+    ++number_.fractional;
+  }
+  if (precision == 0U) {
+    remainder = abs_number - (double)number_.integral;
+    if ((!(remainder < 0.5) || (remainder > 0.5)) && (number_.integral & 1)) {
+      ++number_.integral;
+    }
+  }
+  return number_;
+}
+struct scaling_factor {
+  double raw_factor;
+  bool multiply;
+};
+__attribute__((device)) static double apply_scaling(
+    double num,
+    scaling_factor normalization) {
+  return normalization.multiply ? num * normalization.raw_factor
+                                : num / normalization.raw_factor;
+}
+__attribute__((device)) static double unapply_scaling(
+    double normalized,
+    scaling_factor normalization) {
+  return normalization.multiply ? normalized / normalization.raw_factor
+                                : normalized * normalization.raw_factor;
+}
+__attribute__((device)) static scaling_factor update_normalization(
+    scaling_factor sf,
+    double extra_multiplicative_factor) {
+  scaling_factor result;
+  int factor_exp2 = get_exp2(sf.raw_factor);
+  int extra_factor_exp2 = get_exp2(extra_multiplicative_factor);
+  if (abs(factor_exp2) > abs(extra_factor_exp2)) {
+    result.multiply = false;
+    result.raw_factor = sf.raw_factor / extra_multiplicative_factor;
+  } else {
+    result.multiply = true;
+    result.raw_factor = extra_multiplicative_factor / sf.raw_factor;
+  }
+  return result;
+}
+__attribute__((device)) static double_::components get_normalized_components(
+    bool negative,
+    printf_size_t precision,
+    double non_normalized,
+    scaling_factor normalization,
+    int floored_exp10) {
+  double_::components components;
+  components.is_negative = negative;
+  double scaled = apply_scaling(non_normalized, normalization);
+  bool close_to_representation_extremum =
+      ((-floored_exp10 + (int)precision) >= DBL_MAX_10_EXP - 1);
+  if (close_to_representation_extremum) {
+    return get_components(negative ? -scaled : scaled, precision);
+  }
+  components.integral = (int_fast64_t)scaled;
+  double remainder = non_normalized -
+      unapply_scaling((double)components.integral, normalization);
+  double prec_power_of_10 = power_of_10((int)precision);
+  scaling_factor account_for_precision =
+      update_normalization(normalization, prec_power_of_10);
+  double scaled_remainder = apply_scaling(remainder, account_for_precision);
+  double rounding_threshold = 0.5;
+  components.fractional = (int_fast64_t)scaled_remainder;
+  scaled_remainder -= (double)components.fractional;
+  components.fractional += (scaled_remainder >= rounding_threshold);
+  if (scaled_remainder == rounding_threshold) {
+    components.fractional &= ~((int_fast64_t)0x1);
+  }
+  if ((double)components.fractional >= prec_power_of_10) {
+    components.fractional = 0;
+    ++components.integral;
+  }
+  return components;
+}
+__attribute__((device)) static void print_broken_up_decimal(
+    double_::components number_,
+    output_gadget_t* output,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags,
+    char* buf,
+    printf_size_t len) {
+  if (precision != 0U) {
+    printf_size_t count = precision;
+    if ((flags & flags::adapt_exp) && !(flags & flags::hash) &&
+        (number_.fractional > 0)) {
+      while (true) {
+        int_fast64_t digit = number_.fractional % 10U;
+        if (digit != 0) {
+          break;
+        }
+        --count;
+        number_.fractional /= 10U;
+      }
+    }
+    if (number_.fractional > 0 || !(flags & flags::adapt_exp) ||
+        (flags & flags::hash)) {
+      while (len < decimal_buffer_size) {
+        --count;
+        buf[len++] = (char)('0' + number_.fractional % 10U);
+        if (!(number_.fractional /= 10U)) {
+          break;
+        }
+      }
+      while ((len < decimal_buffer_size) && (count > 0U)) {
+        buf[len++] = '0';
+        --count;
+      }
+      if (len < decimal_buffer_size) {
+        buf[len++] = '.';
+      }
+    }
+  } else {
+    if ((flags & flags::hash) && (len < decimal_buffer_size)) {
+      buf[len++] = '.';
+    }
+  }
+  while (len < decimal_buffer_size) {
+    buf[len++] = (char)('0' + (number_.integral % 10));
+    if (!(number_.integral /= 10)) {
+      break;
+    }
+  }
+  if (!(flags & flags::left) && (flags & flags::zeropad)) {
+    if (width &&
+        (number_.is_negative || (flags & (flags::plus | flags::space)))) {
+      width--;
+    }
+    while ((len < width) && (len < decimal_buffer_size)) {
+      buf[len++] = '0';
+    }
+  }
+  if (len < decimal_buffer_size) {
+    if (number_.is_negative) {
+      buf[len++] = '-';
+    } else if (flags & flags::plus) {
+      buf[len++] = '+';
+    } else if (flags & flags::space) {
+      buf[len++] = ' ';
+    }
+  }
+  out_rev_(output, buf, len, width, flags);
+}
+__attribute__((device)) static void print_decimal_number(
+    output_gadget_t* __restrict__ output,
+    double number,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags,
+    char* __restrict__ buf,
+    printf_size_t len) {
+  double_::components value_ = get_components(number, precision);
+  print_broken_up_decimal(value_, output, precision, width, flags, buf, len);
+}
+__attribute__((device)) static void print_exponential_number(
+    output_gadget_t* __restrict__ output,
+    double number,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags,
+    char* __restrict__ buf,
+    printf_size_t len) {
+  const bool negative = get_sign_bit(number);
+  double abs_number = negative ? -number : number;
+  int floored_exp10;
+  bool abs_exp10_covered_by_powers_table;
+  scaling_factor normalization;
+  if (abs_number == 0.0) {
+    floored_exp10 = 0;
+  } else {
+    double exp10 = log10(abs_number);
+    floored_exp10 = floor(exp10);
+    double p10 = pow(10, floored_exp10);
+    normalization.raw_factor = p10;
+    abs_exp10_covered_by_powers_table = false;
+  }
+  bool fall_back_to_decimal_only_mode = false;
+  if (flags & flags::adapt_exp) {
+    int required_significant_digits = (precision == 0) ? 1 : (int)precision;
+    fall_back_to_decimal_only_mode =
+        (floored_exp10 >= -4 && floored_exp10 < required_significant_digits);
+    int precision_ = fall_back_to_decimal_only_mode
+        ? (int)precision - 1 - floored_exp10
+        : (int)precision - 1;
+    precision = (precision_ > 0 ? (unsigned)precision_ : 0U);
+    flags |= flags::precision;
+  }
+  normalization.multiply =
+      (floored_exp10 < 0 && abs_exp10_covered_by_powers_table);
+  bool should_skip_normalization =
+      (fall_back_to_decimal_only_mode || floored_exp10 == 0);
+  double_::components decimal_part_components = should_skip_normalization
+      ? get_components(negative ? -abs_number : abs_number, precision)
+      : get_normalized_components(
+            negative, precision, abs_number, normalization, floored_exp10);
+  if (fall_back_to_decimal_only_mode) {
+    if ((flags & flags::adapt_exp) && floored_exp10 >= -1 &&
+        decimal_part_components.integral == power_of_10(floored_exp10 + 1)) {
+      floored_exp10++;
+      precision--;
+    }
+  } else {
+    if (decimal_part_components.integral >= 10) {
+      floored_exp10++;
+      decimal_part_components.integral = 1;
+      decimal_part_components.fractional = 0;
+    }
+  }
+  printf_size_t exp10_part_width = fall_back_to_decimal_only_mode ? 0U
+      : (abs(floored_exp10) < 100)                                ? 4U
+                                                                  : 5U;
+  printf_size_t decimal_part_width = ((flags & flags::left) && exp10_part_width)
+      ? 0U
+      : ((width > exp10_part_width) ? width - exp10_part_width : 0U);
+  const printf_size_t printed_exponential_start_pos = output->pos;
+  print_broken_up_decimal(
+      decimal_part_components,
+      output,
+      precision,
+      decimal_part_width,
+      flags,
+      buf,
+      len);
+  if (!fall_back_to_decimal_only_mode) {
+    putchar_via_gadget(output, (flags & flags::uppercase) ? 'E' : 'e');
+    print_integer(
+        output,
+        abs_for_printing(floored_exp10),
+        floored_exp10 < 0,
+        10,
+        0,
+        exp10_part_width - 1,
+        flags::zeropad | flags::plus);
+    if (flags & flags::left) {
+      while (output->pos - printed_exponential_start_pos < width) {
+        putchar_via_gadget(output, ' ');
+      }
+    }
+  }
+}
+__attribute__((device)) static void print_floating_point(
+    output_gadget_t* output,
+    double value,
+    printf_size_t precision,
+    printf_size_t width,
+    flags_t flags,
+    bool prefer_exponential) {
+  char buf[decimal_buffer_size];
+  printf_size_t len = 0U;
+  if (value != value) {
+    out_rev_(output, "nan", 3, width, flags);
+    return;
+  }
+  if (value < -DBL_MAX) {
+    out_rev_(output, "fni-", 4, width, flags);
+    return;
+  }
+  if (value > DBL_MAX) {
+    out_rev_(
+        output,
+        (flags & flags::plus) ? "fni+" : "fni",
+        (flags & flags::plus) ? 4U : 3U,
+        width,
+        flags);
+    return;
+  }
+  if (!prefer_exponential &&
+      ((value > float_notation_threshold) ||
+       (value < -float_notation_threshold))) {
+    print_exponential_number(output, value, precision, width, flags, buf, len);
+    return;
+  }
+  if (!(flags & flags::precision)) {
+    precision = default_float_precision;
+  }
+  while ((len < decimal_buffer_size) && (precision > max_supported_precision)) {
+    buf[len++] = '0';
+    precision--;
+  }
+  if (prefer_exponential)
+    print_exponential_number(output, value, precision, width, flags, buf, len);
+  else
+    print_decimal_number(output, value, precision, width, flags, buf, len);
+}
+__attribute__((device)) static flags_t parse_flags(const char** format) {
+  flags_t flags = 0U;
+  do {
+    switch (**format) {
+      case '0':
+        flags |= flags::zeropad;
+        (*format)++;
+        break;
+      case '-':
+        flags |= flags::left;
+        (*format)++;
+        break;
+      case '+':
+        flags |= flags::plus;
+        (*format)++;
+        break;
+      case ' ':
+        flags |= flags::space;
+        (*format)++;
+        break;
+      case '#':
+        flags |= flags::hash;
+        (*format)++;
+        break;
+      default:
+        return flags;
+    }
+  } while (true);
+}
+__attribute__((device)) static int vsnprintf(
+    output_gadget_t* output,
+    const char* format,
+    va_list args) {
+  while (*format) {
+    if (*format != '%') {
+      putchar_via_gadget(output, *format);
+      format++;
+      continue;
+    } else {
+      format++;
+    }
+    flags_t flags = parse_flags(&format);
+    printf_size_t width = 0U;
+    if (is_digit_(*format)) {
+      width = (printf_size_t)atou_(&format);
+    } else if (*format == '*') {
+      const int w = __builtin_va_arg(args, int);
+      if (w < 0) {
+        flags |= flags::left;
+        width = (printf_size_t)-w;
+      } else {
+        width = (printf_size_t)w;
+      }
+      format++;
+    }
+    printf_size_t precision = 0U;
+    if (*format == '.') {
+      flags |= flags::precision;
+      format++;
+      if (is_digit_(*format)) {
+        precision = (printf_size_t)atou_(&format);
+      } else if (*format == '*') {
+        const int precision_ = __builtin_va_arg(args, int);
+        precision = precision_ > 0 ? (printf_size_t)precision_ : 0U;
+        format++;
+      }
+    }
+    switch (*format) {
+      case 'I': {
+        format++;
+        switch (*format) {
+          case '8':
+            flags |= flags::int8;
+            format++;
+            break;
+          case '1':
+            format++;
+            if (*format == '6') {
+              format++;
+              flags |= flags::int16;
+            }
+            break;
+          case '3':
+            format++;
+            if (*format == '2') {
+              format++;
+              flags |= flags::int32;
+            }
+            break;
+          case '6':
+            format++;
+            if (*format == '4') {
+              format++;
+              flags |= flags::int64;
+            }
+            break;
+          default:
+            break;
+        }
+        break;
+      }
+      case 'l':
+        flags |= flags::long_;
+        format++;
+        if (*format == 'l') {
+          flags |= flags::long_long;
+          format++;
+        }
+        break;
+      case 'h':
+        flags |= flags::short_;
+        format++;
+        if (*format == 'h') {
+          flags |= flags::char_;
+          format++;
+        }
+        break;
+      case 't':
+      case 'j':
+      case 'z':
+        static_assert(
+            sizeof(ptrdiff_t) == sizeof(long), "Unexpected sizeof(ptrdiff_t)");
+        static_assert(
+            sizeof(intmax_t) == sizeof(long), "Unexpected sizeof(intmax_t)");
+        static_assert(
+            sizeof(size_t) == sizeof(long), "Unexpected sizeof(size_t)");
+        flags |= flags::long_;
+        format++;
+        break;
+      default:
+        break;
+    }
+    switch (*format) {
+      case 'd':
+      case 'i':
+      case 'u':
+      case 'x':
+      case 'X':
+      case 'o':
+      case 'b': {
+        if (*format == 'd' || *format == 'i') {
+          flags |= flags::signed_;
+        }
+        numeric_base_t base;
+        if (*format == 'x' || *format == 'X') {
+          base = base::hex;
+        } else if (*format == 'o') {
+          base = base::octal;
+        } else if (*format == 'b') {
+          base = base::binary;
+        } else {
+          base = base::decimal;
+          flags &= ~flags::hash;
+        }
+        if (*format == 'X') {
+          flags |= flags::uppercase;
+        }
+        format++;
+        if (flags & flags::precision) {
+          flags &= ~flags::zeropad;
+        }
+        if (flags & flags::signed_) {
+          if (flags & flags::long_long) {
+            const long long value = __builtin_va_arg(args, long long);
+            print_integer(
+                output,
+                abs_for_printing(value),
+                value < 0,
+                base,
+                precision,
+                width,
+                flags);
+          } else if (flags & flags::long_) {
+            const long value = __builtin_va_arg(args, long);
+            print_integer(
+                output,
+                abs_for_printing(value),
+                value < 0,
+                base,
+                precision,
+                width,
+                flags);
+          } else {
+            const int value = (flags & flags::char_)
+                ? (signed char)__builtin_va_arg(args, int)
+                : (flags & flags::short_)
+                ? (short int)__builtin_va_arg(args, int)
+                : __builtin_va_arg(args, int);
+            print_integer(
+                output,
+                abs_for_printing(value),
+                value < 0,
+                base,
+                precision,
+                width,
+                flags);
+          }
+        } else {
+          flags &= ~(flags::plus | flags::space);
+          if (flags & flags::long_long) {
+            print_integer(
+                output,
+                (unsigned_value_t) __builtin_va_arg(args, unsigned long long),
+                false,
+                base,
+                precision,
+                width,
+                flags);
+          } else if (flags & flags::long_) {
+            print_integer(
+                output,
+                (unsigned_value_t) __builtin_va_arg(args, unsigned long),
+                false,
+                base,
+                precision,
+                width,
+                flags);
+          } else {
+            const unsigned int value = (flags & flags::char_)
+                ? (unsigned char)__builtin_va_arg(args, unsigned int)
+                : (flags & flags::short_)
+                ? (unsigned short int)__builtin_va_arg(args, unsigned int)
+                : __builtin_va_arg(args, unsigned int);
+            print_integer(
+                output,
+                (unsigned_value_t)value,
+                false,
+                base,
+                precision,
+                width,
+                flags);
+          }
+        }
+        break;
+      }
+        enum : bool { prefer_decimal = false, prefer_exponential = true };
+      case 'f':
+      case 'F':
+        if (*format == 'F')
+          flags |= flags::uppercase;
+        print_floating_point(
+            output,
+            __builtin_va_arg(args, double),
+            precision,
+            width,
+            flags,
+            prefer_decimal);
+        format++;
+        break;
+      case 'e':
+      case 'E':
+      case 'g':
+      case 'G':
+        if ((*format == 'g') || (*format == 'G'))
+          flags |= flags::adapt_exp;
+        if ((*format == 'E') || (*format == 'G'))
+          flags |= flags::uppercase;
+        print_floating_point(
+            output,
+            __builtin_va_arg(args, double),
+            precision,
+            width,
+            flags,
+            prefer_exponential);
+        format++;
+        break;
+      case 'c': {
+        printf_size_t l = 1U;
+        if (!(flags & flags::left)) {
+          while (l++ < width) {
+            putchar_via_gadget(output, ' ');
+          }
+        }
+        putchar_via_gadget(output, (char)__builtin_va_arg(args, int));
+        if (flags & flags::left) {
+          while (l++ < width) {
+            putchar_via_gadget(output, ' ');
+          }
+        }
+        format++;
+        break;
+      }
+      case 's': {
+        const char* p = __builtin_va_arg(args, char*);
+        if (p == nullptr) {
+          out_rev_(output, ")llun(", 6, width, flags);
+        } else {
+          printf_size_t l =
+              strnlen_s_(p, precision ? precision : max_possible_buffer_size);
+          if (flags & flags::precision) {
+            l = (l < precision ? l : precision);
+          }
+          if (!(flags & flags::left)) {
+            while (l++ < width) {
+              putchar_via_gadget(output, ' ');
+            }
+          }
+          while ((*p != 0) && (!(flags & flags::precision) || precision)) {
+            putchar_via_gadget(output, *(p++));
+            --precision;
+          }
+          if (flags & flags::left) {
+            while (l++ < width) {
+              putchar_via_gadget(output, ' ');
+            }
+          }
+        }
+        format++;
+        break;
+      }
+      case 'p': {
+        width = sizeof(void*) * 2U + 2;
+        flags |= flags::zeropad | flags::pointer;
+        uintptr_t value = (uintptr_t) __builtin_va_arg(args, void*);
+        (value == (uintptr_t) nullptr)
+            ? out_rev_(output, ")lin(", 5, width, flags)
+            : print_integer(
+                  output,
+                  (unsigned_value_t)value,
+                  false,
+                  base::hex,
+                  precision,
+                  width,
+                  flags);
+        format++;
+        break;
+      }
+      case '%':
+        putchar_via_gadget(output, '%');
+        format++;
+        break;
+      case 'n': {
+        if (flags & flags::char_)
+          *(__builtin_va_arg(args, char*)) = (char)output->pos;
+        else if (flags & flags::short_)
+          *(__builtin_va_arg(args, short*)) = (short)output->pos;
+        else if (flags & flags::long_)
+          *(__builtin_va_arg(args, long*)) = (long)output->pos;
+        else if (flags & flags::long_long)
+          *(__builtin_va_arg(args, long long*)) = (long long int)output->pos;
+        else
+          *(__builtin_va_arg(args, int*)) = (int)output->pos;
+        format++;
+        break;
+      }
+      default:
+        putchar_via_gadget(output, *format);
+        format++;
+        break;
+    }
+  }
+  append_termination_with_gadget(output);
+  return (int)output->pos;
+}
+} // namespace printf
+} // namespace detail_
+__attribute__((device)) int vprintf(const char* format, va_list arg) {
+  detail_::printf::output_gadget_t gadget =
+      detail_::printf::discarding_gadget();
+  int ret = vsnprintf(&gadget, format, arg);
+  if (ret < 0) {
+    return ret;
+  }
+  size_t count = ret + 1;
+  char* scratch = (char*)malloc(count);
+  if (scratch == nullptr) {
+    return -1;
+  }
+  ret = vsnprintf(scratch, count, format, arg);
+  if (ret < 0) {
+    free(scratch);
+    return ret;
+  }
+  ret = printf("%s", scratch);
+}
+__attribute__((device)) int vsnprintf(
+    char* s,
+    size_t n,
+    const char* format,
+    va_list arg) {
+  detail_::printf::output_gadget_t gadget =
+      detail_::printf::buffer_gadget(s, n);
+  return detail_::printf::vsnprintf(&gadget, format, arg);
+}
+__attribute__((device)) int vsprintf(char* s, const char* format, va_list arg) {
+  return vsnprintf(s, detail_::printf::max_possible_buffer_size, format, arg);
+}
+__attribute__((device)) inline int vnprintf_with_scratch(
+    char* scratch,
+    size_t count,
+    const char* format,
+    va_list arg) {
+  const int ret = vsnprintf(scratch, count, format, arg);
+  if (scratch == nullptr) {
+    return ret;
+  }
+  if (ret > 0) {
+    return printf("%s", scratch);
+  }
+};
+__attribute__((device)) int printf(const char* format, ...) {
+  va_list args;
+  __builtin_va_start(args, format);
+  const int ret = vprintf(format, args);
+  __builtin_va_end(args);
+  return ret;
+}
+__attribute__((device)) int sprintf(char* s, const char* format, ...) {
+  va_list args;
+  __builtin_va_start(args, format);
+  const int ret = vsprintf(s, format, args);
+  __builtin_va_end(args);
+  return ret;
+}
+__attribute__((device)) int snprintf(
+    char* s,
+    size_t n,
+    const char* format,
+    ...) {
+  va_list args;
+  __builtin_va_start(args, format);
+  const int ret = vsnprintf(s, n, format, args);
+  __builtin_va_end(args);
+  return ret;
+}
+__attribute__((device)) int nprintf_with_scratch(
+    char* scratch_buffer,
+    size_t count,
+    const char* format,
+    ...) {
+  va_list args;
+  __builtin_va_start(args, format);
+  return vnprintf_with_scratch(scratch_buffer, count, format, args);
+}
+} // namespace kat

From d56544bd43eae57770672dcddc8664c3a396fb27 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 May 2023 17:25:51 -0700
Subject: [PATCH 522/638] refactor sm detection; add quadro card name (#690)

Summary:
att

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/690

Reviewed By: alexanderguzhva

Differential Revision: D45901015

Pulled By: chenyang78

fbshipit-source-id: 0c8c8fe5617d459bcf9a774e96d865f621b2bf37
---
 python/aitemplate/testing/detect_target.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index f0731eea9..465dcb4c2 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -39,14 +39,15 @@ def _detect_cuda_with_nvidia_smi():
         )
         stdout, stderr = proc.communicate()
         stdout = stdout.decode("utf-8")
-        if "H100" in stdout:
-            return "90"
-        if any(a in stdout for a in ["A100", "A10G", "RTX 30", "A30", "RTX 40"]):
-            return "80"
-        if "V100" in stdout:
-            return "70"
-        if "T4" in stdout:
-            return "75"
+        sm_names = {
+            "70": ["V100"],
+            "75": ["T4", "Quadro T2000"],
+            "80": ["A100", "A10G", "RTX 30", "A30", "RTX 40"],
+            "90": ["H100"],
+        }
+        for sm, names in sm_names.items():
+            if any(name in stdout for name in names):
+                return sm
         return None
     except Exception:
         return None

From f738b9b22bf00a499c256b64ea6661900d6e3b62 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 May 2023 17:36:18 -0700
Subject: [PATCH 523/638] add nix-shell config (#691)

Summary:
```
$> nix-shell
```
downloads and caches the dependencies on the first run, launches a shell with set up environment on any run. Within the shell, I could run a unit test for aitemplate
```
PYTHONPATH=python:$PYTHONPATH python tests/unittest/ops/test_activation.py -k softplus
```
some deps might still be missing to use all the components though

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/691

Reviewed By: wushirong

Differential Revision: D45901068

Pulled By: chenyang78

fbshipit-source-id: 2a118f25679b69e65a2942b1164ed4886b707e9c
---
 default.nix | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 default.nix

diff --git a/default.nix b/default.nix
new file mode 100644
index 000000000..e4c49b033
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,48 @@
+{ pkgs ? import <nixpkgs> {
+  config = {
+    allowUnfree = true;
+    cudaSupport = true;
+  };
+}}:
+
+let 
+  ait-deps = ps: with ps; [
+    pytorch-bin
+    pip
+    wheel
+    unidecode
+    inflect
+    librosa
+    jinja2
+    sympy
+    einops
+    parameterized
+    # (
+    #   buildPythonPackage rec {
+    #     pname = "cuda_python";
+    #     version = "12.1.0";
+    #     format = "wheel";
+    #     src = fetchPypi {
+    #       inherit pname version format;
+    #       sha256 = "94506d730baade1744767e2c05d5ddd84d7fbe4c9b6f694a54a3f376f7ffa525";
+    #       abi = "cp39";
+    #       python = "cp39";
+    #       platform = "manylinux_2_17_x86_64.manylinux2014_x86_64";
+    #     };
+    #     doCheck = false;
+    #   }
+    # )
+  ];  
+in
+pkgs.mkShell {
+  buildInputs = [
+    pkgs.cmake
+    pkgs.cudatoolkit
+    (pkgs.python39.withPackages ait-deps)
+  ];
+
+  shellHook = ''
+    export CUDA_PATH=${pkgs.cudatoolkit}
+    echo "You are now using a NIX environment"
+  '';
+}

From b9d77bd6b75bbcb4d9e9a183a0135a45c54bb7ef Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Thu, 18 May 2023 00:15:13 -0700
Subject: [PATCH 524/638] Stable Diffusion dynamic input shape, include/exclude
 constants, load from diffusers/compvis, alternative pipeline (#696)

Summary:
* min/max height/width
* include/exclude constants from module
* load from diffusers model to compiled aitemplate module
* load from compvis model to compiled aitemplate module
* pipeline doesn't rely on StableDiffusionPipeline
* set shape of output tensor according to height/width

```
~/AITemplate/examples/05_stable_diffusion$ python scripts/compile_alt.py --min-width 64 --max-width 1536 --min-height 64 --max-height 1536 --clip-chunks 6
```

```
~/AITemplate/examples/05_stable_diffusion$ python scripts/demo_alt.py
INFO:aitemplate.backend.build_cache_base:Build cache disabled
2023-05-15 18:55:09,465 INFO <aitemplate.testing.detect_target> Set target to CUDA
[18:55:09] model_container.cu:67: Device Runtime Version: 11060; Driver Version: 12010
[18:55:09] model_container.cu:81: Hardware accelerator device properties:
  Device:
     ASCII string identifying device: NVIDIA GeForce RTX 3060

[18:55:09] model_container.cu:85: Init AITemplate Runtime with 1 concurrency
Loading PyTorch CLIP
Setting constants
Folding constants
[18:55:19] model_container.cu:67: Device Runtime Version: 11060; Driver Version: 12010
[18:55:19] model_container.cu:81: Hardware accelerator device properties:
  Device:
     ASCII string identifying device: NVIDIA GeForce RTX 3060

[18:55:19] model_container.cu:85: Init AITemplate Runtime with 1 concurrency
Loading PyTorch UNet
Setting constants
Folding constants
[18:55:24] model_container.cu:67: Device Runtime Version: 11060; Driver Version: 12010
[18:55:24] model_container.cu:81: Hardware accelerator device properties:
  Device:
     ASCII string identifying device: NVIDIA GeForce RTX 3060

[18:55:24] model_container.cu:85: Init AITemplate Runtime with 1 concurrency
Loading PyTorch VAE
Mapping parameters...
Setting constants
Folding constants
100%|| 50/50 [00:03<00:00, 12.94it/s]
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/696

Reviewed By: terrychenism

Differential Revision: D45964831

Pulled By: chenyang78

fbshipit-source-id: c126db27afb425b156e15373580a20cfbb06290a
---
 examples/05_stable_diffusion/README.md        |   24 +
 .../scripts/compile_alt.py                    |  135 +++
 .../05_stable_diffusion/scripts/demo_alt.py   |   64 +
 .../src/compile_lib/compile_clip_alt.py       |   90 ++
 .../src/compile_lib/compile_unet_alt.py       |  111 ++
 .../src/compile_lib/compile_vae_alt.py        |  159 +++
 .../src/pipeline_stable_diffusion_ait_alt.py  | 1028 +++++++++++++++++
 7 files changed, 1611 insertions(+)
 create mode 100644 examples/05_stable_diffusion/scripts/compile_alt.py
 create mode 100644 examples/05_stable_diffusion/scripts/demo_alt.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
 create mode 100644 examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py

diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index 18700540a..624cfcd47 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -38,6 +38,30 @@ python3 scripts/compile.py
 ```
 It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
 
+#### Alternative build script
+
+```
+python3 scripts/compile_alt.py --width 64 1536 --height 64 1536 --batch-size 1 4 --clip-chunks 6
+```
+This compiles modules with dynamic shape. In the example, modules will work with width in range 64-1536px, batch sizes 1-4. Clip chunks refers to the number of tokens accepted by UNet in multiples of 77, 1 chunk = 77 tokens, 3 chunks = 231 tokens.
+By default, `compile_alt.py` does not include model weights (constants) with the compiled module, to include the model weights in the compiled module use `--include-consants True`.
+
+#### Alternative pipeline
+
+The original pipeline requires a diffusers model local dir, and relies directly on `StableDiffusionPipeline`. This pipeline builds similar functionality without directly using `StableDiffusionPipeline`, and is capable of loading model weights from either diffusers or compvis models to compiled aitemplate modules.
+
+* AITemplate modules are created
+* Model weights are loaded, converted/mapped, then applied to AITemplate module
+* Scheduler and tokenizer are created from `runwayml/stable-diffusion-v1-5` and `openai/clip-vit-large-patch14` respectively
+
+```
+python3 scripts/demo.py --hf-hub-or-path runwayml/stable-diffusion-v1-5
+or
+python3 scripts/demo.py --ckpt v1-5-pruned-emaonly.ckpt
+```
+
+`--ckpt` takes preference over `--hf-hub-or-path` if both are specified
+
 #### Multi-GPU profiling
 AIT needs to do profiling to select the best algorithms for CUTLASS and CK.
 To enable multiple GPUs for profiling, use the environment variable `CUDA_VISIBLE_DEVICES` on NVIDIA platform and `HIP_VISIBLE_DEVICES` on AMD platform.
diff --git a/examples/05_stable_diffusion/scripts/compile_alt.py b/examples/05_stable_diffusion/scripts/compile_alt.py
new file mode 100644
index 000000000..62ac268bc
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/compile_alt.py
@@ -0,0 +1,135 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+
+import click
+import torch
+from aitemplate.testing import detect_target
+from aitemplate.utils.import_path import import_parent
+from diffusers import StableDiffusionPipeline
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.compile_lib.compile_clip_alt import compile_clip
+from src.compile_lib.compile_unet_alt import compile_unet
+from src.compile_lib.compile_vae_alt import compile_vae
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
+    help="the local diffusers pipeline directory",
+)
+@click.option(
+    "--width",
+    default=(64, 2048),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum width",
+)
+@click.option(
+    "--height",
+    default=(64, 2048),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum height",
+)
+@click.option(
+    "--batch-size",
+    default=(1, 4),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum batch size",
+)
+@click.option("--clip-chunks", default=6, help="Maximum number of clip chunks")
+@click.option(
+    "--include-constants",
+    default=None,
+    help="include constants (model weights) with compiled model",
+)
+@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
+@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
+def compile_diffusers(
+    local_dir,
+    width,
+    height,
+    batch_size,
+    clip_chunks,
+    include_constants,
+    use_fp16_acc=True,
+    convert_conv_to_gemm=True,
+):
+    logging.getLogger().setLevel(logging.INFO)
+    torch.manual_seed(4896)
+
+    if detect_target().name() == "rocm":
+        convert_conv_to_gemm = False
+
+    assert (
+        width[0] % 64 == 0 and width[1] % 64 == 0
+    ), "Minimum Width and Maximum Width must be multiples of 64, otherwise, the compilation process will fail."
+    assert (
+        height[0] % 64 == 0 and height[1] % 64 == 0
+    ), "Minimum Height and Maximum Height must be multiples of 64, otherwise, the compilation process will fail."
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        local_dir,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    ).to("cuda")
+
+    # CLIP
+    compile_clip(
+        pipe.text_encoder,
+        batch_size=batch_size,
+        seqlen=77,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        depth=pipe.text_encoder.config.num_hidden_layers,
+        num_heads=pipe.text_encoder.config.num_attention_heads,
+        dim=pipe.text_encoder.config.hidden_size,
+        act_layer=pipe.text_encoder.config.hidden_act,
+        constants=True if include_constants else False,
+    )
+    # UNet
+    compile_unet(
+        pipe.unet,
+        batch_size=batch_size,
+        width=width,
+        height=height,
+        clip_chunks=clip_chunks,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        hidden_dim=pipe.unet.config.cross_attention_dim,
+        attention_head_dim=pipe.unet.config.attention_head_dim,
+        use_linear_projection=pipe.unet.config.get("use_linear_projection", False),
+        constants=True if include_constants else False,
+    )
+    # VAE
+    compile_vae(
+        pipe.vae,
+        batch_size=batch_size,
+        width=width,
+        height=height,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        constants=True if include_constants else False,
+    )
+
+
+if __name__ == "__main__":
+    compile_diffusers()
diff --git a/examples/05_stable_diffusion/scripts/demo_alt.py b/examples/05_stable_diffusion/scripts/demo_alt.py
new file mode 100644
index 000000000..28b322f02
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/demo_alt.py
@@ -0,0 +1,64 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import click
+import torch
+
+from aitemplate.utils.import_path import import_parent
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.pipeline_stable_diffusion_ait_alt import StableDiffusionAITPipeline
+
+
+@click.command()
+@click.option(
+    "--hf-hub-or-path",
+    default="runwayml/stable-diffusion-v1-5",
+    help="Model weights to apply to compiled model (with --include-constants false)",
+)
+@click.option("--ckpt", default=None, help="e.g. v1-5-pruned-emaonly.ckpt")
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
+@click.option("--batch", default=1, help="Batch size of generated image")
+@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
+@click.option("--steps", default=50, help="Number of inference steps")
+@click.option("--cfg", default=7.5, help="Guidance scale")
+def run(
+    hf_hub_or_path, ckpt, width, height, batch, prompt, negative_prompt, steps, cfg
+):
+    pipe = StableDiffusionAITPipeline(
+        hf_hub_or_path=hf_hub_or_path,
+        ckpt=ckpt,
+    )
+
+    prompt = [prompt] * batch
+    negative_prompt = [negative_prompt] * batch
+    with torch.autocast("cuda"):
+        image = pipe(
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            num_inference_steps=steps,
+            guidance_scale=cfg,
+        ).images[0]
+    image.save("example_ait.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
new file mode 100644
index 000000000..b4991e98d
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
@@ -0,0 +1,90 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
+from .util import mark_output
+
+
+def map_clip_params(pt_mod, batch_size=1, seqlen=77, depth=12):
+    params_ait = {}
+    pt_params = dict(pt_mod.named_parameters())
+    for key, arr in pt_params.items():
+        name = key.replace("text_model.", "")
+        ait_name = name.replace(".", "_")
+        if name.endswith("out_proj.weight"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif name.endswith("out_proj.bias"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif "q_proj" in name:
+            ait_name = ait_name.replace("q_proj", "proj_q")
+        elif "k_proj" in name:
+            ait_name = ait_name.replace("k_proj", "proj_k")
+        elif "v_proj" in name:
+            ait_name = ait_name.replace("v_proj", "proj_v")
+        params_ait[ait_name] = arr
+
+    return params_ait
+
+
+def compile_clip(
+    pt_mod,
+    batch_size=(1, 8),
+    seqlen=64,
+    dim=768,
+    num_heads=12,
+    depth=12,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+    act_layer="gelu",
+    constants=True,
+):
+    mask_seq = 0
+    causal = True
+
+    ait_mod = ait_CLIPTextTransformer(
+        num_hidden_layers=depth,
+        hidden_size=dim,
+        num_attention_heads=num_heads,
+        batch_size=batch_size,
+        seq_len=seqlen,
+        causal=causal,
+        mask_seq=mask_seq,
+        act_layer=act_layer,
+    )
+    ait_mod.name_parameter_tensor()
+
+    pt_mod = pt_mod.eval()
+    params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
+    batch_size = IntVar(values=list(batch_size), name="batch_size")
+
+    input_ids_ait = Tensor(
+        [batch_size, seqlen], name="input0", dtype="int64", is_input=True
+    )
+    position_ids_ait = Tensor(
+        [batch_size, seqlen], name="input1", dtype="int64", is_input=True
+    )
+    Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait)
+    mark_output(Y)
+
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(
+        Y, target, "./tmp", "CLIPTextModel", constants=params_ait if constants else None
+    )
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
new file mode 100644
index 000000000..f4cf4b48c
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
@@ -0,0 +1,111 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.unet_2d_condition import (
+    UNet2DConditionModel as ait_UNet2DConditionModel,
+)
+from .util import mark_output
+
+
+def map_unet_params(pt_mod, dim):
+    pt_params = dict(pt_mod.named_parameters())
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+
+    params_ait["arange"] = (
+        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+def compile_unet(
+    pt_mod,
+    batch_size=(1, 8),
+    height=(64, 2048),
+    width=(64, 2048),
+    clip_chunks=1,
+    dim=320,
+    hidden_dim=1024,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+    attention_head_dim=[5, 10, 20, 20],  # noqa: B006
+    model_name="UNet2DConditionModel",
+    use_linear_projection=False,
+    constants=True,
+):
+    ait_mod = ait_UNet2DConditionModel(
+        sample_size=64,
+        cross_attention_dim=hidden_dim,
+        attention_head_dim=attention_head_dim,
+        use_linear_projection=use_linear_projection,
+    )
+    ait_mod.name_parameter_tensor()
+
+    # set AIT parameters
+    pt_mod = pt_mod.eval()
+    params_ait = map_unet_params(pt_mod, dim)
+    batch_size = (batch_size[0], batch_size[1] * 2)  # double batch size for unet
+    batch_size = IntVar(values=list(batch_size), name="batch_size")
+    height = height[0] // 8, height[1] // 8
+    width = width[0] // 8, width[1] // 8
+    height_d = IntVar(values=list(height), name="height")
+    width_d = IntVar(values=list(width), name="width")
+    clip_chunks = 77, 77 * clip_chunks
+    embedding_size = IntVar(values=list(clip_chunks), name="embedding_size")
+
+    latent_model_input_ait = Tensor(
+        [batch_size, height_d, width_d, 4], name="input0", is_input=True
+    )
+    timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
+    text_embeddings_pt_ait = Tensor(
+        [batch_size, embedding_size, hidden_dim], name="input2", is_input=True
+    )
+
+    mid_block_additional_residual = None
+    down_block_additional_residuals = None
+
+    Y = ait_mod(
+        latent_model_input_ait,
+        timesteps_ait,
+        text_embeddings_pt_ait,
+        down_block_additional_residuals,
+        mid_block_additional_residual,
+    )
+    mark_output(Y)
+
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(
+        Y, target, "./tmp", model_name, constants=params_ait if constants else None
+    )
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
new file mode 100644
index 000000000..559194d6f
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
@@ -0,0 +1,159 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
+from .util import mark_output
+
+
+def map_vae_params(ait_module, pt_module, batch_size=1, seq_len=4096):
+    if not isinstance(pt_module, dict):
+        pt_params = dict(pt_module.named_parameters())
+    else:
+        pt_params = pt_module
+    mapped_pt_params = {}
+    for name, _ in ait_module.named_parameters():
+        ait_name = name.replace(".", "_")
+        if name in pt_params:
+            if (
+                "conv" in name
+                and "norm" not in name
+                and name.endswith(".weight")
+                and len(pt_params[name].shape) == 4
+            ):
+                mapped_pt_params[ait_name] = torch.permute(
+                    pt_params[name], [0, 2, 3, 1]
+                ).contiguous()
+            else:
+                mapped_pt_params[ait_name] = pt_params[name]
+        elif name.endswith("attention.proj.weight"):
+            prefix = name[: -len("attention.proj.weight")]
+            pt_name = prefix + "proj_attn.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj.bias"):
+            prefix = name[: -len("attention.proj.bias")]
+            pt_name = prefix + "proj_attn.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.cu_length"):
+            ...
+        elif name.endswith("attention.proj_q.weight"):
+            prefix = name[: -len("attention.proj_q.weight")]
+            pt_name = prefix + "query.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_q.bias"):
+            prefix = name[: -len("attention.proj_q.bias")]
+            pt_name = prefix + "query.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.weight"):
+            prefix = name[: -len("attention.proj_k.weight")]
+            pt_name = prefix + "key.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_k.bias"):
+            prefix = name[: -len("attention.proj_k.bias")]
+            pt_name = prefix + "key.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.weight"):
+            prefix = name[: -len("attention.proj_v.weight")]
+            pt_name = prefix + "value.weight"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        elif name.endswith("attention.proj_v.bias"):
+            prefix = name[: -len("attention.proj_v.bias")]
+            pt_name = prefix + "value.bias"
+            mapped_pt_params[ait_name] = pt_params[pt_name]
+        else:
+            pt_param = pt_module.get_parameter(name)
+            mapped_pt_params[ait_name] = pt_param
+    for key, arr in mapped_pt_params.items():
+        mapped_pt_params[key] = arr.to("cuda", dtype=torch.float16)
+    return mapped_pt_params
+
+
+def compile_vae(
+    pt_mod,
+    batch_size=(1, 8),
+    height=(64, 2048),
+    width=(64, 2048),
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+    name="AutoencoderKL",
+    constants=True,
+):
+    in_channels = 3
+    out_channels = 3
+    down_block_types = [
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+        "DownEncoderBlock2D",
+    ]
+    up_block_types = [
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+        "UpDecoderBlock2D",
+    ]
+    block_out_channels = [128, 256, 512, 512]
+    layers_per_block = 2
+    act_fn = "silu"
+    latent_channels = 4
+    sample_size = 512
+
+    # values not important, we only need this for mapping keys
+    ait_vae = ait_AutoencoderKL(
+        1,
+        64,
+        64,
+        in_channels=in_channels,
+        out_channels=out_channels,
+        down_block_types=down_block_types,
+        up_block_types=up_block_types,
+        block_out_channels=block_out_channels,
+        layers_per_block=layers_per_block,
+        act_fn=act_fn,
+        latent_channels=latent_channels,
+        sample_size=sample_size,
+    )
+    batch_size = IntVar(values=list(batch_size), name="batch_size")
+    height = height[0] // 8, height[1] // 8
+    width = width[0] // 8, width[1] // 8
+    height_d = IntVar(values=list(height), name="height")
+    width_d = IntVar(values=list(width), name="width")
+
+    ait_input = Tensor(
+        shape=[batch_size, height_d, width_d, latent_channels],
+        name="vae_input",
+        is_input=True,
+    )
+    ait_vae.name_parameter_tensor()
+
+    pt_mod = pt_mod.eval()
+    params_ait = map_vae_params(ait_vae, pt_mod)
+
+    Y = ait_vae.decode(ait_input)
+    mark_output(Y)
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(
+        Y,
+        target,
+        "./tmp",
+        name,
+        constants=params_ait if constants else None,
+    )
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
new file mode 100644
index 000000000..ce0abeaa8
--- /dev/null
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
@@ -0,0 +1,1028 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import inspect
+
+import os
+from typing import List, Optional, Union
+
+import torch
+from aitemplate.compiler import Model
+
+from diffusers import AutoencoderKL, EulerDiscreteScheduler, UNet2DConditionModel
+
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils.pil_utils import numpy_to_pil
+from tqdm import tqdm
+
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from .compile_lib.compile_vae_alt import map_vae_params
+from .modeling.vae import AutoencoderKL as ait_AutoencoderKL
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, additional_replacements=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(
+        paths, list
+    ), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    for path in paths:
+        new_path = path["new"]
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+# ================#
+# VAE Conversion #
+# ================#
+
+
+def convert_ldm_vae_checkpoint(vae_state_dict):
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
+        "encoder.conv_out.weight"
+    ]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
+        "encoder.norm_out.weight"
+    ]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
+        "encoder.norm_out.bias"
+    ]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
+        "decoder.conv_out.weight"
+    ]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
+        "decoder.norm_out.weight"
+    ]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
+        "decoder.norm_out.bias"
+    ]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len(
+        {
+            ".".join(layer.split(".")[:3])
+            for layer in vae_state_dict
+            if "encoder.down" in layer
+        }
+    )
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
+        for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len(
+        {
+            ".".join(layer.split(".")[:3])
+            for layer in vae_state_dict
+            if "decoder.up" in layer
+        }
+    )
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
+        for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [
+            key
+            for key in down_blocks[i]
+            if f"down.{i}" in key and f"down.{i}.downsample" not in key
+        ]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[
+                f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"
+            ] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight")
+            new_checkpoint[
+                f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"
+            ] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias")
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key
+            for key in up_blocks[block_id]
+            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[
+                f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"
+            ] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"]
+            new_checkpoint[
+                f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"
+            ] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+    )
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+# =================#
+# UNet Conversion #
+# =================#
+def convert_ldm_unet_checkpoint(unet_state_dict, layers_per_block=2):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
+        "time_embed.0.weight"
+    ]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
+        "time_embed.0.bias"
+    ]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
+        "time_embed.2.weight"
+    ]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
+        "time_embed.2.bias"
+    ]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "input_blocks" in layer
+        }
+    )
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "middle_block" in layer
+        }
+    )
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "output_blocks" in layer
+        }
+    )
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (layers_per_block + 1)
+        layer_in_block_id = (i - 1) % (layers_per_block + 1)
+
+        resnets = [
+            key
+            for key in input_blocks[i]
+            if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[
+                f"down_blocks.{block_id}.downsamplers.0.conv.weight"
+            ] = unet_state_dict.pop(f"input_blocks.{i}.0.op.weight")
+            new_checkpoint[
+                f"down_blocks.{block_id}.downsamplers.0.conv.bias"
+            ] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias")
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {
+            "old": f"input_blocks.{i}.0",
+            "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path]
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {
+                "old": f"input_blocks.{i}.1",
+                "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths,
+        new_checkpoint,
+        unet_state_dict,
+        additional_replacements=[meta_path],
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (layers_per_block + 1)
+        layer_in_block_id = i % (layers_per_block + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [
+                key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
+            ]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {
+                "old": f"output_blocks.{i}.0",
+                "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(
+                    ["conv.bias", "conv.weight"]
+                )
+                new_checkpoint[
+                    f"up_blocks.{block_id}.upsamplers.0.conv.weight"
+                ] = unet_state_dict[f"output_blocks.{i}.{index}.conv.weight"]
+                new_checkpoint[
+                    f"up_blocks.{block_id}.upsamplers.0.conv.bias"
+                ] = unet_state_dict[f"output_blocks.{i}.{index}.conv.bias"]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths,
+                    new_checkpoint,
+                    unet_state_dict,
+                    additional_replacements=[meta_path],
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(
+                output_block_layers, n_shave_prefix_segments=1
+            )
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(
+                    [
+                        "up_blocks",
+                        str(block_id),
+                        "resnets",
+                        str(layer_in_block_id),
+                        path["new"],
+                    ]
+                )
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+# =========================#
+#    AITemplate mapping   #
+# =========================#
+def map_unet_state_dict(state_dict, dim=320):
+    params_ait = {}
+    for key, arr in state_dict.items():
+        arr = arr.to("cuda", dtype=torch.float16)
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            # print("ff.net.0.proj.weight")
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            # print("ff.net.0.proj.bias")
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+
+    params_ait["arange"] = (
+        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+def map_clip_state_dict(state_dict):
+    params_ait = {}
+    for key, arr in state_dict.items():
+        arr = arr.to("cuda", dtype=torch.float16)
+        name = key.replace("text_model.", "")
+        ait_name = name.replace(".", "_")
+        if name.endswith("out_proj.weight"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif name.endswith("out_proj.bias"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif "q_proj" in name:
+            ait_name = ait_name.replace("q_proj", "proj_q")
+        elif "k_proj" in name:
+            ait_name = ait_name.replace("k_proj", "proj_k")
+        elif "v_proj" in name:
+            ait_name = ait_name.replace("v_proj", "proj_v")
+        params_ait[ait_name] = arr
+
+    return params_ait
+
+
+class StableDiffusionAITPipeline:
+    def __init__(self, hf_hub_or_path, ckpt):
+        self.device = torch.device("cuda")
+        workdir = "tmp/"
+        state_dict = None
+        if ckpt is not None:
+            state_dict = torch.load(ckpt, map_location="cpu")
+            while "state_dict" in state_dict:
+                state_dict = state_dict["state_dict"]
+            clip_state_dict = {}
+            unet_state_dict = {}
+            vae_state_dict = {}
+            for key in state_dict.keys():
+                if key.startswith("cond_stage_model.transformer."):
+                    new_key = key.replace("cond_stage_model.transformer.", "")
+                    clip_state_dict[new_key] = state_dict[key]
+                elif key.startswith("cond_stage_model.model."):
+                    new_key = key.replace("cond_stage_model.model.", "")
+                    clip_state_dict[new_key] = state_dict[key]
+                elif key.startswith("first_stage_model."):
+                    new_key = key.replace("first_stage_model.", "")
+                    vae_state_dict[new_key] = state_dict[key]
+                elif key.startswith("model.diffusion_model."):
+                    new_key = key.replace("model.diffusion_model.", "")
+                    unet_state_dict[new_key] = state_dict[key]
+            # TODO: SD2.x clip support, get from diffusers convert_from_ckpt.py
+            # clip_state_dict = convert_text_enc_state_dict(clip_state_dict)
+            unet_state_dict = convert_ldm_unet_checkpoint(unet_state_dict)
+            vae_state_dict = convert_ldm_vae_checkpoint(vae_state_dict)
+            state_dict = None
+        self.clip_ait_exe = self.init_ait_module(
+            model_name="CLIPTextModel", workdir=workdir
+        )
+        print("Loading PyTorch CLIP")
+        if ckpt is None:
+            self.clip_pt = CLIPTextModel.from_pretrained(
+                hf_hub_or_path,
+                subfolder="text_encoder",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+        else:
+            config = CLIPTextConfig.from_pretrained(
+                hf_hub_or_path, subfolder="text_encoder"
+            )
+            self.clip_pt = CLIPTextModel(config)
+            self.clip_pt.load_state_dict(clip_state_dict)
+        clip_params_ait = map_clip_state_dict(dict(self.clip_pt.named_parameters()))
+        print("Setting constants")
+        self.clip_ait_exe.set_many_constants_with_tensors(clip_params_ait)
+        print("Folding constants")
+        self.clip_ait_exe.fold_constants()
+        # cleanup
+        self.clip_pt = None
+        clip_params_ait = None
+
+        self.unet_ait_exe = self.init_ait_module(
+            model_name="UNet2DConditionModel", workdir=workdir
+        )
+
+        print("Loading PyTorch UNet")
+        if ckpt is None:
+            self.unet_pt = UNet2DConditionModel.from_pretrained(
+                hf_hub_or_path,
+                subfolder="unet",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+            self.unet_pt = self.unet_pt.state_dict()
+        else:
+            self.unet_pt = unet_state_dict
+        unet_params_ait = map_unet_state_dict(self.unet_pt)
+        print("Setting constants")
+        self.unet_ait_exe.set_many_constants_with_tensors(unet_params_ait)
+        print("Folding constants")
+        self.unet_ait_exe.fold_constants()
+        # cleanup
+        self.unet_pt = None
+        unet_params_ait = None
+
+        self.vae_ait_exe = self.init_ait_module(
+            model_name="AutoencoderKL", workdir=workdir
+        )
+        print("Loading PyTorch VAE")
+        if ckpt is None:
+            self.vae_pt = AutoencoderKL.from_pretrained(
+                hf_hub_or_path,
+                subfolder="vae",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+        else:
+            self.vae_pt = dict(vae_state_dict)
+        in_channels = 3
+        out_channels = 3
+        down_block_types = [
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+        ]
+        up_block_types = [
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+        ]
+        block_out_channels = [128, 256, 512, 512]
+        layers_per_block = 2
+        act_fn = "silu"
+        latent_channels = 4
+        sample_size = 512
+
+        ait_vae = ait_AutoencoderKL(
+            1,
+            64,
+            64,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            down_block_types=down_block_types,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            latent_channels=latent_channels,
+            sample_size=sample_size,
+        )
+        print("Mapping parameters...")
+        vae_params_ait = map_vae_params(ait_vae, self.vae_pt)
+        print("Setting constants")
+        self.vae_ait_exe.set_many_constants_with_tensors(vae_params_ait)
+        print("Folding constants")
+        self.vae_ait_exe.fold_constants()
+        # cleanup
+        self.vae_pt = None
+        ait_vae = None
+        vae_params_ait = None
+
+        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+        )
+        self.batch = 1
+
+    def init_ait_module(
+        self,
+        model_name,
+        workdir,
+    ):
+        mod = Model(os.path.join(workdir, model_name, "test.so"))
+        return mod
+
+    def unet_inference(
+        self, latent_model_input, timesteps, encoder_hidden_states, height, width
+    ):
+        exe_module = self.unet_ait_exe
+        timesteps_pt = timesteps.expand(self.batch * 2)
+        inputs = {
+            "input0": latent_model_input.permute((0, 2, 3, 1))
+            .contiguous()
+            .cuda()
+            .half(),
+            "input1": timesteps_pt.cuda().half(),
+            "input2": encoder_hidden_states.cuda().half(),
+        }
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
+            shape[1] = height // 8
+            shape[2] = width // 8
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
+        return noise_pred
+
+    def clip_inference(self, input_ids, seqlen=77):
+        exe_module = self.clip_ait_exe
+        bs = input_ids.shape[0]
+        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
+        inputs = {
+            "input0": input_ids,
+            "input1": position_ids,
+        }
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        return ys[0].float()
+
+    def vae_inference(self, vae_input, height, width):
+        exe_module = self.vae_ait_exe
+        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
+            shape[1] = height
+            shape[2] = width
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        vae_out = ys[0].permute((0, 3, 1, 2)).float()
+        return vae_out
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined  as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        self.batch = batch_size
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
+        # pytorch equivalent
+        # text_embeddings = self.clip_pt(text_input.input_ids.to(self.device)).last_hidden_state
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            max_length = text_input.input_ids.shape[-1]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.clip_inference(
+                uncond_input.input_ids.to(self.device)
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_device = self.device
+        latents_shape = (batch_size, 4, height // 8, width // 8)
+        if latents is None:
+            latents = torch.randn(
+                latents_shape,
+                generator=generator,
+                device=latents_device,
+            )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(
+                    f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
+                )
+        latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(
+            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
+        )
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+            # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        for t in tqdm(self.scheduler.timesteps):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet_inference(
+                latent_model_input,
+                t,
+                encoder_hidden_states=text_embeddings,
+                height=height,
+                width=width,
+            )
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+
+            latents = self.scheduler.step(
+                noise_pred, t, latents, **extra_step_kwargs
+            ).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae_inference(latents, height, width)
+        # pytorch equivalent
+        # image = self.vae_pt.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )

From 9dc346d55b6e30bc86fba11381e10e79e0a339de Mon Sep 17 00:00:00 2001
From: Ying Zhang <yingz@meta.com>
Date: Thu, 18 May 2023 00:17:03 -0700
Subject: [PATCH 525/638] Dump function properties at profiling time. (#684)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/684

ATT, dump Operator._args_for_pseudo_code() in the result json file generated by
Profile(). This would make op-level benchmark easier.

Reviewed By: frank-wei

Differential Revision: D45683542

fbshipit-source-id: 8261140a3f2ab04eb9c3b00228331020548cc839
---
 python/aitemplate/backend/codegen.py                       | 7 +++++--
 python/aitemplate/backend/main_templates.py                | 2 +-
 python/aitemplate/compiler/base.py                         | 2 +-
 python/aitemplate/compiler/ops/common/elementwise.py       | 2 +-
 python/aitemplate/compiler/ops/common/fused_elementwise.py | 2 +-
 python/aitemplate/compiler/ops/common/int_elementwise.py   | 2 +-
 python/aitemplate/compiler/ops/groupnorm/groupnorm.py      | 3 ---
 .../aitemplate/compiler/ops/layernorm/group_layernorm.py   | 6 ++++++
 python/aitemplate/compiler/ops/layernorm/layernorm.py      | 5 +++++
 .../compiler/ops/layernorm/layernorm_sigmoid_mul.py        | 5 +++++
 python/aitemplate/compiler/ops/softmax/softmax.py          | 3 +++
 python/aitemplate/compiler/ops/tensor/concatenate.py       | 5 ++++-
 python/aitemplate/compiler/ops/tensor/dynamic_slice.py     | 5 ++++-
 python/aitemplate/compiler/ops/tensor/slice_scatter.py     | 3 +++
 python/aitemplate/compiler/ops/tensor/split.py             | 5 ++++-
 15 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index d4e6c521f..9cecceabf 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -755,8 +755,11 @@ def _process_src_ops(self, node: Tensor) -> None:
                 self._input_shape_seq.append(input_shape)
                 self._output_shape_seq.append(output_shape)
                 props = {}
-                if "concat_dim" in func._attrs:
-                    props["dim"] = func._attrs["concat_dim"]
+                for item in func._args_for_pseudo_code():
+                    res = item.split("=")
+                    if len(res) == 2:
+                        res[1] = res[1].replace("\n", " ")
+                        props[res[0]] = res[1]
                 self.func_prop_seq.append(props)
 
                 # save the rendered code for the future
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 6cf62a9fb..7557d5153 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -270,7 +270,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
            << ", \\"input_sizes\\": " << "{{ input_sizes | replace("'", '\\\\"') }}"
            << ", \\"output_sizes\\": " << "{{ output_sizes | replace("'", '\\\\"') }}"
         {% for prop_name, prop_value in func_properties.items() %}
-          << ", \\"{{ prop_name }}\\": " << "{{ prop_value }}"
+          << ", \\"{{ prop_name }}\\": " << "\\"{{ prop_value }}\\""
         {% endfor %}
            << " } ";
         {% if loop.last %}
diff --git a/python/aitemplate/compiler/base.py b/python/aitemplate/compiler/base.py
index 087300328..7272f0f4a 100644
--- a/python/aitemplate/compiler/base.py
+++ b/python/aitemplate/compiler/base.py
@@ -1251,7 +1251,7 @@ def _outputs_for_pseudo_code(self):
         return self._attrs["outputs"]
 
     def _args_for_pseudo_code(self):
-        return []
+        return [f"{key}={value}" for key, value in self._get_op_attributes().items()]
 
     def _pseudo_code_helper(self, node: Any, with_shape: bool) -> str:
         if isinstance(node, list):
diff --git a/python/aitemplate/compiler/ops/common/elementwise.py b/python/aitemplate/compiler/ops/common/elementwise.py
index 4e2692750..a5bc5847a 100644
--- a/python/aitemplate/compiler/ops/common/elementwise.py
+++ b/python/aitemplate/compiler/ops/common/elementwise.py
@@ -271,7 +271,7 @@ def replace_input_tensor(self, old_tensor, new_tensor) -> None:
         ]
 
     def _args_for_pseudo_code(self):
-        return [self._attrs["func"]]
+        return [f"func={self._attrs['func']}"]
 
 
 # TODO: move it to math.py and update it to a function.
diff --git a/python/aitemplate/compiler/ops/common/fused_elementwise.py b/python/aitemplate/compiler/ops/common/fused_elementwise.py
index 3716c6f96..74afdff32 100644
--- a/python/aitemplate/compiler/ops/common/fused_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/fused_elementwise.py
@@ -123,4 +123,4 @@ def gen_function(self) -> str:
         return func(self._attrs)
 
     def _args_for_pseudo_code(self):
-        return [op._attrs["func"] for op in self._attrs["elementwise_ops"]]
+        return [f"func={[op._attrs['func'] for op in self._attrs['elementwise_ops']]}"]
diff --git a/python/aitemplate/compiler/ops/common/int_elementwise.py b/python/aitemplate/compiler/ops/common/int_elementwise.py
index 7a2bfe8ca..c44db8b9c 100644
--- a/python/aitemplate/compiler/ops/common/int_elementwise.py
+++ b/python/aitemplate/compiler/ops/common/int_elementwise.py
@@ -132,7 +132,7 @@ def _get_op_attributes(self):
         return {"func_enum": self._attrs["func"]}
 
     def _args_for_pseudo_code(self):
-        return [self._attrs["func"]]
+        return [f"func={self._attrs['func']}"]
 
     def gen_function(self) -> str:
         target = backend.target.Target.current()
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 47fb2dde9..b78c67c89 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -473,9 +473,6 @@ def _extract_exec_path(self, dynamic_profiling_strategy=DynamicProfileStrategy.M
             )
             self._attrs["exec_path"][exec_item.profiling_key] = exec_item
 
-    def _inputs_for_pseudo_code(self):
-        return self._attrs["inputs"] + [f"num_groups={self._attrs['num_groups']}"]
-
     def _get_op_attributes(self):
         return {
             "num_groups": self._attrs["num_groups"],
diff --git a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
index aee3458e5..09bba6238 100644
--- a/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/group_layernorm.py
@@ -157,3 +157,9 @@ def __call__(
             self._attrs["output_accessors"].append(TensorAccessor(output))
             self._attrs["input_accessors"].append(TensorAccessor(x))
         return self._attrs["outputs"]
+
+    def _args_for_pseudo_code(self):
+        res = []
+        for shapes in self._attrs["normalized_shape"]:
+            res.append(",".join([str(s.symbolic_value()) for s in shapes]))
+        return [f"normalized_shape={res}"]
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm.py b/python/aitemplate/compiler/ops/layernorm/layernorm.py
index 8cf35808d..17eb6e30d 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm.py
@@ -444,3 +444,8 @@ def gen_profiler(
 
     def _get_op_attributes(self):
         return {"normalized_shape": self._attrs["default_normalized_shape"]}
+
+    def _args_for_pseudo_code(self):
+        return [
+            f"normalized_shape={[s.symbolic_value() for s in self._attrs['normalized_shape']]}"
+        ]
diff --git a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
index 691cfb41a..4d3b198ab 100644
--- a/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
+++ b/python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py
@@ -101,3 +101,8 @@ def gen_function(self) -> str:
         )
         func = registry.get(func_key)
         return func(self._attrs)
+
+    def _args_for_pseudo_code(self):
+        return [
+            f"normalized_shape={[s.symbolic_value() for s in self._attrs['normalized_shape']]}"
+        ]
diff --git a/python/aitemplate/compiler/ops/softmax/softmax.py b/python/aitemplate/compiler/ops/softmax/softmax.py
index aa08bddd4..5fb63112d 100644
--- a/python/aitemplate/compiler/ops/softmax/softmax.py
+++ b/python/aitemplate/compiler/ops/softmax/softmax.py
@@ -392,3 +392,6 @@ def gen_function(self) -> str:
         self._attrs["exec_cond_template"] = EXEC_COND_TEMPLATE
         func = registry.get(func_key)
         return func(self._attrs)
+
+    def _args_for_pseudo_code(self):
+        return {"dim": self._attrs["dim"]}
diff --git a/python/aitemplate/compiler/ops/tensor/concatenate.py b/python/aitemplate/compiler/ops/tensor/concatenate.py
index 0d36a2c37..eaa544863 100644
--- a/python/aitemplate/compiler/ops/tensor/concatenate.py
+++ b/python/aitemplate/compiler/ops/tensor/concatenate.py
@@ -271,4 +271,7 @@ def remove_input_at(self, indices: Union[int, Sequence[int]]) -> None:
         self._attrs["input_accessors"] = new_input_accessors
 
     def _inputs_for_pseudo_code(self):
-        return self._attrs["inputs"] + [f"dim={self._attrs['concat_dim']}"]
+        return self._attrs["inputs"]
+
+    def _args_for_pseudo_code(self):
+        return [f"dim={self._attrs['concat_dim']}"]
diff --git a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
index 7c832b012..547382913 100644
--- a/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
+++ b/python/aitemplate/compiler/ops/tensor/dynamic_slice.py
@@ -195,7 +195,10 @@ def gen_function(self) -> str:
         return func(self._attrs)
 
     def _inputs_for_pseudo_code(self):
-        return self._attrs["inputs"] + [
+        return self._attrs["inputs"]
+
+    def _args_for_pseudo_code(self):
+        return [
             f"start_indices=[{self._pseudo_code_helper(self._attrs['start_indices'], with_shape=True)}]",
             f"end_indices=[{self._pseudo_code_helper(self._attrs['end_indices'], with_shape=True)}]",
         ]
diff --git a/python/aitemplate/compiler/ops/tensor/slice_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
index 05ca0b1d5..52ab2d21a 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
@@ -101,3 +101,6 @@ def _get_func(self, fmt_str):
     def gen_function(self) -> str:
         func = self._get_func("{target}.{op}.gen_function")
         return func(self._attrs)
+
+    def _args_for_pseudo_code(self):
+        return [f"scatter_dim={str(self._attrs['scatter_dim'])}]"]
diff --git a/python/aitemplate/compiler/ops/tensor/split.py b/python/aitemplate/compiler/ops/tensor/split.py
index e94dd2cf2..2f78b1c93 100644
--- a/python/aitemplate/compiler/ops/tensor/split.py
+++ b/python/aitemplate/compiler/ops/tensor/split.py
@@ -192,7 +192,10 @@ def remove_output_at(self, indices: Union[int, Sequence[int]]) -> None:
         self._attrs["outputs"] = new_outputs
 
     def _inputs_for_pseudo_code(self):
-        return self._attrs["inputs"] + [
+        return self._attrs["inputs"]
+
+    def _args_for_pseudo_code(self):
+        return [
             f"split_sizes={str(self._attrs['split_sizes'])}]",
             f"dim={str(self._attrs['split_dim'])}]",
         ]

From a0ddbe0390d366040b499d8614d96ab40c299591 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 18 May 2023 16:08:53 +0800
Subject: [PATCH 526/638] add graph mode

---
 static/include/model.h                 | 61 ++++++++++++++------------
 static/include/rocm_device_functions.h |  3 +-
 2 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/static/include/model.h b/static/include/model.h
index 7c4300873..ce325f23a 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -100,7 +100,7 @@ class ModelBase {
   void Run(StreamType stream, bool graph_mode) {
     auto* model = static_cast<ModelType*>(this);
     model->SetUpInputsOutputs();
-    if (target_has_graph_mode && graph_mode) {
+    if (graph_mode) {
       RunAsGraph(stream);
     } else {
       model->RunImpl(stream);
@@ -216,37 +216,39 @@ class ModelBase {
   }
 
   void RunAsGraph(StreamType stream) {
-    DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
-    try {
-      static_cast<ModelType*>(this)->RunImpl(graph_capture_stream_);
-    } catch (...) {
-      GraphType graph;
-      // No need to DEVICE_CHECK here, we want to see the original exception.
-      EndCapture(&graph);
-      if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
-        LOG(WARNING)
-            << "Graph destruction failed while handling exception! Memory will be leaked.";
+    if(!graph_created_){
+      DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
+      try {
+        static_cast<ModelType*>(this)->RunImpl(graph_capture_stream_);
+      } catch (...) {
+        GraphType graph;
+        // No need to DEVICE_CHECK here, we want to see the original exception.
+        EndCapture(&graph);
+        if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
+          LOG(WARNING)
+              << "Graph destruction failed while handling exception! Memory will be leaked.";
+        }
+        throw;
       }
-      throw;
-    }
 
-    // The following function ends the capture and creates a graph
-    // inside a unique_ptr that cleans up it when it goes out of scope.
-    // Note that it throws an exception if EndCapture fails.
-    auto graph = RAII_EndCaptureAndCreateGraph(
-        [this](GraphType* graph_ptr) { return EndCapture(graph_ptr); });
-
-    if (graph_exec_ == nullptr) {
-      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
-    } else if (
-        GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
-      // Consume the last cuda error, which may affect the next GraphExecLaunch
-      // call.
-      GetLastError();
-      DEVICE_CHECK(GraphExecDestroy(graph_exec_));
-      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+      // The following function ends the capture and creates a graph
+      // inside a unique_ptr that cleans up it when it goes out of scope.
+      // Note that it throws an exception if EndCapture fails.
+      auto graph = RAII_EndCaptureAndCreateGraph(
+          [this](GraphType* graph_ptr) { return EndCapture(graph_ptr); });
+
+      if (graph_exec_ == nullptr) {
+        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+      } else if (
+          GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
+        // Consume the last cuda error, which may affect the next GraphExecLaunch
+        // call.
+        GetLastError();
+        DEVICE_CHECK(GraphExecDestroy(graph_exec_));
+        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+      }
+      graph_created_ = true;
     }
-
     DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
   }
 
@@ -316,6 +318,7 @@ class ModelBase {
   std::vector<ParamInfo> params_;
 
   GraphExecType graph_exec_ = nullptr;
+  bool graph_created_ = false;
   StreamType graph_capture_stream_;
 
   std::unordered_map<std::string, const void**> constant_name_to_ptr_;
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index d37a16dfa..9546bb075 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -202,7 +202,8 @@ inline DeviceError GraphDestroy(GraphType graph) {
 
 inline DeviceError GraphExecUpdate(GraphExecType graph_exec, GraphType graph) {
   // We don't have hipGraphExecUpdate in some versions of rocm
-  return hipErrorUnknown;
+  hipGraphExecUpdateResult update;
+  return hipGraphExecUpdate(graph_exec, graph, nullptr, &update);
 }
 
 inline DeviceError GraphExecDestroy(GraphExecType graph_exec) {

From d7f10b7c894a37141cf48a610da0bb9a1010185a Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 18 May 2023 16:11:25 +0800
Subject: [PATCH 527/638] set target_has_graph_mode to true

---
 python/aitemplate/backend/codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index bf3f5fe3a..45493e871 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -905,7 +905,7 @@ def _write_simple_multistream_debug_info(
     def generate_model(self) -> str:
         # Disable graph mode on ROCM because the updating operations
         # are not supported
-        target_has_graph_mode = "true" if self.target.name() == "cuda" else "false"
+        target_has_graph_mode = "true"
 
         run_impl_mode = multistream_mode()
         if run_impl_mode == 0:

From 7de589f93687a42c586c6e569d18cd33eb63c75f Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmwang@amazon.com>
Date: Thu, 18 May 2023 11:24:16 -0700
Subject: [PATCH 528/638] Add missing __init__.py files (#702)

Summary:
This PR adds missing `__init__.py` files so that fx2ait can be installed using `python setup.py install` without the need to set`PYTHONPATH` as suggested in https://github.com/facebookincubator/AITemplate/issues/162.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/702

Reviewed By: wushirong

Differential Revision: D45986136

Pulled By: alexanderguzhva

fbshipit-source-id: 3c49691cf4c5ed293115d816fe884458d98576e3
---
 fx2ait/fx2ait/example/__init__.py | 0
 fx2ait/fx2ait/lower/__init__.py   | 0
 fx2ait/fx2ait/passes/__init__.py  | 0
 fx2ait/fx2ait/tools/__init__.py   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 fx2ait/fx2ait/example/__init__.py
 create mode 100644 fx2ait/fx2ait/lower/__init__.py
 create mode 100644 fx2ait/fx2ait/passes/__init__.py
 create mode 100644 fx2ait/fx2ait/tools/__init__.py

diff --git a/fx2ait/fx2ait/example/__init__.py b/fx2ait/fx2ait/example/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fx2ait/fx2ait/lower/__init__.py b/fx2ait/fx2ait/lower/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fx2ait/fx2ait/passes/__init__.py b/fx2ait/fx2ait/passes/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fx2ait/fx2ait/tools/__init__.py b/fx2ait/fx2ait/tools/__init__.py
new file mode 100644
index 000000000..e69de29bb

From 5da4ae2a1c2f0f38a2fa148d2d8a98b9020be002 Mon Sep 17 00:00:00 2001
From: Alexandr Guzhva <aguzhva@meta.com>
Date: Thu, 18 May 2023 19:03:50 -0700
Subject: [PATCH 529/638] add missing Meta headers for __init__.py files (#705)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/705

Add copyright headers

Reviewed By: muchulee8, khabinov, chenyang78

Differential Revision: D45996718

fbshipit-source-id: 1654b30a3587925f522b18a8504cb5f6aab767ec
---
 fx2ait/fx2ait/example/__init__.py | 14 ++++++++++++++
 fx2ait/fx2ait/lower/__init__.py   | 14 ++++++++++++++
 fx2ait/fx2ait/passes/__init__.py  | 14 ++++++++++++++
 fx2ait/fx2ait/tools/__init__.py   | 14 ++++++++++++++
 4 files changed, 56 insertions(+)

diff --git a/fx2ait/fx2ait/example/__init__.py b/fx2ait/fx2ait/example/__init__.py
index e69de29bb..5cf1a826f 100644
--- a/fx2ait/fx2ait/example/__init__.py
+++ b/fx2ait/fx2ait/example/__init__.py
@@ -0,0 +1,14 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
diff --git a/fx2ait/fx2ait/lower/__init__.py b/fx2ait/fx2ait/lower/__init__.py
index e69de29bb..5cf1a826f 100644
--- a/fx2ait/fx2ait/lower/__init__.py
+++ b/fx2ait/fx2ait/lower/__init__.py
@@ -0,0 +1,14 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
diff --git a/fx2ait/fx2ait/passes/__init__.py b/fx2ait/fx2ait/passes/__init__.py
index e69de29bb..5cf1a826f 100644
--- a/fx2ait/fx2ait/passes/__init__.py
+++ b/fx2ait/fx2ait/passes/__init__.py
@@ -0,0 +1,14 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
diff --git a/fx2ait/fx2ait/tools/__init__.py b/fx2ait/fx2ait/tools/__init__.py
index e69de29bb..5cf1a826f 100644
--- a/fx2ait/fx2ait/tools/__init__.py
+++ b/fx2ait/fx2ait/tools/__init__.py
@@ -0,0 +1,14 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#

From 473e3f312b6578f93fb074e56fdf68aca5f7214e Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 19 May 2023 04:04:12 -0700
Subject: [PATCH 530/638] Add a flag for elementwise computation in float32
 (#700)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/700

Currently, the intermediate computations in `fused_elementwise` are made in the input / output type. This may lead to a poor end-to-end model accuracy, especially if the subgraph covered by the `fused_elementwise` is relatively large. Alternatively, all computations could be made in a more precise type, like `float32`.

In this diff, a new flag, `elementwise_use_fp32_acc`, is added for the `Target` to perform all intermediate computations in `float32` type in the `fused_elementwise` backend.

## Example of the generated code

This fused elementwise subgraph:

```
    X4 = ops.elementwise(FuncEnum.SIGN)(X1)
    X5 = ops.elementwise(FuncEnum.ABS)(X1)
    X6 = ops.elementwise(FuncEnum.ADD)(X5, X2)
    X7 = ops.elementwise(FuncEnum.LOGE)(X6)
    X8 = ops.elementwise(FuncEnum.MUL)(X4, X7)
    X9 = ops.elementwise(FuncEnum.MUL)(X8, X3)

```

With `elementwise_use_fp32_acc=False` in the `Target` (or not set at all == default):

```
    p_tmp_o0[i] = __hmul2(
        __hmul2(
            h2sign_custom(
                p_tmp_i1[i]
            ),
            h2log(
                __hadd2(
                    __habs2(
                        p_tmp_i1[i]
                    ),
                    half2(2.0,2.0)
                )
            )
        ),
        p_tmp_i0[i]
    );
```

With `elementwise_use_fp32_acc=True` in the `Target`:

```
    p_tmp_o0[i] = __float2half_rn(
        __fmul_rn(
            __fmul_rn(
                sign_custom<float>(
                    __half2float(
                        p_tmp_i0[i]
                    )
                ),
                logf(
                    __fadd_rn(
                        fabsf(
                            __half2float(
                                p_tmp_i0[i]
                            )
                        ),
                        float(2.0)))),
            __half2float(
                p_tmp_i1[i]
            )
        )
    );
```

Reviewed By: ipiszy

Differential Revision: D45959405

fbshipit-source-id: ca54ab44b0f2917c5a2fb013bfca0674f0ed2f82
---
 python/aitemplate/backend/backend_spec.py     |   6 +-
 .../backend/common/elementwise_common.py      | 137 ++++++++++++------
 tests/unittest/ops/test_fused_elementwise.py  |  51 +++++--
 3 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 0dfab0002..02ae54080 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -91,7 +91,11 @@ class GPUBackendSpec(BackendSpec):
     backend_datatype_convertors: Dict[str, Dict[str, str]] = field(
         default_factory=lambda: {
             "half": {"float": "__half2float"},
-            "float": {"half": "__float2half_rn"},
+            "bfloat16": {"float": "__bfloat162float"},
+            "float": {
+                "half": "__float2half_rn",
+                "bfloat16": "__float2bfloat16_rn",
+            },
         }
     )
 
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index fd3843e96..71d7101a1 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -339,6 +339,11 @@ class FusedElementwiseMetaData:
     # and computes the dense_inx from jagged_idx (with binary search).
     use_jagged_space_indexing: bool = False
 
+    # whether all intermediate computations should be performed in float32
+    use_fp32_acc: bool = False
+    # the float32 type of the used back-end
+    float32_t: str = "float"
+
 
 def gen_function_single_thread(
     fused_func_metadata,
@@ -348,34 +353,44 @@ def gen_function_single_thread(
 ) -> str:
     """Per thread elementwise function codegen."""
     tensor_to_expr: Dict[Tensor, str] = {}
+    float32_t = fused_func_metadata.float32_t
     body = ""
 
     for tensor, name in zip(fused_func_metadata.original_inputs, input_names):
+        if fused_func_metadata.use_fp32_acc and fused_func_metadata.op_t != float32_t:
+            input_converter = type_converter.get(fused_func_metadata.op_t).get(
+                float32_t
+            )
+            name = "{}({})".format(input_converter, name)
         tensor_to_expr[tensor] = name
 
     tmp_output_idx: int = 0
     for func_metadata in fused_func_metadata.sub_funcs:
         params: List[str] = []
-        func_op_t = func_metadata.op_t
         input_converter = None
         output_converter = None
-        if func_op_t != fused_func_metadata.op_t:
-            input_converter = type_converter.get(fused_func_metadata.op_t).get(
-                func_op_t
-            )
-            output_converter = type_converter.get(func_op_t).get(
-                fused_func_metadata.op_t
-            )
-            assert (
-                input_converter is not None
-            ), "Unsupported convertion from {} to {}".format(
-                fused_func_metadata.op_t, func_op_t
-            )
-            assert (
-                output_converter is not None
-            ), "Unsupported convertion from {} to {}".format(
-                func_op_t, fused_func_metadata.op_t
-            )
+        func_op_t = func_metadata.op_t
+
+        # intermediate input / output converters are not
+        # required when doing all computation in float32
+        if not fused_func_metadata.use_fp32_acc:
+            if func_op_t != fused_func_metadata.op_t:
+                input_converter = type_converter.get(fused_func_metadata.op_t).get(
+                    func_op_t
+                )
+                output_converter = type_converter.get(func_op_t).get(
+                    fused_func_metadata.op_t
+                )
+                assert (
+                    input_converter is not None
+                ), "Unsupported convertion from {} to {}".format(
+                    fused_func_metadata.op_t, func_op_t
+                )
+                assert (
+                    output_converter is not None
+                ), "Unsupported convertion from {} to {}".format(
+                    func_op_t, fused_func_metadata.op_t
+                )
 
         for arg in func_metadata.args:
             if arg in tensor_to_expr:
@@ -421,7 +436,12 @@ def gen_function_single_thread(
         if len(output._attrs["dst_ops"]) > 1:
             name = "tmp_" + (str)(tmp_output_idx)
             tmp_output_idx += 1
-            body += "{} {} = {};\n".format(fused_func_metadata.op_t, name, func_def)
+            temp_t = (
+                float32_t
+                if fused_func_metadata.use_fp32_acc
+                else fused_func_metadata.op_t
+            )
+            body += "{} {} = {};\n".format(temp_t, name, func_def)
             tensor_to_expr[output] = name
         else:
             tensor_to_expr[output] = func_def
@@ -434,38 +454,57 @@ def gen_function_single_thread(
                 )
             )
         expr = tensor_to_expr[tensor]
+        if fused_func_metadata.use_fp32_acc and fused_func_metadata.op_t != float32_t:
+            output_converter = type_converter.get(float32_t).get(
+                fused_func_metadata.op_t
+            )
+            expr = "{}({})".format(output_converter, expr)
         body += "{} = {};\n".format(name, expr)
 
     return body
 
 
 def _get_sub_func_metadata(
-    ops: List[Operator], data_t: str, op_t: str, backend_spec: BackendSpec
+    ops: List[Operator],
+    data_t: str,
+    op_t: str,
+    backend_spec: BackendSpec,
+    float32_t: str,
 ) -> Tuple[List[ElementwiseMetaData], str]:
-    candidate_op_types = backend_spec.get_candidate_op_types(op_t)
-    func_enums = []
-    for op in ops:
-        func_enum = op._attrs["func"]
-        func_enums.append(func_enum)
-        funcs = backend_spec.func_enum_to_func_name.get(func_enum)
-        if funcs is None:
-            raise NotImplementedError("Func {} is not supported!".format(func_enum))
-        for candidate_op_t in candidate_op_types:
-            func_name = funcs.get(candidate_op_t)
-            if func_name is not None:
-                candidate_op_types = backend_spec.get_candidate_op_types(candidate_op_t)
-                break
-    if len(candidate_op_types) == 0:
-        raise RuntimeError(
-            "Cannot find a common backend data type! candidate_op_types: {}, op_t: {}.".format(
-                candidate_op_types, op_t
-            )
-        )
-    if op_t in set(candidate_op_types):
-        op_t = candidate_op_types[0]
-    else:
+    use_fp32_acc = Target.current()._kwargs.get("elementwise_use_fp32_acc", False)
+    if use_fp32_acc:
+        # vectorized op types are not allowed when all
+        # intermediate computation is done in float32
         op_t = data_t
+        # only float functions must be used
+        candidate_op_types = [float32_t]
+    else:
         candidate_op_types = backend_spec.get_candidate_op_types(op_t)
+        func_enums = []
+        for op in ops:
+            func_enum = op._attrs["func"]
+            func_enums.append(func_enum)
+            funcs = backend_spec.func_enum_to_func_name.get(func_enum)
+            if funcs is None:
+                raise NotImplementedError("Func {} is not supported!".format(func_enum))
+            for candidate_op_t in candidate_op_types:
+                func_name = funcs.get(candidate_op_t)
+                if func_name is not None:
+                    candidate_op_types = backend_spec.get_candidate_op_types(
+                        candidate_op_t
+                    )
+                    break
+        if len(candidate_op_types) == 0:
+            raise RuntimeError(
+                "Cannot find a common backend data type! candidate_op_types: {}, op_t: {}.".format(
+                    candidate_op_types, op_t
+                )
+            )
+        if op_t in set(candidate_op_types):
+            op_t = candidate_op_types[0]
+        else:
+            op_t = data_t
+            candidate_op_types = backend_spec.get_candidate_op_types(op_t)
 
     sub_func_metadata = []
     for op in ops:
@@ -487,7 +526,8 @@ def _get_sub_func_metadata(
                 func_name, func_op_t, op._attrs["args"], op._attrs["outputs"]
             )
         )
-    return (sub_func_metadata, op_t)
+
+    return sub_func_metadata, op_t, use_fp32_acc
 
 
 def _is_jagged_shape(shape: List[IntVar]) -> bool:
@@ -777,8 +817,13 @@ def _parse_func_metadata(
     # larger tmp variable which is valid for selected op_type.
     op_type = backend_spec.get_elementwise_op_backend_type(max(input_alignments), dtype)
     data_type = backend_spec.dtype_to_backend_type(dtype)
-    sub_func_metadata, op_type = _get_sub_func_metadata(
-        ops, data_type, op_type, backend_spec
+    float32_type = backend_spec.dtype_to_backend_type("float32")
+    sub_func_metadata, op_type, use_fp32_acc = _get_sub_func_metadata(
+        ops,
+        data_type,
+        op_type,
+        backend_spec,
+        float32_type,
     )
     dynamic_dims = get_dynamic_dims(*[acc.original_shapes for acc in output_accessors])
 
@@ -799,6 +844,8 @@ def _parse_func_metadata(
         mixed_jagged_dense_indexing,
         output_volume,
         use_jagged_space_indexing,
+        use_fp32_acc,
+        float32_type,
     )
 
 
diff --git a/tests/unittest/ops/test_fused_elementwise.py b/tests/unittest/ops/test_fused_elementwise.py
index d7890fc89..994eaf28f 100644
--- a/tests/unittest/ops/test_fused_elementwise.py
+++ b/tests/unittest/ops/test_fused_elementwise.py
@@ -119,7 +119,15 @@ def _test_fused_elementwise_constructor(self, ait_dtype):
     def test_fused_elementwise_constructor(self, ait_dtype):
         self._test_fused_elementwise_constructor(ait_dtype)
 
-    def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype):
+    def _test_fused_elementwise_e2e(
+        self,
+        batch_sizes,
+        ms,
+        ks,
+        test_name,
+        ait_dtype,
+        use_fp32_acc=False,
+    ):
         torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]
         X1 = Tensor(
             shape=[
@@ -143,12 +151,12 @@ def _test_fused_elementwise_e2e(self, batch_sizes, ms, ks, test_name, ait_dtype)
         X4._attrs["name"] = "output0"
         X4._attrs["is_output"] = True
 
-        target = detect_target()
+        target = detect_target(elementwise_use_fp32_acc=use_fp32_acc)
         module = compile_model(
             X4,
             target,
             "./tmp",
-            "fused_elementwise_{}".format(test_name),
+            f"fused_elementwise_{test_name}_{use_fp32_acc}",
         )
 
         for batch_size in batch_sizes:
@@ -199,15 +207,21 @@ def test_fused_elementwise_e2e(self, ait_dtype):
             test_name=f"dynamic_k_{ait_dtype}",
             ait_dtype=ait_dtype,
         )
-        self._test_fused_elementwise_e2e(
-            batch_sizes=[700, 80, 1024],
-            ms=[23, 78, 256],
-            ks=[10, 30, 128],
-            test_name=f"dynamic_all_{ait_dtype}",
-            ait_dtype=ait_dtype,
-        )
-
-    def _test_fused_elementwise_kernel1(self, ait_dtype):
+        for use_fp32_acc in (False, True):
+            self._test_fused_elementwise_e2e(
+                batch_sizes=[700, 80, 1024],
+                ms=[23, 78, 256],
+                ks=[10, 30, 128],
+                test_name=f"dynamic_all_{ait_dtype}",
+                ait_dtype=ait_dtype,
+                use_fp32_acc=use_fp32_acc,
+            )
+
+    def _test_fused_elementwise_kernel1(
+        self,
+        ait_dtype,
+        use_fp32_acc=False,
+    ):
         BATCH_SIZE = 1024
         M = 1496
         X1 = Tensor(
@@ -237,9 +251,12 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
         X9._attrs["is_output"] = True
         X9._attrs["name"] = "output0"
 
-        target = detect_target()
+        target = detect_target(elementwise_use_fp32_acc=use_fp32_acc)
         module = compile_model(
-            X9, target, "./tmp", f"fused_elementwise_kernel1_{ait_dtype}"
+            X9,
+            target,
+            "./tmp",
+            f"fused_elementwise_kernel1_{ait_dtype}_{use_fp32_acc}",
         )
 
         x1_pt = get_random_torch_tensor((BATCH_SIZE, 2, M), ait_dtype)
@@ -261,7 +278,11 @@ def _test_fused_elementwise_kernel1(self, ait_dtype):
         )
     )
     def test_fused_elementwise_kernel1(self, ait_dtype):
-        self._test_fused_elementwise_kernel1(ait_dtype)
+        for use_fp32_acc in (False, True):
+            self._test_fused_elementwise_kernel1(
+                ait_dtype=ait_dtype,
+                use_fp32_acc=use_fp32_acc,
+            )
 
     def _test_sigmoid(self, input_size, test_name, ait_dtype, use_fast_math=True):
         torch_dtype = _AIT_DTYPE_TO_PYTORCH_DTYPE[ait_dtype]

From 02b04e40fc7de396cb25998aaa188fe235f35691 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 19 May 2023 04:42:34 -0700
Subject: [PATCH 531/638] Add SM90 CUTLASS 3.x kernels to
 gemm_rcr_bias_broadcast ops (#687)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/687

ATT. SM90 kernels are added to the following ops:

- `gemm_rcr_bias_add`
- `gemm_rcr_bias_add_relu`
- `gemm_rcr_bias_add_add`
- `gemm_rcr_bias_add_add_relu`
- `gemm_rcr_bias_mul`
- `gemm_rcr_bias_mul_add`
- `gemm_rcr_bias_mul_tanh`
- `gemm_rcr_bias_sigmoid_mul`
- `gemm_rcr_bias_sigmoid_mul_tanh`

The transposed problem with swapped A / B and column-major C / D is used for CUTLASS 3.x GEMM template and arguments, to accommodate the hard-coded column-major layout of the bias vector in the `EpilogueTensorBroadcast` (as row-major layout leads to a poor performance).

Reviewed By: chenyang78

Differential Revision: D45814653

fbshipit-source-id: e17819214f84e158bdb328237c6760726edda0f0
---
 .../gemm_universal/common_bias_broadcast.py   | 250 ++++++++++++++++--
 .../gemm_rcr_bias_elementwise.py              |  34 +--
 .../utils/mk_cutlass_lib/extra_enum.py        |   2 +
 .../unittest/ops/test_gemm_bias_broadcast.py  | 114 +++++++-
 4 files changed, 358 insertions(+), 42 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index b4c53f60e..e99ec324a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -60,6 +60,30 @@
 """
 )
 
+# we use the transposed problem with swapped A and B
+# operands and column-major C and D in CUTLASS 3.x
+EPILOGUE_TENSOR_BROADCAST_TEMPLATE = jinja2.Template(
+    """
+using {{epilogue_name}} =
+  cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
+    cutlass::epilogue::collective::EpilogueTensorBroadcast<
+      cutlass::gemm::TagToStrideC_t<cutlass::layout::LayoutTranspose<{{layout_c}}>::type>,
+      cutlass::gemm::TagToStrideC_t<cutlass::layout::LayoutTranspose<{{layout_d}}>::type>,
+      cutlass::epilogue::thread::LinearCombinationTensorBroadcast<
+        {{element_d}}, {{element_accumulator}}, {{element_compute}}, {{element_c}},
+        {{unary_op1}},
+        {{binary_op1}},
+{% if has_d1 %}
+        {{binary_op2}},
+{% else %}
+        cutlass::epilogue::thread::detail::NoOp,
+{% endif %}
+        {{unary_op2}}
+        >,
+      {{epilogue_schedule}}>>;
+"""
+)
+
 # For func codegen.
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
@@ -105,6 +129,37 @@
 """
 )
 
+# we use the transposed problem with swapped A and B
+# operands and column-major C and D in CUTLASS 3.x
+PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>({{layout.n}}),
+        static_cast<coord_t>({{layout.m}}),
+        static_cast<coord_t>({{layout.k}}),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,              // ElementA const* ptr_A
+    {input_b_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,              // ElementB const* ptr_B
+    {input_a_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        {cute::Int<1>{}, {{layout.stride_c}}, cute::Int<0>{}},   // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD* ptr_D
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias* ptr_Bias
+        ({{elem_output_type}}*)(d0_ptr),                         // ElementC* ptr_C0
+{% if has_d1 %}
+        ({{elem_output_type}}*)(d1_ptr),                         // ElementC* ptr_C1
+{% else %}
+        nullptr,
+{% endif %}
+    },                                                           // EpilogueArguments epilogue
+"""
+)
+
 # for profiler, no need to include TensorAccessor
 PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
@@ -139,14 +194,45 @@
     0,                                                       // int64_t batch_stride_Vector
     0,                                                       // int64_t batch_stride_Tensor
     {{layout.stride_a}},                                     // typename LayoutA::Stride::Index lda
-    {{layout.stride_b}},                                     // typename LayoutA::Stride::Index ldb
-    {{layout.stride_c}},                                     // typename LayoutA::Stride::Index ldc1
+    {{layout.stride_b}},                                     // typename LayoutB::Stride::Index ldb
+    {{layout.stride_c}},                                     // typename LayoutC::Stride::Index ldc1
 {% if has_d1 %}
-    {{layout.stride_c}},                                     // typename LayoutA::Stride::Index ldc2
+    {{layout.stride_c}},                                     // typename LayoutC::Stride::Index ldc2
 {% endif %}
-    output_stride,                                           // typename LayoutA::Stride::Index ldd
-    0,                                                       // typename LayoutA::Stride::Index ldr
-    0,                                                       // typename LayoutA::Stride::Index ldt
+    output_stride,                                           // typename LayoutC::Stride::Index ldd
+    0,                                                       // typename LayoutC::Stride::Index ldr
+    0,                                                       // typename LayoutC::Stride::Index ldt
+"""
+)
+
+# we use the transposed problem with swapped A and B
+# operands and column-major C and D in CUTLASS 3.x
+PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+    {
+        static_cast<coord_t>({{layout.n}}),
+        static_cast<coord_t>({{layout.m}}),
+        static_cast<coord_t>({{layout.k}}),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_A
+    { {{layout.stride_b}}, cute::Int<1>{}, cute::Int<0>{}},      // StrideB dA
+    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_B
+    { {{layout.stride_a}}, cute::Int<1>{}, cute::Int<0>{}},      // StrideA dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
+        {cute::Int<1>{}, {{layout.stride_c}}, cute::Int<0>{}},   // StrideC dC
+        ({{elem_output_type}}*)(c_ptr),                          // ElementD* ptr_D
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias* ptr_Bias
+        ({{elem_output_type}}*)(d0_ptr),                         // ElementC* ptr_C0
+{% if has_d1 %}
+        ({{elem_output_type}}*)(d1_ptr),                         // ElementC* ptr_C1
+{% else %}
+        nullptr,
+{% endif %}
+    },                                                           // EpilogueArguments epilogue
 """
 )
 
@@ -166,6 +252,14 @@
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/device_memory.h"
 
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp"
+#include "cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp"
+
 using bfloat16 = nv_bfloat16;
 
 #define CUTLASS_CHECK(status)                                                         \\
@@ -348,6 +442,82 @@ def _support_split_k(func_attrs):
     return func_attrs["split_k"] is not None
 
 
+def _replace_epilogue_cutlass_3x(
+    op_def,
+    unary_op1,
+    binary_op1,
+    binary_op2,
+    unary_op2,
+):
+    # example of the generated epilogue replaced by this function:
+    # ------------------------------------------------------------
+    # using cutlass3x_sm90_tensorop_s64x128x16gemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnt_align8_warpspecialized_pingpong_epi_tma_epilogue =
+    # typename cutlass::epilogue::collective::CollectiveBuilder<
+    #     cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+    #     cute::Shape<cute::_256, cute::_128, cute::_64>,
+    #     cute::Shape<cute::_2,cute::_1,cute::_1>,
+    #     cutlass::epilogue::collective::EpilogueTileAuto,
+    #     float, float,
+    #     cutlass::half_t, cutlass::layout::RowMajor, 8,
+    #     cutlass::half_t, cutlass::layout::RowMajor, 8,
+    #     cutlass::epilogue::TmaWarpSpecialized
+    # >::CollectiveOp;
+
+    CUTLASS_3X_EPILOGUE_NUM_LINES = 11
+
+    lines = op_def.split("\n")
+    stripped_lines = [line.strip() for line in lines]
+    epilogue_start, epilogue_lines = None, []
+    for i in range(len(stripped_lines)):
+        if stripped_lines[i].endswith("_epilogue ="):
+            epilogue_start = i
+            for j in range(i, len(stripped_lines)):
+                epilogue_lines.append(stripped_lines[j])
+                if stripped_lines[j].endswith("::CollectiveOp;"):
+                    break
+            break
+
+    if epilogue_start is None:
+        raise ValueError(
+            f"Generated epilogue not found in the CUTLASS 3.x op_def:\n\n{op_def}"
+        )
+    if len(epilogue_lines) != CUTLASS_3X_EPILOGUE_NUM_LINES:
+        raise ValueError(
+            "Generated CUTLASS 3.x epilogue must be 11 lines long, "
+            f"but got {CUTLASS_3X_EPILOGUE_NUM_LINES}:\n\n{op_def}"
+        )
+
+    epilogue_name = epilogue_lines[0].split(" ")[1]
+    element_c, layout_c = epilogue_lines[7].split(", ")[:2]
+    element_d, layout_d = epilogue_lines[8].split(", ")[:2]
+    element_accumulator, element_compute = epilogue_lines[6].split(",")[:2]
+    element_compute = element_compute.strip()
+    epilogue_schedule = epilogue_lines[9]
+
+    new_epilogue = EPILOGUE_TENSOR_BROADCAST_TEMPLATE.render(
+        epilogue_name=epilogue_name,
+        element_c=element_c,
+        layout_c=layout_c,
+        element_d=element_d,
+        layout_d=layout_d,
+        element_accumulator=element_accumulator,
+        element_compute=element_compute,
+        unary_op1=unary_op1,
+        binary_op1=binary_op1,
+        binary_op2=binary_op2,
+        unary_op2=unary_op2,
+        epilogue_schedule=epilogue_schedule,
+        has_d1=(binary_op2 is not None),
+    )
+
+    lines_before = lines[:epilogue_start]
+    lines_after = lines[epilogue_start + CUTLASS_3X_EPILOGUE_NUM_LINES :]
+    new_lines = lines_before + [new_epilogue] + lines_after
+    new_op_def = "\n".join(new_lines)
+
+    return new_op_def
+
+
 def gemm_bias_broadcast_instance(
     op_def,
     func_attrs,
@@ -363,6 +533,15 @@ def gemm_bias_broadcast_instance(
     """
     adjust gemm instance with respect to input_accessors, layout and epilogue ops
     """
+    if cutlass_3x:
+        return _replace_epilogue_cutlass_3x(
+            op_def=op_def,
+            unary_op1=unary_op1,
+            binary_op1=binary_op1,
+            binary_op2=binary_op2,
+            unary_op2=unary_op2,
+        )
+
     op_def = common.update_alignments_in_gemm_instance(op_def, func_attrs, for_profiler)
     gemm_universal_params = common.get_gemm_instance_template_params(op_def)
     epilogue_pattern = re.compile(r"\s*(cutlass::epilogue::thread::.*)\s*<")
@@ -409,7 +588,11 @@ def gemm_bias_broadcast_instance(
 
 
 def gemm_bias_broadcast_config(func_attrs, layout, dtype="float16"):
-    common.make_fproc(func_attrs, layout)
+    common.make_fproc(
+        func_attrs=func_attrs,
+        layout=layout,
+        include_cutlass_3x_ops=True,
+    )
 
 
 def gen_profiler(
@@ -423,6 +606,12 @@ def gen_profiler(
     binary_op2,
     unary_op2,
 ):
+    import cutlass_lib
+
+    op_type = func_attrs["op"]
+    op_instance = func_attrs["op_instance"]
+    op_instance = common.filter_cutlass_3x_ops(op_instance, func_attrs)
+
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][0]._attrs["dtype"]
@@ -433,9 +622,7 @@ def gen_profiler(
     elem_type = backend_spec.dtype_to_backend_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
-    op_type = func_attrs["op"]
     support_split_k = _support_split_k(func_attrs)
-    op_instance = func_attrs["op_instance"]
     has_d1 = common.has_d1(func_attrs)
 
     ndims = 2
@@ -458,6 +645,12 @@ def gen_profiler(
             layout=layout,
             has_d1=has_d1,
         ),
+        problem_args_cutlass_3x=PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+            elem_input_type=elem_input_type,
+            elem_output_type=elem_output_type,
+            layout=layout,
+            has_d1=has_d1,
+        ),
     )
     input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render(
         input_ndims=ndims,
@@ -482,11 +675,21 @@ def gen_profiler(
                 elem_type=elem_input_type,
             ),
         )
-        config_name = common.extract_config_name(config)
         instance_name = f"{instance_name_base}_{instance_idx}"
         gemm_op = f"gemm_op_{instance_idx}"
-        instance = common.INSTANCE_TEMPLATE.render(
-            config_name=config_name, name=instance_name, config=config
+        cutlass_3x = op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x
+        instance_template = (
+            common.INSTANCE_TEMPLATE_CUTLASS_3X
+            if cutlass_3x
+            else common.INSTANCE_TEMPLATE
+        )
+        instance = instance_template.render(
+            config_name=common.extract_config_name(
+                config,
+                cutlass_3x=cutlass_3x,
+            ),
+            name=instance_name,
+            config=config,
         )
         benchmark_instance = common.BENCHMARK_INSTANCE_TEMPLATE.render(
             indent="  ",
@@ -595,15 +798,22 @@ def gen_function(
         support_split_k=support_split_k,
         has_d1=has_d1,
     )
+    problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        layout=layout,
+        has_d1=has_d1,
+    )
     return common.gen_function(
-        func_attrs,
-        SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
-        input_ndims,
-        weight_ndims,
-        output_ndims,
-        dim_info_dict,
+        func_attrs=func_attrs,
+        src_template=SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
         f_instance_convertor=partial(
             gemm_bias_broadcast_instance,
             layout=layout,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
index 436d3101e..01d62de36 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_elementwise.py
@@ -59,15 +59,15 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
 def gen_profiler_template(unary_op1, binary_op1, binary_op2, unary_op2):
     def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         return common_bias_broadcast.gen_profiler(
-            func_attrs,
-            workdir,
-            profiler_filename,
-            dim_info_dict,
-            RCR,
-            unary_op1,
-            binary_op1,
-            binary_op2,
-            unary_op2,
+            func_attrs=func_attrs,
+            workdir=workdir,
+            profiler_filename=profiler_filename,
+            dim_info_dict=dim_info_dict,
+            layout=RCR,
+            unary_op1=unary_op1,
+            binary_op1=binary_op1,
+            binary_op2=binary_op2,
+            unary_op2=unary_op2,
         )
 
     return gen_profiler
@@ -80,14 +80,14 @@ def gen_function(
         dim_info_dict,
     ):
         return common_bias_broadcast.gen_function(
-            func_attrs,
-            exec_cond_template,
-            dim_info_dict,
-            RCR,
-            unary_op1,
-            binary_op1,
-            binary_op2,
-            unary_op2,
+            func_attrs=func_attrs,
+            exec_cond_template=exec_cond_template,
+            dim_info_dict=dim_info_dict,
+            layout=RCR,
+            unary_op1=unary_op1,
+            binary_op1=binary_op1,
+            binary_op2=binary_op2,
+            unary_op2=unary_op2,
         )
 
     return gen_function
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index ef46ad65d..0cab52d38 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -220,6 +220,7 @@ class EpilogueScheduleType(enum.Enum):
     EpilogueFunctor.LinearCombinationHardSwish: EpilogueScheduleType.TmaWarpSpecializedElementwiseHardSwish,
     EpilogueFunctor.LinearCombinationGELU: EpilogueScheduleType.TmaWarpSpecializedElementwiseGELU,
     EpilogueFunctor.LinearCombinationFastGELU: EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU,
+    EpilogueFunctor.LinearCombinationResidualBlock: EpilogueScheduleType.TmaWarpSpecialized,
   },
   EpilogueScheduleType.TmaWarpSpecializedCooperative: {
     EpilogueFunctor.LinearCombinationRelu: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu,
@@ -229,6 +230,7 @@ class EpilogueScheduleType(enum.Enum):
     EpilogueFunctor.LinearCombinationHardSwish: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseHardSwish,
     EpilogueFunctor.LinearCombinationGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU,
     EpilogueFunctor.LinearCombinationFastGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU,
+    EpilogueFunctor.LinearCombinationResidualBlock: EpilogueScheduleType.TmaWarpSpecializedCooperative,
   },
 }
 
diff --git a/tests/unittest/ops/test_gemm_bias_broadcast.py b/tests/unittest/ops/test_gemm_bias_broadcast.py
index eade402ae..b05a3f22e 100644
--- a/tests/unittest/ops/test_gemm_bias_broadcast.py
+++ b/tests/unittest/ops/test_gemm_bias_broadcast.py
@@ -20,6 +20,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
+    env_variables,
     filter_test_cases_by_test_env,
     get_random_torch_tensor,
     get_torch_empty_tensor,
@@ -317,9 +318,9 @@ def _test_bias_rcr_mul(
         m1,
         k,
         n,
-        use_fp16_acc=False,
         dtype="float16",
         test_name_suffix="",
+        use_fp16_acc=False,
     ):
         target = detect_target(use_fp16_acc=use_fp16_acc)
         self._init_tensors(m, k, n, m0, m1, dtype)
@@ -439,7 +440,15 @@ def test_bias_rcr_mul_tanh_rocm(self):
         name_func=custom_name_func_with_funcname,
     )
     def test_gemm_bias_broadcast_float32_sm80(self, func, m, m0, m1, k, n, dtype):
-        func(self, m, m0, m1, k, n, dtype)
+        func(
+            self,
+            m=m,
+            m0=m0,
+            m1=m1,
+            k=k,
+            n=n,
+            dtype=dtype,
+        )
 
     @parameterized.expand(
         [
@@ -456,14 +465,109 @@ def test_gemm_bias_broadcast_float32_sm80(self, func, m, m0, m1, k, n, dtype):
         name_func=custom_name_func_with_funcname,
     )
     def test_gemm_bias_broadcast_bfloat16_bf16(self, func, m, m0, m1, k, n, dtype):
-        func(self, m, m0, m1, k, n, dtype)
+        func(
+            self,
+            m=m,
+            m0=m0,
+            m1=m1,
+            k=k,
+            n=n,
+            dtype=dtype,
+        )
+
+    @parameterized.expand(
+        [
+            (_test_bias_rcr_mul_add, None, 2, 32, 256, 128),
+            (_test_bias_rcr_sigmoid_mul, None, 2, 32, 256, 128),
+            (_test_bias_rcr_sigmoid_mul_tanh, None, 2, 32, 256, 128),
+            (_test_bias_rcr_add, None, 2, 32, 256, 128),
+            (_test_bias_rcr_add_relu, None, 2, 32, 256, 128),
+            (_test_bias_rcr_add_add_relu, None, 2, 32, 256, 128),
+            (_test_bias_rcr_mul, None, 2, 32, 256, 128),
+            (_test_bias_rcr_add_add, None, 2, 32, 256, 128),
+            (_test_bias_rcr_mul_tanh, None, 2, 32, 256, 128),
+        ],
+        name_func=custom_name_func_with_funcname,
+    )
+    def test_gemm_bias_broadcast_sm90(self, func, m, m0, m1, k, n):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # input alignment < 8 not supported by SM90 kernels
+                # use alignment 4 to avoid auto-padding to 8
+                func(
+                    self,
+                    m=m,
+                    m0=m0,
+                    m1=m1,
+                    k=k - 4,
+                    n=n,
+                    dtype="float16",
+                    test_name_suffix="_wrong_input_alignment_sm90",
+                )
+
+            with self.assertRaisesRegex(
+                expected_exception=RuntimeError,
+                expected_regex="No GEMM op instances are left after filtering",
+            ):
+                # output alignment < 8 not supported by SM90 TMA epilogues
+                func(
+                    self,
+                    m=m,
+                    m0=m0,
+                    m1=m1,
+                    k=k,
+                    n=n - 1,
+                    dtype="float16",
+                    test_name_suffix="_wrong_output_alignment_sm90",
+                )
+
+            func(
+                self,
+                m=m,
+                m0=m0,
+                m1=m1,
+                k=k,
+                n=n,
+                dtype="float16",
+                test_name_suffix="_force_sm90",
+            )
+            func(
+                self,
+                m=m,
+                m0=m0,
+                m1=m1,
+                k=k,
+                n=n,
+                dtype="bfloat16",
+                test_name_suffix="_force_sm90",
+            )
 
     def test_gemm_bias_broadcast_use_fp16_acc_sm80(self):
         self._test_bias_rcr_mul(
-            None, 2, 32, 256, 128, use_fp16_acc=True, dtype="float32"
+            m=None,
+            m0=2,
+            m1=32,
+            k=256,
+            n=128,
+            dtype="float32",
+            test_name_suffix="_use_fp16_acc",
+            use_fp16_acc=True,
         )
         self._test_bias_rcr_mul(
-            None, 2, 32, 256, 128, use_fp16_acc=True, dtype="bfloat16"
+            m=None,
+            m0=2,
+            m1=32,
+            k=256,
+            n=128,
+            dtype="bfloat16",
+            test_name_suffix="_use_fp16_acc",
+            use_fp16_acc=True,
         )
 
 
From d7e79961aab697e3288f7bec8b5911cd80c99313 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Fri, 19 May 2023 10:18:56 -0700
Subject: [PATCH 532/638] Add SM90 CUTLASS 3.x kernels to perm102_bmm ops
 (#689)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/689

ATT. SM90 kernels are added to the following ops:

- `perm102_bmm_rcr`
- `perm102_bmm_rcr_bias`
- `perm102_bmm_rrr`
- `perm102_bmm_rrr_bias`

Reviewed By: chenyang78

Differential Revision: D45817288

fbshipit-source-id: 9096b0d920416bd1c5b4de377a831642f2fc6a18
---
 .../backend/cuda/gemm_universal/bmm_common.py |  48 +++---
 .../cuda/gemm_universal/bmm_xxx_add.py        |  21 +--
 .../cuda/gemm_universal/perm102_bmm_rcr.py    |  51 ++++--
 .../gemm_universal/perm102_bmm_rcr_bias.py    |  42 +++--
 .../cuda/gemm_universal/perm102_bmm_rrr.py    |  51 ++++--
 .../gemm_universal/perm102_bmm_rrr_bias.py    |  42 +++--
 tests/unittest/ops/test_perm102_bmm_rcr.py    | 139 +++++++++++++----
 tests/unittest/ops/test_perm102_bmm_rrr.py    | 145 ++++++++++++++----
 8 files changed, 388 insertions(+), 151 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 8995dc4f8..8dd0399bb 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -15,7 +15,7 @@
 """
 Common functions and templates for bmm-family ops
 """
-from dataclasses import dataclass
+import dataclasses
 
 import jinja2
 
@@ -157,7 +157,7 @@
 )
 
 
-@dataclass
+@dataclasses.dataclass
 class Bmm_problem_info:
     alpha_value: float = 1
     beta_value: float = 0
@@ -576,6 +576,30 @@ def gen_profiler(
     return common.build_profiler(file_pairs)
 
 
+def add_elem_types_to_mm_info(mm_info, func_attrs):
+    """
+    CUTLASS 3.x problem args require explicit I/O pointer types
+    (not void*). This function arugments the input and output
+    pointers in the mm_info with the appropriate elem_input_type
+    and elem_output_type casts.
+    """
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+
+    return dataclasses.replace(
+        mm_info,
+        a_ptr=f"({elem_input_type}*)({mm_info.a_ptr})",
+        b_ptr=f"({elem_input_type}*)({mm_info.b_ptr})",
+        bias_ptr=f"({elem_output_type}*)({mm_info.bias_ptr})",
+        c_ptr=f"({elem_output_type}*)({mm_info.c_ptr})",
+    )
+
+
 def default_gen_profiler(
     func_attrs,
     workdir,
@@ -603,23 +627,11 @@ def default_gen_profiler(
     problem_args = PROBLEM_ARGS_TEMPLATE.render(
         mm_info=default_mm_info,
     )
-
-    backend_spec = CUDASpec()
-    elem_input_type = backend_spec.dtype_to_lib_type(
-        func_attrs["inputs"][0]._attrs["dtype"]
-    )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
-
-    # CUTLASS 3.x problem args require explicit I/O pointer types (not void*)
-    default_mm_info.a_ptr = f"({elem_input_type}*)({default_mm_info.a_ptr})"
-    default_mm_info.b_ptr = f"({elem_input_type}*)({default_mm_info.b_ptr})"
-    default_mm_info.bias_ptr = f"({elem_output_type}*)({default_mm_info.bias_ptr})"
-    default_mm_info.c_ptr = f"({elem_output_type}*)({default_mm_info.c_ptr})"
-
     problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
-        mm_info=default_mm_info,
+        mm_info=add_elem_types_to_mm_info(
+            mm_info=default_mm_info,
+            func_attrs=func_attrs,
+        ),
     )
 
     return gen_profiler(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
index 350e170ed..a95edbc8f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_xxx_add.py
@@ -25,7 +25,6 @@
 
 
 from aitemplate.backend import registry
-from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.common import gemm_common
 from aitemplate.backend.cuda.gemm_universal import bmm_common, common
 from aitemplate.backend.cuda.gemm_universal.bmm_xxx import _get_problem_args, get_config
@@ -108,23 +107,11 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
             mm_info=default_mm_info,
         )
-
-        backend_spec = CUDASpec()
-        elem_input_type = backend_spec.dtype_to_lib_type(
-            func_attrs["inputs"][0]._attrs["dtype"]
-        )
-        elem_output_type = backend_spec.dtype_to_lib_type(
-            func_attrs["outputs"][0]._attrs["dtype"]
-        )
-
-        # CUTLASS 3.x problem args require explicit I/O pointer types (not void*)
-        default_mm_info.a_ptr = f"({elem_input_type}*)({default_mm_info.a_ptr})"
-        default_mm_info.b_ptr = f"({elem_input_type}*)({default_mm_info.b_ptr})"
-        default_mm_info.bias_ptr = f"({elem_output_type}*)({default_mm_info.bias_ptr})"
-        default_mm_info.c_ptr = f"({elem_output_type}*)({default_mm_info.c_ptr})"
-
         problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
-            mm_info=default_mm_info,
+            mm_info=bmm_common.add_elem_types_to_mm_info(
+                mm_info=default_mm_info,
+                func_attrs=func_attrs,
+            ),
         )
 
         return bmm_common.gen_profiler(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
index 63b1ef34b..1e0273c00 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py
@@ -34,6 +34,9 @@ def _get_default_problem_info(**kwargs):
         "ldb": "K",
         "ldbias": "N * B",
         "ldc": "N * B",
+        "a_row_major": True,
+        "b_row_major": False,
+        "c_row_major": True,
     }
     for k, v in kwargs.items():
         problem_args[k] = v
@@ -63,6 +66,9 @@ def _get_strided_problem_info(func_attrs):
         ldb="K",
         ldbias="output_stride",
         ldc="output_stride",
+        a_row_major=True,
+        b_row_major=False,
+        c_row_major=True,
     )
 
 
@@ -112,7 +118,10 @@ def fproc(op):
             epilogue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc)
+    func_attrs["op_instance"] = common.extract_config(
+        f_proc_op=fproc,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.perm102_bmm_rcr.gen_profiler")
@@ -127,15 +136,22 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=mm_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=mm_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common.SRC_TEMPLATE,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        args_parser=args_parser,
     )
 
 
@@ -151,14 +167,21 @@ def gen_function(
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=bmm_problem_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=bmm_problem_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-        "",  # input_addr_calculator
-        get_output_addr_calculator(func_attrs),
+        func_attrs=func_attrs,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        dim_info_dict=dim_info_dict,
+        input_addr_calculator="",
+        output_addr_calculator=get_output_addr_calculator(func_attrs),
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
index 99fd2e644..6634ff80f 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py
@@ -43,6 +43,9 @@ def _get_default_problem_info(**kwargs):
         "ldb": "K",
         "ldbias": "0",
         "ldc": "N * B",
+        "a_row_major": True,
+        "b_row_major": False,
+        "c_row_major": True,
     }
     for k, v in kwargs.items():
         problem_args[k] = v
@@ -73,6 +76,9 @@ def _get_strided_problem_info(func_attrs):
         ldb="K",
         ldbias="0",
         ldc="output_stride",
+        a_row_major=True,
+        b_row_major=False,
+        c_row_major=True,
     )
 
 
@@ -93,15 +99,22 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=mm_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=mm_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common_bias.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        args_parser=args_parser,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
@@ -118,16 +131,23 @@ def gen_function(
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=bmm_problem_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=bmm_problem_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
 
     return common.gen_function(
-        func_attrs,
-        common_bias.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
         input_ndims=input_ndims,
         weight_ndims=weight_ndims,
         output_ndims=output_ndims,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
index 8fb2fb8f2..354a4392b 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py
@@ -37,6 +37,9 @@ def _get_default_problem_info(**kwargs):
         "ldb": "N",
         "ldbias": "N * B",
         "ldc": "N * B",
+        "a_row_major": True,
+        "b_row_major": True,
+        "c_row_major": True,
     }
     for k, v in kwargs.items():
         problem_args[k] = v
@@ -66,6 +69,9 @@ def _get_strided_problem_info(func_attrs):
         ldb="N",
         ldbias="output_stride",
         ldc="output_stride",
+        a_row_major=True,
+        b_row_major=True,
+        c_row_major=True,
     )
 
 
@@ -83,7 +89,10 @@ def fproc(op):
             epilogue_name=func_attrs["epilogue"],
         )
 
-    func_attrs["op_instance"] = common.extract_config(fproc)
+    func_attrs["op_instance"] = common.extract_config(
+        f_proc_op=fproc,
+        include_cutlass_3x_ops=True,
+    )
 
 
 @registry.reg("cuda.perm102_bmm_rrr.gen_profiler")
@@ -98,15 +107,22 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=mm_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=mm_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common.SRC_TEMPLATE,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        args_parser=args_parser,
     )
 
 
@@ -122,14 +138,21 @@ def gen_function(
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=bmm_problem_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=bmm_problem_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_function(
-        func_attrs,
-        exec_cond_template,
-        problem_args,
-        dim_info_dict,
-        "",  # input_addr_calculator
-        get_output_addr_calculator(func_attrs),
+        func_attrs=func_attrs,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        dim_info_dict=dim_info_dict,
+        input_addr_calculator="",
+        output_addr_calculator=get_output_addr_calculator(func_attrs),
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
index 718867ade..de73d4880 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py
@@ -43,6 +43,9 @@ def _get_default_problem_info(**kwargs):
         "ldb": "N",
         "ldbias": "0",
         "ldc": "N * B",
+        "a_row_major": True,
+        "b_row_major": True,
+        "c_row_major": True,
     }
     for k, v in kwargs.items():
         problem_args[k] = v
@@ -72,6 +75,9 @@ def _get_strided_problem_info(func_attrs):
         ldb="N",
         ldbias="0",
         ldc="output_stride",
+        a_row_major=True,
+        b_row_major=True,
+        c_row_major=True,
     )
 
 
@@ -92,15 +98,22 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=mm_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=mm_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     return bmm_common.gen_profiler(
-        func_attrs,
-        workdir,
-        profiler_filename,
-        dim_info_dict,
-        common_bias.SRC_TEMPLATE,
-        problem_args,
-        args_parser,
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
+        args_parser=args_parser,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
     )
 
@@ -117,16 +130,23 @@ def gen_function(
     problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(
         mm_info=bmm_problem_info,
     )
+    problem_args_cutlass_3x = bmm_common.PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
+        mm_info=bmm_common.add_elem_types_to_mm_info(
+            mm_info=bmm_problem_info,
+            func_attrs=func_attrs,
+        ),
+    )
 
     input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
     weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
     output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
 
     return common.gen_function(
-        func_attrs,
-        common_bias.SRC_TEMPLATE,
-        exec_cond_template,
-        problem_args,
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        problem_args_cutlass_3x=problem_args_cutlass_3x,
         input_ndims=input_ndims,
         weight_ndims=weight_ndims,
         output_ndims=output_ndims,
diff --git a/tests/unittest/ops/test_perm102_bmm_rcr.py b/tests/unittest/ops/test_perm102_bmm_rcr.py
index 3797fcffc..39c306704 100644
--- a/tests/unittest/ops/test_perm102_bmm_rcr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rcr.py
@@ -29,28 +29,23 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
-    TestEnv,
 )
-from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm102BMM_RCR_TestCase(unittest.TestCase):
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-            }
-        )
-    )
-    def test_perm102_bmm_rrr(self, dtype):
-        B = 25
-        M = 128
-        K = 256
-        N = 100
+class Perm102BMMRCRTestCase(unittest.TestCase):
+    def _test_perm102_bmm_rcr(
+        self,
+        B=25,
+        M=128,
+        K=256,
+        N=100,
+        dtype="float16",
+        test_name="perm102_bmm_rcr",
+    ):
         target = detect_target()
         X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
@@ -58,7 +53,7 @@ def test_perm102_bmm_rrr(self, dtype):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"perm102_bmm_rcr_{dtype}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
         X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
         W_pt = get_random_torch_tensor(shape=(B, N, K), dtype=dtype)
@@ -71,22 +66,57 @@ def test_perm102_bmm_rrr(self, dtype):
 
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
+    def test_perm102_bmm_rcr_fp16(self):
+        self._test_perm102_bmm_rcr(
+            dtype="float16",
+            test_name="perm102_bmm_rcr_fp16",
+        )
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm102BMM_RCR_BiasTestCase(unittest.TestCase):
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-            }
+    def test_perm102_bmm_rcr_fp32_sm80(self):
+        self._test_perm102_bmm_rcr(
+            dtype="float32",
+            test_name="perm102_bmm_rcr_fp32",
+        )
+
+    def test_perm102_bmm_rcr_bf16(self):
+        self._test_perm102_bmm_rcr(
+            dtype="bfloat16",
+            test_name="perm102_bmm_rcr_bf16",
         )
-    )
-    def test_perm102_bmm_rrr_bias(self, dtype):
-        B = 25
-        M = 128
-        K = 256
-        N = 100
+
+    def test_perm102_bmm_rcr_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            self._test_perm102_bmm_rcr(
+                K=256,
+                dtype="float16",
+                test_name="perm102_bmm_rcr_fp16_force_sm90",
+            )
+            self._test_perm102_bmm_rcr(
+                K=256,
+                dtype="float32",
+                test_name="perm102_bmm_rcr_fp32_force_sm90",
+            )
+            self._test_perm102_bmm_rcr(
+                K=256,
+                dtype="bfloat16",
+                test_name="perm102_bmm_rcr_bf16_force_sm90",
+            )
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class Perm102BMMRCRBiasTestCase(unittest.TestCase):
+    def _test_perm102_bmm_rcr_bias(
+        self,
+        B=25,
+        M=128,
+        N=100,
+        K=256,
+        dtype="float16",
+        test_name="perm102_bmm_rcr_bias",
+    ):
         target = detect_target()
         X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, N, K], dtype=dtype, name="input_1", is_input=True)
@@ -95,7 +125,7 @@ def test_perm102_bmm_rrr_bias(self, dtype):
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", f"perm102_bmm_rcr_bias_{dtype}")
+        module = compile_model(Y, target, "./tmp", test_name)
 
         X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
         W_pt = get_random_torch_tensor(shape=(B, N, K), dtype=dtype)
@@ -113,6 +143,49 @@ def test_perm102_bmm_rrr_bias(self, dtype):
 
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
+    def test_perm102_bmm_rcr_bias_fp16(self):
+        self._test_perm102_bmm_rcr_bias(
+            dtype="float16",
+            test_name="perm102_bmm_rcr_bias_fp16",
+        )
+
+    def test_perm102_bmm_rcr_bias_fp32_sm80(self):
+        self._test_perm102_bmm_rcr_bias(
+            dtype="float32",
+            test_name="perm102_bmm_rcr_bias_fp32",
+        )
+
+    def test_perm102_bmm_rcr_bias_bf16(self):
+        self._test_perm102_bmm_rcr_bias(
+            dtype="bfloat16",
+            test_name="perm102_bmm_rcr_bias_bf16",
+        )
+
+    def test_perm102_bmm_rcr_bias_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            self._test_perm102_bmm_rcr_bias(
+                K=256,
+                dtype="float16",
+                test_name="perm102_bmm_rcr_bias_fp16_force_sm90",
+            )
+            self._test_perm102_bmm_rcr_bias(
+                K=256,
+                dtype="float32",
+                test_name="perm102_bmm_rcr_bias_fp32_force_sm90",
+            )
+            self._test_perm102_bmm_rcr_bias(
+                K=256,
+                dtype="bfloat16",
+                test_name="perm102_bmm_rcr_bias_bf16_force_sm90",
+            )
+
+
+filter_test_cases_by_test_env(Perm102BMMRCRTestCase)
+filter_test_cases_by_test_env(Perm102BMMRCRBiasTestCase)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_perm102_bmm_rrr.py b/tests/unittest/ops/test_perm102_bmm_rrr.py
index a466d29f9..e8851b56c 100644
--- a/tests/unittest/ops/test_perm102_bmm_rrr.py
+++ b/tests/unittest/ops/test_perm102_bmm_rrr.py
@@ -29,28 +29,23 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.testing.test_utils import (
-    filter_test_cases_by_params,
+    env_variables,
+    filter_test_cases_by_test_env,
     get_random_torch_tensor,
-    TestEnv,
 )
-from parameterized import parameterized
 
 
 @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm102BMMTestCase(unittest.TestCase):
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-            }
-        )
-    )
-    def test_perm102_bmm_rrr(self, dtype="float16"):
-        B = 25
-        M = 128
-        K = 256
-        N = 100
+class Perm102BMMRRRTestCase(unittest.TestCase):
+    def _test_perm102_bmm_rrr(
+        self,
+        B=25,
+        M=128,
+        N=100,
+        K=256,
+        dtype="float16",
+        test_name="perm102_bmm_rrr",
+    ):
         target = detect_target()
         X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
@@ -58,7 +53,7 @@ def test_perm102_bmm_rrr(self, dtype="float16"):
         Y = OP(X, W)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr")
+        module = compile_model(Y, target, "./tmp", test_name)
 
         X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
         W_pt = get_random_torch_tensor(shape=(B, K, N), dtype=dtype)
@@ -71,22 +66,60 @@ def test_perm102_bmm_rrr(self, dtype="float16"):
 
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
+    def test_perm102_bmm_rrr_fp16(self):
+        self._test_perm102_bmm_rrr(
+            dtype="float16",
+            test_name="perm102_bmm_rrr_fp16",
+        )
 
-@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-class Perm102BMMBiasTestCase(unittest.TestCase):
-    @parameterized.expand(
-        **filter_test_cases_by_params(
-            {
-                TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
-                TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
-            }
+    def test_perm102_bmm_rrr_fp32_sm80(self):
+        self._test_perm102_bmm_rrr(
+            dtype="float32",
+            test_name="perm102_bmm_rrr_fp32",
+        )
+
+    def test_perm102_bmm_rrr_bf16(self):
+        self._test_perm102_bmm_rrr(
+            dtype="bfloat16",
+            test_name="perm102_bmm_rrr_bf16",
         )
-    )
-    def test_perm102_bmm_rrr_bias(self, dtype="float16"):
-        B = 25
-        M = 128
-        K = 256
-        N = 100
+
+    def test_perm102_bmm_rrr_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            self._test_perm102_bmm_rrr(
+                N=64,
+                K=256,
+                dtype="float16",
+                test_name="perm102_bmm_rrr_fp16_force_sm90",
+            )
+            self._test_perm102_bmm_rrr(
+                N=64,
+                K=256,
+                dtype="float32",
+                test_name="perm102_bmm_rrr_fp32_force_sm90",
+            )
+            self._test_perm102_bmm_rrr(
+                N=64,
+                K=256,
+                dtype="bfloat16",
+                test_name="perm102_bmm_rrr_bf16_force_sm90",
+            )
+
+
+@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+class Perm102BMMRRRBiasTestCase(unittest.TestCase):
+    def _test_perm102_bmm_rrr_bias(
+        self,
+        B=25,
+        M=128,
+        K=256,
+        N=100,
+        dtype="float16",
+        test_name="perm102_bmm_rrr_bias",
+    ):
         target = detect_target()
         X = Tensor(shape=[M, B, K], dtype=dtype, name="input_0", is_input=True)
         W = Tensor(shape=[B, K, N], dtype=dtype, name="input_1", is_input=True)
@@ -95,7 +128,7 @@ def test_perm102_bmm_rrr_bias(self, dtype="float16"):
         Y = OP(X, W, BIAS)
         Y._attrs["name"] = "output_0"
         Y._attrs["is_output"] = True
-        module = compile_model(Y, target, "./tmp", "perm102_bmm_rrr_bias")
+        module = compile_model(Y, target, "./tmp", test_name)
 
         X_pt = get_random_torch_tensor(shape=(M, B, K), dtype=dtype)
         W_pt = get_random_torch_tensor(shape=(B, K, N), dtype=dtype)
@@ -113,6 +146,52 @@ def test_perm102_bmm_rrr_bias(self, dtype="float16"):
 
         torch.testing.assert_close(Y_pt, y, atol=1e-1, rtol=1e-1)
 
+    def test_perm102_bmm_rrr_bias_fp16(self):
+        self._test_perm102_bmm_rrr_bias(
+            dtype="float16",
+            test_name="perm102_bmm_rrr_bias_fp16",
+        )
+
+    def test_perm102_bmm_rrr_bias_fp32_sm80(self):
+        self._test_perm102_bmm_rrr_bias(
+            dtype="float32",
+            test_name="perm102_bmm_rrr_bias_fp32",
+        )
+
+    def test_perm102_bmm_rrr_bias_bf16(self):
+        self._test_perm102_bmm_rrr_bias(
+            dtype="bfloat16",
+            test_name="perm102_bmm_rrr_bias_bf16",
+        )
+
+    def test_perm102_bmm_rrr_bias_sm90(self):
+        with env_variables(
+            AIT_FORCE_CUTLASS_SM90_KERNELS="1",
+            INSIDE_RE_WORKER="1",
+        ):
+            self._test_perm102_bmm_rrr_bias(
+                N=64,
+                K=256,
+                dtype="float16",
+                test_name="perm102_bmm_rrr_bias_fp16_force_sm90",
+            )
+            self._test_perm102_bmm_rrr_bias(
+                N=64,
+                K=256,
+                dtype="float32",
+                test_name="perm102_bmm_rrr_bias_fp32_force_sm90",
+            )
+            self._test_perm102_bmm_rrr_bias(
+                N=64,
+                K=256,
+                dtype="bfloat16",
+                test_name="perm102_bmm_rrr_bias_bf16_force_sm90",
+            )
+
+
+filter_test_cases_by_test_env(Perm102BMMRRRTestCase)
+filter_test_cases_by_test_env(Perm102BMMRRRBiasTestCase)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 0b8561a6770990c9b58d0ad6fe479c1a41d453d4 Mon Sep 17 00:00:00 2001
From: Henry Hu <hhh@meta.com>
Date: Fri, 19 May 2023 17:09:14 -0700
Subject: [PATCH 533/638] Add support for keyword argument in AITModule (#707)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/707

The original submodule could be supplied with a keyword argument. Add support for keyword arguments in the forward signature.

Reviewed By: wushirong

Differential Revision: D46013324

fbshipit-source-id: f4cde443b980f360118a6531d5ced96af9dbaa5a
---
 fx2ait/fx2ait/ait_module.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
index 8cf48b842..53e3a84c5 100644
--- a/fx2ait/fx2ait/ait_module.py
+++ b/fx2ait/fx2ait/ait_module.py
@@ -27,16 +27,20 @@ def __init__(
         self.engine = engine
         self.interp_result = interp_result
 
-    def forward(self, *inputs):
+    def forward(self, *inputs, **kwargs):
         python_inputs = []
         if self.interp_result:
             inputs = list(inputs)
             for name, inp in zip(self.interp_result.fx_input_names, inputs):
                 if name in self.interp_result.input_names:
                     python_inputs.append(inp)
+            for name in self.interp_result.input_names:
+                if name in kwargs:
+                    python_inputs.append(kwargs[name])
             assert len(python_inputs) == len(self.interp_result.input_names)
         else:
-            python_inputs = inputs
+            python_inputs = list(inputs)
+            python_inputs.extend(kwargs.values())
 
         outputs = self.engine.forward(python_inputs)
         if len(outputs) == 1:

From 9c266010325d208e4db33fffa1503861d4015801 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sat, 20 May 2023 23:35:16 -0700
Subject: [PATCH 534/638] adjust launch config for group_layernorm kernels
 (#708)

Summary:
For group_layernorm kernels, the grid config for launching our CUDA kernels is set to be grid(num_of_groups, m), where m = batch_dim * ... * dim_at_norm_ndim. Thus, it's very common that m may be larger than the maximum value for gridDim.y, i.e. 65535.

Since the number of groups is very small, it's better for us to make grid(m, num_of_groups), where the upper-bound for m becomes 2^31 - 1, i.e. the maximum value for gridDim.x.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/708

Reviewed By: wushirong

Differential Revision: D46015211

Pulled By: chenyang78

fbshipit-source-id: 07d33c673a081c0bdb88c805099e181cad0fab57
---
 .../layernorm_sigmoid_mul_kernel.cuh          | 26 +++++++++----------
 .../ops/test_layernorm_sigmoid_mul.py         | 21 +++++++++++++++
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
index 7e677c305..a29179ea8 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
@@ -1557,14 +1557,14 @@ struct Arguments {
 // input  b * [m, n] row-major
 // gamma b * [n]
 // beta  b * [n]
-// grid [b, m]
+// grid [m, b]
 // block [block_size] -- each thread deals with 4 elements
 // block_size = n / 4
 template <bool FuseSigmoidMul, int NumInputs>
 __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
     const Arguments<half4, float, NumInputs>& args) {
-  const int b_idx = blockIdx.x;
-  const int m_idx = blockIdx.y;
+  const int m_idx = blockIdx.x;
+  const int b_idx = blockIdx.y;
   const int tid = threadIdx.x;
   __shared__ float s_mean, s_variance;
   float local_sums[1] = {0.0f};
@@ -1713,14 +1713,14 @@ __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
 // input  b * [m, n] row-major
 // gamma b * [n]
 // beta  b * [n]
-// grid [b, m]
+// grid [m, b]
 // block [block_size] -- each thread deals with 4 elements
 // block_size = n / 4
 template <bool FuseSigmoidMul, int NumInputs>
 __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
     const Arguments<bfloat16_4, float, NumInputs>& args) {
-  const int b_idx = blockIdx.x;
-  const int m_idx = blockIdx.y;
+  const int m_idx = blockIdx.x;
+  const int b_idx = blockIdx.y;
   const int tid = threadIdx.x;
   __shared__ float s_mean, s_variance;
   float local_sums[1] = {0.0f};
@@ -1915,14 +1915,14 @@ __global__ void group_layernorm_sigmoid_mul_stored_locally_bfloat16(
 // input  b * [m, n] row-major
 // gamma b * [n]
 // beta  b * [n]
-// grid [b, m]
+// grid [m, b]
 // block [block_size] -- each thread deals with 1 element
 // block_size = n
 template <typename T, typename T_ACC, bool FuseSigmoidMul, int NumInputs>
 __device__ void group_layernorm_sigmoid_mul_stored_locally_impl(
     const Arguments<T, T_ACC, NumInputs>& args) {
-  const int b_idx = blockIdx.x;
-  const int m_idx = blockIdx.y;
+  const int m_idx = blockIdx.x;
+  const int b_idx = blockIdx.y;
   const int tid = threadIdx.x;
   __shared__ float s_mean, s_variance;
 
@@ -2028,14 +2028,14 @@ __global__ void group_layernorm_sigmoid_mul_stored_locally(
 // input  b * [m, n] row-major
 // gamma b * [n]
 // beta  b * [n]
-// grid [b, m]
+// grid [m, b]
 // block [block_size] -- each thread deals with n / block_size element
 // block_size = 512
 template <typename T, typename T_ACC, bool FuseSigmoidMul, int NumInputs>
 __device__ void group_layernorm_sigmoid_mul_impl(
     Arguments<T, T_ACC, NumInputs> args) {
-  const int b_idx = blockIdx.x;
-  const int m_idx = blockIdx.y;
+  const int m_idx = blockIdx.x;
+  const int b_idx = blockIdx.y;
   const int tid = threadIdx.x;
   __shared__ float s_mean, s_variance;
 
@@ -2142,7 +2142,7 @@ cudaError_t invokeGroupLayernormSigmoidMul(
     return cudaSuccess;
   }
 
-  dim3 grid(b, m);
+  dim3 grid(m, b);
   // TODO: implement float4 group kernel
   if (std::is_same<T, half>::value && n_is_multiple_of_4 && (min_n >= 128) &&
       (max_n <= 4096)) {
diff --git a/tests/unittest/ops/test_layernorm_sigmoid_mul.py b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
index 4d8b621f0..0d41ff55d 100644
--- a/tests/unittest/ops/test_layernorm_sigmoid_mul.py
+++ b/tests/unittest/ops/test_layernorm_sigmoid_mul.py
@@ -901,6 +901,16 @@ def test_group_fused_layernorm_sigmoid_mul(self, dtype: str):
             use_size_op=True,
             dtype=dtype,
         )
+        self._test_group_fused_layernorm_sigmoid_mul(
+            [
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+            ],
+            dtype=dtype,
+        )
 
         # Make sure we test the boundary between being able to fit the arguments in constant memory vs not.
         for num_groups in range(38, 41):
@@ -1028,6 +1038,17 @@ def test_group_layernorm(self, dtype: str):
             fuse_sigmoid_mul=False,
             dtype=dtype,
         )
+        self._test_group_fused_layernorm_sigmoid_mul(
+            [
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+                [330, 200, 48],
+            ],
+            fuse_sigmoid_mul=False,
+            dtype=dtype,
+        )
         self._test_group_fused_layernorm_sigmoid_mul(
             [[1024, 64], [1024, 256], [1024, 125]],
             gamma_is_none=True,

From 46a81f2626d04557798c4ea0884b5b9be14d8c4a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Sat, 20 May 2023 23:48:39 -0700
Subject: [PATCH 535/638] remove identity ops from the graph (#703)

Summary:
This PR implements a simple pass that removes identity ops from the graph

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/703

Reviewed By: wushirong

Differential Revision: D45979732

Pulled By: chenyang78

fbshipit-source-id: aaea314a34963b50f3597e580a1fe56fd1818df4
---
 .../aitemplate/compiler/transform/__init__.py |   1 +
 .../compiler/transform/optimize_graph.py      |   2 +
 .../compiler/transform/remove_id_ops.py       |  41 ++
 tests/unittest/compiler/test_remove_id_ops.py | 355 ++++++++++++++++++
 tests/unittest/ops/test_identity.py           |   2 +-
 5 files changed, 400 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/compiler/transform/remove_id_ops.py
 create mode 100644 tests/unittest/compiler/test_remove_id_ops.py

diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index ff080c52f..c630c88bd 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -36,6 +36,7 @@
 from aitemplate.compiler.transform.optimize_graph import optimize_graph
 from aitemplate.compiler.transform.profile import profile
 from aitemplate.compiler.transform.refine_graph import refine_graph
+from aitemplate.compiler.transform.remove_id_ops import remove_id_ops
 from aitemplate.compiler.transform.remove_no_ops import remove_no_ops
 from aitemplate.compiler.transform.remove_unused_ops import remove_unused_ops
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index b5c99163b..a3b8a5c60 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -37,6 +37,7 @@
     fuse_permute_bmm_and_gemm,
 )
 from aitemplate.compiler.transform.move_view_ops import move_view_op_before_concat
+from aitemplate.compiler.transform.remove_id_ops import remove_id_ops
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
 from aitemplate.compiler.transform.split_large_slice_scatter_ops import (
     split_large_slice_scatter_ops,
@@ -88,6 +89,7 @@ def optimize_graph(
     """
 
     funcs = [
+        remove_id_ops,
         dedup_make_jagged_ops,
         fuse_permute_bmm_and_gemm,
         fuse_bmm_permute,
diff --git a/python/aitemplate/compiler/transform/remove_id_ops.py b/python/aitemplate/compiler/transform/remove_id_ops.py
new file mode 100644
index 000000000..6b9057e67
--- /dev/null
+++ b/python/aitemplate/compiler/transform/remove_id_ops.py
@@ -0,0 +1,41 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Remove id ops from a sorted_graph.
+"""
+from typing import List
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.transform import transform_utils
+
+
+def remove_id_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
+    """Remove id ops from the input sorted_graph."""
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) != 1:
+            continue
+        src_op = list(src_ops)[0]
+        if src_op._attrs["op"] != "identity":
+            continue
+        id_op = src_op
+        input_tensor = id_op._attrs["inputs"][0]
+        # skip a very special case where id takes an input and produces an output
+        if tensor._attrs["is_output"] and input_tensor._attrs["is_input"]:
+            continue
+        transform_utils.remove_single_tensor_op_from_sorted_graph(id_op)
+
+    sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/tests/unittest/compiler/test_remove_id_ops.py b/tests/unittest/compiler/test_remove_id_ops.py
new file mode 100644
index 000000000..02c7bb02e
--- /dev/null
+++ b/tests/unittest/compiler/test_remove_id_ops.py
@@ -0,0 +1,355 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import IntImm
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    gen_input_tensor,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+class RemoveIdOpsTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(RemoveIdOpsTestCase, self).__init__(*args, **kwargs)
+        self._test_id = 0
+        self.BATCH_SIZE = 1024
+
+    def test_remove_id_simple(
+        self,
+        test_name="remove_id_simple",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # x1 = tensor
+        # add_0 = add(x0, x0)
+        # id_1 = id(add_0)
+        # y = add(x1, id_1)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype=dtype)
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        id_1 = ops.identity()(add_0)
+        Y = ops.elementwise(FuncEnum.ADD)(X1, id_1)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            add_0_pt = x0_pt + x0_pt
+            id_1_pt = add_0_pt
+            y_pt = x1_pt + id_1_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_remove_id_simple_2(
+        self,
+        test_name="remove_id_simple_2",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # x1 = tensor
+        # add_0 = add(x0, x0)
+        # id_1 = id(add_0)
+        # id_2 = id(x1)
+        # y = add(id_1, id_2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype=dtype)
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        id_1 = ops.identity()(add_0)
+        id_2 = ops.identity()(X1)
+        Y = ops.elementwise(FuncEnum.ADD)(id_1, id_2)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            add_0_pt = x0_pt + x0_pt
+            id_1_pt = add_0_pt
+            id_2_pt = x1_pt
+            y_pt = id_1_pt + id_2_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_remove_consecutive_ids_1(
+        self,
+        test_name="remove_consecutive_ids_1",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # x1 = tensor
+        # add_0 = add(x0, x0)
+        # id_1 = id(add_0)
+        # id_2 = id(id_1)
+        # id_3 = id(id_2)
+        # id_4 = id(id_1)
+        # add_1 = add(id_3, id_4)
+        # add_2 = add(id_1, id_4)
+        # y = add(add_1, add_2)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype=dtype)
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X0)
+        id_1 = ops.identity()(add_0)
+        id_2 = ops.identity()(id_1)
+        id_3 = ops.identity()(id_2)
+        id_4 = ops.identity()(id_1)
+        add_1 = ops.elementwise(FuncEnum.ADD)(id_3, id_4)
+        id_5 = ops.identity()(X1)
+        add_2 = ops.elementwise(FuncEnum.ADD)(id_5, id_2)
+        Y = ops.elementwise(FuncEnum.ADD)(add_1, add_2)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            add_0_pt = x0_pt + x0_pt
+            id_1_pt = add_0_pt
+            id_2_pt = id_1_pt
+            id_3_pt = id_2_pt
+            id_4_pt = id_1_pt
+            add_1_pt = id_3_pt + id_4_pt
+            id_5_pt = x1_pt
+            add_2_pt = id_5_pt + id_2_pt
+            y_pt = add_1_pt + add_2_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_remove_consecutive_ids_2(
+        self,
+        test_name="remove_consecutive_ids_2",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # x1 = tensor
+        # add_0 = add(x0, x1)
+        # id_1 = id(add_0)
+        # id_2 = id(id_1)
+        # id_3 = id(x1)
+        # id_4 = id(id_3)
+        # add_1 = add(id_2, id_4)
+        # id_5 = id(add_1)
+        # y = id(id_5)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype=dtype)
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        id_1 = ops.identity()(add_0)
+        id_2 = ops.identity()(id_1)
+        id_3 = ops.identity()(X1)
+        id_4 = ops.identity()(id_3)
+        add_1 = ops.elementwise(FuncEnum.ADD)(id_2, id_4)
+        id_5 = ops.identity()(add_1)
+        Y = ops.identity()(id_5)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "fused_elementwise")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            add_0_pt = x0_pt + x1_pt
+            id_1_pt = add_0_pt
+            id_2_pt = id_1_pt
+            id_3_pt = x1_pt
+            id_4_pt = id_3_pt
+            add_1_pt = id_2_pt + id_4_pt
+            id_5_pt = add_1_pt
+            y_pt = id_5_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_non_removable_id(
+        self,
+        test_name="non_removable_id",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # y = id(x0)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        Y = ops.identity()(X0)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "identity")
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            y_pt = x0_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+    def test_non_removable_id_2(
+        self,
+        test_name="non_removable_id_2",
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor
+        # x1 = tensor
+        # add_0 = add(x0, x1)
+        # id_1 = id(add_0)
+        # y0 = id(id_1)
+        # y1 = id(x0)
+        # y2 = add(y0, y1)
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X0 = gen_input_tensor([batch_dim, IntImm(M)], name="x0", dtype=dtype)
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype=dtype)
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        id_1 = ops.identity()(add_0)
+        Y0 = ops.identity()(id_1)
+        Y0._attrs["name"] = "output_0"
+        Y0._attrs["is_output"] = True
+        Y1 = ops.identity()(X0)
+        Y1._attrs["name"] = "output_1"
+        Y1._attrs["is_output"] = True
+        Y2 = ops.elementwise(FuncEnum.ADD)(Y0, Y1)
+        Y2._attrs["name"] = "output_2"
+        Y2._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self._test_id}.so"
+        module = compile_model(
+            [Y0, Y1, Y2], target, "./tmp", test_name, dll_name=dll_name
+        )
+        self._test_id += 1
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 3)
+        self.assertEqual(sorted_ops[0]._attrs["op"], "identity")
+        id_cnt = 0
+        add_cnt = 0
+        for sorted_op in sorted_ops:
+            if sorted_op._attrs["op"] == "identity":
+                id_cnt += 1
+            elif sorted_op._attrs["op"] == "fused_elementwise":
+                add_cnt += 1
+        self.assertEqual(id_cnt, 1)
+        self.assertEqual(add_cnt, 2)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M], dtype)
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            add_0_pt = x0_pt + x1_pt
+            id_1_pt = add_0_pt
+            y0_pt = id_1_pt
+            y1_pt = x0_pt
+            y2_pt = y0_pt + y1_pt
+
+            y0 = get_torch_empty_tensor(y0_pt.size(), dtype)
+            y1 = get_torch_empty_tensor(y1_pt.size(), dtype)
+            y2 = get_torch_empty_tensor(y2_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y0, y1, y2])
+            torch.testing.assert_close(y0_pt, y0, atol=0.01, rtol=0.01)
+            torch.testing.assert_close(y1_pt, y1, atol=0.01, rtol=0.01)
+            torch.testing.assert_close(y2_pt, y2, atol=0.01, rtol=0.01)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/ops/test_identity.py b/tests/unittest/ops/test_identity.py
index a690750d7..52277a2bb 100644
--- a/tests/unittest/ops/test_identity.py
+++ b/tests/unittest/ops/test_identity.py
@@ -50,7 +50,7 @@ def _test_identity(
 
         target = detect_target()
         module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
-        self.assertEqual(len(module.debug_sorted_graph), 3 if elementwise else 2)
+        self.assertEqual(len(module.debug_sorted_graph), 2)
         self._test_id += 1
 
         x_pt = get_random_torch_tensor(shape, dtype=dtype)

From 1fb2b33f67a51a0d9417f5caea14241f5fa3a787 Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@meta.com>
Date: Sun, 21 May 2023 17:37:30 -0700
Subject: [PATCH 536/638] fx2ait: explicitly ensure workdir's existence (#714)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/714

Right now the only thing that prevent a non-existent workdir from breaking the ait system, is an indirect mkdir call to create the `test_dir`, which is `workdir/ait_name`.

We shouldn't have just replied on that. We should have explicitly make sure workdir exist in the first place.

Reviewed By: amateurcoffee

Differential Revision: D46036294

fbshipit-source-id: 6317a646b1c8c4c1271af8b6a24e2d5e6125259f
---
 python/aitemplate/compiler/compiler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 38d27d73b..a912d7aa5 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -207,6 +207,7 @@ def compile_model(
 
     recompile = os.getenv("AIT_RECOMPILE", "1")
     graph = None
+    os.makedirs(workdir, exist_ok=True)  # explicitly ensure workdir exists
     # Super important: we cannot have commas in the test name.
     # We want to add a -Iworkdir/test_name flag to nvcc, but
     # if the name has a comma in it, it will be parsed as two

From 39455d8679fedfd01c57f88a59fd254a1be7ce67 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Mon, 22 May 2023 12:29:03 -0700
Subject: [PATCH 537/638] enable bmm_rrr for concat + concat fusion (#716)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/716

Reviewed By: wushirong

Differential Revision: D46065150

Pulled By: chenyang78

fbshipit-source-id: 55d680dee45868ed5ff0d39b029c65a10bbbae81
---
 .../transform/transform_memory_ops.py         |   2 +-
 tests/unittest/compiler/test_move_view_ops.py | 156 +++++++++++++++++-
 2 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/transform/transform_memory_ops.py b/python/aitemplate/compiler/transform/transform_memory_ops.py
index c11944fe8..0a42f8d36 100644
--- a/python/aitemplate/compiler/transform/transform_memory_ops.py
+++ b/python/aitemplate/compiler/transform/transform_memory_ops.py
@@ -133,7 +133,7 @@ def _is_supported_dst_op_for_first_cat(
     def _supported_op_type(op_type):
         if op_type in supported_strided_ops:
             return True
-        return op_type.startswith("bmm_crr")
+        return op_type.startswith(("bmm_crr", "bmm_rrr"))
 
     dst_op_type = dst_op._attrs["op"]
     if _supported_op_type(dst_op_type):
diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
index 48dda1c3d..c2b96c270 100644
--- a/tests/unittest/compiler/test_move_view_ops.py
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -1443,7 +1443,6 @@ def test_move_strided_reshape_cat_8(self):
             test_name="test_move_strided_reshape_cat_8",
             dtype="float16",
         )
-        return
         self._test_move_strided_reshape_cat_8(
             M0=4,
             M1=4,
@@ -1616,6 +1615,161 @@ def test_move_strided_reshape_cat_9(self):
             dtype="float16",
         )
 
+    def _test_move_strided_reshape_cat_10(
+        self, M0, M1, M2, M3, N, test_name, dtype="float16"
+    ):
+        # make a graph like below:
+        # add_0 = add(x0, x1)  # 2d
+        # concat_1 = concatenate(add_0, x2) # 2d
+        # reshape_2 = reshape(concat_1) # 3d
+        # bmm_rrr_add_3 = bmm_rrr_add(reshape_2, x4, x5) # 3d
+        # concat_4 = concatenate(x3, reshape_2, x3) # 3d
+        # reshape_5 = reshape(concat_4) # 2d
+        # add_6 = add(reshape_5, x6) # 2d
+        # concat_7 = concatenate(x0, reshape_5, x0)
+        # reshape_8 = reshape(bmm_rrr_add_3) # 2d
+        # reduce_9 = reduce_sum(reshape_8)
+        # reduce_10 = reduce_sum(add_6)
+        # reduce_11 = reduce_sum(concat_7)
+        # add_12 = add(reduce_9, reduce_10)
+        # y = add(add_12, reduce_11)
+        assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        batch_sizes = [1, self.BATCH_SIZE]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(
+            shape=[batch_dim, IntImm(M0 * N)],
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        X1 = Tensor(
+            shape=[batch_dim, IntImm(M1 * N)],
+            dtype=dtype,
+            name="x1",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=[batch_dim, IntImm(M2 * N)],
+            dtype=dtype,
+            name="x2",
+            is_input=True,
+        )
+        X3 = Tensor(
+            shape=[batch_dim, IntImm(M3), IntImm(N)],
+            dtype=dtype,
+            name="x3",
+            is_input=True,
+        )
+        M4 = M0 + M2
+        X4 = Tensor(
+            shape=[IntImm(M4), IntImm(N)],
+            dtype=dtype,
+            name="x4",
+            is_input=True,
+        )
+        X5 = Tensor(
+            shape=[IntImm(N)],
+            dtype=dtype,
+            name="x5",
+            is_input=True,
+        )
+        cat_dim = 1
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
+        bmm_K = M0 + M2
+        reshape_2 = ops.reshape()(concat_1, [-1, bmm_K, N])
+        # bmm_rrr_add_3[batch, N, N] = bmm_rrr_add(
+        #     reshape_2[batch, bmm_K, N], X4[bmm_K, N], X5[N]
+        # )
+        bmm_rrr_add_3 = ops.bmm_rrr_add()(reshape_2, X4, X5)
+        concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)  # 3d
+        reshape_to_shape_5 = (
+            sum([t.shape()[cat_dim].value() for t in [X3, reshape_2, X3]]) * N
+        )
+        reshape_5 = ops.reshape()(concat_4, [-1, reshape_to_shape_5])  # 2d
+        X6 = Tensor(
+            shape=[batch_dim, IntImm(reshape_to_shape_5)],
+            dtype=dtype,
+            name="x6",
+            is_input=True,
+        )
+        add_6 = ops.elementwise(FuncEnum.ADD)(reshape_5, X6)
+        concat_7 = ops.concatenate()([X0, reshape_5, X0], dim=cat_dim)  # 2d
+        reshape_8 = ops.reshape()(bmm_rrr_add_3, [-1, N * N])  # 2d
+        reduce_dim = cat_dim
+        reduce_9 = ops.reduce_sum(reduce_dim)(reshape_8)
+        reduce_10 = ops.reduce_sum(reduce_dim)(add_6)
+        reduce_11 = ops.reduce_sum(reduce_dim)(concat_7)
+        add_12 = ops.elementwise(FuncEnum.ADD)(reduce_9, reduce_10)
+        Y = ops.elementwise(FuncEnum.ADD)(add_12, reduce_11)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+
+        # Gen module.
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        # dynamic_slice + bmm cannot be fused because we can't generate
+        # any valid strided access
+        self.assertEqual(len(sorted_ops), 9)
+        concat_cnt = 0
+        for sorted_op in sorted_ops:
+            op_type = sorted_op._attrs["op"]
+            if op_type == "concatenate":
+                concat_cnt += 1
+        self.assertEqual(concat_cnt, 1)
+
+        for batch in [1, self.BATCH_SIZE]:
+            x0_pt = get_random_torch_tensor([batch, M0 * N], dtype)
+            x1_pt = get_random_torch_tensor([batch, M1 * N], dtype)
+            x2_pt = get_random_torch_tensor([batch, M2 * N], dtype)
+            x3_pt = get_random_torch_tensor([batch, M3, N], dtype)
+            x4_pt = get_random_torch_tensor([M4, N], dtype)
+            x5_pt = get_random_torch_tensor([N], dtype)
+            x6_pt = get_random_torch_tensor([batch, reshape_to_shape_5], dtype)
+            add_0_pt = x0_pt + x1_pt
+            concat_1_pt = torch.cat([add_0_pt, x2_pt], dim=cat_dim)
+            reshape_2_pt = torch.reshape(concat_1_pt, [-1, bmm_K, N])
+            bmm_rrr_add_3_pt = torch.matmul(reshape_2_pt, x4_pt) + x5_pt
+            concat_4_pt = torch.cat([x3_pt, reshape_2_pt, x3_pt], dim=cat_dim)
+            reshape_5_pt = torch.reshape(concat_4_pt, [-1, reshape_to_shape_5])
+            add_6_pt = reshape_5_pt + x6_pt
+            concat_7_pt = torch.cat([x0_pt, reshape_5_pt, x0_pt], dim=cat_dim)
+            reshape_8_pt = torch.reshape(bmm_rrr_add_3_pt, [-1, N * N])
+            reduce_9_pt = torch.sum(reshape_8_pt, reduce_dim)
+            reduce_10_pt = torch.sum(add_6_pt, reduce_dim)
+            reduce_11_pt = torch.sum(concat_7_pt, reduce_dim)
+            add_12_pt = reduce_9_pt + reduce_10_pt
+            y_pt = add_12_pt + reduce_11_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {
+                "x0": x0_pt,
+                "x1": x1_pt,
+                "x2": x2_pt,
+                "x3": x3_pt,
+                "x4": x4_pt,
+                "x5": x5_pt,
+                "x6": x6_pt,
+            }
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_move_strided_reshape_cat_10(self):
+        self._test_move_strided_reshape_cat_9(
+            M0=4,
+            M1=4,
+            M2=6,
+            M3=4,
+            M7=8,
+            N=4,
+            test_name="test_move_strided_reshape_cat_10",
+            dtype="float16",
+        )
+
     def _test_move_strided_reshape_cat_multi_dsts(
         self, M0, M1, M2, M3, N, test_name, dtype="float16"
     ):

From fdb0a98483801007bd1e350681d97035cf8d9897 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 23 May 2023 11:12:20 +0800
Subject: [PATCH 538/638] support dynamic batch size

---
 .../rocm/attention/mem_eff_attention.py       | 26 +++++----
 python/aitemplate/backend/rocm/gemm/common.py | 21 ++++++-
 .../ops/gemm_universal/gemm_common.py         | 23 ++++++--
 .../transform/transform_strided_ops.py        |  1 -
 python/aitemplate/frontend/nn/attention.py    | 58 +++++++++----------
 5 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
index d8c636684..078b4e0c7 100644
--- a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
@@ -60,7 +60,6 @@
 using AccDataType      = F32;
 using CShuffleDataType = F32;
 using CDataType        = InputType;
-using GemmDataType     = InputType;
 using Acc0BiasDataType = ck::Tuple<>;
 using Acc1BiasDataType = ck::Tuple<>;
 
@@ -157,7 +156,6 @@
 
 {{func_signature}}
 {
-
     bool input_permute = true;
     bool output_permute = true;
     
@@ -224,11 +222,15 @@
                                  {},   // acc1_biases_gs_ms_os_lengths
                                  {}}); // acc1_biases_gs_ms_os_strides
 
-        auto offset = i * K * G1 * M * sizeof(InputType);
-        q_ptrs.push_back(reinterpret_cast<const void*>(q_ptr + offset));                               
-        k_ptrs.push_back(reinterpret_cast<const void*>(k_ptr + offset));                               
-        v_ptrs.push_back(reinterpret_cast<const void*>(v_ptr + offset));                               
-        output_ptrs.push_back(reinterpret_cast<void*>(output_ptr + offset));                               
+        auto offset = K * G1 * M * sizeof(InputType);
+        q_ptrs.push_back(reinterpret_cast<const void*>(q_ptr)); 
+        q_ptr += offset;                              
+        k_ptrs.push_back(reinterpret_cast<const void*>(k_ptr));   
+        k_ptr += offset;                            
+        v_ptrs.push_back(reinterpret_cast<const void*>(v_ptr));
+        v_ptr += offset;                               
+        output_ptrs.push_back(reinterpret_cast<void*>(output_ptr)); 
+        output_ptr += offset;                              
     }
 
     // do GEMM
@@ -269,6 +271,7 @@
                    const void* k,
                    const void* v,
                    const int* seqlens,
+                   const int max_seqlen,
                    int64_t batch_size,
                    int num_heads,
                    int head_dim,
@@ -288,7 +291,7 @@
     """
 {{indent}}{{func_name}}(
 {{indent}}   {{output}}, {{q}}, {{k}}, {{v}}, {{seqlens}},
-{{indent}}    {{batch_size}},
+{{indent}}    {{max_seqlen}}, {{batch_size}},
 {{indent}}    {{num_heads}},
 {{indent}}    {{head_dim}},
 {{indent}}    {{softmax_scale}},
@@ -339,9 +342,9 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
 
     q = func_attrs["inputs"][0]
 
-    batch_size = q.shape()[0]._attrs["name"]
-
-    num_heads = q._attrs["shape"][2]._attrs["values"][0]
+    batch_size = func_attrs["inputs"][3].shape()[0]._attrs["name"]
+    num_heads = q._attrs["shape"][1]._attrs["values"][0]
+    max_seqlen = q._attrs["shape"][0].upper_bound() // 16
     head_dim = q._attrs["shape"][3]._attrs["values"][0]
     
     softmax_scale = head_dim ** (-0.5)
@@ -353,6 +356,7 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
         k=k_name,
         v=v_name,
         seqlens=seqlens_name,
+        max_seqlen=max_seqlen,
         batch_size=batch_size,
         num_heads=num_heads,
         head_dim=head_dim,
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index a74a42d05..7b5c2427a 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -24,6 +24,7 @@
 
 from aitemplate.backend.common import gemm_common
 from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntVar
 
 INPUT_ADDR_CALCULATOR = jinja2.Template(
     """
@@ -646,6 +647,7 @@ def gen_profiler(
     file_pairs = []
     has_d0_flag = has_d0(func_attrs)
     has_d1_flag = has_d1(func_attrs)
+
     for op_name, op in op_instance.items():
         config = emit_instance(op)
         config_name = extract_config_name(config)
@@ -812,6 +814,13 @@ def gen_function(
             problem_args=problem_args,
             is_profiler=False,
         )
+        has_dynamic_shape = False
+        for inp in func_attrs["inputs"]:
+            for dim in inp.shape():
+                if isinstance(dim, IntVar):
+                    has_dynamic_shape = True
+        if has_dynamic_shape:
+            key = "true"
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     extra_header = extra_header_template.render(
@@ -991,5 +1000,15 @@ def fproc_f16(op):
             b_layout=b_layout,
             c_layout=c_layout,
         )
-
+    has_dynamic_shape = False
+    for inp in func_attrs["inputs"]:
+        for dim in inp.shape():
+            if isinstance(dim, IntVar):
+                has_dynamic_shape = True
     func_attrs["op_instance"] = extract_config(op_kind, extra_kind, fproc_f16)
+    if has_dynamic_shape:
+        filtered_op_instance = {}
+        for op_name, op in func_attrs["op_instance"].items():
+            if "Padding" in emit_instance(op):
+                filtered_op_instance[op_name] = op
+        func_attrs["op_instance"] = filtered_op_instance
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index afe550661..3c86af37e 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -49,7 +49,6 @@
 )
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.utils import alignment, environ
-from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
@@ -868,7 +867,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
 
 
 def _profiler_results_groupby_key(instance):
-    if detect_target().name() == "rocm":
+    if backend.target.Target.current().name() == "rocm":
         return (
             instance[1]["op"],  # unique op name
             instance[3],  # profiler key (gemm shape)
@@ -921,6 +920,7 @@ def postprocess_results(self):
             self._instances,
             key=_profiler_results_groupby_key,
         ):
+            group = list(group)
             min_runtime_results = min(group, key=_profiler_group_reduce_min_key)
             (
                 (best_algo, runtime, workspace),
@@ -929,9 +929,22 @@ def postprocess_results(self):
                 exec_key,
                 split_k,
             ) = min_runtime_results
-            func_attrs["exec_path"][exec_key].algo = best_algo
-            func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
-            func_attrs["split_k"] = split_k
+            if target.name() == "rocm":
+                for results in group:
+                    (
+                        (_, _, _),
+                        func_attrs,
+                        _,
+                        _,
+                        _,
+                    ) = results
+                    func_attrs["exec_path"][exec_key].algo = best_algo
+                    func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
+                    func_attrs["split_k"] = split_k
+            else:
+                func_attrs["exec_path"][exec_key].algo = best_algo
+                func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
+                func_attrs["split_k"] = split_k
 
             _LOGGER.info(
                 f"Profiler ({profiler_filename} {exec_key}) selected kernel: "
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 614da10bd..d1778ad4a 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -490,5 +490,4 @@ def transform_strided_ops(
         ]
     for func in funcs:
         sorted_graph = func(sorted_graph)
-        graph_utils.dump_graph_debug_str_to_file(sorted_graph, workdir, func.__name__)
     return sorted_graph
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index 57048111c..e78f2644f 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -16,6 +16,7 @@
 Frontend for attention module
 """
 from aitemplate.compiler import ops
+from aitemplate.compiler.base import IntVar
 from aitemplate.compiler.ops import flash_attention
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
@@ -251,7 +252,7 @@ def attention(self, x):
                 )
             return out
 
-    def forward(self, *args):
+    def forward(self, *args, seqlens=None):
         """forward pass for calling mha module"""
         assert len(args) >= 1
         x = args[0]
@@ -357,6 +358,7 @@ def attention(self, q, k, v, seqlens=None):
         query = self.query(q)
         key = self.key(k)
         value = self.value(v)
+        
         if detect_target().name() == "cuda":
             query = ops.permute()(
                 ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
@@ -368,40 +370,37 @@ def attention(self, q, k, v, seqlens=None):
                 ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
                 [0, 2, 1, 3],
             )
-        elif seqlens is None:
-            query = ops.reshape()(query, [batch, -1, self.num_heads, head_dim])
-            query = ops.transpose()(query, 1, 2)
-            query = ops.reshape()(query, [-1, query.shape()[2], head_dim])
-            key = ops.reshape()(key, [batch, -1, self.num_heads, head_dim])
-            key = ops.transpose()(key, 1, 2)
-            key = ops.reshape()(key, [-1, key.shape()[2], head_dim])
-            value = ops.reshape()(value, [batch, -1, self.num_heads, head_dim])
-            value = ops.transpose()(value, 1, 2)
-            value = ops.reshape()(value, [-1, value.shape()[2], head_dim])  
-            OP = ops.bmm_softmax_bmm_permute(
-                shape=(self.num_heads,),
-                scale=head_dim**-0.5,
-                causal=self.causal,
-            )
-            return OP(query, key, value)
-        else:
-            query = ops.reshape()(query, [batch, -1, self.num_heads, head_dim])
-            key = ops.reshape()(key, [batch, -1, self.num_heads, head_dim])
-            value = ops.reshape()(value, [batch, -1, self.num_heads, head_dim])
-
-        return self.op(query, key, value, seqlens)
+            return self.op(query, key, value)
+        elif seqlens:
+            query = ops.reshape()(query, [batch, self.num_heads, -1, head_dim])
+            key = ops.reshape()(key, [batch, self.num_heads, -1, head_dim])
+            value = ops.reshape()(value, [batch, self.num_heads, -1, head_dim])
+            return self.op(query, key, value, seqlens)
+
+        query = ops.reshape()(query, [batch, -1, self.num_heads, head_dim])
+        query = ops.transpose()(query, 1, 2)
+        query = ops.reshape()(query, [-1, query.shape()[2], head_dim])
+        key = ops.reshape()(key, [batch, -1, self.num_heads, head_dim])
+        key = ops.transpose()(key, 1, 2)
+        key = ops.reshape()(key, [-1, key.shape()[2], head_dim])
+        value = ops.reshape()(value, [batch, -1, self.num_heads, head_dim])
+        value = ops.transpose()(value, 1, 2)
+        value = ops.reshape()(value, [-1, value.shape()[2], head_dim])
+        OP = ops.bmm_softmax_bmm_permute(
+            shape=(self.num_heads,),
+            scale=head_dim**-0.5,
+            causal=self.causal,
+        )
+        return OP(query, key, value)  
 
     def forward(self, *args, seqlens=None):
         """forward pass for calling mha module"""
         assert len(args) >= 3
         x = args[0]
         batch = x.shape()[0]
-        if detect_target().name() == "cuda":
-            attn_output = self.attention(args[0], args[1], args[2])
-        else:
-            attn_output = self.attention(args[0], args[1], args[2], seqlens)
+        attn_output = self.attention(args[0], args[1], args[2], seqlens=seqlens)
                 
-        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
+        attn_output = ops.reshape()(attn_output, [-1, self.dim])
 
         if self.has_residual:
             assert len(args) == 4
@@ -409,7 +408,8 @@ def forward(self, *args, seqlens=None):
         else:
             x = self.proj(attn_output)
         x = self.proj_drop(x)
-        x = ops.reshape()(x, [batch, -1, self.dim])
+        if not isinstance(batch, IntVar):
+            x = ops.reshape()(x, [batch, -1, self.dim])
         return x
 
 
From f86fda0c1a37cc2f324dfe196e489d43b76e0f72 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 23 May 2023 14:23:26 +0800
Subject: [PATCH 539/638] fix a bug

---
 .../ops/gemm_universal/gemm_common.py         | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 7016bc662..5c9de28ed 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -49,7 +49,6 @@
 )
 from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.utils import alignment, environ
-from aitemplate.testing import detect_target
 
 # pylint: disable=C0103,R1711,W0102,W0221,E1120
 
@@ -857,7 +856,7 @@ def __call__(self, a: Tensor, b: Tensor) -> Tensor:
 
 
 def _profiler_results_groupby_key(instance):
-    if detect_target().name() == "rocm":
+    if backend.target.Target.current().name() == "rocm":
         return (
             instance[1]["op"],  # unique op name
             instance[3],  # profiler key (gemm shape)
@@ -910,6 +909,7 @@ def postprocess_results(self):
             self._instances,
             key=_profiler_results_groupby_key,
         ):
+            group = list(group)
             min_runtime_results = min(group, key=_profiler_group_reduce_min_key)
             (
                 (best_algo, runtime, workspace),
@@ -918,9 +918,22 @@ def postprocess_results(self):
                 exec_key,
                 split_k,
             ) = min_runtime_results
-            func_attrs["exec_path"][exec_key].algo = best_algo
-            func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
-            func_attrs["split_k"] = split_k
+            if target.name() == "rocm":
+                for results in group:
+                    (
+                        (_, _, _),
+                        func_attrs,
+                        _,
+                        _,
+                        _,
+                    ) = results
+                    func_attrs["exec_path"][exec_key].algo = best_algo
+                    func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
+                    func_attrs["split_k"] = split_k
+            else:
+                func_attrs["exec_path"][exec_key].algo = best_algo
+                func_attrs["workspace"] = max(func_attrs["workspace"], workspace)
+                func_attrs["split_k"] = split_k
 
             _LOGGER.info(
                 f"Profiler ({profiler_filename} {exec_key}) selected kernel: "

From b300446915c151614871552380026e89686c2d19 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 23 May 2023 01:00:52 -0700
Subject: [PATCH 540/638] Sync CUTLASS version with upstream (#706)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/706

ATT.

Reviewed By: chenyang78

Differential Revision: D46000568

fbshipit-source-id: 8dcdb01c2cce456bb71cce91439ed63288a27dd4
---
 3rdparty/cutlass                              |  2 +-
 python/aitemplate/compiler/ops/conv/conv2d.py | 24 ++++++-------
 python/aitemplate/compiler/ops/conv/conv3d.py | 27 +++++++-------
 .../ops/gemm_universal/gemm_common.py         | 36 ++++++++++++-------
 4 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 5328b493f..8ca363e59 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 5328b493f7ed2c20109f8d53bcc2ed12f9ac457e
+Subproject commit 8ca363e59ef734fdeab3f5b7b85cc37804628040
diff --git a/python/aitemplate/compiler/ops/conv/conv2d.py b/python/aitemplate/compiler/ops/conv/conv2d.py
index 23a999ab2..ab9c6a0c9 100644
--- a/python/aitemplate/compiler/ops/conv/conv2d.py
+++ b/python/aitemplate/compiler/ops/conv/conv2d.py
@@ -398,10 +398,10 @@ def _should_build_profiler(self) -> bool:
                     1 if self._attrs["split_k"] is None else self._attrs["split_k"]
                 )
                 query = ConvQueryEntry(
-                    dtype_a=tmp_op.A.element.value,
-                    dtype_b=tmp_op.B.element.value,
-                    dtype_c=tmp_op.C.element.value,
-                    dtype_acc=tmp_op.accumulator_type().value,
+                    dtype_a=tmp_op.A.element.value - 1,
+                    dtype_b=tmp_op.B.element.value - 1,
+                    dtype_c=tmp_op.C.element.value - 1,
+                    dtype_acc=tmp_op.accumulator_type().value - 1,
                     major_a=tmp_op.A.layout.value,
                     major_b=tmp_op.B.layout.value,
                     major_c=tmp_op.C.layout.value,
@@ -497,10 +497,10 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
         exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
         split_k = 1 if self._attrs["split_k"] is None else self._attrs["split_k"]
         query = ConvQueryEntry(
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.accumulator_type().value,
+            dtype_a=tmp_op.A.element.value - 1,
+            dtype_b=tmp_op.B.element.value - 1,
+            dtype_c=tmp_op.C.element.value - 1,
+            dtype_acc=tmp_op.accumulator_type().value - 1,
             major_a=tmp_op.A.layout.value,
             major_b=tmp_op.B.layout.value,
             major_c=tmp_op.C.layout.value,
@@ -553,10 +553,10 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
         cache_record = ConvRecordEntry(
             exec_entry=exec_key,
             exec_entry_sha1=exec_entry_sha1,
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.accumulator_type().value,
+            dtype_a=tmp_op.A.element.value - 1,
+            dtype_b=tmp_op.B.element.value - 1,
+            dtype_c=tmp_op.C.element.value - 1,
+            dtype_acc=tmp_op.accumulator_type().value - 1,
             major_a=tmp_op.A.layout.value,
             major_b=tmp_op.B.layout.value,
             major_c=tmp_op.C.layout.value,
diff --git a/python/aitemplate/compiler/ops/conv/conv3d.py b/python/aitemplate/compiler/ops/conv/conv3d.py
index 6c5301673..a998d7f77 100644
--- a/python/aitemplate/compiler/ops/conv/conv3d.py
+++ b/python/aitemplate/compiler/ops/conv/conv3d.py
@@ -365,10 +365,11 @@ def _should_build_profiler(self) -> bool:
                     1 if self._attrs["split_k"] is None else self._attrs["split_k"]
                 )
                 query = Conv3dQueryEntry(
-                    dtype_a=tmp_op.A.element.value,
-                    dtype_b=tmp_op.B.element.value,
-                    dtype_c=tmp_op.C.element.value,
-                    dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+                    dtype_a=tmp_op.A.element.value - 1,
+                    dtype_b=tmp_op.B.element.value - 1,
+                    dtype_c=tmp_op.C.element.value - 1,
+                    dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value
+                    - 1,
                     major_a=tmp_op.A.layout.value,
                     major_b=tmp_op.B.layout.value,
                     major_c=tmp_op.C.layout.value,
@@ -477,10 +478,11 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
         exec_entry_sha1 = sha1(exec_key.encode("utf-8")).hexdigest()
         split_k = 1 if self._attrs["split_k"] is None else self._attrs["split_k"]
         query = Conv3dQueryEntry(
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            dtype_a=tmp_op.A.element.value - 1,
+            dtype_b=tmp_op.B.element.value - 1,
+            dtype_c=tmp_op.C.element.value - 1,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value
+            - 1,
             major_a=tmp_op.A.layout.value,
             major_b=tmp_op.B.layout.value,
             major_c=tmp_op.C.layout.value,
@@ -541,10 +543,11 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices, force_cac
         cache_record = Conv3dRecordEntry(
             exec_entry=exec_key,
             exec_entry_sha1=exec_entry_sha1,
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value,
+            dtype_a=tmp_op.A.element.value - 1,
+            dtype_b=tmp_op.B.element.value - 1,
+            dtype_c=tmp_op.C.element.value - 1,
+            dtype_acc=tmp_op.tile_description.math_instruction.element_accumulator.value
+            - 1,
             major_a=tmp_op.A.layout.value,
             major_b=tmp_op.B.layout.value,
             major_c=tmp_op.C.layout.value,
diff --git a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
index 992281a6a..c51aa22ba 100644
--- a/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
+++ b/python/aitemplate/compiler/ops/gemm_universal/gemm_common.py
@@ -440,10 +440,14 @@ def _should_build_profiler(
             for wkl in workloads:
                 exec_entry_sha1 = sha1(wkl.encode("utf-8")).hexdigest()
                 query = GemmQueryEntry(
-                    dtype_a=tmp_op.A.element.value,
-                    dtype_b=tmp_op.B.element.value,
-                    dtype_c=tmp_op.C.element.value,
-                    dtype_acc=tmp_op.accumulator_type().value,
+                    # 1 is subtracted from the type enum values for consistency with the existing
+                    # cache databases; due to the "void" type being added to the DataType enum as
+                    # the very first enum member (and shifting the values of other enum members) in
+                    # https://github.com/NVIDIA/cutlass/commit/7c04f954151f606e60608061e891785fba229ae2
+                    dtype_a=tmp_op.A.element.value - 1,
+                    dtype_b=tmp_op.B.element.value - 1,
+                    dtype_c=tmp_op.C.element.value - 1,
+                    dtype_acc=tmp_op.accumulator_type().value - 1,
                     major_a=tmp_op.A.layout.value,
                     major_b=tmp_op.B.layout.value,
                     major_c=tmp_op.C.layout.value,
@@ -633,10 +637,14 @@ def _profile_single_workload(
         # have a cache entry for the problem size before gen_profiler, we will
         # setup exec_path correctly in gen_profiler, so we won't get here at all.
         query = GemmQueryEntry(
-            dtype_a=tmp_op.A.element.value,
-            dtype_b=tmp_op.B.element.value,
-            dtype_c=tmp_op.C.element.value,
-            dtype_acc=tmp_op.accumulator_type().value,
+            # 1 is subtracted from the type enum values for consistency with the existing
+            # cache databases; due to the "void" type being added to the DataType enum as
+            # the very first enum member (and shifting the values of other enum members) in
+            # https://github.com/NVIDIA/cutlass/commit/7c04f954151f606e60608061e891785fba229ae2
+            dtype_a=tmp_op.A.element.value - 1,
+            dtype_b=tmp_op.B.element.value - 1,
+            dtype_c=tmp_op.C.element.value - 1,
+            dtype_acc=tmp_op.accumulator_type().value - 1,
             major_a=tmp_op.A.layout.value,
             major_b=tmp_op.B.layout.value,
             major_c=tmp_op.C.layout.value,
@@ -903,10 +911,14 @@ def postprocess_results(self):
             cache_record = GemmRecordEntry(
                 exec_entry=exec_key,
                 exec_entry_sha1=exec_entry_sha1,
-                dtype_a=tmp_op.A.element.value,
-                dtype_b=tmp_op.B.element.value,
-                dtype_c=tmp_op.C.element.value,
-                dtype_acc=tmp_op.accumulator_type().value,
+                # 1 is subtracted from the type enum values for consistency with the existing
+                # cache databases; due to the "void" type being added to the DataType enum as
+                # the very first enum member (and shifting the values of other enum members) in
+                # https://github.com/NVIDIA/cutlass/commit/7c04f954151f606e60608061e891785fba229ae2
+                dtype_a=tmp_op.A.element.value - 1,
+                dtype_b=tmp_op.B.element.value - 1,
+                dtype_c=tmp_op.C.element.value - 1,
+                dtype_acc=tmp_op.accumulator_type().value - 1,
                 major_a=tmp_op.A.layout.value,
                 major_b=tmp_op.B.layout.value,
                 major_c=tmp_op.C.layout.value,

From 6f4e747523fc32b71cd6b7549b41b5b9bbf740d9 Mon Sep 17 00:00:00 2001
From: chengscott <60510scott@gmail.com>
Date: Tue, 23 May 2023 07:08:00 -0700
Subject: [PATCH 541/638] check if the tensor is none before copying (#704)

Summary:
In fuse_conv_bn_weights, check if the tensor is none before copying

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/704

Reviewed By: chenyang78

Differential Revision: D45986809

Pulled By: alexanderguzhva

fbshipit-source-id: 1423be506295b00b665107e6c5e3fecb49c77541
---
 examples/01_resnet-50/weight_utils.py          | 11 +++--------
 examples/02_detectron2/tools/convert_pt2ait.py | 11 +++--------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/examples/01_resnet-50/weight_utils.py b/examples/01_resnet-50/weight_utils.py
index 4dc455a3d..40dc53e74 100644
--- a/examples/01_resnet-50/weight_utils.py
+++ b/examples/01_resnet-50/weight_utils.py
@@ -62,16 +62,11 @@ def fuse_conv_bn_weights(
         conv_w = torch.tensor(conv_w)
         bn_rm = torch.tensor(bn_rm)
         bn_rv = torch.tensor(bn_rv)
-        bn_w = torch.tensor(bn_w)
-        bn_b = torch.tensor(bn_b)
+        conv_b = torch.tensor(conv_b) if conv_b is not None else torch.zeros_like(bn_rm)
+        bn_w = torch.tensor(bn_w) if bn_w is not None else torch.ones_like(bn_rm)
+        bn_b = torch.tensor(bn_b) if bn_b is not None else torch.zeros_like(bn_rm)
         bn_eps = torch.tensor(bn_eps)
 
-        if conv_b is None:
-            conv_b = torch.zeros_like(bn_rm)
-        if bn_w is None:
-            bn_w = torch.ones_like(bn_rm)
-        if bn_b is None:
-            bn_b = torch.zeros_like(bn_rm)
         bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps)
 
         if transpose:
diff --git a/examples/02_detectron2/tools/convert_pt2ait.py b/examples/02_detectron2/tools/convert_pt2ait.py
index 810fc8d3a..85b66c48c 100644
--- a/examples/02_detectron2/tools/convert_pt2ait.py
+++ b/examples/02_detectron2/tools/convert_pt2ait.py
@@ -75,16 +75,11 @@ def fuse_conv_bn_weights(
         conv_w = torch.tensor(conv_w)
         bn_rm = torch.tensor(bn_rm)
         bn_rv = torch.tensor(bn_rv)
-        bn_w = torch.tensor(bn_w)
-        bn_b = torch.tensor(bn_b)
+        conv_b = torch.tensor(conv_b) if conv_b is not None else torch.zeros_like(bn_rm)
+        bn_w = torch.tensor(bn_w) if bn_w is not None else torch.ones_like(bn_rm)
+        bn_b = torch.tensor(bn_b) if bn_b is not None else torch.zeros_like(bn_rm)
         bn_eps = torch.tensor(bn_eps)
 
-        if conv_b is None:
-            conv_b = torch.zeros_like(bn_rm)
-        if bn_w is None:
-            bn_w = torch.ones_like(bn_rm)
-        if bn_b is None:
-            bn_b = torch.zeros_like(bn_rm)
         bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps)
 
         if transpose:

From 326aae6b37f9aab0bc8f86fd1dcc8914c36ce3e1 Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmwang@amazon.com>
Date: Tue, 23 May 2023 07:25:38 -0700
Subject: [PATCH 542/638] Fix 02_vision_model example (#718)

Summary:
It appears fx2ait now automatically performs nchw to nhwc conversion for the user.

For some reason I didn't get good perf numbers but it should be a separate issue.
```
== Benchmark Result for: TestResNet
BS: 1, PT Eager time per iter: 0.0019565773010253905ms, PT Eager QPS: 511.10, FX2AIT time per iter: 0.001033175048828125ms, FX2AIT Eager QPS: 967.89, Speedup: 1.89,
== Benchmark Result for: TestResNet
BS: 8, PT Eager time per iter: 0.002017873992919922ms, PT Eager QPS: 3964.57, FX2AIT time per iter: 0.0021129624938964844ms, FX2AIT Eager QPS: 3786.15, Speedup: 0.95,
== Benchmark Result for: TestResNet
BS: 16, PT Eager time per iter: 0.0020058624267578124ms, PT Eager QPS: 7976.62, FX2AIT time per iter: 0.003464407043457031ms, FX2AIT Eager QPS: 4618.39, Speedup: 0.58,
== Benchmark Result for: TestResNet
BS: 32, PT Eager time per iter: 0.0028792626953125ms, PT Eager QPS: 11113.96, FX2AIT time per iter: 0.006071419067382813ms, FX2AIT Eager QPS: 5270.60, Speedup: 0.47,
== Benchmark Result for: TestResNet
BS: 256, PT Eager time per iter: 0.016274913330078123ms, PT Eager QPS: 15729.73, FX2AIT time per iter: 0.0487688818359375ms, FX2AIT Eager QPS: 5249.25, Speedup: 0.33,
== Benchmark Result for: TestResNet
BS: 512, PT Eager time per iter: 0.031698186035156256ms, PT Eager QPS: 16152.34, FX2AIT time per iter: 0.100773662109375ms, FX2AIT Eager QPS: 5080.69, Speedup: 0.31,
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/718

Reviewed By: chenyang78

Differential Revision: D46079717

Pulled By: alexanderguzhva

fbshipit-source-id: 8f0ac3c2a5ab627eb8796550f10811c52a4d2d01
---
 fx2ait/fx2ait/example/02_vision_model/README.md            | 6 ++----
 fx2ait/fx2ait/example/02_vision_model/test_vision_model.py | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/fx2ait/fx2ait/example/02_vision_model/README.md b/fx2ait/fx2ait/example/02_vision_model/README.md
index 8245b7cfd..d6525eb2e 100644
--- a/fx2ait/fx2ait/example/02_vision_model/README.md
+++ b/fx2ait/fx2ait/example/02_vision_model/README.md
@@ -20,10 +20,8 @@ Therefore the definition of model is as simple as
             def forward(self, x):
                 return self.mod(x)
 ```
-Notice that because AIT supports channel last, while pytorch supports channel first operation, we need to permute the input
-```
-inputs = [inp.permute([0, 2, 3, 1]).contiguous() for inp in inputs]
-``
+Notice that because AIT supports channel last, while pytorch supports channel first operation, FX2AIT automatically performs this layout conversion for you.
+
 To run the test and benchmark,
 ```
 python fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
diff --git a/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py b/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
index 1ca633b41..ca706ca35 100644
--- a/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
+++ b/fx2ait/fx2ait/example/02_vision_model/test_vision_model.py
@@ -36,7 +36,6 @@ def forward(self, x):
         verify_accuracy(
             model,
             inputs,
-            permute_inputs=[0, 2, 3, 1],
         )
         results = []
         for batch_size in [1, 8, 16, 32, 256, 512]:
@@ -47,7 +46,6 @@ def forward(self, x):
                     100,
                     model,
                     inputs,
-                    permute_inputs=[0, 2, 3, 1],
                 )
             )
         for res in results:

From 329f15212faa89043934e199c150fcf474ca7aab Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 23 May 2023 13:58:05 -0700
Subject: [PATCH 543/638] fix bert example (#710)

Summary:
while running locally I got some errors related to relative imports

now each of these commands works:

```
$> PYTHONPATH=python:$PYTHONPATH python examples/03_bert/demo.py
$> PYTHONPATH=python:$PYTHONPATH python examples/03_bert/test_correctness.py
$> PYTHONPATH=python:$PYTHONPATH python examples/03_bert/benchmark_ait.py
```

I also updated the dependencies for `nix-shell`. Running with NixOS 23.05; `LD_LIBRARY_PATH` needs to be updated for profiler executables to be able to see the CUDA runtime libraries. In my case it was,

```
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nix/store/9v8myfc469nshbx4f2qbwgyarl25kf8s-nvidia-x11-530.41.03-6.1.28/lib
```

Profiler executables not seeing the libraries manifests with error message  `CUDA driver version is insufficient for CUDA runtime version`, or, if we point it to a wrong library, `system has unsupported display driver / cuda driver combination`. `strace`ing helped to determine that the profiler executable tried to open `libcuda.so.1` and couldn't find it, hence the path fix was needed

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/710

Reviewed By: kadeng

Differential Revision: D46032208

Pulled By: alexanderguzhva

fbshipit-source-id: f4c9ade4f8de8fb65d4cba88d1a0cea736587648
---
 default.nix                          | 4 +++-
 examples/03_bert/benchmark_ait.py    | 4 ++--
 examples/03_bert/demo.py             | 6 +++---
 examples/03_bert/test_correctness.py | 2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/default.nix b/default.nix
index e4c49b033..d521651e9 100644
--- a/default.nix
+++ b/default.nix
@@ -10,6 +10,7 @@ let
     pytorch-bin
     pip
     wheel
+    click
     unidecode
     inflect
     librosa
@@ -17,6 +18,7 @@ let
     sympy
     einops
     parameterized
+    transformers
     # (
     #   buildPythonPackage rec {
     #     pname = "cuda_python";
@@ -38,7 +40,7 @@ pkgs.mkShell {
   buildInputs = [
     pkgs.cmake
     pkgs.cudatoolkit
-    (pkgs.python39.withPackages ait-deps)
+    (pkgs.python310.withPackages ait-deps)
   ];
 
   shellHook = ''
diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index a16244a9a..5d9b5622c 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -24,8 +24,8 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
-from .modeling.bert import BertBaseEncodersOnly, BertBaseUncased
-from .modeling.torch_model import BertBaseUncased as BertPt
+from modeling.bert import BertBaseEncodersOnly, BertBaseUncased
+from modeling.torch_model import BertBaseUncased as BertPt
 
 
 def mark_output(y: Tensor) -> None:
diff --git a/examples/03_bert/demo.py b/examples/03_bert/demo.py
index f23dcf9d7..fedf5c642 100644
--- a/examples/03_bert/demo.py
+++ b/examples/03_bert/demo.py
@@ -16,10 +16,10 @@
 
 import torch
 
-from transformers import BertTokenizer
+from benchmark_ait import compile_module
+from modeling.torch_model import BertBaseUncased as BertPt
 
-from .benchmark_ait import compile_module
-from .modeling.torch_model import BertBaseUncased as BertPt
+from transformers import BertTokenizer
 
 
 def prepare_data(prompt: str, model_path: str):
diff --git a/examples/03_bert/test_correctness.py b/examples/03_bert/test_correctness.py
index 7cf6d4201..4a65f4d35 100644
--- a/examples/03_bert/test_correctness.py
+++ b/examples/03_bert/test_correctness.py
@@ -18,7 +18,7 @@
 
 import torch
 
-from .demo import run_model
+from demo import run_model
 
 try:
     from libfb.py.asyncio.await_utils import await_sync

From 4c879309fc3e83a9b37605da8c600758beafcb33 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@fb.com>
Date: Tue, 23 May 2023 23:30:59 -0700
Subject: [PATCH 544/638] Added a pass to fuse expand + bmm (#715)

Summary:
This PR implements a pass that fuses expand + bmm. For example, it turns the following graph:

  t0 = tensor([1, m, k])
  t1 = tensor([b, k, n])
  x0 = expand(t0, [b, m, k])
  x1 = bmm(x0, t1)

into

  t0 = tensor([1, m, k])
  t1 = tensor([b, k, n])
  x1 = bmm(t0, t1)

The idea is that we rely on bmm's broadcasting to achieve the same computation as it's done by the expand op.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/715

Reviewed By: ipiszy

Differential Revision: D46065099

Pulled By: chenyang78

fbshipit-source-id: 42c2a4085a0f431c2cdba8921dcca8c6a7e0006a
---
 .../aitemplate/compiler/transform/__init__.py |   1 +
 .../compiler/transform/fuse_expand_bmm.py     | 122 +++
 .../compiler/transform/optimize_graph.py      |   2 +
 .../unittest/compiler/test_fuse_expand_bmm.py | 754 ++++++++++++++++++
 4 files changed, 879 insertions(+)
 create mode 100644 python/aitemplate/compiler/transform/fuse_expand_bmm.py
 create mode 100644 tests/unittest/compiler/test_fuse_expand_bmm.py

diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index c630c88bd..3ff2d800a 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -16,6 +16,7 @@
 from aitemplate.compiler.transform.bind_constants import bind_constants
 from aitemplate.compiler.transform.constant_folding import constant_folding
 from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
+from aitemplate.compiler.transform.fuse_expand_bmm import fuse_expand_bmm
 from aitemplate.compiler.transform.fuse_group_ops import (
     fuse_group_gemm_ops,
     fuse_group_layernorm_ops,
diff --git a/python/aitemplate/compiler/transform/fuse_expand_bmm.py b/python/aitemplate/compiler/transform/fuse_expand_bmm.py
new file mode 100644
index 000000000..180e4a28e
--- /dev/null
+++ b/python/aitemplate/compiler/transform/fuse_expand_bmm.py
@@ -0,0 +1,122 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This pass performs the following fusion:
+    t0 = tensor([1, M, N])
+    x0 = expand(t0, [B, M, N])
+    x1 = bmm(x0, t1) # or x1 = bmm(t1, x0)
+==>
+    x1 = bmm(t0, t1) # or x1 = bmm(t1, t0)
+
+The basic idea behind the transformation is that we leverage bmm's
+broadcasting capability to achieve the same functionality as expand.
+"""
+from typing import List
+
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
+from aitemplate.compiler.tensor_accessor import TensorAccessor
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.compiler.transform.transform_utils import (
+    remove_single_tensor_op_from_sorted_graph,
+    sanitize_sorted_graph,
+)
+
+
+def _can_fuse(expand_op: Operator, bmm_op: Operator) -> bool:
+    """
+    determine if expand_op and bmm_op can be fused
+    """
+    expand_output = expand_op._attrs["outputs"][0]
+    if expand_output._attrs["is_output"]:
+        return False
+    expand_inputs = expand_op._attrs["inputs"]
+    expand_input_shape = expand_inputs[0]._attrs["shape"]
+    expand_output_shape = expand_output._attrs["shape"]
+    # not valid for bmm
+    if len(expand_output_shape) != 3:
+        return False
+    if len(expand_input_shape) == 2:
+        # In this case, we are expanding the batch dim
+        assert (
+            expand_input_shape[0] == expand_output_shape[1]
+            and expand_input_shape[1] == expand_output_shape[2]
+        ), f"invalid {expand_input_shape=} and {expand_output_shape=}"
+        return True
+    # not valid for bmm
+    if len(expand_input_shape) != 3:
+        return False
+    if expand_op._attrs["dim_types"][0] != ExpandDimensionType.EXPAND_DIM:
+        return False
+    bmm_inputs = bmm_op._attrs["inputs"]
+    bmm_a = bmm_inputs[0]
+    bmm_b = bmm_inputs[1]
+    if expand_output is bmm_a:
+        return expand_output_shape[0] == bmm_a._attrs["shape"][0]
+    if expand_output is bmm_b:
+        return expand_output_shape[0] == bmm_b._attrs["shape"][0]
+    return False
+
+
+def fuse_expand_bmm(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
+    """
+    Transform expand + bmm into a single bmm op.
+
+    Parameters
+    ----------
+    sorted_graph : List[Tensor]
+        Input graph
+    workdir : str, optional
+        workdir, by default None
+
+    Returns
+    -------
+    List[Tensor]
+        Optimized graph
+    """
+    for tensor in sorted_graph:
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) != 1:
+            continue
+        op = list(src_ops)[0]
+        if op._attrs["op"] != "expand":
+            continue
+        expand_op = op
+        expand_output = expand_op._attrs["outputs"][0]
+        dst_ops = expand_output._attrs["dst_ops"]
+        if len(dst_ops) != 1:
+            continue
+        next_op = list(dst_ops)[0]
+        if not next_op._attrs["op"].startswith("bmm_"):
+            continue
+        if not _can_fuse(expand_op, next_op):
+            continue
+
+        for int_var_tensor in expand_op._attrs["inputs"][1:]:
+            int_var_tensor._attrs["dst_ops"].discard(expand_op)
+        expand_op._attrs["inputs"] = [expand_op._attrs["inputs"][0]]
+        remove_single_tensor_op_from_sorted_graph(expand_op)
+
+        old_tensor_accessors = next_op._attrs["input_accessors"]
+        assert (
+            old_tensor_accessors[0].stride_dim is None
+            and old_tensor_accessors[1].stride_dim is None
+        ), f"next_op {next_op._attrs['name']} tensor accessors are expected to be None"
+        bmm_inputs = next_op._attrs["inputs"]
+        # refresh tensor accessors, which will be used by codegen
+        next_op._attrs["input_accessors"] = [TensorAccessor(t) for t in bmm_inputs]
+
+    sorted_graph = toposort(sorted_graph)
+    return sanitize_sorted_graph(sorted_graph)
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index a3b8a5c60..83afbdb0f 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -22,6 +22,7 @@
 from aitemplate.compiler.transform.dedup_make_jagged_ops import dedup_make_jagged_ops
 from aitemplate.compiler.transform.fuse_bmm_permute import fuse_bmm_permute
 from aitemplate.compiler.transform.fuse_conv_elementwise import fuse_conv_elementwise
+from aitemplate.compiler.transform.fuse_expand_bmm import fuse_expand_bmm
 from aitemplate.compiler.transform.fuse_group_ops import fuse_group_ops
 from aitemplate.compiler.transform.fuse_mm_elementwise import fuse_mm_elementwise
 from aitemplate.compiler.transform.fuse_mm_reshape_permute import (
@@ -93,6 +94,7 @@ def optimize_graph(
         dedup_make_jagged_ops,
         fuse_permute_bmm_and_gemm,
         fuse_bmm_permute,
+        fuse_expand_bmm,
         transform_odd_alignment,
         fuse_conv_elementwise,
         fuse_mm_elementwise,
diff --git a/tests/unittest/compiler/test_fuse_expand_bmm.py b/tests/unittest/compiler/test_fuse_expand_bmm.py
new file mode 100644
index 000000000..938106645
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_expand_bmm.py
@@ -0,0 +1,754 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+class FuseExpandBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def __init__(self, *args, **kwargs):
+        super(FuseExpandBmmTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 0
+
+    def _compile_and_check(
+        self, Y, test_name, expected_num_ops, expected_op, no_expand=True
+    ):
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+        self.test_count += 1
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), expected_num_ops)
+        if expected_num_ops == 1:
+            self.assertEqual(sorted_ops[0]._attrs["op"], expected_op)
+        elif no_expand:
+            self.assertTrue(
+                all(lambda op: op._attrs["op"] != "expand" for op in sorted_ops)
+            )
+        return module
+
+    def _test_non_fusible_expand_bmm_1(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, M, K])
+        # x1 = tensor([B, K, N])
+        # Y0 = expand(x0, shape_1[B, M, K])
+        # Y1 = bmm_rrr(Y_0, x1)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="x1", is_input=True)
+
+        Y0 = ops.expand()(X0, [batch_dim, -1, -1])
+        Y0._attrs["name"] = "output0"
+        Y0._attrs["is_output"] = True
+
+        Y1 = ops.bmm_rrr()(Y0, X1)
+        Y1._attrs["name"] = "output1"
+        Y1._attrs["is_output"] = True
+        module = self._compile_and_check(
+            [Y0, Y1], test_name, expected_num_ops, "bmm_rrr", no_expand=False
+        )
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, M, K], dtype)
+            x1_pt = get_random_torch_tensor([batch, K, N], dtype)
+            y0_pt = x0_pt.expand(batch, -1, -1)
+            y1_pt = torch.matmul(y0_pt, x1_pt)
+
+            y0 = get_torch_empty_tensor(y0_pt.size(), dtype)
+            y1 = get_torch_empty_tensor(y1_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y0, y1])
+            torch.testing.assert_close(y0_pt, y0, atol=0.1, rtol=0.1)
+            torch.testing.assert_close(y1_pt, y1, atol=0.1, rtol=0.1)
+
+    def test_non_fusible_expand_bmm_1(self):
+        self._test_non_fusible_expand_bmm_1(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_non_fusible_expand_bmm_1",
+        )
+
+    def _test_non_fusible_expand_bmm_2(
+        self,
+        B,
+        M,
+        N,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, M, N])
+        # x1 = tensor([B, N, N])
+        # expand_0 = expand(x0, shape_1[B, M, N])
+        # bmm_rrr_1 = bmm_rrr(expand_0, x1)
+        # Y = add(expand_0, bmm_rrr_1)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, M, N], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[batch_dim, N, N], dtype=dtype, name="x1", is_input=True)
+
+        expand_0 = ops.expand()(X0, [batch_dim, -1, -1])
+        bmm_rrr_1 = ops.bmm_rrr()(expand_0, X1)
+        Y = ops.elementwise(FuncEnum.ADD)(expand_0, bmm_rrr_1)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(
+            Y, test_name, expected_num_ops, "bmm_rrr", no_expand=False
+        )
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, M, N], dtype)
+            x1_pt = get_random_torch_tensor([batch, N, N], dtype)
+            expand_0_pt = x0_pt.expand(batch, -1, -1)
+            bmm_rrr_1_pt = torch.matmul(expand_0_pt, x1_pt)
+            y_pt = expand_0_pt + bmm_rrr_1_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_non_fusible_expand_bmm_2(self):
+        self._test_non_fusible_expand_bmm_1(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_non_fusible_expand_bmm_1",
+        )
+
+    def _test_fuse_expand_bmm_rrr_a(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, M, K])
+        # x1 = tensor([B, K, N])
+        # expand_0 = expand(x0, shape_1[B, M, K])
+        # Y = bmm_rrr(expand_0, x1)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="x1", is_input=True)
+
+        expand_0 = ops.expand()(X0, [batch_dim, -1, -1])
+        Y = ops.bmm_rrr()(expand_0, X1)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_rrr")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, M, K], dtype)
+            x1_pt = get_random_torch_tensor([batch, K, N], dtype)
+            expand_0_pt = x0_pt.expand(batch, -1, -1)
+            y_pt = torch.matmul(expand_0_pt, x1_pt)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_rrr_a(self):
+        self._test_fuse_expand_bmm_rrr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=2,  # one extra permute
+            test_name="test_fuse_expand_bmm_rrr_a",
+        )
+        self._test_fuse_expand_bmm_rrr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=1,
+            test_name="test_fuse_expand_bmm_rrr_a",
+        )
+
+    def _test_fuse_expand_bmm_rrc_add_b(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([B, M, K])
+        # x1 = tensor([1, K, N])
+        # x2 = tensor([B, N, M])
+        # expand_0 = expand(x1, shape_1[B, K, N])
+        # Y = bmm_rrc_add(x0, expand_0, x2)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[batch_dim, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, K, N], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[batch_dim, N, M], dtype=dtype, name="x2", is_input=True)
+
+        expand_0 = ops.expand()(X1, [batch_dim, -1, -1])
+        Y = ops.bmm_rrc_add()(X0, expand_0, X2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_rrc_add")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([batch, M, K], dtype)
+            x1_pt = get_random_torch_tensor([1, K, N], dtype)
+            x2_pt = get_random_torch_tensor([batch, N, M], dtype)
+            expand_0_pt = x1_pt.expand(batch, -1, -1)
+            y_pt = torch.matmul(x0_pt, expand_0_pt)
+            y_pt = y_pt.transpose(2, 1) + x2_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_rrc_add_b(self):
+        self._test_fuse_expand_bmm_rrc_add_b(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=3,  # two extra concat
+            test_name="test_fuse_expand_bmm_rrc_add_b",
+        )
+        self._test_fuse_expand_bmm_rrc_add_b(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=1,
+            test_name="test_fuse_expand_bmm_rrc_add_b",
+        )
+
+    def _test_fuse_expand_bmm_crr_a(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, K, M])
+        # x1 = tensor([1, K, M])
+        # x2 = tensor([B, K, N])
+        # add_0 = x0 + x1
+        # expand_0 = expand(add_0, shape_1[B, K, M])
+        # Y = bmm_rrr(expand_0, x2)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, K, M], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, K, M], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="x2", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_crr()(expand_1, X2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_crr")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, K, M], dtype)
+            x1_pt = get_random_torch_tensor([1, K, M], dtype)
+            x2_pt = get_random_torch_tensor([batch, K, N], dtype)
+            add_0_pt = x0_pt + x1_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            expand_1_tran_pt = torch.transpose(expand_1_pt, 2, 1)
+            y_pt = torch.matmul(expand_1_tran_pt, x2_pt)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_crr_a(self):
+        self._test_fuse_expand_bmm_crr_a(
+            B=10,
+            M=5,
+            N=12,
+            K=11,
+            expected_num_ops=4,  # extra concat and slice
+            test_name="test_fuse_expand_bmm_crr_a",
+        )
+        self._test_fuse_expand_bmm_crr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_crr_a",
+        )
+
+    def _test_fuse_expand_bmm_crc_add_b(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([B, M, K])
+        # x1 = tensor([1, K, N])
+        # x2 = tensor([1, K, N])
+        # x3 = tensor([B, N, M])
+        # add_0 = x1 + x2
+        # expand_0 = expand(add_0, shape_1[B, K, N])
+        # Y = bmm_rrc_add(x0, expand_0, x3)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[batch_dim, K, M], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, K, N], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[1, K, N], dtype=dtype, name="x2", is_input=True)
+        X3 = Tensor(shape=[batch_dim, N, M], dtype=dtype, name="x3", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_crc_add()(X0, expand_1, X3)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_crc_add")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([batch, K, M], dtype)
+            x1_pt = get_random_torch_tensor([1, K, N], dtype)
+            x2_pt = get_random_torch_tensor([1, K, N], dtype)
+            x3_pt = get_random_torch_tensor([batch, N, M], dtype)
+            add_0_pt = x1_pt + x2_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            x0_tran_pt = torch.transpose(x0_pt, 2, 1)
+            y_pt = torch.matmul(x0_tran_pt, expand_1_pt)
+            y_pt = y_pt.transpose(2, 1) + x3_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_crc_add_b(self):
+        self._test_fuse_expand_bmm_crc_add_b(
+            B=10,
+            M=5,
+            N=12,
+            K=6,
+            expected_num_ops=5,  # two extra concat and one slice
+            test_name="test_fuse_expand_bmm_crc_add_b",
+        )
+        self._test_fuse_expand_bmm_crc_add_b(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_crc_add_b",
+        )
+
+    def _test_fuse_expand_bmm_rcr_a(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, M, K])
+        # x1 = tensor([1, M, K])
+        # x2 = tensor([B, N, K])
+        # add_0 = x0 + x1
+        # expand_0 = expand(add_0, shape_1[B, M, K])
+        # Y = bmm_rrr(expand_0, x2)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, M, K], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="x2", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_rcr()(expand_1, X2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_rcr")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, M, K], dtype)
+            x1_pt = get_random_torch_tensor([1, M, K], dtype)
+            x2_pt = get_random_torch_tensor([batch, N, K], dtype)
+            add_0_pt = x0_pt + x1_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            x2_tran_pt = torch.transpose(x2_pt, 2, 1)
+            y_pt = torch.matmul(expand_1_pt, x2_tran_pt)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_rcr_a(self):
+        self._test_fuse_expand_bmm_rcr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=4,
+            test_name="test_fuse_expand_bmm_rcr_a",
+        )
+        self._test_fuse_expand_bmm_rcr_a(
+            B=10,
+            M=5,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_rcr_a",
+        )
+
+    def _test_fuse_expand_bmm_rcc_add_b(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([B, M, K])
+        # x1 = tensor([1, N, K])
+        # x2 = tensor([1, N, K])
+        # x3 = tensor([B, N, M])
+        # add_0 = x1 + x2
+        # expand_0 = expand(add_0, shape_1[B, N, K])
+        # Y = bmm_rrc_add(x0, expand_0, x3)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[batch_dim, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, N, K], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[1, N, K], dtype=dtype, name="x2", is_input=True)
+        X3 = Tensor(shape=[batch_dim, N, M], dtype=dtype, name="x3", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_rcc_add()(X0, expand_1, X3)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_rcc_add")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([batch, M, K], dtype)
+            x1_pt = get_random_torch_tensor([1, N, K], dtype)
+            x2_pt = get_random_torch_tensor([1, N, K], dtype)
+            x3_pt = get_random_torch_tensor([batch, N, M], dtype)
+            add_0_pt = x1_pt + x2_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            expand_1_tran_pt = torch.transpose(expand_1_pt, 2, 1)
+            y_pt = torch.matmul(x0_pt, expand_1_tran_pt)
+            y_pt = y_pt.transpose(2, 1) + x3_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_rcc_add_b(self):
+        self._test_fuse_expand_bmm_rcc_add_b(
+            B=10,
+            M=6,
+            N=12,
+            K=5,
+            expected_num_ops=4,  # two extra concat
+            test_name="test_fuse_expand_bmm_rcc_add_b",
+        )
+        self._test_fuse_expand_bmm_rcc_add_b(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_rcc_add_b",
+        )
+
+    def _test_fuse_expand_bmm_ccr_a(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, K, M])
+        # x1 = tensor([1, K, M])
+        # x2 = tensor([B, N, K])
+        # add_0 = x0 + x1
+        # expand_0 = expand(add_0, shape_1[B, K, M])
+        # Y = bmm_rrr(expand_0, x2)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, K, M], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, K, M], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[batch_dim, N, K], dtype=dtype, name="x2", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_ccr()(expand_1, X2)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_ccr")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, K, M], dtype)
+            x1_pt = get_random_torch_tensor([1, K, M], dtype)
+            x2_pt = get_random_torch_tensor([batch, N, K], dtype)
+            add_0_pt = x0_pt + x1_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            expand_1_tran_pt = torch.transpose(expand_1_pt, 2, 1)
+            x2_tran_pt = torch.transpose(x2_pt, 2, 1)
+            y_pt = torch.matmul(expand_1_tran_pt, x2_tran_pt)
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_ccr_a(self):
+        self._test_fuse_expand_bmm_ccr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=3,  # one extra permute
+            test_name="test_fuse_expand_bmm_ccr_a",
+        )
+        self._test_fuse_expand_bmm_ccr_a(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_ccr_a",
+        )
+
+    def _test_fuse_expand_bmm_ccc_add_b(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([B, K, M])
+        # x1 = tensor([1, N, K])
+        # x2 = tensor([1, N, K])
+        # x3 = tensor([B, N, M])
+        # add_0 = x1 + x2
+        # expand_0 = expand(add_0, shape_1[B, N, K])
+        # Y = bmm_rrc_add(x0, expand_0, x3)
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[batch_dim, K, M], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, N, K], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[1, N, K], dtype=dtype, name="x2", is_input=True)
+        X3 = Tensor(shape=[batch_dim, N, M], dtype=dtype, name="x3", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        expand_1 = ops.expand()(add_0, [batch_dim, -1, -1])
+        Y = ops.bmm_ccc_add()(X0, expand_1, X3)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_ccc_add")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([batch, K, M], dtype)
+            x1_pt = get_random_torch_tensor([1, N, K], dtype)
+            x2_pt = get_random_torch_tensor([1, N, K], dtype)
+            x3_pt = get_random_torch_tensor([batch, N, M], dtype)
+            add_0_pt = x1_pt + x2_pt
+            expand_1_pt = add_0_pt.expand(batch, -1, -1)
+            expand_1_tran_pt = torch.transpose(expand_1_pt, 2, 1)
+            x0_tran_pt = torch.transpose(x0_pt, 2, 1)
+            y_pt = torch.matmul(x0_tran_pt, expand_1_tran_pt)
+            y_pt = y_pt.transpose(2, 1) + x3_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_expand_bmm_ccc_add_b(self):
+        self._test_fuse_expand_bmm_ccc_add_b(
+            B=10,
+            M=5,
+            N=12,
+            K=6,
+            expected_num_ops=5,  # two extra concat and one slice
+            test_name="test_fuse_expand_bmm_ccc_add_b",
+        )
+        self._test_fuse_expand_bmm_ccc_add_b(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=2,
+            test_name="test_fuse_expand_bmm_ccc_add_b",
+        )
+
+    def _test_fuse_size_expand_bmm_rrr(
+        self,
+        B,
+        M,
+        N,
+        K,
+        expected_num_ops,
+        test_name,
+        dtype="float16",
+    ):
+        # make a graph like below:
+        # x0 = tensor([1, M, K])
+        # x1 = tensor([1, M, K])
+        # x2 = tensor([1, M, K])
+        # x3 = tensor([B, K, N])
+        # x4 = tensor([B, K, N])
+        # add_0 = x3 + x4
+        # size_1, _, _ = size(add_0)
+        # expand_2 = expand(x0, size_1)
+        # expand_3 = expand(x1, size_1)
+        # expand_4 = expand(x2, size_1)
+        # bmm_5 = bmm_rrr(expand_2, add_0)
+        # bmm_6 = bmm_rrr(expand_3, add_0)
+        # bmm_7 = bmm_rrr(expand_4, add_0)
+        # add_8 = bmm_5 + bmm_6
+        # Y = bmm_7 + add_8
+        batch_sizes = [1, B]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        X0 = Tensor(shape=[1, M, K], dtype=dtype, name="x0", is_input=True)
+        X1 = Tensor(shape=[1, M, K], dtype=dtype, name="x1", is_input=True)
+        X2 = Tensor(shape=[1, M, K], dtype=dtype, name="x2", is_input=True)
+        X3 = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="x3", is_input=True)
+        X4 = Tensor(shape=[batch_dim, K, N], dtype=dtype, name="x4", is_input=True)
+
+        add_0 = ops.elementwise(FuncEnum.ADD)(X3, X4)
+        size_1, _, _ = ops.size()(add_0)
+        expand_to_shape = [size_1, -1, -1]
+        expand_2 = ops.expand()(X0, expand_to_shape)
+        expand_3 = ops.expand()(X1, expand_to_shape)
+        expand_4 = ops.expand()(X2, expand_to_shape)
+        bmm_5 = ops.bmm_rrr()(expand_2, add_0)
+        bmm_6 = ops.bmm_rrr()(expand_3, add_0)
+        bmm_7 = ops.bmm_rrr()(expand_4, add_0)
+        add_8 = ops.elementwise(FuncEnum.ADD)(bmm_5, bmm_6)
+        Y = ops.elementwise(FuncEnum.ADD)(bmm_7, add_8)
+        Y._attrs["name"] = "output0"
+        Y._attrs["is_output"] = True
+        module = self._compile_and_check(Y, test_name, expected_num_ops, "bmm_rrr")
+
+        for batch in [1, B]:
+            x0_pt = get_random_torch_tensor([1, M, K], dtype)
+            x1_pt = get_random_torch_tensor([1, M, K], dtype)
+            x2_pt = get_random_torch_tensor([1, M, K], dtype)
+            x3_pt = get_random_torch_tensor([batch, K, N], dtype)
+            x4_pt = get_random_torch_tensor([batch, K, N], dtype)
+            add_0_pt = x3_pt + x4_pt
+            size_1 = batch
+            expand_2_pt = x0_pt.expand(size_1, -1, -1)
+            expand_3_pt = x1_pt.expand(size_1, -1, -1)
+            expand_4_pt = x2_pt.expand(size_1, -1, -1)
+            bmm_5_pt = torch.matmul(expand_2_pt, add_0_pt)
+            bmm_6_pt = torch.matmul(expand_3_pt, add_0_pt)
+            bmm_7_pt = torch.matmul(expand_4_pt, add_0_pt)
+            add_8_pt = bmm_5_pt + bmm_6_pt
+            y_pt = bmm_7_pt + add_8_pt
+
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt, "x1": x1_pt, "x2": x2_pt, "x3": x3_pt, "x4": x4_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
+
+    def test_fuse_size_expand_bmm_rrr(self):
+        self._test_fuse_size_expand_bmm_rrr(
+            B=10,
+            M=4,
+            N=12,
+            K=11,
+            expected_num_ops=7,
+            test_name="test_fuse_size_expand_bmm_rrr",
+        )
+        self._test_fuse_size_expand_bmm_rrr(
+            B=10,
+            M=4,
+            N=12,
+            K=6,
+            expected_num_ops=4,
+            test_name="test_fuse_size_expand_bmm_rrr",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 560041b7e618dd7dc0e827b758c5298f112c3dfc Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 24 May 2023 17:16:16 +0800
Subject: [PATCH 545/638] rename layers

---
 python/aitemplate/frontend/nn/attention.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index e78f2644f..f1a58b81c 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -331,17 +331,17 @@ def __init__(
 
         self.op = ops.mem_eff_attention(causal=causal)
 
-        self.query = Linear(
+        self.proj_q = Linear(
             dim,
             dim,
             bias=qkv_bias,
         )
-        self.key = Linear(
+        self.proj_k = Linear(
             dim,
             dim,
             bias=qkv_bias,
         )
-        self.value = Linear(
+        self.proj_v = Linear(
             dim,
             dim,
             bias=qkv_bias,
@@ -355,9 +355,9 @@ def attention(self, q, k, v, seqlens=None):
         batch = q.shape()[0]
         head_dim = self.dim // self.num_heads
 
-        query = self.query(q)
-        key = self.key(k)
-        value = self.value(v)
+        query = self.proj_q(q)
+        key = self.proj_k(k)
+        value = self.proj_v(v)
         
         if detect_target().name() == "cuda":
             query = ops.permute()(

From 420c83a6321c224184367d3744d7e51515eadff2 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 24 May 2023 08:11:04 -0700
Subject: [PATCH 546/638] Fix flaky test_batch_norm (#720)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/720

Disentangle the workdirs for consecutive test runs.

Reviewed By: colinchan15

Differential Revision: D46147157

fbshipit-source-id: a5a3bc49a4f603784a59bf821eb49a079f52c397
---
 tests/unittest/ops/test_batch_norm.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/ops/test_batch_norm.py b/tests/unittest/ops/test_batch_norm.py
index 7a249e419..380f96a2f 100644
--- a/tests/unittest/ops/test_batch_norm.py
+++ b/tests/unittest/ops/test_batch_norm.py
@@ -41,6 +41,7 @@ def _test_batchnorm(
         bn_op,
         input_shape,
         input_type="float16",
+        test_name="batch_norm",
     ):
         pt_op = getattr(torch.nn, bn_op)(num_features).cuda().half().eval()
         ait_op = getattr(batch_norm, bn_op)(
@@ -73,8 +74,13 @@ def _test_batchnorm(
 
         target = detect_target()
         module = compile_model(
-            Y_ait, target, "./tmp", f"batch_norm_{self.test_id}", constants=params_ait
+            Y_ait,
+            target,
+            "./tmp",
+            f"{test_name}_{self.test_id}",
+            constants=params_ait,
         )
+        self.test_id += 1
 
         y = get_torch_empty_tensor(Ys_ait, dtype=input_type)
         inputs = {"input0": X_pt}
@@ -87,13 +93,22 @@ def _test_batchnorm(
     def test_batch_norm(self):
         self._test_batchnorm(num_features=3, bn_op="BatchNorm1d", input_shape=[5, 3])
         self._test_batchnorm(
-            num_features=3, bn_op="BatchNorm1d", input_shape=[5, 3, 234]
+            num_features=3,
+            bn_op="BatchNorm1d",
+            input_shape=[5, 3, 234],
+            test_name="batch_norm_1d",
         )
         self._test_batchnorm(
-            num_features=3, bn_op="BatchNorm2d", input_shape=[1, 3, 244, 244]
+            num_features=3,
+            bn_op="BatchNorm2d",
+            input_shape=[1, 3, 244, 244],
+            test_name="batch_norm_2d",
         )
         self._test_batchnorm(
-            num_features=6, bn_op="BatchNorm3d", input_shape=[4, 6, 24, 24, 11]
+            num_features=6,
+            bn_op="BatchNorm3d",
+            input_shape=[4, 6, 24, 24, 11],
+            test_name="batch_norm_3d",
         )
 
 
From be81c395315e58bf0e568906eee5e1c8ad29a867 Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Wed, 24 May 2023 11:21:39 -0700
Subject: [PATCH 547/638] Stable Diffusion ControlNet (#713)

Summary:
# Stable Diffusion ControlNet

## Introduction

* ControlNetUNet2DConditionModel
    * This is almost the same as UNet2DConditionModel, except that it accepts additional residuals as input.
* ControlNetModel
    * This is the main ControlNet model that accepts ControlNet conditionings and outputs the additional down block and middle block residuals.

## Usage

Compiling ControlNet
```
python scripts/compile_controlnet.py --width 512 --height 512
```
Produces `tmp/ControlNetModel/test.so`

Compiling ControlNetUNet2DConditionModel
```
python scripts/compile_alt.py --width 512 --height 512 --controlnet True
```
Produces `tmp/ControlNetUNet2DConditionModel/test.so`

Inference
```
python scripts/demo_controlnet.py --width 512 --height 512 --steps 20
```

## Speed

3060 12GB, nvcc 11.6

ComfyUI xformers controlnet
```
100%|| 20/20 [00:04<00:00,  4.99it/s]
```

AITemplate controlnet
```
100%|| 20/20 [00:02<00:00,  8.02it/s]
```

## Notes

Is there a better way of specifying a list of Tensor as input with AITemplate? would be better. `mark_output` doesn't work with lists though, and when modified to loop if type == list there is an error elsewhere about list being unhashable, I didn't look further into that error though, maybe it's an easy fix

Not sure if this would be faster as a single module.

Static sizes only for now, will try to fix the dynamic sizes later.

Tested with `lllyasviel/sd-controlnet-canny`

SD2.x ControlNet unlikely to work.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/713

Reviewed By: aakhundov

Differential Revision: D46059564

Pulled By: alexanderguzhva

fbshipit-source-id: f8e5e0f692f96346f882f9ad9363477f275b10f1
---
 .../scripts/compile_alt.py                    |    3 +
 .../scripts/compile_controlnet.py             |   86 ++
 .../scripts/demo_controlnet.py                |  140 +++
 .../src/compile_lib/compile_controlnet.py     |  119 ++
 .../src/compile_lib/compile_unet_alt.py       |  151 ++-
 .../modeling/controlnet_unet_2d_condition.py  |  552 +++++++++
 ...ipeline_stable_diffusion_controlnet_ait.py | 1104 +++++++++++++++++
 7 files changed, 2132 insertions(+), 23 deletions(-)
 create mode 100644 examples/05_stable_diffusion/scripts/compile_controlnet.py
 create mode 100644 examples/05_stable_diffusion/scripts/demo_controlnet.py
 create mode 100644 examples/05_stable_diffusion/src/compile_lib/compile_controlnet.py
 create mode 100644 examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
 create mode 100644 examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py

diff --git a/examples/05_stable_diffusion/scripts/compile_alt.py b/examples/05_stable_diffusion/scripts/compile_alt.py
index 62ac268bc..922cf03b7 100644
--- a/examples/05_stable_diffusion/scripts/compile_alt.py
+++ b/examples/05_stable_diffusion/scripts/compile_alt.py
@@ -63,6 +63,7 @@
 )
 @click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
 @click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
+@click.option("--controlnet", default=False, help="UNet for controlnet")
 def compile_diffusers(
     local_dir,
     width,
@@ -72,6 +73,7 @@ def compile_diffusers(
     include_constants,
     use_fp16_acc=True,
     convert_conv_to_gemm=True,
+    controlnet=False,
 ):
     logging.getLogger().setLevel(logging.INFO)
     torch.manual_seed(4896)
@@ -118,6 +120,7 @@ def compile_diffusers(
         attention_head_dim=pipe.unet.config.attention_head_dim,
         use_linear_projection=pipe.unet.config.get("use_linear_projection", False),
         constants=True if include_constants else False,
+        controlnet=True if controlnet else False,
     )
     # VAE
     compile_vae(
diff --git a/examples/05_stable_diffusion/scripts/compile_controlnet.py b/examples/05_stable_diffusion/scripts/compile_controlnet.py
new file mode 100644
index 000000000..e108acc80
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/compile_controlnet.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+
+import click
+import torch
+from aitemplate.testing import detect_target
+from aitemplate.utils.import_path import import_parent
+from diffusers import ControlNetModel
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.compile_lib.compile_controlnet import compile_controlnet
+
+
+@click.command()
+@click.option(
+    "--local-dir",
+    default="./tmp/diffusers-pipeline/runwayml/stable-diffusion-v1-5",
+    help="the local diffusers pipeline directory",
+)
+@click.option("--width", default=512, type=int, help="width")
+@click.option("--height", default=512, type=int, help="height")
+@click.option("--batch-size", default=1, type=int, help="batch size")
+@click.option("--clip-chunks", default=6, help="Maximum number of clip chunks")
+@click.option(
+    "--include-constants",
+    default=None,
+    help="include constants (model weights) with compiled model",
+)
+@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
+@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
+def compile_diffusers(
+    local_dir,
+    width,
+    height,
+    batch_size,
+    clip_chunks,
+    include_constants,
+    use_fp16_acc=True,
+    convert_conv_to_gemm=True,
+):
+    logging.getLogger().setLevel(logging.INFO)
+    torch.manual_seed(4896)
+
+    if detect_target().name() == "rocm":
+        convert_conv_to_gemm = False
+
+    assert (
+        width % 64 == 0
+    ), "Width must be multiples of 64, otherwise, the compilation process will fail."
+    assert (
+        height % 64 == 0
+    ), "Height must be multiples of 64, otherwise, the compilation process will fail."
+
+    controlnet = ControlNetModel.from_pretrained(
+        "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
+    ).to("cuda")
+
+    compile_controlnet(
+        controlnet,
+        batch_size=batch_size,
+        width=width,
+        height=height,
+        clip_chunks=clip_chunks,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        use_fp16_acc=use_fp16_acc,
+        constants=include_constants,
+    )
+
+
+if __name__ == "__main__":
+    compile_diffusers()
diff --git a/examples/05_stable_diffusion/scripts/demo_controlnet.py b/examples/05_stable_diffusion/scripts/demo_controlnet.py
new file mode 100644
index 000000000..beacff7d0
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/demo_controlnet.py
@@ -0,0 +1,140 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import click
+import cv2
+import numpy as np
+import torch
+from aitemplate.utils.import_path import import_parent
+from diffusers.utils import load_image
+from PIL import Image
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.pipeline_stable_diffusion_controlnet_ait import StableDiffusionAITPipeline
+
+
+def prepare_image(
+    image,
+    width,
+    height,
+    batch_size,
+    num_images_per_prompt,
+    device,
+    dtype,
+    do_classifier_free_guidance=False,
+    guess_mode=False,
+):
+    if not isinstance(image, torch.Tensor):
+        if isinstance(image, Image.Image):
+            image = [image]
+
+        if isinstance(image[0], Image.Image):
+            images = []
+
+            for image_ in image:
+                image_ = image_.convert("RGB")
+                image_ = image_.resize((width, height), resample=Image.LANCZOS)
+                image_ = np.array(image_)
+                image_ = image_[None, :]
+                images.append(image_)
+
+            image = images
+
+            image = np.concatenate(image, axis=0)
+            image = np.array(image).astype(np.float32) / 255.0
+            image = image.transpose(0, 3, 1, 2)
+            image = torch.from_numpy(image)
+        elif isinstance(image[0], torch.Tensor):
+            image = torch.cat(image, dim=0)
+
+    image_batch_size = image.shape[0]
+
+    if image_batch_size == 1:
+        repeat_by = batch_size
+    else:
+        # image batch size is the same as prompt batch size
+        repeat_by = num_images_per_prompt
+
+    image = image.repeat_interleave(repeat_by, dim=0)
+
+    image = image.to(device=device, dtype=dtype)
+
+    if do_classifier_free_guidance and not guess_mode:
+        image = torch.cat([image] * 2)
+
+    return image
+
+
+@click.command()
+@click.option(
+    "--hf-hub-or-path",
+    default="runwayml/stable-diffusion-v1-5",
+    help="Model weights to apply to compiled model (with --include-constants false)",
+)
+@click.option("--ckpt", default=None, help="e.g. v1-5-pruned-emaonly.ckpt")
+@click.option("--width", default=512, help="Width of generated image")
+@click.option("--height", default=512, help="Height of generated image")
+@click.option("--batch", default=1, help="Batch size of generated image")
+@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
+@click.option("--steps", default=50, help="Number of inference steps")
+@click.option("--cfg", default=7.5, help="Guidance scale")
+def run(
+    hf_hub_or_path, ckpt, width, height, batch, prompt, negative_prompt, steps, cfg
+):
+    pipe = StableDiffusionAITPipeline(
+        hf_hub_or_path=hf_hub_or_path,
+        ckpt=ckpt,
+    )
+    # download an image
+    image = load_image(
+        "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+    )
+    image = np.array(image)
+    # get canny image
+    image = cv2.Canny(image, 100, 200)
+    image = image[:, :, None]
+    image = np.concatenate([image, image, image], axis=2)
+    canny_image = Image.fromarray(image)
+    control_cond = prepare_image(
+        canny_image,
+        width,
+        height,
+        batch,
+        1,
+        "cuda",
+        torch.float16,
+        do_classifier_free_guidance=True,
+    )
+    prompt = [prompt] * batch
+    negative_prompt = [negative_prompt] * batch
+    with torch.autocast("cuda"):
+        for _ in range(5):
+            image = pipe(
+                prompt=prompt,
+                control_cond=control_cond,
+                height=height,
+                width=width,
+                negative_prompt=negative_prompt,
+                num_inference_steps=steps,
+                guidance_scale=cfg,
+            ).images[0]
+    image.save("example_ait.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_controlnet.py b/examples/05_stable_diffusion/src/compile_lib/compile_controlnet.py
new file mode 100644
index 000000000..927f93cd0
--- /dev/null
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_controlnet.py
@@ -0,0 +1,119 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from aitemplate.compiler import compile_model
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+
+from ..modeling.controlnet_unet_2d_condition import (
+    ControlNetModel as ait_ControlNetModel,
+)
+from .util import mark_output
+
+
+def map_controlnet_params(pt_mod, dim):
+    pt_params = dict(pt_mod.named_parameters())
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+    params_ait["controlnet_cond_embedding_conv_in_weight"] = torch.nn.functional.pad(
+        params_ait["controlnet_cond_embedding_conv_in_weight"], (0, 1, 0, 0, 0, 0, 0, 0)
+    )
+    params_ait["arange"] = (
+        torch.arange(start=0, end=320 // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+# was used to debug controlnet_cond_embedding
+# def compile_controlnet_conditioning_embedding():
+#     controlnet_cond_embedding = ait_ControlNetConditioningEmbedding(256)
+#     controlnet_cond_embedding.name_parameter_tensor()
+#     controlnet_condition_ait = Tensor(
+#         [2, 512, 512, 3], name="input0", is_input=True
+#     )
+#     Y = controlnet_cond_embedding(controlnet_condition_ait)
+#     mark_output(Y)
+#     target = detect_target(
+#         use_fp16_acc=True, convert_conv_to_gemm=True
+#     )
+#     compile_model(Y, target, "./tmp", "ControlNetConditioningEmbedding", constants=None)
+
+
+def compile_controlnet(
+    pt_mod,
+    batch_size=2,
+    height=512,  # (512,1024),
+    width=512,  # (512,1024),
+    clip_chunks=1,
+    dim=320,
+    hidden_dim=768,
+    use_fp16_acc=False,
+    convert_conv_to_gemm=False,
+    model_name="ControlNetModel",
+    constants=False,
+):
+    batch_size = batch_size * 2  # double batch size for unet
+    ait_mod = ait_ControlNetModel()
+    ait_mod.name_parameter_tensor()
+
+    pt_mod = pt_mod.eval()
+    params_ait = map_controlnet_params(pt_mod, dim)
+    # batch_size = (batch_size[0], batch_size[1] * 2) #double batch size for unet
+    clip_batch_size = IntVar(values=(1, 8), name="batch_size")
+    # height_d = IntVar(values=list((height[0]//8, height[1]//8)), name="height_d")
+    # width_d = IntVar(values=list((width[0]//8, width[1]//8)), name="width_d")
+    # height_c = IntVar(values=list((height[0], height[1])), name="height_c")
+    # width_c = IntVar(values=list((width[0], width[1])), name="width_c")
+    clip_chunks = 77, 77 * clip_chunks
+    embedding_size = IntVar(values=list(clip_chunks), name="embedding_size")
+
+    latent_model_input_ait = Tensor(
+        [batch_size, height // 8, width // 8, 4], name="input0", is_input=True
+    )
+    timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
+    text_embeddings_pt_ait = Tensor(
+        [clip_batch_size, embedding_size, hidden_dim], name="input2", is_input=True
+    )
+    controlnet_condition_ait = Tensor(
+        [batch_size, height, width, 3], name="input3", is_input=True
+    )
+
+    Y = ait_mod(
+        latent_model_input_ait,
+        timesteps_ait,
+        text_embeddings_pt_ait,
+        controlnet_condition_ait,
+    )
+    mark_output(Y)
+
+    target = detect_target(
+        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
+    )
+    compile_model(
+        Y, target, "./tmp", model_name, constants=params_ait if constants else None
+    )
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
index f4cf4b48c..365710054 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
@@ -13,11 +13,13 @@
 #  limitations under the License.
 #
 import torch
-
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
+from ..modeling.controlnet_unet_2d_condition import (
+    ControlNetUNet2DConditionModel as ait_ControlNetUNet2DConditionModel,
+)
 from ..modeling.unet_2d_condition import (
     UNet2DConditionModel as ait_UNet2DConditionModel,
 )
@@ -58,28 +60,45 @@ def compile_unet(
     hidden_dim=1024,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
+    controlnet=True,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
     model_name="UNet2DConditionModel",
     use_linear_projection=False,
     constants=True,
 ):
-    ait_mod = ait_UNet2DConditionModel(
-        sample_size=64,
-        cross_attention_dim=hidden_dim,
-        attention_head_dim=attention_head_dim,
-        use_linear_projection=use_linear_projection,
-    )
+    if controlnet:
+        ait_mod = ait_ControlNetUNet2DConditionModel(
+            sample_size=64,
+            cross_attention_dim=hidden_dim,
+            attention_head_dim=attention_head_dim,
+            use_linear_projection=use_linear_projection,
+        )
+    else:
+        ait_mod = ait_UNet2DConditionModel(
+            sample_size=64,
+            cross_attention_dim=hidden_dim,
+            attention_head_dim=attention_head_dim,
+            use_linear_projection=use_linear_projection,
+        )
     ait_mod.name_parameter_tensor()
 
     # set AIT parameters
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
-    batch_size = (batch_size[0], batch_size[1] * 2)  # double batch size for unet
-    batch_size = IntVar(values=list(batch_size), name="batch_size")
-    height = height[0] // 8, height[1] // 8
-    width = width[0] // 8, width[1] // 8
-    height_d = IntVar(values=list(height), name="height")
-    width_d = IntVar(values=list(width), name="width")
+    if controlnet:
+        # static sizes only for now
+        batch_size = batch_size[0] * 2  # double batch size for unet
+        height = height[0] // 8
+        width = width[0] // 8
+        height_d = height
+        width_d = width
+    else:
+        batch_size = (batch_size[0], batch_size[1] * 2)  # double batch size for unet
+        batch_size = IntVar(values=list(batch_size), name="batch_size")
+        height = height[0] // 8, height[1] // 8
+        width = width[0] // 8, width[1] // 8
+        height_d = IntVar(values=list(height), name="height")
+        width_d = IntVar(values=list(width), name="width")
     clip_chunks = 77, 77 * clip_chunks
     embedding_size = IntVar(values=list(clip_chunks), name="embedding_size")
 
@@ -90,17 +109,103 @@ def compile_unet(
     text_embeddings_pt_ait = Tensor(
         [batch_size, embedding_size, hidden_dim], name="input2", is_input=True
     )
+    if controlnet:
+        down_block_residual_0 = Tensor(
+            [batch_size, height, width, 320],
+            name="down_block_residual_0",
+            is_input=True,
+        )
+        down_block_residual_1 = Tensor(
+            [batch_size, height, width, 320],
+            name="down_block_residual_1",
+            is_input=True,
+        )
+        down_block_residual_2 = Tensor(
+            [batch_size, height, width, 320],
+            name="down_block_residual_2",
+            is_input=True,
+        )
+        down_block_residual_3 = Tensor(
+            [batch_size, height // 2, width // 2, 320],
+            name="down_block_residual_3",
+            is_input=True,
+        )
+        down_block_residual_4 = Tensor(
+            [batch_size, height // 2, width // 2, 640],
+            name="down_block_residual_4",
+            is_input=True,
+        )
+        down_block_residual_5 = Tensor(
+            [batch_size, height // 2, width // 2, 640],
+            name="down_block_residual_5",
+            is_input=True,
+        )
+        down_block_residual_6 = Tensor(
+            [batch_size, height // 4, width // 4, 640],
+            name="down_block_residual_6",
+            is_input=True,
+        )
+        down_block_residual_7 = Tensor(
+            [batch_size, height // 4, width // 4, 1280],
+            name="down_block_residual_7",
+            is_input=True,
+        )
+        down_block_residual_8 = Tensor(
+            [batch_size, height // 4, width // 4, 1280],
+            name="down_block_residual_8",
+            is_input=True,
+        )
+        down_block_residual_9 = Tensor(
+            [batch_size, height // 8, width // 8, 1280],
+            name="down_block_residual_9",
+            is_input=True,
+        )
+        down_block_residual_10 = Tensor(
+            [batch_size, height // 8, width // 8, 1280],
+            name="down_block_residual_10",
+            is_input=True,
+        )
+        down_block_residual_11 = Tensor(
+            [batch_size, height // 8, width // 8, 1280],
+            name="down_block_residual_11",
+            is_input=True,
+        )
+        mid_block_residual = Tensor(
+            [batch_size, height // 8, width // 8, 1280],
+            name="mid_block_residual",
+            is_input=True,
+        )
+    else:
+        mid_block_additional_residual = None
+        down_block_additional_residuals = None
 
-    mid_block_additional_residual = None
-    down_block_additional_residuals = None
-
-    Y = ait_mod(
-        latent_model_input_ait,
-        timesteps_ait,
-        text_embeddings_pt_ait,
-        down_block_additional_residuals,
-        mid_block_additional_residual,
-    )
+    if controlnet:
+        Y = ait_mod(
+            latent_model_input_ait,
+            timesteps_ait,
+            text_embeddings_pt_ait,
+            down_block_residual_0,
+            down_block_residual_1,
+            down_block_residual_2,
+            down_block_residual_3,
+            down_block_residual_4,
+            down_block_residual_5,
+            down_block_residual_6,
+            down_block_residual_7,
+            down_block_residual_8,
+            down_block_residual_9,
+            down_block_residual_10,
+            down_block_residual_11,
+            mid_block_residual,
+        )
+    else:
+        Y = ait_mod(
+            latent_model_input_ait,
+            timesteps_ait,
+            text_embeddings_pt_ait,
+            mid_block_additional_residual,
+            down_block_additional_residuals,
+        )
     mark_output(Y)
 
     target = detect_target(
diff --git a/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
new file mode 100644
index 000000000..327d601b5
--- /dev/null
+++ b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
@@ -0,0 +1,552 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from typing import Optional, Tuple, Union
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import nn
+
+from .embeddings import TimestepEmbedding, Timesteps
+from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
+
+
+class ControlNetConditioningEmbedding(nn.Module):
+    """
+    Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
+    [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
+    training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
+    convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
+    (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
+    model) to encode image-space conditions ... into feature maps ..."
+    """
+
+    def __init__(
+        self,
+        # conditioning_embedding_channels: int,
+        # conditioning_channels: int = 3,
+        # block_out_channels: Tuple[int] = (16, 32, 96, 256),
+    ):
+        super().__init__()
+        """
+        Note: This is different to diffusers ControlNetConditioningEmbedding
+        Required Conv2dBiasFewChannels for the first layer, then Conv2dBias for the rest
+        Could be changed back to a loop and use parameters though,
+        but it ended up like this when debugging.
+        """
+        self.conv_in = nn.Conv2dBiasFewChannels(3, 16, 3, 1, 1)
+
+        self.blocks = nn.ModuleList([])
+        self.blocks.append(nn.Conv2dBias(16, 16, 3, 1, 1))
+        self.blocks.append(nn.Conv2dBias(16, 32, 3, 2, 1))
+        self.blocks.append(nn.Conv2dBias(32, 32, 3, 1, 1))
+        self.blocks.append(nn.Conv2dBias(32, 96, 3, 2, 1))
+        self.blocks.append(nn.Conv2dBias(96, 96, 3, 1, 1))
+        self.blocks.append(nn.Conv2dBias(96, 256, 3, 2, 1))
+
+        self.conv_out = nn.Conv2dBias(256, 320, 3, 1, 1)
+
+    def forward(self, conditioning):
+        """
+        Padding required!
+        """
+        pad = ops.nhwc3to4()
+        conditioning = pad(conditioning)
+        embedding = self.conv_in(conditioning)
+        embedding = ops.silu(embedding)
+
+        for block in self.blocks:
+            embedding = block(embedding)
+            embedding = ops.silu(embedding)
+
+        embedding = self.conv_out(embedding)
+
+        return embedding
+
+
+class ControlNetModel(nn.Module):
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int = 4,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 768,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+    ):
+        super().__init__()
+        self.controlnet_conditioning_channel_order = (
+            controlnet_conditioning_channel_order
+        )
+        self.global_pool_conditions = global_pool_conditions
+
+        # input
+        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+        )
+
+        # control net conditioning embedding
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=conditioning_embedding_out_channels,
+        )
+
+        self.down_blocks = nn.ModuleList([])
+        self.controlnet_down_blocks = nn.ModuleList([])
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+
+        controlnet_block = nn.Conv2dBias(output_channel, output_channel, 1)
+        controlnet_block = controlnet_block
+        self.controlnet_down_blocks.append(controlnet_block)
+
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+            )
+            self.down_blocks.append(down_block)
+
+            for _ in range(layers_per_block):
+                controlnet_block = nn.Conv2dBias(output_channel, output_channel, 1)
+                controlnet_block = controlnet_block
+                self.controlnet_down_blocks.append(controlnet_block)
+
+            if not is_final_block:
+                controlnet_block = nn.Conv2dBias(output_channel, output_channel, 1)
+                controlnet_block = controlnet_block
+                self.controlnet_down_blocks.append(controlnet_block)
+
+        # mid
+        mid_block_channel = block_out_channels[-1]
+
+        controlnet_block = nn.Conv2dBias(mid_block_channel, mid_block_channel, 1)
+        controlnet_block = controlnet_block
+        self.controlnet_mid_block = controlnet_block
+
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=mid_block_channel,
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+        )
+
+    def forward(
+        self,
+        sample,
+        timestep,
+        encoder_hidden_states,
+        controlnet_cond,
+        conditioning_scale: float = 1.0,
+    ) -> Tuple:
+        t_emb = self.time_proj(timestep)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+
+        sample = sample + controlnet_cond
+        # 3. down
+        down_block_res_samples = (sample,)  # up to but excluding last element
+        sample, res_samples = self.down_blocks[0](
+            hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
+        )
+        down_block_res_samples += res_samples
+        sample, res_samples = self.down_blocks[1](
+            hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
+        )
+        down_block_res_samples += res_samples
+        sample, res_samples = self.down_blocks[2](
+            hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
+        )
+        down_block_res_samples += res_samples
+        sample, res_samples = self.down_blocks[3](hidden_states=sample, temb=emb)
+        down_block_res_samples += res_samples
+        # return sample
+
+        # 4. mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states
+        )
+        controlnet_down_block_res_samples = ()
+
+        for down_block_res_sample, controlnet_block in zip(
+            down_block_res_samples, self.controlnet_down_blocks
+        ):
+            down_block_res_sample = controlnet_block(down_block_res_sample)
+            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (
+                down_block_res_sample,
+            )
+
+        down_block_res_samples = controlnet_down_block_res_samples
+        mid_block_res_sample = self.controlnet_mid_block(sample)
+
+        down_block_res_samples = [
+            sample * conditioning_scale for sample in down_block_res_samples
+        ]
+        mid_block_res_sample = mid_block_res_sample * conditioning_scale
+
+        # return (down_block_res_samples, mid_block_res_sample)
+        return (
+            down_block_res_samples[0],
+            down_block_res_samples[1],
+            down_block_res_samples[2],
+            down_block_res_samples[3],
+            down_block_res_samples[4],
+            down_block_res_samples[5],
+            down_block_res_samples[6],
+            down_block_res_samples[7],
+            down_block_res_samples[8],
+            down_block_res_samples[9],
+            down_block_res_samples[10],
+            down_block_res_samples[11],
+            mid_block_res_sample,
+        )
+
+
+class ControlNetUNet2DConditionModel(nn.Module):
+    r"""
+    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int`, *optional*): The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        use_linear_projection (`bool`, *optional*, defaults to False): Use linear projection instead of 1x1 convolution.
+    """
+
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        use_linear_projection: bool = False,
+    ):
+        super().__init__()
+        self.center_input_sample = center_input_sample
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=attention_head_dim[i],
+                cross_attention_dim=cross_attention_dim,
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+            use_linear_projection=use_linear_projection,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                cross_attention_dim=cross_attention_dim,
+                use_linear_projection=use_linear_projection,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0],
+            num_groups=norm_num_groups,
+            eps=norm_eps,
+            use_swish=True,
+        )
+
+        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
+
+    def forward(
+        self,
+        sample,
+        timesteps,
+        encoder_hidden_states,
+        down_block_residual_0,
+        down_block_residual_1,
+        down_block_residual_2,
+        down_block_residual_3,
+        down_block_residual_4,
+        down_block_residual_5,
+        down_block_residual_6,
+        down_block_residual_7,
+        down_block_residual_8,
+        down_block_residual_9,
+        down_block_residual_10,
+        down_block_residual_11,
+        mid_block_residual,
+        return_dict: bool = True,
+    ):
+        """r
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        down_block_additional_residuals = (
+            down_block_residual_0,
+            down_block_residual_1,
+            down_block_residual_2,
+            down_block_residual_3,
+            down_block_residual_4,
+            down_block_residual_5,
+            down_block_residual_6,
+            down_block_residual_7,
+            down_block_residual_8,
+            down_block_residual_9,
+            down_block_residual_10,
+            down_block_residual_11,
+        )
+        mid_block_additional_residual = mid_block_residual
+        # 1. time
+        t_emb = self.time_proj(timesteps)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "attentions")
+                and downsample_block.attentions is not None
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+            # return sample
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states
+        )
+
+        if mid_block_additional_residual is not None:
+            sample += mid_block_additional_residual
+
+        # 5. up
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+
+            if (
+                hasattr(upsample_block, "attentions")
+                and upsample_block.attentions is not None
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
+                )
+
+        # 6. post-process
+        # make sure hidden states is in float32
+        # when running in half-precision
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_out(sample)
+        return sample
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py
new file mode 100644
index 000000000..838725af2
--- /dev/null
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py
@@ -0,0 +1,1104 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import inspect
+import os
+from typing import List, Optional, Union
+
+import torch
+from aitemplate.compiler import Model
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    EulerDiscreteScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils.pil_utils import numpy_to_pil
+from tqdm import tqdm
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from .compile_lib.compile_vae_alt import map_vae_params
+from .modeling.vae import AutoencoderKL as ait_AutoencoderKL
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(
+            new_item, n_shave_prefix_segments=n_shave_prefix_segments
+        )
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, additional_replacements=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
+    attention layers, and takes into account additional replacements that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(
+        paths, list
+    ), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    for path in paths:
+        new_path = path["new"]
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+# ================#
+# VAE Conversion #
+# ================#
+
+
+def convert_ldm_vae_checkpoint(vae_state_dict):
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
+        "encoder.conv_out.weight"
+    ]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
+        "encoder.norm_out.weight"
+    ]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
+        "encoder.norm_out.bias"
+    ]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
+        "decoder.conv_out.weight"
+    ]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
+        "decoder.norm_out.weight"
+    ]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
+        "decoder.norm_out.bias"
+    ]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len(
+        {
+            ".".join(layer.split(".")[:3])
+            for layer in vae_state_dict
+            if "encoder.down" in layer
+        }
+    )
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
+        for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len(
+        {
+            ".".join(layer.split(".")[:3])
+            for layer in vae_state_dict
+            if "decoder.up" in layer
+        }
+    )
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
+        for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [
+            key
+            for key in down_blocks[i]
+            if f"down.{i}" in key and f"down.{i}.downsample" not in key
+        ]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[
+                f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"
+            ] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight")
+            new_checkpoint[
+                f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"
+            ] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias")
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key
+            for key in up_blocks[block_id]
+            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[
+                f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"
+            ] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"]
+            new_checkpoint[
+                f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"
+            ] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path]
+    )
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+# =================#
+# UNet Conversion #
+# =================#
+def convert_ldm_unet_checkpoint(unet_state_dict, layers_per_block=2):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
+        "time_embed.0.weight"
+    ]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
+        "time_embed.0.bias"
+    ]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
+        "time_embed.2.weight"
+    ]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
+        "time_embed.2.bias"
+    ]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "input_blocks" in layer
+        }
+    )
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "middle_block" in layer
+        }
+    )
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len(
+        {
+            ".".join(layer.split(".")[:2])
+            for layer in unet_state_dict
+            if "output_blocks" in layer
+        }
+    )
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (layers_per_block + 1)
+        layer_in_block_id = (i - 1) % (layers_per_block + 1)
+
+        resnets = [
+            key
+            for key in input_blocks[i]
+            if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[
+                f"down_blocks.{block_id}.downsamplers.0.conv.weight"
+            ] = unet_state_dict.pop(f"input_blocks.{i}.0.op.weight")
+            new_checkpoint[
+                f"down_blocks.{block_id}.downsamplers.0.conv.bias"
+            ] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias")
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {
+            "old": f"input_blocks.{i}.0",
+            "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}",
+        }
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path]
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {
+                "old": f"input_blocks.{i}.1",
+                "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths,
+        new_checkpoint,
+        unet_state_dict,
+        additional_replacements=[meta_path],
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (layers_per_block + 1)
+        layer_in_block_id = i % (layers_per_block + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [
+                key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
+            ]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {
+                "old": f"output_blocks.{i}.0",
+                "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}",
+            }
+            assign_to_checkpoint(
+                paths,
+                new_checkpoint,
+                unet_state_dict,
+                additional_replacements=[meta_path],
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(
+                    ["conv.bias", "conv.weight"]
+                )
+                new_checkpoint[
+                    f"up_blocks.{block_id}.upsamplers.0.conv.weight"
+                ] = unet_state_dict[f"output_blocks.{i}.{index}.conv.weight"]
+                new_checkpoint[
+                    f"up_blocks.{block_id}.upsamplers.0.conv.bias"
+                ] = unet_state_dict[f"output_blocks.{i}.{index}.conv.bias"]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths,
+                    new_checkpoint,
+                    unet_state_dict,
+                    additional_replacements=[meta_path],
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(
+                output_block_layers, n_shave_prefix_segments=1
+            )
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(
+                    [
+                        "up_blocks",
+                        str(block_id),
+                        "resnets",
+                        str(layer_in_block_id),
+                        path["new"],
+                    ]
+                )
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+# =========================#
+#    AITemplate mapping   #
+# =========================#
+def map_unet_state_dict(state_dict, dim=320):
+    params_ait = {}
+    for key, arr in state_dict.items():
+        arr = arr.to("cuda", dtype=torch.float16)
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            # print("ff.net.0.proj.weight")
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            # print("ff.net.0.proj.bias")
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+
+    params_ait["arange"] = (
+        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+def map_clip_state_dict(state_dict):
+    params_ait = {}
+    for key, arr in state_dict.items():
+        arr = arr.to("cuda", dtype=torch.float16)
+        name = key.replace("text_model.", "")
+        ait_name = name.replace(".", "_")
+        if name.endswith("out_proj.weight"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif name.endswith("out_proj.bias"):
+            ait_name = ait_name.replace("out_proj", "proj")
+        elif "q_proj" in name:
+            ait_name = ait_name.replace("q_proj", "proj_q")
+        elif "k_proj" in name:
+            ait_name = ait_name.replace("k_proj", "proj_k")
+        elif "v_proj" in name:
+            ait_name = ait_name.replace("v_proj", "proj_v")
+        params_ait[ait_name] = arr
+
+    return params_ait
+
+
+def map_controlnet_params(pt_mod):
+    pt_params = dict(pt_mod.named_parameters())
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if len(arr.shape) == 4:
+            arr = arr.permute((0, 2, 3, 1)).contiguous()
+        elif key.endswith("ff.net.0.proj.weight"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        elif key.endswith("ff.net.0.proj.bias"):
+            w1, w2 = arr.chunk(2, dim=0)
+            params_ait[key.replace(".", "_")] = w1
+            params_ait[key.replace(".", "_").replace("proj", "gate")] = w2
+            continue
+        params_ait[key.replace(".", "_")] = arr
+    params_ait["controlnet_cond_embedding_conv_in_weight"] = torch.nn.functional.pad(
+        params_ait["controlnet_cond_embedding_conv_in_weight"], (0, 1, 0, 0, 0, 0, 0, 0)
+    )
+    params_ait["arange"] = (
+        torch.arange(start=0, end=320 // 2, dtype=torch.float32).cuda().half()
+    )
+    return params_ait
+
+
+class StableDiffusionAITPipeline:
+    def __init__(self, hf_hub_or_path, ckpt):
+        self.device = torch.device("cuda")
+        workdir = "tmp/"
+        state_dict = None
+        if ckpt is not None:
+            state_dict = torch.load(ckpt, map_location="cpu")
+            while "state_dict" in state_dict:
+                state_dict = state_dict["state_dict"]
+            clip_state_dict = {}
+            unet_state_dict = {}
+            vae_state_dict = {}
+            for key in state_dict.keys():
+                if key.startswith("cond_stage_model.transformer."):
+                    new_key = key.replace("cond_stage_model.transformer.", "")
+                    clip_state_dict[new_key] = state_dict[key]
+                elif key.startswith("cond_stage_model.model."):
+                    new_key = key.replace("cond_stage_model.model.", "")
+                    clip_state_dict[new_key] = state_dict[key]
+                elif key.startswith("first_stage_model."):
+                    new_key = key.replace("first_stage_model.", "")
+                    vae_state_dict[new_key] = state_dict[key]
+                elif key.startswith("model.diffusion_model."):
+                    new_key = key.replace("model.diffusion_model.", "")
+                    unet_state_dict[new_key] = state_dict[key]
+            # TODO: SD2.x clip support, get from diffusers convert_from_ckpt.py
+            # clip_state_dict = convert_text_enc_state_dict(clip_state_dict)
+            unet_state_dict = convert_ldm_unet_checkpoint(unet_state_dict)
+            vae_state_dict = convert_ldm_vae_checkpoint(vae_state_dict)
+            state_dict = None
+
+        self.controlnet_ait_exe = self.init_ait_module("ControlNetModel", "./tmp")
+        print("Loading PyTorch ControlNet")
+        controlnet_pt = ControlNetModel.from_pretrained(
+            "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
+        ).to("cuda")
+        controlnet_pt.eval()
+        ait_params = map_controlnet_params(controlnet_pt)
+        self.controlnet_ait_exe.set_many_constants_with_tensors(ait_params)
+        self.controlnet_ait_exe.fold_constants()
+        self.clip_ait_exe = self.init_ait_module(
+            model_name="CLIPTextModel", workdir=workdir
+        )
+        print("Loading PyTorch CLIP")
+        if ckpt is None:
+            self.clip_pt = CLIPTextModel.from_pretrained(
+                hf_hub_or_path,
+                subfolder="text_encoder",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+        else:
+            config = CLIPTextConfig.from_pretrained(
+                hf_hub_or_path, subfolder="text_encoder"
+            )
+            self.clip_pt = CLIPTextModel(config)
+            self.clip_pt.load_state_dict(clip_state_dict)
+        clip_params_ait = map_clip_state_dict(dict(self.clip_pt.named_parameters()))
+        print("Setting constants")
+        self.clip_ait_exe.set_many_constants_with_tensors(clip_params_ait)
+        print("Folding constants")
+        self.clip_ait_exe.fold_constants()
+        # cleanup
+        self.clip_pt = None
+        clip_params_ait = None
+
+        self.unet_ait_exe = self.init_ait_module(
+            model_name="ControlNetUNet2DConditionModel", workdir=workdir
+        )
+
+        print("Loading PyTorch UNet")
+        if ckpt is None:
+            self.unet_pt = UNet2DConditionModel.from_pretrained(
+                hf_hub_or_path,
+                subfolder="unet",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+            self.unet_pt = self.unet_pt.state_dict()
+        else:
+            self.unet_pt = unet_state_dict
+        unet_params_ait = map_unet_state_dict(self.unet_pt)
+        print("Setting constants")
+        self.unet_ait_exe.set_many_constants_with_tensors(unet_params_ait)
+        print("Folding constants")
+        self.unet_ait_exe.fold_constants()
+        # cleanup
+        self.unet_pt = None
+        unet_params_ait = None
+
+        self.vae_ait_exe = self.init_ait_module(
+            model_name="AutoencoderKL", workdir=workdir
+        )
+        print("Loading PyTorch VAE")
+        if ckpt is None:
+            self.vae_pt = AutoencoderKL.from_pretrained(
+                hf_hub_or_path,
+                subfolder="vae",
+                revision="fp16",
+                torch_dtype=torch.float16,
+            ).cuda()
+        else:
+            self.vae_pt = dict(vae_state_dict)
+        in_channels = 3
+        out_channels = 3
+        down_block_types = [
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+        ]
+        up_block_types = [
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+        ]
+        block_out_channels = [128, 256, 512, 512]
+        layers_per_block = 2
+        act_fn = "silu"
+        latent_channels = 4
+        sample_size = 512
+
+        ait_vae = ait_AutoencoderKL(
+            1,
+            64,
+            64,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            down_block_types=down_block_types,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            latent_channels=latent_channels,
+            sample_size=sample_size,
+        )
+        print("Mapping parameters...")
+        vae_params_ait = map_vae_params(ait_vae, self.vae_pt)
+        print("Setting constants")
+        self.vae_ait_exe.set_many_constants_with_tensors(vae_params_ait)
+        print("Folding constants")
+        self.vae_ait_exe.fold_constants()
+        # cleanup
+        self.vae_pt = None
+        ait_vae = None
+        vae_params_ait = None
+
+        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+        )
+        self.batch = 1
+
+    def init_ait_module(
+        self,
+        model_name,
+        workdir,
+    ):
+        mod = Model(os.path.join(workdir, model_name, "test.so"))
+        return mod
+
+    def controlnet_inference(
+        self, latent_model_input, timesteps, encoder_hidden_states, controlnet_cond
+    ):
+        exe_module = self.controlnet_ait_exe
+        timesteps_pt = timesteps.expand(latent_model_input.shape[0])
+        inputs = {
+            "input0": latent_model_input.permute((0, 2, 3, 1))
+            .contiguous()
+            .cuda()
+            .half(),
+            "input1": timesteps_pt.cuda().half(),
+            "input2": encoder_hidden_states.cuda().half(),
+            "input3": controlnet_cond.permute((0, 2, 3, 1)).contiguous().cuda().half(),
+        }
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        down_block_residuals = (y for y in ys[:-1])
+        mid_block_residuals = ys[-1]
+        return down_block_residuals, mid_block_residuals
+
+    def unet_inference(
+        self,
+        latent_model_input,
+        timesteps,
+        encoder_hidden_states,
+        height,
+        width,
+        down_block_residuals,
+        mid_block_residual,
+    ):
+        exe_module = self.unet_ait_exe
+        timesteps_pt = timesteps.expand(self.batch * 2)
+        inputs = {
+            "input0": latent_model_input.permute((0, 2, 3, 1))
+            .contiguous()
+            .cuda()
+            .half(),
+            "input1": timesteps_pt.cuda().half(),
+            "input2": encoder_hidden_states.cuda().half(),
+        }
+        for i, y in enumerate(down_block_residuals):
+            inputs[f"down_block_residual_{i}"] = y
+        inputs["mid_block_residual"] = mid_block_residual
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
+            shape[1] = height // 8
+            shape[2] = width // 8
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        noise_pred = ys[0].permute((0, 3, 1, 2)).float()
+        return noise_pred
+
+    def clip_inference(self, input_ids, seqlen=77):
+        exe_module = self.clip_ait_exe
+        bs = input_ids.shape[0]
+        position_ids = torch.arange(seqlen).expand((bs, -1)).cuda()
+        inputs = {
+            "input0": input_ids,
+            "input1": position_ids,
+        }
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        return ys[0].float()
+
+    def vae_inference(self, vae_input, height, width):
+        exe_module = self.vae_ait_exe
+        inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()]
+        ys = []
+        num_outputs = len(exe_module.get_output_name_to_index_map())
+        for i in range(num_outputs):
+            shape = exe_module.get_output_maximum_shape(i)
+            shape[0] = self.batch * 2
+            shape[1] = height
+            shape[2] = width
+            ys.append(torch.empty(shape).cuda().half())
+        exe_module.run_with_tensors(inputs, ys, graph_mode=False)
+        vae_out = ys[0].permute((0, 3, 1, 2)).float()
+        return vae_out
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        control_cond: torch.FloatTensor,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined  as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        self.batch = batch_size
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.clip_inference(text_input.input_ids.to(self.device))
+        # pytorch equivalent
+        # text_embeddings = self.clip_pt(text_input.input_ids.to(self.device)).last_hidden_state
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            max_length = text_input.input_ids.shape[-1]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.clip_inference(
+                uncond_input.input_ids.to(self.device)
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_device = self.device
+        latents_shape = (batch_size, 4, height // 8, width // 8)
+        if latents is None:
+            latents = torch.randn(
+                latents_shape,
+                generator=generator,
+                device=latents_device,
+            )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(
+                    f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
+                )
+        latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(
+            inspect.signature(self.scheduler.set_timesteps).parameters.keys()
+        )
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+            # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
+        for t in tqdm(self.scheduler.timesteps):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            down_block_residuals, mid_block_residual = self.controlnet_inference(
+                latent_model_input, t, text_embeddings, control_cond
+            )
+            # predict the noise residual
+            noise_pred = self.unet_inference(
+                latent_model_input,
+                t,
+                encoder_hidden_states=text_embeddings,
+                height=height,
+                width=width,
+                down_block_residuals=down_block_residuals,
+                mid_block_residual=mid_block_residual,
+            )
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+
+            latents = self.scheduler.step(
+                noise_pred, t, latents, **extra_step_kwargs
+            ).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae_inference(latents, height, width)
+        # pytorch equivalent
+        # image = self.vae_pt.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )

From 8f622a1245c2ccc137e121b1ad2df9bf3a0d38f1 Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmwang@amazon.com>
Date: Wed, 24 May 2023 13:37:39 -0700
Subject: [PATCH 548/638] Fix fx2ait max/avg_pool with stride=None (#701)

Summary:
This PR fixed a minor issue in max_pool2d/3d, avg_pool2d fx2ait converter when the user did not explicitly provide stride that should default to kernel_size.

This is mainly for the following model to work with fx2ait.
```py
class MNIST(nn.Module):

  def __init__(self):
    super(MNIST, self).__init__()
    self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
    self.bn1 = nn.BatchNorm2d(10)
    self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
    self.bn2 = nn.BatchNorm2d(20)
    self.fc1 = nn.Linear(320, 50)
    self.fc2 = nn.Linear(50, 10)

  def forward(self, x):
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = self.bn1(x)
    x = F.relu(F.max_pool2d(self.conv2(x), 2))
    x = self.bn2(x)
    x = torch.flatten(x, 1)
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/701

Reviewed By: chenyang78

Differential Revision: D45970528

Pulled By: alexanderguzhva

fbshipit-source-id: 36da503ab487b43c81c272eb6b94273514201d95
---
 fx2ait/fx2ait/converters/ait_converters.py    | 28 ++++----
 .../test/converters/test_ait_pooling_ops.py   | 67 +++++++++++++++++++
 2 files changed, 82 insertions(+), 13 deletions(-)
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_pooling_ops.py

diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index e498b9722..88f7c8e34 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -1397,13 +1397,11 @@ def acc_ops_max_pool3d(
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
-    if (
-        isinstance(kwargs["kernel_size"], tuple)
-        and isinstance(kwargs["stride"], tuple)
-        and isinstance(kwargs["padding"], tuple)
+    if isinstance(kwargs["kernel_size"], tuple) and isinstance(
+        kwargs["padding"], tuple
     ):
         kernel_size_tuple = kwargs["kernel_size"]
-        stride_tuple = kwargs["stride"]
+        stride_tuple = kwargs["stride"] if kwargs["stride"] else kwargs["kernel_size"]
         padding_tuple = kwargs["padding"]
 
         assert kernel_size_tuple[0] == 1, "max_pool3d only supports kT == 1 currently"
@@ -1415,13 +1413,9 @@ def acc_ops_max_pool3d(
         kernel_size = identical_elem_tuple_to_int(kernel_size_tuple[1:])
         stride = identical_elem_tuple_to_int(stride_tuple[1:])
         padding = identical_elem_tuple_to_int(padding_tuple[1:])
-    elif (
-        isinstance(kwargs["kernel_size"], int)
-        and isinstance(kwargs["stride"], int)
-        and isinstance(kwargs["padding"], int)
-    ):
+    elif isinstance(kwargs["kernel_size"], int) and isinstance(kwargs["padding"], int):
         kernel_size = kwargs["kernel_size"]
-        stride = kwargs["stride"]
+        stride = kwargs["stride"] if kwargs["stride"] else kwargs["kernel_size"]
         padding = kwargs["padding"]
     else:
         raise RuntimeError("Only int or tuple types are supported")
@@ -1470,7 +1464,11 @@ def acc_ops_max_pool2d(
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
     kernel_size = identical_elem_tuple_to_int(kwargs["kernel_size"])
-    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    stride = (
+        identical_elem_tuple_to_int(kwargs["stride"])
+        if kwargs["stride"]
+        else kernel_size
+    )
     padding = identical_elem_tuple_to_int(kwargs["padding"])
     ceil_mode = kwargs["ceil_mode"]
     return_indices = kwargs["return_indices"]
@@ -1494,7 +1492,11 @@ def acc_ops_avg_pool2d(
         raise RuntimeError(f"Non-tensor inputs for {name}: {input_val}")
 
     kernel_size = identical_elem_tuple_to_int(kwargs["kernel_size"])
-    stride = identical_elem_tuple_to_int(kwargs["stride"])
+    stride = (
+        identical_elem_tuple_to_int(kwargs["stride"])
+        if kwargs["stride"]
+        else kernel_size
+    )
     padding = identical_elem_tuple_to_int(kwargs["padding"])
     ceil_mode = kwargs["ceil_mode"]
     count_include_pad = kwargs["count_include_pad"]
diff --git a/fx2ait/fx2ait/test/converters/test_ait_pooling_ops.py b/fx2ait/fx2ait/test/converters/test_ait_pooling_ops.py
new file mode 100644
index 000000000..c1c816a6a
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_pooling_ops.py
@@ -0,0 +1,67 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+import torch
+import torch.nn.functional as F
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import parameterized
+
+
+class TestAitPoolingConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            "avg_pool2d",
+            "max_pool2d",
+        ]
+    )
+    def test_pooling2d_with_default_inputs(self, opname):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fn = getattr(F, opname)
+
+            def forward(self, x):
+                return self.fn(x, 2)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 4, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={getattr(acc_ops, opname)},
+        )
+
+    @parameterized.expand(
+        [
+            "max_pool3d",
+        ]
+    )
+    def test_pooling3d_with_default_inputs(self, opname):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fn = getattr(F, opname)
+
+            def forward(self, x):
+                return self.fn(x, 1)
+
+        model = TestModule().half().cuda()
+        inputs = [torch.randn(1, 4, 8, 256, 256).cuda().half()]
+        self.run_test(
+            model,
+            inputs,
+            expected_ops={getattr(acc_ops, opname)},
+        )

From 7f3811e820451cf2e9b1ca9e1bbcb8b10bc77f9a Mon Sep 17 00:00:00 2001
From: chengscott <60510scott@gmail.com>
Date: Wed, 24 May 2023 19:14:53 -0700
Subject: [PATCH 549/638] nvcc options (#721)

Summary:
add LTO target and parallel threading for nvcc
reference: https://developer.nvidia.com/blog/improving-gpu-app-performance-with-cuda-11-2-device-lto

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/721

Reviewed By: alexanderguzhva

Differential Revision: D46149635

Pulled By: aakhundov

fbshipit-source-id: 62ec8239860ef504a027598a25358c050dad0ad8
---
 python/aitemplate/backend/cuda/target_def.py | 6 +++++-
 python/aitemplate/utils/environ.py           | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 471d8d915..d37452ddd 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -136,10 +136,14 @@ def get_host_compiler_options(self) -> List[str]:
         return self._build_gnu_host_compiler_options()
 
     def _build_nvcc_compiler_options(self) -> List[str]:
+        code = [f"sm_{self._arch}", f"compute_{self._arch}"]
+        if environ.enable_cuda_lto():
+            code += [f"lto_{self._arch}"]
         options = [
+            "-t=0",
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
             "-w",
-            f"-gencode=arch=compute_{self._arch},code=[sm_{self._arch},compute_{self._arch}]",
+            f"-gencode=arch=compute_{self._arch},code=[{','.join(code)}]",
             environ.get_compiler_opt_level(),
             "-std=c++17",
             "--expt-relaxed-constexpr",
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index c2c8d57e8..3971cfb1f 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -46,6 +46,14 @@ def use_fast_math() -> str:
     return os.getenv("AIT_USE_FAST_MATH", "1") == "1"
 
 
+def enable_cuda_lto() -> bool:
+    """
+    nvcc will use LTO flags during compilation
+    Default value is "0".
+    """
+    return os.getenv("AIT_ENABLE_CUDA_LTO", "0") == "1"
+
+
 def force_profiler_cache() -> bool:
     """
     Force the profiler to use the cached results. The profiler will throw

From 201937e57c92e922e120ff7d4e6e8f0b4ceb799f Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Wed, 24 May 2023 20:45:18 -0700
Subject: [PATCH 550/638] 2/n support layernorm and elementwise op (#711)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/711

Misc: to test it for debug, we can reduce the above json by keeping only what you want to test.

Test using TORCHINDUCTOR_MAX_AUTOTUNE=1
```
TORCHINDUCTOR_MAX_AUTOTUNE=1 CUDA_VISIBLE_DEVICES=7 buck2 run mode/opt -c=python.package_style=inplace -c fbcode.platform010_clang=12 aitemplate/AITemplate/benchmark:op_benchmark_driver -- --in-file=$HOME/ig_wei/all-shapes-simple.json  --out-file=$HOME/ig_shape_result.txt
 2>&1 | tee op_bench.log
```

Reviewed By: ipiszy

Differential Revision: D46005069

fbshipit-source-id: d1d4fbe6c4a3eb51c1e1687dd56efd1ac4b89152
---
 python/aitemplate/testing/profile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/testing/profile.py b/python/aitemplate/testing/profile.py
index 03be8ec95..d4ad64a54 100644
--- a/python/aitemplate/testing/profile.py
+++ b/python/aitemplate/testing/profile.py
@@ -93,7 +93,7 @@ def _f():
     n_groups = len(sorted_events) // n_iter
     # in each group (corresponding to a profiling iteration),
     # skip measuring the first kernel, which is the l2 cache flush
-    event_groups = [g[0:] for g in zip(*([iter(sorted_events)] * n_groups))]
+    event_groups = [g[1:] for g in zip(*([iter(sorted_events)] * n_groups))]
     logger.info(
         f"First kernel sequence: {list(map(itemgetter('name'), event_groups[0]))}"
     )

From 08bc584e410e67b52c7b7847d6f10e427363a08f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 25 May 2023 15:34:28 +0800
Subject: [PATCH 551/638] fix bugs

---
 .../scripts/download_pipeline.py              |  38 ----
 python/aitemplate/backend/backend_spec.py     |   2 +
 .../common/tensor/permute0213_common.py       |   7 +-
 .../rocm/gemm/gemm_rcr_bias_permute.py        |  26 ++-
 .../rocm/tensor/expand_static_shape.py        |   6 +-
 .../aitemplate/backend/rocm/tensor/repeat.h   | 190 ++++++++++++++++++
 6 files changed, 224 insertions(+), 45 deletions(-)
 delete mode 100644 examples/05_stable_diffusion/scripts/download_pipeline.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/repeat.h

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
deleted file mode 100644
index e5ffe56f0..000000000
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import click
-import torch
-from diffusers import StableDiffusionPipeline
-
-
-@click.command()
-@click.option("--token", default="", help="access token")
-@click.option(
-    "--save_directory",
-    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="pipeline files local directory",
-)
-def download_pipeline_files(token, save_directory) -> None:
-    StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
-        revision="fp16",
-        torch_dtype=torch.float16,
-        # use provided token or the one generated with `huggingface-cli login``
-        use_auth_token=token if token != "" else True,
-    ).save_pretrained(save_directory)
-
-
-if __name__ == "__main__":
-    download_pipeline_files()
diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 0c3f63cfb..201be64a1 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -408,6 +408,7 @@ class ROCMSpec(GPUBackendSpec):
     prefix = "hip"
     stream = "stream"
     cub = "hipcub"
+    tile_size = 64
 
     cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
@@ -446,6 +447,7 @@ class CUDASpec(GPUBackendSpec):
     prefix = "cuda"
     stream = "stream"
     cub = "cub"
+    tile_size = 32
 
     cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
diff --git a/python/aitemplate/backend/common/tensor/permute0213_common.py b/python/aitemplate/backend/common/tensor/permute0213_common.py
index cde9bda01..9e0f52382 100644
--- a/python/aitemplate/backend/common/tensor/permute0213_common.py
+++ b/python/aitemplate/backend/common/tensor/permute0213_common.py
@@ -185,7 +185,7 @@
     """
 {{header_files}}
 
-#define TILE_SIZE 32
+#define TILE_SIZE {{tile_size}}
 #define ITEMS_PER_THREAD 4
 #define DIRECT_BLOCK_Y 4
 #define DIRECT_BLOCK_Z 2
@@ -353,8 +353,8 @@
   } else {
     const int m = ((M + TILE_SIZE - 1) / TILE_SIZE);
 
-    dim3 grid((D + 31) / 32, (N + DIRECT_BLOCK_Y - 1) / DIRECT_BLOCK_Y, B * m);
-    dim3 block(32, DIRECT_BLOCK_Y, DIRECT_BLOCK_Z);  // x = 32, the warp size
+    dim3 grid((D + TILE_SIZE - 1) / TILE_SIZE, (N + DIRECT_BLOCK_Y - 1) / DIRECT_BLOCK_Y, B * m);
+    dim3 block(TILE_SIZE, DIRECT_BLOCK_Y, DIRECT_BLOCK_Z);  // x = TILE_SIZE, the warp size
 
     permute0213_direct_kernel<T><<<grid, block, 0, stream>>>(
       static_cast<T*>(out_ptr),
@@ -421,6 +421,7 @@ def gen_function(
         dtype=backend_spec.dtype_to_backend_type(xdtype),
     )
     return SRC_TEMPLATE.render(
+        tile_size=backend_spec.tile_size,
         function_name=func_name,
         exec_paths=exec_paths,
         header_files=header_files,
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
index ac5bbc6cc..85fab9657 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
@@ -18,11 +18,35 @@
 This is used for `torch.nn.functional.linear`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
+import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.rocm.gemm import common, permute_common
 from aitemplate.backend.rocm.gemm.layout import RCR
 
 
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+  int64_t M = std::stoi(argv[1]);
+  int64_t N = std::stoi(argv[2]);
+  int64_t K = std::stoi(argv[3]);
+  int64_t split_k = std::atoi(argv[4]);
+  int64_t G1 = std::atoi(argv[5]);
+  int64_t G2 = std::atoi(argv[6]);
+  int64_t G3 = std::atoi(argv[7]);
+  int64_t a_dim0 = M;
+  int64_t a_dim1 = K;
+  int64_t b_dim0 = N;
+  int64_t b_dim1 = K;
+  int64_t c_dim0 = M;
+  int64_t c_dim1 = N;
+  int64_t p_dim0 = G1;
+  int64_t p_dim1 = G2;
+  int64_t p_dim2 = G3;
+"""
+)
+
+
 @registry.reg("rocm.gemm_rcr_bias_permute.config")
 def gemm_config(func_attrs, dtype="float16"):
     """Extract (operation name, operation instance) pair from
@@ -64,7 +88,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
-        args_parse=RCR.args_parse,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
         gemm_flag="bias_permute",
         extra_code="const int G1={}, G2={}, G3={};".format(
             func_attrs["shape"][0],
diff --git a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
index 20fe5fa3f..c4da60ecc 100644
--- a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
+++ b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
@@ -199,13 +199,13 @@ def create_template_args(
     # Efficient vectorized & buffered repeat copy implementation,
     # even for odd shapes
     custom_libs = Target.current().get_custom_libs(
-        os.path.dirname(__file__), "repeat.cuh"
+        os.path.dirname(__file__), "repeat.h"
     )
     rocm_spec = ROCMSpec()
     dtype = rocm_spec.dtype_to_backend_dtype[x.dtype()]
     assert (
         dtype is not None
-    ), f"CUDA implementation does not support dtype {x.dtype()} (yet)"
+    ), f"ROCM implementation does not support dtype {x.dtype()} (yet)"
     dtype2 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 2, None)
     dtype4 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 4, None)
     xshape = x._attrs["shape"]
@@ -359,7 +359,7 @@ def create_template_args(
         "grid_blocks_x": grid_blocks_x,  # number of x grid blocks in the strided copy kernel
         "grid_threads_y": grid_threads_y,  # number of y threads per grid block in the strided copy kernel
         "grid_threads_x": grid_threads_x,  # number of x threads per grid block in the strided copy kernel
-        "custom_libs": custom_libs,  # custom library path, e.g. path to repeat.cuh
+        "custom_libs": custom_libs,  # custom library path, e.g. path to repeat.h
     }
 
 
diff --git a/python/aitemplate/backend/rocm/tensor/repeat.h b/python/aitemplate/backend/rocm/tensor/repeat.h
new file mode 100644
index 000000000..974fe7e51
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/repeat.h
@@ -0,0 +1,190 @@
+/**
+
+  Copyright (c) Meta Platforms, Inc. and affiliates.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+-
+
+Functions for repeating parts of a CUDA source tensor onto itself
+or into a target tensor.
+
+Used by expand_static_shape.py ( expand operator )
+
+*/
+
+#include "hip/hip_runtime.h"
+/**
+ * CUDA Kernel to copy elements repeatedly from a source memory
+ * region to a target memory region.
+ */
+__global__ void repeat_head_kernel(
+    const int64_t* const src, ///< source memory region. Must be 8-byte aligned
+    int64_t* data,
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies) ///< How many times to repeat it all into data
+{
+  extern __shared__ int64_t
+      shared[]; // preallocated to blockDim.x elements, typically 32
+  const size_t stride_y = blockDim.y * gridDim.y;
+  const size_t stride_x = blockDim.x * gridDim.x;
+
+  // outer grid-stride loop
+  for (size_t ri = blockDim.x * blockIdx.x + threadIdx.x;
+       ri < head_mem_num_elements;
+       ri += stride_x) {
+    // read only with one thread per y dim
+    if (threadIdx.y == 0) {
+      shared[threadIdx.x] = src[ri];
+    }
+    __syncthreads(); // wait for shared memory to be populated
+    // inner grid-stride loop, write with all threads out of shared memory
+    size_t wi = threadIdx.y + blockDim.y * blockIdx.y;
+    for (; wi < num_repeat_copies; wi += stride_y) {
+      // Note that this ensures coalesced writes, due to consecutive write
+      // accesses of threads in a Warp
+      data[ri + head_mem_num_elements * wi] = shared[threadIdx.x];
+    }
+  }
+}
+
+/**
+ * Copy an 8-byte aligned memory region, which has a byte size that is a
+ * multiple of 8 into an 8-byte aligned target memory region efficiently. Calls
+ * into repeat_head_kernel ( see above )
+ *
+ **/
+__host__ hipError_t cuda_repeat_head_vectorized(
+    const int64_t* const src, ///< Source memory region. Must be 8-byte aligned
+    int64_t*
+        data, /**< target memory region. Must be 8-byte aligned and have space
+              for head_mem_num_elements*num_repeat_copies int64_t elements. */
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies, ///< How many times to repeat it all into data
+    hipStream_t stream ///< CUDA stream
+) {
+  size_t threads_x = 64;
+  size_t threads_y = 1024 / threads_x;
+  size_t blocks_x = INT_CEIL_DIV(head_mem_num_elements, threads_x);
+  size_t blocks_y = INT_CEIL_DIV(num_repeat_copies, threads_y);
+  size_t serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks if necessary, so we do not exceed available shared
+  // memory
+  blocks_y = INT_CEIL_DIV(
+      blocks_y, serialization_level); // reduce thread count in y dimension
+                                      // first, e.g. sequentialized writes
+  serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks in x direction if this is not sufficient yet
+  blocks_x = INT_CEIL_DIV(blocks_x, serialization_level);
+  dim3 dimGrid(blocks_x, blocks_y);
+  dim3 dimBlock(threads_x, threads_y);
+  repeat_head_kernel<<<
+      dimGrid,
+      dimBlock,
+      threads_x * sizeof(int64_t),
+      stream>>>(src, data, head_mem_num_elements, num_repeat_copies);
+  return hipPeekAtLastError();
+}
+
+/**
+ * Repeatedly copy the beginning (head) section of a memory region an additonal
+ * num_repeat_copies times nto the memory region directly following that head,
+ * such that the end result will have this head data
+ * repeated 1+num_repeat_copies
+ */
+__host__ hipError_t cuda_repeat_head(
+    void* data, ///< pointer to CUDA memory of size (at least)
+                ///< head_mem_bytes*(num_repeat_copies+1)
+    const size_t head_mem_bytes, ///< How many bytes to repeat
+    size_t num_repeat_copies, ///< How many times to repeat it (in addition to
+                              ///< the existing head data)
+    hipStream_t stream ///< CUDA Stream to use
+) {
+  hipError_t res = hipSuccess;
+  if (num_repeat_copies == 0)
+    return res;
+  if ((head_mem_bytes % 8) == 0) {
+    // no need to double memory any further if it is 64-bit aligned
+    res = cuda_repeat_head_vectorized(
+        static_cast<const int64_t* const>(data),
+        static_cast<int64_t*>(data) + (head_mem_bytes / 8),
+        head_mem_bytes / 8,
+        num_repeat_copies,
+        stream);
+    if (res != hipSuccess) {
+      return res;
+    }
+  } else {
+    res = hipMemcpyAsync(
+        static_cast<void*>(static_cast<uint8_t*>(data) + head_mem_bytes),
+        data,
+        head_mem_bytes,
+        hipMemcpyDeviceToDevice,
+        stream);
+    if (res != hipSuccess) {
+      return res;
+    }
+    if (num_repeat_copies >= 2) {
+      // recurse
+      // we have already repeated 1 time, therefore the (num_repeat_copies-1)
+      res = cuda_repeat_head(
+          data, head_mem_bytes * 2, (num_repeat_copies - 1) / 2, stream);
+      if (res != hipSuccess) {
+        return res;
+      }
+      // deal with possible remainder
+      if (((num_repeat_copies - 1) % 2) == 1) {
+        res = hipMemcpyAsync(
+            static_cast<void*>(
+                static_cast<uint8_t*>(data) +
+                num_repeat_copies * head_mem_bytes),
+            data,
+            head_mem_bytes,
+            hipMemcpyDeviceToDevice,
+            stream);
+      }
+    }
+  }
+  return res;
+}
+
+/**
+ * Repeatedly copy a source memory region into a target memory region
+ * such that the end result will have the source data
+ * repeated num_repeat_copies
+ */
+__host__ hipError_t cuda_repeat_src(
+    const void* const src, ///< Source memory region (readonly)
+    void* data, ///< Destination memory region (read/write, size of at least
+                ///< num_repeat_copies*head_mem_bytes)
+    const size_t head_mem_bytes, ///< Size of source region to copy
+    size_t num_repeat_copies, ///< How many times to copy the data from source
+                              ///< into data
+    hipStream_t stream ///< CUDA stream to use
+) {
+  hipError_t res = hipSuccess;
+  if (num_repeat_copies == 0) {
+    return res;
+  }
+
+  res = hipMemcpyAsync(
+      data, src, head_mem_bytes, hipMemcpyDeviceToDevice, stream);
+  if ((res != hipSuccess) || (num_repeat_copies == 1)) {
+    return res;
+  }
+  return cuda_repeat_head(data, head_mem_bytes, num_repeat_copies - 1, stream);
+}

From 02460f6533d5b0d70ff8158535b55b76be26e9c0 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Thu, 25 May 2023 09:11:56 -0700
Subject: [PATCH 552/638] Back out "fix bert example" (#728)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/728

ATT

Reviewed By: alexanderguzhva

Differential Revision: D46186394

fbshipit-source-id: 01a41450ebe5a07e963da131c764731b3a6f0ce3
---
 examples/03_bert/benchmark_ait.py    | 4 ++--
 examples/03_bert/demo.py             | 6 +++---
 examples/03_bert/test_correctness.py | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 5d9b5622c..a16244a9a 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -24,8 +24,8 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
-from modeling.bert import BertBaseEncodersOnly, BertBaseUncased
-from modeling.torch_model import BertBaseUncased as BertPt
+from .modeling.bert import BertBaseEncodersOnly, BertBaseUncased
+from .modeling.torch_model import BertBaseUncased as BertPt
 
 
 def mark_output(y: Tensor) -> None:
diff --git a/examples/03_bert/demo.py b/examples/03_bert/demo.py
index fedf5c642..f23dcf9d7 100644
--- a/examples/03_bert/demo.py
+++ b/examples/03_bert/demo.py
@@ -16,11 +16,11 @@
 
 import torch
 
-from benchmark_ait import compile_module
-from modeling.torch_model import BertBaseUncased as BertPt
-
 from transformers import BertTokenizer
 
+from .benchmark_ait import compile_module
+from .modeling.torch_model import BertBaseUncased as BertPt
+
 
 def prepare_data(prompt: str, model_path: str):
     tokenizer = BertTokenizer.from_pretrained(model_path)
diff --git a/examples/03_bert/test_correctness.py b/examples/03_bert/test_correctness.py
index 4a65f4d35..7cf6d4201 100644
--- a/examples/03_bert/test_correctness.py
+++ b/examples/03_bert/test_correctness.py
@@ -18,7 +18,7 @@
 
 import torch
 
-from demo import run_model
+from .demo import run_model
 
 try:
     from libfb.py.asyncio.await_utils import await_sync

From 89711d9cdc14044cb170163bf189963b9e8eaa08 Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Thu, 25 May 2023 11:14:00 -0700
Subject: [PATCH 553/638] Stable Diffusion Alt fixes #722 #723 (#724)

Summary:
* Adds mapping for SD2.x LDM CLIP
* Fix mapping for LDM VAE
* Update readme

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/724

Reviewed By: alexanderguzhva

Differential Revision: D46181788

Pulled By: aakhundov

fbshipit-source-id: 3736eaed90e8cf69f460f1a1fa4227d655093d10
---
 examples/05_stable_diffusion/README.md        |  9 ++-
 .../src/compile_lib/compile_vae_alt.py        | 48 ++++++++++--
 .../modeling/controlnet_unet_2d_condition.py  |  5 +-
 .../src/pipeline_stable_diffusion_ait_alt.py  | 77 ++++++++++++++++++-
 4 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index 624cfcd47..e9ca953f9 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -52,12 +52,13 @@ The original pipeline requires a diffusers model local dir, and relies directly
 
 * AITemplate modules are created
 * Model weights are loaded, converted/mapped, then applied to AITemplate module
-* Scheduler and tokenizer are created from `runwayml/stable-diffusion-v1-5` and `openai/clip-vit-large-patch14` respectively
+* Tokenizer is created from `openai/clip-vit-large-patch14`.
+* Scheduler is created from `hf-hub-or-path`.
+* Loading CLIPTextModel from `ckpt` requires the appropriate `hf-hub-or-path` to be specified i.e. `runwayml/stable-diffusion-v1-5` for SD1.x checkpoints, `stabilityai/stable-diffusion-2-1` for SD2.x checkpoints.
 
 ```
-python3 scripts/demo.py --hf-hub-or-path runwayml/stable-diffusion-v1-5
-or
-python3 scripts/demo.py --ckpt v1-5-pruned-emaonly.ckpt
+python3 scripts/demo.py --hf-hub-or-path runwayml/stable-diffusion-v1-5 --ckpt v1-5-pruned-emaonly.ckpt
+python3 scripts/demo.py --hf-hub-or-path stabilityai/stable-diffusion-2-1 --ckpt v2-1_768-ema-pruned.ckpt
 ```
 
 `--ckpt` takes preference over `--hf-hub-or-path` if both are specified
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
index 559194d6f..1b7dd81d9 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
@@ -45,37 +45,69 @@ def map_vae_params(ait_module, pt_module, batch_size=1, seq_len=4096):
         elif name.endswith("attention.proj.weight"):
             prefix = name[: -len("attention.proj.weight")]
             pt_name = prefix + "proj_attn.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_out.0.weight"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj.bias"):
             prefix = name[: -len("attention.proj.bias")]
             pt_name = prefix + "proj_attn.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_out.0.bias"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.cu_length"):
             ...
         elif name.endswith("attention.proj_q.weight"):
             prefix = name[: -len("attention.proj_q.weight")]
             pt_name = prefix + "query.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_q.weight"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj_q.bias"):
             prefix = name[: -len("attention.proj_q.bias")]
             pt_name = prefix + "query.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_q.bias"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj_k.weight"):
             prefix = name[: -len("attention.proj_k.weight")]
             pt_name = prefix + "key.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_k.weight"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj_k.bias"):
             prefix = name[: -len("attention.proj_k.bias")]
             pt_name = prefix + "key.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_k.bias"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj_v.weight"):
             prefix = name[: -len("attention.proj_v.weight")]
             pt_name = prefix + "value.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_v.weight"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         elif name.endswith("attention.proj_v.bias"):
             prefix = name[: -len("attention.proj_v.bias")]
             pt_name = prefix + "value.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+            if pt_name in pt_params:
+                mapped_pt_params[ait_name] = pt_params[pt_name]
+            else:
+                pt_name = prefix + "to_v.bias"
+                mapped_pt_params[ait_name] = pt_params[pt_name]
         else:
             pt_param = pt_module.get_parameter(name)
             mapped_pt_params[ait_name] = pt_param
diff --git a/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
index 327d601b5..56da472ab 100644
--- a/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
@@ -125,10 +125,7 @@ def __init__(
         )
 
         # control net conditioning embedding
-        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
-            conditioning_embedding_channels=block_out_channels[0],
-            block_out_channels=conditioning_embedding_out_channels,
-        )
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding()
 
         self.down_blocks = nn.ModuleList([])
         self.controlnet_down_blocks = nn.ModuleList([])
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
index ce0abeaa8..78c9911f6 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
@@ -15,6 +15,7 @@
 import inspect
 
 import os
+import re
 from typing import List, Optional, Union
 
 import torch
@@ -540,6 +541,74 @@ def convert_ldm_unet_checkpoint(unet_state_dict, layers_per_block=2):
     return new_checkpoint
 
 
+textenc_conversion_lst = [
+    ("positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("ln_final.bias", "text_model.final_layer_norm.bias"),
+]
+textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
+
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    (
+        "token_embedding.weight",
+        "transformer.text_model.embeddings.token_embedding.weight",
+    ),
+    (
+        "positional_embedding",
+        "transformer.text_model.embeddings.position_embedding.weight",
+    ),
+]
+protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+
+def convert_text_enc_state_dict(state_dict):
+    if "transformer.resblocks.22.ln_1.bias" not in state_dict.keys():
+        return state_dict  # SD1.x
+    new_state_dict = {}
+    d_model = 1024
+    for key, arr in state_dict.items():
+        if "resblocks.23" in key:
+            continue  # diffusers skips the last layer
+        if key in textenc_conversion_map:
+            new_state_dict[textenc_conversion_map[key]] = arr
+        if key.startswith("transformer."):
+            new_key = key[len("transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(
+                    lambda m: protected[re.escape(m.group(0))], new_key
+                )
+                new_state_dict[new_key + ".q_proj.weight"] = arr[:d_model, :]
+                new_state_dict[new_key + ".k_proj.weight"] = arr[
+                    d_model : d_model * 2, :
+                ]
+                new_state_dict[new_key + ".v_proj.weight"] = arr[d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(
+                    lambda m: protected[re.escape(m.group(0))], new_key
+                )
+                new_state_dict[new_key + ".q_proj.bias"] = arr[:d_model]
+                new_state_dict[new_key + ".k_proj.bias"] = arr[d_model : d_model * 2]
+                new_state_dict[new_key + ".v_proj.bias"] = arr[d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(
+                    lambda m: protected[re.escape(m.group(0))], new_key
+                )
+                new_state_dict[new_key] = arr
+    return new_state_dict
+
+
 # =========================#
 #    AITemplate mapping   #
 # =========================#
@@ -615,8 +684,7 @@ def __init__(self, hf_hub_or_path, ckpt):
                 elif key.startswith("model.diffusion_model."):
                     new_key = key.replace("model.diffusion_model.", "")
                     unet_state_dict[new_key] = state_dict[key]
-            # TODO: SD2.x clip support, get from diffusers convert_from_ckpt.py
-            # clip_state_dict = convert_text_enc_state_dict(clip_state_dict)
+            clip_state_dict = convert_text_enc_state_dict(clip_state_dict)
             unet_state_dict = convert_ldm_unet_checkpoint(unet_state_dict)
             vae_state_dict = convert_ldm_vae_checkpoint(vae_state_dict)
             state_dict = None
@@ -636,6 +704,9 @@ def __init__(self, hf_hub_or_path, ckpt):
                 hf_hub_or_path, subfolder="text_encoder"
             )
             self.clip_pt = CLIPTextModel(config)
+            clip_state_dict[
+                "text_model.embeddings.position_ids"
+            ] = self.clip_pt.text_model.embeddings.get_buffer("position_ids")
             self.clip_pt.load_state_dict(clip_state_dict)
         clip_params_ait = map_clip_state_dict(dict(self.clip_pt.named_parameters()))
         print("Setting constants")
@@ -730,7 +801,7 @@ def __init__(self, hf_hub_or_path, ckpt):
 
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
         self.scheduler = EulerDiscreteScheduler.from_pretrained(
-            "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+            hf_hub_or_path, subfolder="scheduler"
         )
         self.batch = 1
 

From 126bb1df84c0b2b8ad63eff073310f5e8ba65f9d Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sat, 27 May 2023 10:35:44 -0700
Subject: [PATCH 554/638] Sync CUTLASS with upstream again (#727)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/727

ATT

Reviewed By: chenyang78

Differential Revision: D46184719

fbshipit-source-id: ef34ed0bd4cbe9f007989dc966b51f336c97842f
---
 3rdparty/cutlass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 8ca363e59..6320758d1 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 8ca363e59ef734fdeab3f5b7b85cc37804628040
+Subproject commit 6320758d1c7163662ce15a6e16d62f9732912063

From 50d82667a0d33697cd3cecf52842dfb453953f2c Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Mon, 29 May 2023 17:59:01 +0800
Subject: [PATCH 555/638] add missing script

---
 .../scripts/download_pipeline.py              | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 examples/05_stable_diffusion/scripts/download_pipeline.py

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
new file mode 100644
index 000000000..cde89ef0b
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -0,0 +1,38 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import click
+import torch
+from diffusers import StableDiffusionPipeline
+
+
+@click.command()
+@click.option("--token", default="", help="access token")
+@click.option(
+    "--save_directory",
+    default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
+    help="pipeline files local directory",
+)
+def download_pipeline_files(token, save_directory) -> None:
+    StableDiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-2",
+        revision="fp16",
+        torch_dtype=torch.float16,
+        # use provided token or the one generated with `huggingface-cli login``
+        use_auth_token=token if token != "" else True,
+    ).save_pretrained(save_directory)
+
+
+if __name__ == "__main__":
+    download_pipeline_files()
\ No newline at end of file

From c246a8642fa672940793cf0fa67f4f2f3da66d6e Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 29 May 2023 03:11:19 -0700
Subject: [PATCH 556/638] Split test_strided_layernorm test cases (#729)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/729

Individual test cases of `test_strided_layernorm` are running up to 8 * 3 = 24 model compilations within one test function. Here we split them into smaller test cases for better parallelism.

Reviewed By: chenyang78

Differential Revision: D46255422

fbshipit-source-id: 210f95875da787d0ee5685617f260f2b47c87255
---
 .../compiler/test_strided_layernorm.py        | 218 ++++++++++--------
 1 file changed, 123 insertions(+), 95 deletions(-)

diff --git a/tests/unittest/compiler/test_strided_layernorm.py b/tests/unittest/compiler/test_strided_layernorm.py
index 9bf86cbc3..160145f8b 100644
--- a/tests/unittest/compiler/test_strided_layernorm.py
+++ b/tests/unittest/compiler/test_strided_layernorm.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import itertools
 import unittest
 from typing import List
 
@@ -22,6 +21,7 @@
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 from aitemplate.utils import shape_utils, torch_utils
+from parameterized import param, parameterized
 
 
 def build_ait_module(
@@ -139,6 +139,7 @@ def eval_pt(
 class SliceLayerNormTestCase(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(SliceLayerNormTestCase, self).__init__(*args, **kwargs)
+        torch.manual_seed(0)
         self._test_id = 0
 
     def _test_slice_layer_norm(
@@ -241,113 +242,140 @@ def _test_middle_slice_layer_norm_kernels(
                 **kwargs,
             )
 
-    def test_slice_layer_norm_float16(self):
-        for (
-            n_normalize_over_last_dims,
-            gamma_is_none,
-            beta_is_none,
-        ) in itertools.product(
-            (1, 3),
-            (True, False),
-            (True, False),
-        ):
-            self._test_slice_layer_norm_kernels(
-                n_normalize_over_last_dims=n_normalize_over_last_dims,
-                gamma_is_none=gamma_is_none,
-                beta_is_none=beta_is_none,
-                fuse_sigmoid_mul=False,
-                test_name="test_slice_layer_norm_float16",
-            )
-
-    def test_middle_slice_layer_norm_float16(self):
-        for (
-            n_normalize_over_last_dims,
-            gamma_is_none,
-            beta_is_none,
-        ) in itertools.product(
-            (2, 3),
-            (True, False),
-            (True, False),
-        ):
-            self._test_middle_slice_layer_norm_kernels(
-                n_normalize_over_last_dims=n_normalize_over_last_dims,
-                gamma_is_none=gamma_is_none,
-                beta_is_none=beta_is_none,
-                fuse_sigmoid_mul=False,
-                test_name="test_middle_slice_layer_norm_float16",
-            )
-
-    def test_slice_layer_norm_fuse_sigmoid_mul_float16(self):
-        for (
-            n_normalize_over_last_dims,
-            gamma_is_none,
-            beta_is_none,
-        ) in itertools.product(
-            (1, 3),
-            (True, False),
-            (True, False),
-        ):
-            self._test_slice_layer_norm_kernels(
-                n_normalize_over_last_dims=n_normalize_over_last_dims,
-                gamma_is_none=gamma_is_none,
-                beta_is_none=beta_is_none,
-                fuse_sigmoid_mul=True,
-                test_name="test_slice_layer_norm_fuse_sigmoid_mul_float16",
-            )
-
-    def test_middle_slice_layer_norm_fuse_sigmoid_mul_float16(self):
-        for (
-            n_normalize_over_last_dims,
-            gamma_is_none,
-            beta_is_none,
-        ) in itertools.product(
-            (2, 3),
-            (True, False),
-            (True, False),
-        ):
-            self._test_middle_slice_layer_norm_kernels(
-                n_normalize_over_last_dims=n_normalize_over_last_dims,
-                gamma_is_none=gamma_is_none,
-                beta_is_none=beta_is_none,
-                fuse_sigmoid_mul=True,
-                test_name="test_middle_slice_layer_norm_fuse_sigmoid_mul_float16",
-            )
-
-    @unittest.skipIf(
-        detect_target().name() != "cuda", "fp32 is only supported in CUDA backend"
+    @parameterized.expand(
+        [
+            param(0, 1, True, True),
+            param(1, 1, True, False),
+            param(2, 1, False, True),
+            param(3, 1, False, False),
+            param(4, 3, True, True),
+            param(5, 3, True, False),
+            param(6, 3, False, True),
+            param(7, 3, False, False),
+        ]
     )
-    def test_slice_layer_norm_float32(self):
+    def test_slice_layer_norm_float16(
+        self,
+        test_id,
+        n_normalize_over_last_dims,
+        gamma_is_none,
+        beta_is_none,
+    ):
         self._test_slice_layer_norm_kernels(
-            n_normalize_over_last_dims=1,
-            gamma_is_none=True,
-            beta_is_none=True,
+            n_normalize_over_last_dims=n_normalize_over_last_dims,
+            gamma_is_none=gamma_is_none,
+            beta_is_none=beta_is_none,
             fuse_sigmoid_mul=False,
-            dtype="float32",
-            test_name="test_slice_layer_norm_float32_1",
+            test_name=f"test_slice_layer_norm_float16_{test_id}",
         )
+
+    @parameterized.expand(
+        [
+            param(0, 2, True, True),
+            param(1, 2, True, False),
+            param(2, 2, False, True),
+            param(3, 2, False, False),
+            param(4, 3, True, True),
+            param(5, 3, True, False),
+            param(6, 3, False, True),
+            param(7, 3, False, False),
+        ]
+    )
+    def test_middle_slice_layer_norm_float16(
+        self,
+        test_id,
+        n_normalize_over_last_dims,
+        gamma_is_none,
+        beta_is_none,
+    ):
         self._test_middle_slice_layer_norm_kernels(
-            n_normalize_over_last_dims=2,
-            gamma_is_none=True,
-            beta_is_none=False,
+            n_normalize_over_last_dims=n_normalize_over_last_dims,
+            gamma_is_none=gamma_is_none,
+            beta_is_none=beta_is_none,
             fuse_sigmoid_mul=False,
-            dtype="float32",
-            test_name="test_slice_layer_norm_float32_2",
+            test_name=f"test_middle_slice_layer_norm_float16_{test_id}",
         )
+
+    @parameterized.expand(
+        [
+            param(0, 1, True, True),
+            param(1, 1, True, False),
+            param(2, 1, False, True),
+            param(3, 1, False, False),
+            param(4, 3, True, True),
+            param(5, 3, True, False),
+            param(6, 3, False, True),
+            param(7, 3, False, False),
+        ]
+    )
+    def test_slice_layer_norm_fuse_sigmoid_mul_float16(
+        self,
+        test_id,
+        n_normalize_over_last_dims,
+        gamma_is_none,
+        beta_is_none,
+    ):
         self._test_slice_layer_norm_kernels(
-            n_normalize_over_last_dims=3,
-            gamma_is_none=False,
-            beta_is_none=True,
+            n_normalize_over_last_dims=n_normalize_over_last_dims,
+            gamma_is_none=gamma_is_none,
+            beta_is_none=beta_is_none,
             fuse_sigmoid_mul=True,
-            dtype="float32",
-            test_name="test_slice_layer_norm_float32_3",
+            test_name=f"test_slice_layer_norm_fuse_sigmoid_mul_float16_{test_id}",
         )
+
+    @parameterized.expand(
+        [
+            param(0, 2, True, True),
+            param(1, 2, True, False),
+            param(2, 2, False, True),
+            param(3, 2, False, False),
+            param(4, 3, True, True),
+            param(5, 3, True, False),
+            param(6, 3, False, True),
+            param(7, 3, False, False),
+        ]
+    )
+    def test_middle_slice_layer_norm_fuse_sigmoid_mul_float16(
+        self,
+        test_id,
+        n_normalize_over_last_dims,
+        gamma_is_none,
+        beta_is_none,
+    ):
         self._test_middle_slice_layer_norm_kernels(
-            n_normalize_over_last_dims=2,
-            gamma_is_none=False,
-            beta_is_none=False,
+            n_normalize_over_last_dims=n_normalize_over_last_dims,
+            gamma_is_none=gamma_is_none,
+            beta_is_none=beta_is_none,
             fuse_sigmoid_mul=True,
+            test_name=f"test_middle_slice_layer_norm_fuse_sigmoid_mul_float16_{test_id}",
+        )
+
+    @unittest.skipIf(
+        detect_target().name() != "cuda", "fp32 is only supported in CUDA backend"
+    )
+    @parameterized.expand(
+        [
+            param(0, 1, True, True, False),
+            param(1, 2, True, False, False),
+            param(2, 3, False, True, True),
+            param(3, 2, False, False, True),
+        ]
+    )
+    def test_slice_layer_norm_float32(
+        self,
+        test_id,
+        n_normalize_over_last_dims,
+        gamma_is_none,
+        beta_is_none,
+        fuse_sigmoid_mul,
+    ):
+        self._test_slice_layer_norm_kernels(
+            n_normalize_over_last_dims=n_normalize_over_last_dims,
+            gamma_is_none=gamma_is_none,
+            beta_is_none=beta_is_none,
+            fuse_sigmoid_mul=fuse_sigmoid_mul,
             dtype="float32",
-            test_name="test_slice_layer_norm_float32_4",
+            test_name=f"test_slice_layer_norm_float32_{test_id}",
         )
 
 
From 42790e39a559331c10bfa3e514026f4b71f5111c Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 31 May 2023 14:30:27 +0800
Subject: [PATCH 557/638] fix stable diffusion example

---
 .../src/compile_lib/compile_clip.py           | 46 ++++++++++--
 .../src/compile_lib/compile_unet.py           |  9 ++-
 .../src/compile_lib/compile_vae.py            |  9 ++-
 .../05_stable_diffusion/src/modeling/clip.py  | 70 +++++++++++++------
 .../transform/fuse_mm_reshape_permute.py      | 15 ++--
 python/aitemplate/frontend/nn/attention.py    |  3 +-
 6 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index a85aee84f..19b499f40 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -19,6 +19,10 @@
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
 from .util import mark_output
+import torch
+
+
+USE_CUDA = detect_target().name() == "cuda"
 
 
 def map_clip_params(pt_mod, batch_size, seqlen, depth):
@@ -31,12 +35,40 @@ def map_clip_params(pt_mod, batch_size, seqlen, depth):
             ait_name = ait_name.replace("out_proj", "proj")
         elif name.endswith("out_proj.bias"):
             ait_name = ait_name.replace("out_proj", "proj")
-        elif "q_proj" in name:
-            ait_name = ait_name.replace("q_proj", "proj_q")
-        elif "k_proj" in name:
-            ait_name = ait_name.replace("k_proj", "proj_k")
-        elif "v_proj" in name:
-            ait_name = ait_name.replace("v_proj", "proj_v")
+        elif USE_CUDA:
+            if "q_proj" in name:
+                ait_name = ait_name.replace("q_proj", "proj_q")
+            elif "k_proj" in name:
+                ait_name = ait_name.replace("k_proj", "proj_k")
+            elif "v_proj" in name:
+                ait_name = ait_name.replace("v_proj", "proj_v")
+        else:
+            if name.endswith("q_proj.weight"):
+                ait_name = ait_name.replace("q_proj", "qkv")
+                prefix = key[: -len("q_proj.weight")]
+                q = pt_params[prefix + "q_proj.weight"]
+                k = pt_params[prefix + "k_proj.weight"]
+                v = pt_params[prefix + "v_proj.weight"]
+                qkv_weight = torch.cat([q, k, v], dim=0)
+                params_ait[ait_name] = qkv_weight
+                continue
+            elif name.endswith("q_proj.bias"):
+                ait_name = ait_name.replace("q_proj", "qkv")
+                prefix = key[: -len("q_proj.bias")]
+                q = pt_params[prefix + "q_proj.bias"]
+                k = pt_params[prefix + "k_proj.bias"]
+                v = pt_params[prefix + "v_proj.bias"]
+                qkv_bias = torch.cat([q, k, v], dim=0)
+                params_ait[ait_name] = qkv_bias
+                continue
+            elif name.endswith("k_proj.weight"):
+                continue
+            elif name.endswith("k_proj.bias"):
+                continue
+            elif name.endswith("v_proj.weight"):
+                continue
+            elif name.endswith("v_proj.bias"):
+                continue
         params_ait[ait_name] = arr
 
     return params_ait
@@ -70,7 +102,7 @@ def compile_clip(
 
     pt_mod = pt_mod.eval()
     params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
-    batch_size = IntVar(values=[1, 8], name="batch_size")
+    batch_size = IntVar(values=[1, 8], name="batch_size") if USE_CUDA else batch_size
 
     input_ids_ait = Tensor(
         [batch_size, seqlen], name="input0", dtype="int64", is_input=True
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index c4233c1e4..35c9c624a 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -24,6 +24,9 @@
 from .util import mark_output
 
 
+USE_CUDA = detect_target().name() == "cuda"
+
+
 def map_unet_params(pt_mod, dim):
     pt_params = dict(pt_mod.named_parameters())
     params_ait = {}
@@ -73,11 +76,11 @@ def compile_unet(
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
     # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
+    height = IntVar(values=[32, 64], name="height") if USE_CUDA else height
+    width = IntVar(values=[32, 64], name="width") if USE_CUDA else width
 
     latent_model_input_ait = Tensor(
-        [batch_size, height_d, width_d, 4], name="input0", is_input=True
+        [batch_size, height, width, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
     text_embeddings_pt_ait = Tensor(
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index e9c2d4964..24cecfa36 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -22,6 +22,9 @@
 from .util import mark_output
 
 
+USE_CUDA = detect_target().name() == "cuda"
+
+
 def map_vae_params(ait_module, pt_module, batch_size, seq_len):
     pt_params = dict(pt_module.named_parameters())
     mapped_pt_params = {}
@@ -124,11 +127,11 @@ def compile_vae(
         sample_size=sample_size,
     )
     # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
+    height = IntVar(values=[32, 64], name="height") if USE_CUDA else height
+    width = IntVar(values=[32, 64], name="width") if USE_CUDA else width
 
     ait_input = Tensor(
-        shape=[batch_size, height_d, width_d, latent_channels],
+        shape=[batch_size, height, width, latent_channels],
         name="vae_input",
         is_input=True,
     )
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 110468331..2b39427ec 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -73,21 +73,32 @@ def forward(self, x, context=None, mask=None, residual=None):
         k = self.to_k(context)
         v = self.to_v(context)
 
-        bs = q.shape()[0]
+        bs = x.shape()[0]
 
-        q = ops.reshape()(q, [bs, -1, self.heads, self.dim_head])
-        k = ops.reshape()(k, [bs, -1, self.heads, self.dim_head])
-        v = ops.reshape()(v, [bs, -1, self.heads, self.dim_head])
+        q = ops.reshape()(q, [bs, -1, nheads, d])
+        k = ops.reshape()(k, [bs, -1, nheads, d])
+        v = ops.reshape()(v, [bs, -1, nheads, d])
         q = ops.permute()(q, [0, 2, 1, 3])
         k = ops.permute()(k, [0, 2, 1, 3])
         v = ops.permute()(v, [0, 2, 1, 3])
-
-        attn_op = ops.mem_eff_attention(causal=False)
-        out = attn_op(
-            (ops.reshape()(q, [bs, nheads, -1, d])),
-            (ops.reshape()(k, [bs, nheads, -1, d])),
-            (ops.reshape()(v, [bs, nheads, -1, d])),
-        )
+        if USE_CUDA:
+            attn_op = ops.mem_eff_attention(causal=False)
+            out = attn_op(
+                (ops.reshape()(q, [bs, nheads, -1, d])),
+                (ops.reshape()(k, [bs, nheads, -1, d])),
+                (ops.reshape()(v, [bs, nheads, -1, d])),
+            )
+        else:
+            attn_op = ops.bmm_softmax_bmm_permute(
+                shape=(nheads,),
+                scale=d**-0.5,
+                causal=False,
+            )
+            out = attn_op(
+                ops.reshape()(q, [bs * nheads, -1, d]),
+                ops.reshape()(k, [bs * nheads, -1, d]),
+                ops.reshape()(v, [bs * nheads, -1, d])
+            )
         out = ops.reshape()(out, [bs, -1, nheads * d])
         proj = self.to_out(out)
         proj = ops.reshape()(proj, [bs, -1, nheads * d])
@@ -381,14 +392,24 @@ def __init__(
     ):
         super().__init__()
         self.embed_dim = hidden_size
-        self.self_attn = nn.CrossAttention(
-            hidden_size,
-            seq_len,
-            seq_len,
-            num_attention_heads,
-            qkv_bias=True,
-            causal=causal,
-        )
+        if USE_CUDA:
+            self.self_attn = nn.CrossAttention(
+                hidden_size,
+                seq_len,
+                seq_len,
+                num_attention_heads,
+                qkv_bias=True,
+                causal=causal,
+            )
+        else:
+            self.self_attn = nn.MultiheadAttention(
+                hidden_size,
+                batch_size,
+                seq_len,
+                num_attention_heads,
+                qkv_bias=True,
+                causal=causal,
+            )
 
         self.layer_norm1 = nn.LayerNorm(self.embed_dim)
         self.mlp = self.ACT_LAYER_TO_CLIP_MLP_MAP[act_layer](
@@ -414,9 +435,14 @@ def forward(
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states, hidden_states, hidden_states, residual
-        )
+        if USE_CUDA:
+            hidden_states = self.self_attn(
+                hidden_states, hidden_states, hidden_states, residual
+            )
+        else:
+            hidden_states = self.self_attn(
+                hidden_states, residual
+            )
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
diff --git a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
index d8e9370c5..5089f78a5 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
@@ -23,6 +23,7 @@
 from aitemplate.compiler.transform.toposort import toposort
 
 from aitemplate.utils import graph_utils
+from aitemplate.testing import detect_target
 
 
 def _check_reshape(op: Operator) -> bool:
@@ -155,7 +156,8 @@ def _fuse_gemm_reshape_permute0213(
         _, d1, d2, _ = reshape_output.shape()
         d1_v = d1.value()
         d2_v = d2.value()
-        gemm_permute_op = gemm_rcr_permute(shape=(d1_v, d2_v), layout="0213")
+        layout = "20314" if detect_target().name() == "cuda" else "m2n3"
+        gemm_permute_op = gemm_rcr_permute(shape=(d1_v, d2_v), layout=layout)
         a, b = op._attrs["inputs"]
         transform_utils.remove_dst_op_from_tensor(a, op)
         transform_utils.remove_dst_op_from_tensor(b, op)
@@ -185,10 +187,13 @@ def fuse_mm_reshape_permute(
     Returns:
         List[Tensor]: optimized graph
     """
-
-    funcs = [
-        _fuse_gemm_reshape_permute0213,
-    ]
+    if detect_target().name() == "cuda":
+        funcs = [
+            _fuse_gemm_reshape_permute0213,
+        ]
+    else:
+        funcs = []
+        
     for func in funcs:
         sorted_graph = func(sorted_graph)
     return sorted_graph
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index f1a58b81c..da1309339 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -399,8 +399,7 @@ def forward(self, *args, seqlens=None):
         x = args[0]
         batch = x.shape()[0]
         attn_output = self.attention(args[0], args[1], args[2], seqlens=seqlens)
-                
-        attn_output = ops.reshape()(attn_output, [-1, self.dim])
+        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])
 
         if self.has_residual:
             assert len(args) == 4

From a68c83fd7a693a1a308fa6fbb422d7eeeca120d0 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 31 May 2023 14:42:02 +0800
Subject: [PATCH 558/638] format code

---
 python/aitemplate/backend/common/tensor/identity_common.py | 2 +-
 .../aitemplate/backend/rocm/attention/mem_eff_attention.py | 2 +-
 python/aitemplate/backend/rocm/gemm/common.py              | 1 +
 python/aitemplate/backend/rocm/tensor/__init__.py          | 4 ++--
 python/aitemplate/backend/rocm/tensor/identity.py          | 7 ++++++-
 python/aitemplate/backend/task_runner.py                   | 1 -
 python/aitemplate/compiler/ops/groupnorm/groupnorm.py      | 6 +++++-
 .../compiler/transform/fuse_mm_reshape_permute.py          | 4 ++--
 .../aitemplate/compiler/transform/transform_special_ops.py | 2 +-
 python/aitemplate/frontend/nn/attention.py                 | 7 ++++---
 10 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/python/aitemplate/backend/common/tensor/identity_common.py b/python/aitemplate/backend/common/tensor/identity_common.py
index 34c75fec5..94b2e8632 100644
--- a/python/aitemplate/backend/common/tensor/identity_common.py
+++ b/python/aitemplate/backend/common/tensor/identity_common.py
@@ -62,7 +62,7 @@
 )
 
 
-def gen_function(func_attrs: Dict[str, Any], backend_spec, extra_headers='') -> str:
+def gen_function(func_attrs: Dict[str, Any], backend_spec, extra_headers="") -> str:
     """Generates function.
 
     Parameters
diff --git a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
index 078b4e0c7..f902792c1 100644
--- a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
+++ b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
@@ -346,7 +346,7 @@ def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
     num_heads = q._attrs["shape"][1]._attrs["values"][0]
     max_seqlen = q._attrs["shape"][0].upper_bound() // 16
     head_dim = q._attrs["shape"][3]._attrs["values"][0]
-    
+
     softmax_scale = head_dim ** (-0.5)
 
     return FUNC_CALL_TEMPLATE.render(
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 7b5c2427a..7156a4d87 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -1000,6 +1000,7 @@ def fproc_f16(op):
             b_layout=b_layout,
             c_layout=c_layout,
         )
+
     has_dynamic_shape = False
     for inp in func_attrs["inputs"]:
         for dim in inp.shape():
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 4c2e528be..62e7f3b00 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -21,6 +21,8 @@
     concatenate,
     concatenate_tanh,
     dynamic_slice,
+    expand,
+    expand_static_shape,
     full,
     identity,
     permute021,
@@ -31,6 +33,4 @@
     slice_scatter,
     split,
     topk,
-    expand,
-    expand_static_shape,
 )
diff --git a/python/aitemplate/backend/rocm/tensor/identity.py b/python/aitemplate/backend/rocm/tensor/identity.py
index 90a2c29ea..d5d59b4cf 100644
--- a/python/aitemplate/backend/rocm/tensor/identity.py
+++ b/python/aitemplate/backend/rocm/tensor/identity.py
@@ -29,6 +29,7 @@
     """
 )
 
+
 @registry.reg("rocm.identity.func_decl")
 def gen_function_decl(func_attrs):
     """Generate function declaration.
@@ -61,7 +62,11 @@ def gen_function(func_attrs):
     str
         Rendered function body.
     """
-    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=ROCMSpec(), extra_headers=EXTRA_HEADERS.render())
+    return identity_common.gen_function(
+        func_attrs=func_attrs,
+        backend_spec=ROCMSpec(),
+        extra_headers=EXTRA_HEADERS.render(),
+    )
 
 
 @registry.reg("rocm.identity.func_call")
diff --git a/python/aitemplate/backend/task_runner.py b/python/aitemplate/backend/task_runner.py
index 54509cb19..fd0c3d0dd 100644
--- a/python/aitemplate/backend/task_runner.py
+++ b/python/aitemplate/backend/task_runner.py
@@ -26,7 +26,6 @@
 from typing import List
 
 
-
 # pylint: disable=R1732,R1710,R1721
 class Task:
     """Task is an object containing a bash command,
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index b0e2a7bfd..667d162df 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -324,7 +324,11 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
 
         if len(result) == 0:
             raise RuntimeError(
-                "Profile workload: " f"{self._attrs['op']}" f"{exec_key}" " failed. " f"Results: {result}."
+                "Profile workload: "
+                f"{self._attrs['op']}"
+                f"{exec_key}"
+                " failed. "
+                f"Results: {result}."
             )
 
         out = min(result, key=lambda x: x[1].duration)
diff --git a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
index 5089f78a5..e03023a54 100644
--- a/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
+++ b/python/aitemplate/compiler/transform/fuse_mm_reshape_permute.py
@@ -21,9 +21,9 @@
 from aitemplate.compiler.ops import gemm_rcr_permute
 from aitemplate.compiler.transform import transform_utils
 from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.testing import detect_target
 
 from aitemplate.utils import graph_utils
-from aitemplate.testing import detect_target
 
 
 def _check_reshape(op: Operator) -> bool:
@@ -193,7 +193,7 @@ def fuse_mm_reshape_permute(
         ]
     else:
         funcs = []
-        
+
     for func in funcs:
         sorted_graph = func(sorted_graph)
     return sorted_graph
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index cc9b5264a..41577cec3 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -297,7 +297,7 @@ def transform_special_ops(
     funcs = [
         _transform_1x1_conv_gemm_rcr,
     ]
-    
+
     if "transform_conv_to_gemm" in Target.current()._kwargs:
         if Target.current()._kwargs["transform_conv_to_gemm"]:
             for func in funcs:
diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index da1309339..83576d5cc 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -358,10 +358,11 @@ def attention(self, q, k, v, seqlens=None):
         query = self.proj_q(q)
         key = self.proj_k(k)
         value = self.proj_v(v)
-        
+
         if detect_target().name() == "cuda":
             query = ops.permute()(
-                ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
+                ops.reshape()(query, [batch, -1, self.num_heads, head_dim]),
+                [0, 2, 1, 3],
             )
             key = ops.permute()(
                 ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
@@ -391,7 +392,7 @@ def attention(self, q, k, v, seqlens=None):
             scale=head_dim**-0.5,
             causal=self.causal,
         )
-        return OP(query, key, value)  
+        return OP(query, key, value)
 
     def forward(self, *args, seqlens=None):
         """forward pass for calling mha module"""

From dacd043e38db5492e284eb9b79f4d0f82eaa71f9 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 31 May 2023 15:30:53 +0800
Subject: [PATCH 559/638] update enabled ci types

---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index a3db6e76b..61c93d643 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -2,7 +2,7 @@ name: ROCM_CI
 
 on: 
   pull_request:
-    types: [labeled]
+    types: [labeled, synchronize, reopened]
 
 jobs:
   build:

From 395dc7940c3f68bdee86b0df214d59363bd7f2b3 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Wed, 31 May 2023 01:30:44 -0700
Subject: [PATCH 560/638] Grouped Classic B2B BMM 1 ( copy base impl ) (#736)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/736

Copying the classic b2b bmm operator as a basis to rewrite it into a jagged / grouped version. This copying is separated out into this diff, in order to make the actual rewrite (second diff) easier to review.

Reviewed By: aakhundov

Differential Revision: D45880251

fbshipit-source-id: 43238990ac3dfa7b5f02a09590c2489fba74cbd7
---
 .../backend/cuda/b2b_bmm/__init__.py          |   1 +
 .../cuda/b2b_bmm/grouped_classic_b2b_bmm.py   | 379 ++++++++
 .../compiler/ops/b2b_bmm/__init__.py          |   3 +
 .../ops/b2b_bmm/grouped_classic_b2b_bmm.py    | 247 +++++
 .../device/b2b_batched_gemm.h                 | 444 +++++++++
 .../kernel/b2b_batched_gemm.h                 | 431 +++++++++
 .../kernel/default_b2b_batched_gemm.h         | 222 +++++
 .../thread/linear_combination_triu.h          | 136 +++
 .../threadblock/b2b_mma_base.h                | 241 +++++
 .../threadblock/b2b_mma_multistage.h          | 878 ++++++++++++++++++
 .../threadblock/b2b_mma_pipelined.h           | 562 +++++++++++
 .../threadblock/custom_epilogue_tensor_op.h   | 858 +++++++++++++++++
 .../threadblock/default_b2b_mma.h             | 383 ++++++++
 .../default_gmem_to_accum_loader_tensor_op.h  | 202 ++++
 .../threadblock/gmem_to_accum_loader.h        | 361 +++++++
 ...mem_to_accum_loader_shared_load_iterator.h | 274 ++++++
 ...accum_loader_fragment_iterator_tensor_op.h | 315 +++++++
 .../triu_mma_tensor_op_fragment_iterator.h    | 235 +++++
 tests/unittest/ops/test_b2b_bmm.py            | 175 ++++
 19 files changed, 6347 insertions(+)
 create mode 100644 python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
 create mode 100644 python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/thread/linear_combination_triu.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
index c7ac172ee..ed65514f5 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/__init__.py
@@ -21,5 +21,6 @@
 from aitemplate.backend.cuda.b2b_bmm import (
     classic_b2b_bmm,
     fmha_style_b2b_bmm,
+    grouped_classic_b2b_bmm,
     grouped_fmha_style_b2b_bmm,
 )
diff --git a/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
new file mode 100644
index 000000000..c2e9124a9
--- /dev/null
+++ b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
@@ -0,0 +1,379 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+classic_b2b_bmm kernel codegen for CUDA.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "grouped_classic_b2b_bmm/device/b2b_batched_gemm.h"
+
+namespace {
+
+// Hardcode these sizes for now until we get profiling ready.
+constexpr int ThreadblockM = 64;
+constexpr int ThreadblockK = 32;
+constexpr int WarpK = 32;
+constexpr int InstructionM = 16;
+constexpr int InstructionN = 8;
+constexpr int InstructionK = 16;
+
+// Currently, causal mask is only supported with warp shape M of 16.
+// While we ought to debug it, the warp shape M restriction is not considered
+// high-priority as we do not want to make warp M much larger anyway. If you want
+// to explore the perf-impact of tuning this, then you can turn off causal mask after
+// gemm 0 and see what the perf result is.
+constexpr int WarpM = 16;
+
+constexpr int N0 = {{n0}};
+constexpr int N1 = {{n1}};
+
+void check_status(cutlass::Status status, int64_t m0, int64_t k0, const std::string& message) {
+  if (status != cutlass::Status::kSuccess) {
+      throw std::runtime_error(
+        message +
+        "Function: {{function_name}}. "
+        "m0: " + std::to_string(m0) +
+        ", k0: " + std::to_string(k0) +
+        ", n0: " + std::to_string({{n0}}) +
+        ", n1: " + std::to_string({{n1}}) + "."
+      );
+  }
+  return;
+}
+
+}  // end namespace
+
+{{func_signature}} {
+  using ElementOutput = {{elem_output_type}};
+  using ElementAccumulator = {{elem_accum_type}};
+  using ElementCompute = {{elem_input_type}};
+
+  ElementCompute alpha0 = ElementCompute({{alpha0}});
+  ElementCompute beta0 = ElementCompute(1);
+  ElementCompute activation_alpha = ElementCompute({{alpha1}});
+  {% if alpha1_divide_by_seq_len %}
+  activation_alpha = activation_alpha / (ElementCompute)(static_cast<int32_t>(m0));
+  {% endif %}
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<ThreadblockM, N0, ThreadblockK>;
+  using WarpShape0 = cutlass::gemm::GemmShape<WarpM, N0, WarpK>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<ThreadblockM, N1, ThreadblockK>;
+  using WarpShape1 = cutlass::gemm::GemmShape<WarpM, N1, WarpK>;
+  using InstructionShape = cutlass::gemm::GemmShape<InstructionM, InstructionN, InstructionK>;
+
+  using EpilogueOutputOp0 =
+    cutlass::epilogue::thread::LinearCombinationGeneric<
+      {{epilogue_math}},
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      // Saves a little time in the epilogue by not multiplying the source by beta.
+      cutlass::epilogue::thread::ScaleType::NoBetaScaling
+    >;
+
+  using EpilogueOutputOp1 =
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::Nothing
+    >;
+
+  using GroupedB2bGemmBatched = cutlass::gemm::device::GroupedB2bGemmBatched<
+    ElementCompute,
+    cutlass::layout::RowMajor,
+    ElementCompute,
+    cutlass::layout::ColumnMajor,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3,
+    {{has_causal}} // enable causal mask after gemm0
+  >;
+
+  cutlass::gemm::GemmCoord problem_size_0(m0, {{n0}}, k0);
+  cutlass::gemm::GemmCoord problem_size_1(m0, {{n1}}, {{n0}});
+
+  // Assuming BMHD dim ordering for inputs and outputs, like in FHMA style op
+  // B = batch size
+  // M = sequence len
+  // H = num heads
+  // D = embedding dims per head
+  // --- Tensor shapes:
+  // GEMM PROBLEM 0:
+  // A=query : [ batch_size, M0, num_heads, K0 ]
+  // B=key : [ batch_size, N0, num_heads, K0 ]
+  // C0=bias : [ batch_size, num_heads, M0, N0 ] # Where the batch size, head and M0 dimension may be broadcasted over
+  // GEMM PROBLEM 1:
+  // B1=value : [ batch_size, K1==N0, num_heads, N1 ]
+  // C1=unused:  [ N1 ]
+  // D1=output : [ batch_size, M1==M0, num_heads, N1 ]
+
+  // Required equalities for B2B gemm:
+  // M1 = M0;
+  // K1 = N0;
+
+  typename GroupedB2bGemmBatched::Arguments arguments{
+    problem_size_0, // = GemmCoord problem_size_0;
+    problem_size_1, // = GemmCoord problem_size_1;
+    {static_cast<ElementCompute*>(query), typename GroupedB2bGemmBatched::LayoutA::Stride(num_heads * problem_size_0.k())},    // TensorRef<ElementA const, LayoutA> ref_A0;
+    problem_size_0.k(),                                                                                                 // int64_t head_stride_A0;
+    num_heads * problem_size_0.m() * problem_size_0.k(),                                                                // int64_t batch_stride_A0;
+    {static_cast<ElementCompute*>(key), typename GroupedB2bGemmBatched::LayoutB::Stride(num_heads * problem_size_0.k())},      // TensorRef<ElementB const, LayoutB> ref_B0;
+    problem_size_0.k(),                                                                                                 // int64_t head_stride_B0;
+    num_heads * problem_size_0.n() * problem_size_0.k(),                                                                // int64_t batch_stride_B0;
+    {static_cast<ElementCompute*>(bias), typename GroupedB2bGemmBatched::LayoutC::Stride({{bias_stride_n}})},                  // TensorRef<ElementC const, LayoutC> ref_C0;
+    {{bias_stride_mn}},                                                                                                 // int64_t head_stride_C0;
+    {{bias_stride_hmn}},                                                                                                // int64_t batch_stride_C0;
+    {static_cast<ElementCompute*>(value), typename GroupedB2bGemmBatched::LayoutB1::Stride(num_heads * problem_size_1.n())},   // TensorRef<ElementC const, LayoutC> ref_B1;
+    problem_size_1.n(),                                                                                                 // int64_t head_stride_B1;                                                                    //
+    num_heads * problem_size_1.n() * problem_size_1.k(),                                                                // int64_t batch_stride_B1;
+    {static_cast<ElementCompute*>(nullptr), typename GroupedB2bGemmBatched::LayoutScaleBias::Stride(0)},                       // Not used due to ScaleType::Nothing for output op 1
+    0,                                                                                                                  // not used: int64_t head_stride_C1;
+    0,                                                                                                                  // not used: int64_t batch_stride_C1;
+    {static_cast<ElementOutput*>(output), typename GroupedB2bGemmBatched::LayoutC::Stride(num_heads * problem_size_1.n())},    // TensorRef<ElementC, LayoutC> ref_D1;
+    problem_size_1.n(),                                                                                                 // int64_t head_stride_output;
+    num_heads * problem_size_1.m() * problem_size_1.n(),                                                                // int64_t batch_stride_output;
+    batch_size,                                                                                                         // int batch_count;
+    num_heads,                                                                                                          // int num_heads
+    {alpha0, beta0, activation_alpha},                                                                                  // typename EpilogueOutputOp0::Params epilogue0;
+    {alpha1, beta1},                                                                                                    // typename EpilogueOutputOp1::Params epilogue1;
+  };
+
+  GroupedB2bGemmBatched b2b_gemm_op;
+  check_status(
+    b2b_gemm_op.can_implement(arguments),
+    m0, k0,
+    "Problem sizes are not supported."
+  );
+  check_status(
+    b2b_gemm_op.initialize(arguments),
+    m0, k0,
+    "classic_b2b_bmm initialization failed!"
+  );
+  check_status(
+    b2b_gemm_op(stream),
+    m0, k0,
+    "classic_b2b_bmm failed to execute!"
+  );
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   void* query,
+                   void* key,
+                   void* value,
+                   void* bias,
+                   int64_t batch_size,
+                   int64_t num_heads,
+                   int64_t m0,
+                   int64_t k0,
+                   cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
+{{indent}}    {{batch_size}},
+{{indent}}    {{num_heads}},
+{{indent}}    {{m0}},
+{{indent}}    {{k0}},
+{{indent}}    stream /* default stream */
+{{indent}});
+    """
+)
+
+
+@registry.reg("cuda.grouped_classic_b2b_bmm.gen_function")
+def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    q, k, v, bias = func_attrs["inputs"]
+    seq_len_dim = 1
+    n0 = k._attrs["shape"][seq_len_dim]
+    n1 = v._attrs["shape"][-1]
+    if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
+        raise RuntimeError(
+            f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
+        )
+    backend_spec = CUDASpec()
+    if func_attrs["inputs"][0]._attrs["dtype"] != "float16":
+        raise NotImplementedError(
+            "only float16 dtype supported for now in classic_b2b_bmm op"
+        )
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    if (
+        "use_fp16_acc" in Target.current()._kwargs
+        and Target.current()._kwargs["use_fp16_acc"]
+    ):
+        elem_accum_type = "cutlass::half_t"
+    else:
+        elem_accum_type = "float"
+
+    import cutlass_lib
+
+    epilogue_math = cutlass_lib.library.EpilogueMathTag[
+        cutlass_lib.library.EpilogueMathName[func_attrs["epilogue_math_name"]]
+    ]
+
+    bias_shape = bias._attrs["shape"]
+    bias_broadcast = [s == IntImm(1) for s in bias_shape]
+    if len(bias_broadcast) == 3:
+        # single head case: Add num heads dimension of size 1
+        bias_broadcast = [bias_broadcast[0], True, bias_broadcast[1], bias_broadcast[2]]
+    assert (
+        len(bias_broadcast) == 4
+    ), f"Bias shape should be of length 4, got {len(bias_broadcast)=}"
+
+    # Calculate stride expressions for bias tensor
+    # Last dimension of bias has implicit stride of 1,
+    # so cannot be broadcasted over
+    bias_stride_n = "problem_size_0.n()"
+    bias_shape_expr = [bias_stride_n]
+
+    # build stride expressions
+    if not bias_broadcast[-2]:
+        bias_shape_expr.append("problem_size_0.m()")
+    bias_stride_mn = "*".join(bias_shape_expr)
+    if not bias_broadcast[-3]:
+        bias_shape_expr.append("num_heads")
+    bias_stride_hmn = "*".join(bias_shape_expr)  # batch stride
+
+    # Strides for broadcasted dimensions are zero
+    if bias_broadcast[0]:  # query sequence len stride
+        bias_stride_hmn = "0"
+    if bias_broadcast[1]:  # head stride
+        bias_stride_mn = "0"
+    if bias_broadcast[2]:  # query sequence length stride
+        bias_stride_n = "0"
+
+    return FUNC_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+        elem_accum_type=elem_accum_type,
+        n0=str(n0.value()),
+        n1=str(n1.value()),
+        has_causal=(
+            "true" if func_attrs["causal_type"] != CausalType.NO_CAUSAL else "false"
+        ),
+        alpha0=str(func_attrs["alpha0"]),
+        alpha1=str(func_attrs["alpha1"]),
+        alpha1_divide_by_seq_len="true"
+        if func_attrs["alpha1_divide_by_seq_len"]
+        else "false",
+        epilogue_math=epilogue_math,
+        bias_stride_n=bias_stride_n,
+        bias_stride_mn=bias_stride_mn,
+        bias_stride_hmn=bias_stride_hmn,
+    )
+
+
+@registry.reg("cuda.grouped_classic_b2b_bmm.func_decl")
+def classic_b2b_bmm_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("cuda.grouped_classic_b2b_bmm.func_call")
+def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) == 4
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+    bias_name = func_attrs["inputs"][3]._attrs["name"]
+
+    q_shape = func_attrs["inputs"][0]._attrs["shape"]
+
+    batch_size = q_shape[0]._attrs["name"]
+    seq_len_dim = 1
+    head_dim = -2
+    m0 = q_shape[seq_len_dim]._attrs["name"]
+
+    if len(q_shape) == 3:
+        # single head case
+        k0 = q_shape[2]._attrs["name"]
+        num_heads = "1"
+    elif len(q_shape) == 4:
+        k0 = q_shape[3]._attrs["name"]
+        num_heads = q_shape[head_dim]._attrs["name"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        query=q_name,
+        key=k_name,
+        value=v_name,
+        bias=bias_name,
+        batch_size=batch_size,
+        num_heads=num_heads,
+        m0=m0,
+        k0=k0,
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
index e0bad2b1d..d77a078b2 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/__init__.py
@@ -19,6 +19,9 @@
 
 from aitemplate.compiler.ops.b2b_bmm.classic_b2b_bmm import classic_b2b_bmm
 from aitemplate.compiler.ops.b2b_bmm.fmha_style_b2b_bmm import fmha_style_b2b_bmm
+from aitemplate.compiler.ops.b2b_bmm.grouped_classic_b2b_bmm import (
+    grouped_classic_b2b_bmm,
+)
 from aitemplate.compiler.ops.b2b_bmm.grouped_fmha_style_b2b_bmm import (
     grouped_fmha_style_b2b_bmm,
 )
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
new file mode 100644
index 000000000..39090c342
--- /dev/null
+++ b/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
@@ -0,0 +1,247 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Back-to-back batched gemm fused kernel.
+Computes bmm(causal_mask(alpha1 * (activation(alpha0 * bmm(Q, K) + bias))), V),
+
+Notation:
+B: batch size
+H: number of heads
+
+If inputs/outputs have three dims ( singlehead case ):
+Q: [B, M0, K0] (row_major),
+K: [B, N0, K0] (column_major),
+V: [B, N0, N1] (row_major),
+bias: [B, M0, N0] (row_major).
+output: [ B, M0, N1 ]
+
+If inputs/outputs have four dims ( multihead case ),
+the head dim is located at the dimension with index 2
+
+dimension order of the parameters is
+
+Q: [B, M0, H, K0] (row_major),
+K: [B, N0, H, K0] (column_major),
+V: [B, N0, H, N1] (row_major),
+bias: [B, H, M0, N0] (row_major).
+Output: [ B, M0, H, N1 ]
+
+Only supports NO_CAUSAL or LOWER_LEFT_EMPTY causal mask types.
+When causal_mask is enabled, M0 must be equal to N0.
+
+Internally, it stores the results of Q@K in registers without writing them to shared memory, which is faster.
+However, N0 and N1 must be <= 512.
+"""
+
+from aitemplate.backend import registry, target
+from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
+
+
+class grouped_classic_b2b_bmm(b2b_bmm_base):
+    def __init__(
+        self,
+        causal_type: CausalType,
+        epilogue_math_name: str,
+        alpha0: float,
+        alpha1: float,
+        alpha1_divide_by_seq_len: bool = False,
+    ) -> None:
+        r"""Back-to-back batched gemm fused kernels.
+
+        More detailed documentation at the top of this file.
+
+        Args:
+        * causal_type (CausalType): Type of causal_mask. See comments above.
+        * epilogue_math_name (str): Name of the activation function.
+        Supported epilogue functions can be found from
+        python/aitemplate/utils/mk_cutlass_lib/extra_enum.py.
+        * alpha0 (float): See the math function above.
+        * alpha1 (float): See the math function above.
+        * alpha1_divide_by_seq_len (bool) Whether divide alpha1 by seq_len.
+        Useful when seq_len is a dynamic value so that alpah1 cannot be
+        computed in advance.
+        """
+        super().__init__(
+            causal_type, epilogue_math_name, alpha0, alpha1, alpha1_divide_by_seq_len
+        )
+        self._attrs["op"] = "grouped_classic_b2b_bmm"
+        if (
+            causal_type != CausalType.NO_CAUSAL
+            and causal_type != CausalType.LOWER_LEFT_EMPTY
+        ):
+            raise NotImplementedError(
+                f"grouped_classic_b2b_bmm only supports NO_CAUSAL or LOWER_LEFT_EMPTY. Current causal type: {causal_type}"
+            )
+
+    def _infer_shapes(self):
+        """infer the output shape for grouped_classic_b2b_bmm."""
+        q, k, v, bias = self._attrs["inputs"]
+        q_shape = q._attrs["shape"]
+        k_shape = k._attrs["shape"]
+        v_shape = v._attrs["shape"]
+        head_dim = 2
+        seq_dim = 1
+        if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
+            raise RuntimeError(
+                f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if len(q_shape) != 3 and len(k_shape) != 4:
+            raise RuntimeError(
+                f"QKV must have rank 3 or 4! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
+            raise RuntimeError(
+                f"QKV must have same batch size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        batch_size = q_shape[0]
+        M0 = q_shape[seq_dim]
+        K0 = q_shape[-1]
+        if K0 != k_shape[-1]:
+            raise RuntimeError(
+                f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N0 = k_shape[seq_dim]
+        if N0 != v_shape[seq_dim]:
+            raise RuntimeError(
+                f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        N1 = v_shape[-1]
+        if N0.upper_bound() > 512 or N1.upper_bound() > 512:
+            raise RuntimeError(
+                f"classic_b2b_bmm only supports <=512 N0 / N1. Current length: {N0=}, {N1=}"
+            )
+        if not isinstance(N0, IntImm) or not isinstance(N1, IntImm):
+            raise RuntimeError(
+                f"classic_b2b_bmm only supports static N0 / N1. Current {N0=}, {N1=}."
+            )
+        if self._attrs["causal_type"] != CausalType.NO_CAUSAL:
+            if M0 != N0:
+                raise RuntimeError(
+                    f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
+                )
+        bias_shape = bias._attrs["shape"]
+
+        is_multihead = len(q_shape) == 4
+        if is_multihead:
+            num_heads = q_shape[head_dim]
+
+            output_shape = [batch_size, M0, num_heads, N1]
+            if len(bias_shape) != 4:
+                raise RuntimeError(
+                    f"Was expecting 4-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
+                )
+            for bias_dim, expected_dim in zip(
+                bias_shape, [batch_size, num_heads, M0, N0]
+            ):
+                if bias_dim != IntImm(1) and bias_dim != expected_dim:
+                    raise RuntimeError(
+                        f"bias shape is not compatible with Q K! "
+                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
+                        f"bias shapes: {bias_shape=}."
+                    )
+            # key sequence length is identical to last shape dim of bias tensor
+            # so if it is also constant 1, it is not a real broadcast and permissible
+            if bias_shape[-1] == IntImm(1) and k_shape[seq_dim] != IntImm(1):
+                raise RuntimeError(
+                    "grouped_classic_b2b_bmm op does not support broadcasting of last dimension of bias tensor (e.g. over sequence length of key and value ). Use the expand op to emulate this broadcast behavior if you need it."
+                )
+        else:
+            num_heads = IntImm(1)
+            self._attrs["num_heads"] = num_heads
+            output_shape = [batch_size, M0, N1]
+            if len(bias_shape) != 3:
+                raise RuntimeError(
+                    f"Was expecting 3-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
+                )
+            for bias_dim, expected_dim in zip(bias_shape, [batch_size, M0, N0]):
+                if bias_dim != IntImm(1) and bias_dim != expected_dim:
+                    raise RuntimeError(
+                        f"bias shape is not compatible with Q K! "
+                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
+                        f"bias shapes: {bias_shape=}."
+                    )
+
+        return output_shape
+
+    def __call__(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        bias: Tensor,
+    ) -> Tensor:
+        """call the op
+
+        Note: [H,] means optional num-heads,
+        if it exists for one input tensor, all need to have it,
+        Parameters
+        ----------
+        q: Tensor, shape(B, M0, [H,] K0)
+        k: Tensor, shape(B, N0, [H,] K0)
+        v: Tensor, shape(B, N0, [H,] N1)
+        bias: Tensor, shape(B, [H,] M0, N0)
+
+        Returns
+        ----------
+        Tensor, shape(B, M0, [H,], N1)
+        """
+
+        self._attrs["inputs"] = [q, k, v, bias]
+        self._set_depth()
+        output_shape = self._infer_shapes()
+        self._check_alignment()
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=self._attrs["inputs"][0]._attrs["dtype"],
+        )
+        self._attrs["outputs"] = [output]
+
+        return output
+
+    def _get_op_attributes(self):
+        target_attrs = [
+            "causal_type",
+            "epilogue_math_name",
+            "alpha0",
+            "alpha1",
+            "alpha1_divide_by_seq_len",
+        ]
+        attr = {}
+
+        for target_attr in target_attrs:
+            if target_attr in self._attrs:
+                attr[target_attr] = self._attrs[target_attr]
+
+        return attr
+
+    def gen_function(self) -> str:
+        """call backend functions"""
+        current_target = target.Target.current()
+        if current_target.name() == "rocm" or (
+            current_target.name() == "cuda" and int(current_target._arch) < 80
+        ):
+            raise NotImplementedError(
+                "grouped_classic_b2b_bmm is only supported by CUDA>=SM80 devices."
+            )
+        func_key = "{target}.{op}.gen_function".format(
+            target=current_target.name(), op=self._attrs["op"]
+        )
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
new file mode 100644
index 000000000..0b9215f09
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
@@ -0,0 +1,444 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+    This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+
+#include "grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h"
+#include "grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0 = false,
+    /// Stage accumulator in shared memory
+    bool SmemAccumulator = false,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class GroupedB2bGemmBatched {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using LayoutB1 = LayoutB1_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape0 = ThreadblockShape0_;
+  using ThreadblockShape1 = ThreadblockShape1_;
+  using WarpShape0 = WarpShape0_;
+  using WarpShape1 = WarpShape1_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp0 = EpilogueOutputOp0_;
+  using EpilogueOutputOp1 = EpilogueOutputOp1_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp1::kCount;
+  static bool const kCausalMaskAfterGemm0 = CausalMaskAfterGemm0;
+
+  /// Derived types
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor;
+
+  /// Define the kernel
+  using GroupedB2bGemmBatchedKernel = typename kernel::DefaultGroupedB2bGemmBatched<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    LayoutB1,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    ThreadblockSwizzle,
+    kStages,
+    Operator,
+    CausalMaskAfterGemm0,
+    SmemAccumulator
+  >::B2bGemmBatchedKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size_0;
+    GemmCoord problem_size_1;
+    TensorRef<ElementA const, LayoutA> ref_A0;
+    int64_t head_stride_A0;
+    int64_t batch_stride_A0;
+    TensorRef<ElementB const, LayoutB> ref_B0;
+    int64_t head_stride_B0;
+    int64_t batch_stride_B0;
+    TensorRef<ElementC const, LayoutC> ref_C0;
+    int64_t head_stride_C0;
+    int64_t batch_stride_C0;
+    TensorRef<ElementB const, LayoutB1> ref_B1;
+    int64_t head_stride_B1;
+    int64_t batch_stride_B1;
+    TensorRef<ElementC const, LayoutC> ref_C1;
+    int64_t head_stride_C1;
+    int64_t batch_stride_C1;
+    TensorRef<ElementC, LayoutC> ref_D1;
+    int64_t head_stride_D1;
+    int64_t batch_stride_D1;
+    int batch_count;
+    int num_heads;
+    typename EpilogueOutputOp0::Params epilogue0;
+    typename EpilogueOutputOp1::Params epilogue1;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() {
+
+    }
+
+    /// Constructs an Arguments structure
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_0_,
+      GemmCoord problem_size_1_,
+      TensorRef<ElementA const, LayoutA> ref_A0_,
+      int64_t head_stride_A0_,
+      int64_t batch_stride_A0_,
+      TensorRef<ElementB const, LayoutB> ref_B0_,
+      int64_t head_stride_B0_,
+      int64_t batch_stride_B0_,
+      TensorRef<ElementC const, LayoutC> ref_C0_,
+      int64_t head_stride_C0_,
+      int64_t batch_stride_C0_,
+      TensorRef<ElementB const, LayoutB1> ref_B1_,
+      int64_t head_stride_B1_,
+      int64_t batch_stride_B1_,
+      TensorRef<ElementC const, LayoutC> ref_C1_,
+      int64_t head_stride_C1_,
+      int64_t batch_stride_C1_,
+      TensorRef<ElementC, LayoutC> ref_D1_,
+      int64_t head_stride_D1_,
+      int64_t batch_stride_D1_,
+      int batch_count_,
+      int num_heads_,
+      typename EpilogueOutputOp0::Params epilogue0_ =
+        typename EpilogueOutputOp0::Params(),
+      typename EpilogueOutputOp1::Params epilogue1_ =
+        typename EpilogueOutputOp1::Params()
+    ):
+      problem_size_0(problem_size_0_),
+      problem_size_1(problem_size_1_),
+      ref_A0(ref_A0_),
+      head_stride_A0(head_stride_A0_),
+      batch_stride_A0(batch_stride_A0_),
+      ref_B0(ref_B0_),
+      head_stride_B0(head_stride_B0_),
+      batch_stride_B0(batch_stride_B0_),
+      ref_C0(ref_C0_),
+      head_stride_C0(head_stride_C0_),
+      batch_stride_C0(batch_stride_C0_),
+      ref_B1(ref_B1_),
+      head_stride_B1(head_stride_B1_),
+      batch_stride_B1(batch_stride_B1_),
+      ref_C1(ref_C1_),
+      head_stride_C1(head_stride_C1_),
+      batch_stride_C1(batch_stride_C1_),
+      ref_D1(ref_D1_),
+      head_stride_D1(head_stride_D1_),
+      batch_stride_D1(batch_stride_D1_),
+      batch_count(batch_count_),
+      num_heads(num_heads_),
+      epilogue0(epilogue0_),
+      epilogue1(epilogue1_) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GroupedB2bGemmBatchedKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  GroupedB2bGemmBatched() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    Status status = GroupedB2bGemmBatchedKernel::can_implement(
+      args.problem_size_0,
+      args.problem_size_1,
+      args.ref_A0.non_const_ref(),
+      args.ref_B0.non_const_ref(),
+      args.ref_C0.non_const_ref(),
+      args.ref_B1.non_const_ref(),
+      args.ref_C1.non_const_ref(),
+      args.ref_D1
+    );
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    return 0;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size_0,
+      {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+      args.batch_count * args.num_heads);
+
+    // Initialize the Params structure
+    params_ = typename GroupedB2bGemmBatchedKernel::Params{
+      args.problem_size_0,
+      args.problem_size_1,
+      grid_shape,
+      args.ref_A0.non_const_ref(),
+      args.head_stride_A0,
+      args.batch_stride_A0,
+      args.ref_B0.non_const_ref(),
+      args.head_stride_B0,
+      args.batch_stride_B0,
+      args.ref_C0.non_const_ref(),
+      args.head_stride_C0,
+      args.batch_stride_C0,
+      args.ref_B1.non_const_ref(),
+      args.head_stride_B1,
+      args.batch_stride_B1,
+      args.ref_C1.non_const_ref(),
+      args.head_stride_C1,
+      args.batch_stride_C1,
+      args.ref_D1,
+      args.head_stride_D1,
+      args.batch_stride_D1,
+      args.batch_count,
+      args.num_heads,
+      args.epilogue0,
+      args.epilogue1
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    params_.ref_A0.reset(args.ref_A0.non_const_ref().data());
+    params_.ref_B0.reset(args.ref_B0.non_const_ref().data());
+    params_.ref_C0.reset(args.ref_C0.non_const_ref().data());
+    params_.ref_B1.reset(args.ref_B1.non_const_ref().data());
+    params_.ref_C1.reset(args.ref_C1.non_const_ref().data());
+    params_.ref_D1.reset(args.ref_D1.data());
+    params_.output_op_0 = args.epilogue0;
+    params_.output_op_1 = args.epilogue1;
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GroupedB2bGemmBatchedKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+
+    int smem_size = int(sizeof(typename GroupedB2bGemmBatchedKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GroupedB2bGemmBatchedKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::Kernel<GroupedB2bGemmBatchedKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
new file mode 100644
index 000000000..13fb29e0c
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
@@ -0,0 +1,431 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+    This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename B2bMma_,               ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  typename GmemToAccumLoader_
+>
+struct GroupedB2bGemmBatched {
+
+  using B2bMma = B2bMma_;
+  using Epilogue = Epilogue_;
+  using GmemToAccumLoader = GmemToAccumLoader_;
+  using OutputOp0 = typename B2bMma::OutputOp;
+  using OutputOp1 = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount0 = typename B2bMma::WarpCount0;
+  static int const kThreadCount = 32 * WarpCount0::kCount;
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size_0;
+    cutlass::gemm::GemmCoord problem_size_1;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    typename B2bMma::IteratorA0::Params params_A0;
+    typename B2bMma::IteratorA0::TensorRef ref_A0;
+    int64_t head_stride_A0;
+    int64_t batch_stride_A0;
+    typename B2bMma::IteratorB0::Params params_B0;
+    typename B2bMma::IteratorB0::TensorRef ref_B0;
+    int64_t head_stride_B0;
+    int64_t batch_stride_B0;
+    typename GmemToAccumLoader::OutputTileIterator::Params params_C0;
+    typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0;
+    int64_t head_stride_C0;
+    int64_t batch_stride_C0;
+    typename B2bMma::IteratorB1::Params params_B1;
+    typename B2bMma::IteratorB1::TensorRef ref_B1;
+    int64_t head_stride_B1;
+    int64_t batch_stride_B1;
+    typename Epilogue::OutputTileIterator::Params params_C1;
+    typename Epilogue::OutputTileIterator::TensorRef ref_C1;
+    int64_t head_stride_C1;
+    int64_t batch_stride_C1;
+    typename Epilogue::OutputTileIterator::Params params_D1;
+    typename Epilogue::OutputTileIterator::TensorRef ref_D1;
+    int64_t head_stride_D1;
+    int64_t batch_stride_D1;
+    int batch_count;
+    int num_heads;
+    typename OutputOp0::Params output_op_0;
+    typename OutputOp1::Params output_op_1;
+    int gemm_k_iterations_0;
+    int gemm_k_iterations_1;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): swizzle_log_tile(0) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size_0,
+      cutlass::gemm::GemmCoord const & problem_size_1,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename B2bMma::IteratorA0::TensorRef ref_A0,
+      int64_t head_stride_A0,
+      int64_t batch_stride_A0,
+      typename B2bMma::IteratorB0::TensorRef ref_B0,
+      int64_t head_stride_B0,
+      int64_t batch_stride_B0,
+      typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0,
+      int64_t head_stride_C0,
+      int64_t batch_stride_C0,
+      typename B2bMma::IteratorB1::TensorRef ref_B1,
+      int64_t head_stride_B1,
+      int64_t batch_stride_B1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+      int64_t head_stride_C1,
+      int64_t batch_stride_C1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D1,
+      int64_t head_stride_D1,
+      int64_t batch_stride_D1,
+      int batch_count,
+      int num_heads,
+      typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
+      typename OutputOp1::Params output_op_1 = typename OutputOp1::Params()
+    ):
+      problem_size_0(problem_size_0),
+      problem_size_1(problem_size_1),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A0(ref_A0.layout()),
+      ref_A0(ref_A0),
+      head_stride_A0(head_stride_A0),
+      batch_stride_A0(batch_stride_A0),
+      params_B0(ref_B0.layout()),
+      ref_B0(ref_B0),
+      head_stride_B0(head_stride_B0),
+      batch_stride_B0(batch_stride_B0),
+      params_C0(ref_C0.layout()),
+      ref_C0(ref_C0),
+      head_stride_C0(head_stride_C0),
+      batch_stride_C0(batch_stride_C0),
+      params_B1(ref_B1.layout()),
+      ref_B1(ref_B1),
+      head_stride_B1(head_stride_B1),
+      batch_stride_B1(batch_stride_B1),
+      params_C1(ref_C1.layout()),
+      ref_C1(ref_C1),
+      head_stride_C1(head_stride_C1),
+      batch_stride_C1(batch_stride_C1),
+      params_D1(ref_D1.layout()),
+      ref_D1(ref_D1),
+      head_stride_D1(head_stride_D1),
+      batch_stride_D1(batch_stride_D1),
+      output_op_0(output_op_0),
+      output_op_1(output_op_1),
+      batch_count(batch_count),
+      num_heads(num_heads),
+      gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK),
+      gemm_k_iterations_1((problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK) {}
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename B2bMma::B2bMmaSharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+    typename GmemToAccumLoader::SharedStorage gmem_to_accum_loader;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  GroupedB2bGemmBatched() { }
+
+  /// Determines whether kernel satisfies alignment
+    static Status can_implement(
+      cutlass::gemm::GemmCoord const & problem_size_0,
+      cutlass::gemm::GemmCoord const & problem_size_1,
+      typename B2bMma::IteratorA0::TensorRef ref_A0,
+      typename B2bMma::IteratorB0::TensorRef ref_B0,
+      typename GmemToAccumLoader::OutputTileIterator::TensorRef ref_C0,
+      typename B2bMma::IteratorB1::TensorRef ref_B1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D1) {
+
+    static int const kAlignmentA = B2bMma::IteratorA0::AccessType::kElements;
+    static int const kAlignmentB = B2bMma::IteratorB0::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if (!TensorRef_aligned(ref_A0, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B0, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C0, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B1, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C1, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D1, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if ((problem_size_0.m() % kAlignmentA) || (problem_size_0.k() % kAlignmentA) ||
+      (problem_size_0.n() % kAlignmentB) || (problem_size_0.k() % kAlignmentB) ||
+      (problem_size_0.m() % kAlignmentC) || (problem_size_0.n() % kAlignmentC) ||
+      (problem_size_1.m() % kAlignmentA) || (problem_size_1.k() % kAlignmentA) ||
+      (problem_size_1.n() % kAlignmentB) || (problem_size_1.k() % kAlignmentB) ||
+      (problem_size_1.m() % kAlignmentC) || (problem_size_1.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // Determine if fusion sizes are valid
+    if(problem_size_0.m() != problem_size_1.m())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() != problem_size_1.k())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() > B2bMma::Shape0::kN)
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_1.n() > B2bMma::Shape1::kN)
+      return Status::kErrorInvalidProblem;
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
+    for (int batch_head_idx = threadblock_swizzle.get_batch_idx(); batch_head_idx < params.batch_count * params.num_heads; batch_head_idx += gridDim.z) {
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A0{
+        threadblock_tile_offset.m() * B2bMma::Shape0::kM,
+        0,
+      };
+
+      cutlass::MatrixCoord tb_offset_B0{
+        0,
+        threadblock_tile_offset.n() * B2bMma::Shape0::kN
+      };
+
+      cutlass::MatrixCoord tb_offset_B1{
+        0,
+        threadblock_tile_offset.n() * B2bMma::Shape1::kN
+      };
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Convert blockIdx.z into (batch_idx, head_idx).
+      int batch_idx = batch_head_idx / params.num_heads;
+      int head_idx = batch_head_idx % params.num_heads;
+
+      // Construct iterators to A and B operands
+      typename B2bMma::IteratorA0 iterator_A0(
+        params.params_A0,
+        params.ref_A0.data(),
+        params.problem_size_0.mk(),
+        thread_idx,
+        tb_offset_A0);
+
+      iterator_A0.add_pointer_offset(params.batch_stride_A0 * batch_idx + params.head_stride_A0 * head_idx);
+
+      typename B2bMma::IteratorB0 iterator_B0(
+        params.params_B0,
+        params.ref_B0.data(),
+        params.problem_size_0.kn(),
+        thread_idx,
+        tb_offset_B0);
+
+      iterator_B0.add_pointer_offset(params.batch_stride_B0 * batch_idx + params.head_stride_B0 * head_idx);
+
+      typename B2bMma::IteratorB1 iterator_B1(
+        params.params_B1,
+        params.ref_B1.data(),
+        params.problem_size_1.kn(),
+        thread_idx,
+        tb_offset_B1);
+
+      iterator_B1.add_pointer_offset(params.batch_stride_B1 * batch_idx + params.head_stride_B1 * head_idx);
+
+
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+      int lane_idx = threadIdx.x % 32;
+
+      // assume identity swizzle
+      MatrixCoord tb_offset_C0(
+        threadblock_tile_offset.m() * B2bMma::Shape0::kM,
+        threadblock_tile_offset.n() * B2bMma::Shape0::kN
+      );
+
+      // Tile iterator loading from source tensor.
+      typename GmemToAccumLoader::OutputTileIterator iterator_C0(
+        params.params_C0,
+        params.ref_C0.data(),
+        params.problem_size_0.mn(),
+        thread_idx,
+        tb_offset_C0
+      );
+
+      iterator_C0.add_pointer_offset(params.batch_stride_C0 * batch_idx + params.head_stride_C0 * head_idx);
+
+      //
+      // Main loop
+      //
+
+      OutputOp0 output_op_0(params.output_op_0);
+
+      // Construct thread-scoped matrix multiply
+      B2bMma b2bMma(shared_storage.main_loop, shared_storage.gmem_to_accum_loader, thread_idx, warp_idx, lane_idx, params.problem_size_0.n());
+
+      typename B2bMma::FragmentC0 src_accum;
+      typename B2bMma::FragmentC1 accumulators;
+
+      src_accum.clear();
+      accumulators.clear();
+
+      // Compute threadblock-scoped matrix multiply-add
+      b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_C0,
+        iterator_B1, src_accum, output_op_0);
+
+      //
+      // Epilogue
+      //
+
+      OutputOp1 output_op_1(params.output_op_1);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      threadblock_tile_offset =
+          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      //assume identity swizzle
+      MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * B2bMma::Shape1::kM,
+        threadblock_tile_offset.n() * B2bMma::Shape1::kN
+      );
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C1(
+        params.params_C1,
+        params.ref_C1.data(),
+        params.problem_size_1.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      iterator_C1.add_pointer_offset(params.batch_stride_C1 * batch_idx + params.head_stride_C1 * head_idx);
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D1(
+        params.params_D1,
+        params.ref_D1.data(),
+        params.problem_size_1.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      iterator_D1.add_pointer_offset(params.batch_stride_D1 * batch_idx + params.head_stride_D1 * head_idx);
+
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
new file mode 100644
index 000000000..f841d0072
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
@@ -0,0 +1,222 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+      This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h"
+#include "grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h"
+#include "grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Layout type for B1 matrix operand
+  typename LayoutB1_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape0,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape0,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp0,
+  /// Epilogue output operator
+  typename EpilogueOutputOp1,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Apply upper triangular causal mask after first gemm
+  bool CausalMaskAfterGemm0 = false,
+  /// Stage accumulator in shared memory
+  bool SmemAccumulator = false
+>
+struct DefaultGroupedB2bGemmBatched;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0>
+struct DefaultGroupedB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape0, ThreadblockShape1,
+                   WarpShape0, WarpShape1, InstructionShape,
+                   EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages,
+                   Operator, CausalMaskAfterGemm0> {
+
+  // TODO: Make pipelined (i.e. stages == 2) work.
+  static_assert((Stages >= 3), "Currently, only multistage is supported (not pipelined).");
+
+  // While we ought to debug it, the warp shape M restriction is not considered
+  // high-priority as we do not want to make warp M much larger anyway.
+  static_assert(
+    !CausalMaskAfterGemm0 || (WarpShape0::kM == 16),
+    "Currently, causal mask is only supported with warp shape M of 16."
+  );
+
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1,
+      ElementC, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+      InstructionShape, Stages, Operator, CausalMaskAfterGemm0, EpilogueOutputOp0>::ThreadblockB2bMma;
+
+  static const int kPartitionsK0 = ThreadblockShape0::kK / WarpShape0::kK;
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::classic_b2b_bmm::DefaultEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          EpilogueOutputOp1::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmBatchedKernel = kernel::GroupedB2bGemmBatched<B2bMma, Epilogue, ThreadblockSwizzle, typename B2bMma::GmemToAccumLoader>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/thread/linear_combination_triu.h b/static/include/kernels/grouped_classic_b2b_bmm/thread/linear_combination_triu.h
new file mode 100644
index 000000000..b513f958a
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/thread/linear_combination_triu.h
@@ -0,0 +1,136 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include <cutlass/half.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+
+namespace thread {
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  int ThreadBlockShapeM,
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationTriu {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  static int const kThreadBlockShapeM = ThreadBlockShapeM;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = false;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationTriu() {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return false;
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, int index, int n, int m) const {
+
+    // Convert source to interal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate = converted_accumulator;
+
+    for (int i = 0; i < kCount; i++) {
+
+      int row = (
+        (kThreadBlockShapeM * blockIdx.x) +
+        (16 * (threadIdx.x / 32)) +
+        (8 * (i / 2)) +
+        ((threadIdx.x % 32) / 4)
+      );
+      int col = (
+        (16 * index) +
+        (8 * n) +
+        (2 * (threadIdx.x % 4)) +
+        (i % 2)
+      );
+
+      intermediate[i] = intermediate[i] * ElementCompute(row <= col);
+
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+} // namespace thread
+}
+}
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h
new file mode 100644
index 000000000..d2460cce9
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h
@@ -0,0 +1,241 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  using Shape1 = Shape1_;
+
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+  using Policy1 = Policy1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+  using Operator1 = typename Policy1::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm0 = typename Policy0::Operator::Shape;
+  using WarpGemm1 = typename Policy1::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount0 = GemmShape<Shape0::kM / WarpGemm0::kM,
+                               Shape0::kN / WarpGemm0::kN,
+                               Shape0::kK / WarpGemm0::kK>;
+  using WarpCount1 = GemmShape<Shape1::kM / WarpGemm1::kM,
+                               Shape1::kN / WarpGemm1::kN,
+                               Shape1::kK / WarpGemm1::kK>;
+
+  /// Number of warp-level GEMM oeprations
+  static int const kWarpGemmIterations0 =
+      (WarpGemm0::kK / Operator0::Policy::MmaShape::kK);
+  static int const kWarpGemmIterations1 =
+      (WarpGemm1::kK / Operator1::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  template<
+    typename Shape_,
+    typename Policy_
+  >
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+    using Shape = Shape_;
+    using Policy = Policy_;
+    using Operator = typename Policy::Operator;
+
+    /// Tensor reference to the A operand
+    using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+    /// Tensor reference to the B operand
+    using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+
+    /// Shape of the A matrix operand in shared memory
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix
+    CUTLASS_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+  using SharedStorage0 = SharedStorage<Shape0, Policy0>;
+  using SharedStorage1 = SharedStorage<Shape1, Policy1>;
+  union B2bMmaSharedStorage {
+    SharedStorage0 shared_storage0;
+    SharedStorage1 shared_storage1;
+  };
+
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A0 operand from shared memory
+  typename Operator0::IteratorA warp_tile_iterator_A0_;
+
+  /// Iterator to load a warp-scoped tile of B0 operand from shared memory
+  typename Operator0::IteratorB warp_tile_iterator_B0_;
+
+  /// Iterator to load a warp-scoped tile of B1 operand from shared memory
+  typename Operator1::IteratorB warp_tile_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      B2bMmaSharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      warp_tile_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), lane_idx),
+      warp_tile_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), lane_idx),
+      warp_tile_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), lane_idx) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
new file mode 100644
index 000000000..a1b39d0e8
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
@@ -0,0 +1,878 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h"
+#include "grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
+#include "grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA0_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA0_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA0,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB0_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB0_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB0,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile
+    //  (concept::MmaTensorOpFragmentIterator)
+    typename FragmentIteratorA1_,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: VectorIterator)
+    typename IteratorAccumulatorScaleBias_,
+    /// WarpIterator to load Scale or Bias vector from threadblock fragment
+    typename FragmentIteratorA1ScaleBias_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of output matrix
+    typename ElementOutput_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
+    typename OutputOp_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    bool CausalMaskAfterGemm0,
+    typename WarpShape0_,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bMmaMultistage :
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
+public:
+  ///< Base class
+  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA0 = IteratorA0_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB0 = IteratorB0_;
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over intermediate accumulator tile
+  using FragmentIteratorA1 = FragmentIteratorA1_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
+  ///< WarpIterator to load Scale or Bias vector from threadblock fragment
+  using FragmentIteratorA1ScaleBias = FragmentIteratorA1ScaleBias_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+  ///< Data type of output matrix
+  using ElementOutput = ElementOutput_;
+ ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  ///< Epilogue after 1st Gemm
+  using OutputOp = OutputOp_;
+
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  static const int kPartitionsK0 = Shape0_::kK / WarpShape0_::kK;
+
+  using GmemToAccumLoader =
+      typename cutlass::epilogue::threadblock::DefaultGmemToAccumLoaderTensorOp<
+          Shape0_, Operator0, kPartitionsK0, OutputOp,
+          OutputOp::kCount>::GmemToAccumLoader;
+
+  using IteratorC0 = typename GmemToAccumLoader::OutputTileIterator;
+
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations0 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+    static_assert(Base::kWarpGemmIterations1 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const TBLDGSTSIterationsA0 =
+        IteratorA0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB0 =
+        IteratorB0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA0 =
+        (TBLDGSTSIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB0 =
+        (TBLDGSTSIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLDGSTSIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
+  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
+  using WarpLoadedFragmentA1ScaleBias =
+      typename FragmentIteratorA1ScaleBias::Fragment;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
+  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+  GmemToAccumLoader gmem_to_accum_loader;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::B2bMmaSharedStorage &shared_storage,
+      typename GmemToAccumLoader::SharedStorage &bias_add_shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx,
+      ///< GEMM0 N is used for accumulator extent
+      int problem_size_0_n
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+      smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+      smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx),
+      gmem_to_accum_loader(bias_add_shared_storage, thread_idx, warp_idx, lane_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
+    this->warp_tile_iterator_B0_.add_tile_offset(
+        {Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
+                              int group_start_A0 = 0, int group_start_B0 = 0) {
+    iterator_A0.set_iteration_index(group_start_A0 *
+                                   IteratorA0::kAccessesPerVector);
+    this->smem_iterator_A0_.set_iteration_index(group_start_A0);
+
+    // LDGSTS for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
+      if (group_start_A0 + j < Detail::TBLDGSTSIterationsA0) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+                              IteratorA0::ThreadMap::kElementsPerAccess /
+                              IteratorA0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, gmem_ptr, iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+    }
+
+    iterator_B0.set_iteration_index(group_start_B0 *
+                                   IteratorB0::kAccessesPerVector);
+    this->smem_iterator_B0_.set_iteration_index(group_start_B0);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
+      if (group_start_B0 + j < Detail::TBLDGSTSIterationsB0) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+                              IteratorB0::ThreadMap::kElementsPerAccess /
+                              IteratorB0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, gmem_ptr, iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+        ++this->smem_iterator_B0_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
+                              int group_start_B1 = 0) {
+    iterator_B1.set_iteration_index(group_start_B1 *
+                                   IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::TBLDGSTSIterationsB1) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+                              IteratorB1::ThreadMap::kElementsPerAccess /
+                              IteratorB1::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B1.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_0,
+      ///< destination accumulator tile
+      FragmentC1 &accum,
+      ///< iterator over A0 operand in global memory
+      IteratorA0 iterator_A0,
+      ///< iterator over B0 operand in global memory
+      IteratorB0 iterator_B0,
+      ///< iterator over C0 operand in global memory
+      IteratorC0 iterator_C0,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC0 const &src_accum,
+      ///< epilogue operation after 1st Gemm
+      OutputOp output_op_0)
+    {
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_0) {
+
+      iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+      iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+      iterator_A0.set_iteration_index(0);
+      this->smem_iterator_A0_.set_iteration_index(0);
+
+      // LDGSTS for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsA0; ++j) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA0::Element>::value *
+              IteratorA0::ThreadMap::kElementsPerAccess /
+              IteratorA0::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+
+      iterator_B0.set_iteration_index(0);
+      this->smem_iterator_B0_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB0; ++j) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB0::Element>::value *
+              IteratorB0::ThreadMap::kElementsPerAccess /
+              IteratorB0::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+
+        ++this->smem_iterator_B0_;
+      }
+
+      // Move to the next stage
+      iterator_A0.add_tile_offset({0, 1});
+      iterator_B0.add_tile_offset({1, 0});
+
+      this->smem_iterator_A0_.add_tile_offset({0, 1});
+      this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
+    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
+    WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
+    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
+
+    Operator0 warp_mma0;
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+    iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
+                       warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+
+        this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k > 0)
+          warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
+                             warp_transformed_frag_B0[warp_mma_k % 2],
+                             warp_loaded_frag_A0[warp_mma_k % 2],
+                             warp_loaded_frag_B0[warp_mma_k % 2]);
+
+        warp_mma0(
+          accum0,
+          warp_transformed_frag_A0[warp_mma_k % 2],
+          warp_transformed_frag_B0[warp_mma_k % 2],
+          accum0
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations0 - 1) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+
+          group_start_iteration_A0 = warp_mma_k * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 = warp_mma_k * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+          group_start_iteration_A0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A0.add_tile_offset({0, 1});
+          iterator_B0.add_tile_offset({1, 0});
+
+          this->smem_iterator_A0_.add_tile_offset({0, 1});
+          this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK *
+                        Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK *
+                     Base::kWarpGemmIterations0,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations_0;
+          iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+          iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
+          warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    // Apply bias add
+    gmem_to_accum_loader(output_op_0, accum0, iterator_C0);
+    __syncthreads();
+
+
+    // 2nd Gemm
+
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+    typename FragmentIteratorA1::OutputOp noop_output_op_0({});
+    TriuMmaTensorOpFragmentIterator<FragmentIteratorA1, Shape0::kM> triu_warp_tile_iterator_A1_;
+
+    //
+    // Prologue
+    //
+    int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_1) {
+
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      iterator_B1.set_iteration_index(0);
+      this->smem_iterator_B1_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB1; ++j) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB1::Element>::value *
+              IteratorB1::ThreadMap::kElementsPerAccess /
+              IteratorB1::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+
+        ++this->smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+
+      this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], noop_output_op_0);
+    if (CausalMaskAfterGemm0) {
+      triu_warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+      ++triu_warp_tile_iterator_A1_;
+    }
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B1_;
+
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+    smem_write_stage_idx = Base::kStages - 1;
+    smem_read_stage_idx = 0;
+
+    warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
+                       warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
+
+    //
+    // Mainloop
+    //
+
+    gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1);
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+        // Load warp-level tile from accumulator fragment
+        warp_tile_iterator_A1_.load(
+            warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+            noop_output_op_0
+        );
+        if (CausalMaskAfterGemm0) {
+          triu_warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+          ++triu_warp_tile_iterator_A1_;
+        }
+        ++warp_tile_iterator_A1_;
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
+                             warp_transformed_frag_B1[warp_mma_k % 2],
+                             warp_loaded_frag_A1[warp_mma_k % 2],
+                             warp_loaded_frag_B1[warp_mma_k % 2]);
+
+
+        warp_mma1(
+          accum,
+          warp_transformed_frag_A1[warp_mma_k % 2],
+          warp_transformed_frag_B1[warp_mma_k % 2],
+          accum
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
new file mode 100644
index 000000000..b60ce870f
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h
@@ -0,0 +1,562 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape0_,
+  /// Iterates over tiles of A operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA0_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA0_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB0_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB0_,
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape1_,
+  /// Iterates over the intermediate accumulator tile
+  //  (concept::MmaTensorOpFragmentIterator)
+  typename FragmentIteratorA1_,
+  /// Iterates over vectors of scale and bias vector in global memory
+  //  (concept: VectorIterator)
+  typename IteratorAccumulatorScaleBias_,
+  /// FragmentIterator to load Scale or Bias vector from threadblock fragment
+  typename FragmentIteratorA1ScaleBias_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB1_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB1_,
+  /// Data type of output matrix
+  typename ElementOutput_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Data type of accumulator matrix
+  typename LayoutC_,
+  /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
+  typename OutputOp_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy0_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy1_,
+  /// Transformation applied to A0 operand
+  typename TransformA0_ = NumericArrayConverter<
+    typename SmemIteratorA0_::Element,
+    typename IteratorA0_::Element,
+    IteratorA0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B0 operand
+  typename TransformB0_ = NumericArrayConverter<
+    typename SmemIteratorB0_::Element,
+    typename IteratorB0_::Element,
+    IteratorB0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B1 operand
+  typename TransformB1_ = NumericArrayConverter<
+    typename SmemIteratorB1_::Element,
+    typename IteratorB1_::Element,
+    IteratorB1_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class B2bMmaPipelined :
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
+public:
+
+  ///< Base class
+  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
+
+  using Shape0 = Shape0_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA0 = IteratorA0_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB0 = IteratorB0_;     ///< Iterates over tiles of B operand in global memory
+  using Policy0 = Policy0_;           ///< Policy describing tuning details
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  using Shape1 = Shape1_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;   ///< Iterates over tiles of the scale and bias vectors in global memory
+  using FragmentIteratorA1ScaleBias =
+    FragmentIteratorA1ScaleBias_;     ///< WarpIterator to load Scale or Bias vector from the threadblock fragment
+  using IteratorB1 = IteratorB1_;     ///< Iterates over tiles of B operand in global memory
+  using Policy1 = Policy1_;           ///< Policy describing tuning details
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+
+  using ElementOutput = ElementOutput_;       ///< Data type of output matrix
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+
+  using OutputOp = OutputOp_;       ///< Epilogue after 1st Gemm
+
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+
+  using TransformA0 = TransformA0_;
+  using TransformB0 = TransformB0_;
+  using TransformB1 = TransformB1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA0 = typename IteratorA0::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB0 = typename IteratorB0::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB1 = typename IteratorB1::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy0::Operator::ArchTag;
+
+  /// Complex transform on A0 operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B0 operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B1 operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA0 = typename Operator0::FragmentA;
+  using WarpFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
+  /// Warp Fragment of operand A1 scale and bias loaded from threadblock fragment
+  using WarpFragmentA1ScaleBias =
+      typename FragmentIteratorA1ScaleBias::Fragment;
+  using WarpFragmentB1 = typename Operator1::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaPipelined(
+    typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx,                                       ///< ID of each thread within a warp
+    int problem_size_0_n                                ///< GEMM0 N is used for accumulator extent
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+    smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+    smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
+
+
+    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    //These should stay the same across different GEMM layers
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    //These may change across different GEMM layers
+    int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
+    int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations_0,                             ///< number of iterations of the mainloop
+    FragmentC1 &accum,                                   ///< destination accumulator tile
+    IteratorA0 iterator_A,                               ///< iterator over A operand in global memory
+    IteratorB0 iterator_B0,                              ///< iterator over B0 operand in global memory
+    IteratorAccumulatorScaleBias iterator_A1_scale,    ///< iterator over A1 operand scale vectors in global memory
+    IteratorAccumulatorScaleBias iterator_A1_bias,     ///< iterator over A1 operand bias vectors in global memory
+    IteratorB1 iterator_B1,                              ///< iterator over B1 operand in global memory
+    FragmentC0 const &src_accum,                         ///< source accumualtor tile
+    OutputOp output_op_0,                                ///< epilogue operation after 1st Gemm
+    TransformA0 transform_A0 = TransformA0(),            ///< transformation applied to A0 fragment
+    TransformB0 transform_B0 = TransformB0(),            ///< transformation applied to B0 fragment
+    TransformB1 transform_B1 = TransformB1()) {          ///< transformation applied to B1 fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    FragmentA0 tb_frag_A;
+    FragmentB0 tb_frag_B0;
+
+    tb_frag_A.clear();
+    tb_frag_B0.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B0.load(tb_frag_B0);
+
+    ++iterator_A;
+    ++iterator_B0;
+
+    this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+    this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B0_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA0 warp_frag_A0[2];
+    WarpFragmentB0 warp_frag_B0[2];
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    Operator0 warp_mma0;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
+    iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
+    // shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+
+          this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+          __syncthreads();
+
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B0_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+
+        this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B0.load(tb_frag_B0);
+          ++iterator_A;
+          ++iterator_B0;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
+          iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
+        }
+
+        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
+                  warp_frag_B0[warp_mma_k % 2], accum0);
+      }
+    }
+
+    //2nd Gemm
+
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+
+    //
+    // Prologue
+    //
+
+    FragmentA1ScaleBias tb_frag_A1_scale;
+    FragmentA1ScaleBias tb_frag_A1_bias;
+    FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
+    FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
+    FragmentB1 tb_frag_B1;
+
+    if(PerChannelScale)
+        tb_frag_A1_scale.clear();
+    tb_frag_A1_bias.clear();
+    tb_frag_B1.clear();
+
+    // The last kblock is loaded in the prolog
+    if(PerChannelScale)
+        iterator_A1_scale.load(tb_frag_A1_scale);
+    iterator_A1_bias.load(tb_frag_A1_bias);
+    iterator_B1.load(tb_frag_B1);
+
+    if(PerChannelScale)
+        ++iterator_A1_scale;
+    ++iterator_A1_bias;
+    ++iterator_B1;
+
+    this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+    ++this->smem_iterator_B1_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA1ScaleBias warp_frag_A1_scale[2];
+    WarpFragmentA1ScaleBias warp_frag_A1_bias[2];
+    WarpFragmentA1 warp_frag_A1[2];
+    WarpFragmentB1 warp_frag_B1[2];
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+    if(PerChannelScale)
+        warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[0]);
+    warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[0]);
+    warp_tile_iterator_A1_.load(warp_frag_A1[0], warp_frag_A1_scale[0],
+        warp_frag_A1_bias[0], output_op_0);
+    this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
+
+    ++warp_tile_iterator_A1_;
+    if(PerChannelScale)
+        ++warp_tile_iterator_A1_scale_;
+    ++warp_tile_iterator_A1_bias_;
+    ++this->warp_tile_iterator_B1_;
+
+    Operator1 warp_mma1;
+
+    smem_write_stage_idx = 1;
+
+    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
+
+    // Avoid reading out of bounds
+    iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::WarpGemmIterations == 2.
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
+
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+          __syncthreads();
+          ++this->smem_iterator_B1_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+
+          if(PerChannelScale) {
+              tb_frag_A1_scale.clear();
+              iterator_A1_scale.load(tb_frag_A1_scale);
+              ++iterator_A1_scale;
+            }
+            tb_frag_A1_bias.clear();
+            iterator_A1_bias.load(tb_frag_A1_bias);
+            ++iterator_A1_bias;
+        }
+
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+
+        if(PerChannelScale)
+          warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[(warp_mma_k + 1) % 2]);
+        warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[(warp_mma_k + 1) % 2]);
+        warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2],
+            warp_frag_A1_scale[(warp_mma_k + 1) % 2],
+            warp_frag_A1_bias[(warp_mma_k + 1) % 2],
+            output_op_0);
+        this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
+
+        if(PerChannelScale)
+          ++warp_tile_iterator_A1_scale_;
+        ++warp_tile_iterator_A1_bias_;
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_B1.load(tb_frag_B1);
+          ++iterator_B1;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
+        }
+
+        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
+                  warp_frag_B1[warp_mma_k % 2], accum);
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h
new file mode 100644
index 000000000..958cc8843
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h
@@ -0,0 +1,858 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+/***************************************************************************************************
+* Customized version of Cutlass 3.1 cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+* removed problematic specialization of DefaultIteratorsTensorOp for fp16 -> fp32 accumulation
+* which had numeric issues due to the usage of SharedLoadIteratorMixed.
+* Introduces the cutlass::epilogue::threadblock::classic_b2b_bmm namespace, which is a customized
+* variant of the cutlass::epilogue::threadblock namespace.
+*
+**************************************************************************************************/
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_relu0.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+namespace classic_b2b_bmm {
+////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  typename ElementOutput,
+  typename ElementAccumulator,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    ElementAccumulator,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    ElementAccumulator
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= float x 4
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int32_t <= int32_t
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= int32_t
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+
+/*
+/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t,
+  float,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+
+};
+*/
+
+/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t,
+  int32_t,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  typename ElementOutput,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  ElementOutput,
+  int32_t,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  static_assert(platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
+                platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
+                platform::is_same<ElementOutput, int8_t>::value ||
+                platform::is_same<ElementOutput, uint8_t>::value,
+                "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
+
+   static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+                "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e4m3_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e4m3_t,
+  float,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e4m3_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+              "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e5m2_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e5m2_t,
+  float,
+  ElementsPerAccess,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e5m2_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+              "ElementsPerAccess needs to be 16 or 8.");
+
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpStridedDgrad {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  int Rank,
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpAffineRankN {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
+    OutputTileThreadMap,
+    ElementOutput,
+    Rank
+  >;
+
+  // Map to the row major iterator since the iterator selection for affineN is the same.
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Defines sensible defaults for epilogues for TensorOps which uses
+/// intereleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedEpilogueTensorOp {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          LayoutC>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps which uses
+/// intereleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedConvEpilogue {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedConvThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          // can reuse the gemm version here to do element selection
+          layout::ColumnMajorInterleaved<InterleavedK>>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+} // namespace classic_b2b_bmm
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
new file mode 100644
index 000000000..638f9f84e
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
@@ -0,0 +1,383 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+
+    This file is copied from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h"
+#include "grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1_,
+    /// Element type for C matrix
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Staging the accumulators in shared memory.
+    bool SmemAccumulator = false>
+struct DefaultB2bMma;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output with 2-stage pipeline
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type forAC matrix operand
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, LayoutB1, ElementC, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag,
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, 2, Operator, CausalMaskAfterGemm0, EpilogueOutputOp, false> {
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB1, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 =
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
+
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
+          ElementB, LayoutB1, 0, typename MmaCore1::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      IteratorB0, typename MmaCore0::SmemIteratorB,
+      typename MmaCore1::Shape, FragmentIteratorA1,
+      IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
+      IteratorB1, typename MmaCore1::SmemIteratorB,
+      ElementC, ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output for multi-stage
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for B1 matrix operand
+    typename LayoutB1,
+    /// Element type for output
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Apply upper triangular causal mask after first gemm
+    bool CausalMaskAfterGemm0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, LayoutB1, ElementC, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag,
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, Stages, Operator, CausalMaskAfterGemm0, EpilogueOutputOp, false> {
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB1, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+          ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+          ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  // FragmentIteratorA1 should just load A1 fragments from the intermediate
+  // accumulator tile without modification, so LinearCombination is used to
+  // apply a no-op to the accumulator tile.
+  using LinearCombinationOutputOp = cutlass::epilogue::thread::LinearCombination<
+    ElementC,
+    EpilogueOutputOp::kCount,
+    ElementAccumulator,
+    ElementC,
+    cutlass::epilogue::thread::ScaleType::Nothing
+  >;
+  using FragmentIteratorA1 =
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, LinearCombinationOutputOp>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB1, 0, ThreadMapB1, AccessTypeB1>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      MmaCore0::kCacheOpA,
+      IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
+      typename MmaCore1::Shape, FragmentIteratorA1,
+      IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
+      IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
+      ElementC, ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages,
+      CausalMaskAfterGemm0, typename MmaCore0::WarpShape>;
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
new file mode 100644
index 000000000..c09a7ecfa
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h
@@ -0,0 +1,202 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  NOTE: Copied from cutlass/epilogue/threadblock/default_epilogue_tensor_op.h but
+  modified to use GmemToAccumLoader, GmemToAccumLoaderFragmentIteratorTensorOp, and
+  GmemToAccumLoaderSharedLoadIterator.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_relu0.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+
+#include "cutlass/layout/permute.h"
+
+#include "grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader.h"
+#include "grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h"
+#include "grouped_classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h"
+#include "grouped_classic_b2b_bmm/threadblock/custom_epilogue_tensor_op.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultGmemToAccumLoaderTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::GmemToAccumLoaderFragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = classic_b2b_bmm::detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename cutlass::epilogue::threadblock::GmemToAccumLoaderSharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementOutput
+  >;
+
+  /// Hard-coded padding elements added
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using GmemToAccumLoader = cutlass::epilogue::threadblock::GmemToAccumLoader<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
new file mode 100644
index 000000000..87d413344
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader.h
@@ -0,0 +1,361 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  The shared memory resource is time-sliced across warps.
+
+  NOTE: Copied from cutlass/epilogue/threadblock/epilogue.h and modified to essentially
+  inverse the direction of the epilogue. See https://github.com/NVIDIA/cutlass/issues/784
+  for details.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/util/index_sequence.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
+>
+class GmemToAccumLoader :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>,
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Number of warps per block
+  using WarpCount = typename Base::WarpCount;
+
+  /// Number of threads per block
+  static int const kBlockThreads = 32 * WarpCount::kCount;
+
+  /// Per-thread accumulator tile type
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Numerical accumulation element type
+  using ElementAccumulator = typename WarpMmaOperator::ElementC;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Vector type used by the global output iterator
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Vector type used by the shared output iterator
+  using AccumulatorAccessType = Array<ElementAccumulator, OutputTileIterator::kElementsPerAccess>;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+public:
+
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
+    "Divisibility");
+
+  static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
+
+  static_assert(kPartitionsK == 1, "Must be exactly 1.");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index in the threadblock
+  int thread_idx;
+
+  /// Warp index in the threadblock
+  int warp_idx;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  GmemToAccumLoader(
+      typename Base::SharedStorage &shared_storage,   ///< Shared storage object
+      int thread_idx,                                 ///< ID of a thread within the threadblock
+      int warp_idx,                                   ///< ID of warp within threadblock
+      int lane_idx)                                   ///< Id of thread within warp
+  :
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      BaseStreamK(thread_idx),
+      shared_load_iterator_(shared_storage.reference(), thread_idx),
+      thread_idx(thread_idx),
+      warp_idx(warp_idx)
+  {}
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                ///< Output operator
+    AccumulatorTile &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )      ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
+  {
+    if (!output_op.is_source_needed())
+    {
+      source_iterator.clear_mask();
+      __syncthreads();  // Dummy (CUDA 11.0)
+    }
+
+    // Source-fragment data (zero-initialized for scenarios where the
+    // output operator allows us to skip loading it from global input)
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    // Iterator over warp-level accumulator fragment
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration)
+    {
+
+      //
+      // Load fragments from shared memory
+      //
+
+      __syncthreads();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p)
+      {
+        // Load addend source fragment from global memory to aligned register fragment.
+        source_iterator.load(source_fragment);
+        ++source_iterator;
+
+        // Store data in register fragment to shared memory.
+        shared_load_iterator_.store(source_fragment);
+
+        if (p < Base::kFragmentsPerIteration - 1)
+        {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p)
+      {
+
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+        typename OutputTileIterator::Fragment source_accum_fragment;
+        typename AccumulatorFragmentIterator::Fragment output_accum_fragment;
+
+        // Load from shared memory to "unaligned" accumulator fragment.
+        this->warp_tile_iterator_.load(source_accum_fragment);
+
+        // Load from accumulators to accumulator fragment.
+        accum_fragment_iterator.load(accum_fragment);
+
+        // Store result of computation to accumulators.
+        apply_output_operator(output_accum_fragment, output_op, accum_fragment, source_accum_fragment);
+        accum_fragment_iterator.store(output_accum_fragment);
+
+        ++accum_fragment_iterator;
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+
+    }
+
+  }
+
+private:
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator(
+    typename AccumulatorFragmentIterator::Fragment &output_fragment,
+    OutputOp const &output_op,                    ///< Output operator
+    typename AccumulatorFragmentIterator::Fragment const &accum_fragment,
+    typename OutputTileIterator::Fragment const &source_fragment)
+  {
+
+    AccumulatorAccessType *output_frag_ptr =
+      reinterpret_cast<AccumulatorAccessType *>(&output_fragment);
+
+    AccumulatorAccessType const *compute_frag_ptr =
+      reinterpret_cast<AccumulatorAccessType const *>(&accum_fragment);
+
+    OutputAccessType const *source_frag_ptr =
+      reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+    int const kOutputOpIterations =
+      AccumulatorFragmentIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    NumericArrayConverter<typename AccumulatorAccessType::Element, typename OutputAccessType::Element, OutputOp::kCount, OutputOp::kRound> converter;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i)
+    {
+      // Call the output operator
+      output_frag_ptr[i] = converter(output_op(compute_frag_ptr[i], source_frag_ptr[i]));
+    }
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
new file mode 100644
index 000000000..f5ecb1bc7
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/gmem_to_accum_loader_shared_load_iterator.h
@@ -0,0 +1,274 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
+
+  This file is adapted from https://github.com/NVIDIA/cutlass/tree/master/examples/13_two_tensor_op_fusion.
+
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in
+/// GmemToAccumLoader.
+///
+/// Satisfies: ReadableTileIterator
+///
+template <
+    typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
+    typename Element_, ///< Element data type
+    int MaxAlignment =
+        ThreadMap_::kElementsPerAccess* sizeof_bits<Element_>::value / 8>
+class GmemToAccumLoaderSharedLoadIterator {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::TileShape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static int const kMinAlignment =
+      ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
+
+  static int const kAlignment =
+      (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+      Element,
+      ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow *
+          ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster *
+          ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType =
+      AlignedArray<Element, ThreadMap::kElementsPerAccess, kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType = AlignedArray<
+      Element,
+      const_min(
+          128 / sizeof_bits<Element>::value,
+          ThreadMap::kElementsPerAccess),
+      const_min(16, kAlignment)>;
+
+  static int const kLoadsPerAccess =
+      AccessType::kElements / LoadType::kElements;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Byte-level pointer
+  uint8_t* byte_pointer_;
+
+  /// Stride along adjacent rows
+  int stride_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  GmemToAccumLoaderSharedLoadIterator(TensorRef ref, int thread_idx)
+      : byte_pointer_(reinterpret_cast<uint8_t*>(ref.data())),
+        stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    // Initialize pointer
+    byte_pointer_ += thread_offset.row() * stride_ +
+        thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const& offset) {
+    byte_pointer_ += offset.row() * Shape::kRow * stride_ +
+        offset.column() * Shape::kColumn * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          uint8_t const* byte_pointer = byte_pointer_ +
+              row * ThreadMap::Delta::kRow * stride_ +
+              group * ThreadMap::Delta::kGroup * stride_ +
+              cluster * ThreadMap::Delta::kCluster * stride_ +
+              pointer_offset * sizeof_bits<Element>::value / 8;
+
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType* frag_ptr = reinterpret_cast<LoadType*>(&frag);
+          LoadType const* memory_pointer =
+              reinterpret_cast<LoadType const*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx =
+                frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer
+                  [(column * ThreadMap::Delta::kColumn / kElementsPerAccess) *
+                       kLoadsPerAccess +
+                   v];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {}
+
+  /// Loads a fragment
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment from memory.
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(const Fragment& frag, Index pointer_offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          uint8_t* byte_pointer = byte_pointer_ +
+              row * ThreadMap::Delta::kRow * stride_ +
+              group * ThreadMap::Delta::kGroup * stride_ +
+              cluster * ThreadMap::Delta::kCluster * stride_ +
+              pointer_offset * sizeof_bits<Element>::value / 8;
+
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType const* frag_ptr = reinterpret_cast<LoadType const*>(&frag);
+          LoadType* memory_pointer = reinterpret_cast<LoadType*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx =
+                frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+              int memory_pointer_idx =
+                  (column * ThreadMap::Delta::kColumn / kElementsPerAccess) *
+                      kLoadsPerAccess +
+                  v;
+              memory_pointer[memory_pointer_idx] =
+                  frag_ptr[frag_idx * kLoadsPerAccess + v];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment
+  CUTLASS_DEVICE
+  void store(const Fragment& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h b/static/include/kernels/grouped_classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
new file mode 100644
index 000000000..3fb47da01
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/warp/gmem_to_accum_loader_fragment_iterator_tensor_op.h
@@ -0,0 +1,315 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+
+      NOTE: Copied from cutlass/epilogue/warp/fragment_iterator_tensor_op.h but modified
+      to make the accumulators non-const type so the accumulators can be modified.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
+  typename Layout             ///< target shared memory layout
+>
+class GmemToAccumLoaderFragmentIteratorTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for row-major shared memory
+template <
+  typename WarpShape_,         ///< shape of the warp-level GEMM tile
+  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: Array)
+>
+class GmemToAccumLoaderFragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    OperatorElementC,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    OperatorElementC,
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile
+  AccessType *accumulators_;
+
+  /// Internal index
+  int index_;
+
+public:
+
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp(AccumulatorTile &accum):
+    accumulators_(reinterpret_cast<AccessType *>(&accum)),
+    index_(0) {
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int index = index_ + index_offset;
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+
+      int accumulator_access_offset =
+        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      frag_ptr[n] = accumulators_[accumulator_access_offset];
+    }
+  }
+
+  /// Stores a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment& frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      int accumulator_access_offset = index +
+          n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      accumulators_[accumulator_access_offset] = frag_ptr[n];
+    }
+  }
+
+  /// Adds a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void add(Fragment& frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      int accumulator_access_offset = index +
+          n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+
+      accumulators_[accumulator_access_offset] = accumulators_[accumulator_access_offset] + frag_ptr[n];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Dedicated to interleaved layout
+template <
+    /// shape of the warp-level GEMM tile
+    typename WarpShape_,
+    /// matrix multiply operator shape (concept: gemm::GemmShape)
+    typename OperatorShape_,
+    /// matrix multiply operator data type (concept: data type)
+    typename OperatorElementC_,
+    /// matrix multiply operator fragment (concept: Array)
+    typename OperatorFragmentC_,
+    /// number of interleaved k
+    int InterleavedK>
+class GmemToAccumLoaderFragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_,
+                               layout::ColumnMajorInterleaved<InterleavedK>> {
+ public:
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment =
+      Array<OperatorElementC,
+            Policy::kElementsPerAccess * InterleavedK / OperatorShape::kN>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile =
+      Array<OperatorElementC, OperatorFragmentC::kElements *
+                                  Policy::OperatorCount::kRow *
+                                  Policy::OperatorCount::kColumn>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+ private:
+  /// Internal access type
+  using AccessType =
+      Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Accumulator tile
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+ public:
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp(AccumulatorTile const &accum)
+      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+        index_(0) {}
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  GmemToAccumLoaderFragmentIteratorTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < (InterleavedK / OperatorShape::kN); ++n) {
+      int index_m = index % (Policy::OperatorCount::kRow *
+                             Policy::kIterationsPerInstruction);
+      int index_n = index / (Policy::OperatorCount::kRow *
+                             Policy::kIterationsPerInstruction);
+      int accumulator_access_offset =
+          (index_m / Policy::kIterationsPerInstruction) *
+              (Policy::OperatorCount::kColumn *
+               Policy::kIterationsPerInstruction) +
+          (index_m % Policy::kIterationsPerInstruction) +
+          index_n * (InterleavedK / OperatorShape::kN) *
+              Policy::kIterationsPerInstruction +
+          n * Policy::kIterationsPerInstruction;
+
+      frag_ptr[n] = accumulators_[accumulator_access_offset];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h b/static/include/kernels/grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h
new file mode 100644
index 000000000..3fe0ab10b
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h
@@ -0,0 +1,235 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "grouped_classic_b2b_bmm/thread/linear_combination_triu.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Modified version of MmaTensorOpFragmentIterator that can zero out upper triangular
+// portion of output matrix.
+template <typename MmaTensorOpFragmentIterator_, int ThreadBlockShapeM_>
+class TriuMmaTensorOpFragmentIterator {
+ public:
+
+  /// Shape of warp tile to load (concept: MatrixShape)
+  using Shape = typename MmaTensorOpFragmentIterator_::Shape;
+
+  /// Shape of the warp accumulation tile (concept: MatrixShape)
+  using AccumulatorShape = typename MmaTensorOpFragmentIterator_::AccumulatorShape;
+
+  /// KBlocks columns to compute residual
+  static int const kKBlockColumn = MmaTensorOpFragmentIterator_::kKBlockColumn;
+
+  /// Accumulator Element type
+  using ElementAccumulator = typename MmaTensorOpFragmentIterator_::ElementAccumulator;
+
+  /// Element type
+  using Element = typename MmaTensorOpFragmentIterator_::Element;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = typename MmaTensorOpFragmentIterator_::InstructionShape;
+
+  /// Output operation on fragment
+  using OutputOp = thread::LinearCombinationTriu<
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementOutput,
+    MmaTensorOpFragmentIterator_::OutputOp::kCount,
+    ThreadBlockShapeM_,
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementOutput,
+    typename MmaTensorOpFragmentIterator_::OutputOp::ElementCompute
+  >;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    static_assert(
+        AccumulatorShape::kRow == Shape::kRow,
+        "Rows of Warp Accumulator must be the same as rows of warp");
+    static_assert(
+        !(AccumulatorShape::kColumn % Shape::kColumn),
+        "Shape of Warp Accumulator must be divisible by warp shape.");
+    static_assert(
+        !(kKBlockColumn % Shape::kColumn),
+        "KBlock size must be divisible by warp shape.");
+
+    /// Number of times this iterator can be incremented
+    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
+  };
+
+private:
+
+  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
+
+  /// Number of mma operations performed by a warp
+  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                    Shape::kColumn / InstructionShape::kN>;
+  /// Number of mma operations performed by the entire accumulator
+  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
+                                              AccumulatorShape::kColumn / InstructionShape::kN>;
+
+  /// Number of K iterations
+  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
+  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
+  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+  static int const kResidualIndex = kResidualColumn / Shape::kColumn
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+  /// Accumulator Fragment object
+  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
+
+  /// Scale Bias Element Type
+  using ElementScaleBias = typename OutputOp::ElementCompute;
+
+  /// Scale Bias Fragment object
+  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
+
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentAccessType = Array<Element, kElementsPerAccess>;
+
+  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerAccess>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Internal index
+  int index_;
+
+  /// Used to access residual tile first
+  bool is_residual_tile_;
+
+  OutputOp output_op;
+
+public:
+  /// Constructs an iterator
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator()
+      : index_(0), is_residual_tile_(true), output_op() {}
+
+  /// Add offset
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    index_ += index_offset;
+    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
+      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
+      is_residual_tile_ = false;
+    }
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  TriuMmaTensorOpFragmentIterator &operator--() {
+    add_offset(-1);
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < MmaIterations::kColumn; n++) {
+      for (int m = 0; m < MmaIterations::kRow; m++) {
+        if(!(is_residual_tile_ && index_ >= kResidualIndex)) {
+            frag_ptr[m * MmaIterations::kColumn + n] = output_op(
+              frag_ptr[m * MmaIterations::kColumn + n],
+              index_,
+              n,
+              m
+            );
+        }
+      }
+    }
+  }
+
+};
+
+}
+}
+}
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index c645ab65b..ad48598fb 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -786,5 +786,180 @@ def test_fmha_style_b2b_bmm_bf16(self):
         )
 
 
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class ClassicGroupedB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(0)
+
+    def _test_grouped_classic_multihead_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        m=256,
+        k0=128,
+        n0=256,
+        n1=256,
+        num_heads=2,
+        epilogue_math_name="Identity",
+        causal_type=CausalType.NO_CAUSAL,
+        dtype="float16",
+        test_name="grouped_classic_b2b_bmm",
+        copy_op=True,
+        atol=1e-2,
+        rtol=1e-2,
+        bias_broadcast=(False, False, False, False),
+    ):
+        # Initialize AIT grouped_classic_b2b_bmm operator.
+        assert len(bias_broadcast) == 4
+        assert (
+            bias_broadcast[3] is False
+        ), "Grouped classic b2b bmm cannot broadcast bias on last dimension."
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes]
+        alpha0 = 1.0 / (k0**0.5)
+        alpha1 = 1.0
+        batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
+
+        Q = Tensor(
+            shape=[batch_size_dim, m, num_heads, k0],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K = Tensor(
+            shape=[batch_size_dim, n0, num_heads, k0],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V = Tensor(
+            shape=[batch_size_dim, n0, num_heads, n1],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        bias_shape_full = [batch_size_dim, num_heads, m, n0]
+        bias_shape = [
+            IntImm(1) if bias_broadcast[i] else bias_shape_full[i] for i in range(4)
+        ]
+        Bias = Tensor(
+            shape=bias_shape,
+            dtype=dtype,
+            name="bias",
+            is_input=True,
+        )
+        grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
+            causal_type=causal_type,
+            alpha0=alpha0,
+            alpha1=alpha1,
+            alpha1_divide_by_seq_len=True,
+            epilogue_math_name=epilogue_math_name,
+        )
+        if copy_op:
+            grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
+                **grouped_classic_b2b_bmm_op._get_op_attributes()
+            )
+        Y = grouped_classic_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=True)
+        module = compile_model(Y, target, "./tmp", test_name)
+
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        for batch_size in batch_sizes:
+            # Initialize inputs
+            # Initialized in BMHD dim order
+            q_pt = torch.rand(batch_size, m, num_heads, k0, dtype=torch_dtype).cuda()
+            k_pt = torch.rand(batch_size, n0, num_heads, k0, dtype=torch_dtype).cuda()
+            v_pt = torch.rand(batch_size, n0, num_heads, n1, dtype=torch_dtype).cuda()
+            bias_shape_full_pt = (batch_size, num_heads, m, n0)
+            bias_shape_pt = (
+                1 if bias_broadcast[i] else bias_shape_full_pt[i] for i in range(4)
+            )
+            bias_pt = torch.rand(*bias_shape_pt, dtype=torch_dtype).cuda()
+
+            # Permute to BHMD dim order
+            q_pt_hf = torch.permute(q_pt, [0, 2, 1, 3])
+            k_pt_hf = torch.permute(k_pt, [0, 2, 1, 3])
+            v_pt_hf = torch.permute(v_pt, [0, 2, 1, 3])
+
+            # Run PT reference.
+            attn = alpha0 * (q_pt_hf @ k_pt_hf.transpose(-2, -1)) + bias_pt
+            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
+            attn = alpha1 / m * attn
+            invalid_attn_mask = get_attn_mask_per_causal_type(
+                m, n0, causal_type, torch_dtype
+            )
+            attn = attn * invalid_attn_mask
+            second_mm = attn @ v_pt_hf
+            output = torch.permute(
+                second_mm, [0, 2, 1, 3]
+            )  # permute back to original dim order
+            y_pt = output.detach()
+
+            # Run AIT.
+            inputs = {"q": q_pt, "k": k_pt, "v": v_pt, "bias": bias_pt}
+            y = torch.empty(
+                [batch_size, m, num_heads, n1],
+                dtype=torch_dtype,
+                device="cuda",
+            )
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_multihead1_b2b_bmm(self):
+        self._test_grouped_classic_multihead_b2b_bmm(
+            test_name="grouped_classic_multihead1_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=1,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_multihead2_b2b_bmm(self):
+        self._test_grouped_classic_multihead_b2b_bmm(
+            test_name="grouped_classic_multihead2_b2b_bmm_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_multihead1_b2b_bmm_bias_broadcast1(self):
+        self._test_grouped_classic_multihead_b2b_bmm(
+            test_name="grouped_classic_multihead1_b2b_bmm_broadcast1_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=1,
+            bias_broadcast=[True, True, False, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_multihead2_b2b_bmm_bias_broadcast1(self):
+        self._test_grouped_classic_multihead_b2b_bmm(
+            test_name="grouped_classic_multihead2_b2b_bmm_broadcast1_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+            bias_broadcast=[True, True, False, False],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_multihead2_b2b_bmm_bias_broadcast2(self):
+        self._test_grouped_classic_multihead_b2b_bmm(
+            test_name="grouped_classic_multihead2_b2b_bmm_broadcast2_fp16_basic",
+            dtype="float16",
+            batch_sizes=1,
+            num_heads=2,
+            bias_broadcast=[True, True, True, False],
+        )
+
+
 if __name__ == "__main__":
     unittest.main()

From f0e6d8c7f699ce9e382a9d0dc8a677de09e7cf9b Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 31 May 2023 08:58:08 -0700
Subject: [PATCH 561/638] Add SM90-related profiler extensions (#732)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/732

A few extensions in SM90-related profiling:

- In mixed SM80 / SM90 profiling mode, profile SM80 op instances *before* SM90 ones to avoid potential side effects of running SM90 ops on the results of SM80 ops (e.g., due to power throttling).

- If SM90 ops with TMA epilogue are to be profiled (i.e., are selected due to compatible alignment requirements), only those are kept in the `func_attrs["op_instances"]`, as they are generally performing better than the rest. This is done for optimizing the SM90-including profiler compilation and running time.

Reviewed By: chenyang78

Differential Revision: D46264736

fbshipit-source-id: 65bd4c230fed53c1d2182439ff0b74a30b689990
---
 .../backend/cuda/gemm_universal/bmm_common.py |  2 +-
 .../backend/cuda/gemm_universal/common.py     | 59 ++++++++++++++-----
 .../gemm_universal/common_bias_broadcast.py   |  2 +-
 3 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
index 8dd0399bb..bd3affc47 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py
@@ -449,7 +449,7 @@ def gen_profiler(
 
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
-    op_instance = common.filter_cutlass_3x_ops(op_instance, func_attrs)
+    op_instance, _ = common.filter_cutlass_3x_ops(op_instance, func_attrs)
 
     backend_spec = CUDASpec()
     elem_type = backend_spec.dtype_to_backend_type(
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py
index 47ddfb2c5..cda2b94ee 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common.py
@@ -1004,6 +1004,18 @@ def add_profiler(file_pairs, workdir, op_type, output_name, code):
         file_pairs.append((src_path, obj_path))
 
 
+def has_tma_epilogue(op):
+    """Check whether the op is CUTLASS 3.x and has a TMA epilogue schedule."""
+    import cutlass_lib
+
+    result = False
+    if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
+        epilogue_schedule_str = str(op.epilogue_schedule).split(".")[-1]
+        result = epilogue_schedule_str.lower().startswith("tma")
+
+    return result
+
+
 def filter_cutlass_3x_ops(op_instance, func_attrs):
     """Filter out CUTLASS 3.x ops with incompatible alignment requirements.
 
@@ -1029,18 +1041,39 @@ def filter_cutlass_3x_ops(op_instance, func_attrs):
         func_attrs
     )
 
-    result = {}
+    result_2x, result_3x = {}, {}
     for op_name, op in op_instance.items():
         if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
             if (
-                op.A.alignment > a_alignment
-                or op.B.alignment > b_alignment
-                or op.C.alignment > epilogue_alignment
+                op.A.alignment <= a_alignment
+                and op.B.alignment <= b_alignment
+                and op.C.alignment <= epilogue_alignment
             ):
-                continue
-        result[op_name] = op
-
-    return result
+                result_3x[op_name] = op
+        else:
+            result_2x[op_name] = op
+
+    has_ops_with_tma_epilogue = False
+    if result_3x:
+        for op in result_3x.values():
+            if has_tma_epilogue(op):
+                has_ops_with_tma_epilogue = True
+                break
+
+        if has_ops_with_tma_epilogue:
+            # when there are ops with TMA epilogue, keep only those
+            # for better performance / shorter profiler compilation time
+            result_3x = {
+                op_name: op for op_name, op in result_3x.items() if has_tma_epilogue(op)
+            }
+
+    return {
+        # CUTLASS 3.x kernels can cause power throttling:
+        # we want to generate the 2.x kernels first to avoid
+        # performance side effects caused by the 3.x kernels
+        **result_2x,
+        **result_3x,
+    }, has_ops_with_tma_epilogue
 
 
 def gen_profiler(
@@ -1061,7 +1094,7 @@ def gen_profiler(
 
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
-    op_instance = filter_cutlass_3x_ops(op_instance, func_attrs)
+    op_instance, op_has_tma_epilogue = filter_cutlass_3x_ops(op_instance, func_attrs)
 
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(
@@ -1096,6 +1129,7 @@ def gen_profiler(
             problem_args_template_cutlass_3x.render(
                 elem_input_type=elem_input_type,
                 elem_output_type=elem_output_type,
+                has_tma_epilogue=op_has_tma_epilogue,
             )
             if problem_args_template_cutlass_3x is not None
             else ""
@@ -1343,15 +1377,10 @@ def default_fproc(
                 permute_layout
             ]
 
-        has_tma_epilogue = False
-        if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
-            epilogue_schedule_str = str(op.epilogue_schedule).split(".")[-1]
-            has_tma_epilogue = epilogue_schedule_str.lower().startswith("tma")
-
         # set C and D alignment
         alignments = alignment.get_alignments(dtype)
         for i in alignments:
-            if has_tma_epilogue and i != max(alignments):
+            if has_tma_epilogue(op) and i != max(alignments):
                 # TMA epilogues only support max. output alignment
                 continue
             op = copy.deepcopy(op)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
index e99ec324a..fb5e9ef38 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py
@@ -610,7 +610,7 @@ def gen_profiler(
 
     op_type = func_attrs["op"]
     op_instance = func_attrs["op_instance"]
-    op_instance = common.filter_cutlass_3x_ops(op_instance, func_attrs)
+    op_instance, _ = common.filter_cutlass_3x_ops(op_instance, func_attrs)
 
     backend_spec = CUDASpec()
     elem_input_type = backend_spec.dtype_to_lib_type(

From 3714b406125902f72a742e17769e2e7872832b2f Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 31 May 2023 09:18:04 -0700
Subject: [PATCH 562/638] Disable residual in SM90 kernels of gemm_rcr / rrr
 (#733)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/733

Since [7c04f95](https://github.com/NVIDIA/cutlass/commit/7c04f954151f606e60608061e891785fba229ae2) in `nvidia/cutlass` one can disable the residual to provide more SMEM for the mainloop. Here we're doing that for the bias-less GEMM ops.

Reviewed By: chenyang78

Differential Revision: D46265004

fbshipit-source-id: 626a6e5d0a76483e2b832fb68728f73e13055227
---
 .../backend/cuda/gemm_universal/gemm_rcr.py   | 11 ++++++--
 .../backend/cuda/gemm_universal/gemm_rrr.py   | 27 +++++++------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
index c81b00f5c..3f1d6aaa1 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py
@@ -86,7 +86,7 @@
     {input_b_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
     {
         {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementC const* ptr_C
+        nullptr,                                                 // ElementC const* ptr_C
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
@@ -137,7 +137,7 @@
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
         {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_output_type}}*)(c_ptr),                          // ElementC const* ptr_C
+        nullptr,                                                 // ElementC const* ptr_C
         {N, cute::Int<1>{}, cute::Int<0>{}},                     // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
@@ -150,6 +150,13 @@
 def gemm_rcr_config(func_attrs, dtype="float16"):
     common.make_fproc(func_attrs, RCR, include_cutlass_3x_ops=True)
 
+    import cutlass_lib
+
+    for op in func_attrs["op_instance"].values():
+        if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
+            # disable residual to leave more SMEM for the mainloop
+            op.C.element = cutlass_lib.library.DataType.void
+
 
 def common_gen_profiler(
     func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index c9bad8c46..696734094 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -23,6 +23,7 @@
 
 from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.cuda.gemm_universal import common
+from aitemplate.backend.cuda.gemm_universal.layout import RRR
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
@@ -85,7 +86,7 @@
     {cute::Int<1>{}, N, cute::Int<0>{}},                         // StrideB dB
     {
         {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_output_type}}*)(c_ptr),                          // ElementC const* ptr_C
+        nullptr,                                                 // ElementC const* ptr_C
         {N, cute::Int<1>{}, cute::Int<0>{}},                     // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
@@ -96,22 +97,14 @@
 
 @registry.reg("cuda.gemm_rrr.config")
 def gemm_rrr_config(func_attrs, dtype="float16"):
-    def fproc(op):
-        import cutlass_lib
-
-        return common.default_fproc(
-            op=op,
-            a_layout=cutlass_lib.library.LayoutType.RowMajor,
-            b_layout=cutlass_lib.library.LayoutType.RowMajor,
-            c_layout=cutlass_lib.library.LayoutType.RowMajor,
-            dtype=func_attrs["inputs"][0].dtype(),
-            epilogue_name=func_attrs["epilogue"],
-        )
-
-    func_attrs["op_instance"] = common.extract_config(
-        f_proc_op=fproc,
-        include_cutlass_3x_ops=True,
-    )
+    common.make_fproc(func_attrs, RRR, include_cutlass_3x_ops=True)
+
+    import cutlass_lib
+
+    for op in func_attrs["op_instance"].values():
+        if op.gemm_kind == cutlass_lib.library.GemmKind.Universal3x:
+            # disable residual to leave more SMEM for the mainloop
+            op.C.element = cutlass_lib.library.DataType.void
 
 
 @registry.reg("cuda.gemm_rrr.gen_profiler")

From 707d818ad4d62ce2ad5b78dee66ece6eace5f662 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 31 May 2023 09:50:05 -0700
Subject: [PATCH 563/638] Pass bias vector via epilogue schedule in SM90
 gemm_rcr_bias (#734)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/734

Since [7dbf423](https://github.com/NVIDIA/cutlass/commit/7dbf42376330230b9c5f0fe2a0ac1c167d1f1889) in `nvidia/cutlass` one can disable the residual and pass the bias vector through the `TmaWarpSpecializedBiasElementwise` epilogue schedule. When the bias vector is column-major, this leads to better performance. For that to work with the `gemm_rcr_bias` op (with the row-major bias and output) we use the transposed problem with swapped A / B and column-major bias and C.

Reviewed By: wushirong

Differential Revision: D46265703

fbshipit-source-id: 4954f1052709f0fdf7f90f93c53b374b98c07d0a
---
 .../cuda/gemm_universal/gemm_rcr_bias.py      | 107 ++++++++++++++++++
 .../utils/mk_cutlass_lib/extra_enum.py        |   6 +
 tests/unittest/ops/test_gemm_bias.py          |   1 +
 3 files changed, 114 insertions(+)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index 8d9d25cea..ec99423af 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -28,6 +28,14 @@
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
+EXTRA_CODE = jinja2.Template(
+    """
+using elem_input_type = {{elem_input_type}};
+using elem_output_type = {{elem_output_type}};
+"""
+)
+
+
 # used for real execution
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
@@ -55,9 +63,31 @@
 )
 
 
+# in case of TMA epilogue schedule, use the transposed problem to pass the
+# column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+{% if has_tma_epilogue %}
+    {
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,              // ElementA const* ptr_A
+    {input_b_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideA dA
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,              // ElementB const* ptr_B
+    {input_a_stride, cute::Int<1>{}, cute::Int<0>{}},            // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
+    },                                                           // EpilogueArguments epilogue
+{% else %}
     {
         static_cast<coord_t>(M),
         static_cast<coord_t>(N),
@@ -75,6 +105,7 @@
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
     },                                                           // EpilogueArguments epilogue
+{% endif %}
 """
 )
 
@@ -106,9 +137,31 @@
 )
 
 
+# in case of TMA epilogue schedule, use the transposed problem to pass the
+# column-major bias vector through the bias + elementwise epilogue (not residual)
 PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
+{% if has_tma_epilogue %}
+    {
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(K),
+        static_cast<coord_t>(1)
+    },                                                           // ProblemShape problem_shape
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
+    {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
+    {
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
+        ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
+    },                                                           // EpilogueArguments epilogue
+{% else %}
     {
         static_cast<coord_t>(M),
         static_cast<coord_t>(N),
@@ -126,6 +179,7 @@
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
         {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
     },                                                           // EpilogueArguments epilogue
+{% endif %}
 """
 )
 
@@ -134,9 +188,52 @@
 def gemm_rcr_config(func_attrs, dtype="float16"):
     common.make_fproc(func_attrs, RCR, include_cutlass_3x_ops=True)
 
+    import cutlass_lib
+
+    for op in func_attrs["op_instance"].values():
+        if common.has_tma_epilogue(op):
+            # disable residual to leave more SMEM for the mainloop
+            op.C.element = cutlass_lib.library.DataType.void
+
+            # swap the output layout to the transposed problem
+            op.C.layout = cutlass_lib.library.LayoutType.ColumnMajor
+            op.D.layout = cutlass_lib.library.LayoutType.ColumnMajor
+
+            # change the TMA epilogue schedule to
+            # the corresponding bias + elementwise one
+            if (
+                op.epilogue_schedule
+                == cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecialized
+            ):
+                op.epilogue_schedule = (
+                    cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedBiasElementwise
+                )
+            elif (
+                op.epilogue_schedule
+                == cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedCooperative
+            ):
+                op.epilogue_schedule = (
+                    cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise
+                )
+            else:
+                raise ValueError(
+                    f"Unexpected epilouge schedule type: {op.epilogue_schedule}."
+                )
+
 
 @registry.reg("cuda.gemm_rcr_bias.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    extra_code = EXTRA_CODE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return gemm_rcr.common_gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
@@ -146,6 +243,7 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
         problem_args_template=PROFILER_PROBLEM_ARGS_TEMPLATE,
         problem_args_template_cutlass_3x=PROFILER_PROBLEM_ARGS_TEMPLATE_CUTLASS_3X,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
+        extra_code=extra_code,
     )
 
 
@@ -173,6 +271,14 @@ def gen_function(
     problem_args_cutlass_3x = PROBLEM_ARGS_TEMPLATE_CUTLASS_3X.render(
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
+        has_tma_epilogue=any(
+            common.has_tma_epilogue(func_attrs["op_instance"][exec_item.algo])
+            for exec_item in func_attrs["exec_path"].values()
+        ),
+    )
+    extra_code = EXTRA_CODE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
     )
     return common.gen_function(
         func_attrs=func_attrs,
@@ -189,6 +295,7 @@ def gen_function(
         output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
             stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
         ),
+        extra_code=extra_code,
     )
 
 
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index 0cab52d38..8d731fb2e 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -166,6 +166,8 @@ class EpilogueScheduleType(enum.Enum):
   TmaWarpSpecializedCooperativeElementwiseGELU = enum_auto()
   TmaWarpSpecializedElementwiseFastGELU = enum_auto()
   TmaWarpSpecializedCooperativeElementwiseFastGELU = enum_auto()
+  TmaWarpSpecializedBiasElementwise = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwise = enum_auto()
 
 EpilogueScheduleTag = {
   EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
@@ -187,6 +189,8 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::GELU>',
   EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedElementwise<cutlass::epilogue::thread::GELU_taylor>',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::GELU_taylor>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwise: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::Identity, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::Identity, elem_input_type, cutlass::plus, false, elem_input_type>',
 }
 
 EpilogueScheduleSuffixes = {
@@ -209,6 +213,8 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU: '_epi_tma_gelu',
   EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU: '_epi_tma_fast_gelu',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: '_epi_tma_fast_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwise: '_epi_tma_bias',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise: '_epi_tma_bias',
 }
 
 EpilogueScheduleMapping = {
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index b5cc9a217..cd276b739 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -111,6 +111,7 @@ def test_rcr_sm90(self) -> None:
         with env_variables(
             AIT_FORCE_CUTLASS_SM90_KERNELS="1",
             INSIDE_RE_WORKER="1",
+            FORCE_PROFILE="1",
         ):
             with self.assertRaisesRegex(
                 expected_exception=RuntimeError,

From 6204f621cced07a76224f99fec279afb82a23b28 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Wed, 31 May 2023 15:15:32 -0700
Subject: [PATCH 564/638] Pass bias vector via epilogue schedule in SM90
 gemm_rcr_bias_activation ops (#735)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/735

Since [7dbf423](https://github.com/NVIDIA/cutlass/commit/7dbf42376330230b9c5f0fe2a0ac1c167d1f1889) in `nvidia/cutlass` one can disable the residual and pass the bias vector through the `TmaWarpSpecializedBiasElementwise` epilogue schedule. When the bias vector is column-major, this leads to better performance. Here we're applying this for the family of `gemm_rcr_bias_<activation>` ops. As these ops are always relying on a TMA epilogue (as the activation functor only works with those), there is no need to distinguish between TMA and non-TMA (as is done for the `gemm_rcr_bias` op that can, in principle, have both).

Reviewed By: chenyang78

Differential Revision: D46266799

fbshipit-source-id: 83aa6e6f50579598b2f0b84092142f2f1e53ba4a
---
 .../gemm_universal/common_bias_activation.py  | 46 +++++++++++++-
 .../cuda/gemm_universal/gemm_rcr_bias.py      | 26 ++------
 .../gemm_universal/gemm_rcr_bias_fast_gelu.py | 17 +++---
 .../cuda/gemm_universal/gemm_rcr_bias_gelu.py | 17 +++---
 .../gemm_universal/gemm_rcr_bias_hardswish.py | 17 +++---
 .../cuda/gemm_universal/gemm_rcr_bias_relu.py | 17 +++---
 .../gemm_universal/gemm_rcr_bias_sigmoid.py   | 17 +++---
 .../gemm_universal/gemm_rcr_bias_swish.py     | 17 +++---
 .../cuda/gemm_universal/gemm_rcr_bias_tanh.py | 17 +++---
 .../cuda/gemm_universal/gemm_rcr_fast_gelu.py |  8 +--
 .../utils/mk_cutlass_lib/extra_enum.py        | 61 +++++++++++++++++++
 11 files changed, 184 insertions(+), 76 deletions(-)

diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
index 3bcd567cf..e8c7af5b6 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py
@@ -16,6 +16,7 @@
 """
 Common codegen functions for gemm_bias_activation.
 """
+import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.cuda.gemm_universal import common, common_bias, gemm_rcr
@@ -24,6 +25,14 @@
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
 
+EXTRA_CODE_HEADER = jinja2.Template(
+    """
+using elem_input_type = {{elem_input_type}};
+using elem_output_type = {{elem_output_type}};
+"""
+)
+
+
 def gemm_rcr_config(
     func_attrs,
     dtype="float16",
@@ -35,6 +44,24 @@ def gemm_rcr_config(
         include_cutlass_3x_ops=include_cutlass_3x_ops,
     )
 
+    import cutlass_lib
+
+    for op in func_attrs["op_instance"].values():
+        if common.has_tma_epilogue(op):
+            # disable residual to leave more SMEM for the mainloop
+            op.C.element = cutlass_lib.library.DataType.void
+
+            # swap the output layout to the transposed problem
+            op.C.layout = cutlass_lib.library.LayoutType.ColumnMajor
+            op.D.layout = cutlass_lib.library.LayoutType.ColumnMajor
+
+            # switch to a TMA epilogue with bias
+            op.epilogue_schedule = (
+                cutlass_lib.library.EpilogueScheduleBiasElementwiseMapping[
+                    op.epilogue_schedule
+                ]
+            )
+
 
 def gen_profiler(
     func_attrs,
@@ -45,6 +72,17 @@ def gen_profiler(
     problem_args_template_cutlass_3x=None,
     extra_code="",
 ):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    extra_code_header = EXTRA_CODE_HEADER.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return gemm_rcr.common_gen_profiler(
         func_attrs=func_attrs,
         workdir=workdir,
@@ -54,7 +92,7 @@ def gen_profiler(
         problem_args_template=problem_args_template,
         problem_args_template_cutlass_3x=problem_args_template_cutlass_3x,
         bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
-        extra_code=extra_code,
+        extra_code="\n\n".join([extra_code_header, extra_code]),
     )
 
 
@@ -86,6 +124,10 @@ def gen_function(
             elem_input_type=elem_input_type,
             elem_output_type=elem_output_type,
         )
+    extra_code_header = EXTRA_CODE_HEADER.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
     return common.gen_function(
         func_attrs=func_attrs,
         src_template=common_bias.SRC_TEMPLATE,
@@ -101,7 +143,7 @@ def gen_function(
             stride_dim="N",
             output_accessor=func_attrs["output_accessors"][0],
         ),
-        extra_code=extra_code,
+        extra_code="\n\n".join([extra_code_header, extra_code]),
     )
 
 
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
index ec99423af..1464383bc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py
@@ -199,26 +199,12 @@ def gemm_rcr_config(func_attrs, dtype="float16"):
             op.C.layout = cutlass_lib.library.LayoutType.ColumnMajor
             op.D.layout = cutlass_lib.library.LayoutType.ColumnMajor
 
-            # change the TMA epilogue schedule to
-            # the corresponding bias + elementwise one
-            if (
-                op.epilogue_schedule
-                == cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecialized
-            ):
-                op.epilogue_schedule = (
-                    cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedBiasElementwise
-                )
-            elif (
-                op.epilogue_schedule
-                == cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedCooperative
-            ):
-                op.epilogue_schedule = (
-                    cutlass_lib.library.EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise
-                )
-            else:
-                raise ValueError(
-                    f"Unexpected epilouge schedule type: {op.epilogue_schedule}."
-                )
+            # switch to a TMA epilogue with bias
+            op.epilogue_schedule = (
+                cutlass_lib.library.EpilogueScheduleBiasElementwiseMapping[
+                    op.epilogue_schedule
+                ]
+            )
 
 
 @registry.reg("cuda.gemm_rcr_bias.gen_profiler")
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
index 709aee2e7..e88acc73d 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py
@@ -86,25 +86,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
index 610e50c26..4d577b5e9 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py
@@ -50,25 +50,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
index 76ba16508..3524f0c81 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py
@@ -50,25 +50,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
index a65d4f6a5..b95f2bc10 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py
@@ -51,25 +51,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
index dd079c87a..1d4c20f23 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -51,25 +51,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
index 4012e0d71..91e17d474 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py
@@ -51,25 +51,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
index 6d4f13005..afd0b09b4 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py
@@ -86,25 +86,28 @@
 )
 
 
+# as the epilouge schedule is always TMA, always use the transposed problem to pass
+# the column-major bias vector through the bias + elementwise epilogue (not residual)
 PROBLEM_ARGS_TEMPLATE_CUTLASS_3X = jinja2.Template(
     """
     cutlass::gemm::GemmUniversalMode::kGemm,                     // GemmUniversalMode mode
     {
-        static_cast<coord_t>(M),
         static_cast<coord_t>(N),
+        static_cast<coord_t>(M),
         static_cast<coord_t>(K),
         static_cast<coord_t>(1)
     },                                                           // ProblemShape problem_shape
-    ({{elem_input_type}}*)(a_ptr),                               // ElementA const* ptr_A
+    ({{elem_input_type}}*)(b_ptr),                               // ElementA const* ptr_A
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideA dA
-    ({{elem_input_type}}*)(b_ptr),                               // ElementB const* ptr_B
+    ({{elem_input_type}}*)(a_ptr),                               // ElementB const* ptr_B
     {K, cute::Int<1>{}, cute::Int<0>{}},                         // StrideB dB
     {
-        {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename ThreadEpilogueOp::Params thread
-        ({{elem_input_type}}*)(bias_ptr),                        // ElementC const* ptr_C
-        {cute::Int<0>{}, cute::Int<1>{}, cute::Int<0>{}},        // StrideC dC
+        {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // typename ThreadEpilogueOp::Params thread
+        nullptr,                                                 // ElementC const* ptr_C
+        {cute::Int<1>{}, cute::Int<0>{}, cute::Int<0>{}},        // StrideC dC
         ({{elem_output_type}}*)(c_ptr) + output_offset,          // ElementD const* ptr_D
-        {output_stride, cute::Int<1>{}, cute::Int<0>{}},         // StrideD dD
+        {cute::Int<1>{}, output_stride, cute::Int<0>{}},         // StrideD dD
+        ({{elem_input_type}}*)(bias_ptr),                        // ElementBias const* ptr_Bias
     },                                                           // EpilogueArguments epilogue
 """
 )
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
index 6b3719def..a0619f56a 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_fast_gelu.py
@@ -26,6 +26,8 @@
     common_bias_activation,
     common_no_bias,
 )
+from aitemplate.backend.cuda.gemm_universal.layout import RCR
+
 
 # pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
 
@@ -118,11 +120,7 @@
 
 @registry.reg("cuda.gemm_rcr_fast_gelu.config")
 def gemm_rcr_config(func_attrs, dtype="float16"):
-    return common_bias_activation.gemm_rcr_config(
-        func_attrs=func_attrs,
-        dtype=dtype,
-        include_cutlass_3x_ops=True,
-    )
+    common.make_fproc(func_attrs, RCR, include_cutlass_3x_ops=True)
 
 
 @registry.reg("cuda.gemm_rcr_fast_gelu.gen_profiler")
diff --git a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
index 8d731fb2e..aa674561b 100644
--- a/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
+++ b/python/aitemplate/utils/mk_cutlass_lib/extra_enum.py
@@ -168,6 +168,20 @@ class EpilogueScheduleType(enum.Enum):
   TmaWarpSpecializedCooperativeElementwiseFastGELU = enum_auto()
   TmaWarpSpecializedBiasElementwise = enum_auto()
   TmaWarpSpecializedCooperativeBiasElementwise = enum_auto()
+  TmaWarpSpecializedBiasElementwiseRelu = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseRelu = enum_auto()
+  TmaWarpSpecializedBiasElementwiseSigmoid = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseSigmoid = enum_auto()
+  TmaWarpSpecializedBiasElementwiseSiLu = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseSiLu = enum_auto()
+  TmaWarpSpecializedBiasElementwiseTanh = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseTanh = enum_auto()
+  TmaWarpSpecializedBiasElementwiseHardSwish = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseHardSwish = enum_auto()
+  TmaWarpSpecializedBiasElementwiseGELU = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseGELU = enum_auto()
+  TmaWarpSpecializedBiasElementwiseFastGELU = enum_auto()
+  TmaWarpSpecializedCooperativeBiasElementwiseFastGELU = enum_auto()
 
 EpilogueScheduleTag = {
   EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
@@ -191,6 +205,20 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeElementwise<cutlass::epilogue::thread::GELU_taylor>',
   EpilogueScheduleType.TmaWarpSpecializedBiasElementwise: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::Identity, elem_input_type, cutlass::plus, false, elem_input_type>',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::Identity, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::ReLu, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseRelu: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::ReLu, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSigmoid: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::Sigmoid, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSigmoid: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::Sigmoid, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSiLu: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::SiLu, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSiLu: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::SiLu, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseTanh: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::Tanh, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseTanh: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::Tanh, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseHardSwish: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::HardSwish, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseHardSwish: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::HardSwish, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseGELU: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::GELU, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::GELU, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedBiasElementwise<cutlass::epilogue::thread::GELU_taylor, elem_input_type, cutlass::plus, false, elem_input_type>',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseFastGELU: 'cutlass::epilogue::TmaWarpSpecializedCooperativeBiasElementwise<cutlass::epilogue::thread::GELU_taylor, elem_input_type, cutlass::plus, false, elem_input_type>',
 }
 
 EpilogueScheduleSuffixes = {
@@ -215,6 +243,20 @@ class EpilogueScheduleType(enum.Enum):
   EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: '_epi_tma_fast_gelu',
   EpilogueScheduleType.TmaWarpSpecializedBiasElementwise: '_epi_tma_bias',
   EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise: '_epi_tma_bias',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseRelu: '_epi_tma_bias_relu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseRelu: '_epi_tma_bias_relu',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSigmoid: '_epi_tma_bias_sigmoid',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSigmoid: '_epi_tma_bias_sigmoid',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSiLu: '_epi_tma_bias_silu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSiLu: '_epi_tma_bias_silu',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseTanh: '_epi_tma_bias_tanh',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseTanh: '_epi_tma_bias_tanh',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseHardSwish: '_epi_tma_bias_hardswish',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseHardSwish: '_epi_tma_bias_hardswish',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseGELU: '_epi_tma_bias_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseGELU: '_epi_tma_bias_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseFastGELU: '_epi_tma_bias_fast_gelu',
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseFastGELU: '_epi_tma_bias_fast_gelu',
 }
 
 EpilogueScheduleMapping = {
@@ -240,6 +282,25 @@ class EpilogueScheduleType(enum.Enum):
   },
 }
 
+EpilogueScheduleBiasElementwiseMapping = {
+  EpilogueScheduleType.TmaWarpSpecialized: EpilogueScheduleType.TmaWarpSpecializedBiasElementwise,
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwise,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseRelu: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseRelu,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseRelu: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseRelu,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSigmoid: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSigmoid,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSigmoid: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSigmoid,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseSiLu: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseSiLu,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseSiLu: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseSiLu,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseTanh: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseTanh,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseTanh: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseTanh,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseHardSwish: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseHardSwish,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseHardSwish: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseHardSwish,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseGELU: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseGELU,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseGELU,
+  EpilogueScheduleType.TmaWarpSpecializedElementwiseFastGELU: EpilogueScheduleType.TmaWarpSpecializedBiasElementwiseFastGELU,
+  EpilogueScheduleType.TmaWarpSpecializedCooperativeElementwiseFastGELU: EpilogueScheduleType.TmaWarpSpecializedCooperativeBiasElementwiseFastGELU,
+}
+
 """
 )
 

From a54b4c9ec3e12d5af9dbed6aac2088495a8aed6b Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 1 Jun 2023 10:14:04 +0800
Subject: [PATCH 565/638] fix gemm hardswish

---
 .../backend/rocm/gemm/gemm_rcr_bias_hardswish.py     | 12 ++++++------
 .../compiler/transform/transform_strided_ops.py      |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py
index 4ecade28f..5a0c6e1f6 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py
@@ -26,7 +26,7 @@
 # pylint: disable=C0415,W0613
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.config")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.config")
 def gemm_config(func_attrs, dtype="float16"):
     """Extract (operation name, operation instance) pair from
     all operation candidates.
@@ -49,7 +49,7 @@ def gemm_config(func_attrs, dtype="float16"):
     common.make_fproc_f16(func_attrs, RCR, op_kind, extra_kind)
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.gen_profiler")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.gen_profiler")
 def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
     """Generates standalone executables for profiler.
 
@@ -72,7 +72,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
     )
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.gen_function")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.gen_function")
 def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     """Generates function body.
 
@@ -106,7 +106,7 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     )
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.func_decl")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.func_decl")
 def gemm_gen_function_decl(func_attrs):
     """Generates function declarations.
 
@@ -124,7 +124,7 @@ def gemm_gen_function_decl(func_attrs):
     return common.gen_function_decl(func_name=func_name, gemm_flag="bias_hardswish")
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.func_call")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.func_call")
 def gemm_gen_function_call(func_attrs, indent="  "):
     """Generates function call.
 
@@ -143,7 +143,7 @@ def gemm_gen_function_call(func_attrs, indent="  "):
     return common.gen_function_call(func_attrs, indent, gemm_flag="bias_hardswish")
 
 
-@registry.reg("rocm.gemm_rcr_bias_swish.filter")
+@registry.reg("rocm.gemm_rcr_bias_hardswish.filter")
 def gemm_function_filter(cfg, func_attrs, x_shape):
     """Generates function filter.
 
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index d1778ad4a..6ff7f336e 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -482,6 +482,7 @@ def transform_strided_ops(
     else:
         funcs = [
             # Keep on ROCM
+            _fuse_slices_concat_reshape_concat,
             _fuse_strided_op_and_view_op,
             _fuse_strided_op_and_cat,
             _fuse_split_and_strided_op,

From 2146627a648a4d200a89aa3f0347ef9d36ef7058 Mon Sep 17 00:00:00 2001
From: Wei Wei <wwei6@meta.com>
Date: Thu, 1 Jun 2023 09:23:22 -0700
Subject: [PATCH 566/638] fix hstu unit test in fx2ait (#743)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/743

as titled

Reviewed By: hl475

Differential Revision: D46347802

fbshipit-source-id: 11c507933a66de302a0aeae2da0b4c3642a628bd
---
 fx2ait/fx2ait/tools/common_fx2ait.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index d8e15b939..fd0298442 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -125,7 +125,7 @@ def run_test(
         for p in passes:
             mod = p(mod, inputs)
 
-        logger.info(f"{mod.graph=}")
+        logger.info(f"{mod.graph}")
 
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
@@ -264,7 +264,7 @@ def run_test_with_dynamic_shape(
         )
         for p in passes:
             mod = p(mod, inputs_min)
-        logger.info(f"{mod.graph=}")
+        logger.info(f"{mod.graph}")
 
         original_inputs = inputs_min
         # Trace and test with inputs_min

From daf1624c16263512077f3dcf50cfd65b663adae2 Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Thu, 1 Jun 2023 12:05:45 -0700
Subject: [PATCH 567/638] update conv2d (#725)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/725

Reviewed By: chenyang78

Differential Revision: D46342892

Pulled By: ipiszy

fbshipit-source-id: 8d9412f3cdc62e49ee226ef68b351380ae072b0f
---
 .../backend/rocm/conv2d/__init__.py           |  2 +
 .../aitemplate/backend/rocm/conv2d/common.py  | 54 ++++++-----
 .../backend/rocm/conv2d/conv2d_bias_add.py    | 92 +++++++++++++++++++
 .../rocm/conv2d/conv2d_bias_add_relu.py       | 30 ++----
 .../rocm/conv2d/conv2d_bias_sigmoid.py        |  9 +-
 .../backend/rocm/conv2d/transposed_conv2d.py  |  2 +-
 6 files changed, 135 insertions(+), 54 deletions(-)
 create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d_bias_add.py

diff --git a/python/aitemplate/backend/rocm/conv2d/__init__.py b/python/aitemplate/backend/rocm/conv2d/__init__.py
index 989ea243f..ddcd3131c 100644
--- a/python/aitemplate/backend/rocm/conv2d/__init__.py
+++ b/python/aitemplate/backend/rocm/conv2d/__init__.py
@@ -18,6 +18,7 @@
 from aitemplate.backend.rocm.conv2d import (
     conv2d,
     conv2d_bias,
+    conv2d_bias_add,
     conv2d_bias_add_relu,
     conv2d_bias_relu,
     conv2d_bias_sigmoid,
@@ -28,6 +29,7 @@
 __all__ = [
     "conv2d",
     "conv2d_bias",
+    "conv2d_bias_add",
     "conv2d_bias_add_relu",
     "conv2d_bias_relu",
     "conv2d_bias_sigmoid",
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 894818ae9..0d30e05c8 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -42,7 +42,7 @@
 {{indent}}                                {},
 {% elif conv2d_flag in ["bias", "bias_relu", "bias_sigmoid"] %}
 {{indent}}                                std::array<const void*, 1>{static_cast<ck::half_t *>(bias_ptr)},
-{% elif conv2d_flag == "bias_add_relu" %}
+{% elif conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
 {{indent}}                                std::array<const void*, 2>{static_cast<ck::half_t *>(bias_ptr), static_cast<ck::half_t *>(res_ptr)},
 {% endif %}
 {{indent}}                                static_cast<ck::half_t *>(out_ptr),
@@ -55,7 +55,7 @@
 {% elif conv2d_flag in ["bias", "bias_relu", "bias_sigmoid"] %}
 {{indent}}                                std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{ {d_g_n_k_wos_lengths} },
 {{indent}}                                std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{ {d_g_n_k_wos_strides} },
-{% elif conv2d_flag == "bias_add_relu" %}
+{% elif conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
 {{indent}}                                std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{ {d_g_n_k_wos_lengths, e_g_n_k_wos_lengths} },
 {{indent}}                                std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{ {d_g_n_k_wos_strides, e_g_n_k_wos_strides} },
 {% endif %}
@@ -75,6 +75,8 @@
 {{indent}}                                ck::tensor_operation::element_wise::AddRelu{}
 {% elif conv2d_flag == "bias_sigmoid" %}
 {{indent}}                                ck::tensor_operation::element_wise::AddSigmoid{}
+{% elif conv2d_flag == "bias_add_identity" %}
+{{indent}}                                ck::tensor_operation::element_wise::AddAdd{}
 {% elif conv2d_flag == "bias_add_relu" %}
 {{indent}}                                ck::tensor_operation::element_wise::AddAddRelu{}
 {% endif %}
@@ -89,9 +91,7 @@
 {{problem_args}}
 {{indent}});
 {{indent}}if(!op.IsSupportedArgument(argument)) {
-{{indent}}  throw std::runtime_error(
-{{indent}}    "wrong! device_conv with the specified compilation parameters does "
-{{indent}}    "not support this Conv problem");
+{{indent}}  LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Conv problem.";
 {{indent}}}
 {% if is_profiler %}
 {{indent}}auto workspace_size = op.GetWorkSpaceSize(&argument);
@@ -104,7 +104,7 @@
 
 HEADER_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 """
 )
 
@@ -118,6 +118,7 @@
 // #include <half.hpp>
 #include <random>
 #include <rocrand/rocrand.h>
+#include "logging.h"
 #include "include/ck/utility/print.hpp"
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
@@ -138,7 +139,7 @@
 {% if "bias" in conv2d_flag %}
     void * bias_ptr,
 {% endif %}
-{% if conv2d_flag == "bias_add_relu" %}
+{% if conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
     void * res_ptr,
 {% endif %}
     int64_t* batch,
@@ -249,7 +250,7 @@
 {% if "bias" in conv2d_flag %}
 {{indent}}    {{bias_ptr}},
 {% endif %}
-{% if conv2d_flag == "bias_add_relu" %}
+{% if conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
 {{indent}}    {{res_ptr}},
 {% endif %}
 {{indent}}    {{p_batch}},
@@ -303,7 +304,7 @@
 {% if "bias" in conv2d_flag %}
   memory_pool->AllocateHalfTensor(CO, 8);  // b: index 3
 {% endif %}
-{% if conv2d_flag == "bias_add_relu" %}
+{% if conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
   memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // r: index 4
 {% endif %}
 """
@@ -461,8 +462,9 @@
     {{func_call}}
   }
   timer->End();
-  std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
-  std::cout << "TIME:" << timer->GetElapsedTime() << std::endl;
+  std::cout << "OP:" << "{{op_name}}" << ",";
+  std::cout << "TIME:" << timer->GetElapsedTime() << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
   delete(timer);
 }
 """
@@ -478,7 +480,7 @@
 {% if "bias" in conv2d_flag %}
   void *,
 {% endif %}
-{% if conv2d_flag == "bias_add_relu" %}
+{% if conv2d_flag in ["bias_add_relu", "bias_add_identity"] %}
   void *,
 {% endif %}
   int64_t*,
@@ -568,7 +570,6 @@ def gen_profiler(
     src_template=SRC_TEMPLATE,
     prob_args_template=PROBLEM_ARGS_TEMPLATE,
 ):
-
     """Generates standalone executables for profiler.
 
     Parameters
@@ -581,7 +582,7 @@ def gen_profiler(
         Generates shape calculation.
         The template is passed from compiler/ops/pool.
     conv2d_flag : str
-        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu'.
+        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu','bias_add_identity'.
     extra_code : str
         Extra code for self-defined operators.
     """
@@ -599,9 +600,12 @@ def gen_profiler(
         w_dim0="out_ch",
         w_dim1="kernel_h",
         w_dim2="kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="stride",
+        dilateh="dilation",
+        padh="pad",
+        stridew="stride",
+        dilatew="dilation",
+        padw="pad",
     )
     file_pairs = []
     for op_name, op in op_instance.items():
@@ -662,6 +666,7 @@ def gen_profiler(
             args_parse=args_parse,
             tensor_decl=tensor_decl,
             func_call=func_call,
+            op_name=op_name,
         )
         prefix = os.path.join(workdir, "profiler", op_type)
         if not os.path.exists(prefix):
@@ -700,7 +705,7 @@ def gen_function(
         Generates output dimensions.
         The template is passed from compiler/ops/pool.
     conv2d_flag : str
-        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu'.
+        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu','bias_add_identity'.
     extra_code : str
         Extra code for self-defined operators.
 
@@ -740,9 +745,12 @@ def gen_function(
         w_dim0="*out_ch",
         w_dim1="*kernel_h",
         w_dim2="*kernel_w",
-        stride="stride",
-        dilate="dilation",
-        pad="pad",
+        strideh="stride",
+        dilateh="dilation",
+        padh="pad",
+        stridew="stride",
+        dilatew="dilation",
+        padw="pad",
         div="/",
     )
     shape_save_func = shape_save_template.render(
@@ -786,7 +794,7 @@ def gen_function_decl(func_name, conv2d_flag):
     func_attrs : Dict
         Operation attributes.
     conv2d_flag : str
-        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu'.
+        Flag telling which backend should be generated. options are '','bias','bias_relu','bias_add_relu','bias_add_identity'.
 
     Returns
     -------
@@ -822,7 +830,7 @@ def gen_function_call(func_attrs, indent="  ", conv2d_flag=""):
     if "bias" in conv2d_flag:
         b = func_attrs["inputs"][2]
         bias_ptr = b._attrs["name"]
-    if "bias_add_relu" == conv2d_flag:
+    if conv2d_flag in ["bias_add_relu", "bias_add_identity"]:
         r = func_attrs["inputs"][3]
         res_ptr = r._attrs["name"]
     return FUNC_CALL_TEMPLATE.render(
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add.py
new file mode 100644
index 000000000..3c8f0e3ba
--- /dev/null
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add.py
@@ -0,0 +1,92 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+conv2d bias add codegen
+"""
+from ... import registry
+from . import common
+
+# pylint: disable=C0103,C0415,W0613,C0301
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.config")
+def conv2d_config(func_attrs):
+    import ck_lib
+
+    op_kind = ck_lib.library.Conv2dKind.GroupConv2dBiasRelu
+    extra_kind = ck_lib.library.TensorOperation.AddAdd
+    func_attrs["op_instance"] = common.extract_config(op_kind, extra_kind)
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.gen_profiler")
+def gen_profiler(func_attrs, workdir, shape_template):
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        shape_template=shape_template,
+        conv2d_flag="bias_add_identity",
+        extra_code=common.HEADER_CODE.render(),
+    )
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_remplate,
+    shape_eval_template,
+    shape_save_template,
+):
+    return common.gen_function(
+        func_attrs,
+        exec_cond_remplate,
+        shape_eval_template,
+        shape_save_template,
+        "bias_add_identity",
+        common.HEADER_CODE.render(),
+    )
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.func_decl")
+def conv2d_gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    return common.gen_function_decl(
+        func_name=func_name, conv2d_flag="bias_add_identity"
+    )
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.func_call")
+def conv2d_gen_function_call(func_attrs, indent="  "):
+    return common.gen_function_call(func_attrs, indent, conv2d_flag="bias_add_identity")
+
+
+@registry.reg("rocm.conv2d_bias_add_identity.filter")
+def conv2d_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    x_shape:
+        Input shapes.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
index 79d19bf1b..190f85694 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
@@ -24,9 +24,8 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 
-#include "data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -34,28 +33,11 @@
 namespace {
 struct AddAddRelu
 {
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
-    {
-        half_t a = x0 + x1 + x2;
-        y = a > 0 ? a : 0;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(float& y, const float& x0, const float& x1, const float& x2) const
-    {
-        float a = x0 + x1+ x2;
-        float b = a > 0 ? a : 0;
-        y       = b;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
-    {
-        float a = x0 + x1 + x2;
-        float b = a > 0 ? a : 0;
-        y       = b;
-    }
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1, const T& x2) const{
+        ck::tensor_operation::element_wise::AddAdd{}(y, x0, x1, x2);
+        ck::tensor_operation::element_wise::Relu{}(y, y);
+    };
 };
 } // namespace
 } // namespace element_wise
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index 8449dc1de..f43e42317 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -24,9 +24,9 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -35,8 +35,7 @@
 struct AddSigmoid
 {
     template <typename T>
-    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
-
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;   
     template <>
     __host__ __device__ constexpr void
     operator()<float>(float& y, const float& x0, const float& x1) const
@@ -44,7 +43,6 @@
         const float a = x0 + x1;
         y             = 1.0f / (1.0f + exp(-a));
     };
-
     template <>
     __host__ __device__ constexpr void
     operator()<double>(double& y, const double& x0, const double& x1) const
@@ -52,7 +50,6 @@
         const double a = x0 + x1;
         y              = 1.0 / (1.0 + exp(-a));
     };
-
     template <>
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
diff --git a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
index f14a6f57e..77659bcd3 100644
--- a/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
+++ b/python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py
@@ -23,7 +23,7 @@
 # pylint: disable=C0103,C0415,W0613
 EXTRA_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp"
 """
 )
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(

From d49a2cdb096a4241de1800fc7b434d18b333104e Mon Sep 17 00:00:00 2001
From: Mu-Chu Lee <mlee8@meta.com>
Date: Thu, 1 Jun 2023 16:43:21 -0700
Subject: [PATCH 568/638] Add BFloat16 for debugging and messaging (#744)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/744

(1) We add the mapping of BFloat16 for printing out BFloat16's enum
(2) We add BFloat16 into debugging function in load_merge_net

Reviewed By: khabinov

Differential Revision: D46362371

fbshipit-source-id: 65c38119afefe7e7574d102989b76889c0f38612
---
 static/csrc/model_container.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
index add4ea980..5548a97f0 100644
--- a/static/csrc/model_container.cpp
+++ b/static/csrc/model_container.cpp
@@ -30,6 +30,8 @@ std::string GetEnumString(AITemplateDtype dtype) {
       return "kInt";
     case AITemplateDtype::kLong:
       return "kLong";
+    case AITemplateDtype::kBFloat16:
+      return "kBFloat16";
     default:
       return "unknown";
   }

From 98e6a135252db1705469cb736e3b96ae43645424 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Fri, 2 Jun 2023 07:20:26 -0700
Subject: [PATCH 569/638] Grouped Classic B2B BMM op 2 ( padded -> jagged
 operator implementation ) (#737)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/737

Implementation of a jagged (grouped) version of the classic b2b bmm operator.

This diff builds on another diff in the stack, which copied the implementation of the ungrouped classic b2b bmm op so that a new classic b2b bmm operator is created. This diff is about modifying the new operator to implement a jagged / grouped version.

This diff supersedes the diff D45880465, which contains a configurable version including code that turned out not to improve the performance. This code has been removed in this version.

Reviewed By: aakhundov

Differential Revision: D46277337

fbshipit-source-id: 61a8e9d1bce9b23fbcb4f77ce3cd073d4abb3050
---
 .../cuda/b2b_bmm/grouped_classic_b2b_bmm.py   | 101 ++--
 .../ops/b2b_bmm/grouped_classic_b2b_bmm.py    | 131 ++--
 .../device/b2b_batched_gemm.h                 |  21 +-
 .../kernel/b2b_batched_gemm.h                 |  72 ++-
 .../kernel/default_b2b_batched_gemm.h         |  13 +-
 .../threadblock/b2b_mma_multistage.h          |  93 ++-
 tests/unittest/ops/test_b2b_bmm.py            | 175 ------
 .../ops/test_grouped_classic_b2b_bmm.py       | 567 ++++++++++++++++++
 8 files changed, 807 insertions(+), 366 deletions(-)
 create mode 100644 tests/unittest/ops/test_grouped_classic_b2b_bmm.py

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
index c2e9124a9..ce3ac19bc 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
@@ -53,17 +53,16 @@
 // gemm 0 and see what the perf result is.
 constexpr int WarpM = 16;
 
-constexpr int N0 = {{n0}};
-constexpr int N1 = {{n1}};
+constexpr int N0 = {{max_seq_len}};
+constexpr int N1 = {{n1}}; // embedding size of value
 
-void check_status(cutlass::Status status, int64_t m0, int64_t k0, const std::string& message) {
+void check_status(cutlass::Status status, int64_t max_seq_len, int64_t k0, const std::string& message) {
   if (status != cutlass::Status::kSuccess) {
       throw std::runtime_error(
         message +
         "Function: {{function_name}}. "
-        "m0: " + std::to_string(m0) +
+        "max_seq_len: " + std::to_string(max_seq_len) +
         ", k0: " + std::to_string(k0) +
-        ", n0: " + std::to_string({{n0}}) +
         ", n1: " + std::to_string({{n1}}) + "."
       );
   }
@@ -81,7 +80,7 @@
   ElementCompute beta0 = ElementCompute(1);
   ElementCompute activation_alpha = ElementCompute({{alpha1}});
   {% if alpha1_divide_by_seq_len %}
-  activation_alpha = activation_alpha / (ElementCompute)(static_cast<int32_t>(m0));
+  activation_alpha = activation_alpha / (ElementCompute)(static_cast<int32_t>(max_seq_len));
   {% endif %}
   ElementCompute alpha1 = ElementCompute(1);
   ElementCompute beta1 = ElementCompute(0);
@@ -132,11 +131,13 @@
     EpilogueOutputOp1,
     cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
     3,
-    {{has_causal}} // enable causal mask after gemm0
+    {{has_causal}}, // enable causal mask after gemm0
+    false, // stage accumulator in shared memory
+    {{offset_type}}
   >;
 
-  cutlass::gemm::GemmCoord problem_size_0(m0, {{n0}}, k0);
-  cutlass::gemm::GemmCoord problem_size_1(m0, {{n1}}, {{n0}});
+  cutlass::gemm::GemmCoord problem_size_0({{max_seq_len}}, {{max_seq_len}}, k0);
+  cutlass::gemm::GemmCoord problem_size_1({{max_seq_len}}, {{n1}}, {{max_seq_len}});
 
   // Assuming BMHD dim ordering for inputs and outputs, like in FHMA style op
   // B = batch size
@@ -145,17 +146,19 @@
   // D = embedding dims per head
   // --- Tensor shapes:
   // GEMM PROBLEM 0:
-  // A=query : [ batch_size, M0, num_heads, K0 ]
-  // B=key : [ batch_size, N0, num_heads, K0 ]
-  // C0=bias : [ batch_size, num_heads, M0, N0 ] # Where the batch size, head and M0 dimension may be broadcasted over
+  // jagged dims seq_len is in extra <...> brackets with the batch size
+
+  // A=query : [ <batch_size, M0=jagged_seq_len>, num_heads, K0 ]
+  // B=key : [ <batch_size, N0=jagged_seq_len>, num_heads, K0 ]
+  // C0=bias : [ batch_size, num_heads, max_seq_len, N0 ] # Where the batch size, head and M0 dimension may be broadcasted over
   // GEMM PROBLEM 1:
-  // B1=value : [ batch_size, K1==N0, num_heads, N1 ]
+  // B1=value : [ <batch_size, K1=jagged_seq_len>, num_heads, N1 ]
   // C1=unused:  [ N1 ]
-  // D1=output : [ batch_size, M1==M0, num_heads, N1 ]
+  // D1=output : [ <batch_size, M1=jagged_seq_len>, num_heads, N1 ]
+
+  // Required equalities for grouped / jagged B2B gemm:
+  // seq_len = M1 = M0 = N0 = K1;
 
-  // Required equalities for B2B gemm:
-  // M1 = M0;
-  // K1 = N0;
 
   typename GroupedB2bGemmBatched::Arguments arguments{
     problem_size_0, // = GemmCoord problem_size_0;
@@ -180,6 +183,7 @@
     num_heads * problem_size_1.m() * problem_size_1.n(),                                                                // int64_t batch_stride_output;
     batch_size,                                                                                                         // int batch_count;
     num_heads,                                                                                                          // int num_heads
+    static_cast<const {{offset_type}}*>(offsets),                                                                                       // const offset_t *
     {alpha0, beta0, activation_alpha},                                                                                  // typename EpilogueOutputOp0::Params epilogue0;
     {alpha1, beta1},                                                                                                    // typename EpilogueOutputOp1::Params epilogue1;
   };
@@ -187,17 +191,17 @@
   GroupedB2bGemmBatched b2b_gemm_op;
   check_status(
     b2b_gemm_op.can_implement(arguments),
-    m0, k0,
+    {{max_seq_len}}, k0,
     "Problem sizes are not supported."
   );
   check_status(
     b2b_gemm_op.initialize(arguments),
-    m0, k0,
+    {{max_seq_len}}, k0,
     "classic_b2b_bmm initialization failed!"
   );
   check_status(
     b2b_gemm_op(stream),
-    m0, k0,
+    {{max_seq_len}}, k0,
     "classic_b2b_bmm failed to execute!"
   );
 }
@@ -214,8 +218,9 @@
                    void* bias,
                    int64_t batch_size,
                    int64_t num_heads,
-                   int64_t m0,
+                   int64_t max_seq_len,
                    int64_t k0,
+                   const void *offsets,
                    cudaStream_t stream)
     """
 )
@@ -233,8 +238,9 @@
 {{indent}}    {{query}}, {{key}}, {{value}}, {{bias}},
 {{indent}}    {{batch_size}},
 {{indent}}    {{num_heads}},
-{{indent}}    {{m0}},
+{{indent}}    {{max_seq_len}},
 {{indent}}    {{k0}},
+{{indent}}    {{offsets}},
 {{indent}}    stream /* default stream */
 {{indent}});
     """
@@ -245,13 +251,11 @@
 def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
     """the function for generating attention kernel"""
     q, k, v, bias = func_attrs["inputs"]
-    seq_len_dim = 1
-    n0 = k._attrs["shape"][seq_len_dim]
+    max_seq_len = func_attrs["max_seq_len"]
     n1 = v._attrs["shape"][-1]
-    if not isinstance(n0, IntImm) or not isinstance(n1, IntImm):
-        raise RuntimeError(
-            f"n0 and n1 must be static dims. {func_attrs['name']=}, {n0=}, {n1=}"
-        )
+    jagged_intvar = q._attrs["shape"][0]
+    if not isinstance(n1, IntImm):
+        raise RuntimeError(f"n1 must be static dim. {func_attrs['name']=}, {n1=}")
     backend_spec = CUDASpec()
     if func_attrs["inputs"][0]._attrs["dtype"] != "float16":
         raise NotImplementedError(
@@ -314,7 +318,7 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         elem_input_type=elem_input_type,
         elem_output_type=elem_output_type,
         elem_accum_type=elem_accum_type,
-        n0=str(n0.value()),
+        max_seq_len=str(max_seq_len),
         n1=str(n1.value()),
         has_causal=(
             "true" if func_attrs["causal_type"] != CausalType.NO_CAUSAL else "false"
@@ -328,6 +332,7 @@ def classic_b2b_bmm_gen_function(func_attrs: Dict[str, Any]) -> str:
         bias_stride_n=bias_stride_n,
         bias_stride_mn=bias_stride_mn,
         bias_stride_hmn=bias_stride_hmn,
+        offset_type=jagged_intvar.offsets_type(),
     )
 
 
@@ -349,21 +354,25 @@ def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
     k_name = func_attrs["inputs"][1]._attrs["name"]
     v_name = func_attrs["inputs"][2]._attrs["name"]
     bias_name = func_attrs["inputs"][3]._attrs["name"]
-
     q_shape = func_attrs["inputs"][0]._attrs["shape"]
 
-    batch_size = q_shape[0]._attrs["name"]
-    seq_len_dim = 1
-    head_dim = -2
-    m0 = q_shape[seq_len_dim]._attrs["name"]
-
-    if len(q_shape) == 3:
-        # single head case
-        k0 = q_shape[2]._attrs["name"]
-        num_heads = "1"
-    elif len(q_shape) == 4:
-        k0 = q_shape[3]._attrs["name"]
-        num_heads = q_shape[head_dim]._attrs["name"]
+    k0 = q_shape[2]._attrs["name"]
+
+    jagged_intvar = q_shape[0]
+    batch_size_var = jagged_intvar.batch_dim()._attrs["name"]
+    if len(jagged_intvar.jagged_dims()) != 1:
+        raise RuntimeError(
+            "Only support 1 jagged dim in grouped_classic_b2b_bmm for now! "
+            f"Current jagged intvar: {jagged_intvar}"
+        )
+    max_seq_len_dim = jagged_intvar.jagged_dims()[0].max_value()
+    max_seq_len_var = (
+        str(max_seq_len_dim.value())
+        if isinstance(max_seq_len_dim, IntImm)
+        else max_seq_len_dim._attrs["name"]
+    )
+    num_heads_var = q_shape[1]._attrs["name"]
+    offsets_var_name = f"{jagged_intvar.offsets_var_name()}.data[0]"
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         output=output_name,
@@ -371,9 +380,11 @@ def classic_b2b_bmm_gen_function_call(func_attrs, indent="  "):
         key=k_name,
         value=v_name,
         bias=bias_name,
-        batch_size=batch_size,
-        num_heads=num_heads,
-        m0=m0,
+        batch_size=batch_size_var,
+        num_heads=num_heads_var,
+        max_seq_len=max_seq_len_var,
         k0=k0,
         indent=indent,
+        offsets=offsets_var_name,
+        offset_type=jagged_intvar.offsets_type(),
     )
diff --git a/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py b/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
index 39090c342..19042285e 100644
--- a/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
+++ b/python/aitemplate/compiler/ops/b2b_bmm/grouped_classic_b2b_bmm.py
@@ -47,8 +47,15 @@
 """
 
 from aitemplate.backend import registry, target
-from aitemplate.compiler.base import IntImm, Tensor
+from aitemplate.compiler.base import IntVar, Tensor
 from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import b2b_bmm_base, CausalType
+from aitemplate.utils import shape_utils
+
+
+def _is_power_of_two(n):
+    if n <= 0:
+        return False
+    return (n & (n - 1)) == 0
 
 
 class grouped_classic_b2b_bmm(b2b_bmm_base):
@@ -90,94 +97,85 @@ def __init__(
     def _infer_shapes(self):
         """infer the output shape for grouped_classic_b2b_bmm."""
         q, k, v, bias = self._attrs["inputs"]
+        if not (q.is_jagged() and k.is_jagged() and v.is_jagged()):
+            raise RuntimeError(f"{q=}, {k=}, {v=} must be jagged!")
         q_shape = q._attrs["shape"]
         k_shape = k._attrs["shape"]
         v_shape = v._attrs["shape"]
-        head_dim = 2
-        seq_dim = 1
+        bias_shape = bias._attrs["shape"]
         if len(q_shape) != len(k_shape) or len(q_shape) != len(v_shape):
             raise RuntimeError(
                 f"QKV ranks must be the same! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        if len(q_shape) != 3 and len(k_shape) != 4:
+        if len(q_shape) != 3:
             raise RuntimeError(
-                f"QKV must have rank 3 or 4! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+                f"QKV must have rank == 3! Current rank: {len(q_shape)}, QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-
         if q_shape[0] != k_shape[0] or q_shape[0] != v_shape[0]:
             raise RuntimeError(
-                f"QKV must have same batch size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+                f"QKV must have same jagged_dim (batch_size and seq_length)! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+        if q_shape[1] != k_shape[1] or q_shape[1] != v_shape[1]:
+            raise RuntimeError(
+                f"QKV must have same head size! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+            )
+
+        if q_shape[2] != k_shape[2]:
+            raise RuntimeError(
+                f"Q and K shapes are not compatible ( inner dimension for Matmul must be identical ) - Q shape: {q_shape=}, K shape: {k_shape=}."
             )
 
         batch_size = q_shape[0]
-        M0 = q_shape[seq_dim]
         K0 = q_shape[-1]
         if K0 != k_shape[-1]:
             raise RuntimeError(
-                f"Q K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+                f"Q and K shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
             )
-        N0 = k_shape[seq_dim]
-        if N0 != v_shape[seq_dim]:
+
+        num_heads = q_shape[1]
+        output_shape = [q_shape[0], num_heads, v_shape[2]]
+
+        batch_size = q_shape[0].batch_dim()
+        max_seq_len = q_shape[0].jagged_dims()[0].max_value()
+        if isinstance(max_seq_len, IntVar):
+            if max_seq_len.lower_bound() != max_seq_len.upper_bound():
+                raise RuntimeError(
+                    "Maximum sequence length needs to be a fixed (IntImm) dimension. "
+                )
+            max_seq_len = max_seq_len.upper_bound()
+
+        # This is a current limitation of the classic op due to grid layout and test results
+        if (
+            (not _is_power_of_two(max_seq_len))
+            or (max_seq_len > 512)
+            or (max_seq_len < 64)
+        ):
             raise RuntimeError(
-                f"K V shapes are not compatible! QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}."
+                f"Maximum sequence length needs to be a fixed (IntImm) dimension with a power of two between 64 and 512 for the grouped classic b2b op to work. Actual value: {max_seq_len=}. {type(max_seq_len)=}"
             )
-        N1 = v_shape[-1]
-        if N0.upper_bound() > 512 or N1.upper_bound() > 512:
+        if len(bias_shape) != 4:
+            raise RuntimeError(f"Expected bias rank 4. Current bias rank: {len(bias)}.")
+
+        bias_expected_shape = [
+            batch_size,
+            num_heads,
+            max_seq_len,
+            max_seq_len,
+        ]
+        broadcastable, _ = shape_utils.get_broadcast_max_shape(
+            bias_shape, bias_expected_shape
+        )
+        if not broadcastable:
             raise RuntimeError(
-                f"classic_b2b_bmm only supports <=512 N0 / N1. Current length: {N0=}, {N1=}"
+                f"bias shape is not compatible with Q K! "
+                f"QKV shapes: {q_shape=}, {k_shape=}, {v_shape=}, "
+                f"bias shapes: {bias_shape=}, {bias_expected_shape=}."
             )
-        if not isinstance(N0, IntImm) or not isinstance(N1, IntImm):
+        if bias_shape[-1] != bias_expected_shape[-1]:
             raise RuntimeError(
-                f"classic_b2b_bmm only supports static N0 / N1. Current {N0=}, {N1=}."
+                f"Bias last dim is not broadcastable! Expected shape: {bias_expected_shape[-1]}, current bias shape: {bias_shape}"
             )
-        if self._attrs["causal_type"] != CausalType.NO_CAUSAL:
-            if M0 != N0:
-                raise RuntimeError(
-                    f"When causal_type is enabled, M0 must be equal to N0. Current {M0=}, {N0=}."
-                )
-        bias_shape = bias._attrs["shape"]
-
-        is_multihead = len(q_shape) == 4
-        if is_multihead:
-            num_heads = q_shape[head_dim]
-
-            output_shape = [batch_size, M0, num_heads, N1]
-            if len(bias_shape) != 4:
-                raise RuntimeError(
-                    f"Was expecting 4-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
-                )
-            for bias_dim, expected_dim in zip(
-                bias_shape, [batch_size, num_heads, M0, N0]
-            ):
-                if bias_dim != IntImm(1) and bias_dim != expected_dim:
-                    raise RuntimeError(
-                        f"bias shape is not compatible with Q K! "
-                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
-                        f"bias shapes: {bias_shape=}."
-                    )
-            # key sequence length is identical to last shape dim of bias tensor
-            # so if it is also constant 1, it is not a real broadcast and permissible
-            if bias_shape[-1] == IntImm(1) and k_shape[seq_dim] != IntImm(1):
-                raise RuntimeError(
-                    "grouped_classic_b2b_bmm op does not support broadcasting of last dimension of bias tensor (e.g. over sequence length of key and value ). Use the expand op to emulate this broadcast behavior if you need it."
-                )
-        else:
-            num_heads = IntImm(1)
-            self._attrs["num_heads"] = num_heads
-            output_shape = [batch_size, M0, N1]
-            if len(bias_shape) != 3:
-                raise RuntimeError(
-                    f"Was expecting 3-dimensional bias based on q dimensionality. {len(bias_shape)=} {len(q_shape)=}"
-                )
-            for bias_dim, expected_dim in zip(bias_shape, [batch_size, M0, N0]):
-                if bias_dim != IntImm(1) and bias_dim != expected_dim:
-                    raise RuntimeError(
-                        f"bias shape is not compatible with Q K! "
-                        f"QKV shapes: {q_shape=}, {num_heads=}, {k_shape=}, {v_shape=}, "
-                        f"bias shapes: {bias_shape=}."
-                    )
-
-        return output_shape
+        return output_shape, max_seq_len
 
     def __call__(
         self,
@@ -204,7 +202,7 @@ def __call__(
 
         self._attrs["inputs"] = [q, k, v, bias]
         self._set_depth()
-        output_shape = self._infer_shapes()
+        output_shape, max_seq_len = self._infer_shapes()
         self._check_alignment()
         output = Tensor(
             output_shape,
@@ -212,6 +210,7 @@ def __call__(
             dtype=self._attrs["inputs"][0]._attrs["dtype"],
         )
         self._attrs["outputs"] = [output]
+        self._attrs["max_seq_len"] = max_seq_len
 
         return output
 
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
index 0b9215f09..87b903a44 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/device/b2b_batched_gemm.h
@@ -118,6 +118,8 @@ template <
     bool CausalMaskAfterGemm0 = false,
     /// Stage accumulator in shared memory
     bool SmemAccumulator = false,
+    /// Element type for offsets / array indices
+    typename offset_t_ = int32_t,
     /// Access granularity of A matrix in units of elements
     int AlignmentA =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
@@ -129,10 +131,11 @@ template <
     /// Operation performed by GEMM
     typename Operator_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
+        ElementAccumulator_>::Operator
+>
 class GroupedB2bGemmBatched {
  public:
-
+  using offset_t = offset_t_;
   using ElementA = ElementA_;
   using LayoutA = LayoutA_;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
@@ -191,8 +194,9 @@ class GroupedB2bGemmBatched {
     kStages,
     Operator,
     CausalMaskAfterGemm0,
-    SmemAccumulator
-  >::B2bGemmBatchedKernel;
+    SmemAccumulator,
+    offset_t
+  >::GroupedB2bGemmBatchedKernel;
 
   /// Argument structure
   struct Arguments {
@@ -225,6 +229,9 @@ class GroupedB2bGemmBatched {
     int num_heads;
     typename EpilogueOutputOp0::Params epilogue0;
     typename EpilogueOutputOp1::Params epilogue1;
+    // array of jagged dim offsets
+    // of size batch_count + 1
+    const offset_t *offsets;
 
     //
     // Methods
@@ -261,6 +268,7 @@ class GroupedB2bGemmBatched {
       int64_t batch_stride_D1_,
       int batch_count_,
       int num_heads_,
+      const offset_t *offsets_,
       typename EpilogueOutputOp0::Params epilogue0_ =
         typename EpilogueOutputOp0::Params(),
       typename EpilogueOutputOp1::Params epilogue1_ =
@@ -288,9 +296,9 @@ class GroupedB2bGemmBatched {
       batch_stride_D1(batch_stride_D1_),
       batch_count(batch_count_),
       num_heads(num_heads_),
+      offsets(offsets_),
       epilogue0(epilogue0_),
       epilogue1(epilogue1_) {
-
     }
   };
 
@@ -340,7 +348,6 @@ class GroupedB2bGemmBatched {
       args.problem_size_0,
       {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
       args.batch_count * args.num_heads);
-
     // Initialize the Params structure
     params_ = typename GroupedB2bGemmBatchedKernel::Params{
       args.problem_size_0,
@@ -366,6 +373,7 @@ class GroupedB2bGemmBatched {
       args.batch_stride_D1,
       args.batch_count,
       args.num_heads,
+      args.offsets,
       args.epilogue0,
       args.epilogue1
     };
@@ -384,6 +392,7 @@ class GroupedB2bGemmBatched {
     params_.ref_D1.reset(args.ref_D1.data());
     params_.output_op_0 = args.epilogue0;
     params_.output_op_1 = args.epilogue1;
+    params_.offsets = args.offsets;
 
     return Status::kSuccess;
   }
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
index 13fb29e0c..83bb660fe 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
@@ -38,12 +38,12 @@
 
 #pragma once
 
+#include <cstddef>
 #include "cutlass/cutlass.h"
 
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/matrix_coord.h"
 #include "cutlass/semaphore.h"
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -56,7 +56,8 @@ template <
   typename B2bMma_,               ///! Threadblock-scoped matrix multiply-accumulate
   typename Epilogue_,             ///! Epilogue
   typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  typename GmemToAccumLoader_
+  typename GmemToAccumLoader_,
+  typename offset_t_
 >
 struct GroupedB2bGemmBatched {
 
@@ -66,6 +67,7 @@ struct GroupedB2bGemmBatched {
   using OutputOp0 = typename B2bMma::OutputOp;
   using OutputOp1 = typename Epilogue::OutputOp;
   using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using offset_t = offset_t_;
 
   /// Warp count (concept: GemmShape)
   using WarpCount0 = typename B2bMma::WarpCount0;
@@ -108,6 +110,10 @@ struct GroupedB2bGemmBatched {
     int gemm_k_iterations_0;
     int gemm_k_iterations_1;
 
+    // array of jagged dim offsets
+    // of size batch_count + 1
+    const offset_t *offsets;
+
     //
     // Methods
     //
@@ -140,6 +146,7 @@ struct GroupedB2bGemmBatched {
       int64_t batch_stride_D1,
       int batch_count,
       int num_heads,
+      const offset_t *offsets_,
       typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
       typename OutputOp1::Params output_op_1 = typename OutputOp1::Params()
     ):
@@ -175,6 +182,7 @@ struct GroupedB2bGemmBatched {
       output_op_1(output_op_1),
       batch_count(batch_count),
       num_heads(num_heads),
+      offsets{offsets_},
       gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK),
       gemm_k_iterations_1((problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK) {}
   };
@@ -264,7 +272,6 @@ struct GroupedB2bGemmBatched {
 
     // Compute threadblock location
     ThreadblockSwizzle threadblock_swizzle;
-
     cutlass::gemm::GemmCoord threadblock_tile_offset =
         threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
 
@@ -275,9 +282,25 @@ struct GroupedB2bGemmBatched {
       return;
     }
 
+
     // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
     for (int batch_head_idx = threadblock_swizzle.get_batch_idx(); batch_head_idx < params.batch_count * params.num_heads; batch_head_idx += gridDim.z) {
 
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Convert blockIdx.z into (batch_idx, head_idx).
+      int batch_idx = batch_head_idx / params.num_heads;
+      int head_idx = batch_head_idx % params.num_heads;
+
+      offset_t jagged_offset_start = params.offsets[batch_idx];
+      offset_t jagged_seq_len = params.offsets[batch_idx+1] - jagged_offset_start;
+
+      // early exit
+      if ((threadblock_tile_offset.m() * B2bMma::Shape0::kM >= jagged_seq_len) or (threadblock_tile_offset.n() * B2bMma::Shape0::kN >= jagged_seq_len)) {
+          return;
+      }
       // Compute initial location in logical coordinates
       cutlass::MatrixCoord tb_offset_A0{
         threadblock_tile_offset.m() * B2bMma::Shape0::kM,
@@ -294,31 +317,24 @@ struct GroupedB2bGemmBatched {
         threadblock_tile_offset.n() * B2bMma::Shape1::kN
       };
 
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Convert blockIdx.z into (batch_idx, head_idx).
-      int batch_idx = batch_head_idx / params.num_heads;
-      int head_idx = batch_head_idx % params.num_heads;
-
       // Construct iterators to A and B operands
       typename B2bMma::IteratorA0 iterator_A0(
         params.params_A0,
         params.ref_A0.data(),
-        params.problem_size_0.mk(),
+        { jagged_seq_len, params.problem_size_0.k() }, // A0 matrix size
         thread_idx,
         tb_offset_A0);
 
-      iterator_A0.add_pointer_offset(params.batch_stride_A0 * batch_idx + params.head_stride_A0 * head_idx);
+      iterator_A0.add_pointer_offset(params.ref_A0.stride(0) * jagged_offset_start + params.head_stride_A0 * head_idx);
 
       typename B2bMma::IteratorB0 iterator_B0(
         params.params_B0,
         params.ref_B0.data(),
-        params.problem_size_0.kn(),
+        { params.problem_size_0.k(), jagged_seq_len }, // B0 matrix size
         thread_idx,
         tb_offset_B0);
 
-      iterator_B0.add_pointer_offset(params.batch_stride_B0 * batch_idx + params.head_stride_B0 * head_idx);
+      iterator_B0.add_pointer_offset(params.ref_B0.stride(0) * jagged_offset_start + params.head_stride_B0 * head_idx);
 
       typename B2bMma::IteratorB1 iterator_B1(
         params.params_B1,
@@ -326,8 +342,9 @@ struct GroupedB2bGemmBatched {
         params.problem_size_1.kn(),
         thread_idx,
         tb_offset_B1);
-
-      iterator_B1.add_pointer_offset(params.batch_stride_B1 * batch_idx + params.head_stride_B1 * head_idx);
+      auto const B1_ptr_offset = params.ref_B1.stride(0) * jagged_offset_start +  params.head_stride_B1 * head_idx;
+      iterator_B1.add_pointer_offset(B1_ptr_offset);
+      typename B2bMma::IteratorB1::Element *B1_tile_base_ptr  =  params.ref_B1.data() + B1_ptr_offset;
 
 
       // Broadcast the warp_id computed by lane 0 to ensure dependent code
@@ -345,7 +362,7 @@ struct GroupedB2bGemmBatched {
       typename GmemToAccumLoader::OutputTileIterator iterator_C0(
         params.params_C0,
         params.ref_C0.data(),
-        params.problem_size_0.mn(),
+        make_Coord(jagged_seq_len, jagged_seq_len),
         thread_idx,
         tb_offset_C0
       );
@@ -359,7 +376,11 @@ struct GroupedB2bGemmBatched {
       OutputOp0 output_op_0(params.output_op_0);
 
       // Construct thread-scoped matrix multiply
-      B2bMma b2bMma(shared_storage.main_loop, shared_storage.gmem_to_accum_loader, thread_idx, warp_idx, lane_idx, params.problem_size_0.n());
+      B2bMma b2bMma(shared_storage.main_loop, shared_storage.gmem_to_accum_loader, thread_idx, warp_idx, lane_idx,
+            jagged_seq_len,
+            B1_tile_base_ptr,
+            static_cast<int>(params.ref_B1.stride(0))
+      );
 
       typename B2bMma::FragmentC0 src_accum;
       typename B2bMma::FragmentC1 accumulators;
@@ -368,8 +389,15 @@ struct GroupedB2bGemmBatched {
       accumulators.clear();
 
       // Compute threadblock-scoped matrix multiply-add
-      b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_C0,
-        iterator_B1, src_accum, output_op_0);
+      b2bMma(
+        params.gemm_k_iterations_0,
+        accumulators,
+        iterator_A0,
+        iterator_B0,
+        iterator_C0,
+        iterator_B1,
+        src_accum,
+        output_op_0);
 
       //
       // Epilogue
@@ -405,12 +433,12 @@ struct GroupedB2bGemmBatched {
       typename Epilogue::OutputTileIterator iterator_D1(
         params.params_D1,
         params.ref_D1.data(),
-        params.problem_size_1.mn(),
+        make_Coord(jagged_seq_len, params.problem_size_1.n()),
         thread_idx,
         threadblock_offset
       );
 
-      iterator_D1.add_pointer_offset(params.batch_stride_D1 * batch_idx + params.head_stride_D1 * head_idx);
+      iterator_D1.add_pointer_offset(params.ref_D1.stride(0) * jagged_offset_start + params.head_stride_D1 * head_idx);
 
       Epilogue epilogue(
         shared_storage.epilogue,
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
index f841d0072..8ff051cb1 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/kernel/default_b2b_batched_gemm.h
@@ -127,10 +127,11 @@ template <
   /// Apply upper triangular causal mask after first gemm
   bool CausalMaskAfterGemm0 = false,
   /// Stage accumulator in shared memory
-  bool SmemAccumulator = false
+  bool SmemAccumulator = false,
+  /// Element type for offsets / array indices
+  typename offset_t_ = int
 >
 struct DefaultGroupedB2bGemmBatched;
-
 ////////////////////////////////////////////////////////////////////////////////
 
 /// Partial specialization for Ampere Architecture
@@ -174,13 +175,15 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Apply upper triangular causal mask after first gemm
-    bool CausalMaskAfterGemm0>
+    bool CausalMaskAfterGemm0,
+    /// Element type for offsets / array indices
+    typename offset_t_>
 struct DefaultGroupedB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, LayoutB1, ElementC,
                    layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
                    arch::Sm80, ThreadblockShape0, ThreadblockShape1,
                    WarpShape0, WarpShape1, InstructionShape,
                    EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages,
-                   Operator, CausalMaskAfterGemm0> {
+                   Operator, CausalMaskAfterGemm0, false, offset_t_> {
 
   // TODO: Make pipelined (i.e. stages == 2) work.
   static_assert((Stages >= 3), "Currently, only multistage is supported (not pipelined).");
@@ -210,7 +213,7 @@ struct DefaultGroupedB2bGemmBatched<ElementA, LayoutA, kAlignmentA, ElementB, La
           EpilogueOutputOp1::kCount>::Epilogue;
 
   /// Define the kernel-level GEMM operator.
-  using B2bGemmBatchedKernel = kernel::GroupedB2bGemmBatched<B2bMma, Epilogue, ThreadblockSwizzle, typename B2bMma::GmemToAccumLoader>;
+  using GroupedB2bGemmBatchedKernel = kernel::GroupedB2bGemmBatched<B2bMma, Epilogue, ThreadblockSwizzle, typename B2bMma::GmemToAccumLoader, offset_t_>;
 };
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
index a1b39d0e8..017d9e734 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
@@ -41,6 +41,7 @@
 
 #include "cutlass/aligned_buffer.h"
 #include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm80.h"
 #include "cutlass/array.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/gemm.h"
@@ -52,7 +53,6 @@
 #include "grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h"
 #include "grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
 #include "grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h"
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -279,6 +279,14 @@ class B2bMmaMultistage :
 
   GmemToAccumLoader gmem_to_accum_loader;
 
+  int const jagged_sequence_length;
+
+  /// Base address of B1 matrix in memory
+  typename IteratorB0::Element *B1_base_matrix_ptr;
+
+  /// Stride from one B1 sequence element to the next ( B1 is row-major )
+  int const seq_stride_B1;
+
 public:
 
   /// Construct from tensor references
@@ -294,13 +302,19 @@ class B2bMmaMultistage :
       ///< ID of each thread within a warp
       int lane_idx,
       ///< GEMM0 N is used for accumulator extent
-      int problem_size_0_n
+      int jagged_sequence_length_,
+      // extra params
+      typename IteratorB1::Element *B1_base_matrix_ptr_,
+      int seq_stride_B1_
     ):
       Base(shared_storage, thread_idx, warp_idx, lane_idx),
       smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
       smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
       smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx),
-      gmem_to_accum_loader(bias_add_shared_storage, thread_idx, warp_idx, lane_idx)
+      gmem_to_accum_loader(bias_add_shared_storage, thread_idx, warp_idx, lane_idx),
+      jagged_sequence_length(jagged_sequence_length_),
+      B1_base_matrix_ptr(B1_base_matrix_ptr_),
+      seq_stride_B1(seq_stride_B1_)
   {
     // Compute warp location within threadblock tile by mapping the warp_id to
     // three coordinates:
@@ -325,11 +339,10 @@ class B2bMmaMultistage :
 
   CUTLASS_DEVICE
   void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
-                              int group_start_A0 = 0, int group_start_B0 = 0) {
+                             int const group_start_A0 = 0,  int const group_start_B0 = 0) {
     iterator_A0.set_iteration_index(group_start_A0 *
                                    IteratorA0::kAccessesPerVector);
     this->smem_iterator_A0_.set_iteration_index(group_start_A0);
-
     // LDGSTS for operand A
     CUTLASS_PRAGMA_UNROLL
     for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
@@ -338,20 +351,20 @@ class B2bMmaMultistage :
             reinterpret_cast<typename IteratorA0::AccessType *>(
                 this->smem_iterator_A0_.get());
 
-        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+        constexpr int kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
                               IteratorA0::ThreadMap::kElementsPerAccess /
                               IteratorA0::kAccessesPerVector / 8;
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A0.get();
-
-          cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
-              dst_ptr + v, gmem_ptr, iterator_A0.valid());
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
 
           ++iterator_A0;
         }
 
+
+
         ++this->smem_iterator_A0_;
       }
     }
@@ -359,7 +372,6 @@ class B2bMmaMultistage :
     iterator_B0.set_iteration_index(group_start_B0 *
                                    IteratorB0::kAccessesPerVector);
     this->smem_iterator_B0_.set_iteration_index(group_start_B0);
-
     // LDGSTS for operand B
     CUTLASS_PRAGMA_UNROLL
     for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
@@ -368,17 +380,14 @@ class B2bMmaMultistage :
             reinterpret_cast<typename IteratorB0::AccessType *>(
                 this->smem_iterator_B0_.get());
 
-        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+        constexpr int kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
                               IteratorB0::ThreadMap::kElementsPerAccess /
                               IteratorB0::kAccessesPerVector / 8;
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B0.get();
-
-          cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
-              dst_ptr + v, gmem_ptr, iterator_B0.valid());
-
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
           ++iterator_B0;
         }
         ++this->smem_iterator_B0_;
@@ -388,11 +397,10 @@ class B2bMmaMultistage :
 
   CUTLASS_DEVICE
   void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
-                              int group_start_B1 = 0) {
+                              int const group_start_B1 = 0) {
     iterator_B1.set_iteration_index(group_start_B1 *
                                    IteratorB1::kAccessesPerVector);
     this->smem_iterator_B1_.set_iteration_index(group_start_B1);
-
     // LDGSTS for operand B
     CUTLASS_PRAGMA_UNROLL
     for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
@@ -401,17 +409,18 @@ class B2bMmaMultistage :
             reinterpret_cast<typename IteratorB1::AccessType *>(
                 this->smem_iterator_B1_.get());
 
-        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+        constexpr int kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
                               IteratorB1::ThreadMap::kElementsPerAccess /
                               IteratorB1::kAccessesPerVector / 8;
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B1.get();
-
-          cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
-              dst_ptr + v, gmem_ptr, iterator_B1.valid());
-
+          auto const gmem_ptr = iterator_B1.get();
+          int64_t const offset = reinterpret_cast<decltype(B1_base_matrix_ptr)>(gmem_ptr)-B1_base_matrix_ptr;
+          int64_t const outer_offset = offset / seq_stride_B1;
+          bool const iterB1valid = (outer_offset<jagged_sequence_length);
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterB1valid);
           ++iterator_B1;
         }
         ++this->smem_iterator_B1_;
@@ -450,7 +459,6 @@ class B2bMmaMultistage :
 
       iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
       iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
-
       iterator_A0.set_iteration_index(0);
       this->smem_iterator_A0_.set_iteration_index(0);
 
@@ -460,19 +468,16 @@ class B2bMmaMultistage :
         typename IteratorA0::AccessType *dst_ptr =
             reinterpret_cast<typename IteratorA0::AccessType *>(
                 this->smem_iterator_A0_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
+        constexpr int kSrcBytes =
               sizeof_bits<typename IteratorA0::Element>::value *
               IteratorA0::ThreadMap::kElementsPerAccess /
               IteratorA0::kAccessesPerVector / 8;
 
-          int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
-
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          // zfill used here also in original code
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
               dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
-
           ++iterator_A0;
         }
 
@@ -491,11 +496,10 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
+          constexpr int kSrcBytes =
               sizeof_bits<typename IteratorB0::Element>::value *
               IteratorB0::ThreadMap::kElementsPerAccess /
               IteratorB0::kAccessesPerVector / 8;
-
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
               dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
 
@@ -540,10 +544,8 @@ class B2bMmaMultistage :
 
     ++this->warp_tile_iterator_A0_;
     ++this->warp_tile_iterator_B0_;
-
     iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
     iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
-
     int smem_write_stage_idx = Base::kStages - 1;
     int smem_read_stage_idx = 0;
 
@@ -681,14 +683,11 @@ class B2bMmaMultistage :
     // Prologue
     //
     int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
-
     // Issue several complete stages
     CUTLASS_PRAGMA_UNROLL
     for (int stage = 0; stage < Base::kStages - 1;
          ++stage, --gemm_k_iterations_1) {
-
       iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
-
       iterator_B1.set_iteration_index(0);
       this->smem_iterator_B1_.set_iteration_index(0);
 
@@ -701,13 +700,17 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
+          constexpr int kSrcBytes =
               sizeof_bits<typename IteratorB1::Element>::value *
               IteratorB1::ThreadMap::kElementsPerAccess /
               IteratorB1::kAccessesPerVector / 8;
-
+          auto const gmem_ptr = iterator_B1.get();
+          int64_t const offset = reinterpret_cast<decltype(B1_base_matrix_ptr)>(gmem_ptr)-B1_base_matrix_ptr;
+          int64_t const outer_offset = offset / seq_stride_B1;
+          bool const iterB1valid = (outer_offset<jagged_sequence_length);
+          // zfill also used in original code
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
-              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+              dst_ptr + v, gmem_ptr, iterB1valid);
 
           ++iterator_B1;
         }
@@ -747,9 +750,7 @@ class B2bMmaMultistage :
     this->warp_tile_iterator_B1_.set_kgroup_index(0);
     this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
     ++this->warp_tile_iterator_B1_;
-
     iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
-
     smem_write_stage_idx = Base::kStages - 1;
     smem_read_stage_idx = 0;
 
@@ -759,7 +760,6 @@ class B2bMmaMultistage :
     //
     // Mainloop
     //
-
     gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1);
     CUTLASS_PRAGMA_UNROLL
     for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
@@ -849,7 +849,6 @@ class B2bMmaMultistage :
           } else {
             ++smem_read_stage_idx;
           }
-
           iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
         }
 
diff --git a/tests/unittest/ops/test_b2b_bmm.py b/tests/unittest/ops/test_b2b_bmm.py
index ad48598fb..c645ab65b 100644
--- a/tests/unittest/ops/test_b2b_bmm.py
+++ b/tests/unittest/ops/test_b2b_bmm.py
@@ -786,180 +786,5 @@ def test_fmha_style_b2b_bmm_bf16(self):
         )
 
 
-@unittest.skipIf(
-    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
-    "Not supported by CUDA < SM80.",
-)
-class ClassicGroupedB2bBmmTestCase(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        torch.manual_seed(0)
-
-    def _test_grouped_classic_multihead_b2b_bmm(
-        self,
-        batch_sizes: Tuple[int, List[int]] = 1024,
-        m=256,
-        k0=128,
-        n0=256,
-        n1=256,
-        num_heads=2,
-        epilogue_math_name="Identity",
-        causal_type=CausalType.NO_CAUSAL,
-        dtype="float16",
-        test_name="grouped_classic_b2b_bmm",
-        copy_op=True,
-        atol=1e-2,
-        rtol=1e-2,
-        bias_broadcast=(False, False, False, False),
-    ):
-        # Initialize AIT grouped_classic_b2b_bmm operator.
-        assert len(bias_broadcast) == 4
-        assert (
-            bias_broadcast[3] is False
-        ), "Grouped classic b2b bmm cannot broadcast bias on last dimension."
-        if isinstance(batch_sizes, int):
-            batch_sizes = [batch_sizes]
-        alpha0 = 1.0 / (k0**0.5)
-        alpha1 = 1.0
-        batch_size_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_size")
-
-        Q = Tensor(
-            shape=[batch_size_dim, m, num_heads, k0],
-            dtype=dtype,
-            name="q",
-            is_input=True,
-        )
-        K = Tensor(
-            shape=[batch_size_dim, n0, num_heads, k0],
-            dtype=dtype,
-            name="k",
-            is_input=True,
-        )
-        V = Tensor(
-            shape=[batch_size_dim, n0, num_heads, n1],
-            dtype=dtype,
-            name="v",
-            is_input=True,
-        )
-        bias_shape_full = [batch_size_dim, num_heads, m, n0]
-        bias_shape = [
-            IntImm(1) if bias_broadcast[i] else bias_shape_full[i] for i in range(4)
-        ]
-        Bias = Tensor(
-            shape=bias_shape,
-            dtype=dtype,
-            name="bias",
-            is_input=True,
-        )
-        grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
-            causal_type=causal_type,
-            alpha0=alpha0,
-            alpha1=alpha1,
-            alpha1_divide_by_seq_len=True,
-            epilogue_math_name=epilogue_math_name,
-        )
-        if copy_op:
-            grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
-                **grouped_classic_b2b_bmm_op._get_op_attributes()
-            )
-        Y = grouped_classic_b2b_bmm_op(Q, K, V, Bias)
-        Y._attrs["is_output"] = True
-        Y._attrs["name"] = "output"
-
-        target = detect_target(use_fp16_acc=True)
-        module = compile_model(Y, target, "./tmp", test_name)
-
-        # Run tests.
-        torch_dtype = string_to_torch_dtype(dtype)
-        for batch_size in batch_sizes:
-            # Initialize inputs
-            # Initialized in BMHD dim order
-            q_pt = torch.rand(batch_size, m, num_heads, k0, dtype=torch_dtype).cuda()
-            k_pt = torch.rand(batch_size, n0, num_heads, k0, dtype=torch_dtype).cuda()
-            v_pt = torch.rand(batch_size, n0, num_heads, n1, dtype=torch_dtype).cuda()
-            bias_shape_full_pt = (batch_size, num_heads, m, n0)
-            bias_shape_pt = (
-                1 if bias_broadcast[i] else bias_shape_full_pt[i] for i in range(4)
-            )
-            bias_pt = torch.rand(*bias_shape_pt, dtype=torch_dtype).cuda()
-
-            # Permute to BHMD dim order
-            q_pt_hf = torch.permute(q_pt, [0, 2, 1, 3])
-            k_pt_hf = torch.permute(k_pt, [0, 2, 1, 3])
-            v_pt_hf = torch.permute(v_pt, [0, 2, 1, 3])
-
-            # Run PT reference.
-            attn = alpha0 * (q_pt_hf @ k_pt_hf.transpose(-2, -1)) + bias_pt
-            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(attn)
-            attn = alpha1 / m * attn
-            invalid_attn_mask = get_attn_mask_per_causal_type(
-                m, n0, causal_type, torch_dtype
-            )
-            attn = attn * invalid_attn_mask
-            second_mm = attn @ v_pt_hf
-            output = torch.permute(
-                second_mm, [0, 2, 1, 3]
-            )  # permute back to original dim order
-            y_pt = output.detach()
-
-            # Run AIT.
-            inputs = {"q": q_pt, "k": k_pt, "v": v_pt, "bias": bias_pt}
-            y = torch.empty(
-                [batch_size, m, num_heads, n1],
-                dtype=torch_dtype,
-                device="cuda",
-            )
-            module.run_with_tensors(inputs, [y])
-            torch.testing.assert_close(y, y_pt.to(torch_dtype), atol=atol, rtol=rtol)
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_grouped_classic_multihead1_b2b_bmm(self):
-        self._test_grouped_classic_multihead_b2b_bmm(
-            test_name="grouped_classic_multihead1_b2b_bmm_fp16_basic",
-            dtype="float16",
-            batch_sizes=1,
-            num_heads=1,
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_grouped_classic_multihead2_b2b_bmm(self):
-        self._test_grouped_classic_multihead_b2b_bmm(
-            test_name="grouped_classic_multihead2_b2b_bmm_fp16_basic",
-            dtype="float16",
-            batch_sizes=1,
-            num_heads=2,
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_grouped_classic_multihead1_b2b_bmm_bias_broadcast1(self):
-        self._test_grouped_classic_multihead_b2b_bmm(
-            test_name="grouped_classic_multihead1_b2b_bmm_broadcast1_fp16_basic",
-            dtype="float16",
-            batch_sizes=1,
-            num_heads=1,
-            bias_broadcast=[True, True, False, False],
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_grouped_classic_multihead2_b2b_bmm_bias_broadcast1(self):
-        self._test_grouped_classic_multihead_b2b_bmm(
-            test_name="grouped_classic_multihead2_b2b_bmm_broadcast1_fp16_basic",
-            dtype="float16",
-            batch_sizes=1,
-            num_heads=2,
-            bias_broadcast=[True, True, False, False],
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
-    def test_grouped_classic_multihead2_b2b_bmm_bias_broadcast2(self):
-        self._test_grouped_classic_multihead_b2b_bmm(
-            test_name="grouped_classic_multihead2_b2b_bmm_broadcast2_fp16_basic",
-            dtype="float16",
-            batch_sizes=1,
-            num_heads=2,
-            bias_broadcast=[True, True, True, False],
-        )
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_grouped_classic_b2b_bmm.py b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
new file mode 100644
index 000000000..acabaa961
--- /dev/null
+++ b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
@@ -0,0 +1,567 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for grouped b2b bmm Operators.
+"""
+import logging
+
+import unittest
+from typing import List, Tuple
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntVar, JaggedDim
+from aitemplate.compiler.ops.b2b_bmm.b2b_bmm_base import CausalType
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    epilogue_math_name_to_torch_fn,
+    get_attn_mask_per_causal_type,
+)
+from aitemplate.utils import shape_utils
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@unittest.skipIf(
+    detect_target().name() == "cuda" and int(detect_target()._arch) < 80,
+    "Not supported by CUDA < SM80.",
+)
+class GroupedClassicB2bBmmTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        pass
+
+    def _test_grouped_classic_b2b_bmm(
+        self,
+        batch_sizes: Tuple[int, List[int]] = 1024,
+        max_seq_lens: Tuple[int, List[int]] = 256,
+        head_dim=128,
+        head_dim_value=256,
+        num_heads: Tuple[int, List[int]] = 1,
+        has_bias=False,
+        bias_broadcast=None,
+        epilogue_math_name="Identity",
+        causal_type=CausalType.NO_CAUSAL,
+        dtype="float16",
+        offsets_dtype="int32",
+        test_name="grouped_classic_b2b_bmm",
+        alpha1_divide_by_seq_len=True,
+        copy_op=True,
+        atol=0.01,
+        rtol=0.01,
+        use_fp16_acc=False,
+        random_seed=0,
+    ):
+        if isinstance(random_seed, list):
+            random_seeds = random_seed
+        else:
+            random_seeds = [random_seed]
+        # Initialize AIT classic_b2b_bmm operator.
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes, batch_sizes]
+        if isinstance(max_seq_lens, int):
+            max_seq_lens = [max_seq_lens, max_seq_lens]
+        if isinstance(num_heads, int):
+            num_heads = [num_heads, num_heads]
+        alpha0 = 1.0 / (head_dim**0.5)
+        batch_size_dim = IntVar(
+            values=[min(batch_sizes), max(batch_sizes)], name="batch_size"
+        )
+        max_seq_len_dim = shape_utils.gen_int_var_min_max(
+            max_seq_lens, name="max_seq_len"
+        )
+        num_heads_dim = shape_utils.gen_int_var_min_max(num_heads, name="num_heads")
+        jagged_dims = [JaggedDim(min_value=0, max_value=max_seq_len_dim)]
+        total_length_dim = IntVar(
+            values=[0, batch_size_dim.upper_bound() * max_seq_len_dim.upper_bound()],
+            name="total_length",
+        )
+        offsets_dim = IntVar(
+            values=[batch_size_dim.lower_bound() + 1, batch_size_dim.upper_bound() + 1],
+            name="offset_length",
+        )
+        Q_dense = Tensor(
+            shape=[total_length_dim, num_heads_dim, head_dim],
+            dtype=dtype,
+            name="q",
+            is_input=True,
+        )
+        K_dense = Tensor(
+            shape=[total_length_dim, num_heads_dim, head_dim],
+            dtype=dtype,
+            name="k",
+            is_input=True,
+        )
+        V_dense = Tensor(
+            shape=[total_length_dim, num_heads_dim, head_dim_value],
+            dtype=dtype,
+            name="v",
+            is_input=True,
+        )
+        offsets = [
+            Tensor(
+                shape=[offsets_dim], name="offsets", dtype=offsets_dtype, is_input=True
+            )
+        ]
+        Q = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            Q_dense, offsets
+        )
+        K = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            K_dense, offsets
+        )
+        V = ops.make_jagged(batch_dim=batch_size_dim, jagged_dims=jagged_dims)(
+            V_dense, offsets
+        )
+        shape = [batch_size_dim, num_heads_dim, max_seq_len_dim, max_seq_len_dim]
+        if bias_broadcast:
+            for i, broadcast in enumerate(bias_broadcast):
+                if broadcast:
+                    shape[i] = 1
+        Bias = Tensor(
+            shape=shape,
+            dtype=dtype,
+            name="bias",
+            is_input=True,
+        )
+        grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
+            causal_type=causal_type,
+            alpha0=alpha0,
+            alpha1=1.0,
+            alpha1_divide_by_seq_len=alpha1_divide_by_seq_len,
+            epilogue_math_name=epilogue_math_name,
+        )
+        if copy_op:
+            grouped_classic_b2b_bmm_op = ops.grouped_classic_b2b_bmm(
+                **grouped_classic_b2b_bmm_op._get_op_attributes()
+            )
+        Y = grouped_classic_b2b_bmm_op(Q, K, V, Bias)
+        Y._attrs["is_output"] = True
+        Y._attrs["name"] = "output"
+
+        target = detect_target(use_fp16_acc=use_fp16_acc)
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = use_fp16_acc
+        module = compile_model(Y, target, "./tmp", test_name)
+        # input(f"Connect debugger. {os.getpid()=}")
+        # Run tests.
+        torch_dtype = string_to_torch_dtype(dtype)
+        offsets_torch_dtype = string_to_torch_dtype(offsets_dtype)
+        y_results = {}
+        for random_seed in random_seeds:
+            torch.manual_seed(random_seed)
+            for max_seq_len in sorted(set(max_seq_lens)):
+                for num_head in sorted(set(num_heads)):
+                    batch_sizes_sorted = sorted(set(batch_sizes), reverse=True)
+                    max_batch_size = batch_sizes_sorted[0]
+                    lengths_max = torch.randint(
+                        1, max_seq_len, (max_batch_size + 1,), dtype=offsets_torch_dtype
+                    )
+                    lengths_max[0] = 0
+                    offsets_max = torch.cumsum(lengths_max, dim=0).to(
+                        dtype=offsets_torch_dtype
+                    )
+                    # print(f"{batch_size=}, {offsets=}")
+                    total_length_max = offsets_max[-1]
+                    offsets_max_pt = offsets_max.cuda()
+                    q_pt_max = torch.rand(
+                        (total_length_max, num_head, head_dim), dtype=torch_dtype
+                    ).cuda()
+                    k_pt_max = torch.rand(
+                        (total_length_max, num_head, head_dim), dtype=torch_dtype
+                    ).cuda()
+                    v_pt_max = torch.rand(
+                        (total_length_max, num_head, head_dim_value), dtype=torch_dtype
+                    ).cuda()
+                    ## TEMP DEBUG
+                    for i in range(len(offsets_max) - 1):
+                        start = offsets_max[i]
+                        end = offsets_max[i + 1]
+                        q_pt_max[start:end, :, :] += (
+                            0.125 * i
+                        )  # Making sure not everything averages out to zero
+                        k_pt_max[start:end, :, :] += (
+                            -0.25 * i + 0.15
+                        )  # Making sure not everything averages out to zero
+                        v_pt_max[start:end, :, :] += (
+                            0.375 * i - 0.0125
+                        )  # Making sure not everything averages out to zero
+                    ## END TEMP DEBUG
+                    bias_shape_max = [
+                        max_batch_size,
+                        num_head,
+                        max_seq_len,
+                        max_seq_len,
+                    ]
+                    if bias_broadcast:
+                        for i, broadcast in enumerate(bias_broadcast):
+                            if broadcast:
+                                bias_shape_max[i] = 1
+                    bias_pt_max = torch.rand(bias_shape_max, dtype=torch_dtype).cuda()
+                    if not has_bias:
+                        bias_pt_max *= 0.0
+                    results_per_batch = {}
+                    for batch_size in batch_sizes_sorted:
+                        # Initialize inputs
+                        # input(f"Attach debugger if you want. {os.getpid()=}. Press Enter to continue.")
+                        total_length = offsets_max[batch_size]
+                        q_pt = q_pt_max[:total_length, :, :].contiguous()
+                        k_pt = k_pt_max[:total_length, :, :].contiguous()
+                        v_pt = v_pt_max[:total_length, :, :].contiguous()
+                        bias_pt = bias_pt_max[:batch_size, :, :, :].contiguous()
+                        offsets_pt = offsets_max_pt[: batch_size + 1].contiguous()
+                        # Run AIT.
+                        inputs = {
+                            "q": q_pt,
+                            "k": k_pt,
+                            "v": v_pt,
+                            "offsets": offsets_pt,
+                            "bias": bias_pt,
+                        }
+                        y = torch.empty(
+                            [total_length, num_head, head_dim_value],
+                            dtype=torch_dtype,
+                            device="cuda",
+                        )
+                        ypadded = torch.zeros(
+                            y.flatten().shape[0] + 128,
+                            dtype=torch_dtype,
+                            device=y.device,
+                        )
+                        y = ypadded[64:-64].reshape(
+                            [total_length, num_head, head_dim_value]
+                        )
+                        module.run_with_tensors(inputs, [y])
+
+                        y_results[(batch_size, max_seq_len, num_head)] = y
+                        assert torch.all(
+                            ypadded[:64] == 0
+                        )  # Make sure we're not writing beyond boundaries
+                        assert torch.all(
+                            ypadded[-64:] == 0
+                        )  # Make sure we're not writing beyond boundaries
+                        # Run PT reference and verify results.
+                        for row in range(batch_size):
+                            start = offsets_max[row]
+                            end = offsets_max[row + 1]
+                            length = end - start
+                            q_pt_row = q_pt[start:end, :, :]
+                            k_pt_row = k_pt[start:end, :, :]
+                            v_pt_row = v_pt[start:end, :, :]
+                            attn = alpha0 * (
+                                q_pt_row.transpose(0, 1)
+                                @ k_pt_row.transpose(0, 1).transpose(-2, -1)
+                            )
+                            if has_bias:
+                                bias_row = (
+                                    0
+                                    if (
+                                        bias_broadcast is not None and bias_broadcast[0]
+                                    )
+                                    else row
+                                )
+                                bias_pt_row = bias_pt[
+                                    bias_row : bias_row + 1, :, :length, :length
+                                ].squeeze(dim=0)
+                                attn = attn + bias_pt_row
+                            attn = epilogue_math_name_to_torch_fn(epilogue_math_name)(
+                                attn
+                            )
+                            if alpha1_divide_by_seq_len:
+                                attn /= max_seq_len
+                            invalid_attn_mask = get_attn_mask_per_causal_type(
+                                length, length, causal_type, torch_dtype
+                            )
+                            attn = attn * invalid_attn_mask
+                            output = (attn @ v_pt_row.transpose(0, 1)).transpose(0, 1)
+                            y_pt_row = output.detach()
+                            # print(
+                            #     f"{batch_size=}, {row=}, {y[start:end, :, :]=}, {y_pt_row.to(torch_dtype)=}"
+                            # )
+                            results_per_batch[batch_size] = {
+                                "y": y[start:end, :, :],
+                                "expected_y": y_pt_row.to(torch_dtype),
+                            }
+
+                            torch.testing.assert_close(
+                                y[start:end, :, :],
+                                y_pt_row.to(torch_dtype),
+                                atol=atol,
+                                rtol=rtol,
+                            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_1(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_1",
+            dtype="float16",
+            batch_sizes=1,
+            head_dim=64,
+            head_dim_value=64,
+            max_seq_lens=[64],
+            has_bias=False,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_2(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_2",
+            dtype="float16",
+            batch_sizes=1,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_a(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_3_batch_a",
+            dtype="float16",
+            batch_sizes=4,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_b(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_3_batch_b",
+            dtype="float16",
+            batch_sizes=[2, 4],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_c(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_3_batch_c",
+            dtype="float16",
+            batch_sizes=2,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_d(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_3_batch_d",
+            dtype="float16",
+            batch_sizes=[2, 33],
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_e(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_3_batch_e",
+            dtype="float16",
+            batch_sizes=[2, 4],
+            num_heads=[3, 5],
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias",
+            dtype="float16",
+            batch_sizes=[2, 4],
+            num_heads=[3, 5],
+            has_bias=True,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias",
+            dtype="float16",
+            batch_sizes=[2, 4],
+            num_heads=[3, 5],
+            has_bias=True,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_acc(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="grouped_classic_b2b_bmm_fp16_acc",
+            dtype="float16",
+            batch_sizes=[7],
+            use_fp16_acc=True,
+            # Need to use a larger threshold for fp16 accum
+            atol=0.25,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty1(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty1",
+            dtype="float16",
+            batch_sizes=[5],
+            num_heads=4,
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty2(self):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty2",
+            dtype="float16",
+            batch_sizes=[1, 5, 33],
+            num_heads=[
+                2,
+                4,
+                11,
+            ],
+            causal_type=CausalType.LOWER_LEFT_EMPTY,
+            random_seed=list(range(3)),
+        )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_silu(self):
+        for max_seq_len in [64, 256, 512]:
+            self._test_grouped_classic_b2b_bmm(
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}",
+                dtype="float16",
+                batch_sizes=[1, 5, 33],
+                max_seq_lens=max_seq_len,
+                num_heads=[
+                    2,
+                    4,
+                    11,
+                ],
+                epilogue_math_name="SiLu",
+                causal_type=CausalType.LOWER_LEFT_EMPTY,
+                random_seed=list(range(10)),
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias(
+        self,
+    ):
+        for max_seq_len in [512]:
+            self._test_grouped_classic_b2b_bmm(
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_silu_bias",
+                dtype="float16",
+                batch_sizes=[3, 33],
+                max_seq_lens=max_seq_len,
+                num_heads=[
+                    11,
+                ],
+                epilogue_math_name="SiLu",
+                causal_type=CausalType.LOWER_LEFT_EMPTY,
+                random_seed=list(range(3)),
+                has_bias=True,
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_broadcast_1(
+        self,
+    ):
+        for max_seq_len in [512]:
+            for random_seed in range(1):
+                self._test_grouped_classic_b2b_bmm(
+                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_1",
+                    dtype="float16",
+                    batch_sizes=[3, 33],
+                    max_seq_lens=max_seq_len,
+                    num_heads=[
+                        11,
+                    ],
+                    epilogue_math_name="SiLu",
+                    causal_type=CausalType.LOWER_LEFT_EMPTY,
+                    random_seed=random_seed,
+                    has_bias=True,
+                    bias_broadcast=[True, False, True, False],
+                )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_broadcast_2(
+        self,
+    ):
+        for max_seq_len in [512]:
+            for random_seed in range(1):
+                self._test_grouped_classic_b2b_bmm(
+                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_2",
+                    dtype="float16",
+                    batch_sizes=[3, 33],
+                    max_seq_lens=max_seq_len,
+                    num_heads=[
+                        11,
+                    ],
+                    epilogue_math_name="SiLu",
+                    causal_type=CausalType.LOWER_LEFT_EMPTY,
+                    random_seed=random_seed,
+                    has_bias=True,
+                    bias_broadcast=[True, False, False, False],
+                )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_broadcast_3(
+        self,
+    ):
+        for max_seq_len in [512]:
+            self._test_grouped_classic_b2b_bmm(
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_bias_broadcast_3",
+                dtype="float16",
+                batch_sizes=[3, 33],
+                max_seq_lens=max_seq_len,
+                num_heads=[
+                    11,
+                ],
+                epilogue_math_name="SiLu",
+                causal_type=CausalType.LOWER_LEFT_EMPTY,
+                random_seed=list(range(12, 24)),
+                has_bias=True,
+                bias_broadcast=[True, True, True, False],
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_4(
+        self,
+    ):
+        for max_seq_len in [64, 256, 512]:
+            self._test_grouped_classic_b2b_bmm(
+                test_name=f"test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_4_seqlen={max_seq_len}",
+                dtype="float16",
+                batch_sizes=[3, 33],
+                max_seq_lens=max_seq_len,
+                num_heads=[
+                    2,
+                    11,
+                ],
+                random_seed=list(range(5)),
+                has_bias=True,
+                bias_broadcast=[True, False, True, False],
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5(
+        self,
+    ):
+        self._test_grouped_classic_b2b_bmm(
+            test_name="test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5",
+            dtype="float16",
+            batch_sizes=[3, 33],
+            max_seq_lens=256,
+            num_heads=[
+                2,
+                11,
+            ],
+            random_seed=list(range(3400, 3411)),
+            has_bias=True,
+            bias_broadcast=[True, False, True, False],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 22a74c7ee3f2b9b2a89cacb675698d7e20106cdf Mon Sep 17 00:00:00 2001
From: Matthias Cremon <matthiascremon@meta.com>
Date: Fri, 2 Jun 2023 09:33:28 -0700
Subject: [PATCH 570/638] Add support of nn.functional.hardtanh (#739)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/739

nn.functional.hardtanh is used in mobilenet_v2 but corresponding support in FX and/or kernel is missing.
Mapping hardtanh to clamp to enable compiling/running the model.
Note that it gives a numerical mismatch, which will be investigated in a separate task.

Reviewed By: cgufb

Differential Revision: D46194334

fbshipit-source-id: 87419e123d6ee7bad3682e81b3578817d6a3359d
---
 fx2ait/fx2ait/acc_tracer/acc_ops.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index a6e0c9bc7..8d06d7d5c 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -623,6 +623,14 @@ def stack_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
 @register_acc_op_properties(AccOpProperty.pointwise)
 @register_acc_op_mapping(op_and_target=("call_function", torch.clamp))
 @register_acc_op_mapping(op_and_target=("call_function", torch.clip))
+@register_acc_op_mapping(
+    op_and_target=("call_function", nn.functional.hardtanh),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("min_val", "min"),
+        ("max_val", "max"),
+    ],
+)
 @register_acc_op_mapping(op_and_target=("call_method", "clamp"))
 @register_acc_op_mapping(op_and_target=("call_method", "clip"))
 @register_acc_op
@@ -884,15 +892,6 @@ def dropout_mapper(node: torch.fx.Node, mod: nn.Module):
     return node.kwargs["input"]
 
 
-@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
-@register_acc_op_mapping(
-    op_and_target=("call_function", nn.functional.hardtanh),
-)
-@register_acc_op
-def hardtanh(*, input, min_val=-1.0, max_val=1.0):
-    return nn.functional.hardtanh(input=input, min_val=min_val, max_val=max_val)
-
-
 @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
 @register_acc_op_mapping(op_and_target=("call_function", nn.functional.hardsigmoid))
 @register_acc_op

From c320d262086a130910ca19eecfde836a89437a35 Mon Sep 17 00:00:00 2001
From: Albert Chen <albertchen@meta.com>
Date: Tue, 6 Jun 2023 12:43:26 -0700
Subject: [PATCH 571/638] Eiminate elementwise no-ops (*/1, +-0) (#746)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/746

This diffs adds an optimizations to the sorted graph to skip element-wise int operation is that no-ops, i.e. multiplied or divided by 0 or add or subtracted by 0

Reviewed By: muchulee8

Differential Revision: D46358018

fbshipit-source-id: bca10f3205309f3808366e110d7def513771b10d
---
 .../compiler/transform/optimize_graph.py      |   4 +
 .../transform/remove_elementwise_no_ops.py    |  93 +++++++++++
 .../test_remove_elementwise_no_ops.py         | 152 ++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 python/aitemplate/compiler/transform/remove_elementwise_no_ops.py
 create mode 100644 tests/unittest/compiler/test_remove_elementwise_no_ops.py

diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 83afbdb0f..16ca6c0e4 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -38,6 +38,9 @@
     fuse_permute_bmm_and_gemm,
 )
 from aitemplate.compiler.transform.move_view_ops import move_view_op_before_concat
+from aitemplate.compiler.transform.remove_elementwise_no_ops import (
+    remove_elementwise_no_ops,
+)
 from aitemplate.compiler.transform.remove_id_ops import remove_id_ops
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
 from aitemplate.compiler.transform.split_large_slice_scatter_ops import (
@@ -91,6 +94,7 @@ def optimize_graph(
 
     funcs = [
         remove_id_ops,
+        remove_elementwise_no_ops,
         dedup_make_jagged_ops,
         fuse_permute_bmm_and_gemm,
         fuse_bmm_permute,
diff --git a/python/aitemplate/compiler/transform/remove_elementwise_no_ops.py b/python/aitemplate/compiler/transform/remove_elementwise_no_ops.py
new file mode 100644
index 000000000..334611d2a
--- /dev/null
+++ b/python/aitemplate/compiler/transform/remove_elementwise_no_ops.py
@@ -0,0 +1,93 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Eliminate elementwise no-ops (*/1, +-0)
+"""
+from typing import Callable, Dict, List
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.public import FuncEnum
+from aitemplate.compiler.transform import transform_utils
+
+
+def _is_const_num(tensor: Tensor, val: int) -> bool:
+    return tensor.is_a_const_num() and tensor._attrs["value"] == val
+
+
+def func_add_predicate(src_op: Tensor) -> bool:
+    if _is_const_num(src_op._attrs["args"][0], 0) or _is_const_num(
+        src_op._attrs["args"][1], 0
+    ):
+        return True
+    return False
+
+
+def func_sub_predicate(src_op: Tensor) -> bool:
+    if _is_const_num(src_op._attrs["args"][1], 0):
+        return True
+    return False
+
+
+def func_mul_predicate(src_op: Tensor) -> bool:
+    if _is_const_num(src_op._attrs["args"][0], 1) or _is_const_num(
+        src_op._attrs["args"][1], 1
+    ):
+        return True
+    return False
+
+
+def func_div_predicate(src_op: Tensor) -> bool:
+    if _is_const_num(src_op._attrs["args"][1], 1):
+        return True
+    return False
+
+
+FUNC_TO_PREDICATE_MAP: Dict[FuncEnum, Callable[[Tensor], bool]] = {
+    FuncEnum.ADD: func_add_predicate,
+    FuncEnum.SUB: func_sub_predicate,
+    FuncEnum.MUL: func_mul_predicate,
+    FuncEnum.DIV: func_div_predicate,
+}
+
+
+def remove_elementwise_no_ops(
+    sorted_graph: List[Tensor], workdir: str = None
+) -> List[Tensor]:
+    """elementwise no-ops (*/1, +-0)"""
+    for tensor in sorted_graph:
+
+        src_ops = tensor._attrs["src_ops"]
+        if len(src_ops) != 1:
+            continue
+        src_op = list(src_ops)[0]
+
+        if (
+            src_op._attrs["op"] != "elementwise"
+            or src_op._attrs["func"] not in FUNC_TO_PREDICATE_MAP
+            or len(src_op._attrs["args"]) != 2  # Skip legacy usecase
+        ):
+            continue
+
+        predicate = FUNC_TO_PREDICATE_MAP[src_op._attrs["func"]]
+        if not predicate(src_op):
+            continue
+
+        input_tensor = src_op._attrs["inputs"][0]
+        # skip a very special case where ops takes an input and produces an output
+        if tensor._attrs["is_output"] and input_tensor._attrs["is_input"]:
+            continue
+        transform_utils.remove_single_tensor_op_from_sorted_graph(src_op)
+
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/tests/unittest/compiler/test_remove_elementwise_no_ops.py b/tests/unittest/compiler/test_remove_elementwise_no_ops.py
new file mode 100644
index 000000000..ae7e3b414
--- /dev/null
+++ b/tests/unittest/compiler/test_remove_elementwise_no_ops.py
@@ -0,0 +1,152 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+from typing import Callable
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.transform.remove_elementwise_no_ops import (
+    remove_elementwise_no_ops,
+)
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.frontend import IntImm, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    gen_input_tensor,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils import graph_utils, shape_utils
+
+
+class RemoveElementwiseNoOpsTestCase(unittest.TestCase):
+    def _test_remove_elementwise_op_impl(
+        self, elementwise_op_getter: Callable[[Tensor], Tensor], should_remove: bool
+    ) -> None:
+        batch_sizes = [1, 1024]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype="float16")
+        X2 = gen_input_tensor([batch_dim, IntImm(M)], name="x2", dtype="float16")
+        add_0 = elementwise_op_getter(X1)
+        Y = ops.elementwise(FuncEnum.ADD)(add_0, X2)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        sorted_graph = toposort([Y])
+        modified_graph = remove_elementwise_no_ops(sorted_graph)
+        if should_remove:
+            self.assertEqual(len(modified_graph), len(sorted_graph) - 1)
+            self.assertTrue(add_0 in sorted_graph)
+            self.assertFalse(add_0 in modified_graph)
+        else:
+            self.assertEqual(sorted_graph, modified_graph)
+
+    def test_remove_elementwise_op(self) -> None:
+        test_cases = [
+            (lambda x: ops.elementwise(FuncEnum.ADD)(x, 0), True),
+            (lambda x: ops.elementwise(FuncEnum.ADD)(0, x), True),
+            (lambda x: ops.elementwise(FuncEnum.ADD)(x, 1), False),
+            (lambda x: ops.elementwise(FuncEnum.ADD)(1, x), False),
+            (lambda x: ops.elementwise(FuncEnum.SUB)(x, 0), True),
+            (lambda x: ops.elementwise(FuncEnum.SUB)(0, x), False),
+            (lambda x: ops.elementwise(FuncEnum.SUB)(x, 1), False),
+            (lambda x: ops.elementwise(FuncEnum.MUL)(x, 1), True),
+            (lambda x: ops.elementwise(FuncEnum.MUL)(1, x), True),
+            (lambda x: ops.elementwise(FuncEnum.MUL)(x, 2), False),
+            (lambda x: ops.elementwise(FuncEnum.MUL)(2, x), False),
+            (lambda x: ops.elementwise(FuncEnum.DIV)(x, 1), True),
+            (lambda x: ops.elementwise(FuncEnum.DIV)(x, 2), False),
+            (lambda x: ops.elementwise(FuncEnum.DIV)(1, x), False),
+        ]
+        for test_no, test in enumerate(test_cases):
+            with self.subTest(test_no=test_no):
+                self._test_remove_elementwise_op_impl(
+                    elementwise_op_getter=test[0], should_remove=test[1]
+                )
+
+    def test_not_remove_connecting_input_output(
+        self,
+    ):
+        batch_sizes = [1, 1024]
+        batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
+        M = 10
+        X1 = gen_input_tensor([batch_dim, IntImm(M)], name="x1", dtype="float16")
+        Y = ops.elementwise(FuncEnum.ADD)(X1, 0)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        sorted_graph = toposort([Y])
+        modified_graph = remove_elementwise_no_ops(sorted_graph)
+        self.assertEqual(sorted_graph, modified_graph)
+
+
+class RemoveElementwiseNoOpsIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(RemoveElementwiseNoOpsIntegrationTest, self).__init__(*args, **kwargs)
+        torch.manual_seed(0)
+        self.BATCH_SIZES = [1, 218]
+        self.M = 10
+
+    def test_remove_elementwise_op(self) -> None:
+        test_cases = [
+            (lambda x: ops.elementwise(FuncEnum.ADD)(x, 0), lambda x: x + 0),
+            (lambda x: ops.elementwise(FuncEnum.SUB)(x, 0), lambda x: x - 0),
+            (lambda x: ops.elementwise(FuncEnum.MUL)(x, 1), lambda x: x * 1),
+            (lambda x: ops.elementwise(FuncEnum.DIV)(x, 1), lambda x: x * 1),
+        ]
+        for test_no, test in enumerate(test_cases):
+            with self.subTest(test_no=test_no):
+                self._test_remove_elementwise_no_ops_impl(
+                    elementwise_op_getter=test[0], expected_op=test[1]
+                )
+
+    def _test_remove_elementwise_no_ops_impl(
+        self,
+        elementwise_op_getter: Callable[[Tensor], Tensor],
+        expected_op: Callable[[Tensor], Tensor],
+    ):
+        dtype = "float16"
+        batch_dim = shape_utils.gen_int_var_min_max(self.BATCH_SIZES, "batch_0")
+        reduce_dim = 0
+        X0 = gen_input_tensor([batch_dim, IntImm(self.M)], name="x0", dtype=dtype)
+        elementwise_op_0 = elementwise_op_getter(X0)
+        Y = ops.reduce_mean(reduce_dim)(elementwise_op_0)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+
+        module = compile_model(
+            Y,
+            detect_target(),
+            "./tmp",
+            "test_remove_elementwise_no_ops",
+        )
+
+        sorted_graph = module.debug_sorted_graph
+        sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
+        self.assertEqual(len(sorted_ops), 1)
+
+        for batch in self.BATCH_SIZES:
+            x0_pt = get_random_torch_tensor([batch, self.M], dtype)
+            add_0_pt = expected_op(x0_pt)
+            y_pt = torch.mean(add_0_pt, dim=reduce_dim)
+            y = get_torch_empty_tensor(y_pt.size(), dtype)
+            inputs = {"x0": x0_pt}
+            module.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y, atol=0.01, rtol=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()

From f05c8e91ed11504e81387b7effdc8cbe38770fba Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine <eltociear@gmail.com>
Date: Wed, 7 Jun 2023 10:14:20 -0700
Subject: [PATCH 572/638] Update README.md (#745)

Summary:
HuggingFace -> Hugging Face

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/745

Reviewed By: wushirong

Differential Revision: D46523757

Pulled By: hl475

fbshipit-source-id: 7e3ba05f24037955ae6de8a34f64f59f9a2c9284
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0d6e1686d..907e9d3bc 100644
--- a/README.md
+++ b/README.md
@@ -105,9 +105,9 @@ AITemplate provides the following model templates & reference performance data o
 
 - [01_ResNet-50](examples/01_resnet-50/) with PyTorch Image Models (TIMM)
 - [02_MaskRCNN-FPN](examples/02_detectron2/) with Detectron2
-- [03_BERT](examples/03_bert/) with HuggingFace Transformer
+- [03_BERT](examples/03_bert/) with Hugging Face Transformer
 - [04_Vision Transformer](examples/04_vit/) with PyTorch Image Models (TIMM)
-- [05_Stable Diffusion](examples/05_stable_diffusion/) with HuggingFace Diffusers
+- [05_Stable Diffusion](examples/05_stable_diffusion/) with Hugging Face Diffusers
 
 ## Release
 

From 329fe8abda4a137a2283ba23f43ae87ffedaee1b Mon Sep 17 00:00:00 2001
From: Eric Jiang <erj@meta.com>
Date: Wed, 7 Jun 2023 11:05:50 -0700
Subject: [PATCH 573/638] Add PG509 as a detected CUDA GPU (#749)

Summary:
"PG509" is reported for some A100 GPUs with Compute Capability 8.0. For example, the A100 80GB SXM on nvidia-driver-525. This diff adds "PG509" as a detected GPU string for running tests.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/749

Reviewed By: hl475

Differential Revision: D46504619

Pulled By: chenyang78

fbshipit-source-id: c2dca10a127fd535979d2304dde98035d26a97d4
---
 python/aitemplate/testing/detect_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 465dcb4c2..2b2913d6f 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -42,7 +42,7 @@ def _detect_cuda_with_nvidia_smi():
         sm_names = {
             "70": ["V100"],
             "75": ["T4", "Quadro T2000"],
-            "80": ["A100", "A10G", "RTX 30", "A30", "RTX 40"],
+            "80": ["PG509", "A100", "A10G", "RTX 30", "A30", "RTX 40"],
             "90": ["H100"],
         }
         for sm, names in sm_names.items():

From 2f2912f383351e979b6bc42a65d726f355491d59 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Wed, 7 Jun 2023 14:34:34 -0700
Subject: [PATCH 574/638] Move XRayVideo related FE modules to frontend/nn +
 disambiguate test names (#748)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/748

This diff moves mvit related FE modules over from the `benchmarking` folder to appropriate internal `frontend/nn` folder and disambiguates test names to avoid race conditions during parallel test runs

Reviewed By: mortzur

Differential Revision: D46442470

fbshipit-source-id: 8ab97be4bfaf890676d0c9c48040eaa218801706
---
 python/aitemplate/frontend/nn/head.py         | 177 +++++++
 python/aitemplate/frontend/nn/patch_embed.py  | 101 ++++
 .../frontend/nn/positional_encoding.py        | 200 ++++++++
 .../frontend/nn/vision_transformers.py        | 443 ++++++++++++++++++
 4 files changed, 921 insertions(+)
 create mode 100644 python/aitemplate/frontend/nn/head.py
 create mode 100644 python/aitemplate/frontend/nn/patch_embed.py
 create mode 100644 python/aitemplate/frontend/nn/positional_encoding.py
 create mode 100644 python/aitemplate/frontend/nn/vision_transformers.py

diff --git a/python/aitemplate/frontend/nn/head.py b/python/aitemplate/frontend/nn/head.py
new file mode 100644
index 000000000..10ebb4c21
--- /dev/null
+++ b/python/aitemplate/frontend/nn/head.py
@@ -0,0 +1,177 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Callable
+
+from aitemplate.compiler import ops
+
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.dropout import Dropout
+from aitemplate.frontend.nn.linear import Linear
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.softmax import Softmax
+
+
+class SequencePool(Module):
+    """
+    Sequence pool produces a single embedding from a sequence of embeddings. Currently
+    it supports "mean" and "cls".
+
+    """
+
+    def __init__(self, mode: str) -> None:
+        """
+        Args:
+            mode (str): Optionals include "cls" and "mean". If set to "cls", it assumes
+                the first element in the input is the cls token and returns it. If set
+                to "mean", it returns the mean of the entire sequence.
+        """
+        super().__init__()
+        assert mode in ["mean"], "Unsupported mode for SequencePool."
+        self.mode = mode
+
+    def forward(self, x: Tensor) -> Tensor:
+        # TODO: Add support for cls mode.
+        # if self.mode == "cls":
+        #     x = x[:, 0]
+        if self.mode == "mean":
+            x = ops.reduce_mean(1)(x)
+        else:
+            raise NotImplementedError
+        return x
+
+
+class VisionTransformerBasicHead(Module):
+    """
+    Vision transformer basic head.
+
+    ::
+
+                                      SequencePool
+                                           ↓
+                                        Dropout
+                                           ↓
+                                       Projection
+                                           ↓
+                                       Activation
+
+
+    The builder can be found in `create_vit_basic_head`.
+    """
+
+    def __init__(
+        self,
+        sequence_pool: Module = None,
+        dropout: Module = None,
+        proj: Module = None,
+        activation: Module = None,
+    ) -> None:
+        """
+        Args:
+            sequence_pool (torch.nn.modules): pooling module.
+            dropout(torch.nn.modules): dropout module.
+            proj (torch.nn.modules): project module.
+            activation (torch.nn.modules): activation module.
+        """
+        super().__init__()
+        self.sequence_pool = sequence_pool
+        self.dropout = dropout
+        self.proj = proj
+        self.activation = activation
+
+    def forward(self, x: Tensor) -> Tensor:
+        # Performs pooling.
+        if self.sequence_pool is not None:
+            x = self.sequence_pool(x)
+
+        # Performs dropout.
+        if self.dropout is not None:
+            x = self.dropout(x)
+        # Performs projection.
+        if self.proj is not None:
+            x = self.proj(x)
+        # Performs activation.
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+
+def create_vit_basic_head(
+    *,
+    # Projection configs.
+    in_features: int,
+    out_features: int,
+    # Pooling configs.
+    seq_pool_type: str = "cls",
+    # Dropout configs.
+    dropout_rate: float = 0.5,
+    # Activation configs.
+    activation: Callable = None,
+) -> Module:
+    """
+    Creates vision transformer basic head.
+
+    ::
+
+
+                                        Pooling
+                                           ↓
+                                        Dropout
+                                           ↓
+                                       Projection
+                                           ↓
+                                       Activation
+
+
+    Activation examples include: ReLU, Softmax, Sigmoid, and None.
+    Pool type examples include: cls, mean and none.
+
+    Args:
+
+        in_features: input channel size of the resnet head.
+        out_features: output channel size of the resnet head.
+
+        pool_type (str): Pooling type. It supports "cls", "mean " and "none". If set to
+            "cls", it assumes the first element in the input is the cls token and
+            returns it. If set to "mean", it returns the mean of the entire sequence.
+
+        activation (callable): a callable that constructs vision transformer head
+            activation layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and
+            None (not applying activation).
+
+        dropout_rate (float): dropout rate.
+    """
+    assert seq_pool_type in ["cls", "mean", "none"]
+
+    if seq_pool_type in ["cls", "mean"]:
+        seq_pool_model = SequencePool(seq_pool_type)
+    elif seq_pool_type == "none":
+        seq_pool_model = None
+    else:
+        raise NotImplementedError
+
+    if activation is None:
+        activation_model = None
+    elif activation == Softmax:
+        activation_model = activation(dim=1)
+    else:
+        activation_model = activation()
+
+    return VisionTransformerBasicHead(
+        sequence_pool=seq_pool_model,
+        dropout=Dropout(dropout_rate) if dropout_rate > 0.0 else None,
+        proj=Linear(in_features, out_features),
+        activation=activation_model,
+    )
diff --git a/python/aitemplate/frontend/nn/patch_embed.py b/python/aitemplate/frontend/nn/patch_embed.py
new file mode 100644
index 000000000..0d2658128
--- /dev/null
+++ b/python/aitemplate/frontend/nn/patch_embed.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+patch_embed Module.
+"""
+from typing import Callable, Tuple
+
+from aitemplate.compiler import ops
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.conv3d import Conv3d
+from aitemplate.frontend.nn.module import Module
+
+
+class PatchEmbed(Module):
+    """
+    Transformer basic patch embedding module. Performs patchifying input and flatten
+
+    ::
+
+                                       PatchModel
+                                           ↓
+                                        flatten
+
+    ::
+
+    output shape: [N, D*H*W, C]
+
+    """
+
+    def __init__(
+        self,
+        patch_model,
+    ) -> None:
+        super().__init__()
+        self.patch_model = patch_model
+
+    def forward(self, *args) -> Tensor:
+        assert len(args) == 1
+        x = args[0]
+
+        x = self.patch_model(x)
+        x = ops.flatten(start_dim=1, end_dim=-2)(x)
+        return x
+
+
+def create_conv_patch_embed(
+    *,
+    in_channels: int,
+    out_channels: int,
+    conv_kernel_size: Tuple[int] = (1, 16, 16),
+    conv_stride: Tuple[int] = (1, 4, 4),
+    conv_padding: Tuple[int] = (1, 7, 7),
+    conv_bias: bool = True,
+    conv: Callable = Conv3d,
+) -> Module:
+    """
+    Creates the transformer basic patch embedding. It performs Convolution, flatten and
+    transpose.
+
+    ::
+
+                                        Conv3d
+                                           ↓
+                                        flatten
+                                           ↓
+                                       transpose
+
+    Args:
+        in_channels (int): input channel size of the convolution.
+        out_channels (int): output channel size of the convolution.
+        conv_kernel_size (tuple): convolutional kernel size(s).
+        conv_stride (tuple): convolutional stride size(s).
+        conv_padding (tuple): convolutional padding size(s).
+        conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
+            output.
+        conv (callable): Callable used to build the convolution layer.
+
+    Returns:
+        (nn.Module): transformer patch embedding layer.
+    """
+    conv_module = conv(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=conv_kernel_size,
+        stride=conv_stride,
+        padding=conv_padding,
+        bias=conv_bias,
+    )
+    return PatchEmbed(patch_model=conv_module)
diff --git a/python/aitemplate/frontend/nn/positional_encoding.py b/python/aitemplate/frontend/nn/positional_encoding.py
new file mode 100644
index 000000000..0566424ca
--- /dev/null
+++ b/python/aitemplate/frontend/nn/positional_encoding.py
@@ -0,0 +1,200 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+positional_encoding Modules.
+"""
+import logging
+from typing import Tuple
+
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.parameter import Parameter
+
+_LOGGER = logging.getLogger(__name__)
+
+# These op implementations are copied from: https://fburl.com/code/o0qhusw6.
+# TODO: Move these to proper AIT op FEs
+def tile(input_val, dims):
+    shape_dims = list(dims)
+    input_dim_len = len(input_val.shape())
+    result = input_val
+    if len(shape_dims) < input_dim_len:
+        for _ in range(input_dim_len - len(shape_dims)):
+            shape_dims.insert(0, 1)
+    if input_dim_len < len(shape_dims):
+        shape = input_val.shape()
+        for _ in range(len(shape_dims) - input_dim_len):
+            shape.insert(0, IntImm(1))
+        result = ops.expand()(input_val, shape)
+
+    for i, shape in enumerate(shape_dims):
+        # Avoid operate on batch_size dim
+        if input_val.shape()[i]._attrs["name"] is not None:
+            continue
+        cat_groups = [result] * shape
+        result = ops.concatenate()(cat_groups, dim=i)
+    return result
+
+
+def repeat(input_val, dims):
+    if (
+        isinstance(dims, (list, tuple))
+        and len(dims) > 0
+        and not all(isinstance(x, int) for x in dims)
+    ):
+        _LOGGER.info("Not mapping repeat to an op. We can't handle variable dims.")
+        return input_val
+    return tile(input_val, dims)
+
+
+def repeat_interleave(input_val, repeats, dim=None):
+    if not (type(repeats) is int):
+        _LOGGER.info(
+            "Not mapping repeat_interleave to an acc op. We currently only support `repeat_interleave` with int repeats"
+        )
+        return
+    assert (
+        type(repeats) is int
+    ), "We currently only support `repeat_interleave` with int repeats"
+    rank = len(input_val.shape())
+    if dim is None:
+        repeat_dim = rank - 1
+    else:
+        assert type(dim) is int, "dim should be an int"
+        repeat_dim = dim
+    tile_dims = [1] * (rank + 1)
+    tile_dims[repeat_dim + 1] = repeats
+
+    x = ops.unsqueeze(repeat_dim + 1)(input_val)
+    x = tile(x, tuple(tile_dims))
+    new_shape = []
+    if dim is not None:
+        if dim < 0:
+            repeat_dim = dim + rank
+        else:
+            repeat_dim = dim
+        size_node = input_val.shape()
+        for i in range(rank):
+            shape_i = ops.getitem()(size_node, i)
+            if i == repeat_dim:
+                new_shape.append(-1)
+            else:
+                new_shape.append(shape_i)
+    else:
+        new_shape.append(-1)
+
+    x = ops.reshape()(x, new_shape)
+    return x
+
+
+class SpatioTemporalClsPositionalEncoding(Module):
+    """
+    Add a cls token and apply a spatiotemporal encoding to a tensor.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        patch_embed_shape: Tuple[int, int, int],
+        sep_pos_embed: bool = False,
+        has_cls: bool = True,
+        dtype: str = "float16",
+    ) -> None:
+        """
+        Args:
+            embed_dim (int): Embedding dimension for input sequence.
+            patch_embed_shape (Tuple): The number of patches in each dimension
+                (T, H, W) after patch embedding.
+            sep_pos_embed (bool): If set to true, one positional encoding is used for
+                spatial patches and another positional encoding is used for temporal
+                sequence. Otherwise, only one positional encoding is used for all the
+                patches.
+            has_cls (bool): If set to true, a cls token is added in the beginning of each
+                input sequence.
+        """
+        super().__init__()
+        assert (
+            len(patch_embed_shape) == 3
+        ), "Patch_embed_shape should be in the form of (T, H, W)."
+        self.cls_embed_on = has_cls
+        self.sep_pos_embed = sep_pos_embed
+        self._patch_embed_shape = tuple(patch_embed_shape)
+        self.num_spatial_patch = patch_embed_shape[1] * patch_embed_shape[2]
+        self.num_temporal_patch = patch_embed_shape[0]
+
+        if self.cls_embed_on:
+            self.cls_token = Parameter(shape=[1, 1, embed_dim], dtype=dtype)
+            num_patches = self.num_spatial_patch * self.num_temporal_patch + 1
+        else:
+            self.cls_token = Parameter(shape=[], value=0, dtype=dtype)
+            num_patches = self.num_spatial_patch * self.num_temporal_patch
+
+        if self.sep_pos_embed:
+            self.pos_embed_spatial = Parameter(
+                shape=[1, self.num_spatial_patch, embed_dim],
+                dtype=dtype,
+            )
+            self.pos_embed_temporal = Parameter(
+                shape=[1, self.num_temporal_patch, embed_dim],
+                dtype=dtype,
+            )
+            if self.cls_embed_on:
+                self.pos_embed_class = Parameter(shape=[1, 1, embed_dim], dtype=dtype)
+            else:
+                self.pos_embed_class = Parameter(shape=[], dtype=dtype)
+            self.pos_embed = Parameter(shape=[], dtype=dtype)
+
+        else:
+            self.pos_embed = Parameter(shape=[1, num_patches, embed_dim], dtype=dtype)
+            # Placeholders for torchscriptability, won't be used
+            self.pos_embed_spatial = Parameter(shape=[], dtype=dtype)
+            self.pos_embed_temporal = Parameter(shape=[], dtype=dtype)
+            self.pos_embed_class = Parameter(shape=[], dtype=dtype)
+
+    def patch_embed_shape(self) -> Tuple[int, int, int]:
+        return self._patch_embed_shape
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor.
+        """
+        B, N, C = x.shape()
+        if self.cls_embed_on:
+            cls_tokens = ops.expand()(self.cls_token.tensor(), [B, -1, -1])
+            x = ops.concatenate()([cls_tokens, x], dim=1)
+
+        if self.sep_pos_embed:
+            pos_embed = ops.elementwise(FuncEnum.ADD)(
+                repeat(
+                    self.pos_embed_spatial.tensor(), (1, self.num_temporal_patch, 1)
+                ),
+                repeat_interleave(
+                    self.pos_embed_temporal.tensor(), self.num_spatial_patch, dim=1
+                ),
+            )
+
+            if self.cls_embed_on:
+                pos_embed = ops.concatenate()(
+                    [self.pos_embed_class.tensor(), pos_embed], dim=1
+                )
+            x = ops.elementwise(FuncEnum.ADD)(x, pos_embed)
+        else:
+            x = ops.elementwise(FuncEnum.ADD)(x, self.pos_embed.tensor())
+
+        return x
diff --git a/python/aitemplate/frontend/nn/vision_transformers.py b/python/aitemplate/frontend/nn/vision_transformers.py
new file mode 100644
index 000000000..46cd3b90e
--- /dev/null
+++ b/python/aitemplate/frontend/nn/vision_transformers.py
@@ -0,0 +1,443 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import warnings
+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+
+from aitemplate.frontend import Tensor
+from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
+from aitemplate.frontend.nn.container import ModuleList
+from aitemplate.frontend.nn.conv2d import Conv2d
+from aitemplate.frontend.nn.conv3d import Conv3d
+from aitemplate.frontend.nn.dropout import Dropout
+
+from aitemplate.frontend.nn.head import create_vit_basic_head
+from aitemplate.frontend.nn.identity import Identity
+from aitemplate.frontend.nn.layer_norm import LayerNorm
+from aitemplate.frontend.nn.module import Module
+from aitemplate.frontend.nn.multiscale_attention import MultiScaleBlock
+from aitemplate.frontend.nn.patch_embed import create_conv_patch_embed
+from aitemplate.frontend.nn.positional_encoding import (
+    SpatioTemporalClsPositionalEncoding,
+)
+from pytorchvideo.layers.utils import round_width
+
+
+class MultiscaleVisionTransformers(Module):
+    """
+    Multiscale Vision Transformers
+    Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra Malik,
+    Christoph Feichtenhofer
+    https://arxiv.org/abs/2104.11227
+
+    ::
+
+                                       PatchEmbed
+                                           ↓
+                                   PositionalEncoding
+                                           ↓
+                                        Dropout
+                                           ↓
+                                     Normalization
+                                           ↓
+                                         Block 1
+                                           ↓
+                                           .
+                                           .
+                                           .
+                                           ↓
+                                         Block N
+                                           ↓
+                                     Normalization
+                                           ↓
+                                          Head
+
+
+    The builder can be found in `create_mvit`.
+    """
+
+    def __init__(
+        self,
+        *,
+        patch_embed: Optional[Module],
+        cls_positional_encoding: Module,
+        pos_drop: Optional[Module],
+        blocks: ModuleList,
+        norm_embed: Optional[Module],
+        head: Optional[Module],
+    ) -> None:
+        """
+        Args:
+            patch_embed (nn.Module): Patch embed module.
+            cls_positional_encoding (nn.Module): Positional encoding module.
+            pos_drop (Optional[nn.Module]): Dropout module after patch embed.
+            blocks (nn.ModuleList): Stack of multi-scale transformer blocks.
+            norm_layer (nn.Module): Normalization layer before head.
+            head (Optional[nn.Module]): Head module.
+        """
+        super().__init__()
+
+        assert hasattr(
+            cls_positional_encoding, "patch_embed_shape"
+        ), "cls_positional_encoding should have method patch_embed_shape."
+
+        self.patch_embed = patch_embed or Identity()
+        self.cls_positional_encoding = cls_positional_encoding
+        self.pos_drop = pos_drop or Identity()
+        self.blocks = blocks
+        self.norm_embed = norm_embed or Identity()
+        self.head = head or Identity()
+
+    # TODO: Add support for batchnorm fusion
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.patch_embed(x)
+        x = self.cls_positional_encoding(x)
+        x = self.pos_drop(x)
+
+        thw = self.cls_positional_encoding.patch_embed_shape()
+        for blk in self.blocks:
+            t_shape, h_shape, w_shape = thw
+            x, thw = blk(x, t_shape, h_shape, w_shape)
+        x = self.norm_embed(x)
+        x = self.head(x)
+        return x
+
+
+def create_multiscale_vision_transformers(
+    *,
+    spatial_size: Union[int, Tuple[int, int]],
+    temporal_size: int,
+    cls_embed_on: bool = True,
+    sep_pos_embed: bool = True,
+    depth: int = 16,
+    norm: str = "layernorm",
+    # Patch embed config.
+    enable_patch_embed: bool = True,
+    input_channels: int = 3,
+    patch_embed_dim: int = 96,
+    conv_patch_embed_kernel: Tuple[int] = (3, 7, 7),
+    conv_patch_embed_stride: Tuple[int] = (2, 4, 4),
+    conv_patch_embed_padding: Tuple[int] = (1, 3, 3),
+    enable_patch_embed_norm: bool = False,
+    use_2d_patch: bool = False,
+    # Attention block config.
+    num_heads: int = 1,
+    mlp_ratio: float = 4.0,
+    qkv_bias: bool = True,
+    dropout_rate_block: float = 0.0,
+    droppath_rate_block: float = 0.0,
+    pooling_mode: str = "conv",
+    pool_first: bool = False,
+    residual_pool: bool = False,
+    depthwise_conv: bool = True,
+    bias_on: bool = True,
+    separate_qkv: bool = True,
+    embed_dim_mul: Optional[List[List[int]]] = None,
+    atten_head_mul: Optional[List[List[int]]] = None,
+    dim_mul_in_att: bool = False,
+    pool_q_stride_size: Optional[List[List[int]]] = None,
+    pool_kv_stride_size: Optional[List[List[int]]] = None,
+    pool_kv_stride_adaptive: Optional[Union[int, Tuple[int, int, int]]] = None,
+    pool_kvq_kernel: Optional[Union[int, Tuple[int, int, int]]] = None,
+    # Head config.
+    head: Optional[Callable] = create_vit_basic_head,
+    head_dropout_rate: float = 0.5,
+    head_activation: Callable = None,
+    head_num_classes: int = 400,
+    # The default model definition is not TorchScript-friendly.
+    # Set create_scriptable_model=True to create a TorchScriptable model.
+    create_scriptable_model: bool = False,
+    multiscale_vit_class: Callable = MultiscaleVisionTransformers,
+) -> Module:
+    """
+    Build Multiscale Vision Transformers (MViT) for recognition. A Vision Transformer
+    (ViT) is a specific case of MViT that only uses a single scale attention block.
+
+    Args:
+        spatial_size (_size_2_t): Input video spatial resolution (H, W). If a single
+            int is given, it assumes the width and the height are the same.
+        temporal_size (int): Number of frames in the input video.
+        cls_embed_on (bool): If True, use cls embed in the model. Otherwise features
+            are average pooled before going to the final classifier.
+        sep_pos_embed (bool): If True, perform separate spatiotemporal embedding.
+        depth (int): The depth of the model.
+        norm (str): Normalization layer. It currently supports "layernorm".
+
+        enable_patch_embed (bool): If true, patchify the input video. If false, it
+            assumes the input should have the feature dimension of patch_embed_dim.
+        input_channels (int): Channel dimension of the input video.
+        patch_embed_dim (int): Embedding dimension after patchifing the video input.
+        conv_patch_embed_kernel (Tuple[int]): Kernel size of the convolution for
+            patchifing the video input.
+        conv_patch_embed_stride (Tuple[int]): Stride size of the convolution for
+            patchifing the video input.
+        conv_patch_embed_padding (Tuple[int]): Padding size of the convolution for
+            patchifing the video input.
+        enable_patch_embed_norm (bool): If True, apply normalization after patchifing
+            the video input.
+        use_2d_patch (bool): If True, use 2D convolutions to get patch embed.
+            Otherwise, use 3D convolutions.
+
+        num_heads (int): Number of heads in the first transformer block.
+        mlp_ratio (float): Mlp ratio which controls the feature dimension in the
+            hidden layer of the Mlp block.
+        qkv_bias (bool): If set to False, the qkv layer will not learn an additive
+            bias. Default: True.
+        dropout_rate_block (float): Dropout rate for the attention block.
+        droppath_rate_block (float): Droppath rate for the attention block.
+        pooling_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
+            (average pooling), and "max" (max pooling).
+        pool_first (bool): If set to True, pool is applied before qkv projection.
+            Otherwise, pool is applied after qkv projection. Default: False.
+        residual_pool (bool): If set to True, use Improved Multiscale Vision
+                Transformer's pooling residual connection.
+        depthwise_conv (bool): Whether use depthwise or full convolution for pooling.
+        bias_on (bool): Whether use biases for linear layers.
+        separate_qkv (bool): Whether to use separate or one layer for qkv projections.
+        embed_dim_mul (Optional[List[List[int]]]): Dimension multiplication at layer i.
+            If X is used, then the next block will increase the embed dimension by X
+            times. Format: [depth_i, mul_dim_ratio].
+        atten_head_mul (Optional[List[List[int]]]): Head dimension multiplication at
+            layer i. If X is used, then the next block will increase the head by
+            X times. Format: [depth_i, mul_dim_ratio].
+        dim_mul_in_att (bool): If set to True, dimension expansion happens inside
+                the attention module, otherwise it happens in the Mlp block. Default: False.
+        pool_q_stride_size (Optional[List[List[int]]]): List of stride sizes for the
+            pool q at each layer. Format:
+            [[i, stride_t_i, stride_h_i, stride_w_i], ...,].
+        pool_kv_stride_size (Optional[List[List[int]]]): List of stride sizes for the
+            pool kv at each layer. Format:
+            [[i, stride_t_i, stride_h_i, stride_w_i], ...,].
+        pool_kv_stride_adaptive (Optional[_size_3_t]): Initial kv stride size for the
+            first block. The stride size will be further reduced at the layer where q
+            is pooled with the ratio of the stride of q pooling. If
+            pool_kv_stride_adaptive is set, then pool_kv_stride_size should be none.
+        pool_kvq_kernel (Optional[_size_3_t]): Pooling kernel size for q and kv. It None,
+            the kernel_size is [s + 1 if s > 1 else s for s in stride_size].
+
+        head (Callable): Head model.
+        head_dropout_rate (float): Dropout rate in the head.
+        head_activation (Callable): Activation in the head.
+        head_num_classes (int): Number of classes in the final classification head.
+        multiscale_vit_class (Callable): MViT transformer class. Default to
+            MultiscaleVisionTransformers.
+
+    Example usage (building a MViT_B model for Kinetics400):
+
+        spatial_size = 224
+        temporal_size = 16
+        embed_dim_mul = [[1, 2.0], [3, 2.0], [14, 2.0]]
+        atten_head_mul = [[1, 2.0], [3, 2.0], [14, 2.0]]
+        pool_q_stride_size = [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]]
+        pool_kv_stride_adaptive = [1, 8, 8]
+        pool_kvq_kernel = [3, 3, 3]
+        head_num_classes = 400
+        MViT_B = create_multiscale_vision_transformers(
+            spatial_size=spatial_size,
+            temporal_size=temporal_size,
+            embed_dim_mul=embed_dim_mul,
+            atten_head_mul=atten_head_mul,
+            pool_q_stride_size=pool_q_stride_size,
+            pool_kv_stride_adaptive=pool_kv_stride_adaptive,
+            pool_kvq_kernel=pool_kvq_kernel,
+            head_num_classes=head_num_classes,
+        )
+    """
+
+    if use_2d_patch:
+        assert temporal_size == 1, "If use_2d_patch, temporal_size needs to be 1."
+    if pool_kv_stride_adaptive is not None:
+        assert (
+            pool_kv_stride_size is None
+        ), "pool_kv_stride_size should be none if pool_kv_stride_adaptive is set."
+    if norm == "layernorm":
+        norm_layer = partial(LayerNorm, eps=1e-6)
+        block_norm_layer = partial(LayerNorm, eps=1e-6)
+        attn_norm_layer = partial(LayerNorm, eps=1e-6)
+    elif norm == "batchnorm":
+        norm_layer = None
+        block_norm_layer = BatchNorm1d
+        attn_norm_layer = BatchNorm3d
+    else:
+        raise NotImplementedError("Only supports layernorm.")
+    if create_scriptable_model:
+        assert (
+            norm == "batchnorm"
+        ), "The scriptable model supports only the batchnorm-based model."
+        warnings.warn(
+            "`create_scriptable_model` is deprecated. MultiscaleVisionTransformers"
+            " now supports scripting without this flag.",
+            DeprecationWarning,
+        )
+
+    if isinstance(spatial_size, int):
+        spatial_size = (spatial_size, spatial_size)
+
+    conv_patch_op = Conv2d if use_2d_patch else Conv3d
+
+    patch_embed = (
+        create_conv_patch_embed(
+            in_channels=input_channels,
+            out_channels=patch_embed_dim,
+            conv_kernel_size=conv_patch_embed_kernel,
+            conv_stride=conv_patch_embed_stride,
+            conv_padding=conv_patch_embed_padding,
+            conv=conv_patch_op,
+        )
+        if enable_patch_embed
+        else None
+    )
+
+    input_dims = [temporal_size, spatial_size[0], spatial_size[1]]
+    input_stride = (
+        (1,) + tuple(conv_patch_embed_stride)
+        if use_2d_patch
+        else conv_patch_embed_stride
+    )
+
+    patch_embed_shape = (
+        [input_dims[i] // input_stride[i] for i in range(len(input_dims))]
+        if enable_patch_embed
+        else input_dims
+    )
+
+    cls_positional_encoding = SpatioTemporalClsPositionalEncoding(
+        embed_dim=patch_embed_dim,
+        patch_embed_shape=patch_embed_shape,
+        sep_pos_embed=sep_pos_embed,
+        has_cls=cls_embed_on,
+    )
+
+    dpr = [
+        x.item() for x in torch.linspace(0, droppath_rate_block, depth)
+    ]  # stochastic depth decay rule
+
+    if dropout_rate_block > 0.0:
+        pos_drop = Dropout(p=dropout_rate_block)
+
+    dim_mul, head_mul = torch.ones(depth + 1), torch.ones(depth + 1)
+    if embed_dim_mul is not None:
+        for i in range(len(embed_dim_mul)):
+            dim_mul[embed_dim_mul[i][0]] = embed_dim_mul[i][1]
+    if atten_head_mul is not None:
+        for i in range(len(atten_head_mul)):
+            head_mul[atten_head_mul[i][0]] = atten_head_mul[i][1]
+
+    mvit_blocks = ModuleList()
+
+    pool_q = [[] for i in range(depth)]
+    pool_kv = [[] for i in range(depth)]
+    stride_q = [[] for i in range(depth)]
+    stride_kv = [[] for i in range(depth)]
+
+    if pool_q_stride_size is not None:
+        for i in range(len(pool_q_stride_size)):
+            stride_q[pool_q_stride_size[i][0]] = pool_q_stride_size[i][1:]
+            if pool_kvq_kernel is not None:
+                pool_q[pool_q_stride_size[i][0]] = pool_kvq_kernel
+            else:
+                pool_q[pool_q_stride_size[i][0]] = [
+                    s + 1 if s > 1 else s for s in pool_q_stride_size[i][1:]
+                ]
+
+    # If POOL_KV_STRIDE_ADAPTIVE is not None, initialize POOL_KV_STRIDE.
+    if pool_kv_stride_adaptive is not None:
+        _stride_kv = pool_kv_stride_adaptive
+        pool_kv_stride_size = []
+        for i in range(depth):
+            if len(stride_q[i]) > 0:
+                _stride_kv = [
+                    max(_stride_kv[d] // stride_q[i][d], 1)
+                    for d in range(len(_stride_kv))
+                ]
+            pool_kv_stride_size.append([i] + _stride_kv)
+
+    if pool_kv_stride_size is not None:
+        for i in range(len(pool_kv_stride_size)):
+            stride_kv[pool_kv_stride_size[i][0]] = pool_kv_stride_size[i][1:]
+            if pool_kvq_kernel is not None:
+                pool_kv[pool_kv_stride_size[i][0]] = pool_kvq_kernel
+            else:
+                pool_kv[pool_kv_stride_size[i][0]] = [
+                    s + 1 if s > 1 else s for s in pool_kv_stride_size[i][1:]
+                ]
+
+    dim_in = patch_embed_dim
+    for i in range(depth):
+        num_heads = round_width(num_heads, head_mul[i], min_width=1, divisor=1)
+        if dim_mul_in_att:
+            dim_out = round_width(
+                dim_in,
+                dim_mul[i],
+                divisor=round_width(num_heads, head_mul[i]),
+            )
+        else:
+            dim_out = round_width(
+                dim_in,
+                dim_mul[i + 1],
+                divisor=round_width(num_heads, head_mul[i + 1]),
+            )
+
+        mvit_blocks.append(
+            MultiScaleBlock(
+                dim=dim_in,
+                dim_out=dim_out,
+                num_heads=num_heads,
+                seq_len=6272,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                dropout_rate=dropout_rate_block,
+                droppath_rate=dpr[i],
+                norm_layer=block_norm_layer,
+                attn_norm_layer=attn_norm_layer,
+                kernel_q=pool_q[i],
+                kernel_kv=pool_kv[i],
+                stride_q=stride_q[i],
+                stride_kv=stride_kv[i],
+                pool_mode=pooling_mode,
+                has_cls_embed=cls_embed_on,
+                pool_first=pool_first,
+                residual_pool=residual_pool,
+                bias_on=bias_on,
+                depthwise_conv=depthwise_conv,
+                separate_qkv=separate_qkv,
+            )
+        )
+        dim_in = dim_out
+
+    norm_embed = None if norm_layer is None else norm_layer(dim_in)
+    if head is not None:
+        head_model = head(
+            in_features=dim_in,
+            out_features=head_num_classes,
+            seq_pool_type="cls" if cls_embed_on else "mean",
+            dropout_rate=head_dropout_rate,
+            activation=head_activation,
+        )
+    else:
+        head_model = None
+
+    return multiscale_vit_class(
+        patch_embed=patch_embed,
+        cls_positional_encoding=cls_positional_encoding,
+        pos_drop=pos_drop if dropout_rate_block > 0.0 else None,
+        blocks=mvit_blocks,
+        norm_embed=norm_embed,
+        head=head_model,
+    )

From 8d7d819f8edef50e7cbfc0ab973543c4b4ee30e3 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Fri, 9 Jun 2023 01:03:24 -0400
Subject: [PATCH 575/638] add missing copy

---
 python/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/setup.py b/python/setup.py
index 92e1e24af..fdc3bbc07 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -136,7 +136,7 @@ def gen_utils_file_list():
 
 def gen_backend_common_file_list():
     srcs = ["aitemplate/backend"]
-    f_cond = lambda x: True if x.endswith(".py") or x.endswith(".cuh") else False
+    f_cond = lambda x: True if x.endswith(".py") or x.endswith(".cuh") or x.endswith(".h") else False
     return gen_file_list(srcs, f_cond)
 
 
From e63e7e2289764ed25fe5b56c9395d317e94ea002 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 2 Jun 2023 16:12:59 +0800
Subject: [PATCH 576/638] separate cuda and rocm graph

---
 static/include/model.h                 | 49 ++++++++++++++++++--------
 static/include/rocm_device_functions.h |  2 +-
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/static/include/model.h b/static/include/model.h
index ce325f23a..3963724df 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -100,7 +100,7 @@ class ModelBase {
   void Run(StreamType stream, bool graph_mode) {
     auto* model = static_cast<ModelType*>(this);
     model->SetUpInputsOutputs();
-    if (graph_mode) {
+    if (target_has_graph_mode && graph_mode) {
       RunAsGraph(stream);
     } else {
       model->RunImpl(stream);
@@ -216,7 +216,8 @@ class ModelBase {
   }
 
   void RunAsGraph(StreamType stream) {
-    if(!graph_created_){
+#ifdef __HIP_PLATFORM_HCC__
+    if (graph_exec_ == nullptr) {
       DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
       try {
         static_cast<ModelType*>(this)->RunImpl(graph_capture_stream_);
@@ -230,25 +231,44 @@ class ModelBase {
         }
         throw;
       }
-
       // The following function ends the capture and creates a graph
       // inside a unique_ptr that cleans up it when it goes out of scope.
       // Note that it throws an exception if EndCapture fails.
       auto graph = RAII_EndCaptureAndCreateGraph(
           [this](GraphType* graph_ptr) { return EndCapture(graph_ptr); });
-
-      if (graph_exec_ == nullptr) {
-        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
-      } else if (
-          GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
-        // Consume the last cuda error, which may affect the next GraphExecLaunch
-        // call.
-        GetLastError();
-        DEVICE_CHECK(GraphExecDestroy(graph_exec_));
-        DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+    }
+#else
+    DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
+    try {
+      static_cast<ModelType*>(this)->RunImpl(graph_capture_stream_);
+    } catch (...) {
+      GraphType graph;
+      // No need to DEVICE_CHECK here, we want to see the original exception.
+      EndCapture(&graph);
+      if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
+        LOG(WARNING)
+            << "Graph destruction failed while handling exception! Memory will be leaked.";
       }
-      graph_created_ = true;
+      throw;
     }
+    // The following function ends the capture and creates a graph
+    // inside a unique_ptr that cleans up it when it goes out of scope.
+    // Note that it throws an exception if EndCapture fails.
+    auto graph = RAII_EndCaptureAndCreateGraph(
+        [this](GraphType* graph_ptr) { return EndCapture(graph_ptr); });
+    if (graph_exec_ == nullptr) {
+      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+    } else if (
+        GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
+      // Consume the last cuda error, which may affect the next GraphExecLaunch
+      // call.
+      GetLastError();
+      DEVICE_CHECK(GraphExecDestroy(graph_exec_));
+      DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
+    }
+#endif
+
     DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
   }
 
@@ -318,7 +338,6 @@ class ModelBase {
   std::vector<ParamInfo> params_;
 
   GraphExecType graph_exec_ = nullptr;
-  bool graph_created_ = false;
   StreamType graph_capture_stream_;
 
   std::unordered_map<std::string, const void**> constant_name_to_ptr_;
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 9546bb075..18d3aa297 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -28,7 +28,7 @@
 
 namespace ait {
 
-inline thread_local bool target_has_graph_mode = false;
+inline thread_local bool target_has_graph_mode = true;
 
 using DeviceError = hipError_t;
 using DevicePropertyType = hipDeviceProp_t;

From 1ee262519d34f75610fec3a475e586b05f84598b Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 9 Jun 2023 13:00:16 +0800
Subject: [PATCH 577/638] update setup.py

---
 python/setup.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/setup.py b/python/setup.py
index fdc3bbc07..ccba746f2 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -160,7 +160,6 @@ def gen_license_file_list():
     packages=find_packages(),
     package_data={
         "aitemplate": [
-            "backend/cuda/elementwise/custom_math.cuh",
             "backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh",
             "backend/cuda/groupnorm/groupnorm_kernel.cuh",
             "backend/cuda/groupnorm/layer_norm.cuh",
@@ -168,7 +167,6 @@ def gen_license_file_list():
             "backend/cuda/vision_ops/nms/batched_nms_kernel.cuh",
             "backend/cuda/vision_ops/nms/nms_kernel.cuh",
             "backend/cuda/vision_ops/roi_ops/multi_level_roi_align.cuh",
-            "backend/rocm/elementwise/custom_math.h",
         ]
         + gen_utils_file_list()
         + gen_cutlass_list()

From 2bdc21b31a1e814a3f35dd637f3274e3a628bb39 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 9 Jun 2023 15:32:57 +0800
Subject: [PATCH 578/638] fix compile bugs

---
 python/aitemplate/backend/rocm/tensor/expand_static_shape.py | 4 ++--
 python/aitemplate/backend/rocm/tensor/repeat.h               | 4 +---
 python/setup.py                                              | 1 +
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
index c4da60ecc..dda3d2482 100644
--- a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
+++ b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
@@ -74,7 +74,7 @@ def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
 // (used within repeat.cuh, included below )
 // Note: 44kb is sufficient in this case to fully utilize the GPU parallelism
 #define SHM_MAX 1024 * 44
-
+namespace{
 {{custom_libs}}
 
 /**
@@ -150,7 +150,7 @@ def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
     }
 
 }
-
+}
 /**
  * Expand Operator entry point, optimized for static shapes. Only the head dimension may be dynamic.
  */
diff --git a/python/aitemplate/backend/rocm/tensor/repeat.h b/python/aitemplate/backend/rocm/tensor/repeat.h
index 974fe7e51..cd3a988f9 100644
--- a/python/aitemplate/backend/rocm/tensor/repeat.h
+++ b/python/aitemplate/backend/rocm/tensor/repeat.h
@@ -22,8 +22,6 @@ or into a target tensor.
 Used by expand_static_shape.py ( expand operator )
 
 */
-
-#include "hip/hip_runtime.h"
 /**
  * CUDA Kernel to copy elements repeatedly from a source memory
  * region to a target memory region.
@@ -187,4 +185,4 @@ __host__ hipError_t cuda_repeat_src(
     return res;
   }
   return cuda_repeat_head(data, head_mem_bytes, num_repeat_copies - 1, stream);
-}
+}
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index ccba746f2..c3fad6231 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -160,6 +160,7 @@ def gen_license_file_list():
     packages=find_packages(),
     package_data={
         "aitemplate": [
+            "backend/cuda/elementwise/custom_math.cuh",
             "backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh",
             "backend/cuda/groupnorm/groupnorm_kernel.cuh",
             "backend/cuda/groupnorm/layer_norm.cuh",

From e23d04f16d99a82ddb2cbcac53695414663ea823 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 10 Jun 2023 15:30:08 -0700
Subject: [PATCH 579/638] Fix SD compilation example to use user provided H and
 W (#755)

Summary:
This fix allows to compile and demo SD model for both model editions
- base model edition - 512x512 - `stabilityai/stable-diffusion-2-1-base`
- regular model edition - 768x768  - `stabilityai/stable-diffusion-2-1`

Additional minor fixes:
- remove unused params from `map_clip_params` and `map_vae_params` to avoid confusion.

Related issue https://github.com/facebookincubator/AITemplate/issues/751

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/755

Reviewed By: frank-wei

Differential Revision: D46546811

Pulled By: hl475

fbshipit-source-id: 4305bcb943e99a91a24eb96a2d286285be9acbfd
---
 .../src/compile_lib/compile_clip.py                   |  7 +++----
 .../src/compile_lib/compile_unet.py                   |  7 ++-----
 .../src/compile_lib/compile_vae.py                    | 11 ++++-------
 python/aitemplate/frontend/nn/vision_transformers.py  |  2 +-
 4 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index a85aee84f..5c8a72150 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -14,14 +14,14 @@
 #
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
 from .util import mark_output
 
 
-def map_clip_params(pt_mod, batch_size, seqlen, depth):
+def map_clip_params(pt_mod):
     params_ait = {}
     pt_params = dict(pt_mod.named_parameters())
     for key, arr in pt_params.items():
@@ -69,8 +69,7 @@ def compile_clip(
     ait_mod.name_parameter_tensor()
 
     pt_mod = pt_mod.eval()
-    params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
-    batch_size = IntVar(values=[1, 8], name="batch_size")
+    params_ait = map_clip_params(pt_mod)
 
     input_ids_ait = Tensor(
         [batch_size, seqlen], name="input0", dtype="int64", is_input=True
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
index c4233c1e4..d2f49a2b6 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet.py
@@ -15,7 +15,7 @@
 import torch
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.unet_2d_condition import (
@@ -72,12 +72,9 @@ def compile_unet(
     # set AIT parameters
     pt_mod = pt_mod.eval()
     params_ait = map_unet_params(pt_mod, dim)
-    # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
 
     latent_model_input_ait = Tensor(
-        [batch_size, height_d, width_d, 4], name="input0", is_input=True
+        [batch_size, height, width, 4], name="input0", is_input=True
     )
     timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
     text_embeddings_pt_ait = Tensor(
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index e9c2d4964..4ce93e709 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -15,14 +15,14 @@
 
 import torch
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import IntVar, Tensor
+from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
 from .util import mark_output
 
 
-def map_vae_params(ait_module, pt_module, batch_size, seq_len):
+def map_vae_params(ait_module, pt_module):
     pt_params = dict(pt_module.named_parameters())
     mapped_pt_params = {}
     for name, _ in ait_module.named_parameters():
@@ -123,19 +123,16 @@ def compile_vae(
         latent_channels=latent_channels,
         sample_size=sample_size,
     )
-    # batch_size = IntVar(values=[1, 8], name="batch_size")
-    height_d = IntVar(values=[32, 64], name="height")
-    width_d = IntVar(values=[32, 64], name="width")
 
     ait_input = Tensor(
-        shape=[batch_size, height_d, width_d, latent_channels],
+        shape=[batch_size, height, width, latent_channels],
         name="vae_input",
         is_input=True,
     )
     ait_vae.name_parameter_tensor()
 
     pt_mod = pt_mod.eval()
-    params_ait = map_vae_params(ait_vae, pt_mod, batch_size, height * width)
+    params_ait = map_vae_params(ait_vae, pt_mod)
 
     Y = ait_vae.decode(ait_input)
     mark_output(Y)
diff --git a/python/aitemplate/frontend/nn/vision_transformers.py b/python/aitemplate/frontend/nn/vision_transformers.py
index 46cd3b90e..32b262302 100644
--- a/python/aitemplate/frontend/nn/vision_transformers.py
+++ b/python/aitemplate/frontend/nn/vision_transformers.py
@@ -18,6 +18,7 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
+from pytorchvideo.layers.utils import round_width
 
 from aitemplate.frontend import Tensor
 from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
@@ -35,7 +36,6 @@
 from aitemplate.frontend.nn.positional_encoding import (
     SpatioTemporalClsPositionalEncoding,
 )
-from pytorchvideo.layers.utils import round_width
 
 
 class MultiscaleVisionTransformers(Module):

From fff93a1395f1343f75a1beeb2d6c51a83a09ad75 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 10 Jun 2023 22:02:42 -0700
Subject: [PATCH 580/638] Add --model-name param to SD download_pipeline script
 (#757)

Summary:
stable-diffusion model has two variants - base and regular.
- "base" variant image resolution is 512x512 - `stabilityai/stable-diffusion-2-1-base`
- "regular" variant image resolution is 768x768 - `stabilityai/stable-diffusion-2-1`

This PR adds `--model-name` param to `download_pipeline.py` script and sets default model name as
```
stabilityai/stable-diffusion-2-1-base
```
which matches to default `--width` and `--height` parameters (512x512) in `compile.py` and `demo.py` scripts.

To run the whole example we can:
```
# for default "base" model variant (512x512)
python scripts/download_pipeline.py
python scripts/compile.py
python scripts/demo.py

# for "regular" model variant (768x768)
python scripts/download_pipeline.py --model-name "stabilityai/stable-diffusion-2-1"
python scripts/compile.py --width 768 --height 768
python scripts/demo.py --width 768 --height 768
```

small additional fix - rename `--save_directory` click option to `--save-directory` - all other scripts use dash (-) in click options names

Related PR - https://github.com/facebookincubator/AITemplate/pull/755
Related Issue - https://github.com/facebookincubator/AITemplate/issues/751

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/757

Reviewed By: houseroad

Differential Revision: D46581219

Pulled By: hl475

fbshipit-source-id: 7283af1a802f5c7c0dde54066d5e15c27488961b
---
 examples/05_stable_diffusion/README.md        | 19 +++++++++++++++----
 .../scripts/download_pipeline.py              | 11 ++++++++---
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index e9ca953f9..ae3317c2b 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -25,16 +25,24 @@ Verify the library versions. We have tested transformers==4.25, diffusers==0.11[
 ### Download the diffusers pipeline files
 You must first register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens) for more info. Your access tokens are listed in your [Hugging Face account settings](https://huggingface.co/settings/tokens).
 
+stable-diffusion model has two variants - base and regular.
+For example:
+- `stabilityai/stable-diffusion-2-1-base` - image resolution 512x512
+- `stabilityai/stable-diffusion-2-1` - image resolution 768x768
+
 ```
-python3 scripts/download_pipeline.py --token ACCESS_TOKEN
+python3 scripts/download_pipeline.py \
+--model-name "stabilityai/stable-diffusion-2-1-base" \
+--token ACCESS_TOKEN
 ```
 
 ### Build AIT modules for CLIP, UNet, VAE
 
 Build the AIT modules by running `compile.py`.
 
+Set correct width and height depending on the model variant
 ```
-python3 scripts/compile.py
+python3 scripts/compile.py --width 512 --height 512
 ```
 It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
 
@@ -71,6 +79,7 @@ To enable multiple GPUs for profiling, use the environment variable `CUDA_VISIBL
 
 This step is optional. You can run `benchmark.py` to measure throughput for each of the subnets.
 
+Benchmark script supports base model variant only for now - 512x512
 ```
 python3 src/benchmark.py
 ```
@@ -87,14 +96,16 @@ HUGGINGFACE_AUTH_TOKEN=ACCESS_TOKEN python3 -m unittest src/test_correctness.py
 
 Run AIT models with an example image:
 
+Set correct width and height depending on the model variant
 ```
-python3 scripts/demo.py
+python3 scripts/demo.py --width 512 --height 512
 ```
 
 Img2img demo:
 
+Set correct width and height depending on the model variant
 ```
-python3 scripts/demo_img2img.py
+python3 scripts/demo_img2img.py --width 512 --height 512
 ```
 
 Check the resulted image: `example_ait.png`
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index e5ffe56f0..317e5de46 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -18,15 +18,20 @@
 
 
 @click.command()
+@click.option(
+    "--model-name",
+    default="stabilityai/stable-diffusion-2-1-base",
+    help="Pretrained Model name",
+)
 @click.option("--token", default="", help="access token")
 @click.option(
-    "--save_directory",
+    "--save-directory",
     default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
     help="pipeline files local directory",
 )
-def download_pipeline_files(token, save_directory) -> None:
+def download_pipeline_files(model_name, token, save_directory) -> None:
     StableDiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-2",
+        model_name,
         revision="fp16",
         torch_dtype=torch.float16,
         # use provided token or the one generated with `huggingface-cli login``

From db2a9e9cfdb0452bfd697a445cdd9f4fdcfe555d Mon Sep 17 00:00:00 2001
From: Eric Jiang <erj@meta.com>
Date: Sat, 10 Jun 2023 23:57:31 -0700
Subject: [PATCH 581/638] Include split+cat in fuse_split optimization (#740)

Summary:
This change extends _fuse_split_and_strided_op to also optimize split followed by cat (when both are on the same dim). The split op is removed and the input_accessors of the cat op are updated.

The new `test_fuse_split_cat.py` test case tests for split+cat across dim 0, dim 1, and a third case where split and cat along different dims (a case that we are not optimizing yet).

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/740

Reviewed By: wushirong

Differential Revision: D46505169

Pulled By: chenyang78

fbshipit-source-id: 3aa8f58381c2e20ba0967254222b67dc9a106cf1
---
 .../compiler/transform/fuse_split.py          |  30 +-
 .../unittest/compiler/test_fuse_split_cat.py  | 262 ++++++++++++++++++
 .../compiler/test_transform_memory_ops.py     |   3 +-
 3 files changed, 288 insertions(+), 7 deletions(-)
 create mode 100644 tests/unittest/compiler/test_fuse_split_cat.py

diff --git a/python/aitemplate/compiler/transform/fuse_split.py b/python/aitemplate/compiler/transform/fuse_split.py
index a2daf99ac..91aeac2d8 100644
--- a/python/aitemplate/compiler/transform/fuse_split.py
+++ b/python/aitemplate/compiler/transform/fuse_split.py
@@ -163,7 +163,7 @@ def _check_dim_alignment(shape: List[IntVar], dim_idx: int, dtype: str) -> bool:
     return alignment.valid_alignment(k_dim_val, dtype)
 
 
-def _check_alignment(op: Operator, offset: int):
+def _check_alignment(op: Operator, offset: int, total_elems_from_split_dim: int):
     # ops that support align=1
     if op._attrs["op"] == "bmm_rcr_n1":
         return True
@@ -172,6 +172,10 @@ def _check_alignment(op: Operator, offset: int):
     # ops that don't have valid alignments
     if not alignment.valid_alignment(offset, dtype):
         return False
+    if not alignment.valid_alignment(total_elems_from_split_dim, dtype):
+        return False
+    if op._attrs["op"] == "concatenate":
+        return True
     if op._attrs["op"] == "bmm_rrr_permute":
         a_shape = op._attrs["input_accessors"][0].original_shapes
         b_shape = op._attrs["input_accessors"][1].original_shapes
@@ -242,12 +246,26 @@ def _fuse_split_and_strided_op(sorted_graph: List[Tensor]) -> List[Tensor]:
         # We apply padding to bmm before this fuse_split pass. However, we may
         # still have mis-aligned accesses caused by offsets. This _check_alignment
         # filters out all bad cases.
+        total_elems_from_split_dim = (
+            stride * split_input._attrs["shape"][split_dim].value()
+        )
         for output in outputs:
             can_fuse_split &= len(output.dst_ops()) > 0 and all(
-                _is_supported_op(next_op._attrs["op"])
-                # need to pass the real offset to alignment checker
-                and _check_alignment(next_op, dim_offset * stride)
-                and len(output.dst_ops()) == 1
+                (
+                    _is_supported_op(next_op._attrs["op"])
+                    # need to pass the real offset to alignment checker
+                    and _check_alignment(
+                        next_op, dim_offset * stride, total_elems_from_split_dim
+                    )
+                    and len(output.dst_ops()) == 1
+                )
+                or (
+                    next_op._attrs["op"] == "concatenate"
+                    and next_op._attrs["concat_dim"] == split_dim
+                    and _check_alignment(
+                        next_op, dim_offset * stride, total_elems_from_split_dim
+                    )
+                )
                 for next_op in output.dst_ops()
             )
             for next_op in output.dst_ops():
@@ -263,6 +281,7 @@ def _fuse_split_and_strided_op(sorted_graph: List[Tensor]) -> List[Tensor]:
 
         if not can_fuse_split:
             continue
+
         _LOGGER.debug("Remove split from graph")
         split_input.dst_ops().remove(split_op)
 
@@ -275,7 +294,6 @@ def _fuse_split_and_strided_op(sorted_graph: List[Tensor]) -> List[Tensor]:
                         )
                         # update the graph
                         next_op._attrs["inputs"][idx] = split_input
-                        break
                 split_input.dst_ops().add(next_op)
 
         # remove split op
diff --git a/tests/unittest/compiler/test_fuse_split_cat.py b/tests/unittest/compiler/test_fuse_split_cat.py
new file mode 100644
index 000000000..9c22a4598
--- /dev/null
+++ b/tests/unittest/compiler/test_fuse_split_cat.py
@@ -0,0 +1,262 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+
+from aitemplate.compiler.base import Tensor
+from aitemplate.compiler.ops.common.epilogue import FuncEnum
+from aitemplate.compiler.public import IntImm
+
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op
+
+
+class FuseSplitCatTestCase(unittest.TestCase):
+    def _test_fuse_split_cat_rearrange(self, M, N, split, remove_split=True):
+        dtype = "float16"
+        M = IntImm(M)
+        N = IntImm(N)
+
+        input_1 = Tensor(
+            shape=[M, N],
+            name="input_1",
+            is_input=True,
+        )
+        split_2 = ops.split()(input_1, split, 0)
+        concatenate_3 = ops.concatenate()(split_2[::-1], 0)
+
+        # Set outputs
+        concatenate_3._attrs["name"] = "output_0"
+        concatenate_3._attrs["is_output"] = True
+        # Compile
+        model = compile_model(
+            concatenate_3, detect_target(), "./tmp", self._testMethodName
+        )
+        # Check that split was removed
+        self.assertEqual(
+            graph_has_op(model.debug_sorted_graph, "split"), not remove_split
+        )
+        # Run
+        input_1 = get_random_torch_tensor((M.value(), N.value()), dtype=dtype)
+        # Compare
+        split_pt = torch.split(input_1, split, 0)
+        y_pt = torch.cat(
+            [split_pt[1], split_pt[0]],
+            0,
+        )
+        y_ait = torch.empty_like(y_pt)
+        model.run_with_tensors(
+            {"input_1": input_1},
+            [y_ait],
+        )
+        torch.testing.assert_close(y_ait, y_pt, atol=0, rtol=0)
+
+    def test_fuse_split_cat_even(self):
+        self._test_fuse_split_cat_rearrange(
+            512, 512, split=[256, 256], remove_split=True
+        )
+
+    def test_fuse_split_cat_odd(self):
+        self._test_fuse_split_cat_rearrange(
+            512, 512, split=[139, 373], remove_split=True
+        )
+
+    def test_fuse_split_cat_reuse(self):
+        """Use a split output twice in the concatenate op."""
+        dtype = "float16"
+        M = IntImm(512)
+        N = IntImm(512)
+
+        input_1 = Tensor(
+            shape=[M, N],
+            name="input_1",
+            is_input=True,
+        )
+        split_2 = ops.split()(input_1, int(M.value() / 2), 0)
+        concatenate_3 = ops.concatenate()([split_2[1], split_2[0], split_2[1]], 0)
+
+        # Set outputs
+        concatenate_3._attrs["name"] = "output_0"
+        concatenate_3._attrs["is_output"] = True
+        # Compile
+        model = compile_model(
+            concatenate_3, detect_target(), "./tmp", self._testMethodName
+        )
+        # Check that split was removed
+        self.assertFalse(graph_has_op(model.debug_sorted_graph, "split"))
+        # Run
+        input_1 = get_random_torch_tensor((M.value(), N.value()), dtype=dtype)
+        # Compare
+        split_pt = torch.split(input_1, int(M.value() / 2), 0)
+        y_pt = torch.cat(
+            [split_pt[1], split_pt[0], split_pt[1]],
+            0,
+        )
+        y_ait = torch.empty_like(y_pt)
+        model.run_with_tensors(
+            {"input_1": input_1},
+            [y_ait],
+        )
+        torch.testing.assert_close(y_ait, y_pt, atol=0, rtol=0)
+
+    def test_fuse_split_cat_dim1(self):
+        dtype = "float16"
+        M = IntImm(512)
+        N = IntImm(512)
+
+        input_1 = Tensor(
+            shape=[M, N],
+            name="input_1",
+            is_input=True,
+        )
+        split_2 = ops.split()(input_1, int(N.value() / 2), 1)
+        concatenate_3 = ops.concatenate()(split_2[::-1], 1)
+
+        # Set outputs
+        concatenate_3._attrs["name"] = "output_0"
+        concatenate_3._attrs["is_output"] = True
+        # Compile
+        model = compile_model(
+            concatenate_3, detect_target(), "./tmp", self._testMethodName
+        )
+        # Check that split was removed
+        self.assertFalse(graph_has_op(model.debug_sorted_graph, "split"))
+        # Run
+        input_1 = get_random_torch_tensor((M.value(), N.value()), dtype=dtype)
+        # Compare
+        split_pt = torch.split(input_1, int(N.value() / 2), 1)
+        y_pt = torch.cat(
+            split_pt[::-1],
+            1,
+        )
+        y_ait = torch.empty_like(y_pt)
+        model.run_with_tensors(
+            {"input_1": input_1},
+            [y_ait],
+        )
+        torch.testing.assert_close(y_ait, y_pt, atol=0, rtol=0)
+
+    def test_fuse_split_cat_different_dims(self):
+        """Splitting and then concatting on different dims is not
+        expected to be optimized currently."""
+        dtype = "float16"
+        M = IntImm(512)
+        N = IntImm(512)
+
+        input_1 = Tensor(
+            shape=[M, N],
+            name="input_1",
+            is_input=True,
+        )
+        split_2 = ops.split()(input_1, int(M.value() / 2), 0)
+        concatenate_3 = ops.concatenate()(split_2[::-1], 1)
+
+        # Set outputs
+        concatenate_3._attrs["name"] = "output_0"
+        concatenate_3._attrs["is_output"] = True
+        # Compile
+        model = compile_model(
+            concatenate_3, detect_target(), "./tmp", self._testMethodName
+        )
+        # Check that split was not removed because the dims are different
+        self.assertTrue(graph_has_op(model.debug_sorted_graph, "split"))
+        # Run
+        input_1 = get_random_torch_tensor((M.value(), N.value()), dtype=dtype)
+        # Compare
+        split_pt = torch.split(input_1, int(M.value() / 2), 0)
+        y_pt = torch.cat(
+            split_pt[::-1],
+            1,
+        )
+        y_ait = torch.empty_like(y_pt)
+        model.run_with_tensors(
+            {"input_1": input_1},
+            [y_ait],
+        )
+        torch.testing.assert_close(y_ait, y_pt, atol=0, rtol=0)
+
+    def test_fuse_split_cat_bmm(self):
+        """Optimize out a split op whose output is used by both concat and bmm."""
+        dtype = "float16"
+        B = 1
+        M = 128
+        N = 512
+        K = 512
+        split_size_or_sections = 256
+        split_dim = 2
+        T_A = Tensor(
+            # feed the second half of T_A into additional concat so that the split
+            # output is used by both bmm and concat
+            shape=[B, M, K * 2],
+            dtype=dtype,
+            name="input0",
+            is_input=True,
+        )
+        T_B = Tensor(
+            shape=[B, N, K],
+            dtype=dtype,
+            name="input1",
+            is_input=True,
+        )
+
+        Xs = ops.split()(T_A, split_size_or_sections, split_dim)
+        Ys = ops.split()(T_B, split_size_or_sections, split_dim)
+        assert len(Xs) // 2 == len(Ys)
+
+        n = 2
+        Cs = []
+        for i in range(n):
+            X = Xs[i]
+            Y = Ys[i]
+            C = ops.bmm_rcr()(X, Y)
+            Cs.append(C)
+        # do an extra concatenate so that split_1 has different output ops
+        extra_concat = ops.concatenate()([Xs[3], Xs[2], Xs[3], Xs[2]], dim=split_dim)
+        bmm_cat = ops.concatenate()(Cs, dim=split_dim)
+        Y = ops.elementwise(FuncEnum.ADD)(extra_concat, bmm_cat)
+        Y._attrs["name"] = "output"
+        Y._attrs["is_output"] = True
+
+        a = get_random_torch_tensor([B, M, K * 2], dtype)
+        b = get_random_torch_tensor([B, N, K], dtype)
+        xs = a.split(split_size_or_sections, split_dim)
+        ys = b.split(split_size_or_sections, split_dim)
+        cs = []
+        for i in range(n):
+            x = xs[i]
+            y = ys[i]
+            c = torch.bmm(x, y.permute(0, 2, 1))
+            cs.append(c)
+        extra_concat_pt = torch.cat([xs[3], xs[2], xs[3], xs[2]], dim=split_dim)
+        bmm_cat_pt = torch.cat(cs, dim=split_dim)
+        y_pt = torch.add(extra_concat_pt, bmm_cat_pt)
+
+        # Gen module.
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", self._testMethodName)
+        # Both splits should be removed, including the split that is used by
+        # both bmm and concat
+        self.assertFalse(graph_has_op(model.debug_sorted_graph, "split"))
+        self.assertEqual(len(model.debug_sorted_graph), 5)
+        y = torch.empty_like(y_pt)
+        model.run_with_tensors({"input0": a, "input1": b}, [y])
+        self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()
diff --git a/tests/unittest/compiler/test_transform_memory_ops.py b/tests/unittest/compiler/test_transform_memory_ops.py
index 2212cf1e0..e96718d74 100644
--- a/tests/unittest/compiler/test_transform_memory_ops.py
+++ b/tests/unittest/compiler/test_transform_memory_ops.py
@@ -1066,7 +1066,8 @@ def _test_non_fusible_split_reshape_cat(self, M, test_name, dtype="float16"):
         module = compile_model(Y, target, "./tmp", test_name)
         sorted_graph = module.debug_sorted_graph
         sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
-        self.assertEqual(len(sorted_ops), 3)
+        # fuse_split allows optimizing split+cat to just split, leaving 2 ops
+        self.assertEqual(len(sorted_ops), 2)
 
         for batch in [1, self.BATCH_SIZE]:
             x0_pt = get_random_torch_tensor([batch, M], dtype)

From 1c32db2d153301aed0d52c705f16cbad5ed8f7c1 Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Sun, 11 Jun 2023 23:49:42 -0700
Subject: [PATCH 582/638] frontend.nn.attention dtype (#759)

Summary:
Adds dtype parameter to `CrossAttention`  and `MultiheadAttention` of `frontend.nn.attention`

I required this change while experimenting with `stabilityai/stable-diffusion-x4-upscaler`, the VAE for this model requires float32.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/759

Reviewed By: houseroad

Differential Revision: D46625654

Pulled By: hl475

fbshipit-source-id: a7f7d6a21d5d0b6ce6b5fed0b76b2c3e3f8259b7
---
 python/aitemplate/frontend/nn/attention.py | 27 +++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/aitemplate/frontend/nn/attention.py b/python/aitemplate/frontend/nn/attention.py
index a1a7075b8..1f1240762 100644
--- a/python/aitemplate/frontend/nn/attention.py
+++ b/python/aitemplate/frontend/nn/attention.py
@@ -105,6 +105,7 @@ def __init__(
         causal=False,
         mask_seq=0,
         use_mem_eff=False,
+        dtype="float16",
     ):
         super().__init__()
         assert (
@@ -146,7 +147,7 @@ def __init__(
         self.cu_length = Parameter(shape=[batch_size + 1], dtype="int32")
         if self.mask_seq:
             self.output_mask = Parameter(
-                shape=[mask_seq, num_heads, head_dim], dtype="float16"
+                shape=[mask_seq, num_heads, head_dim], dtype=dtype
             )
 
         if self.USE_CUDA:
@@ -155,13 +156,14 @@ def __init__(
             # input: (B, S, H)
             # output: (B*S, 3, num_heads, head_dim)
             if self.use_flash:
-                self.qkv = Linear(dim, dim * 3, bias=qkv_bias)
+                self.qkv = Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype)
             else:
                 self.qkv = Linear(
                     dim,
                     dim * 3,
                     specialization="permute",
                     shape=(seq_len, 3, self.num_heads),
+                    dtype=dtype,
                 )
         else:
             # on ROCM ck attention (bmm_softmax_bmm) takes three inputs (Q, K, V)
@@ -176,11 +178,14 @@ def __init__(
                 specialization="permute",
                 shape=(seq_len, 3, self.num_heads),
                 layout="m2n3",
+                dtype=dtype,
             )
 
-        self.attn_drop = Dropout(attn_drop)
-        self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
-        self.proj_drop = Dropout(proj_drop)
+        self.attn_drop = Dropout(attn_drop, dtype=dtype)
+        self.proj = Linear(
+            dim, dim, specialization="add" if has_residual else None, dtype=dtype
+        )
+        self.proj_drop = Dropout(proj_drop, dtype=dtype)
 
     def get_shape(self, x):
         shape = [it.value() for it in x._attrs["shape"]]
@@ -318,6 +323,7 @@ def __init__(
         proj_drop=0.0,
         has_residual=True,
         causal=False,
+        dtype="float16",
     ):
         super().__init__()
         assert (
@@ -334,21 +340,26 @@ def __init__(
             dim,
             dim,
             bias=qkv_bias,
+            dtype=dtype,
         )
         self.proj_k = Linear(
             dim,
             dim,
             bias=qkv_bias,
+            dtype=dtype,
         )
         self.proj_v = Linear(
             dim,
             dim,
             bias=qkv_bias,
+            dtype=dtype,
         )
 
-        self.attn_drop = Dropout(attn_drop)
-        self.proj = Linear(dim, dim, specialization="add" if has_residual else None)
-        self.proj_drop = Dropout(proj_drop)
+        self.attn_drop = Dropout(attn_drop, dtype=dtype)
+        self.proj = Linear(
+            dim, dim, specialization="add" if has_residual else None, dtype=dtype
+        )
+        self.proj_drop = Dropout(proj_drop, dtype=dtype)
 
     def attention(self, q, k, v):
         batch = q.shape()[0]

From 5d5f5f3704e465b0575452974e255487bdc37341 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Mon, 12 Jun 2023 00:18:16 -0700
Subject: [PATCH 583/638] Increase recursion limit in dump_program using try
 except (#761)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/761

as title. This will help with `RecursionError`

Reviewed By: chenyang78

Differential Revision: D46583683

fbshipit-source-id: 25b00a05f9cccf4bfca013b956363fcb6bbe729f
---
 .../aitemplate/utils/serialization/serdes_code.py  | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/utils/serialization/serdes_code.py b/python/aitemplate/utils/serialization/serdes_code.py
index 261001507..e2c1329bc 100644
--- a/python/aitemplate/utils/serialization/serdes_code.py
+++ b/python/aitemplate/utils/serialization/serdes_code.py
@@ -16,7 +16,9 @@
 Dump/Read sorted_graph to/from python code.
 """
 import copy
+import logging
 import os
+import sys
 
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -26,6 +28,7 @@
 
 from aitemplate.compiler.transform import mark_param_tensor, name_graph, toposort
 
+_LOGGER = logging.getLogger(__name__)
 PROGRAM_TEMPLATE = jinja2.Template(
     """import numpy as np
 
@@ -307,7 +310,16 @@ def dump_program(
     """
     if isinstance(sorted_graph, Tensor):
         sorted_graph = [sorted_graph]
-    sorted_graph = copy.deepcopy(sorted_graph)
+    try:
+        sorted_graph = copy.deepcopy(sorted_graph)
+    except RecursionError:
+        default = sys.getrecursionlimit()
+        new_recursion_limit = default * 10
+        _LOGGER.info(
+            f"Recursion error when copying graph with default recursion limit {default}. Will try again with {new_recursion_limit}"
+        )
+        sys.setrecursionlimit(new_recursion_limit)
+        sorted_graph = copy.deepcopy(sorted_graph)
 
     # Make sure the graph is in correct order and has names and param set correctly.
     sorted_graph = toposort(sorted_graph)

From 3fe7be355f8c6f55315359bd9c851aa24d9d4a63 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005367269
 <generatedunixname89002005367269@meta.com>
Date: Mon, 12 Jun 2023 07:53:19 -0700
Subject: [PATCH 584/638] Daily `arc lint --take BLACK`

Reviewed By: adamjernst

Differential Revision: D46640701

fbshipit-source-id: cca6532a9c6253ffdd23c7e4068c387eb12d5eab
---
 python/aitemplate/frontend/nn/vision_transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/frontend/nn/vision_transformers.py b/python/aitemplate/frontend/nn/vision_transformers.py
index 32b262302..46cd3b90e 100644
--- a/python/aitemplate/frontend/nn/vision_transformers.py
+++ b/python/aitemplate/frontend/nn/vision_transformers.py
@@ -18,7 +18,6 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
-from pytorchvideo.layers.utils import round_width
 
 from aitemplate.frontend import Tensor
 from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
@@ -36,6 +35,7 @@
 from aitemplate.frontend.nn.positional_encoding import (
     SpatioTemporalClsPositionalEncoding,
 )
+from pytorchvideo.layers.utils import round_width
 
 
 class MultiscaleVisionTransformers(Module):

From 82accdded81606b315cb900f23ce45b4912d1328 Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Mon, 12 Jun 2023 15:26:02 -0700
Subject: [PATCH 585/638] Map VAE params without AIT_AutoencoderKL (#760)

Summary:
Mapping VAE params without AIT_AutoencoderKL, this removes the dependency on AITemplate package for loading weights to modules compiled without included constants, which was needed to improve user experience of [this plugin/demo repo](https://github.com/hlky/AIT)

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/760

Reviewed By: houseroad

Differential Revision: D46650051

Pulled By: hl475

fbshipit-source-id: a75a0bc0499780feb264e7391788624df639eb22
---
 .../src/compile_lib/compile_vae.py            | 146 +++++++++------
 .../src/compile_lib/compile_vae_alt.py        | 176 +++++++++---------
 .../src/pipeline_stable_diffusion_ait_alt.py  |  41 +---
 ...ipeline_stable_diffusion_controlnet_ait.py |  41 +---
 4 files changed, 185 insertions(+), 219 deletions(-)

diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
index 4ce93e709..a752849cd 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae.py
@@ -22,62 +22,98 @@
 from .util import mark_output
 
 
-def map_vae_params(ait_module, pt_module):
-    pt_params = dict(pt_module.named_parameters())
-    mapped_pt_params = {}
-    for name, _ in ait_module.named_parameters():
-        ait_name = name.replace(".", "_")
-        if name in pt_params:
-            if (
-                "conv" in name
-                and "norm" not in name
-                and name.endswith(".weight")
-                and len(pt_params[name].shape) == 4
-            ):
-                mapped_pt_params[ait_name] = torch.permute(
-                    pt_params[name], [0, 2, 3, 1]
-                ).contiguous()
-            else:
-                mapped_pt_params[ait_name] = pt_params[name]
-        elif name.endswith("attention.proj.weight"):
-            prefix = name[: -len("attention.proj.weight")]
-            pt_name = prefix + "proj_attn.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj.bias"):
-            prefix = name[: -len("attention.proj.bias")]
-            pt_name = prefix + "proj_attn.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.cu_length"):
-            ...
-        elif name.endswith("attention.proj_q.weight"):
-            prefix = name[: -len("attention.proj_q.weight")]
-            pt_name = prefix + "query.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_q.bias"):
-            prefix = name[: -len("attention.proj_q.bias")]
-            pt_name = prefix + "query.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.weight"):
-            prefix = name[: -len("attention.proj_k.weight")]
-            pt_name = prefix + "key.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.bias"):
-            prefix = name[: -len("attention.proj_k.bias")]
-            pt_name = prefix + "key.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.weight"):
-            prefix = name[: -len("attention.proj_v.weight")]
-            pt_name = prefix + "value.weight"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.bias"):
-            prefix = name[: -len("attention.proj_v.bias")]
-            pt_name = prefix + "value.bias"
-            mapped_pt_params[ait_name] = pt_params[pt_name]
+def torch_dtype_from_str(dtype: str):
+    return torch.__dict__.get(dtype, None)
+
+
+def map_vae(pt_module, device="cuda", dtype="float16"):
+    if not isinstance(pt_module, dict):
+        pt_params = dict(pt_module.named_parameters())
+    else:
+        pt_params = pt_module
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if key.startswith("encoder"):
+            continue
+        if key.startswith("quant"):
+            continue
+        arr = arr.to(device, dtype=torch_dtype_from_str(dtype))
+        key = key.replace(".", "_")
+        if (
+            "conv" in key
+            and "norm" not in key
+            and key.endswith("_weight")
+            and len(arr.shape) == 4
+        ):
+            params_ait[key] = torch.permute(arr, [0, 2, 3, 1]).contiguous()
+        elif key.endswith("proj_attn_weight"):
+            prefix = key[: -len("proj_attn_weight")]
+            key = prefix + "attention_proj_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_out_0_weight"):
+            prefix = key[: -len("to_out_0_weight")]
+            key = prefix + "attention_proj_weight"
+            params_ait[key] = arr
+        elif key.endswith("proj_attn_bias"):
+            prefix = key[: -len("proj_attn_bias")]
+            key = prefix + "attention_proj_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_out_0_bias"):
+            prefix = key[: -len("to_out_0_bias")]
+            key = prefix + "attention_proj_bias"
+            params_ait[key] = arr
+        elif key.endswith("query_weight"):
+            prefix = key[: -len("query_weight")]
+            key = prefix + "attention_proj_q_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_q_weight"):
+            prefix = key[: -len("to_q_weight")]
+            key = prefix + "attention_proj_q_weight"
+            params_ait[key] = arr
+        elif key.endswith("query_bias"):
+            prefix = key[: -len("query_bias")]
+            key = prefix + "attention_proj_q_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_q_bias"):
+            prefix = key[: -len("to_q_bias")]
+            key = prefix + "attention_proj_q_bias"
+            params_ait[key] = arr
+        elif key.endswith("key_weight"):
+            prefix = key[: -len("key_weight")]
+            key = prefix + "attention_proj_k_weight"
+            params_ait[key] = arr
+        elif key.endswith("key_bias"):
+            prefix = key[: -len("key_bias")]
+            key = prefix + "attention_proj_k_bias"
+            params_ait[key] = arr
+        elif key.endswith("value_weight"):
+            prefix = key[: -len("value_weight")]
+            key = prefix + "attention_proj_v_weight"
+            params_ait[key] = arr
+        elif key.endswith("value_bias"):
+            prefix = key[: -len("value_bias")]
+            key = prefix + "attention_proj_v_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_k_weight"):
+            prefix = key[: -len("to_k_weight")]
+            key = prefix + "attention_proj_k_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_v_weight"):
+            prefix = key[: -len("to_v_weight")]
+            key = prefix + "attention_proj_v_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_k_bias"):
+            prefix = key[: -len("to_k_bias")]
+            key = prefix + "attention_proj_k_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_v_bias"):
+            prefix = key[: -len("to_v_bias")]
+            key = prefix + "attention_proj_v_bias"
+            params_ait[key] = arr
         else:
-            pt_param = pt_module.get_parameter(name)
-            mapped_pt_params[ait_name] = pt_param
+            params_ait[key] = arr
 
-    return mapped_pt_params
+    return params_ait
 
 
 def compile_vae(
@@ -132,7 +168,7 @@ def compile_vae(
     ait_vae.name_parameter_tensor()
 
     pt_mod = pt_mod.eval()
-    params_ait = map_vae_params(ait_vae, pt_mod)
+    params_ait = map_vae(pt_mod)
 
     Y = ait_vae.decode(ait_input)
     mark_output(Y)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
index 1b7dd81d9..ffaeb75c6 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
@@ -22,98 +22,98 @@
 from .util import mark_output
 
 
-def map_vae_params(ait_module, pt_module, batch_size=1, seq_len=4096):
+def torch_dtype_from_str(dtype: str):
+    return torch.__dict__.get(dtype, None)
+
+
+def map_vae(pt_module, device="cuda", dtype="float16"):
     if not isinstance(pt_module, dict):
         pt_params = dict(pt_module.named_parameters())
     else:
         pt_params = pt_module
-    mapped_pt_params = {}
-    for name, _ in ait_module.named_parameters():
-        ait_name = name.replace(".", "_")
-        if name in pt_params:
-            if (
-                "conv" in name
-                and "norm" not in name
-                and name.endswith(".weight")
-                and len(pt_params[name].shape) == 4
-            ):
-                mapped_pt_params[ait_name] = torch.permute(
-                    pt_params[name], [0, 2, 3, 1]
-                ).contiguous()
-            else:
-                mapped_pt_params[ait_name] = pt_params[name]
-        elif name.endswith("attention.proj.weight"):
-            prefix = name[: -len("attention.proj.weight")]
-            pt_name = prefix + "proj_attn.weight"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_out.0.weight"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj.bias"):
-            prefix = name[: -len("attention.proj.bias")]
-            pt_name = prefix + "proj_attn.bias"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_out.0.bias"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.cu_length"):
-            ...
-        elif name.endswith("attention.proj_q.weight"):
-            prefix = name[: -len("attention.proj_q.weight")]
-            pt_name = prefix + "query.weight"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_q.weight"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_q.bias"):
-            prefix = name[: -len("attention.proj_q.bias")]
-            pt_name = prefix + "query.bias"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_q.bias"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.weight"):
-            prefix = name[: -len("attention.proj_k.weight")]
-            pt_name = prefix + "key.weight"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_k.weight"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_k.bias"):
-            prefix = name[: -len("attention.proj_k.bias")]
-            pt_name = prefix + "key.bias"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_k.bias"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.weight"):
-            prefix = name[: -len("attention.proj_v.weight")]
-            pt_name = prefix + "value.weight"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_v.weight"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-        elif name.endswith("attention.proj_v.bias"):
-            prefix = name[: -len("attention.proj_v.bias")]
-            pt_name = prefix + "value.bias"
-            if pt_name in pt_params:
-                mapped_pt_params[ait_name] = pt_params[pt_name]
-            else:
-                pt_name = prefix + "to_v.bias"
-                mapped_pt_params[ait_name] = pt_params[pt_name]
+    params_ait = {}
+    for key, arr in pt_params.items():
+        if key.startswith("encoder"):
+            continue
+        if key.startswith("quant"):
+            continue
+        arr = arr.to(device, dtype=torch_dtype_from_str(dtype))
+        key = key.replace(".", "_")
+        if (
+            "conv" in key
+            and "norm" not in key
+            and key.endswith("_weight")
+            and len(arr.shape) == 4
+        ):
+            params_ait[key] = torch.permute(arr, [0, 2, 3, 1]).contiguous()
+        elif key.endswith("proj_attn_weight"):
+            prefix = key[: -len("proj_attn_weight")]
+            key = prefix + "attention_proj_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_out_0_weight"):
+            prefix = key[: -len("to_out_0_weight")]
+            key = prefix + "attention_proj_weight"
+            params_ait[key] = arr
+        elif key.endswith("proj_attn_bias"):
+            prefix = key[: -len("proj_attn_bias")]
+            key = prefix + "attention_proj_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_out_0_bias"):
+            prefix = key[: -len("to_out_0_bias")]
+            key = prefix + "attention_proj_bias"
+            params_ait[key] = arr
+        elif key.endswith("query_weight"):
+            prefix = key[: -len("query_weight")]
+            key = prefix + "attention_proj_q_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_q_weight"):
+            prefix = key[: -len("to_q_weight")]
+            key = prefix + "attention_proj_q_weight"
+            params_ait[key] = arr
+        elif key.endswith("query_bias"):
+            prefix = key[: -len("query_bias")]
+            key = prefix + "attention_proj_q_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_q_bias"):
+            prefix = key[: -len("to_q_bias")]
+            key = prefix + "attention_proj_q_bias"
+            params_ait[key] = arr
+        elif key.endswith("key_weight"):
+            prefix = key[: -len("key_weight")]
+            key = prefix + "attention_proj_k_weight"
+            params_ait[key] = arr
+        elif key.endswith("key_bias"):
+            prefix = key[: -len("key_bias")]
+            key = prefix + "attention_proj_k_bias"
+            params_ait[key] = arr
+        elif key.endswith("value_weight"):
+            prefix = key[: -len("value_weight")]
+            key = prefix + "attention_proj_v_weight"
+            params_ait[key] = arr
+        elif key.endswith("value_bias"):
+            prefix = key[: -len("value_bias")]
+            key = prefix + "attention_proj_v_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_k_weight"):
+            prefix = key[: -len("to_k_weight")]
+            key = prefix + "attention_proj_k_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_v_weight"):
+            prefix = key[: -len("to_v_weight")]
+            key = prefix + "attention_proj_v_weight"
+            params_ait[key] = arr
+        elif key.endswith("to_k_bias"):
+            prefix = key[: -len("to_k_bias")]
+            key = prefix + "attention_proj_k_bias"
+            params_ait[key] = arr
+        elif key.endswith("to_v_bias"):
+            prefix = key[: -len("to_v_bias")]
+            key = prefix + "attention_proj_v_bias"
+            params_ait[key] = arr
         else:
-            pt_param = pt_module.get_parameter(name)
-            mapped_pt_params[ait_name] = pt_param
-    for key, arr in mapped_pt_params.items():
-        mapped_pt_params[key] = arr.to("cuda", dtype=torch.float16)
-    return mapped_pt_params
+            params_ait[key] = arr
+
+    return params_ait
 
 
 def compile_vae(
@@ -175,7 +175,7 @@ def compile_vae(
     ait_vae.name_parameter_tensor()
 
     pt_mod = pt_mod.eval()
-    params_ait = map_vae_params(ait_vae, pt_mod)
+    params_ait = map_vae(pt_mod)
 
     Y = ait_vae.decode(ait_input)
     mark_output(Y)
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
index 78c9911f6..419184628 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_ait_alt.py
@@ -29,8 +29,7 @@
 
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
-from .compile_lib.compile_vae_alt import map_vae_params
-from .modeling.vae import AutoencoderKL as ait_AutoencoderKL
+from .compile_lib.compile_vae_alt import map_vae
 
 
 def shave_segments(path, n_shave_prefix_segments=1):
@@ -754,49 +753,15 @@ def __init__(self, hf_hub_or_path, ckpt):
             ).cuda()
         else:
             self.vae_pt = dict(vae_state_dict)
-        in_channels = 3
-        out_channels = 3
-        down_block_types = [
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-        ]
-        up_block_types = [
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-        ]
-        block_out_channels = [128, 256, 512, 512]
-        layers_per_block = 2
-        act_fn = "silu"
-        latent_channels = 4
-        sample_size = 512
-
-        ait_vae = ait_AutoencoderKL(
-            1,
-            64,
-            64,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            down_block_types=down_block_types,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            latent_channels=latent_channels,
-            sample_size=sample_size,
-        )
+
         print("Mapping parameters...")
-        vae_params_ait = map_vae_params(ait_vae, self.vae_pt)
+        vae_params_ait = map_vae(self.vae_pt)
         print("Setting constants")
         self.vae_ait_exe.set_many_constants_with_tensors(vae_params_ait)
         print("Folding constants")
         self.vae_ait_exe.fold_constants()
         # cleanup
         self.vae_pt = None
-        ait_vae = None
         vae_params_ait = None
 
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py
index 838725af2..8c2230368 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_controlnet_ait.py
@@ -29,8 +29,7 @@
 from tqdm import tqdm
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
-from .compile_lib.compile_vae_alt import map_vae_params
-from .modeling.vae import AutoencoderKL as ait_AutoencoderKL
+from .compile_lib.compile_vae_alt import map_vae
 
 
 def shave_segments(path, n_shave_prefix_segments=1):
@@ -720,49 +719,15 @@ def __init__(self, hf_hub_or_path, ckpt):
             ).cuda()
         else:
             self.vae_pt = dict(vae_state_dict)
-        in_channels = 3
-        out_channels = 3
-        down_block_types = [
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-            "DownEncoderBlock2D",
-        ]
-        up_block_types = [
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-            "UpDecoderBlock2D",
-        ]
-        block_out_channels = [128, 256, 512, 512]
-        layers_per_block = 2
-        act_fn = "silu"
-        latent_channels = 4
-        sample_size = 512
-
-        ait_vae = ait_AutoencoderKL(
-            1,
-            64,
-            64,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            down_block_types=down_block_types,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            latent_channels=latent_channels,
-            sample_size=sample_size,
-        )
+
         print("Mapping parameters...")
-        vae_params_ait = map_vae_params(ait_vae, self.vae_pt)
+        vae_params_ait = map_vae(self.vae_pt)
         print("Setting constants")
         self.vae_ait_exe.set_many_constants_with_tensors(vae_params_ait)
         print("Folding constants")
         self.vae_ait_exe.fold_constants()
         # cleanup
         self.vae_pt = None
-        ait_vae = None
         vae_params_ait = None
 
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

From cbb8be86261347be2e8f99e036abb7d5b86c0a2e Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 13 Jun 2023 10:48:10 +0800
Subject: [PATCH 586/638] small updates

---
 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py | 6 +++---
 python/aitemplate/compiler/ops/groupnorm/groupnorm.py      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
index 218a6e6b5..fdfab8630 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
@@ -54,10 +54,10 @@
     };
     template <>
     __host__ __device__ constexpr void
-    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    operator()<ck::half_t>(ck::half_t& y, const ck::half_t& x0, const ck::half_t& x1) const
     {
-        const half_t a = x0 + x1;
-        y              = a / (type_convert<half_t>(1.0) + type_convert<half_t>(exp(ck::type_convert<float>(-a))));
+        const ck::half_t a = x0 + x1;
+        y                  = a / (ck::type_convert<ck::half_t>(1.0) + ck::type_convert<ck::half_t>(exp(ck::type_convert<float>(-a))));
     };
 };
 } // namespace
diff --git a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
index 667d162df..b93d0f247 100644
--- a/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
+++ b/python/aitemplate/compiler/ops/groupnorm/groupnorm.py
@@ -325,7 +325,7 @@ def _profile_single_workload(self, profiler_prefix, exec_key, devices):
         if len(result) == 0:
             raise RuntimeError(
                 "Profile workload: "
-                f"{self._attrs['op']}"
+                f"{self._attrs['op']} "
                 f"{exec_key}"
                 " failed. "
                 f"Results: {result}."

From b3d6705fdd82a3176ca626e2d7aeda0fa3e3687e Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 13 Jun 2023 11:38:44 +0800
Subject: [PATCH 587/638] format code

---
 python/aitemplate/frontend/nn/vision_transformers.py | 2 +-
 python/setup.py                                      | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/frontend/nn/vision_transformers.py b/python/aitemplate/frontend/nn/vision_transformers.py
index 46cd3b90e..32b262302 100644
--- a/python/aitemplate/frontend/nn/vision_transformers.py
+++ b/python/aitemplate/frontend/nn/vision_transformers.py
@@ -18,6 +18,7 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
+from pytorchvideo.layers.utils import round_width
 
 from aitemplate.frontend import Tensor
 from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
@@ -35,7 +36,6 @@
 from aitemplate.frontend.nn.positional_encoding import (
     SpatioTemporalClsPositionalEncoding,
 )
-from pytorchvideo.layers.utils import round_width
 
 
 class MultiscaleVisionTransformers(Module):
diff --git a/python/setup.py b/python/setup.py
index c3fad6231..dcf764c35 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -136,7 +136,11 @@ def gen_utils_file_list():
 
 def gen_backend_common_file_list():
     srcs = ["aitemplate/backend"]
-    f_cond = lambda x: True if x.endswith(".py") or x.endswith(".cuh") or x.endswith(".h") else False
+    f_cond = (
+        lambda x: True
+        if x.endswith(".py") or x.endswith(".cuh") or x.endswith(".h")
+        else False
+    )
     return gen_file_list(srcs, f_cond)
 
 
From f91c59cb10d819bc88bc3f305ba1d645218aa798 Mon Sep 17 00:00:00 2001
From: Huamin Li <huaminli@meta.com>
Date: Mon, 12 Jun 2023 21:13:43 -0700
Subject: [PATCH 588/638] Add # usort:skip to make both internal and OSS lint
 happy (#763)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/763

as title

Reviewed By: houseroad

Differential Revision: D46659850

fbshipit-source-id: ee842bada096461b39ffb4cbd695613510ab6d52
---
 python/aitemplate/frontend/nn/vision_transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/frontend/nn/vision_transformers.py b/python/aitemplate/frontend/nn/vision_transformers.py
index 46cd3b90e..46dbf0b8a 100644
--- a/python/aitemplate/frontend/nn/vision_transformers.py
+++ b/python/aitemplate/frontend/nn/vision_transformers.py
@@ -18,6 +18,7 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
+from pytorchvideo.layers.utils import round_width  # usort:skip
 
 from aitemplate.frontend import Tensor
 from aitemplate.frontend.nn.batch_norm import BatchNorm1d, BatchNorm3d
@@ -35,7 +36,6 @@
 from aitemplate.frontend.nn.positional_encoding import (
     SpatioTemporalClsPositionalEncoding,
 )
-from pytorchvideo.layers.utils import round_width
 
 
 class MultiscaleVisionTransformers(Module):

From 4d00081245fb9f7395c7a607299cdceb215a14d3 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Tue, 13 Jun 2023 08:27:38 -0700
Subject: [PATCH 589/638] Fix SD image quality on some GPUs (#765)

Summary:
compile_clip.py fix.

Recently `batch_size` in `input_ids_ait` and `position_ids_ait` Tensors descriptions [was changed ](https://github.com/facebookincubator/AITemplate/commit/e23d04f16d99a82ddb2cbcac53695414663ea823)from hardcoded dynamic IntVar(1,8) to static user provided batch_size.

It worked fine on A100 and A10G GPUs but generated bad image on T4.

Looks like we need to return `batch_size` back to dynamic IntVar (1,8).

Additional improvements:
- Instead of using hardcoded batch_size upper bound  we can use user provided batch size or 8 depending on what is bigger.
- Demo script will save all generated images, not just image 0.

### Testing
Tested compile/demo workflow on T4 with  batch-size 1, 8, 12 - all images look good

Related Issue: https://github.com/facebookincubator/AITemplate/issues/758

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/765

Reviewed By: houseroad

Differential Revision: D46674362

Pulled By: hl475

fbshipit-source-id: 1d2daf35cb17019f1e111bb13400b401c6d44f54
---
 examples/05_stable_diffusion/scripts/demo.py               | 6 +++---
 .../05_stable_diffusion/src/compile_lib/compile_clip.py    | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/demo.py b/examples/05_stable_diffusion/scripts/demo.py
index 9ae7db46a..8cddeb7c2 100644
--- a/examples/05_stable_diffusion/scripts/demo.py
+++ b/examples/05_stable_diffusion/scripts/demo.py
@@ -52,14 +52,14 @@ def run(local_dir, width, height, batch, prompt, negative_prompt, benchmark):
 
     prompt = [prompt] * batch
     with torch.autocast("cuda"):
-        image = pipe(prompt, height, width).images[0]
+        images = pipe(prompt, height, width).images
         if benchmark:
             t = benchmark_torch_function(10, pipe, prompt, height=height, width=width)
             print(
                 f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
             )
-
-    image.save("example_ait.png")
+    for i, image in enumerate(images):
+        image.save(f"example_ait_{i}.png")
 
 
 if __name__ == "__main__":
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
index 5c8a72150..e6a6ea52d 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip.py
@@ -14,7 +14,7 @@
 #
 
 from aitemplate.compiler import compile_model
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
@@ -70,12 +70,13 @@ def compile_clip(
 
     pt_mod = pt_mod.eval()
     params_ait = map_clip_params(pt_mod)
+    batch_size_d = IntVar(values=[1, max(8, batch_size)], name="batch_size")
 
     input_ids_ait = Tensor(
-        [batch_size, seqlen], name="input0", dtype="int64", is_input=True
+        [batch_size_d, seqlen], name="input0", dtype="int64", is_input=True
     )
     position_ids_ait = Tensor(
-        [batch_size, seqlen], name="input1", dtype="int64", is_input=True
+        [batch_size_d, seqlen], name="input1", dtype="int64", is_input=True
     )
     Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait)
     mark_output(Y)

From 8992f3f2829a0f81a117b1f97aad5e369606a1d7 Mon Sep 17 00:00:00 2001
From: Henry Hu <hhh@meta.com>
Date: Tue, 13 Jun 2023 10:59:06 -0700
Subject: [PATCH 590/638] Add support to List[List[Tensor]] Input shape (#756)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/756

Currently fx2ait can only handle input arg of tensor. Add support for List[Tensor] as each input arg, assuming List is of fixed length.

Reviewed By: mortzur

Differential Revision: D46547246

fbshipit-source-id: 48625b04ad7807a3223e4553cb2e5231c7885baa
---
 fx2ait/fx2ait/ait_module.py          | 43 ++++++++++++++++--------
 fx2ait/fx2ait/fx2ait.py              | 50 ++++++++++++++++++++++------
 fx2ait/fx2ait/test/test_fx2ait.py    | 43 ++++++++++++++++++++++++
 fx2ait/fx2ait/tools/common_fx2ait.py | 23 +++++++------
 4 files changed, 126 insertions(+), 33 deletions(-)

diff --git a/fx2ait/fx2ait/ait_module.py b/fx2ait/fx2ait/ait_module.py
index 53e3a84c5..9f83fe937 100644
--- a/fx2ait/fx2ait/ait_module.py
+++ b/fx2ait/fx2ait/ait_module.py
@@ -16,6 +16,8 @@
 
 import torch
 
+ARG_SPLITTER_KEYWORD = "a1T_ARg_SpliTTERKeyword"
+
 
 class AITModule(torch.nn.Module):
     def __init__(
@@ -25,24 +27,39 @@ def __init__(
     ):
         super(AITModule, self).__init__()
         self.engine = engine
+
         self.interp_result = interp_result
+        self.ait_arg_names = interp_result.input_names if interp_result else None
+        self.fx_arg_names = interp_result.fx_input_names if interp_result else None
 
-    def forward(self, *inputs, **kwargs):
-        python_inputs = []
+    def forward(self, *args, **kwargs):
+        ait_args = []
         if self.interp_result:
-            inputs = list(inputs)
-            for name, inp in zip(self.interp_result.fx_input_names, inputs):
-                if name in self.interp_result.input_names:
-                    python_inputs.append(inp)
-            for name in self.interp_result.input_names:
-                if name in kwargs:
-                    python_inputs.append(kwargs[name])
-            assert len(python_inputs) == len(self.interp_result.input_names)
+            offset = 0
+            for idx, fx_arg_name in enumerate(self.fx_arg_names):
+                arg_name, *arg_idx = fx_arg_name.split(ARG_SPLITTER_KEYWORD)
+                arg_idx = int(arg_idx[0]) if arg_idx else -1
+                # Offset for List[List[Tensor]]
+                offset += 1 if arg_idx > 0 else 0
+                if fx_arg_name in self.ait_arg_names:
+                    # Locate input from args.
+                    if idx - offset < len(args):
+                        arg_ref = args[idx - offset]
+                    # Locate input from kwargs.
+                    elif arg_name in kwargs:
+                        arg_ref = kwargs[arg_name]
+                    else:
+                        raise RuntimeError(f"Required input {fx_arg_name} not found")
+                    ait_args.append(arg_ref[arg_idx] if arg_idx > -1 else arg_ref)
+
+            assert len(ait_args) == len(self.ait_arg_names)
         else:
-            python_inputs = list(inputs)
-            python_inputs.extend(kwargs.values())
+            # Flatten args and kwargs from List[Tensor or List[Tensor]] to List[Tensor]
+            all_args = list(args) + list(kwargs.values())
+            for arg in all_args:
+                ait_args.extend(arg if isinstance(arg, list) else [arg])
 
-        outputs = self.engine.forward(python_inputs)
+        outputs = self.engine.forward(ait_args)
         if len(outputs) == 1:
             return outputs[0]
         return tuple(outputs)
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index ccf9e1c94..7b4ff3c80 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -18,7 +18,7 @@
 import tempfile
 import warnings
 from datetime import datetime
-from typing import Any, Dict, List, NamedTuple, Optional, Sequence
+from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Union
 
 import fx2ait.cache as cache
 
@@ -26,6 +26,7 @@
 
 # @manual=//aitemplate/AITemplate/python/aitemplate:aitemplate
 from aitemplate.testing import detect_target
+from fx2ait.ait_module import ARG_SPLITTER_KEYWORD
 from .converters.ait_converters import *  # isort:skip # noqa: F401 F403
 from .converters.aten2ait_converters import *  # isort:skip # noqa: F401 F403
 from aitemplate.compiler import compile_model
@@ -54,7 +55,7 @@ class AITInterpreter(torch.fx.Interpreter):
     def __init__(
         self,
         module: torch.fx.GraphModule,
-        input_specs: List[TensorSpec],
+        input_specs: List[Union[TensorSpec, List[TensorSpec]]],
         workdir: str,
         name: str,
         dll_name: str = "test.so",
@@ -269,16 +270,45 @@ def run_node(self, n):
         return super().run_node(n)
 
     def placeholder(self, target, args, kwargs):
-        self._fx_input_names.append(target)
         input_spec = self.input_specs[self.input_specs_iter]
         self.input_specs_iter += 1
-
-        return AITTensor(
-            shape=input_spec.shape,
-            dtype=dtype_to_str(input_spec.dtype),
-            name=target,
-            is_input=True,
-        )
+        if isinstance(input_spec, List):
+            """
+            List[Tensor] inputs are flattened in the compiled AIT engine.
+            Pytorch module original forward:
+                def forward(self, a : Tensor, b: List[Tensor])
+                    mod.forward(a, b)
+
+            Ait compiled engine forward:
+                engine.forward(a, b##ARG_SPLITTER_KEYWORD##0, b##ARG_SPLITTER_KEYWORD##1)
+            AITModule restores calling of the original forward:
+                ait_mod.forward(a, b)
+            """
+            ait_tensors = []
+            for i, inp_spec in enumerate(input_spec):
+                target_name = f"{target}{ARG_SPLITTER_KEYWORD}{i}"
+                self._fx_input_names.append(target_name)
+                ait_tensors.append(
+                    AITTensor(
+                        shape=inp_spec.shape,
+                        dtype=dtype_to_str(inp_spec.dtype),
+                        name=target_name,
+                        is_input=True,
+                    )
+                )
+            return ait_tensors
+        elif isinstance(input_spec, TensorSpec) or isinstance(input_spec, torch.Tensor):
+            self._fx_input_names.append(target)
+            return AITTensor(
+                shape=input_spec.shape,
+                dtype=dtype_to_str(input_spec.dtype),
+                name=target,
+                is_input=True,
+            )
+        else:
+            raise AssertionError(
+                "Input spec must be a Tensor(Spec) or List of Tensor(Spec)."
+            )
 
     def get_attr(self, target, args, kwargs):
         attr_val = getattr_recursive(self.module, target)
diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
index 0154faa8f..0670a8408 100644
--- a/fx2ait/fx2ait/test/test_fx2ait.py
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -118,6 +118,49 @@ def test_fx2ait_module_serialization(self):
     def test_fx2ait_cuda_graph(self):
         self._test_fx2ait_impl(test_cuda_graph=True)
 
+    def test_fx2ait_args(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, a, b, c, d):
+                temp = a + b[0] + b[1] + c[0] + c[1] + d
+                return temp
+
+        mod = TestModule().half().cuda()
+
+        a = torch.randn(5, 3).half().cuda()
+        b = [torch.randn(5, 3).half().cuda(), torch.randn(5, 3).half().cuda()]
+        c = [torch.randn(5, 3).half().cuda(), torch.randn(5, 3).half().cuda()]
+        d = torch.randn(5, 3).half().cuda()
+        ref_output = mod(a, b, c, d)
+
+        traced = acc_tracer.trace(mod, [a, b, c, d])
+
+        ait_dump_dir = tempfile.mkdtemp(prefix="test_fx2ait_", dir="/tmp")
+
+        interp = AITInterpreter(traced, [a, b, c, d], ait_dump_dir, "test")
+        interp_result = interp.run()
+        ait_mod = AITModule(
+            AIT_MODEL_CLASS(
+                interp_result.engine.lib_path,
+                interp_result.input_names,
+                interp_result.output_names,
+                torch.float16,
+                torch.float16,
+                1,  # num_runtimes
+            ),
+            interp_result,
+        )
+        ait_output = ait_mod(a, b, c, d)
+        torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
+
+        ait_output = ait_mod(a, b, c, d=d)
+        torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
+
+        ait_output = ait_mod(a, b, c=c, d=d)
+        torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
+
+        ait_output = ait_mod(a, b=b, c=c, d=d)
+        torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index fd0298442..898a14234 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -28,6 +28,7 @@
 from fx2ait.ait_module import AITModule
 from fx2ait.fx2ait import AITInterpreter
 from fx2ait.tensor_spec import TensorSpec
+from torch.fx.node import map_aggregate
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -129,16 +130,18 @@ def run_test(
 
         original_inputs = copy.deepcopy(inputs)
         if permute_inputs:
-            inputs = [inp.permute(*permute_inputs).contiguous() for inp in inputs]
+            inputs = map_aggregate(
+                inputs, lambda inp: inp.permute(*permute_inputs).contiguous()
+            )
 
         torch_dtype = lower_precision_to_torch_type(precision)
         mod.to(torch_dtype)
-        inputs = [
-            inp.to(torch_dtype).contiguous()
+        inputs = map_aggregate(
+            inputs,
+            lambda inp: inp.to(torch_dtype).contiguous()
             if inp.dtype not in (torch.bool, torch.int64)
-            else inp.contiguous()
-            for inp in inputs
-        ]
+            else inp.contiguous(),
+        )
         interp = AITInterpreter(
             mod,
             inputs,
@@ -147,9 +150,7 @@ def run_test(
             use_fp16_acc=use_fp16_acc,
         )
         with torch.no_grad():
-            cuda_inputs = []
-            for i in inputs:
-                cuda_inputs.append(i.cuda())
+            cuda_inputs = map_aggregate(inputs, lambda inp: inp.cuda())
 
             mod.eval()
             if apply_passes_to_lowered_module_only:
@@ -212,7 +213,9 @@ def run_test(
                     ref = torch.tensor([ref])
                 ref = ref.cpu()  # to_dtype test has cases with gpu output
                 if permute_outputs:
-                    out = out.permute(*permute_outputs)
+                    out = map_aggregate(
+                        out, lambda output: output.permute(*permute_outputs)
+                    )
                 torch.testing.assert_close(
                     out.cpu(),
                     ref,

From 1ec9d9a74789a825cb9190b1140867843829d535 Mon Sep 17 00:00:00 2001
From: Janet Yang <qxy11@meta.com>
Date: Wed, 14 Jun 2023 12:19:09 -0700
Subject: [PATCH 591/638] Add ait profile timeout to lower_settings (#764)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/764

Reviewed By: wushirong

Differential Revision: D46658455

fbshipit-source-id: c0103e9c50e239f0e91dc03680dc61cc6ed39ff0
---
 fx2ait/fx2ait/fx2ait.py                         | 4 ++++
 python/aitemplate/compiler/compiler.py          | 7 ++++++-
 python/aitemplate/compiler/transform/profile.py | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index 7b4ff3c80..7dbcd580c 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -69,6 +69,7 @@ def __init__(
         save_remote_cache: Optional[bool] = False,
         do_optimize_graph: bool = True,
         use_fast_math: bool = True,
+        profile_timeout: int = 300,
     ):
         """
         Args:
@@ -87,6 +88,7 @@ def __init__(
             remote_cache_file_path: AITemplate profiling cache location
             save_remote_cache: whether to save the updated cache
             use_fast_math: whether to use fast math in CUDA kernels
+            profile_timeout: timeout in seconds for AIT profilers to complete
         """
         super().__init__(module)
 
@@ -128,6 +130,7 @@ def __init__(
         self.keep_constants = keep_constants
         self.load_ait_dir = load_ait_dir
         self.do_optimize_graph = do_optimize_graph
+        self.profile_timeout = profile_timeout
 
     def _create_target(self):
         """Detect GPU target"""
@@ -217,6 +220,7 @@ def run(self) -> AITInterpreterResult:
             "dll_name": self.dll_name,
             "profile_dir": profile_dir,
             "do_optimize_graph": self.do_optimize_graph,
+            "profile_timeout": self.profile_timeout,
         }
         if self.dump_ait_dir:
             dump_ait_path = os.path.join(self.dump_ait_dir, self.name + ".py")
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index a912d7aa5..82df9b188 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -160,6 +160,7 @@ def compile_model(
     allocator_kind: Optional[AITemplateAllocatorKind] = None,
     debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
     do_optimize_graph: bool = True,
+    profile_timeout: int = 300,
 ) -> Model:
     """Compiles a model and generates a .so file.
 
@@ -273,7 +274,11 @@ def compile_model(
                 else:
                     profile_devs = device_env.split(",")
             compiler.transform.profile(
-                graph, profile_dir, profile_devs, dynamic_profiling_strategy
+                graph,
+                profile_dir,
+                profile_devs,
+                dynamic_profiling_strategy,
+                profile_timeout,
             )
             graph_utils.dump_graph_debug_str_to_file(graph, test_dir, "profile")
 
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index a7683bcba..ff3083d4c 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -55,6 +55,7 @@ def profile(
     workdir="./tmp",
     devices=None,
     dynamic_profiling_strategy=DynamicProfileStrategy.MAX,
+    timeout=300,
 ):
     """Profiles kernels.
 
@@ -107,6 +108,7 @@ def profile(
     profiler_runner = ProfilerRunner(
         devices,
         postprocessing_delegate=GemmProfilerPostprocessingDelegate(),
+        timeout=timeout,
     )
     for f in gemms:
         f.profile(

From c1e9b4244d908c9c6c3953d06c0f1917869b911d Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 15 Jun 2023 14:50:14 +0800
Subject: [PATCH 592/638] fix quick gelu shape

---
 examples/05_stable_diffusion/src/modeling/clip.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 2b39427ec..8385a317c 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -365,11 +365,11 @@ def __init__(
         self.fc2 = nn.Linear(hidden_features, out_features, specialization="add")
 
     def forward(self, x, res):
-        # shape = get_shape(x)
+        shape = x.shape()
         x = self.fc1(x)
         x = self.activation_fn(x)
         x = self.fc2(x, res)
-        return ops.reshape()(x, x.shape())
+        return ops.reshape()(x, shape)
 
 
 class CLIPEncoderLayer(nn.Module):

From 2ae8184846cc86afc72cc5b578a80fb7dab626bc Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Thu, 15 Jun 2023 01:24:55 -0700
Subject: [PATCH 593/638] correctly run bmm_rrr tests (#770)

Summary:
Previously, we didn't run the bmm_rrr test due to a typo. This change fixed some issues in the relevant test and actually enabled it.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/770

Reviewed By: wushirong

Differential Revision: D46749491

Pulled By: chenyang78

fbshipit-source-id: 61f1c747a4654b9ce397f7c9929979ffd18a1c39
---
 tests/unittest/compiler/test_move_view_ops.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/unittest/compiler/test_move_view_ops.py b/tests/unittest/compiler/test_move_view_ops.py
index c2b96c270..3493f7c83 100644
--- a/tests/unittest/compiler/test_move_view_ops.py
+++ b/tests/unittest/compiler/test_move_view_ops.py
@@ -1634,6 +1634,7 @@ def _test_move_strided_reshape_cat_10(
         # add_12 = add(reduce_9, reduce_10)
         # y = add(add_12, reduce_11)
         assert M0 == M1, f"expected {M0=} to be equal to {M1=}"
+        assert M0 + M2 == N, f"expected {M0=} + {M2=} to be qual to {N=}"
         batch_sizes = [1, self.BATCH_SIZE]
         batch_dim = shape_utils.gen_int_var_min_max(batch_sizes, "batch_0")
         X0 = Tensor(
@@ -1677,9 +1678,9 @@ def _test_move_strided_reshape_cat_10(
         add_0 = ops.elementwise(FuncEnum.ADD)(X0, X1)
         concat_1 = ops.concatenate()([add_0, X2], dim=cat_dim)
         bmm_K = M0 + M2
-        reshape_2 = ops.reshape()(concat_1, [-1, bmm_K, N])
+        reshape_2 = ops.reshape()(concat_1, [-1, N, bmm_K])
         # bmm_rrr_add_3[batch, N, N] = bmm_rrr_add(
-        #     reshape_2[batch, bmm_K, N], X4[bmm_K, N], X5[N]
+        #     reshape_2[batch, N, bmm_K], X4[bmm_K, N], X5[N]
         # )
         bmm_rrr_add_3 = ops.bmm_rrr_add()(reshape_2, X4, X5)
         concat_4 = ops.concatenate()([X3, reshape_2, X3], dim=cat_dim)  # 3d
@@ -1759,13 +1760,12 @@ def _test_move_strided_reshape_cat_10(
             torch.testing.assert_close(y_pt, y, atol=0.1, rtol=0.1)
 
     def test_move_strided_reshape_cat_10(self):
-        self._test_move_strided_reshape_cat_9(
-            M0=4,
-            M1=4,
-            M2=6,
+        self._test_move_strided_reshape_cat_10(
+            M0=2,
+            M1=2,
+            M2=4,
             M3=4,
-            M7=8,
-            N=4,
+            N=6,
             test_name="test_move_strided_reshape_cat_10",
             dtype="float16",
         )

From 72faba87e0401394611d2581da07f1625daf489a Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Thu, 15 Jun 2023 15:50:48 -0700
Subject: [PATCH 594/638] upstream gemm and embeddings (#726)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/726

Reviewed By: chenyang78

Differential Revision: D46674679

Pulled By: ipiszy

fbshipit-source-id: a3c49bc446d465c2e4184f08f80e81f2086cc935
---
 .../backend/rocm/embedding/__init__.py        |  16 ++
 .../backend/rocm/embedding/bert_embeddings.py | 236 ++++++++++++++++++
 .../aitemplate/backend/rocm/gemm/__init__.py  |   3 +
 .../aitemplate/backend/rocm/gemm/bmm_ccr.py   |  14 +-
 .../backend/rocm/gemm/bmm_ccr_add.py          | 182 ++++++++++++++
 .../backend/rocm/gemm/bmm_common.py           | 141 +++++++++--
 .../aitemplate/backend/rocm/gemm/bmm_crr.py   |  14 +-
 .../backend/rocm/gemm/bmm_crr_add.py          | 182 ++++++++++++++
 .../backend/rocm/gemm/bmm_permute_common.py   |   2 +-
 .../aitemplate/backend/rocm/gemm/bmm_rcr.py   |  14 +-
 .../aitemplate/backend/rocm/gemm/bmm_rrr.py   |  14 +-
 .../backend/rocm/gemm/bmm_rrr_add.py          | 182 ++++++++++++++
 .../backend/rocm/gemm/bmm_softmax_bmm.py      |   4 +-
 .../rocm/gemm/bmm_softmax_bmm_permute.py      |  65 ++---
 python/aitemplate/backend/rocm/gemm/common.py | 108 +++++---
 .../aitemplate/backend/rocm/gemm/gemm_rcr.py  |  14 +-
 .../backend/rocm/gemm/gemm_rcr_bias.py        |  14 +-
 .../backend/rocm/gemm/gemm_rcr_bias_add.py    |   9 +-
 .../rocm/gemm/gemm_rcr_bias_add_add.py        |   9 +-
 .../rocm/gemm/gemm_rcr_bias_add_add_relu.py   |   9 +-
 .../rocm/gemm/gemm_rcr_bias_add_relu.py       |   9 +-
 .../rocm/gemm/gemm_rcr_bias_fast_gelu.py      |  13 +
 .../rocm/gemm/gemm_rcr_bias_hardswish.py      | 164 ++++++++++++
 .../backend/rocm/gemm/gemm_rcr_bias_mul.py    |   9 +-
 .../rocm/gemm/gemm_rcr_bias_mul_add.py        |   9 +-
 .../rocm/gemm/gemm_rcr_bias_mul_tanh.py       |   9 +-
 .../rocm/gemm/gemm_rcr_bias_permute.py        |  26 +-
 .../backend/rocm/gemm/gemm_rcr_bias_relu.py   |  12 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid.py        |   9 +-
 .../rocm/gemm/gemm_rcr_bias_sigmoid_mul.py    |   9 +-
 .../gemm/gemm_rcr_bias_sigmoid_mul_tanh.py    |   9 +-
 .../backend/rocm/gemm/gemm_rcr_bias_swish.py  |  53 +++-
 .../backend/rocm/gemm/gemm_rcr_bias_tanh.py   |   9 +-
 .../aitemplate/backend/rocm/gemm/gemm_rrr.py  |  14 +-
 34 files changed, 1486 insertions(+), 100 deletions(-)
 create mode 100644 python/aitemplate/backend/rocm/embedding/__init__.py
 create mode 100644 python/aitemplate/backend/rocm/embedding/bert_embeddings.py
 create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_ccr_add.py
 create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_crr_add.py
 create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_rrr_add.py
 create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py

diff --git a/python/aitemplate/backend/rocm/embedding/__init__.py b/python/aitemplate/backend/rocm/embedding/__init__.py
new file mode 100644
index 000000000..3e3aab46b
--- /dev/null
+++ b/python/aitemplate/backend/rocm/embedding/__init__.py
@@ -0,0 +1,16 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# flake8: noqa
+from .bert_embeddings import *
diff --git a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
new file mode 100644
index 000000000..736845d30
--- /dev/null
+++ b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
@@ -0,0 +1,236 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+bert_embeddings kernel codegen for CUDA.
+"""
+
+import math
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+# pylint: disable=C0301
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include "logging.h"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#define EMBEDDING_DIM {{embedding_dim}}
+
+using EmbElementwiseOperation = ck::tensor_operation::element_wise::AddAdd;
+using EmbType = {{elem_input_type}};
+using IndexType = {{index_type}};
+
+{{func_signature}}
+{
+  auto device_instance = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, EmbType, EmbType, float, EmbType, EmbElementwiseOperation, 256, 1, 256, 1, EMBEDDING_DIM, 1, {{row_v_size}}, 3>{};
+  auto argument_ptr = device_instance.MakeArgumentPointer(output,
+                                                          {ck::type_convert<EmbType*>(word_embeddings),
+                                                          ck::type_convert<EmbType*>(token_type_embeddings),
+                                                          ck::type_convert<EmbType*>(position_embeddings)},
+                                                          {ck::type_convert<IndexType*>(input_ids),
+                                                          ck::type_convert<IndexType*>(token_type_ids),
+                                                          ck::type_convert<IndexType*>(position_ids)},
+                                                          gamma,
+                                                          beta,
+                                                          EMBEDDING_DIM,
+                                                          indices_num,
+                                                          eps,
+                                                          EmbElementwiseOperation{});
+  if(!device_instance.IsSupportedArgument(argument_ptr.get())){
+    LOG(FATAL) << "wrong! " << device_instance.GetTypeString() << " with the specified compilation parameters does not support this Embedding problem.";
+  }
+  auto invoker_ptr = device_instance.MakeInvokerPointer();
+  invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
+  return;
+}
+"""
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   {{index_type}}* input_ids,
+                   {{index_type}}* token_type_ids,
+                   {{index_type}}* position_ids,
+                   void* word_embeddings,
+                   void* token_type_embeddings,
+                   void* position_embeddings,
+                   void* gamma,
+                   void* beta,
+                   const int64_t indices_num,
+                   const float eps,
+                   hipStream_t stream)
+  """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  {{calculate_indices_num}}
+{{indent}}  {{func_name}}(
+{{indent}}            {{output}},
+{{indent}}            {{input_ids}},
+{{indent}}            {{token_type_ids}},
+{{indent}}            {{position_ids}},
+{{indent}}            {{word_embeddings}},
+{{indent}}            {{token_type_embeddings}},
+{{indent}}            {{position_embeddings}},
+{{indent}}            {{gamma}},
+{{indent}}            {{beta}},
+{{indent}}            {{indices_num}},
+{{indent}}            {{eps}},
+{{indent}}            stream /* default stream */
+{{indent}} );
+
+{{indent}}}
+    """
+)
+
+INDICES_NUM_TEMPLATE = jinja2.Template(
+    """
+  int64_t indices_num = 1;
+  {% for dim_name in dim_names %}
+  indices_num *= {{dim_name}};
+  {% endfor %}
+  """
+)
+
+
+def python_int_dtype_to_c_dtype(dtype):
+    if dtype == "int64":
+        return "int64_t"
+    if dtype in ["int", "int32"]:
+        return "int32_t"
+    return dtype
+
+
+@registry.reg("rocm.bert_embeddings.gen_function")
+def bert_embeddings_gen_function(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = ROCMSpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][3]._attrs["dtype"]
+    )
+    (
+        input_ids,
+        token_type_ids,
+        position_ids,
+        word_embeddings,
+        token_type_embeddings,
+        position_embeddings,
+        gamma,
+        beta,
+    ) = func_attrs["inputs"]
+    embedding_dim = word_embeddings._size(-1).value()
+    dtype = python_int_dtype_to_c_dtype(func_attrs["inputs"][0]._attrs["dtype"])
+    return FUNC_TEMPLATE.render(
+        index_type=dtype,
+        elem_input_type=elem_input_type,
+        embedding_dim=embedding_dim,
+        row_v_size=math.gcd(8, embedding_dim // 256),
+        func_signature=FUNC_SIGNATURE.render(
+            func_name=func_attrs["name"],
+            index_type=dtype,
+        ).strip(),
+    )
+
+
+@registry.reg("rocm.bert_embeddings.func_decl")
+def bert_embeddings_gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    dtype = python_int_dtype_to_c_dtype(func_attrs["inputs"][0]._attrs["dtype"])
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(
+            func_name=func_attrs["name"],
+            index_type=dtype,
+        ).strip()
+    )
+
+
+FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int64_t*>({{name}})")
+FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int32_t*>({{name}})")
+
+
+def get_int_param_template(tensor):
+    name = tensor._attrs["name"]
+    dtype = tensor._attrs["dtype"]
+    if dtype == "int64":
+        return FUNC_CALL_INT64_PARAM_TEMPLATE.render(name=name)
+    elif dtype in ("int", "int32"):
+        return FUNC_CALL_INT32_PARAM_TEMPLATE.render(name=name)
+    else:
+        raise NotImplementedError(f"Unsupported dtype: {dtype}")
+
+
+@registry.reg("rocm.bert_embeddings.func_call")
+def bert_embeddings_gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    (
+        input_ids,
+        token_type_ids,
+        position_ids,
+        word_embeddings,
+        token_type_embeddings,
+        position_embeddings,
+        gamma,
+        beta,
+    ) = func_attrs["inputs"]
+
+    indices_dims = [shape._attrs["name"] for shape in input_ids.shape()]
+    indices_num_str = INDICES_NUM_TEMPLATE.render(
+        dim_names=indices_dims,
+    )
+
+    eps = func_attrs["eps"]
+    output_str = func_attrs["outputs"][0]._attrs["name"]
+
+    input_ids_str = get_int_param_template(input_ids)
+    token_type_ids_str = get_int_param_template(token_type_ids)
+    position_ids_str = get_int_param_template(position_ids)
+
+    word_embeddings_str = word_embeddings._attrs["name"]
+    token_type_embeddings_str = token_type_embeddings._attrs["name"]
+    position_embeddings_str = position_embeddings._attrs["name"]
+
+    gamma_str = gamma._attrs["name"]
+    beta_str = beta._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        calculate_indices_num=indices_num_str,
+        output=output_str,
+        input_ids=input_ids_str,
+        token_type_ids=token_type_ids_str,
+        position_ids=position_ids_str,
+        word_embeddings=word_embeddings_str,
+        token_type_embeddings=token_type_embeddings_str,
+        position_embeddings=position_embeddings_str,
+        gamma=gamma_str,
+        beta=beta_str,
+        indices_num="indices_num",
+        eps=eps,
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/gemm/__init__.py b/python/aitemplate/backend/rocm/gemm/__init__.py
index 38c659280..ba4594cd5 100644
--- a/python/aitemplate/backend/rocm/gemm/__init__.py
+++ b/python/aitemplate/backend/rocm/gemm/__init__.py
@@ -17,10 +17,13 @@
 """
 from aitemplate.backend.rocm.gemm import (  # noqa: F401
     bmm_ccr,
+    bmm_ccr_add,
     bmm_crr,
+    bmm_crr_add,
     bmm_rcr,
     bmm_rcr_permute,
     bmm_rrr,
+    bmm_rrr_add,
     bmm_rrr_permute,
     bmm_softmax_bmm,
     bmm_softmax_bmm_permute,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
index e2a97fa0d..a691c2cda 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_ccr.py
@@ -109,7 +109,19 @@ def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return bmm_common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.bmm_ccr.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_ccr_add.py b/python/aitemplate/backend/rocm/gemm/bmm_ccr_add.py
new file mode 100644
index 000000000..6f81bf58b
--- /dev/null
+++ b/python/aitemplate/backend/rocm/gemm/bmm_ccr_add.py
@@ -0,0 +1,182 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Batched Gemm ROCM backend A[RowMajor], B[RowMajor], C[RowMajor], i.e.
+c[b, m, n] = a[b, k, m] * b[b, n, k]
+This is used for `ops.bmm_ccr_add`.
+"""
+import jinja2
+
+from ... import registry
+from . import bmm_common, common
+from .layout import CCR
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+  int64_t B = std::atoi(argv[1]);
+  int64_t M = std::atoi(argv[2]);
+  int64_t N = std::atoi(argv[3]);
+  int64_t K = std::atoi(argv[4]);
+
+  int64_t a_dim0 = B;
+  int64_t a_dim1 = K;
+  int64_t a_dim2 = M;
+  int64_t b_dim0 = B;
+  int64_t b_dim1 = N;
+  int64_t b_dim2 = K;
+  int64_t c_dim0 = B;
+  int64_t c_dim1 = M;
+  int64_t c_dim2 = N;
+"""
+)
+
+
+@registry.reg("rocm.bmm_ccr_add.config")
+def bmm_config(func_attrs, dtype="float16"):
+    """Extract (operation name, operation instance) pair from
+    all operation candidates.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    Dict
+        Extracted (operation name, operation instance) pair
+        from all operation candidates.
+    """
+    import ck_lib
+
+    op_kind = ck_lib.library.GemmKind.BatchGemm
+    extra_kind = ck_lib.library.TensorOperation.Add
+    common.make_fproc_f16(func_attrs, CCR, op_kind, extra_kind)
+
+
+@registry.reg("rocm.bmm_ccr_add.gen_profiler")
+def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
+    """Generates standalone executables for profiler.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    workdir : str
+        Directory to store the generated outputs.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+    """
+    return bmm_common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
+        dim_info_dict=dim_info_dict,
+        gemm_flag="add",
+    )
+
+
+@registry.reg("rocm.bmm_ccr_add.gen_function")
+def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    exec_cond_template : jinja2.Template
+        Generates if statement to execute kernel.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+
+    Returns
+    -------
+    str
+        The rendered template of generated function body.
+    """
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "add",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
+
+
+@registry.reg("rocm.bmm_ccr_add.func_decl")
+def bmm_gen_function_decl(func_attrs):
+    """Generates function declarations.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    str
+        The rentered template of function declaration.
+    """
+    func_name = func_attrs["name"]
+    return bmm_common.gen_function_decl(func_name=func_name, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_ccr_add.func_call")
+def bmm_gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".
+
+    Returns
+    -------
+    str
+        The rendered template of generated function call.
+    """
+    return bmm_common.gen_function_call(func_attrs, indent, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_ccr_add.filter")
+def bmm_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_common.py b/python/aitemplate/backend/rocm/gemm/bmm_common.py
index 67fdff617..5de7014d7 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_common.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_common.py
@@ -21,26 +21,62 @@
 
 EXTRA_SHAPE_TEMPLATE = jinja2.Template(
     """
-{{indent}}const int64_t stride_a = *a_dim2;
-{{indent}}const int64_t stride_b = *b_dim2;
-{{indent}}const int64_t stride_c = *c_dim2;
+{{indent}}ck::index_t stride_a = *a_dim2;
+{{indent}}ck::index_t stride_b = *b_dim2;
+{{indent}}ck::index_t stride_c = *c_dim2;
+
+{{indent}}ck::index_t batch_stride_a = (*a_dim1) * stride_a;
+{{indent}}ck::index_t batch_stride_b = (*b_dim1) * stride_b;
+{{indent}}ck::index_t batch_stride_c = (*c_dim1) * stride_c;
 """
 )
+
+INPUT_ADDR_CALCULATOR = jinja2.Template(
+    """
+    {% if accessor_a.is_from_strided_tensor %}
+      batch_stride_a = {{accessor_a.stride(0)}};
+      stride_a = {{accessor_a.stride(1)}};
+      offset_a = {{accessor_a.offset}}; // default to 0
+    {% endif %}
+    {% if accessor_b.is_from_strided_tensor %}
+      batch_stride_b = {{accessor_b.stride(0)}};
+      stride_b = {{accessor_b.stride(1)}};
+      offset_b = {{accessor_b.offset}}; // default to 0
+    {% endif %}
+    """
+)
+
+# pylint: disable=C0103,C0415,W0611,C0301
+OUTPUT_ADDR_CALCULATOR = jinja2.Template(
+    """
+  {% if output_accessor.is_from_strided_tensor %}
+    batch_stride_c = {{output_accessor.stride(0)}};
+    stride_c = {{output_accessor.stride(1)}};
+    offset_c = {{output_accessor.offset}};
+  {% endif %}
+    """
+)
+
 EXTRA_HEADER_TEMPLATE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp"
 """
 )
 
+EXTRA_HEADER_TEMPLATE_MULTI_D = jinja2.Template(
+    """
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp"
+"""
+)
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-{{indent}}                                static_cast<ck::half_t *>(in_ptr),
-{{indent}}                                static_cast<ck::half_t *>(weight_ptr),
+{{indent}}                                static_cast<ck::half_t *>(in_ptr) + offset_a,
+{{indent}}                                static_cast<ck::half_t *>(weight_ptr) + offset_b,
 {% if "bias" in gemm_flag %}
 {{indent}}                                std::array<const void*, 1>{static_cast<ck::half_t *>(bias_ptr)},
 {% endif %}
-{{indent}}                                static_cast<ck::half_t *>(out_ptr),
+{{indent}}                                static_cast<ck::half_t *>(out_ptr) + offset_c,
 {{indent}}                                M,
 {{indent}}                                N,
 {{indent}}                                K,
@@ -50,9 +86,9 @@
 {{indent}}                                std::array<ck::index_t, 1>{0},
 {% endif %}
 {{indent}}                                stride_c,
-{{indent}}                                M*K,
-{{indent}}                                N*K,
-{{indent}}                                M*N,
+{{indent}}                                batch_stride_a,
+{{indent}}                                batch_stride_b,
+{{indent}}                                batch_stride_c,
 {{indent}}                                B,
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
@@ -68,6 +104,54 @@
 """
 )
 
+PROBLEM_ARGS_TEMPLATE_MULTI_D = jinja2.Template(
+    """
+{{indent}}                                static_cast<ck::half_t *>(in_ptr),
+{{indent}}                                static_cast<ck::half_t *>(weight_ptr),
+{% if "bias" in gemm_flag or gemm_flag == "add" %}
+{{indent}}                                std::array<const void*, 1>{static_cast<ck::half_t *>(bias_ptr)},
+{% else %}
+{{indent}}                                {},
+{% endif %}
+{{indent}}                                static_cast<ck::half_t *>(out_ptr),
+{{indent}}                                M,
+{{indent}}                                N,
+{{indent}}                                K,
+{{indent}}                                B,
+{{indent}}                                stride_a,
+{{indent}}                                stride_b,
+{% if gemm_flag == "add" %}
+{{indent}}                                std::array<ck::index_t, 1>{stride_c},
+{% elif gemm_flag == "bias" %}
+{{indent}}                                std::array<ck::index_t, 1>{0},
+{% else %}
+{{indent}}                                {},
+{% endif %}
+{{indent}}                                stride_c,
+{{indent}}                                batch_stride_a,
+{{indent}}                                batch_stride_b,
+{% if gemm_flag == "add" %}
+{{indent}}                                std::array<ck::index_t, 1>{batch_stride_c},
+{% elif gemm_flag == "bias" %}
+{{indent}}                                std::array<ck::index_t, 1>{stride_c},
+{% else %}
+{{indent}}                                {},
+{% endif %}
+{{indent}}                                batch_stride_c,
+{{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
+{{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
+{% if gemm_flag == "" %}
+{{indent}}                                ck::tensor_operation::element_wise::PassThrough{}
+{% elif gemm_flag in ["bias", "add"] %}
+{{indent}}                                ck::tensor_operation::element_wise::Add{}
+{% elif gemm_flag == "bias_relu" %}
+{{indent}}                                ck::tensor_operation::element_wise::AddRelu{}
+{% elif gemm_flag == "bias_sigmoid" %}
+{{indent}}                                ck::tensor_operation::element_wise::AddSigmoid{}
+{% endif %}
+"""
+)
+
 TENSOR_DECL_TEMPLATE = jinja2.Template(
     """
   int64_t a_ptr_sz = B*M*K;
@@ -81,8 +165,11 @@
   memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // x: index 0
   memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // w: index 1
   memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // y: index 2
+{% if "add" == gemm_flag %}
+  memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz);  // b: index 3
+{% endif %}
 {% if "bias" in gemm_flag %}
-  memory_pool->AllocateHalfTensor(N, mem_pool_sz);  // b: index 3
+  memory_pool->AllocateHalfTensor(B*N, mem_pool_sz);  // b: index 3
 {% endif %}
 """
 )
@@ -120,8 +207,8 @@ def gen_profiler(
     dim_info_dict,
     args_parse,
     gemm_flag,
-    problem_args_template=PROBLEM_ARGS_TEMPLATE,
-    extra_header_template=EXTRA_HEADER_TEMPLATE,
+    problem_args_template=None,
+    extra_header_template=None,
     tensor_decl_template=TENSOR_DECL_TEMPLATE,
     extra_shape_template=EXTRA_SHAPE_TEMPLATE,
     extra_code="",
@@ -144,6 +231,18 @@ def gen_profiler(
     extra_code : str
         Extra code for self-defined operators.
     """
+    if problem_args_template is None:
+        if gemm_flag == "":
+            problem_args_template = PROBLEM_ARGS_TEMPLATE
+        else:
+            problem_args_template = PROBLEM_ARGS_TEMPLATE_MULTI_D
+
+    if extra_header_template is None:
+        if gemm_flag == "":
+            extra_header_template = EXTRA_HEADER_TEMPLATE
+        else:
+            extra_header_template = EXTRA_HEADER_TEMPLATE_MULTI_D
+
     return common.gen_profiler(
         func_attrs,
         workdir,
@@ -164,8 +263,8 @@ def gen_function(
     exec_cond_template,
     dim_info_dict,
     gemm_flag,
-    problem_args_template=PROBLEM_ARGS_TEMPLATE,
-    extra_header_template=EXTRA_HEADER_TEMPLATE,
+    problem_args_template=None,
+    extra_header_template=None,
     extra_shape_template=EXTRA_SHAPE_TEMPLATE,
     extra_code="",
     input_addr_calculator="",
@@ -197,6 +296,18 @@ def gen_function(
     str
         The rendered template of generated function body.
     """
+    if problem_args_template is None:
+        if gemm_flag == "":
+            problem_args_template = PROBLEM_ARGS_TEMPLATE
+        else:
+            problem_args_template = PROBLEM_ARGS_TEMPLATE_MULTI_D
+
+    if extra_header_template is None:
+        if gemm_flag == "":
+            extra_header_template = EXTRA_HEADER_TEMPLATE
+        else:
+            extra_header_template = EXTRA_HEADER_TEMPLATE_MULTI_D
+
     return common.gen_function(
         func_attrs,
         exec_cond_template,
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_crr.py b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
index 02d176a77..6e842652c 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_crr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_crr.py
@@ -109,7 +109,19 @@ def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return bmm_common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.bmm_crr.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_crr_add.py b/python/aitemplate/backend/rocm/gemm/bmm_crr_add.py
new file mode 100644
index 000000000..8cefec56c
--- /dev/null
+++ b/python/aitemplate/backend/rocm/gemm/bmm_crr_add.py
@@ -0,0 +1,182 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Batched Gemm ROCM backend A[RowMajor], B[RowMajor], C[RowMajor], i.e.
+c[b, m, n] = a[b, k, m] * b[b, k, n]
+This is used for `ops.bmm_crr_add`.
+"""
+import jinja2
+
+from ... import registry
+from . import bmm_common, common
+from .layout import CRR
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+  int64_t B = std::atoi(argv[1]);
+  int64_t M = std::atoi(argv[2]);
+  int64_t N = std::atoi(argv[3]);
+  int64_t K = std::atoi(argv[4]);
+
+  int64_t a_dim0 = B;
+  int64_t a_dim1 = K;
+  int64_t a_dim2 = M;
+  int64_t b_dim0 = B;
+  int64_t b_dim1 = K;
+  int64_t b_dim2 = N;
+  int64_t c_dim0 = B;
+  int64_t c_dim1 = M;
+  int64_t c_dim2 = N;
+"""
+)
+
+
+@registry.reg("rocm.bmm_crr_add.config")
+def bmm_config(func_attrs, dtype="float16"):
+    """Extract (operation name, operation instance) pair from
+    all operation candidates.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    Dict
+        Extracted (operation name, operation instance) pair
+        from all operation candidates.
+    """
+    import ck_lib
+
+    op_kind = ck_lib.library.GemmKind.BatchGemm
+    extra_kind = ck_lib.library.TensorOperation.Add
+    common.make_fproc_f16(func_attrs, CRR, op_kind, extra_kind)
+
+
+@registry.reg("rocm.bmm_crr_add.gen_profiler")
+def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
+    """Generates standalone executables for profiler.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    workdir : str
+        Directory to store the generated outputs.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+    """
+    return bmm_common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
+        dim_info_dict=dim_info_dict,
+        gemm_flag="add",
+    )
+
+
+@registry.reg("rocm.bmm_crr_add.gen_function")
+def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    exec_cond_template : jinja2.Template
+        Generates if statement to execute kernel.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+
+    Returns
+    -------
+    str
+        The rendered template of generated function body.
+    """
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "add",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
+
+
+@registry.reg("rocm.bmm_crr_add.func_decl")
+def bmm_gen_function_decl(func_attrs):
+    """Generates function declarations.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    str
+        The rentered template of function declaration.
+    """
+    func_name = func_attrs["name"]
+    return bmm_common.gen_function_decl(func_name=func_name, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_crr_add.func_call")
+def bmm_gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".
+
+    Returns
+    -------
+    str
+        The rendered template of generated function call.
+    """
+    return bmm_common.gen_function_call(func_attrs, indent, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_crr_add.filter")
+def bmm_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_permute_common.py b/python/aitemplate/backend/rocm/gemm/bmm_permute_common.py
index 2b05b84c8..b444e34f2 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_permute_common.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_permute_common.py
@@ -19,7 +19,7 @@
 
 EXTRA_HEADER_TEMPLATE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp"
 """
 )
 
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
index 8396335c1..ee7784d42 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rcr.py
@@ -109,7 +109,19 @@ def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return bmm_common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.bmm_rcr.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
index 2d05afe05..aa3b68752 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr.py
@@ -109,7 +109,19 @@ def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return bmm_common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.bmm_rrr.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_rrr_add.py b/python/aitemplate/backend/rocm/gemm/bmm_rrr_add.py
new file mode 100644
index 000000000..a862a6fb6
--- /dev/null
+++ b/python/aitemplate/backend/rocm/gemm/bmm_rrr_add.py
@@ -0,0 +1,182 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Batched Gemm ROCM backend A[RowMajor], B[RowMajor], C[RowMajor], i.e.
+c[b, m, n] = a[b, m, k] * b[b, k, n]
+This is used for `ops.bmm_rrr_add`.
+"""
+import jinja2
+
+from ... import registry
+from . import bmm_common, common
+from .layout import RRR
+
+
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+  int64_t B = std::atoi(argv[1]);
+  int64_t M = std::atoi(argv[2]);
+  int64_t N = std::atoi(argv[3]);
+  int64_t K = std::atoi(argv[4]);
+
+  int64_t a_dim0 = B;
+  int64_t a_dim1 = M;
+  int64_t a_dim2 = K;
+  int64_t b_dim0 = B;
+  int64_t b_dim1 = K;
+  int64_t b_dim2 = N;
+  int64_t c_dim0 = B;
+  int64_t c_dim1 = M;
+  int64_t c_dim2 = N;
+"""
+)
+
+
+@registry.reg("rocm.bmm_rrr_add.config")
+def bmm_config(func_attrs, dtype="float16"):
+    """Extract (operation name, operation instance) pair from
+    all operation candidates.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    Dict
+        Extracted (operation name, operation instance) pair
+        from all operation candidates.
+    """
+    import ck_lib
+
+    op_kind = ck_lib.library.GemmKind.BatchGemm
+    extra_kind = ck_lib.library.TensorOperation.Add
+    common.make_fproc_f16(func_attrs, RRR, op_kind, extra_kind)
+
+
+@registry.reg("rocm.bmm_rrr_add.gen_profiler")
+def bmm_gen_profiler(func_attrs, workdir, dim_info_dict):
+    """Generates standalone executables for profiler.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    workdir : str
+        Directory to store the generated outputs.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+    """
+    return bmm_common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
+        dim_info_dict=dim_info_dict,
+        gemm_flag="add",
+    )
+
+
+@registry.reg("rocm.bmm_rrr_add.gen_function")
+def bmm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    exec_cond_template : jinja2.Template
+        Generates if statement to execute kernel.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from bmm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+
+    Returns
+    -------
+    str
+        The rendered template of generated function body.
+    """
+    return bmm_common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "add",
+        input_addr_calculator=bmm_common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=bmm_common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
+
+
+@registry.reg("rocm.bmm_rrr_add.func_decl")
+def bmm_gen_function_decl(func_attrs):
+    """Generates function declarations.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    str
+        The rentered template of function declaration.
+    """
+    func_name = func_attrs["name"]
+    return bmm_common.gen_function_decl(func_name=func_name, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_rrr_add.func_call")
+def bmm_gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".
+
+    Returns
+    -------
+    str
+        The rendered template of generated function call.
+    """
+    return bmm_common.gen_function_call(func_attrs, indent, gemm_flag="add")
+
+
+@registry.reg("rocm.bmm_rrr_add.filter")
+def bmm_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
index ca6b9976a..a54f10dd1 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
@@ -25,7 +25,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 const ck::half_t alpha = {{scale}};
 
@@ -71,7 +71,7 @@
 
 EXTRA_HEADER_TEMPLATE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
 """
 )
 
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
index 040a3f455..10337922c 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
@@ -31,14 +31,14 @@
 
 INPUT_ADDR_CALCULATOR = jinja2.Template(
     """
-  int64_t in_batch_stride = {{in_batch_stride_dim}};
-  int64_t in_stride = {{in_stride_dim}};
+  ck::index_t in_batch_stride = {{in_batch_stride_dim}};
+  ck::index_t in_stride = {{in_stride_dim}};
   int64_t in_offset = {{in_offset_val}}; // default to 0
-  int64_t weight_batch_stride = {{weight_batch_stride_dim}};
-  int64_t weight_stride = {{weight_stride_dim}};
+  ck::index_t weight_batch_stride = {{weight_batch_stride_dim}};
+  ck::index_t weight_stride = {{weight_stride_dim}};
   int64_t weight_offset = {{weight_offset_val}}; // default to 0
-  int64_t bias_batch_stride = {{bias_batch_stride_dim}};
-  int64_t bias_stride = {{bias_stride_dim}};
+  ck::index_t bias_batch_stride = {{bias_batch_stride_dim}};
+  ck::index_t bias_stride = {{bias_stride_dim}};
   int64_t bias_offset = {{bias_offset_val}}; // default to 0
     """
 )
@@ -52,16 +52,16 @@
 
 PROFILER_EXTRA_SHAPE_TEMPLATE = jinja2.Template(
     """
-{{indent}}const int64_t G1 = p_dim0; // G1
+{{indent}}const ck::index_t G1 = p_dim0; // G1
 
-{{indent}}const int64_t in_batch_stride=M * K;
-{{indent}}const int64_t in_stride=K;
+{{indent}}const ck::index_t in_batch_stride=M * K;
+{{indent}}const ck::index_t in_stride=K;
 {{indent}}const int64_t in_offset=0;
-{{indent}}const int64_t weight_batch_stride=N * K;
-{{indent}}const int64_t weight_stride=K;
+{{indent}}const ck::index_t weight_batch_stride=N * K;
+{{indent}}const ck::index_t weight_stride=K;
 {{indent}}const int64_t weight_offset=0;
-{{indent}}const int64_t bias_batch_stride=N * O;
-{{indent}}const int64_t bias_stride=O;
+{{indent}}const ck::index_t bias_batch_stride=N * O;
+{{indent}}const ck::index_t bias_stride=O;
 {{indent}}const int64_t bias_offset=0;
 
 """
@@ -69,37 +69,38 @@
 
 EXTRA_SHAPE_TEMPLATE = jinja2.Template(
     """
-{{indent}}const int64_t G1 = p_dim0; // G1
+{{indent}}const ck::index_t G1 = p_dim0; // G1
 """
 )
 
 EXTRA_HEADER_TEMPLATE = jinja2.Template(
     """
 #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 """
 )
 
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-{{indent}}                                static_cast<ck::half_t *>(in_ptr) + in_offset,
-{{indent}}                                static_cast<ck::half_t *>(weight_ptr) + weight_offset,
-{{indent}}                                static_cast<ck::half_t *>(bias_ptr) + bias_offset,
-{{indent}}                                static_cast<ck::half_t *>(out_ptr),
-{{indent}}                                M,
-{{indent}}                                N,
-{{indent}}                                K,
-{{indent}}                                O,
-{{indent}}                                B,
-{{indent}}                                {int(B/G1), int(G1), int(M), int(O)},
-{{indent}}                                {int(M * G1 * O), int(O), int(G1 * O), 1},
-{{indent}}                                in_stride,
-{{indent}}                                weight_stride,
-{{indent}}                                bias_stride,
-{{indent}}                                in_batch_stride,
-{{indent}}                                weight_batch_stride,
-{{indent}}                                bias_batch_stride,
+{{indent}}                                static_cast<ck::half_t*>(in_ptr) + in_offset,
+{{indent}}                                static_cast<ck::half_t*>(weight_ptr) + weight_offset,
+{{indent}}                                static_cast<ck::half_t*>(bias_ptr) + bias_offset,
+{{indent}}                                static_cast<ck::half_t*>(out_ptr),
+{{indent}}                                {},
+{{indent}}                                {},
+{{indent}}                                {B/G1, G1, M, K},
+{{indent}}                                {G1*in_batch_stride, in_batch_stride, in_stride, 1},
+{{indent}}                                {B/G1, G1, N, K},
+{{indent}}                                {G1*weight_batch_stride, weight_batch_stride, weight_stride, 1},
+{{indent}}                                {B/G1, G1, O, N},
+{{indent}}                                {G1*bias_batch_stride, bias_batch_stride, 1, bias_stride},
+{{indent}}                                {B/G1, G1, M, O},
+{{indent}}                                {M*G1*O, O, G1*O, 1},
+{{indent}}                                {},
+{{indent}}                                {},
+{{indent}}                                {},
+{{indent}}                                {},
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
 {{indent}}                                ck::tensor_operation::element_wise::ScaleAndResetNaNToMinusInfinity{alpha},
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 6528b04a6..02272c136 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -24,14 +24,36 @@
 
 from aitemplate.backend.common import gemm_common
 from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntVar
+
+INPUT_ADDR_CALCULATOR = jinja2.Template(
+    """
+    {% if accessor_a.is_from_strided_tensor %}
+      stride_a = {{accessor_a.stride(0)}};
+      offset_a = {{accessor_a.offset}}; // default to 0
+    {% endif %}
+    {% if accessor_b.is_from_strided_tensor %}
+      stride_b = {{accessor_b.stride(0)}};
+      offset_b = {{accessor_b.offset}}; // default to 0
+    {% endif %}
+    """
+)
 
 # pylint: disable=C0103,C0415,W0611,C0301
+OUTPUT_ADDR_CALCULATOR = jinja2.Template(
+    """
+  {% if output_accessor.is_from_strided_tensor %}
+    stride_c = {{output_accessor.actual_total_elements_from_stride_dim}};
+    offset_c = {{output_accessor.offset}};
+  {% endif %}
+    """
+)
 
 EXTRA_SHAPE_TEMPLATE = jinja2.Template(
     """
-{{indent}}const int64_t stride_a = *a_dim1;
-{{indent}}const int64_t stride_b = *b_dim1;
-{{indent}}const int64_t stride_c = *c_dim1;
+{{indent}}ck::index_t stride_a = *a_dim1;
+{{indent}}ck::index_t stride_b = *b_dim1;
+{{indent}}ck::index_t stride_c = *c_dim1;
 """
 )
 
@@ -43,6 +65,7 @@
 """
 )
 
+
 EXEC_TEMPLATE = jinja2.Template(
     """
 {{indent}}auto op =  {{instance}}{};
@@ -51,9 +74,7 @@
 {{problem_args}}
 {{indent}});
 {{indent}}if(!op.IsSupportedArgument(argument)) {
-{{indent}}  throw std::runtime_error(
-{{indent}}    "wrong! device_gemm with the specified compilation parameters does "
-{{indent}}    "not support this Gemm problem");
+{{indent}}  LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Gemm problem.";
 {{indent}}}
 {% if is_profiler %}
 {{indent}}auto workspace_size = op.GetWorkSpaceSize(&argument);
@@ -68,17 +89,16 @@
 EXTRA_HEADER_TEMPLATE = jinja2.Template(
     """
 {% if gemm_flag == "" %}
-#include "include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 {% elif gemm_flag == "permute_m2n3" %}
-#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp"
 {% elif "bias" in gemm_flag or has_d0 %}
-#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
-    {% if gemm_flag == "bias_permute" %}
-#include "ck/tensor_operation/gpu/device/device_gemm_bias_e_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+{% elif gemm_flag in ["permute", "bias_permute"] %}
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-    {% elif gemm_flag in ["bias_permute_m2n3", "bias_permute_m3n2"]  %}
-#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp"
-    {% endif %}
+{% elif gemm_flag in ["bias_permute_m2n3", "bias_permute_m3n2"]  %}
+#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp"
 {% endif %}
 """
 )
@@ -93,6 +113,7 @@
 // #include <half.hpp>
 #include <random>
 #include <rocrand/rocrand.h>
+#include "logging.h"
 #include "include/ck/utility/print.hpp"
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
@@ -111,7 +132,7 @@
     void * in_ptr,
     void * weight_ptr,
     void * out_ptr,
-{% if "bias" in gemm_flag %}
+{% if "bias" in gemm_flag or gemm_flag == "add" %}
     void * bias_ptr,
 {% endif %}
 {% if has_d0 %}
@@ -135,14 +156,15 @@
     hipStream_t stream
     ) {
   {{shape_func}}
+  int64_t offset_a = 0;
+  int64_t offset_b = 0;
+  int64_t offset_c = 0;
   {{extra_shape}}
   {{input_addr_calculator}}
   {{output_addr_calculator}}
   {{exec_paths}}
 
-  throw std::runtime_error(
-      "Unsupported workload for this gemm specialization."
-  );
+  LOG(FATAL) << "Unsupported workload for this gemm specialization.";
 }
 """
 )
@@ -153,7 +175,7 @@
 {{indent}}    {{in_ptr}},
 {{indent}}    {{weight_ptr}},
 {{indent}}    {{out_ptr}},
-{% if "bias" in gemm_flag %}
+{% if "bias" in gemm_flag or gemm_flag == "add" %}
 {{indent}}    {{bias_ptr}},
 {% endif %}
 {% if d0_ptr != "" %}
@@ -182,11 +204,13 @@
 
 PROBLEM_ARGS_TEMPLATE = jinja2.Template(
     """
-{{indent}}                                static_cast<ck::half_t *>(in_ptr),
-{{indent}}                                static_cast<ck::half_t *>(weight_ptr),
+{{indent}}                                static_cast<ck::half_t *>(in_ptr) + offset_a,
+{{indent}}                                static_cast<ck::half_t *>(weight_ptr) + offset_b,
 
 {% if gemm_flag == "bias_permute" %}
 {{indent}}                                static_cast<ck::half_t *>(bias_ptr),
+{% elif gemm_flag == "permute" %}
+{{indent}}                                nullptr,
 {% elif gemm_flag == "bias_permute_m2n3" %}
 {{indent}}                                std::array<const void*, 1>{static_cast<ck::half_t *>(bias_ptr)},
 {% elif gemm_flag == "permute_m2n3" %}
@@ -203,7 +227,7 @@
                                                                     static_cast<ck::half_t *>(d1_ptr)},
 {% endif %}
 {% endif %}
-{{indent}}                                static_cast<ck::half_t *>(out_ptr),
+{{indent}}                                static_cast<ck::half_t *>(out_ptr) + offset_c,
 {% if gemm_flag not in ["permute_m2n3", "bias_permute_m2n3", "bias_permute_m3n2"]  %}
 {{indent}}                                M,
 {{indent}}                                N,
@@ -214,6 +238,9 @@
 {% if gemm_flag == "bias_permute" %}
 {{indent}}                                {M0, M1, M2, N0, N1, stride_D_M0, stride_D_M1, stride_D_M2, stride_D_N0, stride_D_N1},
 {{indent}}                                {M0, M1, M2, N0, N1, stride_E_M0, stride_E_M1, stride_E_M2, stride_E_N0, stride_E_N1},
+{% elif gemm_flag == "permute" %}
+{{indent}}                                {},
+{{indent}}                                {M0, M1, M2, N0, N1, stride_E_M0, stride_E_M1, stride_E_M2, stride_E_N0, stride_E_N1},
 {% elif gemm_flag in ["permute_m2n3", "bias_permute_m2n3", "bias_permute_m3n2"]  %}
 {{indent}}                                a_ms_ks_lengths,
 {{indent}}                                a_ms_ks_strides,
@@ -242,7 +269,7 @@
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{},
 {% if gemm_flag == "" %}
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{}
-{% elif gemm_flag == "permute_m2n3" %}
+{% elif gemm_flag in ["permute", "permute_m2n3"] %}
 {{indent}}                                ck::tensor_operation::element_wise::PassThrough{}
 {% elif gemm_flag == "bias" or "bias_permute" in gemm_flag %}
 {{indent}}                                ck::tensor_operation::element_wise::Add{}
@@ -251,6 +278,8 @@
 {% elif gemm_flag == "bias_fast_gelu" %}
 {{indent}}                                ck::tensor_operation::element_wise::AddFastGelu{}
 {% elif gemm_flag == "bias_swish" %}
+{{indent}}                                ck::tensor_operation::element_wise::AddSwish{}
+{% elif gemm_flag == "bias_hardswish" %}
 {{indent}}                                ck::tensor_operation::element_wise::AddHardswish{}
 {% elif gemm_flag == "bias_tanh" %}
 {{indent}}                                ck::tensor_operation::element_wise::AddTanh{}
@@ -449,8 +478,9 @@
     {{func_call}}
   }
   timer->End();
-  std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
-  std::cout << "TIME:" << timer->GetElapsedTime() << std::endl;
+  std::cout << "OP:" << "{{op_name}}" << ",";
+  std::cout << "TIME:" << timer->GetElapsedTime() << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
   delete(timer);
 }
 """
@@ -463,7 +493,7 @@
   void *,
   void *,
   void *,
-{% if "bias" in gemm_flag %}
+{% if "bias" in gemm_flag or gemm_flag == "add" %}
   void *,
 {% endif %}
 {% if has_d0 %}
@@ -608,7 +638,7 @@ def gen_profiler(
     op_instance = func_attrs["op_instance"]
     # shape function
     op_func_shape = gemm_common.gen_shape_eval_code(
-        indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
+        indent=2, dtype="ck::index_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
 
     adims = ["&a_dim" + str(i) for i in range(ndims)]
@@ -621,6 +651,7 @@ def gen_profiler(
     file_pairs = []
     has_d0_flag = has_d0(func_attrs)
     has_d1_flag = has_d1(func_attrs)
+
     for op_name, op in op_instance.items():
         config = emit_instance(op)
         config_name = extract_config_name(config)
@@ -685,6 +716,7 @@ def gen_profiler(
             args_parse=args_parse,
             tensor_decl=tensor_decl,
             func_call=func_call,
+            op_name=op_name,
         )
         prefix = os.path.join(workdir, "profiler", op_type)
         if not os.path.exists(prefix):
@@ -769,7 +801,7 @@ def gen_function(
 
     extra_shape_func = extra_shape_template.render(indent="  ")
     shape_eval_func = gemm_common.gen_shape_eval_code(
-        indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True
+        indent=1, dtype="ck::index_t", dim_info_dict=dim_info_dict, is_ptr=True
     )
     exec_paths = ""
     for key, _ in instances.items():
@@ -786,6 +818,13 @@ def gen_function(
             problem_args=problem_args,
             is_profiler=False,
         )
+        has_dynamic_shape = False
+        for inp in func_attrs["inputs"]:
+            for dim in inp.shape():
+                if isinstance(dim, IntVar):
+                    has_dynamic_shape = True
+        if has_dynamic_shape:
+            key = "true"
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
     extra_header = extra_header_template.render(
@@ -858,7 +897,7 @@ def gen_function_call(func_attrs, indent="  ", gemm_flag=""):
     b = func_attrs["inputs"][1]
     c = func_attrs["outputs"][0]
     bias_ptr = ""
-    if "bias" in gemm_flag:
+    if "bias" in gemm_flag or gemm_flag == "add":
         bias = func_attrs["inputs"][2]
         bias_ptr = bias._attrs["name"]
     d0_ptr = ""
@@ -966,4 +1005,15 @@ def fproc_f16(op):
             c_layout=c_layout,
         )
 
+    has_dynamic_shape = False
+    for inp in func_attrs["inputs"]:
+        for dim in inp.shape():
+            if isinstance(dim, IntVar):
+                has_dynamic_shape = True
     func_attrs["op_instance"] = extract_config(op_kind, extra_kind, fproc_f16)
+    if has_dynamic_shape:
+        filtered_op_instance = {}
+        for op_name, op in func_attrs["op_instance"].items():
+            if "Padding" in emit_instance(op):
+                filtered_op_instance[op_name] = op
+        func_attrs["op_instance"] = filtered_op_instance
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
index 530196408..20316c33a 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr.py
@@ -90,7 +90,19 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.gemm_rcr.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
index d092ae3c1..579d0f395 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py
@@ -90,7 +90,19 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "bias")
+    return common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "bias",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.gemm_rcr_bias.func_decl")
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
index c567a649e..2cfd2aabc 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -127,6 +127,13 @@ def gen_function(
         dim_info_dict,
         "bias_add",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
index 58527f1b0..0b4919619 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -127,6 +127,13 @@ def gen_function(
         dim_info_dict,
         "bias_add_add",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
index 8c5d20de8..afa9723f7 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -128,6 +128,13 @@ def gen_function(
         dim_info_dict,
         "bias_add_add_relu",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
index 18e179eca..998798618 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -128,6 +128,13 @@ def gen_function(
         dim_info_dict,
         "bias_add_relu",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
index ed4b039df..4822664c3 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py
@@ -26,6 +26,7 @@
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.config")
+@registry.reg("rocm.gemm_rcr_bias_gelu.config")
 def gemm_config(func_attrs, dtype="float16"):
     """Extract (operation name, operation instance) pair from
     all operation candidates.
@@ -49,6 +50,7 @@ def gemm_config(func_attrs, dtype="float16"):
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.gen_profiler")
+@registry.reg("rocm.gemm_rcr_bias_gelu.gen_profiler")
 def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
     """Generates standalone executables for profiler.
 
@@ -72,6 +74,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.gen_function")
+@registry.reg("rocm.gemm_rcr_bias_gelu.gen_function")
 def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     """Generates function body.
 
@@ -95,10 +98,18 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
         exec_cond_template,
         dim_info_dict,
         "bias_fast_gelu",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.func_decl")
+@registry.reg("rocm.gemm_rcr_bias_gelu.func_decl")
 def gemm_gen_function_decl(func_attrs):
     """Generates function declarations.
 
@@ -117,6 +128,7 @@ def gemm_gen_function_decl(func_attrs):
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.func_call")
+@registry.reg("rocm.gemm_rcr_bias_gelu.func_call")
 def gemm_gen_function_call(func_attrs, indent="  "):
     """Generates function call.
 
@@ -136,6 +148,7 @@ def gemm_gen_function_call(func_attrs, indent="  "):
 
 
 @registry.reg("rocm.gemm_rcr_bias_fast_gelu.filter")
+@registry.reg("rocm.gemm_rcr_bias_gelu.filter")
 def gemm_function_filter(cfg, func_attrs, x_shape):
     """Generates function filter.
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py
new file mode 100644
index 000000000..4ecade28f
--- /dev/null
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_hardswish.py
@@ -0,0 +1,164 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM ROCM backend for A[RowMajor], B[ColumnMajor], C[RowMajor], i.e.
+c[m, n] = swish(a[m, k] * b[n, k] + bias[n])
+This is used for `torch.nn.functional.linear + swish`
+When used for `linear`, need to set A->Data, B->Weight, C->Bias
+"""
+
+from ... import registry
+from . import common
+from .layout import RCR
+
+# pylint: disable=C0415,W0613
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.config")
+def gemm_config(func_attrs, dtype="float16"):
+    """Extract (operation name, operation instance) pair from
+    all operation candidates.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    Dict
+        Extracted (operation name, operation instance) pair
+        from all operation candidates.
+    """
+    import ck_lib  # noqa: F401
+
+    op_kind = ck_lib.library.GemmKind.Gemm
+    extra_kind = ck_lib.library.TensorOperation.AddHardswish
+    common.make_fproc_f16(func_attrs, RCR, op_kind, extra_kind)
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.gen_profiler")
+def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
+    """Generates standalone executables for profiler.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    workdir : str
+        Directory to store the generated outputs.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from gemm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+    """
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        dim_info_dict=dim_info_dict,
+        args_parse=RCR.args_parse,
+        gemm_flag="bias_hardswish",
+    )
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.gen_function")
+def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
+    """Generates function body.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+    exec_cond_template : jinja2.Template
+        Generates if statement to execute kernel.
+    dim_info_dict: Dict[str, DimInfo]
+        Generated from gemm._extract_dims().
+        Used to store mapping between dim_names to input / output tensor dims.
+
+    Returns
+    -------
+    str
+        The rendered template of generated function body.
+    """
+    return common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "bias_hardswish",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.func_decl")
+def gemm_gen_function_decl(func_attrs):
+    """Generates function declarations.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Operation attributes.
+
+    Returns
+    -------
+    str
+        The rentered template of function declaration.
+    """
+    func_name = func_attrs["name"]
+    return common.gen_function_decl(func_name=func_name, gemm_flag="bias_hardswish")
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.func_call")
+def gemm_gen_function_call(func_attrs, indent="  "):
+    """Generates function call.
+
+    Parameters
+    ----------
+    func_attrs : Dict
+        Stores the operation attributes.
+    indent : str, optional
+        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".
+
+    Returns
+    -------
+    str
+        The rendered template of generated function call.
+    """
+    return common.gen_function_call(func_attrs, indent, gemm_flag="bias_hardswish")
+
+
+@registry.reg("rocm.gemm_rcr_bias_swish.filter")
+def gemm_function_filter(cfg, func_attrs, x_shape):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return True
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
index 914c36c1e..9d7108d1e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -127,6 +127,13 @@ def gen_function(
         dim_info_dict,
         "bias_mul",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
index f013f3758..51fe1c11f 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py
@@ -26,7 +26,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -98,6 +98,13 @@ def gen_function(
         dim_info_dict,
         "bias_mul_add",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
index a34fe1952..a4e0d1991 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -130,6 +130,13 @@ def gen_function(
         dim_info_dict,
         "bias_mul_tanh",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
index ac5bbc6cc..85fab9657 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py
@@ -18,11 +18,35 @@
 This is used for `torch.nn.functional.linear`
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
+import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.rocm.gemm import common, permute_common
 from aitemplate.backend.rocm.gemm.layout import RCR
 
 
+ARGS_PARSER_TEMPLATE = jinja2.Template(
+    """
+  int64_t M = std::stoi(argv[1]);
+  int64_t N = std::stoi(argv[2]);
+  int64_t K = std::stoi(argv[3]);
+  int64_t split_k = std::atoi(argv[4]);
+  int64_t G1 = std::atoi(argv[5]);
+  int64_t G2 = std::atoi(argv[6]);
+  int64_t G3 = std::atoi(argv[7]);
+  int64_t a_dim0 = M;
+  int64_t a_dim1 = K;
+  int64_t b_dim0 = N;
+  int64_t b_dim1 = K;
+  int64_t c_dim0 = M;
+  int64_t c_dim1 = N;
+  int64_t p_dim0 = G1;
+  int64_t p_dim1 = G2;
+  int64_t p_dim2 = G3;
+"""
+)
+
+
 @registry.reg("rocm.gemm_rcr_bias_permute.config")
 def gemm_config(func_attrs, dtype="float16"):
     """Extract (operation name, operation instance) pair from
@@ -64,7 +88,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         func_attrs=func_attrs,
         workdir=workdir,
         dim_info_dict=dim_info_dict,
-        args_parse=RCR.args_parse,
+        args_parse=ARGS_PARSER_TEMPLATE.render(),
         gemm_flag="bias_permute",
         extra_code="const int G1={}, G2={}, G3={};".format(
             func_attrs["shape"][0],
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
index 9725d980d..1e744128e 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py
@@ -91,7 +91,17 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
         The rendered template of generated function body.
     """
     return common.gen_function(
-        func_attrs, exec_cond_template, dim_info_dict, "bias_relu"
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "bias_relu",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
index f9028a005..f0943c5f7 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py
@@ -28,7 +28,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -143,6 +143,13 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
         dim_info_dict,
         "bias_sigmoid",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
index 147e3ec4f..49d70d02f 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -129,6 +129,13 @@ def gen_function(
         dim_info_dict,
         "bias_sigmoid_mul",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
index 24a427528..802bf22b2 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py
@@ -27,7 +27,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -132,6 +132,13 @@ def gen_function(
         dim_info_dict,
         "bias_sigmoid_mul_tanh",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
index e3a19c86d..218a6e6b5 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py
@@ -19,11 +19,53 @@
 When used for `linear`, need to set A->Data, B->Weight, C->Bias
 """
 
+import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.rocm.gemm import common
 from aitemplate.backend.rocm.gemm.layout import RCR
 
 # pylint: disable=C0415,W0613
+EXTRA_CODE = jinja2.Template(
+    """
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+namespace {
+struct AddSwish
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        const float a = x0 + x1;
+        y             = a / (1.0f + exp(-a));
+    };
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        const double a = x0 + x1;
+        y              = a / (1.0 + exp(-a));
+    };
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        const half_t a = x0 + x1;
+        y              = a / (type_convert<half_t>(1.0) + type_convert<half_t>(exp(ck::type_convert<float>(-a))));
+    };
+};
+} // namespace
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
+"""
+)
 
 
 @registry.reg("rocm.gemm_rcr_bias_swish.config")
@@ -45,7 +87,7 @@ def gemm_config(func_attrs, dtype="float16"):
     import ck_lib  # noqa: F401
 
     op_kind = ck_lib.library.GemmKind.Gemm
-    extra_kind = ck_lib.library.TensorOperation.AddHardswish
+    extra_kind = ck_lib.library.TensorOperation.AddSwish
     common.make_fproc_f16(func_attrs, RCR, op_kind, extra_kind)
 
 
@@ -69,6 +111,7 @@ def gemm_gen_profiler(func_attrs, workdir, dim_info_dict):
         dim_info_dict=dim_info_dict,
         args_parse=RCR.args_parse,
         gemm_flag="bias_swish",
+        extra_code=EXTRA_CODE.render(),
     )
 
 
@@ -96,6 +139,14 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
         exec_cond_template,
         dim_info_dict,
         "bias_swish",
+        extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
index acc6b1ca3..804b47505 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py
@@ -28,7 +28,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -145,6 +145,13 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
         dim_info_dict,
         "bias_tanh",
         extra_code=EXTRA_CODE.render(),
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
     )
 
 
diff --git a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
index dd6beb088..c67848258 100644
--- a/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
+++ b/python/aitemplate/backend/rocm/gemm/gemm_rrr.py
@@ -90,7 +90,19 @@ def gemm_gen_function(func_attrs, exec_cond_template, dim_info_dict):
     str
         The rendered template of generated function body.
     """
-    return common.gen_function(func_attrs, exec_cond_template, dim_info_dict, "")
+    return common.gen_function(
+        func_attrs,
+        exec_cond_template,
+        dim_info_dict,
+        "",
+        input_addr_calculator=common.INPUT_ADDR_CALCULATOR.render(
+            accessor_a=func_attrs["input_accessors"][0],
+            accessor_b=func_attrs["input_accessors"][1],
+        ),
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            output_accessor=func_attrs["output_accessors"][0]
+        ),
+    )
 
 
 @registry.reg("rocm.gemm_rrr.func_decl")

From 9ee885cd6fcd2b27ab980e8f79b03c129764080a Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Thu, 15 Jun 2023 15:55:50 -0700
Subject: [PATCH 595/638] add attention backend (#741)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/741

Reviewed By: chenyang78

Differential Revision: D46674711

Pulled By: ipiszy

fbshipit-source-id: 4ce61fff6a46c5efe6e6c2325d9cf18e6a8dd192
---
 .../backend/rocm/attention/__init__.py        |  18 +
 .../rocm/attention/mem_eff_attention.py       | 365 ++++++++++++++++++
 2 files changed, 383 insertions(+)
 create mode 100644 python/aitemplate/backend/rocm/attention/__init__.py
 create mode 100644 python/aitemplate/backend/rocm/attention/mem_eff_attention.py

diff --git a/python/aitemplate/backend/rocm/attention/__init__.py b/python/aitemplate/backend/rocm/attention/__init__.py
new file mode 100644
index 000000000..0be5c075d
--- /dev/null
+++ b/python/aitemplate/backend/rocm/attention/__init__.py
@@ -0,0 +1,18 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from aitemplate.backend.rocm.attention import mem_eff_attention
+
+__all__ = ["mem_eff_attention"]
diff --git a/python/aitemplate/backend/rocm/attention/mem_eff_attention.py b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
new file mode 100644
index 000000000..f902792c1
--- /dev/null
+++ b/python/aitemplate/backend/rocm/attention/mem_eff_attention.py
@@ -0,0 +1,365 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+attention kernel codegen for ROCM.
+"""
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+# pylint: disable=C0301
+
+FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<int*>({{name}})")
+
+FUNC_CALL_FP32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast<float*>({{name}})")
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "logging.h"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using MaskingSpecialization = ck::tensor_operation::device::MaskingSpecialization;
+
+static constexpr auto MaskingSpec_default = 
+    MaskingSpecialization::MaskDisabled;
+static constexpr auto MaskingSpec_causal =
+    MaskingSpecialization::MaskOutUpperTriangle;
+
+using F32 = float;
+using InputType = {{elem_input_type}};
+
+using ADataType        = InputType;
+using B0DataType       = InputType;
+using B1DataType       = InputType;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = InputType;
+using Acc0BiasDataType = ck::Tuple<>;
+using Acc1BiasDataType = ck::Tuple<>;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 1;
+static constexpr ck::index_t NumDimN = 1;
+static constexpr ck::index_t NumDimK = 1;
+static constexpr ck::index_t NumDimO = 1;
+
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+static constexpr auto TensorSpecA  = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;
+
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        NumDimO,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        Acc0BiasDataType,
+        Acc1BiasDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        Acc0ElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        TensorSpecA,
+        TensorSpecB0,
+        TensorSpecB1,
+        TensorSpecC,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+{% if is_causal %}
+        MaskingSpec_causal
+{% else %}
+        MaskingSpec_default
+{% endif %}
+    >;   
+
+{{func_signature}}
+{
+    bool input_permute = true;
+    bool output_permute = true;
+    
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{softmax_scale};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    std::vector<typename DeviceGemmInstance::ProblemDesc> problem_descs;
+
+    const char* q_ptr = reinterpret_cast<const char*>(q);
+    const char* k_ptr = reinterpret_cast<const char*>(k);
+    const char* v_ptr = reinterpret_cast<const char*>(v);
+    char* output_ptr = reinterpret_cast<char*>(output);
+
+    std::vector<const void*> q_ptrs;
+    std::vector<const void*> k_ptrs;
+    std::vector<const void*> v_ptrs;
+    std::vector<void*> output_ptrs;
+
+    for(int64_t i = 0; i < batch_size ; i++){
+        int M = seqlens[i];
+        int N = seqlens[i];
+        int K = head_dim;
+        int O = head_dim;
+        int G0 = 1;
+        int G1 = num_heads;
+
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides =
+            input_permute
+                ? std::vector<ck::index_t>{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K]
+                : std::vector<ck::index_t>{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K]
+
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides =
+            input_permute
+                ? std::vector<ck::index_t>{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K]
+                : std::vector<ck::index_t>{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K]
+
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides =
+            input_permute
+                ? std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O]
+                : std::vector<ck::index_t>{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O]
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides =
+            output_permute
+                ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
+                : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
+
+        problem_descs.push_back({a_gs_ms_ks_lengths,
+                                 a_gs_ms_ks_strides,
+                                 b0_gs_ns_ks_lengths,
+                                 b0_gs_ns_ks_strides,
+                                 b1_gs_os_ns_lengths,
+                                 b1_gs_os_ns_strides,
+                                 c_gs_ms_os_lengths,
+                                 c_gs_ms_os_strides,
+                                 {},   // acc0_biases_gs_ms_ns_lengths
+                                 {},   // acc0_biases_gs_ms_ns_strides
+                                 {},   // acc1_biases_gs_ms_os_lengths
+                                 {}}); // acc1_biases_gs_ms_os_strides
+
+        auto offset = K * G1 * M * sizeof(InputType);
+        q_ptrs.push_back(reinterpret_cast<const void*>(q_ptr)); 
+        q_ptr += offset;                              
+        k_ptrs.push_back(reinterpret_cast<const void*>(k_ptr));   
+        k_ptr += offset;                            
+        v_ptrs.push_back(reinterpret_cast<const void*>(v_ptr));
+        v_ptr += offset;                               
+        output_ptrs.push_back(reinterpret_cast<void*>(output_ptr)); 
+        output_ptr += offset;                              
+    }
+
+    // do GEMM
+    auto gemm     = DeviceGemmInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(q_ptrs,
+                                      k_ptrs,
+                                      v_ptrs,
+                                      output_ptrs,
+                                      {}, // p_acc0_biases
+                                      {}, // p_acc1_biases
+                                      problem_descs,
+                                      a_element_op,
+                                      b0_element_op,
+                                      acc0_element_op,
+                                      b1_element_op,
+                                      c_element_op);
+
+    // specify workspace for problem_desc
+
+    gemm.SetWorkSpacePointer(&argument, workspace);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        LOG(FATAL) << "wrong! " << gemm.GetTypeString() << " with the specified compilation parameters does not support this Embedding problem.";
+    }
+
+    invoker.Run(argument, StreamConfig{stream, false});
+}
+    """
+)
+
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+void {{func_name}}(void* output,
+                   const void* q,
+                   const void* k,
+                   const void* v,
+                   const int* seqlens,
+                   const int max_seqlen,
+                   int64_t batch_size,
+                   int num_heads,
+                   int head_dim,
+                   float softmax_scale,
+                   void* workspace,
+                   hipStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}   {{output}}, {{q}}, {{k}}, {{v}}, {{seqlens}},
+{{indent}}    {{max_seqlen}}, {{batch_size}},
+{{indent}}    {{num_heads}},
+{{indent}}    {{head_dim}},
+{{indent}}    {{softmax_scale}},
+{{indent}}    global_workspace_,
+{{indent}}    stream /* default stream */
+{{indent}});
+    """
+)
+
+
+@registry.reg("rocm.mem_eff_attention.gen_function")
+def mem_eff_attention_gen_function(func_attrs: Dict[str, Any]) -> str:
+    """the function for generating attention kernel"""
+    backend_spec = ROCMSpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    is_causal = func_attrs["causal"]
+    return FUNC_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        is_causal=is_causal,
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+    )
+
+
+@registry.reg("rocm.mem_eff_attention.func_decl")
+def mem_eff_attention_gen_function_decl(func_attrs: Dict[str, Any]):
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+@registry.reg("rocm.mem_eff_attention.func_call")
+def mem_eff_attention_gen_function_call(func_attrs, indent="  "):
+    """the function for generating a function call for attention"""
+    assert len(func_attrs["outputs"]) == 1
+    assert len(func_attrs["inputs"]) in [4, 5]
+
+    output_name = func_attrs["outputs"][0]._attrs["name"]
+
+    q_name = func_attrs["inputs"][0]._attrs["name"]
+    k_name = func_attrs["inputs"][1]._attrs["name"]
+    v_name = func_attrs["inputs"][2]._attrs["name"]
+
+    seqlens_name = FUNC_CALL_INT32_PARAM_TEMPLATE.render(
+        name=func_attrs["inputs"][3]._attrs["name"]
+    )
+
+    q = func_attrs["inputs"][0]
+
+    batch_size = func_attrs["inputs"][3].shape()[0]._attrs["name"]
+    num_heads = q._attrs["shape"][1]._attrs["values"][0]
+    max_seqlen = q._attrs["shape"][0].upper_bound() // 16
+    head_dim = q._attrs["shape"][3]._attrs["values"][0]
+
+    softmax_scale = head_dim ** (-0.5)
+
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        q=q_name,
+        k=k_name,
+        v=v_name,
+        seqlens=seqlens_name,
+        max_seqlen=max_seqlen,
+        batch_size=batch_size,
+        num_heads=num_heads,
+        head_dim=head_dim,
+        softmax_scale=softmax_scale,
+        indent=indent,
+    )

From e98d2dd7761f840a27e0ffbf9261a4c9f4cc4444 Mon Sep 17 00:00:00 2001
From: Henry Hu <hhh@meta.com>
Date: Thu, 15 Jun 2023 16:53:28 -0700
Subject: [PATCH 596/638] Add bf16 support to upsampling2d nearest (#750)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/750

Reviewed By: terrychenism, aakhundov

Differential Revision: D46504544

fbshipit-source-id: 662bc7d84db27969c972d50a7793c47ec3547ebb
---
 .../backend/cuda/upsample/upsampling2d.py     |  3 ++
 ...st_upsamping2d.py => test_upsampling2d.py} | 54 +++++++++----------
 ...ping2d_add.py => test_upsampling2d_add.py} |  0
 3 files changed, 28 insertions(+), 29 deletions(-)
 rename tests/unittest/ops/{test_upsamping2d.py => test_upsampling2d.py} (69%)
 rename tests/unittest/ops/{test_upsamping2d_add.py => test_upsampling2d_add.py} (100%)

diff --git a/python/aitemplate/backend/cuda/upsample/upsampling2d.py b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
index ebb30dab9..d25912eab 100644
--- a/python/aitemplate/backend/cuda/upsample/upsampling2d.py
+++ b/python/aitemplate/backend/cuda/upsample/upsampling2d.py
@@ -24,9 +24,12 @@
 
 
 Header_Files = """
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include "cutlass/util/host_tensor.h"
+
+using bfloat16 = __nv_bfloat16;
 """
 
 
diff --git a/tests/unittest/ops/test_upsamping2d.py b/tests/unittest/ops/test_upsampling2d.py
similarity index 69%
rename from tests/unittest/ops/test_upsamping2d.py
rename to tests/unittest/ops/test_upsampling2d.py
index 2c4e88660..702fd6bd5 100644
--- a/tests/unittest/ops/test_upsamping2d.py
+++ b/tests/unittest/ops/test_upsampling2d.py
@@ -19,7 +19,12 @@
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, nn, Tensor
 from aitemplate.testing import detect_target
-from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    get_random_torch_tensor,
+    TestEnv,
+)
+from parameterized import parameterized
 
 
 _DEFAULT_BATCH_SIZE = [1, 3]
@@ -60,38 +65,29 @@ def _test_single_op(
             y_transpose = torch.permute(y, (0, 3, 1, 2))
             self.assertTrue(torch.allclose(Y_pt, y_transpose, atol=1e-2, rtol=1e-2))
 
-    def test_bilinear_upsample_fp16(self):
-        self._test_single_op(
-            scale_factor=3.5,
-            mode="bilinear",
-            test_name="bilinear_upsampling2d_fp16",
-            dtype="float16",
-        )
-
-    def test_nearest_upsample_fp16(self):
-        self._test_single_op(
-            scale_factor=2.0,
-            mode="nearest",
-            test_name="nearest_upsampling2d_fp16",
-            dtype="float16",
+    @parameterized.expand(
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [("float16"), ("float32")],
+                TestEnv.CUDA_SM80: [("bfloat16")],
+                TestEnv.ROCM: [("float16")],
+            }
         )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    def test_bilinear_upsample_fp32(self):
-        self._test_single_op(
-            scale_factor=3.5,
-            mode="bilinear",
-            test_name="bilinear_upsampling2d_fp32",
-            dtype="float32",
-        )
-
-    @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
-    def test_nearest_upsample_fp32(self):
+    )
+    def test_upsampling2d_constructor(self, ait_dtype):
+        # Currently upsampling2d bilinear does not support bfloat16.
+        if ait_dtype != "bfloat16":
+            self._test_single_op(
+                scale_factor=3.5,
+                mode="bilinear",
+                test_name=f"bilinear_upsampling2d_{ait_dtype}",
+                dtype=ait_dtype,
+            )
         self._test_single_op(
             scale_factor=2.0,
             mode="nearest",
-            test_name="nearest_upsampling2d_fp32",
-            dtype="float32",
+            test_name=f"nearest_upsampling2d_{ait_dtype}",
+            dtype=ait_dtype,
         )
 
 
diff --git a/tests/unittest/ops/test_upsamping2d_add.py b/tests/unittest/ops/test_upsampling2d_add.py
similarity index 100%
rename from tests/unittest/ops/test_upsamping2d_add.py
rename to tests/unittest/ops/test_upsampling2d_add.py

From 4f5387946b5538f356dc0cc0bc221ef7ddc2c80d Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 16 Jun 2023 11:31:33 +0800
Subject: [PATCH 597/638] fix a bug

---
 python/aitemplate/compiler/transform/profile.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index c9091bf9c..d6b623821 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -109,7 +109,6 @@ def profile(
     timeout = 2400 if Target.current().name() == "rocm" else 240
     profiler_runner = ProfilerRunner(
         devices,
-        timeout=timeout,
         postprocessing_delegate=GemmProfilerPostprocessingDelegate(),
         timeout=timeout,
     )

From 908d861f675b0f179db8f1bcdf9f5f8f7e7d0ae4 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Sun, 18 Jun 2023 18:15:15 -0700
Subject: [PATCH 598/638] Add jagged_lengths_to_offsets op (#766)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/766

New operator for converting [B]-shaped 1D `lenghts` tensor to the [B+1]-shaped 1D `offsets` tensor used in the jagged tensor context.

Reviewed By: chenyang78

Differential Revision: D46685031

fbshipit-source-id: ded931449b5701cd911911277e89bf48ab0a7934
---
 python/aitemplate/backend/cuda/__init__.py    |   1 +
 .../backend/cuda/jagged/__init__.py           |  22 +++
 .../cuda/jagged/jagged_lengths_to_offsets.py  | 149 ++++++++++++++++++
 python/aitemplate/compiler/ops/__init__.py    |   1 +
 .../compiler/ops/jagged/__init__.py           |  21 +++
 .../ops/jagged/jagged_lengths_to_offsets.py   |  86 ++++++++++
 .../ops/test_jagged_lengths_to_offsets.py     | 101 ++++++++++++
 7 files changed, 381 insertions(+)
 create mode 100644 python/aitemplate/backend/cuda/jagged/__init__.py
 create mode 100644 python/aitemplate/backend/cuda/jagged/jagged_lengths_to_offsets.py
 create mode 100644 python/aitemplate/compiler/ops/jagged/__init__.py
 create mode 100644 python/aitemplate/compiler/ops/jagged/jagged_lengths_to_offsets.py
 create mode 100644 tests/unittest/ops/test_jagged_lengths_to_offsets.py

diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py
index 84bbb63c9..ac88e3679 100644
--- a/python/aitemplate/backend/cuda/__init__.py
+++ b/python/aitemplate/backend/cuda/__init__.py
@@ -31,6 +31,7 @@
 from aitemplate.backend.cuda.gemm_special import *
 from aitemplate.backend.cuda.gemm_universal import *
 from aitemplate.backend.cuda.gemm_epilogue_vistor import *
+from aitemplate.backend.cuda.jagged import *
 from aitemplate.backend.cuda.layernorm_sigmoid_mul import *
 from aitemplate.backend.cuda.padding import *
 from aitemplate.backend.cuda.pool2d import *
diff --git a/python/aitemplate/backend/cuda/jagged/__init__.py b/python/aitemplate/backend/cuda/jagged/__init__.py
new file mode 100644
index 000000000..d2fc023b9
--- /dev/null
+++ b/python/aitemplate/backend/cuda/jagged/__init__.py
@@ -0,0 +1,22 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+CUDA jagged tensor-specific ops module init
+"""
+from aitemplate.backend.cuda.jagged import jagged_lengths_to_offsets
+
+__all__ = [
+    "jagged_lengths_to_offsets",
+]
diff --git a/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_offsets.py b/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_offsets.py
new file mode 100644
index 000000000..d47e8925e
--- /dev/null
+++ b/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_offsets.py
@@ -0,0 +1,149 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for the jagged_lengths_to_offsets op.
+"""
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <stdexcept>
+
+#include <cub/cub.cuh>
+
+void {{func_name}}(
+    const void* lengths,
+    void* offsets,
+    {{index_type}} batch_size,
+    {{index_type}}* offsets_size,
+    void* workspace,
+    cudaStream_t stream
+) {
+    *offsets_size = batch_size + 1;
+
+    // pre-initialize all offset values to zero
+    cudaMemsetAsync(offsets, 0, (*offsets_size) * sizeof({{offsets_type}}), stream);
+
+    // no-op call to determine the temp storage size;
+    // although we don't need this call (because the workspace
+    // is pre-allocated to a sufficiently large size), unless
+    // the exact size determined by it is passed to the
+    // following call, it won't perform any computation
+    size_t temp_storage_bytes = 0;
+    cub::DeviceScan::InclusiveSum(
+        nullptr,
+        temp_storage_bytes,
+        reinterpret_cast<const {{offsets_type}}*>(lengths),
+        reinterpret_cast<{{offsets_type}}*>(offsets) + 1,
+        (int)batch_size,
+        stream
+    );
+
+    if (temp_storage_bytes > {{workspace_size}}) {
+        throw std::runtime_error("Pre-allocated workspace size ({{workspace_size}} bytes) is too small.");
+    }
+
+    // compute the actual offsets, starting from the offsets[1]
+    cub::DeviceScan::InclusiveSum(
+        workspace,
+        temp_storage_bytes,
+        reinterpret_cast<const {{offsets_type}}*>(lengths),
+        reinterpret_cast<{{offsets_type}}*>(offsets) + 1,
+        (int)batch_size,
+        stream
+    );
+}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    const void*,      /* lengths */
+    void*,            /* offsets */
+    {{index_type}},   /* batch_size */
+    {{index_type}}*,  /* offsets_size */
+    void*,            /* workspace */
+    cudaStream_t      /* stream */
+);
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{lengths}},
+{{indent}}    {{offsets}},
+{{indent}}    {{batch_size}},
+{{indent}}    &{{offsets_size}},
+{{indent}}    global_workspace_,
+{{indent}}    stream
+{{indent}});
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+
+@registry.reg("cuda.jagged_lengths_to_offsets.gen_function")
+def jagged_lengths_to_offsets_gen_function(func_attrs):
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    offsets = func_attrs["outputs"][0]
+    offsets_type = backend_spec.dtype_to_backend_type(offsets.dtype())
+
+    return SRC_TEMPLATE.render(
+        func_name=func_name,
+        index_type=backend_spec.index_type,
+        offsets_type=offsets_type,
+        workspace_size=func_attrs["workspace"],
+    )
+
+
+@registry.reg("cuda.jagged_lengths_to_offsets.func_decl")
+def jagged_lengths_to_offsets_gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("cuda.jagged_lengths_to_offsets.func_call")
+def jagged_lengths_to_offsets_gen_function_call(func_attrs, indent="  "):
+    func_name = func_attrs["name"]
+    lengths = func_attrs["inputs"][0]
+    offsets = func_attrs["outputs"][0]
+    batch_size = lengths.shape()[0]
+    offsets_size = offsets.shape()[0]
+
+    return FUNC_CALL_TEMPLATE.render(
+        indent="      ",
+        func_name=func_name,
+        lengths=lengths._attrs["name"],
+        offsets=offsets._attrs["name"],
+        batch_size=batch_size._attrs["name"],
+        offsets_size=offsets_size._attrs["name"],
+    )
diff --git a/python/aitemplate/compiler/ops/__init__.py b/python/aitemplate/compiler/ops/__init__.py
index d9cfc6d7a..8752001a4 100644
--- a/python/aitemplate/compiler/ops/__init__.py
+++ b/python/aitemplate/compiler/ops/__init__.py
@@ -22,6 +22,7 @@
 from aitemplate.compiler.ops.gemm_special import *
 from aitemplate.compiler.ops.gemm_universal import *
 from aitemplate.compiler.ops.gemm_epilogue_vistor import *
+from aitemplate.compiler.ops.jagged import *
 from aitemplate.compiler.ops.layernorm import *
 from aitemplate.compiler.ops.padding import *
 from aitemplate.compiler.ops.pool import *
diff --git a/python/aitemplate/compiler/ops/jagged/__init__.py b/python/aitemplate/compiler/ops/jagged/__init__.py
new file mode 100644
index 000000000..a46942249
--- /dev/null
+++ b/python/aitemplate/compiler/ops/jagged/__init__.py
@@ -0,0 +1,21 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from aitemplate.compiler.ops.jagged.jagged_lengths_to_offsets import (
+    jagged_lengths_to_offsets,
+)
+
+__all__ = [
+    "jagged_lengths_to_offsets",
+]
diff --git a/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_offsets.py b/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_offsets.py
new file mode 100644
index 000000000..bde349977
--- /dev/null
+++ b/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_offsets.py
@@ -0,0 +1,86 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define jagged_lengths_to_offsets op
+"""
+from typing import List
+
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntVar, Operator, Tensor
+
+
+class jagged_lengths_to_offsets(Operator):
+    """
+    Given a 1D Tensor of lengths of the sequences in a jagged Tensor,
+    returns the corresponding 1D Tensor of offsets. The latter is the
+    inclusive sum of the lengths prepended by a zero.
+
+    Args:
+        lengths (Tensor): 1D Tensor of sequence lengths, [B]-shaped.
+    Returns:
+        offsets (Tensor): 1D Tensor of sequence offsets, [B+1]-shaped.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "jagged_lengths_to_offsets"
+        self._attrs["has_profiler"] = False
+
+    def _infer_shape(self, lengths: Tensor) -> List[IntVar]:
+        batch_size = lengths.shape()[0]
+        # the offsets are 1 element longer than the lengths
+        offsets_size = IntVar(
+            values=[
+                batch_size.lower_bound() + 1,
+                batch_size.upper_bound() + 1,
+            ]
+        )
+        return [offsets_size]
+
+    def __call__(
+        self,
+        lengths: Tensor,
+    ) -> Tensor:
+        if len(lengths.shape()) != 1:
+            raise ValueError(f"The lengths Tensor must be 1D, but got {lengths=}.")
+        if lengths._attrs["dtype"] not in ("int32", "int64"):
+            raise ValueError(
+                f"The lengths Tensor must be int32 or int64, but got {lengths=}."
+            )
+
+        self._attrs["inputs"] = [lengths]
+        self._set_depth()
+        output_shape = self._infer_shape(lengths)
+        offsets = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=lengths._attrs["dtype"],
+        )
+
+        # set the workspace to empirically determined large enough value
+        sizeof_dtype = 4 if lengths._attrs["dtype"] == "int32" else 8
+        self._attrs["workspace"] = max(
+            2**16,
+            16 * sizeof_dtype * offsets.shape()[0].upper_bound(),
+        )
+
+        self._attrs["outputs"] = [offsets]
+        return offsets
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_jagged_lengths_to_offsets.py b/tests/unittest/ops/test_jagged_lengths_to_offsets.py
new file mode 100644
index 000000000..98fbb0908
--- /dev/null
+++ b/tests/unittest/ops/test_jagged_lengths_to_offsets.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for the jagged_lengths_to_offsets op.
+"""
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+class JaggedLengthsToOffsetsTestCase(unittest.TestCase):
+    def _test_jagged_lengths_to_offsets(
+        self,
+        batch_size: int,
+        offsets_dtype: str = "int32",
+        test_suffix: str = "",
+    ):
+        LENGTHS = Tensor(
+            shape=[IntVar([1, batch_size], name="batch_size")],
+            name="lengths",
+            dtype=offsets_dtype,
+            is_input=True,
+        )
+
+        OFFSETS = ops.jagged_lengths_to_offsets()(LENGTHS)
+
+        OFFSETS._attrs["name"] = "offsets"
+        OFFSETS._attrs["is_output"] = True
+
+        model = compile_model(
+            [OFFSETS],
+            detect_target(),
+            "./tmp",
+            f"test_jagged_lengths_to_offsets_{test_suffix}",
+        )
+
+        torch_dtype = string_to_torch_dtype(offsets_dtype)
+
+        for seed in range(10):
+            torch.manual_seed(seed)
+            lengths_pt = torch.randint(
+                low=0,
+                high=1024,
+                size=(batch_size,),
+                dtype=torch_dtype,
+            )
+            offsets_pt = torch.cat(
+                [
+                    torch.zeros((1,), dtype=torch_dtype),
+                    torch.cumsum(lengths_pt, dim=0, dtype=torch_dtype),
+                ],
+            ).cuda()
+
+            offsets = torch.empty(
+                size=(batch_size + 1,),
+                dtype=torch_dtype,
+            ).cuda()
+            model.run_with_tensors(
+                inputs={"lengths": lengths_pt.cuda()},
+                outputs=[offsets],
+            )
+
+            torch.testing.assert_close(offsets, offsets_pt)
+
+    @parameterized.expand(
+        [
+            param(1, 1, "int32"),
+            param(2, 10, "int64"),
+            param(3, 16384, "int32"),
+            param(4, 65537, "int64"),
+        ]
+    )
+    def test_jagged_lengths_to_offsets(self, i, batch_size, offsets_dtype):
+        self._test_jagged_lengths_to_offsets(
+            batch_size=batch_size,
+            offsets_dtype=offsets_dtype,
+            test_suffix=str(i),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From a996ec613d68a814419cd18a22921a142e3c5d9e Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Mon, 19 Jun 2023 00:16:23 -0700
Subject: [PATCH 599/638] Grouped classic b2b bmm op - tuned version (#771)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/771

Tuned version of the jagged attention / b2b bmm op. In this version, it has has up to 38% better performance than our previously best Kernel ( FMHA ) for this task, for up to batch sizes of 256 and max. sequence lengths of 256.

Reviewed By: aakhundov

Differential Revision: D46721114

fbshipit-source-id: 84fac0cffe266706a887a5a51334cf9d8266196b
---
 .../cuda/b2b_bmm/grouped_classic_b2b_bmm.py   |    3 +-
 .../kernel/b2b_batched_gemm.h                 |   26 +-
 .../threadblock/b2b_mma_multistage.h          |  124 +-
 .../threadblock/default_b2b_mma.h             |    8 +-
 .../non_predicated_tile_access_iterator.h     | 1965 +++++++++++++++++
 static/include/kernels/kat_printf.h           |    1 +
 .../ops/test_grouped_classic_b2b_bmm.py       |   63 +-
 7 files changed, 2115 insertions(+), 75 deletions(-)
 create mode 100644 static/include/kernels/grouped_classic_b2b_bmm/threadblock/non_predicated_tile_access_iterator.h

diff --git a/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
index ce3ac19bc..cf5c3c521 100644
--- a/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
+++ b/python/aitemplate/backend/cuda/b2b_bmm/grouped_classic_b2b_bmm.py
@@ -39,7 +39,7 @@
 namespace {
 
 // Hardcode these sizes for now until we get profiling ready.
-constexpr int ThreadblockM = 64;
+constexpr int ThreadblockM = 128; // changed from 64 due to improved performance. More leads to errors.
 constexpr int ThreadblockK = 32;
 constexpr int WarpK = 32;
 constexpr int InstructionM = 16;
@@ -63,6 +63,7 @@
         "Function: {{function_name}}. "
         "max_seq_len: " + std::to_string(max_seq_len) +
         ", k0: " + std::to_string(k0) +
+        ", n0: " + std::to_string({{max_seq_len}}) +
         ", n1: " + std::to_string({{n1}}) + "."
       );
   }
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
index 83bb660fe..8d34c6689 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/kernel/b2b_batched_gemm.h
@@ -108,7 +108,6 @@ struct GroupedB2bGemmBatched {
     typename OutputOp0::Params output_op_0;
     typename OutputOp1::Params output_op_1;
     int gemm_k_iterations_0;
-    int gemm_k_iterations_1;
 
     // array of jagged dim offsets
     // of size batch_count + 1
@@ -183,8 +182,7 @@ struct GroupedB2bGemmBatched {
       batch_count(batch_count),
       num_heads(num_heads),
       offsets{offsets_},
-      gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK),
-      gemm_k_iterations_1((problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK) {}
+      gemm_k_iterations_0((problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK) {}
   };
 
   /// Shared memory storage structure
@@ -299,7 +297,7 @@ struct GroupedB2bGemmBatched {
 
       // early exit
       if ((threadblock_tile_offset.m() * B2bMma::Shape0::kM >= jagged_seq_len) or (threadblock_tile_offset.n() * B2bMma::Shape0::kN >= jagged_seq_len)) {
-          return;
+          continue;
       }
       // Compute initial location in logical coordinates
       cutlass::MatrixCoord tb_offset_A0{
@@ -324,8 +322,9 @@ struct GroupedB2bGemmBatched {
         { jagged_seq_len, params.problem_size_0.k() }, // A0 matrix size
         thread_idx,
         tb_offset_A0);
-
-      iterator_A0.add_pointer_offset(params.ref_A0.stride(0) * jagged_offset_start + params.head_stride_A0 * head_idx);
+      auto const A0_ptr_offset = params.ref_A0.stride(0) * jagged_offset_start + params.head_stride_A0 * head_idx;
+      iterator_A0.add_pointer_offset(A0_ptr_offset);
+      typename B2bMma::IteratorB0::Element *A0_matrix_base_ptr = params.ref_A0.data() + A0_ptr_offset;
 
       typename B2bMma::IteratorB0 iterator_B0(
         params.params_B0,
@@ -333,8 +332,9 @@ struct GroupedB2bGemmBatched {
         { params.problem_size_0.k(), jagged_seq_len }, // B0 matrix size
         thread_idx,
         tb_offset_B0);
-
-      iterator_B0.add_pointer_offset(params.ref_B0.stride(0) * jagged_offset_start + params.head_stride_B0 * head_idx);
+      auto const B0_ptr_offset = params.ref_B0.stride(0) * jagged_offset_start + params.head_stride_B0 * head_idx;
+      iterator_B0.add_pointer_offset(B0_ptr_offset);
+      typename B2bMma::IteratorB0::Element *B0_matrix_base_ptr  =  params.ref_B0.data() + B0_ptr_offset;
 
       typename B2bMma::IteratorB1 iterator_B1(
         params.params_B1,
@@ -344,7 +344,7 @@ struct GroupedB2bGemmBatched {
         tb_offset_B1);
       auto const B1_ptr_offset = params.ref_B1.stride(0) * jagged_offset_start +  params.head_stride_B1 * head_idx;
       iterator_B1.add_pointer_offset(B1_ptr_offset);
-      typename B2bMma::IteratorB1::Element *B1_tile_base_ptr  =  params.ref_B1.data() + B1_ptr_offset;
+      typename B2bMma::IteratorB1::Element *B1_matrix_base_ptr  =  params.ref_B1.data() + B1_ptr_offset;
 
 
       // Broadcast the warp_id computed by lane 0 to ensure dependent code
@@ -377,10 +377,10 @@ struct GroupedB2bGemmBatched {
 
       // Construct thread-scoped matrix multiply
       B2bMma b2bMma(shared_storage.main_loop, shared_storage.gmem_to_accum_loader, thread_idx, warp_idx, lane_idx,
-            jagged_seq_len,
-            B1_tile_base_ptr,
-            static_cast<int>(params.ref_B1.stride(0))
-      );
+            jagged_seq_len, static_cast<int>(params.problem_size_0.k()), static_cast<int>(params.problem_size_1.n()),
+            A0_matrix_base_ptr, static_cast<int>(params.ref_A0.stride(0)),
+            B0_matrix_base_ptr, static_cast<int>(params.ref_B0.stride(0)),
+            B1_matrix_base_ptr, static_cast<int>(params.ref_B1.stride(0)));
 
       typename B2bMma::FragmentC0 src_accum;
       typename B2bMma::FragmentC1 accumulators;
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
index 017d9e734..7ff9543ae 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h
@@ -50,6 +50,7 @@
 
 #include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
 
+#include "cutlass/util/device_utils.h"
 #include "grouped_classic_b2b_bmm/threadblock/b2b_mma_base.h"
 #include "grouped_classic_b2b_bmm/threadblock/default_gmem_to_accum_loader_tensor_op.h"
 #include "grouped_classic_b2b_bmm/warp/triu_mma_tensor_op_fragment_iterator.h"
@@ -59,6 +60,12 @@ namespace cutlass {
 namespace gemm {
 namespace threadblock {
 
+template <typename T>
+CUTLASS_DEVICE
+int32_t int32_ptrdiff(T* a, T *b) {
+  return static_cast<int32_t>(reinterpret_cast<char*>(a) - reinterpret_cast<char*>(b)) / sizeof(T);
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Structure to compute the matrix product targeting CUDA cores and SIMT math
@@ -281,11 +288,17 @@ class B2bMmaMultistage :
 
   int const jagged_sequence_length;
 
-  /// Base address of B1 matrix in memory
-  typename IteratorB0::Element *B1_base_matrix_ptr;
+  int const seq_stride_A0;
+  typename IteratorA0::Element *A0_matrix_base_ptr;
+
+  int const seq_stride_B0;
+  typename IteratorB0::Element *B0_matrix_base_ptr;
 
-  /// Stride from one B1 sequence element to the next ( B1 is row-major )
   int const seq_stride_B1;
+  typename IteratorB1::Element *B1_matrix_base_ptr;
+
+  int const qk_dims;
+  int const v_dims;
 
 public:
 
@@ -303,8 +316,16 @@ class B2bMmaMultistage :
       int lane_idx,
       ///< GEMM0 N is used for accumulator extent
       int jagged_sequence_length_,
+      int qk_dims_,
+      int v_dims_,
       // extra params
-      typename IteratorB1::Element *B1_base_matrix_ptr_,
+      typename IteratorA0::Element *A0_matrix_base_ptr_,
+      int seq_stride_A0_,
+
+      typename IteratorB0::Element *B0_matrix_base_ptr_,
+      int seq_stride_B0_,
+
+      typename IteratorB1::Element *B1_matrix_base_ptr_,
       int seq_stride_B1_
     ):
       Base(shared_storage, thread_idx, warp_idx, lane_idx),
@@ -313,8 +334,16 @@ class B2bMmaMultistage :
       smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx),
       gmem_to_accum_loader(bias_add_shared_storage, thread_idx, warp_idx, lane_idx),
       jagged_sequence_length(jagged_sequence_length_),
-      B1_base_matrix_ptr(B1_base_matrix_ptr_),
-      seq_stride_B1(seq_stride_B1_)
+      A0_matrix_base_ptr(A0_matrix_base_ptr_),
+      seq_stride_A0(seq_stride_A0_),
+
+      B0_matrix_base_ptr(B0_matrix_base_ptr_),
+      seq_stride_B0(seq_stride_B0_),
+
+      seq_stride_B1(seq_stride_B1_),
+      B1_matrix_base_ptr(B1_matrix_base_ptr_),
+      qk_dims(qk_dims_),
+      v_dims(v_dims_)
   {
     // Compute warp location within threadblock tile by mapping the warp_id to
     // three coordinates:
@@ -357,8 +386,13 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A0.get();
+          const int32_t offset = int32_ptrdiff(reinterpret_cast<typename IteratorA0::Element *>(gmem_ptr), A0_matrix_base_ptr);
+          const int32_t outer_offset = offset / this->seq_stride_A0;
+          const int32_t inner_offset = offset % this->seq_stride_A0;
+          const bool iterA0valid = ((outer_offset<this->jagged_sequence_length) and (inner_offset<this->v_dims));
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
-              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
+              dst_ptr + v, gmem_ptr, iterA0valid);
 
           ++iterator_A0;
         }
@@ -386,8 +420,13 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B0.get();
+          const int32_t offset = int32_ptrdiff(reinterpret_cast<typename IteratorB0::Element *>(gmem_ptr), B0_matrix_base_ptr);
+          const int32_t row_offset = offset / this->seq_stride_B0;
+          const int32_t col_offset = offset % this->seq_stride_B0;
+          bool iterB0valid = ((row_offset<this->jagged_sequence_length) and (col_offset<this->qk_dims));
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
-              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
+              dst_ptr + v, gmem_ptr, iterB0valid);
           ++iterator_B0;
         }
         ++this->smem_iterator_B0_;
@@ -415,10 +454,11 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
-          auto const gmem_ptr = iterator_B1.get();
-          int64_t const offset = reinterpret_cast<decltype(B1_base_matrix_ptr)>(gmem_ptr)-B1_base_matrix_ptr;
-          int64_t const outer_offset = offset / seq_stride_B1;
-          bool const iterB1valid = (outer_offset<jagged_sequence_length);
+          auto gmem_ptr = iterator_B1.get();
+          const int32_t offset = int32_ptrdiff(reinterpret_cast<typename IteratorB1::Element *>(gmem_ptr), B1_matrix_base_ptr);
+          const int32_t outer_offset = offset / this->seq_stride_B1;
+          const int32_t inner_offset = offset % this->seq_stride_B1;
+          const bool iterB1valid = ((outer_offset<this->jagged_sequence_length) and (inner_offset<this->v_dims));
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
               dst_ptr + v, gmem_ptr, iterB1valid);
           ++iterator_B1;
@@ -457,8 +497,6 @@ class B2bMmaMultistage :
     for (int stage = 0; stage < Base::kStages - 1;
          ++stage, --gemm_k_iterations_0) {
 
-      iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
-      iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
       iterator_A0.set_iteration_index(0);
       this->smem_iterator_A0_.set_iteration_index(0);
 
@@ -475,9 +513,14 @@ class B2bMmaMultistage :
 
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A0.get();
+          int32_t const offset = int32_ptrdiff(reinterpret_cast<typename IteratorB1::Element *>(gmem_ptr), A0_matrix_base_ptr);
+          int32_t const outer_offset = offset / this->seq_stride_A0;
+          int32_t const inner_offset = offset % this->seq_stride_A0;
+          bool iterA0valid = ((outer_offset<this->jagged_sequence_length) and (inner_offset<this->qk_dims));
           // zfill used here also in original code
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
-              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
+              dst_ptr + v, gmem_ptr, iterA0valid);
           ++iterator_A0;
         }
 
@@ -500,8 +543,13 @@ class B2bMmaMultistage :
               sizeof_bits<typename IteratorB0::Element>::value *
               IteratorB0::ThreadMap::kElementsPerAccess /
               IteratorB0::kAccessesPerVector / 8;
+          auto gmem_ptr = iterator_B0.get();
+          int32_t const offset = int32_ptrdiff(reinterpret_cast<typename IteratorB0::Element *>(gmem_ptr), B0_matrix_base_ptr);
+          int32_t const row_offset = offset / this->seq_stride_B0;
+          int32_t const col_offset = offset % this->seq_stride_B0;
+          bool iterB0valid = ((row_offset<this->jagged_sequence_length) and (col_offset<this->qk_dims));
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
-              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
+              dst_ptr + v, gmem_ptr, iterB0valid);
 
           ++iterator_B0;
         }
@@ -544,8 +592,6 @@ class B2bMmaMultistage :
 
     ++this->warp_tile_iterator_A0_;
     ++this->warp_tile_iterator_B0_;
-    iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
-    iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
     int smem_write_stage_idx = Base::kStages - 1;
     int smem_read_stage_idx = 0;
 
@@ -652,8 +698,6 @@ class B2bMmaMultistage :
           }
 
           --gemm_k_iterations_0;
-          iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
-          iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
         }
 
         // Do any conversions feeding the first stage at the end of the loop so
@@ -682,12 +726,9 @@ class B2bMmaMultistage :
     //
     // Prologue
     //
-    int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
-    // Issue several complete stages
     CUTLASS_PRAGMA_UNROLL
     for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations_1) {
-      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+         ++stage) {
       iterator_B1.set_iteration_index(0);
       this->smem_iterator_B1_.set_iteration_index(0);
 
@@ -697,20 +738,21 @@ class B2bMmaMultistage :
         typename IteratorB1::AccessType *dst_ptr =
             reinterpret_cast<typename IteratorB1::AccessType *>(
                 this->smem_iterator_B1_.get());
-
         CUTLASS_PRAGMA_UNROLL
         for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
           constexpr int kSrcBytes =
               sizeof_bits<typename IteratorB1::Element>::value *
               IteratorB1::ThreadMap::kElementsPerAccess /
               IteratorB1::kAccessesPerVector / 8;
-          auto const gmem_ptr = iterator_B1.get();
-          int64_t const offset = reinterpret_cast<decltype(B1_base_matrix_ptr)>(gmem_ptr)-B1_base_matrix_ptr;
-          int64_t const outer_offset = offset / seq_stride_B1;
-          bool const iterB1valid = (outer_offset<jagged_sequence_length);
+          auto gmem_ptr = iterator_B1.get();
+          const int32_t offset = int32_ptrdiff(reinterpret_cast<typename IteratorB1::Element *>(gmem_ptr),B1_matrix_base_ptr);
+
+          const int32_t outer_offset = offset / this->seq_stride_B1;
+          const int32_t inner_offset = offset % this->seq_stride_B1;
+          const bool iterB1valid = ((outer_offset<this->jagged_sequence_length) and (inner_offset<this->v_dims));
           // zfill also used in original code
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
-              dst_ptr + v, gmem_ptr, iterB1valid);
+              dst_ptr + v, iterator_B1.get(), iterB1valid);
 
           ++iterator_B1;
         }
@@ -750,7 +792,6 @@ class B2bMmaMultistage :
     this->warp_tile_iterator_B1_.set_kgroup_index(0);
     this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
     ++this->warp_tile_iterator_B1_;
-    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
     smem_write_stage_idx = Base::kStages - 1;
     smem_read_stage_idx = 0;
 
@@ -760,18 +801,26 @@ class B2bMmaMultistage :
     //
     // Mainloop
     //
-    gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1);
+    constexpr int max_gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+    const int dyn_max_gemm_k_iterations_1 = (jagged_sequence_length + Shape1::kK - 1) / Shape1::kK;
+    // We need to have a second counter variable to early exit the unrolled loop
+    // for compiler-internal reasons, if we use the main loop counter to determine early exit, it will
+    // prevent loop unrolling, this will lead to increased register usage, and much lower performance.
+    int counter = 0;
     CUTLASS_PRAGMA_UNROLL
-    for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
-      //
-      // Loop over GEMM K dimension
-      //
+    for (int gemm_k_iterations_1=0; gemm_k_iterations_1 < max_gemm_k_iterations_1; gemm_k_iterations_1++) {
+
+      // early exit out of unrolled loop, so we can have a dynamic number of sequences
+      // despite being an unrolled loop that uses few registers
+      if (counter++ >= dyn_max_gemm_k_iterations_1) {
+         break;
+      }
 
       // Computes a warp-level GEMM on data held in shared memory
       // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
       CUTLASS_PRAGMA_UNROLL
       for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
-           ++warp_mma_k) {
+          ++warp_mma_k) {
         // Load warp-level tile from accumulator fragment
         warp_tile_iterator_A1_.load(
             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
@@ -849,7 +898,6 @@ class B2bMmaMultistage :
           } else {
             ++smem_read_stage_idx;
           }
-          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
         }
 
         // Do any conversions feeding the first stage at the end of the loop so
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
index 638f9f84e..4c15aa7e0 100644
--- a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/default_b2b_mma.h
@@ -58,7 +58,7 @@
 
 #include "grouped_classic_b2b_bmm/threadblock/b2b_mma_pipelined.h"
 #include "grouped_classic_b2b_bmm/threadblock/b2b_mma_multistage.h"
-
+#include "non_predicated_tile_access_iterator.h"
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -300,7 +300,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
   using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
   using IteratorA0 =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+      cutlass::transform::threadblock::grouped_classic_b2b_bmm::NonPredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
           ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
 
@@ -308,7 +308,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
   using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
   using IteratorB0 =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+      cutlass::transform::threadblock::grouped_classic_b2b_bmm::NonPredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
           ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
 
@@ -353,7 +353,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
   using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
   using IteratorB1 =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+      cutlass::transform::threadblock::grouped_classic_b2b_bmm::NonPredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
           ElementB, LayoutB1, 0, ThreadMapB1, AccessTypeB1>;
 
diff --git a/static/include/kernels/grouped_classic_b2b_bmm/threadblock/non_predicated_tile_access_iterator.h b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/non_predicated_tile_access_iterator.h
new file mode 100644
index 000000000..1a2f74b2f
--- /dev/null
+++ b/static/include/kernels/grouped_classic_b2b_bmm/threadblock/non_predicated_tile_access_iterator.h
@@ -0,0 +1,1965 @@
+// @lint-ignore-every LICENSELINT
+// clang-format off
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile this
+    iterator visits maybe partial, then the remaining tiles are complete. So, we
+    only need to compute the predicates twice, once before the first tile and
+    once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+namespace grouped_classic_b2b_bmm {
+////////////////////////////////////////////////////////////////////////////////
+
+/// NonPredicatedTileAccessIteratorPredicates
+///
+template <typename Shape_, typename Element_, typename Layout_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class NonPredicatedTileAccessIteratorPredicates {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorCoord = typename Layout::TensorCoord;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+// private:
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Offset to the first steady-state tile
+  TensorCoord residue_offset_;
+
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+
+ public:
+
+  CUTLASS_HOST_DEVICE
+  void set_predicates(int thread_id, TensorCoord const &threadblock_offset) {
+
+    TensorCoord residue_extent;
+    if (kAdvanceRank) {
+
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
+      if (!residue_size) {
+        residue_size = Shape::kStrided;
+      }
+
+      residue_offset_ = make_Coord(0, residue_size);
+      residue_extent = make_Coord(
+        extent_.contiguous(),
+        min(threadblock_offset.strided() + residue_size, extent_.strided())
+      );
+    } else {
+
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
+      if (!residue_size) {
+        residue_size = Shape::kContiguous;
+      }
+
+      residue_offset_ = make_Coord(residue_size, 0);
+
+      residue_extent = make_Coord(
+        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
+        extent_.strided()
+      );
+    }
+
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+
+
+    set_iteration_index(0);
+  }
+
+  /// Default constructor
+  NonPredicatedTileAccessIteratorPredicates() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIteratorPredicates(
+      /// Extent of tensor
+      TensorCoord extent)
+      : extent_(extent) {
+	}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIteratorPredicates &operator++() {
+
+    return *this;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// NonPredicatedTileAccessIterator
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType, bool Gather = false,
+          typename PermuteLayout = layout::NoPermute>
+class NonPredicatedTileAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingPredicates = NonPredicatedTileAccessIteratorPredicates<
+      Shape, Element, Layout, AdvanceRank, ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static bool constexpr Permute = !platform::is_same<PermuteLayout, layout::NoPermute>::value
+                               && !platform::is_same<PermuteLayout, layout::InversePermute<layout::NoPermute>>::value;
+
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+
+    using Base = PredicatedTileAccessIteratorParams;
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) :
+      Base(layout.stride(0),
+            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
+        ) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) :
+      Base(base) { }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  UnderlyingPredicates the_predicates;
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+
+  /// Gather indices
+  int const *indices_;
+
+  /// Function to perform layout permutation and offset computation
+  PermuteLayout permute_layout_;
+
+  /// Tracks thread's coordinate offset in the matrix for current tile.
+  /// This is only used in the following cases:
+  /// - when Gather is true, strided coordinate needed to access indices (contiguous offset is tracked via pointer_)
+  /// - when Permute is true, both coordinates are neeeded as input into permutation function (pointer_ is fixed)
+  TensorCoord coord_offset_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : params_(params),
+	      pointer_(reinterpret_cast<BytePointer>(
+                 const_cast<NonConstPointer>(pointer))),
+	      the_predicates(extent),
+        is_residue_tile_(true),
+        indices_(indices),
+        permute_layout_(TensorCoord(extent.contiguous(), extent.strided()), params.stride_) {
+
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+
+    if (Gather) {
+      assert(indices_);
+    }
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+
+    if (!Gather && !Permute) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      coord_offset_ = the_predicates.thread_offset_;
+      if (!Permute) {
+        add_pointer_offset(layout(make_Coord(coord_offset_.contiguous(), 0)));
+      }
+    }
+  }
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+
+      Layout layout(params_.stride_);
+
+      if (!Gather && !Permute) {
+        add_pointer_offset(layout(the_predicates.residue_offset_));
+
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+          pointer_ += Shape::kContiguous * tile_offset.contiguous() * sizeof_bits<Element>::value / 8;
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+          pointer_ += Shape::kStrided * tile_offset.strided() * sizeof_bits<Element>::value / 8;
+        }
+      } else {
+        coord_offset_.strided() = the_predicates.thread_offset_.strided() + Shape::kStrided * (tile_offset.strided() - kAdvanceRank);
+        if (!Permute) {
+          add_pointer_offset(layout(make_Coord(the_predicates.residue_offset_.contiguous(), 0)));
+          add_pointer_offset(Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank)));
+        } else {
+          coord_offset_.contiguous() = the_predicates.thread_offset_.contiguous() + Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank));
+        }
+      }
+    } else {
+      if (!Gather && !Permute) {
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+          pointer_ += Shape::kContiguous * tile_offset.contiguous();
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+          pointer_ += Shape::kStrided * tile_offset.strided();
+        }
+      } else {
+        coord_offset_.strided() += Shape::kStrided * tile_offset.strided();
+        if (!Permute) {
+          add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+        } else {
+          coord_offset_.contiguous() += Shape::kContiguous * tile_offset.contiguous();
+        }
+      }
+    }
+
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    if (Gather || Permute)
+    {
+
+      Index coord_contig  = (Permute ? coord_offset_.contiguous() : 0) + the_predicates.iteration_contiguous_ * ThreadMap::Delta::kContiguous + the_predicates.iteration_vector_ * AccessType::kElements;
+      Index coord_strided = coord_offset_.strided() + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+      if (Gather) {
+        coord_strided = indices_[coord_strided];
+      }
+
+      LongIndex offset = Permute ? permute_layout_(TensorCoord(coord_contig, coord_strided)) : (coord_strided * LongIndex(params_.stride_) + coord_contig);
+      return reinterpret_cast<AccessType *>(pointer_ + OffsetBytes<Element>(offset));
+    }
+
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + the_predicates.iteration_vector_;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+
+    the_predicates.operator++();
+
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ == ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather && !Permute) {
+        pointer_ += params_.inc_strided_;
+      }
+
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    if (!Gather && !Permute) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+
+      // now return to start tile - if the iterator is subsequently advanced, this
+      // subtraction as well as the subsequent integer addition are both elided by
+      // the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    the_predicates.get_mask(mask);
+  }
+
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType,
+      Gather, PermuteLayout>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column()),
+                  indices) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType,
+      Gather, PermuteLayout>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row()),
+                  indices) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for affine rank 2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingPredicates = NonPredicatedTileAccessIteratorPredicates<
+      Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend NonPredicatedTileAccessIterator;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in byte) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_contiguous_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ = (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+                     sizeof_bits<Element>::value / 8;
+
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+
+      inc_next_strided_ = inc_strided_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_[1]) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    };
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  UnderlyingPredicates the_predicates;
+
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent),
+	is_residue_tile_(true) {
+
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { the_predicates.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+
+      Layout layout(params_.stride_);
+      add_pointer_offset(layout(the_predicates.residue_offset_));
+
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1] - 1);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0] - 1);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    }
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(pointer_) + the_predicates.iteration_vector_;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { the_predicates.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { the_predicates.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { the_predicates.get_mask(mask); }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for affine rank 2 column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for affine rank-2 row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class NonPredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for column-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class NonPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::ColumnMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of NonPredicatedTileAccessIterator for row-major interleaved data.
+//  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class NonPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::RowMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = NonPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      AccessType>;
+
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend NonPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  NonPredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a NonPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : NonPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  NonPredicatedTileAccessIterator operator++(int) {
+    NonPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace grouped_classic_b2b_bmm
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/static/include/kernels/kat_printf.h b/static/include/kernels/kat_printf.h
index 8382c037d..f4f122f6e 100644
--- a/static/include/kernels/kat_printf.h
+++ b/static/include/kernels/kat_printf.h
@@ -46,6 +46,7 @@
 #include <cfloat>
 #include <climits>
 #include <cmath>
+#include <cstdarg>
 #include <cstdint>
 #include <cstdio> // for CUDA's builtin printf()
 
diff --git a/tests/unittest/ops/test_grouped_classic_b2b_bmm.py b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
index acabaa961..c2d50fe81 100644
--- a/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
+++ b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
@@ -16,6 +16,7 @@
 Unittests for grouped b2b bmm Operators.
 """
 import logging
+import os
 
 import unittest
 from typing import List, Tuple
@@ -306,7 +307,7 @@ def _test_grouped_classic_b2b_bmm(
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_1(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_1",
+            test_name=f"grouped_classic_b2b_bmm_fp16_1_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=1,
             head_dim=64,
@@ -319,7 +320,7 @@ def test_grouped_classic_b2b_bmm_fp16_1(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_2(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_2",
+            test_name=f"grouped_classic_b2b_bmm_fp16_2_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=1,
             random_seed=list(range(3)),
@@ -328,7 +329,7 @@ def test_grouped_classic_b2b_bmm_fp16_2(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_a(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_3_batch_a",
+            test_name=f"grouped_classic_b2b_bmm_fp16_3_batch_a_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=4,
             random_seed=list(range(3)),
@@ -337,7 +338,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_a(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_b(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_3_batch_b",
+            test_name=f"grouped_classic_b2b_bmm_fp16_3_batch_b_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[2, 4],
         )
@@ -345,7 +346,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_b(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_c(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_3_batch_c",
+            test_name=f"grouped_classic_b2b_bmm_fp16_3_batch_c_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=2,
         )
@@ -353,7 +354,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_c(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_d(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_3_batch_d",
+            test_name=f"grouped_classic_b2b_bmm_fp16_3_batch_d_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[2, 33],
             random_seed=list(range(3)),
@@ -362,7 +363,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_d(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_e(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_3_batch_e",
+            test_name=f"grouped_classic_b2b_bmm_fp16_3_batch_e_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[2, 4],
             num_heads=[3, 5],
@@ -371,7 +372,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_e(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias",
+            test_name=f"test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[2, 4],
             num_heads=[3, 5],
@@ -381,7 +382,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_f_bias(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias",
+            test_name=f"test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[2, 4],
             num_heads=[3, 5],
@@ -392,7 +393,7 @@ def test_grouped_classic_b2b_bmm_fp16_3_batch_g_bias(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_acc(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="grouped_classic_b2b_bmm_fp16_acc",
+            test_name=f"grouped_classic_b2b_bmm_fp16_acc_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[7],
             use_fp16_acc=True,
@@ -404,7 +405,7 @@ def test_grouped_classic_b2b_bmm_fp16_acc(self):
     @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
     def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty1(self):
         self._test_grouped_classic_b2b_bmm(
-            test_name="test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty1",
+            test_name=f"test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty1_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[5],
             num_heads=4,
@@ -430,7 +431,7 @@ def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty2(self):
     def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_silu(self):
         for max_seq_len in [64, 256, 512]:
             self._test_grouped_classic_b2b_bmm(
-                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}",
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_pid_{os.getpid()}",
                 dtype="float16",
                 batch_sizes=[1, 5, 33],
                 max_seq_lens=max_seq_len,
@@ -450,7 +451,7 @@ def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias(
     ):
         for max_seq_len in [512]:
             self._test_grouped_classic_b2b_bmm(
-                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_silu_bias",
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_silu_bias_pid_{os.getpid()}",
                 dtype="float16",
                 batch_sizes=[3, 33],
                 max_seq_lens=max_seq_len,
@@ -470,7 +471,7 @@ def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_
         for max_seq_len in [512]:
             for random_seed in range(1):
                 self._test_grouped_classic_b2b_bmm(
-                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_1",
+                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_1_pid_{os.getpid()}",
                     dtype="float16",
                     batch_sizes=[3, 33],
                     max_seq_lens=max_seq_len,
@@ -491,7 +492,7 @@ def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_
         for max_seq_len in [512]:
             for random_seed in range(1):
                 self._test_grouped_classic_b2b_bmm(
-                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_2",
+                    test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_seed{random_seed}_bias_broadcast_2_pid_{os.getpid()}",
                     dtype="float16",
                     batch_sizes=[3, 33],
                     max_seq_lens=max_seq_len,
@@ -511,7 +512,7 @@ def test_grouped_classic_b2b_bmm_fp16_causal_lower_left_empty3_simple_silu_bias_
     ):
         for max_seq_len in [512]:
             self._test_grouped_classic_b2b_bmm(
-                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_bias_broadcast_3",
+                test_name=f"grouped_classic_b2b_bmm_fp16_causal_lower_left_empty_seqlen_{max_seq_len}_bias_broadcast_3_pid_{os.getpid()}",
                 dtype="float16",
                 batch_sizes=[3, 33],
                 max_seq_lens=max_seq_len,
@@ -531,9 +532,9 @@ def test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_4(
     ):
         for max_seq_len in [64, 256, 512]:
             self._test_grouped_classic_b2b_bmm(
-                test_name=f"test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_4_seqlen={max_seq_len}",
+                test_name=f"test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_4_seqlen_{max_seq_len}",
                 dtype="float16",
-                batch_sizes=[3, 33],
+                batch_sizes=[16],
                 max_seq_lens=max_seq_len,
                 num_heads=[
                     2,
@@ -549,7 +550,7 @@ def test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5(
         self,
     ):
         self._test_grouped_classic_b2b_bmm(
-            test_name="test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5",
+            test_name=f"test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5_pid_{os.getpid()}",
             dtype="float16",
             batch_sizes=[3, 33],
             max_seq_lens=256,
@@ -562,6 +563,30 @@ def test_grouped_classic_b2b_bmm_fp16_large_bias_broadcast_5(
             bias_broadcast=[True, False, True, False],
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_classic_b2b_bmm_profile_1(
+        self,
+    ):
+        for max_seq_len in [256]:
+            self._test_grouped_classic_b2b_bmm(
+                test_name=f"test_grouped_classic_b2b_bmm_profile_1_seqlen_{max_seq_len}",
+                dtype="float16",
+                batch_sizes=[
+                    4,
+                    8,
+                    16,
+                    32,
+                    64,
+                ],
+                max_seq_lens=max_seq_len,
+                num_heads=[1],
+                epilogue_math_name="SiLu",
+                causal_type=CausalType.LOWER_LEFT_EMPTY,
+                random_seed=list(range(1)),
+                has_bias=True,
+                bias_broadcast=[True, True, False, False],
+            )
+
 
 if __name__ == "__main__":
     unittest.main()

From 92823177b658e066fca6117a8b089d19b7d26abe Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Mon, 19 Jun 2023 00:42:16 -0700
Subject: [PATCH 600/638] Add jagged_lengths_to_presences op (#767)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/767

New operator for converting [B]-shaped 1D `lenghts` tensor to the [B, max_seq_len]-shaped 2D `presences` tensor indicated what the data is available in the jagged tensor.

Reviewed By: chenyang78

Differential Revision: D46687994

fbshipit-source-id: 1816e812a081aa889045f9bf25510a48a5eb635f
---
 python/aitemplate/backend/backend_spec.py     |   2 +
 .../backend/cuda/jagged/__init__.py           |   6 +-
 .../jagged/jagged_lengths_to_presences.py     | 139 ++++++++++++++++++
 .../compiler/ops/jagged/__init__.py           |   4 +
 .../ops/jagged/jagged_lengths_to_presences.py |  89 +++++++++++
 .../ops/test_jagged_lengths_to_presences.py   | 127 ++++++++++++++++
 6 files changed, 366 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/backend/cuda/jagged/jagged_lengths_to_presences.py
 create mode 100644 python/aitemplate/compiler/ops/jagged/jagged_lengths_to_presences.py
 create mode 100644 tests/unittest/ops/test_jagged_lengths_to_presences.py

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 02ae54080..2b6771658 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -59,6 +59,7 @@ class GPUBackendSpec(BackendSpec):
             "float": "float",
             "int64": "int64_t",
             "int32": "int32_t",
+            "bool": "bool",
         }
     )
 
@@ -72,6 +73,7 @@ class GPUBackendSpec(BackendSpec):
             "int64_t": 8,
             "int32_t": 4,
             "float": 4,
+            "bool": 1,
         }
     )
 
diff --git a/python/aitemplate/backend/cuda/jagged/__init__.py b/python/aitemplate/backend/cuda/jagged/__init__.py
index d2fc023b9..550a59a2b 100644
--- a/python/aitemplate/backend/cuda/jagged/__init__.py
+++ b/python/aitemplate/backend/cuda/jagged/__init__.py
@@ -15,8 +15,12 @@
 """
 CUDA jagged tensor-specific ops module init
 """
-from aitemplate.backend.cuda.jagged import jagged_lengths_to_offsets
+from aitemplate.backend.cuda.jagged import (
+    jagged_lengths_to_offsets,
+    jagged_lengths_to_presences,
+)
 
 __all__ = [
     "jagged_lengths_to_offsets",
+    "jagged_lengths_to_presences",
 ]
diff --git a/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_presences.py b/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_presences.py
new file mode 100644
index 000000000..91e5f528f
--- /dev/null
+++ b/python/aitemplate/backend/cuda/jagged/jagged_lengths_to_presences.py
@@ -0,0 +1,139 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Codegen functions for the jagged_lengths_to_presences op.
+"""
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+using bfloat16 = nv_bfloat16;
+
+#define THREADS_PER_BLOCK 128
+
+
+namespace {
+
+__global__ void jagged_lengths_to_presences_kernel(
+    const {{lengths_type}}* lengths,
+    {{presences_type}}* presences
+) {
+    {{index_type}} bid = blockIdx.y;
+    {{index_type}} tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < {{max_seq_len}}) {
+        {{lengths_type}} len = lengths[bid];
+        presences[bid * {{max_seq_len}} + tid] = static_cast<{{presences_type}}>(tid < len);
+    }
+}
+
+} // namespace
+
+
+void {{func_name}}(
+    const void* lengths,
+    void* presences,
+    {{index_type}} batch_size,
+    cudaStream_t stream
+) {
+    dim3 grid_size(({{max_seq_len}} + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, batch_size);
+    jagged_lengths_to_presences_kernel<<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
+        reinterpret_cast<const {{lengths_type}}*>(lengths),
+        reinterpret_cast<{{presences_type}}*>(presences)
+    );
+}
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    const void*,      /* lengths */
+    void*,            /* presences */
+    {{index_type}},   /* batch_size */
+    cudaStream_t      /* stream */
+);
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{{func_name}}(
+{{indent}}    {{lengths}},
+{{indent}}    {{presences}},
+{{indent}}    {{batch_size}},
+{{indent}}    stream
+{{indent}});
+""",
+    trim_blocks=True,
+    lstrip_blocks=True,
+)
+
+
+@registry.reg("cuda.jagged_lengths_to_presences.gen_function")
+def jagged_lengths_to_presences_gen_function(func_attrs):
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+    lengths = func_attrs["inputs"][0]
+    presences = func_attrs["outputs"][0]
+    lengths_type = backend_spec.dtype_to_backend_type(lengths.dtype())
+    presences_type = backend_spec.dtype_to_backend_type(presences.dtype())
+    max_seq_len = presences.shape()[1].value()
+
+    return SRC_TEMPLATE.render(
+        func_name=func_name,
+        lengths_type=lengths_type,
+        presences_type=presences_type,
+        index_type=backend_spec.index_type,
+        max_seq_len=max_seq_len,
+    )
+
+
+@registry.reg("cuda.jagged_lengths_to_presences.func_decl")
+def jagged_lengths_to_presences_gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    backend_spec = CUDASpec()
+
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("cuda.jagged_lengths_to_presences.func_call")
+def jagged_lengths_to_presences_gen_function_call(func_attrs, indent="  "):
+    func_name = func_attrs["name"]
+    lengths = func_attrs["inputs"][0]
+    presences = func_attrs["outputs"][0]
+    batch_size = lengths.shape()[0]
+
+    return FUNC_CALL_TEMPLATE.render(
+        indent="      ",
+        func_name=func_name,
+        lengths=lengths._attrs["name"],
+        presences=presences._attrs["name"],
+        batch_size=batch_size._attrs["name"],
+    )
diff --git a/python/aitemplate/compiler/ops/jagged/__init__.py b/python/aitemplate/compiler/ops/jagged/__init__.py
index a46942249..7d0c2a0ce 100644
--- a/python/aitemplate/compiler/ops/jagged/__init__.py
+++ b/python/aitemplate/compiler/ops/jagged/__init__.py
@@ -15,7 +15,11 @@
 from aitemplate.compiler.ops.jagged.jagged_lengths_to_offsets import (
     jagged_lengths_to_offsets,
 )
+from aitemplate.compiler.ops.jagged.jagged_lengths_to_presences import (
+    jagged_lengths_to_presences,
+)
 
 __all__ = [
     "jagged_lengths_to_offsets",
+    "jagged_lengths_to_presences",
 ]
diff --git a/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_presences.py b/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_presences.py
new file mode 100644
index 000000000..7a372d8d2
--- /dev/null
+++ b/python/aitemplate/compiler/ops/jagged/jagged_lengths_to_presences.py
@@ -0,0 +1,89 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define jagged_lengths_to_presences op
+"""
+from typing import List
+
+from aitemplate.backend import registry
+from aitemplate.backend.target import Target
+from aitemplate.compiler.base import IntImm, IntVar, Operator, Tensor
+from aitemplate.compiler.dtype import get_dtype_size
+
+
+class jagged_lengths_to_presences(Operator):
+    """
+    Given a 1D Tensor of lengths of the sequences in a jagged Tensor,
+    returns a 2D Tensor of presences indicating where the data exists
+    and where not. The dtype of presences Tensor is configurable.
+
+    Args:
+        lengths (Tensor): 1D Tensor of sequence lengths, [B]-shaped.
+        max_seq_len (int): Maximum possible sequence length.
+    Returns:
+        presences (Tensor): 2D Tensor of presences, [B, max_seq_len]-shaped.
+                            presences[i, j] = (dtype)(j < lenghts[i])
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._attrs["op"] = "jagged_lengths_to_presences"
+        self._attrs["has_profiler"] = False
+
+    def _infer_shape(
+        self,
+        lengths: Tensor,
+        max_seq_len: int,
+    ) -> List[IntVar]:
+        batch_size = lengths.shape()[0]
+        return [batch_size, IntImm(max_seq_len)]
+
+    def __call__(
+        self,
+        lengths: Tensor,
+        max_seq_len: int,
+        dtype: str = "bool",
+    ) -> Tensor:
+        if len(lengths.shape()) != 1:
+            raise ValueError(f"The lengths Tensor must be 1D, but got {lengths=}.")
+        if lengths._attrs["dtype"] not in ("int32", "int64"):
+            raise ValueError(
+                f"The lengths Tensor must be int32 or int64, but got {lengths=}."
+            )
+        if not isinstance(max_seq_len, int) or max_seq_len <= 0:
+            raise ValueError(
+                f"max_seq_len must be a positive integer, but got {max_seq_len=}."
+            )
+
+        # validation inside
+        get_dtype_size(dtype)
+
+        self._attrs["inputs"] = [lengths]
+        self._set_depth()
+
+        output_shape = self._infer_shape(lengths, max_seq_len)
+        presences = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=dtype,
+        )
+
+        self._attrs["outputs"] = [presences]
+        return presences
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_jagged_lengths_to_presences.py b/tests/unittest/ops/test_jagged_lengths_to_presences.py
new file mode 100644
index 000000000..c110e9f23
--- /dev/null
+++ b/tests/unittest/ops/test_jagged_lengths_to_presences.py
@@ -0,0 +1,127 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for the jagged_lengths_to_presences op.
+"""
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+def _compute_presences_pt(
+    lengths_pt: torch.Tensor,
+    max_seq_len: int,
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    data = []
+    for length in lengths_pt.cpu().tolist():
+        data.append([1] * length + [0] * (max_seq_len - length))
+    return torch.tensor(data, dtype=output_dtype)
+
+
+class JaggedLengthsToPresencesTestCase(unittest.TestCase):
+    def _test_jagged_lengths_to_presences(
+        self,
+        batch_size: int,
+        max_seq_len: int = 128,
+        lengths_dtype: str = "int32",
+        presences_dtype: str = "float16",
+        test_suffix: str = "",
+    ):
+        LENGTHS = Tensor(
+            shape=[IntVar([1, batch_size], name="batch_size")],
+            name="lengths",
+            dtype=lengths_dtype,
+            is_input=True,
+        )
+
+        PRESENCES = ops.jagged_lengths_to_presences()(
+            lengths=LENGTHS,
+            max_seq_len=max_seq_len,
+            dtype=presences_dtype,
+        )
+
+        PRESENCES._attrs["name"] = "presences"
+        PRESENCES._attrs["is_output"] = True
+
+        model = compile_model(
+            [PRESENCES],
+            detect_target(),
+            "./tmp",
+            f"test_jagged_lengths_to_presences_{test_suffix}",
+        )
+
+        torch_lengths_dtype = string_to_torch_dtype(lengths_dtype)
+        torch_presences_dtype = string_to_torch_dtype(presences_dtype)
+
+        for seed in range(10):
+            torch.manual_seed(seed)
+            lengths_pt = torch.randint(
+                low=0,
+                high=max_seq_len,
+                size=(batch_size,),
+                dtype=torch_lengths_dtype,
+            ).cuda()
+            presences_pt = _compute_presences_pt(
+                lengths_pt=lengths_pt,
+                max_seq_len=max_seq_len,
+                output_dtype=torch_presences_dtype,
+            ).cuda()
+
+            presences = torch.empty(
+                size=(batch_size, max_seq_len),
+                dtype=torch_presences_dtype,
+            ).cuda()
+            model.run_with_tensors(
+                inputs={"lengths": lengths_pt},
+                outputs=[presences],
+            )
+
+            torch.testing.assert_close(presences, presences_pt)
+
+    @parameterized.expand(
+        [
+            param(1, 1, 1, "int32", "bool"),
+            param(2, 11, 23, "int64", "float32"),
+            param(3, 1024, 256, "int32", "float16"),
+            param(4, 1234, 567, "int64", "bool"),
+        ]
+    )
+    def test_jagged_lengths_to_presences(
+        self,
+        i,
+        batch_size,
+        max_seq_len,
+        lengths_dtype,
+        presences_dtype,
+    ):
+        self._test_jagged_lengths_to_presences(
+            batch_size=batch_size,
+            max_seq_len=max_seq_len,
+            lengths_dtype=lengths_dtype,
+            presences_dtype=presences_dtype,
+            test_suffix=str(i),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From eea99c12fdacdc6fa56f27e362c39ba6f79a9175 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 20 Jun 2023 15:24:13 +0800
Subject: [PATCH 601/638] rename to warp size

---
 python/aitemplate/backend/backend_spec.py                     | 4 ++--
 python/aitemplate/backend/common/tensor/permute0213_common.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 201be64a1..a51b1d2ca 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -408,7 +408,7 @@ class ROCMSpec(GPUBackendSpec):
     prefix = "hip"
     stream = "stream"
     cub = "hipcub"
-    tile_size = 64
+    warp_size = 64
 
     cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
@@ -447,7 +447,7 @@ class CUDASpec(GPUBackendSpec):
     prefix = "cuda"
     stream = "stream"
     cub = "cub"
-    tile_size = 32
+    warp_size = 32
 
     cast_to_ptr_template = jinja2.Template("reinterpret_cast<{{dtype}}*>({{name}})")
     cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})")
diff --git a/python/aitemplate/backend/common/tensor/permute0213_common.py b/python/aitemplate/backend/common/tensor/permute0213_common.py
index 9e0f52382..d4422fcff 100644
--- a/python/aitemplate/backend/common/tensor/permute0213_common.py
+++ b/python/aitemplate/backend/common/tensor/permute0213_common.py
@@ -185,7 +185,7 @@
     """
 {{header_files}}
 
-#define TILE_SIZE {{tile_size}}
+#define TILE_SIZE {{warp_size}}
 #define ITEMS_PER_THREAD 4
 #define DIRECT_BLOCK_Y 4
 #define DIRECT_BLOCK_Z 2
@@ -421,7 +421,7 @@ def gen_function(
         dtype=backend_spec.dtype_to_backend_type(xdtype),
     )
     return SRC_TEMPLATE.render(
-        tile_size=backend_spec.tile_size,
+        warp_size=backend_spec.warp_size,
         function_name=func_name,
         exec_paths=exec_paths,
         header_files=header_files,

From 2db5b42e868d07282715f66c4dadf393885d4e1a Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Tue, 20 Jun 2023 13:32:38 -0700
Subject: [PATCH 602/638] native CUDA development helper (#772)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/772

These changes add several features to aid in development, debugging and performance tuning of native CUDA code, both generated and AIT static code:

 * Allow to generate intermediate files ( including .ptx files ) when building
 * Optionally write TARGETS file into generated build dirs to enable code navigation and buck builds of standalone executable from within generated build dirs
 * Introduce an environment variable flag to enable standalone exe generation ( so this does not require modifying the source of debug_settings.py )
 * Introduce an environment variable flag to enable CUDA debug symbol generation for GPU code
 * Introduces an environment variable flag which enables the generated build directories to directly reference include directories within the main source tree ( this should only be enabled for inplace builds, including @//mode/dev-nosan )
 * Introduces an environment variable flag which fixes code navigation issues in some IDEs, which do not identify .cu files as C++ files.

Reviewed By: aakhundov

Differential Revision: D46519923

fbshipit-source-id: 4c808be7e0ba0af6082f03f95b0ee77439505e4c
---
 docs/source/reference/env.rst                | 10 +++
 python/aitemplate/backend/builder.py         | 11 +++
 python/aitemplate/backend/cuda/target_def.py | 86 ++++++++++++++++++++
 python/aitemplate/backend/target.py          |  7 ++
 python/aitemplate/utils/debug_settings.py    |  4 +-
 python/aitemplate/utils/environ.py           | 53 ++++++++++++
 6 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
index 60a0a9c6d..392999b33 100644
--- a/docs/source/reference/env.rst
+++ b/docs/source/reference/env.rst
@@ -26,6 +26,16 @@ Codegen
 
 **AIT_USE_CMAKE_COMPILATION**: (An experimental feature) If set to "1", then `cmake` will used instead of `make`. This allows to build AITemplate using MSVC Compiler + MSBuild on Windows, and it works for linux as well. This builder does not support many features (such as caching) yet. But it allows to generate a cmake project that can be loaded to a modern IDE. Default value is "0".
 
+**AIT_ENABLE_STANDALONE**: Enable standalone test and benchmark executable generation. Default value is "0" (disabled). If set to "1", this will generate a "test" executable that may be used to run standalone tests and benchmarks. This standalone executable is also well suited for running through debuggers and/or profiling tools, as it does not pull in python and pytorch as dependencies, unlike most python unit tests.
+
+**AIT_ENABLE_PTXAS_INFO**: Set this to "1" to enable the generation and logging of verbose ( tuning-relevant ) information about CUDA ptx assembly code produced by the CUDA compiler nvcc. Intermediate ptx files, annotated with C++ source info will be written to the build directory. In addition, this flag enables warnings about CUDA register spilling and resource usage.
+
+**AIT_CUDA_DEBUG_LEVEL**: Configure level of CUDA debug information. Defaults to no debug info. This may either be a string with options passed to nvcc ( for example "-g -G" or "-lineinfo" ) or a CUDA debug level from "0" (default, no debug info), "1" ( "-lineinfo" ) include source code line information. Ideal for profiling with ncu/nsight-compute, "2" full debug information (**warning**: this disables all optimizations, regardless of other settings)
+
+**AIT_ENABLE_CUDA_SOURCE_NAVIGATION_FIX**: (Only supported by FBCUDA target so far): When this flag is enabled by setting it to "1" (it is disabled by default), every *.cu file in build dirs into a corresponding *.cu.h file and create a *.cu file which just includes this file. This fixes code navigation issues in some IDE's which don't treat .cu files as C++ files and disable code navigation.
+
+**AIT_ENABLE_INCLUDE_FROM_SOURCETREE**: (Only supported by FBCUDA target so far) When this flag is enabled by setting it to "1" (it is disabled by default), the target will create an in-place build which tries to directly reference the include paths within the AITemplate source tree. This helps to iterate faster during native Kernel/Operator development and debugging.
+
 Profiling
 ---------
 
diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
index 2833f558f..e66d97e2a 100644
--- a/python/aitemplate/backend/builder.py
+++ b/python/aitemplate/backend/builder.py
@@ -508,6 +508,16 @@ def gen_makefile(self, file_pairs, dll_name, workdir, test_name, debug_settings)
             # fix the makefile indentation
             f.write(re.sub("^    ", "\t", makefile_str, flags=re.M))
 
+    def postprocess_build_dir(self, workdir: str, test_name: str) -> None:
+        build_dir = os.path.join(workdir, test_name)
+        current_target = None
+        try:
+            current_target: Target = Target.current()
+        except RuntimeError:
+            pass
+        if current_target is not None:
+            current_target.postprocess_build_dir(build_dir)
+
     @staticmethod
     def _combine_profiler_multi_sources():
         """Whether to combine multiple profiler sources per target."""
@@ -873,6 +883,7 @@ def make(
         allow_cache=True,
     ):
         self.gen_makefile(file_pairs, dll_name, workdir, test_name, debug_settings)
+        self.postprocess_build_dir(workdir, test_name)
 
         # Write compiler version string(s) into build directory, so these can be used as part of cache key
         self._gen_compiler_version_files(os.path.join(workdir, test_name))
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index d37452ddd..21489cd9d 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -135,6 +135,17 @@ def _build_gnu_host_compiler_options(self) -> List[str]:
     def get_host_compiler_options(self) -> List[str]:
         return self._build_gnu_host_compiler_options()
 
+    def _get_nvcc_debug_options(self) -> str:
+        CUDA_DEBUG_LEVEL_STRINGS = ["", "-lineinfo", "-g -G"]
+        level = environ.get_cuda_nvcc_debug_level()
+        if level.isdigit():
+            level = int(level)
+            assert (
+                level >= 0 and level < 3
+            ), "Debug level out of range. Must be 0 (no debug info), 1 (lineinfo) or 2 (with debug info, disable opt)"
+            return CUDA_DEBUG_LEVEL_STRINGS[level]
+        return level
+
     def _build_nvcc_compiler_options(self) -> List[str]:
         code = [f"sm_{self._arch}", f"compute_{self._arch}"]
         if environ.enable_cuda_lto():
@@ -148,6 +159,17 @@ def _build_nvcc_compiler_options(self) -> List[str]:
             "-std=c++17",
             "--expt-relaxed-constexpr",
         ]
+        if environ.enable_ptxas_info():
+            options.extend(
+                [
+                    "--keep",  # Keep the intermediate files for debugging (including ptx, sass, cubin etc.)
+                    "--ptxas-options=--warn-on-local-memory-usage",  # warn us if local memory is used in CUDA Kernels
+                    "--ptxas-options=--warn-on-spills",  # warn us if register spilling happens in CUDA Kernels
+                    "--resource-usage",  # Report on CUDA resource usage (shared mem, registers etc.)
+                    "--source-in-ptx",
+                ]
+            ),  # Annotate the ptx file with source information
+        options.append(self._get_nvcc_debug_options())
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         if environ.use_fast_math() and (
@@ -296,7 +318,34 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
             **kwargs,
         )
 
+    def _build_include_directories_from_sourcetree(self) -> List[str]:
+        my_path: Path = Path(os.path.realpath(__file__))  # noqa
+        ait_basepath: Path = my_path.parent.parent.parent.parent.parent.absolute()
+        assert (
+            ait_basepath.name == "AITemplate"
+        ), "AITemplate basepath resolution failed"
+        relative_include_paths = [
+            "fb/3rdparty/cutlass/examples/35_gemm_softmax",
+            "fb/3rdparty/cutlass/examples/41_fused_multi_head_attention",
+            "fb/3rdparty/cutlass/examples/45_dual_gemm",
+            "fb/3rdparty/cutlass/examples/common",
+            "fb/3rdparty/cutlass/include",
+            "fb/3rdparty/cutlass/tools/library/include",
+            "fb/3rdparty/cutlass/tools/library/src",
+            "fb/3rdparty/cutlass/tools/util/include",
+            "python/aitemplate/backend/cuda/attention/src",
+            "python/aitemplate/backend/cuda/attention/src/fmha",
+            "static/include",
+            "static/include/kernels",
+        ]
+        include_paths = [
+            str((ait_basepath / ipath).absolute()) for ipath in relative_include_paths
+        ]
+        return include_paths
+
     def _build_include_directories(self) -> List[str]:
+        if environ.enable_include_from_sourcetree():
+            return self._build_include_directories_from_sourcetree()
         cutlass_path = [
             os.path.join(self._template_path, "include"),
             os.path.join(self._template_path, "tools/util/include"),
@@ -358,6 +407,17 @@ def _build_compile_options(self):
                     "-std=c++17",
                 ]
             )
+            if environ.enable_ptxas_info():
+                options.extend(
+                    [
+                        "--keep",  # Keep the intermediate files for debugging (including ptx, sass, cubin etc.)
+                        "--ptxas-options=--warn-on-local-memory-usage",  # warn us if local memory is used in CUDA Kernels
+                        "--ptxas-options=--warn-on-spills",  # warn us if register spilling happens in CUDA Kernels
+                        "--resource-usage",  # Report on CUDA resource usage (shared mem, registers etc.)
+                        "--source-in-ptx",  # Annotate the ptx file with source information
+                    ]
+                ),
+            options.append(self._get_nvcc_debug_options())
             if self._ndebug == 1:
                 options.append("-DNDEBUG")
             FBCUDA.static_compile_options_ = options
@@ -428,6 +488,32 @@ def in_ci_env(self) -> bool:
             os.environ.get("INSIDE_RE_WORKER", None) == "1" and not self.trick_ci_env()
         )
 
+    def postprocess_build_dir(self, build_dir: str) -> None:
+        # Write a standard TARGETS file to enable standalone exe code navigation
+        from aitemplate.backend import buck_support
+
+        additional_build_dir_contents = {"TARGETS": buck_support.AIT_BUILD_DIR_TARGETS}
+        for filename, content in additional_build_dir_contents.items():
+            filepath = os.path.join(build_dir, filename)
+            with open(filepath, "w", encoding="utf-8") as f:
+                f.write(content)
+
+        if environ.enable_cuda_source_navigation_fix():
+            # We rename all .cu files to cu.h, and write a .cu
+            # file in their stead that only includes this cu.h file.
+            # The purpose is to enable .cu source navigation for certain IDEs..
+            build_dir_path = Path(build_dir)
+            cu_files = list(build_dir_path.glob("*.cu"))
+            for p in cu_files:
+                corresponding_include_file = p.with_name(p.name + ".h")
+                if corresponding_include_file.exists():
+                    corresponding_include_file.unlink()
+                # rename .cu file to .cu.h
+                p.rename(corresponding_include_file)
+                # write .cu file which just includes the original, now found
+                # under .cu.h
+                p.write_text(f'#include "{corresponding_include_file.name}"\n')
+
     @classmethod
     def remote_logger(cls, record):
         """
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index 46d5e3aa3..a464fddbd 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -505,6 +505,13 @@ def get_device_compiler_options(self) -> List[str]:
         """
         raise NotImplementedError
 
+    def postprocess_build_dir(self, build_dir: str) -> None:
+        """
+        Postprocess a build directory, allows final modification of the build directory before building.
+
+        """
+        pass
+
 
 def CUDA(template_path: str = CUTLASS_PATH, arch: str = "80", **kwargs):
     """Create a CUDA target."""
diff --git a/python/aitemplate/utils/debug_settings.py b/python/aitemplate/utils/debug_settings.py
index 50446f654..39e374a1d 100644
--- a/python/aitemplate/utils/debug_settings.py
+++ b/python/aitemplate/utils/debug_settings.py
@@ -19,6 +19,8 @@
 from dataclasses import dataclass
 from typing import Optional
 
+from aitemplate.utils import environ
+
 
 @dataclass
 class AITDebugSettings:
@@ -42,4 +44,4 @@ class AITDebugSettings:
     check_all_outputs: bool = False
     gen_profiler_annotation: bool = False
     dump_ait_to_py: Optional[str] = None
-    gen_standalone: bool = False
+    gen_standalone: bool = environ.enable_standalone_exe_generation()
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index 3971cfb1f..ddc00c852 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -225,3 +225,56 @@ def is_cmake_compilation() -> bool:
 
     # todo: replace with more builders?
     return os.getenv("AIT_USE_CMAKE_COMPILATION", "0") == "1"
+
+
+def enable_standalone_exe_generation() -> bool:
+    """
+    Whether to generate standalone binaries for AIT build directories.
+    Defaults to False.
+    """
+    return os.getenv("AIT_ENABLE_STANDALONE", "0") == "1"
+
+
+def enable_ptxas_info():
+    """
+    Whether to keep intermediate nvcc output files (including ptxas assembly) generated by
+    nvcc, and generate verbose ptxas generation logs. Defaults to False.
+    """
+    return os.getenv("AIT_ENABLE_PTXAS_INFO", "0") == "1"
+
+
+def enable_include_from_sourcetree():
+    """
+    Whether to include header files from source tree when building AIT model instead of
+    placing them in a temp dir. Defaults to False. Only works with FBCUDA target when
+    doing development / in-place builds.
+    """
+    return os.getenv("AIT_ENABLE_INCLUDE_FROM_SOURCETREE", "0") == "1"
+
+
+def get_cuda_nvcc_debug_level():
+    """
+    Return level of CUDA debug information. Default to no debug info.
+    backed by env var AIT_CUDA_DEBUG_LEVEL. Which may either be a string
+    which is directly passed through to nvcc on the commandline, or an
+    integer (as String) from 0 to 2 with the following meaning:
+        - 0: No debug info ( default )
+        - 1: Line information. Good for stack traces and profiling. Optimizations can be enabled.
+        - 2: Full debug information.
+
+    WARNING:
+    Level 2 disables all compiler optimizations,
+    regardless of what else is passed as optimization level.
+    """
+    level = os.getenv("AIT_CUDA_DEBUG_LEVEL", "0")
+    return level
+
+
+def enable_cuda_source_navigation_fix():
+    """
+    When this flag is enabled, the FBCUDA Target will copy every *.cu file in build dirs into
+    a corresponding *.cu.h file and create a *.cu file which just #include's this file.
+    This fixes code navigation issues in some IDE's which don't treat .cu files as C++
+    files and disable code navigation.
+    """
+    return os.getenv("AIT_ENABLE_CUDA_SOURCE_NAVIGATION_FIX", "0") == "1"

From 33f279782d548b65e2e2fa492fb5fd6d04c4bc83 Mon Sep 17 00:00:00 2001
From: Kai Londenberg <klondenberg@meta.com>
Date: Tue, 20 Jun 2023 13:32:38 -0700
Subject: [PATCH 603/638] Standalone test cases (#773)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/773

These changes improve the standalone executable generation of AIT to make the generated standalone test executable a full-fledged unit test and benchmarking tool, supporting loading and testing with test files which specify inputs and expected outputs for models.

This allows for much faster iteration cycles in the development of native operators, especially if combined with the features introduced in the base diff D46519923, which allow in-place development, code navigation and debugging within an AIT build dir.

Reviewed By: aakhundov

Differential Revision: D46649096

fbshipit-source-id: 74de238cc88e62cdb1442e257b2c78f7093b6770
---
 python/aitemplate/compiler/model.py           |  40 +-
 python/aitemplate/utils/torch_utils.py        |  28 +
 static/csrc/standalone.cpp                    | 491 +++++++++++++++++-
 tests/unittest/ops/test_grouped_b2b_bmm.py    |  39 ++
 .../ops/test_grouped_classic_b2b_bmm.py       |  14 +
 5 files changed, 610 insertions(+), 2 deletions(-)

diff --git a/python/aitemplate/compiler/model.py b/python/aitemplate/compiler/model.py
index af76a7782..6efa5c50b 100644
--- a/python/aitemplate/compiler/model.py
+++ b/python/aitemplate/compiler/model.py
@@ -19,13 +19,14 @@
 import enum
 import logging
 import math
+import struct
 from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 
 from aitemplate.compiler.dtype import dtype_str_to_enum
 from aitemplate.utils.misc import is_linux, is_windows
-from aitemplate.utils.torch_utils import torch_dtype_to_string
+from aitemplate.utils.torch_utils import torch_dtype_to_string, write_tensor_binary
 
 # Controls how many runtimes will be used in ModelContainer by default.
 # See the runtime README.md for more information on the Model/ModelContainer
@@ -353,6 +354,43 @@ def _dict_to_ordered_list(self, params, is_inputs):
 
         return result
 
+    def _write_tensors_for_standalone_testcase(
+        self,
+        tensor_dict: Dict[str, TorchTensor],
+        file_handle,
+        is_inputs: bool = True,
+    ) -> None:
+        if is_inputs:
+            index_map = self._input_name_to_index
+        else:
+            index_map = self._output_name_to_index
+        result = [None] * len(index_map)
+        for name, tensor in tensor_dict.items():
+            if name not in index_map:
+                raise ValueError(
+                    f"Got unexpected {'input' if is_inputs else 'output'}: {name}"
+                )
+            idx = index_map[name]
+            result[idx] = tensor
+        for tensor in result:
+            write_tensor_binary(tensor, file_handle)
+
+    def write_standalone_testcase_data(
+        self,
+        filename,
+        inputs: Dict[str, TorchTensor],
+        expected_outputs: List[TorchTensor],
+        atol=1e-2,
+        rtol=1e-2,
+    ):
+        with open(filename, "wb") as file_handle:
+            file_handle.write(struct.pack("ff", atol, rtol))
+            self._write_tensors_for_standalone_testcase(
+                tensor_dict=inputs, file_handle=file_handle
+            )
+            for out in expected_outputs:
+                write_tensor_binary(out, file_handle)
+
     def _make_ait_outputs(
         self, outputs: List[AITData], c_output_shapes
     ) -> Dict[str, AITData]:
diff --git a/python/aitemplate/utils/torch_utils.py b/python/aitemplate/utils/torch_utils.py
index 078d40557..e06f8d3d7 100644
--- a/python/aitemplate/utils/torch_utils.py
+++ b/python/aitemplate/utils/torch_utils.py
@@ -21,6 +21,12 @@
 `import torch` will work.
 """
 
+import struct
+
+import torch
+
+from aitemplate.compiler.dtype import dtype_str_to_enum, get_dtype_size, normalize_dtype
+
 
 def types_mapping():
     from torch import bfloat16, bool, float16, float32, int32, int64
@@ -56,3 +62,25 @@ def string_to_torch_dtype(string_dtype):
         f"Got unsupported ait dtype {string_dtype}! "
         f"Supported dtypes are: {list(types_mapping())}"
     )
+
+
+def write_tensor_binary(tensor: "torch.Tensor", file_handle) -> None:
+    tensor = tensor.detach().cpu().contiguous()
+    endianness = "@"  # system endianness
+    dtype_str = normalize_dtype(torch_dtype_to_string(tensor.dtype))
+    dtype_int = dtype_str_to_enum(dtype_str)
+    sizeof_dtype = get_dtype_size(dtype_str)
+    num_dims = len(tensor.shape)
+    file_handle.write(struct.pack(endianness + "I", dtype_int))  # unsigned int
+    file_handle.write(struct.pack(endianness + "I", sizeof_dtype))  # unsigned int
+    file_handle.write(struct.pack(endianness + "I", num_dims))  # unsigned int
+    total_size = sizeof_dtype
+    for dim in tensor.shape:
+        file_handle.write(struct.pack(endianness + "N", dim))  # size_t
+        total_size *= dim
+    file_handle.write(struct.pack(endianness + "N", total_size))  # size_t
+    bytedata = tensor.numpy().tobytes()
+    # just as a safety check
+    if len(bytedata) != total_size:
+        raise RuntimeError("Tensor has wrong number of bytes!")
+    file_handle.write(bytedata)
diff --git a/static/csrc/standalone.cpp b/static/csrc/standalone.cpp
index 876e7045c..cb486304d 100644
--- a/static/csrc/standalone.cpp
+++ b/static/csrc/standalone.cpp
@@ -27,10 +27,14 @@
 // ./tmp/test_gemm_rcr) along with other files, users are free to make any
 // changes to the code. We do not try to predict users' actions.
 
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
 #include <functional>
 #include <iostream>
 #include <map>
 #include <random>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -269,7 +273,446 @@ static AITemplateError run(
       ait_output_shapes_out.data());
 }
 
-int main() {
+template <typename T>
+void read_element(std::ifstream& fh, T& elem) {
+  if (!fh.good()) {
+    throw std::runtime_error("Input stream is not in good state.");
+  }
+  fh.read(reinterpret_cast<char*>(&elem), sizeof(T));
+  if (fh.fail()) {
+    throw std::runtime_error("Failed to read binary data");
+  }
+}
+
+struct AITStandaloneTestcase {
+  std::vector<AITData> expected_outputs;
+  std::vector<AITData> host_outputs;
+  std::vector<AITData> gpu_outputs;
+
+  std::vector<int64_t*> ait_output_shapes_out;
+
+  std::vector<AITData>
+      inputs; // this will be filled the AITData instances for the inputs
+
+  std::vector<int64_t> shape_data_owner;
+  std::vector<GPUPtr> gpu_data_owner;
+
+  const std::string test_data_path; // path to test data file
+  AITemplateModelHandle& handle;
+  AITemplateAllocator& allocator;
+
+  float atol;
+  float rtol;
+
+  AITStandaloneTestcase(
+      const char* test_data_path_,
+      AITemplateModelHandle& handle_, // model handle
+      AITemplateAllocator& allocator_)
+      : handle(handle_),
+        allocator(allocator_),
+        test_data_path(test_data_path_) {
+    _load();
+  }
+
+  void _load() { // relative error tolerance
+    size_t num_outputs = 0;
+    size_t num_inputs = 0;
+    AITemplateModelContainerGetNumInputs(handle, &num_inputs);
+    AITemplateModelContainerGetNumOutputs(handle, &num_outputs);
+    ait_output_shapes_out.reserve(num_outputs);
+    expected_outputs.reserve(num_outputs);
+    host_outputs.reserve(num_outputs);
+    gpu_outputs.reserve(num_outputs);
+    std::ifstream fh(test_data_path);
+    read_element(fh, atol); // absolute error tolerance
+    read_element(fh, rtol); // relative error tolerance
+
+    gpu_data_owner.reserve(num_inputs + num_outputs);
+    ait_output_shapes_out.reserve(num_outputs);
+
+    std::map<std::string, unsigned> input_name_to_index;
+    size_t total_dim_count =
+        0; // the sum of shape.ndims for all input and output tensors
+    // calculate total_dim_count
+    for (unsigned i = 0; i < num_inputs; i++) {
+      AITemplateParamShape shape;
+      AITemplateModelContainerGetMaximumInputShape(handle, i, &shape);
+      total_dim_count += shape.size;
+    }
+    for (unsigned i = 0; i < num_outputs; i++) {
+      AITemplateParamShape shape;
+      AITemplateModelContainerGetMaximumOutputShape(handle, i, &shape);
+      total_dim_count += shape.size * 2; // allocation required twice
+    }
+    // this is just a vector that owns the memory for the shape.shape_data
+    // values
+    shape_data_owner.reserve(total_dim_count);
+    size_t shape_offset = 0; // offset into the shape_data_owner array
+    for (unsigned i = 0; i < num_inputs; i++) {
+      // for each input tensor
+      const char* name;
+      AITemplateModelContainerGetInputName(handle, i, &name);
+      AITemplateDtype dtype;
+      AITemplateModelContainerGetInputDtype(handle, i, &dtype);
+      size_t dtype_size = AITemplateDtypeSizeBytes(dtype);
+      AITemplateParamShape shape;
+      AITemplateModelContainerGetMaximumInputShape(handle, i, &shape);
+
+      input_name_to_index.insert({name, i});
+      std::cout << "Loading input: " << name << ", at idx: " << i;
+
+      // Read metadata for test case
+      unsigned int read_dtype;
+      unsigned int read_dtype_size;
+      unsigned int read_ndims;
+      size_t read_total_tensor_bytes;
+      read_element(fh, read_dtype);
+      std::cout << ", dtype=" << read_dtype;
+      read_element(fh, read_dtype_size);
+      std::cout << ", sizeof(dtype)=" << read_dtype_size;
+      read_element(fh, read_ndims);
+      std::cout << ", ndims=" << read_ndims;
+
+      if (static_cast<AITemplateDtype>(read_dtype) != dtype) {
+        throw std::runtime_error(
+            "Mismatch between dtype of input in testcase data and in model");
+      }
+
+      if (dtype_size != static_cast<size_t>(read_dtype_size)) {
+        throw std::runtime_error(
+            "Mismatch between sizeof(dtype) in testcase data and in model");
+      }
+
+      // Obtain maximum shape from model and verify the testcase data has valid
+      // shape
+      if (read_ndims != shape.size) {
+        throw std::runtime_error(
+            "Mismatch between number of input dimensions in testcase data and in model");
+      }
+      std::cout << ", shape=(";
+      for (unsigned j = 0; j < read_ndims; j++) {
+        size_t dim;
+        read_element(fh, dim);
+        shape_data_owner.push_back(dim);
+        std::cout << dim << ", ";
+        if (dim > shape.shape_data[j]) {
+          throw std::runtime_error(
+              "Shape in testcase data exceeds maximum shape.");
+        }
+      }
+      std::cout << ")";
+
+      // Set the shape of the input to the actual, and not the maximum shape.
+      // the previous shape.shape_data may not be deleted as it's owned by the
+      // model.
+      shape.shape_data = shape_data_owner.data() + shape_offset;
+      shape_offset += read_ndims; // move offset to the next unused space
+
+      // total number of bytes of tensor raw data
+      read_element(fh, read_total_tensor_bytes);
+
+      size_t numel = shape.Numel();
+      size_t num_bytes = numel * AITemplateDtypeSizeBytes(dtype);
+      std::cout << ", total_tensor_bytes=" << read_total_tensor_bytes
+                << " - model expects " << num_bytes << "\n";
+      if (num_bytes != read_total_tensor_bytes) {
+        throw std::runtime_error("Tensor data total size mismatch.");
+      }
+      // allocate memory for tensor raw data on host
+      void* h_data;
+      DEVICE_CHECK(DeviceMallocHost(&h_data, num_bytes));
+      // read tensor raw data from file
+      fh.read(reinterpret_cast<char*>(h_data), read_total_tensor_bytes);
+      // Allocate corresponding device memory and copy tensor raw data to device
+      gpu_data_owner.emplace_back(RAII_DeviceMalloc(num_bytes, allocator));
+      DEVICE_CHECK(
+          CopyToDevice(gpu_data_owner.back().get(), h_data, num_bytes));
+
+      // free host memory for tensor
+      DEVICE_CHECK(FreeDeviceHostMemory(h_data));
+
+      inputs.push_back(AITData(gpu_data_owner.back().get(), shape, dtype));
+    }
+    std::cout << "Finished loading testcase inputs."
+              << "\n";
+    if (fh.peek() == std::ifstream::traits_type::eof()) {
+      std::cout << "No expected outputs in testcase."
+                << "\n";
+      return;
+    }
+    if (inputs.size() != num_inputs) {
+      throw std::runtime_error("Number of inputs mismatches with expected.");
+    }
+    // read expected outputs from file
+    for (unsigned i = 0; i < num_outputs; i++) {
+      // for each input tensor
+      const char* name;
+      AITemplateModelContainerGetOutputName(handle, i, &name);
+      AITemplateDtype dtype;
+      AITemplateModelContainerGetOutputDtype(handle, i, &dtype);
+      size_t dtype_size = AITemplateDtypeSizeBytes(dtype);
+      AITemplateParamShape shape;
+      AITemplateModelContainerGetMaximumOutputShape(handle, i, &shape);
+      AITemplateParamShape max_shape;
+      AITemplateModelContainerGetMaximumOutputShape(handle, i, &max_shape);
+
+      size_t max_numel = shape.Numel();
+      size_t max_num_bytes = max_numel * AITemplateDtypeSizeBytes(dtype);
+
+      gpu_data_owner.emplace_back(RAII_DeviceMalloc(max_num_bytes, allocator));
+      gpu_outputs.push_back(
+          AITData(gpu_data_owner.back().get(), max_shape, dtype));
+
+      std::cout << "Loading expected output: " << name << ", at idx: " << i;
+
+      // Read metadata for test case
+      unsigned int read_dtype;
+      unsigned int read_dtype_size;
+      unsigned int read_ndims;
+      size_t read_total_tensor_bytes;
+      read_element(fh, read_dtype);
+      std::cout << ", dtype=" << read_dtype;
+      read_element(fh, read_dtype_size);
+      std::cout << ", sizeof(dtype)=" << read_dtype_size;
+      read_element(fh, read_ndims);
+      std::cout << ", ndims=" << read_ndims;
+
+      if (static_cast<AITemplateDtype>(read_dtype) != dtype) {
+        throw std::runtime_error(
+            "Mismatch between dtype of input in testcase data and in model");
+      }
+
+      if (dtype_size != static_cast<size_t>(read_dtype_size)) {
+        throw std::runtime_error(
+            "Mismatch between sizeof(dtype) in testcase data and in model");
+      }
+
+      // Obtain maximum shape from model and verify the testcase data has valid
+      // shape
+      if (read_ndims != shape.size) {
+        throw std::runtime_error(
+            "Mismatch between number of input dimensions in testcase data and in model");
+      }
+      std::cout << ", shape=(";
+      for (unsigned j = 0; j < read_ndims; j++) {
+        size_t dim;
+        read_element(fh, dim);
+        shape_data_owner.push_back(dim);
+        std::cout << dim << ", ";
+        if (dim > shape.shape_data[j]) {
+          throw std::runtime_error(
+              "Shape in testcase data exceeds maximum shape.");
+        }
+      }
+      std::cout << ")";
+
+      // Set the shape of the input to the actual, and not the maximum shape.
+      // the previous shape.shape_data may not be deleted as it's owned by the
+      // model.
+      shape.shape_data = shape_data_owner.data() + shape_offset;
+      shape_offset += read_ndims; // move offset to the next unused space
+
+      // total number of bytes of tensor raw data
+      read_element(fh, read_total_tensor_bytes);
+
+      size_t numel = shape.Numel();
+      size_t num_bytes = numel * AITemplateDtypeSizeBytes(dtype);
+      std::cout << ", total_tensor_bytes=" << read_total_tensor_bytes
+                << " - model expects " << num_bytes << "\n";
+      if (num_bytes != read_total_tensor_bytes) {
+        throw std::runtime_error("Tensor data total size mismatch.");
+      }
+      // allocate memory for tensor raw data on host
+      void* h_data_expected;
+      void* h_data;
+      DEVICE_CHECK(
+          DeviceMallocHost(&h_data, max_num_bytes)); // max size required here
+      DEVICE_CHECK(DeviceMallocHost(&h_data_expected, num_bytes));
+
+      // read tensor raw data from file
+      fh.read(
+          reinterpret_cast<char*>(h_data_expected), read_total_tensor_bytes);
+
+      // ---
+      // Memory to place output tensors on host
+      host_outputs.emplace_back(h_data, shape, dtype);
+      ait_output_shapes_out.push_back(shape_data_owner.data());
+      shape_offset += read_ndims;
+      expected_outputs.emplace_back(h_data_expected, shape, dtype);
+    }
+  }
+
+  AITemplateError run(
+      AITemplateModelHandle handle,
+      AITemplateAllocator& allocator) {
+    bool graph_mode = false;
+    auto stream = RAII_StreamCreate(/*non_blocking=*/true);
+
+    return AITemplateModelContainerRunWithOutputsOnHost(
+        handle,
+        inputs.data(),
+        inputs.size(),
+        host_outputs.data(),
+        host_outputs.size(),
+        reinterpret_cast<AITemplateStreamHandle>(stream.get()),
+        graph_mode,
+        ait_output_shapes_out.data());
+  }
+
+  float benchmark(
+      AITemplateModelHandle handle,
+      AITemplateAllocator& allocator,
+      size_t count,
+      size_t num_threads) {
+    bool graph_mode = false;
+    auto stream = RAII_StreamCreate(/*non_blocking=*/true);
+    float runtime_ms = -999.0f;
+    AITemplateError err = AITemplateModelContainerBenchmark(
+        handle,
+        inputs.data(),
+        inputs.size(),
+        gpu_outputs.data(),
+        gpu_outputs.size(),
+        reinterpret_cast<AITemplateStreamHandle>(stream.get()),
+        graph_mode,
+        count,
+        num_threads,
+        true,
+        &runtime_ms,
+        ait_output_shapes_out.data());
+    if (err != AITemplateError::AITemplateSuccess) {
+      std::cout << "Benchmark failed with error " << static_cast<int>(err)
+                << std::endl;
+      return -1.0f;
+    }
+    return runtime_ms;
+  }
+
+  bool compare_results_to_expected() {
+    bool passed = true;
+    size_t num_outputs = 0;
+    AITemplateModelContainerGetNumOutputs(handle, &num_outputs);
+    for (unsigned output_idx = 0; output_idx < num_outputs; ++output_idx) {
+      switch (expected_outputs[output_idx].dtype) {
+        case AITemplateDtype::kInt:
+          passed = passed and _compare_results_to_expected<int32_t>(output_idx);
+          break;
+        case AITemplateDtype::kLong:
+          passed = passed and _compare_results_to_expected<int64_t>(output_idx);
+          break;
+        case AITemplateDtype::kFloat:
+          passed = passed and _compare_results_to_expected<float>(output_idx);
+          break;
+        case AITemplateDtype::kBFloat16:
+          passed =
+              passed and _compare_results_to_expected<bfloat16>(output_idx);
+          break;
+        case AITemplateDtype::kHalf:
+          passed = passed and _compare_results_to_expected<half>(output_idx);
+          break;
+        case AITemplateDtype::kBool:
+          passed = passed and _compare_results_to_expected<bool>(output_idx);
+          break;
+        default:
+          std::cerr << "Unsupported output dtype! "
+                    << static_cast<int>(expected_outputs[output_idx].dtype)
+                    << std::endl;
+          throw std::runtime_error("unsupported dtype for comparisons");
+      }
+    }
+    return passed;
+  }
+
+  template <typename T>
+  bool _compare_results_to_expected(unsigned output_idx) {
+    unsigned ndims = host_outputs[output_idx].shape.size;
+    // check the actual output shape
+    for (unsigned i = 0; i < ndims; ++i) {
+      if (expected_outputs[output_idx].shape.shape_data[i] !=
+          ait_output_shapes_out[output_idx][i]) {
+        std::cout
+            << "Mismatch between expected output shape and actual shape after inference of output #"
+            << i << " at dimension " << i << " expected shape[i]=="
+            << host_outputs[output_idx].shape.shape_data[i]
+            << " actual shape[i]==" << ait_output_shapes_out[output_idx][i]
+            << std::endl;
+        return false;
+      }
+    }
+    size_t numel = host_outputs[output_idx].shape.Numel();
+    T* data = reinterpret_cast<T*>(host_outputs[output_idx].ptr);
+    T* expected_data = reinterpret_cast<T*>(expected_outputs[output_idx].ptr);
+    size_t violations = 0;
+    int worst_idx = -1;
+    double worst_abs_diff = 0.0;
+
+    for (size_t i = 0; i < numel; ++i) {
+      double val = static_cast<double>(data[i]);
+      double expected = static_cast<double>(expected_data[i]);
+      double actual_diff = std::abs(val - expected);
+      double tolerated_diff = atol +
+          rtol * std::abs(expected); // as defined by torch.testing.assert_close
+      if (actual_diff > worst_abs_diff) {
+        worst_abs_diff = actual_diff;
+      }
+      if (actual_diff > tolerated_diff) {
+        violations++;
+      }
+    }
+    if (violations > 0) {
+      std::cout
+          << "Actual output and expected output are not equal for output with index "
+          << output_idx << " of " << numel << " elements, " << violations
+          << " differed by more than the tolerance of atol=" << atol
+          << " and rtol=" << rtol << rtol << "\n";
+      return false;
+    }
+    return true;
+  }
+};
+
+int run_testcase(const char* input_file, bool benchmark) {
+  std::cout << "Starting single test run with input " << input_file << "\n";
+  {
+    AITemplateModelHandle handle;
+    AITemplateModelContainerCreate(&handle, /*num_runtimes*/ 1);
+    AITemplateAllocator* allocator;
+    AIT_ERROR_CHECK(AITemplateAllocatorCreate(
+        &allocator, AITemplateAllocatorType::kDefault));
+
+    auto deleter = [](void* data) { FreeDeviceHostMemory(data); };
+    AITStandaloneTestcase test(input_file, handle, *allocator);
+
+    AIT_ERROR_CHECK(test.run(handle, *allocator));
+    std::cout << "Finished test run with input " << input_file << "\n";
+    int retval = -1;
+    if (!test.compare_results_to_expected()) {
+      std::cout << "Test failed. " << std::endl;
+      return 1;
+    }
+    std::cout << "Test succeeded. " << std::endl;
+  }
+  if (benchmark) {
+    std::cout << "Benchmarking with testcase " << input_file << "\n";
+    AITemplateModelHandle handle;
+    AITemplateModelContainerCreate(&handle, /*num_runtimes*/ 1);
+    AITemplateAllocator* allocator;
+    AIT_ERROR_CHECK(AITemplateAllocatorCreate(
+        &allocator, AITemplateAllocatorType::kDefault));
+
+    auto deleter = [](void* data) { FreeDeviceHostMemory(data); };
+    AITStandaloneTestcase benchmarker(input_file, handle, *allocator);
+    float runtime_ms = benchmarker.benchmark(handle, *allocator, 10, 1);
+    if (runtime_ms >= 0.0) {
+      std::cout << "Benchmark result: " << input_file
+                << " repetitions: 10, ms/iter: " << runtime_ms << "\n";
+    }
+  }
+
+  return 0;
+}
+
+int run_with_random_inputs() {
   AITemplateModelHandle handle;
   AITemplateModelContainerCreate(&handle, /*num_runtimes*/ 1);
   AITemplateAllocator* allocator;
@@ -296,3 +739,49 @@ int main() {
   AITemplateModelContainerDelete(handle);
   return 0;
 }
+
+int main(int argc, char* argv[]) {
+  try {
+    if (argc <= 1) {
+      std::cout
+          << "No action provided on commandline. Running model with random maximum size inputs."
+          << std::endl;
+
+      return run_with_random_inputs();
+    }
+    std::string action(argv[1]);
+    if ((action == "--help") or (action == "help")) {
+      std::cout << "AITemplate standalone test runner usage:" << std::endl
+                << " run with random input:   " << argv[0] << std::endl
+                << " run single tests:        " << argv[0]
+                << " test <testcase-file-1> ... <testcase-file-N>" << std::endl
+                << " run tests and benchmark: " << argv[0]
+                << " benchmark <testcase-file-1> ... <testcase-file-N>"
+                << std::endl;
+    }
+    if ((action == "test") or (action == "benchmark")) {
+      if (argc < 3) {
+        std::cout
+            << "Invalid number of arguments. Require at least one test case as argument"
+            << std::endl;
+      }
+      int failure_count = 0;
+      for (int i = 2; i < argc; i++) {
+        if (run_testcase(argv[i], action == "benchmark") != 0) {
+          failure_count++;
+        }
+      }
+      if (failure_count == 0) {
+        std::cout << "All tests succeeded." << std::endl;
+      } else {
+        std::cout << "Failed tests: " << failure_count << " of " << (argc - 2)
+                  << std::endl;
+      }
+      return failure_count;
+    }
+    return 0;
+  } catch (const std::exception& e) {
+    std::cerr << "Exception caught: " << e.what() << std::endl;
+    return -99;
+  }
+}
diff --git a/tests/unittest/ops/test_grouped_b2b_bmm.py b/tests/unittest/ops/test_grouped_b2b_bmm.py
index 54b3396e6..f552e5937 100644
--- a/tests/unittest/ops/test_grouped_b2b_bmm.py
+++ b/tests/unittest/ops/test_grouped_b2b_bmm.py
@@ -17,6 +17,7 @@
 """
 import itertools
 import logging
+import os
 import unittest
 from typing import List, Tuple
 
@@ -66,6 +67,7 @@ def _test_grouped_fmha_style_b2b_bmm(
         atol=1e-3,
         rtol=1e-3,
         use_fp16_acc=False,
+        write_standalone_testcase_data: bool = False,
     ):
         # Initialize AIT fmha_style_b2b_bmm operator.
         if isinstance(batch_sizes, int):
@@ -158,6 +160,7 @@ def _test_grouped_fmha_style_b2b_bmm(
         # Run tests.
         torch_dtype = string_to_torch_dtype(dtype)
         offsets_torch_dtype = string_to_torch_dtype(offsets_dtype)
+        written_testcase_idx = 0
         for batch_size, max_seq_len, num_head in itertools.product(
             sorted(set(batch_sizes)), sorted(set(max_seq_lens)), sorted(set(num_heads))
         ):
@@ -201,6 +204,18 @@ def _test_grouped_fmha_style_b2b_bmm(
                 device="cuda",
             )
             module.run_with_tensors(inputs, [y])
+            if write_standalone_testcase_data:
+                written_testcase_idx += 1
+                os.makedirs(f"./tmp/{test_name}/test_cases", exist_ok=True)
+                fname = (
+                    f"./tmp/{test_name}/test_cases/testcase.{written_testcase_idx}.data"
+                )
+                _LOGGER.info(f"Writing standalone testcase data to {fname}")
+                module.write_standalone_testcase_data(
+                    fname,
+                    inputs,
+                    [y],
+                )
 
             # Run PT reference and verify results.
             for row in range(batch_size):
@@ -339,6 +354,30 @@ def test_grouped_fmha_style_b2b_bmm_fp16(self):
             atol=1e-2,
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_grouped_fmha_b2b_bmm_profile_1(
+        self,
+    ):
+        for max_seq_len in [256]:
+            self._test_grouped_fmha_style_b2b_bmm(
+                test_name=f"grouped_fmha_b2b_bmm_profile_1_seqlen_{max_seq_len}",
+                dtype="float16",
+                batch_sizes=[
+                    4,
+                    8,
+                    16,
+                    32,
+                    64,
+                ],
+                max_seq_lens=max_seq_len,
+                num_heads=[1],
+                epilogue_math_name="SiLu",
+                causal_type=CausalType.LOWER_LEFT_EMPTY,
+                has_bias=True,
+                bias_broadcast=[True, True, False, False],
+                # write_standalone_testcase_data=True,
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/ops/test_grouped_classic_b2b_bmm.py b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
index c2d50fe81..b928ea01d 100644
--- a/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
+++ b/tests/unittest/ops/test_grouped_classic_b2b_bmm.py
@@ -67,6 +67,7 @@ def _test_grouped_classic_b2b_bmm(
         rtol=0.01,
         use_fp16_acc=False,
         random_seed=0,
+        write_standalone_testcase_data: bool = False,
     ):
         if isinstance(random_seed, list):
             random_seeds = random_seed
@@ -162,6 +163,7 @@ def _test_grouped_classic_b2b_bmm(
         torch_dtype = string_to_torch_dtype(dtype)
         offsets_torch_dtype = string_to_torch_dtype(offsets_dtype)
         y_results = {}
+        written_testcase_idx = 0
         for random_seed in random_seeds:
             torch.manual_seed(random_seed)
             for max_seq_len in sorted(set(max_seq_lens)):
@@ -215,6 +217,7 @@ def _test_grouped_classic_b2b_bmm(
                     if not has_bias:
                         bias_pt_max *= 0.0
                     results_per_batch = {}
+
                     for batch_size in batch_sizes_sorted:
                         # Initialize inputs
                         # input(f"Attach debugger if you want. {os.getpid()=}. Press Enter to continue.")
@@ -246,6 +249,16 @@ def _test_grouped_classic_b2b_bmm(
                             [total_length, num_head, head_dim_value]
                         )
                         module.run_with_tensors(inputs, [y])
+                        if write_standalone_testcase_data:
+                            written_testcase_idx += 1
+                            os.makedirs(f"./tmp/{test_name}/test_cases", exist_ok=True)
+                            fname = f"./tmp/{test_name}/test_cases/testcase.{written_testcase_idx}.data"
+                            _LOGGER.info(f"Writing standalone testcase data to {fname}")
+                            module.write_standalone_testcase_data(
+                                fname,
+                                inputs,
+                                [y],
+                            )
 
                         y_results[(batch_size, max_seq_len, num_head)] = y
                         assert torch.all(
@@ -585,6 +598,7 @@ def test_grouped_classic_b2b_bmm_profile_1(
                 random_seed=list(range(1)),
                 has_bias=True,
                 bias_broadcast=[True, True, False, False],
+                # write_standalone_testcase_data=True,
             )
 
 
From 373000f73f4a74a47985106dd34d41b468aa2d74 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 20 Jun 2023 17:03:32 -0700
Subject: [PATCH 604/638] pool2d upstream (#775)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/775

Reviewed By: chenyang78

Differential Revision: D46847589

Pulled By: ipiszy

fbshipit-source-id: 15b93e4e45fd8a7fd23330e9940578c4013d873e
---
 python/aitemplate/backend/rocm/pool2d/pool2d.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/aitemplate/backend/rocm/pool2d/pool2d.py b/python/aitemplate/backend/rocm/pool2d/pool2d.py
index ca09ce7c1..3885ecc84 100644
--- a/python/aitemplate/backend/rocm/pool2d/pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/pool2d.py
@@ -44,9 +44,7 @@
 {{indent}}                                           input_left_pads,
 {{indent}}                                           input_right_pads);
 {{indent}}if(!op.IsSupportedArgument(argument_ptr.get())) {
-{{indent}}  throw std::runtime_error(
-{{indent}}    "wrong! device_conv with the specified compilation parameters does "
-{{indent}}    "not support this Conv problem");
+{{indent}}  LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Pool problem.";
 {{indent}}}
 {{indent}}invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
 {{indent}}return;
@@ -60,13 +58,14 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
+#include "logging.h"
 #include "include/ck/utility/print.hpp"
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "include/ck/utility/reduction_operator.hpp"
-#include "include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
 
 {{instances}}
 

From f5896d0f417b8a462cffc35a7c5d664a9eadfd50 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 20 Jun 2023 17:03:48 -0700
Subject: [PATCH 605/638] upstream norm (#774)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/774

Reviewed By: chenyang78

Differential Revision: D46847587

Pulled By: ipiszy

fbshipit-source-id: af159f8b0f76112c803bc56589d4e6b41742ecc2
---
 .../backend/rocm/normalization/groupnorm.py   |  9 ++-
 .../backend/rocm/normalization/layernorm.py   | 58 ++++++++++++-------
 .../backend/rocm/normalization/norm_common.py |  8 ++-
 .../backend/rocm/normalization/softmax.py     |  9 +--
 4 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index ab8dced5e..a059fac29 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -29,7 +29,7 @@
 
 EXTRA_HEADERS = jinja2.Template(
     """
-#include "include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
 """
 )
 
@@ -127,16 +127,15 @@
         static_cast<ck::half_t *>(gamma),
         static_cast<ck::half_t *>(beta),
         static_cast<ck::half_t *>(output),
+        nullptr,
+        nullptr,
         YElementOp{}
     );
 
     if(!device_instance.IsSupportedArgument(argument_ptr.get()))
     {
-        throw std::runtime_error(
-            "wrong! device_layernorm with the specified compilation parameters does "
-            "not support this Groupnorm problem");
+        LOG(FATAL) << "wrong! " << device_instance.GetTypeString() << " with the specified compilation parameters does not support this Groupnorm problem.";
     };
-    std::string instance_name = device_instance.GetTypeString();
     auto invoker_ptr = device_instance.MakeInvokerPointer();
     invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
     return;
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index 93d2216aa..af3efcf24 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -29,7 +29,7 @@
 
 EXTRA_HEADERS = jinja2.Template(
     """
-#include "include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
 """
 )
 
@@ -65,10 +65,22 @@
 EXEC_TEMPLATE = jinja2.Template(
     """
     std::vector<ck::index_t> i_inStrides;
-
+    std::vector<ck::index_t> i_outStrides;
+    {% if input_strides is defined %}
+    i_inStrides.push_back({{input_strides[-2]}});
+    i_inStrides.push_back({{input_strides[-1]}});
+    {% else %}
     i_inStrides.push_back(N);
     i_inStrides.push_back(1);
+    {% endif %}
 
+    {% if output_strides is defined %}
+    i_outStrides.push_back({{output_strides[-2]}});
+    i_outStrides.push_back({{output_strides[-1]}});
+    {% else %}
+    i_outStrides.push_back(N);
+    i_outStrides.push_back(1);
+    {% endif %}
 
     auto device_instance = {{instance}}{};
     auto argument_ptr = device_instance.MakeArgumentPointer(
@@ -76,23 +88,22 @@
         i_inStrides,
         std::vector<ck::index_t>{0, 1},
         std::vector<ck::index_t>{0, 1},
-        i_inStrides,
+        i_outStrides,
         {1},
         {{eps}},
-        static_cast<ck::half_t *>(input),
+        static_cast<ck::half_t *>(input) + {{ input_offset if input_offset is defined else 0 }},
         static_cast<ck::half_t *>(gamma),
         static_cast<ck::half_t *>(beta),
-        static_cast<ck::half_t *>(output),
+        static_cast<ck::half_t *>(output) + {{ output_offset if output_offset is defined else 0 }},
+        nullptr,
+        nullptr,
         ck::tensor_operation::element_wise::PassThrough{}
     );
 
     if(!device_instance.IsSupportedArgument(argument_ptr.get()))
     {
-        throw std::runtime_error(
-            "wrong! device_layernorm with the specified compilation parameters does "
-            "not support this Softmax problem");
+        LOG(FATAL) << "wrong! " << device_instance.GetTypeString() << " with the specified compilation parameters does not support this Layernorm problem.";
     };
-    std::string instance_name = device_instance.GetTypeString();
     auto invoker_ptr = device_instance.MakeInvokerPointer();
     invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
     return;
@@ -238,6 +249,16 @@ def gen_function(
     """
     rank = func_attrs["inputs"][0]._rank()
     eps = func_attrs.get("eps", "1e-5")
+    input_accessor = func_attrs["input_accessors"][0]
+    output_accessor = func_attrs["output_accessors"][0]
+    input_strides = []
+    output_strides = []
+    for i, _ in enumerate(input_accessor.original_shapes):
+        input_strides.append(input_accessor.stride(i))
+        output_strides.append(output_accessor.stride(i))
+
+    input_offset = input_accessor.offset
+    output_offset = output_accessor.offset
 
     exec_path = func_attrs["exec_path"]
     op_instance = func_attrs["op_instance"]
@@ -267,7 +288,14 @@ def gen_function(
     for key, _ in instances.items():
         fname = "f" + sha1(key.encode()).hexdigest()
         program = exec_template.render(
-            instance=fname, dtype="void", reduce_dims=rank - 1, eps=eps
+            instance=fname,
+            dtype="void",
+            reduce_dims=rank - 1,
+            eps=eps,
+            input_strides=input_strides,
+            output_strides=output_strides,
+            input_offset=input_offset,
+            output_offset=output_offset,
         )
         exec_inst = exec_cond_template.render(indent="  ", cond=key, program=program)
         exec_paths += exec_inst
@@ -349,14 +377,6 @@ def layernorm_gen_function_call(func_attrs, indent="  "):
     ), f"LayerNorm only supports input with rank >= 2, current rank: {len(shapes)}"
 
     input_dim_names = [shape._attrs["name"] for shape in shapes]
-    x = func_attrs["inputs"][0]
-    xshape = x._attrs["shape"]
-
-    elem_cnt = 1
-    for shape in xshape:
-        elem_cnt *= shape._attrs["values"][0]
-    instance_size = xshape[-1]._attrs["values"][0]
-    instance_num = elem_cnt // instance_size
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
@@ -364,8 +384,6 @@ def layernorm_gen_function_call(func_attrs, indent="  "):
         gamma=gamma_name,
         beta=beta_name,
         output=output_name,
-        M=instance_num,
-        N=instance_size,
         input_dim_names=input_dim_names,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index 4f0da20e9..328fa47ec 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -184,8 +184,9 @@
     {{func_call}}
   }
   timer.End();
-  std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
-  std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
+  std::cout << "OP:" << "{{op_name}}" << ",";
+  std::cout << "TIME:" << timer.GetElapsedTime() << ",";
+  std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
 }
 """
 )
@@ -199,6 +200,7 @@
 #include <stdlib.h>
 #include <random>
 #include <rocrand/rocrand.h>
+#include "logging.h"
 #include "include/ck/utility/print.hpp"
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
@@ -339,7 +341,6 @@ def gen_profiler(
     op_instance = func_attrs["op_instance"]
     file_pairs = []
     for op_name, op in op_instance.items():
-
         config = emit_instance(op)
         config_name = extract_config_name(config)
         instances = INSTANCE_TEMPLATE.render(
@@ -381,6 +382,7 @@ def gen_profiler(
             args_parse=args_parse,
             tensor_decl=tensor_decl,
             func_call=func_call,
+            op_name=op_name,
         )
 
         prefix = os.path.join(workdir, "profiler", op_type)
diff --git a/python/aitemplate/backend/rocm/normalization/softmax.py b/python/aitemplate/backend/rocm/normalization/softmax.py
index 11a0aa85c..bc10c5e09 100644
--- a/python/aitemplate/backend/rocm/normalization/softmax.py
+++ b/python/aitemplate/backend/rocm/normalization/softmax.py
@@ -62,8 +62,8 @@
     auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths,
                                                             i_inStrides,
                                                             reduceDims,
-                                                            &alpha,
-                                                            &beta,
+                                                            alpha,
+                                                            beta,
                                                             static_cast<ck::half_t *>(input),
                                                             static_cast<ck::half_t *>(output),
                                                             ck::tensor_operation::element_wise::PassThrough{},
@@ -71,11 +71,8 @@
                                                             );
     if(!device_instance.IsSupportedArgument(argument_ptr.get()))
     {
-        throw std::runtime_error(
-            "wrong! device_softmax with the specified compilation parameters does "
-            "not support this Softmax problem");
+        LOG(FATAL) << "wrong! " << device_instance.GetTypeString() << " with the specified compilation parameters does not support this Softmax problem.";
     };
-    std::string instance_name = device_instance.GetTypeString();
     auto invoker_ptr = device_instance.MakeInvokerPointer();
     invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
     return;

From f0f676f13891ee70c4b8eecde1343ef7d28a3be6 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 22 Jun 2023 21:53:08 -0700
Subject: [PATCH 606/638] Download SD model without token by default (#769)

Summary:
Huggingface allows to download Stable Diffusion model files without user access token.

To simplify `download_pipeline.py` user experience lets not mandate user to run  `huggingface-cli login` or explicitly provide user access token.

Internally `download_pipeline.py` will call `StableDiffusionPipeline.from_petrained()` with `use_auth_token=False` by default.

New `download_pipeline.py` Help for `--token`:
```
Usage: download_pipeline.py [OPTIONS]

Options:
...
  --token TEXT           Valid values: Huggingface user access token, 'true'
                         to use token generated with 'huggingface-cli login'
                         (stored in ~/.huggingface) or empty string to not use
                         access token (default).
```

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/769

Reviewed By: chenyang78

Differential Revision: D46847505

Pulled By: ipiszy

fbshipit-source-id: bb5c4c4539758d57978ca6c87da83230cddd3de7
---
 examples/05_stable_diffusion/README.md           |  6 ++++--
 .../scripts/download_pipeline.py                 | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index ae3317c2b..aa8e3f573 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -23,7 +23,7 @@ Verify the library versions. We have tested transformers==4.25, diffusers==0.11[
 ```
 
 ### Download the diffusers pipeline files
-You must first register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens) for more info. Your access tokens are listed in your [Hugging Face account settings](https://huggingface.co/settings/tokens).
+Optionally, you can use Hugging Face access token. You can register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens) for more info. Your access tokens are listed in your [Hugging Face account settings](https://huggingface.co/settings/tokens).
 
 stable-diffusion model has two variants - base and regular.
 For example:
@@ -32,7 +32,9 @@ For example:
 
 ```
 python3 scripts/download_pipeline.py \
---model-name "stabilityai/stable-diffusion-2-1-base" \
+--model-name "stabilityai/stable-diffusion-2-1-base"
+
+# Optionally, you can use access token
 --token ACCESS_TOKEN
 ```
 
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 317e5de46..006e16531 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -21,21 +21,27 @@
 @click.option(
     "--model-name",
     default="stabilityai/stable-diffusion-2-1-base",
-    help="Pretrained Model name",
+    help="Pretrained Model name.",
+)
+@click.option(
+    "--token",
+    default="",
+    help="Valid values: Huggingface user access token, 'true' to use token "
+    "generated with 'huggingface-cli login' (stored in ~/.huggingface) "
+    "or empty string to not use access token (default).",
 )
-@click.option("--token", default="", help="access token")
 @click.option(
     "--save-directory",
     default="./tmp/diffusers-pipeline/stabilityai/stable-diffusion-v2",
-    help="pipeline files local directory",
+    help="Pipeline files local directory.",
 )
 def download_pipeline_files(model_name, token, save_directory) -> None:
+
     StableDiffusionPipeline.from_pretrained(
         model_name,
         revision="fp16",
         torch_dtype=torch.float16,
-        # use provided token or the one generated with `huggingface-cli login``
-        use_auth_token=token if token != "" else True,
+        use_auth_token=token if len(token) > 5 else token.lower() == "true",
     ).save_pretrained(save_directory)
 
 
From 9336061fd7c997ec47a63f1f2a76fc34f63c7f6a Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Sat, 24 Jun 2023 06:55:40 -0700
Subject: [PATCH 607/638] Fix invalid escape sequence (#790)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/790

Fixes:
```
/re_cwd/buck-out/v2/gen/fbcode/3c31cd45381cab49/aitemplate/AITemplate/fb/pt-ops/fmha_attention/__test_fmha_attention_op__/test_fmha_attention_op#link-tree/aitemplate/backend/cuda/target_def.py:393: DeprecationWarning: invalid escape sequence '\,'
```

Reviewed By: kadeng

Differential Revision: D46947138

fbshipit-source-id: 81c77e3f094f3b1cb13448be3f37c4492591429a
---
 python/aitemplate/backend/cuda/target_def.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 21489cd9d..0c63e4270 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -390,7 +390,7 @@ def _build_compile_options(self):
                 self.nvcc_options_json["args"]
                 + ["-I" + path for path in include_paths]
                 + [
-                    f"-Xcompiler '-Wp\,@{fb_include_path}'",  # noqa: W605
+                    f"-Xcompiler '-Wp\\,@{fb_include_path}'",
                     "-Xcompiler -Wno-strict-aliasing",
                     "-Xcompiler -Wno-narrowing",
                     "-Xcompiler -Wno-error=maybe-uninitialized",

From 57c5e03cbf916f8bf939844b191e1be673d8a15a Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Sat, 24 Jun 2023 10:32:31 -0700
Subject: [PATCH 608/638] Re-sync with internal repository (#793)

Co-authored-by: Facebook Community Bot <6422482+facebook-github-bot@users.noreply.github.com>
---
 .../backend/cuda/tensor/__init__.py           |   2 +
 python/aitemplate/backend/cuda/tensor/cast.py | 155 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 python/aitemplate/compiler/ops/tensor/cast.py |  81 +++++++++
 tests/unittest/ops/test_cast.py               |  97 +++++++++++
 5 files changed, 336 insertions(+)
 create mode 100644 python/aitemplate/backend/cuda/tensor/cast.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/cast.py
 create mode 100644 tests/unittest/ops/test_cast.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index ab5f5ffe8..08dcceaee 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -18,6 +18,7 @@
 from aitemplate.backend.cuda.tensor import (
     argmax,
     batch_gather,
+    cast,
     concatenate,
     concatenate_tanh,
     dynamic_slice,
@@ -42,6 +43,7 @@
 __all__ = [
     "argmax",
     "batch_gather",
+    "cast",
     "concatenate",
     "concatenate_tanh",
     "dynamic_slice",
diff --git a/python/aitemplate/backend/cuda/tensor/cast.py b/python/aitemplate/backend/cuda/tensor/cast.py
new file mode 100644
index 000000000..bab6388fc
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/cast.py
@@ -0,0 +1,155 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import gen_int_var_product_str
+
+CUDA_HEADER_FILES = """
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+"""
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+
+    """
+)
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void invoke_{{func_name}}(
+    void* y,
+    const void* x,
+    {{index_type}} n_elements,
+    {{prefix}}Stream_t stream);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}const {{index_type}} {{func_name}}_n_elements = {{calculate_n}};
+    {{indent}}invoke_{{func_name}}({{output}}, {{input}},  {{func_name}}_n_elements, stream);
+{{indent}}}
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+__global__  void cast_op(
+    {{output_type}}* output,
+    const {{input_type}}* input,
+    {{index_type}} n_elements
+) {
+    const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx >= n_elements) {
+        return;
+    }
+    output[idx] = {{cast_func_call}}
+  }
+
+}  // namespace
+
+void invoke_{{func_name}}(void* output, const void* input,
+    {{index_type}} n_elements, {{prefix}}Stream_t stream) {
+    if (n_elements == 0) {
+      return;
+    }
+    int grid_size = static_cast<int>(std::ceil(static_cast<double>(n_elements) / N_THREADS_PER_BLOCK));
+    cast_op<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(
+        reinterpret_cast<{{output_type}}*>(output),
+        reinterpret_cast<const {{input_type}}*>(input),
+        n_elements
+    );
+}
+    """
+)
+
+CAST_FUNCS = {
+    "half": {
+        "bfloat16": "__float2bfloat16_rn(__half2float(input[idx]));",
+        "float": "__half2float(input[idx]);",
+    },
+    "bfloat16": {
+        "half": "__float2half_rn(__bfloat162float(input[idx]));",
+        "float": "__bfloat162float(input[idx]);",
+    },
+    "float": {
+        "bfloat16": "__float2bfloat16_rn(input[idx]);",
+        "half": "__float2half_rn(input[idx]);",
+    },
+}
+
+
+@registry.reg("cuda.cast.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    input_ = func_attrs["inputs"][0]
+    output = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+    output_dtype = output.dtype()
+    output_type = backend_spec.dtype_to_backend_type(output_dtype)
+    input_type = backend_spec.dtype_to_backend_type(input_.dtype())
+    cast_func_call = CAST_FUNCS[input_type][output_type]
+
+    return FUNC_TEMPLATE.render(
+        header_files=backend_spec.header_src_template.render(
+            extra_header=CUDA_HEADER_FILES
+        ),
+        constant=CONSTANT_TEMPLATE.render(),
+        func_name=func_attrs["name"],
+        input_type=input_type,
+        output_type=output_type,
+        index_type=backend_spec.index_type,
+        cast_func_call=cast_func_call,
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("cuda.cast.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = CUDASpec()
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("cuda.cast.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    backend_spec = CUDASpec()
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=func_attrs["outputs"][0]._attrs["name"],
+        input=func_attrs["inputs"][0]._attrs["name"],
+        calculate_n=gen_int_var_product_str(func_attrs["inputs"][0].shape()),
+        index_type=backend_spec.index_type,
+        indent=indent,
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 569a82aef..265f3293d 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -18,6 +18,7 @@
 """
 from aitemplate.compiler.ops.tensor.argmax import argmax
 from aitemplate.compiler.ops.tensor.batch_gather import batch_gather
+from aitemplate.compiler.ops.tensor.cast import cast
 from aitemplate.compiler.ops.tensor.chunk import chunk
 from aitemplate.compiler.ops.tensor.concatenate import concatenate
 from aitemplate.compiler.ops.tensor.concatenate_tanh import concatenate_tanh
diff --git a/python/aitemplate/compiler/ops/tensor/cast.py b/python/aitemplate/compiler/ops/tensor/cast.py
new file mode 100644
index 000000000..1850367a6
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/cast.py
@@ -0,0 +1,81 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
+
+
+class cast(Operator):
+    """
+    Returns the cast of input tensor to specified type.
+    Only the conversion between any pair of float16, bfloat16,
+    and float32 dtypes is supported.
+
+    Args:
+        x (Tensor): the source tensor
+        dtype (str): the target type for the cast operator
+
+    Returns:
+        Tensor: a tensor with the type converted to the
+        specified dtype.
+
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self._attrs["op"] = "cast"
+        self._attrs["has_profiler"] = False
+
+    def __call__(
+        self,
+        x: Tensor,
+        dtype: str = "bfloat16",
+    ) -> Tensor:
+        x_dtype = normalize_dtype(x._attrs["dtype"])
+        dtype = normalize_dtype(dtype)
+        if x_dtype not in ("float16", "bfloat16", "float32"):
+            raise TypeError(
+                f"Expected dtype for x must be float16,bfloat16 or float32 , but got {x_dtype}."
+            )
+
+        if dtype not in ("float16", "bfloat16", "float32"):
+            raise TypeError(
+                f"Expected dtype to cast must be float16,bfloat16 or float32 , but got {dtype}."
+            )
+        if dtype == x_dtype:
+            return x
+
+        self._attrs["inputs"] = [x]
+        self._attrs["cast_dtype"] = dtype
+        self._set_depth()
+
+        output_shape = x._attrs["shape"]
+        output = Tensor(
+            output_shape,
+            src_ops={self},
+            dtype=dtype,
+        )
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = f"{target.name()}.{self._attrs['op']}.gen_function"
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_cast.py b/tests/unittest/ops/test_cast.py
new file mode 100644
index 000000000..715befc73
--- /dev/null
+++ b/tests/unittest/ops/test_cast.py
@@ -0,0 +1,97 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+from aitemplate.utils.torch_utils import string_to_torch_dtype
+from parameterized import param, parameterized
+
+
+class TestCast(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def _test_cast(
+        self,
+        shape,
+        dtype="float32",
+        cast_dtype="bfloat16",
+        test_name="cast",
+    ) -> None:
+        if not isinstance(shape, list):
+            shape = [shape]
+
+        X = Tensor(
+            shape=shape,
+            name="X",
+            dtype=dtype,
+            is_input=True,
+        )
+
+        Y = ops.cast()(X, cast_dtype)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        module = compile_model(Y, target, "./tmp", f"{test_name}_{self._test_id}")
+        self._test_id += 1
+
+        x = get_random_torch_tensor(shape, dtype=dtype)
+        y = get_torch_empty_tensor(shape, dtype=cast_dtype)
+        inputs = {"X": x}
+        outputs = {"Y": y}
+        module.run_with_tensors(inputs, outputs)
+
+        y_pt = x.to(string_to_torch_dtype(cast_dtype))
+        torch.testing.assert_close(y, y_pt, atol=1e-2, rtol=1e-2)
+
+    @parameterized.expand(
+        [
+            param(1, "float16", "bfloat16", [1], "float16_to_bfloat16"),
+            param(2, "float16", "float32", [10, 20], "float16_to_float32"),
+            param(3, "bfloat16", "float16", [10, 20, 30], "bfloat16_to_float16"),
+            param(4, "bfloat16", "float32", 123, "bfloat16_to_float32"),
+            param(5, "float32", "float16", [20, 30], "float32_to_float16"),
+            param(6, "float32", "bfloat16", [1, 128], "float32_to_bfloat16"),
+        ]
+    )
+    def test_cast(
+        self,
+        i,
+        dtype,
+        cast_dtype,
+        shape,
+        test_name,
+    ):
+        self._test_cast(
+            shape=shape,
+            dtype=dtype,
+            cast_dtype=cast_dtype,
+            test_name=test_name,
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From b5bd10d6041dd02443da76a57567d6d035a099fe Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Sat, 24 Jun 2023 23:46:34 -0700
Subject: [PATCH 609/638] tensor upstream (#776)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/776

Reviewed By: chenyang78

Differential Revision: D46880982

Pulled By: ipiszy

fbshipit-source-id: 63f9b0aa818fb6d11d9096e843c93bcd783dfa5a
---
 .../backend/rocm/embedding/bert_embeddings.py |   2 +-
 .../backend/rocm/tensor/__init__.py           |   3 +
 .../aitemplate/backend/rocm/tensor/expand.py  | 308 ++++++++++++++
 .../rocm/tensor/expand_static_shape.py        | 386 ++++++++++++++++++
 python/aitemplate/backend/rocm/tensor/full.py | 148 +++++++
 .../backend/rocm/tensor/identity.py           |  15 +-
 .../aitemplate/backend/rocm/tensor/repeat.h   | 188 +++++++++
 7 files changed, 1048 insertions(+), 2 deletions(-)
 create mode 100644 python/aitemplate/backend/rocm/tensor/expand.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/expand_static_shape.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/full.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/repeat.h

diff --git a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
index 736845d30..b41891226 100644
--- a/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
+++ b/python/aitemplate/backend/rocm/embedding/bert_embeddings.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 """
-bert_embeddings kernel codegen for CUDA.
+bert_embeddings kernel codegen for ROCM.
 """
 
 import math
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 2993c648c..62e7f3b00 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -21,6 +21,9 @@
     concatenate,
     concatenate_tanh,
     dynamic_slice,
+    expand,
+    expand_static_shape,
+    full,
     identity,
     permute021,
     permute0213,
diff --git a/python/aitemplate/backend/rocm/tensor/expand.py b/python/aitemplate/backend/rocm/tensor/expand.py
new file mode 100644
index 000000000..4e3951156
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/expand.py
@@ -0,0 +1,308 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+expand op general ROCM implementation with complete dynamic shape support
+"""
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.rocm.tensor import expand_static_shape  # noqa: F401
+
+
+@registry.reg("rocm.expand.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    if func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]:
+        func = registry.get("rocm.expand.static.func_decl")
+        return func(func_attrs)
+    x = func_attrs["inputs"][0]
+    func_name = func_attrs["name"]
+    rocm_spec: ROCMSpec = ROCMSpec()
+    index_type = rocm_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
+    dt = x.dtype()
+    dtype = rocm_spec.dtype_to_backend_dtype.get(dt, None)
+    assert (
+        dtype is not None
+    ), f"ROCM implementation does not support dtype {x.dtype()} (yet)"
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,  # name of the function
+        dtype=dtype,  # data type of the input and output tensor elements ( valid ROCM C type like float ))
+        index_type=index_type,
+    )
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+  const void* src,
+  const {{index_type}}* input_dims,
+  const {{index_type}} input_rank,
+  void* dst,
+  {{index_type}}* output_dims, // written to ( runtime shape inference )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types,
+  hipStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+#include "logging.h"
+
+using bfloat16 = hip_bfloat16;
+
+{% if index_type=="int64_t" %}
+#define DIM_TYPE_ADD 0l
+#define DIM_TYPE_EXPAND 1l
+#define DIM_TYPE_KEEP 2l
+
+#define MAX_THREADS_PER_BLOCK 1024l
+#define MAX_BLOCKS 65535l
+#define MAX_X_BLOCKS 2147483647l
+{% else %}
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024
+#define MAX_BLOCKS 65535
+#define MAX_X_BLOCKS 2147483647
+{% endif %}
+
+// integer ceil division
+#define INT_CEIL_DIV(a,b) (((a) + (b) - 1) / (b))
+#define INT_MIN(a,b) ((a) < (b)? (a) : (b))
+
+/**
+ * Sequential write expand kernel.
+ * This kernel deals with the general case ( strided copy ).
+ * It relies heavily on L2 cache for scattered read optimization and
+ * writes sequentially.
+ */
+__global__ void {{func_name}}_sequential_write_kernel(
+
+  const {{dtype}}* src, // source tensor
+  {{dtype}}* dst, // destination tensor
+  const {{index_type}} dst_numel // number of elements in dst
+  {% for i in range(output_rank) %}
+        ,const {{index_type}} output_strides_{{i}} // Stride for writing dimension {{i}} to dst
+        ,const {{index_type}} read_strides_{{i}} // Stride for reading dimension {{i}} from src
+  {% endfor %}
+  ) {
+    // determine our range of elements to read
+    {{index_type}} write_idx = threadIdx.x + blockDim.x * blockIdx.x;
+    const {{index_type}} grid_stride = gridDim.x*blockDim.x;
+    for (;write_idx<dst_numel;write_idx += grid_stride) {
+      {{index_type}} read_idx = 0;
+      {{index_type}} remaining_idx = write_idx; // Used to calculate remainder
+      {% for i in range(output_rank) %}
+          read_idx += (remaining_idx / output_strides_{{i}}) * read_strides_{{i}};
+          remaining_idx %= output_strides_{{i}};
+      {% endfor %}
+      dst[write_idx] = src[read_idx];
+    }
+}
+
+/**
+ * Expand Operator entry point with support for dynamic shapes
+ */
+void {{func_name}} (
+  const void* src, // input tensor
+  const {{index_type}}* input_dims, // input dimensions ( passed by value )
+  const {{index_type}} input_rank,
+  void* dst, // output tensor
+  {{index_type}}* output_dims, // output dimensions ( passed by value )
+  const {{index_type}} output_rank,
+  const {{index_type}}* output_dim_types, // Output dim types ( length=output_rank ). 2 = keep dimension, 1 = expand dimension, 0 = add dimension
+  hipStream_t stream)
+{
+  // Calculate number of input elements
+  {{index_type}} input_numel = 1;
+  {{index_type}} i;
+  for (i = 0; i < input_rank; ++i) {
+    input_numel *= input_dims[i];
+  }
+  if (input_numel==0) {
+    return;
+  }
+  {{index_type}} input_dim_pos = 0;
+
+  // Calculate number of output dimensions
+  {{index_type}} output_numel = 1;
+  for (i = 0; i < output_rank; ++i) {
+    output_numel *= output_dims[i];
+  }
+  if (output_numel==0) {
+    return;
+  }
+  // Determine stride for each input dimension
+  {{index_type}} input_strides[input_rank];
+  input_strides[input_rank-1] = 1;
+  for (i=input_rank-2;i>=0;--i) {
+    input_strides[i] = input_strides[i+1]*input_dims[i+1];
+  }
+  // Determine stride for each output dimension
+  {{index_type}} output_strides[output_rank];
+  output_strides[output_rank-1] = 1;
+  for (i=output_rank-2;i>=0;--i) {
+    output_strides[i] = output_strides[i+1]*(output_dims[i+1]);
+  }
+
+  // Determine read strides for each output dimension
+  // (0 for expand or add dims, otherwise the stride of
+  // of the corresponding input dim)
+  {{index_type}} read_strides[output_rank];
+
+  input_dim_pos = 0;
+  for (i = 0; i < output_rank; ++i) {
+    {{index_type}} dim_type =  output_dim_types[i];
+    if (dim_type == DIM_TYPE_KEEP ) { // keep
+      read_strides[i] = input_strides[input_dim_pos++];
+    } else {
+      read_strides[i] = 0;
+      if (dim_type==DIM_TYPE_EXPAND) {
+        input_dim_pos++;
+      }
+    }
+  }
+  assert(input_dim_pos==input_rank);
+
+  // Calculating tail dimension in order to determine whether we can do sequential batching
+  {{index_type}} tail_dim = 1;
+  for (i = output_rank-1; i >= 0; --i) {
+      if (output_dim_types[i]!=DIM_TYPE_KEEP) {
+         break;
+      }
+      tail_dim *= output_dims[i];
+  }
+
+  // determine ROCM kernel grid layout. Tuning numbers determined experimentally
+  {{index_type}} thread_size_x = INT_MIN(output_numel, MAX_THREADS_PER_BLOCK); // more threads per block maximize L1 cache utilization
+  {{index_type}} block_size_x = INT_MIN(INT_CEIL_DIV(output_numel, thread_size_x), 4096l ); //
+
+  // for very large dimensions, we rely on grid-stride loop and save the block launch overhead
+  dim3 dimGrid(block_size_x, 1, 1);
+  dim3 dimBlock(thread_size_x, 1, 1);
+  {{func_name}}_sequential_write_kernel<<<dimGrid,dimBlock,0,stream>>>(
+      static_cast<const {{dtype}}*>(src),
+      static_cast<{{dtype}}*>(dst),
+      output_numel
+      {% for i in range(output_rank) %}
+        ,output_strides[{{i}}]
+        ,read_strides[{{i}}]
+      {% endfor %}
+  );
+}
+"""
+)
+
+
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    rocm_spec: ROCMSpec = ROCMSpec()
+    dtype = rocm_spec.dtype_to_backend_dtype.get(x.dtype(), None)
+    assert (
+        dtype is not None
+    ), f"ROCM implementation does not support dtype {x.dtype()} (yet)"
+
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    index_type = rocm_spec.dtype_to_backend_dtype.get(
+        func_attrs.get("index_type", "int64"), None
+    )
+    assert index_type is not None
+
+    input_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in xshape]
+    )
+    output_dims = ",".join(
+        [f"static_cast<{index_type}>(" + dim._attrs["name"] + ")" for dim in yshape]
+    )
+    input_rank = len(xshape)
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "input_dims": input_dims,  # list of input dimensions (as string of comma-separated variable names )
+        "output_dims": output_dims,  # output dimensions (as string of comma-separated variable names)
+        "input_rank": input_rank,  # number of input dimensions
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid ROCM C type like float ))
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+    }
+
+
+@registry.reg("rocm.expand.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+    else:
+        func = registry.get("rocm.expand.static.gen_function")
+        return func(func_attrs)
+
+
+@registry.reg("rocm.expand.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
+    if not (
+        func_attrs["optimize_fixed_dims"] and func_attrs["non_head_dims_are_fixed"]
+    ):
+        return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+    else:
+        func = registry.get("rocm.expand.static.func_call")
+        return func(func_attrs, indent)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}const {{index_type}} input_dims[] = { {{input_dims}} };
+    {{indent}}{{index_type}} output_dims[] = { {{output_dims}} };
+    {{indent}}const {{index_type}} output_dim_types[] = { {{dim_types}} };
+    {{indent}}{{func_name}}(
+    {{indent}}    {{src}},
+    {{indent}}    input_dims,
+    {{indent}}    {{input_rank}},
+    {{indent}}    {{dst}},
+    {{indent}}    output_dims,
+    {{indent}}    {{output_rank}},
+    {{indent}}    output_dim_types,
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/rocm/tensor/expand_static_shape.py b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
new file mode 100644
index 000000000..730cbbcef
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/expand_static_shape.py
@@ -0,0 +1,386 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+Specialized and optimized ROCM kernel declarations for the `expand` operator
+dealing with the most common case that the input and target shapes are known at compile time,
+with the possible exception of leading dimensions.
+
+"""
+
+import math
+import os
+from itertools import accumulate
+from operator import mul
+from typing import Any, Dict, List
+
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import ROCMSpec
+from aitemplate.backend.target import Target
+from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
+
+
+@registry.reg("rocm.expand.static.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    return FUNC_DECL_TEMPLATE.render(create_template_args(func_attrs))
+
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  hipStream_t stream);
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+#include <limits>
+#include <stdexcept>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+#include "logging.h"
+
+
+using bfloat16 = hip_bfloat16;
+
+#define DIM_TYPE_ADD 0
+#define DIM_TYPE_EXPAND 1
+#define DIM_TYPE_KEEP 2
+
+#define MAX_THREADS_PER_BLOCK 1024l
+// integer ceil division
+#define INT_CEIL_DIV(a, b) (((a) + (b)-1) / (b))
+
+// Maximum amount of shared memory that the repeat copy kernel(s) should use.
+// (used within repeat.cuh, included below )
+// Note: 44kb is sufficient in this case to fully utilize the GPU parallelism
+#define SHM_MAX 1024 * 44
+namespace{
+{{custom_libs}}
+
+/**
+ * Get read base offset (e.g. excluding tail offset) in the middle part, given a write offset
+ * into the middle part
+ */
+__forceinline__ __device__ {{index_type}} {{func_name}}_get_read_offset(const {{index_type}} write_offset) {
+    {{index_type}} read_idx = 0;
+    {{index_type}} remaining_write_idx = write_offset; // assert < {{mid_size*tail_size}} ( i.e. < mid_size*tail_size)
+    {% for i in range(head_dim_count, head_dim_count+mid_dim_count-1) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+        remaining_write_idx %= {{output_strides[i]}}l;
+    {% endfor %}
+    {% for i in range(head_dim_count+mid_dim_count-1, head_dim_count+mid_dim_count) %}
+        {% if read_strides[i]!=0 %}
+    read_idx += (remaining_write_idx / {{output_strides[i]}}l) * {{read_strides[i]}}l;
+        {% endif %}
+    {% endfor %}
+    return read_idx;
+}
+
+/**
+ *  Copies tail elements from a contiguous source memory region into a contiguous target memory region
+ *  Using a grid-stride loop and the vectorized dtype
+ *
+ * see https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ */
+__forceinline__ __device__ void tail_copy(
+        const {{dtype}} * const src, // base src tensor memory pointer
+        const {{index_type}} read_offset, // base offset into src, via {{dtype}}-typed indexing
+        {{dtype}} * const dst,  // base destination tensor memory pointer
+        const {{index_type}} write_offset, // Base offset into dst via {{dtype}}-typed indexing
+        const {{index_type}} block_thread_index,
+        const {{index_type}} block_thread_count,
+        const {{index_type}} copy_numel
+    ) {
+    for ({{index_type}} i=block_thread_index;i<copy_numel;i+=block_thread_count) {
+        dst[write_offset+i] = src[read_offset+i];
+    }
+}
+
+/**
+ * Implement the "middle" part of the kernel, dealing with strided reads/writes.
+ * Also utilizes grid-stride loop for efficiency and flexibility
+ * see
+ * * https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ * * https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#coalesced-access-to-global-memory
+ * * and https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#strided-accesses
+ * for a more detailed explanation of the reasons for the choice of this specific form.
+ *
+ * Performance notes:
+ *
+ * It is critical to calculate the block_thread_index passed to tail_copy(..) based on
+ * the x-dimension of the launch grid, in order to benefit from Warp memory access coalescing.
+ *
+ */
+__global__ void expand_strided_copy(
+
+  const {{dtype}}* const src, // source tensor
+  {{dtype}}* const dst // destination tensor
+  ) {
+    // determine our range of elements to read
+    const {{index_type}} write_offset = (blockDim.y * blockIdx.y + threadIdx.y) * {{tail_size}}l;
+    const {{index_type}} read_offset = {{func_name}}_get_read_offset(write_offset);
+    const {{index_type}} grid_size_x = gridDim.x*blockDim.x;
+    const {{index_type}} grid_size_y = gridDim.y*blockDim.y;
+    const {{index_type}} step_size_y = grid_size_y * {{tail_size}}l;
+    const {{index_type}} thread_idx_x = blockDim.x * blockIdx.x + threadIdx.x;
+    for ({{index_type}} i=write_offset;i<{{mid_size*tail_size}}l;i+=step_size_y) {
+        tail_copy(src, read_offset, dst, write_offset, thread_idx_x, grid_size_x, {{tail_size}}l);
+    }
+
+}
+}
+/**
+ * Expand Operator entry point, optimized for static shapes. Only the head dimension may be dynamic.
+ */
+void {{func_name}} (
+  const {{dtype}}* const src, // input tensor
+  {{dtype}}* const dst, // output tensor
+  const {{index_type}} head_size, // how many times to repeat the first part of the tensor.
+  hipStream_t stream)
+{
+  if ((({{mid_size*tail_size}})==0) || (head_size==0)) {
+    return;
+  }
+  {% if mid_dim_count>0 %}
+  // we have middle dimensions which involve non-contiguous reads
+  // so we need to invoke the middle kernel
+  dim3 dimGrid({{grid_blocks_x}}, {{grid_blocks_y}});
+  dim3 dimBlock({{grid_threads_x}}, {{grid_threads_y}});
+  expand_strided_copy<<<dimGrid,dimBlock,0,stream>>>(src, dst);
+  if (head_size>1l) {
+     // now repeat copy what we already built once, multiple times into the rest of the output tensor
+     cuda_repeat_head(dst, {{mid_size*tail_size}}l*sizeof({{dtype}}),head_size-1, stream);
+  }
+  {% else %}
+    // we have no middle dimensions, so strided copy is unneccessary.
+    // All we need to do is repeatedly copy the source multiple times
+    // repeat the entire thing a dynamic number of times ( e.g. head_size times )
+    cuda_repeat_src(src, dst, {{mid_size*tail_size}}l*sizeof({{dtype}}), head_size, stream);
+  {% endif %}
+}
+"""
+)
+
+
+def _ceil(num: float) -> int:
+    return int(math.ceil(num))
+
+
+def create_template_args(
+    func_attrs: Dict[str, Any], indent: str = "  "
+) -> Dict[str, Any]:
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+    dst = y._attrs["name"]
+    src = x._attrs["name"]
+    func_name = func_attrs["name"]
+    # Efficient vectorized & buffered repeat copy implementation,
+    # even for odd shapes
+    custom_libs = Target.current().get_custom_libs(
+        os.path.dirname(__file__), "repeat.h"
+    )
+    rocm_spec = ROCMSpec()
+    dtype = rocm_spec.dtype_to_backend_dtype[x.dtype()]
+    assert (
+        dtype is not None
+    ), f"ROCM implementation does not support dtype {x.dtype()} (yet)"
+    dtype2 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 2, None)
+    dtype4 = rocm_spec.type_for_size.get(rocm_spec.sizeof_types[dtype] * 4, None)
+    xshape = x._attrs["shape"]
+    yshape = y._attrs["shape"]
+    dim_types: List[ExpandDimensionType] = func_attrs["dim_types"]
+    index_type = "int64_t"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in xshape
+    ), "All input shapes need to be fixed"
+    assert all(
+        dim.lower_bound() == dim.upper_bound() for dim in yshape
+    ), "All output shapes need to be fixed"
+
+    # Calculate number of times we can repeatedly copy the entire result, based on how many add, expand and singleton dimensions
+    # we have at the start
+    head_size_lower = 1  # Number of times we can batch-repeat the entire result in an efficient batch-copying manner
+    head_size_upper = 1
+    head_dim_count = 0  # Number of head dimensions
+
+    for dim_type, dim in zip(func_attrs["dim_types"], yshape):
+        if dim_type == ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        head_size_lower *= dim.lower_bound()
+        head_size_upper *= dim.upper_bound()
+        head_dim_count += 1
+
+    # Create a symbolic term for calculating head size ( e.g. repeat count )
+    if head_size_lower == head_size_upper:
+        head_size_symbolic = f"{head_size_upper}l"
+    else:
+        head_size_symbolic = "*".join(
+            [
+                f"static_cast<{index_type}>(" + dim._attrs["name"] + ")"
+                for dim in yshape[:head_dim_count]
+            ]
+        )
+
+    # Calculate number of tail elements, e.g. number of elements we can batch-copy in the inner loop
+    # via effective sequential reads & writes
+    tail_dim_count = 0  # number of tail dimensions
+    tail_size = 1  # Number of the elements in all these  tail dimensions
+    for dim_type, dim in reversed(
+        list(zip(dim_types[head_dim_count:], yshape[head_dim_count:]))
+    ):
+        if dim_type != ExpandDimensionType.KEEP_DIM and dim.lower_bound() != 1:
+            break
+        tail_dim_count += 1
+        tail_size *= dim.lower_bound()
+
+    input_strides = list(
+        reversed(
+            list(accumulate([1] + [d.lower_bound() for d in reversed(xshape)], mul))
+        )
+    )
+    output_strides = list(
+        reversed(
+            list(
+                accumulate(
+                    [1] + [d.lower_bound() for d in reversed(yshape[head_dim_count:])],
+                    mul,
+                )
+            )
+        )
+    )
+
+    output_numel = output_strides[
+        0
+    ]  # this does not include the number of elements obtained from head repetitions
+    # since we have excluded head dimensions above
+    input_numel = input_strides[0]
+    if tail_size > 0:
+        mid_size = output_numel // tail_size
+    else:
+        mid_size = 0
+    mid_dim_count = len(yshape) - tail_dim_count - head_dim_count
+    if input_numel > 0:
+        mid_expansion_rate = mid_size * tail_size // input_numel
+    else:
+        mid_expansion_rate = 1
+
+    # remove the first dimension, which is the total number of elements
+    # and prepend the head_dims with stride 0
+    output_strides = [0] * head_dim_count + output_strides[1:]
+    input_strides = input_strides[1:]
+
+    input_stride_pos = 0
+    read_strides = [0] * len(yshape)
+    for i in range(len(yshape)):
+        if dim_types[i] == ExpandDimensionType.ADD_DIM:
+            continue
+        if dim_types[i] == ExpandDimensionType.KEEP_DIM:
+            read_strides[i] = input_strides[input_stride_pos]
+        # For keep dim, read stride remains at zero
+        input_stride_pos += 1
+
+    assert input_stride_pos == len(
+        xshape
+    ), "Incorrect number of keep and expand dims. Something went wrong."
+    output_rank = len(yshape)
+    dim_types = ",".join([str(int(dt)) for dt in func_attrs["dim_types"]])
+
+    # If tail size is aligned to 2 or 4 elements, we can vectorize reads/writes
+    # Note: Further vectorization not easily possible, given that it could happen that
+    # the read offset and the write offset can get different alignments within the expand op
+    #
+    if (tail_size % 4 == 0) and (dtype4 is not None):
+        dtype = dtype4
+        tail_size = tail_size // 4
+        output_strides = [s // 4 for s in output_strides]
+        read_strides = [s // 4 for s in read_strides]
+    elif tail_size % 2 == 0:
+        dtype = dtype2
+        tail_size = tail_size // 2
+        output_strides = [s // 2 for s in output_strides]
+        read_strides = [s // 2 for s in read_strides]
+
+    grid_blocks_x = 1
+    grid_threads_x = max(1, min(tail_size, 64))
+    max_y_threads = 1024 // grid_threads_x  # guaranteed to be >= 1
+    grid_threads_y = max(
+        1, min(max_y_threads, mid_size)
+    )  # so that  mid_grid_threads_x*max_x_threads <= 1024
+    grid_blocks_y = _ceil(mid_size / grid_threads_y)
+
+    if dtype == "bfloat16":
+        # bfloat16 is not available in model-generated.h as a type,
+        # so we can either just declare the input to be void*
+        # or  just use the fact that we don't care about how to interpret the value
+        # and just treat it like every other 16 bit type.
+        dtype = "half"
+
+    return {
+        "func_name": func_name,  # name of the function
+        "dst": dst,  # name of the output tensor (of type dtype*)
+        "src": src,  # name of the input tensor (of type dtype*)
+        "output_strides": output_strides,  # list of output stride values
+        "read_strides": read_strides,  # list of read stride values
+        "tail_dim_count": tail_dim_count,  # number of tail dimensions
+        "tail_size": tail_size,  # number of elements in all these tail dimensions
+        "head_dim_count": head_dim_count,  # number of head dimensions
+        "head_size": head_size_symbolic,  # number of elements in all these head dimensions
+        "mid_dim_count": mid_dim_count,
+        "mid_size": mid_size,
+        "mid_expansion_rate": mid_expansion_rate,  # How many times do we read the input for the middle
+        "output_rank": output_rank,  # number of output dimensions
+        "dim_types": dim_types,  # list of output dimension types: 2 = keep, 1 = expand, 0 = add
+        "dtype": dtype,  # data type of the input and output tensor elements ( valid ROCM C type like float )
+        "indent": indent,  # indentation for the function call template,
+        "index_type": index_type,
+        "grid_blocks_y": grid_blocks_y,  # number of y grid blocks in the strided copy kernel
+        "grid_blocks_x": grid_blocks_x,  # number of x grid blocks in the strided copy kernel
+        "grid_threads_y": grid_threads_y,  # number of y threads per grid block in the strided copy kernel
+        "grid_threads_x": grid_threads_x,  # number of x threads per grid block in the strided copy kernel
+        "custom_libs": custom_libs,  # custom library path, e.g. path to repeat.h
+    }
+
+
+@registry.reg("rocm.expand.static.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    return SRC_TEMPLATE.render(create_template_args(func_attrs, "    "))
+
+
+@registry.reg("rocm.expand.static.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent: str = "  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(create_template_args(func_attrs, indent))
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+    {
+    {{indent}}{{func_name}}(
+    {{indent}}    static_cast<{{dtype}}*>({{src}}),
+    {{indent}}    static_cast<{{dtype}}*>({{dst}}),
+    {{indent}}    {{head_size}},
+    {{indent}}    stream);
+    }
+    """
+)
diff --git a/python/aitemplate/backend/rocm/tensor/full.py b/python/aitemplate/backend/rocm/tensor/full.py
new file mode 100644
index 000000000..da462a93f
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/full.py
@@ -0,0 +1,148 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+
+HIP_HEADER_FILES = """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+"""
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+
+const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}});
+    """
+)
+
+
+FUNC_DECL = jinja2.Template(
+    """
+void invoke_{{func_name}}(
+    void*,  /* output */
+    {{prefix}}Stream_t  /* stream */
+);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}invoke_{{func_name}}(
+{{indent}}    {{output}},
+{{indent}}    stream
+{{indent}});
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+__global__  void full(
+    {{read_type}}* output,
+    {{index_type}} num_elements
+) {
+  const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx * N_ELEMENTS_PER_THREAD >= num_elements) {
+    return;
+  }
+
+  {{read_type}} tmp;
+  {{data_type}}* p = reinterpret_cast<{{data_type}}*>(&tmp);
+
+  #pragma unroll
+  for (int i=0; i < N_ELEMENTS_PER_THREAD; i++) {
+      p[i] = ({{data_type}}) ({{fill_value}});
+  }
+
+  output[idx] = tmp;
+}
+
+}  // namespace
+
+void invoke_{{func_name}}(
+    void* output,
+    {{prefix}}Stream_t stream
+){
+    int grid_size = static_cast<int>(std::ceil(static_cast<double>({{num_elements}}) / N_ELEMENTS_PER_THREAD / N_THREADS_PER_BLOCK));
+    full<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(reinterpret_cast<{{read_type}}*> (output), {{num_elements}});
+}
+    """
+)
+
+
+@registry.reg("rocm.full.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    y = func_attrs["outputs"][0]
+    backend_spec = ROCMSpec()
+
+    # fill the maximum output Tensor size with the fill_value
+    # any shape within the maximum bounds will be a subset
+    num_elements = 1
+    for dim in y.shape():
+        num_elements *= dim.upper_bound()
+
+    dtype = y.dtype()
+    data_type = backend_spec.dtype_to_backend_type(dtype)
+    read_type = backend_spec.get_elementwise_read_backend_type(num_elements, dtype)
+
+    return FUNC_TEMPLATE.render(
+        header_files=backend_spec.header_src_template.render(
+            extra_header=HIP_HEADER_FILES
+        ),
+        constant=CONSTANT_TEMPLATE.render(
+            read_t=read_type,
+            data_t=data_type,
+        ),
+        func_name=func_attrs["name"],
+        read_type=read_type,
+        data_type=data_type,
+        index_type=backend_spec.index_type,
+        fill_value=func_attrs["fill_value"],
+        num_elements=num_elements,
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("rocm.full.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = ROCMSpec()
+    return FUNC_DECL.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+    )
+
+
+@registry.reg("rocm.full.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=func_attrs["outputs"][0]._attrs["name"],
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/rocm/tensor/identity.py b/python/aitemplate/backend/rocm/tensor/identity.py
index 9bbab569a..d5d59b4cf 100644
--- a/python/aitemplate/backend/rocm/tensor/identity.py
+++ b/python/aitemplate/backend/rocm/tensor/identity.py
@@ -16,11 +16,20 @@
 ROCM identity function
 """
 
+import jinja2
+
 from aitemplate.backend import registry
 from aitemplate.backend.backend_spec import ROCMSpec
 from aitemplate.backend.common.tensor import identity_common
 
 
+EXTRA_HEADERS = jinja2.Template(
+    """
+#include <hip/hip_runtime.h>
+    """
+)
+
+
 @registry.reg("rocm.identity.func_decl")
 def gen_function_decl(func_attrs):
     """Generate function declaration.
@@ -53,7 +62,11 @@ def gen_function(func_attrs):
     str
         Rendered function body.
     """
-    return identity_common.gen_function(func_attrs=func_attrs, backend_spec=ROCMSpec())
+    return identity_common.gen_function(
+        func_attrs=func_attrs,
+        backend_spec=ROCMSpec(),
+        extra_headers=EXTRA_HEADERS.render(),
+    )
 
 
 @registry.reg("rocm.identity.func_call")
diff --git a/python/aitemplate/backend/rocm/tensor/repeat.h b/python/aitemplate/backend/rocm/tensor/repeat.h
new file mode 100644
index 000000000..1bebc0642
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/repeat.h
@@ -0,0 +1,188 @@
+/**
+
+  Copyright (c) Meta Platforms, Inc. and affiliates.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+-
+
+Functions for repeating parts of a ROCM source tensor onto itself
+or into a target tensor.
+
+Used by expand_static_shape.py ( expand operator )
+
+*/
+/**
+ * ROCM Kernel to copy elements repeatedly from a source memory
+ * region to a target memory region.
+ */
+__global__ void repeat_head_kernel(
+    const int64_t* const src, ///< source memory region. Must be 8-byte aligned
+    int64_t* data,
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies) ///< How many times to repeat it all into data
+{
+  extern __shared__ int64_t
+      shared[]; // preallocated to blockDim.x elements, typically 32
+  const size_t stride_y = blockDim.y * gridDim.y;
+  const size_t stride_x = blockDim.x * gridDim.x;
+
+  // outer grid-stride loop
+  for (size_t ri = blockDim.x * blockIdx.x + threadIdx.x;
+       ri < head_mem_num_elements;
+       ri += stride_x) {
+    // read only with one thread per y dim
+    if (threadIdx.y == 0) {
+      shared[threadIdx.x] = src[ri];
+    }
+    __syncthreads(); // wait for shared memory to be populated
+    // inner grid-stride loop, write with all threads out of shared memory
+    size_t wi = threadIdx.y + blockDim.y * blockIdx.y;
+    for (; wi < num_repeat_copies; wi += stride_y) {
+      // Note that this ensures coalesced writes, due to consecutive write
+      // accesses of threads in a Warp
+      data[ri + head_mem_num_elements * wi] = shared[threadIdx.x];
+    }
+  }
+}
+
+/**
+ * Copy an 8-byte aligned memory region, which has a byte size that is a
+ * multiple of 8 into an 8-byte aligned target memory region efficiently. Calls
+ * into repeat_head_kernel ( see above )
+ *
+ **/
+__host__ hipError_t cuda_repeat_head_vectorized(
+    const int64_t* const src, ///< Source memory region. Must be 8-byte aligned
+    int64_t*
+        data, /**< target memory region. Must be 8-byte aligned and have space
+              for head_mem_num_elements*num_repeat_copies int64_t elements. */
+    size_t head_mem_num_elements, /**< How many 8 byte-sized elements to copy
+                                     from src */
+    size_t num_repeat_copies, ///< How many times to repeat it all into data
+    hipStream_t stream ///< ROCM stream
+) {
+  size_t threads_x = 64;
+  size_t threads_y = 1024 / threads_x;
+  size_t blocks_x = INT_CEIL_DIV(head_mem_num_elements, threads_x);
+  size_t blocks_y = INT_CEIL_DIV(num_repeat_copies, threads_y);
+  size_t serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks if necessary, so we do not exceed available shared
+  // memory
+  blocks_y = INT_CEIL_DIV(
+      blocks_y, serialization_level); // reduce thread count in y dimension
+                                      // first, e.g. sequentialized writes
+  serialization_level =
+      INT_CEIL_DIV(threads_x * sizeof(int64_t) * blocks_x * blocks_y, SHM_MAX);
+  // reduce number of blocks in x direction if this is not sufficient yet
+  blocks_x = INT_CEIL_DIV(blocks_x, serialization_level);
+  dim3 dimGrid(blocks_x, blocks_y);
+  dim3 dimBlock(threads_x, threads_y);
+  repeat_head_kernel<<<
+      dimGrid,
+      dimBlock,
+      threads_x * sizeof(int64_t),
+      stream>>>(src, data, head_mem_num_elements, num_repeat_copies);
+  return hipPeekAtLastError();
+}
+
+/**
+ * Repeatedly copy the beginning (head) section of a memory region an additonal
+ * num_repeat_copies times nto the memory region directly following that head,
+ * such that the end result will have this head data
+ * repeated 1+num_repeat_copies
+ */
+__host__ hipError_t cuda_repeat_head(
+    void* data, ///< pointer to ROCM memory of size (at least)
+                ///< head_mem_bytes*(num_repeat_copies+1)
+    const size_t head_mem_bytes, ///< How many bytes to repeat
+    size_t num_repeat_copies, ///< How many times to repeat it (in addition to
+                              ///< the existing head data)
+    hipStream_t stream ///< ROCM Stream to use
+) {
+  hipError_t res = hipSuccess;
+  if (num_repeat_copies == 0)
+    return res;
+  if ((head_mem_bytes % 8) == 0) {
+    // no need to double memory any further if it is 64-bit aligned
+    res = cuda_repeat_head_vectorized(
+        static_cast<const int64_t* const>(data),
+        static_cast<int64_t*>(data) + (head_mem_bytes / 8),
+        head_mem_bytes / 8,
+        num_repeat_copies,
+        stream);
+    if (res != hipSuccess) {
+      return res;
+    }
+  } else {
+    res = hipMemcpyAsync(
+        static_cast<void*>(static_cast<uint8_t*>(data) + head_mem_bytes),
+        data,
+        head_mem_bytes,
+        hipMemcpyDeviceToDevice,
+        stream);
+    if (res != hipSuccess) {
+      return res;
+    }
+    if (num_repeat_copies >= 2) {
+      // recurse
+      // we have already repeated 1 time, therefore the (num_repeat_copies-1)
+      res = cuda_repeat_head(
+          data, head_mem_bytes * 2, (num_repeat_copies - 1) / 2, stream);
+      if (res != hipSuccess) {
+        return res;
+      }
+      // deal with possible remainder
+      if (((num_repeat_copies - 1) % 2) == 1) {
+        res = hipMemcpyAsync(
+            static_cast<void*>(
+                static_cast<uint8_t*>(data) +
+                num_repeat_copies * head_mem_bytes),
+            data,
+            head_mem_bytes,
+            hipMemcpyDeviceToDevice,
+            stream);
+      }
+    }
+  }
+  return res;
+}
+
+/**
+ * Repeatedly copy a source memory region into a target memory region
+ * such that the end result will have the source data
+ * repeated num_repeat_copies
+ */
+__host__ hipError_t cuda_repeat_src(
+    const void* const src, ///< Source memory region (readonly)
+    void* data, ///< Destination memory region (read/write, size of at least
+                ///< num_repeat_copies*head_mem_bytes)
+    const size_t head_mem_bytes, ///< Size of source region to copy
+    size_t num_repeat_copies, ///< How many times to copy the data from source
+                              ///< into data
+    hipStream_t stream ///< ROCM stream to use
+) {
+  hipError_t res = hipSuccess;
+  if (num_repeat_copies == 0) {
+    return res;
+  }
+
+  res = hipMemcpyAsync(
+      data, src, head_mem_bytes, hipMemcpyDeviceToDevice, stream);
+  if ((res != hipSuccess) || (num_repeat_copies == 1)) {
+    return res;
+  }
+  return cuda_repeat_head(data, head_mem_bytes, num_repeat_copies - 1, stream);
+}

From e3a238858308dbd57a0ed1a5be46656fcbb219ae Mon Sep 17 00:00:00 2001
From: Henry Hu <hhh@meta.com>
Date: Mon, 26 Jun 2023 16:34:24 -0700
Subject: [PATCH 610/638] Support List[Tensor] arg for from_two_input_lists
 (#792)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/792

Reviewed By: terrychenism, Yinan-Zhao

Differential Revision: D46828343

fbshipit-source-id: a190e2b39ee3fc8927560b6faf6d980c2c4b6a55
---
 fx2ait/fx2ait/tensor_spec.py           |  9 +++++++--
 fx2ait/fx2ait/test/test_tensor_spec.py | 17 ++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/fx2ait/fx2ait/tensor_spec.py b/fx2ait/fx2ait/tensor_spec.py
index eb0f28a69..8922c1611 100644
--- a/fx2ait/fx2ait/tensor_spec.py
+++ b/fx2ait/fx2ait/tensor_spec.py
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 import logging
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Set, Union
 
 import torch
 from aitemplate.compiler.public import IntImm, IntVar
@@ -48,7 +48,9 @@ def __repr__(self) -> str:
 
     @classmethod
     def from_two_input_lists(
-        cls, inputs1: List[torch.Tensor], inputs2: List[torch.Tensor]
+        cls,
+        inputs1: List[Union[torch.Tensor, List[torch.Tensor]]],
+        inputs2: List[Union[torch.Tensor, List[torch.Tensor]]],
     ) -> List["TensorSpec"]:
         """
         This function is useful when we expect multiple dynamic dims.
@@ -73,6 +75,9 @@ def from_two_input_lists(
         result: List[TensorSpec] = []
 
         for t1, t2 in zip(inputs1, inputs2):
+            if isinstance(t1, list):
+                result.append(cls.from_two_input_lists(t1, t2))
+                continue
             if t1.dtype != t2.dtype:
                 raise ValueError(f"Different types: {t1.dtype} vs {t2.dtype}")
             if len(t1.shape) != len(t2.shape):
diff --git a/fx2ait/fx2ait/test/test_tensor_spec.py b/fx2ait/fx2ait/test/test_tensor_spec.py
index c2f32c909..4ab33f218 100644
--- a/fx2ait/fx2ait/test/test_tensor_spec.py
+++ b/fx2ait/fx2ait/test/test_tensor_spec.py
@@ -25,17 +25,24 @@ def test_two_input_lists(self):
         inputs1 = [
             torch.empty([1, 3, 4], dtype=torch.float16),
             torch.empty([5, 6], dtype=torch.int32),
-            torch.empty([7, 128, 9], dtype=torch.float16),
+            [
+                torch.empty([7, 128, 9], dtype=torch.float16),
+                torch.empty([1, 16], dtype=torch.float16),
+            ],
         ]
         inputs2 = [
             torch.empty([32, 3, 4], dtype=torch.float16),
             torch.empty([5, 6], dtype=torch.int32),
-            torch.empty([7, 1, 9], dtype=torch.float16),
+            [
+                torch.empty([7, 1, 9], dtype=torch.float16),
+                torch.empty([32, 16], dtype=torch.float16),
+            ],
         ]
 
         specs = TensorSpec.from_two_input_lists(inputs1, inputs2)
 
         self.assertEqual(3, len(specs))
+        self.assertEqual(2, len(specs[2]))
         self.assertEqual(
             TensorSpec(
                 [IntVar([1, 32], "dynamic_dim_0"), IntImm(3), IntImm(4)], torch.float16
@@ -47,7 +54,11 @@ def test_two_input_lists(self):
             TensorSpec(
                 [IntImm(7), IntVar([1, 128], "dynamic_dim_1"), IntImm(9)], torch.float16
             ),
-            specs[2],
+            specs[2][0],
+        )
+        self.assertEqual(
+            TensorSpec([IntVar([1, 32], "dynamic_dim_0"), IntImm(16)], torch.float16),
+            specs[2][1],
         )
 
     @parameterized.expand(

From 3c3b25613e457c9f6eea0d518b9a8a3deee509d4 Mon Sep 17 00:00:00 2001
From: Albert Chen <albertchen@meta.com>
Date: Mon, 26 Jun 2023 19:00:19 -0700
Subject: [PATCH 611/638] Adding support for relational operations (#783)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/783

As titled, this diffs adds supports for relational operations (e.g. ge, le, gt, lt, eq, ne). The expected behavior is to match equivalent operators in Torch, e.g. https://pytorch.org/docs/stable/generated/torch.ge.html

There are multiple constraints:
(1) Type promotions are not supported.
(2) Broadcast is not supported yet, so it expects tensor of the same shape - this maybe fixed in future iterations
(3) It uses the relational operators, i.e, >=, <, > rather than comparison function that supports different level of precision, e.g. https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__COMPARISON.html   - this maybe fixed in future iterations

Reviewed By: aakhundov

Differential Revision: D46770449

fbshipit-source-id: ed58c8ff0148b389dc80f76fe6a4413f21c36a2f
---
 python/aitemplate/backend/backend_spec.py     |  13 +-
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../backend/cuda/tensor/relational.py         | 208 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/relational.py         | 120 ++++++++++
 tests/unittest/ops/test_relational.py         | 170 ++++++++++++++
 6 files changed, 513 insertions(+), 1 deletion(-)
 create mode 100644 python/aitemplate/backend/cuda/tensor/relational.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/relational.py
 create mode 100644 tests/unittest/ops/test_relational.py

diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py
index 2b6771658..6e9f700ae 100644
--- a/python/aitemplate/backend/backend_spec.py
+++ b/python/aitemplate/backend/backend_spec.py
@@ -53,6 +53,7 @@ class GPUBackendSpec(BackendSpec):
 
     dtype_to_backend_dtype: Dict[str, str] = field(
         default_factory=lambda: {
+            "bool": "bool",
             "float16": "half",
             "bfloat16": "bfloat16",
             "float32": "float",
@@ -66,6 +67,7 @@ class GPUBackendSpec(BackendSpec):
     # find the size in bytes of a given backend type
     sizeof_types: Dict[str, int] = field(
         default_factory=lambda: {
+            "bool": 1,
             "uint8_t": 1,
             "half": 2,
             "bfloat16": 2,
@@ -74,6 +76,10 @@ class GPUBackendSpec(BackendSpec):
             "int32_t": 4,
             "float": 4,
             "bool": 1,
+            "uint4": 16,
+            "uint2": 8,
+            "uint": 4,
+            "bfloat16": 2,
         }
     )
 
@@ -349,7 +355,7 @@ def get_elementwise_read_backend_type(
         For example, if we're dealing with fp16 and num_elements is divisible by 8,
         we can use uint4.
         """
-        if dtype in ("float", "float32"):
+        if dtype in ("float", "float32", "int32"):
             num_elems_to_backend_type = ((4, "uint4"), (2, "uint2"), (1, "float"))
 
         elif dtype == "float16":
@@ -366,6 +372,11 @@ def get_elementwise_read_backend_type(
                 (2, "uint"),
                 (1, "bfloat16"),
             )
+        elif dtype == "int64":
+            num_elems_to_backend_type = (
+                (2, "uint4"),
+                (1, "uint2"),
+            )
         else:
             raise NotImplementedError("Unsupported dtype {}!".format(dtype))
 
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index 08dcceaee..b017365e3 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -34,6 +34,7 @@
     permute0213,
     permute102,
     permute210,
+    relational,
     slice_reshape_scatter,
     slice_scatter,
     split,
@@ -50,6 +51,7 @@
     "expand",
     "full",
     "gather",
+    "relational",
     "identity",
     "jagged_to_padded_dense",
     "masked_select",
diff --git a/python/aitemplate/backend/cuda/tensor/relational.py b/python/aitemplate/backend/cuda/tensor/relational.py
new file mode 100644
index 000000000..db2f6e89e
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/relational.py
@@ -0,0 +1,208 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import gen_int_var_product_str
+from aitemplate.utils import shape_utils
+
+
+CUDA_HEADER_FILES = """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+"""
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+#define N_READS_PER_THREAD sizeof({{output_read_t}}) / sizeof(bool)
+    """
+)
+
+
+FUNC_DECL = jinja2.Template(
+    """
+
+void invoke_{{func_name}}(
+    void*,  /* output */
+    const void*,  /* left operand */
+{% if not right_operand.is_a_const_num() %}
+    const void*,   /* right operand */
+{% endif %}
+    {{index_type}}, /* number of elements */
+    {{prefix}}Stream_t  /* stream */
+);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}{{index_type}} n_elements = {{calculate_n}};
+    {{indent}} invoke_{{func_name}}(
+    {{indent}}    {{output}},
+    {{indent}}    {{left_operand_name}},
+{% if not right_operand.is_a_const_num() %}
+    {{indent}}    {{right_operand_name}},
+{% endif %}
+    {{indent}}    n_elements,
+    {{indent}}    stream
+    {{indent}});
+{{indent}}}
+    """
+)
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+__global__ void relational(
+    {{output_read_t}}* output,
+    const {{input_read_t}}* left_operand,
+{% if not right_operand.is_a_const_num() %}
+    const {{input_read_t}}* right_operand,
+{% endif %}
+    {{index_type}} num_elements) {
+
+    const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx * N_READS_PER_THREAD >= num_elements) {
+        return;
+    }
+
+    {{input_read_t}} tmp_left = left_operand[idx];
+    {{data_type}}* tmp_left_ptr = reinterpret_cast<{{data_type}}*>(&tmp_left);
+
+    {{output_read_t}} tmp_output;
+    bool* tmp_output_ptr = reinterpret_cast<bool*>(&tmp_output);
+
+{% if not right_operand.is_a_const_num() %}
+    {{input_read_t}} tmp_right = right_operand[idx];
+    {{data_type}}* tmp_right_ptr = reinterpret_cast<{{data_type}}*>(&tmp_right);
+{% endif %}
+
+  #pragma unroll
+    for (int i=0; i < N_READS_PER_THREAD; i++) {
+
+{% if not right_operand.is_a_const_num() %}
+        tmp_output_ptr[i] = (tmp_left_ptr[i] {{operator}} tmp_right_ptr[i]);
+{% else %}
+        tmp_output_ptr[i] = (tmp_left_ptr[i] {{operator}} ({{data_type}})({{right_operand_value}}));
+{% endif %}
+  }
+    output[idx] = tmp_output;
+}
+
+} // namespace
+
+void invoke_{{func_name}}(
+    void* output,
+    const void* input_1,
+{% if not right_operand.is_a_const_num() %}
+    const void* input_2,
+{% endif %}
+    {{index_type}} num_elements,
+    {{prefix}}Stream_t stream) {
+
+  int grid_size = static_cast<int>(
+      std::ceil(static_cast<double>(num_elements) / N_THREADS_PER_BLOCK / N_READS_PER_THREAD));
+
+  relational<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(
+      reinterpret_cast<{{output_read_t}}*>(output),
+      reinterpret_cast<const {{input_read_t}}*>(input_1),
+{% if not right_operand.is_a_const_num() %}
+      reinterpret_cast<const {{input_read_t}}*>(input_2),
+{% endif %}
+      num_elements);
+}
+    """
+)
+
+
+@registry.reg("cuda.relational.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    inputs = func_attrs["inputs"]
+    backend_spec = CUDASpec()
+
+    input_dtype = inputs[0].dtype()
+    input_read_t = backend_spec.get_elementwise_read_backend_type(
+        shape_utils.get_num_rightmost_static_elements(inputs[0].shape()), input_dtype
+    )
+    input_data_t = backend_spec.dtype_to_backend_type(input_dtype)
+    read_vector_length = (
+        backend_spec.sizeof_types[input_read_t]
+        / backend_spec.sizeof_types[input_data_t]
+    )
+    # output data type is bool, which is 1 byte
+    output_read_t = {
+        1: "bool",
+        2: "half",
+        4: "float",
+        8: "int2",
+        16: "int4",
+    }[read_vector_length]
+
+    return FUNC_TEMPLATE.render(
+        header_files=backend_spec.header_src_template.render(
+            extra_header=CUDA_HEADER_FILES
+        ),
+        constant=CONSTANT_TEMPLATE.render(output_read_t=output_read_t),
+        func_name=func_attrs["name"],
+        data_type=input_data_t,
+        index_type=backend_spec.index_type,
+        operator=func_attrs["func"].value,
+        prefix=backend_spec.prefix,
+        right_operand=func_attrs["args"][1],
+        right_operand_value=str(func_attrs["args"][1]._attrs["value"]),
+        output_read_t=output_read_t,
+        input_read_t=input_read_t,
+    )
+
+
+@registry.reg("cuda.relational.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    backend_spec = CUDASpec()
+    return FUNC_DECL.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        right_operand=func_attrs["args"][1],
+    )
+
+
+@registry.reg("cuda.relational.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    y = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+    args = func_attrs["args"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=y._attrs["name"],
+        left_operand_name=args[0]._attrs["name"],
+        right_operand_name=args[1]._attrs["name"],
+        right_operand=args[1],
+        calculate_n=gen_int_var_product_str(y._attrs["shape"]),
+        indent=indent,
+        index_type=backend_spec.index_type,
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 265f3293d..37568d781 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -35,6 +35,7 @@
 from aitemplate.compiler.ops.tensor.permute0213 import permute0213
 from aitemplate.compiler.ops.tensor.permute102 import permute102
 from aitemplate.compiler.ops.tensor.permute210 import permute210
+from aitemplate.compiler.ops.tensor.relational import eq, ge, gt, le, lt, ne
 from aitemplate.compiler.ops.tensor.size import size
 from aitemplate.compiler.ops.tensor.slice_reshape_scatter import slice_reshape_scatter
 from aitemplate.compiler.ops.tensor.slice_scatter import slice_scatter
diff --git a/python/aitemplate/compiler/ops/tensor/relational.py b/python/aitemplate/compiler/ops/tensor/relational.py
new file mode 100644
index 000000000..699cdefee
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/relational.py
@@ -0,0 +1,120 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from enum import Enum
+
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
+
+
+class RelationalEnum(Enum):
+    GE = ">="
+    LE = "<="
+    LT = "<"
+    GT = ">"
+    EQ = "=="
+    NE = "!="
+
+
+class relational(Operator):
+    """
+    Relational operator that supports comparing a tensor to another tensor or a constant
+
+    Parameters:
+        left (Tensor): the tensor to compare
+
+        right (Tensor or float): the tensor or value to compare
+
+    Returns:
+        Tensor: a tensor of bool
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["op"] = "relational"
+
+    def __call__(self, left: Tensor, right: Tensor) -> Tensor:
+        assert self._attrs["func"] is not None, "No function registered"
+        common_dtype = None
+        assert isinstance(
+            left, Tensor
+        ), "Relational expects left operand to be a Tensor"
+        common_dtype = normalize_dtype(left.dtype())
+        left._attrs["dtype"] = common_dtype
+
+        if isinstance(right, int) or isinstance(right, float):
+            right = Tensor(shape=[], value=right, dtype=common_dtype)
+        else:
+            assert isinstance(
+                right, Tensor
+            ), "Relational expects right operand to be a Tensor or constant"
+            assert (
+                normalize_dtype(right.dtype()) == common_dtype
+            ), f"Type promotions are not supported; got dtype {left.dtype()}, but expected {common_dtype}"
+            assert (
+                left.shape() == right.shape()
+            ), "Relational does not support broadcasting yet. It expects tensor of same shape."
+            right._attrs["dtype"] = common_dtype
+
+        self._attrs["args"] = [left, right]
+        self._attrs["inputs"] = [left] if right.is_a_const_num() else [left, right]
+        self._set_depth()
+        output = Tensor(left.shape(), src_ops=[self], dtype="bool")
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = f"{target.name()}.{self._attrs['op']}.gen_function"
+        func = registry.get(func_key)
+        return func(self._attrs)
+
+
+class ge(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.GE
+
+
+class le(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.LE
+
+
+class gt(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.GT
+
+
+class lt(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.LT
+
+
+class eq(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.EQ
+
+
+class ne(relational):
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["func"] = RelationalEnum.NE
diff --git a/tests/unittest/ops/test_relational.py b/tests/unittest/ops/test_relational.py
new file mode 100644
index 000000000..e7e6ee724
--- /dev/null
+++ b/tests/unittest/ops/test_relational.py
@@ -0,0 +1,170 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.public import FuncEnum
+from aitemplate.frontend import IntVar, Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    gen_input_tensor,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    TestEnv,
+)
+from parameterized import param, parameterized
+
+ait_to_torch_map = {
+    ops.ge: torch.ge,
+    ops.le: torch.le,
+    ops.gt: torch.gt,
+    ops.lt: torch.lt,
+    ops.eq: torch.eq,
+    ops.ne: torch.ne,
+}
+
+
+def get_test_cases(dtype: str):
+    return [
+        param(ops.le, "le", dtype, 3),
+        param(ops.lt, "lt", dtype, 4),
+    ]
+
+
+class TestRelational(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @parameterized.expand(
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [
+                    param(ops.le, "le", "float16", 3),
+                    param(ops.le, "lt", "float16", 3),
+                ],
+                TestEnv.CUDA_SM80: [
+                    param(ops.le, "le", "bfloat16", 3),
+                    param(ops.le, "lt", "bfloat16", 3),
+                    param(ops.gt, "gt", "float32", 8),
+                    param(ops.ne, "ne", "float", 1),
+                    param(ops.eq, "eq", "float", 16),
+                ],
+                TestEnv.ROCM: [
+                    param(ops.le, "le", "float16", 3),
+                    param(ops.le, "lt", "float16", 3),
+                ],
+            }
+        )
+    )
+    def test_end_to_end(
+        self, operator: type, test_name: str, dtype: str, M: int
+    ) -> None:
+        dim = IntVar([2, 3, 128])
+        X1 = gen_input_tensor([dim, M], name="X1", dtype=dtype)
+        X2 = gen_input_tensor([dim, M], name="X2", dtype=dtype)
+        add = ops.elementwise(FuncEnum.ADD)(X1, X2)
+        X3 = gen_input_tensor([dim, M], name="X3", dtype=dtype)
+        Y = operator()(add, X3)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", f"test_relational_{test_name}")
+
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            x2_pt = get_random_torch_tensor([batch, M], dtype)
+            x3_pt = get_random_torch_tensor([batch, M], dtype)
+            add_pt = x1_pt + x2_pt
+            y_pt = ait_to_torch_map[operator](add_pt, x3_pt)
+            y = get_torch_empty_tensor(y_pt.size(), dtype="bool")
+            inputs = {"X1": x1_pt, "X2": x2_pt, "X3": x3_pt}
+            model.run_with_tensors(inputs, [y])
+            self.assertEqual(y_pt.tolist(), y.tolist())
+
+    def test_unsupport_type_promotion(self) -> None:
+        dim = IntVar([1, 128])
+        X1 = Tensor([dim, 10], name="X1", is_input=True, dtype="float16")
+        X2 = Tensor([dim, 10], name="X2", is_input=True, dtype="float32")
+        with self.assertRaisesRegex(
+            AssertionError, "Type promotions are not supported"
+        ):
+            ops.ge()(X1, X2)
+
+    def test_unsupport_different_shapes(self) -> None:
+        X1 = Tensor([IntVar([1, 128]), 10], name="X1", is_input=True, dtype="float16")
+        X2 = Tensor([IntVar([10, 128]), 10], name="X2", is_input=True, dtype="float16")
+        with self.assertRaisesRegex(
+            AssertionError,
+            "Relational does not support broadcasting yet. It expects tensor of same shape",
+        ):
+            ops.ge()(X1, X2)
+
+    def test_constant(self) -> None:
+        X1 = Tensor([IntVar([1, 128]), 10], name="X1", is_input=True, dtype="float16")
+        X2 = 2
+        Y = ops.ge()(X1, X2)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", "test_relational_test_constant")
+
+        x1_pt = get_random_torch_tensor([128, 10], dtype="float16")
+        inputs = {"X1": x1_pt}
+        y_pt = ait_to_torch_map[ops.ge](x1_pt, 2)
+        y = get_torch_empty_tensor(y_pt.size(), dtype="bool")
+        model.run_with_tensors(inputs, [y])
+        self.assertEqual(y_pt.tolist(), y.tolist())
+
+    @parameterized.expand(
+        [
+            param("int32", 3),
+            param("int32", 2),
+            param("int64", 3),
+            param("int64", 2),
+        ]
+    )
+    def test_int_support(self, dtype: str, M: int) -> None:
+        dim = IntVar([2, 3, 128])
+        X1 = gen_input_tensor([dim, M], name="X1", dtype=dtype)
+        X2 = gen_input_tensor([dim, M], name="X2", dtype=dtype)
+        Y = ops.ge()(X1, X2)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", f"test_relational_int_{dtype}_{M}")
+
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], "float32").to(
+                torch.int32 if dtype == "int32" else torch.int64
+            )
+            x2_pt = get_random_torch_tensor([batch, M], "float32").to(
+                torch.int32 if dtype == "int32" else torch.int64
+            )
+            y_pt = ait_to_torch_map[ops.ge](x1_pt, x2_pt)
+            y = get_torch_empty_tensor(y_pt.size(), dtype="bool")
+            inputs = {"X1": x1_pt, "X2": x2_pt}
+            model.run_with_tensors(inputs, [y])
+            self.assertEqual(y_pt.tolist(), y.tolist())
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From 5e304943b11575728d2c6249ff45d16d0761465a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangche@meta.com>
Date: Tue, 27 Jun 2023 00:06:32 -0700
Subject: [PATCH 612/638] skip fusing slice with a strided op if the relevant
 tensor accessors are already set (#799)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/799

In some cases, when we fuse a slice op with a strided op (e.g. concat), the
relevant tensor accessors in the strided op's input tensors may have already
be updated. Let's skip fusion in such a case.

Note that technically, we might be able to perform fusion in some senarios,
but let's handle those later if we see needs.

Reviewed By: qxy11

Differential Revision: D47028458

fbshipit-source-id: a0648c6854db9f69a83fefa698d70eb17b272f70
---
 .../compiler/ops/tensor/slice_scatter.py      |  5 ++
 .../transform/transform_strided_slice.py      |  2 +
 .../unittest/compiler/test_fuse_split_cat.py  | 59 +++++++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/python/aitemplate/compiler/ops/tensor/slice_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
index 52ab2d21a..1c5cf5393 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
@@ -35,6 +35,11 @@ class slice_scatter(Operator):
     def is_valid(cat_op: Operator) -> bool:
         if cat_op._attrs["op"] != "concatenate":
             return False
+        if any(
+            input_accessor.stride_dim is not None
+            for input_accessor in cat_op._attrs["input_accessors"]
+        ):
+            return False
         return all(
             x._attrs["src_ops"] is not None
             and len(x._attrs["src_ops"]) == 1
diff --git a/python/aitemplate/compiler/transform/transform_strided_slice.py b/python/aitemplate/compiler/transform/transform_strided_slice.py
index ee556b819..72397c2b8 100644
--- a/python/aitemplate/compiler/transform/transform_strided_slice.py
+++ b/python/aitemplate/compiler/transform/transform_strided_slice.py
@@ -245,6 +245,8 @@ def _process_one_slice_dst(
         if input_tensor is not slice_output_tensor:
             continue
         input_accessors = strided_op._attrs["input_accessors"]
+        if input_accessors[idx].stride_dim is not None:
+            return False
 
         if any(strided_op_name.startswith(n) for n in ("gemm", "group_gemm", "bmm")):
             if not transform_strided_ops_utils.gemm_stride_checker(
diff --git a/tests/unittest/compiler/test_fuse_split_cat.py b/tests/unittest/compiler/test_fuse_split_cat.py
index 9c22a4598..5c5568d2d 100644
--- a/tests/unittest/compiler/test_fuse_split_cat.py
+++ b/tests/unittest/compiler/test_fuse_split_cat.py
@@ -256,6 +256,65 @@ def test_fuse_split_cat_bmm(self):
         model.run_with_tensors({"input0": a, "input1": b}, [y])
         self.assertTrue(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2))
 
+    def test_fuse_split_cat_interleaved(self):
+        # make a graph like below:
+        # slice_0 = slice(x0)
+        # slice_1 = slice(x0)
+        # split_0_0, split_0_1 = split(slice_0, [10, 10], 1)
+        # split_1_0, split_1_1 = split(slice_1, [10, 10], 1)
+        # y = cat([split_0_0, split_1_0, split_0_1, split_1_1], 1)
+
+        dtype = "float16"
+        M = IntImm(20)
+        N = IntImm(60)
+
+        X0 = Tensor(
+            shape=[M, N],
+            name="x0",
+            is_input=True,
+        )
+        slice_start_indices_0 = [0, 0]
+        slice_end_indices_0 = [None, 20]
+        dynamic_slice_0 = ops.dynamic_slice()(
+            X0, slice_start_indices_0, slice_end_indices_0
+        )
+        slice_start_indices_1 = [0, 20]
+        slice_end_indices_1 = [None, 40]
+        dynamic_slice_1 = ops.dynamic_slice()(
+            X0, slice_start_indices_1, slice_end_indices_1
+        )
+        split_0_0, split_0_1 = ops.split()(dynamic_slice_0, [10, 10], 1)
+        split_1_0, split_1_1 = ops.split()(dynamic_slice_1, [10, 10], 1)
+        Y = ops.concatenate()([split_0_0, split_1_0, split_0_1, split_1_1], 1)
+
+        # Set outputs
+        Y._attrs["name"] = "y"
+        Y._attrs["is_output"] = True
+        # Compile
+        model = compile_model(
+            Y, detect_target(), "./tmp", "test_fuse_split_cat_interleaved"
+        )
+        # Check that split was removed
+        self.assertFalse(graph_has_op(model.debug_sorted_graph, "split"))
+        # Run
+        x0_pt = get_random_torch_tensor((M.value(), N.value()), dtype=dtype)
+
+        # Compare
+        slice_indices_0 = [
+            slice(i, j) for i, j in zip(slice_start_indices_0, slice_end_indices_0)
+        ]
+        dynamic_slice_0_pt = x0_pt[slice_indices_0]
+        slice_indices_1 = [
+            slice(i, j) for i, j in zip(slice_start_indices_1, slice_end_indices_1)
+        ]
+        dynamic_slice_1_pt = x0_pt[slice_indices_1]
+        split_0_0_pt, split_0_1_pt = torch.split(dynamic_slice_0_pt, [10, 10], 1)
+        split_1_0_pt, split_1_1_pt = torch.split(dynamic_slice_1_pt, [10, 10], 1)
+        y_pt = torch.cat([split_0_0_pt, split_1_0_pt, split_0_1_pt, split_1_1_pt], 1)
+        y = torch.empty_like(y_pt)
+        model.run_with_tensors({"x0": x0_pt}, [y])
+        torch.testing.assert_close(y, y_pt, atol=0, rtol=0)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)

From 0cf1c2ea56df8d75aac0cc7051faad83d737753b Mon Sep 17 00:00:00 2001
From: Albert Chen <albertchen@meta.com>
Date: Tue, 27 Jun 2023 06:18:49 -0700
Subject: [PATCH 613/638] Add support for where operator (#791)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/791

As titled, this diffs adds supports for where operations. The expected behavior is to match equivalent operators in Torch, i.e. https://pytorch.org/docs/stable/generated/torch.where.html

Reviewed By: aakhundov

Differential Revision: D46957405

fbshipit-source-id: db4bdf4f2d91d154fb0c9ee092bf6429679b63db
---
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../aitemplate/backend/cuda/tensor/where.py   | 228 ++++++++++++++++++
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../aitemplate/compiler/ops/tensor/where.py   | 102 ++++++++
 tests/unittest/ops/test_where.py              | 221 +++++++++++++++++
 5 files changed, 554 insertions(+)
 create mode 100644 python/aitemplate/backend/cuda/tensor/where.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/where.py
 create mode 100644 tests/unittest/ops/test_where.py

diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index b017365e3..d93808023 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -39,6 +39,7 @@
     slice_scatter,
     split,
     topk,
+    where,
 )
 
 __all__ = [
@@ -65,4 +66,5 @@
     "slice_scatter",
     "split",
     "topk",
+    "where",
 ]
diff --git a/python/aitemplate/backend/cuda/tensor/where.py b/python/aitemplate/backend/cuda/tensor/where.py
new file mode 100644
index 000000000..dfc1acdcb
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/where.py
@@ -0,0 +1,228 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Any, Dict
+
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.common.elementwise_common import gen_int_var_product_str
+from aitemplate.utils import shape_utils
+
+
+CUDA_HEADER_FILES = """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+"""
+
+
+CONSTANT_TEMPLATE = jinja2.Template(
+    """
+#define N_THREADS_PER_BLOCK 256
+#define N_READS_PER_THREAD sizeof({{condition_read_t}}) / sizeof(bool)
+    """
+)
+
+
+FUNC_DECL = jinja2.Template(
+    """
+
+void invoke_{{func_name}}(
+    void*,  /* output */
+    const void*,  /* condition */
+{% if not input_tensor_is_a_const_num %}
+    const void*,  /* input tensor */
+{% endif %}
+{% if not other_tensor_is_a_const_num %}
+    const void*,   /* other tensor */
+{% endif %}
+    {{index_type}}, /* number of elements */
+    {{prefix}}Stream_t  /* stream */
+);
+    """
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+    {{indent}}{{index_type}} n_elements = {{calculate_n}};
+    {{indent}} invoke_{{func_name}}(
+    {{indent}}    {{output}},
+    {{indent}}    {{condition}},
+{% if not input_tensor_is_a_const_num %}
+    {{indent}}    {{input_tensor}},
+{% endif %}
+{% if not other_tensor_is_a_const_num %}
+    {{indent}}    {{other_tensor}},
+{% endif %}
+    {{indent}}    n_elements,
+    {{indent}}    stream
+    {{indent}});
+{{indent}}}
+    """
+)
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+namespace {
+
+{{constant}}
+
+
+__global__ void where(
+    {{read_t}}* output,
+    const {{condition_read_t}}* condition,
+{% if not input_tensor_is_a_const_num %}
+    const {{read_t}}* input_tesnor,
+{% endif %}
+{% if not other_tensor_is_a_const_num %}
+    const {{read_t}}* other_tensor,
+{% endif %}
+    {{index_type}} num_elements) {
+        const {{index_type}} idx = (blockIdx.x * blockDim.x + threadIdx.x);
+        if (idx * N_READS_PER_THREAD >= num_elements) {
+            return;
+        }
+
+        {{read_t}} tmp_output;
+        {{data_t}}* tmp_output_ptr = reinterpret_cast<{{data_t}}*>(&tmp_output);
+
+        {{condition_read_t}} tmp_condition = condition[idx];
+        bool* tmp_condition_ptr = reinterpret_cast<bool*>(&tmp_condition);
+
+{% if not input_tensor_is_a_const_num %}
+        {{read_t}} tmp_input_tensor = input_tesnor[idx];
+        {{data_t}}* tmp_input_tensor_ptr = reinterpret_cast<{{data_t}}*>(&tmp_input_tensor);
+{% endif %}
+
+{% if not other_tensor_is_a_const_num %}
+        {{read_t}} tmp_other_tensor = other_tensor[idx];
+        {{data_t}}* tmp_other_tensor_ptr = reinterpret_cast<{{data_t}}*>(&tmp_other_tensor);
+{% endif %}
+
+#pragma unroll
+        for (int i=0; i < N_READS_PER_THREAD; i++) {
+            tmp_output_ptr[i] = ({{data_t}})(tmp_condition_ptr[i]) * ({{data_t}})({{ input_tensor_val if input_tensor_is_a_const_num else "tmp_input_tensor_ptr[i]" }}) + ({{data_t}})(1 - tmp_condition_ptr[i]) * ({{data_t}})({{ other_tensor_val if other_tensor_is_a_const_num else "tmp_other_tensor_ptr[i]" }});
+        }
+        output[idx] = tmp_output;
+
+    }
+
+} // namespace
+
+void invoke_{{func_name}}(
+    void* output,
+    const void* condition,
+{% if not input_tensor_is_a_const_num %}
+    const void* input_tesnor,
+{% endif %}
+{% if not other_tensor_is_a_const_num %}
+    const void* other_tensor,
+{% endif %}
+    {{index_type}} num_elements,
+    {{prefix}}Stream_t stream) {
+
+  int grid_size = static_cast<int>(
+      std::ceil(static_cast<double>(num_elements) / N_THREADS_PER_BLOCK / N_READS_PER_THREAD));
+
+  where<<<grid_size, N_THREADS_PER_BLOCK, 0, stream>>>(
+      reinterpret_cast<{{read_t}}*>(output),
+      reinterpret_cast<const {{condition_read_t}}*>(condition),
+{% if not input_tensor_is_a_const_num %}
+      reinterpret_cast<const {{read_t}}*>(input_tesnor),
+{% endif %}
+{% if not other_tensor_is_a_const_num %}
+      reinterpret_cast<const {{read_t}}*>(other_tensor),
+{% endif %}
+      num_elements);
+}
+    """
+)
+
+
+@registry.reg("cuda.where.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    condition, input_tensor, other_tensor = func_attrs["args"]
+    output = func_attrs["outputs"][0]
+    dtype = output.dtype()
+    backend_spec = CUDASpec()
+    read_t = backend_spec.get_elementwise_read_backend_type(
+        shape_utils.get_num_rightmost_static_elements(output.shape()), dtype
+    )
+    data_t = backend_spec.dtype_to_backend_type(dtype)
+    read_vector_length = (
+        backend_spec.sizeof_types[read_t] / backend_spec.sizeof_types[data_t]
+    )
+    # condition data type is bool, which is 1 byte
+    condition_read_t = {
+        1: "bool",
+        2: "half",
+        4: "float",
+        8: "int2",
+        16: "int4",
+    }[read_vector_length]
+
+    return FUNC_TEMPLATE.render(
+        header_files=backend_spec.header_src_template.render(
+            extra_header=CUDA_HEADER_FILES
+        ),
+        constant=CONSTANT_TEMPLATE.render(condition_read_t=condition_read_t),
+        func_name=func_attrs["name"],
+        data_t=data_t,
+        read_t=read_t,
+        condition_read_t=condition_read_t,
+        index_type=backend_spec.index_type,
+        prefix=backend_spec.prefix,
+        input_tensor_is_a_const_num=input_tensor.is_a_const_num(),
+        other_tensor_is_a_const_num=other_tensor.is_a_const_num(),
+        input_tensor_val=str(input_tensor._attrs["value"]),
+        other_tensor_val=str(other_tensor._attrs["value"]),
+    )
+
+
+@registry.reg("cuda.where.func_decl")
+def gen_function_decl(func_attrs: Dict[str, Any]) -> str:
+    _, input_tensor, other_tensor = func_attrs["args"]
+    backend_spec = CUDASpec()
+    return FUNC_DECL.render(
+        func_name=func_attrs["name"],
+        prefix=backend_spec.prefix,
+        index_type=backend_spec.index_type,
+        input_tensor_is_a_const_num=input_tensor.is_a_const_num(),
+        other_tensor_is_a_const_num=other_tensor.is_a_const_num(),
+    )
+
+
+@registry.reg("cuda.where.func_call")
+def gen_function_call(func_attrs: Dict[str, Any], indent="  ") -> str:
+    condition, input_tensor, other_tensor = func_attrs["args"]
+    output = func_attrs["outputs"][0]
+    backend_spec = CUDASpec()
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output._attrs["name"],
+        condition=condition._attrs["name"],
+        input_tensor=input_tensor._attrs["name"],
+        other_tensor=other_tensor._attrs["name"],
+        calculate_n=gen_int_var_product_str(condition.shape()),
+        indent=indent,
+        index_type=backend_spec.index_type,
+        input_tensor_is_a_const_num=input_tensor.is_a_const_num(),
+        other_tensor_is_a_const_num=other_tensor.is_a_const_num(),
+    )
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index 37568d781..a45019ab8 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -42,3 +42,4 @@
 from aitemplate.compiler.ops.tensor.split import split
 from aitemplate.compiler.ops.tensor.topk import topk
 from aitemplate.compiler.ops.tensor.transpose import transpose
+from aitemplate.compiler.ops.tensor.where import where
diff --git a/python/aitemplate/compiler/ops/tensor/where.py b/python/aitemplate/compiler/ops/tensor/where.py
new file mode 100644
index 000000000..4be14790c
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/where.py
@@ -0,0 +1,102 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from aitemplate import backend
+from aitemplate.backend import registry
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.dtype import normalize_dtype
+
+
+class where(Operator):
+    """
+    Return a tensor of elements selected from either input or other, depending on condition.
+
+    Parameters:
+        condition (A bool Tensor): When True (nonzero), yield input, otherwise yield other
+
+        input_tensor (Tensor or Scalar): value (if input is a scalar) or values selected at indices where condition is True
+
+        other_tensor (Tensor or Scalar): value (if other is a scalar) or values selected at indices where condition is False
+
+        dtype: output dtype if both input_tensor and output_tensor is scalar
+    Returns:
+        Tensor: A tensor of shape equal to the shape of condition
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._attrs["op"] = "where"
+
+    def __call__(
+        self,
+        condition: Tensor,
+        input_tensor: Tensor,
+        other_tensor: Tensor,
+        dtype: str = "",
+    ) -> Tensor:
+        assert isinstance(
+            condition, Tensor
+        ), f"condition needs to be a tensor, but got {type(condition)}"
+        assert (
+            condition.dtype() == "bool"
+        ), f"condition needs to be a bool tensor, but got {condition.dtype()}"
+
+        output_shape = condition.shape()
+        args = []
+        inputs = []
+        common_dtype = None
+        for tensor in [input_tensor, other_tensor]:
+            if isinstance(tensor, int) or isinstance(tensor, float):
+                tensor = Tensor(shape=[], value=tensor, dtype=common_dtype)
+            else:
+                assert isinstance(
+                    tensor, Tensor
+                ), f"Unsupported data type: {type(tensor)}"
+                assert (
+                    tensor.shape() == output_shape
+                ), f"Tensor shape should be the same, {tensor.shape()} != {output_shape}"
+                if common_dtype is None:
+                    common_dtype = normalize_dtype(tensor.dtype())
+                else:
+                    assert common_dtype == normalize_dtype(
+                        tensor.dtype()
+                    ), f"Expect tensor of the same dtype, got {common_dtype} and {normalize_dtype(tensor.dtype())}"
+                inputs.append(tensor)
+
+            args.append(tensor)
+
+        # In case where both inputs are scalars,
+        if len(inputs) == 0:
+            assert dtype != "", "dtype needs to be provided for scalars"
+            common_dtype = normalize_dtype(dtype)
+            for arg in args:
+                arg._attrs["dtype"] = common_dtype
+        self._attrs["args"] = [condition, *args]
+        self._attrs["inputs"] = [condition, *inputs]
+        self._set_depth()
+        output = Tensor(
+            shape=output_shape,
+            src_ops={self},
+            dtype=common_dtype,
+        )
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = backend.target.Target.current()
+        func_key = f"{target.name()}.{self._attrs['op']}.gen_function"
+        func = registry.get(func_key)
+        return func(self._attrs)
diff --git a/tests/unittest/ops/test_where.py b/tests/unittest/ops/test_where.py
new file mode 100644
index 000000000..37774ffe1
--- /dev/null
+++ b/tests/unittest/ops/test_where.py
@@ -0,0 +1,221 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import IntVar
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    filter_test_cases_by_params,
+    gen_input_tensor,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    TestEnv,
+)
+from parameterized import param, parameterized
+
+
+class TestWhere(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._test_id = 0
+
+    def test_unsupport_condition_tensor_non_bool(self) -> None:
+        X1 = gen_input_tensor([4, 4], name="X1", dtype="float")
+        X2 = gen_input_tensor([4, 4], name="X2", dtype="float")
+        X3 = gen_input_tensor([4, 4], name="X3", dtype="float")
+        with self.assertRaisesRegex(
+            AssertionError, "condition needs to be a bool tensor"
+        ):
+            ops.where()(X1, X2, X3)
+
+    def test_unsupport_condition_tensor_constant(self) -> None:
+        X1 = 1
+        X2 = gen_input_tensor([4, 4], name="X2", dtype="float")
+        X3 = gen_input_tensor([4, 4], name="X3", dtype="float")
+        with self.assertRaisesRegex(AssertionError, "condition needs to be a tensor"):
+            ops.where()(X1, X2, X3)
+
+    def test_unsupport_different_condition_and_input_tensor_size(self) -> None:
+        dim = IntVar([2, 128])
+        X1 = gen_input_tensor([dim, 4], name="X1", dtype="bool")
+        X2 = gen_input_tensor([4, 4], name="X2", dtype="float")
+        X3 = gen_input_tensor([4, 4], name="X3", dtype="float")
+        with self.assertRaisesRegex(AssertionError, "Tensor shape should be the same"):
+            ops.where()(X1, X2, X3)
+
+    def test_unsupport_no_dtype_for_scalars(self) -> None:
+        dim = IntVar([2, 128])
+        X1 = gen_input_tensor([dim, 4], name="X1", dtype="bool")
+        X2 = 2
+        X3 = 2
+        with self.assertRaisesRegex(
+            AssertionError, "dtype needs to be provided for scalars"
+        ):
+            ops.where()(X1, X2, X3)
+
+    def test_unsupport_tensor_of_different_dtype(self) -> None:
+        X1 = gen_input_tensor([4, 4], name="X1", dtype="bool")
+        X2 = gen_input_tensor([4, 4], name="X2", dtype="float32")
+        X3 = gen_input_tensor([4, 4], name="X3", dtype="float64")
+        with self.assertRaisesRegex(AssertionError, "Expect tensor of the same dtype"):
+            ops.where()(X1, X2, X3)
+
+    def test_dtype_for_scalars(self) -> None:
+        dim = IntVar([2, 128])
+        X1 = gen_input_tensor([dim, 4], name="X1", dtype="bool")
+        X2 = 2
+        X3 = 2
+        Y = ops.where()(X1, X2, X3, dtype="float32")
+        self.assertEqual(Y.dtype(), "float32")
+
+    @parameterized.expand(
+        **filter_test_cases_by_params(
+            {
+                TestEnv.CUDA_LESS_THAN_SM80: [param("float16", 3), param("float16", 2)],
+                TestEnv.CUDA_SM80: [
+                    param("bfloat16", 3),
+                    param("bfloat16", 2),
+                    param("float32", 8),
+                    param("float", 1),
+                    param("float", 3),
+                ],
+                TestEnv.ROCM: [param("float16", 3), param("float16", 2)],
+            }
+        )
+    )
+    def test_where(self, dtype: str, M: int) -> None:
+        dim = IntVar([2, 3, 128])
+        X1 = gen_input_tensor([dim, M], name="X1", dtype="bool")
+        X2 = gen_input_tensor([dim, M], name="X2", dtype=dtype)
+        X3 = gen_input_tensor([dim, M], name="X3", dtype=dtype)
+        Y = ops.where()(X1, X2, X3)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", f"test_where_{self._test_id}")
+        self._test_id += 1
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype) < 0
+            x2_pt = get_random_torch_tensor([batch, M], dtype)
+            x3_pt = get_random_torch_tensor([batch, M], dtype)
+            y_pt = torch.where(x1_pt, x2_pt, x3_pt)
+            y = get_torch_empty_tensor([batch, M], dtype)
+            inputs = {"X1": x1_pt, "X2": x2_pt, "X3": x3_pt}
+            model.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y)
+
+    def test_input_tensor_constant(self) -> None:
+        dim = IntVar([2, 3, 128])
+        dtype = "float"
+        M = 4
+        X1 = gen_input_tensor([dim, M], name="X1", dtype="bool")
+        X2 = 2
+        X3 = gen_input_tensor([dim, M], name="X3", dtype=dtype)
+        Y = ops.where()(X1, X2, X3)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", "test_input_tensor_constant")
+
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype) < 0
+            x2_pt = 2
+            x3_pt = get_random_torch_tensor([batch, M], dtype)
+            y_pt = torch.where(x1_pt, x2_pt, x3_pt)
+            y = get_torch_empty_tensor([batch, M], dtype)
+            inputs = {"X1": x1_pt, "X3": x3_pt}
+            model.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y)
+
+    def test_other_tensor_constant(self) -> None:
+        dim = IntVar([2, 3, 128])
+        dtype = "float"
+        M = 4
+        X1 = gen_input_tensor([dim, M], name="X1", dtype="bool")
+        X2 = gen_input_tensor([dim, M], name="X2", dtype=dtype)
+        X3 = 2
+        Y = ops.where()(X1, X2, X3)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", "test_other_tensor_constant")
+
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype) < 0
+            x2_pt = get_random_torch_tensor([batch, M], dtype)
+            x3_pt = 2
+            y_pt = torch.where(x1_pt, x2_pt, x3_pt)
+            y = get_torch_empty_tensor([batch, M], dtype)
+            inputs = {"X1": x1_pt, "X2": x2_pt}
+            model.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y)
+
+    def test_both_tensors_constant(self) -> None:
+        dim = IntVar([2, 3, 128])
+        dtype = "float"
+        M = 4
+        X1 = gen_input_tensor([dim, M], name="X1", dtype="bool")
+        X2 = 4
+        X3 = 2
+        Y = ops.where()(X1, X2, X3, dtype=dtype)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", "test_both_tensors_constant")
+
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype) < 0
+            x2_pt = 4
+            x3_pt = 2
+            y_pt = torch.where(x1_pt, x2_pt, x3_pt).to(torch.float32)
+            y = get_torch_empty_tensor([batch, M], dtype)
+            inputs = {"X1": x1_pt}
+            model.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y)
+
+    def test_integration_with_relational(self) -> None:
+        dim = IntVar([2, 3, 128])
+        dtype = "float"
+        M = 4
+        X1 = gen_input_tensor([dim, M], name="X1", dtype=dtype)
+        X2 = gen_input_tensor([dim, M], name="X2", dtype=dtype)
+        X3 = ops.ge()(X1, X2)
+        Y = ops.where()(X3, X1, X2)
+        Y._attrs["name"] = "Y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        model = compile_model(Y, target, "./tmp", "test_integration_with_relational")
+        for batch in dim._attrs["values"]:
+            x1_pt = get_random_torch_tensor([batch, M], dtype)
+            x2_pt = get_random_torch_tensor([batch, M], dtype)
+            x3_pt = torch.ge(x1_pt, x2_pt)
+            y_pt = torch.where(x3_pt, x1_pt, x2_pt)
+            y = get_torch_empty_tensor([batch, M], dtype)
+            inputs = {"X1": x1_pt, "X2": x2_pt}
+            model.run_with_tensors(inputs, [y])
+            torch.testing.assert_close(y_pt, y)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    unittest.main()

From dccb361a3901e8244ef21bec6494d860cd94ed01 Mon Sep 17 00:00:00 2001
From: Shuqi Yang <shuqiyang@meta.com>
Date: Tue, 27 Jun 2023 11:26:49 -0700
Subject: [PATCH 614/638] Add test cases to cover `split_large_slice_scatter`
 (#794)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/794

The existing tests don't use the `slice_reshape_scatter` op, so the `split_large_slice_scatter` logic is not tested.

If we remove `split_large_slice_scatter` from `optimize_graph`, the existing tests can still pass.
The newly added test cases would fail without `split_large_slice_scatter`.

Reviewed By: muchulee8

Differential Revision: D46770757

fbshipit-source-id: 69a48580df6e5108a5e03097dec2835934c718db
---
 ...test_split_large_slice_reshape_scatter.py} | 58 +++++++++++++++----
 1 file changed, 46 insertions(+), 12 deletions(-)
 rename tests/unittest/compiler/{test_split_large_slice_scatter.py => test_split_large_slice_reshape_scatter.py} (62%)

diff --git a/tests/unittest/compiler/test_split_large_slice_scatter.py b/tests/unittest/compiler/test_split_large_slice_reshape_scatter.py
similarity index 62%
rename from tests/unittest/compiler/test_split_large_slice_scatter.py
rename to tests/unittest/compiler/test_split_large_slice_reshape_scatter.py
index d55d322eb..d35edeeb4 100644
--- a/tests/unittest/compiler/test_split_large_slice_scatter.py
+++ b/tests/unittest/compiler/test_split_large_slice_reshape_scatter.py
@@ -35,26 +35,29 @@ def setUpClass(cls) -> None:
         torch.manual_seed(0)
 
     def _test_slice_scatter_reshape_float16(
-        self,
-        input0_shape,
-        input1_shape,
-        start_indices,
-        end_indices,
+        self, input0_shape, input1_shape, start_indices, end_indices, reshape_movable
     ):
         dtype = "float16"
 
         input0 = Tensor(shape=input0_shape, dtype=dtype, name="input0", is_input=True)
         input1 = Tensor(shape=input1_shape, dtype=dtype, name="input1", is_input=True)
 
-        num_slices = 139
+        concat_dim = 1
+        end_indices_2 = end_indices.copy()
+        if not reshape_movable:
+            end_indices[concat_dim] -= 1
+            end_indices_2[concat_dim] += 1
+
+        num_slices = 140
         slice_outputs = [
             ops.dynamic_slice()(
-                input0, start_indices=start_indices, end_indices=end_indices
+                input0,
+                start_indices=start_indices,
+                end_indices=end_indices if idx % 2 == 0 else end_indices_2,
             )
-            for _ in range(num_slices)
+            for idx in range(num_slices)
         ]
 
-        concat_dim = 1
         concat_2 = ops.concatenate()(slice_outputs, concat_dim)
         reshape_to = [-1, num_slices, 2]
         reshape_3 = ops.reshape()(concat_2, reshape_to)
@@ -65,18 +68,33 @@ def _test_slice_scatter_reshape_float16(
 
         target = detect_target()
         dll_name = f"test_{self.test_count}.so"
-        test_name = "slice_scatter_large_inputs"
+        test_name = f"slice_scatter_large_inputs_{self.test_count}"
         module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
         self.test_count += 1
         Y_src_ops = list(Y._attrs["src_ops"])
         self.assertEqual(len(Y_src_ops), 5)
-        self.assertTrue(all(op._attrs["op"] == "concatenate" for op in Y_src_ops))
+        if reshape_movable:
+            # If the reshape operator can be moved to the front, we will only have concatenate ops
+            self.assertTrue(all(op._attrs["op"] == "concatenate" for op in Y_src_ops))
+        else:
+            # We have a single concat op. All the rest are slice_reshape_scatter ops
+            concat_cnt = 0
+            for op in Y_src_ops:
+                if op._attrs["op"] == "concatenate":
+                    concat_cnt += 1
+                    continue
+                self.assertEqual(op._attrs["op"], "slice_reshape_scatter")
+            self.assertEqual(concat_cnt, 1)
 
         input0_pt = get_random_torch_tensor(input0_shape, dtype)
         input1_pt = get_random_torch_tensor(input1_shape, dtype)
         slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+        slice_indices_2 = [slice(i, j) for i, j in zip(start_indices, end_indices_2)]
 
-        slice_outputs_pt = [input0_pt[slice_indices] for _ in range(num_slices)]
+        slice_outputs_pt = [
+            input0_pt[slice_indices if idx % 2 == 0 else slice_indices_2]
+            for idx in range(num_slices)
+        ]
         concat_2_pt = torch.cat(slice_outputs_pt, concat_dim)
         reshape_3_pt = torch.reshape(concat_2_pt, reshape_to)
         y_pt = torch.cat([reshape_3_pt, input1_pt], concat_dim)
@@ -92,12 +110,28 @@ def test_slice_scatter_reshape_float16(self):
             input1_shape=[2, 4, 2],
             start_indices=[1, 0],
             end_indices=[3, None],
+            reshape_movable=True,
+        )
+        self._test_slice_scatter_reshape_float16(
+            input0_shape=[2, 6],
+            input1_shape=[2, 4, 2],
+            start_indices=[0, 0],
+            end_indices=[None, 2],
+            reshape_movable=True,
         )
         self._test_slice_scatter_reshape_float16(
             input0_shape=[2, 6],
             input1_shape=[2, 4, 2],
             start_indices=[0, 0],
             end_indices=[None, 2],
+            reshape_movable=False,
+        )
+        self._test_slice_scatter_reshape_float16(
+            input0_shape=[6, 3],
+            input1_shape=[2, 4, 2],
+            start_indices=[1, 0],
+            end_indices=[3, 2],
+            reshape_movable=False,
         )
 
 
From a20384e73de4c1a96a8b8656613e9baa09b88e40 Mon Sep 17 00:00:00 2001
From: Colin Chan <ctyc@meta.com>
Date: Tue, 27 Jun 2023 11:58:10 -0700
Subject: [PATCH 615/638] Add init_random_weights test util (#800)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/800

Initialize random weights for AIT constants during model compilation to prevent identical weights being compared when testing accuracy of PT module vs AIT module.

Reviewed By: henryhu6

Differential Revision: D47031569

fbshipit-source-id: f063a8b13d3a530f7c667ce4b2259f9177bdd4fa
---
 python/aitemplate/testing/test_utils.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 88fb8b999..07e43a6cd 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -30,6 +30,7 @@
 from aitemplate.testing.detect_target import detect_target
 from aitemplate.utils.graph_utils import get_sorted_ops
 from aitemplate.utils.torch_utils import string_to_torch_dtype
+from torch import nn
 
 
 class TestEnv(Enum):
@@ -298,3 +299,18 @@ def get_attn_mask_per_causal_type(
     else:
         raise NotImplementedError(f"Unsupported {causal_type=}!")
     return invalid_attn_mask
+
+
+def init_random_weights(m):
+    if hasattr(m, "weight"):
+        nn.init.uniform_(m.weight)
+    elif (
+        type(m) == nn.Sequential
+        or type(m) == nn.ModuleList
+        or type(m) == nn.SiLU
+        or type(m) == nn.Dropout
+        or type(m) == nn.Identity
+    ):
+        pass
+    else:
+        print("Passed root module: " + str(type(m)))

From eb4c375fe94eef8ee691bdc02178bb0c3d182da8 Mon Sep 17 00:00:00 2001
From: Adnan Akhundov <aakhundov@meta.com>
Date: Tue, 27 Jun 2023 14:08:09 -0700
Subject: [PATCH 616/638] Sync CUTLASS with upstream (#803)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/803

ATT

Reviewed By: wushirong

Differential Revision: D47060963

fbshipit-source-id: 8fd0f57e8b3e0d85396a10397e8fa0a380a9cd8c
---
 3rdparty/cutlass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 6320758d1..a9d9b8049 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 6320758d1c7163662ce15a6e16d62f9732912063
+Subproject commit a9d9b80493e20086732f51f90f10f99ae50ae5ed

From b080f5c7eb0850154d0e5ba5c3655b3cc630b4d0 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Thu, 29 Jun 2023 16:36:52 -0700
Subject: [PATCH 617/638] Fix SD Alternative pipeline README to use demo_alt.py
 (#784)

Summary:
Currently SD Alternative pipeline examples use `demo.py` script.
I think it should use `demo_alt.py` instead

Other minor fixes:
- made demo_alt.py example commands multiline
- fixed CompVis spelling and added href to it.

Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/784

Reviewed By: chenyang78

Differential Revision: D47039823

Pulled By: ipiszy

fbshipit-source-id: 885cbcef4a7904936da66d817d0eb62e06f5335a
---
 examples/05_stable_diffusion/README.md | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
index aa8e3f573..8f589f70e 100644
--- a/examples/05_stable_diffusion/README.md
+++ b/examples/05_stable_diffusion/README.md
@@ -58,7 +58,7 @@ By default, `compile_alt.py` does not include model weights (constants) with the
 
 #### Alternative pipeline
 
-The original pipeline requires a diffusers model local dir, and relies directly on `StableDiffusionPipeline`. This pipeline builds similar functionality without directly using `StableDiffusionPipeline`, and is capable of loading model weights from either diffusers or compvis models to compiled aitemplate modules.
+The original pipeline requires a diffusers model local dir, and relies directly on `StableDiffusionPipeline`. This pipeline builds similar functionality without directly using `StableDiffusionPipeline`, and is capable of loading model weights from either diffusers or [CompVis](https://huggingface.co/CompVis) models to compiled aitemplate modules.
 
 * AITemplate modules are created
 * Model weights are loaded, converted/mapped, then applied to AITemplate module
@@ -67,8 +67,13 @@ The original pipeline requires a diffusers model local dir, and relies directly
 * Loading CLIPTextModel from `ckpt` requires the appropriate `hf-hub-or-path` to be specified i.e. `runwayml/stable-diffusion-v1-5` for SD1.x checkpoints, `stabilityai/stable-diffusion-2-1` for SD2.x checkpoints.
 
 ```
-python3 scripts/demo.py --hf-hub-or-path runwayml/stable-diffusion-v1-5 --ckpt v1-5-pruned-emaonly.ckpt
-python3 scripts/demo.py --hf-hub-or-path stabilityai/stable-diffusion-2-1 --ckpt v2-1_768-ema-pruned.ckpt
+python3 scripts/demo_alt.py \
+--hf-hub-or-path runwayml/stable-diffusion-v1-5 \
+--ckpt v1-5-pruned-emaonly.ckpt
+
+python3 scripts/demo_alt.py \
+--hf-hub-or-path stabilityai/stable-diffusion-2-1 \
+--ckpt v2-1_768-ema-pruned.ckpt
 ```
 
 `--ckpt` takes preference over `--hf-hub-or-path` if both are specified

From 79d10cdba3264fff32760687a5eb9ab0935ba503 Mon Sep 17 00:00:00 2001
From: Shuqi Yang <shuqiyang@meta.com>
Date: Thu, 29 Jun 2023 18:49:15 -0700
Subject: [PATCH 618/638] Split slice_scatter into multiple ones if it has too
 many inputs (#801)

Summary:
Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/801

Split slice_scatter into multiple ones if it has too many inputs. The process is very similar to split slice_reshape_scatter.

Added the TensorAccessor attribute in slice_scatter op (but will only use its offset field) to make the split logic work.

Reviewed By: chenyang78

Differential Revision: D46962881

fbshipit-source-id: 40457bfd5f9ec607802a70d01e7020aac0e6b3c8
---
 .../backend/cuda/tensor/slice_scatter.py      |   8 +-
 .../compiler/ops/tensor/slice_scatter.py      |  25 ++--
 .../split_large_slice_scatter_ops.py          |  11 +-
 .../transform/transform_strided_ops.py        |   2 +-
 .../test_split_large_slice_scatter.py         | 113 ++++++++++++++++++
 5 files changed, 145 insertions(+), 14 deletions(-)
 create mode 100644 tests/unittest/compiler/test_split_large_slice_scatter.py

diff --git a/python/aitemplate/backend/cuda/tensor/slice_scatter.py b/python/aitemplate/backend/cuda/tensor/slice_scatter.py
index 254193f59..8cf968415 100644
--- a/python/aitemplate/backend/cuda/tensor/slice_scatter.py
+++ b/python/aitemplate/backend/cuda/tensor/slice_scatter.py
@@ -53,8 +53,14 @@ def gen_function(func_attrs):
     """
     # TODO: consider to profile elems_per_thread
     elems_per_thread = 8 if len(func_attrs["inputs"]) == 1 else 256
+    output_accessor = func_attrs["output_accessors"][0]
+    output_offset = output_accessor.offset
     return slice_common.gen_function(
-        func_attrs, backend_spec=CUDASpec(), elems_per_thread=elems_per_thread
+        func_attrs,
+        backend_spec=CUDASpec(),
+        elems_per_thread=elems_per_thread,
+        output_offset=output_offset,
+        update_output_shape=False,
     )
 
 
diff --git a/python/aitemplate/compiler/ops/tensor/slice_scatter.py b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
index 1c5cf5393..8c2d4f008 100644
--- a/python/aitemplate/compiler/ops/tensor/slice_scatter.py
+++ b/python/aitemplate/compiler/ops/tensor/slice_scatter.py
@@ -20,6 +20,7 @@
 from aitemplate.backend import registry
 from aitemplate.compiler.base import Operator
 from aitemplate.compiler.stable_set import StableSet
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 
 # pylint: disable=C0103,W0221
 
@@ -62,6 +63,10 @@ def _update_inputs_outputs(self, cat_op):
                 input_tensor._attrs["dst_ops"].add(self)
             self._attrs["inputs"].append(input_tensor)
 
+        # The original output of this slice_scatter op is the output of the cat_op.
+        # We set the TensorAccessor, but will only use its offset field in the backend.
+        self._attrs["output_accessors"] = [TensorAccessor(cat_op._attrs["outputs"][0])]
+
         self._attrs["outputs"] = cat_op._attrs["outputs"]
         for y in self._attrs["outputs"]:
             y._attrs["src_ops"] = StableSet({self})
@@ -74,23 +79,27 @@ def _update_inputs_outputs(self, cat_op):
             x._attrs["src_ops"] = StableSet()
             x._attrs["dst_ops"] = StableSet()
 
-    def __init__(self, cat_op: Operator) -> None:
+    def __init__(self, scatter_dim: int) -> None:
         super().__init__()
-        assert slice_scatter.is_valid(cat_op)
-
         self._attrs["op"] = "slice_scatter"
         self._attrs["has_profiler"] = False
-        self._attrs["scatter_dim"] = cat_op._attrs["concat_dim"]
+        self._attrs["scatter_dim"] = scatter_dim
+
+    @staticmethod
+    def make_op(cat_op: Operator) -> Operator:
+        assert slice_scatter.is_valid(cat_op)
+        scatter_dim = cat_op._attrs["concat_dim"]
+        new_op = slice_scatter(scatter_dim)
         slice_ops = []
         for x in cat_op._attrs["inputs"]:
             src_ops = x.src_ops()
             assert len(src_ops) == 1
             slice_op = list(src_ops)[0]
             slice_ops.append(slice_op)
-        self._attrs["slice_ops"] = slice_ops
-
-        self._update_inputs_outputs(cat_op)
-        self._set_depth()
+        new_op._attrs["slice_ops"] = slice_ops
+        new_op._update_inputs_outputs(cat_op)
+        new_op._set_depth()
+        return new_op
 
     def __call__(self):
         raise RuntimeError("op {} cannot be called directly".format(self._attrs["op"]))
diff --git a/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
index 911f57656..f3e1761f5 100644
--- a/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
+++ b/python/aitemplate/compiler/transform/split_large_slice_scatter_ops.py
@@ -63,8 +63,7 @@ def split_large_slice_scatter_ops(sorted_graph: List[Tensor], _: str) -> List[Te
     """
     sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
     for op in sorted_ops:
-        # TODO: enable slice_scatter later
-        if not op._attrs["op"].startswith("slice_reshape_scatter"):
+        if op._attrs["op"] not in ["slice_reshape_scatter", "slice_scatter"]:
             continue
         slice_scatter_op = op
         # We create InputMeta for inputs that need to copy data.
@@ -96,10 +95,14 @@ def split_large_slice_scatter_ops(sorted_graph: List[Tensor], _: str) -> List[Te
         has_profiler = slice_scatter_op._attrs["has_profiler"]
         local_output_offset = 0
         orig_name = slice_scatter_op._attrs["name"]
-        element_func = slice_scatter_op._attrs["element_func"]
         slice_ops = slice_scatter_op._attrs["slice_ops"]
         for split_idx, new_inputs_size in enumerate(split_sizes):
-            new_slice_scatter_op = ops.slice_reshape_scatter(scatter_dim, element_func)
+            if op._attrs["op"] == "slice_scatter":
+                new_slice_scatter_op = ops.slice_scatter(scatter_dim)
+            elif op._attrs["op"] == "slice_reshape_scatter":
+                new_slice_scatter_op = ops.slice_reshape_scatter(
+                    scatter_dim, slice_scatter_op._attrs["element_func"]
+                )
             new_name = f"{orig_name}_split_{split_idx}"
             new_slice_scatter_op._attrs["name"] = new_name
             new_slice_scatter_op._attrs["original_name"] = new_name
diff --git a/python/aitemplate/compiler/transform/transform_strided_ops.py b/python/aitemplate/compiler/transform/transform_strided_ops.py
index 5174ba389..2de95d9be 100644
--- a/python/aitemplate/compiler/transform/transform_strided_ops.py
+++ b/python/aitemplate/compiler/transform/transform_strided_ops.py
@@ -51,7 +51,7 @@ def _fuse_slices_concat(sorted_graph: List[Tensor]) -> List[Tensor]:
             continue
         concat_op = src_op
         if slice_scatter.is_valid(concat_op):
-            slice_scatter(concat_op)
+            slice_scatter.make_op(concat_op)
 
     return transform_utils.sanitize_sorted_graph(sorted_graph)
 
diff --git a/tests/unittest/compiler/test_split_large_slice_scatter.py b/tests/unittest/compiler/test_split_large_slice_scatter.py
new file mode 100644
index 000000000..e47152dbe
--- /dev/null
+++ b/tests/unittest/compiler/test_split_large_slice_scatter.py
@@ -0,0 +1,113 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
+
+
+class SliceScatterLargeInputsTestCase(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(SliceScatterLargeInputsTestCase, self).__init__(*args, **kwargs)
+        self.test_count = 1
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def _test_slice_scatter(
+        self, input_shape, start_indices, end_indices, concat_dim, dtype
+    ):
+        num_slices = 140
+        slice_outputs = [
+            ops.dynamic_slice()(
+                Tensor(
+                    shape=input_shape, dtype=dtype, name=f"input{idx}", is_input=True
+                ),
+                start_indices=start_indices,
+                end_indices=end_indices,
+            )
+            for idx in range(num_slices)
+        ]
+
+        Y = ops.concatenate()(slice_outputs, concat_dim)
+
+        Y._attrs["name"] = "y"
+        Y._attrs["is_output"] = True
+
+        target = detect_target()
+        dll_name = f"test_{self.test_count}.so"
+        test_name = f"slice_scatter_large_inputs_{self.test_count}"
+
+        module = compile_model(Y, target, "./tmp", test_name, dll_name=dll_name)
+
+        Y_src_ops = list(Y._attrs["src_ops"])
+        self.assertEqual(len(Y_src_ops), 5)
+        self.assertTrue(all(op._attrs["op"] == "slice_scatter" for op in Y_src_ops))
+
+        input_pt = [
+            get_random_torch_tensor(input_shape, dtype) for _ in range(num_slices)
+        ]
+        slice_indices = [slice(i, j) for i, j in zip(start_indices, end_indices)]
+        slice_outputs_pt = [input_i[slice_indices] for input_i in input_pt]
+        y_pt = torch.cat(slice_outputs_pt, concat_dim)
+
+        inputs = {f"input{idx}": input_pt[idx] for idx in range(num_slices)}
+        y = get_torch_empty_tensor(y_pt.size(), dtype)
+        module.run_with_tensors(inputs, [y])
+        self.assertTrue(torch.allclose(y_pt, y, atol=1e-2, rtol=1e-2))
+
+        self.test_count += 1
+
+    def test_slice_scatter_float(self):
+        self._test_slice_scatter(
+            input_shape=[3, 7, 10],
+            start_indices=[0, 0, 0],
+            end_indices=[2, 1, 4],
+            concat_dim=0,
+            dtype="float",
+        )
+        self._test_slice_scatter(
+            input_shape=[3, 7, 10],
+            start_indices=[0, 0, 0],
+            end_indices=[2, 1, 4],
+            concat_dim=1,
+            dtype="float",
+        )
+        self._test_slice_scatter(
+            input_shape=[3, 7, 10],
+            start_indices=[0, 0, 0],
+            end_indices=[2, 1, 4],
+            concat_dim=2,
+            dtype="float",
+        )
+        self._test_slice_scatter(
+            input_shape=[3, 7, 10],
+            start_indices=[0, 0, 0],
+            end_indices=[2, 1, 4],
+            concat_dim=1,
+            dtype="float16",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 039bb9ff46656e5422bf6d389ea97174bc9612c7 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 29 Jun 2023 19:51:33 -0700
Subject: [PATCH 619/638] update frontend and mk_ck_lib (#777)

Summary: Pull Request resolved: https://github.com/facebookincubator/AITemplate/pull/777

Reviewed By: chenyang78

Differential Revision: D47039764

Pulled By: ipiszy

fbshipit-source-id: 4a2fa9228272ed32544498b68af4f4d42c02a460
---
 python/aitemplate/frontend/nn/embedding.py    |  56 +--
 python/aitemplate/frontend/nn/module.py       |   1 -
 .../frontend/nn/multiscale_attention.py       |   1 -
 .../utils/mk_ck_lib/conv2d_operation.py       |   5 +-
 .../utils/mk_ck_lib/gemm_operation.py         |  46 ++-
 .../aitemplate/utils/mk_ck_lib/generator.py   | 369 +++++++++++++++++-
 .../utils/mk_ck_lib/groupnorm_operation.py    |   2 +-
 .../utils/mk_ck_lib/layernorm_operation.py    |   4 +-
 python/aitemplate/utils/mk_ck_lib/library.py  |   4 +
 python/aitemplate/utils/mk_ck_lib/manifest.py |   1 -
 10 files changed, 424 insertions(+), 65 deletions(-)

diff --git a/python/aitemplate/frontend/nn/embedding.py b/python/aitemplate/frontend/nn/embedding.py
index f5144eca1..fc0b29b55 100644
--- a/python/aitemplate/frontend/nn/embedding.py
+++ b/python/aitemplate/frontend/nn/embedding.py
@@ -13,12 +13,10 @@
 #  limitations under the License.
 #
 from aitemplate.compiler import ops
-from aitemplate.compiler.public import FuncEnum
 from aitemplate.frontend.nn.dropout import Dropout
 from aitemplate.frontend.nn.layer_norm import LayerNorm
 from aitemplate.frontend.nn.module import Module
 from aitemplate.frontend.nn.parameter import Parameter
-from aitemplate.testing import detect_target
 
 
 class Embedding(Module):
@@ -61,8 +59,6 @@ def __init__(
         dtype="float16",
     ):
         super().__init__()
-        if BertEmbeddings.USE_CUDA is None:
-            BertEmbeddings.USE_CUDA = detect_target().name() == "cuda"
         assert (
             hidden_dropout_prob == 0.0
         ), "Dropout rate larger than 0 is not supported yet."
@@ -85,48 +81,16 @@ def forward(
         token_type_ids,  # [B, S]
         position_ids,  # [B, S]
     ):
-        if self.USE_CUDA:
-            embeddings = ops.bert_embeddings()(
-                input_ids,
-                token_type_ids,
-                position_ids,
-                self.word_embeddings.weight.tensor(),
-                self.token_type_embeddings.weight.tensor(),
-                self.position_embeddings.weight.tensor(),
-                self.LayerNorm.weight.tensor(),
-                self.LayerNorm.bias.tensor(),
-                self.LayerNorm.eps,
-            )
-            embeddings = self.dropout(embeddings)
-            return embeddings
-
-        input_shape = ops.size()(input_ids)
-
-        # [B * S]
-        input_ids = ops.reshape()(input_ids, [-1])
-        token_type_ids = ops.reshape()(token_type_ids, [-1])
-        position_ids = ops.reshape()(position_ids, [-1])
-
-        # [B * S, H]
-        input_embeddings = ops.batch_gather()(self.word_embeddings.tensor(), input_ids)
-        token_type_embeddings = ops.batch_gather()(
-            self.token_type_embeddings.tensor(), token_type_ids
-        )
-        position_embeddings = ops.batch_gather()(
-            self.position_embeddings.tensor(), position_ids
-        )
-
-        # add
-        embeddings = ops.elementwise(FuncEnum.ADD)(
-            input_embeddings, token_type_embeddings
+        embeddings = ops.bert_embeddings()(
+            input_ids,
+            token_type_ids,
+            position_ids,
+            self.word_embeddings.weight.tensor(),
+            self.token_type_embeddings.weight.tensor(),
+            self.position_embeddings.weight.tensor(),
+            self.LayerNorm.weight.tensor(),
+            self.LayerNorm.bias.tensor(),
+            self.LayerNorm.eps,
         )
-
-        embeddings = ops.elementwise(FuncEnum.ADD)(embeddings, position_embeddings)
-
-        # norm
-        embeddings = self.LayerNorm(embeddings)
         embeddings = self.dropout(embeddings)
-
-        embeddings = ops.reshape()(embeddings, input_shape + [-1])
-
         return embeddings
diff --git a/python/aitemplate/frontend/nn/module.py b/python/aitemplate/frontend/nn/module.py
index c51a49db9..391d9d5d7 100644
--- a/python/aitemplate/frontend/nn/module.py
+++ b/python/aitemplate/frontend/nn/module.py
@@ -296,7 +296,6 @@ def get_submodule(self, target: str) -> "Module":
         mod: Module = self
 
         for item in atoms:
-
             if not hasattr(mod, item):
                 raise AttributeError(
                     mod._get_name() + " has no " "attribute `" + item + "`"
diff --git a/python/aitemplate/frontend/nn/multiscale_attention.py b/python/aitemplate/frontend/nn/multiscale_attention.py
index 848cb1637..53fe02300 100644
--- a/python/aitemplate/frontend/nn/multiscale_attention.py
+++ b/python/aitemplate/frontend/nn/multiscale_attention.py
@@ -375,7 +375,6 @@ def __init__(
         ## TODO: add pool mode support for {"max", "avg"}
 
         elif pool_mode == "conv":
-
             self.pool_q = (
                 Conv3d(
                     head_dim,
diff --git a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
index 931651b99..4c46deeb2 100644
--- a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
@@ -266,7 +266,6 @@ def accumulator_type(self):
         return library.DataType.f32
 
     def emit(self) -> str:
-
         template = jinja2.Template(
             """
 using {{name}} = {{xdl_op_type}}<
@@ -285,7 +284,7 @@ def emit(self) -> str:
     {{WeiLayout}}, // WeiLayout
     {% if func=="PT" %}
     ck::Tuple<>,
-    {% elif func=="AAR" %}
+    {% elif func in ["AA", "AAR"] %}
     ck::Tuple<{{OutLayout}}, {{OutLayout}}>, // BiasLayout
     {% else %}
 {% if "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1" in xdl_op_type %}
@@ -301,7 +300,7 @@ def emit(self) -> str:
     {{CShuffleDType}}, // CShuffleDataType
     {% if func=="PT" %}
     ck::Tuple<>,
-    {% elif func=="AAR" %}
+    {% elif func in ["AA", "AAR"] %}
     ck::Tuple<{{CDType}}, {{CDType}}>, // BiasLayout
     {% else %}
     ck::Tuple<{{CDType}}>, // BiasDataType
diff --git a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
index 28b44f308..dc1557a5b 100644
--- a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
@@ -51,6 +51,7 @@ class XdlOpType(enum.Enum):
     DeviceBatchedContractionMultipleD_Xdl_CShuffle = auto()
     DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle = auto()
     DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle = auto()
+    DeviceBatchedGemmMultiD_Xdl = auto()
 
 
 XdlOpTag = {
@@ -62,6 +63,7 @@ class XdlOpType(enum.Enum):
     XdlOpType.DeviceBatchedContractionMultipleD_Xdl_CShuffle: "ck::tensor_operation::device::DeviceBatchedContractionMultipleD_Xdl_CShuffle",
     XdlOpType.DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle: "ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle",
     XdlOpType.DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle: "ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle",
+    XdlOpType.DeviceBatchedGemmMultiD_Xdl: "ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl",
 }
 
 
@@ -247,7 +249,11 @@ def __str__(self) -> str:
         _{{n_xdl_per_wave}}
         {{m_n_block_wave_per_xdl|join('_')}}S
         {{scalar_per_vector}}
-        {{causal_mask}}
+        {% if causal_mask == 1 %}
+        ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle // causal_mask
+        {% else %}
+        ck::tensor_operation::device::MaskingSpecialization::MaskDisabled // causal_mask
+        {% endif %}
         """,
             trim_blocks=True,
             lstrip_blocks=True,
@@ -264,7 +270,11 @@ def emit(self) -> str:
     {{n_xdl_per_wave}}, // n_xdl_per_wave
     ck::Sequence<{{m_n_block_wave_per_xdl|join(',')}}>, // m_n_block_wave_per_xdl
     {{scalar_per_vector}}, // scalar_per_vector
-    {{causal_mask}} // causal_mask
+    {% if causal_mask == 1 %}
+    ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle // causal_mask
+    {% else %}
+    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled // causal_mask
+    {% endif %}
     """,
             trim_blocks=True,
             lstrip_blocks=True,
@@ -392,21 +402,38 @@ def emit(self) -> str:
     ck::Tuple<ck::half_t>,
     {% endif %}
     ck::half_t,
-{% elif xdl_op_type_value in [7, 8] %}
+{% elif xdl_op_type_value == 7 %}
     {{ALayout}},
     {{BLayout}},
     {{CLayout}},
-    {% if xdl_op_type_value == 8 %}
-    ck::Sequence<2,1,1>,
-    {% else %}
     {{CLayout}},
-    {% endif %}
     {{ADType}},
     {{BDType}},
     {{BDType}},
     {{CDType}},
     {{AccDType}},
     float, // CShuffleDType,
+{% elif xdl_op_type_value == 8 %}
+    2, 1, 1, 1, 1,
+    {{ADType}},
+    {{BDType}},
+    {{BDType}},
+    {{CDType}},
+    ck::Tuple<>,
+    ck::Tuple<>,
+    {{AccDType}},
+    float, // CShuffleDType,
+{% elif xdl_op_type_value == 9 %}
+    {{ALayout}},
+    {{BLayout}},
+    ck::Tuple<{{DsLayout}}>, // DsLayout
+    {{CLayout}},
+    {{ADType}},
+    {{BDType}},
+    {{AccDType}},
+    {{CShuffleDType}},
+    ck::Tuple<{{DsDType}}>, // DsType
+    {{EDType}},
 {% endif %}
 {% if xdl_op_type_value in [7, 8] %}
     {{A_elem_op}},
@@ -423,6 +450,11 @@ def emit(self) -> str:
     ck::tensor_operation::device::TensorSpecialization::Packed,
     ck::tensor_operation::device::TensorSpecialization::Packed,
     ck::tensor_operation::device::TensorSpecialization::Default,
+    {% elif xdl_op_type_value==8 %}
+    ck::tensor_operation::device::TensorSpecialization::Default,
+    ck::tensor_operation::device::TensorSpecialization::Default,
+    ck::tensor_operation::device::TensorSpecialization::Default,
+    ck::tensor_operation::device::TensorSpecialization::Default,
     {% endif %}
     1,
 {% endif %}
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index 91b44ea7d..e8f89f666 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -24,6 +24,7 @@
     softmax_operation as softmax,
 )
 
+
 ###########################################################################################################
 # Convolution for 2D Fwd operations
 def CreateConv2dFwdOperator(manifest, operation_kind, out_element_op, out_data_op=""):
@@ -1390,7 +1391,6 @@ def CreateBmmSoftmaxBmmOperator(
     ]
     c_block_descriptions, b1_block_descriptions = [], []
     for i in range(len(tile_descriptions)):
-
         if i in [0, 2, 4, 5, 9, 11]:
             block_transfer = [16, 16, 1]
         else:
@@ -1505,7 +1505,6 @@ def CreateBmmSoftmaxBmmPermOperator(
 
     c_block_descriptions, b1_block_descriptions = [], []
     for i in range(len(tile_descriptions)):
-
         if i in [0, 2, 4, 5, 9, 11]:
             block_transfer = [16, 16, 1]
         else:
@@ -1667,6 +1666,354 @@ def CreateBmmRRROperator(manifest):
     return operations
 
 
+def CreateBmmRRRBillinearOperator(manifest, c_element_op):
+    operation_kind = library.GemmKind.BatchGemm
+    a_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    b_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    c_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    element_op = library.TensorOperation.PassThrough
+    # 0 indicates not print
+    tile_descriptions = [
+        gemm.TileDesc(256, 256, 128, 32, 8, 2, 32, 32, 4, 2),
+        gemm.TileDesc(256, 256, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 256, 32, 8, 2, 32, 32, 2, 4),
+        gemm.TileDesc(256, 128, 256, 32, 8, 8, 32, 32, 2, 4),
+        gemm.TileDesc(128, 128, 128, 32, 8, 2, 32, 32, 4, 2),
+        gemm.TileDesc(128, 128, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 128, 32, 8, 2, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 8, 2, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 8, 2, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 64, 32, 8, 2, 32, 32, 2, 1),
+        gemm.TileDesc(256, 128, 64, 32, 8, 8, 32, 32, 2, 1),
+        gemm.TileDesc(256, 64, 128, 32, 8, 2, 32, 32, 1, 2),
+        gemm.TileDesc(256, 64, 128, 32, 8, 8, 32, 32, 1, 2),
+    ]
+
+    a_block_descriptions = []
+    c_block_descriptions = []
+    for t in tile_descriptions:
+        a_block_transfer = -1
+        c_block_transfer = -1
+        if t.block_size == 256:
+            a_block_transfer = [4, 64, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block != 64:
+            a_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block == 64:
+            a_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8)
+
+        assert (
+            a_block_transfer != -1
+            and c_block_transfer != -1
+            and "Cannot determine block_transfer_size with block_size "
+            + str(t.block_size)
+        )
+        a_block_descriptions.append(
+            gemm.BlockTransferDesc(
+                a_block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1, True
+            )
+        )
+        c_block_descriptions.append(c_block_transfer)
+    b_block_descriptions = [
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([8, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([16, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 1, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+    ]
+    gemm_specialization = [
+        gemm.GemmSpecialization.GemmDefault,
+        gemm.GemmSpecialization.MNKPadding,
+    ]
+    operations = []
+    ds_dtype = [library.DataType.f16]
+    ds_layout = [library.LayoutType.RowMajor]
+    e_dtype = library.DataType.f16
+    for gemm_spec in gemm_specialization:
+        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
+            tile_descriptions,
+            a_block_descriptions,
+            b_block_descriptions,
+            c_block_descriptions,
+        ):
+            new_operation = gemm.GemmOperation(
+                operation_kind=operation_kind,
+                extra_kind=c_element_op,
+                xdl_op_type=gemm.XdlOpType.DeviceBatchedGemmMultiD_Xdl,
+                A=a_element_desc,
+                B=b_element_desc,
+                C=c_element_desc,
+                a_elem_op=element_op,
+                b_elem_op=element_op,
+                epilogue_functor=c_element_op,
+                gemm_specialization=gemm_spec,
+                tile_desc=tile_desc,
+                a_block_transfer=a_block_desc,
+                b_block_transfer=b_block_desc,
+                c_block_transfer=c_block_desc,
+                ds_dtype=ds_dtype,
+                ds_layout=ds_layout,
+                e_dtype=e_dtype,
+            )
+            manifest.append(new_operation)
+            operations.append(new_operation)
+    return operations
+
+
+def CreateBmmCCRBillinearOperator(manifest, c_element_op):
+    operation_kind = library.GemmKind.BatchGemm
+    a_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.ColumnMajor
+    )
+    b_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.ColumnMajor
+    )
+    c_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    element_op = library.TensorOperation.PassThrough
+    # 0 indicates not print
+    tile_descriptions = [
+        gemm.TileDesc(256, 256, 128, 32, 2, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 256, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 256, 32, 2, 8, 32, 32, 2, 4),
+        gemm.TileDesc(256, 128, 256, 32, 8, 8, 32, 32, 2, 4),
+        gemm.TileDesc(128, 128, 128, 32, 2, 8, 32, 32, 4, 2),
+        gemm.TileDesc(128, 128, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 128, 32, 2, 8, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 2, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 2, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 64, 32, 2, 8, 32, 32, 2, 1),
+        gemm.TileDesc(256, 128, 64, 32, 8, 8, 32, 32, 2, 1),
+        gemm.TileDesc(256, 64, 128, 32, 2, 8, 32, 32, 1, 2),
+        gemm.TileDesc(256, 64, 128, 32, 8, 8, 32, 32, 1, 2),
+    ]
+
+    b_block_descriptions = []
+    c_block_descriptions = []
+    for t in tile_descriptions:
+        b_block_transfer = -1
+        c_block_transfer = -1
+        if t.block_size == 256:
+            b_block_transfer = [4, 64, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block != 64:
+            b_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block == 64:
+            b_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8)
+
+        assert (
+            b_block_transfer != -1
+            and c_block_transfer != -1
+            and "Cannot determine block_transfer_size with block_size "
+            + str(t.block_size)
+        )
+        b_block_descriptions.append(
+            gemm.BlockTransferDesc(
+                b_block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1, True
+            )
+        )
+        c_block_descriptions.append(c_block_transfer)
+    a_block_descriptions = [
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([16, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 1, 8, 1, True),
+    ]
+    gemm_specialization = [
+        gemm.GemmSpecialization.GemmDefault,
+        gemm.GemmSpecialization.MNKPadding,
+    ]
+    operations = []
+    ds_dtype = [library.DataType.f16]
+    ds_layout = [library.LayoutType.RowMajor]
+    e_dtype = library.DataType.f16
+    for gemm_spec in gemm_specialization:
+        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
+            tile_descriptions,
+            a_block_descriptions,
+            b_block_descriptions,
+            c_block_descriptions,
+        ):
+            new_operation = gemm.GemmOperation(
+                operation_kind=operation_kind,
+                extra_kind=c_element_op,
+                xdl_op_type=gemm.XdlOpType.DeviceBatchedGemmMultiD_Xdl,
+                A=a_element_desc,
+                B=b_element_desc,
+                C=c_element_desc,
+                a_elem_op=element_op,
+                b_elem_op=element_op,
+                epilogue_functor=c_element_op,
+                gemm_specialization=gemm_spec,
+                tile_desc=tile_desc,
+                a_block_transfer=a_block_desc,
+                b_block_transfer=b_block_desc,
+                c_block_transfer=c_block_desc,
+                ds_dtype=ds_dtype,
+                ds_layout=ds_layout,
+                e_dtype=e_dtype,
+            )
+            manifest.append(new_operation)
+            operations.append(new_operation)
+    return operations
+
+
+def CreateBmmCRRBillinearOperator(manifest, c_element_op):
+    operation_kind = library.GemmKind.BatchGemm
+    a_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.ColumnMajor
+    )
+    b_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    c_element_desc = library.TensorDesc(
+        library.DataType.f16, library.LayoutType.RowMajor
+    )
+    element_op = library.TensorOperation.PassThrough
+    # 0 indicates not print
+    tile_descriptions = [
+        gemm.TileDesc(256, 256, 128, 32, 2, 2, 32, 32, 4, 2),
+        gemm.TileDesc(256, 256, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 256, 32, 2, 2, 32, 32, 2, 4),
+        gemm.TileDesc(256, 128, 256, 32, 8, 8, 32, 32, 2, 4),
+        gemm.TileDesc(128, 128, 128, 32, 2, 2, 32, 32, 4, 2),
+        gemm.TileDesc(128, 128, 128, 32, 8, 8, 32, 32, 4, 2),
+        gemm.TileDesc(256, 128, 128, 32, 2, 2, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 2, 2, 32, 32, 2, 2),
+        gemm.TileDesc(128, 128, 64, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 2, 2, 32, 32, 2, 2),
+        gemm.TileDesc(128, 64, 128, 32, 8, 8, 32, 32, 2, 2),
+        gemm.TileDesc(256, 128, 64, 32, 2, 2, 32, 32, 2, 1),
+        gemm.TileDesc(256, 128, 64, 32, 8, 8, 32, 32, 2, 1),
+        gemm.TileDesc(256, 64, 128, 32, 2, 2, 32, 32, 1, 2),
+        gemm.TileDesc(256, 64, 128, 32, 8, 8, 32, 32, 1, 2),
+    ]
+
+    b_block_descriptions = []
+    c_block_descriptions = []
+    for t in tile_descriptions:
+        b_block_transfer = -1
+        c_block_transfer = -1
+        if t.block_size == 256:
+            b_block_transfer = [4, 64, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block != 64:
+            b_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8)
+        if t.block_size == 128 and t.n_per_block == 64:
+            b_block_transfer = [4, 32, 1]
+            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8)
+
+        assert (
+            b_block_transfer != -1
+            and c_block_transfer != -1
+            and "Cannot determine block_transfer_size with block_size "
+            + str(t.block_size)
+        )
+        b_block_descriptions.append(
+            gemm.BlockTransferDesc(
+                b_block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1, True
+            )
+        )
+        c_block_descriptions.append(c_block_transfer)
+    a_block_descriptions = [
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1, True),
+        gemm.BlockTransferDesc([8, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1, True),
+        gemm.BlockTransferDesc([16, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0, True),
+        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 1, 8, 1, True),
+    ]
+    gemm_specialization = [
+        gemm.GemmSpecialization.GemmDefault,
+        gemm.GemmSpecialization.MNKPadding,
+    ]
+    operations = []
+    ds_dtype = [library.DataType.f16]
+    ds_layout = [library.LayoutType.RowMajor]
+    e_dtype = library.DataType.f16
+    for gemm_spec in gemm_specialization:
+        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
+            tile_descriptions,
+            a_block_descriptions,
+            b_block_descriptions,
+            c_block_descriptions,
+        ):
+            new_operation = gemm.GemmOperation(
+                operation_kind=operation_kind,
+                extra_kind=c_element_op,
+                xdl_op_type=gemm.XdlOpType.DeviceBatchedGemmMultiD_Xdl,
+                A=a_element_desc,
+                B=b_element_desc,
+                C=c_element_desc,
+                a_elem_op=element_op,
+                b_elem_op=element_op,
+                epilogue_functor=c_element_op,
+                gemm_specialization=gemm_spec,
+                tile_desc=tile_desc,
+                a_block_transfer=a_block_desc,
+                b_block_transfer=b_block_desc,
+                c_block_transfer=c_block_desc,
+                ds_dtype=ds_dtype,
+                ds_layout=ds_layout,
+                e_dtype=e_dtype,
+            )
+            manifest.append(new_operation)
+            operations.append(new_operation)
+    return operations
+
+
 def CreateBmmRRRPermOperator(manifest):
     operation_kind = library.GemmKind.BatchGemmPermute
     a_element_desc = library.TensorDesc(
@@ -1980,6 +2327,8 @@ def CreateLayerNormOperator(manifest, rank=2):
         layernorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8),
         layernorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8),
         layernorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8),
+        layernorm.TileDesc(1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8),
+        layernorm.TileDesc(1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2),
     ]
 
     operations = []
@@ -2056,6 +2405,12 @@ def GenerateTensorOp(manifest):
         library.TensorOperation.AddRelu,
         library.MemoryDataOperation.MemorySet,
     )
+    # Conv2dBiasAdd
+    CreateConv2dFwdOperator(
+        manifest,
+        library.Conv2dKind.GroupConv2dBiasRelu,
+        library.TensorOperation.AddAdd,
+    )
     # Conv2dBiasReluAdd
     CreateConv2dFwdOperator(
         manifest,
@@ -2101,8 +2456,10 @@ def GenerateTensorOp(manifest):
     CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddTanh)
     # GemmRCRBiasTanh
     CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddFastGelu)
-    # GemmRCRBiasSwish
+    # GemmRCRBiasHardswish
     CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddHardswish)
+    # GemmRCRBiasSwish
+    CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddSwish)
     # GemmRCRBiasSigmoid
     CreateGemmRCRBilinearOperator(manifest, library.TensorOperation.AddSigmoid)
     # GemmRCRBiasAdd
@@ -2127,6 +2484,12 @@ def GenerateTensorOp(manifest):
     CreateBmmRCROperator(manifest)
     # BmmRRR
     CreateBmmRRROperator(manifest)
+    # BmmRRRAdd
+    CreateBmmRRRBillinearOperator(manifest, library.TensorOperation.Add)
+    # BmmCRRAdd
+    CreateBmmCRRBillinearOperator(manifest, library.TensorOperation.Add)
+    # BmmCRRAdd
+    CreateBmmCCRBillinearOperator(manifest, library.TensorOperation.Add)
     # BmmCCR
     CreateBmmCCROperator(manifest)
     # BmmCRR
diff --git a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
index e61fa7ef9..969efc6ed 100644
--- a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
@@ -78,7 +78,7 @@ def accumulator_type(self):
     def emit(self) -> str:
         template = jinja2.Template(
             """
-using {{name}} = ck::tensor_operation::device::DeviceLayernormImpl<
+using {{name}} = ck::tensor_operation::device::DeviceNormalizationImpl<
     {{InDType}},
     {{InDType}},
     {{InDType}},
diff --git a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
index 6e28da94f..264cba714 100644
--- a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
@@ -78,7 +78,7 @@ def accumulator_type(self):
     def emit(self) -> str:
         template = jinja2.Template(
             """
-using {{name}} = ck::tensor_operation::device::DeviceLayernormImpl<
+using {{name}} = ck::tensor_operation::device::DeviceNormalizationImpl<
     {{InDType}},
     {{InDType}},
     {{InDType}},
@@ -94,7 +94,7 @@ def emit(self) -> str:
         return template.render(
             name=self.__str__(),
             InDType=library.DataTypeTag[self.In],
-            AccDType=library.DataTypeTag[library.DataType.f32],
+            AccDType=library.DataTypeTag[self.accumulator_type()],
             OutDType=library.DataTypeTag[self.Out],
             Rank=self.Rank,
             NumReduceDim=self.NumReduceDim,  # we only need softmax(dim=-1) at this moment
diff --git a/python/aitemplate/utils/mk_ck_lib/library.py b/python/aitemplate/utils/mk_ck_lib/library.py
index a3fdb1c00..4b6a357b9 100644
--- a/python/aitemplate/utils/mk_ck_lib/library.py
+++ b/python/aitemplate/utils/mk_ck_lib/library.py
@@ -201,6 +201,7 @@ class LayoutType(enum.Enum):
     LayoutType.GNWK: "GNWK",
 }
 
+
 #
 class OperationKind(enum.Enum):
     Gemm = auto()
@@ -282,6 +283,7 @@ class TensorOperation(enum.Enum):
     AddFastGelu = auto()
     AddTanh = auto()
     AddHardswish = auto()
+    AddSwish = auto()
     AddSigmoid = auto()
     AddReluAdd = auto()
     AddAddRelu = auto()
@@ -312,6 +314,7 @@ class TensorOperation(enum.Enum):
     TensorOperation.AddTanh: "ck::tensor_operation::element_wise::AddTanh",
     TensorOperation.AddSigmoid: "ck::tensor_operation::element_wise::AddSigmoid",
     TensorOperation.AddHardswish: "ck::tensor_operation::element_wise::AddHardswish",
+    TensorOperation.AddSwish: "ck::tensor_operation::element_wise::AddSwish",
     TensorOperation.AddReluAdd: "ck::tensor_operation::element_wise::AddReluAdd",
     TensorOperation.AddAddRelu: "ck::tensor_operation::element_wise::AddAddRelu",
     TensorOperation.AddHardswishAdd: "ck::tensor_operation::element_wise::AddHardswishAdd",
@@ -341,6 +344,7 @@ class TensorOperation(enum.Enum):
     TensorOperation.AddTanh: "AT",
     TensorOperation.AddSigmoid: "AS",
     TensorOperation.AddHardswish: "AH",
+    TensorOperation.AddSwish: "ASW",
     TensorOperation.AddReluAdd: "ARA",
     TensorOperation.AddAddRelu: "AAR",
     TensorOperation.AddHardswishAdd: "AHA",
diff --git a/python/aitemplate/utils/mk_ck_lib/manifest.py b/python/aitemplate/utils/mk_ck_lib/manifest.py
index 077ee9103..c572737d8 100644
--- a/python/aitemplate/utils/mk_ck_lib/manifest.py
+++ b/python/aitemplate/utils/mk_ck_lib/manifest.py
@@ -87,7 +87,6 @@ def get_kernel_filters(self, kernelListFile):
             return []
 
     def filter_out_kernels(self, kernel_name, kernel_filter_list):
-
         for kernel_filter_re in kernel_filter_list:
             if kernel_filter_re.search(kernel_name) is not None:
                 return True

From c9add3a1103af21a6d747765794a4fde2fcb553f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 19 Oct 2023 05:07:15 -0500
Subject: [PATCH 620/638] update dockerfile

---
 docker/Dockerfile.rocm | 68 ++----------------------------------------
 1 file changed, 2 insertions(+), 66 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index e6231a5e1..2e549de7e 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -13,62 +13,7 @@
 #  limitations under the License.
 #
 # ROCM Docker Image for AITemplate
-FROM ubuntu:20.04
-
-ARG ROCMVERSION=5.4.2
-
-RUN set -xe
-
-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
-# Add rocm repository
-RUN apt-get update
-RUN apt-get install -y wget gnupg
-RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
-RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
-RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
-
-# Install dependencies
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    apt-utils \
-    build-essential \
-    cmake-data \
-    cmake \
-    curl \
-    git \
-    hip-rocclr \
-    jq \
-    libelf-dev \
-    libncurses5-dev \
-    libnuma-dev \
-    libpthread-stubs0-dev \
-    llvm-amdgpu \
-    pkg-config \
-    python3 \
-    python3-dev \
-    python3-pip \
-    software-properties-common \
-    rocm-dev \
-    rocm-device-libs \
-    rocm-cmake \
-    rocm-libs \
-    vim \
-    zlib1g-dev \
-    openssh-server \
-    clang-format-10 \
-    kmod && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Setup ubsan environment to printstacktrace
-RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
-ENV UBSAN_OPTIONS=print_stacktrace=1
-
-# Install an init system
-RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
-RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
-
-ARG PREFIX=/opt/rocm
+FROM docker.io/rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
 
 # Setup ubsan environment to printstacktrace
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -77,11 +22,6 @@ ENV LANG=C.UTF-8
 ADD ./docker/install/rocm_dev-requirements.txt dev-requirements.txt
 RUN groupadd -f render
 
-# Install the new rocm-cmake version
-RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git  && \
-  cd rocm-cmake && mkdir build && cd build && \
-  cmake  .. && cmake --build . && cmake --build . --target install
-
 WORKDIR /
 
 ADD ./docker/install/ /Install
@@ -94,9 +34,6 @@ RUN bash /Install/install_test_dep.sh
 # for docs
 RUN bash /Install/install_doc_dep.sh
 
-# Install Pytorch
-RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
-
 # for detection
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
 RUN bash /Install/install_detection_deps.sh
@@ -111,5 +48,4 @@ ADD ./tests /AITemplate/tests
 ADD ./docs /AITemplate/docs
 ADD ./static /AITemplate/static
 ADD ./licenses /AITemplate/licenses
-ADD ./docker/install/install_ait.sh /AITemplate/
-RUN bash /AITemplate/install_ait.sh
+RUN cd /AITemplate/python && python setup.py install

From f4c3c92bd7b367b919a9516962307709eecbf4af Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 25 Oct 2023 01:28:16 +0800
Subject: [PATCH 621/638] fix a profiler bug

---
 docker/Dockerfile.rocm                        | 27 +++++++------------
 python/aitemplate/backend/profiler_runner.py  | 14 +++++-----
 .../aitemplate/backend/rocm/conv2d/common.py  |  2 +-
 3 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 2e549de7e..be7b47dfd 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -19,33 +19,24 @@ FROM docker.io/rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
 ENV UBSAN_OPTIONS=print_stacktrace=1
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
-ADD ./docker/install/rocm_dev-requirements.txt dev-requirements.txt
 RUN groupadd -f render
 
 WORKDIR /
 
-ADD ./docker/install/ /Install
+RUN git clone -b merge_upstream https://github.com/ROCmSoftwarePlatform/AITemplate.git
+
+WORKDIR /AITemplate
 # necessary package
-RUN bash /Install/install_basic_dep.sh
+RUN bash ./docker/install/install_basic_dep.sh
 
 # for test
-RUN bash /Install/install_test_dep.sh
+RUN bash ./docker/install/install_test_dep.sh
 
 # for docs
-RUN bash /Install/install_doc_dep.sh
+RUN bash ./docker/install/install_doc_dep.sh
 
 # for detection
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
-RUN bash /Install/install_detection_deps.sh
-
-# Copy AITemplate to Docker
-RUN mkdir /AITemplate
-ADD ./COMMIT_INFO /AITemplate/COMMIT_INFO
-ADD ./python /AITemplate/python
-ADD ./3rdparty /AITemplate/3rdparty
-ADD ./examples /AITemplate/examples
-ADD ./tests /AITemplate/tests
-ADD ./docs /AITemplate/docs
-ADD ./static /AITemplate/static
-ADD ./licenses /AITemplate/licenses
-RUN cd /AITemplate/python && python setup.py install
+RUN bash ./docker/install/install_detection_deps.sh
+
+RUN cd ./python && python setup.py install
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index 5e2c65f23..a364aa771 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -122,13 +122,6 @@ def process_task(task: Task) -> None:
 
     if len(stderr) > 0:
         # TODO: ugly fix, should remove when finish all profiler refactor
-        runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
-        if len(runtimes) > 0:
-            single_file_profiler = True
-        if not single_file_profiler:
-            task._failed = True
-            return
-
         _LOGGER.debug(
             "Failed: [{name}][{algo}]\ncmd:\n{cmd}\nstderr:\n{stderr}".format(
                 name=task._name,
@@ -137,6 +130,13 @@ def process_task(task: Task) -> None:
                 stderr=stderr,
             ),
         )
+        runtimes = PROF_RUNTIME_PATTERN.findall(stdout)
+        if len(runtimes) > 0:
+            single_file_profiler = True
+        if not single_file_profiler:
+            task._failed = True
+            return
+
     task._ret, task._failed = extract_profile_result(
         stdout=stdout,
         return_ops=task._kwargs.get("return_ops", None),
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 0d30e05c8..b71a20bce 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -284,7 +284,7 @@
   const int64_t stride = std::stoi(argv[8]);
   const int64_t pad = std::stoi(argv[9]);
   const int64_t dilation = std::stoi(argv[10]);
-  const int64_t group = std::stoi(argv[11]);
+  const int64_t group = std::stoi(argv[14]);
 """
 )
 

From fcd93022cf0b8da6275aa8f0122de91f7c2d3745 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 24 Nov 2023 14:55:51 +0800
Subject: [PATCH 622/638] fix dockerfile

---
 docker/Dockerfile.rocm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index be7b47dfd..548cb3b26 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -23,7 +23,7 @@ RUN groupadd -f render
 
 WORKDIR /
 
-RUN git clone -b merge_upstream https://github.com/ROCmSoftwarePlatform/AITemplate.git
+RUN git clone -b merge_upstream --recursive https://github.com/ROCmSoftwarePlatform/AITemplate.git
 
 WORKDIR /AITemplate
 # necessary package
@@ -39,4 +39,4 @@ RUN bash ./docker/install/install_doc_dep.sh
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
 RUN bash ./docker/install/install_detection_deps.sh
 
-RUN cd ./python && python setup.py install
+RUN bash ./docker/install/install_ait.sh

From 0f319a46dba82ef00f37f2536862b44ef7a1acd9 Mon Sep 17 00:00:00 2001
From: Junhao Zhang <howiejayzh@gmail.com>
Date: Tue, 12 Dec 2023 14:06:58 +0800
Subject: [PATCH 623/638] pin diffusers and transformers

---
 docker/install/install_detection_deps.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install/install_detection_deps.sh b/docker/install/install_detection_deps.sh
index 47238cd3c..08ad1820e 100644
--- a/docker/install/install_detection_deps.sh
+++ b/docker/install/install_detection_deps.sh
@@ -5,5 +5,5 @@ pip3 install yacs
 pip3 install opencv-python
 pip3 install tqdm
 pip3 install timm
-pip3 install transformers
-pip3 install diffusers
+pip3 install transformers==4.25.0
+pip3 install diffusers==0.11.1
\ No newline at end of file

From fb5a110b76fd65d4d6ed2676117957793781146a Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 22 Dec 2023 14:40:30 +0800
Subject: [PATCH 624/638] merge sdxl

---
 .circleci/config.yml                          |  10 +-
 3rdparty/composable_kernel                    |   2 +-
 docker/Dockerfile.rocm                        |   2 +-
 docker/install/install_detection_deps.sh      |   2 +-
 .../scripts/compile_sdxl.py                   | 213 +++++
 .../05_stable_diffusion/scripts/demo_xl.py    | 125 +++
 .../scripts/download_pipeline.py              |   2 +-
 .../src/compile_lib/compile_clip_alt.py       |  59 +-
 .../src/compile_lib/compile_unet_alt.py       | 331 +++++--
 .../src/compile_lib/compile_vae_alt.py        | 111 ++-
 .../src/compile_lib/util.py                   |   7 +
 .../05_stable_diffusion/src/inference_ait.py  | 224 +++++
 .../src/modeling/attention.py                 |   4 +-
 .../05_stable_diffusion/src/modeling/clip.py  | 160 +++-
 .../modeling/controlnet_unet_2d_condition.py  | 298 +-----
 .../src/modeling/embeddings.py                |  29 +-
 .../src/modeling/resnet.py                    |  55 +-
 .../src/modeling/unet_2d_condition.py         | 146 ++-
 .../src/modeling/unet_blocks.py               | 167 +++-
 .../05_stable_diffusion/src/modeling/vae.py   | 196 +++-
 .../src/pipeline_stable_diffusion_xl_ait.py   | 889 ++++++++++++++++++
 fx2ait/fx2ait/__init__.py                     |   3 +-
 fx2ait/fx2ait/acc_tracer/acc_ops.py           |  44 +
 fx2ait/fx2ait/ait_splitter.py                 |   5 -
 fx2ait/fx2ait/converters/ait_converters.py    |  62 +-
 fx2ait/fx2ait/example/benchmark_utils.py      |   2 -
 fx2ait/fx2ait/extension.py                    |  56 ++
 fx2ait/fx2ait/fx2ait.py                       |   6 +-
 fx2ait/fx2ait/lower/lower.py                  |   2 -
 fx2ait/fx2ait/lower/lower_settings.py         |   2 +
 .../fx2ait/test/converters/test_ait_conv2d.py |   6 +-
 .../test/converters/test_ait_linalg_norm.py   |  18 +
 .../test/converters/test_ait_unary_ops.py     |   1 +
 .../test/converters/test_ait_upsampling2d.py  |  42 +
 fx2ait/fx2ait/test/test_fx2ait.py             |  12 +-
 fx2ait/fx2ait/tools/common_aten2ait.py        |  25 +-
 fx2ait/fx2ait/tools/common_fx2ait.py          |  28 +-
 fx2ait/setup.py                               | 135 +--
 python/aitemplate/__init__.py                 |   3 +-
 python/aitemplate/backend/codegen.py          |  33 +-
 .../backend/common/elementwise_common.py      |   3 +-
 .../backend/common/tensor/slice_common.py     |  68 +-
 .../cuda/elementwise/fused_elementwise.py     |   4 +-
 .../backend/cuda/gemm_universal/__init__.py   |   1 +
 .../backend/cuda/gemm_universal/gemm_rrr.py   |  62 ++
 .../cuda/gemm_universal/gemm_rrr_bias.py      | 205 ++++
 .../group_layernorm_sigmoid_mul.py            |   6 +-
 .../layernorm_sigmoid_mul_kernel.cuh          |  11 +-
 .../aitemplate/backend/cuda/lib_template.py   |   7 +
 .../backend/cuda/reduce/reduce_3d.py          |   9 +-
 .../backend/cuda/reduce/reduce_common.py      |  14 +-
 python/aitemplate/backend/cuda/reduce/var.py  |  30 +-
 .../backend/cuda/softmax/softmax.py           |  57 +-
 python/aitemplate/backend/cuda/target_def.py  |  19 +-
 .../backend/cuda/tensor/__init__.py           |   2 +
 .../aitemplate/backend/cuda/tensor/argmax.py  |   6 +-
 .../backend/cuda/tensor/index_select.py       | 253 +++++
 .../backend/cuda/view_ops/view_ops.py         |  12 +-
 python/aitemplate/backend/main_templates.py   |   4 +
 python/aitemplate/backend/profiler_runner.py  |   2 +-
 .../aitemplate/backend/rocm/conv2d/common.py  |   4 +-
 .../rocm/conv2d/conv2d_bias_add_relu.py       |   2 +-
 .../rocm/conv2d/conv2d_bias_sigmoid.py        |   2 +-
 .../rocm/elementwise/fused_elementwise.py     |   4 +-
 python/aitemplate/backend/rocm/gemm/common.py |   2 +-
 .../aitemplate/backend/rocm/lib_template.py   |   7 +
 .../backend/rocm/normalization/groupnorm.py   |  29 +-
 .../backend/rocm/normalization/layernorm.py   |   9 +-
 .../backend/rocm/normalization/norm_common.py |   2 +-
 .../aitemplate/backend/rocm/pool2d/pool2d.py  |   2 +-
 python/aitemplate/backend/rocm/target_def.py  |  10 +-
 .../backend/rocm/tensor/__init__.py           |   1 +
 .../backend/rocm/tensor/index_select.py       | 246 +++++
 python/aitemplate/backend/target.py           |   5 +-
 python/aitemplate/compiler/compiler.py        |   2 +-
 python/aitemplate/compiler/ops/common/math.py |   4 +-
 .../compiler/ops/softmax/softmax.py           |  16 +-
 .../compiler/ops/tensor/__init__.py           |   1 +
 .../compiler/ops/tensor/index_select.py       |  88 ++
 .../aitemplate/compiler/transform/__init__.py |   1 -
 .../aitemplate/compiler/transform/fuse_ops.py |  49 +-
 .../compiler/transform/optimize_graph.py      |   8 +-
 .../aitemplate/compiler/transform/profile.py  |   4 +-
 .../compiler/transform/remove_id_ops.py       |  41 -
 .../compiler/transform/remove_no_ops.py       | 177 +++-
 .../transform/transform_merge_view_ops.py     | 101 ++
 .../transform/transform_permutations.py       |  38 +-
 .../transform/transform_special_ops.py        |  15 +-
 python/aitemplate/testing/test_utils.py       |  13 +-
 python/aitemplate/utils/environ.py            |   1 -
 .../utils/mk_ck_lib/conv2d_operation.py       |   2 +-
 .../aitemplate/utils/mk_ck_lib/generator.py   |  62 +-
 .../utils/mk_ck_lib/groupnorm_operation.py    |   6 +-
 .../utils/mk_ck_lib/layernorm_operation.py    |   6 +-
 python/aitemplate/utils/shape_utils.py        |  26 +
 static/csrc/debug_utility.cpp                 |  19 -
 .../include}/custom_math.cuh                  |   0
 .../include}/custom_math.h                    |  11 +
 static/include/debug_utility.h                |  24 +-
 static/include/rocm_device_functions.h        |   6 +-
 test.py                                       |  18 +
 .../compiler/test_eliminate_permutations.py   | 121 ++-
 .../unittest/compiler/test_merge_view_ops.py  | 414 ++++++++
 .../compiler/test_remove_no_op_concats.py     | 147 +++
 .../test_remove_no_op_dynamic_slices.py       | 153 +++
 .../compiler/test_remove_no_op_splits.py      | 168 ++++
 .../compiler/test_slice_permute021_fusion.py  |   8 +-
 .../compiler/test_strided_group_layernorm.py  | 144 ++-
 tests/unittest/ops/test_gemm_bias.py          |  67 ++
 tests/unittest/ops/test_index_select.py       | 287 ++++++
 tests/unittest/ops/test_reduce.py             |  38 +-
 tests/unittest/ops/test_slice.py              |  11 +
 tests/unittest/ops/test_softmax.py            | 106 ++-
 tests/unittest/util/test_debug_utils.py       |  77 +-
 114 files changed, 6140 insertions(+), 957 deletions(-)
 create mode 100644 examples/05_stable_diffusion/scripts/compile_sdxl.py
 create mode 100644 examples/05_stable_diffusion/scripts/demo_xl.py
 create mode 100644 examples/05_stable_diffusion/src/inference_ait.py
 create mode 100644 examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
 create mode 100644 fx2ait/fx2ait/extension.py
 create mode 100644 fx2ait/fx2ait/test/converters/test_ait_upsampling2d.py
 create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_bias.py
 create mode 100644 python/aitemplate/backend/cuda/tensor/index_select.py
 create mode 100644 python/aitemplate/backend/rocm/tensor/index_select.py
 create mode 100644 python/aitemplate/compiler/ops/tensor/index_select.py
 delete mode 100644 python/aitemplate/compiler/transform/remove_id_ops.py
 create mode 100644 python/aitemplate/compiler/transform/transform_merge_view_ops.py
 rename {python/aitemplate/backend/cuda/elementwise => static/include}/custom_math.cuh (100%)
 rename {python/aitemplate/backend/rocm/elementwise => static/include}/custom_math.h (97%)
 create mode 100644 test.py
 create mode 100644 tests/unittest/compiler/test_merge_view_ops.py
 create mode 100644 tests/unittest/compiler/test_remove_no_op_concats.py
 create mode 100644 tests/unittest/compiler/test_remove_no_op_dynamic_slices.py
 create mode 100644 tests/unittest/compiler/test_remove_no_op_splits.py
 create mode 100644 tests/unittest/ops/test_index_select.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0d04d5ad5..0193fc253 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -19,6 +19,7 @@ setup_env: &setup_env
       name: Setup environment
       command: |
         for i in {1..3}; do
+          sudo update-alternatives --set cuda /usr/local/cuda-11.4
           echo 'export PATH=/usr/local/cuda/bin:$PATH' >> $BASH_ENV &&
           source "$BASH_ENV"
           python3.8 --version &&
@@ -57,8 +58,9 @@ setup_fx2ait_env: &setup_fx2ait_env
           sudo cp -P cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64
           sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
           python3.8 -m pip install --ignore-installed --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
-          python3.8 fx2ait/setup.py install --prefix=/home/circleci/
-          echo 'export PYTHONPATH=$PWD/fx2ait:$PYTHONPATH' >> $BASH_ENV
+          pushd fx2ait
+          python3.8 setup.py develop --user
+          popd
           break || sleep 5;
         done
 
@@ -86,7 +88,7 @@ fx2ait_tests: &fx2ait_tests
 jobs:
   fx2ait-test:
     machine:
-      image: ubuntu-2004-cuda-11.4:202110-01
+      image: linux-cuda-11:default
       resource_class: gpu.nvidia.medium
     steps:
       - checkout
@@ -98,7 +100,7 @@ jobs:
 
   build-and-test:
     machine:
-      image: ubuntu-2004-cuda-11.4:202110-01
+      image: linux-cuda-11:default
       # Check T101565170 for multi-gpu use cases.
       resource_class: gpu.nvidia.medium
     parallelism: 10
diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel
index db49fc437..78eb3f0b4 160000
--- a/3rdparty/composable_kernel
+++ b/3rdparty/composable_kernel
@@ -1 +1 @@
-Subproject commit db49fc43797f80be1db2399dcd1a082dbf447736
+Subproject commit 78eb3f0b46aafc52c6d19a07b9dc5bd19b8e7807
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 548cb3b26..f7ca24bc7 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -13,7 +13,7 @@
 #  limitations under the License.
 #
 # ROCM Docker Image for AITemplate
-FROM docker.io/rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
+FROM docker.io/rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1
 
 # Setup ubsan environment to printstacktrace
 ENV UBSAN_OPTIONS=print_stacktrace=1
diff --git a/docker/install/install_detection_deps.sh b/docker/install/install_detection_deps.sh
index 08ad1820e..e8b91f9d5 100644
--- a/docker/install/install_detection_deps.sh
+++ b/docker/install/install_detection_deps.sh
@@ -6,4 +6,4 @@ pip3 install opencv-python
 pip3 install tqdm
 pip3 install timm
 pip3 install transformers==4.25.0
-pip3 install diffusers==0.11.1
\ No newline at end of file
+pip3 install diffusers==0.24.0
\ No newline at end of file
diff --git a/examples/05_stable_diffusion/scripts/compile_sdxl.py b/examples/05_stable_diffusion/scripts/compile_sdxl.py
new file mode 100644
index 000000000..19caf57fe
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/compile_sdxl.py
@@ -0,0 +1,213 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import sys
+
+sys.setrecursionlimit(10000)
+
+import click
+import torch
+from aitemplate.testing import detect_target
+from aitemplate.utils.import_path import import_parent
+from diffusers import AutoencoderKL, StableDiffusionXLPipeline
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.compile_lib.compile_clip_alt import compile_clip
+from src.compile_lib.compile_unet_alt import compile_timestep_embedder, compile_unet
+from src.compile_lib.compile_vae_alt import compile_vae
+
+
+@click.command()
+@click.option(
+    "--hf-hub-or-path",
+    default="stabilityai/stable-diffusion-xl-base-1.0",
+    help="the local or hf hub path e.g. stabilityai/stable-diffusion-xl-base-1.0",
+)
+@click.option(
+    "--width",
+    default=(1024, 1024),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum width",
+)
+@click.option(
+    "--height",
+    default=(1024, 1024),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum height",
+)
+@click.option(
+    "--batch-size",
+    default=(1, 1),
+    type=(int, int),
+    nargs=2,
+    help="Minimum and maximum batch size",
+)
+@click.option("--clip-chunks", default=10, help="Maximum number of clip chunks")
+@click.option(
+    "--include-constants",
+    default=False,
+    type=bool,
+    help="include constants (model weights) with compiled model",
+)
+@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
+@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
+@click.option("--work-dir", default="./tmp", help="work directory")
+@click.option(
+    "--model-name-prefix", default="SDXL", help="Prefix for compiled module names"
+)
+@click.option(
+    "--fp32-vae",
+    default=False,
+    help="fp32 vae, if false, use https://huggingface.co/madebyollin/sdxl-vae-fp16-fix as replacement vae",
+)
+def compile_diffusers(
+    hf_hub_or_path,
+    width,
+    height,
+    batch_size,
+    clip_chunks,
+    include_constants,
+    use_fp16_acc=True,
+    convert_conv_to_gemm=True,
+    work_dir="./tmp",
+    model_name_prefix="SDXL",
+    fp32_vae=False,
+):
+    logging.getLogger().setLevel(logging.INFO)
+    torch.manual_seed(4896)
+
+    if detect_target().name() == "rocm":
+        convert_conv_to_gemm = False
+
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        hf_hub_or_path,
+        torch_dtype=torch.float16,
+    ).to("cuda")
+    if fp32_vae:
+        pipe.vae.to("cuda", dtype=torch.float32)
+    else:
+        vae = AutoencoderKL.from_pretrained(
+            "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+        ).to("cuda")
+        pipe.vae = vae
+
+    # text_encoder
+    model_name = f"{model_name_prefix}_text_encoder"
+    compile_clip(
+        pipe.text_encoder,
+        batch_size=batch_size,
+        seqlen=pipe.text_encoder.config.max_position_embeddings,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        output_hidden_states=True,
+        text_projection_dim=None,
+        depth=pipe.text_encoder.config.num_hidden_layers,
+        num_heads=pipe.text_encoder.config.num_attention_heads,
+        dim=pipe.text_encoder.config.hidden_size,
+        act_layer=pipe.text_encoder.config.hidden_act,
+        constants=include_constants,
+        model_name=model_name,
+        work_dir=work_dir,
+    )
+    # text_encoder 2
+    model_name = f"{model_name_prefix}_text_encoder_2"
+    compile_clip(
+        pipe.text_encoder,
+        batch_size=batch_size,
+        seqlen=pipe.text_encoder.config.max_position_embeddings,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        output_hidden_states=True,
+        text_projection_dim=pipe.text_encoder.config.projection_dim,
+        depth=pipe.text_encoder.config.num_hidden_layers,
+        num_heads=pipe.text_encoder.config.num_attention_heads,
+        dim=pipe.text_encoder.config.hidden_size,
+        act_layer=pipe.text_encoder.config.hidden_act,
+        constants=include_constants,
+        model_name=model_name,
+        work_dir=work_dir,
+    )
+    model_name = f"{model_name_prefix}_unet"
+    # UNet
+    compile_unet(
+        pipe.unet,
+        batch_size=batch_size,
+        width=width,
+        height=height,
+        clip_chunks=clip_chunks,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        hidden_dim=pipe.unet.config.cross_attention_dim,
+        attention_head_dim=pipe.unet.config.attention_head_dim,
+        use_linear_projection=pipe.unet.config.get("use_linear_projection", False),
+        block_out_channels=pipe.unet.config.block_out_channels,
+        down_block_types=pipe.unet.config.down_block_types,
+        up_block_types=pipe.unet.config.up_block_types,
+        in_channels=pipe.unet.config.in_channels,
+        out_channels=pipe.unet.config.out_channels,
+        class_embed_type=pipe.unet.config.class_embed_type,
+        num_class_embeds=pipe.unet.config.num_class_embeds,
+        only_cross_attention=pipe.unet.config.only_cross_attention,
+        sample_size=pipe.unet.config.sample_size,
+        dim=pipe.unet.config.block_out_channels[0],
+        time_embedding_dim=pipe.unet.config.time_embedding_dim,
+        conv_in_kernel=pipe.unet.config.conv_in_kernel,
+        projection_class_embeddings_input_dim=pipe.unet.config.projection_class_embeddings_input_dim,
+        addition_embed_type=pipe.unet.config.addition_embed_type,
+        transformer_layers_per_block=pipe.unet.config.transformer_layers_per_block,
+        constants=False
+        if sys.platform == "win32"
+        else include_constants,  # Too big, RC : fatal error RW1023: I/O error seeking in file
+        model_name=model_name,
+        work_dir=work_dir,
+    )
+    # `add_time_proj` Timesteps
+    model_name = f"{model_name_prefix}_addition_time_embed"
+    compile_timestep_embedder(
+        pipe.unet.config.addition_time_embed_dim,
+        work_dir=work_dir,
+        model_name=model_name,
+    )
+    model_name = f"{model_name_prefix}_vae"
+    # VAE
+    compile_vae(
+        pipe.vae,
+        batch_size=batch_size,
+        width=width,
+        height=height,
+        use_fp16_acc=use_fp16_acc,
+        convert_conv_to_gemm=convert_conv_to_gemm,
+        constants=include_constants,
+        block_out_channels=pipe.vae.config.block_out_channels,
+        layers_per_block=pipe.vae.config.layers_per_block,
+        act_fn=pipe.vae.config.act_fn,
+        latent_channels=pipe.vae.config.latent_channels,
+        in_channels=pipe.vae.config.in_channels,
+        out_channels=pipe.vae.config.out_channels,
+        down_block_types=pipe.vae.config.down_block_types,
+        up_block_types=pipe.vae.config.up_block_types,
+        sample_size=pipe.vae.config.sample_size,
+        model_name=model_name,
+        work_dir=work_dir,
+        dtype="float32" if fp32_vae else "float16",
+    )
+
+
+if __name__ == "__main__":
+    compile_diffusers()
diff --git a/examples/05_stable_diffusion/scripts/demo_xl.py b/examples/05_stable_diffusion/scripts/demo_xl.py
new file mode 100644
index 000000000..6e5ccd120
--- /dev/null
+++ b/examples/05_stable_diffusion/scripts/demo_xl.py
@@ -0,0 +1,125 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import click
+import torch
+
+from aitemplate.utils.import_path import import_parent
+from diffusers import AutoencoderKL, StableDiffusionXLPipeline
+
+if __name__ == "__main__":
+    import_parent(filepath=__file__, level=1)
+
+from src.pipeline_stable_diffusion_xl_ait import StableDiffusionXLAITPipeline
+
+
+@click.command()
+@click.option(
+    "--hf-hub-or-path",
+    default="stabilityai/stable-diffusion-xl-base-1.0",
+    help="huggingface hub name or path to local model",
+)
+@click.option(
+    "--apply-weights",
+    default=True,
+    help="apply weights to module, required for Windows",
+)
+@click.option(
+    "--unet-module",
+    help="path to unet module",
+    required=True,
+)
+@click.option(
+    "--text-encoder-module",
+    help="path to text encoder module",
+    required=True,
+)
+@click.option(
+    "--text-encoder-2-module",
+    help="path to text encoder 2 module",
+    required=True,
+)
+@click.option(
+    "--time-embed-module",
+    help="path to time embed module",
+    required=True,
+)
+@click.option(
+    "--vae-module",
+    help="path to vae module",
+    required=True,
+)
+@click.option("--width", default=1024, help="Width of generated image")
+@click.option("--height", default=1024, help="Height of generated image")
+@click.option("--batch", default=1, help="Batch size of generated image")
+@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
+@click.option("--negative_prompt", default="", help="prompt")
+@click.option(
+    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
+)
+def run(
+    hf_hub_or_path,
+    apply_weights,
+    unet_module,
+    text_encoder_module,
+    text_encoder_2_module,
+    time_embed_module,
+    vae_module,
+    width,
+    height,
+    batch,
+    prompt,
+    negative_prompt,
+    benchmark,
+):
+    diffusers_pipe = StableDiffusionXLPipeline.from_pretrained(
+        hf_hub_or_path,
+        use_safetensors=True,
+        torch_dtype=torch.float16,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        "madebyollin/sdxl-vae-fp16-fix",
+        use_safetensors=True,
+        torch_dtype=torch.float16,
+    )
+    pipe = StableDiffusionXLAITPipeline(
+        vae,
+        diffusers_pipe.text_encoder,
+        diffusers_pipe.text_encoder_2,
+        diffusers_pipe.tokenizer,
+        diffusers_pipe.tokenizer_2,
+        diffusers_pipe.unet,
+        diffusers_pipe.scheduler,
+        text_encoder_module,
+        text_encoder_2_module,
+        unet_module,
+        vae_module,
+        time_embed_module,
+        apply_weights_to_modules=apply_weights,
+    )
+
+    prompt = [prompt] * batch
+    images = pipe(
+        prompt=prompt,
+        prompt_2=prompt,
+        height=height,
+        width=width,
+    ).images
+    for i, image in enumerate(images):
+        image.save(f"example_ait_{i}.png")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index ef1df547d..22c5ceee5 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -39,7 +39,7 @@ def download_pipeline_files(model_name, token, save_directory) -> None:
 
     StableDiffusionPipeline.from_pretrained(
         model_name,
-        revision="fp16",
+        # revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token if len(token) > 5 else token.lower() == "true",
     ).save_pretrained(save_directory)
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
index b4991e98d..4e62eec79 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
@@ -12,19 +12,21 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import sys
 
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
-from .util import mark_output
+from .util import torch_dtype_from_str
 
 
-def map_clip_params(pt_mod, batch_size=1, seqlen=77, depth=12):
-    params_ait = {}
+def map_clip(pt_mod, device="cuda", dtype="float16"):
     pt_params = dict(pt_mod.named_parameters())
+    params_ait = {}
     for key, arr in pt_params.items():
+        arr = arr.to(device, dtype=torch_dtype_from_str(dtype))
         name = key.replace("text_model.", "")
         ait_name = name.replace(".", "_")
         if name.endswith("out_proj.weight"):
@@ -38,7 +40,6 @@ def map_clip_params(pt_mod, batch_size=1, seqlen=77, depth=12):
         elif "v_proj" in name:
             ait_name = ait_name.replace("v_proj", "proj_v")
         params_ait[ait_name] = arr
-
     return params_ait
 
 
@@ -49,14 +50,34 @@ def compile_clip(
     dim=768,
     num_heads=12,
     depth=12,
-    use_fp16_acc=False,
-    convert_conv_to_gemm=False,
+    output_hidden_states=False,
+    text_projection_dim=None,
+    use_fp16_acc=True,
+    convert_conv_to_gemm=True,
     act_layer="gelu",
     constants=True,
+    model_name="CLIPTextModel",
+    work_dir="./tmp",
 ):
     mask_seq = 0
     causal = True
 
+    pt_mod = pt_mod.eval()
+    params_ait = map_clip(pt_mod)
+
+    static_shape = batch_size[0] == batch_size[1]
+    if static_shape:
+        batch_size = batch_size[0]
+    else:
+        batch_size = IntVar(values=list(batch_size), name="batch_size")
+
+    input_ids_ait = Tensor(
+        [batch_size, seqlen], name="input_ids", dtype="int64", is_input=True
+    )
+    position_ids_ait = Tensor(
+        [batch_size, seqlen], name="position_ids", dtype="int64", is_input=True
+    )
+
     ait_mod = ait_CLIPTextTransformer(
         num_hidden_layers=depth,
         hidden_size=dim,
@@ -66,25 +87,25 @@ def compile_clip(
         causal=causal,
         mask_seq=mask_seq,
         act_layer=act_layer,
+        output_hidden_states=output_hidden_states,
+        text_projection_dim=text_projection_dim,
     )
     ait_mod.name_parameter_tensor()
-
-    pt_mod = pt_mod.eval()
-    params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth)
-    batch_size = IntVar(values=list(batch_size), name="batch_size")
-
-    input_ids_ait = Tensor(
-        [batch_size, seqlen], name="input0", dtype="int64", is_input=True
-    )
-    position_ids_ait = Tensor(
-        [batch_size, seqlen], name="input1", dtype="int64", is_input=True
-    )
+    
     Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait)
-    mark_output(Y)
+    for out in Y:
+        shape = [d._attrs["values"] for d in out._attrs["shape"]]
+        print(f'AIT {out._attrs["name"]} shape: {shape}')
 
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
+    dll_name = model_name + ".dll" if sys.platform == "win32" else model_name + ".so"
     compile_model(
-        Y, target, "./tmp", "CLIPTextModel", constants=params_ait if constants else None
+        Y,
+        target,
+        work_dir,
+        model_name,
+        constants=params_ait if constants else None,
+        dll_name=dll_name,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
index 365710054..6b319d0bc 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_unet_alt.py
@@ -12,24 +12,36 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import sys
+
 import torch
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
-from ..modeling.controlnet_unet_2d_condition import (
-    ControlNetUNet2DConditionModel as ait_ControlNetUNet2DConditionModel,
-)
+from ..modeling.embeddings import Timesteps
 from ..modeling.unet_2d_condition import (
     UNet2DConditionModel as ait_UNet2DConditionModel,
 )
-from .util import mark_output
+from .util import torch_dtype_from_str
 
 
-def map_unet_params(pt_mod, dim):
-    pt_params = dict(pt_mod.named_parameters())
+def map_unet(
+    pt_mod, in_channels=None, conv_in_key=None, dim=320, device="cuda", dtype="float16"
+):
+    if in_channels is not None and conv_in_key is None:
+        raise ValueError(
+            "conv_in_key must be specified if in_channels is not None for padding"
+        )
+    if not isinstance(pt_mod, dict):
+        pt_params = dict(pt_mod.named_parameters())
+    else:
+        pt_params = pt_mod
     params_ait = {}
     for key, arr in pt_params.items():
+        if key.startswith("model.diffusion_model."):
+            key = key.replace("model.diffusion_model.", "")
+        arr = arr.to(device, dtype=torch_dtype_from_str(dtype))
         if len(arr.shape) == 4:
             arr = arr.permute((0, 2, 3, 1)).contiguous()
         elif key.endswith("ff.net.0.proj.weight"):
@@ -44,173 +56,322 @@ def map_unet_params(pt_mod, dim):
             continue
         params_ait[key.replace(".", "_")] = arr
 
-    params_ait["arange"] = (
-        torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half()
+    if conv_in_key is not None:
+        if in_channels % 4 != 0:
+            pad_by = 4 - (in_channels % 4)
+            params_ait[conv_in_key] = torch.functional.F.pad(
+                params_ait[conv_in_key], (0, pad_by)
+            )
+
+    params_ait["arange"] = torch.arange(start=0, end=dim // 2, dtype=torch.float32).to(
+        device, dtype=torch_dtype_from_str(dtype)
     )
     return params_ait
 
 
+def compile_timestep_embedder(
+    dim=256,
+    flip_sin_to_cos=True,
+    downscale_freq_shift=0,
+    work_dir="./tmp",
+    model_name="Timesteps",
+):
+    timesteps = Timesteps(
+        dim, flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=downscale_freq_shift
+    )
+
+    timestep = Tensor([1], name="timestep", is_input=True)
+
+    Y = timesteps(timestep)
+    Y._attrs["is_output"] = True
+    Y._attrs["name"] = "time_embed"
+    shape = [d._attrs["values"] for d in Y._attrs["shape"]]
+    print(f'AIT {Y._attrs["name"]} shape: {shape}')
+    constants = {"arange": torch.arange(start=0, end=dim // 2, dtype=torch.float16)}
+
+    target = detect_target(use_fp16_acc=True, convert_conv_to_gemm=True)
+    dll_name = model_name + ".dll" if sys.platform == "win32" else model_name + ".so"
+    compile_model(Y, target, work_dir, model_name, constants=constants, dll_name=dll_name)
+
+
 def compile_unet(
     pt_mod,
     batch_size=(1, 8),
     height=(64, 2048),
     width=(64, 2048),
     clip_chunks=1,
+    work_dir="./tmp",
     dim=320,
     hidden_dim=1024,
     use_fp16_acc=False,
     convert_conv_to_gemm=False,
-    controlnet=True,
+    controlnet=False,
     attention_head_dim=[5, 10, 20, 20],  # noqa: B006
     model_name="UNet2DConditionModel",
     use_linear_projection=False,
     constants=True,
+    block_out_channels=(320, 640, 1280, 1280),
+    down_block_types=(
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "DownBlock2D",
+    ),
+    up_block_types=(
+        "UpBlock2D",
+        "CrossAttnUpBlock2D",
+        "CrossAttnUpBlock2D",
+        "CrossAttnUpBlock2D",
+    ),
+    in_channels=4,
+    out_channels=4,
+    sample_size=64,
+    class_embed_type=None,
+    num_class_embeds=None,
+    only_cross_attention=[True, True, True, False],
+    down_factor=8,
+    time_embedding_dim=None,
+    conv_in_kernel: int = 3,
+    projection_class_embeddings_input_dim=None,
+    addition_embed_type=None,
+    transformer_layers_per_block=[1, 1, 1, 1],
+    dtype="float16",
 ):
-    if controlnet:
-        ait_mod = ait_ControlNetUNet2DConditionModel(
-            sample_size=64,
-            cross_attention_dim=hidden_dim,
-            attention_head_dim=attention_head_dim,
-            use_linear_projection=use_linear_projection,
-        )
-    else:
-        ait_mod = ait_UNet2DConditionModel(
-            sample_size=64,
-            cross_attention_dim=hidden_dim,
-            attention_head_dim=attention_head_dim,
-            use_linear_projection=use_linear_projection,
+    xl = False
+    if projection_class_embeddings_input_dim is not None:
+        xl = True
+    if isinstance(only_cross_attention, bool):
+        only_cross_attention = [only_cross_attention] * len(block_out_channels)
+    if isinstance(transformer_layers_per_block, int):
+        transformer_layers_per_block = [transformer_layers_per_block] * len(
+            down_block_types
         )
+    if isinstance(attention_head_dim, int):
+        attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+    ait_mod = ait_UNet2DConditionModel(
+        sample_size=sample_size,
+        cross_attention_dim=hidden_dim,
+        attention_head_dim=attention_head_dim,
+        use_linear_projection=use_linear_projection,
+        up_block_types=up_block_types,
+        down_block_types=down_block_types,
+        block_out_channels=block_out_channels,
+        in_channels=in_channels,
+        out_channels=out_channels,
+        class_embed_type=class_embed_type,
+        num_class_embeds=num_class_embeds,
+        only_cross_attention=only_cross_attention,
+        time_embedding_dim=time_embedding_dim,
+        conv_in_kernel=conv_in_kernel,
+        projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
+        addition_embed_type=addition_embed_type,
+        transformer_layers_per_block=transformer_layers_per_block,
+        dtype=dtype,
+    )
     ait_mod.name_parameter_tensor()
 
     # set AIT parameters
     pt_mod = pt_mod.eval()
-    params_ait = map_unet_params(pt_mod, dim)
-    if controlnet:
-        # static sizes only for now
-        batch_size = batch_size[0] * 2  # double batch size for unet
-        height = height[0] // 8
-        width = width[0] // 8
+    params_ait = map_unet(
+        pt_mod,
+        dim=dim,
+        in_channels=in_channels,
+        conv_in_key="conv_in_weight",
+        dtype=dtype,
+    )
+
+    static_shape = width[0] == width[1] and height[0] == height[1]
+
+    if static_shape:
+        height = height[0] // down_factor
+        width = width[0] // down_factor
         height_d = height
         width_d = width
+        height_1_d = height
+        width_1_d = width
+        height_2 = height // 2
+        width_2 = width // 2
+        height_4 = height // 4
+        width_4 = width // 4
+        height_8 = height // 8
+        width_8 = width // 8
+        height_2_d = height_2
+        width_2_d = width_2
+        height_4_d = height_4
+        width_4_d = width_4
+        height_8_d = height_8
+        width_8_d = width_8
+    else:
+        height = [x // down_factor for x in height]
+        width = [x // down_factor for x in width]
+        height_d = IntVar(values=list(height), name="height_d")
+        width_d = IntVar(values=list(width), name="width_d")
+        height_1_d = IntVar(values=list(height), name="height_1_d")
+        width_1_d = IntVar(values=list(width), name="width_1_d")
+        height_2 = [x // 2 for x in height]
+        width_2 = [x // 2 for x in width]
+        height_4 = [x // 4 for x in height]
+        width_4 = [x // 4 for x in width]
+        height_8 = [x // 8 for x in height]
+        width_8 = [x // 8 for x in width]
+        height_2_d = IntVar(values=list(height_2), name="height_2_d")
+        width_2_d = IntVar(values=list(width_2), name="width_2_d")
+        height_4_d = IntVar(values=list(height_4), name="height_4_d")
+        width_4_d = IntVar(values=list(width_4), name="width_4_d")
+        height_8_d = IntVar(values=list(height_8), name="height_8_d")
+        width_8_d = IntVar(values=list(width_8), name="width_8_d")
+
+    batch_size = batch_size[0], batch_size[1] * 2  # double batch size for unet
+    batch_size = IntVar(values=list(batch_size), name="batch_size") if detect_target().name() == "cuda" else 2
+
+    if static_shape:
+        embedding_size = 77
     else:
-        batch_size = (batch_size[0], batch_size[1] * 2)  # double batch size for unet
-        batch_size = IntVar(values=list(batch_size), name="batch_size")
-        height = height[0] // 8, height[1] // 8
-        width = width[0] // 8, width[1] // 8
-        height_d = IntVar(values=list(height), name="height")
-        width_d = IntVar(values=list(width), name="width")
-    clip_chunks = 77, 77 * clip_chunks
-    embedding_size = IntVar(values=list(clip_chunks), name="embedding_size")
+        clip_chunks = 77, 77 * clip_chunks
+        embedding_size = IntVar(values=list(clip_chunks), name="embedding_size")
 
     latent_model_input_ait = Tensor(
-        [batch_size, height_d, width_d, 4], name="input0", is_input=True
+        [batch_size, height_d, width_d, in_channels],
+        name="latent_model_input",
+        is_input=True,
+        dtype=dtype,
     )
-    timesteps_ait = Tensor([batch_size], name="input1", is_input=True)
+    timesteps_ait = Tensor([batch_size], name="timesteps", is_input=True, dtype=dtype)
     text_embeddings_pt_ait = Tensor(
-        [batch_size, embedding_size, hidden_dim], name="input2", is_input=True
+        [batch_size, embedding_size, hidden_dim],
+        name="encoder_hidden_states",
+        is_input=True,
+        dtype=dtype,
     )
+
+    class_labels = None
+    # TODO: better way to handle this, enables class_labels for x4-upscaler
+    if in_channels == 7:
+        class_labels = Tensor(
+            [batch_size], name="class_labels", dtype="int64", is_input=True
+        )
+
+    add_embeds = None
+    if xl:
+        add_embeds = Tensor(
+            [batch_size, projection_class_embeddings_input_dim],
+            name="add_embeds",
+            is_input=True,
+            dtype=dtype,
+        )
+
+    down_block_residual_0 = None
+    down_block_residual_1 = None
+    down_block_residual_2 = None
+    down_block_residual_3 = None
+    down_block_residual_4 = None
+    down_block_residual_5 = None
+    down_block_residual_6 = None
+    down_block_residual_7 = None
+    down_block_residual_8 = None
+    down_block_residual_9 = None
+    down_block_residual_10 = None
+    down_block_residual_11 = None
+    mid_block_residual = None
     if controlnet:
         down_block_residual_0 = Tensor(
-            [batch_size, height, width, 320],
+            [batch_size, height_1_d, width_1_d, block_out_channels[0]],
             name="down_block_residual_0",
             is_input=True,
         )
         down_block_residual_1 = Tensor(
-            [batch_size, height, width, 320],
+            [batch_size, height_1_d, width_1_d, block_out_channels[0]],
             name="down_block_residual_1",
             is_input=True,
         )
         down_block_residual_2 = Tensor(
-            [batch_size, height, width, 320],
+            [batch_size, height_1_d, width_1_d, block_out_channels[0]],
             name="down_block_residual_2",
             is_input=True,
         )
         down_block_residual_3 = Tensor(
-            [batch_size, height // 2, width // 2, 320],
+            [batch_size, height_2_d, width_2_d, block_out_channels[0]],
             name="down_block_residual_3",
             is_input=True,
         )
         down_block_residual_4 = Tensor(
-            [batch_size, height // 2, width // 2, 640],
+            [batch_size, height_2_d, width_2_d, block_out_channels[1]],
             name="down_block_residual_4",
             is_input=True,
         )
         down_block_residual_5 = Tensor(
-            [batch_size, height // 2, width // 2, 640],
+            [batch_size, height_2_d, width_2_d, block_out_channels[1]],
             name="down_block_residual_5",
             is_input=True,
         )
         down_block_residual_6 = Tensor(
-            [batch_size, height // 4, width // 4, 640],
+            [batch_size, height_4_d, width_4_d, block_out_channels[1]],
             name="down_block_residual_6",
             is_input=True,
         )
         down_block_residual_7 = Tensor(
-            [batch_size, height // 4, width // 4, 1280],
+            [batch_size, height_4_d, width_4_d, block_out_channels[2]],
             name="down_block_residual_7",
             is_input=True,
         )
         down_block_residual_8 = Tensor(
-            [batch_size, height // 4, width // 4, 1280],
+            [batch_size, height_4_d, width_4_d, block_out_channels[2]],
             name="down_block_residual_8",
             is_input=True,
         )
         down_block_residual_9 = Tensor(
-            [batch_size, height // 8, width // 8, 1280],
+            [batch_size, height_8_d, width_8_d, block_out_channels[2]],
             name="down_block_residual_9",
             is_input=True,
         )
         down_block_residual_10 = Tensor(
-            [batch_size, height // 8, width // 8, 1280],
+            [batch_size, height_8_d, width_8_d, block_out_channels[3]],
             name="down_block_residual_10",
             is_input=True,
         )
         down_block_residual_11 = Tensor(
-            [batch_size, height // 8, width // 8, 1280],
+            [batch_size, height_8_d, width_8_d, block_out_channels[3]],
             name="down_block_residual_11",
             is_input=True,
         )
         mid_block_residual = Tensor(
-            [batch_size, height // 8, width // 8, 1280],
+            [batch_size, height_8_d, width_8_d, block_out_channels[3]],
             name="mid_block_residual",
             is_input=True,
         )
-    else:
-        mid_block_additional_residual = None
-        down_block_additional_residuals = None
-
-    if controlnet:
-        Y = ait_mod(
-            latent_model_input_ait,
-            timesteps_ait,
-            text_embeddings_pt_ait,
-            down_block_residual_0,
-            down_block_residual_1,
-            down_block_residual_2,
-            down_block_residual_3,
-            down_block_residual_4,
-            down_block_residual_5,
-            down_block_residual_6,
-            down_block_residual_7,
-            down_block_residual_8,
-            down_block_residual_9,
-            down_block_residual_10,
-            down_block_residual_11,
-            mid_block_residual,
-        )
-    else:
-        Y = ait_mod(
-            latent_model_input_ait,
-            timesteps_ait,
-            text_embeddings_pt_ait,
-            mid_block_additional_residual,
-            down_block_additional_residuals,
-        )
-    mark_output(Y)
 
+    Y = ait_mod(
+        sample=latent_model_input_ait,
+        timesteps=timesteps_ait,
+        encoder_hidden_states=text_embeddings_pt_ait,
+        down_block_residual_0=down_block_residual_0,
+        down_block_residual_1=down_block_residual_1,
+        down_block_residual_2=down_block_residual_2,
+        down_block_residual_3=down_block_residual_3,
+        down_block_residual_4=down_block_residual_4,
+        down_block_residual_5=down_block_residual_5,
+        down_block_residual_6=down_block_residual_6,
+        down_block_residual_7=down_block_residual_7,
+        down_block_residual_8=down_block_residual_8,
+        down_block_residual_9=down_block_residual_9,
+        down_block_residual_10=down_block_residual_10,
+        down_block_residual_11=down_block_residual_11,
+        mid_block_residual=mid_block_residual,
+        class_labels=class_labels,
+        add_embeds=add_embeds,
+    )
+    shape = [d._attrs["values"] for d in Y._attrs["shape"]]
+    print(f'AIT {Y._attrs["name"]} shape: {shape}')
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
+    dll_name = model_name + ".dll" if sys.platform == "win32" else model_name + ".so"
     compile_model(
-        Y, target, "./tmp", model_name, constants=params_ait if constants else None
+        Y,
+        target,
+        work_dir,
+        model_name,
+        constants=params_ait if constants else None,
+        dll_name=dll_name,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
index ffaeb75c6..6f0147e06 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_vae_alt.py
@@ -13,29 +13,32 @@
 #  limitations under the License.
 #
 
+import sys
+
 import torch
 from aitemplate.compiler import compile_model
 from aitemplate.frontend import IntVar, Tensor
 from aitemplate.testing import detect_target
 
 from ..modeling.vae import AutoencoderKL as ait_AutoencoderKL
-from .util import mark_output
 
 
 def torch_dtype_from_str(dtype: str):
     return torch.__dict__.get(dtype, None)
 
 
-def map_vae(pt_module, device="cuda", dtype="float16"):
+def map_vae(pt_module, device="cuda", dtype="float16", encoder=False):
     if not isinstance(pt_module, dict):
         pt_params = dict(pt_module.named_parameters())
     else:
         pt_params = pt_module
     params_ait = {}
+    quant_key = "post_quant" if encoder else "quant"
+    vae_key = "decoder" if encoder else "encoder"
     for key, arr in pt_params.items():
-        if key.startswith("encoder"):
+        if key.startswith(vae_key):
             continue
-        if key.startswith("quant"):
+        if key.startswith(quant_key):
             continue
         arr = arr.to(device, dtype=torch_dtype_from_str(dtype))
         key = key.replace(".", "_")
@@ -112,6 +115,10 @@ def map_vae(pt_module, device="cuda", dtype="float16"):
             params_ait[key] = arr
         else:
             params_ait[key] = arr
+    if encoder:
+        params_ait["encoder_conv_in_weight"] = torch.functional.F.pad(
+            params_ait["encoder_conv_in_weight"], (0, 1, 0, 0, 0, 0, 0, 0)
+        )
 
     return params_ait
 
@@ -121,36 +128,39 @@ def compile_vae(
     batch_size=(1, 8),
     height=(64, 2048),
     width=(64, 2048),
-    use_fp16_acc=False,
-    convert_conv_to_gemm=False,
-    name="AutoencoderKL",
+    use_fp16_acc=True,
+    convert_conv_to_gemm=True,
+    model_name="AutoencoderKL",
     constants=True,
-):
-    in_channels = 3
-    out_channels = 3
-    down_block_types = [
+    block_out_channels=[128, 256, 512, 512],
+    layers_per_block=2,
+    act_fn="silu",
+    latent_channels=4,
+    sample_size=512,
+    in_channels=3,
+    out_channels=3,
+    down_block_types=[
         "DownEncoderBlock2D",
         "DownEncoderBlock2D",
         "DownEncoderBlock2D",
         "DownEncoderBlock2D",
-    ]
-    up_block_types = [
+    ],
+    up_block_types=[
         "UpDecoderBlock2D",
         "UpDecoderBlock2D",
         "UpDecoderBlock2D",
         "UpDecoderBlock2D",
-    ]
-    block_out_channels = [128, 256, 512, 512]
-    layers_per_block = 2
-    act_fn = "silu"
-    latent_channels = 4
-    sample_size = 512
-
-    # values not important, we only need this for mapping keys
+    ],
+    input_size=(64, 64),
+    down_factor=8,
+    dtype="float16",
+    work_dir="./tmp",
+    vae_encode=False,
+):
     ait_vae = ait_AutoencoderKL(
-        1,
-        64,
-        64,
+        batch_size[0],
+        input_size[0],
+        input_size[1],
         in_channels=in_channels,
         out_channels=out_channels,
         down_block_types=down_block_types,
@@ -160,32 +170,59 @@ def compile_vae(
         act_fn=act_fn,
         latent_channels=latent_channels,
         sample_size=sample_size,
+        dtype=dtype,
     )
-    batch_size = IntVar(values=list(batch_size), name="batch_size")
-    height = height[0] // 8, height[1] // 8
-    width = width[0] // 8, width[1] // 8
-    height_d = IntVar(values=list(height), name="height")
-    width_d = IntVar(values=list(width), name="width")
+
+    static_batch = batch_size[0] == batch_size[1]
+    static_shape = height[0] == height[1] and width[0] == width[1]
+    if not vae_encode:
+        height = height[0] // down_factor, height[1] // down_factor
+        width = width[0] // down_factor, width[1] // down_factor
+
+    if static_batch:
+        batch_size = batch_size[0]
+    else:
+        batch_size = IntVar(values=list(batch_size), name="batch_size")
+    if static_shape:
+        height_d = height[0]
+        width_d = width[0]
+    else:
+        height_d = IntVar(values=list(height), name="height")
+        width_d = IntVar(values=list(width), name="width")
 
     ait_input = Tensor(
-        shape=[batch_size, height_d, width_d, latent_channels],
-        name="vae_input",
+        shape=[batch_size, height_d, width_d, 3 if vae_encode else latent_channels],
+        name="pixels" if vae_encode else "latent",
         is_input=True,
+        dtype=dtype,
     )
+    sample = None
+    if vae_encode:
+        sample = Tensor(
+            shape=[batch_size, height_d, width_d, latent_channels],
+            name="random_sample",
+            is_input=True,
+            dtype=dtype,
+        )
     ait_vae.name_parameter_tensor()
 
     pt_mod = pt_mod.eval()
-    params_ait = map_vae(pt_mod)
-
-    Y = ait_vae.decode(ait_input)
-    mark_output(Y)
+    params_ait = map_vae(pt_mod, dtype=dtype, encoder=vae_encode)
+    if vae_encode:
+        Y = ait_vae.encode(ait_input, sample)
+    else:
+        Y = ait_vae.decode(ait_input)
+    shape = [d._attrs["values"] for d in Y._attrs["shape"]]
+    print(f'AIT {Y._attrs["name"]} shape: {shape}')
     target = detect_target(
         use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
     )
+    dll_name = model_name + ".dll" if sys.platform == "win32" else model_name + ".so"
     compile_model(
         Y,
         target,
-        "./tmp",
-        name,
+        work_dir,
+        model_name,
         constants=params_ait if constants else None,
+        dll_name=dll_name,
     )
diff --git a/examples/05_stable_diffusion/src/compile_lib/util.py b/examples/05_stable_diffusion/src/compile_lib/util.py
index 90cc1bc32..7c5e0b6aa 100644
--- a/examples/05_stable_diffusion/src/compile_lib/util.py
+++ b/examples/05_stable_diffusion/src/compile_lib/util.py
@@ -12,6 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import torch
+
+
 def mark_output(y):
     if type(y) is not tuple:
         y = (y,)
@@ -20,3 +23,7 @@ def mark_output(y):
         y[i]._attrs["name"] = "output_%d" % (i)
         y_shape = [d._attrs["values"] for d in y[i]._attrs["shape"]]
         print("AIT output_{} shape: {}".format(i, y_shape))
+
+
+def torch_dtype_from_str(dtype: str):
+    return torch.__dict__.get(dtype, None)
diff --git a/examples/05_stable_diffusion/src/inference_ait.py b/examples/05_stable_diffusion/src/inference_ait.py
new file mode 100644
index 000000000..367b8b334
--- /dev/null
+++ b/examples/05_stable_diffusion/src/inference_ait.py
@@ -0,0 +1,224 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Helpers for inference using dict[str, torch.Tensor] as inputs and outputs.
+input names are manually specified, the same as set in compilation scripts.
+output names are taken from the model itself.
+usage:
+outputs = clip_inference(...)
+#Diffusers/Transformers
+pooled_prompt_embeds = outputs[0]
+prompt_embeds = prompt_embeds.hidden_states[-2]
+#AIT
+pooled_prompt_embeds = outputs["text_embeds"] # or "pooled_output" is without projection, "text_embeds" is with projection i.e. for bigG
+prompt_embeds = outputs["hidden_state_31"]
+usage:
+latent = unet_inference(...)['latent_output']
+usage:
+pixels = vae_decode_inference(...)['pixels']
+usage:
+latent = vae_encode_inference(...)['latent']
+"""
+from typing import Dict, List
+
+import torch
+from aitemplate.compiler import Model
+
+
+def inference(
+    module: Model,
+    inputs: Dict[str, torch.Tensor],
+    outputs: Dict[str, torch.Tensor],
+    benchmark: bool = False,
+    benchmark_count: int = 50,
+    benchmark_repeat: int = 4,
+    permute: bool = False,
+    to_cpu: bool = False,
+):
+    module.run_with_tensors(inputs, outputs, graph_mode=False)
+    if permute:
+        for name, output in outputs.items():
+            if len(output.shape) == 4:
+                outputs[name] = output.permute((0, 3, 1, 2))
+    if to_cpu:
+        for name, output in outputs.items():
+            outputs[name] = output.cpu()
+    if benchmark:
+        t, _, _ = module.benchmark_with_tensors(
+            inputs=inputs,
+            outputs=outputs,
+            count=benchmark_count,
+            repeat=benchmark_repeat,
+        )
+        print(f"latency: {t} ms, it/s: {1000 / t}")
+
+    return outputs
+
+
+def get_outputs(module: Model, dims, device: str = "cuda", dtype: str = "float16"):
+    outputs = {}
+    map = module.get_output_name_to_index_map()
+    for name, idx in map.items():
+        shape = module.get_output_maximum_shape(idx)
+        for idx, dim in enumerate(dims):
+            shape[idx] = dim
+        output = torch.empty(shape).to(device)
+        if dtype == "float16":
+            output = output.half()
+        outputs[name] = output
+    return outputs
+
+
+def timestep_inference(
+    module: Model,
+    timestep: torch.Tensor,
+    device: str = "cuda",
+    dtype: str = "float16",
+    benchmark: bool = False,
+    to_cpu: bool = False,
+):
+    timestep = torch.tensor([timestep]).to(device)
+    inputs = {"timestep": timestep.to(device)}
+    if dtype == "float16":
+        for k, v in inputs.items():
+            inputs[k] = v.half()
+    dims = [1]
+    outputs = get_outputs(module, dims, device, dtype)
+    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu)
+
+
+def clip_inference(
+    module: Model,
+    input_ids: torch.Tensor,
+    seqlen: int = 77,
+    device: str = "cuda",
+    dtype: str = "float16",
+    benchmark: bool = False,
+    to_cpu: bool = False,
+):
+    batch = input_ids.shape[0]
+    input_ids = input_ids.to(device)
+    position_ids = torch.arange(seqlen).expand((batch, -1)).to(device)
+    inputs = {
+        "input_ids": input_ids,
+        "position_ids": position_ids,
+    }
+    dims = [batch]
+    outputs = get_outputs(module, dims, device, dtype)
+    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu)
+
+
+def unet_inference(
+    module: Model,
+    latent_model_input: torch.Tensor,
+    timesteps: torch.Tensor,
+    encoder_hidden_states: torch.Tensor,
+    class_labels: torch.Tensor = None,
+    down_block_residuals: List[torch.Tensor] = None,
+    mid_block_residual: torch.Tensor = None,
+    add_embeds: torch.Tensor = None,
+    device: str = "cuda",
+    dtype: str = "float16",
+    benchmark: bool = False,
+    to_cpu: bool = False,
+):
+    batch = latent_model_input.shape[0]
+    height, width = latent_model_input.shape[2], latent_model_input.shape[3]
+    timesteps = timesteps.expand(batch)
+    inputs = {
+        "latent_model_input": latent_model_input.permute((0, 2, 3, 1))
+        .contiguous()
+        .to(device),
+        "timesteps": timesteps.to(device),
+        "encoder_hidden_states": encoder_hidden_states.to(device),
+    }
+    if class_labels is not None:
+        inputs["class_labels"] = class_labels.contiguous().to(device)
+    if down_block_residuals is not None and mid_block_residual is not None:
+        for i, y in enumerate(down_block_residuals):
+            inputs[f"down_block_residual_{i}"] = (
+                y.permute((0, 2, 3, 1)).contiguous().to(device)
+            )
+        inputs["mid_block_residual"] = (
+            mid_block_residual.permute((0, 2, 3, 1)).contiguous().to(device)
+        )
+    if add_embeds is not None:
+        inputs["add_embeds"] = add_embeds.to(device)
+    if dtype == "float16":
+        for k, v in inputs.items():
+            if k == "class_labels":
+                continue
+            inputs[k] = v.half()
+    dims = [batch, height, width]
+    outputs = get_outputs(module, dims, device, dtype)
+    return inference(
+        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu
+    )
+
+
+def vae_decode_inference(
+    module: Model,
+    latent: torch.Tensor,
+    device: str = "cuda",
+    dtype: str = "float16",
+    benchmark: bool = False,
+    factor: int = 8,
+    to_cpu: bool = False,
+):
+    batch = latent.shape[0]
+    height, width = latent.shape[2:]
+    height *= factor
+    width *= factor
+    latent = latent.permute((0, 2, 3, 1)).contiguous().to(device)
+    if dtype == "float16":
+        latent = latent.half()
+    inputs = {
+        "latent": latent,
+    }
+    dims = [batch, height, width]
+    outputs = get_outputs(module, dims, device, dtype)
+    return inference(
+        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu
+    )
+
+
+def vae_encode_inference(
+    module: Model,
+    pixels: torch.Tensor,
+    device: str = "cuda",
+    dtype: str = "float16",
+    benchmark: bool = False,
+    factor: int = 8,
+    latent_channels: int = 4,
+    to_cpu: bool = False,
+):
+    batch = pixels.shape[0]
+    height, width = pixels.shape[2:]
+    height *= factor
+    width *= factor
+    pixels = pixels.permute((0, 2, 3, 1)).contiguous().to(device)
+    sample = torch.randn(batch, height, width, latent_channels).to(device)
+    if dtype == "float16":
+        pixels = pixels.half()
+        sample = sample.half()
+    inputs = {
+        "pixels": pixels,
+        "random_sample": sample,
+    }
+    dims = [batch, height, width]
+    outputs = get_outputs(module, dims, device, dtype)
+    return inference(
+        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu
+    )
diff --git a/examples/05_stable_diffusion/src/modeling/attention.py b/examples/05_stable_diffusion/src/modeling/attention.py
index 06ab5f1bd..59f6337a8 100644
--- a/examples/05_stable_diffusion/src/modeling/attention.py
+++ b/examples/05_stable_diffusion/src/modeling/attention.py
@@ -50,6 +50,7 @@ def __init__(
         num_groups: int = 32,
         rescale_output_factor: float = 1.0,
         eps: float = 1e-5,
+        dtype="float16",
     ):
         super().__init__()
         self.batch_size = batch_size
@@ -58,13 +59,14 @@ def __init__(
             channels // num_head_channels if num_head_channels is not None else 1
         )
         self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_groups, channels, eps)
+        self.group_norm = nn.GroupNorm(num_groups, channels, eps, dtype=dtype)
         self.attention = nn.CrossAttention(
             channels,
             height * width,
             height * width,
             self.num_heads,
             qkv_bias=True,
+            dtype=dtype,
         )
         self.rescale_output_factor = rescale_output_factor
 
diff --git a/examples/05_stable_diffusion/src/modeling/clip.py b/examples/05_stable_diffusion/src/modeling/clip.py
index 8385a317c..4a0c9165d 100644
--- a/examples/05_stable_diffusion/src/modeling/clip.py
+++ b/examples/05_stable_diffusion/src/modeling/clip.py
@@ -57,11 +57,12 @@ def __init__(
         self.heads = heads
         self.dim_head = dim_head
 
-        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False, dtype=dtype)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False, dtype=dtype)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False, dtype=dtype)
         self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+            nn.Linear(inner_dim, query_dim, dtype=dtype),
+            nn.Dropout(dropout, dtype=dtype),
         )
 
     def forward(self, x, context=None, mask=None, residual=None):
@@ -109,30 +110,34 @@ def forward(self, x, context=None, mask=None, residual=None):
 
 
 class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
+    def __init__(self, dim_in, dim_out, dtype="float16"):
         super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, specialization="mul")
-        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu")
+        self.proj = nn.Linear(dim_in, dim_out, specialization="mul", dtype=dtype)
+        self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu", dtype=dtype)
 
     def forward(self, x):
         return self.proj(x, self.gate(x))
 
 
 class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+    def __init__(
+        self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, dtype="float16"
+    ):
         super().__init__()
         inner_dim = int(dim * mult)
         dim_out = default(dim_out, dim)
         project_in = (
             nn.Sequential(
-                nn.Linear(dim, inner_dim, specialization="fast_gelu"),
+                nn.Linear(dim, inner_dim, specialization="fast_gelu", dtype=dtype),
             )
             if not glu
-            else GEGLU(dim, inner_dim)
+            else GEGLU(dim, inner_dim, dtype=dtype)
         )
 
         self.net = nn.Sequential(
-            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+            project_in,
+            nn.Dropout(dropout, dtype=dtype),
+            nn.Linear(inner_dim, dim_out, dtype=dtype),
         )
 
     def forward(self, x, residual=None):
@@ -155,35 +160,54 @@ def __init__(
         context_dim=None,
         gated_ff=True,
         checkpoint=True,
+        only_cross_attention=False,
+        dtype="float16",
     ):
         super().__init__()
+        self.only_cross_attention = only_cross_attention
         self.attn1 = CrossAttention(
-            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
-        )  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(
             query_dim=dim,
-            context_dim=context_dim,
+            context_dim=context_dim if only_cross_attention else None,
             heads=n_heads,
             dim_head=d_head,
             dropout=dropout,
+            dtype=dtype,
         )
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff, dtype=dtype)
+        if context_dim is not None:
+            self.attn2 = CrossAttention(
+                query_dim=dim,
+                context_dim=context_dim,
+                heads=n_heads,
+                dim_head=d_head,
+                dropout=dropout,
+                dtype=dtype,
+            )
+        else:
+            self.attn2 = None
+        self.norm1 = nn.LayerNorm(dim, dtype=dtype)
+        self.norm2 = nn.LayerNorm(dim, dtype=dtype)
+        self.norm3 = nn.LayerNorm(dim, dtype=dtype)
         self.checkpoint = checkpoint
 
         self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint)
 
     def forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), residual=x)
-        x = self.attn2(self.norm2(x), context=context, residual=x)
+        x = self.attn1(
+            self.norm1(x),
+            residual=x,
+            context=context if self.only_cross_attention else None,
+        )
+        if self.attn2 is not None:
+            x = self.attn2(self.norm2(x), context=context, residual=x)
         x = self.ff(self.norm3(x), residual=x)
         return x
 
 
-def Normalize(in_channels):
-    return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+def Normalize(in_channels, dtype="float16"):
+    return nn.GroupNorm(
+        num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype
+    )
 
 
 class SpatialTransformer(nn.Module):
@@ -204,34 +228,42 @@ def __init__(
         dropout=0.0,
         context_dim=None,
         use_linear_projection=False,
+        only_cross_attention=False,
+        dtype="float16",
     ):
         super().__init__()
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)  # Group Norm
+        self.norm = Normalize(in_channels, dtype=dtype)  # Group Norm
         self.use_linear_projection = use_linear_projection
 
         if use_linear_projection:
-            self.proj_in = nn.Linear(in_channels, inner_dim)
+            self.proj_in = nn.Linear(in_channels, inner_dim, dtype=dtype)
         else:
             self.proj_in = nn.Conv2dBias(
-                in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+                in_channels, inner_dim, kernel_size=1, stride=1, padding=0, dtype=dtype
             )
 
         self.transformer_blocks = nn.ModuleList(
             [
                 BasicTransformerBlock(
-                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    dropout=dropout,
+                    context_dim=context_dim,
+                    only_cross_attention=only_cross_attention,
+                    dtype=dtype,
                 )
                 for d in range(depth)
             ]
         )
 
         if use_linear_projection:
-            self.proj_out = nn.Linear(inner_dim, in_channels)
+            self.proj_out = nn.Linear(inner_dim, in_channels, dtype=dtype)
         else:
             self.proj_out = nn.Conv2dBias(
-                inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+                inner_dim, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype
             )
 
     def forward(self, x, context=None):
@@ -543,12 +575,17 @@ def forward(
 
         hidden_states = inputs_embeds
         for _, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
+            if output_hidden_states and encoder_states is not None:
                 encoder_states = encoder_states + (hidden_states,)
             layer_outputs = encoder_layer(hidden_states)
             hidden_states = layer_outputs
 
-        return hidden_states
+        last_hidden_state = hidden_states
+        output = last_hidden_state
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+            output = encoder_states
+        return output
 
 
 class CLIPTextEmbeddings(nn.Module):
@@ -607,6 +644,7 @@ class CLIPTextTransformer(nn.Module):
     def __init__(
         self,
         hidden_size=768,
+        text_projection_dim=None,
         output_attentions=False,
         output_hidden_states=False,
         use_return_dict=False,
@@ -631,10 +669,19 @@ def __init__(
             act_layer=act_layer,
         )
         self.final_layer_norm = nn.LayerNorm(hidden_size)
+        if text_projection_dim is not None:
+            self.text_projection = nn.Linear(
+                hidden_size, text_projection_dim, bias=False
+            )
+        else:
+            self.text_projection = None
 
         self.output_attentions = output_attentions
         self.output_hidden_states = output_hidden_states
         self.use_return_dict = use_return_dict
+        self.hidden_size = hidden_size
+        self.seq_len = seq_len
+        self.num_layers = num_hidden_layers
 
     def forward(
         self,
@@ -648,27 +695,40 @@ def forward(
         r"""
         Returns:
         """
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.use_return_dict
-
-        if input_ids is None:
-            raise ValueError("You have to specify either input_ids")
+        batch = ops.size()(input_ids)[0]
 
         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
 
-        encoder_outputs = self.encoder(
-            inputs_embeds=hidden_states,
+        encoder_output = self.encoder(
+            inputs_embeds=hidden_states, output_hidden_states=self.output_hidden_states
         )
-
-        last_hidden_state = encoder_outputs
+        if self.output_hidden_states:
+            last_hidden_state = encoder_output[-1]
+        else:
+            last_hidden_state = encoder_output
         last_hidden_state = self.final_layer_norm(last_hidden_state)
-        return last_hidden_state
+
+        argmax = ops.argmax(-1)(input_ids)
+        pooled_output = ops.index_select(dim=1)(last_hidden_state, argmax)
+        pooled_output = ops.reshape()(pooled_output, [batch, self.hidden_size])
+        last_hidden_state._attrs["is_output"] = True
+        last_hidden_state._attrs["name"] = "last_hidden_state"
+        pooled_output._attrs["is_output"] = True
+        pooled_output._attrs["name"] = "pooled_output"
+        output = (
+            last_hidden_state,
+            pooled_output,
+        )
+        if self.text_projection is not None:
+            text_embeds = self.text_projection(pooled_output)
+            text_embeds._attrs["is_output"] = True
+            text_embeds._attrs["name"] = "text_embeds"
+            output = output + (text_embeds,)
+
+        if self.output_hidden_states:
+            for idx, hidden_state in enumerate(encoder_output[:-1]):
+                hidden_state._attrs["is_output"] = True
+                hidden_state._attrs["name"] = f"hidden_state_{idx}"
+                output = output + (hidden_state,)
+
+        return output
diff --git a/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
index 56da472ab..ad7a0e7db 100644
--- a/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/controlnet_unet_2d_condition.py
@@ -16,9 +16,10 @@
 
 from aitemplate.compiler import ops
 from aitemplate.frontend import nn
+from aitemplate.testing import detect_target
 
 from .embeddings import TimestepEmbedding, Timesteps
-from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
+from .unet_blocks import get_down_block, UNetMidBlock2DCrossAttn
 
 
 class ControlNetConditioningEmbedding(nn.Module):
@@ -44,7 +45,12 @@ def __init__(
         Could be changed back to a loop and use parameters though,
         but it ended up like this when debugging.
         """
-        self.conv_in = nn.Conv2dBiasFewChannels(3, 16, 3, 1, 1)
+        conv_op = (
+            nn.Conv2dBiasFewChannels
+            if detect_target().name() == "cuda"
+            else nn.Conv2dBias
+        )
+        self.conv_in = conv_op(3, 16, 3, 1, 1)
 
         self.blocks = nn.ModuleList([])
         self.blocks.append(nn.Conv2dBias(16, 16, 3, 1, 1))
@@ -192,6 +198,9 @@ def __init__(
             upcast_attention=upcast_attention,
         )
 
+    def get_shape(self, sample):
+        return [i._attrs["int_var"]._attrs["values"][0] for i in ops.size()(sample)]
+
     def forward(
         self,
         sample,
@@ -207,7 +216,7 @@ def forward(
         sample = self.conv_in(sample)
 
         controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
-
+        controlnet_cond._attrs["shape"] = sample._attrs["shape"]
         sample = sample + controlnet_cond
         # 3. down
         down_block_res_samples = (sample,)  # up to but excluding last element
@@ -249,7 +258,6 @@ def forward(
         ]
         mid_block_res_sample = mid_block_res_sample * conditioning_scale
 
-        # return (down_block_res_samples, mid_block_res_sample)
         return (
             down_block_res_samples[0],
             down_block_res_samples[1],
@@ -265,285 +273,3 @@ def forward(
             down_block_res_samples[11],
             mid_block_res_sample,
         )
-
-
-class ControlNetUNet2DConditionModel(nn.Module):
-    r"""
-    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
-    and returns sample shaped output.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
-    implements for all the model (such as downloading or saving, etc.)
-
-    Parameters:
-        sample_size (`int`, *optional*): The size of the input sample.
-        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
-        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
-            The tuple of upsample blocks to use.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
-        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
-        use_linear_projection (`bool`, *optional*, defaults to False): Use linear projection instead of 1x1 convolution.
-    """
-
-    def __init__(
-        self,
-        sample_size: Optional[int] = None,
-        in_channels: int = 4,
-        out_channels: int = 4,
-        center_input_sample: bool = False,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        up_block_types: Tuple[str] = (
-            "UpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-            "CrossAttnUpBlock2D",
-        ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: int = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 1280,
-        attention_head_dim: Union[int, Tuple[int]] = 8,
-        use_linear_projection: bool = False,
-    ):
-        super().__init__()
-        self.center_input_sample = center_input_sample
-        self.sample_size = sample_size
-        time_embed_dim = block_out_channels[0] * 4
-
-        # input
-        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
-        # time
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-        timestep_input_dim = block_out_channels[0]
-
-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
-
-        self.down_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
-        # down
-        output_channel = block_out_channels[0]
-        for i, down_block_type in enumerate(down_block_types):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final_block = i == len(block_out_channels) - 1
-
-            down_block = get_down_block(
-                down_block_type,
-                num_layers=layers_per_block,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                temb_channels=time_embed_dim,
-                add_downsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                attn_num_head_channels=attention_head_dim[i],
-                cross_attention_dim=cross_attention_dim,
-                downsample_padding=downsample_padding,
-                use_linear_projection=use_linear_projection,
-            )
-            self.down_blocks.append(down_block)
-
-        # mid
-        self.mid_block = UNetMidBlock2DCrossAttn(
-            in_channels=block_out_channels[-1],
-            temb_channels=time_embed_dim,
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            output_scale_factor=mid_block_scale_factor,
-            resnet_time_scale_shift="default",
-            cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim[-1],
-            resnet_groups=norm_num_groups,
-            use_linear_projection=use_linear_projection,
-        )
-
-        # up
-        reversed_block_out_channels = list(reversed(block_out_channels))
-        reversed_attention_head_dim = list(reversed(attention_head_dim))
-        output_channel = reversed_block_out_channels[0]
-        for i, up_block_type in enumerate(up_block_types):
-            prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[
-                min(i + 1, len(block_out_channels) - 1)
-            ]
-
-            is_final_block = i == len(block_out_channels) - 1
-
-            up_block = get_up_block(
-                up_block_type,
-                num_layers=layers_per_block + 1,
-                in_channels=input_channel,
-                out_channels=output_channel,
-                prev_output_channel=prev_output_channel,
-                temb_channels=time_embed_dim,
-                add_upsample=not is_final_block,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                attn_num_head_channels=reversed_attention_head_dim[i],
-                cross_attention_dim=cross_attention_dim,
-                use_linear_projection=use_linear_projection,
-            )
-            self.up_blocks.append(up_block)
-            prev_output_channel = output_channel
-
-        # out
-        self.conv_norm_out = nn.GroupNorm(
-            num_channels=block_out_channels[0],
-            num_groups=norm_num_groups,
-            eps=norm_eps,
-            use_swish=True,
-        )
-
-        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
-
-    def forward(
-        self,
-        sample,
-        timesteps,
-        encoder_hidden_states,
-        down_block_residual_0,
-        down_block_residual_1,
-        down_block_residual_2,
-        down_block_residual_3,
-        down_block_residual_4,
-        down_block_residual_5,
-        down_block_residual_6,
-        down_block_residual_7,
-        down_block_residual_8,
-        down_block_residual_9,
-        down_block_residual_10,
-        down_block_residual_11,
-        mid_block_residual,
-        return_dict: bool = True,
-    ):
-        """r
-        Args:
-            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
-            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is the sample tensor.
-        """
-        down_block_additional_residuals = (
-            down_block_residual_0,
-            down_block_residual_1,
-            down_block_residual_2,
-            down_block_residual_3,
-            down_block_residual_4,
-            down_block_residual_5,
-            down_block_residual_6,
-            down_block_residual_7,
-            down_block_residual_8,
-            down_block_residual_9,
-            down_block_residual_10,
-            down_block_residual_11,
-        )
-        mid_block_additional_residual = mid_block_residual
-        # 1. time
-        t_emb = self.time_proj(timesteps)
-        emb = self.time_embedding(t_emb)
-
-        # 2. pre-process
-        sample = self.conv_in(sample)
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if (
-                hasattr(downsample_block, "attentions")
-                and downsample_block.attentions is not None
-            ):
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-            down_block_res_samples += res_samples
-            # return sample
-
-        if down_block_additional_residuals is not None:
-            new_down_block_res_samples = ()
-
-            for down_block_res_sample, down_block_additional_residual in zip(
-                down_block_res_samples, down_block_additional_residuals
-            ):
-                down_block_res_sample += down_block_additional_residual
-                new_down_block_res_samples += (down_block_res_sample,)
-
-            down_block_res_samples = new_down_block_res_samples
-
-        # 4. mid
-        sample = self.mid_block(
-            sample, emb, encoder_hidden_states=encoder_hidden_states
-        )
-
-        if mid_block_additional_residual is not None:
-            sample += mid_block_additional_residual
-
-        # 5. up
-        for upsample_block in self.up_blocks:
-            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-            down_block_res_samples = down_block_res_samples[
-                : -len(upsample_block.resnets)
-            ]
-
-            if (
-                hasattr(upsample_block, "attentions")
-                and upsample_block.attentions is not None
-            ):
-                sample = upsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    res_hidden_states_tuple=res_samples,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-            else:
-                sample = upsample_block(
-                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
-                )
-
-        # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_out(sample)
-        return sample
diff --git a/examples/05_stable_diffusion/src/modeling/embeddings.py b/examples/05_stable_diffusion/src/modeling/embeddings.py
index cab7c033f..e014c0530 100644
--- a/examples/05_stable_diffusion/src/modeling/embeddings.py
+++ b/examples/05_stable_diffusion/src/modeling/embeddings.py
@@ -30,6 +30,8 @@ def get_timestep_embedding(
     downscale_freq_shift: float = 1,
     scale: float = 1,
     max_period: int = 10000,
+    dtype: str = "float16",
+    arange_name="arange",
 ):
     """
     This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
@@ -44,7 +46,7 @@ def get_timestep_embedding(
     half_dim = embedding_dim // 2
 
     exponent = (-math.log(max_period)) * Tensor(
-        shape=[half_dim], dtype="float16", name="arange"
+        shape=[half_dim], dtype=dtype, name=arange_name
     )
 
     exponent = exponent * (1.0 / (half_dim - downscale_freq_shift))
@@ -70,11 +72,19 @@ def get_timestep_embedding(
 
 
 class TimestepEmbedding(nn.Module):
-    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+    def __init__(
+        self,
+        channel: int,
+        time_embed_dim: int,
+        act_fn: str = "silu",
+        dtype: str = "float16",
+    ):
         super().__init__()
 
-        self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish")
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+        self.linear_1 = nn.Linear(
+            channel, time_embed_dim, specialization="swish", dtype=dtype
+        )
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, dtype=dtype)
 
     def forward(self, sample):
         sample = self.linear_1(sample)
@@ -84,12 +94,19 @@ def forward(self, sample):
 
 class Timesteps(nn.Module):
     def __init__(
-        self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float
+        self,
+        num_channels: int,
+        flip_sin_to_cos: bool,
+        downscale_freq_shift: float,
+        dtype: str = "float16",
+        arange_name="arange",
     ):
         super().__init__()
         self.num_channels = num_channels
         self.flip_sin_to_cos = flip_sin_to_cos
         self.downscale_freq_shift = downscale_freq_shift
+        self.dtype = dtype
+        self.arange_name = arange_name
 
     def forward(self, timesteps):
         t_emb = get_timestep_embedding(
@@ -97,5 +114,7 @@ def forward(self, timesteps):
             self.num_channels,
             flip_sin_to_cos=self.flip_sin_to_cos,
             downscale_freq_shift=self.downscale_freq_shift,
+            dtype=self.dtype,
+            arange_name=self.arange_name,
         )
         return t_emb
diff --git a/examples/05_stable_diffusion/src/modeling/resnet.py b/examples/05_stable_diffusion/src/modeling/resnet.py
index c15bf26d2..1262ac86f 100644
--- a/examples/05_stable_diffusion/src/modeling/resnet.py
+++ b/examples/05_stable_diffusion/src/modeling/resnet.py
@@ -37,6 +37,7 @@ def __init__(
         use_conv_transpose=False,
         out_channels=None,
         name="conv",
+        dtype="float16",
     ):
         super().__init__()
         self.channels = channels
@@ -47,9 +48,11 @@ def __init__(
 
         conv = None
         if use_conv_transpose:
-            conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1)
+            conv = nn.ConvTranspose2dBias(
+                channels, self.out_channels, 4, 2, 1, dtype=dtype
+            )
         elif use_conv:
-            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1)
+            conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1, dtype=dtype)
 
         # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
         if name == "conv":
@@ -60,7 +63,6 @@ def __init__(
     def forward(self, x):
         if self.use_conv_transpose:
             return self.conv(x)
-
         x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x)
 
         # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
@@ -83,7 +85,13 @@ class Downsample2D(nn.Module):
     """
 
     def __init__(
-        self, channels, use_conv=False, out_channels=None, padding=1, name="conv"
+        self,
+        channels,
+        use_conv=False,
+        out_channels=None,
+        padding=1,
+        name="conv",
+        dtype="float16",
     ):
         super().__init__()
         self.channels = channels
@@ -92,10 +100,16 @@ def __init__(
         self.padding = padding
         stride = 2
         self.name = name
+        self.dtype = dtype
 
         if use_conv:
             conv = nn.Conv2dBias(
-                self.channels, self.out_channels, 3, stride=stride, padding=padding
+                self.channels,
+                self.out_channels,
+                3,
+                stride=stride,
+                dtype=dtype,
+                padding=padding,
             )
         else:
             assert self.channels == self.out_channels
@@ -110,9 +124,21 @@ def __init__(
         else:
             self.conv = conv
 
-    def forward(self, x):
-        x = self.conv(x)
-        return x
+    def forward(self, hidden_states):
+        if self.use_conv and self.padding == 0:
+            padding = ops.full()([0, 1, 0, 0], 0.0, dtype=self.dtype)
+            padding._attrs["shape"][0] = hidden_states._attrs["shape"][0]
+            padding._attrs["shape"][2] = hidden_states._attrs["shape"][2]
+            padding._attrs["shape"][3] = hidden_states._attrs["shape"][3]
+            hidden_states = ops.concatenate()([hidden_states, padding], dim=1)
+            padding = ops.full()([0, 0, 1, 0], 0.0, dtype=self.dtype)
+            padding._attrs["shape"][0] = hidden_states._attrs["shape"][0]
+            padding._attrs["shape"][1] = hidden_states._attrs["shape"][1]
+            padding._attrs["shape"][3] = hidden_states._attrs["shape"][3]
+            hidden_states = ops.concatenate()([hidden_states, padding], dim=2)
+
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
 
 
 class ResnetBlock2D(nn.Module):
@@ -135,6 +161,7 @@ def __init__(
         use_nin_shortcut=None,
         up=False,
         down=False,
+        dtype="float16"
     ):
         super().__init__()
         self.pre_norm = pre_norm
@@ -157,14 +184,15 @@ def __init__(
             eps=eps,
             affine=True,
             use_swish=True,
+            dtype=dtype,
         )
 
         self.conv1 = nn.Conv2dBias(
-            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype
         )
 
         if temb_channels is not None:
-            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels, dtype=dtype)
         else:
             self.time_emb_proj = None
 
@@ -174,10 +202,11 @@ def __init__(
             eps=eps,
             affine=True,
             use_swish=True,
+            dtype=dtype,
         )
-        self.dropout = nn.Dropout(dropout)
+        self.dropout = nn.Dropout(dropout, dtype=dtype)
         self.conv2 = nn.Conv2dBias(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype
         )
 
         self.upsample = self.downsample = None
@@ -190,7 +219,7 @@ def __init__(
 
         if self.use_nin_shortcut:
             self.conv_shortcut = nn.Conv2dBias(
-                in_channels, out_channels, 1, 1, 0
+                in_channels, out_channels, 1, 1, 0, dtype=dtype
             )  # kernel_size=1, stride=1, padding=0) # conv_bias_add
         else:
             self.conv_shortcut = None
diff --git a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
index 2ad4d9718..2e5d33a68 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_2d_condition.py
@@ -14,7 +14,10 @@
 #
 from typing import Optional, Tuple, Union
 
+from aitemplate.compiler import ops
+
 from aitemplate.frontend import nn, Tensor
+from aitemplate.testing import detect_target
 
 from .embeddings import TimestepEmbedding, Timesteps
 from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn
@@ -83,19 +86,69 @@ def __init__(
         cross_attention_dim: int = 1280,
         attention_head_dim: Union[int, Tuple[int]] = 8,
         use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        only_cross_attention=[True, True, True, False],
+        conv_in_kernel=3,
+        dtype="float16",
+        time_embedding_dim=None,
+        projection_class_embeddings_input_dim=None,
+        addition_embed_type=None,
+        transformer_layers_per_block=[1, 1, 1, 1],
     ):
         super().__init__()
         self.center_input_sample = center_input_sample
         self.sample_size = sample_size
-        time_embed_dim = block_out_channels[0] * 4
+        self.time_embedding_dim = time_embedding_dim
+        time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
 
         # input
-        self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1)
+        self.in_channels = in_channels
+        if self.in_channels % 4 != 0:
+            in_channels = self.in_channels + (4 - (self.in_channels % 4))
+        else:
+            in_channels = self.in_channels
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        print("in_channels", in_channels)
+        if in_channels < 8 and detect_target().name() == "cuda":
+            self.conv_in = nn.Conv2dBiasFewChannels(
+                in_channels, block_out_channels[0], 3, 1, conv_in_padding, dtype=dtype
+            )
+        else:
+            self.conv_in = nn.Conv2dBias(
+                in_channels, block_out_channels[0], 3, 1, conv_in_padding, dtype=dtype
+            )
         # time
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        self.time_proj = Timesteps(
+            block_out_channels[0],
+            flip_sin_to_cos,
+            freq_shift,
+            dtype=dtype,
+            arange_name="arange",
+        )
         timestep_input_dim = block_out_channels[0]
 
-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim, time_embed_dim, dtype=dtype
+        )
+        self.class_embed_type = class_embed_type
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(
+                [num_class_embeds, time_embed_dim], dtype=dtype
+            )
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(
+                timestep_input_dim, time_embed_dim, dtype=dtype
+            )
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(dtype=dtype)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text_time":
+            self.add_embedding = TimestepEmbedding(
+                projection_class_embeddings_input_dim, time_embed_dim, dtype=dtype
+            )
 
         self.down_blocks = nn.ModuleList([])
         self.up_blocks = nn.ModuleList([])
@@ -109,10 +162,10 @@ def __init__(
             input_channel = output_channel
             output_channel = block_out_channels[i]
             is_final_block = i == len(block_out_channels) - 1
-
             down_block = get_down_block(
                 down_block_type,
                 num_layers=layers_per_block,
+                transformer_layers_per_block=transformer_layers_per_block[i],
                 in_channels=input_channel,
                 out_channels=output_channel,
                 temb_channels=time_embed_dim,
@@ -123,11 +176,14 @@ def __init__(
                 cross_attention_dim=cross_attention_dim,
                 downsample_padding=downsample_padding,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                dtype=dtype,
             )
             self.down_blocks.append(down_block)
 
         # mid
         self.mid_block = UNetMidBlock2DCrossAttn(
+            transformer_layers_per_block=transformer_layers_per_block[-1],
             in_channels=block_out_channels[-1],
             temb_channels=time_embed_dim,
             resnet_eps=norm_eps,
@@ -138,11 +194,15 @@ def __init__(
             attn_num_head_channels=attention_head_dim[-1],
             resnet_groups=norm_num_groups,
             use_linear_projection=use_linear_projection,
+            dtype=dtype,
         )
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
         reversed_attention_head_dim = list(reversed(attention_head_dim))
+        reversed_transformer_layers_per_block = list(
+            reversed(transformer_layers_per_block)
+        )
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             prev_output_channel = output_channel
@@ -156,6 +216,7 @@ def __init__(
             up_block = get_up_block(
                 up_block_type,
                 num_layers=layers_per_block + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                 in_channels=input_channel,
                 out_channels=output_channel,
                 prev_output_channel=prev_output_channel,
@@ -166,6 +227,8 @@ def __init__(
                 attn_num_head_channels=reversed_attention_head_dim[i],
                 cross_attention_dim=cross_attention_dim,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                dtype=dtype,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -176,17 +239,33 @@ def __init__(
             num_groups=norm_num_groups,
             eps=norm_eps,
             use_swish=True,
+            dtype=dtype,
         )
 
-        self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1)
+        self.conv_out = nn.Conv2dBias(
+            block_out_channels[0], out_channels, 3, 1, 1, dtype=dtype
+        )
 
     def forward(
         self,
         sample,
         timesteps,
         encoder_hidden_states,
-        down_block_additional_residuals: Optional[Tuple[Tensor]] = None,
-        mid_block_additional_residual: Optional[Tensor] = None,
+        down_block_residual_0=None,
+        down_block_residual_1=None,
+        down_block_residual_2=None,
+        down_block_residual_3=None,
+        down_block_residual_4=None,
+        down_block_residual_5=None,
+        down_block_residual_6=None,
+        down_block_residual_7=None,
+        down_block_residual_8=None,
+        down_block_residual_9=None,
+        down_block_residual_10=None,
+        down_block_residual_11=None,
+        mid_block_residual=None,
+        class_labels: Optional[Tensor] = None,
+        add_embeds: Optional[Tensor] = None,
         return_dict: bool = True,
     ):
         """r
@@ -202,12 +281,50 @@ def forward(
             [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
             returning a tuple, the first element is the sample tensor.
         """
+        down_block_additional_residuals = (
+            down_block_residual_0,
+            down_block_residual_1,
+            down_block_residual_2,
+            down_block_residual_3,
+            down_block_residual_4,
+            down_block_residual_5,
+            down_block_residual_6,
+            down_block_residual_7,
+            down_block_residual_8,
+            down_block_residual_9,
+            down_block_residual_10,
+            down_block_residual_11,
+        )
+        mid_block_additional_residual = mid_block_residual
+        if down_block_additional_residuals[0] is None:
+            down_block_additional_residuals = None
 
         # 1. time
         t_emb = self.time_proj(timesteps)
         emb = self.time_embedding(t_emb)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError(
+                    "class_labels should be provided when num_class_embeds > 0"
+                )
+
+            if self.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+            class_emb = ops.batch_gather()(
+                self.class_embedding.weight.tensor(), class_labels
+            )
+            emb = emb + class_emb
+
+        if add_embeds is not None:
+            aug_emb = self.add_embedding(add_embeds)
+            emb = emb + aug_emb
 
         # 2. pre-process
+        if self.in_channels % 4 != 0:
+            channel_pad = self.in_channels + (4 - (self.in_channels % 4))
+            sample = ops.pad_last_dim(4, channel_pad)(sample)
+
         sample = self.conv_in(sample)
 
         # 3. down
@@ -234,6 +351,9 @@ def forward(
             for down_block_res_sample, down_block_additional_residual in zip(
                 down_block_res_samples, down_block_additional_residuals
             ):
+                down_block_additional_residual._attrs[
+                    "shape"
+                ] = down_block_res_sample._attrs["shape"]
                 down_block_res_sample += down_block_additional_residual
                 new_down_block_res_samples += (down_block_res_sample,)
 
@@ -245,10 +365,10 @@ def forward(
         )
 
         if mid_block_additional_residual is not None:
+            mid_block_additional_residual._attrs["shape"] = sample._attrs["shape"]
             sample += mid_block_additional_residual
-
         # 5. up
-        for upsample_block in self.up_blocks:
+        for i, upsample_block in enumerate(self.up_blocks):
             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
             down_block_res_samples = down_block_res_samples[
                 : -len(upsample_block.resnets)
@@ -266,7 +386,9 @@ def forward(
                 )
             else:
                 sample = upsample_block(
-                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
                 )
 
         # 6. post-process
@@ -274,4 +396,6 @@ def forward(
         # when running in half-precision
         sample = self.conv_norm_out(sample)
         sample = self.conv_out(sample)
+        sample._attrs["is_output"] = True
+        sample._attrs["name"] = "latent_output"
         return sample
diff --git a/examples/05_stable_diffusion/src/modeling/unet_blocks.py b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
index 897025660..2a7f5ffc7 100644
--- a/examples/05_stable_diffusion/src/modeling/unet_blocks.py
+++ b/examples/05_stable_diffusion/src/modeling/unet_blocks.py
@@ -49,9 +49,13 @@ def get_down_block(
     resnet_eps,
     resnet_act_fn,
     attn_num_head_channels,
+    transformer_layers_per_block=1,
     cross_attention_dim=None,
     downsample_padding=None,
     use_linear_projection=False,
+    only_cross_attention=False,
+    resnet_groups=32,
+    dtype="float16",
 ):
     down_block_type = (
         down_block_type[7:]
@@ -67,7 +71,9 @@ def get_down_block(
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
             downsample_padding=downsample_padding,
+            dtype=dtype,
         )
     elif down_block_type == "AttnDownBlock2D":
         return AttnDownBlock2D(
@@ -88,16 +94,20 @@ def get_down_block(
             )
         return CrossAttnDownBlock2D(
             num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
             downsample_padding=downsample_padding,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
             use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            dtype=dtype,
         )
     elif down_block_type == "SkipDownBlock2D":
         return SkipDownBlock2D(
@@ -124,13 +134,16 @@ def get_down_block(
         )
     elif down_block_type == "DownEncoderBlock2D":
         return DownEncoderBlock2D(
-            num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
-            add_downsample=add_downsample,
+            num_layers=num_layers,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            output_scale_factor=1.0,
+            add_downsample=add_downsample,
             downsample_padding=downsample_padding,
+            dtype=dtype,
         )
 
 
@@ -145,8 +158,11 @@ def get_up_block(
     resnet_eps,
     resnet_act_fn,
     attn_num_head_channels,
+    transformer_layers_per_block=1,
     cross_attention_dim=None,
     use_linear_projection=False,
+    only_cross_attention=False,
+    dtype="float16",
 ):
     up_block_type = (
         up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
@@ -161,6 +177,7 @@ def get_up_block(
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
+            dtype=dtype,
         )
     elif up_block_type == "CrossAttnUpBlock2D":
         if cross_attention_dim is None:
@@ -168,6 +185,7 @@ def get_up_block(
                 "cross_attention_dim must be specified for CrossAttnUpBlock2D"
             )
         return CrossAttnUpBlock2D(
+            transformer_layers_per_block=transformer_layers_per_block,
             num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
@@ -179,6 +197,8 @@ def get_up_block(
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
             use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            dtype=dtype,
         )
     elif up_block_type == "AttnUpBlock2D":
         return AttnUpBlock2D(
@@ -223,6 +243,7 @@ def get_up_block(
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
+            dtype=dtype,
         )
     raise ValueError(f"{up_block_type} does not exist.")
 
@@ -234,6 +255,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block=1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -244,6 +266,7 @@ def __init__(
         output_scale_factor=1.0,
         cross_attention_dim=1280,
         use_linear_projection=False,
+        dtype="float16",
         **kwargs,
     ):
         super().__init__()
@@ -267,6 +290,7 @@ def __init__(
                 non_linearity=resnet_act_fn,
                 output_scale_factor=output_scale_factor,
                 pre_norm=resnet_pre_norm,
+                dtype=dtype,
             )
         ]
         attentions = []
@@ -277,9 +301,10 @@ def __init__(
                     in_channels,
                     attn_num_head_channels,
                     in_channels // attn_num_head_channels,
-                    depth=1,
+                    depth=transformer_layers_per_block,
                     context_dim=cross_attention_dim,
                     use_linear_projection=use_linear_projection,
+                    dtype=dtype,
                 )
             )
             resnets.append(
@@ -294,6 +319,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
 
@@ -317,6 +343,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -329,6 +356,8 @@ def __init__(
         downsample_padding=1,
         add_downsample=True,
         use_linear_projection=False,
+        only_cross_attention=False,
+        dtype="float16",
     ):
         super().__init__()
 
@@ -352,6 +381,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
             attentions.append(
@@ -359,9 +389,11 @@ def __init__(
                     out_channels,
                     attn_num_head_channels,
                     out_channels // attn_num_head_channels,
-                    depth=1,
+                    depth=transformer_layers_per_block,
                     context_dim=cross_attention_dim,
                     use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    dtype=dtype,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
@@ -376,6 +408,7 @@ def __init__(
                         out_channels=out_channels,
                         padding=downsample_padding,
                         name="op",
+                        dtype=dtype,
                     )
                 ]
             )
@@ -415,6 +448,7 @@ def __init__(
         output_scale_factor=1.0,
         add_downsample=True,
         downsample_padding=1,
+        dtype="float16",
     ):
         super().__init__()
         resnets = []
@@ -433,6 +467,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
 
@@ -447,6 +482,7 @@ def __init__(
                         out_channels=out_channels,
                         padding=downsample_padding,
                         name="op",
+                        dtype=dtype,
                     )
                 ]
             )
@@ -478,6 +514,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -490,6 +527,8 @@ def __init__(
         downsample_padding=1,
         add_upsample=True,
         use_linear_projection=False,
+        only_cross_attention=False,
+        dtype="float16",
     ):
         super().__init__()
 
@@ -515,6 +554,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
             attentions.append(
@@ -522,9 +562,11 @@ def __init__(
                     out_channels,
                     attn_num_head_channels,
                     out_channels // attn_num_head_channels,
-                    depth=1,
+                    depth=transformer_layers_per_block,
                     context_dim=cross_attention_dim,
                     use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    dtype=dtype,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
@@ -532,7 +574,14 @@ def __init__(
 
         if add_upsample:
             self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+                [
+                    Upsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        dtype=dtype,
+                    )
+                ]
             )
         else:
             self.upsamplers = None
@@ -578,6 +627,7 @@ def __init__(
         resnet_pre_norm: bool = True,
         output_scale_factor=1.0,
         add_upsample=True,
+        dtype="float16",
     ):
         super().__init__()
         resnets = []
@@ -598,6 +648,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
 
@@ -605,12 +656,21 @@ def __init__(
 
         if add_upsample:
             self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+                [
+                    Upsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        dtype=dtype,
+                    )
+                ]
             )
         else:
             self.upsamplers = None
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+    def forward(
+        self, hidden_states, res_hidden_states_tuple, temb=None
+    ):
         for resnet in self.resnets:
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
@@ -628,6 +688,82 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         return hidden_states
 
 
+def shape_to_list(shape):
+    return [
+        sample["symbolic_value"]
+        if type(sample) == Tensor
+        else sample._attrs["symbolic_value"]
+        for sample in shape
+    ]
+
+
+class DownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+        dtype="float16",
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    dtype=dtype,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                        dtype=dtype,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states):
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states
+
+
 class UpDecoderBlock2D(nn.Module):
     def __init__(
         self,
@@ -642,6 +778,7 @@ def __init__(
         resnet_pre_norm: bool = True,
         output_scale_factor=1.0,
         add_upsample=True,
+        dtype="float16",
     ):
         super().__init__()
         resnets = []
@@ -661,6 +798,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
 
@@ -668,7 +806,14 @@ def __init__(
 
         if add_upsample:
             self.upsamplers = nn.ModuleList(
-                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+                [
+                    Upsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        dtype=dtype,
+                    )
+                ]
             )
         else:
             self.upsamplers = None
@@ -702,6 +847,7 @@ def __init__(
         attn_num_head_channels=1,
         attention_type="default",
         output_scale_factor=1.0,
+        dtype="float16",
         **kwargs,
     ):
         super().__init__()
@@ -728,6 +874,7 @@ def __init__(
                 non_linearity=resnet_act_fn,
                 output_scale_factor=output_scale_factor,
                 pre_norm=resnet_pre_norm,
+                dtype=dtype,
             )
         ]
         attentions = []
@@ -742,6 +889,7 @@ def __init__(
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
                     num_groups=resnet_groups,
+                    dtype=dtype,
                 )
             )
             resnets.append(
@@ -756,6 +904,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     output_scale_factor=output_scale_factor,
                     pre_norm=resnet_pre_norm,
+                    dtype=dtype,
                 )
             )
 
diff --git a/examples/05_stable_diffusion/src/modeling/vae.py b/examples/05_stable_diffusion/src/modeling/vae.py
index f2bea6a43..576d1f058 100644
--- a/examples/05_stable_diffusion/src/modeling/vae.py
+++ b/examples/05_stable_diffusion/src/modeling/vae.py
@@ -17,10 +17,12 @@
 
 from typing import Tuple
 
-from aitemplate.frontend import nn, Tensor
+from aitemplate.compiler import ops
 
-from .unet_blocks import get_up_block, UNetMidBlock2D
+from aitemplate.frontend import nn, Tensor
 
+from .unet_blocks import get_down_block, get_up_block, UNetMidBlock2D
+from aitemplate.testing import detect_target
 
 class Decoder(nn.Module):
     def __init__(
@@ -34,12 +36,18 @@ def __init__(
         block_out_channels=(64,),
         layers_per_block=2,
         act_fn="silu",
+        dtype="float16",
     ):
         super().__init__()
         self.layers_per_block = layers_per_block
 
         self.conv_in = nn.Conv2dBias(
-            in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
+            in_channels,
+            block_out_channels[-1],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            dtype=dtype,
         )
 
         # mid
@@ -55,6 +63,7 @@ def __init__(
             attn_num_head_channels=None,
             resnet_groups=32,
             temb_channels=None,
+            dtype=dtype,
         )
 
         # up
@@ -78,6 +87,7 @@ def __init__(
                 resnet_eps=1e-6,
                 resnet_act_fn=act_fn,
                 attn_num_head_channels=None,
+                dtype=dtype,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -89,9 +99,15 @@ def __init__(
             num_groups=num_groups_out,
             eps=1e-6,
             use_swish=True,
+            dtype=dtype,
         )
         self.conv_out = nn.Conv2dBias(
-            block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1
+            block_out_channels[0],
+            out_channels,
+            kernel_size=3,
+            padding=1,
+            stride=1,
+            dtype=dtype,
         )
 
     def forward(self, z) -> Tensor:
@@ -111,6 +127,118 @@ def forward(self, z) -> Tensor:
         return sample
 
 
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        batch_size,
+        height,
+        width,
+        in_channels=3,
+        out_channels=3,
+        down_block_types=("DownEncoderBlock2D",),
+        block_out_channels=(64,),
+        layers_per_block=2,
+        norm_num_groups=32,
+        act_fn="silu",
+        double_z=True,
+        dtype="float16",
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+        conv_op = (
+            nn.Conv2dBiasFewChannels
+            if detect_target().name() == "cuda"
+            else nn.Conv2dBias
+        )
+        self.conv_in = conv_op(
+            in_channels,
+            block_out_channels[0],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            dtype=dtype,
+        )
+
+        self.mid_block = None
+        self.down_blocks = nn.ModuleList([])
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=self.layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                add_downsample=not is_final_block,
+                resnet_eps=1e-6,
+                downsample_padding=0,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attn_num_head_channels=None,
+                temb_channels=None,
+                dtype=dtype,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            batch_size,
+            height,
+            width,
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attn_num_head_channels=None,
+            resnet_groups=norm_num_groups,
+            temb_channels=None,
+            dtype=dtype,
+        )
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[-1],
+            num_groups=norm_num_groups,
+            eps=1e-6,
+            dtype=dtype,
+        )
+        self.conv_act = ops.silu
+
+        conv_out_channels = 2 * out_channels if double_z else out_channels
+        self.conv_out = nn.Conv2dBias(
+            block_out_channels[-1],
+            conv_out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            dtype=dtype,
+        )
+
+    def forward(self, x):
+        sample = x
+
+        sample = self.conv_in(sample)
+
+        for down_block in self.down_blocks:
+            sample = down_block(sample)
+
+        # middle
+        sample = self.mid_block(sample)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
 class AutoencoderKL(nn.Module):
     def __init__(
         self,
@@ -125,7 +253,9 @@ def __init__(
         layers_per_block: int = 1,
         act_fn: str = "silu",
         latent_channels: int = 4,
+        norm_num_groups: int = 32,
         sample_size: int = 32,
+        dtype="float16",
     ):
         super().__init__()
         self.decoder = Decoder(
@@ -138,15 +268,67 @@ def __init__(
             block_out_channels=block_out_channels,
             layers_per_block=layers_per_block,
             act_fn=act_fn,
+            dtype=dtype,
         )
         self.post_quant_conv = nn.Conv2dBias(
-            latent_channels, latent_channels, kernel_size=1, stride=1, padding=0
+            latent_channels,
+            latent_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            dtype=dtype,
+        )
+
+        self.encoder = Encoder(
+            batch_size,
+            height,
+            width,
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,
+            dtype=dtype,
+        )
+        self.quant_conv = nn.Conv2dBias(
+            2 * latent_channels,
+            2 * latent_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            dtype=dtype,
         )
 
     def decode(self, z: Tensor, return_dict: bool = True):
         z = self.post_quant_conv(z)
         dec = self.decoder(z)
+        dec._attrs["is_output"] = True
+        dec._attrs["name"] = "pixels"
         return dec
 
-    def forward(self):
-        raise NotImplementedError("Only decode() is implemented for AutoencoderKL!")
+    def encode(
+        self,
+        x: Tensor,
+        sample: Tensor = None,
+        return_dict: bool = True,
+        deterministic: bool = False,
+    ):
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        if sample is None:
+            return moments
+        mean, logvar = ops.chunk()(moments, 2, dim=3)
+        logvar = ops.clamp()(logvar, -30.0, 20.0)
+        std = ops.exp(0.5 * logvar)
+        # var = ops.exp(logvar)
+        # if deterministic:
+        #     var = std = Tensor(mean.shape(), value=0.0, dtype=mean._attrs["dtype"])
+        sample._attrs["shape"] = mean._attrs["shape"]
+        std._attrs["shape"] = mean._attrs["shape"]
+        z = mean + std * sample
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "latent"
+        return z
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
new file mode 100644
index 000000000..7de267035
--- /dev/null
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
@@ -0,0 +1,889 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from aitemplate.compiler import Model
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import is_invisible_watermark_available, logging, randn_tensor
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from .compile_lib.compile_clip_alt import map_clip
+from .compile_lib.compile_unet_alt import map_unet
+from .compile_lib.compile_vae_alt import map_vae
+
+from .inference_ait import (
+    clip_inference,
+    timestep_inference,
+    unet_inference,
+    vae_decode_inference,
+)
+
+
+if is_invisible_watermark_available():
+    from diffusers.pipelines.stable_diffusion_xl.watermark import (
+        StableDiffusionXLWatermarker,
+    )
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(
+        dim=list(range(1, noise_pred_text.ndim)), keepdim=True
+    )
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = (
+        guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    )
+    return noise_cfg
+
+
+class StableDiffusionXLAITPipeline(
+    DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        text_encoder_module_path: str,
+        text_encoder_2_module_path: str,
+        unet_module_path: str,
+        vae_module_path: str,
+        timestep_module_path: str,
+        apply_weights_to_modules: bool = True,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(
+            force_zeros_for_empty_prompt=force_zeros_for_empty_prompt
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
+
+        add_watermarker = (
+            add_watermarker
+            if add_watermarker is not None
+            else is_invisible_watermark_available()
+        )
+
+        if add_watermarker:
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
+        self.text_encoder_module_path = text_encoder_module_path
+        self.text_encoder_2_module_path = text_encoder_2_module_path
+        self.unet_module_path = unet_module_path
+        self.vae_module_path = vae_module_path
+        self.text_encoder_exe = None
+        self.text_encoder_2_exe = None
+        self.unet_exe = None
+        self.vae_exe = None
+        self.timestep_exe = Model(timestep_module_path)
+
+    def apply_vae(self):
+        self.vae_exe = Model(self.vae_module_path)
+        self.vae_exe.set_many_constants_with_tensors(map_vae(self.vae))
+
+    def apply_clip(self):
+        self.text_encoder_exe = Model(self.text_encoder_module_path)
+        self.text_encoder_exe.nlayers = [
+            x for x in range(0, self.text_encoder.config.num_hidden_layers)
+        ]
+        self.text_encoder_2_exe = Model(self.text_encoder_2_module_path)
+        self.text_encoder_2_exe.nlayers = [
+            x for x in range(0, self.text_encoder_2.config.num_hidden_layers)
+        ]
+        self.text_encoder_exe.set_many_constants_with_tensors(
+            map_clip(self.text_encoder)
+        )
+        self.text_encoder_2_exe.set_many_constants_with_tensors(
+            map_clip(self.text_encoder_2)
+        )
+
+    def apply_unet(self):
+        self.unet_exe = Model(self.unet_module_path)
+        self.unet_exe.set_many_constants_with_tensors(map_unet(self.unet))
+
+    def unload_clip(self):
+        self.text_encoder_exe = None
+        self.text_encoder_2_exe = None
+        torch.cuda.empty_cache()
+
+    def unload_unet(self):
+        self.unet_exe = None
+        torch.cuda.empty_cache()
+
+    def unload_vae(self):
+        self.vae_exe = None
+        torch.cuda.empty_cache()
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+
+        # Define tokenizers and text encoders
+        tokenizers = (
+            [self.tokenizer, self.tokenizer_2]
+            if self.tokenizer is not None
+            else [self.tokenizer_2]
+        )
+        text_encoders = (
+            [self.text_encoder_exe, self.text_encoder_2_exe]
+            if self.text_encoder_exe is not None
+            else [self.text_encoder_2_exe]
+        )
+
+        prompt_2 = prompt_2 or prompt
+        # textual inversion: procecss multi-vector tokens if necessary
+        prompt_embeds_list = []
+        prompts = [prompt, prompt_2]
+        for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            text_input_ids = text_inputs.input_ids
+
+            # prompt_embeds = text_encoder(
+            #     text_input_ids.to(device),
+            #     output_hidden_states=True,
+            # )
+            prompt_embeds = clip_inference(text_encoder, text_input_ids, to_cpu=True)
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            if "text_embeds" in prompt_embeds.keys():
+                pooled_prompt_embeds = prompt_embeds["text_embeds"]
+            else:
+                pooled_prompt_embeds = prompt_embeds["pooled_output"]
+            # pooled_prompt_embeds = prompt_embeds[0]
+            # prompt_embeds = prompt_embeds.hidden_states[-2] # -2 because it includes last hidden state, AIT does not so uses -1
+            prompt_embeds = prompt_embeds[f"hidden_state_{text_encoder.nlayers[-1]}"]
+
+            prompt_embeds_list.append(prompt_embeds)
+
+        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = (
+            negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        )
+        if do_classifier_free_guidance and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(
+                uncond_tokens, tokenizers, text_encoders
+            ):
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                uncond_text_input_ids = uncond_input.input_ids
+
+                # negative_prompt_embeds = text_encoder(
+                #     uncond_input.input_ids.to(device),
+                #     output_hidden_states=True,
+                # )
+                negative_prompt_embeds = clip_inference(
+                    text_encoder, uncond_text_input_ids, to_cpu=True
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if "text_embeds" in negative_prompt_embeds.keys():
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[
+                        "text_embeds"
+                    ]
+                else:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[
+                        "pooled_output"
+                    ]
+                # negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                # negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] # -2 because it includes last hidden state, AIT does not so uses -1
+                prompt_embeds = negative_prompt_embeds[
+                    f"hidden_state_{text_encoder.nlayers[-1]}"
+                ]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        prompt_embeds = prompt_embeds
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(
+            bs_embed * num_images_per_prompt, seq_len, -1
+        )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=self.text_encoder_2.dtype
+            )
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
+                1, num_images_per_prompt, 1
+            )
+            negative_prompt_embeds = negative_prompt_embeds.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(
+            1, num_images_per_prompt
+        ).view(bs_embed * num_images_per_prompt, -1)
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(
+                1, num_images_per_prompt
+            ).view(bs_embed * num_images_per_prompt, -1)
+
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        if (callback_steps is None) or (
+            callback_steps is not None
+            and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (
+            not isinstance(prompt, str) and not isinstance(prompt, list)
+        ):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+        elif prompt_2 is not None and (
+            not isinstance(prompt_2, str) and not isinstance(prompt_2, list)
+        ):
+            raise ValueError(
+                f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}"
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, dtype=dtype)
+        else:
+            latents = latents
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids)
+            + self.text_encoder_2.config.projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_embeds = []
+        for time_id in add_time_ids:
+            time_embed = timestep_inference(self.timestep_exe, time_id, to_cpu=True)[
+                "time_embed"
+            ]
+            add_time_embeds.append(time_embed)
+
+        add_time_embeds = torch.cat(add_time_embeds, dim=-1).to(dtype=dtype)
+        return add_time_embeds
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+
+        self.apply_clip()
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+        )
+
+        self.unload_clip()
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        add_time_ids = self._get_add_time_ids(
+            original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
+        )
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat(
+                [negative_pooled_prompt_embeds, add_text_embeds], dim=0
+            )
+            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds
+        add_text_embeds = add_text_embeds
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+        add_embeds = torch.cat([add_text_embeds, add_time_ids], dim=-1)
+        # 8. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+
+        # 7.1 Apply denoising_end
+        if (
+            denoising_end is not None
+            and type(denoising_end) == float
+            and denoising_end > 0
+            and denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(
+                list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
+            )
+            timesteps = timesteps[:num_inference_steps]
+
+        self.apply_unet()
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                )
+
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+
+                # predict the noise residual
+                # added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                # noise_pred = self.unet(
+                #     latent_model_input,
+                #     t,
+                #     encoder_hidden_states=prompt_embeds,
+                #     cross_attention_kwargs=cross_attention_kwargs,
+                #     added_cond_kwargs=added_cond_kwargs,
+                #     return_dict=False,
+                # )[0]
+                noise_pred = unet_inference(
+                    self.unet_exe,
+                    latent_model_input,
+                    t,
+                    prompt_embeds,
+                    add_embeds=add_embeds,
+                    to_cpu=True,
+                )["latent_output"]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        self.unload_unet()
+
+        self.apply_vae()
+
+        if not output_type == "latent":
+            image = vae_decode_inference(
+                self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=True
+            )["pixels"]
+            self.unload_vae()
+        else:
+            image = latents
+            return StableDiffusionXLPipelineOutput(images=image)
+
+        # apply watermark if available
+        if self.watermark is not None:
+            image = self.watermark.apply_watermark(image)
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/fx2ait/fx2ait/__init__.py b/fx2ait/fx2ait/__init__.py
index d2ac413d0..d68b25ab9 100644
--- a/fx2ait/fx2ait/__init__.py
+++ b/fx2ait/fx2ait/__init__.py
@@ -14,7 +14,7 @@
 #
 import sys
 
-from . import acc_tracer, converters  # noqa
+from . import acc_tracer, converters, extension  # noqa
 
 if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
     PY3STATEMENT = "The minimal Python requirement is Python 3.7"
@@ -24,6 +24,7 @@
     "acc_tracer",
     "converters",
     "core",
+    "extension",
     "lower",
     "test",
 ]
diff --git a/fx2ait/fx2ait/acc_tracer/acc_ops.py b/fx2ait/fx2ait/acc_tracer/acc_ops.py
index 8d06d7d5c..4d078abe9 100644
--- a/fx2ait/fx2ait/acc_tracer/acc_ops.py
+++ b/fx2ait/fx2ait/acc_tracer/acc_ops.py
@@ -766,6 +766,50 @@ def addmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
         return add_node
 
 
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.addcmul),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("tensor1", "tensor1"),
+        ("tensor2", "tensor2"),
+        ("value", "value"),
+    ],
+)
+def addcmul_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
+    """
+    Mapping from torch.addcmul to acc_ops.mul and acc_ops.add. If value is not 1, then we do another acc_ops.mul again.
+    """
+
+    with node.graph.inserting_before(node):
+        mul_kwargs = {"input": node.kwargs["tensor1"], "other": node.kwargs["tensor2"]}
+        mul_node = node.graph.create_node(
+            "call_function", mul, kwargs=mul_kwargs, name=f"{node.name}_mul"
+        )
+        mul_node.meta = node.meta.copy()
+
+        input_node = mul_node
+        if node.kwargs["value"] != 1:
+            value_mul_kwargs = {"input": input_node, "other": node.kwargs["value"]}
+            new_input_node = node.graph.create_node(
+                "call_function",
+                mul,
+                kwargs=value_mul_kwargs,
+                name="{mul_node.name}_value_mul",
+            )
+            new_input_node.meta = input_node.meta.copy()
+            input_node = new_input_node
+
+        add_kwargs = {
+            "input": node.kwargs["input"],
+            "other": input_node,
+        }
+        add_node = node.graph.create_node(
+            "call_function", add, kwargs=add_kwargs, name=f"{node.name}_add"
+        )
+        add_node.meta = node.meta.copy()
+        return add_node
+
+
 @register_custom_acc_mapper_fn(
     op_and_target=("call_function", torch.t),
     arg_replacement_tuples=[
diff --git a/fx2ait/fx2ait/ait_splitter.py b/fx2ait/fx2ait/ait_splitter.py
index 5c3f87528..15eb25f7e 100644
--- a/fx2ait/fx2ait/ait_splitter.py
+++ b/fx2ait/fx2ait/ait_splitter.py
@@ -26,11 +26,6 @@
 from fx2ait.converters.converter_registry import AIT_CONVERTERS
 from fx2ait.fx2ait import AITInterpreter
 
-try:
-    torch.ops.load_library("//deeplearning/ait:AITModel")
-except BaseException:
-    torch.ops.load_library("build/libait_model.so")
-
 
 _VIEW_OPS = frozenset(
     (
diff --git a/fx2ait/fx2ait/converters/ait_converters.py b/fx2ait/fx2ait/converters/ait_converters.py
index 86c405dcf..b92bcc17b 100644
--- a/fx2ait/fx2ait/converters/ait_converters.py
+++ b/fx2ait/fx2ait/converters/ait_converters.py
@@ -67,7 +67,9 @@
     vector_norm,
 )
 
-from aitemplate.testing import detect_target
+from aitemplate.frontend.nn import Upsampling2d
+
+from fx2ait.acc_tracer import acc_ops, ait_acc_ops
 from torch.fx.node import Argument, Target
 
 from fx2ait.acc_tracer import acc_ops, ait_acc_ops
@@ -318,8 +320,20 @@ def acc_ops_linalg_norm(
     input_val = kwargs["input"]
     if not isinstance(input_val, AITTensor):
         raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+    if (
+        isinstance(kwargs["dim"], int)
+        and "ord" in kwargs
+        and kwargs["ord"] != 2
+        and kwargs["ord"] is not None
+    ):
+        # If dim is an int, the vector norm will be computed.
+        # For vector norm, the default ord is 2 if not specified
+        # otherwise, AIT hasn't implement it
+        raise RuntimeError("AIT linalg_norm only supports ord=2 use case!")
 
-    if "ord" not in kwargs or kwargs["ord"] != 2:
+    if not isinstance(kwargs["dim"], int) and (
+        "ord" not in kwargs or kwargs["ord"] != 2
+    ):
         raise RuntimeError("AIT linalg_norm only supports ord=2 use case!")
 
     # Hard code ord_kind=2 for l2 norm
@@ -393,6 +407,20 @@ def acc_ops_abs(
     return elementwise(FuncEnum.ABS)(input_val)
 
 
+@ait_converter(acc_ops.exp)
+def acc_ops_exp(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+    if not isinstance(input_val, AITTensor):
+        raise RuntimeError(f"Unexpected input for {name}: {input_val}")
+
+    return elementwise(FuncEnum.EXP)(input_val)
+
+
 @ait_converter(acc_ops.log)
 def acc_ops_log(
     target: Target,
@@ -1137,6 +1165,32 @@ def _is_int_list(iterable):
     return expand()(input_val, shape)
 
 
+@ait_converter(acc_ops.interpolate)
+def ait_acc_ops_interpolate(
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> ConverterOutput:
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, AITTensor):
+        raise ValueError(f"Non-tensor inputs for {name}: {input_val}")
+
+    scale_factor = kwargs["scale_factor"]
+    if not scale_factor:
+        raise ValueError("scale_factor cannot be empty")
+
+    mode = kwargs["mode"]
+    if not mode:
+        raise ValueError("mode cannot be empty")
+
+    op = Upsampling2d(scale_factor=scale_factor, mode=mode)
+
+    res = op(ait_nchw2nhwc(input_val))
+    return ait_nhwc2nchw(res)
+
+
 @ait_converter(acc_ops.batch_norm)
 def acc_ops_batch_norm(
     target: Target,
@@ -1221,6 +1275,9 @@ def _choose_conv2d_op(
     if last_dim < 4:
         weight = pad_last_dim(len(weight._attrs["shape"]), 4)(weight)
         x = pad_last_dim(len(x._attrs["shape"]), 4)(x)
+    elif last_dim > 4 and last_dim < 8:
+        weight = pad_last_dim(len(weight._attrs["shape"]), 8)(weight)
+        x = pad_last_dim(len(x._attrs["shape"]), 8)(x)
     elif last_dim % 2 != 0:
         return RuntimeError(
             f"Conv2d is not implemented for input channel dim {last_dim}: it needs to be aligned to a multiple of 2/4/8"
@@ -1563,6 +1620,7 @@ def acc_ops_contiguous(
 
 
 @ait_converter(acc_ops.to_dtype)
+@ait_converter(acc_ops.dtype)
 def acc_ops_to_dtype(
     target: Target,
     args: Tuple[Argument, ...],
diff --git a/fx2ait/fx2ait/example/benchmark_utils.py b/fx2ait/fx2ait/example/benchmark_utils.py
index 25ea436b5..f2d308eb5 100644
--- a/fx2ait/fx2ait/example/benchmark_utils.py
+++ b/fx2ait/fx2ait/example/benchmark_utils.py
@@ -24,8 +24,6 @@
 
 from fx2ait.fx2ait import AITInterpreter
 
-torch.ops.load_library("build/libait_model.so")
-
 
 def verify_accuracy(
     mod: torch.nn.Module,
diff --git a/fx2ait/fx2ait/extension.py b/fx2ait/fx2ait/extension.py
new file mode 100644
index 000000000..b45c98d21
--- /dev/null
+++ b/fx2ait/fx2ait/extension.py
@@ -0,0 +1,56 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import importlib.machinery
+import logging
+import os
+
+import torch
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def is_oss_ait_model():
+    return False
+
+
+def _get_extension_path(lib_name):
+
+    lib_dir = os.path.dirname(__file__)
+
+    loader_details = (
+        importlib.machinery.ExtensionFileLoader,
+        importlib.machinery.EXTENSION_SUFFIXES,
+    )
+
+    extfinder = importlib.machinery.FileFinder(lib_dir, loader_details)
+    ext_specs = extfinder.find_spec(lib_name)
+    if ext_specs is None:
+        raise ImportError
+
+    return ext_specs.origin
+
+
+try:
+    torch.ops.load_library("//deeplearning/ait:AITModel")
+    logger.info("===Load non-OSS AITModel===")
+
+except (ImportError, OSError):
+    lib_path = _get_extension_path("libait_model")
+    torch.ops.load_library(lib_path)
+    logger.info("===Load OSS AITModel===")
+
+    def is_oss_ait_model():  # noqa: F811
+        return True
diff --git a/fx2ait/fx2ait/fx2ait.py b/fx2ait/fx2ait/fx2ait.py
index cd9dcbee2..909a4f42e 100644
--- a/fx2ait/fx2ait/fx2ait.py
+++ b/fx2ait/fx2ait/fx2ait.py
@@ -69,7 +69,8 @@ def __init__(
         save_remote_cache: Optional[bool] = False,
         do_optimize_graph: bool = True,
         use_fast_math: bool = True,
-        profile_timeout: int = 300,
+        profile_timeout: int = 500,
+        optimize_for_compilation_time: bool = False,
     ):
         """
         Args:
@@ -89,6 +90,7 @@ def __init__(
             save_remote_cache: whether to save the updated cache
             use_fast_math: whether to use fast math in CUDA kernels
             profile_timeout: timeout in seconds for AIT profilers to complete
+            optimize_for_compilation_time: we use O1 and disable the ProfileImpl function to reduce compilation time.
         """
         super().__init__(module)
 
@@ -112,6 +114,7 @@ def __init__(
             _LOGGER.info(f"Set CACHE_DIR to {self.cache_dir}")
         self.use_fp16_acc = use_fp16_acc
         self.use_fast_math = use_fast_math
+        self.optimize_for_compilation_time = optimize_for_compilation_time
         self.hardware_target = self._create_target()
         self.input_specs = input_specs
         self.input_specs_iter = 0
@@ -138,6 +141,7 @@ def _create_target(self):
             use_fp16_acc=self.use_fp16_acc,
             remote_cache_bytes=self.remote_cache_bytes,
             use_fast_math=self.use_fast_math,
+            optimize_for_compilation_time=self.optimize_for_compilation_time,
         )
 
     def _load_profile_cache(self) -> bytes:
diff --git a/fx2ait/fx2ait/lower/lower.py b/fx2ait/fx2ait/lower/lower.py
index 55746d49a..a58bb6315 100644
--- a/fx2ait/fx2ait/lower/lower.py
+++ b/fx2ait/fx2ait/lower/lower.py
@@ -36,8 +36,6 @@
 logger: logging.Logger = logging.getLogger(__name__)
 Input = Sequence[Any]
 
-torch.ops.load_library("build/libait_model.so")
-
 
 # A list of (function, target) pairs to not apply acc normalization
 # to when scripting. For one reason or another, these targets do
diff --git a/fx2ait/fx2ait/lower/lower_settings.py b/fx2ait/fx2ait/lower/lower_settings.py
index e9bcfaea4..cfaef6a37 100644
--- a/fx2ait/fx2ait/lower/lower_settings.py
+++ b/fx2ait/fx2ait/lower/lower_settings.py
@@ -83,3 +83,5 @@ class LowerSettings:
     load_ait_dir: Optional[str] = None
     # jit.trace AITModule
     trace_ait_module: bool = True
+    # If True, optimize for compilation time (ie. compile w/ -O1 rather than -O3 and skip profiling codegen)
+    optimize_for_compilation_time: bool = False
diff --git a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
index b1eb84704..fced0db3e 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_conv2d.py
@@ -30,6 +30,7 @@ class TestConv2dConverter(AITTestCase):
             param("non_unary_params", 3, 2, padding=1, bias=False),
             param("dilation", 1, dilation=2),
             param("multi_group", 1, 1, 1, 1, 3, bias=True),
+            param("in_channel_padding_gt_4_lt_8", 1, in_channel=7),
         ]
     )
     def test_conv2d(
@@ -40,13 +41,14 @@ def test_conv2d(
         padding=0,
         dilation=1,
         groups=1,
+        in_channel=3,
         bias=True,
     ):
         class TestModule(torch.nn.Module):
             def __init__(self):
                 super().__init__()
                 self.conv = torch.nn.Conv2d(
-                    3, 36, kernel_size, stride, padding, dilation, groups, bias
+                    in_channel, 36, kernel_size, stride, padding, dilation, groups, bias
                 )
                 self.relu = torch.nn.ReLU()
 
@@ -54,7 +56,7 @@ def forward(self, x):
                 return self.relu(self.conv(x))
 
         model = TestModule().cuda().half()
-        inputs = [torch.randn(1, 3, 224, 224).cuda().half()]
+        inputs = [torch.randn(1, in_channel, 224, 224).cuda().half()]
         self.run_test(
             model,
             inputs,
diff --git a/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
index 90d5d5b4a..20bc7a0ff 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_linalg_norm.py
@@ -42,6 +42,24 @@ class TestLinalgConverter(AITTestCase):
                 dim=1,
                 keepdims=True,
             ),
+            param(
+                "vector_norm_dim_3",
+                input_shape=[1, 100, 40, 40],
+                dim=3,
+                keepdims=False,
+            ),
+            param(
+                "vector_norm_dim_2",
+                input_shape=[1, 100, 40, 40],
+                dim=2,
+                keepdims=False,
+            ),
+            param(
+                "vector_norm_dim_1",
+                input_shape=[1, 100, 40, 40],
+                dim=-1,
+                keepdims=True,
+            ),
         ]
     )
     def test_linalg_norm(
diff --git a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
index 23be04bc5..672fecd30 100644
--- a/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
+++ b/fx2ait/fx2ait/test/converters/test_ait_unary_ops.py
@@ -37,6 +37,7 @@
     (torch.sqrt, acc_ops.sqrt),
     (torch.clone, acc_ops.clone),
     (torch.neg, acc_ops.neg),
+    (torch.exp, acc_ops.exp),
 ]
 
 TestEnvToPrecision: Dict[TestEnv, Set[LowerPrecision]] = {
diff --git a/fx2ait/fx2ait/test/converters/test_ait_upsampling2d.py b/fx2ait/fx2ait/test/converters/test_ait_upsampling2d.py
new file mode 100644
index 000000000..20b506fc0
--- /dev/null
+++ b/fx2ait/fx2ait/test/converters/test_ait_upsampling2d.py
@@ -0,0 +1,42 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import torch
+from fx2ait.acc_tracer import acc_ops
+from fx2ait.tools.common_fx2ait import AITTestCase
+from parameterized import param, parameterized
+
+
+class TestInterpolateConverter(AITTestCase):
+    @parameterized.expand(
+        [
+            param(scale_factor=1, mode="nearest"),
+            param(scale_factor=2, mode="nearest"),
+            param(scale_factor=2, mode="bilinear"),
+        ]
+    )
+    def test_interpolate(self, scale_factor, mode):
+        class TestModule(torch.nn.Module):
+            def forward(self, y: torch.Tensor) -> torch.Tensor:
+                x = torch.nn.functional.interpolate(
+                    y, scale_factor=scale_factor, mode=mode
+                )
+                return x
+
+        model = TestModule().cuda().half()
+        inputs = [
+            torch.randn([2, 8, 16, 16]).half().cuda(),
+        ]
+
+        self.run_test(model, inputs, expected_ops={acc_ops.interpolate})
diff --git a/fx2ait/fx2ait/test/test_fx2ait.py b/fx2ait/fx2ait/test/test_fx2ait.py
index 39f6efa05..880354a70 100644
--- a/fx2ait/fx2ait/test/test_fx2ait.py
+++ b/fx2ait/fx2ait/test/test_fx2ait.py
@@ -21,17 +21,11 @@
 
 from fx2ait.acc_tracer import acc_tracer
 from fx2ait.ait_module import AITModule
+from fx2ait.extension import is_oss_ait_model
 from fx2ait.fx2ait import AITInterpreter
 
-OSS_AIT_MODEL = False
-try:
-    torch.ops.load_library("//deeplearning/ait:AITModel")
-except Exception:
-    torch.ops.load_library("build/libait_model.so")
-    OSS_AIT_MODEL = True
-
 AIT_MODEL_CLASS = (
-    torch.classes.ait.AITModel if OSS_AIT_MODEL else torch.classes.fb.AITModel
+    torch.classes.ait.AITModel if is_oss_ait_model() else torch.classes.fb.AITModel
 )
 
 
@@ -96,7 +90,7 @@ def _test_fx2ait_impl(self, test_serialization=False, test_cuda_graph=False):
             ait_mod = torch.jit.load(buf)
         ait_output = ait_mod(*inputs)
         torch.testing.assert_close(ait_output, ref_output, atol=0.1, rtol=0.1)
-        if not OSS_AIT_MODEL:
+        if not is_oss_ait_model():
             weights = {
                 "_0_weight": torch.ones(3, 4).cuda().half(),
                 "_0_bias": torch.randn(4).cuda().half(),
diff --git a/fx2ait/fx2ait/tools/common_aten2ait.py b/fx2ait/fx2ait/tools/common_aten2ait.py
index bd350c0c5..1a603848a 100644
--- a/fx2ait/fx2ait/tools/common_aten2ait.py
+++ b/fx2ait/fx2ait/tools/common_aten2ait.py
@@ -24,7 +24,7 @@
 import executorch.exir as exir
 import torch
 from aitemplate.compiler.public import DynamicProfileStrategy
-from executorch.exir import CaptureConfig, ServerCompileConfig
+from executorch.exir import CaptureConfig
 
 from fx2ait.ait_module import AITModule
 from fx2ait.fx2ait import AITInterpreter
@@ -102,15 +102,20 @@ def generate_graph(
         if customized_passes:
             passes_list.extend(customized_passes)
 
-        fx_module = exir.capture(
-            mod,
-            tuple(original_inputs),
-            CaptureConfig(
-                pt2_mode=True,
-                enable_functionalization=False,
-                enable_dynamic_shape=True,
-            ),
-        )._to_server(ServerCompileConfig(passes=passes_list))
+        fx_module = (
+            exir.capture(
+                mod,
+                tuple(original_inputs),
+                CaptureConfig(
+                    pt2_mode=True,
+                    enable_functionalization=False,
+                    enable_dynamic_shape=True,
+                    _use_old_decomp_table=True,
+                ),
+            )
+            .transform(*tuple(passes_list))
+            .exported_program.graph_module
+        )
 
         fx_module = run_const_fold(fx_module)
         _LOGGER.info(f"aten fx graph: {fx_module.graph}")
diff --git a/fx2ait/fx2ait/tools/common_fx2ait.py b/fx2ait/fx2ait/tools/common_fx2ait.py
index 898a14234..4d25997a1 100644
--- a/fx2ait/fx2ait/tools/common_fx2ait.py
+++ b/fx2ait/fx2ait/tools/common_fx2ait.py
@@ -26,21 +26,13 @@
 from fx2ait.acc_tracer import acc_tracer
 from fx2ait.acc_tracer.ait_acc_normalizer import update_acc_op_mappers_for_ait
 from fx2ait.ait_module import AITModule
+from fx2ait.extension import is_oss_ait_model
 from fx2ait.fx2ait import AITInterpreter
 from fx2ait.tensor_spec import TensorSpec
 from torch.fx.node import map_aggregate
 
 logger: logging.Logger = logging.getLogger(__name__)
 
-OSS_AITModel = False
-try:
-    torch.ops.load_library("//deeplearning/ait:AITModel")
-    logger.info("===Load non-OSS AITModel===")
-except Exception:
-    torch.ops.load_library("build/libait_model.so")
-    logger.info("===Load OSS AITModel===")
-    OSS_AITModel = True
-
 
 class LowerPrecision(Enum):
     FP32 = "fp32"
@@ -107,6 +99,7 @@ def run_test(
         leaf_module: Callable = None,  # one leaf module
         apply_passes_to_lowered_module_only=False,
         use_fp16_acc=True,
+        fail_on_nan=False,
     ):
         # TODO: add precision to interpreter once AIT supports multiple precision level
         # TODO: @qxy11 remove permute options once AIT supports channels-first format
@@ -114,7 +107,10 @@ def run_test(
 
         leaf_module_list = []
         if leaf_module:
-            leaf_module_list.append(leaf_module)
+            if isinstance(leaf_module, list):
+                leaf_module_list.extend(leaf_module)
+            else:
+                leaf_module_list.append(leaf_module)
 
         orig_mod = copy.deepcopy(mod)
         orig_mod.eval()
@@ -165,7 +161,7 @@ def run_test(
             interp_result = interp.run()
             sec = time.perf_counter() - start
             logger.info(f"Interpreter run time(s):{sec}")
-            if OSS_AITModel:
+            if is_oss_ait_model():
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
                         interp_result.engine.lib_path,
@@ -216,13 +212,17 @@ def run_test(
                     out = map_aggregate(
                         out, lambda output: output.permute(*permute_outputs)
                     )
+                out = out.cpu()
+                if out.numel() != 0:
+                    max_diff = torch.max(torch.abs(out - ref)).item()
+                    logger.info(f"Max diff = {max_diff}")
                 torch.testing.assert_close(
-                    out.cpu(),
+                    out,
                     ref,
                     rtol=rtol,
                     atol=atol,
                     check_dtype=False,
-                    equal_nan=True,
+                    equal_nan=not fail_on_nan,
                 )
 
     def run_test_with_dynamic_shape(
@@ -291,7 +291,7 @@ def run_test_with_dynamic_shape(
             interp_result = interp.run()
             sec = time.perf_counter() - start
             logger.info(f"Interpreter run time(s):{sec}")
-            if OSS_AITModel:
+            if is_oss_ait_model():
                 ait_mod = AITModule(
                     torch.classes.ait.AITModel(
                         interp_result.engine.lib_path,
diff --git a/fx2ait/setup.py b/fx2ait/setup.py
index cb1420a9b..d3c185ee1 100644
--- a/fx2ait/setup.py
+++ b/fx2ait/setup.py
@@ -13,98 +13,67 @@
 #  limitations under the License.
 #
 
+import glob
 import os
-import subprocess
-import sys
-from pathlib import Path
 
-from setuptools import Extension, find_packages, setup
-from setuptools.command.build_ext import build_ext
-
-
-class CMakeExtension(Extension):
-    def __init__(self, name):
-        Extension.__init__(self, name, sources=[])
-
-
-class CMakeBuild(build_ext):
-    def run(self):
-        try:
-            subprocess.check_output(["cmake", "--version"])
-        except OSError as exc:
-            raise RuntimeError(
-                "CMake must be installed to build the following extensions: "
-                + ", ".join(e.name for e in self.extensions)
-            ) from exc
-
-        try:
-            import torch.utils
-
-            cmake_prefix_path = torch.utils.cmake_prefix_path
-        except ModuleNotFoundError as exc:
-            raise RuntimeError(
-                "Cannot import torch.utils. Check torch installation."
-            ) from exc
-
-        build_directory = os.path.abspath(self.build_temp)
-        cmake_args = [
-            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + build_directory,
-            "-DPYTHON_EXECUTABLE=" + sys.executable,
-            "-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
-        ]
-
-        cfg = "Debug" if self.debug else "Release"
-        build_args = ["--config", cfg]
-
-        # cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-
-        # Assuming Makefiles
-        build_args += ["--", "-j2"]
-
-        self.build_args = build_args
-
-        env = os.environ.copy()
-        env["CXXFLAGS"] = '{} -DVERSION_INFO=\\"{}\\"'.format(
-            env.get("CXXFLAGS", ""), self.distribution.get_version()
-        )
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-
-        # CMakeLists.txt is in the same directory as this setup.py file
-        cmake_list_dir = os.path.abspath(os.path.dirname(__file__))
-        print("-" * 10, "Running CMake prepare", "-" * 40)
-        subprocess.check_call(
-            ["cmake", cmake_list_dir] + cmake_args, cwd=self.build_temp, env=env
+from setuptools import find_packages, setup
+
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+
+def get_extensions():
+    print("Compiling extensions with following flags:")
+    debug_mode = os.getenv("DEBUG", "0") == "1"
+    print(f"  DEBUG: {debug_mode}")
+    nvcc_flags = os.getenv("NVCC_FLAGS", "")
+    print(f"  NVCC_FLAGS: {nvcc_flags}")
+    if nvcc_flags == "":
+        nvcc_flags = []
+    else:
+        nvcc_flags = nvcc_flags.split(" ")
+    extra_compile_args = {"cxx": [], "nvcc": nvcc_flags}
+
+    if debug_mode:
+        print("Compiling in debug mode")
+        extra_compile_args["cxx"].append("-g")
+        extra_compile_args["cxx"].append("-O0")
+        if "nvcc" in extra_compile_args:
+            # we have to remove "-OX" and "-g" flag if exists and append
+            nvcc_flags = extra_compile_args["nvcc"]
+            extra_compile_args["nvcc"] = [
+                f for f in nvcc_flags if not ("-O" in f or "-g" in f)
+            ]
+            extra_compile_args["nvcc"].append("-O0")
+            extra_compile_args["nvcc"].append("-g")
+
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "fx2ait", "csrc")
+
+    src = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    inc = [extensions_dir]
+    inc += [os.path.abspath(os.path.join(this_dir, "../static/include"))]
+    inc += [os.path.abspath(os.path.join(this_dir, "../3rdparty/picojson"))]
+    define_macros = []
+
+    ext_modules = [
+        CUDAExtension(
+            name="fx2ait.libait_model",
+            sources=src,
+            include_dirs=inc,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
         )
+    ]
+    return ext_modules
 
-        print("-" * 10, "Building extensions", "-" * 40)
-        cmake_cmd = ["cmake", "--build", "."] + self.build_args
-        subprocess.check_call(cmake_cmd, cwd=self.build_temp)
-        # Move from build temp to final position
-        for ext in self.extensions:
-            self.move_output(ext)
-
-    def move_output(self, ext):
-        build_temp = Path(self.build_temp).resolve()
-        lib_name = "lib" + ext.name + ".so"
-        dest_path = build_temp.parents[0] / lib_name
-        source_path = build_temp / lib_name
-        dest_directory = dest_path.parents[0]
-        dest_directory.mkdir(parents=True, exist_ok=True)
-        self.copy_file(source_path, dest_path)
-
-
-ext_modules = [
-    CMakeExtension("ait_model"),
-]
 
 setup(
     name="fx2ait",
     version="0.2.dev1",
     description="FX2AIT: Convert PyTorch Models to AITemplate",
-    zip_safe=True,
+    zip_safe=False,
     install_requires=["torch"],  # We will need torch>=1.13
     packages=find_packages(),
-    ext_modules=ext_modules,
-    cmdclass=dict(build_ext=CMakeBuild),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": BuildExtension.with_options(no_python_abi_suffix=True)},
 )
diff --git a/python/aitemplate/__init__.py b/python/aitemplate/__init__.py
index 60a116a71..2a449b9fc 100644
--- a/python/aitemplate/__init__.py
+++ b/python/aitemplate/__init__.py
@@ -16,6 +16,7 @@
 
 from aitemplate import backend, compiler, frontend, testing, utils
 from aitemplate._libinfo import __version__  # noqa
+from aitemplate.utils.misc import setup_logger
 
 if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7):
     PY3STATEMENT = "The minimal Python requirement is Python 3.7"
@@ -23,4 +24,4 @@
 
 __all__ = ["backend", "compiler", "frontend", "testing", "utils"]
 
-root_logger = utils.misc.setup_logger(__name__)
+root_logger = setup_logger(__name__)
diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index bb37202c7..842529215 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -338,6 +338,9 @@ def __init__(
         self.target = Target.current()
         self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl")
         self.f_ptr_decl = registry.get(self.target.name() + ".lib.void_ptr_decl")
+        self.dtype_to_backend_type = registry.get(
+            self.target.name() + ".lib.dtype_to_backend_type"
+        )
 
         self.constants_data_file = constants_data_file
 
@@ -386,11 +389,9 @@ def __init__(
         self.graph = graph
 
         self.num_inputs, self.num_outputs = count_inputs_outputs(graph)
-        (self.max_blob_size, self.max_constant_blob_size, self.workspace,) = (
-            max_blob_size,
-            max_constant_blob_size,
-            workspace,
-        )
+        self.max_blob_size = max_blob_size
+        self.max_constant_blob_size = max_constant_blob_size
+        self.workspace = workspace
 
         self.debug_settings = (
             AITDebugSettings() if debug_settings is None else debug_settings
@@ -773,13 +774,15 @@ def _process_src_ops(self, node: Tensor) -> None:
                     self.state_record.add(func._attrs["name"])
             self._process_dims_for_op(func)
 
-        if self.debug_settings.check_all_nan_and_inf or node._attrs.get(
-            "check_nan_and_inf", False
-        ):
+        if (
+            self.debug_settings.check_all_nan_and_inf
+            or node._attrs.get("check_nan_and_inf", False)
+        ) and (not isinstance(node, IntVarTensor)):
             self._append_check_nan_and_inf(node)
-        if self.debug_settings.check_all_outputs or node._attrs.get(
-            "check_outputs", False
-        ):
+        if (
+            self.debug_settings.check_all_outputs
+            or node._attrs.get("check_outputs", False)
+        ) and (not isinstance(node, IntVarTensor)):
             self._append_check_outputs(node)
 
     def _append_check_nan_and_inf(self, node: Tensor):
@@ -798,7 +801,11 @@ def _append_check_outputs(self, node: Tensor):
         elem_cnt = "*".join([shape.pseudo_code() for shape in node.shape()])
         self.func_name_seq.append("output_check")
 
-        code_text = f'    InvokeOutputsChecker(reinterpret_cast<half*>({tensor_name}), "{tensor_name}", {elem_cnt}, stream);\n'
+        backend_type = self.dtype_to_backend_type(node._attrs["dtype"])
+        code_text = (
+            f"    InvokeOutputsChecker((const {backend_type}*)({tensor_name}), "
+            f'"{tensor_name}", {elem_cnt}, stream);\n'
+        )
         self.func_seq.append(code_text)
         self._rendered_checks_func_code.append(code_text)
 
@@ -870,7 +877,7 @@ def _generate_simple_multistream_ops(
 
         # sort all operators by parallel execution order
         ops_by_order = defaultdict(list)
-        for (op, tracking) in time_stats.op_parallel_trackers.items():
+        for op, tracking in time_stats.op_parallel_trackers.items():
             ops_by_order[tracking.execution_order].append(op)
 
         # convert Dict[int, List[Operator]] into List[List[Operator]]
diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py
index 71d7101a1..9d9af7b84 100644
--- a/python/aitemplate/backend/common/elementwise_common.py
+++ b/python/aitemplate/backend/common/elementwise_common.py
@@ -229,13 +229,12 @@
 {{head}}
 
 #include "jagged.h"
+{{custom_libs}}
 
 namespace {
 
 {{constant}}
 
-{{custom_libs}}
-
 {{tensor_accessor_lib}}
 
 {{kernel_function}}
diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py
index a20f7dd3e..49961916a 100644
--- a/python/aitemplate/backend/common/tensor/slice_common.py
+++ b/python/aitemplate/backend/common/tensor/slice_common.py
@@ -291,6 +291,11 @@
       break;
     }
   }
+  // We have a full slice for the entire input
+  if (flatten_index == -1) {
+    flatten_index = 0;
+  }
+
   int64_t input_start_offset =
       compute_input_linear_index<Rank>(input_strides,
                                        slice_start_indices,
@@ -348,15 +353,36 @@
 
   slice_meta_data.num_elems[input_idx] = 1;
   for ({{index_type}}  i = 0; i < Rank; i++) {
-    assert(slice_start_indices[i] >= 0 &&
-           slice_start_indices[i] <= input_shape[i]);
-    assert(slice_end_indices[i] >= 0 && slice_end_indices[i] <= input_shape[i]);
-    assert(slice_start_indices[i] <= slice_end_indices[i]);
-
-    slice_meta_data.num_elems[input_idx] *=
-        slice_end_indices[i] - slice_start_indices[i];
-    slice_meta_data.slice_start_indices[input_idx][i] = slice_start_indices[i];
-    slice_meta_data.slice_end_indices[input_idx][i] = slice_end_indices[i];
+    int64_t slice_start_idx = slice_start_indices[i];
+    int64_t slice_end_idx = slice_end_indices[i];
+    int64_t input_dim = input_shape[i];
+
+    if (!(slice_start_idx >= 0 && slice_start_idx <= input_dim)) {
+        throw std::runtime_error("invalid slice_start_idx: " +
+            std::to_string(slice_start_idx) +
+            ", input_dim: " +
+            std::to_string(input_dim) +
+            ", i: " + std::to_string(i));
+    }
+    if (!(slice_end_idx >= 0 && slice_end_idx <= input_dim)) {
+        throw std::runtime_error("invalid slice_end_idx: " +
+            std::to_string(slice_end_idx) +
+            ", input_dim: " +
+            std::to_string(input_dim) +
+            ", i: " + std::to_string(i));
+    }
+    if (slice_start_idx > slice_end_idx) {
+        throw std::runtime_error(
+            "expected slice_start_idx <= slice_end_idx but got slice_start_idx: " +
+            std::to_string(slice_start_idx) +
+            " and slice_end_idx: " +
+            std::to_string(slice_end_idx) +
+            ", i: " + std::to_string(i));
+    }
+
+    slice_meta_data.num_elems[input_idx] *= slice_end_idx - slice_start_idx;
+    slice_meta_data.slice_start_indices[input_idx][i] = slice_start_idx;
+    slice_meta_data.slice_end_indices[input_idx][i] = slice_end_idx;
   }
 
   slice_meta_data.dim_sizes[input_idx] =
@@ -383,10 +409,20 @@
 
   // meta data for placing sliced output
   scatter_meta_data.output_strides[Rank-1] = 1;
+  if (output_shape[Rank-1] < 0) {
+    throw std::runtime_error("invalid output_shape[Rank-1]: " +
+        std::to_string(output_shape[Rank-1]) +
+        ", Rank: " + std::to_string(Rank));
+  }
   scatter_meta_data.output_shape[Rank-1] = output_shape[Rank-1];
   for ({{index_type}}  i = Rank - 2; i >= 0; i--) {
     scatter_meta_data.output_strides[i] =
         scatter_meta_data.output_strides[i+1] * output_shape[i+1];
+    if (output_shape[i] < 0) {
+      throw std::runtime_error("invalid output_shape[i]: " +
+          std::to_string(output_shape[i]) +
+          ", i: " + std::to_string(i));
+    }
     scatter_meta_data.output_shape[i] = output_shape[i];
   }
 
@@ -423,6 +459,11 @@
     }
   }
 
+  if (max_num_elems <= 0) {
+    throw std::runtime_error("invalid max_num_elems: " +
+        std::to_string(max_num_elems));
+  }
+
   {{index_type}}  m = max_num_elems % (ThreadsPerBlock * ElemsPerThread) != 0;
   {{index_type}}  num_blocks_x =
       (max_num_elems / (ThreadsPerBlock * ElemsPerThread)) + m;
@@ -465,6 +506,12 @@
   std::vector<int64_t> slice_start_indices(rank);
   std::vector<int64_t> slice_end_indices(rank);
   for ({{index_type}}  i = 0; i < rank; i++) {
+    if (input_shape[i] < 0) {
+        throw std::runtime_error("invalid input_shape: " +
+            std::to_string(input_shape[i]) +
+            ", i: " +
+            std::to_string(i));
+    }
     slice_start_indices[i] = orig_slice_start_indices[i] < 0 ?
                              input_shape[i] + orig_slice_start_indices[i]:
                              orig_slice_start_indices[i];
@@ -549,6 +596,9 @@
   if (scatter_dim >= rank) {
     throw std::runtime_error("scatter_dim must < rank!");
   }
+  if (num_inputs < 1) {
+    throw std::runtime_error("num_inputs must be larger than 0!");
+  }
 
   // clip slip start and end indices
   std::vector<std::vector<int64_t>> slice_start_indices(num_inputs);
diff --git a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
index afe305ff2..ad03f6680 100644
--- a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py
@@ -38,9 +38,7 @@
 @registry.reg("cuda.fused_elementwise.gen_function")
 def fused_elementwise_gen_function(func_attrs: Dict[str, Any]) -> str:
     """Generates fused_elementwise function definition."""
-    custom_libs = Target.current().get_custom_libs(
-        os.path.dirname(__file__), "custom_math.cuh"
-    )
+    custom_libs = '#include "custom_math.cuh"'
     return elementwise_common.fused_elementwise_gen_function(
         func_attrs=func_attrs,
         custom_libs=custom_libs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
index 3cf6eecc4..c77d279cc 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/__init__.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py
@@ -32,6 +32,7 @@
     gemm_rcr_permute,
     gemm_rcr_permute_elup1,
     gemm_rrr,
+    gemm_rrr_bias,
     gemm_rrr_permute,
     group_gemm_rcr,
     group_gemm_rcr_bias,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
index 696734094..f4b0a0d07 100644
--- a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py
@@ -107,6 +107,36 @@ def gemm_rrr_config(func_attrs, dtype="float16"):
             op.C.element = cutlass_lib.library.DataType.void
 
 
+def common_gen_profiler(
+    func_attrs,
+    workdir,
+    profiler_filename,
+    dim_info_dict,
+    src_template,
+    problem_args_template,
+    problem_args_template_cutlass_3x=None,
+    bias_ptr_arg=None,
+    extra_code="",
+):
+    output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
+        stride_dim="*b_dim1"
+    )
+    return common.gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=src_template,
+        problem_args_template=problem_args_template,
+        problem_args_template_cutlass_3x=problem_args_template_cutlass_3x,
+        args_parser_template=ARGS_PARSER_TEMPLATE,
+        support_split_k=True,
+        output_addr_calculator=output_addr_calculator,
+        bias_ptr_arg=bias_ptr_arg,
+        extra_code=extra_code,
+    )
+
+
 @registry.reg("cuda.gemm_rrr.gen_profiler")
 def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render(
@@ -126,6 +156,38 @@ def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
     )
 
 
+def get_input_addr_calculator(func_attrs):
+    input_a_batch_stride_dim = "M * K"
+    input_a_stride_k_dim = "K"
+    input_a_offset = 0
+    input_b_batch_stride_dim = "K * N"
+    input_b_stride_k_dim = "N"
+    input_b_offset = 0
+
+    if "input_accessors" in func_attrs:
+        input_a_accessor = func_attrs["input_accessors"][0]
+        input_b_accessor = func_attrs["input_accessors"][1]
+        if input_a_accessor.is_from_strided_tensor:
+            input_a_offset = input_a_accessor.offset
+            shapes = input_a_accessor.original_shapes
+            input_a_stride_k_dim = input_a_accessor.stride(len(shapes) - 2)
+
+        if input_b_accessor.is_from_strided_tensor:
+            input_b_offset = input_b_accessor.offset
+            shapes = input_b_accessor.original_shapes
+            input_b_stride_k_dim = input_b_accessor.stride(len(shapes) - 2)
+
+    input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render(
+        input_a_batch_stride_dim=input_a_batch_stride_dim,
+        input_a_stride_dim=input_a_stride_k_dim,
+        input_a_offset_val=input_a_offset,
+        input_b_batch_stride_dim=input_b_batch_stride_dim,
+        input_b_stride_dim=input_b_stride_k_dim,
+        input_b_offset_val=input_b_offset,
+    )
+    return input_addr_calculator
+
+
 @registry.reg("cuda.gemm_rrr.gen_function")
 def gen_function(
     func_attrs,
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_bias.py
new file mode 100644
index 000000000..42c675ff0
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_bias.py
@@ -0,0 +1,205 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+GEMM Specialization for
+C = GeMM(A, B) + bias
+where A[RowMajor][M, K], B[ColMajor][K, N], bias[RowMajor][N]
+"""
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda.gemm_universal import common, common_bias, gemm_rrr
+from aitemplate.backend.cuda.gemm_universal.layout import RRR
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+EXTRA_CODE = jinja2.Template(
+    """
+using elem_input_type = {{elem_input_type}};
+using elem_output_type = {{elem_output_type}};
+"""
+)
+
+
+# used for real execution
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr) + input_a_offset,          // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr) + input_b_offset,          // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    input_a_batch_stride,                                    // int64_t batch_stride_A
+    input_b_batch_stride,                                    // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    input_a_stride,                                          // typename LayoutA::Stride::LongIndex lda
+    input_b_stride,                                          // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
+"""
+)
+
+
+# for profiler, no need to include TensorAccessor
+PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,                 // GemmUniversalMode mode
+    cutlass::gemm::GemmCoord{
+        static_cast<coord_t>(M),
+        static_cast<coord_t>(N),
+        static_cast<coord_t>(K)
+    },                                                       // GemmCoord problem_size
+    split_k,                                                 // int batch_count
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1)},  // typename EpilogueOutputOp::Params epilogue
+    ({{elem_input_type}}*)(a_ptr),                           // void const * ptr_A
+    ({{elem_input_type}}*)(b_ptr),                           // void const * ptr_B
+    ({{elem_input_type}}*)(bias_ptr),                        // void const * ptr_C
+    ({{elem_output_type}}*)(c_ptr) + output_offset,          // void * ptr_D
+    M * K,                                                   // int64_t batch_stride_A
+    K * N,                                                   // int64_t batch_stride_B
+    N,                                                       // int64_t batch_stride_C
+    M * N,                                                   // int64_t batch_stride_D
+    K,                                                       // typename LayoutA::Stride::LongIndex lda
+    N,                                                       // typename LayoutB::Stride::LongIndex ldb
+    0,                                                       // typename LayoutC::Stride::LongIndex ldc
+    output_stride,                                           // typename LayoutC::Stride::LongIndex ldd
+"""
+)
+
+
+@registry.reg("cuda.gemm_rrr_bias.config")
+def gemm_rrr_config(func_attrs, dtype="float16"):
+    common.make_fproc(func_attrs, RRR)
+
+
+@registry.reg("cuda.gemm_rrr_bias.gen_profiler")
+def gen_profiler(func_attrs, workdir, profiler_filename, dim_info_dict):
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    extra_code = EXTRA_CODE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
+    return gemm_rrr.common_gen_profiler(
+        func_attrs=func_attrs,
+        workdir=workdir,
+        profiler_filename=profiler_filename,
+        dim_info_dict=dim_info_dict,
+        src_template=common_bias.SRC_TEMPLATE,
+        problem_args_template=PROFILER_PROBLEM_ARGS_TEMPLATE,
+        bias_ptr_arg="memory_pool->RequestTensorByIdx(3)",
+        extra_code=extra_code,
+    )
+
+
+@registry.reg("cuda.gemm_rrr_bias.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+):
+    input_addr_calculator = gemm_rrr.get_input_addr_calculator(func_attrs)
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    output_ndims = len(func_attrs["output_accessors"][0].original_shapes)
+    backend_spec = CUDASpec()
+    elem_input_type = backend_spec.dtype_to_lib_type(
+        func_attrs["inputs"][0]._attrs["dtype"]
+    )
+    elem_output_type = backend_spec.dtype_to_lib_type(
+        func_attrs["outputs"][0]._attrs["dtype"]
+    )
+    problem_args = PROBLEM_ARGS_TEMPLATE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
+    extra_code = EXTRA_CODE.render(
+        elem_input_type=elem_input_type,
+        elem_output_type=elem_output_type,
+    )
+    return common.gen_function(
+        func_attrs=func_attrs,
+        src_template=common_bias.SRC_TEMPLATE,
+        exec_cond_template=exec_cond_template,
+        problem_args=problem_args,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        output_ndims=output_ndims,
+        dim_info_dict=dim_info_dict,
+        support_split_k=True,
+        input_addr_calculator=input_addr_calculator,
+        output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render(
+            stride_dim="N", output_accessor=func_attrs["output_accessors"][0]
+        ),
+        extra_code=extra_code,
+    )
+
+
+@registry.reg("cuda.gemm_rrr_bias.func_decl")
+def gen_function_decl(func_attrs):
+    func_name = func_attrs["name"]
+    input_ndims = len(func_attrs["input_accessors"][0].original_shapes)
+    weight_ndims = len(func_attrs["input_accessors"][1].original_shapes)
+    return common_bias.FUNC_DECL_TEMPLATE.render(
+        func_name=func_name,
+        input_ndims=input_ndims,
+        weight_ndims=weight_ndims,
+        support_split_k=True,
+    )
+
+
+@registry.reg("cuda.gemm_rrr_bias.func_call")
+def gen_function_call(func_attrs, indent="  "):
+    bias = func_attrs["inputs"][2]
+    return common.gen_function_call(
+        func_attrs, indent, bias_ptr_arg=bias._attrs["name"]
+    )
+
+
+@registry.reg("cuda.gemm_rrr_bias.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
index 2bf72cb3b..e11eb4c28 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py
@@ -269,7 +269,11 @@ def group_layernorm_sigmoid_mul_gen_function_call(func_attrs, indent="  "):
 
     all_shape_funcs = []
     # all Ms are the same
-    input_0_shapes = inputs[0]._attrs["shape"]
+    if func_attrs.get("input_accessors", None):
+        input_accessor = func_attrs["input_accessors"][0]
+        input_0_shapes = input_accessor.original_shapes
+    else:
+        input_0_shapes = inputs[0]._attrs["shape"]
     norm_ndim = len(func_attrs["normalized_shape"][0])
     m_name = "M"
     m_shape_func = layernorm_common.generate_m_shape_func(
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
index a29179ea8..1b562d124 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh
@@ -2142,10 +2142,19 @@ cudaError_t invokeGroupLayernormSigmoidMul(
     return cudaSuccess;
   }
 
+  bool accessors_aligned_to_4 = true;
+  for (size_t i = 0; i < b; ++i) {
+    if (!input_accessors[i].is_valid_alignment(4) ||
+        !output_accessors[i].is_valid_alignment(4)) {
+      accessors_aligned_to_4 = false;
+      break;
+    }
+  }
+
   dim3 grid(m, b);
   // TODO: implement float4 group kernel
   if (std::is_same<T, half>::value && n_is_multiple_of_4 && (min_n >= 128) &&
-      (max_n <= 4096)) {
+      (max_n <= 4096) && accessors_aligned_to_4) {
     dim3 block(min_n);
     // round up to multiples of 32 to make warp shuffles safe
     block.x = (block.x / 4 + 31) / 32 * 32;
diff --git a/python/aitemplate/backend/cuda/lib_template.py b/python/aitemplate/backend/cuda/lib_template.py
index 1c42108b1..56cd310bf 100644
--- a/python/aitemplate/backend/cuda/lib_template.py
+++ b/python/aitemplate/backend/cuda/lib_template.py
@@ -18,6 +18,7 @@
 import jinja2
 
 from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import CUDASpec
 
 # pylint: disable=C0301
 
@@ -36,3 +37,9 @@ def void_ptr_decl(name, dtype="float16", indent="  "):
     # FIXME: we keep dtype in void_ptr_decl's param list because rocm needs it.
     # We will remove it once we support general tensor type for rocm
     return PTR_TEMPLATE.render(name=name, dtype="void*", indent=indent)
+
+
+@registry.reg("cuda.lib.dtype_to_backend_type")
+def dtype_to_backend_type(dtype):
+    backend_spec = CUDASpec()
+    return backend_spec.dtype_to_backend_type(dtype)
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_3d.py b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
index c8728b9b1..a259d3974 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_3d.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_3d.py
@@ -28,6 +28,7 @@
 from aitemplate.backend.common import tensor_accessor_codegen
 
 from aitemplate.backend.cuda.reduce import reduce_small_axis
+from aitemplate.backend.target import Target
 
 
 DEFAULT_PROLOGUE_TEMPLATE = jinja2.Template(
@@ -830,7 +831,13 @@ def gen_function(
     output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"])
     if accumulation_type is None:
         # follow pytorch's semantics
-        acc_type = output_type
+        if (
+            Target.current()._kwargs.get("use_fp16_acc", False)
+            and y._attrs["dtype"] == "float16"
+        ):
+            acc_type = output_type
+        else:
+            acc_type = "float"
     else:
         acc_type = accumulation_type
 
diff --git a/python/aitemplate/backend/cuda/reduce/reduce_common.py b/python/aitemplate/backend/cuda/reduce/reduce_common.py
index c2416e1e9..43e40c7e7 100644
--- a/python/aitemplate/backend/cuda/reduce/reduce_common.py
+++ b/python/aitemplate/backend/cuda/reduce/reduce_common.py
@@ -18,6 +18,7 @@
 import jinja2
 
 from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.target import Target
 
 from aitemplate.compiler.base import IntImm, IntVar
 
@@ -88,7 +89,7 @@
   using Layout = cutlass::layout::TensorNHWC;
   // Match pytorch's behavior where the accumuation type is the same
   // as the output type
-  using ElementCompute = ElemOutputType;
+  using ElementCompute = {{accumulation_type}};
   using ReductionOp = {{reduction_op}}<ElementCompute>;
   constexpr int NUM_DIMS = 4;
   assert(rank <= NUM_DIMS);
@@ -193,9 +194,8 @@ def gen_function(func_attrs, reduction_op):
     elem_input_type = backend_spec.dtype_to_lib_type(
         func_attrs["inputs"][0]._attrs["dtype"]
     )
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
+    output_type = func_attrs["outputs"][0]._attrs["dtype"]
+    elem_output_type = backend_spec.dtype_to_lib_type(output_type)
 
     vector_lens_config = [32, 16, 8, 4, 1]
     exec_paths = ""
@@ -213,11 +213,17 @@ def gen_function(func_attrs, reduction_op):
         workspace_ptr = "workspace"
     else:
         workspace_ptr = "nullptr"
+
+    accumulation_type = "float"
+    if Target.current()._kwargs.get("use_fp16_acc", False) and output_type == "float16":
+        accumulation_type = elem_output_type
+
     return SRC_TEMPLATE.render(
         func_name=func_attrs["name"],
         reduction_op=reduction_op,
         exec_paths=exec_paths,
         workspace_ptr=workspace_ptr,
+        accumulation_type=accumulation_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/reduce/var.py b/python/aitemplate/backend/cuda/reduce/var.py
index 754b07cf8..80b5dc336 100644
--- a/python/aitemplate/backend/cuda/reduce/var.py
+++ b/python/aitemplate/backend/cuda/reduce/var.py
@@ -24,6 +24,7 @@
 from aitemplate.backend import registry
 from aitemplate.backend.backend_spec import CUDASpec
 from aitemplate.backend.cuda.reduce import reduce_3d
+from aitemplate.backend.target import Target
 
 
 EXTRA_CODE_TEMPLATE = jinja2.Template(
@@ -148,17 +149,17 @@
 } // namespace arch
 
 template <typename ElementT, bool BesselCorrection>
-struct NumericConverter<WelfordData<ElementT, BesselCorrection>,
+struct NumericConverter<WelfordData<{{acc_type}}, BesselCorrection>,
                         ElementT,
                         FloatRoundStyle::round_to_nearest> {
 
-  using result_type = WelfordData<ElementT, BesselCorrection>;
+  using result_type = WelfordData<{{acc_type}}, BesselCorrection>;
   using source_type = ElementT;
   static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
 
   CUTLASS_HOST_DEVICE
   static result_type convert(source_type const & s) {
-    return WelfordData<ElementT, BesselCorrection>(-1, static_cast<ElementT>(s), ElementT(0));
+    return WelfordData<{{acc_type}}, BesselCorrection>(-1, static_cast<{{acc_type}}>(s), {{acc_type}}(0));
   }
 
   CUTLASS_HOST_DEVICE
@@ -169,11 +170,11 @@
 
 template <typename ElementT, bool BesselCorrection>
 struct NumericConverter<ElementT,
-                        WelfordData<ElementT, BesselCorrection>,
+                        WelfordData<{{acc_type}}, BesselCorrection>,
                         FloatRoundStyle::round_to_nearest> {
 
   using result_type = ElementT;
-  using source_type = WelfordData<ElementT, BesselCorrection>;
+  using source_type = WelfordData<{{acc_type}}, BesselCorrection>;
   static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
 
   CUTLASS_HOST_DEVICE
@@ -183,14 +184,14 @@
       if (s.count <= 1) {
         return ElementT(nanf("Not a Number"));
       } else {
-        return s.m2 / ElementT((int)(s.count - 1));
+        return ElementT(s.m2) / ElementT((int)(s.count - 1));
       }
     } else {
       // sample variance
       if (s.count <= 0) {
         return ElementT(nanf("Not a Number"));
       } else {
-        return s.m2 / ElementT((int)(s.count));
+        return ElementT(s.m2) / ElementT((int)(s.count));
       }
     }
   }
@@ -294,17 +295,20 @@ def var_gen_function(func_attrs) -> str:
     """
     bessel = "true" if func_attrs["unbiased"] else "false"
     backend_spec = CUDASpec()
-    elem_output_type = backend_spec.dtype_to_lib_type(
-        func_attrs["outputs"][0]._attrs["dtype"]
-    )
-    acc_type = f"WelfordData<{elem_output_type}, {bessel}>"
+    output_type = func_attrs["outputs"][0]._attrs["dtype"]
+    elem_output_type = backend_spec.dtype_to_lib_type(output_type)
+
+    acc_type = "float"
+    if Target.current()._kwargs.get("use_fp16_acc", False) and output_type == "float16":
+        acc_type = elem_output_type
+    welford_type = f"WelfordData<{acc_type}, {bessel}>"
     return reduce_3d.gen_function(
         func_attrs,
         "cutlass::welford_op",
         reduce_3d.DEFAULT_PROLOGUE_TEMPLATE,
         reduce_3d.DEFAULT_EPILOGUE_SCALAR_TEMPLATE,
-        EXTRA_CODE_TEMPLATE.render(),
-        accumulation_type=acc_type,
+        EXTRA_CODE_TEMPLATE.render(acc_type=acc_type),
+        accumulation_type=welford_type,
     )
 
 
diff --git a/python/aitemplate/backend/cuda/softmax/softmax.py b/python/aitemplate/backend/cuda/softmax/softmax.py
index ad8d493ab..59a69865a 100644
--- a/python/aitemplate/backend/cuda/softmax/softmax.py
+++ b/python/aitemplate/backend/cuda/softmax/softmax.py
@@ -44,8 +44,6 @@
 {{func_signature}}
 {
   {{shape_functions}}
-  size_t m0 = {{m}};
-  size_t n = {{K}};
   size_t m = M;
   bool success = true;
 
@@ -64,7 +62,7 @@
     {% elif K > 3840 %}
       // K/8 > 480
       using vec8 = VecTFor<{{dtype}}>::vec8;
-      LaunchSoftmaxBlockAll<vec8, {{dtype}},{{K}}>(reinterpret_cast<const vec8*>(input), reinterpret_cast<vec8*>(output), M, stream, &success);
+      LaunchSoftmaxBlockAll<vec8, {{dtype}}, {{K}}>(reinterpret_cast<const vec8*>(input), reinterpret_cast<vec8*>(output), M, stream, &success);
     {% endif %}
   {% elif K % 4 == 0 %}
     // K % 4 == 0: vector4 kernels
@@ -77,7 +75,7 @@
     {% elif K > 1920 %}
       // K/4 > 480
       using vec4 = VecTFor<{{dtype}}>::vec4;
-      LaunchSoftmaxBlockAll<vec4,{{dtype}},{{K}}>(reinterpret_cast<const vec4*>(input), reinterpret_cast<vec4*>(output), M, stream, &success);
+      LaunchSoftmaxBlockAll<vec4, {{dtype}}, {{K}}>(reinterpret_cast<const vec4*>(input), reinterpret_cast<vec4*>(output), M, stream, &success);
     {% endif %}
   {% elif K % 2 == 0 %}
     // K % 2 == 0: vector2 kernels
@@ -90,7 +88,7 @@
     {% elif K > 1152 %}
       // K/2 > 576
       using vec2 = VecTFor<{{dtype}}>::vec2;
-      LaunchSoftmaxBlockAll<vec2,{{dtype}},{{K}}>(reinterpret_cast<const vec2*>(input), reinterpret_cast<vec2*>(output), M, stream, &success);
+      LaunchSoftmaxBlockAll<vec2, {{dtype}}, {{K}}>(reinterpret_cast<const vec2*>(input), reinterpret_cast<vec2*>(output), M, stream, &success);
     {% endif %}
   {% else %}
     // odd K
@@ -102,12 +100,12 @@
       LaunchSoftmaxK1Middle<{{dtype}}, {{K}}>(static_cast<const {{dtype}}*>(input), static_cast<{{dtype}}*>(output), M, stream);
     {% elif K > 1408 %}
       // K > 1408
-      LaunchSoftmaxBlockAll<{{dtype}},{{dtype}},{{K}}>( (const {{dtype}}*) input, ({{dtype}}*) output, m, stream, &success);
+      LaunchSoftmaxBlockAll<{{dtype}}, {{dtype}}, {{K}}>( (const {{dtype}}*) input, ({{dtype}}*) output, m, stream, &success);
     {% endif %}
   {% endif %}
 
   if (!success) {
-    softmaxBlockNocache<{{dtype}}><<<m, 1024, 0, stream>>>(({{dtype}}*)input, ({{dtype}}*)output, m, n);
+    softmaxBlockNocache<{{dtype}}><<<m, 1024, 0, stream>>>(({{dtype}}*)input, ({{dtype}}*)output, m, {{K}});
   }
 }
     """
@@ -116,7 +114,7 @@
 SHAPE_FUNCTIONS = jinja2.Template(
     """
     int64_t M = 1;
-{% for idx in range(input_ndim - 1) %}
+{% for idx in range(reduction_dim) %}
     M *= *in_{{idx}};
 {% endfor %}
     """
@@ -125,18 +123,19 @@
 FUNC_SIGNATURE = jinja2.Template(
     """
 void {{func_name}}(void* input,
-                   void* output,
-{% for idx in range(input_ndim - 1) %}
-                   int64_t* in_{{idx}},
+               void* output,
+{% for idx in range(reduction_dim) %}
+               int64_t* in_{{idx}},
 {% endfor %}
-                   cudaStream_t stream)
-    """
+               cudaStream_t stream)
+    """,
+    trim_blocks=True,
 )
 
 FUNC_DECL = jinja2.Template(
     """
-    {{func_signature}};
-    """
+{{func_signature}};
+    """,
 )
 
 FUNC_CALL_TEMPLATE = jinja2.Template(
@@ -144,20 +143,20 @@
 {{indent}}{{func_name}}(
 {{indent}}   {{input}},
 {{indent}}   {{output}},
-{% for name in input_dim_names[:-1] %}
-{{indent}}    &{{name}},
+{% for name in outer_dim_names %}
+{{indent}}   &{{name}},
 {% endfor %}
 {{indent}}   stream
 {{indent}});
-    """
+    """,
+    trim_blocks=True,
 )
 
 
 def get_func_signature(func_attrs: Dict[str, Any]) -> str:
-    input_ndim = func_attrs["inputs"][0]._rank()
     return FUNC_SIGNATURE.render(
         func_name=func_attrs["name"],
-        input_ndim=input_ndim,
+        reduction_dim=func_attrs["dim"],
     ).strip()
 
 
@@ -180,11 +179,6 @@ def find_tile_size(k: int) -> int:
 def softmax_gen_function(func_attrs: Dict[str, Any]) -> str:
     dim = func_attrs["dim"]
     shapes = func_attrs["inputs"][0]._attrs["shape"]
-    rank = len(shapes)
-
-    assert (
-        dim == rank - 1
-    ), f"softmax only supports dim == rank - 1, dim={dim}, rank={rank}"
 
     assert isinstance(
         shapes[dim], IntImm
@@ -201,7 +195,7 @@ def softmax_gen_function(func_attrs: Dict[str, Any]) -> str:
             os.path.dirname(__file__), "softmax.cuh"
         ),
         func_signature=get_func_signature(func_attrs),
-        shape_functions=SHAPE_FUNCTIONS.render(input_ndim=rank),
+        shape_functions=SHAPE_FUNCTIONS.render(reduction_dim=dim),
         dtype=elem_input_type,
         K=k,
         m=find_tile_size(k),
@@ -221,17 +215,18 @@ def softmax_gen_function_call(func_attrs, indent="  "):
     input_name = func_attrs["inputs"][0]._attrs["name"]
     output_name = func_attrs["outputs"][0]._attrs["name"]
 
-    shapes = func_attrs["inputs"][0]._attrs["shape"]
+    shape = func_attrs["inputs"][0]._attrs["shape"]
     assert (
-        len(shapes) >= 2
-    ), f"Softmax only supports input with rank >= 2, current rank: {len(shapes)}"
+        len(shape) >= 2
+    ), f"Softmax only supports input with rank >= 2, current rank: {len(shape)}"
 
-    input_dim_names = [shape._attrs["name"] for shape in shapes]
+    reduction_dim = func_attrs["dim"]
+    outer_dim_names = [dim._attrs["name"] for dim in shape[:reduction_dim]]
 
     return FUNC_CALL_TEMPLATE.render(
         func_name=func_attrs["name"],
         input=input_name,
         output=output_name,
-        input_dim_names=input_dim_names,
+        outer_dim_names=outer_dim_names,
         indent=indent,
     )
diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 0c63e4270..bc95469a5 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -265,6 +265,7 @@ class FBCUDA(CUDA):
     nvcc_option_json = None
     cutlass_path_ = None
     static_compile_options_ = None
+    optimize_for_compilation_time_ = False
 
     def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         from libfb.py import parutil
@@ -274,6 +275,15 @@ def __init__(self, arch="80", remote_cache_bytes=None, **kwargs):
         )
         cub_src_path = parutil.get_dir_path("aitemplate/AITemplate/fb/3rdparty/cub")
         static_files_path = parutil.get_dir_path("aitemplate/AITemplate/static")
+        if "optimize_for_compilation_time" in kwargs:
+            FBCUDA.optimize_for_compilation_time_ = kwargs[
+                "optimize_for_compilation_time"
+            ]
+        _LOGGER.info(
+            "Optimize for compilation time : {}".format(
+                FBCUDA.optimize_for_compilation_time_
+            )
+        )
         self._include_path = None
         if not FBCUDA.cutlass_path_:
             self._include_path = tempfile.mkdtemp()
@@ -403,9 +413,16 @@ def _build_compile_options(self):
                     "--expt-relaxed-constexpr",
                     f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
                     "-Xcompiler=-Wconversion",
-                    environ.get_compiler_opt_level(),
+                    environ.get_compiler_opt_level()
+                    if not FBCUDA.optimize_for_compilation_time_
+                    else "-O1",
                     "-std=c++17",
                 ]
+                + (
+                    ["-DOPTIMIZE_FOR_COMPILATION_TIME"]
+                    if FBCUDA.optimize_for_compilation_time_
+                    else []
+                )
             )
             if environ.enable_ptxas_info():
                 options.extend(
diff --git a/python/aitemplate/backend/cuda/tensor/__init__.py b/python/aitemplate/backend/cuda/tensor/__init__.py
index d93808023..4cef720ad 100644
--- a/python/aitemplate/backend/cuda/tensor/__init__.py
+++ b/python/aitemplate/backend/cuda/tensor/__init__.py
@@ -26,6 +26,7 @@
     full,
     gather,
     identity,
+    index_select,
     jagged_to_padded_dense,
     masked_select,
     padded_dense_to_jagged,
@@ -55,6 +56,7 @@
     "relational",
     "identity",
     "jagged_to_padded_dense",
+    "index_select",
     "masked_select",
     "padded_dense_to_jagged",
     "permute",
diff --git a/python/aitemplate/backend/cuda/tensor/argmax.py b/python/aitemplate/backend/cuda/tensor/argmax.py
index 0c3784d0f..e6d6e58a9 100644
--- a/python/aitemplate/backend/cuda/tensor/argmax.py
+++ b/python/aitemplate/backend/cuda/tensor/argmax.py
@@ -32,7 +32,9 @@
 #include <cub/cub.cuh>
 
 using bfloat16 = nv_bfloat16;
-
+// if this #if statement does not evaluate to True, it is already
+// defined in cub's util_type.cuh and would be a redefinition
+#if (__CUDACC_VER_MAJOR__ < 11 && CUDA_VERSION < 11000) || _NVHPC_CUDA
 namespace cub {
     template <>
     struct FpLimits<bfloat16>
@@ -54,7 +56,7 @@
     template<> struct Traits<bfloat16>
       : NumericTraits<bfloat16> {};
 }
-
+#endif
 """
 
 
diff --git a/python/aitemplate/backend/cuda/tensor/index_select.py b/python/aitemplate/backend/cuda/tensor/index_select.py
new file mode 100644
index 000000000..6e43764e1
--- /dev/null
+++ b/python/aitemplate/backend/cuda/tensor/index_select.py
@@ -0,0 +1,253 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define index_select codegen and CUDA kernel
+
+Example input:
+ - tensor of shape (6,5,4,3,2)
+ - dim = 2 (0->6, 1->5, 2->4, 3->3, 4->2)
+ - dim_len = 4
+ - dim_idxs = [1,2] (numbers taken from interval [0,3])
+ - dim_idx_len = 2
+ - num_before = 6*5
+ - num_after = 3*2
+
+Output tensor has dim (6,5,2,3,2) i.e.
+it has 6*5 (num_before) sets of 2 (dim_idx_len) sets of  3*2 (num_after) elements.
+
+Assuming contiguous memory layout of the original tensor (which seems like a base check for bad_tensor),
+the first few elements to be selected are at positions [6-11], [12-17] corresponding to dim_idxs values 1 and 2.
+Generalized to:
+    - Divide global thread_idx by num_after and calculate start of innermost set as the remainder
+    - Further divide by dim_idx_len and calculate start of next outer set as the remainder
+    - Use the final value as the offset for the outer most set
+    - Compute offset and assign to the element denoted by thread idx
+    - increment idx by grid stride
+
+Num threads = 256.
+Blocks are(N + threads - 1) / threads;
+
+"""
+import jinja2
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.backend_spec import CUDASpec
+from aitemplate.backend.cuda import cuda_common
+
+
+header_files = """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include <cub/cub.cuh>
+"""
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    {{input_type}}* /*output*/,
+    const {{input_type}}* /*input*/,
+    const {{index_type}} /*dim*/,
+    const {{index_type}} /*dim_len*/,
+    const {{index_type}}* /*dim_idxs*/,
+    const {{index_type}} /*dim_idxs_len*/,
+    const {{index_type}} /*num_before*/,
+    const {{index_type}} /*num_after*/,
+    cudaStream_t /*stream*/
+    );
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+__global__ void index_select_kernel(
+    {{input_type}}* output,
+    const {{input_type}}* input,
+    const {{index_type}} dim,
+    const {{index_type}} dim_len,
+    const {{index_type}}* dim_idxs,
+    const {{index_type}} dim_idxs_len,
+    const {{index_type}} num_before,
+    const {{index_type}} num_after,
+    const {{index_type}} N
+) {
+    auto idx = blockIdx.x*blockDim.x + threadIdx.x;
+    #pragma unroll
+    for(auto i = idx; i<N; i+=gridDim.x*blockDim.x) {
+        auto res = i;
+        auto k = i%num_after;
+        res = res/num_after;
+        auto j = res%dim_idxs_len;
+        res = res/dim_idxs_len;
+        auto skip = res*dim_len*num_after + (dim_idxs[j]*num_after) + k;
+        output[i] = input[skip];
+    }
+
+}
+
+void {{func_name}}(
+    {{input_type}}* output,
+    const {{input_type}}* input,
+    const {{index_type}} dim,
+    const {{index_type}} dim_len,
+    const {{index_type}}* dim_idxs,
+    const {{index_type}} dim_idxs_len,
+    const {{index_type}} num_before,
+    const {{index_type}} num_after,
+    cudaStream_t stream
+    ) {
+
+    {{index_type}} N =  num_before*dim_idxs_len*num_after;
+    const {{index_type}} threads  = 256;
+    auto blocks = (N + threads - 1) / threads;
+
+    index_select_kernel<<<blocks, threads, 0, stream>>>(output, input, dim, dim_len, dim_idxs,
+        dim_idxs_len, num_before, num_after, N);
+}
+"""
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  {{index_type}} x_dims[] = {
+{{indent}}      {{x_dims}}
+{{indent}}  };
+{{indent}}  {{index_type}} num_before = 1;
+{{indent}}  {{index_type}} num_after = 1;
+{{indent}}  {{index_type}} dim_len = x_dims[{{dim}}];
+{{indent}}  for(auto i=0;i<{{dim}};i++) {
+{{indent}}   num_before *= x_dims[i];
+{{indent}}  }
+{{indent}}  for(auto i={{dim}}+1;i<sizeof(x_dims)/sizeof(x_dims[0]);i++) {
+{{indent}}   num_after *= x_dims[i];
+{{indent}}  }
+{{indent}}  {{func_name}}(
+{{indent}}      {{output}},
+{{indent}}      {{input}},
+{{indent}}      {{dim}},
+{{indent}}      dim_len,
+{{indent}}      {{dim_idxs}},
+{{indent}}      {{dim_idxs_len}},
+{{indent}}      num_before,
+{{indent}}      num_after,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+"""
+)
+
+
+@registry.reg("cuda.index_select.gen_function")
+def gen_function(func_attrs) -> str:
+    """
+    Generate function body
+
+    Returns
+    -------
+    str
+        The function body string
+    """
+    backend_spec = CUDASpec()
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+
+    input_type = cuda_common.dtype_to_cuda_type(x._attrs["dtype"])
+    output_type = cuda_common.dtype_to_cuda_type(y._attrs["dtype"])
+
+    if input_type != output_type:
+        raise TypeError("input type must equal to output type")
+
+    return SRC_TEMPLATE.render(
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+        func_name=func_attrs["name"],
+        header_files=header_files,
+    )
+
+
+@registry.reg("cuda.index_select.func_decl")
+def gen_function_decl(func_attrs) -> str:
+    """
+    Generate function declaration.
+
+    Returns
+    -------
+    str
+        The function declaration string
+    """
+    backend_spec = CUDASpec()
+    x = func_attrs["inputs"][0]
+    input_type = cuda_common.dtype_to_cuda_type(x._attrs["dtype"])
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("cuda.index_select.func_call")
+def gen_function_call(func_attrs, indent="  ") -> str:
+    """
+    Generate function call.
+
+    Returns
+    -------
+    str
+        The function call string
+    """
+    backend_spec = CUDASpec()
+    x = func_attrs["inputs"][0]
+    dim_idxs = func_attrs["inputs"][1]
+    y = func_attrs["outputs"][0]
+    dim = func_attrs["dim"]
+
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    dim_idxs_ptr = backend_spec.cast_to_ptr_template.render(
+        name=dim_idxs._attrs["name"],
+        dtype=backend_spec.index_type,
+    )
+    input_ptr = backend_spec.cast_to_ptr_template.render(
+        name=x._attrs["name"],
+        dtype=dtype,
+    )
+
+    output_ptr = backend_spec.cast_to_ptr_template.render(
+        name=y._attrs["name"],
+        dtype=dtype,
+    )
+
+    x_dims = ", ".join(dim._attrs["name"] for dim in x._attrs["shape"])
+    dim_idxs_len = dim_idxs._attrs["shape"][0]._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        indent=indent,
+        index_type=backend_spec.index_type,
+        x_dims=x_dims,
+        input_type=dtype,
+        func_name=func_attrs["name"],
+        output=output_ptr,
+        input=input_ptr,
+        dim=dim,
+        dim_idxs=dim_idxs_ptr,
+        dim_idxs_len=dim_idxs_len,
+    )
diff --git a/python/aitemplate/backend/cuda/view_ops/view_ops.py b/python/aitemplate/backend/cuda/view_ops/view_ops.py
index 502b66bea..63f06765a 100644
--- a/python/aitemplate/backend/cuda/view_ops/view_ops.py
+++ b/python/aitemplate/backend/cuda/view_ops/view_ops.py
@@ -92,7 +92,7 @@ def _is_intvar(func_attrs):
 @registry.reg("cuda.reshape.gen_function")
 @registry.reg("cuda.flatten.gen_function")
 def reshape_gen_function(func_attrs, shape_eval_template):
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     unknown_idx = func_attrs["unknown_idx"]
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
     if _is_intvar(func_attrs):
@@ -120,7 +120,7 @@ def reshape_gen_function(func_attrs, shape_eval_template):
 @registry.reg("cuda.reshape.func_decl")
 @registry.reg("cuda.flatten.func_decl")
 def reshape_gen_function_decl(func_attrs):
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
     if _is_intvar(func_attrs):
         input_ndim = len(func_attrs["inputs"]) - 1
@@ -134,7 +134,7 @@ def reshape_gen_function_decl(func_attrs):
 @registry.reg("cuda.reshape.func_call")
 @registry.reg("cuda.flatten.func_call")
 def reshape_gen_function_call(func_attrs, indent="  "):
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     input_names = []
     if _is_intvar(func_attrs):
         for i, inp in enumerate(func_attrs["inputs"]):
@@ -171,7 +171,7 @@ def squeeze_gen_function(func_attrs, shape_eval_template):
     shape_eval_template : jinja2.Template
         The template that implements the logic for writing to dynamic shapes.
     """
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     out_dim_to_in = func_attrs["out_dim_to_in"]
 
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
@@ -206,7 +206,7 @@ def squeeze_gen_function_decl(func_attrs):
     func_attrs : Dict[str, Any]
         The _attrs dict from the original op.
     """
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     input_ndim = len(func_attrs["inputs"][0]._attrs["shape"])
     output_ndim = len(func_attrs["outputs"][0]._attrs["shape"])
 
@@ -227,7 +227,7 @@ def squeeze_gen_function_call(func_attrs, indent="  "):
     ident : str
         Sequence to use to generate the indentations in the CUDA code
     """
-    func_name = func_attrs["name"]
+    func_name = "ait_" + func_attrs["name"]
     input_names = [
         shape._attrs["name"] for shape in func_attrs["inputs"][0]._attrs["shape"]
     ]
diff --git a/python/aitemplate/backend/main_templates.py b/python/aitemplate/backend/main_templates.py
index 1de90904d..fa7705182 100644
--- a/python/aitemplate/backend/main_templates.py
+++ b/python/aitemplate/backend/main_templates.py
@@ -227,6 +227,9 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
 {% endif %}
 
     void ProfileImpl(StreamType stream, size_t iters, const std::string& filename) {
+#ifdef OPTIMIZE_FOR_COMPILATION_TIME
+      throw std::runtime_error("Profile is disabled, please recompile without OPTIMIZE_FOR_COMPILE_TIME flag");
+#else
       std::ofstream ss(filename);
       if (!ss) {
         throw std::runtime_error(std::string("Could not open file ") + filename);
@@ -286,6 +289,7 @@ class {{model_name}} : public ModelBase<{{model_name}}> {
       DeviceToDeviceCopies(stream);
       std::cout << "AIT per op profiling finished." << std::endl;
       FreeDeviceMemory(L2CacheSlab);
+#endif
     }
 
     static std::unique_ptr<{{model_name}}> Create(
diff --git a/python/aitemplate/backend/profiler_runner.py b/python/aitemplate/backend/profiler_runner.py
index a364aa771..66ef26c92 100644
--- a/python/aitemplate/backend/profiler_runner.py
+++ b/python/aitemplate/backend/profiler_runner.py
@@ -257,7 +257,7 @@ class ProfilerRunner:
     however, the results are empirically better compared to the previous runner.
     """
 
-    def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 300):
+    def __init__(self, devices: List[str], postprocessing_delegate, timeout: int = 500):
         """
         Parameters
         ----------
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index b71a20bce..49dd22b47 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -104,7 +104,7 @@
 
 HEADER_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 """
 )
 
@@ -119,7 +119,7 @@
 #include <random>
 #include <rocrand/rocrand.h>
 #include "logging.h"
-#include "include/ck/utility/print.hpp"
+
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
index 190f85694..f5807f017 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py
@@ -24,7 +24,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 
 
 namespace ck {
diff --git a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
index f43e42317..c82822610 100644
--- a/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
+++ b/python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py
@@ -24,7 +24,7 @@
 
 EXTRA_CODE = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 
 #include "ck/utility/data_type.hpp"
 
diff --git a/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py b/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
index b6441bf5c..145e9e846 100644
--- a/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
+++ b/python/aitemplate/backend/rocm/elementwise/fused_elementwise.py
@@ -35,9 +35,7 @@
 def fused_elementwise_gen_function(func_attrs: Dict[str, Any]) -> str:
     """Generates fused_elementwise function definition."""
 
-    custom_libs = Target.current().get_custom_libs(
-        os.path.dirname(__file__), "custom_math.h"
-    )
+    custom_libs = '#include "custom_math.h"'
     return elementwise_common.fused_elementwise_gen_function(
         func_attrs=func_attrs,
         custom_libs=custom_libs,
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 857029d87..39d9a4956 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -115,7 +115,7 @@
 #include <random>
 #include <rocrand/rocrand.h>
 #include "logging.h"
-#include "include/ck/utility/print.hpp"
+
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
diff --git a/python/aitemplate/backend/rocm/lib_template.py b/python/aitemplate/backend/rocm/lib_template.py
index 89fa95c89..6b97a0d21 100644
--- a/python/aitemplate/backend/rocm/lib_template.py
+++ b/python/aitemplate/backend/rocm/lib_template.py
@@ -18,6 +18,7 @@
 import jinja2
 
 from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
 
 # pylint: disable=W0613
 
@@ -46,3 +47,9 @@ def void_ptr_decl(name, dtype="float16", indent="  "):
     else:
         raise NotImplementedError
     return PTR_TEMPLATE.render(name=name, dtype=type_string, indent=indent)
+
+
+@registry.reg("rocm.lib.dtype_to_backend_type")
+def dtype_to_backend_type(dtype):
+    backend_spec = ROCMSpec()
+    return backend_spec.dtype_to_backend_type(dtype)
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index a059fac29..995144fe0 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -29,7 +29,9 @@
 
 EXTRA_HEADERS = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 """
 )
 
@@ -38,18 +40,22 @@
 {%if use_swish %}
 struct YElementOp
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
     {
-        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
-                          ck::is_same<T, ck::half_t>::value,
+        static_assert(ck::is_same<X, float>::value || ck::is_same<X, double>::value ||
+                          ck::is_same<X, ck::half_t>::value,
                       "Data type is not supported by this operation!");
 
-        T a;
+        static_assert(ck::is_same<Y, float>::value || ck::is_same<Y, double>::value ||
+                          ck::is_same<Y, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        X a;
 
         ck::tensor_operation::element_wise::Sigmoid{}(a, x);
 
-        y = x * a;
+        y = ck::type_convert<Y>(x * a);
     };
 };
 
@@ -96,7 +102,6 @@
     """
     C = C / G;
     std::vector<ck::index_t> i_inStrides;
-
     i_inStrides.push_back(H * W * G * C);
     i_inStrides.push_back(W * G * C);
     i_inStrides.push_back(G * C);
@@ -110,6 +115,10 @@
     gamma_beta_Strides.push_back(C);
     gamma_beta_Strides.push_back(1);
 
+    std::vector<ck::index_t> save_mean_strides;
+    save_mean_strides.push_back(G);
+    save_mean_strides.push_back(1);
+
     auto device_instance = {{instance}}{};
     auto argument_ptr = device_instance.MakeArgumentPointer(
         {static_cast<ck::index_t>(N),
@@ -121,8 +130,10 @@
         gamma_beta_Strides,
         gamma_beta_Strides,
         i_inStrides, // y stride
+        save_mean_strides,
+        save_mean_strides,
         {1, 2, 4}, // reduction dimension: [H, W, C]
-        1e-5,
+        1e-6,
         static_cast<ck::half_t *>(input),
         static_cast<ck::half_t *>(gamma),
         static_cast<ck::half_t *>(beta),
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index af3efcf24..acc3da838 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -29,7 +29,9 @@
 
 EXTRA_HEADERS = jinja2.Template(
     """
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 """
 )
 
@@ -66,6 +68,9 @@
     """
     std::vector<ck::index_t> i_inStrides;
     std::vector<ck::index_t> i_outStrides;
+    std::vector<ck::index_t> save_mean_strides;
+    save_mean_strides.push_back(1);
+
     {% if input_strides is defined %}
     i_inStrides.push_back({{input_strides[-2]}});
     i_inStrides.push_back({{input_strides[-1]}});
@@ -89,6 +94,8 @@
         std::vector<ck::index_t>{0, 1},
         std::vector<ck::index_t>{0, 1},
         i_outStrides,
+        save_mean_strides,
+        save_mean_strides,
         {1},
         {{eps}},
         static_cast<ck::half_t *>(input) + {{ input_offset if input_offset is defined else 0 }},
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index 328fa47ec..74076a529 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -201,7 +201,7 @@
 #include <random>
 #include <rocrand/rocrand.h>
 #include "logging.h"
-#include "include/ck/utility/print.hpp"
+
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
diff --git a/python/aitemplate/backend/rocm/pool2d/pool2d.py b/python/aitemplate/backend/rocm/pool2d/pool2d.py
index 3885ecc84..38cab6a87 100644
--- a/python/aitemplate/backend/rocm/pool2d/pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/pool2d.py
@@ -59,7 +59,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include "logging.h"
-#include "include/ck/utility/print.hpp"
+
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index cb8529f31..f6bcffd63 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -118,6 +118,7 @@ def _build_compile_options(self):
         options = [
             environ.get_compiler_opt_level(),
             "-fPIC",
+            "-mcmodel=medium",
             "-fvisibility=hidden",
             "-std=c++17",
             "-w",
@@ -126,14 +127,9 @@ def _build_compile_options(self):
                 self._pkg_path()
             ),
         ]
-        if self._arch in {"GFX908", "gfx908"}:
-            options.append("-DCK_AMD_GPU_GFX908")
-            options.append("--offload-arch=gfx908")
-        elif self._arch in {"GFX90a", "gfx90a"}:
-            options.append("-DCK_AMD_GPU_GFX90A")
-            options.append("--offload-arch=gfx90a")
-        else:
+        if self._arch.lower() not in {"gfx908", "gfx90a"}:
             raise RuntimeError("Unsupported GPU Arch")
+        options.append("--offload-arch=native")
         for path in ck_paths:
             options.append("-I" + path)
         options.append("-I" + os.path.join(self.static_files_path, "include"))
diff --git a/python/aitemplate/backend/rocm/tensor/__init__.py b/python/aitemplate/backend/rocm/tensor/__init__.py
index 62e7f3b00..d8da08c53 100644
--- a/python/aitemplate/backend/rocm/tensor/__init__.py
+++ b/python/aitemplate/backend/rocm/tensor/__init__.py
@@ -25,6 +25,7 @@
     expand_static_shape,
     full,
     identity,
+    index_select,
     permute021,
     permute0213,
     permute102,
diff --git a/python/aitemplate/backend/rocm/tensor/index_select.py b/python/aitemplate/backend/rocm/tensor/index_select.py
new file mode 100644
index 000000000..b0d9f2999
--- /dev/null
+++ b/python/aitemplate/backend/rocm/tensor/index_select.py
@@ -0,0 +1,246 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define index_select codegen and ROCM kernel
+
+Example input:
+ - tensor of shape (6,5,4,3,2)
+ - dim = 2 (0->6, 1->5, 2->4, 3->3, 4->2)
+ - dim_len = 4
+ - dim_idxs = [1,2] (numbers taken from interval [0,3])
+ - dim_idx_len = 2
+ - num_before = 6*5
+ - num_after = 3*2
+
+Output tensor has dim (6,5,2,3,2) i.e.
+it has 6*5 (num_before) sets of 2 (dim_idx_len) sets of  3*2 (num_after) elements.
+
+Assuming contiguous memory layout of the original tensor (which seems like a base check for bad_tensor),
+the first few elements to be selected are at positions [6-11], [12-17] corresponding to dim_idxs values 1 and 2.
+Generalized to:
+    - Divide global thread_idx by num_after and calculate start of innermost set as the remainder
+    - Further divide by dim_idx_len and calculate start of next outer set as the remainder
+    - Use the final value as the offset for the outer most set
+    - Compute offset and assign to the element denoted by thread idx
+    - increment idx by grid stride
+
+Num threads = 256.
+Blocks are(N + threads - 1) / threads;
+
+"""
+import jinja2
+
+from aitemplate.backend import registry
+from aitemplate.backend.backend_spec import ROCMSpec
+
+
+header_files = """
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+"""
+
+FUNC_DECL_TEMPLATE = jinja2.Template(
+    """
+void {{func_name}}(
+    {{input_type}}* /*output*/,
+    const {{input_type}}* /*input*/,
+    const {{index_type}} /*dim*/,
+    const {{index_type}} /*dim_len*/,
+    const {{index_type}}* /*dim_idxs*/,
+    const {{index_type}} /*dim_idxs_len*/,
+    const {{index_type}} /*num_before*/,
+    const {{index_type}} /*num_after*/,
+    hipStream_t /*stream*/
+    );
+"""
+)
+
+SRC_TEMPLATE = jinja2.Template(
+    """
+{{header_files}}
+
+__global__ void index_select_kernel(
+    {{input_type}}* output,
+    const {{input_type}}* input,
+    const {{index_type}} dim,
+    const {{index_type}} dim_len,
+    const {{index_type}}* dim_idxs,
+    const {{index_type}} dim_idxs_len,
+    const {{index_type}} num_before,
+    const {{index_type}} num_after,
+    const {{index_type}} N
+) {
+    auto idx = blockIdx.x*blockDim.x + threadIdx.x;
+    #pragma unroll
+    for(auto i = idx; i<N; i+=gridDim.x*blockDim.x) {
+        auto res = i;
+        auto k = i%num_after;
+        res = res/num_after;
+        auto j = res%dim_idxs_len;
+        res = res/dim_idxs_len;
+        auto skip = res*dim_len*num_after + (dim_idxs[j]*num_after) + k;
+        output[i] = input[skip];
+    }
+
+}
+
+void {{func_name}}(
+    {{input_type}}* output,
+    const {{input_type}}* input,
+    const {{index_type}} dim,
+    const {{index_type}} dim_len,
+    const {{index_type}}* dim_idxs,
+    const {{index_type}} dim_idxs_len,
+    const {{index_type}} num_before,
+    const {{index_type}} num_after,
+    hipStream_t stream
+    ) {
+
+    {{index_type}} N =  num_before*dim_idxs_len*num_after;
+    const {{index_type}} threads  = 256;
+    auto blocks = (N + threads - 1) / threads;
+
+    index_select_kernel<<<blocks, threads, 0, stream>>>(output, input, dim, dim_len, dim_idxs,
+        dim_idxs_len, num_before, num_after, N);
+}
+"""
+)
+
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}}  {{index_type}} x_dims[] = {
+{{indent}}      {{x_dims}}
+{{indent}}  };
+{{indent}}  {{index_type}} num_before = 1;
+{{indent}}  {{index_type}} num_after = 1;
+{{indent}}  {{index_type}} dim_len = x_dims[{{dim}}];
+{{indent}}  for(auto i=0;i<{{dim}};i++) {
+{{indent}}   num_before *= x_dims[i];
+{{indent}}  }
+{{indent}}  for(auto i={{dim}}+1;i<sizeof(x_dims)/sizeof(x_dims[0]);i++) {
+{{indent}}   num_after *= x_dims[i];
+{{indent}}  }
+{{indent}}  {{func_name}}(
+{{indent}}      {{output}},
+{{indent}}      {{input}},
+{{indent}}      {{dim}},
+{{indent}}      dim_len,
+{{indent}}      {{dim_idxs}},
+{{indent}}      {{dim_idxs_len}},
+{{indent}}      num_before,
+{{indent}}      num_after,
+{{indent}}      stream
+{{indent}}  );
+{{indent}}}
+"""
+)
+
+
+@registry.reg("rocm.index_select.gen_function")
+def gen_function(func_attrs) -> str:
+    """
+    Generate function body
+
+    Returns
+    -------
+    str
+        The function body string
+    """
+    backend_spec = ROCMSpec()
+    x = func_attrs["inputs"][0]
+    y = func_attrs["outputs"][0]
+
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
+    output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"])
+    if input_type != output_type:
+        raise TypeError("input type must equal to output type")
+
+    return SRC_TEMPLATE.render(
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+        func_name=func_attrs["name"],
+        header_files=header_files,
+    )
+
+
+@registry.reg("rocm.index_select.func_decl")
+def gen_function_decl(func_attrs) -> str:
+    """
+    Generate function declaration.
+
+    Returns
+    -------
+    str
+        The function declaration string
+    """
+    backend_spec = ROCMSpec()
+    x = func_attrs["inputs"][0]
+    input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"])
+    return FUNC_DECL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        input_type=input_type,
+        index_type=backend_spec.index_type,
+    )
+
+
+@registry.reg("rocm.index_select.func_call")
+def gen_function_call(func_attrs, indent="  ") -> str:
+    """
+    Generate function call.
+
+    Returns
+    -------
+    str
+        The function call string
+    """
+    backend_spec = ROCMSpec()
+    x = func_attrs["inputs"][0]
+    dim_idxs = func_attrs["inputs"][1]
+    y = func_attrs["outputs"][0]
+    dim = func_attrs["dim"]
+
+    dtype = backend_spec.dtype_to_backend_type(func_attrs["inputs"][0]._attrs["dtype"])
+
+    dim_idxs_ptr = backend_spec.cast_to_ptr_template.render(
+        name=dim_idxs._attrs["name"],
+        dtype=backend_spec.index_type,
+    )
+    input_ptr = backend_spec.cast_to_ptr_template.render(
+        name=x._attrs["name"],
+        dtype=dtype,
+    )
+
+    output_ptr = backend_spec.cast_to_ptr_template.render(
+        name=y._attrs["name"],
+        dtype=dtype,
+    )
+
+    x_dims = ", ".join(dim._attrs["name"] for dim in x._attrs["shape"])
+    dim_idxs_len = dim_idxs._attrs["shape"][0]._attrs["name"]
+
+    return FUNC_CALL_TEMPLATE.render(
+        indent=indent,
+        index_type=backend_spec.index_type,
+        x_dims=x_dims,
+        input_type=dtype,
+        func_name=func_attrs["name"],
+        output=output_ptr,
+        input=input_ptr,
+        dim=dim,
+        dim_idxs=dim_idxs_ptr,
+        dim_idxs_len=dim_idxs_len,
+    )
diff --git a/python/aitemplate/backend/target.py b/python/aitemplate/backend/target.py
index a464fddbd..e0284add1 100644
--- a/python/aitemplate/backend/target.py
+++ b/python/aitemplate/backend/target.py
@@ -439,10 +439,7 @@ def copy_headers_and_csrc_to_workdir(self, workdir: str) -> List[str]:
             fname_dst, ext = os.path.splitext(fname)
             if ext != ".cpp":
                 continue
-            # TODO: Remove this file when the linker error gets fixed in rocm backend.
-            # All files in csrc should be shared between the ROCM and CUDA backends.
-            if fname == "rocm_hack.cpp" and self.name() != "rocm":
-                continue
+            
             fname_src = os.path.join(csrc, fname)
             fname_dst_cpp = os.path.join(workdir, f"{fname_dst}{self.src_extension()}")
             shutil.copyfile(fname_src, fname_dst_cpp)
diff --git a/python/aitemplate/compiler/compiler.py b/python/aitemplate/compiler/compiler.py
index 82df9b188..ab2f5f45e 100644
--- a/python/aitemplate/compiler/compiler.py
+++ b/python/aitemplate/compiler/compiler.py
@@ -160,7 +160,7 @@ def compile_model(
     allocator_kind: Optional[AITemplateAllocatorKind] = None,
     debug_settings: AITDebugSettings = _DEBUG_SETTINGS,
     do_optimize_graph: bool = True,
-    profile_timeout: int = 300,
+    profile_timeout: int = 500,
 ) -> Model:
     """Compiles a model and generates a .so file.
 
diff --git a/python/aitemplate/compiler/ops/common/math.py b/python/aitemplate/compiler/ops/common/math.py
index 016b4ddca..4534628e6 100644
--- a/python/aitemplate/compiler/ops/common/math.py
+++ b/python/aitemplate/compiler/ops/common/math.py
@@ -67,8 +67,8 @@ def sigmoid(tensor: Any) -> Tensor:
     return OP_REGISTRY.get("SIGMOID")(tensor)
 
 
-def leaky_relu(tensor: Any) -> Tensor:
-    return OP_REGISTRY.get("LRELU")(tensor)
+def leaky_relu(tensor: Any, negative_slope: Any) -> Tensor:
+    return OP_REGISTRY.get("LRELU")(tensor, negative_slope)
 
 
 def hardtanh(*args, **kwargs) -> Tensor:
diff --git a/python/aitemplate/compiler/ops/softmax/softmax.py b/python/aitemplate/compiler/ops/softmax/softmax.py
index 956cdcbbd..d7208390b 100644
--- a/python/aitemplate/compiler/ops/softmax/softmax.py
+++ b/python/aitemplate/compiler/ops/softmax/softmax.py
@@ -30,11 +30,13 @@
 from aitemplate.compiler.base import (
     DynamicProfileStrategy,
     ExecItem,
+    IntImm,
     IntVar,
     Operator,
     Tensor,
 )
 from aitemplate.compiler.ops.softmax.cache_entry import NormQueryEntry, NormRecordEntry
+from aitemplate.compiler.ops.tensor.permute import permute
 
 from aitemplate.testing import detect_target
 
@@ -202,10 +204,16 @@ def __call__(self, x: Tensor, dim: int = None) -> Tensor:
                 "flattening input tensor before normalization is not supported yet"
             )
         dim = wrap_dim(dim, x._rank())
-        if dim != x._rank() - 1:
-            raise NotImplementedError(
-                f"softmax currently only supports dim=x._rank() - 1, dim={dim}, x._rank()={x._rank()}"
-            )
+        tail_shapes = x.shape()[dim + 1 :]
+        # The backend only supports reduction over the last non-1 dimension, so if we want
+        # to reduce over other dimensions we have to permute the tensor first.
+        if not all(isinstance(s, IntImm) and s.value() == 1 for s in tail_shapes):
+            perm_shape = list(range(x._rank()))
+            perm_shape[dim] = x._rank() - 1
+            perm_shape[-1] = dim
+            x_perm = permute()(x, perm_shape)
+            x_perm_softmax = softmax()(x_perm, dim=-1)
+            return permute()(x_perm_softmax, perm_shape)
 
         self._attrs["inputs"] = [x]
         self._attrs["dim"] = dim
diff --git a/python/aitemplate/compiler/ops/tensor/__init__.py b/python/aitemplate/compiler/ops/tensor/__init__.py
index a45019ab8..ee031c330 100644
--- a/python/aitemplate/compiler/ops/tensor/__init__.py
+++ b/python/aitemplate/compiler/ops/tensor/__init__.py
@@ -27,6 +27,7 @@
 from aitemplate.compiler.ops.tensor.full import full
 from aitemplate.compiler.ops.tensor.gather import gather
 from aitemplate.compiler.ops.tensor.identity import identity
+from aitemplate.compiler.ops.tensor.index_select import index_select
 from aitemplate.compiler.ops.tensor.jagged_to_padded_dense import jagged_to_padded_dense
 from aitemplate.compiler.ops.tensor.masked_select import masked_select
 from aitemplate.compiler.ops.tensor.padded_dense_to_jagged import padded_dense_to_jagged
diff --git a/python/aitemplate/compiler/ops/tensor/index_select.py b/python/aitemplate/compiler/ops/tensor/index_select.py
new file mode 100644
index 000000000..71f023572
--- /dev/null
+++ b/python/aitemplate/compiler/ops/tensor/index_select.py
@@ -0,0 +1,88 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Define masked_select op
+"""
+
+from typing import List
+
+from aitemplate.backend import registry
+
+from aitemplate.backend.target import Target
+
+from aitemplate.compiler.base import Operator, Tensor
+
+
+class index_select(Operator):
+    """
+    Returns a new tensor which indexes the input tensor
+    along dimension dim using the entries in index which is a LongTensor.
+
+    The returned tensor has the same number of dimensions as the original tensor (input).
+    The dimth dimension has the same size as the length of index;
+    other dimensions have the same size as in the original tensor.
+
+    Args:
+        input (Tensor) – the input tensor.
+        dim (int) – the dimension in which we index
+        index (IntTensor or LongTensor) – the 1-D tensor containing the indices to index
+    """
+
+    def __init__(self, dim=0):
+        super().__init__()
+        self._attrs["op"] = "index_select"
+        self._attrs["dim"] = dim
+
+    def _normalize_dim(self, rank: int):
+        dim_idx = self._attrs["dim"]
+        orig = dim_idx
+        if dim_idx < 0:
+            dim_idx = rank + dim_idx
+        if dim_idx < 0 or dim_idx >= rank:
+            raise RuntimeError(
+                f"Invalid dim for index_select. Valid values of dim range from {-rank} to {rank - 1}. {orig} provided, normalized {dim_idx}"
+            )
+        self._attrs["dim"] = dim_idx
+
+    def _infer_shape(self, x: Tensor, idx_select_dim):
+        self._normalize_dim(len(x._attrs["shape"]))
+        dim_idx = self._attrs["dim"]
+        dims = x._attrs["shape"][:dim_idx]
+        dims += [idx_select_dim]
+        if dim_idx + 1 < len(x._attrs["shape"]):
+            dims += x._attrs["shape"][dim_idx + 1 :]
+        return dims
+
+    def __call__(
+        self,
+        x: Tensor,
+        dim_idxs: Tensor,
+    ) -> List[Tensor]:
+        self._attrs["inputs"] = [x, dim_idxs]
+        if len(dim_idxs._attrs["shape"]) != 1:
+            raise RuntimeError("index tensor must be 1 dimensional.")
+        self._set_depth()
+        output = Tensor(
+            self._infer_shape(x, (dim_idxs._attrs["shape"][0])),
+            src_ops={self},
+            dtype=x._attrs["dtype"],
+        )
+        self._attrs["outputs"] = [output]
+        return output
+
+    def gen_function(self) -> str:
+        target = Target.current()
+        func = registry.get(f"{target.name()}.{self._attrs['op']}.gen_function")
+        return func(self._attrs)
diff --git a/python/aitemplate/compiler/transform/__init__.py b/python/aitemplate/compiler/transform/__init__.py
index 3ff2d800a..c195d4087 100644
--- a/python/aitemplate/compiler/transform/__init__.py
+++ b/python/aitemplate/compiler/transform/__init__.py
@@ -37,7 +37,6 @@
 from aitemplate.compiler.transform.optimize_graph import optimize_graph
 from aitemplate.compiler.transform.profile import profile
 from aitemplate.compiler.transform.refine_graph import refine_graph
-from aitemplate.compiler.transform.remove_id_ops import remove_id_ops
 from aitemplate.compiler.transform.remove_no_ops import remove_no_ops
 from aitemplate.compiler.transform.remove_unused_ops import remove_unused_ops
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
diff --git a/python/aitemplate/compiler/transform/fuse_ops.py b/python/aitemplate/compiler/transform/fuse_ops.py
index 40af68f35..60a496553 100644
--- a/python/aitemplate/compiler/transform/fuse_ops.py
+++ b/python/aitemplate/compiler/transform/fuse_ops.py
@@ -18,7 +18,7 @@
 import collections
 import logging
 from dataclasses import dataclass
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Optional, Set
 
 from aitemplate.compiler.base import Operator, Tensor
 from aitemplate.compiler.ops.common import elementwise, fused_elementwise
@@ -38,39 +38,40 @@
 
 class SimpleDisjointSet:
     def __init__(self):
-        self.node_to_list_mapping: Dict[Any, List[Any]] = {}
+        self.node_to_set_mapping: Dict[Any, Set[Any]] = {}
 
-    def add(self, node: Any, dependent_nodes: Set[Any]) -> None:
-        if node in self.node_to_list_mapping:
+    def add(self, node: Any, dependent_nodes: Optional[Set[Any]]) -> None:
+        if node in self.node_to_set_mapping:
             return
 
         if dependent_nodes is None or len(dependent_nodes) == 0:
-            self.node_to_list_mapping[node] = [node]
+            self.node_to_set_mapping[node] = {node}
             return
 
-        current_list = [
-            node  # node should also be considered to decide if a new_list can be added.
-        ]
-        for dependent in list(dependent_nodes):
-            if dependent is None or dependent not in self.node_to_list_mapping:
+        current_set = {
+            node  # node should also be considered to decide if a new_set can be added.
+        }
+        for dependent in dependent_nodes:
+            if dependent is None or dependent not in self.node_to_set_mapping:
                 continue
-            new_list = self.node_to_list_mapping.get(dependent)
+            new_set = self.node_to_set_mapping.get(dependent)
 
-            if _detect_cycle(current_list + new_list):
+            if _detect_cycle(current_set | new_set):
                 continue
-            current_list.extend(new_list)
-            for new_node in new_list:
-                self.node_to_list_mapping[new_node] = current_list
-        self.node_to_list_mapping[node] = current_list
 
-    def get_node_groups(self) -> List[List[Any]]:
+            current_set.update(new_set)
+            for new_node in new_set:
+                self.node_to_set_mapping[new_node] = current_set
+        self.node_to_set_mapping[node] = current_set
+
+    def get_node_groups(self) -> List[Set[Any]]:
         node_groups = []
         visited = set()
-        for groups in self.node_to_list_mapping.values():
-            addr = id(groups)
+        for group in self.node_to_set_mapping.values():
+            addr = id(group)
             if addr not in visited:
                 visited.add(addr)
-                node_groups.append(groups)
+                node_groups.append(group)
         return node_groups
 
 
@@ -146,7 +147,7 @@ class FusedElementwiseInfo:
     external_outputs: Set[Tensor]
 
 
-def _partition_subgraphs(ops: List[Operator]) -> Dict[str, Set[Operator]]:
+def _partition_subgraphs(ops: Set[Operator]) -> Dict[str, Set[Operator]]:
     """
     Given ops of candidate graph of fused_elementwise op graph and partition
     into subgraph based on output shape, returns dict of
@@ -283,7 +284,7 @@ def _create_fuse_ops(info_list: List[FusedElementwiseInfo]) -> None:
         )
 
 
-def _detect_cycle(group: List[Operator]) -> bool:
+def _detect_cycle(group: Set[Operator]) -> bool:
     """
     Given a group of ops, to detect if they would form cycles, i.e.
       --> group_ops
@@ -294,7 +295,7 @@ def _detect_cycle(group: List[Operator]) -> bool:
     """
     parents = [o for op1 in group for i in op1._attrs["inputs"] for o in i.src_ops()]
     for op1 in group:
-        for op2 in set(parents) - set(group):
+        for op2 in set(parents) - group:
             if transform_utils.is_ancestor(op1, op2):
                 return True
     return False
@@ -322,7 +323,7 @@ def fuse_elementwise(sorted_graph: List[Tensor], workdir: str = None) -> List[Te
         # Partition subgraph based on output shape.
         output_op_map = _partition_subgraphs(ops)
         # Collect information to create fuse ops.
-        info_list = _collect_info(output_op_map, set(ops), sorted_graph)
+        info_list = _collect_info(output_op_map, ops, sorted_graph)
         # Create fuse ops.
         _create_fuse_ops(info_list)
 
diff --git a/python/aitemplate/compiler/transform/optimize_graph.py b/python/aitemplate/compiler/transform/optimize_graph.py
index 16ca6c0e4..edf0eede5 100644
--- a/python/aitemplate/compiler/transform/optimize_graph.py
+++ b/python/aitemplate/compiler/transform/optimize_graph.py
@@ -41,16 +41,17 @@
 from aitemplate.compiler.transform.remove_elementwise_no_ops import (
     remove_elementwise_no_ops,
 )
-from aitemplate.compiler.transform.remove_id_ops import remove_id_ops
 from aitemplate.compiler.transform.split_large_concat_ops import split_large_concat_ops
 from aitemplate.compiler.transform.split_large_slice_scatter_ops import (
     split_large_slice_scatter_ops,
 )
 from aitemplate.compiler.transform.split_large_split_ops import split_large_split_ops
 from aitemplate.compiler.transform.transform_memory_ops import transform_memory_ops
+from aitemplate.compiler.transform.transform_merge_view_ops import merge_view_ops
 from aitemplate.compiler.transform.transform_odd_alignment import (
     transform_odd_alignment,
 )
+from aitemplate.compiler.transform.transform_permutations import eliminate_permutations
 from aitemplate.compiler.transform.transform_permute_to_reshape import (
     transform_permute_to_reshape,
 )
@@ -93,7 +94,6 @@ def optimize_graph(
     """
 
     funcs = [
-        remove_id_ops,
         remove_elementwise_no_ops,
         dedup_make_jagged_ops,
         fuse_permute_bmm_and_gemm,
@@ -105,6 +105,7 @@ def optimize_graph(
         fuse_mm_reshape_permute,
         # make sure we run move_view_op_before_concat before transform_memory_ops
         move_view_op_before_concat,
+        merge_view_ops,
         transform_memory_ops,
         fuse_ops,
         fuse_elementwise,
@@ -125,8 +126,7 @@ def optimize_graph(
         split_large_split_ops,
         transform_permute_to_reshape,
         transform_memory_ops,
-        # FIXME: temporarily disable this due to some accuracy issue
-        # eliminate_permutations,
+        eliminate_permutations,
     ]
 
     if not optimize:
diff --git a/python/aitemplate/compiler/transform/profile.py b/python/aitemplate/compiler/transform/profile.py
index d6b623821..11e4b5a17 100644
--- a/python/aitemplate/compiler/transform/profile.py
+++ b/python/aitemplate/compiler/transform/profile.py
@@ -32,6 +32,7 @@
     gemm,
     GemmProfilerPostprocessingDelegate,
 )
+from aitemplate.utils.environ import force_profiler_cache
 
 # pylint: disable=C0103,W0613,W0102
 
@@ -56,7 +57,7 @@ def profile(
     workdir="./tmp",
     devices=None,
     dynamic_profiling_strategy=DynamicProfileStrategy.MAX,
-    timeout=300,
+    timeout=500,
 ):
     """Profiles kernels.
 
@@ -79,6 +80,7 @@ def profile(
         devices = [0]
     profiler_dir = os.path.join(workdir)
     start_t = datetime.now()
+    _LOGGER.info(f"Force profiler cache = {force_profiler_cache()}")
     generated_profilers = list(
         codegen.gen_profiler(sorted_graph, profiler_dir, dynamic_profiling_strategy)
     )
diff --git a/python/aitemplate/compiler/transform/remove_id_ops.py b/python/aitemplate/compiler/transform/remove_id_ops.py
deleted file mode 100644
index 6b9057e67..000000000
--- a/python/aitemplate/compiler/transform/remove_id_ops.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#  Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-"""
-Remove id ops from a sorted_graph.
-"""
-from typing import List
-
-from aitemplate.compiler.base import Tensor
-from aitemplate.compiler.transform import transform_utils
-
-
-def remove_id_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
-    """Remove id ops from the input sorted_graph."""
-    for tensor in sorted_graph:
-        src_ops = tensor._attrs["src_ops"]
-        if len(src_ops) != 1:
-            continue
-        src_op = list(src_ops)[0]
-        if src_op._attrs["op"] != "identity":
-            continue
-        id_op = src_op
-        input_tensor = id_op._attrs["inputs"][0]
-        # skip a very special case where id takes an input and produces an output
-        if tensor._attrs["is_output"] and input_tensor._attrs["is_input"]:
-            continue
-        transform_utils.remove_single_tensor_op_from_sorted_graph(id_op)
-
-    sorted_graph = transform_utils.sanitize_sorted_graph(sorted_graph)
-    return transform_utils.sanitize_sorted_graph(sorted_graph)
diff --git a/python/aitemplate/compiler/transform/remove_no_ops.py b/python/aitemplate/compiler/transform/remove_no_ops.py
index b1c876a8d..4fe11e7bb 100644
--- a/python/aitemplate/compiler/transform/remove_no_ops.py
+++ b/python/aitemplate/compiler/transform/remove_no_ops.py
@@ -31,15 +31,184 @@
 """
 from typing import List
 
-from aitemplate.compiler.base import IntVar, JaggedIntVar, Operator, Tensor
+from aitemplate.compiler.base import IntImm, IntVar, JaggedIntVar, Operator, Tensor
 from aitemplate.compiler.ops.tensor.expand import ExpandDimensionType
 
 from aitemplate.compiler.transform import transform_utils
 
-from aitemplate.utils import graph_utils
+from aitemplate.utils import graph_utils, shape_utils
 from aitemplate.utils.shape_utils import is_singleton_dimension
 
 
+def _remove_id_ops(sorted_graph: List[Tensor]) -> List[Tensor]:
+    """Remove identity ops."""
+    ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in ops:
+        if op._attrs["op"] != "identity":
+            continue
+
+        inputs = op._attrs["inputs"]
+        assert len(inputs) == 1, "identity must only have 1 input"
+
+        outputs = op._attrs["outputs"]
+        identity_output = outputs[0]
+        assert len(inputs) == 1, "identity must only have 1 output"
+
+        # skip a very special case where id takes an input and produces an output
+        if identity_output._attrs["is_output"] and inputs[0]._attrs["is_input"]:
+            continue
+
+        transform_utils.remove_single_tensor_op_from_sorted_graph(op)
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
+def _remove_no_op_concats(sorted_graph: List[Tensor]) -> List[Tensor]:
+    """
+    Remove no-op concats from the graph. A no-op concat is where the output
+    tensor is exactly the same as the input tensor(s) and it isn't the model output.
+    This is the case when:
+    1. There is a single input tensor.
+    2. There is a single non-empty input tensor and the remaining input tensors
+    are empty.
+
+    x = Tensor(shape=[7])
+    empty1 = Tensor(shape=[0], value=[])
+    empty2 = Tensor(shape=[0], value=[])
+
+    y1 = ops.concatenate([x])                   # Case 1
+    y2 = ops.concatenate([empty1])              # Case 1
+    y2 = ops.concatenate([empty1, x, empty2])   # Case 2
+    """
+
+    def is_dim_gt_zero(dim):
+        if isinstance(dim, IntImm):
+            return dim.value() > 0
+        elif isinstance(dim, IntVar):
+            return dim.lower_bound() > 0
+
+    ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in ops:
+        if op._attrs["op"] != "concatenate":
+            continue
+
+        inputs = op._attrs["inputs"]
+        assert len(inputs) >= 1, "concat must have at least 1 input"
+
+        outputs = op._attrs["outputs"]
+        concat_output = outputs[0]
+        assert len(outputs) == 1, "concat must have a single output"
+
+        # Assumes non-empty tensors have non-zero dimensions.
+        # And empty tensors have dimensions of size 0.
+        is_input_non_empty = [
+            all(is_dim_gt_zero(dim) for dim in tensor.shape()) for tensor in inputs
+        ]
+        n_non_empty = sum(is_input_non_empty)
+        if len(inputs) > 1 and n_non_empty > 1 or outputs[0]._attrs["is_output"]:
+            continue
+
+        idx = is_input_non_empty.index(True) if n_non_empty == 1 else 0
+        concat_input = inputs[idx]
+        for dst_op in concat_output.dst_ops():
+            transform_utils.replace_tensor_for_op(dst_op, concat_output, concat_input)
+        transform_utils.remove_tensor_from_sorted_graph(concat_output)
+
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
+def _remove_no_op_dynamic_slices(sorted_graph: List[Tensor]) -> List[Tensor]:
+    """
+    Remove any no-op slices from the graph. A no-op slice is when the input tensor
+    and output tensor are exactly the same. This happens when the start indices
+    and end indices cover the entire dimension length.
+
+    x = Tensor([1, 2, 3])
+    y = x[:]
+
+    xx = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
+    yy = xx[0:2, -4:4]
+    """
+
+    ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in ops:
+        if op._attrs["op"] != "dynamic_slice":
+            continue
+
+        inputs = op._attrs["inputs"]
+        assert len(inputs) == 1, "dynamic_slice must only have 1 input"
+
+        outputs = op._attrs["outputs"]
+        assert len(inputs) == 1, "dynamic_slice must only have 1 output"
+
+        slice_input, slice_output = inputs[0], outputs[0]
+        if (
+            not shape_utils.is_same_shape(slice_input.shape(), slice_output.shape())
+            or slice_output._attrs["is_output"]
+        ):
+            continue
+
+        for dst_op in slice_output.dst_ops():
+            transform_utils.replace_tensor_for_op(dst_op, slice_output, slice_input)
+        transform_utils.remove_tensor_from_sorted_graph(slice_output)
+
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
+def _remove_no_op_splits(sorted_graph: List[Tensor]) -> List[Tensor]:
+    """
+    Remove any no-op split from the graph where the input tensor is non-jagged.
+    A no-op split is where the input tensor isn't divided into multiple parts.
+    This happens when the split_size_or_sections argument is:
+    1. an integer representing the length of the dimension indicated by dim
+    2. a singleton list containing the length of the dimension indicated by dim.
+
+    x = Tensor([1, 2, 3])
+    y1 = split(x, split_size_or_sections=3, dim=0)  # Case 1
+    y2 = split(x, split_size_or_sections=[3], dim=0)   # Case 2
+
+    xx = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
+    yy1 = split(xx, split_size_or_sections=2, dim=0)  # Case 1
+    yy2 = split(xx, split_size_or_sections=4, dim=1)  # Case 1
+    yy3 = split(xx, split_size_or_sections=[2], dim=0)  # Case 2
+    yy4 = split(xx, split_size_or_sections=[4], dim=1)  # Case 2
+    """
+
+    ops = graph_utils.get_sorted_ops(sorted_graph)
+    for op in ops:
+        if op._attrs["op"] != "split":
+            continue
+
+        inputs = op._attrs["inputs"]
+        assert len(inputs) == 1, "split must only have 1 input"
+
+        outputs = op._attrs["outputs"]
+        assert len(inputs) >= 1, "split must have at least 1 output"
+
+        split_dim = op._attrs["split_dim"]
+        split_input, split_output = inputs[0], outputs[0]
+        input_split_dim_len, output_split_dim_len = (
+            split_input._attrs["shape"][split_dim],
+            split_output._attrs["shape"][split_dim],
+        )
+
+        # No-op splits must have one output, and the input and output shapes
+        # must match along split_dim. We ignore no-op splits that are outputs.
+        if (
+            len(outputs) > 1
+            or input_split_dim_len != output_split_dim_len
+            or outputs[0]._attrs["is_output"]
+        ):
+            continue
+
+        # Delete the split output in the graph.
+        for dst_op in list(split_output.dst_ops()):
+            transform_utils.replace_tensor_for_op(dst_op, split_output, split_input)
+
+        transform_utils.remove_tensor_from_sorted_graph(split_output)
+
+    return transform_utils.sanitize_sorted_graph(sorted_graph)
+
+
 def _remove_no_op_expands(sorted_graph: List[Tensor]) -> List[Tensor]:
     """
     Remove no-op expands from the graph. A no-op expand is one
@@ -181,6 +350,10 @@ def remove_no_ops(sorted_graph: List[Tensor]) -> List[Tensor]:
         Graph after remove no-ops
     """
     passes = [
+        _remove_id_ops,
+        _remove_no_op_concats,
+        _remove_no_op_dynamic_slices,
+        _remove_no_op_splits,
         _remove_no_op_expands,
         _fuse_expand_elementwise,
     ]
diff --git a/python/aitemplate/compiler/transform/transform_merge_view_ops.py b/python/aitemplate/compiler/transform/transform_merge_view_ops.py
new file mode 100644
index 000000000..a863df642
--- /dev/null
+++ b/python/aitemplate/compiler/transform/transform_merge_view_ops.py
@@ -0,0 +1,101 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+This file implements a pass that merges consecutive view ops if possible.
+"""
+from typing import List, Set
+
+from aitemplate.compiler import ops
+from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.transform import transform_utils
+from aitemplate.compiler.transform.toposort import toposort
+from aitemplate.utils.shape_utils import convert_shape_to_IntVarTensor
+
+
+_VIEW_OPS = {"reshape", "flatten", "squeeze", "unsqueeze"}
+
+
+def _is_inout(t: Tensor):
+    return t._attrs["is_input"] or t._attrs["is_output"]
+
+
+def _merge_view_ops_for(graph: List[Tensor], tensor: Tensor) -> List[Tensor]:
+    """
+    `tensor` should have exactly 1 src op, and that op must be a view op. We
+    will look for view ops in the dst ops and merge them with the src view op
+    by creating a new reshape op.
+    """
+    src_op = tensor._attrs["src_ops"][0]
+    in_tensor = src_op._attrs["inputs"][0]
+    dst_ops = tensor._attrs["dst_ops"]
+    removed_ops: Set[Operator] = set()
+    for op in dst_ops:
+        if op._attrs["op"] not in _VIEW_OPS:
+            continue
+        out_tensor = op._attrs["outputs"][0]
+        in_shape = in_tensor._attrs["shape"]
+        out_shape = out_tensor._attrs["shape"]
+        if out_shape == in_shape and not (
+            _is_inout(in_tensor) and _is_inout(out_tensor)
+        ):
+            # If the shapes are identical, we can eliminate both view ops
+            transform_utils.replace_tensor(out_tensor, in_tensor)
+        else:
+            # Otherwise, create a new reshape op to replace the two view ops
+            out_shape = convert_shape_to_IntVarTensor(out_tensor)
+            new_out_tensor = ops.reshape()(in_tensor, out_shape)
+            if out_tensor._attrs["is_output"]:
+                new_out_tensor._attrs["is_output"] = True
+                new_out_tensor._attrs["name"] = out_tensor._attrs["name"]
+            transform_utils.replace_tensor(out_tensor, new_out_tensor)
+            graph.append(new_out_tensor)
+        graph.remove(out_tensor)
+        removed_ops.add(op)
+    for op in removed_ops:
+        transform_utils.remove_view_op_from_sorted_graph(op)
+    return graph
+
+
+def merge_view_ops(sorted_graph: List[Tensor], workdir: str = None) -> List[Tensor]:
+    """
+    Merge consecutive view ops.
+    """
+    changed = False
+    # Find pairs of consecutive view ops and merge them, iterating to a
+    # fixpoint.
+    # TODO: Instead of merging pairs of view ops, we should look for entire
+    # chains of view ops and merge them all at once.
+    while True:
+        for tensor in sorted_graph:
+            src_ops = tensor._attrs["src_ops"]
+            if len(src_ops) != 1:
+                continue
+            src_op = list(src_ops)[0]
+            if src_op._attrs["op"] not in _VIEW_OPS:
+                continue
+            dst_ops = tensor._attrs["dst_ops"]
+            if any(op._attrs["op"] in _VIEW_OPS for op in dst_ops):
+                # NOTE: _merge_view_ops_for does *not* return a sorted graph
+                sorted_graph = _merge_view_ops_for(sorted_graph, tensor)
+                changed = True
+                break
+        else:
+            break
+
+    if changed:
+        # Prune tensors that may have become unused after view op merging
+        sorted_graph = toposort([t for t in sorted_graph if t._attrs["is_output"]])
+        return transform_utils.sanitize_sorted_graph(toposort(sorted_graph))
+    return sorted_graph
diff --git a/python/aitemplate/compiler/transform/transform_permutations.py b/python/aitemplate/compiler/transform/transform_permutations.py
index ca6488e3e..4b9379c29 100644
--- a/python/aitemplate/compiler/transform/transform_permutations.py
+++ b/python/aitemplate/compiler/transform/transform_permutations.py
@@ -17,6 +17,7 @@
 import numpy as np
 
 from aitemplate.compiler.base import Operator, Tensor
+from aitemplate.compiler.tensor_accessor import TensorAccessor
 from aitemplate.compiler.transform import transform_utils
 
 
@@ -60,6 +61,32 @@ def remove_second_permutation_from_graph(
     transform_utils.remove_tensor_from_sorted_graph(output_tensor)
 
 
+def _reshaped_or_strided_input_or_output_accessor(op: Operator) -> bool:
+    def _reshaped_or_strided_tensor_accessor(accessor: TensorAccessor) -> bool:
+        if (
+            accessor.actual_shapes is not None
+            and accessor.actual_shapes != accessor.original_shapes
+        ):
+            return True
+
+        # Is it a strided accessor
+        if hasattr(accessor, "stride_dim") and accessor.stride_dim is not None:
+            return True
+
+        return False
+
+    input_accessors = op._attrs.get("input_accessors", None)
+    output_accessors = op._attrs.get("output_accessors", None)
+
+    return (
+        (input_accessors is not None)
+        and _reshaped_or_strided_tensor_accessor(input_accessors[0])
+    ) or (
+        (output_accessors is not None)
+        and _reshaped_or_strided_tensor_accessor(output_accessors[0])
+    )
+
+
 def eliminate_permutations(
     sorted_graph: List[Tensor], workdir: str = None
 ) -> List[Tensor]:
@@ -73,12 +100,7 @@ def eliminate_permutations(
                 continue
             if not cur_op._attrs["op"].startswith("permute"):
                 continue
-            input_accessors = cur_op._attrs.get("input_accessors", None)
-            if (
-                input_accessors is not None
-                and hasattr(input_accessors[0], "strided_dim")
-                and input_accessors[0].strided_dim is not None
-            ):
+            if _reshaped_or_strided_input_or_output_accessor(cur_op):
                 continue
             curr_op_output = cur_op._attrs["outputs"][0]
             dst_ops = curr_op_output._attrs["dst_ops"]
@@ -89,8 +111,12 @@ def eliminate_permutations(
             for next_op in dst_ops:
                 if not next_op._attrs["op"].startswith("permute"):
                     continue
+                if _reshaped_or_strided_input_or_output_accessor(next_op):
+                    continue
                 p1 = get_permutation(cur_op)
                 p2 = get_permutation(next_op)
+                if len(p1) != len(p2):
+                    continue
                 if not np.all(np.array(p1)[p2] == np.arange(0, len(p1))):
                     continue
                 is_input = cur_op._attrs["inputs"][0]._attrs["is_input"]
diff --git a/python/aitemplate/compiler/transform/transform_special_ops.py b/python/aitemplate/compiler/transform/transform_special_ops.py
index 41577cec3..202c99794 100644
--- a/python/aitemplate/compiler/transform/transform_special_ops.py
+++ b/python/aitemplate/compiler/transform/transform_special_ops.py
@@ -199,12 +199,11 @@ def match_func(tensor: Tensor) -> bool:
         if src_op._attrs["op"] not in conv_to_gemm:
             return False
 
-        if (
-            src_op._attrs["pad"] != 0
-            or src_op._attrs["dilate"] != 1
-            or src_op._attrs["group"] != 1
-            or src_op._attrs["stride"] != 1
-        ):
+        valid_pad = src_op._attrs["pad"] == 0 or src_op._attrs["pad"] == (0, 0)
+        valid_dilate = src_op._attrs["dilate"] == 1 or src_op._attrs["dilate"] == (1, 1)
+        valid_stride = src_op._attrs["stride"] == 1 or src_op._attrs["stride"] == (1, 1)
+        valid_group = src_op._attrs["group"] == 1
+        if not valid_pad or not valid_dilate or not valid_stride or not valid_group:
             return False
 
         # Check that the filter is 1x1
@@ -298,8 +297,8 @@ def transform_special_ops(
         _transform_1x1_conv_gemm_rcr,
     ]
 
-    if "transform_conv_to_gemm" in Target.current()._kwargs:
-        if Target.current()._kwargs["transform_conv_to_gemm"]:
+    if "convert_conv_to_gemm" in Target.current()._kwargs:
+        if Target.current()._kwargs["convert_conv_to_gemm"]:
             for func in funcs:
                 sorted_graph = func(sorted_graph)
     return sorted_graph
diff --git a/python/aitemplate/testing/test_utils.py b/python/aitemplate/testing/test_utils.py
index 07e43a6cd..82c1fe95d 100644
--- a/python/aitemplate/testing/test_utils.py
+++ b/python/aitemplate/testing/test_utils.py
@@ -30,7 +30,6 @@
 from aitemplate.testing.detect_target import detect_target
 from aitemplate.utils.graph_utils import get_sorted_ops
 from aitemplate.utils.torch_utils import string_to_torch_dtype
-from torch import nn
 
 
 class TestEnv(Enum):
@@ -303,13 +302,13 @@ def get_attn_mask_per_causal_type(
 
 def init_random_weights(m):
     if hasattr(m, "weight"):
-        nn.init.uniform_(m.weight)
+        torch.nn.init.uniform_(m.weight)
     elif (
-        type(m) == nn.Sequential
-        or type(m) == nn.ModuleList
-        or type(m) == nn.SiLU
-        or type(m) == nn.Dropout
-        or type(m) == nn.Identity
+        type(m) == torch.nn.Sequential
+        or type(m) == torch.nn.ModuleList
+        or type(m) == torch.nn.SiLU
+        or type(m) == torch.nn.Dropout
+        or type(m) == torch.nn.Identity
     ):
         pass
     else:
diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
index ddc00c852..160b56a1e 100644
--- a/python/aitemplate/utils/environ.py
+++ b/python/aitemplate/utils/environ.py
@@ -66,7 +66,6 @@ def force_profiler_cache() -> bool:
         assert (
             os.environ.get("FORCE_PROFILE", None) != "1"
         ), "cannot specify both AIT_FORCE_PROFILER_CACHE and FORCE_PROFILE"
-    _LOGGER.info(f"{force_cache=}")
     return force_cache
 
 
diff --git a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
index 4c46deeb2..7f3def8dc 100644
--- a/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/conv2d_operation.py
@@ -63,7 +63,7 @@ class XdlOpType(enum.Enum):
     XdlOpType.DeviceConv2d_Xdl_CShuffle_Bias_Relu: "ck::tensor_operation::device::DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K",
     XdlOpType.DeviceConv2d_Xdl_CShuffle_Bias_Relu_Add: "ck::tensor_operation::device::DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K",
     XdlOpType.DeviceConv2d_Xdl_CShuffle_Bias_Sigmoid: "ck::tensor_operation::device::DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K",
-    XdlOpType.DeviceGroupedConv2D_Xdl_CShuffle_Bias_Relu: "ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle",
+    XdlOpType.DeviceGroupedConv2D_Xdl_CShuffle_Bias_Relu: "ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle",
     XdlOpType.DeviceConvNdBwdDataNwcKxcNwk_Xdl: "ck::tensor_operation::device::DeviceConvNdBwdDataNwcKxcNwk_Xdl",
     XdlOpType.DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1: "ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1",
 }
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index e8f89f666..a9bdb5c4a 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -2316,19 +2316,24 @@ def CreateLayerNormOperator(manifest, rank=2):
     out_dtype = library.DataType.f16
     # 0 indicates not print
     tile_descriptions = [
-        layernorm.TileDesc(256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1),
-        layernorm.TileDesc(256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2),
-        layernorm.TileDesc(256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4),
-        layernorm.TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        layernorm.TileDesc(1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2),
+        layernorm.TileDesc(128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        layernorm.TileDesc(256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        layernorm.TileDesc(512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        layernorm.TileDesc(1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        layernorm.TileDesc(256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1),
+        layernorm.TileDesc(256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1),
+        layernorm.TileDesc(64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2),
+        layernorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        layernorm.TileDesc(1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
     ]
 
     operations = []
@@ -2353,19 +2358,24 @@ def CreateGroupNormOperator(manifest, rank=5):
     out_dtype = library.DataType.f16
     # 0 indicates not print
     tile_descriptions = [
-        groupnorm.TileDesc(256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1),
-        groupnorm.TileDesc(256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2),
-        groupnorm.TileDesc(256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4),
-        groupnorm.TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8),
-        groupnorm.TileDesc(1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2),
+        groupnorm.TileDesc(128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        groupnorm.TileDesc(256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        groupnorm.TileDesc(512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        groupnorm.TileDesc(1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        groupnorm.TileDesc(256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1),
+        groupnorm.TileDesc(256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1),
+        groupnorm.TileDesc(64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2),
+        groupnorm.TileDesc(256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
+        groupnorm.TileDesc(1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1),
     ]
 
     operations = []
diff --git a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
index 969efc6ed..605f6500a 100644
--- a/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py
@@ -36,6 +36,7 @@ class TileDesc:
     beta_src_dim: int
     beta_src_size: int
     out_dst_size: int
+    save_mean_inv_std: int
 
     def __str__(self) -> str:
         values = list(self.__dict__.values())
@@ -78,12 +79,13 @@ def accumulator_type(self):
     def emit(self) -> str:
         template = jinja2.Template(
             """
-using {{name}} = ck::tensor_operation::device::DeviceNormalizationImpl<
+using {{name}} = ck::tensor_operation::device::DeviceNormalizationFwdImpl<
     {{InDType}},
     {{InDType}},
     {{InDType}},
     {{AccDType}},
     {{OutDType}},
+    {{AccDType}},
     YElementOp,
     {{Rank}},
     {{NumReduceDim}},
@@ -113,7 +115,7 @@ def emit(self) -> str:
         Out=library.DataType.f16,
         Rank=5,
         NumReduceDim=3,
-        tile_desc=TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8),
+        tile_desc=TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
     )
     print(str(GroupNormOp))
     print(GroupNormOp.emit())
diff --git a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
index 264cba714..52b4b70d3 100644
--- a/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/layernorm_operation.py
@@ -36,6 +36,7 @@ class TileDesc:
     beta_src_dim: int
     beta_src_size: int
     out_dst_size: int
+    save_mean_inv_std: int
 
     def __str__(self) -> str:
         values = list(self.__dict__.values())
@@ -78,12 +79,13 @@ def accumulator_type(self):
     def emit(self) -> str:
         template = jinja2.Template(
             """
-using {{name}} = ck::tensor_operation::device::DeviceNormalizationImpl<
+using {{name}} = ck::tensor_operation::device::DeviceNormalizationFwdImpl<
     {{InDType}},
     {{InDType}},
     {{InDType}},
     {{AccDType}},
     {{OutDType}},
+    {{AccDType}},
     ck::tensor_operation::element_wise::PassThrough,
     {{Rank}},
     {{NumReduceDim}},
@@ -113,7 +115,7 @@ def emit(self) -> str:
         Out=library.DataType.f16,
         Rank=3,
         NumReduceDim=-1,
-        tile_desc=TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8),
+        tile_desc=TileDesc(256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1),
     )
     print(str(LayerNormOp))
     print(LayerNormOp.emit())
diff --git a/python/aitemplate/utils/shape_utils.py b/python/aitemplate/utils/shape_utils.py
index 7be8df950..d02c84112 100644
--- a/python/aitemplate/utils/shape_utils.py
+++ b/python/aitemplate/utils/shape_utils.py
@@ -20,6 +20,8 @@
 
 import sympy
 
+from aitemplate.compiler.base import IntVar, IntVarTensor, Tensor
+
 
 def gen_int_var(
     values: List[int], name: str = None, symbolic_value: Optional[sympy.Basic] = None
@@ -155,6 +157,30 @@ def convert_shape_to_IntVar(shape):
     return ret
 
 
+def convert_shape_to_IntVarTensor(tensor: Tensor):
+    """
+    Map IntVars in the tensor's shape to their corresponding IntVarTensors, if any.
+    """
+    shape = tensor._attrs["shape"]
+    if not any(isinstance(v, IntVar) for v in shape):
+        return shape
+
+    intvar_to_tensor = {}
+    for op in tensor.src_ops():
+        for t in op._attrs["inputs"]:
+            if isinstance(t, IntVarTensor):
+                intvar_to_tensor[t._attrs["int_var"]] = t
+
+    ret = []
+    for v in shape:
+        # Using type() instead of isinstance() because we don't want to include IntImms
+        if type(v) is IntVar:
+            ret.append(intvar_to_tensor.get(v, v))
+        else:
+            ret.append(v)
+    return ret
+
+
 def convert_IntVar_to_int(var) -> int:
     """
     Try to convert an IntVar (or an IntVar wrapped in a IntVarTensor) to
diff --git a/static/csrc/debug_utility.cpp b/static/csrc/debug_utility.cpp
index 3d9f3b60d..aca8099bf 100644
--- a/static/csrc/debug_utility.cpp
+++ b/static/csrc/debug_utility.cpp
@@ -45,16 +45,6 @@ __global__ void inf_and_nan_checker(const half* tensor, int64_t elem_cnt) {
   }
 }
 
-__global__ void outputs_checker(const half* tensor, int64_t elem_cnt) {
-  for (int64_t i = 0; i < elem_cnt; i++) {
-    float v = (float)(*(tensor + i));
-    if (i != 0) {
-      printf(", ");
-    }
-    printf("%f", v);
-  }
-  printf("\n");
-}
 } // namespace
 
 namespace ait {
@@ -68,13 +58,4 @@ void InvokeInfAndNanChecker(
   ait::StreamSynchronize(stream);
 }
 
-void InvokeOutputsChecker(
-    const half* tensor,
-    const char* tensor_name,
-    int64_t elem_cnt,
-    ait::StreamType stream) {
-  printf("Tensor (%s) output:\n", tensor_name);
-  outputs_checker<<<1, 1, 0, stream>>>(tensor, elem_cnt);
-  ait::StreamSynchronize(stream);
-}
 } // namespace ait
diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/static/include/custom_math.cuh
similarity index 100%
rename from python/aitemplate/backend/cuda/elementwise/custom_math.cuh
rename to static/include/custom_math.cuh
diff --git a/python/aitemplate/backend/rocm/elementwise/custom_math.h b/static/include/custom_math.h
similarity index 97%
rename from python/aitemplate/backend/rocm/elementwise/custom_math.h
rename to static/include/custom_math.h
index caa84c424..72258cbbf 100644
--- a/python/aitemplate/backend/rocm/elementwise/custom_math.h
+++ b/static/include/custom_math.h
@@ -15,6 +15,17 @@
 #ifndef CUSTOM_MATH
 #define CUSTOM_MATH
 
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+
+using bfloat16 = hip_bfloat16;
+
+
+#include <hip/math_functions.h>
+#include <hip/device_functions.h>
+
+
 #ifndef __HALF2_TO_UI
 #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
 #endif
diff --git a/static/include/debug_utility.h b/static/include/debug_utility.h
index d5f7ce65c..cfbbf57b1 100644
--- a/static/include/debug_utility.h
+++ b/static/include/debug_utility.h
@@ -14,6 +14,21 @@
 #pragma once
 #include "device_functions-generated.h"
 
+namespace {
+template <typename T>
+__global__ void outputs_checker(const T* tensor, int64_t elem_cnt) {
+  for (int64_t i = 0; i < elem_cnt; i++) {
+    float v = (float)(*(tensor + i));
+    if (i != 0) {
+      printf(", ");
+    }
+    printf("%f", v);
+  }
+  printf("\n");
+}
+
+} // namespace
+
 namespace ait {
 void InvokeInfAndNanChecker(
     const half* tensor,
@@ -21,9 +36,14 @@ void InvokeInfAndNanChecker(
     int64_t elem_cnt,
     ait::StreamType stream);
 
+template <typename T>
 void InvokeOutputsChecker(
-    const half* tensor,
+    const T* tensor,
     const char* tensor_name,
     int64_t elem_cnt,
-    ait::StreamType stream);
+    ait::StreamType stream) {
+  printf("Tensor (%s) output:\n", tensor_name);
+  outputs_checker<<<1, 1, 0, stream>>>(tensor, elem_cnt);
+  ait::StreamSynchronize(stream);
+}
 } // namespace ait
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 18d3aa297..799e693bb 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -21,7 +21,7 @@
 #include <cstdlib>
 #include <initializer_list>
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "include/ck/utility/print.hpp"
+
 #include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
@@ -94,7 +94,7 @@ inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
       << "\n     ASCII string identifying device: " << prop.name
       << "\n     Major compute capability: " << prop.major
       << "\n     Minor compute capability: " << prop.minor
-      << "\n     AMD GCN Arch Value: " << prop.gcnArch
+      << "\n     AMD GCN Arch Value: " << prop.gcnArchName
       << "\n     PCI bus ID of the device: " << prop.pciBusID
       << "\n     PCI device ID of the device: " << prop.pciDeviceID
       << "\n  Memory limits: "
@@ -118,7 +118,7 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n     ASCII string identifying device: " << prop.name
       << "\n     Major compute capability: " << prop.major
       << "\n     Minor compute capability: " << prop.minor
-      << "\n     AMD GCN Arch Value: " << prop.gcnArch
+      << "\n     AMD GCN Arch Value: " << prop.gcnArchName
       << "\n     PCI bus ID of the device: " << prop.pciBusID
       << "\n     PCI device ID of the device: " << prop.pciDeviceID
 
diff --git a/test.py b/test.py
new file mode 100644
index 000000000..01495356f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,18 @@
+import asyncio
+
+async def A():
+    print('1')
+    print('2')
+    print('3')
+
+async def B():
+    print('4')
+    print('5')
+    print('6')
+
+# loop = asyncio.get_event_loop()
+# tasks = [A(), B()]
+asyncio.run_coroutine_threadsafe(A())
+asyncio.run(B())
+
+# loop.close()
\ No newline at end of file
diff --git a/tests/unittest/compiler/test_eliminate_permutations.py b/tests/unittest/compiler/test_eliminate_permutations.py
index 54ec64d6f..6c3117da2 100644
--- a/tests/unittest/compiler/test_eliminate_permutations.py
+++ b/tests/unittest/compiler/test_eliminate_permutations.py
@@ -28,7 +28,6 @@
 )
 
 
-@unittest.skip("Skip until we fix the accuracy issue")
 class EliminatePermutationTestCase(unittest.TestCase):
     def test_eliminate_permutation(self):
         dtype = "float"
@@ -190,6 +189,126 @@ def test_eliminate_permutation_all_permutations(self):
         self.assertEqual(len(result_graph), 3)
         self.assertTrue(graph_has_op(result_graph, "permute"))
 
+    def test_do_not_eliminate_permutation_of_strided_input(self):
+        dtype = "float"
+        shape = [3, 2, 4]
+        new_shape = [3, 2 * 2]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        s1 = ops.dynamic_slice()(
+            x, start_indices=[0, 0, 2], end_indices=[2147483647, 2147483647, 4]
+        )
+        p1 = ops.permute()(s1, dims=[0, 2, 1])
+        p2 = ops.permute()(p1, dims=[0, 2, 1])
+        z = ops.reshape()(p2, new_shape)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        with compile_model(
+            z, target, "./tmp", "test_do_not_eliminate_permutation_of_strided_input"
+        ) as module:
+            # Verify the generated graph.
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), 4)
+            self.assertTrue(graph_has_op(sorted_graph, "permute021"))
+
+            x_pt = get_random_torch_tensor(shape, dtype)
+            z_pt = get_torch_empty_tensor(new_shape, dtype)
+
+            module.run_with_tensors({"x": x_pt}, {"z": z_pt})
+
+            self.assertTrue(
+                torch.equal(
+                    torch.reshape(torch.split(x_pt, 2, dim=2)[1], new_shape), z_pt
+                )
+            )
+
+    def test_do_not_eliminate_permutation_of_strided_input2(self):
+        dtype = "float"
+        shape = [3, 4, 2]
+        new_shape = [3, 2 * 2]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 1])
+        s1 = ops.dynamic_slice()(
+            p1, start_indices=[0, 0, 2], end_indices=[2147483647, 2147483647, 4]
+        )
+        p2 = ops.permute()(s1, dims=[0, 2, 1])
+        z = ops.reshape()(p2, new_shape)
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        with compile_model(
+            z, target, "./tmp", "test_do_not_eliminate_permutation_of_strided_input2"
+        ) as module:
+            # Verify the generated graph.
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), 4)
+            self.assertTrue(graph_has_op(sorted_graph, "permute021"))
+
+            x_pt = get_random_torch_tensor(shape, dtype)
+            z_pt = get_torch_empty_tensor(new_shape, dtype)
+
+            module.run_with_tensors({"x": x_pt}, {"z": z_pt})
+
+            self.assertTrue(
+                torch.equal(
+                    torch.reshape(
+                        torch.permute(
+                            torch.split(torch.permute(x_pt, (0, 2, 1)), 2, dim=2)[1],
+                            (0, 2, 1),
+                        ),
+                        new_shape,
+                    ),
+                    z_pt,
+                )
+            )
+
+    def test_do_not_eliminate_permutation_of_reshaped_input(self):
+        dtype = "float"
+        shape = [3, 2, 4]
+        new_shape = [3, 2, 4]
+        target = detect_target()
+
+        x = Tensor(shape, name="x", dtype=dtype, is_input=True)
+        p1 = ops.permute()(x, dims=[0, 2, 1])
+        r1 = ops.reshape()(p1, new_shape)
+        p2 = ops.permute()(r1, dims=[0, 2, 1])
+        z = ops.dynamic_slice()(
+            p2, start_indices=[0, 0, 1], end_indices=[2147483647, 2147483647, 2]
+        )
+        z._attrs["is_output"] = True
+        z._attrs["name"] = "z"
+
+        with compile_model(
+            z, target, "./tmp", "test_do_not_eliminate_permutation_of_reshaped_input"
+        ) as module:
+            # Verify the generated graph.
+            sorted_graph = module.debug_sorted_graph
+            self.assertEqual(len(sorted_graph), 4)
+            self.assertTrue(graph_has_op(sorted_graph, "permute021"))
+
+            x_pt = get_random_torch_tensor(shape, dtype)
+            z_pt = get_torch_empty_tensor([3, 4, 1], dtype)
+
+            module.run_with_tensors({"x": x_pt}, {"z": z_pt})
+
+            self.assertTrue(
+                torch.equal(
+                    torch.split(
+                        torch.permute(
+                            torch.reshape(torch.permute(x_pt, (0, 2, 1)), new_shape),
+                            (0, 2, 1),
+                        ),
+                        1,
+                        dim=2,
+                    )[1],
+                    z_pt,
+                )
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unittest/compiler/test_merge_view_ops.py b/tests/unittest/compiler/test_merge_view_ops.py
new file mode 100644
index 000000000..561fbfc24
--- /dev/null
+++ b/tests/unittest/compiler/test_merge_view_ops.py
@@ -0,0 +1,414 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    count_ops,
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+    graph_has_op,
+)
+from aitemplate.utils import graph_utils
+
+
+class MergeViewOpsTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        torch.manual_seed(0)
+
+    def test_basic(self):
+        """
+        Check that we convert a sequence of reshape(unsqueeze(...)) into a
+        single reshape() call.
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y_shape = [8, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.unsqueeze(dim=1)(x1)
+        y = ops.reduce_sum(dim=1)(x2)
+        y._attrs["name"] = "y"
+        y._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y_pt = get_torch_empty_tensor(y_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            y,
+            target,
+            "./tmp",
+            "test_basic",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y": y_pt})
+
+        self.assertEqual(len(result_graph), 3)
+        self.assertFalse(graph_has_op(result_graph, "unsqueeze"))
+
+        expected = torch.reshape(x0_pt, y_shape)
+        torch.testing.assert_close(expected, y_pt, atol=5e-2, rtol=5e-2)
+
+    def test_multiple_sequential_views(self):
+        """
+        Check that we convert a sequence of reshape(unsqueeze(reshape(...)))
+        into a single reshape() call.
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y_shape = [8, 2, 4]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.unsqueeze(dim=1)(x1)
+        x3 = ops.reshape()(x2, [8, 1, 2, 4])
+        y = ops.reduce_sum(dim=1)(x3)
+        y._attrs["name"] = "y"
+        y._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y_pt = get_torch_empty_tensor(y_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            y,
+            target,
+            "./tmp",
+            "test_multiple_sequential_views",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y": y_pt})
+
+        self.assertEqual(len(result_graph), 3)
+        self.assertFalse(graph_has_op(result_graph, "unsqueeze"))
+
+        expected = torch.reshape(x0_pt, y_shape)
+        torch.testing.assert_close(expected, y_pt, atol=5e-2, rtol=5e-2)
+
+    def test_multiple_dst_view_ops(self):
+        """
+        Given
+
+          x0 -> reshape -> x1 -> unsqueeze -> ...
+                            |--> unsqueeze -> ...
+
+        We want to merge both unsqueeze calls into the preceding reshape call.
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y_shape = [8, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.unsqueeze(dim=1)(x1)
+        x3 = ops.unsqueeze(dim=2)(x1)
+
+        y0 = ops.reduce_sum(dim=1)(x2)
+        y0._attrs["name"] = "y0"
+        y0._attrs["is_output"] = True
+
+        y1 = ops.reduce_sum(dim=2)(x3)
+        y1._attrs["name"] = "y1"
+        y1._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y0_pt = get_torch_empty_tensor(y_shape, dtype)
+        y1_pt = get_torch_empty_tensor(y_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            [y0, y1],
+            target,
+            "./tmp",
+            "test_multiple_dst_view_ops",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y0": y0_pt, "y1": y1_pt})
+
+        self.assertEqual(len(result_graph), 5)
+        self.assertFalse(graph_has_op(result_graph, "unsqueeze"))
+        sorted_ops = graph_utils.get_sorted_ops(result_graph)
+        self.assertEqual(count_ops(sorted_ops, "reshape"), 2)
+
+        y_expected = torch.reshape(x0_pt, [8, 8])
+        torch.testing.assert_close(y_expected, y0_pt, atol=5e-2, rtol=5e-2)
+        torch.testing.assert_close(y_expected, y1_pt, atol=5e-2, rtol=5e-2)
+
+    def test_multiple_dst_ops(self):
+        """
+        Given
+
+          x0 -> reshape -> x1 -> unsqueeze -> ...
+                            |--> ...
+
+        We cannot eliminate x1 since it has a non-view-op destination, but we
+        can still merge the reshape and unsqueeze operators.
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y0_shape = [8]
+        y1_shape = [8, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.unsqueeze(dim=1)(x1)
+
+        y0 = ops.reduce_sum(dim=1)(x1)
+        y0._attrs["name"] = "y0"
+        y0._attrs["is_output"] = True
+
+        y1 = ops.reduce_sum(dim=1)(x2)
+        y1._attrs["name"] = "y1"
+        y1._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y0_pt = get_torch_empty_tensor(y0_shape, dtype)
+        y1_pt = get_torch_empty_tensor(y1_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            [y0, y1],
+            target,
+            "./tmp",
+            "test_multiple_dst_ops",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y0": y0_pt, "y1": y1_pt})
+
+        self.assertEqual(len(result_graph), 5)
+        self.assertFalse(graph_has_op(result_graph, "unsqueeze"))
+        sorted_ops = graph_utils.get_sorted_ops(result_graph)
+        self.assertEqual(count_ops(sorted_ops, "reshape"), 2)
+
+        y0_expected = torch.sum(torch.reshape(x0_pt, [8, 8]), 1)
+        y1_expected = torch.reshape(x0_pt, y1_shape)
+        torch.testing.assert_close(y0_expected, y0_pt, atol=5e-2, rtol=5e-2)
+        torch.testing.assert_close(y1_expected, y1_pt, atol=5e-2, rtol=5e-2)
+
+    def test_identity_reshape(self):
+        """
+        Given reshape(reshape(x, shape0), shape1), where shape1 is identical to
+        x's original shape, we can eliminate both reshape ops.
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y_shape = [2, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.reshape()(x1, x0_shape)
+
+        y = ops.reduce_sum(dim=1)(x2)
+        y._attrs["name"] = "y"
+        y._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y_pt = get_torch_empty_tensor(y_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            y,
+            target,
+            "./tmp",
+            "test_identity_reshape",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y": y_pt})
+
+        self.assertEqual(len(result_graph), 2)
+        self.assertFalse(graph_has_op(result_graph, "reshape"))
+        expected = torch.sum(x0_pt, 1)
+        torch.testing.assert_close(expected, y_pt, atol=5e-2, rtol=5e-2)
+
+    def test_identity_reshape_multiple_dst_ops(self):
+        """
+        Given
+
+          x0 -> reshape -> x1 -> reshape -> x2 -> op1
+                              -> op2
+
+        If x2 == x0, we can transform that into
+
+          x0 -> op1
+             -> reshape -> x1 -> op2
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y0_shape = [2, 8]
+        y1_shape = [8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+        x2 = ops.reshape()(x1, x0_shape)
+
+        y0 = ops.reduce_sum(dim=1)(x2)
+        y0._attrs["name"] = "y0"
+        y0._attrs["is_output"] = True
+
+        y1 = ops.reduce_sum(dim=1)(x1)
+        y1._attrs["name"] = "y1"
+        y1._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y0_pt = get_torch_empty_tensor(y0_shape, dtype)
+        y1_pt = get_torch_empty_tensor(y1_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            [y0, y1],
+            target,
+            "./tmp",
+            "test_identity_reshape_multiple_dst_ops",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y0": y0_pt, "y1": y1_pt})
+
+        self.assertEqual(len(result_graph), 4)
+        sorted_ops = graph_utils.get_sorted_ops(result_graph)
+        self.assertEqual(count_ops(sorted_ops, "reshape"), 1)
+
+        y0_expected = torch.sum(x0_pt, 1)
+        y1_expected = torch.sum(torch.reshape(x0_pt, [8, 8]), 1)
+        torch.testing.assert_close(y0_expected, y0_pt, atol=5e-2, rtol=5e-2)
+        torch.testing.assert_close(y1_expected, y1_pt, atol=5e-2, rtol=5e-2)
+
+    def test_identity_reshape_in_out_conflict(self):
+        """
+        If x is an input and y is an output tensor, then we can only eliminate
+        one view op in the following example:
+
+          y = reshape(reshape(x, y_shape), x_original_shape)
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        x1 = ops.reshape()(x0, [8, 8])
+
+        y = ops.reshape()(x1, x0_shape)
+        y._attrs["name"] = "y"
+        y._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y_pt = get_torch_empty_tensor(x0_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            y,
+            target,
+            "./tmp",
+            "test_identity_reshape_in_out_conflict",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y": y_pt})
+
+        self.assertEqual(len(result_graph), 2)
+        self.assertTrue(graph_has_op(result_graph, "reshape"))
+        torch.testing.assert_close(x0_pt, y_pt, atol=5e-2, rtol=5e-2)
+
+    def test_identity_reshape_out_out_conflict(self):
+        """
+        If y0 and y1 are both output tensors, then we can only eliminate one
+        view op in the following example:
+
+          y1 = reshape(reshape(y0, some_shape), y0_original_shape)
+        """
+        dtype = "float"
+        x0_shape = [2, 4, 8]
+        y_shape = [2, 8]
+
+        x0 = Tensor(
+            shape=x0_shape,
+            dtype=dtype,
+            name="x0",
+            is_input=True,
+        )
+        y0 = ops.reduce_sum(dim=1)(x0)
+        y0._attrs["name"] = "y0"
+        y0._attrs["is_output"] = True
+
+        x1 = ops.reshape()(y0, [4, 4])
+
+        y1 = ops.reshape()(x1, y_shape)
+        y1._attrs["name"] = "y1"
+        y1._attrs["is_output"] = True
+
+        x0_pt = get_random_torch_tensor(x0_shape, dtype)
+        y0_pt = get_torch_empty_tensor(y_shape, dtype)
+        y1_pt = get_torch_empty_tensor(y_shape, dtype)
+
+        target = detect_target()
+        module = compile_model(
+            [y0, y1],
+            target,
+            "./tmp",
+            "test_identity_reshape_out_out_conflict",
+        )
+        result_graph = module.debug_sorted_graph
+        module.run_with_tensors({"x0": x0_pt}, {"y0": y0_pt, "y1": y1_pt})
+
+        y_expected = torch.sum(x0_pt, 1)
+
+        self.assertEqual(len(result_graph), 3)
+        sorted_ops = graph_utils.get_sorted_ops(result_graph)
+        self.assertEqual(count_ops(sorted_ops, "reshape"), 1)
+        torch.testing.assert_close(y_expected, y0_pt, atol=5e-2, rtol=5e-2)
+        torch.testing.assert_close(y_expected, y1_pt, atol=5e-2, rtol=5e-2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unittest/compiler/test_remove_no_op_concats.py b/tests/unittest/compiler/test_remove_no_op_concats.py
new file mode 100644
index 000000000..d1d5d2d28
--- /dev/null
+++ b/tests/unittest/compiler/test_remove_no_op_concats.py
@@ -0,0 +1,147 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+from typing import Sequence
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor, graph_has_op
+
+
+class TestRemoveNoOpConcats(unittest.TestCase):
+    """
+    Tests the compiler's behavior of removing no-op concats.
+
+    NOTE: Whenever we include an empty input tensor, the non-empty input tensor
+    must be rank 1. That's because AIT's concat expects all its inputs to have
+    the same rank and have matching dimension sizes except along the
+    concatenating dimension.
+
+    We run the following tests:
+    # These are no-ops
+    1. inputs=[non-empty]
+    2. inputs=[rank-1 empty, rank-1 non-empty, rank-1 empty]
+    3. inputs=[empty]
+    4. inputs=[empty, empty]
+
+    # These are meaningful
+    5. inputs=[non-empty, non-empty]
+    6. inputs=[non-empty, empty, non-empty]
+
+    # These should have exceptions
+    7. inputs=[rank-2 non-empty, rank-1 empty]
+    8. inputs=[rank-2 non-empty, rank-2 empty]
+    """
+
+    def test_remove_no_op_concats_no_ops(self):
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[2, 4, 6]],
+            should_keep_concat=False,
+            test_name="test_remove_no_op_concats_single_non_empty",
+        )
+
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[0], [3], [0]],
+            should_keep_concat=False,
+            test_name="test_remove_no_op_concats_single_non_empty_and_double_empty",
+        )
+
+    def test_remove_no_op_concats_no_ops_all_empty(self):
+        """Below we test when all the input tensors are empty. fx2ait will fail
+        in these cases. However, it's possible to create it directly in AIT.
+        Therefore, we test this case and treat it as a no-op.
+        """
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[0, 0, 0]],
+            should_keep_concat=False,
+            test_name="test_remove_no_op_concats_single_empty",
+        )
+
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[0, 0, 0], [0, 0, 0]],
+            should_keep_concat=False,
+            test_name="test_remove_no_op_concats_double_empty",
+        )
+
+    def test_remove_no_op_concats_meaningful(self):
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[3, 5], [3, 5]],
+            should_keep_concat=True,
+            test_name="test_remove_no_op_concats_double_non_empty",
+        )
+
+        self._test_remove_no_op_concats_impl(
+            input_shapes=[[3], [0], [5]],
+            should_keep_concat=True,
+            test_name="test_remove_no_op_concats_two_non_empty_and_empty",
+        )
+
+    def test_remove_no_op_concats_exceptions(self):
+        """We expect this to raise an exception in these test cases."""
+
+        # AIT expects all concat inputs to have the same rank.
+        with self.assertRaises(RuntimeError):
+            self._test_remove_no_op_concats_impl(
+                input_shapes=[[2, 4], [0]],
+                should_keep_concat=False,
+                test_name="test_remove_no_op_concats_same_rank",
+            )
+
+        # AIT expects all concat inputs to have the same dimension sizes except for the concat_dim.
+        with self.assertRaises(RuntimeError):
+            self._test_remove_no_op_concats_impl(
+                input_shapes=[[2, 4], [0, 0]],
+                should_keep_concat=False,
+                test_name="test_remove_no_ops_concat_same_dim_sizes",
+            )
+
+    def _test_remove_no_op_concats_impl(
+        self,
+        input_shapes: Sequence[Sequence[int]],
+        should_keep_concat: bool,
+        test_name: str,
+    ):
+        inputs = [
+            Tensor(shape=shape, name=f"input_{i}", is_input=True)
+            for i, shape in enumerate(input_shapes)
+        ]
+        concatenated = ops.concatenate()(inputs)
+        c = Tensor(shape=[1], name="input_const", is_input=True)
+        model_output = (concatenated * c) + (concatenated / c)
+        model_output._attrs["name"] = "output_0"
+        model_output._attrs["is_output"] = True
+
+        inputs_pt = {
+            f"input_{i}": get_random_torch_tensor(shape=shape)
+            for i, shape in enumerate(input_shapes)
+        }
+        concatenated_pt = torch.concat(list(inputs_pt.values()))
+        c_pt = get_random_torch_tensor(shape=[1])
+        Y_pt = (concatenated_pt * c_pt) + (concatenated_pt / c_pt)
+        Y_ait = torch.empty_like(Y_pt)
+
+        with compile_model(model_output, detect_target(), "./tmp", test_name) as module:
+            module.run_with_tensors(
+                {**inputs_pt, "input_const": c_pt}, {"output_0": Y_ait}
+            )
+
+            self.assertEquals(
+                graph_has_op(module.debug_sorted_graph, "concatenate"),
+                should_keep_concat,
+            )
+            self.assertTrue(torch.allclose(Y_pt, Y_ait, atol=1e-2, rtol=1e-2))
diff --git a/tests/unittest/compiler/test_remove_no_op_dynamic_slices.py b/tests/unittest/compiler/test_remove_no_op_dynamic_slices.py
new file mode 100644
index 000000000..274ccdc5a
--- /dev/null
+++ b/tests/unittest/compiler/test_remove_no_op_dynamic_slices.py
@@ -0,0 +1,153 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+from typing import List
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.ops.tensor.dynamic_slice import MAX_INT32
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    gen_input_tensor,
+    get_random_torch_tensor,
+    graph_has_op,
+)
+
+
+class TestRemoveNoOpDynamicSlices(unittest.TestCase):
+    """
+    Tests the compiler's behavior when removing no-op dynamic slices.
+    """
+
+    def test_remove_no_op_dynamic_slices(self):
+        TEST_CASES = (
+            # These are no-ops.
+            {
+                # X[:]
+                "input_shape": [100],
+                "start_indices": [None],
+                "end_indices": [None],
+                "should_keep_dynamic_slice": False,
+            },
+            {
+                # X[0:]
+                "input_shape": [100],
+                "start_indices": [0],
+                "end_indices": [None],
+                "should_keep_dynamic_slice": False,
+            },
+            {
+                # X[:2_147_483_647, ]
+                "input_shape": [100, 100],
+                "start_indices": [None, 0],
+                "end_indices": [MAX_INT32, None],
+                "should_keep_dynamic_slice": False,
+            },
+            # These are meaningful.
+            {
+                # X[-7:-7]
+                "input_shape": [10],
+                "start_indices": [-7],
+                "end_indices": [-7],
+                "should_keep_dynamic_slice": True,
+            },
+            {
+                # X[7:, -7:, 0:]
+                "input_shape": [10, 10, 10],
+                "start_indices": [7, -7, 0],
+                "end_indices": [None, None, None],
+                "should_keep_dynamic_slice": True,
+            },
+            {
+                # X[:7, :-7, :0]
+                "input_shape": [10, 10, 10],
+                "start_indices": [None, None, None],
+                "end_indices": [7, -7, 0],
+                "should_keep_dynamic_slice": True,
+            },
+            {
+                # X[0:7, 0:-7]
+                "input_shape": [10, 10],
+                "start_indices": [0, 0],
+                "end_indices": [7, -7],
+                "should_keep_dynamic_slice": True,
+            },
+            {
+                # X[-7:7, 7:-7]
+                "input_shape": [10, 10],
+                "start_indices": [-7, 7],
+                "end_indices": [7, -7],
+                "should_keep_dynamic_slice": True,
+            },
+            {
+                # X[-7:7, 7:-7, :]
+                "input_shape": [10, 10, 10],
+                "start_indices": [-7, 7, None],
+                "end_indices": [7, -7, None],
+                "should_keep_dynamic_slice": True,
+            },
+        )
+
+        for i, test_kwargs in enumerate(TEST_CASES):
+            start_indices = ",".join(map(str, test_kwargs["start_indices"]))
+            end_indices = ",".join(map(str, test_kwargs["end_indices"]))
+
+            with self.subTest(
+                start=start_indices,
+                end=end_indices,
+                keep=test_kwargs["should_keep_dynamic_slice"],
+            ):
+                self._test_remove_no_op_dynamic_slices_impl(
+                    **test_kwargs,
+                    test_name=f"test_remove_no_op_dynamic_slice_{i}",
+                )
+
+    def _test_remove_no_op_dynamic_slices_impl(
+        self,
+        input_shape: List[int],
+        start_indices: List[int],
+        end_indices: List[int],
+        should_keep_dynamic_slice: bool,
+        test_name: str,
+    ):
+        X = gen_input_tensor(shape=input_shape, name="input_0")
+        X_sliced = ops.dynamic_slice()(X, start_indices, end_indices)
+        c = gen_input_tensor(shape=[1], name="input_const")
+        model_output = (X_sliced * c) + (X_sliced / c)
+        model_output._attrs["name"] = "output_0"
+        model_output._attrs["is_output"] = True
+
+        X_pt = get_random_torch_tensor(shape=input_shape)
+        slices = [slice(s, e) for s, e in zip(start_indices, end_indices)]
+        X_sliced_pt = X_pt[slices]
+        c_pt = get_random_torch_tensor(shape=[1])
+        Y_pt = (X_sliced_pt * c_pt) + (X_sliced_pt / c_pt)
+        Y_ait = torch.empty_like(Y_pt)
+
+        # NOTE: We don't run every optimization pass to avoid fusion between
+        # dynamic_slice and elementwise.
+        with compile_model(
+            model_output, detect_target(), "/tmp", test_name, do_optimize_graph=False
+        ) as module:
+            module.run_with_tensors(
+                {"input_0": X_pt, "input_const": c_pt}, {"output_0": Y_ait}
+            )
+
+            self.assertEqual(
+                graph_has_op(module.debug_sorted_graph, "dynamic_slice"),
+                should_keep_dynamic_slice,
+            )
+            self.assertTrue(torch.allclose(Y_pt, Y_ait, atol=1e-2, rtol=1e-3))
diff --git a/tests/unittest/compiler/test_remove_no_op_splits.py b/tests/unittest/compiler/test_remove_no_op_splits.py
new file mode 100644
index 000000000..8dec163cb
--- /dev/null
+++ b/tests/unittest/compiler/test_remove_no_op_splits.py
@@ -0,0 +1,168 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import unittest
+from typing import List, Sequence, Union
+
+import torch
+
+from aitemplate.compiler import compile_model, ops
+from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    gen_input_tensor,
+    get_random_torch_tensor,
+    graph_has_op,
+)
+
+
+class TestRemoveNoOpSplits(unittest.TestCase):
+    """
+    Tests _remove_no_op_splits() in remove_no_ops.py
+    """
+
+    def test_remove_no_op_split(self):
+        """
+        Test cases:
+        0. No-op split with split_size_or_sections as integer
+        1. No-op split with split_size_or_sections as a singleton list
+        2. No-op split with split_size > length along split_dim
+        3. No-op split with split_dim = -1
+        4. Meaningful split
+        5. Meaningful split with split_dim = -1
+        6. No-op split is a model output
+        7. Meaningful split is a model output
+        """
+
+        test_cases = (
+            # Split is a no-op.
+            {
+                "split_input_shape": (5,),
+                "split_size_or_sections": 5,
+                "split_dim": 0,
+                "split_is_output": False,
+                "should_remove_no_op_split": True,
+                "test_name": "test_remove_no_op_split_no_op_0",
+            },
+            {
+                "split_input_shape": (5,),
+                "split_size_or_sections": [5],
+                "split_dim": -1,
+                "split_is_output": False,
+                "should_remove_no_op_split": True,
+                "test_name": "test_remove_no_op_split_no_op_1",
+            },
+            {
+                "split_input_shape": (2, 3, 4),
+                "split_size_or_sections": 10,  # split_size > length along dim=1
+                "split_dim": 1,
+                "split_is_output": False,
+                "should_remove_no_op_split": True,
+                "test_name": "test_remove_no_op_split_no_op_2",
+            },
+            {
+                "split_input_shape": (2, 3, 4, 5),
+                "split_size_or_sections": [5],
+                "split_dim": -1,
+                "split_is_output": False,
+                "should_remove_no_op_split": True,
+                "test_name": "test_remove_no_op_split_no_op_3",
+            },
+            # Split is meaningful.
+            {
+                "split_input_shape": (7,),
+                "split_size_or_sections": 2,
+                "split_dim": 0,
+                "split_is_output": False,
+                "should_remove_no_op_split": False,
+                "test_name": "test_remove_no_op_split_meaningful_4",
+            },
+            {
+                "split_input_shape": (2, 3, 4, 5),
+                "split_size_or_sections": [2, 1, 2],
+                "split_dim": -1,
+                "split_is_output": False,
+                "should_remove_no_op_split": False,
+                "test_name": "test_remove_no_op_split_meaningful_5",
+            },
+            # Split is a model output.
+            {
+                "split_input_shape": (9,),
+                "split_size_or_sections": [9],
+                "split_dim": 0,
+                "split_is_output": True,
+                "should_remove_no_op_split": False,
+                "test_name": "test_remove_no_op_split_output_6",
+            },
+            {
+                "split_input_shape": (1, 9),
+                "split_size_or_sections": [4, 5],
+                "split_dim": -1,
+                "split_is_output": True,
+                "should_remove_no_op_split": False,
+                "test_name": "test_remove_no_op_split_output_7",
+            },
+        )
+
+        for i, test_kwargs in enumerate(test_cases):
+            with self.subTest(test_no=i):
+                self._test_remove_no_op_split_impl(**test_kwargs)
+
+    def _test_remove_no_op_split_impl(
+        self,
+        split_input_shape: Sequence[int],
+        split_size_or_sections: Union[int, List[int]],
+        split_dim: int,
+        split_is_output: bool,
+        should_remove_no_op_split: bool,
+        test_name: str,
+    ):
+        # Define model graph.
+        X = gen_input_tensor(shape=split_input_shape, name="input_0")
+        c = gen_input_tensor(shape=(1,), name="input_1")
+        Zs = ops.split()(X, split_size_or_sections, split_dim)
+
+        model_outputs = []
+        for i, Z in enumerate(Zs):
+            out = Z if split_is_output else Z + c
+            out._attrs["name"] = f"output_{i}"
+            out._attrs["is_output"] = True
+            model_outputs.append(out)
+
+        # Run PyTorch.
+        X_pt = get_random_torch_tensor(shape=split_input_shape)
+        c_pt = get_random_torch_tensor(shape=(1,))
+        Zs_pt = torch.split(X_pt, split_size_or_sections, split_dim)
+        outputs_pt = Zs_pt if split_is_output else [Z_pt + c_pt for Z_pt in Zs_pt]
+
+        # Run AIT.
+        with compile_model(
+            model_outputs, detect_target(), "./tmp", test_name
+        ) as module:
+            inputs_pt = (
+                {"input_0": X_pt}
+                if split_is_output
+                else {"input_0": X_pt, "input_1": c_pt}
+            )
+            outputs_ait = {
+                f"output_{i}": torch.empty_like(out_pt)
+                for (i, out_pt) in enumerate(outputs_pt)
+            }
+            module.run_with_tensors(inputs_pt, outputs_ait)
+
+            self.assertNotEqual(
+                graph_has_op(module.debug_sorted_graph, "split"),
+                should_remove_no_op_split,
+            )
+            for out_pt, out_ait in zip(outputs_pt, outputs_ait.values()):
+                self.assertTrue(torch.allclose(out_pt, out_ait, atol=1e-2, rtol=1e-3))
diff --git a/tests/unittest/compiler/test_slice_permute021_fusion.py b/tests/unittest/compiler/test_slice_permute021_fusion.py
index ea6d57c0c..d7e8b40c8 100644
--- a/tests/unittest/compiler/test_slice_permute021_fusion.py
+++ b/tests/unittest/compiler/test_slice_permute021_fusion.py
@@ -123,9 +123,9 @@ def test_slice_permute021_fusion(self):
         self._test_slice_permute021_fusion(
             N=2,
             K=2,
-            slice_input_shape=[120, 1211, 1200],
+            slice_input_shape=[3, 4, 120],
             slice_start_indices=[0, 0, 3],
-            slice_end_indices=[None, None, 1100],
+            slice_end_indices=[None, None, 110],
             dims=(0, 2, 1),
             test_name="slice_permute021",
             dtype="float16",
@@ -133,9 +133,9 @@ def test_slice_permute021_fusion(self):
         self._test_slice_permute021_fusion(
             N=2,
             K=2,
-            slice_input_shape=[123, 1211, 1200],
+            slice_input_shape=[3, 121, 4],
             slice_start_indices=[0, 5, 0],
-            slice_end_indices=[None, 1200, None],
+            slice_end_indices=[None, 115, None],
             dims=(0, 2, 1),
             test_name="slice_permute021",
             dtype="float16",
diff --git a/tests/unittest/compiler/test_strided_group_layernorm.py b/tests/unittest/compiler/test_strided_group_layernorm.py
index 02f24a795..afdf599d6 100644
--- a/tests/unittest/compiler/test_strided_group_layernorm.py
+++ b/tests/unittest/compiler/test_strided_group_layernorm.py
@@ -18,8 +18,12 @@
 
 import torch
 from aitemplate.compiler import compile_model, ops
-from aitemplate.frontend import Tensor
+from aitemplate.frontend import IntImm, IntVar, Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import (
+    get_random_torch_tensor,
+    get_torch_empty_tensor,
+)
 from aitemplate.utils import shape_utils, torch_utils
 
 
@@ -369,6 +373,144 @@ def test_slice_group_layer_norm_float(self):
             dtype="float32",
         )
 
+    @unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
+    def test_group_layernorm_no_cuda_illegal_memory_access(self):
+        """
+        This subgraph has led to CUDA illegal memory issues before.
+        Adding it as a unit test to ensure there are no regressions.
+        """
+        batch_size = IntVar(values=[1, 2048], name="batch_size")
+
+        unsqueeze_46_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_46_0",
+        )
+        unsqueeze_58_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_58_0",
+        )
+        unsqueeze_70_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_70_0",
+        )
+        unsqueeze_131_0 = Tensor(
+            shape=[batch_size, 3, 1],
+            is_input=True,
+            name="unsqueeze_131_0",
+        )
+        main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_weight = Tensor(
+            shape=[IntImm(256)],
+            is_input=True,
+            name="main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_weight",
+        )
+        main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_bias = Tensor(
+            shape=[IntImm(256)],
+            is_input=True,
+            name="main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_bias",
+        )
+
+        unsqueeze_83_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_83_0",
+        )
+        unsqueeze_95_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_95_0",
+        )
+        unsqueeze_107_0 = Tensor(
+            shape=[batch_size, 7680, 1],
+            is_input=True,
+            name="unsqueeze_107_0",
+        )
+        unsqueeze_358_0 = Tensor(
+            shape=[batch_size, 3, 1],
+            is_input=True,
+            name="unsqueeze_358_0",
+        )
+        main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_weight = Tensor(
+            shape=[IntImm(256)],
+            is_input=True,
+            name="main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_weight",
+        )
+        main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_bias = Tensor(
+            shape=[IntImm(256)],
+            is_input=True,
+            name="main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_bias",
+        )
+
+        concatenate_71_0 = ops.concatenate()(
+            inputs=[unsqueeze_46_0, unsqueeze_58_0, unsqueeze_70_0],
+            dim=2,
+        )
+        bmm_rrr_132_0 = ops.bmm_rrr()(concatenate_71_0, unsqueeze_131_0)
+        reshape_133_0 = ops.reshape()(bmm_rrr_132_0, shape=[-1, 30, 256])
+        layernorm_134_0 = ops.layernorm(normalized_shape=[IntImm(256)])(
+            reshape_133_0,
+            main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_weight,
+            main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_bias,
+        )
+        permute021_136_0 = ops.permute021()(layernorm_134_0)
+
+        concatenate_108_0 = ops.concatenate()(
+            inputs=[unsqueeze_83_0, unsqueeze_95_0, unsqueeze_107_0],
+            dim=2,
+        )
+        bmm_rrr_359_0 = ops.bmm_rrr()(concatenate_108_0, unsqueeze_358_0)
+        reshape_360_0 = ops.reshape()(bmm_rrr_359_0, shape=[-1, 30, 256])
+        layernorm_361_0 = ops.layernorm(normalized_shape=[IntImm(256)])(
+            reshape_360_0,
+            main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_weight,
+            main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_bias,
+        )
+        permute021_363_0 = ops.permute021()(layernorm_361_0)
+
+        outputs = [permute021_136_0, permute021_363_0]
+
+        for i, output in enumerate(outputs):
+            output._attrs["is_output"] = True
+            output._attrs["name"] = f"output_{i}"
+
+        model = compile_model(
+            outputs,
+            detect_target(),
+            "./tmp",
+            "test_group_layernorm_repro",
+        )
+
+        pt_inputs = {
+            "unsqueeze_46_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_58_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_70_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_131_0": get_random_torch_tensor(shape=[1024, 3, 1]),
+            "main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_weight": get_random_torch_tensor(
+                shape=[256]
+            ),
+            "main_module_base_forward_module_over_arch_bottom_arch_list_1_dime_shared_arch_layer_norm__norm_bias": get_random_torch_tensor(
+                shape=[256]
+            ),
+            "unsqueeze_83_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_95_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_107_0": get_random_torch_tensor(shape=[1024, 7680, 1]),
+            "unsqueeze_358_0": get_random_torch_tensor(shape=[1024, 3, 1]),
+            "main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_weight": get_random_torch_tensor(
+                shape=[256]
+            ),
+            "main_module_base_forward_module_over_arch_bottom_arch_list_0_dime_shared_arch_layer_norm__norm_bias": get_random_torch_tensor(
+                shape=[256]
+            ),
+        }
+        pt_outputs = {
+            "output_0": get_torch_empty_tensor(shape=[1024, 256, 30]),
+            "output_1": get_torch_empty_tensor(shape=[1024, 256, 30]),
+        }
+
+        model.run_with_tensors(pt_inputs, pt_outputs)
+
 
 if __name__ == "__main__":
     torch.manual_seed(0)
diff --git a/tests/unittest/ops/test_gemm_bias.py b/tests/unittest/ops/test_gemm_bias.py
index cd276b739..330a030f2 100644
--- a/tests/unittest/ops/test_gemm_bias.py
+++ b/tests/unittest/ops/test_gemm_bias.py
@@ -149,6 +149,73 @@ def test_rcr_sm90(self) -> None:
                 dtype="bfloat16",
             )
 
+    def _test_rrr(self, Ms, N, K, test_name, dtype="float16"):
+        target = detect_target()
+        tolerance_limits = _TOLERANCE_LIMITS[dtype]
+        MDim = shape_utils.gen_int_var_min_max(Ms, name="m")
+        X = Tensor(shape=[MDim, IntImm(K)], dtype=dtype, name="input_0", is_input=True)
+        W = Tensor(
+            shape=[IntImm(K), IntImm(N)], dtype=dtype, name="input_1", is_input=True
+        )
+        B = Tensor(shape=[IntImm(N)], dtype=dtype, name="input_2", is_input=True)
+        OP = ops.gemm_rrr_bias()
+        Y = OP(X, W, B)
+        Y._attrs["name"] = "output_0"
+        Y._attrs["is_output"] = True
+        module = compile_model(
+            Y, target, "./tmp", f"gemm_rrr_bias_{test_name}_{self._test_id}"
+        )
+        self._test_id += 1
+
+        for M in Ms:
+            X_pt = get_random_torch_tensor([M, K], dtype)
+            W_pt = get_random_torch_tensor([N, K], dtype)
+            B_pt = get_random_torch_tensor([N], dtype)
+            Y_pt = torch.nn.functional.linear(X_pt, W_pt, bias=B_pt)
+
+            W_transpose_pt = torch.transpose(W_pt, 0, 1).contiguous()
+            y = get_torch_empty_tensor([M, N], dtype)
+            module.run_with_tensors(
+                {"input_0": X_pt, "input_1": W_transpose_pt, "input_2": B_pt},
+                [y],
+            )
+            if X_pt.nelement() == 0 or W_pt.nelement() == 0:
+                pass
+            else:
+                torch.testing.assert_close(Y_pt, y, **tolerance_limits)
+
+    def test_rrr_zero_size(self):
+        target = detect_target()
+        # This test triggered a c10 assertion failure internally
+        # caffe2/c10/util/SmallVector.h:338:
+        # Assertion `idx < size()' failed
+        if type(target).__name__ != "FBCUDA":
+            self._test_rrr([2], N=64, K=0, test_name="zero_k")
+        self._test_rrr([2], N=0, K=4, test_name="zero_n")
+        self._test_rrr([0], N=4, K=4, test_name="zero_m")
+
+    def test_rrr_static(self):
+        self._test_rrr([4096], N=4, K=4, test_name="static")
+        self._test_rrr([1000], N=81, K=1024, test_name="static")
+        self._test_rrr([67200], N=3, K=256, test_name="static")
+
+    def test_rrr_static_rocm(self):
+        self._test_rrr([4096], N=4, K=4, test_name="static")
+        self._test_rrr([1000], N=81, K=1024, test_name="static")
+        self._test_rrr([67200], N=3, K=256, test_name="static")
+
+    def test_rrr_bfloat16_bf16(self):
+        dtype = "bfloat16"
+        self._test_rrr([4], N=2, K=11, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rrr([128], N=64, K=1024, test_name=f"static_{dtype}", dtype=dtype)
+        self._test_rrr(
+            [1, 7, 64, 127],
+            N=64,
+            K=1024,
+            test_name=f"dynamic_m_{dtype}",
+            dtype=dtype,
+        )
+
 
 filter_test_cases_by_test_env(GEMMBiasTestCase)
 
diff --git a/tests/unittest/ops/test_index_select.py b/tests/unittest/ops/test_index_select.py
new file mode 100644
index 000000000..a219ab311
--- /dev/null
+++ b/tests/unittest/ops/test_index_select.py
@@ -0,0 +1,287 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+Unittests for masked_select Operator.
+"""
+import logging
+import random
+import unittest
+
+import torch
+from aitemplate.compiler import compile_model, ops
+from aitemplate.compiler.base import IntVar
+from aitemplate.frontend import Tensor
+from aitemplate.testing import detect_target
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from parameterized import parameterized
+
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipIf(
+    detect_target().name() == "rocm", "masked_select is not implemented for ROCm"
+)
+class IndexSelectTest(unittest.TestCase):
+    @staticmethod
+    def _get_output_shape(shape, dim_idx, dim_idx_len):
+        ret = []
+        for idx, dim in enumerate(shape):
+            if idx == dim_idx:
+                ret.append(dim_idx_len)
+                continue
+            ret.append(dim)
+        return ret
+
+    def _test_index_select(
+        self,
+        shape=(2, 2),
+        x_shape=None,
+        dim_idxs_shape=None,
+        dim_idx=1,
+        dim_idx_len=1,
+        test_name="index_select",
+        dtype="float16",
+        benchmark=False,
+        dim_idxs=None,
+    ):
+
+        X1 = Tensor(
+            shape=shape if x_shape is None else x_shape,
+            dtype=dtype,
+            name="x",
+            is_input=True,
+        )
+        X2 = Tensor(
+            shape=(dim_idx_len,) if dim_idxs_shape is None else dim_idxs_shape,
+            dtype="int64",
+            name="dim_idxs",
+            is_input=True,
+        )
+        X4_op = ops.index_select(dim_idx)
+        X4 = X4_op(X1, X2)
+        X4._attrs["is_output"] = True
+        X4._attrs["name"] = "output_values"
+
+        target = detect_target()
+        module = compile_model([X4], target, "./tmp", test_name)
+        x = get_random_torch_tensor(shape, dtype=dtype)
+        y = torch.empty(
+            IndexSelectTest._get_output_shape(shape, dim_idx, dim_idx_len),
+            dtype=x.dtype,
+            device=x.device,
+        )
+
+        if dim_idxs is None:
+            dim_idxs = torch.arange(end=dim_idx_len, dtype=torch.int64, device=x.device)
+
+        y_ait = module.run_with_tensors([x, dim_idxs], [y])["output_values"]
+        y_pt = torch.index_select(x, dim_idx, dim_idxs)
+        self.assertTrue(torch.equal(y_pt, y_ait))
+
+        if benchmark:
+            print(
+                f"Benchmarking with shape={shape}, dim_idx={dim_idx}, dim_idx_len={dim_idx_len}, dtype={dtype}"
+            )
+            # Warm up.
+            for _ in range(5):
+                module.run_with_tensors([x, dim_idxs], [y])
+            # Benchmark.
+            num_benchmark_iter = 1000
+
+            time_per_iter_ms, time_std, _ = module.benchmark_with_tensors(
+                [x, dim_idxs], [y], count=num_benchmark_iter
+            )
+
+            print(f"AITemplate time: {time_per_iter_ms:.4f}ms")
+
+            func = torch.index_select
+            args = (x, dim_idx, dim_idxs)
+            # Warm up.
+            for _ in range(5):
+                func(*args)
+            # Benchmark.
+            torch_time_per_iter_ms = benchmark_torch_function(
+                num_benchmark_iter, func, *args
+            )
+            print(f"PyTorch time: {torch_time_per_iter_ms:.4f}ms")
+
+            print(f"Speedup: {torch_time_per_iter_ms / time_per_iter_ms:.6f}x")
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float32 not supported in ROCm")
+    @parameterized.expand(
+        [
+            [
+                (IntVar(values=[1, 6]), IntVar(values=[1, 6])),
+                (IntVar(values=(0, 2)),),
+                (2, 2),
+                False,
+            ],
+            [
+                (IntVar(values=[1, 2048]), 1024, 7),
+                (IntVar(values=[1, 512]),),
+                (2048, 1024, 7),
+                False,  # change for benchmark
+                2,
+                7,
+            ],
+            [
+                (IntVar(values=[1, 2048]), 1024, 7),
+                (IntVar(values=[1, 512]),),
+                (2048, 1024, 7),
+                False,
+                1,
+                512,
+            ],
+            [
+                (IntVar(values=[1, 2048]), 1024, 7),
+                (IntVar(values=[1, 2048]),),
+                (2048, 1024, 7),
+                False,
+                0,
+                2048,
+            ],
+        ]
+    )
+    def test_dynamic_shape(
+        self,
+        x_shape=None,
+        dim_idxs_shape=None,
+        shape=(2, 2),
+        benchmark=False,
+        dim_idx=1,
+        dim_idx_len=1,
+        test_name="dynamic_index_select",
+        dtype="float16",
+    ):
+        self._test_index_select(
+            shape,
+            x_shape,
+            dim_idxs_shape,
+            dim_idx,
+            dim_idx_len,
+            test_name,
+            dtype,
+            benchmark,
+        )
+
+    def test_repeated_and_out_of_order(self):
+        self._test_index_select(
+            shape=(5, 4, 3, 2),
+            dim_idx=1,
+            dim_idx_len=10,
+            test_name="index_select_repeat",
+            dtype="float16",
+            dim_idxs=torch.tensor(
+                [3, 2, 0, 1, 2, 3, 3, 2, 1, 0], dtype=torch.int64, device="cuda"
+            ),
+        )
+
+    def test_negative_dim(self):
+        for dim_idx in range(1, 5):
+            self._test_index_select(
+                shape=(5, 4, 3, 2),
+                dim_idx=-dim_idx,
+                dim_idx_len=1,
+                test_name="index_select_negative_idx",
+                dtype="float16",
+            )
+
+    @unittest.skipIf(detect_target().name() == "rocm", "float32 not supported in ROCm")
+    @parameterized.expand(
+        [
+            [(5, 4, 3, 2), False],
+            # [(2, 6), False],
+            # [(20, 6), False],
+            # [(300, 80), False],
+            # Uncomment to benchmark
+            # [(5, 4, 3, 2), False],
+            # [(2, 6), False],
+            # [(20, 6), False],
+            # [(300, 80), True],
+            # [(1024, 128, 256), True],
+            # [(1024, 1024, 100), True],
+            # [(1, 1), True],
+            # [(10, 1), True],
+            # [(100, 1), True],
+            # [(1000, 1), True],
+            # [(10000, 1), True],
+            # [(100000, 1), True],
+            # [(1000000, 1), True],
+            # [(10000000, 1), True],
+            # [(100000000, 1), True],
+            # [(10000, 10000), True],
+            # [(10, 10, 10, 10, 10, 10, 10, 10), True],
+        ]
+    )
+    def test_fp32(self, shape, benchmark):
+        torch.manual_seed(1024)
+        random.seed(1024)
+        for idx, _ in enumerate(shape):
+            for dim_idx_len in [1, int(shape[idx] / 2), shape[idx]]:
+                self._test_index_select(
+                    shape=shape,
+                    dim_idx=idx,
+                    dim_idx_len=dim_idx_len if dim_idx_len > 0 else 1,
+                    test_name="index_select_fp32",
+                    dtype="float32",
+                    benchmark=benchmark,
+                )
+
+    @parameterized.expand(
+        [
+            [(5, 4, 3, 2), False],
+            # [(2, 6), False],
+            # [(20, 6), False],
+            # [(300, 80), False],
+            # Uncomment to benchmark
+            # [(5, 4, 3, 2), True],
+            # [(2, 6), True],
+            # [(20, 6), True],
+            # [(300, 80), True],
+            # [(1024, 128, 256), True],
+            # [(1024, 1024, 100), True],
+            # [(1, 1), True],
+            # [(10, 1), True],
+            # [(100, 1), True],
+            # [(1000, 1), True],
+            # [(10000, 1), True], #revisit
+            # [(100000, 1), True],
+            # [(1000000, 1), True],
+            # [(10000000, 1), True],
+            # [(100000000, 1), True],
+            # [(10000, 10000), True],
+            # [(10, 10, 10, 10, 10, 10, 10, 10), True],
+        ]
+    )
+    def test_fp16(self, shape, benchmark=False):
+        torch.manual_seed(1024)
+        random.seed(1024)
+        for idx, _ in enumerate(shape):
+            for dim_idx_len in [1, int(shape[idx] / 2), shape[idx]]:
+                self._test_index_select(
+                    shape=shape,
+                    dim_idx=idx,
+                    dim_idx_len=dim_idx_len if dim_idx_len > 0 else 1,
+                    test_name="index_select_fp16",
+                    dtype="float16",
+                    benchmark=benchmark,
+                )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(1024)
+    random.seed(1024)
+    unittest.main()
diff --git a/tests/unittest/ops/test_reduce.py b/tests/unittest/ops/test_reduce.py
index ed0b1a0d2..38fe1522e 100644
--- a/tests/unittest/ops/test_reduce.py
+++ b/tests/unittest/ops/test_reduce.py
@@ -45,6 +45,7 @@ def _run_reduce(
         keepdim,
         input_type="float16",
         output_type=None,
+        use_fp16_acc=False,
         rtol=1e-2,
         atol=1e-2,
     ):
@@ -54,7 +55,7 @@ def _run_reduce(
                 input_shape=input_shape, dim=dim
             )
         )
-        target = detect_target()
+        target = detect_target(use_fp16_acc=use_fp16_acc)
         X = Tensor(shape=input_shape, dtype=input_type, name="input_0", is_input=True)
 
         if keepdim is None:
@@ -95,6 +96,7 @@ def _run_reduce_sum(
         keepdim,
         input_type="float16",
         output_type=None,
+        use_fp16_acc=False,
         rtol=1e-2,
         atol=1e-2,
     ):
@@ -107,6 +109,7 @@ def _run_reduce_sum(
             keepdim=keepdim,
             input_type=input_type,
             output_type=output_type,
+            use_fp16_acc=use_fp16_acc,
             rtol=rtol,
             atol=atol,
         )
@@ -186,6 +189,16 @@ def test_reduce_sum(self):
             input_type="float16",
             output_type="float16",
         )
+        self._run_reduce_sum(
+            dim=1,
+            input_shape=[5, 4, 3],
+            keepdim=False,
+            input_type="float16",
+            output_type="float16",
+            use_fp16_acc=True,
+            rtol=1e-1,
+            atol=1e-1,
+        )
         self._run_reduce_sum(
             dim=2,
             input_shape=[5, 4, 3],
@@ -217,6 +230,7 @@ def _run_reduce_mean(
         keepdim,
         input_type="float16",
         output_type=None,
+        use_fp16_acc=False,
     ):
         self._run_reduce(
             test_name=f"reduce_mean_{input_type}_{output_type}",
@@ -227,6 +241,7 @@ def _run_reduce_mean(
             keepdim=keepdim,
             input_type=input_type,
             output_type=output_type,
+            use_fp16_acc=use_fp16_acc,
         )
 
     def test_reduce_mean(self):
@@ -384,6 +399,14 @@ def test_reduce_mean(self):
             input_type="float16",
             output_type="float16",
         )
+        self._run_reduce_mean(
+            dim=0,
+            input_shape=[1270, 1223],
+            keepdim=False,
+            input_type="float16",
+            output_type="float16",
+            use_fp16_acc=True,
+        )
 
     def _run_batched_reduce(
         self,
@@ -397,6 +420,7 @@ def _run_batched_reduce(
         keepdim,
         input_type="float16",
         output_type=None,
+        use_fp16_acc=False,
     ):
         torch.manual_seed(0)
         _LOGGER.info(f"Test {batch_sizes=}, {non_batch_shape=}, {dim=}")
@@ -447,6 +471,7 @@ def _run_batched_reduce_sum(
         keepdim,
         input_type="float16",
         output_type=None,
+        use_fp16_acc=False,
     ):
         self._run_batched_reduce(
             test_name=f"reduce_sum_batched_{input_type}_{output_type}",
@@ -458,6 +483,7 @@ def _run_batched_reduce_sum(
             keepdim=keepdim,
             input_type=input_type,
             output_type=output_type,
+            use_fp16_acc=use_fp16_acc,
         )
 
     def test_batched_reduce_sum(self):
@@ -468,6 +494,16 @@ def test_batched_reduce_sum(self):
             keepdim=True,
             input_type="float16",
             output_type=None,
+            use_fp16_acc=True,
+        )
+        self._run_batched_reduce_sum(
+            dim=1,
+            batch_sizes=[10, 2048],
+            non_batch_shape=[2, 1944],
+            keepdim=True,
+            input_type="float16",
+            output_type=None,
+            use_fp16_acc=False,
         )
 
     @unittest.skipIf(detect_target().name() == "rocm", "fp32 not supported in ROCm")
diff --git a/tests/unittest/ops/test_slice.py b/tests/unittest/ops/test_slice.py
index b3d5789c3..09ebb3775 100644
--- a/tests/unittest/ops/test_slice.py
+++ b/tests/unittest/ops/test_slice.py
@@ -75,6 +75,11 @@ def _run_dynamic_slice(
         self.test_count += 1
 
     def test_dynamic_slice(self):
+        self._run_dynamic_slice(
+            input_shape=[10, 13],
+            start_indices=[None, None],
+            end_indices=[None, None],
+        )
         self._run_dynamic_slice(
             input_shape=[1],
             start_indices=[0],
@@ -224,6 +229,12 @@ def _run_batch_dynamic_slice(
         self.test_count += 1
 
     def test_batch_dynamic_slice(self):
+        self._run_batch_dynamic_slice(
+            batch_sizes=[5, 20],
+            input_shape=[2, 3, 4],
+            start_indices=[None, None, None, None],
+            end_indices=[None, None, None, None],
+        )
         self._run_batch_dynamic_slice(
             batch_sizes=[1, 1],
             input_shape=[1],
diff --git a/tests/unittest/ops/test_softmax.py b/tests/unittest/ops/test_softmax.py
index 7f31604e9..53b0c416e 100644
--- a/tests/unittest/ops/test_softmax.py
+++ b/tests/unittest/ops/test_softmax.py
@@ -15,7 +15,12 @@
 """
 Unittests for LayerNorm Operator.
 """
+import json
+import math
+import tempfile
 import unittest
+from collections import namedtuple
+from statistics import mean
 
 import torch
 
@@ -23,13 +28,14 @@
 from aitemplate.compiler.base import IntVar
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.profile import profile_callable
 from aitemplate.testing.test_utils import filter_test_cases_by_params, TestEnv
 from aitemplate.utils.torch_utils import string_to_torch_dtype
 from parameterized import parameterized
 
 
 class SoftmaxTestCase(unittest.TestCase):
-    def _test_softmax(
+    def _build_model(
         self,
         batch_sizes=(1, 1024),
         input_shapes=(6,),
@@ -42,7 +48,7 @@ def _test_softmax(
             self.skipTest(f"Rocm doesn't support {dtype}")
         if target.name() == "cuda" and dtype == "bfloat16" and int(target._arch) < 80:
             self.skipTest(f"CUDA SM{target._arch} doesn't support {dtype}")
-        torch_dtype = string_to_torch_dtype(dtype)
+
         X = Tensor(
             shape=[IntVar(name="input_batch", values=list(batch_sizes)), *input_shapes],
             dtype=dtype,
@@ -51,9 +57,20 @@ def _test_softmax(
         )
         Y = ops.softmax()(X, dim)
         Y._attrs["is_output"] = True
-        Y._attrs["name"] = "output"
+        Y._attrs["name"] = "Y"
 
-        module = compile_model(Y, target, "./tmp", testname)
+        return compile_model(Y, target, "./tmp", testname)
+
+    def _test_softmax(
+        self,
+        batch_sizes=(1, 1024),
+        input_shapes=(6,),
+        dim=-1,
+        dtype="float16",
+        testname="softmax",
+    ):
+        module = self._build_model(batch_sizes, input_shapes, dim, dtype, testname)
+        torch_dtype = string_to_torch_dtype(dtype)
 
         for batch_size in batch_sizes:
             x_pt = torch.randn(batch_size, *input_shapes, dtype=torch_dtype).cuda()
@@ -68,6 +85,8 @@ def _test_softmax(
             {
                 TestEnv.CUDA_LESS_THAN_SM80: [
                     ("dim_1_fp16", "float16", (1, 1024), (6,), 1),
+                    ("tail_shapes_all_1_fp16", "float16", (1, 2), (6, 1, 1), 1),
+                    ("tail_shapes_not_all_1_fp16", "float16", (1, 2), (6, 1, 2), 1),
                     ("odd_small_fp16", "float16", (1, 13), (11,)),
                     ("odd_mid_fp16", "float16", (1, 4096), (33,)),
                     ("odd_large_fp16", "float16", (2, 31), (1409,)),
@@ -100,6 +119,8 @@ def _test_softmax(
                 ],
                 TestEnv.CUDA_SM80: [
                     ("dim_1_bf16", "bfloat16", (1, 2), (6,), 1),
+                    ("tail_shapes_all_1_bf16", "bfloat16", (1, 2), (6, 1, 1), 1),
+                    ("tail_shapes_not_all_1_bf16", "bfloat16", (1, 2), (6, 1, 2), 1),
                     ("odd_small_bf16", "bfloat16", (1, 2), (11,)),
                     ("odd_mid_bf16", "bfloat16", (1, 2), (33,)),
                     ("odd_large_bf16", "bfloat16", (1, 2), (1409,)),
@@ -133,6 +154,83 @@ def test_softmax(
             dim=dim,
         )
 
+    def _test_benchmark_softmax(self):
+        dtype = "float16"
+        torch_dtype = string_to_torch_dtype(dtype)
+        BenchResult = namedtuple(
+            "BenchResult", ["dim", "batch_size", "permute_ms", "softmax_ms"]
+        )
+        results = []
+        shape = (260, 4)
+        batch_sizes = [2**p for p in range(0, 16)]
+        for reduction_dim in [-1, -2]:
+            module = self._build_model(
+                batch_sizes,
+                shape,
+                reduction_dim,
+                dtype,
+                f"bench_softmax_{abs(reduction_dim)}",
+            )
+
+            for batch_size in batch_sizes:
+                x_pt = torch.ones(batch_size, *shape, dtype=torch_dtype).cuda()
+                y_pt = torch.empty([batch_size, *shape], dtype=torch_dtype).cuda()
+                with tempfile.NamedTemporaryFile("r") as f:
+                    module.profile_with_tensors(
+                        inputs={"X": x_pt},
+                        outputs={"Y": y_pt},
+                        num_iters=1000,
+                        filename=f.name,
+                    )
+                    profiling_data = json.loads(f.read())
+
+                    permute_ms = 0
+                    softmax_ms = 0
+                    for func_name, record in profiling_data.items():
+                        if func_name.startswith("permute"):
+                            permute_ms += record["ms_per_iter"]
+                        elif func_name.startswith("softmax"):
+                            softmax_ms += record["ms_per_iter"]
+                    results.append(
+                        BenchResult(reduction_dim, batch_size, permute_ms, softmax_ms)
+                    )
+
+        for r in results:
+            items = r.batch_size * math.prod(shape)
+            runtime_ms = r.permute_ms + r.softmax_ms
+            print(
+                f"{r.dim=}, {items=}, {r.permute_ms=}, {r.softmax_ms=}, {runtime_ms=}"
+            )
+
+    def _test_benchmark_pytorch_softmax(self):
+        batch_sizes = [2**p for p in range(0, 16)]
+        shape = (260, 4)
+        dtype = "float16"
+        torch_dtype = string_to_torch_dtype(dtype)
+        BenchResult = namedtuple("BenchResult", ["dim", "batch_size", "runtime_ms"])
+        cache_flush_slab = torch.empty(
+            size=[40, 1024, 1024],  # A100 L2 cache size
+            dtype=torch.float16,
+        ).cuda()
+
+        results = []
+        for reduction_dim in [-1, -2]:
+            for batch_size in batch_sizes:
+                x_pt = torch.ones(batch_size, *shape, dtype=torch_dtype).cuda()
+                _, wall_times = profile_callable(
+                    lambda: torch.nn.functional.softmax(x_pt, dim=reduction_dim),
+                    cache_flush_slab,
+                    n_iter=1000,
+                )
+                results.append(
+                    BenchResult(reduction_dim, batch_size, mean(wall_times) / 1000.0)
+                )
+
+        for r in results:
+            items = r.batch_size * math.prod(shape)
+            print(f"{r.dim=}, {items=}, {r.runtime_ms=}")
+
 
 if __name__ == "__main__":
+    torch.manual_seed(0)
     unittest.main()
diff --git a/tests/unittest/util/test_debug_utils.py b/tests/unittest/util/test_debug_utils.py
index a8e31c6b5..a86059d91 100644
--- a/tests/unittest/util/test_debug_utils.py
+++ b/tests/unittest/util/test_debug_utils.py
@@ -21,11 +21,14 @@
 import torch
 
 from aitemplate.compiler import compile_model, ops
-from aitemplate.compiler.base import IntImm
+from aitemplate.compiler.base import IntImm, IntVarTensor
 from aitemplate.compiler.ops.common.epilogue import FuncEnum
 from aitemplate.frontend import Tensor
 from aitemplate.testing import detect_target
+from aitemplate.testing.test_utils import get_random_torch_tensor
+from aitemplate.utils import shape_utils
 from aitemplate.utils.debug_settings import AITDebugSettings
+from aitemplate.utils.torch_utils import string_to_torch_dtype
 
 
 def _test_inf_and_nan(
@@ -65,11 +68,11 @@ def test_inf_and_nan(capfd):
 
 
 def _test_outputs(
-    check_tensor, check_all, test_name, capfd: pytest.CaptureFixture[str]
+    check_tensor, check_all, test_name, dtype, capfd: pytest.CaptureFixture[str]
 ):
     X1 = Tensor(
         shape=[IntImm(1), IntImm(3)],
-        dtype="float16",
+        dtype=dtype,
         name="input0",
         is_input=True,
     )
@@ -85,7 +88,11 @@ def _test_outputs(
         X2, target, "./tmp", test_name, debug_settings=debug_settings
     )
 
-    x1_pt = torch.Tensor([[1.0, 1.5, 2.0]]).cuda().half()
+    x1_pt = (
+        torch.Tensor([[1.0, 1.5, 2.0]])
+        .to(dtype=string_to_torch_dtype(dtype))
+        .to("cuda")
+    )
     x2 = torch.empty_like(x1_pt)
     module.run_with_tensors([x1_pt], [x2])
 
@@ -106,9 +113,65 @@ def _test_outputs(
 
 
 def test_outputs(capfd):
-    _test_outputs(True, False, "test_outputs_tensor", capfd)
-    _test_outputs(False, True, "test_outputs_all", capfd)
-    _test_outputs(True, True, "test_outputs_both", capfd)
+    _test_outputs(True, False, "test_outputs_tensor", "float16", capfd)
+    _test_outputs(False, True, "test_outputs_all", "float16", capfd)
+    _test_outputs(True, True, "test_outputs_both_float16", "float16", capfd)
+    _test_outputs(True, True, "test_outputs_both_float32", "float32", capfd)
+
+
+@pytest.mark.skipif(
+    detect_target().name == "rocm" or int(detect_target()._arch) < 80,
+    reason="bfloat16 tests requires CUDA sm >= 80",
+)
+def test_outputs_bf16(capfd):
+    _test_outputs(True, True, "test_outputs_both_bfloat16", "bfloat16", capfd)
+
+
+def _test_with_int_var_tensor(test_name, dtype):
+    target = detect_target()
+    batch_size = (3, 5)
+    x1_size = (2, 3)
+    X_shape = (32, 64)
+    b_dim = shape_utils.gen_int_var_min_max(batch_size, name="input_batch")
+    x1_dim = shape_utils.gen_int_var_min_max(x1_size, name="input_size")
+    X = Tensor(
+        shape=[b_dim, x1_dim, *X_shape],
+        dtype=dtype,
+        name="input_0",
+        is_input=True,
+    )
+
+    Y1 = ops.size()(X)
+    Y2 = ops.getitem()(Y1, 0)
+    Y3 = ops.getitem()(Y1, 1)
+    Y4 = ops.getitem()(Y1, 2)
+    Y5 = ops.getitem()(Y1, 3)
+    f1 = ops.int_elementwise(FuncEnum.MUL)(Y4, Y5)
+    f2 = IntVarTensor(IntImm(12))
+
+    Y = ops.reshape()(X, [Y2 * Y3 * f1 / f2, f2])
+    Y._attrs["name"] = "output_0"
+    Y._attrs["is_output"] = True
+    debug_settings = AITDebugSettings(
+        check_all_outputs=True, check_all_nan_and_inf=True
+    )
+    module = compile_model(Y, target, "./tmp", test_name, debug_settings=debug_settings)
+
+    for b, x1 in zip(batch_size, x1_size):
+        X_shape_pt = (b, x1, *X_shape)
+        X_pt = get_random_torch_tensor(X_shape_pt, dtype=dtype)
+        Y_pt = X_pt.reshape(
+            int(X_shape_pt[0] * X_shape_pt[1] * X_shape_pt[2] * X_shape_pt[3] / 12),
+            12,
+        )
+
+        y = torch.empty_like(Y_pt)
+        module.run_with_tensors([X_pt], [y])
+        assert torch.allclose(Y_pt, y, atol=1e-2, rtol=1e-2)
+
+
+def test_int_var_tensor(capfd):
+    _test_with_int_var_tensor("test_outputs_int_var_tensor", "float16")
 
 
 def _test_special_outputs(

From ebefd83a24a505c4fa8b9ebfd6e8f3ba69a15977 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Mon, 25 Dec 2023 17:57:05 +0800
Subject: [PATCH 625/638] fix bugs

---
 .../scripts/compile_sdxl.py                   | 18 +++----
 .../05_stable_diffusion/scripts/demo_xl.py    | 10 +++-
 .../scripts/download_pipeline.py              |  4 +-
 .../src/compile_lib/compile_clip_alt.py       | 48 ++++++++++++++++---
 .../src/pipeline_stable_diffusion_xl_ait.py   | 16 ++++---
 .../backend/rocm/normalization/groupnorm.py   |  2 +-
 .../backend/rocm/normalization/layernorm.py   |  2 +-
 .../backend/rocm/normalization/norm_common.py | 12 ++---
 test.py                                       | 18 -------
 9 files changed, 78 insertions(+), 52 deletions(-)
 delete mode 100644 test.py

diff --git a/examples/05_stable_diffusion/scripts/compile_sdxl.py b/examples/05_stable_diffusion/scripts/compile_sdxl.py
index 19caf57fe..704758da6 100644
--- a/examples/05_stable_diffusion/scripts/compile_sdxl.py
+++ b/examples/05_stable_diffusion/scripts/compile_sdxl.py
@@ -21,7 +21,7 @@
 import torch
 from aitemplate.testing import detect_target
 from aitemplate.utils.import_path import import_parent
-from diffusers import AutoencoderKL, StableDiffusionXLPipeline
+from diffusers import AutoencoderKL, DiffusionPipeline
 
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
@@ -95,7 +95,7 @@ def compile_diffusers(
     if detect_target().name() == "rocm":
         convert_conv_to_gemm = False
 
-    pipe = StableDiffusionXLPipeline.from_pretrained(
+    pipe = DiffusionPipeline.from_pretrained(
         hf_hub_or_path,
         torch_dtype=torch.float16,
     ).to("cuda")
@@ -128,17 +128,17 @@ def compile_diffusers(
     # text_encoder 2
     model_name = f"{model_name_prefix}_text_encoder_2"
     compile_clip(
-        pipe.text_encoder,
+        pipe.text_encoder_2,
         batch_size=batch_size,
-        seqlen=pipe.text_encoder.config.max_position_embeddings,
+        seqlen=pipe.text_encoder_2.config.max_position_embeddings,
         use_fp16_acc=use_fp16_acc,
         convert_conv_to_gemm=convert_conv_to_gemm,
         output_hidden_states=True,
-        text_projection_dim=pipe.text_encoder.config.projection_dim,
-        depth=pipe.text_encoder.config.num_hidden_layers,
-        num_heads=pipe.text_encoder.config.num_attention_heads,
-        dim=pipe.text_encoder.config.hidden_size,
-        act_layer=pipe.text_encoder.config.hidden_act,
+        text_projection_dim=pipe.text_encoder_2.config.projection_dim,
+        depth=pipe.text_encoder_2.config.num_hidden_layers,
+        num_heads=pipe.text_encoder_2.config.num_attention_heads,
+        dim=pipe.text_encoder_2.config.hidden_size,
+        act_layer=pipe.text_encoder_2.config.hidden_act,
         constants=include_constants,
         model_name=model_name,
         work_dir=work_dir,
diff --git a/examples/05_stable_diffusion/scripts/demo_xl.py b/examples/05_stable_diffusion/scripts/demo_xl.py
index 6e5ccd120..912965f80 100644
--- a/examples/05_stable_diffusion/scripts/demo_xl.py
+++ b/examples/05_stable_diffusion/scripts/demo_xl.py
@@ -17,12 +17,13 @@
 import torch
 
 from aitemplate.utils.import_path import import_parent
-from diffusers import AutoencoderKL, StableDiffusionXLPipeline
+from diffusers import AutoencoderKL, DiffusionPipeline
 
 if __name__ == "__main__":
     import_parent(filepath=__file__, level=1)
 
 from src.pipeline_stable_diffusion_xl_ait import StableDiffusionXLAITPipeline
+from aitemplate.testing.benchmark_pt import benchmark_torch_function
 
 
 @click.command()
@@ -84,7 +85,7 @@ def run(
     negative_prompt,
     benchmark,
 ):
-    diffusers_pipe = StableDiffusionXLPipeline.from_pretrained(
+    diffusers_pipe = DiffusionPipeline.from_pretrained(
         hf_hub_or_path,
         use_safetensors=True,
         torch_dtype=torch.float16,
@@ -117,6 +118,11 @@ def run(
         height=height,
         width=width,
     ).images
+    if benchmark:
+        t = benchmark_torch_function(10, pipe, prompt, prompt_2=prompt, height=height, width=width)
+        print(
+            f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
+        )
     for i, image in enumerate(images):
         image.save(f"example_ait_{i}.png")
 
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 22c5ceee5..96835d307 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -14,7 +14,7 @@
 #
 import click
 import torch
-from diffusers import StableDiffusionPipeline
+from diffusers import DiffusionPipeline
 
 
 @click.command()
@@ -37,7 +37,7 @@
 )
 def download_pipeline_files(model_name, token, save_directory) -> None:
 
-    StableDiffusionPipeline.from_pretrained(
+    DiffusionPipeline.from_pretrained(
         model_name,
         # revision="fp16",
         torch_dtype=torch.float16,
diff --git a/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
index 4e62eec79..c7b5ca4e5 100644
--- a/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
+++ b/examples/05_stable_diffusion/src/compile_lib/compile_clip_alt.py
@@ -20,7 +20,9 @@
 
 from ..modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer
 from .util import torch_dtype_from_str
+import torch
 
+USE_CUDA = detect_target().name() == "cuda"
 
 def map_clip(pt_mod, device="cuda", dtype="float16"):
     pt_params = dict(pt_mod.named_parameters())
@@ -33,12 +35,46 @@ def map_clip(pt_mod, device="cuda", dtype="float16"):
             ait_name = ait_name.replace("out_proj", "proj")
         elif name.endswith("out_proj.bias"):
             ait_name = ait_name.replace("out_proj", "proj")
-        elif "q_proj" in name:
-            ait_name = ait_name.replace("q_proj", "proj_q")
-        elif "k_proj" in name:
-            ait_name = ait_name.replace("k_proj", "proj_k")
-        elif "v_proj" in name:
-            ait_name = ait_name.replace("v_proj", "proj_v")
+        elif USE_CUDA:
+            if "q_proj" in name:
+                ait_name = ait_name.replace("q_proj", "proj_q")
+            elif "k_proj" in name:
+                ait_name = ait_name.replace("k_proj", "proj_k")
+            elif "v_proj" in name:
+                ait_name = ait_name.replace("v_proj", "proj_v")
+        else:
+            if name.endswith("q_proj.weight"):
+                ait_name = ait_name.replace("q_proj", "qkv")
+                prefix = key[: -len("q_proj.weight")]
+                q = pt_params[prefix + "q_proj.weight"]
+                k = pt_params[prefix + "k_proj.weight"]
+                v = pt_params[prefix + "v_proj.weight"]
+                qkv_weight = torch.cat([q, k, v], dim=0).cuda()
+                params_ait[ait_name] = qkv_weight
+                continue
+            elif name.endswith("q_proj.bias"):
+                ait_name = ait_name.replace("q_proj", "qkv")
+                prefix = key[: -len("q_proj.bias")]
+                q = pt_params[prefix + "q_proj.bias"]
+                k = pt_params[prefix + "k_proj.bias"]
+                v = pt_params[prefix + "v_proj.bias"]
+                qkv_bias = torch.cat([q, k, v], dim=0).cuda()
+                params_ait[ait_name] = qkv_bias
+                continue
+            elif name.endswith("k_proj.weight"):
+                continue
+            elif name.endswith("k_proj.bias"):
+                continue
+            elif name.endswith("v_proj.weight"):
+                continue
+            elif name.endswith("v_proj.bias"):
+                continue
+        # elif "q_proj" in name:
+        #     ait_name = ait_name.replace("q_proj", "proj_q")
+        # elif "k_proj" in name:
+        #     ait_name = ait_name.replace("k_proj", "proj_k")
+        # elif "v_proj" in name:
+        #     ait_name = ait_name.replace("v_proj", "proj_v")
         params_ait[ait_name] = arr
     return params_ait
 
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
index 7de267035..1f48d3aef 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
@@ -25,7 +25,9 @@
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import is_invisible_watermark_available, logging, randn_tensor
+from diffusers.utils import is_invisible_watermark_available, logging
+from diffusers.utils.torch_utils import randn_tensor
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
 
 from .compile_lib.compile_clip_alt import map_clip
@@ -285,7 +287,7 @@ def encode_prompt(
             #     text_input_ids.to(device),
             #     output_hidden_states=True,
             # )
-            prompt_embeds = clip_inference(text_encoder, text_input_ids, to_cpu=True)
+            prompt_embeds = clip_inference(text_encoder, text_input_ids, to_cpu=False)
             # We are only ALWAYS interested in the pooled output of the final text encoder
             if "text_embeds" in prompt_embeds.keys():
                 pooled_prompt_embeds = prompt_embeds["text_embeds"]
@@ -346,7 +348,7 @@ def encode_prompt(
                 #     output_hidden_states=True,
                 # )
                 negative_prompt_embeds = clip_inference(
-                    text_encoder, uncond_text_input_ids, to_cpu=True
+                    text_encoder, uncond_text_input_ids, to_cpu=False
                 )
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 if "text_embeds" in negative_prompt_embeds.keys():
@@ -559,7 +561,7 @@ def _get_add_time_ids(
 
         add_time_embeds = []
         for time_id in add_time_ids:
-            time_embed = timestep_inference(self.timestep_exe, time_id, to_cpu=True)[
+            time_embed = timestep_inference(self.timestep_exe, time_id, to_cpu=False)[
                 "time_embed"
             ]
             add_time_embeds.append(time_embed)
@@ -761,7 +763,7 @@ def __call__(
             prompt_embeds.dtype,
             generator,
             latents,
-        )
+        ).cuda()
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -835,7 +837,7 @@ def __call__(
                     t,
                     prompt_embeds,
                     add_embeds=add_embeds,
-                    to_cpu=True,
+                    to_cpu=False,
                 )["latent_output"]
 
                 # perform guidance
@@ -870,7 +872,7 @@ def __call__(
 
         if not output_type == "latent":
             image = vae_decode_inference(
-                self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=True
+                self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=False
             )["pixels"]
             self.unload_vae()
         else:
diff --git a/python/aitemplate/backend/rocm/normalization/groupnorm.py b/python/aitemplate/backend/rocm/normalization/groupnorm.py
index 995144fe0..466d77b42 100644
--- a/python/aitemplate/backend/rocm/normalization/groupnorm.py
+++ b/python/aitemplate/backend/rocm/normalization/groupnorm.py
@@ -83,7 +83,7 @@
 
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_sz)));
 
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // in: index 0
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // out: index 1
diff --git a/python/aitemplate/backend/rocm/normalization/layernorm.py b/python/aitemplate/backend/rocm/normalization/layernorm.py
index acc3da838..5d99315e3 100644
--- a/python/aitemplate/backend/rocm/normalization/layernorm.py
+++ b/python/aitemplate/backend/rocm/normalization/layernorm.py
@@ -47,7 +47,7 @@
 
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_sz)));
 
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // in: index 0
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // out: index 1
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index 74076a529..8e2a1f9b6 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -141,12 +141,12 @@
     hipGetErrorString(hipEventDestroy(mStart));
     hipGetErrorString(hipEventDestroy(mEnd));
   }
-  void Start() {
+  void Start(hipStream_t stream) {
     hipGetErrorString(hipDeviceSynchronize());
-    hipGetErrorString(hipEventRecord(mStart, nullptr));
+    hipGetErrorString(hipEventRecord(mStart, stream));
   }
-  void End() {
-    hipGetErrorString(hipEventRecord(mEnd, nullptr));
+  void End(hipStream_t stream) {
+    hipGetErrorString(hipEventRecord(mEnd, stream));
     hipGetErrorString(hipEventSynchronize(mEnd));
   }
   float GetElapsedTime() const {
@@ -179,11 +179,11 @@
   }
   // run
   KernelTimerImpl timer;
-  timer.Start();
+  timer.Start(stream);
   for(int i = 0; i < 5; ++i) {
     {{func_call}}
   }
-  timer.End();
+  timer.End(stream);
   std::cout << "OP:" << "{{op_name}}" << ",";
   std::cout << "TIME:" << timer.GetElapsedTime() << ",";
   std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
diff --git a/test.py b/test.py
deleted file mode 100644
index 01495356f..000000000
--- a/test.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import asyncio
-
-async def A():
-    print('1')
-    print('2')
-    print('3')
-
-async def B():
-    print('4')
-    print('5')
-    print('6')
-
-# loop = asyncio.get_event_loop()
-# tasks = [A(), B()]
-asyncio.run_coroutine_threadsafe(A())
-asyncio.run(B())
-
-# loop.close()
\ No newline at end of file

From 7e828a766abad89aa06f2813f1414dd2466845e6 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 3 Jan 2024 13:06:25 +0800
Subject: [PATCH 626/638] optimize performance

---
 .../scripts/compile_sdxl.py                   |  1 +
 .../05_stable_diffusion/scripts/demo_xl.py    | 19 +++++++++++-----
 .../scripts/download_pipeline.py              |  2 +-
 .../src/pipeline_stable_diffusion_xl_ait.py   | 14 +++++++-----
 .../aitemplate/backend/rocm/conv2d/common.py  | 22 ++-----------------
 .../backend/rocm/gemm/bmm_softmax_bmm.py      |  2 +-
 .../rocm/gemm/bmm_softmax_bmm_permute.py      |  2 +-
 python/aitemplate/backend/rocm/gemm/common.py | 22 ++-----------------
 .../backend/rocm/normalization/norm_common.py | 19 ----------------
 .../backend/rocm/normalization/softmax.py     |  2 +-
 .../aitemplate/backend/rocm/pool2d/pool2d.py  |  1 -
 11 files changed, 30 insertions(+), 76 deletions(-)

diff --git a/examples/05_stable_diffusion/scripts/compile_sdxl.py b/examples/05_stable_diffusion/scripts/compile_sdxl.py
index 704758da6..1b88bc123 100644
--- a/examples/05_stable_diffusion/scripts/compile_sdxl.py
+++ b/examples/05_stable_diffusion/scripts/compile_sdxl.py
@@ -97,6 +97,7 @@ def compile_diffusers(
 
     pipe = DiffusionPipeline.from_pretrained(
         hf_hub_or_path,
+        revision="fp16",
         torch_dtype=torch.float16,
     ).to("cuda")
     if fp32_vae:
diff --git a/examples/05_stable_diffusion/scripts/demo_xl.py b/examples/05_stable_diffusion/scripts/demo_xl.py
index 912965f80..9071fcdc2 100644
--- a/examples/05_stable_diffusion/scripts/demo_xl.py
+++ b/examples/05_stable_diffusion/scripts/demo_xl.py
@@ -87,14 +87,16 @@ def run(
 ):
     diffusers_pipe = DiffusionPipeline.from_pretrained(
         hf_hub_or_path,
-        use_safetensors=True,
+        revision="fp16",
         torch_dtype=torch.float16,
-    )
+    ).to("cuda")
+
     vae = AutoencoderKL.from_pretrained(
         "madebyollin/sdxl-vae-fp16-fix",
         use_safetensors=True,
         torch_dtype=torch.float16,
-    )
+    ).to("cuda")
+
     pipe = StableDiffusionXLAITPipeline(
         vae,
         diffusers_pipe.text_encoder,
@@ -117,14 +119,19 @@ def run(
         prompt_2=prompt,
         height=height,
         width=width,
+        num_inference_steps=20,
+        guidance_scale=8
     ).images
+    
+    for i, image in enumerate(images):
+        image.save(f"example_ait_{i}.png")
+
     if benchmark:
-        t = benchmark_torch_function(10, pipe, prompt, prompt_2=prompt, height=height, width=width)
+        t = benchmark_torch_function(10, pipe, prompt, prompt_2=prompt, height=height, width=width, num_inference_steps=20, guidance_scale=8)
         print(
             f"sd e2e: width={width}, height={height}, batchsize={batch}, latency={t} ms"
         )
-    for i, image in enumerate(images):
-        image.save(f"example_ait_{i}.png")
+
 
 
 if __name__ == "__main__":
diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 96835d307..530a70189 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -39,7 +39,7 @@ def download_pipeline_files(model_name, token, save_directory) -> None:
 
     DiffusionPipeline.from_pretrained(
         model_name,
-        # revision="fp16",
+        revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token if len(token) > 5 else token.lower() == "true",
     ).save_pretrained(save_directory)
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
index 1f48d3aef..716babfd8 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
@@ -165,6 +165,9 @@ def __init__(
         self.unet_exe = None
         self.vae_exe = None
         self.timestep_exe = Model(timestep_module_path)
+        self.apply_clip()
+        self.apply_vae()
+        self.apply_unet()
 
     def apply_vae(self):
         self.vae_exe = Model(self.vae_module_path)
@@ -725,7 +728,7 @@ def __call__(
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
 
-        self.apply_clip()
+        
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -746,7 +749,7 @@ def __call__(
             negative_prompt_2=negative_prompt_2,
         )
 
-        self.unload_clip()
+        # self.unload_clip()
 
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -808,7 +811,6 @@ def __call__(
             )
             timesteps = timesteps[:num_inference_steps]
 
-        self.apply_unet()
 
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -866,15 +868,15 @@ def __call__(
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
 
-        self.unload_unet()
+        # self.unload_unet()
 
-        self.apply_vae()
+        # self.apply_vae()
 
         if not output_type == "latent":
             image = vae_decode_inference(
                 self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=False
             )["pixels"]
-            self.unload_vae()
+            # self.unload_vae()
         else:
             image = latents
             return StableDiffusionXLPipelineOutput(images=image)
diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 49dd22b47..0a12e050d 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -120,7 +120,7 @@
 #include <rocrand/rocrand.h>
 #include "logging.h"
 
-#include "library/include/ck/library/utility/device_memory.hpp"
+
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -296,7 +296,7 @@
   int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_max_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_max_sz)));
 
   memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // x: index 0
   memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // w: index 1
@@ -386,24 +386,6 @@
   rocrand_generator generator;
 };
 
-// hack for DeviceMem linking error
-// TODO fix this by making CK a header-only lib
-// <<< hack begin
-DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
-{
-  hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-}
-void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
-void DeviceMem::ToDevice(const void* p) const
-{
-  hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-}
-void DeviceMem::FromDevice(void* p) const
-{
-  hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-}
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
 struct KernelTimerImpl
 {
   KernelTimerImpl() {
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
index a54f10dd1..ac189f319 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py
@@ -137,7 +137,7 @@
   int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_max_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_max_sz)));
 
   memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // x: index 0
   memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b0: index 1
diff --git a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
index 10337922c..f19b5c058 100644
--- a/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
+++ b/python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py
@@ -146,7 +146,7 @@
   int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_max_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_max_sz)));
 
   memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // x: index 0
   memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // b0: index 1
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 39d9a4956..aec47c25d 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -116,7 +116,6 @@
 #include <rocrand/rocrand.h>
 #include "logging.h"
 
-#include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -318,7 +317,7 @@
   int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz});
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_max_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_max_sz)));
 
   memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz);  // x: index 0
   memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz);  // w: index 1
@@ -404,24 +403,7 @@
   rocrand_generator generator;
 };
 
-// hack for DeviceMem linking error
-// TODO fix this by making CK a header-only lib
-// <<< hack begin
-DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
-{
-  hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-}
-void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
-void DeviceMem::ToDevice(const void* p) const
-{
-  hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-}
-void DeviceMem::FromDevice(void* p) const
-{
-  hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-}
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+
 struct KernelTimerImpl
 {
   KernelTimerImpl() {
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index 8e2a1f9b6..aa61cb036 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -113,24 +113,6 @@
   rocrand_generator generator;
 };
 
-// hack for DeviceMem linking error
-// TODO fix this by making CK a header-only lib
-// <<< hack begin
-DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
-{
-  hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-}
-void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
-void DeviceMem::ToDevice(const void* p) const
-{
-  hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-}
-void DeviceMem::FromDevice(void* p) const
-{
-  hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-}
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
 struct KernelTimerImpl
 {
   KernelTimerImpl() {
@@ -202,7 +184,6 @@
 #include <rocrand/rocrand.h>
 #include "logging.h"
 
-#include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/python/aitemplate/backend/rocm/normalization/softmax.py b/python/aitemplate/backend/rocm/normalization/softmax.py
index bc10c5e09..9b534d7a1 100644
--- a/python/aitemplate/backend/rocm/normalization/softmax.py
+++ b/python/aitemplate/backend/rocm/normalization/softmax.py
@@ -37,7 +37,7 @@
   int64_t ptr_sz = in_{{ range(rank)|join(' * in_') }};
   // TODO: special pool size for 8M L2 cache
   // need to tune it for other devices
-  int64_t mem_pool_sz = std::max(2,  std::min(64, int((1 << 23) / ptr_sz)));
+  int64_t mem_pool_sz = std::max(1,  std::min(64, int((1 << 23) / ptr_sz)));
 
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // in: index 0
   memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);  // out: index 1
diff --git a/python/aitemplate/backend/rocm/pool2d/pool2d.py b/python/aitemplate/backend/rocm/pool2d/pool2d.py
index 38cab6a87..eeee5ed09 100644
--- a/python/aitemplate/backend/rocm/pool2d/pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/pool2d.py
@@ -60,7 +60,6 @@
 #include <stdlib.h>
 #include "logging.h"
 
-#include "library/include/ck/library/utility/device_memory.hpp"
 #include "library/include/ck/library/utility/host_tensor.hpp"
 #include "library/include/ck/library/utility/host_tensor_generator.hpp"
 #include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"

From 1829cbb667923b20847246689b98b6c1aceddad8 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 3 Jan 2024 14:26:34 +0800
Subject: [PATCH 627/638] optimize performance

---
 python/aitemplate/backend/rocm/target_def.py   | 2 +-
 python/aitemplate/testing/detect_target.py     | 6 ++++++
 python/aitemplate/utils/mk_ck_lib/generator.py | 9 +++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/rocm/target_def.py b/python/aitemplate/backend/rocm/target_def.py
index f6bcffd63..4a2797942 100644
--- a/python/aitemplate/backend/rocm/target_def.py
+++ b/python/aitemplate/backend/rocm/target_def.py
@@ -127,7 +127,7 @@ def _build_compile_options(self):
                 self._pkg_path()
             ),
         ]
-        if self._arch.lower() not in {"gfx908", "gfx90a"}:
+        if self._arch.lower() not in {"gfx908", "gfx90a", "gfx940", "gfx941", "gfx942"}:
             raise RuntimeError("Unsupported GPU Arch")
         options.append("--offload-arch=native")
         for path in ck_paths:
diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py
index 2b2913d6f..69117600a 100644
--- a/python/aitemplate/testing/detect_target.py
+++ b/python/aitemplate/testing/detect_target.py
@@ -92,6 +92,12 @@ def _detect_rocm():
             return "gfx90a"
         if "gfx908" in stdout:
             return "gfx908"
+        if "gfx940" in stdout:
+            return "gfx940"
+        if "gfx941" in stdout:
+            return "gfx941"
+        if "gfx942" in stdout:
+            return "gfx942"
         return None
     except Exception:
         return None
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index a9bdb5c4a..804f71991 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -2537,3 +2537,12 @@ def GenerateGFX908(manifest, rocm_version):
 
 def GenerateGFX90A(manifest, rocm_version):
     GenerateTensorOp(manifest)
+
+def GenerateGFX940(manifest, rocm_version):
+    GenerateTensorOp(manifest)
+
+def GenerateGFX941(manifest, rocm_version):
+    GenerateTensorOp(manifest)
+
+def GenerateGFX942(manifest, rocm_version):
+    GenerateTensorOp(manifest)

From 8e2293eaeae10f3e5af5b6aa80cdbe750512828e Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 3 Jan 2024 18:37:21 +0800
Subject: [PATCH 628/638] fix pool bugs

---
 .../aitemplate/backend/rocm/pool2d/pool2d.py  | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/python/aitemplate/backend/rocm/pool2d/pool2d.py b/python/aitemplate/backend/rocm/pool2d/pool2d.py
index eeee5ed09..85316802e 100644
--- a/python/aitemplate/backend/rocm/pool2d/pool2d.py
+++ b/python/aitemplate/backend/rocm/pool2d/pool2d.py
@@ -23,8 +23,8 @@
 
 INSTANCE_TEMPLATE = jinja2.Template(
     """
-using {{name}} = ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
-ck::half_t, ck::half_t, float, {{reduce_func}}, false, 64, 64, 1, 4, 1, 4>;
+using {{name}} = ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<
+ck::half_t, ck::half_t, ck::index_t, float, {{reduce_func}}, false, 64, 64, 1, 4, 1, 4>;
 """
 )
 
@@ -35,14 +35,17 @@
 {{indent}}auto argument_ptr = op.MakeArgumentPointer(static_cast<ck::half_t *>(in_ptr),
 {{indent}}                                           static_cast<ck::half_t *>(out_ptr),
 {{indent}}                                           nullptr,
-{{indent}}                                           *batch,
-{{indent}}                                           *in_ch,
 {{indent}}                                           input_shape,
 {{indent}}                                           kernel_shape,
 {{indent}}                                           output_shape,
+{{indent}}                                           input_stride,
+{{indent}}                                           output_stride,
+{{indent}}                                           indices_stride,
 {{indent}}                                           conv_filter_strides,
+{{indent}}                                           dilations,
 {{indent}}                                           input_left_pads,
-{{indent}}                                           input_right_pads);
+{{indent}}                                           input_right_pads,
+{{indent}}                                           {2, 3});
 {{indent}}if(!op.IsSupportedArgument(argument_ptr.get())) {
 {{indent}}  LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Pool problem.";
 {{indent}}}
@@ -87,19 +90,23 @@
     ) {
   {{shape_function}}
 
-  const std::array<ck::index_t, 2> conv_filter_strides{static_cast<ck::index_t>(stride),
+  const std::vector<ck::index_t> conv_filter_strides{static_cast<ck::index_t>(stride),
     static_cast<ck::index_t>(stride)};
-  const std::array<ck::index_t, 2> input_left_pads{static_cast<ck::index_t>(pad),
+  const std::vector<ck::index_t> input_left_pads{static_cast<ck::index_t>(pad),
     static_cast<ck::index_t>(pad)};
-  const std::array<ck::index_t, 2> input_right_pads{static_cast<ck::index_t>(pad),
+  const std::vector<ck::index_t> input_right_pads{static_cast<ck::index_t>(pad),
     static_cast<ck::index_t>(pad)};
-  const std::array<ck::index_t, 2> input_shape{static_cast<ck::index_t>(*in_h),
+  const std::vector<ck::index_t> input_shape{static_cast<ck::index_t>(*batch), static_cast<ck::index_t>(*in_ch), static_cast<ck::index_t>(*in_h),
     static_cast<ck::index_t>(*in_w)};
-  const std::array<ck::index_t, 2> kernel_shape{static_cast<ck::index_t>(kernel_h),
-    static_cast<ck::index_t>(kernel_w)};
-  const std::array<ck::index_t, 2> output_shape{static_cast<ck::index_t>(*out_h),
+  const std::vector<ck::index_t> kernel_shape{static_cast<ck::index_t>(kernel_h), static_cast<ck::index_t>(kernel_w)};
+  const std::vector<ck::index_t> output_shape{static_cast<ck::index_t>(*batch), static_cast<ck::index_t>(*in_ch), static_cast<ck::index_t>(*out_h),
     static_cast<ck::index_t>(*out_w)};
 
+  const std::vector<ck::index_t> input_stride{static_cast<ck::index_t>(CI*HI*WI), 1, static_cast<ck::index_t>(WI*CI), static_cast<ck::index_t>(CI)};
+  const std::vector<ck::index_t> output_stride{static_cast<ck::index_t>(CI*HO*WO), 1, static_cast<ck::index_t>(WO*CI), static_cast<ck::index_t>(CI)};
+  const std::vector<ck::index_t> indices_stride{static_cast<ck::index_t>(CI*HO*WO), 1, static_cast<ck::index_t>(WO*CI), static_cast<ck::index_t>(CI)};
+  const std::vector<ck::index_t> dilations{1, 1};
+
   {{exec_paths}}
 
   throw std::runtime_error(

From 51e10dd085a95a58e5dcce21c9ba36cd9030c0c8 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 4 Jan 2024 10:31:16 +0800
Subject: [PATCH 629/638] fix download model bug

---
 examples/05_stable_diffusion/scripts/download_pipeline.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index 530a70189..facbef318 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -39,7 +39,6 @@ def download_pipeline_files(model_name, token, save_directory) -> None:
 
     DiffusionPipeline.from_pretrained(
         model_name,
-        revision="fp16",
         torch_dtype=torch.float16,
         use_auth_token=token if len(token) > 5 else token.lower() == "true",
     ).save_pretrained(save_directory)

From 5a6775c2f5d9e937cd99fe91152a563a560f3621 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 4 Jan 2024 13:44:53 +0800
Subject: [PATCH 630/638] remove rocm hack

---
 static/csrc/rocm_hack.cpp | 62 ---------------------------------------
 1 file changed, 62 deletions(-)
 delete mode 100644 static/csrc/rocm_hack.cpp

diff --git a/static/csrc/rocm_hack.cpp b/static/csrc/rocm_hack.cpp
deleted file mode 100644
index d92c48ed9..000000000
--- a/static/csrc/rocm_hack.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-//
-#include "library/include/ck/library/utility/device_memory.hpp"
-#include "library/include/ck/library/utility/host_tensor.hpp"
-#include "library/include/ck/library/utility/host_tensor_generator.hpp"
-
-// hack for DeviceMem linking error
-// TODO fix this by making CK a header-only lib
-// <<< hack begin
-DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) {
-  hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-}
-void* DeviceMem::GetDeviceBuffer() const {
-  return mpDeviceBuf;
-}
-void DeviceMem::ToDevice(const void* p) const {
-  hipGetErrorString(hipMemcpy(
-      mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-}
-void DeviceMem::FromDevice(void* p) const {
-  hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-}
-DeviceMem::~DeviceMem() {
-  hipGetErrorString(hipFree(mpDeviceBuf));
-}
-struct KernelTimerImpl {
-  KernelTimerImpl() {
-    hipGetErrorString(hipEventCreate(&mStart));
-    hipGetErrorString(hipEventCreate(&mEnd));
-  }
-  ~KernelTimerImpl() {
-    hipGetErrorString(hipEventDestroy(mStart));
-    hipGetErrorString(hipEventDestroy(mEnd));
-  }
-  void Start() {
-    hipGetErrorString(hipDeviceSynchronize());
-    hipGetErrorString(hipEventRecord(mStart, nullptr));
-  }
-  void End() {
-    hipGetErrorString(hipEventRecord(mEnd, nullptr));
-    hipGetErrorString(hipEventSynchronize(mEnd));
-  }
-  float GetElapsedTime() const {
-    float time;
-    hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
-    return time;
-  }
-  hipEvent_t mStart, mEnd;
-};
-// >>> hack end

From db1851c8e904c26e35c49ee62cd467e12a4d8f5e Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 5 Jan 2024 10:59:02 +0800
Subject: [PATCH 631/638] optimizer sdxl performance

---
 .../05_stable_diffusion/src/inference_ait.py  | 19 ++++++++++++++-----
 .../src/pipeline_stable_diffusion_xl_ait.py   | 11 +++++------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/examples/05_stable_diffusion/src/inference_ait.py b/examples/05_stable_diffusion/src/inference_ait.py
index 367b8b334..462712255 100644
--- a/examples/05_stable_diffusion/src/inference_ait.py
+++ b/examples/05_stable_diffusion/src/inference_ait.py
@@ -46,8 +46,10 @@ def inference(
     benchmark_repeat: int = 4,
     permute: bool = False,
     to_cpu: bool = False,
+    graph_mode=False,
+    sync=True,
 ):
-    module.run_with_tensors(inputs, outputs, graph_mode=False)
+    module.run_with_tensors(inputs, outputs, graph_mode=graph_mode, sync=sync)
     if permute:
         for name, output in outputs.items():
             if len(output.shape) == 4:
@@ -88,6 +90,8 @@ def timestep_inference(
     dtype: str = "float16",
     benchmark: bool = False,
     to_cpu: bool = False,
+    graph_mode: bool = False,
+    sync: bool = True
 ):
     timestep = torch.tensor([timestep]).to(device)
     inputs = {"timestep": timestep.to(device)}
@@ -96,7 +100,7 @@ def timestep_inference(
             inputs[k] = v.half()
     dims = [1]
     outputs = get_outputs(module, dims, device, dtype)
-    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu)
+    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu, graph_mode=graph_mode, sync=sync)
 
 
 def clip_inference(
@@ -107,6 +111,7 @@ def clip_inference(
     dtype: str = "float16",
     benchmark: bool = False,
     to_cpu: bool = False,
+    sync: bool = True,
 ):
     batch = input_ids.shape[0]
     input_ids = input_ids.to(device)
@@ -117,7 +122,7 @@ def clip_inference(
     }
     dims = [batch]
     outputs = get_outputs(module, dims, device, dtype)
-    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu)
+    return inference(module, inputs, outputs, benchmark=benchmark, to_cpu=to_cpu, sync=sync)
 
 
 def unet_inference(
@@ -133,6 +138,8 @@ def unet_inference(
     dtype: str = "float16",
     benchmark: bool = False,
     to_cpu: bool = False,
+    graph_mode: bool = False,
+    sync: bool = True,
 ):
     batch = latent_model_input.shape[0]
     height, width = latent_model_input.shape[2], latent_model_input.shape[3]
@@ -164,7 +171,7 @@ def unet_inference(
     dims = [batch, height, width]
     outputs = get_outputs(module, dims, device, dtype)
     return inference(
-        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu
+        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu, graph_mode=graph_mode, sync=sync,
     )
 
 
@@ -176,6 +183,8 @@ def vae_decode_inference(
     benchmark: bool = False,
     factor: int = 8,
     to_cpu: bool = False,
+    graph_mode=False,
+    sync: bool = True,
 ):
     batch = latent.shape[0]
     height, width = latent.shape[2:]
@@ -190,7 +199,7 @@ def vae_decode_inference(
     dims = [batch, height, width]
     outputs = get_outputs(module, dims, device, dtype)
     return inference(
-        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu
+        module, inputs, outputs, benchmark=benchmark, permute=True, to_cpu=to_cpu, graph_mode=graph_mode, sync=sync
     )
 
 
diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
index 716babfd8..d54d23f23 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
@@ -28,7 +28,6 @@
 from diffusers.utils import is_invisible_watermark_available, logging
 from diffusers.utils.torch_utils import randn_tensor
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
-from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
 
 from .compile_lib.compile_clip_alt import map_clip
 from .compile_lib.compile_unet_alt import map_unet
@@ -283,14 +282,13 @@ def encode_prompt(
                 truncation=True,
                 return_tensors="pt",
             )
-
             text_input_ids = text_inputs.input_ids
 
             # prompt_embeds = text_encoder(
             #     text_input_ids.to(device),
             #     output_hidden_states=True,
             # )
-            prompt_embeds = clip_inference(text_encoder, text_input_ids, to_cpu=False)
+            prompt_embeds = clip_inference(text_encoder, text_input_ids, to_cpu=False, sync=False)
             # We are only ALWAYS interested in the pooled output of the final text encoder
             if "text_embeds" in prompt_embeds.keys():
                 pooled_prompt_embeds = prompt_embeds["text_embeds"]
@@ -351,7 +349,7 @@ def encode_prompt(
                 #     output_hidden_states=True,
                 # )
                 negative_prompt_embeds = clip_inference(
-                    text_encoder, uncond_text_input_ids, to_cpu=False
+                    text_encoder, uncond_text_input_ids, to_cpu=False, sync=False
                 )
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 if "text_embeds" in negative_prompt_embeds.keys():
@@ -564,7 +562,7 @@ def _get_add_time_ids(
 
         add_time_embeds = []
         for time_id in add_time_ids:
-            time_embed = timestep_inference(self.timestep_exe, time_id, to_cpu=False)[
+            time_embed = timestep_inference(self.timestep_exe, time_id, to_cpu=False, sync=False)[
                 "time_embed"
             ]
             add_time_embeds.append(time_embed)
@@ -840,6 +838,7 @@ def __call__(
                     prompt_embeds,
                     add_embeds=add_embeds,
                     to_cpu=False,
+                    sync=False
                 )["latent_output"]
 
                 # perform guidance
@@ -874,7 +873,7 @@ def __call__(
 
         if not output_type == "latent":
             image = vae_decode_inference(
-                self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=False
+                self.vae_exe, latents / self.vae.config.scaling_factor, to_cpu=False, graph_mode=True, sync=False
             )["pixels"]
             # self.unload_vae()
         else:

From 318f3aab45806683beacf46f79b7e761baeb1efb Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 5 Jan 2024 11:42:34 +0800
Subject: [PATCH 632/638] enable hipgraph on MI300

---
 python/aitemplate/backend/codegen.py          |   2 -
 .../aitemplate/utils/mk_ck_lib/generator.py   | 224 ------------------
 static/include/model.h                        |   2 +-
 3 files changed, 1 insertion(+), 227 deletions(-)

diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py
index 842529215..79e8fea7f 100644
--- a/python/aitemplate/backend/codegen.py
+++ b/python/aitemplate/backend/codegen.py
@@ -916,8 +916,6 @@ def _write_simple_multistream_debug_info(
         _LOGGER.info(f"Wrote json simple multistream info into {log_filename_json}")
 
     def generate_model(self) -> str:
-        # Disable graph mode on ROCM because the updating operations
-        # are not supported
         target_has_graph_mode = "true"
 
         run_impl_mode = multistream_mode()
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index 804f71991..ab78c337f 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -194,215 +194,6 @@ def CreateConv2dFwdOperator(manifest, operation_kind, out_element_op, out_data_o
     return operations
 
 
-# Convolution for 2D Bwd operations
-def CreateConv2dBwdOperator(manifest, operation_kind, out_element_op, out_data_op=""):
-    a_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GNWC)
-    b_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GKXC)
-    c_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GNWK)
-
-    in_element_op = library.TensorOperation.PassThrough
-
-    tile_descriptions = [
-        conv.TileDesc(256, 256, 128, 4, 8, 32, 32, 4, 2),
-        conv.TileDesc(256, 128, 256, 4, 8, 32, 32, 2, 4),
-        conv.TileDesc(128, 128, 128, 4, 8, 32, 32, 4, 2),
-        conv.TileDesc(256, 128, 128, 4, 8, 32, 32, 2, 2),
-        conv.TileDesc(256, 64, 128, 4, 8, 32, 32, 1, 2),
-        conv.TileDesc(128, 32, 128, 4, 8, 32, 32, 1, 2),
-        conv.TileDesc(128, 64, 128, 4, 8, 32, 32, 2, 2),
-        conv.TileDesc(256, 128, 64, 4, 8, 32, 32, 2, 1),
-        conv.TileDesc(128, 128, 64, 4, 8, 32, 32, 2, 2),
-        conv.TileDesc(64, 64, 64, 4, 8, 32, 32, 2, 2),
-        conv.TileDesc(128, 128, 32, 4, 8, 32, 32, 2, 1),
-        conv.TileDesc(64, 64, 32, 4, 8, 32, 32, 2, 1),
-        conv.TileDesc(64, 32, 64, 4, 8, 32, 32, 1, 2),
-    ]
-
-    c_block_descriptions = [
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 4], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 4], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 4], 8),
-        conv.CBlockTransferDesc(1, 1, [1, 16, 1, 4], 8),
-    ]
-
-    block_descriptions = []
-    for t in tile_descriptions:
-        block_transfer = -1
-        if t.block_size == 256:
-            block_transfer = [4, 64, 1]
-        if t.block_size == 128:
-            block_transfer = [4, 32, 1]
-        if t.block_size == 64:
-            block_transfer = [4, 16, 1]
-        assert (
-            block_transfer != -1
-            and "Cannot determine block_transfer_size with block_size "
-            + str(t.block_size)
-        )
-        block_descriptions.append(
-            conv.BlockTransferDesc(block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1)
-        )
-    b_block_scalars = [2, 4, 4, 2, 2, 4, 4, 1, 2, 4, 1, 2, 2]
-
-    conv2d_specialization = [
-        conv.Conv2DSpecialization.ConvBwdDataDefault,
-        conv.Conv2DSpecialization.ConvBwd1x1S1P0,
-    ]
-    gemm_spec = conv.Conv2DSpecialization.GemmDefault
-
-    operations = []
-    for conv2d_spec in conv2d_specialization:
-        for tile_desc, block_desc, b_scalar, c_block_desc in zip(
-            tile_descriptions,
-            block_descriptions,
-            b_block_scalars,
-            c_block_descriptions,
-        ):
-            b_block_desc = copy.deepcopy(block_desc)
-            b_block_desc.src_vector_dim = 1
-            b_block_desc.src_scalar_per_vector = b_scalar
-            new_operation = conv.Conv2DOperation(
-                operation_kind=operation_kind,
-                extra_kind=out_element_op,
-                xdl_op_type=conv.XdlOpType(operation_kind.value),
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=in_element_op,
-                b_elem_op=in_element_op,
-                epilogue_functor=out_element_op,
-                c_data_op=out_data_op,
-                conv2d_specialization=conv2d_spec,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=b_block_desc,
-                c_block_transfer=c_block_desc,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
-    return operations
-
-
-# Convolution for 2D Bwd + Bias operations
-def CreateConv2dBwdBiasOperator(
-    manifest, operation_kind, out_element_op, out_data_op=""
-):
-    a_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GNHWK)
-    b_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GKYXC)
-    c_element_desc = library.TensorDesc(library.DataType.f16, library.LayoutType.GNHWC)
-
-    in_element_op = library.TensorOperation.PassThrough
-
-    tile_descriptions = [
-        gemm.TileDesc(256, 256, 128, 32, 8, 2, 32, 32, 4, 2),
-        gemm.TileDesc(256, 256, 128, 32, 8, 8, 32, 32, 4, 2),
-        gemm.TileDesc(256, 128, 256, 32, 8, 2, 32, 32, 2, 4),
-        gemm.TileDesc(256, 128, 256, 32, 8, 8, 32, 32, 2, 4),
-        gemm.TileDesc(128, 128, 128, 32, 8, 2, 32, 32, 4, 2),
-        gemm.TileDesc(128, 128, 128, 32, 8, 8, 32, 32, 4, 2),
-        gemm.TileDesc(256, 128, 128, 32, 8, 2, 32, 32, 2, 2),
-        gemm.TileDesc(256, 128, 128, 32, 8, 8, 32, 32, 2, 2),
-        gemm.TileDesc(128, 128, 64, 32, 8, 2, 32, 32, 2, 2),
-        gemm.TileDesc(128, 128, 64, 32, 8, 8, 32, 32, 2, 2),
-        gemm.TileDesc(128, 64, 128, 32, 8, 2, 32, 32, 2, 2),
-        gemm.TileDesc(128, 64, 128, 32, 8, 8, 32, 32, 2, 2),
-        gemm.TileDesc(256, 128, 64, 32, 8, 2, 32, 32, 2, 1),
-        gemm.TileDesc(256, 128, 64, 32, 8, 8, 32, 32, 2, 1),
-        gemm.TileDesc(256, 64, 128, 32, 8, 2, 32, 32, 1, 2),
-        gemm.TileDesc(256, 64, 128, 32, 8, 8, 32, 32, 1, 2),
-    ]
-
-    b_block_descriptions = [
-        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1),
-        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1),
-        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1),
-        gemm.BlockTransferDesc([8, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1),
-        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 8, 1),
-        gemm.BlockTransferDesc([16, 16, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 1, 8, 1),
-        gemm.BlockTransferDesc([8, 32, 1], [0, 2, 1], [0, 2, 1], 1, 4, 2, 0),
-        gemm.BlockTransferDesc([4, 64, 1], [0, 2, 1], [0, 2, 1], 1, 2, 8, 1),
-    ]
-    a_block_descriptions = []
-    c_block_descriptions = []
-    for t in tile_descriptions:
-        a_block_transfer = -1
-        c_block_transfer = -1
-        if t.block_size == 256:
-            a_block_transfer = [4, 64, 1]
-            c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8)
-        if t.block_size == 128:
-            a_block_transfer = [4, 32, 1]
-            if t.n_per_block == 128:
-                c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8)
-            if t.n_per_block == 64:
-                c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8)
-
-        assert (
-            a_block_transfer != -1
-            and c_block_transfer != -1
-            and "Cannot determine block_transfer_size with block_size "
-            + str(t.block_size)
-        )
-        a_block_descriptions.append(
-            gemm.BlockTransferDesc(a_block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1)
-        )
-        c_block_descriptions.append(c_block_transfer)
-
-    conv2d_specialization = [
-        conv.Conv2DSpecialization.ConvBwdDataDefault,
-        conv.Conv2DSpecialization.ConvBwd1x1S1P0,
-    ]
-    gemm_spec = conv.Conv2DSpecialization.GemmDefault
-
-    operations = []
-    for conv2d_spec in conv2d_specialization:
-        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
-            tile_descriptions,
-            a_block_descriptions,
-            b_block_descriptions,
-            c_block_descriptions,
-        ):
-            new_operation = conv.Conv2DOperation(
-                operation_kind=operation_kind,
-                extra_kind=out_element_op,
-                xdl_op_type=conv.XdlOpType(operation_kind.value),
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=in_element_op,
-                b_elem_op=in_element_op,
-                epilogue_functor=out_element_op,
-                c_data_op=out_data_op,
-                conv2d_specialization=conv2d_spec,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=a_block_desc,
-                b_block_transfer=b_block_desc,
-                c_block_transfer=c_block_desc,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
-    return operations
-
-
 ###########################################################################################################
 # Gemm operations
 def CreateGemmRRROperator(manifest):
@@ -2440,20 +2231,6 @@ def GenerateTensorOp(manifest):
         library.TensorOperation.AddSigmoid,
         library.MemoryDataOperation.MemorySet,
     )
-    # TransposedConv2d
-    CreateConv2dBwdOperator(
-        manifest,
-        library.Conv2dKind.TransposedConv2d,
-        library.TensorOperation.PassThrough,
-        library.MemoryDataOperation.MemorySet,
-    )
-    # TransposedConv2dBiasRelu
-    CreateConv2dBwdBiasOperator(
-        manifest,
-        library.Conv2dKind.TransposedConv2dBiasRelu,
-        library.TensorOperation.AddRelu,
-        library.MemoryDataOperation.MemorySet,
-    )
     # GemmRRR
     CreateGemmRRROperator(manifest)
     # GemmRCR
@@ -2534,7 +2311,6 @@ def GenerateTensorOp(manifest):
 def GenerateGFX908(manifest, rocm_version):
     GenerateTensorOp(manifest)
 
-
 def GenerateGFX90A(manifest, rocm_version):
     GenerateTensorOp(manifest)
 
diff --git a/static/include/model.h b/static/include/model.h
index 3963724df..972c6bbf3 100644
--- a/static/include/model.h
+++ b/static/include/model.h
@@ -216,7 +216,7 @@ class ModelBase {
   }
 
   void RunAsGraph(StreamType stream) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     if (graph_exec_ == nullptr) {
       DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
       try {

From 892e32ac9285a401b49b93865f3d445aaced7b04 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Fri, 5 Jan 2024 14:43:25 +0800
Subject: [PATCH 633/638] fix negative prompt

---
 .../05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
index d54d23f23..4454c5f8a 100644
--- a/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
+++ b/examples/05_stable_diffusion/src/pipeline_stable_diffusion_xl_ait.py
@@ -362,7 +362,7 @@ def encode_prompt(
                     ]
                 # negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 # negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] # -2 because it includes last hidden state, AIT does not so uses -1
-                prompt_embeds = negative_prompt_embeds[
+                negative_prompt_embeds = negative_prompt_embeds[
                     f"hidden_state_{text_encoder.nlayers[-1]}"
                 ]
 

From 65c08c216d05a64b15ddb3cca08702cae9bf8e11 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 9 Jan 2024 18:22:54 +0800
Subject: [PATCH 634/638] fix profile bug

---
 python/aitemplate/backend/rocm/gemm/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index aec47c25d..8147a0c0c 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -991,7 +991,7 @@ def fproc_f16(op):
     has_dynamic_shape = False
     for inp in func_attrs["inputs"]:
         for dim in inp.shape():
-            if isinstance(dim, IntVar):
+            if isinstance(dim, IntVar) and (len(dim._attrs['values']) > 1):
                 has_dynamic_shape = True
     func_attrs["op_instance"] = extract_config(op_kind, extra_kind, fproc_f16)
     if has_dynamic_shape:

From 5958997630d33883a3804c4f1cd072c9ab7e84b9 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 10 Jan 2024 15:42:37 +0800
Subject: [PATCH 635/638] fix profiler

---
 python/aitemplate/backend/rocm/conv2d/common.py             | 4 ++--
 python/aitemplate/backend/rocm/gemm/common.py               | 6 +++---
 python/aitemplate/backend/rocm/normalization/norm_common.py | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/aitemplate/backend/rocm/conv2d/common.py b/python/aitemplate/backend/rocm/conv2d/common.py
index 0a12e050d..01e06f17d 100644
--- a/python/aitemplate/backend/rocm/conv2d/common.py
+++ b/python/aitemplate/backend/rocm/conv2d/common.py
@@ -434,13 +434,13 @@
   {{tensor_decl}}
   // TODO: random init
   // warmup
-  for(int i = 0; i < 3; ++i) {
+  for(int i = 0; i < 5; ++i) {
     {{func_call}}
   }
   // run
   auto timer = new KernelTimerImpl();
   timer->Start();
-  for(int i = 0; i < 5; ++i) {
+  for(int i = 0; i < 10; ++i) {
     {{func_call}}
   }
   timer->End();
diff --git a/python/aitemplate/backend/rocm/gemm/common.py b/python/aitemplate/backend/rocm/gemm/common.py
index 8147a0c0c..da7d44ff3 100644
--- a/python/aitemplate/backend/rocm/gemm/common.py
+++ b/python/aitemplate/backend/rocm/gemm/common.py
@@ -451,13 +451,13 @@
   {{tensor_decl}}
   // TODO: random init
   // warmup
-  for(int i = 0; i < 3; ++i) {
+  for(int i = 0; i < 5; ++i) {
     {{func_call}}
   }
   // run
   auto timer = new KernelTimerImpl();
   timer->Start();
-  for(int i = 0; i < 5; ++i) {
+  for(int i = 0; i < 10; ++i) {
     {{func_call}}
   }
   timer->End();
@@ -804,7 +804,7 @@ def gen_function(
         has_dynamic_shape = False
         for inp in func_attrs["inputs"]:
             for dim in inp.shape():
-                if isinstance(dim, IntVar):
+                if isinstance(dim, IntVar) and (len(dim._attrs['values']) > 1):
                     has_dynamic_shape = True
         if has_dynamic_shape:
             key = "true"
diff --git a/python/aitemplate/backend/rocm/normalization/norm_common.py b/python/aitemplate/backend/rocm/normalization/norm_common.py
index aa61cb036..4f064b2a8 100644
--- a/python/aitemplate/backend/rocm/normalization/norm_common.py
+++ b/python/aitemplate/backend/rocm/normalization/norm_common.py
@@ -156,13 +156,13 @@
   hipStream_t stream = nullptr;
   {{tensor_decl}}
   // warmup
-  for(int i = 0; i < 3; ++i) {
+  for(int i = 0; i < 5; ++i) {
     {{func_call}}
   }
   // run
   KernelTimerImpl timer;
   timer.Start(stream);
-  for(int i = 0; i < 5; ++i) {
+  for(int i = 0; i < 10; ++i) {
     {{func_call}}
   }
   timer.End(stream);

From f54c2b5e71858b97243ab244551f835f2014ccf9 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 17 Jan 2024 10:44:23 +0800
Subject: [PATCH 636/638] add interwave and pipeline tuning

---
 .../utils/mk_ck_lib/gemm_operation.py         |  10 +
 .../aitemplate/utils/mk_ck_lib/generator.py   | 421 ++++++++++--------
 2 files changed, 253 insertions(+), 178 deletions(-)

diff --git a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
index dc1557a5b..cccbbc599 100644
--- a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
@@ -302,6 +302,8 @@ class GemmOperation:
     ds_dtype: List[library.DataType] = None
     ds_layout: List[library.LayoutType] = None
     e_dtype: library.DataType = None
+    loop_scheduler: str = ""
+    pipeline: str = ""
 
     def __str__(self) -> str:
         io_name = "{gemm_kind}_{gemm_specialization}_{a_dtype}{b_dtype}{c_dtype}_{a_layout}{b_layout}{c_layout}".format(
@@ -469,6 +471,12 @@ def emit(self) -> str:
 {% else %}
     7, // src_dst_vector_dim
     1 // dst_scalar_per_vector
+{% endif %}
+{% if LoopScheduler %}
+    ,{{LoopScheduler}}
+{% endif %}
+{% if Pipeline %}
+    ,{{Pipeline}}
 {% endif %}
     >;
 """
@@ -513,6 +521,8 @@ def emit(self) -> str:
             EDType=library.DataTypeTag[self.e_dtype]
             if self.e_dtype is not None
             else "",
+            LoopScheduler=self.loop_scheduler,
+            Pipeline=self.pipeline
         )
 
 
diff --git a/python/aitemplate/utils/mk_ck_lib/generator.py b/python/aitemplate/utils/mk_ck_lib/generator.py
index ab78c337f..d8eeea412 100644
--- a/python/aitemplate/utils/mk_ck_lib/generator.py
+++ b/python/aitemplate/utils/mk_ck_lib/generator.py
@@ -276,32 +276,42 @@ def CreateGemmRRROperator(manifest):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+    
+    loop_schedulers = ["ck::LoopScheduler::Default", "ck::LoopScheduler::Interwave"]
+    pipelines = ["ck::PipelineVersion::v1", "ck::PipelineVersion::v2"]
+
     operations = []
     for gemm_spec in gemm_specialization:
-        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
-            tile_descriptions,
-            a_block_descriptions,
-            b_block_descriptions,
-            c_block_descriptions,
-        ):
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmXdl_CShuffle,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=a_block_desc,
-                b_block_transfer=b_block_desc,
-                c_block_transfer=c_block_desc,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+        for loop_scheduler in loop_schedulers:
+            for pipeline in pipelines:
+                if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                    continue
+                for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
+                    tile_descriptions,
+                    a_block_descriptions,
+                    b_block_descriptions,
+                    c_block_descriptions,
+                ):
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmXdl_CShuffle,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=a_block_desc,
+                        b_block_transfer=b_block_desc,
+                        c_block_transfer=c_block_desc,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline,
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
     return operations
 
 
@@ -366,29 +376,39 @@ def CreateGemmRCROperator(manifest):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
+    loop_schedulers = ["ck::LoopScheduler::Default", "ck::LoopScheduler::Interwave"]
+    pipelines = ["ck::PipelineVersion::v1", "ck::PipelineVersion::v2"]
+
     operations = []
     for gemm_spec in gemm_specialization:
-        for tile_desc, block_desc, c_block_desc in zip(
-            tile_descriptions, block_descriptions, c_block_descriptions
-        ):
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmXdl_CShuffle,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=block_desc,
-                c_block_transfer=c_block_desc,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+        for loop_scheduler in loop_schedulers:
+            for pipeline in pipelines:
+                if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                    continue
+                for tile_desc, block_desc, c_block_desc in zip(
+                    tile_descriptions, block_descriptions, c_block_descriptions
+                ):
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmXdl_CShuffle,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=block_desc,
+                        b_block_transfer=block_desc,
+                        c_block_transfer=c_block_desc,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
     return operations
 
 
@@ -478,32 +498,42 @@ def CreateGemmRCRBilinearOperator(manifest, c_element_op):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
+    loop_schedulers = ["ck::LoopScheduler::Default", "ck::LoopScheduler::Interwave"]
+    pipelines = ["ck::PipelineVersion::v1", "ck::PipelineVersion::v2"]
+
     operations = []
     for gemm_spec in gemm_specialization:
-        for tile_desc, block_desc, c_block_desc in zip(
-            tile_descriptions, block_descriptions, c_block_descriptions
-        ):
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=c_element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=c_element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=block_desc,
-                c_block_transfer=c_block_desc,
-                ds_dtype=ds_dtype,
-                ds_layout=ds_layout,
-                e_dtype=e_dtype,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+        for loop_scheduler in loop_schedulers:
+            for pipeline in pipelines:
+                if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                    continue
+                for tile_desc, block_desc, c_block_desc in zip(
+                    tile_descriptions, block_descriptions, c_block_descriptions
+                ):
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=c_element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=c_element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=block_desc,
+                        b_block_transfer=block_desc,
+                        c_block_transfer=c_block_desc,
+                        ds_dtype=ds_dtype,
+                        ds_layout=ds_layout,
+                        e_dtype=e_dtype,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
 
     if c_element_op in [
         library.TensorOperation.Add,  # gemm_rcr_bias
@@ -511,67 +541,79 @@ def CreateGemmRCRBilinearOperator(manifest, c_element_op):
     ]:
         # N % 8 == 0 && K % 1 == 0
         gemm_spec = gemm.GemmSpecialization.MNKPadding
-        for tile_desc, block_desc, c_block_desc in zip(
-            tile_descriptions, block_descriptions, c_block_descriptions
-        ):
-            c_block_desc = copy.deepcopy(c_block_desc)
-            c_block_desc.scalar_per_vector = 1
-            c_block_desc.m_n_block_wave_per_xdl[1] //= 8
-            c_block_desc.m_n_block_wave_per_xdl[-1] *= 8
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=c_element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=c_element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=block_desc,
-                c_block_transfer=c_block_desc,
-                ds_dtype=ds_dtype,
-                ds_layout=ds_layout,
-                e_dtype=e_dtype,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+        for loop_scheduler in loop_schedulers:
+            for pipeline in pipelines:
+                if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                    continue
+                for tile_desc, block_desc, c_block_desc in zip(
+                    tile_descriptions, block_descriptions, c_block_descriptions
+                ):
+                    c_block_desc = copy.deepcopy(c_block_desc)
+                    c_block_desc.scalar_per_vector = 1
+                    c_block_desc.m_n_block_wave_per_xdl[1] //= 8
+                    c_block_desc.m_n_block_wave_per_xdl[-1] *= 8
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=c_element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=c_element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=block_desc,
+                        b_block_transfer=block_desc,
+                        c_block_transfer=c_block_desc,
+                        ds_dtype=ds_dtype,
+                        ds_layout=ds_layout,
+                        e_dtype=e_dtype,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
 
         # N % 4 == 0 && K % 4 == 0
         gemm_spec = gemm.GemmSpecialization.MNKPadding
-        for tile_desc, block_desc, c_block_desc in zip(
-            tile_descriptions, block_descriptions, c_block_descriptions
-        ):
-            block_desc.src_scalar_per_vector = 4
-            block_desc.dst_scalar_per_vector = 4
-            c_block_desc = copy.deepcopy(c_block_desc)
-            c_block_desc.scalar_per_vector = 4
-            c_block_desc.m_n_block_wave_per_xdl[1] //= 2
-            c_block_desc.m_n_block_wave_per_xdl[-1] *= 2
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=c_element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=c_element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=block_desc,
-                c_block_transfer=c_block_desc,
-                ds_dtype=ds_dtype,
-                ds_layout=ds_layout,
-                e_dtype=e_dtype,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+        for loop_scheduler in loop_schedulers:
+            for pipeline in pipelines:
+                if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                    continue
+                for tile_desc, block_desc, c_block_desc in zip(
+                    tile_descriptions, block_descriptions, c_block_descriptions
+                ):
+                    block_desc.src_scalar_per_vector = 4
+                    block_desc.dst_scalar_per_vector = 4
+                    c_block_desc = copy.deepcopy(c_block_desc)
+                    c_block_desc.scalar_per_vector = 4
+                    c_block_desc.m_n_block_wave_per_xdl[1] //= 2
+                    c_block_desc.m_n_block_wave_per_xdl[-1] *= 2
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=c_element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmMultipleD_Xdl_CShuffle,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=c_element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=block_desc,
+                        b_block_transfer=block_desc,
+                        c_block_transfer=c_block_desc,
+                        ds_dtype=ds_dtype,
+                        ds_layout=ds_layout,
+                        e_dtype=e_dtype,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
 
     return operations
 
@@ -716,31 +758,41 @@ def CreateGemmRCRPermOperator(manifest, c_element_op):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
+    loop_schedulers = ["ck::LoopScheduler::Default", "ck::LoopScheduler::Interwave"]
+    pipelines = ["ck::PipelineVersion::v1", "ck::PipelineVersion::v2"]
+
     operations = []
-    for gemm_spec in gemm_specialization:
-        for tile_desc, block_desc, c_block_desc in zip(
-            tile_descriptions, block_descriptions, c_block_descriptions
-        ):
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=c_element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmBiasCPermute_Xdl,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=c_element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=block_desc,
-                b_block_transfer=block_desc,
-                c_block_transfer=c_block_desc,
-                ds_dtype=ds_dtype,
-                e_dtype=e_dtype,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+    for loop_scheduler in loop_schedulers:
+        for pipeline in pipelines:
+            if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                continue
+            for gemm_spec in gemm_specialization:
+                for tile_desc, block_desc, c_block_desc in zip(
+                    tile_descriptions, block_descriptions, c_block_descriptions
+                ):
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=c_element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmBiasCPermute_Xdl,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=c_element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=block_desc,
+                        b_block_transfer=block_desc,
+                        c_block_transfer=c_block_desc,
+                        ds_dtype=ds_dtype,
+                        e_dtype=e_dtype,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
     return operations
 
 
@@ -829,34 +881,44 @@ def CreateGemmRRRPermOperator(manifest, c_element_op):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
+    loop_schedulers = ["ck::LoopScheduler::Default", "ck::LoopScheduler::Interwave"]
+    pipelines = ["ck::PipelineVersion::v1", "ck::PipelineVersion::v2"]
+
     operations = []
-    for gemm_spec in gemm_specialization:
-        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
-            tile_descriptions,
-            a_block_descriptions,
-            b_block_descriptions,
-            c_block_descriptions,
-        ):
-            new_operation = gemm.GemmOperation(
-                operation_kind=operation_kind,
-                extra_kind=c_element_op,
-                xdl_op_type=gemm.XdlOpType.DeviceGemmBiasCPermute_Xdl,
-                A=a_element_desc,
-                B=b_element_desc,
-                C=c_element_desc,
-                a_elem_op=element_op,
-                b_elem_op=element_op,
-                epilogue_functor=c_element_op,
-                gemm_specialization=gemm_spec,
-                tile_desc=tile_desc,
-                a_block_transfer=a_block_desc,
-                b_block_transfer=b_block_desc,
-                c_block_transfer=c_block_desc,
-                ds_dtype=ds_dtype,
-                e_dtype=e_dtype,
-            )
-            manifest.append(new_operation)
-            operations.append(new_operation)
+    for loop_scheduler in loop_schedulers:
+        for pipeline in pipelines:
+            if pipeline == "ck::PipelineVersion::v2" and loop_scheduler == "ck::LoopScheduler::Interwave":
+                continue
+            for gemm_spec in gemm_specialization:
+                for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
+                    tile_descriptions,
+                    a_block_descriptions,
+                    b_block_descriptions,
+                    c_block_descriptions,
+                ):
+                    new_operation = gemm.GemmOperation(
+                        operation_kind=operation_kind,
+                        extra_kind=c_element_op,
+                        xdl_op_type=gemm.XdlOpType.DeviceGemmBiasCPermute_Xdl,
+                        A=a_element_desc,
+                        B=b_element_desc,
+                        C=c_element_desc,
+                        a_elem_op=element_op,
+                        b_elem_op=element_op,
+                        epilogue_functor=c_element_op,
+                        gemm_specialization=gemm_spec,
+                        tile_desc=tile_desc,
+                        a_block_transfer=a_block_desc,
+                        b_block_transfer=b_block_desc,
+                        c_block_transfer=c_block_desc,
+                        ds_dtype=ds_dtype,
+                        e_dtype=e_dtype,
+                        loop_scheduler=loop_scheduler,
+                        pipeline=pipeline
+                    )
+                    manifest.append(new_operation)
+                    operations.append(new_operation)
     return operations
 
 
@@ -924,6 +986,7 @@ def CreateGemmRCRm2n3PermOperator(manifest, c_element_op):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
     operations = []
     for gemm_spec in gemm_specialization:
         for tile_desc, block_desc, c_block_desc in zip(
@@ -1016,6 +1079,8 @@ def CreateGemmRCRm3n2PermOperator(manifest, c_element_op):
         gemm.GemmSpecialization.GemmDefault,
         gemm.GemmSpecialization.MNKPadding,
     ]
+
+    
     operations = []
     for gemm_spec in gemm_specialization:
         for tile_desc, block_desc, c_block_desc in zip(

From 0a8beb73a9bacc2822fb4dcd1d0307adac4d0d98 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Thu, 18 Jan 2024 13:24:07 +0800
Subject: [PATCH 637/638] fix a profiler bug

---
 python/aitemplate/utils/mk_ck_lib/gemm_operation.py | 4 +++-
 python/aitemplate/utils/mk_ck_lib/library.py        | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
index cccbbc599..3b56c2b1f 100644
--- a/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
+++ b/python/aitemplate/utils/mk_ck_lib/gemm_operation.py
@@ -327,11 +327,13 @@ def __str__(self) -> str:
         extra_name = (
             "_CM" if library.ShortTensorOperationNames[self.extra_kind] == "CM" else ""
         )
-        return "{io_name}_{tile_name}_{epilogue_functor}".format(
+        return "{io_name}_{tile_name}_{epilogue_functor}_{scheduler}_{pipeline}".format(
             io_name=io_name,
             tile_name=tile_name,
             epilogue_functor=library.ShortTensorOperationNames[self.epilogue_functor]
             + extra_name,
+            scheduler=library.ShortSchedulerNames.get(self.loop_scheduler, "default"),
+            pipeline=library.ShortPipelineNames.get(self.pipeline, "v1"),
         )
 
     def accumulator_type(self):
diff --git a/python/aitemplate/utils/mk_ck_lib/library.py b/python/aitemplate/utils/mk_ck_lib/library.py
index 4b6a357b9..5808ccacf 100644
--- a/python/aitemplate/utils/mk_ck_lib/library.py
+++ b/python/aitemplate/utils/mk_ck_lib/library.py
@@ -201,7 +201,15 @@ class LayoutType(enum.Enum):
     LayoutType.GNWK: "GNWK",
 }
 
+ShortSchedulerNames = {
+    "ck::LoopScheduler::Default": "default",
+    "ck::LoopScheduler::Interwave": "interwave"
+}
 
+ShortPipelineNames = {
+    "ck::PipelineVersion::v1": "v1",
+    "ck::PipelineVersion::v2": "v2"
+}
 #
 class OperationKind(enum.Enum):
     Gemm = auto()

From 40248f24f485f71db6c98522ed442f825572786e Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 30 Jan 2024 11:11:54 +0800
Subject: [PATCH 638/638] fix revision

---
 examples/05_stable_diffusion/scripts/download_pipeline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/05_stable_diffusion/scripts/download_pipeline.py b/examples/05_stable_diffusion/scripts/download_pipeline.py
index facbef318..b5c248b62 100644
--- a/examples/05_stable_diffusion/scripts/download_pipeline.py
+++ b/examples/05_stable_diffusion/scripts/download_pipeline.py
@@ -39,6 +39,7 @@ def download_pipeline_files(model_name, token, save_directory) -> None:
 
     DiffusionPipeline.from_pretrained(
         model_name,
+        revision="main" if "xl" in model_name else "fp16",
         torch_dtype=torch.float16,
         use_auth_token=token if len(token) > 5 else token.lower() == "true",
     ).save_pretrained(save_directory)